From fffc7c19e647171adc45a0d2816101595ea13eec Mon Sep 17 00:00:00 2001 From: "Matthias J. Kannwischer" Date: Fri, 17 Mar 2023 14:54:17 +0800 Subject: [PATCH] initial commit copy https://gitlab.com/arm-research/security/pqax/-/tree/80809f89f51b0bf69eafc597b2d8dfbbab92db42 --- .gitignore | 5 + .gitmodules | 3 + LICENSE | 25 + Makefile | 443 ++ README.md | 70 + asm/Makefile | 483 ++ ...t_u32_full_33556993_28678040_var_4_4_0_0.s | 2422 +++++++ ..._u32_full_33556993_28678040_var_4_4_10_0.s | 2486 +++++++ ..._u32_full_33556993_28678040_var_4_4_11_0.s | 2422 +++++++ ..._u32_full_33556993_28678040_var_4_4_12_0.s | 2422 +++++++ ..._u32_full_33556993_28678040_var_4_4_13_0.s | 2422 +++++++ ..._u32_full_33556993_28678040_var_4_4_14_0.s | 2506 +++++++ ..._u32_full_33556993_28678040_var_4_4_15_0.s | 2506 +++++++ ..._u32_full_33556993_28678040_var_4_4_16_0.s | 2506 +++++++ ..._u32_full_33556993_28678040_var_4_4_17_0.s | 2486 +++++++ ..._u32_full_33556993_28678040_var_4_4_18_0.s | 2486 +++++++ ...t_u32_full_33556993_28678040_var_4_4_1_0.s | 2422 +++++++ ...t_u32_full_33556993_28678040_var_4_4_2_0.s | 2422 +++++++ ...t_u32_full_33556993_28678040_var_4_4_3_0.s | 2422 +++++++ ...32_full_33556993_28678040_var_4_4_3_z2_0.s | 2422 +++++++ ...32_full_33556993_28678040_var_4_4_3_z2_1.s | 2422 +++++++ ...32_full_33556993_28678040_var_4_4_3_z2_2.s | 2422 +++++++ ...32_full_33556993_28678040_var_4_4_3_z2_3.s | 2422 +++++++ ...32_full_33556993_28678040_var_4_4_3_z2_4.s | 2422 +++++++ ...32_full_33556993_28678040_var_4_4_3_z2_5.s | 2422 +++++++ ...32_full_33556993_28678040_var_4_4_3_z4_0.s | 2422 +++++++ ...32_full_33556993_28678040_var_4_4_3_z4_1.s | 2422 +++++++ ...32_full_33556993_28678040_var_4_4_3_z4_2.s | 2422 +++++++ ...32_full_33556993_28678040_var_4_4_3_z4_3.s | 2422 +++++++ ...32_full_33556993_28678040_var_4_4_3_z4_4.s | 2422 +++++++ ...t_u32_full_33556993_28678040_var_4_4_4_0.s | 2422 +++++++ ...t_u32_full_33556993_28678040_var_4_4_5_0.s | 2422 +++++++ 
...t_u32_full_33556993_28678040_var_4_4_6_0.s | 2422 +++++++ ...t_u32_full_33556993_28678040_var_4_4_7_0.s | 2422 +++++++ ...t_u32_full_33556993_28678040_var_4_4_8_0.s | 2422 +++++++ ...t_u32_full_33556993_28678040_var_4_4_9_0.s | 2422 +++++++ ...2_incomplete_33556993_28678040_var_3_3_0.s | 1474 ++++ ...2_incomplete_33556993_28678040_var_3_3_1.s | 1474 ++++ ...2_incomplete_33556993_28678040_var_3_3_2.s | 1474 ++++ ...2_incomplete_33556993_28678040_var_3_3_3.s | 1474 ++++ ...2_incomplete_33556993_28678040_var_3_3_4.s | 1474 ++++ ...2_incomplete_33556993_28678040_var_3_3_5.s | 1474 ++++ ...incomplete_33556993_28678040_var_4_2_0_0.s | 1494 ++++ ...omplete_33556993_28678040_var_4_2_0_z4_0.s | 1494 ++++ ...mplete_33556993_28678040_var_4_2_0_z4_16.s | 1494 ++++ ...mplete_33556993_28678040_var_4_2_10_z4_7.s | 1558 ++++ ...mplete_33556993_28678040_var_4_2_11_z4_7.s | 1494 ++++ ...mplete_33556993_28678040_var_4_2_12_z4_7.s | 1494 ++++ ...mplete_33556993_28678040_var_4_2_13_z4_7.s | 1494 ++++ ...mplete_33556993_28678040_var_4_2_14_z4_7.s | 1578 +++++ ...mplete_33556993_28678040_var_4_2_15_z4_7.s | 1578 +++++ ...mplete_33556993_28678040_var_4_2_16_z4_7.s | 1578 +++++ ...mplete_33556993_28678040_var_4_2_17_z4_7.s | 1558 ++++ ...mplete_33556993_28678040_var_4_2_18_z4_7.s | 1558 ++++ ...mplete_33556993_28678040_var_4_2_19_z4_7.s | 1558 ++++ ...mplete_33556993_28678040_var_4_2_20_z4_7.s | 1558 ++++ ...mplete_33556993_28678040_var_4_2_21_z4_7.s | 1558 ++++ ...plete_33556993_28678040_var_4_2_22_z4_10.s | 1550 ++++ ...plete_33556993_28678040_var_4_2_22_z4_11.s | 1550 ++++ ...plete_33556993_28678040_var_4_2_22_z4_12.s | 1550 ++++ ...plete_33556993_28678040_var_4_2_22_z4_13.s | 1550 ++++ ...plete_33556993_28678040_var_4_2_22_z4_14.s | 1550 ++++ ...plete_33556993_28678040_var_4_2_22_z4_15.s | 1550 ++++ ...mplete_33556993_28678040_var_4_2_22_z4_7.s | 1550 ++++ ...mplete_33556993_28678040_var_4_2_22_z4_8.s | 1550 ++++ ...mplete_33556993_28678040_var_4_2_22_z4_9.s | 1558 ++++ 
...mplete_33556993_28678040_var_4_2_24_z4_0.s | 1494 ++++ ...plete_33556993_28678040_var_4_2_24_z4_16.s | 1494 ++++ ...omplete_33556993_28678040_var_4_2_3_z4_0.s | 1494 ++++ ...omplete_33556993_28678040_var_4_2_3_z4_1.s | 1494 ++++ ...omplete_33556993_28678040_var_4_2_3_z4_2.s | 1494 ++++ ...omplete_33556993_28678040_var_4_2_3_z4_3.s | 1494 ++++ ...omplete_33556993_28678040_var_4_2_3_z4_4.s | 1494 ++++ ...omplete_33556993_28678040_var_4_2_3_z4_5.s | 1494 ++++ ...omplete_33556993_28678040_var_4_2_7_z4_0.s | 1494 ++++ ...omplete_33556993_28678040_var_4_2_7_z4_1.s | 1494 ++++ ...mplete_33556993_28678040_var_4_2_7_z4_10.s | 1494 ++++ ...omplete_33556993_28678040_var_4_2_7_z4_2.s | 1494 ++++ ...omplete_33556993_28678040_var_4_2_7_z4_3.s | 1494 ++++ ...omplete_33556993_28678040_var_4_2_7_z4_4.s | 1494 ++++ ...omplete_33556993_28678040_var_4_2_7_z4_5.s | 1494 ++++ ...omplete_33556993_28678040_var_4_2_7_z4_6.s | 1494 ++++ ...omplete_33556993_28678040_var_4_2_7_z4_7.s | 1494 ++++ ...omplete_33556993_28678040_var_4_2_7_z4_8.s | 1494 ++++ ...omplete_33556993_28678040_var_4_2_7_z4_9.s | 1502 ++++ ...omplete_33556993_28678040_var_4_2_8_z4_7.s | 1494 ++++ ...omplete_33556993_28678040_var_4_2_9_z4_7.s | 1494 ++++ ...2_incomplete_33556993_28678040_var_3_3_0.s | 1475 ++++ ...594067788289_60277548896192635_var_3_3_0.s | 2727 +++++++ ...594067788289_60277548896192635_var_3_3_1.s | 2727 +++++++ ...594067788289_60277548896192635_var_3_3_2.s | 2727 +++++++ .../basemul_64_72057594067788289.s | 105 + asm/manual/keccak_f1600/keccak_f1600.py | 508 ++ .../keccak_f1600/keccak_f1600_variants.h | 116 + .../keccak_f1600/keccak_f1600_x1_scalar_C.c | 591 ++ .../keccak_f1600_x1_scalar_asm_v1.s | 413 ++ .../keccak_f1600_x1_scalar_asm_v2.s | 505 ++ .../keccak_f1600_x1_scalar_asm_v3.s | 494 ++ .../keccak_f1600_x1_scalar_asm_v4.s | 495 ++ .../keccak_f1600_x1_scalar_asm_v5.s | 506 ++ .../keccak_f1600_x2_hybrid_asm_v1.s | 417 ++ .../keccak_f1600_x2_hybrid_asm_v2p0.s | 830 +++ 
.../keccak_f1600_x2_hybrid_asm_v2p1.s | 880 +++ .../keccak_f1600_x2_hybrid_asm_v2p2.s | 971 +++ .../keccak_f1600_x2_hybrid_asm_v2pp0.s | 804 +++ .../keccak_f1600_x2_hybrid_asm_v2pp1.s | 805 +++ .../keccak_f1600_x2_hybrid_asm_v2pp2.s | 804 +++ .../keccak_f1600_x2_v84a_asm_v1.s | 338 + .../keccak_f1600_x2_v84a_asm_v1p0.s | 465 ++ .../keccak_f1600_x2_v84a_asm_v2.s | 375 + .../keccak_f1600_x2_v84a_asm_v2p0.s | 596 ++ .../keccak_f1600_x2_v84a_asm_v2p1.s | 732 ++ .../keccak_f1600_x2_v84a_asm_v2p2.s | 802 +++ .../keccak_f1600_x2_v84a_asm_v2p3.s | 773 ++ .../keccak_f1600_x2_v84a_asm_v2p4.s | 689 ++ .../keccak_f1600_x2_v84a_asm_v2p5.s | 949 +++ .../keccak_f1600_x2_v84a_asm_v2p6.s | 948 +++ .../keccak_f1600_x2_v84a_asm_v2pp0.s | 729 ++ .../keccak_f1600_x2_v84a_asm_v2pp1.s | 755 ++ .../keccak_f1600_x2_v84a_asm_v2pp2.s | 798 +++ .../keccak_f1600_x2_v84a_asm_v2pp3.s | 905 +++ .../keccak_f1600_x2_v84a_asm_v2pp4.s | 797 +++ .../keccak_f1600_x2_v84a_asm_v2pp5.s | 806 +++ .../keccak_f1600_x2_v84a_asm_v2pp6.s | 917 +++ .../keccak_f1600_x2_v84a_asm_v2pp7.s | 901 +++ .../keccak_f1600_x3_hybrid_asm_v3p.s | 971 +++ .../keccak_f1600_x3_hybrid_asm_v6.s | 1377 ++++ .../keccak_f1600_x3_hybrid_asm_v7.s | 924 +++ .../keccak_f1600_x4_hybrid_asm_v1.s | 1142 +++ .../keccak_f1600_x4_hybrid_asm_v2.s | 991 +++ .../keccak_f1600_x4_hybrid_asm_v2p0.s | 993 +++ .../keccak_f1600_x4_hybrid_asm_v3.s | 1015 +++ .../keccak_f1600_x4_hybrid_asm_v3p.s | 1016 +++ .../keccak_f1600_x4_hybrid_asm_v3pp.s | 1022 +++ .../keccak_f1600_x4_hybrid_asm_v4.s | 1018 +++ .../keccak_f1600_x4_hybrid_asm_v4p.s | 1026 +++ .../keccak_f1600_x4_hybrid_asm_v5.s | 1360 ++++ .../keccak_f1600_x4_hybrid_asm_v5p.s | 1337 ++++ .../keccak_f1600_x4_hybrid_asm_v6.s | 1385 ++++ .../keccak_f1600_x4_hybrid_asm_v7.s | 1266 ++++ .../keccak_f1600_x4_hybrid_asm_v8.s | 1367 ++++ .../keccak_f1600_x4_scalar_asm_v1.s | 561 ++ .../keccak_f1600_x4_scalar_asm_v5.s | 543 ++ .../keccak_f1600_x4_v84a_asm_v1p0.s | 452 ++ .../keccak_f1600_x5_hybrid_asm_v8.s | 
1635 +++++ .../keccak_f1600_x5_hybrid_asm_v8p.s | 1306 ++++ asm/manual/keccak_f1600/macros.s | 35 + asm/manual/keccak_f1600/third_party/LICENSE | 1 + .../keccak_f1600/third_party/keccakx2_C.c | 330 + .../keccak_f1600/third_party/keccakx2_bas.s | 203 + .../third_party/keccakx2_cothan.c | 404 ++ asm/scripts/ntt_neon/ntt_neon.py | 6200 ++++++++++++++++ asm/scripts/ntt_sve2/ntt_sve2.py | 6249 +++++++++++++++++ envs/cross/.gitignore | 2 + envs/cross/Makefile | 106 + envs/cross/inc/hal_env.h | 9 + envs/cross/inc/test_inc | 1 + envs/cross/src/hal.c | 172 + envs/cross/src/test_common | 1 + envs/cross/src/test_src | 1 + envs/native_linux/.gitignore | 2 + envs/native_linux/Makefile | 102 + envs/native_linux/inc/hal_env.h | 9 + envs/native_linux/inc/test_inc | 1 + envs/native_linux/src/hal.c | 172 + envs/native_linux/src/test_common | 1 + envs/native_mac/.gitignore | 2 + envs/native_mac/Makefile | 78 + envs/native_mac/inc/hal_env.h | 9 + envs/native_mac/inc/test_inc | 1 + envs/native_mac/src/hal.c | 52 + envs/native_mac/src/test_common | 1 + envs/native_mac/src/test_src | 1 + nelight | 1 + sphincsplus/README.md | 20 + sphincsplus/convert-keccak-benchmarks.py | 211 + sphincsplus/convert-sphincs-benchmarks.py | 306 + sphincsplus/keccak-benchmarks.md | 423 ++ sphincsplus/keccak-results.md | 29 + sphincsplus/sphincs-results.md | 65 + sphincsplus/sphincsplus-keccakx2/LICENSE | 121 + sphincsplus/sphincsplus-keccakx2/Makefile | 94 + sphincsplus/sphincsplus-keccakx2/README.md | 38 + sphincsplus/sphincsplus-keccakx2/address.c | 112 + sphincsplus/sphincsplus-keccakx2/address.h | 51 + sphincsplus/sphincsplus-keccakx2/api.h | 77 + sphincsplus/sphincsplus-keccakx2/bench_x2.sh | 92 + .../sphincsplus-keccakx2/benchmarks.md | 17 + .../sphincs-shake-128f-robust_BAS | 14 + .../sphincs-shake-128f-robust_C | 14 + .../sphincs-shake-128f-robust_COTHANV8 | 14 + .../sphincs-shake-128f-simple_BAS | 14 + .../sphincs-shake-128f-simple_C | 14 + .../sphincs-shake-128f-simple_COTHANV8 | 14 + 
.../sphincs-shake-128s-robust_BAS | 14 + .../sphincs-shake-128s-robust_C | 14 + .../sphincs-shake-128s-robust_COTHANV8 | 14 + .../sphincs-shake-128s-simple_BAS | 14 + .../sphincs-shake-128s-simple_C | 14 + .../sphincs-shake-128s-simple_COTHANV8 | 14 + .../sphincs-shake-192f-robust_BAS | 14 + .../sphincs-shake-192f-robust_C | 14 + .../sphincs-shake-192f-robust_COTHANV8 | 14 + .../sphincs-shake-192f-simple_BAS | 14 + .../sphincs-shake-192f-simple_C | 14 + .../sphincs-shake-192f-simple_COTHANV8 | 14 + .../sphincs-shake-192s-robust_BAS | 14 + .../sphincs-shake-192s-robust_C | 14 + .../sphincs-shake-192s-robust_COTHANV8 | 14 + .../sphincs-shake-192s-simple_BAS | 14 + .../sphincs-shake-192s-simple_C | 14 + .../sphincs-shake-192s-simple_COTHANV8 | 14 + .../sphincs-shake-256f-robust_BAS | 14 + .../sphincs-shake-256f-robust_C | 14 + .../sphincs-shake-256f-robust_COTHANV8 | 14 + .../sphincs-shake-256f-simple_BAS | 14 + .../sphincs-shake-256f-simple_C | 14 + .../sphincs-shake-256f-simple_COTHANV8 | 14 + .../sphincs-shake-256s-robust_BAS | 14 + .../sphincs-shake-256s-robust_C | 14 + .../sphincs-shake-256s-robust_COTHANV8 | 14 + .../sphincs-shake-256s-simple_BAS | 14 + .../sphincs-shake-256s-simple_C | 14 + .../sphincs-shake-256s-simple_COTHANV8 | 14 + .../sphincs-shake-128f-robust_C | 14 + .../sphincs-shake-128f-robust_COTHANV8 | 14 + .../sphincs-shake-128f-simple_C | 14 + .../sphincs-shake-128f-simple_COTHANV8 | 14 + .../sphincs-shake-128s-robust_C | 14 + .../sphincs-shake-128s-robust_COTHANV8 | 14 + .../sphincs-shake-128s-simple_C | 14 + .../sphincs-shake-128s-simple_COTHANV8 | 14 + .../sphincs-shake-192f-robust_C | 14 + .../sphincs-shake-192f-robust_COTHANV8 | 14 + .../sphincs-shake-192f-simple_C | 14 + .../sphincs-shake-192f-simple_COTHANV8 | 14 + .../sphincs-shake-192s-robust_C | 14 + .../sphincs-shake-192s-robust_COTHANV8 | 14 + .../sphincs-shake-192s-simple_C | 14 + .../sphincs-shake-192s-simple_COTHANV8 | 14 + .../sphincs-shake-256f-robust_C | 14 + 
.../sphincs-shake-256f-robust_COTHANV8 | 14 + .../sphincs-shake-256f-simple_C | 14 + .../sphincs-shake-256f-simple_COTHANV8 | 14 + .../sphincs-shake-256s-robust_C | 14 + .../sphincs-shake-256s-robust_COTHANV8 | 14 + .../sphincs-shake-256s-simple_C | 14 + .../sphincs-shake-256s-simple_COTHANV8 | 14 + .../sphincs-shake-128f-robust_BAS | 14 + .../sphincs-shake-128f-robust_C | 14 + .../sphincs-shake-128f-robust_COTHANV8 | 14 + .../sphincs-shake-128f-simple_BAS | 14 + .../sphincs-shake-128f-simple_C | 14 + .../sphincs-shake-128f-simple_COTHANV8 | 14 + .../sphincs-shake-128s-robust_BAS | 14 + .../sphincs-shake-128s-robust_C | 14 + .../sphincs-shake-128s-robust_COTHANV8 | 14 + .../sphincs-shake-128s-simple_BAS | 14 + .../sphincs-shake-128s-simple_C | 14 + .../sphincs-shake-128s-simple_COTHANV8 | 14 + .../sphincs-shake-192f-robust_BAS | 14 + .../sphincs-shake-192f-robust_C | 14 + .../sphincs-shake-192f-robust_COTHANV8 | 14 + .../sphincs-shake-192f-simple_BAS | 14 + .../sphincs-shake-192f-simple_C | 14 + .../sphincs-shake-192f-simple_COTHANV8 | 14 + .../sphincs-shake-192s-robust_BAS | 14 + .../sphincs-shake-192s-robust_C | 14 + .../sphincs-shake-192s-robust_COTHANV8 | 14 + .../sphincs-shake-192s-simple_BAS | 14 + .../sphincs-shake-192s-simple_C | 14 + .../sphincs-shake-192s-simple_COTHANV8 | 14 + .../sphincs-shake-256f-robust_BAS | 14 + .../sphincs-shake-256f-robust_C | 14 + .../sphincs-shake-256f-robust_COTHANV8 | 14 + .../sphincs-shake-256f-simple_BAS | 14 + .../sphincs-shake-256f-simple_C | 14 + .../sphincs-shake-256f-simple_COTHANV8 | 14 + .../sphincs-shake-256s-robust_BAS | 14 + .../sphincs-shake-256s-robust_C | 14 + .../sphincs-shake-256s-robust_COTHANV8 | 14 + .../sphincs-shake-256s-simple_BAS | 14 + .../sphincs-shake-256s-simple_C | 14 + .../sphincs-shake-256s-simple_COTHANV8 | 14 + .../sphincs-shake-128f-robust_C | 14 + .../sphincs-shake-128f-robust_COTHANV8 | 14 + .../sphincs-shake-128f-simple_C | 14 + .../sphincs-shake-128f-simple_COTHANV8 | 14 + 
.../sphincs-shake-128s-robust_C | 14 + .../sphincs-shake-128s-robust_COTHANV8 | 14 + .../sphincs-shake-128s-simple_C | 14 + .../sphincs-shake-128s-simple_COTHANV8 | 14 + .../sphincs-shake-192f-robust_C | 14 + .../sphincs-shake-192f-robust_COTHANV8 | 14 + .../sphincs-shake-192f-simple_C | 14 + .../sphincs-shake-192f-simple_COTHANV8 | 14 + .../sphincs-shake-192s-robust_C | 14 + .../sphincs-shake-192s-robust_COTHANV8 | 14 + .../sphincs-shake-192s-simple_C | 14 + .../sphincs-shake-192s-simple_COTHANV8 | 14 + .../sphincs-shake-256f-robust_C | 14 + .../sphincs-shake-256f-robust_COTHANV8 | 14 + .../sphincs-shake-256f-simple_C | 14 + .../sphincs-shake-256f-simple_COTHANV8 | 14 + .../sphincs-shake-256s-robust_C | 14 + .../sphincs-shake-256s-robust_COTHANV8 | 14 + .../sphincs-shake-256s-simple_C | 14 + .../sphincs-shake-256s-simple_COTHANV8 | 14 + .../benchmarks_X1/sphincs-shake-128f-robust_C | 14 + .../sphincs-shake-128f-robust_COTHANV8 | 14 + .../benchmarks_X1/sphincs-shake-128f-simple_C | 14 + .../sphincs-shake-128f-simple_COTHANV8 | 14 + .../benchmarks_X1/sphincs-shake-128s-robust_C | 14 + .../sphincs-shake-128s-robust_COTHANV8 | 14 + .../benchmarks_X1/sphincs-shake-128s-simple_C | 14 + .../sphincs-shake-128s-simple_COTHANV8 | 14 + .../benchmarks_X1/sphincs-shake-192f-robust_C | 14 + .../sphincs-shake-192f-robust_COTHANV8 | 14 + .../benchmarks_X1/sphincs-shake-192f-simple_C | 14 + .../sphincs-shake-192f-simple_COTHANV8 | 14 + .../benchmarks_X1/sphincs-shake-192s-robust_C | 14 + .../sphincs-shake-192s-robust_COTHANV8 | 14 + .../benchmarks_X1/sphincs-shake-192s-simple_C | 14 + .../sphincs-shake-192s-simple_COTHANV8 | 14 + .../benchmarks_X1/sphincs-shake-256f-robust_C | 14 + .../sphincs-shake-256f-robust_COTHANV8 | 14 + .../benchmarks_X1/sphincs-shake-256f-simple_C | 14 + .../sphincs-shake-256f-simple_COTHANV8 | 14 + .../benchmarks_X1/sphincs-shake-256s-robust_C | 14 + .../sphincs-shake-256s-robust_COTHANV8 | 14 + .../benchmarks_X1/sphincs-shake-256s-simple_C | 14 + 
.../sphincs-shake-256s-simple_COTHANV8 | 14 + .../sphincs-shake-128f-robust_BAS | 14 + .../benchmarks_X2/sphincs-shake-128f-robust_C | 14 + .../sphincs-shake-128f-robust_COTHANV8 | 14 + .../sphincs-shake-128f-simple_BAS | 14 + .../benchmarks_X2/sphincs-shake-128f-simple_C | 14 + .../sphincs-shake-128f-simple_COTHANV8 | 14 + .../sphincs-shake-128s-robust_BAS | 14 + .../benchmarks_X2/sphincs-shake-128s-robust_C | 14 + .../sphincs-shake-128s-robust_COTHANV8 | 14 + .../sphincs-shake-128s-simple_BAS | 14 + .../benchmarks_X2/sphincs-shake-128s-simple_C | 14 + .../sphincs-shake-128s-simple_COTHANV8 | 14 + .../sphincs-shake-192f-robust_BAS | 14 + .../benchmarks_X2/sphincs-shake-192f-robust_C | 14 + .../sphincs-shake-192f-robust_COTHANV8 | 14 + .../sphincs-shake-192f-simple_BAS | 14 + .../benchmarks_X2/sphincs-shake-192f-simple_C | 14 + .../sphincs-shake-192f-simple_COTHANV8 | 14 + .../sphincs-shake-192s-robust_BAS | 14 + .../benchmarks_X2/sphincs-shake-192s-robust_C | 14 + .../sphincs-shake-192s-robust_COTHANV8 | 14 + .../sphincs-shake-192s-simple_BAS | 14 + .../benchmarks_X2/sphincs-shake-192s-simple_C | 14 + .../sphincs-shake-192s-simple_COTHANV8 | 14 + .../sphincs-shake-256f-robust_BAS | 14 + .../benchmarks_X2/sphincs-shake-256f-robust_C | 14 + .../sphincs-shake-256f-robust_COTHANV8 | 14 + .../sphincs-shake-256f-simple_BAS | 14 + .../benchmarks_X2/sphincs-shake-256f-simple_C | 14 + .../sphincs-shake-256f-simple_COTHANV8 | 14 + .../sphincs-shake-256s-robust_BAS | 14 + .../benchmarks_X2/sphincs-shake-256s-robust_C | 14 + .../sphincs-shake-256s-robust_COTHANV8 | 14 + .../sphincs-shake-256s-simple_BAS | 14 + .../benchmarks_X2/sphincs-shake-256s-simple_C | 14 + .../sphincs-shake-256s-simple_COTHANV8 | 14 + sphincsplus/sphincsplus-keccakx2/context.h | 13 + sphincsplus/sphincsplus-keccakx2/f1600x2.h | 18 + sphincsplus/sphincsplus-keccakx2/fips202.c | 762 ++ sphincsplus/sphincsplus-keccakx2/fips202.h | 47 + sphincsplus/sphincsplus-keccakx2/fips202x2.c | 165 + 
sphincsplus/sphincsplus-keccakx2/fips202x2.h | 24 + sphincsplus/sphincsplus-keccakx2/fors.c | 198 + sphincsplus/sphincsplus-keccakx2/fors.h | 32 + sphincsplus/sphincsplus-keccakx2/hal_env.h | 9 + sphincsplus/sphincsplus-keccakx2/hash.h | 27 + sphincsplus/sphincsplus-keccakx2/hash_shake.c | 93 + .../sphincsplus-keccakx2/hash_shakex2.c | 51 + sphincsplus/sphincsplus-keccakx2/hashx2.h | 14 + .../sphincsplus-keccakx2/keccak_f1600_x2 | 1 + sphincsplus/sphincsplus-keccakx2/make_all.py | 78 + sphincsplus/sphincsplus-keccakx2/merkle.c | 65 + sphincsplus/sphincsplus-keccakx2/merkle.h | 18 + sphincsplus/sphincsplus-keccakx2/params.h | 5 + .../params/params-sphincs-shake-128f.h | 80 + .../params/params-sphincs-shake-128s.h | 80 + .../params/params-sphincs-shake-192f.h | 80 + .../params/params-sphincs-shake-192s.h | 80 + .../params/params-sphincs-shake-256f.h | 80 + .../params/params-sphincs-shake-256s.h | 80 + .../sphincsplus-keccakx2/randombytes.h | 8 + .../sphincsplus-keccakx2/shake_offsets.h | 21 + sphincsplus/sphincsplus-keccakx2/sign.c | 287 + .../sphincsplus-keccakx2/test/benchmark.c | 199 + .../sphincsplus-keccakx2/test/cycles.c | 138 + .../sphincsplus-keccakx2/test/cycles.h | 21 + sphincsplus/sphincsplus-keccakx2/test/fors.c | 41 + .../sphincsplus-keccakx2/test/randombytes.c | 43 + sphincsplus/sphincsplus-keccakx2/test/spx.c | 125 + sphincsplus/sphincsplus-keccakx2/thash.h | 12 + .../thash_shake_robustx2.c | 173 + .../thash_shake_simplex2.c | 123 + sphincsplus/sphincsplus-keccakx2/thashx2.h | 16 + sphincsplus/sphincsplus-keccakx2/utils.c | 154 + sphincsplus/sphincsplus-keccakx2/utils.h | 52 + sphincsplus/sphincsplus-keccakx2/utilsx2.c | 130 + sphincsplus/sphincsplus-keccakx2/utilsx2.h | 28 + sphincsplus/sphincsplus-keccakx2/wots.c | 261 + sphincsplus/sphincsplus-keccakx2/wots.h | 25 + sphincsplus/sphincsplus-keccakx2/wotsx2.h | 40 + sphincsplus/sphincsplus-keccakxN/LICENSE | 22 + sphincsplus/sphincsplus-keccakxN/Makefile | 242 + 
sphincsplus/sphincsplus-keccakxN/README.md | 42 + sphincsplus/sphincsplus-keccakxN/address.c | 142 + sphincsplus/sphincsplus-keccakxN/address.h | 82 + sphincsplus/sphincsplus-keccakxN/api.h | 107 + sphincsplus/sphincsplus-keccakxN/bench_xN.sh | 114 + .../sphincs-shake-128f-robust_x3 | 16 + .../sphincs-shake-128f-robust_x4 | 16 + .../sphincs-shake-128f-robust_x5 | 16 + .../sphincs-shake-128f-simple_x3 | 16 + .../sphincs-shake-128f-simple_x4 | 16 + .../sphincs-shake-128f-simple_x5 | 16 + .../sphincs-shake-128s-robust_x3 | 16 + .../sphincs-shake-128s-robust_x4 | 16 + .../sphincs-shake-128s-robust_x5 | 16 + .../sphincs-shake-128s-simple_x3 | 16 + .../sphincs-shake-128s-simple_x4 | 16 + .../sphincs-shake-128s-simple_x5 | 16 + .../sphincs-shake-192f-robust_x3 | 16 + .../sphincs-shake-192f-robust_x4 | 16 + .../sphincs-shake-192f-robust_x5 | 16 + .../sphincs-shake-192f-simple_x3 | 16 + .../sphincs-shake-192f-simple_x4 | 16 + .../sphincs-shake-192f-simple_x5 | 16 + .../sphincs-shake-192s-robust_x3 | 16 + .../sphincs-shake-192s-robust_x4 | 16 + .../sphincs-shake-192s-robust_x5 | 16 + .../sphincs-shake-192s-simple_x3 | 16 + .../sphincs-shake-192s-simple_x4 | 16 + .../sphincs-shake-192s-simple_x5 | 16 + .../sphincs-shake-256f-robust_x3 | 16 + .../sphincs-shake-256f-robust_x4 | 16 + .../sphincs-shake-256f-robust_x5 | 16 + .../sphincs-shake-256f-simple_x3 | 16 + .../sphincs-shake-256f-simple_x4 | 16 + .../sphincs-shake-256f-simple_x5 | 16 + .../sphincs-shake-256s-robust_x3 | 16 + .../sphincs-shake-256s-robust_x4 | 16 + .../sphincs-shake-256s-robust_x5 | 16 + .../sphincs-shake-256s-simple_x3 | 16 + .../sphincs-shake-256s-simple_x4 | 16 + .../sphincs-shake-256s-simple_x5 | 16 + .../sphincs-shake-128f-robust_x3 | 16 + .../sphincs-shake-128f-robust_x4 | 16 + .../sphincs-shake-128f-robust_x5 | 16 + .../sphincs-shake-128f-simple_x3 | 16 + .../sphincs-shake-128f-simple_x4 | 16 + .../sphincs-shake-128f-simple_x5 | 16 + .../sphincs-shake-128s-robust_x3 | 16 + 
.../sphincs-shake-128s-robust_x4 | 16 + .../sphincs-shake-128s-robust_x5 | 16 + .../sphincs-shake-128s-simple_x3 | 16 + .../sphincs-shake-128s-simple_x4 | 16 + .../sphincs-shake-128s-simple_x5 | 16 + .../sphincs-shake-192f-robust_x3 | 16 + .../sphincs-shake-192f-robust_x4 | 16 + .../sphincs-shake-192f-robust_x5 | 16 + .../sphincs-shake-192f-simple_x3 | 16 + .../sphincs-shake-192f-simple_x4 | 16 + .../sphincs-shake-192f-simple_x5 | 16 + .../sphincs-shake-192s-robust_x3 | 16 + .../sphincs-shake-192s-robust_x4 | 16 + .../sphincs-shake-192s-robust_x5 | 16 + .../sphincs-shake-192s-simple_x3 | 16 + .../sphincs-shake-192s-simple_x4 | 16 + .../sphincs-shake-192s-simple_x5 | 16 + .../sphincs-shake-256f-robust_x3 | 16 + .../sphincs-shake-256f-robust_x4 | 16 + .../sphincs-shake-256f-robust_x5 | 16 + .../sphincs-shake-256f-simple_x3 | 16 + .../sphincs-shake-256f-simple_x4 | 16 + .../sphincs-shake-256f-simple_x5 | 16 + .../sphincs-shake-256s-robust_x3 | 16 + .../sphincs-shake-256s-robust_x4 | 16 + .../sphincs-shake-256s-robust_x5 | 16 + .../sphincs-shake-256s-simple_x3 | 16 + .../sphincs-shake-256s-simple_x4 | 16 + .../sphincs-shake-256s-simple_x5 | 16 + .../sphincs-shake-128f-robust_x3 | 16 + .../sphincs-shake-128f-robust_x4 | 16 + .../sphincs-shake-128f-robust_x5 | 16 + .../sphincs-shake-128f-simple_x3 | 16 + .../sphincs-shake-128f-simple_x4 | 16 + .../sphincs-shake-128f-simple_x5 | 16 + .../sphincs-shake-128s-robust_x3 | 16 + .../sphincs-shake-128s-robust_x4 | 16 + .../sphincs-shake-128s-robust_x5 | 16 + .../sphincs-shake-128s-simple_x3 | 16 + .../sphincs-shake-128s-simple_x4 | 16 + .../sphincs-shake-128s-simple_x5 | 16 + .../sphincs-shake-192f-robust_x3 | 16 + .../sphincs-shake-192f-robust_x4 | 16 + .../sphincs-shake-192f-robust_x5 | 16 + .../sphincs-shake-192f-simple_x3 | 16 + .../sphincs-shake-192f-simple_x4 | 16 + .../sphincs-shake-192f-simple_x5 | 16 + .../sphincs-shake-192s-robust_x3 | 16 + .../sphincs-shake-192s-robust_x4 | 16 + .../sphincs-shake-192s-robust_x5 | 16 + 
.../sphincs-shake-192s-simple_x3 | 16 + .../sphincs-shake-192s-simple_x4 | 16 + .../sphincs-shake-192s-simple_x5 | 16 + .../sphincs-shake-256f-robust_x3 | 16 + .../sphincs-shake-256f-robust_x4 | 16 + .../sphincs-shake-256f-robust_x5 | 16 + .../sphincs-shake-256f-simple_x3 | 16 + .../sphincs-shake-256f-simple_x4 | 16 + .../sphincs-shake-256f-simple_x5 | 16 + .../sphincs-shake-256s-robust_x3 | 16 + .../sphincs-shake-256s-robust_x4 | 16 + .../sphincs-shake-256s-robust_x5 | 16 + .../sphincs-shake-256s-simple_x3 | 16 + .../sphincs-shake-256s-simple_x4 | 16 + .../sphincs-shake-256s-simple_x5 | 16 + .../sphincs-shake-128f-robust_x3 | 16 + .../sphincs-shake-128f-robust_x4 | 16 + .../sphincs-shake-128f-robust_x5 | 16 + .../sphincs-shake-128f-simple_x3 | 16 + .../sphincs-shake-128f-simple_x4 | 16 + .../sphincs-shake-128f-simple_x5 | 16 + .../sphincs-shake-128s-robust_x3 | 16 + .../sphincs-shake-128s-robust_x4 | 16 + .../sphincs-shake-128s-robust_x5 | 16 + .../sphincs-shake-128s-simple_x3 | 16 + .../sphincs-shake-128s-simple_x4 | 16 + .../sphincs-shake-128s-simple_x5 | 16 + .../sphincs-shake-192f-robust_x3 | 16 + .../sphincs-shake-192f-robust_x4 | 16 + .../sphincs-shake-192f-robust_x5 | 16 + .../sphincs-shake-192f-simple_x3 | 16 + .../sphincs-shake-192f-simple_x4 | 16 + .../sphincs-shake-192f-simple_x5 | 16 + .../sphincs-shake-192s-robust_x3 | 16 + .../sphincs-shake-192s-robust_x4 | 16 + .../sphincs-shake-192s-robust_x5 | 16 + .../sphincs-shake-192s-simple_x3 | 16 + .../sphincs-shake-192s-simple_x4 | 16 + .../sphincs-shake-192s-simple_x5 | 16 + .../sphincs-shake-256f-robust_x3 | 16 + .../sphincs-shake-256f-robust_x4 | 16 + .../sphincs-shake-256f-robust_x5 | 16 + .../sphincs-shake-256f-simple_x3 | 16 + .../sphincs-shake-256f-simple_x4 | 16 + .../sphincs-shake-256f-simple_x5 | 16 + .../sphincs-shake-256s-robust_x3 | 16 + .../sphincs-shake-256s-robust_x4 | 16 + .../sphincs-shake-256s-robust_x5 | 16 + .../sphincs-shake-256s-simple_x3 | 16 + .../sphincs-shake-256s-simple_x4 | 16 + 
.../sphincs-shake-256s-simple_x5 | 16 + .../sphincs-shake-128f-robust_x3 | 16 + .../sphincs-shake-128f-robust_x4 | 16 + .../sphincs-shake-128f-robust_x5 | 16 + .../sphincs-shake-128f-simple_x3 | 16 + .../sphincs-shake-128f-simple_x4 | 16 + .../sphincs-shake-128f-simple_x5 | 16 + .../sphincs-shake-128s-robust_x3 | 16 + .../sphincs-shake-128s-robust_x4 | 16 + .../sphincs-shake-128s-robust_x5 | 16 + .../sphincs-shake-128s-simple_x3 | 16 + .../sphincs-shake-128s-simple_x4 | 16 + .../sphincs-shake-128s-simple_x5 | 16 + .../sphincs-shake-192f-robust_x3 | 16 + .../sphincs-shake-192f-robust_x4 | 16 + .../sphincs-shake-192f-robust_x5 | 16 + .../sphincs-shake-192f-simple_x3 | 16 + .../sphincs-shake-192f-simple_x4 | 16 + .../sphincs-shake-192f-simple_x5 | 16 + .../sphincs-shake-192s-robust_x3 | 16 + .../sphincs-shake-192s-robust_x4 | 16 + .../sphincs-shake-192s-robust_x5 | 16 + .../sphincs-shake-192s-simple_x3 | 16 + .../sphincs-shake-192s-simple_x4 | 16 + .../sphincs-shake-192s-simple_x5 | 16 + .../sphincs-shake-256f-robust_x3 | 16 + .../sphincs-shake-256f-robust_x4 | 16 + .../sphincs-shake-256f-robust_x5 | 16 + .../sphincs-shake-256f-simple_x3 | 16 + .../sphincs-shake-256f-simple_x4 | 16 + .../sphincs-shake-256f-simple_x5 | 16 + .../sphincs-shake-256s-robust_x3 | 16 + .../sphincs-shake-256s-robust_x4 | 16 + .../sphincs-shake-256s-robust_x5 | 16 + .../sphincs-shake-256s-simple_x3 | 16 + .../sphincs-shake-256s-simple_x4 | 16 + .../sphincs-shake-256s-simple_x5 | 16 + .../sphincs-shake-128f-robust_x3 | 16 + .../sphincs-shake-128f-robust_x4 | 16 + .../sphincs-shake-128f-robust_x5 | 16 + .../sphincs-shake-128f-simple_x3 | 16 + .../sphincs-shake-128f-simple_x4 | 16 + .../sphincs-shake-128f-simple_x5 | 16 + .../sphincs-shake-128s-robust_x3 | 16 + .../sphincs-shake-128s-robust_x4 | 16 + .../sphincs-shake-128s-robust_x5 | 16 + .../sphincs-shake-128s-simple_x3 | 16 + .../sphincs-shake-128s-simple_x4 | 16 + .../sphincs-shake-128s-simple_x5 | 16 + .../sphincs-shake-192f-robust_x3 | 16 + 
.../sphincs-shake-192f-robust_x4 | 16 + .../sphincs-shake-192f-robust_x5 | 16 + .../sphincs-shake-192f-simple_x3 | 16 + .../sphincs-shake-192f-simple_x4 | 16 + .../sphincs-shake-192f-simple_x5 | 16 + .../sphincs-shake-192s-robust_x3 | 16 + .../sphincs-shake-192s-robust_x4 | 16 + .../sphincs-shake-192s-robust_x5 | 16 + .../sphincs-shake-192s-simple_x3 | 16 + .../sphincs-shake-192s-simple_x4 | 16 + .../sphincs-shake-192s-simple_x5 | 16 + .../sphincs-shake-256f-robust_x3 | 16 + .../sphincs-shake-256f-robust_x4 | 16 + .../sphincs-shake-256f-robust_x5 | 16 + .../sphincs-shake-256f-simple_x3 | 16 + .../sphincs-shake-256f-simple_x4 | 16 + .../sphincs-shake-256f-simple_x5 | 16 + .../sphincs-shake-256s-robust_x3 | 16 + .../sphincs-shake-256s-robust_x4 | 16 + .../sphincs-shake-256s-robust_x5 | 16 + .../sphincs-shake-256s-simple_x3 | 16 + .../sphincs-shake-256s-simple_x4 | 16 + .../sphincs-shake-256s-simple_x5 | 16 + sphincsplus/sphincsplus-keccakxN/context.h | 43 + sphincsplus/sphincsplus-keccakxN/f1600x.h | 51 + sphincsplus/sphincsplus-keccakxN/fips202.c | 513 ++ sphincsplus/sphincsplus-keccakxN/fips202.h | 77 + sphincsplus/sphincsplus-keccakxN/fips202x.c | 216 + sphincsplus/sphincsplus-keccakxN/fips202x.h | 40 + sphincsplus/sphincsplus-keccakxN/fors.c | 225 + sphincsplus/sphincsplus-keccakxN/fors.h | 62 + sphincsplus/sphincsplus-keccakxN/hal_env.h | 34 + sphincsplus/sphincsplus-keccakxN/hash.h | 57 + sphincsplus/sphincsplus-keccakxN/hash_shake.c | 123 + .../sphincsplus-keccakxN/hash_shakex.c | 117 + sphincsplus/sphincsplus-keccakxN/hashx.h | 43 + sphincsplus/sphincsplus-keccakxN/keccak_f1600 | 1 + .../sphincsplus-keccakxN/keccak_f1600_dummy.s | 162 + sphincsplus/sphincsplus-keccakxN/macros.s | 30 + sphincsplus/sphincsplus-keccakxN/make_all.py | 76 + sphincsplus/sphincsplus-keccakxN/merkle.c | 93 + sphincsplus/sphincsplus-keccakxN/merkle.h | 48 + sphincsplus/sphincsplus-keccakxN/params.h | 34 + .../params/params-sphincs-shake-128f.h | 80 + 
.../params/params-sphincs-shake-128s.h | 80 + .../params/params-sphincs-shake-192f.h | 80 + .../params/params-sphincs-shake-192s.h | 80 + .../params/params-sphincs-shake-256f.h | 80 + .../params/params-sphincs-shake-256s.h | 80 + .../sphincsplus-keccakxN/randombytes.h | 38 + .../sphincsplus-keccakxN/shake_offsets.h | 51 + sphincsplus/sphincsplus-keccakxN/sign.c | 317 + .../sphincsplus-keccakxN/test/benchmark.c | 210 + .../sphincsplus-keccakxN/test/cycles.c | 138 + .../sphincsplus-keccakxN/test/cycles.h | 21 + sphincsplus/sphincsplus-keccakxN/test/fors.c | 41 + .../sphincsplus-keccakxN/test/functest.c | 40 + .../sphincsplus-keccakxN/test/randombytes.c | 43 + sphincsplus/sphincsplus-keccakxN/test/spx.c | 125 + sphincsplus/sphincsplus-keccakxN/thash.h | 43 + .../sphincsplus-keccakxN/thash_shake_robust.c | 60 + .../thash_shake_robustx.c | 171 + .../sphincsplus-keccakxN/thash_shake_simple.c | 53 + .../thash_shake_simplex.c | 139 + sphincsplus/sphincsplus-keccakxN/thashx.h | 44 + sphincsplus/sphincsplus-keccakxN/utils.c | 184 + sphincsplus/sphincsplus-keccakxN/utils.h | 82 + sphincsplus/sphincsplus-keccakxN/utilsx.c | 252 + sphincsplus/sphincsplus-keccakxN/utilsx.h | 58 + sphincsplus/sphincsplus-keccakxN/wots.c | 300 + sphincsplus/sphincsplus-keccakxN/wots.h | 55 + sphincsplus/sphincsplus-keccakxN/wotsx.h | 70 + tests/common/misc.c | 134 + tests/common/poly.c | 264 + tests/helloworld/main.c | 46 + tests/helloworld/neon_test.s | 10 + tests/inc/hal.h | 62 + tests/inc/misc.h | 126 + tests/inc/poly.h | 82 + tests/keccak_neon/keccak_f1600_tests.c | 458 ++ tests/keccak_neon/keccak_f1600_tests.h | 174 + tests/keccak_neon/main.c | 220 + .../manual/keccak_f1600_variants.h | 116 + .../manual/keccak_f1600_x1_scalar_C.c | 591 ++ .../manual/keccak_f1600_x1_scalar_asm_v1.s | 413 ++ .../manual/keccak_f1600_x1_scalar_asm_v2.s | 505 ++ .../manual/keccak_f1600_x1_scalar_asm_v3.s | 494 ++ .../manual/keccak_f1600_x1_scalar_asm_v4.s | 495 ++ .../manual/keccak_f1600_x1_scalar_asm_v5.s | 506 
++ .../manual/keccak_f1600_x2_hybrid_asm_v1.s | 417 ++ .../manual/keccak_f1600_x2_hybrid_asm_v2p0.s | 830 +++ .../manual/keccak_f1600_x2_hybrid_asm_v2p1.s | 880 +++ .../manual/keccak_f1600_x2_hybrid_asm_v2p2.s | 971 +++ .../manual/keccak_f1600_x2_hybrid_asm_v2pp0.s | 804 +++ .../manual/keccak_f1600_x2_hybrid_asm_v2pp1.s | 805 +++ .../manual/keccak_f1600_x2_hybrid_asm_v2pp2.s | 804 +++ .../manual/keccak_f1600_x2_v84a_asm_v1.s | 338 + .../manual/keccak_f1600_x2_v84a_asm_v1p0.s | 465 ++ .../manual/keccak_f1600_x2_v84a_asm_v2.s | 375 + .../manual/keccak_f1600_x2_v84a_asm_v2p0.s | 596 ++ .../manual/keccak_f1600_x2_v84a_asm_v2p1.s | 732 ++ .../manual/keccak_f1600_x2_v84a_asm_v2p2.s | 802 +++ .../manual/keccak_f1600_x2_v84a_asm_v2p3.s | 773 ++ .../manual/keccak_f1600_x2_v84a_asm_v2p4.s | 689 ++ .../manual/keccak_f1600_x2_v84a_asm_v2p5.s | 949 +++ .../manual/keccak_f1600_x2_v84a_asm_v2p6.s | 948 +++ .../manual/keccak_f1600_x2_v84a_asm_v2pp0.s | 729 ++ .../manual/keccak_f1600_x2_v84a_asm_v2pp1.s | 755 ++ .../manual/keccak_f1600_x2_v84a_asm_v2pp2.s | 798 +++ .../manual/keccak_f1600_x2_v84a_asm_v2pp3.s | 905 +++ .../manual/keccak_f1600_x2_v84a_asm_v2pp4.s | 797 +++ .../manual/keccak_f1600_x2_v84a_asm_v2pp5.s | 806 +++ .../manual/keccak_f1600_x2_v84a_asm_v2pp6.s | 917 +++ .../manual/keccak_f1600_x2_v84a_asm_v2pp7.s | 901 +++ .../manual/keccak_f1600_x3_hybrid_asm_v3p.s | 971 +++ .../manual/keccak_f1600_x3_hybrid_asm_v6.s | 1377 ++++ .../manual/keccak_f1600_x3_hybrid_asm_v7.s | 924 +++ .../manual/keccak_f1600_x4_hybrid_asm_v1.s | 1142 +++ .../manual/keccak_f1600_x4_hybrid_asm_v2.s | 991 +++ .../manual/keccak_f1600_x4_hybrid_asm_v2p0.s | 993 +++ .../manual/keccak_f1600_x4_hybrid_asm_v3.s | 1015 +++ .../manual/keccak_f1600_x4_hybrid_asm_v3p.s | 1016 +++ .../manual/keccak_f1600_x4_hybrid_asm_v3pp.s | 1022 +++ .../manual/keccak_f1600_x4_hybrid_asm_v4.s | 1018 +++ .../manual/keccak_f1600_x4_hybrid_asm_v4p.s | 1026 +++ .../manual/keccak_f1600_x4_hybrid_asm_v5.s | 1360 ++++ 
.../manual/keccak_f1600_x4_hybrid_asm_v5p.s | 1337 ++++ .../manual/keccak_f1600_x4_hybrid_asm_v6.s | 1385 ++++ .../manual/keccak_f1600_x4_hybrid_asm_v7.s | 1266 ++++ .../manual/keccak_f1600_x4_hybrid_asm_v8.s | 1367 ++++ .../manual/keccak_f1600_x4_scalar_asm_v1.s | 561 ++ .../manual/keccak_f1600_x4_scalar_asm_v5.s | 543 ++ .../manual/keccak_f1600_x4_v84a_asm_v1p0.s | 452 ++ .../manual/keccak_f1600_x5_hybrid_asm_v8.s | 1635 +++++ .../manual/keccak_f1600_x5_hybrid_asm_v8p.s | 1306 ++++ tests/keccak_neon/manual/macros.s | 35 + .../manual/third_party/keccakx2_C.c | 330 + .../manual/third_party/keccakx2_bas.s | 203 + .../manual/third_party/keccakx2_cothan.c | 404 ++ tests/ntt_kyber/main.c | 229 + tests/ntt_kyber/manual/dummy | 0 ...t_u32_full_33556993_28678040_var_4_4_0_0.s | 2422 +++++++ ..._u32_full_33556993_28678040_var_4_4_10_0.s | 2486 +++++++ ..._u32_full_33556993_28678040_var_4_4_11_0.s | 2422 +++++++ ..._u32_full_33556993_28678040_var_4_4_12_0.s | 2422 +++++++ ..._u32_full_33556993_28678040_var_4_4_13_0.s | 2422 +++++++ ..._u32_full_33556993_28678040_var_4_4_14_0.s | 2506 +++++++ ..._u32_full_33556993_28678040_var_4_4_15_0.s | 2506 +++++++ ..._u32_full_33556993_28678040_var_4_4_16_0.s | 2506 +++++++ ..._u32_full_33556993_28678040_var_4_4_17_0.s | 2486 +++++++ ..._u32_full_33556993_28678040_var_4_4_18_0.s | 2486 +++++++ ...t_u32_full_33556993_28678040_var_4_4_1_0.s | 2422 +++++++ ...t_u32_full_33556993_28678040_var_4_4_2_0.s | 2422 +++++++ ...t_u32_full_33556993_28678040_var_4_4_3_0.s | 2422 +++++++ ...32_full_33556993_28678040_var_4_4_3_z2_0.s | 2422 +++++++ ...32_full_33556993_28678040_var_4_4_3_z2_1.s | 2422 +++++++ ...32_full_33556993_28678040_var_4_4_3_z2_2.s | 2422 +++++++ ...32_full_33556993_28678040_var_4_4_3_z2_3.s | 2422 +++++++ ...32_full_33556993_28678040_var_4_4_3_z2_4.s | 2422 +++++++ ...32_full_33556993_28678040_var_4_4_3_z2_5.s | 2422 +++++++ ...32_full_33556993_28678040_var_4_4_3_z4_0.s | 2422 +++++++ ...32_full_33556993_28678040_var_4_4_3_z4_1.s 
| 2422 +++++++ ...32_full_33556993_28678040_var_4_4_3_z4_2.s | 2422 +++++++ ...32_full_33556993_28678040_var_4_4_3_z4_3.s | 2422 +++++++ ...32_full_33556993_28678040_var_4_4_3_z4_4.s | 2422 +++++++ ...t_u32_full_33556993_28678040_var_4_4_4_0.s | 2422 +++++++ ...t_u32_full_33556993_28678040_var_4_4_5_0.s | 2422 +++++++ ...t_u32_full_33556993_28678040_var_4_4_6_0.s | 2422 +++++++ ...t_u32_full_33556993_28678040_var_4_4_7_0.s | 2422 +++++++ ...t_u32_full_33556993_28678040_var_4_4_8_0.s | 2422 +++++++ ...t_u32_full_33556993_28678040_var_4_4_9_0.s | 2422 +++++++ ...2_incomplete_33556993_28678040_var_3_3_0.s | 1474 ++++ ...2_incomplete_33556993_28678040_var_3_3_1.s | 1474 ++++ ...2_incomplete_33556993_28678040_var_3_3_2.s | 1474 ++++ ...2_incomplete_33556993_28678040_var_3_3_3.s | 1474 ++++ ...2_incomplete_33556993_28678040_var_3_3_4.s | 1474 ++++ ...2_incomplete_33556993_28678040_var_3_3_5.s | 1474 ++++ ...incomplete_33556993_28678040_var_4_2_0_0.s | 1494 ++++ ...omplete_33556993_28678040_var_4_2_0_z4_0.s | 1494 ++++ ...mplete_33556993_28678040_var_4_2_0_z4_16.s | 1494 ++++ ...mplete_33556993_28678040_var_4_2_10_z4_7.s | 1558 ++++ ...mplete_33556993_28678040_var_4_2_11_z4_7.s | 1494 ++++ ...mplete_33556993_28678040_var_4_2_12_z4_7.s | 1494 ++++ ...mplete_33556993_28678040_var_4_2_13_z4_7.s | 1494 ++++ ...mplete_33556993_28678040_var_4_2_14_z4_7.s | 1578 +++++ ...mplete_33556993_28678040_var_4_2_15_z4_7.s | 1578 +++++ ...mplete_33556993_28678040_var_4_2_16_z4_7.s | 1578 +++++ ...mplete_33556993_28678040_var_4_2_17_z4_7.s | 1558 ++++ ...mplete_33556993_28678040_var_4_2_18_z4_7.s | 1558 ++++ ...mplete_33556993_28678040_var_4_2_19_z4_7.s | 1558 ++++ ...mplete_33556993_28678040_var_4_2_20_z4_7.s | 1558 ++++ ...mplete_33556993_28678040_var_4_2_21_z4_7.s | 1558 ++++ ...plete_33556993_28678040_var_4_2_22_z4_10.s | 1550 ++++ ...plete_33556993_28678040_var_4_2_22_z4_11.s | 1550 ++++ ...plete_33556993_28678040_var_4_2_22_z4_12.s | 1550 ++++ 
...plete_33556993_28678040_var_4_2_22_z4_13.s | 1550 ++++ ...plete_33556993_28678040_var_4_2_22_z4_14.s | 1550 ++++ ...plete_33556993_28678040_var_4_2_22_z4_15.s | 1550 ++++ ...mplete_33556993_28678040_var_4_2_22_z4_7.s | 1550 ++++ ...mplete_33556993_28678040_var_4_2_22_z4_8.s | 1550 ++++ ...mplete_33556993_28678040_var_4_2_22_z4_9.s | 1558 ++++ ...mplete_33556993_28678040_var_4_2_24_z4_0.s | 1494 ++++ ...plete_33556993_28678040_var_4_2_24_z4_16.s | 1494 ++++ ...omplete_33556993_28678040_var_4_2_3_z4_0.s | 1494 ++++ ...omplete_33556993_28678040_var_4_2_3_z4_1.s | 1494 ++++ ...omplete_33556993_28678040_var_4_2_3_z4_2.s | 1494 ++++ ...omplete_33556993_28678040_var_4_2_3_z4_3.s | 1494 ++++ ...omplete_33556993_28678040_var_4_2_3_z4_4.s | 1494 ++++ ...omplete_33556993_28678040_var_4_2_3_z4_5.s | 1494 ++++ ...omplete_33556993_28678040_var_4_2_7_z4_0.s | 1494 ++++ ...omplete_33556993_28678040_var_4_2_7_z4_1.s | 1494 ++++ ...mplete_33556993_28678040_var_4_2_7_z4_10.s | 1494 ++++ ...omplete_33556993_28678040_var_4_2_7_z4_2.s | 1494 ++++ ...omplete_33556993_28678040_var_4_2_7_z4_3.s | 1494 ++++ ...omplete_33556993_28678040_var_4_2_7_z4_4.s | 1494 ++++ ...omplete_33556993_28678040_var_4_2_7_z4_5.s | 1494 ++++ ...omplete_33556993_28678040_var_4_2_7_z4_6.s | 1494 ++++ ...omplete_33556993_28678040_var_4_2_7_z4_7.s | 1494 ++++ ...omplete_33556993_28678040_var_4_2_7_z4_8.s | 1494 ++++ ...omplete_33556993_28678040_var_4_2_7_z4_9.s | 1502 ++++ ...omplete_33556993_28678040_var_4_2_8_z4_7.s | 1494 ++++ ...omplete_33556993_28678040_var_4_2_9_z4_7.s | 1494 ++++ tests/ntt_neon/main.c | 121 + tests/ntt_neon/ntt.c | 662 ++ tests/ntt_neon/ntt.h | 261 + ...2_incomplete_33556993_28678040_var_3_3_0.s | 1475 ++++ ...594067788289_60277548896192635_var_3_3_0.s | 2727 +++++++ ...594067788289_60277548896192635_var_3_3_1.s | 2727 +++++++ ...594067788289_60277548896192635_var_3_3_2.s | 2727 +++++++ tests/ntt_sve2/main.c | 47 + .../manual/basemul_64_72057594067788289.s | 105 + tests/ntt_sve2/misc.c | 
143 + tests/ntt_sve2/misc.h | 108 + tests/ntt_sve2/ntt.c | 610 ++ tests/ntt_sve2/ntt.h | 294 + 847 files changed, 447562 insertions(+) create mode 100644 .gitignore create mode 100644 .gitmodules create mode 100644 LICENSE create mode 100644 Makefile create mode 100644 README.md create mode 100644 asm/Makefile create mode 100644 asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_0_0.s create mode 100644 asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_10_0.s create mode 100644 asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_11_0.s create mode 100644 asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_12_0.s create mode 100644 asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_13_0.s create mode 100644 asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_14_0.s create mode 100644 asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_15_0.s create mode 100644 asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_16_0.s create mode 100644 asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_17_0.s create mode 100644 asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_18_0.s create mode 100644 asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_1_0.s create mode 100644 asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_2_0.s create mode 100644 asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_0.s create mode 100644 asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z2_0.s create mode 100644 asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z2_1.s create mode 100644 asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z2_2.s create mode 100644 asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z2_3.s create mode 100644 asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z2_4.s create mode 100644 asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z2_5.s create mode 100644 asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z4_0.s create mode 
100644 asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z4_1.s create mode 100644 asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z4_2.s create mode 100644 asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z4_3.s create mode 100644 asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z4_4.s create mode 100644 asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_4_0.s create mode 100644 asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_5_0.s create mode 100644 asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_6_0.s create mode 100644 asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_7_0.s create mode 100644 asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_8_0.s create mode 100644 asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_9_0.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_3_3_0.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_3_3_1.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_3_3_2.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_3_3_3.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_3_3_4.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_3_3_5.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_0_0.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_0_z4_0.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_0_z4_16.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_10_z4_7.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_11_z4_7.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_12_z4_7.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_13_z4_7.s create mode 100644 
asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_14_z4_7.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_15_z4_7.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_16_z4_7.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_17_z4_7.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_18_z4_7.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_19_z4_7.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_20_z4_7.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_21_z4_7.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_10.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_11.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_12.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_13.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_14.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_15.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_7.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_8.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_9.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_24_z4_0.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_24_z4_16.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_0.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_1.s create mode 100644 
asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_2.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_3.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_4.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_5.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_0.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_1.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_10.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_2.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_3.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_4.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_5.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_6.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_7.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_8.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_9.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_8_z4_7.s create mode 100644 asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_9_z4_7.s create mode 100644 asm/auto/ntt_sve2/ntt_u32_incomplete_33556993_28678040_var_3_3_0.s create mode 100644 asm/auto/ntt_sve2/ntt_u64_incomplete_72057594067788289_60277548896192635_var_3_3_0.s create mode 100644 asm/auto/ntt_sve2/ntt_u64_incomplete_72057594067788289_60277548896192635_var_3_3_1.s create mode 100644 asm/auto/ntt_sve2/ntt_u64_incomplete_72057594067788289_60277548896192635_var_3_3_2.s create mode 100644 asm/manual/basemul_s64/basemul_64_72057594067788289.s 
create mode 100644 asm/manual/keccak_f1600/keccak_f1600.py create mode 100644 asm/manual/keccak_f1600/keccak_f1600_variants.h create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x1_scalar_C.c create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x1_scalar_asm_v1.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x1_scalar_asm_v2.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x1_scalar_asm_v3.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x1_scalar_asm_v4.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x1_scalar_asm_v5.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x2_hybrid_asm_v1.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x2_hybrid_asm_v2p0.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x2_hybrid_asm_v2p1.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x2_hybrid_asm_v2p2.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x2_hybrid_asm_v2pp0.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x2_hybrid_asm_v2pp1.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x2_hybrid_asm_v2pp2.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v1.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v1p0.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2p0.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2p1.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2p2.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2p3.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2p4.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2p5.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2p6.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2pp0.s create mode 100644 
asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2pp1.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2pp2.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2pp3.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2pp4.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2pp5.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2pp6.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2pp7.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x3_hybrid_asm_v3p.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x3_hybrid_asm_v6.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x3_hybrid_asm_v7.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v1.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v2.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v2p0.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v3.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v3p.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v3pp.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v4.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v4p.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v5.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v5p.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v6.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v7.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v8.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x4_scalar_asm_v1.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x4_scalar_asm_v5.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x4_v84a_asm_v1p0.s create mode 100644 
asm/manual/keccak_f1600/keccak_f1600_x5_hybrid_asm_v8.s create mode 100644 asm/manual/keccak_f1600/keccak_f1600_x5_hybrid_asm_v8p.s create mode 100644 asm/manual/keccak_f1600/macros.s create mode 100644 asm/manual/keccak_f1600/third_party/LICENSE create mode 100644 asm/manual/keccak_f1600/third_party/keccakx2_C.c create mode 100644 asm/manual/keccak_f1600/third_party/keccakx2_bas.s create mode 100644 asm/manual/keccak_f1600/third_party/keccakx2_cothan.c create mode 100644 asm/scripts/ntt_neon/ntt_neon.py create mode 100644 asm/scripts/ntt_sve2/ntt_sve2.py create mode 100644 envs/cross/.gitignore create mode 100644 envs/cross/Makefile create mode 100644 envs/cross/inc/hal_env.h create mode 120000 envs/cross/inc/test_inc create mode 100644 envs/cross/src/hal.c create mode 120000 envs/cross/src/test_common create mode 120000 envs/cross/src/test_src create mode 100644 envs/native_linux/.gitignore create mode 100644 envs/native_linux/Makefile create mode 100644 envs/native_linux/inc/hal_env.h create mode 120000 envs/native_linux/inc/test_inc create mode 100644 envs/native_linux/src/hal.c create mode 120000 envs/native_linux/src/test_common create mode 100644 envs/native_mac/.gitignore create mode 100644 envs/native_mac/Makefile create mode 100644 envs/native_mac/inc/hal_env.h create mode 120000 envs/native_mac/inc/test_inc create mode 100644 envs/native_mac/src/hal.c create mode 120000 envs/native_mac/src/test_common create mode 120000 envs/native_mac/src/test_src create mode 160000 nelight create mode 100644 sphincsplus/README.md create mode 100644 sphincsplus/convert-keccak-benchmarks.py create mode 100644 sphincsplus/convert-sphincs-benchmarks.py create mode 100644 sphincsplus/keccak-benchmarks.md create mode 100644 sphincsplus/keccak-results.md create mode 100644 sphincsplus/sphincs-results.md create mode 100644 sphincsplus/sphincsplus-keccakx2/LICENSE create mode 100644 sphincsplus/sphincsplus-keccakx2/Makefile create mode 100644 
sphincsplus/sphincsplus-keccakx2/README.md create mode 100644 sphincsplus/sphincsplus-keccakx2/address.c create mode 100644 sphincsplus/sphincsplus-keccakx2/address.h create mode 100644 sphincsplus/sphincsplus-keccakx2/api.h create mode 100644 sphincsplus/sphincsplus-keccakx2/bench_x2.sh create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks.md create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128f-robust_BAS create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128f-robust_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128f-robust_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128f-simple_BAS create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128f-simple_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128f-simple_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128s-robust_BAS create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128s-robust_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128s-robust_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128s-simple_BAS create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128s-simple_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128s-simple_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192f-robust_BAS create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192f-robust_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192f-robust_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192f-simple_BAS create mode 100644 
sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192f-simple_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192f-simple_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192s-robust_BAS create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192s-robust_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192s-robust_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192s-simple_BAS create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192s-simple_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192s-simple_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256f-robust_BAS create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256f-robust_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256f-robust_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256f-simple_BAS create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256f-simple_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256f-simple_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256s-robust_BAS create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256s-robust_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256s-robust_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256s-simple_BAS create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256s-simple_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256s-simple_COTHANV8 create mode 100644 
sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-128f-robust_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-128f-robust_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-128f-simple_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-128f-simple_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-128s-robust_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-128s-robust_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-128s-simple_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-128s-simple_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-192f-robust_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-192f-robust_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-192f-simple_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-192f-simple_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-192s-robust_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-192s-robust_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-192s-simple_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-192s-simple_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-256f-robust_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-256f-robust_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-256f-simple_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-256f-simple_COTHANV8 create mode 100644 
sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-256s-robust_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-256s-robust_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-256s-simple_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-256s-simple_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128f-robust_BAS create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128f-robust_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128f-robust_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128f-simple_BAS create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128f-simple_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128f-simple_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128s-robust_BAS create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128s-robust_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128s-robust_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128s-simple_BAS create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128s-simple_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128s-simple_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192f-robust_BAS create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192f-robust_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192f-robust_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192f-simple_BAS create mode 100644 
sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192f-simple_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192f-simple_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192s-robust_BAS create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192s-robust_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192s-robust_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192s-simple_BAS create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192s-simple_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192s-simple_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256f-robust_BAS create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256f-robust_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256f-robust_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256f-simple_BAS create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256f-simple_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256f-simple_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256s-robust_BAS create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256s-robust_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256s-robust_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256s-simple_BAS create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256s-simple_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256s-simple_COTHANV8 create mode 100644 
sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-128f-robust_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-128f-robust_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-128f-simple_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-128f-simple_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-128s-robust_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-128s-robust_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-128s-simple_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-128s-simple_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-192f-robust_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-192f-robust_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-192f-simple_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-192f-simple_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-192s-robust_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-192s-robust_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-192s-simple_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-192s-simple_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-256f-robust_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-256f-robust_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-256f-simple_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-256f-simple_COTHANV8 create mode 100644 
sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-256s-robust_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-256s-robust_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-256s-simple_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-256s-simple_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-128f-robust_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-128f-robust_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-128f-simple_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-128f-simple_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-128s-robust_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-128s-robust_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-128s-simple_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-128s-simple_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-192f-robust_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-192f-robust_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-192f-simple_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-192f-simple_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-192s-robust_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-192s-robust_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-192s-simple_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-192s-simple_COTHANV8 create mode 100644 
sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-256f-robust_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-256f-robust_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-256f-simple_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-256f-simple_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-256s-robust_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-256s-robust_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-256s-simple_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-256s-simple_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128f-robust_BAS create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128f-robust_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128f-robust_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128f-simple_BAS create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128f-simple_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128f-simple_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128s-robust_BAS create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128s-robust_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128s-robust_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128s-simple_BAS create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128s-simple_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128s-simple_COTHANV8 create mode 100644 
sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192f-robust_BAS create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192f-robust_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192f-robust_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192f-simple_BAS create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192f-simple_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192f-simple_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192s-robust_BAS create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192s-robust_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192s-robust_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192s-simple_BAS create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192s-simple_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192s-simple_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256f-robust_BAS create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256f-robust_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256f-robust_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256f-simple_BAS create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256f-simple_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256f-simple_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256s-robust_BAS create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256s-robust_C create mode 100644 
sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256s-robust_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256s-simple_BAS create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256s-simple_C create mode 100644 sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256s-simple_COTHANV8 create mode 100644 sphincsplus/sphincsplus-keccakx2/context.h create mode 100644 sphincsplus/sphincsplus-keccakx2/f1600x2.h create mode 100644 sphincsplus/sphincsplus-keccakx2/fips202.c create mode 100644 sphincsplus/sphincsplus-keccakx2/fips202.h create mode 100644 sphincsplus/sphincsplus-keccakx2/fips202x2.c create mode 100644 sphincsplus/sphincsplus-keccakx2/fips202x2.h create mode 100644 sphincsplus/sphincsplus-keccakx2/fors.c create mode 100644 sphincsplus/sphincsplus-keccakx2/fors.h create mode 100644 sphincsplus/sphincsplus-keccakx2/hal_env.h create mode 100644 sphincsplus/sphincsplus-keccakx2/hash.h create mode 100644 sphincsplus/sphincsplus-keccakx2/hash_shake.c create mode 100644 sphincsplus/sphincsplus-keccakx2/hash_shakex2.c create mode 100644 sphincsplus/sphincsplus-keccakx2/hashx2.h create mode 120000 sphincsplus/sphincsplus-keccakx2/keccak_f1600_x2 create mode 100644 sphincsplus/sphincsplus-keccakx2/make_all.py create mode 100644 sphincsplus/sphincsplus-keccakx2/merkle.c create mode 100644 sphincsplus/sphincsplus-keccakx2/merkle.h create mode 100644 sphincsplus/sphincsplus-keccakx2/params.h create mode 100644 sphincsplus/sphincsplus-keccakx2/params/params-sphincs-shake-128f.h create mode 100644 sphincsplus/sphincsplus-keccakx2/params/params-sphincs-shake-128s.h create mode 100644 sphincsplus/sphincsplus-keccakx2/params/params-sphincs-shake-192f.h create mode 100644 sphincsplus/sphincsplus-keccakx2/params/params-sphincs-shake-192s.h create mode 100644 sphincsplus/sphincsplus-keccakx2/params/params-sphincs-shake-256f.h create mode 100644 
sphincsplus/sphincsplus-keccakx2/params/params-sphincs-shake-256s.h create mode 100644 sphincsplus/sphincsplus-keccakx2/randombytes.h create mode 100644 sphincsplus/sphincsplus-keccakx2/shake_offsets.h create mode 100644 sphincsplus/sphincsplus-keccakx2/sign.c create mode 100644 sphincsplus/sphincsplus-keccakx2/test/benchmark.c create mode 100644 sphincsplus/sphincsplus-keccakx2/test/cycles.c create mode 100644 sphincsplus/sphincsplus-keccakx2/test/cycles.h create mode 100644 sphincsplus/sphincsplus-keccakx2/test/fors.c create mode 100644 sphincsplus/sphincsplus-keccakx2/test/randombytes.c create mode 100644 sphincsplus/sphincsplus-keccakx2/test/spx.c create mode 100644 sphincsplus/sphincsplus-keccakx2/thash.h create mode 100644 sphincsplus/sphincsplus-keccakx2/thash_shake_robustx2.c create mode 100644 sphincsplus/sphincsplus-keccakx2/thash_shake_simplex2.c create mode 100644 sphincsplus/sphincsplus-keccakx2/thashx2.h create mode 100644 sphincsplus/sphincsplus-keccakx2/utils.c create mode 100644 sphincsplus/sphincsplus-keccakx2/utils.h create mode 100644 sphincsplus/sphincsplus-keccakx2/utilsx2.c create mode 100644 sphincsplus/sphincsplus-keccakx2/utilsx2.h create mode 100644 sphincsplus/sphincsplus-keccakx2/wots.c create mode 100644 sphincsplus/sphincsplus-keccakx2/wots.h create mode 100644 sphincsplus/sphincsplus-keccakx2/wotsx2.h create mode 100644 sphincsplus/sphincsplus-keccakxN/LICENSE create mode 100644 sphincsplus/sphincsplus-keccakxN/Makefile create mode 100644 sphincsplus/sphincsplus-keccakxN/README.md create mode 100644 sphincsplus/sphincsplus-keccakxN/address.c create mode 100644 sphincsplus/sphincsplus-keccakxN/address.h create mode 100644 sphincsplus/sphincsplus-keccakxN/api.h create mode 100644 sphincsplus/sphincsplus-keccakxN/bench_xN.sh create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128f-robust_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128f-robust_x4 create mode 100644 
sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128f-robust_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128f-simple_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128f-simple_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128f-simple_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128s-robust_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128s-robust_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128s-robust_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128s-simple_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128s-simple_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128s-simple_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192f-robust_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192f-robust_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192f-robust_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192f-simple_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192f-simple_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192f-simple_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192s-robust_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192s-robust_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192s-robust_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192s-simple_x3 create mode 100644 
sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192s-simple_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192s-simple_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256f-robust_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256f-robust_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256f-robust_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256f-simple_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256f-simple_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256f-simple_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256s-robust_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256s-robust_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256s-robust_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256s-simple_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256s-simple_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256s-simple_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128f-robust_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128f-robust_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128f-robust_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128f-simple_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128f-simple_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128f-simple_x5 create mode 100644 
sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128s-robust_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128s-robust_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128s-robust_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128s-simple_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128s-simple_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128s-simple_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192f-robust_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192f-robust_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192f-robust_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192f-simple_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192f-simple_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192f-simple_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192s-robust_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192s-robust_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192s-robust_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192s-simple_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192s-simple_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192s-simple_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256f-robust_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256f-robust_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256f-robust_x5 
create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256f-simple_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256f-simple_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256f-simple_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256s-robust_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256s-robust_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256s-robust_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256s-simple_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256s-simple_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256s-simple_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128f-robust_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128f-robust_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128f-robust_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128f-simple_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128f-simple_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128f-simple_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128s-robust_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128s-robust_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128s-robust_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128s-simple_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128s-simple_x4 create mode 100644 
sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128s-simple_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192f-robust_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192f-robust_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192f-robust_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192f-simple_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192f-simple_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192f-simple_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192s-robust_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192s-robust_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192s-robust_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192s-simple_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192s-simple_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192s-simple_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256f-robust_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256f-robust_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256f-robust_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256f-simple_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256f-simple_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256f-simple_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256s-robust_x3 create mode 100644 
sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256s-robust_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256s-robust_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256s-simple_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256s-simple_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256s-simple_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128f-robust_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128f-robust_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128f-robust_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128f-simple_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128f-simple_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128f-simple_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128s-robust_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128s-robust_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128s-robust_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128s-simple_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128s-simple_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128s-simple_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192f-robust_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192f-robust_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192f-robust_x5 create mode 100644 
sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192f-simple_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192f-simple_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192f-simple_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192s-robust_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192s-robust_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192s-robust_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192s-simple_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192s-simple_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192s-simple_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256f-robust_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256f-robust_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256f-robust_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256f-simple_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256f-simple_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256f-simple_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256s-robust_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256s-robust_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256s-robust_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256s-simple_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256s-simple_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256s-simple_x5 
create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128f-robust_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128f-robust_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128f-robust_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128f-simple_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128f-simple_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128f-simple_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128s-robust_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128s-robust_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128s-robust_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128s-simple_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128s-simple_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128s-simple_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192f-robust_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192f-robust_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192f-robust_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192f-simple_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192f-simple_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192f-simple_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192s-robust_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192s-robust_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192s-robust_x5 
create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192s-simple_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192s-simple_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192s-simple_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256f-robust_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256f-robust_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256f-robust_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256f-simple_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256f-simple_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256f-simple_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256s-robust_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256s-robust_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256s-robust_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256s-simple_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256s-simple_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256s-simple_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128f-robust_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128f-robust_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128f-robust_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128f-simple_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128f-simple_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128f-simple_x5 
create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128s-robust_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128s-robust_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128s-robust_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128s-simple_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128s-simple_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128s-simple_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192f-robust_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192f-robust_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192f-robust_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192f-simple_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192f-simple_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192f-simple_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192s-robust_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192s-robust_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192s-robust_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192s-simple_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192s-simple_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192s-simple_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256f-robust_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256f-robust_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256f-robust_x5 
create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256f-simple_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256f-simple_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256f-simple_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256s-robust_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256s-robust_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256s-robust_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256s-simple_x3 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256s-simple_x4 create mode 100644 sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256s-simple_x5 create mode 100644 sphincsplus/sphincsplus-keccakxN/context.h create mode 100644 sphincsplus/sphincsplus-keccakxN/f1600x.h create mode 100644 sphincsplus/sphincsplus-keccakxN/fips202.c create mode 100644 sphincsplus/sphincsplus-keccakxN/fips202.h create mode 100644 sphincsplus/sphincsplus-keccakxN/fips202x.c create mode 100644 sphincsplus/sphincsplus-keccakxN/fips202x.h create mode 100644 sphincsplus/sphincsplus-keccakxN/fors.c create mode 100644 sphincsplus/sphincsplus-keccakxN/fors.h create mode 100644 sphincsplus/sphincsplus-keccakxN/hal_env.h create mode 100644 sphincsplus/sphincsplus-keccakxN/hash.h create mode 100644 sphincsplus/sphincsplus-keccakxN/hash_shake.c create mode 100644 sphincsplus/sphincsplus-keccakxN/hash_shakex.c create mode 100644 sphincsplus/sphincsplus-keccakxN/hashx.h create mode 120000 sphincsplus/sphincsplus-keccakxN/keccak_f1600 create mode 100644 sphincsplus/sphincsplus-keccakxN/keccak_f1600_dummy.s create mode 100644 sphincsplus/sphincsplus-keccakxN/macros.s create mode 100644 sphincsplus/sphincsplus-keccakxN/make_all.py create mode 100644 sphincsplus/sphincsplus-keccakxN/merkle.c create mode 
100644 sphincsplus/sphincsplus-keccakxN/merkle.h create mode 100644 sphincsplus/sphincsplus-keccakxN/params.h create mode 100644 sphincsplus/sphincsplus-keccakxN/params/params-sphincs-shake-128f.h create mode 100644 sphincsplus/sphincsplus-keccakxN/params/params-sphincs-shake-128s.h create mode 100644 sphincsplus/sphincsplus-keccakxN/params/params-sphincs-shake-192f.h create mode 100644 sphincsplus/sphincsplus-keccakxN/params/params-sphincs-shake-192s.h create mode 100644 sphincsplus/sphincsplus-keccakxN/params/params-sphincs-shake-256f.h create mode 100644 sphincsplus/sphincsplus-keccakxN/params/params-sphincs-shake-256s.h create mode 100644 sphincsplus/sphincsplus-keccakxN/randombytes.h create mode 100644 sphincsplus/sphincsplus-keccakxN/shake_offsets.h create mode 100644 sphincsplus/sphincsplus-keccakxN/sign.c create mode 100644 sphincsplus/sphincsplus-keccakxN/test/benchmark.c create mode 100644 sphincsplus/sphincsplus-keccakxN/test/cycles.c create mode 100644 sphincsplus/sphincsplus-keccakxN/test/cycles.h create mode 100644 sphincsplus/sphincsplus-keccakxN/test/fors.c create mode 100644 sphincsplus/sphincsplus-keccakxN/test/functest.c create mode 100644 sphincsplus/sphincsplus-keccakxN/test/randombytes.c create mode 100644 sphincsplus/sphincsplus-keccakxN/test/spx.c create mode 100644 sphincsplus/sphincsplus-keccakxN/thash.h create mode 100644 sphincsplus/sphincsplus-keccakxN/thash_shake_robust.c create mode 100644 sphincsplus/sphincsplus-keccakxN/thash_shake_robustx.c create mode 100644 sphincsplus/sphincsplus-keccakxN/thash_shake_simple.c create mode 100644 sphincsplus/sphincsplus-keccakxN/thash_shake_simplex.c create mode 100644 sphincsplus/sphincsplus-keccakxN/thashx.h create mode 100644 sphincsplus/sphincsplus-keccakxN/utils.c create mode 100644 sphincsplus/sphincsplus-keccakxN/utils.h create mode 100644 sphincsplus/sphincsplus-keccakxN/utilsx.c create mode 100644 sphincsplus/sphincsplus-keccakxN/utilsx.h create mode 100644 
sphincsplus/sphincsplus-keccakxN/wots.c create mode 100644 sphincsplus/sphincsplus-keccakxN/wots.h create mode 100644 sphincsplus/sphincsplus-keccakxN/wotsx.h create mode 100644 tests/common/misc.c create mode 100644 tests/common/poly.c create mode 100644 tests/helloworld/main.c create mode 100644 tests/helloworld/neon_test.s create mode 100644 tests/inc/hal.h create mode 100644 tests/inc/misc.h create mode 100644 tests/inc/poly.h create mode 100755 tests/keccak_neon/keccak_f1600_tests.c create mode 100755 tests/keccak_neon/keccak_f1600_tests.h create mode 100755 tests/keccak_neon/main.c create mode 100644 tests/keccak_neon/manual/keccak_f1600_variants.h create mode 100644 tests/keccak_neon/manual/keccak_f1600_x1_scalar_C.c create mode 100644 tests/keccak_neon/manual/keccak_f1600_x1_scalar_asm_v1.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x1_scalar_asm_v2.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x1_scalar_asm_v3.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x1_scalar_asm_v4.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x1_scalar_asm_v5.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x2_hybrid_asm_v1.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x2_hybrid_asm_v2p0.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x2_hybrid_asm_v2p1.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x2_hybrid_asm_v2p2.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x2_hybrid_asm_v2pp0.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x2_hybrid_asm_v2pp1.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x2_hybrid_asm_v2pp2.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v1.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v1p0.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2p0.s create mode 100644 
tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2p1.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2p2.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2p3.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2p4.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2p5.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2p6.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2pp0.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2pp1.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2pp2.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2pp3.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2pp4.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2pp5.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2pp6.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2pp7.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x3_hybrid_asm_v3p.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x3_hybrid_asm_v6.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x3_hybrid_asm_v7.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v1.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v2.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v2p0.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v3.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v3p.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v3pp.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v4.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v4p.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v5.s create mode 100644 
tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v5p.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v6.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v7.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v8.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x4_scalar_asm_v1.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x4_scalar_asm_v5.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x4_v84a_asm_v1p0.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x5_hybrid_asm_v8.s create mode 100644 tests/keccak_neon/manual/keccak_f1600_x5_hybrid_asm_v8p.s create mode 100644 tests/keccak_neon/manual/macros.s create mode 100644 tests/keccak_neon/manual/third_party/keccakx2_C.c create mode 100644 tests/keccak_neon/manual/third_party/keccakx2_bas.s create mode 100644 tests/keccak_neon/manual/third_party/keccakx2_cothan.c create mode 100644 tests/ntt_kyber/main.c create mode 100644 tests/ntt_kyber/manual/dummy create mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_0_0.s create mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_10_0.s create mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_11_0.s create mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_12_0.s create mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_13_0.s create mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_14_0.s create mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_15_0.s create mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_16_0.s create mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_17_0.s create mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_18_0.s create mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_1_0.s create mode 100644 
tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_2_0.s create mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_0.s create mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_0.s create mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_1.s create mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_2.s create mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_3.s create mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_4.s create mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_5.s create mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z4_0.s create mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z4_1.s create mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z4_2.s create mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z4_3.s create mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z4_4.s create mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_4_0.s create mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_5_0.s create mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_6_0.s create mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_7_0.s create mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_8_0.s create mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_9_0.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_0.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_1.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_2.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_3.s create mode 100644 
tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_4.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_5.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_0_0.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_0_z4_0.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_0_z4_16.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_10_z4_7.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_11_z4_7.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_12_z4_7.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_13_z4_7.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_14_z4_7.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_15_z4_7.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_16_z4_7.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_17_z4_7.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_18_z4_7.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_19_z4_7.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_20_z4_7.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_21_z4_7.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_10.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_11.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_12.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_13.s create mode 100644 
tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_14.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_15.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_7.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_8.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_9.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_24_z4_0.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_24_z4_16.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_0.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_1.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_2.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_3.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_4.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_5.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_0.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_1.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_10.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_2.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_3.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_4.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_5.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_6.s create mode 100644 
tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_7.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_8.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_9.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_8_z4_7.s create mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_9_z4_7.s create mode 100755 tests/ntt_neon/main.c create mode 100755 tests/ntt_neon/ntt.c create mode 100755 tests/ntt_neon/ntt.h create mode 100644 tests/ntt_sve2/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_0.s create mode 100644 tests/ntt_sve2/auto/ntt_u64_incomplete_72057594067788289_60277548896192635_var_3_3_0.s create mode 100644 tests/ntt_sve2/auto/ntt_u64_incomplete_72057594067788289_60277548896192635_var_3_3_1.s create mode 100644 tests/ntt_sve2/auto/ntt_u64_incomplete_72057594067788289_60277548896192635_var_3_3_2.s create mode 100755 tests/ntt_sve2/main.c create mode 100644 tests/ntt_sve2/manual/basemul_64_72057594067788289.s create mode 100755 tests/ntt_sve2/misc.c create mode 100755 tests/ntt_sve2/misc.h create mode 100755 tests/ntt_sve2/ntt.c create mode 100755 tests/ntt_sve2/ntt.h diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..15bc917 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +**/*.pyc +**/*.o +**/*~ + +envs/qemu_v8a/build diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..00af409 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "nelight"] + path = nelight + url = https://github.com/slothy-optimizer/slothy diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..8c93397 --- /dev/null +++ b/LICENSE @@ -0,0 +1,25 @@ +The MIT license, the text of which is below, applies to pqax in general. 
+pqax includes some third party libraries or modules that are licensed +differently; the corresponding subfolder contains the license that applies in +that case. + +Copyright (c) 2021-2022 Arm Limited +SPDX-License-Identifier: MIT + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..5ac9119 --- /dev/null +++ b/Makefile @@ -0,0 +1,443 @@ + +CODEGEN_DIR=asm + +MANUAL_SRCS_DIR=$(CODEGEN_DIR)/manual +MANUAL_SRCS_KECCAK_NEON_DIR=$(MANUAL_SRCS_DIR)/keccak_f1600 +MANUAL_SRCS_NTT_KYBER_DIR=$(MANUAL_SRCS_DIR)/ntt_kyber/ +MANUAL_SRCS_BASEMUL_S64_DIR=$(MANUAL_SRCS_DIR)/basemul_s64 + +AUTOGEN_SRCS_DIR=$(CODEGEN_DIR)/auto +AUTOGEN_SRCS_NTT_NEON_DIR=$(AUTOGEN_SRCS_DIR)/ntt_neon +AUTOGEN_SRCS_NTT_SVE2_DIR=$(AUTOGEN_SRCS_DIR)/ntt_sve2 + +AUTOGEN_SRCS_ALL=$(wildcard $(AUTOGEN_SRCS_DIR)/*.s) \ + $(wildcard $(AUTOGEN_SRCS_DIR)/*/*.s) \ + $(wildcard $(AUTOGEN_SRCS_DIR)/*/*/*.s) \ + $(wildcard $(AUTOGEN_SRCS_DIR)/*/*/*/*/*.s) + +MANUAL_SRCS_ALL=$(wildcard $(MANUAL_SRCS_DIR)/*.s) \ + $(wildcard $(MANUAL_SRCS_DIR)/*/*.s) \ + $(wildcard $(MANUAL_SRCS_DIR)/*/*/*.s) \ + $(wildcard $(MANUAL_SRCS_DIR)/*/*/*/*/*.s) + +AUTOGEN_SRCS_NTT_NEON_ALL=$(wildcard $(AUTOGEN_SRCS_NTT_NEON_DIR)/*.s) \ + $(wildcard $(AUTOGEN_SRCS_NTT_NEON_DIR)/*/*.s) \ + $(wildcard $(AUTOGEN_SRCS_NTT_NEON_DIR)/*/*/*.s) \ + $(wildcard $(AUTOGEN_SRCS_NTT_NEON_DIR)/*/*/*/*.s) + +AUTOGEN_SRCS_NTT_SVE2_ALL=$(wildcard $(AUTOGEN_SRCS_NTT_SVE2_DIR)/*.s) \ + $(wildcard $(AUTOGEN_SRCS_NTT_SVE2_DIR)/*/*.s) \ + $(wildcard $(AUTOGEN_SRCS_NTT_SVE2_DIR)/*/*/*.s) \ + $(wildcard $(AUTOGEN_SRCS_NTT_SVE2_DIR)/*/*/*/*.s) + +MANUAL_SRCS_NTT_SVE2_ALL=$(wildcard $(MANUAL_SRCS_BASEMUL_S64_DIR)/*.[sch]) + +MANUAL_SRCS_KECCAK_NEON_ALL=$(wildcard $(MANUAL_SRCS_KECCAK_NEON_DIR)/*.[sch]) \ + $(wildcard $(MANUAL_SRCS_KECCAK_NEON_DIR)/*/*.[sch]) \ + $(wildcard $(MANUAL_SRCS_KECCAK_NEON_DIR)/*/*/*.[sch]) \ + $(wildcard $(MANUAL_SRCS_KECCAK_NEON_DIR)/*/*/*/*.[sch]) + +MANUAL_SRCS_NTT_KYBER_ALL=$(wildcard $(MANUAL_SRCS_NTT_KYBER_DIR)/*.[sch]) \ + $(wildcard $(MANUAL_SRCS_NTT_KYBER_DIR)/*/*.[sch]) \ + $(wildcard $(MANUAL_SRCS_NTT_KYBER_DIR)/*/*/*.[sch]) \ + $(wildcard $(MANUAL_SRCS_NTT_KYBER_DIR)/*/*/*/*.[sch]) + +TEST_BASE_DIR=tests + +# Directory and sources for 
Helloworld dummy test +TEST_HELLOWORLD_DIR=$(TEST_BASE_DIR)/helloworld +TEST_HELLOWORLD_SOURCES_AUTO_DIR=$(TEST_HELLOWORLD_DIR)/auto +TEST_HELLOWORLD_SRC_C=$(wildcard $(TEST_HELLOWORLD_DIR)/*.c) \ + $(wildcard $(TEST_HELLOWORLD_DIR)/*/*.c) +TEST_HELLOWORLD_SRC_ALL=$(TEST_HELLOWORLD_SRC_C) + +# Directory and sources for Neon-NTT test +TEST_NTT_NEON_DIR=$(TEST_BASE_DIR)/ntt_neon +TEST_NTT_NEON_SOURCES_AUTO_DIR=$(TEST_NTT_NEON_DIR)/auto +TEST_NTT_NEON_SRC_C=$(wildcard $(TEST_NTT_NEON_DIR)/*.c) \ + $(wildcard $(TEST_NTT_NEON_DIR)/*/*.c) +TEST_NTT_NEON_SRC_AUTO=$(patsubst $(AUTOGEN_SRCS_NTT_NEON_DIR)/%.s, \ + $(TEST_NTT_NEON_SOURCES_AUTO_DIR)/%.s, \ + $(AUTOGEN_SRCS_NTT_NEON_ALL)) +TEST_NTT_NEON_SRC_ALL=$(TEST_NTT_NEON_SRC_C) $(TEST_NTT_NEON_SRC_AUTO) + +# Directory and sources for SVE2-NTT test +TEST_NTT_SVE2_DIR=$(TEST_BASE_DIR)/ntt_sve2 +TEST_NTT_SVE2_SOURCES_AUTO_DIR=$(TEST_NTT_SVE2_DIR)/auto +TEST_NTT_SVE2_SOURCES_MANUAL_DIR=$(TEST_NTT_SVE2_DIR)/manual +TEST_NTT_SVE2_SRC_C=$(wildcard $(TEST_NTT_SVE2_DIR)/*.c) \ + $(wildcard $(TEST_NTT_SVE2_DIR)/*/*.c) +TEST_NTT_SVE2_SRC_AUTO=$(patsubst $(AUTOGEN_SRCS_NTT_SVE2_DIR)/%.s, \ + $(TEST_NTT_SVE2_SOURCES_AUTO_DIR)/%.s, \ + $(AUTOGEN_SRCS_NTT_SVE2_ALL)) +TEST_NTT_SVE2_SRC_MANUAL=$(patsubst $(MANUAL_SRCS_BASEMUL_S64_DIR)/%.s, \ + $(TEST_NTT_SVE2_SOURCES_MANUAL_DIR)/%.s, \ + $(MANUAL_SRCS_NTT_SVE2_ALL)) +TEST_NTT_SVE2_SRC_ALL=$(TEST_NTT_SVE2_SRC_C) $(TEST_NTT_SVE2_SRC_AUTO) $(TEST_NTT_SVE2_SRC_MANUAL) + +# Directory and sources for KECCAK test +TEST_KECCAK_NEON_DIR=$(TEST_BASE_DIR)/keccak_neon +TEST_KECCAK_NEON_SRC_C=$(wildcard $(TEST_KECCAK_NEON_DIR)/*.c) \ + $(wildcard $(TEST_KECCAK_NEON_DIR)/*/*.c) +TEST_KECCAK_NEON_SOURCES_MANUAL_DIR=$(TEST_KECCAK_NEON_DIR)/manual +TEST_KECCAK_NEON_SRC_MANUAL__=$(patsubst $(MANUAL_SRCS_KECCAK_NEON_DIR)/%.s, \ + $(TEST_KECCAK_NEON_SOURCES_MANUAL_DIR)/%.s, \ + $(MANUAL_SRCS_KECCAK_NEON_ALL)) +TEST_KECCAK_NEON_SRC_MANUAL_=$(patsubst $(MANUAL_SRCS_KECCAK_NEON_DIR)/%.c, \ + 
$(TEST_KECCAK_NEON_SOURCES_MANUAL_DIR)/%.c, \ + $(TEST_KECCAK_NEON_SRC_MANUAL__)) +TEST_KECCAK_NEON_SRC_MANUAL=$(patsubst $(MANUAL_SRCS_KECCAK_NEON_DIR)/%.h, \ + $(TEST_KECCAK_NEON_SOURCES_MANUAL_DIR)/%.h, \ + $(TEST_KECCAK_NEON_SRC_MANUAL_)) + +TEST_KECCAK_NEON_SRC_ALL=$(TEST_KECCAK_NEON_SRC_C) $(TEST_KECCAK_NEON_SRC_MANUAL) + +# Directory and sources for Kyber NTT test +TEST_NTT_KYBER_DIR=$(TEST_BASE_DIR)/ntt_kyber +TEST_NTT_KYBER_SRC_C=$(wildcard $(TEST_NTT_KYBER_DIR)/*.c) \ + $(wildcard $(TEST_NTT_KYBER_DIR)/*/*.c) +TEST_NTT_KYBER_SOURCES_MANUAL_DIR=$(TEST_NTT_KYBER_DIR)/manual +TEST_NTT_KYBER_SRC_MANUAL__=$(patsubst $(MANUAL_SRCS_NTT_KYBER_DIR)/%.s, \ + $(TEST_NTT_KYBER_SOURCES_MANUAL_DIR)/%.s, \ + $(MANUAL_SRCS_NTT_KYBER_ALL)) +TEST_NTT_KYBER_SRC_MANUAL_=$(patsubst $(MANUAL_SRCS_NTT_KYBER_DIR)/%.c, \ + $(TEST_NTT_KYBER_SOURCES_MANUAL_DIR)/%.c, \ + $(TEST_NTT_KYBER_SRC_MANUAL__)) +TEST_NTT_KYBER_SRC_MANUAL=$(patsubst $(MANUAL_SRCS_NTT_KYBER_DIR)/%.h, \ + $(TEST_NTT_KYBER_SOURCES_MANUAL_DIR)/%.h, \ + $(TEST_NTT_KYBER_SRC_MANUAL_)) + +TEST_NTT_KYBER_SRC_ALL=$(TEST_NTT_KYBER_SRC_C) $(TEST_NTT_KYBER_SRC_MANUAL) + + +# All sources +TEST_SRC_AUTO_ALL= $(TEST_NTT_NEON_SRC_AUTO) $(TEST_KECCAK_NEON_SRC_MANUAL) $(TEST_NTT_KYBER_SRC_MANUAL) + +# +# Test environments +# + +TEST_ENVS_BASE_DIR=envs + +# Cross-compilation test environment (e.g. for simulation in QEMU) +TEST_ENV_CROSS_BASE=$(TEST_ENVS_BASE_DIR)/cross +TEST_ENV_CROSS_SRC=$(TEST_ENV_CROSS_BASE)/src +TEST_ENV_CROSS_SYMLINK=$(TEST_ENV_CROSS_SRC)/test_src + +# Native test environment for mac +TEST_ENV_NATIVE_MAC_BASE=$(TEST_ENVS_BASE_DIR)/native_mac +TEST_ENV_NATIVE_MAC_SRC=$(TEST_ENV_NATIVE_MAC_BASE)/src +TEST_ENV_NATIVE_MAC_SYMLINK=$(TEST_ENV_NATIVE_MAC_SRC)/test_src + +# Native test environment for linux +TEST_ENV_NATIVE_LINUX_BASE=$(TEST_ENVS_BASE_DIR)/native_linux +TEST_ENV_NATIVE_LINUX_SRC=$(TEST_ENV_NATIVE_LINUX_BASE)/src +TEST_ENV_NATIVE_LINUX_SYMLINK=$(TEST_ENV_NATIVE_LINUX_SRC)/test_src + +# Code generation files. NOTE(review): the last pattern below has five path components while the one before it has three, so .py files exactly four components below CODEGEN_DIR are not matched (AUTOGEN_SRCS_ALL and MANUAL_SRCS_ALL have the same gap for .s files) -- confirm this is intended. +PYTHON_SRCS=$(wildcard 
$(CODEGEN_DIR)/*.py) \ + $(wildcard $(CODEGEN_DIR)/*/*.py) \ + $(wildcard $(CODEGEN_DIR)/*/*/*.py) \ + $(wildcard $(CODEGEN_DIR)/*/*/*/*/*.py) + +.PHONY: all +all: codegen $(TEST_SRC_AUTO_ALL) + +.PHONY: clean +clean: + make clean -C $(TEST_ENV_CROSS_BASE) + make clean -C $(TEST_ENV_NATIVE_MAC_BASE) + make clean -C $(TEST_ENV_NATIVE_LINUX_BASE) + rm -f $(TEST_SRC_AUTO_ALL) + rm -f $(TEST_ENV_CROSS_SYMLINK) + rm -f $(TEST_ENV_NATIVE_MAC_SYMLINK) + rm -f $(TEST_ENV_NATIVE_LINUX_SYMLINK) + rm -f $(TEST_ENV_CROSS_BASE)/test_loaded_* + rm -f $(TEST_ENV_NATIVE_MAC_BASE)/test_loaded_* + rm -f $(TEST_ENV_NATIVE_LINUX_BASE)/test_loaded_* + +.PHONY: cleanasm +cleanasm: + make clean -C $(CODEGEN_DIR) + +.PHONY: cleanall +cleanall: clean cleanasm + +$(AUTOGEN_SRCS_ALL): $(PYTHON_SRCS) + make -C $(CODEGEN_DIR) + +$(TEST_NTT_NEON_SRC_AUTO): $(TEST_NTT_NEON_SOURCES_AUTO_DIR)/%.s: $(AUTOGEN_SRCS_NTT_NEON_DIR)/%.s + mkdir -p $(@D) + cp $< $@ + +$(TEST_NTT_SVE2_SRC_AUTO): $(TEST_NTT_SVE2_SOURCES_AUTO_DIR)/%.s: $(AUTOGEN_SRCS_NTT_SVE2_DIR)/%.s + mkdir -p $(@D) + cp $< $@ +$(info XXX: $(TEST_NTT_SVE2_SRC_MANUAL)) +$(info YYY: $(TEST_NTT_SVE2_SRC_MANUAL)) +$(TEST_NTT_SVE2_SRC_MANUAL): $(TEST_NTT_SVE2_SOURCES_MANUAL_DIR)/%.s: $(MANUAL_SRCS_BASEMUL_S64_DIR)/%.s + mkdir -p $(@D) + cp $< $@ + +$(TEST_KECCAK_NEON_SRC_MANUAL): $(TEST_KECCAK_NEON_SOURCES_MANUAL_DIR)/%.c: $(MANUAL_SRCS_KECCAK_NEON_DIR)/%.c + mkdir -p $(@D) + cp $< $@ +$(TEST_KECCAK_NEON_SRC_MANUAL): $(TEST_KECCAK_NEON_SOURCES_MANUAL_DIR)/%.s: $(MANUAL_SRCS_KECCAK_NEON_DIR)/%.s + mkdir -p $(@D) + cp $< $@ +$(TEST_KECCAK_NEON_SRC_MANUAL): $(TEST_KECCAK_NEON_SOURCES_MANUAL_DIR)/%.h: $(MANUAL_SRCS_KECCAK_NEON_DIR)/%.h + mkdir -p $(@D) + cp $< $@ + +$(TEST_NTT_KYBER_SRC_MANUAL): $(TEST_NTT_KYBER_SOURCES_MANUAL_DIR)/%.c: $(MANUAL_SRCS_NTT_KYBER_DIR)/%.c + mkdir -p $(@D) + cp $< $@ +$(TEST_NTT_KYBER_SRC_MANUAL): $(TEST_NTT_KYBER_SOURCES_MANUAL_DIR)/%.s: $(MANUAL_SRCS_NTT_KYBER_DIR)/%.s + mkdir -p $(@D) + cp $< $@ 
+$(TEST_NTT_KYBER_SRC_MANUAL): $(TEST_NTT_KYBER_SOURCES_MANUAL_DIR)/%.h: $(MANUAL_SRCS_NTT_KYBER_DIR)/%.h + mkdir -p $(@D) + cp $< $@ + +.PHONY: codegen +codegen: + make codegen -C $(CODEGEN_DIR) + +# HelloWorld (template) test on CROSS + +TEST_ENV_CROSS_LINK_HELLOWORLD = $(TEST_ENV_CROSS_BASE)/test_loaded_helloworld +$(TEST_ENV_CROSS_LINK_HELLOWORLD): + rm -f $(TEST_ENV_CROSS_SYMLINK) + ln -s ../../../$(TEST_HELLOWORLD_DIR) $(TEST_ENV_CROSS_SYMLINK) + rm -f $(TEST_ENV_CROSS_BASE)/test_loaded_* + make -C $(TEST_ENV_CROSS_BASE) clean + touch $@ + +.PHONY: build-cross-helloworld +build-cross-helloworld: $(TEST_ENV_CROSS_LINK_HELLOWORLD) + make -C $(TEST_ENV_CROSS_BASE) + +.PHONY: run-cross-helloworld +run-cross-helloworld: $(TEST_ENV_CROSS_LINK_HELLOWORLD) + make run -C $(TEST_ENV_CROSS_BASE) + +# Neon NTT test on CROSS + +TEST_ENV_CROSS_LINK_NTT_NEON = $(TEST_ENV_CROSS_BASE)/test_loaded_ntt_neon +$(TEST_ENV_CROSS_LINK_NTT_NEON): $(TEST_NTT_NEON_SRC_AUTO) + rm -f $(TEST_ENV_CROSS_SYMLINK) + ln -s ../../../$(TEST_NTT_NEON_DIR) $(TEST_ENV_CROSS_SYMLINK) + rm -f $(TEST_ENV_CROSS_BASE)/test_loaded_* + make -C $(TEST_ENV_CROSS_BASE) clean + touch $@ + +.PHONY: build-cross-ntt_neon +build-cross-ntt_neon: $(TEST_ENV_CROSS_LINK_NTT_NEON) + make -C $(TEST_ENV_CROSS_BASE) + +.PHONY: run-cross-ntt_neon +run-cross-ntt_neon: $(TEST_ENV_CROSS_LINK_NTT_NEON) + make run -C $(TEST_ENV_CROSS_BASE) + +.PHONY: debug-cross-ntt_neon +debug-cross-ntt_neon: $(TEST_ENV_CROSS_LINK_NTT_NEON) + make debug -C $(TEST_ENV_CROSS_BASE) + +# Keccak on CROSS + +TEST_ENV_CROSS_LINK_KECCAK_NEON = $(TEST_ENV_CROSS_BASE)/test_loaded_keccak_neon +$(TEST_ENV_CROSS_LINK_KECCAK_NEON): $(TEST_KECCAK_NEON_SRC_MANUAL) + rm -f $(TEST_ENV_CROSS_SYMLINK) + ln -s ../../../$(TEST_KECCAK_NEON_DIR) $(TEST_ENV_CROSS_SYMLINK) + rm -f $(TEST_ENV_CROSS_BASE)/test_loaded_* + make -C $(TEST_ENV_CROSS_BASE) clean + touch $@ + +.PHONY: build-cross-keccak_neon +build-cross-keccak_neon: $(TEST_ENV_CROSS_LINK_KECCAK_NEON) + make -C 
$(TEST_ENV_CROSS_BASE) + +.PHONY: run-cross-keccak_neon +run-cross-keccak_neon: $(TEST_ENV_CROSS_LINK_KECCAK_NEON) + make run -C $(TEST_ENV_CROSS_BASE) + +# Kyber NTT on CROSS + +TEST_ENV_CROSS_LINK_NTT_KYBER = $(TEST_ENV_CROSS_BASE)/test_loaded_ntt_kyber +$(TEST_ENV_CROSS_LINK_NTT_KYBER): $(TEST_NTT_KYBER_SRC_MANUAL) + rm -f $(TEST_ENV_CROSS_SYMLINK) + ln -s ../../../$(TEST_NTT_KYBER_DIR) $(TEST_ENV_CROSS_SYMLINK) + rm -f $(TEST_ENV_CROSS_BASE)/test_loaded_* + make -C $(TEST_ENV_CROSS_BASE) clean + touch $@ + +.PHONY: build-cross-ntt_kyber +build-cross-ntt_kyber: $(TEST_ENV_CROSS_LINK_NTT_KYBER) + make -C $(TEST_ENV_CROSS_BASE) + +.PHONY: run-cross-ntt_kyber +run-cross-ntt_kyber: $(TEST_ENV_CROSS_LINK_NTT_KYBER) + make run -C $(TEST_ENV_CROSS_BASE) + +# NTT-SVE2 test on CROSS + +TEST_ENV_CROSS_LINK_NTT_SVE2 = $(TEST_ENV_CROSS_BASE)/test_loaded_ntt_sve2 +$(TEST_ENV_CROSS_LINK_NTT_SVE2): $(TEST_NTT_SVE2_SRC_AUTO) $(TEST_NTT_SVE2_SRC_MANUAL) + rm -f $(TEST_ENV_CROSS_SYMLINK) + ln -s ../../../$(TEST_NTT_SVE2_DIR) $(TEST_ENV_CROSS_SYMLINK) + rm -f $(TEST_ENV_CROSS_BASE)/test_loaded_* + make -C $(TEST_ENV_CROSS_BASE) clean + touch $@ + +.PHONY: build-cross-ntt_sve2 +build-cross-ntt_sve2: $(TEST_ENV_CROSS_LINK_NTT_SVE2) + PLATFORM=v84a make -C $(TEST_ENV_CROSS_BASE) + +.PHONY: run-cross-ntt_sve2 +run-cross-ntt_sve2: $(TEST_ENV_CROSS_LINK_NTT_SVE2) + make run -C $(TEST_ENV_CROSS_BASE) + +.PHONY: debug-cross-ntt_sve2 +debug-cross-ntt_sve2: $(TEST_ENV_CROSS_LINK_NTT_SVE2) + make debug -C $(TEST_ENV_CROSS_BASE) + +# HelloWorld native -- NOTE(review): TEST_ENV_NATIVE_BASE and TEST_ENV_NATIVE_SYMLINK are not assigned anywhere in this Makefile (only the NATIVE_MAC and NATIVE_LINUX variants are), so unless they are supplied from outside they expand to empty strings in the rules below; confirm which environment these targets should use. + +TEST_ENV_NATIVE_LINK_HELLOWORLD = $(TEST_ENV_NATIVE_BASE)/test_loaded_helloworld +$(TEST_ENV_NATIVE_LINK_HELLOWORLD): + rm -f $(TEST_ENV_NATIVE_SYMLINK) + ln -s ../../../$(TEST_HELLOWORLD_DIR) $(TEST_ENV_NATIVE_SYMLINK) + rm -f $(TEST_ENV_NATIVE_BASE)/test_loaded_* + make -C $(TEST_ENV_NATIVE_BASE) clean + touch $@ + +.PHONY: build-native-helloworld +build-native-helloworld: $(TEST_ENV_NATIVE_LINK_HELLOWORLD) + make -C $(TEST_ENV_NATIVE_BASE) 
+ +.PHONY: run-native-helloworld +run-native-helloworld: $(TEST_ENV_NATIVE_LINK_HELLOWORLD) + make run -C $(TEST_ENV_NATIVE_BASE) + +# Keccak native_mac +TEST_ENV_NATIVE_MAC_LINK_KECCAK_NEON = $(TEST_ENV_NATIVE_MAC_BASE)/test_loaded_keccak_neon +$(TEST_ENV_NATIVE_MAC_LINK_KECCAK_NEON): $(TEST_KECCAK_NEON_SRC_MANUAL) + rm -f $(TEST_ENV_NATIVE_MAC_SYMLINK) + ln -s ../../../$(TEST_KECCAK_NEON_DIR) $(TEST_ENV_NATIVE_MAC_SYMLINK) + rm -f $(TEST_ENV_NATIVE_MAC_BASE)/test_loaded_* + make -C $(TEST_ENV_NATIVE_MAC_BASE) clean + touch $@ + +.PHONY: build-native_mac-keccak_neon +build-native_mac-keccak_neon: $(TEST_ENV_NATIVE_MAC_LINK_KECCAK_NEON) + make -C $(TEST_ENV_NATIVE_MAC_BASE) + +.PHONY: run-native_mac-keccak_neon +run-native_mac-keccak_neon: $(TEST_ENV_NATIVE_MAC_LINK_KECCAK_NEON) + make run -C $(TEST_ENV_NATIVE_MAC_BASE) + +# Kyber NTT native_mac +TEST_ENV_NATIVE_MAC_LINK_NTT_KYBER = $(TEST_ENV_NATIVE_MAC_BASE)/test_loaded_ntt_kyber +$(TEST_ENV_NATIVE_MAC_LINK_NTT_KYBER): $(TEST_NTT_KYBER_SRC_MANUAL) + rm -f $(TEST_ENV_NATIVE_MAC_SYMLINK) + ln -s ../../../$(TEST_NTT_KYBER_DIR) $(TEST_ENV_NATIVE_MAC_SYMLINK) + rm -f $(TEST_ENV_NATIVE_MAC_BASE)/test_loaded_* + make -C $(TEST_ENV_NATIVE_MAC_BASE) clean + touch $@ + +.PHONY: build-native_mac-ntt_kyber +build-native_mac-ntt_kyber: $(TEST_ENV_NATIVE_MAC_LINK_NTT_KYBER) + make -C $(TEST_ENV_NATIVE_MAC_BASE) + +.PHONY: run-native_mac-ntt_kyber +run-native_mac-ntt_kyber: $(TEST_ENV_NATIVE_MAC_LINK_NTT_KYBER) + make run -C $(TEST_ENV_NATIVE_MAC_BASE) + +# NTT Neon native_mac + +TEST_ENV_NATIVE_MAC_LINK_NTT_NEON = $(TEST_ENV_NATIVE_MAC_BASE)/test_loaded_ntt_neon +$(TEST_ENV_NATIVE_MAC_LINK_NTT_NEON): $(TEST_NTT_NEON_SRC_AUTO) + rm -f $(TEST_ENV_NATIVE_MAC_SYMLINK) + ln -s ../../../$(TEST_NTT_NEON_DIR) $(TEST_ENV_NATIVE_MAC_SYMLINK) + rm -f $(TEST_ENV_NATIVE_MAC_BASE)/test_loaded_* + make -C $(TEST_ENV_NATIVE_MAC_BASE) clean + touch $@ + +.PHONY: build-native_mac-ntt_neon +build-native_mac-ntt_neon: 
$(TEST_ENV_NATIVE_MAC_LINK_NTT_NEON) + make -C $(TEST_ENV_NATIVE_MAC_BASE) + +.PHONY: run-native_mac-ntt_neon +run-native_mac-ntt_neon: $(TEST_ENV_NATIVE_MAC_LINK_NTT_NEON) + make run -C $(TEST_ENV_NATIVE_MAC_BASE) + + +# HelloWorld native_linux + +TEST_ENV_NATIVE_LINUX_LINK_HELLOWORLD = $(TEST_ENV_NATIVE_LINUX_BASE)/test_loaded_helloworld +$(TEST_ENV_NATIVE_LINUX_LINK_HELLOWORLD): + rm -f $(TEST_ENV_NATIVE_LINUX_SYMLINK) + ln -s ../../../$(TEST_HELLOWORLD_DIR) $(TEST_ENV_NATIVE_LINUX_SYMLINK) + rm -f $(TEST_ENV_NATIVE_LINUX_BASE)/test_loaded_* + make -C $(TEST_ENV_NATIVE_LINUX_BASE) clean + touch $@ + +.PHONY: build-native_linux-helloworld +build-native_linux-helloworld: $(TEST_ENV_NATIVE_LINUX_LINK_HELLOWORLD) + make -C $(TEST_ENV_NATIVE_LINUX_BASE) + +.PHONY: run-native_linux-helloworld +run-native_linux-helloworld: $(TEST_ENV_NATIVE_LINUX_LINK_HELLOWORLD) + make run -C $(TEST_ENV_NATIVE_LINUX_BASE) + +# Keccak native_linux +TEST_ENV_NATIVE_LINUX_LINK_KECCAK_NEON = $(TEST_ENV_NATIVE_LINUX_BASE)/test_loaded_keccak_neon +$(TEST_ENV_NATIVE_LINUX_LINK_KECCAK_NEON): $(TEST_KECCAK_NEON_SRC_MANUAL) + rm -f $(TEST_ENV_NATIVE_LINUX_SYMLINK) + ln -s ../../../$(TEST_KECCAK_NEON_DIR) $(TEST_ENV_NATIVE_LINUX_SYMLINK) + rm -f $(TEST_ENV_NATIVE_LINUX_BASE)/test_loaded_* + make -C $(TEST_ENV_NATIVE_LINUX_BASE) clean + touch $@ + +.PHONY: build-native_linux-keccak_neon +build-native_linux-keccak_neon: $(TEST_ENV_NATIVE_LINUX_LINK_KECCAK_NEON) + make -C $(TEST_ENV_NATIVE_LINUX_BASE) + +.PHONY: run-native_linux-keccak_neon +run-native_linux-keccak_neon: $(TEST_ENV_NATIVE_LINUX_LINK_KECCAK_NEON) + make run -C $(TEST_ENV_NATIVE_LINUX_BASE) + +# Kyber NTT native_linux +TEST_ENV_NATIVE_LINUX_LINK_NTT_KYBER = $(TEST_ENV_NATIVE_LINUX_BASE)/test_loaded_ntt_kyber +$(TEST_ENV_NATIVE_LINUX_LINK_NTT_KYBER): $(TEST_NTT_KYBER_SRC_MANUAL) + rm -f $(TEST_ENV_NATIVE_LINUX_SYMLINK) + ln -s ../../../$(TEST_NTT_KYBER_DIR) $(TEST_ENV_NATIVE_LINUX_SYMLINK) + rm -f 
$(TEST_ENV_NATIVE_LINUX_BASE)/test_loaded_* + make -C $(TEST_ENV_NATIVE_LINUX_BASE) clean + touch $@ + +.PHONY: build-native_linux-ntt_kyber +build-native_linux-ntt_kyber: $(TEST_ENV_NATIVE_LINUX_LINK_NTT_KYBER) + make -C $(TEST_ENV_NATIVE_LINUX_BASE) + +.PHONY: run-native_linux-ntt_kyber +run-native_linux-ntt_kyber: $(TEST_ENV_NATIVE_LINUX_LINK_NTT_KYBER) + make run -C $(TEST_ENV_NATIVE_LINUX_BASE) + +# NTT Neon native_linux + +TEST_ENV_NATIVE_LINUX_LINK_NTT_NEON = $(TEST_ENV_NATIVE_LINUX_BASE)/test_loaded_ntt_neon +$(TEST_ENV_NATIVE_LINUX_LINK_NTT_NEON): $(TEST_NTT_NEON_SRC_AUTO) + rm -f $(TEST_ENV_NATIVE_LINUX_SYMLINK) + ln -s ../../../$(TEST_NTT_NEON_DIR) $(TEST_ENV_NATIVE_LINUX_SYMLINK) + rm -f $(TEST_ENV_NATIVE_LINUX_BASE)/test_loaded_* + make -C $(TEST_ENV_NATIVE_LINUX_BASE) clean + touch $@ + +.PHONY: build-native_linux-ntt_neon +build-native_linux-ntt_neon: $(TEST_ENV_NATIVE_LINUX_LINK_NTT_NEON) + make -C $(TEST_ENV_NATIVE_LINUX_BASE) + +.PHONY: run-native_linux-ntt_neon +run-native_linux-ntt_neon: $(TEST_ENV_NATIVE_LINUX_LINK_NTT_NEON) + make run -C $(TEST_ENV_NATIVE_LINUX_BASE) diff --git a/README.md b/README.md new file mode 100644 index 0000000..6f09c67 --- /dev/null +++ b/README.md @@ -0,0 +1,70 @@ +# pqax: Post-Quantum Cryptography on A-profile Arm CPUs + +## Overview + +### Scope + +This repository provides implementations of known Post-Quantum Cryptography (PQC) on A-profile Arm CPUs. + +## Structure + +The main components of the repository are the following: +* [`asm`](asm): Core primitives in optimized assembly, auto-generated or handwritten. +* [`tests`](tests): C-based tests for core primitives using a minimal hardware abstraction layer (HAL). +* [`envs`](envs): Test environments implementing the HAL. +* [`sphincsplus`](sphincsplus): Supporting material for the paper "Hybrid scalar/vector implementations of Keccak on AArch64" + +The following sections explain each component in greater detail. 
+ +### Optimized assembly routines + +At the heart of the repository are optimized assembly routines for core components of the post-quantum primitives under +consideration, such as the NTT or Keccak. All optimized assembly is contained in the [`asm`](asm) directory, which is structured +as follows: + +* [`asm/manual`](asm/manual) contains assembly that has been written by hand. +* [`asm/scripts`](asm/scripts) contains code generation scripts for various + algorithms around polynomial multiplication or the PQC schemes they're relevant for, as well as other tests and + examples. +* [`asm/auto/`](asm/auto/) contains the assembly auto-generated by the examples in + [`asm/scripts`](asm/scripts/). Its structure mirrors that of [`asm/scripts`](asm/scripts/). + +### Tests + +The core assembly routines are accompanied by C test programs contained in [`tests/`](tests/). For example, the +Keccak implementations from [`asm/manual/keccak_f1600`](asm/manual/keccak_f1600/) are tested in +[`tests/keccak_neon/`](tests/keccak_neon/). +The test files are platform-independent and rely only on a small hardware abstraction layer +[`tests/inc/hal.h`](tests/inc/hal.h) which declares stubs for debugging, measuring, and random sources. + +### Test environments + +As mentioned above, the tests from [`tests/`](tests/) can be run in any environment defining the hardware abstraction layer +interface [`tests/inc/hal.h`](tests/inc/hal.h). + +The supported test environments are located in [`envs`](envs/). There are currently three test environments: `native_mac` and `native_linux` for +builds on an Arm host running macOS or Linux, respectively, and `cross` for cross-compilation. For the `cross` test environment, you can specify the +environment variable `CYCLES` as one of `NO, PMU, PERF` to indicate the source of cycle counts, and `PLATFORM` as one of +`v8a` or `v84a` to control the compilation target. + +### SPHINCS+ + +See [sphincsplus](sphincsplus) for more details. + +## License + +pqax is licensed under the MIT license. See [LICENSE](LICENSE) for details. 
+ +pqax contains some third-party sources, some of which are licensed differently: +* [sphincsplus/sphincsplusx2](sphincsplus/sphincsplusx2): CC0 1.0 Universal Public Domain + Dedication +* [asm/manual/keccak_f1600/third_party/keccakx2_cothan.c](asm/manual/keccak_f1600/third_party/keccakx2_cothan.c): Apache 2.0 +* [asm/manual/keccak_f1600/third_party/keccakx2_bas.s](asm/manual/keccak_f1600/third_party/keccakx2_bas.c): MIT +* [asm/manual/keccak_f1600/third_party/keccakx2_C.s](asm/manual/keccak_f1600/third_party/keccakx2_bas.c): CC0 1.0 Universal Public Domain + Dedication + +## Usage + +To build/run tests, use `make build-{cross,native}-{testname}`, where `testname` is one of the tests in +[tests](tests). For example, to cross-compile the Keccak tests for simulation in QEMU, use `CYCLES=NO make +build-cross-keccak_neon`. diff --git a/asm/Makefile b/asm/Makefile new file mode 100644 index 0000000..51ef5a8 --- /dev/null +++ b/asm/Makefile @@ -0,0 +1,483 @@ +.PHONY: all clean codegen + +# All code generation sources +PYTH_SRCS := $(wildcard *.py) $(wildcard */*.py) $(wildcard */*/*.py) $(wildcard */*/*/*.py) + +AUTOGEN_DIR = auto + +PY_CODEGEN_NTT_NEON_TEST = ./scripts/ntt_neon/ntt_neon.py +PY_CODEGEN_NTT_FULL_NEON_TEST = ./scripts/ntt_neon/ntt_neon_full.py + +PY_CODEGEN_NTT_SVE2_TEST = ./scripts/ntt_sve2/ntt_sve2.py +PY_CODEGEN_NTT_FULL_SVE2_TEST = ./scripts/ntt_sve2/ntt_sve2_full.py + +# Autogenerated files for Neon-NTT +AUTOGEN_NTT_NEON_DIR = $(AUTOGEN_DIR)/ntt_neon +NTT_NEON_SRCS := $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_full_33556993_28678040_var_4_4_0_0.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_full_33556993_28678040_var_4_4_1_0.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_full_33556993_28678040_var_4_4_2_0.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_full_33556993_28678040_var_4_4_3_0.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_full_33556993_28678040_var_4_4_4_0.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_full_33556993_28678040_var_4_4_5_0.s \ + 
$(AUTOGEN_NTT_NEON_DIR)/ntt_u32_full_33556993_28678040_var_4_4_6_0.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_full_33556993_28678040_var_4_4_7_0.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_full_33556993_28678040_var_4_4_8_0.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_full_33556993_28678040_var_4_4_9_0.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_full_33556993_28678040_var_4_4_10_0.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_full_33556993_28678040_var_4_4_11_0.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_full_33556993_28678040_var_4_4_12_0.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_full_33556993_28678040_var_4_4_13_0.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_full_33556993_28678040_var_4_4_14_0.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_full_33556993_28678040_var_4_4_15_0.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_full_33556993_28678040_var_4_4_16_0.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_full_33556993_28678040_var_4_4_17_0.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_full_33556993_28678040_var_4_4_18_0.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_full_33556993_28678040_var_4_4_3_z2_0.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_full_33556993_28678040_var_4_4_3_z2_1.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_full_33556993_28678040_var_4_4_3_z2_2.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_full_33556993_28678040_var_4_4_3_z2_3.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_full_33556993_28678040_var_4_4_3_z2_4.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_full_33556993_28678040_var_4_4_3_z2_5.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_full_33556993_28678040_var_4_4_3_z4_0.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_full_33556993_28678040_var_4_4_3_z4_1.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_full_33556993_28678040_var_4_4_3_z4_2.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_full_33556993_28678040_var_4_4_3_z4_3.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_full_33556993_28678040_var_4_4_3_z4_4.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_3_3_0.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_3_3_1.s \ + 
$(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_3_3_2.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_3_3_3.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_3_3_4.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_3_3_5.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_0_0.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_0_z4_16.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_24_z4_0.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_24_z4_16.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_0_z4_0.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_0.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_1.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_2.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_3.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_4.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_5.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_0.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_1.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_2.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_3.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_4.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_5.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_6.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_7.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_8.s \ + 
$(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_9.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_10.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_8_z4_7.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_9_z4_7.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_10_z4_7.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_11_z4_7.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_12_z4_7.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_13_z4_7.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_14_z4_7.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_15_z4_7.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_16_z4_7.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_17_z4_7.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_18_z4_7.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_19_z4_7.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_20_z4_7.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_21_z4_7.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_7.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_8.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_9.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_10.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_11.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_12.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_13.s \ + $(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_14.s \ + 
$(AUTOGEN_NTT_NEON_DIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_15.s \ + +# Autogenerated files for Sve2-NTT +AUTOGEN_NTT_SVE2_DIR = $(AUTOGEN_DIR)/ntt_sve2 +NTT_SVE2_SRCS := $(AUTOGEN_NTT_SVE2_DIR)/ntt_u32_incomplete_33556993_28678040_var_3_3_0.s \ + $(AUTOGEN_NTT_SVE2_DIR)/ntt_u32_incomplete_33556993_28678040_var_3_3_1.s \ + $(AUTOGEN_NTT_SVE2_DIR)/ntt_u32_incomplete_33556993_28678040_var_3_3_2.s \ + $(AUTOGEN_NTT_SVE2_DIR)/ntt_u64_incomplete_72057594067788289_60277548896192635_var_3_3_0.s \ + $(AUTOGEN_NTT_SVE2_DIR)/ntt_u64_incomplete_72057594067788289_60277548896192635_var_3_3_1.s \ + $(AUTOGEN_NTT_SVE2_DIR)/ntt_u64_incomplete_72057594067788289_60277548896192635_var_3_3_2.s \ + +# User source files. +AUTOGEN_SRCS := $(NTT_NEON_SRCS) $(NTT_SVE2_SRCS) + +CODEGEN_SOURCES = $(wildcard ./*.py) $(wildcard ./*/*.py) $(wildcard ./*/*/*.py) $(wildcard ./*/*/*/*.py) $(wildcard ./*/*/*/*/*.py) + +.PHONY: codegen +codegen: $(AUTOGEN_SRCS) $(CODEGEN_SOURCES) + +################################################################################## +## NEON ## +################################################################################## + +# Neon NTT, incomplete + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_3_3_0.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 3,3 --schedule 0,0 33556993 28678040 ntt_u32_incomplete_neon_asm_var_3_3_0 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_3_3_1.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 3,3 --schedule 1,1 33556993 28678040 ntt_u32_incomplete_neon_asm_var_3_3_1 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_3_3_2.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 3,3 --schedule 2,2 33556993 28678040 ntt_u32_incomplete_neon_asm_var_3_3_2 + 
+$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_3_3_3.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 3,3 --schedule 3,3 33556993 28678040 ntt_u32_incomplete_neon_asm_var_3_3_3 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_3_3_4.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 3,3 --schedule 4,4 33556993 28678040 ntt_u32_incomplete_neon_asm_var_3_3_4 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_3_3_5.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 3,3 --schedule 5,5 33556993 28678040 ntt_u32_incomplete_neon_asm_var_3_3_5 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_0_0.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 0,0 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_0_0 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_0_z4_0.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 0,z4_0 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_0_z4_0 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_0.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 3,z4_0 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_3_z4_0 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_1.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 3,z4_1 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_3_z4_1 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_2.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 3,z4_2 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_3_z4_2 + 
+$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_3.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 3,z4_3 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_3_z4_3 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_4.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 3,z4_4 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_3_z4_4 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_5.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 3,z4_5 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_3_z4_5 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_0.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 7,z4_0 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_7_z4_0 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_1.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 7,z4_1 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_7_z4_1 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_2.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 7,z4_2 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_7_z4_2 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_3.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 7,z4_3 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_7_z4_3 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_4.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 7,z4_4 33556993 28678040 
ntt_u32_incomplete_neon_asm_var_4_2_7_z4_4 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_5.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 7,z4_5 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_7_z4_5 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_6.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 7,z4_6 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_7_z4_6 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_7.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 7,z4_7 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_7_z4_7 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_8.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 7,z4_8 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_7_z4_8 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_9.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 7,z4_9 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_7_z4_9 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_10.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 7,z4_10 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_7_z4_10 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_8_z4_7.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 8,z4_7 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_8_z4_7 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_8_z4_7.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 
--layers 4,2 --schedule 8,z4_7 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_8_z4_7 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_9_z4_7.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 9,z4_7 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_9_z4_7 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_10_z4_7.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 10,z4_7 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_10_z4_7 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_11_z4_7.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 11,z4_7 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_11_z4_7 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_12_z4_7.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 12,z4_7 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_12_z4_7 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_13_z4_7.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 13,z4_7 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_13_z4_7 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_14_z4_7.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 14,z4_7 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_14_z4_7 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_15_z4_7.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 15,z4_7 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_15_z4_7 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_16_z4_7.s: $(PYTH_SRCS) + mkdir 
-p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 16,z4_7 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_16_z4_7 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_17_z4_7.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 17,z4_7 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_17_z4_7 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_18_z4_7.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 18,z4_7 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_18_z4_7 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_19_z4_7.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 19,z4_7 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_19_z4_7 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_20_z4_7.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 20,z4_7 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_20_z4_7 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_21_z4_7.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 21,z4_7 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_21_z4_7 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_7.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 22,z4_7 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_22_z4_7 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_8.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 22,z4_8 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_22_z4_8 + 
+$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_9.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 22,z4_9 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_22_z4_9 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_10.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 22,z4_10 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_22_z4_10 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_11.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 22,z4_11 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_22_z4_11 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_12.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 22,z4_12 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_22_z4_12 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_13.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 22,z4_13 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_22_z4_13 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_14.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 22,z4_14 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_22_z4_14 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_15.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 22,z4_15 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_22_z4_15 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_0_z4_16.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 
--schedule 0,z4_16 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_0_z4_16 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_24_z4_0.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 24,z4_0 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_24_z4_0 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_24_z4_16.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,2 --schedule 24,z4_16 33556993 28678040 ntt_u32_incomplete_neon_asm_var_4_2_24_z4_16 + +# Neon NTT, complete + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_0_0.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,4 --schedule 0,0 33556993 28678040 ntt_u32_full_neon_asm_var_4_4_0_0 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_1_0.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,4 --schedule 1,0 33556993 28678040 ntt_u32_full_neon_asm_var_4_4_1_0 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_2_0.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,4 --schedule 2,0 33556993 28678040 ntt_u32_full_neon_asm_var_4_4_2_0 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_0.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,4 --schedule 3,0 33556993 28678040 ntt_u32_full_neon_asm_var_4_4_3_0 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_4_0.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,4 --schedule 4,0 33556993 28678040 ntt_u32_full_neon_asm_var_4_4_4_0 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_5_0.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,4 --schedule 5,0 33556993 28678040 
ntt_u32_full_neon_asm_var_4_4_5_0 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_6_0.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,4 --schedule 6,0 33556993 28678040 ntt_u32_full_neon_asm_var_4_4_6_0 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_7_0.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,4 --schedule 7,0 33556993 28678040 ntt_u32_full_neon_asm_var_4_4_7_0 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_8_0.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,4 --schedule 8,0 33556993 28678040 ntt_u32_full_neon_asm_var_4_4_8_0 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_9_0.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,4 --schedule 9,0 33556993 28678040 ntt_u32_full_neon_asm_var_4_4_9_0 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_10_0.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,4 --schedule 10,0 33556993 28678040 ntt_u32_full_neon_asm_var_4_4_10_0 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_11_0.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,4 --schedule 11,0 33556993 28678040 ntt_u32_full_neon_asm_var_4_4_11_0 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_12_0.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,4 --schedule 12,0 33556993 28678040 ntt_u32_full_neon_asm_var_4_4_12_0 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_13_0.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,4 --schedule 13,0 33556993 28678040 ntt_u32_full_neon_asm_var_4_4_13_0 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_14_0.s: 
$(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,4 --schedule 14,0 33556993 28678040 ntt_u32_full_neon_asm_var_4_4_14_0 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_15_0.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,4 --schedule 15,0 33556993 28678040 ntt_u32_full_neon_asm_var_4_4_15_0 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_16_0.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,4 --schedule 16,0 33556993 28678040 ntt_u32_full_neon_asm_var_4_4_16_0 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_17_0.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,4 --schedule 17,0 33556993 28678040 ntt_u32_full_neon_asm_var_4_4_17_0 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_18_0.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,4 --schedule 18,0 33556993 28678040 ntt_u32_full_neon_asm_var_4_4_18_0 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z2_0.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,4 --schedule 3,z2_0 33556993 28678040 ntt_u32_full_neon_asm_var_4_4_3_z2_0 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z2_1.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,4 --schedule 3,z2_1 33556993 28678040 ntt_u32_full_neon_asm_var_4_4_3_z2_1 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z2_2.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,4 --schedule 3,z2_2 33556993 28678040 ntt_u32_full_neon_asm_var_4_4_3_z2_2 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z2_3.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 
--layers 4,4 --schedule 3,z2_3 33556993 28678040 ntt_u32_full_neon_asm_var_4_4_3_z2_3 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z2_4.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,4 --schedule 3,z2_4 33556993 28678040 ntt_u32_full_neon_asm_var_4_4_3_z2_4 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z2_5.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,4 --schedule 3,z2_5 33556993 28678040 ntt_u32_full_neon_asm_var_4_4_3_z2_5 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z4_0.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,4 --schedule 3,z4_0 33556993 28678040 ntt_u32_full_neon_asm_var_4_4_3_z4_0 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z4_1.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,4 --schedule 3,z4_1 33556993 28678040 ntt_u32_full_neon_asm_var_4_4_3_z4_1 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z4_2.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,4 --schedule 3,z4_2 33556993 28678040 ntt_u32_full_neon_asm_var_4_4_3_z4_2 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z4_3.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,4 --schedule 3,z4_3 33556993 28678040 ntt_u32_full_neon_asm_var_4_4_3_z4_3 + +$(AUTOGEN_DIR)/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z4_4.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_NEON_TEST) --out $@ 256 --layers 4,4 --schedule 3,z4_4 33556993 28678040 ntt_u32_full_neon_asm_var_4_4_3_z4_4 + +################################################################################## +## SVE2 ## +################################################################################## + +# Sve2 NTT, incomplete + 
+$(AUTOGEN_DIR)/ntt_sve2/ntt_u32_incomplete_33556993_28678040_var_3_3_0.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_SVE2_TEST) --out $@ 256 --layers 3,3 --schedule 0,0 33556993 28678040 ntt_u32_incomplete_sve2_asm_var_3_3_0 + +$(AUTOGEN_DIR)/ntt_sve2/ntt_u32_incomplete_33556993_28678040_var_3_3_1.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_SVE2_TEST) --out $@ 256 --layers 3,3 --schedule 1,11 33556993 28678040 ntt_u32_incomplete_sve2_asm_var_3_3_1 + +$(AUTOGEN_DIR)/ntt_sve2/ntt_u32_incomplete_33556993_28678040_var_3_3_2.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_SVE2_TEST) --out $@ 256 --layers 3,3 --schedule 2,2 33556993 28678040 ntt_u32_incomplete_sve2_asm_var_3_3_2 + +$(AUTOGEN_DIR)/ntt_sve2/ntt_u64_incomplete_72057594067788289_60277548896192635_var_3_3_0.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_SVE2_TEST) --bitwidth 64 --out $@ 256 --layers 3,3 --schedule 0,0 72057594067788289 60277548896192635 ntt_u64_incomplete_sve2_asm_var_3_3_0 + +$(AUTOGEN_DIR)/ntt_sve2/ntt_u64_incomplete_72057594067788289_60277548896192635_var_3_3_1.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_SVE2_TEST) --bitwidth 64 --out $@ 256 --layers 3,3 --schedule 1,11 72057594067788289 60277548896192635 ntt_u64_incomplete_sve2_asm_var_3_3_1 + +$(AUTOGEN_DIR)/ntt_sve2/ntt_u64_incomplete_72057594067788289_60277548896192635_var_3_3_2.s: $(PYTH_SRCS) + mkdir -p $(@D) + python3 $(PY_CODEGEN_NTT_SVE2_TEST) --bitwidth 64 --out $@ 256 --layers 3,3 --schedule 2,2 72057594067788289 60277548896192635 ntt_u64_incomplete_sve2_asm_var_3_3_2 + +clean: + rm -rf $(C_OBJS) $(ASM_OBJS) $(CMSIS_OBJS) $(AUTOGEN_SRCS) + rm -rf $(TARGET) diff --git a/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_0_0.s b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_0_0.s new file mode 100644 index 0000000..85f29ad --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_0_0.s @@ -0,0 +1,2422 @@ + 
+/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 
1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 
6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 
+.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, 
block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 
31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 
26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 
+.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // 
Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 
28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 
608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_0_0 +.global _ntt_u32_full_neon_asm_var_4_4_0_0 +ntt_u32_full_neon_asm_var_4_4_0_0: +_ntt_u32_full_neon_asm_var_4_4_0_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +ldr q2, [x0, #544] +ldr q1, [x0, #608] +ldr q0, [x0, #672] +ldr q15, [x0, #736] +ldr q14, [x0, #32] +ldr q13, [x0, #96] +ldr q12, [x0, #160] +ldr q11, [x0, #224] +sqrdmulh v10.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +mla v22.4S, v10.4S, v31.s[0] +sub v10.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v20.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s 
+sqrdmulh v20.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +mla v19.4S, v20.4S, v31.s[0] +sub v20.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +mla v2.4S, v19.4S, v31.s[0] +sub v19.4s, v14.4s, v2.4s +add v14.4s, v14.4s, v2.4s +sqrdmulh v2.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +mla v1.4S, v2.4S, v31.s[0] +sub v2.4s, v13.4s, v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[0] +mul v0.4S, v0.4S,v30.s[0] +mla v0.4S, v1.4S, v31.s[0] +sub v1.4s, v12.4s, v0.4s +add v12.4s, v12.4s, v0.4s +sqrdmulh v0.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +mla v15.4S, v0.4S, v31.s[0] +sub v0.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v12.4s, v16.4s +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v3.4S, v16.4S, v31.s[0] +sub v16.4s, v11.4s, v3.4s +add v11.4s, v11.4s, v3.4s +sqrdmulh v3.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +mla v18.4S, v3.4S, v31.s[0] +sub v3.4s, v14.4s, v18.4s +add v14.4s, v14.4s, v18.4s +sqrdmulh v18.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +mla v17.4S, v18.4S, v31.s[0] +sub v18.4s, v13.4s, v17.4s +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v1.4s, v21.4s +add v1.4s, v1.4s, v21.4s +sqrdmulh v21.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +mla v20.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v20.4s +add v0.4s, v0.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +mla v10.4S, v20.4S, v31.s[0] +sub v20.4s, v19.4s, v10.4s +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +mla v22.4S, v10.4S, v31.s[0] +sub v10.4s, v2.4s, v22.4s +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +mla v12.4S, v22.4S, v31.s[0] 
+sub v22.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v11.4S, v27.s[0] +mul v11.4S, v11.4S,v28.s[0] +mla v11.4S, v12.4S, v31.s[0] +sub v12.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +mla v15.4S, v11.4S, v31.s[0] +sub v11.4s, v3.4s, v15.4s +add v3.4s, v3.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v16.4s +add v18.4s, v18.4s, v16.4s +sqrdmulh v16.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +mla v1.4S, v16.4S, v31.s[0] +sub v16.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v27.s[2] +mul v0.4S, v0.4S,v28.s[2] +mla v0.4S, v1.4S, v31.s[0] +sub v1.4s, v2.4s, v0.4s +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +mla v17.4S, v0.4S, v31.s[0] +sub v0.4s, v20.4s, v17.4s +add v20.4s, v20.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v25.s[0] +mul v13.4S, v13.4S,v26.s[0] +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +sqrdmulh v13.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v18.4S, v25.s[2] +mul v18.4S, v18.4S,v26.s[2] +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v3.4s, v18.4s +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v15.4S, v25.s[3] +mul v15.4S, v15.4S,v26.s[3] +mla v15.4S, v18.4S, v31.s[0] +sub v18.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v23.s[0] +mul v2.4S, v2.4S,v24.s[0] +mla v2.4S, v15.4S, v31.s[0] +sub v15.4s, v19.4s, v2.4s +add v19.4s, v19.4s, v2.4s +sqrdmulh v2.4S, v1.4S, v23.s[1] +mul v1.4S, v1.4S,v24.s[1] +mla v1.4S, v2.4S, v31.s[0] +sub v2.4s, v16.4s, v1.4s +add v16.4s, v16.4s, v1.4s +sqrdmulh v1.4S, v10.4S, v23.s[2] 
+mul v10.4S, v10.4S,v24.s[2] +mla v10.4S, v1.4S, v31.s[0] +sub v1.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +mla v17.4S, v10.4S, v31.s[0] +sub v10.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +str q14, [x0, #32] +str q21, [x0, #96] +str q22, [x0, #160] +str q13, [x0, #224] +str q3, [x0, #288] +str q12, [x0, #352] +str q11, [x0, #416] +str q18, [x0, #480] +str q19, [x0, #544] +str q15, [x0, #608] +str q16, [x0, #672] +str q2, [x0, #736] +str q20, [x0, #800] +str q1, [x0, #864] +str q0, [x0, #928] +str q10, [x0, #992] +ldr q10, [x0, #816] +ldr q0, [x0, #880] +ldr q1, [x0, #944] +ldr q20, [x0, #1008] +ldr q2, [x0, #304] +ldr q16, [x0, #368] +ldr q15, [x0, #432] +ldr q19, [x0, #496] +ldr q18, [x0, #560] +ldr q11, [x0, #624] +ldr q12, [x0, #688] +ldr q3, [x0, #752] +ldr q13, [x0, #48] +ldr q22, [x0, #112] +ldr q21, [x0, #176] +ldr q14, [x0, #240] +sqrdmulh v17.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +mla v10.4S, v17.4S, v31.s[0] +sub v17.4s, v2.4s, v10.4s +add v2.4s, v2.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v29.s[0] +mul v0.4S, v0.4S,v30.s[0] +mla v0.4S, v10.4S, v31.s[0] +sub v10.4s, v16.4s, v0.4s +add v16.4s, v16.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +mla v1.4S, v0.4S, v31.s[0] +sub v0.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s +sqrdmulh v1.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v20.4S, v1.4S, v31.s[0] +sub v1.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +sqrdmulh v20.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +mla v18.4S, v20.4S, v31.s[0] +sub v20.4s, v13.4s, v18.4s +add v13.4s, v13.4s, v18.4s +sqrdmulh v18.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +mla v12.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v29.s[0] +mul v3.4S, 
v3.4S,v30.s[0] +mla v3.4S, v12.4S, v31.s[0] +sub v12.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sqrdmulh v3.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +mla v15.4S, v3.4S, v31.s[0] +sub v3.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +sqrdmulh v15.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +mla v19.4S, v15.4S, v31.s[0] +sub v15.4s, v14.4s, v19.4s +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v2.4S, v29.s[1] +mul v2.4S, v2.4S,v30.s[1] +mla v2.4S, v19.4S, v31.s[0] +sub v19.4s, v13.4s, v2.4s +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +mla v16.4S, v2.4S, v31.s[0] +sub v2.4s, v22.4s, v16.4s +add v22.4s, v22.4s, v16.4s +sqrdmulh v16.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +mla v0.4S, v16.4S, v31.s[0] +sub v16.4s, v11.4s, v0.4s +add v11.4s, v11.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +mla v1.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v1.4s +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +mla v17.4S, v1.4S, v31.s[0] +sub v1.4s, v20.4s, v17.4s +add v20.4s, v20.4s, v17.4s +sqrdmulh v17.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +mla v10.4S, v17.4S, v31.s[0] +sub v17.4s, v18.4s, v10.4s +add v18.4s, v18.4s, v10.4s +sqrdmulh v10.4S, v21.4S, v27.s[0] +mul v21.4S, v21.4S,v28.s[0] +mla v21.4S, v10.4S, v31.s[0] +sub v10.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +mla v14.4S, v21.4S, v31.s[0] +sub v21.4s, v22.4s, v14.4s +add v22.4s, v22.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +mla v3.4S, v14.4S, v31.s[0] +sub v14.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sqrdmulh v3.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +mla v15.4S, v3.4S, v31.s[0] +sub v3.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +mla v11.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v11.4s +add v20.4s, v20.4s, 
v11.4s +sqrdmulh v11.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +mla v12.4S, v11.4S, v31.s[0] +sub v11.4s, v18.4s, v12.4s +add v18.4s, v18.4s, v12.4s +sqrdmulh v12.4S, v16.4S, v27.s[3] +mul v16.4S, v16.4S,v28.s[3] +mla v16.4S, v12.4S, v31.s[0] +sub v12.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +sqrdmulh v16.4S, v0.4S, v27.s[3] +mul v0.4S, v0.4S,v28.s[3] +mla v0.4S, v16.4S, v31.s[0] +sub v16.4s, v17.4s, v0.4s +add v17.4s, v17.4s, v0.4s +sqrdmulh v0.4S, v22.4S, v25.s[0] +mul v22.4S, v22.4S,v26.s[0] +mla v22.4S, v0.4S, v31.s[0] +sub v0.4s, v13.4s, v22.4s +add v13.4s, v13.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v25.s[1] +mul v21.4S, v21.4S,v26.s[1] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v2.4S, v25.s[2] +mul v2.4S, v2.4S,v26.s[2] +mla v2.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v2.4s +add v19.4s, v19.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v25.s[3] +mul v3.4S, v3.4S,v26.s[3] +mla v3.4S, v2.4S, v31.s[0] +sub v2.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sqrdmulh v3.4S, v18.4S, v23.s[0] +mul v18.4S, v18.4S,v24.s[0] +mla v18.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v18.4s +add v20.4s, v20.4s, v18.4s +sqrdmulh v18.4S, v11.4S, v23.s[1] +mul v11.4S, v11.4S,v24.s[1] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v15.4s, v11.4s +add v15.4s, v15.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v23.s[2] +mul v17.4S, v17.4S,v24.s[2] +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v1.4s, v17.4s +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v16.4S, v23.s[3] +mul v16.4S, v16.4S,v24.s[3] +mla v16.4S, v17.4S, v31.s[0] +sub v17.4s, v12.4s, v16.4s +add v12.4s, v12.4s, v16.4s +str q13, [x0, #48] +str q0, [x0, #112] +str q10, [x0, #176] +str q22, [x0, #240] +str q19, [x0, #304] +str q21, [x0, #368] +str q14, [x0, #432] +str q2, [x0, #496] +str q20, [x0, #560] +str q3, [x0, #624] +str q15, [x0, #688] +str q18, [x0, #752] +str q1, [x0, #816] +str q11, [x0, #880] +str q12, [x0, #944] +str q17, [x0, #1008] +ldr q17, [x0, #768] +ldr q12, [x0, #832] 
+ldr q11, [x0, #896] +ldr q1, [x0, #960] +ldr q18, [x0, #256] +ldr q15, [x0, #320] +ldr q3, [x0, #384] +ldr q20, [x0, #448] +ldr q2, [x0, #512] +ldr q14, [x0, #576] +ldr q21, [x0, #640] +ldr q19, [x0, #704] +ldr q22, [x0, #0] +ldr q10, [x0, #64] +ldr q0, [x0, #128] +ldr q13, [x0, #192] +sqrdmulh v16.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v18.4s, v17.4s +add v18.4s, v18.4s, v17.4s +sqrdmulh v17.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v15.4s, v12.4s +add v15.4s, v15.4s, v12.4s +sqrdmulh v12.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +mla v11.4S, v12.4S, v31.s[0] +sub v12.4s, v3.4s, v11.4s +add v3.4s, v3.4s, v11.4s +sqrdmulh v11.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +mla v1.4S, v11.4S, v31.s[0] +sub v11.4s, v20.4s, v1.4s +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +mla v2.4S, v1.4S, v31.s[0] +sub v1.4s, v22.4s, v2.4s +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +mla v14.4S, v2.4S, v31.s[0] +sub v2.4s, v10.4s, v14.4s +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v13.4s, v19.4s +add v13.4s, v13.4s, v19.4s +sqrdmulh v19.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v3.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v3.4s +add v0.4s, v0.4s, v3.4s +sqrdmulh v3.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +mla v20.4S, v3.4S, v31.s[0] +sub v3.4s, v13.4s, v20.4s +add v13.4s, v13.4s, v20.4s +sqrdmulh v20.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +mla v18.4S, v20.4S, v31.s[0] +sub v20.4s, v22.4s, v18.4s +add v22.4s, v22.4s, v18.4s +sqrdmulh v18.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +mla v15.4S, v18.4S, v31.s[0] +sub 
v18.4s, v10.4s, v15.4s +add v10.4s, v10.4s, v15.4s +sqrdmulh v15.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +mla v11.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +mla v16.4S, v11.4S, v31.s[0] +sub v11.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v2.4s, v17.4s +add v2.4s, v2.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[0] +mul v0.4S, v0.4S,v28.s[0] +mla v0.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v0.4s +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +mla v13.4S, v0.4S, v31.s[0] +sub v0.4s, v10.4s, v13.4s +add v10.4s, v10.4s, v13.4s +sqrdmulh v13.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +mla v19.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +sqrdmulh v19.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +mla v3.4S, v19.4S, v31.s[0] +sub v19.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v27.s[2] +mul v14.4S, v14.4S,v28.s[2] +mla v14.4S, v3.4S, v31.s[0] +sub v3.4s, v1.4s, v14.4s +add v1.4s, v1.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v2.4s, v21.4s +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +mla v15.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v12.4S, v27.s[3] +mul v12.4S, v12.4S,v28.s[3] +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v16.4s, v12.4s +add v16.4s, v16.4s, v12.4s +sqrdmulh v12.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +mla v10.4S, v12.4S, v31.s[0] +sub v12.4s, v22.4s, v10.4s +add v22.4s, v22.4s, v10.4s +sqrdmulh v10.4S, v0.4S, 
v25.s[1] +mul v0.4S, v0.4S,v26.s[1] +mla v0.4S, v10.4S, v31.s[0] +sub v10.4s, v17.4s, v0.4s +add v17.4s, v17.4s, v0.4s +sqrdmulh v0.4S, v18.4S, v25.s[2] +mul v18.4S, v18.4S,v26.s[2] +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v20.4s, v18.4s +add v20.4s, v20.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v25.s[3] +mul v19.4S, v19.4S,v26.s[3] +mla v19.4S, v18.4S, v31.s[0] +sub v18.4s, v13.4s, v19.4s +add v13.4s, v13.4s, v19.4s +sqrdmulh v19.4S, v2.4S, v23.s[0] +mul v2.4S, v2.4S,v24.s[0] +mla v2.4S, v19.4S, v31.s[0] +sub v19.4s, v1.4s, v2.4s +add v1.4s, v1.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v23.s[1] +mul v14.4S, v14.4S,v24.s[1] +mla v14.4S, v2.4S, v31.s[0] +sub v2.4s, v3.4s, v14.4s +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +mla v16.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v16.4s +add v11.4s, v11.4s, v16.4s +sqrdmulh v16.4S, v15.4S, v23.s[3] +mul v15.4S, v15.4S,v24.s[3] +mla v15.4S, v16.4S, v31.s[0] +sub v16.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +str q22, [x0, #0] +str q12, [x0, #64] +str q17, [x0, #128] +str q10, [x0, #192] +str q20, [x0, #256] +str q0, [x0, #320] +str q13, [x0, #384] +str q18, [x0, #448] +str q1, [x0, #512] +str q19, [x0, #576] +str q3, [x0, #640] +str q2, [x0, #704] +str q11, [x0, #768] +str q14, [x0, #832] +str q21, [x0, #896] +str q16, [x0, #960] +ldr q16, [x0, #784] +ldr q21, [x0, #848] +ldr q14, [x0, #912] +ldr q11, [x0, #976] +ldr q2, [x0, #272] +ldr q3, [x0, #336] +ldr q19, [x0, #400] +ldr q1, [x0, #464] +ldr q18, [x0, #528] +ldr q13, [x0, #592] +ldr q0, [x0, #656] +ldr q20, [x0, #720] +ldr q10, [x0, #16] +ldr q17, [x0, #80] +ldr q12, [x0, #144] +ldr q22, [x0, #208] +sqrdmulh v15.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +mla v21.4S, v16.4S, v31.s[0] +sub v16.4s, v3.4s, v21.4s +add v3.4s, v3.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[0] +mul 
v14.4S, v14.4S,v30.s[0] +mla v14.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v14.4s +add v19.4s, v19.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v1.4s, v11.4s +add v1.4s, v1.4s, v11.4s +sqrdmulh v11.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +mla v18.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v18.4s +add v10.4s, v10.4s, v18.4s +sqrdmulh v18.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +mla v13.4S, v18.4S, v31.s[0] +sub v18.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +sqrdmulh v13.4S, v0.4S, v29.s[0] +mul v0.4S, v0.4S,v30.s[0] +mla v0.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v0.4s +add v12.4s, v12.4s, v0.4s +sqrdmulh v0.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v20.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v20.4s +add v22.4s, v22.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +mla v19.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v19.4s +add v12.4s, v12.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +mla v1.4S, v19.4S, v31.s[0] +sub v19.4s, v22.4s, v1.4s +add v22.4s, v22.4s, v1.4s +sqrdmulh v1.4S, v2.4S, v29.s[1] +mul v2.4S, v2.4S,v30.s[1] +mla v2.4S, v1.4S, v31.s[0] +sub v1.4s, v10.4s, v2.4s +add v10.4s, v10.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v3.4S, v2.4S, v31.s[0] +sub v2.4s, v17.4s, v3.4s +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +mla v21.4S, v3.4S, v31.s[0] +sub v3.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +mla v14.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v14.4s +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +mla v15.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v16.4s +add 
v18.4s, v18.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +mla v12.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +mla v22.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v22.4s +add v17.4s, v17.4s, v22.4s +sqrdmulh v22.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +mla v20.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v20.4s +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +mla v19.4S, v20.4S, v31.s[0] +sub v20.4s, v2.4s, v19.4s +add v2.4s, v2.4s, v19.4s +sqrdmulh v19.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +mla v13.4S, v19.4S, v31.s[0] +sub v19.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v0.4S, v27.s[2] +mul v0.4S, v0.4S,v28.s[2] +mla v0.4S, v13.4S, v31.s[0] +sub v13.4s, v18.4s, v0.4s +add v18.4s, v18.4s, v0.4s +sqrdmulh v0.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +mla v3.4S, v0.4S, v31.s[0] +sub v0.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +mla v21.4S, v3.4S, v31.s[0] +sub v3.4s, v15.4s, v21.4s +add v15.4s, v15.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v25.s[0] +mul v17.4S, v17.4S,v26.s[0] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v16.4s, v12.4s +add v16.4s, v16.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[2] +mul v2.4S, v2.4S,v26.s[2] +mla v2.4S, v12.4S, v31.s[0] +sub v12.4s, v1.4s, v2.4s +add v1.4s, v1.4s, v2.4s +sqrdmulh v2.4S, v20.4S, v25.s[3] +mul v20.4S, v20.4S,v26.s[3] +mla v20.4S, v2.4S, v31.s[0] +sub v2.4s, v22.4s, v20.4s +add v22.4s, v22.4s, v20.4s +sqrdmulh v20.4S, v18.4S, v23.s[0] +mul v18.4S, v18.4S,v24.s[0] +mla v18.4S, v20.4S, v31.s[0] +sub v20.4s, v11.4s, v18.4s +add v11.4s, v11.4s, v18.4s +sqrdmulh v18.4S, v13.4S, v23.s[1] +mul v13.4S, 
v13.4S,v24.s[1] +mla v13.4S, v18.4S, v31.s[0] +sub v18.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v23.s[2] +mul v15.4S, v15.4S,v24.s[2] +mla v15.4S, v13.4S, v31.s[0] +sub v13.4s, v14.4s, v15.4s +add v14.4s, v14.4s, v15.4s +sqrdmulh v15.4S, v3.4S, v23.s[3] +mul v3.4S, v3.4S,v24.s[3] +mla v3.4S, v15.4S, v31.s[0] +sub v15.4s, v0.4s, v3.4s +add v0.4s, v0.4s, v3.4s +str q10, [x0, #16] +str q21, [x0, #80] +str q16, [x0, #144] +str q17, [x0, #208] +str q1, [x0, #272] +str q12, [x0, #336] +str q22, [x0, #400] +str q2, [x0, #464] +str q11, [x0, #528] +str q20, [x0, #592] +str q19, [x0, #656] +str q18, [x0, #720] +str q14, [x0, #784] +str q13, [x0, #848] +str q0, [x0, #912] +str q15, [x0, #976] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q6, [x17, #+160] +ldr q7, [x17, #+176] +ldr q8, [x17, #+192] +ldr q9, [x17, #+208] +ldr q3, [x17, #+224] +ldr q10, [x17, #+240] +ldr q21, [x0, #32] +ldr q16, [x0, #48] +ldr q17, [x0, #0] +ldr q1, [x0, #16] +sqrdmulh v12.4S, v21.4S, v5.s[0] +mul v21.4S, v21.4S,v4.s[0] +mla v21.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v16.4S, v5.s[0] +mul v16.4S, v16.4S,v4.s[0] +mla v16.4S, v21.4S, v31.s[0] +sub v21.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +sqrdmulh v16.4S, v1.4S, v5.s[1] +mul v1.4S, v1.4S,v4.s[1] +mla v1.4S, v16.4S, v31.s[0] +sub v16.4s, v17.4s, v1.4s +add v17.4s, v17.4s, v1.4s +sqrdmulh v1.4S, v21.4S, v5.s[2] +mul v21.4S, v21.4S,v4.s[2] +mla v21.4S, v1.4S, v31.s[0] +sub v1.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +trn1 v21.4S, v17.4S, v16.4S +trn2 v22.4S, v17.4S, v16.4S +trn1 v2.4S, v12.4S, v1.4S +trn2 v11.4S, v12.4S, v1.4S +trn2 v12.2D, v21.2D, v2.2D +trn2 v1.2D, v22.2D, v11.2D +trn1 v17.2D, v21.2D, v2.2D +trn1 v16.2D, v22.2D, v11.2D +sqrdmulh v11.4S, v12.4S, v7.4S +mul v12.4S, v12.4S,v6.4S +mla v12.4S, v11.4S, v31.s[0] +sub v11.4s, v17.4s, v12.4s +add v17.4s, v17.4s, v12.4s +sqrdmulh v12.4S, v1.4S, v7.4S +mul v1.4S, v1.4S,v6.4S +mla v1.4S, 
v12.4S, v31.s[0] +sub v12.4s, v16.4s, v1.4s +add v16.4s, v16.4s, v1.4s +sqrdmulh v1.4S, v16.4S, v9.4S +mul v16.4S, v16.4S,v8.4S +mla v16.4S, v1.4S, v31.s[0] +sub v1.4s, v17.4s, v16.4s +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v10.4S +mul v12.4S, v12.4S,v3.4S +mla v12.4S, v16.4S, v31.s[0] +sub v16.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +str q17, [x0, #0] +str q1, [x0, #16] +str q11, [x0, #32] +str q16, [x0, #48] +ldr q16, [x17, #+256] +ldr q11, [x17, #+272] +ldr q1, [x17, #+288] +ldr q17, [x17, #+304] +ldr q12, [x17, #+320] +ldr q22, [x17, #+336] +ldr q2, [x17, #+352] +ldr q21, [x17, #+368] +ldr q10, [x0, #96] +ldr q3, [x0, #112] +ldr q9, [x0, #64] +ldr q8, [x0, #80] +sqrdmulh v7.4S, v10.4S, v11.s[0] +mul v10.4S, v10.4S,v16.s[0] +mla v10.4S, v7.4S, v31.s[0] +sub v7.4s, v9.4s, v10.4s +add v9.4s, v9.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v11.s[0] +mul v3.4S, v3.4S,v16.s[0] +mla v3.4S, v10.4S, v31.s[0] +sub v10.4s, v8.4s, v3.4s +add v8.4s, v8.4s, v3.4s +sqrdmulh v3.4S, v8.4S, v11.s[1] +mul v8.4S, v8.4S,v16.s[1] +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v9.4s, v8.4s +add v9.4s, v9.4s, v8.4s +sqrdmulh v8.4S, v10.4S, v11.s[2] +mul v10.4S, v10.4S,v16.s[2] +mla v10.4S, v8.4S, v31.s[0] +sub v8.4s, v7.4s, v10.4s +add v7.4s, v7.4s, v10.4s +trn1 v10.4S, v9.4S, v3.4S +trn2 v6.4S, v9.4S, v3.4S +trn1 v5.4S, v7.4S, v8.4S +trn2 v4.4S, v7.4S, v8.4S +trn2 v7.2D, v10.2D, v5.2D +trn2 v8.2D, v6.2D, v4.2D +trn1 v9.2D, v10.2D, v5.2D +trn1 v3.2D, v6.2D, v4.2D +sqrdmulh v4.4S, v7.4S, v17.4S +mul v7.4S, v7.4S,v1.4S +mla v7.4S, v4.4S, v31.s[0] +sub v4.4s, v9.4s, v7.4s +add v9.4s, v9.4s, v7.4s +sqrdmulh v7.4S, v8.4S, v17.4S +mul v8.4S, v8.4S,v1.4S +mla v8.4S, v7.4S, v31.s[0] +sub v7.4s, v3.4s, v8.4s +add v3.4s, v3.4s, v8.4s +sqrdmulh v8.4S, v3.4S, v22.4S +mul v3.4S, v3.4S,v12.4S +mla v3.4S, v8.4S, v31.s[0] +sub v8.4s, v9.4s, v3.4s +add v9.4s, v9.4s, v3.4s +sqrdmulh v3.4S, v7.4S, v21.4S +mul v7.4S, v7.4S,v2.4S +mla v7.4S, v3.4S, v31.s[0] +sub v3.4s, v4.4s, v7.4s +add v4.4s, v4.4s, 
v7.4s +str q9, [x0, #64] +str q8, [x0, #80] +str q4, [x0, #96] +str q3, [x0, #112] +ldr q3, [x17, #+384] +ldr q4, [x17, #+400] +ldr q8, [x17, #+416] +ldr q9, [x17, #+432] +ldr q7, [x17, #+448] +ldr q6, [x17, #+464] +ldr q5, [x17, #+480] +ldr q10, [x17, #+496] +ldr q21, [x0, #160] +ldr q2, [x0, #176] +ldr q22, [x0, #128] +ldr q12, [x0, #144] +sqrdmulh v17.4S, v21.4S, v4.s[0] +mul v21.4S, v21.4S,v3.s[0] +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v21.4s +add v22.4s, v22.4s, v21.4s +sqrdmulh v21.4S, v2.4S, v4.s[0] +mul v2.4S, v2.4S,v3.s[0] +mla v2.4S, v21.4S, v31.s[0] +sub v21.4s, v12.4s, v2.4s +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v12.4S, v4.s[1] +mul v12.4S, v12.4S,v3.s[1] +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v21.4S, v4.s[2] +mul v21.4S, v21.4S,v3.s[2] +mla v21.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +trn1 v21.4S, v22.4S, v2.4S +trn2 v1.4S, v22.4S, v2.4S +trn1 v11.4S, v17.4S, v12.4S +trn2 v16.4S, v17.4S, v12.4S +trn2 v17.2D, v21.2D, v11.2D +trn2 v12.2D, v1.2D, v16.2D +trn1 v22.2D, v21.2D, v11.2D +trn1 v2.2D, v1.2D, v16.2D +sqrdmulh v16.4S, v17.4S, v9.4S +mul v17.4S, v17.4S,v8.4S +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v22.4s, v17.4s +add v22.4s, v22.4s, v17.4s +sqrdmulh v17.4S, v12.4S, v9.4S +mul v12.4S, v12.4S,v8.4S +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v2.4s, v12.4s +add v2.4s, v2.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v6.4S +mul v2.4S, v2.4S,v7.4S +mla v2.4S, v12.4S, v31.s[0] +sub v12.4s, v22.4s, v2.4s +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v17.4S, v10.4S +mul v17.4S, v17.4S,v5.4S +mla v17.4S, v2.4S, v31.s[0] +sub v2.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +str q22, [x0, #128] +str q12, [x0, #144] +str q16, [x0, #160] +str q2, [x0, #176] +ldr q2, [x17, #+512] +ldr q16, [x17, #+528] +ldr q12, [x17, #+544] +ldr q22, [x17, #+560] +ldr q17, [x17, #+576] +ldr q1, [x17, #+592] +ldr q11, [x17, #+608] +ldr q21, [x17, #+624] +ldr q10, [x0, 
#224] +ldr q5, [x0, #240] +ldr q6, [x0, #192] +ldr q7, [x0, #208] +sqrdmulh v9.4S, v10.4S, v16.s[0] +mul v10.4S, v10.4S,v2.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v6.4s, v10.4s +add v6.4s, v6.4s, v10.4s +sqrdmulh v10.4S, v5.4S, v16.s[0] +mul v5.4S, v5.4S,v2.s[0] +mla v5.4S, v10.4S, v31.s[0] +sub v10.4s, v7.4s, v5.4s +add v7.4s, v7.4s, v5.4s +sqrdmulh v5.4S, v7.4S, v16.s[1] +mul v7.4S, v7.4S,v2.s[1] +mla v7.4S, v5.4S, v31.s[0] +sub v5.4s, v6.4s, v7.4s +add v6.4s, v6.4s, v7.4s +sqrdmulh v7.4S, v10.4S, v16.s[2] +mul v10.4S, v10.4S,v2.s[2] +mla v10.4S, v7.4S, v31.s[0] +sub v7.4s, v9.4s, v10.4s +add v9.4s, v9.4s, v10.4s +trn1 v10.4S, v6.4S, v5.4S +trn2 v8.4S, v6.4S, v5.4S +trn1 v4.4S, v9.4S, v7.4S +trn2 v3.4S, v9.4S, v7.4S +trn2 v9.2D, v10.2D, v4.2D +trn2 v7.2D, v8.2D, v3.2D +trn1 v6.2D, v10.2D, v4.2D +trn1 v5.2D, v8.2D, v3.2D +sqrdmulh v3.4S, v9.4S, v22.4S +mul v9.4S, v9.4S,v12.4S +mla v9.4S, v3.4S, v31.s[0] +sub v3.4s, v6.4s, v9.4s +add v6.4s, v6.4s, v9.4s +sqrdmulh v9.4S, v7.4S, v22.4S +mul v7.4S, v7.4S,v12.4S +mla v7.4S, v9.4S, v31.s[0] +sub v9.4s, v5.4s, v7.4s +add v5.4s, v5.4s, v7.4s +sqrdmulh v7.4S, v5.4S, v1.4S +mul v5.4S, v5.4S,v17.4S +mla v5.4S, v7.4S, v31.s[0] +sub v7.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v9.4S, v21.4S +mul v9.4S, v9.4S,v11.4S +mla v9.4S, v5.4S, v31.s[0] +sub v5.4s, v3.4s, v9.4s +add v3.4s, v3.4s, v9.4s +str q6, [x0, #192] +str q7, [x0, #208] +str q3, [x0, #224] +str q5, [x0, #240] +ldr q5, [x17, #+640] +ldr q3, [x17, #+656] +ldr q7, [x17, #+672] +ldr q6, [x17, #+688] +ldr q9, [x17, #+704] +ldr q8, [x17, #+720] +ldr q4, [x17, #+736] +ldr q10, [x17, #+752] +ldr q21, [x0, #288] +ldr q11, [x0, #304] +ldr q1, [x0, #256] +ldr q17, [x0, #272] +sqrdmulh v22.4S, v21.4S, v3.s[0] +mul v21.4S, v21.4S,v5.s[0] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v21.4s +add v1.4s, v1.4s, v21.4s +sqrdmulh v21.4S, v11.4S, v3.s[0] +mul v11.4S, v11.4S,v5.s[0] +mla v11.4S, v21.4S, v31.s[0] +sub v21.4s, v17.4s, v11.4s +add v17.4s, 
v17.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v3.s[1] +mul v17.4S, v17.4S,v5.s[1] +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v1.4s, v17.4s +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v3.s[2] +mul v21.4S, v21.4S,v5.s[2] +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v21.4s +add v22.4s, v22.4s, v21.4s +trn1 v21.4S, v1.4S, v11.4S +trn2 v12.4S, v1.4S, v11.4S +trn1 v16.4S, v22.4S, v17.4S +trn2 v2.4S, v22.4S, v17.4S +trn2 v22.2D, v21.2D, v16.2D +trn2 v17.2D, v12.2D, v2.2D +trn1 v1.2D, v21.2D, v16.2D +trn1 v11.2D, v12.2D, v2.2D +sqrdmulh v2.4S, v22.4S, v6.4S +mul v22.4S, v22.4S,v7.4S +mla v22.4S, v2.4S, v31.s[0] +sub v2.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v6.4S +mul v17.4S, v17.4S,v7.4S +mla v17.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v8.4S +mul v11.4S, v11.4S,v9.4S +mla v11.4S, v17.4S, v31.s[0] +sub v17.4s, v1.4s, v11.4s +add v1.4s, v1.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v10.4S +mul v22.4S, v22.4S,v4.4S +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v2.4s, v22.4s +add v2.4s, v2.4s, v22.4s +str q1, [x0, #256] +str q17, [x0, #272] +str q2, [x0, #288] +str q11, [x0, #304] +ldr q11, [x17, #+768] +ldr q2, [x17, #+784] +ldr q17, [x17, #+800] +ldr q1, [x17, #+816] +ldr q22, [x17, #+832] +ldr q12, [x17, #+848] +ldr q16, [x17, #+864] +ldr q21, [x17, #+880] +ldr q10, [x0, #352] +ldr q4, [x0, #368] +ldr q8, [x0, #320] +ldr q9, [x0, #336] +sqrdmulh v6.4S, v10.4S, v2.s[0] +mul v10.4S, v10.4S,v11.s[0] +mla v10.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v10.4s +add v8.4s, v8.4s, v10.4s +sqrdmulh v10.4S, v4.4S, v2.s[0] +mul v4.4S, v4.4S,v11.s[0] +mla v4.4S, v10.4S, v31.s[0] +sub v10.4s, v9.4s, v4.4s +add v9.4s, v9.4s, v4.4s +sqrdmulh v4.4S, v9.4S, v2.s[1] +mul v9.4S, v9.4S,v11.s[1] +mla v9.4S, v4.4S, v31.s[0] +sub v4.4s, v8.4s, v9.4s +add v8.4s, v8.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v2.s[2] +mul v10.4S, v10.4S,v11.s[2] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v6.4s, v10.4s +add v6.4s, v6.4s, 
v10.4s +trn1 v10.4S, v8.4S, v4.4S +trn2 v7.4S, v8.4S, v4.4S +trn1 v3.4S, v6.4S, v9.4S +trn2 v5.4S, v6.4S, v9.4S +trn2 v6.2D, v10.2D, v3.2D +trn2 v9.2D, v7.2D, v5.2D +trn1 v8.2D, v10.2D, v3.2D +trn1 v4.2D, v7.2D, v5.2D +sqrdmulh v5.4S, v6.4S, v1.4S +mul v6.4S, v6.4S,v17.4S +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v8.4s, v6.4s +add v8.4s, v8.4s, v6.4s +sqrdmulh v6.4S, v9.4S, v1.4S +mul v9.4S, v9.4S,v17.4S +mla v9.4S, v6.4S, v31.s[0] +sub v6.4s, v4.4s, v9.4s +add v4.4s, v4.4s, v9.4s +sqrdmulh v9.4S, v4.4S, v12.4S +mul v4.4S, v4.4S,v22.4S +mla v4.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v4.4s +add v8.4s, v8.4s, v4.4s +sqrdmulh v4.4S, v6.4S, v21.4S +mul v6.4S, v6.4S,v16.4S +mla v6.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v6.4s +add v5.4s, v5.4s, v6.4s +str q8, [x0, #320] +str q9, [x0, #336] +str q5, [x0, #352] +str q4, [x0, #368] +ldr q4, [x17, #+896] +ldr q5, [x17, #+912] +ldr q9, [x17, #+928] +ldr q8, [x17, #+944] +ldr q6, [x17, #+960] +ldr q7, [x17, #+976] +ldr q3, [x17, #+992] +ldr q10, [x17, #+1008] +ldr q21, [x0, #416] +ldr q16, [x0, #432] +ldr q12, [x0, #384] +ldr q22, [x0, #400] +sqrdmulh v1.4S, v21.4S, v5.s[0] +mul v21.4S, v21.4S,v4.s[0] +mla v21.4S, v1.4S, v31.s[0] +sub v1.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +sqrdmulh v21.4S, v16.4S, v5.s[0] +mul v16.4S, v16.4S,v4.s[0] +mla v16.4S, v21.4S, v31.s[0] +sub v21.4s, v22.4s, v16.4s +add v22.4s, v22.4s, v16.4s +sqrdmulh v16.4S, v22.4S, v5.s[1] +mul v22.4S, v22.4S,v4.s[1] +mla v22.4S, v16.4S, v31.s[0] +sub v16.4s, v12.4s, v22.4s +add v12.4s, v12.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v5.s[2] +mul v21.4S, v21.4S,v4.s[2] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v21.4s +add v1.4s, v1.4s, v21.4s +trn1 v21.4S, v12.4S, v16.4S +trn2 v17.4S, v12.4S, v16.4S +trn1 v2.4S, v1.4S, v22.4S +trn2 v11.4S, v1.4S, v22.4S +trn2 v1.2D, v21.2D, v2.2D +trn2 v22.2D, v17.2D, v11.2D +trn1 v12.2D, v21.2D, v2.2D +trn1 v16.2D, v17.2D, v11.2D +sqrdmulh v11.4S, v1.4S, v8.4S +mul v1.4S, v1.4S,v9.4S +mla v1.4S, v11.4S, v31.s[0] +sub 
v11.4s, v12.4s, v1.4s +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v22.4S, v8.4S +mul v22.4S, v22.4S,v9.4S +mla v22.4S, v1.4S, v31.s[0] +sub v1.4s, v16.4s, v22.4s +add v16.4s, v16.4s, v22.4s +sqrdmulh v22.4S, v16.4S, v7.4S +mul v16.4S, v16.4S,v6.4S +mla v16.4S, v22.4S, v31.s[0] +sub v22.4s, v12.4s, v16.4s +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v1.4S, v10.4S +mul v1.4S, v1.4S,v3.4S +mla v1.4S, v16.4S, v31.s[0] +sub v16.4s, v11.4s, v1.4s +add v11.4s, v11.4s, v1.4s +str q12, [x0, #384] +str q22, [x0, #400] +str q11, [x0, #416] +str q16, [x0, #432] +ldr q16, [x17, #+1024] +ldr q11, [x17, #+1040] +ldr q22, [x17, #+1056] +ldr q12, [x17, #+1072] +ldr q1, [x17, #+1088] +ldr q17, [x17, #+1104] +ldr q2, [x17, #+1120] +ldr q21, [x17, #+1136] +ldr q10, [x0, #480] +ldr q3, [x0, #496] +ldr q7, [x0, #448] +ldr q6, [x0, #464] +sqrdmulh v8.4S, v10.4S, v11.s[0] +mul v10.4S, v10.4S,v16.s[0] +mla v10.4S, v8.4S, v31.s[0] +sub v8.4s, v7.4s, v10.4s +add v7.4s, v7.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v11.s[0] +mul v3.4S, v3.4S,v16.s[0] +mla v3.4S, v10.4S, v31.s[0] +sub v10.4s, v6.4s, v3.4s +add v6.4s, v6.4s, v3.4s +sqrdmulh v3.4S, v6.4S, v11.s[1] +mul v6.4S, v6.4S,v16.s[1] +mla v6.4S, v3.4S, v31.s[0] +sub v3.4s, v7.4s, v6.4s +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v10.4S, v11.s[2] +mul v10.4S, v10.4S,v16.s[2] +mla v10.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v10.4s +add v8.4s, v8.4s, v10.4s +trn1 v10.4S, v7.4S, v3.4S +trn2 v9.4S, v7.4S, v3.4S +trn1 v5.4S, v8.4S, v6.4S +trn2 v4.4S, v8.4S, v6.4S +trn2 v8.2D, v10.2D, v5.2D +trn2 v6.2D, v9.2D, v4.2D +trn1 v7.2D, v10.2D, v5.2D +trn1 v3.2D, v9.2D, v4.2D +sqrdmulh v4.4S, v8.4S, v12.4S +mul v8.4S, v8.4S,v22.4S +mla v8.4S, v4.4S, v31.s[0] +sub v4.4s, v7.4s, v8.4s +add v7.4s, v7.4s, v8.4s +sqrdmulh v8.4S, v6.4S, v12.4S +mul v6.4S, v6.4S,v22.4S +mla v6.4S, v8.4S, v31.s[0] +sub v8.4s, v3.4s, v6.4s +add v3.4s, v3.4s, v6.4s +sqrdmulh v6.4S, v3.4S, v17.4S +mul v3.4S, v3.4S,v1.4S +mla v3.4S, v6.4S, v31.s[0] +sub v6.4s, v7.4s, v3.4s +add v7.4s, 
v7.4s, v3.4s +sqrdmulh v3.4S, v8.4S, v21.4S +mul v8.4S, v8.4S,v2.4S +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v4.4s, v8.4s +add v4.4s, v4.4s, v8.4s +str q7, [x0, #448] +str q6, [x0, #464] +str q4, [x0, #480] +str q3, [x0, #496] +ldr q3, [x17, #+1152] +ldr q4, [x17, #+1168] +ldr q6, [x17, #+1184] +ldr q7, [x17, #+1200] +ldr q8, [x17, #+1216] +ldr q9, [x17, #+1232] +ldr q5, [x17, #+1248] +ldr q10, [x17, #+1264] +ldr q21, [x0, #544] +ldr q2, [x0, #560] +ldr q17, [x0, #512] +ldr q1, [x0, #528] +sqrdmulh v12.4S, v21.4S, v4.s[0] +mul v21.4S, v21.4S,v3.s[0] +mla v21.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v2.4S, v4.s[0] +mul v2.4S, v2.4S,v3.s[0] +mla v2.4S, v21.4S, v31.s[0] +sub v21.4s, v1.4s, v2.4s +add v1.4s, v1.4s, v2.4s +sqrdmulh v2.4S, v1.4S, v4.s[1] +mul v1.4S, v1.4S,v3.s[1] +mla v1.4S, v2.4S, v31.s[0] +sub v2.4s, v17.4s, v1.4s +add v17.4s, v17.4s, v1.4s +sqrdmulh v1.4S, v21.4S, v4.s[2] +mul v21.4S, v21.4S,v3.s[2] +mla v21.4S, v1.4S, v31.s[0] +sub v1.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +trn1 v21.4S, v17.4S, v2.4S +trn2 v22.4S, v17.4S, v2.4S +trn1 v11.4S, v12.4S, v1.4S +trn2 v16.4S, v12.4S, v1.4S +trn2 v12.2D, v21.2D, v11.2D +trn2 v1.2D, v22.2D, v16.2D +trn1 v17.2D, v21.2D, v11.2D +trn1 v2.2D, v22.2D, v16.2D +sqrdmulh v16.4S, v12.4S, v7.4S +mul v12.4S, v12.4S,v6.4S +mla v12.4S, v16.4S, v31.s[0] +sub v16.4s, v17.4s, v12.4s +add v17.4s, v17.4s, v12.4s +sqrdmulh v12.4S, v1.4S, v7.4S +mul v1.4S, v1.4S,v6.4S +mla v1.4S, v12.4S, v31.s[0] +sub v12.4s, v2.4s, v1.4s +add v2.4s, v2.4s, v1.4s +sqrdmulh v1.4S, v2.4S, v9.4S +mul v2.4S, v2.4S,v8.4S +mla v2.4S, v1.4S, v31.s[0] +sub v1.4s, v17.4s, v2.4s +add v17.4s, v17.4s, v2.4s +sqrdmulh v2.4S, v12.4S, v10.4S +mul v12.4S, v12.4S,v5.4S +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v16.4s, v12.4s +add v16.4s, v16.4s, v12.4s +str q17, [x0, #512] +str q1, [x0, #528] +str q16, [x0, #544] +str q2, [x0, #560] +ldr q2, [x17, #+1280] +ldr q16, [x17, #+1296] +ldr q1, [x17, #+1312] 
+ldr q17, [x17, #+1328] +ldr q12, [x17, #+1344] +ldr q22, [x17, #+1360] +ldr q11, [x17, #+1376] +ldr q21, [x17, #+1392] +ldr q10, [x0, #608] +ldr q5, [x0, #624] +ldr q9, [x0, #576] +ldr q8, [x0, #592] +sqrdmulh v7.4S, v10.4S, v16.s[0] +mul v10.4S, v10.4S,v2.s[0] +mla v10.4S, v7.4S, v31.s[0] +sub v7.4s, v9.4s, v10.4s +add v9.4s, v9.4s, v10.4s +sqrdmulh v10.4S, v5.4S, v16.s[0] +mul v5.4S, v5.4S,v2.s[0] +mla v5.4S, v10.4S, v31.s[0] +sub v10.4s, v8.4s, v5.4s +add v8.4s, v8.4s, v5.4s +sqrdmulh v5.4S, v8.4S, v16.s[1] +mul v8.4S, v8.4S,v2.s[1] +mla v8.4S, v5.4S, v31.s[0] +sub v5.4s, v9.4s, v8.4s +add v9.4s, v9.4s, v8.4s +sqrdmulh v8.4S, v10.4S, v16.s[2] +mul v10.4S, v10.4S,v2.s[2] +mla v10.4S, v8.4S, v31.s[0] +sub v8.4s, v7.4s, v10.4s +add v7.4s, v7.4s, v10.4s +trn1 v10.4S, v9.4S, v5.4S +trn2 v6.4S, v9.4S, v5.4S +trn1 v4.4S, v7.4S, v8.4S +trn2 v3.4S, v7.4S, v8.4S +trn2 v7.2D, v10.2D, v4.2D +trn2 v8.2D, v6.2D, v3.2D +trn1 v9.2D, v10.2D, v4.2D +trn1 v5.2D, v6.2D, v3.2D +sqrdmulh v3.4S, v7.4S, v17.4S +mul v7.4S, v7.4S,v1.4S +mla v7.4S, v3.4S, v31.s[0] +sub v3.4s, v9.4s, v7.4s +add v9.4s, v9.4s, v7.4s +sqrdmulh v7.4S, v8.4S, v17.4S +mul v8.4S, v8.4S,v1.4S +mla v8.4S, v7.4S, v31.s[0] +sub v7.4s, v5.4s, v8.4s +add v5.4s, v5.4s, v8.4s +sqrdmulh v8.4S, v5.4S, v22.4S +mul v5.4S, v5.4S,v12.4S +mla v5.4S, v8.4S, v31.s[0] +sub v8.4s, v9.4s, v5.4s +add v9.4s, v9.4s, v5.4s +sqrdmulh v5.4S, v7.4S, v21.4S +mul v7.4S, v7.4S,v11.4S +mla v7.4S, v5.4S, v31.s[0] +sub v5.4s, v3.4s, v7.4s +add v3.4s, v3.4s, v7.4s +str q9, [x0, #576] +str q8, [x0, #592] +str q3, [x0, #608] +str q5, [x0, #624] +ldr q5, [x17, #+1408] +ldr q3, [x17, #+1424] +ldr q8, [x17, #+1440] +ldr q9, [x17, #+1456] +ldr q7, [x17, #+1472] +ldr q6, [x17, #+1488] +ldr q4, [x17, #+1504] +ldr q10, [x17, #+1520] +ldr q21, [x0, #672] +ldr q11, [x0, #688] +ldr q22, [x0, #640] +ldr q12, [x0, #656] +sqrdmulh v17.4S, v21.4S, v3.s[0] +mul v21.4S, v21.4S,v5.s[0] +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v21.4s +add v22.4s, v22.4s, 
v21.4s +sqrdmulh v21.4S, v11.4S, v3.s[0] +mul v11.4S, v11.4S,v5.s[0] +mla v11.4S, v21.4S, v31.s[0] +sub v21.4s, v12.4s, v11.4s +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v12.4S, v3.s[1] +mul v12.4S, v12.4S,v5.s[1] +mla v12.4S, v11.4S, v31.s[0] +sub v11.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v21.4S, v3.s[2] +mul v21.4S, v21.4S,v5.s[2] +mla v21.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +trn1 v21.4S, v22.4S, v11.4S +trn2 v1.4S, v22.4S, v11.4S +trn1 v16.4S, v17.4S, v12.4S +trn2 v2.4S, v17.4S, v12.4S +trn2 v17.2D, v21.2D, v16.2D +trn2 v12.2D, v1.2D, v2.2D +trn1 v22.2D, v21.2D, v16.2D +trn1 v11.2D, v1.2D, v2.2D +sqrdmulh v2.4S, v17.4S, v9.4S +mul v17.4S, v17.4S,v8.4S +mla v17.4S, v2.4S, v31.s[0] +sub v2.4s, v22.4s, v17.4s +add v22.4s, v22.4s, v17.4s +sqrdmulh v17.4S, v12.4S, v9.4S +mul v12.4S, v12.4S,v8.4S +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +sqrdmulh v12.4S, v11.4S, v6.4S +mul v11.4S, v11.4S,v7.4S +mla v11.4S, v12.4S, v31.s[0] +sub v12.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v10.4S +mul v17.4S, v17.4S,v4.4S +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v2.4s, v17.4s +add v2.4s, v2.4s, v17.4s +str q22, [x0, #640] +str q12, [x0, #656] +str q2, [x0, #672] +str q11, [x0, #688] +ldr q11, [x17, #+1536] +ldr q2, [x17, #+1552] +ldr q12, [x17, #+1568] +ldr q22, [x17, #+1584] +ldr q17, [x17, #+1600] +ldr q1, [x17, #+1616] +ldr q16, [x17, #+1632] +ldr q21, [x17, #+1648] +ldr q10, [x0, #736] +ldr q4, [x0, #752] +ldr q6, [x0, #704] +ldr q7, [x0, #720] +sqrdmulh v9.4S, v10.4S, v2.s[0] +mul v10.4S, v10.4S,v11.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v6.4s, v10.4s +add v6.4s, v6.4s, v10.4s +sqrdmulh v10.4S, v4.4S, v2.s[0] +mul v4.4S, v4.4S,v11.s[0] +mla v4.4S, v10.4S, v31.s[0] +sub v10.4s, v7.4s, v4.4s +add v7.4s, v7.4s, v4.4s +sqrdmulh v4.4S, v7.4S, v2.s[1] +mul v7.4S, v7.4S,v11.s[1] +mla v7.4S, v4.4S, v31.s[0] +sub v4.4s, v6.4s, v7.4s 
+add v6.4s, v6.4s, v7.4s +sqrdmulh v7.4S, v10.4S, v2.s[2] +mul v10.4S, v10.4S,v11.s[2] +mla v10.4S, v7.4S, v31.s[0] +sub v7.4s, v9.4s, v10.4s +add v9.4s, v9.4s, v10.4s +trn1 v10.4S, v6.4S, v4.4S +trn2 v8.4S, v6.4S, v4.4S +trn1 v3.4S, v9.4S, v7.4S +trn2 v5.4S, v9.4S, v7.4S +trn2 v9.2D, v10.2D, v3.2D +trn2 v7.2D, v8.2D, v5.2D +trn1 v6.2D, v10.2D, v3.2D +trn1 v4.2D, v8.2D, v5.2D +sqrdmulh v5.4S, v9.4S, v22.4S +mul v9.4S, v9.4S,v12.4S +mla v9.4S, v5.4S, v31.s[0] +sub v5.4s, v6.4s, v9.4s +add v6.4s, v6.4s, v9.4s +sqrdmulh v9.4S, v7.4S, v22.4S +mul v7.4S, v7.4S,v12.4S +mla v7.4S, v9.4S, v31.s[0] +sub v9.4s, v4.4s, v7.4s +add v4.4s, v4.4s, v7.4s +sqrdmulh v7.4S, v4.4S, v1.4S +mul v4.4S, v4.4S,v17.4S +mla v4.4S, v7.4S, v31.s[0] +sub v7.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +sqrdmulh v4.4S, v9.4S, v21.4S +mul v9.4S, v9.4S,v16.4S +mla v9.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v9.4s +add v5.4s, v5.4s, v9.4s +str q6, [x0, #704] +str q7, [x0, #720] +str q5, [x0, #736] +str q4, [x0, #752] +ldr q4, [x17, #+1664] +ldr q5, [x17, #+1680] +ldr q7, [x17, #+1696] +ldr q6, [x17, #+1712] +ldr q9, [x17, #+1728] +ldr q8, [x17, #+1744] +ldr q3, [x17, #+1760] +ldr q10, [x17, #+1776] +ldr q21, [x0, #800] +ldr q16, [x0, #816] +ldr q1, [x0, #768] +ldr q17, [x0, #784] +sqrdmulh v22.4S, v21.4S, v5.s[0] +mul v21.4S, v21.4S,v4.s[0] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v21.4s +add v1.4s, v1.4s, v21.4s +sqrdmulh v21.4S, v16.4S, v5.s[0] +mul v16.4S, v16.4S,v4.s[0] +mla v16.4S, v21.4S, v31.s[0] +sub v21.4s, v17.4s, v16.4s +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v5.s[1] +mul v17.4S, v17.4S,v4.s[1] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v1.4s, v17.4s +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v5.s[2] +mul v21.4S, v21.4S,v4.s[2] +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v21.4s +add v22.4s, v22.4s, v21.4s +trn1 v21.4S, v1.4S, v16.4S +trn2 v12.4S, v1.4S, v16.4S +trn1 v2.4S, v22.4S, v17.4S +trn2 v11.4S, v22.4S, v17.4S +trn2 v22.2D, v21.2D, v2.2D +trn2 
v17.2D, v12.2D, v11.2D +trn1 v1.2D, v21.2D, v2.2D +trn1 v16.2D, v12.2D, v11.2D +sqrdmulh v11.4S, v22.4S, v6.4S +mul v22.4S, v22.4S,v7.4S +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v6.4S +mul v17.4S, v17.4S,v7.4S +mla v17.4S, v22.4S, v31.s[0] +sub v22.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v16.4S, v8.4S +mul v16.4S, v16.4S,v9.4S +mla v16.4S, v17.4S, v31.s[0] +sub v17.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +sqrdmulh v16.4S, v22.4S, v10.4S +mul v22.4S, v22.4S,v3.4S +mla v22.4S, v16.4S, v31.s[0] +sub v16.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +str q1, [x0, #768] +str q17, [x0, #784] +str q11, [x0, #800] +str q16, [x0, #816] +ldr q16, [x17, #+1792] +ldr q11, [x17, #+1808] +ldr q17, [x17, #+1824] +ldr q1, [x17, #+1840] +ldr q22, [x17, #+1856] +ldr q12, [x17, #+1872] +ldr q2, [x17, #+1888] +ldr q21, [x17, #+1904] +ldr q10, [x0, #864] +ldr q3, [x0, #880] +ldr q8, [x0, #832] +ldr q9, [x0, #848] +sqrdmulh v6.4S, v10.4S, v11.s[0] +mul v10.4S, v10.4S,v16.s[0] +mla v10.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v10.4s +add v8.4s, v8.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v11.s[0] +mul v3.4S, v3.4S,v16.s[0] +mla v3.4S, v10.4S, v31.s[0] +sub v10.4s, v9.4s, v3.4s +add v9.4s, v9.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v11.s[1] +mul v9.4S, v9.4S,v16.s[1] +mla v9.4S, v3.4S, v31.s[0] +sub v3.4s, v8.4s, v9.4s +add v8.4s, v8.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v11.s[2] +mul v10.4S, v10.4S,v16.s[2] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v6.4s, v10.4s +add v6.4s, v6.4s, v10.4s +trn1 v10.4S, v8.4S, v3.4S +trn2 v7.4S, v8.4S, v3.4S +trn1 v5.4S, v6.4S, v9.4S +trn2 v4.4S, v6.4S, v9.4S +trn2 v6.2D, v10.2D, v5.2D +trn2 v9.2D, v7.2D, v4.2D +trn1 v8.2D, v10.2D, v5.2D +trn1 v3.2D, v7.2D, v4.2D +sqrdmulh v4.4S, v6.4S, v1.4S +mul v6.4S, v6.4S,v17.4S +mla v6.4S, v4.4S, v31.s[0] +sub v4.4s, v8.4s, v6.4s +add v8.4s, v8.4s, v6.4s +sqrdmulh v6.4S, v9.4S, v1.4S +mul v9.4S, v9.4S,v17.4S +mla v9.4S, v6.4S, v31.s[0] 
+sub v6.4s, v3.4s, v9.4s +add v3.4s, v3.4s, v9.4s +sqrdmulh v9.4S, v3.4S, v12.4S +mul v3.4S, v3.4S,v22.4S +mla v3.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v3.4s +add v8.4s, v8.4s, v3.4s +sqrdmulh v3.4S, v6.4S, v21.4S +mul v6.4S, v6.4S,v2.4S +mla v6.4S, v3.4S, v31.s[0] +sub v3.4s, v4.4s, v6.4s +add v4.4s, v4.4s, v6.4s +str q8, [x0, #832] +str q9, [x0, #848] +str q4, [x0, #864] +str q3, [x0, #880] +ldr q3, [x17, #+1920] +ldr q4, [x17, #+1936] +ldr q9, [x17, #+1952] +ldr q8, [x17, #+1968] +ldr q6, [x17, #+1984] +ldr q7, [x17, #+2000] +ldr q5, [x17, #+2016] +ldr q10, [x17, #+2032] +ldr q21, [x0, #928] +ldr q2, [x0, #944] +ldr q12, [x0, #896] +ldr q22, [x0, #912] +sqrdmulh v1.4S, v21.4S, v4.s[0] +mul v21.4S, v21.4S,v3.s[0] +mla v21.4S, v1.4S, v31.s[0] +sub v1.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +sqrdmulh v21.4S, v2.4S, v4.s[0] +mul v2.4S, v2.4S,v3.s[0] +mla v2.4S, v21.4S, v31.s[0] +sub v21.4s, v22.4s, v2.4s +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v22.4S, v4.s[1] +mul v22.4S, v22.4S,v3.s[1] +mla v22.4S, v2.4S, v31.s[0] +sub v2.4s, v12.4s, v22.4s +add v12.4s, v12.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v4.s[2] +mul v21.4S, v21.4S,v3.s[2] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v21.4s +add v1.4s, v1.4s, v21.4s +trn1 v21.4S, v12.4S, v2.4S +trn2 v17.4S, v12.4S, v2.4S +trn1 v11.4S, v1.4S, v22.4S +trn2 v16.4S, v1.4S, v22.4S +trn2 v1.2D, v21.2D, v11.2D +trn2 v22.2D, v17.2D, v16.2D +trn1 v12.2D, v21.2D, v11.2D +trn1 v2.2D, v17.2D, v16.2D +sqrdmulh v16.4S, v1.4S, v8.4S +mul v1.4S, v1.4S,v9.4S +mla v1.4S, v16.4S, v31.s[0] +sub v16.4s, v12.4s, v1.4s +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v22.4S, v8.4S +mul v22.4S, v22.4S,v9.4S +mla v22.4S, v1.4S, v31.s[0] +sub v1.4s, v2.4s, v22.4s +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v2.4S, v7.4S +mul v2.4S, v2.4S,v6.4S +mla v2.4S, v22.4S, v31.s[0] +sub v22.4s, v12.4s, v2.4s +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v1.4S, v10.4S +mul v1.4S, v1.4S,v5.4S +mla v1.4S, v2.4S, v31.s[0] +sub v2.4s, v16.4s, v1.4s +add 
v16.4s, v16.4s, v1.4s +str q12, [x0, #896] +str q22, [x0, #912] +str q16, [x0, #928] +str q2, [x0, #944] +ldr q2, [x17, #+2048] +ldr q16, [x17, #+2064] +ldr q22, [x17, #+2080] +ldr q12, [x17, #+2096] +ldr q1, [x17, #+2112] +ldr q17, [x17, #+2128] +ldr q11, [x17, #+2144] +ldr q21, [x17, #+2160] +ldr q10, [x0, #992] +ldr q5, [x0, #1008] +ldr q7, [x0, #960] +ldr q6, [x0, #976] +sqrdmulh v8.4S, v10.4S, v16.s[0] +mul v10.4S, v10.4S,v2.s[0] +mla v10.4S, v8.4S, v31.s[0] +sub v8.4s, v7.4s, v10.4s +add v7.4s, v7.4s, v10.4s +sqrdmulh v10.4S, v5.4S, v16.s[0] +mul v5.4S, v5.4S,v2.s[0] +mla v5.4S, v10.4S, v31.s[0] +sub v10.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v6.4S, v16.s[1] +mul v6.4S, v6.4S,v2.s[1] +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v7.4s, v6.4s +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v10.4S, v16.s[2] +mul v10.4S, v10.4S,v2.s[2] +mla v10.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v10.4s +add v8.4s, v8.4s, v10.4s +trn1 v10.4S, v7.4S, v5.4S +trn2 v9.4S, v7.4S, v5.4S +trn1 v4.4S, v8.4S, v6.4S +trn2 v3.4S, v8.4S, v6.4S +trn2 v8.2D, v10.2D, v4.2D +trn2 v6.2D, v9.2D, v3.2D +trn1 v7.2D, v10.2D, v4.2D +trn1 v5.2D, v9.2D, v3.2D +sqrdmulh v3.4S, v8.4S, v12.4S +mul v8.4S, v8.4S,v22.4S +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v7.4s, v8.4s +add v7.4s, v7.4s, v8.4s +sqrdmulh v8.4S, v6.4S, v12.4S +mul v6.4S, v6.4S,v22.4S +mla v6.4S, v8.4S, v31.s[0] +sub v8.4s, v5.4s, v6.4s +add v5.4s, v5.4s, v6.4s +sqrdmulh v6.4S, v5.4S, v17.4S +mul v5.4S, v5.4S,v1.4S +mla v5.4S, v6.4S, v31.s[0] +sub v6.4s, v7.4s, v5.4s +add v7.4s, v7.4s, v5.4s +sqrdmulh v5.4S, v8.4S, v21.4S +mul v8.4S, v8.4S,v11.4S +mla v8.4S, v5.4S, v31.s[0] +sub v5.4s, v3.4s, v8.4s +add v3.4s, v3.4s, v8.4s +str q7, [x0, #960] +str q6, [x0, #976] +str q3, [x0, #992] +str q5, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] 
+ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_10_0.s b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_10_0.s new file mode 100644 index 0000000..c97d115 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_10_0.s @@ -0,0 +1,2486 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: // negated NTT modulus; the routine below loads this vector into q31 +.word -33556993 // -q for q = 33556993; used as the v31.s[0] multiplier in the mla steps +.word 0 // lanes 1..3 are zero; the code visible here reads only lane 0 +.word 0 +.word 0 +.align 6 // GNU as on AArch64: align to 2^6 = 64 bytes +roots_merged: // twiddle table in 4-word vectors: each vector of roots (mul operand) is followed by a vector of companion constants (sqrdmulh operand); NOTE(review): companions look like round(root * 2^31 / q), confirm against the generator +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None (zero filler so the group occupies a full 4-word vector) +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word
1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer
6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31
+.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7,
block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word
31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word
26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82
+.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 //
Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word
28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word
608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_10_0 +.global _ntt_u32_full_neon_asm_var_4_4_10_0 +ntt_u32_full_neon_asm_var_4_4_10_0: +_ntt_u32_full_neon_asm_var_4_4_10_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #928] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #992] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q18, [x0, #800] +sqrdmulh v17.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +ldr q16, [x0, #864] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v22.4S, v21.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +mla v18.4S, v17.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +ldr q3, [x0, #544] +sqrdmulh v17.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q19, [x0, #608] +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q2, [x0, #672] +ldr q1, [x0, #416] +sqrdmulh v0.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v15.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +ldr
q22, [x0, #736] +ldr q14, [x0, #480] +sqrdmulh v13.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v12.4s, v14.4s, v20.4s +add v14.4s, v14.4s, v20.4s +ldr q20, [x0, #288] +mla v3.4S, v17.4S, v31.s[0] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v18.4s +mla v2.4S, v0.4S, v31.s[0] +mla v22.4S, v13.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +ldr q18, [x0, #352] +sqrdmulh v13.4S, v1.4S, v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v0.4s, v18.4s, v16.4s +sqrdmulh v17.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +add v18.4s, v18.4s, v16.4s +ldr q16, [x0, #32] +sqrdmulh v11.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v10.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #96] +sqrdmulh v9.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v8.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #160] +mla v1.4S, v13.4S, v31.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v2.4s +mla v20.4S, v11.4S, v31.s[0] +mla v18.4S, v9.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +ldr q2, [x0, #224] +sqrdmulh v9.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v11.4s, v2.4s, v22.4s +sqrdmulh v13.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v7.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +sub v6.4s, v2.4s, v14.4s +add v2.4s, v2.4s, v14.4s +mla v15.4S, v9.4S, v31.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v20.4s +nop +mla v21.4S, v22.4S, v31.s[0] +mla v0.4S, v1.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +nop +sqrdmulh v20.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +sub v1.4s, v3.4s, v18.4s +nop +sqrdmulh v22.4S, v6.4S, v27.s[1] +mul v6.4S, v6.4S,v28.s[1] +add v3.4s, v3.4s, v18.4s +nop +sqrdmulh v18.4S, v19.4S, v27.s[0] +mul v19.4S, v19.4S,v28.s[0] +sub v9.4s, v17.4s, v15.4s +add v17.4s, v17.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub 
v14.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +mla v7.4S, v20.4S, v31.s[0] +mla v6.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v21.4s +nop +mla v19.4S, v18.4S, v31.s[0] +mla v2.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v21.4s +nop +sqrdmulh v21.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v15.4s, v8.4s, v0.4s +nop +sqrdmulh v18.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +add v8.4s, v8.4s, v0.4s +nop +sqrdmulh v0.4S, v9.4S, v27.s[3] +mul v9.4S, v9.4S,v28.s[3] +sub v20.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[3] +mul v14.4S, v14.4S,v28.s[3] +sub v12.4s, v1.4s, v6.4s +add v1.4s, v1.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v16.4s, v19.4s +nop +mla v9.4S, v0.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +nop +sqrdmulh v19.4S, v1.4S, v25.s[2] +mul v1.4S, v1.4S,v26.s[2] +sub v7.4s, v3.4s, v2.4s +nop +sqrdmulh v0.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +add v3.4s, v3.4s, v2.4s +nop +sqrdmulh v2.4S, v7.4S, v25.s[1] +mul v7.4S, v7.4S,v26.s[1] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v25.s[0] +mul v3.4S, v3.4S,v26.s[0] +sub v6.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v1.4S, v19.4S, v31.s[0] +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v9.4s +nop +mla v7.4S, v2.4S, v31.s[0] +mla v3.4S, v17.4S, v31.s[0] +add v22.4s, v22.4s, v9.4s +nop +sqrdmulh v9.4S, v8.4S, v23.s[0] +mul v8.4S, v8.4S,v24.s[0] +sub v17.4s, v15.4s, v14.4s +nop +sqrdmulh v2.4S, v6.4S, v23.s[1] +mul v6.4S, v6.4S,v24.s[1] +add v15.4s, v15.4s, v14.4s +nop +sqrdmulh v14.4S, v15.4S, v23.s[2] +mul v15.4S, v15.4S,v24.s[2] +sub v19.4s, v13.4s, v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +sub v11.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +mla v8.4S, v9.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v18.4s, v7.4s +str q13, [x0, #288] +mla v15.4S, v14.4S, v31.s[0] +mla v17.4S, 
v1.4S, v31.s[0] +add v18.4s, v18.4s, v7.4s +str q19, [x0, #352] +ldr q19, [x0, #944] +sqrdmulh v7.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v1.4s, v16.4s, v3.4s +str q20, [x0, #416] +ldr q20, [x0, #1008] +sqrdmulh v14.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v3.4s +str q11, [x0, #480] +ldr q11, [x0, #816] +sqrdmulh v3.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +sub v13.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +ldr q8, [x0, #880] +sqrdmulh v9.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v12.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +mla v19.4S, v7.4S, v31.s[0] +mla v20.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v15.4s +str q18, [x0, #160] +mla v11.4S, v3.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +str q2, [x0, #224] +ldr q2, [x0, #560] +sqrdmulh v15.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v9.4s, v0.4s, v17.4s +str q16, [x0, #32] +ldr q16, [x0, #624] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +add v0.4s, v0.4s, v17.4s +str q1, [x0, #96] +ldr q1, [x0, #688] +ldr q17, [x0, #432] +sqrdmulh v18.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +sub v7.4s, v17.4s, v19.4s +add v17.4s, v17.4s, v19.4s +ldr q19, [x0, #752] +ldr q6, [x0, #496] +sqrdmulh v5.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v4.4s, v6.4s, v20.4s +add v6.4s, v6.4s, v20.4s +ldr q20, [x0, #304] +mla v2.4S, v15.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v11.4s +str q10, [x0, #544] +mla v1.4S, v18.4S, v31.s[0] +mla v19.4S, v5.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q13, [x0, #608] +ldr q13, [x0, #368] +sqrdmulh v11.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v5.4s, v13.4s, v8.4s +str q21, [x0, #672] +sqrdmulh v21.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +add v13.4s, v13.4s, v8.4s +str q12, [x0, #736] +ldr q12, [x0, #48] +sqrdmulh v8.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v18.4s, v12.4s, v2.4s +add v12.4s, v12.4s, v2.4s +ldr 
q2, [x0, #112] +sqrdmulh v10.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v15.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +ldr q16, [x0, #176] +mla v17.4S, v11.4S, v31.s[0] +mla v6.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v1.4s +str q22, [x0, #800] +mla v20.4S, v8.4S, v31.s[0] +mla v13.4S, v10.4S, v31.s[0] +add v16.4s, v16.4s, v1.4s +str q14, [x0, #864] +ldr q14, [x0, #240] +sqrdmulh v1.4S, v7.4S, v29.s[2] +mul v7.4S, v7.4S,v30.s[2] +sub v10.4s, v14.4s, v19.4s +str q0, [x0, #928] +sqrdmulh v0.4S, v4.4S, v29.s[2] +mul v4.4S, v4.4S,v30.s[2] +add v14.4s, v14.4s, v19.4s +str q9, [x0, #992] +sqrdmulh v9.4S, v3.4S, v29.s[2] +mul v3.4S, v3.4S,v30.s[2] +sub v19.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v29.s[2] +mul v5.4S, v5.4S,v30.s[2] +sub v8.4s, v14.4s, v6.4s +add v14.4s, v14.4s, v6.4s +mla v7.4S, v1.4S, v31.s[0] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v20.4s +nop +mla v3.4S, v9.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v20.4s +nop +sqrdmulh v20.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +sub v17.4s, v2.4s, v13.4s +nop +sqrdmulh v9.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +add v2.4s, v2.4s, v13.4s +nop +sqrdmulh v13.4S, v16.4S, v27.s[0] +mul v16.4S, v16.4S,v28.s[0] +sub v1.4s, v21.4s, v7.4s +add v21.4s, v21.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v6.4s, v10.4s, v4.4s +add v10.4s, v10.4s, v4.4s +mla v19.4S, v20.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v3.4s +nop +mla v16.4S, v13.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +nop +sqrdmulh v3.4S, v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +sub v7.4s, v15.4s, v5.4s +nop +sqrdmulh v13.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +add v15.4s, v15.4s, v5.4s +nop +sqrdmulh v5.4S, v1.4S, v27.s[3] +mul v1.4S, v1.4S,v28.s[3] +sub v20.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[3] +mul v6.4S, v6.4S,v28.s[3] +sub v4.4s, v17.4s, 
v8.4s +add v17.4s, v17.4s, v8.4s +mla v21.4S, v3.4S, v31.s[0] +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v16.4s +nop +mla v1.4S, v5.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +nop +sqrdmulh v16.4S, v17.4S, v25.s[2] +mul v17.4S, v17.4S,v26.s[2] +sub v19.4s, v2.4s, v14.4s +nop +sqrdmulh v5.4S, v4.4S, v25.s[3] +mul v4.4S, v4.4S,v26.s[3] +add v2.4s, v2.4s, v14.4s +nop +sqrdmulh v14.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v3.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +mla v17.4S, v16.4S, v31.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v9.4s, v1.4s +nop +mla v19.4S, v14.4S, v31.s[0] +mla v2.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +nop +sqrdmulh v1.4S, v15.4S, v23.s[0] +mul v15.4S, v15.4S,v24.s[0] +sub v21.4s, v7.4s, v6.4s +nop +sqrdmulh v14.4S, v8.4S, v23.s[1] +mul v8.4S, v8.4S,v24.s[1] +add v7.4s, v7.4s, v6.4s +nop +sqrdmulh v6.4S, v7.4S, v23.s[2] +mul v7.4S, v7.4S,v24.s[2] +sub v16.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v23.s[3] +mul v21.4S, v21.4S,v24.s[3] +sub v10.4s, v20.4s, v4.4s +add v20.4s, v20.4s, v4.4s +mla v15.4S, v1.4S, v31.s[0] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v19.4s +str q0, [x0, #304] +mla v7.4S, v6.4S, v31.s[0] +mla v21.4S, v17.4S, v31.s[0] +add v13.4s, v13.4s, v19.4s +str q16, [x0, #368] +ldr q16, [x0, #896] +sqrdmulh v19.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v17.4s, v12.4s, v2.4s +str q20, [x0, #432] +ldr q20, [x0, #960] +sqrdmulh v6.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v12.4s, v12.4s, v2.4s +str q10, [x0, #496] +ldr q10, [x0, #768] +sqrdmulh v2.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +sub v0.4s, v18.4s, v15.4s +add v18.4s, v18.4s, v15.4s +ldr q15, [x0, #832] +sqrdmulh v1.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +sub v4.4s, v3.4s, v8.4s +add v3.4s, v3.4s, v8.4s +mla v16.4S, 
v19.4S, v31.s[0] +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v7.4s +str q13, [x0, #176] +mla v10.4S, v2.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v7.4s +str q14, [x0, #240] +ldr q14, [x0, #512] +sqrdmulh v7.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v1.4s, v5.4s, v21.4s +str q12, [x0, #48] +ldr q12, [x0, #576] +sqrdmulh v2.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +add v5.4s, v5.4s, v21.4s +str q17, [x0, #112] +ldr q17, [x0, #640] +ldr q21, [x0, #384] +sqrdmulh v13.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] +sub v19.4s, v21.4s, v16.4s +add v21.4s, v21.4s, v16.4s +ldr q16, [x0, #704] +ldr q8, [x0, #448] +sqrdmulh v22.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v11.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +ldr q20, [x0, #256] +mla v14.4S, v7.4S, v31.s[0] +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v10.4s +str q18, [x0, #560] +mla v17.4S, v13.4S, v31.s[0] +mla v16.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +str q0, [x0, #624] +ldr q0, [x0, #320] +sqrdmulh v10.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v22.4s, v0.4s, v15.4s +str q3, [x0, #688] +sqrdmulh v3.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +add v0.4s, v0.4s, v15.4s +str q4, [x0, #752] +ldr q4, [x0, #0] +sqrdmulh v15.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v13.4s, v4.4s, v14.4s +add v4.4s, v4.4s, v14.4s +ldr q14, [x0, #64] +sqrdmulh v18.4S, v0.4S, v29.s[1] +mul v0.4S, v0.4S,v30.s[1] +sub v7.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +ldr q12, [x0, #128] +mla v21.4S, v10.4S, v31.s[0] +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v12.4s, v17.4s +str q9, [x0, #816] +mla v20.4S, v15.4S, v31.s[0] +mla v0.4S, v18.4S, v31.s[0] +add v12.4s, v12.4s, v17.4s +str q6, [x0, #880] +ldr q6, [x0, #192] +sqrdmulh v17.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v18.4s, v6.4s, v16.4s +str q5, [x0, #944] +sqrdmulh v5.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +add v6.4s, v6.4s, v16.4s +str q1, [x0, #1008] 
+sqrdmulh v1.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v16.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v15.4s, v6.4s, v8.4s +add v6.4s, v6.4s, v8.4s +mla v19.4S, v17.4S, v31.s[0] +mla v11.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v20.4s +nop +mla v2.4S, v1.4S, v31.s[0] +mla v22.4S, v21.4S, v31.s[0] +add v4.4s, v4.4s, v20.4s +nop +sqrdmulh v20.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +sub v21.4s, v14.4s, v0.4s +nop +sqrdmulh v1.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +add v14.4s, v14.4s, v0.4s +nop +sqrdmulh v0.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v17.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[0] +mul v6.4S, v6.4S,v28.s[0] +sub v8.4s, v18.4s, v11.4s +add v18.4s, v18.4s, v11.4s +mla v16.4S, v20.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v2.4s +nop +mla v12.4S, v0.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +nop +sqrdmulh v2.4S, v3.4S, v27.s[2] +mul v3.4S, v3.4S,v28.s[2] +sub v19.4s, v7.4s, v22.4s +nop +sqrdmulh v0.4S, v18.4S, v27.s[2] +mul v18.4S, v18.4S,v28.s[2] +add v7.4s, v7.4s, v22.4s +nop +sqrdmulh v22.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +sub v20.4s, v5.4s, v16.4s +add v5.4s, v5.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v11.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +mla v3.4S, v2.4S, v31.s[0] +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v4.4s, v12.4s +nop +mla v17.4S, v22.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v4.4s, v4.4s, v12.4s +nop +sqrdmulh v12.4S, v21.4S, v25.s[2] +mul v21.4S, v21.4S,v26.s[2] +sub v16.4s, v14.4s, v6.4s +nop +sqrdmulh v22.4S, v11.4S, v25.s[3] +mul v11.4S, v11.4S,v26.s[3] +add v14.4s, v14.4s, v6.4s +nop +sqrdmulh v6.4S, v16.4S, v25.s[1] +mul v16.4S, v16.4S,v26.s[1] +sub v2.4s, v13.4s, v3.4s +add v13.4s, v13.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v25.s[0] +mul v14.4S, v14.4S,v26.s[0] +sub v15.4s, 
v7.4s, v18.4s +add v7.4s, v7.4s, v18.4s +mla v21.4S, v12.4S, v31.s[0] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v17.4s +nop +mla v16.4S, v6.4S, v31.s[0] +mla v14.4S, v3.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +nop +sqrdmulh v17.4S, v7.4S, v23.s[0] +mul v7.4S, v7.4S,v24.s[0] +sub v3.4s, v19.4s, v8.4s +nop +sqrdmulh v6.4S, v15.4S, v23.s[1] +mul v15.4S, v15.4S,v24.s[1] +add v19.4s, v19.4s, v8.4s +nop +sqrdmulh v8.4S, v19.4S, v23.s[2] +mul v19.4S, v19.4S,v24.s[2] +sub v12.4s, v5.4s, v21.4s +add v5.4s, v5.4s, v21.4s +sqrdmulh v21.4S, v3.4S, v23.s[3] +mul v3.4S, v3.4S,v24.s[3] +sub v18.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +mla v7.4S, v17.4S, v31.s[0] +mla v15.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v16.4s +str q5, [x0, #256] +mla v19.4S, v8.4S, v31.s[0] +mla v3.4S, v21.4S, v31.s[0] +add v0.4s, v0.4s, v16.4s +str q12, [x0, #320] +ldr q12, [x0, #912] +sqrdmulh v16.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v21.4s, v4.4s, v14.4s +str q20, [x0, #384] +ldr q20, [x0, #976] +sqrdmulh v8.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v4.4s, v4.4s, v14.4s +str q18, [x0, #448] +ldr q18, [x0, #784] +sqrdmulh v14.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +sub v5.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +ldr q7, [x0, #848] +sqrdmulh v17.4S, v7.4S, v29.s[0] +mul v7.4S, v7.4S,v30.s[0] +sub v11.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +mla v12.4S, v16.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v1.4s, v19.4s +str q0, [x0, #128] +mla v18.4S, v14.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v19.4s +str q6, [x0, #192] +ldr q6, [x0, #528] +sqrdmulh v19.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v17.4s, v22.4s, v3.4s +str q4, [x0, #0] +ldr q4, [x0, #592] +sqrdmulh v14.4S, v4.4S, v29.s[0] +mul v4.4S, v4.4S,v30.s[0] +add v22.4s, v22.4s, v3.4s +str q21, [x0, #64] +ldr q21, [x0, #656] +ldr q3, [x0, #400] +sqrdmulh v0.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v16.4s, v3.4s, v12.4s +add v3.4s, v3.4s, 
v12.4s +ldr q12, [x0, #720] +ldr q15, [x0, #464] +sqrdmulh v9.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v10.4s, v15.4s, v20.4s +add v15.4s, v15.4s, v20.4s +ldr q20, [x0, #272] +mla v6.4S, v19.4S, v31.s[0] +mla v4.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v18.4s +str q13, [x0, #512] +mla v21.4S, v0.4S, v31.s[0] +mla v12.4S, v9.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +str q5, [x0, #576] +ldr q5, [x0, #336] +sqrdmulh v18.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v9.4s, v5.4s, v7.4s +str q2, [x0, #640] +sqrdmulh v2.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +add v5.4s, v5.4s, v7.4s +str q11, [x0, #704] +ldr q11, [x0, #16] +sqrdmulh v7.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v0.4s, v11.4s, v6.4s +add v11.4s, v11.4s, v6.4s +ldr q6, [x0, #80] +sqrdmulh v13.4S, v5.4S, v29.s[1] +mul v5.4S, v5.4S,v30.s[1] +sub v19.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +ldr q4, [x0, #144] +mla v3.4S, v18.4S, v31.s[0] +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v4.4s, v21.4s +str q1, [x0, #768] +mla v20.4S, v7.4S, v31.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v4.4s, v4.4s, v21.4s +str q8, [x0, #832] +ldr q8, [x0, #208] +sqrdmulh v21.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +sub v13.4s, v8.4s, v12.4s +str q22, [x0, #896] +sqrdmulh v22.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +add v8.4s, v8.4s, v12.4s +str q17, [x0, #960] +sqrdmulh v17.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v12.4s, v4.4s, v3.4s +add v4.4s, v4.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v7.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +mla v16.4S, v21.4S, v31.s[0] +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v20.4s +nop +mla v14.4S, v17.4S, v31.s[0] +mla v9.4S, v3.4S, v31.s[0] +add v11.4s, v11.4s, v20.4s +nop +sqrdmulh v20.4S, v12.4S, v27.s[1] +mul v12.4S, v12.4S,v28.s[1] +sub v3.4s, v6.4s, v5.4s +nop +sqrdmulh v17.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +add v6.4s, v6.4s, v5.4s +nop +sqrdmulh v5.4S, v4.4S, 
v27.s[0] +mul v4.4S, v4.4S,v28.s[0] +sub v21.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[0] +mul v8.4S, v8.4S,v28.s[0] +sub v15.4s, v13.4s, v10.4s +add v13.4s, v13.4s, v10.4s +mla v12.4S, v20.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v14.4s +nop +mla v4.4S, v5.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v0.4s, v0.4s, v14.4s +nop +sqrdmulh v14.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v16.4s, v19.4s, v9.4s +nop +sqrdmulh v5.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +add v19.4s, v19.4s, v9.4s +nop +sqrdmulh v9.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v10.4s, v3.4s, v7.4s +add v3.4s, v3.4s, v7.4s +mla v2.4S, v14.4S, v31.s[0] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v4.4s +nop +mla v21.4S, v9.4S, v31.s[0] +mla v15.4S, v12.4S, v31.s[0] +add v11.4s, v11.4s, v4.4s +nop +sqrdmulh v4.4S, v3.4S, v25.s[2] +mul v3.4S, v3.4S,v26.s[2] +sub v12.4s, v6.4s, v8.4s +nop +sqrdmulh v9.4S, v10.4S, v25.s[3] +mul v10.4S, v10.4S,v26.s[3] +add v6.4s, v6.4s, v8.4s +nop +sqrdmulh v8.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +sub v14.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v6.4S, v25.s[0] +mul v6.4S, v6.4S,v26.s[0] +sub v7.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +mla v3.4S, v4.4S, v31.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v21.4s +nop +mla v12.4S, v8.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +nop +sqrdmulh v21.4S, v19.4S, v23.s[0] +mul v19.4S, v19.4S,v24.s[0] +sub v2.4s, v16.4s, v15.4s +nop +sqrdmulh v8.4S, v7.4S, v23.s[1] +mul v7.4S, v7.4S,v24.s[1] +add v16.4s, v16.4s, v15.4s +nop +sqrdmulh v15.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +sub v4.4s, v22.4s, v3.4s +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v2.4S, v23.s[3] +mul v2.4S, v2.4S,v24.s[3] +sub v13.4s, v20.4s, v10.4s +add v20.4s, v20.4s, 
v10.4s +mla v19.4S, v21.4S, v31.s[0] +mla v7.4S, v8.4S, v31.s[0] +sub v8.4s, v5.4s, v12.4s +str q22, [x0, #272] +mla v16.4S, v15.4S, v31.s[0] +mla v2.4S, v3.4S, v31.s[0] +add v5.4s, v5.4s, v12.4s +str q4, [x0, #336] +sub v23.4s, v11.4s, v6.4s +str q20, [x0, #400] +add v11.4s, v11.4s, v6.4s +str q13, [x0, #464] +sub v13.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sub v19.4s, v14.4s, v7.4s +add v14.4s, v14.4s, v7.4s +sub v7.4s, v17.4s, v16.4s +str q5, [x0, #144] +add v17.4s, v17.4s, v16.4s +str q8, [x0, #208] +sub v8.4s, v9.4s, v2.4s +str q11, [x0, #16] +add v9.4s, v9.4s, v2.4s +str q23, [x0, #80] +str q0, [x0, #528] +str q13, [x0, #592] +str q14, [x0, #656] +str q19, [x0, #720] +str q17, [x0, #784] +str q7, [x0, #848] +str q9, [x0, #912] +str q8, [x0, #976] +ldr q18, [x17, #+128] +ldr q1, [x17, #+144] +ldr q10, [x17, #+160] +ldr q21, [x17, #+176] +ldr q22, [x17, #+192] +ldr q15, [x17, #+208] +ldr q3, [x17, #+224] +ldr q12, [x17, #+240] +ldr q4, [x0, #32] +ldr q30, [x0, #48] +ldr q29, [x0, #0] +ldr q28, [x0, #16] +sqrdmulh v27.4S, v4.4S, v1.s[0] +mul v4.4S, v4.4S,v18.s[0] +mla v4.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v4.4s +add v29.4s, v29.4s, v4.4s +sqrdmulh v4.4S, v30.4S, v1.s[0] +mul v30.4S, v30.4S,v18.s[0] +mla v30.4S, v4.4S, v31.s[0] +sub v4.4s, v28.4s, v30.4s +add v28.4s, v28.4s, v30.4s +sqrdmulh v30.4S, v28.4S, v1.s[1] +mul v28.4S, v28.4S,v18.s[1] +mla v28.4S, v30.4S, v31.s[0] +sub v30.4s, v29.4s, v28.4s +add v29.4s, v29.4s, v28.4s +sqrdmulh v28.4S, v4.4S, v1.s[2] +mul v4.4S, v4.4S,v18.s[2] +mla v4.4S, v28.4S, v31.s[0] +sub v28.4s, v27.4s, v4.4s +add v27.4s, v27.4s, v4.4s +trn1 v4.4S, v29.4S, v30.4S +trn2 v26.4S, v29.4S, v30.4S +trn1 v25.4S, v27.4S, v28.4S +trn2 v24.4S, v27.4S, v28.4S +trn2 v27.2D, v4.2D, v25.2D +trn2 v28.2D, v26.2D, v24.2D +trn1 v29.2D, v4.2D, v25.2D +trn1 v30.2D, v26.2D, v24.2D +sqrdmulh v24.4S, v27.4S, v21.4S +mul v27.4S, v27.4S,v10.4S +mla v27.4S, v24.4S, v31.s[0] +sub v24.4s, v29.4s, v27.4s +add v29.4s, v29.4s, v27.4s +sqrdmulh 
v27.4S, v28.4S, v21.4S +mul v28.4S, v28.4S,v10.4S +mla v28.4S, v27.4S, v31.s[0] +sub v27.4s, v30.4s, v28.4s +add v30.4s, v30.4s, v28.4s +sqrdmulh v28.4S, v30.4S, v15.4S +mul v30.4S, v30.4S,v22.4S +mla v30.4S, v28.4S, v31.s[0] +sub v28.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +sqrdmulh v30.4S, v27.4S, v12.4S +mul v27.4S, v27.4S,v3.4S +mla v27.4S, v30.4S, v31.s[0] +sub v30.4s, v24.4s, v27.4s +add v24.4s, v24.4s, v27.4s +str q29, [x0, #0] +str q28, [x0, #16] +str q24, [x0, #32] +str q30, [x0, #48] +ldr q30, [x17, #+256] +ldr q24, [x17, #+272] +ldr q28, [x17, #+288] +ldr q29, [x17, #+304] +ldr q27, [x17, #+320] +ldr q26, [x17, #+336] +ldr q25, [x17, #+352] +ldr q4, [x17, #+368] +ldr q12, [x0, #96] +ldr q3, [x0, #112] +ldr q15, [x0, #64] +ldr q22, [x0, #80] +sqrdmulh v21.4S, v12.4S, v24.s[0] +mul v12.4S, v12.4S,v30.s[0] +mla v12.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v12.4s +add v15.4s, v15.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v24.s[0] +mul v3.4S, v3.4S,v30.s[0] +mla v3.4S, v12.4S, v31.s[0] +sub v12.4s, v22.4s, v3.4s +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v22.4S, v24.s[1] +mul v22.4S, v22.4S,v30.s[1] +mla v22.4S, v3.4S, v31.s[0] +sub v3.4s, v15.4s, v22.4s +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v12.4S, v24.s[2] +mul v12.4S, v12.4S,v30.s[2] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +trn1 v12.4S, v15.4S, v3.4S +trn2 v10.4S, v15.4S, v3.4S +trn1 v1.4S, v21.4S, v22.4S +trn2 v18.4S, v21.4S, v22.4S +trn2 v21.2D, v12.2D, v1.2D +trn2 v22.2D, v10.2D, v18.2D +trn1 v15.2D, v12.2D, v1.2D +trn1 v3.2D, v10.2D, v18.2D +sqrdmulh v18.4S, v21.4S, v29.4S +mul v21.4S, v21.4S,v28.4S +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v15.4s, v21.4s +add v15.4s, v15.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.4S +mul v22.4S, v22.4S,v28.4S +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v3.4s, v22.4s +add v3.4s, v3.4s, v22.4s +sqrdmulh v22.4S, v3.4S, v26.4S +mul v3.4S, v3.4S,v27.4S +mla v3.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v3.4s +add 
v15.4s, v15.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v4.4S +mul v21.4S, v21.4S,v25.4S +mla v21.4S, v3.4S, v31.s[0] +sub v3.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +str q15, [x0, #64] +str q22, [x0, #80] +str q18, [x0, #96] +str q3, [x0, #112] +ldr q3, [x17, #+384] +ldr q18, [x17, #+400] +ldr q22, [x17, #+416] +ldr q15, [x17, #+432] +ldr q21, [x17, #+448] +ldr q10, [x17, #+464] +ldr q1, [x17, #+480] +ldr q12, [x17, #+496] +ldr q4, [x0, #160] +ldr q25, [x0, #176] +ldr q26, [x0, #128] +ldr q27, [x0, #144] +sqrdmulh v29.4S, v4.4S, v18.s[0] +mul v4.4S, v4.4S,v3.s[0] +mla v4.4S, v29.4S, v31.s[0] +sub v29.4s, v26.4s, v4.4s +add v26.4s, v26.4s, v4.4s +sqrdmulh v4.4S, v25.4S, v18.s[0] +mul v25.4S, v25.4S,v3.s[0] +mla v25.4S, v4.4S, v31.s[0] +sub v4.4s, v27.4s, v25.4s +add v27.4s, v27.4s, v25.4s +sqrdmulh v25.4S, v27.4S, v18.s[1] +mul v27.4S, v27.4S,v3.s[1] +mla v27.4S, v25.4S, v31.s[0] +sub v25.4s, v26.4s, v27.4s +add v26.4s, v26.4s, v27.4s +sqrdmulh v27.4S, v4.4S, v18.s[2] +mul v4.4S, v4.4S,v3.s[2] +mla v4.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v4.4s +add v29.4s, v29.4s, v4.4s +trn1 v4.4S, v26.4S, v25.4S +trn2 v28.4S, v26.4S, v25.4S +trn1 v24.4S, v29.4S, v27.4S +trn2 v30.4S, v29.4S, v27.4S +trn2 v29.2D, v4.2D, v24.2D +trn2 v27.2D, v28.2D, v30.2D +trn1 v26.2D, v4.2D, v24.2D +trn1 v25.2D, v28.2D, v30.2D +sqrdmulh v30.4S, v29.4S, v15.4S +mul v29.4S, v29.4S,v22.4S +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v26.4s, v29.4s +add v26.4s, v26.4s, v29.4s +sqrdmulh v29.4S, v27.4S, v15.4S +mul v27.4S, v27.4S,v22.4S +mla v27.4S, v29.4S, v31.s[0] +sub v29.4s, v25.4s, v27.4s +add v25.4s, v25.4s, v27.4s +sqrdmulh v27.4S, v25.4S, v10.4S +mul v25.4S, v25.4S,v21.4S +mla v25.4S, v27.4S, v31.s[0] +sub v27.4s, v26.4s, v25.4s +add v26.4s, v26.4s, v25.4s +sqrdmulh v25.4S, v29.4S, v12.4S +mul v29.4S, v29.4S,v1.4S +mla v29.4S, v25.4S, v31.s[0] +sub v25.4s, v30.4s, v29.4s +add v30.4s, v30.4s, v29.4s +str q26, [x0, #128] +str q27, [x0, #144] +str q30, [x0, #160] +str q25, [x0, #176] +ldr q25, 
[x17, #+512] +ldr q30, [x17, #+528] +ldr q27, [x17, #+544] +ldr q26, [x17, #+560] +ldr q29, [x17, #+576] +ldr q28, [x17, #+592] +ldr q24, [x17, #+608] +ldr q4, [x17, #+624] +ldr q12, [x0, #224] +ldr q1, [x0, #240] +ldr q10, [x0, #192] +ldr q21, [x0, #208] +sqrdmulh v15.4S, v12.4S, v30.s[0] +mul v12.4S, v12.4S,v25.s[0] +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v1.4S, v30.s[0] +mul v1.4S, v1.4S,v25.s[0] +mla v1.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v1.4s +add v21.4s, v21.4s, v1.4s +sqrdmulh v1.4S, v21.4S, v30.s[1] +mul v21.4S, v21.4S,v25.s[1] +mla v21.4S, v1.4S, v31.s[0] +sub v1.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v12.4S, v30.s[2] +mul v12.4S, v12.4S,v25.s[2] +mla v12.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v12.4s +add v15.4s, v15.4s, v12.4s +trn1 v12.4S, v10.4S, v1.4S +trn2 v22.4S, v10.4S, v1.4S +trn1 v18.4S, v15.4S, v21.4S +trn2 v3.4S, v15.4S, v21.4S +trn2 v15.2D, v12.2D, v18.2D +trn2 v21.2D, v22.2D, v3.2D +trn1 v10.2D, v12.2D, v18.2D +trn1 v1.2D, v22.2D, v3.2D +sqrdmulh v3.4S, v15.4S, v26.4S +mul v15.4S, v15.4S,v27.4S +mla v15.4S, v3.4S, v31.s[0] +sub v3.4s, v10.4s, v15.4s +add v10.4s, v10.4s, v15.4s +sqrdmulh v15.4S, v21.4S, v26.4S +mul v21.4S, v21.4S,v27.4S +mla v21.4S, v15.4S, v31.s[0] +sub v15.4s, v1.4s, v21.4s +add v1.4s, v1.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v28.4S +mul v1.4S, v1.4S,v29.4S +mla v1.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v1.4s +add v10.4s, v10.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v4.4S +mul v15.4S, v15.4S,v24.4S +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v3.4s, v15.4s +add v3.4s, v3.4s, v15.4s +str q10, [x0, #192] +str q21, [x0, #208] +str q3, [x0, #224] +str q1, [x0, #240] +ldr q1, [x17, #+640] +ldr q3, [x17, #+656] +ldr q21, [x17, #+672] +ldr q10, [x17, #+688] +ldr q15, [x17, #+704] +ldr q22, [x17, #+720] +ldr q18, [x17, #+736] +ldr q12, [x17, #+752] +ldr q4, [x0, #288] +ldr q24, [x0, #304] +ldr q28, [x0, #256] +ldr q29, [x0, #272] +sqrdmulh 
v26.4S, v4.4S, v3.s[0] +mul v4.4S, v4.4S,v1.s[0] +mla v4.4S, v26.4S, v31.s[0] +sub v26.4s, v28.4s, v4.4s +add v28.4s, v28.4s, v4.4s +sqrdmulh v4.4S, v24.4S, v3.s[0] +mul v24.4S, v24.4S,v1.s[0] +mla v24.4S, v4.4S, v31.s[0] +sub v4.4s, v29.4s, v24.4s +add v29.4s, v29.4s, v24.4s +sqrdmulh v24.4S, v29.4S, v3.s[1] +mul v29.4S, v29.4S,v1.s[1] +mla v29.4S, v24.4S, v31.s[0] +sub v24.4s, v28.4s, v29.4s +add v28.4s, v28.4s, v29.4s +sqrdmulh v29.4S, v4.4S, v3.s[2] +mul v4.4S, v4.4S,v1.s[2] +mla v4.4S, v29.4S, v31.s[0] +sub v29.4s, v26.4s, v4.4s +add v26.4s, v26.4s, v4.4s +trn1 v4.4S, v28.4S, v24.4S +trn2 v27.4S, v28.4S, v24.4S +trn1 v30.4S, v26.4S, v29.4S +trn2 v25.4S, v26.4S, v29.4S +trn2 v26.2D, v4.2D, v30.2D +trn2 v29.2D, v27.2D, v25.2D +trn1 v28.2D, v4.2D, v30.2D +trn1 v24.2D, v27.2D, v25.2D +sqrdmulh v25.4S, v26.4S, v10.4S +mul v26.4S, v26.4S,v21.4S +mla v26.4S, v25.4S, v31.s[0] +sub v25.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sqrdmulh v26.4S, v29.4S, v10.4S +mul v29.4S, v29.4S,v21.4S +mla v29.4S, v26.4S, v31.s[0] +sub v26.4s, v24.4s, v29.4s +add v24.4s, v24.4s, v29.4s +sqrdmulh v29.4S, v24.4S, v22.4S +mul v24.4S, v24.4S,v15.4S +mla v24.4S, v29.4S, v31.s[0] +sub v29.4s, v28.4s, v24.4s +add v28.4s, v28.4s, v24.4s +sqrdmulh v24.4S, v26.4S, v12.4S +mul v26.4S, v26.4S,v18.4S +mla v26.4S, v24.4S, v31.s[0] +sub v24.4s, v25.4s, v26.4s +add v25.4s, v25.4s, v26.4s +str q28, [x0, #256] +str q29, [x0, #272] +str q25, [x0, #288] +str q24, [x0, #304] +ldr q24, [x17, #+768] +ldr q25, [x17, #+784] +ldr q29, [x17, #+800] +ldr q28, [x17, #+816] +ldr q26, [x17, #+832] +ldr q27, [x17, #+848] +ldr q30, [x17, #+864] +ldr q4, [x17, #+880] +ldr q12, [x0, #352] +ldr q18, [x0, #368] +ldr q22, [x0, #320] +ldr q15, [x0, #336] +sqrdmulh v10.4S, v12.4S, v25.s[0] +mul v12.4S, v12.4S,v24.s[0] +mla v12.4S, v10.4S, v31.s[0] +sub v10.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v18.4S, v25.s[0] +mul v18.4S, v18.4S,v24.s[0] +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v15.4s, 
v18.4s +add v15.4s, v15.4s, v18.4s +sqrdmulh v18.4S, v15.4S, v25.s[1] +mul v15.4S, v15.4S,v24.s[1] +mla v15.4S, v18.4S, v31.s[0] +sub v18.4s, v22.4s, v15.4s +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v12.4S, v25.s[2] +mul v12.4S, v12.4S,v24.s[2] +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +trn1 v12.4S, v22.4S, v18.4S +trn2 v21.4S, v22.4S, v18.4S +trn1 v3.4S, v10.4S, v15.4S +trn2 v1.4S, v10.4S, v15.4S +trn2 v10.2D, v12.2D, v3.2D +trn2 v15.2D, v21.2D, v1.2D +trn1 v22.2D, v12.2D, v3.2D +trn1 v18.2D, v21.2D, v1.2D +sqrdmulh v1.4S, v10.4S, v28.4S +mul v10.4S, v10.4S,v29.4S +mla v10.4S, v1.4S, v31.s[0] +sub v1.4s, v22.4s, v10.4s +add v22.4s, v22.4s, v10.4s +sqrdmulh v10.4S, v15.4S, v28.4S +mul v15.4S, v15.4S,v29.4S +mla v15.4S, v10.4S, v31.s[0] +sub v10.4s, v18.4s, v15.4s +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v18.4S, v27.4S +mul v18.4S, v18.4S,v26.4S +mla v18.4S, v15.4S, v31.s[0] +sub v15.4s, v22.4s, v18.4s +add v22.4s, v22.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v4.4S +mul v10.4S, v10.4S,v30.4S +mla v10.4S, v18.4S, v31.s[0] +sub v18.4s, v1.4s, v10.4s +add v1.4s, v1.4s, v10.4s +str q22, [x0, #320] +str q15, [x0, #336] +str q1, [x0, #352] +str q18, [x0, #368] +ldr q18, [x17, #+896] +ldr q1, [x17, #+912] +ldr q15, [x17, #+928] +ldr q22, [x17, #+944] +ldr q10, [x17, #+960] +ldr q21, [x17, #+976] +ldr q3, [x17, #+992] +ldr q12, [x17, #+1008] +ldr q4, [x0, #416] +ldr q30, [x0, #432] +ldr q27, [x0, #384] +ldr q26, [x0, #400] +sqrdmulh v28.4S, v4.4S, v1.s[0] +mul v4.4S, v4.4S,v18.s[0] +mla v4.4S, v28.4S, v31.s[0] +sub v28.4s, v27.4s, v4.4s +add v27.4s, v27.4s, v4.4s +sqrdmulh v4.4S, v30.4S, v1.s[0] +mul v30.4S, v30.4S,v18.s[0] +mla v30.4S, v4.4S, v31.s[0] +sub v4.4s, v26.4s, v30.4s +add v26.4s, v26.4s, v30.4s +sqrdmulh v30.4S, v26.4S, v1.s[1] +mul v26.4S, v26.4S,v18.s[1] +mla v26.4S, v30.4S, v31.s[0] +sub v30.4s, v27.4s, v26.4s +add v27.4s, v27.4s, v26.4s +sqrdmulh v26.4S, v4.4S, v1.s[2] +mul v4.4S, v4.4S,v18.s[2] +mla v4.4S, 
v26.4S, v31.s[0] +sub v26.4s, v28.4s, v4.4s +add v28.4s, v28.4s, v4.4s +trn1 v4.4S, v27.4S, v30.4S +trn2 v29.4S, v27.4S, v30.4S +trn1 v25.4S, v28.4S, v26.4S +trn2 v24.4S, v28.4S, v26.4S +trn2 v28.2D, v4.2D, v25.2D +trn2 v26.2D, v29.2D, v24.2D +trn1 v27.2D, v4.2D, v25.2D +trn1 v30.2D, v29.2D, v24.2D +sqrdmulh v24.4S, v28.4S, v22.4S +mul v28.4S, v28.4S,v15.4S +mla v28.4S, v24.4S, v31.s[0] +sub v24.4s, v27.4s, v28.4s +add v27.4s, v27.4s, v28.4s +sqrdmulh v28.4S, v26.4S, v22.4S +mul v26.4S, v26.4S,v15.4S +mla v26.4S, v28.4S, v31.s[0] +sub v28.4s, v30.4s, v26.4s +add v30.4s, v30.4s, v26.4s +sqrdmulh v26.4S, v30.4S, v21.4S +mul v30.4S, v30.4S,v10.4S +mla v30.4S, v26.4S, v31.s[0] +sub v26.4s, v27.4s, v30.4s +add v27.4s, v27.4s, v30.4s +sqrdmulh v30.4S, v28.4S, v12.4S +mul v28.4S, v28.4S,v3.4S +mla v28.4S, v30.4S, v31.s[0] +sub v30.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +str q27, [x0, #384] +str q26, [x0, #400] +str q24, [x0, #416] +str q30, [x0, #432] +ldr q30, [x17, #+1024] +ldr q24, [x17, #+1040] +ldr q26, [x17, #+1056] +ldr q27, [x17, #+1072] +ldr q28, [x17, #+1088] +ldr q29, [x17, #+1104] +ldr q25, [x17, #+1120] +ldr q4, [x17, #+1136] +ldr q12, [x0, #480] +ldr q3, [x0, #496] +ldr q21, [x0, #448] +ldr q10, [x0, #464] +sqrdmulh v22.4S, v12.4S, v24.s[0] +mul v12.4S, v12.4S,v30.s[0] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v24.s[0] +mul v3.4S, v3.4S,v30.s[0] +mla v3.4S, v12.4S, v31.s[0] +sub v12.4s, v10.4s, v3.4s +add v10.4s, v10.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v24.s[1] +mul v10.4S, v10.4S,v30.s[1] +mla v10.4S, v3.4S, v31.s[0] +sub v3.4s, v21.4s, v10.4s +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v12.4S, v24.s[2] +mul v12.4S, v12.4S,v30.s[2] +mla v12.4S, v10.4S, v31.s[0] +sub v10.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +trn1 v12.4S, v21.4S, v3.4S +trn2 v15.4S, v21.4S, v3.4S +trn1 v1.4S, v22.4S, v10.4S +trn2 v18.4S, v22.4S, v10.4S +trn2 v22.2D, v12.2D, v1.2D +trn2 v10.2D, v15.2D, 
v18.2D +trn1 v21.2D, v12.2D, v1.2D +trn1 v3.2D, v15.2D, v18.2D +sqrdmulh v18.4S, v22.4S, v27.4S +mul v22.4S, v22.4S,v26.4S +mla v22.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v10.4S, v27.4S +mul v10.4S, v10.4S,v26.4S +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v3.4s, v10.4s +add v3.4s, v3.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v29.4S +mul v3.4S, v3.4S,v28.4S +mla v3.4S, v10.4S, v31.s[0] +sub v10.4s, v21.4s, v3.4s +add v21.4s, v21.4s, v3.4s +sqrdmulh v3.4S, v22.4S, v4.4S +mul v22.4S, v22.4S,v25.4S +mla v22.4S, v3.4S, v31.s[0] +sub v3.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +str q21, [x0, #448] +str q10, [x0, #464] +str q18, [x0, #480] +str q3, [x0, #496] +ldr q3, [x17, #+1152] +ldr q18, [x17, #+1168] +ldr q10, [x17, #+1184] +ldr q21, [x17, #+1200] +ldr q22, [x17, #+1216] +ldr q15, [x17, #+1232] +ldr q1, [x17, #+1248] +ldr q12, [x17, #+1264] +ldr q4, [x0, #544] +ldr q25, [x0, #560] +ldr q29, [x0, #512] +ldr q28, [x0, #528] +sqrdmulh v27.4S, v4.4S, v18.s[0] +mul v4.4S, v4.4S,v3.s[0] +mla v4.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v4.4s +add v29.4s, v29.4s, v4.4s +sqrdmulh v4.4S, v25.4S, v18.s[0] +mul v25.4S, v25.4S,v3.s[0] +mla v25.4S, v4.4S, v31.s[0] +sub v4.4s, v28.4s, v25.4s +add v28.4s, v28.4s, v25.4s +sqrdmulh v25.4S, v28.4S, v18.s[1] +mul v28.4S, v28.4S,v3.s[1] +mla v28.4S, v25.4S, v31.s[0] +sub v25.4s, v29.4s, v28.4s +add v29.4s, v29.4s, v28.4s +sqrdmulh v28.4S, v4.4S, v18.s[2] +mul v4.4S, v4.4S,v3.s[2] +mla v4.4S, v28.4S, v31.s[0] +sub v28.4s, v27.4s, v4.4s +add v27.4s, v27.4s, v4.4s +trn1 v4.4S, v29.4S, v25.4S +trn2 v26.4S, v29.4S, v25.4S +trn1 v24.4S, v27.4S, v28.4S +trn2 v30.4S, v27.4S, v28.4S +trn2 v27.2D, v4.2D, v24.2D +trn2 v28.2D, v26.2D, v30.2D +trn1 v29.2D, v4.2D, v24.2D +trn1 v25.2D, v26.2D, v30.2D +sqrdmulh v30.4S, v27.4S, v21.4S +mul v27.4S, v27.4S,v10.4S +mla v27.4S, v30.4S, v31.s[0] +sub v30.4s, v29.4s, v27.4s +add v29.4s, v29.4s, v27.4s +sqrdmulh v27.4S, v28.4S, v21.4S +mul v28.4S, 
v28.4S,v10.4S +mla v28.4S, v27.4S, v31.s[0] +sub v27.4s, v25.4s, v28.4s +add v25.4s, v25.4s, v28.4s +sqrdmulh v28.4S, v25.4S, v15.4S +mul v25.4S, v25.4S,v22.4S +mla v25.4S, v28.4S, v31.s[0] +sub v28.4s, v29.4s, v25.4s +add v29.4s, v29.4s, v25.4s +sqrdmulh v25.4S, v27.4S, v12.4S +mul v27.4S, v27.4S,v1.4S +mla v27.4S, v25.4S, v31.s[0] +sub v25.4s, v30.4s, v27.4s +add v30.4s, v30.4s, v27.4s +str q29, [x0, #512] +str q28, [x0, #528] +str q30, [x0, #544] +str q25, [x0, #560] +ldr q25, [x17, #+1280] +ldr q30, [x17, #+1296] +ldr q28, [x17, #+1312] +ldr q29, [x17, #+1328] +ldr q27, [x17, #+1344] +ldr q26, [x17, #+1360] +ldr q24, [x17, #+1376] +ldr q4, [x17, #+1392] +ldr q12, [x0, #608] +ldr q1, [x0, #624] +ldr q15, [x0, #576] +ldr q22, [x0, #592] +sqrdmulh v21.4S, v12.4S, v30.s[0] +mul v12.4S, v12.4S,v25.s[0] +mla v12.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v12.4s +add v15.4s, v15.4s, v12.4s +sqrdmulh v12.4S, v1.4S, v30.s[0] +mul v1.4S, v1.4S,v25.s[0] +mla v1.4S, v12.4S, v31.s[0] +sub v12.4s, v22.4s, v1.4s +add v22.4s, v22.4s, v1.4s +sqrdmulh v1.4S, v22.4S, v30.s[1] +mul v22.4S, v22.4S,v25.s[1] +mla v22.4S, v1.4S, v31.s[0] +sub v1.4s, v15.4s, v22.4s +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v12.4S, v30.s[2] +mul v12.4S, v12.4S,v25.s[2] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +trn1 v12.4S, v15.4S, v1.4S +trn2 v10.4S, v15.4S, v1.4S +trn1 v18.4S, v21.4S, v22.4S +trn2 v3.4S, v21.4S, v22.4S +trn2 v21.2D, v12.2D, v18.2D +trn2 v22.2D, v10.2D, v3.2D +trn1 v15.2D, v12.2D, v18.2D +trn1 v1.2D, v10.2D, v3.2D +sqrdmulh v3.4S, v21.4S, v29.4S +mul v21.4S, v21.4S,v28.4S +mla v21.4S, v3.4S, v31.s[0] +sub v3.4s, v15.4s, v21.4s +add v15.4s, v15.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.4S +mul v22.4S, v22.4S,v28.4S +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v26.4S +mul v1.4S, v1.4S,v27.4S +mla v1.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s 
+sqrdmulh v1.4S, v21.4S, v4.4S +mul v21.4S, v21.4S,v24.4S +mla v21.4S, v1.4S, v31.s[0] +sub v1.4s, v3.4s, v21.4s +add v3.4s, v3.4s, v21.4s +str q15, [x0, #576] +str q22, [x0, #592] +str q3, [x0, #608] +str q1, [x0, #624] +ldr q1, [x17, #+1408] +ldr q3, [x17, #+1424] +ldr q22, [x17, #+1440] +ldr q15, [x17, #+1456] +ldr q21, [x17, #+1472] +ldr q10, [x17, #+1488] +ldr q18, [x17, #+1504] +ldr q12, [x17, #+1520] +ldr q4, [x0, #672] +ldr q24, [x0, #688] +ldr q26, [x0, #640] +ldr q27, [x0, #656] +sqrdmulh v29.4S, v4.4S, v3.s[0] +mul v4.4S, v4.4S,v1.s[0] +mla v4.4S, v29.4S, v31.s[0] +sub v29.4s, v26.4s, v4.4s +add v26.4s, v26.4s, v4.4s +sqrdmulh v4.4S, v24.4S, v3.s[0] +mul v24.4S, v24.4S,v1.s[0] +mla v24.4S, v4.4S, v31.s[0] +sub v4.4s, v27.4s, v24.4s +add v27.4s, v27.4s, v24.4s +sqrdmulh v24.4S, v27.4S, v3.s[1] +mul v27.4S, v27.4S,v1.s[1] +mla v27.4S, v24.4S, v31.s[0] +sub v24.4s, v26.4s, v27.4s +add v26.4s, v26.4s, v27.4s +sqrdmulh v27.4S, v4.4S, v3.s[2] +mul v4.4S, v4.4S,v1.s[2] +mla v4.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v4.4s +add v29.4s, v29.4s, v4.4s +trn1 v4.4S, v26.4S, v24.4S +trn2 v28.4S, v26.4S, v24.4S +trn1 v30.4S, v29.4S, v27.4S +trn2 v25.4S, v29.4S, v27.4S +trn2 v29.2D, v4.2D, v30.2D +trn2 v27.2D, v28.2D, v25.2D +trn1 v26.2D, v4.2D, v30.2D +trn1 v24.2D, v28.2D, v25.2D +sqrdmulh v25.4S, v29.4S, v15.4S +mul v29.4S, v29.4S,v22.4S +mla v29.4S, v25.4S, v31.s[0] +sub v25.4s, v26.4s, v29.4s +add v26.4s, v26.4s, v29.4s +sqrdmulh v29.4S, v27.4S, v15.4S +mul v27.4S, v27.4S,v22.4S +mla v27.4S, v29.4S, v31.s[0] +sub v29.4s, v24.4s, v27.4s +add v24.4s, v24.4s, v27.4s +sqrdmulh v27.4S, v24.4S, v10.4S +mul v24.4S, v24.4S,v21.4S +mla v24.4S, v27.4S, v31.s[0] +sub v27.4s, v26.4s, v24.4s +add v26.4s, v26.4s, v24.4s +sqrdmulh v24.4S, v29.4S, v12.4S +mul v29.4S, v29.4S,v18.4S +mla v29.4S, v24.4S, v31.s[0] +sub v24.4s, v25.4s, v29.4s +add v25.4s, v25.4s, v29.4s +str q26, [x0, #640] +str q27, [x0, #656] +str q25, [x0, #672] +str q24, [x0, #688] +ldr q24, [x17, #+1536] +ldr 
q25, [x17, #+1552] +ldr q27, [x17, #+1568] +ldr q26, [x17, #+1584] +ldr q29, [x17, #+1600] +ldr q28, [x17, #+1616] +ldr q30, [x17, #+1632] +ldr q4, [x17, #+1648] +ldr q12, [x0, #736] +ldr q18, [x0, #752] +ldr q10, [x0, #704] +ldr q21, [x0, #720] +sqrdmulh v15.4S, v12.4S, v25.s[0] +mul v12.4S, v12.4S,v24.s[0] +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v18.4S, v25.s[0] +mul v18.4S, v18.4S,v24.s[0] +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v18.4s +add v21.4s, v21.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v25.s[1] +mul v21.4S, v21.4S,v24.s[1] +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v12.4S, v25.s[2] +mul v12.4S, v12.4S,v24.s[2] +mla v12.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v12.4s +add v15.4s, v15.4s, v12.4s +trn1 v12.4S, v10.4S, v18.4S +trn2 v22.4S, v10.4S, v18.4S +trn1 v3.4S, v15.4S, v21.4S +trn2 v1.4S, v15.4S, v21.4S +trn2 v15.2D, v12.2D, v3.2D +trn2 v21.2D, v22.2D, v1.2D +trn1 v10.2D, v12.2D, v3.2D +trn1 v18.2D, v22.2D, v1.2D +sqrdmulh v1.4S, v15.4S, v26.4S +mul v15.4S, v15.4S,v27.4S +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v10.4s, v15.4s +add v10.4s, v10.4s, v15.4s +sqrdmulh v15.4S, v21.4S, v26.4S +mul v21.4S, v21.4S,v27.4S +mla v21.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v18.4S, v28.4S +mul v18.4S, v18.4S,v29.4S +mla v18.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v18.4s +add v10.4s, v10.4s, v18.4s +sqrdmulh v18.4S, v15.4S, v4.4S +mul v15.4S, v15.4S,v30.4S +mla v15.4S, v18.4S, v31.s[0] +sub v18.4s, v1.4s, v15.4s +add v1.4s, v1.4s, v15.4s +str q10, [x0, #704] +str q21, [x0, #720] +str q1, [x0, #736] +str q18, [x0, #752] +ldr q18, [x17, #+1664] +ldr q1, [x17, #+1680] +ldr q21, [x17, #+1696] +ldr q10, [x17, #+1712] +ldr q15, [x17, #+1728] +ldr q22, [x17, #+1744] +ldr q3, [x17, #+1760] +ldr q12, [x17, #+1776] +ldr q4, [x0, #800] +ldr q30, [x0, #816] +ldr q28, [x0, #768] +ldr q29, [x0, 
#784] +sqrdmulh v26.4S, v4.4S, v1.s[0] +mul v4.4S, v4.4S,v18.s[0] +mla v4.4S, v26.4S, v31.s[0] +sub v26.4s, v28.4s, v4.4s +add v28.4s, v28.4s, v4.4s +sqrdmulh v4.4S, v30.4S, v1.s[0] +mul v30.4S, v30.4S,v18.s[0] +mla v30.4S, v4.4S, v31.s[0] +sub v4.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +sqrdmulh v30.4S, v29.4S, v1.s[1] +mul v29.4S, v29.4S,v18.s[1] +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v28.4s, v29.4s +add v28.4s, v28.4s, v29.4s +sqrdmulh v29.4S, v4.4S, v1.s[2] +mul v4.4S, v4.4S,v18.s[2] +mla v4.4S, v29.4S, v31.s[0] +sub v29.4s, v26.4s, v4.4s +add v26.4s, v26.4s, v4.4s +trn1 v4.4S, v28.4S, v30.4S +trn2 v27.4S, v28.4S, v30.4S +trn1 v25.4S, v26.4S, v29.4S +trn2 v24.4S, v26.4S, v29.4S +trn2 v26.2D, v4.2D, v25.2D +trn2 v29.2D, v27.2D, v24.2D +trn1 v28.2D, v4.2D, v25.2D +trn1 v30.2D, v27.2D, v24.2D +sqrdmulh v24.4S, v26.4S, v10.4S +mul v26.4S, v26.4S,v21.4S +mla v26.4S, v24.4S, v31.s[0] +sub v24.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sqrdmulh v26.4S, v29.4S, v10.4S +mul v29.4S, v29.4S,v21.4S +mla v29.4S, v26.4S, v31.s[0] +sub v26.4s, v30.4s, v29.4s +add v30.4s, v30.4s, v29.4s +sqrdmulh v29.4S, v30.4S, v22.4S +mul v30.4S, v30.4S,v15.4S +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v28.4s, v30.4s +add v28.4s, v28.4s, v30.4s +sqrdmulh v30.4S, v26.4S, v12.4S +mul v26.4S, v26.4S,v3.4S +mla v26.4S, v30.4S, v31.s[0] +sub v30.4s, v24.4s, v26.4s +add v24.4s, v24.4s, v26.4s +str q28, [x0, #768] +str q29, [x0, #784] +str q24, [x0, #800] +str q30, [x0, #816] +ldr q30, [x17, #+1792] +ldr q24, [x17, #+1808] +ldr q29, [x17, #+1824] +ldr q28, [x17, #+1840] +ldr q26, [x17, #+1856] +ldr q27, [x17, #+1872] +ldr q25, [x17, #+1888] +ldr q4, [x17, #+1904] +ldr q12, [x0, #864] +ldr q3, [x0, #880] +ldr q22, [x0, #832] +ldr q15, [x0, #848] +sqrdmulh v10.4S, v12.4S, v24.s[0] +mul v12.4S, v12.4S,v30.s[0] +mla v12.4S, v10.4S, v31.s[0] +sub v10.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v24.s[0] +mul v3.4S, v3.4S,v30.s[0] +mla v3.4S, v12.4S, v31.s[0] 
+sub v12.4s, v15.4s, v3.4s +add v15.4s, v15.4s, v3.4s +sqrdmulh v3.4S, v15.4S, v24.s[1] +mul v15.4S, v15.4S,v30.s[1] +mla v15.4S, v3.4S, v31.s[0] +sub v3.4s, v22.4s, v15.4s +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v12.4S, v24.s[2] +mul v12.4S, v12.4S,v30.s[2] +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +trn1 v12.4S, v22.4S, v3.4S +trn2 v21.4S, v22.4S, v3.4S +trn1 v1.4S, v10.4S, v15.4S +trn2 v18.4S, v10.4S, v15.4S +trn2 v10.2D, v12.2D, v1.2D +trn2 v15.2D, v21.2D, v18.2D +trn1 v22.2D, v12.2D, v1.2D +trn1 v3.2D, v21.2D, v18.2D +sqrdmulh v18.4S, v10.4S, v28.4S +mul v10.4S, v10.4S,v29.4S +mla v10.4S, v18.4S, v31.s[0] +sub v18.4s, v22.4s, v10.4s +add v22.4s, v22.4s, v10.4s +sqrdmulh v10.4S, v15.4S, v28.4S +mul v15.4S, v15.4S,v29.4S +mla v15.4S, v10.4S, v31.s[0] +sub v10.4s, v3.4s, v15.4s +add v3.4s, v3.4s, v15.4s +sqrdmulh v15.4S, v3.4S, v27.4S +mul v3.4S, v3.4S,v26.4S +mla v3.4S, v15.4S, v31.s[0] +sub v15.4s, v22.4s, v3.4s +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v4.4S +mul v10.4S, v10.4S,v25.4S +mla v10.4S, v3.4S, v31.s[0] +sub v3.4s, v18.4s, v10.4s +add v18.4s, v18.4s, v10.4s +str q22, [x0, #832] +str q15, [x0, #848] +str q18, [x0, #864] +str q3, [x0, #880] +ldr q3, [x17, #+1920] +ldr q18, [x17, #+1936] +ldr q15, [x17, #+1952] +ldr q22, [x17, #+1968] +ldr q10, [x17, #+1984] +ldr q21, [x17, #+2000] +ldr q1, [x17, #+2016] +ldr q12, [x17, #+2032] +ldr q4, [x0, #928] +ldr q25, [x0, #944] +ldr q27, [x0, #896] +ldr q26, [x0, #912] +sqrdmulh v28.4S, v4.4S, v18.s[0] +mul v4.4S, v4.4S,v3.s[0] +mla v4.4S, v28.4S, v31.s[0] +sub v28.4s, v27.4s, v4.4s +add v27.4s, v27.4s, v4.4s +sqrdmulh v4.4S, v25.4S, v18.s[0] +mul v25.4S, v25.4S,v3.s[0] +mla v25.4S, v4.4S, v31.s[0] +sub v4.4s, v26.4s, v25.4s +add v26.4s, v26.4s, v25.4s +sqrdmulh v25.4S, v26.4S, v18.s[1] +mul v26.4S, v26.4S,v3.s[1] +mla v26.4S, v25.4S, v31.s[0] +sub v25.4s, v27.4s, v26.4s +add v27.4s, v27.4s, v26.4s +sqrdmulh v26.4S, v4.4S, v18.s[2] +mul v4.4S, 
v4.4S,v3.s[2] +mla v4.4S, v26.4S, v31.s[0] +sub v26.4s, v28.4s, v4.4s +add v28.4s, v28.4s, v4.4s +trn1 v4.4S, v27.4S, v25.4S +trn2 v29.4S, v27.4S, v25.4S +trn1 v24.4S, v28.4S, v26.4S +trn2 v30.4S, v28.4S, v26.4S +trn2 v28.2D, v4.2D, v24.2D +trn2 v26.2D, v29.2D, v30.2D +trn1 v27.2D, v4.2D, v24.2D +trn1 v25.2D, v29.2D, v30.2D +sqrdmulh v30.4S, v28.4S, v22.4S +mul v28.4S, v28.4S,v15.4S +mla v28.4S, v30.4S, v31.s[0] +sub v30.4s, v27.4s, v28.4s +add v27.4s, v27.4s, v28.4s +sqrdmulh v28.4S, v26.4S, v22.4S +mul v26.4S, v26.4S,v15.4S +mla v26.4S, v28.4S, v31.s[0] +sub v28.4s, v25.4s, v26.4s +add v25.4s, v25.4s, v26.4s +sqrdmulh v26.4S, v25.4S, v21.4S +mul v25.4S, v25.4S,v10.4S +mla v25.4S, v26.4S, v31.s[0] +sub v26.4s, v27.4s, v25.4s +add v27.4s, v27.4s, v25.4s +sqrdmulh v25.4S, v28.4S, v12.4S +mul v28.4S, v28.4S,v1.4S +mla v28.4S, v25.4S, v31.s[0] +sub v25.4s, v30.4s, v28.4s +add v30.4s, v30.4s, v28.4s +str q27, [x0, #896] +str q26, [x0, #912] +str q30, [x0, #928] +str q25, [x0, #944] +ldr q25, [x17, #+2048] +ldr q30, [x17, #+2064] +ldr q26, [x17, #+2080] +ldr q27, [x17, #+2096] +ldr q28, [x17, #+2112] +ldr q29, [x17, #+2128] +ldr q24, [x17, #+2144] +ldr q4, [x17, #+2160] +ldr q12, [x0, #992] +ldr q1, [x0, #1008] +ldr q21, [x0, #960] +ldr q10, [x0, #976] +sqrdmulh v22.4S, v12.4S, v30.s[0] +mul v12.4S, v12.4S,v25.s[0] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v1.4S, v30.s[0] +mul v1.4S, v1.4S,v25.s[0] +mla v1.4S, v12.4S, v31.s[0] +sub v12.4s, v10.4s, v1.4s +add v10.4s, v10.4s, v1.4s +sqrdmulh v1.4S, v10.4S, v30.s[1] +mul v10.4S, v10.4S,v25.s[1] +mla v10.4S, v1.4S, v31.s[0] +sub v1.4s, v21.4s, v10.4s +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v12.4S, v30.s[2] +mul v12.4S, v12.4S,v25.s[2] +mla v12.4S, v10.4S, v31.s[0] +sub v10.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +trn1 v12.4S, v21.4S, v1.4S +trn2 v15.4S, v21.4S, v1.4S +trn1 v18.4S, v22.4S, v10.4S +trn2 v3.4S, v22.4S, v10.4S +trn2 v22.2D, v12.2D, 
v18.2D +trn2 v10.2D, v15.2D, v3.2D +trn1 v21.2D, v12.2D, v18.2D +trn1 v1.2D, v15.2D, v3.2D +sqrdmulh v3.4S, v22.4S, v27.4S +mul v22.4S, v22.4S,v26.4S +mla v22.4S, v3.4S, v31.s[0] +sub v3.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v10.4S, v27.4S +mul v10.4S, v10.4S,v26.4S +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v10.4s +add v1.4s, v1.4s, v10.4s +sqrdmulh v10.4S, v1.4S, v29.4S +mul v1.4S, v1.4S,v28.4S +mla v1.4S, v10.4S, v31.s[0] +sub v10.4s, v21.4s, v1.4s +add v21.4s, v21.4s, v1.4s +sqrdmulh v1.4S, v22.4S, v4.4S +mul v22.4S, v22.4S,v24.4S +mla v22.4S, v1.4S, v31.s[0] +sub v1.4s, v3.4s, v22.4s +add v3.4s, v3.4s, v22.4s +str q21, [x0, #960] +str q10, [x0, #976] +str q3, [x0, #992] +str q1, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2456 +// Instruction count: 2452 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_11_0.s b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_11_0.s new file mode 100644 index 0000000..4ee80f8 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_11_0.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the 
following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 
// Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 
427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 
+.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 
1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, 
block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 
+.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 
// Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 +.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 
+.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 
313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 
// Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_11_0 +.global _ntt_u32_full_neon_asm_var_4_4_11_0 +ntt_u32_full_neon_asm_var_4_4_11_0: +_ntt_u32_full_neon_asm_var_4_4_11_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x0, #928] +ldr q29, [x17, #+0] +ldr q28, [x17, #+16] +sqrdmulh v27.4S, v30.4S, v28.s[0] +mul v30.4S, v30.4S,v29.s[0] +ldr q26, [x0, #992] +sqrdmulh v25.4S, v26.4S, v28.s[0] 
+mul v26.4S, v26.4S,v29.s[0] +ldr q24, [x0, #800] +sqrdmulh v23.4S, v24.4S, v28.s[0] +mul v24.4S, v24.4S,v29.s[0] +ldr q22, [x0, #864] +sqrdmulh v21.4S, v22.4S, v28.s[0] +mul v22.4S, v22.4S,v29.s[0] +ldr q20, [x0, #544] +mla v30.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v20.4S, v28.s[0] +ldr q19, [x0, #608] +mla v26.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v19.4S, v28.s[0] +ldr q18, [x0, #672] +mla v24.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v18.4S, v28.s[0] +ldr q17, [x0, #736] +mla v22.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v17.4S, v28.s[0] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +mul v20.4S, v20.4S,v29.s[0] +sub v2.4s, v16.4s, v30.4s +mul v19.4S, v19.4S,v29.s[0] +add v16.4s, v16.4s, v30.4s +ldr q30, [x0, #288] +ldr q1, [x0, #352] +mla v20.4S, v27.4S, v31.s[0] +sub v27.4s, v3.4s, v26.4s +mla v19.4S, v25.4S, v31.s[0] +add v3.4s, v3.4s, v26.4s +ldr q26, [x0, #32] +ldr q25, [x0, #96] +mul v18.4S, v18.4S,v29.s[0] +sub v0.4s, v30.4s, v24.4s +mul v17.4S, v17.4S,v29.s[0] +add v30.4s, v30.4s, v24.4s +ldr q24, [x0, #160] +ldr q15, [x0, #224] +mla v18.4S, v23.4S, v31.s[0] +sub v23.4s, v1.4s, v22.4s +mla v17.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v16.4S, v28.s[1] +mul v16.4S, v16.4S,v29.s[1] +sqrdmulh v21.4S, v3.4S, v28.s[1] +sub v14.4s, v26.4s, v20.4s +mul v3.4S, v3.4S,v29.s[1] +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v30.4S, v28.s[1] +sub v13.4s, v25.4s, v19.4s +mul v30.4S, v30.4S,v29.s[1] +add v25.4s, v25.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v28.s[1] +sub v12.4s, v24.4s, v18.4s +mul v1.4S, v1.4S,v29.s[1] +add v24.4s, v24.4s, v18.4s +mla v16.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v17.4s +sqrdmulh v18.4S, v2.4S, v28.s[2] +add v15.4s, v15.4s, v17.4s +mla v3.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v27.4S, v28.s[2] +mla v30.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v0.4S, v28.s[2] +mla v1.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v23.4S, v28.s[2] +ldr q17, [x17, #+32] +ldr q11, [x17, #+48] +mul v2.4S, v2.4S,v29.s[2] +sub v10.4s, v24.4s, v16.4s +mul 
v27.4S, v27.4S,v29.s[2] +add v24.4s, v24.4s, v16.4s +mla v2.4S, v18.4S, v31.s[0] +sub v18.4s, v15.4s, v3.4s +mla v27.4S, v21.4S, v31.s[0] +add v15.4s, v15.4s, v3.4s +mul v0.4S, v0.4S,v29.s[2] +sub v3.4s, v26.4s, v30.4s +mul v23.4S, v23.4S,v29.s[2] +add v26.4s, v26.4s, v30.4s +mla v0.4S, v20.4S, v31.s[0] +sub v20.4s, v25.4s, v1.4s +mla v23.4S, v19.4S, v31.s[0] +add v25.4s, v25.4s, v1.4s +sqrdmulh v1.4S, v10.4S, v11.s[1] +mul v10.4S, v10.4S,v17.s[1] +sqrdmulh v19.4S, v18.4S, v11.s[1] +sub v30.4s, v12.4s, v2.4s +mul v18.4S, v18.4S,v17.s[1] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v24.4S, v11.s[0] +sub v21.4s, v22.4s, v27.4s +mul v24.4S, v24.4S,v17.s[0] +add v22.4s, v22.4s, v27.4s +sqrdmulh v27.4S, v15.4S, v11.s[0] +sub v16.4s, v14.4s, v0.4s +mul v15.4S, v15.4S,v17.s[0] +add v14.4s, v14.4s, v0.4s +ldr q0, [x17, #+64] +ldr q9, [x17, #+80] +mla v10.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v23.4s +sqrdmulh v8.4S, v12.4S, v11.s[2] +add v13.4s, v13.4s, v23.4s +mla v18.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v22.4S, v11.s[2] +mla v24.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v30.4S, v11.s[3] +mla v15.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v21.4S, v11.s[3] +ldr q23, [x17, #+96] +ldr q7, [x17, #+112] +mul v12.4S, v12.4S,v17.s[2] +sub v6.4s, v3.4s, v10.4s +mul v22.4S, v22.4S,v17.s[2] +add v3.4s, v3.4s, v10.4s +mla v12.4S, v8.4S, v31.s[0] +sub v8.4s, v20.4s, v18.4s +mla v22.4S, v19.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +mul v30.4S, v30.4S,v17.s[3] +sub v18.4s, v26.4s, v24.4s +mul v21.4S, v21.4S,v17.s[3] +add v26.4s, v26.4s, v24.4s +mla v30.4S, v2.4S, v31.s[0] +sub v2.4s, v25.4s, v15.4s +mla v21.4S, v27.4S, v31.s[0] +add v25.4s, v25.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v9.s[2] +mul v20.4S, v20.4S,v0.s[2] +sqrdmulh v27.4S, v8.4S, v9.s[3] +sub v24.4s, v14.4s, v12.4s +mul v8.4S, v8.4S,v0.s[3] +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v9.s[1] +sub v19.4s, v13.4s, v22.4s +mul v2.4S, v2.4S,v0.s[1] +add v13.4s, v13.4s, v22.4s +sqrdmulh v22.4S, v25.4S, v9.s[0] +sub v10.4s, 
v16.4s, v30.4s +mul v25.4S, v25.4S,v0.s[0] +add v16.4s, v16.4s, v30.4s +mla v20.4S, v15.4S, v31.s[0] +sub v15.4s, v1.4s, v21.4s +sqrdmulh v30.4S, v13.4S, v7.s[0] +add v1.4s, v1.4s, v21.4s +mla v8.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v19.4S, v7.s[1] +mla v2.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v1.4S, v7.s[2] +mla v25.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v15.4S, v7.s[3] +mul v13.4S, v13.4S,v23.s[0] +sub v21.4s, v3.4s, v20.4s +str q21, [x0, #352] +mul v19.4S, v19.4S,v23.s[1] +add v3.4s, v3.4s, v20.4s +str q3, [x0, #288] +mla v13.4S, v30.4S, v31.s[0] +sub v30.4s, v6.4s, v8.4s +str q30, [x0, #480] +mla v19.4S, v27.4S, v31.s[0] +add v6.4s, v6.4s, v8.4s +str q6, [x0, #416] +mul v1.4S, v1.4S,v23.s[2] +sub v6.4s, v18.4s, v2.4s +str q6, [x0, #224] +mul v15.4S, v15.4S,v23.s[3] +add v18.4s, v18.4s, v2.4s +str q18, [x0, #160] +mla v1.4S, v12.4S, v31.s[0] +sub v12.4s, v26.4s, v25.4s +str q12, [x0, #96] +mla v15.4S, v22.4S, v31.s[0] +add v26.4s, v26.4s, v25.4s +str q26, [x0, #32] +ldr q26, [x0, #944] +sqrdmulh v25.4S, v26.4S, v28.s[0] +mul v26.4S, v26.4S,v29.s[0] +ldr q22, [x0, #1008] +sqrdmulh v12.4S, v22.4S, v28.s[0] +sub v18.4s, v14.4s, v13.4s +str q18, [x0, #608] +mul v22.4S, v22.4S,v29.s[0] +add v14.4s, v14.4s, v13.4s +str q14, [x0, #544] +ldr q14, [x0, #816] +sqrdmulh v13.4S, v14.4S, v28.s[0] +sub v18.4s, v24.4s, v19.4s +str q18, [x0, #736] +mul v14.4S, v14.4S,v29.s[0] +add v24.4s, v24.4s, v19.4s +str q24, [x0, #672] +ldr q24, [x0, #880] +sqrdmulh v19.4S, v24.4S, v28.s[0] +sub v18.4s, v16.4s, v1.4s +str q18, [x0, #864] +mul v24.4S, v24.4S,v29.s[0] +add v16.4s, v16.4s, v1.4s +str q16, [x0, #800] +ldr q16, [x0, #560] +mla v26.4S, v25.4S, v31.s[0] +sub v25.4s, v10.4s, v15.4s +str q25, [x0, #992] +sqrdmulh v25.4S, v16.4S, v28.s[0] +add v10.4s, v10.4s, v15.4s +str q10, [x0, #928] +ldr q10, [x0, #624] +mla v22.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v10.4S, v28.s[0] +ldr q15, [x0, #688] +mla v14.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v15.4S, v28.s[0] +ldr q1, [x0, #752] +mla 
v24.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v1.4S, v28.s[0] +ldr q18, [x0, #432] +ldr q2, [x0, #496] +mul v16.4S, v16.4S,v29.s[0] +sub v6.4s, v18.4s, v26.4s +mul v10.4S, v10.4S,v29.s[0] +add v18.4s, v18.4s, v26.4s +ldr q26, [x0, #304] +ldr q8, [x0, #368] +mla v16.4S, v25.4S, v31.s[0] +sub v25.4s, v2.4s, v22.4s +mla v10.4S, v12.4S, v31.s[0] +add v2.4s, v2.4s, v22.4s +ldr q22, [x0, #48] +ldr q12, [x0, #112] +mul v15.4S, v15.4S,v29.s[0] +sub v27.4s, v26.4s, v14.4s +mul v1.4S, v1.4S,v29.s[0] +add v26.4s, v26.4s, v14.4s +ldr q14, [x0, #176] +ldr q30, [x0, #240] +mla v15.4S, v13.4S, v31.s[0] +sub v13.4s, v8.4s, v24.4s +mla v1.4S, v19.4S, v31.s[0] +add v8.4s, v8.4s, v24.4s +sqrdmulh v24.4S, v18.4S, v28.s[1] +mul v18.4S, v18.4S,v29.s[1] +sqrdmulh v19.4S, v2.4S, v28.s[1] +sub v3.4s, v22.4s, v16.4s +mul v2.4S, v2.4S,v29.s[1] +add v22.4s, v22.4s, v16.4s +sqrdmulh v16.4S, v26.4S, v28.s[1] +sub v20.4s, v12.4s, v10.4s +mul v26.4S, v26.4S,v29.s[1] +add v12.4s, v12.4s, v10.4s +sqrdmulh v10.4S, v8.4S, v28.s[1] +sub v21.4s, v14.4s, v15.4s +mul v8.4S, v8.4S,v29.s[1] +add v14.4s, v14.4s, v15.4s +mla v18.4S, v24.4S, v31.s[0] +sub v24.4s, v30.4s, v1.4s +sqrdmulh v15.4S, v6.4S, v28.s[2] +add v30.4s, v30.4s, v1.4s +mla v2.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v25.4S, v28.s[2] +mla v26.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v27.4S, v28.s[2] +mla v8.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v13.4S, v28.s[2] +mul v6.4S, v6.4S,v29.s[2] +sub v1.4s, v14.4s, v18.4s +mul v25.4S, v25.4S,v29.s[2] +add v14.4s, v14.4s, v18.4s +mla v6.4S, v15.4S, v31.s[0] +sub v15.4s, v30.4s, v2.4s +mla v25.4S, v19.4S, v31.s[0] +add v30.4s, v30.4s, v2.4s +mul v27.4S, v27.4S,v29.s[2] +sub v2.4s, v22.4s, v26.4s +mul v13.4S, v13.4S,v29.s[2] +add v22.4s, v22.4s, v26.4s +mla v27.4S, v16.4S, v31.s[0] +sub v16.4s, v12.4s, v8.4s +mla v13.4S, v10.4S, v31.s[0] +add v12.4s, v12.4s, v8.4s +sqrdmulh v8.4S, v1.4S, v11.s[1] +mul v1.4S, v1.4S,v17.s[1] +sqrdmulh v10.4S, v15.4S, v11.s[1] +sub v26.4s, v21.4s, v6.4s +mul v15.4S, 
v15.4S,v17.s[1] +add v21.4s, v21.4s, v6.4s +sqrdmulh v6.4S, v14.4S, v11.s[0] +sub v19.4s, v24.4s, v25.4s +mul v14.4S, v14.4S,v17.s[0] +add v24.4s, v24.4s, v25.4s +sqrdmulh v25.4S, v30.4S, v11.s[0] +sub v18.4s, v3.4s, v27.4s +mul v30.4S, v30.4S,v17.s[0] +add v3.4s, v3.4s, v27.4s +mla v1.4S, v8.4S, v31.s[0] +sub v8.4s, v20.4s, v13.4s +sqrdmulh v27.4S, v21.4S, v11.s[2] +add v20.4s, v20.4s, v13.4s +mla v15.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v24.4S, v11.s[2] +mla v14.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v26.4S, v11.s[3] +mla v30.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v19.4S, v11.s[3] +mul v21.4S, v21.4S,v17.s[2] +sub v13.4s, v2.4s, v1.4s +mul v24.4S, v24.4S,v17.s[2] +add v2.4s, v2.4s, v1.4s +mla v21.4S, v27.4S, v31.s[0] +sub v27.4s, v16.4s, v15.4s +mla v24.4S, v10.4S, v31.s[0] +add v16.4s, v16.4s, v15.4s +mul v26.4S, v26.4S,v17.s[3] +sub v15.4s, v22.4s, v14.4s +mul v19.4S, v19.4S,v17.s[3] +add v22.4s, v22.4s, v14.4s +mla v26.4S, v6.4S, v31.s[0] +sub v6.4s, v12.4s, v30.4s +mla v19.4S, v25.4S, v31.s[0] +add v12.4s, v12.4s, v30.4s +sqrdmulh v30.4S, v16.4S, v9.s[2] +mul v16.4S, v16.4S,v0.s[2] +sqrdmulh v25.4S, v27.4S, v9.s[3] +sub v14.4s, v3.4s, v21.4s +mul v27.4S, v27.4S,v0.s[3] +add v3.4s, v3.4s, v21.4s +sqrdmulh v21.4S, v6.4S, v9.s[1] +sub v10.4s, v20.4s, v24.4s +mul v6.4S, v6.4S,v0.s[1] +add v20.4s, v20.4s, v24.4s +sqrdmulh v24.4S, v12.4S, v9.s[0] +sub v1.4s, v18.4s, v26.4s +mul v12.4S, v12.4S,v0.s[0] +add v18.4s, v18.4s, v26.4s +mla v16.4S, v30.4S, v31.s[0] +sub v30.4s, v8.4s, v19.4s +sqrdmulh v26.4S, v20.4S, v7.s[0] +add v8.4s, v8.4s, v19.4s +mla v27.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v10.4S, v7.s[1] +mla v6.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v8.4S, v7.s[2] +mla v12.4S, v24.4S, v31.s[0] +sqrdmulh v24.4S, v30.4S, v7.s[3] +mul v20.4S, v20.4S,v23.s[0] +sub v19.4s, v2.4s, v16.4s +str q19, [x0, #368] +mul v10.4S, v10.4S,v23.s[1] +add v2.4s, v2.4s, v16.4s +str q2, [x0, #304] +mla v20.4S, v26.4S, v31.s[0] +sub v26.4s, v13.4s, v27.4s +str q26, [x0, #496] +mla 
v10.4S, v25.4S, v31.s[0] +add v13.4s, v13.4s, v27.4s +str q13, [x0, #432] +mul v8.4S, v8.4S,v23.s[2] +sub v13.4s, v15.4s, v6.4s +str q13, [x0, #240] +mul v30.4S, v30.4S,v23.s[3] +add v15.4s, v15.4s, v6.4s +str q15, [x0, #176] +mla v8.4S, v21.4S, v31.s[0] +sub v21.4s, v22.4s, v12.4s +str q21, [x0, #112] +mla v30.4S, v24.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +str q22, [x0, #48] +ldr q22, [x0, #896] +sqrdmulh v12.4S, v22.4S, v28.s[0] +mul v22.4S, v22.4S,v29.s[0] +ldr q24, [x0, #960] +sqrdmulh v21.4S, v24.4S, v28.s[0] +sub v15.4s, v3.4s, v20.4s +str q15, [x0, #624] +mul v24.4S, v24.4S,v29.s[0] +add v3.4s, v3.4s, v20.4s +str q3, [x0, #560] +ldr q3, [x0, #768] +sqrdmulh v20.4S, v3.4S, v28.s[0] +sub v15.4s, v14.4s, v10.4s +str q15, [x0, #752] +mul v3.4S, v3.4S,v29.s[0] +add v14.4s, v14.4s, v10.4s +str q14, [x0, #688] +ldr q14, [x0, #832] +sqrdmulh v10.4S, v14.4S, v28.s[0] +sub v15.4s, v18.4s, v8.4s +str q15, [x0, #880] +mul v14.4S, v14.4S,v29.s[0] +add v18.4s, v18.4s, v8.4s +str q18, [x0, #816] +ldr q18, [x0, #512] +mla v22.4S, v12.4S, v31.s[0] +sub v12.4s, v1.4s, v30.4s +str q12, [x0, #1008] +sqrdmulh v12.4S, v18.4S, v28.s[0] +add v1.4s, v1.4s, v30.4s +str q1, [x0, #944] +ldr q1, [x0, #576] +mla v24.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v1.4S, v28.s[0] +ldr q30, [x0, #640] +mla v3.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v30.4S, v28.s[0] +ldr q8, [x0, #704] +mla v14.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v8.4S, v28.s[0] +ldr q15, [x0, #384] +ldr q6, [x0, #448] +mul v18.4S, v18.4S,v29.s[0] +sub v13.4s, v15.4s, v22.4s +mul v1.4S, v1.4S,v29.s[0] +add v15.4s, v15.4s, v22.4s +ldr q22, [x0, #256] +ldr q27, [x0, #320] +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v6.4s, v24.4s +mla v1.4S, v21.4S, v31.s[0] +add v6.4s, v6.4s, v24.4s +ldr q24, [x0, #0] +ldr q21, [x0, #64] +mul v30.4S, v30.4S,v29.s[0] +sub v25.4s, v22.4s, v3.4s +mul v8.4S, v8.4S,v29.s[0] +add v22.4s, v22.4s, v3.4s +ldr q3, [x0, #128] +ldr q26, [x0, #192] +mla v30.4S, v20.4S, v31.s[0] +sub v20.4s, v27.4s, v14.4s +mla 
v8.4S, v10.4S, v31.s[0] +add v27.4s, v27.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v28.s[1] +mul v15.4S, v15.4S,v29.s[1] +sqrdmulh v10.4S, v6.4S, v28.s[1] +sub v2.4s, v24.4s, v18.4s +mul v6.4S, v6.4S,v29.s[1] +add v24.4s, v24.4s, v18.4s +sqrdmulh v18.4S, v22.4S, v28.s[1] +sub v16.4s, v21.4s, v1.4s +mul v22.4S, v22.4S,v29.s[1] +add v21.4s, v21.4s, v1.4s +sqrdmulh v1.4S, v27.4S, v28.s[1] +sub v19.4s, v3.4s, v30.4s +mul v27.4S, v27.4S,v29.s[1] +add v3.4s, v3.4s, v30.4s +mla v15.4S, v14.4S, v31.s[0] +sub v14.4s, v26.4s, v8.4s +sqrdmulh v30.4S, v13.4S, v28.s[2] +add v26.4s, v26.4s, v8.4s +mla v6.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v12.4S, v28.s[2] +mla v22.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v25.4S, v28.s[2] +mla v27.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v20.4S, v28.s[2] +mul v13.4S, v13.4S,v29.s[2] +sub v8.4s, v3.4s, v15.4s +mul v12.4S, v12.4S,v29.s[2] +add v3.4s, v3.4s, v15.4s +mla v13.4S, v30.4S, v31.s[0] +sub v30.4s, v26.4s, v6.4s +mla v12.4S, v10.4S, v31.s[0] +add v26.4s, v26.4s, v6.4s +mul v25.4S, v25.4S,v29.s[2] +sub v6.4s, v24.4s, v22.4s +mul v20.4S, v20.4S,v29.s[2] +add v24.4s, v24.4s, v22.4s +mla v25.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v27.4s +mla v20.4S, v1.4S, v31.s[0] +add v21.4s, v21.4s, v27.4s +sqrdmulh v27.4S, v8.4S, v11.s[1] +mul v8.4S, v8.4S,v17.s[1] +sqrdmulh v1.4S, v30.4S, v11.s[1] +sub v22.4s, v19.4s, v13.4s +mul v30.4S, v30.4S,v17.s[1] +add v19.4s, v19.4s, v13.4s +sqrdmulh v13.4S, v3.4S, v11.s[0] +sub v10.4s, v14.4s, v12.4s +mul v3.4S, v3.4S,v17.s[0] +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v26.4S, v11.s[0] +sub v15.4s, v2.4s, v25.4s +mul v26.4S, v26.4S,v17.s[0] +add v2.4s, v2.4s, v25.4s +mla v8.4S, v27.4S, v31.s[0] +sub v27.4s, v16.4s, v20.4s +sqrdmulh v25.4S, v19.4S, v11.s[2] +add v16.4s, v16.4s, v20.4s +mla v30.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v14.4S, v11.s[2] +mla v3.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v22.4S, v11.s[3] +mla v26.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v10.4S, v11.s[3] +mul v19.4S, v19.4S,v17.s[2] +sub v20.4s, 
v6.4s, v8.4s +mul v14.4S, v14.4S,v17.s[2] +add v6.4s, v6.4s, v8.4s +mla v19.4S, v25.4S, v31.s[0] +sub v25.4s, v18.4s, v30.4s +mla v14.4S, v1.4S, v31.s[0] +add v18.4s, v18.4s, v30.4s +mul v22.4S, v22.4S,v17.s[3] +sub v30.4s, v24.4s, v3.4s +mul v10.4S, v10.4S,v17.s[3] +add v24.4s, v24.4s, v3.4s +mla v22.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v26.4s +mla v10.4S, v12.4S, v31.s[0] +add v21.4s, v21.4s, v26.4s +sqrdmulh v26.4S, v18.4S, v9.s[2] +mul v18.4S, v18.4S,v0.s[2] +sqrdmulh v12.4S, v25.4S, v9.s[3] +sub v3.4s, v2.4s, v19.4s +mul v25.4S, v25.4S,v0.s[3] +add v2.4s, v2.4s, v19.4s +sqrdmulh v19.4S, v13.4S, v9.s[1] +sub v1.4s, v16.4s, v14.4s +mul v13.4S, v13.4S,v0.s[1] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v9.s[0] +sub v8.4s, v15.4s, v22.4s +mul v21.4S, v21.4S,v0.s[0] +add v15.4s, v15.4s, v22.4s +mla v18.4S, v26.4S, v31.s[0] +sub v26.4s, v27.4s, v10.4s +sqrdmulh v22.4S, v16.4S, v7.s[0] +add v27.4s, v27.4s, v10.4s +mla v25.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v1.4S, v7.s[1] +mla v13.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v27.4S, v7.s[2] +mla v21.4S, v14.4S, v31.s[0] +sqrdmulh v14.4S, v26.4S, v7.s[3] +mul v16.4S, v16.4S,v23.s[0] +sub v10.4s, v6.4s, v18.4s +str q10, [x0, #320] +mul v1.4S, v1.4S,v23.s[1] +add v6.4s, v6.4s, v18.4s +str q6, [x0, #256] +mla v16.4S, v22.4S, v31.s[0] +sub v22.4s, v20.4s, v25.4s +str q22, [x0, #448] +mla v1.4S, v12.4S, v31.s[0] +add v20.4s, v20.4s, v25.4s +str q20, [x0, #384] +mul v27.4S, v27.4S,v23.s[2] +sub v20.4s, v30.4s, v13.4s +str q20, [x0, #192] +mul v26.4S, v26.4S,v23.s[3] +add v30.4s, v30.4s, v13.4s +str q30, [x0, #128] +mla v27.4S, v19.4S, v31.s[0] +sub v19.4s, v24.4s, v21.4s +str q19, [x0, #64] +mla v26.4S, v14.4S, v31.s[0] +add v24.4s, v24.4s, v21.4s +str q24, [x0, #0] +ldr q24, [x0, #912] +sqrdmulh v21.4S, v24.4S, v28.s[0] +mul v24.4S, v24.4S,v29.s[0] +ldr q14, [x0, #976] +sqrdmulh v19.4S, v14.4S, v28.s[0] +sub v30.4s, v2.4s, v16.4s +str q30, [x0, #576] +mul v14.4S, v14.4S,v29.s[0] +add v2.4s, v2.4s, v16.4s +str 
q2, [x0, #512] +ldr q2, [x0, #784] +sqrdmulh v16.4S, v2.4S, v28.s[0] +sub v30.4s, v3.4s, v1.4s +str q30, [x0, #704] +mul v2.4S, v2.4S,v29.s[0] +add v3.4s, v3.4s, v1.4s +str q3, [x0, #640] +ldr q3, [x0, #848] +sqrdmulh v1.4S, v3.4S, v28.s[0] +sub v30.4s, v15.4s, v27.4s +str q30, [x0, #832] +mul v3.4S, v3.4S,v29.s[0] +add v15.4s, v15.4s, v27.4s +str q15, [x0, #768] +ldr q15, [x0, #528] +mla v24.4S, v21.4S, v31.s[0] +sub v21.4s, v8.4s, v26.4s +str q21, [x0, #960] +sqrdmulh v21.4S, v15.4S, v28.s[0] +add v8.4s, v8.4s, v26.4s +str q8, [x0, #896] +ldr q8, [x0, #592] +mla v14.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v8.4S, v28.s[0] +ldr q26, [x0, #656] +mla v2.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v26.4S, v28.s[0] +ldr q27, [x0, #720] +mla v3.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v27.4S, v28.s[0] +ldr q30, [x0, #400] +ldr q13, [x0, #464] +mul v15.4S, v15.4S,v29.s[0] +sub v20.4s, v30.4s, v24.4s +mul v8.4S, v8.4S,v29.s[0] +add v30.4s, v30.4s, v24.4s +ldr q24, [x0, #272] +ldr q25, [x0, #336] +mla v15.4S, v21.4S, v31.s[0] +sub v21.4s, v13.4s, v14.4s +mla v8.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v14.4s +ldr q14, [x0, #16] +ldr q19, [x0, #80] +mul v26.4S, v26.4S,v29.s[0] +sub v12.4s, v24.4s, v2.4s +mul v27.4S, v27.4S,v29.s[0] +add v24.4s, v24.4s, v2.4s +ldr q2, [x0, #144] +ldr q22, [x0, #208] +mla v26.4S, v16.4S, v31.s[0] +sub v16.4s, v25.4s, v3.4s +mla v27.4S, v1.4S, v31.s[0] +add v25.4s, v25.4s, v3.4s +sqrdmulh v3.4S, v30.4S, v28.s[1] +mul v30.4S, v30.4S,v29.s[1] +sqrdmulh v1.4S, v13.4S, v28.s[1] +sub v6.4s, v14.4s, v15.4s +mul v13.4S, v13.4S,v29.s[1] +add v14.4s, v14.4s, v15.4s +sqrdmulh v15.4S, v24.4S, v28.s[1] +sub v18.4s, v19.4s, v8.4s +mul v24.4S, v24.4S,v29.s[1] +add v19.4s, v19.4s, v8.4s +sqrdmulh v8.4S, v25.4S, v28.s[1] +sub v10.4s, v2.4s, v26.4s +mul v25.4S, v25.4S,v29.s[1] +add v2.4s, v2.4s, v26.4s +mla v30.4S, v3.4S, v31.s[0] +sub v3.4s, v22.4s, v27.4s +sqrdmulh v26.4S, v20.4S, v28.s[2] +add v22.4s, v22.4s, v27.4s +mla v13.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, 
v21.4S, v28.s[2] +mla v24.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v12.4S, v28.s[2] +mla v25.4S, v8.4S, v31.s[0] +sqrdmulh v8.4S, v16.4S, v28.s[2] +mul v20.4S, v20.4S,v29.s[2] +sub v27.4s, v2.4s, v30.4s +mul v21.4S, v21.4S,v29.s[2] +add v2.4s, v2.4s, v30.4s +mla v20.4S, v26.4S, v31.s[0] +sub v26.4s, v22.4s, v13.4s +mla v21.4S, v1.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +mul v12.4S, v12.4S,v29.s[2] +sub v13.4s, v14.4s, v24.4s +mul v16.4S, v16.4S,v29.s[2] +add v14.4s, v14.4s, v24.4s +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v19.4s, v25.4s +mla v16.4S, v8.4S, v31.s[0] +add v19.4s, v19.4s, v25.4s +sqrdmulh v28.4S, v27.4S, v11.s[1] +mul v27.4S, v27.4S,v17.s[1] +sqrdmulh v29.4S, v26.4S, v11.s[1] +sub v25.4s, v10.4s, v20.4s +mul v26.4S, v26.4S,v17.s[1] +add v10.4s, v10.4s, v20.4s +sqrdmulh v20.4S, v2.4S, v11.s[0] +sub v8.4s, v3.4s, v21.4s +mul v2.4S, v2.4S,v17.s[0] +add v3.4s, v3.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v11.s[0] +sub v24.4s, v6.4s, v12.4s +mul v22.4S, v22.4S,v17.s[0] +add v6.4s, v6.4s, v12.4s +mla v27.4S, v28.4S, v31.s[0] +sub v28.4s, v18.4s, v16.4s +sqrdmulh v12.4S, v10.4S, v11.s[2] +add v18.4s, v18.4s, v16.4s +mla v26.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v3.4S, v11.s[2] +mla v2.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v25.4S, v11.s[3] +mla v22.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v8.4S, v11.s[3] +mul v10.4S, v10.4S,v17.s[2] +sub v16.4s, v13.4s, v27.4s +mul v3.4S, v3.4S,v17.s[2] +add v13.4s, v13.4s, v27.4s +mla v10.4S, v12.4S, v31.s[0] +sub v12.4s, v15.4s, v26.4s +mla v3.4S, v29.4S, v31.s[0] +add v15.4s, v15.4s, v26.4s +mul v25.4S, v25.4S,v17.s[3] +sub v26.4s, v14.4s, v2.4s +mul v8.4S, v8.4S,v17.s[3] +add v14.4s, v14.4s, v2.4s +mla v25.4S, v20.4S, v31.s[0] +sub v20.4s, v19.4s, v22.4s +mla v8.4S, v21.4S, v31.s[0] +add v19.4s, v19.4s, v22.4s +sqrdmulh v11.4S, v15.4S, v9.s[2] +mul v15.4S, v15.4S,v0.s[2] +sqrdmulh v17.4S, v12.4S, v9.s[3] +sub v22.4s, v6.4s, v10.4s +mul v12.4S, v12.4S,v0.s[3] +add v6.4s, v6.4s, v10.4s +sqrdmulh v10.4S, v20.4S, v9.s[1] +sub 
v21.4s, v18.4s, v3.4s +mul v20.4S, v20.4S,v0.s[1] +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v19.4S, v9.s[0] +sub v2.4s, v24.4s, v25.4s +mul v19.4S, v19.4S,v0.s[0] +add v24.4s, v24.4s, v25.4s +mla v15.4S, v11.4S, v31.s[0] +sub v11.4s, v28.4s, v8.4s +sqrdmulh v25.4S, v18.4S, v7.s[0] +add v28.4s, v28.4s, v8.4s +mla v12.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v21.4S, v7.s[1] +mla v20.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v28.4S, v7.s[2] +mla v19.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v11.4S, v7.s[3] +mul v18.4S, v18.4S,v23.s[0] +sub v8.4s, v13.4s, v15.4s +str q8, [x0, #336] +mul v21.4S, v21.4S,v23.s[1] +add v13.4s, v13.4s, v15.4s +str q13, [x0, #272] +mla v18.4S, v25.4S, v31.s[0] +sub v25.4s, v16.4s, v12.4s +str q25, [x0, #464] +mla v21.4S, v17.4S, v31.s[0] +add v16.4s, v16.4s, v12.4s +str q16, [x0, #400] +mul v28.4S, v28.4S,v23.s[2] +sub v16.4s, v26.4s, v20.4s +str q16, [x0, #208] +mul v11.4S, v11.4S,v23.s[3] +add v26.4s, v26.4s, v20.4s +str q26, [x0, #144] +mla v28.4S, v10.4S, v31.s[0] +sub v10.4s, v14.4s, v19.4s +str q10, [x0, #80] +mla v11.4S, v3.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +str q14, [x0, #16] +sub v7.4s, v6.4s, v18.4s +str q7, [x0, #592] +add v6.4s, v6.4s, v18.4s +str q6, [x0, #528] +sub v6.4s, v22.4s, v21.4s +str q6, [x0, #720] +add v22.4s, v22.4s, v21.4s +str q22, [x0, #656] +sub v22.4s, v24.4s, v28.4s +str q22, [x0, #848] +add v24.4s, v24.4s, v28.4s +str q24, [x0, #784] +sub v24.4s, v2.4s, v11.4s +str q24, [x0, #976] +add v2.4s, v2.4s, v11.4s +str q2, [x0, #912] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q30, [x17, #+160] +ldr q1, [x17, #+176] +ldr q27, [x17, #+192] +ldr q29, [x17, #+208] +ldr q8, [x17, #+224] +ldr q15, [x17, #+240] +ldr q13, [x0, #32] +ldr q25, [x0, #48] +ldr q17, [x0, #0] +ldr q12, [x0, #16] +sqrdmulh v16.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v16.4S, v31.s[0] +sub v16.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +sqrdmulh v13.4S, v25.4S, v5.s[0] +mul v25.4S, v25.4S,v4.s[0] +mla v25.4S, v13.4S, 
v31.s[0] +sub v13.4s, v12.4s, v25.4s +add v12.4s, v12.4s, v25.4s +sqrdmulh v25.4S, v12.4S, v5.s[1] +mul v12.4S, v12.4S,v4.s[1] +mla v12.4S, v25.4S, v31.s[0] +sub v25.4s, v17.4s, v12.4s +add v17.4s, v17.4s, v12.4s +sqrdmulh v12.4S, v13.4S, v5.s[2] +mul v13.4S, v13.4S,v4.s[2] +mla v13.4S, v12.4S, v31.s[0] +sub v12.4s, v16.4s, v13.4s +add v16.4s, v16.4s, v13.4s +trn1 v13.4S, v17.4S, v25.4S +trn2 v20.4S, v17.4S, v25.4S +trn1 v26.4S, v16.4S, v12.4S +trn2 v10.4S, v16.4S, v12.4S +trn2 v16.2D, v13.2D, v26.2D +trn2 v12.2D, v20.2D, v10.2D +trn1 v17.2D, v13.2D, v26.2D +trn1 v25.2D, v20.2D, v10.2D +sqrdmulh v10.4S, v16.4S, v1.4S +mul v16.4S, v16.4S,v30.4S +mla v16.4S, v10.4S, v31.s[0] +sub v10.4s, v17.4s, v16.4s +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v1.4S +mul v12.4S, v12.4S,v30.4S +mla v12.4S, v16.4S, v31.s[0] +sub v16.4s, v25.4s, v12.4s +add v25.4s, v25.4s, v12.4s +sqrdmulh v12.4S, v25.4S, v29.4S +mul v25.4S, v25.4S,v27.4S +mla v25.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v25.4s +add v17.4s, v17.4s, v25.4s +sqrdmulh v25.4S, v16.4S, v15.4S +mul v16.4S, v16.4S,v8.4S +mla v16.4S, v25.4S, v31.s[0] +sub v25.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +str q17, [x0, #0] +str q12, [x0, #16] +str q10, [x0, #32] +str q25, [x0, #48] +ldr q25, [x17, #+256] +ldr q10, [x17, #+272] +ldr q12, [x17, #+288] +ldr q17, [x17, #+304] +ldr q16, [x17, #+320] +ldr q20, [x17, #+336] +ldr q26, [x17, #+352] +ldr q13, [x17, #+368] +ldr q15, [x0, #96] +ldr q8, [x0, #112] +ldr q29, [x0, #64] +ldr q27, [x0, #80] +sqrdmulh v1.4S, v15.4S, v10.s[0] +mul v15.4S, v15.4S,v25.s[0] +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v29.4s, v15.4s +add v29.4s, v29.4s, v15.4s +sqrdmulh v15.4S, v8.4S, v10.s[0] +mul v8.4S, v8.4S,v25.s[0] +mla v8.4S, v15.4S, v31.s[0] +sub v15.4s, v27.4s, v8.4s +add v27.4s, v27.4s, v8.4s +sqrdmulh v8.4S, v27.4S, v10.s[1] +mul v27.4S, v27.4S,v25.s[1] +mla v27.4S, v8.4S, v31.s[0] +sub v8.4s, v29.4s, v27.4s +add v29.4s, v29.4s, v27.4s +sqrdmulh v27.4S, v15.4S, v10.s[2] +mul 
v15.4S, v15.4S,v25.s[2] +mla v15.4S, v27.4S, v31.s[0] +sub v27.4s, v1.4s, v15.4s +add v1.4s, v1.4s, v15.4s +trn1 v15.4S, v29.4S, v8.4S +trn2 v30.4S, v29.4S, v8.4S +trn1 v5.4S, v1.4S, v27.4S +trn2 v4.4S, v1.4S, v27.4S +trn2 v1.2D, v15.2D, v5.2D +trn2 v27.2D, v30.2D, v4.2D +trn1 v29.2D, v15.2D, v5.2D +trn1 v8.2D, v30.2D, v4.2D +sqrdmulh v4.4S, v1.4S, v17.4S +mul v1.4S, v1.4S,v12.4S +mla v1.4S, v4.4S, v31.s[0] +sub v4.4s, v29.4s, v1.4s +add v29.4s, v29.4s, v1.4s +sqrdmulh v1.4S, v27.4S, v17.4S +mul v27.4S, v27.4S,v12.4S +mla v27.4S, v1.4S, v31.s[0] +sub v1.4s, v8.4s, v27.4s +add v8.4s, v8.4s, v27.4s +sqrdmulh v27.4S, v8.4S, v20.4S +mul v8.4S, v8.4S,v16.4S +mla v8.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v8.4s +add v29.4s, v29.4s, v8.4s +sqrdmulh v8.4S, v1.4S, v13.4S +mul v1.4S, v1.4S,v26.4S +mla v1.4S, v8.4S, v31.s[0] +sub v8.4s, v4.4s, v1.4s +add v4.4s, v4.4s, v1.4s +str q29, [x0, #64] +str q27, [x0, #80] +str q4, [x0, #96] +str q8, [x0, #112] +ldr q8, [x17, #+384] +ldr q4, [x17, #+400] +ldr q27, [x17, #+416] +ldr q29, [x17, #+432] +ldr q1, [x17, #+448] +ldr q30, [x17, #+464] +ldr q5, [x17, #+480] +ldr q15, [x17, #+496] +ldr q13, [x0, #160] +ldr q26, [x0, #176] +ldr q20, [x0, #128] +ldr q16, [x0, #144] +sqrdmulh v17.4S, v13.4S, v4.s[0] +mul v13.4S, v13.4S,v8.s[0] +mla v13.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v13.4s +add v20.4s, v20.4s, v13.4s +sqrdmulh v13.4S, v26.4S, v4.s[0] +mul v26.4S, v26.4S,v8.s[0] +mla v26.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v26.4s +add v16.4s, v16.4s, v26.4s +sqrdmulh v26.4S, v16.4S, v4.s[1] +mul v16.4S, v16.4S,v8.s[1] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v20.4s, v16.4s +add v20.4s, v20.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v4.s[2] +mul v13.4S, v13.4S,v8.s[2] +mla v13.4S, v16.4S, v31.s[0] +sub v16.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +trn1 v13.4S, v20.4S, v26.4S +trn2 v12.4S, v20.4S, v26.4S +trn1 v10.4S, v17.4S, v16.4S +trn2 v25.4S, v17.4S, v16.4S +trn2 v17.2D, v13.2D, v10.2D +trn2 v16.2D, v12.2D, v25.2D +trn1 
v20.2D, v13.2D, v10.2D +trn1 v26.2D, v12.2D, v25.2D +sqrdmulh v25.4S, v17.4S, v29.4S +mul v17.4S, v17.4S,v27.4S +mla v17.4S, v25.4S, v31.s[0] +sub v25.4s, v20.4s, v17.4s +add v20.4s, v20.4s, v17.4s +sqrdmulh v17.4S, v16.4S, v29.4S +mul v16.4S, v16.4S,v27.4S +mla v16.4S, v17.4S, v31.s[0] +sub v17.4s, v26.4s, v16.4s +add v26.4s, v26.4s, v16.4s +sqrdmulh v16.4S, v26.4S, v30.4S +mul v26.4S, v26.4S,v1.4S +mla v26.4S, v16.4S, v31.s[0] +sub v16.4s, v20.4s, v26.4s +add v20.4s, v20.4s, v26.4s +sqrdmulh v26.4S, v17.4S, v15.4S +mul v17.4S, v17.4S,v5.4S +mla v17.4S, v26.4S, v31.s[0] +sub v26.4s, v25.4s, v17.4s +add v25.4s, v25.4s, v17.4s +str q20, [x0, #128] +str q16, [x0, #144] +str q25, [x0, #160] +str q26, [x0, #176] +ldr q26, [x17, #+512] +ldr q25, [x17, #+528] +ldr q16, [x17, #+544] +ldr q20, [x17, #+560] +ldr q17, [x17, #+576] +ldr q12, [x17, #+592] +ldr q10, [x17, #+608] +ldr q13, [x17, #+624] +ldr q15, [x0, #224] +ldr q5, [x0, #240] +ldr q30, [x0, #192] +ldr q1, [x0, #208] +sqrdmulh v29.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +mla v15.4S, v29.4S, v31.s[0] +sub v29.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +sqrdmulh v15.4S, v5.4S, v25.s[0] +mul v5.4S, v5.4S,v26.s[0] +mla v5.4S, v15.4S, v31.s[0] +sub v15.4s, v1.4s, v5.4s +add v1.4s, v1.4s, v5.4s +sqrdmulh v5.4S, v1.4S, v25.s[1] +mul v1.4S, v1.4S,v26.s[1] +mla v1.4S, v5.4S, v31.s[0] +sub v5.4s, v30.4s, v1.4s +add v30.4s, v30.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v25.s[2] +mul v15.4S, v15.4S,v26.s[2] +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v29.4s, v15.4s +add v29.4s, v29.4s, v15.4s +trn1 v15.4S, v30.4S, v5.4S +trn2 v27.4S, v30.4S, v5.4S +trn1 v4.4S, v29.4S, v1.4S +trn2 v8.4S, v29.4S, v1.4S +trn2 v29.2D, v15.2D, v4.2D +trn2 v1.2D, v27.2D, v8.2D +trn1 v30.2D, v15.2D, v4.2D +trn1 v5.2D, v27.2D, v8.2D +sqrdmulh v8.4S, v29.4S, v20.4S +mul v29.4S, v29.4S,v16.4S +mla v29.4S, v8.4S, v31.s[0] +sub v8.4s, v30.4s, v29.4s +add v30.4s, v30.4s, v29.4s +sqrdmulh v29.4S, v1.4S, v20.4S +mul v1.4S, v1.4S,v16.4S +mla v1.4S, 
v29.4S, v31.s[0] +sub v29.4s, v5.4s, v1.4s +add v5.4s, v5.4s, v1.4s +sqrdmulh v1.4S, v5.4S, v12.4S +mul v5.4S, v5.4S,v17.4S +mla v5.4S, v1.4S, v31.s[0] +sub v1.4s, v30.4s, v5.4s +add v30.4s, v30.4s, v5.4s +sqrdmulh v5.4S, v29.4S, v13.4S +mul v29.4S, v29.4S,v10.4S +mla v29.4S, v5.4S, v31.s[0] +sub v5.4s, v8.4s, v29.4s +add v8.4s, v8.4s, v29.4s +str q30, [x0, #192] +str q1, [x0, #208] +str q8, [x0, #224] +str q5, [x0, #240] +ldr q5, [x17, #+640] +ldr q8, [x17, #+656] +ldr q1, [x17, #+672] +ldr q30, [x17, #+688] +ldr q29, [x17, #+704] +ldr q27, [x17, #+720] +ldr q4, [x17, #+736] +ldr q15, [x17, #+752] +ldr q13, [x0, #288] +ldr q10, [x0, #304] +ldr q12, [x0, #256] +ldr q17, [x0, #272] +sqrdmulh v20.4S, v13.4S, v8.s[0] +mul v13.4S, v13.4S,v5.s[0] +mla v13.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v13.4s +add v12.4s, v12.4s, v13.4s +sqrdmulh v13.4S, v10.4S, v8.s[0] +mul v10.4S, v10.4S,v5.s[0] +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v10.4s +add v17.4s, v17.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v8.s[1] +mul v17.4S, v17.4S,v5.s[1] +mla v17.4S, v10.4S, v31.s[0] +sub v10.4s, v12.4s, v17.4s +add v12.4s, v12.4s, v17.4s +sqrdmulh v17.4S, v13.4S, v8.s[2] +mul v13.4S, v13.4S,v5.s[2] +mla v13.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v13.4s +add v20.4s, v20.4s, v13.4s +trn1 v13.4S, v12.4S, v10.4S +trn2 v16.4S, v12.4S, v10.4S +trn1 v25.4S, v20.4S, v17.4S +trn2 v26.4S, v20.4S, v17.4S +trn2 v20.2D, v13.2D, v25.2D +trn2 v17.2D, v16.2D, v26.2D +trn1 v12.2D, v13.2D, v25.2D +trn1 v10.2D, v16.2D, v26.2D +sqrdmulh v26.4S, v20.4S, v30.4S +mul v20.4S, v20.4S,v1.4S +mla v20.4S, v26.4S, v31.s[0] +sub v26.4s, v12.4s, v20.4s +add v12.4s, v12.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v30.4S +mul v17.4S, v17.4S,v1.4S +mla v17.4S, v20.4S, v31.s[0] +sub v20.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v10.4S, v27.4S +mul v10.4S, v10.4S,v29.4S +mla v10.4S, v17.4S, v31.s[0] +sub v17.4s, v12.4s, v10.4s +add v12.4s, v12.4s, v10.4s +sqrdmulh v10.4S, v20.4S, v15.4S +mul v20.4S, 
v20.4S,v4.4S +mla v20.4S, v10.4S, v31.s[0] +sub v10.4s, v26.4s, v20.4s +add v26.4s, v26.4s, v20.4s +str q12, [x0, #256] +str q17, [x0, #272] +str q26, [x0, #288] +str q10, [x0, #304] +ldr q10, [x17, #+768] +ldr q26, [x17, #+784] +ldr q17, [x17, #+800] +ldr q12, [x17, #+816] +ldr q20, [x17, #+832] +ldr q16, [x17, #+848] +ldr q25, [x17, #+864] +ldr q13, [x17, #+880] +ldr q15, [x0, #352] +ldr q4, [x0, #368] +ldr q27, [x0, #320] +ldr q29, [x0, #336] +sqrdmulh v30.4S, v15.4S, v26.s[0] +mul v15.4S, v15.4S,v10.s[0] +mla v15.4S, v30.4S, v31.s[0] +sub v30.4s, v27.4s, v15.4s +add v27.4s, v27.4s, v15.4s +sqrdmulh v15.4S, v4.4S, v26.s[0] +mul v4.4S, v4.4S,v10.s[0] +mla v4.4S, v15.4S, v31.s[0] +sub v15.4s, v29.4s, v4.4s +add v29.4s, v29.4s, v4.4s +sqrdmulh v4.4S, v29.4S, v26.s[1] +mul v29.4S, v29.4S,v10.s[1] +mla v29.4S, v4.4S, v31.s[0] +sub v4.4s, v27.4s, v29.4s +add v27.4s, v27.4s, v29.4s +sqrdmulh v29.4S, v15.4S, v26.s[2] +mul v15.4S, v15.4S,v10.s[2] +mla v15.4S, v29.4S, v31.s[0] +sub v29.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +trn1 v15.4S, v27.4S, v4.4S +trn2 v1.4S, v27.4S, v4.4S +trn1 v8.4S, v30.4S, v29.4S +trn2 v5.4S, v30.4S, v29.4S +trn2 v30.2D, v15.2D, v8.2D +trn2 v29.2D, v1.2D, v5.2D +trn1 v27.2D, v15.2D, v8.2D +trn1 v4.2D, v1.2D, v5.2D +sqrdmulh v5.4S, v30.4S, v12.4S +mul v30.4S, v30.4S,v17.4S +mla v30.4S, v5.4S, v31.s[0] +sub v5.4s, v27.4s, v30.4s +add v27.4s, v27.4s, v30.4s +sqrdmulh v30.4S, v29.4S, v12.4S +mul v29.4S, v29.4S,v17.4S +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v4.4s, v29.4s +add v4.4s, v4.4s, v29.4s +sqrdmulh v29.4S, v4.4S, v16.4S +mul v4.4S, v4.4S,v20.4S +mla v4.4S, v29.4S, v31.s[0] +sub v29.4s, v27.4s, v4.4s +add v27.4s, v27.4s, v4.4s +sqrdmulh v4.4S, v30.4S, v13.4S +mul v30.4S, v30.4S,v25.4S +mla v30.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v30.4s +add v5.4s, v5.4s, v30.4s +str q27, [x0, #320] +str q29, [x0, #336] +str q5, [x0, #352] +str q4, [x0, #368] +ldr q4, [x17, #+896] +ldr q5, [x17, #+912] +ldr q29, [x17, #+928] +ldr q27, [x17, #+944] 
+ldr q30, [x17, #+960] +ldr q1, [x17, #+976] +ldr q8, [x17, #+992] +ldr q15, [x17, #+1008] +ldr q13, [x0, #416] +ldr q25, [x0, #432] +ldr q16, [x0, #384] +ldr q20, [x0, #400] +sqrdmulh v12.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v12.4S, v31.s[0] +sub v12.4s, v16.4s, v13.4s +add v16.4s, v16.4s, v13.4s +sqrdmulh v13.4S, v25.4S, v5.s[0] +mul v25.4S, v25.4S,v4.s[0] +mla v25.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v25.4s +add v20.4s, v20.4s, v25.4s +sqrdmulh v25.4S, v20.4S, v5.s[1] +mul v20.4S, v20.4S,v4.s[1] +mla v20.4S, v25.4S, v31.s[0] +sub v25.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v13.4S, v5.s[2] +mul v13.4S, v13.4S,v4.s[2] +mla v13.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v13.4s +add v12.4s, v12.4s, v13.4s +trn1 v13.4S, v16.4S, v25.4S +trn2 v17.4S, v16.4S, v25.4S +trn1 v26.4S, v12.4S, v20.4S +trn2 v10.4S, v12.4S, v20.4S +trn2 v12.2D, v13.2D, v26.2D +trn2 v20.2D, v17.2D, v10.2D +trn1 v16.2D, v13.2D, v26.2D +trn1 v25.2D, v17.2D, v10.2D +sqrdmulh v10.4S, v12.4S, v27.4S +mul v12.4S, v12.4S,v29.4S +mla v12.4S, v10.4S, v31.s[0] +sub v10.4s, v16.4s, v12.4s +add v16.4s, v16.4s, v12.4s +sqrdmulh v12.4S, v20.4S, v27.4S +mul v20.4S, v20.4S,v29.4S +mla v20.4S, v12.4S, v31.s[0] +sub v12.4s, v25.4s, v20.4s +add v25.4s, v25.4s, v20.4s +sqrdmulh v20.4S, v25.4S, v1.4S +mul v25.4S, v25.4S,v30.4S +mla v25.4S, v20.4S, v31.s[0] +sub v20.4s, v16.4s, v25.4s +add v16.4s, v16.4s, v25.4s +sqrdmulh v25.4S, v12.4S, v15.4S +mul v12.4S, v12.4S,v8.4S +mla v12.4S, v25.4S, v31.s[0] +sub v25.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +str q16, [x0, #384] +str q20, [x0, #400] +str q10, [x0, #416] +str q25, [x0, #432] +ldr q25, [x17, #+1024] +ldr q10, [x17, #+1040] +ldr q20, [x17, #+1056] +ldr q16, [x17, #+1072] +ldr q12, [x17, #+1088] +ldr q17, [x17, #+1104] +ldr q26, [x17, #+1120] +ldr q13, [x17, #+1136] +ldr q15, [x0, #480] +ldr q8, [x0, #496] +ldr q1, [x0, #448] +ldr q30, [x0, #464] +sqrdmulh v27.4S, v15.4S, v10.s[0] +mul v15.4S, 
v15.4S,v25.s[0] +mla v15.4S, v27.4S, v31.s[0] +sub v27.4s, v1.4s, v15.4s +add v1.4s, v1.4s, v15.4s +sqrdmulh v15.4S, v8.4S, v10.s[0] +mul v8.4S, v8.4S,v25.s[0] +mla v8.4S, v15.4S, v31.s[0] +sub v15.4s, v30.4s, v8.4s +add v30.4s, v30.4s, v8.4s +sqrdmulh v8.4S, v30.4S, v10.s[1] +mul v30.4S, v30.4S,v25.s[1] +mla v30.4S, v8.4S, v31.s[0] +sub v8.4s, v1.4s, v30.4s +add v1.4s, v1.4s, v30.4s +sqrdmulh v30.4S, v15.4S, v10.s[2] +mul v15.4S, v15.4S,v25.s[2] +mla v15.4S, v30.4S, v31.s[0] +sub v30.4s, v27.4s, v15.4s +add v27.4s, v27.4s, v15.4s +trn1 v15.4S, v1.4S, v8.4S +trn2 v29.4S, v1.4S, v8.4S +trn1 v5.4S, v27.4S, v30.4S +trn2 v4.4S, v27.4S, v30.4S +trn2 v27.2D, v15.2D, v5.2D +trn2 v30.2D, v29.2D, v4.2D +trn1 v1.2D, v15.2D, v5.2D +trn1 v8.2D, v29.2D, v4.2D +sqrdmulh v4.4S, v27.4S, v16.4S +mul v27.4S, v27.4S,v20.4S +mla v27.4S, v4.4S, v31.s[0] +sub v4.4s, v1.4s, v27.4s +add v1.4s, v1.4s, v27.4s +sqrdmulh v27.4S, v30.4S, v16.4S +mul v30.4S, v30.4S,v20.4S +mla v30.4S, v27.4S, v31.s[0] +sub v27.4s, v8.4s, v30.4s +add v8.4s, v8.4s, v30.4s +sqrdmulh v30.4S, v8.4S, v17.4S +mul v8.4S, v8.4S,v12.4S +mla v8.4S, v30.4S, v31.s[0] +sub v30.4s, v1.4s, v8.4s +add v1.4s, v1.4s, v8.4s +sqrdmulh v8.4S, v27.4S, v13.4S +mul v27.4S, v27.4S,v26.4S +mla v27.4S, v8.4S, v31.s[0] +sub v8.4s, v4.4s, v27.4s +add v4.4s, v4.4s, v27.4s +str q1, [x0, #448] +str q30, [x0, #464] +str q4, [x0, #480] +str q8, [x0, #496] +ldr q8, [x17, #+1152] +ldr q4, [x17, #+1168] +ldr q30, [x17, #+1184] +ldr q1, [x17, #+1200] +ldr q27, [x17, #+1216] +ldr q29, [x17, #+1232] +ldr q5, [x17, #+1248] +ldr q15, [x17, #+1264] +ldr q13, [x0, #544] +ldr q26, [x0, #560] +ldr q17, [x0, #512] +ldr q12, [x0, #528] +sqrdmulh v16.4S, v13.4S, v4.s[0] +mul v13.4S, v13.4S,v8.s[0] +mla v13.4S, v16.4S, v31.s[0] +sub v16.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +sqrdmulh v13.4S, v26.4S, v4.s[0] +mul v26.4S, v26.4S,v8.s[0] +mla v26.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v26.4s +add v12.4s, v12.4s, v26.4s +sqrdmulh v26.4S, v12.4S, v4.s[1] 
+mul v12.4S, v12.4S,v8.s[1] +mla v12.4S, v26.4S, v31.s[0] +sub v26.4s, v17.4s, v12.4s +add v17.4s, v17.4s, v12.4s +sqrdmulh v12.4S, v13.4S, v4.s[2] +mul v13.4S, v13.4S,v8.s[2] +mla v13.4S, v12.4S, v31.s[0] +sub v12.4s, v16.4s, v13.4s +add v16.4s, v16.4s, v13.4s +trn1 v13.4S, v17.4S, v26.4S +trn2 v20.4S, v17.4S, v26.4S +trn1 v10.4S, v16.4S, v12.4S +trn2 v25.4S, v16.4S, v12.4S +trn2 v16.2D, v13.2D, v10.2D +trn2 v12.2D, v20.2D, v25.2D +trn1 v17.2D, v13.2D, v10.2D +trn1 v26.2D, v20.2D, v25.2D +sqrdmulh v25.4S, v16.4S, v1.4S +mul v16.4S, v16.4S,v30.4S +mla v16.4S, v25.4S, v31.s[0] +sub v25.4s, v17.4s, v16.4s +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v1.4S +mul v12.4S, v12.4S,v30.4S +mla v12.4S, v16.4S, v31.s[0] +sub v16.4s, v26.4s, v12.4s +add v26.4s, v26.4s, v12.4s +sqrdmulh v12.4S, v26.4S, v29.4S +mul v26.4S, v26.4S,v27.4S +mla v26.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v26.4s +add v17.4s, v17.4s, v26.4s +sqrdmulh v26.4S, v16.4S, v15.4S +mul v16.4S, v16.4S,v5.4S +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v25.4s, v16.4s +add v25.4s, v25.4s, v16.4s +str q17, [x0, #512] +str q12, [x0, #528] +str q25, [x0, #544] +str q26, [x0, #560] +ldr q26, [x17, #+1280] +ldr q25, [x17, #+1296] +ldr q12, [x17, #+1312] +ldr q17, [x17, #+1328] +ldr q16, [x17, #+1344] +ldr q20, [x17, #+1360] +ldr q10, [x17, #+1376] +ldr q13, [x17, #+1392] +ldr q15, [x0, #608] +ldr q5, [x0, #624] +ldr q29, [x0, #576] +ldr q27, [x0, #592] +sqrdmulh v1.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v29.4s, v15.4s +add v29.4s, v29.4s, v15.4s +sqrdmulh v15.4S, v5.4S, v25.s[0] +mul v5.4S, v5.4S,v26.s[0] +mla v5.4S, v15.4S, v31.s[0] +sub v15.4s, v27.4s, v5.4s +add v27.4s, v27.4s, v5.4s +sqrdmulh v5.4S, v27.4S, v25.s[1] +mul v27.4S, v27.4S,v26.s[1] +mla v27.4S, v5.4S, v31.s[0] +sub v5.4s, v29.4s, v27.4s +add v29.4s, v29.4s, v27.4s +sqrdmulh v27.4S, v15.4S, v25.s[2] +mul v15.4S, v15.4S,v26.s[2] +mla v15.4S, v27.4S, v31.s[0] +sub v27.4s, v1.4s, v15.4s +add 
v1.4s, v1.4s, v15.4s +trn1 v15.4S, v29.4S, v5.4S +trn2 v30.4S, v29.4S, v5.4S +trn1 v4.4S, v1.4S, v27.4S +trn2 v8.4S, v1.4S, v27.4S +trn2 v1.2D, v15.2D, v4.2D +trn2 v27.2D, v30.2D, v8.2D +trn1 v29.2D, v15.2D, v4.2D +trn1 v5.2D, v30.2D, v8.2D +sqrdmulh v8.4S, v1.4S, v17.4S +mul v1.4S, v1.4S,v12.4S +mla v1.4S, v8.4S, v31.s[0] +sub v8.4s, v29.4s, v1.4s +add v29.4s, v29.4s, v1.4s +sqrdmulh v1.4S, v27.4S, v17.4S +mul v27.4S, v27.4S,v12.4S +mla v27.4S, v1.4S, v31.s[0] +sub v1.4s, v5.4s, v27.4s +add v5.4s, v5.4s, v27.4s +sqrdmulh v27.4S, v5.4S, v20.4S +mul v5.4S, v5.4S,v16.4S +mla v5.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v5.4s +add v29.4s, v29.4s, v5.4s +sqrdmulh v5.4S, v1.4S, v13.4S +mul v1.4S, v1.4S,v10.4S +mla v1.4S, v5.4S, v31.s[0] +sub v5.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +str q29, [x0, #576] +str q27, [x0, #592] +str q8, [x0, #608] +str q5, [x0, #624] +ldr q5, [x17, #+1408] +ldr q8, [x17, #+1424] +ldr q27, [x17, #+1440] +ldr q29, [x17, #+1456] +ldr q1, [x17, #+1472] +ldr q30, [x17, #+1488] +ldr q4, [x17, #+1504] +ldr q15, [x17, #+1520] +ldr q13, [x0, #672] +ldr q10, [x0, #688] +ldr q20, [x0, #640] +ldr q16, [x0, #656] +sqrdmulh v17.4S, v13.4S, v8.s[0] +mul v13.4S, v13.4S,v5.s[0] +mla v13.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v13.4s +add v20.4s, v20.4s, v13.4s +sqrdmulh v13.4S, v10.4S, v8.s[0] +mul v10.4S, v10.4S,v5.s[0] +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v10.4s +add v16.4s, v16.4s, v10.4s +sqrdmulh v10.4S, v16.4S, v8.s[1] +mul v16.4S, v16.4S,v5.s[1] +mla v16.4S, v10.4S, v31.s[0] +sub v10.4s, v20.4s, v16.4s +add v20.4s, v20.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v8.s[2] +mul v13.4S, v13.4S,v5.s[2] +mla v13.4S, v16.4S, v31.s[0] +sub v16.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +trn1 v13.4S, v20.4S, v10.4S +trn2 v12.4S, v20.4S, v10.4S +trn1 v25.4S, v17.4S, v16.4S +trn2 v26.4S, v17.4S, v16.4S +trn2 v17.2D, v13.2D, v25.2D +trn2 v16.2D, v12.2D, v26.2D +trn1 v20.2D, v13.2D, v25.2D +trn1 v10.2D, v12.2D, v26.2D +sqrdmulh v26.4S, v17.4S, 
v29.4S +mul v17.4S, v17.4S,v27.4S +mla v17.4S, v26.4S, v31.s[0] +sub v26.4s, v20.4s, v17.4s +add v20.4s, v20.4s, v17.4s +sqrdmulh v17.4S, v16.4S, v29.4S +mul v16.4S, v16.4S,v27.4S +mla v16.4S, v17.4S, v31.s[0] +sub v17.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sqrdmulh v16.4S, v10.4S, v30.4S +mul v10.4S, v10.4S,v1.4S +mla v10.4S, v16.4S, v31.s[0] +sub v16.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v15.4S +mul v17.4S, v17.4S,v4.4S +mla v17.4S, v10.4S, v31.s[0] +sub v10.4s, v26.4s, v17.4s +add v26.4s, v26.4s, v17.4s +str q20, [x0, #640] +str q16, [x0, #656] +str q26, [x0, #672] +str q10, [x0, #688] +ldr q10, [x17, #+1536] +ldr q26, [x17, #+1552] +ldr q16, [x17, #+1568] +ldr q20, [x17, #+1584] +ldr q17, [x17, #+1600] +ldr q12, [x17, #+1616] +ldr q25, [x17, #+1632] +ldr q13, [x17, #+1648] +ldr q15, [x0, #736] +ldr q4, [x0, #752] +ldr q30, [x0, #704] +ldr q1, [x0, #720] +sqrdmulh v29.4S, v15.4S, v26.s[0] +mul v15.4S, v15.4S,v10.s[0] +mla v15.4S, v29.4S, v31.s[0] +sub v29.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +sqrdmulh v15.4S, v4.4S, v26.s[0] +mul v4.4S, v4.4S,v10.s[0] +mla v4.4S, v15.4S, v31.s[0] +sub v15.4s, v1.4s, v4.4s +add v1.4s, v1.4s, v4.4s +sqrdmulh v4.4S, v1.4S, v26.s[1] +mul v1.4S, v1.4S,v10.s[1] +mla v1.4S, v4.4S, v31.s[0] +sub v4.4s, v30.4s, v1.4s +add v30.4s, v30.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v26.s[2] +mul v15.4S, v15.4S,v10.s[2] +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v29.4s, v15.4s +add v29.4s, v29.4s, v15.4s +trn1 v15.4S, v30.4S, v4.4S +trn2 v27.4S, v30.4S, v4.4S +trn1 v8.4S, v29.4S, v1.4S +trn2 v5.4S, v29.4S, v1.4S +trn2 v29.2D, v15.2D, v8.2D +trn2 v1.2D, v27.2D, v5.2D +trn1 v30.2D, v15.2D, v8.2D +trn1 v4.2D, v27.2D, v5.2D +sqrdmulh v5.4S, v29.4S, v20.4S +mul v29.4S, v29.4S,v16.4S +mla v29.4S, v5.4S, v31.s[0] +sub v5.4s, v30.4s, v29.4s +add v30.4s, v30.4s, v29.4s +sqrdmulh v29.4S, v1.4S, v20.4S +mul v1.4S, v1.4S,v16.4S +mla v1.4S, v29.4S, v31.s[0] +sub v29.4s, v4.4s, v1.4s +add v4.4s, v4.4s, v1.4s 
+sqrdmulh v1.4S, v4.4S, v12.4S +mul v4.4S, v4.4S,v17.4S +mla v4.4S, v1.4S, v31.s[0] +sub v1.4s, v30.4s, v4.4s +add v30.4s, v30.4s, v4.4s +sqrdmulh v4.4S, v29.4S, v13.4S +mul v29.4S, v29.4S,v25.4S +mla v29.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v29.4s +add v5.4s, v5.4s, v29.4s +str q30, [x0, #704] +str q1, [x0, #720] +str q5, [x0, #736] +str q4, [x0, #752] +ldr q4, [x17, #+1664] +ldr q5, [x17, #+1680] +ldr q1, [x17, #+1696] +ldr q30, [x17, #+1712] +ldr q29, [x17, #+1728] +ldr q27, [x17, #+1744] +ldr q8, [x17, #+1760] +ldr q15, [x17, #+1776] +ldr q13, [x0, #800] +ldr q25, [x0, #816] +ldr q12, [x0, #768] +ldr q17, [x0, #784] +sqrdmulh v20.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v13.4s +add v12.4s, v12.4s, v13.4s +sqrdmulh v13.4S, v25.4S, v5.s[0] +mul v25.4S, v25.4S,v4.s[0] +mla v25.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v25.4s +add v17.4s, v17.4s, v25.4s +sqrdmulh v25.4S, v17.4S, v5.s[1] +mul v17.4S, v17.4S,v4.s[1] +mla v17.4S, v25.4S, v31.s[0] +sub v25.4s, v12.4s, v17.4s +add v12.4s, v12.4s, v17.4s +sqrdmulh v17.4S, v13.4S, v5.s[2] +mul v13.4S, v13.4S,v4.s[2] +mla v13.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v13.4s +add v20.4s, v20.4s, v13.4s +trn1 v13.4S, v12.4S, v25.4S +trn2 v16.4S, v12.4S, v25.4S +trn1 v26.4S, v20.4S, v17.4S +trn2 v10.4S, v20.4S, v17.4S +trn2 v20.2D, v13.2D, v26.2D +trn2 v17.2D, v16.2D, v10.2D +trn1 v12.2D, v13.2D, v26.2D +trn1 v25.2D, v16.2D, v10.2D +sqrdmulh v10.4S, v20.4S, v30.4S +mul v20.4S, v20.4S,v1.4S +mla v20.4S, v10.4S, v31.s[0] +sub v10.4s, v12.4s, v20.4s +add v12.4s, v12.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v30.4S +mul v17.4S, v17.4S,v1.4S +mla v17.4S, v20.4S, v31.s[0] +sub v20.4s, v25.4s, v17.4s +add v25.4s, v25.4s, v17.4s +sqrdmulh v17.4S, v25.4S, v27.4S +mul v25.4S, v25.4S,v29.4S +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v12.4s, v25.4s +add v12.4s, v12.4s, v25.4s +sqrdmulh v25.4S, v20.4S, v15.4S +mul v20.4S, v20.4S,v8.4S +mla v20.4S, v25.4S, v31.s[0] +sub v25.4s, v10.4s, 
v20.4s +add v10.4s, v10.4s, v20.4s +str q12, [x0, #768] +str q17, [x0, #784] +str q10, [x0, #800] +str q25, [x0, #816] +ldr q25, [x17, #+1792] +ldr q10, [x17, #+1808] +ldr q17, [x17, #+1824] +ldr q12, [x17, #+1840] +ldr q20, [x17, #+1856] +ldr q16, [x17, #+1872] +ldr q26, [x17, #+1888] +ldr q13, [x17, #+1904] +ldr q15, [x0, #864] +ldr q8, [x0, #880] +ldr q27, [x0, #832] +ldr q29, [x0, #848] +sqrdmulh v30.4S, v15.4S, v10.s[0] +mul v15.4S, v15.4S,v25.s[0] +mla v15.4S, v30.4S, v31.s[0] +sub v30.4s, v27.4s, v15.4s +add v27.4s, v27.4s, v15.4s +sqrdmulh v15.4S, v8.4S, v10.s[0] +mul v8.4S, v8.4S,v25.s[0] +mla v8.4S, v15.4S, v31.s[0] +sub v15.4s, v29.4s, v8.4s +add v29.4s, v29.4s, v8.4s +sqrdmulh v8.4S, v29.4S, v10.s[1] +mul v29.4S, v29.4S,v25.s[1] +mla v29.4S, v8.4S, v31.s[0] +sub v8.4s, v27.4s, v29.4s +add v27.4s, v27.4s, v29.4s +sqrdmulh v29.4S, v15.4S, v10.s[2] +mul v15.4S, v15.4S,v25.s[2] +mla v15.4S, v29.4S, v31.s[0] +sub v29.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +trn1 v15.4S, v27.4S, v8.4S +trn2 v1.4S, v27.4S, v8.4S +trn1 v5.4S, v30.4S, v29.4S +trn2 v4.4S, v30.4S, v29.4S +trn2 v30.2D, v15.2D, v5.2D +trn2 v29.2D, v1.2D, v4.2D +trn1 v27.2D, v15.2D, v5.2D +trn1 v8.2D, v1.2D, v4.2D +sqrdmulh v4.4S, v30.4S, v12.4S +mul v30.4S, v30.4S,v17.4S +mla v30.4S, v4.4S, v31.s[0] +sub v4.4s, v27.4s, v30.4s +add v27.4s, v27.4s, v30.4s +sqrdmulh v30.4S, v29.4S, v12.4S +mul v29.4S, v29.4S,v17.4S +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v8.4s, v29.4s +add v8.4s, v8.4s, v29.4s +sqrdmulh v29.4S, v8.4S, v16.4S +mul v8.4S, v8.4S,v20.4S +mla v8.4S, v29.4S, v31.s[0] +sub v29.4s, v27.4s, v8.4s +add v27.4s, v27.4s, v8.4s +sqrdmulh v8.4S, v30.4S, v13.4S +mul v30.4S, v30.4S,v26.4S +mla v30.4S, v8.4S, v31.s[0] +sub v8.4s, v4.4s, v30.4s +add v4.4s, v4.4s, v30.4s +str q27, [x0, #832] +str q29, [x0, #848] +str q4, [x0, #864] +str q8, [x0, #880] +ldr q8, [x17, #+1920] +ldr q4, [x17, #+1936] +ldr q29, [x17, #+1952] +ldr q27, [x17, #+1968] +ldr q30, [x17, #+1984] +ldr q1, [x17, #+2000] +ldr 
q5, [x17, #+2016] +ldr q15, [x17, #+2032] +ldr q13, [x0, #928] +ldr q26, [x0, #944] +ldr q16, [x0, #896] +ldr q20, [x0, #912] +sqrdmulh v12.4S, v13.4S, v4.s[0] +mul v13.4S, v13.4S,v8.s[0] +mla v13.4S, v12.4S, v31.s[0] +sub v12.4s, v16.4s, v13.4s +add v16.4s, v16.4s, v13.4s +sqrdmulh v13.4S, v26.4S, v4.s[0] +mul v26.4S, v26.4S,v8.s[0] +mla v26.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v26.4s +add v20.4s, v20.4s, v26.4s +sqrdmulh v26.4S, v20.4S, v4.s[1] +mul v20.4S, v20.4S,v8.s[1] +mla v20.4S, v26.4S, v31.s[0] +sub v26.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v13.4S, v4.s[2] +mul v13.4S, v13.4S,v8.s[2] +mla v13.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v13.4s +add v12.4s, v12.4s, v13.4s +trn1 v13.4S, v16.4S, v26.4S +trn2 v17.4S, v16.4S, v26.4S +trn1 v10.4S, v12.4S, v20.4S +trn2 v25.4S, v12.4S, v20.4S +trn2 v12.2D, v13.2D, v10.2D +trn2 v20.2D, v17.2D, v25.2D +trn1 v16.2D, v13.2D, v10.2D +trn1 v26.2D, v17.2D, v25.2D +sqrdmulh v25.4S, v12.4S, v27.4S +mul v12.4S, v12.4S,v29.4S +mla v12.4S, v25.4S, v31.s[0] +sub v25.4s, v16.4s, v12.4s +add v16.4s, v16.4s, v12.4s +sqrdmulh v12.4S, v20.4S, v27.4S +mul v20.4S, v20.4S,v29.4S +mla v20.4S, v12.4S, v31.s[0] +sub v12.4s, v26.4s, v20.4s +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v26.4S, v1.4S +mul v26.4S, v26.4S,v30.4S +mla v26.4S, v20.4S, v31.s[0] +sub v20.4s, v16.4s, v26.4s +add v16.4s, v16.4s, v26.4s +sqrdmulh v26.4S, v12.4S, v15.4S +mul v12.4S, v12.4S,v5.4S +mla v12.4S, v26.4S, v31.s[0] +sub v26.4s, v25.4s, v12.4s +add v25.4s, v25.4s, v12.4s +str q16, [x0, #896] +str q20, [x0, #912] +str q25, [x0, #928] +str q26, [x0, #944] +ldr q26, [x17, #+2048] +ldr q25, [x17, #+2064] +ldr q20, [x17, #+2080] +ldr q16, [x17, #+2096] +ldr q12, [x17, #+2112] +ldr q17, [x17, #+2128] +ldr q10, [x17, #+2144] +ldr q13, [x17, #+2160] +ldr q15, [x0, #992] +ldr q5, [x0, #1008] +ldr q1, [x0, #960] +ldr q30, [x0, #976] +sqrdmulh v27.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +mla v15.4S, v27.4S, v31.s[0] +sub v27.4s, 
v1.4s, v15.4s +add v1.4s, v1.4s, v15.4s +sqrdmulh v15.4S, v5.4S, v25.s[0] +mul v5.4S, v5.4S,v26.s[0] +mla v5.4S, v15.4S, v31.s[0] +sub v15.4s, v30.4s, v5.4s +add v30.4s, v30.4s, v5.4s +sqrdmulh v5.4S, v30.4S, v25.s[1] +mul v30.4S, v30.4S,v26.s[1] +mla v30.4S, v5.4S, v31.s[0] +sub v5.4s, v1.4s, v30.4s +add v1.4s, v1.4s, v30.4s +sqrdmulh v30.4S, v15.4S, v25.s[2] +mul v15.4S, v15.4S,v26.s[2] +mla v15.4S, v30.4S, v31.s[0] +sub v30.4s, v27.4s, v15.4s +add v27.4s, v27.4s, v15.4s +trn1 v15.4S, v1.4S, v5.4S +trn2 v29.4S, v1.4S, v5.4S +trn1 v4.4S, v27.4S, v30.4S +trn2 v8.4S, v27.4S, v30.4S +trn2 v27.2D, v15.2D, v4.2D +trn2 v30.2D, v29.2D, v8.2D +trn1 v1.2D, v15.2D, v4.2D +trn1 v5.2D, v29.2D, v8.2D +sqrdmulh v8.4S, v27.4S, v16.4S +mul v27.4S, v27.4S,v20.4S +mla v27.4S, v8.4S, v31.s[0] +sub v8.4s, v1.4s, v27.4s +add v1.4s, v1.4s, v27.4s +sqrdmulh v27.4S, v30.4S, v16.4S +mul v30.4S, v30.4S,v20.4S +mla v30.4S, v27.4S, v31.s[0] +sub v27.4s, v5.4s, v30.4s +add v5.4s, v5.4s, v30.4s +sqrdmulh v30.4S, v5.4S, v17.4S +mul v5.4S, v5.4S,v12.4S +mla v5.4S, v30.4S, v31.s[0] +sub v30.4s, v1.4s, v5.4s +add v1.4s, v1.4s, v5.4s +sqrdmulh v5.4S, v27.4S, v13.4S +mul v27.4S, v27.4S,v10.4S +mla v27.4S, v5.4S, v31.s[0] +sub v5.4s, v8.4s, v27.4s +add v8.4s, v8.4s, v27.4s +str q1, [x0, #960] +str q30, [x0, #976] +str q8, [x0, #992] +str q5, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_12_0.s b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_12_0.s new file mode 100644 index 0000000..01a8251 --- /dev/null +++ 
b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_12_0.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 
1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 
6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 
+.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, 
block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 
31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 
26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 
+.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // 
Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 
28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 
608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_12_0 +.global _ntt_u32_full_neon_asm_var_4_4_12_0 +ntt_u32_full_neon_asm_var_4_4_12_0: +_ntt_u32_full_neon_asm_var_4_4_12_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x0, #928] +ldr q29, [x17, #+0] +ldr q28, [x17, #+16] +sqrdmulh v27.4S, v30.4S, v28.s[0] +mul v30.4S, v30.4S,v29.s[0] +ldr q26, [x0, #992] +sqrdmulh v25.4S, v26.4S, v28.s[0] +mul v26.4S, v26.4S,v29.s[0] +ldr q24, [x0, #800] +sqrdmulh v23.4S, v24.4S, v28.s[0] +mul v24.4S, v24.4S,v29.s[0] +ldr q22, [x0, #864] +sqrdmulh v21.4S, v22.4S, v28.s[0] +mul v22.4S, v22.4S,v29.s[0] +ldr q20, [x0, #544] +mla v30.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v20.4S, v28.s[0] +ldr q19, [x0, #608] +mla v26.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v19.4S, v28.s[0] +ldr q18, [x0, #672] +mla v24.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v18.4S, v28.s[0] +ldr q17, [x0, #736] +mla v22.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v17.4S, v28.s[0] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +mul v20.4S, v20.4S,v29.s[0] +sub v2.4s, v16.4s, v30.4s +mul v19.4S, v19.4S,v29.s[0] +add v16.4s, v16.4s, v30.4s +ldr q30, [x0, #288] +ldr q1, [x0, #352] +mla v20.4S, v27.4S, v31.s[0] +sub 
v27.4s, v3.4s, v26.4s +mla v19.4S, v25.4S, v31.s[0] +add v3.4s, v3.4s, v26.4s +ldr q26, [x0, #32] +ldr q25, [x0, #96] +mul v18.4S, v18.4S,v29.s[0] +sub v0.4s, v30.4s, v24.4s +mul v17.4S, v17.4S,v29.s[0] +add v30.4s, v30.4s, v24.4s +ldr q24, [x0, #160] +ldr q15, [x0, #224] +mla v18.4S, v23.4S, v31.s[0] +sub v23.4s, v1.4s, v22.4s +mla v17.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v16.4S, v28.s[1] +mul v16.4S, v16.4S,v29.s[1] +sqrdmulh v21.4S, v3.4S, v28.s[1] +sub v14.4s, v26.4s, v20.4s +mul v3.4S, v3.4S,v29.s[1] +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v30.4S, v28.s[1] +sub v13.4s, v25.4s, v19.4s +mul v30.4S, v30.4S,v29.s[1] +add v25.4s, v25.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v28.s[1] +sub v12.4s, v24.4s, v18.4s +mul v1.4S, v1.4S,v29.s[1] +add v24.4s, v24.4s, v18.4s +mla v16.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v17.4s +sqrdmulh v18.4S, v2.4S, v28.s[2] +add v15.4s, v15.4s, v17.4s +mla v3.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v27.4S, v28.s[2] +mla v30.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v0.4S, v28.s[2] +mla v1.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v23.4S, v28.s[2] +ldr q17, [x17, #+32] +ldr q11, [x17, #+48] +mul v2.4S, v2.4S,v29.s[2] +sub v10.4s, v24.4s, v16.4s +mul v27.4S, v27.4S,v29.s[2] +add v24.4s, v24.4s, v16.4s +mla v2.4S, v18.4S, v31.s[0] +sub v18.4s, v15.4s, v3.4s +mla v27.4S, v21.4S, v31.s[0] +add v15.4s, v15.4s, v3.4s +mul v0.4S, v0.4S,v29.s[2] +sub v3.4s, v26.4s, v30.4s +mul v23.4S, v23.4S,v29.s[2] +add v26.4s, v26.4s, v30.4s +mla v0.4S, v20.4S, v31.s[0] +sub v20.4s, v25.4s, v1.4s +mla v23.4S, v19.4S, v31.s[0] +add v25.4s, v25.4s, v1.4s +sqrdmulh v1.4S, v10.4S, v11.s[1] +mul v10.4S, v10.4S,v17.s[1] +sqrdmulh v19.4S, v18.4S, v11.s[1] +sub v30.4s, v12.4s, v2.4s +mul v18.4S, v18.4S,v17.s[1] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v24.4S, v11.s[0] +sub v21.4s, v22.4s, v27.4s +mul v24.4S, v24.4S,v17.s[0] +add v22.4s, v22.4s, v27.4s +sqrdmulh v27.4S, v15.4S, v11.s[0] +sub v16.4s, v14.4s, v0.4s +mul v15.4S, v15.4S,v17.s[0] 
+add v14.4s, v14.4s, v0.4s +ldr q0, [x17, #+64] +ldr q9, [x17, #+80] +mla v10.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v23.4s +sqrdmulh v8.4S, v12.4S, v11.s[2] +add v13.4s, v13.4s, v23.4s +mla v18.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v22.4S, v11.s[2] +mla v24.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v30.4S, v11.s[3] +mla v15.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v21.4S, v11.s[3] +ldr q23, [x17, #+96] +ldr q7, [x17, #+112] +mul v12.4S, v12.4S,v17.s[2] +sub v6.4s, v3.4s, v10.4s +mul v22.4S, v22.4S,v17.s[2] +add v3.4s, v3.4s, v10.4s +mla v12.4S, v8.4S, v31.s[0] +sub v8.4s, v20.4s, v18.4s +mla v22.4S, v19.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +mul v30.4S, v30.4S,v17.s[3] +sub v18.4s, v26.4s, v24.4s +mul v21.4S, v21.4S,v17.s[3] +add v26.4s, v26.4s, v24.4s +mla v30.4S, v2.4S, v31.s[0] +sub v2.4s, v25.4s, v15.4s +mla v21.4S, v27.4S, v31.s[0] +add v25.4s, v25.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v9.s[2] +mul v20.4S, v20.4S,v0.s[2] +sqrdmulh v27.4S, v8.4S, v9.s[3] +sub v24.4s, v14.4s, v12.4s +mul v8.4S, v8.4S,v0.s[3] +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v9.s[1] +sub v19.4s, v13.4s, v22.4s +mul v2.4S, v2.4S,v0.s[1] +add v13.4s, v13.4s, v22.4s +sqrdmulh v22.4S, v25.4S, v9.s[0] +sub v10.4s, v16.4s, v30.4s +mul v25.4S, v25.4S,v0.s[0] +add v16.4s, v16.4s, v30.4s +mla v20.4S, v15.4S, v31.s[0] +sub v15.4s, v1.4s, v21.4s +sqrdmulh v30.4S, v13.4S, v7.s[0] +add v1.4s, v1.4s, v21.4s +mla v8.4S, v27.4S, v31.s[0] +sub v27.4s, v3.4s, v20.4s +sqrdmulh v21.4S, v19.4S, v7.s[1] +add v3.4s, v3.4s, v20.4s +mla v2.4S, v12.4S, v31.s[0] +sub v12.4s, v6.4s, v8.4s +sqrdmulh v20.4S, v1.4S, v7.s[2] +add v6.4s, v6.4s, v8.4s +mla v25.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v2.4s +sqrdmulh v8.4S, v15.4S, v7.s[3] +add v18.4s, v18.4s, v2.4s +mul v13.4S, v13.4S,v23.s[0] +sub v2.4s, v26.4s, v25.4s +mul v19.4S, v19.4S,v23.s[1] +add v26.4s, v26.4s, v25.4s +mla v13.4S, v30.4S, v31.s[0] +str q27, [x0, #352] +mla v19.4S, v21.4S, v31.s[0] +str q3, [x0, #288] +mul v1.4S, v1.4S,v23.s[2] +str q12, 
[x0, #480] +mul v15.4S, v15.4S,v23.s[3] +str q6, [x0, #416] +mla v1.4S, v20.4S, v31.s[0] +str q22, [x0, #224] +mla v15.4S, v8.4S, v31.s[0] +str q18, [x0, #160] +ldr q18, [x0, #944] +sqrdmulh v8.4S, v18.4S, v28.s[0] +str q2, [x0, #96] +mul v18.4S, v18.4S,v29.s[0] +str q26, [x0, #32] +ldr q26, [x0, #1008] +sqrdmulh v2.4S, v26.4S, v28.s[0] +sub v22.4s, v14.4s, v13.4s +str q22, [x0, #608] +mul v26.4S, v26.4S,v29.s[0] +add v14.4s, v14.4s, v13.4s +ldr q13, [x0, #816] +sqrdmulh v22.4S, v13.4S, v28.s[0] +sub v20.4s, v24.4s, v19.4s +str q14, [x0, #544] +mul v13.4S, v13.4S,v29.s[0] +add v24.4s, v24.4s, v19.4s +ldr q19, [x0, #880] +sqrdmulh v14.4S, v19.4S, v28.s[0] +sub v6.4s, v16.4s, v1.4s +str q20, [x0, #736] +mul v19.4S, v19.4S,v29.s[0] +add v16.4s, v16.4s, v1.4s +ldr q1, [x0, #560] +mla v18.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v15.4s +str q24, [x0, #672] +sqrdmulh v24.4S, v1.4S, v28.s[0] +add v10.4s, v10.4s, v15.4s +ldr q15, [x0, #624] +mla v26.4S, v2.4S, v31.s[0] +str q6, [x0, #864] +sqrdmulh v6.4S, v15.4S, v28.s[0] +ldr q2, [x0, #688] +mla v13.4S, v22.4S, v31.s[0] +str q16, [x0, #800] +sqrdmulh v16.4S, v2.4S, v28.s[0] +ldr q22, [x0, #752] +mla v19.4S, v14.4S, v31.s[0] +str q8, [x0, #992] +sqrdmulh v8.4S, v22.4S, v28.s[0] +ldr q14, [x0, #432] +ldr q20, [x0, #496] +mul v1.4S, v1.4S,v29.s[0] +sub v12.4s, v14.4s, v18.4s +str q10, [x0, #928] +mul v15.4S, v15.4S,v29.s[0] +add v14.4s, v14.4s, v18.4s +ldr q18, [x0, #304] +ldr q10, [x0, #368] +mla v1.4S, v24.4S, v31.s[0] +sub v24.4s, v20.4s, v26.4s +mla v15.4S, v6.4S, v31.s[0] +add v20.4s, v20.4s, v26.4s +ldr q26, [x0, #48] +ldr q6, [x0, #112] +mul v2.4S, v2.4S,v29.s[0] +sub v3.4s, v18.4s, v13.4s +mul v22.4S, v22.4S,v29.s[0] +add v18.4s, v18.4s, v13.4s +ldr q13, [x0, #176] +ldr q21, [x0, #240] +mla v2.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v19.4s +mla v22.4S, v8.4S, v31.s[0] +add v10.4s, v10.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v28.s[1] +mul v14.4S, v14.4S,v29.s[1] +sqrdmulh v8.4S, v20.4S, v28.s[1] +sub v27.4s, v26.4s, 
v1.4s +mul v20.4S, v20.4S,v29.s[1] +add v26.4s, v26.4s, v1.4s +sqrdmulh v1.4S, v18.4S, v28.s[1] +sub v30.4s, v6.4s, v15.4s +mul v18.4S, v18.4S,v29.s[1] +add v6.4s, v6.4s, v15.4s +sqrdmulh v15.4S, v10.4S, v28.s[1] +sub v25.4s, v13.4s, v2.4s +mul v10.4S, v10.4S,v29.s[1] +add v13.4s, v13.4s, v2.4s +mla v14.4S, v19.4S, v31.s[0] +sub v19.4s, v21.4s, v22.4s +sqrdmulh v2.4S, v12.4S, v28.s[2] +add v21.4s, v21.4s, v22.4s +mla v20.4S, v8.4S, v31.s[0] +sqrdmulh v8.4S, v24.4S, v28.s[2] +mla v18.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v3.4S, v28.s[2] +mla v10.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v16.4S, v28.s[2] +mul v12.4S, v12.4S,v29.s[2] +sub v22.4s, v13.4s, v14.4s +mul v24.4S, v24.4S,v29.s[2] +add v13.4s, v13.4s, v14.4s +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v21.4s, v20.4s +mla v24.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v20.4s +mul v3.4S, v3.4S,v29.s[2] +sub v20.4s, v26.4s, v18.4s +mul v16.4S, v16.4S,v29.s[2] +add v26.4s, v26.4s, v18.4s +mla v3.4S, v1.4S, v31.s[0] +sub v1.4s, v6.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add v6.4s, v6.4s, v10.4s +sqrdmulh v10.4S, v22.4S, v11.s[1] +mul v22.4S, v22.4S,v17.s[1] +sqrdmulh v15.4S, v2.4S, v11.s[1] +sub v18.4s, v25.4s, v12.4s +mul v2.4S, v2.4S,v17.s[1] +add v25.4s, v25.4s, v12.4s +sqrdmulh v12.4S, v13.4S, v11.s[0] +sub v8.4s, v19.4s, v24.4s +mul v13.4S, v13.4S,v17.s[0] +add v19.4s, v19.4s, v24.4s +sqrdmulh v24.4S, v21.4S, v11.s[0] +sub v14.4s, v27.4s, v3.4s +mul v21.4S, v21.4S,v17.s[0] +add v27.4s, v27.4s, v3.4s +mla v22.4S, v10.4S, v31.s[0] +sub v10.4s, v30.4s, v16.4s +sqrdmulh v3.4S, v25.4S, v11.s[2] +add v30.4s, v30.4s, v16.4s +mla v2.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v19.4S, v11.s[2] +mla v13.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v18.4S, v11.s[3] +mla v21.4S, v24.4S, v31.s[0] +sqrdmulh v24.4S, v8.4S, v11.s[3] +mul v25.4S, v25.4S,v17.s[2] +sub v16.4s, v20.4s, v22.4s +mul v19.4S, v19.4S,v17.s[2] +add v20.4s, v20.4s, v22.4s +mla v25.4S, v3.4S, v31.s[0] +sub v3.4s, v1.4s, v2.4s +mla v19.4S, v15.4S, v31.s[0] +add v1.4s, 
v1.4s, v2.4s +mul v18.4S, v18.4S,v17.s[3] +sub v2.4s, v26.4s, v13.4s +mul v8.4S, v8.4S,v17.s[3] +add v26.4s, v26.4s, v13.4s +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v6.4s, v21.4s +mla v8.4S, v24.4S, v31.s[0] +add v6.4s, v6.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v9.s[2] +mul v1.4S, v1.4S,v0.s[2] +sqrdmulh v24.4S, v3.4S, v9.s[3] +sub v13.4s, v27.4s, v25.4s +mul v3.4S, v3.4S,v0.s[3] +add v27.4s, v27.4s, v25.4s +sqrdmulh v25.4S, v12.4S, v9.s[1] +sub v15.4s, v30.4s, v19.4s +mul v12.4S, v12.4S,v0.s[1] +add v30.4s, v30.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v9.s[0] +sub v22.4s, v14.4s, v18.4s +mul v6.4S, v6.4S,v0.s[0] +add v14.4s, v14.4s, v18.4s +mla v1.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v8.4s +sqrdmulh v18.4S, v30.4S, v7.s[0] +add v10.4s, v10.4s, v8.4s +mla v3.4S, v24.4S, v31.s[0] +sub v24.4s, v20.4s, v1.4s +sqrdmulh v8.4S, v15.4S, v7.s[1] +add v20.4s, v20.4s, v1.4s +mla v12.4S, v25.4S, v31.s[0] +sub v25.4s, v16.4s, v3.4s +sqrdmulh v1.4S, v10.4S, v7.s[2] +add v16.4s, v16.4s, v3.4s +mla v6.4S, v19.4S, v31.s[0] +sub v19.4s, v2.4s, v12.4s +sqrdmulh v3.4S, v21.4S, v7.s[3] +add v2.4s, v2.4s, v12.4s +mul v30.4S, v30.4S,v23.s[0] +sub v12.4s, v26.4s, v6.4s +mul v15.4S, v15.4S,v23.s[1] +add v26.4s, v26.4s, v6.4s +mla v30.4S, v18.4S, v31.s[0] +str q24, [x0, #368] +mla v15.4S, v8.4S, v31.s[0] +str q20, [x0, #304] +mul v10.4S, v10.4S,v23.s[2] +str q25, [x0, #496] +mul v21.4S, v21.4S,v23.s[3] +str q16, [x0, #432] +mla v10.4S, v1.4S, v31.s[0] +str q19, [x0, #240] +mla v21.4S, v3.4S, v31.s[0] +str q2, [x0, #176] +ldr q2, [x0, #896] +sqrdmulh v3.4S, v2.4S, v28.s[0] +str q12, [x0, #112] +mul v2.4S, v2.4S,v29.s[0] +str q26, [x0, #48] +ldr q26, [x0, #960] +sqrdmulh v12.4S, v26.4S, v28.s[0] +sub v19.4s, v27.4s, v30.4s +str q19, [x0, #624] +mul v26.4S, v26.4S,v29.s[0] +add v27.4s, v27.4s, v30.4s +ldr q30, [x0, #768] +sqrdmulh v19.4S, v30.4S, v28.s[0] +sub v1.4s, v13.4s, v15.4s +str q27, [x0, #560] +mul v30.4S, v30.4S,v29.s[0] +add v13.4s, v13.4s, v15.4s +ldr q15, [x0, #832] +sqrdmulh 
v27.4S, v15.4S, v28.s[0] +sub v16.4s, v14.4s, v10.4s +str q1, [x0, #752] +mul v15.4S, v15.4S,v29.s[0] +add v14.4s, v14.4s, v10.4s +ldr q10, [x0, #512] +mla v2.4S, v3.4S, v31.s[0] +sub v3.4s, v22.4s, v21.4s +str q13, [x0, #688] +sqrdmulh v13.4S, v10.4S, v28.s[0] +add v22.4s, v22.4s, v21.4s +ldr q21, [x0, #576] +mla v26.4S, v12.4S, v31.s[0] +str q16, [x0, #880] +sqrdmulh v16.4S, v21.4S, v28.s[0] +ldr q12, [x0, #640] +mla v30.4S, v19.4S, v31.s[0] +str q14, [x0, #816] +sqrdmulh v14.4S, v12.4S, v28.s[0] +ldr q19, [x0, #704] +mla v15.4S, v27.4S, v31.s[0] +str q3, [x0, #1008] +sqrdmulh v3.4S, v19.4S, v28.s[0] +ldr q27, [x0, #384] +ldr q1, [x0, #448] +mul v10.4S, v10.4S,v29.s[0] +sub v25.4s, v27.4s, v2.4s +str q22, [x0, #944] +mul v21.4S, v21.4S,v29.s[0] +add v27.4s, v27.4s, v2.4s +ldr q2, [x0, #256] +ldr q22, [x0, #320] +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v1.4s, v26.4s +mla v21.4S, v16.4S, v31.s[0] +add v1.4s, v1.4s, v26.4s +ldr q26, [x0, #0] +ldr q16, [x0, #64] +mul v12.4S, v12.4S,v29.s[0] +sub v20.4s, v2.4s, v30.4s +mul v19.4S, v19.4S,v29.s[0] +add v2.4s, v2.4s, v30.4s +ldr q30, [x0, #128] +ldr q8, [x0, #192] +mla v12.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v15.4s +mla v19.4S, v3.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v27.4S, v28.s[1] +mul v27.4S, v27.4S,v29.s[1] +sqrdmulh v3.4S, v1.4S, v28.s[1] +sub v24.4s, v26.4s, v10.4s +mul v1.4S, v1.4S,v29.s[1] +add v26.4s, v26.4s, v10.4s +sqrdmulh v10.4S, v2.4S, v28.s[1] +sub v18.4s, v16.4s, v21.4s +mul v2.4S, v2.4S,v29.s[1] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v28.s[1] +sub v6.4s, v30.4s, v12.4s +mul v22.4S, v22.4S,v29.s[1] +add v30.4s, v30.4s, v12.4s +mla v27.4S, v15.4S, v31.s[0] +sub v15.4s, v8.4s, v19.4s +sqrdmulh v12.4S, v25.4S, v28.s[2] +add v8.4s, v8.4s, v19.4s +mla v1.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v13.4S, v28.s[2] +mla v2.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v20.4S, v28.s[2] +mla v22.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v14.4S, v28.s[2] +mul v25.4S, v25.4S,v29.s[2] 
+sub v19.4s, v30.4s, v27.4s +mul v13.4S, v13.4S,v29.s[2] +add v30.4s, v30.4s, v27.4s +mla v25.4S, v12.4S, v31.s[0] +sub v12.4s, v8.4s, v1.4s +mla v13.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v1.4s +mul v20.4S, v20.4S,v29.s[2] +sub v1.4s, v26.4s, v2.4s +mul v14.4S, v14.4S,v29.s[2] +add v26.4s, v26.4s, v2.4s +mla v20.4S, v10.4S, v31.s[0] +sub v10.4s, v16.4s, v22.4s +mla v14.4S, v21.4S, v31.s[0] +add v16.4s, v16.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v11.s[1] +mul v19.4S, v19.4S,v17.s[1] +sqrdmulh v21.4S, v12.4S, v11.s[1] +sub v2.4s, v6.4s, v25.4s +mul v12.4S, v12.4S,v17.s[1] +add v6.4s, v6.4s, v25.4s +sqrdmulh v25.4S, v30.4S, v11.s[0] +sub v3.4s, v15.4s, v13.4s +mul v30.4S, v30.4S,v17.s[0] +add v15.4s, v15.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v11.s[0] +sub v27.4s, v24.4s, v20.4s +mul v8.4S, v8.4S,v17.s[0] +add v24.4s, v24.4s, v20.4s +mla v19.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v14.4s +sqrdmulh v20.4S, v6.4S, v11.s[2] +add v18.4s, v18.4s, v14.4s +mla v12.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v15.4S, v11.s[2] +mla v30.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v2.4S, v11.s[3] +mla v8.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v3.4S, v11.s[3] +mul v6.4S, v6.4S,v17.s[2] +sub v14.4s, v1.4s, v19.4s +mul v15.4S, v15.4S,v17.s[2] +add v1.4s, v1.4s, v19.4s +mla v6.4S, v20.4S, v31.s[0] +sub v20.4s, v10.4s, v12.4s +mla v15.4S, v21.4S, v31.s[0] +add v10.4s, v10.4s, v12.4s +mul v2.4S, v2.4S,v17.s[3] +sub v12.4s, v26.4s, v30.4s +mul v3.4S, v3.4S,v17.s[3] +add v26.4s, v26.4s, v30.4s +mla v2.4S, v25.4S, v31.s[0] +sub v25.4s, v16.4s, v8.4s +mla v3.4S, v13.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v10.4S, v9.s[2] +mul v10.4S, v10.4S,v0.s[2] +sqrdmulh v13.4S, v20.4S, v9.s[3] +sub v30.4s, v24.4s, v6.4s +mul v20.4S, v20.4S,v0.s[3] +add v24.4s, v24.4s, v6.4s +sqrdmulh v6.4S, v25.4S, v9.s[1] +sub v21.4s, v18.4s, v15.4s +mul v25.4S, v25.4S,v0.s[1] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v9.s[0] +sub v19.4s, v27.4s, v2.4s +mul v16.4S, v16.4S,v0.s[0] +add v27.4s, v27.4s, 
v2.4s +mla v10.4S, v8.4S, v31.s[0] +sub v8.4s, v22.4s, v3.4s +sqrdmulh v2.4S, v18.4S, v7.s[0] +add v22.4s, v22.4s, v3.4s +mla v20.4S, v13.4S, v31.s[0] +sub v13.4s, v1.4s, v10.4s +sqrdmulh v3.4S, v21.4S, v7.s[1] +add v1.4s, v1.4s, v10.4s +mla v25.4S, v6.4S, v31.s[0] +sub v6.4s, v14.4s, v20.4s +sqrdmulh v10.4S, v22.4S, v7.s[2] +add v14.4s, v14.4s, v20.4s +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v12.4s, v25.4s +sqrdmulh v20.4S, v8.4S, v7.s[3] +add v12.4s, v12.4s, v25.4s +mul v18.4S, v18.4S,v23.s[0] +sub v25.4s, v26.4s, v16.4s +mul v21.4S, v21.4S,v23.s[1] +add v26.4s, v26.4s, v16.4s +mla v18.4S, v2.4S, v31.s[0] +str q13, [x0, #320] +mla v21.4S, v3.4S, v31.s[0] +str q1, [x0, #256] +mul v22.4S, v22.4S,v23.s[2] +str q6, [x0, #448] +mul v8.4S, v8.4S,v23.s[3] +str q14, [x0, #384] +mla v22.4S, v10.4S, v31.s[0] +str q15, [x0, #192] +mla v8.4S, v20.4S, v31.s[0] +str q12, [x0, #128] +ldr q12, [x0, #912] +sqrdmulh v20.4S, v12.4S, v28.s[0] +str q25, [x0, #64] +mul v12.4S, v12.4S,v29.s[0] +str q26, [x0, #0] +ldr q26, [x0, #976] +sqrdmulh v25.4S, v26.4S, v28.s[0] +sub v15.4s, v24.4s, v18.4s +str q15, [x0, #576] +mul v26.4S, v26.4S,v29.s[0] +add v24.4s, v24.4s, v18.4s +ldr q18, [x0, #784] +sqrdmulh v15.4S, v18.4S, v28.s[0] +sub v10.4s, v30.4s, v21.4s +str q24, [x0, #512] +mul v18.4S, v18.4S,v29.s[0] +add v30.4s, v30.4s, v21.4s +ldr q21, [x0, #848] +sqrdmulh v24.4S, v21.4S, v28.s[0] +sub v14.4s, v27.4s, v22.4s +str q10, [x0, #704] +mul v21.4S, v21.4S,v29.s[0] +add v27.4s, v27.4s, v22.4s +ldr q22, [x0, #528] +mla v12.4S, v20.4S, v31.s[0] +sub v20.4s, v19.4s, v8.4s +str q30, [x0, #640] +sqrdmulh v30.4S, v22.4S, v28.s[0] +add v19.4s, v19.4s, v8.4s +ldr q8, [x0, #592] +mla v26.4S, v25.4S, v31.s[0] +str q14, [x0, #832] +sqrdmulh v14.4S, v8.4S, v28.s[0] +ldr q25, [x0, #656] +mla v18.4S, v15.4S, v31.s[0] +str q27, [x0, #768] +sqrdmulh v27.4S, v25.4S, v28.s[0] +ldr q15, [x0, #720] +mla v21.4S, v24.4S, v31.s[0] +str q20, [x0, #960] +sqrdmulh v20.4S, v15.4S, v28.s[0] +ldr q24, [x0, #400] 
+ldr q10, [x0, #464] +mul v22.4S, v22.4S,v29.s[0] +sub v6.4s, v24.4s, v12.4s +str q19, [x0, #896] +mul v8.4S, v8.4S,v29.s[0] +add v24.4s, v24.4s, v12.4s +ldr q12, [x0, #272] +ldr q19, [x0, #336] +mla v22.4S, v30.4S, v31.s[0] +sub v30.4s, v10.4s, v26.4s +mla v8.4S, v14.4S, v31.s[0] +add v10.4s, v10.4s, v26.4s +ldr q26, [x0, #16] +ldr q14, [x0, #80] +mul v25.4S, v25.4S,v29.s[0] +sub v1.4s, v12.4s, v18.4s +mul v15.4S, v15.4S,v29.s[0] +add v12.4s, v12.4s, v18.4s +ldr q18, [x0, #144] +ldr q3, [x0, #208] +mla v25.4S, v27.4S, v31.s[0] +sub v27.4s, v19.4s, v21.4s +mla v15.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +sqrdmulh v21.4S, v24.4S, v28.s[1] +mul v24.4S, v24.4S,v29.s[1] +sqrdmulh v20.4S, v10.4S, v28.s[1] +sub v13.4s, v26.4s, v22.4s +mul v10.4S, v10.4S,v29.s[1] +add v26.4s, v26.4s, v22.4s +sqrdmulh v22.4S, v12.4S, v28.s[1] +sub v2.4s, v14.4s, v8.4s +mul v12.4S, v12.4S,v29.s[1] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v28.s[1] +sub v16.4s, v18.4s, v25.4s +mul v19.4S, v19.4S,v29.s[1] +add v18.4s, v18.4s, v25.4s +mla v24.4S, v21.4S, v31.s[0] +sub v21.4s, v3.4s, v15.4s +sqrdmulh v25.4S, v6.4S, v28.s[2] +add v3.4s, v3.4s, v15.4s +mla v10.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v30.4S, v28.s[2] +mla v12.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v1.4S, v28.s[2] +mla v19.4S, v8.4S, v31.s[0] +sqrdmulh v8.4S, v27.4S, v28.s[2] +mul v6.4S, v6.4S,v29.s[2] +sub v15.4s, v18.4s, v24.4s +mul v30.4S, v30.4S,v29.s[2] +add v18.4s, v18.4s, v24.4s +mla v6.4S, v25.4S, v31.s[0] +sub v25.4s, v3.4s, v10.4s +mla v30.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v10.4s +mul v1.4S, v1.4S,v29.s[2] +sub v10.4s, v26.4s, v12.4s +mul v27.4S, v27.4S,v29.s[2] +add v26.4s, v26.4s, v12.4s +mla v1.4S, v22.4S, v31.s[0] +sub v22.4s, v14.4s, v19.4s +mla v27.4S, v8.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v28.4S, v15.4S, v11.s[1] +mul v15.4S, v15.4S,v17.s[1] +sqrdmulh v29.4S, v25.4S, v11.s[1] +sub v19.4s, v16.4s, v6.4s +mul v25.4S, v25.4S,v17.s[1] +add v16.4s, v16.4s, v6.4s +sqrdmulh v6.4S, 
v18.4S, v11.s[0] +sub v8.4s, v21.4s, v30.4s +mul v18.4S, v18.4S,v17.s[0] +add v21.4s, v21.4s, v30.4s +sqrdmulh v30.4S, v3.4S, v11.s[0] +sub v12.4s, v13.4s, v1.4s +mul v3.4S, v3.4S,v17.s[0] +add v13.4s, v13.4s, v1.4s +mla v15.4S, v28.4S, v31.s[0] +sub v28.4s, v2.4s, v27.4s +sqrdmulh v1.4S, v16.4S, v11.s[2] +add v2.4s, v2.4s, v27.4s +mla v25.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v21.4S, v11.s[2] +mla v18.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v19.4S, v11.s[3] +mla v3.4S, v30.4S, v31.s[0] +sqrdmulh v30.4S, v8.4S, v11.s[3] +mul v16.4S, v16.4S,v17.s[2] +sub v27.4s, v10.4s, v15.4s +mul v21.4S, v21.4S,v17.s[2] +add v10.4s, v10.4s, v15.4s +mla v16.4S, v1.4S, v31.s[0] +sub v1.4s, v22.4s, v25.4s +mla v21.4S, v29.4S, v31.s[0] +add v22.4s, v22.4s, v25.4s +mul v19.4S, v19.4S,v17.s[3] +sub v25.4s, v26.4s, v18.4s +mul v8.4S, v8.4S,v17.s[3] +add v26.4s, v26.4s, v18.4s +mla v19.4S, v6.4S, v31.s[0] +sub v6.4s, v14.4s, v3.4s +mla v8.4S, v30.4S, v31.s[0] +add v14.4s, v14.4s, v3.4s +sqrdmulh v11.4S, v22.4S, v9.s[2] +mul v22.4S, v22.4S,v0.s[2] +sqrdmulh v17.4S, v1.4S, v9.s[3] +sub v3.4s, v13.4s, v16.4s +mul v1.4S, v1.4S,v0.s[3] +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v6.4S, v9.s[1] +sub v30.4s, v2.4s, v21.4s +mul v6.4S, v6.4S,v0.s[1] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v9.s[0] +sub v18.4s, v12.4s, v19.4s +mul v14.4S, v14.4S,v0.s[0] +add v12.4s, v12.4s, v19.4s +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v28.4s, v8.4s +sqrdmulh v9.4S, v2.4S, v7.s[0] +add v28.4s, v28.4s, v8.4s +mla v1.4S, v17.4S, v31.s[0] +sub v17.4s, v10.4s, v22.4s +sqrdmulh v8.4S, v30.4S, v7.s[1] +add v10.4s, v10.4s, v22.4s +mla v6.4S, v16.4S, v31.s[0] +sub v16.4s, v27.4s, v1.4s +sqrdmulh v22.4S, v28.4S, v7.s[2] +add v27.4s, v27.4s, v1.4s +mla v14.4S, v21.4S, v31.s[0] +sub v21.4s, v25.4s, v6.4s +sqrdmulh v1.4S, v11.4S, v7.s[3] +add v25.4s, v25.4s, v6.4s +mul v2.4S, v2.4S,v23.s[0] +sub v6.4s, v26.4s, v14.4s +mul v30.4S, v30.4S,v23.s[1] +add v26.4s, v26.4s, v14.4s +mla v2.4S, v9.4S, v31.s[0] +str q17, 
[x0, #336] +mla v30.4S, v8.4S, v31.s[0] +str q10, [x0, #272] +mul v28.4S, v28.4S,v23.s[2] +str q16, [x0, #464] +mul v11.4S, v11.4S,v23.s[3] +str q27, [x0, #400] +mla v28.4S, v22.4S, v31.s[0] +str q21, [x0, #208] +mla v11.4S, v1.4S, v31.s[0] +str q25, [x0, #144] +str q6, [x0, #80] +str q26, [x0, #16] +sub v26.4s, v13.4s, v2.4s +str q26, [x0, #592] +add v13.4s, v13.4s, v2.4s +sub v2.4s, v3.4s, v30.4s +str q13, [x0, #528] +add v3.4s, v3.4s, v30.4s +sub v30.4s, v12.4s, v28.4s +str q2, [x0, #720] +add v12.4s, v12.4s, v28.4s +sub v28.4s, v18.4s, v11.4s +str q3, [x0, #656] +add v18.4s, v18.4s, v11.4s +str q30, [x0, #848] +str q12, [x0, #784] +str q28, [x0, #976] +str q18, [x0, #912] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q24, [x17, #+160] +ldr q20, [x17, #+176] +ldr q15, [x17, #+192] +ldr q29, [x17, #+208] +ldr q19, [x17, #+224] +ldr q0, [x17, #+240] +ldr q14, [x0, #32] +ldr q9, [x0, #48] +ldr q17, [x0, #0] +ldr q8, [x0, #16] +sqrdmulh v10.4S, v14.4S, v5.s[0] +mul v14.4S, v14.4S,v4.s[0] +mla v14.4S, v10.4S, v31.s[0] +sub v10.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +sqrdmulh v14.4S, v9.4S, v5.s[0] +mul v9.4S, v9.4S,v4.s[0] +mla v9.4S, v14.4S, v31.s[0] +sub v14.4s, v8.4s, v9.4s +add v8.4s, v8.4s, v9.4s +sqrdmulh v9.4S, v8.4S, v5.s[1] +mul v8.4S, v8.4S,v4.s[1] +mla v8.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v8.4s +add v17.4s, v17.4s, v8.4s +sqrdmulh v8.4S, v14.4S, v5.s[2] +mul v14.4S, v14.4S,v4.s[2] +mla v14.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v14.4s +add v10.4s, v10.4s, v14.4s +trn1 v14.4S, v17.4S, v9.4S +trn2 v16.4S, v17.4S, v9.4S +trn1 v27.4S, v10.4S, v8.4S +trn2 v22.4S, v10.4S, v8.4S +trn2 v10.2D, v14.2D, v27.2D +trn2 v8.2D, v16.2D, v22.2D +trn1 v17.2D, v14.2D, v27.2D +trn1 v9.2D, v16.2D, v22.2D +sqrdmulh v22.4S, v10.4S, v20.4S +mul v10.4S, v10.4S,v24.4S +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v17.4s, v10.4s +add v17.4s, v17.4s, v10.4s +sqrdmulh v10.4S, v8.4S, v20.4S +mul v8.4S, v8.4S,v24.4S +mla v8.4S, v10.4S, v31.s[0] +sub v10.4s, v9.4s, v8.4s 
+add v9.4s, v9.4s, v8.4s +sqrdmulh v8.4S, v9.4S, v29.4S +mul v9.4S, v9.4S,v15.4S +mla v9.4S, v8.4S, v31.s[0] +sub v8.4s, v17.4s, v9.4s +add v17.4s, v17.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v0.4S +mul v10.4S, v10.4S,v19.4S +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v22.4s, v10.4s +add v22.4s, v22.4s, v10.4s +str q17, [x0, #0] +str q8, [x0, #16] +str q22, [x0, #32] +str q9, [x0, #48] +ldr q9, [x17, #+256] +ldr q22, [x17, #+272] +ldr q8, [x17, #+288] +ldr q17, [x17, #+304] +ldr q10, [x17, #+320] +ldr q16, [x17, #+336] +ldr q27, [x17, #+352] +ldr q14, [x17, #+368] +ldr q0, [x0, #96] +ldr q19, [x0, #112] +ldr q29, [x0, #64] +ldr q15, [x0, #80] +sqrdmulh v20.4S, v0.4S, v22.s[0] +mul v0.4S, v0.4S,v9.s[0] +mla v0.4S, v20.4S, v31.s[0] +sub v20.4s, v29.4s, v0.4s +add v29.4s, v29.4s, v0.4s +sqrdmulh v0.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v9.s[0] +mla v19.4S, v0.4S, v31.s[0] +sub v0.4s, v15.4s, v19.4s +add v15.4s, v15.4s, v19.4s +sqrdmulh v19.4S, v15.4S, v22.s[1] +mul v15.4S, v15.4S,v9.s[1] +mla v15.4S, v19.4S, v31.s[0] +sub v19.4s, v29.4s, v15.4s +add v29.4s, v29.4s, v15.4s +sqrdmulh v15.4S, v0.4S, v22.s[2] +mul v0.4S, v0.4S,v9.s[2] +mla v0.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v0.4s +add v20.4s, v20.4s, v0.4s +trn1 v0.4S, v29.4S, v19.4S +trn2 v24.4S, v29.4S, v19.4S +trn1 v5.4S, v20.4S, v15.4S +trn2 v4.4S, v20.4S, v15.4S +trn2 v20.2D, v0.2D, v5.2D +trn2 v15.2D, v24.2D, v4.2D +trn1 v29.2D, v0.2D, v5.2D +trn1 v19.2D, v24.2D, v4.2D +sqrdmulh v4.4S, v20.4S, v17.4S +mul v20.4S, v20.4S,v8.4S +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v29.4s, v20.4s +add v29.4s, v29.4s, v20.4s +sqrdmulh v20.4S, v15.4S, v17.4S +mul v15.4S, v15.4S,v8.4S +mla v15.4S, v20.4S, v31.s[0] +sub v20.4s, v19.4s, v15.4s +add v19.4s, v19.4s, v15.4s +sqrdmulh v15.4S, v19.4S, v16.4S +mul v19.4S, v19.4S,v10.4S +mla v19.4S, v15.4S, v31.s[0] +sub v15.4s, v29.4s, v19.4s +add v29.4s, v29.4s, v19.4s +sqrdmulh v19.4S, v20.4S, v14.4S +mul v20.4S, v20.4S,v27.4S +mla v20.4S, v19.4S, v31.s[0] +sub v19.4s, v4.4s, v20.4s 
+add v4.4s, v4.4s, v20.4s +str q29, [x0, #64] +str q15, [x0, #80] +str q4, [x0, #96] +str q19, [x0, #112] +ldr q19, [x17, #+384] +ldr q4, [x17, #+400] +ldr q15, [x17, #+416] +ldr q29, [x17, #+432] +ldr q20, [x17, #+448] +ldr q24, [x17, #+464] +ldr q5, [x17, #+480] +ldr q0, [x17, #+496] +ldr q14, [x0, #160] +ldr q27, [x0, #176] +ldr q16, [x0, #128] +ldr q10, [x0, #144] +sqrdmulh v17.4S, v14.4S, v4.s[0] +mul v14.4S, v14.4S,v19.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v16.4s, v14.4s +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v27.4S, v4.s[0] +mul v27.4S, v27.4S,v19.s[0] +mla v27.4S, v14.4S, v31.s[0] +sub v14.4s, v10.4s, v27.4s +add v10.4s, v10.4s, v27.4s +sqrdmulh v27.4S, v10.4S, v4.s[1] +mul v10.4S, v10.4S,v19.s[1] +mla v10.4S, v27.4S, v31.s[0] +sub v27.4s, v16.4s, v10.4s +add v16.4s, v16.4s, v10.4s +sqrdmulh v10.4S, v14.4S, v4.s[2] +mul v14.4S, v14.4S,v19.s[2] +mla v14.4S, v10.4S, v31.s[0] +sub v10.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +trn1 v14.4S, v16.4S, v27.4S +trn2 v8.4S, v16.4S, v27.4S +trn1 v22.4S, v17.4S, v10.4S +trn2 v9.4S, v17.4S, v10.4S +trn2 v17.2D, v14.2D, v22.2D +trn2 v10.2D, v8.2D, v9.2D +trn1 v16.2D, v14.2D, v22.2D +trn1 v27.2D, v8.2D, v9.2D +sqrdmulh v9.4S, v17.4S, v29.4S +mul v17.4S, v17.4S,v15.4S +mla v17.4S, v9.4S, v31.s[0] +sub v9.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v10.4S, v29.4S +mul v10.4S, v10.4S,v15.4S +mla v10.4S, v17.4S, v31.s[0] +sub v17.4s, v27.4s, v10.4s +add v27.4s, v27.4s, v10.4s +sqrdmulh v10.4S, v27.4S, v24.4S +mul v27.4S, v27.4S,v20.4S +mla v27.4S, v10.4S, v31.s[0] +sub v10.4s, v16.4s, v27.4s +add v16.4s, v16.4s, v27.4s +sqrdmulh v27.4S, v17.4S, v0.4S +mul v17.4S, v17.4S,v5.4S +mla v17.4S, v27.4S, v31.s[0] +sub v27.4s, v9.4s, v17.4s +add v9.4s, v9.4s, v17.4s +str q16, [x0, #128] +str q10, [x0, #144] +str q9, [x0, #160] +str q27, [x0, #176] +ldr q27, [x17, #+512] +ldr q9, [x17, #+528] +ldr q10, [x17, #+544] +ldr q16, [x17, #+560] +ldr q17, [x17, #+576] +ldr q8, [x17, #+592] +ldr q22, 
[x17, #+608] +ldr q14, [x17, #+624] +ldr q0, [x0, #224] +ldr q5, [x0, #240] +ldr q24, [x0, #192] +ldr q20, [x0, #208] +sqrdmulh v29.4S, v0.4S, v9.s[0] +mul v0.4S, v0.4S,v27.s[0] +mla v0.4S, v29.4S, v31.s[0] +sub v29.4s, v24.4s, v0.4s +add v24.4s, v24.4s, v0.4s +sqrdmulh v0.4S, v5.4S, v9.s[0] +mul v5.4S, v5.4S,v27.s[0] +mla v5.4S, v0.4S, v31.s[0] +sub v0.4s, v20.4s, v5.4s +add v20.4s, v20.4s, v5.4s +sqrdmulh v5.4S, v20.4S, v9.s[1] +mul v20.4S, v20.4S,v27.s[1] +mla v20.4S, v5.4S, v31.s[0] +sub v5.4s, v24.4s, v20.4s +add v24.4s, v24.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v9.s[2] +mul v0.4S, v0.4S,v27.s[2] +mla v0.4S, v20.4S, v31.s[0] +sub v20.4s, v29.4s, v0.4s +add v29.4s, v29.4s, v0.4s +trn1 v0.4S, v24.4S, v5.4S +trn2 v15.4S, v24.4S, v5.4S +trn1 v4.4S, v29.4S, v20.4S +trn2 v19.4S, v29.4S, v20.4S +trn2 v29.2D, v0.2D, v4.2D +trn2 v20.2D, v15.2D, v19.2D +trn1 v24.2D, v0.2D, v4.2D +trn1 v5.2D, v15.2D, v19.2D +sqrdmulh v19.4S, v29.4S, v16.4S +mul v29.4S, v29.4S,v10.4S +mla v29.4S, v19.4S, v31.s[0] +sub v19.4s, v24.4s, v29.4s +add v24.4s, v24.4s, v29.4s +sqrdmulh v29.4S, v20.4S, v16.4S +mul v20.4S, v20.4S,v10.4S +mla v20.4S, v29.4S, v31.s[0] +sub v29.4s, v5.4s, v20.4s +add v5.4s, v5.4s, v20.4s +sqrdmulh v20.4S, v5.4S, v8.4S +mul v5.4S, v5.4S,v17.4S +mla v5.4S, v20.4S, v31.s[0] +sub v20.4s, v24.4s, v5.4s +add v24.4s, v24.4s, v5.4s +sqrdmulh v5.4S, v29.4S, v14.4S +mul v29.4S, v29.4S,v22.4S +mla v29.4S, v5.4S, v31.s[0] +sub v5.4s, v19.4s, v29.4s +add v19.4s, v19.4s, v29.4s +str q24, [x0, #192] +str q20, [x0, #208] +str q19, [x0, #224] +str q5, [x0, #240] +ldr q5, [x17, #+640] +ldr q19, [x17, #+656] +ldr q20, [x17, #+672] +ldr q24, [x17, #+688] +ldr q29, [x17, #+704] +ldr q15, [x17, #+720] +ldr q4, [x17, #+736] +ldr q0, [x17, #+752] +ldr q14, [x0, #288] +ldr q22, [x0, #304] +ldr q8, [x0, #256] +ldr q17, [x0, #272] +sqrdmulh v16.4S, v14.4S, v19.s[0] +mul v14.4S, v14.4S,v5.s[0] +mla v14.4S, v16.4S, v31.s[0] +sub v16.4s, v8.4s, v14.4s +add v8.4s, v8.4s, v14.4s +sqrdmulh v14.4S, 
v22.4S, v19.s[0] +mul v22.4S, v22.4S,v5.s[0] +mla v22.4S, v14.4S, v31.s[0] +sub v14.4s, v17.4s, v22.4s +add v17.4s, v17.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v19.s[1] +mul v17.4S, v17.4S,v5.s[1] +mla v17.4S, v22.4S, v31.s[0] +sub v22.4s, v8.4s, v17.4s +add v8.4s, v8.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v19.s[2] +mul v14.4S, v14.4S,v5.s[2] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v16.4s, v14.4s +add v16.4s, v16.4s, v14.4s +trn1 v14.4S, v8.4S, v22.4S +trn2 v10.4S, v8.4S, v22.4S +trn1 v9.4S, v16.4S, v17.4S +trn2 v27.4S, v16.4S, v17.4S +trn2 v16.2D, v14.2D, v9.2D +trn2 v17.2D, v10.2D, v27.2D +trn1 v8.2D, v14.2D, v9.2D +trn1 v22.2D, v10.2D, v27.2D +sqrdmulh v27.4S, v16.4S, v24.4S +mul v16.4S, v16.4S,v20.4S +mla v16.4S, v27.4S, v31.s[0] +sub v27.4s, v8.4s, v16.4s +add v8.4s, v8.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v24.4S +mul v17.4S, v17.4S,v20.4S +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v22.4s, v17.4s +add v22.4s, v22.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v15.4S +mul v22.4S, v22.4S,v29.4S +mla v22.4S, v17.4S, v31.s[0] +sub v17.4s, v8.4s, v22.4s +add v8.4s, v8.4s, v22.4s +sqrdmulh v22.4S, v16.4S, v0.4S +mul v16.4S, v16.4S,v4.4S +mla v16.4S, v22.4S, v31.s[0] +sub v22.4s, v27.4s, v16.4s +add v27.4s, v27.4s, v16.4s +str q8, [x0, #256] +str q17, [x0, #272] +str q27, [x0, #288] +str q22, [x0, #304] +ldr q22, [x17, #+768] +ldr q27, [x17, #+784] +ldr q17, [x17, #+800] +ldr q8, [x17, #+816] +ldr q16, [x17, #+832] +ldr q10, [x17, #+848] +ldr q9, [x17, #+864] +ldr q14, [x17, #+880] +ldr q0, [x0, #352] +ldr q4, [x0, #368] +ldr q15, [x0, #320] +ldr q29, [x0, #336] +sqrdmulh v24.4S, v0.4S, v27.s[0] +mul v0.4S, v0.4S,v22.s[0] +mla v0.4S, v24.4S, v31.s[0] +sub v24.4s, v15.4s, v0.4s +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v4.4S, v27.s[0] +mul v4.4S, v4.4S,v22.s[0] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v29.4s, v4.4s +add v29.4s, v29.4s, v4.4s +sqrdmulh v4.4S, v29.4S, v27.s[1] +mul v29.4S, v29.4S,v22.s[1] +mla v29.4S, v4.4S, v31.s[0] +sub v4.4s, v15.4s, v29.4s +add v15.4s, v15.4s, 
v29.4s +sqrdmulh v29.4S, v0.4S, v27.s[2] +mul v0.4S, v0.4S,v22.s[2] +mla v0.4S, v29.4S, v31.s[0] +sub v29.4s, v24.4s, v0.4s +add v24.4s, v24.4s, v0.4s +trn1 v0.4S, v15.4S, v4.4S +trn2 v20.4S, v15.4S, v4.4S +trn1 v19.4S, v24.4S, v29.4S +trn2 v5.4S, v24.4S, v29.4S +trn2 v24.2D, v0.2D, v19.2D +trn2 v29.2D, v20.2D, v5.2D +trn1 v15.2D, v0.2D, v19.2D +trn1 v4.2D, v20.2D, v5.2D +sqrdmulh v5.4S, v24.4S, v8.4S +mul v24.4S, v24.4S,v17.4S +mla v24.4S, v5.4S, v31.s[0] +sub v5.4s, v15.4s, v24.4s +add v15.4s, v15.4s, v24.4s +sqrdmulh v24.4S, v29.4S, v8.4S +mul v29.4S, v29.4S,v17.4S +mla v29.4S, v24.4S, v31.s[0] +sub v24.4s, v4.4s, v29.4s +add v4.4s, v4.4s, v29.4s +sqrdmulh v29.4S, v4.4S, v10.4S +mul v4.4S, v4.4S,v16.4S +mla v4.4S, v29.4S, v31.s[0] +sub v29.4s, v15.4s, v4.4s +add v15.4s, v15.4s, v4.4s +sqrdmulh v4.4S, v24.4S, v14.4S +mul v24.4S, v24.4S,v9.4S +mla v24.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v24.4s +add v5.4s, v5.4s, v24.4s +str q15, [x0, #320] +str q29, [x0, #336] +str q5, [x0, #352] +str q4, [x0, #368] +ldr q4, [x17, #+896] +ldr q5, [x17, #+912] +ldr q29, [x17, #+928] +ldr q15, [x17, #+944] +ldr q24, [x17, #+960] +ldr q20, [x17, #+976] +ldr q19, [x17, #+992] +ldr q0, [x17, #+1008] +ldr q14, [x0, #416] +ldr q9, [x0, #432] +ldr q10, [x0, #384] +ldr q16, [x0, #400] +sqrdmulh v8.4S, v14.4S, v5.s[0] +mul v14.4S, v14.4S,v4.s[0] +mla v14.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v14.4s +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v9.4S, v5.s[0] +mul v9.4S, v9.4S,v4.s[0] +mla v9.4S, v14.4S, v31.s[0] +sub v14.4s, v16.4s, v9.4s +add v16.4s, v16.4s, v9.4s +sqrdmulh v9.4S, v16.4S, v5.s[1] +mul v16.4S, v16.4S,v4.s[1] +mla v16.4S, v9.4S, v31.s[0] +sub v9.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sqrdmulh v16.4S, v14.4S, v5.s[2] +mul v14.4S, v14.4S,v4.s[2] +mla v14.4S, v16.4S, v31.s[0] +sub v16.4s, v8.4s, v14.4s +add v8.4s, v8.4s, v14.4s +trn1 v14.4S, v10.4S, v9.4S +trn2 v17.4S, v10.4S, v9.4S +trn1 v27.4S, v8.4S, v16.4S +trn2 v22.4S, v8.4S, v16.4S +trn2 v8.2D, v14.2D, 
v27.2D +trn2 v16.2D, v17.2D, v22.2D +trn1 v10.2D, v14.2D, v27.2D +trn1 v9.2D, v17.2D, v22.2D +sqrdmulh v22.4S, v8.4S, v15.4S +mul v8.4S, v8.4S,v29.4S +mla v8.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +sqrdmulh v8.4S, v16.4S, v15.4S +mul v16.4S, v16.4S,v29.4S +mla v16.4S, v8.4S, v31.s[0] +sub v8.4s, v9.4s, v16.4s +add v9.4s, v9.4s, v16.4s +sqrdmulh v16.4S, v9.4S, v20.4S +mul v9.4S, v9.4S,v24.4S +mla v9.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v9.4s +add v10.4s, v10.4s, v9.4s +sqrdmulh v9.4S, v8.4S, v0.4S +mul v8.4S, v8.4S,v19.4S +mla v8.4S, v9.4S, v31.s[0] +sub v9.4s, v22.4s, v8.4s +add v22.4s, v22.4s, v8.4s +str q10, [x0, #384] +str q16, [x0, #400] +str q22, [x0, #416] +str q9, [x0, #432] +ldr q9, [x17, #+1024] +ldr q22, [x17, #+1040] +ldr q16, [x17, #+1056] +ldr q10, [x17, #+1072] +ldr q8, [x17, #+1088] +ldr q17, [x17, #+1104] +ldr q27, [x17, #+1120] +ldr q14, [x17, #+1136] +ldr q0, [x0, #480] +ldr q19, [x0, #496] +ldr q20, [x0, #448] +ldr q24, [x0, #464] +sqrdmulh v15.4S, v0.4S, v22.s[0] +mul v0.4S, v0.4S,v9.s[0] +mla v0.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v0.4s +add v20.4s, v20.4s, v0.4s +sqrdmulh v0.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v9.s[0] +mla v19.4S, v0.4S, v31.s[0] +sub v0.4s, v24.4s, v19.4s +add v24.4s, v24.4s, v19.4s +sqrdmulh v19.4S, v24.4S, v22.s[1] +mul v24.4S, v24.4S,v9.s[1] +mla v24.4S, v19.4S, v31.s[0] +sub v19.4s, v20.4s, v24.4s +add v20.4s, v20.4s, v24.4s +sqrdmulh v24.4S, v0.4S, v22.s[2] +mul v0.4S, v0.4S,v9.s[2] +mla v0.4S, v24.4S, v31.s[0] +sub v24.4s, v15.4s, v0.4s +add v15.4s, v15.4s, v0.4s +trn1 v0.4S, v20.4S, v19.4S +trn2 v29.4S, v20.4S, v19.4S +trn1 v5.4S, v15.4S, v24.4S +trn2 v4.4S, v15.4S, v24.4S +trn2 v15.2D, v0.2D, v5.2D +trn2 v24.2D, v29.2D, v4.2D +trn1 v20.2D, v0.2D, v5.2D +trn1 v19.2D, v29.2D, v4.2D +sqrdmulh v4.4S, v15.4S, v10.4S +mul v15.4S, v15.4S,v16.4S +mla v15.4S, v4.4S, v31.s[0] +sub v4.4s, v20.4s, v15.4s +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v24.4S, v10.4S +mul v24.4S, 
v24.4S,v16.4S +mla v24.4S, v15.4S, v31.s[0] +sub v15.4s, v19.4s, v24.4s +add v19.4s, v19.4s, v24.4s +sqrdmulh v24.4S, v19.4S, v17.4S +mul v19.4S, v19.4S,v8.4S +mla v19.4S, v24.4S, v31.s[0] +sub v24.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +sqrdmulh v19.4S, v15.4S, v14.4S +mul v15.4S, v15.4S,v27.4S +mla v15.4S, v19.4S, v31.s[0] +sub v19.4s, v4.4s, v15.4s +add v4.4s, v4.4s, v15.4s +str q20, [x0, #448] +str q24, [x0, #464] +str q4, [x0, #480] +str q19, [x0, #496] +ldr q19, [x17, #+1152] +ldr q4, [x17, #+1168] +ldr q24, [x17, #+1184] +ldr q20, [x17, #+1200] +ldr q15, [x17, #+1216] +ldr q29, [x17, #+1232] +ldr q5, [x17, #+1248] +ldr q0, [x17, #+1264] +ldr q14, [x0, #544] +ldr q27, [x0, #560] +ldr q17, [x0, #512] +ldr q8, [x0, #528] +sqrdmulh v10.4S, v14.4S, v4.s[0] +mul v14.4S, v14.4S,v19.s[0] +mla v14.4S, v10.4S, v31.s[0] +sub v10.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +sqrdmulh v14.4S, v27.4S, v4.s[0] +mul v27.4S, v27.4S,v19.s[0] +mla v27.4S, v14.4S, v31.s[0] +sub v14.4s, v8.4s, v27.4s +add v8.4s, v8.4s, v27.4s +sqrdmulh v27.4S, v8.4S, v4.s[1] +mul v8.4S, v8.4S,v19.s[1] +mla v8.4S, v27.4S, v31.s[0] +sub v27.4s, v17.4s, v8.4s +add v17.4s, v17.4s, v8.4s +sqrdmulh v8.4S, v14.4S, v4.s[2] +mul v14.4S, v14.4S,v19.s[2] +mla v14.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v14.4s +add v10.4s, v10.4s, v14.4s +trn1 v14.4S, v17.4S, v27.4S +trn2 v16.4S, v17.4S, v27.4S +trn1 v22.4S, v10.4S, v8.4S +trn2 v9.4S, v10.4S, v8.4S +trn2 v10.2D, v14.2D, v22.2D +trn2 v8.2D, v16.2D, v9.2D +trn1 v17.2D, v14.2D, v22.2D +trn1 v27.2D, v16.2D, v9.2D +sqrdmulh v9.4S, v10.4S, v20.4S +mul v10.4S, v10.4S,v24.4S +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v10.4s +add v17.4s, v17.4s, v10.4s +sqrdmulh v10.4S, v8.4S, v20.4S +mul v8.4S, v8.4S,v24.4S +mla v8.4S, v10.4S, v31.s[0] +sub v10.4s, v27.4s, v8.4s +add v27.4s, v27.4s, v8.4s +sqrdmulh v8.4S, v27.4S, v29.4S +mul v27.4S, v27.4S,v15.4S +mla v27.4S, v8.4S, v31.s[0] +sub v8.4s, v17.4s, v27.4s +add v17.4s, v17.4s, v27.4s +sqrdmulh v27.4S, 
v10.4S, v0.4S +mul v10.4S, v10.4S,v5.4S +mla v10.4S, v27.4S, v31.s[0] +sub v27.4s, v9.4s, v10.4s +add v9.4s, v9.4s, v10.4s +str q17, [x0, #512] +str q8, [x0, #528] +str q9, [x0, #544] +str q27, [x0, #560] +ldr q27, [x17, #+1280] +ldr q9, [x17, #+1296] +ldr q8, [x17, #+1312] +ldr q17, [x17, #+1328] +ldr q10, [x17, #+1344] +ldr q16, [x17, #+1360] +ldr q22, [x17, #+1376] +ldr q14, [x17, #+1392] +ldr q0, [x0, #608] +ldr q5, [x0, #624] +ldr q29, [x0, #576] +ldr q15, [x0, #592] +sqrdmulh v20.4S, v0.4S, v9.s[0] +mul v0.4S, v0.4S,v27.s[0] +mla v0.4S, v20.4S, v31.s[0] +sub v20.4s, v29.4s, v0.4s +add v29.4s, v29.4s, v0.4s +sqrdmulh v0.4S, v5.4S, v9.s[0] +mul v5.4S, v5.4S,v27.s[0] +mla v5.4S, v0.4S, v31.s[0] +sub v0.4s, v15.4s, v5.4s +add v15.4s, v15.4s, v5.4s +sqrdmulh v5.4S, v15.4S, v9.s[1] +mul v15.4S, v15.4S,v27.s[1] +mla v15.4S, v5.4S, v31.s[0] +sub v5.4s, v29.4s, v15.4s +add v29.4s, v29.4s, v15.4s +sqrdmulh v15.4S, v0.4S, v9.s[2] +mul v0.4S, v0.4S,v27.s[2] +mla v0.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v0.4s +add v20.4s, v20.4s, v0.4s +trn1 v0.4S, v29.4S, v5.4S +trn2 v24.4S, v29.4S, v5.4S +trn1 v4.4S, v20.4S, v15.4S +trn2 v19.4S, v20.4S, v15.4S +trn2 v20.2D, v0.2D, v4.2D +trn2 v15.2D, v24.2D, v19.2D +trn1 v29.2D, v0.2D, v4.2D +trn1 v5.2D, v24.2D, v19.2D +sqrdmulh v19.4S, v20.4S, v17.4S +mul v20.4S, v20.4S,v8.4S +mla v20.4S, v19.4S, v31.s[0] +sub v19.4s, v29.4s, v20.4s +add v29.4s, v29.4s, v20.4s +sqrdmulh v20.4S, v15.4S, v17.4S +mul v15.4S, v15.4S,v8.4S +mla v15.4S, v20.4S, v31.s[0] +sub v20.4s, v5.4s, v15.4s +add v5.4s, v5.4s, v15.4s +sqrdmulh v15.4S, v5.4S, v16.4S +mul v5.4S, v5.4S,v10.4S +mla v5.4S, v15.4S, v31.s[0] +sub v15.4s, v29.4s, v5.4s +add v29.4s, v29.4s, v5.4s +sqrdmulh v5.4S, v20.4S, v14.4S +mul v20.4S, v20.4S,v22.4S +mla v20.4S, v5.4S, v31.s[0] +sub v5.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +str q29, [x0, #576] +str q15, [x0, #592] +str q19, [x0, #608] +str q5, [x0, #624] +ldr q5, [x17, #+1408] +ldr q19, [x17, #+1424] +ldr q15, [x17, #+1440] 
+ldr q29, [x17, #+1456] +ldr q20, [x17, #+1472] +ldr q24, [x17, #+1488] +ldr q4, [x17, #+1504] +ldr q0, [x17, #+1520] +ldr q14, [x0, #672] +ldr q22, [x0, #688] +ldr q16, [x0, #640] +ldr q10, [x0, #656] +sqrdmulh v17.4S, v14.4S, v19.s[0] +mul v14.4S, v14.4S,v5.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v16.4s, v14.4s +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v22.4S, v19.s[0] +mul v22.4S, v22.4S,v5.s[0] +mla v22.4S, v14.4S, v31.s[0] +sub v14.4s, v10.4s, v22.4s +add v10.4s, v10.4s, v22.4s +sqrdmulh v22.4S, v10.4S, v19.s[1] +mul v10.4S, v10.4S,v5.s[1] +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v16.4s, v10.4s +add v16.4s, v16.4s, v10.4s +sqrdmulh v10.4S, v14.4S, v19.s[2] +mul v14.4S, v14.4S,v5.s[2] +mla v14.4S, v10.4S, v31.s[0] +sub v10.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +trn1 v14.4S, v16.4S, v22.4S +trn2 v8.4S, v16.4S, v22.4S +trn1 v9.4S, v17.4S, v10.4S +trn2 v27.4S, v17.4S, v10.4S +trn2 v17.2D, v14.2D, v9.2D +trn2 v10.2D, v8.2D, v27.2D +trn1 v16.2D, v14.2D, v9.2D +trn1 v22.2D, v8.2D, v27.2D +sqrdmulh v27.4S, v17.4S, v29.4S +mul v17.4S, v17.4S,v15.4S +mla v17.4S, v27.4S, v31.s[0] +sub v27.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v10.4S, v29.4S +mul v10.4S, v10.4S,v15.4S +mla v10.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v10.4s +add v22.4s, v22.4s, v10.4s +sqrdmulh v10.4S, v22.4S, v24.4S +mul v22.4S, v22.4S,v20.4S +mla v22.4S, v10.4S, v31.s[0] +sub v10.4s, v16.4s, v22.4s +add v16.4s, v16.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v0.4S +mul v17.4S, v17.4S,v4.4S +mla v17.4S, v22.4S, v31.s[0] +sub v22.4s, v27.4s, v17.4s +add v27.4s, v27.4s, v17.4s +str q16, [x0, #640] +str q10, [x0, #656] +str q27, [x0, #672] +str q22, [x0, #688] +ldr q22, [x17, #+1536] +ldr q27, [x17, #+1552] +ldr q10, [x17, #+1568] +ldr q16, [x17, #+1584] +ldr q17, [x17, #+1600] +ldr q8, [x17, #+1616] +ldr q9, [x17, #+1632] +ldr q14, [x17, #+1648] +ldr q0, [x0, #736] +ldr q4, [x0, #752] +ldr q24, [x0, #704] +ldr q20, [x0, #720] +sqrdmulh v29.4S, v0.4S, v27.s[0] 
+mul v0.4S, v0.4S,v22.s[0] +mla v0.4S, v29.4S, v31.s[0] +sub v29.4s, v24.4s, v0.4s +add v24.4s, v24.4s, v0.4s +sqrdmulh v0.4S, v4.4S, v27.s[0] +mul v4.4S, v4.4S,v22.s[0] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v20.4s, v4.4s +add v20.4s, v20.4s, v4.4s +sqrdmulh v4.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v22.s[1] +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v24.4s, v20.4s +add v24.4s, v24.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v27.s[2] +mul v0.4S, v0.4S,v22.s[2] +mla v0.4S, v20.4S, v31.s[0] +sub v20.4s, v29.4s, v0.4s +add v29.4s, v29.4s, v0.4s +trn1 v0.4S, v24.4S, v4.4S +trn2 v15.4S, v24.4S, v4.4S +trn1 v19.4S, v29.4S, v20.4S +trn2 v5.4S, v29.4S, v20.4S +trn2 v29.2D, v0.2D, v19.2D +trn2 v20.2D, v15.2D, v5.2D +trn1 v24.2D, v0.2D, v19.2D +trn1 v4.2D, v15.2D, v5.2D +sqrdmulh v5.4S, v29.4S, v16.4S +mul v29.4S, v29.4S,v10.4S +mla v29.4S, v5.4S, v31.s[0] +sub v5.4s, v24.4s, v29.4s +add v24.4s, v24.4s, v29.4s +sqrdmulh v29.4S, v20.4S, v16.4S +mul v20.4S, v20.4S,v10.4S +mla v20.4S, v29.4S, v31.s[0] +sub v29.4s, v4.4s, v20.4s +add v4.4s, v4.4s, v20.4s +sqrdmulh v20.4S, v4.4S, v8.4S +mul v4.4S, v4.4S,v17.4S +mla v4.4S, v20.4S, v31.s[0] +sub v20.4s, v24.4s, v4.4s +add v24.4s, v24.4s, v4.4s +sqrdmulh v4.4S, v29.4S, v14.4S +mul v29.4S, v29.4S,v9.4S +mla v29.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v29.4s +add v5.4s, v5.4s, v29.4s +str q24, [x0, #704] +str q20, [x0, #720] +str q5, [x0, #736] +str q4, [x0, #752] +ldr q4, [x17, #+1664] +ldr q5, [x17, #+1680] +ldr q20, [x17, #+1696] +ldr q24, [x17, #+1712] +ldr q29, [x17, #+1728] +ldr q15, [x17, #+1744] +ldr q19, [x17, #+1760] +ldr q0, [x17, #+1776] +ldr q14, [x0, #800] +ldr q9, [x0, #816] +ldr q8, [x0, #768] +ldr q17, [x0, #784] +sqrdmulh v16.4S, v14.4S, v5.s[0] +mul v14.4S, v14.4S,v4.s[0] +mla v14.4S, v16.4S, v31.s[0] +sub v16.4s, v8.4s, v14.4s +add v8.4s, v8.4s, v14.4s +sqrdmulh v14.4S, v9.4S, v5.s[0] +mul v9.4S, v9.4S,v4.s[0] +mla v9.4S, v14.4S, v31.s[0] +sub v14.4s, v17.4s, v9.4s +add v17.4s, v17.4s, v9.4s +sqrdmulh v9.4S, v17.4S, 
v5.s[1] +mul v17.4S, v17.4S,v4.s[1] +mla v17.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v17.4s +add v8.4s, v8.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v5.s[2] +mul v14.4S, v14.4S,v4.s[2] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v16.4s, v14.4s +add v16.4s, v16.4s, v14.4s +trn1 v14.4S, v8.4S, v9.4S +trn2 v10.4S, v8.4S, v9.4S +trn1 v27.4S, v16.4S, v17.4S +trn2 v22.4S, v16.4S, v17.4S +trn2 v16.2D, v14.2D, v27.2D +trn2 v17.2D, v10.2D, v22.2D +trn1 v8.2D, v14.2D, v27.2D +trn1 v9.2D, v10.2D, v22.2D +sqrdmulh v22.4S, v16.4S, v24.4S +mul v16.4S, v16.4S,v20.4S +mla v16.4S, v22.4S, v31.s[0] +sub v22.4s, v8.4s, v16.4s +add v8.4s, v8.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v24.4S +mul v17.4S, v17.4S,v20.4S +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v9.4s, v17.4s +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v9.4S, v15.4S +mul v9.4S, v9.4S,v29.4S +mla v9.4S, v17.4S, v31.s[0] +sub v17.4s, v8.4s, v9.4s +add v8.4s, v8.4s, v9.4s +sqrdmulh v9.4S, v16.4S, v0.4S +mul v16.4S, v16.4S,v19.4S +mla v16.4S, v9.4S, v31.s[0] +sub v9.4s, v22.4s, v16.4s +add v22.4s, v22.4s, v16.4s +str q8, [x0, #768] +str q17, [x0, #784] +str q22, [x0, #800] +str q9, [x0, #816] +ldr q9, [x17, #+1792] +ldr q22, [x17, #+1808] +ldr q17, [x17, #+1824] +ldr q8, [x17, #+1840] +ldr q16, [x17, #+1856] +ldr q10, [x17, #+1872] +ldr q27, [x17, #+1888] +ldr q14, [x17, #+1904] +ldr q0, [x0, #864] +ldr q19, [x0, #880] +ldr q15, [x0, #832] +ldr q29, [x0, #848] +sqrdmulh v24.4S, v0.4S, v22.s[0] +mul v0.4S, v0.4S,v9.s[0] +mla v0.4S, v24.4S, v31.s[0] +sub v24.4s, v15.4s, v0.4s +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v9.s[0] +mla v19.4S, v0.4S, v31.s[0] +sub v0.4s, v29.4s, v19.4s +add v29.4s, v29.4s, v19.4s +sqrdmulh v19.4S, v29.4S, v22.s[1] +mul v29.4S, v29.4S,v9.s[1] +mla v29.4S, v19.4S, v31.s[0] +sub v19.4s, v15.4s, v29.4s +add v15.4s, v15.4s, v29.4s +sqrdmulh v29.4S, v0.4S, v22.s[2] +mul v0.4S, v0.4S,v9.s[2] +mla v0.4S, v29.4S, v31.s[0] +sub v29.4s, v24.4s, v0.4s +add v24.4s, v24.4s, v0.4s +trn1 
v0.4S, v15.4S, v19.4S +trn2 v20.4S, v15.4S, v19.4S +trn1 v5.4S, v24.4S, v29.4S +trn2 v4.4S, v24.4S, v29.4S +trn2 v24.2D, v0.2D, v5.2D +trn2 v29.2D, v20.2D, v4.2D +trn1 v15.2D, v0.2D, v5.2D +trn1 v19.2D, v20.2D, v4.2D +sqrdmulh v4.4S, v24.4S, v8.4S +mul v24.4S, v24.4S,v17.4S +mla v24.4S, v4.4S, v31.s[0] +sub v4.4s, v15.4s, v24.4s +add v15.4s, v15.4s, v24.4s +sqrdmulh v24.4S, v29.4S, v8.4S +mul v29.4S, v29.4S,v17.4S +mla v29.4S, v24.4S, v31.s[0] +sub v24.4s, v19.4s, v29.4s +add v19.4s, v19.4s, v29.4s +sqrdmulh v29.4S, v19.4S, v10.4S +mul v19.4S, v19.4S,v16.4S +mla v19.4S, v29.4S, v31.s[0] +sub v29.4s, v15.4s, v19.4s +add v15.4s, v15.4s, v19.4s +sqrdmulh v19.4S, v24.4S, v14.4S +mul v24.4S, v24.4S,v27.4S +mla v24.4S, v19.4S, v31.s[0] +sub v19.4s, v4.4s, v24.4s +add v4.4s, v4.4s, v24.4s +str q15, [x0, #832] +str q29, [x0, #848] +str q4, [x0, #864] +str q19, [x0, #880] +ldr q19, [x17, #+1920] +ldr q4, [x17, #+1936] +ldr q29, [x17, #+1952] +ldr q15, [x17, #+1968] +ldr q24, [x17, #+1984] +ldr q20, [x17, #+2000] +ldr q5, [x17, #+2016] +ldr q0, [x17, #+2032] +ldr q14, [x0, #928] +ldr q27, [x0, #944] +ldr q10, [x0, #896] +ldr q16, [x0, #912] +sqrdmulh v8.4S, v14.4S, v4.s[0] +mul v14.4S, v14.4S,v19.s[0] +mla v14.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v14.4s +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v27.4S, v4.s[0] +mul v27.4S, v27.4S,v19.s[0] +mla v27.4S, v14.4S, v31.s[0] +sub v14.4s, v16.4s, v27.4s +add v16.4s, v16.4s, v27.4s +sqrdmulh v27.4S, v16.4S, v4.s[1] +mul v16.4S, v16.4S,v19.s[1] +mla v16.4S, v27.4S, v31.s[0] +sub v27.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sqrdmulh v16.4S, v14.4S, v4.s[2] +mul v14.4S, v14.4S,v19.s[2] +mla v14.4S, v16.4S, v31.s[0] +sub v16.4s, v8.4s, v14.4s +add v8.4s, v8.4s, v14.4s +trn1 v14.4S, v10.4S, v27.4S +trn2 v17.4S, v10.4S, v27.4S +trn1 v22.4S, v8.4S, v16.4S +trn2 v9.4S, v8.4S, v16.4S +trn2 v8.2D, v14.2D, v22.2D +trn2 v16.2D, v17.2D, v9.2D +trn1 v10.2D, v14.2D, v22.2D +trn1 v27.2D, v17.2D, v9.2D +sqrdmulh v9.4S, v8.4S, v15.4S 
+mul v8.4S, v8.4S,v29.4S +mla v8.4S, v9.4S, v31.s[0] +sub v9.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +sqrdmulh v8.4S, v16.4S, v15.4S +mul v16.4S, v16.4S,v29.4S +mla v16.4S, v8.4S, v31.s[0] +sub v8.4s, v27.4s, v16.4s +add v27.4s, v27.4s, v16.4s +sqrdmulh v16.4S, v27.4S, v20.4S +mul v27.4S, v27.4S,v24.4S +mla v27.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v27.4s +add v10.4s, v10.4s, v27.4s +sqrdmulh v27.4S, v8.4S, v0.4S +mul v8.4S, v8.4S,v5.4S +mla v8.4S, v27.4S, v31.s[0] +sub v27.4s, v9.4s, v8.4s +add v9.4s, v9.4s, v8.4s +str q10, [x0, #896] +str q16, [x0, #912] +str q9, [x0, #928] +str q27, [x0, #944] +ldr q27, [x17, #+2048] +ldr q9, [x17, #+2064] +ldr q16, [x17, #+2080] +ldr q10, [x17, #+2096] +ldr q8, [x17, #+2112] +ldr q17, [x17, #+2128] +ldr q22, [x17, #+2144] +ldr q14, [x17, #+2160] +ldr q0, [x0, #992] +ldr q5, [x0, #1008] +ldr q20, [x0, #960] +ldr q24, [x0, #976] +sqrdmulh v15.4S, v0.4S, v9.s[0] +mul v0.4S, v0.4S,v27.s[0] +mla v0.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v0.4s +add v20.4s, v20.4s, v0.4s +sqrdmulh v0.4S, v5.4S, v9.s[0] +mul v5.4S, v5.4S,v27.s[0] +mla v5.4S, v0.4S, v31.s[0] +sub v0.4s, v24.4s, v5.4s +add v24.4s, v24.4s, v5.4s +sqrdmulh v5.4S, v24.4S, v9.s[1] +mul v24.4S, v24.4S,v27.s[1] +mla v24.4S, v5.4S, v31.s[0] +sub v5.4s, v20.4s, v24.4s +add v20.4s, v20.4s, v24.4s +sqrdmulh v24.4S, v0.4S, v9.s[2] +mul v0.4S, v0.4S,v27.s[2] +mla v0.4S, v24.4S, v31.s[0] +sub v24.4s, v15.4s, v0.4s +add v15.4s, v15.4s, v0.4s +trn1 v0.4S, v20.4S, v5.4S +trn2 v29.4S, v20.4S, v5.4S +trn1 v4.4S, v15.4S, v24.4S +trn2 v19.4S, v15.4S, v24.4S +trn2 v15.2D, v0.2D, v4.2D +trn2 v24.2D, v29.2D, v19.2D +trn1 v20.2D, v0.2D, v4.2D +trn1 v5.2D, v29.2D, v19.2D +sqrdmulh v19.4S, v15.4S, v10.4S +mul v15.4S, v15.4S,v16.4S +mla v15.4S, v19.4S, v31.s[0] +sub v19.4s, v20.4s, v15.4s +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v24.4S, v10.4S +mul v24.4S, v24.4S,v16.4S +mla v24.4S, v15.4S, v31.s[0] +sub v15.4s, v5.4s, v24.4s +add v5.4s, v5.4s, v24.4s +sqrdmulh v24.4S, v5.4S, 
v17.4S +mul v5.4S, v5.4S,v8.4S +mla v5.4S, v24.4S, v31.s[0] +sub v24.4s, v20.4s, v5.4s +add v20.4s, v20.4s, v5.4s +sqrdmulh v5.4S, v15.4S, v14.4S +mul v15.4S, v15.4S,v22.4S +mla v15.4S, v5.4S, v31.s[0] +sub v5.4s, v19.4s, v15.4s +add v19.4s, v19.4s, v15.4s +str q20, [x0, #960] +str q24, [x0, #976] +str q19, [x0, #992] +str q5, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_13_0.s b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_13_0.s new file mode 100644 index 0000000..34599c2 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_13_0.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 
11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // 
Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 
1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 
23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 
19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 
// Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 
1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 +.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // 
Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 
7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // 
Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_13_0 +.global _ntt_u32_full_neon_asm_var_4_4_13_0 +ntt_u32_full_neon_asm_var_4_4_13_0: +_ntt_u32_full_neon_asm_var_4_4_13_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #928] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #992] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q18, [x0, #800] +sqrdmulh v17.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +ldr q16, [x0, #864] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +ldr q2, [x0, #544] +mla v22.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, 
v2.4S, v29.s[0] +ldr q1, [x0, #608] +mla v20.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v1.4S, v29.s[0] +ldr q0, [x0, #672] +mla v18.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v0.4S, v29.s[0] +ldr q15, [x0, #736] +mla v16.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v15.4S, v29.s[0] +ldr q14, [x0, #416] +ldr q13, [x0, #480] +mul v2.4S, v2.4S,v30.s[0] +sub v12.4s, v14.4s, v22.4s +mul v1.4S, v1.4S,v30.s[0] +add v14.4s, v14.4s, v22.4s +ldr q22, [x0, #288] +ldr q11, [x0, #352] +mla v2.4S, v21.4S, v31.s[0] +sub v21.4s, v13.4s, v20.4s +mla v1.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v20.4s +ldr q20, [x0, #32] +ldr q19, [x0, #96] +mul v0.4S, v0.4S,v30.s[0] +sub v10.4s, v22.4s, v18.4s +mul v15.4S, v15.4S,v30.s[0] +add v22.4s, v22.4s, v18.4s +ldr q18, [x0, #160] +ldr q9, [x0, #224] +mla v0.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v16.4s +mla v15.4S, v3.4S, v31.s[0] +add v11.4s, v11.4s, v16.4s +sqrdmulh v16.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +sqrdmulh v3.4S, v13.4S, v29.s[1] +sub v8.4s, v20.4s, v2.4s +mul v13.4S, v13.4S,v30.s[1] +add v20.4s, v20.4s, v2.4s +sqrdmulh v2.4S, v22.4S, v29.s[1] +sub v7.4s, v19.4s, v1.4s +mul v22.4S, v22.4S,v30.s[1] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v11.4S, v29.s[1] +sub v6.4s, v18.4s, v0.4s +mul v11.4S, v11.4S,v30.s[1] +add v18.4s, v18.4s, v0.4s +mla v14.4S, v16.4S, v31.s[0] +sub v16.4s, v9.4s, v15.4s +sqrdmulh v0.4S, v12.4S, v29.s[2] +add v9.4s, v9.4s, v15.4s +mla v13.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v21.4S, v29.s[2] +mla v22.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v10.4S, v29.s[2] +mla v11.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v17.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +sub v15.4s, v18.4s, v14.4s +mul v21.4S, v21.4S,v30.s[2] +add v18.4s, v18.4s, v14.4s +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v9.4s, v13.4s +mla v21.4S, v3.4S, v31.s[0] +add v9.4s, v9.4s, v13.4s +mul v10.4S, v10.4S,v30.s[2] +sub v13.4s, v20.4s, v22.4s +mul v17.4S, v17.4S,v30.s[2] +add v20.4s, v20.4s, v22.4s +mla v10.4S, v2.4S, v31.s[0] +sub v2.4s, v19.4s, v11.4s +mla 
v17.4S, v1.4S, v31.s[0] +add v19.4s, v19.4s, v11.4s +sqrdmulh v11.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +sqrdmulh v1.4S, v0.4S, v27.s[1] +sub v22.4s, v6.4s, v12.4s +mul v0.4S, v0.4S,v28.s[1] +add v6.4s, v6.4s, v12.4s +sqrdmulh v12.4S, v18.4S, v27.s[0] +sub v3.4s, v16.4s, v21.4s +mul v18.4S, v18.4S,v28.s[0] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v9.4S, v27.s[0] +sub v14.4s, v8.4s, v10.4s +mul v9.4S, v9.4S,v28.s[0] +add v8.4s, v8.4s, v10.4s +mla v15.4S, v11.4S, v31.s[0] +sub v11.4s, v7.4s, v17.4s +sqrdmulh v10.4S, v6.4S, v27.s[2] +add v7.4s, v7.4s, v17.4s +mla v0.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v16.4S, v27.s[2] +mla v18.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v22.4S, v27.s[3] +mla v9.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v3.4S, v27.s[3] +mul v6.4S, v6.4S,v28.s[2] +sub v17.4s, v13.4s, v15.4s +mul v16.4S, v16.4S,v28.s[2] +add v13.4s, v13.4s, v15.4s +mla v6.4S, v10.4S, v31.s[0] +sub v10.4s, v2.4s, v0.4s +mla v16.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v0.4s +mul v22.4S, v22.4S,v28.s[3] +sub v0.4s, v20.4s, v18.4s +mul v3.4S, v3.4S,v28.s[3] +add v20.4s, v20.4s, v18.4s +mla v22.4S, v12.4S, v31.s[0] +sub v12.4s, v19.4s, v9.4s +mla v3.4S, v21.4S, v31.s[0] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v2.4S, v25.s[2] +mul v2.4S, v2.4S,v26.s[2] +sqrdmulh v21.4S, v10.4S, v25.s[3] +sub v18.4s, v8.4s, v6.4s +mul v10.4S, v10.4S,v26.s[3] +add v8.4s, v8.4s, v6.4s +sqrdmulh v6.4S, v12.4S, v25.s[1] +sub v1.4s, v7.4s, v16.4s +mul v12.4S, v12.4S,v26.s[1] +add v7.4s, v7.4s, v16.4s +sqrdmulh v16.4S, v19.4S, v25.s[0] +sub v15.4s, v14.4s, v22.4s +mul v19.4S, v19.4S,v26.s[0] +add v14.4s, v14.4s, v22.4s +mla v2.4S, v9.4S, v31.s[0] +sub v9.4s, v11.4s, v3.4s +sqrdmulh v22.4S, v7.4S, v23.s[0] +add v11.4s, v11.4s, v3.4s +mla v10.4S, v21.4S, v31.s[0] +sub v21.4s, v13.4s, v2.4s +sqrdmulh v3.4S, v1.4S, v23.s[1] +add v13.4s, v13.4s, v2.4s +mla v12.4S, v6.4S, v31.s[0] +sub v6.4s, v17.4s, v10.4s +sqrdmulh v2.4S, v11.4S, v23.s[2] +add v17.4s, v17.4s, v10.4s +mla v19.4S, v16.4S, 
v31.s[0] +sub v16.4s, v0.4s, v12.4s +sqrdmulh v10.4S, v9.4S, v23.s[3] +add v0.4s, v0.4s, v12.4s +mul v7.4S, v7.4S,v24.s[0] +sub v12.4s, v20.4s, v19.4s +mul v1.4S, v1.4S,v24.s[1] +add v20.4s, v20.4s, v19.4s +mla v7.4S, v22.4S, v31.s[0] +str q21, [x0, #352] +mla v1.4S, v3.4S, v31.s[0] +str q13, [x0, #288] +mul v11.4S, v11.4S,v24.s[2] +str q6, [x0, #480] +mul v9.4S, v9.4S,v24.s[3] +str q17, [x0, #416] +mla v11.4S, v2.4S, v31.s[0] +str q16, [x0, #224] +mla v9.4S, v10.4S, v31.s[0] +str q0, [x0, #160] +ldr q0, [x0, #944] +sqrdmulh v10.4S, v0.4S, v29.s[0] +str q12, [x0, #96] +mul v0.4S, v0.4S,v30.s[0] +str q20, [x0, #32] +ldr q20, [x0, #1008] +sqrdmulh v12.4S, v20.4S, v29.s[0] +sub v16.4s, v8.4s, v7.4s +str q16, [x0, #608] +mul v20.4S, v20.4S,v30.s[0] +add v8.4s, v8.4s, v7.4s +ldr q7, [x0, #816] +sqrdmulh v16.4S, v7.4S, v29.s[0] +sub v2.4s, v18.4s, v1.4s +str q8, [x0, #544] +mul v7.4S, v7.4S,v30.s[0] +add v18.4s, v18.4s, v1.4s +ldr q1, [x0, #880] +sqrdmulh v8.4S, v1.4S, v29.s[0] +sub v17.4s, v14.4s, v11.4s +str q2, [x0, #736] +mul v1.4S, v1.4S,v30.s[0] +add v14.4s, v14.4s, v11.4s +ldr q11, [x0, #560] +mla v0.4S, v10.4S, v31.s[0] +sub v10.4s, v15.4s, v9.4s +str q18, [x0, #672] +sqrdmulh v18.4S, v11.4S, v29.s[0] +add v15.4s, v15.4s, v9.4s +ldr q9, [x0, #624] +mla v20.4S, v12.4S, v31.s[0] +str q17, [x0, #864] +sqrdmulh v17.4S, v9.4S, v29.s[0] +ldr q12, [x0, #688] +mla v7.4S, v16.4S, v31.s[0] +str q14, [x0, #800] +sqrdmulh v14.4S, v12.4S, v29.s[0] +ldr q16, [x0, #752] +mla v1.4S, v8.4S, v31.s[0] +str q10, [x0, #992] +sqrdmulh v10.4S, v16.4S, v29.s[0] +ldr q8, [x0, #432] +ldr q2, [x0, #496] +mul v11.4S, v11.4S,v30.s[0] +sub v6.4s, v8.4s, v0.4s +str q15, [x0, #928] +mul v9.4S, v9.4S,v30.s[0] +add v8.4s, v8.4s, v0.4s +ldr q0, [x0, #304] +ldr q15, [x0, #368] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v2.4s, v20.4s +mla v9.4S, v17.4S, v31.s[0] +add v2.4s, v2.4s, v20.4s +ldr q20, [x0, #48] +ldr q17, [x0, #112] +mul v12.4S, v12.4S,v30.s[0] +sub v13.4s, v0.4s, v7.4s +mul v16.4S, 
v16.4S,v30.s[0] +add v0.4s, v0.4s, v7.4s +ldr q7, [x0, #176] +ldr q3, [x0, #240] +mla v12.4S, v14.4S, v31.s[0] +sub v14.4s, v15.4s, v1.4s +mla v16.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sqrdmulh v10.4S, v2.4S, v29.s[1] +sub v21.4s, v20.4s, v11.4s +mul v2.4S, v2.4S,v30.s[1] +add v20.4s, v20.4s, v11.4s +sqrdmulh v11.4S, v0.4S, v29.s[1] +sub v22.4s, v17.4s, v9.4s +mul v0.4S, v0.4S,v30.s[1] +add v17.4s, v17.4s, v9.4s +sqrdmulh v9.4S, v15.4S, v29.s[1] +sub v19.4s, v7.4s, v12.4s +mul v15.4S, v15.4S,v30.s[1] +add v7.4s, v7.4s, v12.4s +mla v8.4S, v1.4S, v31.s[0] +sub v1.4s, v3.4s, v16.4s +sqrdmulh v12.4S, v6.4S, v29.s[2] +add v3.4s, v3.4s, v16.4s +mla v2.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v18.4S, v29.s[2] +mla v0.4S, v11.4S, v31.s[0] +sqrdmulh v11.4S, v13.4S, v29.s[2] +mla v15.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v14.4S, v29.s[2] +mul v6.4S, v6.4S,v30.s[2] +sub v16.4s, v7.4s, v8.4s +mul v18.4S, v18.4S,v30.s[2] +add v7.4s, v7.4s, v8.4s +mla v6.4S, v12.4S, v31.s[0] +sub v12.4s, v3.4s, v2.4s +mla v18.4S, v10.4S, v31.s[0] +add v3.4s, v3.4s, v2.4s +mul v13.4S, v13.4S,v30.s[2] +sub v2.4s, v20.4s, v0.4s +mul v14.4S, v14.4S,v30.s[2] +add v20.4s, v20.4s, v0.4s +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v17.4s, v15.4s +mla v14.4S, v9.4S, v31.s[0] +add v17.4s, v17.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +sqrdmulh v9.4S, v12.4S, v27.s[1] +sub v0.4s, v19.4s, v6.4s +mul v12.4S, v12.4S,v28.s[1] +add v19.4s, v19.4s, v6.4s +sqrdmulh v6.4S, v7.4S, v27.s[0] +sub v10.4s, v1.4s, v18.4s +mul v7.4S, v7.4S,v28.s[0] +add v1.4s, v1.4s, v18.4s +sqrdmulh v18.4S, v3.4S, v27.s[0] +sub v8.4s, v21.4s, v13.4s +mul v3.4S, v3.4S,v28.s[0] +add v21.4s, v21.4s, v13.4s +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v22.4s, v14.4s +sqrdmulh v13.4S, v19.4S, v27.s[2] +add v22.4s, v22.4s, v14.4s +mla v12.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v1.4S, v27.s[2] +mla v7.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v0.4S, 
v27.s[3] +mla v3.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v10.4S, v27.s[3] +mul v19.4S, v19.4S,v28.s[2] +sub v14.4s, v2.4s, v16.4s +mul v1.4S, v1.4S,v28.s[2] +add v2.4s, v2.4s, v16.4s +mla v19.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v12.4s +mla v1.4S, v9.4S, v31.s[0] +add v11.4s, v11.4s, v12.4s +mul v0.4S, v0.4S,v28.s[3] +sub v12.4s, v20.4s, v7.4s +mul v10.4S, v10.4S,v28.s[3] +add v20.4s, v20.4s, v7.4s +mla v0.4S, v6.4S, v31.s[0] +sub v6.4s, v17.4s, v3.4s +mla v10.4S, v18.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v25.s[2] +mul v11.4S, v11.4S,v26.s[2] +sqrdmulh v18.4S, v13.4S, v25.s[3] +sub v7.4s, v21.4s, v19.4s +mul v13.4S, v13.4S,v26.s[3] +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v25.s[1] +sub v9.4s, v22.4s, v1.4s +mul v6.4S, v6.4S,v26.s[1] +add v22.4s, v22.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v25.s[0] +sub v16.4s, v8.4s, v0.4s +mul v17.4S, v17.4S,v26.s[0] +add v8.4s, v8.4s, v0.4s +mla v11.4S, v3.4S, v31.s[0] +sub v3.4s, v15.4s, v10.4s +sqrdmulh v0.4S, v22.4S, v23.s[0] +add v15.4s, v15.4s, v10.4s +mla v13.4S, v18.4S, v31.s[0] +sub v18.4s, v2.4s, v11.4s +sqrdmulh v10.4S, v9.4S, v23.s[1] +add v2.4s, v2.4s, v11.4s +mla v6.4S, v19.4S, v31.s[0] +sub v19.4s, v14.4s, v13.4s +sqrdmulh v11.4S, v15.4S, v23.s[2] +add v14.4s, v14.4s, v13.4s +mla v17.4S, v1.4S, v31.s[0] +sub v1.4s, v12.4s, v6.4s +sqrdmulh v13.4S, v3.4S, v23.s[3] +add v12.4s, v12.4s, v6.4s +mul v22.4S, v22.4S,v24.s[0] +sub v6.4s, v20.4s, v17.4s +mul v9.4S, v9.4S,v24.s[1] +add v20.4s, v20.4s, v17.4s +mla v22.4S, v0.4S, v31.s[0] +str q18, [x0, #368] +mla v9.4S, v10.4S, v31.s[0] +str q2, [x0, #304] +mul v15.4S, v15.4S,v24.s[2] +str q19, [x0, #496] +mul v3.4S, v3.4S,v24.s[3] +str q14, [x0, #432] +mla v15.4S, v11.4S, v31.s[0] +str q1, [x0, #240] +mla v3.4S, v13.4S, v31.s[0] +str q12, [x0, #176] +ldr q12, [x0, #896] +sqrdmulh v13.4S, v12.4S, v29.s[0] +str q6, [x0, #112] +mul v12.4S, v12.4S,v30.s[0] +str q20, [x0, #48] +ldr q20, [x0, #960] +sqrdmulh v6.4S, v20.4S, v29.s[0] +sub v1.4s, 
v21.4s, v22.4s +str q1, [x0, #624] +mul v20.4S, v20.4S,v30.s[0] +add v21.4s, v21.4s, v22.4s +ldr q22, [x0, #768] +sqrdmulh v1.4S, v22.4S, v29.s[0] +sub v11.4s, v7.4s, v9.4s +str q21, [x0, #560] +mul v22.4S, v22.4S,v30.s[0] +add v7.4s, v7.4s, v9.4s +ldr q9, [x0, #832] +sqrdmulh v21.4S, v9.4S, v29.s[0] +sub v14.4s, v8.4s, v15.4s +str q11, [x0, #752] +mul v9.4S, v9.4S,v30.s[0] +add v8.4s, v8.4s, v15.4s +ldr q15, [x0, #512] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v3.4s +str q7, [x0, #688] +sqrdmulh v7.4S, v15.4S, v29.s[0] +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #576] +mla v20.4S, v6.4S, v31.s[0] +str q14, [x0, #880] +sqrdmulh v14.4S, v3.4S, v29.s[0] +ldr q6, [x0, #640] +mla v22.4S, v1.4S, v31.s[0] +str q8, [x0, #816] +sqrdmulh v8.4S, v6.4S, v29.s[0] +ldr q1, [x0, #704] +mla v9.4S, v21.4S, v31.s[0] +str q13, [x0, #1008] +sqrdmulh v13.4S, v1.4S, v29.s[0] +ldr q21, [x0, #384] +ldr q11, [x0, #448] +mul v15.4S, v15.4S,v30.s[0] +sub v19.4s, v21.4s, v12.4s +str q16, [x0, #944] +mul v3.4S, v3.4S,v30.s[0] +add v21.4s, v21.4s, v12.4s +ldr q12, [x0, #256] +ldr q16, [x0, #320] +mla v15.4S, v7.4S, v31.s[0] +sub v7.4s, v11.4s, v20.4s +mla v3.4S, v14.4S, v31.s[0] +add v11.4s, v11.4s, v20.4s +ldr q20, [x0, #0] +ldr q14, [x0, #64] +mul v6.4S, v6.4S,v30.s[0] +sub v2.4s, v12.4s, v22.4s +mul v1.4S, v1.4S,v30.s[0] +add v12.4s, v12.4s, v22.4s +ldr q22, [x0, #128] +ldr q10, [x0, #192] +mla v6.4S, v8.4S, v31.s[0] +sub v8.4s, v16.4s, v9.4s +mla v1.4S, v13.4S, v31.s[0] +add v16.4s, v16.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sqrdmulh v13.4S, v11.4S, v29.s[1] +sub v18.4s, v20.4s, v15.4s +mul v11.4S, v11.4S,v30.s[1] +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v12.4S, v29.s[1] +sub v0.4s, v14.4s, v3.4s +mul v12.4S, v12.4S,v30.s[1] +add v14.4s, v14.4s, v3.4s +sqrdmulh v3.4S, v16.4S, v29.s[1] +sub v17.4s, v22.4s, v6.4s +mul v16.4S, v16.4S,v30.s[1] +add v22.4s, v22.4s, v6.4s +mla v21.4S, v9.4S, v31.s[0] +sub v9.4s, v10.4s, v1.4s +sqrdmulh v6.4S, v19.4S, 
v29.s[2] +add v10.4s, v10.4s, v1.4s +mla v11.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v7.4S, v29.s[2] +mla v12.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v2.4S, v29.s[2] +mla v16.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v8.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v1.4s, v22.4s, v21.4s +mul v7.4S, v7.4S,v30.s[2] +add v22.4s, v22.4s, v21.4s +mla v19.4S, v6.4S, v31.s[0] +sub v6.4s, v10.4s, v11.4s +mla v7.4S, v13.4S, v31.s[0] +add v10.4s, v10.4s, v11.4s +mul v2.4S, v2.4S,v30.s[2] +sub v11.4s, v20.4s, v12.4s +mul v8.4S, v8.4S,v30.s[2] +add v20.4s, v20.4s, v12.4s +mla v2.4S, v15.4S, v31.s[0] +sub v15.4s, v14.4s, v16.4s +mla v8.4S, v3.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +sqrdmulh v16.4S, v1.4S, v27.s[1] +mul v1.4S, v1.4S,v28.s[1] +sqrdmulh v3.4S, v6.4S, v27.s[1] +sub v12.4s, v17.4s, v19.4s +mul v6.4S, v6.4S,v28.s[1] +add v17.4s, v17.4s, v19.4s +sqrdmulh v19.4S, v22.4S, v27.s[0] +sub v13.4s, v9.4s, v7.4s +mul v22.4S, v22.4S,v28.s[0] +add v9.4s, v9.4s, v7.4s +sqrdmulh v7.4S, v10.4S, v27.s[0] +sub v21.4s, v18.4s, v2.4s +mul v10.4S, v10.4S,v28.s[0] +add v18.4s, v18.4s, v2.4s +mla v1.4S, v16.4S, v31.s[0] +sub v16.4s, v0.4s, v8.4s +sqrdmulh v2.4S, v17.4S, v27.s[2] +add v0.4s, v0.4s, v8.4s +mla v6.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v9.4S, v27.s[2] +mla v22.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v12.4S, v27.s[3] +mla v10.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v13.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[2] +sub v8.4s, v11.4s, v1.4s +mul v9.4S, v9.4S,v28.s[2] +add v11.4s, v11.4s, v1.4s +mla v17.4S, v2.4S, v31.s[0] +sub v2.4s, v15.4s, v6.4s +mla v9.4S, v3.4S, v31.s[0] +add v15.4s, v15.4s, v6.4s +mul v12.4S, v12.4S,v28.s[3] +sub v6.4s, v20.4s, v22.4s +mul v13.4S, v13.4S,v28.s[3] +add v20.4s, v20.4s, v22.4s +mla v12.4S, v19.4S, v31.s[0] +sub v19.4s, v14.4s, v10.4s +mla v13.4S, v7.4S, v31.s[0] +add v14.4s, v14.4s, v10.4s +sqrdmulh v10.4S, v15.4S, v25.s[2] +mul v15.4S, v15.4S,v26.s[2] +sqrdmulh v7.4S, v2.4S, v25.s[3] +sub v22.4s, v18.4s, v17.4s +mul v2.4S, v2.4S,v26.s[3] +add v18.4s, 
v18.4s, v17.4s +sqrdmulh v17.4S, v19.4S, v25.s[1] +sub v3.4s, v0.4s, v9.4s +mul v19.4S, v19.4S,v26.s[1] +add v0.4s, v0.4s, v9.4s +sqrdmulh v9.4S, v14.4S, v25.s[0] +sub v1.4s, v21.4s, v12.4s +mul v14.4S, v14.4S,v26.s[0] +add v21.4s, v21.4s, v12.4s +mla v15.4S, v10.4S, v31.s[0] +sub v10.4s, v16.4s, v13.4s +sqrdmulh v12.4S, v0.4S, v23.s[0] +add v16.4s, v16.4s, v13.4s +mla v2.4S, v7.4S, v31.s[0] +sub v7.4s, v11.4s, v15.4s +sqrdmulh v13.4S, v3.4S, v23.s[1] +add v11.4s, v11.4s, v15.4s +mla v19.4S, v17.4S, v31.s[0] +sub v17.4s, v8.4s, v2.4s +sqrdmulh v15.4S, v16.4S, v23.s[2] +add v8.4s, v8.4s, v2.4s +mla v14.4S, v9.4S, v31.s[0] +sub v9.4s, v6.4s, v19.4s +sqrdmulh v2.4S, v10.4S, v23.s[3] +add v6.4s, v6.4s, v19.4s +mul v0.4S, v0.4S,v24.s[0] +sub v19.4s, v20.4s, v14.4s +mul v3.4S, v3.4S,v24.s[1] +add v20.4s, v20.4s, v14.4s +mla v0.4S, v12.4S, v31.s[0] +str q7, [x0, #320] +mla v3.4S, v13.4S, v31.s[0] +str q11, [x0, #256] +mul v16.4S, v16.4S,v24.s[2] +str q17, [x0, #448] +mul v10.4S, v10.4S,v24.s[3] +str q8, [x0, #384] +mla v16.4S, v15.4S, v31.s[0] +str q9, [x0, #192] +mla v10.4S, v2.4S, v31.s[0] +str q6, [x0, #128] +ldr q6, [x0, #912] +sqrdmulh v2.4S, v6.4S, v29.s[0] +str q19, [x0, #64] +mul v6.4S, v6.4S,v30.s[0] +str q20, [x0, #0] +ldr q20, [x0, #976] +sqrdmulh v19.4S, v20.4S, v29.s[0] +sub v9.4s, v18.4s, v0.4s +str q9, [x0, #576] +mul v20.4S, v20.4S,v30.s[0] +add v18.4s, v18.4s, v0.4s +ldr q0, [x0, #784] +sqrdmulh v9.4S, v0.4S, v29.s[0] +sub v15.4s, v22.4s, v3.4s +str q18, [x0, #512] +mul v0.4S, v0.4S,v30.s[0] +add v22.4s, v22.4s, v3.4s +ldr q3, [x0, #848] +sqrdmulh v18.4S, v3.4S, v29.s[0] +sub v8.4s, v21.4s, v16.4s +str q15, [x0, #704] +mul v3.4S, v3.4S,v30.s[0] +add v21.4s, v21.4s, v16.4s +ldr q16, [x0, #528] +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v1.4s, v10.4s +str q22, [x0, #640] +sqrdmulh v22.4S, v16.4S, v29.s[0] +add v1.4s, v1.4s, v10.4s +ldr q10, [x0, #592] +mla v20.4S, v19.4S, v31.s[0] +str q8, [x0, #832] +sqrdmulh v8.4S, v10.4S, v29.s[0] +ldr q19, [x0, #656] +mla 
v0.4S, v9.4S, v31.s[0] +str q21, [x0, #768] +sqrdmulh v21.4S, v19.4S, v29.s[0] +ldr q9, [x0, #720] +mla v3.4S, v18.4S, v31.s[0] +str q2, [x0, #960] +sqrdmulh v2.4S, v9.4S, v29.s[0] +ldr q18, [x0, #400] +ldr q15, [x0, #464] +mul v16.4S, v16.4S,v30.s[0] +sub v17.4s, v18.4s, v6.4s +str q1, [x0, #896] +mul v10.4S, v10.4S,v30.s[0] +add v18.4s, v18.4s, v6.4s +ldr q6, [x0, #272] +ldr q1, [x0, #336] +mla v16.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v20.4s +mla v10.4S, v8.4S, v31.s[0] +add v15.4s, v15.4s, v20.4s +ldr q20, [x0, #16] +ldr q8, [x0, #80] +mul v19.4S, v19.4S,v30.s[0] +sub v11.4s, v6.4s, v0.4s +mul v9.4S, v9.4S,v30.s[0] +add v6.4s, v6.4s, v0.4s +ldr q0, [x0, #144] +ldr q13, [x0, #208] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v1.4s, v3.4s +mla v9.4S, v2.4S, v31.s[0] +add v1.4s, v1.4s, v3.4s +sqrdmulh v3.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sqrdmulh v2.4S, v15.4S, v29.s[1] +sub v7.4s, v20.4s, v16.4s +mul v15.4S, v15.4S,v30.s[1] +add v20.4s, v20.4s, v16.4s +sqrdmulh v16.4S, v6.4S, v29.s[1] +sub v12.4s, v8.4s, v10.4s +mul v6.4S, v6.4S,v30.s[1] +add v8.4s, v8.4s, v10.4s +sqrdmulh v10.4S, v1.4S, v29.s[1] +sub v14.4s, v0.4s, v19.4s +mul v1.4S, v1.4S,v30.s[1] +add v0.4s, v0.4s, v19.4s +mla v18.4S, v3.4S, v31.s[0] +sub v3.4s, v13.4s, v9.4s +sqrdmulh v19.4S, v17.4S, v29.s[2] +add v13.4s, v13.4s, v9.4s +mla v15.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v22.4S, v29.s[2] +mla v6.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v11.4S, v29.s[2] +mla v1.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v21.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v9.4s, v0.4s, v18.4s +mul v22.4S, v22.4S,v30.s[2] +add v0.4s, v0.4s, v18.4s +mla v17.4S, v19.4S, v31.s[0] +sub v19.4s, v13.4s, v15.4s +mla v22.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v15.4s +mul v11.4S, v11.4S,v30.s[2] +sub v15.4s, v20.4s, v6.4s +mul v21.4S, v21.4S,v30.s[2] +add v20.4s, v20.4s, v6.4s +mla v11.4S, v16.4S, v31.s[0] +sub v16.4s, v8.4s, v1.4s +mla v21.4S, v10.4S, v31.s[0] +add v8.4s, v8.4s, v1.4s +sqrdmulh v29.4S, v9.4S, 
v27.s[1] +mul v9.4S, v9.4S,v28.s[1] +sqrdmulh v30.4S, v19.4S, v27.s[1] +sub v1.4s, v14.4s, v17.4s +mul v19.4S, v19.4S,v28.s[1] +add v14.4s, v14.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[0] +sub v10.4s, v3.4s, v22.4s +mul v0.4S, v0.4S,v28.s[0] +add v3.4s, v3.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v27.s[0] +sub v6.4s, v7.4s, v11.4s +mul v13.4S, v13.4S,v28.s[0] +add v7.4s, v7.4s, v11.4s +mla v9.4S, v29.4S, v31.s[0] +sub v29.4s, v12.4s, v21.4s +sqrdmulh v11.4S, v14.4S, v27.s[2] +add v12.4s, v12.4s, v21.4s +mla v19.4S, v30.4S, v31.s[0] +sqrdmulh v30.4S, v3.4S, v27.s[2] +mla v0.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v1.4S, v27.s[3] +mla v13.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v10.4S, v27.s[3] +mul v14.4S, v14.4S,v28.s[2] +sub v21.4s, v15.4s, v9.4s +mul v3.4S, v3.4S,v28.s[2] +add v15.4s, v15.4s, v9.4s +mla v14.4S, v11.4S, v31.s[0] +sub v11.4s, v16.4s, v19.4s +mla v3.4S, v30.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +mul v1.4S, v1.4S,v28.s[3] +sub v19.4s, v20.4s, v0.4s +mul v10.4S, v10.4S,v28.s[3] +add v20.4s, v20.4s, v0.4s +mla v1.4S, v17.4S, v31.s[0] +sub v17.4s, v8.4s, v13.4s +mla v10.4S, v22.4S, v31.s[0] +add v8.4s, v8.4s, v13.4s +sqrdmulh v27.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sqrdmulh v28.4S, v11.4S, v25.s[3] +sub v13.4s, v7.4s, v14.4s +mul v11.4S, v11.4S,v26.s[3] +add v7.4s, v7.4s, v14.4s +sqrdmulh v14.4S, v17.4S, v25.s[1] +sub v22.4s, v12.4s, v3.4s +mul v17.4S, v17.4S,v26.s[1] +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v8.4S, v25.s[0] +sub v0.4s, v6.4s, v1.4s +mul v8.4S, v8.4S,v26.s[0] +add v6.4s, v6.4s, v1.4s +mla v16.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v10.4s +sqrdmulh v25.4S, v12.4S, v23.s[0] +add v29.4s, v29.4s, v10.4s +mla v11.4S, v28.4S, v31.s[0] +sub v28.4s, v15.4s, v16.4s +sqrdmulh v10.4S, v22.4S, v23.s[1] +add v15.4s, v15.4s, v16.4s +mla v17.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v11.4s +sqrdmulh v16.4S, v29.4S, v23.s[2] +add v21.4s, v21.4s, v11.4s +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v19.4s, v17.4s +sqrdmulh v11.4S, v27.4S, 
v23.s[3] +add v19.4s, v19.4s, v17.4s +mul v12.4S, v12.4S,v24.s[0] +sub v17.4s, v20.4s, v8.4s +mul v22.4S, v22.4S,v24.s[1] +add v20.4s, v20.4s, v8.4s +mla v12.4S, v25.4S, v31.s[0] +str q28, [x0, #336] +mla v22.4S, v10.4S, v31.s[0] +str q15, [x0, #272] +mul v29.4S, v29.4S,v24.s[2] +str q14, [x0, #464] +mul v27.4S, v27.4S,v24.s[3] +str q21, [x0, #400] +mla v29.4S, v16.4S, v31.s[0] +str q3, [x0, #208] +mla v27.4S, v11.4S, v31.s[0] +str q19, [x0, #144] +str q17, [x0, #80] +str q20, [x0, #16] +sub v20.4s, v7.4s, v12.4s +str q20, [x0, #592] +add v7.4s, v7.4s, v12.4s +sub v12.4s, v13.4s, v22.4s +str q7, [x0, #528] +add v13.4s, v13.4s, v22.4s +sub v22.4s, v6.4s, v29.4s +str q12, [x0, #720] +add v6.4s, v6.4s, v29.4s +sub v29.4s, v0.4s, v27.4s +str q13, [x0, #656] +add v0.4s, v0.4s, v27.4s +str q22, [x0, #848] +str q6, [x0, #784] +str q29, [x0, #976] +str q0, [x0, #912] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q18, [x17, #+160] +ldr q2, [x17, #+176] +ldr q9, [x17, #+192] +ldr q30, [x17, #+208] +ldr q1, [x17, #+224] +ldr q26, [x17, #+240] +ldr q8, [x0, #32] +ldr q25, [x0, #48] +ldr q28, [x0, #0] +ldr q10, [x0, #16] +sqrdmulh v15.4S, v8.4S, v5.s[0] +mul v8.4S, v8.4S,v4.s[0] +mla v8.4S, v15.4S, v31.s[0] +sub v15.4s, v28.4s, v8.4s +add v28.4s, v28.4s, v8.4s +sqrdmulh v8.4S, v25.4S, v5.s[0] +mul v25.4S, v25.4S,v4.s[0] +mla v25.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v25.4s +add v10.4s, v10.4s, v25.4s +sqrdmulh v25.4S, v10.4S, v5.s[1] +mul v10.4S, v10.4S,v4.s[1] +mla v10.4S, v25.4S, v31.s[0] +sub v25.4s, v28.4s, v10.4s +add v28.4s, v28.4s, v10.4s +sqrdmulh v10.4S, v8.4S, v5.s[2] +mul v8.4S, v8.4S,v4.s[2] +mla v8.4S, v10.4S, v31.s[0] +sub v10.4s, v15.4s, v8.4s +add v15.4s, v15.4s, v8.4s +trn1 v8.4S, v28.4S, v25.4S +trn2 v14.4S, v28.4S, v25.4S +trn1 v21.4S, v15.4S, v10.4S +trn2 v16.4S, v15.4S, v10.4S +trn2 v15.2D, v8.2D, v21.2D +trn2 v10.2D, v14.2D, v16.2D +trn1 v28.2D, v8.2D, v21.2D +trn1 v25.2D, v14.2D, v16.2D +sqrdmulh v16.4S, v15.4S, v2.4S +mul v15.4S, v15.4S,v18.4S +mla 
v15.4S, v16.4S, v31.s[0] +sub v16.4s, v28.4s, v15.4s +add v28.4s, v28.4s, v15.4s +sqrdmulh v15.4S, v10.4S, v2.4S +mul v10.4S, v10.4S,v18.4S +mla v10.4S, v15.4S, v31.s[0] +sub v15.4s, v25.4s, v10.4s +add v25.4s, v25.4s, v10.4s +sqrdmulh v10.4S, v25.4S, v30.4S +mul v25.4S, v25.4S,v9.4S +mla v25.4S, v10.4S, v31.s[0] +sub v10.4s, v28.4s, v25.4s +add v28.4s, v28.4s, v25.4s +sqrdmulh v25.4S, v15.4S, v26.4S +mul v15.4S, v15.4S,v1.4S +mla v15.4S, v25.4S, v31.s[0] +sub v25.4s, v16.4s, v15.4s +add v16.4s, v16.4s, v15.4s +str q28, [x0, #0] +str q10, [x0, #16] +str q16, [x0, #32] +str q25, [x0, #48] +ldr q25, [x17, #+256] +ldr q16, [x17, #+272] +ldr q10, [x17, #+288] +ldr q28, [x17, #+304] +ldr q15, [x17, #+320] +ldr q14, [x17, #+336] +ldr q21, [x17, #+352] +ldr q8, [x17, #+368] +ldr q26, [x0, #96] +ldr q1, [x0, #112] +ldr q30, [x0, #64] +ldr q9, [x0, #80] +sqrdmulh v2.4S, v26.4S, v16.s[0] +mul v26.4S, v26.4S,v25.s[0] +mla v26.4S, v2.4S, v31.s[0] +sub v2.4s, v30.4s, v26.4s +add v30.4s, v30.4s, v26.4s +sqrdmulh v26.4S, v1.4S, v16.s[0] +mul v1.4S, v1.4S,v25.s[0] +mla v1.4S, v26.4S, v31.s[0] +sub v26.4s, v9.4s, v1.4s +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v9.4S, v16.s[1] +mul v9.4S, v9.4S,v25.s[1] +mla v9.4S, v1.4S, v31.s[0] +sub v1.4s, v30.4s, v9.4s +add v30.4s, v30.4s, v9.4s +sqrdmulh v9.4S, v26.4S, v16.s[2] +mul v26.4S, v26.4S,v25.s[2] +mla v26.4S, v9.4S, v31.s[0] +sub v9.4s, v2.4s, v26.4s +add v2.4s, v2.4s, v26.4s +trn1 v26.4S, v30.4S, v1.4S +trn2 v18.4S, v30.4S, v1.4S +trn1 v5.4S, v2.4S, v9.4S +trn2 v4.4S, v2.4S, v9.4S +trn2 v2.2D, v26.2D, v5.2D +trn2 v9.2D, v18.2D, v4.2D +trn1 v30.2D, v26.2D, v5.2D +trn1 v1.2D, v18.2D, v4.2D +sqrdmulh v4.4S, v2.4S, v28.4S +mul v2.4S, v2.4S,v10.4S +mla v2.4S, v4.4S, v31.s[0] +sub v4.4s, v30.4s, v2.4s +add v30.4s, v30.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v28.4S +mul v9.4S, v9.4S,v10.4S +mla v9.4S, v2.4S, v31.s[0] +sub v2.4s, v1.4s, v9.4s +add v1.4s, v1.4s, v9.4s +sqrdmulh v9.4S, v1.4S, v14.4S +mul v1.4S, v1.4S,v15.4S +mla v1.4S, v9.4S, 
v31.s[0] +sub v9.4s, v30.4s, v1.4s +add v30.4s, v30.4s, v1.4s +sqrdmulh v1.4S, v2.4S, v8.4S +mul v2.4S, v2.4S,v21.4S +mla v2.4S, v1.4S, v31.s[0] +sub v1.4s, v4.4s, v2.4s +add v4.4s, v4.4s, v2.4s +str q30, [x0, #64] +str q9, [x0, #80] +str q4, [x0, #96] +str q1, [x0, #112] +ldr q1, [x17, #+384] +ldr q4, [x17, #+400] +ldr q9, [x17, #+416] +ldr q30, [x17, #+432] +ldr q2, [x17, #+448] +ldr q18, [x17, #+464] +ldr q5, [x17, #+480] +ldr q26, [x17, #+496] +ldr q8, [x0, #160] +ldr q21, [x0, #176] +ldr q14, [x0, #128] +ldr q15, [x0, #144] +sqrdmulh v28.4S, v8.4S, v4.s[0] +mul v8.4S, v8.4S,v1.s[0] +mla v8.4S, v28.4S, v31.s[0] +sub v28.4s, v14.4s, v8.4s +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v21.4S, v4.s[0] +mul v21.4S, v21.4S,v1.s[0] +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v15.4s, v21.4s +add v15.4s, v15.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v4.s[1] +mul v15.4S, v15.4S,v1.s[1] +mla v15.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v15.4s +add v14.4s, v14.4s, v15.4s +sqrdmulh v15.4S, v8.4S, v4.s[2] +mul v8.4S, v8.4S,v1.s[2] +mla v8.4S, v15.4S, v31.s[0] +sub v15.4s, v28.4s, v8.4s +add v28.4s, v28.4s, v8.4s +trn1 v8.4S, v14.4S, v21.4S +trn2 v10.4S, v14.4S, v21.4S +trn1 v16.4S, v28.4S, v15.4S +trn2 v25.4S, v28.4S, v15.4S +trn2 v28.2D, v8.2D, v16.2D +trn2 v15.2D, v10.2D, v25.2D +trn1 v14.2D, v8.2D, v16.2D +trn1 v21.2D, v10.2D, v25.2D +sqrdmulh v25.4S, v28.4S, v30.4S +mul v28.4S, v28.4S,v9.4S +mla v28.4S, v25.4S, v31.s[0] +sub v25.4s, v14.4s, v28.4s +add v14.4s, v14.4s, v28.4s +sqrdmulh v28.4S, v15.4S, v30.4S +mul v15.4S, v15.4S,v9.4S +mla v15.4S, v28.4S, v31.s[0] +sub v28.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +sqrdmulh v15.4S, v21.4S, v18.4S +mul v21.4S, v21.4S,v2.4S +mla v21.4S, v15.4S, v31.s[0] +sub v15.4s, v14.4s, v21.4s +add v14.4s, v14.4s, v21.4s +sqrdmulh v21.4S, v28.4S, v26.4S +mul v28.4S, v28.4S,v5.4S +mla v28.4S, v21.4S, v31.s[0] +sub v21.4s, v25.4s, v28.4s +add v25.4s, v25.4s, v28.4s +str q14, [x0, #128] +str q15, [x0, #144] +str q25, [x0, #160] +str q21, [x0, 
#176] +ldr q21, [x17, #+512] +ldr q25, [x17, #+528] +ldr q15, [x17, #+544] +ldr q14, [x17, #+560] +ldr q28, [x17, #+576] +ldr q10, [x17, #+592] +ldr q16, [x17, #+608] +ldr q8, [x17, #+624] +ldr q26, [x0, #224] +ldr q5, [x0, #240] +ldr q18, [x0, #192] +ldr q2, [x0, #208] +sqrdmulh v30.4S, v26.4S, v25.s[0] +mul v26.4S, v26.4S,v21.s[0] +mla v26.4S, v30.4S, v31.s[0] +sub v30.4s, v18.4s, v26.4s +add v18.4s, v18.4s, v26.4s +sqrdmulh v26.4S, v5.4S, v25.s[0] +mul v5.4S, v5.4S,v21.s[0] +mla v5.4S, v26.4S, v31.s[0] +sub v26.4s, v2.4s, v5.4s +add v2.4s, v2.4s, v5.4s +sqrdmulh v5.4S, v2.4S, v25.s[1] +mul v2.4S, v2.4S,v21.s[1] +mla v2.4S, v5.4S, v31.s[0] +sub v5.4s, v18.4s, v2.4s +add v18.4s, v18.4s, v2.4s +sqrdmulh v2.4S, v26.4S, v25.s[2] +mul v26.4S, v26.4S,v21.s[2] +mla v26.4S, v2.4S, v31.s[0] +sub v2.4s, v30.4s, v26.4s +add v30.4s, v30.4s, v26.4s +trn1 v26.4S, v18.4S, v5.4S +trn2 v9.4S, v18.4S, v5.4S +trn1 v4.4S, v30.4S, v2.4S +trn2 v1.4S, v30.4S, v2.4S +trn2 v30.2D, v26.2D, v4.2D +trn2 v2.2D, v9.2D, v1.2D +trn1 v18.2D, v26.2D, v4.2D +trn1 v5.2D, v9.2D, v1.2D +sqrdmulh v1.4S, v30.4S, v14.4S +mul v30.4S, v30.4S,v15.4S +mla v30.4S, v1.4S, v31.s[0] +sub v1.4s, v18.4s, v30.4s +add v18.4s, v18.4s, v30.4s +sqrdmulh v30.4S, v2.4S, v14.4S +mul v2.4S, v2.4S,v15.4S +mla v2.4S, v30.4S, v31.s[0] +sub v30.4s, v5.4s, v2.4s +add v5.4s, v5.4s, v2.4s +sqrdmulh v2.4S, v5.4S, v10.4S +mul v5.4S, v5.4S,v28.4S +mla v5.4S, v2.4S, v31.s[0] +sub v2.4s, v18.4s, v5.4s +add v18.4s, v18.4s, v5.4s +sqrdmulh v5.4S, v30.4S, v8.4S +mul v30.4S, v30.4S,v16.4S +mla v30.4S, v5.4S, v31.s[0] +sub v5.4s, v1.4s, v30.4s +add v1.4s, v1.4s, v30.4s +str q18, [x0, #192] +str q2, [x0, #208] +str q1, [x0, #224] +str q5, [x0, #240] +ldr q5, [x17, #+640] +ldr q1, [x17, #+656] +ldr q2, [x17, #+672] +ldr q18, [x17, #+688] +ldr q30, [x17, #+704] +ldr q9, [x17, #+720] +ldr q4, [x17, #+736] +ldr q26, [x17, #+752] +ldr q8, [x0, #288] +ldr q16, [x0, #304] +ldr q10, [x0, #256] +ldr q28, [x0, #272] +sqrdmulh v14.4S, v8.4S, v1.s[0] 
+mul v8.4S, v8.4S,v5.s[0] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +sqrdmulh v8.4S, v16.4S, v1.s[0] +mul v16.4S, v16.4S,v5.s[0] +mla v16.4S, v8.4S, v31.s[0] +sub v8.4s, v28.4s, v16.4s +add v28.4s, v28.4s, v16.4s +sqrdmulh v16.4S, v28.4S, v1.s[1] +mul v28.4S, v28.4S,v5.s[1] +mla v28.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v28.4s +add v10.4s, v10.4s, v28.4s +sqrdmulh v28.4S, v8.4S, v1.s[2] +mul v8.4S, v8.4S,v5.s[2] +mla v8.4S, v28.4S, v31.s[0] +sub v28.4s, v14.4s, v8.4s +add v14.4s, v14.4s, v8.4s +trn1 v8.4S, v10.4S, v16.4S +trn2 v15.4S, v10.4S, v16.4S +trn1 v25.4S, v14.4S, v28.4S +trn2 v21.4S, v14.4S, v28.4S +trn2 v14.2D, v8.2D, v25.2D +trn2 v28.2D, v15.2D, v21.2D +trn1 v10.2D, v8.2D, v25.2D +trn1 v16.2D, v15.2D, v21.2D +sqrdmulh v21.4S, v14.4S, v18.4S +mul v14.4S, v14.4S,v2.4S +mla v14.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v14.4s +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v28.4S, v18.4S +mul v28.4S, v28.4S,v2.4S +mla v28.4S, v14.4S, v31.s[0] +sub v14.4s, v16.4s, v28.4s +add v16.4s, v16.4s, v28.4s +sqrdmulh v28.4S, v16.4S, v9.4S +mul v16.4S, v16.4S,v30.4S +mla v16.4S, v28.4S, v31.s[0] +sub v28.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sqrdmulh v16.4S, v14.4S, v26.4S +mul v14.4S, v14.4S,v4.4S +mla v14.4S, v16.4S, v31.s[0] +sub v16.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +str q10, [x0, #256] +str q28, [x0, #272] +str q21, [x0, #288] +str q16, [x0, #304] +ldr q16, [x17, #+768] +ldr q21, [x17, #+784] +ldr q28, [x17, #+800] +ldr q10, [x17, #+816] +ldr q14, [x17, #+832] +ldr q15, [x17, #+848] +ldr q25, [x17, #+864] +ldr q8, [x17, #+880] +ldr q26, [x0, #352] +ldr q4, [x0, #368] +ldr q9, [x0, #320] +ldr q30, [x0, #336] +sqrdmulh v18.4S, v26.4S, v21.s[0] +mul v26.4S, v26.4S,v16.s[0] +mla v26.4S, v18.4S, v31.s[0] +sub v18.4s, v9.4s, v26.4s +add v9.4s, v9.4s, v26.4s +sqrdmulh v26.4S, v4.4S, v21.s[0] +mul v4.4S, v4.4S,v16.s[0] +mla v4.4S, v26.4S, v31.s[0] +sub v26.4s, v30.4s, v4.4s +add v30.4s, v30.4s, v4.4s 
+sqrdmulh v4.4S, v30.4S, v21.s[1] +mul v30.4S, v30.4S,v16.s[1] +mla v30.4S, v4.4S, v31.s[0] +sub v4.4s, v9.4s, v30.4s +add v9.4s, v9.4s, v30.4s +sqrdmulh v30.4S, v26.4S, v21.s[2] +mul v26.4S, v26.4S,v16.s[2] +mla v26.4S, v30.4S, v31.s[0] +sub v30.4s, v18.4s, v26.4s +add v18.4s, v18.4s, v26.4s +trn1 v26.4S, v9.4S, v4.4S +trn2 v2.4S, v9.4S, v4.4S +trn1 v1.4S, v18.4S, v30.4S +trn2 v5.4S, v18.4S, v30.4S +trn2 v18.2D, v26.2D, v1.2D +trn2 v30.2D, v2.2D, v5.2D +trn1 v9.2D, v26.2D, v1.2D +trn1 v4.2D, v2.2D, v5.2D +sqrdmulh v5.4S, v18.4S, v10.4S +mul v18.4S, v18.4S,v28.4S +mla v18.4S, v5.4S, v31.s[0] +sub v5.4s, v9.4s, v18.4s +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v30.4S, v10.4S +mul v30.4S, v30.4S,v28.4S +mla v30.4S, v18.4S, v31.s[0] +sub v18.4s, v4.4s, v30.4s +add v4.4s, v4.4s, v30.4s +sqrdmulh v30.4S, v4.4S, v15.4S +mul v4.4S, v4.4S,v14.4S +mla v4.4S, v30.4S, v31.s[0] +sub v30.4s, v9.4s, v4.4s +add v9.4s, v9.4s, v4.4s +sqrdmulh v4.4S, v18.4S, v8.4S +mul v18.4S, v18.4S,v25.4S +mla v18.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v18.4s +add v5.4s, v5.4s, v18.4s +str q9, [x0, #320] +str q30, [x0, #336] +str q5, [x0, #352] +str q4, [x0, #368] +ldr q4, [x17, #+896] +ldr q5, [x17, #+912] +ldr q30, [x17, #+928] +ldr q9, [x17, #+944] +ldr q18, [x17, #+960] +ldr q2, [x17, #+976] +ldr q1, [x17, #+992] +ldr q26, [x17, #+1008] +ldr q8, [x0, #416] +ldr q25, [x0, #432] +ldr q15, [x0, #384] +ldr q14, [x0, #400] +sqrdmulh v10.4S, v8.4S, v5.s[0] +mul v8.4S, v8.4S,v4.s[0] +mla v8.4S, v10.4S, v31.s[0] +sub v10.4s, v15.4s, v8.4s +add v15.4s, v15.4s, v8.4s +sqrdmulh v8.4S, v25.4S, v5.s[0] +mul v25.4S, v25.4S,v4.s[0] +mla v25.4S, v8.4S, v31.s[0] +sub v8.4s, v14.4s, v25.4s +add v14.4s, v14.4s, v25.4s +sqrdmulh v25.4S, v14.4S, v5.s[1] +mul v14.4S, v14.4S,v4.s[1] +mla v14.4S, v25.4S, v31.s[0] +sub v25.4s, v15.4s, v14.4s +add v15.4s, v15.4s, v14.4s +sqrdmulh v14.4S, v8.4S, v5.s[2] +mul v8.4S, v8.4S,v4.s[2] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +trn1 
v8.4S, v15.4S, v25.4S +trn2 v28.4S, v15.4S, v25.4S +trn1 v21.4S, v10.4S, v14.4S +trn2 v16.4S, v10.4S, v14.4S +trn2 v10.2D, v8.2D, v21.2D +trn2 v14.2D, v28.2D, v16.2D +trn1 v15.2D, v8.2D, v21.2D +trn1 v25.2D, v28.2D, v16.2D +sqrdmulh v16.4S, v10.4S, v9.4S +mul v10.4S, v10.4S,v30.4S +mla v10.4S, v16.4S, v31.s[0] +sub v16.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +sqrdmulh v10.4S, v14.4S, v9.4S +mul v14.4S, v14.4S,v30.4S +mla v14.4S, v10.4S, v31.s[0] +sub v10.4s, v25.4s, v14.4s +add v25.4s, v25.4s, v14.4s +sqrdmulh v14.4S, v25.4S, v2.4S +mul v25.4S, v25.4S,v18.4S +mla v25.4S, v14.4S, v31.s[0] +sub v14.4s, v15.4s, v25.4s +add v15.4s, v15.4s, v25.4s +sqrdmulh v25.4S, v10.4S, v26.4S +mul v10.4S, v10.4S,v1.4S +mla v10.4S, v25.4S, v31.s[0] +sub v25.4s, v16.4s, v10.4s +add v16.4s, v16.4s, v10.4s +str q15, [x0, #384] +str q14, [x0, #400] +str q16, [x0, #416] +str q25, [x0, #432] +ldr q25, [x17, #+1024] +ldr q16, [x17, #+1040] +ldr q14, [x17, #+1056] +ldr q15, [x17, #+1072] +ldr q10, [x17, #+1088] +ldr q28, [x17, #+1104] +ldr q21, [x17, #+1120] +ldr q8, [x17, #+1136] +ldr q26, [x0, #480] +ldr q1, [x0, #496] +ldr q2, [x0, #448] +ldr q18, [x0, #464] +sqrdmulh v9.4S, v26.4S, v16.s[0] +mul v26.4S, v26.4S,v25.s[0] +mla v26.4S, v9.4S, v31.s[0] +sub v9.4s, v2.4s, v26.4s +add v2.4s, v2.4s, v26.4s +sqrdmulh v26.4S, v1.4S, v16.s[0] +mul v1.4S, v1.4S,v25.s[0] +mla v1.4S, v26.4S, v31.s[0] +sub v26.4s, v18.4s, v1.4s +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v18.4S, v16.s[1] +mul v18.4S, v18.4S,v25.s[1] +mla v18.4S, v1.4S, v31.s[0] +sub v1.4s, v2.4s, v18.4s +add v2.4s, v2.4s, v18.4s +sqrdmulh v18.4S, v26.4S, v16.s[2] +mul v26.4S, v26.4S,v25.s[2] +mla v26.4S, v18.4S, v31.s[0] +sub v18.4s, v9.4s, v26.4s +add v9.4s, v9.4s, v26.4s +trn1 v26.4S, v2.4S, v1.4S +trn2 v30.4S, v2.4S, v1.4S +trn1 v5.4S, v9.4S, v18.4S +trn2 v4.4S, v9.4S, v18.4S +trn2 v9.2D, v26.2D, v5.2D +trn2 v18.2D, v30.2D, v4.2D +trn1 v2.2D, v26.2D, v5.2D +trn1 v1.2D, v30.2D, v4.2D +sqrdmulh v4.4S, v9.4S, v15.4S +mul v9.4S, 
v9.4S,v14.4S +mla v9.4S, v4.4S, v31.s[0] +sub v4.4s, v2.4s, v9.4s +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v18.4S, v15.4S +mul v18.4S, v18.4S,v14.4S +mla v18.4S, v9.4S, v31.s[0] +sub v9.4s, v1.4s, v18.4s +add v1.4s, v1.4s, v18.4s +sqrdmulh v18.4S, v1.4S, v28.4S +mul v1.4S, v1.4S,v10.4S +mla v1.4S, v18.4S, v31.s[0] +sub v18.4s, v2.4s, v1.4s +add v2.4s, v2.4s, v1.4s +sqrdmulh v1.4S, v9.4S, v8.4S +mul v9.4S, v9.4S,v21.4S +mla v9.4S, v1.4S, v31.s[0] +sub v1.4s, v4.4s, v9.4s +add v4.4s, v4.4s, v9.4s +str q2, [x0, #448] +str q18, [x0, #464] +str q4, [x0, #480] +str q1, [x0, #496] +ldr q1, [x17, #+1152] +ldr q4, [x17, #+1168] +ldr q18, [x17, #+1184] +ldr q2, [x17, #+1200] +ldr q9, [x17, #+1216] +ldr q30, [x17, #+1232] +ldr q5, [x17, #+1248] +ldr q26, [x17, #+1264] +ldr q8, [x0, #544] +ldr q21, [x0, #560] +ldr q28, [x0, #512] +ldr q10, [x0, #528] +sqrdmulh v15.4S, v8.4S, v4.s[0] +mul v8.4S, v8.4S,v1.s[0] +mla v8.4S, v15.4S, v31.s[0] +sub v15.4s, v28.4s, v8.4s +add v28.4s, v28.4s, v8.4s +sqrdmulh v8.4S, v21.4S, v4.s[0] +mul v21.4S, v21.4S,v1.s[0] +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v10.4S, v4.s[1] +mul v10.4S, v10.4S,v1.s[1] +mla v10.4S, v21.4S, v31.s[0] +sub v21.4s, v28.4s, v10.4s +add v28.4s, v28.4s, v10.4s +sqrdmulh v10.4S, v8.4S, v4.s[2] +mul v8.4S, v8.4S,v1.s[2] +mla v8.4S, v10.4S, v31.s[0] +sub v10.4s, v15.4s, v8.4s +add v15.4s, v15.4s, v8.4s +trn1 v8.4S, v28.4S, v21.4S +trn2 v14.4S, v28.4S, v21.4S +trn1 v16.4S, v15.4S, v10.4S +trn2 v25.4S, v15.4S, v10.4S +trn2 v15.2D, v8.2D, v16.2D +trn2 v10.2D, v14.2D, v25.2D +trn1 v28.2D, v8.2D, v16.2D +trn1 v21.2D, v14.2D, v25.2D +sqrdmulh v25.4S, v15.4S, v2.4S +mul v15.4S, v15.4S,v18.4S +mla v15.4S, v25.4S, v31.s[0] +sub v25.4s, v28.4s, v15.4s +add v28.4s, v28.4s, v15.4s +sqrdmulh v15.4S, v10.4S, v2.4S +mul v10.4S, v10.4S,v18.4S +mla v10.4S, v15.4S, v31.s[0] +sub v15.4s, v21.4s, v10.4s +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v21.4S, v30.4S +mul v21.4S, 
v21.4S,v9.4S +mla v21.4S, v10.4S, v31.s[0] +sub v10.4s, v28.4s, v21.4s +add v28.4s, v28.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v26.4S +mul v15.4S, v15.4S,v5.4S +mla v15.4S, v21.4S, v31.s[0] +sub v21.4s, v25.4s, v15.4s +add v25.4s, v25.4s, v15.4s +str q28, [x0, #512] +str q10, [x0, #528] +str q25, [x0, #544] +str q21, [x0, #560] +ldr q21, [x17, #+1280] +ldr q25, [x17, #+1296] +ldr q10, [x17, #+1312] +ldr q28, [x17, #+1328] +ldr q15, [x17, #+1344] +ldr q14, [x17, #+1360] +ldr q16, [x17, #+1376] +ldr q8, [x17, #+1392] +ldr q26, [x0, #608] +ldr q5, [x0, #624] +ldr q30, [x0, #576] +ldr q9, [x0, #592] +sqrdmulh v2.4S, v26.4S, v25.s[0] +mul v26.4S, v26.4S,v21.s[0] +mla v26.4S, v2.4S, v31.s[0] +sub v2.4s, v30.4s, v26.4s +add v30.4s, v30.4s, v26.4s +sqrdmulh v26.4S, v5.4S, v25.s[0] +mul v5.4S, v5.4S,v21.s[0] +mla v5.4S, v26.4S, v31.s[0] +sub v26.4s, v9.4s, v5.4s +add v9.4s, v9.4s, v5.4s +sqrdmulh v5.4S, v9.4S, v25.s[1] +mul v9.4S, v9.4S,v21.s[1] +mla v9.4S, v5.4S, v31.s[0] +sub v5.4s, v30.4s, v9.4s +add v30.4s, v30.4s, v9.4s +sqrdmulh v9.4S, v26.4S, v25.s[2] +mul v26.4S, v26.4S,v21.s[2] +mla v26.4S, v9.4S, v31.s[0] +sub v9.4s, v2.4s, v26.4s +add v2.4s, v2.4s, v26.4s +trn1 v26.4S, v30.4S, v5.4S +trn2 v18.4S, v30.4S, v5.4S +trn1 v4.4S, v2.4S, v9.4S +trn2 v1.4S, v2.4S, v9.4S +trn2 v2.2D, v26.2D, v4.2D +trn2 v9.2D, v18.2D, v1.2D +trn1 v30.2D, v26.2D, v4.2D +trn1 v5.2D, v18.2D, v1.2D +sqrdmulh v1.4S, v2.4S, v28.4S +mul v2.4S, v2.4S,v10.4S +mla v2.4S, v1.4S, v31.s[0] +sub v1.4s, v30.4s, v2.4s +add v30.4s, v30.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v28.4S +mul v9.4S, v9.4S,v10.4S +mla v9.4S, v2.4S, v31.s[0] +sub v2.4s, v5.4s, v9.4s +add v5.4s, v5.4s, v9.4s +sqrdmulh v9.4S, v5.4S, v14.4S +mul v5.4S, v5.4S,v15.4S +mla v5.4S, v9.4S, v31.s[0] +sub v9.4s, v30.4s, v5.4s +add v30.4s, v30.4s, v5.4s +sqrdmulh v5.4S, v2.4S, v8.4S +mul v2.4S, v2.4S,v16.4S +mla v2.4S, v5.4S, v31.s[0] +sub v5.4s, v1.4s, v2.4s +add v1.4s, v1.4s, v2.4s +str q30, [x0, #576] +str q9, [x0, #592] +str q1, [x0, #608] +str 
q5, [x0, #624] +ldr q5, [x17, #+1408] +ldr q1, [x17, #+1424] +ldr q9, [x17, #+1440] +ldr q30, [x17, #+1456] +ldr q2, [x17, #+1472] +ldr q18, [x17, #+1488] +ldr q4, [x17, #+1504] +ldr q26, [x17, #+1520] +ldr q8, [x0, #672] +ldr q16, [x0, #688] +ldr q14, [x0, #640] +ldr q15, [x0, #656] +sqrdmulh v28.4S, v8.4S, v1.s[0] +mul v8.4S, v8.4S,v5.s[0] +mla v8.4S, v28.4S, v31.s[0] +sub v28.4s, v14.4s, v8.4s +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v16.4S, v1.s[0] +mul v16.4S, v16.4S,v5.s[0] +mla v16.4S, v8.4S, v31.s[0] +sub v8.4s, v15.4s, v16.4s +add v15.4s, v15.4s, v16.4s +sqrdmulh v16.4S, v15.4S, v1.s[1] +mul v15.4S, v15.4S,v5.s[1] +mla v15.4S, v16.4S, v31.s[0] +sub v16.4s, v14.4s, v15.4s +add v14.4s, v14.4s, v15.4s +sqrdmulh v15.4S, v8.4S, v1.s[2] +mul v8.4S, v8.4S,v5.s[2] +mla v8.4S, v15.4S, v31.s[0] +sub v15.4s, v28.4s, v8.4s +add v28.4s, v28.4s, v8.4s +trn1 v8.4S, v14.4S, v16.4S +trn2 v10.4S, v14.4S, v16.4S +trn1 v25.4S, v28.4S, v15.4S +trn2 v21.4S, v28.4S, v15.4S +trn2 v28.2D, v8.2D, v25.2D +trn2 v15.2D, v10.2D, v21.2D +trn1 v14.2D, v8.2D, v25.2D +trn1 v16.2D, v10.2D, v21.2D +sqrdmulh v21.4S, v28.4S, v30.4S +mul v28.4S, v28.4S,v9.4S +mla v28.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v28.4s +add v14.4s, v14.4s, v28.4s +sqrdmulh v28.4S, v15.4S, v30.4S +mul v15.4S, v15.4S,v9.4S +mla v15.4S, v28.4S, v31.s[0] +sub v28.4s, v16.4s, v15.4s +add v16.4s, v16.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v18.4S +mul v16.4S, v16.4S,v2.4S +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v14.4s, v16.4s +add v14.4s, v14.4s, v16.4s +sqrdmulh v16.4S, v28.4S, v26.4S +mul v28.4S, v28.4S,v4.4S +mla v28.4S, v16.4S, v31.s[0] +sub v16.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +str q14, [x0, #640] +str q15, [x0, #656] +str q21, [x0, #672] +str q16, [x0, #688] +ldr q16, [x17, #+1536] +ldr q21, [x17, #+1552] +ldr q15, [x17, #+1568] +ldr q14, [x17, #+1584] +ldr q28, [x17, #+1600] +ldr q10, [x17, #+1616] +ldr q25, [x17, #+1632] +ldr q8, [x17, #+1648] +ldr q26, [x0, #736] +ldr q4, [x0, #752] +ldr q18, 
[x0, #704] +ldr q2, [x0, #720] +sqrdmulh v30.4S, v26.4S, v21.s[0] +mul v26.4S, v26.4S,v16.s[0] +mla v26.4S, v30.4S, v31.s[0] +sub v30.4s, v18.4s, v26.4s +add v18.4s, v18.4s, v26.4s +sqrdmulh v26.4S, v4.4S, v21.s[0] +mul v4.4S, v4.4S,v16.s[0] +mla v4.4S, v26.4S, v31.s[0] +sub v26.4s, v2.4s, v4.4s +add v2.4s, v2.4s, v4.4s +sqrdmulh v4.4S, v2.4S, v21.s[1] +mul v2.4S, v2.4S,v16.s[1] +mla v2.4S, v4.4S, v31.s[0] +sub v4.4s, v18.4s, v2.4s +add v18.4s, v18.4s, v2.4s +sqrdmulh v2.4S, v26.4S, v21.s[2] +mul v26.4S, v26.4S,v16.s[2] +mla v26.4S, v2.4S, v31.s[0] +sub v2.4s, v30.4s, v26.4s +add v30.4s, v30.4s, v26.4s +trn1 v26.4S, v18.4S, v4.4S +trn2 v9.4S, v18.4S, v4.4S +trn1 v1.4S, v30.4S, v2.4S +trn2 v5.4S, v30.4S, v2.4S +trn2 v30.2D, v26.2D, v1.2D +trn2 v2.2D, v9.2D, v5.2D +trn1 v18.2D, v26.2D, v1.2D +trn1 v4.2D, v9.2D, v5.2D +sqrdmulh v5.4S, v30.4S, v14.4S +mul v30.4S, v30.4S,v15.4S +mla v30.4S, v5.4S, v31.s[0] +sub v5.4s, v18.4s, v30.4s +add v18.4s, v18.4s, v30.4s +sqrdmulh v30.4S, v2.4S, v14.4S +mul v2.4S, v2.4S,v15.4S +mla v2.4S, v30.4S, v31.s[0] +sub v30.4s, v4.4s, v2.4s +add v4.4s, v4.4s, v2.4s +sqrdmulh v2.4S, v4.4S, v10.4S +mul v4.4S, v4.4S,v28.4S +mla v4.4S, v2.4S, v31.s[0] +sub v2.4s, v18.4s, v4.4s +add v18.4s, v18.4s, v4.4s +sqrdmulh v4.4S, v30.4S, v8.4S +mul v30.4S, v30.4S,v25.4S +mla v30.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v30.4s +add v5.4s, v5.4s, v30.4s +str q18, [x0, #704] +str q2, [x0, #720] +str q5, [x0, #736] +str q4, [x0, #752] +ldr q4, [x17, #+1664] +ldr q5, [x17, #+1680] +ldr q2, [x17, #+1696] +ldr q18, [x17, #+1712] +ldr q30, [x17, #+1728] +ldr q9, [x17, #+1744] +ldr q1, [x17, #+1760] +ldr q26, [x17, #+1776] +ldr q8, [x0, #800] +ldr q25, [x0, #816] +ldr q10, [x0, #768] +ldr q28, [x0, #784] +sqrdmulh v14.4S, v8.4S, v5.s[0] +mul v8.4S, v8.4S,v4.s[0] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +sqrdmulh v8.4S, v25.4S, v5.s[0] +mul v25.4S, v25.4S,v4.s[0] +mla v25.4S, v8.4S, v31.s[0] +sub v8.4s, v28.4s, v25.4s +add 
v28.4s, v28.4s, v25.4s +sqrdmulh v25.4S, v28.4S, v5.s[1] +mul v28.4S, v28.4S,v4.s[1] +mla v28.4S, v25.4S, v31.s[0] +sub v25.4s, v10.4s, v28.4s +add v10.4s, v10.4s, v28.4s +sqrdmulh v28.4S, v8.4S, v5.s[2] +mul v8.4S, v8.4S,v4.s[2] +mla v8.4S, v28.4S, v31.s[0] +sub v28.4s, v14.4s, v8.4s +add v14.4s, v14.4s, v8.4s +trn1 v8.4S, v10.4S, v25.4S +trn2 v15.4S, v10.4S, v25.4S +trn1 v21.4S, v14.4S, v28.4S +trn2 v16.4S, v14.4S, v28.4S +trn2 v14.2D, v8.2D, v21.2D +trn2 v28.2D, v15.2D, v16.2D +trn1 v10.2D, v8.2D, v21.2D +trn1 v25.2D, v15.2D, v16.2D +sqrdmulh v16.4S, v14.4S, v18.4S +mul v14.4S, v14.4S,v2.4S +mla v14.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v14.4s +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v28.4S, v18.4S +mul v28.4S, v28.4S,v2.4S +mla v28.4S, v14.4S, v31.s[0] +sub v14.4s, v25.4s, v28.4s +add v25.4s, v25.4s, v28.4s +sqrdmulh v28.4S, v25.4S, v9.4S +mul v25.4S, v25.4S,v30.4S +mla v25.4S, v28.4S, v31.s[0] +sub v28.4s, v10.4s, v25.4s +add v10.4s, v10.4s, v25.4s +sqrdmulh v25.4S, v14.4S, v26.4S +mul v14.4S, v14.4S,v1.4S +mla v14.4S, v25.4S, v31.s[0] +sub v25.4s, v16.4s, v14.4s +add v16.4s, v16.4s, v14.4s +str q10, [x0, #768] +str q28, [x0, #784] +str q16, [x0, #800] +str q25, [x0, #816] +ldr q25, [x17, #+1792] +ldr q16, [x17, #+1808] +ldr q28, [x17, #+1824] +ldr q10, [x17, #+1840] +ldr q14, [x17, #+1856] +ldr q15, [x17, #+1872] +ldr q21, [x17, #+1888] +ldr q8, [x17, #+1904] +ldr q26, [x0, #864] +ldr q1, [x0, #880] +ldr q9, [x0, #832] +ldr q30, [x0, #848] +sqrdmulh v18.4S, v26.4S, v16.s[0] +mul v26.4S, v26.4S,v25.s[0] +mla v26.4S, v18.4S, v31.s[0] +sub v18.4s, v9.4s, v26.4s +add v9.4s, v9.4s, v26.4s +sqrdmulh v26.4S, v1.4S, v16.s[0] +mul v1.4S, v1.4S,v25.s[0] +mla v1.4S, v26.4S, v31.s[0] +sub v26.4s, v30.4s, v1.4s +add v30.4s, v30.4s, v1.4s +sqrdmulh v1.4S, v30.4S, v16.s[1] +mul v30.4S, v30.4S,v25.s[1] +mla v30.4S, v1.4S, v31.s[0] +sub v1.4s, v9.4s, v30.4s +add v9.4s, v9.4s, v30.4s +sqrdmulh v30.4S, v26.4S, v16.s[2] +mul v26.4S, v26.4S,v25.s[2] +mla v26.4S, v30.4S, 
v31.s[0] +sub v30.4s, v18.4s, v26.4s +add v18.4s, v18.4s, v26.4s +trn1 v26.4S, v9.4S, v1.4S +trn2 v2.4S, v9.4S, v1.4S +trn1 v5.4S, v18.4S, v30.4S +trn2 v4.4S, v18.4S, v30.4S +trn2 v18.2D, v26.2D, v5.2D +trn2 v30.2D, v2.2D, v4.2D +trn1 v9.2D, v26.2D, v5.2D +trn1 v1.2D, v2.2D, v4.2D +sqrdmulh v4.4S, v18.4S, v10.4S +mul v18.4S, v18.4S,v28.4S +mla v18.4S, v4.4S, v31.s[0] +sub v4.4s, v9.4s, v18.4s +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v30.4S, v10.4S +mul v30.4S, v30.4S,v28.4S +mla v30.4S, v18.4S, v31.s[0] +sub v18.4s, v1.4s, v30.4s +add v1.4s, v1.4s, v30.4s +sqrdmulh v30.4S, v1.4S, v15.4S +mul v1.4S, v1.4S,v14.4S +mla v1.4S, v30.4S, v31.s[0] +sub v30.4s, v9.4s, v1.4s +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v18.4S, v8.4S +mul v18.4S, v18.4S,v21.4S +mla v18.4S, v1.4S, v31.s[0] +sub v1.4s, v4.4s, v18.4s +add v4.4s, v4.4s, v18.4s +str q9, [x0, #832] +str q30, [x0, #848] +str q4, [x0, #864] +str q1, [x0, #880] +ldr q1, [x17, #+1920] +ldr q4, [x17, #+1936] +ldr q30, [x17, #+1952] +ldr q9, [x17, #+1968] +ldr q18, [x17, #+1984] +ldr q2, [x17, #+2000] +ldr q5, [x17, #+2016] +ldr q26, [x17, #+2032] +ldr q8, [x0, #928] +ldr q21, [x0, #944] +ldr q15, [x0, #896] +ldr q14, [x0, #912] +sqrdmulh v10.4S, v8.4S, v4.s[0] +mul v8.4S, v8.4S,v1.s[0] +mla v8.4S, v10.4S, v31.s[0] +sub v10.4s, v15.4s, v8.4s +add v15.4s, v15.4s, v8.4s +sqrdmulh v8.4S, v21.4S, v4.s[0] +mul v21.4S, v21.4S,v1.s[0] +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v14.4s, v21.4s +add v14.4s, v14.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v4.s[1] +mul v14.4S, v14.4S,v1.s[1] +mla v14.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v14.4s +add v15.4s, v15.4s, v14.4s +sqrdmulh v14.4S, v8.4S, v4.s[2] +mul v8.4S, v8.4S,v1.s[2] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +trn1 v8.4S, v15.4S, v21.4S +trn2 v28.4S, v15.4S, v21.4S +trn1 v16.4S, v10.4S, v14.4S +trn2 v25.4S, v10.4S, v14.4S +trn2 v10.2D, v8.2D, v16.2D +trn2 v14.2D, v28.2D, v25.2D +trn1 v15.2D, v8.2D, v16.2D +trn1 v21.2D, v28.2D, v25.2D 
+sqrdmulh v25.4S, v10.4S, v9.4S +mul v10.4S, v10.4S,v30.4S +mla v10.4S, v25.4S, v31.s[0] +sub v25.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +sqrdmulh v10.4S, v14.4S, v9.4S +mul v14.4S, v14.4S,v30.4S +mla v14.4S, v10.4S, v31.s[0] +sub v10.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v2.4S +mul v21.4S, v21.4S,v18.4S +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v15.4s, v21.4s +add v15.4s, v15.4s, v21.4s +sqrdmulh v21.4S, v10.4S, v26.4S +mul v10.4S, v10.4S,v5.4S +mla v10.4S, v21.4S, v31.s[0] +sub v21.4s, v25.4s, v10.4s +add v25.4s, v25.4s, v10.4s +str q15, [x0, #896] +str q14, [x0, #912] +str q25, [x0, #928] +str q21, [x0, #944] +ldr q21, [x17, #+2048] +ldr q25, [x17, #+2064] +ldr q14, [x17, #+2080] +ldr q15, [x17, #+2096] +ldr q10, [x17, #+2112] +ldr q28, [x17, #+2128] +ldr q16, [x17, #+2144] +ldr q8, [x17, #+2160] +ldr q26, [x0, #992] +ldr q5, [x0, #1008] +ldr q2, [x0, #960] +ldr q18, [x0, #976] +sqrdmulh v9.4S, v26.4S, v25.s[0] +mul v26.4S, v26.4S,v21.s[0] +mla v26.4S, v9.4S, v31.s[0] +sub v9.4s, v2.4s, v26.4s +add v2.4s, v2.4s, v26.4s +sqrdmulh v26.4S, v5.4S, v25.s[0] +mul v5.4S, v5.4S,v21.s[0] +mla v5.4S, v26.4S, v31.s[0] +sub v26.4s, v18.4s, v5.4s +add v18.4s, v18.4s, v5.4s +sqrdmulh v5.4S, v18.4S, v25.s[1] +mul v18.4S, v18.4S,v21.s[1] +mla v18.4S, v5.4S, v31.s[0] +sub v5.4s, v2.4s, v18.4s +add v2.4s, v2.4s, v18.4s +sqrdmulh v18.4S, v26.4S, v25.s[2] +mul v26.4S, v26.4S,v21.s[2] +mla v26.4S, v18.4S, v31.s[0] +sub v18.4s, v9.4s, v26.4s +add v9.4s, v9.4s, v26.4s +trn1 v26.4S, v2.4S, v5.4S +trn2 v30.4S, v2.4S, v5.4S +trn1 v4.4S, v9.4S, v18.4S +trn2 v1.4S, v9.4S, v18.4S +trn2 v9.2D, v26.2D, v4.2D +trn2 v18.2D, v30.2D, v1.2D +trn1 v2.2D, v26.2D, v4.2D +trn1 v5.2D, v30.2D, v1.2D +sqrdmulh v1.4S, v9.4S, v15.4S +mul v9.4S, v9.4S,v14.4S +mla v9.4S, v1.4S, v31.s[0] +sub v1.4s, v2.4s, v9.4s +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v18.4S, v15.4S +mul v18.4S, v18.4S,v14.4S +mla v18.4S, v9.4S, v31.s[0] +sub v9.4s, v5.4s, v18.4s +add v5.4s, 
v5.4s, v18.4s +sqrdmulh v18.4S, v5.4S, v28.4S +mul v5.4S, v5.4S,v10.4S +mla v5.4S, v18.4S, v31.s[0] +sub v18.4s, v2.4s, v5.4s +add v2.4s, v2.4s, v5.4s +sqrdmulh v5.4S, v9.4S, v8.4S +mul v9.4S, v9.4S,v16.4S +mla v9.4S, v5.4S, v31.s[0] +sub v5.4s, v1.4s, v9.4s +add v1.4s, v1.4s, v9.4s +str q2, [x0, #960] +str q18, [x0, #976] +str q1, [x0, #992] +str q5, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_14_0.s b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_14_0.s new file mode 100644 index 0000000..dae0130 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_14_0.s @@ -0,0 +1,2506 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. 
+/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: // Modulus constant block: the first word is -q (q = 33556993, the value in this file's name), stored negated. NOTE(review): presumably what the "mla ..., v31.s[0]" reduction steps read via v31; the load is not visible in this chunk, confirm. +.word -33556993 // -q +.word 0 // zero padding: the block spans four 32-bit words (16 bytes) +.word 0 // zero padding +.word 0 // zero padding +.align 6 // power-of-two alignment on Arm targets in GNU as: 2^6 = 64 bytes for the table below +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None
+.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 
7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 
// Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 
+.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // 
Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, 
block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 
839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 +.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, 
block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 
104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 
// Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_14_0 +.global _ntt_u32_full_neon_asm_var_4_4_14_0 +ntt_u32_full_neon_asm_var_4_4_14_0: +_ntt_u32_full_neon_asm_var_4_4_14_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x0, #928] +ldr q29, [x17, #+0] +ldr q28, [x17, #+16] +sqrdmulh v27.4S, v30.4S, v28.s[0] +mul v30.4S, v30.4S,v29.s[0] +ldr q26, [x0, #992] +sqrdmulh v25.4S, v26.4S, v28.s[0] +mul v26.4S, v26.4S,v29.s[0] +ldr q24, [x0, #800] +sqrdmulh v23.4S, v24.4S, v28.s[0] +mul v24.4S, v24.4S,v29.s[0] +ldr q22, [x0, #864] +sqrdmulh v21.4S, v22.4S, v28.s[0] +mul v22.4S, 
v22.4S,v29.s[0] +ldr q20, [x0, #544] +mla v30.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v20.4S, v28.s[0] +ldr q19, [x0, #608] +mla v26.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v19.4S, v28.s[0] +nop +ldr q18, [x0, #672] +mla v24.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v18.4S, v28.s[0] +nop +ldr q17, [x0, #736] +mla v22.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v17.4S, v28.s[0] +nop +ldr q16, [x0, #416] +ldr q3, [x0, #480] +mul v20.4S, v20.4S,v29.s[0] +sub v2.4s, v16.4s, v30.4s +mul v19.4S, v19.4S,v29.s[0] +add v16.4s, v16.4s, v30.4s +ldr q30, [x0, #288] +ldr q1, [x0, #352] +mla v20.4S, v27.4S, v31.s[0] +sub v27.4s, v3.4s, v26.4s +mla v19.4S, v25.4S, v31.s[0] +add v3.4s, v3.4s, v26.4s +ldr q26, [x0, #32] +ldr q25, [x0, #96] +mul v18.4S, v18.4S,v29.s[0] +sub v0.4s, v30.4s, v24.4s +mul v17.4S, v17.4S,v29.s[0] +add v30.4s, v30.4s, v24.4s +ldr q24, [x0, #160] +ldr q15, [x0, #224] +mla v18.4S, v23.4S, v31.s[0] +sub v23.4s, v1.4s, v22.4s +mla v17.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v16.4S, v28.s[1] +nop +mul v16.4S, v16.4S,v29.s[1] +nop +sqrdmulh v21.4S, v3.4S, v28.s[1] +sub v14.4s, v26.4s, v20.4s +mul v3.4S, v3.4S,v29.s[1] +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v30.4S, v28.s[1] +sub v13.4s, v25.4s, v19.4s +mul v30.4S, v30.4S,v29.s[1] +add v25.4s, v25.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v28.s[1] +sub v12.4s, v24.4s, v18.4s +mul v1.4S, v1.4S,v29.s[1] +add v24.4s, v24.4s, v18.4s +mla v16.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v17.4s +sqrdmulh v18.4S, v2.4S, v28.s[2] +add v15.4s, v15.4s, v17.4s +mla v3.4S, v21.4S, v31.s[0] +nop +sqrdmulh v21.4S, v27.4S, v28.s[2] +nop +mla v30.4S, v20.4S, v31.s[0] +nop +sqrdmulh v20.4S, v0.4S, v28.s[2] +nop +mla v1.4S, v19.4S, v31.s[0] +nop +sqrdmulh v19.4S, v23.4S, v28.s[2] +nop +ldr q17, [x17, #+32] +ldr q11, [x17, #+48] +mul v2.4S, v2.4S,v29.s[2] +sub v10.4s, v24.4s, v16.4s +mul v27.4S, v27.4S,v29.s[2] +add v24.4s, v24.4s, v16.4s +mla v2.4S, v18.4S, v31.s[0] +sub v18.4s, v15.4s, v3.4s +mla v27.4S, v21.4S, 
v31.s[0] +add v15.4s, v15.4s, v3.4s +mul v0.4S, v0.4S,v29.s[2] +sub v3.4s, v26.4s, v30.4s +mul v23.4S, v23.4S,v29.s[2] +add v26.4s, v26.4s, v30.4s +mla v0.4S, v20.4S, v31.s[0] +sub v20.4s, v25.4s, v1.4s +mla v23.4S, v19.4S, v31.s[0] +add v25.4s, v25.4s, v1.4s +sqrdmulh v1.4S, v10.4S, v11.s[1] +nop +mul v10.4S, v10.4S,v17.s[1] +nop +sqrdmulh v19.4S, v18.4S, v11.s[1] +sub v30.4s, v12.4s, v2.4s +mul v18.4S, v18.4S,v17.s[1] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v24.4S, v11.s[0] +sub v21.4s, v22.4s, v27.4s +mul v24.4S, v24.4S,v17.s[0] +add v22.4s, v22.4s, v27.4s +sqrdmulh v27.4S, v15.4S, v11.s[0] +sub v16.4s, v14.4s, v0.4s +mul v15.4S, v15.4S,v17.s[0] +add v14.4s, v14.4s, v0.4s +ldr q0, [x17, #+64] +ldr q9, [x17, #+80] +mla v10.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v23.4s +sqrdmulh v8.4S, v12.4S, v11.s[2] +add v13.4s, v13.4s, v23.4s +mla v18.4S, v19.4S, v31.s[0] +nop +sqrdmulh v19.4S, v22.4S, v11.s[2] +nop +mla v24.4S, v2.4S, v31.s[0] +nop +sqrdmulh v2.4S, v30.4S, v11.s[3] +nop +mla v15.4S, v27.4S, v31.s[0] +nop +sqrdmulh v27.4S, v21.4S, v11.s[3] +nop +ldr q23, [x17, #+96] +ldr q7, [x17, #+112] +mul v12.4S, v12.4S,v17.s[2] +sub v6.4s, v3.4s, v10.4s +mul v22.4S, v22.4S,v17.s[2] +add v3.4s, v3.4s, v10.4s +mla v12.4S, v8.4S, v31.s[0] +sub v8.4s, v20.4s, v18.4s +mla v22.4S, v19.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +mul v30.4S, v30.4S,v17.s[3] +sub v18.4s, v26.4s, v24.4s +mul v21.4S, v21.4S,v17.s[3] +add v26.4s, v26.4s, v24.4s +mla v30.4S, v2.4S, v31.s[0] +sub v2.4s, v25.4s, v15.4s +mla v21.4S, v27.4S, v31.s[0] +add v25.4s, v25.4s, v15.4s +sqrdmulh v15.4S, v8.4S, v9.s[3] +nop +mul v8.4S, v8.4S,v0.s[3] +nop +sqrdmulh v27.4S, v20.4S, v9.s[2] +sub v24.4s, v14.4s, v12.4s +mul v20.4S, v20.4S,v0.s[2] +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v9.s[1] +sub v19.4s, v13.4s, v22.4s +mul v2.4S, v2.4S,v0.s[1] +add v13.4s, v13.4s, v22.4s +sqrdmulh v22.4S, v25.4S, v9.s[0] +sub v10.4s, v16.4s, v30.4s +mul v25.4S, v25.4S,v0.s[0] +add v16.4s, v16.4s, v30.4s +mla v8.4S, 
v15.4S, v31.s[0] +sub v15.4s, v1.4s, v21.4s +sqrdmulh v30.4S, v13.4S, v7.s[0] +add v1.4s, v1.4s, v21.4s +mla v20.4S, v27.4S, v31.s[0] +sub v27.4s, v6.4s, v8.4s +sqrdmulh v21.4S, v19.4S, v7.s[1] +add v6.4s, v6.4s, v8.4s +mla v2.4S, v12.4S, v31.s[0] +sub v12.4s, v3.4s, v20.4s +sqrdmulh v8.4S, v1.4S, v7.s[2] +add v3.4s, v3.4s, v20.4s +mla v25.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v2.4s +sqrdmulh v20.4S, v15.4S, v7.s[3] +add v18.4s, v18.4s, v2.4s +mul v13.4S, v13.4S,v23.s[0] +sub v2.4s, v26.4s, v25.4s +mul v19.4S, v19.4S,v23.s[1] +add v26.4s, v26.4s, v25.4s +mla v13.4S, v30.4S, v31.s[0] +str q12, [x0, #352] +mla v19.4S, v21.4S, v31.s[0] +str q3, [x0, #288] +mul v1.4S, v1.4S,v23.s[2] +str q27, [x0, #480] +mul v15.4S, v15.4S,v23.s[3] +str q6, [x0, #416] +mla v1.4S, v8.4S, v31.s[0] +str q22, [x0, #224] +mla v15.4S, v20.4S, v31.s[0] +str q18, [x0, #160] +ldr q18, [x0, #944] +sqrdmulh v20.4S, v18.4S, v28.s[0] +str q2, [x0, #96] +mul v18.4S, v18.4S,v29.s[0] +str q26, [x0, #32] +ldr q26, [x0, #1008] +sqrdmulh v2.4S, v26.4S, v28.s[0] +sub v22.4s, v14.4s, v13.4s +str q22, [x0, #608] +mul v26.4S, v26.4S,v29.s[0] +add v14.4s, v14.4s, v13.4s +ldr q13, [x0, #816] +sqrdmulh v22.4S, v13.4S, v28.s[0] +sub v8.4s, v24.4s, v19.4s +str q14, [x0, #544] +mul v13.4S, v13.4S,v29.s[0] +add v24.4s, v24.4s, v19.4s +ldr q19, [x0, #880] +sqrdmulh v14.4S, v19.4S, v28.s[0] +sub v6.4s, v16.4s, v1.4s +str q8, [x0, #736] +mul v19.4S, v19.4S,v29.s[0] +add v16.4s, v16.4s, v1.4s +ldr q1, [x0, #560] +mla v18.4S, v20.4S, v31.s[0] +sub v20.4s, v10.4s, v15.4s +str q24, [x0, #672] +sqrdmulh v24.4S, v1.4S, v28.s[0] +add v10.4s, v10.4s, v15.4s +ldr q15, [x0, #624] +mla v26.4S, v2.4S, v31.s[0] +str q6, [x0, #864] +sqrdmulh v6.4S, v15.4S, v28.s[0] +nop +ldr q2, [x0, #688] +mla v13.4S, v22.4S, v31.s[0] +str q16, [x0, #800] +sqrdmulh v16.4S, v2.4S, v28.s[0] +nop +ldr q22, [x0, #752] +mla v19.4S, v14.4S, v31.s[0] +str q20, [x0, #992] +sqrdmulh v20.4S, v22.4S, v28.s[0] +nop +ldr q14, [x0, #432] +ldr q8, [x0, #496] 
+mul v1.4S, v1.4S,v29.s[0] +sub v27.4s, v14.4s, v18.4s +str q10, [x0, #928] +mul v15.4S, v15.4S,v29.s[0] +add v14.4s, v14.4s, v18.4s +ldr q18, [x0, #304] +ldr q10, [x0, #368] +mla v1.4S, v24.4S, v31.s[0] +sub v24.4s, v8.4s, v26.4s +mla v15.4S, v6.4S, v31.s[0] +add v8.4s, v8.4s, v26.4s +ldr q26, [x0, #48] +ldr q6, [x0, #112] +mul v2.4S, v2.4S,v29.s[0] +sub v3.4s, v18.4s, v13.4s +mul v22.4S, v22.4S,v29.s[0] +add v18.4s, v18.4s, v13.4s +ldr q13, [x0, #176] +ldr q21, [x0, #240] +mla v2.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v19.4s +mla v22.4S, v20.4S, v31.s[0] +add v10.4s, v10.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v28.s[1] +nop +mul v14.4S, v14.4S,v29.s[1] +nop +sqrdmulh v20.4S, v8.4S, v28.s[1] +sub v12.4s, v26.4s, v1.4s +mul v8.4S, v8.4S,v29.s[1] +add v26.4s, v26.4s, v1.4s +sqrdmulh v1.4S, v18.4S, v28.s[1] +sub v30.4s, v6.4s, v15.4s +mul v18.4S, v18.4S,v29.s[1] +add v6.4s, v6.4s, v15.4s +sqrdmulh v15.4S, v10.4S, v28.s[1] +sub v25.4s, v13.4s, v2.4s +mul v10.4S, v10.4S,v29.s[1] +add v13.4s, v13.4s, v2.4s +mla v14.4S, v19.4S, v31.s[0] +sub v19.4s, v21.4s, v22.4s +sqrdmulh v2.4S, v27.4S, v28.s[2] +add v21.4s, v21.4s, v22.4s +mla v8.4S, v20.4S, v31.s[0] +nop +sqrdmulh v20.4S, v24.4S, v28.s[2] +nop +mla v18.4S, v1.4S, v31.s[0] +nop +sqrdmulh v1.4S, v3.4S, v28.s[2] +nop +mla v10.4S, v15.4S, v31.s[0] +nop +sqrdmulh v15.4S, v16.4S, v28.s[2] +nop +mul v27.4S, v27.4S,v29.s[2] +sub v22.4s, v13.4s, v14.4s +mul v24.4S, v24.4S,v29.s[2] +add v13.4s, v13.4s, v14.4s +mla v27.4S, v2.4S, v31.s[0] +sub v2.4s, v21.4s, v8.4s +mla v24.4S, v20.4S, v31.s[0] +add v21.4s, v21.4s, v8.4s +mul v3.4S, v3.4S,v29.s[2] +sub v8.4s, v26.4s, v18.4s +mul v16.4S, v16.4S,v29.s[2] +add v26.4s, v26.4s, v18.4s +mla v3.4S, v1.4S, v31.s[0] +sub v1.4s, v6.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add v6.4s, v6.4s, v10.4s +sqrdmulh v10.4S, v22.4S, v11.s[1] +nop +mul v22.4S, v22.4S,v17.s[1] +nop +sqrdmulh v15.4S, v2.4S, v11.s[1] +sub v18.4s, v25.4s, v27.4s +mul v2.4S, v2.4S,v17.s[1] +add v25.4s, v25.4s, v27.4s 
+sqrdmulh v27.4S, v13.4S, v11.s[0] +sub v20.4s, v19.4s, v24.4s +mul v13.4S, v13.4S,v17.s[0] +add v19.4s, v19.4s, v24.4s +sqrdmulh v24.4S, v21.4S, v11.s[0] +sub v14.4s, v12.4s, v3.4s +mul v21.4S, v21.4S,v17.s[0] +add v12.4s, v12.4s, v3.4s +mla v22.4S, v10.4S, v31.s[0] +sub v10.4s, v30.4s, v16.4s +sqrdmulh v3.4S, v25.4S, v11.s[2] +add v30.4s, v30.4s, v16.4s +mla v2.4S, v15.4S, v31.s[0] +nop +sqrdmulh v15.4S, v19.4S, v11.s[2] +nop +mla v13.4S, v27.4S, v31.s[0] +nop +sqrdmulh v27.4S, v18.4S, v11.s[3] +nop +mla v21.4S, v24.4S, v31.s[0] +nop +sqrdmulh v24.4S, v20.4S, v11.s[3] +nop +mul v25.4S, v25.4S,v17.s[2] +sub v16.4s, v8.4s, v22.4s +mul v19.4S, v19.4S,v17.s[2] +add v8.4s, v8.4s, v22.4s +mla v25.4S, v3.4S, v31.s[0] +sub v3.4s, v1.4s, v2.4s +mla v19.4S, v15.4S, v31.s[0] +add v1.4s, v1.4s, v2.4s +mul v18.4S, v18.4S,v17.s[3] +sub v2.4s, v26.4s, v13.4s +mul v20.4S, v20.4S,v17.s[3] +add v26.4s, v26.4s, v13.4s +mla v18.4S, v27.4S, v31.s[0] +sub v27.4s, v6.4s, v21.4s +mla v20.4S, v24.4S, v31.s[0] +add v6.4s, v6.4s, v21.4s +sqrdmulh v21.4S, v3.4S, v9.s[3] +nop +mul v3.4S, v3.4S,v0.s[3] +nop +sqrdmulh v24.4S, v1.4S, v9.s[2] +sub v13.4s, v12.4s, v25.4s +mul v1.4S, v1.4S,v0.s[2] +add v12.4s, v12.4s, v25.4s +sqrdmulh v25.4S, v27.4S, v9.s[1] +sub v15.4s, v30.4s, v19.4s +mul v27.4S, v27.4S,v0.s[1] +add v30.4s, v30.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v9.s[0] +sub v22.4s, v14.4s, v18.4s +mul v6.4S, v6.4S,v0.s[0] +add v14.4s, v14.4s, v18.4s +mla v3.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v20.4s +sqrdmulh v18.4S, v30.4S, v7.s[0] +add v10.4s, v10.4s, v20.4s +mla v1.4S, v24.4S, v31.s[0] +sub v24.4s, v16.4s, v3.4s +sqrdmulh v20.4S, v15.4S, v7.s[1] +add v16.4s, v16.4s, v3.4s +mla v27.4S, v25.4S, v31.s[0] +sub v25.4s, v8.4s, v1.4s +sqrdmulh v3.4S, v10.4S, v7.s[2] +add v8.4s, v8.4s, v1.4s +mla v6.4S, v19.4S, v31.s[0] +sub v19.4s, v2.4s, v27.4s +sqrdmulh v1.4S, v21.4S, v7.s[3] +add v2.4s, v2.4s, v27.4s +mul v30.4S, v30.4S,v23.s[0] +sub v27.4s, v26.4s, v6.4s +mul v15.4S, v15.4S,v23.s[1] +add 
v26.4s, v26.4s, v6.4s +mla v30.4S, v18.4S, v31.s[0] +str q25, [x0, #368] +mla v15.4S, v20.4S, v31.s[0] +str q8, [x0, #304] +mul v10.4S, v10.4S,v23.s[2] +str q24, [x0, #496] +mul v21.4S, v21.4S,v23.s[3] +str q16, [x0, #432] +mla v10.4S, v3.4S, v31.s[0] +str q19, [x0, #240] +mla v21.4S, v1.4S, v31.s[0] +str q2, [x0, #176] +ldr q2, [x0, #896] +sqrdmulh v1.4S, v2.4S, v28.s[0] +str q27, [x0, #112] +mul v2.4S, v2.4S,v29.s[0] +str q26, [x0, #48] +ldr q26, [x0, #960] +sqrdmulh v27.4S, v26.4S, v28.s[0] +sub v19.4s, v12.4s, v30.4s +str q19, [x0, #624] +mul v26.4S, v26.4S,v29.s[0] +add v12.4s, v12.4s, v30.4s +ldr q30, [x0, #768] +sqrdmulh v19.4S, v30.4S, v28.s[0] +sub v3.4s, v13.4s, v15.4s +str q12, [x0, #560] +mul v30.4S, v30.4S,v29.s[0] +add v13.4s, v13.4s, v15.4s +ldr q15, [x0, #832] +sqrdmulh v12.4S, v15.4S, v28.s[0] +sub v16.4s, v14.4s, v10.4s +str q3, [x0, #752] +mul v15.4S, v15.4S,v29.s[0] +add v14.4s, v14.4s, v10.4s +ldr q10, [x0, #512] +mla v2.4S, v1.4S, v31.s[0] +sub v1.4s, v22.4s, v21.4s +str q13, [x0, #688] +sqrdmulh v13.4S, v10.4S, v28.s[0] +add v22.4s, v22.4s, v21.4s +ldr q21, [x0, #576] +mla v26.4S, v27.4S, v31.s[0] +str q16, [x0, #880] +sqrdmulh v16.4S, v21.4S, v28.s[0] +nop +ldr q27, [x0, #640] +mla v30.4S, v19.4S, v31.s[0] +str q14, [x0, #816] +sqrdmulh v14.4S, v27.4S, v28.s[0] +nop +ldr q19, [x0, #704] +mla v15.4S, v12.4S, v31.s[0] +str q1, [x0, #1008] +sqrdmulh v1.4S, v19.4S, v28.s[0] +nop +ldr q12, [x0, #384] +ldr q3, [x0, #448] +mul v10.4S, v10.4S,v29.s[0] +sub v24.4s, v12.4s, v2.4s +str q22, [x0, #944] +mul v21.4S, v21.4S,v29.s[0] +add v12.4s, v12.4s, v2.4s +ldr q2, [x0, #256] +ldr q22, [x0, #320] +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v3.4s, v26.4s +mla v21.4S, v16.4S, v31.s[0] +add v3.4s, v3.4s, v26.4s +ldr q26, [x0, #0] +ldr q16, [x0, #64] +mul v27.4S, v27.4S,v29.s[0] +sub v8.4s, v2.4s, v30.4s +mul v19.4S, v19.4S,v29.s[0] +add v2.4s, v2.4s, v30.4s +ldr q30, [x0, #128] +ldr q20, [x0, #192] +mla v27.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v15.4s 
+mla v19.4S, v1.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v12.4S, v28.s[1] +nop +mul v12.4S, v12.4S,v29.s[1] +nop +sqrdmulh v1.4S, v3.4S, v28.s[1] +sub v25.4s, v26.4s, v10.4s +mul v3.4S, v3.4S,v29.s[1] +add v26.4s, v26.4s, v10.4s +sqrdmulh v10.4S, v2.4S, v28.s[1] +sub v18.4s, v16.4s, v21.4s +mul v2.4S, v2.4S,v29.s[1] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v28.s[1] +sub v6.4s, v30.4s, v27.4s +mul v22.4S, v22.4S,v29.s[1] +add v30.4s, v30.4s, v27.4s +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v19.4s +sqrdmulh v27.4S, v24.4S, v28.s[2] +add v20.4s, v20.4s, v19.4s +mla v3.4S, v1.4S, v31.s[0] +nop +sqrdmulh v1.4S, v13.4S, v28.s[2] +nop +mla v2.4S, v10.4S, v31.s[0] +nop +sqrdmulh v10.4S, v8.4S, v28.s[2] +nop +mla v22.4S, v21.4S, v31.s[0] +nop +sqrdmulh v21.4S, v14.4S, v28.s[2] +nop +mul v24.4S, v24.4S,v29.s[2] +sub v19.4s, v30.4s, v12.4s +mul v13.4S, v13.4S,v29.s[2] +add v30.4s, v30.4s, v12.4s +mla v24.4S, v27.4S, v31.s[0] +sub v27.4s, v20.4s, v3.4s +mla v13.4S, v1.4S, v31.s[0] +add v20.4s, v20.4s, v3.4s +mul v8.4S, v8.4S,v29.s[2] +sub v3.4s, v26.4s, v2.4s +mul v14.4S, v14.4S,v29.s[2] +add v26.4s, v26.4s, v2.4s +mla v8.4S, v10.4S, v31.s[0] +sub v10.4s, v16.4s, v22.4s +mla v14.4S, v21.4S, v31.s[0] +add v16.4s, v16.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v11.s[1] +nop +mul v19.4S, v19.4S,v17.s[1] +nop +sqrdmulh v21.4S, v27.4S, v11.s[1] +sub v2.4s, v6.4s, v24.4s +mul v27.4S, v27.4S,v17.s[1] +add v6.4s, v6.4s, v24.4s +sqrdmulh v24.4S, v30.4S, v11.s[0] +sub v1.4s, v15.4s, v13.4s +mul v30.4S, v30.4S,v17.s[0] +add v15.4s, v15.4s, v13.4s +sqrdmulh v13.4S, v20.4S, v11.s[0] +sub v12.4s, v25.4s, v8.4s +mul v20.4S, v20.4S,v17.s[0] +add v25.4s, v25.4s, v8.4s +mla v19.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v14.4s +sqrdmulh v8.4S, v6.4S, v11.s[2] +add v18.4s, v18.4s, v14.4s +mla v27.4S, v21.4S, v31.s[0] +nop +sqrdmulh v21.4S, v15.4S, v11.s[2] +nop +mla v30.4S, v24.4S, v31.s[0] +nop +sqrdmulh v24.4S, v2.4S, v11.s[3] +nop +mla v20.4S, v13.4S, v31.s[0] 
+nop +sqrdmulh v13.4S, v1.4S, v11.s[3] +nop +mul v6.4S, v6.4S,v17.s[2] +sub v14.4s, v3.4s, v19.4s +mul v15.4S, v15.4S,v17.s[2] +add v3.4s, v3.4s, v19.4s +mla v6.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v27.4s +mla v15.4S, v21.4S, v31.s[0] +add v10.4s, v10.4s, v27.4s +mul v2.4S, v2.4S,v17.s[3] +sub v27.4s, v26.4s, v30.4s +mul v1.4S, v1.4S,v17.s[3] +add v26.4s, v26.4s, v30.4s +mla v2.4S, v24.4S, v31.s[0] +sub v24.4s, v16.4s, v20.4s +mla v1.4S, v13.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v9.s[3] +nop +mul v8.4S, v8.4S,v0.s[3] +nop +sqrdmulh v13.4S, v10.4S, v9.s[2] +sub v30.4s, v25.4s, v6.4s +mul v10.4S, v10.4S,v0.s[2] +add v25.4s, v25.4s, v6.4s +sqrdmulh v6.4S, v24.4S, v9.s[1] +sub v21.4s, v18.4s, v15.4s +mul v24.4S, v24.4S,v0.s[1] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v9.s[0] +sub v19.4s, v12.4s, v2.4s +mul v16.4S, v16.4S,v0.s[0] +add v12.4s, v12.4s, v2.4s +mla v8.4S, v20.4S, v31.s[0] +sub v20.4s, v22.4s, v1.4s +sqrdmulh v2.4S, v18.4S, v7.s[0] +add v22.4s, v22.4s, v1.4s +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v14.4s, v8.4s +sqrdmulh v1.4S, v21.4S, v7.s[1] +add v14.4s, v14.4s, v8.4s +mla v24.4S, v6.4S, v31.s[0] +sub v6.4s, v3.4s, v10.4s +sqrdmulh v8.4S, v22.4S, v7.s[2] +add v3.4s, v3.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v27.4s, v24.4s +sqrdmulh v10.4S, v20.4S, v7.s[3] +add v27.4s, v27.4s, v24.4s +mul v18.4S, v18.4S,v23.s[0] +sub v24.4s, v26.4s, v16.4s +mul v21.4S, v21.4S,v23.s[1] +add v26.4s, v26.4s, v16.4s +mla v18.4S, v2.4S, v31.s[0] +str q6, [x0, #320] +mla v21.4S, v1.4S, v31.s[0] +str q3, [x0, #256] +mul v22.4S, v22.4S,v23.s[2] +str q13, [x0, #448] +mul v20.4S, v20.4S,v23.s[3] +str q14, [x0, #384] +mla v22.4S, v8.4S, v31.s[0] +str q15, [x0, #192] +mla v20.4S, v10.4S, v31.s[0] +str q27, [x0, #128] +ldr q27, [x0, #912] +sqrdmulh v10.4S, v27.4S, v28.s[0] +str q24, [x0, #64] +mul v27.4S, v27.4S,v29.s[0] +str q26, [x0, #0] +ldr q26, [x0, #976] +sqrdmulh v24.4S, v26.4S, v28.s[0] +sub v15.4s, v25.4s, v18.4s 
+str q15, [x0, #576] +mul v26.4S, v26.4S,v29.s[0] +add v25.4s, v25.4s, v18.4s +ldr q18, [x0, #784] +sqrdmulh v15.4S, v18.4S, v28.s[0] +sub v8.4s, v30.4s, v21.4s +str q25, [x0, #512] +mul v18.4S, v18.4S,v29.s[0] +add v30.4s, v30.4s, v21.4s +ldr q21, [x0, #848] +sqrdmulh v25.4S, v21.4S, v28.s[0] +sub v14.4s, v12.4s, v22.4s +str q8, [x0, #704] +mul v21.4S, v21.4S,v29.s[0] +add v12.4s, v12.4s, v22.4s +ldr q22, [x0, #528] +mla v27.4S, v10.4S, v31.s[0] +sub v10.4s, v19.4s, v20.4s +str q30, [x0, #640] +sqrdmulh v30.4S, v22.4S, v28.s[0] +add v19.4s, v19.4s, v20.4s +ldr q20, [x0, #592] +mla v26.4S, v24.4S, v31.s[0] +str q14, [x0, #832] +sqrdmulh v14.4S, v20.4S, v28.s[0] +nop +ldr q24, [x0, #656] +mla v18.4S, v15.4S, v31.s[0] +str q12, [x0, #768] +sqrdmulh v12.4S, v24.4S, v28.s[0] +nop +ldr q15, [x0, #720] +mla v21.4S, v25.4S, v31.s[0] +str q10, [x0, #960] +sqrdmulh v10.4S, v15.4S, v28.s[0] +nop +ldr q25, [x0, #400] +ldr q8, [x0, #464] +mul v22.4S, v22.4S,v29.s[0] +sub v13.4s, v25.4s, v27.4s +str q19, [x0, #896] +mul v20.4S, v20.4S,v29.s[0] +add v25.4s, v25.4s, v27.4s +ldr q27, [x0, #272] +ldr q19, [x0, #336] +mla v22.4S, v30.4S, v31.s[0] +sub v30.4s, v8.4s, v26.4s +mla v20.4S, v14.4S, v31.s[0] +add v8.4s, v8.4s, v26.4s +ldr q26, [x0, #16] +ldr q14, [x0, #80] +mul v24.4S, v24.4S,v29.s[0] +sub v3.4s, v27.4s, v18.4s +mul v15.4S, v15.4S,v29.s[0] +add v27.4s, v27.4s, v18.4s +ldr q18, [x0, #144] +ldr q1, [x0, #208] +mla v24.4S, v12.4S, v31.s[0] +sub v12.4s, v19.4s, v21.4s +mla v15.4S, v10.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +sqrdmulh v21.4S, v25.4S, v28.s[1] +nop +mul v25.4S, v25.4S,v29.s[1] +nop +sqrdmulh v10.4S, v8.4S, v28.s[1] +sub v6.4s, v26.4s, v22.4s +mul v8.4S, v8.4S,v29.s[1] +add v26.4s, v26.4s, v22.4s +sqrdmulh v22.4S, v27.4S, v28.s[1] +sub v2.4s, v14.4s, v20.4s +mul v27.4S, v27.4S,v29.s[1] +add v14.4s, v14.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v28.s[1] +sub v16.4s, v18.4s, v24.4s +mul v19.4S, v19.4S,v29.s[1] +add v18.4s, v18.4s, v24.4s +mla v25.4S, v21.4S, v31.s[0] 
+sub v21.4s, v1.4s, v15.4s +sqrdmulh v24.4S, v13.4S, v28.s[2] +add v1.4s, v1.4s, v15.4s +mla v8.4S, v10.4S, v31.s[0] +nop +sqrdmulh v10.4S, v30.4S, v28.s[2] +nop +mla v27.4S, v22.4S, v31.s[0] +nop +sqrdmulh v22.4S, v3.4S, v28.s[2] +nop +mla v19.4S, v20.4S, v31.s[0] +nop +sqrdmulh v20.4S, v12.4S, v28.s[2] +nop +mul v13.4S, v13.4S,v29.s[2] +sub v15.4s, v18.4s, v25.4s +mul v30.4S, v30.4S,v29.s[2] +add v18.4s, v18.4s, v25.4s +mla v13.4S, v24.4S, v31.s[0] +sub v24.4s, v1.4s, v8.4s +mla v30.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +mul v3.4S, v3.4S,v29.s[2] +sub v8.4s, v26.4s, v27.4s +mul v12.4S, v12.4S,v29.s[2] +add v26.4s, v26.4s, v27.4s +mla v3.4S, v22.4S, v31.s[0] +sub v22.4s, v14.4s, v19.4s +mla v12.4S, v20.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v28.4S, v15.4S, v11.s[1] +nop +mul v15.4S, v15.4S,v17.s[1] +nop +sqrdmulh v29.4S, v24.4S, v11.s[1] +sub v19.4s, v16.4s, v13.4s +mul v24.4S, v24.4S,v17.s[1] +add v16.4s, v16.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v11.s[0] +sub v20.4s, v21.4s, v30.4s +mul v18.4S, v18.4S,v17.s[0] +add v21.4s, v21.4s, v30.4s +sqrdmulh v30.4S, v1.4S, v11.s[0] +sub v27.4s, v6.4s, v3.4s +mul v1.4S, v1.4S,v17.s[0] +add v6.4s, v6.4s, v3.4s +mla v15.4S, v28.4S, v31.s[0] +sub v28.4s, v2.4s, v12.4s +sqrdmulh v3.4S, v16.4S, v11.s[2] +add v2.4s, v2.4s, v12.4s +mla v24.4S, v29.4S, v31.s[0] +nop +sqrdmulh v29.4S, v21.4S, v11.s[2] +nop +mla v18.4S, v13.4S, v31.s[0] +nop +sqrdmulh v13.4S, v19.4S, v11.s[3] +nop +mla v1.4S, v30.4S, v31.s[0] +nop +sqrdmulh v30.4S, v20.4S, v11.s[3] +nop +mul v16.4S, v16.4S,v17.s[2] +sub v12.4s, v8.4s, v15.4s +mul v21.4S, v21.4S,v17.s[2] +add v8.4s, v8.4s, v15.4s +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v22.4s, v24.4s +mla v21.4S, v29.4S, v31.s[0] +add v22.4s, v22.4s, v24.4s +mul v19.4S, v19.4S,v17.s[3] +sub v24.4s, v26.4s, v18.4s +mul v20.4S, v20.4S,v17.s[3] +add v26.4s, v26.4s, v18.4s +mla v19.4S, v13.4S, v31.s[0] +sub v13.4s, v14.4s, v1.4s +mla v20.4S, v30.4S, v31.s[0] +add v14.4s, v14.4s, v1.4s +sqrdmulh v11.4S, 
v3.4S, v9.s[3] +nop +mul v3.4S, v3.4S,v0.s[3] +nop +sqrdmulh v17.4S, v22.4S, v9.s[2] +sub v1.4s, v6.4s, v16.4s +mul v22.4S, v22.4S,v0.s[2] +add v6.4s, v6.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v9.s[1] +sub v30.4s, v2.4s, v21.4s +mul v13.4S, v13.4S,v0.s[1] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v9.s[0] +sub v18.4s, v27.4s, v19.4s +mul v14.4S, v14.4S,v0.s[0] +add v27.4s, v27.4s, v19.4s +mla v3.4S, v11.4S, v31.s[0] +sub v11.4s, v28.4s, v20.4s +sqrdmulh v9.4S, v2.4S, v7.s[0] +add v28.4s, v28.4s, v20.4s +mla v22.4S, v17.4S, v31.s[0] +sub v17.4s, v12.4s, v3.4s +sqrdmulh v20.4S, v30.4S, v7.s[1] +add v12.4s, v12.4s, v3.4s +mla v13.4S, v16.4S, v31.s[0] +sub v16.4s, v8.4s, v22.4s +sqrdmulh v3.4S, v28.4S, v7.s[2] +add v8.4s, v8.4s, v22.4s +mla v14.4S, v21.4S, v31.s[0] +sub v21.4s, v24.4s, v13.4s +sqrdmulh v22.4S, v11.4S, v7.s[3] +add v24.4s, v24.4s, v13.4s +mul v2.4S, v2.4S,v23.s[0] +sub v13.4s, v26.4s, v14.4s +mul v30.4S, v30.4S,v23.s[1] +add v26.4s, v26.4s, v14.4s +mla v2.4S, v9.4S, v31.s[0] +str q16, [x0, #336] +mla v30.4S, v20.4S, v31.s[0] +str q8, [x0, #272] +mul v28.4S, v28.4S,v23.s[2] +str q17, [x0, #464] +mul v11.4S, v11.4S,v23.s[3] +str q12, [x0, #400] +mla v28.4S, v3.4S, v31.s[0] +str q21, [x0, #208] +mla v11.4S, v22.4S, v31.s[0] +str q24, [x0, #144] +str q13, [x0, #80] +str q26, [x0, #16] +sub v26.4s, v6.4s, v2.4s +str q26, [x0, #592] +add v6.4s, v6.4s, v2.4s +sub v2.4s, v1.4s, v30.4s +str q6, [x0, #528] +add v1.4s, v1.4s, v30.4s +sub v30.4s, v27.4s, v28.4s +str q2, [x0, #720] +add v27.4s, v27.4s, v28.4s +sub v28.4s, v18.4s, v11.4s +str q1, [x0, #656] +add v18.4s, v18.4s, v11.4s +str q30, [x0, #848] +str q27, [x0, #784] +str q28, [x0, #976] +str q18, [x0, #912] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q25, [x17, #+160] +ldr q10, [x17, #+176] +ldr q15, [x17, #+192] +ldr q29, [x17, #+208] +ldr q19, [x17, #+224] +ldr q0, [x17, #+240] +ldr q14, [x0, #32] +ldr q9, [x0, #48] +ldr q16, [x0, #0] +ldr q20, [x0, #16] +sqrdmulh v8.4S, v14.4S, v5.s[0] +mul 
v14.4S, v14.4S,v4.s[0] +mla v14.4S, v8.4S, v31.s[0] +sub v8.4s, v16.4s, v14.4s +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v9.4S, v5.s[0] +mul v9.4S, v9.4S,v4.s[0] +mla v9.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v9.4s +add v20.4s, v20.4s, v9.4s +sqrdmulh v9.4S, v20.4S, v5.s[1] +mul v20.4S, v20.4S,v4.s[1] +mla v20.4S, v9.4S, v31.s[0] +sub v9.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v14.4S, v5.s[2] +mul v14.4S, v14.4S,v4.s[2] +mla v14.4S, v20.4S, v31.s[0] +sub v20.4s, v8.4s, v14.4s +add v8.4s, v8.4s, v14.4s +trn1 v14.4S, v16.4S, v9.4S +trn2 v17.4S, v16.4S, v9.4S +trn1 v12.4S, v8.4S, v20.4S +trn2 v3.4S, v8.4S, v20.4S +trn2 v8.2D, v14.2D, v12.2D +trn2 v20.2D, v17.2D, v3.2D +trn1 v16.2D, v14.2D, v12.2D +trn1 v9.2D, v17.2D, v3.2D +sqrdmulh v3.4S, v8.4S, v10.4S +mul v8.4S, v8.4S,v25.4S +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v16.4s, v8.4s +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v20.4S, v10.4S +mul v20.4S, v20.4S,v25.4S +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v9.4s, v20.4s +add v9.4s, v9.4s, v20.4s +sqrdmulh v20.4S, v9.4S, v29.4S +mul v9.4S, v9.4S,v15.4S +mla v9.4S, v20.4S, v31.s[0] +sub v20.4s, v16.4s, v9.4s +add v16.4s, v16.4s, v9.4s +sqrdmulh v9.4S, v8.4S, v0.4S +mul v8.4S, v8.4S,v19.4S +mla v8.4S, v9.4S, v31.s[0] +sub v9.4s, v3.4s, v8.4s +add v3.4s, v3.4s, v8.4s +str q16, [x0, #0] +str q20, [x0, #16] +str q3, [x0, #32] +str q9, [x0, #48] +ldr q9, [x17, #+256] +ldr q3, [x17, #+272] +ldr q20, [x17, #+288] +ldr q16, [x17, #+304] +ldr q8, [x17, #+320] +ldr q17, [x17, #+336] +ldr q12, [x17, #+352] +ldr q14, [x17, #+368] +ldr q0, [x0, #96] +ldr q19, [x0, #112] +ldr q29, [x0, #64] +ldr q15, [x0, #80] +sqrdmulh v10.4S, v0.4S, v3.s[0] +mul v0.4S, v0.4S,v9.s[0] +mla v0.4S, v10.4S, v31.s[0] +sub v10.4s, v29.4s, v0.4s +add v29.4s, v29.4s, v0.4s +sqrdmulh v0.4S, v19.4S, v3.s[0] +mul v19.4S, v19.4S,v9.s[0] +mla v19.4S, v0.4S, v31.s[0] +sub v0.4s, v15.4s, v19.4s +add v15.4s, v15.4s, v19.4s +sqrdmulh v19.4S, v15.4S, v3.s[1] +mul v15.4S, v15.4S,v9.s[1] 
+mla v15.4S, v19.4S, v31.s[0] +sub v19.4s, v29.4s, v15.4s +add v29.4s, v29.4s, v15.4s +sqrdmulh v15.4S, v0.4S, v3.s[2] +mul v0.4S, v0.4S,v9.s[2] +mla v0.4S, v15.4S, v31.s[0] +sub v15.4s, v10.4s, v0.4s +add v10.4s, v10.4s, v0.4s +trn1 v0.4S, v29.4S, v19.4S +trn2 v25.4S, v29.4S, v19.4S +trn1 v5.4S, v10.4S, v15.4S +trn2 v4.4S, v10.4S, v15.4S +trn2 v10.2D, v0.2D, v5.2D +trn2 v15.2D, v25.2D, v4.2D +trn1 v29.2D, v0.2D, v5.2D +trn1 v19.2D, v25.2D, v4.2D +sqrdmulh v4.4S, v10.4S, v16.4S +mul v10.4S, v10.4S,v20.4S +mla v10.4S, v4.4S, v31.s[0] +sub v4.4s, v29.4s, v10.4s +add v29.4s, v29.4s, v10.4s +sqrdmulh v10.4S, v15.4S, v16.4S +mul v15.4S, v15.4S,v20.4S +mla v15.4S, v10.4S, v31.s[0] +sub v10.4s, v19.4s, v15.4s +add v19.4s, v19.4s, v15.4s +sqrdmulh v15.4S, v19.4S, v17.4S +mul v19.4S, v19.4S,v8.4S +mla v19.4S, v15.4S, v31.s[0] +sub v15.4s, v29.4s, v19.4s +add v29.4s, v29.4s, v19.4s +sqrdmulh v19.4S, v10.4S, v14.4S +mul v10.4S, v10.4S,v12.4S +mla v10.4S, v19.4S, v31.s[0] +sub v19.4s, v4.4s, v10.4s +add v4.4s, v4.4s, v10.4s +str q29, [x0, #64] +str q15, [x0, #80] +str q4, [x0, #96] +str q19, [x0, #112] +ldr q19, [x17, #+384] +ldr q4, [x17, #+400] +ldr q15, [x17, #+416] +ldr q29, [x17, #+432] +ldr q10, [x17, #+448] +ldr q25, [x17, #+464] +ldr q5, [x17, #+480] +ldr q0, [x17, #+496] +ldr q14, [x0, #160] +ldr q12, [x0, #176] +ldr q17, [x0, #128] +ldr q8, [x0, #144] +sqrdmulh v16.4S, v14.4S, v4.s[0] +mul v14.4S, v14.4S,v19.s[0] +mla v14.4S, v16.4S, v31.s[0] +sub v16.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +sqrdmulh v14.4S, v12.4S, v4.s[0] +mul v12.4S, v12.4S,v19.s[0] +mla v12.4S, v14.4S, v31.s[0] +sub v14.4s, v8.4s, v12.4s +add v8.4s, v8.4s, v12.4s +sqrdmulh v12.4S, v8.4S, v4.s[1] +mul v8.4S, v8.4S,v19.s[1] +mla v8.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v8.4s +add v17.4s, v17.4s, v8.4s +sqrdmulh v8.4S, v14.4S, v4.s[2] +mul v14.4S, v14.4S,v19.s[2] +mla v14.4S, v8.4S, v31.s[0] +sub v8.4s, v16.4s, v14.4s +add v16.4s, v16.4s, v14.4s +trn1 v14.4S, v17.4S, v12.4S +trn2 v20.4S, 
v17.4S, v12.4S +trn1 v3.4S, v16.4S, v8.4S +trn2 v9.4S, v16.4S, v8.4S +trn2 v16.2D, v14.2D, v3.2D +trn2 v8.2D, v20.2D, v9.2D +trn1 v17.2D, v14.2D, v3.2D +trn1 v12.2D, v20.2D, v9.2D +sqrdmulh v9.4S, v16.4S, v29.4S +mul v16.4S, v16.4S,v15.4S +mla v16.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v16.4s +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v29.4S +mul v8.4S, v8.4S,v15.4S +mla v8.4S, v16.4S, v31.s[0] +sub v16.4s, v12.4s, v8.4s +add v12.4s, v12.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v25.4S +mul v12.4S, v12.4S,v10.4S +mla v12.4S, v8.4S, v31.s[0] +sub v8.4s, v17.4s, v12.4s +add v17.4s, v17.4s, v12.4s +sqrdmulh v12.4S, v16.4S, v0.4S +mul v16.4S, v16.4S,v5.4S +mla v16.4S, v12.4S, v31.s[0] +sub v12.4s, v9.4s, v16.4s +add v9.4s, v9.4s, v16.4s +str q17, [x0, #128] +str q8, [x0, #144] +str q9, [x0, #160] +str q12, [x0, #176] +ldr q12, [x17, #+512] +ldr q9, [x17, #+528] +ldr q8, [x17, #+544] +ldr q17, [x17, #+560] +ldr q16, [x17, #+576] +ldr q20, [x17, #+592] +ldr q3, [x17, #+608] +ldr q14, [x17, #+624] +ldr q0, [x0, #224] +ldr q5, [x0, #240] +ldr q25, [x0, #192] +ldr q10, [x0, #208] +sqrdmulh v29.4S, v0.4S, v9.s[0] +mul v0.4S, v0.4S,v12.s[0] +mla v0.4S, v29.4S, v31.s[0] +sub v29.4s, v25.4s, v0.4s +add v25.4s, v25.4s, v0.4s +sqrdmulh v0.4S, v5.4S, v9.s[0] +mul v5.4S, v5.4S,v12.s[0] +mla v5.4S, v0.4S, v31.s[0] +sub v0.4s, v10.4s, v5.4s +add v10.4s, v10.4s, v5.4s +sqrdmulh v5.4S, v10.4S, v9.s[1] +mul v10.4S, v10.4S,v12.s[1] +mla v10.4S, v5.4S, v31.s[0] +sub v5.4s, v25.4s, v10.4s +add v25.4s, v25.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v9.s[2] +mul v0.4S, v0.4S,v12.s[2] +mla v0.4S, v10.4S, v31.s[0] +sub v10.4s, v29.4s, v0.4s +add v29.4s, v29.4s, v0.4s +trn1 v0.4S, v25.4S, v5.4S +trn2 v15.4S, v25.4S, v5.4S +trn1 v4.4S, v29.4S, v10.4S +trn2 v19.4S, v29.4S, v10.4S +trn2 v29.2D, v0.2D, v4.2D +trn2 v10.2D, v15.2D, v19.2D +trn1 v25.2D, v0.2D, v4.2D +trn1 v5.2D, v15.2D, v19.2D +sqrdmulh v19.4S, v29.4S, v17.4S +mul v29.4S, v29.4S,v8.4S +mla v29.4S, v19.4S, v31.s[0] +sub v19.4s, v25.4s, 
v29.4s +add v25.4s, v25.4s, v29.4s +sqrdmulh v29.4S, v10.4S, v17.4S +mul v10.4S, v10.4S,v8.4S +mla v10.4S, v29.4S, v31.s[0] +sub v29.4s, v5.4s, v10.4s +add v5.4s, v5.4s, v10.4s +sqrdmulh v10.4S, v5.4S, v20.4S +mul v5.4S, v5.4S,v16.4S +mla v5.4S, v10.4S, v31.s[0] +sub v10.4s, v25.4s, v5.4s +add v25.4s, v25.4s, v5.4s +sqrdmulh v5.4S, v29.4S, v14.4S +mul v29.4S, v29.4S,v3.4S +mla v29.4S, v5.4S, v31.s[0] +sub v5.4s, v19.4s, v29.4s +add v19.4s, v19.4s, v29.4s +str q25, [x0, #192] +str q10, [x0, #208] +str q19, [x0, #224] +str q5, [x0, #240] +ldr q5, [x17, #+640] +ldr q19, [x17, #+656] +ldr q10, [x17, #+672] +ldr q25, [x17, #+688] +ldr q29, [x17, #+704] +ldr q15, [x17, #+720] +ldr q4, [x17, #+736] +ldr q0, [x17, #+752] +ldr q14, [x0, #288] +ldr q3, [x0, #304] +ldr q20, [x0, #256] +ldr q16, [x0, #272] +sqrdmulh v17.4S, v14.4S, v19.s[0] +mul v14.4S, v14.4S,v5.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v14.4s +add v20.4s, v20.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v19.s[0] +mul v3.4S, v3.4S,v5.s[0] +mla v3.4S, v14.4S, v31.s[0] +sub v14.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v16.4S, v19.s[1] +mul v16.4S, v16.4S,v5.s[1] +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v16.4s +add v20.4s, v20.4s, v16.4s +sqrdmulh v16.4S, v14.4S, v19.s[2] +mul v14.4S, v14.4S,v5.s[2] +mla v14.4S, v16.4S, v31.s[0] +sub v16.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +trn1 v14.4S, v20.4S, v3.4S +trn2 v8.4S, v20.4S, v3.4S +trn1 v9.4S, v17.4S, v16.4S +trn2 v12.4S, v17.4S, v16.4S +trn2 v17.2D, v14.2D, v9.2D +trn2 v16.2D, v8.2D, v12.2D +trn1 v20.2D, v14.2D, v9.2D +trn1 v3.2D, v8.2D, v12.2D +sqrdmulh v12.4S, v17.4S, v25.4S +mul v17.4S, v17.4S,v10.4S +mla v17.4S, v12.4S, v31.s[0] +sub v12.4s, v20.4s, v17.4s +add v20.4s, v20.4s, v17.4s +sqrdmulh v17.4S, v16.4S, v25.4S +mul v16.4S, v16.4S,v10.4S +mla v16.4S, v17.4S, v31.s[0] +sub v17.4s, v3.4s, v16.4s +add v3.4s, v3.4s, v16.4s +sqrdmulh v16.4S, v3.4S, v15.4S +mul v3.4S, v3.4S,v29.4S +mla v3.4S, v16.4S, v31.s[0] +sub 
v16.4s, v20.4s, v3.4s +add v20.4s, v20.4s, v3.4s +sqrdmulh v3.4S, v17.4S, v0.4S +mul v17.4S, v17.4S,v4.4S +mla v17.4S, v3.4S, v31.s[0] +sub v3.4s, v12.4s, v17.4s +add v12.4s, v12.4s, v17.4s +str q20, [x0, #256] +str q16, [x0, #272] +str q12, [x0, #288] +str q3, [x0, #304] +ldr q3, [x17, #+768] +ldr q12, [x17, #+784] +ldr q16, [x17, #+800] +ldr q20, [x17, #+816] +ldr q17, [x17, #+832] +ldr q8, [x17, #+848] +ldr q9, [x17, #+864] +ldr q14, [x17, #+880] +ldr q0, [x0, #352] +ldr q4, [x0, #368] +ldr q15, [x0, #320] +ldr q29, [x0, #336] +sqrdmulh v25.4S, v0.4S, v12.s[0] +mul v0.4S, v0.4S,v3.s[0] +mla v0.4S, v25.4S, v31.s[0] +sub v25.4s, v15.4s, v0.4s +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v4.4S, v12.s[0] +mul v4.4S, v4.4S,v3.s[0] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v29.4s, v4.4s +add v29.4s, v29.4s, v4.4s +sqrdmulh v4.4S, v29.4S, v12.s[1] +mul v29.4S, v29.4S,v3.s[1] +mla v29.4S, v4.4S, v31.s[0] +sub v4.4s, v15.4s, v29.4s +add v15.4s, v15.4s, v29.4s +sqrdmulh v29.4S, v0.4S, v12.s[2] +mul v0.4S, v0.4S,v3.s[2] +mla v0.4S, v29.4S, v31.s[0] +sub v29.4s, v25.4s, v0.4s +add v25.4s, v25.4s, v0.4s +trn1 v0.4S, v15.4S, v4.4S +trn2 v10.4S, v15.4S, v4.4S +trn1 v19.4S, v25.4S, v29.4S +trn2 v5.4S, v25.4S, v29.4S +trn2 v25.2D, v0.2D, v19.2D +trn2 v29.2D, v10.2D, v5.2D +trn1 v15.2D, v0.2D, v19.2D +trn1 v4.2D, v10.2D, v5.2D +sqrdmulh v5.4S, v25.4S, v20.4S +mul v25.4S, v25.4S,v16.4S +mla v25.4S, v5.4S, v31.s[0] +sub v5.4s, v15.4s, v25.4s +add v15.4s, v15.4s, v25.4s +sqrdmulh v25.4S, v29.4S, v20.4S +mul v29.4S, v29.4S,v16.4S +mla v29.4S, v25.4S, v31.s[0] +sub v25.4s, v4.4s, v29.4s +add v4.4s, v4.4s, v29.4s +sqrdmulh v29.4S, v4.4S, v8.4S +mul v4.4S, v4.4S,v17.4S +mla v4.4S, v29.4S, v31.s[0] +sub v29.4s, v15.4s, v4.4s +add v15.4s, v15.4s, v4.4s +sqrdmulh v4.4S, v25.4S, v14.4S +mul v25.4S, v25.4S,v9.4S +mla v25.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v25.4s +add v5.4s, v5.4s, v25.4s +str q15, [x0, #320] +str q29, [x0, #336] +str q5, [x0, #352] +str q4, [x0, #368] +ldr q4, [x17, #+896] 
+ldr q5, [x17, #+912] +ldr q29, [x17, #+928] +ldr q15, [x17, #+944] +ldr q25, [x17, #+960] +ldr q10, [x17, #+976] +ldr q19, [x17, #+992] +ldr q0, [x17, #+1008] +ldr q14, [x0, #416] +ldr q9, [x0, #432] +ldr q8, [x0, #384] +ldr q17, [x0, #400] +sqrdmulh v20.4S, v14.4S, v5.s[0] +mul v14.4S, v14.4S,v4.s[0] +mla v14.4S, v20.4S, v31.s[0] +sub v20.4s, v8.4s, v14.4s +add v8.4s, v8.4s, v14.4s +sqrdmulh v14.4S, v9.4S, v5.s[0] +mul v9.4S, v9.4S,v4.s[0] +mla v9.4S, v14.4S, v31.s[0] +sub v14.4s, v17.4s, v9.4s +add v17.4s, v17.4s, v9.4s +sqrdmulh v9.4S, v17.4S, v5.s[1] +mul v17.4S, v17.4S,v4.s[1] +mla v17.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v17.4s +add v8.4s, v8.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v5.s[2] +mul v14.4S, v14.4S,v4.s[2] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v14.4s +add v20.4s, v20.4s, v14.4s +trn1 v14.4S, v8.4S, v9.4S +trn2 v16.4S, v8.4S, v9.4S +trn1 v12.4S, v20.4S, v17.4S +trn2 v3.4S, v20.4S, v17.4S +trn2 v20.2D, v14.2D, v12.2D +trn2 v17.2D, v16.2D, v3.2D +trn1 v8.2D, v14.2D, v12.2D +trn1 v9.2D, v16.2D, v3.2D +sqrdmulh v3.4S, v20.4S, v15.4S +mul v20.4S, v20.4S,v29.4S +mla v20.4S, v3.4S, v31.s[0] +sub v3.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v15.4S +mul v17.4S, v17.4S,v29.4S +mla v17.4S, v20.4S, v31.s[0] +sub v20.4s, v9.4s, v17.4s +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v9.4S, v10.4S +mul v9.4S, v9.4S,v25.4S +mla v9.4S, v17.4S, v31.s[0] +sub v17.4s, v8.4s, v9.4s +add v8.4s, v8.4s, v9.4s +sqrdmulh v9.4S, v20.4S, v0.4S +mul v20.4S, v20.4S,v19.4S +mla v20.4S, v9.4S, v31.s[0] +sub v9.4s, v3.4s, v20.4s +add v3.4s, v3.4s, v20.4s +str q8, [x0, #384] +str q17, [x0, #400] +str q3, [x0, #416] +str q9, [x0, #432] +ldr q9, [x17, #+1024] +ldr q3, [x17, #+1040] +ldr q17, [x17, #+1056] +ldr q8, [x17, #+1072] +ldr q20, [x17, #+1088] +ldr q16, [x17, #+1104] +ldr q12, [x17, #+1120] +ldr q14, [x17, #+1136] +ldr q0, [x0, #480] +ldr q19, [x0, #496] +ldr q10, [x0, #448] +ldr q25, [x0, #464] +sqrdmulh v15.4S, v0.4S, v3.s[0] +mul v0.4S, 
v0.4S,v9.s[0] +mla v0.4S, v15.4S, v31.s[0] +sub v15.4s, v10.4s, v0.4s +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v19.4S, v3.s[0] +mul v19.4S, v19.4S,v9.s[0] +mla v19.4S, v0.4S, v31.s[0] +sub v0.4s, v25.4s, v19.4s +add v25.4s, v25.4s, v19.4s +sqrdmulh v19.4S, v25.4S, v3.s[1] +mul v25.4S, v25.4S,v9.s[1] +mla v25.4S, v19.4S, v31.s[0] +sub v19.4s, v10.4s, v25.4s +add v10.4s, v10.4s, v25.4s +sqrdmulh v25.4S, v0.4S, v3.s[2] +mul v0.4S, v0.4S,v9.s[2] +mla v0.4S, v25.4S, v31.s[0] +sub v25.4s, v15.4s, v0.4s +add v15.4s, v15.4s, v0.4s +trn1 v0.4S, v10.4S, v19.4S +trn2 v29.4S, v10.4S, v19.4S +trn1 v5.4S, v15.4S, v25.4S +trn2 v4.4S, v15.4S, v25.4S +trn2 v15.2D, v0.2D, v5.2D +trn2 v25.2D, v29.2D, v4.2D +trn1 v10.2D, v0.2D, v5.2D +trn1 v19.2D, v29.2D, v4.2D +sqrdmulh v4.4S, v15.4S, v8.4S +mul v15.4S, v15.4S,v17.4S +mla v15.4S, v4.4S, v31.s[0] +sub v4.4s, v10.4s, v15.4s +add v10.4s, v10.4s, v15.4s +sqrdmulh v15.4S, v25.4S, v8.4S +mul v25.4S, v25.4S,v17.4S +mla v25.4S, v15.4S, v31.s[0] +sub v15.4s, v19.4s, v25.4s +add v19.4s, v19.4s, v25.4s +sqrdmulh v25.4S, v19.4S, v16.4S +mul v19.4S, v19.4S,v20.4S +mla v19.4S, v25.4S, v31.s[0] +sub v25.4s, v10.4s, v19.4s +add v10.4s, v10.4s, v19.4s +sqrdmulh v19.4S, v15.4S, v14.4S +mul v15.4S, v15.4S,v12.4S +mla v15.4S, v19.4S, v31.s[0] +sub v19.4s, v4.4s, v15.4s +add v4.4s, v4.4s, v15.4s +str q10, [x0, #448] +str q25, [x0, #464] +str q4, [x0, #480] +str q19, [x0, #496] +ldr q19, [x17, #+1152] +ldr q4, [x17, #+1168] +ldr q25, [x17, #+1184] +ldr q10, [x17, #+1200] +ldr q15, [x17, #+1216] +ldr q29, [x17, #+1232] +ldr q5, [x17, #+1248] +ldr q0, [x17, #+1264] +ldr q14, [x0, #544] +ldr q12, [x0, #560] +ldr q16, [x0, #512] +ldr q20, [x0, #528] +sqrdmulh v8.4S, v14.4S, v4.s[0] +mul v14.4S, v14.4S,v19.s[0] +mla v14.4S, v8.4S, v31.s[0] +sub v8.4s, v16.4s, v14.4s +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v12.4S, v4.s[0] +mul v12.4S, v12.4S,v19.s[0] +mla v12.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +sqrdmulh v12.4S, 
v20.4S, v4.s[1] +mul v20.4S, v20.4S,v19.s[1] +mla v20.4S, v12.4S, v31.s[0] +sub v12.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v14.4S, v4.s[2] +mul v14.4S, v14.4S,v19.s[2] +mla v14.4S, v20.4S, v31.s[0] +sub v20.4s, v8.4s, v14.4s +add v8.4s, v8.4s, v14.4s +trn1 v14.4S, v16.4S, v12.4S +trn2 v17.4S, v16.4S, v12.4S +trn1 v3.4S, v8.4S, v20.4S +trn2 v9.4S, v8.4S, v20.4S +trn2 v8.2D, v14.2D, v3.2D +trn2 v20.2D, v17.2D, v9.2D +trn1 v16.2D, v14.2D, v3.2D +trn1 v12.2D, v17.2D, v9.2D +sqrdmulh v9.4S, v8.4S, v10.4S +mul v8.4S, v8.4S,v25.4S +mla v8.4S, v9.4S, v31.s[0] +sub v9.4s, v16.4s, v8.4s +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v20.4S, v10.4S +mul v20.4S, v20.4S,v25.4S +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v12.4s, v20.4s +add v12.4s, v12.4s, v20.4s +sqrdmulh v20.4S, v12.4S, v29.4S +mul v12.4S, v12.4S,v15.4S +mla v12.4S, v20.4S, v31.s[0] +sub v20.4s, v16.4s, v12.4s +add v16.4s, v16.4s, v12.4s +sqrdmulh v12.4S, v8.4S, v0.4S +mul v8.4S, v8.4S,v5.4S +mla v8.4S, v12.4S, v31.s[0] +sub v12.4s, v9.4s, v8.4s +add v9.4s, v9.4s, v8.4s +str q16, [x0, #512] +str q20, [x0, #528] +str q9, [x0, #544] +str q12, [x0, #560] +ldr q12, [x17, #+1280] +ldr q9, [x17, #+1296] +ldr q20, [x17, #+1312] +ldr q16, [x17, #+1328] +ldr q8, [x17, #+1344] +ldr q17, [x17, #+1360] +ldr q3, [x17, #+1376] +ldr q14, [x17, #+1392] +ldr q0, [x0, #608] +ldr q5, [x0, #624] +ldr q29, [x0, #576] +ldr q15, [x0, #592] +sqrdmulh v10.4S, v0.4S, v9.s[0] +mul v0.4S, v0.4S,v12.s[0] +mla v0.4S, v10.4S, v31.s[0] +sub v10.4s, v29.4s, v0.4s +add v29.4s, v29.4s, v0.4s +sqrdmulh v0.4S, v5.4S, v9.s[0] +mul v5.4S, v5.4S,v12.s[0] +mla v5.4S, v0.4S, v31.s[0] +sub v0.4s, v15.4s, v5.4s +add v15.4s, v15.4s, v5.4s +sqrdmulh v5.4S, v15.4S, v9.s[1] +mul v15.4S, v15.4S,v12.s[1] +mla v15.4S, v5.4S, v31.s[0] +sub v5.4s, v29.4s, v15.4s +add v29.4s, v29.4s, v15.4s +sqrdmulh v15.4S, v0.4S, v9.s[2] +mul v0.4S, v0.4S,v12.s[2] +mla v0.4S, v15.4S, v31.s[0] +sub v15.4s, v10.4s, v0.4s +add v10.4s, v10.4s, v0.4s +trn1 v0.4S, 
v29.4S, v5.4S +trn2 v25.4S, v29.4S, v5.4S +trn1 v4.4S, v10.4S, v15.4S +trn2 v19.4S, v10.4S, v15.4S +trn2 v10.2D, v0.2D, v4.2D +trn2 v15.2D, v25.2D, v19.2D +trn1 v29.2D, v0.2D, v4.2D +trn1 v5.2D, v25.2D, v19.2D +sqrdmulh v19.4S, v10.4S, v16.4S +mul v10.4S, v10.4S,v20.4S +mla v10.4S, v19.4S, v31.s[0] +sub v19.4s, v29.4s, v10.4s +add v29.4s, v29.4s, v10.4s +sqrdmulh v10.4S, v15.4S, v16.4S +mul v15.4S, v15.4S,v20.4S +mla v15.4S, v10.4S, v31.s[0] +sub v10.4s, v5.4s, v15.4s +add v5.4s, v5.4s, v15.4s +sqrdmulh v15.4S, v5.4S, v17.4S +mul v5.4S, v5.4S,v8.4S +mla v5.4S, v15.4S, v31.s[0] +sub v15.4s, v29.4s, v5.4s +add v29.4s, v29.4s, v5.4s +sqrdmulh v5.4S, v10.4S, v14.4S +mul v10.4S, v10.4S,v3.4S +mla v10.4S, v5.4S, v31.s[0] +sub v5.4s, v19.4s, v10.4s +add v19.4s, v19.4s, v10.4s +str q29, [x0, #576] +str q15, [x0, #592] +str q19, [x0, #608] +str q5, [x0, #624] +ldr q5, [x17, #+1408] +ldr q19, [x17, #+1424] +ldr q15, [x17, #+1440] +ldr q29, [x17, #+1456] +ldr q10, [x17, #+1472] +ldr q25, [x17, #+1488] +ldr q4, [x17, #+1504] +ldr q0, [x17, #+1520] +ldr q14, [x0, #672] +ldr q3, [x0, #688] +ldr q17, [x0, #640] +ldr q8, [x0, #656] +sqrdmulh v16.4S, v14.4S, v19.s[0] +mul v14.4S, v14.4S,v5.s[0] +mla v14.4S, v16.4S, v31.s[0] +sub v16.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v19.s[0] +mul v3.4S, v3.4S,v5.s[0] +mla v3.4S, v14.4S, v31.s[0] +sub v14.4s, v8.4s, v3.4s +add v8.4s, v8.4s, v3.4s +sqrdmulh v3.4S, v8.4S, v19.s[1] +mul v8.4S, v8.4S,v5.s[1] +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v17.4s, v8.4s +add v17.4s, v17.4s, v8.4s +sqrdmulh v8.4S, v14.4S, v19.s[2] +mul v14.4S, v14.4S,v5.s[2] +mla v14.4S, v8.4S, v31.s[0] +sub v8.4s, v16.4s, v14.4s +add v16.4s, v16.4s, v14.4s +trn1 v14.4S, v17.4S, v3.4S +trn2 v20.4S, v17.4S, v3.4S +trn1 v9.4S, v16.4S, v8.4S +trn2 v12.4S, v16.4S, v8.4S +trn2 v16.2D, v14.2D, v9.2D +trn2 v8.2D, v20.2D, v12.2D +trn1 v17.2D, v14.2D, v9.2D +trn1 v3.2D, v20.2D, v12.2D +sqrdmulh v12.4S, v16.4S, v29.4S +mul v16.4S, v16.4S,v15.4S +mla 
v16.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v16.4s +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v29.4S +mul v8.4S, v8.4S,v15.4S +mla v8.4S, v16.4S, v31.s[0] +sub v16.4s, v3.4s, v8.4s +add v3.4s, v3.4s, v8.4s +sqrdmulh v8.4S, v3.4S, v25.4S +mul v3.4S, v3.4S,v10.4S +mla v3.4S, v8.4S, v31.s[0] +sub v8.4s, v17.4s, v3.4s +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v16.4S, v0.4S +mul v16.4S, v16.4S,v4.4S +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v12.4s, v16.4s +add v12.4s, v12.4s, v16.4s +str q17, [x0, #640] +str q8, [x0, #656] +str q12, [x0, #672] +str q3, [x0, #688] +ldr q3, [x17, #+1536] +ldr q12, [x17, #+1552] +ldr q8, [x17, #+1568] +ldr q17, [x17, #+1584] +ldr q16, [x17, #+1600] +ldr q20, [x17, #+1616] +ldr q9, [x17, #+1632] +ldr q14, [x17, #+1648] +ldr q0, [x0, #736] +ldr q4, [x0, #752] +ldr q25, [x0, #704] +ldr q10, [x0, #720] +sqrdmulh v29.4S, v0.4S, v12.s[0] +mul v0.4S, v0.4S,v3.s[0] +mla v0.4S, v29.4S, v31.s[0] +sub v29.4s, v25.4s, v0.4s +add v25.4s, v25.4s, v0.4s +sqrdmulh v0.4S, v4.4S, v12.s[0] +mul v4.4S, v4.4S,v3.s[0] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v10.4s, v4.4s +add v10.4s, v10.4s, v4.4s +sqrdmulh v4.4S, v10.4S, v12.s[1] +mul v10.4S, v10.4S,v3.s[1] +mla v10.4S, v4.4S, v31.s[0] +sub v4.4s, v25.4s, v10.4s +add v25.4s, v25.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v12.s[2] +mul v0.4S, v0.4S,v3.s[2] +mla v0.4S, v10.4S, v31.s[0] +sub v10.4s, v29.4s, v0.4s +add v29.4s, v29.4s, v0.4s +trn1 v0.4S, v25.4S, v4.4S +trn2 v15.4S, v25.4S, v4.4S +trn1 v19.4S, v29.4S, v10.4S +trn2 v5.4S, v29.4S, v10.4S +trn2 v29.2D, v0.2D, v19.2D +trn2 v10.2D, v15.2D, v5.2D +trn1 v25.2D, v0.2D, v19.2D +trn1 v4.2D, v15.2D, v5.2D +sqrdmulh v5.4S, v29.4S, v17.4S +mul v29.4S, v29.4S,v8.4S +mla v29.4S, v5.4S, v31.s[0] +sub v5.4s, v25.4s, v29.4s +add v25.4s, v25.4s, v29.4s +sqrdmulh v29.4S, v10.4S, v17.4S +mul v10.4S, v10.4S,v8.4S +mla v10.4S, v29.4S, v31.s[0] +sub v29.4s, v4.4s, v10.4s +add v4.4s, v4.4s, v10.4s +sqrdmulh v10.4S, v4.4S, v20.4S +mul v4.4S, v4.4S,v16.4S +mla v4.4S, 
v10.4S, v31.s[0] +sub v10.4s, v25.4s, v4.4s +add v25.4s, v25.4s, v4.4s +sqrdmulh v4.4S, v29.4S, v14.4S +mul v29.4S, v29.4S,v9.4S +mla v29.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v29.4s +add v5.4s, v5.4s, v29.4s +str q25, [x0, #704] +str q10, [x0, #720] +str q5, [x0, #736] +str q4, [x0, #752] +ldr q4, [x17, #+1664] +ldr q5, [x17, #+1680] +ldr q10, [x17, #+1696] +ldr q25, [x17, #+1712] +ldr q29, [x17, #+1728] +ldr q15, [x17, #+1744] +ldr q19, [x17, #+1760] +ldr q0, [x17, #+1776] +ldr q14, [x0, #800] +ldr q9, [x0, #816] +ldr q20, [x0, #768] +ldr q16, [x0, #784] +sqrdmulh v17.4S, v14.4S, v5.s[0] +mul v14.4S, v14.4S,v4.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v14.4s +add v20.4s, v20.4s, v14.4s +sqrdmulh v14.4S, v9.4S, v5.s[0] +mul v9.4S, v9.4S,v4.s[0] +mla v9.4S, v14.4S, v31.s[0] +sub v14.4s, v16.4s, v9.4s +add v16.4s, v16.4s, v9.4s +sqrdmulh v9.4S, v16.4S, v5.s[1] +mul v16.4S, v16.4S,v4.s[1] +mla v16.4S, v9.4S, v31.s[0] +sub v9.4s, v20.4s, v16.4s +add v20.4s, v20.4s, v16.4s +sqrdmulh v16.4S, v14.4S, v5.s[2] +mul v14.4S, v14.4S,v4.s[2] +mla v14.4S, v16.4S, v31.s[0] +sub v16.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +trn1 v14.4S, v20.4S, v9.4S +trn2 v8.4S, v20.4S, v9.4S +trn1 v12.4S, v17.4S, v16.4S +trn2 v3.4S, v17.4S, v16.4S +trn2 v17.2D, v14.2D, v12.2D +trn2 v16.2D, v8.2D, v3.2D +trn1 v20.2D, v14.2D, v12.2D +trn1 v9.2D, v8.2D, v3.2D +sqrdmulh v3.4S, v17.4S, v25.4S +mul v17.4S, v17.4S,v10.4S +mla v17.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v17.4s +add v20.4s, v20.4s, v17.4s +sqrdmulh v17.4S, v16.4S, v25.4S +mul v16.4S, v16.4S,v10.4S +mla v16.4S, v17.4S, v31.s[0] +sub v17.4s, v9.4s, v16.4s +add v9.4s, v9.4s, v16.4s +sqrdmulh v16.4S, v9.4S, v15.4S +mul v9.4S, v9.4S,v29.4S +mla v9.4S, v16.4S, v31.s[0] +sub v16.4s, v20.4s, v9.4s +add v20.4s, v20.4s, v9.4s +sqrdmulh v9.4S, v17.4S, v0.4S +mul v17.4S, v17.4S,v19.4S +mla v17.4S, v9.4S, v31.s[0] +sub v9.4s, v3.4s, v17.4s +add v3.4s, v3.4s, v17.4s +str q20, [x0, #768] +str q16, [x0, #784] +str q3, [x0, #800] 
+str q9, [x0, #816] +ldr q9, [x17, #+1792] +ldr q3, [x17, #+1808] +ldr q16, [x17, #+1824] +ldr q20, [x17, #+1840] +ldr q17, [x17, #+1856] +ldr q8, [x17, #+1872] +ldr q12, [x17, #+1888] +ldr q14, [x17, #+1904] +ldr q0, [x0, #864] +ldr q19, [x0, #880] +ldr q15, [x0, #832] +ldr q29, [x0, #848] +sqrdmulh v25.4S, v0.4S, v3.s[0] +mul v0.4S, v0.4S,v9.s[0] +mla v0.4S, v25.4S, v31.s[0] +sub v25.4s, v15.4s, v0.4s +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v19.4S, v3.s[0] +mul v19.4S, v19.4S,v9.s[0] +mla v19.4S, v0.4S, v31.s[0] +sub v0.4s, v29.4s, v19.4s +add v29.4s, v29.4s, v19.4s +sqrdmulh v19.4S, v29.4S, v3.s[1] +mul v29.4S, v29.4S,v9.s[1] +mla v29.4S, v19.4S, v31.s[0] +sub v19.4s, v15.4s, v29.4s +add v15.4s, v15.4s, v29.4s +sqrdmulh v29.4S, v0.4S, v3.s[2] +mul v0.4S, v0.4S,v9.s[2] +mla v0.4S, v29.4S, v31.s[0] +sub v29.4s, v25.4s, v0.4s +add v25.4s, v25.4s, v0.4s +trn1 v0.4S, v15.4S, v19.4S +trn2 v10.4S, v15.4S, v19.4S +trn1 v5.4S, v25.4S, v29.4S +trn2 v4.4S, v25.4S, v29.4S +trn2 v25.2D, v0.2D, v5.2D +trn2 v29.2D, v10.2D, v4.2D +trn1 v15.2D, v0.2D, v5.2D +trn1 v19.2D, v10.2D, v4.2D +sqrdmulh v4.4S, v25.4S, v20.4S +mul v25.4S, v25.4S,v16.4S +mla v25.4S, v4.4S, v31.s[0] +sub v4.4s, v15.4s, v25.4s +add v15.4s, v15.4s, v25.4s +sqrdmulh v25.4S, v29.4S, v20.4S +mul v29.4S, v29.4S,v16.4S +mla v29.4S, v25.4S, v31.s[0] +sub v25.4s, v19.4s, v29.4s +add v19.4s, v19.4s, v29.4s +sqrdmulh v29.4S, v19.4S, v8.4S +mul v19.4S, v19.4S,v17.4S +mla v19.4S, v29.4S, v31.s[0] +sub v29.4s, v15.4s, v19.4s +add v15.4s, v15.4s, v19.4s +sqrdmulh v19.4S, v25.4S, v14.4S +mul v25.4S, v25.4S,v12.4S +mla v25.4S, v19.4S, v31.s[0] +sub v19.4s, v4.4s, v25.4s +add v4.4s, v4.4s, v25.4s +str q15, [x0, #832] +str q29, [x0, #848] +str q4, [x0, #864] +str q19, [x0, #880] +ldr q19, [x17, #+1920] +ldr q4, [x17, #+1936] +ldr q29, [x17, #+1952] +ldr q15, [x17, #+1968] +ldr q25, [x17, #+1984] +ldr q10, [x17, #+2000] +ldr q5, [x17, #+2016] +ldr q0, [x17, #+2032] +ldr q14, [x0, #928] +ldr q12, [x0, #944] +ldr q8, [x0, 
#896] +ldr q17, [x0, #912] +sqrdmulh v20.4S, v14.4S, v4.s[0] +mul v14.4S, v14.4S,v19.s[0] +mla v14.4S, v20.4S, v31.s[0] +sub v20.4s, v8.4s, v14.4s +add v8.4s, v8.4s, v14.4s +sqrdmulh v14.4S, v12.4S, v4.s[0] +mul v12.4S, v12.4S,v19.s[0] +mla v12.4S, v14.4S, v31.s[0] +sub v14.4s, v17.4s, v12.4s +add v17.4s, v17.4s, v12.4s +sqrdmulh v12.4S, v17.4S, v4.s[1] +mul v17.4S, v17.4S,v19.s[1] +mla v17.4S, v12.4S, v31.s[0] +sub v12.4s, v8.4s, v17.4s +add v8.4s, v8.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v4.s[2] +mul v14.4S, v14.4S,v19.s[2] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v14.4s +add v20.4s, v20.4s, v14.4s +trn1 v14.4S, v8.4S, v12.4S +trn2 v16.4S, v8.4S, v12.4S +trn1 v3.4S, v20.4S, v17.4S +trn2 v9.4S, v20.4S, v17.4S +trn2 v20.2D, v14.2D, v3.2D +trn2 v17.2D, v16.2D, v9.2D +trn1 v8.2D, v14.2D, v3.2D +trn1 v12.2D, v16.2D, v9.2D +sqrdmulh v9.4S, v20.4S, v15.4S +mul v20.4S, v20.4S,v29.4S +mla v20.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v15.4S +mul v17.4S, v17.4S,v29.4S +mla v17.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v17.4s +add v12.4s, v12.4s, v17.4s +sqrdmulh v17.4S, v12.4S, v10.4S +mul v12.4S, v12.4S,v25.4S +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v8.4s, v12.4s +add v8.4s, v8.4s, v12.4s +sqrdmulh v12.4S, v20.4S, v0.4S +mul v20.4S, v20.4S,v5.4S +mla v20.4S, v12.4S, v31.s[0] +sub v12.4s, v9.4s, v20.4s +add v9.4s, v9.4s, v20.4s +str q8, [x0, #896] +str q17, [x0, #912] +str q9, [x0, #928] +str q12, [x0, #944] +ldr q12, [x17, #+2048] +ldr q9, [x17, #+2064] +ldr q17, [x17, #+2080] +ldr q8, [x17, #+2096] +ldr q20, [x17, #+2112] +ldr q16, [x17, #+2128] +ldr q3, [x17, #+2144] +ldr q14, [x17, #+2160] +ldr q0, [x0, #992] +ldr q5, [x0, #1008] +ldr q10, [x0, #960] +ldr q25, [x0, #976] +sqrdmulh v15.4S, v0.4S, v9.s[0] +mul v0.4S, v0.4S,v12.s[0] +mla v0.4S, v15.4S, v31.s[0] +sub v15.4s, v10.4s, v0.4s +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v5.4S, v9.s[0] +mul v5.4S, v5.4S,v12.s[0] +mla v5.4S, v0.4S, v31.s[0] +sub 
v0.4s, v25.4s, v5.4s +add v25.4s, v25.4s, v5.4s +sqrdmulh v5.4S, v25.4S, v9.s[1] +mul v25.4S, v25.4S,v12.s[1] +mla v25.4S, v5.4S, v31.s[0] +sub v5.4s, v10.4s, v25.4s +add v10.4s, v10.4s, v25.4s +sqrdmulh v25.4S, v0.4S, v9.s[2] +mul v0.4S, v0.4S,v12.s[2] +mla v0.4S, v25.4S, v31.s[0] +sub v25.4s, v15.4s, v0.4s +add v15.4s, v15.4s, v0.4s +trn1 v0.4S, v10.4S, v5.4S +trn2 v29.4S, v10.4S, v5.4S +trn1 v4.4S, v15.4S, v25.4S +trn2 v19.4S, v15.4S, v25.4S +trn2 v15.2D, v0.2D, v4.2D +trn2 v25.2D, v29.2D, v19.2D +trn1 v10.2D, v0.2D, v4.2D +trn1 v5.2D, v29.2D, v19.2D +sqrdmulh v19.4S, v15.4S, v8.4S +mul v15.4S, v15.4S,v17.4S +mla v15.4S, v19.4S, v31.s[0] +sub v19.4s, v10.4s, v15.4s +add v10.4s, v10.4s, v15.4s +sqrdmulh v15.4S, v25.4S, v8.4S +mul v25.4S, v25.4S,v17.4S +mla v25.4S, v15.4S, v31.s[0] +sub v15.4s, v5.4s, v25.4s +add v5.4s, v5.4s, v25.4s +sqrdmulh v25.4S, v5.4S, v16.4S +mul v5.4S, v5.4S,v20.4S +mla v5.4S, v25.4S, v31.s[0] +sub v25.4s, v10.4s, v5.4s +add v10.4s, v10.4s, v5.4s +sqrdmulh v5.4S, v15.4S, v14.4S +mul v15.4S, v15.4S,v3.4S +mla v15.4S, v5.4S, v31.s[0] +sub v5.4s, v19.4s, v15.4s +add v19.4s, v19.4s, v15.4s +str q10, [x0, #960] +str q25, [x0, #976] +str q19, [x0, #992] +str q5, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2476 +// Instruction count: 2472 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_15_0.s b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_15_0.s new file mode 100644 index 0000000..eda1068 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_15_0.s @@ -0,0 +1,2506 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// 
SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// +#include +modulus: // lane 0 holds -33556993 (the negated modulus); the routine below loads this quad into v31 and uses v31.s[0] as the multiplier of every mla; lanes 1-3 are zero +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: // twiddle table laid out in quads: four roots (consumed by mul) followed by four companion words with the same layer/block labels (consumed by sqrdmulh), each about root * 2^31 / 33556993, e.g. 17702291 -> 1132860160; entries labelled "Layer None" are presumably unused padding lanes +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word
1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 
6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 
+.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, 
block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 
31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 
26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 
+.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // 
Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 
28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 
608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_15_0 +.global _ntt_u32_full_neon_asm_var_4_4_15_0 +ntt_u32_full_neon_asm_var_4_4_15_0: +_ntt_u32_full_neon_asm_var_4_4_15_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x0, #992] +ldr q29, [x17, #+0] +ldr q28, [x17, #+16] +sqrdmulh v27.4S, v30.4S, v28.s[0] +mul v30.4S, v30.4S,v29.s[0] +ldr q26, [x0, #928] +sqrdmulh v25.4S, v26.4S, v28.s[0] +mul v26.4S, v26.4S,v29.s[0] +ldr q24, [x0, #864] +sqrdmulh v23.4S, v24.4S, v28.s[0] +mul v24.4S, v24.4S,v29.s[0] +ldr q22, [x0, #800] +sqrdmulh v21.4S, v22.4S, v28.s[0] +mul v22.4S, v22.4S,v29.s[0] +ldr q20, [x0, #736] +mla v30.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v20.4S, v28.s[0] +ldr q19, [x0, #672] +mla v26.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v19.4S, v28.s[0] +nop +ldr q18, [x0, #608] +mla v24.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v18.4S, v28.s[0] +nop +ldr q17, [x0, #544] +mla v22.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v17.4S, v28.s[0] +nop +ldr q16, [x0, #480] +ldr q3, [x0, #416] +mul v20.4S, v20.4S,v29.s[0] +sub v2.4s, v16.4s, v30.4s +mul v19.4S, v19.4S,v29.s[0] +add v16.4s, v16.4s, v30.4s +ldr q30, [x0, #352] +ldr q1, [x0, #288] +mla v20.4S, v27.4S, 
v31.s[0] +sub v27.4s, v3.4s, v26.4s +mla v19.4S, v25.4S, v31.s[0] +add v3.4s, v3.4s, v26.4s +ldr q26, [x0, #224] +ldr q25, [x0, #160] +mul v18.4S, v18.4S,v29.s[0] +sub v0.4s, v30.4s, v24.4s +mul v17.4S, v17.4S,v29.s[0] +add v30.4s, v30.4s, v24.4s +ldr q24, [x0, #96] +ldr q15, [x0, #32] +mla v18.4S, v23.4S, v31.s[0] +sub v23.4s, v1.4s, v22.4s +mla v17.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v2.4S, v28.s[2] +nop +mul v2.4S, v2.4S,v29.s[2] +nop +sqrdmulh v21.4S, v27.4S, v28.s[2] +sub v14.4s, v26.4s, v20.4s +mul v27.4S, v27.4S,v29.s[2] +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v28.s[2] +sub v13.4s, v25.4s, v19.4s +mul v0.4S, v0.4S,v29.s[2] +add v25.4s, v25.4s, v19.4s +sqrdmulh v19.4S, v23.4S, v28.s[2] +sub v12.4s, v24.4s, v18.4s +mul v23.4S, v23.4S,v29.s[2] +add v24.4s, v24.4s, v18.4s +mla v2.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v17.4s +sqrdmulh v18.4S, v16.4S, v28.s[1] +add v15.4s, v15.4s, v17.4s +mla v27.4S, v21.4S, v31.s[0] +nop +sqrdmulh v21.4S, v3.4S, v28.s[1] +nop +mla v0.4S, v20.4S, v31.s[0] +nop +sqrdmulh v20.4S, v30.4S, v28.s[1] +nop +mla v23.4S, v19.4S, v31.s[0] +nop +sqrdmulh v19.4S, v1.4S, v28.s[1] +nop +ldr q17, [x17, #+32] +ldr q11, [x17, #+48] +mul v16.4S, v16.4S,v29.s[1] +sub v10.4s, v14.4s, v2.4s +mul v3.4S, v3.4S,v29.s[1] +add v14.4s, v14.4s, v2.4s +mla v16.4S, v18.4S, v31.s[0] +sub v18.4s, v13.4s, v27.4s +mla v3.4S, v21.4S, v31.s[0] +add v13.4s, v13.4s, v27.4s +mul v30.4S, v30.4S,v29.s[1] +sub v27.4s, v12.4s, v0.4s +mul v1.4S, v1.4S,v29.s[1] +add v12.4s, v12.4s, v0.4s +mla v30.4S, v20.4S, v31.s[0] +sub v20.4s, v22.4s, v23.4s +mla v1.4S, v19.4S, v31.s[0] +add v22.4s, v22.4s, v23.4s +sqrdmulh v23.4S, v10.4S, v11.s[3] +nop +mul v10.4S, v10.4S,v17.s[3] +nop +sqrdmulh v19.4S, v18.4S, v11.s[3] +sub v0.4s, v26.4s, v16.4s +mul v18.4S, v18.4S,v17.s[3] +add v26.4s, v26.4s, v16.4s +sqrdmulh v16.4S, v14.4S, v11.s[2] +sub v21.4s, v25.4s, v3.4s +mul v14.4S, v14.4S,v17.s[2] +add v25.4s, v25.4s, v3.4s +sqrdmulh v3.4S, v13.4S, 
v11.s[2] +sub v2.4s, v24.4s, v30.4s +mul v13.4S, v13.4S,v17.s[2] +add v24.4s, v24.4s, v30.4s +ldr q30, [x17, #+96] +ldr q9, [x17, #+112] +mla v10.4S, v23.4S, v31.s[0] +sub v23.4s, v15.4s, v1.4s +sqrdmulh v8.4S, v0.4S, v11.s[1] +add v15.4s, v15.4s, v1.4s +mla v18.4S, v19.4S, v31.s[0] +nop +sqrdmulh v19.4S, v21.4S, v11.s[1] +nop +mla v14.4S, v16.4S, v31.s[0] +nop +sqrdmulh v16.4S, v26.4S, v11.s[0] +nop +mla v13.4S, v3.4S, v31.s[0] +nop +sqrdmulh v3.4S, v25.4S, v11.s[0] +nop +ldr q1, [x17, #+64] +ldr q7, [x17, #+80] +mul v0.4S, v0.4S,v17.s[1] +sub v6.4s, v27.4s, v10.4s +mul v21.4S, v21.4S,v17.s[1] +add v27.4s, v27.4s, v10.4s +mla v0.4S, v8.4S, v31.s[0] +sub v8.4s, v20.4s, v18.4s +mla v21.4S, v19.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +mul v26.4S, v26.4S,v17.s[0] +sub v18.4s, v12.4s, v14.4s +mul v25.4S, v25.4S,v17.s[0] +add v12.4s, v12.4s, v14.4s +mla v26.4S, v16.4S, v31.s[0] +sub v16.4s, v22.4s, v13.4s +mla v25.4S, v3.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v6.4S, v9.s[3] +nop +mul v6.4S, v6.4S,v30.s[3] +nop +sqrdmulh v3.4S, v27.4S, v9.s[2] +sub v14.4s, v2.4s, v0.4s +mul v27.4S, v27.4S,v30.s[2] +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v18.4S, v9.s[1] +sub v19.4s, v23.4s, v21.4s +mul v18.4S, v18.4S,v30.s[1] +add v23.4s, v23.4s, v21.4s +sqrdmulh v21.4S, v12.4S, v9.s[0] +sub v10.4s, v24.4s, v26.4s +mul v12.4S, v12.4S,v30.s[0] +add v24.4s, v24.4s, v26.4s +mla v6.4S, v13.4S, v31.s[0] +sub v13.4s, v15.4s, v25.4s +sqrdmulh v26.4S, v14.4S, v7.s[3] +add v15.4s, v15.4s, v25.4s +mla v27.4S, v3.4S, v31.s[0] +sub v3.4s, v8.4s, v6.4s +sqrdmulh v25.4S, v2.4S, v7.s[2] +add v8.4s, v8.4s, v6.4s +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v20.4s, v27.4s +sqrdmulh v6.4S, v10.4S, v7.s[1] +add v20.4s, v20.4s, v27.4s +mla v12.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v18.4s +sqrdmulh v27.4S, v24.4S, v7.s[0] +add v16.4s, v16.4s, v18.4s +mul v14.4S, v14.4S,v1.s[3] +sub v18.4s, v22.4s, v12.4s +mul v2.4S, v2.4S,v1.s[2] +add v22.4s, v22.4s, v12.4s +mla v14.4S, v26.4S, v31.s[0] 
+str q3, [x0, #992] +mla v2.4S, v25.4S, v31.s[0] +str q8, [x0, #928] +mul v10.4S, v10.4S,v1.s[1] +str q0, [x0, #864] +mul v24.4S, v24.4S,v1.s[0] +str q20, [x0, #800] +mla v10.4S, v6.4S, v31.s[0] +str q21, [x0, #736] +mla v24.4S, v27.4S, v31.s[0] +str q16, [x0, #672] +ldr q16, [x0, #1008] +sqrdmulh v27.4S, v16.4S, v28.s[0] +str q18, [x0, #608] +mul v16.4S, v16.4S,v29.s[0] +str q22, [x0, #544] +ldr q22, [x0, #944] +sqrdmulh v18.4S, v22.4S, v28.s[0] +sub v21.4s, v19.4s, v14.4s +str q21, [x0, #480] +mul v22.4S, v22.4S,v29.s[0] +add v19.4s, v19.4s, v14.4s +ldr q14, [x0, #880] +sqrdmulh v21.4S, v14.4S, v28.s[0] +sub v6.4s, v23.4s, v2.4s +str q19, [x0, #416] +mul v14.4S, v14.4S,v29.s[0] +add v23.4s, v23.4s, v2.4s +ldr q2, [x0, #816] +sqrdmulh v19.4S, v2.4S, v28.s[0] +sub v20.4s, v13.4s, v10.4s +str q6, [x0, #352] +mul v2.4S, v2.4S,v29.s[0] +add v13.4s, v13.4s, v10.4s +ldr q10, [x0, #752] +mla v16.4S, v27.4S, v31.s[0] +sub v27.4s, v15.4s, v24.4s +str q23, [x0, #288] +sqrdmulh v23.4S, v10.4S, v28.s[0] +add v15.4s, v15.4s, v24.4s +ldr q24, [x0, #688] +mla v22.4S, v18.4S, v31.s[0] +str q20, [x0, #224] +sqrdmulh v20.4S, v24.4S, v28.s[0] +nop +ldr q18, [x0, #624] +mla v14.4S, v21.4S, v31.s[0] +str q13, [x0, #160] +sqrdmulh v13.4S, v18.4S, v28.s[0] +nop +ldr q21, [x0, #560] +mla v2.4S, v19.4S, v31.s[0] +str q27, [x0, #96] +sqrdmulh v27.4S, v21.4S, v28.s[0] +nop +ldr q19, [x0, #496] +ldr q6, [x0, #432] +mul v10.4S, v10.4S,v29.s[0] +sub v0.4s, v19.4s, v16.4s +str q15, [x0, #32] +mul v24.4S, v24.4S,v29.s[0] +add v19.4s, v19.4s, v16.4s +ldr q16, [x0, #368] +ldr q15, [x0, #304] +mla v10.4S, v23.4S, v31.s[0] +sub v23.4s, v6.4s, v22.4s +mla v24.4S, v20.4S, v31.s[0] +add v6.4s, v6.4s, v22.4s +ldr q22, [x0, #240] +ldr q20, [x0, #176] +mul v18.4S, v18.4S,v29.s[0] +sub v8.4s, v16.4s, v14.4s +mul v21.4S, v21.4S,v29.s[0] +add v16.4s, v16.4s, v14.4s +ldr q14, [x0, #112] +ldr q25, [x0, #48] +mla v18.4S, v13.4S, v31.s[0] +sub v13.4s, v15.4s, v2.4s +mla v21.4S, v27.4S, v31.s[0] +add v15.4s, 
v15.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v28.s[2] +nop +mul v0.4S, v0.4S,v29.s[2] +nop +sqrdmulh v27.4S, v23.4S, v28.s[2] +sub v3.4s, v22.4s, v10.4s +mul v23.4S, v23.4S,v29.s[2] +add v22.4s, v22.4s, v10.4s +sqrdmulh v10.4S, v8.4S, v28.s[2] +sub v26.4s, v20.4s, v24.4s +mul v8.4S, v8.4S,v29.s[2] +add v20.4s, v20.4s, v24.4s +sqrdmulh v24.4S, v13.4S, v28.s[2] +sub v12.4s, v14.4s, v18.4s +mul v13.4S, v13.4S,v29.s[2] +add v14.4s, v14.4s, v18.4s +mla v0.4S, v2.4S, v31.s[0] +sub v2.4s, v25.4s, v21.4s +sqrdmulh v18.4S, v19.4S, v28.s[1] +add v25.4s, v25.4s, v21.4s +mla v23.4S, v27.4S, v31.s[0] +nop +sqrdmulh v27.4S, v6.4S, v28.s[1] +nop +mla v8.4S, v10.4S, v31.s[0] +nop +sqrdmulh v10.4S, v16.4S, v28.s[1] +nop +mla v13.4S, v24.4S, v31.s[0] +nop +sqrdmulh v24.4S, v15.4S, v28.s[1] +nop +mul v19.4S, v19.4S,v29.s[1] +sub v21.4s, v3.4s, v0.4s +mul v6.4S, v6.4S,v29.s[1] +add v3.4s, v3.4s, v0.4s +mla v19.4S, v18.4S, v31.s[0] +sub v18.4s, v26.4s, v23.4s +mla v6.4S, v27.4S, v31.s[0] +add v26.4s, v26.4s, v23.4s +mul v16.4S, v16.4S,v29.s[1] +sub v23.4s, v12.4s, v8.4s +mul v15.4S, v15.4S,v29.s[1] +add v12.4s, v12.4s, v8.4s +mla v16.4S, v10.4S, v31.s[0] +sub v10.4s, v2.4s, v13.4s +mla v15.4S, v24.4S, v31.s[0] +add v2.4s, v2.4s, v13.4s +sqrdmulh v13.4S, v21.4S, v11.s[3] +nop +mul v21.4S, v21.4S,v17.s[3] +nop +sqrdmulh v24.4S, v18.4S, v11.s[3] +sub v8.4s, v22.4s, v19.4s +mul v18.4S, v18.4S,v17.s[3] +add v22.4s, v22.4s, v19.4s +sqrdmulh v19.4S, v3.4S, v11.s[2] +sub v27.4s, v20.4s, v6.4s +mul v3.4S, v3.4S,v17.s[2] +add v20.4s, v20.4s, v6.4s +sqrdmulh v6.4S, v26.4S, v11.s[2] +sub v0.4s, v14.4s, v16.4s +mul v26.4S, v26.4S,v17.s[2] +add v14.4s, v14.4s, v16.4s +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v25.4s, v15.4s +sqrdmulh v16.4S, v8.4S, v11.s[1] +add v25.4s, v25.4s, v15.4s +mla v18.4S, v24.4S, v31.s[0] +nop +sqrdmulh v24.4S, v27.4S, v11.s[1] +nop +mla v3.4S, v19.4S, v31.s[0] +nop +sqrdmulh v19.4S, v22.4S, v11.s[0] +nop +mla v26.4S, v6.4S, v31.s[0] +nop +sqrdmulh v6.4S, v20.4S, v11.s[0] +nop 
+mul v8.4S, v8.4S,v17.s[1] +sub v15.4s, v23.4s, v21.4s +mul v27.4S, v27.4S,v17.s[1] +add v23.4s, v23.4s, v21.4s +mla v8.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v18.4s +mla v27.4S, v24.4S, v31.s[0] +add v10.4s, v10.4s, v18.4s +mul v22.4S, v22.4S,v17.s[0] +sub v18.4s, v12.4s, v3.4s +mul v20.4S, v20.4S,v17.s[0] +add v12.4s, v12.4s, v3.4s +mla v22.4S, v19.4S, v31.s[0] +sub v19.4s, v2.4s, v26.4s +mla v20.4S, v6.4S, v31.s[0] +add v2.4s, v2.4s, v26.4s +sqrdmulh v26.4S, v15.4S, v9.s[3] +nop +mul v15.4S, v15.4S,v30.s[3] +nop +sqrdmulh v6.4S, v23.4S, v9.s[2] +sub v3.4s, v0.4s, v8.4s +mul v23.4S, v23.4S,v30.s[2] +add v0.4s, v0.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v9.s[1] +sub v24.4s, v13.4s, v27.4s +mul v18.4S, v18.4S,v30.s[1] +add v13.4s, v13.4s, v27.4s +sqrdmulh v27.4S, v12.4S, v9.s[0] +sub v21.4s, v14.4s, v22.4s +mul v12.4S, v12.4S,v30.s[0] +add v14.4s, v14.4s, v22.4s +mla v15.4S, v26.4S, v31.s[0] +sub v26.4s, v25.4s, v20.4s +sqrdmulh v22.4S, v3.4S, v7.s[3] +add v25.4s, v25.4s, v20.4s +mla v23.4S, v6.4S, v31.s[0] +sub v6.4s, v16.4s, v15.4s +sqrdmulh v20.4S, v0.4S, v7.s[2] +add v16.4s, v16.4s, v15.4s +mla v18.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v23.4s +sqrdmulh v15.4S, v21.4S, v7.s[1] +add v10.4s, v10.4s, v23.4s +mla v12.4S, v27.4S, v31.s[0] +sub v27.4s, v19.4s, v18.4s +sqrdmulh v23.4S, v14.4S, v7.s[0] +add v19.4s, v19.4s, v18.4s +mul v3.4S, v3.4S,v1.s[3] +sub v18.4s, v2.4s, v12.4s +mul v0.4S, v0.4S,v1.s[2] +add v2.4s, v2.4s, v12.4s +mla v3.4S, v22.4S, v31.s[0] +str q6, [x0, #1008] +mla v0.4S, v20.4S, v31.s[0] +str q16, [x0, #944] +mul v21.4S, v21.4S,v1.s[1] +str q8, [x0, #880] +mul v14.4S, v14.4S,v1.s[0] +str q10, [x0, #816] +mla v21.4S, v15.4S, v31.s[0] +str q27, [x0, #752] +mla v14.4S, v23.4S, v31.s[0] +str q19, [x0, #688] +ldr q19, [x0, #960] +sqrdmulh v23.4S, v19.4S, v28.s[0] +str q18, [x0, #624] +mul v19.4S, v19.4S,v29.s[0] +str q2, [x0, #560] +ldr q2, [x0, #896] +sqrdmulh v18.4S, v2.4S, v28.s[0] +sub v27.4s, v24.4s, v3.4s +str q27, [x0, #496] +mul v2.4S, 
v2.4S,v29.s[0] +add v24.4s, v24.4s, v3.4s +ldr q3, [x0, #832] +sqrdmulh v27.4S, v3.4S, v28.s[0] +sub v15.4s, v13.4s, v0.4s +str q24, [x0, #432] +mul v3.4S, v3.4S,v29.s[0] +add v13.4s, v13.4s, v0.4s +ldr q0, [x0, #768] +sqrdmulh v24.4S, v0.4S, v28.s[0] +sub v10.4s, v26.4s, v21.4s +str q15, [x0, #368] +mul v0.4S, v0.4S,v29.s[0] +add v26.4s, v26.4s, v21.4s +ldr q21, [x0, #704] +mla v19.4S, v23.4S, v31.s[0] +sub v23.4s, v25.4s, v14.4s +str q13, [x0, #304] +sqrdmulh v13.4S, v21.4S, v28.s[0] +add v25.4s, v25.4s, v14.4s +ldr q14, [x0, #640] +mla v2.4S, v18.4S, v31.s[0] +str q10, [x0, #240] +sqrdmulh v10.4S, v14.4S, v28.s[0] +nop +ldr q18, [x0, #576] +mla v3.4S, v27.4S, v31.s[0] +str q26, [x0, #176] +sqrdmulh v26.4S, v18.4S, v28.s[0] +nop +ldr q27, [x0, #512] +mla v0.4S, v24.4S, v31.s[0] +str q23, [x0, #112] +sqrdmulh v23.4S, v27.4S, v28.s[0] +nop +ldr q24, [x0, #448] +ldr q15, [x0, #384] +mul v21.4S, v21.4S,v29.s[0] +sub v8.4s, v24.4s, v19.4s +str q25, [x0, #48] +mul v14.4S, v14.4S,v29.s[0] +add v24.4s, v24.4s, v19.4s +ldr q19, [x0, #320] +ldr q25, [x0, #256] +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v15.4s, v2.4s +mla v14.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v2.4s +ldr q2, [x0, #192] +ldr q10, [x0, #128] +mul v18.4S, v18.4S,v29.s[0] +sub v16.4s, v19.4s, v3.4s +mul v27.4S, v27.4S,v29.s[0] +add v19.4s, v19.4s, v3.4s +ldr q3, [x0, #64] +ldr q20, [x0, #0] +mla v18.4S, v26.4S, v31.s[0] +sub v26.4s, v25.4s, v0.4s +mla v27.4S, v23.4S, v31.s[0] +add v25.4s, v25.4s, v0.4s +sqrdmulh v0.4S, v8.4S, v28.s[2] +nop +mul v8.4S, v8.4S,v29.s[2] +nop +sqrdmulh v23.4S, v13.4S, v28.s[2] +sub v6.4s, v2.4s, v21.4s +mul v13.4S, v13.4S,v29.s[2] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v16.4S, v28.s[2] +sub v22.4s, v10.4s, v14.4s +mul v16.4S, v16.4S,v29.s[2] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v26.4S, v28.s[2] +sub v12.4s, v3.4s, v18.4s +mul v26.4S, v26.4S,v29.s[2] +add v3.4s, v3.4s, v18.4s +mla v8.4S, v0.4S, v31.s[0] +sub v0.4s, v20.4s, v27.4s +sqrdmulh v18.4S, v24.4S, 
v28.s[1] +add v20.4s, v20.4s, v27.4s +mla v13.4S, v23.4S, v31.s[0] +nop +sqrdmulh v23.4S, v15.4S, v28.s[1] +nop +mla v16.4S, v21.4S, v31.s[0] +nop +sqrdmulh v21.4S, v19.4S, v28.s[1] +nop +mla v26.4S, v14.4S, v31.s[0] +nop +sqrdmulh v14.4S, v25.4S, v28.s[1] +nop +mul v24.4S, v24.4S,v29.s[1] +sub v27.4s, v6.4s, v8.4s +mul v15.4S, v15.4S,v29.s[1] +add v6.4s, v6.4s, v8.4s +mla v24.4S, v18.4S, v31.s[0] +sub v18.4s, v22.4s, v13.4s +mla v15.4S, v23.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +mul v19.4S, v19.4S,v29.s[1] +sub v13.4s, v12.4s, v16.4s +mul v25.4S, v25.4S,v29.s[1] +add v12.4s, v12.4s, v16.4s +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v26.4s +mla v25.4S, v14.4S, v31.s[0] +add v0.4s, v0.4s, v26.4s +sqrdmulh v26.4S, v27.4S, v11.s[3] +nop +mul v27.4S, v27.4S,v17.s[3] +nop +sqrdmulh v14.4S, v18.4S, v11.s[3] +sub v16.4s, v2.4s, v24.4s +mul v18.4S, v18.4S,v17.s[3] +add v2.4s, v2.4s, v24.4s +sqrdmulh v24.4S, v6.4S, v11.s[2] +sub v23.4s, v10.4s, v15.4s +mul v6.4S, v6.4S,v17.s[2] +add v10.4s, v10.4s, v15.4s +sqrdmulh v15.4S, v22.4S, v11.s[2] +sub v8.4s, v3.4s, v19.4s +mul v22.4S, v22.4S,v17.s[2] +add v3.4s, v3.4s, v19.4s +mla v27.4S, v26.4S, v31.s[0] +sub v26.4s, v20.4s, v25.4s +sqrdmulh v19.4S, v16.4S, v11.s[1] +add v20.4s, v20.4s, v25.4s +mla v18.4S, v14.4S, v31.s[0] +nop +sqrdmulh v14.4S, v23.4S, v11.s[1] +nop +mla v6.4S, v24.4S, v31.s[0] +nop +sqrdmulh v24.4S, v2.4S, v11.s[0] +nop +mla v22.4S, v15.4S, v31.s[0] +nop +sqrdmulh v15.4S, v10.4S, v11.s[0] +nop +mul v16.4S, v16.4S,v17.s[1] +sub v25.4s, v13.4s, v27.4s +mul v23.4S, v23.4S,v17.s[1] +add v13.4s, v13.4s, v27.4s +mla v16.4S, v19.4S, v31.s[0] +sub v19.4s, v21.4s, v18.4s +mla v23.4S, v14.4S, v31.s[0] +add v21.4s, v21.4s, v18.4s +mul v2.4S, v2.4S,v17.s[0] +sub v18.4s, v12.4s, v6.4s +mul v10.4S, v10.4S,v17.s[0] +add v12.4s, v12.4s, v6.4s +mla v2.4S, v24.4S, v31.s[0] +sub v24.4s, v0.4s, v22.4s +mla v10.4S, v15.4S, v31.s[0] +add v0.4s, v0.4s, v22.4s +sqrdmulh v22.4S, v25.4S, v9.s[3] +nop +mul v25.4S, 
v25.4S,v30.s[3] +nop +sqrdmulh v15.4S, v13.4S, v9.s[2] +sub v6.4s, v8.4s, v16.4s +mul v13.4S, v13.4S,v30.s[2] +add v8.4s, v8.4s, v16.4s +sqrdmulh v16.4S, v18.4S, v9.s[1] +sub v14.4s, v26.4s, v23.4s +mul v18.4S, v18.4S,v30.s[1] +add v26.4s, v26.4s, v23.4s +sqrdmulh v23.4S, v12.4S, v9.s[0] +sub v27.4s, v3.4s, v2.4s +mul v12.4S, v12.4S,v30.s[0] +add v3.4s, v3.4s, v2.4s +mla v25.4S, v22.4S, v31.s[0] +sub v22.4s, v20.4s, v10.4s +sqrdmulh v2.4S, v6.4S, v7.s[3] +add v20.4s, v20.4s, v10.4s +mla v13.4S, v15.4S, v31.s[0] +sub v15.4s, v19.4s, v25.4s +sqrdmulh v10.4S, v8.4S, v7.s[2] +add v19.4s, v19.4s, v25.4s +mla v18.4S, v16.4S, v31.s[0] +sub v16.4s, v21.4s, v13.4s +sqrdmulh v25.4S, v27.4S, v7.s[1] +add v21.4s, v21.4s, v13.4s +mla v12.4S, v23.4S, v31.s[0] +sub v23.4s, v24.4s, v18.4s +sqrdmulh v13.4S, v3.4S, v7.s[0] +add v24.4s, v24.4s, v18.4s +mul v6.4S, v6.4S,v1.s[3] +sub v18.4s, v0.4s, v12.4s +mul v8.4S, v8.4S,v1.s[2] +add v0.4s, v0.4s, v12.4s +mla v6.4S, v2.4S, v31.s[0] +str q15, [x0, #960] +mla v8.4S, v10.4S, v31.s[0] +str q19, [x0, #896] +mul v27.4S, v27.4S,v1.s[1] +str q16, [x0, #832] +mul v3.4S, v3.4S,v1.s[0] +str q21, [x0, #768] +mla v27.4S, v25.4S, v31.s[0] +str q23, [x0, #704] +mla v3.4S, v13.4S, v31.s[0] +str q24, [x0, #640] +ldr q24, [x0, #976] +sqrdmulh v13.4S, v24.4S, v28.s[0] +str q18, [x0, #576] +mul v24.4S, v24.4S,v29.s[0] +str q0, [x0, #512] +ldr q0, [x0, #912] +sqrdmulh v18.4S, v0.4S, v28.s[0] +sub v23.4s, v14.4s, v6.4s +str q23, [x0, #448] +mul v0.4S, v0.4S,v29.s[0] +add v14.4s, v14.4s, v6.4s +ldr q6, [x0, #848] +sqrdmulh v23.4S, v6.4S, v28.s[0] +sub v25.4s, v26.4s, v8.4s +str q14, [x0, #384] +mul v6.4S, v6.4S,v29.s[0] +add v26.4s, v26.4s, v8.4s +ldr q8, [x0, #784] +sqrdmulh v14.4S, v8.4S, v28.s[0] +sub v21.4s, v22.4s, v27.4s +str q25, [x0, #320] +mul v8.4S, v8.4S,v29.s[0] +add v22.4s, v22.4s, v27.4s +ldr q27, [x0, #720] +mla v24.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v3.4s +str q26, [x0, #256] +sqrdmulh v26.4S, v27.4S, v28.s[0] +add v20.4s, v20.4s, 
v3.4s +ldr q3, [x0, #656] +mla v0.4S, v18.4S, v31.s[0] +str q21, [x0, #192] +sqrdmulh v21.4S, v3.4S, v28.s[0] +nop +ldr q18, [x0, #592] +mla v6.4S, v23.4S, v31.s[0] +str q22, [x0, #128] +sqrdmulh v22.4S, v18.4S, v28.s[0] +nop +ldr q23, [x0, #528] +mla v8.4S, v14.4S, v31.s[0] +str q13, [x0, #64] +sqrdmulh v13.4S, v23.4S, v28.s[0] +nop +ldr q14, [x0, #464] +ldr q25, [x0, #400] +mul v27.4S, v27.4S,v29.s[0] +sub v16.4s, v14.4s, v24.4s +str q20, [x0, #0] +mul v3.4S, v3.4S,v29.s[0] +add v14.4s, v14.4s, v24.4s +ldr q24, [x0, #336] +ldr q20, [x0, #272] +mla v27.4S, v26.4S, v31.s[0] +sub v26.4s, v25.4s, v0.4s +mla v3.4S, v21.4S, v31.s[0] +add v25.4s, v25.4s, v0.4s +ldr q0, [x0, #208] +ldr q21, [x0, #144] +mul v18.4S, v18.4S,v29.s[0] +sub v19.4s, v24.4s, v6.4s +mul v23.4S, v23.4S,v29.s[0] +add v24.4s, v24.4s, v6.4s +ldr q6, [x0, #80] +ldr q10, [x0, #16] +mla v18.4S, v22.4S, v31.s[0] +sub v22.4s, v20.4s, v8.4s +mla v23.4S, v13.4S, v31.s[0] +add v20.4s, v20.4s, v8.4s +sqrdmulh v8.4S, v16.4S, v28.s[2] +nop +mul v16.4S, v16.4S,v29.s[2] +nop +sqrdmulh v13.4S, v26.4S, v28.s[2] +sub v15.4s, v0.4s, v27.4s +mul v26.4S, v26.4S,v29.s[2] +add v0.4s, v0.4s, v27.4s +sqrdmulh v27.4S, v19.4S, v28.s[2] +sub v2.4s, v21.4s, v3.4s +mul v19.4S, v19.4S,v29.s[2] +add v21.4s, v21.4s, v3.4s +sqrdmulh v3.4S, v22.4S, v28.s[2] +sub v12.4s, v6.4s, v18.4s +mul v22.4S, v22.4S,v29.s[2] +add v6.4s, v6.4s, v18.4s +mla v16.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v23.4s +sqrdmulh v18.4S, v14.4S, v28.s[1] +add v10.4s, v10.4s, v23.4s +mla v26.4S, v13.4S, v31.s[0] +nop +sqrdmulh v13.4S, v25.4S, v28.s[1] +nop +mla v19.4S, v27.4S, v31.s[0] +nop +sqrdmulh v27.4S, v24.4S, v28.s[1] +nop +mla v22.4S, v3.4S, v31.s[0] +nop +sqrdmulh v3.4S, v20.4S, v28.s[1] +nop +mul v14.4S, v14.4S,v29.s[1] +sub v23.4s, v15.4s, v16.4s +mul v25.4S, v25.4S,v29.s[1] +add v15.4s, v15.4s, v16.4s +mla v14.4S, v18.4S, v31.s[0] +sub v18.4s, v2.4s, v26.4s +mla v25.4S, v13.4S, v31.s[0] +add v2.4s, v2.4s, v26.4s +mul v24.4S, v24.4S,v29.s[1] +sub 
v26.4s, v12.4s, v19.4s +mul v20.4S, v20.4S,v29.s[1] +add v12.4s, v12.4s, v19.4s +mla v24.4S, v27.4S, v31.s[0] +sub v27.4s, v8.4s, v22.4s +mla v20.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v22.4s +sqrdmulh v28.4S, v23.4S, v11.s[3] +nop +mul v23.4S, v23.4S,v17.s[3] +nop +sqrdmulh v29.4S, v18.4S, v11.s[3] +sub v22.4s, v0.4s, v14.4s +mul v18.4S, v18.4S,v17.s[3] +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v11.s[2] +sub v3.4s, v21.4s, v25.4s +mul v15.4S, v15.4S,v17.s[2] +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v2.4S, v11.s[2] +sub v19.4s, v6.4s, v24.4s +mul v2.4S, v2.4S,v17.s[2] +add v6.4s, v6.4s, v24.4s +mla v23.4S, v28.4S, v31.s[0] +sub v28.4s, v10.4s, v20.4s +sqrdmulh v24.4S, v22.4S, v11.s[1] +add v10.4s, v10.4s, v20.4s +mla v18.4S, v29.4S, v31.s[0] +nop +sqrdmulh v29.4S, v3.4S, v11.s[1] +nop +mla v15.4S, v14.4S, v31.s[0] +nop +sqrdmulh v14.4S, v0.4S, v11.s[0] +nop +mla v2.4S, v25.4S, v31.s[0] +nop +sqrdmulh v25.4S, v21.4S, v11.s[0] +nop +mul v22.4S, v22.4S,v17.s[1] +sub v20.4s, v26.4s, v23.4s +mul v3.4S, v3.4S,v17.s[1] +add v26.4s, v26.4s, v23.4s +mla v22.4S, v24.4S, v31.s[0] +sub v24.4s, v27.4s, v18.4s +mla v3.4S, v29.4S, v31.s[0] +add v27.4s, v27.4s, v18.4s +mul v0.4S, v0.4S,v17.s[0] +sub v18.4s, v12.4s, v15.4s +mul v21.4S, v21.4S,v17.s[0] +add v12.4s, v12.4s, v15.4s +mla v0.4S, v14.4S, v31.s[0] +sub v14.4s, v8.4s, v2.4s +mla v21.4S, v25.4S, v31.s[0] +add v8.4s, v8.4s, v2.4s +sqrdmulh v11.4S, v20.4S, v9.s[3] +nop +mul v20.4S, v20.4S,v30.s[3] +nop +sqrdmulh v17.4S, v26.4S, v9.s[2] +sub v2.4s, v19.4s, v22.4s +mul v26.4S, v26.4S,v30.s[2] +add v19.4s, v19.4s, v22.4s +sqrdmulh v22.4S, v18.4S, v9.s[1] +sub v25.4s, v28.4s, v3.4s +mul v18.4S, v18.4S,v30.s[1] +add v28.4s, v28.4s, v3.4s +sqrdmulh v3.4S, v12.4S, v9.s[0] +sub v15.4s, v6.4s, v0.4s +mul v12.4S, v12.4S,v30.s[0] +add v6.4s, v6.4s, v0.4s +mla v20.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v21.4s +sqrdmulh v9.4S, v2.4S, v7.s[3] +add v10.4s, v10.4s, v21.4s +mla v26.4S, v17.4S, v31.s[0] +sub v17.4s, v24.4s, 
v20.4s +sqrdmulh v21.4S, v19.4S, v7.s[2] +add v24.4s, v24.4s, v20.4s +mla v18.4S, v22.4S, v31.s[0] +sub v22.4s, v27.4s, v26.4s +sqrdmulh v20.4S, v15.4S, v7.s[1] +add v27.4s, v27.4s, v26.4s +mla v12.4S, v3.4S, v31.s[0] +sub v3.4s, v14.4s, v18.4s +sqrdmulh v26.4S, v6.4S, v7.s[0] +add v14.4s, v14.4s, v18.4s +mul v2.4S, v2.4S,v1.s[3] +sub v18.4s, v8.4s, v12.4s +mul v19.4S, v19.4S,v1.s[2] +add v8.4s, v8.4s, v12.4s +mla v2.4S, v9.4S, v31.s[0] +str q17, [x0, #976] +mla v19.4S, v21.4S, v31.s[0] +str q24, [x0, #912] +mul v15.4S, v15.4S,v1.s[1] +str q22, [x0, #848] +mul v6.4S, v6.4S,v1.s[0] +str q27, [x0, #784] +mla v15.4S, v20.4S, v31.s[0] +str q3, [x0, #720] +mla v6.4S, v26.4S, v31.s[0] +str q14, [x0, #656] +str q18, [x0, #592] +str q8, [x0, #528] +sub v8.4s, v25.4s, v2.4s +str q8, [x0, #464] +add v25.4s, v25.4s, v2.4s +sub v2.4s, v28.4s, v19.4s +str q25, [x0, #400] +add v28.4s, v28.4s, v19.4s +sub v19.4s, v11.4s, v15.4s +str q2, [x0, #336] +add v11.4s, v11.4s, v15.4s +sub v15.4s, v10.4s, v6.4s +str q28, [x0, #272] +add v10.4s, v10.4s, v6.4s +str q19, [x0, #208] +str q11, [x0, #144] +str q15, [x0, #80] +str q10, [x0, #16] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q16, [x17, #+160] +ldr q13, [x17, #+176] +ldr q23, [x17, #+192] +ldr q29, [x17, #+208] +ldr q0, [x17, #+224] +ldr q30, [x17, #+240] +ldr q12, [x0, #32] +ldr q9, [x0, #48] +ldr q17, [x0, #0] +ldr q21, [x0, #16] +sqrdmulh v24.4S, v12.4S, v5.s[0] +mul v12.4S, v12.4S,v4.s[0] +mla v12.4S, v24.4S, v31.s[0] +sub v24.4s, v17.4s, v12.4s +add v17.4s, v17.4s, v12.4s +sqrdmulh v12.4S, v9.4S, v5.s[0] +mul v9.4S, v9.4S,v4.s[0] +mla v9.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v9.4s +add v21.4s, v21.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v12.4S, v5.s[2] +mul v12.4S, v12.4S,v4.s[2] +mla v12.4S, v21.4S, v31.s[0] +sub v21.4s, v24.4s, v12.4s +add v24.4s, v24.4s, v12.4s +trn1 v12.4S, v17.4S, 
v9.4S +trn2 v22.4S, v17.4S, v9.4S +trn1 v27.4S, v24.4S, v21.4S +trn2 v20.4S, v24.4S, v21.4S +trn2 v24.2D, v12.2D, v27.2D +trn2 v21.2D, v22.2D, v20.2D +trn1 v17.2D, v12.2D, v27.2D +trn1 v9.2D, v22.2D, v20.2D +sqrdmulh v20.4S, v24.4S, v13.4S +mul v24.4S, v24.4S,v16.4S +mla v24.4S, v20.4S, v31.s[0] +sub v20.4s, v17.4s, v24.4s +add v17.4s, v17.4s, v24.4s +sqrdmulh v24.4S, v21.4S, v13.4S +mul v21.4S, v21.4S,v16.4S +mla v21.4S, v24.4S, v31.s[0] +sub v24.4s, v9.4s, v21.4s +add v9.4s, v9.4s, v21.4s +sqrdmulh v21.4S, v9.4S, v29.4S +mul v9.4S, v9.4S,v23.4S +mla v9.4S, v21.4S, v31.s[0] +sub v21.4s, v17.4s, v9.4s +add v17.4s, v17.4s, v9.4s +sqrdmulh v9.4S, v24.4S, v30.4S +mul v24.4S, v24.4S,v0.4S +mla v24.4S, v9.4S, v31.s[0] +sub v9.4s, v20.4s, v24.4s +add v20.4s, v20.4s, v24.4s +str q17, [x0, #0] +str q21, [x0, #16] +str q20, [x0, #32] +str q9, [x0, #48] +ldr q9, [x17, #+256] +ldr q20, [x17, #+272] +ldr q21, [x17, #+288] +ldr q17, [x17, #+304] +ldr q24, [x17, #+320] +ldr q22, [x17, #+336] +ldr q27, [x17, #+352] +ldr q12, [x17, #+368] +ldr q30, [x0, #96] +ldr q0, [x0, #112] +ldr q29, [x0, #64] +ldr q23, [x0, #80] +sqrdmulh v13.4S, v30.4S, v20.s[0] +mul v30.4S, v30.4S,v9.s[0] +mla v30.4S, v13.4S, v31.s[0] +sub v13.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +sqrdmulh v30.4S, v0.4S, v20.s[0] +mul v0.4S, v0.4S,v9.s[0] +mla v0.4S, v30.4S, v31.s[0] +sub v30.4s, v23.4s, v0.4s +add v23.4s, v23.4s, v0.4s +sqrdmulh v0.4S, v23.4S, v20.s[1] +mul v23.4S, v23.4S,v9.s[1] +mla v23.4S, v0.4S, v31.s[0] +sub v0.4s, v29.4s, v23.4s +add v29.4s, v29.4s, v23.4s +sqrdmulh v23.4S, v30.4S, v20.s[2] +mul v30.4S, v30.4S,v9.s[2] +mla v30.4S, v23.4S, v31.s[0] +sub v23.4s, v13.4s, v30.4s +add v13.4s, v13.4s, v30.4s +trn1 v30.4S, v29.4S, v0.4S +trn2 v16.4S, v29.4S, v0.4S +trn1 v5.4S, v13.4S, v23.4S +trn2 v4.4S, v13.4S, v23.4S +trn2 v13.2D, v30.2D, v5.2D +trn2 v23.2D, v16.2D, v4.2D +trn1 v29.2D, v30.2D, v5.2D +trn1 v0.2D, v16.2D, v4.2D +sqrdmulh v4.4S, v13.4S, v17.4S +mul v13.4S, v13.4S,v21.4S +mla 
v13.4S, v4.4S, v31.s[0] +sub v4.4s, v29.4s, v13.4s +add v29.4s, v29.4s, v13.4s +sqrdmulh v13.4S, v23.4S, v17.4S +mul v23.4S, v23.4S,v21.4S +mla v23.4S, v13.4S, v31.s[0] +sub v13.4s, v0.4s, v23.4s +add v0.4s, v0.4s, v23.4s +sqrdmulh v23.4S, v0.4S, v22.4S +mul v0.4S, v0.4S,v24.4S +mla v0.4S, v23.4S, v31.s[0] +sub v23.4s, v29.4s, v0.4s +add v29.4s, v29.4s, v0.4s +sqrdmulh v0.4S, v13.4S, v12.4S +mul v13.4S, v13.4S,v27.4S +mla v13.4S, v0.4S, v31.s[0] +sub v0.4s, v4.4s, v13.4s +add v4.4s, v4.4s, v13.4s +str q29, [x0, #64] +str q23, [x0, #80] +str q4, [x0, #96] +str q0, [x0, #112] +ldr q0, [x17, #+384] +ldr q4, [x17, #+400] +ldr q23, [x17, #+416] +ldr q29, [x17, #+432] +ldr q13, [x17, #+448] +ldr q16, [x17, #+464] +ldr q5, [x17, #+480] +ldr q30, [x17, #+496] +ldr q12, [x0, #160] +ldr q27, [x0, #176] +ldr q22, [x0, #128] +ldr q24, [x0, #144] +sqrdmulh v17.4S, v12.4S, v4.s[0] +mul v12.4S, v12.4S,v0.s[0] +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v27.4S, v4.s[0] +mul v27.4S, v27.4S,v0.s[0] +mla v27.4S, v12.4S, v31.s[0] +sub v12.4s, v24.4s, v27.4s +add v24.4s, v24.4s, v27.4s +sqrdmulh v27.4S, v24.4S, v4.s[1] +mul v24.4S, v24.4S,v0.s[1] +mla v24.4S, v27.4S, v31.s[0] +sub v27.4s, v22.4s, v24.4s +add v22.4s, v22.4s, v24.4s +sqrdmulh v24.4S, v12.4S, v4.s[2] +mul v12.4S, v12.4S,v0.s[2] +mla v12.4S, v24.4S, v31.s[0] +sub v24.4s, v17.4s, v12.4s +add v17.4s, v17.4s, v12.4s +trn1 v12.4S, v22.4S, v27.4S +trn2 v21.4S, v22.4S, v27.4S +trn1 v20.4S, v17.4S, v24.4S +trn2 v9.4S, v17.4S, v24.4S +trn2 v17.2D, v12.2D, v20.2D +trn2 v24.2D, v21.2D, v9.2D +trn1 v22.2D, v12.2D, v20.2D +trn1 v27.2D, v21.2D, v9.2D +sqrdmulh v9.4S, v17.4S, v29.4S +mul v17.4S, v17.4S,v23.4S +mla v17.4S, v9.4S, v31.s[0] +sub v9.4s, v22.4s, v17.4s +add v22.4s, v22.4s, v17.4s +sqrdmulh v17.4S, v24.4S, v29.4S +mul v24.4S, v24.4S,v23.4S +mla v24.4S, v17.4S, v31.s[0] +sub v17.4s, v27.4s, v24.4s +add v27.4s, v27.4s, v24.4s +sqrdmulh v24.4S, v27.4S, v16.4S +mul 
v27.4S, v27.4S,v13.4S +mla v27.4S, v24.4S, v31.s[0] +sub v24.4s, v22.4s, v27.4s +add v22.4s, v22.4s, v27.4s +sqrdmulh v27.4S, v17.4S, v30.4S +mul v17.4S, v17.4S,v5.4S +mla v17.4S, v27.4S, v31.s[0] +sub v27.4s, v9.4s, v17.4s +add v9.4s, v9.4s, v17.4s +str q22, [x0, #128] +str q24, [x0, #144] +str q9, [x0, #160] +str q27, [x0, #176] +ldr q27, [x17, #+512] +ldr q9, [x17, #+528] +ldr q24, [x17, #+544] +ldr q22, [x17, #+560] +ldr q17, [x17, #+576] +ldr q21, [x17, #+592] +ldr q20, [x17, #+608] +ldr q12, [x17, #+624] +ldr q30, [x0, #224] +ldr q5, [x0, #240] +ldr q16, [x0, #192] +ldr q13, [x0, #208] +sqrdmulh v29.4S, v30.4S, v9.s[0] +mul v30.4S, v30.4S,v27.s[0] +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v16.4s, v30.4s +add v16.4s, v16.4s, v30.4s +sqrdmulh v30.4S, v5.4S, v9.s[0] +mul v5.4S, v5.4S,v27.s[0] +mla v5.4S, v30.4S, v31.s[0] +sub v30.4s, v13.4s, v5.4s +add v13.4s, v13.4s, v5.4s +sqrdmulh v5.4S, v13.4S, v9.s[1] +mul v13.4S, v13.4S,v27.s[1] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v16.4s, v13.4s +add v16.4s, v16.4s, v13.4s +sqrdmulh v13.4S, v30.4S, v9.s[2] +mul v30.4S, v30.4S,v27.s[2] +mla v30.4S, v13.4S, v31.s[0] +sub v13.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +trn1 v30.4S, v16.4S, v5.4S +trn2 v23.4S, v16.4S, v5.4S +trn1 v4.4S, v29.4S, v13.4S +trn2 v0.4S, v29.4S, v13.4S +trn2 v29.2D, v30.2D, v4.2D +trn2 v13.2D, v23.2D, v0.2D +trn1 v16.2D, v30.2D, v4.2D +trn1 v5.2D, v23.2D, v0.2D +sqrdmulh v0.4S, v29.4S, v22.4S +mul v29.4S, v29.4S,v24.4S +mla v29.4S, v0.4S, v31.s[0] +sub v0.4s, v16.4s, v29.4s +add v16.4s, v16.4s, v29.4s +sqrdmulh v29.4S, v13.4S, v22.4S +mul v13.4S, v13.4S,v24.4S +mla v13.4S, v29.4S, v31.s[0] +sub v29.4s, v5.4s, v13.4s +add v5.4s, v5.4s, v13.4s +sqrdmulh v13.4S, v5.4S, v21.4S +mul v5.4S, v5.4S,v17.4S +mla v5.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v5.4s +add v16.4s, v16.4s, v5.4s +sqrdmulh v5.4S, v29.4S, v12.4S +mul v29.4S, v29.4S,v20.4S +mla v29.4S, v5.4S, v31.s[0] +sub v5.4s, v0.4s, v29.4s +add v0.4s, v0.4s, v29.4s +str q16, [x0, #192] 
+str q13, [x0, #208] +str q0, [x0, #224] +str q5, [x0, #240] +ldr q5, [x17, #+640] +ldr q0, [x17, #+656] +ldr q13, [x17, #+672] +ldr q16, [x17, #+688] +ldr q29, [x17, #+704] +ldr q23, [x17, #+720] +ldr q4, [x17, #+736] +ldr q30, [x17, #+752] +ldr q12, [x0, #288] +ldr q20, [x0, #304] +ldr q21, [x0, #256] +ldr q17, [x0, #272] +sqrdmulh v22.4S, v12.4S, v0.s[0] +mul v12.4S, v12.4S,v5.s[0] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v20.4S, v0.s[0] +mul v20.4S, v20.4S,v5.s[0] +mla v20.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v20.4s +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v0.s[1] +mul v17.4S, v17.4S,v5.s[1] +mla v17.4S, v20.4S, v31.s[0] +sub v20.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v12.4S, v0.s[2] +mul v12.4S, v12.4S,v5.s[2] +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +trn1 v12.4S, v21.4S, v20.4S +trn2 v24.4S, v21.4S, v20.4S +trn1 v9.4S, v22.4S, v17.4S +trn2 v27.4S, v22.4S, v17.4S +trn2 v22.2D, v12.2D, v9.2D +trn2 v17.2D, v24.2D, v27.2D +trn1 v21.2D, v12.2D, v9.2D +trn1 v20.2D, v24.2D, v27.2D +sqrdmulh v27.4S, v22.4S, v16.4S +mul v22.4S, v22.4S,v13.4S +mla v22.4S, v27.4S, v31.s[0] +sub v27.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v16.4S +mul v17.4S, v17.4S,v13.4S +mla v17.4S, v22.4S, v31.s[0] +sub v22.4s, v20.4s, v17.4s +add v20.4s, v20.4s, v17.4s +sqrdmulh v17.4S, v20.4S, v23.4S +mul v20.4S, v20.4S,v29.4S +mla v20.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v22.4S, v30.4S +mul v22.4S, v22.4S,v4.4S +mla v22.4S, v20.4S, v31.s[0] +sub v20.4s, v27.4s, v22.4s +add v27.4s, v27.4s, v22.4s +str q21, [x0, #256] +str q17, [x0, #272] +str q27, [x0, #288] +str q20, [x0, #304] +ldr q20, [x17, #+768] +ldr q27, [x17, #+784] +ldr q17, [x17, #+800] +ldr q21, [x17, #+816] +ldr q22, [x17, #+832] +ldr q24, [x17, #+848] +ldr q9, [x17, #+864] +ldr q12, [x17, #+880] 
+ldr q30, [x0, #352] +ldr q4, [x0, #368] +ldr q23, [x0, #320] +ldr q29, [x0, #336] +sqrdmulh v16.4S, v30.4S, v27.s[0] +mul v30.4S, v30.4S,v20.s[0] +mla v30.4S, v16.4S, v31.s[0] +sub v16.4s, v23.4s, v30.4s +add v23.4s, v23.4s, v30.4s +sqrdmulh v30.4S, v4.4S, v27.s[0] +mul v4.4S, v4.4S,v20.s[0] +mla v4.4S, v30.4S, v31.s[0] +sub v30.4s, v29.4s, v4.4s +add v29.4s, v29.4s, v4.4s +sqrdmulh v4.4S, v29.4S, v27.s[1] +mul v29.4S, v29.4S,v20.s[1] +mla v29.4S, v4.4S, v31.s[0] +sub v4.4s, v23.4s, v29.4s +add v23.4s, v23.4s, v29.4s +sqrdmulh v29.4S, v30.4S, v27.s[2] +mul v30.4S, v30.4S,v20.s[2] +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v16.4s, v30.4s +add v16.4s, v16.4s, v30.4s +trn1 v30.4S, v23.4S, v4.4S +trn2 v13.4S, v23.4S, v4.4S +trn1 v0.4S, v16.4S, v29.4S +trn2 v5.4S, v16.4S, v29.4S +trn2 v16.2D, v30.2D, v0.2D +trn2 v29.2D, v13.2D, v5.2D +trn1 v23.2D, v30.2D, v0.2D +trn1 v4.2D, v13.2D, v5.2D +sqrdmulh v5.4S, v16.4S, v21.4S +mul v16.4S, v16.4S,v17.4S +mla v16.4S, v5.4S, v31.s[0] +sub v5.4s, v23.4s, v16.4s +add v23.4s, v23.4s, v16.4s +sqrdmulh v16.4S, v29.4S, v21.4S +mul v29.4S, v29.4S,v17.4S +mla v29.4S, v16.4S, v31.s[0] +sub v16.4s, v4.4s, v29.4s +add v4.4s, v4.4s, v29.4s +sqrdmulh v29.4S, v4.4S, v24.4S +mul v4.4S, v4.4S,v22.4S +mla v4.4S, v29.4S, v31.s[0] +sub v29.4s, v23.4s, v4.4s +add v23.4s, v23.4s, v4.4s +sqrdmulh v4.4S, v16.4S, v12.4S +mul v16.4S, v16.4S,v9.4S +mla v16.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v16.4s +add v5.4s, v5.4s, v16.4s +str q23, [x0, #320] +str q29, [x0, #336] +str q5, [x0, #352] +str q4, [x0, #368] +ldr q4, [x17, #+896] +ldr q5, [x17, #+912] +ldr q29, [x17, #+928] +ldr q23, [x17, #+944] +ldr q16, [x17, #+960] +ldr q13, [x17, #+976] +ldr q0, [x17, #+992] +ldr q30, [x17, #+1008] +ldr q12, [x0, #416] +ldr q9, [x0, #432] +ldr q24, [x0, #384] +ldr q22, [x0, #400] +sqrdmulh v21.4S, v12.4S, v5.s[0] +mul v12.4S, v12.4S,v4.s[0] +mla v12.4S, v21.4S, v31.s[0] +sub v21.4s, v24.4s, v12.4s +add v24.4s, v24.4s, v12.4s +sqrdmulh v12.4S, v9.4S, v5.s[0] +mul 
v9.4S, v9.4S,v4.s[0] +mla v9.4S, v12.4S, v31.s[0] +sub v12.4s, v22.4s, v9.4s +add v22.4s, v22.4s, v9.4s +sqrdmulh v9.4S, v22.4S, v5.s[1] +mul v22.4S, v22.4S,v4.s[1] +mla v22.4S, v9.4S, v31.s[0] +sub v9.4s, v24.4s, v22.4s +add v24.4s, v24.4s, v22.4s +sqrdmulh v22.4S, v12.4S, v5.s[2] +mul v12.4S, v12.4S,v4.s[2] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +trn1 v12.4S, v24.4S, v9.4S +trn2 v17.4S, v24.4S, v9.4S +trn1 v27.4S, v21.4S, v22.4S +trn2 v20.4S, v21.4S, v22.4S +trn2 v21.2D, v12.2D, v27.2D +trn2 v22.2D, v17.2D, v20.2D +trn1 v24.2D, v12.2D, v27.2D +trn1 v9.2D, v17.2D, v20.2D +sqrdmulh v20.4S, v21.4S, v23.4S +mul v21.4S, v21.4S,v29.4S +mla v21.4S, v20.4S, v31.s[0] +sub v20.4s, v24.4s, v21.4s +add v24.4s, v24.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v23.4S +mul v22.4S, v22.4S,v29.4S +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v9.4s, v22.4s +add v9.4s, v9.4s, v22.4s +sqrdmulh v22.4S, v9.4S, v13.4S +mul v9.4S, v9.4S,v16.4S +mla v9.4S, v22.4S, v31.s[0] +sub v22.4s, v24.4s, v9.4s +add v24.4s, v24.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v30.4S +mul v21.4S, v21.4S,v0.4S +mla v21.4S, v9.4S, v31.s[0] +sub v9.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +str q24, [x0, #384] +str q22, [x0, #400] +str q20, [x0, #416] +str q9, [x0, #432] +ldr q9, [x17, #+1024] +ldr q20, [x17, #+1040] +ldr q22, [x17, #+1056] +ldr q24, [x17, #+1072] +ldr q21, [x17, #+1088] +ldr q17, [x17, #+1104] +ldr q27, [x17, #+1120] +ldr q12, [x17, #+1136] +ldr q30, [x0, #480] +ldr q0, [x0, #496] +ldr q13, [x0, #448] +ldr q16, [x0, #464] +sqrdmulh v23.4S, v30.4S, v20.s[0] +mul v30.4S, v30.4S,v9.s[0] +mla v30.4S, v23.4S, v31.s[0] +sub v23.4s, v13.4s, v30.4s +add v13.4s, v13.4s, v30.4s +sqrdmulh v30.4S, v0.4S, v20.s[0] +mul v0.4S, v0.4S,v9.s[0] +mla v0.4S, v30.4S, v31.s[0] +sub v30.4s, v16.4s, v0.4s +add v16.4s, v16.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v20.s[1] +mul v16.4S, v16.4S,v9.s[1] +mla v16.4S, v0.4S, v31.s[0] +sub v0.4s, v13.4s, v16.4s +add v13.4s, v13.4s, v16.4s 
+sqrdmulh v16.4S, v30.4S, v20.s[2] +mul v30.4S, v30.4S,v9.s[2] +mla v30.4S, v16.4S, v31.s[0] +sub v16.4s, v23.4s, v30.4s +add v23.4s, v23.4s, v30.4s +trn1 v30.4S, v13.4S, v0.4S +trn2 v29.4S, v13.4S, v0.4S +trn1 v5.4S, v23.4S, v16.4S +trn2 v4.4S, v23.4S, v16.4S +trn2 v23.2D, v30.2D, v5.2D +trn2 v16.2D, v29.2D, v4.2D +trn1 v13.2D, v30.2D, v5.2D +trn1 v0.2D, v29.2D, v4.2D +sqrdmulh v4.4S, v23.4S, v24.4S +mul v23.4S, v23.4S,v22.4S +mla v23.4S, v4.4S, v31.s[0] +sub v4.4s, v13.4s, v23.4s +add v13.4s, v13.4s, v23.4s +sqrdmulh v23.4S, v16.4S, v24.4S +mul v16.4S, v16.4S,v22.4S +mla v16.4S, v23.4S, v31.s[0] +sub v23.4s, v0.4s, v16.4s +add v0.4s, v0.4s, v16.4s +sqrdmulh v16.4S, v0.4S, v17.4S +mul v0.4S, v0.4S,v21.4S +mla v0.4S, v16.4S, v31.s[0] +sub v16.4s, v13.4s, v0.4s +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v23.4S, v12.4S +mul v23.4S, v23.4S,v27.4S +mla v23.4S, v0.4S, v31.s[0] +sub v0.4s, v4.4s, v23.4s +add v4.4s, v4.4s, v23.4s +str q13, [x0, #448] +str q16, [x0, #464] +str q4, [x0, #480] +str q0, [x0, #496] +ldr q0, [x17, #+1152] +ldr q4, [x17, #+1168] +ldr q16, [x17, #+1184] +ldr q13, [x17, #+1200] +ldr q23, [x17, #+1216] +ldr q29, [x17, #+1232] +ldr q5, [x17, #+1248] +ldr q30, [x17, #+1264] +ldr q12, [x0, #544] +ldr q27, [x0, #560] +ldr q17, [x0, #512] +ldr q21, [x0, #528] +sqrdmulh v24.4S, v12.4S, v4.s[0] +mul v12.4S, v12.4S,v0.s[0] +mla v12.4S, v24.4S, v31.s[0] +sub v24.4s, v17.4s, v12.4s +add v17.4s, v17.4s, v12.4s +sqrdmulh v12.4S, v27.4S, v4.s[0] +mul v27.4S, v27.4S,v0.s[0] +mla v27.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v27.4s +add v21.4s, v21.4s, v27.4s +sqrdmulh v27.4S, v21.4S, v4.s[1] +mul v21.4S, v21.4S,v0.s[1] +mla v21.4S, v27.4S, v31.s[0] +sub v27.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v12.4S, v4.s[2] +mul v12.4S, v12.4S,v0.s[2] +mla v12.4S, v21.4S, v31.s[0] +sub v21.4s, v24.4s, v12.4s +add v24.4s, v24.4s, v12.4s +trn1 v12.4S, v17.4S, v27.4S +trn2 v22.4S, v17.4S, v27.4S +trn1 v20.4S, v24.4S, v21.4S +trn2 v9.4S, v24.4S, v21.4S 
+trn2 v24.2D, v12.2D, v20.2D +trn2 v21.2D, v22.2D, v9.2D +trn1 v17.2D, v12.2D, v20.2D +trn1 v27.2D, v22.2D, v9.2D +sqrdmulh v9.4S, v24.4S, v13.4S +mul v24.4S, v24.4S,v16.4S +mla v24.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v24.4s +add v17.4s, v17.4s, v24.4s +sqrdmulh v24.4S, v21.4S, v13.4S +mul v21.4S, v21.4S,v16.4S +mla v21.4S, v24.4S, v31.s[0] +sub v24.4s, v27.4s, v21.4s +add v27.4s, v27.4s, v21.4s +sqrdmulh v21.4S, v27.4S, v29.4S +mul v27.4S, v27.4S,v23.4S +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v17.4s, v27.4s +add v17.4s, v17.4s, v27.4s +sqrdmulh v27.4S, v24.4S, v30.4S +mul v24.4S, v24.4S,v5.4S +mla v24.4S, v27.4S, v31.s[0] +sub v27.4s, v9.4s, v24.4s +add v9.4s, v9.4s, v24.4s +str q17, [x0, #512] +str q21, [x0, #528] +str q9, [x0, #544] +str q27, [x0, #560] +ldr q27, [x17, #+1280] +ldr q9, [x17, #+1296] +ldr q21, [x17, #+1312] +ldr q17, [x17, #+1328] +ldr q24, [x17, #+1344] +ldr q22, [x17, #+1360] +ldr q20, [x17, #+1376] +ldr q12, [x17, #+1392] +ldr q30, [x0, #608] +ldr q5, [x0, #624] +ldr q29, [x0, #576] +ldr q23, [x0, #592] +sqrdmulh v13.4S, v30.4S, v9.s[0] +mul v30.4S, v30.4S,v27.s[0] +mla v30.4S, v13.4S, v31.s[0] +sub v13.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +sqrdmulh v30.4S, v5.4S, v9.s[0] +mul v5.4S, v5.4S,v27.s[0] +mla v5.4S, v30.4S, v31.s[0] +sub v30.4s, v23.4s, v5.4s +add v23.4s, v23.4s, v5.4s +sqrdmulh v5.4S, v23.4S, v9.s[1] +mul v23.4S, v23.4S,v27.s[1] +mla v23.4S, v5.4S, v31.s[0] +sub v5.4s, v29.4s, v23.4s +add v29.4s, v29.4s, v23.4s +sqrdmulh v23.4S, v30.4S, v9.s[2] +mul v30.4S, v30.4S,v27.s[2] +mla v30.4S, v23.4S, v31.s[0] +sub v23.4s, v13.4s, v30.4s +add v13.4s, v13.4s, v30.4s +trn1 v30.4S, v29.4S, v5.4S +trn2 v16.4S, v29.4S, v5.4S +trn1 v4.4S, v13.4S, v23.4S +trn2 v0.4S, v13.4S, v23.4S +trn2 v13.2D, v30.2D, v4.2D +trn2 v23.2D, v16.2D, v0.2D +trn1 v29.2D, v30.2D, v4.2D +trn1 v5.2D, v16.2D, v0.2D +sqrdmulh v0.4S, v13.4S, v17.4S +mul v13.4S, v13.4S,v21.4S +mla v13.4S, v0.4S, v31.s[0] +sub v0.4s, v29.4s, v13.4s +add v29.4s, v29.4s, 
v13.4s +sqrdmulh v13.4S, v23.4S, v17.4S +mul v23.4S, v23.4S,v21.4S +mla v23.4S, v13.4S, v31.s[0] +sub v13.4s, v5.4s, v23.4s +add v5.4s, v5.4s, v23.4s +sqrdmulh v23.4S, v5.4S, v22.4S +mul v5.4S, v5.4S,v24.4S +mla v5.4S, v23.4S, v31.s[0] +sub v23.4s, v29.4s, v5.4s +add v29.4s, v29.4s, v5.4s +sqrdmulh v5.4S, v13.4S, v12.4S +mul v13.4S, v13.4S,v20.4S +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v0.4s, v13.4s +add v0.4s, v0.4s, v13.4s +str q29, [x0, #576] +str q23, [x0, #592] +str q0, [x0, #608] +str q5, [x0, #624] +ldr q5, [x17, #+1408] +ldr q0, [x17, #+1424] +ldr q23, [x17, #+1440] +ldr q29, [x17, #+1456] +ldr q13, [x17, #+1472] +ldr q16, [x17, #+1488] +ldr q4, [x17, #+1504] +ldr q30, [x17, #+1520] +ldr q12, [x0, #672] +ldr q20, [x0, #688] +ldr q22, [x0, #640] +ldr q24, [x0, #656] +sqrdmulh v17.4S, v12.4S, v0.s[0] +mul v12.4S, v12.4S,v5.s[0] +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v20.4S, v0.s[0] +mul v20.4S, v20.4S,v5.s[0] +mla v20.4S, v12.4S, v31.s[0] +sub v12.4s, v24.4s, v20.4s +add v24.4s, v24.4s, v20.4s +sqrdmulh v20.4S, v24.4S, v0.s[1] +mul v24.4S, v24.4S,v5.s[1] +mla v24.4S, v20.4S, v31.s[0] +sub v20.4s, v22.4s, v24.4s +add v22.4s, v22.4s, v24.4s +sqrdmulh v24.4S, v12.4S, v0.s[2] +mul v12.4S, v12.4S,v5.s[2] +mla v12.4S, v24.4S, v31.s[0] +sub v24.4s, v17.4s, v12.4s +add v17.4s, v17.4s, v12.4s +trn1 v12.4S, v22.4S, v20.4S +trn2 v21.4S, v22.4S, v20.4S +trn1 v9.4S, v17.4S, v24.4S +trn2 v27.4S, v17.4S, v24.4S +trn2 v17.2D, v12.2D, v9.2D +trn2 v24.2D, v21.2D, v27.2D +trn1 v22.2D, v12.2D, v9.2D +trn1 v20.2D, v21.2D, v27.2D +sqrdmulh v27.4S, v17.4S, v29.4S +mul v17.4S, v17.4S,v23.4S +mla v17.4S, v27.4S, v31.s[0] +sub v27.4s, v22.4s, v17.4s +add v22.4s, v22.4s, v17.4s +sqrdmulh v17.4S, v24.4S, v29.4S +mul v24.4S, v24.4S,v23.4S +mla v24.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v24.4s +add v20.4s, v20.4s, v24.4s +sqrdmulh v24.4S, v20.4S, v16.4S +mul v20.4S, v20.4S,v13.4S +mla v20.4S, v24.4S, v31.s[0] +sub 
v24.4s, v22.4s, v20.4s +add v22.4s, v22.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v30.4S +mul v17.4S, v17.4S,v4.4S +mla v17.4S, v20.4S, v31.s[0] +sub v20.4s, v27.4s, v17.4s +add v27.4s, v27.4s, v17.4s +str q22, [x0, #640] +str q24, [x0, #656] +str q27, [x0, #672] +str q20, [x0, #688] +ldr q20, [x17, #+1536] +ldr q27, [x17, #+1552] +ldr q24, [x17, #+1568] +ldr q22, [x17, #+1584] +ldr q17, [x17, #+1600] +ldr q21, [x17, #+1616] +ldr q9, [x17, #+1632] +ldr q12, [x17, #+1648] +ldr q30, [x0, #736] +ldr q4, [x0, #752] +ldr q16, [x0, #704] +ldr q13, [x0, #720] +sqrdmulh v29.4S, v30.4S, v27.s[0] +mul v30.4S, v30.4S,v20.s[0] +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v16.4s, v30.4s +add v16.4s, v16.4s, v30.4s +sqrdmulh v30.4S, v4.4S, v27.s[0] +mul v4.4S, v4.4S,v20.s[0] +mla v4.4S, v30.4S, v31.s[0] +sub v30.4s, v13.4s, v4.4s +add v13.4s, v13.4s, v4.4s +sqrdmulh v4.4S, v13.4S, v27.s[1] +mul v13.4S, v13.4S,v20.s[1] +mla v13.4S, v4.4S, v31.s[0] +sub v4.4s, v16.4s, v13.4s +add v16.4s, v16.4s, v13.4s +sqrdmulh v13.4S, v30.4S, v27.s[2] +mul v30.4S, v30.4S,v20.s[2] +mla v30.4S, v13.4S, v31.s[0] +sub v13.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +trn1 v30.4S, v16.4S, v4.4S +trn2 v23.4S, v16.4S, v4.4S +trn1 v0.4S, v29.4S, v13.4S +trn2 v5.4S, v29.4S, v13.4S +trn2 v29.2D, v30.2D, v0.2D +trn2 v13.2D, v23.2D, v5.2D +trn1 v16.2D, v30.2D, v0.2D +trn1 v4.2D, v23.2D, v5.2D +sqrdmulh v5.4S, v29.4S, v22.4S +mul v29.4S, v29.4S,v24.4S +mla v29.4S, v5.4S, v31.s[0] +sub v5.4s, v16.4s, v29.4s +add v16.4s, v16.4s, v29.4s +sqrdmulh v29.4S, v13.4S, v22.4S +mul v13.4S, v13.4S,v24.4S +mla v13.4S, v29.4S, v31.s[0] +sub v29.4s, v4.4s, v13.4s +add v4.4s, v4.4s, v13.4s +sqrdmulh v13.4S, v4.4S, v21.4S +mul v4.4S, v4.4S,v17.4S +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v4.4s +add v16.4s, v16.4s, v4.4s +sqrdmulh v4.4S, v29.4S, v12.4S +mul v29.4S, v29.4S,v9.4S +mla v29.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v29.4s +add v5.4s, v5.4s, v29.4s +str q16, [x0, #704] +str q13, [x0, #720] +str q5, [x0, #736] +str 
q4, [x0, #752] +ldr q4, [x17, #+1664] +ldr q5, [x17, #+1680] +ldr q13, [x17, #+1696] +ldr q16, [x17, #+1712] +ldr q29, [x17, #+1728] +ldr q23, [x17, #+1744] +ldr q0, [x17, #+1760] +ldr q30, [x17, #+1776] +ldr q12, [x0, #800] +ldr q9, [x0, #816] +ldr q21, [x0, #768] +ldr q17, [x0, #784] +sqrdmulh v22.4S, v12.4S, v5.s[0] +mul v12.4S, v12.4S,v4.s[0] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v9.4S, v5.s[0] +mul v9.4S, v9.4S,v4.s[0] +mla v9.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v9.4s +add v17.4s, v17.4s, v9.4s +sqrdmulh v9.4S, v17.4S, v5.s[1] +mul v17.4S, v17.4S,v4.s[1] +mla v17.4S, v9.4S, v31.s[0] +sub v9.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v12.4S, v5.s[2] +mul v12.4S, v12.4S,v4.s[2] +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +trn1 v12.4S, v21.4S, v9.4S +trn2 v24.4S, v21.4S, v9.4S +trn1 v27.4S, v22.4S, v17.4S +trn2 v20.4S, v22.4S, v17.4S +trn2 v22.2D, v12.2D, v27.2D +trn2 v17.2D, v24.2D, v20.2D +trn1 v21.2D, v12.2D, v27.2D +trn1 v9.2D, v24.2D, v20.2D +sqrdmulh v20.4S, v22.4S, v16.4S +mul v22.4S, v22.4S,v13.4S +mla v22.4S, v20.4S, v31.s[0] +sub v20.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v16.4S +mul v17.4S, v17.4S,v13.4S +mla v17.4S, v22.4S, v31.s[0] +sub v22.4s, v9.4s, v17.4s +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v9.4S, v23.4S +mul v9.4S, v9.4S,v29.4S +mla v9.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v9.4s +add v21.4s, v21.4s, v9.4s +sqrdmulh v9.4S, v22.4S, v30.4S +mul v22.4S, v22.4S,v0.4S +mla v22.4S, v9.4S, v31.s[0] +sub v9.4s, v20.4s, v22.4s +add v20.4s, v20.4s, v22.4s +str q21, [x0, #768] +str q17, [x0, #784] +str q20, [x0, #800] +str q9, [x0, #816] +ldr q9, [x17, #+1792] +ldr q20, [x17, #+1808] +ldr q17, [x17, #+1824] +ldr q21, [x17, #+1840] +ldr q22, [x17, #+1856] +ldr q24, [x17, #+1872] +ldr q27, [x17, #+1888] +ldr q12, [x17, #+1904] +ldr q30, [x0, #864] +ldr q0, [x0, #880] +ldr q23, [x0, 
#832] +ldr q29, [x0, #848] +sqrdmulh v16.4S, v30.4S, v20.s[0] +mul v30.4S, v30.4S,v9.s[0] +mla v30.4S, v16.4S, v31.s[0] +sub v16.4s, v23.4s, v30.4s +add v23.4s, v23.4s, v30.4s +sqrdmulh v30.4S, v0.4S, v20.s[0] +mul v0.4S, v0.4S,v9.s[0] +mla v0.4S, v30.4S, v31.s[0] +sub v30.4s, v29.4s, v0.4s +add v29.4s, v29.4s, v0.4s +sqrdmulh v0.4S, v29.4S, v20.s[1] +mul v29.4S, v29.4S,v9.s[1] +mla v29.4S, v0.4S, v31.s[0] +sub v0.4s, v23.4s, v29.4s +add v23.4s, v23.4s, v29.4s +sqrdmulh v29.4S, v30.4S, v20.s[2] +mul v30.4S, v30.4S,v9.s[2] +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v16.4s, v30.4s +add v16.4s, v16.4s, v30.4s +trn1 v30.4S, v23.4S, v0.4S +trn2 v13.4S, v23.4S, v0.4S +trn1 v5.4S, v16.4S, v29.4S +trn2 v4.4S, v16.4S, v29.4S +trn2 v16.2D, v30.2D, v5.2D +trn2 v29.2D, v13.2D, v4.2D +trn1 v23.2D, v30.2D, v5.2D +trn1 v0.2D, v13.2D, v4.2D +sqrdmulh v4.4S, v16.4S, v21.4S +mul v16.4S, v16.4S,v17.4S +mla v16.4S, v4.4S, v31.s[0] +sub v4.4s, v23.4s, v16.4s +add v23.4s, v23.4s, v16.4s +sqrdmulh v16.4S, v29.4S, v21.4S +mul v29.4S, v29.4S,v17.4S +mla v29.4S, v16.4S, v31.s[0] +sub v16.4s, v0.4s, v29.4s +add v0.4s, v0.4s, v29.4s +sqrdmulh v29.4S, v0.4S, v24.4S +mul v0.4S, v0.4S,v22.4S +mla v0.4S, v29.4S, v31.s[0] +sub v29.4s, v23.4s, v0.4s +add v23.4s, v23.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v12.4S +mul v16.4S, v16.4S,v27.4S +mla v16.4S, v0.4S, v31.s[0] +sub v0.4s, v4.4s, v16.4s +add v4.4s, v4.4s, v16.4s +str q23, [x0, #832] +str q29, [x0, #848] +str q4, [x0, #864] +str q0, [x0, #880] +ldr q0, [x17, #+1920] +ldr q4, [x17, #+1936] +ldr q29, [x17, #+1952] +ldr q23, [x17, #+1968] +ldr q16, [x17, #+1984] +ldr q13, [x17, #+2000] +ldr q5, [x17, #+2016] +ldr q30, [x17, #+2032] +ldr q12, [x0, #928] +ldr q27, [x0, #944] +ldr q24, [x0, #896] +ldr q22, [x0, #912] +sqrdmulh v21.4S, v12.4S, v4.s[0] +mul v12.4S, v12.4S,v0.s[0] +mla v12.4S, v21.4S, v31.s[0] +sub v21.4s, v24.4s, v12.4s +add v24.4s, v24.4s, v12.4s +sqrdmulh v12.4S, v27.4S, v4.s[0] +mul v27.4S, v27.4S,v0.s[0] +mla v27.4S, v12.4S, v31.s[0] 
+sub v12.4s, v22.4s, v27.4s +add v22.4s, v22.4s, v27.4s +sqrdmulh v27.4S, v22.4S, v4.s[1] +mul v22.4S, v22.4S,v0.s[1] +mla v22.4S, v27.4S, v31.s[0] +sub v27.4s, v24.4s, v22.4s +add v24.4s, v24.4s, v22.4s +sqrdmulh v22.4S, v12.4S, v4.s[2] +mul v12.4S, v12.4S,v0.s[2] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +trn1 v12.4S, v24.4S, v27.4S +trn2 v17.4S, v24.4S, v27.4S +trn1 v20.4S, v21.4S, v22.4S +trn2 v9.4S, v21.4S, v22.4S +trn2 v21.2D, v12.2D, v20.2D +trn2 v22.2D, v17.2D, v9.2D +trn1 v24.2D, v12.2D, v20.2D +trn1 v27.2D, v17.2D, v9.2D +sqrdmulh v9.4S, v21.4S, v23.4S +mul v21.4S, v21.4S,v29.4S +mla v21.4S, v9.4S, v31.s[0] +sub v9.4s, v24.4s, v21.4s +add v24.4s, v24.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v23.4S +mul v22.4S, v22.4S,v29.4S +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v27.4s, v22.4s +add v27.4s, v27.4s, v22.4s +sqrdmulh v22.4S, v27.4S, v13.4S +mul v27.4S, v27.4S,v16.4S +mla v27.4S, v22.4S, v31.s[0] +sub v22.4s, v24.4s, v27.4s +add v24.4s, v24.4s, v27.4s +sqrdmulh v27.4S, v21.4S, v30.4S +mul v21.4S, v21.4S,v5.4S +mla v21.4S, v27.4S, v31.s[0] +sub v27.4s, v9.4s, v21.4s +add v9.4s, v9.4s, v21.4s +str q24, [x0, #896] +str q22, [x0, #912] +str q9, [x0, #928] +str q27, [x0, #944] +ldr q27, [x17, #+2048] +ldr q9, [x17, #+2064] +ldr q22, [x17, #+2080] +ldr q24, [x17, #+2096] +ldr q21, [x17, #+2112] +ldr q17, [x17, #+2128] +ldr q20, [x17, #+2144] +ldr q12, [x17, #+2160] +ldr q30, [x0, #992] +ldr q5, [x0, #1008] +ldr q13, [x0, #960] +ldr q16, [x0, #976] +sqrdmulh v23.4S, v30.4S, v9.s[0] +mul v30.4S, v30.4S,v27.s[0] +mla v30.4S, v23.4S, v31.s[0] +sub v23.4s, v13.4s, v30.4s +add v13.4s, v13.4s, v30.4s +sqrdmulh v30.4S, v5.4S, v9.s[0] +mul v5.4S, v5.4S,v27.s[0] +mla v5.4S, v30.4S, v31.s[0] +sub v30.4s, v16.4s, v5.4s +add v16.4s, v16.4s, v5.4s +sqrdmulh v5.4S, v16.4S, v9.s[1] +mul v16.4S, v16.4S,v27.s[1] +mla v16.4S, v5.4S, v31.s[0] +sub v5.4s, v13.4s, v16.4s +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v30.4S, v9.s[2] +mul 
v30.4S, v30.4S,v27.s[2] +mla v30.4S, v16.4S, v31.s[0] +sub v16.4s, v23.4s, v30.4s +add v23.4s, v23.4s, v30.4s +trn1 v30.4S, v13.4S, v5.4S +trn2 v29.4S, v13.4S, v5.4S +trn1 v4.4S, v23.4S, v16.4S +trn2 v0.4S, v23.4S, v16.4S +trn2 v23.2D, v30.2D, v4.2D +trn2 v16.2D, v29.2D, v0.2D +trn1 v13.2D, v30.2D, v4.2D +trn1 v5.2D, v29.2D, v0.2D +sqrdmulh v0.4S, v23.4S, v24.4S +mul v23.4S, v23.4S,v22.4S +mla v23.4S, v0.4S, v31.s[0] +sub v0.4s, v13.4s, v23.4s +add v13.4s, v13.4s, v23.4s +sqrdmulh v23.4S, v16.4S, v24.4S +mul v16.4S, v16.4S,v22.4S +mla v16.4S, v23.4S, v31.s[0] +sub v23.4s, v5.4s, v16.4s +add v5.4s, v5.4s, v16.4s +sqrdmulh v16.4S, v5.4S, v17.4S +mul v5.4S, v5.4S,v21.4S +mla v5.4S, v16.4S, v31.s[0] +sub v16.4s, v13.4s, v5.4s +add v13.4s, v13.4s, v5.4s +sqrdmulh v5.4S, v23.4S, v12.4S +mul v23.4S, v23.4S,v20.4S +mla v23.4S, v5.4S, v31.s[0] +sub v5.4s, v0.4s, v23.4s +add v0.4s, v0.4s, v23.4s +str q13, [x0, #960] +str q16, [x0, #976] +str q0, [x0, #992] +str q5, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2476 +// Instruction count: 2472 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_16_0.s b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_16_0.s new file mode 100644 index 0000000..2b07129 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_16_0.s @@ -0,0 +1,2506 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without 
restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: // Negated modulus vector: the routine in this file loads these 16 bytes into q31, and the visible mla steps read lane 0 only (v31.s[0]) +.word -33556993 // lane 0: minus 33556993, the first number in this file's name +.word 0 // lanes 1-3 are zero and fill out the 16-byte load +.word 0 +.word 0 +.align 6 +roots_merged: // Root table read through x17 in 16-byte rows: a row of 4 multipliers used by mul is followed by a row of 4 companions used by sqrdmulh; for the entries checked each companion equals round(multiplier * 2^31 / 33556993), e.g. 17702291 -> 1132860160; entries labelled Layer None are zero padding +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, 
block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // 
Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 
// Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 
37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // 
Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, 
block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 
29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 +.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, 
block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 
+.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // 
Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_16_0 +.global _ntt_u32_full_neon_asm_var_4_4_16_0 +ntt_u32_full_neon_asm_var_4_4_16_0: +_ntt_u32_full_neon_asm_var_4_4_16_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, 
modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x0, #992] +ldr q29, [x17, #+0] +ldr q28, [x17, #+16] +sqrdmulh v27.4S, v30.4S, v28.s[0] +mul v30.4S, v30.4S,v29.s[0] +ldr q26, [x0, #928] +sqrdmulh v25.4S, v26.4S, v28.s[0] +mul v26.4S, v26.4S,v29.s[0] +ldr q24, [x0, #864] +sqrdmulh v23.4S, v24.4S, v28.s[0] +mul v24.4S, v24.4S,v29.s[0] +ldr q22, [x0, #800] +sqrdmulh v21.4S, v22.4S, v28.s[0] +mul v22.4S, v22.4S,v29.s[0] +ldr q20, [x0, #736] +mla v30.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v20.4S, v28.s[0] +ldr q19, [x0, #672] +mla v26.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v19.4S, v28.s[0] +nop +ldr q18, [x0, #608] +mla v24.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v18.4S, v28.s[0] +nop +ldr q17, [x0, #544] +mla v22.4S, v21.4S, v31.s[0] +nop +sqrdmulh v21.4S, v17.4S, v28.s[0] +ldr q16, [x0, #480] +ldr q3, [x0, #416] +mul v20.4S, v20.4S,v29.s[0] +sub v2.4s, v16.4s, v30.4s +mul v19.4S, v19.4S,v29.s[0] +add v16.4s, v16.4s, v30.4s +ldr q30, [x0, #352] +ldr q1, [x0, #288] +mla v20.4S, v27.4S, v31.s[0] +sub v27.4s, v3.4s, v26.4s +mla v19.4S, v25.4S, v31.s[0] +add v3.4s, v3.4s, v26.4s +ldr q26, [x0, #224] +ldr q25, [x0, #160] +mul v18.4S, v18.4S,v29.s[0] +sub v0.4s, v30.4s, v24.4s +mul v17.4S, v17.4S,v29.s[0] +add v30.4s, v30.4s, v24.4s +ldr q24, [x0, #96] +ldr q15, [x0, #32] +mla v18.4S, v23.4S, v31.s[0] +sub v23.4s, v1.4s, v22.4s +mla v17.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v2.4S, v28.s[2] +nop +mul v2.4S, v2.4S,v29.s[2] +nop +sqrdmulh v21.4S, v27.4S, v28.s[2] +sub v14.4s, v26.4s, v20.4s +mul v27.4S, v27.4S,v29.s[2] +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v28.s[1] +sub v13.4s, v25.4s, v19.4s +mul v16.4S, v16.4S,v29.s[1] +add v25.4s, v25.4s, v19.4s +sqrdmulh v19.4S, v3.4S, v28.s[1] +sub v12.4s, v24.4s, v18.4s +mul v3.4S, v3.4S,v29.s[1] +add v24.4s, v24.4s, v18.4s +mla v2.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v17.4s +sqrdmulh v18.4S, v0.4S, v28.s[2] +add v15.4s, v15.4s, v17.4s +mla v27.4S, v21.4S, v31.s[0] +nop 
+sqrdmulh v21.4S, v23.4S, v28.s[2] +nop +mla v16.4S, v20.4S, v31.s[0] +nop +sqrdmulh v20.4S, v30.4S, v28.s[1] +nop +mla v3.4S, v19.4S, v31.s[0] +nop +sqrdmulh v19.4S, v1.4S, v28.s[1] +nop +ldr q17, [x17, #+32] +ldr q11, [x17, #+48] +mul v0.4S, v0.4S,v29.s[2] +sub v10.4s, v14.4s, v2.4s +mul v23.4S, v23.4S,v29.s[2] +add v14.4s, v14.4s, v2.4s +mla v0.4S, v18.4S, v31.s[0] +sub v18.4s, v13.4s, v27.4s +mla v23.4S, v21.4S, v31.s[0] +add v13.4s, v13.4s, v27.4s +mul v30.4S, v30.4S,v29.s[1] +sub v27.4s, v26.4s, v16.4s +mul v1.4S, v1.4S,v29.s[1] +add v26.4s, v26.4s, v16.4s +mla v30.4S, v20.4S, v31.s[0] +sub v20.4s, v25.4s, v3.4s +mla v1.4S, v19.4S, v31.s[0] +add v25.4s, v25.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v11.s[3] +nop +mul v10.4S, v10.4S,v17.s[3] +nop +sqrdmulh v19.4S, v14.4S, v11.s[2] +sub v16.4s, v12.4s, v0.4s +mul v14.4S, v14.4S,v17.s[2] +add v12.4s, v12.4s, v0.4s +sqrdmulh v0.4S, v27.4S, v11.s[1] +sub v21.4s, v22.4s, v23.4s +mul v27.4S, v27.4S,v17.s[1] +add v22.4s, v22.4s, v23.4s +sqrdmulh v23.4S, v26.4S, v11.s[0] +sub v2.4s, v24.4s, v30.4s +mul v26.4S, v26.4S,v17.s[0] +add v24.4s, v24.4s, v30.4s +ldr q30, [x17, #+96] +ldr q9, [x17, #+112] +mla v10.4S, v3.4S, v31.s[0] +sub v3.4s, v15.4s, v1.4s +sqrdmulh v8.4S, v18.4S, v11.s[3] +add v15.4s, v15.4s, v1.4s +mla v14.4S, v19.4S, v31.s[0] +nop +sqrdmulh v19.4S, v13.4S, v11.s[2] +nop +mla v27.4S, v0.4S, v31.s[0] +nop +sqrdmulh v0.4S, v20.4S, v11.s[1] +nop +mla v26.4S, v23.4S, v31.s[0] +nop +sqrdmulh v23.4S, v25.4S, v11.s[0] +nop +ldr q1, [x17, #+64] +ldr q7, [x17, #+80] +mul v18.4S, v18.4S,v17.s[3] +sub v6.4s, v16.4s, v10.4s +mul v13.4S, v13.4S,v17.s[2] +add v16.4s, v16.4s, v10.4s +mla v18.4S, v8.4S, v31.s[0] +sub v8.4s, v12.4s, v14.4s +mla v13.4S, v19.4S, v31.s[0] +add v12.4s, v12.4s, v14.4s +mul v20.4S, v20.4S,v17.s[1] +sub v14.4s, v2.4s, v27.4s +mul v25.4S, v25.4S,v17.s[0] +add v2.4s, v2.4s, v27.4s +mla v20.4S, v0.4S, v31.s[0] +sub v0.4s, v24.4s, v26.4s +mla v25.4S, v23.4S, v31.s[0] +add v24.4s, v24.4s, v26.4s +sqrdmulh 
v26.4S, v6.4S, v9.s[3] +nop +mul v6.4S, v6.4S,v30.s[3] +nop +sqrdmulh v23.4S, v16.4S, v9.s[2] +sub v27.4s, v21.4s, v18.4s +mul v16.4S, v16.4S,v30.s[2] +add v21.4s, v21.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v9.s[1] +sub v19.4s, v22.4s, v13.4s +mul v8.4S, v8.4S,v30.s[1] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v12.4S, v9.s[0] +sub v10.4s, v3.4s, v20.4s +mul v12.4S, v12.4S,v30.s[0] +add v3.4s, v3.4s, v20.4s +mla v6.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v25.4s +sqrdmulh v20.4S, v14.4S, v7.s[3] +add v15.4s, v15.4s, v25.4s +mla v16.4S, v23.4S, v31.s[0] +sub v23.4s, v27.4s, v6.4s +sqrdmulh v25.4S, v2.4S, v7.s[2] +add v27.4s, v27.4s, v6.4s +mla v8.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v16.4s +sqrdmulh v6.4S, v0.4S, v7.s[1] +add v21.4s, v21.4s, v16.4s +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v19.4s, v8.4s +sqrdmulh v16.4S, v24.4S, v7.s[0] +add v19.4s, v19.4s, v8.4s +mul v14.4S, v14.4S,v1.s[3] +sub v8.4s, v22.4s, v12.4s +mul v2.4S, v2.4S,v1.s[2] +add v22.4s, v22.4s, v12.4s +mla v14.4S, v20.4S, v31.s[0] +str q23, [x0, #992] +mla v2.4S, v25.4S, v31.s[0] +str q27, [x0, #928] +mul v0.4S, v0.4S,v1.s[1] +str q18, [x0, #864] +mul v24.4S, v24.4S,v1.s[0] +str q21, [x0, #800] +mla v0.4S, v6.4S, v31.s[0] +str q13, [x0, #736] +mla v24.4S, v16.4S, v31.s[0] +str q19, [x0, #672] +ldr q19, [x0, #1008] +sqrdmulh v16.4S, v19.4S, v28.s[0] +str q8, [x0, #608] +mul v19.4S, v19.4S,v29.s[0] +str q22, [x0, #544] +ldr q22, [x0, #944] +sqrdmulh v8.4S, v22.4S, v28.s[0] +sub v13.4s, v10.4s, v14.4s +str q13, [x0, #480] +mul v22.4S, v22.4S,v29.s[0] +add v10.4s, v10.4s, v14.4s +ldr q14, [x0, #880] +sqrdmulh v13.4S, v14.4S, v28.s[0] +sub v6.4s, v3.4s, v2.4s +str q10, [x0, #416] +mul v14.4S, v14.4S,v29.s[0] +add v3.4s, v3.4s, v2.4s +ldr q2, [x0, #816] +sqrdmulh v10.4S, v2.4S, v28.s[0] +sub v21.4s, v26.4s, v0.4s +str q6, [x0, #352] +mul v2.4S, v2.4S,v29.s[0] +add v26.4s, v26.4s, v0.4s +ldr q0, [x0, #752] +mla v19.4S, v16.4S, v31.s[0] +sub v16.4s, v15.4s, v24.4s +str q3, [x0, #288] +sqrdmulh 
v3.4S, v0.4S, v28.s[0] +add v15.4s, v15.4s, v24.4s +ldr q24, [x0, #688] +mla v22.4S, v8.4S, v31.s[0] +str q21, [x0, #224] +sqrdmulh v21.4S, v24.4S, v28.s[0] +nop +ldr q8, [x0, #624] +mla v14.4S, v13.4S, v31.s[0] +str q26, [x0, #160] +sqrdmulh v26.4S, v8.4S, v28.s[0] +nop +ldr q13, [x0, #560] +mla v2.4S, v10.4S, v31.s[0] +nop +sqrdmulh v10.4S, v13.4S, v28.s[0] +str q16, [x0, #96] +ldr q16, [x0, #496] +ldr q6, [x0, #432] +mul v0.4S, v0.4S,v29.s[0] +sub v18.4s, v16.4s, v19.4s +str q15, [x0, #32] +mul v24.4S, v24.4S,v29.s[0] +add v16.4s, v16.4s, v19.4s +ldr q19, [x0, #368] +ldr q15, [x0, #304] +mla v0.4S, v3.4S, v31.s[0] +sub v3.4s, v6.4s, v22.4s +mla v24.4S, v21.4S, v31.s[0] +add v6.4s, v6.4s, v22.4s +ldr q22, [x0, #240] +ldr q21, [x0, #176] +mul v8.4S, v8.4S,v29.s[0] +sub v27.4s, v19.4s, v14.4s +mul v13.4S, v13.4S,v29.s[0] +add v19.4s, v19.4s, v14.4s +ldr q14, [x0, #112] +ldr q25, [x0, #48] +mla v8.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v2.4s +mla v13.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v2.4s +sqrdmulh v2.4S, v18.4S, v28.s[2] +nop +mul v18.4S, v18.4S,v29.s[2] +nop +sqrdmulh v10.4S, v3.4S, v28.s[2] +sub v23.4s, v22.4s, v0.4s +mul v3.4S, v3.4S,v29.s[2] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v28.s[1] +sub v20.4s, v21.4s, v24.4s +mul v16.4S, v16.4S,v29.s[1] +add v21.4s, v21.4s, v24.4s +sqrdmulh v24.4S, v6.4S, v28.s[1] +sub v12.4s, v14.4s, v8.4s +mul v6.4S, v6.4S,v29.s[1] +add v14.4s, v14.4s, v8.4s +mla v18.4S, v2.4S, v31.s[0] +sub v2.4s, v25.4s, v13.4s +sqrdmulh v8.4S, v27.4S, v28.s[2] +add v25.4s, v25.4s, v13.4s +mla v3.4S, v10.4S, v31.s[0] +nop +sqrdmulh v10.4S, v26.4S, v28.s[2] +nop +mla v16.4S, v0.4S, v31.s[0] +nop +sqrdmulh v0.4S, v19.4S, v28.s[1] +nop +mla v6.4S, v24.4S, v31.s[0] +nop +sqrdmulh v24.4S, v15.4S, v28.s[1] +nop +mul v27.4S, v27.4S,v29.s[2] +sub v13.4s, v23.4s, v18.4s +mul v26.4S, v26.4S,v29.s[2] +add v23.4s, v23.4s, v18.4s +mla v27.4S, v8.4S, v31.s[0] +sub v8.4s, v20.4s, v3.4s +mla v26.4S, v10.4S, v31.s[0] +add v20.4s, v20.4s, 
v3.4s +mul v19.4S, v19.4S,v29.s[1] +sub v3.4s, v22.4s, v16.4s +mul v15.4S, v15.4S,v29.s[1] +add v22.4s, v22.4s, v16.4s +mla v19.4S, v0.4S, v31.s[0] +sub v0.4s, v21.4s, v6.4s +mla v15.4S, v24.4S, v31.s[0] +add v21.4s, v21.4s, v6.4s +sqrdmulh v6.4S, v13.4S, v11.s[3] +nop +mul v13.4S, v13.4S,v17.s[3] +nop +sqrdmulh v24.4S, v23.4S, v11.s[2] +sub v16.4s, v12.4s, v27.4s +mul v23.4S, v23.4S,v17.s[2] +add v12.4s, v12.4s, v27.4s +sqrdmulh v27.4S, v3.4S, v11.s[1] +sub v10.4s, v2.4s, v26.4s +mul v3.4S, v3.4S,v17.s[1] +add v2.4s, v2.4s, v26.4s +sqrdmulh v26.4S, v22.4S, v11.s[0] +sub v18.4s, v14.4s, v19.4s +mul v22.4S, v22.4S,v17.s[0] +add v14.4s, v14.4s, v19.4s +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v25.4s, v15.4s +sqrdmulh v19.4S, v8.4S, v11.s[3] +add v25.4s, v25.4s, v15.4s +mla v23.4S, v24.4S, v31.s[0] +nop +sqrdmulh v24.4S, v20.4S, v11.s[2] +nop +mla v3.4S, v27.4S, v31.s[0] +nop +sqrdmulh v27.4S, v0.4S, v11.s[1] +nop +mla v22.4S, v26.4S, v31.s[0] +nop +sqrdmulh v26.4S, v21.4S, v11.s[0] +nop +mul v8.4S, v8.4S,v17.s[3] +sub v15.4s, v16.4s, v13.4s +mul v20.4S, v20.4S,v17.s[2] +add v16.4s, v16.4s, v13.4s +mla v8.4S, v19.4S, v31.s[0] +sub v19.4s, v12.4s, v23.4s +mla v20.4S, v24.4S, v31.s[0] +add v12.4s, v12.4s, v23.4s +mul v0.4S, v0.4S,v17.s[1] +sub v23.4s, v18.4s, v3.4s +mul v21.4S, v21.4S,v17.s[0] +add v18.4s, v18.4s, v3.4s +mla v0.4S, v27.4S, v31.s[0] +sub v27.4s, v14.4s, v22.4s +mla v21.4S, v26.4S, v31.s[0] +add v14.4s, v14.4s, v22.4s +sqrdmulh v22.4S, v15.4S, v9.s[3] +nop +mul v15.4S, v15.4S,v30.s[3] +nop +sqrdmulh v26.4S, v16.4S, v9.s[2] +sub v3.4s, v10.4s, v8.4s +mul v16.4S, v16.4S,v30.s[2] +add v10.4s, v10.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v9.s[1] +sub v24.4s, v2.4s, v20.4s +mul v19.4S, v19.4S,v30.s[1] +add v2.4s, v2.4s, v20.4s +sqrdmulh v20.4S, v12.4S, v9.s[0] +sub v13.4s, v6.4s, v0.4s +mul v12.4S, v12.4S,v30.s[0] +add v6.4s, v6.4s, v0.4s +mla v15.4S, v22.4S, v31.s[0] +sub v22.4s, v25.4s, v21.4s +sqrdmulh v0.4S, v23.4S, v7.s[3] +add v25.4s, v25.4s, v21.4s +mla 
v16.4S, v26.4S, v31.s[0] +sub v26.4s, v3.4s, v15.4s +sqrdmulh v21.4S, v18.4S, v7.s[2] +add v3.4s, v3.4s, v15.4s +mla v19.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v16.4s +sqrdmulh v15.4S, v27.4S, v7.s[1] +add v10.4s, v10.4s, v16.4s +mla v12.4S, v20.4S, v31.s[0] +sub v20.4s, v24.4s, v19.4s +sqrdmulh v16.4S, v14.4S, v7.s[0] +add v24.4s, v24.4s, v19.4s +mul v23.4S, v23.4S,v1.s[3] +sub v19.4s, v2.4s, v12.4s +mul v18.4S, v18.4S,v1.s[2] +add v2.4s, v2.4s, v12.4s +mla v23.4S, v0.4S, v31.s[0] +str q26, [x0, #1008] +mla v18.4S, v21.4S, v31.s[0] +str q3, [x0, #944] +mul v27.4S, v27.4S,v1.s[1] +str q8, [x0, #880] +mul v14.4S, v14.4S,v1.s[0] +str q10, [x0, #816] +mla v27.4S, v15.4S, v31.s[0] +str q20, [x0, #752] +mla v14.4S, v16.4S, v31.s[0] +str q24, [x0, #688] +ldr q24, [x0, #960] +sqrdmulh v16.4S, v24.4S, v28.s[0] +str q19, [x0, #624] +mul v24.4S, v24.4S,v29.s[0] +str q2, [x0, #560] +ldr q2, [x0, #896] +sqrdmulh v19.4S, v2.4S, v28.s[0] +sub v20.4s, v13.4s, v23.4s +str q20, [x0, #496] +mul v2.4S, v2.4S,v29.s[0] +add v13.4s, v13.4s, v23.4s +ldr q23, [x0, #832] +sqrdmulh v20.4S, v23.4S, v28.s[0] +sub v15.4s, v6.4s, v18.4s +str q13, [x0, #432] +mul v23.4S, v23.4S,v29.s[0] +add v6.4s, v6.4s, v18.4s +ldr q18, [x0, #768] +sqrdmulh v13.4S, v18.4S, v28.s[0] +sub v10.4s, v22.4s, v27.4s +str q15, [x0, #368] +mul v18.4S, v18.4S,v29.s[0] +add v22.4s, v22.4s, v27.4s +ldr q27, [x0, #704] +mla v24.4S, v16.4S, v31.s[0] +sub v16.4s, v25.4s, v14.4s +str q6, [x0, #304] +sqrdmulh v6.4S, v27.4S, v28.s[0] +add v25.4s, v25.4s, v14.4s +ldr q14, [x0, #640] +mla v2.4S, v19.4S, v31.s[0] +str q10, [x0, #240] +sqrdmulh v10.4S, v14.4S, v28.s[0] +nop +ldr q19, [x0, #576] +mla v23.4S, v20.4S, v31.s[0] +str q22, [x0, #176] +sqrdmulh v22.4S, v19.4S, v28.s[0] +nop +ldr q20, [x0, #512] +mla v18.4S, v13.4S, v31.s[0] +nop +sqrdmulh v13.4S, v20.4S, v28.s[0] +str q16, [x0, #112] +ldr q16, [x0, #448] +ldr q15, [x0, #384] +mul v27.4S, v27.4S,v29.s[0] +sub v8.4s, v16.4s, v24.4s +str q25, [x0, #48] +mul v14.4S, 
v14.4S,v29.s[0] +add v16.4s, v16.4s, v24.4s +ldr q24, [x0, #320] +ldr q25, [x0, #256] +mla v27.4S, v6.4S, v31.s[0] +sub v6.4s, v15.4s, v2.4s +mla v14.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v2.4s +ldr q2, [x0, #192] +ldr q10, [x0, #128] +mul v19.4S, v19.4S,v29.s[0] +sub v3.4s, v24.4s, v23.4s +mul v20.4S, v20.4S,v29.s[0] +add v24.4s, v24.4s, v23.4s +ldr q23, [x0, #64] +ldr q21, [x0, #0] +mla v19.4S, v22.4S, v31.s[0] +sub v22.4s, v25.4s, v18.4s +mla v20.4S, v13.4S, v31.s[0] +add v25.4s, v25.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v28.s[2] +nop +mul v8.4S, v8.4S,v29.s[2] +nop +sqrdmulh v13.4S, v6.4S, v28.s[2] +sub v26.4s, v2.4s, v27.4s +mul v6.4S, v6.4S,v29.s[2] +add v2.4s, v2.4s, v27.4s +sqrdmulh v27.4S, v16.4S, v28.s[1] +sub v0.4s, v10.4s, v14.4s +mul v16.4S, v16.4S,v29.s[1] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v28.s[1] +sub v12.4s, v23.4s, v19.4s +mul v15.4S, v15.4S,v29.s[1] +add v23.4s, v23.4s, v19.4s +mla v8.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v20.4s +sqrdmulh v19.4S, v3.4S, v28.s[2] +add v21.4s, v21.4s, v20.4s +mla v6.4S, v13.4S, v31.s[0] +nop +sqrdmulh v13.4S, v22.4S, v28.s[2] +nop +mla v16.4S, v27.4S, v31.s[0] +nop +sqrdmulh v27.4S, v24.4S, v28.s[1] +nop +mla v15.4S, v14.4S, v31.s[0] +nop +sqrdmulh v14.4S, v25.4S, v28.s[1] +nop +mul v3.4S, v3.4S,v29.s[2] +sub v20.4s, v26.4s, v8.4s +mul v22.4S, v22.4S,v29.s[2] +add v26.4s, v26.4s, v8.4s +mla v3.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v6.4s +mla v22.4S, v13.4S, v31.s[0] +add v0.4s, v0.4s, v6.4s +mul v24.4S, v24.4S,v29.s[1] +sub v6.4s, v2.4s, v16.4s +mul v25.4S, v25.4S,v29.s[1] +add v2.4s, v2.4s, v16.4s +mla v24.4S, v27.4S, v31.s[0] +sub v27.4s, v10.4s, v15.4s +mla v25.4S, v14.4S, v31.s[0] +add v10.4s, v10.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v11.s[3] +nop +mul v20.4S, v20.4S,v17.s[3] +nop +sqrdmulh v14.4S, v26.4S, v11.s[2] +sub v16.4s, v12.4s, v3.4s +mul v26.4S, v26.4S,v17.s[2] +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v6.4S, v11.s[1] +sub v13.4s, v18.4s, v22.4s +mul v6.4S, v6.4S,v17.s[1] 
+add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v2.4S, v11.s[0] +sub v8.4s, v23.4s, v24.4s +mul v2.4S, v2.4S,v17.s[0] +add v23.4s, v23.4s, v24.4s +mla v20.4S, v15.4S, v31.s[0] +sub v15.4s, v21.4s, v25.4s +sqrdmulh v24.4S, v19.4S, v11.s[3] +add v21.4s, v21.4s, v25.4s +mla v26.4S, v14.4S, v31.s[0] +nop +sqrdmulh v14.4S, v0.4S, v11.s[2] +nop +mla v6.4S, v3.4S, v31.s[0] +nop +sqrdmulh v3.4S, v27.4S, v11.s[1] +nop +mla v2.4S, v22.4S, v31.s[0] +nop +sqrdmulh v22.4S, v10.4S, v11.s[0] +nop +mul v19.4S, v19.4S,v17.s[3] +sub v25.4s, v16.4s, v20.4s +mul v0.4S, v0.4S,v17.s[2] +add v16.4s, v16.4s, v20.4s +mla v19.4S, v24.4S, v31.s[0] +sub v24.4s, v12.4s, v26.4s +mla v0.4S, v14.4S, v31.s[0] +add v12.4s, v12.4s, v26.4s +mul v27.4S, v27.4S,v17.s[1] +sub v26.4s, v8.4s, v6.4s +mul v10.4S, v10.4S,v17.s[0] +add v8.4s, v8.4s, v6.4s +mla v27.4S, v3.4S, v31.s[0] +sub v3.4s, v23.4s, v2.4s +mla v10.4S, v22.4S, v31.s[0] +add v23.4s, v23.4s, v2.4s +sqrdmulh v2.4S, v25.4S, v9.s[3] +nop +mul v25.4S, v25.4S,v30.s[3] +nop +sqrdmulh v22.4S, v16.4S, v9.s[2] +sub v6.4s, v13.4s, v19.4s +mul v16.4S, v16.4S,v30.s[2] +add v13.4s, v13.4s, v19.4s +sqrdmulh v19.4S, v24.4S, v9.s[1] +sub v14.4s, v18.4s, v0.4s +mul v24.4S, v24.4S,v30.s[1] +add v18.4s, v18.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v9.s[0] +sub v20.4s, v15.4s, v27.4s +mul v12.4S, v12.4S,v30.s[0] +add v15.4s, v15.4s, v27.4s +mla v25.4S, v2.4S, v31.s[0] +sub v2.4s, v21.4s, v10.4s +sqrdmulh v27.4S, v26.4S, v7.s[3] +add v21.4s, v21.4s, v10.4s +mla v16.4S, v22.4S, v31.s[0] +sub v22.4s, v6.4s, v25.4s +sqrdmulh v10.4S, v8.4S, v7.s[2] +add v6.4s, v6.4s, v25.4s +mla v24.4S, v19.4S, v31.s[0] +sub v19.4s, v13.4s, v16.4s +sqrdmulh v25.4S, v3.4S, v7.s[1] +add v13.4s, v13.4s, v16.4s +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v14.4s, v24.4s +sqrdmulh v16.4S, v23.4S, v7.s[0] +add v14.4s, v14.4s, v24.4s +mul v26.4S, v26.4S,v1.s[3] +sub v24.4s, v18.4s, v12.4s +mul v8.4S, v8.4S,v1.s[2] +add v18.4s, v18.4s, v12.4s +mla v26.4S, v27.4S, v31.s[0] +str q22, [x0, #960] +mla 
v8.4S, v10.4S, v31.s[0] +str q6, [x0, #896] +mul v3.4S, v3.4S,v1.s[1] +str q19, [x0, #832] +mul v23.4S, v23.4S,v1.s[0] +str q13, [x0, #768] +mla v3.4S, v25.4S, v31.s[0] +str q0, [x0, #704] +mla v23.4S, v16.4S, v31.s[0] +str q14, [x0, #640] +ldr q14, [x0, #976] +sqrdmulh v16.4S, v14.4S, v28.s[0] +str q24, [x0, #576] +mul v14.4S, v14.4S,v29.s[0] +str q18, [x0, #512] +ldr q18, [x0, #912] +sqrdmulh v24.4S, v18.4S, v28.s[0] +sub v0.4s, v20.4s, v26.4s +str q0, [x0, #448] +mul v18.4S, v18.4S,v29.s[0] +add v20.4s, v20.4s, v26.4s +ldr q26, [x0, #848] +sqrdmulh v0.4S, v26.4S, v28.s[0] +sub v25.4s, v15.4s, v8.4s +str q20, [x0, #384] +mul v26.4S, v26.4S,v29.s[0] +add v15.4s, v15.4s, v8.4s +ldr q8, [x0, #784] +sqrdmulh v20.4S, v8.4S, v28.s[0] +sub v13.4s, v2.4s, v3.4s +str q25, [x0, #320] +mul v8.4S, v8.4S,v29.s[0] +add v2.4s, v2.4s, v3.4s +ldr q3, [x0, #720] +mla v14.4S, v16.4S, v31.s[0] +sub v16.4s, v21.4s, v23.4s +str q15, [x0, #256] +sqrdmulh v15.4S, v3.4S, v28.s[0] +add v21.4s, v21.4s, v23.4s +ldr q23, [x0, #656] +mla v18.4S, v24.4S, v31.s[0] +str q13, [x0, #192] +sqrdmulh v13.4S, v23.4S, v28.s[0] +nop +ldr q24, [x0, #592] +mla v26.4S, v0.4S, v31.s[0] +str q2, [x0, #128] +sqrdmulh v2.4S, v24.4S, v28.s[0] +nop +ldr q0, [x0, #528] +mla v8.4S, v20.4S, v31.s[0] +nop +sqrdmulh v20.4S, v0.4S, v28.s[0] +str q16, [x0, #64] +ldr q16, [x0, #464] +ldr q25, [x0, #400] +mul v3.4S, v3.4S,v29.s[0] +sub v19.4s, v16.4s, v14.4s +str q21, [x0, #0] +mul v23.4S, v23.4S,v29.s[0] +add v16.4s, v16.4s, v14.4s +ldr q14, [x0, #336] +ldr q21, [x0, #272] +mla v3.4S, v15.4S, v31.s[0] +sub v15.4s, v25.4s, v18.4s +mla v23.4S, v13.4S, v31.s[0] +add v25.4s, v25.4s, v18.4s +ldr q18, [x0, #208] +ldr q13, [x0, #144] +mul v24.4S, v24.4S,v29.s[0] +sub v6.4s, v14.4s, v26.4s +mul v0.4S, v0.4S,v29.s[0] +add v14.4s, v14.4s, v26.4s +ldr q26, [x0, #80] +ldr q10, [x0, #16] +mla v24.4S, v2.4S, v31.s[0] +sub v2.4s, v21.4s, v8.4s +mla v0.4S, v20.4S, v31.s[0] +add v21.4s, v21.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v28.s[2] 
+nop +mul v19.4S, v19.4S,v29.s[2] +nop +sqrdmulh v20.4S, v15.4S, v28.s[2] +sub v22.4s, v18.4s, v3.4s +mul v15.4S, v15.4S,v29.s[2] +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v16.4S, v28.s[1] +sub v27.4s, v13.4s, v23.4s +mul v16.4S, v16.4S,v29.s[1] +add v13.4s, v13.4s, v23.4s +sqrdmulh v23.4S, v25.4S, v28.s[1] +sub v12.4s, v26.4s, v24.4s +mul v25.4S, v25.4S,v29.s[1] +add v26.4s, v26.4s, v24.4s +mla v19.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v0.4s +sqrdmulh v24.4S, v6.4S, v28.s[2] +add v10.4s, v10.4s, v0.4s +mla v15.4S, v20.4S, v31.s[0] +nop +sqrdmulh v20.4S, v2.4S, v28.s[2] +nop +mla v16.4S, v3.4S, v31.s[0] +nop +sqrdmulh v3.4S, v14.4S, v28.s[1] +nop +mla v25.4S, v23.4S, v31.s[0] +nop +sqrdmulh v23.4S, v21.4S, v28.s[1] +nop +mul v6.4S, v6.4S,v29.s[2] +sub v0.4s, v22.4s, v19.4s +mul v2.4S, v2.4S,v29.s[2] +add v22.4s, v22.4s, v19.4s +mla v6.4S, v24.4S, v31.s[0] +sub v24.4s, v27.4s, v15.4s +mla v2.4S, v20.4S, v31.s[0] +add v27.4s, v27.4s, v15.4s +mul v14.4S, v14.4S,v29.s[1] +sub v15.4s, v18.4s, v16.4s +mul v21.4S, v21.4S,v29.s[1] +add v18.4s, v18.4s, v16.4s +mla v14.4S, v3.4S, v31.s[0] +sub v3.4s, v13.4s, v25.4s +mla v21.4S, v23.4S, v31.s[0] +add v13.4s, v13.4s, v25.4s +sqrdmulh v28.4S, v0.4S, v11.s[3] +nop +mul v0.4S, v0.4S,v17.s[3] +nop +sqrdmulh v29.4S, v22.4S, v11.s[2] +sub v25.4s, v12.4s, v6.4s +mul v22.4S, v22.4S,v17.s[2] +add v12.4s, v12.4s, v6.4s +sqrdmulh v6.4S, v15.4S, v11.s[1] +sub v23.4s, v8.4s, v2.4s +mul v15.4S, v15.4S,v17.s[1] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v18.4S, v11.s[0] +sub v16.4s, v26.4s, v14.4s +mul v18.4S, v18.4S,v17.s[0] +add v26.4s, v26.4s, v14.4s +mla v0.4S, v28.4S, v31.s[0] +sub v28.4s, v10.4s, v21.4s +sqrdmulh v14.4S, v24.4S, v11.s[3] +add v10.4s, v10.4s, v21.4s +mla v22.4S, v29.4S, v31.s[0] +nop +sqrdmulh v29.4S, v27.4S, v11.s[2] +nop +mla v15.4S, v6.4S, v31.s[0] +nop +sqrdmulh v6.4S, v3.4S, v11.s[1] +nop +mla v18.4S, v2.4S, v31.s[0] +nop +sqrdmulh v2.4S, v13.4S, v11.s[0] +nop +mul v24.4S, v24.4S,v17.s[3] +sub v21.4s, v25.4s, 
v0.4s +mul v27.4S, v27.4S,v17.s[2] +add v25.4s, v25.4s, v0.4s +mla v24.4S, v14.4S, v31.s[0] +sub v14.4s, v12.4s, v22.4s +mla v27.4S, v29.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s +mul v3.4S, v3.4S,v17.s[1] +sub v22.4s, v16.4s, v15.4s +mul v13.4S, v13.4S,v17.s[0] +add v16.4s, v16.4s, v15.4s +mla v3.4S, v6.4S, v31.s[0] +sub v6.4s, v26.4s, v18.4s +mla v13.4S, v2.4S, v31.s[0] +add v26.4s, v26.4s, v18.4s +sqrdmulh v11.4S, v21.4S, v9.s[3] +nop +mul v21.4S, v21.4S,v30.s[3] +nop +sqrdmulh v17.4S, v25.4S, v9.s[2] +sub v18.4s, v23.4s, v24.4s +mul v25.4S, v25.4S,v30.s[2] +add v23.4s, v23.4s, v24.4s +sqrdmulh v24.4S, v14.4S, v9.s[1] +sub v2.4s, v8.4s, v27.4s +mul v14.4S, v14.4S,v30.s[1] +add v8.4s, v8.4s, v27.4s +sqrdmulh v27.4S, v12.4S, v9.s[0] +sub v15.4s, v28.4s, v3.4s +mul v12.4S, v12.4S,v30.s[0] +add v28.4s, v28.4s, v3.4s +mla v21.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v13.4s +sqrdmulh v9.4S, v22.4S, v7.s[3] +add v10.4s, v10.4s, v13.4s +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v18.4s, v21.4s +sqrdmulh v13.4S, v16.4S, v7.s[2] +add v18.4s, v18.4s, v21.4s +mla v14.4S, v24.4S, v31.s[0] +sub v24.4s, v23.4s, v25.4s +sqrdmulh v21.4S, v6.4S, v7.s[1] +add v23.4s, v23.4s, v25.4s +mla v12.4S, v27.4S, v31.s[0] +sub v27.4s, v2.4s, v14.4s +sqrdmulh v25.4S, v26.4S, v7.s[0] +add v2.4s, v2.4s, v14.4s +mul v22.4S, v22.4S,v1.s[3] +sub v14.4s, v8.4s, v12.4s +mul v16.4S, v16.4S,v1.s[2] +add v8.4s, v8.4s, v12.4s +mla v22.4S, v9.4S, v31.s[0] +str q17, [x0, #976] +mla v16.4S, v13.4S, v31.s[0] +str q18, [x0, #912] +mul v6.4S, v6.4S,v1.s[1] +str q24, [x0, #848] +mul v26.4S, v26.4S,v1.s[0] +str q23, [x0, #784] +mla v6.4S, v21.4S, v31.s[0] +str q27, [x0, #720] +mla v26.4S, v25.4S, v31.s[0] +str q2, [x0, #656] +str q14, [x0, #592] +str q8, [x0, #528] +sub v8.4s, v15.4s, v22.4s +str q8, [x0, #464] +add v15.4s, v15.4s, v22.4s +sub v22.4s, v28.4s, v16.4s +str q15, [x0, #400] +add v28.4s, v28.4s, v16.4s +sub v16.4s, v11.4s, v6.4s +str q22, [x0, #336] +add v11.4s, v11.4s, v6.4s +sub v6.4s, v10.4s, 
v26.4s +str q28, [x0, #272] +add v10.4s, v10.4s, v26.4s +str q16, [x0, #208] +str q11, [x0, #144] +str q6, [x0, #80] +str q10, [x0, #16] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q19, [x17, #+160] +ldr q20, [x17, #+176] +ldr q0, [x17, #+192] +ldr q29, [x17, #+208] +ldr q3, [x17, #+224] +ldr q30, [x17, #+240] +ldr q12, [x0, #32] +ldr q9, [x0, #48] +ldr q17, [x0, #0] +ldr q13, [x0, #16] +sqrdmulh v18.4S, v12.4S, v5.s[0] +mul v12.4S, v12.4S,v4.s[0] +mla v12.4S, v18.4S, v31.s[0] +sub v18.4s, v17.4s, v12.4s +add v17.4s, v17.4s, v12.4s +sqrdmulh v12.4S, v9.4S, v5.s[0] +mul v9.4S, v9.4S,v4.s[0] +mla v9.4S, v12.4S, v31.s[0] +sub v12.4s, v13.4s, v9.4s +add v13.4s, v13.4s, v9.4s +sqrdmulh v9.4S, v13.4S, v5.s[1] +mul v13.4S, v13.4S,v4.s[1] +mla v13.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +sqrdmulh v13.4S, v12.4S, v5.s[2] +mul v12.4S, v12.4S,v4.s[2] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v18.4s, v12.4s +add v18.4s, v18.4s, v12.4s +trn1 v12.4S, v17.4S, v9.4S +trn2 v24.4S, v17.4S, v9.4S +trn1 v23.4S, v18.4S, v13.4S +trn2 v21.4S, v18.4S, v13.4S +trn2 v18.2D, v12.2D, v23.2D +trn2 v13.2D, v24.2D, v21.2D +trn1 v17.2D, v12.2D, v23.2D +trn1 v9.2D, v24.2D, v21.2D +sqrdmulh v21.4S, v18.4S, v20.4S +mul v18.4S, v18.4S,v19.4S +mla v18.4S, v21.4S, v31.s[0] +sub v21.4s, v17.4s, v18.4s +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v13.4S, v20.4S +mul v13.4S, v13.4S,v19.4S +mla v13.4S, v18.4S, v31.s[0] +sub v18.4s, v9.4s, v13.4s +add v9.4s, v9.4s, v13.4s +sqrdmulh v13.4S, v9.4S, v29.4S +mul v9.4S, v9.4S,v0.4S +mla v9.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v9.4s +add v17.4s, v17.4s, v9.4s +sqrdmulh v9.4S, v18.4S, v30.4S +mul v18.4S, v18.4S,v3.4S +mla v18.4S, v9.4S, v31.s[0] +sub v9.4s, v21.4s, v18.4s +add v21.4s, v21.4s, v18.4s +str q17, [x0, #0] +str q13, [x0, #16] +str q21, [x0, #32] +str q9, [x0, #48] +ldr q9, [x17, #+256] +ldr q21, [x17, #+272] +ldr q13, [x17, #+288] +ldr q17, [x17, #+304] +ldr q18, [x17, #+320] +ldr q24, [x17, #+336] +ldr 
q23, [x17, #+352] +ldr q12, [x17, #+368] +ldr q30, [x0, #96] +ldr q3, [x0, #112] +ldr q29, [x0, #64] +ldr q0, [x0, #80] +sqrdmulh v20.4S, v30.4S, v21.s[0] +mul v30.4S, v30.4S,v9.s[0] +mla v30.4S, v20.4S, v31.s[0] +sub v20.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +sqrdmulh v30.4S, v3.4S, v21.s[0] +mul v3.4S, v3.4S,v9.s[0] +mla v3.4S, v30.4S, v31.s[0] +sub v30.4s, v0.4s, v3.4s +add v0.4s, v0.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v21.s[1] +mul v0.4S, v0.4S,v9.s[1] +mla v0.4S, v3.4S, v31.s[0] +sub v3.4s, v29.4s, v0.4s +add v29.4s, v29.4s, v0.4s +sqrdmulh v0.4S, v30.4S, v21.s[2] +mul v30.4S, v30.4S,v9.s[2] +mla v30.4S, v0.4S, v31.s[0] +sub v0.4s, v20.4s, v30.4s +add v20.4s, v20.4s, v30.4s +trn1 v30.4S, v29.4S, v3.4S +trn2 v19.4S, v29.4S, v3.4S +trn1 v5.4S, v20.4S, v0.4S +trn2 v4.4S, v20.4S, v0.4S +trn2 v20.2D, v30.2D, v5.2D +trn2 v0.2D, v19.2D, v4.2D +trn1 v29.2D, v30.2D, v5.2D +trn1 v3.2D, v19.2D, v4.2D +sqrdmulh v4.4S, v20.4S, v17.4S +mul v20.4S, v20.4S,v13.4S +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v29.4s, v20.4s +add v29.4s, v29.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v17.4S +mul v0.4S, v0.4S,v13.4S +mla v0.4S, v20.4S, v31.s[0] +sub v20.4s, v3.4s, v0.4s +add v3.4s, v3.4s, v0.4s +sqrdmulh v0.4S, v3.4S, v24.4S +mul v3.4S, v3.4S,v18.4S +mla v3.4S, v0.4S, v31.s[0] +sub v0.4s, v29.4s, v3.4s +add v29.4s, v29.4s, v3.4s +sqrdmulh v3.4S, v20.4S, v12.4S +mul v20.4S, v20.4S,v23.4S +mla v20.4S, v3.4S, v31.s[0] +sub v3.4s, v4.4s, v20.4s +add v4.4s, v4.4s, v20.4s +str q29, [x0, #64] +str q0, [x0, #80] +str q4, [x0, #96] +str q3, [x0, #112] +ldr q3, [x17, #+384] +ldr q4, [x17, #+400] +ldr q0, [x17, #+416] +ldr q29, [x17, #+432] +ldr q20, [x17, #+448] +ldr q19, [x17, #+464] +ldr q5, [x17, #+480] +ldr q30, [x17, #+496] +ldr q12, [x0, #160] +ldr q23, [x0, #176] +ldr q24, [x0, #128] +ldr q18, [x0, #144] +sqrdmulh v17.4S, v12.4S, v4.s[0] +mul v12.4S, v12.4S,v3.s[0] +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v24.4s, v12.4s +add v24.4s, v24.4s, v12.4s +sqrdmulh v12.4S, v23.4S, v4.s[0] 
+mul v23.4S, v23.4S,v3.s[0] +mla v23.4S, v12.4S, v31.s[0] +sub v12.4s, v18.4s, v23.4s +add v18.4s, v18.4s, v23.4s +sqrdmulh v23.4S, v18.4S, v4.s[1] +mul v18.4S, v18.4S,v3.s[1] +mla v18.4S, v23.4S, v31.s[0] +sub v23.4s, v24.4s, v18.4s +add v24.4s, v24.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v4.s[2] +mul v12.4S, v12.4S,v3.s[2] +mla v12.4S, v18.4S, v31.s[0] +sub v18.4s, v17.4s, v12.4s +add v17.4s, v17.4s, v12.4s +trn1 v12.4S, v24.4S, v23.4S +trn2 v13.4S, v24.4S, v23.4S +trn1 v21.4S, v17.4S, v18.4S +trn2 v9.4S, v17.4S, v18.4S +trn2 v17.2D, v12.2D, v21.2D +trn2 v18.2D, v13.2D, v9.2D +trn1 v24.2D, v12.2D, v21.2D +trn1 v23.2D, v13.2D, v9.2D +sqrdmulh v9.4S, v17.4S, v29.4S +mul v17.4S, v17.4S,v0.4S +mla v17.4S, v9.4S, v31.s[0] +sub v9.4s, v24.4s, v17.4s +add v24.4s, v24.4s, v17.4s +sqrdmulh v17.4S, v18.4S, v29.4S +mul v18.4S, v18.4S,v0.4S +mla v18.4S, v17.4S, v31.s[0] +sub v17.4s, v23.4s, v18.4s +add v23.4s, v23.4s, v18.4s +sqrdmulh v18.4S, v23.4S, v19.4S +mul v23.4S, v23.4S,v20.4S +mla v23.4S, v18.4S, v31.s[0] +sub v18.4s, v24.4s, v23.4s +add v24.4s, v24.4s, v23.4s +sqrdmulh v23.4S, v17.4S, v30.4S +mul v17.4S, v17.4S,v5.4S +mla v17.4S, v23.4S, v31.s[0] +sub v23.4s, v9.4s, v17.4s +add v9.4s, v9.4s, v17.4s +str q24, [x0, #128] +str q18, [x0, #144] +str q9, [x0, #160] +str q23, [x0, #176] +ldr q23, [x17, #+512] +ldr q9, [x17, #+528] +ldr q18, [x17, #+544] +ldr q24, [x17, #+560] +ldr q17, [x17, #+576] +ldr q13, [x17, #+592] +ldr q21, [x17, #+608] +ldr q12, [x17, #+624] +ldr q30, [x0, #224] +ldr q5, [x0, #240] +ldr q19, [x0, #192] +ldr q20, [x0, #208] +sqrdmulh v29.4S, v30.4S, v9.s[0] +mul v30.4S, v30.4S,v23.s[0] +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v19.4s, v30.4s +add v19.4s, v19.4s, v30.4s +sqrdmulh v30.4S, v5.4S, v9.s[0] +mul v5.4S, v5.4S,v23.s[0] +mla v5.4S, v30.4S, v31.s[0] +sub v30.4s, v20.4s, v5.4s +add v20.4s, v20.4s, v5.4s +sqrdmulh v5.4S, v20.4S, v9.s[1] +mul v20.4S, v20.4S,v23.s[1] +mla v20.4S, v5.4S, v31.s[0] +sub v5.4s, v19.4s, v20.4s +add v19.4s, v19.4s, 
v20.4s +sqrdmulh v20.4S, v30.4S, v9.s[2] +mul v30.4S, v30.4S,v23.s[2] +mla v30.4S, v20.4S, v31.s[0] +sub v20.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +trn1 v30.4S, v19.4S, v5.4S +trn2 v0.4S, v19.4S, v5.4S +trn1 v4.4S, v29.4S, v20.4S +trn2 v3.4S, v29.4S, v20.4S +trn2 v29.2D, v30.2D, v4.2D +trn2 v20.2D, v0.2D, v3.2D +trn1 v19.2D, v30.2D, v4.2D +trn1 v5.2D, v0.2D, v3.2D +sqrdmulh v3.4S, v29.4S, v24.4S +mul v29.4S, v29.4S,v18.4S +mla v29.4S, v3.4S, v31.s[0] +sub v3.4s, v19.4s, v29.4s +add v19.4s, v19.4s, v29.4s +sqrdmulh v29.4S, v20.4S, v24.4S +mul v20.4S, v20.4S,v18.4S +mla v20.4S, v29.4S, v31.s[0] +sub v29.4s, v5.4s, v20.4s +add v5.4s, v5.4s, v20.4s +sqrdmulh v20.4S, v5.4S, v13.4S +mul v5.4S, v5.4S,v17.4S +mla v5.4S, v20.4S, v31.s[0] +sub v20.4s, v19.4s, v5.4s +add v19.4s, v19.4s, v5.4s +sqrdmulh v5.4S, v29.4S, v12.4S +mul v29.4S, v29.4S,v21.4S +mla v29.4S, v5.4S, v31.s[0] +sub v5.4s, v3.4s, v29.4s +add v3.4s, v3.4s, v29.4s +str q19, [x0, #192] +str q20, [x0, #208] +str q3, [x0, #224] +str q5, [x0, #240] +ldr q5, [x17, #+640] +ldr q3, [x17, #+656] +ldr q20, [x17, #+672] +ldr q19, [x17, #+688] +ldr q29, [x17, #+704] +ldr q0, [x17, #+720] +ldr q4, [x17, #+736] +ldr q30, [x17, #+752] +ldr q12, [x0, #288] +ldr q21, [x0, #304] +ldr q13, [x0, #256] +ldr q17, [x0, #272] +sqrdmulh v24.4S, v12.4S, v3.s[0] +mul v12.4S, v12.4S,v5.s[0] +mla v12.4S, v24.4S, v31.s[0] +sub v24.4s, v13.4s, v12.4s +add v13.4s, v13.4s, v12.4s +sqrdmulh v12.4S, v21.4S, v3.s[0] +mul v21.4S, v21.4S,v5.s[0] +mla v21.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v3.s[1] +mul v17.4S, v17.4S,v5.s[1] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v13.4s, v17.4s +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v12.4S, v3.s[2] +mul v12.4S, v12.4S,v5.s[2] +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v24.4s, v12.4s +add v24.4s, v24.4s, v12.4s +trn1 v12.4S, v13.4S, v21.4S +trn2 v18.4S, v13.4S, v21.4S +trn1 v9.4S, v24.4S, v17.4S +trn2 v23.4S, v24.4S, v17.4S 
+trn2 v24.2D, v12.2D, v9.2D +trn2 v17.2D, v18.2D, v23.2D +trn1 v13.2D, v12.2D, v9.2D +trn1 v21.2D, v18.2D, v23.2D +sqrdmulh v23.4S, v24.4S, v19.4S +mul v24.4S, v24.4S,v20.4S +mla v24.4S, v23.4S, v31.s[0] +sub v23.4s, v13.4s, v24.4s +add v13.4s, v13.4s, v24.4s +sqrdmulh v24.4S, v17.4S, v19.4S +mul v17.4S, v17.4S,v20.4S +mla v17.4S, v24.4S, v31.s[0] +sub v24.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v0.4S +mul v21.4S, v21.4S,v29.4S +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v24.4S, v30.4S +mul v24.4S, v24.4S,v4.4S +mla v24.4S, v21.4S, v31.s[0] +sub v21.4s, v23.4s, v24.4s +add v23.4s, v23.4s, v24.4s +str q13, [x0, #256] +str q17, [x0, #272] +str q23, [x0, #288] +str q21, [x0, #304] +ldr q21, [x17, #+768] +ldr q23, [x17, #+784] +ldr q17, [x17, #+800] +ldr q13, [x17, #+816] +ldr q24, [x17, #+832] +ldr q18, [x17, #+848] +ldr q9, [x17, #+864] +ldr q12, [x17, #+880] +ldr q30, [x0, #352] +ldr q4, [x0, #368] +ldr q0, [x0, #320] +ldr q29, [x0, #336] +sqrdmulh v19.4S, v30.4S, v23.s[0] +mul v30.4S, v30.4S,v21.s[0] +mla v30.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v30.4s +add v0.4s, v0.4s, v30.4s +sqrdmulh v30.4S, v4.4S, v23.s[0] +mul v4.4S, v4.4S,v21.s[0] +mla v4.4S, v30.4S, v31.s[0] +sub v30.4s, v29.4s, v4.4s +add v29.4s, v29.4s, v4.4s +sqrdmulh v4.4S, v29.4S, v23.s[1] +mul v29.4S, v29.4S,v21.s[1] +mla v29.4S, v4.4S, v31.s[0] +sub v4.4s, v0.4s, v29.4s +add v0.4s, v0.4s, v29.4s +sqrdmulh v29.4S, v30.4S, v23.s[2] +mul v30.4S, v30.4S,v21.s[2] +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v19.4s, v30.4s +add v19.4s, v19.4s, v30.4s +trn1 v30.4S, v0.4S, v4.4S +trn2 v20.4S, v0.4S, v4.4S +trn1 v3.4S, v19.4S, v29.4S +trn2 v5.4S, v19.4S, v29.4S +trn2 v19.2D, v30.2D, v3.2D +trn2 v29.2D, v20.2D, v5.2D +trn1 v0.2D, v30.2D, v3.2D +trn1 v4.2D, v20.2D, v5.2D +sqrdmulh v5.4S, v19.4S, v13.4S +mul v19.4S, v19.4S,v17.4S +mla v19.4S, v5.4S, v31.s[0] +sub v5.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh 
v19.4S, v29.4S, v13.4S +mul v29.4S, v29.4S,v17.4S +mla v29.4S, v19.4S, v31.s[0] +sub v19.4s, v4.4s, v29.4s +add v4.4s, v4.4s, v29.4s +sqrdmulh v29.4S, v4.4S, v18.4S +mul v4.4S, v4.4S,v24.4S +mla v4.4S, v29.4S, v31.s[0] +sub v29.4s, v0.4s, v4.4s +add v0.4s, v0.4s, v4.4s +sqrdmulh v4.4S, v19.4S, v12.4S +mul v19.4S, v19.4S,v9.4S +mla v19.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v19.4s +add v5.4s, v5.4s, v19.4s +str q0, [x0, #320] +str q29, [x0, #336] +str q5, [x0, #352] +str q4, [x0, #368] +ldr q4, [x17, #+896] +ldr q5, [x17, #+912] +ldr q29, [x17, #+928] +ldr q0, [x17, #+944] +ldr q19, [x17, #+960] +ldr q20, [x17, #+976] +ldr q3, [x17, #+992] +ldr q30, [x17, #+1008] +ldr q12, [x0, #416] +ldr q9, [x0, #432] +ldr q18, [x0, #384] +ldr q24, [x0, #400] +sqrdmulh v13.4S, v12.4S, v5.s[0] +mul v12.4S, v12.4S,v4.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v18.4s, v12.4s +add v18.4s, v18.4s, v12.4s +sqrdmulh v12.4S, v9.4S, v5.s[0] +mul v9.4S, v9.4S,v4.s[0] +mla v9.4S, v12.4S, v31.s[0] +sub v12.4s, v24.4s, v9.4s +add v24.4s, v24.4s, v9.4s +sqrdmulh v9.4S, v24.4S, v5.s[1] +mul v24.4S, v24.4S,v4.s[1] +mla v24.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v24.4s +add v18.4s, v18.4s, v24.4s +sqrdmulh v24.4S, v12.4S, v5.s[2] +mul v12.4S, v12.4S,v4.s[2] +mla v12.4S, v24.4S, v31.s[0] +sub v24.4s, v13.4s, v12.4s +add v13.4s, v13.4s, v12.4s +trn1 v12.4S, v18.4S, v9.4S +trn2 v17.4S, v18.4S, v9.4S +trn1 v23.4S, v13.4S, v24.4S +trn2 v21.4S, v13.4S, v24.4S +trn2 v13.2D, v12.2D, v23.2D +trn2 v24.2D, v17.2D, v21.2D +trn1 v18.2D, v12.2D, v23.2D +trn1 v9.2D, v17.2D, v21.2D +sqrdmulh v21.4S, v13.4S, v0.4S +mul v13.4S, v13.4S,v29.4S +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v18.4s, v13.4s +add v18.4s, v18.4s, v13.4s +sqrdmulh v13.4S, v24.4S, v0.4S +mul v24.4S, v24.4S,v29.4S +mla v24.4S, v13.4S, v31.s[0] +sub v13.4s, v9.4s, v24.4s +add v9.4s, v9.4s, v24.4s +sqrdmulh v24.4S, v9.4S, v20.4S +mul v9.4S, v9.4S,v19.4S +mla v9.4S, v24.4S, v31.s[0] +sub v24.4s, v18.4s, v9.4s +add v18.4s, v18.4s, v9.4s 
+sqrdmulh v9.4S, v13.4S, v30.4S +mul v13.4S, v13.4S,v3.4S +mla v13.4S, v9.4S, v31.s[0] +sub v9.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +str q18, [x0, #384] +str q24, [x0, #400] +str q21, [x0, #416] +str q9, [x0, #432] +ldr q9, [x17, #+1024] +ldr q21, [x17, #+1040] +ldr q24, [x17, #+1056] +ldr q18, [x17, #+1072] +ldr q13, [x17, #+1088] +ldr q17, [x17, #+1104] +ldr q23, [x17, #+1120] +ldr q12, [x17, #+1136] +ldr q30, [x0, #480] +ldr q3, [x0, #496] +ldr q20, [x0, #448] +ldr q19, [x0, #464] +sqrdmulh v0.4S, v30.4S, v21.s[0] +mul v30.4S, v30.4S,v9.s[0] +mla v30.4S, v0.4S, v31.s[0] +sub v0.4s, v20.4s, v30.4s +add v20.4s, v20.4s, v30.4s +sqrdmulh v30.4S, v3.4S, v21.s[0] +mul v3.4S, v3.4S,v9.s[0] +mla v3.4S, v30.4S, v31.s[0] +sub v30.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sqrdmulh v3.4S, v19.4S, v21.s[1] +mul v19.4S, v19.4S,v9.s[1] +mla v19.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +sqrdmulh v19.4S, v30.4S, v21.s[2] +mul v30.4S, v30.4S,v9.s[2] +mla v30.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v30.4s +add v0.4s, v0.4s, v30.4s +trn1 v30.4S, v20.4S, v3.4S +trn2 v29.4S, v20.4S, v3.4S +trn1 v5.4S, v0.4S, v19.4S +trn2 v4.4S, v0.4S, v19.4S +trn2 v0.2D, v30.2D, v5.2D +trn2 v19.2D, v29.2D, v4.2D +trn1 v20.2D, v30.2D, v5.2D +trn1 v3.2D, v29.2D, v4.2D +sqrdmulh v4.4S, v0.4S, v18.4S +mul v0.4S, v0.4S,v24.4S +mla v0.4S, v4.4S, v31.s[0] +sub v4.4s, v20.4s, v0.4s +add v20.4s, v20.4s, v0.4s +sqrdmulh v0.4S, v19.4S, v18.4S +mul v19.4S, v19.4S,v24.4S +mla v19.4S, v0.4S, v31.s[0] +sub v0.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v3.4S, v17.4S +mul v3.4S, v3.4S,v13.4S +mla v3.4S, v19.4S, v31.s[0] +sub v19.4s, v20.4s, v3.4s +add v20.4s, v20.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v12.4S +mul v0.4S, v0.4S,v23.4S +mla v0.4S, v3.4S, v31.s[0] +sub v3.4s, v4.4s, v0.4s +add v4.4s, v4.4s, v0.4s +str q20, [x0, #448] +str q19, [x0, #464] +str q4, [x0, #480] +str q3, [x0, #496] +ldr q3, [x17, #+1152] +ldr q4, [x17, #+1168] +ldr q19, [x17, 
#+1184] +ldr q20, [x17, #+1200] +ldr q0, [x17, #+1216] +ldr q29, [x17, #+1232] +ldr q5, [x17, #+1248] +ldr q30, [x17, #+1264] +ldr q12, [x0, #544] +ldr q23, [x0, #560] +ldr q17, [x0, #512] +ldr q13, [x0, #528] +sqrdmulh v18.4S, v12.4S, v4.s[0] +mul v12.4S, v12.4S,v3.s[0] +mla v12.4S, v18.4S, v31.s[0] +sub v18.4s, v17.4s, v12.4s +add v17.4s, v17.4s, v12.4s +sqrdmulh v12.4S, v23.4S, v4.s[0] +mul v23.4S, v23.4S,v3.s[0] +mla v23.4S, v12.4S, v31.s[0] +sub v12.4s, v13.4s, v23.4s +add v13.4s, v13.4s, v23.4s +sqrdmulh v23.4S, v13.4S, v4.s[1] +mul v13.4S, v13.4S,v3.s[1] +mla v13.4S, v23.4S, v31.s[0] +sub v23.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +sqrdmulh v13.4S, v12.4S, v4.s[2] +mul v12.4S, v12.4S,v3.s[2] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v18.4s, v12.4s +add v18.4s, v18.4s, v12.4s +trn1 v12.4S, v17.4S, v23.4S +trn2 v24.4S, v17.4S, v23.4S +trn1 v21.4S, v18.4S, v13.4S +trn2 v9.4S, v18.4S, v13.4S +trn2 v18.2D, v12.2D, v21.2D +trn2 v13.2D, v24.2D, v9.2D +trn1 v17.2D, v12.2D, v21.2D +trn1 v23.2D, v24.2D, v9.2D +sqrdmulh v9.4S, v18.4S, v20.4S +mul v18.4S, v18.4S,v19.4S +mla v18.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v18.4s +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v13.4S, v20.4S +mul v13.4S, v13.4S,v19.4S +mla v13.4S, v18.4S, v31.s[0] +sub v18.4s, v23.4s, v13.4s +add v23.4s, v23.4s, v13.4s +sqrdmulh v13.4S, v23.4S, v29.4S +mul v23.4S, v23.4S,v0.4S +mla v23.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v23.4s +add v17.4s, v17.4s, v23.4s +sqrdmulh v23.4S, v18.4S, v30.4S +mul v18.4S, v18.4S,v5.4S +mla v18.4S, v23.4S, v31.s[0] +sub v23.4s, v9.4s, v18.4s +add v9.4s, v9.4s, v18.4s +str q17, [x0, #512] +str q13, [x0, #528] +str q9, [x0, #544] +str q23, [x0, #560] +ldr q23, [x17, #+1280] +ldr q9, [x17, #+1296] +ldr q13, [x17, #+1312] +ldr q17, [x17, #+1328] +ldr q18, [x17, #+1344] +ldr q24, [x17, #+1360] +ldr q21, [x17, #+1376] +ldr q12, [x17, #+1392] +ldr q30, [x0, #608] +ldr q5, [x0, #624] +ldr q29, [x0, #576] +ldr q0, [x0, #592] +sqrdmulh v20.4S, v30.4S, v9.s[0] 
+mul v30.4S, v30.4S,v23.s[0] +mla v30.4S, v20.4S, v31.s[0] +sub v20.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +sqrdmulh v30.4S, v5.4S, v9.s[0] +mul v5.4S, v5.4S,v23.s[0] +mla v5.4S, v30.4S, v31.s[0] +sub v30.4s, v0.4s, v5.4s +add v0.4s, v0.4s, v5.4s +sqrdmulh v5.4S, v0.4S, v9.s[1] +mul v0.4S, v0.4S,v23.s[1] +mla v0.4S, v5.4S, v31.s[0] +sub v5.4s, v29.4s, v0.4s +add v29.4s, v29.4s, v0.4s +sqrdmulh v0.4S, v30.4S, v9.s[2] +mul v30.4S, v30.4S,v23.s[2] +mla v30.4S, v0.4S, v31.s[0] +sub v0.4s, v20.4s, v30.4s +add v20.4s, v20.4s, v30.4s +trn1 v30.4S, v29.4S, v5.4S +trn2 v19.4S, v29.4S, v5.4S +trn1 v4.4S, v20.4S, v0.4S +trn2 v3.4S, v20.4S, v0.4S +trn2 v20.2D, v30.2D, v4.2D +trn2 v0.2D, v19.2D, v3.2D +trn1 v29.2D, v30.2D, v4.2D +trn1 v5.2D, v19.2D, v3.2D +sqrdmulh v3.4S, v20.4S, v17.4S +mul v20.4S, v20.4S,v13.4S +mla v20.4S, v3.4S, v31.s[0] +sub v3.4s, v29.4s, v20.4s +add v29.4s, v29.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v17.4S +mul v0.4S, v0.4S,v13.4S +mla v0.4S, v20.4S, v31.s[0] +sub v20.4s, v5.4s, v0.4s +add v5.4s, v5.4s, v0.4s +sqrdmulh v0.4S, v5.4S, v24.4S +mul v5.4S, v5.4S,v18.4S +mla v5.4S, v0.4S, v31.s[0] +sub v0.4s, v29.4s, v5.4s +add v29.4s, v29.4s, v5.4s +sqrdmulh v5.4S, v20.4S, v12.4S +mul v20.4S, v20.4S,v21.4S +mla v20.4S, v5.4S, v31.s[0] +sub v5.4s, v3.4s, v20.4s +add v3.4s, v3.4s, v20.4s +str q29, [x0, #576] +str q0, [x0, #592] +str q3, [x0, #608] +str q5, [x0, #624] +ldr q5, [x17, #+1408] +ldr q3, [x17, #+1424] +ldr q0, [x17, #+1440] +ldr q29, [x17, #+1456] +ldr q20, [x17, #+1472] +ldr q19, [x17, #+1488] +ldr q4, [x17, #+1504] +ldr q30, [x17, #+1520] +ldr q12, [x0, #672] +ldr q21, [x0, #688] +ldr q24, [x0, #640] +ldr q18, [x0, #656] +sqrdmulh v17.4S, v12.4S, v3.s[0] +mul v12.4S, v12.4S,v5.s[0] +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v24.4s, v12.4s +add v24.4s, v24.4s, v12.4s +sqrdmulh v12.4S, v21.4S, v3.s[0] +mul v21.4S, v21.4S,v5.s[0] +mla v21.4S, v12.4S, v31.s[0] +sub v12.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v18.4S, 
v3.s[1] +mul v18.4S, v18.4S,v5.s[1] +mla v18.4S, v21.4S, v31.s[0] +sub v21.4s, v24.4s, v18.4s +add v24.4s, v24.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v3.s[2] +mul v12.4S, v12.4S,v5.s[2] +mla v12.4S, v18.4S, v31.s[0] +sub v18.4s, v17.4s, v12.4s +add v17.4s, v17.4s, v12.4s +trn1 v12.4S, v24.4S, v21.4S +trn2 v13.4S, v24.4S, v21.4S +trn1 v9.4S, v17.4S, v18.4S +trn2 v23.4S, v17.4S, v18.4S +trn2 v17.2D, v12.2D, v9.2D +trn2 v18.2D, v13.2D, v23.2D +trn1 v24.2D, v12.2D, v9.2D +trn1 v21.2D, v13.2D, v23.2D +sqrdmulh v23.4S, v17.4S, v29.4S +mul v17.4S, v17.4S,v0.4S +mla v17.4S, v23.4S, v31.s[0] +sub v23.4s, v24.4s, v17.4s +add v24.4s, v24.4s, v17.4s +sqrdmulh v17.4S, v18.4S, v29.4S +mul v18.4S, v18.4S,v0.4S +mla v18.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v18.4s +add v21.4s, v21.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v19.4S +mul v21.4S, v21.4S,v20.4S +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v24.4s, v21.4s +add v24.4s, v24.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v30.4S +mul v17.4S, v17.4S,v4.4S +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v23.4s, v17.4s +add v23.4s, v23.4s, v17.4s +str q24, [x0, #640] +str q18, [x0, #656] +str q23, [x0, #672] +str q21, [x0, #688] +ldr q21, [x17, #+1536] +ldr q23, [x17, #+1552] +ldr q18, [x17, #+1568] +ldr q24, [x17, #+1584] +ldr q17, [x17, #+1600] +ldr q13, [x17, #+1616] +ldr q9, [x17, #+1632] +ldr q12, [x17, #+1648] +ldr q30, [x0, #736] +ldr q4, [x0, #752] +ldr q19, [x0, #704] +ldr q20, [x0, #720] +sqrdmulh v29.4S, v30.4S, v23.s[0] +mul v30.4S, v30.4S,v21.s[0] +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v19.4s, v30.4s +add v19.4s, v19.4s, v30.4s +sqrdmulh v30.4S, v4.4S, v23.s[0] +mul v4.4S, v4.4S,v21.s[0] +mla v4.4S, v30.4S, v31.s[0] +sub v30.4s, v20.4s, v4.4s +add v20.4s, v20.4s, v4.4s +sqrdmulh v4.4S, v20.4S, v23.s[1] +mul v20.4S, v20.4S,v21.s[1] +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +sqrdmulh v20.4S, v30.4S, v23.s[2] +mul v30.4S, v30.4S,v21.s[2] +mla v30.4S, v20.4S, v31.s[0] +sub v20.4s, v29.4s, 
v30.4s +add v29.4s, v29.4s, v30.4s +trn1 v30.4S, v19.4S, v4.4S +trn2 v0.4S, v19.4S, v4.4S +trn1 v3.4S, v29.4S, v20.4S +trn2 v5.4S, v29.4S, v20.4S +trn2 v29.2D, v30.2D, v3.2D +trn2 v20.2D, v0.2D, v5.2D +trn1 v19.2D, v30.2D, v3.2D +trn1 v4.2D, v0.2D, v5.2D +sqrdmulh v5.4S, v29.4S, v24.4S +mul v29.4S, v29.4S,v18.4S +mla v29.4S, v5.4S, v31.s[0] +sub v5.4s, v19.4s, v29.4s +add v19.4s, v19.4s, v29.4s +sqrdmulh v29.4S, v20.4S, v24.4S +mul v20.4S, v20.4S,v18.4S +mla v20.4S, v29.4S, v31.s[0] +sub v29.4s, v4.4s, v20.4s +add v4.4s, v4.4s, v20.4s +sqrdmulh v20.4S, v4.4S, v13.4S +mul v4.4S, v4.4S,v17.4S +mla v4.4S, v20.4S, v31.s[0] +sub v20.4s, v19.4s, v4.4s +add v19.4s, v19.4s, v4.4s +sqrdmulh v4.4S, v29.4S, v12.4S +mul v29.4S, v29.4S,v9.4S +mla v29.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v29.4s +add v5.4s, v5.4s, v29.4s +str q19, [x0, #704] +str q20, [x0, #720] +str q5, [x0, #736] +str q4, [x0, #752] +ldr q4, [x17, #+1664] +ldr q5, [x17, #+1680] +ldr q20, [x17, #+1696] +ldr q19, [x17, #+1712] +ldr q29, [x17, #+1728] +ldr q0, [x17, #+1744] +ldr q3, [x17, #+1760] +ldr q30, [x17, #+1776] +ldr q12, [x0, #800] +ldr q9, [x0, #816] +ldr q13, [x0, #768] +ldr q17, [x0, #784] +sqrdmulh v24.4S, v12.4S, v5.s[0] +mul v12.4S, v12.4S,v4.s[0] +mla v12.4S, v24.4S, v31.s[0] +sub v24.4s, v13.4s, v12.4s +add v13.4s, v13.4s, v12.4s +sqrdmulh v12.4S, v9.4S, v5.s[0] +mul v9.4S, v9.4S,v4.s[0] +mla v9.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v9.4s +add v17.4s, v17.4s, v9.4s +sqrdmulh v9.4S, v17.4S, v5.s[1] +mul v17.4S, v17.4S,v4.s[1] +mla v17.4S, v9.4S, v31.s[0] +sub v9.4s, v13.4s, v17.4s +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v12.4S, v5.s[2] +mul v12.4S, v12.4S,v4.s[2] +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v24.4s, v12.4s +add v24.4s, v24.4s, v12.4s +trn1 v12.4S, v13.4S, v9.4S +trn2 v18.4S, v13.4S, v9.4S +trn1 v23.4S, v24.4S, v17.4S +trn2 v21.4S, v24.4S, v17.4S +trn2 v24.2D, v12.2D, v23.2D +trn2 v17.2D, v18.2D, v21.2D +trn1 v13.2D, v12.2D, v23.2D +trn1 v9.2D, v18.2D, v21.2D +sqrdmulh 
v21.4S, v24.4S, v19.4S +mul v24.4S, v24.4S,v20.4S +mla v24.4S, v21.4S, v31.s[0] +sub v21.4s, v13.4s, v24.4s +add v13.4s, v13.4s, v24.4s +sqrdmulh v24.4S, v17.4S, v19.4S +mul v17.4S, v17.4S,v20.4S +mla v17.4S, v24.4S, v31.s[0] +sub v24.4s, v9.4s, v17.4s +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v9.4S, v0.4S +mul v9.4S, v9.4S,v29.4S +mla v9.4S, v17.4S, v31.s[0] +sub v17.4s, v13.4s, v9.4s +add v13.4s, v13.4s, v9.4s +sqrdmulh v9.4S, v24.4S, v30.4S +mul v24.4S, v24.4S,v3.4S +mla v24.4S, v9.4S, v31.s[0] +sub v9.4s, v21.4s, v24.4s +add v21.4s, v21.4s, v24.4s +str q13, [x0, #768] +str q17, [x0, #784] +str q21, [x0, #800] +str q9, [x0, #816] +ldr q9, [x17, #+1792] +ldr q21, [x17, #+1808] +ldr q17, [x17, #+1824] +ldr q13, [x17, #+1840] +ldr q24, [x17, #+1856] +ldr q18, [x17, #+1872] +ldr q23, [x17, #+1888] +ldr q12, [x17, #+1904] +ldr q30, [x0, #864] +ldr q3, [x0, #880] +ldr q0, [x0, #832] +ldr q29, [x0, #848] +sqrdmulh v19.4S, v30.4S, v21.s[0] +mul v30.4S, v30.4S,v9.s[0] +mla v30.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v30.4s +add v0.4s, v0.4s, v30.4s +sqrdmulh v30.4S, v3.4S, v21.s[0] +mul v3.4S, v3.4S,v9.s[0] +mla v3.4S, v30.4S, v31.s[0] +sub v30.4s, v29.4s, v3.4s +add v29.4s, v29.4s, v3.4s +sqrdmulh v3.4S, v29.4S, v21.s[1] +mul v29.4S, v29.4S,v9.s[1] +mla v29.4S, v3.4S, v31.s[0] +sub v3.4s, v0.4s, v29.4s +add v0.4s, v0.4s, v29.4s +sqrdmulh v29.4S, v30.4S, v21.s[2] +mul v30.4S, v30.4S,v9.s[2] +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v19.4s, v30.4s +add v19.4s, v19.4s, v30.4s +trn1 v30.4S, v0.4S, v3.4S +trn2 v20.4S, v0.4S, v3.4S +trn1 v5.4S, v19.4S, v29.4S +trn2 v4.4S, v19.4S, v29.4S +trn2 v19.2D, v30.2D, v5.2D +trn2 v29.2D, v20.2D, v4.2D +trn1 v0.2D, v30.2D, v5.2D +trn1 v3.2D, v20.2D, v4.2D +sqrdmulh v4.4S, v19.4S, v13.4S +mul v19.4S, v19.4S,v17.4S +mla v19.4S, v4.4S, v31.s[0] +sub v4.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v29.4S, v13.4S +mul v29.4S, v29.4S,v17.4S +mla v29.4S, v19.4S, v31.s[0] +sub v19.4s, v3.4s, v29.4s +add v3.4s, v3.4s, v29.4s 
+sqrdmulh v29.4S, v3.4S, v18.4S +mul v3.4S, v3.4S,v24.4S +mla v3.4S, v29.4S, v31.s[0] +sub v29.4s, v0.4s, v3.4s +add v0.4s, v0.4s, v3.4s +sqrdmulh v3.4S, v19.4S, v12.4S +mul v19.4S, v19.4S,v23.4S +mla v19.4S, v3.4S, v31.s[0] +sub v3.4s, v4.4s, v19.4s +add v4.4s, v4.4s, v19.4s +str q0, [x0, #832] +str q29, [x0, #848] +str q4, [x0, #864] +str q3, [x0, #880] +ldr q3, [x17, #+1920] +ldr q4, [x17, #+1936] +ldr q29, [x17, #+1952] +ldr q0, [x17, #+1968] +ldr q19, [x17, #+1984] +ldr q20, [x17, #+2000] +ldr q5, [x17, #+2016] +ldr q30, [x17, #+2032] +ldr q12, [x0, #928] +ldr q23, [x0, #944] +ldr q18, [x0, #896] +ldr q24, [x0, #912] +sqrdmulh v13.4S, v12.4S, v4.s[0] +mul v12.4S, v12.4S,v3.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v18.4s, v12.4s +add v18.4s, v18.4s, v12.4s +sqrdmulh v12.4S, v23.4S, v4.s[0] +mul v23.4S, v23.4S,v3.s[0] +mla v23.4S, v12.4S, v31.s[0] +sub v12.4s, v24.4s, v23.4s +add v24.4s, v24.4s, v23.4s +sqrdmulh v23.4S, v24.4S, v4.s[1] +mul v24.4S, v24.4S,v3.s[1] +mla v24.4S, v23.4S, v31.s[0] +sub v23.4s, v18.4s, v24.4s +add v18.4s, v18.4s, v24.4s +sqrdmulh v24.4S, v12.4S, v4.s[2] +mul v12.4S, v12.4S,v3.s[2] +mla v12.4S, v24.4S, v31.s[0] +sub v24.4s, v13.4s, v12.4s +add v13.4s, v13.4s, v12.4s +trn1 v12.4S, v18.4S, v23.4S +trn2 v17.4S, v18.4S, v23.4S +trn1 v21.4S, v13.4S, v24.4S +trn2 v9.4S, v13.4S, v24.4S +trn2 v13.2D, v12.2D, v21.2D +trn2 v24.2D, v17.2D, v9.2D +trn1 v18.2D, v12.2D, v21.2D +trn1 v23.2D, v17.2D, v9.2D +sqrdmulh v9.4S, v13.4S, v0.4S +mul v13.4S, v13.4S,v29.4S +mla v13.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v13.4s +add v18.4s, v18.4s, v13.4s +sqrdmulh v13.4S, v24.4S, v0.4S +mul v24.4S, v24.4S,v29.4S +mla v24.4S, v13.4S, v31.s[0] +sub v13.4s, v23.4s, v24.4s +add v23.4s, v23.4s, v24.4s +sqrdmulh v24.4S, v23.4S, v20.4S +mul v23.4S, v23.4S,v19.4S +mla v23.4S, v24.4S, v31.s[0] +sub v24.4s, v18.4s, v23.4s +add v18.4s, v18.4s, v23.4s +sqrdmulh v23.4S, v13.4S, v30.4S +mul v13.4S, v13.4S,v5.4S +mla v13.4S, v23.4S, v31.s[0] +sub v23.4s, v9.4s, v13.4s 
+add v9.4s, v9.4s, v13.4s +str q18, [x0, #896] +str q24, [x0, #912] +str q9, [x0, #928] +str q23, [x0, #944] +ldr q23, [x17, #+2048] +ldr q9, [x17, #+2064] +ldr q24, [x17, #+2080] +ldr q18, [x17, #+2096] +ldr q13, [x17, #+2112] +ldr q17, [x17, #+2128] +ldr q21, [x17, #+2144] +ldr q12, [x17, #+2160] +ldr q30, [x0, #992] +ldr q5, [x0, #1008] +ldr q20, [x0, #960] +ldr q19, [x0, #976] +sqrdmulh v0.4S, v30.4S, v9.s[0] +mul v30.4S, v30.4S,v23.s[0] +mla v30.4S, v0.4S, v31.s[0] +sub v0.4s, v20.4s, v30.4s +add v20.4s, v20.4s, v30.4s +sqrdmulh v30.4S, v5.4S, v9.s[0] +mul v5.4S, v5.4S,v23.s[0] +mla v5.4S, v30.4S, v31.s[0] +sub v30.4s, v19.4s, v5.4s +add v19.4s, v19.4s, v5.4s +sqrdmulh v5.4S, v19.4S, v9.s[1] +mul v19.4S, v19.4S,v23.s[1] +mla v19.4S, v5.4S, v31.s[0] +sub v5.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +sqrdmulh v19.4S, v30.4S, v9.s[2] +mul v30.4S, v30.4S,v23.s[2] +mla v30.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v30.4s +add v0.4s, v0.4s, v30.4s +trn1 v30.4S, v20.4S, v5.4S +trn2 v29.4S, v20.4S, v5.4S +trn1 v4.4S, v0.4S, v19.4S +trn2 v3.4S, v0.4S, v19.4S +trn2 v0.2D, v30.2D, v4.2D +trn2 v19.2D, v29.2D, v3.2D +trn1 v20.2D, v30.2D, v4.2D +trn1 v5.2D, v29.2D, v3.2D +sqrdmulh v3.4S, v0.4S, v18.4S +mul v0.4S, v0.4S,v24.4S +mla v0.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v0.4s +add v20.4s, v20.4s, v0.4s +sqrdmulh v0.4S, v19.4S, v18.4S +mul v19.4S, v19.4S,v24.4S +mla v19.4S, v0.4S, v31.s[0] +sub v0.4s, v5.4s, v19.4s +add v5.4s, v5.4s, v19.4s +sqrdmulh v19.4S, v5.4S, v17.4S +mul v5.4S, v5.4S,v13.4S +mla v5.4S, v19.4S, v31.s[0] +sub v19.4s, v20.4s, v5.4s +add v20.4s, v20.4s, v5.4s +sqrdmulh v5.4S, v0.4S, v12.4S +mul v0.4S, v0.4S,v21.4S +mla v0.4S, v5.4S, v31.s[0] +sub v5.4s, v3.4s, v0.4s +add v3.4s, v3.4s, v0.4s +str q20, [x0, #960] +str q19, [x0, #976] +str q3, [x0, #992] +str q5, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs 
+ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2476 +// Instruction count: 2472 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_17_0.s b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_17_0.s new file mode 100644 index 0000000..eeedda8 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_17_0.s @@ -0,0 +1,2486 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 
1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 
6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 
+.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, 
block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 
31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 
26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 
+.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // 
Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 
28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 
608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_17_0 +.global _ntt_u32_full_neon_asm_var_4_4_17_0 +ntt_u32_full_neon_asm_var_4_4_17_0: +_ntt_u32_full_neon_asm_var_4_4_17_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x0, #992] +sqrdmulh v27.4S, v28.4S, v29.s[0] +mul v28.4S, v28.4S,v30.s[0] +ldr q26, [x0, #928] +sqrdmulh v25.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +ldr q24, [x0, #864] +sqrdmulh v23.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +ldr q22, [x0, #800] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #736] +mla v28.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v20.4S, v29.s[0] +ldr q19, [x0, #672] +mla v26.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v19.4S, v29.s[0] +ldr q18, [x0, #608] +mla v24.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v18.4S, v29.s[0] +ldr q17, [x0, #544] +mla v22.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v17.4S, v29.s[0] +ldr q16, [x0, #480] +ldr q3, [x0, #416] +mul v20.4S, v20.4S,v30.s[0] +sub v2.4s, v16.4s, v28.4s +mul v19.4S, v19.4S,v30.s[0] +add v16.4s, v16.4s, v28.4s +ldr q28, [x0, #352] +ldr q1, [x0, #288] +mla v20.4S, v27.4S, v31.s[0] +sub 
v27.4s, v3.4s, v26.4s +mla v19.4S, v25.4S, v31.s[0] +add v3.4s, v3.4s, v26.4s +ldr q26, [x0, #224] +ldr q25, [x0, #160] +mul v18.4S, v18.4S,v30.s[0] +sub v0.4s, v28.4s, v24.4s +mul v17.4S, v17.4S,v30.s[0] +add v28.4s, v28.4s, v24.4s +ldr q24, [x0, #96] +ldr q15, [x0, #32] +mla v18.4S, v23.4S, v31.s[0] +sub v23.4s, v1.4s, v22.4s +mla v17.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v2.4S, v29.s[2] +nop +mul v2.4S, v2.4S,v30.s[2] +nop +sqrdmulh v21.4S, v27.4S, v29.s[2] +sub v14.4s, v26.4s, v20.4s +mul v27.4S, v27.4S,v30.s[2] +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v29.s[1] +sub v13.4s, v25.4s, v19.4s +mul v16.4S, v16.4S,v30.s[1] +add v25.4s, v25.4s, v19.4s +sqrdmulh v19.4S, v3.4S, v29.s[1] +sub v12.4s, v24.4s, v18.4s +mul v3.4S, v3.4S,v30.s[1] +add v24.4s, v24.4s, v18.4s +mla v2.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v17.4s +sqrdmulh v18.4S, v0.4S, v29.s[2] +add v15.4s, v15.4s, v17.4s +mla v27.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v23.4S, v29.s[2] +nop +mla v16.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v28.4S, v29.s[1] +nop +mla v3.4S, v19.4S, v31.s[0] +nop +sqrdmulh v19.4S, v1.4S, v29.s[1] +nop +ldr q17, [x17, #+32] +ldr q11, [x17, #+48] +mul v0.4S, v0.4S,v30.s[2] +sub v10.4s, v14.4s, v2.4s +mul v23.4S, v23.4S,v30.s[2] +add v14.4s, v14.4s, v2.4s +mla v0.4S, v18.4S, v31.s[0] +sub v18.4s, v13.4s, v27.4s +mla v23.4S, v21.4S, v31.s[0] +add v13.4s, v13.4s, v27.4s +mul v28.4S, v28.4S,v30.s[1] +sub v27.4s, v26.4s, v16.4s +mul v1.4S, v1.4S,v30.s[1] +add v26.4s, v26.4s, v16.4s +mla v28.4S, v20.4S, v31.s[0] +sub v20.4s, v25.4s, v3.4s +mla v1.4S, v19.4S, v31.s[0] +add v25.4s, v25.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v11.s[3] +nop +mul v10.4S, v10.4S,v17.s[3] +nop +sqrdmulh v19.4S, v14.4S, v11.s[2] +sub v16.4s, v12.4s, v0.4s +mul v14.4S, v14.4S,v17.s[2] +add v12.4s, v12.4s, v0.4s +sqrdmulh v0.4S, v27.4S, v11.s[1] +sub v21.4s, v22.4s, v23.4s +mul v27.4S, v27.4S,v17.s[1] +add v22.4s, v22.4s, v23.4s +sqrdmulh v23.4S, v26.4S, v11.s[0] +sub v2.4s, 
v24.4s, v28.4s +mul v26.4S, v26.4S,v17.s[0] +add v24.4s, v24.4s, v28.4s +ldr q28, [x17, #+96] +ldr q9, [x17, #+112] +mla v10.4S, v3.4S, v31.s[0] +sub v3.4s, v15.4s, v1.4s +sqrdmulh v8.4S, v18.4S, v11.s[3] +add v15.4s, v15.4s, v1.4s +mla v14.4S, v19.4S, v31.s[0] +nop +sqrdmulh v19.4S, v13.4S, v11.s[2] +nop +mla v27.4S, v0.4S, v31.s[0] +nop +sqrdmulh v0.4S, v20.4S, v11.s[1] +nop +mla v26.4S, v23.4S, v31.s[0] +nop +sqrdmulh v23.4S, v25.4S, v11.s[0] +nop +ldr q1, [x17, #+64] +ldr q7, [x17, #+80] +mul v18.4S, v18.4S,v17.s[3] +sub v6.4s, v16.4s, v10.4s +mul v13.4S, v13.4S,v17.s[2] +add v16.4s, v16.4s, v10.4s +mla v18.4S, v8.4S, v31.s[0] +sub v8.4s, v12.4s, v14.4s +mla v13.4S, v19.4S, v31.s[0] +add v12.4s, v12.4s, v14.4s +mul v20.4S, v20.4S,v17.s[1] +sub v14.4s, v2.4s, v27.4s +mul v25.4S, v25.4S,v17.s[0] +add v2.4s, v2.4s, v27.4s +mla v20.4S, v0.4S, v31.s[0] +sub v0.4s, v24.4s, v26.4s +mla v25.4S, v23.4S, v31.s[0] +add v24.4s, v24.4s, v26.4s +sqrdmulh v26.4S, v6.4S, v9.s[3] +nop +mul v6.4S, v6.4S,v28.s[3] +nop +sqrdmulh v23.4S, v16.4S, v9.s[2] +sub v27.4s, v21.4s, v18.4s +mul v16.4S, v16.4S,v28.s[2] +add v21.4s, v21.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v9.s[1] +sub v19.4s, v22.4s, v13.4s +mul v8.4S, v8.4S,v28.s[1] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v12.4S, v9.s[0] +sub v10.4s, v3.4s, v20.4s +mul v12.4S, v12.4S,v28.s[0] +add v3.4s, v3.4s, v20.4s +mla v6.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v25.4s +sqrdmulh v20.4S, v14.4S, v7.s[3] +add v15.4s, v15.4s, v25.4s +mla v16.4S, v23.4S, v31.s[0] +sub v23.4s, v27.4s, v6.4s +sqrdmulh v25.4S, v2.4S, v7.s[2] +add v27.4s, v27.4s, v6.4s +mla v8.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v16.4s +sqrdmulh v6.4S, v0.4S, v7.s[1] +add v21.4s, v21.4s, v16.4s +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v19.4s, v8.4s +sqrdmulh v16.4S, v24.4S, v7.s[0] +add v19.4s, v19.4s, v8.4s +mul v14.4S, v14.4S,v1.s[3] +sub v8.4s, v22.4s, v12.4s +mul v2.4S, v2.4S,v1.s[2] +add v22.4s, v22.4s, v12.4s +mla v14.4S, v20.4S, v31.s[0] +str q23, [x0, 
#992] +mla v2.4S, v25.4S, v31.s[0] +str q27, [x0, #928] +mul v0.4S, v0.4S,v1.s[1] +str q18, [x0, #864] +mul v24.4S, v24.4S,v1.s[0] +str q21, [x0, #800] +mla v0.4S, v6.4S, v31.s[0] +str q13, [x0, #736] +mla v24.4S, v16.4S, v31.s[0] +str q19, [x0, #672] +ldr q19, [x0, #1008] +sqrdmulh v16.4S, v19.4S, v29.s[0] +str q8, [x0, #608] +mul v19.4S, v19.4S,v30.s[0] +sub v8.4s, v10.4s, v14.4s +ldr q13, [x0, #944] +sqrdmulh v6.4S, v13.4S, v29.s[0] +str q22, [x0, #544] +mul v13.4S, v13.4S,v30.s[0] +add v10.4s, v10.4s, v14.4s +ldr q14, [x0, #880] +sqrdmulh v22.4S, v14.4S, v29.s[0] +str q8, [x0, #480] +mul v14.4S, v14.4S,v30.s[0] +sub v8.4s, v3.4s, v2.4s +ldr q21, [x0, #816] +sqrdmulh v18.4S, v21.4S, v29.s[0] +str q10, [x0, #416] +mul v21.4S, v21.4S,v30.s[0] +add v3.4s, v3.4s, v2.4s +ldr q2, [x0, #752] +mla v19.4S, v16.4S, v31.s[0] +str q8, [x0, #352] +sqrdmulh v8.4S, v2.4S, v29.s[0] +sub v16.4s, v26.4s, v0.4s +ldr q10, [x0, #688] +mla v13.4S, v6.4S, v31.s[0] +str q3, [x0, #288] +sqrdmulh v3.4S, v10.4S, v29.s[0] +add v26.4s, v26.4s, v0.4s +ldr q0, [x0, #624] +mla v14.4S, v22.4S, v31.s[0] +str q16, [x0, #224] +sqrdmulh v16.4S, v0.4S, v29.s[0] +sub v22.4s, v15.4s, v24.4s +ldr q6, [x0, #560] +mla v21.4S, v18.4S, v31.s[0] +str q26, [x0, #160] +sqrdmulh v26.4S, v6.4S, v29.s[0] +add v15.4s, v15.4s, v24.4s +ldr q24, [x0, #496] +ldr q18, [x0, #432] +mul v2.4S, v2.4S,v30.s[0] +sub v27.4s, v24.4s, v19.4s +mul v10.4S, v10.4S,v30.s[0] +add v24.4s, v24.4s, v19.4s +ldr q19, [x0, #368] +ldr q25, [x0, #304] +mla v2.4S, v8.4S, v31.s[0] +sub v8.4s, v18.4s, v13.4s +mla v10.4S, v3.4S, v31.s[0] +add v18.4s, v18.4s, v13.4s +ldr q13, [x0, #240] +ldr q3, [x0, #176] +mul v0.4S, v0.4S,v30.s[0] +sub v23.4s, v19.4s, v14.4s +mul v6.4S, v6.4S,v30.s[0] +add v19.4s, v19.4s, v14.4s +ldr q14, [x0, #112] +ldr q20, [x0, #48] +mla v0.4S, v16.4S, v31.s[0] +sub v16.4s, v25.4s, v21.4s +mla v6.4S, v26.4S, v31.s[0] +add v25.4s, v25.4s, v21.4s +sqrdmulh v21.4S, v27.4S, v29.s[2] +nop +mul v27.4S, v27.4S,v30.s[2] +nop 
+sqrdmulh v26.4S, v8.4S, v29.s[2] +sub v12.4s, v13.4s, v2.4s +mul v8.4S, v8.4S,v30.s[2] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v24.4S, v29.s[1] +sub v5.4s, v3.4s, v10.4s +mul v24.4S, v24.4S,v30.s[1] +add v3.4s, v3.4s, v10.4s +sqrdmulh v10.4S, v18.4S, v29.s[1] +sub v4.4s, v14.4s, v0.4s +mul v18.4S, v18.4S,v30.s[1] +add v14.4s, v14.4s, v0.4s +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v6.4s +sqrdmulh v0.4S, v23.4S, v29.s[2] +add v20.4s, v20.4s, v6.4s +mla v8.4S, v26.4S, v31.s[0] +str q22, [x0, #96] +sqrdmulh v22.4S, v16.4S, v29.s[2] +nop +mla v24.4S, v2.4S, v31.s[0] +str q15, [x0, #32] +sqrdmulh v15.4S, v19.4S, v29.s[1] +nop +mla v18.4S, v10.4S, v31.s[0] +nop +sqrdmulh v10.4S, v25.4S, v29.s[1] +nop +mul v23.4S, v23.4S,v30.s[2] +sub v2.4s, v12.4s, v27.4s +mul v16.4S, v16.4S,v30.s[2] +add v12.4s, v12.4s, v27.4s +mla v23.4S, v0.4S, v31.s[0] +sub v0.4s, v5.4s, v8.4s +mla v16.4S, v22.4S, v31.s[0] +add v5.4s, v5.4s, v8.4s +mul v19.4S, v19.4S,v30.s[1] +sub v8.4s, v13.4s, v24.4s +mul v25.4S, v25.4S,v30.s[1] +add v13.4s, v13.4s, v24.4s +mla v19.4S, v15.4S, v31.s[0] +sub v15.4s, v3.4s, v18.4s +mla v25.4S, v10.4S, v31.s[0] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v11.s[3] +nop +mul v2.4S, v2.4S,v17.s[3] +nop +sqrdmulh v10.4S, v12.4S, v11.s[2] +sub v24.4s, v4.4s, v23.4s +mul v12.4S, v12.4S,v17.s[2] +add v4.4s, v4.4s, v23.4s +sqrdmulh v23.4S, v8.4S, v11.s[1] +sub v22.4s, v21.4s, v16.4s +mul v8.4S, v8.4S,v17.s[1] +add v21.4s, v21.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v11.s[0] +sub v27.4s, v14.4s, v19.4s +mul v13.4S, v13.4S,v17.s[0] +add v14.4s, v14.4s, v19.4s +mla v2.4S, v18.4S, v31.s[0] +sub v18.4s, v20.4s, v25.4s +sqrdmulh v19.4S, v0.4S, v11.s[3] +add v20.4s, v20.4s, v25.4s +mla v12.4S, v10.4S, v31.s[0] +nop +sqrdmulh v10.4S, v5.4S, v11.s[2] +nop +mla v8.4S, v23.4S, v31.s[0] +nop +sqrdmulh v23.4S, v15.4S, v11.s[1] +nop +mla v13.4S, v16.4S, v31.s[0] +nop +sqrdmulh v16.4S, v3.4S, v11.s[0] +nop +mul v0.4S, v0.4S,v17.s[3] +sub v25.4s, v24.4s, v2.4s +mul v5.4S, 
v5.4S,v17.s[2] +add v24.4s, v24.4s, v2.4s +mla v0.4S, v19.4S, v31.s[0] +sub v19.4s, v4.4s, v12.4s +mla v5.4S, v10.4S, v31.s[0] +add v4.4s, v4.4s, v12.4s +mul v15.4S, v15.4S,v17.s[1] +sub v12.4s, v27.4s, v8.4s +mul v3.4S, v3.4S,v17.s[0] +add v27.4s, v27.4s, v8.4s +mla v15.4S, v23.4S, v31.s[0] +sub v23.4s, v14.4s, v13.4s +mla v3.4S, v16.4S, v31.s[0] +add v14.4s, v14.4s, v13.4s +sqrdmulh v13.4S, v25.4S, v9.s[3] +nop +mul v25.4S, v25.4S,v28.s[3] +nop +sqrdmulh v16.4S, v24.4S, v9.s[2] +sub v8.4s, v22.4s, v0.4s +mul v24.4S, v24.4S,v28.s[2] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v19.4S, v9.s[1] +sub v10.4s, v21.4s, v5.4s +mul v19.4S, v19.4S,v28.s[1] +add v21.4s, v21.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v9.s[0] +sub v2.4s, v18.4s, v15.4s +mul v4.4S, v4.4S,v28.s[0] +add v18.4s, v18.4s, v15.4s +mla v25.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v3.4s +sqrdmulh v15.4S, v12.4S, v7.s[3] +add v20.4s, v20.4s, v3.4s +mla v24.4S, v16.4S, v31.s[0] +sub v16.4s, v8.4s, v25.4s +sqrdmulh v3.4S, v27.4S, v7.s[2] +add v8.4s, v8.4s, v25.4s +mla v19.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v24.4s +sqrdmulh v25.4S, v23.4S, v7.s[1] +add v22.4s, v22.4s, v24.4s +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v10.4s, v19.4s +sqrdmulh v24.4S, v14.4S, v7.s[0] +add v10.4s, v10.4s, v19.4s +mul v12.4S, v12.4S,v1.s[3] +sub v19.4s, v21.4s, v4.4s +mul v27.4S, v27.4S,v1.s[2] +add v21.4s, v21.4s, v4.4s +mla v12.4S, v15.4S, v31.s[0] +str q16, [x0, #1008] +mla v27.4S, v3.4S, v31.s[0] +str q8, [x0, #944] +mul v23.4S, v23.4S,v1.s[1] +str q0, [x0, #880] +mul v14.4S, v14.4S,v1.s[0] +str q22, [x0, #816] +mla v23.4S, v25.4S, v31.s[0] +str q5, [x0, #752] +mla v14.4S, v24.4S, v31.s[0] +str q10, [x0, #688] +ldr q10, [x0, #960] +sqrdmulh v24.4S, v10.4S, v29.s[0] +str q19, [x0, #624] +mul v10.4S, v10.4S,v30.s[0] +sub v19.4s, v2.4s, v12.4s +ldr q5, [x0, #896] +sqrdmulh v25.4S, v5.4S, v29.s[0] +str q21, [x0, #560] +mul v5.4S, v5.4S,v30.s[0] +add v2.4s, v2.4s, v12.4s +ldr q12, [x0, #832] +sqrdmulh v21.4S, v12.4S, v29.s[0] +str 
q19, [x0, #496] +mul v12.4S, v12.4S,v30.s[0] +sub v19.4s, v18.4s, v27.4s +ldr q22, [x0, #768] +sqrdmulh v0.4S, v22.4S, v29.s[0] +str q2, [x0, #432] +mul v22.4S, v22.4S,v30.s[0] +add v18.4s, v18.4s, v27.4s +ldr q27, [x0, #704] +mla v10.4S, v24.4S, v31.s[0] +str q19, [x0, #368] +sqrdmulh v19.4S, v27.4S, v29.s[0] +sub v24.4s, v13.4s, v23.4s +ldr q2, [x0, #640] +mla v5.4S, v25.4S, v31.s[0] +str q18, [x0, #304] +sqrdmulh v18.4S, v2.4S, v29.s[0] +add v13.4s, v13.4s, v23.4s +ldr q23, [x0, #576] +mla v12.4S, v21.4S, v31.s[0] +str q24, [x0, #240] +sqrdmulh v24.4S, v23.4S, v29.s[0] +sub v21.4s, v20.4s, v14.4s +ldr q25, [x0, #512] +mla v22.4S, v0.4S, v31.s[0] +str q13, [x0, #176] +sqrdmulh v13.4S, v25.4S, v29.s[0] +add v20.4s, v20.4s, v14.4s +ldr q14, [x0, #448] +ldr q0, [x0, #384] +mul v27.4S, v27.4S,v30.s[0] +sub v8.4s, v14.4s, v10.4s +mul v2.4S, v2.4S,v30.s[0] +add v14.4s, v14.4s, v10.4s +ldr q10, [x0, #320] +ldr q3, [x0, #256] +mla v27.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v5.4s +mla v2.4S, v18.4S, v31.s[0] +add v0.4s, v0.4s, v5.4s +ldr q5, [x0, #192] +ldr q18, [x0, #128] +mul v23.4S, v23.4S,v30.s[0] +sub v16.4s, v10.4s, v12.4s +mul v25.4S, v25.4S,v30.s[0] +add v10.4s, v10.4s, v12.4s +ldr q12, [x0, #64] +ldr q15, [x0, #0] +mla v23.4S, v24.4S, v31.s[0] +sub v24.4s, v3.4s, v22.4s +mla v25.4S, v13.4S, v31.s[0] +add v3.4s, v3.4s, v22.4s +sqrdmulh v22.4S, v8.4S, v29.s[2] +nop +mul v8.4S, v8.4S,v30.s[2] +nop +sqrdmulh v13.4S, v19.4S, v29.s[2] +sub v4.4s, v5.4s, v27.4s +mul v19.4S, v19.4S,v30.s[2] +add v5.4s, v5.4s, v27.4s +sqrdmulh v27.4S, v14.4S, v29.s[1] +sub v26.4s, v18.4s, v2.4s +mul v14.4S, v14.4S,v30.s[1] +add v18.4s, v18.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v29.s[1] +sub v6.4s, v12.4s, v23.4s +mul v0.4S, v0.4S,v30.s[1] +add v12.4s, v12.4s, v23.4s +mla v8.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v25.4s +sqrdmulh v23.4S, v16.4S, v29.s[2] +add v15.4s, v15.4s, v25.4s +mla v19.4S, v13.4S, v31.s[0] +str q21, [x0, #112] +sqrdmulh v21.4S, v24.4S, v29.s[2] +nop +mla v14.4S, 
v27.4S, v31.s[0] +str q20, [x0, #48] +sqrdmulh v20.4S, v10.4S, v29.s[1] +nop +mla v0.4S, v2.4S, v31.s[0] +nop +sqrdmulh v2.4S, v3.4S, v29.s[1] +nop +mul v16.4S, v16.4S,v30.s[2] +sub v27.4s, v4.4s, v8.4s +mul v24.4S, v24.4S,v30.s[2] +add v4.4s, v4.4s, v8.4s +mla v16.4S, v23.4S, v31.s[0] +sub v23.4s, v26.4s, v19.4s +mla v24.4S, v21.4S, v31.s[0] +add v26.4s, v26.4s, v19.4s +mul v10.4S, v10.4S,v30.s[1] +sub v19.4s, v5.4s, v14.4s +mul v3.4S, v3.4S,v30.s[1] +add v5.4s, v5.4s, v14.4s +mla v10.4S, v20.4S, v31.s[0] +sub v20.4s, v18.4s, v0.4s +mla v3.4S, v2.4S, v31.s[0] +add v18.4s, v18.4s, v0.4s +sqrdmulh v0.4S, v27.4S, v11.s[3] +nop +mul v27.4S, v27.4S,v17.s[3] +nop +sqrdmulh v2.4S, v4.4S, v11.s[2] +sub v14.4s, v6.4s, v16.4s +mul v4.4S, v4.4S,v17.s[2] +add v6.4s, v6.4s, v16.4s +sqrdmulh v16.4S, v19.4S, v11.s[1] +sub v21.4s, v22.4s, v24.4s +mul v19.4S, v19.4S,v17.s[1] +add v22.4s, v22.4s, v24.4s +sqrdmulh v24.4S, v5.4S, v11.s[0] +sub v8.4s, v12.4s, v10.4s +mul v5.4S, v5.4S,v17.s[0] +add v12.4s, v12.4s, v10.4s +mla v27.4S, v0.4S, v31.s[0] +sub v0.4s, v15.4s, v3.4s +sqrdmulh v10.4S, v23.4S, v11.s[3] +add v15.4s, v15.4s, v3.4s +mla v4.4S, v2.4S, v31.s[0] +nop +sqrdmulh v2.4S, v26.4S, v11.s[2] +nop +mla v19.4S, v16.4S, v31.s[0] +nop +sqrdmulh v16.4S, v20.4S, v11.s[1] +nop +mla v5.4S, v24.4S, v31.s[0] +nop +sqrdmulh v24.4S, v18.4S, v11.s[0] +nop +mul v23.4S, v23.4S,v17.s[3] +sub v3.4s, v14.4s, v27.4s +mul v26.4S, v26.4S,v17.s[2] +add v14.4s, v14.4s, v27.4s +mla v23.4S, v10.4S, v31.s[0] +sub v10.4s, v6.4s, v4.4s +mla v26.4S, v2.4S, v31.s[0] +add v6.4s, v6.4s, v4.4s +mul v20.4S, v20.4S,v17.s[1] +sub v4.4s, v8.4s, v19.4s +mul v18.4S, v18.4S,v17.s[0] +add v8.4s, v8.4s, v19.4s +mla v20.4S, v16.4S, v31.s[0] +sub v16.4s, v12.4s, v5.4s +mla v18.4S, v24.4S, v31.s[0] +add v12.4s, v12.4s, v5.4s +sqrdmulh v5.4S, v3.4S, v9.s[3] +nop +mul v3.4S, v3.4S,v28.s[3] +nop +sqrdmulh v24.4S, v14.4S, v9.s[2] +sub v19.4s, v21.4s, v23.4s +mul v14.4S, v14.4S,v28.s[2] +add v21.4s, v21.4s, v23.4s +sqrdmulh 
v23.4S, v10.4S, v9.s[1] +sub v2.4s, v22.4s, v26.4s +mul v10.4S, v10.4S,v28.s[1] +add v22.4s, v22.4s, v26.4s +sqrdmulh v26.4S, v6.4S, v9.s[0] +sub v27.4s, v0.4s, v20.4s +mul v6.4S, v6.4S,v28.s[0] +add v0.4s, v0.4s, v20.4s +mla v3.4S, v5.4S, v31.s[0] +sub v5.4s, v15.4s, v18.4s +sqrdmulh v20.4S, v4.4S, v7.s[3] +add v15.4s, v15.4s, v18.4s +mla v14.4S, v24.4S, v31.s[0] +sub v24.4s, v19.4s, v3.4s +sqrdmulh v18.4S, v8.4S, v7.s[2] +add v19.4s, v19.4s, v3.4s +mla v10.4S, v23.4S, v31.s[0] +sub v23.4s, v21.4s, v14.4s +sqrdmulh v3.4S, v16.4S, v7.s[1] +add v21.4s, v21.4s, v14.4s +mla v6.4S, v26.4S, v31.s[0] +sub v26.4s, v2.4s, v10.4s +sqrdmulh v14.4S, v12.4S, v7.s[0] +add v2.4s, v2.4s, v10.4s +mul v4.4S, v4.4S,v1.s[3] +sub v10.4s, v22.4s, v6.4s +mul v8.4S, v8.4S,v1.s[2] +add v22.4s, v22.4s, v6.4s +mla v4.4S, v20.4S, v31.s[0] +str q24, [x0, #960] +mla v8.4S, v18.4S, v31.s[0] +str q19, [x0, #896] +mul v16.4S, v16.4S,v1.s[1] +str q23, [x0, #832] +mul v12.4S, v12.4S,v1.s[0] +str q21, [x0, #768] +mla v16.4S, v3.4S, v31.s[0] +str q26, [x0, #704] +mla v12.4S, v14.4S, v31.s[0] +str q2, [x0, #640] +ldr q2, [x0, #976] +sqrdmulh v14.4S, v2.4S, v29.s[0] +str q10, [x0, #576] +mul v2.4S, v2.4S,v30.s[0] +sub v10.4s, v27.4s, v4.4s +ldr q26, [x0, #912] +sqrdmulh v3.4S, v26.4S, v29.s[0] +str q22, [x0, #512] +mul v26.4S, v26.4S,v30.s[0] +add v27.4s, v27.4s, v4.4s +ldr q4, [x0, #848] +sqrdmulh v22.4S, v4.4S, v29.s[0] +str q10, [x0, #448] +mul v4.4S, v4.4S,v30.s[0] +sub v10.4s, v0.4s, v8.4s +ldr q21, [x0, #784] +sqrdmulh v23.4S, v21.4S, v29.s[0] +str q27, [x0, #384] +mul v21.4S, v21.4S,v30.s[0] +add v0.4s, v0.4s, v8.4s +ldr q8, [x0, #720] +mla v2.4S, v14.4S, v31.s[0] +str q10, [x0, #320] +sqrdmulh v10.4S, v8.4S, v29.s[0] +sub v14.4s, v5.4s, v16.4s +ldr q27, [x0, #656] +mla v26.4S, v3.4S, v31.s[0] +str q0, [x0, #256] +sqrdmulh v0.4S, v27.4S, v29.s[0] +add v5.4s, v5.4s, v16.4s +ldr q16, [x0, #592] +mla v4.4S, v22.4S, v31.s[0] +str q14, [x0, #192] +sqrdmulh v14.4S, v16.4S, v29.s[0] +sub v22.4s, 
v15.4s, v12.4s +ldr q3, [x0, #528] +mla v21.4S, v23.4S, v31.s[0] +str q5, [x0, #128] +sqrdmulh v5.4S, v3.4S, v29.s[0] +add v15.4s, v15.4s, v12.4s +ldr q12, [x0, #464] +ldr q23, [x0, #400] +mul v8.4S, v8.4S,v30.s[0] +sub v19.4s, v12.4s, v2.4s +mul v27.4S, v27.4S,v30.s[0] +add v12.4s, v12.4s, v2.4s +ldr q2, [x0, #336] +ldr q18, [x0, #272] +mla v8.4S, v10.4S, v31.s[0] +sub v10.4s, v23.4s, v26.4s +mla v27.4S, v0.4S, v31.s[0] +add v23.4s, v23.4s, v26.4s +ldr q26, [x0, #208] +ldr q0, [x0, #144] +mul v16.4S, v16.4S,v30.s[0] +sub v24.4s, v2.4s, v4.4s +mul v3.4S, v3.4S,v30.s[0] +add v2.4s, v2.4s, v4.4s +ldr q4, [x0, #80] +ldr q20, [x0, #16] +mla v16.4S, v14.4S, v31.s[0] +sub v14.4s, v18.4s, v21.4s +mla v3.4S, v5.4S, v31.s[0] +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v29.s[2] +nop +mul v19.4S, v19.4S,v30.s[2] +nop +sqrdmulh v5.4S, v10.4S, v29.s[2] +sub v6.4s, v26.4s, v8.4s +mul v10.4S, v10.4S,v30.s[2] +add v26.4s, v26.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v29.s[1] +sub v13.4s, v0.4s, v27.4s +mul v12.4S, v12.4S,v30.s[1] +add v0.4s, v0.4s, v27.4s +sqrdmulh v27.4S, v23.4S, v29.s[1] +sub v25.4s, v4.4s, v16.4s +mul v23.4S, v23.4S,v30.s[1] +add v4.4s, v4.4s, v16.4s +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v3.4s +sqrdmulh v16.4S, v24.4S, v29.s[2] +add v20.4s, v20.4s, v3.4s +mla v10.4S, v5.4S, v31.s[0] +str q22, [x0, #64] +sqrdmulh v22.4S, v14.4S, v29.s[2] +nop +mla v12.4S, v8.4S, v31.s[0] +str q15, [x0, #0] +sqrdmulh v15.4S, v2.4S, v29.s[1] +nop +mla v23.4S, v27.4S, v31.s[0] +nop +sqrdmulh v27.4S, v18.4S, v29.s[1] +nop +mul v24.4S, v24.4S,v30.s[2] +sub v8.4s, v6.4s, v19.4s +mul v14.4S, v14.4S,v30.s[2] +add v6.4s, v6.4s, v19.4s +mla v24.4S, v16.4S, v31.s[0] +sub v16.4s, v13.4s, v10.4s +mla v14.4S, v22.4S, v31.s[0] +add v13.4s, v13.4s, v10.4s +mul v2.4S, v2.4S,v30.s[1] +sub v10.4s, v26.4s, v12.4s +mul v18.4S, v18.4S,v30.s[1] +add v26.4s, v26.4s, v12.4s +mla v2.4S, v15.4S, v31.s[0] +sub v15.4s, v0.4s, v23.4s +mla v18.4S, v27.4S, v31.s[0] +add v0.4s, v0.4s, v23.4s 
+sqrdmulh v29.4S, v8.4S, v11.s[3] +nop +mul v8.4S, v8.4S,v17.s[3] +nop +sqrdmulh v30.4S, v6.4S, v11.s[2] +sub v23.4s, v25.4s, v24.4s +mul v6.4S, v6.4S,v17.s[2] +add v25.4s, v25.4s, v24.4s +sqrdmulh v24.4S, v10.4S, v11.s[1] +sub v27.4s, v21.4s, v14.4s +mul v10.4S, v10.4S,v17.s[1] +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v26.4S, v11.s[0] +sub v12.4s, v4.4s, v2.4s +mul v26.4S, v26.4S,v17.s[0] +add v4.4s, v4.4s, v2.4s +mla v8.4S, v29.4S, v31.s[0] +sub v29.4s, v20.4s, v18.4s +sqrdmulh v2.4S, v16.4S, v11.s[3] +add v20.4s, v20.4s, v18.4s +mla v6.4S, v30.4S, v31.s[0] +nop +sqrdmulh v30.4S, v13.4S, v11.s[2] +nop +mla v10.4S, v24.4S, v31.s[0] +nop +sqrdmulh v24.4S, v15.4S, v11.s[1] +nop +mla v26.4S, v14.4S, v31.s[0] +nop +sqrdmulh v14.4S, v0.4S, v11.s[0] +nop +mul v16.4S, v16.4S,v17.s[3] +sub v18.4s, v23.4s, v8.4s +mul v13.4S, v13.4S,v17.s[2] +add v23.4s, v23.4s, v8.4s +mla v16.4S, v2.4S, v31.s[0] +sub v2.4s, v25.4s, v6.4s +mla v13.4S, v30.4S, v31.s[0] +add v25.4s, v25.4s, v6.4s +mul v15.4S, v15.4S,v17.s[1] +sub v6.4s, v12.4s, v10.4s +mul v0.4S, v0.4S,v17.s[0] +add v12.4s, v12.4s, v10.4s +mla v15.4S, v24.4S, v31.s[0] +sub v24.4s, v4.4s, v26.4s +mla v0.4S, v14.4S, v31.s[0] +add v4.4s, v4.4s, v26.4s +sqrdmulh v11.4S, v18.4S, v9.s[3] +nop +mul v18.4S, v18.4S,v28.s[3] +nop +sqrdmulh v17.4S, v23.4S, v9.s[2] +sub v26.4s, v27.4s, v16.4s +mul v23.4S, v23.4S,v28.s[2] +add v27.4s, v27.4s, v16.4s +sqrdmulh v16.4S, v2.4S, v9.s[1] +sub v14.4s, v21.4s, v13.4s +mul v2.4S, v2.4S,v28.s[1] +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v25.4S, v9.s[0] +sub v10.4s, v29.4s, v15.4s +mul v25.4S, v25.4S,v28.s[0] +add v29.4s, v29.4s, v15.4s +mla v18.4S, v11.4S, v31.s[0] +sub v11.4s, v20.4s, v0.4s +sqrdmulh v9.4S, v6.4S, v7.s[3] +add v20.4s, v20.4s, v0.4s +mla v23.4S, v17.4S, v31.s[0] +sub v17.4s, v26.4s, v18.4s +sqrdmulh v0.4S, v12.4S, v7.s[2] +add v26.4s, v26.4s, v18.4s +mla v2.4S, v16.4S, v31.s[0] +sub v16.4s, v27.4s, v23.4s +sqrdmulh v18.4S, v24.4S, v7.s[1] +add v27.4s, v27.4s, v23.4s +mla 
v25.4S, v13.4S, v31.s[0] +sub v13.4s, v14.4s, v2.4s +sqrdmulh v23.4S, v4.4S, v7.s[0] +add v14.4s, v14.4s, v2.4s +mul v6.4S, v6.4S,v1.s[3] +sub v2.4s, v21.4s, v25.4s +mul v12.4S, v12.4S,v1.s[2] +add v21.4s, v21.4s, v25.4s +mla v6.4S, v9.4S, v31.s[0] +str q17, [x0, #976] +mla v12.4S, v0.4S, v31.s[0] +str q26, [x0, #912] +mul v24.4S, v24.4S,v1.s[1] +str q16, [x0, #848] +mul v4.4S, v4.4S,v1.s[0] +str q27, [x0, #784] +mla v24.4S, v18.4S, v31.s[0] +str q13, [x0, #720] +mla v4.4S, v23.4S, v31.s[0] +str q14, [x0, #656] +str q2, [x0, #592] +sub v2.4s, v10.4s, v6.4s +str q21, [x0, #528] +add v10.4s, v10.4s, v6.4s +str q2, [x0, #464] +sub v2.4s, v29.4s, v12.4s +str q10, [x0, #400] +add v29.4s, v29.4s, v12.4s +str q2, [x0, #336] +sub v2.4s, v11.4s, v24.4s +str q29, [x0, #272] +add v11.4s, v11.4s, v24.4s +str q2, [x0, #208] +sub v2.4s, v20.4s, v4.4s +str q11, [x0, #144] +add v20.4s, v20.4s, v4.4s +str q2, [x0, #80] +str q20, [x0, #16] +ldr q3, [x17, #+128] +ldr q5, [x17, #+144] +ldr q19, [x17, #+160] +ldr q22, [x17, #+176] +ldr q8, [x17, #+192] +ldr q30, [x17, #+208] +ldr q15, [x17, #+224] +ldr q28, [x17, #+240] +ldr q25, [x0, #32] +ldr q9, [x0, #48] +ldr q17, [x0, #0] +ldr q0, [x0, #16] +sqrdmulh v26.4S, v25.4S, v5.s[0] +mul v25.4S, v25.4S,v3.s[0] +mla v25.4S, v26.4S, v31.s[0] +sub v26.4s, v17.4s, v25.4s +add v17.4s, v17.4s, v25.4s +sqrdmulh v25.4S, v9.4S, v5.s[0] +mul v9.4S, v9.4S,v3.s[0] +mla v9.4S, v25.4S, v31.s[0] +sub v25.4s, v0.4s, v9.4s +add v0.4s, v0.4s, v9.4s +sqrdmulh v9.4S, v0.4S, v5.s[1] +mul v0.4S, v0.4S,v3.s[1] +mla v0.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v0.4s +add v17.4s, v17.4s, v0.4s +sqrdmulh v0.4S, v25.4S, v5.s[2] +mul v25.4S, v25.4S,v3.s[2] +mla v25.4S, v0.4S, v31.s[0] +sub v0.4s, v26.4s, v25.4s +add v26.4s, v26.4s, v25.4s +trn1 v25.4S, v17.4S, v9.4S +trn2 v16.4S, v17.4S, v9.4S +trn1 v27.4S, v26.4S, v0.4S +trn2 v18.4S, v26.4S, v0.4S +trn2 v26.2D, v25.2D, v27.2D +trn2 v0.2D, v16.2D, v18.2D +trn1 v17.2D, v25.2D, v27.2D +trn1 v9.2D, v16.2D, v18.2D 
+sqrdmulh v18.4S, v26.4S, v22.4S +mul v26.4S, v26.4S,v19.4S +mla v26.4S, v18.4S, v31.s[0] +sub v18.4s, v17.4s, v26.4s +add v17.4s, v17.4s, v26.4s +sqrdmulh v26.4S, v0.4S, v22.4S +mul v0.4S, v0.4S,v19.4S +mla v0.4S, v26.4S, v31.s[0] +sub v26.4s, v9.4s, v0.4s +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v9.4S, v30.4S +mul v9.4S, v9.4S,v8.4S +mla v9.4S, v0.4S, v31.s[0] +sub v0.4s, v17.4s, v9.4s +add v17.4s, v17.4s, v9.4s +sqrdmulh v9.4S, v26.4S, v28.4S +mul v26.4S, v26.4S,v15.4S +mla v26.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v26.4s +add v18.4s, v18.4s, v26.4s +str q17, [x0, #0] +str q0, [x0, #16] +str q18, [x0, #32] +str q9, [x0, #48] +ldr q9, [x17, #+256] +ldr q18, [x17, #+272] +ldr q0, [x17, #+288] +ldr q17, [x17, #+304] +ldr q26, [x17, #+320] +ldr q16, [x17, #+336] +ldr q27, [x17, #+352] +ldr q25, [x17, #+368] +ldr q28, [x0, #96] +ldr q15, [x0, #112] +ldr q30, [x0, #64] +ldr q8, [x0, #80] +sqrdmulh v22.4S, v28.4S, v18.s[0] +mul v28.4S, v28.4S,v9.s[0] +mla v28.4S, v22.4S, v31.s[0] +sub v22.4s, v30.4s, v28.4s +add v30.4s, v30.4s, v28.4s +sqrdmulh v28.4S, v15.4S, v18.s[0] +mul v15.4S, v15.4S,v9.s[0] +mla v15.4S, v28.4S, v31.s[0] +sub v28.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +sqrdmulh v15.4S, v8.4S, v18.s[1] +mul v8.4S, v8.4S,v9.s[1] +mla v8.4S, v15.4S, v31.s[0] +sub v15.4s, v30.4s, v8.4s +add v30.4s, v30.4s, v8.4s +sqrdmulh v8.4S, v28.4S, v18.s[2] +mul v28.4S, v28.4S,v9.s[2] +mla v28.4S, v8.4S, v31.s[0] +sub v8.4s, v22.4s, v28.4s +add v22.4s, v22.4s, v28.4s +trn1 v28.4S, v30.4S, v15.4S +trn2 v19.4S, v30.4S, v15.4S +trn1 v5.4S, v22.4S, v8.4S +trn2 v3.4S, v22.4S, v8.4S +trn2 v22.2D, v28.2D, v5.2D +trn2 v8.2D, v19.2D, v3.2D +trn1 v30.2D, v28.2D, v5.2D +trn1 v15.2D, v19.2D, v3.2D +sqrdmulh v3.4S, v22.4S, v17.4S +mul v22.4S, v22.4S,v0.4S +mla v22.4S, v3.4S, v31.s[0] +sub v3.4s, v30.4s, v22.4s +add v30.4s, v30.4s, v22.4s +sqrdmulh v22.4S, v8.4S, v17.4S +mul v8.4S, v8.4S,v0.4S +mla v8.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v8.4s +add v15.4s, v15.4s, v8.4s +sqrdmulh 
v8.4S, v15.4S, v16.4S +mul v15.4S, v15.4S,v26.4S +mla v15.4S, v8.4S, v31.s[0] +sub v8.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +sqrdmulh v15.4S, v22.4S, v25.4S +mul v22.4S, v22.4S,v27.4S +mla v22.4S, v15.4S, v31.s[0] +sub v15.4s, v3.4s, v22.4s +add v3.4s, v3.4s, v22.4s +str q30, [x0, #64] +str q8, [x0, #80] +str q3, [x0, #96] +str q15, [x0, #112] +ldr q15, [x17, #+384] +ldr q3, [x17, #+400] +ldr q8, [x17, #+416] +ldr q30, [x17, #+432] +ldr q22, [x17, #+448] +ldr q19, [x17, #+464] +ldr q5, [x17, #+480] +ldr q28, [x17, #+496] +ldr q25, [x0, #160] +ldr q27, [x0, #176] +ldr q16, [x0, #128] +ldr q26, [x0, #144] +sqrdmulh v17.4S, v25.4S, v3.s[0] +mul v25.4S, v25.4S,v15.s[0] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v16.4s, v25.4s +add v16.4s, v16.4s, v25.4s +sqrdmulh v25.4S, v27.4S, v3.s[0] +mul v27.4S, v27.4S,v15.s[0] +mla v27.4S, v25.4S, v31.s[0] +sub v25.4s, v26.4s, v27.4s +add v26.4s, v26.4s, v27.4s +sqrdmulh v27.4S, v26.4S, v3.s[1] +mul v26.4S, v26.4S,v15.s[1] +mla v26.4S, v27.4S, v31.s[0] +sub v27.4s, v16.4s, v26.4s +add v16.4s, v16.4s, v26.4s +sqrdmulh v26.4S, v25.4S, v3.s[2] +mul v25.4S, v25.4S,v15.s[2] +mla v25.4S, v26.4S, v31.s[0] +sub v26.4s, v17.4s, v25.4s +add v17.4s, v17.4s, v25.4s +trn1 v25.4S, v16.4S, v27.4S +trn2 v0.4S, v16.4S, v27.4S +trn1 v18.4S, v17.4S, v26.4S +trn2 v9.4S, v17.4S, v26.4S +trn2 v17.2D, v25.2D, v18.2D +trn2 v26.2D, v0.2D, v9.2D +trn1 v16.2D, v25.2D, v18.2D +trn1 v27.2D, v0.2D, v9.2D +sqrdmulh v9.4S, v17.4S, v30.4S +mul v17.4S, v17.4S,v8.4S +mla v17.4S, v9.4S, v31.s[0] +sub v9.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v26.4S, v30.4S +mul v26.4S, v26.4S,v8.4S +mla v26.4S, v17.4S, v31.s[0] +sub v17.4s, v27.4s, v26.4s +add v27.4s, v27.4s, v26.4s +sqrdmulh v26.4S, v27.4S, v19.4S +mul v27.4S, v27.4S,v22.4S +mla v27.4S, v26.4S, v31.s[0] +sub v26.4s, v16.4s, v27.4s +add v16.4s, v16.4s, v27.4s +sqrdmulh v27.4S, v17.4S, v28.4S +mul v17.4S, v17.4S,v5.4S +mla v17.4S, v27.4S, v31.s[0] +sub v27.4s, v9.4s, v17.4s +add 
v9.4s, v9.4s, v17.4s +str q16, [x0, #128] +str q26, [x0, #144] +str q9, [x0, #160] +str q27, [x0, #176] +ldr q27, [x17, #+512] +ldr q9, [x17, #+528] +ldr q26, [x17, #+544] +ldr q16, [x17, #+560] +ldr q17, [x17, #+576] +ldr q0, [x17, #+592] +ldr q18, [x17, #+608] +ldr q25, [x17, #+624] +ldr q28, [x0, #224] +ldr q5, [x0, #240] +ldr q19, [x0, #192] +ldr q22, [x0, #208] +sqrdmulh v30.4S, v28.4S, v9.s[0] +mul v28.4S, v28.4S,v27.s[0] +mla v28.4S, v30.4S, v31.s[0] +sub v30.4s, v19.4s, v28.4s +add v19.4s, v19.4s, v28.4s +sqrdmulh v28.4S, v5.4S, v9.s[0] +mul v5.4S, v5.4S,v27.s[0] +mla v5.4S, v28.4S, v31.s[0] +sub v28.4s, v22.4s, v5.4s +add v22.4s, v22.4s, v5.4s +sqrdmulh v5.4S, v22.4S, v9.s[1] +mul v22.4S, v22.4S,v27.s[1] +mla v22.4S, v5.4S, v31.s[0] +sub v5.4s, v19.4s, v22.4s +add v19.4s, v19.4s, v22.4s +sqrdmulh v22.4S, v28.4S, v9.s[2] +mul v28.4S, v28.4S,v27.s[2] +mla v28.4S, v22.4S, v31.s[0] +sub v22.4s, v30.4s, v28.4s +add v30.4s, v30.4s, v28.4s +trn1 v28.4S, v19.4S, v5.4S +trn2 v8.4S, v19.4S, v5.4S +trn1 v3.4S, v30.4S, v22.4S +trn2 v15.4S, v30.4S, v22.4S +trn2 v30.2D, v28.2D, v3.2D +trn2 v22.2D, v8.2D, v15.2D +trn1 v19.2D, v28.2D, v3.2D +trn1 v5.2D, v8.2D, v15.2D +sqrdmulh v15.4S, v30.4S, v16.4S +mul v30.4S, v30.4S,v26.4S +mla v30.4S, v15.4S, v31.s[0] +sub v15.4s, v19.4s, v30.4s +add v19.4s, v19.4s, v30.4s +sqrdmulh v30.4S, v22.4S, v16.4S +mul v22.4S, v22.4S,v26.4S +mla v22.4S, v30.4S, v31.s[0] +sub v30.4s, v5.4s, v22.4s +add v5.4s, v5.4s, v22.4s +sqrdmulh v22.4S, v5.4S, v0.4S +mul v5.4S, v5.4S,v17.4S +mla v5.4S, v22.4S, v31.s[0] +sub v22.4s, v19.4s, v5.4s +add v19.4s, v19.4s, v5.4s +sqrdmulh v5.4S, v30.4S, v25.4S +mul v30.4S, v30.4S,v18.4S +mla v30.4S, v5.4S, v31.s[0] +sub v5.4s, v15.4s, v30.4s +add v15.4s, v15.4s, v30.4s +str q19, [x0, #192] +str q22, [x0, #208] +str q15, [x0, #224] +str q5, [x0, #240] +ldr q5, [x17, #+640] +ldr q15, [x17, #+656] +ldr q22, [x17, #+672] +ldr q19, [x17, #+688] +ldr q30, [x17, #+704] +ldr q8, [x17, #+720] +ldr q3, [x17, #+736] +ldr 
q28, [x17, #+752] +ldr q25, [x0, #288] +ldr q18, [x0, #304] +ldr q0, [x0, #256] +ldr q17, [x0, #272] +sqrdmulh v16.4S, v25.4S, v15.s[0] +mul v25.4S, v25.4S,v5.s[0] +mla v25.4S, v16.4S, v31.s[0] +sub v16.4s, v0.4s, v25.4s +add v0.4s, v0.4s, v25.4s +sqrdmulh v25.4S, v18.4S, v15.s[0] +mul v18.4S, v18.4S,v5.s[0] +mla v18.4S, v25.4S, v31.s[0] +sub v25.4s, v17.4s, v18.4s +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v17.4S, v15.s[1] +mul v17.4S, v17.4S,v5.s[1] +mla v17.4S, v18.4S, v31.s[0] +sub v18.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v25.4S, v15.s[2] +mul v25.4S, v25.4S,v5.s[2] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v16.4s, v25.4s +add v16.4s, v16.4s, v25.4s +trn1 v25.4S, v0.4S, v18.4S +trn2 v26.4S, v0.4S, v18.4S +trn1 v9.4S, v16.4S, v17.4S +trn2 v27.4S, v16.4S, v17.4S +trn2 v16.2D, v25.2D, v9.2D +trn2 v17.2D, v26.2D, v27.2D +trn1 v0.2D, v25.2D, v9.2D +trn1 v18.2D, v26.2D, v27.2D +sqrdmulh v27.4S, v16.4S, v19.4S +mul v16.4S, v16.4S,v22.4S +mla v16.4S, v27.4S, v31.s[0] +sub v27.4s, v0.4s, v16.4s +add v0.4s, v0.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v19.4S +mul v17.4S, v17.4S,v22.4S +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v18.4s, v17.4s +add v18.4s, v18.4s, v17.4s +sqrdmulh v17.4S, v18.4S, v8.4S +mul v18.4S, v18.4S,v30.4S +mla v18.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v18.4s +add v0.4s, v0.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v28.4S +mul v16.4S, v16.4S,v3.4S +mla v16.4S, v18.4S, v31.s[0] +sub v18.4s, v27.4s, v16.4s +add v27.4s, v27.4s, v16.4s +str q0, [x0, #256] +str q17, [x0, #272] +str q27, [x0, #288] +str q18, [x0, #304] +ldr q18, [x17, #+768] +ldr q27, [x17, #+784] +ldr q17, [x17, #+800] +ldr q0, [x17, #+816] +ldr q16, [x17, #+832] +ldr q26, [x17, #+848] +ldr q9, [x17, #+864] +ldr q25, [x17, #+880] +ldr q28, [x0, #352] +ldr q3, [x0, #368] +ldr q8, [x0, #320] +ldr q30, [x0, #336] +sqrdmulh v19.4S, v28.4S, v27.s[0] +mul v28.4S, v28.4S,v18.s[0] +mla v28.4S, v19.4S, v31.s[0] +sub v19.4s, v8.4s, v28.4s +add v8.4s, v8.4s, v28.4s +sqrdmulh 
v28.4S, v3.4S, v27.s[0] +mul v3.4S, v3.4S,v18.s[0] +mla v3.4S, v28.4S, v31.s[0] +sub v28.4s, v30.4s, v3.4s +add v30.4s, v30.4s, v3.4s +sqrdmulh v3.4S, v30.4S, v27.s[1] +mul v30.4S, v30.4S,v18.s[1] +mla v30.4S, v3.4S, v31.s[0] +sub v3.4s, v8.4s, v30.4s +add v8.4s, v8.4s, v30.4s +sqrdmulh v30.4S, v28.4S, v27.s[2] +mul v28.4S, v28.4S,v18.s[2] +mla v28.4S, v30.4S, v31.s[0] +sub v30.4s, v19.4s, v28.4s +add v19.4s, v19.4s, v28.4s +trn1 v28.4S, v8.4S, v3.4S +trn2 v22.4S, v8.4S, v3.4S +trn1 v15.4S, v19.4S, v30.4S +trn2 v5.4S, v19.4S, v30.4S +trn2 v19.2D, v28.2D, v15.2D +trn2 v30.2D, v22.2D, v5.2D +trn1 v8.2D, v28.2D, v15.2D +trn1 v3.2D, v22.2D, v5.2D +sqrdmulh v5.4S, v19.4S, v0.4S +mul v19.4S, v19.4S,v17.4S +mla v19.4S, v5.4S, v31.s[0] +sub v5.4s, v8.4s, v19.4s +add v8.4s, v8.4s, v19.4s +sqrdmulh v19.4S, v30.4S, v0.4S +mul v30.4S, v30.4S,v17.4S +mla v30.4S, v19.4S, v31.s[0] +sub v19.4s, v3.4s, v30.4s +add v3.4s, v3.4s, v30.4s +sqrdmulh v30.4S, v3.4S, v26.4S +mul v3.4S, v3.4S,v16.4S +mla v3.4S, v30.4S, v31.s[0] +sub v30.4s, v8.4s, v3.4s +add v8.4s, v8.4s, v3.4s +sqrdmulh v3.4S, v19.4S, v25.4S +mul v19.4S, v19.4S,v9.4S +mla v19.4S, v3.4S, v31.s[0] +sub v3.4s, v5.4s, v19.4s +add v5.4s, v5.4s, v19.4s +str q8, [x0, #320] +str q30, [x0, #336] +str q5, [x0, #352] +str q3, [x0, #368] +ldr q3, [x17, #+896] +ldr q5, [x17, #+912] +ldr q30, [x17, #+928] +ldr q8, [x17, #+944] +ldr q19, [x17, #+960] +ldr q22, [x17, #+976] +ldr q15, [x17, #+992] +ldr q28, [x17, #+1008] +ldr q25, [x0, #416] +ldr q9, [x0, #432] +ldr q26, [x0, #384] +ldr q16, [x0, #400] +sqrdmulh v0.4S, v25.4S, v5.s[0] +mul v25.4S, v25.4S,v3.s[0] +mla v25.4S, v0.4S, v31.s[0] +sub v0.4s, v26.4s, v25.4s +add v26.4s, v26.4s, v25.4s +sqrdmulh v25.4S, v9.4S, v5.s[0] +mul v9.4S, v9.4S,v3.s[0] +mla v9.4S, v25.4S, v31.s[0] +sub v25.4s, v16.4s, v9.4s +add v16.4s, v16.4s, v9.4s +sqrdmulh v9.4S, v16.4S, v5.s[1] +mul v16.4S, v16.4S,v3.s[1] +mla v16.4S, v9.4S, v31.s[0] +sub v9.4s, v26.4s, v16.4s +add v26.4s, v26.4s, v16.4s +sqrdmulh 
v16.4S, v25.4S, v5.s[2] +mul v25.4S, v25.4S,v3.s[2] +mla v25.4S, v16.4S, v31.s[0] +sub v16.4s, v0.4s, v25.4s +add v0.4s, v0.4s, v25.4s +trn1 v25.4S, v26.4S, v9.4S +trn2 v17.4S, v26.4S, v9.4S +trn1 v27.4S, v0.4S, v16.4S +trn2 v18.4S, v0.4S, v16.4S +trn2 v0.2D, v25.2D, v27.2D +trn2 v16.2D, v17.2D, v18.2D +trn1 v26.2D, v25.2D, v27.2D +trn1 v9.2D, v17.2D, v18.2D +sqrdmulh v18.4S, v0.4S, v8.4S +mul v0.4S, v0.4S,v30.4S +mla v0.4S, v18.4S, v31.s[0] +sub v18.4s, v26.4s, v0.4s +add v26.4s, v26.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v8.4S +mul v16.4S, v16.4S,v30.4S +mla v16.4S, v0.4S, v31.s[0] +sub v0.4s, v9.4s, v16.4s +add v9.4s, v9.4s, v16.4s +sqrdmulh v16.4S, v9.4S, v22.4S +mul v9.4S, v9.4S,v19.4S +mla v9.4S, v16.4S, v31.s[0] +sub v16.4s, v26.4s, v9.4s +add v26.4s, v26.4s, v9.4s +sqrdmulh v9.4S, v0.4S, v28.4S +mul v0.4S, v0.4S,v15.4S +mla v0.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v0.4s +add v18.4s, v18.4s, v0.4s +str q26, [x0, #384] +str q16, [x0, #400] +str q18, [x0, #416] +str q9, [x0, #432] +ldr q9, [x17, #+1024] +ldr q18, [x17, #+1040] +ldr q16, [x17, #+1056] +ldr q26, [x17, #+1072] +ldr q0, [x17, #+1088] +ldr q17, [x17, #+1104] +ldr q27, [x17, #+1120] +ldr q25, [x17, #+1136] +ldr q28, [x0, #480] +ldr q15, [x0, #496] +ldr q22, [x0, #448] +ldr q19, [x0, #464] +sqrdmulh v8.4S, v28.4S, v18.s[0] +mul v28.4S, v28.4S,v9.s[0] +mla v28.4S, v8.4S, v31.s[0] +sub v8.4s, v22.4s, v28.4s +add v22.4s, v22.4s, v28.4s +sqrdmulh v28.4S, v15.4S, v18.s[0] +mul v15.4S, v15.4S,v9.s[0] +mla v15.4S, v28.4S, v31.s[0] +sub v28.4s, v19.4s, v15.4s +add v19.4s, v19.4s, v15.4s +sqrdmulh v15.4S, v19.4S, v18.s[1] +mul v19.4S, v19.4S,v9.s[1] +mla v19.4S, v15.4S, v31.s[0] +sub v15.4s, v22.4s, v19.4s +add v22.4s, v22.4s, v19.4s +sqrdmulh v19.4S, v28.4S, v18.s[2] +mul v28.4S, v28.4S,v9.s[2] +mla v28.4S, v19.4S, v31.s[0] +sub v19.4s, v8.4s, v28.4s +add v8.4s, v8.4s, v28.4s +trn1 v28.4S, v22.4S, v15.4S +trn2 v30.4S, v22.4S, v15.4S +trn1 v5.4S, v8.4S, v19.4S +trn2 v3.4S, v8.4S, v19.4S +trn2 v8.2D, v28.2D, 
v5.2D +trn2 v19.2D, v30.2D, v3.2D +trn1 v22.2D, v28.2D, v5.2D +trn1 v15.2D, v30.2D, v3.2D +sqrdmulh v3.4S, v8.4S, v26.4S +mul v8.4S, v8.4S,v16.4S +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v22.4s, v8.4s +add v22.4s, v22.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v26.4S +mul v19.4S, v19.4S,v16.4S +mla v19.4S, v8.4S, v31.s[0] +sub v8.4s, v15.4s, v19.4s +add v15.4s, v15.4s, v19.4s +sqrdmulh v19.4S, v15.4S, v17.4S +mul v15.4S, v15.4S,v0.4S +mla v15.4S, v19.4S, v31.s[0] +sub v19.4s, v22.4s, v15.4s +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v8.4S, v25.4S +mul v8.4S, v8.4S,v27.4S +mla v8.4S, v15.4S, v31.s[0] +sub v15.4s, v3.4s, v8.4s +add v3.4s, v3.4s, v8.4s +str q22, [x0, #448] +str q19, [x0, #464] +str q3, [x0, #480] +str q15, [x0, #496] +ldr q15, [x17, #+1152] +ldr q3, [x17, #+1168] +ldr q19, [x17, #+1184] +ldr q22, [x17, #+1200] +ldr q8, [x17, #+1216] +ldr q30, [x17, #+1232] +ldr q5, [x17, #+1248] +ldr q28, [x17, #+1264] +ldr q25, [x0, #544] +ldr q27, [x0, #560] +ldr q17, [x0, #512] +ldr q0, [x0, #528] +sqrdmulh v26.4S, v25.4S, v3.s[0] +mul v25.4S, v25.4S,v15.s[0] +mla v25.4S, v26.4S, v31.s[0] +sub v26.4s, v17.4s, v25.4s +add v17.4s, v17.4s, v25.4s +sqrdmulh v25.4S, v27.4S, v3.s[0] +mul v27.4S, v27.4S,v15.s[0] +mla v27.4S, v25.4S, v31.s[0] +sub v25.4s, v0.4s, v27.4s +add v0.4s, v0.4s, v27.4s +sqrdmulh v27.4S, v0.4S, v3.s[1] +mul v0.4S, v0.4S,v15.s[1] +mla v0.4S, v27.4S, v31.s[0] +sub v27.4s, v17.4s, v0.4s +add v17.4s, v17.4s, v0.4s +sqrdmulh v0.4S, v25.4S, v3.s[2] +mul v25.4S, v25.4S,v15.s[2] +mla v25.4S, v0.4S, v31.s[0] +sub v0.4s, v26.4s, v25.4s +add v26.4s, v26.4s, v25.4s +trn1 v25.4S, v17.4S, v27.4S +trn2 v16.4S, v17.4S, v27.4S +trn1 v18.4S, v26.4S, v0.4S +trn2 v9.4S, v26.4S, v0.4S +trn2 v26.2D, v25.2D, v18.2D +trn2 v0.2D, v16.2D, v9.2D +trn1 v17.2D, v25.2D, v18.2D +trn1 v27.2D, v16.2D, v9.2D +sqrdmulh v9.4S, v26.4S, v22.4S +mul v26.4S, v26.4S,v19.4S +mla v26.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v26.4s +add v17.4s, v17.4s, v26.4s +sqrdmulh v26.4S, v0.4S, v22.4S +mul 
v0.4S, v0.4S,v19.4S +mla v0.4S, v26.4S, v31.s[0] +sub v26.4s, v27.4s, v0.4s +add v27.4s, v27.4s, v0.4s +sqrdmulh v0.4S, v27.4S, v30.4S +mul v27.4S, v27.4S,v8.4S +mla v27.4S, v0.4S, v31.s[0] +sub v0.4s, v17.4s, v27.4s +add v17.4s, v17.4s, v27.4s +sqrdmulh v27.4S, v26.4S, v28.4S +mul v26.4S, v26.4S,v5.4S +mla v26.4S, v27.4S, v31.s[0] +sub v27.4s, v9.4s, v26.4s +add v9.4s, v9.4s, v26.4s +str q17, [x0, #512] +str q0, [x0, #528] +str q9, [x0, #544] +str q27, [x0, #560] +ldr q27, [x17, #+1280] +ldr q9, [x17, #+1296] +ldr q0, [x17, #+1312] +ldr q17, [x17, #+1328] +ldr q26, [x17, #+1344] +ldr q16, [x17, #+1360] +ldr q18, [x17, #+1376] +ldr q25, [x17, #+1392] +ldr q28, [x0, #608] +ldr q5, [x0, #624] +ldr q30, [x0, #576] +ldr q8, [x0, #592] +sqrdmulh v22.4S, v28.4S, v9.s[0] +mul v28.4S, v28.4S,v27.s[0] +mla v28.4S, v22.4S, v31.s[0] +sub v22.4s, v30.4s, v28.4s +add v30.4s, v30.4s, v28.4s +sqrdmulh v28.4S, v5.4S, v9.s[0] +mul v5.4S, v5.4S,v27.s[0] +mla v5.4S, v28.4S, v31.s[0] +sub v28.4s, v8.4s, v5.4s +add v8.4s, v8.4s, v5.4s +sqrdmulh v5.4S, v8.4S, v9.s[1] +mul v8.4S, v8.4S,v27.s[1] +mla v8.4S, v5.4S, v31.s[0] +sub v5.4s, v30.4s, v8.4s +add v30.4s, v30.4s, v8.4s +sqrdmulh v8.4S, v28.4S, v9.s[2] +mul v28.4S, v28.4S,v27.s[2] +mla v28.4S, v8.4S, v31.s[0] +sub v8.4s, v22.4s, v28.4s +add v22.4s, v22.4s, v28.4s +trn1 v28.4S, v30.4S, v5.4S +trn2 v19.4S, v30.4S, v5.4S +trn1 v3.4S, v22.4S, v8.4S +trn2 v15.4S, v22.4S, v8.4S +trn2 v22.2D, v28.2D, v3.2D +trn2 v8.2D, v19.2D, v15.2D +trn1 v30.2D, v28.2D, v3.2D +trn1 v5.2D, v19.2D, v15.2D +sqrdmulh v15.4S, v22.4S, v17.4S +mul v22.4S, v22.4S,v0.4S +mla v22.4S, v15.4S, v31.s[0] +sub v15.4s, v30.4s, v22.4s +add v30.4s, v30.4s, v22.4s +sqrdmulh v22.4S, v8.4S, v17.4S +mul v8.4S, v8.4S,v0.4S +mla v8.4S, v22.4S, v31.s[0] +sub v22.4s, v5.4s, v8.4s +add v5.4s, v5.4s, v8.4s +sqrdmulh v8.4S, v5.4S, v16.4S +mul v5.4S, v5.4S,v26.4S +mla v5.4S, v8.4S, v31.s[0] +sub v8.4s, v30.4s, v5.4s +add v30.4s, v30.4s, v5.4s +sqrdmulh v5.4S, v22.4S, v25.4S +mul 
v22.4S, v22.4S,v18.4S +mla v22.4S, v5.4S, v31.s[0] +sub v5.4s, v15.4s, v22.4s +add v15.4s, v15.4s, v22.4s +str q30, [x0, #576] +str q8, [x0, #592] +str q15, [x0, #608] +str q5, [x0, #624] +ldr q5, [x17, #+1408] +ldr q15, [x17, #+1424] +ldr q8, [x17, #+1440] +ldr q30, [x17, #+1456] +ldr q22, [x17, #+1472] +ldr q19, [x17, #+1488] +ldr q3, [x17, #+1504] +ldr q28, [x17, #+1520] +ldr q25, [x0, #672] +ldr q18, [x0, #688] +ldr q16, [x0, #640] +ldr q26, [x0, #656] +sqrdmulh v17.4S, v25.4S, v15.s[0] +mul v25.4S, v25.4S,v5.s[0] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v16.4s, v25.4s +add v16.4s, v16.4s, v25.4s +sqrdmulh v25.4S, v18.4S, v15.s[0] +mul v18.4S, v18.4S,v5.s[0] +mla v18.4S, v25.4S, v31.s[0] +sub v25.4s, v26.4s, v18.4s +add v26.4s, v26.4s, v18.4s +sqrdmulh v18.4S, v26.4S, v15.s[1] +mul v26.4S, v26.4S,v5.s[1] +mla v26.4S, v18.4S, v31.s[0] +sub v18.4s, v16.4s, v26.4s +add v16.4s, v16.4s, v26.4s +sqrdmulh v26.4S, v25.4S, v15.s[2] +mul v25.4S, v25.4S,v5.s[2] +mla v25.4S, v26.4S, v31.s[0] +sub v26.4s, v17.4s, v25.4s +add v17.4s, v17.4s, v25.4s +trn1 v25.4S, v16.4S, v18.4S +trn2 v0.4S, v16.4S, v18.4S +trn1 v9.4S, v17.4S, v26.4S +trn2 v27.4S, v17.4S, v26.4S +trn2 v17.2D, v25.2D, v9.2D +trn2 v26.2D, v0.2D, v27.2D +trn1 v16.2D, v25.2D, v9.2D +trn1 v18.2D, v0.2D, v27.2D +sqrdmulh v27.4S, v17.4S, v30.4S +mul v17.4S, v17.4S,v8.4S +mla v17.4S, v27.4S, v31.s[0] +sub v27.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v26.4S, v30.4S +mul v26.4S, v26.4S,v8.4S +mla v26.4S, v17.4S, v31.s[0] +sub v17.4s, v18.4s, v26.4s +add v18.4s, v18.4s, v26.4s +sqrdmulh v26.4S, v18.4S, v19.4S +mul v18.4S, v18.4S,v22.4S +mla v18.4S, v26.4S, v31.s[0] +sub v26.4s, v16.4s, v18.4s +add v16.4s, v16.4s, v18.4s +sqrdmulh v18.4S, v17.4S, v28.4S +mul v17.4S, v17.4S,v3.4S +mla v17.4S, v18.4S, v31.s[0] +sub v18.4s, v27.4s, v17.4s +add v27.4s, v27.4s, v17.4s +str q16, [x0, #640] +str q26, [x0, #656] +str q27, [x0, #672] +str q18, [x0, #688] +ldr q18, [x17, #+1536] +ldr q27, [x17, #+1552] 
+ldr q26, [x17, #+1568] +ldr q16, [x17, #+1584] +ldr q17, [x17, #+1600] +ldr q0, [x17, #+1616] +ldr q9, [x17, #+1632] +ldr q25, [x17, #+1648] +ldr q28, [x0, #736] +ldr q3, [x0, #752] +ldr q19, [x0, #704] +ldr q22, [x0, #720] +sqrdmulh v30.4S, v28.4S, v27.s[0] +mul v28.4S, v28.4S,v18.s[0] +mla v28.4S, v30.4S, v31.s[0] +sub v30.4s, v19.4s, v28.4s +add v19.4s, v19.4s, v28.4s +sqrdmulh v28.4S, v3.4S, v27.s[0] +mul v3.4S, v3.4S,v18.s[0] +mla v3.4S, v28.4S, v31.s[0] +sub v28.4s, v22.4s, v3.4s +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v22.4S, v27.s[1] +mul v22.4S, v22.4S,v18.s[1] +mla v22.4S, v3.4S, v31.s[0] +sub v3.4s, v19.4s, v22.4s +add v19.4s, v19.4s, v22.4s +sqrdmulh v22.4S, v28.4S, v27.s[2] +mul v28.4S, v28.4S,v18.s[2] +mla v28.4S, v22.4S, v31.s[0] +sub v22.4s, v30.4s, v28.4s +add v30.4s, v30.4s, v28.4s +trn1 v28.4S, v19.4S, v3.4S +trn2 v8.4S, v19.4S, v3.4S +trn1 v15.4S, v30.4S, v22.4S +trn2 v5.4S, v30.4S, v22.4S +trn2 v30.2D, v28.2D, v15.2D +trn2 v22.2D, v8.2D, v5.2D +trn1 v19.2D, v28.2D, v15.2D +trn1 v3.2D, v8.2D, v5.2D +sqrdmulh v5.4S, v30.4S, v16.4S +mul v30.4S, v30.4S,v26.4S +mla v30.4S, v5.4S, v31.s[0] +sub v5.4s, v19.4s, v30.4s +add v19.4s, v19.4s, v30.4s +sqrdmulh v30.4S, v22.4S, v16.4S +mul v22.4S, v22.4S,v26.4S +mla v22.4S, v30.4S, v31.s[0] +sub v30.4s, v3.4s, v22.4s +add v3.4s, v3.4s, v22.4s +sqrdmulh v22.4S, v3.4S, v0.4S +mul v3.4S, v3.4S,v17.4S +mla v3.4S, v22.4S, v31.s[0] +sub v22.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sqrdmulh v3.4S, v30.4S, v25.4S +mul v30.4S, v30.4S,v9.4S +mla v30.4S, v3.4S, v31.s[0] +sub v3.4s, v5.4s, v30.4s +add v5.4s, v5.4s, v30.4s +str q19, [x0, #704] +str q22, [x0, #720] +str q5, [x0, #736] +str q3, [x0, #752] +ldr q3, [x17, #+1664] +ldr q5, [x17, #+1680] +ldr q22, [x17, #+1696] +ldr q19, [x17, #+1712] +ldr q30, [x17, #+1728] +ldr q8, [x17, #+1744] +ldr q15, [x17, #+1760] +ldr q28, [x17, #+1776] +ldr q25, [x0, #800] +ldr q9, [x0, #816] +ldr q0, [x0, #768] +ldr q17, [x0, #784] +sqrdmulh v16.4S, v25.4S, v5.s[0] +mul 
v25.4S, v25.4S,v3.s[0] +mla v25.4S, v16.4S, v31.s[0] +sub v16.4s, v0.4s, v25.4s +add v0.4s, v0.4s, v25.4s +sqrdmulh v25.4S, v9.4S, v5.s[0] +mul v9.4S, v9.4S,v3.s[0] +mla v9.4S, v25.4S, v31.s[0] +sub v25.4s, v17.4s, v9.4s +add v17.4s, v17.4s, v9.4s +sqrdmulh v9.4S, v17.4S, v5.s[1] +mul v17.4S, v17.4S,v3.s[1] +mla v17.4S, v9.4S, v31.s[0] +sub v9.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v25.4S, v5.s[2] +mul v25.4S, v25.4S,v3.s[2] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v16.4s, v25.4s +add v16.4s, v16.4s, v25.4s +trn1 v25.4S, v0.4S, v9.4S +trn2 v26.4S, v0.4S, v9.4S +trn1 v27.4S, v16.4S, v17.4S +trn2 v18.4S, v16.4S, v17.4S +trn2 v16.2D, v25.2D, v27.2D +trn2 v17.2D, v26.2D, v18.2D +trn1 v0.2D, v25.2D, v27.2D +trn1 v9.2D, v26.2D, v18.2D +sqrdmulh v18.4S, v16.4S, v19.4S +mul v16.4S, v16.4S,v22.4S +mla v16.4S, v18.4S, v31.s[0] +sub v18.4s, v0.4s, v16.4s +add v0.4s, v0.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v19.4S +mul v17.4S, v17.4S,v22.4S +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v9.4s, v17.4s +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v9.4S, v8.4S +mul v9.4S, v9.4S,v30.4S +mla v9.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v9.4s +add v0.4s, v0.4s, v9.4s +sqrdmulh v9.4S, v16.4S, v28.4S +mul v16.4S, v16.4S,v15.4S +mla v16.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v16.4s +add v18.4s, v18.4s, v16.4s +str q0, [x0, #768] +str q17, [x0, #784] +str q18, [x0, #800] +str q9, [x0, #816] +ldr q9, [x17, #+1792] +ldr q18, [x17, #+1808] +ldr q17, [x17, #+1824] +ldr q0, [x17, #+1840] +ldr q16, [x17, #+1856] +ldr q26, [x17, #+1872] +ldr q27, [x17, #+1888] +ldr q25, [x17, #+1904] +ldr q28, [x0, #864] +ldr q15, [x0, #880] +ldr q8, [x0, #832] +ldr q30, [x0, #848] +sqrdmulh v19.4S, v28.4S, v18.s[0] +mul v28.4S, v28.4S,v9.s[0] +mla v28.4S, v19.4S, v31.s[0] +sub v19.4s, v8.4s, v28.4s +add v8.4s, v8.4s, v28.4s +sqrdmulh v28.4S, v15.4S, v18.s[0] +mul v15.4S, v15.4S,v9.s[0] +mla v15.4S, v28.4S, v31.s[0] +sub v28.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +sqrdmulh v15.4S, 
v30.4S, v18.s[1] +mul v30.4S, v30.4S,v9.s[1] +mla v30.4S, v15.4S, v31.s[0] +sub v15.4s, v8.4s, v30.4s +add v8.4s, v8.4s, v30.4s +sqrdmulh v30.4S, v28.4S, v18.s[2] +mul v28.4S, v28.4S,v9.s[2] +mla v28.4S, v30.4S, v31.s[0] +sub v30.4s, v19.4s, v28.4s +add v19.4s, v19.4s, v28.4s +trn1 v28.4S, v8.4S, v15.4S +trn2 v22.4S, v8.4S, v15.4S +trn1 v5.4S, v19.4S, v30.4S +trn2 v3.4S, v19.4S, v30.4S +trn2 v19.2D, v28.2D, v5.2D +trn2 v30.2D, v22.2D, v3.2D +trn1 v8.2D, v28.2D, v5.2D +trn1 v15.2D, v22.2D, v3.2D +sqrdmulh v3.4S, v19.4S, v0.4S +mul v19.4S, v19.4S,v17.4S +mla v19.4S, v3.4S, v31.s[0] +sub v3.4s, v8.4s, v19.4s +add v8.4s, v8.4s, v19.4s +sqrdmulh v19.4S, v30.4S, v0.4S +mul v30.4S, v30.4S,v17.4S +mla v30.4S, v19.4S, v31.s[0] +sub v19.4s, v15.4s, v30.4s +add v15.4s, v15.4s, v30.4s +sqrdmulh v30.4S, v15.4S, v26.4S +mul v15.4S, v15.4S,v16.4S +mla v15.4S, v30.4S, v31.s[0] +sub v30.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +sqrdmulh v15.4S, v19.4S, v25.4S +mul v19.4S, v19.4S,v27.4S +mla v19.4S, v15.4S, v31.s[0] +sub v15.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +str q8, [x0, #832] +str q30, [x0, #848] +str q3, [x0, #864] +str q15, [x0, #880] +ldr q15, [x17, #+1920] +ldr q3, [x17, #+1936] +ldr q30, [x17, #+1952] +ldr q8, [x17, #+1968] +ldr q19, [x17, #+1984] +ldr q22, [x17, #+2000] +ldr q5, [x17, #+2016] +ldr q28, [x17, #+2032] +ldr q25, [x0, #928] +ldr q27, [x0, #944] +ldr q26, [x0, #896] +ldr q16, [x0, #912] +sqrdmulh v0.4S, v25.4S, v3.s[0] +mul v25.4S, v25.4S,v15.s[0] +mla v25.4S, v0.4S, v31.s[0] +sub v0.4s, v26.4s, v25.4s +add v26.4s, v26.4s, v25.4s +sqrdmulh v25.4S, v27.4S, v3.s[0] +mul v27.4S, v27.4S,v15.s[0] +mla v27.4S, v25.4S, v31.s[0] +sub v25.4s, v16.4s, v27.4s +add v16.4s, v16.4s, v27.4s +sqrdmulh v27.4S, v16.4S, v3.s[1] +mul v16.4S, v16.4S,v15.s[1] +mla v16.4S, v27.4S, v31.s[0] +sub v27.4s, v26.4s, v16.4s +add v26.4s, v26.4s, v16.4s +sqrdmulh v16.4S, v25.4S, v3.s[2] +mul v25.4S, v25.4S,v15.s[2] +mla v25.4S, v16.4S, v31.s[0] +sub v16.4s, v0.4s, v25.4s +add 
v0.4s, v0.4s, v25.4s +trn1 v25.4S, v26.4S, v27.4S +trn2 v17.4S, v26.4S, v27.4S +trn1 v18.4S, v0.4S, v16.4S +trn2 v9.4S, v0.4S, v16.4S +trn2 v0.2D, v25.2D, v18.2D +trn2 v16.2D, v17.2D, v9.2D +trn1 v26.2D, v25.2D, v18.2D +trn1 v27.2D, v17.2D, v9.2D +sqrdmulh v9.4S, v0.4S, v8.4S +mul v0.4S, v0.4S,v30.4S +mla v0.4S, v9.4S, v31.s[0] +sub v9.4s, v26.4s, v0.4s +add v26.4s, v26.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v8.4S +mul v16.4S, v16.4S,v30.4S +mla v16.4S, v0.4S, v31.s[0] +sub v0.4s, v27.4s, v16.4s +add v27.4s, v27.4s, v16.4s +sqrdmulh v16.4S, v27.4S, v22.4S +mul v27.4S, v27.4S,v19.4S +mla v27.4S, v16.4S, v31.s[0] +sub v16.4s, v26.4s, v27.4s +add v26.4s, v26.4s, v27.4s +sqrdmulh v27.4S, v0.4S, v28.4S +mul v0.4S, v0.4S,v5.4S +mla v0.4S, v27.4S, v31.s[0] +sub v27.4s, v9.4s, v0.4s +add v9.4s, v9.4s, v0.4s +str q26, [x0, #896] +str q16, [x0, #912] +str q9, [x0, #928] +str q27, [x0, #944] +ldr q27, [x17, #+2048] +ldr q9, [x17, #+2064] +ldr q16, [x17, #+2080] +ldr q26, [x17, #+2096] +ldr q0, [x17, #+2112] +ldr q17, [x17, #+2128] +ldr q18, [x17, #+2144] +ldr q25, [x17, #+2160] +ldr q28, [x0, #992] +ldr q5, [x0, #1008] +ldr q22, [x0, #960] +ldr q19, [x0, #976] +sqrdmulh v8.4S, v28.4S, v9.s[0] +mul v28.4S, v28.4S,v27.s[0] +mla v28.4S, v8.4S, v31.s[0] +sub v8.4s, v22.4s, v28.4s +add v22.4s, v22.4s, v28.4s +sqrdmulh v28.4S, v5.4S, v9.s[0] +mul v5.4S, v5.4S,v27.s[0] +mla v5.4S, v28.4S, v31.s[0] +sub v28.4s, v19.4s, v5.4s +add v19.4s, v19.4s, v5.4s +sqrdmulh v5.4S, v19.4S, v9.s[1] +mul v19.4S, v19.4S,v27.s[1] +mla v19.4S, v5.4S, v31.s[0] +sub v5.4s, v22.4s, v19.4s +add v22.4s, v22.4s, v19.4s +sqrdmulh v19.4S, v28.4S, v9.s[2] +mul v28.4S, v28.4S,v27.s[2] +mla v28.4S, v19.4S, v31.s[0] +sub v19.4s, v8.4s, v28.4s +add v8.4s, v8.4s, v28.4s +trn1 v28.4S, v22.4S, v5.4S +trn2 v30.4S, v22.4S, v5.4S +trn1 v3.4S, v8.4S, v19.4S +trn2 v15.4S, v8.4S, v19.4S +trn2 v8.2D, v28.2D, v3.2D +trn2 v19.2D, v30.2D, v15.2D +trn1 v22.2D, v28.2D, v3.2D +trn1 v5.2D, v30.2D, v15.2D +sqrdmulh v15.4S, v8.4S, v26.4S 
+mul v8.4S, v8.4S,v16.4S +mla v8.4S, v15.4S, v31.s[0] +sub v15.4s, v22.4s, v8.4s +add v22.4s, v22.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v26.4S +mul v19.4S, v19.4S,v16.4S +mla v19.4S, v8.4S, v31.s[0] +sub v8.4s, v5.4s, v19.4s +add v5.4s, v5.4s, v19.4s +sqrdmulh v19.4S, v5.4S, v17.4S +mul v5.4S, v5.4S,v0.4S +mla v5.4S, v19.4S, v31.s[0] +sub v19.4s, v22.4s, v5.4s +add v22.4s, v22.4s, v5.4s +sqrdmulh v5.4S, v8.4S, v25.4S +mul v8.4S, v8.4S,v18.4S +mla v8.4S, v5.4S, v31.s[0] +sub v5.4s, v15.4s, v8.4s +add v15.4s, v15.4s, v8.4s +str q22, [x0, #960] +str q19, [x0, #976] +str q15, [x0, #992] +str q5, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2456 +// Instruction count: 2452 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_18_0.s b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_18_0.s new file mode 100644 index 0000000..5783747 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_18_0.s @@ -0,0 +1,2486 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or 
substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 
+.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 
7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 
// Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 
+.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 
// Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 
6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 
305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 +.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, 
block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 
55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // 
Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_18_0 +.global _ntt_u32_full_neon_asm_var_4_4_18_0 +ntt_u32_full_neon_asm_var_4_4_18_0: +_ntt_u32_full_neon_asm_var_4_4_18_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x0, #992] +sqrdmulh v27.4S, v28.4S, v29.s[0] +mul v28.4S, v28.4S,v30.s[0] +ldr q26, [x0, #928] +sqrdmulh v25.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +ldr q24, [x0, #864] +sqrdmulh v23.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +ldr q22, [x0, #800] +sqrdmulh 
v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #736] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mla v28.4S, v27.4S, v31.s[0] +ldr q27, [x0, #672] +sqrdmulh v18.4S, v27.4S, v29.s[0] +mla v26.4S, v25.4S, v31.s[0] +ldr q25, [x0, #608] +sqrdmulh v17.4S, v25.4S, v29.s[0] +mla v24.4S, v23.4S, v31.s[0] +ldr q23, [x0, #544] +sqrdmulh v16.4S, v23.4S, v29.s[0] +mla v22.4S, v21.4S, v31.s[0] +ldr q21, [x0, #480] +ldr q3, [x0, #416] +mul v27.4S, v27.4S,v30.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v2.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +ldr q28, [x0, #352] +ldr q1, [x0, #288] +mla v27.4S, v18.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +sub v19.4s, v3.4s, v26.4s +add v3.4s, v3.4s, v26.4s +ldr q26, [x0, #224] +ldr q18, [x0, #160] +mul v23.4S, v23.4S,v30.s[0] +mul v25.4S, v25.4S,v30.s[0] +sub v0.4s, v28.4s, v24.4s +add v28.4s, v28.4s, v24.4s +ldr q24, [x0, #96] +ldr q15, [x0, #32] +mla v23.4S, v16.4S, v31.s[0] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v16.4s, v26.4s, v20.4s +nop +sqrdmulh v14.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +add v26.4s, v26.4s, v20.4s +nop +sqrdmulh v20.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v13.4s, v18.4s, v27.4s +add v18.4s, v18.4s, v27.4s +sqrdmulh v27.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v12.4s, v24.4s, v25.4s +add v24.4s, v24.4s, v25.4s +sqrdmulh v25.4S, v0.4S, v29.s[2] +mla v2.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v23.4s +sqrdmulh v11.4S, v17.4S, v29.s[2] +mla v19.4S, v14.4S, v31.s[0] +add v15.4s, v15.4s, v23.4s +nop +sqrdmulh v23.4S, v28.4S, v29.s[1] +mla v21.4S, v20.4S, v31.s[0] +nop +sqrdmulh v20.4S, v1.4S, v29.s[1] +mla v3.4S, v27.4S, v31.s[0] +nop +nop +ldr q27, [x17, #+32] +ldr q14, [x17, #+48] +mul v17.4S, v17.4S,v30.s[2] +mul v0.4S, v0.4S,v30.s[2] +sub v10.4s, v16.4s, v2.4s +add v16.4s, v16.4s, v2.4s +mla v17.4S, v11.4S, v31.s[0] +mla v0.4S, v25.4S, v31.s[0] +sub v25.4s, 
v13.4s, v19.4s +add v13.4s, v13.4s, v19.4s +mul v1.4S, v1.4S,v30.s[1] +mul v28.4S, v28.4S,v30.s[1] +sub v19.4s, v26.4s, v21.4s +add v26.4s, v26.4s, v21.4s +mla v1.4S, v20.4S, v31.s[0] +mla v28.4S, v23.4S, v31.s[0] +sub v23.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v14.s[3] +mul v10.4S, v10.4S,v27.s[3] +nop +nop +sqrdmulh v20.4S, v16.4S, v14.s[2] +mul v16.4S, v16.4S,v27.s[2] +sub v21.4s, v12.4s, v0.4s +add v12.4s, v12.4s, v0.4s +sqrdmulh v0.4S, v19.4S, v14.s[1] +mul v19.4S, v19.4S,v27.s[1] +sub v11.4s, v22.4s, v17.4s +add v22.4s, v22.4s, v17.4s +sqrdmulh v17.4S, v26.4S, v14.s[0] +mul v26.4S, v26.4S,v27.s[0] +sub v2.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +ldr q28, [x17, #+96] +ldr q9, [x17, #+112] +sqrdmulh v8.4S, v25.4S, v14.s[3] +mla v10.4S, v3.4S, v31.s[0] +sub v3.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v14.s[2] +mla v16.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v23.4S, v14.s[1] +mla v19.4S, v0.4S, v31.s[0] +nop +nop +sqrdmulh v0.4S, v18.4S, v14.s[0] +mla v26.4S, v17.4S, v31.s[0] +nop +nop +ldr q17, [x17, #+64] +ldr q7, [x17, #+80] +mul v13.4S, v13.4S,v27.s[2] +mul v25.4S, v25.4S,v27.s[3] +sub v6.4s, v21.4s, v10.4s +add v21.4s, v21.4s, v10.4s +mla v13.4S, v1.4S, v31.s[0] +mla v25.4S, v8.4S, v31.s[0] +sub v8.4s, v12.4s, v16.4s +add v12.4s, v12.4s, v16.4s +mul v18.4S, v18.4S,v27.s[0] +mul v23.4S, v23.4S,v27.s[1] +sub v16.4s, v2.4s, v19.4s +add v2.4s, v2.4s, v19.4s +mla v18.4S, v0.4S, v31.s[0] +mla v23.4S, v20.4S, v31.s[0] +sub v20.4s, v24.4s, v26.4s +add v24.4s, v24.4s, v26.4s +sqrdmulh v26.4S, v6.4S, v9.s[3] +mul v6.4S, v6.4S,v28.s[3] +nop +nop +sqrdmulh v0.4S, v21.4S, v9.s[2] +mul v21.4S, v21.4S,v28.s[2] +sub v19.4s, v11.4s, v25.4s +add v11.4s, v11.4s, v25.4s +sqrdmulh v25.4S, v8.4S, v9.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v1.4s, v22.4s, v13.4s +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v12.4S, v9.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v10.4s, v3.4s, v23.4s +add v3.4s, v3.4s, v23.4s 
+sqrdmulh v23.4S, v16.4S, v7.s[3] +mla v6.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v7.s[2] +mla v21.4S, v0.4S, v31.s[0] +sub v0.4s, v19.4s, v6.4s +str q0, [x0, #992] +sqrdmulh v0.4S, v20.4S, v7.s[1] +mla v8.4S, v25.4S, v31.s[0] +add v19.4s, v19.4s, v6.4s +str q19, [x0, #928] +sqrdmulh v19.4S, v24.4S, v7.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v21.4s +str q13, [x0, #864] +mul v2.4S, v2.4S,v17.s[2] +mul v16.4S, v16.4S,v17.s[3] +add v11.4s, v11.4s, v21.4s +sub v21.4s, v1.4s, v8.4s +mla v2.4S, v18.4S, v31.s[0] +mla v16.4S, v23.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +str q11, [x0, #800] +mul v24.4S, v24.4S,v17.s[0] +mul v20.4S, v20.4S,v17.s[1] +sub v11.4s, v22.4s, v12.4s +str q21, [x0, #736] +mla v24.4S, v19.4S, v31.s[0] +mla v20.4S, v0.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +str q1, [x0, #672] +ldr q1, [x0, #1008] +sqrdmulh v12.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +str q11, [x0, #608] +sub v11.4s, v10.4s, v16.4s +ldr q0, [x0, #944] +sqrdmulh v19.4S, v0.4S, v29.s[0] +mul v0.4S, v0.4S,v30.s[0] +str q22, [x0, #544] +add v10.4s, v10.4s, v16.4s +ldr q16, [x0, #880] +sqrdmulh v22.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +str q11, [x0, #480] +sub v11.4s, v3.4s, v2.4s +ldr q21, [x0, #816] +sqrdmulh v8.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +str q10, [x0, #416] +add v3.4s, v3.4s, v2.4s +ldr q2, [x0, #752] +sqrdmulh v10.4S, v2.4S, v29.s[0] +mla v1.4S, v12.4S, v31.s[0] +str q11, [x0, #352] +sub v11.4s, v26.4s, v20.4s +ldr q12, [x0, #688] +sqrdmulh v23.4S, v12.4S, v29.s[0] +mla v0.4S, v19.4S, v31.s[0] +str q3, [x0, #288] +add v26.4s, v26.4s, v20.4s +ldr q20, [x0, #624] +sqrdmulh v3.4S, v20.4S, v29.s[0] +mla v16.4S, v22.4S, v31.s[0] +str q11, [x0, #224] +sub v11.4s, v15.4s, v24.4s +ldr q22, [x0, #560] +sqrdmulh v19.4S, v22.4S, v29.s[0] +mla v21.4S, v8.4S, v31.s[0] +str q26, [x0, #160] +add v15.4s, v15.4s, v24.4s +ldr q24, [x0, #496] +ldr q26, [x0, #432] +mul v12.4S, 
v12.4S,v30.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v8.4s, v24.4s, v1.4s +add v24.4s, v24.4s, v1.4s +ldr q1, [x0, #368] +ldr q18, [x0, #304] +mla v12.4S, v23.4S, v31.s[0] +mla v2.4S, v10.4S, v31.s[0] +sub v10.4s, v26.4s, v0.4s +add v26.4s, v26.4s, v0.4s +ldr q0, [x0, #240] +ldr q23, [x0, #176] +mul v22.4S, v22.4S,v30.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v13.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +ldr q16, [x0, #112] +ldr q6, [x0, #48] +mla v22.4S, v19.4S, v31.s[0] +mla v20.4S, v3.4S, v31.s[0] +sub v3.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v19.4s, v0.4s, v2.4s +nop +sqrdmulh v25.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +add v0.4s, v0.4s, v2.4s +nop +sqrdmulh v2.4S, v24.4S, v29.s[1] +mul v24.4S, v24.4S,v30.s[1] +sub v5.4s, v23.4s, v12.4s +add v23.4s, v23.4s, v12.4s +sqrdmulh v12.4S, v26.4S, v29.s[1] +mul v26.4S, v26.4S,v30.s[1] +sub v4.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v13.4S, v29.s[2] +mla v8.4S, v21.4S, v31.s[0] +sub v21.4s, v6.4s, v22.4s +str q11, [x0, #96] +sqrdmulh v11.4S, v3.4S, v29.s[2] +mla v10.4S, v25.4S, v31.s[0] +add v6.4s, v6.4s, v22.4s +nop +sqrdmulh v22.4S, v1.4S, v29.s[1] +mla v24.4S, v2.4S, v31.s[0] +str q15, [x0, #32] +nop +sqrdmulh v15.4S, v18.4S, v29.s[1] +mla v26.4S, v12.4S, v31.s[0] +nop +nop +mul v3.4S, v3.4S,v30.s[2] +mul v13.4S, v13.4S,v30.s[2] +sub v12.4s, v19.4s, v8.4s +add v19.4s, v19.4s, v8.4s +mla v3.4S, v11.4S, v31.s[0] +mla v13.4S, v20.4S, v31.s[0] +sub v20.4s, v5.4s, v10.4s +add v5.4s, v5.4s, v10.4s +mul v18.4S, v18.4S,v30.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v10.4s, v0.4s, v24.4s +add v0.4s, v0.4s, v24.4s +mla v18.4S, v15.4S, v31.s[0] +mla v1.4S, v22.4S, v31.s[0] +sub v22.4s, v23.4s, v26.4s +add v23.4s, v23.4s, v26.4s +sqrdmulh v26.4S, v12.4S, v14.s[3] +mul v12.4S, v12.4S,v27.s[3] +nop +nop +sqrdmulh v15.4S, v19.4S, v14.s[2] +mul v19.4S, v19.4S,v27.s[2] +sub v24.4s, v4.4s, v13.4s +add v4.4s, v4.4s, v13.4s +sqrdmulh 
v13.4S, v10.4S, v14.s[1] +mul v10.4S, v10.4S,v27.s[1] +sub v11.4s, v21.4s, v3.4s +add v21.4s, v21.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v14.s[0] +mul v0.4S, v0.4S,v27.s[0] +sub v8.4s, v16.4s, v1.4s +add v16.4s, v16.4s, v1.4s +sqrdmulh v1.4S, v20.4S, v14.s[3] +mla v12.4S, v26.4S, v31.s[0] +sub v26.4s, v6.4s, v18.4s +add v6.4s, v6.4s, v18.4s +sqrdmulh v18.4S, v5.4S, v14.s[2] +mla v19.4S, v15.4S, v31.s[0] +nop +nop +sqrdmulh v15.4S, v22.4S, v14.s[1] +mla v10.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v23.4S, v14.s[0] +mla v0.4S, v3.4S, v31.s[0] +nop +nop +mul v5.4S, v5.4S,v27.s[2] +mul v20.4S, v20.4S,v27.s[3] +sub v3.4s, v24.4s, v12.4s +add v24.4s, v24.4s, v12.4s +mla v5.4S, v18.4S, v31.s[0] +mla v20.4S, v1.4S, v31.s[0] +sub v1.4s, v4.4s, v19.4s +add v4.4s, v4.4s, v19.4s +mul v23.4S, v23.4S,v27.s[0] +mul v22.4S, v22.4S,v27.s[1] +sub v19.4s, v8.4s, v10.4s +add v8.4s, v8.4s, v10.4s +mla v23.4S, v13.4S, v31.s[0] +mla v22.4S, v15.4S, v31.s[0] +sub v15.4s, v16.4s, v0.4s +add v16.4s, v16.4s, v0.4s +sqrdmulh v0.4S, v3.4S, v9.s[3] +mul v3.4S, v3.4S,v28.s[3] +nop +nop +sqrdmulh v13.4S, v24.4S, v9.s[2] +mul v24.4S, v24.4S,v28.s[2] +sub v10.4s, v11.4s, v20.4s +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v1.4S, v9.s[1] +mul v1.4S, v1.4S,v28.s[1] +sub v18.4s, v21.4s, v5.4s +add v21.4s, v21.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v9.s[0] +mul v4.4S, v4.4S,v28.s[0] +sub v12.4s, v26.4s, v22.4s +add v26.4s, v26.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v7.s[3] +mla v3.4S, v0.4S, v31.s[0] +sub v0.4s, v6.4s, v23.4s +add v6.4s, v6.4s, v23.4s +sqrdmulh v23.4S, v8.4S, v7.s[2] +mla v24.4S, v13.4S, v31.s[0] +sub v13.4s, v10.4s, v3.4s +str q13, [x0, #1008] +sqrdmulh v13.4S, v15.4S, v7.s[1] +mla v1.4S, v20.4S, v31.s[0] +add v10.4s, v10.4s, v3.4s +str q10, [x0, #944] +sqrdmulh v10.4S, v16.4S, v7.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v24.4s +str q5, [x0, #880] +mul v8.4S, v8.4S,v17.s[2] +mul v19.4S, v19.4S,v17.s[3] +add v11.4s, v11.4s, v24.4s +sub v24.4s, v18.4s, v1.4s +mla v8.4S, v23.4S, 
v31.s[0] +mla v19.4S, v22.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +str q11, [x0, #816] +mul v16.4S, v16.4S,v17.s[0] +mul v15.4S, v15.4S,v17.s[1] +sub v11.4s, v21.4s, v4.4s +str q24, [x0, #752] +mla v16.4S, v10.4S, v31.s[0] +mla v15.4S, v13.4S, v31.s[0] +add v21.4s, v21.4s, v4.4s +str q18, [x0, #688] +ldr q18, [x0, #960] +sqrdmulh v4.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +str q11, [x0, #624] +sub v11.4s, v12.4s, v19.4s +ldr q13, [x0, #896] +sqrdmulh v10.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +str q21, [x0, #560] +add v12.4s, v12.4s, v19.4s +ldr q19, [x0, #832] +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +str q11, [x0, #496] +sub v11.4s, v26.4s, v8.4s +ldr q24, [x0, #768] +sqrdmulh v1.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +str q12, [x0, #432] +add v26.4s, v26.4s, v8.4s +ldr q8, [x0, #704] +sqrdmulh v12.4S, v8.4S, v29.s[0] +mla v18.4S, v4.4S, v31.s[0] +str q11, [x0, #368] +sub v11.4s, v0.4s, v15.4s +ldr q4, [x0, #640] +sqrdmulh v22.4S, v4.4S, v29.s[0] +mla v13.4S, v10.4S, v31.s[0] +str q26, [x0, #304] +add v0.4s, v0.4s, v15.4s +ldr q15, [x0, #576] +sqrdmulh v26.4S, v15.4S, v29.s[0] +mla v19.4S, v21.4S, v31.s[0] +str q11, [x0, #240] +sub v11.4s, v6.4s, v16.4s +ldr q21, [x0, #512] +sqrdmulh v10.4S, v21.4S, v29.s[0] +mla v24.4S, v1.4S, v31.s[0] +str q0, [x0, #176] +add v6.4s, v6.4s, v16.4s +ldr q16, [x0, #448] +ldr q0, [x0, #384] +mul v4.4S, v4.4S,v30.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v1.4s, v16.4s, v18.4s +add v16.4s, v16.4s, v18.4s +ldr q18, [x0, #320] +ldr q23, [x0, #256] +mla v4.4S, v22.4S, v31.s[0] +mla v8.4S, v12.4S, v31.s[0] +sub v12.4s, v0.4s, v13.4s +add v0.4s, v0.4s, v13.4s +ldr q13, [x0, #192] +ldr q22, [x0, #128] +mul v21.4S, v21.4S,v30.s[0] +mul v15.4S, v15.4S,v30.s[0] +sub v5.4s, v18.4s, v19.4s +add v18.4s, v18.4s, v19.4s +ldr q19, [x0, #64] +ldr q3, [x0, #0] +mla v21.4S, v10.4S, v31.s[0] +mla v15.4S, v26.4S, v31.s[0] +sub v26.4s, v23.4s, v24.4s +add v23.4s, v23.4s, v24.4s +sqrdmulh v24.4S, v1.4S, 
v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +sub v10.4s, v13.4s, v8.4s +nop +sqrdmulh v20.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +add v13.4s, v13.4s, v8.4s +nop +sqrdmulh v8.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v2.4s, v22.4s, v4.4s +add v22.4s, v22.4s, v4.4s +sqrdmulh v4.4S, v0.4S, v29.s[1] +mul v0.4S, v0.4S,v30.s[1] +sub v25.4s, v19.4s, v15.4s +add v19.4s, v19.4s, v15.4s +sqrdmulh v15.4S, v5.4S, v29.s[2] +mla v1.4S, v24.4S, v31.s[0] +sub v24.4s, v3.4s, v21.4s +str q11, [x0, #112] +sqrdmulh v11.4S, v26.4S, v29.s[2] +mla v12.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v21.4s +nop +sqrdmulh v21.4S, v18.4S, v29.s[1] +mla v16.4S, v8.4S, v31.s[0] +str q6, [x0, #48] +nop +sqrdmulh v6.4S, v23.4S, v29.s[1] +mla v0.4S, v4.4S, v31.s[0] +nop +nop +mul v26.4S, v26.4S,v30.s[2] +mul v5.4S, v5.4S,v30.s[2] +sub v4.4s, v10.4s, v1.4s +add v10.4s, v10.4s, v1.4s +mla v26.4S, v11.4S, v31.s[0] +mla v5.4S, v15.4S, v31.s[0] +sub v15.4s, v2.4s, v12.4s +add v2.4s, v2.4s, v12.4s +mul v23.4S, v23.4S,v30.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v12.4s, v13.4s, v16.4s +add v13.4s, v13.4s, v16.4s +mla v23.4S, v6.4S, v31.s[0] +mla v18.4S, v21.4S, v31.s[0] +sub v21.4s, v22.4s, v0.4s +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v4.4S, v14.s[3] +mul v4.4S, v4.4S,v27.s[3] +nop +nop +sqrdmulh v6.4S, v10.4S, v14.s[2] +mul v10.4S, v10.4S,v27.s[2] +sub v16.4s, v25.4s, v5.4s +add v25.4s, v25.4s, v5.4s +sqrdmulh v5.4S, v12.4S, v14.s[1] +mul v12.4S, v12.4S,v27.s[1] +sub v11.4s, v24.4s, v26.4s +add v24.4s, v24.4s, v26.4s +sqrdmulh v26.4S, v13.4S, v14.s[0] +mul v13.4S, v13.4S,v27.s[0] +sub v1.4s, v19.4s, v18.4s +add v19.4s, v19.4s, v18.4s +sqrdmulh v18.4S, v15.4S, v14.s[3] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v3.4s, v23.4s +add v3.4s, v3.4s, v23.4s +sqrdmulh v23.4S, v2.4S, v14.s[2] +mla v10.4S, v6.4S, v31.s[0] +nop +nop +sqrdmulh v6.4S, v21.4S, v14.s[1] +mla v12.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v22.4S, v14.s[0] +mla v13.4S, v26.4S, v31.s[0] +nop +nop +mul v2.4S, v2.4S,v27.s[2] 
+mul v15.4S, v15.4S,v27.s[3] +sub v26.4s, v16.4s, v4.4s +add v16.4s, v16.4s, v4.4s +mla v2.4S, v23.4S, v31.s[0] +mla v15.4S, v18.4S, v31.s[0] +sub v18.4s, v25.4s, v10.4s +add v25.4s, v25.4s, v10.4s +mul v22.4S, v22.4S,v27.s[0] +mul v21.4S, v21.4S,v27.s[1] +sub v10.4s, v1.4s, v12.4s +add v1.4s, v1.4s, v12.4s +mla v22.4S, v5.4S, v31.s[0] +mla v21.4S, v6.4S, v31.s[0] +sub v6.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +sqrdmulh v13.4S, v26.4S, v9.s[3] +mul v26.4S, v26.4S,v28.s[3] +nop +nop +sqrdmulh v5.4S, v16.4S, v9.s[2] +mul v16.4S, v16.4S,v28.s[2] +sub v12.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v18.4S, v9.s[1] +mul v18.4S, v18.4S,v28.s[1] +sub v23.4s, v24.4s, v2.4s +add v24.4s, v24.4s, v2.4s +sqrdmulh v2.4S, v25.4S, v9.s[0] +mul v25.4S, v25.4S,v28.s[0] +sub v4.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +sqrdmulh v21.4S, v10.4S, v7.s[3] +mla v26.4S, v13.4S, v31.s[0] +sub v13.4s, v3.4s, v22.4s +add v3.4s, v3.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v7.s[2] +mla v16.4S, v5.4S, v31.s[0] +sub v5.4s, v12.4s, v26.4s +str q5, [x0, #960] +sqrdmulh v5.4S, v6.4S, v7.s[1] +mla v18.4S, v15.4S, v31.s[0] +add v12.4s, v12.4s, v26.4s +str q12, [x0, #896] +sqrdmulh v12.4S, v19.4S, v7.s[0] +mla v25.4S, v2.4S, v31.s[0] +sub v2.4s, v11.4s, v16.4s +str q2, [x0, #832] +mul v1.4S, v1.4S,v17.s[2] +mul v10.4S, v10.4S,v17.s[3] +add v11.4s, v11.4s, v16.4s +sub v16.4s, v23.4s, v18.4s +mla v1.4S, v22.4S, v31.s[0] +mla v10.4S, v21.4S, v31.s[0] +add v23.4s, v23.4s, v18.4s +str q11, [x0, #768] +mul v19.4S, v19.4S,v17.s[0] +mul v6.4S, v6.4S,v17.s[1] +sub v11.4s, v24.4s, v25.4s +str q16, [x0, #704] +mla v19.4S, v12.4S, v31.s[0] +mla v6.4S, v5.4S, v31.s[0] +add v24.4s, v24.4s, v25.4s +str q23, [x0, #640] +ldr q23, [x0, #976] +sqrdmulh v25.4S, v23.4S, v29.s[0] +mul v23.4S, v23.4S,v30.s[0] +str q11, [x0, #576] +sub v11.4s, v4.4s, v10.4s +ldr q5, [x0, #912] +sqrdmulh v12.4S, v5.4S, v29.s[0] +mul v5.4S, v5.4S,v30.s[0] +str q24, [x0, #512] +add v4.4s, v4.4s, v10.4s +ldr q10, [x0, 
#848] +sqrdmulh v24.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +str q11, [x0, #448] +sub v11.4s, v0.4s, v1.4s +ldr q16, [x0, #784] +sqrdmulh v18.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +str q4, [x0, #384] +add v0.4s, v0.4s, v1.4s +ldr q1, [x0, #720] +sqrdmulh v4.4S, v1.4S, v29.s[0] +mla v23.4S, v25.4S, v31.s[0] +str q11, [x0, #320] +sub v11.4s, v13.4s, v6.4s +ldr q25, [x0, #656] +sqrdmulh v21.4S, v25.4S, v29.s[0] +mla v5.4S, v12.4S, v31.s[0] +str q0, [x0, #256] +add v13.4s, v13.4s, v6.4s +ldr q6, [x0, #592] +sqrdmulh v0.4S, v6.4S, v29.s[0] +mla v10.4S, v24.4S, v31.s[0] +str q11, [x0, #192] +sub v11.4s, v3.4s, v19.4s +ldr q24, [x0, #528] +sqrdmulh v12.4S, v24.4S, v29.s[0] +mla v16.4S, v18.4S, v31.s[0] +str q13, [x0, #128] +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #464] +ldr q13, [x0, #400] +mul v25.4S, v25.4S,v30.s[0] +mul v1.4S, v1.4S,v30.s[0] +sub v18.4s, v19.4s, v23.4s +add v19.4s, v19.4s, v23.4s +ldr q23, [x0, #336] +ldr q22, [x0, #272] +mla v25.4S, v21.4S, v31.s[0] +mla v1.4S, v4.4S, v31.s[0] +sub v4.4s, v13.4s, v5.4s +add v13.4s, v13.4s, v5.4s +ldr q5, [x0, #208] +ldr q21, [x0, #144] +mul v24.4S, v24.4S,v30.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v2.4s, v23.4s, v10.4s +add v23.4s, v23.4s, v10.4s +ldr q10, [x0, #80] +ldr q26, [x0, #16] +mla v24.4S, v12.4S, v31.s[0] +mla v6.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v16.4s +add v22.4s, v22.4s, v16.4s +sqrdmulh v16.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v12.4s, v5.4s, v1.4s +nop +sqrdmulh v15.4S, v4.4S, v29.s[2] +mul v4.4S, v4.4S,v30.s[2] +add v5.4s, v5.4s, v1.4s +nop +sqrdmulh v1.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +sub v8.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v20.4s, v10.4s, v6.4s +add v10.4s, v10.4s, v6.4s +sqrdmulh v6.4S, v2.4S, v29.s[2] +mla v18.4S, v16.4S, v31.s[0] +sub v16.4s, v26.4s, v24.4s +str q11, [x0, #64] +sqrdmulh v11.4S, v0.4S, v29.s[2] +mla v4.4S, v15.4S, v31.s[0] +add v26.4s, 
v26.4s, v24.4s +nop +sqrdmulh v24.4S, v23.4S, v29.s[1] +mla v19.4S, v1.4S, v31.s[0] +str q3, [x0, #0] +nop +sqrdmulh v3.4S, v22.4S, v29.s[1] +mla v13.4S, v25.4S, v31.s[0] +nop +nop +mul v0.4S, v0.4S,v30.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v25.4s, v12.4s, v18.4s +add v12.4s, v12.4s, v18.4s +mla v0.4S, v11.4S, v31.s[0] +mla v2.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v4.4s +add v8.4s, v8.4s, v4.4s +mul v22.4S, v22.4S,v30.s[1] +mul v23.4S, v23.4S,v30.s[1] +sub v4.4s, v5.4s, v19.4s +add v5.4s, v5.4s, v19.4s +mla v22.4S, v3.4S, v31.s[0] +mla v23.4S, v24.4S, v31.s[0] +sub v24.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v29.4S, v25.4S, v14.s[3] +mul v25.4S, v25.4S,v27.s[3] +nop +nop +sqrdmulh v30.4S, v12.4S, v14.s[2] +mul v12.4S, v12.4S,v27.s[2] +sub v13.4s, v20.4s, v2.4s +add v20.4s, v20.4s, v2.4s +sqrdmulh v2.4S, v4.4S, v14.s[1] +mul v4.4S, v4.4S,v27.s[1] +sub v3.4s, v16.4s, v0.4s +add v16.4s, v16.4s, v0.4s +sqrdmulh v0.4S, v5.4S, v14.s[0] +mul v5.4S, v5.4S,v27.s[0] +sub v19.4s, v10.4s, v23.4s +add v10.4s, v10.4s, v23.4s +sqrdmulh v23.4S, v6.4S, v14.s[3] +mla v25.4S, v29.4S, v31.s[0] +sub v29.4s, v26.4s, v22.4s +add v26.4s, v26.4s, v22.4s +sqrdmulh v22.4S, v8.4S, v14.s[2] +mla v12.4S, v30.4S, v31.s[0] +nop +nop +sqrdmulh v30.4S, v24.4S, v14.s[1] +mla v4.4S, v2.4S, v31.s[0] +nop +nop +sqrdmulh v2.4S, v21.4S, v14.s[0] +mla v5.4S, v0.4S, v31.s[0] +nop +nop +mul v8.4S, v8.4S,v27.s[2] +mul v6.4S, v6.4S,v27.s[3] +sub v0.4s, v13.4s, v25.4s +add v13.4s, v13.4s, v25.4s +mla v8.4S, v22.4S, v31.s[0] +mla v6.4S, v23.4S, v31.s[0] +sub v23.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +mul v21.4S, v21.4S,v27.s[0] +mul v24.4S, v24.4S,v27.s[1] +sub v12.4s, v19.4s, v4.4s +add v19.4s, v19.4s, v4.4s +mla v21.4S, v2.4S, v31.s[0] +mla v24.4S, v30.4S, v31.s[0] +sub v30.4s, v10.4s, v5.4s +add v10.4s, v10.4s, v5.4s +sqrdmulh v14.4S, v0.4S, v9.s[3] +mul v0.4S, v0.4S,v28.s[3] +nop +nop +sqrdmulh v27.4S, v13.4S, v9.s[2] +mul v13.4S, v13.4S,v28.s[2] +sub v5.4s, v3.4s, v6.4s +add 
v3.4s, v3.4s, v6.4s +sqrdmulh v6.4S, v23.4S, v9.s[1] +mul v23.4S, v23.4S,v28.s[1] +sub v2.4s, v16.4s, v8.4s +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v20.4S, v9.s[0] +mul v20.4S, v20.4S,v28.s[0] +sub v4.4s, v29.4s, v24.4s +add v29.4s, v29.4s, v24.4s +sqrdmulh v9.4S, v12.4S, v7.s[3] +mla v0.4S, v14.4S, v31.s[0] +sub v14.4s, v26.4s, v21.4s +add v26.4s, v26.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v7.s[2] +mla v13.4S, v27.4S, v31.s[0] +sub v27.4s, v5.4s, v0.4s +str q27, [x0, #976] +sqrdmulh v27.4S, v30.4S, v7.s[1] +mla v23.4S, v6.4S, v31.s[0] +add v5.4s, v5.4s, v0.4s +str q5, [x0, #912] +sqrdmulh v5.4S, v10.4S, v7.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v3.4s, v13.4s +str q8, [x0, #848] +mul v19.4S, v19.4S,v17.s[2] +mul v12.4S, v12.4S,v17.s[3] +add v3.4s, v3.4s, v13.4s +sub v13.4s, v2.4s, v23.4s +mla v19.4S, v21.4S, v31.s[0] +mla v12.4S, v9.4S, v31.s[0] +add v2.4s, v2.4s, v23.4s +str q3, [x0, #784] +mul v10.4S, v10.4S,v17.s[0] +mul v30.4S, v30.4S,v17.s[1] +sub v3.4s, v16.4s, v20.4s +str q13, [x0, #720] +mla v10.4S, v5.4S, v31.s[0] +mla v30.4S, v27.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +str q2, [x0, #656] +str q3, [x0, #592] +sub v3.4s, v4.4s, v12.4s +str q16, [x0, #528] +add v4.4s, v4.4s, v12.4s +str q3, [x0, #464] +sub v3.4s, v29.4s, v19.4s +str q4, [x0, #400] +add v29.4s, v29.4s, v19.4s +str q3, [x0, #336] +sub v3.4s, v14.4s, v30.4s +str q29, [x0, #272] +add v14.4s, v14.4s, v30.4s +str q3, [x0, #208] +sub v3.4s, v26.4s, v10.4s +str q14, [x0, #144] +add v26.4s, v26.4s, v10.4s +str q3, [x0, #80] +str q26, [x0, #16] +ldr q15, [x17, #+128] +ldr q1, [x17, #+144] +ldr q18, [x17, #+160] +ldr q11, [x17, #+176] +ldr q25, [x17, #+192] +ldr q22, [x17, #+208] +ldr q24, [x17, #+224] +ldr q28, [x17, #+240] +ldr q6, [x0, #32] +ldr q0, [x0, #48] +ldr q8, [x0, #0] +ldr q21, [x0, #16] +sqrdmulh v9.4S, v6.4S, v1.s[0] +mul v6.4S, v6.4S,v15.s[0] +mla v6.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v6.4s +add v8.4s, v8.4s, v6.4s +sqrdmulh v6.4S, v0.4S, v1.s[0] +mul v0.4S, v0.4S,v15.s[0] 
+mla v0.4S, v6.4S, v31.s[0] +sub v6.4s, v21.4s, v0.4s +add v21.4s, v21.4s, v0.4s +sqrdmulh v0.4S, v21.4S, v1.s[1] +mul v21.4S, v21.4S,v15.s[1] +mla v21.4S, v0.4S, v31.s[0] +sub v0.4s, v8.4s, v21.4s +add v8.4s, v8.4s, v21.4s +sqrdmulh v21.4S, v6.4S, v1.s[2] +mul v6.4S, v6.4S,v15.s[2] +mla v6.4S, v21.4S, v31.s[0] +sub v21.4s, v9.4s, v6.4s +add v9.4s, v9.4s, v6.4s +trn1 v6.4S, v8.4S, v0.4S +trn2 v23.4S, v8.4S, v0.4S +trn1 v13.4S, v9.4S, v21.4S +trn2 v5.4S, v9.4S, v21.4S +trn2 v9.2D, v6.2D, v13.2D +trn2 v21.2D, v23.2D, v5.2D +trn1 v8.2D, v6.2D, v13.2D +trn1 v0.2D, v23.2D, v5.2D +sqrdmulh v5.4S, v9.4S, v11.4S +mul v9.4S, v9.4S,v18.4S +mla v9.4S, v5.4S, v31.s[0] +sub v5.4s, v8.4s, v9.4s +add v8.4s, v8.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v11.4S +mul v21.4S, v21.4S,v18.4S +mla v21.4S, v9.4S, v31.s[0] +sub v9.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v22.4S +mul v0.4S, v0.4S,v25.4S +mla v0.4S, v21.4S, v31.s[0] +sub v21.4s, v8.4s, v0.4s +add v8.4s, v8.4s, v0.4s +sqrdmulh v0.4S, v9.4S, v28.4S +mul v9.4S, v9.4S,v24.4S +mla v9.4S, v0.4S, v31.s[0] +sub v0.4s, v5.4s, v9.4s +add v5.4s, v5.4s, v9.4s +str q8, [x0, #0] +str q21, [x0, #16] +str q5, [x0, #32] +str q0, [x0, #48] +ldr q0, [x17, #+256] +ldr q5, [x17, #+272] +ldr q21, [x17, #+288] +ldr q8, [x17, #+304] +ldr q9, [x17, #+320] +ldr q23, [x17, #+336] +ldr q13, [x17, #+352] +ldr q6, [x17, #+368] +ldr q28, [x0, #96] +ldr q24, [x0, #112] +ldr q22, [x0, #64] +ldr q25, [x0, #80] +sqrdmulh v11.4S, v28.4S, v5.s[0] +mul v28.4S, v28.4S,v0.s[0] +mla v28.4S, v11.4S, v31.s[0] +sub v11.4s, v22.4s, v28.4s +add v22.4s, v22.4s, v28.4s +sqrdmulh v28.4S, v24.4S, v5.s[0] +mul v24.4S, v24.4S,v0.s[0] +mla v24.4S, v28.4S, v31.s[0] +sub v28.4s, v25.4s, v24.4s +add v25.4s, v25.4s, v24.4s +sqrdmulh v24.4S, v25.4S, v5.s[1] +mul v25.4S, v25.4S,v0.s[1] +mla v25.4S, v24.4S, v31.s[0] +sub v24.4s, v22.4s, v25.4s +add v22.4s, v22.4s, v25.4s +sqrdmulh v25.4S, v28.4S, v5.s[2] +mul v28.4S, v28.4S,v0.s[2] +mla v28.4S, v25.4S, v31.s[0] 
+sub v25.4s, v11.4s, v28.4s +add v11.4s, v11.4s, v28.4s +trn1 v28.4S, v22.4S, v24.4S +trn2 v18.4S, v22.4S, v24.4S +trn1 v1.4S, v11.4S, v25.4S +trn2 v15.4S, v11.4S, v25.4S +trn2 v11.2D, v28.2D, v1.2D +trn2 v25.2D, v18.2D, v15.2D +trn1 v22.2D, v28.2D, v1.2D +trn1 v24.2D, v18.2D, v15.2D +sqrdmulh v15.4S, v11.4S, v8.4S +mul v11.4S, v11.4S,v21.4S +mla v11.4S, v15.4S, v31.s[0] +sub v15.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v25.4S, v8.4S +mul v25.4S, v25.4S,v21.4S +mla v25.4S, v11.4S, v31.s[0] +sub v11.4s, v24.4s, v25.4s +add v24.4s, v24.4s, v25.4s +sqrdmulh v25.4S, v24.4S, v23.4S +mul v24.4S, v24.4S,v9.4S +mla v24.4S, v25.4S, v31.s[0] +sub v25.4s, v22.4s, v24.4s +add v22.4s, v22.4s, v24.4s +sqrdmulh v24.4S, v11.4S, v6.4S +mul v11.4S, v11.4S,v13.4S +mla v11.4S, v24.4S, v31.s[0] +sub v24.4s, v15.4s, v11.4s +add v15.4s, v15.4s, v11.4s +str q22, [x0, #64] +str q25, [x0, #80] +str q15, [x0, #96] +str q24, [x0, #112] +ldr q24, [x17, #+384] +ldr q15, [x17, #+400] +ldr q25, [x17, #+416] +ldr q22, [x17, #+432] +ldr q11, [x17, #+448] +ldr q18, [x17, #+464] +ldr q1, [x17, #+480] +ldr q28, [x17, #+496] +ldr q6, [x0, #160] +ldr q13, [x0, #176] +ldr q23, [x0, #128] +ldr q9, [x0, #144] +sqrdmulh v8.4S, v6.4S, v15.s[0] +mul v6.4S, v6.4S,v24.s[0] +mla v6.4S, v8.4S, v31.s[0] +sub v8.4s, v23.4s, v6.4s +add v23.4s, v23.4s, v6.4s +sqrdmulh v6.4S, v13.4S, v15.s[0] +mul v13.4S, v13.4S,v24.s[0] +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v13.4s +add v9.4s, v9.4s, v13.4s +sqrdmulh v13.4S, v9.4S, v15.s[1] +mul v9.4S, v9.4S,v24.s[1] +mla v9.4S, v13.4S, v31.s[0] +sub v13.4s, v23.4s, v9.4s +add v23.4s, v23.4s, v9.4s +sqrdmulh v9.4S, v6.4S, v15.s[2] +mul v6.4S, v6.4S,v24.s[2] +mla v6.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v6.4s +add v8.4s, v8.4s, v6.4s +trn1 v6.4S, v23.4S, v13.4S +trn2 v21.4S, v23.4S, v13.4S +trn1 v5.4S, v8.4S, v9.4S +trn2 v0.4S, v8.4S, v9.4S +trn2 v8.2D, v6.2D, v5.2D +trn2 v9.2D, v21.2D, v0.2D +trn1 v23.2D, v6.2D, v5.2D +trn1 v13.2D, v21.2D, v0.2D 
+sqrdmulh v0.4S, v8.4S, v22.4S +mul v8.4S, v8.4S,v25.4S +mla v8.4S, v0.4S, v31.s[0] +sub v0.4s, v23.4s, v8.4s +add v23.4s, v23.4s, v8.4s +sqrdmulh v8.4S, v9.4S, v22.4S +mul v9.4S, v9.4S,v25.4S +mla v9.4S, v8.4S, v31.s[0] +sub v8.4s, v13.4s, v9.4s +add v13.4s, v13.4s, v9.4s +sqrdmulh v9.4S, v13.4S, v18.4S +mul v13.4S, v13.4S,v11.4S +mla v13.4S, v9.4S, v31.s[0] +sub v9.4s, v23.4s, v13.4s +add v23.4s, v23.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v28.4S +mul v8.4S, v8.4S,v1.4S +mla v8.4S, v13.4S, v31.s[0] +sub v13.4s, v0.4s, v8.4s +add v0.4s, v0.4s, v8.4s +str q23, [x0, #128] +str q9, [x0, #144] +str q0, [x0, #160] +str q13, [x0, #176] +ldr q13, [x17, #+512] +ldr q0, [x17, #+528] +ldr q9, [x17, #+544] +ldr q23, [x17, #+560] +ldr q8, [x17, #+576] +ldr q21, [x17, #+592] +ldr q5, [x17, #+608] +ldr q6, [x17, #+624] +ldr q28, [x0, #224] +ldr q1, [x0, #240] +ldr q18, [x0, #192] +ldr q11, [x0, #208] +sqrdmulh v22.4S, v28.4S, v0.s[0] +mul v28.4S, v28.4S,v13.s[0] +mla v28.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v28.4s +add v18.4s, v18.4s, v28.4s +sqrdmulh v28.4S, v1.4S, v0.s[0] +mul v1.4S, v1.4S,v13.s[0] +mla v1.4S, v28.4S, v31.s[0] +sub v28.4s, v11.4s, v1.4s +add v11.4s, v11.4s, v1.4s +sqrdmulh v1.4S, v11.4S, v0.s[1] +mul v11.4S, v11.4S,v13.s[1] +mla v11.4S, v1.4S, v31.s[0] +sub v1.4s, v18.4s, v11.4s +add v18.4s, v18.4s, v11.4s +sqrdmulh v11.4S, v28.4S, v0.s[2] +mul v28.4S, v28.4S,v13.s[2] +mla v28.4S, v11.4S, v31.s[0] +sub v11.4s, v22.4s, v28.4s +add v22.4s, v22.4s, v28.4s +trn1 v28.4S, v18.4S, v1.4S +trn2 v25.4S, v18.4S, v1.4S +trn1 v15.4S, v22.4S, v11.4S +trn2 v24.4S, v22.4S, v11.4S +trn2 v22.2D, v28.2D, v15.2D +trn2 v11.2D, v25.2D, v24.2D +trn1 v18.2D, v28.2D, v15.2D +trn1 v1.2D, v25.2D, v24.2D +sqrdmulh v24.4S, v22.4S, v23.4S +mul v22.4S, v22.4S,v9.4S +mla v22.4S, v24.4S, v31.s[0] +sub v24.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v11.4S, v23.4S +mul v11.4S, v11.4S,v9.4S +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v11.4s +add v1.4s, v1.4s, v11.4s 
+sqrdmulh v11.4S, v1.4S, v21.4S +mul v1.4S, v1.4S,v8.4S +mla v1.4S, v11.4S, v31.s[0] +sub v11.4s, v18.4s, v1.4s +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v22.4S, v6.4S +mul v22.4S, v22.4S,v5.4S +mla v22.4S, v1.4S, v31.s[0] +sub v1.4s, v24.4s, v22.4s +add v24.4s, v24.4s, v22.4s +str q18, [x0, #192] +str q11, [x0, #208] +str q24, [x0, #224] +str q1, [x0, #240] +ldr q1, [x17, #+640] +ldr q24, [x17, #+656] +ldr q11, [x17, #+672] +ldr q18, [x17, #+688] +ldr q22, [x17, #+704] +ldr q25, [x17, #+720] +ldr q15, [x17, #+736] +ldr q28, [x17, #+752] +ldr q6, [x0, #288] +ldr q5, [x0, #304] +ldr q21, [x0, #256] +ldr q8, [x0, #272] +sqrdmulh v23.4S, v6.4S, v24.s[0] +mul v6.4S, v6.4S,v1.s[0] +mla v6.4S, v23.4S, v31.s[0] +sub v23.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +sqrdmulh v6.4S, v5.4S, v24.s[0] +mul v5.4S, v5.4S,v1.s[0] +mla v5.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v5.4s +add v8.4s, v8.4s, v5.4s +sqrdmulh v5.4S, v8.4S, v24.s[1] +mul v8.4S, v8.4S,v1.s[1] +mla v8.4S, v5.4S, v31.s[0] +sub v5.4s, v21.4s, v8.4s +add v21.4s, v21.4s, v8.4s +sqrdmulh v8.4S, v6.4S, v24.s[2] +mul v6.4S, v6.4S,v1.s[2] +mla v6.4S, v8.4S, v31.s[0] +sub v8.4s, v23.4s, v6.4s +add v23.4s, v23.4s, v6.4s +trn1 v6.4S, v21.4S, v5.4S +trn2 v9.4S, v21.4S, v5.4S +trn1 v0.4S, v23.4S, v8.4S +trn2 v13.4S, v23.4S, v8.4S +trn2 v23.2D, v6.2D, v0.2D +trn2 v8.2D, v9.2D, v13.2D +trn1 v21.2D, v6.2D, v0.2D +trn1 v5.2D, v9.2D, v13.2D +sqrdmulh v13.4S, v23.4S, v18.4S +mul v23.4S, v23.4S,v11.4S +mla v23.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v23.4s +add v21.4s, v21.4s, v23.4s +sqrdmulh v23.4S, v8.4S, v18.4S +mul v8.4S, v8.4S,v11.4S +mla v8.4S, v23.4S, v31.s[0] +sub v23.4s, v5.4s, v8.4s +add v5.4s, v5.4s, v8.4s +sqrdmulh v8.4S, v5.4S, v25.4S +mul v5.4S, v5.4S,v22.4S +mla v5.4S, v8.4S, v31.s[0] +sub v8.4s, v21.4s, v5.4s +add v21.4s, v21.4s, v5.4s +sqrdmulh v5.4S, v23.4S, v28.4S +mul v23.4S, v23.4S,v15.4S +mla v23.4S, v5.4S, v31.s[0] +sub v5.4s, v13.4s, v23.4s +add v13.4s, v13.4s, v23.4s +str q21, [x0, #256] +str q8, 
[x0, #272] +str q13, [x0, #288] +str q5, [x0, #304] +ldr q5, [x17, #+768] +ldr q13, [x17, #+784] +ldr q8, [x17, #+800] +ldr q21, [x17, #+816] +ldr q23, [x17, #+832] +ldr q9, [x17, #+848] +ldr q0, [x17, #+864] +ldr q6, [x17, #+880] +ldr q28, [x0, #352] +ldr q15, [x0, #368] +ldr q25, [x0, #320] +ldr q22, [x0, #336] +sqrdmulh v18.4S, v28.4S, v13.s[0] +mul v28.4S, v28.4S,v5.s[0] +mla v28.4S, v18.4S, v31.s[0] +sub v18.4s, v25.4s, v28.4s +add v25.4s, v25.4s, v28.4s +sqrdmulh v28.4S, v15.4S, v13.s[0] +mul v15.4S, v15.4S,v5.s[0] +mla v15.4S, v28.4S, v31.s[0] +sub v28.4s, v22.4s, v15.4s +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v22.4S, v13.s[1] +mul v22.4S, v22.4S,v5.s[1] +mla v22.4S, v15.4S, v31.s[0] +sub v15.4s, v25.4s, v22.4s +add v25.4s, v25.4s, v22.4s +sqrdmulh v22.4S, v28.4S, v13.s[2] +mul v28.4S, v28.4S,v5.s[2] +mla v28.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v28.4s +add v18.4s, v18.4s, v28.4s +trn1 v28.4S, v25.4S, v15.4S +trn2 v11.4S, v25.4S, v15.4S +trn1 v24.4S, v18.4S, v22.4S +trn2 v1.4S, v18.4S, v22.4S +trn2 v18.2D, v28.2D, v24.2D +trn2 v22.2D, v11.2D, v1.2D +trn1 v25.2D, v28.2D, v24.2D +trn1 v15.2D, v11.2D, v1.2D +sqrdmulh v1.4S, v18.4S, v21.4S +mul v18.4S, v18.4S,v8.4S +mla v18.4S, v1.4S, v31.s[0] +sub v1.4s, v25.4s, v18.4s +add v25.4s, v25.4s, v18.4s +sqrdmulh v18.4S, v22.4S, v21.4S +mul v22.4S, v22.4S,v8.4S +mla v22.4S, v18.4S, v31.s[0] +sub v18.4s, v15.4s, v22.4s +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v15.4S, v9.4S +mul v15.4S, v15.4S,v23.4S +mla v15.4S, v22.4S, v31.s[0] +sub v22.4s, v25.4s, v15.4s +add v25.4s, v25.4s, v15.4s +sqrdmulh v15.4S, v18.4S, v6.4S +mul v18.4S, v18.4S,v0.4S +mla v18.4S, v15.4S, v31.s[0] +sub v15.4s, v1.4s, v18.4s +add v1.4s, v1.4s, v18.4s +str q25, [x0, #320] +str q22, [x0, #336] +str q1, [x0, #352] +str q15, [x0, #368] +ldr q15, [x17, #+896] +ldr q1, [x17, #+912] +ldr q22, [x17, #+928] +ldr q25, [x17, #+944] +ldr q18, [x17, #+960] +ldr q11, [x17, #+976] +ldr q24, [x17, #+992] +ldr q28, [x17, #+1008] +ldr q6, [x0, #416] 
+ldr q0, [x0, #432] +ldr q9, [x0, #384] +ldr q23, [x0, #400] +sqrdmulh v21.4S, v6.4S, v1.s[0] +mul v6.4S, v6.4S,v15.s[0] +mla v6.4S, v21.4S, v31.s[0] +sub v21.4s, v9.4s, v6.4s +add v9.4s, v9.4s, v6.4s +sqrdmulh v6.4S, v0.4S, v1.s[0] +mul v0.4S, v0.4S,v15.s[0] +mla v0.4S, v6.4S, v31.s[0] +sub v6.4s, v23.4s, v0.4s +add v23.4s, v23.4s, v0.4s +sqrdmulh v0.4S, v23.4S, v1.s[1] +mul v23.4S, v23.4S,v15.s[1] +mla v23.4S, v0.4S, v31.s[0] +sub v0.4s, v9.4s, v23.4s +add v9.4s, v9.4s, v23.4s +sqrdmulh v23.4S, v6.4S, v1.s[2] +mul v6.4S, v6.4S,v15.s[2] +mla v6.4S, v23.4S, v31.s[0] +sub v23.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +trn1 v6.4S, v9.4S, v0.4S +trn2 v8.4S, v9.4S, v0.4S +trn1 v13.4S, v21.4S, v23.4S +trn2 v5.4S, v21.4S, v23.4S +trn2 v21.2D, v6.2D, v13.2D +trn2 v23.2D, v8.2D, v5.2D +trn1 v9.2D, v6.2D, v13.2D +trn1 v0.2D, v8.2D, v5.2D +sqrdmulh v5.4S, v21.4S, v25.4S +mul v21.4S, v21.4S,v22.4S +mla v21.4S, v5.4S, v31.s[0] +sub v5.4s, v9.4s, v21.4s +add v9.4s, v9.4s, v21.4s +sqrdmulh v21.4S, v23.4S, v25.4S +mul v23.4S, v23.4S,v22.4S +mla v23.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v23.4s +add v0.4s, v0.4s, v23.4s +sqrdmulh v23.4S, v0.4S, v11.4S +mul v0.4S, v0.4S,v18.4S +mla v0.4S, v23.4S, v31.s[0] +sub v23.4s, v9.4s, v0.4s +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v21.4S, v28.4S +mul v21.4S, v21.4S,v24.4S +mla v21.4S, v0.4S, v31.s[0] +sub v0.4s, v5.4s, v21.4s +add v5.4s, v5.4s, v21.4s +str q9, [x0, #384] +str q23, [x0, #400] +str q5, [x0, #416] +str q0, [x0, #432] +ldr q0, [x17, #+1024] +ldr q5, [x17, #+1040] +ldr q23, [x17, #+1056] +ldr q9, [x17, #+1072] +ldr q21, [x17, #+1088] +ldr q8, [x17, #+1104] +ldr q13, [x17, #+1120] +ldr q6, [x17, #+1136] +ldr q28, [x0, #480] +ldr q24, [x0, #496] +ldr q11, [x0, #448] +ldr q18, [x0, #464] +sqrdmulh v25.4S, v28.4S, v5.s[0] +mul v28.4S, v28.4S,v0.s[0] +mla v28.4S, v25.4S, v31.s[0] +sub v25.4s, v11.4s, v28.4s +add v11.4s, v11.4s, v28.4s +sqrdmulh v28.4S, v24.4S, v5.s[0] +mul v24.4S, v24.4S,v0.s[0] +mla v24.4S, v28.4S, v31.s[0] 
+sub v28.4s, v18.4s, v24.4s +add v18.4s, v18.4s, v24.4s +sqrdmulh v24.4S, v18.4S, v5.s[1] +mul v18.4S, v18.4S,v0.s[1] +mla v18.4S, v24.4S, v31.s[0] +sub v24.4s, v11.4s, v18.4s +add v11.4s, v11.4s, v18.4s +sqrdmulh v18.4S, v28.4S, v5.s[2] +mul v28.4S, v28.4S,v0.s[2] +mla v28.4S, v18.4S, v31.s[0] +sub v18.4s, v25.4s, v28.4s +add v25.4s, v25.4s, v28.4s +trn1 v28.4S, v11.4S, v24.4S +trn2 v22.4S, v11.4S, v24.4S +trn1 v1.4S, v25.4S, v18.4S +trn2 v15.4S, v25.4S, v18.4S +trn2 v25.2D, v28.2D, v1.2D +trn2 v18.2D, v22.2D, v15.2D +trn1 v11.2D, v28.2D, v1.2D +trn1 v24.2D, v22.2D, v15.2D +sqrdmulh v15.4S, v25.4S, v9.4S +mul v25.4S, v25.4S,v23.4S +mla v25.4S, v15.4S, v31.s[0] +sub v15.4s, v11.4s, v25.4s +add v11.4s, v11.4s, v25.4s +sqrdmulh v25.4S, v18.4S, v9.4S +mul v18.4S, v18.4S,v23.4S +mla v18.4S, v25.4S, v31.s[0] +sub v25.4s, v24.4s, v18.4s +add v24.4s, v24.4s, v18.4s +sqrdmulh v18.4S, v24.4S, v8.4S +mul v24.4S, v24.4S,v21.4S +mla v24.4S, v18.4S, v31.s[0] +sub v18.4s, v11.4s, v24.4s +add v11.4s, v11.4s, v24.4s +sqrdmulh v24.4S, v25.4S, v6.4S +mul v25.4S, v25.4S,v13.4S +mla v25.4S, v24.4S, v31.s[0] +sub v24.4s, v15.4s, v25.4s +add v15.4s, v15.4s, v25.4s +str q11, [x0, #448] +str q18, [x0, #464] +str q15, [x0, #480] +str q24, [x0, #496] +ldr q24, [x17, #+1152] +ldr q15, [x17, #+1168] +ldr q18, [x17, #+1184] +ldr q11, [x17, #+1200] +ldr q25, [x17, #+1216] +ldr q22, [x17, #+1232] +ldr q1, [x17, #+1248] +ldr q28, [x17, #+1264] +ldr q6, [x0, #544] +ldr q13, [x0, #560] +ldr q8, [x0, #512] +ldr q21, [x0, #528] +sqrdmulh v9.4S, v6.4S, v15.s[0] +mul v6.4S, v6.4S,v24.s[0] +mla v6.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v6.4s +add v8.4s, v8.4s, v6.4s +sqrdmulh v6.4S, v13.4S, v15.s[0] +mul v13.4S, v13.4S,v24.s[0] +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v21.4S, v15.s[1] +mul v21.4S, v21.4S,v24.s[1] +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v8.4s, v21.4s +add v8.4s, v8.4s, v21.4s +sqrdmulh v21.4S, v6.4S, v15.s[2] +mul v6.4S, 
v6.4S,v24.s[2] +mla v6.4S, v21.4S, v31.s[0] +sub v21.4s, v9.4s, v6.4s +add v9.4s, v9.4s, v6.4s +trn1 v6.4S, v8.4S, v13.4S +trn2 v23.4S, v8.4S, v13.4S +trn1 v5.4S, v9.4S, v21.4S +trn2 v0.4S, v9.4S, v21.4S +trn2 v9.2D, v6.2D, v5.2D +trn2 v21.2D, v23.2D, v0.2D +trn1 v8.2D, v6.2D, v5.2D +trn1 v13.2D, v23.2D, v0.2D +sqrdmulh v0.4S, v9.4S, v11.4S +mul v9.4S, v9.4S,v18.4S +mla v9.4S, v0.4S, v31.s[0] +sub v0.4s, v8.4s, v9.4s +add v8.4s, v8.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v11.4S +mul v21.4S, v21.4S,v18.4S +mla v21.4S, v9.4S, v31.s[0] +sub v9.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v22.4S +mul v13.4S, v13.4S,v25.4S +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +sqrdmulh v13.4S, v9.4S, v28.4S +mul v9.4S, v9.4S,v1.4S +mla v9.4S, v13.4S, v31.s[0] +sub v13.4s, v0.4s, v9.4s +add v0.4s, v0.4s, v9.4s +str q8, [x0, #512] +str q21, [x0, #528] +str q0, [x0, #544] +str q13, [x0, #560] +ldr q13, [x17, #+1280] +ldr q0, [x17, #+1296] +ldr q21, [x17, #+1312] +ldr q8, [x17, #+1328] +ldr q9, [x17, #+1344] +ldr q23, [x17, #+1360] +ldr q5, [x17, #+1376] +ldr q6, [x17, #+1392] +ldr q28, [x0, #608] +ldr q1, [x0, #624] +ldr q22, [x0, #576] +ldr q25, [x0, #592] +sqrdmulh v11.4S, v28.4S, v0.s[0] +mul v28.4S, v28.4S,v13.s[0] +mla v28.4S, v11.4S, v31.s[0] +sub v11.4s, v22.4s, v28.4s +add v22.4s, v22.4s, v28.4s +sqrdmulh v28.4S, v1.4S, v0.s[0] +mul v1.4S, v1.4S,v13.s[0] +mla v1.4S, v28.4S, v31.s[0] +sub v28.4s, v25.4s, v1.4s +add v25.4s, v25.4s, v1.4s +sqrdmulh v1.4S, v25.4S, v0.s[1] +mul v25.4S, v25.4S,v13.s[1] +mla v25.4S, v1.4S, v31.s[0] +sub v1.4s, v22.4s, v25.4s +add v22.4s, v22.4s, v25.4s +sqrdmulh v25.4S, v28.4S, v0.s[2] +mul v28.4S, v28.4S,v13.s[2] +mla v28.4S, v25.4S, v31.s[0] +sub v25.4s, v11.4s, v28.4s +add v11.4s, v11.4s, v28.4s +trn1 v28.4S, v22.4S, v1.4S +trn2 v18.4S, v22.4S, v1.4S +trn1 v15.4S, v11.4S, v25.4S +trn2 v24.4S, v11.4S, v25.4S +trn2 v11.2D, v28.2D, v15.2D +trn2 v25.2D, v18.2D, v24.2D +trn1 v22.2D, 
v28.2D, v15.2D +trn1 v1.2D, v18.2D, v24.2D +sqrdmulh v24.4S, v11.4S, v8.4S +mul v11.4S, v11.4S,v21.4S +mla v11.4S, v24.4S, v31.s[0] +sub v24.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v25.4S, v8.4S +mul v25.4S, v25.4S,v21.4S +mla v25.4S, v11.4S, v31.s[0] +sub v11.4s, v1.4s, v25.4s +add v1.4s, v1.4s, v25.4s +sqrdmulh v25.4S, v1.4S, v23.4S +mul v1.4S, v1.4S,v9.4S +mla v1.4S, v25.4S, v31.s[0] +sub v25.4s, v22.4s, v1.4s +add v22.4s, v22.4s, v1.4s +sqrdmulh v1.4S, v11.4S, v6.4S +mul v11.4S, v11.4S,v5.4S +mla v11.4S, v1.4S, v31.s[0] +sub v1.4s, v24.4s, v11.4s +add v24.4s, v24.4s, v11.4s +str q22, [x0, #576] +str q25, [x0, #592] +str q24, [x0, #608] +str q1, [x0, #624] +ldr q1, [x17, #+1408] +ldr q24, [x17, #+1424] +ldr q25, [x17, #+1440] +ldr q22, [x17, #+1456] +ldr q11, [x17, #+1472] +ldr q18, [x17, #+1488] +ldr q15, [x17, #+1504] +ldr q28, [x17, #+1520] +ldr q6, [x0, #672] +ldr q5, [x0, #688] +ldr q23, [x0, #640] +ldr q9, [x0, #656] +sqrdmulh v8.4S, v6.4S, v24.s[0] +mul v6.4S, v6.4S,v1.s[0] +mla v6.4S, v8.4S, v31.s[0] +sub v8.4s, v23.4s, v6.4s +add v23.4s, v23.4s, v6.4s +sqrdmulh v6.4S, v5.4S, v24.s[0] +mul v5.4S, v5.4S,v1.s[0] +mla v5.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v5.4s +add v9.4s, v9.4s, v5.4s +sqrdmulh v5.4S, v9.4S, v24.s[1] +mul v9.4S, v9.4S,v1.s[1] +mla v9.4S, v5.4S, v31.s[0] +sub v5.4s, v23.4s, v9.4s +add v23.4s, v23.4s, v9.4s +sqrdmulh v9.4S, v6.4S, v24.s[2] +mul v6.4S, v6.4S,v1.s[2] +mla v6.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v6.4s +add v8.4s, v8.4s, v6.4s +trn1 v6.4S, v23.4S, v5.4S +trn2 v21.4S, v23.4S, v5.4S +trn1 v0.4S, v8.4S, v9.4S +trn2 v13.4S, v8.4S, v9.4S +trn2 v8.2D, v6.2D, v0.2D +trn2 v9.2D, v21.2D, v13.2D +trn1 v23.2D, v6.2D, v0.2D +trn1 v5.2D, v21.2D, v13.2D +sqrdmulh v13.4S, v8.4S, v22.4S +mul v8.4S, v8.4S,v25.4S +mla v8.4S, v13.4S, v31.s[0] +sub v13.4s, v23.4s, v8.4s +add v23.4s, v23.4s, v8.4s +sqrdmulh v8.4S, v9.4S, v22.4S +mul v9.4S, v9.4S,v25.4S +mla v9.4S, v8.4S, v31.s[0] +sub v8.4s, v5.4s, v9.4s +add v5.4s, 
v5.4s, v9.4s +sqrdmulh v9.4S, v5.4S, v18.4S +mul v5.4S, v5.4S,v11.4S +mla v5.4S, v9.4S, v31.s[0] +sub v9.4s, v23.4s, v5.4s +add v23.4s, v23.4s, v5.4s +sqrdmulh v5.4S, v8.4S, v28.4S +mul v8.4S, v8.4S,v15.4S +mla v8.4S, v5.4S, v31.s[0] +sub v5.4s, v13.4s, v8.4s +add v13.4s, v13.4s, v8.4s +str q23, [x0, #640] +str q9, [x0, #656] +str q13, [x0, #672] +str q5, [x0, #688] +ldr q5, [x17, #+1536] +ldr q13, [x17, #+1552] +ldr q9, [x17, #+1568] +ldr q23, [x17, #+1584] +ldr q8, [x17, #+1600] +ldr q21, [x17, #+1616] +ldr q0, [x17, #+1632] +ldr q6, [x17, #+1648] +ldr q28, [x0, #736] +ldr q15, [x0, #752] +ldr q18, [x0, #704] +ldr q11, [x0, #720] +sqrdmulh v22.4S, v28.4S, v13.s[0] +mul v28.4S, v28.4S,v5.s[0] +mla v28.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v28.4s +add v18.4s, v18.4s, v28.4s +sqrdmulh v28.4S, v15.4S, v13.s[0] +mul v15.4S, v15.4S,v5.s[0] +mla v15.4S, v28.4S, v31.s[0] +sub v28.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v13.s[1] +mul v11.4S, v11.4S,v5.s[1] +mla v11.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v11.4s +add v18.4s, v18.4s, v11.4s +sqrdmulh v11.4S, v28.4S, v13.s[2] +mul v28.4S, v28.4S,v5.s[2] +mla v28.4S, v11.4S, v31.s[0] +sub v11.4s, v22.4s, v28.4s +add v22.4s, v22.4s, v28.4s +trn1 v28.4S, v18.4S, v15.4S +trn2 v25.4S, v18.4S, v15.4S +trn1 v24.4S, v22.4S, v11.4S +trn2 v1.4S, v22.4S, v11.4S +trn2 v22.2D, v28.2D, v24.2D +trn2 v11.2D, v25.2D, v1.2D +trn1 v18.2D, v28.2D, v24.2D +trn1 v15.2D, v25.2D, v1.2D +sqrdmulh v1.4S, v22.4S, v23.4S +mul v22.4S, v22.4S,v9.4S +mla v22.4S, v1.4S, v31.s[0] +sub v1.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v11.4S, v23.4S +mul v11.4S, v11.4S,v9.4S +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v11.4s +add v15.4s, v15.4s, v11.4s +sqrdmulh v11.4S, v15.4S, v21.4S +mul v15.4S, v15.4S,v8.4S +mla v15.4S, v11.4S, v31.s[0] +sub v11.4s, v18.4s, v15.4s +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v22.4S, v6.4S +mul v22.4S, v22.4S,v0.4S +mla v22.4S, v15.4S, v31.s[0] +sub v15.4s, 
v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +str q18, [x0, #704] +str q11, [x0, #720] +str q1, [x0, #736] +str q15, [x0, #752] +ldr q15, [x17, #+1664] +ldr q1, [x17, #+1680] +ldr q11, [x17, #+1696] +ldr q18, [x17, #+1712] +ldr q22, [x17, #+1728] +ldr q25, [x17, #+1744] +ldr q24, [x17, #+1760] +ldr q28, [x17, #+1776] +ldr q6, [x0, #800] +ldr q0, [x0, #816] +ldr q21, [x0, #768] +ldr q8, [x0, #784] +sqrdmulh v23.4S, v6.4S, v1.s[0] +mul v6.4S, v6.4S,v15.s[0] +mla v6.4S, v23.4S, v31.s[0] +sub v23.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +sqrdmulh v6.4S, v0.4S, v1.s[0] +mul v0.4S, v0.4S,v15.s[0] +mla v0.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v0.4s +add v8.4s, v8.4s, v0.4s +sqrdmulh v0.4S, v8.4S, v1.s[1] +mul v8.4S, v8.4S,v15.s[1] +mla v8.4S, v0.4S, v31.s[0] +sub v0.4s, v21.4s, v8.4s +add v21.4s, v21.4s, v8.4s +sqrdmulh v8.4S, v6.4S, v1.s[2] +mul v6.4S, v6.4S,v15.s[2] +mla v6.4S, v8.4S, v31.s[0] +sub v8.4s, v23.4s, v6.4s +add v23.4s, v23.4s, v6.4s +trn1 v6.4S, v21.4S, v0.4S +trn2 v9.4S, v21.4S, v0.4S +trn1 v13.4S, v23.4S, v8.4S +trn2 v5.4S, v23.4S, v8.4S +trn2 v23.2D, v6.2D, v13.2D +trn2 v8.2D, v9.2D, v5.2D +trn1 v21.2D, v6.2D, v13.2D +trn1 v0.2D, v9.2D, v5.2D +sqrdmulh v5.4S, v23.4S, v18.4S +mul v23.4S, v23.4S,v11.4S +mla v23.4S, v5.4S, v31.s[0] +sub v5.4s, v21.4s, v23.4s +add v21.4s, v21.4s, v23.4s +sqrdmulh v23.4S, v8.4S, v18.4S +mul v8.4S, v8.4S,v11.4S +mla v8.4S, v23.4S, v31.s[0] +sub v23.4s, v0.4s, v8.4s +add v0.4s, v0.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v25.4S +mul v0.4S, v0.4S,v22.4S +mla v0.4S, v8.4S, v31.s[0] +sub v8.4s, v21.4s, v0.4s +add v21.4s, v21.4s, v0.4s +sqrdmulh v0.4S, v23.4S, v28.4S +mul v23.4S, v23.4S,v24.4S +mla v23.4S, v0.4S, v31.s[0] +sub v0.4s, v5.4s, v23.4s +add v5.4s, v5.4s, v23.4s +str q21, [x0, #768] +str q8, [x0, #784] +str q5, [x0, #800] +str q0, [x0, #816] +ldr q0, [x17, #+1792] +ldr q5, [x17, #+1808] +ldr q8, [x17, #+1824] +ldr q21, [x17, #+1840] +ldr q23, [x17, #+1856] +ldr q9, [x17, #+1872] +ldr q13, [x17, #+1888] +ldr q6, [x17, #+1904] 
+ldr q28, [x0, #864] +ldr q24, [x0, #880] +ldr q25, [x0, #832] +ldr q22, [x0, #848] +sqrdmulh v18.4S, v28.4S, v5.s[0] +mul v28.4S, v28.4S,v0.s[0] +mla v28.4S, v18.4S, v31.s[0] +sub v18.4s, v25.4s, v28.4s +add v25.4s, v25.4s, v28.4s +sqrdmulh v28.4S, v24.4S, v5.s[0] +mul v24.4S, v24.4S,v0.s[0] +mla v24.4S, v28.4S, v31.s[0] +sub v28.4s, v22.4s, v24.4s +add v22.4s, v22.4s, v24.4s +sqrdmulh v24.4S, v22.4S, v5.s[1] +mul v22.4S, v22.4S,v0.s[1] +mla v22.4S, v24.4S, v31.s[0] +sub v24.4s, v25.4s, v22.4s +add v25.4s, v25.4s, v22.4s +sqrdmulh v22.4S, v28.4S, v5.s[2] +mul v28.4S, v28.4S,v0.s[2] +mla v28.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v28.4s +add v18.4s, v18.4s, v28.4s +trn1 v28.4S, v25.4S, v24.4S +trn2 v11.4S, v25.4S, v24.4S +trn1 v1.4S, v18.4S, v22.4S +trn2 v15.4S, v18.4S, v22.4S +trn2 v18.2D, v28.2D, v1.2D +trn2 v22.2D, v11.2D, v15.2D +trn1 v25.2D, v28.2D, v1.2D +trn1 v24.2D, v11.2D, v15.2D +sqrdmulh v15.4S, v18.4S, v21.4S +mul v18.4S, v18.4S,v8.4S +mla v18.4S, v15.4S, v31.s[0] +sub v15.4s, v25.4s, v18.4s +add v25.4s, v25.4s, v18.4s +sqrdmulh v18.4S, v22.4S, v21.4S +mul v22.4S, v22.4S,v8.4S +mla v22.4S, v18.4S, v31.s[0] +sub v18.4s, v24.4s, v22.4s +add v24.4s, v24.4s, v22.4s +sqrdmulh v22.4S, v24.4S, v9.4S +mul v24.4S, v24.4S,v23.4S +mla v24.4S, v22.4S, v31.s[0] +sub v22.4s, v25.4s, v24.4s +add v25.4s, v25.4s, v24.4s +sqrdmulh v24.4S, v18.4S, v6.4S +mul v18.4S, v18.4S,v13.4S +mla v18.4S, v24.4S, v31.s[0] +sub v24.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +str q25, [x0, #832] +str q22, [x0, #848] +str q15, [x0, #864] +str q24, [x0, #880] +ldr q24, [x17, #+1920] +ldr q15, [x17, #+1936] +ldr q22, [x17, #+1952] +ldr q25, [x17, #+1968] +ldr q18, [x17, #+1984] +ldr q11, [x17, #+2000] +ldr q1, [x17, #+2016] +ldr q28, [x17, #+2032] +ldr q6, [x0, #928] +ldr q13, [x0, #944] +ldr q9, [x0, #896] +ldr q23, [x0, #912] +sqrdmulh v21.4S, v6.4S, v15.s[0] +mul v6.4S, v6.4S,v24.s[0] +mla v6.4S, v21.4S, v31.s[0] +sub v21.4s, v9.4s, v6.4s +add v9.4s, v9.4s, v6.4s +sqrdmulh v6.4S, 
v13.4S, v15.s[0] +mul v13.4S, v13.4S,v24.s[0] +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v23.4s, v13.4s +add v23.4s, v23.4s, v13.4s +sqrdmulh v13.4S, v23.4S, v15.s[1] +mul v23.4S, v23.4S,v24.s[1] +mla v23.4S, v13.4S, v31.s[0] +sub v13.4s, v9.4s, v23.4s +add v9.4s, v9.4s, v23.4s +sqrdmulh v23.4S, v6.4S, v15.s[2] +mul v6.4S, v6.4S,v24.s[2] +mla v6.4S, v23.4S, v31.s[0] +sub v23.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +trn1 v6.4S, v9.4S, v13.4S +trn2 v8.4S, v9.4S, v13.4S +trn1 v5.4S, v21.4S, v23.4S +trn2 v0.4S, v21.4S, v23.4S +trn2 v21.2D, v6.2D, v5.2D +trn2 v23.2D, v8.2D, v0.2D +trn1 v9.2D, v6.2D, v5.2D +trn1 v13.2D, v8.2D, v0.2D +sqrdmulh v0.4S, v21.4S, v25.4S +mul v21.4S, v21.4S,v22.4S +mla v21.4S, v0.4S, v31.s[0] +sub v0.4s, v9.4s, v21.4s +add v9.4s, v9.4s, v21.4s +sqrdmulh v21.4S, v23.4S, v25.4S +mul v23.4S, v23.4S,v22.4S +mla v23.4S, v21.4S, v31.s[0] +sub v21.4s, v13.4s, v23.4s +add v13.4s, v13.4s, v23.4s +sqrdmulh v23.4S, v13.4S, v11.4S +mul v13.4S, v13.4S,v18.4S +mla v13.4S, v23.4S, v31.s[0] +sub v23.4s, v9.4s, v13.4s +add v9.4s, v9.4s, v13.4s +sqrdmulh v13.4S, v21.4S, v28.4S +mul v21.4S, v21.4S,v1.4S +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +str q9, [x0, #896] +str q23, [x0, #912] +str q0, [x0, #928] +str q13, [x0, #944] +ldr q13, [x17, #+2048] +ldr q0, [x17, #+2064] +ldr q23, [x17, #+2080] +ldr q9, [x17, #+2096] +ldr q21, [x17, #+2112] +ldr q8, [x17, #+2128] +ldr q5, [x17, #+2144] +ldr q6, [x17, #+2160] +ldr q28, [x0, #992] +ldr q1, [x0, #1008] +ldr q11, [x0, #960] +ldr q18, [x0, #976] +sqrdmulh v25.4S, v28.4S, v0.s[0] +mul v28.4S, v28.4S,v13.s[0] +mla v28.4S, v25.4S, v31.s[0] +sub v25.4s, v11.4s, v28.4s +add v11.4s, v11.4s, v28.4s +sqrdmulh v28.4S, v1.4S, v0.s[0] +mul v1.4S, v1.4S,v13.s[0] +mla v1.4S, v28.4S, v31.s[0] +sub v28.4s, v18.4s, v1.4s +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v18.4S, v0.s[1] +mul v18.4S, v18.4S,v13.s[1] +mla v18.4S, v1.4S, v31.s[0] +sub v1.4s, v11.4s, v18.4s +add v11.4s, v11.4s, v18.4s 
+sqrdmulh v18.4S, v28.4S, v0.s[2] +mul v28.4S, v28.4S,v13.s[2] +mla v28.4S, v18.4S, v31.s[0] +sub v18.4s, v25.4s, v28.4s +add v25.4s, v25.4s, v28.4s +trn1 v28.4S, v11.4S, v1.4S +trn2 v22.4S, v11.4S, v1.4S +trn1 v15.4S, v25.4S, v18.4S +trn2 v24.4S, v25.4S, v18.4S +trn2 v25.2D, v28.2D, v15.2D +trn2 v18.2D, v22.2D, v24.2D +trn1 v11.2D, v28.2D, v15.2D +trn1 v1.2D, v22.2D, v24.2D +sqrdmulh v24.4S, v25.4S, v9.4S +mul v25.4S, v25.4S,v23.4S +mla v25.4S, v24.4S, v31.s[0] +sub v24.4s, v11.4s, v25.4s +add v11.4s, v11.4s, v25.4s +sqrdmulh v25.4S, v18.4S, v9.4S +mul v18.4S, v18.4S,v23.4S +mla v18.4S, v25.4S, v31.s[0] +sub v25.4s, v1.4s, v18.4s +add v1.4s, v1.4s, v18.4s +sqrdmulh v18.4S, v1.4S, v8.4S +mul v1.4S, v1.4S,v21.4S +mla v1.4S, v18.4S, v31.s[0] +sub v18.4s, v11.4s, v1.4s +add v11.4s, v11.4s, v1.4s +sqrdmulh v1.4S, v25.4S, v6.4S +mul v25.4S, v25.4S,v5.4S +mla v25.4S, v1.4S, v31.s[0] +sub v1.4s, v24.4s, v25.4s +add v24.4s, v24.4s, v25.4s +str q11, [x0, #960] +str q18, [x0, #976] +str q24, [x0, #992] +str q1, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2456 +// Instruction count: 2452 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_1_0.s b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_1_0.s new file mode 100644 index 0000000..d6c8d9d --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_1_0.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), 
to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include // NOTE(review): the header operand is missing here - presumably an angle-bracket name lost in extraction. ASM_LOAD, used by the routine below, is not defined in this file, so it likely comes from that header. Restore from upstream. +modulus: // One 16-byte vector: word 0 is -33556993, words 1-3 are zero. The routine below loads it into q31 and the visible mla instructions all use lane v31.s[0] as their by-element multiplier. +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 // exponent form: 2^6 = 64-byte alignment for the table that follows +roots_merged: // Constant table read through x17 in 16-byte (four-word) rows. Rows come in pairs carrying the same layer/block tags: the first row of a pair feeds the by-element operand of mul, the second feeds the matching sqrdmulh (seen for the first eight rows, loaded into q30..q23). The second-row value looks like first-row value * 2^31 / 33556993 (e.g. 17702291 -> 1132860160) - NOTE(review): confirm against the generator. Words tagged "Layer None" are zero padding that fills out a row. +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 
1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 
6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 
+.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, 
block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 
31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 
26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 
+.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // 
Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 
28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 
608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_1_0 +.global _ntt_u32_full_neon_asm_var_4_4_1_0 +ntt_u32_full_neon_asm_var_4_4_1_0: +_ntt_u32_full_neon_asm_var_4_4_1_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +ldr q2, [x0, #544] +ldr q1, [x0, #608] +ldr q0, [x0, #672] +ldr q15, [x0, #736] +ldr q14, [x0, #32] +ldr q13, [x0, #96] +ldr q12, [x0, #160] +ldr q11, [x0, #224] +sqrdmulh v10.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sqrdmulh v9.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +mla v22.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v21.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +mla v20.4S, v10.4S, v31.s[0] +sub v10.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s 
+sqrdmulh v21.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +mla v19.4S, v22.4S, v31.s[0] +sub v22.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +mla v2.4S, v21.4S, v31.s[0] +sub v21.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v0.4S, v29.s[0] +mul v0.4S, v0.4S,v30.s[0] +mla v1.4S, v20.4S, v31.s[0] +sub v20.4s, v14.4s, v2.4s +add v14.4s, v14.4s, v2.4s +sqrdmulh v2.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +mla v0.4S, v19.4S, v31.s[0] +sub v19.4s, v13.4s, v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v12.4s, v0.4s +add v12.4s, v12.4s, v0.4s +sqrdmulh v0.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v16.4S, v1.4S, v31.s[0] +sub v1.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +mla v3.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v16.4s +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +mla v18.4S, v15.4S, v31.s[0] +sub v15.4s, v11.4s, v3.4s +add v11.4s, v11.4s, v3.4s +sqrdmulh v3.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v14.4s, v18.4s +add v14.4s, v14.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +mla v22.4S, v3.4S, v31.s[0] +sub v3.4s, v13.4s, v17.4s +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v2.4s, v22.4s +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +mla v9.4S, v17.4S, v31.s[0] +sub v17.4s, v1.4s, v21.4s +add v1.4s, v1.4s, v21.4s +sqrdmulh v21.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v20.4s, v9.4s +add v20.4s, v20.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v27.s[0] +mul v11.4S, v11.4S,v28.s[0] +mla v12.4S, v21.4S, v31.s[0] +sub 
v21.4s, v19.4s, v10.4s +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +mla v11.4S, v9.4S, v31.s[0] +sub v9.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +mla v0.4S, v10.4S, v31.s[0] +sub v10.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +mla v15.4S, v12.4S, v31.s[0] +sub v12.4s, v16.4s, v0.4s +add v16.4s, v16.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +mla v2.4S, v11.4S, v31.s[0] +sub v11.4s, v3.4s, v15.4s +add v3.4s, v3.4s, v15.4s +sqrdmulh v15.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +mla v1.4S, v0.4S, v31.s[0] +sub v0.4s, v20.4s, v2.4s +add v20.4s, v20.4s, v2.4s +sqrdmulh v2.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +mla v18.4S, v15.4S, v31.s[0] +sub v15.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v25.s[0] +mul v13.4S, v13.4S,v26.s[0] +mla v17.4S, v2.4S, v31.s[0] +sub v2.4s, v22.4s, v18.4s +add v22.4s, v22.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v25.s[1] +mul v10.4S, v10.4S,v26.s[1] +mla v13.4S, v1.4S, v31.s[0] +sub v1.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v25.s[2] +mul v3.4S, v3.4S,v26.s[2] +mla v10.4S, v18.4S, v31.s[0] +sub v18.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +sqrdmulh v13.4S, v11.4S, v25.s[3] +mul v11.4S, v11.4S,v26.s[3] +mla v3.4S, v17.4S, v31.s[0] +sub v17.4s, v9.4s, v10.4s +add v9.4s, v9.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v23.s[0] +mul v19.4S, v19.4S,v24.s[0] +mla v11.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v15.4S, v23.s[1] +mul v15.4S, v15.4S,v24.s[1] +mla v19.4S, v10.4S, v31.s[0] +sub v10.4s, v12.4s, v11.4s +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v23.s[2] +mul v21.4S, v21.4S,v24.s[2] +mla v15.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v23.s[3] +mul 
v1.4S, v1.4S,v24.s[3] +mla v21.4S, v11.4S, v31.s[0] +sub v11.4s, v0.4s, v15.4s +add v0.4s, v0.4s, v15.4s +mla v1.4S, v19.4S, v31.s[0] +sub v19.4s, v22.4s, v21.4s +add v22.4s, v22.4s, v21.4s +sub v21.4s, v2.4s, v1.4s +add v2.4s, v2.4s, v1.4s +str q14, [x0, #32] +str q18, [x0, #96] +str q9, [x0, #160] +str q17, [x0, #224] +str q16, [x0, #288] +str q13, [x0, #352] +str q12, [x0, #416] +str q10, [x0, #480] +str q20, [x0, #544] +str q3, [x0, #608] +str q0, [x0, #672] +str q11, [x0, #736] +str q22, [x0, #800] +str q19, [x0, #864] +str q2, [x0, #928] +str q21, [x0, #992] +ldr q21, [x0, #816] +ldr q2, [x0, #880] +ldr q19, [x0, #944] +ldr q22, [x0, #1008] +ldr q11, [x0, #304] +ldr q0, [x0, #368] +ldr q3, [x0, #432] +ldr q20, [x0, #496] +ldr q10, [x0, #560] +ldr q12, [x0, #624] +ldr q13, [x0, #688] +ldr q16, [x0, #752] +ldr q17, [x0, #48] +ldr q9, [x0, #112] +ldr q18, [x0, #176] +ldr q14, [x0, #240] +sqrdmulh v1.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sqrdmulh v15.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +mla v21.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +mla v2.4S, v15.4S, v31.s[0] +sub v15.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +mla v19.4S, v1.4S, v31.s[0] +sub v1.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +mla v10.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v22.4s +add v20.4s, v20.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +mla v12.4S, v19.4S, v31.s[0] +sub v19.4s, v17.4s, v10.4s +add v17.4s, v17.4s, v10.4s +sqrdmulh v10.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v9.4s, v12.4s +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] 
+mla v16.4S, v10.4S, v31.s[0] +sub v10.4s, v18.4s, v13.4s +add v18.4s, v18.4s, v13.4s +sqrdmulh v13.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +mla v3.4S, v12.4S, v31.s[0] +sub v12.4s, v14.4s, v16.4s +add v14.4s, v14.4s, v16.4s +sqrdmulh v16.4S, v11.4S, v29.s[1] +mul v11.4S, v11.4S,v30.s[1] +mla v20.4S, v13.4S, v31.s[0] +sub v13.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v29.s[1] +mul v0.4S, v0.4S,v30.s[1] +mla v11.4S, v16.4S, v31.s[0] +sub v16.4s, v14.4s, v20.4s +add v14.4s, v14.4s, v20.4s +sqrdmulh v20.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +mla v0.4S, v3.4S, v31.s[0] +sub v3.4s, v17.4s, v11.4s +add v17.4s, v17.4s, v11.4s +sqrdmulh v11.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +mla v21.4S, v20.4S, v31.s[0] +sub v20.4s, v9.4s, v0.4s +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +mla v2.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +mla v15.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v2.4s +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v18.4S, v27.s[0] +mul v18.4S, v18.4S,v28.s[0] +mla v1.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v15.4s +add v19.4s, v19.4s, v15.4s +sqrdmulh v15.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +mla v18.4S, v2.4S, v31.s[0] +sub v2.4s, v22.4s, v1.4s +add v22.4s, v22.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v27.s[1] +mul v13.4S, v13.4S,v28.s[1] +mla v14.4S, v15.4S, v31.s[0] +sub v15.4s, v17.4s, v18.4s +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +mla v13.4S, v1.4S, v31.s[0] +sub v1.4s, v9.4s, v14.4s +add v9.4s, v9.4s, v14.4s +sqrdmulh v14.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +mla v16.4S, v18.4S, v31.s[0] +sub v18.4s, v3.4s, v13.4s +add v3.4s, v3.4s, v13.4s +sqrdmulh v13.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +mla v10.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v16.4s +add v20.4s, v20.4s, v16.4s 
+sqrdmulh v16.4S, v11.4S, v27.s[3] +mul v11.4S, v11.4S,v28.s[3] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v19.4s, v10.4s +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v27.s[3] +mul v0.4S, v0.4S,v28.s[3] +mla v11.4S, v16.4S, v31.s[0] +sub v16.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v9.4S, v25.s[0] +mul v9.4S, v9.4S,v26.s[0] +mla v0.4S, v10.4S, v31.s[0] +sub v10.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v1.4S, v25.s[1] +mul v1.4S, v1.4S,v26.s[1] +mla v9.4S, v12.4S, v31.s[0] +sub v12.4s, v2.4s, v0.4s +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v20.4S, v25.s[2] +mul v20.4S, v20.4S,v26.s[2] +mla v1.4S, v11.4S, v31.s[0] +sub v11.4s, v17.4s, v9.4s +add v17.4s, v17.4s, v9.4s +sqrdmulh v9.4S, v14.4S, v25.s[3] +mul v14.4S, v14.4S,v26.s[3] +mla v20.4S, v0.4S, v31.s[0] +sub v0.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s +sqrdmulh v1.4S, v22.4S, v23.s[0] +mul v22.4S, v22.4S,v24.s[0] +mla v14.4S, v9.4S, v31.s[0] +sub v9.4s, v3.4s, v20.4s +add v3.4s, v3.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v23.s[1] +mul v16.4S, v16.4S,v24.s[1] +mla v22.4S, v1.4S, v31.s[0] +sub v1.4s, v18.4s, v14.4s +add v18.4s, v18.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v23.s[2] +mul v2.4S, v2.4S,v24.s[2] +mla v16.4S, v20.4S, v31.s[0] +sub v20.4s, v19.4s, v22.4s +add v19.4s, v19.4s, v22.4s +sqrdmulh v22.4S, v12.4S, v23.s[3] +mul v12.4S, v12.4S,v24.s[3] +mla v2.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v16.4s +add v13.4s, v13.4s, v16.4s +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v2.4s +add v21.4s, v21.4s, v2.4s +sub v2.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +str q17, [x0, #48] +str q11, [x0, #112] +str q15, [x0, #176] +str q0, [x0, #240] +str q3, [x0, #304] +str q9, [x0, #368] +str q18, [x0, #432] +str q1, [x0, #496] +str q19, [x0, #560] +str q20, [x0, #624] +str q13, [x0, #688] +str q14, [x0, #752] +str q21, [x0, #816] +str q22, [x0, #880] +str q10, [x0, #944] +str q2, [x0, #1008] +ldr q2, [x0, #768] +ldr q10, [x0, #832] +ldr q22, [x0, #896] +ldr 
q21, [x0, #960] +ldr q14, [x0, #256] +ldr q13, [x0, #320] +ldr q20, [x0, #384] +ldr q19, [x0, #448] +ldr q1, [x0, #512] +ldr q18, [x0, #576] +ldr q9, [x0, #640] +ldr q3, [x0, #704] +ldr q0, [x0, #0] +ldr q15, [x0, #64] +ldr q11, [x0, #128] +ldr q17, [x0, #192] +sqrdmulh v12.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sqrdmulh v16.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +mla v2.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +mla v10.4S, v16.4S, v31.s[0] +sub v16.4s, v14.4s, v2.4s +add v14.4s, v14.4s, v2.4s +sqrdmulh v2.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +mla v22.4S, v12.4S, v31.s[0] +sub v12.4s, v13.4s, v10.4s +add v13.4s, v13.4s, v10.4s +sqrdmulh v10.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +mla v21.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v22.4s +add v20.4s, v20.4s, v22.4s +sqrdmulh v22.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +mla v1.4S, v10.4S, v31.s[0] +sub v10.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +sqrdmulh v21.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +mla v18.4S, v22.4S, v31.s[0] +sub v22.4s, v0.4s, v1.4s +add v0.4s, v0.4s, v1.4s +sqrdmulh v1.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +mla v9.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +mla v3.4S, v1.4S, v31.s[0] +sub v1.4s, v11.4s, v9.4s +add v11.4s, v11.4s, v9.4s +sqrdmulh v9.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +mla v20.4S, v18.4S, v31.s[0] +sub v18.4s, v17.4s, v3.4s +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +mla v19.4S, v9.4S, v31.s[0] +sub v9.4s, v11.4s, v20.4s +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +mla v14.4S, v3.4S, v31.s[0] +sub v3.4s, v17.4s, v19.4s +add v17.4s, v17.4s, v19.4s +sqrdmulh v19.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +mla v13.4S, v20.4S, v31.s[0] +sub v20.4s, v0.4s, v14.4s 
+add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +mla v2.4S, v19.4S, v31.s[0] +sub v19.4s, v15.4s, v13.4s +add v15.4s, v15.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +mla v10.4S, v14.4S, v31.s[0] +sub v14.4s, v1.4s, v2.4s +add v1.4s, v1.4s, v2.4s +sqrdmulh v2.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +mla v16.4S, v13.4S, v31.s[0] +sub v13.4s, v18.4s, v10.4s +add v18.4s, v18.4s, v10.4s +sqrdmulh v10.4S, v11.4S, v27.s[0] +mul v11.4S, v11.4S,v28.s[0] +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v22.4s, v16.4s +add v22.4s, v22.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +mla v11.4S, v10.4S, v31.s[0] +sub v10.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v9.4S, v27.s[1] +mul v9.4S, v9.4S,v28.s[1] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v0.4s, v11.4s +add v0.4s, v0.4s, v11.4s +sqrdmulh v11.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +mla v9.4S, v12.4S, v31.s[0] +sub v12.4s, v15.4s, v17.4s +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +mla v3.4S, v11.4S, v31.s[0] +sub v11.4s, v20.4s, v9.4s +add v20.4s, v20.4s, v9.4s +sqrdmulh v9.4S, v18.4S, v27.s[2] +mul v18.4S, v18.4S,v28.s[2] +mla v1.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v27.s[3] +mul v14.4S, v14.4S,v28.s[3] +mla v18.4S, v9.4S, v31.s[0] +sub v9.4s, v22.4s, v1.4s +add v22.4s, v22.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v27.s[3] +mul v13.4S, v13.4S,v28.s[3] +mla v14.4S, v3.4S, v31.s[0] +sub v3.4s, v21.4s, v18.4s +add v21.4s, v21.4s, v18.4s +sqrdmulh v18.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +mla v13.4S, v1.4S, v31.s[0] +sub v1.4s, v2.4s, v14.4s +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +mla v15.4S, v18.4S, v31.s[0] +sub v18.4s, v10.4s, v13.4s +add v10.4s, v10.4s, v13.4s +sqrdmulh v13.4S, v19.4S, v25.s[2] +mul v19.4S, v19.4S,v26.s[2] 
+mla v12.4S, v14.4S, v31.s[0] +sub v14.4s, v0.4s, v15.4s +add v0.4s, v0.4s, v15.4s +sqrdmulh v15.4S, v17.4S, v25.s[3] +mul v17.4S, v17.4S,v26.s[3] +mla v19.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v12.4s +add v16.4s, v16.4s, v12.4s +sqrdmulh v12.4S, v21.4S, v23.s[0] +mul v21.4S, v21.4S,v24.s[0] +mla v17.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +sqrdmulh v19.4S, v3.4S, v23.s[1] +mul v3.4S, v3.4S,v24.s[1] +mla v21.4S, v12.4S, v31.s[0] +sub v12.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v10.4S, v23.s[2] +mul v10.4S, v10.4S,v24.s[2] +mla v3.4S, v19.4S, v31.s[0] +sub v19.4s, v22.4s, v21.4s +add v22.4s, v22.4s, v21.4s +sqrdmulh v21.4S, v18.4S, v23.s[3] +mul v18.4S, v18.4S,v24.s[3] +mla v10.4S, v17.4S, v31.s[0] +sub v17.4s, v9.4s, v3.4s +add v9.4s, v9.4s, v3.4s +mla v18.4S, v21.4S, v31.s[0] +sub v21.4s, v2.4s, v10.4s +add v2.4s, v2.4s, v10.4s +sub v10.4s, v1.4s, v18.4s +add v1.4s, v1.4s, v18.4s +str q0, [x0, #0] +str q14, [x0, #64] +str q16, [x0, #128] +str q13, [x0, #192] +str q20, [x0, #256] +str q15, [x0, #320] +str q11, [x0, #384] +str q12, [x0, #448] +str q22, [x0, #512] +str q19, [x0, #576] +str q9, [x0, #640] +str q17, [x0, #704] +str q2, [x0, #768] +str q21, [x0, #832] +str q1, [x0, #896] +str q10, [x0, #960] +ldr q10, [x0, #784] +ldr q1, [x0, #848] +ldr q21, [x0, #912] +ldr q2, [x0, #976] +ldr q17, [x0, #272] +ldr q9, [x0, #336] +ldr q19, [x0, #400] +ldr q22, [x0, #464] +ldr q12, [x0, #528] +ldr q11, [x0, #592] +ldr q15, [x0, #656] +ldr q20, [x0, #720] +ldr q13, [x0, #16] +ldr q16, [x0, #80] +ldr q14, [x0, #144] +ldr q0, [x0, #208] +sqrdmulh v18.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +sqrdmulh v3.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +mla v10.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +mla v1.4S, v3.4S, v31.s[0] +sub v3.4s, v17.4s, v10.4s +add v17.4s, v17.4s, v10.4s +sqrdmulh v10.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +mla v21.4S, 
v18.4S, v31.s[0] +sub v18.4s, v9.4s, v1.4s +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +mla v2.4S, v10.4S, v31.s[0] +sub v10.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +sqrdmulh v21.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +mla v12.4S, v1.4S, v31.s[0] +sub v1.4s, v22.4s, v2.4s +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +mla v11.4S, v21.4S, v31.s[0] +sub v21.4s, v13.4s, v12.4s +add v13.4s, v13.4s, v12.4s +sqrdmulh v12.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v16.4s, v11.4s +add v16.4s, v16.4s, v11.4s +sqrdmulh v11.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +mla v20.4S, v12.4S, v31.s[0] +sub v12.4s, v14.4s, v15.4s +add v14.4s, v14.4s, v15.4s +sqrdmulh v15.4S, v22.4S, v29.s[1] +mul v22.4S, v22.4S,v30.s[1] +mla v19.4S, v11.4S, v31.s[0] +sub v11.4s, v0.4s, v20.4s +add v0.4s, v0.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +mla v22.4S, v15.4S, v31.s[0] +sub v15.4s, v14.4s, v19.4s +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v9.4S, v29.s[1] +mul v9.4S, v9.4S,v30.s[1] +mla v17.4S, v20.4S, v31.s[0] +sub v20.4s, v0.4s, v22.4s +add v0.4s, v0.4s, v22.4s +sqrdmulh v22.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +mla v9.4S, v19.4S, v31.s[0] +sub v19.4s, v13.4s, v17.4s +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v16.4s, v9.4s +add v16.4s, v16.4s, v9.4s +sqrdmulh v9.4S, v3.4S, v29.s[2] +mul v3.4S, v3.4S,v30.s[2] +mla v1.4S, v17.4S, v31.s[0] +sub v17.4s, v12.4s, v10.4s +add v12.4s, v12.4s, v10.4s +sqrdmulh v10.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +mla v3.4S, v9.4S, v31.s[0] +sub v9.4s, v11.4s, v1.4s +add v11.4s, v11.4s, v1.4s +sqrdmulh v1.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +mla v18.4S, v10.4S, v31.s[0] +sub v10.4s, v21.4s, v3.4s +add v21.4s, v21.4s, v3.4s +sqrdmulh v3.4S, 
v0.4S, v27.s[0] +mul v0.4S, v0.4S,v28.s[0] +mla v14.4S, v1.4S, v31.s[0] +sub v1.4s, v2.4s, v18.4s +add v2.4s, v2.4s, v18.4s +sqrdmulh v18.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +mla v0.4S, v3.4S, v31.s[0] +sub v3.4s, v13.4s, v14.4s +add v13.4s, v13.4s, v14.4s +sqrdmulh v14.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +mla v15.4S, v18.4S, v31.s[0] +sub v18.4s, v16.4s, v0.4s +add v16.4s, v16.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +mla v20.4S, v14.4S, v31.s[0] +sub v14.4s, v19.4s, v15.4s +add v19.4s, v19.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v20.4s +add v22.4s, v22.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +mla v11.4S, v15.4S, v31.s[0] +sub v15.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v9.4S, v27.s[3] +mul v9.4S, v9.4S,v28.s[3] +mla v17.4S, v20.4S, v31.s[0] +sub v20.4s, v2.4s, v11.4s +add v2.4s, v2.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v25.s[0] +mul v16.4S, v16.4S,v26.s[0] +mla v9.4S, v12.4S, v31.s[0] +sub v12.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v18.4S, v25.s[1] +mul v18.4S, v18.4S,v26.s[1] +mla v16.4S, v11.4S, v31.s[0] +sub v11.4s, v1.4s, v9.4s +add v1.4s, v1.4s, v9.4s +sqrdmulh v9.4S, v22.4S, v25.s[2] +mul v22.4S, v22.4S,v26.s[2] +mla v18.4S, v17.4S, v31.s[0] +sub v17.4s, v13.4s, v16.4s +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v0.4S, v25.s[3] +mul v0.4S, v0.4S,v26.s[3] +mla v22.4S, v9.4S, v31.s[0] +sub v9.4s, v3.4s, v18.4s +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v23.s[0] +mul v2.4S, v2.4S,v24.s[0] +mla v0.4S, v16.4S, v31.s[0] +sub v16.4s, v19.4s, v22.4s +add v19.4s, v19.4s, v22.4s +sqrdmulh v22.4S, v20.4S, v23.s[1] +mul v20.4S, v20.4S,v24.s[1] +mla v2.4S, v18.4S, v31.s[0] +sub v18.4s, v14.4s, v0.4s +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v23.s[2] +mul v1.4S, v1.4S,v24.s[2] +mla v20.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, 
v2.4s +add v21.4s, v21.4s, v2.4s +sqrdmulh v2.4S, v11.4S, v23.s[3] +mul v11.4S, v11.4S,v24.s[3] +mla v1.4S, v0.4S, v31.s[0] +sub v0.4s, v15.4s, v20.4s +add v15.4s, v15.4s, v20.4s +mla v11.4S, v2.4S, v31.s[0] +sub v2.4s, v10.4s, v1.4s +add v10.4s, v10.4s, v1.4s +sub v1.4s, v12.4s, v11.4s +add v12.4s, v12.4s, v11.4s +str q13, [x0, #16] +str q17, [x0, #80] +str q3, [x0, #144] +str q9, [x0, #208] +str q19, [x0, #272] +str q16, [x0, #336] +str q14, [x0, #400] +str q18, [x0, #464] +str q21, [x0, #528] +str q22, [x0, #592] +str q15, [x0, #656] +str q0, [x0, #720] +str q10, [x0, #784] +str q2, [x0, #848] +str q12, [x0, #912] +str q1, [x0, #976] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q6, [x17, #+160] +ldr q7, [x17, #+176] +ldr q8, [x17, #+192] +ldr q20, [x17, #+208] +ldr q11, [x17, #+224] +ldr q13, [x17, #+240] +ldr q17, [x0, #32] +ldr q3, [x0, #48] +ldr q9, [x0, #0] +ldr q19, [x0, #16] +sqrdmulh v16.4S, v17.4S, v5.s[0] +mul v17.4S, v17.4S,v4.s[0] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v9.4s, v17.4s +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v5.s[0] +mul v3.4S, v3.4S,v4.s[0] +mla v3.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sqrdmulh v3.4S, v19.4S, v5.s[1] +mul v19.4S, v19.4S,v4.s[1] +mla v19.4S, v3.4S, v31.s[0] +sub v3.4s, v9.4s, v19.4s +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v17.4S, v5.s[2] +mul v17.4S, v17.4S,v4.s[2] +mla v17.4S, v19.4S, v31.s[0] +sub v19.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +trn1 v17.4S, v9.4S, v3.4S +trn2 v14.4S, v9.4S, v3.4S +trn1 v18.4S, v16.4S, v19.4S +trn2 v21.4S, v16.4S, v19.4S +trn2 v16.2D, v17.2D, v18.2D +trn2 v19.2D, v14.2D, v21.2D +trn1 v9.2D, v17.2D, v18.2D +trn1 v3.2D, v14.2D, v21.2D +sqrdmulh v21.4S, v16.4S, v7.4S +mul v16.4S, v16.4S,v6.4S +mla v16.4S, v21.4S, v31.s[0] +sub v21.4s, v9.4s, v16.4s +add v9.4s, v9.4s, v16.4s +sqrdmulh v16.4S, v19.4S, v7.4S +mul v19.4S, v19.4S,v6.4S +mla v19.4S, v16.4S, v31.s[0] +sub v16.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh 
v19.4S, v3.4S, v20.4S +mul v3.4S, v3.4S,v8.4S +mla v3.4S, v19.4S, v31.s[0] +sub v19.4s, v9.4s, v3.4s +add v9.4s, v9.4s, v3.4s +sqrdmulh v3.4S, v16.4S, v13.4S +mul v16.4S, v16.4S,v11.4S +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v21.4s, v16.4s +add v21.4s, v21.4s, v16.4s +str q9, [x0, #0] +str q19, [x0, #16] +str q21, [x0, #32] +str q3, [x0, #48] +ldr q3, [x17, #+256] +ldr q21, [x17, #+272] +ldr q19, [x17, #+288] +ldr q9, [x17, #+304] +ldr q16, [x17, #+320] +ldr q14, [x17, #+336] +ldr q18, [x17, #+352] +ldr q17, [x17, #+368] +ldr q13, [x0, #96] +ldr q11, [x0, #112] +ldr q20, [x0, #64] +ldr q8, [x0, #80] +sqrdmulh v7.4S, v13.4S, v21.s[0] +mul v13.4S, v13.4S,v3.s[0] +mla v13.4S, v7.4S, v31.s[0] +sub v7.4s, v20.4s, v13.4s +add v20.4s, v20.4s, v13.4s +sqrdmulh v13.4S, v11.4S, v21.s[0] +mul v11.4S, v11.4S,v3.s[0] +mla v11.4S, v13.4S, v31.s[0] +sub v13.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +sqrdmulh v11.4S, v8.4S, v21.s[1] +mul v8.4S, v8.4S,v3.s[1] +mla v8.4S, v11.4S, v31.s[0] +sub v11.4s, v20.4s, v8.4s +add v20.4s, v20.4s, v8.4s +sqrdmulh v8.4S, v13.4S, v21.s[2] +mul v13.4S, v13.4S,v3.s[2] +mla v13.4S, v8.4S, v31.s[0] +sub v8.4s, v7.4s, v13.4s +add v7.4s, v7.4s, v13.4s +trn1 v13.4S, v20.4S, v11.4S +trn2 v6.4S, v20.4S, v11.4S +trn1 v5.4S, v7.4S, v8.4S +trn2 v4.4S, v7.4S, v8.4S +trn2 v7.2D, v13.2D, v5.2D +trn2 v8.2D, v6.2D, v4.2D +trn1 v20.2D, v13.2D, v5.2D +trn1 v11.2D, v6.2D, v4.2D +sqrdmulh v4.4S, v7.4S, v9.4S +mul v7.4S, v7.4S,v19.4S +mla v7.4S, v4.4S, v31.s[0] +sub v4.4s, v20.4s, v7.4s +add v20.4s, v20.4s, v7.4s +sqrdmulh v7.4S, v8.4S, v9.4S +mul v8.4S, v8.4S,v19.4S +mla v8.4S, v7.4S, v31.s[0] +sub v7.4s, v11.4s, v8.4s +add v11.4s, v11.4s, v8.4s +sqrdmulh v8.4S, v11.4S, v14.4S +mul v11.4S, v11.4S,v16.4S +mla v11.4S, v8.4S, v31.s[0] +sub v8.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +sqrdmulh v11.4S, v7.4S, v17.4S +mul v7.4S, v7.4S,v18.4S +mla v7.4S, v11.4S, v31.s[0] +sub v11.4s, v4.4s, v7.4s +add v4.4s, v4.4s, v7.4s +str q20, [x0, #64] +str q8, [x0, #80] +str 
q4, [x0, #96] +str q11, [x0, #112] +ldr q11, [x17, #+384] +ldr q4, [x17, #+400] +ldr q8, [x17, #+416] +ldr q20, [x17, #+432] +ldr q7, [x17, #+448] +ldr q6, [x17, #+464] +ldr q5, [x17, #+480] +ldr q13, [x17, #+496] +ldr q17, [x0, #160] +ldr q18, [x0, #176] +ldr q14, [x0, #128] +ldr q16, [x0, #144] +sqrdmulh v9.4S, v17.4S, v4.s[0] +mul v17.4S, v17.4S,v11.s[0] +mla v17.4S, v9.4S, v31.s[0] +sub v9.4s, v14.4s, v17.4s +add v14.4s, v14.4s, v17.4s +sqrdmulh v17.4S, v18.4S, v4.s[0] +mul v18.4S, v18.4S,v11.s[0] +mla v18.4S, v17.4S, v31.s[0] +sub v17.4s, v16.4s, v18.4s +add v16.4s, v16.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v4.s[1] +mul v16.4S, v16.4S,v11.s[1] +mla v16.4S, v18.4S, v31.s[0] +sub v18.4s, v14.4s, v16.4s +add v14.4s, v14.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v4.s[2] +mul v17.4S, v17.4S,v11.s[2] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v9.4s, v17.4s +add v9.4s, v9.4s, v17.4s +trn1 v17.4S, v14.4S, v18.4S +trn2 v19.4S, v14.4S, v18.4S +trn1 v21.4S, v9.4S, v16.4S +trn2 v3.4S, v9.4S, v16.4S +trn2 v9.2D, v17.2D, v21.2D +trn2 v16.2D, v19.2D, v3.2D +trn1 v14.2D, v17.2D, v21.2D +trn1 v18.2D, v19.2D, v3.2D +sqrdmulh v3.4S, v9.4S, v20.4S +mul v9.4S, v9.4S,v8.4S +mla v9.4S, v3.4S, v31.s[0] +sub v3.4s, v14.4s, v9.4s +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v16.4S, v20.4S +mul v16.4S, v16.4S,v8.4S +mla v16.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v16.4s +add v18.4s, v18.4s, v16.4s +sqrdmulh v16.4S, v18.4S, v6.4S +mul v18.4S, v18.4S,v7.4S +mla v18.4S, v16.4S, v31.s[0] +sub v16.4s, v14.4s, v18.4s +add v14.4s, v14.4s, v18.4s +sqrdmulh v18.4S, v9.4S, v13.4S +mul v9.4S, v9.4S,v5.4S +mla v9.4S, v18.4S, v31.s[0] +sub v18.4s, v3.4s, v9.4s +add v3.4s, v3.4s, v9.4s +str q14, [x0, #128] +str q16, [x0, #144] +str q3, [x0, #160] +str q18, [x0, #176] +ldr q18, [x17, #+512] +ldr q3, [x17, #+528] +ldr q16, [x17, #+544] +ldr q14, [x17, #+560] +ldr q9, [x17, #+576] +ldr q19, [x17, #+592] +ldr q21, [x17, #+608] +ldr q17, [x17, #+624] +ldr q13, [x0, #224] +ldr q5, [x0, #240] +ldr q6, [x0, #192] 
+ldr q7, [x0, #208] +sqrdmulh v20.4S, v13.4S, v3.s[0] +mul v13.4S, v13.4S,v18.s[0] +mla v13.4S, v20.4S, v31.s[0] +sub v20.4s, v6.4s, v13.4s +add v6.4s, v6.4s, v13.4s +sqrdmulh v13.4S, v5.4S, v3.s[0] +mul v5.4S, v5.4S,v18.s[0] +mla v5.4S, v13.4S, v31.s[0] +sub v13.4s, v7.4s, v5.4s +add v7.4s, v7.4s, v5.4s +sqrdmulh v5.4S, v7.4S, v3.s[1] +mul v7.4S, v7.4S,v18.s[1] +mla v7.4S, v5.4S, v31.s[0] +sub v5.4s, v6.4s, v7.4s +add v6.4s, v6.4s, v7.4s +sqrdmulh v7.4S, v13.4S, v3.s[2] +mul v13.4S, v13.4S,v18.s[2] +mla v13.4S, v7.4S, v31.s[0] +sub v7.4s, v20.4s, v13.4s +add v20.4s, v20.4s, v13.4s +trn1 v13.4S, v6.4S, v5.4S +trn2 v8.4S, v6.4S, v5.4S +trn1 v4.4S, v20.4S, v7.4S +trn2 v11.4S, v20.4S, v7.4S +trn2 v20.2D, v13.2D, v4.2D +trn2 v7.2D, v8.2D, v11.2D +trn1 v6.2D, v13.2D, v4.2D +trn1 v5.2D, v8.2D, v11.2D +sqrdmulh v11.4S, v20.4S, v14.4S +mul v20.4S, v20.4S,v16.4S +mla v20.4S, v11.4S, v31.s[0] +sub v11.4s, v6.4s, v20.4s +add v6.4s, v6.4s, v20.4s +sqrdmulh v20.4S, v7.4S, v14.4S +mul v7.4S, v7.4S,v16.4S +mla v7.4S, v20.4S, v31.s[0] +sub v20.4s, v5.4s, v7.4s +add v5.4s, v5.4s, v7.4s +sqrdmulh v7.4S, v5.4S, v19.4S +mul v5.4S, v5.4S,v9.4S +mla v5.4S, v7.4S, v31.s[0] +sub v7.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v20.4S, v17.4S +mul v20.4S, v20.4S,v21.4S +mla v20.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v20.4s +add v11.4s, v11.4s, v20.4s +str q6, [x0, #192] +str q7, [x0, #208] +str q11, [x0, #224] +str q5, [x0, #240] +ldr q5, [x17, #+640] +ldr q11, [x17, #+656] +ldr q7, [x17, #+672] +ldr q6, [x17, #+688] +ldr q20, [x17, #+704] +ldr q8, [x17, #+720] +ldr q4, [x17, #+736] +ldr q13, [x17, #+752] +ldr q17, [x0, #288] +ldr q21, [x0, #304] +ldr q19, [x0, #256] +ldr q9, [x0, #272] +sqrdmulh v14.4S, v17.4S, v11.s[0] +mul v17.4S, v17.4S,v5.s[0] +mla v17.4S, v14.4S, v31.s[0] +sub v14.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v11.s[0] +mul v21.4S, v21.4S,v5.s[0] +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v9.4s, v21.4s +add v9.4s, v9.4s, v21.4s 
+sqrdmulh v21.4S, v9.4S, v11.s[1] +mul v9.4S, v9.4S,v5.s[1] +mla v9.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v9.4s +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v17.4S, v11.s[2] +mul v17.4S, v17.4S,v5.s[2] +mla v17.4S, v9.4S, v31.s[0] +sub v9.4s, v14.4s, v17.4s +add v14.4s, v14.4s, v17.4s +trn1 v17.4S, v19.4S, v21.4S +trn2 v16.4S, v19.4S, v21.4S +trn1 v3.4S, v14.4S, v9.4S +trn2 v18.4S, v14.4S, v9.4S +trn2 v14.2D, v17.2D, v3.2D +trn2 v9.2D, v16.2D, v18.2D +trn1 v19.2D, v17.2D, v3.2D +trn1 v21.2D, v16.2D, v18.2D +sqrdmulh v18.4S, v14.4S, v6.4S +mul v14.4S, v14.4S,v7.4S +mla v14.4S, v18.4S, v31.s[0] +sub v18.4s, v19.4s, v14.4s +add v19.4s, v19.4s, v14.4s +sqrdmulh v14.4S, v9.4S, v6.4S +mul v9.4S, v9.4S,v7.4S +mla v9.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v9.4s +add v21.4s, v21.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v8.4S +mul v21.4S, v21.4S,v20.4S +mla v21.4S, v9.4S, v31.s[0] +sub v9.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v13.4S +mul v14.4S, v14.4S,v4.4S +mla v14.4S, v21.4S, v31.s[0] +sub v21.4s, v18.4s, v14.4s +add v18.4s, v18.4s, v14.4s +str q19, [x0, #256] +str q9, [x0, #272] +str q18, [x0, #288] +str q21, [x0, #304] +ldr q21, [x17, #+768] +ldr q18, [x17, #+784] +ldr q9, [x17, #+800] +ldr q19, [x17, #+816] +ldr q14, [x17, #+832] +ldr q16, [x17, #+848] +ldr q3, [x17, #+864] +ldr q17, [x17, #+880] +ldr q13, [x0, #352] +ldr q4, [x0, #368] +ldr q8, [x0, #320] +ldr q20, [x0, #336] +sqrdmulh v6.4S, v13.4S, v18.s[0] +mul v13.4S, v13.4S,v21.s[0] +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +sqrdmulh v13.4S, v4.4S, v18.s[0] +mul v4.4S, v4.4S,v21.s[0] +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v4.4s +add v20.4s, v20.4s, v4.4s +sqrdmulh v4.4S, v20.4S, v18.s[1] +mul v20.4S, v20.4S,v21.s[1] +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +sqrdmulh v20.4S, v13.4S, v18.s[2] +mul v13.4S, v13.4S,v21.s[2] +mla v13.4S, v20.4S, v31.s[0] +sub v20.4s, v6.4s, v13.4s +add v6.4s, 
v6.4s, v13.4s +trn1 v13.4S, v8.4S, v4.4S +trn2 v7.4S, v8.4S, v4.4S +trn1 v11.4S, v6.4S, v20.4S +trn2 v5.4S, v6.4S, v20.4S +trn2 v6.2D, v13.2D, v11.2D +trn2 v20.2D, v7.2D, v5.2D +trn1 v8.2D, v13.2D, v11.2D +trn1 v4.2D, v7.2D, v5.2D +sqrdmulh v5.4S, v6.4S, v19.4S +mul v6.4S, v6.4S,v9.4S +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v8.4s, v6.4s +add v8.4s, v8.4s, v6.4s +sqrdmulh v6.4S, v20.4S, v19.4S +mul v20.4S, v20.4S,v9.4S +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v4.4s, v20.4s +add v4.4s, v4.4s, v20.4s +sqrdmulh v20.4S, v4.4S, v16.4S +mul v4.4S, v4.4S,v14.4S +mla v4.4S, v20.4S, v31.s[0] +sub v20.4s, v8.4s, v4.4s +add v8.4s, v8.4s, v4.4s +sqrdmulh v4.4S, v6.4S, v17.4S +mul v6.4S, v6.4S,v3.4S +mla v6.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v6.4s +add v5.4s, v5.4s, v6.4s +str q8, [x0, #320] +str q20, [x0, #336] +str q5, [x0, #352] +str q4, [x0, #368] +ldr q4, [x17, #+896] +ldr q5, [x17, #+912] +ldr q20, [x17, #+928] +ldr q8, [x17, #+944] +ldr q6, [x17, #+960] +ldr q7, [x17, #+976] +ldr q11, [x17, #+992] +ldr q13, [x17, #+1008] +ldr q17, [x0, #416] +ldr q3, [x0, #432] +ldr q16, [x0, #384] +ldr q14, [x0, #400] +sqrdmulh v19.4S, v17.4S, v5.s[0] +mul v17.4S, v17.4S,v4.s[0] +mla v17.4S, v19.4S, v31.s[0] +sub v19.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v5.s[0] +mul v3.4S, v3.4S,v4.s[0] +mla v3.4S, v17.4S, v31.s[0] +sub v17.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v5.s[1] +mul v14.4S, v14.4S,v4.s[1] +mla v14.4S, v3.4S, v31.s[0] +sub v3.4s, v16.4s, v14.4s +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v17.4S, v5.s[2] +mul v17.4S, v17.4S,v4.s[2] +mla v17.4S, v14.4S, v31.s[0] +sub v14.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +trn1 v17.4S, v16.4S, v3.4S +trn2 v9.4S, v16.4S, v3.4S +trn1 v18.4S, v19.4S, v14.4S +trn2 v21.4S, v19.4S, v14.4S +trn2 v19.2D, v17.2D, v18.2D +trn2 v14.2D, v9.2D, v21.2D +trn1 v16.2D, v17.2D, v18.2D +trn1 v3.2D, v9.2D, v21.2D +sqrdmulh v21.4S, v19.4S, v8.4S +mul v19.4S, v19.4S,v20.4S +mla v19.4S, 
v21.4S, v31.s[0] +sub v21.4s, v16.4s, v19.4s +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v8.4S +mul v14.4S, v14.4S,v20.4S +mla v14.4S, v19.4S, v31.s[0] +sub v19.4s, v3.4s, v14.4s +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v7.4S +mul v3.4S, v3.4S,v6.4S +mla v3.4S, v14.4S, v31.s[0] +sub v14.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v19.4S, v13.4S +mul v19.4S, v19.4S,v11.4S +mla v19.4S, v3.4S, v31.s[0] +sub v3.4s, v21.4s, v19.4s +add v21.4s, v21.4s, v19.4s +str q16, [x0, #384] +str q14, [x0, #400] +str q21, [x0, #416] +str q3, [x0, #432] +ldr q3, [x17, #+1024] +ldr q21, [x17, #+1040] +ldr q14, [x17, #+1056] +ldr q16, [x17, #+1072] +ldr q19, [x17, #+1088] +ldr q9, [x17, #+1104] +ldr q18, [x17, #+1120] +ldr q17, [x17, #+1136] +ldr q13, [x0, #480] +ldr q11, [x0, #496] +ldr q7, [x0, #448] +ldr q6, [x0, #464] +sqrdmulh v8.4S, v13.4S, v21.s[0] +mul v13.4S, v13.4S,v3.s[0] +mla v13.4S, v8.4S, v31.s[0] +sub v8.4s, v7.4s, v13.4s +add v7.4s, v7.4s, v13.4s +sqrdmulh v13.4S, v11.4S, v21.s[0] +mul v11.4S, v11.4S,v3.s[0] +mla v11.4S, v13.4S, v31.s[0] +sub v13.4s, v6.4s, v11.4s +add v6.4s, v6.4s, v11.4s +sqrdmulh v11.4S, v6.4S, v21.s[1] +mul v6.4S, v6.4S,v3.s[1] +mla v6.4S, v11.4S, v31.s[0] +sub v11.4s, v7.4s, v6.4s +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v13.4S, v21.s[2] +mul v13.4S, v13.4S,v3.s[2] +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +trn1 v13.4S, v7.4S, v11.4S +trn2 v20.4S, v7.4S, v11.4S +trn1 v5.4S, v8.4S, v6.4S +trn2 v4.4S, v8.4S, v6.4S +trn2 v8.2D, v13.2D, v5.2D +trn2 v6.2D, v20.2D, v4.2D +trn1 v7.2D, v13.2D, v5.2D +trn1 v11.2D, v20.2D, v4.2D +sqrdmulh v4.4S, v8.4S, v16.4S +mul v8.4S, v8.4S,v14.4S +mla v8.4S, v4.4S, v31.s[0] +sub v4.4s, v7.4s, v8.4s +add v7.4s, v7.4s, v8.4s +sqrdmulh v8.4S, v6.4S, v16.4S +mul v6.4S, v6.4S,v14.4S +mla v6.4S, v8.4S, v31.s[0] +sub v8.4s, v11.4s, v6.4s +add v11.4s, v11.4s, v6.4s +sqrdmulh v6.4S, v11.4S, v9.4S +mul v11.4S, v11.4S,v19.4S +mla v11.4S, v6.4S, v31.s[0] 
+sub v6.4s, v7.4s, v11.4s +add v7.4s, v7.4s, v11.4s +sqrdmulh v11.4S, v8.4S, v17.4S +mul v8.4S, v8.4S,v18.4S +mla v8.4S, v11.4S, v31.s[0] +sub v11.4s, v4.4s, v8.4s +add v4.4s, v4.4s, v8.4s +str q7, [x0, #448] +str q6, [x0, #464] +str q4, [x0, #480] +str q11, [x0, #496] +ldr q11, [x17, #+1152] +ldr q4, [x17, #+1168] +ldr q6, [x17, #+1184] +ldr q7, [x17, #+1200] +ldr q8, [x17, #+1216] +ldr q20, [x17, #+1232] +ldr q5, [x17, #+1248] +ldr q13, [x17, #+1264] +ldr q17, [x0, #544] +ldr q18, [x0, #560] +ldr q9, [x0, #512] +ldr q19, [x0, #528] +sqrdmulh v16.4S, v17.4S, v4.s[0] +mul v17.4S, v17.4S,v11.s[0] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v9.4s, v17.4s +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v18.4S, v4.s[0] +mul v18.4S, v18.4S,v11.s[0] +mla v18.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v18.4s +add v19.4s, v19.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v4.s[1] +mul v19.4S, v19.4S,v11.s[1] +mla v19.4S, v18.4S, v31.s[0] +sub v18.4s, v9.4s, v19.4s +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v17.4S, v4.s[2] +mul v17.4S, v17.4S,v11.s[2] +mla v17.4S, v19.4S, v31.s[0] +sub v19.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +trn1 v17.4S, v9.4S, v18.4S +trn2 v14.4S, v9.4S, v18.4S +trn1 v21.4S, v16.4S, v19.4S +trn2 v3.4S, v16.4S, v19.4S +trn2 v16.2D, v17.2D, v21.2D +trn2 v19.2D, v14.2D, v3.2D +trn1 v9.2D, v17.2D, v21.2D +trn1 v18.2D, v14.2D, v3.2D +sqrdmulh v3.4S, v16.4S, v7.4S +mul v16.4S, v16.4S,v6.4S +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v9.4s, v16.4s +add v9.4s, v9.4s, v16.4s +sqrdmulh v16.4S, v19.4S, v7.4S +mul v19.4S, v19.4S,v6.4S +mla v19.4S, v16.4S, v31.s[0] +sub v16.4s, v18.4s, v19.4s +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v20.4S +mul v18.4S, v18.4S,v8.4S +mla v18.4S, v19.4S, v31.s[0] +sub v19.4s, v9.4s, v18.4s +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v13.4S +mul v16.4S, v16.4S,v5.4S +mla v16.4S, v18.4S, v31.s[0] +sub v18.4s, v3.4s, v16.4s +add v3.4s, v3.4s, v16.4s +str q9, [x0, #512] +str q19, [x0, #528] +str q3, [x0, #544] +str q18, [x0, 
#560] +ldr q18, [x17, #+1280] +ldr q3, [x17, #+1296] +ldr q19, [x17, #+1312] +ldr q9, [x17, #+1328] +ldr q16, [x17, #+1344] +ldr q14, [x17, #+1360] +ldr q21, [x17, #+1376] +ldr q17, [x17, #+1392] +ldr q13, [x0, #608] +ldr q5, [x0, #624] +ldr q20, [x0, #576] +ldr q8, [x0, #592] +sqrdmulh v7.4S, v13.4S, v3.s[0] +mul v13.4S, v13.4S,v18.s[0] +mla v13.4S, v7.4S, v31.s[0] +sub v7.4s, v20.4s, v13.4s +add v20.4s, v20.4s, v13.4s +sqrdmulh v13.4S, v5.4S, v3.s[0] +mul v5.4S, v5.4S,v18.s[0] +mla v5.4S, v13.4S, v31.s[0] +sub v13.4s, v8.4s, v5.4s +add v8.4s, v8.4s, v5.4s +sqrdmulh v5.4S, v8.4S, v3.s[1] +mul v8.4S, v8.4S,v18.s[1] +mla v8.4S, v5.4S, v31.s[0] +sub v5.4s, v20.4s, v8.4s +add v20.4s, v20.4s, v8.4s +sqrdmulh v8.4S, v13.4S, v3.s[2] +mul v13.4S, v13.4S,v18.s[2] +mla v13.4S, v8.4S, v31.s[0] +sub v8.4s, v7.4s, v13.4s +add v7.4s, v7.4s, v13.4s +trn1 v13.4S, v20.4S, v5.4S +trn2 v6.4S, v20.4S, v5.4S +trn1 v4.4S, v7.4S, v8.4S +trn2 v11.4S, v7.4S, v8.4S +trn2 v7.2D, v13.2D, v4.2D +trn2 v8.2D, v6.2D, v11.2D +trn1 v20.2D, v13.2D, v4.2D +trn1 v5.2D, v6.2D, v11.2D +sqrdmulh v11.4S, v7.4S, v9.4S +mul v7.4S, v7.4S,v19.4S +mla v7.4S, v11.4S, v31.s[0] +sub v11.4s, v20.4s, v7.4s +add v20.4s, v20.4s, v7.4s +sqrdmulh v7.4S, v8.4S, v9.4S +mul v8.4S, v8.4S,v19.4S +mla v8.4S, v7.4S, v31.s[0] +sub v7.4s, v5.4s, v8.4s +add v5.4s, v5.4s, v8.4s +sqrdmulh v8.4S, v5.4S, v14.4S +mul v5.4S, v5.4S,v16.4S +mla v5.4S, v8.4S, v31.s[0] +sub v8.4s, v20.4s, v5.4s +add v20.4s, v20.4s, v5.4s +sqrdmulh v5.4S, v7.4S, v17.4S +mul v7.4S, v7.4S,v21.4S +mla v7.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v7.4s +add v11.4s, v11.4s, v7.4s +str q20, [x0, #576] +str q8, [x0, #592] +str q11, [x0, #608] +str q5, [x0, #624] +ldr q5, [x17, #+1408] +ldr q11, [x17, #+1424] +ldr q8, [x17, #+1440] +ldr q20, [x17, #+1456] +ldr q7, [x17, #+1472] +ldr q6, [x17, #+1488] +ldr q4, [x17, #+1504] +ldr q13, [x17, #+1520] +ldr q17, [x0, #672] +ldr q21, [x0, #688] +ldr q14, [x0, #640] +ldr q16, [x0, #656] +sqrdmulh v9.4S, v17.4S, v11.s[0] 
+mul v17.4S, v17.4S,v5.s[0] +mla v17.4S, v9.4S, v31.s[0] +sub v9.4s, v14.4s, v17.4s +add v14.4s, v14.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v11.s[0] +mul v21.4S, v21.4S,v5.s[0] +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v16.4s, v21.4s +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v16.4S, v11.s[1] +mul v16.4S, v16.4S,v5.s[1] +mla v16.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v16.4s +add v14.4s, v14.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v11.s[2] +mul v17.4S, v17.4S,v5.s[2] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v9.4s, v17.4s +add v9.4s, v9.4s, v17.4s +trn1 v17.4S, v14.4S, v21.4S +trn2 v19.4S, v14.4S, v21.4S +trn1 v3.4S, v9.4S, v16.4S +trn2 v18.4S, v9.4S, v16.4S +trn2 v9.2D, v17.2D, v3.2D +trn2 v16.2D, v19.2D, v18.2D +trn1 v14.2D, v17.2D, v3.2D +trn1 v21.2D, v19.2D, v18.2D +sqrdmulh v18.4S, v9.4S, v20.4S +mul v9.4S, v9.4S,v8.4S +mla v9.4S, v18.4S, v31.s[0] +sub v18.4s, v14.4s, v9.4s +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v16.4S, v20.4S +mul v16.4S, v16.4S,v8.4S +mla v16.4S, v9.4S, v31.s[0] +sub v9.4s, v21.4s, v16.4s +add v21.4s, v21.4s, v16.4s +sqrdmulh v16.4S, v21.4S, v6.4S +mul v21.4S, v21.4S,v7.4S +mla v21.4S, v16.4S, v31.s[0] +sub v16.4s, v14.4s, v21.4s +add v14.4s, v14.4s, v21.4s +sqrdmulh v21.4S, v9.4S, v13.4S +mul v9.4S, v9.4S,v4.4S +mla v9.4S, v21.4S, v31.s[0] +sub v21.4s, v18.4s, v9.4s +add v18.4s, v18.4s, v9.4s +str q14, [x0, #640] +str q16, [x0, #656] +str q18, [x0, #672] +str q21, [x0, #688] +ldr q21, [x17, #+1536] +ldr q18, [x17, #+1552] +ldr q16, [x17, #+1568] +ldr q14, [x17, #+1584] +ldr q9, [x17, #+1600] +ldr q19, [x17, #+1616] +ldr q3, [x17, #+1632] +ldr q17, [x17, #+1648] +ldr q13, [x0, #736] +ldr q4, [x0, #752] +ldr q6, [x0, #704] +ldr q7, [x0, #720] +sqrdmulh v20.4S, v13.4S, v18.s[0] +mul v13.4S, v13.4S,v21.s[0] +mla v13.4S, v20.4S, v31.s[0] +sub v20.4s, v6.4s, v13.4s +add v6.4s, v6.4s, v13.4s +sqrdmulh v13.4S, v4.4S, v18.s[0] +mul v4.4S, v4.4S,v21.s[0] +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v7.4s, v4.4s +add v7.4s, v7.4s, v4.4s +sqrdmulh 
v4.4S, v7.4S, v18.s[1] +mul v7.4S, v7.4S,v21.s[1] +mla v7.4S, v4.4S, v31.s[0] +sub v4.4s, v6.4s, v7.4s +add v6.4s, v6.4s, v7.4s +sqrdmulh v7.4S, v13.4S, v18.s[2] +mul v13.4S, v13.4S,v21.s[2] +mla v13.4S, v7.4S, v31.s[0] +sub v7.4s, v20.4s, v13.4s +add v20.4s, v20.4s, v13.4s +trn1 v13.4S, v6.4S, v4.4S +trn2 v8.4S, v6.4S, v4.4S +trn1 v11.4S, v20.4S, v7.4S +trn2 v5.4S, v20.4S, v7.4S +trn2 v20.2D, v13.2D, v11.2D +trn2 v7.2D, v8.2D, v5.2D +trn1 v6.2D, v13.2D, v11.2D +trn1 v4.2D, v8.2D, v5.2D +sqrdmulh v5.4S, v20.4S, v14.4S +mul v20.4S, v20.4S,v16.4S +mla v20.4S, v5.4S, v31.s[0] +sub v5.4s, v6.4s, v20.4s +add v6.4s, v6.4s, v20.4s +sqrdmulh v20.4S, v7.4S, v14.4S +mul v7.4S, v7.4S,v16.4S +mla v7.4S, v20.4S, v31.s[0] +sub v20.4s, v4.4s, v7.4s +add v4.4s, v4.4s, v7.4s +sqrdmulh v7.4S, v4.4S, v19.4S +mul v4.4S, v4.4S,v9.4S +mla v4.4S, v7.4S, v31.s[0] +sub v7.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +sqrdmulh v4.4S, v20.4S, v17.4S +mul v20.4S, v20.4S,v3.4S +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v20.4s +add v5.4s, v5.4s, v20.4s +str q6, [x0, #704] +str q7, [x0, #720] +str q5, [x0, #736] +str q4, [x0, #752] +ldr q4, [x17, #+1664] +ldr q5, [x17, #+1680] +ldr q7, [x17, #+1696] +ldr q6, [x17, #+1712] +ldr q20, [x17, #+1728] +ldr q8, [x17, #+1744] +ldr q11, [x17, #+1760] +ldr q13, [x17, #+1776] +ldr q17, [x0, #800] +ldr q3, [x0, #816] +ldr q19, [x0, #768] +ldr q9, [x0, #784] +sqrdmulh v14.4S, v17.4S, v5.s[0] +mul v17.4S, v17.4S,v4.s[0] +mla v17.4S, v14.4S, v31.s[0] +sub v14.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v5.s[0] +mul v3.4S, v3.4S,v4.s[0] +mla v3.4S, v17.4S, v31.s[0] +sub v17.4s, v9.4s, v3.4s +add v9.4s, v9.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v5.s[1] +mul v9.4S, v9.4S,v4.s[1] +mla v9.4S, v3.4S, v31.s[0] +sub v3.4s, v19.4s, v9.4s +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v17.4S, v5.s[2] +mul v17.4S, v17.4S,v4.s[2] +mla v17.4S, v9.4S, v31.s[0] +sub v9.4s, v14.4s, v17.4s +add v14.4s, v14.4s, v17.4s +trn1 v17.4S, v19.4S, v3.4S +trn2 
v16.4S, v19.4S, v3.4S +trn1 v18.4S, v14.4S, v9.4S +trn2 v21.4S, v14.4S, v9.4S +trn2 v14.2D, v17.2D, v18.2D +trn2 v9.2D, v16.2D, v21.2D +trn1 v19.2D, v17.2D, v18.2D +trn1 v3.2D, v16.2D, v21.2D +sqrdmulh v21.4S, v14.4S, v6.4S +mul v14.4S, v14.4S,v7.4S +mla v14.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v14.4s +add v19.4s, v19.4s, v14.4s +sqrdmulh v14.4S, v9.4S, v6.4S +mul v9.4S, v9.4S,v7.4S +mla v9.4S, v14.4S, v31.s[0] +sub v14.4s, v3.4s, v9.4s +add v3.4s, v3.4s, v9.4s +sqrdmulh v9.4S, v3.4S, v8.4S +mul v3.4S, v3.4S,v20.4S +mla v3.4S, v9.4S, v31.s[0] +sub v9.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v13.4S +mul v14.4S, v14.4S,v11.4S +mla v14.4S, v3.4S, v31.s[0] +sub v3.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +str q19, [x0, #768] +str q9, [x0, #784] +str q21, [x0, #800] +str q3, [x0, #816] +ldr q3, [x17, #+1792] +ldr q21, [x17, #+1808] +ldr q9, [x17, #+1824] +ldr q19, [x17, #+1840] +ldr q14, [x17, #+1856] +ldr q16, [x17, #+1872] +ldr q18, [x17, #+1888] +ldr q17, [x17, #+1904] +ldr q13, [x0, #864] +ldr q11, [x0, #880] +ldr q8, [x0, #832] +ldr q20, [x0, #848] +sqrdmulh v6.4S, v13.4S, v21.s[0] +mul v13.4S, v13.4S,v3.s[0] +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +sqrdmulh v13.4S, v11.4S, v21.s[0] +mul v11.4S, v11.4S,v3.s[0] +mla v11.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +sqrdmulh v11.4S, v20.4S, v21.s[1] +mul v20.4S, v20.4S,v3.s[1] +mla v20.4S, v11.4S, v31.s[0] +sub v11.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +sqrdmulh v20.4S, v13.4S, v21.s[2] +mul v13.4S, v13.4S,v3.s[2] +mla v13.4S, v20.4S, v31.s[0] +sub v20.4s, v6.4s, v13.4s +add v6.4s, v6.4s, v13.4s +trn1 v13.4S, v8.4S, v11.4S +trn2 v7.4S, v8.4S, v11.4S +trn1 v5.4S, v6.4S, v20.4S +trn2 v4.4S, v6.4S, v20.4S +trn2 v6.2D, v13.2D, v5.2D +trn2 v20.2D, v7.2D, v4.2D +trn1 v8.2D, v13.2D, v5.2D +trn1 v11.2D, v7.2D, v4.2D +sqrdmulh v4.4S, v6.4S, v19.4S +mul v6.4S, v6.4S,v9.4S +mla v6.4S, v4.4S, v31.s[0] +sub 
v4.4s, v8.4s, v6.4s +add v8.4s, v8.4s, v6.4s +sqrdmulh v6.4S, v20.4S, v19.4S +mul v20.4S, v20.4S,v9.4S +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v11.4s, v20.4s +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v11.4S, v16.4S +mul v11.4S, v11.4S,v14.4S +mla v11.4S, v20.4S, v31.s[0] +sub v20.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +sqrdmulh v11.4S, v6.4S, v17.4S +mul v6.4S, v6.4S,v18.4S +mla v6.4S, v11.4S, v31.s[0] +sub v11.4s, v4.4s, v6.4s +add v4.4s, v4.4s, v6.4s +str q8, [x0, #832] +str q20, [x0, #848] +str q4, [x0, #864] +str q11, [x0, #880] +ldr q11, [x17, #+1920] +ldr q4, [x17, #+1936] +ldr q20, [x17, #+1952] +ldr q8, [x17, #+1968] +ldr q6, [x17, #+1984] +ldr q7, [x17, #+2000] +ldr q5, [x17, #+2016] +ldr q13, [x17, #+2032] +ldr q17, [x0, #928] +ldr q18, [x0, #944] +ldr q16, [x0, #896] +ldr q14, [x0, #912] +sqrdmulh v19.4S, v17.4S, v4.s[0] +mul v17.4S, v17.4S,v11.s[0] +mla v17.4S, v19.4S, v31.s[0] +sub v19.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v18.4S, v4.s[0] +mul v18.4S, v18.4S,v11.s[0] +mla v18.4S, v17.4S, v31.s[0] +sub v17.4s, v14.4s, v18.4s +add v14.4s, v14.4s, v18.4s +sqrdmulh v18.4S, v14.4S, v4.s[1] +mul v14.4S, v14.4S,v11.s[1] +mla v14.4S, v18.4S, v31.s[0] +sub v18.4s, v16.4s, v14.4s +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v17.4S, v4.s[2] +mul v17.4S, v17.4S,v11.s[2] +mla v17.4S, v14.4S, v31.s[0] +sub v14.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +trn1 v17.4S, v16.4S, v18.4S +trn2 v9.4S, v16.4S, v18.4S +trn1 v21.4S, v19.4S, v14.4S +trn2 v3.4S, v19.4S, v14.4S +trn2 v19.2D, v17.2D, v21.2D +trn2 v14.2D, v9.2D, v3.2D +trn1 v16.2D, v17.2D, v21.2D +trn1 v18.2D, v9.2D, v3.2D +sqrdmulh v3.4S, v19.4S, v8.4S +mul v19.4S, v19.4S,v20.4S +mla v19.4S, v3.4S, v31.s[0] +sub v3.4s, v16.4s, v19.4s +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v8.4S +mul v14.4S, v14.4S,v20.4S +mla v14.4S, v19.4S, v31.s[0] +sub v19.4s, v18.4s, v14.4s +add v18.4s, v18.4s, v14.4s +sqrdmulh v14.4S, v18.4S, v7.4S +mul v18.4S, v18.4S,v6.4S +mla v18.4S, 
v14.4S, v31.s[0] +sub v14.4s, v16.4s, v18.4s +add v16.4s, v16.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v13.4S +mul v19.4S, v19.4S,v5.4S +mla v19.4S, v18.4S, v31.s[0] +sub v18.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +str q16, [x0, #896] +str q14, [x0, #912] +str q3, [x0, #928] +str q18, [x0, #944] +ldr q18, [x17, #+2048] +ldr q3, [x17, #+2064] +ldr q14, [x17, #+2080] +ldr q16, [x17, #+2096] +ldr q19, [x17, #+2112] +ldr q9, [x17, #+2128] +ldr q21, [x17, #+2144] +ldr q17, [x17, #+2160] +ldr q13, [x0, #992] +ldr q5, [x0, #1008] +ldr q7, [x0, #960] +ldr q6, [x0, #976] +sqrdmulh v8.4S, v13.4S, v3.s[0] +mul v13.4S, v13.4S,v18.s[0] +mla v13.4S, v8.4S, v31.s[0] +sub v8.4s, v7.4s, v13.4s +add v7.4s, v7.4s, v13.4s +sqrdmulh v13.4S, v5.4S, v3.s[0] +mul v5.4S, v5.4S,v18.s[0] +mla v5.4S, v13.4S, v31.s[0] +sub v13.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v6.4S, v3.s[1] +mul v6.4S, v6.4S,v18.s[1] +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v7.4s, v6.4s +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v13.4S, v3.s[2] +mul v13.4S, v13.4S,v18.s[2] +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +trn1 v13.4S, v7.4S, v5.4S +trn2 v20.4S, v7.4S, v5.4S +trn1 v4.4S, v8.4S, v6.4S +trn2 v11.4S, v8.4S, v6.4S +trn2 v8.2D, v13.2D, v4.2D +trn2 v6.2D, v20.2D, v11.2D +trn1 v7.2D, v13.2D, v4.2D +trn1 v5.2D, v20.2D, v11.2D +sqrdmulh v11.4S, v8.4S, v16.4S +mul v8.4S, v8.4S,v14.4S +mla v8.4S, v11.4S, v31.s[0] +sub v11.4s, v7.4s, v8.4s +add v7.4s, v7.4s, v8.4s +sqrdmulh v8.4S, v6.4S, v16.4S +mul v6.4S, v6.4S,v14.4S +mla v6.4S, v8.4S, v31.s[0] +sub v8.4s, v5.4s, v6.4s +add v5.4s, v5.4s, v6.4s +sqrdmulh v6.4S, v5.4S, v9.4S +mul v5.4S, v5.4S,v19.4S +mla v5.4S, v6.4S, v31.s[0] +sub v6.4s, v7.4s, v5.4s +add v7.4s, v7.4s, v5.4s +sqrdmulh v5.4S, v8.4S, v17.4S +mul v8.4S, v8.4S,v21.4S +mla v8.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v8.4s +add v11.4s, v11.4s, v8.4s +str q7, [x0, #960] +str q6, [x0, #976] +str q11, [x0, #992] +str q5, [x0, #1008] +// Restore NEON vector 
registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_2_0.s b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_2_0.s new file mode 100644 index 0000000..1f606a8 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_2_0.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 
1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 
6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 
+.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, 
block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 
31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 
26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 
+.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // 
Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 
28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 
608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_2_0 +.global _ntt_u32_full_neon_asm_var_4_4_2_0 +ntt_u32_full_neon_asm_var_4_4_2_0: +_ntt_u32_full_neon_asm_var_4_4_2_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +sqrdmulh v2.4S, v22.4S, v29.s[0] +ldr q1, [x0, #544] +mul v22.4S, v22.4S,v30.s[0] +ldr q0, [x0, #608] +sqrdmulh v15.4S, v21.4S, v29.s[0] +ldr q14, [x0, #672] +mul v21.4S, v21.4S,v30.s[0] +ldr q13, [x0, #736] +mla v22.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q12, [x0, #32] +mla v21.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q11, [x0, #96] +mla v20.4S, v2.4S, v31.s[0] +sub v2.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v29.s[0] +ldr q10, 
[x0, #160] +mul v1.4S, v1.4S,v30.s[0] +mla v19.4S, v22.4S, v31.s[0] +sub v22.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v29.s[0] +ldr q9, [x0, #224] +mul v0.4S, v0.4S,v30.s[0] +mla v1.4S, v21.4S, v31.s[0] +sub v21.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +mla v0.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v1.4s +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +mla v14.4S, v19.4S, v31.s[0] +sub v19.4s, v11.4s, v0.4s +add v11.4s, v11.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +mla v13.4S, v1.4S, v31.s[0] +sub v1.4s, v10.4s, v14.4s +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v16.4S, v0.4S, v31.s[0] +sub v0.4s, v9.4s, v13.4s +add v9.4s, v9.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +mla v3.4S, v14.4S, v31.s[0] +sub v14.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +mla v18.4S, v13.4S, v31.s[0] +sub v13.4s, v9.4s, v3.4s +add v9.4s, v9.4s, v3.4s +sqrdmulh v3.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v12.4s, v18.4s +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +mla v22.4S, v3.4S, v31.s[0] +sub v3.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +mla v15.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +sqrdmulh v21.4S, v10.4S, v27.s[0] +mul v10.4S, v10.4S,v28.s[0] +mla v2.4S, v22.4S, v31.s[0] +sub v22.4s, v20.4s, v15.4s +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v9.4S, v27.s[0] +mul v9.4S, v9.4S,v28.s[0] +mla v10.4S, v21.4S, v31.s[0] +sub 
v21.4s, v19.4s, v2.4s +add v19.4s, v19.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v27.s[1] +mul v14.4S, v14.4S,v28.s[1] +mla v9.4S, v15.4S, v31.s[0] +sub v15.4s, v12.4s, v10.4s +add v12.4s, v12.4s, v10.4s +sqrdmulh v10.4S, v13.4S, v27.s[1] +mul v13.4S, v13.4S,v28.s[1] +mla v14.4S, v2.4S, v31.s[0] +sub v2.4s, v11.4s, v9.4s +add v11.4s, v11.4s, v9.4s +sqrdmulh v9.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +mla v13.4S, v10.4S, v31.s[0] +sub v10.4s, v16.4s, v14.4s +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v0.4S, v27.s[2] +mul v0.4S, v0.4S,v28.s[2] +mla v1.4S, v9.4S, v31.s[0] +sub v9.4s, v3.4s, v13.4s +add v3.4s, v3.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +mla v0.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v1.4s +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +mla v18.4S, v13.4S, v31.s[0] +sub v13.4s, v19.4s, v0.4s +add v19.4s, v19.4s, v0.4s +sqrdmulh v0.4S, v11.4S, v25.s[0] +mul v11.4S, v11.4S,v26.s[0] +mla v17.4S, v1.4S, v31.s[0] +sub v1.4s, v22.4s, v18.4s +add v22.4s, v22.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v25.s[1] +mul v2.4S, v2.4S,v26.s[1] +mla v11.4S, v0.4S, v31.s[0] +sub v0.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v25.s[2] +mul v3.4S, v3.4S,v26.s[2] +mla v2.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v11.4s +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v9.4S, v25.s[3] +mul v9.4S, v9.4S,v26.s[3] +mla v3.4S, v17.4S, v31.s[0] +sub v17.4s, v15.4s, v2.4s +add v15.4s, v15.4s, v2.4s +str q12, [x0, #32] +sqrdmulh v12.4S, v19.4S, v23.s[0] +str q18, [x0, #96] +mul v19.4S, v19.4S,v24.s[0] +ldr q18, [x0, #816] +mla v9.4S, v11.4S, v31.s[0] +ldr q11, [x0, #880] +sub v2.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +str q15, [x0, #160] +sqrdmulh v15.4S, v13.4S, v23.s[1] +str q17, [x0, #224] +mul v13.4S, v13.4S,v24.s[1] +ldr q17, [x0, #944] +mla v19.4S, v12.4S, v31.s[0] +ldr q12, [x0, #1008] +sub v3.4s, v10.4s, v9.4s +add v10.4s, v10.4s, v9.4s +str q16, [x0, #288] +sqrdmulh 
v16.4S, v21.4S, v23.s[2] +str q2, [x0, #352] +mul v21.4S, v21.4S,v24.s[2] +ldr q2, [x0, #304] +mla v13.4S, v15.4S, v31.s[0] +ldr q15, [x0, #368] +sub v9.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +str q10, [x0, #416] +sqrdmulh v10.4S, v0.4S, v23.s[3] +str q3, [x0, #480] +mul v0.4S, v0.4S,v24.s[3] +ldr q3, [x0, #432] +mla v21.4S, v16.4S, v31.s[0] +ldr q16, [x0, #496] +sub v19.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +str q20, [x0, #544] +sqrdmulh v20.4S, v18.4S, v29.s[0] +str q9, [x0, #608] +ldr q9, [x0, #560] +mul v18.4S, v18.4S,v30.s[0] +ldr q13, [x0, #624] +mla v0.4S, v10.4S, v31.s[0] +sub v10.4s, v22.4s, v21.4s +add v22.4s, v22.4s, v21.4s +str q14, [x0, #672] +sqrdmulh v14.4S, v11.4S, v29.s[0] +str q19, [x0, #736] +ldr q19, [x0, #688] +mul v11.4S, v11.4S,v30.s[0] +ldr q21, [x0, #752] +mla v18.4S, v20.4S, v31.s[0] +sub v20.4s, v1.4s, v0.4s +add v1.4s, v1.4s, v0.4s +str q22, [x0, #800] +sqrdmulh v22.4S, v17.4S, v29.s[0] +str q10, [x0, #864] +mul v17.4S, v17.4S,v30.s[0] +ldr q10, [x0, #48] +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v2.4s, v18.4s +add v2.4s, v2.4s, v18.4s +str q1, [x0, #928] +sqrdmulh v1.4S, v12.4S, v29.s[0] +str q20, [x0, #992] +mul v12.4S, v12.4S,v30.s[0] +ldr q20, [x0, #112] +mla v17.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v11.4s +add v15.4s, v15.4s, v11.4s +sqrdmulh v11.4S, v9.4S, v29.s[0] +ldr q18, [x0, #176] +mul v9.4S, v9.4S,v30.s[0] +mla v12.4S, v1.4S, v31.s[0] +sub v1.4s, v3.4s, v17.4s +add v3.4s, v3.4s, v17.4s +sqrdmulh v17.4S, v13.4S, v29.s[0] +ldr q0, [x0, #240] +mul v13.4S, v13.4S,v30.s[0] +mla v9.4S, v11.4S, v31.s[0] +sub v11.4s, v16.4s, v12.4s +add v16.4s, v16.4s, v12.4s +sqrdmulh v12.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +mla v13.4S, v17.4S, v31.s[0] +sub v17.4s, v10.4s, v9.4s +add v10.4s, v10.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +mla v19.4S, v12.4S, v31.s[0] +sub v12.4s, v20.4s, v13.4s +add v20.4s, v20.4s, v13.4s +sqrdmulh v13.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] 
+mla v21.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v19.4s +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +mla v3.4S, v13.4S, v31.s[0] +sub v13.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +sqrdmulh v21.4S, v2.4S, v29.s[1] +mul v2.4S, v2.4S,v30.s[1] +mla v16.4S, v19.4S, v31.s[0] +sub v19.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +mla v2.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v16.4s +add v0.4s, v0.4s, v16.4s +sqrdmulh v16.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +mla v15.4S, v3.4S, v31.s[0] +sub v3.4s, v10.4s, v2.4s +add v10.4s, v10.4s, v2.4s +sqrdmulh v2.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +mla v1.4S, v16.4S, v31.s[0] +sub v16.4s, v20.4s, v15.4s +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +mla v11.4S, v2.4S, v31.s[0] +sub v2.4s, v9.4s, v1.4s +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +mla v14.4S, v15.4S, v31.s[0] +sub v15.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v18.4S, v27.s[0] +mul v18.4S, v18.4S,v28.s[0] +mla v22.4S, v1.4S, v31.s[0] +sub v1.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +sqrdmulh v14.4S, v0.4S, v27.s[0] +mul v0.4S, v0.4S,v28.s[0] +mla v18.4S, v11.4S, v31.s[0] +sub v11.4s, v12.4s, v22.4s +add v12.4s, v12.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +mla v0.4S, v14.4S, v31.s[0] +sub v14.4s, v10.4s, v18.4s +add v10.4s, v10.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v27.s[1] +mul v21.4S, v21.4S,v28.s[1] +mla v19.4S, v22.4S, v31.s[0] +sub v22.4s, v20.4s, v0.4s +add v20.4s, v20.4s, v0.4s +sqrdmulh v0.4S, v9.4S, v27.s[2] +mul v9.4S, v9.4S,v28.s[2] +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +mla v9.4S, v0.4S, v31.s[0] +sub v0.4s, v16.4s, v21.4s +add v16.4s, v16.4s, v21.4s +sqrdmulh 
v21.4S, v2.4S, v27.s[3] +mul v2.4S, v2.4S,v28.s[3] +mla v13.4S, v19.4S, v31.s[0] +sub v19.4s, v17.4s, v9.4s +add v17.4s, v17.4s, v9.4s +sqrdmulh v9.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +mla v2.4S, v21.4S, v31.s[0] +sub v21.4s, v12.4s, v13.4s +add v12.4s, v12.4s, v13.4s +sqrdmulh v13.4S, v20.4S, v25.s[0] +mul v20.4S, v20.4S,v26.s[0] +mla v15.4S, v9.4S, v31.s[0] +sub v9.4s, v1.4s, v2.4s +add v1.4s, v1.4s, v2.4s +sqrdmulh v2.4S, v22.4S, v25.s[1] +mul v22.4S, v22.4S,v26.s[1] +mla v20.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +mla v22.4S, v2.4S, v31.s[0] +sub v2.4s, v10.4s, v20.4s +add v10.4s, v10.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v25.s[3] +mul v0.4S, v0.4S,v26.s[3] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +str q10, [x0, #48] +sqrdmulh v10.4S, v12.4S, v23.s[0] +str q2, [x0, #112] +mul v12.4S, v12.4S,v24.s[0] +ldr q2, [x0, #768] +mla v0.4S, v20.4S, v31.s[0] +ldr q20, [x0, #832] +sub v22.4s, v3.4s, v16.4s +add v3.4s, v3.4s, v16.4s +str q14, [x0, #176] +sqrdmulh v14.4S, v21.4S, v23.s[1] +str q15, [x0, #240] +mul v21.4S, v21.4S,v24.s[1] +ldr q15, [x0, #896] +mla v12.4S, v10.4S, v31.s[0] +ldr q10, [x0, #960] +sub v16.4s, v18.4s, v0.4s +add v18.4s, v18.4s, v0.4s +str q3, [x0, #304] +sqrdmulh v3.4S, v11.4S, v23.s[2] +str q22, [x0, #368] +mul v11.4S, v11.4S,v24.s[2] +ldr q22, [x0, #256] +mla v21.4S, v14.4S, v31.s[0] +ldr q14, [x0, #320] +sub v0.4s, v17.4s, v12.4s +add v17.4s, v17.4s, v12.4s +str q18, [x0, #432] +sqrdmulh v18.4S, v13.4S, v23.s[3] +str q16, [x0, #496] +mul v13.4S, v13.4S,v24.s[3] +ldr q16, [x0, #384] +mla v11.4S, v3.4S, v31.s[0] +ldr q3, [x0, #448] +sub v12.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +str q17, [x0, #560] +sqrdmulh v17.4S, v2.4S, v29.s[0] +str q0, [x0, #624] +ldr q0, [x0, #512] +mul v2.4S, v2.4S,v30.s[0] +ldr q21, [x0, #576] +mla v13.4S, v18.4S, v31.s[0] +sub v18.4s, v1.4s, v11.4s +add 
v1.4s, v1.4s, v11.4s +str q19, [x0, #688] +sqrdmulh v19.4S, v20.4S, v29.s[0] +str q12, [x0, #752] +ldr q12, [x0, #640] +mul v20.4S, v20.4S,v30.s[0] +ldr q11, [x0, #704] +mla v2.4S, v17.4S, v31.s[0] +sub v17.4s, v9.4s, v13.4s +add v9.4s, v9.4s, v13.4s +str q1, [x0, #816] +sqrdmulh v1.4S, v15.4S, v29.s[0] +str q18, [x0, #880] +mul v15.4S, v15.4S,v30.s[0] +ldr q18, [x0, #0] +mla v20.4S, v19.4S, v31.s[0] +sub v19.4s, v22.4s, v2.4s +add v22.4s, v22.4s, v2.4s +str q9, [x0, #944] +sqrdmulh v9.4S, v10.4S, v29.s[0] +str q17, [x0, #1008] +mul v10.4S, v10.4S,v30.s[0] +ldr q17, [x0, #64] +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v14.4s, v20.4s +add v14.4s, v14.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v29.s[0] +ldr q2, [x0, #128] +mul v0.4S, v0.4S,v30.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v16.4s, v15.4s +add v16.4s, v16.4s, v15.4s +sqrdmulh v15.4S, v21.4S, v29.s[0] +ldr q13, [x0, #192] +mul v21.4S, v21.4S,v30.s[0] +mla v0.4S, v20.4S, v31.s[0] +sub v20.4s, v3.4s, v10.4s +add v3.4s, v3.4s, v10.4s +sqrdmulh v10.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +mla v21.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v0.4s +add v18.4s, v18.4s, v0.4s +sqrdmulh v0.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +mla v12.4S, v10.4S, v31.s[0] +sub v10.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +mla v11.4S, v0.4S, v31.s[0] +sub v0.4s, v2.4s, v12.4s +add v2.4s, v2.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v16.4S, v21.4S, v31.s[0] +sub v21.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v29.s[1] +mul v22.4S, v22.4S,v30.s[1] +mla v3.4S, v12.4S, v31.s[0] +sub v12.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v13.4s, v3.4s +add v13.4s, v13.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +mla v14.4S, v16.4S, v31.s[0] +sub v16.4s, v18.4s, v22.4s 
+add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +mla v9.4S, v3.4S, v31.s[0] +sub v3.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +sqrdmulh v14.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +mla v20.4S, v22.4S, v31.s[0] +sub v22.4s, v0.4s, v9.4s +add v0.4s, v0.4s, v9.4s +sqrdmulh v9.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +mla v19.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +mla v1.4S, v9.4S, v31.s[0] +sub v9.4s, v15.4s, v19.4s +add v15.4s, v15.4s, v19.4s +sqrdmulh v19.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +mla v2.4S, v20.4S, v31.s[0] +sub v20.4s, v10.4s, v1.4s +add v10.4s, v10.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v27.s[1] +mul v12.4S, v12.4S,v28.s[1] +mla v13.4S, v19.4S, v31.s[0] +sub v19.4s, v18.4s, v2.4s +add v18.4s, v18.4s, v2.4s +sqrdmulh v2.4S, v11.4S, v27.s[1] +mul v11.4S, v11.4S,v28.s[1] +mla v12.4S, v1.4S, v31.s[0] +sub v1.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +sqrdmulh v13.4S, v0.4S, v27.s[2] +mul v0.4S, v0.4S,v28.s[2] +mla v11.4S, v2.4S, v31.s[0] +sub v2.4s, v16.4s, v12.4s +add v16.4s, v16.4s, v12.4s +sqrdmulh v12.4S, v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +mla v0.4S, v13.4S, v31.s[0] +sub v13.4s, v3.4s, v11.4s +add v3.4s, v3.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v27.s[3] +mul v22.4S, v22.4S,v28.s[3] +mla v21.4S, v12.4S, v31.s[0] +sub v12.4s, v15.4s, v0.4s +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v14.4S, v27.s[3] +mul v14.4S, v14.4S,v28.s[3] +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v25.s[0] +mul v17.4S, v17.4S,v26.s[0] +mla v14.4S, v0.4S, v31.s[0] +sub v0.4s, v9.4s, v22.4s +add v9.4s, v9.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v25.s[1] +mul v1.4S, v1.4S,v26.s[1] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v14.4s +add v20.4s, v20.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v25.s[2] +mul v3.4S, v3.4S,v26.s[2] +mla 
v1.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v17.4s +add v18.4s, v18.4s, v17.4s +sqrdmulh v17.4S, v13.4S, v25.s[3] +mul v13.4S, v13.4S,v26.s[3] +mla v3.4S, v14.4S, v31.s[0] +sub v14.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +str q18, [x0, #0] +sqrdmulh v18.4S, v10.4S, v23.s[0] +str q22, [x0, #64] +mul v10.4S, v10.4S,v24.s[0] +ldr q22, [x0, #784] +mla v13.4S, v17.4S, v31.s[0] +ldr q17, [x0, #848] +sub v1.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +str q19, [x0, #128] +sqrdmulh v19.4S, v11.4S, v23.s[1] +str q14, [x0, #192] +mul v11.4S, v11.4S,v24.s[1] +ldr q14, [x0, #912] +mla v10.4S, v18.4S, v31.s[0] +ldr q18, [x0, #976] +sub v3.4s, v2.4s, v13.4s +add v2.4s, v2.4s, v13.4s +str q16, [x0, #256] +sqrdmulh v16.4S, v20.4S, v23.s[2] +str q1, [x0, #320] +mul v20.4S, v20.4S,v24.s[2] +ldr q1, [x0, #272] +mla v11.4S, v19.4S, v31.s[0] +ldr q19, [x0, #336] +sub v13.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +str q2, [x0, #384] +sqrdmulh v2.4S, v21.4S, v23.s[3] +str q3, [x0, #448] +mul v21.4S, v21.4S,v24.s[3] +ldr q3, [x0, #400] +mla v20.4S, v16.4S, v31.s[0] +ldr q16, [x0, #464] +sub v10.4s, v12.4s, v11.4s +add v12.4s, v12.4s, v11.4s +str q15, [x0, #512] +sqrdmulh v15.4S, v22.4S, v29.s[0] +str q13, [x0, #576] +ldr q13, [x0, #528] +mul v22.4S, v22.4S,v30.s[0] +ldr q11, [x0, #592] +mla v21.4S, v2.4S, v31.s[0] +sub v2.4s, v9.4s, v20.4s +add v9.4s, v9.4s, v20.4s +str q12, [x0, #640] +sqrdmulh v12.4S, v17.4S, v29.s[0] +str q10, [x0, #704] +ldr q10, [x0, #656] +mul v17.4S, v17.4S,v30.s[0] +ldr q20, [x0, #720] +mla v22.4S, v15.4S, v31.s[0] +sub v15.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +str q9, [x0, #768] +sqrdmulh v9.4S, v14.4S, v29.s[0] +str q2, [x0, #832] +mul v14.4S, v14.4S,v30.s[0] +ldr q2, [x0, #16] +mla v17.4S, v12.4S, v31.s[0] +sub v12.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +str q0, [x0, #896] +sqrdmulh v0.4S, v18.4S, v29.s[0] +str q15, [x0, #960] +mul v18.4S, v18.4S,v30.s[0] +ldr q15, [x0, #80] +mla v14.4S, v9.4S, v31.s[0] +sub v9.4s, v19.4s, v17.4s +add 
v19.4s, v19.4s, v17.4s +sqrdmulh v17.4S, v13.4S, v29.s[0] +ldr q22, [x0, #144] +mul v13.4S, v13.4S,v30.s[0] +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v3.4s, v14.4s +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v29.s[0] +ldr q21, [x0, #208] +mul v11.4S, v11.4S,v30.s[0] +mla v13.4S, v17.4S, v31.s[0] +sub v17.4s, v16.4s, v18.4s +add v16.4s, v16.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v2.4s, v13.4s +add v2.4s, v2.4s, v13.4s +sqrdmulh v13.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v10.4S, v18.4S, v31.s[0] +sub v18.4s, v15.4s, v11.4s +add v15.4s, v15.4s, v11.4s +sqrdmulh v11.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v20.4S, v13.4S, v31.s[0] +sub v13.4s, v22.4s, v10.4s +add v22.4s, v22.4s, v10.4s +sqrdmulh v10.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +mla v3.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v1.4S, v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +mla v16.4S, v10.4S, v31.s[0] +sub v10.4s, v22.4s, v3.4s +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +mla v1.4S, v20.4S, v31.s[0] +sub v20.4s, v21.4s, v16.4s +add v21.4s, v21.4s, v16.4s +sqrdmulh v16.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +mla v19.4S, v3.4S, v31.s[0] +sub v3.4s, v2.4s, v1.4s +add v2.4s, v2.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +mla v0.4S, v16.4S, v31.s[0] +sub v16.4s, v15.4s, v19.4s +add v15.4s, v15.4s, v19.4s +sqrdmulh v19.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +mla v17.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v0.4s +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +mla v12.4S, v19.4S, v31.s[0] +sub v19.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +mla v9.4S, v0.4S, v31.s[0] +sub v0.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v21.4S, 
v27.s[0] +mul v21.4S, v21.4S,v28.s[0] +mla v22.4S, v17.4S, v31.s[0] +sub v17.4s, v18.4s, v9.4s +add v18.4s, v18.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v27.s[1] +mul v10.4S, v10.4S,v28.s[1] +mla v21.4S, v12.4S, v31.s[0] +sub v12.4s, v2.4s, v22.4s +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v15.4s, v21.4s +add v15.4s, v15.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +mla v20.4S, v22.4S, v31.s[0] +sub v22.4s, v3.4s, v10.4s +add v3.4s, v3.4s, v10.4s +sqrdmulh v10.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v1.4S, v27.s[3] +mul v1.4S, v1.4S,v28.s[3] +mla v11.4S, v10.4S, v31.s[0] +sub v10.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +sqrdmulh v13.4S, v19.4S, v27.s[3] +mul v19.4S, v19.4S,v28.s[3] +mla v1.4S, v20.4S, v31.s[0] +sub v20.4s, v18.4s, v11.4s +add v18.4s, v18.4s, v11.4s +sqrdmulh v11.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +mla v19.4S, v13.4S, v31.s[0] +sub v13.4s, v0.4s, v1.4s +add v0.4s, v0.4s, v1.4s +sqrdmulh v1.4S, v9.4S, v25.s[1] +mul v9.4S, v9.4S,v26.s[1] +mla v15.4S, v11.4S, v31.s[0] +sub v11.4s, v17.4s, v19.4s +add v17.4s, v17.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +mla v9.4S, v1.4S, v31.s[0] +sub v1.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +sqrdmulh v15.4S, v21.4S, v25.s[3] +mul v21.4S, v21.4S,v26.s[3] +mla v16.4S, v19.4S, v31.s[0] +sub v19.4s, v12.4s, v9.4s +add v12.4s, v12.4s, v9.4s +str q2, [x0, #16] +sqrdmulh v2.4S, v18.4S, v23.s[0] +str q1, [x0, #80] +mul v18.4S, v18.4S,v24.s[0] +mla v21.4S, v15.4S, v31.s[0] +sub v15.4s, v3.4s, v16.4s +add v3.4s, v3.4s, v16.4s +str q12, [x0, #144] +sqrdmulh v12.4S, v20.4S, v23.s[1] +str q19, [x0, #208] +mul v20.4S, v20.4S,v24.s[1] +mla v18.4S, v2.4S, v31.s[0] +sub v2.4s, v22.4s, v21.4s +add v22.4s, v22.4s, v21.4s +str q3, [x0, #272] +sqrdmulh 
v3.4S, v17.4S, v23.s[2] +str q15, [x0, #336] +mul v17.4S, v17.4S,v24.s[2] +mla v20.4S, v12.4S, v31.s[0] +sub v12.4s, v14.4s, v18.4s +add v14.4s, v14.4s, v18.4s +str q22, [x0, #400] +sqrdmulh v22.4S, v11.4S, v23.s[3] +str q2, [x0, #464] +mul v11.4S, v11.4S,v24.s[3] +mla v17.4S, v3.4S, v31.s[0] +sub v3.4s, v10.4s, v20.4s +add v10.4s, v10.4s, v20.4s +str q14, [x0, #528] +str q12, [x0, #592] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +str q10, [x0, #656] +str q3, [x0, #720] +sub v3.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +str q0, [x0, #784] +str q22, [x0, #848] +str q13, [x0, #912] +str q3, [x0, #976] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q6, [x17, #+160] +ldr q7, [x17, #+176] +ldr q8, [x17, #+192] +ldr q9, [x17, #+208] +ldr q1, [x17, #+224] +ldr q16, [x17, #+240] +ldr q19, [x0, #32] +ldr q21, [x0, #48] +ldr q15, [x0, #0] +ldr q18, [x0, #16] +sqrdmulh v2.4S, v19.4S, v5.s[0] +mul v19.4S, v19.4S,v4.s[0] +mla v19.4S, v2.4S, v31.s[0] +sub v2.4s, v15.4s, v19.4s +add v15.4s, v15.4s, v19.4s +sqrdmulh v19.4S, v21.4S, v5.s[0] +mul v21.4S, v21.4S,v4.s[0] +mla v21.4S, v19.4S, v31.s[0] +sub v19.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v18.4S, v5.s[1] +mul v18.4S, v18.4S,v4.s[1] +mla v18.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v5.s[2] +mul v19.4S, v19.4S,v4.s[2] +mla v19.4S, v18.4S, v31.s[0] +sub v18.4s, v2.4s, v19.4s +add v2.4s, v2.4s, v19.4s +trn1 v19.4S, v15.4S, v21.4S +trn2 v20.4S, v15.4S, v21.4S +trn1 v14.4S, v2.4S, v18.4S +trn2 v12.4S, v2.4S, v18.4S +trn2 v2.2D, v19.2D, v14.2D +trn2 v18.2D, v20.2D, v12.2D +trn1 v15.2D, v19.2D, v14.2D +trn1 v21.2D, v20.2D, v12.2D +sqrdmulh v12.4S, v2.4S, v7.4S +mul v2.4S, v2.4S,v6.4S +mla v2.4S, v12.4S, v31.s[0] +sub v12.4s, v15.4s, v2.4s +add v15.4s, v15.4s, v2.4s +sqrdmulh v2.4S, v18.4S, v7.4S +mul v18.4S, v18.4S,v6.4S +mla v18.4S, v2.4S, v31.s[0] +sub v2.4s, v21.4s, v18.4s +add v21.4s, v21.4s, 
v18.4s +sqrdmulh v18.4S, v21.4S, v9.4S +mul v21.4S, v21.4S,v8.4S +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v15.4s, v21.4s +add v15.4s, v15.4s, v21.4s +sqrdmulh v21.4S, v2.4S, v16.4S +mul v2.4S, v2.4S,v1.4S +mla v2.4S, v21.4S, v31.s[0] +sub v21.4s, v12.4s, v2.4s +add v12.4s, v12.4s, v2.4s +str q15, [x0, #0] +str q18, [x0, #16] +str q12, [x0, #32] +str q21, [x0, #48] +ldr q21, [x17, #+256] +ldr q12, [x17, #+272] +ldr q18, [x17, #+288] +ldr q15, [x17, #+304] +ldr q2, [x17, #+320] +ldr q20, [x17, #+336] +ldr q14, [x17, #+352] +ldr q19, [x17, #+368] +ldr q16, [x0, #96] +ldr q1, [x0, #112] +ldr q9, [x0, #64] +ldr q8, [x0, #80] +sqrdmulh v7.4S, v16.4S, v12.s[0] +mul v16.4S, v16.4S,v21.s[0] +mla v16.4S, v7.4S, v31.s[0] +sub v7.4s, v9.4s, v16.4s +add v9.4s, v9.4s, v16.4s +sqrdmulh v16.4S, v1.4S, v12.s[0] +mul v1.4S, v1.4S,v21.s[0] +mla v1.4S, v16.4S, v31.s[0] +sub v16.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v12.s[1] +mul v8.4S, v8.4S,v21.s[1] +mla v8.4S, v1.4S, v31.s[0] +sub v1.4s, v9.4s, v8.4s +add v9.4s, v9.4s, v8.4s +sqrdmulh v8.4S, v16.4S, v12.s[2] +mul v16.4S, v16.4S,v21.s[2] +mla v16.4S, v8.4S, v31.s[0] +sub v8.4s, v7.4s, v16.4s +add v7.4s, v7.4s, v16.4s +trn1 v16.4S, v9.4S, v1.4S +trn2 v6.4S, v9.4S, v1.4S +trn1 v5.4S, v7.4S, v8.4S +trn2 v4.4S, v7.4S, v8.4S +trn2 v7.2D, v16.2D, v5.2D +trn2 v8.2D, v6.2D, v4.2D +trn1 v9.2D, v16.2D, v5.2D +trn1 v1.2D, v6.2D, v4.2D +sqrdmulh v4.4S, v7.4S, v15.4S +mul v7.4S, v7.4S,v18.4S +mla v7.4S, v4.4S, v31.s[0] +sub v4.4s, v9.4s, v7.4s +add v9.4s, v9.4s, v7.4s +sqrdmulh v7.4S, v8.4S, v15.4S +mul v8.4S, v8.4S,v18.4S +mla v8.4S, v7.4S, v31.s[0] +sub v7.4s, v1.4s, v8.4s +add v1.4s, v1.4s, v8.4s +sqrdmulh v8.4S, v1.4S, v20.4S +mul v1.4S, v1.4S,v2.4S +mla v1.4S, v8.4S, v31.s[0] +sub v8.4s, v9.4s, v1.4s +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v7.4S, v19.4S +mul v7.4S, v7.4S,v14.4S +mla v7.4S, v1.4S, v31.s[0] +sub v1.4s, v4.4s, v7.4s +add v4.4s, v4.4s, v7.4s +str q9, [x0, #64] +str q8, [x0, #80] +str q4, [x0, #96] 
+str q1, [x0, #112] +ldr q1, [x17, #+384] +ldr q4, [x17, #+400] +ldr q8, [x17, #+416] +ldr q9, [x17, #+432] +ldr q7, [x17, #+448] +ldr q6, [x17, #+464] +ldr q5, [x17, #+480] +ldr q16, [x17, #+496] +ldr q19, [x0, #160] +ldr q14, [x0, #176] +ldr q20, [x0, #128] +ldr q2, [x0, #144] +sqrdmulh v15.4S, v19.4S, v4.s[0] +mul v19.4S, v19.4S,v1.s[0] +mla v19.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v4.s[0] +mul v14.4S, v14.4S,v1.s[0] +mla v14.4S, v19.4S, v31.s[0] +sub v19.4s, v2.4s, v14.4s +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v4.s[1] +mul v2.4S, v2.4S,v1.s[1] +mla v2.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v2.4s +add v20.4s, v20.4s, v2.4s +sqrdmulh v2.4S, v19.4S, v4.s[2] +mul v19.4S, v19.4S,v1.s[2] +mla v19.4S, v2.4S, v31.s[0] +sub v2.4s, v15.4s, v19.4s +add v15.4s, v15.4s, v19.4s +trn1 v19.4S, v20.4S, v14.4S +trn2 v18.4S, v20.4S, v14.4S +trn1 v12.4S, v15.4S, v2.4S +trn2 v21.4S, v15.4S, v2.4S +trn2 v15.2D, v19.2D, v12.2D +trn2 v2.2D, v18.2D, v21.2D +trn1 v20.2D, v19.2D, v12.2D +trn1 v14.2D, v18.2D, v21.2D +sqrdmulh v21.4S, v15.4S, v9.4S +mul v15.4S, v15.4S,v8.4S +mla v15.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v15.4s +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v9.4S +mul v2.4S, v2.4S,v8.4S +mla v2.4S, v15.4S, v31.s[0] +sub v15.4s, v14.4s, v2.4s +add v14.4s, v14.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v6.4S +mul v14.4S, v14.4S,v7.4S +mla v14.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v14.4s +add v20.4s, v20.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v16.4S +mul v15.4S, v15.4S,v5.4S +mla v15.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +str q20, [x0, #128] +str q2, [x0, #144] +str q21, [x0, #160] +str q14, [x0, #176] +ldr q14, [x17, #+512] +ldr q21, [x17, #+528] +ldr q2, [x17, #+544] +ldr q20, [x17, #+560] +ldr q15, [x17, #+576] +ldr q18, [x17, #+592] +ldr q12, [x17, #+608] +ldr q19, [x17, #+624] +ldr q16, [x0, #224] +ldr q5, [x0, #240] +ldr q6, [x0, #192] +ldr q7, [x0, 
#208] +sqrdmulh v9.4S, v16.4S, v21.s[0] +mul v16.4S, v16.4S,v14.s[0] +mla v16.4S, v9.4S, v31.s[0] +sub v9.4s, v6.4s, v16.4s +add v6.4s, v6.4s, v16.4s +sqrdmulh v16.4S, v5.4S, v21.s[0] +mul v5.4S, v5.4S,v14.s[0] +mla v5.4S, v16.4S, v31.s[0] +sub v16.4s, v7.4s, v5.4s +add v7.4s, v7.4s, v5.4s +sqrdmulh v5.4S, v7.4S, v21.s[1] +mul v7.4S, v7.4S,v14.s[1] +mla v7.4S, v5.4S, v31.s[0] +sub v5.4s, v6.4s, v7.4s +add v6.4s, v6.4s, v7.4s +sqrdmulh v7.4S, v16.4S, v21.s[2] +mul v16.4S, v16.4S,v14.s[2] +mla v16.4S, v7.4S, v31.s[0] +sub v7.4s, v9.4s, v16.4s +add v9.4s, v9.4s, v16.4s +trn1 v16.4S, v6.4S, v5.4S +trn2 v8.4S, v6.4S, v5.4S +trn1 v4.4S, v9.4S, v7.4S +trn2 v1.4S, v9.4S, v7.4S +trn2 v9.2D, v16.2D, v4.2D +trn2 v7.2D, v8.2D, v1.2D +trn1 v6.2D, v16.2D, v4.2D +trn1 v5.2D, v8.2D, v1.2D +sqrdmulh v1.4S, v9.4S, v20.4S +mul v9.4S, v9.4S,v2.4S +mla v9.4S, v1.4S, v31.s[0] +sub v1.4s, v6.4s, v9.4s +add v6.4s, v6.4s, v9.4s +sqrdmulh v9.4S, v7.4S, v20.4S +mul v7.4S, v7.4S,v2.4S +mla v7.4S, v9.4S, v31.s[0] +sub v9.4s, v5.4s, v7.4s +add v5.4s, v5.4s, v7.4s +sqrdmulh v7.4S, v5.4S, v18.4S +mul v5.4S, v5.4S,v15.4S +mla v5.4S, v7.4S, v31.s[0] +sub v7.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v9.4S, v19.4S +mul v9.4S, v9.4S,v12.4S +mla v9.4S, v5.4S, v31.s[0] +sub v5.4s, v1.4s, v9.4s +add v1.4s, v1.4s, v9.4s +str q6, [x0, #192] +str q7, [x0, #208] +str q1, [x0, #224] +str q5, [x0, #240] +ldr q5, [x17, #+640] +ldr q1, [x17, #+656] +ldr q7, [x17, #+672] +ldr q6, [x17, #+688] +ldr q9, [x17, #+704] +ldr q8, [x17, #+720] +ldr q4, [x17, #+736] +ldr q16, [x17, #+752] +ldr q19, [x0, #288] +ldr q12, [x0, #304] +ldr q18, [x0, #256] +ldr q15, [x0, #272] +sqrdmulh v20.4S, v19.4S, v1.s[0] +mul v19.4S, v19.4S,v5.s[0] +mla v19.4S, v20.4S, v31.s[0] +sub v20.4s, v18.4s, v19.4s +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v12.4S, v1.s[0] +mul v12.4S, v12.4S,v5.s[0] +mla v12.4S, v19.4S, v31.s[0] +sub v19.4s, v15.4s, v12.4s +add v15.4s, v15.4s, v12.4s +sqrdmulh v12.4S, v15.4S, v1.s[1] +mul 
v15.4S, v15.4S,v5.s[1] +mla v15.4S, v12.4S, v31.s[0] +sub v12.4s, v18.4s, v15.4s +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v19.4S, v1.s[2] +mul v19.4S, v19.4S,v5.s[2] +mla v19.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +trn1 v19.4S, v18.4S, v12.4S +trn2 v2.4S, v18.4S, v12.4S +trn1 v21.4S, v20.4S, v15.4S +trn2 v14.4S, v20.4S, v15.4S +trn2 v20.2D, v19.2D, v21.2D +trn2 v15.2D, v2.2D, v14.2D +trn1 v18.2D, v19.2D, v21.2D +trn1 v12.2D, v2.2D, v14.2D +sqrdmulh v14.4S, v20.4S, v6.4S +mul v20.4S, v20.4S,v7.4S +mla v20.4S, v14.4S, v31.s[0] +sub v14.4s, v18.4s, v20.4s +add v18.4s, v18.4s, v20.4s +sqrdmulh v20.4S, v15.4S, v6.4S +mul v15.4S, v15.4S,v7.4S +mla v15.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v15.4s +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v12.4S, v8.4S +mul v12.4S, v12.4S,v9.4S +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v12.4s +add v18.4s, v18.4s, v12.4s +sqrdmulh v12.4S, v20.4S, v16.4S +mul v20.4S, v20.4S,v4.4S +mla v20.4S, v12.4S, v31.s[0] +sub v12.4s, v14.4s, v20.4s +add v14.4s, v14.4s, v20.4s +str q18, [x0, #256] +str q15, [x0, #272] +str q14, [x0, #288] +str q12, [x0, #304] +ldr q12, [x17, #+768] +ldr q14, [x17, #+784] +ldr q15, [x17, #+800] +ldr q18, [x17, #+816] +ldr q20, [x17, #+832] +ldr q2, [x17, #+848] +ldr q21, [x17, #+864] +ldr q19, [x17, #+880] +ldr q16, [x0, #352] +ldr q4, [x0, #368] +ldr q8, [x0, #320] +ldr q9, [x0, #336] +sqrdmulh v6.4S, v16.4S, v14.s[0] +mul v16.4S, v16.4S,v12.s[0] +mla v16.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v16.4s +add v8.4s, v8.4s, v16.4s +sqrdmulh v16.4S, v4.4S, v14.s[0] +mul v4.4S, v4.4S,v12.s[0] +mla v4.4S, v16.4S, v31.s[0] +sub v16.4s, v9.4s, v4.4s +add v9.4s, v9.4s, v4.4s +sqrdmulh v4.4S, v9.4S, v14.s[1] +mul v9.4S, v9.4S,v12.s[1] +mla v9.4S, v4.4S, v31.s[0] +sub v4.4s, v8.4s, v9.4s +add v8.4s, v8.4s, v9.4s +sqrdmulh v9.4S, v16.4S, v14.s[2] +mul v16.4S, v16.4S,v12.s[2] +mla v16.4S, v9.4S, v31.s[0] +sub v9.4s, v6.4s, v16.4s +add v6.4s, v6.4s, v16.4s +trn1 v16.4S, v8.4S, 
v4.4S +trn2 v7.4S, v8.4S, v4.4S +trn1 v1.4S, v6.4S, v9.4S +trn2 v5.4S, v6.4S, v9.4S +trn2 v6.2D, v16.2D, v1.2D +trn2 v9.2D, v7.2D, v5.2D +trn1 v8.2D, v16.2D, v1.2D +trn1 v4.2D, v7.2D, v5.2D +sqrdmulh v5.4S, v6.4S, v18.4S +mul v6.4S, v6.4S,v15.4S +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v8.4s, v6.4s +add v8.4s, v8.4s, v6.4s +sqrdmulh v6.4S, v9.4S, v18.4S +mul v9.4S, v9.4S,v15.4S +mla v9.4S, v6.4S, v31.s[0] +sub v6.4s, v4.4s, v9.4s +add v4.4s, v4.4s, v9.4s +sqrdmulh v9.4S, v4.4S, v2.4S +mul v4.4S, v4.4S,v20.4S +mla v4.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v4.4s +add v8.4s, v8.4s, v4.4s +sqrdmulh v4.4S, v6.4S, v19.4S +mul v6.4S, v6.4S,v21.4S +mla v6.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v6.4s +add v5.4s, v5.4s, v6.4s +str q8, [x0, #320] +str q9, [x0, #336] +str q5, [x0, #352] +str q4, [x0, #368] +ldr q4, [x17, #+896] +ldr q5, [x17, #+912] +ldr q9, [x17, #+928] +ldr q8, [x17, #+944] +ldr q6, [x17, #+960] +ldr q7, [x17, #+976] +ldr q1, [x17, #+992] +ldr q16, [x17, #+1008] +ldr q19, [x0, #416] +ldr q21, [x0, #432] +ldr q2, [x0, #384] +ldr q20, [x0, #400] +sqrdmulh v18.4S, v19.4S, v5.s[0] +mul v19.4S, v19.4S,v4.s[0] +mla v19.4S, v18.4S, v31.s[0] +sub v18.4s, v2.4s, v19.4s +add v2.4s, v2.4s, v19.4s +sqrdmulh v19.4S, v21.4S, v5.s[0] +mul v21.4S, v21.4S,v4.s[0] +mla v21.4S, v19.4S, v31.s[0] +sub v19.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v20.4S, v5.s[1] +mul v20.4S, v20.4S,v4.s[1] +mla v20.4S, v21.4S, v31.s[0] +sub v21.4s, v2.4s, v20.4s +add v2.4s, v2.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v5.s[2] +mul v19.4S, v19.4S,v4.s[2] +mla v19.4S, v20.4S, v31.s[0] +sub v20.4s, v18.4s, v19.4s +add v18.4s, v18.4s, v19.4s +trn1 v19.4S, v2.4S, v21.4S +trn2 v15.4S, v2.4S, v21.4S +trn1 v14.4S, v18.4S, v20.4S +trn2 v12.4S, v18.4S, v20.4S +trn2 v18.2D, v19.2D, v14.2D +trn2 v20.2D, v15.2D, v12.2D +trn1 v2.2D, v19.2D, v14.2D +trn1 v21.2D, v15.2D, v12.2D +sqrdmulh v12.4S, v18.4S, v8.4S +mul v18.4S, v18.4S,v9.4S +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v2.4s, v18.4s 
+add v2.4s, v2.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v8.4S +mul v20.4S, v20.4S,v9.4S +mla v20.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v20.4S, v31.s[0] +sub v20.4s, v2.4s, v21.4s +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v18.4S, v16.4S +mul v18.4S, v18.4S,v1.4S +mla v18.4S, v21.4S, v31.s[0] +sub v21.4s, v12.4s, v18.4s +add v12.4s, v12.4s, v18.4s +str q2, [x0, #384] +str q20, [x0, #400] +str q12, [x0, #416] +str q21, [x0, #432] +ldr q21, [x17, #+1024] +ldr q12, [x17, #+1040] +ldr q20, [x17, #+1056] +ldr q2, [x17, #+1072] +ldr q18, [x17, #+1088] +ldr q15, [x17, #+1104] +ldr q14, [x17, #+1120] +ldr q19, [x17, #+1136] +ldr q16, [x0, #480] +ldr q1, [x0, #496] +ldr q7, [x0, #448] +ldr q6, [x0, #464] +sqrdmulh v8.4S, v16.4S, v12.s[0] +mul v16.4S, v16.4S,v21.s[0] +mla v16.4S, v8.4S, v31.s[0] +sub v8.4s, v7.4s, v16.4s +add v7.4s, v7.4s, v16.4s +sqrdmulh v16.4S, v1.4S, v12.s[0] +mul v1.4S, v1.4S,v21.s[0] +mla v1.4S, v16.4S, v31.s[0] +sub v16.4s, v6.4s, v1.4s +add v6.4s, v6.4s, v1.4s +sqrdmulh v1.4S, v6.4S, v12.s[1] +mul v6.4S, v6.4S,v21.s[1] +mla v6.4S, v1.4S, v31.s[0] +sub v1.4s, v7.4s, v6.4s +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v16.4S, v12.s[2] +mul v16.4S, v16.4S,v21.s[2] +mla v16.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v16.4s +add v8.4s, v8.4s, v16.4s +trn1 v16.4S, v7.4S, v1.4S +trn2 v9.4S, v7.4S, v1.4S +trn1 v5.4S, v8.4S, v6.4S +trn2 v4.4S, v8.4S, v6.4S +trn2 v8.2D, v16.2D, v5.2D +trn2 v6.2D, v9.2D, v4.2D +trn1 v7.2D, v16.2D, v5.2D +trn1 v1.2D, v9.2D, v4.2D +sqrdmulh v4.4S, v8.4S, v2.4S +mul v8.4S, v8.4S,v20.4S +mla v8.4S, v4.4S, v31.s[0] +sub v4.4s, v7.4s, v8.4s +add v7.4s, v7.4s, v8.4s +sqrdmulh v8.4S, v6.4S, v2.4S +mul v6.4S, v6.4S,v20.4S +mla v6.4S, v8.4S, v31.s[0] +sub v8.4s, v1.4s, v6.4s +add v1.4s, v1.4s, v6.4s +sqrdmulh v6.4S, v1.4S, v15.4S +mul v1.4S, v1.4S,v18.4S +mla v1.4S, v6.4S, v31.s[0] +sub v6.4s, v7.4s, v1.4s +add v7.4s, v7.4s, v1.4s +sqrdmulh 
v1.4S, v8.4S, v19.4S +mul v8.4S, v8.4S,v14.4S +mla v8.4S, v1.4S, v31.s[0] +sub v1.4s, v4.4s, v8.4s +add v4.4s, v4.4s, v8.4s +str q7, [x0, #448] +str q6, [x0, #464] +str q4, [x0, #480] +str q1, [x0, #496] +ldr q1, [x17, #+1152] +ldr q4, [x17, #+1168] +ldr q6, [x17, #+1184] +ldr q7, [x17, #+1200] +ldr q8, [x17, #+1216] +ldr q9, [x17, #+1232] +ldr q5, [x17, #+1248] +ldr q16, [x17, #+1264] +ldr q19, [x0, #544] +ldr q14, [x0, #560] +ldr q15, [x0, #512] +ldr q18, [x0, #528] +sqrdmulh v2.4S, v19.4S, v4.s[0] +mul v19.4S, v19.4S,v1.s[0] +mla v19.4S, v2.4S, v31.s[0] +sub v2.4s, v15.4s, v19.4s +add v15.4s, v15.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v4.s[0] +mul v14.4S, v14.4S,v1.s[0] +mla v14.4S, v19.4S, v31.s[0] +sub v19.4s, v18.4s, v14.4s +add v18.4s, v18.4s, v14.4s +sqrdmulh v14.4S, v18.4S, v4.s[1] +mul v18.4S, v18.4S,v1.s[1] +mla v18.4S, v14.4S, v31.s[0] +sub v14.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v4.s[2] +mul v19.4S, v19.4S,v1.s[2] +mla v19.4S, v18.4S, v31.s[0] +sub v18.4s, v2.4s, v19.4s +add v2.4s, v2.4s, v19.4s +trn1 v19.4S, v15.4S, v14.4S +trn2 v20.4S, v15.4S, v14.4S +trn1 v12.4S, v2.4S, v18.4S +trn2 v21.4S, v2.4S, v18.4S +trn2 v2.2D, v19.2D, v12.2D +trn2 v18.2D, v20.2D, v21.2D +trn1 v15.2D, v19.2D, v12.2D +trn1 v14.2D, v20.2D, v21.2D +sqrdmulh v21.4S, v2.4S, v7.4S +mul v2.4S, v2.4S,v6.4S +mla v2.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v2.4s +add v15.4s, v15.4s, v2.4s +sqrdmulh v2.4S, v18.4S, v7.4S +mul v18.4S, v18.4S,v6.4S +mla v18.4S, v2.4S, v31.s[0] +sub v2.4s, v14.4s, v18.4s +add v14.4s, v14.4s, v18.4s +sqrdmulh v18.4S, v14.4S, v9.4S +mul v14.4S, v14.4S,v8.4S +mla v14.4S, v18.4S, v31.s[0] +sub v18.4s, v15.4s, v14.4s +add v15.4s, v15.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v16.4S +mul v2.4S, v2.4S,v5.4S +mla v2.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v2.4s +add v21.4s, v21.4s, v2.4s +str q15, [x0, #512] +str q18, [x0, #528] +str q21, [x0, #544] +str q14, [x0, #560] +ldr q14, [x17, #+1280] +ldr q21, [x17, #+1296] +ldr q18, [x17, 
#+1312] +ldr q15, [x17, #+1328] +ldr q2, [x17, #+1344] +ldr q20, [x17, #+1360] +ldr q12, [x17, #+1376] +ldr q19, [x17, #+1392] +ldr q16, [x0, #608] +ldr q5, [x0, #624] +ldr q9, [x0, #576] +ldr q8, [x0, #592] +sqrdmulh v7.4S, v16.4S, v21.s[0] +mul v16.4S, v16.4S,v14.s[0] +mla v16.4S, v7.4S, v31.s[0] +sub v7.4s, v9.4s, v16.4s +add v9.4s, v9.4s, v16.4s +sqrdmulh v16.4S, v5.4S, v21.s[0] +mul v5.4S, v5.4S,v14.s[0] +mla v5.4S, v16.4S, v31.s[0] +sub v16.4s, v8.4s, v5.4s +add v8.4s, v8.4s, v5.4s +sqrdmulh v5.4S, v8.4S, v21.s[1] +mul v8.4S, v8.4S,v14.s[1] +mla v8.4S, v5.4S, v31.s[0] +sub v5.4s, v9.4s, v8.4s +add v9.4s, v9.4s, v8.4s +sqrdmulh v8.4S, v16.4S, v21.s[2] +mul v16.4S, v16.4S,v14.s[2] +mla v16.4S, v8.4S, v31.s[0] +sub v8.4s, v7.4s, v16.4s +add v7.4s, v7.4s, v16.4s +trn1 v16.4S, v9.4S, v5.4S +trn2 v6.4S, v9.4S, v5.4S +trn1 v4.4S, v7.4S, v8.4S +trn2 v1.4S, v7.4S, v8.4S +trn2 v7.2D, v16.2D, v4.2D +trn2 v8.2D, v6.2D, v1.2D +trn1 v9.2D, v16.2D, v4.2D +trn1 v5.2D, v6.2D, v1.2D +sqrdmulh v1.4S, v7.4S, v15.4S +mul v7.4S, v7.4S,v18.4S +mla v7.4S, v1.4S, v31.s[0] +sub v1.4s, v9.4s, v7.4s +add v9.4s, v9.4s, v7.4s +sqrdmulh v7.4S, v8.4S, v15.4S +mul v8.4S, v8.4S,v18.4S +mla v8.4S, v7.4S, v31.s[0] +sub v7.4s, v5.4s, v8.4s +add v5.4s, v5.4s, v8.4s +sqrdmulh v8.4S, v5.4S, v20.4S +mul v5.4S, v5.4S,v2.4S +mla v5.4S, v8.4S, v31.s[0] +sub v8.4s, v9.4s, v5.4s +add v9.4s, v9.4s, v5.4s +sqrdmulh v5.4S, v7.4S, v19.4S +mul v7.4S, v7.4S,v12.4S +mla v7.4S, v5.4S, v31.s[0] +sub v5.4s, v1.4s, v7.4s +add v1.4s, v1.4s, v7.4s +str q9, [x0, #576] +str q8, [x0, #592] +str q1, [x0, #608] +str q5, [x0, #624] +ldr q5, [x17, #+1408] +ldr q1, [x17, #+1424] +ldr q8, [x17, #+1440] +ldr q9, [x17, #+1456] +ldr q7, [x17, #+1472] +ldr q6, [x17, #+1488] +ldr q4, [x17, #+1504] +ldr q16, [x17, #+1520] +ldr q19, [x0, #672] +ldr q12, [x0, #688] +ldr q20, [x0, #640] +ldr q2, [x0, #656] +sqrdmulh v15.4S, v19.4S, v1.s[0] +mul v19.4S, v19.4S,v5.s[0] +mla v19.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v19.4s +add 
v20.4s, v20.4s, v19.4s +sqrdmulh v19.4S, v12.4S, v1.s[0] +mul v12.4S, v12.4S,v5.s[0] +mla v12.4S, v19.4S, v31.s[0] +sub v19.4s, v2.4s, v12.4s +add v2.4s, v2.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v1.s[1] +mul v2.4S, v2.4S,v5.s[1] +mla v2.4S, v12.4S, v31.s[0] +sub v12.4s, v20.4s, v2.4s +add v20.4s, v20.4s, v2.4s +sqrdmulh v2.4S, v19.4S, v1.s[2] +mul v19.4S, v19.4S,v5.s[2] +mla v19.4S, v2.4S, v31.s[0] +sub v2.4s, v15.4s, v19.4s +add v15.4s, v15.4s, v19.4s +trn1 v19.4S, v20.4S, v12.4S +trn2 v18.4S, v20.4S, v12.4S +trn1 v21.4S, v15.4S, v2.4S +trn2 v14.4S, v15.4S, v2.4S +trn2 v15.2D, v19.2D, v21.2D +trn2 v2.2D, v18.2D, v14.2D +trn1 v20.2D, v19.2D, v21.2D +trn1 v12.2D, v18.2D, v14.2D +sqrdmulh v14.4S, v15.4S, v9.4S +mul v15.4S, v15.4S,v8.4S +mla v15.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v15.4s +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v9.4S +mul v2.4S, v2.4S,v8.4S +mla v2.4S, v15.4S, v31.s[0] +sub v15.4s, v12.4s, v2.4s +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v12.4S, v6.4S +mul v12.4S, v12.4S,v7.4S +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +sqrdmulh v12.4S, v15.4S, v16.4S +mul v15.4S, v15.4S,v4.4S +mla v15.4S, v12.4S, v31.s[0] +sub v12.4s, v14.4s, v15.4s +add v14.4s, v14.4s, v15.4s +str q20, [x0, #640] +str q2, [x0, #656] +str q14, [x0, #672] +str q12, [x0, #688] +ldr q12, [x17, #+1536] +ldr q14, [x17, #+1552] +ldr q2, [x17, #+1568] +ldr q20, [x17, #+1584] +ldr q15, [x17, #+1600] +ldr q18, [x17, #+1616] +ldr q21, [x17, #+1632] +ldr q19, [x17, #+1648] +ldr q16, [x0, #736] +ldr q4, [x0, #752] +ldr q6, [x0, #704] +ldr q7, [x0, #720] +sqrdmulh v9.4S, v16.4S, v14.s[0] +mul v16.4S, v16.4S,v12.s[0] +mla v16.4S, v9.4S, v31.s[0] +sub v9.4s, v6.4s, v16.4s +add v6.4s, v6.4s, v16.4s +sqrdmulh v16.4S, v4.4S, v14.s[0] +mul v4.4S, v4.4S,v12.s[0] +mla v4.4S, v16.4S, v31.s[0] +sub v16.4s, v7.4s, v4.4s +add v7.4s, v7.4s, v4.4s +sqrdmulh v4.4S, v7.4S, v14.s[1] +mul v7.4S, v7.4S,v12.s[1] +mla v7.4S, v4.4S, v31.s[0] +sub v4.4s, v6.4s, 
v7.4s +add v6.4s, v6.4s, v7.4s +sqrdmulh v7.4S, v16.4S, v14.s[2] +mul v16.4S, v16.4S,v12.s[2] +mla v16.4S, v7.4S, v31.s[0] +sub v7.4s, v9.4s, v16.4s +add v9.4s, v9.4s, v16.4s +trn1 v16.4S, v6.4S, v4.4S +trn2 v8.4S, v6.4S, v4.4S +trn1 v1.4S, v9.4S, v7.4S +trn2 v5.4S, v9.4S, v7.4S +trn2 v9.2D, v16.2D, v1.2D +trn2 v7.2D, v8.2D, v5.2D +trn1 v6.2D, v16.2D, v1.2D +trn1 v4.2D, v8.2D, v5.2D +sqrdmulh v5.4S, v9.4S, v20.4S +mul v9.4S, v9.4S,v2.4S +mla v9.4S, v5.4S, v31.s[0] +sub v5.4s, v6.4s, v9.4s +add v6.4s, v6.4s, v9.4s +sqrdmulh v9.4S, v7.4S, v20.4S +mul v7.4S, v7.4S,v2.4S +mla v7.4S, v9.4S, v31.s[0] +sub v9.4s, v4.4s, v7.4s +add v4.4s, v4.4s, v7.4s +sqrdmulh v7.4S, v4.4S, v18.4S +mul v4.4S, v4.4S,v15.4S +mla v4.4S, v7.4S, v31.s[0] +sub v7.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +sqrdmulh v4.4S, v9.4S, v19.4S +mul v9.4S, v9.4S,v21.4S +mla v9.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v9.4s +add v5.4s, v5.4s, v9.4s +str q6, [x0, #704] +str q7, [x0, #720] +str q5, [x0, #736] +str q4, [x0, #752] +ldr q4, [x17, #+1664] +ldr q5, [x17, #+1680] +ldr q7, [x17, #+1696] +ldr q6, [x17, #+1712] +ldr q9, [x17, #+1728] +ldr q8, [x17, #+1744] +ldr q1, [x17, #+1760] +ldr q16, [x17, #+1776] +ldr q19, [x0, #800] +ldr q21, [x0, #816] +ldr q18, [x0, #768] +ldr q15, [x0, #784] +sqrdmulh v20.4S, v19.4S, v5.s[0] +mul v19.4S, v19.4S,v4.s[0] +mla v19.4S, v20.4S, v31.s[0] +sub v20.4s, v18.4s, v19.4s +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v21.4S, v5.s[0] +mul v21.4S, v21.4S,v4.s[0] +mla v21.4S, v19.4S, v31.s[0] +sub v19.4s, v15.4s, v21.4s +add v15.4s, v15.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v5.s[1] +mul v15.4S, v15.4S,v4.s[1] +mla v15.4S, v21.4S, v31.s[0] +sub v21.4s, v18.4s, v15.4s +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v19.4S, v5.s[2] +mul v19.4S, v19.4S,v4.s[2] +mla v19.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +trn1 v19.4S, v18.4S, v21.4S +trn2 v2.4S, v18.4S, v21.4S +trn1 v14.4S, v20.4S, v15.4S +trn2 v12.4S, v20.4S, v15.4S +trn2 v20.2D, v19.2D, 
v14.2D +trn2 v15.2D, v2.2D, v12.2D +trn1 v18.2D, v19.2D, v14.2D +trn1 v21.2D, v2.2D, v12.2D +sqrdmulh v12.4S, v20.4S, v6.4S +mul v20.4S, v20.4S,v7.4S +mla v20.4S, v12.4S, v31.s[0] +sub v12.4s, v18.4s, v20.4s +add v18.4s, v18.4s, v20.4s +sqrdmulh v20.4S, v15.4S, v6.4S +mul v15.4S, v15.4S,v7.4S +mla v15.4S, v20.4S, v31.s[0] +sub v20.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +sqrdmulh v15.4S, v21.4S, v8.4S +mul v21.4S, v21.4S,v9.4S +mla v21.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v20.4S, v16.4S +mul v20.4S, v20.4S,v1.4S +mla v20.4S, v21.4S, v31.s[0] +sub v21.4s, v12.4s, v20.4s +add v12.4s, v12.4s, v20.4s +str q18, [x0, #768] +str q15, [x0, #784] +str q12, [x0, #800] +str q21, [x0, #816] +ldr q21, [x17, #+1792] +ldr q12, [x17, #+1808] +ldr q15, [x17, #+1824] +ldr q18, [x17, #+1840] +ldr q20, [x17, #+1856] +ldr q2, [x17, #+1872] +ldr q14, [x17, #+1888] +ldr q19, [x17, #+1904] +ldr q16, [x0, #864] +ldr q1, [x0, #880] +ldr q8, [x0, #832] +ldr q9, [x0, #848] +sqrdmulh v6.4S, v16.4S, v12.s[0] +mul v16.4S, v16.4S,v21.s[0] +mla v16.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v16.4s +add v8.4s, v8.4s, v16.4s +sqrdmulh v16.4S, v1.4S, v12.s[0] +mul v1.4S, v1.4S,v21.s[0] +mla v1.4S, v16.4S, v31.s[0] +sub v16.4s, v9.4s, v1.4s +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v9.4S, v12.s[1] +mul v9.4S, v9.4S,v21.s[1] +mla v9.4S, v1.4S, v31.s[0] +sub v1.4s, v8.4s, v9.4s +add v8.4s, v8.4s, v9.4s +sqrdmulh v9.4S, v16.4S, v12.s[2] +mul v16.4S, v16.4S,v21.s[2] +mla v16.4S, v9.4S, v31.s[0] +sub v9.4s, v6.4s, v16.4s +add v6.4s, v6.4s, v16.4s +trn1 v16.4S, v8.4S, v1.4S +trn2 v7.4S, v8.4S, v1.4S +trn1 v5.4S, v6.4S, v9.4S +trn2 v4.4S, v6.4S, v9.4S +trn2 v6.2D, v16.2D, v5.2D +trn2 v9.2D, v7.2D, v4.2D +trn1 v8.2D, v16.2D, v5.2D +trn1 v1.2D, v7.2D, v4.2D +sqrdmulh v4.4S, v6.4S, v18.4S +mul v6.4S, v6.4S,v15.4S +mla v6.4S, v4.4S, v31.s[0] +sub v4.4s, v8.4s, v6.4s +add v8.4s, v8.4s, v6.4s +sqrdmulh v6.4S, v9.4S, v18.4S +mul v9.4S, v9.4S,v15.4S +mla 
v9.4S, v6.4S, v31.s[0] +sub v6.4s, v1.4s, v9.4s +add v1.4s, v1.4s, v9.4s +sqrdmulh v9.4S, v1.4S, v2.4S +mul v1.4S, v1.4S,v20.4S +mla v1.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +sqrdmulh v1.4S, v6.4S, v19.4S +mul v6.4S, v6.4S,v14.4S +mla v6.4S, v1.4S, v31.s[0] +sub v1.4s, v4.4s, v6.4s +add v4.4s, v4.4s, v6.4s +str q8, [x0, #832] +str q9, [x0, #848] +str q4, [x0, #864] +str q1, [x0, #880] +ldr q1, [x17, #+1920] +ldr q4, [x17, #+1936] +ldr q9, [x17, #+1952] +ldr q8, [x17, #+1968] +ldr q6, [x17, #+1984] +ldr q7, [x17, #+2000] +ldr q5, [x17, #+2016] +ldr q16, [x17, #+2032] +ldr q19, [x0, #928] +ldr q14, [x0, #944] +ldr q2, [x0, #896] +ldr q20, [x0, #912] +sqrdmulh v18.4S, v19.4S, v4.s[0] +mul v19.4S, v19.4S,v1.s[0] +mla v19.4S, v18.4S, v31.s[0] +sub v18.4s, v2.4s, v19.4s +add v2.4s, v2.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v4.s[0] +mul v14.4S, v14.4S,v1.s[0] +mla v14.4S, v19.4S, v31.s[0] +sub v19.4s, v20.4s, v14.4s +add v20.4s, v20.4s, v14.4s +sqrdmulh v14.4S, v20.4S, v4.s[1] +mul v20.4S, v20.4S,v1.s[1] +mla v20.4S, v14.4S, v31.s[0] +sub v14.4s, v2.4s, v20.4s +add v2.4s, v2.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v4.s[2] +mul v19.4S, v19.4S,v1.s[2] +mla v19.4S, v20.4S, v31.s[0] +sub v20.4s, v18.4s, v19.4s +add v18.4s, v18.4s, v19.4s +trn1 v19.4S, v2.4S, v14.4S +trn2 v15.4S, v2.4S, v14.4S +trn1 v12.4S, v18.4S, v20.4S +trn2 v21.4S, v18.4S, v20.4S +trn2 v18.2D, v19.2D, v12.2D +trn2 v20.2D, v15.2D, v21.2D +trn1 v2.2D, v19.2D, v12.2D +trn1 v14.2D, v15.2D, v21.2D +sqrdmulh v21.4S, v18.4S, v8.4S +mul v18.4S, v18.4S,v9.4S +mla v18.4S, v21.4S, v31.s[0] +sub v21.4s, v2.4s, v18.4s +add v2.4s, v2.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v8.4S +mul v20.4S, v20.4S,v9.4S +mla v20.4S, v18.4S, v31.s[0] +sub v18.4s, v14.4s, v20.4s +add v14.4s, v14.4s, v20.4s +sqrdmulh v20.4S, v14.4S, v7.4S +mul v14.4S, v14.4S,v6.4S +mla v14.4S, v20.4S, v31.s[0] +sub v20.4s, v2.4s, v14.4s +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v18.4S, v16.4S +mul v18.4S, v18.4S,v5.4S +mla 
v18.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v18.4s +add v21.4s, v21.4s, v18.4s +str q2, [x0, #896] +str q20, [x0, #912] +str q21, [x0, #928] +str q14, [x0, #944] +ldr q14, [x17, #+2048] +ldr q21, [x17, #+2064] +ldr q20, [x17, #+2080] +ldr q2, [x17, #+2096] +ldr q18, [x17, #+2112] +ldr q15, [x17, #+2128] +ldr q12, [x17, #+2144] +ldr q19, [x17, #+2160] +ldr q16, [x0, #992] +ldr q5, [x0, #1008] +ldr q7, [x0, #960] +ldr q6, [x0, #976] +sqrdmulh v8.4S, v16.4S, v21.s[0] +mul v16.4S, v16.4S,v14.s[0] +mla v16.4S, v8.4S, v31.s[0] +sub v8.4s, v7.4s, v16.4s +add v7.4s, v7.4s, v16.4s +sqrdmulh v16.4S, v5.4S, v21.s[0] +mul v5.4S, v5.4S,v14.s[0] +mla v5.4S, v16.4S, v31.s[0] +sub v16.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v6.4S, v21.s[1] +mul v6.4S, v6.4S,v14.s[1] +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v7.4s, v6.4s +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v16.4S, v21.s[2] +mul v16.4S, v16.4S,v14.s[2] +mla v16.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v16.4s +add v8.4s, v8.4s, v16.4s +trn1 v16.4S, v7.4S, v5.4S +trn2 v9.4S, v7.4S, v5.4S +trn1 v4.4S, v8.4S, v6.4S +trn2 v1.4S, v8.4S, v6.4S +trn2 v8.2D, v16.2D, v4.2D +trn2 v6.2D, v9.2D, v1.2D +trn1 v7.2D, v16.2D, v4.2D +trn1 v5.2D, v9.2D, v1.2D +sqrdmulh v1.4S, v8.4S, v2.4S +mul v8.4S, v8.4S,v20.4S +mla v8.4S, v1.4S, v31.s[0] +sub v1.4s, v7.4s, v8.4s +add v7.4s, v7.4s, v8.4s +sqrdmulh v8.4S, v6.4S, v2.4S +mul v6.4S, v6.4S,v20.4S +mla v6.4S, v8.4S, v31.s[0] +sub v8.4s, v5.4s, v6.4s +add v5.4s, v5.4s, v6.4s +sqrdmulh v6.4S, v5.4S, v15.4S +mul v5.4S, v5.4S,v18.4S +mla v5.4S, v6.4S, v31.s[0] +sub v6.4s, v7.4s, v5.4s +add v7.4s, v7.4s, v5.4s +sqrdmulh v5.4S, v8.4S, v19.4S +mul v8.4S, v8.4S,v12.4S +mla v8.4S, v5.4S, v31.s[0] +sub v5.4s, v1.4s, v8.4s +add v1.4s, v1.4s, v8.4s +str q7, [x0, #960] +str q6, [x0, #976] +str q1, [x0, #992] +str q5, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// 
Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_0.s b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_0.s new file mode 100644 index 0000000..d3538e3 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_0.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 
1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 
6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 
+.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, 
block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 
31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 
26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 
+.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // 
Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 
28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 
608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_3_0 +.global _ntt_u32_full_neon_asm_var_4_4_3_0 +ntt_u32_full_neon_asm_var_4_4_3_0: +_ntt_u32_full_neon_asm_var_4_4_3_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +sqrdmulh v2.4S, v22.4S, v29.s[0] +ldr q1, [x0, #544] +mul v22.4S, v22.4S,v30.s[0] +ldr q0, [x0, #608] +sqrdmulh v15.4S, v21.4S, v29.s[0] +ldr q14, [x0, #672] +mul v21.4S, v21.4S,v30.s[0] +ldr q13, [x0, #736] +mla v22.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q12, [x0, #32] +sub v11.4s, v18.4s, v22.4s +mla v21.4S, v15.4S, v31.s[0] +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q15, [x0, #96] +sub v10.4s, v17.4s, v21.4s +mla v20.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v29.s[0] +ldr q2, 
[x0, #160] +mul v1.4S, v1.4S,v30.s[0] +sub v9.4s, v16.4s, v20.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v29.s[0] +ldr q22, [x0, #224] +mul v0.4S, v0.4S,v30.s[0] +sub v8.4s, v3.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v12.4s, v1.4s +mla v0.4S, v20.4S, v31.s[0] +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +sub v20.4s, v15.4s, v0.4s +mla v14.4S, v19.4S, v31.s[0] +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v19.4s, v2.4s, v14.4s +mla v13.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v1.4s, v22.4s, v13.4s +mla v16.4S, v0.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v0.4s, v2.4s, v16.4s +mla v3.4S, v14.4S, v31.s[0] +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v22.4s, v3.4s +mla v18.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v13.4s, v12.4s, v18.4s +mla v17.4S, v16.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v16.4s, v15.4s, v17.4s +mla v9.4S, v3.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v3.4s, v19.4s, v9.4s +mla v8.4S, v18.4S, v31.s[0] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v8.4s +mla v11.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v17.4s, v21.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +sub v9.4s, v20.4s, v10.4s +mla v2.4S, v8.4S, 
v31.s[0] +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v8.4s, v12.4s, v2.4s +mla v22.4S, v11.4S, v31.s[0] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v27.s[1] +mul v14.4S, v14.4S,v28.s[1] +sub v11.4s, v15.4s, v22.4s +mla v0.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v28.s[2] +sub v10.4s, v13.4s, v0.4s +mla v14.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v2.4s, v16.4s, v14.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +sub v22.4s, v21.4s, v19.4s +mla v1.4S, v0.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v0.4s, v20.4s, v1.4s +mla v3.4S, v14.4S, v31.s[0] +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +sub v14.4s, v17.4s, v3.4s +mla v18.4S, v19.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v25.s[1] +mul v11.4S, v11.4S,v26.s[1] +sub v19.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v1.4s, v12.4s, v15.4s +mla v11.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v25.s[3] +mul v2.4S, v2.4S,v26.s[3] +sub v3.4s, v8.4s, v11.4s +mla v16.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v11.4s +str q12, [x0, #32] +sqrdmulh v12.4S, v20.4S, v23.s[0] +str q1, [x0, #96] +mul v20.4S, v20.4S,v24.s[0] +ldr q1, [x0, #816] +sub v11.4s, v13.4s, v16.4s +ldr q18, [x0, #880] +mla v2.4S, v15.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +str q8, [x0, #160] +sqrdmulh v8.4S, v0.4S, v23.s[1] +str q3, [x0, #224] +mul v0.4S, v0.4S,v24.s[1] +ldr q3, [x0, #944] +sub v16.4s, v10.4s, v2.4s +ldr q15, [x0, #1008] +mla v20.4S, v12.4S, v31.s[0] +add v10.4s, v10.4s, v2.4s +str q13, [x0, #288] +sqrdmulh v13.4S, v9.4S, 
v23.s[2] +str q11, [x0, #352] +mul v9.4S, v9.4S,v24.s[2] +ldr q11, [x0, #304] +sub v2.4s, v21.4s, v20.4s +ldr q12, [x0, #368] +mla v0.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v20.4s +str q10, [x0, #416] +sqrdmulh v10.4S, v19.4S, v23.s[3] +str q16, [x0, #480] +mul v19.4S, v19.4S,v24.s[3] +ldr q16, [x0, #432] +sub v20.4s, v22.4s, v0.4s +ldr q8, [x0, #496] +mla v9.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +str q21, [x0, #544] +sqrdmulh v21.4S, v1.4S, v29.s[0] +str q2, [x0, #608] +ldr q2, [x0, #560] +mul v1.4S, v1.4S,v30.s[0] +ldr q0, [x0, #624] +sub v13.4s, v17.4s, v9.4s +mla v19.4S, v10.4S, v31.s[0] +add v17.4s, v17.4s, v9.4s +str q22, [x0, #672] +sqrdmulh v22.4S, v18.4S, v29.s[0] +str q20, [x0, #736] +ldr q20, [x0, #688] +mul v18.4S, v18.4S,v30.s[0] +ldr q9, [x0, #752] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +str q17, [x0, #800] +sqrdmulh v17.4S, v3.4S, v29.s[0] +str q13, [x0, #864] +mul v3.4S, v3.4S,v30.s[0] +ldr q13, [x0, #48] +sub v19.4s, v11.4s, v1.4s +mla v18.4S, v22.4S, v31.s[0] +add v11.4s, v11.4s, v1.4s +str q14, [x0, #928] +sqrdmulh v14.4S, v15.4S, v29.s[0] +str q10, [x0, #992] +mul v15.4S, v15.4S,v30.s[0] +ldr q10, [x0, #112] +sub v1.4s, v12.4s, v18.4s +mla v3.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v29.s[0] +ldr q17, [x0, #176] +mul v2.4S, v2.4S,v30.s[0] +sub v22.4s, v16.4s, v3.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v29.s[0] +ldr q14, [x0, #240] +mul v0.4S, v0.4S,v30.s[0] +sub v21.4s, v8.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v18.4s, v13.4s, v2.4s +mla v0.4S, v3.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +sub v3.4s, v10.4s, v0.4s +mla v20.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v15.4s, v17.4s, v20.4s 
+mla v9.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v2.4s, v14.4s, v9.4s +mla v16.4S, v0.4S, v31.s[0] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v29.s[1] +mul v11.4S, v11.4S,v30.s[1] +sub v0.4s, v17.4s, v16.4s +mla v8.4S, v20.4S, v31.s[0] +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v29.s[1] +mul v12.4S, v12.4S,v30.s[1] +sub v20.4s, v14.4s, v8.4s +mla v11.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v9.4s, v13.4s, v11.4s +mla v12.4S, v16.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v16.4s, v10.4s, v12.4s +mla v22.4S, v8.4S, v31.s[0] +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v8.4s, v15.4s, v22.4s +mla v21.4S, v11.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +sub v11.4s, v2.4s, v21.4s +mla v19.4S, v12.4S, v31.s[0] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +sub v12.4s, v18.4s, v19.4s +mla v1.4S, v22.4S, v31.s[0] +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v22.4s, v3.4s, v1.4s +mla v17.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v21.4s, v13.4s, v17.4s +mla v14.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +sub v19.4s, v10.4s, v14.4s +mla v0.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v27.s[2] +mul v15.4S, v15.4S,v28.s[2] +sub v1.4s, v9.4s, v0.4s +mla v20.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v17.4s, v16.4s, v20.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v27.s[3] +mul 
v8.4S, v8.4S,v28.s[3] +sub v14.4s, v18.4s, v15.4s +mla v2.4S, v0.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[3] +mul v11.4S, v11.4S,v28.s[3] +sub v0.4s, v3.4s, v2.4s +mla v8.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +sub v20.4s, v12.4s, v8.4s +mla v11.4S, v15.4S, v31.s[0] +add v12.4s, v12.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v15.4s, v22.4s, v11.4s +mla v10.4S, v2.4S, v31.s[0] +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v2.4s, v13.4s, v10.4s +mla v19.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v25.s[3] +mul v17.4S, v17.4S,v26.s[3] +sub v8.4s, v21.4s, v19.4s +mla v16.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +str q13, [x0, #48] +sqrdmulh v13.4S, v3.4S, v23.s[0] +str q2, [x0, #112] +mul v3.4S, v3.4S,v24.s[0] +ldr q2, [x0, #768] +sub v19.4s, v9.4s, v16.4s +ldr q11, [x0, #832] +mla v17.4S, v10.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +str q21, [x0, #176] +sqrdmulh v21.4S, v0.4S, v23.s[1] +str q8, [x0, #240] +mul v0.4S, v0.4S,v24.s[1] +ldr q8, [x0, #896] +sub v16.4s, v1.4s, v17.4s +ldr q10, [x0, #960] +mla v3.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +str q9, [x0, #304] +sqrdmulh v9.4S, v22.4S, v23.s[2] +str q19, [x0, #368] +mul v22.4S, v22.4S,v24.s[2] +ldr q19, [x0, #256] +sub v17.4s, v18.4s, v3.4s +ldr q13, [x0, #320] +mla v0.4S, v21.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +str q1, [x0, #432] +sqrdmulh v1.4S, v15.4S, v23.s[3] +str q16, [x0, #496] +mul v15.4S, v15.4S,v24.s[3] +ldr q16, [x0, #384] +sub v3.4s, v14.4s, v0.4s +ldr q21, [x0, #448] +mla v22.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +str q18, [x0, #560] +sqrdmulh v18.4S, v2.4S, v29.s[0] +str q17, [x0, #624] +ldr q17, [x0, #512] +mul v2.4S, v2.4S,v30.s[0] +ldr q0, [x0, #576] +sub v9.4s, v12.4s, v22.4s +mla v15.4S, v1.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s +str q14, [x0, #688] 
+sqrdmulh v14.4S, v11.4S, v29.s[0] +str q3, [x0, #752] +ldr q3, [x0, #640] +mul v11.4S, v11.4S,v30.s[0] +ldr q22, [x0, #704] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +str q12, [x0, #816] +sqrdmulh v12.4S, v8.4S, v29.s[0] +str q9, [x0, #880] +mul v8.4S, v8.4S,v30.s[0] +ldr q9, [x0, #0] +sub v15.4s, v19.4s, v2.4s +mla v11.4S, v14.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +str q20, [x0, #944] +sqrdmulh v20.4S, v10.4S, v29.s[0] +str q1, [x0, #1008] +mul v10.4S, v10.4S,v30.s[0] +ldr q1, [x0, #64] +sub v2.4s, v13.4s, v11.4s +mla v8.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v29.s[0] +ldr q12, [x0, #128] +mul v17.4S, v17.4S,v30.s[0] +sub v14.4s, v16.4s, v8.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v29.s[0] +ldr q20, [x0, #192] +mul v0.4S, v0.4S,v30.s[0] +sub v18.4s, v21.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +sub v11.4s, v9.4s, v17.4s +mla v0.4S, v8.4S, v31.s[0] +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v8.4s, v1.4s, v0.4s +mla v3.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v10.4s, v12.4s, v3.4s +mla v22.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v17.4s, v20.4s, v22.4s +mla v16.4S, v0.4S, v31.s[0] +add v20.4s, v20.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +sub v0.4s, v12.4s, v16.4s +mla v21.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v3.4s, v20.4s, v21.4s +mla v19.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v22.4s, v9.4s, v19.4s +mla v13.4S, v16.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, 
v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v16.4s, v1.4s, v13.4s +mla v14.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v21.4s, v10.4s, v14.4s +mla v18.4S, v19.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v19.4s, v17.4s, v18.4s +mla v15.4S, v13.4S, v31.s[0] +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v13.4s, v11.4s, v15.4s +mla v2.4S, v14.4S, v31.s[0] +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v27.s[0] +mul v20.4S, v20.4S,v28.s[0] +sub v14.4s, v8.4s, v2.4s +mla v12.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v18.4s, v9.4s, v12.4s +mla v20.4S, v15.4S, v31.s[0] +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +sub v15.4s, v1.4s, v20.4s +mla v0.4S, v2.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +sub v2.4s, v22.4s, v0.4s +mla v3.4S, v12.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v12.4s, v16.4s, v3.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v11.4s, v10.4s +mla v17.4S, v0.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v27.s[3] +mul v19.4S, v19.4S,v28.s[3] +sub v0.4s, v8.4s, v17.4s +mla v21.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v17.4s +sqrdmulh v17.4S, v1.4S, v25.s[0] +mul v1.4S, v1.4S,v26.s[0] +sub v3.4s, v13.4s, v21.4s +mla v19.4S, v10.4S, v31.s[0] +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v25.s[1] +mul v15.4S, v15.4S,v26.s[1] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v17.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v17.4s, v9.4s, v1.4s +mla v15.4S, v21.4S, 
v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +sub v21.4s, v18.4s, v15.4s +mla v16.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +str q9, [x0, #0] +sqrdmulh v9.4S, v8.4S, v23.s[0] +str q17, [x0, #64] +mul v8.4S, v8.4S,v24.s[0] +ldr q17, [x0, #784] +sub v15.4s, v22.4s, v16.4s +ldr q19, [x0, #848] +mla v12.4S, v1.4S, v31.s[0] +add v22.4s, v22.4s, v16.4s +str q18, [x0, #128] +sqrdmulh v18.4S, v0.4S, v23.s[1] +str q21, [x0, #192] +mul v0.4S, v0.4S,v24.s[1] +ldr q21, [x0, #912] +sub v16.4s, v2.4s, v12.4s +ldr q1, [x0, #976] +mla v8.4S, v9.4S, v31.s[0] +add v2.4s, v2.4s, v12.4s +str q22, [x0, #256] +sqrdmulh v22.4S, v14.4S, v23.s[2] +str q15, [x0, #320] +mul v14.4S, v14.4S,v24.s[2] +ldr q15, [x0, #272] +sub v12.4s, v11.4s, v8.4s +ldr q9, [x0, #336] +mla v0.4S, v18.4S, v31.s[0] +add v11.4s, v11.4s, v8.4s +str q2, [x0, #384] +sqrdmulh v2.4S, v10.4S, v23.s[3] +str q16, [x0, #448] +mul v10.4S, v10.4S,v24.s[3] +ldr q16, [x0, #400] +sub v8.4s, v20.4s, v0.4s +ldr q18, [x0, #464] +mla v14.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v0.4s +str q11, [x0, #512] +sqrdmulh v11.4S, v17.4S, v29.s[0] +str q12, [x0, #576] +ldr q12, [x0, #528] +mul v17.4S, v17.4S,v30.s[0] +ldr q0, [x0, #592] +sub v22.4s, v13.4s, v14.4s +mla v10.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v14.4s +str q20, [x0, #640] +sqrdmulh v20.4S, v19.4S, v29.s[0] +str q8, [x0, #704] +ldr q8, [x0, #656] +mul v19.4S, v19.4S,v30.s[0] +ldr q14, [x0, #720] +sub v2.4s, v3.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v3.4s, v3.4s, v10.4s +str q13, [x0, #768] +sqrdmulh v13.4S, v21.4S, v29.s[0] +str q22, [x0, #832] +mul v21.4S, v21.4S,v30.s[0] +ldr q22, [x0, #16] +sub v10.4s, v15.4s, v17.4s +mla v19.4S, v20.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +str q3, [x0, #896] +sqrdmulh v3.4S, v1.4S, v29.s[0] +str q2, [x0, #960] +mul v1.4S, v1.4S,v30.s[0] +ldr q2, [x0, #80] +sub v17.4s, v9.4s, v19.4s +mla v21.4S, v13.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v12.4S, 
v29.s[0] +ldr q13, [x0, #144] +mul v12.4S, v12.4S,v30.s[0] +sub v20.4s, v16.4s, v21.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v29.s[0] +ldr q3, [x0, #208] +mul v0.4S, v0.4S,v30.s[0] +sub v11.4s, v18.4s, v1.4s +mla v12.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v19.4s, v22.4s, v12.4s +mla v0.4S, v21.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v2.4s, v0.4s +mla v8.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v1.4s, v13.4s, v8.4s +mla v14.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v12.4s, v3.4s, v14.4s +mla v16.4S, v0.4S, v31.s[0] +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +sub v0.4s, v13.4s, v16.4s +mla v18.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v9.4S, v29.s[1] +mul v9.4S, v9.4S,v30.s[1] +sub v8.4s, v3.4s, v18.4s +mla v15.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v14.4s, v22.4s, v15.4s +mla v9.4S, v16.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v16.4s, v2.4s, v9.4s +mla v20.4S, v18.4S, v31.s[0] +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v20.4s +mla v11.4S, v15.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v15.4s, v12.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +sub v9.4s, v19.4s, v10.4s +mla v17.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v27.s[0] +mul v3.4S, v3.4S,v28.s[0] +sub v20.4s, v21.4s, 
v17.4s +mla v13.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v11.4s, v22.4s, v13.4s +mla v3.4S, v10.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v10.4s, v2.4s, v3.4s +mla v0.4S, v17.4S, v31.s[0] +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v17.4s, v14.4s, v0.4s +mla v8.4S, v13.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +sub v13.4s, v16.4s, v8.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v3.4s, v19.4s, v1.4s +mla v12.4S, v0.4S, v31.s[0] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v0.4s, v21.4s, v12.4s +mla v18.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v25.s[1] +mul v10.4S, v10.4S,v26.s[1] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v12.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v12.4s, v22.4s, v2.4s +mla v10.4S, v18.4S, v31.s[0] +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v13.4S, v25.s[3] +mul v13.4S, v13.4S,v26.s[3] +sub v18.4s, v11.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +str q22, [x0, #16] +sqrdmulh v22.4S, v21.4S, v23.s[0] +str q12, [x0, #80] +mul v21.4S, v21.4S,v24.s[0] +sub v12.4s, v14.4s, v16.4s +mla v13.4S, v2.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +str q11, [x0, #144] +sqrdmulh v11.4S, v0.4S, v23.s[1] +str q18, [x0, #208] +mul v0.4S, v0.4S,v24.s[1] +sub v18.4s, v17.4s, v13.4s +mla v21.4S, v22.4S, v31.s[0] +add v17.4s, v17.4s, v13.4s +str q14, [x0, #272] +sqrdmulh v14.4S, v20.4S, v23.s[2] +str q12, [x0, #336] +mul v20.4S, 
v20.4S,v24.s[2] +sub v12.4s, v19.4s, v21.4s +mla v0.4S, v11.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +str q17, [x0, #400] +sqrdmulh v17.4S, v1.4S, v23.s[3] +str q18, [x0, #464] +mul v1.4S, v1.4S,v24.s[3] +sub v18.4s, v3.4s, v0.4s +mla v20.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v0.4s +str q19, [x0, #528] +str q12, [x0, #592] +sub v12.4s, v9.4s, v20.4s +mla v1.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v20.4s +str q3, [x0, #656] +str q18, [x0, #720] +sub v18.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +str q9, [x0, #784] +str q12, [x0, #848] +str q8, [x0, #912] +str q18, [x0, #976] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q6, [x17, #+160] +ldr q7, [x17, #+176] +ldr q15, [x17, #+192] +ldr q10, [x17, #+208] +ldr q2, [x17, #+224] +ldr q16, [x17, #+240] +ldr q22, [x0, #32] +ldr q13, [x0, #48] +ldr q11, [x0, #0] +ldr q21, [x0, #16] +sqrdmulh v14.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v5.s[2] +mul v22.4S, v22.4S,v4.s[2] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +trn1 v22.4S, v11.4S, v13.4S +trn2 v0.4S, v11.4S, v13.4S +trn1 v19.4S, v14.4S, v21.4S +trn2 v17.4S, v14.4S, v21.4S +trn2 v14.2D, v22.2D, v19.2D +trn2 v21.2D, v0.2D, v17.2D +trn1 v11.2D, v22.2D, v19.2D +trn1 v13.2D, v0.2D, v17.2D +sqrdmulh v17.4S, v14.4S, v7.4S +mul v14.4S, v14.4S,v6.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v10.4S +mul v13.4S, 
v13.4S,v15.4S +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v14.4S, v16.4S +mul v14.4S, v14.4S,v2.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +str q11, [x0, #0] +str q21, [x0, #16] +str q17, [x0, #32] +str q13, [x0, #48] +ldr q13, [x17, #+256] +ldr q17, [x17, #+272] +ldr q21, [x17, #+288] +ldr q11, [x17, #+304] +ldr q14, [x17, #+320] +ldr q0, [x17, #+336] +ldr q19, [x17, #+352] +ldr q22, [x17, #+368] +ldr q16, [x0, #96] +ldr q2, [x0, #112] +ldr q10, [x0, #64] +ldr q15, [x0, #80] +sqrdmulh v7.4S, v16.4S, v17.s[0] +mul v16.4S, v16.4S,v13.s[0] +mla v16.4S, v7.4S, v31.s[0] +sub v7.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sqrdmulh v16.4S, v2.4S, v17.s[0] +mul v2.4S, v2.4S,v13.s[0] +mla v2.4S, v16.4S, v31.s[0] +sub v16.4s, v15.4s, v2.4s +add v15.4s, v15.4s, v2.4s +sqrdmulh v2.4S, v15.4S, v17.s[1] +mul v15.4S, v15.4S,v13.s[1] +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v10.4s, v15.4s +add v10.4s, v10.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v17.s[2] +mul v16.4S, v16.4S,v13.s[2] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v7.4s, v16.4s +add v7.4s, v7.4s, v16.4s +trn1 v16.4S, v10.4S, v2.4S +trn2 v6.4S, v10.4S, v2.4S +trn1 v5.4S, v7.4S, v15.4S +trn2 v4.4S, v7.4S, v15.4S +trn2 v7.2D, v16.2D, v5.2D +trn2 v15.2D, v6.2D, v4.2D +trn1 v10.2D, v16.2D, v5.2D +trn1 v2.2D, v6.2D, v4.2D +sqrdmulh v4.4S, v7.4S, v11.4S +mul v7.4S, v7.4S,v21.4S +mla v7.4S, v4.4S, v31.s[0] +sub v4.4s, v10.4s, v7.4s +add v10.4s, v10.4s, v7.4s +sqrdmulh v7.4S, v15.4S, v11.4S +mul v15.4S, v15.4S,v21.4S +mla v15.4S, v7.4S, v31.s[0] +sub v7.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v0.4S +mul v2.4S, v2.4S,v14.4S +mla v2.4S, v15.4S, v31.s[0] +sub v15.4s, v10.4s, v2.4s +add v10.4s, v10.4s, v2.4s +sqrdmulh v2.4S, v7.4S, v22.4S +mul v7.4S, v7.4S,v19.4S +mla v7.4S, v2.4S, v31.s[0] +sub v2.4s, v4.4s, v7.4s +add v4.4s, v4.4s, v7.4s +str q10, [x0, #64] +str q15, [x0, #80] +str q4, [x0, #96] 
+str q2, [x0, #112] +ldr q2, [x17, #+384] +ldr q4, [x17, #+400] +ldr q15, [x17, #+416] +ldr q10, [x17, #+432] +ldr q7, [x17, #+448] +ldr q6, [x17, #+464] +ldr q5, [x17, #+480] +ldr q16, [x17, #+496] +ldr q22, [x0, #160] +ldr q19, [x0, #176] +ldr q0, [x0, #128] +ldr q14, [x0, #144] +sqrdmulh v11.4S, v22.4S, v4.s[0] +mul v22.4S, v22.4S,v2.s[0] +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v0.4s, v22.4s +add v0.4s, v0.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v4.s[0] +mul v19.4S, v19.4S,v2.s[0] +mla v19.4S, v22.4S, v31.s[0] +sub v22.4s, v14.4s, v19.4s +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v4.s[1] +mul v14.4S, v14.4S,v2.s[1] +mla v14.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v14.4s +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v22.4S, v4.s[2] +mul v22.4S, v22.4S,v2.s[2] +mla v22.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +trn1 v22.4S, v0.4S, v19.4S +trn2 v21.4S, v0.4S, v19.4S +trn1 v17.4S, v11.4S, v14.4S +trn2 v13.4S, v11.4S, v14.4S +trn2 v11.2D, v22.2D, v17.2D +trn2 v14.2D, v21.2D, v13.2D +trn1 v0.2D, v22.2D, v17.2D +trn1 v19.2D, v21.2D, v13.2D +sqrdmulh v13.4S, v11.4S, v10.4S +mul v11.4S, v11.4S,v15.4S +mla v11.4S, v13.4S, v31.s[0] +sub v13.4s, v0.4s, v11.4s +add v0.4s, v0.4s, v11.4s +sqrdmulh v11.4S, v14.4S, v10.4S +mul v14.4S, v14.4S,v15.4S +mla v14.4S, v11.4S, v31.s[0] +sub v11.4s, v19.4s, v14.4s +add v19.4s, v19.4s, v14.4s +sqrdmulh v14.4S, v19.4S, v6.4S +mul v19.4S, v19.4S,v7.4S +mla v19.4S, v14.4S, v31.s[0] +sub v14.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v11.4S, v16.4S +mul v11.4S, v11.4S,v5.4S +mla v11.4S, v19.4S, v31.s[0] +sub v19.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +str q0, [x0, #128] +str q14, [x0, #144] +str q13, [x0, #160] +str q19, [x0, #176] +ldr q19, [x17, #+512] +ldr q13, [x17, #+528] +ldr q14, [x17, #+544] +ldr q0, [x17, #+560] +ldr q11, [x17, #+576] +ldr q21, [x17, #+592] +ldr q17, [x17, #+608] +ldr q22, [x17, #+624] +ldr q16, [x0, #224] +ldr q5, [x0, #240] +ldr q6, [x0, #192] 
+ldr q7, [x0, #208] +sqrdmulh v10.4S, v16.4S, v13.s[0] +mul v16.4S, v16.4S,v19.s[0] +mla v16.4S, v10.4S, v31.s[0] +sub v10.4s, v6.4s, v16.4s +add v6.4s, v6.4s, v16.4s +sqrdmulh v16.4S, v5.4S, v13.s[0] +mul v5.4S, v5.4S,v19.s[0] +mla v5.4S, v16.4S, v31.s[0] +sub v16.4s, v7.4s, v5.4s +add v7.4s, v7.4s, v5.4s +sqrdmulh v5.4S, v7.4S, v13.s[1] +mul v7.4S, v7.4S,v19.s[1] +mla v7.4S, v5.4S, v31.s[0] +sub v5.4s, v6.4s, v7.4s +add v6.4s, v6.4s, v7.4s +sqrdmulh v7.4S, v16.4S, v13.s[2] +mul v16.4S, v16.4S,v19.s[2] +mla v16.4S, v7.4S, v31.s[0] +sub v7.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +trn1 v16.4S, v6.4S, v5.4S +trn2 v15.4S, v6.4S, v5.4S +trn1 v4.4S, v10.4S, v7.4S +trn2 v2.4S, v10.4S, v7.4S +trn2 v10.2D, v16.2D, v4.2D +trn2 v7.2D, v15.2D, v2.2D +trn1 v6.2D, v16.2D, v4.2D +trn1 v5.2D, v15.2D, v2.2D +sqrdmulh v2.4S, v10.4S, v0.4S +mul v10.4S, v10.4S,v14.4S +mla v10.4S, v2.4S, v31.s[0] +sub v2.4s, v6.4s, v10.4s +add v6.4s, v6.4s, v10.4s +sqrdmulh v10.4S, v7.4S, v0.4S +mul v7.4S, v7.4S,v14.4S +mla v7.4S, v10.4S, v31.s[0] +sub v10.4s, v5.4s, v7.4s +add v5.4s, v5.4s, v7.4s +sqrdmulh v7.4S, v5.4S, v21.4S +mul v5.4S, v5.4S,v11.4S +mla v5.4S, v7.4S, v31.s[0] +sub v7.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v10.4S, v22.4S +mul v10.4S, v10.4S,v17.4S +mla v10.4S, v5.4S, v31.s[0] +sub v5.4s, v2.4s, v10.4s +add v2.4s, v2.4s, v10.4s +str q6, [x0, #192] +str q7, [x0, #208] +str q2, [x0, #224] +str q5, [x0, #240] +ldr q5, [x17, #+640] +ldr q2, [x17, #+656] +ldr q7, [x17, #+672] +ldr q6, [x17, #+688] +ldr q10, [x17, #+704] +ldr q15, [x17, #+720] +ldr q4, [x17, #+736] +ldr q16, [x17, #+752] +ldr q22, [x0, #288] +ldr q17, [x0, #304] +ldr q21, [x0, #256] +ldr q11, [x0, #272] +sqrdmulh v0.4S, v22.4S, v2.s[0] +mul v22.4S, v22.4S,v5.s[0] +mla v22.4S, v0.4S, v31.s[0] +sub v0.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v2.s[0] +mul v17.4S, v17.4S,v5.s[0] +mla v17.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s 
+sqrdmulh v17.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v5.s[1] +mla v11.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v2.s[2] +mul v22.4S, v22.4S,v5.s[2] +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v0.4s, v22.4s +add v0.4s, v0.4s, v22.4s +trn1 v22.4S, v21.4S, v17.4S +trn2 v14.4S, v21.4S, v17.4S +trn1 v13.4S, v0.4S, v11.4S +trn2 v19.4S, v0.4S, v11.4S +trn2 v0.2D, v22.2D, v13.2D +trn2 v11.2D, v14.2D, v19.2D +trn1 v21.2D, v22.2D, v13.2D +trn1 v17.2D, v14.2D, v19.2D +sqrdmulh v19.4S, v0.4S, v6.4S +mul v0.4S, v0.4S,v7.4S +mla v0.4S, v19.4S, v31.s[0] +sub v19.4s, v21.4s, v0.4s +add v21.4s, v21.4s, v0.4s +sqrdmulh v0.4S, v11.4S, v6.4S +mul v11.4S, v11.4S,v7.4S +mla v11.4S, v0.4S, v31.s[0] +sub v0.4s, v17.4s, v11.4s +add v17.4s, v17.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v15.4S +mul v17.4S, v17.4S,v10.4S +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v16.4S +mul v0.4S, v0.4S,v4.4S +mla v0.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v0.4s +add v19.4s, v19.4s, v0.4s +str q21, [x0, #256] +str q11, [x0, #272] +str q19, [x0, #288] +str q17, [x0, #304] +ldr q17, [x17, #+768] +ldr q19, [x17, #+784] +ldr q11, [x17, #+800] +ldr q21, [x17, #+816] +ldr q0, [x17, #+832] +ldr q14, [x17, #+848] +ldr q13, [x17, #+864] +ldr q22, [x17, #+880] +ldr q16, [x0, #352] +ldr q4, [x0, #368] +ldr q15, [x0, #320] +ldr q10, [x0, #336] +sqrdmulh v6.4S, v16.4S, v19.s[0] +mul v16.4S, v16.4S,v17.s[0] +mla v16.4S, v6.4S, v31.s[0] +sub v6.4s, v15.4s, v16.4s +add v15.4s, v15.4s, v16.4s +sqrdmulh v16.4S, v4.4S, v19.s[0] +mul v4.4S, v4.4S,v17.s[0] +mla v4.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v4.4s +add v10.4s, v10.4s, v4.4s +sqrdmulh v4.4S, v10.4S, v19.s[1] +mul v10.4S, v10.4S,v17.s[1] +mla v10.4S, v4.4S, v31.s[0] +sub v4.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +sqrdmulh v10.4S, v16.4S, v19.s[2] +mul v16.4S, v16.4S,v17.s[2] +mla v16.4S, v10.4S, v31.s[0] +sub v10.4s, v6.4s, v16.4s +add 
v6.4s, v6.4s, v16.4s +trn1 v16.4S, v15.4S, v4.4S +trn2 v7.4S, v15.4S, v4.4S +trn1 v2.4S, v6.4S, v10.4S +trn2 v5.4S, v6.4S, v10.4S +trn2 v6.2D, v16.2D, v2.2D +trn2 v10.2D, v7.2D, v5.2D +trn1 v15.2D, v16.2D, v2.2D +trn1 v4.2D, v7.2D, v5.2D +sqrdmulh v5.4S, v6.4S, v21.4S +mul v6.4S, v6.4S,v11.4S +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v15.4s, v6.4s +add v15.4s, v15.4s, v6.4s +sqrdmulh v6.4S, v10.4S, v21.4S +mul v10.4S, v10.4S,v11.4S +mla v10.4S, v6.4S, v31.s[0] +sub v6.4s, v4.4s, v10.4s +add v4.4s, v4.4s, v10.4s +sqrdmulh v10.4S, v4.4S, v14.4S +mul v4.4S, v4.4S,v0.4S +mla v4.4S, v10.4S, v31.s[0] +sub v10.4s, v15.4s, v4.4s +add v15.4s, v15.4s, v4.4s +sqrdmulh v4.4S, v6.4S, v22.4S +mul v6.4S, v6.4S,v13.4S +mla v6.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v6.4s +add v5.4s, v5.4s, v6.4s +str q15, [x0, #320] +str q10, [x0, #336] +str q5, [x0, #352] +str q4, [x0, #368] +ldr q4, [x17, #+896] +ldr q5, [x17, #+912] +ldr q10, [x17, #+928] +ldr q15, [x17, #+944] +ldr q6, [x17, #+960] +ldr q7, [x17, #+976] +ldr q2, [x17, #+992] +ldr q16, [x17, #+1008] +ldr q22, [x0, #416] +ldr q13, [x0, #432] +ldr q14, [x0, #384] +ldr q0, [x0, #400] +sqrdmulh v21.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v0.4s, v13.4s +add v0.4s, v0.4s, v13.4s +sqrdmulh v13.4S, v0.4S, v5.s[1] +mul v0.4S, v0.4S,v4.s[1] +mla v0.4S, v13.4S, v31.s[0] +sub v13.4s, v14.4s, v0.4s +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v22.4S, v5.s[2] +mul v22.4S, v22.4S,v4.s[2] +mla v22.4S, v0.4S, v31.s[0] +sub v0.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +trn1 v22.4S, v14.4S, v13.4S +trn2 v11.4S, v14.4S, v13.4S +trn1 v19.4S, v21.4S, v0.4S +trn2 v17.4S, v21.4S, v0.4S +trn2 v21.2D, v22.2D, v19.2D +trn2 v0.2D, v11.2D, v17.2D +trn1 v14.2D, v22.2D, v19.2D +trn1 v13.2D, v11.2D, v17.2D +sqrdmulh v17.4S, v21.4S, v15.4S +mul v21.4S, 
v21.4S,v10.4S +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v14.4s, v21.4s +add v14.4s, v14.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v15.4S +mul v0.4S, v0.4S,v10.4S +mla v0.4S, v21.4S, v31.s[0] +sub v21.4s, v13.4s, v0.4s +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v13.4S, v7.4S +mul v13.4S, v13.4S,v6.4S +mla v13.4S, v0.4S, v31.s[0] +sub v0.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +sqrdmulh v13.4S, v21.4S, v16.4S +mul v21.4S, v21.4S,v2.4S +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +str q14, [x0, #384] +str q0, [x0, #400] +str q17, [x0, #416] +str q13, [x0, #432] +ldr q13, [x17, #+1024] +ldr q17, [x17, #+1040] +ldr q0, [x17, #+1056] +ldr q14, [x17, #+1072] +ldr q21, [x17, #+1088] +ldr q11, [x17, #+1104] +ldr q19, [x17, #+1120] +ldr q22, [x17, #+1136] +ldr q16, [x0, #480] +ldr q2, [x0, #496] +ldr q7, [x0, #448] +ldr q6, [x0, #464] +sqrdmulh v15.4S, v16.4S, v17.s[0] +mul v16.4S, v16.4S,v13.s[0] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v7.4s, v16.4s +add v7.4s, v7.4s, v16.4s +sqrdmulh v16.4S, v2.4S, v17.s[0] +mul v2.4S, v2.4S,v13.s[0] +mla v2.4S, v16.4S, v31.s[0] +sub v16.4s, v6.4s, v2.4s +add v6.4s, v6.4s, v2.4s +sqrdmulh v2.4S, v6.4S, v17.s[1] +mul v6.4S, v6.4S,v13.s[1] +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v7.4s, v6.4s +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v16.4S, v17.s[2] +mul v16.4S, v16.4S,v13.s[2] +mla v16.4S, v6.4S, v31.s[0] +sub v6.4s, v15.4s, v16.4s +add v15.4s, v15.4s, v16.4s +trn1 v16.4S, v7.4S, v2.4S +trn2 v10.4S, v7.4S, v2.4S +trn1 v5.4S, v15.4S, v6.4S +trn2 v4.4S, v15.4S, v6.4S +trn2 v15.2D, v16.2D, v5.2D +trn2 v6.2D, v10.2D, v4.2D +trn1 v7.2D, v16.2D, v5.2D +trn1 v2.2D, v10.2D, v4.2D +sqrdmulh v4.4S, v15.4S, v14.4S +mul v15.4S, v15.4S,v0.4S +mla v15.4S, v4.4S, v31.s[0] +sub v4.4s, v7.4s, v15.4s +add v7.4s, v7.4s, v15.4s +sqrdmulh v15.4S, v6.4S, v14.4S +mul v6.4S, v6.4S,v0.4S +mla v6.4S, v15.4S, v31.s[0] +sub v15.4s, v2.4s, v6.4s +add v2.4s, v2.4s, v6.4s +sqrdmulh v6.4S, v2.4S, v11.4S +mul v2.4S, 
v2.4S,v21.4S +mla v2.4S, v6.4S, v31.s[0] +sub v6.4s, v7.4s, v2.4s +add v7.4s, v7.4s, v2.4s +sqrdmulh v2.4S, v15.4S, v22.4S +mul v15.4S, v15.4S,v19.4S +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v4.4s, v15.4s +add v4.4s, v4.4s, v15.4s +str q7, [x0, #448] +str q6, [x0, #464] +str q4, [x0, #480] +str q2, [x0, #496] +ldr q2, [x17, #+1152] +ldr q4, [x17, #+1168] +ldr q6, [x17, #+1184] +ldr q7, [x17, #+1200] +ldr q15, [x17, #+1216] +ldr q10, [x17, #+1232] +ldr q5, [x17, #+1248] +ldr q16, [x17, #+1264] +ldr q22, [x0, #544] +ldr q19, [x0, #560] +ldr q11, [x0, #512] +ldr q21, [x0, #528] +sqrdmulh v14.4S, v22.4S, v4.s[0] +mul v22.4S, v22.4S,v2.s[0] +mla v22.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v4.s[0] +mul v19.4S, v19.4S,v2.s[0] +mla v19.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v19.4s +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v21.4S, v4.s[1] +mul v21.4S, v21.4S,v2.s[1] +mla v21.4S, v19.4S, v31.s[0] +sub v19.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v4.s[2] +mul v22.4S, v22.4S,v2.s[2] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +trn1 v22.4S, v11.4S, v19.4S +trn2 v0.4S, v11.4S, v19.4S +trn1 v17.4S, v14.4S, v21.4S +trn2 v13.4S, v14.4S, v21.4S +trn2 v14.2D, v22.2D, v17.2D +trn2 v21.2D, v0.2D, v13.2D +trn1 v11.2D, v22.2D, v17.2D +trn1 v19.2D, v0.2D, v13.2D +sqrdmulh v13.4S, v14.4S, v7.4S +mul v14.4S, v14.4S,v6.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v10.4S +mul v19.4S, v19.4S,v15.4S +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v19.4s +add v11.4s, v11.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v16.4S +mul v14.4S, v14.4S,v5.4S +mla v14.4S, v19.4S, v31.s[0] +sub v19.4s, v13.4s, v14.4s +add v13.4s, v13.4s, v14.4s +str q11, [x0, 
#512] +str q21, [x0, #528] +str q13, [x0, #544] +str q19, [x0, #560] +ldr q19, [x17, #+1280] +ldr q13, [x17, #+1296] +ldr q21, [x17, #+1312] +ldr q11, [x17, #+1328] +ldr q14, [x17, #+1344] +ldr q0, [x17, #+1360] +ldr q17, [x17, #+1376] +ldr q22, [x17, #+1392] +ldr q16, [x0, #608] +ldr q5, [x0, #624] +ldr q10, [x0, #576] +ldr q15, [x0, #592] +sqrdmulh v7.4S, v16.4S, v13.s[0] +mul v16.4S, v16.4S,v19.s[0] +mla v16.4S, v7.4S, v31.s[0] +sub v7.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sqrdmulh v16.4S, v5.4S, v13.s[0] +mul v5.4S, v5.4S,v19.s[0] +mla v5.4S, v16.4S, v31.s[0] +sub v16.4s, v15.4s, v5.4s +add v15.4s, v15.4s, v5.4s +sqrdmulh v5.4S, v15.4S, v13.s[1] +mul v15.4S, v15.4S,v19.s[1] +mla v15.4S, v5.4S, v31.s[0] +sub v5.4s, v10.4s, v15.4s +add v10.4s, v10.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v13.s[2] +mul v16.4S, v16.4S,v19.s[2] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v7.4s, v16.4s +add v7.4s, v7.4s, v16.4s +trn1 v16.4S, v10.4S, v5.4S +trn2 v6.4S, v10.4S, v5.4S +trn1 v4.4S, v7.4S, v15.4S +trn2 v2.4S, v7.4S, v15.4S +trn2 v7.2D, v16.2D, v4.2D +trn2 v15.2D, v6.2D, v2.2D +trn1 v10.2D, v16.2D, v4.2D +trn1 v5.2D, v6.2D, v2.2D +sqrdmulh v2.4S, v7.4S, v11.4S +mul v7.4S, v7.4S,v21.4S +mla v7.4S, v2.4S, v31.s[0] +sub v2.4s, v10.4s, v7.4s +add v10.4s, v10.4s, v7.4s +sqrdmulh v7.4S, v15.4S, v11.4S +mul v15.4S, v15.4S,v21.4S +mla v15.4S, v7.4S, v31.s[0] +sub v7.4s, v5.4s, v15.4s +add v5.4s, v5.4s, v15.4s +sqrdmulh v15.4S, v5.4S, v0.4S +mul v5.4S, v5.4S,v14.4S +mla v5.4S, v15.4S, v31.s[0] +sub v15.4s, v10.4s, v5.4s +add v10.4s, v10.4s, v5.4s +sqrdmulh v5.4S, v7.4S, v22.4S +mul v7.4S, v7.4S,v17.4S +mla v7.4S, v5.4S, v31.s[0] +sub v5.4s, v2.4s, v7.4s +add v2.4s, v2.4s, v7.4s +str q10, [x0, #576] +str q15, [x0, #592] +str q2, [x0, #608] +str q5, [x0, #624] +ldr q5, [x17, #+1408] +ldr q2, [x17, #+1424] +ldr q15, [x17, #+1440] +ldr q10, [x17, #+1456] +ldr q7, [x17, #+1472] +ldr q6, [x17, #+1488] +ldr q4, [x17, #+1504] +ldr q16, [x17, #+1520] +ldr q22, [x0, #672] +ldr q17, [x0, 
#688] +ldr q0, [x0, #640] +ldr q14, [x0, #656] +sqrdmulh v11.4S, v22.4S, v2.s[0] +mul v22.4S, v22.4S,v5.s[0] +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v0.4s, v22.4s +add v0.4s, v0.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v2.s[0] +mul v17.4S, v17.4S,v5.s[0] +mla v17.4S, v22.4S, v31.s[0] +sub v22.4s, v14.4s, v17.4s +add v14.4s, v14.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v2.s[1] +mul v14.4S, v14.4S,v5.s[1] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v14.4s +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v22.4S, v2.s[2] +mul v22.4S, v22.4S,v5.s[2] +mla v22.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +trn1 v22.4S, v0.4S, v17.4S +trn2 v21.4S, v0.4S, v17.4S +trn1 v13.4S, v11.4S, v14.4S +trn2 v19.4S, v11.4S, v14.4S +trn2 v11.2D, v22.2D, v13.2D +trn2 v14.2D, v21.2D, v19.2D +trn1 v0.2D, v22.2D, v13.2D +trn1 v17.2D, v21.2D, v19.2D +sqrdmulh v19.4S, v11.4S, v10.4S +mul v11.4S, v11.4S,v15.4S +mla v11.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v11.4s +add v0.4s, v0.4s, v11.4s +sqrdmulh v11.4S, v14.4S, v10.4S +mul v14.4S, v14.4S,v15.4S +mla v14.4S, v11.4S, v31.s[0] +sub v11.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +sqrdmulh v14.4S, v17.4S, v6.4S +mul v17.4S, v17.4S,v7.4S +mla v17.4S, v14.4S, v31.s[0] +sub v14.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v16.4S +mul v11.4S, v11.4S,v4.4S +mla v11.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v11.4s +add v19.4s, v19.4s, v11.4s +str q0, [x0, #640] +str q14, [x0, #656] +str q19, [x0, #672] +str q17, [x0, #688] +ldr q17, [x17, #+1536] +ldr q19, [x17, #+1552] +ldr q14, [x17, #+1568] +ldr q0, [x17, #+1584] +ldr q11, [x17, #+1600] +ldr q21, [x17, #+1616] +ldr q13, [x17, #+1632] +ldr q22, [x17, #+1648] +ldr q16, [x0, #736] +ldr q4, [x0, #752] +ldr q6, [x0, #704] +ldr q7, [x0, #720] +sqrdmulh v10.4S, v16.4S, v19.s[0] +mul v16.4S, v16.4S,v17.s[0] +mla v16.4S, v10.4S, v31.s[0] +sub v10.4s, v6.4s, v16.4s +add v6.4s, v6.4s, v16.4s +sqrdmulh v16.4S, v4.4S, v19.s[0] +mul v4.4S, v4.4S,v17.s[0] 
+mla v4.4S, v16.4S, v31.s[0] +sub v16.4s, v7.4s, v4.4s +add v7.4s, v7.4s, v4.4s +sqrdmulh v4.4S, v7.4S, v19.s[1] +mul v7.4S, v7.4S,v17.s[1] +mla v7.4S, v4.4S, v31.s[0] +sub v4.4s, v6.4s, v7.4s +add v6.4s, v6.4s, v7.4s +sqrdmulh v7.4S, v16.4S, v19.s[2] +mul v16.4S, v16.4S,v17.s[2] +mla v16.4S, v7.4S, v31.s[0] +sub v7.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +trn1 v16.4S, v6.4S, v4.4S +trn2 v15.4S, v6.4S, v4.4S +trn1 v2.4S, v10.4S, v7.4S +trn2 v5.4S, v10.4S, v7.4S +trn2 v10.2D, v16.2D, v2.2D +trn2 v7.2D, v15.2D, v5.2D +trn1 v6.2D, v16.2D, v2.2D +trn1 v4.2D, v15.2D, v5.2D +sqrdmulh v5.4S, v10.4S, v0.4S +mul v10.4S, v10.4S,v14.4S +mla v10.4S, v5.4S, v31.s[0] +sub v5.4s, v6.4s, v10.4s +add v6.4s, v6.4s, v10.4s +sqrdmulh v10.4S, v7.4S, v0.4S +mul v7.4S, v7.4S,v14.4S +mla v7.4S, v10.4S, v31.s[0] +sub v10.4s, v4.4s, v7.4s +add v4.4s, v4.4s, v7.4s +sqrdmulh v7.4S, v4.4S, v21.4S +mul v4.4S, v4.4S,v11.4S +mla v4.4S, v7.4S, v31.s[0] +sub v7.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +sqrdmulh v4.4S, v10.4S, v22.4S +mul v10.4S, v10.4S,v13.4S +mla v10.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v10.4s +add v5.4s, v5.4s, v10.4s +str q6, [x0, #704] +str q7, [x0, #720] +str q5, [x0, #736] +str q4, [x0, #752] +ldr q4, [x17, #+1664] +ldr q5, [x17, #+1680] +ldr q7, [x17, #+1696] +ldr q6, [x17, #+1712] +ldr q10, [x17, #+1728] +ldr q15, [x17, #+1744] +ldr q2, [x17, #+1760] +ldr q16, [x17, #+1776] +ldr q22, [x0, #800] +ldr q13, [x0, #816] +ldr q21, [x0, #768] +ldr q11, [x0, #784] +sqrdmulh v0.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v0.4S, v31.s[0] +sub v0.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v11.4S, v5.s[1] +mul v11.4S, v11.4S,v4.s[1] +mla v11.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v5.s[2] +mul v22.4S, v22.4S,v4.s[2] +mla v22.4S, 
v11.4S, v31.s[0] +sub v11.4s, v0.4s, v22.4s +add v0.4s, v0.4s, v22.4s +trn1 v22.4S, v21.4S, v13.4S +trn2 v14.4S, v21.4S, v13.4S +trn1 v19.4S, v0.4S, v11.4S +trn2 v17.4S, v0.4S, v11.4S +trn2 v0.2D, v22.2D, v19.2D +trn2 v11.2D, v14.2D, v17.2D +trn1 v21.2D, v22.2D, v19.2D +trn1 v13.2D, v14.2D, v17.2D +sqrdmulh v17.4S, v0.4S, v6.4S +mul v0.4S, v0.4S,v7.4S +mla v0.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v0.4s +add v21.4s, v21.4s, v0.4s +sqrdmulh v0.4S, v11.4S, v6.4S +mul v11.4S, v11.4S,v7.4S +mla v11.4S, v0.4S, v31.s[0] +sub v0.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v15.4S +mul v13.4S, v13.4S,v10.4S +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v0.4S, v16.4S +mul v0.4S, v0.4S,v2.4S +mla v0.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v0.4s +add v17.4s, v17.4s, v0.4s +str q21, [x0, #768] +str q11, [x0, #784] +str q17, [x0, #800] +str q13, [x0, #816] +ldr q13, [x17, #+1792] +ldr q17, [x17, #+1808] +ldr q11, [x17, #+1824] +ldr q21, [x17, #+1840] +ldr q0, [x17, #+1856] +ldr q14, [x17, #+1872] +ldr q19, [x17, #+1888] +ldr q22, [x17, #+1904] +ldr q16, [x0, #864] +ldr q2, [x0, #880] +ldr q15, [x0, #832] +ldr q10, [x0, #848] +sqrdmulh v6.4S, v16.4S, v17.s[0] +mul v16.4S, v16.4S,v13.s[0] +mla v16.4S, v6.4S, v31.s[0] +sub v6.4s, v15.4s, v16.4s +add v15.4s, v15.4s, v16.4s +sqrdmulh v16.4S, v2.4S, v17.s[0] +mul v2.4S, v2.4S,v13.s[0] +mla v2.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v2.4s +add v10.4s, v10.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v17.s[1] +mul v10.4S, v10.4S,v13.s[1] +mla v10.4S, v2.4S, v31.s[0] +sub v2.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +sqrdmulh v10.4S, v16.4S, v17.s[2] +mul v16.4S, v16.4S,v13.s[2] +mla v16.4S, v10.4S, v31.s[0] +sub v10.4s, v6.4s, v16.4s +add v6.4s, v6.4s, v16.4s +trn1 v16.4S, v15.4S, v2.4S +trn2 v7.4S, v15.4S, v2.4S +trn1 v5.4S, v6.4S, v10.4S +trn2 v4.4S, v6.4S, v10.4S +trn2 v6.2D, v16.2D, v5.2D +trn2 v10.2D, v7.2D, v4.2D +trn1 v15.2D, v16.2D, v5.2D 
+trn1 v2.2D, v7.2D, v4.2D +sqrdmulh v4.4S, v6.4S, v21.4S +mul v6.4S, v6.4S,v11.4S +mla v6.4S, v4.4S, v31.s[0] +sub v4.4s, v15.4s, v6.4s +add v15.4s, v15.4s, v6.4s +sqrdmulh v6.4S, v10.4S, v21.4S +mul v10.4S, v10.4S,v11.4S +mla v10.4S, v6.4S, v31.s[0] +sub v6.4s, v2.4s, v10.4s +add v2.4s, v2.4s, v10.4s +sqrdmulh v10.4S, v2.4S, v14.4S +mul v2.4S, v2.4S,v0.4S +mla v2.4S, v10.4S, v31.s[0] +sub v10.4s, v15.4s, v2.4s +add v15.4s, v15.4s, v2.4s +sqrdmulh v2.4S, v6.4S, v22.4S +mul v6.4S, v6.4S,v19.4S +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v4.4s, v6.4s +add v4.4s, v4.4s, v6.4s +str q15, [x0, #832] +str q10, [x0, #848] +str q4, [x0, #864] +str q2, [x0, #880] +ldr q2, [x17, #+1920] +ldr q4, [x17, #+1936] +ldr q10, [x17, #+1952] +ldr q15, [x17, #+1968] +ldr q6, [x17, #+1984] +ldr q7, [x17, #+2000] +ldr q5, [x17, #+2016] +ldr q16, [x17, #+2032] +ldr q22, [x0, #928] +ldr q19, [x0, #944] +ldr q14, [x0, #896] +ldr q0, [x0, #912] +sqrdmulh v21.4S, v22.4S, v4.s[0] +mul v22.4S, v22.4S,v2.s[0] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v4.s[0] +mul v19.4S, v19.4S,v2.s[0] +mla v19.4S, v22.4S, v31.s[0] +sub v22.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v0.4S, v4.s[1] +mul v0.4S, v0.4S,v2.s[1] +mla v0.4S, v19.4S, v31.s[0] +sub v19.4s, v14.4s, v0.4s +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v22.4S, v4.s[2] +mul v22.4S, v22.4S,v2.s[2] +mla v22.4S, v0.4S, v31.s[0] +sub v0.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +trn1 v22.4S, v14.4S, v19.4S +trn2 v11.4S, v14.4S, v19.4S +trn1 v17.4S, v21.4S, v0.4S +trn2 v13.4S, v21.4S, v0.4S +trn2 v21.2D, v22.2D, v17.2D +trn2 v0.2D, v11.2D, v13.2D +trn1 v14.2D, v22.2D, v17.2D +trn1 v19.2D, v11.2D, v13.2D +sqrdmulh v13.4S, v21.4S, v15.4S +mul v21.4S, v21.4S,v10.4S +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v14.4s, v21.4s +add v14.4s, v14.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v15.4S +mul v0.4S, v0.4S,v10.4S +mla v0.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v0.4s 
+add v19.4s, v19.4s, v0.4s +sqrdmulh v0.4S, v19.4S, v7.4S +mul v19.4S, v19.4S,v6.4S +mla v19.4S, v0.4S, v31.s[0] +sub v0.4s, v14.4s, v19.4s +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v21.4S, v16.4S +mul v21.4S, v21.4S,v5.4S +mla v21.4S, v19.4S, v31.s[0] +sub v19.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +str q14, [x0, #896] +str q0, [x0, #912] +str q13, [x0, #928] +str q19, [x0, #944] +ldr q19, [x17, #+2048] +ldr q13, [x17, #+2064] +ldr q0, [x17, #+2080] +ldr q14, [x17, #+2096] +ldr q21, [x17, #+2112] +ldr q11, [x17, #+2128] +ldr q17, [x17, #+2144] +ldr q22, [x17, #+2160] +ldr q16, [x0, #992] +ldr q5, [x0, #1008] +ldr q7, [x0, #960] +ldr q6, [x0, #976] +sqrdmulh v15.4S, v16.4S, v13.s[0] +mul v16.4S, v16.4S,v19.s[0] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v7.4s, v16.4s +add v7.4s, v7.4s, v16.4s +sqrdmulh v16.4S, v5.4S, v13.s[0] +mul v5.4S, v5.4S,v19.s[0] +mla v5.4S, v16.4S, v31.s[0] +sub v16.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v6.4S, v13.s[1] +mul v6.4S, v6.4S,v19.s[1] +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v7.4s, v6.4s +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v16.4S, v13.s[2] +mul v16.4S, v16.4S,v19.s[2] +mla v16.4S, v6.4S, v31.s[0] +sub v6.4s, v15.4s, v16.4s +add v15.4s, v15.4s, v16.4s +trn1 v16.4S, v7.4S, v5.4S +trn2 v10.4S, v7.4S, v5.4S +trn1 v4.4S, v15.4S, v6.4S +trn2 v2.4S, v15.4S, v6.4S +trn2 v15.2D, v16.2D, v4.2D +trn2 v6.2D, v10.2D, v2.2D +trn1 v7.2D, v16.2D, v4.2D +trn1 v5.2D, v10.2D, v2.2D +sqrdmulh v2.4S, v15.4S, v14.4S +mul v15.4S, v15.4S,v0.4S +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v7.4s, v15.4s +add v7.4s, v7.4s, v15.4s +sqrdmulh v15.4S, v6.4S, v14.4S +mul v6.4S, v6.4S,v0.4S +mla v6.4S, v15.4S, v31.s[0] +sub v15.4s, v5.4s, v6.4s +add v5.4s, v5.4s, v6.4s +sqrdmulh v6.4S, v5.4S, v11.4S +mul v5.4S, v5.4S,v21.4S +mla v5.4S, v6.4S, v31.s[0] +sub v6.4s, v7.4s, v5.4s +add v7.4s, v7.4s, v5.4s +sqrdmulh v5.4S, v15.4S, v22.4S +mul v15.4S, v15.4S,v17.4S +mla v15.4S, v5.4S, v31.s[0] +sub v5.4s, v2.4s, v15.4s +add v2.4s, 
v2.4s, v15.4s +str q7, [x0, #960] +str q6, [x0, #976] +str q2, [x0, #992] +str q5, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z2_0.s b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z2_0.s new file mode 100644 index 0000000..dadc45d --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z2_0.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 
11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // 
Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 
1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 
23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 
19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 
// Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 
1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 +.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // 
Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 
7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // 
Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_3_z2_0 +.global _ntt_u32_full_neon_asm_var_4_4_3_z2_0 +ntt_u32_full_neon_asm_var_4_4_3_z2_0: +_ntt_u32_full_neon_asm_var_4_4_3_z2_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +sqrdmulh v2.4S, v22.4S, v29.s[0] +ldr q1, [x0, #544] +mul v22.4S, v22.4S,v30.s[0] +ldr q0, [x0, #608] +sqrdmulh v15.4S, v21.4S, v29.s[0] +ldr q14, [x0, #672] +mul v21.4S, v21.4S,v30.s[0] +ldr q13, [x0, #736] +mla v22.4S, v2.4S, v31.s[0] 
+sqrdmulh v2.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q12, [x0, #32] +sub v11.4s, v18.4s, v22.4s +mla v21.4S, v15.4S, v31.s[0] +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q15, [x0, #96] +sub v10.4s, v17.4s, v21.4s +mla v20.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v29.s[0] +ldr q2, [x0, #160] +mul v1.4S, v1.4S,v30.s[0] +sub v9.4s, v16.4s, v20.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v29.s[0] +ldr q22, [x0, #224] +mul v0.4S, v0.4S,v30.s[0] +sub v8.4s, v3.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v12.4s, v1.4s +mla v0.4S, v20.4S, v31.s[0] +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +sub v20.4s, v15.4s, v0.4s +mla v14.4S, v19.4S, v31.s[0] +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v19.4s, v2.4s, v14.4s +mla v13.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v1.4s, v22.4s, v13.4s +mla v16.4S, v0.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v0.4s, v2.4s, v16.4s +mla v3.4S, v14.4S, v31.s[0] +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v22.4s, v3.4s +mla v18.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v13.4s, v12.4s, v18.4s +mla v17.4S, v16.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v16.4s, v15.4s, v17.4s +mla v9.4S, v3.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v3.4s, v19.4s, v9.4s +mla v8.4S, v18.4S, v31.s[0] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, 
v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v8.4s +mla v11.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v17.4s, v21.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +sub v9.4s, v20.4s, v10.4s +mla v2.4S, v8.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v8.4s, v12.4s, v2.4s +mla v22.4S, v11.4S, v31.s[0] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v27.s[1] +mul v14.4S, v14.4S,v28.s[1] +sub v11.4s, v15.4s, v22.4s +mla v0.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v28.s[2] +sub v10.4s, v13.4s, v0.4s +mla v14.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v2.4s, v16.4s, v14.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +sub v22.4s, v21.4s, v19.4s +mla v1.4S, v0.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v0.4s, v20.4s, v1.4s +mla v3.4S, v14.4S, v31.s[0] +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +sub v14.4s, v17.4s, v3.4s +mla v18.4S, v19.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v25.s[1] +mul v11.4S, v11.4S,v26.s[1] +sub v19.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v1.4s, v12.4s, v15.4s +mla v11.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v25.s[3] +mul v2.4S, v2.4S,v26.s[3] +sub v3.4s, v8.4s, v11.4s +mla v16.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v11.4s +str q12, [x0, #32] +sqrdmulh v12.4S, v20.4S, v23.s[0] +str q1, [x0, #96] +mul v20.4S, v20.4S,v24.s[0] +ldr q1, [x0, #816] 
+sub v11.4s, v13.4s, v16.4s +ldr q18, [x0, #880] +mla v2.4S, v15.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +str q8, [x0, #160] +sqrdmulh v8.4S, v0.4S, v23.s[1] +str q3, [x0, #224] +mul v0.4S, v0.4S,v24.s[1] +ldr q3, [x0, #944] +sub v16.4s, v10.4s, v2.4s +ldr q15, [x0, #1008] +mla v20.4S, v12.4S, v31.s[0] +add v10.4s, v10.4s, v2.4s +str q13, [x0, #288] +sqrdmulh v13.4S, v9.4S, v23.s[2] +str q11, [x0, #352] +mul v9.4S, v9.4S,v24.s[2] +ldr q11, [x0, #304] +sub v2.4s, v21.4s, v20.4s +ldr q12, [x0, #368] +mla v0.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v20.4s +str q10, [x0, #416] +sqrdmulh v10.4S, v19.4S, v23.s[3] +str q16, [x0, #480] +mul v19.4S, v19.4S,v24.s[3] +ldr q16, [x0, #432] +sub v20.4s, v22.4s, v0.4s +ldr q8, [x0, #496] +mla v9.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +str q21, [x0, #544] +sqrdmulh v21.4S, v1.4S, v29.s[0] +str q2, [x0, #608] +ldr q2, [x0, #560] +mul v1.4S, v1.4S,v30.s[0] +ldr q0, [x0, #624] +sub v13.4s, v17.4s, v9.4s +mla v19.4S, v10.4S, v31.s[0] +add v17.4s, v17.4s, v9.4s +str q22, [x0, #672] +sqrdmulh v22.4S, v18.4S, v29.s[0] +str q20, [x0, #736] +ldr q20, [x0, #688] +mul v18.4S, v18.4S,v30.s[0] +ldr q9, [x0, #752] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +str q17, [x0, #800] +sqrdmulh v17.4S, v3.4S, v29.s[0] +str q13, [x0, #864] +mul v3.4S, v3.4S,v30.s[0] +ldr q13, [x0, #48] +sub v19.4s, v11.4s, v1.4s +mla v18.4S, v22.4S, v31.s[0] +add v11.4s, v11.4s, v1.4s +str q14, [x0, #928] +sqrdmulh v14.4S, v15.4S, v29.s[0] +str q10, [x0, #992] +mul v15.4S, v15.4S,v30.s[0] +ldr q10, [x0, #112] +sub v1.4s, v12.4s, v18.4s +mla v3.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v29.s[0] +ldr q17, [x0, #176] +mul v2.4S, v2.4S,v30.s[0] +sub v22.4s, v16.4s, v3.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v29.s[0] +ldr q14, [x0, #240] +mul v0.4S, v0.4S,v30.s[0] +sub v21.4s, v8.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v15.4s 
+sqrdmulh v15.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v18.4s, v13.4s, v2.4s +mla v0.4S, v3.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +sub v3.4s, v10.4s, v0.4s +mla v20.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v15.4s, v17.4s, v20.4s +mla v9.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v2.4s, v14.4s, v9.4s +mla v16.4S, v0.4S, v31.s[0] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v29.s[1] +mul v11.4S, v11.4S,v30.s[1] +sub v0.4s, v17.4s, v16.4s +mla v8.4S, v20.4S, v31.s[0] +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v29.s[1] +mul v12.4S, v12.4S,v30.s[1] +sub v20.4s, v14.4s, v8.4s +mla v11.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v9.4s, v13.4s, v11.4s +mla v12.4S, v16.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v16.4s, v10.4s, v12.4s +mla v22.4S, v8.4S, v31.s[0] +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v8.4s, v15.4s, v22.4s +mla v21.4S, v11.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +sub v11.4s, v2.4s, v21.4s +mla v19.4S, v12.4S, v31.s[0] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +sub v12.4s, v18.4s, v19.4s +mla v1.4S, v22.4S, v31.s[0] +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v22.4s, v3.4s, v1.4s +mla v17.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v21.4s, v13.4s, v17.4s +mla v14.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +sub v19.4s, v10.4s, v14.4s +mla 
v0.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v27.s[2] +mul v15.4S, v15.4S,v28.s[2] +sub v1.4s, v9.4s, v0.4s +mla v20.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v17.4s, v16.4s, v20.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v14.4s, v18.4s, v15.4s +mla v2.4S, v0.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[3] +mul v11.4S, v11.4S,v28.s[3] +sub v0.4s, v3.4s, v2.4s +mla v8.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +sub v20.4s, v12.4s, v8.4s +mla v11.4S, v15.4S, v31.s[0] +add v12.4s, v12.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v15.4s, v22.4s, v11.4s +mla v10.4S, v2.4S, v31.s[0] +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v2.4s, v13.4s, v10.4s +mla v19.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v25.s[3] +mul v17.4S, v17.4S,v26.s[3] +sub v8.4s, v21.4s, v19.4s +mla v16.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +str q13, [x0, #48] +sqrdmulh v13.4S, v3.4S, v23.s[0] +str q2, [x0, #112] +mul v3.4S, v3.4S,v24.s[0] +ldr q2, [x0, #768] +sub v19.4s, v9.4s, v16.4s +ldr q11, [x0, #832] +mla v17.4S, v10.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +str q21, [x0, #176] +sqrdmulh v21.4S, v0.4S, v23.s[1] +str q8, [x0, #240] +mul v0.4S, v0.4S,v24.s[1] +ldr q8, [x0, #896] +sub v16.4s, v1.4s, v17.4s +ldr q10, [x0, #960] +mla v3.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +str q9, [x0, #304] +sqrdmulh v9.4S, v22.4S, v23.s[2] +str q19, [x0, #368] +mul v22.4S, v22.4S,v24.s[2] +ldr q19, [x0, #256] +sub v17.4s, v18.4s, v3.4s +ldr q13, [x0, #320] +mla v0.4S, v21.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +str q1, [x0, #432] +sqrdmulh v1.4S, v15.4S, v23.s[3] +str q16, [x0, #496] +mul v15.4S, 
v15.4S,v24.s[3] +ldr q16, [x0, #384] +sub v3.4s, v14.4s, v0.4s +ldr q21, [x0, #448] +mla v22.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +str q18, [x0, #560] +sqrdmulh v18.4S, v2.4S, v29.s[0] +str q17, [x0, #624] +ldr q17, [x0, #512] +mul v2.4S, v2.4S,v30.s[0] +ldr q0, [x0, #576] +sub v9.4s, v12.4s, v22.4s +mla v15.4S, v1.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s +str q14, [x0, #688] +sqrdmulh v14.4S, v11.4S, v29.s[0] +str q3, [x0, #752] +ldr q3, [x0, #640] +mul v11.4S, v11.4S,v30.s[0] +ldr q22, [x0, #704] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +str q12, [x0, #816] +sqrdmulh v12.4S, v8.4S, v29.s[0] +str q9, [x0, #880] +mul v8.4S, v8.4S,v30.s[0] +ldr q9, [x0, #0] +sub v15.4s, v19.4s, v2.4s +mla v11.4S, v14.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +str q20, [x0, #944] +sqrdmulh v20.4S, v10.4S, v29.s[0] +str q1, [x0, #1008] +mul v10.4S, v10.4S,v30.s[0] +ldr q1, [x0, #64] +sub v2.4s, v13.4s, v11.4s +mla v8.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v29.s[0] +ldr q12, [x0, #128] +mul v17.4S, v17.4S,v30.s[0] +sub v14.4s, v16.4s, v8.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v29.s[0] +ldr q20, [x0, #192] +mul v0.4S, v0.4S,v30.s[0] +sub v18.4s, v21.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +sub v11.4s, v9.4s, v17.4s +mla v0.4S, v8.4S, v31.s[0] +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v8.4s, v1.4s, v0.4s +mla v3.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v10.4s, v12.4s, v3.4s +mla v22.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v17.4s, v20.4s, v22.4s +mla v16.4S, v0.4S, v31.s[0] +add v20.4s, v20.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +sub 
v0.4s, v12.4s, v16.4s +mla v21.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v3.4s, v20.4s, v21.4s +mla v19.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v22.4s, v9.4s, v19.4s +mla v13.4S, v16.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v16.4s, v1.4s, v13.4s +mla v14.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v21.4s, v10.4s, v14.4s +mla v18.4S, v19.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v19.4s, v17.4s, v18.4s +mla v15.4S, v13.4S, v31.s[0] +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v13.4s, v11.4s, v15.4s +mla v2.4S, v14.4S, v31.s[0] +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v27.s[0] +mul v20.4S, v20.4S,v28.s[0] +sub v14.4s, v8.4s, v2.4s +mla v12.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v18.4s, v9.4s, v12.4s +mla v20.4S, v15.4S, v31.s[0] +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +sub v15.4s, v1.4s, v20.4s +mla v0.4S, v2.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +sub v2.4s, v22.4s, v0.4s +mla v3.4S, v12.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v12.4s, v16.4s, v3.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v11.4s, v10.4s +mla v17.4S, v0.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v27.s[3] +mul v19.4S, v19.4S,v28.s[3] +sub v0.4s, v8.4s, v17.4s +mla v21.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v17.4s +sqrdmulh 
v17.4S, v1.4S, v25.s[0] +mul v1.4S, v1.4S,v26.s[0] +sub v3.4s, v13.4s, v21.4s +mla v19.4S, v10.4S, v31.s[0] +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v25.s[1] +mul v15.4S, v15.4S,v26.s[1] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v17.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v17.4s, v9.4s, v1.4s +mla v15.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +sub v21.4s, v18.4s, v15.4s +mla v16.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +str q9, [x0, #0] +sqrdmulh v9.4S, v8.4S, v23.s[0] +str q17, [x0, #64] +mul v8.4S, v8.4S,v24.s[0] +ldr q17, [x0, #784] +sub v15.4s, v22.4s, v16.4s +ldr q19, [x0, #848] +mla v12.4S, v1.4S, v31.s[0] +add v22.4s, v22.4s, v16.4s +str q18, [x0, #128] +sqrdmulh v18.4S, v0.4S, v23.s[1] +str q21, [x0, #192] +mul v0.4S, v0.4S,v24.s[1] +ldr q21, [x0, #912] +sub v16.4s, v2.4s, v12.4s +ldr q1, [x0, #976] +mla v8.4S, v9.4S, v31.s[0] +add v2.4s, v2.4s, v12.4s +str q22, [x0, #256] +sqrdmulh v22.4S, v14.4S, v23.s[2] +str q15, [x0, #320] +mul v14.4S, v14.4S,v24.s[2] +ldr q15, [x0, #272] +sub v12.4s, v11.4s, v8.4s +ldr q9, [x0, #336] +mla v0.4S, v18.4S, v31.s[0] +add v11.4s, v11.4s, v8.4s +str q2, [x0, #384] +sqrdmulh v2.4S, v10.4S, v23.s[3] +str q16, [x0, #448] +mul v10.4S, v10.4S,v24.s[3] +ldr q16, [x0, #400] +sub v8.4s, v20.4s, v0.4s +ldr q18, [x0, #464] +mla v14.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v0.4s +str q11, [x0, #512] +sqrdmulh v11.4S, v17.4S, v29.s[0] +str q12, [x0, #576] +ldr q12, [x0, #528] +mul v17.4S, v17.4S,v30.s[0] +ldr q0, [x0, #592] +sub v22.4s, v13.4s, v14.4s +mla v10.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v14.4s +str q20, [x0, #640] +sqrdmulh v20.4S, v19.4S, v29.s[0] +str q8, [x0, #704] +ldr q8, [x0, #656] +mul v19.4S, v19.4S,v30.s[0] +ldr q14, [x0, #720] +sub v2.4s, v3.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v3.4s, v3.4s, v10.4s +str q13, [x0, #768] +sqrdmulh v13.4S, v21.4S, 
v29.s[0] +str q22, [x0, #832] +mul v21.4S, v21.4S,v30.s[0] +ldr q22, [x0, #16] +sub v10.4s, v15.4s, v17.4s +mla v19.4S, v20.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +str q3, [x0, #896] +sqrdmulh v3.4S, v1.4S, v29.s[0] +str q2, [x0, #960] +mul v1.4S, v1.4S,v30.s[0] +ldr q2, [x0, #80] +sub v17.4s, v9.4s, v19.4s +mla v21.4S, v13.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v12.4S, v29.s[0] +ldr q13, [x0, #144] +mul v12.4S, v12.4S,v30.s[0] +sub v20.4s, v16.4s, v21.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v29.s[0] +ldr q3, [x0, #208] +mul v0.4S, v0.4S,v30.s[0] +sub v11.4s, v18.4s, v1.4s +mla v12.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v19.4s, v22.4s, v12.4s +mla v0.4S, v21.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v2.4s, v0.4s +mla v8.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v1.4s, v13.4s, v8.4s +mla v14.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v12.4s, v3.4s, v14.4s +mla v16.4S, v0.4S, v31.s[0] +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +sub v0.4s, v13.4s, v16.4s +mla v18.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v9.4S, v29.s[1] +mul v9.4S, v9.4S,v30.s[1] +sub v8.4s, v3.4s, v18.4s +mla v15.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v14.4s, v22.4s, v15.4s +mla v9.4S, v16.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v16.4s, v2.4s, v9.4s +mla v20.4S, v18.4S, v31.s[0] +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v20.4s +mla v11.4S, v15.4S, v31.s[0] +add v1.4s, 
v1.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v15.4s, v12.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +sub v9.4s, v19.4s, v10.4s +mla v17.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v27.s[0] +mul v3.4S, v3.4S,v28.s[0] +sub v20.4s, v21.4s, v17.4s +mla v13.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v11.4s, v22.4s, v13.4s +mla v3.4S, v10.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v10.4s, v2.4s, v3.4s +mla v0.4S, v17.4S, v31.s[0] +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v17.4s, v14.4s, v0.4s +mla v8.4S, v13.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +sub v13.4s, v16.4s, v8.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v3.4s, v19.4s, v1.4s +mla v12.4S, v0.4S, v31.s[0] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v0.4s, v21.4s, v12.4s +mla v18.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v25.s[1] +mul v10.4S, v10.4S,v26.s[1] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v12.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v12.4s, v22.4s, v2.4s +mla v10.4S, v18.4S, v31.s[0] +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v13.4S, v25.s[3] +mul v13.4S, v13.4S,v26.s[3] +sub v18.4s, v11.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +str q22, [x0, #16] +sqrdmulh v22.4S, v21.4S, v23.s[0] +str q12, [x0, #80] +mul 
v21.4S, v21.4S,v24.s[0] +sub v12.4s, v14.4s, v16.4s +mla v13.4S, v2.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +str q11, [x0, #144] +sqrdmulh v11.4S, v0.4S, v23.s[1] +str q18, [x0, #208] +mul v0.4S, v0.4S,v24.s[1] +sub v18.4s, v17.4s, v13.4s +mla v21.4S, v22.4S, v31.s[0] +add v17.4s, v17.4s, v13.4s +str q14, [x0, #272] +sqrdmulh v14.4S, v20.4S, v23.s[2] +str q12, [x0, #336] +mul v20.4S, v20.4S,v24.s[2] +sub v12.4s, v19.4s, v21.4s +mla v0.4S, v11.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +str q17, [x0, #400] +sqrdmulh v17.4S, v1.4S, v23.s[3] +str q18, [x0, #464] +mul v1.4S, v1.4S,v24.s[3] +sub v18.4s, v3.4s, v0.4s +mla v20.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v0.4s +str q19, [x0, #528] +str q12, [x0, #592] +sub v12.4s, v9.4s, v20.4s +mla v1.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v20.4s +str q3, [x0, #656] +str q18, [x0, #720] +sub v18.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +str q9, [x0, #784] +str q12, [x0, #848] +str q8, [x0, #912] +str q18, [x0, #976] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q6, [x17, #+160] +ldr q7, [x17, #+176] +ldr q15, [x17, #+192] +ldr q10, [x17, #+208] +ldr q2, [x17, #+224] +ldr q16, [x17, #+240] +ldr q22, [x0, #32] +ldr q13, [x0, #48] +ldr q11, [x0, #0] +ldr q21, [x0, #16] +sqrdmulh v14.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v5.s[2] +mul v22.4S, v22.4S,v4.s[2] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +trn1 v22.4S, v11.4S, v13.4S +trn2 v0.4S, v11.4S, v13.4S +trn1 v19.4S, v14.4S, v21.4S +trn2 v17.4S, v14.4S, v21.4S +trn2 v14.2D, v22.2D, v19.2D +trn2 v21.2D, v0.2D, v17.2D +trn1 
v11.2D, v22.2D, v19.2D +trn1 v13.2D, v0.2D, v17.2D +sqrdmulh v17.4S, v14.4S, v7.4S +mul v14.4S, v14.4S,v6.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v10.4S +mul v13.4S, v13.4S,v15.4S +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v14.4S, v16.4S +mul v14.4S, v14.4S,v2.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +str q11, [x0, #0] +str q21, [x0, #16] +str q17, [x0, #32] +str q13, [x0, #48] +ldr q16, [x17, #+1152] +ldr q2, [x17, #+1168] +ldr q10, [x17, #+1184] +ldr q15, [x17, #+1200] +ldr q7, [x17, #+1216] +ldr q6, [x17, #+1232] +ldr q5, [x17, #+1248] +ldr q4, [x17, #+1264] +ldr q13, [x0, #544] +ldr q17, [x0, #560] +ldr q21, [x0, #512] +ldr q11, [x0, #528] +sqrdmulh v14.4S, v13.4S, v2.s[0] +mul v13.4S, v13.4S,v16.s[0] +mla v13.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v2.s[0] +mul v17.4S, v17.4S,v16.s[0] +mla v17.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v16.s[1] +mla v11.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v2.s[2] +mul v13.4S, v13.4S,v16.s[2] +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +trn1 v13.4S, v21.4S, v17.4S +trn2 v0.4S, v21.4S, v17.4S +trn1 v19.4S, v14.4S, v11.4S +trn2 v22.4S, v14.4S, v11.4S +trn2 v14.2D, v13.2D, v19.2D +trn2 v11.2D, v0.2D, v22.2D +trn1 v21.2D, v13.2D, v19.2D +trn1 v17.2D, v0.2D, v22.2D +sqrdmulh v22.4S, v14.4S, v15.4S +mul v14.4S, v14.4S,v10.4S +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v15.4S +mul v11.4S, 
v11.4S,v10.4S +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v17.4s, v11.4s +add v17.4s, v17.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v6.4S +mul v17.4S, v17.4S,v7.4S +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v4.4S +mul v14.4S, v14.4S,v5.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v14.4s +add v22.4s, v22.4s, v14.4s +str q21, [x0, #512] +str q11, [x0, #528] +str q22, [x0, #544] +str q17, [x0, #560] +ldr q4, [x17, #+256] +ldr q5, [x17, #+272] +ldr q6, [x17, #+288] +ldr q7, [x17, #+304] +ldr q15, [x17, #+320] +ldr q10, [x17, #+336] +ldr q2, [x17, #+352] +ldr q16, [x17, #+368] +ldr q17, [x0, #96] +ldr q22, [x0, #112] +ldr q11, [x0, #64] +ldr q21, [x0, #80] +sqrdmulh v14.4S, v17.4S, v5.s[0] +mul v17.4S, v17.4S,v4.s[0] +mla v17.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v5.s[2] +mul v17.4S, v17.4S,v4.s[2] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v17.4s +add v14.4s, v14.4s, v17.4s +trn1 v17.4S, v11.4S, v22.4S +trn2 v0.4S, v11.4S, v22.4S +trn1 v19.4S, v14.4S, v21.4S +trn2 v13.4S, v14.4S, v21.4S +trn2 v14.2D, v17.2D, v19.2D +trn2 v21.2D, v0.2D, v13.2D +trn1 v11.2D, v17.2D, v19.2D +trn1 v22.2D, v0.2D, v13.2D +sqrdmulh v13.4S, v14.4S, v7.4S +mul v14.4S, v14.4S,v6.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v21.4s +add v22.4s, v22.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v10.4S +mul v22.4S, v22.4S,v15.4S +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh 
v22.4S, v14.4S, v16.4S +mul v14.4S, v14.4S,v2.4S +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v13.4s, v14.4s +add v13.4s, v13.4s, v14.4s +str q11, [x0, #64] +str q21, [x0, #80] +str q13, [x0, #96] +str q22, [x0, #112] +ldr q16, [x17, #+1280] +ldr q2, [x17, #+1296] +ldr q10, [x17, #+1312] +ldr q15, [x17, #+1328] +ldr q7, [x17, #+1344] +ldr q6, [x17, #+1360] +ldr q5, [x17, #+1376] +ldr q4, [x17, #+1392] +ldr q22, [x0, #608] +ldr q13, [x0, #624] +ldr q21, [x0, #576] +ldr q11, [x0, #592] +sqrdmulh v14.4S, v22.4S, v2.s[0] +mul v22.4S, v22.4S,v16.s[0] +mla v22.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v2.s[0] +mul v13.4S, v13.4S,v16.s[0] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v16.s[1] +mla v11.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v2.s[2] +mul v22.4S, v22.4S,v16.s[2] +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +trn1 v22.4S, v21.4S, v13.4S +trn2 v0.4S, v21.4S, v13.4S +trn1 v19.4S, v14.4S, v11.4S +trn2 v17.4S, v14.4S, v11.4S +trn2 v14.2D, v22.2D, v19.2D +trn2 v11.2D, v0.2D, v17.2D +trn1 v21.2D, v22.2D, v19.2D +trn1 v13.2D, v0.2D, v17.2D +sqrdmulh v17.4S, v14.4S, v15.4S +mul v14.4S, v14.4S,v10.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v15.4S +mul v11.4S, v11.4S,v10.4S +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v6.4S +mul v13.4S, v13.4S,v7.4S +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v14.4S, v4.4S +mul v14.4S, v14.4S,v5.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +str q21, [x0, #576] +str q11, [x0, #592] +str q17, [x0, #608] +str q13, [x0, #624] +ldr q4, [x17, #+384] 
+ldr q5, [x17, #+400] +ldr q6, [x17, #+416] +ldr q7, [x17, #+432] +ldr q15, [x17, #+448] +ldr q10, [x17, #+464] +ldr q2, [x17, #+480] +ldr q16, [x17, #+496] +ldr q13, [x0, #160] +ldr q17, [x0, #176] +ldr q11, [x0, #128] +ldr q21, [x0, #144] +sqrdmulh v14.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v5.s[0] +mul v17.4S, v17.4S,v4.s[0] +mla v17.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v5.s[2] +mul v13.4S, v13.4S,v4.s[2] +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +trn1 v13.4S, v11.4S, v17.4S +trn2 v0.4S, v11.4S, v17.4S +trn1 v19.4S, v14.4S, v21.4S +trn2 v22.4S, v14.4S, v21.4S +trn2 v14.2D, v13.2D, v19.2D +trn2 v21.2D, v0.2D, v22.2D +trn1 v11.2D, v13.2D, v19.2D +trn1 v17.2D, v0.2D, v22.2D +sqrdmulh v22.4S, v14.4S, v7.4S +mul v14.4S, v14.4S,v6.4S +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v10.4S +mul v17.4S, v17.4S,v15.4S +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v16.4S +mul v14.4S, v14.4S,v2.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v14.4s +add v22.4s, v22.4s, v14.4s +str q11, [x0, #128] +str q21, [x0, #144] +str q22, [x0, #160] +str q17, [x0, #176] +ldr q16, [x17, #+1408] +ldr q2, [x17, #+1424] +ldr q10, [x17, #+1440] +ldr q15, [x17, #+1456] +ldr q7, [x17, #+1472] +ldr q6, [x17, #+1488] +ldr q5, [x17, #+1504] +ldr q4, [x17, #+1520] +ldr q17, [x0, #672] +ldr q22, [x0, #688] +ldr q21, [x0, #640] +ldr q11, [x0, #656] 
+sqrdmulh v14.4S, v17.4S, v2.s[0] +mul v17.4S, v17.4S,v16.s[0] +mla v17.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v2.s[0] +mul v22.4S, v22.4S,v16.s[0] +mla v22.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v16.s[1] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v2.s[2] +mul v17.4S, v17.4S,v16.s[2] +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v14.4s, v17.4s +add v14.4s, v14.4s, v17.4s +trn1 v17.4S, v21.4S, v22.4S +trn2 v0.4S, v21.4S, v22.4S +trn1 v19.4S, v14.4S, v11.4S +trn2 v13.4S, v14.4S, v11.4S +trn2 v14.2D, v17.2D, v19.2D +trn2 v11.2D, v0.2D, v13.2D +trn1 v21.2D, v17.2D, v19.2D +trn1 v22.2D, v0.2D, v13.2D +sqrdmulh v13.4S, v14.4S, v15.4S +mul v14.4S, v14.4S,v10.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v15.4S +mul v11.4S, v11.4S,v10.4S +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v6.4S +mul v22.4S, v22.4S,v7.4S +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v14.4S, v4.4S +mul v14.4S, v14.4S,v5.4S +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v13.4s, v14.4s +add v13.4s, v13.4s, v14.4s +str q21, [x0, #640] +str q11, [x0, #656] +str q13, [x0, #672] +str q22, [x0, #688] +ldr q4, [x17, #+512] +ldr q5, [x17, #+528] +ldr q6, [x17, #+544] +ldr q7, [x17, #+560] +ldr q15, [x17, #+576] +ldr q10, [x17, #+592] +ldr q2, [x17, #+608] +ldr q16, [x17, #+624] +ldr q22, [x0, #224] +ldr q13, [x0, #240] +ldr q11, [x0, #192] +ldr q21, [x0, #208] +sqrdmulh v14.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v22.4S, v31.s[0] +sub 
v22.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v5.s[2] +mul v22.4S, v22.4S,v4.s[2] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +trn1 v22.4S, v11.4S, v13.4S +trn2 v0.4S, v11.4S, v13.4S +trn1 v19.4S, v14.4S, v21.4S +trn2 v17.4S, v14.4S, v21.4S +trn2 v14.2D, v22.2D, v19.2D +trn2 v21.2D, v0.2D, v17.2D +trn1 v11.2D, v22.2D, v19.2D +trn1 v13.2D, v0.2D, v17.2D +sqrdmulh v17.4S, v14.4S, v7.4S +mul v14.4S, v14.4S,v6.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v10.4S +mul v13.4S, v13.4S,v15.4S +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v14.4S, v16.4S +mul v14.4S, v14.4S,v2.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +str q11, [x0, #192] +str q21, [x0, #208] +str q17, [x0, #224] +str q13, [x0, #240] +ldr q16, [x17, #+1536] +ldr q2, [x17, #+1552] +ldr q10, [x17, #+1568] +ldr q15, [x17, #+1584] +ldr q7, [x17, #+1600] +ldr q6, [x17, #+1616] +ldr q5, [x17, #+1632] +ldr q4, [x17, #+1648] +ldr q13, [x0, #736] +ldr q17, [x0, #752] +ldr q21, [x0, #704] +ldr q11, [x0, #720] +sqrdmulh v14.4S, v13.4S, v2.s[0] +mul v13.4S, v13.4S,v16.s[0] +mla v13.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v2.s[0] +mul v17.4S, v17.4S,v16.s[0] +mla v17.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v16.s[1] +mla v11.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v2.s[2] +mul 
v13.4S, v13.4S,v16.s[2] +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +trn1 v13.4S, v21.4S, v17.4S +trn2 v0.4S, v21.4S, v17.4S +trn1 v19.4S, v14.4S, v11.4S +trn2 v22.4S, v14.4S, v11.4S +trn2 v14.2D, v13.2D, v19.2D +trn2 v11.2D, v0.2D, v22.2D +trn1 v21.2D, v13.2D, v19.2D +trn1 v17.2D, v0.2D, v22.2D +sqrdmulh v22.4S, v14.4S, v15.4S +mul v14.4S, v14.4S,v10.4S +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v15.4S +mul v11.4S, v11.4S,v10.4S +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v17.4s, v11.4s +add v17.4s, v17.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v6.4S +mul v17.4S, v17.4S,v7.4S +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v4.4S +mul v14.4S, v14.4S,v5.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v14.4s +add v22.4s, v22.4s, v14.4s +str q21, [x0, #704] +str q11, [x0, #720] +str q22, [x0, #736] +str q17, [x0, #752] +ldr q4, [x17, #+640] +ldr q5, [x17, #+656] +ldr q6, [x17, #+672] +ldr q7, [x17, #+688] +ldr q15, [x17, #+704] +ldr q10, [x17, #+720] +ldr q2, [x17, #+736] +ldr q16, [x17, #+752] +ldr q17, [x0, #288] +ldr q22, [x0, #304] +ldr q11, [x0, #256] +ldr q21, [x0, #272] +sqrdmulh v14.4S, v17.4S, v5.s[0] +mul v17.4S, v17.4S,v4.s[0] +mla v17.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v5.s[2] +mul v17.4S, v17.4S,v4.s[2] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v17.4s +add v14.4s, v14.4s, v17.4s +trn1 v17.4S, v11.4S, v22.4S +trn2 v0.4S, v11.4S, v22.4S +trn1 v19.4S, v14.4S, v21.4S +trn2 v13.4S, v14.4S, v21.4S +trn2 v14.2D, v17.2D, 
v19.2D +trn2 v21.2D, v0.2D, v13.2D +trn1 v11.2D, v17.2D, v19.2D +trn1 v22.2D, v0.2D, v13.2D +sqrdmulh v13.4S, v14.4S, v7.4S +mul v14.4S, v14.4S,v6.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v21.4s +add v22.4s, v22.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v10.4S +mul v22.4S, v22.4S,v15.4S +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v14.4S, v16.4S +mul v14.4S, v14.4S,v2.4S +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v13.4s, v14.4s +add v13.4s, v13.4s, v14.4s +str q11, [x0, #256] +str q21, [x0, #272] +str q13, [x0, #288] +str q22, [x0, #304] +ldr q16, [x17, #+1664] +ldr q2, [x17, #+1680] +ldr q10, [x17, #+1696] +ldr q15, [x17, #+1712] +ldr q7, [x17, #+1728] +ldr q6, [x17, #+1744] +ldr q5, [x17, #+1760] +ldr q4, [x17, #+1776] +ldr q22, [x0, #800] +ldr q13, [x0, #816] +ldr q21, [x0, #768] +ldr q11, [x0, #784] +sqrdmulh v14.4S, v22.4S, v2.s[0] +mul v22.4S, v22.4S,v16.s[0] +mla v22.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v2.s[0] +mul v13.4S, v13.4S,v16.s[0] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v16.s[1] +mla v11.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v2.s[2] +mul v22.4S, v22.4S,v16.s[2] +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +trn1 v22.4S, v21.4S, v13.4S +trn2 v0.4S, v21.4S, v13.4S +trn1 v19.4S, v14.4S, v11.4S +trn2 v17.4S, v14.4S, v11.4S +trn2 v14.2D, v22.2D, v19.2D +trn2 v11.2D, v0.2D, v17.2D +trn1 v21.2D, v22.2D, v19.2D +trn1 v13.2D, v0.2D, v17.2D +sqrdmulh v17.4S, v14.4S, v15.4S +mul v14.4S, v14.4S,v10.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s 
+sqrdmulh v14.4S, v11.4S, v15.4S +mul v11.4S, v11.4S,v10.4S +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v6.4S +mul v13.4S, v13.4S,v7.4S +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v14.4S, v4.4S +mul v14.4S, v14.4S,v5.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +str q21, [x0, #768] +str q11, [x0, #784] +str q17, [x0, #800] +str q13, [x0, #816] +ldr q4, [x17, #+768] +ldr q5, [x17, #+784] +ldr q6, [x17, #+800] +ldr q7, [x17, #+816] +ldr q15, [x17, #+832] +ldr q10, [x17, #+848] +ldr q2, [x17, #+864] +ldr q16, [x17, #+880] +ldr q13, [x0, #352] +ldr q17, [x0, #368] +ldr q11, [x0, #320] +ldr q21, [x0, #336] +sqrdmulh v14.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v5.s[0] +mul v17.4S, v17.4S,v4.s[0] +mla v17.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v5.s[2] +mul v13.4S, v13.4S,v4.s[2] +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +trn1 v13.4S, v11.4S, v17.4S +trn2 v0.4S, v11.4S, v17.4S +trn1 v19.4S, v14.4S, v21.4S +trn2 v22.4S, v14.4S, v21.4S +trn2 v14.2D, v13.2D, v19.2D +trn2 v21.2D, v0.2D, v22.2D +trn1 v11.2D, v13.2D, v19.2D +trn1 v17.2D, v0.2D, v22.2D +sqrdmulh v22.4S, v14.4S, v7.4S +mul v14.4S, v14.4S,v6.4S +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v10.4S +mul v17.4S, v17.4S,v15.4S +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, 
v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v16.4S +mul v14.4S, v14.4S,v2.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v14.4s +add v22.4s, v22.4s, v14.4s +str q11, [x0, #320] +str q21, [x0, #336] +str q22, [x0, #352] +str q17, [x0, #368] +ldr q16, [x17, #+1792] +ldr q2, [x17, #+1808] +ldr q10, [x17, #+1824] +ldr q15, [x17, #+1840] +ldr q7, [x17, #+1856] +ldr q6, [x17, #+1872] +ldr q5, [x17, #+1888] +ldr q4, [x17, #+1904] +ldr q17, [x0, #864] +ldr q22, [x0, #880] +ldr q21, [x0, #832] +ldr q11, [x0, #848] +sqrdmulh v14.4S, v17.4S, v2.s[0] +mul v17.4S, v17.4S,v16.s[0] +mla v17.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v2.s[0] +mul v22.4S, v22.4S,v16.s[0] +mla v22.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v16.s[1] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v2.s[2] +mul v17.4S, v17.4S,v16.s[2] +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v14.4s, v17.4s +add v14.4s, v14.4s, v17.4s +trn1 v17.4S, v21.4S, v22.4S +trn2 v0.4S, v21.4S, v22.4S +trn1 v19.4S, v14.4S, v11.4S +trn2 v13.4S, v14.4S, v11.4S +trn2 v14.2D, v17.2D, v19.2D +trn2 v11.2D, v0.2D, v13.2D +trn1 v21.2D, v17.2D, v19.2D +trn1 v22.2D, v0.2D, v13.2D +sqrdmulh v13.4S, v14.4S, v15.4S +mul v14.4S, v14.4S,v10.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v15.4S +mul v11.4S, v11.4S,v10.4S +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v6.4S +mul v22.4S, v22.4S,v7.4S +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v14.4S, v4.4S +mul v14.4S, v14.4S,v5.4S +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v13.4s, v14.4s +add v13.4s, v13.4s, v14.4s +str q21, [x0, #832] +str q11, [x0, #848] +str q13, 
[x0, #864] +str q22, [x0, #880] +ldr q4, [x17, #+896] +ldr q5, [x17, #+912] +ldr q6, [x17, #+928] +ldr q7, [x17, #+944] +ldr q15, [x17, #+960] +ldr q10, [x17, #+976] +ldr q2, [x17, #+992] +ldr q16, [x17, #+1008] +ldr q22, [x0, #416] +ldr q13, [x0, #432] +ldr q11, [x0, #384] +ldr q21, [x0, #400] +sqrdmulh v14.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v5.s[2] +mul v22.4S, v22.4S,v4.s[2] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +trn1 v22.4S, v11.4S, v13.4S +trn2 v0.4S, v11.4S, v13.4S +trn1 v19.4S, v14.4S, v21.4S +trn2 v17.4S, v14.4S, v21.4S +trn2 v14.2D, v22.2D, v19.2D +trn2 v21.2D, v0.2D, v17.2D +trn1 v11.2D, v22.2D, v19.2D +trn1 v13.2D, v0.2D, v17.2D +sqrdmulh v17.4S, v14.4S, v7.4S +mul v14.4S, v14.4S,v6.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v10.4S +mul v13.4S, v13.4S,v15.4S +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v14.4S, v16.4S +mul v14.4S, v14.4S,v2.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +str q11, [x0, #384] +str q21, [x0, #400] +str q17, [x0, #416] +str q13, [x0, #432] +ldr q16, [x17, #+1920] +ldr q2, [x17, #+1936] +ldr q10, [x17, #+1952] +ldr q15, [x17, #+1968] +ldr q7, [x17, #+1984] +ldr q6, [x17, #+2000] +ldr q5, [x17, #+2016] +ldr q4, [x17, #+2032] +ldr q13, [x0, #928] +ldr q17, 
[x0, #944] +ldr q21, [x0, #896] +ldr q11, [x0, #912] +sqrdmulh v14.4S, v13.4S, v2.s[0] +mul v13.4S, v13.4S,v16.s[0] +mla v13.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v2.s[0] +mul v17.4S, v17.4S,v16.s[0] +mla v17.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v16.s[1] +mla v11.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v2.s[2] +mul v13.4S, v13.4S,v16.s[2] +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +trn1 v13.4S, v21.4S, v17.4S +trn2 v0.4S, v21.4S, v17.4S +trn1 v19.4S, v14.4S, v11.4S +trn2 v22.4S, v14.4S, v11.4S +trn2 v14.2D, v13.2D, v19.2D +trn2 v11.2D, v0.2D, v22.2D +trn1 v21.2D, v13.2D, v19.2D +trn1 v17.2D, v0.2D, v22.2D +sqrdmulh v22.4S, v14.4S, v15.4S +mul v14.4S, v14.4S,v10.4S +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v15.4S +mul v11.4S, v11.4S,v10.4S +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v17.4s, v11.4s +add v17.4s, v17.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v6.4S +mul v17.4S, v17.4S,v7.4S +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v4.4S +mul v14.4S, v14.4S,v5.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v14.4s +add v22.4s, v22.4s, v14.4s +str q21, [x0, #896] +str q11, [x0, #912] +str q22, [x0, #928] +str q17, [x0, #944] +ldr q4, [x17, #+1024] +ldr q5, [x17, #+1040] +ldr q6, [x17, #+1056] +ldr q7, [x17, #+1072] +ldr q15, [x17, #+1088] +ldr q10, [x17, #+1104] +ldr q2, [x17, #+1120] +ldr q16, [x17, #+1136] +ldr q17, [x0, #480] +ldr q22, [x0, #496] +ldr q11, [x0, #448] +ldr q21, [x0, #464] +sqrdmulh v14.4S, v17.4S, v5.s[0] +mul v17.4S, v17.4S,v4.s[0] +mla v17.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v5.s[0] +mul 
v22.4S, v22.4S,v4.s[0] +mla v22.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v5.s[2] +mul v17.4S, v17.4S,v4.s[2] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v17.4s +add v14.4s, v14.4s, v17.4s +trn1 v17.4S, v11.4S, v22.4S +trn2 v0.4S, v11.4S, v22.4S +trn1 v19.4S, v14.4S, v21.4S +trn2 v13.4S, v14.4S, v21.4S +trn2 v14.2D, v17.2D, v19.2D +trn2 v21.2D, v0.2D, v13.2D +trn1 v11.2D, v17.2D, v19.2D +trn1 v22.2D, v0.2D, v13.2D +sqrdmulh v13.4S, v14.4S, v7.4S +mul v14.4S, v14.4S,v6.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v21.4s +add v22.4s, v22.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v10.4S +mul v22.4S, v22.4S,v15.4S +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v14.4S, v16.4S +mul v14.4S, v14.4S,v2.4S +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v13.4s, v14.4s +add v13.4s, v13.4s, v14.4s +str q11, [x0, #448] +str q21, [x0, #464] +str q13, [x0, #480] +str q22, [x0, #496] +ldr q16, [x17, #+2048] +ldr q2, [x17, #+2064] +ldr q10, [x17, #+2080] +ldr q15, [x17, #+2096] +ldr q7, [x17, #+2112] +ldr q6, [x17, #+2128] +ldr q5, [x17, #+2144] +ldr q4, [x17, #+2160] +ldr q22, [x0, #992] +ldr q13, [x0, #1008] +ldr q21, [x0, #960] +ldr q11, [x0, #976] +sqrdmulh v14.4S, v22.4S, v2.s[0] +mul v22.4S, v22.4S,v16.s[0] +mla v22.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v2.s[0] +mul v13.4S, v13.4S,v16.s[0] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v16.s[1] +mla v11.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v11.4s +add 
v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v2.s[2] +mul v22.4S, v22.4S,v16.s[2] +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +trn1 v22.4S, v21.4S, v13.4S +trn2 v0.4S, v21.4S, v13.4S +trn1 v19.4S, v14.4S, v11.4S +trn2 v17.4S, v14.4S, v11.4S +trn2 v14.2D, v22.2D, v19.2D +trn2 v11.2D, v0.2D, v17.2D +trn1 v21.2D, v22.2D, v19.2D +trn1 v13.2D, v0.2D, v17.2D +sqrdmulh v17.4S, v14.4S, v15.4S +mul v14.4S, v14.4S,v10.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v15.4S +mul v11.4S, v11.4S,v10.4S +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v6.4S +mul v13.4S, v13.4S,v7.4S +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v14.4S, v4.4S +mul v14.4S, v14.4S,v5.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +str q21, [x0, #960] +str q11, [x0, #976] +str q17, [x0, #992] +str q13, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z2_1.s b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z2_1.s new file mode 100644 index 0000000..e51be66 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z2_1.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and 
associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: // 16-byte constant block (four 32-bit words); only word 0 is non-zero +.word -33556993 // negation of 33556993, the first number in this file's name; NOTE(review): presumably the NTT modulus q stored as -q for the mla-based reduction step - confirm at the site that loads this label +.word 0 // zero padding +.word 0 // zero padding +.word 0 // zero padding +.align 6 // GNU as on AArch64 takes a power-of-two exponent here: 2^6 = 64-byte alignment for the table that follows +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word
1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 
6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 
+.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, 
block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 
31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 
26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 
+.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // 
Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 
28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 
608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_3_z2_1 +.global _ntt_u32_full_neon_asm_var_4_4_3_z2_1 +ntt_u32_full_neon_asm_var_4_4_3_z2_1: +_ntt_u32_full_neon_asm_var_4_4_3_z2_1: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +sqrdmulh v2.4S, v22.4S, v29.s[0] +ldr q1, [x0, #544] +mul v22.4S, v22.4S,v30.s[0] +ldr q0, [x0, #608] +sqrdmulh v15.4S, v21.4S, v29.s[0] +ldr q14, [x0, #672] +mul v21.4S, v21.4S,v30.s[0] +ldr q13, [x0, #736] +mla v22.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q12, [x0, #32] +sub v11.4s, v18.4s, v22.4s +mla v21.4S, v15.4S, v31.s[0] +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q15, [x0, #96] +sub v10.4s, v17.4s, v21.4s +mla v20.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v1.4S, 
v29.s[0] +ldr q2, [x0, #160] +mul v1.4S, v1.4S,v30.s[0] +sub v9.4s, v16.4s, v20.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v29.s[0] +ldr q22, [x0, #224] +mul v0.4S, v0.4S,v30.s[0] +sub v8.4s, v3.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v12.4s, v1.4s +mla v0.4S, v20.4S, v31.s[0] +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +sub v20.4s, v15.4s, v0.4s +mla v14.4S, v19.4S, v31.s[0] +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v19.4s, v2.4s, v14.4s +mla v13.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v1.4s, v22.4s, v13.4s +mla v16.4S, v0.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v0.4s, v2.4s, v16.4s +mla v3.4S, v14.4S, v31.s[0] +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v22.4s, v3.4s +mla v18.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v13.4s, v12.4s, v18.4s +mla v17.4S, v16.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v16.4s, v15.4s, v17.4s +mla v9.4S, v3.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v3.4s, v19.4s, v9.4s +mla v8.4S, v18.4S, v31.s[0] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v8.4s +mla v11.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v17.4s, v21.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +sub v9.4s, v20.4s, v10.4s 
+mla v2.4S, v8.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v8.4s, v12.4s, v2.4s +mla v22.4S, v11.4S, v31.s[0] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v27.s[1] +mul v14.4S, v14.4S,v28.s[1] +sub v11.4s, v15.4s, v22.4s +mla v0.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v28.s[2] +sub v10.4s, v13.4s, v0.4s +mla v14.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v2.4s, v16.4s, v14.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +sub v22.4s, v21.4s, v19.4s +mla v1.4S, v0.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v0.4s, v20.4s, v1.4s +mla v3.4S, v14.4S, v31.s[0] +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +sub v14.4s, v17.4s, v3.4s +mla v18.4S, v19.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v25.s[1] +mul v11.4S, v11.4S,v26.s[1] +sub v19.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v1.4s, v12.4s, v15.4s +mla v11.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v25.s[3] +mul v2.4S, v2.4S,v26.s[3] +sub v3.4s, v8.4s, v11.4s +mla v16.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v11.4s +str q12, [x0, #32] +sqrdmulh v12.4S, v20.4S, v23.s[0] +str q1, [x0, #96] +mul v20.4S, v20.4S,v24.s[0] +ldr q1, [x0, #816] +sub v11.4s, v13.4s, v16.4s +ldr q18, [x0, #880] +mla v2.4S, v15.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +str q8, [x0, #160] +sqrdmulh v8.4S, v0.4S, v23.s[1] +str q3, [x0, #224] +mul v0.4S, v0.4S,v24.s[1] +ldr q3, [x0, #944] +sub v16.4s, v10.4s, v2.4s +ldr q15, [x0, #1008] +mla v20.4S, v12.4S, v31.s[0] +add v10.4s, v10.4s, v2.4s +str q13, [x0, #288] +sqrdmulh 
v13.4S, v9.4S, v23.s[2] +str q11, [x0, #352] +mul v9.4S, v9.4S,v24.s[2] +ldr q11, [x0, #304] +sub v2.4s, v21.4s, v20.4s +ldr q12, [x0, #368] +mla v0.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v20.4s +str q10, [x0, #416] +sqrdmulh v10.4S, v19.4S, v23.s[3] +str q16, [x0, #480] +mul v19.4S, v19.4S,v24.s[3] +ldr q16, [x0, #432] +sub v20.4s, v22.4s, v0.4s +ldr q8, [x0, #496] +mla v9.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +str q21, [x0, #544] +sqrdmulh v21.4S, v1.4S, v29.s[0] +str q2, [x0, #608] +ldr q2, [x0, #560] +mul v1.4S, v1.4S,v30.s[0] +ldr q0, [x0, #624] +sub v13.4s, v17.4s, v9.4s +mla v19.4S, v10.4S, v31.s[0] +add v17.4s, v17.4s, v9.4s +str q22, [x0, #672] +sqrdmulh v22.4S, v18.4S, v29.s[0] +str q20, [x0, #736] +ldr q20, [x0, #688] +mul v18.4S, v18.4S,v30.s[0] +ldr q9, [x0, #752] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +str q17, [x0, #800] +sqrdmulh v17.4S, v3.4S, v29.s[0] +str q13, [x0, #864] +mul v3.4S, v3.4S,v30.s[0] +ldr q13, [x0, #48] +sub v19.4s, v11.4s, v1.4s +mla v18.4S, v22.4S, v31.s[0] +add v11.4s, v11.4s, v1.4s +str q14, [x0, #928] +sqrdmulh v14.4S, v15.4S, v29.s[0] +str q10, [x0, #992] +mul v15.4S, v15.4S,v30.s[0] +ldr q10, [x0, #112] +sub v1.4s, v12.4s, v18.4s +mla v3.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v29.s[0] +ldr q17, [x0, #176] +mul v2.4S, v2.4S,v30.s[0] +sub v22.4s, v16.4s, v3.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v29.s[0] +ldr q14, [x0, #240] +mul v0.4S, v0.4S,v30.s[0] +sub v21.4s, v8.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v18.4s, v13.4s, v2.4s +mla v0.4S, v3.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +sub v3.4s, v10.4s, v0.4s +mla v20.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v15.4s, 
v17.4s, v20.4s +mla v9.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v2.4s, v14.4s, v9.4s +mla v16.4S, v0.4S, v31.s[0] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v29.s[1] +mul v11.4S, v11.4S,v30.s[1] +sub v0.4s, v17.4s, v16.4s +mla v8.4S, v20.4S, v31.s[0] +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v29.s[1] +mul v12.4S, v12.4S,v30.s[1] +sub v20.4s, v14.4s, v8.4s +mla v11.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v9.4s, v13.4s, v11.4s +mla v12.4S, v16.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v16.4s, v10.4s, v12.4s +mla v22.4S, v8.4S, v31.s[0] +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v8.4s, v15.4s, v22.4s +mla v21.4S, v11.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +sub v11.4s, v2.4s, v21.4s +mla v19.4S, v12.4S, v31.s[0] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +sub v12.4s, v18.4s, v19.4s +mla v1.4S, v22.4S, v31.s[0] +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v22.4s, v3.4s, v1.4s +mla v17.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v21.4s, v13.4s, v17.4s +mla v14.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +sub v19.4s, v10.4s, v14.4s +mla v0.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v27.s[2] +mul v15.4S, v15.4S,v28.s[2] +sub v1.4s, v9.4s, v0.4s +mla v20.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v17.4s, v16.4s, v20.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, 
v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v14.4s, v18.4s, v15.4s +mla v2.4S, v0.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[3] +mul v11.4S, v11.4S,v28.s[3] +sub v0.4s, v3.4s, v2.4s +mla v8.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +sub v20.4s, v12.4s, v8.4s +mla v11.4S, v15.4S, v31.s[0] +add v12.4s, v12.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v15.4s, v22.4s, v11.4s +mla v10.4S, v2.4S, v31.s[0] +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v2.4s, v13.4s, v10.4s +mla v19.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v25.s[3] +mul v17.4S, v17.4S,v26.s[3] +sub v8.4s, v21.4s, v19.4s +mla v16.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +str q13, [x0, #48] +sqrdmulh v13.4S, v3.4S, v23.s[0] +str q2, [x0, #112] +mul v3.4S, v3.4S,v24.s[0] +ldr q2, [x0, #768] +sub v19.4s, v9.4s, v16.4s +ldr q11, [x0, #832] +mla v17.4S, v10.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +str q21, [x0, #176] +sqrdmulh v21.4S, v0.4S, v23.s[1] +str q8, [x0, #240] +mul v0.4S, v0.4S,v24.s[1] +ldr q8, [x0, #896] +sub v16.4s, v1.4s, v17.4s +ldr q10, [x0, #960] +mla v3.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +str q9, [x0, #304] +sqrdmulh v9.4S, v22.4S, v23.s[2] +str q19, [x0, #368] +mul v22.4S, v22.4S,v24.s[2] +ldr q19, [x0, #256] +sub v17.4s, v18.4s, v3.4s +ldr q13, [x0, #320] +mla v0.4S, v21.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +str q1, [x0, #432] +sqrdmulh v1.4S, v15.4S, v23.s[3] +str q16, [x0, #496] +mul v15.4S, v15.4S,v24.s[3] +ldr q16, [x0, #384] +sub v3.4s, v14.4s, v0.4s +ldr q21, [x0, #448] +mla v22.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +str q18, [x0, #560] +sqrdmulh v18.4S, v2.4S, v29.s[0] +str q17, [x0, #624] +ldr q17, [x0, #512] +mul v2.4S, v2.4S,v30.s[0] +ldr q0, [x0, #576] +sub v9.4s, v12.4s, v22.4s +mla v15.4S, v1.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s 
+str q14, [x0, #688] +sqrdmulh v14.4S, v11.4S, v29.s[0] +str q3, [x0, #752] +ldr q3, [x0, #640] +mul v11.4S, v11.4S,v30.s[0] +ldr q22, [x0, #704] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +str q12, [x0, #816] +sqrdmulh v12.4S, v8.4S, v29.s[0] +str q9, [x0, #880] +mul v8.4S, v8.4S,v30.s[0] +ldr q9, [x0, #0] +sub v15.4s, v19.4s, v2.4s +mla v11.4S, v14.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +str q20, [x0, #944] +sqrdmulh v20.4S, v10.4S, v29.s[0] +str q1, [x0, #1008] +mul v10.4S, v10.4S,v30.s[0] +ldr q1, [x0, #64] +sub v2.4s, v13.4s, v11.4s +mla v8.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v29.s[0] +ldr q12, [x0, #128] +mul v17.4S, v17.4S,v30.s[0] +sub v14.4s, v16.4s, v8.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v29.s[0] +ldr q20, [x0, #192] +mul v0.4S, v0.4S,v30.s[0] +sub v18.4s, v21.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +sub v11.4s, v9.4s, v17.4s +mla v0.4S, v8.4S, v31.s[0] +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v8.4s, v1.4s, v0.4s +mla v3.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v10.4s, v12.4s, v3.4s +mla v22.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v17.4s, v20.4s, v22.4s +mla v16.4S, v0.4S, v31.s[0] +add v20.4s, v20.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +sub v0.4s, v12.4s, v16.4s +mla v21.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v3.4s, v20.4s, v21.4s +mla v19.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v22.4s, v9.4s, v19.4s +mla v13.4S, v16.4S, v31.s[0] +add v9.4s, v9.4s, 
v19.4s +sqrdmulh v19.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v16.4s, v1.4s, v13.4s +mla v14.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v21.4s, v10.4s, v14.4s +mla v18.4S, v19.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v19.4s, v17.4s, v18.4s +mla v15.4S, v13.4S, v31.s[0] +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v13.4s, v11.4s, v15.4s +mla v2.4S, v14.4S, v31.s[0] +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v27.s[0] +mul v20.4S, v20.4S,v28.s[0] +sub v14.4s, v8.4s, v2.4s +mla v12.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v18.4s, v9.4s, v12.4s +mla v20.4S, v15.4S, v31.s[0] +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +sub v15.4s, v1.4s, v20.4s +mla v0.4S, v2.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +sub v2.4s, v22.4s, v0.4s +mla v3.4S, v12.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v12.4s, v16.4s, v3.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v11.4s, v10.4s +mla v17.4S, v0.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v27.s[3] +mul v19.4S, v19.4S,v28.s[3] +sub v0.4s, v8.4s, v17.4s +mla v21.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v17.4s +sqrdmulh v17.4S, v1.4S, v25.s[0] +mul v1.4S, v1.4S,v26.s[0] +sub v3.4s, v13.4s, v21.4s +mla v19.4S, v10.4S, v31.s[0] +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v25.s[1] +mul v15.4S, v15.4S,v26.s[1] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v17.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v17.4s, v9.4s, 
v1.4s +mla v15.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +sub v21.4s, v18.4s, v15.4s +mla v16.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +str q9, [x0, #0] +sqrdmulh v9.4S, v8.4S, v23.s[0] +str q17, [x0, #64] +mul v8.4S, v8.4S,v24.s[0] +ldr q17, [x0, #784] +sub v15.4s, v22.4s, v16.4s +ldr q19, [x0, #848] +mla v12.4S, v1.4S, v31.s[0] +add v22.4s, v22.4s, v16.4s +str q18, [x0, #128] +sqrdmulh v18.4S, v0.4S, v23.s[1] +str q21, [x0, #192] +mul v0.4S, v0.4S,v24.s[1] +ldr q21, [x0, #912] +sub v16.4s, v2.4s, v12.4s +ldr q1, [x0, #976] +mla v8.4S, v9.4S, v31.s[0] +add v2.4s, v2.4s, v12.4s +str q22, [x0, #256] +sqrdmulh v22.4S, v14.4S, v23.s[2] +str q15, [x0, #320] +mul v14.4S, v14.4S,v24.s[2] +ldr q15, [x0, #272] +sub v12.4s, v11.4s, v8.4s +ldr q9, [x0, #336] +mla v0.4S, v18.4S, v31.s[0] +add v11.4s, v11.4s, v8.4s +str q2, [x0, #384] +sqrdmulh v2.4S, v10.4S, v23.s[3] +str q16, [x0, #448] +mul v10.4S, v10.4S,v24.s[3] +ldr q16, [x0, #400] +sub v8.4s, v20.4s, v0.4s +ldr q18, [x0, #464] +mla v14.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v0.4s +str q11, [x0, #512] +sqrdmulh v11.4S, v17.4S, v29.s[0] +str q12, [x0, #576] +ldr q12, [x0, #528] +mul v17.4S, v17.4S,v30.s[0] +ldr q0, [x0, #592] +sub v22.4s, v13.4s, v14.4s +mla v10.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v14.4s +str q20, [x0, #640] +sqrdmulh v20.4S, v19.4S, v29.s[0] +str q8, [x0, #704] +ldr q8, [x0, #656] +mul v19.4S, v19.4S,v30.s[0] +ldr q14, [x0, #720] +sub v2.4s, v3.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v3.4s, v3.4s, v10.4s +str q13, [x0, #768] +sqrdmulh v13.4S, v21.4S, v29.s[0] +str q22, [x0, #832] +mul v21.4S, v21.4S,v30.s[0] +ldr q22, [x0, #16] +sub v10.4s, v15.4s, v17.4s +mla v19.4S, v20.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +str q3, [x0, #896] +sqrdmulh v3.4S, v1.4S, v29.s[0] +str q2, [x0, #960] +mul v1.4S, v1.4S,v30.s[0] +ldr q2, [x0, #80] +sub v17.4s, v9.4s, v19.4s +mla v21.4S, v13.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s 
+sqrdmulh v19.4S, v12.4S, v29.s[0] +ldr q13, [x0, #144] +mul v12.4S, v12.4S,v30.s[0] +sub v20.4s, v16.4s, v21.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v29.s[0] +ldr q3, [x0, #208] +mul v0.4S, v0.4S,v30.s[0] +sub v11.4s, v18.4s, v1.4s +mla v12.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v19.4s, v22.4s, v12.4s +mla v0.4S, v21.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v2.4s, v0.4s +mla v8.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v1.4s, v13.4s, v8.4s +mla v14.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v12.4s, v3.4s, v14.4s +mla v16.4S, v0.4S, v31.s[0] +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +sub v0.4s, v13.4s, v16.4s +mla v18.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v9.4S, v29.s[1] +mul v9.4S, v9.4S,v30.s[1] +sub v8.4s, v3.4s, v18.4s +mla v15.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v14.4s, v22.4s, v15.4s +mla v9.4S, v16.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v16.4s, v2.4s, v9.4s +mla v20.4S, v18.4S, v31.s[0] +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v20.4s +mla v11.4S, v15.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v15.4s, v12.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +sub v9.4s, v19.4s, v10.4s +mla v17.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v27.s[0] +mul v3.4S, 
v3.4S,v28.s[0] +sub v20.4s, v21.4s, v17.4s +mla v13.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v11.4s, v22.4s, v13.4s +mla v3.4S, v10.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v10.4s, v2.4s, v3.4s +mla v0.4S, v17.4S, v31.s[0] +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v17.4s, v14.4s, v0.4s +mla v8.4S, v13.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +sub v13.4s, v16.4s, v8.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v3.4s, v19.4s, v1.4s +mla v12.4S, v0.4S, v31.s[0] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v0.4s, v21.4s, v12.4s +mla v18.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v25.s[1] +mul v10.4S, v10.4S,v26.s[1] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v12.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v12.4s, v22.4s, v2.4s +mla v10.4S, v18.4S, v31.s[0] +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v13.4S, v25.s[3] +mul v13.4S, v13.4S,v26.s[3] +sub v18.4s, v11.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +str q22, [x0, #16] +sqrdmulh v22.4S, v21.4S, v23.s[0] +str q12, [x0, #80] +mul v21.4S, v21.4S,v24.s[0] +sub v12.4s, v14.4s, v16.4s +mla v13.4S, v2.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +str q11, [x0, #144] +sqrdmulh v11.4S, v0.4S, v23.s[1] +str q18, [x0, #208] +mul v0.4S, v0.4S,v24.s[1] +sub v18.4s, v17.4s, v13.4s +mla v21.4S, v22.4S, v31.s[0] +add v17.4s, v17.4s, v13.4s +str q14, [x0, #272] +sqrdmulh v14.4S, v20.4S, v23.s[2] +str q12, 
[x0, #336] +mul v20.4S, v20.4S,v24.s[2] +sub v12.4s, v19.4s, v21.4s +mla v0.4S, v11.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +str q17, [x0, #400] +sqrdmulh v17.4S, v1.4S, v23.s[3] +str q18, [x0, #464] +mul v1.4S, v1.4S,v24.s[3] +sub v18.4s, v3.4s, v0.4s +mla v20.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v0.4s +str q19, [x0, #528] +str q12, [x0, #592] +sub v12.4s, v9.4s, v20.4s +mla v1.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v20.4s +str q3, [x0, #656] +str q18, [x0, #720] +sub v18.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +str q9, [x0, #784] +str q12, [x0, #848] +str q8, [x0, #912] +str q18, [x0, #976] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q6, [x17, #+160] +ldr q7, [x17, #+176] +ldr q15, [x17, #+192] +ldr q10, [x17, #+208] +ldr q2, [x17, #+224] +ldr q16, [x17, #+240] +ldr q22, [x0, #32] +ldr q13, [x0, #48] +ldr q11, [x0, #0] +ldr q21, [x0, #16] +ldr q14, [x17, #+1152] +ldr q0, [x17, #+1168] +ldr q19, [x17, #+1184] +ldr q17, [x17, #+1200] +ldr q20, [x17, #+1216] +ldr q3, [x17, #+1232] +ldr q1, [x17, #+1248] +ldr q9, [x17, #+1264] +ldr q12, [x0, #544] +ldr q8, [x0, #560] +ldr q18, [x0, #512] +ldr q30, [x0, #528] +sqrdmulh v29.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v29.4S, v31.s[0] +sub v29.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v12.4S, v0.s[0] +mul v12.4S, v12.4S,v14.s[0] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v12.4s +add v18.4s, v18.4s, v12.4s +sqrdmulh v12.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v0.s[0] +mul v8.4S, v8.4S,v14.s[0] +mla v8.4S, v13.4S, v31.s[0] +sub v13.4s, v30.4s, v8.4s +add v30.4s, v30.4s, v8.4s +sqrdmulh v8.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v30.4S, v0.s[1] +mul v30.4S, v30.4S,v14.s[1] +mla v30.4S, v21.4S, v31.s[0] +sub v21.4s, v18.4s, v30.4s +add 
v18.4s, v18.4s, v30.4s +sqrdmulh v30.4S, v12.4S, v5.s[2] +mul v12.4S, v12.4S,v4.s[2] +mla v12.4S, v30.4S, v31.s[0] +sub v30.4s, v29.4s, v12.4s +add v29.4s, v29.4s, v12.4s +sqrdmulh v12.4S, v13.4S, v0.s[2] +mul v13.4S, v13.4S,v14.s[2] +mla v13.4S, v12.4S, v31.s[0] +sub v12.4s, v22.4s, v13.4s +add v22.4s, v22.4s, v13.4s +trn1 v13.4S, v11.4S, v8.4S +trn2 v28.4S, v11.4S, v8.4S +trn1 v27.4S, v29.4S, v30.4S +trn2 v26.4S, v29.4S, v30.4S +trn2 v29.2D, v13.2D, v27.2D +trn2 v30.2D, v28.2D, v26.2D +trn1 v11.2D, v13.2D, v27.2D +trn1 v8.2D, v28.2D, v26.2D +trn1 v26.4S, v18.4S, v21.4S +trn2 v28.4S, v18.4S, v21.4S +trn1 v27.4S, v22.4S, v12.4S +trn2 v13.4S, v22.4S, v12.4S +trn2 v22.2D, v26.2D, v27.2D +trn2 v12.2D, v28.2D, v13.2D +trn1 v18.2D, v26.2D, v27.2D +trn1 v21.2D, v28.2D, v13.2D +sqrdmulh v13.4S, v29.4S, v7.4S +mul v29.4S, v29.4S,v6.4S +mla v29.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v29.4s +add v11.4s, v11.4s, v29.4s +sqrdmulh v29.4S, v22.4S, v17.4S +mul v22.4S, v22.4S,v19.4S +mla v22.4S, v29.4S, v31.s[0] +sub v29.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v30.4S, v7.4S +mul v30.4S, v30.4S,v6.4S +mla v30.4S, v22.4S, v31.s[0] +sub v22.4s, v8.4s, v30.4s +add v8.4s, v8.4s, v30.4s +sqrdmulh v30.4S, v12.4S, v17.4S +mul v12.4S, v12.4S,v19.4S +mla v12.4S, v30.4S, v31.s[0] +sub v30.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v8.4S, v10.4S +mul v8.4S, v8.4S,v15.4S +mla v8.4S, v12.4S, v31.s[0] +sub v12.4s, v11.4s, v8.4s +add v11.4s, v11.4s, v8.4s +sqrdmulh v8.4S, v21.4S, v3.4S +mul v21.4S, v21.4S,v20.4S +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v16.4S +mul v22.4S, v22.4S,v2.4S +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v13.4s, v22.4s +add v13.4s, v13.4s, v22.4s +sqrdmulh v22.4S, v30.4S, v9.4S +mul v30.4S, v30.4S,v1.4S +mla v30.4S, v22.4S, v31.s[0] +sub v22.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +str q11, [x0, #0] +str q12, [x0, #16] +str q13, [x0, #32] +str q21, 
[x0, #48] +str q18, [x0, #512] +str q8, [x0, #528] +str q29, [x0, #544] +str q22, [x0, #560] +ldr q9, [x17, #+256] +ldr q1, [x17, #+272] +ldr q3, [x17, #+288] +ldr q20, [x17, #+304] +ldr q17, [x17, #+320] +ldr q19, [x17, #+336] +ldr q0, [x17, #+352] +ldr q14, [x17, #+368] +ldr q22, [x0, #96] +ldr q29, [x0, #112] +ldr q8, [x0, #64] +ldr q18, [x0, #80] +ldr q16, [x17, #+1280] +ldr q2, [x17, #+1296] +ldr q10, [x17, #+1312] +ldr q15, [x17, #+1328] +ldr q7, [x17, #+1344] +ldr q6, [x17, #+1360] +ldr q5, [x17, #+1376] +ldr q4, [x17, #+1392] +ldr q21, [x0, #608] +ldr q13, [x0, #624] +ldr q12, [x0, #576] +ldr q11, [x0, #592] +sqrdmulh v30.4S, v22.4S, v1.s[0] +mul v22.4S, v22.4S,v9.s[0] +mla v22.4S, v30.4S, v31.s[0] +sub v30.4s, v8.4s, v22.4s +add v8.4s, v8.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v2.s[0] +mul v21.4S, v21.4S,v16.s[0] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +sqrdmulh v21.4S, v29.4S, v1.s[0] +mul v29.4S, v29.4S,v9.s[0] +mla v29.4S, v21.4S, v31.s[0] +sub v21.4s, v18.4s, v29.4s +add v18.4s, v18.4s, v29.4s +sqrdmulh v29.4S, v13.4S, v2.s[0] +mul v13.4S, v13.4S,v16.s[0] +mla v13.4S, v29.4S, v31.s[0] +sub v29.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v1.s[1] +mul v18.4S, v18.4S,v9.s[1] +mla v18.4S, v13.4S, v31.s[0] +sub v13.4s, v8.4s, v18.4s +add v8.4s, v8.4s, v18.4s +sqrdmulh v18.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v16.s[1] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v11.4s +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v1.s[2] +mul v21.4S, v21.4S,v9.s[2] +mla v21.4S, v11.4S, v31.s[0] +sub v11.4s, v30.4s, v21.4s +add v30.4s, v30.4s, v21.4s +sqrdmulh v21.4S, v29.4S, v2.s[2] +mul v29.4S, v29.4S,v16.s[2] +mla v29.4S, v21.4S, v31.s[0] +sub v21.4s, v22.4s, v29.4s +add v22.4s, v22.4s, v29.4s +trn1 v29.4S, v8.4S, v13.4S +trn2 v28.4S, v8.4S, v13.4S +trn1 v27.4S, v30.4S, v11.4S +trn2 v26.4S, v30.4S, v11.4S +trn2 v30.2D, v29.2D, v27.2D +trn2 v11.2D, v28.2D, v26.2D +trn1 v8.2D, v29.2D, 
v27.2D +trn1 v13.2D, v28.2D, v26.2D +trn1 v26.4S, v12.4S, v18.4S +trn2 v28.4S, v12.4S, v18.4S +trn1 v27.4S, v22.4S, v21.4S +trn2 v29.4S, v22.4S, v21.4S +trn2 v22.2D, v26.2D, v27.2D +trn2 v21.2D, v28.2D, v29.2D +trn1 v12.2D, v26.2D, v27.2D +trn1 v18.2D, v28.2D, v29.2D +sqrdmulh v29.4S, v30.4S, v20.4S +mul v30.4S, v30.4S,v3.4S +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v8.4s, v30.4s +add v8.4s, v8.4s, v30.4s +sqrdmulh v30.4S, v22.4S, v15.4S +mul v22.4S, v22.4S,v10.4S +mla v22.4S, v30.4S, v31.s[0] +sub v30.4s, v12.4s, v22.4s +add v12.4s, v12.4s, v22.4s +sqrdmulh v22.4S, v11.4S, v20.4S +mul v11.4S, v11.4S,v3.4S +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v15.4S +mul v21.4S, v21.4S,v10.4S +mla v21.4S, v11.4S, v31.s[0] +sub v11.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v19.4S +mul v13.4S, v13.4S,v17.4S +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v6.4S +mul v18.4S, v18.4S,v7.4S +mla v18.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v18.4s +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v22.4S, v14.4S +mul v22.4S, v22.4S,v0.4S +mla v22.4S, v18.4S, v31.s[0] +sub v18.4s, v29.4s, v22.4s +add v29.4s, v29.4s, v22.4s +sqrdmulh v22.4S, v11.4S, v4.4S +mul v11.4S, v11.4S,v5.4S +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v30.4s, v11.4s +add v30.4s, v30.4s, v11.4s +str q8, [x0, #64] +str q21, [x0, #80] +str q29, [x0, #96] +str q18, [x0, #112] +str q12, [x0, #576] +str q13, [x0, #592] +str q30, [x0, #608] +str q22, [x0, #624] +ldr q4, [x17, #+384] +ldr q5, [x17, #+400] +ldr q6, [x17, #+416] +ldr q7, [x17, #+432] +ldr q15, [x17, #+448] +ldr q10, [x17, #+464] +ldr q2, [x17, #+480] +ldr q16, [x17, #+496] +ldr q22, [x0, #160] +ldr q30, [x0, #176] +ldr q13, [x0, #128] +ldr q12, [x0, #144] +ldr q14, [x17, #+1408] +ldr q0, [x17, #+1424] +ldr q19, [x17, #+1440] +ldr q17, [x17, #+1456] +ldr q20, [x17, #+1472] +ldr q3, [x17, #+1488] +ldr 
q1, [x17, #+1504] +ldr q9, [x17, #+1520] +ldr q18, [x0, #672] +ldr q29, [x0, #688] +ldr q21, [x0, #640] +ldr q8, [x0, #656] +sqrdmulh v11.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v13.4s, v22.4s +add v13.4s, v13.4s, v22.4s +sqrdmulh v22.4S, v18.4S, v0.s[0] +mul v18.4S, v18.4S,v14.s[0] +mla v18.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v18.4s +add v21.4s, v21.4s, v18.4s +sqrdmulh v18.4S, v30.4S, v5.s[0] +mul v30.4S, v30.4S,v4.s[0] +mla v30.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v30.4s +add v12.4s, v12.4s, v30.4s +sqrdmulh v30.4S, v29.4S, v0.s[0] +mul v29.4S, v29.4S,v14.s[0] +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v8.4s, v29.4s +add v8.4s, v8.4s, v29.4s +sqrdmulh v29.4S, v12.4S, v5.s[1] +mul v12.4S, v12.4S,v4.s[1] +mla v12.4S, v29.4S, v31.s[0] +sub v29.4s, v13.4s, v12.4s +add v13.4s, v13.4s, v12.4s +sqrdmulh v12.4S, v8.4S, v0.s[1] +mul v8.4S, v8.4S,v14.s[1] +mla v8.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v8.4s +add v21.4s, v21.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v5.s[2] +mul v18.4S, v18.4S,v4.s[2] +mla v18.4S, v8.4S, v31.s[0] +sub v8.4s, v11.4s, v18.4s +add v11.4s, v11.4s, v18.4s +sqrdmulh v18.4S, v30.4S, v0.s[2] +mul v30.4S, v30.4S,v14.s[2] +mla v30.4S, v18.4S, v31.s[0] +sub v18.4s, v22.4s, v30.4s +add v22.4s, v22.4s, v30.4s +trn1 v30.4S, v13.4S, v29.4S +trn2 v28.4S, v13.4S, v29.4S +trn1 v27.4S, v11.4S, v8.4S +trn2 v26.4S, v11.4S, v8.4S +trn2 v11.2D, v30.2D, v27.2D +trn2 v8.2D, v28.2D, v26.2D +trn1 v13.2D, v30.2D, v27.2D +trn1 v29.2D, v28.2D, v26.2D +trn1 v26.4S, v21.4S, v12.4S +trn2 v28.4S, v21.4S, v12.4S +trn1 v27.4S, v22.4S, v18.4S +trn2 v30.4S, v22.4S, v18.4S +trn2 v22.2D, v26.2D, v27.2D +trn2 v18.2D, v28.2D, v30.2D +trn1 v21.2D, v26.2D, v27.2D +trn1 v12.2D, v28.2D, v30.2D +sqrdmulh v30.4S, v11.4S, v7.4S +mul v11.4S, v11.4S,v6.4S +mla v11.4S, v30.4S, v31.s[0] +sub v30.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v17.4S +mul v22.4S, v22.4S,v19.4S +mla v22.4S, v11.4S, v31.s[0] +sub 
v11.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v8.4S, v7.4S +mul v8.4S, v8.4S,v6.4S +mla v8.4S, v22.4S, v31.s[0] +sub v22.4s, v29.4s, v8.4s +add v29.4s, v29.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v17.4S +mul v18.4S, v18.4S,v19.4S +mla v18.4S, v8.4S, v31.s[0] +sub v8.4s, v12.4s, v18.4s +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v29.4S, v10.4S +mul v29.4S, v29.4S,v15.4S +mla v29.4S, v18.4S, v31.s[0] +sub v18.4s, v13.4s, v29.4s +add v13.4s, v13.4s, v29.4s +sqrdmulh v29.4S, v12.4S, v3.4S +mul v12.4S, v12.4S,v20.4S +mla v12.4S, v29.4S, v31.s[0] +sub v29.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v22.4S, v16.4S +mul v22.4S, v22.4S,v2.4S +mla v22.4S, v12.4S, v31.s[0] +sub v12.4s, v30.4s, v22.4s +add v30.4s, v30.4s, v22.4s +sqrdmulh v22.4S, v8.4S, v9.4S +mul v8.4S, v8.4S,v1.4S +mla v8.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v8.4s +add v11.4s, v11.4s, v8.4s +str q13, [x0, #128] +str q18, [x0, #144] +str q30, [x0, #160] +str q12, [x0, #176] +str q21, [x0, #640] +str q29, [x0, #656] +str q11, [x0, #672] +str q22, [x0, #688] +ldr q9, [x17, #+512] +ldr q1, [x17, #+528] +ldr q3, [x17, #+544] +ldr q20, [x17, #+560] +ldr q17, [x17, #+576] +ldr q19, [x17, #+592] +ldr q0, [x17, #+608] +ldr q14, [x17, #+624] +ldr q22, [x0, #224] +ldr q11, [x0, #240] +ldr q29, [x0, #192] +ldr q21, [x0, #208] +ldr q16, [x17, #+1536] +ldr q2, [x17, #+1552] +ldr q10, [x17, #+1568] +ldr q15, [x17, #+1584] +ldr q7, [x17, #+1600] +ldr q6, [x17, #+1616] +ldr q5, [x17, #+1632] +ldr q4, [x17, #+1648] +ldr q12, [x0, #736] +ldr q30, [x0, #752] +ldr q18, [x0, #704] +ldr q13, [x0, #720] +sqrdmulh v8.4S, v22.4S, v1.s[0] +mul v22.4S, v22.4S,v9.s[0] +mla v22.4S, v8.4S, v31.s[0] +sub v8.4s, v29.4s, v22.4s +add v29.4s, v29.4s, v22.4s +sqrdmulh v22.4S, v12.4S, v2.s[0] +mul v12.4S, v12.4S,v16.s[0] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v12.4s +add v18.4s, v18.4s, v12.4s +sqrdmulh v12.4S, v11.4S, v1.s[0] +mul v11.4S, v11.4S,v9.s[0] +mla v11.4S, v12.4S, v31.s[0] +sub 
v12.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v30.4S, v2.s[0] +mul v30.4S, v30.4S,v16.s[0] +mla v30.4S, v11.4S, v31.s[0] +sub v11.4s, v13.4s, v30.4s +add v13.4s, v13.4s, v30.4s +sqrdmulh v30.4S, v21.4S, v1.s[1] +mul v21.4S, v21.4S,v9.s[1] +mla v21.4S, v30.4S, v31.s[0] +sub v30.4s, v29.4s, v21.4s +add v29.4s, v29.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v2.s[1] +mul v13.4S, v13.4S,v16.s[1] +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v18.4s, v13.4s +add v18.4s, v18.4s, v13.4s +sqrdmulh v13.4S, v12.4S, v1.s[2] +mul v12.4S, v12.4S,v9.s[2] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v8.4s, v12.4s +add v8.4s, v8.4s, v12.4s +sqrdmulh v12.4S, v11.4S, v2.s[2] +mul v11.4S, v11.4S,v16.s[2] +mla v11.4S, v12.4S, v31.s[0] +sub v12.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +trn1 v11.4S, v29.4S, v30.4S +trn2 v28.4S, v29.4S, v30.4S +trn1 v27.4S, v8.4S, v13.4S +trn2 v26.4S, v8.4S, v13.4S +trn2 v8.2D, v11.2D, v27.2D +trn2 v13.2D, v28.2D, v26.2D +trn1 v29.2D, v11.2D, v27.2D +trn1 v30.2D, v28.2D, v26.2D +trn1 v26.4S, v18.4S, v21.4S +trn2 v28.4S, v18.4S, v21.4S +trn1 v27.4S, v22.4S, v12.4S +trn2 v11.4S, v22.4S, v12.4S +trn2 v22.2D, v26.2D, v27.2D +trn2 v12.2D, v28.2D, v11.2D +trn1 v18.2D, v26.2D, v27.2D +trn1 v21.2D, v28.2D, v11.2D +sqrdmulh v11.4S, v8.4S, v20.4S +mul v8.4S, v8.4S,v3.4S +mla v8.4S, v11.4S, v31.s[0] +sub v11.4s, v29.4s, v8.4s +add v29.4s, v29.4s, v8.4s +sqrdmulh v8.4S, v22.4S, v15.4S +mul v22.4S, v22.4S,v10.4S +mla v22.4S, v8.4S, v31.s[0] +sub v8.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v20.4S +mul v13.4S, v13.4S,v3.4S +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v30.4s, v13.4s +add v30.4s, v30.4s, v13.4s +sqrdmulh v13.4S, v12.4S, v15.4S +mul v12.4S, v12.4S,v10.4S +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v30.4S, v19.4S +mul v30.4S, v30.4S,v17.4S +mla v30.4S, v12.4S, v31.s[0] +sub v12.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +sqrdmulh v30.4S, v21.4S, 
v6.4S +mul v21.4S, v21.4S,v7.4S +mla v21.4S, v30.4S, v31.s[0] +sub v30.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v14.4S +mul v22.4S, v22.4S,v0.4S +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v4.4S +mul v13.4S, v13.4S,v5.4S +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +str q29, [x0, #192] +str q12, [x0, #208] +str q11, [x0, #224] +str q21, [x0, #240] +str q18, [x0, #704] +str q30, [x0, #720] +str q8, [x0, #736] +str q22, [x0, #752] +ldr q4, [x17, #+640] +ldr q5, [x17, #+656] +ldr q6, [x17, #+672] +ldr q7, [x17, #+688] +ldr q15, [x17, #+704] +ldr q10, [x17, #+720] +ldr q2, [x17, #+736] +ldr q16, [x17, #+752] +ldr q22, [x0, #288] +ldr q8, [x0, #304] +ldr q30, [x0, #256] +ldr q18, [x0, #272] +ldr q14, [x17, #+1664] +ldr q0, [x17, #+1680] +ldr q19, [x17, #+1696] +ldr q17, [x17, #+1712] +ldr q20, [x17, #+1728] +ldr q3, [x17, #+1744] +ldr q1, [x17, #+1760] +ldr q9, [x17, #+1776] +ldr q21, [x0, #800] +ldr q11, [x0, #816] +ldr q12, [x0, #768] +ldr q29, [x0, #784] +sqrdmulh v13.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v13.4S, v31.s[0] +sub v13.4s, v30.4s, v22.4s +add v30.4s, v30.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v0.s[0] +mul v21.4S, v21.4S,v14.s[0] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +sqrdmulh v21.4S, v8.4S, v5.s[0] +mul v8.4S, v8.4S,v4.s[0] +mla v8.4S, v21.4S, v31.s[0] +sub v21.4s, v18.4s, v8.4s +add v18.4s, v18.4s, v8.4s +sqrdmulh v8.4S, v11.4S, v0.s[0] +mul v11.4S, v11.4S,v14.s[0] +mla v11.4S, v8.4S, v31.s[0] +sub v8.4s, v29.4s, v11.4s +add v29.4s, v29.4s, v11.4s +sqrdmulh v11.4S, v18.4S, v5.s[1] +mul v18.4S, v18.4S,v4.s[1] +mla v18.4S, v11.4S, v31.s[0] +sub v11.4s, v30.4s, v18.4s +add v30.4s, v30.4s, v18.4s +sqrdmulh v18.4S, v29.4S, v0.s[1] +mul v29.4S, v29.4S,v14.s[1] +mla v29.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v29.4s +add v12.4s, v12.4s, v29.4s +sqrdmulh 
v29.4S, v21.4S, v5.s[2] +mul v21.4S, v21.4S,v4.s[2] +mla v21.4S, v29.4S, v31.s[0] +sub v29.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v8.4S, v0.s[2] +mul v8.4S, v8.4S,v14.s[2] +mla v8.4S, v21.4S, v31.s[0] +sub v21.4s, v22.4s, v8.4s +add v22.4s, v22.4s, v8.4s +trn1 v8.4S, v30.4S, v11.4S +trn2 v28.4S, v30.4S, v11.4S +trn1 v27.4S, v13.4S, v29.4S +trn2 v26.4S, v13.4S, v29.4S +trn2 v13.2D, v8.2D, v27.2D +trn2 v29.2D, v28.2D, v26.2D +trn1 v30.2D, v8.2D, v27.2D +trn1 v11.2D, v28.2D, v26.2D +trn1 v26.4S, v12.4S, v18.4S +trn2 v28.4S, v12.4S, v18.4S +trn1 v27.4S, v22.4S, v21.4S +trn2 v8.4S, v22.4S, v21.4S +trn2 v22.2D, v26.2D, v27.2D +trn2 v21.2D, v28.2D, v8.2D +trn1 v12.2D, v26.2D, v27.2D +trn1 v18.2D, v28.2D, v8.2D +sqrdmulh v8.4S, v13.4S, v7.4S +mul v13.4S, v13.4S,v6.4S +mla v13.4S, v8.4S, v31.s[0] +sub v8.4s, v30.4s, v13.4s +add v30.4s, v30.4s, v13.4s +sqrdmulh v13.4S, v22.4S, v17.4S +mul v22.4S, v22.4S,v19.4S +mla v22.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v22.4s +add v12.4s, v12.4s, v22.4s +sqrdmulh v22.4S, v29.4S, v7.4S +mul v29.4S, v29.4S,v6.4S +mla v29.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v29.4s +add v11.4s, v11.4s, v29.4s +sqrdmulh v29.4S, v21.4S, v17.4S +mul v21.4S, v21.4S,v19.4S +mla v21.4S, v29.4S, v31.s[0] +sub v29.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v11.4S, v10.4S +mul v11.4S, v11.4S,v15.4S +mla v11.4S, v21.4S, v31.s[0] +sub v21.4s, v30.4s, v11.4s +add v30.4s, v30.4s, v11.4s +sqrdmulh v11.4S, v18.4S, v3.4S +mul v18.4S, v18.4S,v20.4S +mla v18.4S, v11.4S, v31.s[0] +sub v11.4s, v12.4s, v18.4s +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v22.4S, v16.4S +mul v22.4S, v22.4S,v2.4S +mla v22.4S, v18.4S, v31.s[0] +sub v18.4s, v8.4s, v22.4s +add v8.4s, v8.4s, v22.4s +sqrdmulh v22.4S, v29.4S, v9.4S +mul v29.4S, v29.4S,v1.4S +mla v29.4S, v22.4S, v31.s[0] +sub v22.4s, v13.4s, v29.4s +add v13.4s, v13.4s, v29.4s +str q30, [x0, #256] +str q21, [x0, #272] +str q8, [x0, #288] +str q18, [x0, #304] +str q12, [x0, #768] 
+str q11, [x0, #784] +str q13, [x0, #800] +str q22, [x0, #816] +ldr q9, [x17, #+768] +ldr q1, [x17, #+784] +ldr q3, [x17, #+800] +ldr q20, [x17, #+816] +ldr q17, [x17, #+832] +ldr q19, [x17, #+848] +ldr q0, [x17, #+864] +ldr q14, [x17, #+880] +ldr q22, [x0, #352] +ldr q13, [x0, #368] +ldr q11, [x0, #320] +ldr q12, [x0, #336] +ldr q16, [x17, #+1792] +ldr q2, [x17, #+1808] +ldr q10, [x17, #+1824] +ldr q15, [x17, #+1840] +ldr q7, [x17, #+1856] +ldr q6, [x17, #+1872] +ldr q5, [x17, #+1888] +ldr q4, [x17, #+1904] +ldr q18, [x0, #864] +ldr q8, [x0, #880] +ldr q21, [x0, #832] +ldr q30, [x0, #848] +sqrdmulh v29.4S, v22.4S, v1.s[0] +mul v22.4S, v22.4S,v9.s[0] +mla v22.4S, v29.4S, v31.s[0] +sub v29.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v18.4S, v2.s[0] +mul v18.4S, v18.4S,v16.s[0] +mla v18.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v18.4s +add v21.4s, v21.4s, v18.4s +sqrdmulh v18.4S, v13.4S, v1.s[0] +mul v13.4S, v13.4S,v9.s[0] +mla v13.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v13.4s +add v12.4s, v12.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v2.s[0] +mul v8.4S, v8.4S,v16.s[0] +mla v8.4S, v13.4S, v31.s[0] +sub v13.4s, v30.4s, v8.4s +add v30.4s, v30.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v1.s[1] +mul v12.4S, v12.4S,v9.s[1] +mla v12.4S, v8.4S, v31.s[0] +sub v8.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +sqrdmulh v12.4S, v30.4S, v2.s[1] +mul v30.4S, v30.4S,v16.s[1] +mla v30.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v30.4s +add v21.4s, v21.4s, v30.4s +sqrdmulh v30.4S, v18.4S, v1.s[2] +mul v18.4S, v18.4S,v9.s[2] +mla v18.4S, v30.4S, v31.s[0] +sub v30.4s, v29.4s, v18.4s +add v29.4s, v29.4s, v18.4s +sqrdmulh v18.4S, v13.4S, v2.s[2] +mul v13.4S, v13.4S,v16.s[2] +mla v13.4S, v18.4S, v31.s[0] +sub v18.4s, v22.4s, v13.4s +add v22.4s, v22.4s, v13.4s +trn1 v13.4S, v11.4S, v8.4S +trn2 v28.4S, v11.4S, v8.4S +trn1 v27.4S, v29.4S, v30.4S +trn2 v26.4S, v29.4S, v30.4S +trn2 v29.2D, v13.2D, v27.2D +trn2 v30.2D, v28.2D, v26.2D +trn1 v11.2D, v13.2D, v27.2D +trn1 v8.2D, v28.2D, 
v26.2D +trn1 v26.4S, v21.4S, v12.4S +trn2 v28.4S, v21.4S, v12.4S +trn1 v27.4S, v22.4S, v18.4S +trn2 v13.4S, v22.4S, v18.4S +trn2 v22.2D, v26.2D, v27.2D +trn2 v18.2D, v28.2D, v13.2D +trn1 v21.2D, v26.2D, v27.2D +trn1 v12.2D, v28.2D, v13.2D +sqrdmulh v13.4S, v29.4S, v20.4S +mul v29.4S, v29.4S,v3.4S +mla v29.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v29.4s +add v11.4s, v11.4s, v29.4s +sqrdmulh v29.4S, v22.4S, v15.4S +mul v22.4S, v22.4S,v10.4S +mla v22.4S, v29.4S, v31.s[0] +sub v29.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v30.4S, v20.4S +mul v30.4S, v30.4S,v3.4S +mla v30.4S, v22.4S, v31.s[0] +sub v22.4s, v8.4s, v30.4s +add v8.4s, v8.4s, v30.4s +sqrdmulh v30.4S, v18.4S, v15.4S +mul v18.4S, v18.4S,v10.4S +mla v18.4S, v30.4S, v31.s[0] +sub v30.4s, v12.4s, v18.4s +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v19.4S +mul v8.4S, v8.4S,v17.4S +mla v8.4S, v18.4S, v31.s[0] +sub v18.4s, v11.4s, v8.4s +add v11.4s, v11.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v6.4S +mul v12.4S, v12.4S,v7.4S +mla v12.4S, v8.4S, v31.s[0] +sub v8.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v22.4S, v14.4S +mul v22.4S, v22.4S,v0.4S +mla v22.4S, v12.4S, v31.s[0] +sub v12.4s, v13.4s, v22.4s +add v13.4s, v13.4s, v22.4s +sqrdmulh v22.4S, v30.4S, v4.4S +mul v30.4S, v30.4S,v5.4S +mla v30.4S, v22.4S, v31.s[0] +sub v22.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +str q11, [x0, #320] +str q18, [x0, #336] +str q13, [x0, #352] +str q12, [x0, #368] +str q21, [x0, #832] +str q8, [x0, #848] +str q29, [x0, #864] +str q22, [x0, #880] +ldr q4, [x17, #+896] +ldr q5, [x17, #+912] +ldr q6, [x17, #+928] +ldr q7, [x17, #+944] +ldr q15, [x17, #+960] +ldr q10, [x17, #+976] +ldr q2, [x17, #+992] +ldr q16, [x17, #+1008] +ldr q22, [x0, #416] +ldr q29, [x0, #432] +ldr q8, [x0, #384] +ldr q21, [x0, #400] +ldr q14, [x17, #+1920] +ldr q0, [x17, #+1936] +ldr q19, [x17, #+1952] +ldr q17, [x17, #+1968] +ldr q20, [x17, #+1984] +ldr q3, [x17, #+2000] +ldr q1, [x17, #+2016] +ldr q9, [x17, 
#+2032] +ldr q12, [x0, #928] +ldr q13, [x0, #944] +ldr q18, [x0, #896] +ldr q11, [x0, #912] +sqrdmulh v30.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v30.4S, v31.s[0] +sub v30.4s, v8.4s, v22.4s +add v8.4s, v8.4s, v22.4s +sqrdmulh v22.4S, v12.4S, v0.s[0] +mul v12.4S, v12.4S,v14.s[0] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v12.4s +add v18.4s, v18.4s, v12.4s +sqrdmulh v12.4S, v29.4S, v5.s[0] +mul v29.4S, v29.4S,v4.s[0] +mla v29.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v29.4s +add v21.4s, v21.4s, v29.4s +sqrdmulh v29.4S, v13.4S, v0.s[0] +mul v13.4S, v13.4S,v14.s[0] +mla v13.4S, v29.4S, v31.s[0] +sub v29.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v8.4s, v21.4s +add v8.4s, v8.4s, v21.4s +sqrdmulh v21.4S, v11.4S, v0.s[1] +mul v11.4S, v11.4S,v14.s[1] +mla v11.4S, v21.4S, v31.s[0] +sub v21.4s, v18.4s, v11.4s +add v18.4s, v18.4s, v11.4s +sqrdmulh v11.4S, v12.4S, v5.s[2] +mul v12.4S, v12.4S,v4.s[2] +mla v12.4S, v11.4S, v31.s[0] +sub v11.4s, v30.4s, v12.4s +add v30.4s, v30.4s, v12.4s +sqrdmulh v12.4S, v29.4S, v0.s[2] +mul v29.4S, v29.4S,v14.s[2] +mla v29.4S, v12.4S, v31.s[0] +sub v12.4s, v22.4s, v29.4s +add v22.4s, v22.4s, v29.4s +trn1 v29.4S, v8.4S, v13.4S +trn2 v28.4S, v8.4S, v13.4S +trn1 v27.4S, v30.4S, v11.4S +trn2 v26.4S, v30.4S, v11.4S +trn2 v30.2D, v29.2D, v27.2D +trn2 v11.2D, v28.2D, v26.2D +trn1 v8.2D, v29.2D, v27.2D +trn1 v13.2D, v28.2D, v26.2D +trn1 v26.4S, v18.4S, v21.4S +trn2 v28.4S, v18.4S, v21.4S +trn1 v27.4S, v22.4S, v12.4S +trn2 v29.4S, v22.4S, v12.4S +trn2 v22.2D, v26.2D, v27.2D +trn2 v12.2D, v28.2D, v29.2D +trn1 v18.2D, v26.2D, v27.2D +trn1 v21.2D, v28.2D, v29.2D +sqrdmulh v29.4S, v30.4S, v7.4S +mul v30.4S, v30.4S,v6.4S +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v8.4s, v30.4s +add v8.4s, v8.4s, v30.4s +sqrdmulh v30.4S, v22.4S, v17.4S +mul v22.4S, v22.4S,v19.4S +mla v22.4S, v30.4S, v31.s[0] +sub v30.4s, v18.4s, v22.4s +add 
v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v11.4S, v7.4S +mul v11.4S, v11.4S,v6.4S +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v12.4S, v17.4S +mul v12.4S, v12.4S,v19.4S +mla v12.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v13.4S, v10.4S +mul v13.4S, v13.4S,v15.4S +mla v13.4S, v12.4S, v31.s[0] +sub v12.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +sqrdmulh v13.4S, v21.4S, v3.4S +mul v21.4S, v21.4S,v20.4S +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v16.4S +mul v22.4S, v22.4S,v2.4S +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v29.4s, v22.4s +add v29.4s, v29.4s, v22.4s +sqrdmulh v22.4S, v11.4S, v9.4S +mul v11.4S, v11.4S,v1.4S +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v30.4s, v11.4s +add v30.4s, v30.4s, v11.4s +str q8, [x0, #384] +str q12, [x0, #400] +str q29, [x0, #416] +str q21, [x0, #432] +str q18, [x0, #896] +str q13, [x0, #912] +str q30, [x0, #928] +str q22, [x0, #944] +ldr q9, [x17, #+1024] +ldr q1, [x17, #+1040] +ldr q3, [x17, #+1056] +ldr q20, [x17, #+1072] +ldr q17, [x17, #+1088] +ldr q19, [x17, #+1104] +ldr q0, [x17, #+1120] +ldr q14, [x17, #+1136] +ldr q22, [x0, #480] +ldr q30, [x0, #496] +ldr q13, [x0, #448] +ldr q18, [x0, #464] +ldr q16, [x17, #+2048] +ldr q2, [x17, #+2064] +ldr q10, [x17, #+2080] +ldr q15, [x17, #+2096] +ldr q7, [x17, #+2112] +ldr q6, [x17, #+2128] +ldr q5, [x17, #+2144] +ldr q4, [x17, #+2160] +ldr q21, [x0, #992] +ldr q29, [x0, #1008] +ldr q12, [x0, #960] +ldr q8, [x0, #976] +sqrdmulh v11.4S, v22.4S, v1.s[0] +mul v22.4S, v22.4S,v9.s[0] +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v13.4s, v22.4s +add v13.4s, v13.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v2.s[0] +mul v21.4S, v21.4S,v16.s[0] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +sqrdmulh v21.4S, v30.4S, v1.s[0] +mul v30.4S, v30.4S,v9.s[0] +mla v30.4S, v21.4S, v31.s[0] +sub 
v21.4s, v18.4s, v30.4s +add v18.4s, v18.4s, v30.4s +sqrdmulh v30.4S, v29.4S, v2.s[0] +mul v29.4S, v29.4S,v16.s[0] +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v8.4s, v29.4s +add v8.4s, v8.4s, v29.4s +sqrdmulh v29.4S, v18.4S, v1.s[1] +mul v18.4S, v18.4S,v9.s[1] +mla v18.4S, v29.4S, v31.s[0] +sub v29.4s, v13.4s, v18.4s +add v13.4s, v13.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v2.s[1] +mul v8.4S, v8.4S,v16.s[1] +mla v8.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v8.4s +add v12.4s, v12.4s, v8.4s +sqrdmulh v8.4S, v21.4S, v1.s[2] +mul v21.4S, v21.4S,v9.s[2] +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v30.4S, v2.s[2] +mul v30.4S, v30.4S,v16.s[2] +mla v30.4S, v21.4S, v31.s[0] +sub v21.4s, v22.4s, v30.4s +add v22.4s, v22.4s, v30.4s +trn1 v30.4S, v13.4S, v29.4S +trn2 v28.4S, v13.4S, v29.4S +trn1 v27.4S, v11.4S, v8.4S +trn2 v26.4S, v11.4S, v8.4S +trn2 v11.2D, v30.2D, v27.2D +trn2 v8.2D, v28.2D, v26.2D +trn1 v13.2D, v30.2D, v27.2D +trn1 v29.2D, v28.2D, v26.2D +trn1 v26.4S, v12.4S, v18.4S +trn2 v28.4S, v12.4S, v18.4S +trn1 v27.4S, v22.4S, v21.4S +trn2 v30.4S, v22.4S, v21.4S +trn2 v22.2D, v26.2D, v27.2D +trn2 v21.2D, v28.2D, v30.2D +trn1 v12.2D, v26.2D, v27.2D +trn1 v18.2D, v28.2D, v30.2D +sqrdmulh v30.4S, v11.4S, v20.4S +mul v11.4S, v11.4S,v3.4S +mla v11.4S, v30.4S, v31.s[0] +sub v30.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v15.4S +mul v22.4S, v22.4S,v10.4S +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v12.4s, v22.4s +add v12.4s, v12.4s, v22.4s +sqrdmulh v22.4S, v8.4S, v20.4S +mul v8.4S, v8.4S,v3.4S +mla v8.4S, v22.4S, v31.s[0] +sub v22.4s, v29.4s, v8.4s +add v29.4s, v29.4s, v8.4s +sqrdmulh v8.4S, v21.4S, v15.4S +mul v21.4S, v21.4S,v10.4S +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v29.4S, v19.4S +mul v29.4S, v29.4S,v17.4S +mla v29.4S, v21.4S, v31.s[0] +sub v21.4s, v13.4s, v29.4s +add v13.4s, v13.4s, v29.4s +sqrdmulh v29.4S, v18.4S, v6.4S +mul 
v18.4S, v18.4S,v7.4S +mla v18.4S, v29.4S, v31.s[0] +sub v29.4s, v12.4s, v18.4s +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v22.4S, v14.4S +mul v22.4S, v22.4S,v0.4S +mla v22.4S, v18.4S, v31.s[0] +sub v18.4s, v30.4s, v22.4s +add v30.4s, v30.4s, v22.4s +sqrdmulh v22.4S, v8.4S, v4.4S +mul v8.4S, v8.4S,v5.4S +mla v8.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v8.4s +add v11.4s, v11.4s, v8.4s +str q13, [x0, #448] +str q21, [x0, #464] +str q30, [x0, #480] +str q18, [x0, #496] +str q12, [x0, #960] +str q29, [x0, #976] +str q11, [x0, #992] +str q22, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z2_2.s b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z2_2.s new file mode 100644 index 0000000..61444d0 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z2_2.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. 
+/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 // -q for q = 33556993, stored negated; NOTE(review): presumably consumed as the by-element operand of the mla reduction step - confirm against the routine's load of this label +.word 0 // zero padding: the constant occupies four 32-bit words in total +.word 0 +.word 0 +.align 6 // GNU as on AArch64 reads the operand as a power-of-two exponent, i.e. a 64-byte boundary for the table below +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None 
+.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 
7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 
// Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 
+.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // 
Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, 
block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 
839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 +.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, 
block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 
104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 
// Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_3_z2_2 +.global _ntt_u32_full_neon_asm_var_4_4_3_z2_2 +ntt_u32_full_neon_asm_var_4_4_3_z2_2: +_ntt_u32_full_neon_asm_var_4_4_3_z2_2: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +sqrdmulh 
v2.4S, v22.4S, v29.s[0] +ldr q1, [x0, #544] +mul v22.4S, v22.4S,v30.s[0] +ldr q0, [x0, #608] +sqrdmulh v15.4S, v21.4S, v29.s[0] +ldr q14, [x0, #672] +mul v21.4S, v21.4S,v30.s[0] +ldr q13, [x0, #736] +mla v22.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q12, [x0, #32] +sub v11.4s, v18.4s, v22.4s +mla v21.4S, v15.4S, v31.s[0] +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q15, [x0, #96] +sub v10.4s, v17.4s, v21.4s +mla v20.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v29.s[0] +ldr q2, [x0, #160] +mul v1.4S, v1.4S,v30.s[0] +sub v9.4s, v16.4s, v20.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v29.s[0] +ldr q22, [x0, #224] +mul v0.4S, v0.4S,v30.s[0] +sub v8.4s, v3.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v12.4s, v1.4s +mla v0.4S, v20.4S, v31.s[0] +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +sub v20.4s, v15.4s, v0.4s +mla v14.4S, v19.4S, v31.s[0] +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v19.4s, v2.4s, v14.4s +mla v13.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v1.4s, v22.4s, v13.4s +mla v16.4S, v0.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v0.4s, v2.4s, v16.4s +mla v3.4S, v14.4S, v31.s[0] +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v22.4s, v3.4s +mla v18.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v13.4s, v12.4s, v18.4s +mla v17.4S, v16.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v16.4s, v15.4s, 
v17.4s +mla v9.4S, v3.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v3.4s, v19.4s, v9.4s +mla v8.4S, v18.4S, v31.s[0] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v8.4s +mla v11.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v17.4s, v21.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +sub v9.4s, v20.4s, v10.4s +mla v2.4S, v8.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v8.4s, v12.4s, v2.4s +mla v22.4S, v11.4S, v31.s[0] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v27.s[1] +mul v14.4S, v14.4S,v28.s[1] +sub v11.4s, v15.4s, v22.4s +mla v0.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v28.s[2] +sub v10.4s, v13.4s, v0.4s +mla v14.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v2.4s, v16.4s, v14.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +sub v22.4s, v21.4s, v19.4s +mla v1.4S, v0.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v0.4s, v20.4s, v1.4s +mla v3.4S, v14.4S, v31.s[0] +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +sub v14.4s, v17.4s, v3.4s +mla v18.4S, v19.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v25.s[1] +mul v11.4S, v11.4S,v26.s[1] +sub v19.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v1.4s, v12.4s, v15.4s +mla v11.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v25.s[3] +mul 
v2.4S, v2.4S,v26.s[3] +sub v3.4s, v8.4s, v11.4s +mla v16.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v11.4s +str q12, [x0, #32] +sqrdmulh v12.4S, v20.4S, v23.s[0] +str q1, [x0, #96] +mul v20.4S, v20.4S,v24.s[0] +ldr q1, [x0, #816] +sub v11.4s, v13.4s, v16.4s +ldr q18, [x0, #880] +mla v2.4S, v15.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +str q8, [x0, #160] +sqrdmulh v8.4S, v0.4S, v23.s[1] +str q3, [x0, #224] +mul v0.4S, v0.4S,v24.s[1] +ldr q3, [x0, #944] +sub v16.4s, v10.4s, v2.4s +ldr q15, [x0, #1008] +mla v20.4S, v12.4S, v31.s[0] +add v10.4s, v10.4s, v2.4s +str q13, [x0, #288] +sqrdmulh v13.4S, v9.4S, v23.s[2] +str q11, [x0, #352] +mul v9.4S, v9.4S,v24.s[2] +ldr q11, [x0, #304] +sub v2.4s, v21.4s, v20.4s +ldr q12, [x0, #368] +mla v0.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v20.4s +str q10, [x0, #416] +sqrdmulh v10.4S, v19.4S, v23.s[3] +str q16, [x0, #480] +mul v19.4S, v19.4S,v24.s[3] +ldr q16, [x0, #432] +sub v20.4s, v22.4s, v0.4s +ldr q8, [x0, #496] +mla v9.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +str q21, [x0, #544] +sqrdmulh v21.4S, v1.4S, v29.s[0] +str q2, [x0, #608] +ldr q2, [x0, #560] +mul v1.4S, v1.4S,v30.s[0] +ldr q0, [x0, #624] +sub v13.4s, v17.4s, v9.4s +mla v19.4S, v10.4S, v31.s[0] +add v17.4s, v17.4s, v9.4s +str q22, [x0, #672] +sqrdmulh v22.4S, v18.4S, v29.s[0] +str q20, [x0, #736] +ldr q20, [x0, #688] +mul v18.4S, v18.4S,v30.s[0] +ldr q9, [x0, #752] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +str q17, [x0, #800] +sqrdmulh v17.4S, v3.4S, v29.s[0] +str q13, [x0, #864] +mul v3.4S, v3.4S,v30.s[0] +ldr q13, [x0, #48] +sub v19.4s, v11.4s, v1.4s +mla v18.4S, v22.4S, v31.s[0] +add v11.4s, v11.4s, v1.4s +str q14, [x0, #928] +sqrdmulh v14.4S, v15.4S, v29.s[0] +str q10, [x0, #992] +mul v15.4S, v15.4S,v30.s[0] +ldr q10, [x0, #112] +sub v1.4s, v12.4s, v18.4s +mla v3.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v29.s[0] +ldr q17, [x0, #176] +mul v2.4S, v2.4S,v30.s[0] +sub v22.4s, v16.4s, 
v3.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v29.s[0] +ldr q14, [x0, #240] +mul v0.4S, v0.4S,v30.s[0] +sub v21.4s, v8.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v18.4s, v13.4s, v2.4s +mla v0.4S, v3.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +sub v3.4s, v10.4s, v0.4s +mla v20.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v15.4s, v17.4s, v20.4s +mla v9.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v2.4s, v14.4s, v9.4s +mla v16.4S, v0.4S, v31.s[0] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v29.s[1] +mul v11.4S, v11.4S,v30.s[1] +sub v0.4s, v17.4s, v16.4s +mla v8.4S, v20.4S, v31.s[0] +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v29.s[1] +mul v12.4S, v12.4S,v30.s[1] +sub v20.4s, v14.4s, v8.4s +mla v11.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v9.4s, v13.4s, v11.4s +mla v12.4S, v16.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v16.4s, v10.4s, v12.4s +mla v22.4S, v8.4S, v31.s[0] +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v8.4s, v15.4s, v22.4s +mla v21.4S, v11.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +sub v11.4s, v2.4s, v21.4s +mla v19.4S, v12.4S, v31.s[0] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +sub v12.4s, v18.4s, v19.4s +mla v1.4S, v22.4S, v31.s[0] +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v22.4s, v3.4s, v1.4s +mla v17.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v1.4s +sqrdmulh v1.4S, 
v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v21.4s, v13.4s, v17.4s +mla v14.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +sub v19.4s, v10.4s, v14.4s +mla v0.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v27.s[2] +mul v15.4S, v15.4S,v28.s[2] +sub v1.4s, v9.4s, v0.4s +mla v20.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v17.4s, v16.4s, v20.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v14.4s, v18.4s, v15.4s +mla v2.4S, v0.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[3] +mul v11.4S, v11.4S,v28.s[3] +sub v0.4s, v3.4s, v2.4s +mla v8.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +sub v20.4s, v12.4s, v8.4s +mla v11.4S, v15.4S, v31.s[0] +add v12.4s, v12.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v15.4s, v22.4s, v11.4s +mla v10.4S, v2.4S, v31.s[0] +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v2.4s, v13.4s, v10.4s +mla v19.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v25.s[3] +mul v17.4S, v17.4S,v26.s[3] +sub v8.4s, v21.4s, v19.4s +mla v16.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +str q13, [x0, #48] +sqrdmulh v13.4S, v3.4S, v23.s[0] +str q2, [x0, #112] +mul v3.4S, v3.4S,v24.s[0] +ldr q2, [x0, #768] +sub v19.4s, v9.4s, v16.4s +ldr q11, [x0, #832] +mla v17.4S, v10.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +str q21, [x0, #176] +sqrdmulh v21.4S, v0.4S, v23.s[1] +str q8, [x0, #240] +mul v0.4S, v0.4S,v24.s[1] +ldr q8, [x0, #896] +sub v16.4s, v1.4s, v17.4s +ldr q10, [x0, #960] +mla v3.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +str q9, [x0, #304] +sqrdmulh v9.4S, v22.4S, v23.s[2] +str q19, [x0, #368] +mul v22.4S, 
v22.4S,v24.s[2] +ldr q19, [x0, #256] +sub v17.4s, v18.4s, v3.4s +ldr q13, [x0, #320] +mla v0.4S, v21.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +str q1, [x0, #432] +sqrdmulh v1.4S, v15.4S, v23.s[3] +str q16, [x0, #496] +mul v15.4S, v15.4S,v24.s[3] +ldr q16, [x0, #384] +sub v3.4s, v14.4s, v0.4s +ldr q21, [x0, #448] +mla v22.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +str q18, [x0, #560] +sqrdmulh v18.4S, v2.4S, v29.s[0] +str q17, [x0, #624] +ldr q17, [x0, #512] +mul v2.4S, v2.4S,v30.s[0] +ldr q0, [x0, #576] +sub v9.4s, v12.4s, v22.4s +mla v15.4S, v1.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s +str q14, [x0, #688] +sqrdmulh v14.4S, v11.4S, v29.s[0] +str q3, [x0, #752] +ldr q3, [x0, #640] +mul v11.4S, v11.4S,v30.s[0] +ldr q22, [x0, #704] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +str q12, [x0, #816] +sqrdmulh v12.4S, v8.4S, v29.s[0] +str q9, [x0, #880] +mul v8.4S, v8.4S,v30.s[0] +ldr q9, [x0, #0] +sub v15.4s, v19.4s, v2.4s +mla v11.4S, v14.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +str q20, [x0, #944] +sqrdmulh v20.4S, v10.4S, v29.s[0] +str q1, [x0, #1008] +mul v10.4S, v10.4S,v30.s[0] +ldr q1, [x0, #64] +sub v2.4s, v13.4s, v11.4s +mla v8.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v29.s[0] +ldr q12, [x0, #128] +mul v17.4S, v17.4S,v30.s[0] +sub v14.4s, v16.4s, v8.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v29.s[0] +ldr q20, [x0, #192] +mul v0.4S, v0.4S,v30.s[0] +sub v18.4s, v21.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +sub v11.4s, v9.4s, v17.4s +mla v0.4S, v8.4S, v31.s[0] +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v8.4s, v1.4s, v0.4s +mla v3.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v10.4s, v12.4s, v3.4s +mla v22.4S, v17.4S, v31.s[0] +add v12.4s, 
v12.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v17.4s, v20.4s, v22.4s +mla v16.4S, v0.4S, v31.s[0] +add v20.4s, v20.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +sub v0.4s, v12.4s, v16.4s +mla v21.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v3.4s, v20.4s, v21.4s +mla v19.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v22.4s, v9.4s, v19.4s +mla v13.4S, v16.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v16.4s, v1.4s, v13.4s +mla v14.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v21.4s, v10.4s, v14.4s +mla v18.4S, v19.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v19.4s, v17.4s, v18.4s +mla v15.4S, v13.4S, v31.s[0] +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v13.4s, v11.4s, v15.4s +mla v2.4S, v14.4S, v31.s[0] +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v27.s[0] +mul v20.4S, v20.4S,v28.s[0] +sub v14.4s, v8.4s, v2.4s +mla v12.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v18.4s, v9.4s, v12.4s +mla v20.4S, v15.4S, v31.s[0] +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +sub v15.4s, v1.4s, v20.4s +mla v0.4S, v2.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +sub v2.4s, v22.4s, v0.4s +mla v3.4S, v12.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v12.4s, v16.4s, v3.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, 
v11.4s, v10.4s +mla v17.4S, v0.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v27.s[3] +mul v19.4S, v19.4S,v28.s[3] +sub v0.4s, v8.4s, v17.4s +mla v21.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v17.4s +sqrdmulh v17.4S, v1.4S, v25.s[0] +mul v1.4S, v1.4S,v26.s[0] +sub v3.4s, v13.4s, v21.4s +mla v19.4S, v10.4S, v31.s[0] +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v25.s[1] +mul v15.4S, v15.4S,v26.s[1] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v17.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v17.4s, v9.4s, v1.4s +mla v15.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +sub v21.4s, v18.4s, v15.4s +mla v16.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +str q9, [x0, #0] +sqrdmulh v9.4S, v8.4S, v23.s[0] +str q17, [x0, #64] +mul v8.4S, v8.4S,v24.s[0] +ldr q17, [x0, #784] +sub v15.4s, v22.4s, v16.4s +ldr q19, [x0, #848] +mla v12.4S, v1.4S, v31.s[0] +add v22.4s, v22.4s, v16.4s +str q18, [x0, #128] +sqrdmulh v18.4S, v0.4S, v23.s[1] +str q21, [x0, #192] +mul v0.4S, v0.4S,v24.s[1] +ldr q21, [x0, #912] +sub v16.4s, v2.4s, v12.4s +ldr q1, [x0, #976] +mla v8.4S, v9.4S, v31.s[0] +add v2.4s, v2.4s, v12.4s +str q22, [x0, #256] +sqrdmulh v22.4S, v14.4S, v23.s[2] +str q15, [x0, #320] +mul v14.4S, v14.4S,v24.s[2] +ldr q15, [x0, #272] +sub v12.4s, v11.4s, v8.4s +ldr q9, [x0, #336] +mla v0.4S, v18.4S, v31.s[0] +add v11.4s, v11.4s, v8.4s +str q2, [x0, #384] +sqrdmulh v2.4S, v10.4S, v23.s[3] +str q16, [x0, #448] +mul v10.4S, v10.4S,v24.s[3] +ldr q16, [x0, #400] +sub v8.4s, v20.4s, v0.4s +ldr q18, [x0, #464] +mla v14.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v0.4s +str q11, [x0, #512] +sqrdmulh v11.4S, v17.4S, v29.s[0] +str q12, [x0, #576] +ldr q12, [x0, #528] +mul v17.4S, v17.4S,v30.s[0] +ldr q0, [x0, #592] +sub v22.4s, v13.4s, v14.4s +mla v10.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v14.4s +str q20, [x0, #640] +sqrdmulh v20.4S, v19.4S, 
v29.s[0] +str q8, [x0, #704] +ldr q8, [x0, #656] +mul v19.4S, v19.4S,v30.s[0] +ldr q14, [x0, #720] +sub v2.4s, v3.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v3.4s, v3.4s, v10.4s +str q13, [x0, #768] +sqrdmulh v13.4S, v21.4S, v29.s[0] +str q22, [x0, #832] +mul v21.4S, v21.4S,v30.s[0] +ldr q22, [x0, #16] +sub v10.4s, v15.4s, v17.4s +mla v19.4S, v20.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +str q3, [x0, #896] +sqrdmulh v3.4S, v1.4S, v29.s[0] +str q2, [x0, #960] +mul v1.4S, v1.4S,v30.s[0] +ldr q2, [x0, #80] +sub v17.4s, v9.4s, v19.4s +mla v21.4S, v13.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v12.4S, v29.s[0] +ldr q13, [x0, #144] +mul v12.4S, v12.4S,v30.s[0] +sub v20.4s, v16.4s, v21.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v29.s[0] +ldr q3, [x0, #208] +mul v0.4S, v0.4S,v30.s[0] +sub v11.4s, v18.4s, v1.4s +mla v12.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v19.4s, v22.4s, v12.4s +mla v0.4S, v21.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v2.4s, v0.4s +mla v8.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v1.4s, v13.4s, v8.4s +mla v14.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v12.4s, v3.4s, v14.4s +mla v16.4S, v0.4S, v31.s[0] +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +sub v0.4s, v13.4s, v16.4s +mla v18.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v9.4S, v29.s[1] +mul v9.4S, v9.4S,v30.s[1] +sub v8.4s, v3.4s, v18.4s +mla v15.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v14.4s, v22.4s, v15.4s +mla v9.4S, v16.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v29.s[2] +mul v11.4S, 
v11.4S,v30.s[2] +sub v16.4s, v2.4s, v9.4s +mla v20.4S, v18.4S, v31.s[0] +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v20.4s +mla v11.4S, v15.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v15.4s, v12.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +sub v9.4s, v19.4s, v10.4s +mla v17.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v27.s[0] +mul v3.4S, v3.4S,v28.s[0] +sub v20.4s, v21.4s, v17.4s +mla v13.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v11.4s, v22.4s, v13.4s +mla v3.4S, v10.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v10.4s, v2.4s, v3.4s +mla v0.4S, v17.4S, v31.s[0] +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v17.4s, v14.4s, v0.4s +mla v8.4S, v13.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +sub v13.4s, v16.4s, v8.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v3.4s, v19.4s, v1.4s +mla v12.4S, v0.4S, v31.s[0] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v0.4s, v21.4s, v12.4s +mla v18.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v25.s[1] +mul v10.4S, v10.4S,v26.s[1] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v12.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v12.4s, v22.4s, v2.4s +mla v10.4S, v18.4S, v31.s[0] +add v22.4s, v22.4s, v2.4s 
+sqrdmulh v2.4S, v13.4S, v25.s[3] +mul v13.4S, v13.4S,v26.s[3] +sub v18.4s, v11.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +str q22, [x0, #16] +sqrdmulh v22.4S, v21.4S, v23.s[0] +str q12, [x0, #80] +mul v21.4S, v21.4S,v24.s[0] +sub v12.4s, v14.4s, v16.4s +mla v13.4S, v2.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +str q11, [x0, #144] +sqrdmulh v11.4S, v0.4S, v23.s[1] +str q18, [x0, #208] +mul v0.4S, v0.4S,v24.s[1] +sub v18.4s, v17.4s, v13.4s +mla v21.4S, v22.4S, v31.s[0] +add v17.4s, v17.4s, v13.4s +str q14, [x0, #272] +sqrdmulh v14.4S, v20.4S, v23.s[2] +str q12, [x0, #336] +mul v20.4S, v20.4S,v24.s[2] +sub v12.4s, v19.4s, v21.4s +mla v0.4S, v11.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +str q17, [x0, #400] +sqrdmulh v17.4S, v1.4S, v23.s[3] +str q18, [x0, #464] +mul v1.4S, v1.4S,v24.s[3] +sub v18.4s, v3.4s, v0.4s +mla v20.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v0.4s +str q19, [x0, #528] +str q12, [x0, #592] +sub v12.4s, v9.4s, v20.4s +mla v1.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v20.4s +str q3, [x0, #656] +str q18, [x0, #720] +sub v18.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +str q9, [x0, #784] +str q12, [x0, #848] +str q8, [x0, #912] +str q18, [x0, #976] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q6, [x17, #+160] +ldr q7, [x17, #+176] +ldr q15, [x17, #+192] +ldr q10, [x17, #+208] +ldr q2, [x17, #+224] +ldr q16, [x17, #+240] +ldr q22, [x0, #32] +ldr q13, [x0, #48] +ldr q11, [x0, #0] +ldr q21, [x0, #16] +ldr q14, [x17, #+1152] +ldr q0, [x17, #+1168] +ldr q19, [x17, #+1184] +ldr q17, [x17, #+1200] +ldr q20, [x17, #+1216] +ldr q3, [x17, #+1232] +ldr q1, [x17, #+1248] +ldr q9, [x17, #+1264] +ldr q12, [x0, #544] +ldr q8, [x0, #560] +ldr q18, [x0, #512] +ldr q30, [x0, #528] +sqrdmulh v29.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v29.4S, v31.s[0] +sub v29.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v13.4s 
+add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v12.4S, v0.s[0] +mul v12.4S, v12.4S,v14.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v18.4s, v12.4s +add v18.4s, v18.4s, v12.4s +sqrdmulh v12.4S, v8.4S, v0.s[0] +mul v8.4S, v8.4S,v14.s[0] +mla v8.4S, v12.4S, v31.s[0] +sub v12.4s, v30.4s, v8.4s +add v30.4s, v30.4s, v8.4s +sqrdmulh v8.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v5.s[2] +mul v22.4S, v22.4S,v4.s[2] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v29.4s, v22.4s +add v29.4s, v29.4s, v22.4s +sqrdmulh v22.4S, v30.4S, v0.s[1] +mul v30.4S, v30.4S,v14.s[1] +mla v30.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v30.4s +add v18.4s, v18.4s, v30.4s +sqrdmulh v30.4S, v12.4S, v0.s[2] +mul v12.4S, v12.4S,v14.s[2] +mla v12.4S, v30.4S, v31.s[0] +sub v30.4s, v13.4s, v12.4s +add v13.4s, v13.4s, v12.4s +trn1 v12.4S, v11.4S, v8.4S +trn2 v28.4S, v11.4S, v8.4S +trn1 v27.4S, v29.4S, v21.4S +trn2 v26.4S, v29.4S, v21.4S +trn2 v29.2D, v12.2D, v27.2D +trn2 v21.2D, v28.2D, v26.2D +trn1 v11.2D, v12.2D, v27.2D +trn1 v8.2D, v28.2D, v26.2D +trn1 v26.4S, v18.4S, v22.4S +trn2 v28.4S, v18.4S, v22.4S +trn1 v27.4S, v13.4S, v30.4S +trn2 v12.4S, v13.4S, v30.4S +trn2 v13.2D, v26.2D, v27.2D +trn2 v30.2D, v28.2D, v12.2D +trn1 v18.2D, v26.2D, v27.2D +trn1 v22.2D, v28.2D, v12.2D +sqrdmulh v12.4S, v29.4S, v7.4S +mul v29.4S, v29.4S,v6.4S +mla v29.4S, v12.4S, v31.s[0] +sub v12.4s, v11.4s, v29.4s +add v11.4s, v11.4s, v29.4s +sqrdmulh v29.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v29.4S, v31.s[0] +sub v29.4s, v8.4s, v21.4s +add v8.4s, v8.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v17.4S +mul v13.4S, v13.4S,v19.4S +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v18.4s, v13.4s +add v18.4s, v18.4s, v13.4s +sqrdmulh v13.4S, v30.4S, v17.4S +mul v30.4S, v30.4S,v19.4S +mla v30.4S, v13.4S, v31.s[0] +sub v13.4s, v22.4s, v30.4s +add v22.4s, v22.4s, v30.4s +sqrdmulh v30.4S, v8.4S, v10.4S +mul v8.4S, 
v8.4S,v15.4S +mla v8.4S, v30.4S, v31.s[0] +sub v30.4s, v11.4s, v8.4s +add v11.4s, v11.4s, v8.4s +sqrdmulh v8.4S, v29.4S, v16.4S +mul v29.4S, v29.4S,v2.4S +mla v29.4S, v8.4S, v31.s[0] +sub v8.4s, v12.4s, v29.4s +add v12.4s, v12.4s, v29.4s +sqrdmulh v29.4S, v22.4S, v3.4S +mul v22.4S, v22.4S,v20.4S +mla v22.4S, v29.4S, v31.s[0] +sub v29.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v9.4S +mul v13.4S, v13.4S,v1.4S +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +str q11, [x0, #0] +str q30, [x0, #16] +str q12, [x0, #32] +str q8, [x0, #48] +str q18, [x0, #512] +str q29, [x0, #528] +str q21, [x0, #544] +str q22, [x0, #560] +ldr q9, [x17, #+256] +ldr q1, [x17, #+272] +ldr q3, [x17, #+288] +ldr q20, [x17, #+304] +ldr q17, [x17, #+320] +ldr q19, [x17, #+336] +ldr q0, [x17, #+352] +ldr q14, [x17, #+368] +ldr q22, [x0, #96] +ldr q21, [x0, #112] +ldr q29, [x0, #64] +ldr q18, [x0, #80] +ldr q16, [x17, #+1280] +ldr q2, [x17, #+1296] +ldr q10, [x17, #+1312] +ldr q15, [x17, #+1328] +ldr q7, [x17, #+1344] +ldr q6, [x17, #+1360] +ldr q5, [x17, #+1376] +ldr q4, [x17, #+1392] +ldr q8, [x0, #608] +ldr q12, [x0, #624] +ldr q30, [x0, #576] +ldr q11, [x0, #592] +sqrdmulh v13.4S, v22.4S, v1.s[0] +mul v22.4S, v22.4S,v9.s[0] +mla v22.4S, v13.4S, v31.s[0] +sub v13.4s, v29.4s, v22.4s +add v29.4s, v29.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v1.s[0] +mul v21.4S, v21.4S,v9.s[0] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v8.4S, v2.s[0] +mul v8.4S, v8.4S,v16.s[0] +mla v8.4S, v21.4S, v31.s[0] +sub v21.4s, v30.4s, v8.4s +add v30.4s, v30.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v2.s[0] +mul v12.4S, v12.4S,v16.s[0] +mla v12.4S, v8.4S, v31.s[0] +sub v8.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +sqrdmulh v12.4S, v18.4S, v1.s[1] +mul v18.4S, v18.4S,v9.s[1] +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v29.4s, v18.4s +add v29.4s, v29.4s, v18.4s +sqrdmulh v18.4S, v22.4S, v1.s[2] +mul v22.4S, 
v22.4S,v9.s[2] +mla v22.4S, v18.4S, v31.s[0] +sub v18.4s, v13.4s, v22.4s +add v13.4s, v13.4s, v22.4s +sqrdmulh v22.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v16.s[1] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v30.4s, v11.4s +add v30.4s, v30.4s, v11.4s +sqrdmulh v11.4S, v8.4S, v2.s[2] +mul v8.4S, v8.4S,v16.s[2] +mla v8.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v8.4s +add v21.4s, v21.4s, v8.4s +trn1 v8.4S, v29.4S, v12.4S +trn2 v28.4S, v29.4S, v12.4S +trn1 v27.4S, v13.4S, v18.4S +trn2 v26.4S, v13.4S, v18.4S +trn2 v13.2D, v8.2D, v27.2D +trn2 v18.2D, v28.2D, v26.2D +trn1 v29.2D, v8.2D, v27.2D +trn1 v12.2D, v28.2D, v26.2D +trn1 v26.4S, v30.4S, v22.4S +trn2 v28.4S, v30.4S, v22.4S +trn1 v27.4S, v21.4S, v11.4S +trn2 v8.4S, v21.4S, v11.4S +trn2 v21.2D, v26.2D, v27.2D +trn2 v11.2D, v28.2D, v8.2D +trn1 v30.2D, v26.2D, v27.2D +trn1 v22.2D, v28.2D, v8.2D +sqrdmulh v8.4S, v13.4S, v20.4S +mul v13.4S, v13.4S,v3.4S +mla v13.4S, v8.4S, v31.s[0] +sub v8.4s, v29.4s, v13.4s +add v29.4s, v29.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v20.4S +mul v18.4S, v18.4S,v3.4S +mla v18.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v18.4s +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v15.4S +mul v21.4S, v21.4S,v10.4S +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v30.4s, v21.4s +add v30.4s, v30.4s, v21.4s +sqrdmulh v21.4S, v11.4S, v15.4S +mul v11.4S, v11.4S,v10.4S +mla v11.4S, v21.4S, v31.s[0] +sub v21.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v12.4S, v19.4S +mul v12.4S, v12.4S,v17.4S +mla v12.4S, v11.4S, v31.s[0] +sub v11.4s, v29.4s, v12.4s +add v29.4s, v29.4s, v12.4s +sqrdmulh v12.4S, v13.4S, v14.4S +mul v13.4S, v13.4S,v0.4S +mla v13.4S, v12.4S, v31.s[0] +sub v12.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +sqrdmulh v13.4S, v22.4S, v6.4S +mul v22.4S, v22.4S,v7.4S +mla v22.4S, v13.4S, v31.s[0] +sub v13.4s, v30.4s, v22.4s +add v30.4s, v30.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v4.4S +mul v21.4S, v21.4S,v5.4S +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v21.4s +add v18.4s, v18.4s, 
v21.4s +str q29, [x0, #64] +str q11, [x0, #80] +str q8, [x0, #96] +str q12, [x0, #112] +str q30, [x0, #576] +str q13, [x0, #592] +str q18, [x0, #608] +str q22, [x0, #624] +ldr q4, [x17, #+384] +ldr q5, [x17, #+400] +ldr q6, [x17, #+416] +ldr q7, [x17, #+432] +ldr q15, [x17, #+448] +ldr q10, [x17, #+464] +ldr q2, [x17, #+480] +ldr q16, [x17, #+496] +ldr q22, [x0, #160] +ldr q18, [x0, #176] +ldr q13, [x0, #128] +ldr q30, [x0, #144] +ldr q14, [x17, #+1408] +ldr q0, [x17, #+1424] +ldr q19, [x17, #+1440] +ldr q17, [x17, #+1456] +ldr q20, [x17, #+1472] +ldr q3, [x17, #+1488] +ldr q1, [x17, #+1504] +ldr q9, [x17, #+1520] +ldr q12, [x0, #672] +ldr q8, [x0, #688] +ldr q11, [x0, #640] +ldr q29, [x0, #656] +sqrdmulh v21.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v13.4s, v22.4s +add v13.4s, v13.4s, v22.4s +sqrdmulh v22.4S, v18.4S, v5.s[0] +mul v18.4S, v18.4S,v4.s[0] +mla v18.4S, v22.4S, v31.s[0] +sub v22.4s, v30.4s, v18.4s +add v30.4s, v30.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v0.s[0] +mul v12.4S, v12.4S,v14.s[0] +mla v12.4S, v18.4S, v31.s[0] +sub v18.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +sqrdmulh v12.4S, v8.4S, v0.s[0] +mul v8.4S, v8.4S,v14.s[0] +mla v8.4S, v12.4S, v31.s[0] +sub v12.4s, v29.4s, v8.4s +add v29.4s, v29.4s, v8.4s +sqrdmulh v8.4S, v30.4S, v5.s[1] +mul v30.4S, v30.4S,v4.s[1] +mla v30.4S, v8.4S, v31.s[0] +sub v8.4s, v13.4s, v30.4s +add v13.4s, v13.4s, v30.4s +sqrdmulh v30.4S, v22.4S, v5.s[2] +mul v22.4S, v22.4S,v4.s[2] +mla v22.4S, v30.4S, v31.s[0] +sub v30.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v29.4S, v0.s[1] +mul v29.4S, v29.4S,v14.s[1] +mla v29.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v29.4s +add v11.4s, v11.4s, v29.4s +sqrdmulh v29.4S, v12.4S, v0.s[2] +mul v12.4S, v12.4S,v14.s[2] +mla v12.4S, v29.4S, v31.s[0] +sub v29.4s, v18.4s, v12.4s +add v18.4s, v18.4s, v12.4s +trn1 v12.4S, v13.4S, v8.4S +trn2 v28.4S, v13.4S, v8.4S +trn1 v27.4S, v21.4S, v30.4S +trn2 v26.4S, v21.4S, v30.4S 
+trn2 v21.2D, v12.2D, v27.2D +trn2 v30.2D, v28.2D, v26.2D +trn1 v13.2D, v12.2D, v27.2D +trn1 v8.2D, v28.2D, v26.2D +trn1 v26.4S, v11.4S, v22.4S +trn2 v28.4S, v11.4S, v22.4S +trn1 v27.4S, v18.4S, v29.4S +trn2 v12.4S, v18.4S, v29.4S +trn2 v18.2D, v26.2D, v27.2D +trn2 v29.2D, v28.2D, v12.2D +trn1 v11.2D, v26.2D, v27.2D +trn1 v22.2D, v28.2D, v12.2D +sqrdmulh v12.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v12.4S, v31.s[0] +sub v12.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v30.4S, v7.4S +mul v30.4S, v30.4S,v6.4S +mla v30.4S, v21.4S, v31.s[0] +sub v21.4s, v8.4s, v30.4s +add v8.4s, v8.4s, v30.4s +sqrdmulh v30.4S, v18.4S, v17.4S +mul v18.4S, v18.4S,v19.4S +mla v18.4S, v30.4S, v31.s[0] +sub v30.4s, v11.4s, v18.4s +add v11.4s, v11.4s, v18.4s +sqrdmulh v18.4S, v29.4S, v17.4S +mul v29.4S, v29.4S,v19.4S +mla v29.4S, v18.4S, v31.s[0] +sub v18.4s, v22.4s, v29.4s +add v22.4s, v22.4s, v29.4s +sqrdmulh v29.4S, v8.4S, v10.4S +mul v8.4S, v8.4S,v15.4S +mla v8.4S, v29.4S, v31.s[0] +sub v29.4s, v13.4s, v8.4s +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v21.4S, v16.4S +mul v21.4S, v21.4S,v2.4S +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v3.4S +mul v22.4S, v22.4S,v20.4S +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v18.4S, v9.4S +mul v18.4S, v18.4S,v1.4S +mla v18.4S, v22.4S, v31.s[0] +sub v22.4s, v30.4s, v18.4s +add v30.4s, v30.4s, v18.4s +str q13, [x0, #128] +str q29, [x0, #144] +str q12, [x0, #160] +str q8, [x0, #176] +str q11, [x0, #640] +str q21, [x0, #656] +str q30, [x0, #672] +str q22, [x0, #688] +ldr q9, [x17, #+512] +ldr q1, [x17, #+528] +ldr q3, [x17, #+544] +ldr q20, [x17, #+560] +ldr q17, [x17, #+576] +ldr q19, [x17, #+592] +ldr q0, [x17, #+608] +ldr q14, [x17, #+624] +ldr q22, [x0, #224] +ldr q30, [x0, #240] +ldr q21, [x0, #192] +ldr q11, [x0, #208] +ldr q16, [x17, #+1536] +ldr q2, [x17, #+1552] +ldr q10, [x17, #+1568] 
+ldr q15, [x17, #+1584] +ldr q7, [x17, #+1600] +ldr q6, [x17, #+1616] +ldr q5, [x17, #+1632] +ldr q4, [x17, #+1648] +ldr q8, [x0, #736] +ldr q12, [x0, #752] +ldr q29, [x0, #704] +ldr q13, [x0, #720] +sqrdmulh v18.4S, v22.4S, v1.s[0] +mul v22.4S, v22.4S,v9.s[0] +mla v22.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v30.4S, v1.s[0] +mul v30.4S, v30.4S,v9.s[0] +mla v30.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v30.4s +add v11.4s, v11.4s, v30.4s +sqrdmulh v30.4S, v8.4S, v2.s[0] +mul v8.4S, v8.4S,v16.s[0] +mla v8.4S, v30.4S, v31.s[0] +sub v30.4s, v29.4s, v8.4s +add v29.4s, v29.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v2.s[0] +mul v12.4S, v12.4S,v16.s[0] +mla v12.4S, v8.4S, v31.s[0] +sub v8.4s, v13.4s, v12.4s +add v13.4s, v13.4s, v12.4s +sqrdmulh v12.4S, v11.4S, v1.s[1] +mul v11.4S, v11.4S,v9.s[1] +mla v11.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v1.s[2] +mul v22.4S, v22.4S,v9.s[2] +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v2.s[1] +mul v13.4S, v13.4S,v16.s[1] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v29.4s, v13.4s +add v29.4s, v29.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v2.s[2] +mul v8.4S, v8.4S,v16.s[2] +mla v8.4S, v13.4S, v31.s[0] +sub v13.4s, v30.4s, v8.4s +add v30.4s, v30.4s, v8.4s +trn1 v8.4S, v21.4S, v12.4S +trn2 v28.4S, v21.4S, v12.4S +trn1 v27.4S, v18.4S, v11.4S +trn2 v26.4S, v18.4S, v11.4S +trn2 v18.2D, v8.2D, v27.2D +trn2 v11.2D, v28.2D, v26.2D +trn1 v21.2D, v8.2D, v27.2D +trn1 v12.2D, v28.2D, v26.2D +trn1 v26.4S, v29.4S, v22.4S +trn2 v28.4S, v29.4S, v22.4S +trn1 v27.4S, v30.4S, v13.4S +trn2 v8.4S, v30.4S, v13.4S +trn2 v30.2D, v26.2D, v27.2D +trn2 v13.2D, v28.2D, v8.2D +trn1 v29.2D, v26.2D, v27.2D +trn1 v22.2D, v28.2D, v8.2D +sqrdmulh v8.4S, v18.4S, v20.4S +mul v18.4S, v18.4S,v3.4S +mla v18.4S, v8.4S, v31.s[0] +sub v8.4s, v21.4s, v18.4s +add v21.4s, v21.4s, v18.4s +sqrdmulh v18.4S, v11.4S, 
v20.4S +mul v11.4S, v11.4S,v3.4S +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v11.4s +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v30.4S, v15.4S +mul v30.4S, v30.4S,v10.4S +mla v30.4S, v11.4S, v31.s[0] +sub v11.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +sqrdmulh v30.4S, v13.4S, v15.4S +mul v13.4S, v13.4S,v10.4S +mla v13.4S, v30.4S, v31.s[0] +sub v30.4s, v22.4s, v13.4s +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v12.4S, v19.4S +mul v12.4S, v12.4S,v17.4S +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v18.4S, v14.4S +mul v18.4S, v18.4S,v0.4S +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v8.4s, v18.4s +add v8.4s, v8.4s, v18.4s +sqrdmulh v18.4S, v22.4S, v6.4S +mul v22.4S, v22.4S,v7.4S +mla v22.4S, v18.4S, v31.s[0] +sub v18.4s, v29.4s, v22.4s +add v29.4s, v29.4s, v22.4s +sqrdmulh v22.4S, v30.4S, v4.4S +mul v30.4S, v30.4S,v5.4S +mla v30.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v30.4s +add v11.4s, v11.4s, v30.4s +str q21, [x0, #192] +str q13, [x0, #208] +str q8, [x0, #224] +str q12, [x0, #240] +str q29, [x0, #704] +str q18, [x0, #720] +str q11, [x0, #736] +str q22, [x0, #752] +ldr q4, [x17, #+640] +ldr q5, [x17, #+656] +ldr q6, [x17, #+672] +ldr q7, [x17, #+688] +ldr q15, [x17, #+704] +ldr q10, [x17, #+720] +ldr q2, [x17, #+736] +ldr q16, [x17, #+752] +ldr q22, [x0, #288] +ldr q11, [x0, #304] +ldr q18, [x0, #256] +ldr q29, [x0, #272] +ldr q14, [x17, #+1664] +ldr q0, [x17, #+1680] +ldr q19, [x17, #+1696] +ldr q17, [x17, #+1712] +ldr q20, [x17, #+1728] +ldr q3, [x17, #+1744] +ldr q1, [x17, #+1760] +ldr q9, [x17, #+1776] +ldr q12, [x0, #800] +ldr q8, [x0, #816] +ldr q13, [x0, #768] +ldr q21, [x0, #784] +sqrdmulh v30.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v30.4S, v31.s[0] +sub v30.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v11.4S, v5.s[0] +mul v11.4S, v11.4S,v4.s[0] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v29.4s, v11.4s +add v29.4s, v29.4s, v11.4s +sqrdmulh 
v11.4S, v12.4S, v0.s[0] +mul v12.4S, v12.4S,v14.s[0] +mla v12.4S, v11.4S, v31.s[0] +sub v11.4s, v13.4s, v12.4s +add v13.4s, v13.4s, v12.4s +sqrdmulh v12.4S, v8.4S, v0.s[0] +mul v8.4S, v8.4S,v14.s[0] +mla v8.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v8.4s +add v21.4s, v21.4s, v8.4s +sqrdmulh v8.4S, v29.4S, v5.s[1] +mul v29.4S, v29.4S,v4.s[1] +mla v29.4S, v8.4S, v31.s[0] +sub v8.4s, v18.4s, v29.4s +add v18.4s, v18.4s, v29.4s +sqrdmulh v29.4S, v22.4S, v5.s[2] +mul v22.4S, v22.4S,v4.s[2] +mla v22.4S, v29.4S, v31.s[0] +sub v29.4s, v30.4s, v22.4s +add v30.4s, v30.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v0.s[1] +mul v21.4S, v21.4S,v14.s[1] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v12.4S, v0.s[2] +mul v12.4S, v12.4S,v14.s[2] +mla v12.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +trn1 v12.4S, v18.4S, v8.4S +trn2 v28.4S, v18.4S, v8.4S +trn1 v27.4S, v30.4S, v29.4S +trn2 v26.4S, v30.4S, v29.4S +trn2 v30.2D, v12.2D, v27.2D +trn2 v29.2D, v28.2D, v26.2D +trn1 v18.2D, v12.2D, v27.2D +trn1 v8.2D, v28.2D, v26.2D +trn1 v26.4S, v13.4S, v22.4S +trn2 v28.4S, v13.4S, v22.4S +trn1 v27.4S, v11.4S, v21.4S +trn2 v12.4S, v11.4S, v21.4S +trn2 v11.2D, v26.2D, v27.2D +trn2 v21.2D, v28.2D, v12.2D +trn1 v13.2D, v26.2D, v27.2D +trn1 v22.2D, v28.2D, v12.2D +sqrdmulh v12.4S, v30.4S, v7.4S +mul v30.4S, v30.4S,v6.4S +mla v30.4S, v12.4S, v31.s[0] +sub v12.4s, v18.4s, v30.4s +add v18.4s, v18.4s, v30.4s +sqrdmulh v30.4S, v29.4S, v7.4S +mul v29.4S, v29.4S,v6.4S +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v8.4s, v29.4s +add v8.4s, v8.4s, v29.4s +sqrdmulh v29.4S, v11.4S, v17.4S +mul v11.4S, v11.4S,v19.4S +mla v11.4S, v29.4S, v31.s[0] +sub v29.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v17.4S +mul v21.4S, v21.4S,v19.4S +mla v21.4S, v11.4S, v31.s[0] +sub v11.4s, v22.4s, v21.4s +add v22.4s, v22.4s, v21.4s +sqrdmulh v21.4S, v8.4S, v10.4S +mul v8.4S, v8.4S,v15.4S +mla v8.4S, v21.4S, v31.s[0] 
+sub v21.4s, v18.4s, v8.4s +add v18.4s, v18.4s, v8.4s +sqrdmulh v8.4S, v30.4S, v16.4S +mul v30.4S, v30.4S,v2.4S +mla v30.4S, v8.4S, v31.s[0] +sub v8.4s, v12.4s, v30.4s +add v12.4s, v12.4s, v30.4s +sqrdmulh v30.4S, v22.4S, v3.4S +mul v22.4S, v22.4S,v20.4S +mla v22.4S, v30.4S, v31.s[0] +sub v30.4s, v13.4s, v22.4s +add v13.4s, v13.4s, v22.4s +sqrdmulh v22.4S, v11.4S, v9.4S +mul v11.4S, v11.4S,v1.4S +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v29.4s, v11.4s +add v29.4s, v29.4s, v11.4s +str q18, [x0, #256] +str q21, [x0, #272] +str q12, [x0, #288] +str q8, [x0, #304] +str q13, [x0, #768] +str q30, [x0, #784] +str q29, [x0, #800] +str q22, [x0, #816] +ldr q9, [x17, #+768] +ldr q1, [x17, #+784] +ldr q3, [x17, #+800] +ldr q20, [x17, #+816] +ldr q17, [x17, #+832] +ldr q19, [x17, #+848] +ldr q0, [x17, #+864] +ldr q14, [x17, #+880] +ldr q22, [x0, #352] +ldr q29, [x0, #368] +ldr q30, [x0, #320] +ldr q13, [x0, #336] +ldr q16, [x17, #+1792] +ldr q2, [x17, #+1808] +ldr q10, [x17, #+1824] +ldr q15, [x17, #+1840] +ldr q7, [x17, #+1856] +ldr q6, [x17, #+1872] +ldr q5, [x17, #+1888] +ldr q4, [x17, #+1904] +ldr q8, [x0, #864] +ldr q12, [x0, #880] +ldr q21, [x0, #832] +ldr q18, [x0, #848] +sqrdmulh v11.4S, v22.4S, v1.s[0] +mul v22.4S, v22.4S,v9.s[0] +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v30.4s, v22.4s +add v30.4s, v30.4s, v22.4s +sqrdmulh v22.4S, v29.4S, v1.s[0] +mul v29.4S, v29.4S,v9.s[0] +mla v29.4S, v22.4S, v31.s[0] +sub v22.4s, v13.4s, v29.4s +add v13.4s, v13.4s, v29.4s +sqrdmulh v29.4S, v8.4S, v2.s[0] +mul v8.4S, v8.4S,v16.s[0] +mla v8.4S, v29.4S, v31.s[0] +sub v29.4s, v21.4s, v8.4s +add v21.4s, v21.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v2.s[0] +mul v12.4S, v12.4S,v16.s[0] +mla v12.4S, v8.4S, v31.s[0] +sub v8.4s, v18.4s, v12.4s +add v18.4s, v18.4s, v12.4s +sqrdmulh v12.4S, v13.4S, v1.s[1] +mul v13.4S, v13.4S,v9.s[1] +mla v13.4S, v12.4S, v31.s[0] +sub v12.4s, v30.4s, v13.4s +add v30.4s, v30.4s, v13.4s +sqrdmulh v13.4S, v22.4S, v1.s[2] +mul v22.4S, v22.4S,v9.s[2] +mla v22.4S, 
v13.4S, v31.s[0] +sub v13.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v18.4S, v2.s[1] +mul v18.4S, v18.4S,v16.s[1] +mla v18.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v18.4s +add v21.4s, v21.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v2.s[2] +mul v8.4S, v8.4S,v16.s[2] +mla v8.4S, v18.4S, v31.s[0] +sub v18.4s, v29.4s, v8.4s +add v29.4s, v29.4s, v8.4s +trn1 v8.4S, v30.4S, v12.4S +trn2 v28.4S, v30.4S, v12.4S +trn1 v27.4S, v11.4S, v13.4S +trn2 v26.4S, v11.4S, v13.4S +trn2 v11.2D, v8.2D, v27.2D +trn2 v13.2D, v28.2D, v26.2D +trn1 v30.2D, v8.2D, v27.2D +trn1 v12.2D, v28.2D, v26.2D +trn1 v26.4S, v21.4S, v22.4S +trn2 v28.4S, v21.4S, v22.4S +trn1 v27.4S, v29.4S, v18.4S +trn2 v8.4S, v29.4S, v18.4S +trn2 v29.2D, v26.2D, v27.2D +trn2 v18.2D, v28.2D, v8.2D +trn1 v21.2D, v26.2D, v27.2D +trn1 v22.2D, v28.2D, v8.2D +sqrdmulh v8.4S, v11.4S, v20.4S +mul v11.4S, v11.4S,v3.4S +mla v11.4S, v8.4S, v31.s[0] +sub v8.4s, v30.4s, v11.4s +add v30.4s, v30.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v20.4S +mul v13.4S, v13.4S,v3.4S +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v12.4s, v13.4s +add v12.4s, v12.4s, v13.4s +sqrdmulh v13.4S, v29.4S, v15.4S +mul v29.4S, v29.4S,v10.4S +mla v29.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v29.4s +add v21.4s, v21.4s, v29.4s +sqrdmulh v29.4S, v18.4S, v15.4S +mul v18.4S, v18.4S,v10.4S +mla v18.4S, v29.4S, v31.s[0] +sub v29.4s, v22.4s, v18.4s +add v22.4s, v22.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v19.4S +mul v12.4S, v12.4S,v17.4S +mla v12.4S, v18.4S, v31.s[0] +sub v18.4s, v30.4s, v12.4s +add v30.4s, v30.4s, v12.4s +sqrdmulh v12.4S, v11.4S, v14.4S +mul v11.4S, v11.4S,v0.4S +mla v11.4S, v12.4S, v31.s[0] +sub v12.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v6.4S +mul v22.4S, v22.4S,v7.4S +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v29.4S, v4.4S +mul v29.4S, v29.4S,v5.4S +mla v29.4S, v22.4S, v31.s[0] +sub v22.4s, v13.4s, v29.4s +add v13.4s, v13.4s, v29.4s +str q30, [x0, #320] 
+str q18, [x0, #336] +str q8, [x0, #352] +str q12, [x0, #368] +str q21, [x0, #832] +str q11, [x0, #848] +str q13, [x0, #864] +str q22, [x0, #880] +ldr q4, [x17, #+896] +ldr q5, [x17, #+912] +ldr q6, [x17, #+928] +ldr q7, [x17, #+944] +ldr q15, [x17, #+960] +ldr q10, [x17, #+976] +ldr q2, [x17, #+992] +ldr q16, [x17, #+1008] +ldr q22, [x0, #416] +ldr q13, [x0, #432] +ldr q11, [x0, #384] +ldr q21, [x0, #400] +ldr q14, [x17, #+1920] +ldr q0, [x17, #+1936] +ldr q19, [x17, #+1952] +ldr q17, [x17, #+1968] +ldr q20, [x17, #+1984] +ldr q3, [x17, #+2000] +ldr q1, [x17, #+2016] +ldr q9, [x17, #+2032] +ldr q12, [x0, #928] +ldr q8, [x0, #944] +ldr q18, [x0, #896] +ldr q30, [x0, #912] +sqrdmulh v29.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v29.4S, v31.s[0] +sub v29.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v12.4S, v0.s[0] +mul v12.4S, v12.4S,v14.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v18.4s, v12.4s +add v18.4s, v18.4s, v12.4s +sqrdmulh v12.4S, v8.4S, v0.s[0] +mul v8.4S, v8.4S,v14.s[0] +mla v8.4S, v12.4S, v31.s[0] +sub v12.4s, v30.4s, v8.4s +add v30.4s, v30.4s, v8.4s +sqrdmulh v8.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v5.s[2] +mul v22.4S, v22.4S,v4.s[2] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v29.4s, v22.4s +add v29.4s, v29.4s, v22.4s +sqrdmulh v22.4S, v30.4S, v0.s[1] +mul v30.4S, v30.4S,v14.s[1] +mla v30.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v30.4s +add v18.4s, v18.4s, v30.4s +sqrdmulh v30.4S, v12.4S, v0.s[2] +mul v12.4S, v12.4S,v14.s[2] +mla v12.4S, v30.4S, v31.s[0] +sub v30.4s, v13.4s, v12.4s +add v13.4s, v13.4s, v12.4s +trn1 v12.4S, v11.4S, v8.4S +trn2 v28.4S, v11.4S, v8.4S +trn1 v27.4S, v29.4S, v21.4S +trn2 v26.4S, v29.4S, v21.4S +trn2 v29.2D, v12.2D, 
v27.2D +trn2 v21.2D, v28.2D, v26.2D +trn1 v11.2D, v12.2D, v27.2D +trn1 v8.2D, v28.2D, v26.2D +trn1 v26.4S, v18.4S, v22.4S +trn2 v28.4S, v18.4S, v22.4S +trn1 v27.4S, v13.4S, v30.4S +trn2 v12.4S, v13.4S, v30.4S +trn2 v13.2D, v26.2D, v27.2D +trn2 v30.2D, v28.2D, v12.2D +trn1 v18.2D, v26.2D, v27.2D +trn1 v22.2D, v28.2D, v12.2D +sqrdmulh v12.4S, v29.4S, v7.4S +mul v29.4S, v29.4S,v6.4S +mla v29.4S, v12.4S, v31.s[0] +sub v12.4s, v11.4s, v29.4s +add v11.4s, v11.4s, v29.4s +sqrdmulh v29.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v29.4S, v31.s[0] +sub v29.4s, v8.4s, v21.4s +add v8.4s, v8.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v17.4S +mul v13.4S, v13.4S,v19.4S +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v18.4s, v13.4s +add v18.4s, v18.4s, v13.4s +sqrdmulh v13.4S, v30.4S, v17.4S +mul v30.4S, v30.4S,v19.4S +mla v30.4S, v13.4S, v31.s[0] +sub v13.4s, v22.4s, v30.4s +add v22.4s, v22.4s, v30.4s +sqrdmulh v30.4S, v8.4S, v10.4S +mul v8.4S, v8.4S,v15.4S +mla v8.4S, v30.4S, v31.s[0] +sub v30.4s, v11.4s, v8.4s +add v11.4s, v11.4s, v8.4s +sqrdmulh v8.4S, v29.4S, v16.4S +mul v29.4S, v29.4S,v2.4S +mla v29.4S, v8.4S, v31.s[0] +sub v8.4s, v12.4s, v29.4s +add v12.4s, v12.4s, v29.4s +sqrdmulh v29.4S, v22.4S, v3.4S +mul v22.4S, v22.4S,v20.4S +mla v22.4S, v29.4S, v31.s[0] +sub v29.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v9.4S +mul v13.4S, v13.4S,v1.4S +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +str q11, [x0, #384] +str q30, [x0, #400] +str q12, [x0, #416] +str q8, [x0, #432] +str q18, [x0, #896] +str q29, [x0, #912] +str q21, [x0, #928] +str q22, [x0, #944] +ldr q9, [x17, #+1024] +ldr q1, [x17, #+1040] +ldr q3, [x17, #+1056] +ldr q20, [x17, #+1072] +ldr q17, [x17, #+1088] +ldr q19, [x17, #+1104] +ldr q0, [x17, #+1120] +ldr q14, [x17, #+1136] +ldr q22, [x0, #480] +ldr q21, [x0, #496] +ldr q29, [x0, #448] +ldr q18, [x0, #464] +ldr q16, [x17, #+2048] +ldr q2, [x17, #+2064] +ldr q10, [x17, #+2080] +ldr q15, [x17, 
#+2096] +ldr q7, [x17, #+2112] +ldr q6, [x17, #+2128] +ldr q5, [x17, #+2144] +ldr q4, [x17, #+2160] +ldr q8, [x0, #992] +ldr q12, [x0, #1008] +ldr q30, [x0, #960] +ldr q11, [x0, #976] +sqrdmulh v13.4S, v22.4S, v1.s[0] +mul v22.4S, v22.4S,v9.s[0] +mla v22.4S, v13.4S, v31.s[0] +sub v13.4s, v29.4s, v22.4s +add v29.4s, v29.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v1.s[0] +mul v21.4S, v21.4S,v9.s[0] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v8.4S, v2.s[0] +mul v8.4S, v8.4S,v16.s[0] +mla v8.4S, v21.4S, v31.s[0] +sub v21.4s, v30.4s, v8.4s +add v30.4s, v30.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v2.s[0] +mul v12.4S, v12.4S,v16.s[0] +mla v12.4S, v8.4S, v31.s[0] +sub v8.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +sqrdmulh v12.4S, v18.4S, v1.s[1] +mul v18.4S, v18.4S,v9.s[1] +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v29.4s, v18.4s +add v29.4s, v29.4s, v18.4s +sqrdmulh v18.4S, v22.4S, v1.s[2] +mul v22.4S, v22.4S,v9.s[2] +mla v22.4S, v18.4S, v31.s[0] +sub v18.4s, v13.4s, v22.4s +add v13.4s, v13.4s, v22.4s +sqrdmulh v22.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v16.s[1] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v30.4s, v11.4s +add v30.4s, v30.4s, v11.4s +sqrdmulh v11.4S, v8.4S, v2.s[2] +mul v8.4S, v8.4S,v16.s[2] +mla v8.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v8.4s +add v21.4s, v21.4s, v8.4s +trn1 v8.4S, v29.4S, v12.4S +trn2 v28.4S, v29.4S, v12.4S +trn1 v27.4S, v13.4S, v18.4S +trn2 v26.4S, v13.4S, v18.4S +trn2 v13.2D, v8.2D, v27.2D +trn2 v18.2D, v28.2D, v26.2D +trn1 v29.2D, v8.2D, v27.2D +trn1 v12.2D, v28.2D, v26.2D +trn1 v26.4S, v30.4S, v22.4S +trn2 v28.4S, v30.4S, v22.4S +trn1 v27.4S, v21.4S, v11.4S +trn2 v8.4S, v21.4S, v11.4S +trn2 v21.2D, v26.2D, v27.2D +trn2 v11.2D, v28.2D, v8.2D +trn1 v30.2D, v26.2D, v27.2D +trn1 v22.2D, v28.2D, v8.2D +sqrdmulh v8.4S, v13.4S, v20.4S +mul v13.4S, v13.4S,v3.4S +mla v13.4S, v8.4S, v31.s[0] +sub v8.4s, v29.4s, v13.4s +add v29.4s, v29.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v20.4S +mul 
v18.4S, v18.4S,v3.4S +mla v18.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v18.4s +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v15.4S +mul v21.4S, v21.4S,v10.4S +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v30.4s, v21.4s +add v30.4s, v30.4s, v21.4s +sqrdmulh v21.4S, v11.4S, v15.4S +mul v11.4S, v11.4S,v10.4S +mla v11.4S, v21.4S, v31.s[0] +sub v21.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v12.4S, v19.4S +mul v12.4S, v12.4S,v17.4S +mla v12.4S, v11.4S, v31.s[0] +sub v11.4s, v29.4s, v12.4s +add v29.4s, v29.4s, v12.4s +sqrdmulh v12.4S, v13.4S, v14.4S +mul v13.4S, v13.4S,v0.4S +mla v13.4S, v12.4S, v31.s[0] +sub v12.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +sqrdmulh v13.4S, v22.4S, v6.4S +mul v22.4S, v22.4S,v7.4S +mla v22.4S, v13.4S, v31.s[0] +sub v13.4s, v30.4s, v22.4s +add v30.4s, v30.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v4.4S +mul v21.4S, v21.4S,v5.4S +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +str q29, [x0, #448] +str q11, [x0, #464] +str q8, [x0, #480] +str q12, [x0, #496] +str q30, [x0, #960] +str q13, [x0, #976] +str q18, [x0, #992] +str q22, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z2_3.s b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z2_3.s new file mode 100644 index 0000000..097a1c9 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z2_3.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, 
free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 
1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 
6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 
+.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, 
block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 
31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 
26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 
+.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // 
Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 
28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 
608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_3_z2_3 +.global _ntt_u32_full_neon_asm_var_4_4_3_z2_3 +ntt_u32_full_neon_asm_var_4_4_3_z2_3: +_ntt_u32_full_neon_asm_var_4_4_3_z2_3: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +sqrdmulh v2.4S, v22.4S, v29.s[0] +ldr q1, [x0, #544] +mul v22.4S, v22.4S,v30.s[0] +ldr q0, [x0, #608] +sqrdmulh v15.4S, v21.4S, v29.s[0] +ldr q14, [x0, #672] +mul v21.4S, v21.4S,v30.s[0] +ldr q13, [x0, #736] +mla v22.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q12, [x0, #32] +sub v11.4s, v18.4s, v22.4s +mla v21.4S, v15.4S, v31.s[0] +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q15, [x0, #96] +sub v10.4s, v17.4s, v21.4s +mla v20.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v1.4S, 
v29.s[0] +ldr q2, [x0, #160] +mul v1.4S, v1.4S,v30.s[0] +sub v9.4s, v16.4s, v20.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v29.s[0] +ldr q22, [x0, #224] +mul v0.4S, v0.4S,v30.s[0] +sub v8.4s, v3.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v12.4s, v1.4s +mla v0.4S, v20.4S, v31.s[0] +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +sub v20.4s, v15.4s, v0.4s +mla v14.4S, v19.4S, v31.s[0] +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v19.4s, v2.4s, v14.4s +mla v13.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v1.4s, v22.4s, v13.4s +mla v16.4S, v0.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v0.4s, v2.4s, v16.4s +mla v3.4S, v14.4S, v31.s[0] +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v22.4s, v3.4s +mla v18.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v13.4s, v12.4s, v18.4s +mla v17.4S, v16.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v16.4s, v15.4s, v17.4s +mla v9.4S, v3.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v3.4s, v19.4s, v9.4s +mla v8.4S, v18.4S, v31.s[0] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v8.4s +mla v11.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v17.4s, v21.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +sub v9.4s, v20.4s, v10.4s 
+mla v2.4S, v8.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v8.4s, v12.4s, v2.4s +mla v22.4S, v11.4S, v31.s[0] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v27.s[1] +mul v14.4S, v14.4S,v28.s[1] +sub v11.4s, v15.4s, v22.4s +mla v0.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v28.s[2] +sub v10.4s, v13.4s, v0.4s +mla v14.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v2.4s, v16.4s, v14.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +sub v22.4s, v21.4s, v19.4s +mla v1.4S, v0.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v0.4s, v20.4s, v1.4s +mla v3.4S, v14.4S, v31.s[0] +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +sub v14.4s, v17.4s, v3.4s +mla v18.4S, v19.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v25.s[1] +mul v11.4S, v11.4S,v26.s[1] +sub v19.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v1.4s, v12.4s, v15.4s +mla v11.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v25.s[3] +mul v2.4S, v2.4S,v26.s[3] +sub v3.4s, v8.4s, v11.4s +mla v16.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v11.4s +str q12, [x0, #32] +sqrdmulh v12.4S, v20.4S, v23.s[0] +str q1, [x0, #96] +mul v20.4S, v20.4S,v24.s[0] +ldr q1, [x0, #816] +sub v11.4s, v13.4s, v16.4s +ldr q18, [x0, #880] +mla v2.4S, v15.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +str q8, [x0, #160] +sqrdmulh v8.4S, v0.4S, v23.s[1] +str q3, [x0, #224] +mul v0.4S, v0.4S,v24.s[1] +ldr q3, [x0, #944] +sub v16.4s, v10.4s, v2.4s +ldr q15, [x0, #1008] +mla v20.4S, v12.4S, v31.s[0] +add v10.4s, v10.4s, v2.4s +str q13, [x0, #288] +sqrdmulh 
v13.4S, v9.4S, v23.s[2] +str q11, [x0, #352] +mul v9.4S, v9.4S,v24.s[2] +ldr q11, [x0, #304] +sub v2.4s, v21.4s, v20.4s +ldr q12, [x0, #368] +mla v0.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v20.4s +str q10, [x0, #416] +sqrdmulh v10.4S, v19.4S, v23.s[3] +str q16, [x0, #480] +mul v19.4S, v19.4S,v24.s[3] +ldr q16, [x0, #432] +sub v20.4s, v22.4s, v0.4s +ldr q8, [x0, #496] +mla v9.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +str q21, [x0, #544] +sqrdmulh v21.4S, v1.4S, v29.s[0] +str q2, [x0, #608] +ldr q2, [x0, #560] +mul v1.4S, v1.4S,v30.s[0] +ldr q0, [x0, #624] +sub v13.4s, v17.4s, v9.4s +mla v19.4S, v10.4S, v31.s[0] +add v17.4s, v17.4s, v9.4s +str q22, [x0, #672] +sqrdmulh v22.4S, v18.4S, v29.s[0] +str q20, [x0, #736] +ldr q20, [x0, #688] +mul v18.4S, v18.4S,v30.s[0] +ldr q9, [x0, #752] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +str q17, [x0, #800] +sqrdmulh v17.4S, v3.4S, v29.s[0] +str q13, [x0, #864] +mul v3.4S, v3.4S,v30.s[0] +ldr q13, [x0, #48] +sub v19.4s, v11.4s, v1.4s +mla v18.4S, v22.4S, v31.s[0] +add v11.4s, v11.4s, v1.4s +str q14, [x0, #928] +sqrdmulh v14.4S, v15.4S, v29.s[0] +str q10, [x0, #992] +mul v15.4S, v15.4S,v30.s[0] +ldr q10, [x0, #112] +sub v1.4s, v12.4s, v18.4s +mla v3.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v29.s[0] +ldr q17, [x0, #176] +mul v2.4S, v2.4S,v30.s[0] +sub v22.4s, v16.4s, v3.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v29.s[0] +ldr q14, [x0, #240] +mul v0.4S, v0.4S,v30.s[0] +sub v21.4s, v8.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v18.4s, v13.4s, v2.4s +mla v0.4S, v3.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +sub v3.4s, v10.4s, v0.4s +mla v20.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v15.4s, 
v17.4s, v20.4s +mla v9.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v2.4s, v14.4s, v9.4s +mla v16.4S, v0.4S, v31.s[0] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v29.s[1] +mul v11.4S, v11.4S,v30.s[1] +sub v0.4s, v17.4s, v16.4s +mla v8.4S, v20.4S, v31.s[0] +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v29.s[1] +mul v12.4S, v12.4S,v30.s[1] +sub v20.4s, v14.4s, v8.4s +mla v11.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v9.4s, v13.4s, v11.4s +mla v12.4S, v16.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v16.4s, v10.4s, v12.4s +mla v22.4S, v8.4S, v31.s[0] +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v8.4s, v15.4s, v22.4s +mla v21.4S, v11.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +sub v11.4s, v2.4s, v21.4s +mla v19.4S, v12.4S, v31.s[0] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +sub v12.4s, v18.4s, v19.4s +mla v1.4S, v22.4S, v31.s[0] +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v22.4s, v3.4s, v1.4s +mla v17.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v21.4s, v13.4s, v17.4s +mla v14.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +sub v19.4s, v10.4s, v14.4s +mla v0.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v27.s[2] +mul v15.4S, v15.4S,v28.s[2] +sub v1.4s, v9.4s, v0.4s +mla v20.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v17.4s, v16.4s, v20.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, 
v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v14.4s, v18.4s, v15.4s +mla v2.4S, v0.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[3] +mul v11.4S, v11.4S,v28.s[3] +sub v0.4s, v3.4s, v2.4s +mla v8.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +sub v20.4s, v12.4s, v8.4s +mla v11.4S, v15.4S, v31.s[0] +add v12.4s, v12.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v15.4s, v22.4s, v11.4s +mla v10.4S, v2.4S, v31.s[0] +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v2.4s, v13.4s, v10.4s +mla v19.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v25.s[3] +mul v17.4S, v17.4S,v26.s[3] +sub v8.4s, v21.4s, v19.4s +mla v16.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +str q13, [x0, #48] +sqrdmulh v13.4S, v3.4S, v23.s[0] +str q2, [x0, #112] +mul v3.4S, v3.4S,v24.s[0] +ldr q2, [x0, #768] +sub v19.4s, v9.4s, v16.4s +ldr q11, [x0, #832] +mla v17.4S, v10.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +str q21, [x0, #176] +sqrdmulh v21.4S, v0.4S, v23.s[1] +str q8, [x0, #240] +mul v0.4S, v0.4S,v24.s[1] +ldr q8, [x0, #896] +sub v16.4s, v1.4s, v17.4s +ldr q10, [x0, #960] +mla v3.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +str q9, [x0, #304] +sqrdmulh v9.4S, v22.4S, v23.s[2] +str q19, [x0, #368] +mul v22.4S, v22.4S,v24.s[2] +ldr q19, [x0, #256] +sub v17.4s, v18.4s, v3.4s +ldr q13, [x0, #320] +mla v0.4S, v21.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +str q1, [x0, #432] +sqrdmulh v1.4S, v15.4S, v23.s[3] +str q16, [x0, #496] +mul v15.4S, v15.4S,v24.s[3] +ldr q16, [x0, #384] +sub v3.4s, v14.4s, v0.4s +ldr q21, [x0, #448] +mla v22.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +str q18, [x0, #560] +sqrdmulh v18.4S, v2.4S, v29.s[0] +str q17, [x0, #624] +ldr q17, [x0, #512] +mul v2.4S, v2.4S,v30.s[0] +ldr q0, [x0, #576] +sub v9.4s, v12.4s, v22.4s +mla v15.4S, v1.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s 
+str q14, [x0, #688] +sqrdmulh v14.4S, v11.4S, v29.s[0] +str q3, [x0, #752] +ldr q3, [x0, #640] +mul v11.4S, v11.4S,v30.s[0] +ldr q22, [x0, #704] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +str q12, [x0, #816] +sqrdmulh v12.4S, v8.4S, v29.s[0] +str q9, [x0, #880] +mul v8.4S, v8.4S,v30.s[0] +ldr q9, [x0, #0] +sub v15.4s, v19.4s, v2.4s +mla v11.4S, v14.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +str q20, [x0, #944] +sqrdmulh v20.4S, v10.4S, v29.s[0] +str q1, [x0, #1008] +mul v10.4S, v10.4S,v30.s[0] +ldr q1, [x0, #64] +sub v2.4s, v13.4s, v11.4s +mla v8.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v29.s[0] +ldr q12, [x0, #128] +mul v17.4S, v17.4S,v30.s[0] +sub v14.4s, v16.4s, v8.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v29.s[0] +ldr q20, [x0, #192] +mul v0.4S, v0.4S,v30.s[0] +sub v18.4s, v21.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +sub v11.4s, v9.4s, v17.4s +mla v0.4S, v8.4S, v31.s[0] +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v8.4s, v1.4s, v0.4s +mla v3.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v10.4s, v12.4s, v3.4s +mla v22.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v17.4s, v20.4s, v22.4s +mla v16.4S, v0.4S, v31.s[0] +add v20.4s, v20.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +sub v0.4s, v12.4s, v16.4s +mla v21.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v3.4s, v20.4s, v21.4s +mla v19.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v22.4s, v9.4s, v19.4s +mla v13.4S, v16.4S, v31.s[0] +add v9.4s, v9.4s, 
v19.4s +sqrdmulh v19.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v16.4s, v1.4s, v13.4s +mla v14.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v21.4s, v10.4s, v14.4s +mla v18.4S, v19.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v19.4s, v17.4s, v18.4s +mla v15.4S, v13.4S, v31.s[0] +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v13.4s, v11.4s, v15.4s +mla v2.4S, v14.4S, v31.s[0] +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v27.s[0] +mul v20.4S, v20.4S,v28.s[0] +sub v14.4s, v8.4s, v2.4s +mla v12.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v18.4s, v9.4s, v12.4s +mla v20.4S, v15.4S, v31.s[0] +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +sub v15.4s, v1.4s, v20.4s +mla v0.4S, v2.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +sub v2.4s, v22.4s, v0.4s +mla v3.4S, v12.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v12.4s, v16.4s, v3.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v11.4s, v10.4s +mla v17.4S, v0.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v27.s[3] +mul v19.4S, v19.4S,v28.s[3] +sub v0.4s, v8.4s, v17.4s +mla v21.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v17.4s +sqrdmulh v17.4S, v1.4S, v25.s[0] +mul v1.4S, v1.4S,v26.s[0] +sub v3.4s, v13.4s, v21.4s +mla v19.4S, v10.4S, v31.s[0] +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v25.s[1] +mul v15.4S, v15.4S,v26.s[1] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v17.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v17.4s, v9.4s, 
v1.4s +mla v15.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +sub v21.4s, v18.4s, v15.4s +mla v16.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +str q9, [x0, #0] +sqrdmulh v9.4S, v8.4S, v23.s[0] +str q17, [x0, #64] +mul v8.4S, v8.4S,v24.s[0] +ldr q17, [x0, #784] +sub v15.4s, v22.4s, v16.4s +ldr q19, [x0, #848] +mla v12.4S, v1.4S, v31.s[0] +add v22.4s, v22.4s, v16.4s +str q18, [x0, #128] +sqrdmulh v18.4S, v0.4S, v23.s[1] +str q21, [x0, #192] +mul v0.4S, v0.4S,v24.s[1] +ldr q21, [x0, #912] +sub v16.4s, v2.4s, v12.4s +ldr q1, [x0, #976] +mla v8.4S, v9.4S, v31.s[0] +add v2.4s, v2.4s, v12.4s +str q22, [x0, #256] +sqrdmulh v22.4S, v14.4S, v23.s[2] +str q15, [x0, #320] +mul v14.4S, v14.4S,v24.s[2] +ldr q15, [x0, #272] +sub v12.4s, v11.4s, v8.4s +ldr q9, [x0, #336] +mla v0.4S, v18.4S, v31.s[0] +add v11.4s, v11.4s, v8.4s +str q2, [x0, #384] +sqrdmulh v2.4S, v10.4S, v23.s[3] +str q16, [x0, #448] +mul v10.4S, v10.4S,v24.s[3] +ldr q16, [x0, #400] +sub v8.4s, v20.4s, v0.4s +ldr q18, [x0, #464] +mla v14.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v0.4s +str q11, [x0, #512] +sqrdmulh v11.4S, v17.4S, v29.s[0] +str q12, [x0, #576] +ldr q12, [x0, #528] +mul v17.4S, v17.4S,v30.s[0] +ldr q0, [x0, #592] +sub v22.4s, v13.4s, v14.4s +mla v10.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v14.4s +str q20, [x0, #640] +sqrdmulh v20.4S, v19.4S, v29.s[0] +str q8, [x0, #704] +ldr q8, [x0, #656] +mul v19.4S, v19.4S,v30.s[0] +ldr q14, [x0, #720] +sub v2.4s, v3.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v3.4s, v3.4s, v10.4s +str q13, [x0, #768] +sqrdmulh v13.4S, v21.4S, v29.s[0] +str q22, [x0, #832] +mul v21.4S, v21.4S,v30.s[0] +ldr q22, [x0, #16] +sub v10.4s, v15.4s, v17.4s +mla v19.4S, v20.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +str q3, [x0, #896] +sqrdmulh v3.4S, v1.4S, v29.s[0] +str q2, [x0, #960] +mul v1.4S, v1.4S,v30.s[0] +ldr q2, [x0, #80] +sub v17.4s, v9.4s, v19.4s +mla v21.4S, v13.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s 
+sqrdmulh v19.4S, v12.4S, v29.s[0] +ldr q13, [x0, #144] +mul v12.4S, v12.4S,v30.s[0] +sub v20.4s, v16.4s, v21.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v29.s[0] +ldr q3, [x0, #208] +mul v0.4S, v0.4S,v30.s[0] +sub v11.4s, v18.4s, v1.4s +mla v12.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v19.4s, v22.4s, v12.4s +mla v0.4S, v21.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v2.4s, v0.4s +mla v8.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v1.4s, v13.4s, v8.4s +mla v14.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v12.4s, v3.4s, v14.4s +mla v16.4S, v0.4S, v31.s[0] +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +sub v0.4s, v13.4s, v16.4s +mla v18.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v9.4S, v29.s[1] +mul v9.4S, v9.4S,v30.s[1] +sub v8.4s, v3.4s, v18.4s +mla v15.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v14.4s, v22.4s, v15.4s +mla v9.4S, v16.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v16.4s, v2.4s, v9.4s +mla v20.4S, v18.4S, v31.s[0] +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v20.4s +mla v11.4S, v15.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v15.4s, v12.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +sub v9.4s, v19.4s, v10.4s +mla v17.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v27.s[0] +mul v3.4S, 
v3.4S,v28.s[0] +sub v20.4s, v21.4s, v17.4s +mla v13.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v11.4s, v22.4s, v13.4s +mla v3.4S, v10.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v10.4s, v2.4s, v3.4s +mla v0.4S, v17.4S, v31.s[0] +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v17.4s, v14.4s, v0.4s +mla v8.4S, v13.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +sub v13.4s, v16.4s, v8.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v3.4s, v19.4s, v1.4s +mla v12.4S, v0.4S, v31.s[0] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v0.4s, v21.4s, v12.4s +mla v18.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v25.s[1] +mul v10.4S, v10.4S,v26.s[1] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v12.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v12.4s, v22.4s, v2.4s +mla v10.4S, v18.4S, v31.s[0] +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v13.4S, v25.s[3] +mul v13.4S, v13.4S,v26.s[3] +sub v18.4s, v11.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +str q22, [x0, #16] +sqrdmulh v22.4S, v21.4S, v23.s[0] +str q12, [x0, #80] +mul v21.4S, v21.4S,v24.s[0] +sub v12.4s, v14.4s, v16.4s +mla v13.4S, v2.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +str q11, [x0, #144] +sqrdmulh v11.4S, v0.4S, v23.s[1] +str q18, [x0, #208] +mul v0.4S, v0.4S,v24.s[1] +sub v18.4s, v17.4s, v13.4s +mla v21.4S, v22.4S, v31.s[0] +add v17.4s, v17.4s, v13.4s +str q14, [x0, #272] +sqrdmulh v14.4S, v20.4S, v23.s[2] +str q12, 
[x0, #336] +mul v20.4S, v20.4S,v24.s[2] +sub v12.4s, v19.4s, v21.4s +mla v0.4S, v11.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +str q17, [x0, #400] +sqrdmulh v17.4S, v1.4S, v23.s[3] +str q18, [x0, #464] +mul v1.4S, v1.4S,v24.s[3] +sub v18.4s, v3.4s, v0.4s +mla v20.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v0.4s +str q19, [x0, #528] +str q12, [x0, #592] +sub v12.4s, v9.4s, v20.4s +mla v1.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v20.4s +str q3, [x0, #656] +str q18, [x0, #720] +sub v18.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +str q9, [x0, #784] +str q12, [x0, #848] +str q8, [x0, #912] +str q18, [x0, #976] +ldr q4, [x0, #32] +ldr q5, [x0, #48] +ldr q6, [x17, #+128] +ldr q7, [x17, #+144] +ldr q15, [x0, #0] +ldr q10, [x0, #16] +ldr q2, [x17, #+1152] +ldr q16, [x17, #+1168] +sqrdmulh v22.4S, v4.4S, v7.s[0] +ldr q13, [x0, #544] +mul v4.4S, v4.4S,v6.s[0] +ldr q11, [x0, #560] +mla v4.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v4.4s +add v15.4s, v15.4s, v4.4s +sqrdmulh v4.4S, v5.4S, v7.s[0] +ldr q21, [x0, #512] +mul v5.4S, v5.4S,v6.s[0] +ldr q14, [x0, #528] +mla v5.4S, v4.4S, v31.s[0] +sub v4.4s, v10.4s, v5.4s +add v10.4s, v10.4s, v5.4s +sqrdmulh v5.4S, v13.4S, v16.s[0] +mul v13.4S, v13.4S,v2.s[0] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v11.4S, v16.s[0] +mul v11.4S, v11.4S,v2.s[0] +mla v11.4S, v13.4S, v31.s[0] +sub v13.4s, v14.4s, v11.4s +add v14.4s, v14.4s, v11.4s +sqrdmulh v11.4S, v10.4S, v7.s[1] +mul v10.4S, v10.4S,v6.s[1] +mla v10.4S, v11.4S, v31.s[0] +sub v11.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +sqrdmulh v10.4S, v4.4S, v7.s[2] +mul v4.4S, v4.4S,v6.s[2] +mla v4.4S, v10.4S, v31.s[0] +sub v10.4s, v22.4s, v4.4s +add v22.4s, v22.4s, v4.4s +sqrdmulh v4.4S, v14.4S, v16.s[1] +mul v14.4S, v14.4S,v2.s[1] +mla v14.4S, v4.4S, v31.s[0] +sub v4.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v13.4S, v16.s[2] +mul v13.4S, v13.4S,v2.s[2] +mla v13.4S, v14.4S, v31.s[0] +sub v14.4s, v5.4s, v13.4s +add 
v5.4s, v5.4s, v13.4s +trn1 v13.4S, v15.4S, v11.4S +trn2 v0.4S, v15.4S, v11.4S +trn1 v19.4S, v22.4S, v10.4S +trn2 v17.4S, v22.4S, v10.4S +trn2 v22.2D, v13.2D, v19.2D +trn2 v10.2D, v0.2D, v17.2D +trn1 v15.2D, v13.2D, v19.2D +trn1 v11.2D, v0.2D, v17.2D +ldr q17, [x17, #+160] +ldr q0, [x17, #+176] +trn1 v19.4S, v21.4S, v4.4S +trn2 v13.4S, v21.4S, v4.4S +trn1 v20.4S, v5.4S, v14.4S +trn2 v3.4S, v5.4S, v14.4S +trn2 v5.2D, v19.2D, v20.2D +trn2 v14.2D, v13.2D, v3.2D +trn1 v21.2D, v19.2D, v20.2D +trn1 v4.2D, v13.2D, v3.2D +ldr q3, [x17, #+1184] +ldr q13, [x17, #+1200] +sqrdmulh v20.4S, v22.4S, v0.4S +mul v22.4S, v22.4S,v17.4S +mla v22.4S, v20.4S, v31.s[0] +sub v20.4s, v15.4s, v22.4s +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v10.4S, v0.4S +mul v10.4S, v10.4S,v17.4S +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v10.4s +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v5.4S, v13.4S +mul v5.4S, v5.4S,v3.4S +mla v5.4S, v10.4S, v31.s[0] +sub v10.4s, v21.4s, v5.4s +add v21.4s, v21.4s, v5.4s +ldr q5, [x17, #+192] +ldr q19, [x17, #+208] +sqrdmulh v1.4S, v14.4S, v13.4S +mul v14.4S, v14.4S,v3.4S +mla v14.4S, v1.4S, v31.s[0] +sub v1.4s, v4.4s, v14.4s +add v4.4s, v4.4s, v14.4s +ldr q14, [x17, #+224] +ldr q9, [x17, #+240] +sqrdmulh v12.4S, v11.4S, v19.4S +mul v11.4S, v11.4S,v5.4S +mla v11.4S, v12.4S, v31.s[0] +sub v12.4s, v15.4s, v11.4s +add v15.4s, v15.4s, v11.4s +ldr q11, [x17, #+1216] +ldr q8, [x17, #+1232] +sqrdmulh v18.4S, v22.4S, v9.4S +mul v22.4S, v22.4S,v14.4S +mla v22.4S, v18.4S, v31.s[0] +sub v18.4s, v20.4s, v22.4s +add v20.4s, v20.4s, v22.4s +ldr q22, [x17, #+1248] +ldr q30, [x17, #+1264] +sqrdmulh v29.4S, v4.4S, v8.4S +ldr q28, [x0, #96] +mul v4.4S, v4.4S,v11.4S +mla v4.4S, v29.4S, v31.s[0] +sub v29.4s, v21.4s, v4.4s +add v21.4s, v21.4s, v4.4s +sqrdmulh v4.4S, v1.4S, v30.4S +ldr q27, [x0, #112] +mul v1.4S, v1.4S,v22.4S +mla v1.4S, v4.4S, v31.s[0] +sub v4.4s, v10.4s, v1.4s +add v10.4s, v10.4s, v1.4s +str q15, [x0, #0] +str q12, [x0, #16] +str q20, [x0, #32] +str q18, [x0, 
#48] +str q21, [x0, #512] +str q29, [x0, #528] +str q10, [x0, #544] +str q4, [x0, #560] +ldr q30, [x17, #+256] +ldr q22, [x17, #+272] +ldr q8, [x0, #64] +ldr q11, [x0, #80] +ldr q13, [x17, #+1280] +ldr q3, [x17, #+1296] +sqrdmulh v16.4S, v28.4S, v22.s[0] +ldr q2, [x0, #608] +mul v28.4S, v28.4S,v30.s[0] +ldr q4, [x0, #624] +mla v28.4S, v16.4S, v31.s[0] +sub v16.4s, v8.4s, v28.4s +add v8.4s, v8.4s, v28.4s +sqrdmulh v28.4S, v27.4S, v22.s[0] +ldr q10, [x0, #576] +mul v27.4S, v27.4S,v30.s[0] +ldr q29, [x0, #592] +mla v27.4S, v28.4S, v31.s[0] +sub v28.4s, v11.4s, v27.4s +add v11.4s, v11.4s, v27.4s +sqrdmulh v27.4S, v2.4S, v3.s[0] +mul v2.4S, v2.4S,v13.s[0] +mla v2.4S, v27.4S, v31.s[0] +sub v27.4s, v10.4s, v2.4s +add v10.4s, v10.4s, v2.4s +sqrdmulh v2.4S, v4.4S, v3.s[0] +mul v4.4S, v4.4S,v13.s[0] +mla v4.4S, v2.4S, v31.s[0] +sub v2.4s, v29.4s, v4.4s +add v29.4s, v29.4s, v4.4s +sqrdmulh v4.4S, v11.4S, v22.s[1] +mul v11.4S, v11.4S,v30.s[1] +mla v11.4S, v4.4S, v31.s[0] +sub v4.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +sqrdmulh v11.4S, v28.4S, v22.s[2] +mul v28.4S, v28.4S,v30.s[2] +mla v28.4S, v11.4S, v31.s[0] +sub v11.4s, v16.4s, v28.4s +add v16.4s, v16.4s, v28.4s +sqrdmulh v28.4S, v29.4S, v3.s[1] +mul v29.4S, v29.4S,v13.s[1] +mla v29.4S, v28.4S, v31.s[0] +sub v28.4s, v10.4s, v29.4s +add v10.4s, v10.4s, v29.4s +sqrdmulh v29.4S, v2.4S, v3.s[2] +mul v2.4S, v2.4S,v13.s[2] +mla v2.4S, v29.4S, v31.s[0] +sub v29.4s, v27.4s, v2.4s +add v27.4s, v27.4s, v2.4s +trn1 v2.4S, v8.4S, v4.4S +trn2 v21.4S, v8.4S, v4.4S +trn1 v9.4S, v16.4S, v11.4S +trn2 v14.4S, v16.4S, v11.4S +trn2 v16.2D, v2.2D, v9.2D +trn2 v11.2D, v21.2D, v14.2D +trn1 v8.2D, v2.2D, v9.2D +trn1 v4.2D, v21.2D, v14.2D +ldr q14, [x17, #+288] +ldr q21, [x17, #+304] +trn1 v9.4S, v10.4S, v28.4S +trn2 v2.4S, v10.4S, v28.4S +trn1 v19.4S, v27.4S, v29.4S +trn2 v5.4S, v27.4S, v29.4S +trn2 v27.2D, v9.2D, v19.2D +trn2 v29.2D, v2.2D, v5.2D +trn1 v10.2D, v9.2D, v19.2D +trn1 v28.2D, v2.2D, v5.2D +ldr q5, [x17, #+1312] +ldr q2, [x17, 
#+1328] +sqrdmulh v19.4S, v16.4S, v21.4S +mul v16.4S, v16.4S,v14.4S +mla v16.4S, v19.4S, v31.s[0] +sub v19.4s, v8.4s, v16.4s +add v8.4s, v8.4s, v16.4s +sqrdmulh v16.4S, v11.4S, v21.4S +mul v11.4S, v11.4S,v14.4S +mla v11.4S, v16.4S, v31.s[0] +sub v16.4s, v4.4s, v11.4s +add v4.4s, v4.4s, v11.4s +sqrdmulh v11.4S, v27.4S, v2.4S +mul v27.4S, v27.4S,v5.4S +mla v27.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v27.4s +add v10.4s, v10.4s, v27.4s +ldr q27, [x17, #+320] +ldr q9, [x17, #+336] +sqrdmulh v0.4S, v29.4S, v2.4S +mul v29.4S, v29.4S,v5.4S +mla v29.4S, v0.4S, v31.s[0] +sub v0.4s, v28.4s, v29.4s +add v28.4s, v28.4s, v29.4s +ldr q29, [x17, #+352] +ldr q17, [x17, #+368] +sqrdmulh v7.4S, v4.4S, v9.4S +mul v4.4S, v4.4S,v27.4S +mla v4.4S, v7.4S, v31.s[0] +sub v7.4s, v8.4s, v4.4s +add v8.4s, v8.4s, v4.4s +ldr q4, [x17, #+1344] +ldr q6, [x17, #+1360] +sqrdmulh v18.4S, v16.4S, v17.4S +mul v16.4S, v16.4S,v29.4S +mla v16.4S, v18.4S, v31.s[0] +sub v18.4s, v19.4s, v16.4s +add v19.4s, v19.4s, v16.4s +ldr q16, [x17, #+1376] +ldr q20, [x17, #+1392] +sqrdmulh v12.4S, v28.4S, v6.4S +ldr q15, [x0, #160] +mul v28.4S, v28.4S,v4.4S +mla v28.4S, v12.4S, v31.s[0] +sub v12.4s, v10.4s, v28.4s +add v10.4s, v10.4s, v28.4s +sqrdmulh v28.4S, v0.4S, v20.4S +ldr q1, [x0, #176] +mul v0.4S, v0.4S,v16.4S +mla v0.4S, v28.4S, v31.s[0] +sub v28.4s, v11.4s, v0.4s +add v11.4s, v11.4s, v0.4s +str q8, [x0, #64] +str q7, [x0, #80] +str q19, [x0, #96] +str q18, [x0, #112] +str q10, [x0, #576] +str q12, [x0, #592] +str q11, [x0, #608] +str q28, [x0, #624] +ldr q20, [x17, #+384] +ldr q16, [x17, #+400] +ldr q6, [x0, #128] +ldr q4, [x0, #144] +ldr q2, [x17, #+1408] +ldr q5, [x17, #+1424] +sqrdmulh v3.4S, v15.4S, v16.s[0] +ldr q13, [x0, #672] +mul v15.4S, v15.4S,v20.s[0] +ldr q28, [x0, #688] +mla v15.4S, v3.4S, v31.s[0] +sub v3.4s, v6.4s, v15.4s +add v6.4s, v6.4s, v15.4s +sqrdmulh v15.4S, v1.4S, v16.s[0] +ldr q11, [x0, #640] +mul v1.4S, v1.4S,v20.s[0] +ldr q12, [x0, #656] +mla v1.4S, v15.4S, v31.s[0] +sub v15.4s, 
v4.4s, v1.4s +add v4.4s, v4.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v2.s[0] +mla v13.4S, v1.4S, v31.s[0] +sub v1.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v28.4S, v5.s[0] +mul v28.4S, v28.4S,v2.s[0] +mla v28.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v28.4s +add v12.4s, v12.4s, v28.4s +sqrdmulh v28.4S, v4.4S, v16.s[1] +mul v4.4S, v4.4S,v20.s[1] +mla v4.4S, v28.4S, v31.s[0] +sub v28.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +sqrdmulh v4.4S, v15.4S, v16.s[2] +mul v15.4S, v15.4S,v20.s[2] +mla v15.4S, v4.4S, v31.s[0] +sub v4.4s, v3.4s, v15.4s +add v3.4s, v3.4s, v15.4s +sqrdmulh v15.4S, v12.4S, v5.s[1] +mul v12.4S, v12.4S,v2.s[1] +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +sqrdmulh v12.4S, v13.4S, v5.s[2] +mul v13.4S, v13.4S,v2.s[2] +mla v13.4S, v12.4S, v31.s[0] +sub v12.4s, v1.4s, v13.4s +add v1.4s, v1.4s, v13.4s +trn1 v13.4S, v6.4S, v28.4S +trn2 v10.4S, v6.4S, v28.4S +trn1 v17.4S, v3.4S, v4.4S +trn2 v29.4S, v3.4S, v4.4S +trn2 v3.2D, v13.2D, v17.2D +trn2 v4.2D, v10.2D, v29.2D +trn1 v6.2D, v13.2D, v17.2D +trn1 v28.2D, v10.2D, v29.2D +ldr q29, [x17, #+416] +ldr q10, [x17, #+432] +trn1 v17.4S, v11.4S, v15.4S +trn2 v13.4S, v11.4S, v15.4S +trn1 v9.4S, v1.4S, v12.4S +trn2 v27.4S, v1.4S, v12.4S +trn2 v1.2D, v17.2D, v9.2D +trn2 v12.2D, v13.2D, v27.2D +trn1 v11.2D, v17.2D, v9.2D +trn1 v15.2D, v13.2D, v27.2D +ldr q27, [x17, #+1440] +ldr q13, [x17, #+1456] +sqrdmulh v9.4S, v3.4S, v10.4S +mul v3.4S, v3.4S,v29.4S +mla v3.4S, v9.4S, v31.s[0] +sub v9.4s, v6.4s, v3.4s +add v6.4s, v6.4s, v3.4s +sqrdmulh v3.4S, v4.4S, v10.4S +mul v4.4S, v4.4S,v29.4S +mla v4.4S, v3.4S, v31.s[0] +sub v3.4s, v28.4s, v4.4s +add v28.4s, v28.4s, v4.4s +sqrdmulh v4.4S, v1.4S, v13.4S +mul v1.4S, v1.4S,v27.4S +mla v1.4S, v4.4S, v31.s[0] +sub v4.4s, v11.4s, v1.4s +add v11.4s, v11.4s, v1.4s +ldr q1, [x17, #+448] +ldr q17, [x17, #+464] +sqrdmulh v21.4S, v12.4S, v13.4S +mul v12.4S, v12.4S,v27.4S +mla v12.4S, v21.4S, v31.s[0] +sub 
v21.4s, v15.4s, v12.4s +add v15.4s, v15.4s, v12.4s +ldr q12, [x17, #+480] +ldr q14, [x17, #+496] +sqrdmulh v22.4S, v28.4S, v17.4S +mul v28.4S, v28.4S,v1.4S +mla v28.4S, v22.4S, v31.s[0] +sub v22.4s, v6.4s, v28.4s +add v6.4s, v6.4s, v28.4s +ldr q28, [x17, #+1472] +ldr q30, [x17, #+1488] +sqrdmulh v18.4S, v3.4S, v14.4S +mul v3.4S, v3.4S,v12.4S +mla v3.4S, v18.4S, v31.s[0] +sub v18.4s, v9.4s, v3.4s +add v9.4s, v9.4s, v3.4s +ldr q3, [x17, #+1504] +ldr q19, [x17, #+1520] +sqrdmulh v7.4S, v15.4S, v30.4S +ldr q8, [x0, #224] +mul v15.4S, v15.4S,v28.4S +mla v15.4S, v7.4S, v31.s[0] +sub v7.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v21.4S, v19.4S +ldr q0, [x0, #240] +mul v21.4S, v21.4S,v3.4S +mla v21.4S, v15.4S, v31.s[0] +sub v15.4s, v4.4s, v21.4s +add v4.4s, v4.4s, v21.4s +str q6, [x0, #128] +str q22, [x0, #144] +str q9, [x0, #160] +str q18, [x0, #176] +str q11, [x0, #640] +str q7, [x0, #656] +str q4, [x0, #672] +str q15, [x0, #688] +ldr q19, [x17, #+512] +ldr q3, [x17, #+528] +ldr q30, [x0, #192] +ldr q28, [x0, #208] +ldr q13, [x17, #+1536] +ldr q27, [x17, #+1552] +sqrdmulh v5.4S, v8.4S, v3.s[0] +ldr q2, [x0, #736] +mul v8.4S, v8.4S,v19.s[0] +ldr q15, [x0, #752] +mla v8.4S, v5.4S, v31.s[0] +sub v5.4s, v30.4s, v8.4s +add v30.4s, v30.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v3.s[0] +ldr q4, [x0, #704] +mul v0.4S, v0.4S,v19.s[0] +ldr q7, [x0, #720] +mla v0.4S, v8.4S, v31.s[0] +sub v8.4s, v28.4s, v0.4s +add v28.4s, v28.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v13.s[0] +mla v2.4S, v0.4S, v31.s[0] +sub v0.4s, v4.4s, v2.4s +add v4.4s, v4.4s, v2.4s +sqrdmulh v2.4S, v15.4S, v27.s[0] +mul v15.4S, v15.4S,v13.s[0] +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v7.4s, v15.4s +add v7.4s, v7.4s, v15.4s +sqrdmulh v15.4S, v28.4S, v3.s[1] +mul v28.4S, v28.4S,v19.s[1] +mla v28.4S, v15.4S, v31.s[0] +sub v15.4s, v30.4s, v28.4s +add v30.4s, v30.4s, v28.4s +sqrdmulh v28.4S, v8.4S, v3.s[2] +mul v8.4S, v8.4S,v19.s[2] +mla v8.4S, v28.4S, v31.s[0] +sub v28.4s, v5.4s, 
v8.4s +add v5.4s, v5.4s, v8.4s +sqrdmulh v8.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v13.s[1] +mla v7.4S, v8.4S, v31.s[0] +sub v8.4s, v4.4s, v7.4s +add v4.4s, v4.4s, v7.4s +sqrdmulh v7.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v13.s[2] +mla v2.4S, v7.4S, v31.s[0] +sub v7.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +trn1 v2.4S, v30.4S, v15.4S +trn2 v11.4S, v30.4S, v15.4S +trn1 v14.4S, v5.4S, v28.4S +trn2 v12.4S, v5.4S, v28.4S +trn2 v5.2D, v2.2D, v14.2D +trn2 v28.2D, v11.2D, v12.2D +trn1 v30.2D, v2.2D, v14.2D +trn1 v15.2D, v11.2D, v12.2D +ldr q12, [x17, #+544] +ldr q11, [x17, #+560] +trn1 v14.4S, v4.4S, v8.4S +trn2 v2.4S, v4.4S, v8.4S +trn1 v17.4S, v0.4S, v7.4S +trn2 v1.4S, v0.4S, v7.4S +trn2 v0.2D, v14.2D, v17.2D +trn2 v7.2D, v2.2D, v1.2D +trn1 v4.2D, v14.2D, v17.2D +trn1 v8.2D, v2.2D, v1.2D +ldr q1, [x17, #+1568] +ldr q2, [x17, #+1584] +sqrdmulh v17.4S, v5.4S, v11.4S +mul v5.4S, v5.4S,v12.4S +mla v5.4S, v17.4S, v31.s[0] +sub v17.4s, v30.4s, v5.4s +add v30.4s, v30.4s, v5.4s +sqrdmulh v5.4S, v28.4S, v11.4S +mul v28.4S, v28.4S,v12.4S +mla v28.4S, v5.4S, v31.s[0] +sub v5.4s, v15.4s, v28.4s +add v15.4s, v15.4s, v28.4s +sqrdmulh v28.4S, v0.4S, v2.4S +mul v0.4S, v0.4S,v1.4S +mla v0.4S, v28.4S, v31.s[0] +sub v28.4s, v4.4s, v0.4s +add v4.4s, v4.4s, v0.4s +ldr q0, [x17, #+576] +ldr q14, [x17, #+592] +sqrdmulh v10.4S, v7.4S, v2.4S +mul v7.4S, v7.4S,v1.4S +mla v7.4S, v10.4S, v31.s[0] +sub v10.4s, v8.4s, v7.4s +add v8.4s, v8.4s, v7.4s +ldr q7, [x17, #+608] +ldr q29, [x17, #+624] +sqrdmulh v16.4S, v15.4S, v14.4S +mul v15.4S, v15.4S,v0.4S +mla v15.4S, v16.4S, v31.s[0] +sub v16.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +ldr q15, [x17, #+1600] +ldr q20, [x17, #+1616] +sqrdmulh v18.4S, v5.4S, v29.4S +mul v5.4S, v5.4S,v7.4S +mla v5.4S, v18.4S, v31.s[0] +sub v18.4s, v17.4s, v5.4s +add v17.4s, v17.4s, v5.4s +ldr q5, [x17, #+1632] +ldr q9, [x17, #+1648] +sqrdmulh v22.4S, v8.4S, v20.4S +ldr q6, [x0, #288] +mul v8.4S, v8.4S,v15.4S +mla v8.4S, v22.4S, v31.s[0] +sub v22.4s, v4.4s, v8.4s +add 
v4.4s, v4.4s, v8.4s +sqrdmulh v8.4S, v10.4S, v9.4S +ldr q21, [x0, #304] +mul v10.4S, v10.4S,v5.4S +mla v10.4S, v8.4S, v31.s[0] +sub v8.4s, v28.4s, v10.4s +add v28.4s, v28.4s, v10.4s +str q30, [x0, #192] +str q16, [x0, #208] +str q17, [x0, #224] +str q18, [x0, #240] +str q4, [x0, #704] +str q22, [x0, #720] +str q28, [x0, #736] +str q8, [x0, #752] +ldr q9, [x17, #+640] +ldr q5, [x17, #+656] +ldr q20, [x0, #256] +ldr q15, [x0, #272] +ldr q2, [x17, #+1664] +ldr q1, [x17, #+1680] +sqrdmulh v27.4S, v6.4S, v5.s[0] +ldr q13, [x0, #800] +mul v6.4S, v6.4S,v9.s[0] +ldr q8, [x0, #816] +mla v6.4S, v27.4S, v31.s[0] +sub v27.4s, v20.4s, v6.4s +add v20.4s, v20.4s, v6.4s +sqrdmulh v6.4S, v21.4S, v5.s[0] +ldr q28, [x0, #768] +mul v21.4S, v21.4S,v9.s[0] +ldr q22, [x0, #784] +mla v21.4S, v6.4S, v31.s[0] +sub v6.4s, v15.4s, v21.4s +add v15.4s, v15.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v1.s[0] +mul v13.4S, v13.4S,v2.s[0] +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v28.4s, v13.4s +add v28.4s, v28.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v1.s[0] +mul v8.4S, v8.4S,v2.s[0] +mla v8.4S, v13.4S, v31.s[0] +sub v13.4s, v22.4s, v8.4s +add v22.4s, v22.4s, v8.4s +sqrdmulh v8.4S, v15.4S, v5.s[1] +mul v15.4S, v15.4S,v9.s[1] +mla v15.4S, v8.4S, v31.s[0] +sub v8.4s, v20.4s, v15.4s +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v6.4S, v5.s[2] +mul v6.4S, v6.4S,v9.s[2] +mla v6.4S, v15.4S, v31.s[0] +sub v15.4s, v27.4s, v6.4s +add v27.4s, v27.4s, v6.4s +sqrdmulh v6.4S, v22.4S, v1.s[1] +mul v22.4S, v22.4S,v2.s[1] +mla v22.4S, v6.4S, v31.s[0] +sub v6.4s, v28.4s, v22.4s +add v28.4s, v28.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v1.s[2] +mul v13.4S, v13.4S,v2.s[2] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +trn1 v13.4S, v20.4S, v8.4S +trn2 v4.4S, v20.4S, v8.4S +trn1 v29.4S, v27.4S, v15.4S +trn2 v7.4S, v27.4S, v15.4S +trn2 v27.2D, v13.2D, v29.2D +trn2 v15.2D, v4.2D, v7.2D +trn1 v20.2D, v13.2D, v29.2D +trn1 v8.2D, v4.2D, v7.2D +ldr q7, [x17, #+672] +ldr q4, [x17, #+688] +trn1 
v29.4S, v28.4S, v6.4S +trn2 v13.4S, v28.4S, v6.4S +trn1 v14.4S, v21.4S, v22.4S +trn2 v0.4S, v21.4S, v22.4S +trn2 v21.2D, v29.2D, v14.2D +trn2 v22.2D, v13.2D, v0.2D +trn1 v28.2D, v29.2D, v14.2D +trn1 v6.2D, v13.2D, v0.2D +ldr q0, [x17, #+1696] +ldr q13, [x17, #+1712] +sqrdmulh v14.4S, v27.4S, v4.4S +mul v27.4S, v27.4S,v7.4S +mla v27.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v27.4s +add v20.4s, v20.4s, v27.4s +sqrdmulh v27.4S, v15.4S, v4.4S +mul v15.4S, v15.4S,v7.4S +mla v15.4S, v27.4S, v31.s[0] +sub v27.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +sqrdmulh v15.4S, v21.4S, v13.4S +mul v21.4S, v21.4S,v0.4S +mla v21.4S, v15.4S, v31.s[0] +sub v15.4s, v28.4s, v21.4s +add v28.4s, v28.4s, v21.4s +ldr q21, [x17, #+704] +ldr q29, [x17, #+720] +sqrdmulh v11.4S, v22.4S, v13.4S +mul v22.4S, v22.4S,v0.4S +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v6.4s, v22.4s +add v6.4s, v6.4s, v22.4s +ldr q22, [x17, #+736] +ldr q12, [x17, #+752] +sqrdmulh v3.4S, v8.4S, v29.4S +mul v8.4S, v8.4S,v21.4S +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v8.4s +add v20.4s, v20.4s, v8.4s +ldr q8, [x17, #+1728] +ldr q19, [x17, #+1744] +sqrdmulh v18.4S, v27.4S, v12.4S +mul v27.4S, v27.4S,v22.4S +mla v27.4S, v18.4S, v31.s[0] +sub v18.4s, v14.4s, v27.4s +add v14.4s, v14.4s, v27.4s +ldr q27, [x17, #+1760] +ldr q17, [x17, #+1776] +sqrdmulh v16.4S, v6.4S, v19.4S +ldr q30, [x0, #352] +mul v6.4S, v6.4S,v8.4S +mla v6.4S, v16.4S, v31.s[0] +sub v16.4s, v28.4s, v6.4s +add v28.4s, v28.4s, v6.4s +sqrdmulh v6.4S, v11.4S, v17.4S +ldr q10, [x0, #368] +mul v11.4S, v11.4S,v27.4S +mla v11.4S, v6.4S, v31.s[0] +sub v6.4s, v15.4s, v11.4s +add v15.4s, v15.4s, v11.4s +str q20, [x0, #256] +str q3, [x0, #272] +str q14, [x0, #288] +str q18, [x0, #304] +str q28, [x0, #768] +str q16, [x0, #784] +str q15, [x0, #800] +str q6, [x0, #816] +ldr q17, [x17, #+768] +ldr q27, [x17, #+784] +ldr q19, [x0, #320] +ldr q8, [x0, #336] +ldr q13, [x17, #+1792] +ldr q0, [x17, #+1808] +sqrdmulh v1.4S, v30.4S, v27.s[0] +ldr q2, [x0, #864] +mul v30.4S, 
v30.4S,v17.s[0] +ldr q6, [x0, #880] +mla v30.4S, v1.4S, v31.s[0] +sub v1.4s, v19.4s, v30.4s +add v19.4s, v19.4s, v30.4s +sqrdmulh v30.4S, v10.4S, v27.s[0] +ldr q15, [x0, #832] +mul v10.4S, v10.4S,v17.s[0] +ldr q16, [x0, #848] +mla v10.4S, v30.4S, v31.s[0] +sub v30.4s, v8.4s, v10.4s +add v8.4s, v8.4s, v10.4s +sqrdmulh v10.4S, v2.4S, v0.s[0] +mul v2.4S, v2.4S,v13.s[0] +mla v2.4S, v10.4S, v31.s[0] +sub v10.4s, v15.4s, v2.4s +add v15.4s, v15.4s, v2.4s +sqrdmulh v2.4S, v6.4S, v0.s[0] +mul v6.4S, v6.4S,v13.s[0] +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v16.4s, v6.4s +add v16.4s, v16.4s, v6.4s +sqrdmulh v6.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v17.s[1] +mla v8.4S, v6.4S, v31.s[0] +sub v6.4s, v19.4s, v8.4s +add v19.4s, v19.4s, v8.4s +sqrdmulh v8.4S, v30.4S, v27.s[2] +mul v30.4S, v30.4S,v17.s[2] +mla v30.4S, v8.4S, v31.s[0] +sub v8.4s, v1.4s, v30.4s +add v1.4s, v1.4s, v30.4s +sqrdmulh v30.4S, v16.4S, v0.s[1] +mul v16.4S, v16.4S,v13.s[1] +mla v16.4S, v30.4S, v31.s[0] +sub v30.4s, v15.4s, v16.4s +add v15.4s, v15.4s, v16.4s +sqrdmulh v16.4S, v2.4S, v0.s[2] +mul v2.4S, v2.4S,v13.s[2] +mla v2.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v2.4s +add v10.4s, v10.4s, v2.4s +trn1 v2.4S, v19.4S, v6.4S +trn2 v28.4S, v19.4S, v6.4S +trn1 v12.4S, v1.4S, v8.4S +trn2 v22.4S, v1.4S, v8.4S +trn2 v1.2D, v2.2D, v12.2D +trn2 v8.2D, v28.2D, v22.2D +trn1 v19.2D, v2.2D, v12.2D +trn1 v6.2D, v28.2D, v22.2D +ldr q22, [x17, #+800] +ldr q28, [x17, #+816] +trn1 v12.4S, v15.4S, v30.4S +trn2 v2.4S, v15.4S, v30.4S +trn1 v29.4S, v10.4S, v16.4S +trn2 v21.4S, v10.4S, v16.4S +trn2 v10.2D, v12.2D, v29.2D +trn2 v16.2D, v2.2D, v21.2D +trn1 v15.2D, v12.2D, v29.2D +trn1 v30.2D, v2.2D, v21.2D +ldr q21, [x17, #+1824] +ldr q2, [x17, #+1840] +sqrdmulh v29.4S, v1.4S, v28.4S +mul v1.4S, v1.4S,v22.4S +mla v1.4S, v29.4S, v31.s[0] +sub v29.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v28.4S +mul v8.4S, v8.4S,v22.4S +mla v8.4S, v1.4S, v31.s[0] +sub v1.4s, v6.4s, v8.4s +add v6.4s, v6.4s, v8.4s +sqrdmulh 
v8.4S, v10.4S, v2.4S +mul v10.4S, v10.4S,v21.4S +mla v10.4S, v8.4S, v31.s[0] +sub v8.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +ldr q10, [x17, #+832] +ldr q12, [x17, #+848] +sqrdmulh v4.4S, v16.4S, v2.4S +mul v16.4S, v16.4S,v21.4S +mla v16.4S, v4.4S, v31.s[0] +sub v4.4s, v30.4s, v16.4s +add v30.4s, v30.4s, v16.4s +ldr q16, [x17, #+864] +ldr q7, [x17, #+880] +sqrdmulh v5.4S, v6.4S, v12.4S +mul v6.4S, v6.4S,v10.4S +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v19.4s, v6.4s +add v19.4s, v19.4s, v6.4s +ldr q6, [x17, #+1856] +ldr q9, [x17, #+1872] +sqrdmulh v18.4S, v1.4S, v7.4S +mul v1.4S, v1.4S,v16.4S +mla v1.4S, v18.4S, v31.s[0] +sub v18.4s, v29.4s, v1.4s +add v29.4s, v29.4s, v1.4s +ldr q1, [x17, #+1888] +ldr q14, [x17, #+1904] +sqrdmulh v3.4S, v30.4S, v9.4S +ldr q20, [x0, #416] +mul v30.4S, v30.4S,v6.4S +mla v30.4S, v3.4S, v31.s[0] +sub v3.4s, v15.4s, v30.4s +add v15.4s, v15.4s, v30.4s +sqrdmulh v30.4S, v4.4S, v14.4S +ldr q11, [x0, #432] +mul v4.4S, v4.4S,v1.4S +mla v4.4S, v30.4S, v31.s[0] +sub v30.4s, v8.4s, v4.4s +add v8.4s, v8.4s, v4.4s +str q19, [x0, #320] +str q5, [x0, #336] +str q29, [x0, #352] +str q18, [x0, #368] +str q15, [x0, #832] +str q3, [x0, #848] +str q8, [x0, #864] +str q30, [x0, #880] +ldr q14, [x17, #+896] +ldr q1, [x17, #+912] +ldr q9, [x0, #384] +ldr q6, [x0, #400] +ldr q2, [x17, #+1920] +ldr q21, [x17, #+1936] +sqrdmulh v0.4S, v20.4S, v1.s[0] +ldr q13, [x0, #928] +mul v20.4S, v20.4S,v14.s[0] +ldr q30, [x0, #944] +mla v20.4S, v0.4S, v31.s[0] +sub v0.4s, v9.4s, v20.4s +add v9.4s, v9.4s, v20.4s +sqrdmulh v20.4S, v11.4S, v1.s[0] +ldr q8, [x0, #896] +mul v11.4S, v11.4S,v14.s[0] +ldr q3, [x0, #912] +mla v11.4S, v20.4S, v31.s[0] +sub v20.4s, v6.4s, v11.4s +add v6.4s, v6.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v21.s[0] +mul v13.4S, v13.4S,v2.s[0] +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +sqrdmulh v13.4S, v30.4S, v21.s[0] +mul v30.4S, v30.4S,v2.s[0] +mla v30.4S, v13.4S, v31.s[0] +sub v13.4s, v3.4s, v30.4s +add v3.4s, 
v3.4s, v30.4s +sqrdmulh v30.4S, v6.4S, v1.s[1] +mul v6.4S, v6.4S,v14.s[1] +mla v6.4S, v30.4S, v31.s[0] +sub v30.4s, v9.4s, v6.4s +add v9.4s, v9.4s, v6.4s +sqrdmulh v6.4S, v20.4S, v1.s[2] +mul v20.4S, v20.4S,v14.s[2] +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v20.4s +add v0.4s, v0.4s, v20.4s +sqrdmulh v20.4S, v3.4S, v21.s[1] +mul v3.4S, v3.4S,v2.s[1] +mla v3.4S, v20.4S, v31.s[0] +sub v20.4s, v8.4s, v3.4s +add v8.4s, v8.4s, v3.4s +sqrdmulh v3.4S, v13.4S, v21.s[2] +mul v13.4S, v13.4S,v2.s[2] +mla v13.4S, v3.4S, v31.s[0] +sub v3.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +trn1 v13.4S, v9.4S, v30.4S +trn2 v15.4S, v9.4S, v30.4S +trn1 v7.4S, v0.4S, v6.4S +trn2 v16.4S, v0.4S, v6.4S +trn2 v0.2D, v13.2D, v7.2D +trn2 v6.2D, v15.2D, v16.2D +trn1 v9.2D, v13.2D, v7.2D +trn1 v30.2D, v15.2D, v16.2D +ldr q16, [x17, #+928] +ldr q15, [x17, #+944] +trn1 v7.4S, v8.4S, v20.4S +trn2 v13.4S, v8.4S, v20.4S +trn1 v12.4S, v11.4S, v3.4S +trn2 v10.4S, v11.4S, v3.4S +trn2 v11.2D, v7.2D, v12.2D +trn2 v3.2D, v13.2D, v10.2D +trn1 v8.2D, v7.2D, v12.2D +trn1 v20.2D, v13.2D, v10.2D +ldr q10, [x17, #+1952] +ldr q13, [x17, #+1968] +sqrdmulh v12.4S, v0.4S, v15.4S +mul v0.4S, v0.4S,v16.4S +mla v0.4S, v12.4S, v31.s[0] +sub v12.4s, v9.4s, v0.4s +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v6.4S, v15.4S +mul v6.4S, v6.4S,v16.4S +mla v6.4S, v0.4S, v31.s[0] +sub v0.4s, v30.4s, v6.4s +add v30.4s, v30.4s, v6.4s +sqrdmulh v6.4S, v11.4S, v13.4S +mul v11.4S, v11.4S,v10.4S +mla v11.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +ldr q11, [x17, #+960] +ldr q7, [x17, #+976] +sqrdmulh v28.4S, v3.4S, v13.4S +mul v3.4S, v3.4S,v10.4S +mla v3.4S, v28.4S, v31.s[0] +sub v28.4s, v20.4s, v3.4s +add v20.4s, v20.4s, v3.4s +ldr q3, [x17, #+992] +ldr q22, [x17, #+1008] +sqrdmulh v27.4S, v30.4S, v7.4S +mul v30.4S, v30.4S,v11.4S +mla v30.4S, v27.4S, v31.s[0] +sub v27.4s, v9.4s, v30.4s +add v9.4s, v9.4s, v30.4s +ldr q30, [x17, #+1984] +ldr q17, [x17, #+2000] +sqrdmulh v18.4S, v0.4S, v22.4S +mul v0.4S, 
v0.4S,v3.4S +mla v0.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v0.4s +add v12.4s, v12.4s, v0.4s +ldr q0, [x17, #+2016] +ldr q29, [x17, #+2032] +sqrdmulh v5.4S, v20.4S, v17.4S +ldr q19, [x0, #480] +mul v20.4S, v20.4S,v30.4S +mla v20.4S, v5.4S, v31.s[0] +sub v5.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +sqrdmulh v20.4S, v28.4S, v29.4S +ldr q4, [x0, #496] +mul v28.4S, v28.4S,v0.4S +mla v28.4S, v20.4S, v31.s[0] +sub v20.4s, v6.4s, v28.4s +add v6.4s, v6.4s, v28.4s +str q9, [x0, #384] +str q27, [x0, #400] +str q12, [x0, #416] +str q18, [x0, #432] +str q8, [x0, #896] +str q5, [x0, #912] +str q6, [x0, #928] +str q20, [x0, #944] +ldr q29, [x17, #+1024] +ldr q0, [x17, #+1040] +ldr q17, [x0, #448] +ldr q30, [x0, #464] +ldr q13, [x17, #+2048] +ldr q10, [x17, #+2064] +sqrdmulh v21.4S, v19.4S, v0.s[0] +ldr q2, [x0, #992] +mul v19.4S, v19.4S,v29.s[0] +ldr q20, [x0, #1008] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v17.4s, v19.4s +add v17.4s, v17.4s, v19.4s +sqrdmulh v19.4S, v4.4S, v0.s[0] +ldr q6, [x0, #960] +mul v4.4S, v4.4S,v29.s[0] +ldr q5, [x0, #976] +mla v4.4S, v19.4S, v31.s[0] +sub v19.4s, v30.4s, v4.4s +add v30.4s, v30.4s, v4.4s +sqrdmulh v4.4S, v2.4S, v10.s[0] +mul v2.4S, v2.4S,v13.s[0] +mla v2.4S, v4.4S, v31.s[0] +sub v4.4s, v6.4s, v2.4s +add v6.4s, v6.4s, v2.4s +sqrdmulh v2.4S, v20.4S, v10.s[0] +mul v20.4S, v20.4S,v13.s[0] +mla v20.4S, v2.4S, v31.s[0] +sub v2.4s, v5.4s, v20.4s +add v5.4s, v5.4s, v20.4s +sqrdmulh v20.4S, v30.4S, v0.s[1] +mul v30.4S, v30.4S,v29.s[1] +mla v30.4S, v20.4S, v31.s[0] +sub v20.4s, v17.4s, v30.4s +add v17.4s, v17.4s, v30.4s +sqrdmulh v30.4S, v19.4S, v0.s[2] +mul v19.4S, v19.4S,v29.s[2] +mla v19.4S, v30.4S, v31.s[0] +sub v30.4s, v21.4s, v19.4s +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v5.4S, v10.s[1] +mul v5.4S, v5.4S,v13.s[1] +mla v5.4S, v19.4S, v31.s[0] +sub v19.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v2.4S, v10.s[2] +mul v2.4S, v2.4S,v13.s[2] +mla v2.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v2.4s +add v4.4s, v4.4s, v2.4s 
+trn1 v2.4S, v17.4S, v20.4S +trn2 v8.4S, v17.4S, v20.4S +trn1 v22.4S, v21.4S, v30.4S +trn2 v3.4S, v21.4S, v30.4S +trn2 v21.2D, v2.2D, v22.2D +trn2 v30.2D, v8.2D, v3.2D +trn1 v17.2D, v2.2D, v22.2D +trn1 v20.2D, v8.2D, v3.2D +ldr q3, [x17, #+1056] +ldr q8, [x17, #+1072] +trn1 v22.4S, v6.4S, v19.4S +trn2 v2.4S, v6.4S, v19.4S +trn1 v7.4S, v4.4S, v5.4S +trn2 v11.4S, v4.4S, v5.4S +trn2 v4.2D, v22.2D, v7.2D +trn2 v5.2D, v2.2D, v11.2D +trn1 v6.2D, v22.2D, v7.2D +trn1 v19.2D, v2.2D, v11.2D +ldr q11, [x17, #+2080] +ldr q2, [x17, #+2096] +sqrdmulh v7.4S, v21.4S, v8.4S +mul v21.4S, v21.4S,v3.4S +mla v21.4S, v7.4S, v31.s[0] +sub v7.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v30.4S, v8.4S +mul v30.4S, v30.4S,v3.4S +mla v30.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v30.4s +add v20.4s, v20.4s, v30.4s +sqrdmulh v30.4S, v4.4S, v2.4S +mul v4.4S, v4.4S,v11.4S +mla v4.4S, v30.4S, v31.s[0] +sub v30.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +ldr q4, [x17, #+1088] +ldr q22, [x17, #+1104] +sqrdmulh v15.4S, v5.4S, v2.4S +mul v5.4S, v5.4S,v11.4S +mla v5.4S, v15.4S, v31.s[0] +sub v15.4s, v19.4s, v5.4s +add v19.4s, v19.4s, v5.4s +ldr q5, [x17, #+1120] +ldr q16, [x17, #+1136] +sqrdmulh v1.4S, v20.4S, v22.4S +mul v20.4S, v20.4S,v4.4S +mla v20.4S, v1.4S, v31.s[0] +sub v1.4s, v17.4s, v20.4s +add v17.4s, v17.4s, v20.4s +ldr q20, [x17, #+2112] +ldr q14, [x17, #+2128] +sqrdmulh v18.4S, v21.4S, v16.4S +mul v21.4S, v21.4S,v5.4S +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v7.4s, v21.4s +add v7.4s, v7.4s, v21.4s +ldr q21, [x17, #+2144] +ldr q12, [x17, #+2160] +sqrdmulh v27.4S, v19.4S, v14.4S +mul v19.4S, v19.4S,v20.4S +mla v19.4S, v27.4S, v31.s[0] +sub v27.4s, v6.4s, v19.4s +add v6.4s, v6.4s, v19.4s +sqrdmulh v19.4S, v15.4S, v12.4S +mul v15.4S, v15.4S,v21.4S +mla v15.4S, v19.4S, v31.s[0] +sub v19.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +str q17, [x0, #448] +str q1, [x0, #464] +str q7, [x0, #480] +str q18, [x0, #496] +str q6, [x0, #960] +str q27, [x0, #976] +str q30, [x0, #992] 
+str q19, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z2_4.s b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z2_4.s new file mode 100644 index 0000000..b9b1089 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z2_4.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 
11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // 
Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 
1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 
23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 
19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 
// Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 
1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 +.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // 
Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 
7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // 
Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_3_z2_4 +.global _ntt_u32_full_neon_asm_var_4_4_3_z2_4 +ntt_u32_full_neon_asm_var_4_4_3_z2_4: +_ntt_u32_full_neon_asm_var_4_4_3_z2_4: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +sqrdmulh v2.4S, v22.4S, v29.s[0] +ldr q1, [x0, #544] +mul v22.4S, v22.4S,v30.s[0] +ldr q0, [x0, #608] +sqrdmulh v15.4S, v21.4S, v29.s[0] +ldr q14, [x0, #672] +mul v21.4S, v21.4S,v30.s[0] +ldr q13, [x0, #736] +mla v22.4S, v2.4S, v31.s[0] 
+sqrdmulh v2.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q12, [x0, #32] +sub v11.4s, v18.4s, v22.4s +mla v21.4S, v15.4S, v31.s[0] +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q15, [x0, #96] +sub v10.4s, v17.4s, v21.4s +mla v20.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v29.s[0] +ldr q2, [x0, #160] +mul v1.4S, v1.4S,v30.s[0] +sub v9.4s, v16.4s, v20.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v29.s[0] +ldr q22, [x0, #224] +mul v0.4S, v0.4S,v30.s[0] +sub v8.4s, v3.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v12.4s, v1.4s +mla v0.4S, v20.4S, v31.s[0] +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +sub v20.4s, v15.4s, v0.4s +mla v14.4S, v19.4S, v31.s[0] +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v19.4s, v2.4s, v14.4s +mla v13.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v1.4s, v22.4s, v13.4s +mla v16.4S, v0.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v0.4s, v2.4s, v16.4s +mla v3.4S, v14.4S, v31.s[0] +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v22.4s, v3.4s +mla v18.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v13.4s, v12.4s, v18.4s +mla v17.4S, v16.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v16.4s, v15.4s, v17.4s +mla v9.4S, v3.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v3.4s, v19.4s, v9.4s +mla v8.4S, v18.4S, v31.s[0] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, 
v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v8.4s +mla v11.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v17.4s, v21.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +sub v9.4s, v20.4s, v10.4s +mla v2.4S, v8.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v8.4s, v12.4s, v2.4s +mla v22.4S, v11.4S, v31.s[0] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v27.s[1] +mul v14.4S, v14.4S,v28.s[1] +sub v11.4s, v15.4s, v22.4s +mla v0.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v28.s[2] +sub v10.4s, v13.4s, v0.4s +mla v14.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v2.4s, v16.4s, v14.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +sub v22.4s, v21.4s, v19.4s +mla v1.4S, v0.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v0.4s, v20.4s, v1.4s +mla v3.4S, v14.4S, v31.s[0] +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +sub v14.4s, v17.4s, v3.4s +mla v18.4S, v19.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v25.s[1] +mul v11.4S, v11.4S,v26.s[1] +sub v19.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v1.4s, v12.4s, v15.4s +mla v11.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v25.s[3] +mul v2.4S, v2.4S,v26.s[3] +sub v3.4s, v8.4s, v11.4s +mla v16.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v11.4s +str q12, [x0, #32] +sqrdmulh v12.4S, v20.4S, v23.s[0] +str q1, [x0, #96] +mul v20.4S, v20.4S,v24.s[0] +ldr q1, [x0, #816] 
+sub v11.4s, v13.4s, v16.4s +ldr q18, [x0, #880] +mla v2.4S, v15.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +str q8, [x0, #160] +sqrdmulh v8.4S, v0.4S, v23.s[1] +str q3, [x0, #224] +mul v0.4S, v0.4S,v24.s[1] +ldr q3, [x0, #944] +sub v16.4s, v10.4s, v2.4s +ldr q15, [x0, #1008] +mla v20.4S, v12.4S, v31.s[0] +add v10.4s, v10.4s, v2.4s +str q13, [x0, #288] +sqrdmulh v13.4S, v9.4S, v23.s[2] +str q11, [x0, #352] +mul v9.4S, v9.4S,v24.s[2] +ldr q11, [x0, #304] +sub v2.4s, v21.4s, v20.4s +ldr q12, [x0, #368] +mla v0.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v20.4s +str q10, [x0, #416] +sqrdmulh v10.4S, v19.4S, v23.s[3] +str q16, [x0, #480] +mul v19.4S, v19.4S,v24.s[3] +ldr q16, [x0, #432] +sub v20.4s, v22.4s, v0.4s +ldr q8, [x0, #496] +mla v9.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +str q21, [x0, #544] +sqrdmulh v21.4S, v1.4S, v29.s[0] +str q2, [x0, #608] +ldr q2, [x0, #560] +mul v1.4S, v1.4S,v30.s[0] +ldr q0, [x0, #624] +sub v13.4s, v17.4s, v9.4s +mla v19.4S, v10.4S, v31.s[0] +add v17.4s, v17.4s, v9.4s +str q22, [x0, #672] +sqrdmulh v22.4S, v18.4S, v29.s[0] +str q20, [x0, #736] +ldr q20, [x0, #688] +mul v18.4S, v18.4S,v30.s[0] +ldr q9, [x0, #752] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +str q17, [x0, #800] +sqrdmulh v17.4S, v3.4S, v29.s[0] +str q13, [x0, #864] +mul v3.4S, v3.4S,v30.s[0] +ldr q13, [x0, #48] +sub v19.4s, v11.4s, v1.4s +mla v18.4S, v22.4S, v31.s[0] +add v11.4s, v11.4s, v1.4s +str q14, [x0, #928] +sqrdmulh v14.4S, v15.4S, v29.s[0] +str q10, [x0, #992] +mul v15.4S, v15.4S,v30.s[0] +ldr q10, [x0, #112] +sub v1.4s, v12.4s, v18.4s +mla v3.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v29.s[0] +ldr q17, [x0, #176] +mul v2.4S, v2.4S,v30.s[0] +sub v22.4s, v16.4s, v3.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v29.s[0] +ldr q14, [x0, #240] +mul v0.4S, v0.4S,v30.s[0] +sub v21.4s, v8.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v15.4s 
+sqrdmulh v15.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v18.4s, v13.4s, v2.4s +mla v0.4S, v3.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +sub v3.4s, v10.4s, v0.4s +mla v20.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v15.4s, v17.4s, v20.4s +mla v9.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v2.4s, v14.4s, v9.4s +mla v16.4S, v0.4S, v31.s[0] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v29.s[1] +mul v11.4S, v11.4S,v30.s[1] +sub v0.4s, v17.4s, v16.4s +mla v8.4S, v20.4S, v31.s[0] +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v29.s[1] +mul v12.4S, v12.4S,v30.s[1] +sub v20.4s, v14.4s, v8.4s +mla v11.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v9.4s, v13.4s, v11.4s +mla v12.4S, v16.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v16.4s, v10.4s, v12.4s +mla v22.4S, v8.4S, v31.s[0] +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v8.4s, v15.4s, v22.4s +mla v21.4S, v11.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +sub v11.4s, v2.4s, v21.4s +mla v19.4S, v12.4S, v31.s[0] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +sub v12.4s, v18.4s, v19.4s +mla v1.4S, v22.4S, v31.s[0] +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v22.4s, v3.4s, v1.4s +mla v17.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v21.4s, v13.4s, v17.4s +mla v14.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +sub v19.4s, v10.4s, v14.4s +mla 
v0.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v27.s[2] +mul v15.4S, v15.4S,v28.s[2] +sub v1.4s, v9.4s, v0.4s +mla v20.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v17.4s, v16.4s, v20.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v14.4s, v18.4s, v15.4s +mla v2.4S, v0.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[3] +mul v11.4S, v11.4S,v28.s[3] +sub v0.4s, v3.4s, v2.4s +mla v8.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +sub v20.4s, v12.4s, v8.4s +mla v11.4S, v15.4S, v31.s[0] +add v12.4s, v12.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v15.4s, v22.4s, v11.4s +mla v10.4S, v2.4S, v31.s[0] +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v2.4s, v13.4s, v10.4s +mla v19.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v25.s[3] +mul v17.4S, v17.4S,v26.s[3] +sub v8.4s, v21.4s, v19.4s +mla v16.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +str q13, [x0, #48] +sqrdmulh v13.4S, v3.4S, v23.s[0] +str q2, [x0, #112] +mul v3.4S, v3.4S,v24.s[0] +ldr q2, [x0, #768] +sub v19.4s, v9.4s, v16.4s +ldr q11, [x0, #832] +mla v17.4S, v10.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +str q21, [x0, #176] +sqrdmulh v21.4S, v0.4S, v23.s[1] +str q8, [x0, #240] +mul v0.4S, v0.4S,v24.s[1] +ldr q8, [x0, #896] +sub v16.4s, v1.4s, v17.4s +ldr q10, [x0, #960] +mla v3.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +str q9, [x0, #304] +sqrdmulh v9.4S, v22.4S, v23.s[2] +str q19, [x0, #368] +mul v22.4S, v22.4S,v24.s[2] +ldr q19, [x0, #256] +sub v17.4s, v18.4s, v3.4s +ldr q13, [x0, #320] +mla v0.4S, v21.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +str q1, [x0, #432] +sqrdmulh v1.4S, v15.4S, v23.s[3] +str q16, [x0, #496] +mul v15.4S, 
v15.4S,v24.s[3] +ldr q16, [x0, #384] +sub v3.4s, v14.4s, v0.4s +ldr q21, [x0, #448] +mla v22.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +str q18, [x0, #560] +sqrdmulh v18.4S, v2.4S, v29.s[0] +str q17, [x0, #624] +ldr q17, [x0, #512] +mul v2.4S, v2.4S,v30.s[0] +ldr q0, [x0, #576] +sub v9.4s, v12.4s, v22.4s +mla v15.4S, v1.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s +str q14, [x0, #688] +sqrdmulh v14.4S, v11.4S, v29.s[0] +str q3, [x0, #752] +ldr q3, [x0, #640] +mul v11.4S, v11.4S,v30.s[0] +ldr q22, [x0, #704] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +str q12, [x0, #816] +sqrdmulh v12.4S, v8.4S, v29.s[0] +str q9, [x0, #880] +mul v8.4S, v8.4S,v30.s[0] +ldr q9, [x0, #0] +sub v15.4s, v19.4s, v2.4s +mla v11.4S, v14.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +str q20, [x0, #944] +sqrdmulh v20.4S, v10.4S, v29.s[0] +str q1, [x0, #1008] +mul v10.4S, v10.4S,v30.s[0] +ldr q1, [x0, #64] +sub v2.4s, v13.4s, v11.4s +mla v8.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v29.s[0] +ldr q12, [x0, #128] +mul v17.4S, v17.4S,v30.s[0] +sub v14.4s, v16.4s, v8.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v29.s[0] +ldr q20, [x0, #192] +mul v0.4S, v0.4S,v30.s[0] +sub v18.4s, v21.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +sub v11.4s, v9.4s, v17.4s +mla v0.4S, v8.4S, v31.s[0] +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v8.4s, v1.4s, v0.4s +mla v3.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v10.4s, v12.4s, v3.4s +mla v22.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v17.4s, v20.4s, v22.4s +mla v16.4S, v0.4S, v31.s[0] +add v20.4s, v20.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +sub 
v0.4s, v12.4s, v16.4s +mla v21.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v3.4s, v20.4s, v21.4s +mla v19.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v22.4s, v9.4s, v19.4s +mla v13.4S, v16.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v16.4s, v1.4s, v13.4s +mla v14.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v21.4s, v10.4s, v14.4s +mla v18.4S, v19.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v19.4s, v17.4s, v18.4s +mla v15.4S, v13.4S, v31.s[0] +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v13.4s, v11.4s, v15.4s +mla v2.4S, v14.4S, v31.s[0] +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v27.s[0] +mul v20.4S, v20.4S,v28.s[0] +sub v14.4s, v8.4s, v2.4s +mla v12.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v18.4s, v9.4s, v12.4s +mla v20.4S, v15.4S, v31.s[0] +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +sub v15.4s, v1.4s, v20.4s +mla v0.4S, v2.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +sub v2.4s, v22.4s, v0.4s +mla v3.4S, v12.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v12.4s, v16.4s, v3.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v11.4s, v10.4s +mla v17.4S, v0.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v27.s[3] +mul v19.4S, v19.4S,v28.s[3] +sub v0.4s, v8.4s, v17.4s +mla v21.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v17.4s +sqrdmulh 
v17.4S, v1.4S, v25.s[0] +mul v1.4S, v1.4S,v26.s[0] +sub v3.4s, v13.4s, v21.4s +mla v19.4S, v10.4S, v31.s[0] +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v25.s[1] +mul v15.4S, v15.4S,v26.s[1] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v17.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v17.4s, v9.4s, v1.4s +mla v15.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +sub v21.4s, v18.4s, v15.4s +mla v16.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +str q9, [x0, #0] +sqrdmulh v9.4S, v8.4S, v23.s[0] +str q17, [x0, #64] +mul v8.4S, v8.4S,v24.s[0] +ldr q17, [x0, #784] +sub v15.4s, v22.4s, v16.4s +ldr q19, [x0, #848] +mla v12.4S, v1.4S, v31.s[0] +add v22.4s, v22.4s, v16.4s +str q18, [x0, #128] +sqrdmulh v18.4S, v0.4S, v23.s[1] +str q21, [x0, #192] +mul v0.4S, v0.4S,v24.s[1] +ldr q21, [x0, #912] +sub v16.4s, v2.4s, v12.4s +ldr q1, [x0, #976] +mla v8.4S, v9.4S, v31.s[0] +add v2.4s, v2.4s, v12.4s +str q22, [x0, #256] +sqrdmulh v22.4S, v14.4S, v23.s[2] +str q15, [x0, #320] +mul v14.4S, v14.4S,v24.s[2] +ldr q15, [x0, #272] +sub v12.4s, v11.4s, v8.4s +ldr q9, [x0, #336] +mla v0.4S, v18.4S, v31.s[0] +add v11.4s, v11.4s, v8.4s +str q2, [x0, #384] +sqrdmulh v2.4S, v10.4S, v23.s[3] +str q16, [x0, #448] +mul v10.4S, v10.4S,v24.s[3] +ldr q16, [x0, #400] +sub v8.4s, v20.4s, v0.4s +ldr q18, [x0, #464] +mla v14.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v0.4s +str q11, [x0, #512] +sqrdmulh v11.4S, v17.4S, v29.s[0] +str q12, [x0, #576] +ldr q12, [x0, #528] +mul v17.4S, v17.4S,v30.s[0] +ldr q0, [x0, #592] +sub v22.4s, v13.4s, v14.4s +mla v10.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v14.4s +str q20, [x0, #640] +sqrdmulh v20.4S, v19.4S, v29.s[0] +str q8, [x0, #704] +ldr q8, [x0, #656] +mul v19.4S, v19.4S,v30.s[0] +ldr q14, [x0, #720] +sub v2.4s, v3.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v3.4s, v3.4s, v10.4s +str q13, [x0, #768] +sqrdmulh v13.4S, v21.4S, 
v29.s[0] +str q22, [x0, #832] +mul v21.4S, v21.4S,v30.s[0] +ldr q22, [x0, #16] +sub v10.4s, v15.4s, v17.4s +mla v19.4S, v20.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +str q3, [x0, #896] +sqrdmulh v3.4S, v1.4S, v29.s[0] +str q2, [x0, #960] +mul v1.4S, v1.4S,v30.s[0] +ldr q2, [x0, #80] +sub v17.4s, v9.4s, v19.4s +mla v21.4S, v13.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v12.4S, v29.s[0] +ldr q13, [x0, #144] +mul v12.4S, v12.4S,v30.s[0] +sub v20.4s, v16.4s, v21.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v29.s[0] +ldr q3, [x0, #208] +mul v0.4S, v0.4S,v30.s[0] +sub v11.4s, v18.4s, v1.4s +mla v12.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v19.4s, v22.4s, v12.4s +mla v0.4S, v21.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v2.4s, v0.4s +mla v8.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v1.4s, v13.4s, v8.4s +mla v14.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v12.4s, v3.4s, v14.4s +mla v16.4S, v0.4S, v31.s[0] +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +sub v0.4s, v13.4s, v16.4s +mla v18.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v9.4S, v29.s[1] +mul v9.4S, v9.4S,v30.s[1] +sub v8.4s, v3.4s, v18.4s +mla v15.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v14.4s, v22.4s, v15.4s +mla v9.4S, v16.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v16.4s, v2.4s, v9.4s +mla v20.4S, v18.4S, v31.s[0] +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v20.4s +mla v11.4S, v15.4S, v31.s[0] +add v1.4s, 
v1.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v15.4s, v12.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +sub v9.4s, v19.4s, v10.4s +mla v17.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v27.s[0] +mul v3.4S, v3.4S,v28.s[0] +sub v20.4s, v21.4s, v17.4s +mla v13.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v11.4s, v22.4s, v13.4s +mla v3.4S, v10.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v10.4s, v2.4s, v3.4s +mla v0.4S, v17.4S, v31.s[0] +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v17.4s, v14.4s, v0.4s +mla v8.4S, v13.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +sub v13.4s, v16.4s, v8.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v3.4s, v19.4s, v1.4s +mla v12.4S, v0.4S, v31.s[0] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v0.4s, v21.4s, v12.4s +mla v18.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v25.s[1] +mul v10.4S, v10.4S,v26.s[1] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v12.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v12.4s, v22.4s, v2.4s +mla v10.4S, v18.4S, v31.s[0] +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v13.4S, v25.s[3] +mul v13.4S, v13.4S,v26.s[3] +sub v18.4s, v11.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +str q22, [x0, #16] +sqrdmulh v22.4S, v21.4S, v23.s[0] +str q12, [x0, #80] +mul 
v21.4S, v21.4S,v24.s[0] +sub v12.4s, v14.4s, v16.4s +mla v13.4S, v2.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +str q11, [x0, #144] +sqrdmulh v11.4S, v0.4S, v23.s[1] +str q18, [x0, #208] +mul v0.4S, v0.4S,v24.s[1] +sub v18.4s, v17.4s, v13.4s +mla v21.4S, v22.4S, v31.s[0] +add v17.4s, v17.4s, v13.4s +str q14, [x0, #272] +sqrdmulh v14.4S, v20.4S, v23.s[2] +str q12, [x0, #336] +mul v20.4S, v20.4S,v24.s[2] +sub v12.4s, v19.4s, v21.4s +mla v0.4S, v11.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +str q17, [x0, #400] +sqrdmulh v17.4S, v1.4S, v23.s[3] +str q18, [x0, #464] +mul v1.4S, v1.4S,v24.s[3] +sub v18.4s, v3.4s, v0.4s +mla v20.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v0.4s +str q19, [x0, #528] +str q12, [x0, #592] +sub v12.4s, v9.4s, v20.4s +mla v1.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v20.4s +str q3, [x0, #656] +str q18, [x0, #720] +sub v18.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +str q9, [x0, #784] +str q12, [x0, #848] +str q8, [x0, #912] +str q18, [x0, #976] +ldr q4, [x0, #32] +ldr q5, [x0, #48] +ldr q6, [x17, #+128] +ldr q7, [x17, #+144] +ldr q15, [x0, #0] +ldr q10, [x0, #16] +ldr q2, [x17, #+1152] +ldr q16, [x17, #+1168] +sqrdmulh v22.4S, v4.4S, v7.s[0] +ldr q13, [x0, #544] +mul v4.4S, v4.4S,v6.s[0] +ldr q11, [x0, #560] +mla v4.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v4.4s +add v15.4s, v15.4s, v4.4s +sqrdmulh v4.4S, v5.4S, v7.s[0] +ldr q21, [x0, #512] +mul v5.4S, v5.4S,v6.s[0] +ldr q14, [x0, #528] +mla v5.4S, v4.4S, v31.s[0] +sub v4.4s, v10.4s, v5.4s +add v10.4s, v10.4s, v5.4s +sqrdmulh v5.4S, v13.4S, v16.s[0] +mul v13.4S, v13.4S,v2.s[0] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v11.4S, v16.s[0] +mul v11.4S, v11.4S,v2.s[0] +mla v11.4S, v13.4S, v31.s[0] +sub v13.4s, v14.4s, v11.4s +add v14.4s, v14.4s, v11.4s +sqrdmulh v11.4S, v10.4S, v7.s[1] +mul v10.4S, v10.4S,v6.s[1] +mla v10.4S, v11.4S, v31.s[0] +sub v11.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +sqrdmulh v10.4S, v4.4S, v7.s[2] +mul v4.4S, 
v4.4S,v6.s[2] +mla v4.4S, v10.4S, v31.s[0] +sub v10.4s, v22.4s, v4.4s +add v22.4s, v22.4s, v4.4s +sqrdmulh v4.4S, v14.4S, v16.s[1] +mul v14.4S, v14.4S,v2.s[1] +mla v14.4S, v4.4S, v31.s[0] +sub v4.4s, v21.4s, v14.4s +trn1 v0.4S, v15.4S, v11.4S +trn2 v19.4S, v15.4S, v11.4S +add v21.4s, v21.4s, v14.4s +trn1 v14.4S, v22.4S, v10.4S +trn2 v17.4S, v22.4S, v10.4S +sqrdmulh v20.4S, v13.4S, v16.s[2] +ldr q3, [x17, #+160] +mul v13.4S, v13.4S,v2.s[2] +ldr q1, [x17, #+176] +mla v13.4S, v20.4S, v31.s[0] +trn2 v22.2D, v0.2D, v14.2D +trn2 v10.2D, v19.2D, v17.2D +sub v20.4s, v5.4s, v13.4s +trn1 v15.2D, v0.2D, v14.2D +trn1 v11.2D, v19.2D, v17.2D +add v5.4s, v5.4s, v13.4s +sqrdmulh v13.4S, v22.4S, v1.4S +mul v22.4S, v22.4S,v3.4S +mla v22.4S, v13.4S, v31.s[0] +trn1 v13.4S, v21.4S, v4.4S +trn2 v17.4S, v21.4S, v4.4S +sub v19.4s, v15.4s, v22.4s +trn1 v14.4S, v5.4S, v20.4S +trn2 v0.4S, v5.4S, v20.4S +add v15.4s, v15.4s, v22.4s +trn2 v5.2D, v13.2D, v14.2D +trn2 v20.2D, v17.2D, v0.2D +sqrdmulh v22.4S, v10.4S, v1.4S +trn1 v21.2D, v13.2D, v14.2D +trn1 v4.2D, v17.2D, v0.2D +ldr q0, [x17, #+1184] +ldr q17, [x17, #+1200] +mul v10.4S, v10.4S,v3.4S +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v10.4s +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v5.4S, v17.4S +mul v5.4S, v5.4S,v0.4S +mla v5.4S, v10.4S, v31.s[0] +sub v10.4s, v21.4s, v5.4s +add v21.4s, v21.4s, v5.4s +ldr q5, [x17, #+192] +ldr q14, [x17, #+208] +sqrdmulh v13.4S, v20.4S, v17.4S +mul v20.4S, v20.4S,v0.4S +mla v20.4S, v13.4S, v31.s[0] +sub v13.4s, v4.4s, v20.4s +add v4.4s, v4.4s, v20.4s +ldr q20, [x17, #+224] +ldr q9, [x17, #+240] +sqrdmulh v12.4S, v11.4S, v14.4S +mul v11.4S, v11.4S,v5.4S +mla v11.4S, v12.4S, v31.s[0] +sub v12.4s, v15.4s, v11.4s +add v15.4s, v15.4s, v11.4s +ldr q11, [x17, #+1216] +ldr q8, [x17, #+1232] +sqrdmulh v18.4S, v22.4S, v9.4S +mul v22.4S, v22.4S,v20.4S +mla v22.4S, v18.4S, v31.s[0] +sub v18.4s, v19.4s, v22.4s +add v19.4s, v19.4s, v22.4s +ldr q22, [x17, #+1248] +ldr q30, [x17, #+1264] +sqrdmulh v29.4S, 
v4.4S, v8.4S +ldr q28, [x0, #96] +mul v4.4S, v4.4S,v11.4S +mla v4.4S, v29.4S, v31.s[0] +sub v29.4s, v21.4s, v4.4s +add v21.4s, v21.4s, v4.4s +sqrdmulh v4.4S, v13.4S, v30.4S +ldr q27, [x0, #112] +mul v13.4S, v13.4S,v22.4S +mla v13.4S, v4.4S, v31.s[0] +sub v4.4s, v10.4s, v13.4s +add v10.4s, v10.4s, v13.4s +str q15, [x0, #0] +str q12, [x0, #16] +str q19, [x0, #32] +str q18, [x0, #48] +str q21, [x0, #512] +str q29, [x0, #528] +str q10, [x0, #544] +str q4, [x0, #560] +ldr q30, [x17, #+256] +ldr q22, [x17, #+272] +ldr q8, [x0, #64] +ldr q11, [x0, #80] +ldr q17, [x17, #+1280] +ldr q0, [x17, #+1296] +sqrdmulh v16.4S, v28.4S, v22.s[0] +ldr q2, [x0, #608] +mul v28.4S, v28.4S,v30.s[0] +ldr q4, [x0, #624] +mla v28.4S, v16.4S, v31.s[0] +sub v16.4s, v8.4s, v28.4s +add v8.4s, v8.4s, v28.4s +sqrdmulh v28.4S, v27.4S, v22.s[0] +ldr q10, [x0, #576] +mul v27.4S, v27.4S,v30.s[0] +ldr q29, [x0, #592] +mla v27.4S, v28.4S, v31.s[0] +sub v28.4s, v11.4s, v27.4s +add v11.4s, v11.4s, v27.4s +sqrdmulh v27.4S, v2.4S, v0.s[0] +mul v2.4S, v2.4S,v17.s[0] +mla v2.4S, v27.4S, v31.s[0] +sub v27.4s, v10.4s, v2.4s +add v10.4s, v10.4s, v2.4s +sqrdmulh v2.4S, v4.4S, v0.s[0] +mul v4.4S, v4.4S,v17.s[0] +mla v4.4S, v2.4S, v31.s[0] +sub v2.4s, v29.4s, v4.4s +add v29.4s, v29.4s, v4.4s +sqrdmulh v4.4S, v11.4S, v22.s[1] +mul v11.4S, v11.4S,v30.s[1] +mla v11.4S, v4.4S, v31.s[0] +sub v4.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +sqrdmulh v11.4S, v28.4S, v22.s[2] +mul v28.4S, v28.4S,v30.s[2] +mla v28.4S, v11.4S, v31.s[0] +sub v11.4s, v16.4s, v28.4s +add v16.4s, v16.4s, v28.4s +sqrdmulh v28.4S, v29.4S, v0.s[1] +mul v29.4S, v29.4S,v17.s[1] +mla v29.4S, v28.4S, v31.s[0] +sub v28.4s, v10.4s, v29.4s +trn1 v21.4S, v8.4S, v4.4S +trn2 v9.4S, v8.4S, v4.4S +add v10.4s, v10.4s, v29.4s +trn1 v29.4S, v16.4S, v11.4S +trn2 v20.4S, v16.4S, v11.4S +sqrdmulh v14.4S, v2.4S, v0.s[2] +ldr q5, [x17, #+288] +mul v2.4S, v2.4S,v17.s[2] +ldr q1, [x17, #+304] +mla v2.4S, v14.4S, v31.s[0] +trn2 v16.2D, v21.2D, v29.2D +trn2 v11.2D, v9.2D, 
v20.2D +sub v14.4s, v27.4s, v2.4s +trn1 v8.2D, v21.2D, v29.2D +trn1 v4.2D, v9.2D, v20.2D +add v27.4s, v27.4s, v2.4s +sqrdmulh v2.4S, v16.4S, v1.4S +mul v16.4S, v16.4S,v5.4S +mla v16.4S, v2.4S, v31.s[0] +trn1 v2.4S, v10.4S, v28.4S +trn2 v20.4S, v10.4S, v28.4S +sub v9.4s, v8.4s, v16.4s +trn1 v29.4S, v27.4S, v14.4S +trn2 v21.4S, v27.4S, v14.4S +add v8.4s, v8.4s, v16.4s +trn2 v27.2D, v2.2D, v29.2D +trn2 v14.2D, v20.2D, v21.2D +sqrdmulh v16.4S, v11.4S, v1.4S +trn1 v10.2D, v2.2D, v29.2D +trn1 v28.2D, v20.2D, v21.2D +ldr q21, [x17, #+1312] +ldr q20, [x17, #+1328] +mul v11.4S, v11.4S,v5.4S +mla v11.4S, v16.4S, v31.s[0] +sub v16.4s, v4.4s, v11.4s +add v4.4s, v4.4s, v11.4s +sqrdmulh v11.4S, v27.4S, v20.4S +mul v27.4S, v27.4S,v21.4S +mla v27.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v27.4s +add v10.4s, v10.4s, v27.4s +ldr q27, [x17, #+320] +ldr q29, [x17, #+336] +sqrdmulh v2.4S, v14.4S, v20.4S +mul v14.4S, v14.4S,v21.4S +mla v14.4S, v2.4S, v31.s[0] +sub v2.4s, v28.4s, v14.4s +add v28.4s, v28.4s, v14.4s +ldr q14, [x17, #+352] +ldr q3, [x17, #+368] +sqrdmulh v7.4S, v4.4S, v29.4S +mul v4.4S, v4.4S,v27.4S +mla v4.4S, v7.4S, v31.s[0] +sub v7.4s, v8.4s, v4.4s +add v8.4s, v8.4s, v4.4s +ldr q4, [x17, #+1344] +ldr q6, [x17, #+1360] +sqrdmulh v18.4S, v16.4S, v3.4S +mul v16.4S, v16.4S,v14.4S +mla v16.4S, v18.4S, v31.s[0] +sub v18.4s, v9.4s, v16.4s +add v9.4s, v9.4s, v16.4s +ldr q16, [x17, #+1376] +ldr q19, [x17, #+1392] +sqrdmulh v12.4S, v28.4S, v6.4S +ldr q15, [x0, #160] +mul v28.4S, v28.4S,v4.4S +mla v28.4S, v12.4S, v31.s[0] +sub v12.4s, v10.4s, v28.4s +add v10.4s, v10.4s, v28.4s +sqrdmulh v28.4S, v2.4S, v19.4S +ldr q13, [x0, #176] +mul v2.4S, v2.4S,v16.4S +mla v2.4S, v28.4S, v31.s[0] +sub v28.4s, v11.4s, v2.4s +add v11.4s, v11.4s, v2.4s +str q8, [x0, #64] +str q7, [x0, #80] +str q9, [x0, #96] +str q18, [x0, #112] +str q10, [x0, #576] +str q12, [x0, #592] +str q11, [x0, #608] +str q28, [x0, #624] +ldr q19, [x17, #+384] +ldr q16, [x17, #+400] +ldr q6, [x0, #128] +ldr q4, [x0, #144] 
+ldr q20, [x17, #+1408] +ldr q21, [x17, #+1424] +sqrdmulh v0.4S, v15.4S, v16.s[0] +ldr q17, [x0, #672] +mul v15.4S, v15.4S,v19.s[0] +ldr q28, [x0, #688] +mla v15.4S, v0.4S, v31.s[0] +sub v0.4s, v6.4s, v15.4s +add v6.4s, v6.4s, v15.4s +sqrdmulh v15.4S, v13.4S, v16.s[0] +ldr q11, [x0, #640] +mul v13.4S, v13.4S,v19.s[0] +ldr q12, [x0, #656] +mla v13.4S, v15.4S, v31.s[0] +sub v15.4s, v4.4s, v13.4s +add v4.4s, v4.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v21.s[0] +mul v17.4S, v17.4S,v20.s[0] +mla v17.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v28.4S, v21.s[0] +mul v28.4S, v28.4S,v20.s[0] +mla v28.4S, v17.4S, v31.s[0] +sub v17.4s, v12.4s, v28.4s +add v12.4s, v12.4s, v28.4s +sqrdmulh v28.4S, v4.4S, v16.s[1] +mul v4.4S, v4.4S,v19.s[1] +mla v4.4S, v28.4S, v31.s[0] +sub v28.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +sqrdmulh v4.4S, v15.4S, v16.s[2] +mul v15.4S, v15.4S,v19.s[2] +mla v15.4S, v4.4S, v31.s[0] +sub v4.4s, v0.4s, v15.4s +add v0.4s, v0.4s, v15.4s +sqrdmulh v15.4S, v12.4S, v21.s[1] +mul v12.4S, v12.4S,v20.s[1] +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v11.4s, v12.4s +trn1 v10.4S, v6.4S, v28.4S +trn2 v3.4S, v6.4S, v28.4S +add v11.4s, v11.4s, v12.4s +trn1 v12.4S, v0.4S, v4.4S +trn2 v14.4S, v0.4S, v4.4S +sqrdmulh v29.4S, v17.4S, v21.s[2] +ldr q27, [x17, #+416] +mul v17.4S, v17.4S,v20.s[2] +ldr q1, [x17, #+432] +mla v17.4S, v29.4S, v31.s[0] +trn2 v0.2D, v10.2D, v12.2D +trn2 v4.2D, v3.2D, v14.2D +sub v29.4s, v13.4s, v17.4s +trn1 v6.2D, v10.2D, v12.2D +trn1 v28.2D, v3.2D, v14.2D +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v1.4S +mul v0.4S, v0.4S,v27.4S +mla v0.4S, v17.4S, v31.s[0] +trn1 v17.4S, v11.4S, v15.4S +trn2 v14.4S, v11.4S, v15.4S +sub v3.4s, v6.4s, v0.4s +trn1 v12.4S, v13.4S, v29.4S +trn2 v10.4S, v13.4S, v29.4S +add v6.4s, v6.4s, v0.4s +trn2 v13.2D, v17.2D, v12.2D +trn2 v29.2D, v14.2D, v10.2D +sqrdmulh v0.4S, v4.4S, v1.4S +trn1 v11.2D, v17.2D, v12.2D +trn1 v15.2D, v14.2D, v10.2D +ldr q10, [x17, #+1440] +ldr 
q14, [x17, #+1456] +mul v4.4S, v4.4S,v27.4S +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v28.4s, v4.4s +add v28.4s, v28.4s, v4.4s +sqrdmulh v4.4S, v13.4S, v14.4S +mul v13.4S, v13.4S,v10.4S +mla v13.4S, v4.4S, v31.s[0] +sub v4.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +ldr q13, [x17, #+448] +ldr q12, [x17, #+464] +sqrdmulh v17.4S, v29.4S, v14.4S +mul v29.4S, v29.4S,v10.4S +mla v29.4S, v17.4S, v31.s[0] +sub v17.4s, v15.4s, v29.4s +add v15.4s, v15.4s, v29.4s +ldr q29, [x17, #+480] +ldr q5, [x17, #+496] +sqrdmulh v22.4S, v28.4S, v12.4S +mul v28.4S, v28.4S,v13.4S +mla v28.4S, v22.4S, v31.s[0] +sub v22.4s, v6.4s, v28.4s +add v6.4s, v6.4s, v28.4s +ldr q28, [x17, #+1472] +ldr q30, [x17, #+1488] +sqrdmulh v18.4S, v0.4S, v5.4S +mul v0.4S, v0.4S,v29.4S +mla v0.4S, v18.4S, v31.s[0] +sub v18.4s, v3.4s, v0.4s +add v3.4s, v3.4s, v0.4s +ldr q0, [x17, #+1504] +ldr q9, [x17, #+1520] +sqrdmulh v7.4S, v15.4S, v30.4S +ldr q8, [x0, #224] +mul v15.4S, v15.4S,v28.4S +mla v15.4S, v7.4S, v31.s[0] +sub v7.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v17.4S, v9.4S +ldr q2, [x0, #240] +mul v17.4S, v17.4S,v0.4S +mla v17.4S, v15.4S, v31.s[0] +sub v15.4s, v4.4s, v17.4s +add v4.4s, v4.4s, v17.4s +str q6, [x0, #128] +str q22, [x0, #144] +str q3, [x0, #160] +str q18, [x0, #176] +str q11, [x0, #640] +str q7, [x0, #656] +str q4, [x0, #672] +str q15, [x0, #688] +ldr q9, [x17, #+512] +ldr q0, [x17, #+528] +ldr q30, [x0, #192] +ldr q28, [x0, #208] +ldr q14, [x17, #+1536] +ldr q10, [x17, #+1552] +sqrdmulh v21.4S, v8.4S, v0.s[0] +ldr q20, [x0, #736] +mul v8.4S, v8.4S,v9.s[0] +ldr q15, [x0, #752] +mla v8.4S, v21.4S, v31.s[0] +sub v21.4s, v30.4s, v8.4s +add v30.4s, v30.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v0.s[0] +ldr q4, [x0, #704] +mul v2.4S, v2.4S,v9.s[0] +ldr q7, [x0, #720] +mla v2.4S, v8.4S, v31.s[0] +sub v8.4s, v28.4s, v2.4s +add v28.4s, v28.4s, v2.4s +sqrdmulh v2.4S, v20.4S, v10.s[0] +mul v20.4S, v20.4S,v14.s[0] +mla v20.4S, v2.4S, v31.s[0] +sub v2.4s, v4.4s, v20.4s +add v4.4s, v4.4s, 
v20.4s +sqrdmulh v20.4S, v15.4S, v10.s[0] +mul v15.4S, v15.4S,v14.s[0] +mla v15.4S, v20.4S, v31.s[0] +sub v20.4s, v7.4s, v15.4s +add v7.4s, v7.4s, v15.4s +sqrdmulh v15.4S, v28.4S, v0.s[1] +mul v28.4S, v28.4S,v9.s[1] +mla v28.4S, v15.4S, v31.s[0] +sub v15.4s, v30.4s, v28.4s +add v30.4s, v30.4s, v28.4s +sqrdmulh v28.4S, v8.4S, v0.s[2] +mul v8.4S, v8.4S,v9.s[2] +mla v8.4S, v28.4S, v31.s[0] +sub v28.4s, v21.4s, v8.4s +add v21.4s, v21.4s, v8.4s +sqrdmulh v8.4S, v7.4S, v10.s[1] +mul v7.4S, v7.4S,v14.s[1] +mla v7.4S, v8.4S, v31.s[0] +sub v8.4s, v4.4s, v7.4s +trn1 v11.4S, v30.4S, v15.4S +trn2 v5.4S, v30.4S, v15.4S +add v4.4s, v4.4s, v7.4s +trn1 v7.4S, v21.4S, v28.4S +trn2 v29.4S, v21.4S, v28.4S +sqrdmulh v12.4S, v20.4S, v10.s[2] +ldr q13, [x17, #+544] +mul v20.4S, v20.4S,v14.s[2] +ldr q1, [x17, #+560] +mla v20.4S, v12.4S, v31.s[0] +trn2 v21.2D, v11.2D, v7.2D +trn2 v28.2D, v5.2D, v29.2D +sub v12.4s, v2.4s, v20.4s +trn1 v30.2D, v11.2D, v7.2D +trn1 v15.2D, v5.2D, v29.2D +add v2.4s, v2.4s, v20.4s +sqrdmulh v20.4S, v21.4S, v1.4S +mul v21.4S, v21.4S,v13.4S +mla v21.4S, v20.4S, v31.s[0] +trn1 v20.4S, v4.4S, v8.4S +trn2 v29.4S, v4.4S, v8.4S +sub v5.4s, v30.4s, v21.4s +trn1 v7.4S, v2.4S, v12.4S +trn2 v11.4S, v2.4S, v12.4S +add v30.4s, v30.4s, v21.4s +trn2 v2.2D, v20.2D, v7.2D +trn2 v12.2D, v29.2D, v11.2D +sqrdmulh v21.4S, v28.4S, v1.4S +trn1 v4.2D, v20.2D, v7.2D +trn1 v8.2D, v29.2D, v11.2D +ldr q11, [x17, #+1568] +ldr q29, [x17, #+1584] +mul v28.4S, v28.4S,v13.4S +mla v28.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v28.4s +add v15.4s, v15.4s, v28.4s +sqrdmulh v28.4S, v2.4S, v29.4S +mul v2.4S, v2.4S,v11.4S +mla v2.4S, v28.4S, v31.s[0] +sub v28.4s, v4.4s, v2.4s +add v4.4s, v4.4s, v2.4s +ldr q2, [x17, #+576] +ldr q7, [x17, #+592] +sqrdmulh v20.4S, v12.4S, v29.4S +mul v12.4S, v12.4S,v11.4S +mla v12.4S, v20.4S, v31.s[0] +sub v20.4s, v8.4s, v12.4s +add v8.4s, v8.4s, v12.4s +ldr q12, [x17, #+608] +ldr q27, [x17, #+624] +sqrdmulh v16.4S, v15.4S, v7.4S +mul v15.4S, v15.4S,v2.4S +mla v15.4S, 
v16.4S, v31.s[0] +sub v16.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +ldr q15, [x17, #+1600] +ldr q19, [x17, #+1616] +sqrdmulh v18.4S, v21.4S, v27.4S +mul v21.4S, v21.4S,v12.4S +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v5.4s, v21.4s +add v5.4s, v5.4s, v21.4s +ldr q21, [x17, #+1632] +ldr q3, [x17, #+1648] +sqrdmulh v22.4S, v8.4S, v19.4S +ldr q6, [x0, #288] +mul v8.4S, v8.4S,v15.4S +mla v8.4S, v22.4S, v31.s[0] +sub v22.4s, v4.4s, v8.4s +add v4.4s, v4.4s, v8.4s +sqrdmulh v8.4S, v20.4S, v3.4S +ldr q17, [x0, #304] +mul v20.4S, v20.4S,v21.4S +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v28.4s, v20.4s +add v28.4s, v28.4s, v20.4s +str q30, [x0, #192] +str q16, [x0, #208] +str q5, [x0, #224] +str q18, [x0, #240] +str q4, [x0, #704] +str q22, [x0, #720] +str q28, [x0, #736] +str q8, [x0, #752] +ldr q3, [x17, #+640] +ldr q21, [x17, #+656] +ldr q19, [x0, #256] +ldr q15, [x0, #272] +ldr q29, [x17, #+1664] +ldr q11, [x17, #+1680] +sqrdmulh v10.4S, v6.4S, v21.s[0] +ldr q14, [x0, #800] +mul v6.4S, v6.4S,v3.s[0] +ldr q8, [x0, #816] +mla v6.4S, v10.4S, v31.s[0] +sub v10.4s, v19.4s, v6.4s +add v19.4s, v19.4s, v6.4s +sqrdmulh v6.4S, v17.4S, v21.s[0] +ldr q28, [x0, #768] +mul v17.4S, v17.4S,v3.s[0] +ldr q22, [x0, #784] +mla v17.4S, v6.4S, v31.s[0] +sub v6.4s, v15.4s, v17.4s +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v11.s[0] +mul v14.4S, v14.4S,v29.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v28.4s, v14.4s +add v28.4s, v28.4s, v14.4s +sqrdmulh v14.4S, v8.4S, v11.s[0] +mul v8.4S, v8.4S,v29.s[0] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v8.4s +add v22.4s, v22.4s, v8.4s +sqrdmulh v8.4S, v15.4S, v21.s[1] +mul v15.4S, v15.4S,v3.s[1] +mla v15.4S, v8.4S, v31.s[0] +sub v8.4s, v19.4s, v15.4s +add v19.4s, v19.4s, v15.4s +sqrdmulh v15.4S, v6.4S, v21.s[2] +mul v6.4S, v6.4S,v3.s[2] +mla v6.4S, v15.4S, v31.s[0] +sub v15.4s, v10.4s, v6.4s +add v10.4s, v10.4s, v6.4s +sqrdmulh v6.4S, v22.4S, v11.s[1] +mul v22.4S, v22.4S,v29.s[1] +mla v22.4S, v6.4S, v31.s[0] +sub v6.4s, v28.4s, 
v22.4s +trn1 v4.4S, v19.4S, v8.4S +trn2 v27.4S, v19.4S, v8.4S +add v28.4s, v28.4s, v22.4s +trn1 v22.4S, v10.4S, v15.4S +trn2 v12.4S, v10.4S, v15.4S +sqrdmulh v7.4S, v14.4S, v11.s[2] +ldr q2, [x17, #+672] +mul v14.4S, v14.4S,v29.s[2] +ldr q1, [x17, #+688] +mla v14.4S, v7.4S, v31.s[0] +trn2 v10.2D, v4.2D, v22.2D +trn2 v15.2D, v27.2D, v12.2D +sub v7.4s, v17.4s, v14.4s +trn1 v19.2D, v4.2D, v22.2D +trn1 v8.2D, v27.2D, v12.2D +add v17.4s, v17.4s, v14.4s +sqrdmulh v14.4S, v10.4S, v1.4S +mul v10.4S, v10.4S,v2.4S +mla v10.4S, v14.4S, v31.s[0] +trn1 v14.4S, v28.4S, v6.4S +trn2 v12.4S, v28.4S, v6.4S +sub v27.4s, v19.4s, v10.4s +trn1 v22.4S, v17.4S, v7.4S +trn2 v4.4S, v17.4S, v7.4S +add v19.4s, v19.4s, v10.4s +trn2 v17.2D, v14.2D, v22.2D +trn2 v7.2D, v12.2D, v4.2D +sqrdmulh v10.4S, v15.4S, v1.4S +trn1 v28.2D, v14.2D, v22.2D +trn1 v6.2D, v12.2D, v4.2D +ldr q4, [x17, #+1696] +ldr q12, [x17, #+1712] +mul v15.4S, v15.4S,v2.4S +mla v15.4S, v10.4S, v31.s[0] +sub v10.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +sqrdmulh v15.4S, v17.4S, v12.4S +mul v17.4S, v17.4S,v4.4S +mla v17.4S, v15.4S, v31.s[0] +sub v15.4s, v28.4s, v17.4s +add v28.4s, v28.4s, v17.4s +ldr q17, [x17, #+704] +ldr q22, [x17, #+720] +sqrdmulh v14.4S, v7.4S, v12.4S +mul v7.4S, v7.4S,v4.4S +mla v7.4S, v14.4S, v31.s[0] +sub v14.4s, v6.4s, v7.4s +add v6.4s, v6.4s, v7.4s +ldr q7, [x17, #+736] +ldr q13, [x17, #+752] +sqrdmulh v0.4S, v8.4S, v22.4S +mul v8.4S, v8.4S,v17.4S +mla v8.4S, v0.4S, v31.s[0] +sub v0.4s, v19.4s, v8.4s +add v19.4s, v19.4s, v8.4s +ldr q8, [x17, #+1728] +ldr q9, [x17, #+1744] +sqrdmulh v18.4S, v10.4S, v13.4S +mul v10.4S, v10.4S,v7.4S +mla v10.4S, v18.4S, v31.s[0] +sub v18.4s, v27.4s, v10.4s +add v27.4s, v27.4s, v10.4s +ldr q10, [x17, #+1760] +ldr q5, [x17, #+1776] +sqrdmulh v16.4S, v6.4S, v9.4S +ldr q30, [x0, #352] +mul v6.4S, v6.4S,v8.4S +mla v6.4S, v16.4S, v31.s[0] +sub v16.4s, v28.4s, v6.4s +add v28.4s, v28.4s, v6.4s +sqrdmulh v6.4S, v14.4S, v5.4S +ldr q20, [x0, #368] +mul v14.4S, v14.4S,v10.4S +mla 
v14.4S, v6.4S, v31.s[0] +sub v6.4s, v15.4s, v14.4s +add v15.4s, v15.4s, v14.4s +str q19, [x0, #256] +str q0, [x0, #272] +str q27, [x0, #288] +str q18, [x0, #304] +str q28, [x0, #768] +str q16, [x0, #784] +str q15, [x0, #800] +str q6, [x0, #816] +ldr q5, [x17, #+768] +ldr q10, [x17, #+784] +ldr q9, [x0, #320] +ldr q8, [x0, #336] +ldr q12, [x17, #+1792] +ldr q4, [x17, #+1808] +sqrdmulh v11.4S, v30.4S, v10.s[0] +ldr q29, [x0, #864] +mul v30.4S, v30.4S,v5.s[0] +ldr q6, [x0, #880] +mla v30.4S, v11.4S, v31.s[0] +sub v11.4s, v9.4s, v30.4s +add v9.4s, v9.4s, v30.4s +sqrdmulh v30.4S, v20.4S, v10.s[0] +ldr q15, [x0, #832] +mul v20.4S, v20.4S,v5.s[0] +ldr q16, [x0, #848] +mla v20.4S, v30.4S, v31.s[0] +sub v30.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +sqrdmulh v20.4S, v29.4S, v4.s[0] +mul v29.4S, v29.4S,v12.s[0] +mla v29.4S, v20.4S, v31.s[0] +sub v20.4s, v15.4s, v29.4s +add v15.4s, v15.4s, v29.4s +sqrdmulh v29.4S, v6.4S, v4.s[0] +mul v6.4S, v6.4S,v12.s[0] +mla v6.4S, v29.4S, v31.s[0] +sub v29.4s, v16.4s, v6.4s +add v16.4s, v16.4s, v6.4s +sqrdmulh v6.4S, v8.4S, v10.s[1] +mul v8.4S, v8.4S,v5.s[1] +mla v8.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v8.4s +add v9.4s, v9.4s, v8.4s +sqrdmulh v8.4S, v30.4S, v10.s[2] +mul v30.4S, v30.4S,v5.s[2] +mla v30.4S, v8.4S, v31.s[0] +sub v8.4s, v11.4s, v30.4s +add v11.4s, v11.4s, v30.4s +sqrdmulh v30.4S, v16.4S, v4.s[1] +mul v16.4S, v16.4S,v12.s[1] +mla v16.4S, v30.4S, v31.s[0] +sub v30.4s, v15.4s, v16.4s +trn1 v28.4S, v9.4S, v6.4S +trn2 v13.4S, v9.4S, v6.4S +add v15.4s, v15.4s, v16.4s +trn1 v16.4S, v11.4S, v8.4S +trn2 v7.4S, v11.4S, v8.4S +sqrdmulh v22.4S, v29.4S, v4.s[2] +ldr q17, [x17, #+800] +mul v29.4S, v29.4S,v12.s[2] +ldr q1, [x17, #+816] +mla v29.4S, v22.4S, v31.s[0] +trn2 v11.2D, v28.2D, v16.2D +trn2 v8.2D, v13.2D, v7.2D +sub v22.4s, v20.4s, v29.4s +trn1 v9.2D, v28.2D, v16.2D +trn1 v6.2D, v13.2D, v7.2D +add v20.4s, v20.4s, v29.4s +sqrdmulh v29.4S, v11.4S, v1.4S +mul v11.4S, v11.4S,v17.4S +mla v11.4S, v29.4S, v31.s[0] +trn1 v29.4S, 
v15.4S, v30.4S +trn2 v7.4S, v15.4S, v30.4S +sub v13.4s, v9.4s, v11.4s +trn1 v16.4S, v20.4S, v22.4S +trn2 v28.4S, v20.4S, v22.4S +add v9.4s, v9.4s, v11.4s +trn2 v20.2D, v29.2D, v16.2D +trn2 v22.2D, v7.2D, v28.2D +sqrdmulh v11.4S, v8.4S, v1.4S +trn1 v15.2D, v29.2D, v16.2D +trn1 v30.2D, v7.2D, v28.2D +ldr q28, [x17, #+1824] +ldr q7, [x17, #+1840] +mul v8.4S, v8.4S,v17.4S +mla v8.4S, v11.4S, v31.s[0] +sub v11.4s, v6.4s, v8.4s +add v6.4s, v6.4s, v8.4s +sqrdmulh v8.4S, v20.4S, v7.4S +mul v20.4S, v20.4S,v28.4S +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v15.4s, v20.4s +add v15.4s, v15.4s, v20.4s +ldr q20, [x17, #+832] +ldr q16, [x17, #+848] +sqrdmulh v29.4S, v22.4S, v7.4S +mul v22.4S, v22.4S,v28.4S +mla v22.4S, v29.4S, v31.s[0] +sub v29.4s, v30.4s, v22.4s +add v30.4s, v30.4s, v22.4s +ldr q22, [x17, #+864] +ldr q2, [x17, #+880] +sqrdmulh v21.4S, v6.4S, v16.4S +mul v6.4S, v6.4S,v20.4S +mla v6.4S, v21.4S, v31.s[0] +sub v21.4s, v9.4s, v6.4s +add v9.4s, v9.4s, v6.4s +ldr q6, [x17, #+1856] +ldr q3, [x17, #+1872] +sqrdmulh v18.4S, v11.4S, v2.4S +mul v11.4S, v11.4S,v22.4S +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +ldr q11, [x17, #+1888] +ldr q27, [x17, #+1904] +sqrdmulh v0.4S, v30.4S, v3.4S +ldr q19, [x0, #416] +mul v30.4S, v30.4S,v6.4S +mla v30.4S, v0.4S, v31.s[0] +sub v0.4s, v15.4s, v30.4s +add v15.4s, v15.4s, v30.4s +sqrdmulh v30.4S, v29.4S, v27.4S +ldr q14, [x0, #432] +mul v29.4S, v29.4S,v11.4S +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v8.4s, v29.4s +add v8.4s, v8.4s, v29.4s +str q9, [x0, #320] +str q21, [x0, #336] +str q13, [x0, #352] +str q18, [x0, #368] +str q15, [x0, #832] +str q0, [x0, #848] +str q8, [x0, #864] +str q30, [x0, #880] +ldr q27, [x17, #+896] +ldr q11, [x17, #+912] +ldr q3, [x0, #384] +ldr q6, [x0, #400] +ldr q7, [x17, #+1920] +ldr q28, [x17, #+1936] +sqrdmulh v4.4S, v19.4S, v11.s[0] +ldr q12, [x0, #928] +mul v19.4S, v19.4S,v27.s[0] +ldr q30, [x0, #944] +mla v19.4S, v4.4S, v31.s[0] +sub v4.4s, v3.4s, v19.4s +add 
v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v11.s[0] +ldr q8, [x0, #896] +mul v14.4S, v14.4S,v27.s[0] +ldr q0, [x0, #912] +mla v14.4S, v19.4S, v31.s[0] +sub v19.4s, v6.4s, v14.4s +add v6.4s, v6.4s, v14.4s +sqrdmulh v14.4S, v12.4S, v28.s[0] +mul v12.4S, v12.4S,v7.s[0] +mla v12.4S, v14.4S, v31.s[0] +sub v14.4s, v8.4s, v12.4s +add v8.4s, v8.4s, v12.4s +sqrdmulh v12.4S, v30.4S, v28.s[0] +mul v30.4S, v30.4S,v7.s[0] +mla v30.4S, v12.4S, v31.s[0] +sub v12.4s, v0.4s, v30.4s +add v0.4s, v0.4s, v30.4s +sqrdmulh v30.4S, v6.4S, v11.s[1] +mul v6.4S, v6.4S,v27.s[1] +mla v6.4S, v30.4S, v31.s[0] +sub v30.4s, v3.4s, v6.4s +add v3.4s, v3.4s, v6.4s +sqrdmulh v6.4S, v19.4S, v11.s[2] +mul v19.4S, v19.4S,v27.s[2] +mla v19.4S, v6.4S, v31.s[0] +sub v6.4s, v4.4s, v19.4s +add v4.4s, v4.4s, v19.4s +sqrdmulh v19.4S, v0.4S, v28.s[1] +mul v0.4S, v0.4S,v7.s[1] +mla v0.4S, v19.4S, v31.s[0] +sub v19.4s, v8.4s, v0.4s +trn1 v15.4S, v3.4S, v30.4S +trn2 v2.4S, v3.4S, v30.4S +add v8.4s, v8.4s, v0.4s +trn1 v0.4S, v4.4S, v6.4S +trn2 v22.4S, v4.4S, v6.4S +sqrdmulh v16.4S, v12.4S, v28.s[2] +ldr q20, [x17, #+928] +mul v12.4S, v12.4S,v7.s[2] +ldr q1, [x17, #+944] +mla v12.4S, v16.4S, v31.s[0] +trn2 v4.2D, v15.2D, v0.2D +trn2 v6.2D, v2.2D, v22.2D +sub v16.4s, v14.4s, v12.4s +trn1 v3.2D, v15.2D, v0.2D +trn1 v30.2D, v2.2D, v22.2D +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v4.4S, v1.4S +mul v4.4S, v4.4S,v20.4S +mla v4.4S, v12.4S, v31.s[0] +trn1 v12.4S, v8.4S, v19.4S +trn2 v22.4S, v8.4S, v19.4S +sub v2.4s, v3.4s, v4.4s +trn1 v0.4S, v14.4S, v16.4S +trn2 v15.4S, v14.4S, v16.4S +add v3.4s, v3.4s, v4.4s +trn2 v14.2D, v12.2D, v0.2D +trn2 v16.2D, v22.2D, v15.2D +sqrdmulh v4.4S, v6.4S, v1.4S +trn1 v8.2D, v12.2D, v0.2D +trn1 v19.2D, v22.2D, v15.2D +ldr q15, [x17, #+1952] +ldr q22, [x17, #+1968] +mul v6.4S, v6.4S,v20.4S +mla v6.4S, v4.4S, v31.s[0] +sub v4.4s, v30.4s, v6.4s +add v30.4s, v30.4s, v6.4s +sqrdmulh v6.4S, v14.4S, v22.4S +mul v14.4S, v14.4S,v15.4S +mla v14.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v14.4s +add 
v8.4s, v8.4s, v14.4s +ldr q14, [x17, #+960] +ldr q0, [x17, #+976] +sqrdmulh v12.4S, v16.4S, v22.4S +mul v16.4S, v16.4S,v15.4S +mla v16.4S, v12.4S, v31.s[0] +sub v12.4s, v19.4s, v16.4s +add v19.4s, v19.4s, v16.4s +ldr q16, [x17, #+992] +ldr q17, [x17, #+1008] +sqrdmulh v10.4S, v30.4S, v0.4S +mul v30.4S, v30.4S,v14.4S +mla v30.4S, v10.4S, v31.s[0] +sub v10.4s, v3.4s, v30.4s +add v3.4s, v3.4s, v30.4s +ldr q30, [x17, #+1984] +ldr q5, [x17, #+2000] +sqrdmulh v18.4S, v4.4S, v17.4S +mul v4.4S, v4.4S,v16.4S +mla v4.4S, v18.4S, v31.s[0] +sub v18.4s, v2.4s, v4.4s +add v2.4s, v2.4s, v4.4s +ldr q4, [x17, #+2016] +ldr q13, [x17, #+2032] +sqrdmulh v21.4S, v19.4S, v5.4S +ldr q9, [x0, #480] +mul v19.4S, v19.4S,v30.4S +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v8.4s, v19.4s +add v8.4s, v8.4s, v19.4s +sqrdmulh v19.4S, v12.4S, v13.4S +ldr q29, [x0, #496] +mul v12.4S, v12.4S,v4.4S +mla v12.4S, v19.4S, v31.s[0] +sub v19.4s, v6.4s, v12.4s +add v6.4s, v6.4s, v12.4s +str q3, [x0, #384] +str q10, [x0, #400] +str q2, [x0, #416] +str q18, [x0, #432] +str q8, [x0, #896] +str q21, [x0, #912] +str q6, [x0, #928] +str q19, [x0, #944] +ldr q13, [x17, #+1024] +ldr q4, [x17, #+1040] +ldr q5, [x0, #448] +ldr q30, [x0, #464] +ldr q22, [x17, #+2048] +ldr q15, [x17, #+2064] +sqrdmulh v28.4S, v9.4S, v4.s[0] +ldr q7, [x0, #992] +mul v9.4S, v9.4S,v13.s[0] +ldr q19, [x0, #1008] +mla v9.4S, v28.4S, v31.s[0] +sub v28.4s, v5.4s, v9.4s +add v5.4s, v5.4s, v9.4s +sqrdmulh v9.4S, v29.4S, v4.s[0] +ldr q6, [x0, #960] +mul v29.4S, v29.4S,v13.s[0] +ldr q21, [x0, #976] +mla v29.4S, v9.4S, v31.s[0] +sub v9.4s, v30.4s, v29.4s +add v30.4s, v30.4s, v29.4s +sqrdmulh v29.4S, v7.4S, v15.s[0] +mul v7.4S, v7.4S,v22.s[0] +mla v7.4S, v29.4S, v31.s[0] +sub v29.4s, v6.4s, v7.4s +add v6.4s, v6.4s, v7.4s +sqrdmulh v7.4S, v19.4S, v15.s[0] +mul v19.4S, v19.4S,v22.s[0] +mla v19.4S, v7.4S, v31.s[0] +sub v7.4s, v21.4s, v19.4s +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v30.4S, v4.s[1] +mul v30.4S, v30.4S,v13.s[1] +mla v30.4S, v19.4S, 
v31.s[0] +sub v19.4s, v5.4s, v30.4s +add v5.4s, v5.4s, v30.4s +sqrdmulh v30.4S, v9.4S, v4.s[2] +mul v9.4S, v9.4S,v13.s[2] +mla v9.4S, v30.4S, v31.s[0] +sub v30.4s, v28.4s, v9.4s +add v28.4s, v28.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v15.s[1] +mul v21.4S, v21.4S,v22.s[1] +mla v21.4S, v9.4S, v31.s[0] +sub v9.4s, v6.4s, v21.4s +trn1 v8.4S, v5.4S, v19.4S +trn2 v17.4S, v5.4S, v19.4S +add v6.4s, v6.4s, v21.4s +trn1 v21.4S, v28.4S, v30.4S +trn2 v16.4S, v28.4S, v30.4S +sqrdmulh v0.4S, v7.4S, v15.s[2] +ldr q14, [x17, #+1056] +mul v7.4S, v7.4S,v22.s[2] +ldr q1, [x17, #+1072] +mla v7.4S, v0.4S, v31.s[0] +trn2 v28.2D, v8.2D, v21.2D +trn2 v30.2D, v17.2D, v16.2D +sub v0.4s, v29.4s, v7.4s +trn1 v5.2D, v8.2D, v21.2D +trn1 v19.2D, v17.2D, v16.2D +add v29.4s, v29.4s, v7.4s +sqrdmulh v7.4S, v28.4S, v1.4S +mul v28.4S, v28.4S,v14.4S +mla v28.4S, v7.4S, v31.s[0] +trn1 v7.4S, v6.4S, v9.4S +trn2 v16.4S, v6.4S, v9.4S +sub v17.4s, v5.4s, v28.4s +trn1 v21.4S, v29.4S, v0.4S +trn2 v8.4S, v29.4S, v0.4S +add v5.4s, v5.4s, v28.4s +trn2 v29.2D, v7.2D, v21.2D +trn2 v0.2D, v16.2D, v8.2D +sqrdmulh v28.4S, v30.4S, v1.4S +trn1 v6.2D, v7.2D, v21.2D +trn1 v9.2D, v16.2D, v8.2D +ldr q8, [x17, #+2080] +ldr q16, [x17, #+2096] +mul v30.4S, v30.4S,v14.4S +mla v30.4S, v28.4S, v31.s[0] +sub v28.4s, v19.4s, v30.4s +add v19.4s, v19.4s, v30.4s +sqrdmulh v30.4S, v29.4S, v16.4S +mul v29.4S, v29.4S,v8.4S +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v6.4s, v29.4s +add v6.4s, v6.4s, v29.4s +ldr q29, [x17, #+1088] +ldr q21, [x17, #+1104] +sqrdmulh v7.4S, v0.4S, v16.4S +mul v0.4S, v0.4S,v8.4S +mla v0.4S, v7.4S, v31.s[0] +sub v7.4s, v9.4s, v0.4s +add v9.4s, v9.4s, v0.4s +ldr q0, [x17, #+1120] +ldr q20, [x17, #+1136] +sqrdmulh v11.4S, v19.4S, v21.4S +mul v19.4S, v19.4S,v29.4S +mla v19.4S, v11.4S, v31.s[0] +sub v11.4s, v5.4s, v19.4s +add v5.4s, v5.4s, v19.4s +ldr q19, [x17, #+2112] +ldr q27, [x17, #+2128] +sqrdmulh v18.4S, v28.4S, v20.4S +mul v28.4S, v28.4S,v0.4S +mla v28.4S, v18.4S, v31.s[0] +sub v18.4s, v17.4s, v28.4s +add 
v17.4s, v17.4s, v28.4s +ldr q28, [x17, #+2144] +ldr q2, [x17, #+2160] +sqrdmulh v10.4S, v9.4S, v27.4S +mul v9.4S, v9.4S,v19.4S +mla v9.4S, v10.4S, v31.s[0] +sub v10.4s, v6.4s, v9.4s +add v6.4s, v6.4s, v9.4s +sqrdmulh v9.4S, v7.4S, v2.4S +mul v7.4S, v7.4S,v28.4S +mla v7.4S, v9.4S, v31.s[0] +sub v9.4s, v30.4s, v7.4s +add v30.4s, v30.4s, v7.4s +str q5, [x0, #448] +str q11, [x0, #464] +str q17, [x0, #480] +str q18, [x0, #496] +str q6, [x0, #960] +str q10, [x0, #976] +str q30, [x0, #992] +str q9, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z2_5.s b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z2_5.s new file mode 100644 index 0000000..64b3010 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z2_5.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. 
+/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None 
+.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 
7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 
// Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 
+.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // 
Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, 
block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 
839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 +.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, 
block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 
104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 
// Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_3_z2_5 +.global _ntt_u32_full_neon_asm_var_4_4_3_z2_5 +ntt_u32_full_neon_asm_var_4_4_3_z2_5: +_ntt_u32_full_neon_asm_var_4_4_3_z2_5: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +sqrdmulh 
v2.4S, v22.4S, v29.s[0] +ldr q1, [x0, #544] +mul v22.4S, v22.4S,v30.s[0] +ldr q0, [x0, #608] +sqrdmulh v15.4S, v21.4S, v29.s[0] +ldr q14, [x0, #672] +mul v21.4S, v21.4S,v30.s[0] +ldr q13, [x0, #736] +mla v22.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q12, [x0, #32] +sub v11.4s, v18.4s, v22.4s +mla v21.4S, v15.4S, v31.s[0] +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q15, [x0, #96] +sub v10.4s, v17.4s, v21.4s +mla v20.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v29.s[0] +ldr q2, [x0, #160] +mul v1.4S, v1.4S,v30.s[0] +sub v9.4s, v16.4s, v20.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v29.s[0] +ldr q22, [x0, #224] +mul v0.4S, v0.4S,v30.s[0] +sub v8.4s, v3.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v12.4s, v1.4s +mla v0.4S, v20.4S, v31.s[0] +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +sub v20.4s, v15.4s, v0.4s +mla v14.4S, v19.4S, v31.s[0] +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v19.4s, v2.4s, v14.4s +mla v13.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v1.4s, v22.4s, v13.4s +mla v16.4S, v0.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v0.4s, v2.4s, v16.4s +mla v3.4S, v14.4S, v31.s[0] +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v22.4s, v3.4s +mla v18.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v13.4s, v12.4s, v18.4s +mla v17.4S, v16.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v16.4s, v15.4s, 
v17.4s +mla v9.4S, v3.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v3.4s, v19.4s, v9.4s +mla v8.4S, v18.4S, v31.s[0] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v8.4s +mla v11.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v17.4s, v21.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +sub v9.4s, v20.4s, v10.4s +mla v2.4S, v8.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v8.4s, v12.4s, v2.4s +mla v22.4S, v11.4S, v31.s[0] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v27.s[1] +mul v14.4S, v14.4S,v28.s[1] +sub v11.4s, v15.4s, v22.4s +mla v0.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v28.s[2] +sub v10.4s, v13.4s, v0.4s +mla v14.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v2.4s, v16.4s, v14.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +sub v22.4s, v21.4s, v19.4s +mla v1.4S, v0.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v0.4s, v20.4s, v1.4s +mla v3.4S, v14.4S, v31.s[0] +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +sub v14.4s, v17.4s, v3.4s +mla v18.4S, v19.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v25.s[1] +mul v11.4S, v11.4S,v26.s[1] +sub v19.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v1.4s, v12.4s, v15.4s +mla v11.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v25.s[3] +mul 
v2.4S, v2.4S,v26.s[3] +sub v3.4s, v8.4s, v11.4s +mla v16.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v11.4s +str q12, [x0, #32] +sqrdmulh v12.4S, v20.4S, v23.s[0] +str q1, [x0, #96] +mul v20.4S, v20.4S,v24.s[0] +ldr q1, [x0, #816] +sub v11.4s, v13.4s, v16.4s +ldr q18, [x0, #880] +mla v2.4S, v15.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +str q8, [x0, #160] +sqrdmulh v8.4S, v0.4S, v23.s[1] +str q3, [x0, #224] +mul v0.4S, v0.4S,v24.s[1] +ldr q3, [x0, #944] +sub v16.4s, v10.4s, v2.4s +ldr q15, [x0, #1008] +mla v20.4S, v12.4S, v31.s[0] +add v10.4s, v10.4s, v2.4s +str q13, [x0, #288] +sqrdmulh v13.4S, v9.4S, v23.s[2] +str q11, [x0, #352] +mul v9.4S, v9.4S,v24.s[2] +ldr q11, [x0, #304] +sub v2.4s, v21.4s, v20.4s +ldr q12, [x0, #368] +mla v0.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v20.4s +str q10, [x0, #416] +sqrdmulh v10.4S, v19.4S, v23.s[3] +str q16, [x0, #480] +mul v19.4S, v19.4S,v24.s[3] +ldr q16, [x0, #432] +sub v20.4s, v22.4s, v0.4s +ldr q8, [x0, #496] +mla v9.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +str q21, [x0, #544] +sqrdmulh v21.4S, v1.4S, v29.s[0] +str q2, [x0, #608] +ldr q2, [x0, #560] +mul v1.4S, v1.4S,v30.s[0] +ldr q0, [x0, #624] +sub v13.4s, v17.4s, v9.4s +mla v19.4S, v10.4S, v31.s[0] +add v17.4s, v17.4s, v9.4s +str q22, [x0, #672] +sqrdmulh v22.4S, v18.4S, v29.s[0] +str q20, [x0, #736] +ldr q20, [x0, #688] +mul v18.4S, v18.4S,v30.s[0] +ldr q9, [x0, #752] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +str q17, [x0, #800] +sqrdmulh v17.4S, v3.4S, v29.s[0] +str q13, [x0, #864] +mul v3.4S, v3.4S,v30.s[0] +ldr q13, [x0, #48] +sub v19.4s, v11.4s, v1.4s +mla v18.4S, v22.4S, v31.s[0] +add v11.4s, v11.4s, v1.4s +str q14, [x0, #928] +sqrdmulh v14.4S, v15.4S, v29.s[0] +str q10, [x0, #992] +mul v15.4S, v15.4S,v30.s[0] +ldr q10, [x0, #112] +sub v1.4s, v12.4s, v18.4s +mla v3.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v29.s[0] +ldr q17, [x0, #176] +mul v2.4S, v2.4S,v30.s[0] +sub v22.4s, v16.4s, 
v3.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v29.s[0] +ldr q14, [x0, #240] +mul v0.4S, v0.4S,v30.s[0] +sub v21.4s, v8.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v18.4s, v13.4s, v2.4s +mla v0.4S, v3.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +sub v3.4s, v10.4s, v0.4s +mla v20.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v15.4s, v17.4s, v20.4s +mla v9.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v2.4s, v14.4s, v9.4s +mla v16.4S, v0.4S, v31.s[0] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v29.s[1] +mul v11.4S, v11.4S,v30.s[1] +sub v0.4s, v17.4s, v16.4s +mla v8.4S, v20.4S, v31.s[0] +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v29.s[1] +mul v12.4S, v12.4S,v30.s[1] +sub v20.4s, v14.4s, v8.4s +mla v11.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v9.4s, v13.4s, v11.4s +mla v12.4S, v16.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v16.4s, v10.4s, v12.4s +mla v22.4S, v8.4S, v31.s[0] +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v8.4s, v15.4s, v22.4s +mla v21.4S, v11.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +sub v11.4s, v2.4s, v21.4s +mla v19.4S, v12.4S, v31.s[0] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +sub v12.4s, v18.4s, v19.4s +mla v1.4S, v22.4S, v31.s[0] +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v22.4s, v3.4s, v1.4s +mla v17.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v1.4s +sqrdmulh v1.4S, 
v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v21.4s, v13.4s, v17.4s +mla v14.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +sub v19.4s, v10.4s, v14.4s +mla v0.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v27.s[2] +mul v15.4S, v15.4S,v28.s[2] +sub v1.4s, v9.4s, v0.4s +mla v20.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v17.4s, v16.4s, v20.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v14.4s, v18.4s, v15.4s +mla v2.4S, v0.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[3] +mul v11.4S, v11.4S,v28.s[3] +sub v0.4s, v3.4s, v2.4s +mla v8.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +sub v20.4s, v12.4s, v8.4s +mla v11.4S, v15.4S, v31.s[0] +add v12.4s, v12.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v15.4s, v22.4s, v11.4s +mla v10.4S, v2.4S, v31.s[0] +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v2.4s, v13.4s, v10.4s +mla v19.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v25.s[3] +mul v17.4S, v17.4S,v26.s[3] +sub v8.4s, v21.4s, v19.4s +mla v16.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +str q13, [x0, #48] +sqrdmulh v13.4S, v3.4S, v23.s[0] +str q2, [x0, #112] +mul v3.4S, v3.4S,v24.s[0] +ldr q2, [x0, #768] +sub v19.4s, v9.4s, v16.4s +ldr q11, [x0, #832] +mla v17.4S, v10.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +str q21, [x0, #176] +sqrdmulh v21.4S, v0.4S, v23.s[1] +str q8, [x0, #240] +mul v0.4S, v0.4S,v24.s[1] +ldr q8, [x0, #896] +sub v16.4s, v1.4s, v17.4s +ldr q10, [x0, #960] +mla v3.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +str q9, [x0, #304] +sqrdmulh v9.4S, v22.4S, v23.s[2] +str q19, [x0, #368] +mul v22.4S, 
v22.4S,v24.s[2] +ldr q19, [x0, #256] +sub v17.4s, v18.4s, v3.4s +ldr q13, [x0, #320] +mla v0.4S, v21.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +str q1, [x0, #432] +sqrdmulh v1.4S, v15.4S, v23.s[3] +str q16, [x0, #496] +mul v15.4S, v15.4S,v24.s[3] +ldr q16, [x0, #384] +sub v3.4s, v14.4s, v0.4s +ldr q21, [x0, #448] +mla v22.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +str q18, [x0, #560] +sqrdmulh v18.4S, v2.4S, v29.s[0] +str q17, [x0, #624] +ldr q17, [x0, #512] +mul v2.4S, v2.4S,v30.s[0] +ldr q0, [x0, #576] +sub v9.4s, v12.4s, v22.4s +mla v15.4S, v1.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s +str q14, [x0, #688] +sqrdmulh v14.4S, v11.4S, v29.s[0] +str q3, [x0, #752] +ldr q3, [x0, #640] +mul v11.4S, v11.4S,v30.s[0] +ldr q22, [x0, #704] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +str q12, [x0, #816] +sqrdmulh v12.4S, v8.4S, v29.s[0] +str q9, [x0, #880] +mul v8.4S, v8.4S,v30.s[0] +ldr q9, [x0, #0] +sub v15.4s, v19.4s, v2.4s +mla v11.4S, v14.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +str q20, [x0, #944] +sqrdmulh v20.4S, v10.4S, v29.s[0] +str q1, [x0, #1008] +mul v10.4S, v10.4S,v30.s[0] +ldr q1, [x0, #64] +sub v2.4s, v13.4s, v11.4s +mla v8.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v29.s[0] +ldr q12, [x0, #128] +mul v17.4S, v17.4S,v30.s[0] +sub v14.4s, v16.4s, v8.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v29.s[0] +ldr q20, [x0, #192] +mul v0.4S, v0.4S,v30.s[0] +sub v18.4s, v21.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +sub v11.4s, v9.4s, v17.4s +mla v0.4S, v8.4S, v31.s[0] +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v8.4s, v1.4s, v0.4s +mla v3.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v10.4s, v12.4s, v3.4s +mla v22.4S, v17.4S, v31.s[0] +add v12.4s, 
v12.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v17.4s, v20.4s, v22.4s +mla v16.4S, v0.4S, v31.s[0] +add v20.4s, v20.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +sub v0.4s, v12.4s, v16.4s +mla v21.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v3.4s, v20.4s, v21.4s +mla v19.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v22.4s, v9.4s, v19.4s +mla v13.4S, v16.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v16.4s, v1.4s, v13.4s +mla v14.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v21.4s, v10.4s, v14.4s +mla v18.4S, v19.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v19.4s, v17.4s, v18.4s +mla v15.4S, v13.4S, v31.s[0] +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v13.4s, v11.4s, v15.4s +mla v2.4S, v14.4S, v31.s[0] +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v27.s[0] +mul v20.4S, v20.4S,v28.s[0] +sub v14.4s, v8.4s, v2.4s +mla v12.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v18.4s, v9.4s, v12.4s +mla v20.4S, v15.4S, v31.s[0] +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +sub v15.4s, v1.4s, v20.4s +mla v0.4S, v2.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +sub v2.4s, v22.4s, v0.4s +mla v3.4S, v12.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v12.4s, v16.4s, v3.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, 
v11.4s, v10.4s +mla v17.4S, v0.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v27.s[3] +mul v19.4S, v19.4S,v28.s[3] +sub v0.4s, v8.4s, v17.4s +mla v21.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v17.4s +sqrdmulh v17.4S, v1.4S, v25.s[0] +mul v1.4S, v1.4S,v26.s[0] +sub v3.4s, v13.4s, v21.4s +mla v19.4S, v10.4S, v31.s[0] +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v25.s[1] +mul v15.4S, v15.4S,v26.s[1] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v17.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v17.4s, v9.4s, v1.4s +mla v15.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +sub v21.4s, v18.4s, v15.4s +mla v16.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +str q9, [x0, #0] +sqrdmulh v9.4S, v8.4S, v23.s[0] +str q17, [x0, #64] +mul v8.4S, v8.4S,v24.s[0] +ldr q17, [x0, #784] +sub v15.4s, v22.4s, v16.4s +ldr q19, [x0, #848] +mla v12.4S, v1.4S, v31.s[0] +add v22.4s, v22.4s, v16.4s +str q18, [x0, #128] +sqrdmulh v18.4S, v0.4S, v23.s[1] +str q21, [x0, #192] +mul v0.4S, v0.4S,v24.s[1] +ldr q21, [x0, #912] +sub v16.4s, v2.4s, v12.4s +ldr q1, [x0, #976] +mla v8.4S, v9.4S, v31.s[0] +add v2.4s, v2.4s, v12.4s +str q22, [x0, #256] +sqrdmulh v22.4S, v14.4S, v23.s[2] +str q15, [x0, #320] +mul v14.4S, v14.4S,v24.s[2] +ldr q15, [x0, #272] +sub v12.4s, v11.4s, v8.4s +ldr q9, [x0, #336] +mla v0.4S, v18.4S, v31.s[0] +add v11.4s, v11.4s, v8.4s +str q2, [x0, #384] +sqrdmulh v2.4S, v10.4S, v23.s[3] +str q16, [x0, #448] +mul v10.4S, v10.4S,v24.s[3] +ldr q16, [x0, #400] +sub v8.4s, v20.4s, v0.4s +ldr q18, [x0, #464] +mla v14.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v0.4s +str q11, [x0, #512] +sqrdmulh v11.4S, v17.4S, v29.s[0] +str q12, [x0, #576] +ldr q12, [x0, #528] +mul v17.4S, v17.4S,v30.s[0] +ldr q0, [x0, #592] +sub v22.4s, v13.4s, v14.4s +mla v10.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v14.4s +str q20, [x0, #640] +sqrdmulh v20.4S, v19.4S, 
v29.s[0] +str q8, [x0, #704] +ldr q8, [x0, #656] +mul v19.4S, v19.4S,v30.s[0] +ldr q14, [x0, #720] +sub v2.4s, v3.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v3.4s, v3.4s, v10.4s +str q13, [x0, #768] +sqrdmulh v13.4S, v21.4S, v29.s[0] +str q22, [x0, #832] +mul v21.4S, v21.4S,v30.s[0] +ldr q22, [x0, #16] +sub v10.4s, v15.4s, v17.4s +mla v19.4S, v20.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +str q3, [x0, #896] +sqrdmulh v3.4S, v1.4S, v29.s[0] +str q2, [x0, #960] +mul v1.4S, v1.4S,v30.s[0] +ldr q2, [x0, #80] +sub v17.4s, v9.4s, v19.4s +mla v21.4S, v13.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v12.4S, v29.s[0] +ldr q13, [x0, #144] +mul v12.4S, v12.4S,v30.s[0] +sub v20.4s, v16.4s, v21.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v29.s[0] +ldr q3, [x0, #208] +mul v0.4S, v0.4S,v30.s[0] +sub v11.4s, v18.4s, v1.4s +mla v12.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v19.4s, v22.4s, v12.4s +mla v0.4S, v21.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v2.4s, v0.4s +mla v8.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v1.4s, v13.4s, v8.4s +mla v14.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v12.4s, v3.4s, v14.4s +mla v16.4S, v0.4S, v31.s[0] +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +sub v0.4s, v13.4s, v16.4s +mla v18.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v9.4S, v29.s[1] +mul v9.4S, v9.4S,v30.s[1] +sub v8.4s, v3.4s, v18.4s +mla v15.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v14.4s, v22.4s, v15.4s +mla v9.4S, v16.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v29.s[2] +mul v11.4S, 
v11.4S,v30.s[2] +sub v16.4s, v2.4s, v9.4s +mla v20.4S, v18.4S, v31.s[0] +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v20.4s +mla v11.4S, v15.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v15.4s, v12.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +sub v9.4s, v19.4s, v10.4s +mla v17.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v27.s[0] +mul v3.4S, v3.4S,v28.s[0] +sub v20.4s, v21.4s, v17.4s +mla v13.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v11.4s, v22.4s, v13.4s +mla v3.4S, v10.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v10.4s, v2.4s, v3.4s +mla v0.4S, v17.4S, v31.s[0] +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v17.4s, v14.4s, v0.4s +mla v8.4S, v13.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +sub v13.4s, v16.4s, v8.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v3.4s, v19.4s, v1.4s +mla v12.4S, v0.4S, v31.s[0] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v0.4s, v21.4s, v12.4s +mla v18.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v25.s[1] +mul v10.4S, v10.4S,v26.s[1] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v12.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v12.4s, v22.4s, v2.4s +mla v10.4S, v18.4S, v31.s[0] +add v22.4s, v22.4s, v2.4s 
+sqrdmulh v2.4S, v13.4S, v25.s[3] +mul v13.4S, v13.4S,v26.s[3] +sub v18.4s, v11.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +str q22, [x0, #16] +sqrdmulh v22.4S, v21.4S, v23.s[0] +str q12, [x0, #80] +mul v21.4S, v21.4S,v24.s[0] +sub v12.4s, v14.4s, v16.4s +mla v13.4S, v2.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +str q11, [x0, #144] +sqrdmulh v11.4S, v0.4S, v23.s[1] +str q18, [x0, #208] +mul v0.4S, v0.4S,v24.s[1] +sub v18.4s, v17.4s, v13.4s +mla v21.4S, v22.4S, v31.s[0] +add v17.4s, v17.4s, v13.4s +str q14, [x0, #272] +sqrdmulh v14.4S, v20.4S, v23.s[2] +str q12, [x0, #336] +mul v20.4S, v20.4S,v24.s[2] +sub v12.4s, v19.4s, v21.4s +mla v0.4S, v11.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +str q17, [x0, #400] +sqrdmulh v17.4S, v1.4S, v23.s[3] +str q18, [x0, #464] +mul v1.4S, v1.4S,v24.s[3] +sub v18.4s, v3.4s, v0.4s +mla v20.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v0.4s +str q19, [x0, #528] +str q12, [x0, #592] +sub v12.4s, v9.4s, v20.4s +mla v1.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v20.4s +str q3, [x0, #656] +str q18, [x0, #720] +sub v18.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +str q9, [x0, #784] +str q12, [x0, #848] +str q8, [x0, #912] +str q18, [x0, #976] +ldr q4, [x0, #32] +ldr q5, [x0, #48] +ldr q6, [x17, #+128] +ldr q7, [x17, #+144] +ldr q15, [x0, #0] +ldr q10, [x0, #16] +ldr q2, [x17, #+1152] +ldr q16, [x17, #+1168] +sqrdmulh v22.4S, v4.4S, v7.s[0] +ldr q13, [x0, #544] +mul v4.4S, v4.4S,v6.s[0] +ldr q11, [x0, #560] +sqrdmulh v21.4S, v5.4S, v7.s[0] +ldr q14, [x0, #512] +mul v5.4S, v5.4S,v6.s[0] +ldr q0, [x0, #528] +mla v4.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v13.4S, v16.s[0] +mul v13.4S, v13.4S,v2.s[0] +mla v5.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v4.4s +add v15.4s, v15.4s, v4.4s +sqrdmulh v4.4S, v11.4S, v16.s[0] +mul v11.4S, v11.4S,v2.s[0] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v5.4s +add v10.4s, v10.4s, v5.4s +sqrdmulh v5.4S, v10.4S, v7.s[1] +mul v10.4S, v10.4S,v6.s[1] +mla v11.4S, v4.4S, v31.s[0] +sub v4.4s, v14.4s, 
v13.4s +add v14.4s, v14.4s, v13.4s +sqrdmulh v13.4S, v22.4S, v7.s[2] +mul v22.4S, v22.4S,v6.s[2] +mla v10.4S, v5.4S, v31.s[0] +sub v5.4s, v0.4s, v11.4s +add v0.4s, v0.4s, v11.4s +sqrdmulh v11.4S, v0.4S, v16.s[1] +mul v0.4S, v0.4S,v2.s[1] +mla v22.4S, v13.4S, v31.s[0] +sub v13.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +sqrdmulh v10.4S, v5.4S, v16.s[2] +mul v5.4S, v5.4S,v2.s[2] +mla v0.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +trn1 v22.4S, v15.4S, v13.4S +trn2 v19.4S, v15.4S, v13.4S +trn1 v17.4S, v21.4S, v11.4S +trn2 v20.4S, v21.4S, v11.4S +trn2 v21.2D, v22.2D, v17.2D +trn2 v11.2D, v19.2D, v20.2D +trn1 v15.2D, v22.2D, v17.2D +trn1 v13.2D, v19.2D, v20.2D +ldr q20, [x17, #+160] +ldr q19, [x17, #+176] +sqrdmulh v17.4S, v21.4S, v19.4S +mul v21.4S, v21.4S,v20.4S +mla v5.4S, v10.4S, v31.s[0] +sub v10.4s, v14.4s, v0.4s +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v11.4S, v19.4S +mul v11.4S, v11.4S,v20.4S +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v4.4s, v5.4s +add v4.4s, v4.4s, v5.4s +trn1 v5.4S, v14.4S, v10.4S +trn2 v22.4S, v14.4S, v10.4S +trn1 v3.4S, v4.4S, v17.4S +trn2 v1.4S, v4.4S, v17.4S +trn2 v4.2D, v5.2D, v3.2D +trn2 v17.2D, v22.2D, v1.2D +trn1 v14.2D, v5.2D, v3.2D +trn1 v10.2D, v22.2D, v1.2D +ldr q1, [x17, #+1184] +ldr q22, [x17, #+1200] +sqrdmulh v3.4S, v4.4S, v22.4S +ldr q5, [x17, #+192] +ldr q9, [x17, #+208] +mul v4.4S, v4.4S,v1.4S +mla v11.4S, v0.4S, v31.s[0] +sub v0.4s, v15.4s, v21.4s +add v15.4s, v15.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v22.4S +ldr q12, [x17, #+224] +ldr q8, [x17, #+240] +mul v17.4S, v17.4S,v1.4S +mla v4.4S, v3.4S, v31.s[0] +sub v3.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v9.4S +ldr q18, [x17, #+1216] +ldr q30, [x17, #+1232] +mul v13.4S, v13.4S,v5.4S +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v4.4s +add v14.4s, v14.4s, v4.4s +sqrdmulh v4.4S, v3.4S, v8.4S +ldr q29, [x17, #+1248] +ldr q28, [x17, #+1264] +mul v3.4S, v3.4S,v12.4S +mla v13.4S, v11.4S, v31.s[0] +sub 
v11.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v10.4S, v30.4S +ldr q27, [x0, #96] +mul v10.4S, v10.4S,v18.4S +mla v3.4S, v4.4S, v31.s[0] +sub v4.4s, v15.4s, v13.4s +add v15.4s, v15.4s, v13.4s +sqrdmulh v13.4S, v11.4S, v28.4S +ldr q26, [x0, #112] +mul v11.4S, v11.4S,v29.4S +mla v10.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v3.4s +add v0.4s, v0.4s, v3.4s +mla v11.4S, v13.4S, v31.s[0] +sub v13.4s, v14.4s, v10.4s +add v14.4s, v14.4s, v10.4s +sub v10.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +str q15, [x0, #0] +str q4, [x0, #16] +str q0, [x0, #32] +str q17, [x0, #48] +str q14, [x0, #512] +str q13, [x0, #528] +str q21, [x0, #544] +str q10, [x0, #560] +ldr q28, [x17, #+256] +ldr q29, [x17, #+272] +ldr q30, [x0, #64] +ldr q18, [x0, #80] +ldr q22, [x17, #+1280] +ldr q1, [x17, #+1296] +sqrdmulh v16.4S, v27.4S, v29.s[0] +ldr q2, [x0, #608] +mul v27.4S, v27.4S,v28.s[0] +ldr q10, [x0, #624] +sqrdmulh v21.4S, v26.4S, v29.s[0] +ldr q13, [x0, #576] +mul v26.4S, v26.4S,v28.s[0] +ldr q14, [x0, #592] +mla v27.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v2.4S, v1.s[0] +mul v2.4S, v2.4S,v22.s[0] +mla v26.4S, v21.4S, v31.s[0] +sub v21.4s, v30.4s, v27.4s +add v30.4s, v30.4s, v27.4s +sqrdmulh v27.4S, v10.4S, v1.s[0] +mul v10.4S, v10.4S,v22.s[0] +mla v2.4S, v16.4S, v31.s[0] +sub v16.4s, v18.4s, v26.4s +add v18.4s, v18.4s, v26.4s +sqrdmulh v26.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v28.s[1] +mla v10.4S, v27.4S, v31.s[0] +sub v27.4s, v13.4s, v2.4s +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v28.s[2] +mla v18.4S, v26.4S, v31.s[0] +sub v26.4s, v14.4s, v10.4s +add v14.4s, v14.4s, v10.4s +sqrdmulh v10.4S, v14.4S, v1.s[1] +mul v14.4S, v14.4S,v22.s[1] +mla v16.4S, v2.4S, v31.s[0] +sub v2.4s, v30.4s, v18.4s +add v30.4s, v30.4s, v18.4s +sqrdmulh v18.4S, v26.4S, v1.s[2] +mul v26.4S, v26.4S,v22.s[2] +mla v14.4S, v10.4S, v31.s[0] +sub v10.4s, v21.4s, v16.4s +add v21.4s, v21.4s, v16.4s +trn1 v16.4S, v30.4S, v2.4S +trn2 v8.4S, v30.4S, v2.4S 
+trn1 v12.4S, v21.4S, v10.4S +trn2 v9.4S, v21.4S, v10.4S +trn2 v21.2D, v16.2D, v12.2D +trn2 v10.2D, v8.2D, v9.2D +trn1 v30.2D, v16.2D, v12.2D +trn1 v2.2D, v8.2D, v9.2D +ldr q9, [x17, #+288] +ldr q8, [x17, #+304] +sqrdmulh v12.4S, v21.4S, v8.4S +mul v21.4S, v21.4S,v9.4S +mla v26.4S, v18.4S, v31.s[0] +sub v18.4s, v13.4s, v14.4s +add v13.4s, v13.4s, v14.4s +sqrdmulh v14.4S, v10.4S, v8.4S +mul v10.4S, v10.4S,v9.4S +mla v21.4S, v12.4S, v31.s[0] +sub v12.4s, v27.4s, v26.4s +add v27.4s, v27.4s, v26.4s +trn1 v26.4S, v13.4S, v18.4S +trn2 v16.4S, v13.4S, v18.4S +trn1 v5.4S, v27.4S, v12.4S +trn2 v19.4S, v27.4S, v12.4S +trn2 v27.2D, v26.2D, v5.2D +trn2 v12.2D, v16.2D, v19.2D +trn1 v13.2D, v26.2D, v5.2D +trn1 v18.2D, v16.2D, v19.2D +ldr q19, [x17, #+1312] +ldr q16, [x17, #+1328] +sqrdmulh v5.4S, v27.4S, v16.4S +ldr q26, [x17, #+320] +ldr q20, [x17, #+336] +mul v27.4S, v27.4S,v19.4S +mla v10.4S, v14.4S, v31.s[0] +sub v14.4s, v30.4s, v21.4s +add v30.4s, v30.4s, v21.4s +sqrdmulh v21.4S, v12.4S, v16.4S +ldr q7, [x17, #+352] +ldr q6, [x17, #+368] +mul v12.4S, v12.4S,v19.4S +mla v27.4S, v5.4S, v31.s[0] +sub v5.4s, v2.4s, v10.4s +add v2.4s, v2.4s, v10.4s +sqrdmulh v10.4S, v2.4S, v20.4S +ldr q17, [x17, #+1344] +ldr q0, [x17, #+1360] +mul v2.4S, v2.4S,v26.4S +mla v12.4S, v21.4S, v31.s[0] +sub v21.4s, v13.4s, v27.4s +add v13.4s, v13.4s, v27.4s +sqrdmulh v27.4S, v5.4S, v6.4S +ldr q4, [x17, #+1376] +ldr q15, [x17, #+1392] +mul v5.4S, v5.4S,v7.4S +mla v2.4S, v10.4S, v31.s[0] +sub v10.4s, v18.4s, v12.4s +add v18.4s, v18.4s, v12.4s +sqrdmulh v12.4S, v18.4S, v0.4S +ldr q11, [x0, #160] +mul v18.4S, v18.4S,v17.4S +mla v5.4S, v27.4S, v31.s[0] +sub v27.4s, v30.4s, v2.4s +add v30.4s, v30.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v15.4S +ldr q3, [x0, #176] +mul v10.4S, v10.4S,v4.4S +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v14.4s, v5.4s +add v14.4s, v14.4s, v5.4s +mla v10.4S, v2.4S, v31.s[0] +sub v2.4s, v13.4s, v18.4s +add v13.4s, v13.4s, v18.4s +sub v18.4s, v21.4s, v10.4s +add v21.4s, v21.4s, v10.4s +str 
q30, [x0, #64] +str q27, [x0, #80] +str q14, [x0, #96] +str q12, [x0, #112] +str q13, [x0, #576] +str q2, [x0, #592] +str q21, [x0, #608] +str q18, [x0, #624] +ldr q15, [x17, #+384] +ldr q4, [x17, #+400] +ldr q0, [x0, #128] +ldr q17, [x0, #144] +ldr q16, [x17, #+1408] +ldr q19, [x17, #+1424] +sqrdmulh v1.4S, v11.4S, v4.s[0] +ldr q22, [x0, #672] +mul v11.4S, v11.4S,v15.s[0] +ldr q18, [x0, #688] +sqrdmulh v21.4S, v3.4S, v4.s[0] +ldr q2, [x0, #640] +mul v3.4S, v3.4S,v15.s[0] +ldr q13, [x0, #656] +mla v11.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v22.4S, v19.s[0] +mul v22.4S, v22.4S,v16.s[0] +mla v3.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v11.4s +add v0.4s, v0.4s, v11.4s +sqrdmulh v11.4S, v18.4S, v19.s[0] +mul v18.4S, v18.4S,v16.s[0] +mla v22.4S, v1.4S, v31.s[0] +sub v1.4s, v17.4s, v3.4s +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v17.4S, v4.s[1] +mul v17.4S, v17.4S,v15.s[1] +mla v18.4S, v11.4S, v31.s[0] +sub v11.4s, v2.4s, v22.4s +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v4.s[2] +mul v1.4S, v1.4S,v15.s[2] +mla v17.4S, v3.4S, v31.s[0] +sub v3.4s, v13.4s, v18.4s +add v13.4s, v13.4s, v18.4s +sqrdmulh v18.4S, v13.4S, v19.s[1] +mul v13.4S, v13.4S,v16.s[1] +mla v1.4S, v22.4S, v31.s[0] +sub v22.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v19.s[2] +mul v3.4S, v3.4S,v16.s[2] +mla v13.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v1.4s +add v21.4s, v21.4s, v1.4s +trn1 v1.4S, v0.4S, v22.4S +trn2 v6.4S, v0.4S, v22.4S +trn1 v7.4S, v21.4S, v18.4S +trn2 v20.4S, v21.4S, v18.4S +trn2 v21.2D, v1.2D, v7.2D +trn2 v18.2D, v6.2D, v20.2D +trn1 v0.2D, v1.2D, v7.2D +trn1 v22.2D, v6.2D, v20.2D +ldr q20, [x17, #+416] +ldr q6, [x17, #+432] +sqrdmulh v7.4S, v21.4S, v6.4S +mul v21.4S, v21.4S,v20.4S +mla v3.4S, v17.4S, v31.s[0] +sub v17.4s, v2.4s, v13.4s +add v2.4s, v2.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v6.4S +mul v18.4S, v18.4S,v20.4S +mla v21.4S, v7.4S, v31.s[0] +sub v7.4s, v11.4s, v3.4s +add v11.4s, v11.4s, v3.4s +trn1 v3.4S, v2.4S, v17.4S +trn2 v1.4S, v2.4S, v17.4S 
+trn1 v26.4S, v11.4S, v7.4S +trn2 v8.4S, v11.4S, v7.4S +trn2 v11.2D, v3.2D, v26.2D +trn2 v7.2D, v1.2D, v8.2D +trn1 v2.2D, v3.2D, v26.2D +trn1 v17.2D, v1.2D, v8.2D +ldr q8, [x17, #+1440] +ldr q1, [x17, #+1456] +sqrdmulh v26.4S, v11.4S, v1.4S +ldr q3, [x17, #+448] +ldr q9, [x17, #+464] +mul v11.4S, v11.4S,v8.4S +mla v18.4S, v13.4S, v31.s[0] +sub v13.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +sqrdmulh v21.4S, v7.4S, v1.4S +ldr q29, [x17, #+480] +ldr q28, [x17, #+496] +mul v7.4S, v7.4S,v8.4S +mla v11.4S, v26.4S, v31.s[0] +sub v26.4s, v22.4s, v18.4s +add v22.4s, v22.4s, v18.4s +sqrdmulh v18.4S, v22.4S, v9.4S +ldr q12, [x17, #+1472] +ldr q14, [x17, #+1488] +mul v22.4S, v22.4S,v3.4S +mla v7.4S, v21.4S, v31.s[0] +sub v21.4s, v2.4s, v11.4s +add v2.4s, v2.4s, v11.4s +sqrdmulh v11.4S, v26.4S, v28.4S +ldr q27, [x17, #+1504] +ldr q30, [x17, #+1520] +mul v26.4S, v26.4S,v29.4S +mla v22.4S, v18.4S, v31.s[0] +sub v18.4s, v17.4s, v7.4s +add v17.4s, v17.4s, v7.4s +sqrdmulh v7.4S, v17.4S, v14.4S +ldr q10, [x0, #224] +mul v17.4S, v17.4S,v12.4S +mla v26.4S, v11.4S, v31.s[0] +sub v11.4s, v0.4s, v22.4s +add v0.4s, v0.4s, v22.4s +sqrdmulh v22.4S, v18.4S, v30.4S +ldr q5, [x0, #240] +mul v18.4S, v18.4S,v27.4S +mla v17.4S, v7.4S, v31.s[0] +sub v7.4s, v13.4s, v26.4s +add v13.4s, v13.4s, v26.4s +mla v18.4S, v22.4S, v31.s[0] +sub v22.4s, v2.4s, v17.4s +add v2.4s, v2.4s, v17.4s +sub v17.4s, v21.4s, v18.4s +add v21.4s, v21.4s, v18.4s +str q0, [x0, #128] +str q11, [x0, #144] +str q13, [x0, #160] +str q7, [x0, #176] +str q2, [x0, #640] +str q22, [x0, #656] +str q21, [x0, #672] +str q17, [x0, #688] +ldr q30, [x17, #+512] +ldr q27, [x17, #+528] +ldr q14, [x0, #192] +ldr q12, [x0, #208] +ldr q1, [x17, #+1536] +ldr q8, [x17, #+1552] +sqrdmulh v19.4S, v10.4S, v27.s[0] +ldr q16, [x0, #736] +mul v10.4S, v10.4S,v30.s[0] +ldr q17, [x0, #752] +sqrdmulh v21.4S, v5.4S, v27.s[0] +ldr q22, [x0, #704] +mul v5.4S, v5.4S,v30.s[0] +ldr q2, [x0, #720] +mla v10.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v16.4S, v8.s[0] 
+mul v16.4S, v16.4S,v1.s[0] +mla v5.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v10.4s +add v14.4s, v14.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v8.s[0] +mul v17.4S, v17.4S,v1.s[0] +mla v16.4S, v19.4S, v31.s[0] +sub v19.4s, v12.4s, v5.4s +add v12.4s, v12.4s, v5.4s +sqrdmulh v5.4S, v12.4S, v27.s[1] +mul v12.4S, v12.4S,v30.s[1] +mla v17.4S, v10.4S, v31.s[0] +sub v10.4s, v22.4s, v16.4s +add v22.4s, v22.4s, v16.4s +sqrdmulh v16.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v30.s[2] +mla v12.4S, v5.4S, v31.s[0] +sub v5.4s, v2.4s, v17.4s +add v2.4s, v2.4s, v17.4s +sqrdmulh v17.4S, v2.4S, v8.s[1] +mul v2.4S, v2.4S,v1.s[1] +mla v19.4S, v16.4S, v31.s[0] +sub v16.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v5.4S, v8.s[2] +mul v5.4S, v5.4S,v1.s[2] +mla v2.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v19.4s +add v21.4s, v21.4s, v19.4s +trn1 v19.4S, v14.4S, v16.4S +trn2 v28.4S, v14.4S, v16.4S +trn1 v29.4S, v21.4S, v17.4S +trn2 v9.4S, v21.4S, v17.4S +trn2 v21.2D, v19.2D, v29.2D +trn2 v17.2D, v28.2D, v9.2D +trn1 v14.2D, v19.2D, v29.2D +trn1 v16.2D, v28.2D, v9.2D +ldr q9, [x17, #+544] +ldr q28, [x17, #+560] +sqrdmulh v29.4S, v21.4S, v28.4S +mul v21.4S, v21.4S,v9.4S +mla v5.4S, v12.4S, v31.s[0] +sub v12.4s, v22.4s, v2.4s +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v17.4S, v28.4S +mul v17.4S, v17.4S,v9.4S +mla v21.4S, v29.4S, v31.s[0] +sub v29.4s, v10.4s, v5.4s +add v10.4s, v10.4s, v5.4s +trn1 v5.4S, v22.4S, v12.4S +trn2 v19.4S, v22.4S, v12.4S +trn1 v3.4S, v10.4S, v29.4S +trn2 v6.4S, v10.4S, v29.4S +trn2 v10.2D, v5.2D, v3.2D +trn2 v29.2D, v19.2D, v6.2D +trn1 v22.2D, v5.2D, v3.2D +trn1 v12.2D, v19.2D, v6.2D +ldr q6, [x17, #+1568] +ldr q19, [x17, #+1584] +sqrdmulh v3.4S, v10.4S, v19.4S +ldr q5, [x17, #+576] +ldr q20, [x17, #+592] +mul v10.4S, v10.4S,v6.4S +mla v17.4S, v2.4S, v31.s[0] +sub v2.4s, v14.4s, v21.4s +add v14.4s, v14.4s, v21.4s +sqrdmulh v21.4S, v29.4S, v19.4S +ldr q4, [x17, #+608] +ldr q15, [x17, #+624] +mul v29.4S, v29.4S,v6.4S +mla v10.4S, v3.4S, v31.s[0] +sub 
v3.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v16.4S, v20.4S +ldr q7, [x17, #+1600] +ldr q13, [x17, #+1616] +mul v16.4S, v16.4S,v5.4S +mla v29.4S, v21.4S, v31.s[0] +sub v21.4s, v22.4s, v10.4s +add v22.4s, v22.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v15.4S +ldr q11, [x17, #+1632] +ldr q0, [x17, #+1648] +mul v3.4S, v3.4S,v4.4S +mla v16.4S, v17.4S, v31.s[0] +sub v17.4s, v12.4s, v29.4s +add v12.4s, v12.4s, v29.4s +sqrdmulh v29.4S, v12.4S, v13.4S +ldr q18, [x0, #288] +mul v12.4S, v12.4S,v7.4S +mla v3.4S, v10.4S, v31.s[0] +sub v10.4s, v14.4s, v16.4s +add v14.4s, v14.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v0.4S +ldr q26, [x0, #304] +mul v17.4S, v17.4S,v11.4S +mla v12.4S, v29.4S, v31.s[0] +sub v29.4s, v2.4s, v3.4s +add v2.4s, v2.4s, v3.4s +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sub v12.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +str q14, [x0, #192] +str q10, [x0, #208] +str q2, [x0, #224] +str q29, [x0, #240] +str q22, [x0, #704] +str q16, [x0, #720] +str q21, [x0, #736] +str q12, [x0, #752] +ldr q0, [x17, #+640] +ldr q11, [x17, #+656] +ldr q13, [x0, #256] +ldr q7, [x0, #272] +ldr q19, [x17, #+1664] +ldr q6, [x17, #+1680] +sqrdmulh v8.4S, v18.4S, v11.s[0] +ldr q1, [x0, #800] +mul v18.4S, v18.4S,v0.s[0] +ldr q12, [x0, #816] +sqrdmulh v21.4S, v26.4S, v11.s[0] +ldr q16, [x0, #768] +mul v26.4S, v26.4S,v0.s[0] +ldr q22, [x0, #784] +mla v18.4S, v8.4S, v31.s[0] +sqrdmulh v8.4S, v1.4S, v6.s[0] +mul v1.4S, v1.4S,v19.s[0] +mla v26.4S, v21.4S, v31.s[0] +sub v21.4s, v13.4s, v18.4s +add v13.4s, v13.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v6.s[0] +mul v12.4S, v12.4S,v19.s[0] +mla v1.4S, v8.4S, v31.s[0] +sub v8.4s, v7.4s, v26.4s +add v7.4s, v7.4s, v26.4s +sqrdmulh v26.4S, v7.4S, v11.s[1] +mul v7.4S, v7.4S,v0.s[1] +mla v12.4S, v18.4S, v31.s[0] +sub v18.4s, v16.4s, v1.4s +add v16.4s, v16.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v11.s[2] +mul v8.4S, v8.4S,v0.s[2] +mla v7.4S, v26.4S, v31.s[0] +sub v26.4s, v22.4s, v12.4s +add v22.4s, 
v22.4s, v12.4s +sqrdmulh v12.4S, v22.4S, v6.s[1] +mul v22.4S, v22.4S,v19.s[1] +mla v8.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +sqrdmulh v7.4S, v26.4S, v6.s[2] +mul v26.4S, v26.4S,v19.s[2] +mla v22.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v8.4s +add v21.4s, v21.4s, v8.4s +trn1 v8.4S, v13.4S, v1.4S +trn2 v15.4S, v13.4S, v1.4S +trn1 v4.4S, v21.4S, v12.4S +trn2 v20.4S, v21.4S, v12.4S +trn2 v21.2D, v8.2D, v4.2D +trn2 v12.2D, v15.2D, v20.2D +trn1 v13.2D, v8.2D, v4.2D +trn1 v1.2D, v15.2D, v20.2D +ldr q20, [x17, #+672] +ldr q15, [x17, #+688] +sqrdmulh v4.4S, v21.4S, v15.4S +mul v21.4S, v21.4S,v20.4S +mla v26.4S, v7.4S, v31.s[0] +sub v7.4s, v16.4s, v22.4s +add v16.4s, v16.4s, v22.4s +sqrdmulh v22.4S, v12.4S, v15.4S +mul v12.4S, v12.4S,v20.4S +mla v21.4S, v4.4S, v31.s[0] +sub v4.4s, v18.4s, v26.4s +add v18.4s, v18.4s, v26.4s +trn1 v26.4S, v16.4S, v7.4S +trn2 v8.4S, v16.4S, v7.4S +trn1 v5.4S, v18.4S, v4.4S +trn2 v28.4S, v18.4S, v4.4S +trn2 v18.2D, v26.2D, v5.2D +trn2 v4.2D, v8.2D, v28.2D +trn1 v16.2D, v26.2D, v5.2D +trn1 v7.2D, v8.2D, v28.2D +ldr q28, [x17, #+1696] +ldr q8, [x17, #+1712] +sqrdmulh v5.4S, v18.4S, v8.4S +ldr q26, [x17, #+704] +ldr q9, [x17, #+720] +mul v18.4S, v18.4S,v28.4S +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v4.4S, v8.4S +ldr q27, [x17, #+736] +ldr q30, [x17, #+752] +mul v4.4S, v4.4S,v28.4S +mla v18.4S, v5.4S, v31.s[0] +sub v5.4s, v1.4s, v12.4s +add v1.4s, v1.4s, v12.4s +sqrdmulh v12.4S, v1.4S, v9.4S +ldr q29, [x17, #+1728] +ldr q2, [x17, #+1744] +mul v1.4S, v1.4S,v26.4S +mla v4.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v18.4s +add v16.4s, v16.4s, v18.4s +sqrdmulh v18.4S, v5.4S, v30.4S +ldr q10, [x17, #+1760] +ldr q14, [x17, #+1776] +mul v5.4S, v5.4S,v27.4S +mla v1.4S, v12.4S, v31.s[0] +sub v12.4s, v7.4s, v4.4s +add v7.4s, v7.4s, v4.4s +sqrdmulh v4.4S, v7.4S, v2.4S +ldr q17, [x0, #352] +mul v7.4S, v7.4S,v29.4S +mla v5.4S, v18.4S, v31.s[0] +sub v18.4s, v13.4s, 
v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v14.4S +ldr q3, [x0, #368] +mul v12.4S, v12.4S,v10.4S +mla v7.4S, v4.4S, v31.s[0] +sub v4.4s, v22.4s, v5.4s +add v22.4s, v22.4s, v5.4s +mla v12.4S, v1.4S, v31.s[0] +sub v1.4s, v16.4s, v7.4s +add v16.4s, v16.4s, v7.4s +sub v7.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +str q13, [x0, #256] +str q18, [x0, #272] +str q22, [x0, #288] +str q4, [x0, #304] +str q16, [x0, #768] +str q1, [x0, #784] +str q21, [x0, #800] +str q7, [x0, #816] +ldr q14, [x17, #+768] +ldr q10, [x17, #+784] +ldr q2, [x0, #320] +ldr q29, [x0, #336] +ldr q8, [x17, #+1792] +ldr q28, [x17, #+1808] +sqrdmulh v6.4S, v17.4S, v10.s[0] +ldr q19, [x0, #864] +mul v17.4S, v17.4S,v14.s[0] +ldr q7, [x0, #880] +sqrdmulh v21.4S, v3.4S, v10.s[0] +ldr q1, [x0, #832] +mul v3.4S, v3.4S,v14.s[0] +ldr q16, [x0, #848] +mla v17.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v19.4S, v28.s[0] +mul v19.4S, v19.4S,v8.s[0] +mla v3.4S, v21.4S, v31.s[0] +sub v21.4s, v2.4s, v17.4s +add v2.4s, v2.4s, v17.4s +sqrdmulh v17.4S, v7.4S, v28.s[0] +mul v7.4S, v7.4S,v8.s[0] +mla v19.4S, v6.4S, v31.s[0] +sub v6.4s, v29.4s, v3.4s +add v29.4s, v29.4s, v3.4s +sqrdmulh v3.4S, v29.4S, v10.s[1] +mul v29.4S, v29.4S,v14.s[1] +mla v7.4S, v17.4S, v31.s[0] +sub v17.4s, v1.4s, v19.4s +add v1.4s, v1.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v10.s[2] +mul v6.4S, v6.4S,v14.s[2] +mla v29.4S, v3.4S, v31.s[0] +sub v3.4s, v16.4s, v7.4s +add v16.4s, v16.4s, v7.4s +sqrdmulh v7.4S, v16.4S, v28.s[1] +mul v16.4S, v16.4S,v8.s[1] +mla v6.4S, v19.4S, v31.s[0] +sub v19.4s, v2.4s, v29.4s +add v2.4s, v2.4s, v29.4s +sqrdmulh v29.4S, v3.4S, v28.s[2] +mul v3.4S, v3.4S,v8.s[2] +mla v16.4S, v7.4S, v31.s[0] +sub v7.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +trn1 v6.4S, v2.4S, v19.4S +trn2 v30.4S, v2.4S, v19.4S +trn1 v27.4S, v21.4S, v7.4S +trn2 v9.4S, v21.4S, v7.4S +trn2 v21.2D, v6.2D, v27.2D +trn2 v7.2D, v30.2D, v9.2D +trn1 v2.2D, v6.2D, v27.2D +trn1 v19.2D, v30.2D, v9.2D +ldr q9, [x17, #+800] +ldr q30, [x17, #+816] +sqrdmulh 
v27.4S, v21.4S, v30.4S +mul v21.4S, v21.4S,v9.4S +mla v3.4S, v29.4S, v31.s[0] +sub v29.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +sqrdmulh v16.4S, v7.4S, v30.4S +mul v7.4S, v7.4S,v9.4S +mla v21.4S, v27.4S, v31.s[0] +sub v27.4s, v17.4s, v3.4s +add v17.4s, v17.4s, v3.4s +trn1 v3.4S, v1.4S, v29.4S +trn2 v6.4S, v1.4S, v29.4S +trn1 v26.4S, v17.4S, v27.4S +trn2 v15.4S, v17.4S, v27.4S +trn2 v17.2D, v3.2D, v26.2D +trn2 v27.2D, v6.2D, v15.2D +trn1 v1.2D, v3.2D, v26.2D +trn1 v29.2D, v6.2D, v15.2D +ldr q15, [x17, #+1824] +ldr q6, [x17, #+1840] +sqrdmulh v26.4S, v17.4S, v6.4S +ldr q3, [x17, #+832] +ldr q20, [x17, #+848] +mul v17.4S, v17.4S,v15.4S +mla v7.4S, v16.4S, v31.s[0] +sub v16.4s, v2.4s, v21.4s +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v27.4S, v6.4S +ldr q11, [x17, #+864] +ldr q0, [x17, #+880] +mul v27.4S, v27.4S,v15.4S +mla v17.4S, v26.4S, v31.s[0] +sub v26.4s, v19.4s, v7.4s +add v19.4s, v19.4s, v7.4s +sqrdmulh v7.4S, v19.4S, v20.4S +ldr q4, [x17, #+1856] +ldr q22, [x17, #+1872] +mul v19.4S, v19.4S,v3.4S +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v1.4s, v17.4s +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v26.4S, v0.4S +ldr q18, [x17, #+1888] +ldr q13, [x17, #+1904] +mul v26.4S, v26.4S,v11.4S +mla v19.4S, v7.4S, v31.s[0] +sub v7.4s, v29.4s, v27.4s +add v29.4s, v29.4s, v27.4s +sqrdmulh v27.4S, v29.4S, v22.4S +ldr q12, [x0, #416] +mul v29.4S, v29.4S,v4.4S +mla v26.4S, v17.4S, v31.s[0] +sub v17.4s, v2.4s, v19.4s +add v2.4s, v2.4s, v19.4s +sqrdmulh v19.4S, v7.4S, v13.4S +ldr q5, [x0, #432] +mul v7.4S, v7.4S,v18.4S +mla v29.4S, v27.4S, v31.s[0] +sub v27.4s, v16.4s, v26.4s +add v16.4s, v16.4s, v26.4s +mla v7.4S, v19.4S, v31.s[0] +sub v19.4s, v1.4s, v29.4s +add v1.4s, v1.4s, v29.4s +sub v29.4s, v21.4s, v7.4s +add v21.4s, v21.4s, v7.4s +str q2, [x0, #320] +str q17, [x0, #336] +str q16, [x0, #352] +str q27, [x0, #368] +str q1, [x0, #832] +str q19, [x0, #848] +str q21, [x0, #864] +str q29, [x0, #880] +ldr q13, [x17, #+896] +ldr q18, [x17, #+912] +ldr q22, [x0, #384] +ldr q4, 
[x0, #400] +ldr q6, [x17, #+1920] +ldr q15, [x17, #+1936] +sqrdmulh v28.4S, v12.4S, v18.s[0] +ldr q8, [x0, #928] +mul v12.4S, v12.4S,v13.s[0] +ldr q29, [x0, #944] +sqrdmulh v21.4S, v5.4S, v18.s[0] +ldr q19, [x0, #896] +mul v5.4S, v5.4S,v13.s[0] +ldr q1, [x0, #912] +mla v12.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v8.4S, v15.s[0] +mul v8.4S, v8.4S,v6.s[0] +mla v5.4S, v21.4S, v31.s[0] +sub v21.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v29.4S, v15.s[0] +mul v29.4S, v29.4S,v6.s[0] +mla v8.4S, v28.4S, v31.s[0] +sub v28.4s, v4.4s, v5.4s +add v4.4s, v4.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v18.s[1] +mul v4.4S, v4.4S,v13.s[1] +mla v29.4S, v12.4S, v31.s[0] +sub v12.4s, v19.4s, v8.4s +add v19.4s, v19.4s, v8.4s +sqrdmulh v8.4S, v28.4S, v18.s[2] +mul v28.4S, v28.4S,v13.s[2] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v1.4s, v29.4s +add v1.4s, v1.4s, v29.4s +sqrdmulh v29.4S, v1.4S, v15.s[1] +mul v1.4S, v1.4S,v6.s[1] +mla v28.4S, v8.4S, v31.s[0] +sub v8.4s, v22.4s, v4.4s +add v22.4s, v22.4s, v4.4s +sqrdmulh v4.4S, v5.4S, v15.s[2] +mul v5.4S, v5.4S,v6.s[2] +mla v1.4S, v29.4S, v31.s[0] +sub v29.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +trn1 v28.4S, v22.4S, v8.4S +trn2 v0.4S, v22.4S, v8.4S +trn1 v11.4S, v21.4S, v29.4S +trn2 v20.4S, v21.4S, v29.4S +trn2 v21.2D, v28.2D, v11.2D +trn2 v29.2D, v0.2D, v20.2D +trn1 v22.2D, v28.2D, v11.2D +trn1 v8.2D, v0.2D, v20.2D +ldr q20, [x17, #+928] +ldr q0, [x17, #+944] +sqrdmulh v11.4S, v21.4S, v0.4S +mul v21.4S, v21.4S,v20.4S +mla v5.4S, v4.4S, v31.s[0] +sub v4.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v29.4S, v0.4S +mul v29.4S, v29.4S,v20.4S +mla v21.4S, v11.4S, v31.s[0] +sub v11.4s, v12.4s, v5.4s +add v12.4s, v12.4s, v5.4s +trn1 v5.4S, v19.4S, v4.4S +trn2 v28.4S, v19.4S, v4.4S +trn1 v3.4S, v12.4S, v11.4S +trn2 v30.4S, v12.4S, v11.4S +trn2 v12.2D, v5.2D, v3.2D +trn2 v11.2D, v28.2D, v30.2D +trn1 v19.2D, v5.2D, v3.2D +trn1 v4.2D, v28.2D, v30.2D +ldr q30, [x17, #+1952] +ldr q28, [x17, #+1968] +sqrdmulh v3.4S, 
v12.4S, v28.4S +ldr q5, [x17, #+960] +ldr q9, [x17, #+976] +mul v12.4S, v12.4S,v30.4S +mla v29.4S, v1.4S, v31.s[0] +sub v1.4s, v22.4s, v21.4s +add v22.4s, v22.4s, v21.4s +sqrdmulh v21.4S, v11.4S, v28.4S +ldr q10, [x17, #+992] +ldr q14, [x17, #+1008] +mul v11.4S, v11.4S,v30.4S +mla v12.4S, v3.4S, v31.s[0] +sub v3.4s, v8.4s, v29.4s +add v8.4s, v8.4s, v29.4s +sqrdmulh v29.4S, v8.4S, v9.4S +ldr q27, [x17, #+1984] +ldr q16, [x17, #+2000] +mul v8.4S, v8.4S,v5.4S +mla v11.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v12.4s +add v19.4s, v19.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v14.4S +ldr q17, [x17, #+2016] +ldr q2, [x17, #+2032] +mul v3.4S, v3.4S,v10.4S +mla v8.4S, v29.4S, v31.s[0] +sub v29.4s, v4.4s, v11.4s +add v4.4s, v4.4s, v11.4s +sqrdmulh v11.4S, v4.4S, v16.4S +ldr q7, [x0, #480] +mul v4.4S, v4.4S,v27.4S +mla v3.4S, v12.4S, v31.s[0] +sub v12.4s, v22.4s, v8.4s +add v22.4s, v22.4s, v8.4s +sqrdmulh v8.4S, v29.4S, v2.4S +ldr q26, [x0, #496] +mul v29.4S, v29.4S,v17.4S +mla v4.4S, v11.4S, v31.s[0] +sub v11.4s, v1.4s, v3.4s +add v1.4s, v1.4s, v3.4s +mla v29.4S, v8.4S, v31.s[0] +sub v8.4s, v19.4s, v4.4s +add v19.4s, v19.4s, v4.4s +sub v4.4s, v21.4s, v29.4s +add v21.4s, v21.4s, v29.4s +str q22, [x0, #384] +str q12, [x0, #400] +str q1, [x0, #416] +str q11, [x0, #432] +str q19, [x0, #896] +str q8, [x0, #912] +str q21, [x0, #928] +str q4, [x0, #944] +ldr q2, [x17, #+1024] +ldr q17, [x17, #+1040] +ldr q16, [x0, #448] +ldr q27, [x0, #464] +ldr q28, [x17, #+2048] +ldr q30, [x17, #+2064] +sqrdmulh v15.4S, v7.4S, v17.s[0] +ldr q6, [x0, #992] +mul v7.4S, v7.4S,v2.s[0] +ldr q4, [x0, #1008] +sqrdmulh v21.4S, v26.4S, v17.s[0] +ldr q8, [x0, #960] +mul v26.4S, v26.4S,v2.s[0] +ldr q19, [x0, #976] +mla v7.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v6.4S, v30.s[0] +mul v6.4S, v6.4S,v28.s[0] +mla v26.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v7.4s +add v16.4s, v16.4s, v7.4s +sqrdmulh v7.4S, v4.4S, v30.s[0] +mul v4.4S, v4.4S,v28.s[0] +mla v6.4S, v15.4S, v31.s[0] +sub v15.4s, v27.4s, v26.4s +add v27.4s, 
v27.4s, v26.4s +sqrdmulh v26.4S, v27.4S, v17.s[1] +mul v27.4S, v27.4S,v2.s[1] +mla v4.4S, v7.4S, v31.s[0] +sub v7.4s, v8.4s, v6.4s +add v8.4s, v8.4s, v6.4s +sqrdmulh v6.4S, v15.4S, v17.s[2] +mul v15.4S, v15.4S,v2.s[2] +mla v27.4S, v26.4S, v31.s[0] +sub v26.4s, v19.4s, v4.4s +add v19.4s, v19.4s, v4.4s +sqrdmulh v4.4S, v19.4S, v30.s[1] +mul v19.4S, v19.4S,v28.s[1] +mla v15.4S, v6.4S, v31.s[0] +sub v6.4s, v16.4s, v27.4s +add v16.4s, v16.4s, v27.4s +sqrdmulh v27.4S, v26.4S, v30.s[2] +mul v26.4S, v26.4S,v28.s[2] +mla v19.4S, v4.4S, v31.s[0] +sub v4.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +trn1 v15.4S, v16.4S, v6.4S +trn2 v14.4S, v16.4S, v6.4S +trn1 v10.4S, v21.4S, v4.4S +trn2 v9.4S, v21.4S, v4.4S +trn2 v21.2D, v15.2D, v10.2D +trn2 v4.2D, v14.2D, v9.2D +trn1 v16.2D, v15.2D, v10.2D +trn1 v6.2D, v14.2D, v9.2D +ldr q9, [x17, #+1056] +ldr q14, [x17, #+1072] +sqrdmulh v10.4S, v21.4S, v14.4S +mul v21.4S, v21.4S,v9.4S +mla v26.4S, v27.4S, v31.s[0] +sub v27.4s, v8.4s, v19.4s +add v8.4s, v8.4s, v19.4s +sqrdmulh v19.4S, v4.4S, v14.4S +mul v4.4S, v4.4S,v9.4S +mla v21.4S, v10.4S, v31.s[0] +sub v10.4s, v7.4s, v26.4s +add v7.4s, v7.4s, v26.4s +trn1 v26.4S, v8.4S, v27.4S +trn2 v15.4S, v8.4S, v27.4S +trn1 v5.4S, v7.4S, v10.4S +trn2 v0.4S, v7.4S, v10.4S +trn2 v7.2D, v26.2D, v5.2D +trn2 v10.2D, v15.2D, v0.2D +trn1 v8.2D, v26.2D, v5.2D +trn1 v27.2D, v15.2D, v0.2D +ldr q0, [x17, #+2080] +ldr q15, [x17, #+2096] +sqrdmulh v5.4S, v7.4S, v15.4S +ldr q26, [x17, #+1088] +ldr q20, [x17, #+1104] +mul v7.4S, v7.4S,v0.4S +mla v4.4S, v19.4S, v31.s[0] +sub v19.4s, v16.4s, v21.4s +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v10.4S, v15.4S +ldr q18, [x17, #+1120] +ldr q13, [x17, #+1136] +mul v10.4S, v10.4S,v0.4S +mla v7.4S, v5.4S, v31.s[0] +sub v5.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +sqrdmulh v4.4S, v6.4S, v20.4S +ldr q11, [x17, #+2112] +ldr q1, [x17, #+2128] +mul v6.4S, v6.4S,v26.4S +mla v10.4S, v21.4S, v31.s[0] +sub v21.4s, v8.4s, v7.4s +add v8.4s, v8.4s, v7.4s +sqrdmulh v7.4S, v5.4S, 
v13.4S +ldr q12, [x17, #+2144] +ldr q22, [x17, #+2160] +mul v5.4S, v5.4S,v18.4S +mla v6.4S, v4.4S, v31.s[0] +sub v4.4s, v27.4s, v10.4s +add v27.4s, v27.4s, v10.4s +sqrdmulh v10.4S, v27.4S, v1.4S +mul v27.4S, v27.4S,v11.4S +mla v5.4S, v7.4S, v31.s[0] +sub v7.4s, v16.4s, v6.4s +add v16.4s, v16.4s, v6.4s +sqrdmulh v6.4S, v4.4S, v22.4S +mul v4.4S, v4.4S,v12.4S +mla v27.4S, v10.4S, v31.s[0] +sub v10.4s, v19.4s, v5.4s +add v19.4s, v19.4s, v5.4s +mla v4.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v27.4s +add v8.4s, v8.4s, v27.4s +sub v27.4s, v21.4s, v4.4s +add v21.4s, v21.4s, v4.4s +str q16, [x0, #448] +str q7, [x0, #464] +str q19, [x0, #480] +str q10, [x0, #496] +str q8, [x0, #960] +str q6, [x0, #976] +str q21, [x0, #992] +str q27, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z4_0.s b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z4_0.s new file mode 100644 index 0000000..7675702 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z4_0.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject 
to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 
1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 
+.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 
15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 
1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, 
block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 
+.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 
// Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 +.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 
+.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 
313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 
// Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_3_z4_0 +.global _ntt_u32_full_neon_asm_var_4_4_3_z4_0 +ntt_u32_full_neon_asm_var_4_4_3_z4_0: +_ntt_u32_full_neon_asm_var_4_4_3_z4_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] 
+ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +sqrdmulh v2.4S, v22.4S, v29.s[0] +ldr q1, [x0, #544] +mul v22.4S, v22.4S,v30.s[0] +ldr q0, [x0, #608] +sqrdmulh v15.4S, v21.4S, v29.s[0] +ldr q14, [x0, #672] +mul v21.4S, v21.4S,v30.s[0] +ldr q13, [x0, #736] +mla v22.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q12, [x0, #32] +sub v11.4s, v18.4s, v22.4s +mla v21.4S, v15.4S, v31.s[0] +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q15, [x0, #96] +sub v10.4s, v17.4s, v21.4s +mla v20.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v29.s[0] +ldr q2, [x0, #160] +mul v1.4S, v1.4S,v30.s[0] +sub v9.4s, v16.4s, v20.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v29.s[0] +ldr q22, [x0, #224] +mul v0.4S, v0.4S,v30.s[0] +sub v8.4s, v3.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v12.4s, v1.4s +mla v0.4S, v20.4S, v31.s[0] +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +sub v20.4s, v15.4s, v0.4s +mla v14.4S, v19.4S, v31.s[0] +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v19.4s, v2.4s, v14.4s +mla v13.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v1.4s, v22.4s, v13.4s +mla v16.4S, v0.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v0.4s, v2.4s, v16.4s +mla v3.4S, v14.4S, v31.s[0] +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v22.4s, v3.4s +mla v18.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, 
v9.4S,v30.s[2] +sub v13.4s, v12.4s, v18.4s +mla v17.4S, v16.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v16.4s, v15.4s, v17.4s +mla v9.4S, v3.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v3.4s, v19.4s, v9.4s +mla v8.4S, v18.4S, v31.s[0] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v8.4s +mla v11.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v17.4s, v21.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +sub v9.4s, v20.4s, v10.4s +mla v2.4S, v8.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v8.4s, v12.4s, v2.4s +mla v22.4S, v11.4S, v31.s[0] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v27.s[1] +mul v14.4S, v14.4S,v28.s[1] +sub v11.4s, v15.4s, v22.4s +mla v0.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v28.s[2] +sub v10.4s, v13.4s, v0.4s +mla v14.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v2.4s, v16.4s, v14.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +sub v22.4s, v21.4s, v19.4s +mla v1.4S, v0.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v0.4s, v20.4s, v1.4s +mla v3.4S, v14.4S, v31.s[0] +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +sub v14.4s, v17.4s, v3.4s +mla v18.4S, v19.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v25.s[1] +mul v11.4S, v11.4S,v26.s[1] +sub v19.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s 
+sqrdmulh v18.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v1.4s, v12.4s, v15.4s +mla v11.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v25.s[3] +mul v2.4S, v2.4S,v26.s[3] +sub v3.4s, v8.4s, v11.4s +mla v16.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v11.4s +str q12, [x0, #32] +sqrdmulh v12.4S, v20.4S, v23.s[0] +str q1, [x0, #96] +mul v20.4S, v20.4S,v24.s[0] +ldr q1, [x0, #816] +sub v11.4s, v13.4s, v16.4s +ldr q18, [x0, #880] +mla v2.4S, v15.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +str q8, [x0, #160] +sqrdmulh v8.4S, v0.4S, v23.s[1] +str q3, [x0, #224] +mul v0.4S, v0.4S,v24.s[1] +ldr q3, [x0, #944] +sub v16.4s, v10.4s, v2.4s +ldr q15, [x0, #1008] +mla v20.4S, v12.4S, v31.s[0] +add v10.4s, v10.4s, v2.4s +str q13, [x0, #288] +sqrdmulh v13.4S, v9.4S, v23.s[2] +str q11, [x0, #352] +mul v9.4S, v9.4S,v24.s[2] +ldr q11, [x0, #304] +sub v2.4s, v21.4s, v20.4s +ldr q12, [x0, #368] +mla v0.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v20.4s +str q10, [x0, #416] +sqrdmulh v10.4S, v19.4S, v23.s[3] +str q16, [x0, #480] +mul v19.4S, v19.4S,v24.s[3] +ldr q16, [x0, #432] +sub v20.4s, v22.4s, v0.4s +ldr q8, [x0, #496] +mla v9.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +str q21, [x0, #544] +sqrdmulh v21.4S, v1.4S, v29.s[0] +str q2, [x0, #608] +ldr q2, [x0, #560] +mul v1.4S, v1.4S,v30.s[0] +ldr q0, [x0, #624] +sub v13.4s, v17.4s, v9.4s +mla v19.4S, v10.4S, v31.s[0] +add v17.4s, v17.4s, v9.4s +str q22, [x0, #672] +sqrdmulh v22.4S, v18.4S, v29.s[0] +str q20, [x0, #736] +ldr q20, [x0, #688] +mul v18.4S, v18.4S,v30.s[0] +ldr q9, [x0, #752] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +str q17, [x0, #800] +sqrdmulh v17.4S, v3.4S, v29.s[0] +str q13, [x0, #864] +mul v3.4S, v3.4S,v30.s[0] +ldr q13, [x0, #48] +sub v19.4s, v11.4s, v1.4s +mla v18.4S, v22.4S, v31.s[0] +add v11.4s, v11.4s, v1.4s +str q14, [x0, #928] +sqrdmulh v14.4S, v15.4S, v29.s[0] +str q10, [x0, #992] +mul v15.4S, v15.4S,v30.s[0] +ldr q10, [x0, #112] 
+sub v1.4s, v12.4s, v18.4s +mla v3.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v29.s[0] +ldr q17, [x0, #176] +mul v2.4S, v2.4S,v30.s[0] +sub v22.4s, v16.4s, v3.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v29.s[0] +ldr q14, [x0, #240] +mul v0.4S, v0.4S,v30.s[0] +sub v21.4s, v8.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v18.4s, v13.4s, v2.4s +mla v0.4S, v3.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +sub v3.4s, v10.4s, v0.4s +mla v20.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v15.4s, v17.4s, v20.4s +mla v9.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v2.4s, v14.4s, v9.4s +mla v16.4S, v0.4S, v31.s[0] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v29.s[1] +mul v11.4S, v11.4S,v30.s[1] +sub v0.4s, v17.4s, v16.4s +mla v8.4S, v20.4S, v31.s[0] +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v29.s[1] +mul v12.4S, v12.4S,v30.s[1] +sub v20.4s, v14.4s, v8.4s +mla v11.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v9.4s, v13.4s, v11.4s +mla v12.4S, v16.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v16.4s, v10.4s, v12.4s +mla v22.4S, v8.4S, v31.s[0] +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v8.4s, v15.4s, v22.4s +mla v21.4S, v11.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +sub v11.4s, v2.4s, v21.4s +mla v19.4S, v12.4S, v31.s[0] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +sub v12.4s, v18.4s, v19.4s +mla v1.4S, v22.4S, v31.s[0] 
+add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v22.4s, v3.4s, v1.4s +mla v17.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v21.4s, v13.4s, v17.4s +mla v14.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +sub v19.4s, v10.4s, v14.4s +mla v0.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v27.s[2] +mul v15.4S, v15.4S,v28.s[2] +sub v1.4s, v9.4s, v0.4s +mla v20.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v17.4s, v16.4s, v20.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v14.4s, v18.4s, v15.4s +mla v2.4S, v0.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[3] +mul v11.4S, v11.4S,v28.s[3] +sub v0.4s, v3.4s, v2.4s +mla v8.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +sub v20.4s, v12.4s, v8.4s +mla v11.4S, v15.4S, v31.s[0] +add v12.4s, v12.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v15.4s, v22.4s, v11.4s +mla v10.4S, v2.4S, v31.s[0] +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v2.4s, v13.4s, v10.4s +mla v19.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v25.s[3] +mul v17.4S, v17.4S,v26.s[3] +sub v8.4s, v21.4s, v19.4s +mla v16.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +str q13, [x0, #48] +sqrdmulh v13.4S, v3.4S, v23.s[0] +str q2, [x0, #112] +mul v3.4S, v3.4S,v24.s[0] +ldr q2, [x0, #768] +sub v19.4s, v9.4s, v16.4s +ldr q11, [x0, #832] +mla v17.4S, v10.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +str q21, [x0, #176] +sqrdmulh v21.4S, v0.4S, v23.s[1] +str q8, [x0, #240] +mul v0.4S, v0.4S,v24.s[1] +ldr q8, [x0, #896] +sub v16.4s, 
v1.4s, v17.4s +ldr q10, [x0, #960] +mla v3.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +str q9, [x0, #304] +sqrdmulh v9.4S, v22.4S, v23.s[2] +str q19, [x0, #368] +mul v22.4S, v22.4S,v24.s[2] +ldr q19, [x0, #256] +sub v17.4s, v18.4s, v3.4s +ldr q13, [x0, #320] +mla v0.4S, v21.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +str q1, [x0, #432] +sqrdmulh v1.4S, v15.4S, v23.s[3] +str q16, [x0, #496] +mul v15.4S, v15.4S,v24.s[3] +ldr q16, [x0, #384] +sub v3.4s, v14.4s, v0.4s +ldr q21, [x0, #448] +mla v22.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +str q18, [x0, #560] +sqrdmulh v18.4S, v2.4S, v29.s[0] +str q17, [x0, #624] +ldr q17, [x0, #512] +mul v2.4S, v2.4S,v30.s[0] +ldr q0, [x0, #576] +sub v9.4s, v12.4s, v22.4s +mla v15.4S, v1.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s +str q14, [x0, #688] +sqrdmulh v14.4S, v11.4S, v29.s[0] +str q3, [x0, #752] +ldr q3, [x0, #640] +mul v11.4S, v11.4S,v30.s[0] +ldr q22, [x0, #704] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +str q12, [x0, #816] +sqrdmulh v12.4S, v8.4S, v29.s[0] +str q9, [x0, #880] +mul v8.4S, v8.4S,v30.s[0] +ldr q9, [x0, #0] +sub v15.4s, v19.4s, v2.4s +mla v11.4S, v14.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +str q20, [x0, #944] +sqrdmulh v20.4S, v10.4S, v29.s[0] +str q1, [x0, #1008] +mul v10.4S, v10.4S,v30.s[0] +ldr q1, [x0, #64] +sub v2.4s, v13.4s, v11.4s +mla v8.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v29.s[0] +ldr q12, [x0, #128] +mul v17.4S, v17.4S,v30.s[0] +sub v14.4s, v16.4s, v8.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v29.s[0] +ldr q20, [x0, #192] +mul v0.4S, v0.4S,v30.s[0] +sub v18.4s, v21.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +sub v11.4s, v9.4s, v17.4s +mla v0.4S, v8.4S, v31.s[0] +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v8.4s, v1.4s, v0.4s +mla v3.4S, 
v10.4S, v31.s[0] +add v1.4s, v1.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v10.4s, v12.4s, v3.4s +mla v22.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v17.4s, v20.4s, v22.4s +mla v16.4S, v0.4S, v31.s[0] +add v20.4s, v20.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +sub v0.4s, v12.4s, v16.4s +mla v21.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v3.4s, v20.4s, v21.4s +mla v19.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v22.4s, v9.4s, v19.4s +mla v13.4S, v16.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v16.4s, v1.4s, v13.4s +mla v14.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v21.4s, v10.4s, v14.4s +mla v18.4S, v19.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v19.4s, v17.4s, v18.4s +mla v15.4S, v13.4S, v31.s[0] +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v13.4s, v11.4s, v15.4s +mla v2.4S, v14.4S, v31.s[0] +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v27.s[0] +mul v20.4S, v20.4S,v28.s[0] +sub v14.4s, v8.4s, v2.4s +mla v12.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v18.4s, v9.4s, v12.4s +mla v20.4S, v15.4S, v31.s[0] +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +sub v15.4s, v1.4s, v20.4s +mla v0.4S, v2.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +sub v2.4s, v22.4s, v0.4s +mla v3.4S, v12.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[2] +mul v17.4S, 
v17.4S,v28.s[2] +sub v12.4s, v16.4s, v3.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v11.4s, v10.4s +mla v17.4S, v0.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v27.s[3] +mul v19.4S, v19.4S,v28.s[3] +sub v0.4s, v8.4s, v17.4s +mla v21.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v17.4s +sqrdmulh v17.4S, v1.4S, v25.s[0] +mul v1.4S, v1.4S,v26.s[0] +sub v3.4s, v13.4s, v21.4s +mla v19.4S, v10.4S, v31.s[0] +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v25.s[1] +mul v15.4S, v15.4S,v26.s[1] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v17.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v17.4s, v9.4s, v1.4s +mla v15.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +sub v21.4s, v18.4s, v15.4s +mla v16.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +str q9, [x0, #0] +sqrdmulh v9.4S, v8.4S, v23.s[0] +str q17, [x0, #64] +mul v8.4S, v8.4S,v24.s[0] +ldr q17, [x0, #784] +sub v15.4s, v22.4s, v16.4s +ldr q19, [x0, #848] +mla v12.4S, v1.4S, v31.s[0] +add v22.4s, v22.4s, v16.4s +str q18, [x0, #128] +sqrdmulh v18.4S, v0.4S, v23.s[1] +str q21, [x0, #192] +mul v0.4S, v0.4S,v24.s[1] +ldr q21, [x0, #912] +sub v16.4s, v2.4s, v12.4s +ldr q1, [x0, #976] +mla v8.4S, v9.4S, v31.s[0] +add v2.4s, v2.4s, v12.4s +str q22, [x0, #256] +sqrdmulh v22.4S, v14.4S, v23.s[2] +str q15, [x0, #320] +mul v14.4S, v14.4S,v24.s[2] +ldr q15, [x0, #272] +sub v12.4s, v11.4s, v8.4s +ldr q9, [x0, #336] +mla v0.4S, v18.4S, v31.s[0] +add v11.4s, v11.4s, v8.4s +str q2, [x0, #384] +sqrdmulh v2.4S, v10.4S, v23.s[3] +str q16, [x0, #448] +mul v10.4S, v10.4S,v24.s[3] +ldr q16, [x0, #400] +sub v8.4s, v20.4s, v0.4s +ldr q18, [x0, #464] +mla v14.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v0.4s +str q11, [x0, #512] +sqrdmulh v11.4S, v17.4S, v29.s[0] +str q12, [x0, #576] +ldr q12, [x0, #528] +mul 
v17.4S, v17.4S,v30.s[0] +ldr q0, [x0, #592] +sub v22.4s, v13.4s, v14.4s +mla v10.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v14.4s +str q20, [x0, #640] +sqrdmulh v20.4S, v19.4S, v29.s[0] +str q8, [x0, #704] +ldr q8, [x0, #656] +mul v19.4S, v19.4S,v30.s[0] +ldr q14, [x0, #720] +sub v2.4s, v3.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v3.4s, v3.4s, v10.4s +str q13, [x0, #768] +sqrdmulh v13.4S, v21.4S, v29.s[0] +str q22, [x0, #832] +mul v21.4S, v21.4S,v30.s[0] +ldr q22, [x0, #16] +sub v10.4s, v15.4s, v17.4s +mla v19.4S, v20.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +str q3, [x0, #896] +sqrdmulh v3.4S, v1.4S, v29.s[0] +str q2, [x0, #960] +mul v1.4S, v1.4S,v30.s[0] +ldr q2, [x0, #80] +sub v17.4s, v9.4s, v19.4s +mla v21.4S, v13.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v12.4S, v29.s[0] +ldr q13, [x0, #144] +mul v12.4S, v12.4S,v30.s[0] +sub v20.4s, v16.4s, v21.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v29.s[0] +ldr q3, [x0, #208] +mul v0.4S, v0.4S,v30.s[0] +sub v11.4s, v18.4s, v1.4s +mla v12.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v19.4s, v22.4s, v12.4s +mla v0.4S, v21.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v2.4s, v0.4s +mla v8.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v1.4s, v13.4s, v8.4s +mla v14.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v12.4s, v3.4s, v14.4s +mla v16.4S, v0.4S, v31.s[0] +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +sub v0.4s, v13.4s, v16.4s +mla v18.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v9.4S, v29.s[1] +mul v9.4S, v9.4S,v30.s[1] +sub v8.4s, v3.4s, v18.4s +mla v15.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, 
v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v14.4s, v22.4s, v15.4s +mla v9.4S, v16.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v16.4s, v2.4s, v9.4s +mla v20.4S, v18.4S, v31.s[0] +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v20.4s +mla v11.4S, v15.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v15.4s, v12.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +sub v9.4s, v19.4s, v10.4s +mla v17.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v27.s[0] +mul v3.4S, v3.4S,v28.s[0] +sub v20.4s, v21.4s, v17.4s +mla v13.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v11.4s, v22.4s, v13.4s +mla v3.4S, v10.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v10.4s, v2.4s, v3.4s +mla v0.4S, v17.4S, v31.s[0] +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v17.4s, v14.4s, v0.4s +mla v8.4S, v13.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +sub v13.4s, v16.4s, v8.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v3.4s, v19.4s, v1.4s +mla v12.4S, v0.4S, v31.s[0] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v0.4s, v21.4s, v12.4s +mla v18.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v25.s[1] +mul v10.4S, v10.4S,v26.s[1] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v12.4S, v31.s[0] 
+add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v12.4s, v22.4s, v2.4s +mla v10.4S, v18.4S, v31.s[0] +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v13.4S, v25.s[3] +mul v13.4S, v13.4S,v26.s[3] +sub v18.4s, v11.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +str q22, [x0, #16] +sqrdmulh v22.4S, v21.4S, v23.s[0] +str q12, [x0, #80] +mul v21.4S, v21.4S,v24.s[0] +sub v12.4s, v14.4s, v16.4s +mla v13.4S, v2.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +str q11, [x0, #144] +sqrdmulh v11.4S, v0.4S, v23.s[1] +str q18, [x0, #208] +mul v0.4S, v0.4S,v24.s[1] +sub v18.4s, v17.4s, v13.4s +mla v21.4S, v22.4S, v31.s[0] +add v17.4s, v17.4s, v13.4s +str q14, [x0, #272] +sqrdmulh v14.4S, v20.4S, v23.s[2] +str q12, [x0, #336] +mul v20.4S, v20.4S,v24.s[2] +sub v12.4s, v19.4s, v21.4s +mla v0.4S, v11.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +str q17, [x0, #400] +sqrdmulh v17.4S, v1.4S, v23.s[3] +str q18, [x0, #464] +mul v1.4S, v1.4S,v24.s[3] +sub v18.4s, v3.4s, v0.4s +mla v20.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v0.4s +str q19, [x0, #528] +str q12, [x0, #592] +sub v12.4s, v9.4s, v20.4s +mla v1.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v20.4s +str q3, [x0, #656] +str q18, [x0, #720] +sub v18.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +str q9, [x0, #784] +str q12, [x0, #848] +str q8, [x0, #912] +str q18, [x0, #976] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q6, [x17, #+160] +ldr q7, [x17, #+176] +ldr q15, [x17, #+192] +ldr q10, [x17, #+208] +ldr q2, [x17, #+224] +ldr q16, [x17, #+240] +ldr q22, [x0, #32] +ldr q13, [x0, #48] +ldr q11, [x0, #0] +ldr q21, [x0, #16] +sqrdmulh v14.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla 
v21.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v5.s[2] +mul v22.4S, v22.4S,v4.s[2] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +trn1 v22.4S, v11.4S, v13.4S +trn2 v0.4S, v11.4S, v13.4S +trn1 v19.4S, v14.4S, v21.4S +trn2 v17.4S, v14.4S, v21.4S +trn2 v14.2D, v22.2D, v19.2D +trn2 v21.2D, v0.2D, v17.2D +trn1 v11.2D, v22.2D, v19.2D +trn1 v13.2D, v0.2D, v17.2D +sqrdmulh v17.4S, v14.4S, v7.4S +mul v14.4S, v14.4S,v6.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v10.4S +mul v13.4S, v13.4S,v15.4S +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v14.4S, v16.4S +mul v14.4S, v14.4S,v2.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +str q11, [x0, #0] +str q21, [x0, #16] +str q17, [x0, #32] +str q13, [x0, #48] +ldr q16, [x17, #+256] +ldr q2, [x17, #+272] +ldr q10, [x17, #+288] +ldr q15, [x17, #+304] +ldr q7, [x17, #+320] +ldr q6, [x17, #+336] +ldr q5, [x17, #+352] +ldr q4, [x17, #+368] +ldr q13, [x0, #96] +ldr q17, [x0, #112] +ldr q21, [x0, #64] +ldr q11, [x0, #80] +sqrdmulh v14.4S, v13.4S, v2.s[0] +mul v13.4S, v13.4S,v16.s[0] +mla v13.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v2.s[0] +mul v17.4S, v17.4S,v16.s[0] +mla v17.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v16.s[1] +mla v11.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v2.s[2] +mul v13.4S, v13.4S,v16.s[2] +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +trn1 v13.4S, v21.4S, 
v17.4S +trn2 v0.4S, v21.4S, v17.4S +trn1 v19.4S, v14.4S, v11.4S +trn2 v22.4S, v14.4S, v11.4S +trn2 v14.2D, v13.2D, v19.2D +trn2 v11.2D, v0.2D, v22.2D +trn1 v21.2D, v13.2D, v19.2D +trn1 v17.2D, v0.2D, v22.2D +sqrdmulh v22.4S, v14.4S, v15.4S +mul v14.4S, v14.4S,v10.4S +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v15.4S +mul v11.4S, v11.4S,v10.4S +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v17.4s, v11.4s +add v17.4s, v17.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v6.4S +mul v17.4S, v17.4S,v7.4S +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v4.4S +mul v14.4S, v14.4S,v5.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v14.4s +add v22.4s, v22.4s, v14.4s +str q21, [x0, #64] +str q11, [x0, #80] +str q22, [x0, #96] +str q17, [x0, #112] +ldr q4, [x17, #+384] +ldr q5, [x17, #+400] +ldr q6, [x17, #+416] +ldr q7, [x17, #+432] +ldr q15, [x17, #+448] +ldr q10, [x17, #+464] +ldr q2, [x17, #+480] +ldr q16, [x17, #+496] +ldr q17, [x0, #160] +ldr q22, [x0, #176] +ldr q11, [x0, #128] +ldr q21, [x0, #144] +sqrdmulh v14.4S, v17.4S, v5.s[0] +mul v17.4S, v17.4S,v4.s[0] +mla v17.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v5.s[2] +mul v17.4S, v17.4S,v4.s[2] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v17.4s +add v14.4s, v14.4s, v17.4s +trn1 v17.4S, v11.4S, v22.4S +trn2 v0.4S, v11.4S, v22.4S +trn1 v19.4S, v14.4S, v21.4S +trn2 v13.4S, v14.4S, v21.4S +trn2 v14.2D, v17.2D, v19.2D +trn2 v21.2D, v0.2D, v13.2D +trn1 v11.2D, v17.2D, v19.2D +trn1 v22.2D, v0.2D, v13.2D +sqrdmulh v13.4S, v14.4S, v7.4S +mul v14.4S, 
v14.4S,v6.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v21.4s +add v22.4s, v22.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v10.4S +mul v22.4S, v22.4S,v15.4S +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v14.4S, v16.4S +mul v14.4S, v14.4S,v2.4S +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v13.4s, v14.4s +add v13.4s, v13.4s, v14.4s +str q11, [x0, #128] +str q21, [x0, #144] +str q13, [x0, #160] +str q22, [x0, #176] +ldr q16, [x17, #+512] +ldr q2, [x17, #+528] +ldr q10, [x17, #+544] +ldr q15, [x17, #+560] +ldr q7, [x17, #+576] +ldr q6, [x17, #+592] +ldr q5, [x17, #+608] +ldr q4, [x17, #+624] +ldr q22, [x0, #224] +ldr q13, [x0, #240] +ldr q21, [x0, #192] +ldr q11, [x0, #208] +sqrdmulh v14.4S, v22.4S, v2.s[0] +mul v22.4S, v22.4S,v16.s[0] +mla v22.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v2.s[0] +mul v13.4S, v13.4S,v16.s[0] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v16.s[1] +mla v11.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v2.s[2] +mul v22.4S, v22.4S,v16.s[2] +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +trn1 v22.4S, v21.4S, v13.4S +trn2 v0.4S, v21.4S, v13.4S +trn1 v19.4S, v14.4S, v11.4S +trn2 v17.4S, v14.4S, v11.4S +trn2 v14.2D, v22.2D, v19.2D +trn2 v11.2D, v0.2D, v17.2D +trn1 v21.2D, v22.2D, v19.2D +trn1 v13.2D, v0.2D, v17.2D +sqrdmulh v17.4S, v14.4S, v15.4S +mul v14.4S, v14.4S,v10.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v15.4S +mul v11.4S, v11.4S,v10.4S +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s 
+sqrdmulh v11.4S, v13.4S, v6.4S +mul v13.4S, v13.4S,v7.4S +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v14.4S, v4.4S +mul v14.4S, v14.4S,v5.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +str q21, [x0, #192] +str q11, [x0, #208] +str q17, [x0, #224] +str q13, [x0, #240] +ldr q4, [x17, #+640] +ldr q5, [x17, #+656] +ldr q6, [x17, #+672] +ldr q7, [x17, #+688] +ldr q15, [x17, #+704] +ldr q10, [x17, #+720] +ldr q2, [x17, #+736] +ldr q16, [x17, #+752] +ldr q13, [x0, #288] +ldr q17, [x0, #304] +ldr q11, [x0, #256] +ldr q21, [x0, #272] +sqrdmulh v14.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v5.s[0] +mul v17.4S, v17.4S,v4.s[0] +mla v17.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v5.s[2] +mul v13.4S, v13.4S,v4.s[2] +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +trn1 v13.4S, v11.4S, v17.4S +trn2 v0.4S, v11.4S, v17.4S +trn1 v19.4S, v14.4S, v21.4S +trn2 v22.4S, v14.4S, v21.4S +trn2 v14.2D, v13.2D, v19.2D +trn2 v21.2D, v0.2D, v22.2D +trn1 v11.2D, v13.2D, v19.2D +trn1 v17.2D, v0.2D, v22.2D +sqrdmulh v22.4S, v14.4S, v7.4S +mul v14.4S, v14.4S,v6.4S +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v10.4S +mul v17.4S, v17.4S,v15.4S +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v16.4S +mul v14.4S, v14.4S,v2.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, 
v14.4s +add v22.4s, v22.4s, v14.4s +str q11, [x0, #256] +str q21, [x0, #272] +str q22, [x0, #288] +str q17, [x0, #304] +ldr q16, [x17, #+768] +ldr q2, [x17, #+784] +ldr q10, [x17, #+800] +ldr q15, [x17, #+816] +ldr q7, [x17, #+832] +ldr q6, [x17, #+848] +ldr q5, [x17, #+864] +ldr q4, [x17, #+880] +ldr q17, [x0, #352] +ldr q22, [x0, #368] +ldr q21, [x0, #320] +ldr q11, [x0, #336] +sqrdmulh v14.4S, v17.4S, v2.s[0] +mul v17.4S, v17.4S,v16.s[0] +mla v17.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v2.s[0] +mul v22.4S, v22.4S,v16.s[0] +mla v22.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v16.s[1] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v2.s[2] +mul v17.4S, v17.4S,v16.s[2] +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v14.4s, v17.4s +add v14.4s, v14.4s, v17.4s +trn1 v17.4S, v21.4S, v22.4S +trn2 v0.4S, v21.4S, v22.4S +trn1 v19.4S, v14.4S, v11.4S +trn2 v13.4S, v14.4S, v11.4S +trn2 v14.2D, v17.2D, v19.2D +trn2 v11.2D, v0.2D, v13.2D +trn1 v21.2D, v17.2D, v19.2D +trn1 v22.2D, v0.2D, v13.2D +sqrdmulh v13.4S, v14.4S, v15.4S +mul v14.4S, v14.4S,v10.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v15.4S +mul v11.4S, v11.4S,v10.4S +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v6.4S +mul v22.4S, v22.4S,v7.4S +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v14.4S, v4.4S +mul v14.4S, v14.4S,v5.4S +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v13.4s, v14.4s +add v13.4s, v13.4s, v14.4s +str q21, [x0, #320] +str q11, [x0, #336] +str q13, [x0, #352] +str q22, [x0, #368] +ldr q4, [x17, #+896] +ldr q5, [x17, #+912] +ldr q6, [x17, #+928] +ldr q7, [x17, #+944] +ldr q15, [x17, #+960] +ldr q10, [x17, 
#+976] +ldr q2, [x17, #+992] +ldr q16, [x17, #+1008] +ldr q22, [x0, #416] +ldr q13, [x0, #432] +ldr q11, [x0, #384] +ldr q21, [x0, #400] +sqrdmulh v14.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v5.s[2] +mul v22.4S, v22.4S,v4.s[2] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +trn1 v22.4S, v11.4S, v13.4S +trn2 v0.4S, v11.4S, v13.4S +trn1 v19.4S, v14.4S, v21.4S +trn2 v17.4S, v14.4S, v21.4S +trn2 v14.2D, v22.2D, v19.2D +trn2 v21.2D, v0.2D, v17.2D +trn1 v11.2D, v22.2D, v19.2D +trn1 v13.2D, v0.2D, v17.2D +sqrdmulh v17.4S, v14.4S, v7.4S +mul v14.4S, v14.4S,v6.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v10.4S +mul v13.4S, v13.4S,v15.4S +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v14.4S, v16.4S +mul v14.4S, v14.4S,v2.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +str q11, [x0, #384] +str q21, [x0, #400] +str q17, [x0, #416] +str q13, [x0, #432] +ldr q16, [x17, #+1024] +ldr q2, [x17, #+1040] +ldr q10, [x17, #+1056] +ldr q15, [x17, #+1072] +ldr q7, [x17, #+1088] +ldr q6, [x17, #+1104] +ldr q5, [x17, #+1120] +ldr q4, [x17, #+1136] +ldr q13, [x0, #480] +ldr q17, [x0, #496] +ldr q21, [x0, #448] +ldr q11, [x0, #464] +sqrdmulh v14.4S, v13.4S, v2.s[0] +mul v13.4S, v13.4S,v16.s[0] +mla v13.4S, v14.4S, v31.s[0] +sub v14.4s, 
v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v2.s[0] +mul v17.4S, v17.4S,v16.s[0] +mla v17.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v16.s[1] +mla v11.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v2.s[2] +mul v13.4S, v13.4S,v16.s[2] +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +trn1 v13.4S, v21.4S, v17.4S +trn2 v0.4S, v21.4S, v17.4S +trn1 v19.4S, v14.4S, v11.4S +trn2 v22.4S, v14.4S, v11.4S +trn2 v14.2D, v13.2D, v19.2D +trn2 v11.2D, v0.2D, v22.2D +trn1 v21.2D, v13.2D, v19.2D +trn1 v17.2D, v0.2D, v22.2D +sqrdmulh v22.4S, v14.4S, v15.4S +mul v14.4S, v14.4S,v10.4S +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v15.4S +mul v11.4S, v11.4S,v10.4S +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v17.4s, v11.4s +add v17.4s, v17.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v6.4S +mul v17.4S, v17.4S,v7.4S +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v4.4S +mul v14.4S, v14.4S,v5.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v14.4s +add v22.4s, v22.4s, v14.4s +str q21, [x0, #448] +str q11, [x0, #464] +str q22, [x0, #480] +str q17, [x0, #496] +ldr q4, [x17, #+1152] +ldr q5, [x17, #+1168] +ldr q6, [x17, #+1184] +ldr q7, [x17, #+1200] +ldr q15, [x17, #+1216] +ldr q10, [x17, #+1232] +ldr q2, [x17, #+1248] +ldr q16, [x17, #+1264] +ldr q17, [x0, #544] +ldr q22, [x0, #560] +ldr q11, [x0, #512] +ldr q21, [x0, #528] +sqrdmulh v14.4S, v17.4S, v5.s[0] +mul v17.4S, v17.4S,v4.s[0] +mla v17.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v5.s[1] +mul v21.4S, 
v21.4S,v4.s[1] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v5.s[2] +mul v17.4S, v17.4S,v4.s[2] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v17.4s +add v14.4s, v14.4s, v17.4s +trn1 v17.4S, v11.4S, v22.4S +trn2 v0.4S, v11.4S, v22.4S +trn1 v19.4S, v14.4S, v21.4S +trn2 v13.4S, v14.4S, v21.4S +trn2 v14.2D, v17.2D, v19.2D +trn2 v21.2D, v0.2D, v13.2D +trn1 v11.2D, v17.2D, v19.2D +trn1 v22.2D, v0.2D, v13.2D +sqrdmulh v13.4S, v14.4S, v7.4S +mul v14.4S, v14.4S,v6.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v21.4s +add v22.4s, v22.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v10.4S +mul v22.4S, v22.4S,v15.4S +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v14.4S, v16.4S +mul v14.4S, v14.4S,v2.4S +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v13.4s, v14.4s +add v13.4s, v13.4s, v14.4s +str q11, [x0, #512] +str q21, [x0, #528] +str q13, [x0, #544] +str q22, [x0, #560] +ldr q16, [x17, #+1280] +ldr q2, [x17, #+1296] +ldr q10, [x17, #+1312] +ldr q15, [x17, #+1328] +ldr q7, [x17, #+1344] +ldr q6, [x17, #+1360] +ldr q5, [x17, #+1376] +ldr q4, [x17, #+1392] +ldr q22, [x0, #608] +ldr q13, [x0, #624] +ldr q21, [x0, #576] +ldr q11, [x0, #592] +sqrdmulh v14.4S, v22.4S, v2.s[0] +mul v22.4S, v22.4S,v16.s[0] +mla v22.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v2.s[0] +mul v13.4S, v13.4S,v16.s[0] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v16.s[1] +mla v11.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v2.s[2] +mul v22.4S, v22.4S,v16.s[2] +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v14.4s, v22.4s +add v14.4s, 
v14.4s, v22.4s +trn1 v22.4S, v21.4S, v13.4S +trn2 v0.4S, v21.4S, v13.4S +trn1 v19.4S, v14.4S, v11.4S +trn2 v17.4S, v14.4S, v11.4S +trn2 v14.2D, v22.2D, v19.2D +trn2 v11.2D, v0.2D, v17.2D +trn1 v21.2D, v22.2D, v19.2D +trn1 v13.2D, v0.2D, v17.2D +sqrdmulh v17.4S, v14.4S, v15.4S +mul v14.4S, v14.4S,v10.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v15.4S +mul v11.4S, v11.4S,v10.4S +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v6.4S +mul v13.4S, v13.4S,v7.4S +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v14.4S, v4.4S +mul v14.4S, v14.4S,v5.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +str q21, [x0, #576] +str q11, [x0, #592] +str q17, [x0, #608] +str q13, [x0, #624] +ldr q4, [x17, #+1408] +ldr q5, [x17, #+1424] +ldr q6, [x17, #+1440] +ldr q7, [x17, #+1456] +ldr q15, [x17, #+1472] +ldr q10, [x17, #+1488] +ldr q2, [x17, #+1504] +ldr q16, [x17, #+1520] +ldr q13, [x0, #672] +ldr q17, [x0, #688] +ldr q11, [x0, #640] +ldr q21, [x0, #656] +sqrdmulh v14.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v5.s[0] +mul v17.4S, v17.4S,v4.s[0] +mla v17.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v5.s[2] +mul v13.4S, v13.4S,v4.s[2] +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +trn1 v13.4S, v11.4S, v17.4S +trn2 v0.4S, v11.4S, v17.4S +trn1 v19.4S, v14.4S, v21.4S +trn2 v22.4S, v14.4S, v21.4S +trn2 v14.2D, v13.2D, v19.2D +trn2 v21.2D, v0.2D, v22.2D +trn1 v11.2D, v13.2D, v19.2D +trn1 v17.2D, v0.2D, v22.2D 
+sqrdmulh v22.4S, v14.4S, v7.4S +mul v14.4S, v14.4S,v6.4S +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v10.4S +mul v17.4S, v17.4S,v15.4S +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v16.4S +mul v14.4S, v14.4S,v2.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v14.4s +add v22.4s, v22.4s, v14.4s +str q11, [x0, #640] +str q21, [x0, #656] +str q22, [x0, #672] +str q17, [x0, #688] +ldr q16, [x17, #+1536] +ldr q2, [x17, #+1552] +ldr q10, [x17, #+1568] +ldr q15, [x17, #+1584] +ldr q7, [x17, #+1600] +ldr q6, [x17, #+1616] +ldr q5, [x17, #+1632] +ldr q4, [x17, #+1648] +ldr q17, [x0, #736] +ldr q22, [x0, #752] +ldr q21, [x0, #704] +ldr q11, [x0, #720] +sqrdmulh v14.4S, v17.4S, v2.s[0] +mul v17.4S, v17.4S,v16.s[0] +mla v17.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v2.s[0] +mul v22.4S, v22.4S,v16.s[0] +mla v22.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v16.s[1] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v2.s[2] +mul v17.4S, v17.4S,v16.s[2] +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v14.4s, v17.4s +add v14.4s, v14.4s, v17.4s +trn1 v17.4S, v21.4S, v22.4S +trn2 v0.4S, v21.4S, v22.4S +trn1 v19.4S, v14.4S, v11.4S +trn2 v13.4S, v14.4S, v11.4S +trn2 v14.2D, v17.2D, v19.2D +trn2 v11.2D, v0.2D, v13.2D +trn1 v21.2D, v17.2D, v19.2D +trn1 v22.2D, v0.2D, v13.2D +sqrdmulh v13.4S, v14.4S, v15.4S +mul v14.4S, v14.4S,v10.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v15.4S +mul v11.4S, v11.4S,v10.4S +mla v11.4S, v14.4S, v31.s[0] +sub 
v14.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v6.4S +mul v22.4S, v22.4S,v7.4S +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v14.4S, v4.4S +mul v14.4S, v14.4S,v5.4S +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v13.4s, v14.4s +add v13.4s, v13.4s, v14.4s +str q21, [x0, #704] +str q11, [x0, #720] +str q13, [x0, #736] +str q22, [x0, #752] +ldr q4, [x17, #+1664] +ldr q5, [x17, #+1680] +ldr q6, [x17, #+1696] +ldr q7, [x17, #+1712] +ldr q15, [x17, #+1728] +ldr q10, [x17, #+1744] +ldr q2, [x17, #+1760] +ldr q16, [x17, #+1776] +ldr q22, [x0, #800] +ldr q13, [x0, #816] +ldr q11, [x0, #768] +ldr q21, [x0, #784] +sqrdmulh v14.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v5.s[2] +mul v22.4S, v22.4S,v4.s[2] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +trn1 v22.4S, v11.4S, v13.4S +trn2 v0.4S, v11.4S, v13.4S +trn1 v19.4S, v14.4S, v21.4S +trn2 v17.4S, v14.4S, v21.4S +trn2 v14.2D, v22.2D, v19.2D +trn2 v21.2D, v0.2D, v17.2D +trn1 v11.2D, v22.2D, v19.2D +trn1 v13.2D, v0.2D, v17.2D +sqrdmulh v17.4S, v14.4S, v7.4S +mul v14.4S, v14.4S,v6.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v10.4S +mul v13.4S, v13.4S,v15.4S +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v14.4S, v16.4S +mul v14.4S, 
v14.4S,v2.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +str q11, [x0, #768] +str q21, [x0, #784] +str q17, [x0, #800] +str q13, [x0, #816] +ldr q16, [x17, #+1792] +ldr q2, [x17, #+1808] +ldr q10, [x17, #+1824] +ldr q15, [x17, #+1840] +ldr q7, [x17, #+1856] +ldr q6, [x17, #+1872] +ldr q5, [x17, #+1888] +ldr q4, [x17, #+1904] +ldr q13, [x0, #864] +ldr q17, [x0, #880] +ldr q21, [x0, #832] +ldr q11, [x0, #848] +sqrdmulh v14.4S, v13.4S, v2.s[0] +mul v13.4S, v13.4S,v16.s[0] +mla v13.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v2.s[0] +mul v17.4S, v17.4S,v16.s[0] +mla v17.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v16.s[1] +mla v11.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v2.s[2] +mul v13.4S, v13.4S,v16.s[2] +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +trn1 v13.4S, v21.4S, v17.4S +trn2 v0.4S, v21.4S, v17.4S +trn1 v19.4S, v14.4S, v11.4S +trn2 v22.4S, v14.4S, v11.4S +trn2 v14.2D, v13.2D, v19.2D +trn2 v11.2D, v0.2D, v22.2D +trn1 v21.2D, v13.2D, v19.2D +trn1 v17.2D, v0.2D, v22.2D +sqrdmulh v22.4S, v14.4S, v15.4S +mul v14.4S, v14.4S,v10.4S +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v15.4S +mul v11.4S, v11.4S,v10.4S +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v17.4s, v11.4s +add v17.4s, v17.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v6.4S +mul v17.4S, v17.4S,v7.4S +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v4.4S +mul v14.4S, v14.4S,v5.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v14.4s +add v22.4s, v22.4s, v14.4s +str q21, [x0, #832] +str q11, [x0, #848] +str q22, [x0, #864] +str q17, [x0, #880] +ldr q4, [x17, #+1920] +ldr q5, [x17, #+1936] +ldr q6, 
[x17, #+1952] +ldr q7, [x17, #+1968] +ldr q15, [x17, #+1984] +ldr q10, [x17, #+2000] +ldr q2, [x17, #+2016] +ldr q16, [x17, #+2032] +ldr q17, [x0, #928] +ldr q22, [x0, #944] +ldr q11, [x0, #896] +ldr q21, [x0, #912] +sqrdmulh v14.4S, v17.4S, v5.s[0] +mul v17.4S, v17.4S,v4.s[0] +mla v17.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v5.s[2] +mul v17.4S, v17.4S,v4.s[2] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v17.4s +add v14.4s, v14.4s, v17.4s +trn1 v17.4S, v11.4S, v22.4S +trn2 v0.4S, v11.4S, v22.4S +trn1 v19.4S, v14.4S, v21.4S +trn2 v13.4S, v14.4S, v21.4S +trn2 v14.2D, v17.2D, v19.2D +trn2 v21.2D, v0.2D, v13.2D +trn1 v11.2D, v17.2D, v19.2D +trn1 v22.2D, v0.2D, v13.2D +sqrdmulh v13.4S, v14.4S, v7.4S +mul v14.4S, v14.4S,v6.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v21.4s +add v22.4s, v22.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v10.4S +mul v22.4S, v22.4S,v15.4S +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v14.4S, v16.4S +mul v14.4S, v14.4S,v2.4S +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v13.4s, v14.4s +add v13.4s, v13.4s, v14.4s +str q11, [x0, #896] +str q21, [x0, #912] +str q13, [x0, #928] +str q22, [x0, #944] +ldr q16, [x17, #+2048] +ldr q2, [x17, #+2064] +ldr q10, [x17, #+2080] +ldr q15, [x17, #+2096] +ldr q7, [x17, #+2112] +ldr q6, [x17, #+2128] +ldr q5, [x17, #+2144] +ldr q4, [x17, #+2160] +ldr q22, [x0, #992] +ldr q13, [x0, #1008] +ldr q21, [x0, #960] +ldr q11, [x0, #976] +sqrdmulh v14.4S, v22.4S, 
v2.s[0] +mul v22.4S, v22.4S,v16.s[0] +mla v22.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v2.s[0] +mul v13.4S, v13.4S,v16.s[0] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v16.s[1] +mla v11.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v2.s[2] +mul v22.4S, v22.4S,v16.s[2] +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +trn1 v22.4S, v21.4S, v13.4S +trn2 v0.4S, v21.4S, v13.4S +trn1 v19.4S, v14.4S, v11.4S +trn2 v17.4S, v14.4S, v11.4S +trn2 v14.2D, v22.2D, v19.2D +trn2 v11.2D, v0.2D, v17.2D +trn1 v21.2D, v22.2D, v19.2D +trn1 v13.2D, v0.2D, v17.2D +sqrdmulh v17.4S, v14.4S, v15.4S +mul v14.4S, v14.4S,v10.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v15.4S +mul v11.4S, v11.4S,v10.4S +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v6.4S +mul v13.4S, v13.4S,v7.4S +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v14.4S, v4.4S +mul v14.4S, v14.4S,v5.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +str q21, [x0, #960] +str q11, [x0, #976] +str q17, [x0, #992] +str q13, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z4_1.s 
b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z4_1.s new file mode 100644 index 0000000..02d26b2 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z4_1.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// +#include +modulus: // Constant block: word 0 is -33556993 (the negation of the 33556993 in this file's name), words 1-3 are zero, 16 bytes in total. NOTE(review): presumably loaded as one vector register by the routine - confirm against the function body. +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 // power-of-two form on AArch64: aligns what follows to 2^6 = 64 bytes +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 
1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 
6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 
+.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, 
block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 
31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 
26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 
+.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // 
Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 
28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 
608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_3_z4_1 +.global _ntt_u32_full_neon_asm_var_4_4_3_z4_1 +ntt_u32_full_neon_asm_var_4_4_3_z4_1: +_ntt_u32_full_neon_asm_var_4_4_3_z4_1: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +sqrdmulh v2.4S, v22.4S, v29.s[0] +ldr q1, [x0, #544] +mul v22.4S, v22.4S,v30.s[0] +ldr q0, [x0, #608] +sqrdmulh v15.4S, v21.4S, v29.s[0] +ldr q14, [x0, #672] +mul v21.4S, v21.4S,v30.s[0] +ldr q13, [x0, #736] +mla v22.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q12, [x0, #32] +sub v11.4s, v18.4s, v22.4s +mla v21.4S, v15.4S, v31.s[0] +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q15, [x0, #96] +sub v10.4s, v17.4s, v21.4s +mla v20.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v1.4S, 
v29.s[0] +ldr q2, [x0, #160] +mul v1.4S, v1.4S,v30.s[0] +sub v9.4s, v16.4s, v20.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v29.s[0] +ldr q22, [x0, #224] +mul v0.4S, v0.4S,v30.s[0] +sub v8.4s, v3.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v12.4s, v1.4s +mla v0.4S, v20.4S, v31.s[0] +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +sub v20.4s, v15.4s, v0.4s +mla v14.4S, v19.4S, v31.s[0] +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v19.4s, v2.4s, v14.4s +mla v13.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v1.4s, v22.4s, v13.4s +mla v16.4S, v0.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v0.4s, v2.4s, v16.4s +mla v3.4S, v14.4S, v31.s[0] +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v22.4s, v3.4s +mla v18.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v13.4s, v12.4s, v18.4s +mla v17.4S, v16.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v16.4s, v15.4s, v17.4s +mla v9.4S, v3.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v3.4s, v19.4s, v9.4s +mla v8.4S, v18.4S, v31.s[0] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v8.4s +mla v11.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v17.4s, v21.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +sub v9.4s, v20.4s, v10.4s 
+mla v2.4S, v8.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v8.4s, v12.4s, v2.4s +mla v22.4S, v11.4S, v31.s[0] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v27.s[1] +mul v14.4S, v14.4S,v28.s[1] +sub v11.4s, v15.4s, v22.4s +mla v0.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v28.s[2] +sub v10.4s, v13.4s, v0.4s +mla v14.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v2.4s, v16.4s, v14.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +sub v22.4s, v21.4s, v19.4s +mla v1.4S, v0.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v0.4s, v20.4s, v1.4s +mla v3.4S, v14.4S, v31.s[0] +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +sub v14.4s, v17.4s, v3.4s +mla v18.4S, v19.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v25.s[1] +mul v11.4S, v11.4S,v26.s[1] +sub v19.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v1.4s, v12.4s, v15.4s +mla v11.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v25.s[3] +mul v2.4S, v2.4S,v26.s[3] +sub v3.4s, v8.4s, v11.4s +mla v16.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v11.4s +str q12, [x0, #32] +sqrdmulh v12.4S, v20.4S, v23.s[0] +str q1, [x0, #96] +mul v20.4S, v20.4S,v24.s[0] +ldr q1, [x0, #816] +sub v11.4s, v13.4s, v16.4s +ldr q18, [x0, #880] +mla v2.4S, v15.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +str q8, [x0, #160] +sqrdmulh v8.4S, v0.4S, v23.s[1] +str q3, [x0, #224] +mul v0.4S, v0.4S,v24.s[1] +ldr q3, [x0, #944] +sub v16.4s, v10.4s, v2.4s +ldr q15, [x0, #1008] +mla v20.4S, v12.4S, v31.s[0] +add v10.4s, v10.4s, v2.4s +str q13, [x0, #288] +sqrdmulh 
v13.4S, v9.4S, v23.s[2] +str q11, [x0, #352] +mul v9.4S, v9.4S,v24.s[2] +ldr q11, [x0, #304] +sub v2.4s, v21.4s, v20.4s +ldr q12, [x0, #368] +mla v0.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v20.4s +str q10, [x0, #416] +sqrdmulh v10.4S, v19.4S, v23.s[3] +str q16, [x0, #480] +mul v19.4S, v19.4S,v24.s[3] +ldr q16, [x0, #432] +sub v20.4s, v22.4s, v0.4s +ldr q8, [x0, #496] +mla v9.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +str q21, [x0, #544] +sqrdmulh v21.4S, v1.4S, v29.s[0] +str q2, [x0, #608] +ldr q2, [x0, #560] +mul v1.4S, v1.4S,v30.s[0] +ldr q0, [x0, #624] +sub v13.4s, v17.4s, v9.4s +mla v19.4S, v10.4S, v31.s[0] +add v17.4s, v17.4s, v9.4s +str q22, [x0, #672] +sqrdmulh v22.4S, v18.4S, v29.s[0] +str q20, [x0, #736] +ldr q20, [x0, #688] +mul v18.4S, v18.4S,v30.s[0] +ldr q9, [x0, #752] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +str q17, [x0, #800] +sqrdmulh v17.4S, v3.4S, v29.s[0] +str q13, [x0, #864] +mul v3.4S, v3.4S,v30.s[0] +ldr q13, [x0, #48] +sub v19.4s, v11.4s, v1.4s +mla v18.4S, v22.4S, v31.s[0] +add v11.4s, v11.4s, v1.4s +str q14, [x0, #928] +sqrdmulh v14.4S, v15.4S, v29.s[0] +str q10, [x0, #992] +mul v15.4S, v15.4S,v30.s[0] +ldr q10, [x0, #112] +sub v1.4s, v12.4s, v18.4s +mla v3.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v29.s[0] +ldr q17, [x0, #176] +mul v2.4S, v2.4S,v30.s[0] +sub v22.4s, v16.4s, v3.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v29.s[0] +ldr q14, [x0, #240] +mul v0.4S, v0.4S,v30.s[0] +sub v21.4s, v8.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v18.4s, v13.4s, v2.4s +mla v0.4S, v3.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +sub v3.4s, v10.4s, v0.4s +mla v20.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v15.4s, 
v17.4s, v20.4s +mla v9.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v2.4s, v14.4s, v9.4s +mla v16.4S, v0.4S, v31.s[0] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v29.s[1] +mul v11.4S, v11.4S,v30.s[1] +sub v0.4s, v17.4s, v16.4s +mla v8.4S, v20.4S, v31.s[0] +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v29.s[1] +mul v12.4S, v12.4S,v30.s[1] +sub v20.4s, v14.4s, v8.4s +mla v11.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v9.4s, v13.4s, v11.4s +mla v12.4S, v16.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v16.4s, v10.4s, v12.4s +mla v22.4S, v8.4S, v31.s[0] +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v8.4s, v15.4s, v22.4s +mla v21.4S, v11.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +sub v11.4s, v2.4s, v21.4s +mla v19.4S, v12.4S, v31.s[0] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +sub v12.4s, v18.4s, v19.4s +mla v1.4S, v22.4S, v31.s[0] +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v22.4s, v3.4s, v1.4s +mla v17.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v21.4s, v13.4s, v17.4s +mla v14.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +sub v19.4s, v10.4s, v14.4s +mla v0.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v27.s[2] +mul v15.4S, v15.4S,v28.s[2] +sub v1.4s, v9.4s, v0.4s +mla v20.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v17.4s, v16.4s, v20.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, 
v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v14.4s, v18.4s, v15.4s +mla v2.4S, v0.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[3] +mul v11.4S, v11.4S,v28.s[3] +sub v0.4s, v3.4s, v2.4s +mla v8.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +sub v20.4s, v12.4s, v8.4s +mla v11.4S, v15.4S, v31.s[0] +add v12.4s, v12.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v15.4s, v22.4s, v11.4s +mla v10.4S, v2.4S, v31.s[0] +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v2.4s, v13.4s, v10.4s +mla v19.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v25.s[3] +mul v17.4S, v17.4S,v26.s[3] +sub v8.4s, v21.4s, v19.4s +mla v16.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +str q13, [x0, #48] +sqrdmulh v13.4S, v3.4S, v23.s[0] +str q2, [x0, #112] +mul v3.4S, v3.4S,v24.s[0] +ldr q2, [x0, #768] +sub v19.4s, v9.4s, v16.4s +ldr q11, [x0, #832] +mla v17.4S, v10.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +str q21, [x0, #176] +sqrdmulh v21.4S, v0.4S, v23.s[1] +str q8, [x0, #240] +mul v0.4S, v0.4S,v24.s[1] +ldr q8, [x0, #896] +sub v16.4s, v1.4s, v17.4s +ldr q10, [x0, #960] +mla v3.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +str q9, [x0, #304] +sqrdmulh v9.4S, v22.4S, v23.s[2] +str q19, [x0, #368] +mul v22.4S, v22.4S,v24.s[2] +ldr q19, [x0, #256] +sub v17.4s, v18.4s, v3.4s +ldr q13, [x0, #320] +mla v0.4S, v21.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +str q1, [x0, #432] +sqrdmulh v1.4S, v15.4S, v23.s[3] +str q16, [x0, #496] +mul v15.4S, v15.4S,v24.s[3] +ldr q16, [x0, #384] +sub v3.4s, v14.4s, v0.4s +ldr q21, [x0, #448] +mla v22.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +str q18, [x0, #560] +sqrdmulh v18.4S, v2.4S, v29.s[0] +str q17, [x0, #624] +ldr q17, [x0, #512] +mul v2.4S, v2.4S,v30.s[0] +ldr q0, [x0, #576] +sub v9.4s, v12.4s, v22.4s +mla v15.4S, v1.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s 
+str q14, [x0, #688] +sqrdmulh v14.4S, v11.4S, v29.s[0] +str q3, [x0, #752] +ldr q3, [x0, #640] +mul v11.4S, v11.4S,v30.s[0] +ldr q22, [x0, #704] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +str q12, [x0, #816] +sqrdmulh v12.4S, v8.4S, v29.s[0] +str q9, [x0, #880] +mul v8.4S, v8.4S,v30.s[0] +ldr q9, [x0, #0] +sub v15.4s, v19.4s, v2.4s +mla v11.4S, v14.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +str q20, [x0, #944] +sqrdmulh v20.4S, v10.4S, v29.s[0] +str q1, [x0, #1008] +mul v10.4S, v10.4S,v30.s[0] +ldr q1, [x0, #64] +sub v2.4s, v13.4s, v11.4s +mla v8.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v29.s[0] +ldr q12, [x0, #128] +mul v17.4S, v17.4S,v30.s[0] +sub v14.4s, v16.4s, v8.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v29.s[0] +ldr q20, [x0, #192] +mul v0.4S, v0.4S,v30.s[0] +sub v18.4s, v21.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +sub v11.4s, v9.4s, v17.4s +mla v0.4S, v8.4S, v31.s[0] +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v8.4s, v1.4s, v0.4s +mla v3.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v10.4s, v12.4s, v3.4s +mla v22.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v17.4s, v20.4s, v22.4s +mla v16.4S, v0.4S, v31.s[0] +add v20.4s, v20.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +sub v0.4s, v12.4s, v16.4s +mla v21.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v3.4s, v20.4s, v21.4s +mla v19.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v22.4s, v9.4s, v19.4s +mla v13.4S, v16.4S, v31.s[0] +add v9.4s, v9.4s, 
v19.4s +sqrdmulh v19.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v16.4s, v1.4s, v13.4s +mla v14.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v21.4s, v10.4s, v14.4s +mla v18.4S, v19.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v19.4s, v17.4s, v18.4s +mla v15.4S, v13.4S, v31.s[0] +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v13.4s, v11.4s, v15.4s +mla v2.4S, v14.4S, v31.s[0] +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v27.s[0] +mul v20.4S, v20.4S,v28.s[0] +sub v14.4s, v8.4s, v2.4s +mla v12.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v18.4s, v9.4s, v12.4s +mla v20.4S, v15.4S, v31.s[0] +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +sub v15.4s, v1.4s, v20.4s +mla v0.4S, v2.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +sub v2.4s, v22.4s, v0.4s +mla v3.4S, v12.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v12.4s, v16.4s, v3.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v11.4s, v10.4s +mla v17.4S, v0.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v27.s[3] +mul v19.4S, v19.4S,v28.s[3] +sub v0.4s, v8.4s, v17.4s +mla v21.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v17.4s +sqrdmulh v17.4S, v1.4S, v25.s[0] +mul v1.4S, v1.4S,v26.s[0] +sub v3.4s, v13.4s, v21.4s +mla v19.4S, v10.4S, v31.s[0] +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v25.s[1] +mul v15.4S, v15.4S,v26.s[1] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v17.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v17.4s, v9.4s, 
v1.4s +mla v15.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +sub v21.4s, v18.4s, v15.4s +mla v16.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +str q9, [x0, #0] +sqrdmulh v9.4S, v8.4S, v23.s[0] +str q17, [x0, #64] +mul v8.4S, v8.4S,v24.s[0] +ldr q17, [x0, #784] +sub v15.4s, v22.4s, v16.4s +ldr q19, [x0, #848] +mla v12.4S, v1.4S, v31.s[0] +add v22.4s, v22.4s, v16.4s +str q18, [x0, #128] +sqrdmulh v18.4S, v0.4S, v23.s[1] +str q21, [x0, #192] +mul v0.4S, v0.4S,v24.s[1] +ldr q21, [x0, #912] +sub v16.4s, v2.4s, v12.4s +ldr q1, [x0, #976] +mla v8.4S, v9.4S, v31.s[0] +add v2.4s, v2.4s, v12.4s +str q22, [x0, #256] +sqrdmulh v22.4S, v14.4S, v23.s[2] +str q15, [x0, #320] +mul v14.4S, v14.4S,v24.s[2] +ldr q15, [x0, #272] +sub v12.4s, v11.4s, v8.4s +ldr q9, [x0, #336] +mla v0.4S, v18.4S, v31.s[0] +add v11.4s, v11.4s, v8.4s +str q2, [x0, #384] +sqrdmulh v2.4S, v10.4S, v23.s[3] +str q16, [x0, #448] +mul v10.4S, v10.4S,v24.s[3] +ldr q16, [x0, #400] +sub v8.4s, v20.4s, v0.4s +ldr q18, [x0, #464] +mla v14.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v0.4s +str q11, [x0, #512] +sqrdmulh v11.4S, v17.4S, v29.s[0] +str q12, [x0, #576] +ldr q12, [x0, #528] +mul v17.4S, v17.4S,v30.s[0] +ldr q0, [x0, #592] +sub v22.4s, v13.4s, v14.4s +mla v10.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v14.4s +str q20, [x0, #640] +sqrdmulh v20.4S, v19.4S, v29.s[0] +str q8, [x0, #704] +ldr q8, [x0, #656] +mul v19.4S, v19.4S,v30.s[0] +ldr q14, [x0, #720] +sub v2.4s, v3.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v3.4s, v3.4s, v10.4s +str q13, [x0, #768] +sqrdmulh v13.4S, v21.4S, v29.s[0] +str q22, [x0, #832] +mul v21.4S, v21.4S,v30.s[0] +ldr q22, [x0, #16] +sub v10.4s, v15.4s, v17.4s +mla v19.4S, v20.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +str q3, [x0, #896] +sqrdmulh v3.4S, v1.4S, v29.s[0] +str q2, [x0, #960] +mul v1.4S, v1.4S,v30.s[0] +ldr q2, [x0, #80] +sub v17.4s, v9.4s, v19.4s +mla v21.4S, v13.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s 
+sqrdmulh v19.4S, v12.4S, v29.s[0] +ldr q13, [x0, #144] +mul v12.4S, v12.4S,v30.s[0] +sub v20.4s, v16.4s, v21.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v29.s[0] +ldr q3, [x0, #208] +mul v0.4S, v0.4S,v30.s[0] +sub v11.4s, v18.4s, v1.4s +mla v12.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v19.4s, v22.4s, v12.4s +mla v0.4S, v21.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v2.4s, v0.4s +mla v8.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v1.4s, v13.4s, v8.4s +mla v14.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v12.4s, v3.4s, v14.4s +mla v16.4S, v0.4S, v31.s[0] +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +sub v0.4s, v13.4s, v16.4s +mla v18.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v9.4S, v29.s[1] +mul v9.4S, v9.4S,v30.s[1] +sub v8.4s, v3.4s, v18.4s +mla v15.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v14.4s, v22.4s, v15.4s +mla v9.4S, v16.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v16.4s, v2.4s, v9.4s +mla v20.4S, v18.4S, v31.s[0] +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v20.4s +mla v11.4S, v15.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v15.4s, v12.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +sub v9.4s, v19.4s, v10.4s +mla v17.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v27.s[0] +mul v3.4S, 
v3.4S,v28.s[0] +sub v20.4s, v21.4s, v17.4s +mla v13.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v11.4s, v22.4s, v13.4s +mla v3.4S, v10.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v10.4s, v2.4s, v3.4s +mla v0.4S, v17.4S, v31.s[0] +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v17.4s, v14.4s, v0.4s +mla v8.4S, v13.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +sub v13.4s, v16.4s, v8.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v3.4s, v19.4s, v1.4s +mla v12.4S, v0.4S, v31.s[0] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v0.4s, v21.4s, v12.4s +mla v18.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v25.s[1] +mul v10.4S, v10.4S,v26.s[1] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v12.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v12.4s, v22.4s, v2.4s +mla v10.4S, v18.4S, v31.s[0] +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v13.4S, v25.s[3] +mul v13.4S, v13.4S,v26.s[3] +sub v18.4s, v11.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +str q22, [x0, #16] +sqrdmulh v22.4S, v21.4S, v23.s[0] +str q12, [x0, #80] +mul v21.4S, v21.4S,v24.s[0] +sub v12.4s, v14.4s, v16.4s +mla v13.4S, v2.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +str q11, [x0, #144] +sqrdmulh v11.4S, v0.4S, v23.s[1] +str q18, [x0, #208] +mul v0.4S, v0.4S,v24.s[1] +sub v18.4s, v17.4s, v13.4s +mla v21.4S, v22.4S, v31.s[0] +add v17.4s, v17.4s, v13.4s +str q14, [x0, #272] +sqrdmulh v14.4S, v20.4S, v23.s[2] +str q12, 
[x0, #336] +mul v20.4S, v20.4S,v24.s[2] +sub v12.4s, v19.4s, v21.4s +mla v0.4S, v11.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +str q17, [x0, #400] +sqrdmulh v17.4S, v1.4S, v23.s[3] +str q18, [x0, #464] +mul v1.4S, v1.4S,v24.s[3] +sub v18.4s, v3.4s, v0.4s +mla v20.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v0.4s +str q19, [x0, #528] +str q12, [x0, #592] +sub v12.4s, v9.4s, v20.4s +mla v1.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v20.4s +str q3, [x0, #656] +str q18, [x0, #720] +sub v18.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +str q9, [x0, #784] +str q12, [x0, #848] +str q8, [x0, #912] +str q18, [x0, #976] +ldr q4, [x0, #32] +ldr q5, [x0, #48] +ldr q6, [x0, #0] +ldr q7, [x0, #16] +ldr q15, [x0, #96] +ldr q10, [x0, #112] +ldr q2, [x0, #64] +ldr q16, [x0, #80] +ldr q22, [x0, #160] +ldr q13, [x0, #176] +ldr q11, [x0, #128] +ldr q21, [x0, #144] +ldr q14, [x0, #224] +ldr q0, [x0, #240] +ldr q19, [x0, #192] +ldr q17, [x0, #208] +ldr q20, [x17, #+128] +ldr q3, [x17, #+144] +ldr q1, [x17, #+256] +ldr q9, [x17, #+272] +ldr q12, [x17, #+384] +ldr q8, [x17, #+400] +ldr q18, [x17, #+512] +ldr q30, [x17, #+528] +sqrdmulh v29.4S, v4.4S, v3.s[0] +mul v4.4S, v4.4S,v20.s[0] +sqrdmulh v28.4S, v5.4S, v3.s[0] +mul v5.4S, v5.4S,v20.s[0] +mla v4.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v15.4S, v9.s[0] +mul v15.4S, v15.4S,v1.s[0] +mla v5.4S, v28.4S, v31.s[0] +sub v28.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +sqrdmulh v4.4S, v10.4S, v9.s[0] +mul v10.4S, v10.4S,v1.s[0] +mla v15.4S, v29.4S, v31.s[0] +sub v29.4s, v7.4s, v5.4s +add v7.4s, v7.4s, v5.4s +sqrdmulh v5.4S, v7.4S, v3.s[1] +mul v7.4S, v7.4S,v20.s[1] +mla v10.4S, v4.4S, v31.s[0] +sub v4.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +sqrdmulh v15.4S, v29.4S, v3.s[2] +mul v29.4S, v29.4S,v20.s[2] +mla v7.4S, v5.4S, v31.s[0] +sub v5.4s, v16.4s, v10.4s +add v16.4s, v16.4s, v10.4s +sqrdmulh v10.4S, v16.4S, v9.s[1] +mul v16.4S, v16.4S,v1.s[1] +mla v29.4S, v15.4S, v31.s[0] +sub v15.4s, v6.4s, v7.4s +add v6.4s, v6.4s, v7.4s +sqrdmulh v3.4S, v5.4S, 
v9.s[2] +mul v5.4S, v5.4S,v1.s[2] +mla v16.4S, v10.4S, v31.s[0] +sub v10.4s, v28.4s, v29.4s +add v28.4s, v28.4s, v29.4s +sqrdmulh v29.4S, v22.4S, v8.s[0] +mul v22.4S, v22.4S,v12.s[0] +mla v5.4S, v3.4S, v31.s[0] +sub v3.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v9.4S, v13.4S, v8.s[0] +mul v13.4S, v13.4S,v12.s[0] +mla v22.4S, v29.4S, v31.s[0] +sub v29.4s, v4.4s, v5.4s +add v4.4s, v4.4s, v5.4s +sqrdmulh v5.4S, v14.4S, v30.s[0] +mul v14.4S, v14.4S,v18.s[0] +mla v13.4S, v9.4S, v31.s[0] +sub v9.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v0.4S, v30.s[0] +mul v0.4S, v0.4S,v18.s[0] +mla v14.4S, v5.4S, v31.s[0] +sub v5.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v21.4S, v8.s[1] +mul v21.4S, v21.4S,v12.s[1] +mla v0.4S, v22.4S, v31.s[0] +sub v22.4s, v19.4s, v14.4s +add v19.4s, v19.4s, v14.4s +sqrdmulh v14.4S, v5.4S, v8.s[2] +mul v5.4S, v5.4S,v12.s[2] +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v0.4s +add v17.4s, v17.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v30.s[1] +mul v17.4S, v17.4S,v18.s[1] +mla v5.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v8.4S, v13.4S, v30.s[2] +mul v13.4S, v13.4S,v18.s[2] +mla v17.4S, v0.4S, v31.s[0] +sub v0.4s, v9.4s, v5.4s +add v9.4s, v9.4s, v5.4s +mla v13.4S, v8.4S, v31.s[0] +sub v8.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +sub v30.4s, v22.4s, v13.4s +add v22.4s, v22.4s, v13.4s +trn1 v13.4S, v6.4S, v15.4S +trn2 v18.4S, v6.4S, v15.4S +trn1 v17.4S, v28.4S, v10.4S +trn2 v5.4S, v28.4S, v10.4S +trn2 v28.2D, v13.2D, v17.2D +trn2 v10.2D, v18.2D, v5.2D +trn1 v6.2D, v13.2D, v17.2D +trn1 v15.2D, v18.2D, v5.2D +trn1 v5.4S, v2.4S, v3.4S +trn2 v18.4S, v2.4S, v3.4S +trn1 v17.4S, v4.4S, v29.4S +trn2 v13.4S, v4.4S, v29.4S +trn2 v4.2D, v5.2D, v17.2D +trn2 v29.2D, v18.2D, v13.2D +trn1 v2.2D, v5.2D, v17.2D +trn1 v3.2D, v18.2D, v13.2D +trn1 v13.4S, v11.4S, v14.4S +trn2 v18.4S, v11.4S, v14.4S +trn1 v17.4S, v9.4S, v0.4S +trn2 v5.4S, v9.4S, v0.4S +trn2 v9.2D, v13.2D, 
v17.2D +trn2 v0.2D, v18.2D, v5.2D +trn1 v11.2D, v13.2D, v17.2D +trn1 v14.2D, v18.2D, v5.2D +trn1 v5.4S, v19.4S, v8.4S +trn2 v18.4S, v19.4S, v8.4S +trn1 v17.4S, v22.4S, v30.4S +trn2 v13.4S, v22.4S, v30.4S +trn2 v22.2D, v5.2D, v17.2D +trn2 v30.2D, v18.2D, v13.2D +trn1 v19.2D, v5.2D, v17.2D +trn1 v8.2D, v18.2D, v13.2D +ldr q13, [x17, #+160] +ldr q18, [x17, #+176] +sqrdmulh v17.4S, v28.4S, v18.4S +mul v28.4S, v28.4S,v13.4S +sqrdmulh v5.4S, v10.4S, v18.4S +mul v10.4S, v10.4S,v13.4S +mla v28.4S, v17.4S, v31.s[0] +ldr q17, [x17, #+288] +ldr q18, [x17, #+304] +sqrdmulh v13.4S, v4.4S, v18.4S +mul v4.4S, v4.4S,v17.4S +mla v10.4S, v5.4S, v31.s[0] +sub v5.4s, v6.4s, v28.4s +add v6.4s, v6.4s, v28.4s +sqrdmulh v28.4S, v29.4S, v18.4S +mul v29.4S, v29.4S,v17.4S +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +ldr q10, [x17, #+192] +ldr q18, [x17, #+208] +sqrdmulh v17.4S, v15.4S, v18.4S +mul v15.4S, v15.4S,v10.4S +mla v29.4S, v28.4S, v31.s[0] +sub v28.4s, v2.4s, v4.4s +add v2.4s, v2.4s, v4.4s +ldr q4, [x17, #+224] +ldr q18, [x17, #+240] +sqrdmulh v10.4S, v13.4S, v18.4S +mul v13.4S, v13.4S,v4.4S +mla v15.4S, v17.4S, v31.s[0] +sub v17.4s, v3.4s, v29.4s +add v3.4s, v3.4s, v29.4s +ldr q29, [x17, #+320] +ldr q18, [x17, #+336] +sqrdmulh v4.4S, v3.4S, v18.4S +mul v3.4S, v3.4S,v29.4S +mla v13.4S, v10.4S, v31.s[0] +sub v10.4s, v6.4s, v15.4s +add v6.4s, v6.4s, v15.4s +ldr q15, [x17, #+352] +ldr q18, [x17, #+368] +sqrdmulh v29.4S, v17.4S, v18.4S +mul v17.4S, v17.4S,v15.4S +mla v3.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v13.4s +add v5.4s, v5.4s, v13.4s +mla v17.4S, v29.4S, v31.s[0] +sub v29.4s, v2.4s, v3.4s +add v2.4s, v2.4s, v3.4s +sub v3.4s, v28.4s, v17.4s +add v28.4s, v28.4s, v17.4s +str q6, [x0, #0] +str q10, [x0, #16] +str q5, [x0, #32] +str q4, [x0, #48] +str q2, [x0, #64] +str q29, [x0, #80] +str q28, [x0, #96] +str q3, [x0, #112] +ldr q3, [x17, #+416] +ldr q28, [x17, #+432] +sqrdmulh v29.4S, v9.4S, v28.4S +mul v9.4S, v9.4S,v3.4S +sqrdmulh v2.4S, 
v0.4S, v28.4S +mul v0.4S, v0.4S,v3.4S +mla v9.4S, v29.4S, v31.s[0] +ldr q29, [x17, #+544] +ldr q28, [x17, #+560] +sqrdmulh v3.4S, v22.4S, v28.4S +mul v22.4S, v22.4S,v29.4S +mla v0.4S, v2.4S, v31.s[0] +sub v2.4s, v11.4s, v9.4s +add v11.4s, v11.4s, v9.4s +sqrdmulh v9.4S, v30.4S, v28.4S +mul v30.4S, v30.4S,v29.4S +mla v22.4S, v3.4S, v31.s[0] +sub v3.4s, v14.4s, v0.4s +add v14.4s, v14.4s, v0.4s +ldr q0, [x17, #+448] +ldr q28, [x17, #+464] +sqrdmulh v29.4S, v14.4S, v28.4S +mul v14.4S, v14.4S,v0.4S +mla v30.4S, v9.4S, v31.s[0] +sub v9.4s, v19.4s, v22.4s +add v19.4s, v19.4s, v22.4s +ldr q22, [x17, #+480] +ldr q28, [x17, #+496] +sqrdmulh v0.4S, v3.4S, v28.4S +mul v3.4S, v3.4S,v22.4S +mla v14.4S, v29.4S, v31.s[0] +sub v29.4s, v8.4s, v30.4s +add v8.4s, v8.4s, v30.4s +ldr q30, [x17, #+576] +ldr q28, [x17, #+592] +sqrdmulh v22.4S, v8.4S, v28.4S +mul v8.4S, v8.4S,v30.4S +mla v3.4S, v0.4S, v31.s[0] +sub v0.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +ldr q14, [x17, #+608] +ldr q28, [x17, #+624] +sqrdmulh v30.4S, v29.4S, v28.4S +mul v29.4S, v29.4S,v14.4S +mla v8.4S, v22.4S, v31.s[0] +sub v22.4s, v2.4s, v3.4s +add v2.4s, v2.4s, v3.4s +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v19.4s, v8.4s +add v19.4s, v19.4s, v8.4s +sub v8.4s, v9.4s, v29.4s +add v9.4s, v9.4s, v29.4s +str q11, [x0, #128] +str q0, [x0, #144] +str q2, [x0, #160] +str q22, [x0, #176] +str q19, [x0, #192] +str q30, [x0, #208] +str q9, [x0, #224] +str q8, [x0, #240] +ldr q8, [x0, #288] +ldr q9, [x0, #304] +ldr q30, [x0, #256] +ldr q19, [x0, #272] +ldr q22, [x0, #352] +ldr q2, [x0, #368] +ldr q0, [x0, #320] +ldr q11, [x0, #336] +ldr q29, [x0, #416] +ldr q3, [x0, #432] +ldr q28, [x0, #384] +ldr q14, [x0, #400] +ldr q4, [x0, #480] +ldr q5, [x0, #496] +ldr q10, [x0, #448] +ldr q6, [x0, #464] +ldr q17, [x17, #+640] +ldr q13, [x17, #+656] +ldr q18, [x17, #+768] +ldr q15, [x17, #+784] +ldr q12, [x17, #+896] +ldr q21, [x17, #+912] +ldr q1, [x17, #+1024] +ldr q16, [x17, #+1040] +sqrdmulh v20.4S, v8.4S, v13.s[0] +mul v8.4S, 
v8.4S,v17.s[0] +sqrdmulh v7.4S, v9.4S, v13.s[0] +mul v9.4S, v9.4S,v17.s[0] +mla v8.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v22.4S, v15.s[0] +mul v22.4S, v22.4S,v18.s[0] +mla v9.4S, v7.4S, v31.s[0] +sub v7.4s, v30.4s, v8.4s +add v30.4s, v30.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v15.s[0] +mul v2.4S, v2.4S,v18.s[0] +mla v22.4S, v20.4S, v31.s[0] +sub v20.4s, v19.4s, v9.4s +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v19.4S, v13.s[1] +mul v19.4S, v19.4S,v17.s[1] +mla v2.4S, v8.4S, v31.s[0] +sub v8.4s, v0.4s, v22.4s +add v0.4s, v0.4s, v22.4s +sqrdmulh v22.4S, v20.4S, v13.s[2] +mul v20.4S, v20.4S,v17.s[2] +mla v19.4S, v9.4S, v31.s[0] +sub v9.4s, v11.4s, v2.4s +add v11.4s, v11.4s, v2.4s +sqrdmulh v2.4S, v11.4S, v15.s[1] +mul v11.4S, v11.4S,v18.s[1] +mla v20.4S, v22.4S, v31.s[0] +sub v22.4s, v30.4s, v19.4s +add v30.4s, v30.4s, v19.4s +sqrdmulh v13.4S, v9.4S, v15.s[2] +mul v9.4S, v9.4S,v18.s[2] +mla v11.4S, v2.4S, v31.s[0] +sub v2.4s, v7.4s, v20.4s +add v7.4s, v7.4s, v20.4s +sqrdmulh v20.4S, v29.4S, v21.s[0] +mul v29.4S, v29.4S,v12.s[0] +mla v9.4S, v13.4S, v31.s[0] +sub v13.4s, v0.4s, v11.4s +add v0.4s, v0.4s, v11.4s +sqrdmulh v15.4S, v3.4S, v21.s[0] +mul v3.4S, v3.4S,v12.s[0] +mla v29.4S, v20.4S, v31.s[0] +sub v20.4s, v8.4s, v9.4s +add v8.4s, v8.4s, v9.4s +sqrdmulh v9.4S, v4.4S, v16.s[0] +mul v4.4S, v4.4S,v1.s[0] +mla v3.4S, v15.4S, v31.s[0] +sub v15.4s, v28.4s, v29.4s +add v28.4s, v28.4s, v29.4s +sqrdmulh v29.4S, v5.4S, v16.s[0] +mul v5.4S, v5.4S,v1.s[0] +mla v4.4S, v9.4S, v31.s[0] +sub v9.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v21.s[1] +mul v14.4S, v14.4S,v12.s[1] +mla v5.4S, v29.4S, v31.s[0] +sub v29.4s, v10.4s, v4.4s +add v10.4s, v10.4s, v4.4s +sqrdmulh v4.4S, v9.4S, v21.s[2] +mul v9.4S, v9.4S,v12.s[2] +mla v14.4S, v3.4S, v31.s[0] +sub v3.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v6.4S, v16.s[1] +mul v6.4S, v6.4S,v1.s[1] +mla v9.4S, v4.4S, v31.s[0] +sub v4.4s, v28.4s, v14.4s +add v28.4s, v28.4s, v14.4s +sqrdmulh v21.4S, v3.4S, 
v16.s[2] +mul v3.4S, v3.4S,v1.s[2] +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v15.4s, v9.4s +add v15.4s, v15.4s, v9.4s +mla v3.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v6.4s +add v10.4s, v10.4s, v6.4s +sub v16.4s, v29.4s, v3.4s +add v29.4s, v29.4s, v3.4s +trn1 v3.4S, v30.4S, v22.4S +trn2 v1.4S, v30.4S, v22.4S +trn1 v6.4S, v7.4S, v2.4S +trn2 v9.4S, v7.4S, v2.4S +trn2 v7.2D, v3.2D, v6.2D +trn2 v2.2D, v1.2D, v9.2D +trn1 v30.2D, v3.2D, v6.2D +trn1 v22.2D, v1.2D, v9.2D +trn1 v9.4S, v0.4S, v13.4S +trn2 v1.4S, v0.4S, v13.4S +trn1 v6.4S, v8.4S, v20.4S +trn2 v3.4S, v8.4S, v20.4S +trn2 v8.2D, v9.2D, v6.2D +trn2 v20.2D, v1.2D, v3.2D +trn1 v0.2D, v9.2D, v6.2D +trn1 v13.2D, v1.2D, v3.2D +trn1 v3.4S, v28.4S, v4.4S +trn2 v1.4S, v28.4S, v4.4S +trn1 v6.4S, v15.4S, v5.4S +trn2 v9.4S, v15.4S, v5.4S +trn2 v15.2D, v3.2D, v6.2D +trn2 v5.2D, v1.2D, v9.2D +trn1 v28.2D, v3.2D, v6.2D +trn1 v4.2D, v1.2D, v9.2D +trn1 v9.4S, v10.4S, v21.4S +trn2 v1.4S, v10.4S, v21.4S +trn1 v6.4S, v29.4S, v16.4S +trn2 v3.4S, v29.4S, v16.4S +trn2 v29.2D, v9.2D, v6.2D +trn2 v16.2D, v1.2D, v3.2D +trn1 v10.2D, v9.2D, v6.2D +trn1 v21.2D, v1.2D, v3.2D +ldr q3, [x17, #+672] +ldr q1, [x17, #+688] +sqrdmulh v6.4S, v7.4S, v1.4S +mul v7.4S, v7.4S,v3.4S +sqrdmulh v9.4S, v2.4S, v1.4S +mul v2.4S, v2.4S,v3.4S +mla v7.4S, v6.4S, v31.s[0] +ldr q6, [x17, #+800] +ldr q1, [x17, #+816] +sqrdmulh v3.4S, v8.4S, v1.4S +mul v8.4S, v8.4S,v6.4S +mla v2.4S, v9.4S, v31.s[0] +sub v9.4s, v30.4s, v7.4s +add v30.4s, v30.4s, v7.4s +sqrdmulh v7.4S, v20.4S, v1.4S +mul v20.4S, v20.4S,v6.4S +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v22.4s, v2.4s +add v22.4s, v22.4s, v2.4s +ldr q2, [x17, #+704] +ldr q1, [x17, #+720] +sqrdmulh v6.4S, v22.4S, v1.4S +mul v22.4S, v22.4S,v2.4S +mla v20.4S, v7.4S, v31.s[0] +sub v7.4s, v0.4s, v8.4s +add v0.4s, v0.4s, v8.4s +ldr q8, [x17, #+736] +ldr q1, [x17, #+752] +sqrdmulh v2.4S, v3.4S, v1.4S +mul v3.4S, v3.4S,v8.4S +mla v22.4S, v6.4S, v31.s[0] +sub v6.4s, v13.4s, v20.4s +add v13.4s, v13.4s, v20.4s +ldr q20, [x17, #+832] 
+ldr q1, [x17, #+848] +sqrdmulh v8.4S, v13.4S, v1.4S +mul v13.4S, v13.4S,v20.4S +mla v3.4S, v2.4S, v31.s[0] +sub v2.4s, v30.4s, v22.4s +add v30.4s, v30.4s, v22.4s +ldr q22, [x17, #+864] +ldr q1, [x17, #+880] +sqrdmulh v20.4S, v6.4S, v1.4S +mul v6.4S, v6.4S,v22.4S +mla v13.4S, v8.4S, v31.s[0] +sub v8.4s, v9.4s, v3.4s +add v9.4s, v9.4s, v3.4s +mla v6.4S, v20.4S, v31.s[0] +sub v20.4s, v0.4s, v13.4s +add v0.4s, v0.4s, v13.4s +sub v13.4s, v7.4s, v6.4s +add v7.4s, v7.4s, v6.4s +str q30, [x0, #256] +str q2, [x0, #272] +str q9, [x0, #288] +str q8, [x0, #304] +str q0, [x0, #320] +str q20, [x0, #336] +str q7, [x0, #352] +str q13, [x0, #368] +ldr q13, [x17, #+928] +ldr q7, [x17, #+944] +sqrdmulh v20.4S, v15.4S, v7.4S +mul v15.4S, v15.4S,v13.4S +sqrdmulh v0.4S, v5.4S, v7.4S +mul v5.4S, v5.4S,v13.4S +mla v15.4S, v20.4S, v31.s[0] +ldr q20, [x17, #+1056] +ldr q7, [x17, #+1072] +sqrdmulh v13.4S, v29.4S, v7.4S +mul v29.4S, v29.4S,v20.4S +mla v5.4S, v0.4S, v31.s[0] +sub v0.4s, v28.4s, v15.4s +add v28.4s, v28.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v7.4S +mul v16.4S, v16.4S,v20.4S +mla v29.4S, v13.4S, v31.s[0] +sub v13.4s, v4.4s, v5.4s +add v4.4s, v4.4s, v5.4s +ldr q5, [x17, #+960] +ldr q7, [x17, #+976] +sqrdmulh v20.4S, v4.4S, v7.4S +mul v4.4S, v4.4S,v5.4S +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v10.4s, v29.4s +add v10.4s, v10.4s, v29.4s +ldr q29, [x17, #+992] +ldr q7, [x17, #+1008] +sqrdmulh v5.4S, v13.4S, v7.4S +mul v13.4S, v13.4S,v29.4S +mla v4.4S, v20.4S, v31.s[0] +sub v20.4s, v21.4s, v16.4s +add v21.4s, v21.4s, v16.4s +ldr q16, [x17, #+1088] +ldr q7, [x17, #+1104] +sqrdmulh v29.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v16.4S +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v28.4s, v4.4s +add v28.4s, v28.4s, v4.4s +ldr q4, [x17, #+1120] +ldr q7, [x17, #+1136] +sqrdmulh v16.4S, v20.4S, v7.4S +mul v20.4S, v20.4S,v4.4S +mla v21.4S, v29.4S, v31.s[0] +sub v29.4s, v0.4s, v13.4s +add v0.4s, v0.4s, v13.4s +mla v20.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sub 
v21.4s, v15.4s, v20.4s +add v15.4s, v15.4s, v20.4s +str q28, [x0, #384] +str q5, [x0, #400] +str q0, [x0, #416] +str q29, [x0, #432] +str q10, [x0, #448] +str q16, [x0, #464] +str q15, [x0, #480] +str q21, [x0, #496] +ldr q21, [x0, #544] +ldr q15, [x0, #560] +ldr q16, [x0, #512] +ldr q10, [x0, #528] +ldr q29, [x0, #608] +ldr q0, [x0, #624] +ldr q5, [x0, #576] +ldr q28, [x0, #592] +ldr q20, [x0, #672] +ldr q13, [x0, #688] +ldr q7, [x0, #640] +ldr q4, [x0, #656] +ldr q8, [x0, #736] +ldr q9, [x0, #752] +ldr q2, [x0, #704] +ldr q30, [x0, #720] +ldr q6, [x17, #+1152] +ldr q3, [x17, #+1168] +ldr q1, [x17, #+1280] +ldr q22, [x17, #+1296] +ldr q12, [x17, #+1408] +ldr q14, [x17, #+1424] +ldr q18, [x17, #+1536] +ldr q11, [x17, #+1552] +sqrdmulh v17.4S, v21.4S, v3.s[0] +mul v21.4S, v21.4S,v6.s[0] +sqrdmulh v19.4S, v15.4S, v3.s[0] +mul v15.4S, v15.4S,v6.s[0] +mla v21.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v29.4S, v22.s[0] +mul v29.4S, v29.4S,v1.s[0] +mla v15.4S, v19.4S, v31.s[0] +sub v19.4s, v16.4s, v21.4s +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v22.s[0] +mul v0.4S, v0.4S,v1.s[0] +mla v29.4S, v17.4S, v31.s[0] +sub v17.4s, v10.4s, v15.4s +add v10.4s, v10.4s, v15.4s +sqrdmulh v15.4S, v10.4S, v3.s[1] +mul v10.4S, v10.4S,v6.s[1] +mla v0.4S, v21.4S, v31.s[0] +sub v21.4s, v5.4s, v29.4s +add v5.4s, v5.4s, v29.4s +sqrdmulh v29.4S, v17.4S, v3.s[2] +mul v17.4S, v17.4S,v6.s[2] +mla v10.4S, v15.4S, v31.s[0] +sub v15.4s, v28.4s, v0.4s +add v28.4s, v28.4s, v0.4s +sqrdmulh v0.4S, v28.4S, v22.s[1] +mul v28.4S, v28.4S,v1.s[1] +mla v17.4S, v29.4S, v31.s[0] +sub v29.4s, v16.4s, v10.4s +add v16.4s, v16.4s, v10.4s +sqrdmulh v3.4S, v15.4S, v22.s[2] +mul v15.4S, v15.4S,v1.s[2] +mla v28.4S, v0.4S, v31.s[0] +sub v0.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +sqrdmulh v17.4S, v20.4S, v14.s[0] +mul v20.4S, v20.4S,v12.s[0] +mla v15.4S, v3.4S, v31.s[0] +sub v3.4s, v5.4s, v28.4s +add v5.4s, v5.4s, v28.4s +sqrdmulh v22.4S, v13.4S, v14.s[0] +mul v13.4S, v13.4S,v12.s[0] +mla v20.4S, v17.4S, 
v31.s[0] +sub v17.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +sqrdmulh v15.4S, v8.4S, v11.s[0] +mul v8.4S, v8.4S,v18.s[0] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v7.4s, v20.4s +add v7.4s, v7.4s, v20.4s +sqrdmulh v20.4S, v9.4S, v11.s[0] +mul v9.4S, v9.4S,v18.s[0] +mla v8.4S, v15.4S, v31.s[0] +sub v15.4s, v4.4s, v13.4s +add v4.4s, v4.4s, v13.4s +sqrdmulh v13.4S, v4.4S, v14.s[1] +mul v4.4S, v4.4S,v12.s[1] +mla v9.4S, v20.4S, v31.s[0] +sub v20.4s, v2.4s, v8.4s +add v2.4s, v2.4s, v8.4s +sqrdmulh v8.4S, v15.4S, v14.s[2] +mul v15.4S, v15.4S,v12.s[2] +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v30.4s, v9.4s +add v30.4s, v30.4s, v9.4s +sqrdmulh v9.4S, v30.4S, v11.s[1] +mul v30.4S, v30.4S,v18.s[1] +mla v15.4S, v8.4S, v31.s[0] +sub v8.4s, v7.4s, v4.4s +add v7.4s, v7.4s, v4.4s +sqrdmulh v14.4S, v13.4S, v11.s[2] +mul v13.4S, v13.4S,v18.s[2] +mla v30.4S, v9.4S, v31.s[0] +sub v9.4s, v22.4s, v15.4s +add v22.4s, v22.4s, v15.4s +mla v13.4S, v14.4S, v31.s[0] +sub v14.4s, v2.4s, v30.4s +add v2.4s, v2.4s, v30.4s +sub v11.4s, v20.4s, v13.4s +add v20.4s, v20.4s, v13.4s +trn1 v13.4S, v16.4S, v29.4S +trn2 v18.4S, v16.4S, v29.4S +trn1 v30.4S, v19.4S, v0.4S +trn2 v15.4S, v19.4S, v0.4S +trn2 v19.2D, v13.2D, v30.2D +trn2 v0.2D, v18.2D, v15.2D +trn1 v16.2D, v13.2D, v30.2D +trn1 v29.2D, v18.2D, v15.2D +trn1 v15.4S, v5.4S, v3.4S +trn2 v18.4S, v5.4S, v3.4S +trn1 v30.4S, v21.4S, v17.4S +trn2 v13.4S, v21.4S, v17.4S +trn2 v21.2D, v15.2D, v30.2D +trn2 v17.2D, v18.2D, v13.2D +trn1 v5.2D, v15.2D, v30.2D +trn1 v3.2D, v18.2D, v13.2D +trn1 v13.4S, v7.4S, v8.4S +trn2 v18.4S, v7.4S, v8.4S +trn1 v30.4S, v22.4S, v9.4S +trn2 v15.4S, v22.4S, v9.4S +trn2 v22.2D, v13.2D, v30.2D +trn2 v9.2D, v18.2D, v15.2D +trn1 v7.2D, v13.2D, v30.2D +trn1 v8.2D, v18.2D, v15.2D +trn1 v15.4S, v2.4S, v14.4S +trn2 v18.4S, v2.4S, v14.4S +trn1 v30.4S, v20.4S, v11.4S +trn2 v13.4S, v20.4S, v11.4S +trn2 v20.2D, v15.2D, v30.2D +trn2 v11.2D, v18.2D, v13.2D +trn1 v2.2D, v15.2D, v30.2D +trn1 v14.2D, v18.2D, v13.2D +ldr q13, [x17, #+1184] 
+ldr q18, [x17, #+1200] +sqrdmulh v30.4S, v19.4S, v18.4S +mul v19.4S, v19.4S,v13.4S +sqrdmulh v15.4S, v0.4S, v18.4S +mul v0.4S, v0.4S,v13.4S +mla v19.4S, v30.4S, v31.s[0] +ldr q30, [x17, #+1312] +ldr q18, [x17, #+1328] +sqrdmulh v13.4S, v21.4S, v18.4S +mul v21.4S, v21.4S,v30.4S +mla v0.4S, v15.4S, v31.s[0] +sub v15.4s, v16.4s, v19.4s +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v17.4S, v18.4S +mul v17.4S, v17.4S,v30.4S +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v29.4s, v0.4s +add v29.4s, v29.4s, v0.4s +ldr q0, [x17, #+1216] +ldr q18, [x17, #+1232] +sqrdmulh v30.4S, v29.4S, v18.4S +mul v29.4S, v29.4S,v0.4S +mla v17.4S, v19.4S, v31.s[0] +sub v19.4s, v5.4s, v21.4s +add v5.4s, v5.4s, v21.4s +ldr q21, [x17, #+1248] +ldr q18, [x17, #+1264] +sqrdmulh v0.4S, v13.4S, v18.4S +mul v13.4S, v13.4S,v21.4S +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v3.4s, v17.4s +add v3.4s, v3.4s, v17.4s +ldr q17, [x17, #+1344] +ldr q18, [x17, #+1360] +sqrdmulh v21.4S, v3.4S, v18.4S +mul v3.4S, v3.4S,v17.4S +mla v13.4S, v0.4S, v31.s[0] +sub v0.4s, v16.4s, v29.4s +add v16.4s, v16.4s, v29.4s +ldr q29, [x17, #+1376] +ldr q18, [x17, #+1392] +sqrdmulh v17.4S, v30.4S, v18.4S +mul v30.4S, v30.4S,v29.4S +mla v3.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v13.4s +add v15.4s, v15.4s, v13.4s +mla v30.4S, v17.4S, v31.s[0] +sub v17.4s, v5.4s, v3.4s +add v5.4s, v5.4s, v3.4s +sub v3.4s, v19.4s, v30.4s +add v19.4s, v19.4s, v30.4s +str q16, [x0, #512] +str q0, [x0, #528] +str q15, [x0, #544] +str q21, [x0, #560] +str q5, [x0, #576] +str q17, [x0, #592] +str q19, [x0, #608] +str q3, [x0, #624] +ldr q3, [x17, #+1440] +ldr q19, [x17, #+1456] +sqrdmulh v17.4S, v22.4S, v19.4S +mul v22.4S, v22.4S,v3.4S +sqrdmulh v5.4S, v9.4S, v19.4S +mul v9.4S, v9.4S,v3.4S +mla v22.4S, v17.4S, v31.s[0] +ldr q17, [x17, #+1568] +ldr q19, [x17, #+1584] +sqrdmulh v3.4S, v20.4S, v19.4S +mul v20.4S, v20.4S,v17.4S +mla v9.4S, v5.4S, v31.s[0] +sub v5.4s, v7.4s, v22.4s +add v7.4s, v7.4s, v22.4s +sqrdmulh v22.4S, v11.4S, v19.4S +mul v11.4S, 
v11.4S,v17.4S +mla v20.4S, v3.4S, v31.s[0] +sub v3.4s, v8.4s, v9.4s +add v8.4s, v8.4s, v9.4s +ldr q9, [x17, #+1472] +ldr q19, [x17, #+1488] +sqrdmulh v17.4S, v8.4S, v19.4S +mul v8.4S, v8.4S,v9.4S +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v2.4s, v20.4s +add v2.4s, v2.4s, v20.4s +ldr q20, [x17, #+1504] +ldr q19, [x17, #+1520] +sqrdmulh v9.4S, v3.4S, v19.4S +mul v3.4S, v3.4S,v20.4S +mla v8.4S, v17.4S, v31.s[0] +sub v17.4s, v14.4s, v11.4s +add v14.4s, v14.4s, v11.4s +ldr q11, [x17, #+1600] +ldr q19, [x17, #+1616] +sqrdmulh v20.4S, v14.4S, v19.4S +mul v14.4S, v14.4S,v11.4S +mla v3.4S, v9.4S, v31.s[0] +sub v9.4s, v7.4s, v8.4s +add v7.4s, v7.4s, v8.4s +ldr q8, [x17, #+1632] +ldr q19, [x17, #+1648] +sqrdmulh v11.4S, v17.4S, v19.4S +mul v17.4S, v17.4S,v8.4S +mla v14.4S, v20.4S, v31.s[0] +sub v20.4s, v5.4s, v3.4s +add v5.4s, v5.4s, v3.4s +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v2.4s, v14.4s +add v2.4s, v2.4s, v14.4s +sub v14.4s, v22.4s, v17.4s +add v22.4s, v22.4s, v17.4s +str q7, [x0, #640] +str q9, [x0, #656] +str q5, [x0, #672] +str q20, [x0, #688] +str q2, [x0, #704] +str q11, [x0, #720] +str q22, [x0, #736] +str q14, [x0, #752] +ldr q14, [x0, #800] +ldr q22, [x0, #816] +ldr q11, [x0, #768] +ldr q2, [x0, #784] +ldr q20, [x0, #864] +ldr q5, [x0, #880] +ldr q9, [x0, #832] +ldr q7, [x0, #848] +ldr q17, [x0, #928] +ldr q3, [x0, #944] +ldr q19, [x0, #896] +ldr q8, [x0, #912] +ldr q21, [x0, #992] +ldr q15, [x0, #1008] +ldr q0, [x0, #960] +ldr q16, [x0, #976] +ldr q30, [x17, #+1664] +ldr q13, [x17, #+1680] +ldr q18, [x17, #+1792] +ldr q29, [x17, #+1808] +ldr q12, [x17, #+1920] +ldr q4, [x17, #+1936] +ldr q1, [x17, #+2048] +ldr q28, [x17, #+2064] +sqrdmulh v6.4S, v14.4S, v13.s[0] +mul v14.4S, v14.4S,v30.s[0] +sqrdmulh v10.4S, v22.4S, v13.s[0] +mul v22.4S, v22.4S,v30.s[0] +mla v14.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v18.s[0] +mla v22.4S, v10.4S, v31.s[0] +sub v10.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v5.4S, 
v29.s[0] +mul v5.4S, v5.4S,v18.s[0] +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v2.4s, v22.4s +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v2.4S, v13.s[1] +mul v2.4S, v2.4S,v30.s[1] +mla v5.4S, v14.4S, v31.s[0] +sub v14.4s, v9.4s, v20.4s +add v9.4s, v9.4s, v20.4s +sqrdmulh v20.4S, v6.4S, v13.s[2] +mul v6.4S, v6.4S,v30.s[2] +mla v2.4S, v22.4S, v31.s[0] +sub v22.4s, v7.4s, v5.4s +add v7.4s, v7.4s, v5.4s +sqrdmulh v5.4S, v7.4S, v29.s[1] +mul v7.4S, v7.4S,v18.s[1] +mla v6.4S, v20.4S, v31.s[0] +sub v20.4s, v11.4s, v2.4s +add v11.4s, v11.4s, v2.4s +sqrdmulh v13.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v18.s[2] +mla v7.4S, v5.4S, v31.s[0] +sub v5.4s, v10.4s, v6.4s +add v10.4s, v10.4s, v6.4s +sqrdmulh v6.4S, v17.4S, v4.s[0] +mul v17.4S, v17.4S,v12.s[0] +mla v22.4S, v13.4S, v31.s[0] +sub v13.4s, v9.4s, v7.4s +add v9.4s, v9.4s, v7.4s +sqrdmulh v29.4S, v3.4S, v4.s[0] +mul v3.4S, v3.4S,v12.s[0] +mla v17.4S, v6.4S, v31.s[0] +sub v6.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v28.s[0] +mul v21.4S, v21.4S,v1.s[0] +mla v3.4S, v29.4S, v31.s[0] +sub v29.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +sqrdmulh v17.4S, v15.4S, v28.s[0] +mul v15.4S, v15.4S,v1.s[0] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v8.4s, v3.4s +add v8.4s, v8.4s, v3.4s +sqrdmulh v3.4S, v8.4S, v4.s[1] +mul v8.4S, v8.4S,v12.s[1] +mla v15.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v4.s[2] +mul v22.4S, v22.4S,v12.s[2] +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v16.4s, v15.4s +add v16.4s, v16.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v28.s[1] +mul v16.4S, v16.4S,v1.s[1] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v8.4s +add v19.4s, v19.4s, v8.4s +sqrdmulh v4.4S, v3.4S, v28.s[2] +mul v3.4S, v3.4S,v1.s[2] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v29.4s, v22.4s +add v29.4s, v29.4s, v22.4s +mla v3.4S, v4.4S, v31.s[0] +sub v4.4s, v0.4s, v16.4s +add v0.4s, v0.4s, v16.4s +sub v28.4s, v17.4s, v3.4s +add v17.4s, v17.4s, v3.4s +trn1 v3.4S, 
v11.4S, v20.4S +trn2 v1.4S, v11.4S, v20.4S +trn1 v16.4S, v10.4S, v5.4S +trn2 v22.4S, v10.4S, v5.4S +trn2 v10.2D, v3.2D, v16.2D +trn2 v5.2D, v1.2D, v22.2D +trn1 v11.2D, v3.2D, v16.2D +trn1 v20.2D, v1.2D, v22.2D +trn1 v22.4S, v9.4S, v13.4S +trn2 v1.4S, v9.4S, v13.4S +trn1 v16.4S, v14.4S, v6.4S +trn2 v3.4S, v14.4S, v6.4S +trn2 v14.2D, v22.2D, v16.2D +trn2 v6.2D, v1.2D, v3.2D +trn1 v9.2D, v22.2D, v16.2D +trn1 v13.2D, v1.2D, v3.2D +trn1 v3.4S, v19.4S, v21.4S +trn2 v1.4S, v19.4S, v21.4S +trn1 v16.4S, v29.4S, v15.4S +trn2 v22.4S, v29.4S, v15.4S +trn2 v29.2D, v3.2D, v16.2D +trn2 v15.2D, v1.2D, v22.2D +trn1 v19.2D, v3.2D, v16.2D +trn1 v21.2D, v1.2D, v22.2D +trn1 v22.4S, v0.4S, v4.4S +trn2 v1.4S, v0.4S, v4.4S +trn1 v16.4S, v17.4S, v28.4S +trn2 v3.4S, v17.4S, v28.4S +trn2 v17.2D, v22.2D, v16.2D +trn2 v28.2D, v1.2D, v3.2D +trn1 v0.2D, v22.2D, v16.2D +trn1 v4.2D, v1.2D, v3.2D +ldr q3, [x17, #+1696] +ldr q1, [x17, #+1712] +sqrdmulh v16.4S, v10.4S, v1.4S +mul v10.4S, v10.4S,v3.4S +sqrdmulh v22.4S, v5.4S, v1.4S +mul v5.4S, v5.4S,v3.4S +mla v10.4S, v16.4S, v31.s[0] +ldr q16, [x17, #+1824] +ldr q1, [x17, #+1840] +sqrdmulh v3.4S, v14.4S, v1.4S +mul v14.4S, v14.4S,v16.4S +mla v5.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v10.4s +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v6.4S, v1.4S +mul v6.4S, v6.4S,v16.4S +mla v14.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v5.4s +add v20.4s, v20.4s, v5.4s +ldr q5, [x17, #+1728] +ldr q1, [x17, #+1744] +sqrdmulh v16.4S, v20.4S, v1.4S +mul v20.4S, v20.4S,v5.4S +mla v6.4S, v10.4S, v31.s[0] +sub v10.4s, v9.4s, v14.4s +add v9.4s, v9.4s, v14.4s +ldr q14, [x17, #+1760] +ldr q1, [x17, #+1776] +sqrdmulh v5.4S, v3.4S, v1.4S +mul v3.4S, v3.4S,v14.4S +mla v20.4S, v16.4S, v31.s[0] +sub v16.4s, v13.4s, v6.4s +add v13.4s, v13.4s, v6.4s +ldr q6, [x17, #+1856] +ldr q1, [x17, #+1872] +sqrdmulh v14.4S, v13.4S, v1.4S +mul v13.4S, v13.4S,v6.4S +mla v3.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v20.4s +add v11.4s, v11.4s, v20.4s +ldr q20, [x17, #+1888] +ldr q1, [x17, #+1904] 
+sqrdmulh v6.4S, v16.4S, v1.4S +mul v16.4S, v16.4S,v20.4S +mla v13.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v3.4s +add v22.4s, v22.4s, v3.4s +mla v16.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v13.4s +add v9.4s, v9.4s, v13.4s +sub v13.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +str q11, [x0, #768] +str q5, [x0, #784] +str q22, [x0, #800] +str q14, [x0, #816] +str q9, [x0, #832] +str q6, [x0, #848] +str q10, [x0, #864] +str q13, [x0, #880] +ldr q13, [x17, #+1952] +ldr q10, [x17, #+1968] +sqrdmulh v6.4S, v29.4S, v10.4S +mul v29.4S, v29.4S,v13.4S +sqrdmulh v9.4S, v15.4S, v10.4S +mul v15.4S, v15.4S,v13.4S +mla v29.4S, v6.4S, v31.s[0] +ldr q6, [x17, #+2080] +ldr q10, [x17, #+2096] +sqrdmulh v13.4S, v17.4S, v10.4S +mul v17.4S, v17.4S,v6.4S +mla v15.4S, v9.4S, v31.s[0] +sub v9.4s, v19.4s, v29.4s +add v19.4s, v19.4s, v29.4s +sqrdmulh v29.4S, v28.4S, v10.4S +mul v28.4S, v28.4S,v6.4S +mla v17.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +ldr q15, [x17, #+1984] +ldr q10, [x17, #+2000] +sqrdmulh v6.4S, v21.4S, v10.4S +mul v21.4S, v21.4S,v15.4S +mla v28.4S, v29.4S, v31.s[0] +sub v29.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +ldr q17, [x17, #+2016] +ldr q10, [x17, #+2032] +sqrdmulh v15.4S, v13.4S, v10.4S +mul v13.4S, v13.4S,v17.4S +mla v21.4S, v6.4S, v31.s[0] +sub v6.4s, v4.4s, v28.4s +add v4.4s, v4.4s, v28.4s +ldr q28, [x17, #+2112] +ldr q10, [x17, #+2128] +sqrdmulh v17.4S, v4.4S, v10.4S +mul v4.4S, v4.4S,v28.4S +mla v13.4S, v15.4S, v31.s[0] +sub v15.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +ldr q21, [x17, #+2144] +ldr q10, [x17, #+2160] +sqrdmulh v28.4S, v6.4S, v10.4S +mul v6.4S, v6.4S,v21.4S +mla v4.4S, v17.4S, v31.s[0] +sub v17.4s, v9.4s, v13.4s +add v9.4s, v9.4s, v13.4s +mla v6.4S, v28.4S, v31.s[0] +sub v28.4s, v0.4s, v4.4s +add v0.4s, v0.4s, v4.4s +sub v4.4s, v29.4s, v6.4s +add v29.4s, v29.4s, v6.4s +str q19, [x0, #896] +str q15, [x0, #912] +str q9, [x0, #928] +str q17, [x0, #944] +str q0, [x0, #960] +str q28, [x0, #976] +str q29, 
[x0, #992] +str q4, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z4_2.s b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z4_2.s new file mode 100644 index 0000000..302342b --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z4_2.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 
11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // 
Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 
1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 
23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 
19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 
// Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 
1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 +.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // 
Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 
7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // 
Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_3_z4_2 +.global _ntt_u32_full_neon_asm_var_4_4_3_z4_2 +ntt_u32_full_neon_asm_var_4_4_3_z4_2: +_ntt_u32_full_neon_asm_var_4_4_3_z4_2: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +sqrdmulh v2.4S, v22.4S, v29.s[0] +ldr q1, [x0, #544] +mul v22.4S, v22.4S,v30.s[0] +ldr q0, [x0, #608] +sqrdmulh v15.4S, v21.4S, v29.s[0] +ldr q14, [x0, #672] +mul v21.4S, v21.4S,v30.s[0] +ldr q13, [x0, #736] +mla v22.4S, v2.4S, v31.s[0] 
+sqrdmulh v2.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q12, [x0, #32] +sub v11.4s, v18.4s, v22.4s +mla v21.4S, v15.4S, v31.s[0] +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q15, [x0, #96] +sub v10.4s, v17.4s, v21.4s +mla v20.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v29.s[0] +ldr q2, [x0, #160] +mul v1.4S, v1.4S,v30.s[0] +sub v9.4s, v16.4s, v20.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v29.s[0] +ldr q22, [x0, #224] +mul v0.4S, v0.4S,v30.s[0] +sub v8.4s, v3.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v12.4s, v1.4s +mla v0.4S, v20.4S, v31.s[0] +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +sub v20.4s, v15.4s, v0.4s +mla v14.4S, v19.4S, v31.s[0] +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v19.4s, v2.4s, v14.4s +mla v13.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v1.4s, v22.4s, v13.4s +mla v16.4S, v0.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v0.4s, v2.4s, v16.4s +mla v3.4S, v14.4S, v31.s[0] +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v22.4s, v3.4s +mla v18.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v13.4s, v12.4s, v18.4s +mla v17.4S, v16.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v16.4s, v15.4s, v17.4s +mla v9.4S, v3.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v3.4s, v19.4s, v9.4s +mla v8.4S, v18.4S, v31.s[0] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, 
v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v8.4s +mla v11.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v17.4s, v21.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +sub v9.4s, v20.4s, v10.4s +mla v2.4S, v8.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v8.4s, v12.4s, v2.4s +mla v22.4S, v11.4S, v31.s[0] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v27.s[1] +mul v14.4S, v14.4S,v28.s[1] +sub v11.4s, v15.4s, v22.4s +mla v0.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v28.s[2] +sub v10.4s, v13.4s, v0.4s +mla v14.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v2.4s, v16.4s, v14.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +sub v22.4s, v21.4s, v19.4s +mla v1.4S, v0.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v0.4s, v20.4s, v1.4s +mla v3.4S, v14.4S, v31.s[0] +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +sub v14.4s, v17.4s, v3.4s +mla v18.4S, v19.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v25.s[1] +mul v11.4S, v11.4S,v26.s[1] +sub v19.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v1.4s, v12.4s, v15.4s +mla v11.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v25.s[3] +mul v2.4S, v2.4S,v26.s[3] +sub v3.4s, v8.4s, v11.4s +mla v16.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v11.4s +str q12, [x0, #32] +sqrdmulh v12.4S, v20.4S, v23.s[0] +str q1, [x0, #96] +mul v20.4S, v20.4S,v24.s[0] +ldr q1, [x0, #816] 
+sub v11.4s, v13.4s, v16.4s +ldr q18, [x0, #880] +mla v2.4S, v15.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +str q8, [x0, #160] +sqrdmulh v8.4S, v0.4S, v23.s[1] +str q3, [x0, #224] +mul v0.4S, v0.4S,v24.s[1] +ldr q3, [x0, #944] +sub v16.4s, v10.4s, v2.4s +ldr q15, [x0, #1008] +mla v20.4S, v12.4S, v31.s[0] +add v10.4s, v10.4s, v2.4s +str q13, [x0, #288] +sqrdmulh v13.4S, v9.4S, v23.s[2] +str q11, [x0, #352] +mul v9.4S, v9.4S,v24.s[2] +ldr q11, [x0, #304] +sub v2.4s, v21.4s, v20.4s +ldr q12, [x0, #368] +mla v0.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v20.4s +str q10, [x0, #416] +sqrdmulh v10.4S, v19.4S, v23.s[3] +str q16, [x0, #480] +mul v19.4S, v19.4S,v24.s[3] +ldr q16, [x0, #432] +sub v20.4s, v22.4s, v0.4s +ldr q8, [x0, #496] +mla v9.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +str q21, [x0, #544] +sqrdmulh v21.4S, v1.4S, v29.s[0] +str q2, [x0, #608] +ldr q2, [x0, #560] +mul v1.4S, v1.4S,v30.s[0] +ldr q0, [x0, #624] +sub v13.4s, v17.4s, v9.4s +mla v19.4S, v10.4S, v31.s[0] +add v17.4s, v17.4s, v9.4s +str q22, [x0, #672] +sqrdmulh v22.4S, v18.4S, v29.s[0] +str q20, [x0, #736] +ldr q20, [x0, #688] +mul v18.4S, v18.4S,v30.s[0] +ldr q9, [x0, #752] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +str q17, [x0, #800] +sqrdmulh v17.4S, v3.4S, v29.s[0] +str q13, [x0, #864] +mul v3.4S, v3.4S,v30.s[0] +ldr q13, [x0, #48] +sub v19.4s, v11.4s, v1.4s +mla v18.4S, v22.4S, v31.s[0] +add v11.4s, v11.4s, v1.4s +str q14, [x0, #928] +sqrdmulh v14.4S, v15.4S, v29.s[0] +str q10, [x0, #992] +mul v15.4S, v15.4S,v30.s[0] +ldr q10, [x0, #112] +sub v1.4s, v12.4s, v18.4s +mla v3.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v29.s[0] +ldr q17, [x0, #176] +mul v2.4S, v2.4S,v30.s[0] +sub v22.4s, v16.4s, v3.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v29.s[0] +ldr q14, [x0, #240] +mul v0.4S, v0.4S,v30.s[0] +sub v21.4s, v8.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v15.4s 
+sqrdmulh v15.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v18.4s, v13.4s, v2.4s +mla v0.4S, v3.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +sub v3.4s, v10.4s, v0.4s +mla v20.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v15.4s, v17.4s, v20.4s +mla v9.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v2.4s, v14.4s, v9.4s +mla v16.4S, v0.4S, v31.s[0] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v29.s[1] +mul v11.4S, v11.4S,v30.s[1] +sub v0.4s, v17.4s, v16.4s +mla v8.4S, v20.4S, v31.s[0] +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v29.s[1] +mul v12.4S, v12.4S,v30.s[1] +sub v20.4s, v14.4s, v8.4s +mla v11.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v9.4s, v13.4s, v11.4s +mla v12.4S, v16.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v16.4s, v10.4s, v12.4s +mla v22.4S, v8.4S, v31.s[0] +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v8.4s, v15.4s, v22.4s +mla v21.4S, v11.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +sub v11.4s, v2.4s, v21.4s +mla v19.4S, v12.4S, v31.s[0] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +sub v12.4s, v18.4s, v19.4s +mla v1.4S, v22.4S, v31.s[0] +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v22.4s, v3.4s, v1.4s +mla v17.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v21.4s, v13.4s, v17.4s +mla v14.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +sub v19.4s, v10.4s, v14.4s +mla 
v0.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v27.s[2] +mul v15.4S, v15.4S,v28.s[2] +sub v1.4s, v9.4s, v0.4s +mla v20.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v17.4s, v16.4s, v20.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v14.4s, v18.4s, v15.4s +mla v2.4S, v0.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[3] +mul v11.4S, v11.4S,v28.s[3] +sub v0.4s, v3.4s, v2.4s +mla v8.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +sub v20.4s, v12.4s, v8.4s +mla v11.4S, v15.4S, v31.s[0] +add v12.4s, v12.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v15.4s, v22.4s, v11.4s +mla v10.4S, v2.4S, v31.s[0] +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v2.4s, v13.4s, v10.4s +mla v19.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v25.s[3] +mul v17.4S, v17.4S,v26.s[3] +sub v8.4s, v21.4s, v19.4s +mla v16.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +str q13, [x0, #48] +sqrdmulh v13.4S, v3.4S, v23.s[0] +str q2, [x0, #112] +mul v3.4S, v3.4S,v24.s[0] +ldr q2, [x0, #768] +sub v19.4s, v9.4s, v16.4s +ldr q11, [x0, #832] +mla v17.4S, v10.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +str q21, [x0, #176] +sqrdmulh v21.4S, v0.4S, v23.s[1] +str q8, [x0, #240] +mul v0.4S, v0.4S,v24.s[1] +ldr q8, [x0, #896] +sub v16.4s, v1.4s, v17.4s +ldr q10, [x0, #960] +mla v3.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +str q9, [x0, #304] +sqrdmulh v9.4S, v22.4S, v23.s[2] +str q19, [x0, #368] +mul v22.4S, v22.4S,v24.s[2] +ldr q19, [x0, #256] +sub v17.4s, v18.4s, v3.4s +ldr q13, [x0, #320] +mla v0.4S, v21.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +str q1, [x0, #432] +sqrdmulh v1.4S, v15.4S, v23.s[3] +str q16, [x0, #496] +mul v15.4S, 
v15.4S,v24.s[3] +ldr q16, [x0, #384] +sub v3.4s, v14.4s, v0.4s +ldr q21, [x0, #448] +mla v22.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +str q18, [x0, #560] +sqrdmulh v18.4S, v2.4S, v29.s[0] +str q17, [x0, #624] +ldr q17, [x0, #512] +mul v2.4S, v2.4S,v30.s[0] +ldr q0, [x0, #576] +sub v9.4s, v12.4s, v22.4s +mla v15.4S, v1.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s +str q14, [x0, #688] +sqrdmulh v14.4S, v11.4S, v29.s[0] +str q3, [x0, #752] +ldr q3, [x0, #640] +mul v11.4S, v11.4S,v30.s[0] +ldr q22, [x0, #704] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +str q12, [x0, #816] +sqrdmulh v12.4S, v8.4S, v29.s[0] +str q9, [x0, #880] +mul v8.4S, v8.4S,v30.s[0] +ldr q9, [x0, #0] +sub v15.4s, v19.4s, v2.4s +mla v11.4S, v14.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +str q20, [x0, #944] +sqrdmulh v20.4S, v10.4S, v29.s[0] +str q1, [x0, #1008] +mul v10.4S, v10.4S,v30.s[0] +ldr q1, [x0, #64] +sub v2.4s, v13.4s, v11.4s +mla v8.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v29.s[0] +ldr q12, [x0, #128] +mul v17.4S, v17.4S,v30.s[0] +sub v14.4s, v16.4s, v8.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v29.s[0] +ldr q20, [x0, #192] +mul v0.4S, v0.4S,v30.s[0] +sub v18.4s, v21.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +sub v11.4s, v9.4s, v17.4s +mla v0.4S, v8.4S, v31.s[0] +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v8.4s, v1.4s, v0.4s +mla v3.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v10.4s, v12.4s, v3.4s +mla v22.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v17.4s, v20.4s, v22.4s +mla v16.4S, v0.4S, v31.s[0] +add v20.4s, v20.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +sub 
v0.4s, v12.4s, v16.4s +mla v21.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v3.4s, v20.4s, v21.4s +mla v19.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v22.4s, v9.4s, v19.4s +mla v13.4S, v16.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v16.4s, v1.4s, v13.4s +mla v14.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v21.4s, v10.4s, v14.4s +mla v18.4S, v19.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v19.4s, v17.4s, v18.4s +mla v15.4S, v13.4S, v31.s[0] +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v13.4s, v11.4s, v15.4s +mla v2.4S, v14.4S, v31.s[0] +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v27.s[0] +mul v20.4S, v20.4S,v28.s[0] +sub v14.4s, v8.4s, v2.4s +mla v12.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v18.4s, v9.4s, v12.4s +mla v20.4S, v15.4S, v31.s[0] +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +sub v15.4s, v1.4s, v20.4s +mla v0.4S, v2.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +sub v2.4s, v22.4s, v0.4s +mla v3.4S, v12.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v12.4s, v16.4s, v3.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v11.4s, v10.4s +mla v17.4S, v0.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v27.s[3] +mul v19.4S, v19.4S,v28.s[3] +sub v0.4s, v8.4s, v17.4s +mla v21.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v17.4s +sqrdmulh 
v17.4S, v1.4S, v25.s[0] +mul v1.4S, v1.4S,v26.s[0] +sub v3.4s, v13.4s, v21.4s +mla v19.4S, v10.4S, v31.s[0] +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v25.s[1] +mul v15.4S, v15.4S,v26.s[1] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v17.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v17.4s, v9.4s, v1.4s +mla v15.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +sub v21.4s, v18.4s, v15.4s +mla v16.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +str q9, [x0, #0] +sqrdmulh v9.4S, v8.4S, v23.s[0] +str q17, [x0, #64] +mul v8.4S, v8.4S,v24.s[0] +ldr q17, [x0, #784] +sub v15.4s, v22.4s, v16.4s +ldr q19, [x0, #848] +mla v12.4S, v1.4S, v31.s[0] +add v22.4s, v22.4s, v16.4s +str q18, [x0, #128] +sqrdmulh v18.4S, v0.4S, v23.s[1] +str q21, [x0, #192] +mul v0.4S, v0.4S,v24.s[1] +ldr q21, [x0, #912] +sub v16.4s, v2.4s, v12.4s +ldr q1, [x0, #976] +mla v8.4S, v9.4S, v31.s[0] +add v2.4s, v2.4s, v12.4s +str q22, [x0, #256] +sqrdmulh v22.4S, v14.4S, v23.s[2] +str q15, [x0, #320] +mul v14.4S, v14.4S,v24.s[2] +ldr q15, [x0, #272] +sub v12.4s, v11.4s, v8.4s +ldr q9, [x0, #336] +mla v0.4S, v18.4S, v31.s[0] +add v11.4s, v11.4s, v8.4s +str q2, [x0, #384] +sqrdmulh v2.4S, v10.4S, v23.s[3] +str q16, [x0, #448] +mul v10.4S, v10.4S,v24.s[3] +ldr q16, [x0, #400] +sub v8.4s, v20.4s, v0.4s +ldr q18, [x0, #464] +mla v14.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v0.4s +str q11, [x0, #512] +sqrdmulh v11.4S, v17.4S, v29.s[0] +str q12, [x0, #576] +ldr q12, [x0, #528] +mul v17.4S, v17.4S,v30.s[0] +ldr q0, [x0, #592] +sub v22.4s, v13.4s, v14.4s +mla v10.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v14.4s +str q20, [x0, #640] +sqrdmulh v20.4S, v19.4S, v29.s[0] +str q8, [x0, #704] +ldr q8, [x0, #656] +mul v19.4S, v19.4S,v30.s[0] +ldr q14, [x0, #720] +sub v2.4s, v3.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v3.4s, v3.4s, v10.4s +str q13, [x0, #768] +sqrdmulh v13.4S, v21.4S, 
v29.s[0] +str q22, [x0, #832] +mul v21.4S, v21.4S,v30.s[0] +ldr q22, [x0, #16] +sub v10.4s, v15.4s, v17.4s +mla v19.4S, v20.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +str q3, [x0, #896] +sqrdmulh v3.4S, v1.4S, v29.s[0] +str q2, [x0, #960] +mul v1.4S, v1.4S,v30.s[0] +ldr q2, [x0, #80] +sub v17.4s, v9.4s, v19.4s +mla v21.4S, v13.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v12.4S, v29.s[0] +ldr q13, [x0, #144] +mul v12.4S, v12.4S,v30.s[0] +sub v20.4s, v16.4s, v21.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v29.s[0] +ldr q3, [x0, #208] +mul v0.4S, v0.4S,v30.s[0] +sub v11.4s, v18.4s, v1.4s +mla v12.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v19.4s, v22.4s, v12.4s +mla v0.4S, v21.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v2.4s, v0.4s +mla v8.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v1.4s, v13.4s, v8.4s +mla v14.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v12.4s, v3.4s, v14.4s +mla v16.4S, v0.4S, v31.s[0] +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +sub v0.4s, v13.4s, v16.4s +mla v18.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v9.4S, v29.s[1] +mul v9.4S, v9.4S,v30.s[1] +sub v8.4s, v3.4s, v18.4s +mla v15.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v14.4s, v22.4s, v15.4s +mla v9.4S, v16.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v16.4s, v2.4s, v9.4s +mla v20.4S, v18.4S, v31.s[0] +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v20.4s +mla v11.4S, v15.4S, v31.s[0] +add v1.4s, 
v1.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v15.4s, v12.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +sub v9.4s, v19.4s, v10.4s +mla v17.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v27.s[0] +mul v3.4S, v3.4S,v28.s[0] +sub v20.4s, v21.4s, v17.4s +mla v13.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v11.4s, v22.4s, v13.4s +mla v3.4S, v10.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v10.4s, v2.4s, v3.4s +mla v0.4S, v17.4S, v31.s[0] +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v17.4s, v14.4s, v0.4s +mla v8.4S, v13.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +sub v13.4s, v16.4s, v8.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v3.4s, v19.4s, v1.4s +mla v12.4S, v0.4S, v31.s[0] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v0.4s, v21.4s, v12.4s +mla v18.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v25.s[1] +mul v10.4S, v10.4S,v26.s[1] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v12.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v12.4s, v22.4s, v2.4s +mla v10.4S, v18.4S, v31.s[0] +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v13.4S, v25.s[3] +mul v13.4S, v13.4S,v26.s[3] +sub v18.4s, v11.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +str q22, [x0, #16] +sqrdmulh v22.4S, v21.4S, v23.s[0] +str q12, [x0, #80] +mul 
v21.4S, v21.4S,v24.s[0] +sub v12.4s, v14.4s, v16.4s +mla v13.4S, v2.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +str q11, [x0, #144] +sqrdmulh v11.4S, v0.4S, v23.s[1] +str q18, [x0, #208] +mul v0.4S, v0.4S,v24.s[1] +sub v18.4s, v17.4s, v13.4s +mla v21.4S, v22.4S, v31.s[0] +add v17.4s, v17.4s, v13.4s +str q14, [x0, #272] +sqrdmulh v14.4S, v20.4S, v23.s[2] +str q12, [x0, #336] +mul v20.4S, v20.4S,v24.s[2] +sub v12.4s, v19.4s, v21.4s +mla v0.4S, v11.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +str q17, [x0, #400] +sqrdmulh v17.4S, v1.4S, v23.s[3] +str q18, [x0, #464] +mul v1.4S, v1.4S,v24.s[3] +sub v18.4s, v3.4s, v0.4s +mla v20.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v0.4s +str q19, [x0, #528] +str q12, [x0, #592] +sub v12.4s, v9.4s, v20.4s +mla v1.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v20.4s +str q3, [x0, #656] +str q18, [x0, #720] +sub v18.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +str q9, [x0, #784] +str q12, [x0, #848] +str q8, [x0, #912] +str q18, [x0, #976] +ldr q4, [x0, #32] +ldr q5, [x0, #48] +ldr q6, [x0, #0] +ldr q7, [x0, #16] +ldr q15, [x0, #96] +ldr q10, [x0, #112] +ldr q2, [x0, #64] +ldr q16, [x0, #80] +ldr q22, [x0, #160] +ldr q13, [x0, #176] +ldr q11, [x0, #128] +ldr q21, [x0, #144] +ldr q14, [x0, #224] +ldr q0, [x0, #240] +ldr q19, [x0, #192] +ldr q17, [x0, #208] +ldr q20, [x17, #+128] +ldr q3, [x17, #+144] +ldr q1, [x17, #+256] +ldr q9, [x17, #+272] +ldr q12, [x17, #+384] +ldr q8, [x17, #+400] +ldr q18, [x17, #+512] +ldr q30, [x17, #+528] +sqrdmulh v29.4S, v4.4S, v3.s[0] +mul v4.4S, v4.4S,v20.s[0] +sqrdmulh v28.4S, v5.4S, v3.s[0] +mul v5.4S, v5.4S,v20.s[0] +mla v4.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v15.4S, v9.s[0] +mul v15.4S, v15.4S,v1.s[0] +mla v5.4S, v28.4S, v31.s[0] +sub v28.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +sqrdmulh v4.4S, v10.4S, v9.s[0] +mul v10.4S, v10.4S,v1.s[0] +mla v15.4S, v29.4S, v31.s[0] +sub v29.4s, v7.4s, v5.4s +add v7.4s, v7.4s, v5.4s +sqrdmulh v5.4S, v7.4S, v3.s[1] +mul v7.4S, v7.4S,v20.s[1] +mla v10.4S, v4.4S, v31.s[0] +sub 
v4.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +sqrdmulh v15.4S, v29.4S, v3.s[2] +mul v29.4S, v29.4S,v20.s[2] +mla v7.4S, v5.4S, v31.s[0] +sub v5.4s, v16.4s, v10.4s +add v16.4s, v16.4s, v10.4s +sqrdmulh v10.4S, v16.4S, v9.s[1] +mul v16.4S, v16.4S,v1.s[1] +mla v29.4S, v15.4S, v31.s[0] +sub v15.4s, v6.4s, v7.4s +add v6.4s, v6.4s, v7.4s +sqrdmulh v3.4S, v5.4S, v9.s[2] +mul v5.4S, v5.4S,v1.s[2] +mla v16.4S, v10.4S, v31.s[0] +sub v10.4s, v28.4s, v29.4s +add v28.4s, v28.4s, v29.4s +sqrdmulh v29.4S, v22.4S, v8.s[0] +mul v22.4S, v22.4S,v12.s[0] +trn1 v20.4S, v6.4S, v15.4S +trn2 v7.4S, v6.4S, v15.4S +mla v5.4S, v3.4S, v31.s[0] +sub v3.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v9.4S, v13.4S, v8.s[0] +mul v13.4S, v13.4S,v12.s[0] +trn1 v1.4S, v28.4S, v10.4S +trn2 v16.4S, v28.4S, v10.4S +mla v22.4S, v29.4S, v31.s[0] +sub v29.4s, v4.4s, v5.4s +add v4.4s, v4.4s, v5.4s +sqrdmulh v5.4S, v14.4S, v30.s[0] +mul v14.4S, v14.4S,v18.s[0] +trn2 v28.2D, v20.2D, v1.2D +trn2 v10.2D, v7.2D, v16.2D +mla v13.4S, v9.4S, v31.s[0] +sub v9.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v0.4S, v30.s[0] +mul v0.4S, v0.4S,v18.s[0] +trn1 v6.2D, v20.2D, v1.2D +trn1 v15.2D, v7.2D, v16.2D +mla v14.4S, v5.4S, v31.s[0] +sub v5.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v21.4S, v8.s[1] +mul v21.4S, v21.4S,v12.s[1] +trn1 v16.4S, v2.4S, v3.4S +trn2 v7.4S, v2.4S, v3.4S +mla v0.4S, v22.4S, v31.s[0] +sub v22.4s, v19.4s, v14.4s +add v19.4s, v19.4s, v14.4s +sqrdmulh v14.4S, v5.4S, v8.s[2] +mul v5.4S, v5.4S,v12.s[2] +trn1 v1.4S, v4.4S, v29.4S +trn2 v20.4S, v4.4S, v29.4S +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v0.4s +add v17.4s, v17.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v30.s[1] +mul v17.4S, v17.4S,v18.s[1] +trn2 v4.2D, v16.2D, v1.2D +trn2 v29.2D, v7.2D, v20.2D +mla v5.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v8.4S, v13.4S, v30.s[2] +mul v13.4S, v13.4S,v18.s[2] +trn1 v2.2D, v16.2D, v1.2D +trn1 v3.2D, v7.2D, 
v20.2D +mla v17.4S, v0.4S, v31.s[0] +sub v0.4s, v9.4s, v5.4s +add v9.4s, v9.4s, v5.4s +mla v13.4S, v8.4S, v31.s[0] +sub v8.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +sub v30.4s, v22.4s, v13.4s +add v22.4s, v22.4s, v13.4s +ldr q13, [x17, #+160] +ldr q18, [x17, #+176] +sqrdmulh v17.4S, v28.4S, v18.4S +mul v28.4S, v28.4S,v13.4S +trn1 v5.4S, v11.4S, v14.4S +trn2 v20.4S, v11.4S, v14.4S +sqrdmulh v7.4S, v10.4S, v18.4S +mul v10.4S, v10.4S,v13.4S +trn1 v1.4S, v9.4S, v0.4S +trn2 v16.4S, v9.4S, v0.4S +mla v28.4S, v17.4S, v31.s[0] +ldr q17, [x17, #+288] +ldr q18, [x17, #+304] +sqrdmulh v13.4S, v4.4S, v18.4S +mul v4.4S, v4.4S,v17.4S +trn2 v9.2D, v5.2D, v1.2D +trn2 v0.2D, v20.2D, v16.2D +mla v10.4S, v7.4S, v31.s[0] +sub v7.4s, v6.4s, v28.4s +add v6.4s, v6.4s, v28.4s +sqrdmulh v28.4S, v29.4S, v18.4S +mul v29.4S, v29.4S,v17.4S +trn1 v11.2D, v5.2D, v1.2D +trn1 v14.2D, v20.2D, v16.2D +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +ldr q10, [x17, #+192] +ldr q16, [x17, #+208] +sqrdmulh v20.4S, v15.4S, v16.4S +mul v15.4S, v15.4S,v10.4S +trn1 v16.4S, v19.4S, v8.4S +trn2 v10.4S, v19.4S, v8.4S +mla v29.4S, v28.4S, v31.s[0] +sub v28.4s, v2.4s, v4.4s +add v2.4s, v2.4s, v4.4s +ldr q4, [x17, #+224] +ldr q1, [x17, #+240] +sqrdmulh v5.4S, v13.4S, v1.4S +mul v13.4S, v13.4S,v4.4S +trn1 v1.4S, v22.4S, v30.4S +trn2 v4.4S, v22.4S, v30.4S +mla v15.4S, v20.4S, v31.s[0] +sub v20.4s, v3.4s, v29.4s +add v3.4s, v3.4s, v29.4s +ldr q29, [x17, #+320] +ldr q18, [x17, #+336] +sqrdmulh v17.4S, v3.4S, v18.4S +mul v3.4S, v3.4S,v29.4S +trn2 v22.2D, v16.2D, v1.2D +trn2 v30.2D, v10.2D, v4.2D +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v6.4s, v15.4s +add v6.4s, v6.4s, v15.4s +ldr q15, [x17, #+352] +ldr q18, [x17, #+368] +sqrdmulh v29.4S, v20.4S, v18.4S +mul v20.4S, v20.4S,v15.4S +trn1 v19.2D, v16.2D, v1.2D +trn1 v8.2D, v10.2D, v4.2D +mla v3.4S, v17.4S, v31.s[0] +sub v17.4s, v7.4s, v13.4s +add v7.4s, v7.4s, v13.4s +mla v20.4S, v29.4S, v31.s[0] +sub v29.4s, v2.4s, v3.4s +add 
v2.4s, v2.4s, v3.4s +sub v3.4s, v28.4s, v20.4s +add v28.4s, v28.4s, v20.4s +str q6, [x0, #0] +str q5, [x0, #16] +str q7, [x0, #32] +str q17, [x0, #48] +str q2, [x0, #64] +str q29, [x0, #80] +str q28, [x0, #96] +str q3, [x0, #112] +ldr q3, [x17, #+416] +ldr q28, [x17, #+432] +sqrdmulh v29.4S, v9.4S, v28.4S +mul v9.4S, v9.4S,v3.4S +sqrdmulh v2.4S, v0.4S, v28.4S +mul v0.4S, v0.4S,v3.4S +mla v9.4S, v29.4S, v31.s[0] +ldr q29, [x17, #+544] +ldr q28, [x17, #+560] +sqrdmulh v3.4S, v22.4S, v28.4S +mul v22.4S, v22.4S,v29.4S +mla v0.4S, v2.4S, v31.s[0] +sub v2.4s, v11.4s, v9.4s +add v11.4s, v11.4s, v9.4s +sqrdmulh v9.4S, v30.4S, v28.4S +mul v30.4S, v30.4S,v29.4S +mla v22.4S, v3.4S, v31.s[0] +sub v3.4s, v14.4s, v0.4s +add v14.4s, v14.4s, v0.4s +ldr q0, [x17, #+448] +ldr q28, [x17, #+464] +sqrdmulh v29.4S, v14.4S, v28.4S +mul v14.4S, v14.4S,v0.4S +mla v30.4S, v9.4S, v31.s[0] +sub v9.4s, v19.4s, v22.4s +add v19.4s, v19.4s, v22.4s +ldr q22, [x17, #+480] +ldr q28, [x17, #+496] +sqrdmulh v0.4S, v3.4S, v28.4S +mul v3.4S, v3.4S,v22.4S +mla v14.4S, v29.4S, v31.s[0] +sub v29.4s, v8.4s, v30.4s +add v8.4s, v8.4s, v30.4s +ldr q30, [x17, #+576] +ldr q28, [x17, #+592] +sqrdmulh v22.4S, v8.4S, v28.4S +mul v8.4S, v8.4S,v30.4S +mla v3.4S, v0.4S, v31.s[0] +sub v0.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +ldr q14, [x17, #+608] +ldr q28, [x17, #+624] +sqrdmulh v30.4S, v29.4S, v28.4S +mul v29.4S, v29.4S,v14.4S +mla v8.4S, v22.4S, v31.s[0] +sub v22.4s, v2.4s, v3.4s +add v2.4s, v2.4s, v3.4s +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v19.4s, v8.4s +add v19.4s, v19.4s, v8.4s +sub v8.4s, v9.4s, v29.4s +add v9.4s, v9.4s, v29.4s +str q11, [x0, #128] +str q0, [x0, #144] +str q2, [x0, #160] +str q22, [x0, #176] +str q19, [x0, #192] +str q30, [x0, #208] +str q9, [x0, #224] +str q8, [x0, #240] +ldr q8, [x0, #288] +ldr q9, [x0, #304] +ldr q30, [x0, #256] +ldr q19, [x0, #272] +ldr q22, [x0, #352] +ldr q2, [x0, #368] +ldr q0, [x0, #320] +ldr q11, [x0, #336] +ldr q29, [x0, #416] +ldr q3, [x0, #432] +ldr 
q28, [x0, #384] +ldr q14, [x0, #400] +ldr q17, [x0, #480] +ldr q7, [x0, #496] +ldr q5, [x0, #448] +ldr q6, [x0, #464] +ldr q20, [x17, #+640] +ldr q13, [x17, #+656] +ldr q4, [x17, #+768] +ldr q10, [x17, #+784] +ldr q1, [x17, #+896] +ldr q16, [x17, #+912] +ldr q18, [x17, #+1024] +ldr q15, [x17, #+1040] +sqrdmulh v12.4S, v8.4S, v13.s[0] +mul v8.4S, v8.4S,v20.s[0] +sqrdmulh v21.4S, v9.4S, v13.s[0] +mul v9.4S, v9.4S,v20.s[0] +mla v8.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v22.4S, v10.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v9.4S, v21.4S, v31.s[0] +sub v21.4s, v30.4s, v8.4s +add v30.4s, v30.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v10.s[0] +mul v2.4S, v2.4S,v4.s[0] +mla v22.4S, v12.4S, v31.s[0] +sub v12.4s, v19.4s, v9.4s +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v19.4S, v13.s[1] +mul v19.4S, v19.4S,v20.s[1] +mla v2.4S, v8.4S, v31.s[0] +sub v8.4s, v0.4s, v22.4s +add v0.4s, v0.4s, v22.4s +sqrdmulh v22.4S, v12.4S, v13.s[2] +mul v12.4S, v12.4S,v20.s[2] +mla v19.4S, v9.4S, v31.s[0] +sub v9.4s, v11.4s, v2.4s +add v11.4s, v11.4s, v2.4s +sqrdmulh v2.4S, v11.4S, v10.s[1] +mul v11.4S, v11.4S,v4.s[1] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v30.4s, v19.4s +add v30.4s, v30.4s, v19.4s +sqrdmulh v13.4S, v9.4S, v10.s[2] +mul v9.4S, v9.4S,v4.s[2] +mla v11.4S, v2.4S, v31.s[0] +sub v2.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v29.4S, v16.s[0] +mul v29.4S, v29.4S,v1.s[0] +trn1 v20.4S, v30.4S, v22.4S +trn2 v19.4S, v30.4S, v22.4S +mla v9.4S, v13.4S, v31.s[0] +sub v13.4s, v0.4s, v11.4s +add v0.4s, v0.4s, v11.4s +sqrdmulh v10.4S, v3.4S, v16.s[0] +mul v3.4S, v3.4S,v1.s[0] +trn1 v4.4S, v21.4S, v2.4S +trn2 v11.4S, v21.4S, v2.4S +mla v29.4S, v12.4S, v31.s[0] +sub v12.4s, v8.4s, v9.4s +add v8.4s, v8.4s, v9.4s +sqrdmulh v9.4S, v17.4S, v15.s[0] +mul v17.4S, v17.4S,v18.s[0] +trn2 v21.2D, v20.2D, v4.2D +trn2 v2.2D, v19.2D, v11.2D +mla v3.4S, v10.4S, v31.s[0] +sub v10.4s, v28.4s, v29.4s +add v28.4s, v28.4s, v29.4s +sqrdmulh v29.4S, v7.4S, v15.s[0] +mul v7.4S, v7.4S,v18.s[0] +trn1 
v30.2D, v20.2D, v4.2D +trn1 v22.2D, v19.2D, v11.2D +mla v17.4S, v9.4S, v31.s[0] +sub v9.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v16.s[1] +mul v14.4S, v14.4S,v1.s[1] +trn1 v11.4S, v0.4S, v13.4S +trn2 v19.4S, v0.4S, v13.4S +mla v7.4S, v29.4S, v31.s[0] +sub v29.4s, v5.4s, v17.4s +add v5.4s, v5.4s, v17.4s +sqrdmulh v17.4S, v9.4S, v16.s[2] +mul v9.4S, v9.4S,v1.s[2] +trn1 v4.4S, v8.4S, v12.4S +trn2 v20.4S, v8.4S, v12.4S +mla v14.4S, v3.4S, v31.s[0] +sub v3.4s, v6.4s, v7.4s +add v6.4s, v6.4s, v7.4s +sqrdmulh v7.4S, v6.4S, v15.s[1] +mul v6.4S, v6.4S,v18.s[1] +trn2 v8.2D, v11.2D, v4.2D +trn2 v12.2D, v19.2D, v20.2D +mla v9.4S, v17.4S, v31.s[0] +sub v17.4s, v28.4s, v14.4s +add v28.4s, v28.4s, v14.4s +sqrdmulh v16.4S, v3.4S, v15.s[2] +mul v3.4S, v3.4S,v18.s[2] +trn1 v0.2D, v11.2D, v4.2D +trn1 v13.2D, v19.2D, v20.2D +mla v6.4S, v7.4S, v31.s[0] +sub v7.4s, v10.4s, v9.4s +add v10.4s, v10.4s, v9.4s +mla v3.4S, v16.4S, v31.s[0] +sub v16.4s, v5.4s, v6.4s +add v5.4s, v5.4s, v6.4s +sub v15.4s, v29.4s, v3.4s +add v29.4s, v29.4s, v3.4s +ldr q3, [x17, #+672] +ldr q18, [x17, #+688] +sqrdmulh v6.4S, v21.4S, v18.4S +mul v21.4S, v21.4S,v3.4S +trn1 v9.4S, v28.4S, v17.4S +trn2 v20.4S, v28.4S, v17.4S +sqrdmulh v19.4S, v2.4S, v18.4S +mul v2.4S, v2.4S,v3.4S +trn1 v4.4S, v10.4S, v7.4S +trn2 v11.4S, v10.4S, v7.4S +mla v21.4S, v6.4S, v31.s[0] +ldr q6, [x17, #+800] +ldr q18, [x17, #+816] +sqrdmulh v3.4S, v8.4S, v18.4S +mul v8.4S, v8.4S,v6.4S +trn2 v10.2D, v9.2D, v4.2D +trn2 v7.2D, v20.2D, v11.2D +mla v2.4S, v19.4S, v31.s[0] +sub v19.4s, v30.4s, v21.4s +add v30.4s, v30.4s, v21.4s +sqrdmulh v21.4S, v12.4S, v18.4S +mul v12.4S, v12.4S,v6.4S +trn1 v28.2D, v9.2D, v4.2D +trn1 v17.2D, v20.2D, v11.2D +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v22.4s, v2.4s +add v22.4s, v22.4s, v2.4s +ldr q2, [x17, #+704] +ldr q11, [x17, #+720] +sqrdmulh v20.4S, v22.4S, v11.4S +mul v22.4S, v22.4S,v2.4S +trn1 v11.4S, v5.4S, v16.4S +trn2 v2.4S, v5.4S, v16.4S +mla v12.4S, v21.4S, v31.s[0] +sub v21.4s, 
v0.4s, v8.4s +add v0.4s, v0.4s, v8.4s +ldr q8, [x17, #+736] +ldr q4, [x17, #+752] +sqrdmulh v9.4S, v3.4S, v4.4S +mul v3.4S, v3.4S,v8.4S +trn1 v4.4S, v29.4S, v15.4S +trn2 v8.4S, v29.4S, v15.4S +mla v22.4S, v20.4S, v31.s[0] +sub v20.4s, v13.4s, v12.4s +add v13.4s, v13.4s, v12.4s +ldr q12, [x17, #+832] +ldr q18, [x17, #+848] +sqrdmulh v6.4S, v13.4S, v18.4S +mul v13.4S, v13.4S,v12.4S +trn2 v29.2D, v11.2D, v4.2D +trn2 v15.2D, v2.2D, v8.2D +mla v3.4S, v9.4S, v31.s[0] +sub v9.4s, v30.4s, v22.4s +add v30.4s, v30.4s, v22.4s +ldr q22, [x17, #+864] +ldr q18, [x17, #+880] +sqrdmulh v12.4S, v20.4S, v18.4S +mul v20.4S, v20.4S,v22.4S +trn1 v5.2D, v11.2D, v4.2D +trn1 v16.2D, v2.2D, v8.2D +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +mla v20.4S, v12.4S, v31.s[0] +sub v12.4s, v0.4s, v13.4s +add v0.4s, v0.4s, v13.4s +sub v13.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +str q30, [x0, #256] +str q9, [x0, #272] +str q19, [x0, #288] +str q6, [x0, #304] +str q0, [x0, #320] +str q12, [x0, #336] +str q21, [x0, #352] +str q13, [x0, #368] +ldr q13, [x17, #+928] +ldr q21, [x17, #+944] +sqrdmulh v12.4S, v10.4S, v21.4S +mul v10.4S, v10.4S,v13.4S +sqrdmulh v0.4S, v7.4S, v21.4S +mul v7.4S, v7.4S,v13.4S +mla v10.4S, v12.4S, v31.s[0] +ldr q12, [x17, #+1056] +ldr q21, [x17, #+1072] +sqrdmulh v13.4S, v29.4S, v21.4S +mul v29.4S, v29.4S,v12.4S +mla v7.4S, v0.4S, v31.s[0] +sub v0.4s, v28.4s, v10.4s +add v28.4s, v28.4s, v10.4s +sqrdmulh v10.4S, v15.4S, v21.4S +mul v15.4S, v15.4S,v12.4S +mla v29.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v7.4s +add v17.4s, v17.4s, v7.4s +ldr q7, [x17, #+960] +ldr q21, [x17, #+976] +sqrdmulh v12.4S, v17.4S, v21.4S +mul v17.4S, v17.4S,v7.4S +mla v15.4S, v10.4S, v31.s[0] +sub v10.4s, v5.4s, v29.4s +add v5.4s, v5.4s, v29.4s +ldr q29, [x17, #+992] +ldr q21, [x17, #+1008] +sqrdmulh v7.4S, v13.4S, v21.4S +mul v13.4S, v13.4S,v29.4S +mla v17.4S, v12.4S, v31.s[0] +sub v12.4s, v16.4s, v15.4s +add v16.4s, v16.4s, v15.4s +ldr q15, [x17, #+1088] 
+ldr q21, [x17, #+1104] +sqrdmulh v29.4S, v16.4S, v21.4S +mul v16.4S, v16.4S,v15.4S +mla v13.4S, v7.4S, v31.s[0] +sub v7.4s, v28.4s, v17.4s +add v28.4s, v28.4s, v17.4s +ldr q17, [x17, #+1120] +ldr q21, [x17, #+1136] +sqrdmulh v15.4S, v12.4S, v21.4S +mul v12.4S, v12.4S,v17.4S +mla v16.4S, v29.4S, v31.s[0] +sub v29.4s, v0.4s, v13.4s +add v0.4s, v0.4s, v13.4s +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v5.4s, v16.4s +add v5.4s, v5.4s, v16.4s +sub v16.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +str q28, [x0, #384] +str q7, [x0, #400] +str q0, [x0, #416] +str q29, [x0, #432] +str q5, [x0, #448] +str q15, [x0, #464] +str q10, [x0, #480] +str q16, [x0, #496] +ldr q16, [x0, #544] +ldr q10, [x0, #560] +ldr q15, [x0, #512] +ldr q5, [x0, #528] +ldr q29, [x0, #608] +ldr q0, [x0, #624] +ldr q7, [x0, #576] +ldr q28, [x0, #592] +ldr q12, [x0, #672] +ldr q13, [x0, #688] +ldr q21, [x0, #640] +ldr q17, [x0, #656] +ldr q6, [x0, #736] +ldr q19, [x0, #752] +ldr q9, [x0, #704] +ldr q30, [x0, #720] +ldr q20, [x17, #+1152] +ldr q3, [x17, #+1168] +ldr q8, [x17, #+1280] +ldr q2, [x17, #+1296] +ldr q4, [x17, #+1408] +ldr q11, [x17, #+1424] +ldr q18, [x17, #+1536] +ldr q22, [x17, #+1552] +sqrdmulh v1.4S, v16.4S, v3.s[0] +mul v16.4S, v16.4S,v20.s[0] +sqrdmulh v14.4S, v10.4S, v3.s[0] +mul v10.4S, v10.4S,v20.s[0] +mla v16.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v29.4S, v2.s[0] +mul v29.4S, v29.4S,v8.s[0] +mla v10.4S, v14.4S, v31.s[0] +sub v14.4s, v15.4s, v16.4s +add v15.4s, v15.4s, v16.4s +sqrdmulh v16.4S, v0.4S, v2.s[0] +mul v0.4S, v0.4S,v8.s[0] +mla v29.4S, v1.4S, v31.s[0] +sub v1.4s, v5.4s, v10.4s +add v5.4s, v5.4s, v10.4s +sqrdmulh v10.4S, v5.4S, v3.s[1] +mul v5.4S, v5.4S,v20.s[1] +mla v0.4S, v16.4S, v31.s[0] +sub v16.4s, v7.4s, v29.4s +add v7.4s, v7.4s, v29.4s +sqrdmulh v29.4S, v1.4S, v3.s[2] +mul v1.4S, v1.4S,v20.s[2] +mla v5.4S, v10.4S, v31.s[0] +sub v10.4s, v28.4s, v0.4s +add v28.4s, v28.4s, v0.4s +sqrdmulh v0.4S, v28.4S, v2.s[1] +mul v28.4S, v28.4S,v8.s[1] +mla v1.4S, v29.4S, v31.s[0] 
+sub v29.4s, v15.4s, v5.4s +add v15.4s, v15.4s, v5.4s +sqrdmulh v3.4S, v10.4S, v2.s[2] +mul v10.4S, v10.4S,v8.s[2] +mla v28.4S, v0.4S, v31.s[0] +sub v0.4s, v14.4s, v1.4s +add v14.4s, v14.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v11.s[0] +mul v12.4S, v12.4S,v4.s[0] +trn1 v20.4S, v15.4S, v29.4S +trn2 v5.4S, v15.4S, v29.4S +mla v10.4S, v3.4S, v31.s[0] +sub v3.4s, v7.4s, v28.4s +add v7.4s, v7.4s, v28.4s +sqrdmulh v2.4S, v13.4S, v11.s[0] +mul v13.4S, v13.4S,v4.s[0] +trn1 v8.4S, v14.4S, v0.4S +trn2 v28.4S, v14.4S, v0.4S +mla v12.4S, v1.4S, v31.s[0] +sub v1.4s, v16.4s, v10.4s +add v16.4s, v16.4s, v10.4s +sqrdmulh v10.4S, v6.4S, v22.s[0] +mul v6.4S, v6.4S,v18.s[0] +trn2 v14.2D, v20.2D, v8.2D +trn2 v0.2D, v5.2D, v28.2D +mla v13.4S, v2.4S, v31.s[0] +sub v2.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v18.s[0] +trn1 v15.2D, v20.2D, v8.2D +trn1 v29.2D, v5.2D, v28.2D +mla v6.4S, v10.4S, v31.s[0] +sub v10.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v11.s[1] +mul v17.4S, v17.4S,v4.s[1] +trn1 v28.4S, v7.4S, v3.4S +trn2 v5.4S, v7.4S, v3.4S +mla v19.4S, v12.4S, v31.s[0] +sub v12.4s, v9.4s, v6.4s +add v9.4s, v9.4s, v6.4s +sqrdmulh v6.4S, v10.4S, v11.s[2] +mul v10.4S, v10.4S,v4.s[2] +trn1 v8.4S, v16.4S, v1.4S +trn2 v20.4S, v16.4S, v1.4S +mla v17.4S, v13.4S, v31.s[0] +sub v13.4s, v30.4s, v19.4s +add v30.4s, v30.4s, v19.4s +sqrdmulh v19.4S, v30.4S, v22.s[1] +mul v30.4S, v30.4S,v18.s[1] +trn2 v16.2D, v28.2D, v8.2D +trn2 v1.2D, v5.2D, v20.2D +mla v10.4S, v6.4S, v31.s[0] +sub v6.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v11.4S, v13.4S, v22.s[2] +mul v13.4S, v13.4S,v18.s[2] +trn1 v7.2D, v28.2D, v8.2D +trn1 v3.2D, v5.2D, v20.2D +mla v30.4S, v19.4S, v31.s[0] +sub v19.4s, v2.4s, v10.4s +add v2.4s, v2.4s, v10.4s +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v9.4s, v30.4s +add v9.4s, v9.4s, v30.4s +sub v22.4s, v12.4s, v13.4s +add v12.4s, v12.4s, v13.4s +ldr q13, [x17, #+1184] +ldr q18, [x17, #+1200] 
+sqrdmulh v30.4S, v14.4S, v18.4S +mul v14.4S, v14.4S,v13.4S +trn1 v10.4S, v21.4S, v6.4S +trn2 v20.4S, v21.4S, v6.4S +sqrdmulh v5.4S, v0.4S, v18.4S +mul v0.4S, v0.4S,v13.4S +trn1 v8.4S, v2.4S, v19.4S +trn2 v28.4S, v2.4S, v19.4S +mla v14.4S, v30.4S, v31.s[0] +ldr q30, [x17, #+1312] +ldr q18, [x17, #+1328] +sqrdmulh v13.4S, v16.4S, v18.4S +mul v16.4S, v16.4S,v30.4S +trn2 v2.2D, v10.2D, v8.2D +trn2 v19.2D, v20.2D, v28.2D +mla v0.4S, v5.4S, v31.s[0] +sub v5.4s, v15.4s, v14.4s +add v15.4s, v15.4s, v14.4s +sqrdmulh v14.4S, v1.4S, v18.4S +mul v1.4S, v1.4S,v30.4S +trn1 v21.2D, v10.2D, v8.2D +trn1 v6.2D, v20.2D, v28.2D +mla v16.4S, v13.4S, v31.s[0] +sub v13.4s, v29.4s, v0.4s +add v29.4s, v29.4s, v0.4s +ldr q0, [x17, #+1216] +ldr q28, [x17, #+1232] +sqrdmulh v20.4S, v29.4S, v28.4S +mul v29.4S, v29.4S,v0.4S +trn1 v28.4S, v9.4S, v11.4S +trn2 v0.4S, v9.4S, v11.4S +mla v1.4S, v14.4S, v31.s[0] +sub v14.4s, v7.4s, v16.4s +add v7.4s, v7.4s, v16.4s +ldr q16, [x17, #+1248] +ldr q8, [x17, #+1264] +sqrdmulh v10.4S, v13.4S, v8.4S +mul v13.4S, v13.4S,v16.4S +trn1 v8.4S, v12.4S, v22.4S +trn2 v16.4S, v12.4S, v22.4S +mla v29.4S, v20.4S, v31.s[0] +sub v20.4s, v3.4s, v1.4s +add v3.4s, v3.4s, v1.4s +ldr q1, [x17, #+1344] +ldr q18, [x17, #+1360] +sqrdmulh v30.4S, v3.4S, v18.4S +mul v3.4S, v3.4S,v1.4S +trn2 v12.2D, v28.2D, v8.2D +trn2 v22.2D, v0.2D, v16.2D +mla v13.4S, v10.4S, v31.s[0] +sub v10.4s, v15.4s, v29.4s +add v15.4s, v15.4s, v29.4s +ldr q29, [x17, #+1376] +ldr q18, [x17, #+1392] +sqrdmulh v1.4S, v20.4S, v18.4S +mul v20.4S, v20.4S,v29.4S +trn1 v9.2D, v28.2D, v8.2D +trn1 v11.2D, v0.2D, v16.2D +mla v3.4S, v30.4S, v31.s[0] +sub v30.4s, v5.4s, v13.4s +add v5.4s, v5.4s, v13.4s +mla v20.4S, v1.4S, v31.s[0] +sub v1.4s, v7.4s, v3.4s +add v7.4s, v7.4s, v3.4s +sub v3.4s, v14.4s, v20.4s +add v14.4s, v14.4s, v20.4s +str q15, [x0, #512] +str q10, [x0, #528] +str q5, [x0, #544] +str q30, [x0, #560] +str q7, [x0, #576] +str q1, [x0, #592] +str q14, [x0, #608] +str q3, [x0, #624] +ldr q3, [x17, #+1440] 
+ldr q14, [x17, #+1456] +sqrdmulh v1.4S, v2.4S, v14.4S +mul v2.4S, v2.4S,v3.4S +sqrdmulh v7.4S, v19.4S, v14.4S +mul v19.4S, v19.4S,v3.4S +mla v2.4S, v1.4S, v31.s[0] +ldr q1, [x17, #+1568] +ldr q14, [x17, #+1584] +sqrdmulh v3.4S, v12.4S, v14.4S +mul v12.4S, v12.4S,v1.4S +mla v19.4S, v7.4S, v31.s[0] +sub v7.4s, v21.4s, v2.4s +add v21.4s, v21.4s, v2.4s +sqrdmulh v2.4S, v22.4S, v14.4S +mul v22.4S, v22.4S,v1.4S +mla v12.4S, v3.4S, v31.s[0] +sub v3.4s, v6.4s, v19.4s +add v6.4s, v6.4s, v19.4s +ldr q19, [x17, #+1472] +ldr q14, [x17, #+1488] +sqrdmulh v1.4S, v6.4S, v14.4S +mul v6.4S, v6.4S,v19.4S +mla v22.4S, v2.4S, v31.s[0] +sub v2.4s, v9.4s, v12.4s +add v9.4s, v9.4s, v12.4s +ldr q12, [x17, #+1504] +ldr q14, [x17, #+1520] +sqrdmulh v19.4S, v3.4S, v14.4S +mul v3.4S, v3.4S,v12.4S +mla v6.4S, v1.4S, v31.s[0] +sub v1.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +ldr q22, [x17, #+1600] +ldr q14, [x17, #+1616] +sqrdmulh v12.4S, v11.4S, v14.4S +mul v11.4S, v11.4S,v22.4S +mla v3.4S, v19.4S, v31.s[0] +sub v19.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +ldr q6, [x17, #+1632] +ldr q14, [x17, #+1648] +sqrdmulh v22.4S, v1.4S, v14.4S +mul v1.4S, v1.4S,v6.4S +mla v11.4S, v12.4S, v31.s[0] +sub v12.4s, v7.4s, v3.4s +add v7.4s, v7.4s, v3.4s +mla v1.4S, v22.4S, v31.s[0] +sub v22.4s, v9.4s, v11.4s +add v9.4s, v9.4s, v11.4s +sub v11.4s, v2.4s, v1.4s +add v2.4s, v2.4s, v1.4s +str q21, [x0, #640] +str q19, [x0, #656] +str q7, [x0, #672] +str q12, [x0, #688] +str q9, [x0, #704] +str q22, [x0, #720] +str q2, [x0, #736] +str q11, [x0, #752] +ldr q11, [x0, #800] +ldr q2, [x0, #816] +ldr q22, [x0, #768] +ldr q9, [x0, #784] +ldr q12, [x0, #864] +ldr q7, [x0, #880] +ldr q19, [x0, #832] +ldr q21, [x0, #848] +ldr q1, [x0, #928] +ldr q3, [x0, #944] +ldr q14, [x0, #896] +ldr q6, [x0, #912] +ldr q30, [x0, #992] +ldr q5, [x0, #1008] +ldr q10, [x0, #960] +ldr q15, [x0, #976] +ldr q20, [x17, #+1664] +ldr q13, [x17, #+1680] +ldr q16, [x17, #+1792] +ldr q0, [x17, #+1808] +ldr q8, [x17, #+1920] +ldr q28, 
[x17, #+1936] +ldr q18, [x17, #+2048] +ldr q29, [x17, #+2064] +sqrdmulh v4.4S, v11.4S, v13.s[0] +mul v11.4S, v11.4S,v20.s[0] +sqrdmulh v17.4S, v2.4S, v13.s[0] +mul v2.4S, v2.4S,v20.s[0] +mla v11.4S, v4.4S, v31.s[0] +sqrdmulh v4.4S, v12.4S, v0.s[0] +mul v12.4S, v12.4S,v16.s[0] +mla v2.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v7.4S, v0.s[0] +mul v7.4S, v7.4S,v16.s[0] +mla v12.4S, v4.4S, v31.s[0] +sub v4.4s, v9.4s, v2.4s +add v9.4s, v9.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v13.s[1] +mul v9.4S, v9.4S,v20.s[1] +mla v7.4S, v11.4S, v31.s[0] +sub v11.4s, v19.4s, v12.4s +add v19.4s, v19.4s, v12.4s +sqrdmulh v12.4S, v4.4S, v13.s[2] +mul v4.4S, v4.4S,v20.s[2] +mla v9.4S, v2.4S, v31.s[0] +sub v2.4s, v21.4s, v7.4s +add v21.4s, v21.4s, v7.4s +sqrdmulh v7.4S, v21.4S, v0.s[1] +mul v21.4S, v21.4S,v16.s[1] +mla v4.4S, v12.4S, v31.s[0] +sub v12.4s, v22.4s, v9.4s +add v22.4s, v22.4s, v9.4s +sqrdmulh v13.4S, v2.4S, v0.s[2] +mul v2.4S, v2.4S,v16.s[2] +mla v21.4S, v7.4S, v31.s[0] +sub v7.4s, v17.4s, v4.4s +add v17.4s, v17.4s, v4.4s +sqrdmulh v4.4S, v1.4S, v28.s[0] +mul v1.4S, v1.4S,v8.s[0] +trn1 v20.4S, v22.4S, v12.4S +trn2 v9.4S, v22.4S, v12.4S +mla v2.4S, v13.4S, v31.s[0] +sub v13.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +sqrdmulh v0.4S, v3.4S, v28.s[0] +mul v3.4S, v3.4S,v8.s[0] +trn1 v16.4S, v17.4S, v7.4S +trn2 v21.4S, v17.4S, v7.4S +mla v1.4S, v4.4S, v31.s[0] +sub v4.4s, v11.4s, v2.4s +add v11.4s, v11.4s, v2.4s +sqrdmulh v2.4S, v30.4S, v29.s[0] +mul v30.4S, v30.4S,v18.s[0] +trn2 v17.2D, v20.2D, v16.2D +trn2 v7.2D, v9.2D, v21.2D +mla v3.4S, v0.4S, v31.s[0] +sub v0.4s, v14.4s, v1.4s +add v14.4s, v14.4s, v1.4s +sqrdmulh v1.4S, v5.4S, v29.s[0] +mul v5.4S, v5.4S,v18.s[0] +trn1 v22.2D, v20.2D, v16.2D +trn1 v12.2D, v9.2D, v21.2D +mla v30.4S, v2.4S, v31.s[0] +sub v2.4s, v6.4s, v3.4s +add v6.4s, v6.4s, v3.4s +sqrdmulh v3.4S, v6.4S, v28.s[1] +mul v6.4S, v6.4S,v8.s[1] +trn1 v21.4S, v19.4S, v13.4S +trn2 v9.4S, v19.4S, v13.4S +mla v5.4S, 
v1.4S, v31.s[0] +sub v1.4s, v10.4s, v30.4s +add v10.4s, v10.4s, v30.4s +sqrdmulh v30.4S, v2.4S, v28.s[2] +mul v2.4S, v2.4S,v8.s[2] +trn1 v16.4S, v11.4S, v4.4S +trn2 v20.4S, v11.4S, v4.4S +mla v6.4S, v3.4S, v31.s[0] +sub v3.4s, v15.4s, v5.4s +add v15.4s, v15.4s, v5.4s +sqrdmulh v5.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v18.s[1] +trn2 v11.2D, v21.2D, v16.2D +trn2 v4.2D, v9.2D, v20.2D +mla v2.4S, v30.4S, v31.s[0] +sub v30.4s, v14.4s, v6.4s +add v14.4s, v14.4s, v6.4s +sqrdmulh v28.4S, v3.4S, v29.s[2] +mul v3.4S, v3.4S,v18.s[2] +trn1 v19.2D, v21.2D, v16.2D +trn1 v13.2D, v9.2D, v20.2D +mla v15.4S, v5.4S, v31.s[0] +sub v5.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +mla v3.4S, v28.4S, v31.s[0] +sub v28.4s, v10.4s, v15.4s +add v10.4s, v10.4s, v15.4s +sub v29.4s, v1.4s, v3.4s +add v1.4s, v1.4s, v3.4s +ldr q3, [x17, #+1696] +ldr q18, [x17, #+1712] +sqrdmulh v15.4S, v17.4S, v18.4S +mul v17.4S, v17.4S,v3.4S +trn1 v2.4S, v14.4S, v30.4S +trn2 v20.4S, v14.4S, v30.4S +sqrdmulh v9.4S, v7.4S, v18.4S +mul v7.4S, v7.4S,v3.4S +trn1 v16.4S, v0.4S, v5.4S +trn2 v21.4S, v0.4S, v5.4S +mla v17.4S, v15.4S, v31.s[0] +ldr q15, [x17, #+1824] +ldr q18, [x17, #+1840] +sqrdmulh v3.4S, v11.4S, v18.4S +mul v11.4S, v11.4S,v15.4S +trn2 v0.2D, v2.2D, v16.2D +trn2 v5.2D, v20.2D, v21.2D +mla v7.4S, v9.4S, v31.s[0] +sub v9.4s, v22.4s, v17.4s +add v22.4s, v22.4s, v17.4s +sqrdmulh v17.4S, v4.4S, v18.4S +mul v4.4S, v4.4S,v15.4S +trn1 v14.2D, v2.2D, v16.2D +trn1 v30.2D, v20.2D, v21.2D +mla v11.4S, v3.4S, v31.s[0] +sub v3.4s, v12.4s, v7.4s +add v12.4s, v12.4s, v7.4s +ldr q7, [x17, #+1728] +ldr q21, [x17, #+1744] +sqrdmulh v20.4S, v12.4S, v21.4S +mul v12.4S, v12.4S,v7.4S +trn1 v21.4S, v10.4S, v28.4S +trn2 v7.4S, v10.4S, v28.4S +mla v4.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v11.4s +add v19.4s, v19.4s, v11.4s +ldr q11, [x17, #+1760] +ldr q16, [x17, #+1776] +sqrdmulh v2.4S, v3.4S, v16.4S +mul v3.4S, v3.4S,v11.4S +trn1 v16.4S, v1.4S, v29.4S +trn2 v11.4S, v1.4S, v29.4S +mla v12.4S, v20.4S, v31.s[0] +sub v20.4s, 
v13.4s, v4.4s +add v13.4s, v13.4s, v4.4s +ldr q4, [x17, #+1856] +ldr q18, [x17, #+1872] +sqrdmulh v15.4S, v13.4S, v18.4S +mul v13.4S, v13.4S,v4.4S +trn2 v1.2D, v21.2D, v16.2D +trn2 v29.2D, v7.2D, v11.2D +mla v3.4S, v2.4S, v31.s[0] +sub v2.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +ldr q12, [x17, #+1888] +ldr q18, [x17, #+1904] +sqrdmulh v4.4S, v20.4S, v18.4S +mul v20.4S, v20.4S,v12.4S +trn1 v10.2D, v21.2D, v16.2D +trn1 v28.2D, v7.2D, v11.2D +mla v13.4S, v15.4S, v31.s[0] +sub v15.4s, v9.4s, v3.4s +add v9.4s, v9.4s, v3.4s +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +sub v13.4s, v17.4s, v20.4s +add v17.4s, v17.4s, v20.4s +str q22, [x0, #768] +str q2, [x0, #784] +str q9, [x0, #800] +str q15, [x0, #816] +str q19, [x0, #832] +str q4, [x0, #848] +str q17, [x0, #864] +str q13, [x0, #880] +ldr q13, [x17, #+1952] +ldr q17, [x17, #+1968] +sqrdmulh v4.4S, v0.4S, v17.4S +mul v0.4S, v0.4S,v13.4S +sqrdmulh v19.4S, v5.4S, v17.4S +mul v5.4S, v5.4S,v13.4S +mla v0.4S, v4.4S, v31.s[0] +ldr q4, [x17, #+2080] +ldr q17, [x17, #+2096] +sqrdmulh v13.4S, v1.4S, v17.4S +mul v1.4S, v1.4S,v4.4S +mla v5.4S, v19.4S, v31.s[0] +sub v19.4s, v14.4s, v0.4s +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v29.4S, v17.4S +mul v29.4S, v29.4S,v4.4S +mla v1.4S, v13.4S, v31.s[0] +sub v13.4s, v30.4s, v5.4s +add v30.4s, v30.4s, v5.4s +ldr q5, [x17, #+1984] +ldr q17, [x17, #+2000] +sqrdmulh v4.4S, v30.4S, v17.4S +mul v30.4S, v30.4S,v5.4S +mla v29.4S, v0.4S, v31.s[0] +sub v0.4s, v10.4s, v1.4s +add v10.4s, v10.4s, v1.4s +ldr q1, [x17, #+2016] +ldr q17, [x17, #+2032] +sqrdmulh v5.4S, v13.4S, v17.4S +mul v13.4S, v13.4S,v1.4S +mla v30.4S, v4.4S, v31.s[0] +sub v4.4s, v28.4s, v29.4s +add v28.4s, v28.4s, v29.4s +ldr q29, [x17, #+2112] +ldr q17, [x17, #+2128] +sqrdmulh v1.4S, v28.4S, v17.4S +mul v28.4S, v28.4S,v29.4S +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v14.4s, v30.4s +add v14.4s, v14.4s, v30.4s +ldr q30, [x17, #+2144] +ldr q17, [x17, #+2160] +sqrdmulh v29.4S, v4.4S, v17.4S 
+mul v4.4S, v4.4S,v30.4S +mla v28.4S, v1.4S, v31.s[0] +sub v1.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +mla v4.4S, v29.4S, v31.s[0] +sub v29.4s, v10.4s, v28.4s +add v10.4s, v10.4s, v28.4s +sub v28.4s, v0.4s, v4.4s +add v0.4s, v0.4s, v4.4s +str q14, [x0, #896] +str q5, [x0, #912] +str q19, [x0, #928] +str q1, [x0, #944] +str q10, [x0, #960] +str q29, [x0, #976] +str q0, [x0, #992] +str q28, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z4_3.s b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z4_3.s new file mode 100644 index 0000000..6810a7b --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z4_3.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. 
+/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// NOTE(review): the #include directive below has no header operand in this copy (the angle-bracketed name appears to have been stripped). ASM_LOAD, used by the function that follows, is not defined in the visible part of this file and presumably comes from that header -- restore the name from upstream. + +#include +modulus: // One 128-bit vector: lane 0 = -33556993 (the negated modulus, going by the label and the file name), lanes 1-3 = 0. The function below loads it into q31; only v31.s[0] is referenced in the visible code, as the multiplier of the mla that follows each mul/sqrdmulh pair. +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 // NOTE(review): under GNU as for AArch64 this is a power-of-two alignment, i.e. 64 bytes -- confirm for the assembler in use. +roots_merged: // Constant table addressed through x17 (ldr q<n>, [x17, #offset]). It is laid out in 16-byte groups of four words that come in pairs: the first group of a pair is the multiplier for mul, the second is the companion constant for the matching sqrdmulh (e.g. q30/q29 from offsets 0/16, q28/q27 from offsets 32/48). Words tagged "Layer None, block None" are zeros that fill out a four-word group. +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None 
+.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 
7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 
// Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 
+.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // 
Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, 
block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 
839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 +.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, 
block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 
104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 
// Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_3_z4_3 +.global _ntt_u32_full_neon_asm_var_4_4_3_z4_3 +ntt_u32_full_neon_asm_var_4_4_3_z4_3: +_ntt_u32_full_neon_asm_var_4_4_3_z4_3: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +sqrdmulh 
v2.4S, v22.4S, v29.s[0] +ldr q1, [x0, #544] +mul v22.4S, v22.4S,v30.s[0] +ldr q0, [x0, #608] +sqrdmulh v15.4S, v21.4S, v29.s[0] +ldr q14, [x0, #672] +mul v21.4S, v21.4S,v30.s[0] +ldr q13, [x0, #736] +mla v22.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q12, [x0, #32] +sub v11.4s, v18.4s, v22.4s +mla v21.4S, v15.4S, v31.s[0] +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q15, [x0, #96] +sub v10.4s, v17.4s, v21.4s +mla v20.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v29.s[0] +ldr q2, [x0, #160] +mul v1.4S, v1.4S,v30.s[0] +sub v9.4s, v16.4s, v20.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v29.s[0] +ldr q22, [x0, #224] +mul v0.4S, v0.4S,v30.s[0] +sub v8.4s, v3.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v12.4s, v1.4s +mla v0.4S, v20.4S, v31.s[0] +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +sub v20.4s, v15.4s, v0.4s +mla v14.4S, v19.4S, v31.s[0] +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v19.4s, v2.4s, v14.4s +mla v13.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v1.4s, v22.4s, v13.4s +mla v16.4S, v0.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v0.4s, v2.4s, v16.4s +mla v3.4S, v14.4S, v31.s[0] +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v22.4s, v3.4s +mla v18.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v13.4s, v12.4s, v18.4s +mla v17.4S, v16.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v16.4s, v15.4s, 
v17.4s +mla v9.4S, v3.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v3.4s, v19.4s, v9.4s +mla v8.4S, v18.4S, v31.s[0] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v8.4s +mla v11.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v17.4s, v21.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +sub v9.4s, v20.4s, v10.4s +mla v2.4S, v8.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v8.4s, v12.4s, v2.4s +mla v22.4S, v11.4S, v31.s[0] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v27.s[1] +mul v14.4S, v14.4S,v28.s[1] +sub v11.4s, v15.4s, v22.4s +mla v0.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v28.s[2] +sub v10.4s, v13.4s, v0.4s +mla v14.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v2.4s, v16.4s, v14.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +sub v22.4s, v21.4s, v19.4s +mla v1.4S, v0.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v0.4s, v20.4s, v1.4s +mla v3.4S, v14.4S, v31.s[0] +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +sub v14.4s, v17.4s, v3.4s +mla v18.4S, v19.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v25.s[1] +mul v11.4S, v11.4S,v26.s[1] +sub v19.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v1.4s, v12.4s, v15.4s +mla v11.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v25.s[3] +mul 
v2.4S, v2.4S,v26.s[3] +sub v3.4s, v8.4s, v11.4s +mla v16.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v11.4s +str q12, [x0, #32] +sqrdmulh v12.4S, v20.4S, v23.s[0] +str q1, [x0, #96] +mul v20.4S, v20.4S,v24.s[0] +ldr q1, [x0, #816] +sub v11.4s, v13.4s, v16.4s +ldr q18, [x0, #880] +mla v2.4S, v15.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +str q8, [x0, #160] +sqrdmulh v8.4S, v0.4S, v23.s[1] +str q3, [x0, #224] +mul v0.4S, v0.4S,v24.s[1] +ldr q3, [x0, #944] +sub v16.4s, v10.4s, v2.4s +ldr q15, [x0, #1008] +mla v20.4S, v12.4S, v31.s[0] +add v10.4s, v10.4s, v2.4s +str q13, [x0, #288] +sqrdmulh v13.4S, v9.4S, v23.s[2] +str q11, [x0, #352] +mul v9.4S, v9.4S,v24.s[2] +ldr q11, [x0, #304] +sub v2.4s, v21.4s, v20.4s +ldr q12, [x0, #368] +mla v0.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v20.4s +str q10, [x0, #416] +sqrdmulh v10.4S, v19.4S, v23.s[3] +str q16, [x0, #480] +mul v19.4S, v19.4S,v24.s[3] +ldr q16, [x0, #432] +sub v20.4s, v22.4s, v0.4s +ldr q8, [x0, #496] +mla v9.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +str q21, [x0, #544] +sqrdmulh v21.4S, v1.4S, v29.s[0] +str q2, [x0, #608] +ldr q2, [x0, #560] +mul v1.4S, v1.4S,v30.s[0] +ldr q0, [x0, #624] +sub v13.4s, v17.4s, v9.4s +mla v19.4S, v10.4S, v31.s[0] +add v17.4s, v17.4s, v9.4s +str q22, [x0, #672] +sqrdmulh v22.4S, v18.4S, v29.s[0] +str q20, [x0, #736] +ldr q20, [x0, #688] +mul v18.4S, v18.4S,v30.s[0] +ldr q9, [x0, #752] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +str q17, [x0, #800] +sqrdmulh v17.4S, v3.4S, v29.s[0] +str q13, [x0, #864] +mul v3.4S, v3.4S,v30.s[0] +ldr q13, [x0, #48] +sub v19.4s, v11.4s, v1.4s +mla v18.4S, v22.4S, v31.s[0] +add v11.4s, v11.4s, v1.4s +str q14, [x0, #928] +sqrdmulh v14.4S, v15.4S, v29.s[0] +str q10, [x0, #992] +mul v15.4S, v15.4S,v30.s[0] +ldr q10, [x0, #112] +sub v1.4s, v12.4s, v18.4s +mla v3.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v29.s[0] +ldr q17, [x0, #176] +mul v2.4S, v2.4S,v30.s[0] +sub v22.4s, v16.4s, 
v3.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v29.s[0] +ldr q14, [x0, #240] +mul v0.4S, v0.4S,v30.s[0] +sub v21.4s, v8.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v18.4s, v13.4s, v2.4s +mla v0.4S, v3.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +sub v3.4s, v10.4s, v0.4s +mla v20.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v15.4s, v17.4s, v20.4s +mla v9.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v2.4s, v14.4s, v9.4s +mla v16.4S, v0.4S, v31.s[0] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v29.s[1] +mul v11.4S, v11.4S,v30.s[1] +sub v0.4s, v17.4s, v16.4s +mla v8.4S, v20.4S, v31.s[0] +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v29.s[1] +mul v12.4S, v12.4S,v30.s[1] +sub v20.4s, v14.4s, v8.4s +mla v11.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v9.4s, v13.4s, v11.4s +mla v12.4S, v16.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v16.4s, v10.4s, v12.4s +mla v22.4S, v8.4S, v31.s[0] +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v8.4s, v15.4s, v22.4s +mla v21.4S, v11.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +sub v11.4s, v2.4s, v21.4s +mla v19.4S, v12.4S, v31.s[0] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +sub v12.4s, v18.4s, v19.4s +mla v1.4S, v22.4S, v31.s[0] +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v22.4s, v3.4s, v1.4s +mla v17.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v1.4s +sqrdmulh v1.4S, 
v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v21.4s, v13.4s, v17.4s +mla v14.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +sub v19.4s, v10.4s, v14.4s +mla v0.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v27.s[2] +mul v15.4S, v15.4S,v28.s[2] +sub v1.4s, v9.4s, v0.4s +mla v20.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v17.4s, v16.4s, v20.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v14.4s, v18.4s, v15.4s +mla v2.4S, v0.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[3] +mul v11.4S, v11.4S,v28.s[3] +sub v0.4s, v3.4s, v2.4s +mla v8.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +sub v20.4s, v12.4s, v8.4s +mla v11.4S, v15.4S, v31.s[0] +add v12.4s, v12.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v15.4s, v22.4s, v11.4s +mla v10.4S, v2.4S, v31.s[0] +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v2.4s, v13.4s, v10.4s +mla v19.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v25.s[3] +mul v17.4S, v17.4S,v26.s[3] +sub v8.4s, v21.4s, v19.4s +mla v16.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +str q13, [x0, #48] +sqrdmulh v13.4S, v3.4S, v23.s[0] +str q2, [x0, #112] +mul v3.4S, v3.4S,v24.s[0] +ldr q2, [x0, #768] +sub v19.4s, v9.4s, v16.4s +ldr q11, [x0, #832] +mla v17.4S, v10.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +str q21, [x0, #176] +sqrdmulh v21.4S, v0.4S, v23.s[1] +str q8, [x0, #240] +mul v0.4S, v0.4S,v24.s[1] +ldr q8, [x0, #896] +sub v16.4s, v1.4s, v17.4s +ldr q10, [x0, #960] +mla v3.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +str q9, [x0, #304] +sqrdmulh v9.4S, v22.4S, v23.s[2] +str q19, [x0, #368] +mul v22.4S, 
v22.4S,v24.s[2] +ldr q19, [x0, #256] +sub v17.4s, v18.4s, v3.4s +ldr q13, [x0, #320] +mla v0.4S, v21.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +str q1, [x0, #432] +sqrdmulh v1.4S, v15.4S, v23.s[3] +str q16, [x0, #496] +mul v15.4S, v15.4S,v24.s[3] +ldr q16, [x0, #384] +sub v3.4s, v14.4s, v0.4s +ldr q21, [x0, #448] +mla v22.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +str q18, [x0, #560] +sqrdmulh v18.4S, v2.4S, v29.s[0] +str q17, [x0, #624] +ldr q17, [x0, #512] +mul v2.4S, v2.4S,v30.s[0] +ldr q0, [x0, #576] +sub v9.4s, v12.4s, v22.4s +mla v15.4S, v1.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s +str q14, [x0, #688] +sqrdmulh v14.4S, v11.4S, v29.s[0] +str q3, [x0, #752] +ldr q3, [x0, #640] +mul v11.4S, v11.4S,v30.s[0] +ldr q22, [x0, #704] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +str q12, [x0, #816] +sqrdmulh v12.4S, v8.4S, v29.s[0] +str q9, [x0, #880] +mul v8.4S, v8.4S,v30.s[0] +ldr q9, [x0, #0] +sub v15.4s, v19.4s, v2.4s +mla v11.4S, v14.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +str q20, [x0, #944] +sqrdmulh v20.4S, v10.4S, v29.s[0] +str q1, [x0, #1008] +mul v10.4S, v10.4S,v30.s[0] +ldr q1, [x0, #64] +sub v2.4s, v13.4s, v11.4s +mla v8.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v29.s[0] +ldr q12, [x0, #128] +mul v17.4S, v17.4S,v30.s[0] +sub v14.4s, v16.4s, v8.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v29.s[0] +ldr q20, [x0, #192] +mul v0.4S, v0.4S,v30.s[0] +sub v18.4s, v21.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +sub v11.4s, v9.4s, v17.4s +mla v0.4S, v8.4S, v31.s[0] +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v8.4s, v1.4s, v0.4s +mla v3.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v10.4s, v12.4s, v3.4s +mla v22.4S, v17.4S, v31.s[0] +add v12.4s, 
v12.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v17.4s, v20.4s, v22.4s +mla v16.4S, v0.4S, v31.s[0] +add v20.4s, v20.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +sub v0.4s, v12.4s, v16.4s +mla v21.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v3.4s, v20.4s, v21.4s +mla v19.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v22.4s, v9.4s, v19.4s +mla v13.4S, v16.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v16.4s, v1.4s, v13.4s +mla v14.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v21.4s, v10.4s, v14.4s +mla v18.4S, v19.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v19.4s, v17.4s, v18.4s +mla v15.4S, v13.4S, v31.s[0] +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v13.4s, v11.4s, v15.4s +mla v2.4S, v14.4S, v31.s[0] +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v27.s[0] +mul v20.4S, v20.4S,v28.s[0] +sub v14.4s, v8.4s, v2.4s +mla v12.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v18.4s, v9.4s, v12.4s +mla v20.4S, v15.4S, v31.s[0] +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +sub v15.4s, v1.4s, v20.4s +mla v0.4S, v2.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +sub v2.4s, v22.4s, v0.4s +mla v3.4S, v12.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v12.4s, v16.4s, v3.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, 
v11.4s, v10.4s +mla v17.4S, v0.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v27.s[3] +mul v19.4S, v19.4S,v28.s[3] +sub v0.4s, v8.4s, v17.4s +mla v21.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v17.4s +sqrdmulh v17.4S, v1.4S, v25.s[0] +mul v1.4S, v1.4S,v26.s[0] +sub v3.4s, v13.4s, v21.4s +mla v19.4S, v10.4S, v31.s[0] +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v25.s[1] +mul v15.4S, v15.4S,v26.s[1] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v17.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v17.4s, v9.4s, v1.4s +mla v15.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +sub v21.4s, v18.4s, v15.4s +mla v16.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +str q9, [x0, #0] +sqrdmulh v9.4S, v8.4S, v23.s[0] +str q17, [x0, #64] +mul v8.4S, v8.4S,v24.s[0] +ldr q17, [x0, #784] +sub v15.4s, v22.4s, v16.4s +ldr q19, [x0, #848] +mla v12.4S, v1.4S, v31.s[0] +add v22.4s, v22.4s, v16.4s +str q18, [x0, #128] +sqrdmulh v18.4S, v0.4S, v23.s[1] +str q21, [x0, #192] +mul v0.4S, v0.4S,v24.s[1] +ldr q21, [x0, #912] +sub v16.4s, v2.4s, v12.4s +ldr q1, [x0, #976] +mla v8.4S, v9.4S, v31.s[0] +add v2.4s, v2.4s, v12.4s +str q22, [x0, #256] +sqrdmulh v22.4S, v14.4S, v23.s[2] +str q15, [x0, #320] +mul v14.4S, v14.4S,v24.s[2] +ldr q15, [x0, #272] +sub v12.4s, v11.4s, v8.4s +ldr q9, [x0, #336] +mla v0.4S, v18.4S, v31.s[0] +add v11.4s, v11.4s, v8.4s +str q2, [x0, #384] +sqrdmulh v2.4S, v10.4S, v23.s[3] +str q16, [x0, #448] +mul v10.4S, v10.4S,v24.s[3] +ldr q16, [x0, #400] +sub v8.4s, v20.4s, v0.4s +ldr q18, [x0, #464] +mla v14.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v0.4s +str q11, [x0, #512] +sqrdmulh v11.4S, v17.4S, v29.s[0] +str q12, [x0, #576] +ldr q12, [x0, #528] +mul v17.4S, v17.4S,v30.s[0] +ldr q0, [x0, #592] +sub v22.4s, v13.4s, v14.4s +mla v10.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v14.4s +str q20, [x0, #640] +sqrdmulh v20.4S, v19.4S, 
v29.s[0] +str q8, [x0, #704] +ldr q8, [x0, #656] +mul v19.4S, v19.4S,v30.s[0] +ldr q14, [x0, #720] +sub v2.4s, v3.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v3.4s, v3.4s, v10.4s +str q13, [x0, #768] +sqrdmulh v13.4S, v21.4S, v29.s[0] +str q22, [x0, #832] +mul v21.4S, v21.4S,v30.s[0] +ldr q22, [x0, #16] +sub v10.4s, v15.4s, v17.4s +mla v19.4S, v20.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +str q3, [x0, #896] +sqrdmulh v3.4S, v1.4S, v29.s[0] +str q2, [x0, #960] +mul v1.4S, v1.4S,v30.s[0] +ldr q2, [x0, #80] +sub v17.4s, v9.4s, v19.4s +mla v21.4S, v13.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v12.4S, v29.s[0] +ldr q13, [x0, #144] +mul v12.4S, v12.4S,v30.s[0] +sub v20.4s, v16.4s, v21.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v29.s[0] +ldr q3, [x0, #208] +mul v0.4S, v0.4S,v30.s[0] +sub v11.4s, v18.4s, v1.4s +mla v12.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v19.4s, v22.4s, v12.4s +mla v0.4S, v21.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v2.4s, v0.4s +mla v8.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v1.4s, v13.4s, v8.4s +mla v14.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v12.4s, v3.4s, v14.4s +mla v16.4S, v0.4S, v31.s[0] +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +sub v0.4s, v13.4s, v16.4s +mla v18.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v9.4S, v29.s[1] +mul v9.4S, v9.4S,v30.s[1] +sub v8.4s, v3.4s, v18.4s +mla v15.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v14.4s, v22.4s, v15.4s +mla v9.4S, v16.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v29.s[2] +mul v11.4S, 
v11.4S,v30.s[2] +sub v16.4s, v2.4s, v9.4s +mla v20.4S, v18.4S, v31.s[0] +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v20.4s +mla v11.4S, v15.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v15.4s, v12.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +sub v9.4s, v19.4s, v10.4s +mla v17.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v27.s[0] +mul v3.4S, v3.4S,v28.s[0] +sub v20.4s, v21.4s, v17.4s +mla v13.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v11.4s, v22.4s, v13.4s +mla v3.4S, v10.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v10.4s, v2.4s, v3.4s +mla v0.4S, v17.4S, v31.s[0] +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v17.4s, v14.4s, v0.4s +mla v8.4S, v13.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +sub v13.4s, v16.4s, v8.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v3.4s, v19.4s, v1.4s +mla v12.4S, v0.4S, v31.s[0] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v0.4s, v21.4s, v12.4s +mla v18.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v25.s[1] +mul v10.4S, v10.4S,v26.s[1] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v12.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v12.4s, v22.4s, v2.4s +mla v10.4S, v18.4S, v31.s[0] +add v22.4s, v22.4s, v2.4s 
+sqrdmulh v2.4S, v13.4S, v25.s[3] +mul v13.4S, v13.4S,v26.s[3] +sub v18.4s, v11.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +str q22, [x0, #16] +sqrdmulh v22.4S, v21.4S, v23.s[0] +str q12, [x0, #80] +mul v21.4S, v21.4S,v24.s[0] +sub v12.4s, v14.4s, v16.4s +mla v13.4S, v2.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +str q11, [x0, #144] +sqrdmulh v11.4S, v0.4S, v23.s[1] +str q18, [x0, #208] +mul v0.4S, v0.4S,v24.s[1] +sub v18.4s, v17.4s, v13.4s +mla v21.4S, v22.4S, v31.s[0] +add v17.4s, v17.4s, v13.4s +str q14, [x0, #272] +sqrdmulh v14.4S, v20.4S, v23.s[2] +str q12, [x0, #336] +mul v20.4S, v20.4S,v24.s[2] +sub v12.4s, v19.4s, v21.4s +mla v0.4S, v11.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +str q17, [x0, #400] +sqrdmulh v17.4S, v1.4S, v23.s[3] +str q18, [x0, #464] +mul v1.4S, v1.4S,v24.s[3] +sub v18.4s, v3.4s, v0.4s +mla v20.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v0.4s +str q19, [x0, #528] +str q12, [x0, #592] +sub v12.4s, v9.4s, v20.4s +mla v1.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v20.4s +str q3, [x0, #656] +str q18, [x0, #720] +sub v18.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +str q9, [x0, #784] +str q12, [x0, #848] +str q8, [x0, #912] +str q18, [x0, #976] +ldr q4, [x0, #32] +ldr q5, [x0, #48] +ldr q6, [x17, #+128] +ldr q7, [x17, #+144] +ldr q15, [x17, #+256] +ldr q10, [x0, #96] +ldr q2, [x17, #+272] +ldr q16, [x0, #112] +sqrdmulh v22.4S, v4.4S, v7.s[0] +mul v4.4S, v4.4S,v6.s[0] +sqrdmulh v13.4S, v5.4S, v7.s[0] +mul v5.4S, v5.4S,v6.s[0] +mla v4.4S, v22.4S, v31.s[0] +ldr q22, [x0, #0] +sqrdmulh v11.4S, v10.4S, v2.s[0] +ldr q21, [x0, #16] +mul v10.4S, v10.4S,v15.s[0] +mla v5.4S, v13.4S, v31.s[0] +sub v13.4s, v22.4s, v4.4s +add v22.4s, v22.4s, v4.4s +sqrdmulh v4.4S, v16.4S, v2.s[0] +ldr q14, [x0, #160] +mul v16.4S, v16.4S,v15.s[0] +ldr q0, [x0, #176] +mla v10.4S, v11.4S, v31.s[0] +ldr q11, [x0, #64] +sub v19.4s, v21.4s, v5.4s +add v21.4s, v21.4s, v5.4s +sqrdmulh v5.4S, v21.4S, v7.s[1] +ldr q17, [x0, #128] +mul v21.4S, v21.4S,v6.s[1] +ldr q20, 
[x0, #144] +mla v16.4S, v4.4S, v31.s[0] +ldr q4, [x0, #80] +sub v3.4s, v11.4s, v10.4s +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v7.s[2] +ldr q1, [x17, #+384] +mul v19.4S, v19.4S,v6.s[2] +ldr q9, [x17, #+400] +mla v21.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v16.4s +add v4.4s, v4.4s, v16.4s +sqrdmulh v16.4S, v4.4S, v2.s[1] +ldr q12, [x0, #224] +mul v4.4S, v4.4S,v15.s[1] +ldr q8, [x0, #240] +mla v19.4S, v10.4S, v31.s[0] +sub v10.4s, v22.4s, v21.4s +add v22.4s, v22.4s, v21.4s +sqrdmulh v7.4S, v5.4S, v2.s[2] +ldr q6, [x0, #192] +mul v5.4S, v5.4S,v15.s[2] +ldr q21, [x0, #208] +mla v4.4S, v16.4S, v31.s[0] +sub v16.4s, v13.4s, v19.4s +add v13.4s, v13.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v9.s[0] +ldr q18, [x17, #+512] +mul v14.4S, v14.4S,v1.s[0] +ldr q30, [x17, #+528] +trn1 v29.4S, v22.4S, v10.4S +trn2 v28.4S, v22.4S, v10.4S +mla v5.4S, v7.4S, v31.s[0] +sub v7.4s, v11.4s, v4.4s +add v11.4s, v11.4s, v4.4s +sqrdmulh v2.4S, v0.4S, v9.s[0] +mul v0.4S, v0.4S,v1.s[0] +trn1 v15.4S, v13.4S, v16.4S +trn2 v4.4S, v13.4S, v16.4S +mla v14.4S, v19.4S, v31.s[0] +sub v19.4s, v3.4s, v5.4s +add v3.4s, v3.4s, v5.4s +sqrdmulh v5.4S, v12.4S, v30.s[0] +mul v12.4S, v12.4S,v18.s[0] +trn2 v13.2D, v29.2D, v15.2D +trn2 v16.2D, v28.2D, v4.2D +mla v0.4S, v2.4S, v31.s[0] +sub v2.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +sqrdmulh v14.4S, v8.4S, v30.s[0] +mul v8.4S, v8.4S,v18.s[0] +trn1 v22.2D, v29.2D, v15.2D +trn1 v10.2D, v28.2D, v4.2D +mla v12.4S, v5.4S, v31.s[0] +sub v5.4s, v20.4s, v0.4s +add v20.4s, v20.4s, v0.4s +sqrdmulh v0.4S, v20.4S, v9.s[1] +mul v20.4S, v20.4S,v1.s[1] +trn1 v4.4S, v11.4S, v7.4S +trn2 v28.4S, v11.4S, v7.4S +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v6.4s, v12.4s +add v6.4s, v6.4s, v12.4s +sqrdmulh v12.4S, v5.4S, v9.s[2] +mul v5.4S, v5.4S,v1.s[2] +trn1 v15.4S, v3.4S, v19.4S +trn2 v29.4S, v3.4S, v19.4S +mla v20.4S, v0.4S, v31.s[0] +sub v0.4s, v21.4s, v8.4s +add v21.4s, v21.4s, v8.4s +sqrdmulh v8.4S, v21.4S, v30.s[1] +mul v21.4S, v21.4S,v18.s[1] +trn2 v3.2D, v4.2D, 
v15.2D +trn2 v19.2D, v28.2D, v29.2D +mla v5.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v20.4s +add v17.4s, v17.4s, v20.4s +sqrdmulh v9.4S, v0.4S, v30.s[2] +mul v0.4S, v0.4S,v18.s[2] +trn1 v11.2D, v4.2D, v15.2D +trn1 v7.2D, v28.2D, v29.2D +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v2.4s, v5.4s +add v2.4s, v2.4s, v5.4s +mla v0.4S, v9.4S, v31.s[0] +sub v9.4s, v6.4s, v21.4s +add v6.4s, v6.4s, v21.4s +sub v30.4s, v14.4s, v0.4s +add v14.4s, v14.4s, v0.4s +ldr q0, [x17, #+160] +ldr q18, [x17, #+176] +sqrdmulh v21.4S, v13.4S, v18.4S +mul v13.4S, v13.4S,v0.4S +trn1 v5.4S, v17.4S, v12.4S +trn2 v29.4S, v17.4S, v12.4S +sqrdmulh v28.4S, v16.4S, v18.4S +mul v16.4S, v16.4S,v0.4S +trn1 v18.4S, v2.4S, v8.4S +trn2 v0.4S, v2.4S, v8.4S +mla v13.4S, v21.4S, v31.s[0] +ldr q21, [x17, #+288] +ldr q15, [x17, #+304] +sqrdmulh v4.4S, v3.4S, v15.4S +mul v3.4S, v3.4S,v21.4S +trn2 v2.2D, v5.2D, v18.2D +trn2 v8.2D, v29.2D, v0.2D +mla v16.4S, v28.4S, v31.s[0] +sub v28.4s, v22.4s, v13.4s +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v19.4S, v15.4S +mul v19.4S, v19.4S,v21.4S +trn1 v17.2D, v5.2D, v18.2D +trn1 v12.2D, v29.2D, v0.2D +mla v3.4S, v4.4S, v31.s[0] +sub v4.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +ldr q16, [x17, #+192] +ldr q0, [x17, #+208] +sqrdmulh v29.4S, v10.4S, v0.4S +mul v10.4S, v10.4S,v16.4S +trn1 v0.4S, v6.4S, v9.4S +trn2 v16.4S, v6.4S, v9.4S +mla v19.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v3.4s +add v11.4s, v11.4s, v3.4s +ldr q3, [x17, #+224] +ldr q18, [x17, #+240] +sqrdmulh v5.4S, v4.4S, v18.4S +mul v4.4S, v4.4S,v3.4S +trn1 v18.4S, v14.4S, v30.4S +trn2 v3.4S, v14.4S, v30.4S +mla v10.4S, v29.4S, v31.s[0] +sub v29.4s, v7.4s, v19.4s +add v7.4s, v7.4s, v19.4s +ldr q19, [x17, #+320] +ldr q15, [x17, #+336] +sqrdmulh v21.4S, v7.4S, v15.4S +mul v7.4S, v7.4S,v19.4S +trn2 v14.2D, v0.2D, v18.2D +trn2 v30.2D, v16.2D, v3.2D +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v22.4s, v10.4s +add v22.4s, v22.4s, v10.4s +ldr q10, [x17, #+352] +ldr q15, [x17, #+368] +sqrdmulh v19.4S, v29.4S, v15.4S 
+mul v29.4S, v29.4S,v10.4S +trn1 v6.2D, v0.2D, v18.2D +trn1 v9.2D, v16.2D, v3.2D +mla v7.4S, v21.4S, v31.s[0] +sub v21.4s, v28.4s, v4.4s +add v28.4s, v28.4s, v4.4s +mla v29.4S, v19.4S, v31.s[0] +sub v19.4s, v11.4s, v7.4s +add v11.4s, v11.4s, v7.4s +sub v7.4s, v13.4s, v29.4s +add v13.4s, v13.4s, v29.4s +ldr q29, [x17, #+416] +ldr q4, [x17, #+432] +sqrdmulh v3.4S, v2.4S, v4.4S +mul v2.4S, v2.4S,v29.4S +str q22, [x0, #0] +sqrdmulh v22.4S, v8.4S, v4.4S +str q5, [x0, #16] +mul v8.4S, v8.4S,v29.4S +str q28, [x0, #32] +mla v2.4S, v3.4S, v31.s[0] +ldr q3, [x17, #+544] +ldr q28, [x17, #+560] +sqrdmulh v4.4S, v14.4S, v28.4S +str q11, [x0, #64] +mul v14.4S, v14.4S,v3.4S +str q21, [x0, #48] +mla v8.4S, v22.4S, v31.s[0] +str q19, [x0, #80] +sub v19.4s, v17.4s, v2.4s +add v17.4s, v17.4s, v2.4s +sqrdmulh v2.4S, v30.4S, v28.4S +mul v30.4S, v30.4S,v3.4S +str q13, [x0, #96] +mla v14.4S, v4.4S, v31.s[0] +sub v4.4s, v12.4s, v8.4s +add v12.4s, v12.4s, v8.4s +ldr q8, [x17, #+448] +ldr q13, [x17, #+464] +sqrdmulh v28.4S, v12.4S, v13.4S +mul v12.4S, v12.4S,v8.4S +str q7, [x0, #112] +mla v30.4S, v2.4S, v31.s[0] +sub v2.4s, v6.4s, v14.4s +add v6.4s, v6.4s, v14.4s +ldr q14, [x17, #+480] +ldr q7, [x17, #+496] +sqrdmulh v13.4S, v4.4S, v7.4S +mul v4.4S, v4.4S,v14.4S +mla v12.4S, v28.4S, v31.s[0] +sub v28.4s, v9.4s, v30.4s +add v9.4s, v9.4s, v30.4s +ldr q30, [x17, #+576] +ldr q7, [x17, #+592] +sqrdmulh v14.4S, v9.4S, v7.4S +mul v9.4S, v9.4S,v30.4S +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v12.4s +add v17.4s, v17.4s, v12.4s +ldr q12, [x17, #+608] +ldr q7, [x17, #+624] +sqrdmulh v30.4S, v28.4S, v7.4S +mul v28.4S, v28.4S,v12.4S +ldr q7, [x0, #288] +mla v9.4S, v14.4S, v31.s[0] +ldr q14, [x0, #304] +sub v12.4s, v19.4s, v4.4s +ldr q8, [x17, #+640] +add v19.4s, v19.4s, v4.4s +ldr q4, [x17, #+656] +mla v28.4S, v30.4S, v31.s[0] +ldr q30, [x17, #+768] +sub v3.4s, v6.4s, v9.4s +ldr q22, [x0, #352] +add v6.4s, v6.4s, v9.4s +ldr q9, [x17, #+784] +sub v21.4s, v2.4s, v28.4s +ldr q11, [x0, #368] +add 
v2.4s, v2.4s, v28.4s +sqrdmulh v28.4S, v7.4S, v4.s[0] +mul v7.4S, v7.4S,v8.s[0] +sqrdmulh v29.4S, v14.4S, v4.s[0] +mul v14.4S, v14.4S,v8.s[0] +str q17, [x0, #128] +str q13, [x0, #144] +str q19, [x0, #160] +str q12, [x0, #176] +mla v7.4S, v28.4S, v31.s[0] +ldr q28, [x0, #256] +sqrdmulh v12.4S, v22.4S, v9.s[0] +ldr q19, [x0, #272] +mul v22.4S, v22.4S,v30.s[0] +str q6, [x0, #192] +str q3, [x0, #208] +str q2, [x0, #224] +str q21, [x0, #240] +mla v14.4S, v29.4S, v31.s[0] +sub v29.4s, v28.4s, v7.4s +add v28.4s, v28.4s, v7.4s +sqrdmulh v7.4S, v11.4S, v9.s[0] +ldr q21, [x0, #416] +mul v11.4S, v11.4S,v30.s[0] +ldr q2, [x0, #432] +mla v22.4S, v12.4S, v31.s[0] +ldr q12, [x0, #320] +sub v3.4s, v19.4s, v14.4s +add v19.4s, v19.4s, v14.4s +sqrdmulh v14.4S, v19.4S, v4.s[1] +ldr q6, [x0, #384] +mul v19.4S, v19.4S,v8.s[1] +ldr q13, [x0, #400] +mla v11.4S, v7.4S, v31.s[0] +ldr q7, [x0, #336] +sub v17.4s, v12.4s, v22.4s +add v12.4s, v12.4s, v22.4s +sqrdmulh v22.4S, v3.4S, v4.s[2] +ldr q5, [x17, #+896] +mul v3.4S, v3.4S,v8.s[2] +ldr q16, [x17, #+912] +mla v19.4S, v14.4S, v31.s[0] +sub v14.4s, v7.4s, v11.4s +add v7.4s, v7.4s, v11.4s +sqrdmulh v11.4S, v7.4S, v9.s[1] +ldr q18, [x0, #480] +mul v7.4S, v7.4S,v30.s[1] +ldr q0, [x0, #496] +mla v3.4S, v22.4S, v31.s[0] +sub v22.4s, v28.4s, v19.4s +add v28.4s, v28.4s, v19.4s +sqrdmulh v4.4S, v14.4S, v9.s[2] +ldr q8, [x0, #448] +mul v14.4S, v14.4S,v30.s[2] +ldr q19, [x0, #464] +mla v7.4S, v11.4S, v31.s[0] +sub v11.4s, v29.4s, v3.4s +add v29.4s, v29.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v16.s[0] +ldr q15, [x17, #+1024] +mul v21.4S, v21.4S,v5.s[0] +ldr q10, [x17, #+1040] +trn1 v1.4S, v28.4S, v22.4S +trn2 v20.4S, v28.4S, v22.4S +mla v14.4S, v4.4S, v31.s[0] +sub v4.4s, v12.4s, v7.4s +add v12.4s, v12.4s, v7.4s +sqrdmulh v9.4S, v2.4S, v16.s[0] +mul v2.4S, v2.4S,v5.s[0] +trn1 v30.4S, v29.4S, v11.4S +trn2 v7.4S, v29.4S, v11.4S +mla v21.4S, v3.4S, v31.s[0] +sub v3.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +sqrdmulh v14.4S, v18.4S, v10.s[0] +mul v18.4S, 
v18.4S,v15.s[0] +trn2 v29.2D, v1.2D, v30.2D +trn2 v11.2D, v20.2D, v7.2D +mla v2.4S, v9.4S, v31.s[0] +sub v9.4s, v6.4s, v21.4s +add v6.4s, v6.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v10.s[0] +mul v0.4S, v0.4S,v15.s[0] +trn1 v28.2D, v1.2D, v30.2D +trn1 v22.2D, v20.2D, v7.2D +mla v18.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v2.4s +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v13.4S, v16.s[1] +mul v13.4S, v13.4S,v5.s[1] +trn1 v7.4S, v12.4S, v4.4S +trn2 v20.4S, v12.4S, v4.4S +mla v0.4S, v21.4S, v31.s[0] +sub v21.4s, v8.4s, v18.4s +add v8.4s, v8.4s, v18.4s +sqrdmulh v18.4S, v14.4S, v16.s[2] +mul v14.4S, v14.4S,v5.s[2] +trn1 v30.4S, v17.4S, v3.4S +trn2 v1.4S, v17.4S, v3.4S +mla v13.4S, v2.4S, v31.s[0] +sub v2.4s, v19.4s, v0.4s +add v19.4s, v19.4s, v0.4s +sqrdmulh v0.4S, v19.4S, v10.s[1] +mul v19.4S, v19.4S,v15.s[1] +trn2 v17.2D, v7.2D, v30.2D +trn2 v3.2D, v20.2D, v1.2D +mla v14.4S, v18.4S, v31.s[0] +sub v18.4s, v6.4s, v13.4s +add v6.4s, v6.4s, v13.4s +sqrdmulh v16.4S, v2.4S, v10.s[2] +mul v2.4S, v2.4S,v15.s[2] +trn1 v12.2D, v7.2D, v30.2D +trn1 v4.2D, v20.2D, v1.2D +mla v19.4S, v0.4S, v31.s[0] +sub v0.4s, v9.4s, v14.4s +add v9.4s, v9.4s, v14.4s +mla v2.4S, v16.4S, v31.s[0] +sub v16.4s, v8.4s, v19.4s +add v8.4s, v8.4s, v19.4s +sub v10.4s, v21.4s, v2.4s +add v21.4s, v21.4s, v2.4s +ldr q2, [x17, #+672] +ldr q15, [x17, #+688] +sqrdmulh v19.4S, v29.4S, v15.4S +mul v29.4S, v29.4S,v2.4S +trn1 v14.4S, v6.4S, v18.4S +trn2 v1.4S, v6.4S, v18.4S +sqrdmulh v20.4S, v11.4S, v15.4S +mul v11.4S, v11.4S,v2.4S +trn1 v15.4S, v9.4S, v0.4S +trn2 v2.4S, v9.4S, v0.4S +mla v29.4S, v19.4S, v31.s[0] +ldr q19, [x17, #+800] +ldr q30, [x17, #+816] +sqrdmulh v7.4S, v17.4S, v30.4S +mul v17.4S, v17.4S,v19.4S +trn2 v9.2D, v14.2D, v15.2D +trn2 v0.2D, v1.2D, v2.2D +mla v11.4S, v20.4S, v31.s[0] +sub v20.4s, v28.4s, v29.4s +add v28.4s, v28.4s, v29.4s +sqrdmulh v29.4S, v3.4S, v30.4S +mul v3.4S, v3.4S,v19.4S +trn1 v6.2D, v14.2D, v15.2D +trn1 v18.2D, v1.2D, v2.2D +mla v17.4S, v7.4S, v31.s[0] +sub v7.4s, v22.4s, v11.4s 
+add v22.4s, v22.4s, v11.4s +ldr q11, [x17, #+704] +ldr q2, [x17, #+720] +sqrdmulh v1.4S, v22.4S, v2.4S +mul v22.4S, v22.4S,v11.4S +trn1 v2.4S, v8.4S, v16.4S +trn2 v11.4S, v8.4S, v16.4S +mla v3.4S, v29.4S, v31.s[0] +sub v29.4s, v12.4s, v17.4s +add v12.4s, v12.4s, v17.4s +ldr q17, [x17, #+736] +ldr q15, [x17, #+752] +sqrdmulh v14.4S, v7.4S, v15.4S +mul v7.4S, v7.4S,v17.4S +trn1 v15.4S, v21.4S, v10.4S +trn2 v17.4S, v21.4S, v10.4S +mla v22.4S, v1.4S, v31.s[0] +sub v1.4s, v4.4s, v3.4s +add v4.4s, v4.4s, v3.4s +ldr q3, [x17, #+832] +ldr q30, [x17, #+848] +sqrdmulh v19.4S, v4.4S, v30.4S +mul v4.4S, v4.4S,v3.4S +trn2 v21.2D, v2.2D, v15.2D +trn2 v10.2D, v11.2D, v17.2D +mla v7.4S, v14.4S, v31.s[0] +sub v14.4s, v28.4s, v22.4s +add v28.4s, v28.4s, v22.4s +ldr q22, [x17, #+864] +ldr q30, [x17, #+880] +sqrdmulh v3.4S, v1.4S, v30.4S +mul v1.4S, v1.4S,v22.4S +trn1 v8.2D, v2.2D, v15.2D +trn1 v16.2D, v11.2D, v17.2D +mla v4.4S, v19.4S, v31.s[0] +sub v19.4s, v20.4s, v7.4s +add v20.4s, v20.4s, v7.4s +mla v1.4S, v3.4S, v31.s[0] +sub v3.4s, v12.4s, v4.4s +add v12.4s, v12.4s, v4.4s +sub v4.4s, v29.4s, v1.4s +add v29.4s, v29.4s, v1.4s +ldr q1, [x17, #+928] +ldr q7, [x17, #+944] +sqrdmulh v17.4S, v9.4S, v7.4S +mul v9.4S, v9.4S,v1.4S +str q28, [x0, #256] +sqrdmulh v28.4S, v0.4S, v7.4S +str q14, [x0, #272] +mul v0.4S, v0.4S,v1.4S +str q20, [x0, #288] +mla v9.4S, v17.4S, v31.s[0] +ldr q17, [x17, #+1056] +ldr q20, [x17, #+1072] +sqrdmulh v7.4S, v21.4S, v20.4S +str q12, [x0, #320] +mul v21.4S, v21.4S,v17.4S +str q19, [x0, #304] +mla v0.4S, v28.4S, v31.s[0] +str q3, [x0, #336] +sub v3.4s, v6.4s, v9.4s +add v6.4s, v6.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v20.4S +mul v10.4S, v10.4S,v17.4S +str q29, [x0, #352] +mla v21.4S, v7.4S, v31.s[0] +sub v7.4s, v18.4s, v0.4s +add v18.4s, v18.4s, v0.4s +ldr q0, [x17, #+960] +ldr q29, [x17, #+976] +sqrdmulh v20.4S, v18.4S, v29.4S +mul v18.4S, v18.4S,v0.4S +str q4, [x0, #368] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v21.4s +add v8.4s, v8.4s, v21.4s +ldr q21, 
[x17, #+992] +ldr q4, [x17, #+1008] +sqrdmulh v29.4S, v7.4S, v4.4S +mul v7.4S, v7.4S,v21.4S +mla v18.4S, v20.4S, v31.s[0] +sub v20.4s, v16.4s, v10.4s +add v16.4s, v16.4s, v10.4s +ldr q10, [x17, #+1088] +ldr q4, [x17, #+1104] +sqrdmulh v21.4S, v16.4S, v4.4S +mul v16.4S, v16.4S,v10.4S +mla v7.4S, v29.4S, v31.s[0] +sub v29.4s, v6.4s, v18.4s +add v6.4s, v6.4s, v18.4s +ldr q18, [x17, #+1120] +ldr q4, [x17, #+1136] +sqrdmulh v10.4S, v20.4S, v4.4S +mul v20.4S, v20.4S,v18.4S +ldr q4, [x0, #544] +mla v16.4S, v21.4S, v31.s[0] +ldr q21, [x0, #560] +sub v18.4s, v3.4s, v7.4s +ldr q0, [x17, #+1152] +add v3.4s, v3.4s, v7.4s +ldr q7, [x17, #+1168] +mla v20.4S, v10.4S, v31.s[0] +ldr q10, [x17, #+1280] +sub v17.4s, v8.4s, v16.4s +ldr q28, [x0, #608] +add v8.4s, v8.4s, v16.4s +ldr q16, [x17, #+1296] +sub v19.4s, v9.4s, v20.4s +ldr q12, [x0, #624] +add v9.4s, v9.4s, v20.4s +sqrdmulh v20.4S, v4.4S, v7.s[0] +mul v4.4S, v4.4S,v0.s[0] +sqrdmulh v1.4S, v21.4S, v7.s[0] +mul v21.4S, v21.4S,v0.s[0] +str q6, [x0, #384] +str q29, [x0, #400] +str q3, [x0, #416] +str q18, [x0, #432] +mla v4.4S, v20.4S, v31.s[0] +ldr q20, [x0, #512] +sqrdmulh v18.4S, v28.4S, v16.s[0] +ldr q3, [x0, #528] +mul v28.4S, v28.4S,v10.s[0] +str q8, [x0, #448] +str q17, [x0, #464] +str q9, [x0, #480] +str q19, [x0, #496] +mla v21.4S, v1.4S, v31.s[0] +sub v1.4s, v20.4s, v4.4s +add v20.4s, v20.4s, v4.4s +sqrdmulh v4.4S, v12.4S, v16.s[0] +ldr q19, [x0, #672] +mul v12.4S, v12.4S,v10.s[0] +ldr q9, [x0, #688] +mla v28.4S, v18.4S, v31.s[0] +ldr q18, [x0, #576] +sub v17.4s, v3.4s, v21.4s +add v3.4s, v3.4s, v21.4s +sqrdmulh v21.4S, v3.4S, v7.s[1] +ldr q8, [x0, #640] +mul v3.4S, v3.4S,v0.s[1] +ldr q29, [x0, #656] +mla v12.4S, v4.4S, v31.s[0] +ldr q4, [x0, #592] +sub v6.4s, v18.4s, v28.4s +add v18.4s, v18.4s, v28.4s +sqrdmulh v28.4S, v17.4S, v7.s[2] +ldr q14, [x17, #+1408] +mul v17.4S, v17.4S,v0.s[2] +ldr q11, [x17, #+1424] +mla v3.4S, v21.4S, v31.s[0] +sub v21.4s, v4.4s, v12.4s +add v4.4s, v4.4s, v12.4s +sqrdmulh v12.4S, v4.4S, 
v16.s[1] +ldr q15, [x0, #736] +mul v4.4S, v4.4S,v10.s[1] +ldr q2, [x0, #752] +mla v17.4S, v28.4S, v31.s[0] +sub v28.4s, v20.4s, v3.4s +add v20.4s, v20.4s, v3.4s +sqrdmulh v7.4S, v21.4S, v16.s[2] +ldr q0, [x0, #704] +mul v21.4S, v21.4S,v10.s[2] +ldr q3, [x0, #720] +mla v4.4S, v12.4S, v31.s[0] +sub v12.4s, v1.4s, v17.4s +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v19.4S, v11.s[0] +ldr q30, [x17, #+1536] +mul v19.4S, v19.4S,v14.s[0] +ldr q22, [x17, #+1552] +trn1 v5.4S, v20.4S, v28.4S +trn2 v13.4S, v20.4S, v28.4S +mla v21.4S, v7.4S, v31.s[0] +sub v7.4s, v18.4s, v4.4s +add v18.4s, v18.4s, v4.4s +sqrdmulh v16.4S, v9.4S, v11.s[0] +mul v9.4S, v9.4S,v14.s[0] +trn1 v10.4S, v1.4S, v12.4S +trn2 v4.4S, v1.4S, v12.4S +mla v19.4S, v17.4S, v31.s[0] +sub v17.4s, v6.4s, v21.4s +add v6.4s, v6.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v22.s[0] +mul v15.4S, v15.4S,v30.s[0] +trn2 v1.2D, v5.2D, v10.2D +trn2 v12.2D, v13.2D, v4.2D +mla v9.4S, v16.4S, v31.s[0] +sub v16.4s, v8.4s, v19.4s +add v8.4s, v8.4s, v19.4s +sqrdmulh v19.4S, v2.4S, v22.s[0] +mul v2.4S, v2.4S,v30.s[0] +trn1 v20.2D, v5.2D, v10.2D +trn1 v28.2D, v13.2D, v4.2D +mla v15.4S, v21.4S, v31.s[0] +sub v21.4s, v29.4s, v9.4s +add v29.4s, v29.4s, v9.4s +sqrdmulh v9.4S, v29.4S, v11.s[1] +mul v29.4S, v29.4S,v14.s[1] +trn1 v4.4S, v18.4S, v7.4S +trn2 v13.4S, v18.4S, v7.4S +mla v2.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v15.4s +add v0.4s, v0.4s, v15.4s +sqrdmulh v15.4S, v21.4S, v11.s[2] +mul v21.4S, v21.4S,v14.s[2] +trn1 v10.4S, v6.4S, v17.4S +trn2 v5.4S, v6.4S, v17.4S +mla v29.4S, v9.4S, v31.s[0] +sub v9.4s, v3.4s, v2.4s +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v22.s[1] +mul v3.4S, v3.4S,v30.s[1] +trn2 v6.2D, v4.2D, v10.2D +trn2 v17.2D, v13.2D, v5.2D +mla v21.4S, v15.4S, v31.s[0] +sub v15.4s, v8.4s, v29.4s +add v8.4s, v8.4s, v29.4s +sqrdmulh v11.4S, v9.4S, v22.s[2] +mul v9.4S, v9.4S,v30.s[2] +trn1 v18.2D, v4.2D, v10.2D +trn1 v7.2D, v13.2D, v5.2D +mla v3.4S, v2.4S, v31.s[0] +sub v2.4s, v16.4s, v21.4s +add v16.4s, v16.4s, v21.4s +mla 
v9.4S, v11.4S, v31.s[0] +sub v11.4s, v0.4s, v3.4s +add v0.4s, v0.4s, v3.4s +sub v22.4s, v19.4s, v9.4s +add v19.4s, v19.4s, v9.4s +ldr q9, [x17, #+1184] +ldr q30, [x17, #+1200] +sqrdmulh v3.4S, v1.4S, v30.4S +mul v1.4S, v1.4S,v9.4S +trn1 v21.4S, v8.4S, v15.4S +trn2 v5.4S, v8.4S, v15.4S +sqrdmulh v13.4S, v12.4S, v30.4S +mul v12.4S, v12.4S,v9.4S +trn1 v30.4S, v16.4S, v2.4S +trn2 v9.4S, v16.4S, v2.4S +mla v1.4S, v3.4S, v31.s[0] +ldr q3, [x17, #+1312] +ldr q10, [x17, #+1328] +sqrdmulh v4.4S, v6.4S, v10.4S +mul v6.4S, v6.4S,v3.4S +trn2 v16.2D, v21.2D, v30.2D +trn2 v2.2D, v5.2D, v9.2D +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v1.4s +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v10.4S +mul v17.4S, v17.4S,v3.4S +trn1 v8.2D, v21.2D, v30.2D +trn1 v15.2D, v5.2D, v9.2D +mla v6.4S, v4.4S, v31.s[0] +sub v4.4s, v28.4s, v12.4s +add v28.4s, v28.4s, v12.4s +ldr q12, [x17, #+1216] +ldr q9, [x17, #+1232] +sqrdmulh v5.4S, v28.4S, v9.4S +mul v28.4S, v28.4S,v12.4S +trn1 v9.4S, v0.4S, v11.4S +trn2 v12.4S, v0.4S, v11.4S +mla v17.4S, v1.4S, v31.s[0] +sub v1.4s, v18.4s, v6.4s +add v18.4s, v18.4s, v6.4s +ldr q6, [x17, #+1248] +ldr q30, [x17, #+1264] +sqrdmulh v21.4S, v4.4S, v30.4S +mul v4.4S, v4.4S,v6.4S +trn1 v30.4S, v19.4S, v22.4S +trn2 v6.4S, v19.4S, v22.4S +mla v28.4S, v5.4S, v31.s[0] +sub v5.4s, v7.4s, v17.4s +add v7.4s, v7.4s, v17.4s +ldr q17, [x17, #+1344] +ldr q10, [x17, #+1360] +sqrdmulh v3.4S, v7.4S, v10.4S +mul v7.4S, v7.4S,v17.4S +trn2 v19.2D, v9.2D, v30.2D +trn2 v22.2D, v12.2D, v6.2D +mla v4.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v28.4s +add v20.4s, v20.4s, v28.4s +ldr q28, [x17, #+1376] +ldr q10, [x17, #+1392] +sqrdmulh v17.4S, v5.4S, v10.4S +mul v5.4S, v5.4S,v28.4S +trn1 v0.2D, v9.2D, v30.2D +trn1 v11.2D, v12.2D, v6.2D +mla v7.4S, v3.4S, v31.s[0] +sub v3.4s, v13.4s, v4.4s +add v13.4s, v13.4s, v4.4s +mla v5.4S, v17.4S, v31.s[0] +sub v17.4s, v18.4s, v7.4s +add v18.4s, v18.4s, v7.4s +sub v7.4s, v1.4s, v5.4s +add v1.4s, v1.4s, v5.4s +ldr q5, [x17, #+1440] +ldr q4, 
[x17, #+1456] +sqrdmulh v6.4S, v16.4S, v4.4S +mul v16.4S, v16.4S,v5.4S +str q20, [x0, #512] +sqrdmulh v20.4S, v2.4S, v4.4S +str q21, [x0, #528] +mul v2.4S, v2.4S,v5.4S +str q13, [x0, #544] +mla v16.4S, v6.4S, v31.s[0] +ldr q6, [x17, #+1568] +ldr q13, [x17, #+1584] +sqrdmulh v4.4S, v19.4S, v13.4S +str q18, [x0, #576] +mul v19.4S, v19.4S,v6.4S +str q3, [x0, #560] +mla v2.4S, v20.4S, v31.s[0] +str q17, [x0, #592] +sub v17.4s, v8.4s, v16.4s +add v8.4s, v8.4s, v16.4s +sqrdmulh v16.4S, v22.4S, v13.4S +mul v22.4S, v22.4S,v6.4S +str q1, [x0, #608] +mla v19.4S, v4.4S, v31.s[0] +sub v4.4s, v15.4s, v2.4s +add v15.4s, v15.4s, v2.4s +ldr q2, [x17, #+1472] +ldr q1, [x17, #+1488] +sqrdmulh v13.4S, v15.4S, v1.4S +mul v15.4S, v15.4S,v2.4S +str q7, [x0, #624] +mla v22.4S, v16.4S, v31.s[0] +sub v16.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +ldr q19, [x17, #+1504] +ldr q7, [x17, #+1520] +sqrdmulh v1.4S, v4.4S, v7.4S +mul v4.4S, v4.4S,v19.4S +mla v15.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +ldr q22, [x17, #+1600] +ldr q7, [x17, #+1616] +sqrdmulh v19.4S, v11.4S, v7.4S +mul v11.4S, v11.4S,v22.4S +mla v4.4S, v1.4S, v31.s[0] +sub v1.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +ldr q15, [x17, #+1632] +ldr q7, [x17, #+1648] +sqrdmulh v22.4S, v13.4S, v7.4S +mul v13.4S, v13.4S,v15.4S +ldr q7, [x0, #800] +mla v11.4S, v19.4S, v31.s[0] +ldr q19, [x0, #816] +sub v15.4s, v17.4s, v4.4s +ldr q2, [x17, #+1664] +add v17.4s, v17.4s, v4.4s +ldr q4, [x17, #+1680] +mla v13.4S, v22.4S, v31.s[0] +ldr q22, [x17, #+1792] +sub v6.4s, v0.4s, v11.4s +ldr q20, [x0, #864] +add v0.4s, v0.4s, v11.4s +ldr q11, [x17, #+1808] +sub v3.4s, v16.4s, v13.4s +ldr q18, [x0, #880] +add v16.4s, v16.4s, v13.4s +sqrdmulh v13.4S, v7.4S, v4.s[0] +mul v7.4S, v7.4S,v2.s[0] +sqrdmulh v5.4S, v19.4S, v4.s[0] +mul v19.4S, v19.4S,v2.s[0] +str q8, [x0, #640] +str q1, [x0, #656] +str q17, [x0, #672] +str q15, [x0, #688] +mla v7.4S, v13.4S, v31.s[0] +ldr q13, [x0, #768] +sqrdmulh v15.4S, v20.4S, v11.s[0] 
+ldr q17, [x0, #784] +mul v20.4S, v20.4S,v22.s[0] +str q0, [x0, #704] +str q6, [x0, #720] +str q16, [x0, #736] +str q3, [x0, #752] +mla v19.4S, v5.4S, v31.s[0] +sub v5.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +sqrdmulh v7.4S, v18.4S, v11.s[0] +ldr q3, [x0, #928] +mul v18.4S, v18.4S,v22.s[0] +ldr q16, [x0, #944] +mla v20.4S, v15.4S, v31.s[0] +ldr q15, [x0, #832] +sub v6.4s, v17.4s, v19.4s +add v17.4s, v17.4s, v19.4s +sqrdmulh v19.4S, v17.4S, v4.s[1] +ldr q0, [x0, #896] +mul v17.4S, v17.4S,v2.s[1] +ldr q1, [x0, #912] +mla v18.4S, v7.4S, v31.s[0] +ldr q7, [x0, #848] +sub v8.4s, v15.4s, v20.4s +add v15.4s, v15.4s, v20.4s +sqrdmulh v20.4S, v6.4S, v4.s[2] +ldr q21, [x17, #+1920] +mul v6.4S, v6.4S,v2.s[2] +ldr q12, [x17, #+1936] +mla v17.4S, v19.4S, v31.s[0] +sub v19.4s, v7.4s, v18.4s +add v7.4s, v7.4s, v18.4s +sqrdmulh v18.4S, v7.4S, v11.s[1] +ldr q30, [x0, #992] +mul v7.4S, v7.4S,v22.s[1] +ldr q9, [x0, #1008] +mla v6.4S, v20.4S, v31.s[0] +sub v20.4s, v13.4s, v17.4s +add v13.4s, v13.4s, v17.4s +sqrdmulh v4.4S, v19.4S, v11.s[2] +ldr q2, [x0, #960] +mul v19.4S, v19.4S,v22.s[2] +ldr q17, [x0, #976] +mla v7.4S, v18.4S, v31.s[0] +sub v18.4s, v5.4s, v6.4s +add v5.4s, v5.4s, v6.4s +sqrdmulh v6.4S, v3.4S, v12.s[0] +ldr q10, [x17, #+2048] +mul v3.4S, v3.4S,v21.s[0] +ldr q28, [x17, #+2064] +trn1 v14.4S, v13.4S, v20.4S +trn2 v29.4S, v13.4S, v20.4S +mla v19.4S, v4.4S, v31.s[0] +sub v4.4s, v15.4s, v7.4s +add v15.4s, v15.4s, v7.4s +sqrdmulh v11.4S, v16.4S, v12.s[0] +mul v16.4S, v16.4S,v21.s[0] +trn1 v22.4S, v5.4S, v18.4S +trn2 v7.4S, v5.4S, v18.4S +mla v3.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v19.4s +add v8.4s, v8.4s, v19.4s +sqrdmulh v19.4S, v30.4S, v28.s[0] +mul v30.4S, v30.4S,v10.s[0] +trn2 v5.2D, v14.2D, v22.2D +trn2 v18.2D, v29.2D, v7.2D +mla v16.4S, v11.4S, v31.s[0] +sub v11.4s, v0.4s, v3.4s +add v0.4s, v0.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v28.s[0] +mul v9.4S, v9.4S,v10.s[0] +trn1 v13.2D, v14.2D, v22.2D +trn1 v20.2D, v29.2D, v7.2D +mla v30.4S, v19.4S, v31.s[0] +sub v19.4s, 
v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +sqrdmulh v16.4S, v1.4S, v12.s[1] +mul v1.4S, v1.4S,v21.s[1] +trn1 v7.4S, v15.4S, v4.4S +trn2 v29.4S, v15.4S, v4.4S +mla v9.4S, v3.4S, v31.s[0] +sub v3.4s, v2.4s, v30.4s +add v2.4s, v2.4s, v30.4s +sqrdmulh v30.4S, v19.4S, v12.s[2] +mul v19.4S, v19.4S,v21.s[2] +trn1 v22.4S, v8.4S, v6.4S +trn2 v14.4S, v8.4S, v6.4S +mla v1.4S, v16.4S, v31.s[0] +sub v16.4s, v17.4s, v9.4s +add v17.4s, v17.4s, v9.4s +sqrdmulh v9.4S, v17.4S, v28.s[1] +mul v17.4S, v17.4S,v10.s[1] +trn2 v8.2D, v7.2D, v22.2D +trn2 v6.2D, v29.2D, v14.2D +mla v19.4S, v30.4S, v31.s[0] +sub v30.4s, v0.4s, v1.4s +add v0.4s, v0.4s, v1.4s +sqrdmulh v12.4S, v16.4S, v28.s[2] +mul v16.4S, v16.4S,v10.s[2] +trn1 v15.2D, v7.2D, v22.2D +trn1 v4.2D, v29.2D, v14.2D +mla v17.4S, v9.4S, v31.s[0] +sub v9.4s, v11.4s, v19.4s +add v11.4s, v11.4s, v19.4s +mla v16.4S, v12.4S, v31.s[0] +sub v12.4s, v2.4s, v17.4s +add v2.4s, v2.4s, v17.4s +sub v28.4s, v3.4s, v16.4s +add v3.4s, v3.4s, v16.4s +ldr q16, [x17, #+1696] +ldr q10, [x17, #+1712] +sqrdmulh v17.4S, v5.4S, v10.4S +mul v5.4S, v5.4S,v16.4S +trn1 v19.4S, v0.4S, v30.4S +trn2 v14.4S, v0.4S, v30.4S +sqrdmulh v29.4S, v18.4S, v10.4S +mul v18.4S, v18.4S,v16.4S +trn1 v10.4S, v11.4S, v9.4S +trn2 v16.4S, v11.4S, v9.4S +mla v5.4S, v17.4S, v31.s[0] +ldr q17, [x17, #+1824] +ldr q22, [x17, #+1840] +sqrdmulh v7.4S, v8.4S, v22.4S +mul v8.4S, v8.4S,v17.4S +trn2 v11.2D, v19.2D, v10.2D +trn2 v9.2D, v14.2D, v16.2D +mla v18.4S, v29.4S, v31.s[0] +sub v29.4s, v13.4s, v5.4s +add v13.4s, v13.4s, v5.4s +sqrdmulh v5.4S, v6.4S, v22.4S +mul v6.4S, v6.4S,v17.4S +trn1 v0.2D, v19.2D, v10.2D +trn1 v30.2D, v14.2D, v16.2D +mla v8.4S, v7.4S, v31.s[0] +sub v7.4s, v20.4s, v18.4s +add v20.4s, v20.4s, v18.4s +ldr q18, [x17, #+1728] +ldr q16, [x17, #+1744] +sqrdmulh v14.4S, v20.4S, v16.4S +mul v20.4S, v20.4S,v18.4S +trn1 v16.4S, v2.4S, v12.4S +trn2 v18.4S, v2.4S, v12.4S +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v15.4s, v8.4s +add v15.4s, v15.4s, v8.4s +ldr q8, [x17, #+1760] +ldr 
q10, [x17, #+1776] +sqrdmulh v19.4S, v7.4S, v10.4S +mul v7.4S, v7.4S,v8.4S +trn1 v10.4S, v3.4S, v28.4S +trn2 v8.4S, v3.4S, v28.4S +mla v20.4S, v14.4S, v31.s[0] +sub v14.4s, v4.4s, v6.4s +add v4.4s, v4.4s, v6.4s +ldr q6, [x17, #+1856] +ldr q22, [x17, #+1872] +sqrdmulh v17.4S, v4.4S, v22.4S +mul v4.4S, v4.4S,v6.4S +trn2 v3.2D, v16.2D, v10.2D +trn2 v28.2D, v18.2D, v8.2D +mla v7.4S, v19.4S, v31.s[0] +sub v19.4s, v13.4s, v20.4s +add v13.4s, v13.4s, v20.4s +ldr q20, [x17, #+1888] +ldr q22, [x17, #+1904] +sqrdmulh v6.4S, v14.4S, v22.4S +mul v14.4S, v14.4S,v20.4S +trn1 v2.2D, v16.2D, v10.2D +trn1 v12.2D, v18.2D, v8.2D +mla v4.4S, v17.4S, v31.s[0] +sub v17.4s, v29.4s, v7.4s +add v29.4s, v29.4s, v7.4s +mla v14.4S, v6.4S, v31.s[0] +sub v6.4s, v15.4s, v4.4s +add v15.4s, v15.4s, v4.4s +sub v4.4s, v5.4s, v14.4s +add v5.4s, v5.4s, v14.4s +ldr q14, [x17, #+1952] +ldr q7, [x17, #+1968] +sqrdmulh v8.4S, v11.4S, v7.4S +mul v11.4S, v11.4S,v14.4S +str q13, [x0, #768] +sqrdmulh v13.4S, v9.4S, v7.4S +str q19, [x0, #784] +mul v9.4S, v9.4S,v14.4S +str q29, [x0, #800] +mla v11.4S, v8.4S, v31.s[0] +ldr q8, [x17, #+2080] +ldr q29, [x17, #+2096] +sqrdmulh v7.4S, v3.4S, v29.4S +str q15, [x0, #832] +mul v3.4S, v3.4S,v8.4S +str q17, [x0, #816] +mla v9.4S, v13.4S, v31.s[0] +str q6, [x0, #848] +sub v6.4s, v0.4s, v11.4s +add v0.4s, v0.4s, v11.4s +sqrdmulh v11.4S, v28.4S, v29.4S +mul v28.4S, v28.4S,v8.4S +str q5, [x0, #864] +mla v3.4S, v7.4S, v31.s[0] +sub v7.4s, v30.4s, v9.4s +add v30.4s, v30.4s, v9.4s +ldr q9, [x17, #+1984] +ldr q5, [x17, #+2000] +sqrdmulh v29.4S, v30.4S, v5.4S +mul v30.4S, v30.4S,v9.4S +str q4, [x0, #880] +mla v28.4S, v11.4S, v31.s[0] +sub v11.4s, v2.4s, v3.4s +add v2.4s, v2.4s, v3.4s +ldr q3, [x17, #+2016] +ldr q4, [x17, #+2032] +sqrdmulh v5.4S, v7.4S, v4.4S +mul v7.4S, v7.4S,v3.4S +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v12.4s, v28.4s +add v12.4s, v12.4s, v28.4s +ldr q28, [x17, #+2112] +ldr q4, [x17, #+2128] +sqrdmulh v3.4S, v12.4S, v4.4S +mul v12.4S, v12.4S,v28.4S +mla 
v7.4S, v5.4S, v31.s[0] +sub v5.4s, v0.4s, v30.4s +add v0.4s, v0.4s, v30.4s +ldr q30, [x17, #+2144] +ldr q4, [x17, #+2160] +sqrdmulh v28.4S, v29.4S, v4.4S +mul v29.4S, v29.4S,v30.4S +mla v12.4S, v3.4S, v31.s[0] +sub v3.4s, v6.4s, v7.4s +add v6.4s, v6.4s, v7.4s +mla v29.4S, v28.4S, v31.s[0] +sub v28.4s, v2.4s, v12.4s +add v2.4s, v2.4s, v12.4s +sub v12.4s, v11.4s, v29.4s +add v11.4s, v11.4s, v29.4s +str q0, [x0, #896] +str q5, [x0, #912] +str q6, [x0, #928] +str q3, [x0, #944] +str q2, [x0, #960] +str q28, [x0, #976] +str q11, [x0, #992] +str q12, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z4_4.s b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z4_4.s new file mode 100644 index 0000000..39f413d --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_3_z4_4.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. 
+/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: // Holds -33556993 (the modulus, negated) in word 0, zero-filled to 16 bytes; loaded into q31 and used as v31.s[0] by the mla steps +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: // 64-byte aligned table of 4-word rows laid out in pairs: a row of roots (mul operand), then a row of the matching sqrdmulh operands (each roughly root * 2^31 / 33556993) +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Zero padding (fourth word of the row) +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Zero padding (fourth word of the row) +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Zero padding (fourth word of the row)
+.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Zero padding (fourth word of the row) +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Zero padding (fourth word of the row) +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Zero padding (fourth word of the row) +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer
7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Zero padding (fourth word of the row) +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Zero padding (fourth word of the row) +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Zero padding (fourth word of the row) +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Zero padding (fourth word of the row) +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000
// Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Zero padding (fourth word of the row) +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Zero padding (fourth word of the row) +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Zero padding (fourth word of the row) +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10
+.word 497236673 // Layer 5, block 11 +.word 0 // Zero padding (fourth word of the row) +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Zero padding (fourth word of the row) +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Zero padding (fourth word of the row) +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 //
Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Zero padding (fourth word of the row) +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Zero padding (fourth word of the row) +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Zero padding (fourth word of the row) +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Zero padding (fourth word of the row) +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7,
block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Zero padding (fourth word of the row) +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Zero padding (fourth word of the row) +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Zero padding (fourth word of the row) +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word
839188878 // Layer 5, block 21 +.word 0 // Zero padding (fourth word of the row) +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 +.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Zero padding (fourth word of the row) +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Zero padding (fourth word of the row) +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7,
block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Zero padding (fourth word of the row) +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Zero padding (fourth word of the row) +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Zero padding (fourth word of the row) +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Zero padding (fourth word of the row) +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block
104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Zero padding (fourth word of the row) +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Zero padding (fourth word of the row) +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Zero padding (fourth word of the row) +.word 542121183 // Layer 4, block 15 +.word 604481480
// Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Zero padding (fourth word of the row) +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_3_z4_4 +.global _ntt_u32_full_neon_asm_var_4_4_3_z4_4 +ntt_u32_full_neon_asm_var_4_4_3_z4_4: +_ntt_u32_full_neon_asm_var_4_4_3_z4_4: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] // NOTE(review): repeats the store on the previous line; appears redundant +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +sqrdmulh
v2.4S, v22.4S, v29.s[0] +ldr q1, [x0, #544] +mul v22.4S, v22.4S,v30.s[0] +ldr q0, [x0, #608] +sqrdmulh v15.4S, v21.4S, v29.s[0] +ldr q14, [x0, #672] +mul v21.4S, v21.4S,v30.s[0] +ldr q13, [x0, #736] +mla v22.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q12, [x0, #32] +sub v11.4s, v18.4s, v22.4s +mla v21.4S, v15.4S, v31.s[0] +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q15, [x0, #96] +sub v10.4s, v17.4s, v21.4s +mla v20.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v29.s[0] +ldr q2, [x0, #160] +mul v1.4S, v1.4S,v30.s[0] +sub v9.4s, v16.4s, v20.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v29.s[0] +ldr q22, [x0, #224] +mul v0.4S, v0.4S,v30.s[0] +sub v8.4s, v3.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v12.4s, v1.4s +mla v0.4S, v20.4S, v31.s[0] +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +sub v20.4s, v15.4s, v0.4s +mla v14.4S, v19.4S, v31.s[0] +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v19.4s, v2.4s, v14.4s +mla v13.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v1.4s, v22.4s, v13.4s +mla v16.4S, v0.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v0.4s, v2.4s, v16.4s +mla v3.4S, v14.4S, v31.s[0] +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v22.4s, v3.4s +mla v18.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v13.4s, v12.4s, v18.4s +mla v17.4S, v16.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v16.4s, v15.4s, 
v17.4s +mla v9.4S, v3.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v3.4s, v19.4s, v9.4s +mla v8.4S, v18.4S, v31.s[0] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v8.4s +mla v11.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v17.4s, v21.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +sub v9.4s, v20.4s, v10.4s +mla v2.4S, v8.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v8.4s, v12.4s, v2.4s +mla v22.4S, v11.4S, v31.s[0] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v27.s[1] +mul v14.4S, v14.4S,v28.s[1] +sub v11.4s, v15.4s, v22.4s +mla v0.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v28.s[2] +sub v10.4s, v13.4s, v0.4s +mla v14.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v2.4s, v16.4s, v14.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +sub v22.4s, v21.4s, v19.4s +mla v1.4S, v0.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v0.4s, v20.4s, v1.4s +mla v3.4S, v14.4S, v31.s[0] +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +sub v14.4s, v17.4s, v3.4s +mla v18.4S, v19.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v25.s[1] +mul v11.4S, v11.4S,v26.s[1] +sub v19.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v1.4s, v12.4s, v15.4s +mla v11.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v25.s[3] +mul 
v2.4S, v2.4S,v26.s[3] +sub v3.4s, v8.4s, v11.4s +mla v16.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v11.4s +str q12, [x0, #32] +sqrdmulh v12.4S, v20.4S, v23.s[0] +str q1, [x0, #96] +mul v20.4S, v20.4S,v24.s[0] +ldr q1, [x0, #816] +sub v11.4s, v13.4s, v16.4s +ldr q18, [x0, #880] +mla v2.4S, v15.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +str q8, [x0, #160] +sqrdmulh v8.4S, v0.4S, v23.s[1] +str q3, [x0, #224] +mul v0.4S, v0.4S,v24.s[1] +ldr q3, [x0, #944] +sub v16.4s, v10.4s, v2.4s +ldr q15, [x0, #1008] +mla v20.4S, v12.4S, v31.s[0] +add v10.4s, v10.4s, v2.4s +str q13, [x0, #288] +sqrdmulh v13.4S, v9.4S, v23.s[2] +str q11, [x0, #352] +mul v9.4S, v9.4S,v24.s[2] +ldr q11, [x0, #304] +sub v2.4s, v21.4s, v20.4s +ldr q12, [x0, #368] +mla v0.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v20.4s +str q10, [x0, #416] +sqrdmulh v10.4S, v19.4S, v23.s[3] +str q16, [x0, #480] +mul v19.4S, v19.4S,v24.s[3] +ldr q16, [x0, #432] +sub v20.4s, v22.4s, v0.4s +ldr q8, [x0, #496] +mla v9.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +str q21, [x0, #544] +sqrdmulh v21.4S, v1.4S, v29.s[0] +str q2, [x0, #608] +ldr q2, [x0, #560] +mul v1.4S, v1.4S,v30.s[0] +ldr q0, [x0, #624] +sub v13.4s, v17.4s, v9.4s +mla v19.4S, v10.4S, v31.s[0] +add v17.4s, v17.4s, v9.4s +str q22, [x0, #672] +sqrdmulh v22.4S, v18.4S, v29.s[0] +str q20, [x0, #736] +ldr q20, [x0, #688] +mul v18.4S, v18.4S,v30.s[0] +ldr q9, [x0, #752] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +str q17, [x0, #800] +sqrdmulh v17.4S, v3.4S, v29.s[0] +str q13, [x0, #864] +mul v3.4S, v3.4S,v30.s[0] +ldr q13, [x0, #48] +sub v19.4s, v11.4s, v1.4s +mla v18.4S, v22.4S, v31.s[0] +add v11.4s, v11.4s, v1.4s +str q14, [x0, #928] +sqrdmulh v14.4S, v15.4S, v29.s[0] +str q10, [x0, #992] +mul v15.4S, v15.4S,v30.s[0] +ldr q10, [x0, #112] +sub v1.4s, v12.4s, v18.4s +mla v3.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v29.s[0] +ldr q17, [x0, #176] +mul v2.4S, v2.4S,v30.s[0] +sub v22.4s, v16.4s, 
v3.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v29.s[0] +ldr q14, [x0, #240] +mul v0.4S, v0.4S,v30.s[0] +sub v21.4s, v8.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v18.4s, v13.4s, v2.4s +mla v0.4S, v3.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +sub v3.4s, v10.4s, v0.4s +mla v20.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v15.4s, v17.4s, v20.4s +mla v9.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v2.4s, v14.4s, v9.4s +mla v16.4S, v0.4S, v31.s[0] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v29.s[1] +mul v11.4S, v11.4S,v30.s[1] +sub v0.4s, v17.4s, v16.4s +mla v8.4S, v20.4S, v31.s[0] +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v29.s[1] +mul v12.4S, v12.4S,v30.s[1] +sub v20.4s, v14.4s, v8.4s +mla v11.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v9.4s, v13.4s, v11.4s +mla v12.4S, v16.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v16.4s, v10.4s, v12.4s +mla v22.4S, v8.4S, v31.s[0] +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v8.4s, v15.4s, v22.4s +mla v21.4S, v11.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +sub v11.4s, v2.4s, v21.4s +mla v19.4S, v12.4S, v31.s[0] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +sub v12.4s, v18.4s, v19.4s +mla v1.4S, v22.4S, v31.s[0] +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v22.4s, v3.4s, v1.4s +mla v17.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v1.4s +sqrdmulh v1.4S, 
v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v21.4s, v13.4s, v17.4s +mla v14.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +sub v19.4s, v10.4s, v14.4s +mla v0.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v27.s[2] +mul v15.4S, v15.4S,v28.s[2] +sub v1.4s, v9.4s, v0.4s +mla v20.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v17.4s, v16.4s, v20.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v14.4s, v18.4s, v15.4s +mla v2.4S, v0.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[3] +mul v11.4S, v11.4S,v28.s[3] +sub v0.4s, v3.4s, v2.4s +mla v8.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +sub v20.4s, v12.4s, v8.4s +mla v11.4S, v15.4S, v31.s[0] +add v12.4s, v12.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v15.4s, v22.4s, v11.4s +mla v10.4S, v2.4S, v31.s[0] +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v2.4s, v13.4s, v10.4s +mla v19.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v25.s[3] +mul v17.4S, v17.4S,v26.s[3] +sub v8.4s, v21.4s, v19.4s +mla v16.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +str q13, [x0, #48] +sqrdmulh v13.4S, v3.4S, v23.s[0] +str q2, [x0, #112] +mul v3.4S, v3.4S,v24.s[0] +ldr q2, [x0, #768] +sub v19.4s, v9.4s, v16.4s +ldr q11, [x0, #832] +mla v17.4S, v10.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +str q21, [x0, #176] +sqrdmulh v21.4S, v0.4S, v23.s[1] +str q8, [x0, #240] +mul v0.4S, v0.4S,v24.s[1] +ldr q8, [x0, #896] +sub v16.4s, v1.4s, v17.4s +ldr q10, [x0, #960] +mla v3.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +str q9, [x0, #304] +sqrdmulh v9.4S, v22.4S, v23.s[2] +str q19, [x0, #368] +mul v22.4S, 
v22.4S,v24.s[2] +ldr q19, [x0, #256] +sub v17.4s, v18.4s, v3.4s +ldr q13, [x0, #320] +mla v0.4S, v21.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +str q1, [x0, #432] +sqrdmulh v1.4S, v15.4S, v23.s[3] +str q16, [x0, #496] +mul v15.4S, v15.4S,v24.s[3] +ldr q16, [x0, #384] +sub v3.4s, v14.4s, v0.4s +ldr q21, [x0, #448] +mla v22.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +str q18, [x0, #560] +sqrdmulh v18.4S, v2.4S, v29.s[0] +str q17, [x0, #624] +ldr q17, [x0, #512] +mul v2.4S, v2.4S,v30.s[0] +ldr q0, [x0, #576] +sub v9.4s, v12.4s, v22.4s +mla v15.4S, v1.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s +str q14, [x0, #688] +sqrdmulh v14.4S, v11.4S, v29.s[0] +str q3, [x0, #752] +ldr q3, [x0, #640] +mul v11.4S, v11.4S,v30.s[0] +ldr q22, [x0, #704] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +str q12, [x0, #816] +sqrdmulh v12.4S, v8.4S, v29.s[0] +str q9, [x0, #880] +mul v8.4S, v8.4S,v30.s[0] +ldr q9, [x0, #0] +sub v15.4s, v19.4s, v2.4s +mla v11.4S, v14.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +str q20, [x0, #944] +sqrdmulh v20.4S, v10.4S, v29.s[0] +str q1, [x0, #1008] +mul v10.4S, v10.4S,v30.s[0] +ldr q1, [x0, #64] +sub v2.4s, v13.4s, v11.4s +mla v8.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v29.s[0] +ldr q12, [x0, #128] +mul v17.4S, v17.4S,v30.s[0] +sub v14.4s, v16.4s, v8.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v29.s[0] +ldr q20, [x0, #192] +mul v0.4S, v0.4S,v30.s[0] +sub v18.4s, v21.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +sub v11.4s, v9.4s, v17.4s +mla v0.4S, v8.4S, v31.s[0] +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v8.4s, v1.4s, v0.4s +mla v3.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v10.4s, v12.4s, v3.4s +mla v22.4S, v17.4S, v31.s[0] +add v12.4s, 
v12.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v17.4s, v20.4s, v22.4s +mla v16.4S, v0.4S, v31.s[0] +add v20.4s, v20.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +sub v0.4s, v12.4s, v16.4s +mla v21.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v3.4s, v20.4s, v21.4s +mla v19.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v22.4s, v9.4s, v19.4s +mla v13.4S, v16.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v16.4s, v1.4s, v13.4s +mla v14.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v21.4s, v10.4s, v14.4s +mla v18.4S, v19.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v19.4s, v17.4s, v18.4s +mla v15.4S, v13.4S, v31.s[0] +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v13.4s, v11.4s, v15.4s +mla v2.4S, v14.4S, v31.s[0] +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v27.s[0] +mul v20.4S, v20.4S,v28.s[0] +sub v14.4s, v8.4s, v2.4s +mla v12.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v18.4s, v9.4s, v12.4s +mla v20.4S, v15.4S, v31.s[0] +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +sub v15.4s, v1.4s, v20.4s +mla v0.4S, v2.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +sub v2.4s, v22.4s, v0.4s +mla v3.4S, v12.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v12.4s, v16.4s, v3.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, 
v11.4s, v10.4s +mla v17.4S, v0.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v27.s[3] +mul v19.4S, v19.4S,v28.s[3] +sub v0.4s, v8.4s, v17.4s +mla v21.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v17.4s +sqrdmulh v17.4S, v1.4S, v25.s[0] +mul v1.4S, v1.4S,v26.s[0] +sub v3.4s, v13.4s, v21.4s +mla v19.4S, v10.4S, v31.s[0] +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v25.s[1] +mul v15.4S, v15.4S,v26.s[1] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v17.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v17.4s, v9.4s, v1.4s +mla v15.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +sub v21.4s, v18.4s, v15.4s +mla v16.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +str q9, [x0, #0] +sqrdmulh v9.4S, v8.4S, v23.s[0] +str q17, [x0, #64] +mul v8.4S, v8.4S,v24.s[0] +ldr q17, [x0, #784] +sub v15.4s, v22.4s, v16.4s +ldr q19, [x0, #848] +mla v12.4S, v1.4S, v31.s[0] +add v22.4s, v22.4s, v16.4s +str q18, [x0, #128] +sqrdmulh v18.4S, v0.4S, v23.s[1] +str q21, [x0, #192] +mul v0.4S, v0.4S,v24.s[1] +ldr q21, [x0, #912] +sub v16.4s, v2.4s, v12.4s +ldr q1, [x0, #976] +mla v8.4S, v9.4S, v31.s[0] +add v2.4s, v2.4s, v12.4s +str q22, [x0, #256] +sqrdmulh v22.4S, v14.4S, v23.s[2] +str q15, [x0, #320] +mul v14.4S, v14.4S,v24.s[2] +ldr q15, [x0, #272] +sub v12.4s, v11.4s, v8.4s +ldr q9, [x0, #336] +mla v0.4S, v18.4S, v31.s[0] +add v11.4s, v11.4s, v8.4s +str q2, [x0, #384] +sqrdmulh v2.4S, v10.4S, v23.s[3] +str q16, [x0, #448] +mul v10.4S, v10.4S,v24.s[3] +ldr q16, [x0, #400] +sub v8.4s, v20.4s, v0.4s +ldr q18, [x0, #464] +mla v14.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v0.4s +str q11, [x0, #512] +sqrdmulh v11.4S, v17.4S, v29.s[0] +str q12, [x0, #576] +ldr q12, [x0, #528] +mul v17.4S, v17.4S,v30.s[0] +ldr q0, [x0, #592] +sub v22.4s, v13.4s, v14.4s +mla v10.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v14.4s +str q20, [x0, #640] +sqrdmulh v20.4S, v19.4S, 
v29.s[0] +str q8, [x0, #704] +ldr q8, [x0, #656] +mul v19.4S, v19.4S,v30.s[0] +ldr q14, [x0, #720] +sub v2.4s, v3.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v3.4s, v3.4s, v10.4s +str q13, [x0, #768] +sqrdmulh v13.4S, v21.4S, v29.s[0] +str q22, [x0, #832] +mul v21.4S, v21.4S,v30.s[0] +ldr q22, [x0, #16] +sub v10.4s, v15.4s, v17.4s +mla v19.4S, v20.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +str q3, [x0, #896] +sqrdmulh v3.4S, v1.4S, v29.s[0] +str q2, [x0, #960] +mul v1.4S, v1.4S,v30.s[0] +ldr q2, [x0, #80] +sub v17.4s, v9.4s, v19.4s +mla v21.4S, v13.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v12.4S, v29.s[0] +ldr q13, [x0, #144] +mul v12.4S, v12.4S,v30.s[0] +sub v20.4s, v16.4s, v21.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v29.s[0] +ldr q3, [x0, #208] +mul v0.4S, v0.4S,v30.s[0] +sub v11.4s, v18.4s, v1.4s +mla v12.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v19.4s, v22.4s, v12.4s +mla v0.4S, v21.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v2.4s, v0.4s +mla v8.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v1.4s, v13.4s, v8.4s +mla v14.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v12.4s, v3.4s, v14.4s +mla v16.4S, v0.4S, v31.s[0] +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +sub v0.4s, v13.4s, v16.4s +mla v18.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v9.4S, v29.s[1] +mul v9.4S, v9.4S,v30.s[1] +sub v8.4s, v3.4s, v18.4s +mla v15.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v14.4s, v22.4s, v15.4s +mla v9.4S, v16.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v29.s[2] +mul v11.4S, 
v11.4S,v30.s[2] +sub v16.4s, v2.4s, v9.4s +mla v20.4S, v18.4S, v31.s[0] +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v20.4s +mla v11.4S, v15.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v15.4s, v12.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +sub v9.4s, v19.4s, v10.4s +mla v17.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v27.s[0] +mul v3.4S, v3.4S,v28.s[0] +sub v20.4s, v21.4s, v17.4s +mla v13.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v11.4s, v22.4s, v13.4s +mla v3.4S, v10.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v10.4s, v2.4s, v3.4s +mla v0.4S, v17.4S, v31.s[0] +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v17.4s, v14.4s, v0.4s +mla v8.4S, v13.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +sub v13.4s, v16.4s, v8.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v3.4s, v19.4s, v1.4s +mla v12.4S, v0.4S, v31.s[0] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v0.4s, v21.4s, v12.4s +mla v18.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v25.s[1] +mul v10.4S, v10.4S,v26.s[1] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v12.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v12.4s, v22.4s, v2.4s +mla v10.4S, v18.4S, v31.s[0] +add v22.4s, v22.4s, v2.4s 
+sqrdmulh v2.4S, v13.4S, v25.s[3] +mul v13.4S, v13.4S,v26.s[3] +sub v18.4s, v11.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +str q22, [x0, #16] +sqrdmulh v22.4S, v21.4S, v23.s[0] +str q12, [x0, #80] +mul v21.4S, v21.4S,v24.s[0] +sub v12.4s, v14.4s, v16.4s +mla v13.4S, v2.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +str q11, [x0, #144] +sqrdmulh v11.4S, v0.4S, v23.s[1] +str q18, [x0, #208] +mul v0.4S, v0.4S,v24.s[1] +sub v18.4s, v17.4s, v13.4s +mla v21.4S, v22.4S, v31.s[0] +add v17.4s, v17.4s, v13.4s +str q14, [x0, #272] +sqrdmulh v14.4S, v20.4S, v23.s[2] +str q12, [x0, #336] +mul v20.4S, v20.4S,v24.s[2] +sub v12.4s, v19.4s, v21.4s +mla v0.4S, v11.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +str q17, [x0, #400] +sqrdmulh v17.4S, v1.4S, v23.s[3] +str q18, [x0, #464] +mul v1.4S, v1.4S,v24.s[3] +sub v18.4s, v3.4s, v0.4s +mla v20.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v0.4s +str q19, [x0, #528] +str q12, [x0, #592] +sub v12.4s, v9.4s, v20.4s +mla v1.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v20.4s +str q3, [x0, #656] +str q18, [x0, #720] +sub v18.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +str q9, [x0, #784] +str q12, [x0, #848] +str q8, [x0, #912] +str q18, [x0, #976] +ldr q4, [x0, #32] +ldr q5, [x0, #48] +ldr q6, [x17, #+128] +ldr q7, [x17, #+144] +ldr q15, [x17, #+256] +ldr q10, [x0, #96] +ldr q2, [x17, #+272] +ldr q16, [x0, #112] +sqrdmulh v22.4S, v4.4S, v7.s[0] +mul v4.4S, v4.4S,v6.s[0] +sqrdmulh v13.4S, v5.4S, v7.s[0] +mul v5.4S, v5.4S,v6.s[0] +mla v4.4S, v22.4S, v31.s[0] +ldr q22, [x0, #0] +sqrdmulh v11.4S, v10.4S, v2.s[0] +ldr q21, [x0, #16] +mul v10.4S, v10.4S,v15.s[0] +mla v5.4S, v13.4S, v31.s[0] +sub v13.4s, v22.4s, v4.4s +add v22.4s, v22.4s, v4.4s +sqrdmulh v4.4S, v16.4S, v2.s[0] +ldr q14, [x0, #160] +mul v16.4S, v16.4S,v15.s[0] +ldr q0, [x0, #176] +mla v10.4S, v11.4S, v31.s[0] +ldr q11, [x0, #64] +sub v19.4s, v21.4s, v5.4s +add v21.4s, v21.4s, v5.4s +sqrdmulh v5.4S, v21.4S, v7.s[1] +ldr q17, [x0, #128] +mul v21.4S, v21.4S,v6.s[1] +ldr q20, 
[x0, #144] +mla v16.4S, v4.4S, v31.s[0] +ldr q4, [x0, #80] +sub v3.4s, v11.4s, v10.4s +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v7.s[2] +ldr q1, [x17, #+384] +mul v19.4S, v19.4S,v6.s[2] +ldr q9, [x17, #+400] +mla v21.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v16.4s +add v4.4s, v4.4s, v16.4s +sqrdmulh v16.4S, v4.4S, v2.s[1] +ldr q12, [x0, #224] +mul v4.4S, v4.4S,v15.s[1] +ldr q8, [x0, #240] +mla v19.4S, v10.4S, v31.s[0] +sub v10.4s, v22.4s, v21.4s +add v22.4s, v22.4s, v21.4s +sqrdmulh v7.4S, v5.4S, v2.s[2] +ldr q6, [x0, #192] +mul v5.4S, v5.4S,v15.s[2] +ldr q21, [x0, #208] +mla v4.4S, v16.4S, v31.s[0] +sub v16.4s, v13.4s, v19.4s +add v13.4s, v13.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v9.s[0] +ldr q18, [x17, #+512] +mul v14.4S, v14.4S,v1.s[0] +ldr q30, [x17, #+528] +trn1 v29.4S, v22.4S, v10.4S +trn2 v28.4S, v22.4S, v10.4S +mla v5.4S, v7.4S, v31.s[0] +sub v7.4s, v11.4s, v4.4s +add v11.4s, v11.4s, v4.4s +sqrdmulh v2.4S, v0.4S, v9.s[0] +mul v0.4S, v0.4S,v1.s[0] +trn1 v15.4S, v13.4S, v16.4S +trn2 v4.4S, v13.4S, v16.4S +mla v14.4S, v19.4S, v31.s[0] +sub v19.4s, v3.4s, v5.4s +add v3.4s, v3.4s, v5.4s +sqrdmulh v5.4S, v12.4S, v30.s[0] +mul v12.4S, v12.4S,v18.s[0] +trn2 v13.2D, v29.2D, v15.2D +trn2 v16.2D, v28.2D, v4.2D +mla v0.4S, v2.4S, v31.s[0] +sub v2.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +sqrdmulh v14.4S, v8.4S, v30.s[0] +mul v8.4S, v8.4S,v18.s[0] +trn1 v22.2D, v29.2D, v15.2D +trn1 v10.2D, v28.2D, v4.2D +mla v12.4S, v5.4S, v31.s[0] +sub v5.4s, v20.4s, v0.4s +add v20.4s, v20.4s, v0.4s +sqrdmulh v0.4S, v20.4S, v9.s[1] +mul v20.4S, v20.4S,v1.s[1] +trn1 v4.4S, v11.4S, v7.4S +trn2 v28.4S, v11.4S, v7.4S +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v6.4s, v12.4s +add v6.4s, v6.4s, v12.4s +sqrdmulh v12.4S, v5.4S, v9.s[2] +mul v5.4S, v5.4S,v1.s[2] +trn1 v15.4S, v3.4S, v19.4S +trn2 v29.4S, v3.4S, v19.4S +ldr q27, [x17, #+160] +ldr q26, [x17, #+176] +mla v20.4S, v0.4S, v31.s[0] +sub v0.4s, v21.4s, v8.4s +add v21.4s, v21.4s, v8.4s +sqrdmulh v8.4S, v21.4S, v30.s[1] +mul 
v21.4S, v21.4S,v18.s[1] +trn2 v3.2D, v4.2D, v15.2D +trn2 v19.2D, v28.2D, v29.2D +mla v5.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v20.4s +add v17.4s, v17.4s, v20.4s +sqrdmulh v9.4S, v0.4S, v30.s[2] +mul v0.4S, v0.4S,v18.s[2] +trn1 v11.2D, v4.2D, v15.2D +trn1 v7.2D, v28.2D, v29.2D +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v2.4s, v5.4s +add v2.4s, v2.4s, v5.4s +sqrdmulh v5.4S, v13.4S, v26.4S +mul v13.4S, v13.4S,v27.4S +mla v0.4S, v9.4S, v31.s[0] +sub v9.4s, v6.4s, v21.4s +add v6.4s, v6.4s, v21.4s +ldr q30, [x17, #+288] +ldr q18, [x17, #+304] +sqrdmulh v21.4S, v16.4S, v26.4S +mul v16.4S, v16.4S,v27.4S +trn1 v29.4S, v17.4S, v12.4S +trn2 v28.4S, v17.4S, v12.4S +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v14.4s, v0.4s +add v14.4s, v14.4s, v0.4s +ldr q26, [x17, #+192] +ldr q27, [x17, #+208] +trn1 v0.4S, v2.4S, v8.4S +trn2 v15.4S, v2.4S, v8.4S +sqrdmulh v4.4S, v3.4S, v18.4S +mul v3.4S, v3.4S,v30.4S +trn2 v2.2D, v29.2D, v0.2D +trn2 v8.2D, v28.2D, v15.2D +ldr q1, [x17, #+224] +ldr q20, [x17, #+240] +mla v16.4S, v21.4S, v31.s[0] +sub v21.4s, v22.4s, v13.4s +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v19.4S, v18.4S +mul v19.4S, v19.4S,v30.4S +trn1 v17.2D, v29.2D, v0.2D +trn1 v12.2D, v28.2D, v15.2D +ldr q15, [x17, #+320] +ldr q28, [x17, #+336] +mla v3.4S, v4.4S, v31.s[0] +sub v4.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sqrdmulh v16.4S, v10.4S, v27.4S +mul v10.4S, v10.4S,v26.4S +trn1 v27.4S, v6.4S, v9.4S +trn2 v26.4S, v6.4S, v9.4S +ldr q0, [x17, #+352] +ldr q29, [x17, #+368] +mla v19.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v3.4s +add v11.4s, v11.4s, v3.4s +sqrdmulh v3.4S, v4.4S, v20.4S +mul v4.4S, v4.4S,v1.4S +trn1 v20.4S, v14.4S, v5.4S +trn2 v1.4S, v14.4S, v5.4S +mla v10.4S, v16.4S, v31.s[0] +sub v16.4s, v7.4s, v19.4s +add v7.4s, v7.4s, v19.4s +sqrdmulh v19.4S, v7.4S, v28.4S +mul v7.4S, v7.4S,v15.4S +ldr q18, [x17, #+416] +ldr q30, [x17, #+432] +trn2 v14.2D, v27.2D, v20.2D +trn2 v5.2D, v26.2D, v1.2D +mla v4.4S, v3.4S, v31.s[0] +sub v3.4s, v22.4s, v10.4s +add v22.4s, 
v22.4s, v10.4s +sqrdmulh v10.4S, v16.4S, v29.4S +mul v16.4S, v16.4S,v0.4S +trn1 v6.2D, v27.2D, v20.2D +trn1 v9.2D, v26.2D, v1.2D +mla v7.4S, v19.4S, v31.s[0] +sub v19.4s, v21.4s, v4.4s +add v21.4s, v21.4s, v4.4s +sqrdmulh v4.4S, v2.4S, v30.4S +ldr q1, [x17, #+544] +ldr q26, [x17, #+560] +mul v2.4S, v2.4S,v18.4S +str q22, [x0, #0] +str q3, [x0, #16] +mla v16.4S, v10.4S, v31.s[0] +sub v10.4s, v11.4s, v7.4s +add v11.4s, v11.4s, v7.4s +sqrdmulh v7.4S, v8.4S, v30.4S +mul v8.4S, v8.4S,v18.4S +str q21, [x0, #32] +mla v2.4S, v4.4S, v31.s[0] +sub v4.4s, v13.4s, v16.4s +add v13.4s, v13.4s, v16.4s +ldr q16, [x17, #+448] +ldr q21, [x17, #+464] +sqrdmulh v30.4S, v14.4S, v26.4S +str q11, [x0, #64] +mul v14.4S, v14.4S,v1.4S +str q19, [x0, #48] +mla v8.4S, v7.4S, v31.s[0] +str q10, [x0, #80] +sub v10.4s, v17.4s, v2.4s +add v17.4s, v17.4s, v2.4s +ldr q2, [x17, #+480] +ldr q7, [x17, #+496] +sqrdmulh v19.4S, v5.4S, v26.4S +mul v5.4S, v5.4S,v1.4S +str q13, [x0, #96] +mla v14.4S, v30.4S, v31.s[0] +sub v30.4s, v12.4s, v8.4s +add v12.4s, v12.4s, v8.4s +ldr q8, [x17, #+576] +ldr q13, [x17, #+592] +sqrdmulh v26.4S, v12.4S, v21.4S +mul v12.4S, v12.4S,v16.4S +ldr q21, [x17, #+608] +ldr q16, [x17, #+624] +str q4, [x0, #112] +mla v5.4S, v19.4S, v31.s[0] +sub v19.4s, v6.4s, v14.4s +add v6.4s, v6.4s, v14.4s +sqrdmulh v14.4S, v30.4S, v7.4S +mul v30.4S, v30.4S,v2.4S +mla v12.4S, v26.4S, v31.s[0] +sub v26.4s, v9.4s, v5.4s +add v9.4s, v9.4s, v5.4s +sqrdmulh v5.4S, v9.4S, v13.4S +mul v9.4S, v9.4S,v8.4S +ldr q13, [x0, #288] +mla v30.4S, v14.4S, v31.s[0] +ldr q14, [x0, #304] +sub v8.4s, v17.4s, v12.4s +add v17.4s, v17.4s, v12.4s +sqrdmulh v12.4S, v26.4S, v16.4S +ldr q7, [x17, #+640] +mul v26.4S, v26.4S,v21.4S +ldr q16, [x17, #+656] +mla v9.4S, v5.4S, v31.s[0] +ldr q5, [x17, #+768] +sub v21.4s, v10.4s, v30.4s +ldr q2, [x0, #352] +add v10.4s, v10.4s, v30.4s +ldr q30, [x17, #+784] +ldr q4, [x0, #368] +sqrdmulh v1.4S, v13.4S, v16.s[0] +mul v13.4S, v13.4S,v7.s[0] +mla v26.4S, v12.4S, v31.s[0] +sub v12.4s, 
v6.4s, v9.4s +add v6.4s, v6.4s, v9.4s +sqrdmulh v9.4S, v14.4S, v16.s[0] +mul v14.4S, v14.4S,v7.s[0] +str q17, [x0, #128] +str q8, [x0, #144] +str q10, [x0, #160] +str q21, [x0, #176] +mla v13.4S, v1.4S, v31.s[0] +sub v1.4s, v19.4s, v26.4s +add v19.4s, v19.4s, v26.4s +ldr q26, [x0, #256] +sqrdmulh v21.4S, v2.4S, v30.s[0] +ldr q10, [x0, #272] +mul v2.4S, v2.4S,v5.s[0] +str q6, [x0, #192] +str q12, [x0, #208] +str q19, [x0, #224] +str q1, [x0, #240] +mla v14.4S, v9.4S, v31.s[0] +sub v9.4s, v26.4s, v13.4s +add v26.4s, v26.4s, v13.4s +sqrdmulh v13.4S, v4.4S, v30.s[0] +ldr q1, [x0, #416] +mul v4.4S, v4.4S,v5.s[0] +ldr q19, [x0, #432] +mla v2.4S, v21.4S, v31.s[0] +ldr q21, [x0, #320] +sub v12.4s, v10.4s, v14.4s +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v10.4S, v16.s[1] +ldr q6, [x0, #384] +mul v10.4S, v10.4S,v7.s[1] +ldr q8, [x0, #400] +mla v4.4S, v13.4S, v31.s[0] +ldr q13, [x0, #336] +sub v17.4s, v21.4s, v2.4s +add v21.4s, v21.4s, v2.4s +sqrdmulh v2.4S, v12.4S, v16.s[2] +ldr q11, [x17, #+896] +mul v12.4S, v12.4S,v7.s[2] +ldr q18, [x17, #+912] +mla v10.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v4.4s +add v13.4s, v13.4s, v4.4s +sqrdmulh v4.4S, v13.4S, v30.s[1] +ldr q3, [x0, #480] +mul v13.4S, v13.4S,v5.s[1] +ldr q22, [x0, #496] +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v26.4s, v10.4s +add v26.4s, v26.4s, v10.4s +sqrdmulh v16.4S, v14.4S, v30.s[2] +ldr q7, [x0, #448] +mul v14.4S, v14.4S,v5.s[2] +ldr q10, [x0, #464] +mla v13.4S, v4.4S, v31.s[0] +sub v4.4s, v9.4s, v12.4s +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v1.4S, v18.s[0] +ldr q20, [x17, #+1024] +mul v1.4S, v1.4S,v11.s[0] +ldr q27, [x17, #+1040] +trn1 v29.4S, v26.4S, v2.4S +trn2 v0.4S, v26.4S, v2.4S +mla v14.4S, v16.4S, v31.s[0] +sub v16.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v30.4S, v19.4S, v18.s[0] +mul v19.4S, v19.4S,v11.s[0] +trn1 v5.4S, v9.4S, v4.4S +trn2 v13.4S, v9.4S, v4.4S +mla v1.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[0] 
+mul v3.4S, v3.4S,v20.s[0] +trn2 v9.2D, v29.2D, v5.2D +trn2 v4.2D, v0.2D, v13.2D +mla v19.4S, v30.4S, v31.s[0] +sub v30.4s, v6.4s, v1.4s +add v6.4s, v6.4s, v1.4s +sqrdmulh v1.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v20.s[0] +trn1 v26.2D, v29.2D, v5.2D +trn1 v2.2D, v0.2D, v13.2D +mla v3.4S, v14.4S, v31.s[0] +sub v14.4s, v8.4s, v19.4s +add v8.4s, v8.4s, v19.4s +sqrdmulh v19.4S, v8.4S, v18.s[1] +mul v8.4S, v8.4S,v11.s[1] +trn1 v13.4S, v21.4S, v16.4S +trn2 v0.4S, v21.4S, v16.4S +mla v22.4S, v1.4S, v31.s[0] +sub v1.4s, v7.4s, v3.4s +add v7.4s, v7.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v18.s[2] +mul v14.4S, v14.4S,v11.s[2] +trn1 v5.4S, v17.4S, v12.4S +trn2 v29.4S, v17.4S, v12.4S +ldr q28, [x17, #+672] +ldr q15, [x17, #+688] +mla v8.4S, v19.4S, v31.s[0] +sub v19.4s, v10.4s, v22.4s +add v10.4s, v10.4s, v22.4s +sqrdmulh v22.4S, v10.4S, v27.s[1] +mul v10.4S, v10.4S,v20.s[1] +trn2 v17.2D, v13.2D, v5.2D +trn2 v12.2D, v0.2D, v29.2D +mla v14.4S, v3.4S, v31.s[0] +sub v3.4s, v6.4s, v8.4s +add v6.4s, v6.4s, v8.4s +sqrdmulh v18.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v20.s[2] +trn1 v21.2D, v13.2D, v5.2D +trn1 v16.2D, v0.2D, v29.2D +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v30.4s, v14.4s +add v30.4s, v30.4s, v14.4s +sqrdmulh v14.4S, v9.4S, v15.4S +mul v9.4S, v9.4S,v28.4S +mla v19.4S, v18.4S, v31.s[0] +sub v18.4s, v7.4s, v10.4s +add v7.4s, v7.4s, v10.4s +ldr q27, [x17, #+800] +ldr q20, [x17, #+816] +sqrdmulh v10.4S, v4.4S, v15.4S +mul v4.4S, v4.4S,v28.4S +trn1 v29.4S, v6.4S, v3.4S +trn2 v0.4S, v6.4S, v3.4S +mla v9.4S, v14.4S, v31.s[0] +sub v14.4s, v1.4s, v19.4s +add v1.4s, v1.4s, v19.4s +ldr q15, [x17, #+704] +ldr q28, [x17, #+720] +trn1 v19.4S, v30.4S, v22.4S +trn2 v5.4S, v30.4S, v22.4S +sqrdmulh v13.4S, v17.4S, v20.4S +mul v17.4S, v17.4S,v27.4S +trn2 v30.2D, v29.2D, v19.2D +trn2 v22.2D, v0.2D, v5.2D +ldr q11, [x17, #+736] +ldr q8, [x17, #+752] +mla v4.4S, v10.4S, v31.s[0] +sub v10.4s, v26.4s, v9.4s +add v26.4s, v26.4s, v9.4s +sqrdmulh v9.4S, v12.4S, v20.4S +mul v12.4S, v12.4S,v27.4S 
+trn1 v6.2D, v29.2D, v19.2D +trn1 v3.2D, v0.2D, v5.2D +ldr q5, [x17, #+832] +ldr q0, [x17, #+848] +mla v17.4S, v13.4S, v31.s[0] +sub v13.4s, v2.4s, v4.4s +add v2.4s, v2.4s, v4.4s +sqrdmulh v4.4S, v2.4S, v28.4S +mul v2.4S, v2.4S,v15.4S +trn1 v28.4S, v7.4S, v18.4S +trn2 v15.4S, v7.4S, v18.4S +ldr q19, [x17, #+864] +ldr q29, [x17, #+880] +mla v12.4S, v9.4S, v31.s[0] +sub v9.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v13.4S, v8.4S +mul v13.4S, v13.4S,v11.4S +trn1 v8.4S, v1.4S, v14.4S +trn2 v11.4S, v1.4S, v14.4S +mla v2.4S, v4.4S, v31.s[0] +sub v4.4s, v16.4s, v12.4s +add v16.4s, v16.4s, v12.4s +sqrdmulh v12.4S, v16.4S, v0.4S +mul v16.4S, v16.4S,v5.4S +ldr q20, [x17, #+928] +ldr q27, [x17, #+944] +trn2 v1.2D, v28.2D, v8.2D +trn2 v14.2D, v15.2D, v11.2D +mla v13.4S, v17.4S, v31.s[0] +sub v17.4s, v26.4s, v2.4s +add v26.4s, v26.4s, v2.4s +sqrdmulh v2.4S, v4.4S, v29.4S +mul v4.4S, v4.4S,v19.4S +trn1 v7.2D, v28.2D, v8.2D +trn1 v18.2D, v15.2D, v11.2D +mla v16.4S, v12.4S, v31.s[0] +sub v12.4s, v10.4s, v13.4s +add v10.4s, v10.4s, v13.4s +sqrdmulh v13.4S, v30.4S, v27.4S +ldr q11, [x17, #+1056] +ldr q15, [x17, #+1072] +mul v30.4S, v30.4S,v20.4S +str q26, [x0, #256] +str q17, [x0, #272] +mla v4.4S, v2.4S, v31.s[0] +sub v2.4s, v21.4s, v16.4s +add v21.4s, v21.4s, v16.4s +sqrdmulh v16.4S, v22.4S, v27.4S +mul v22.4S, v22.4S,v20.4S +str q10, [x0, #288] +mla v30.4S, v13.4S, v31.s[0] +sub v13.4s, v9.4s, v4.4s +add v9.4s, v9.4s, v4.4s +ldr q4, [x17, #+960] +ldr q10, [x17, #+976] +sqrdmulh v27.4S, v1.4S, v15.4S +str q21, [x0, #320] +mul v1.4S, v1.4S,v11.4S +str q12, [x0, #304] +mla v22.4S, v16.4S, v31.s[0] +str q2, [x0, #336] +sub v2.4s, v6.4s, v30.4s +add v6.4s, v6.4s, v30.4s +ldr q30, [x17, #+992] +ldr q16, [x17, #+1008] +sqrdmulh v12.4S, v14.4S, v15.4S +mul v14.4S, v14.4S,v11.4S +str q9, [x0, #352] +mla v1.4S, v27.4S, v31.s[0] +sub v27.4s, v3.4s, v22.4s +add v3.4s, v3.4s, v22.4s +ldr q22, [x17, #+1088] +ldr q9, [x17, #+1104] +sqrdmulh v15.4S, v3.4S, v10.4S +mul 
v3.4S, v3.4S,v4.4S +ldr q10, [x17, #+1120] +ldr q4, [x17, #+1136] +str q13, [x0, #368] +mla v14.4S, v12.4S, v31.s[0] +sub v12.4s, v7.4s, v1.4s +add v7.4s, v7.4s, v1.4s +sqrdmulh v1.4S, v27.4S, v16.4S +mul v27.4S, v27.4S,v30.4S +mla v3.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v14.4s +add v18.4s, v18.4s, v14.4s +sqrdmulh v14.4S, v18.4S, v9.4S +mul v18.4S, v18.4S,v22.4S +ldr q9, [x0, #544] +mla v27.4S, v1.4S, v31.s[0] +ldr q1, [x0, #560] +sub v22.4s, v6.4s, v3.4s +add v6.4s, v6.4s, v3.4s +sqrdmulh v3.4S, v15.4S, v4.4S +ldr q16, [x17, #+1152] +mul v15.4S, v15.4S,v10.4S +ldr q4, [x17, #+1168] +mla v18.4S, v14.4S, v31.s[0] +ldr q14, [x17, #+1280] +sub v10.4s, v2.4s, v27.4s +ldr q30, [x0, #608] +add v2.4s, v2.4s, v27.4s +ldr q27, [x17, #+1296] +ldr q13, [x0, #624] +sqrdmulh v11.4S, v9.4S, v4.s[0] +mul v9.4S, v9.4S,v16.s[0] +mla v15.4S, v3.4S, v31.s[0] +sub v3.4s, v7.4s, v18.4s +add v7.4s, v7.4s, v18.4s +sqrdmulh v18.4S, v1.4S, v4.s[0] +mul v1.4S, v1.4S,v16.s[0] +str q6, [x0, #384] +str q22, [x0, #400] +str q2, [x0, #416] +str q10, [x0, #432] +mla v9.4S, v11.4S, v31.s[0] +sub v11.4s, v12.4s, v15.4s +add v12.4s, v12.4s, v15.4s +ldr q15, [x0, #512] +sqrdmulh v10.4S, v30.4S, v27.s[0] +ldr q2, [x0, #528] +mul v30.4S, v30.4S,v14.s[0] +str q7, [x0, #448] +str q3, [x0, #464] +str q12, [x0, #480] +str q11, [x0, #496] +mla v1.4S, v18.4S, v31.s[0] +sub v18.4s, v15.4s, v9.4s +add v15.4s, v15.4s, v9.4s +sqrdmulh v9.4S, v13.4S, v27.s[0] +ldr q11, [x0, #672] +mul v13.4S, v13.4S,v14.s[0] +ldr q12, [x0, #688] +mla v30.4S, v10.4S, v31.s[0] +ldr q10, [x0, #576] +sub v3.4s, v2.4s, v1.4s +add v2.4s, v2.4s, v1.4s +sqrdmulh v1.4S, v2.4S, v4.s[1] +ldr q7, [x0, #640] +mul v2.4S, v2.4S,v16.s[1] +ldr q22, [x0, #656] +mla v13.4S, v9.4S, v31.s[0] +ldr q9, [x0, #592] +sub v6.4s, v10.4s, v30.4s +add v10.4s, v10.4s, v30.4s +sqrdmulh v30.4S, v3.4S, v4.s[2] +ldr q21, [x17, #+1408] +mul v3.4S, v3.4S,v16.s[2] +ldr q20, [x17, #+1424] +mla v2.4S, v1.4S, v31.s[0] +sub v1.4s, v9.4s, v13.4s +add v9.4s, v9.4s, 
v13.4s +sqrdmulh v13.4S, v9.4S, v27.s[1] +ldr q17, [x0, #736] +mul v9.4S, v9.4S,v14.s[1] +ldr q26, [x0, #752] +mla v3.4S, v30.4S, v31.s[0] +sub v30.4s, v15.4s, v2.4s +add v15.4s, v15.4s, v2.4s +sqrdmulh v4.4S, v1.4S, v27.s[2] +ldr q16, [x0, #704] +mul v1.4S, v1.4S,v14.s[2] +ldr q2, [x0, #720] +mla v9.4S, v13.4S, v31.s[0] +sub v13.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v20.s[0] +ldr q8, [x17, #+1536] +mul v11.4S, v11.4S,v21.s[0] +ldr q28, [x17, #+1552] +trn1 v29.4S, v15.4S, v30.4S +trn2 v19.4S, v15.4S, v30.4S +mla v1.4S, v4.4S, v31.s[0] +sub v4.4s, v10.4s, v9.4s +add v10.4s, v10.4s, v9.4s +sqrdmulh v27.4S, v12.4S, v20.s[0] +mul v12.4S, v12.4S,v21.s[0] +trn1 v14.4S, v18.4S, v13.4S +trn2 v9.4S, v18.4S, v13.4S +mla v11.4S, v3.4S, v31.s[0] +sub v3.4s, v6.4s, v1.4s +add v6.4s, v6.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v28.s[0] +mul v17.4S, v17.4S,v8.s[0] +trn2 v18.2D, v29.2D, v14.2D +trn2 v13.2D, v19.2D, v9.2D +mla v12.4S, v27.4S, v31.s[0] +sub v27.4s, v7.4s, v11.4s +add v7.4s, v7.4s, v11.4s +sqrdmulh v11.4S, v26.4S, v28.s[0] +mul v26.4S, v26.4S,v8.s[0] +trn1 v15.2D, v29.2D, v14.2D +trn1 v30.2D, v19.2D, v9.2D +mla v17.4S, v1.4S, v31.s[0] +sub v1.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v22.4S, v20.s[1] +mul v22.4S, v22.4S,v21.s[1] +trn1 v9.4S, v10.4S, v4.4S +trn2 v19.4S, v10.4S, v4.4S +mla v26.4S, v11.4S, v31.s[0] +sub v11.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v1.4S, v20.s[2] +mul v1.4S, v1.4S,v21.s[2] +trn1 v14.4S, v6.4S, v3.4S +trn2 v29.4S, v6.4S, v3.4S +ldr q0, [x17, #+1184] +ldr q5, [x17, #+1200] +mla v22.4S, v12.4S, v31.s[0] +sub v12.4s, v2.4s, v26.4s +add v2.4s, v2.4s, v26.4s +sqrdmulh v26.4S, v2.4S, v28.s[1] +mul v2.4S, v2.4S,v8.s[1] +trn2 v6.2D, v9.2D, v14.2D +trn2 v3.2D, v19.2D, v29.2D +mla v1.4S, v17.4S, v31.s[0] +sub v17.4s, v7.4s, v22.4s +add v7.4s, v7.4s, v22.4s +sqrdmulh v20.4S, v12.4S, v28.s[2] +mul v12.4S, v12.4S,v8.s[2] +trn1 v10.2D, v9.2D, v14.2D +trn1 v4.2D, v19.2D, v29.2D 
+mla v2.4S, v26.4S, v31.s[0] +sub v26.4s, v27.4s, v1.4s +add v27.4s, v27.4s, v1.4s +sqrdmulh v1.4S, v18.4S, v5.4S +mul v18.4S, v18.4S,v0.4S +mla v12.4S, v20.4S, v31.s[0] +sub v20.4s, v16.4s, v2.4s +add v16.4s, v16.4s, v2.4s +ldr q28, [x17, #+1312] +ldr q8, [x17, #+1328] +sqrdmulh v2.4S, v13.4S, v5.4S +mul v13.4S, v13.4S,v0.4S +trn1 v29.4S, v7.4S, v17.4S +trn2 v19.4S, v7.4S, v17.4S +mla v18.4S, v1.4S, v31.s[0] +sub v1.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +ldr q5, [x17, #+1216] +ldr q0, [x17, #+1232] +trn1 v12.4S, v27.4S, v26.4S +trn2 v14.4S, v27.4S, v26.4S +sqrdmulh v9.4S, v6.4S, v8.4S +mul v6.4S, v6.4S,v28.4S +trn2 v27.2D, v29.2D, v12.2D +trn2 v26.2D, v19.2D, v14.2D +ldr q21, [x17, #+1248] +ldr q22, [x17, #+1264] +mla v13.4S, v2.4S, v31.s[0] +sub v2.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sqrdmulh v18.4S, v3.4S, v8.4S +mul v3.4S, v3.4S,v28.4S +trn1 v7.2D, v29.2D, v12.2D +trn1 v17.2D, v19.2D, v14.2D +ldr q14, [x17, #+1344] +ldr q19, [x17, #+1360] +mla v6.4S, v9.4S, v31.s[0] +sub v9.4s, v30.4s, v13.4s +add v30.4s, v30.4s, v13.4s +sqrdmulh v13.4S, v30.4S, v0.4S +mul v30.4S, v30.4S,v5.4S +trn1 v0.4S, v16.4S, v20.4S +trn2 v5.4S, v16.4S, v20.4S +ldr q12, [x17, #+1376] +ldr q29, [x17, #+1392] +mla v3.4S, v18.4S, v31.s[0] +sub v18.4s, v10.4s, v6.4s +add v10.4s, v10.4s, v6.4s +sqrdmulh v6.4S, v9.4S, v22.4S +mul v9.4S, v9.4S,v21.4S +trn1 v22.4S, v11.4S, v1.4S +trn2 v21.4S, v11.4S, v1.4S +mla v30.4S, v13.4S, v31.s[0] +sub v13.4s, v4.4s, v3.4s +add v4.4s, v4.4s, v3.4s +sqrdmulh v3.4S, v4.4S, v19.4S +mul v4.4S, v4.4S,v14.4S +ldr q8, [x17, #+1440] +ldr q28, [x17, #+1456] +trn2 v11.2D, v0.2D, v22.2D +trn2 v1.2D, v5.2D, v21.2D +mla v9.4S, v6.4S, v31.s[0] +sub v6.4s, v15.4s, v30.4s +add v15.4s, v15.4s, v30.4s +sqrdmulh v30.4S, v13.4S, v29.4S +mul v13.4S, v13.4S,v12.4S +trn1 v16.2D, v0.2D, v22.2D +trn1 v20.2D, v5.2D, v21.2D +mla v4.4S, v3.4S, v31.s[0] +sub v3.4s, v2.4s, v9.4s +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v27.4S, v28.4S +ldr q21, [x17, #+1568] +ldr q5, 
[x17, #+1584] +mul v27.4S, v27.4S,v8.4S +str q15, [x0, #512] +str q6, [x0, #528] +mla v13.4S, v30.4S, v31.s[0] +sub v30.4s, v10.4s, v4.4s +add v10.4s, v10.4s, v4.4s +sqrdmulh v4.4S, v26.4S, v28.4S +mul v26.4S, v26.4S,v8.4S +str q2, [x0, #544] +mla v27.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v13.4s +add v18.4s, v18.4s, v13.4s +ldr q13, [x17, #+1472] +ldr q2, [x17, #+1488] +sqrdmulh v28.4S, v11.4S, v5.4S +str q10, [x0, #576] +mul v11.4S, v11.4S,v21.4S +str q3, [x0, #560] +mla v26.4S, v4.4S, v31.s[0] +str q30, [x0, #592] +sub v30.4s, v7.4s, v27.4s +add v7.4s, v7.4s, v27.4s +ldr q27, [x17, #+1504] +ldr q4, [x17, #+1520] +sqrdmulh v3.4S, v1.4S, v5.4S +mul v1.4S, v1.4S,v21.4S +str q18, [x0, #608] +mla v11.4S, v28.4S, v31.s[0] +sub v28.4s, v17.4s, v26.4s +add v17.4s, v17.4s, v26.4s +ldr q26, [x17, #+1600] +ldr q18, [x17, #+1616] +sqrdmulh v5.4S, v17.4S, v2.4S +mul v17.4S, v17.4S,v13.4S +ldr q2, [x17, #+1632] +ldr q13, [x17, #+1648] +str q9, [x0, #624] +mla v1.4S, v3.4S, v31.s[0] +sub v3.4s, v16.4s, v11.4s +add v16.4s, v16.4s, v11.4s +sqrdmulh v11.4S, v28.4S, v4.4S +mul v28.4S, v28.4S,v27.4S +mla v17.4S, v5.4S, v31.s[0] +sub v5.4s, v20.4s, v1.4s +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v20.4S, v18.4S +mul v20.4S, v20.4S,v26.4S +ldr q18, [x0, #800] +mla v28.4S, v11.4S, v31.s[0] +ldr q11, [x0, #816] +sub v26.4s, v7.4s, v17.4s +add v7.4s, v7.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v13.4S +ldr q4, [x17, #+1664] +mul v5.4S, v5.4S,v2.4S +ldr q13, [x17, #+1680] +mla v20.4S, v1.4S, v31.s[0] +ldr q1, [x17, #+1792] +sub v2.4s, v30.4s, v28.4s +ldr q27, [x0, #864] +add v30.4s, v30.4s, v28.4s +ldr q28, [x17, #+1808] +ldr q9, [x0, #880] +sqrdmulh v21.4S, v18.4S, v13.s[0] +mul v18.4S, v18.4S,v4.s[0] +mla v5.4S, v17.4S, v31.s[0] +sub v17.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v11.4S, v13.s[0] +mul v11.4S, v11.4S,v4.s[0] +str q7, [x0, #640] +str q26, [x0, #656] +str q30, [x0, #672] +str q2, [x0, #688] +mla v18.4S, v21.4S, v31.s[0] +sub v21.4s, v3.4s, v5.4s +add 
v3.4s, v3.4s, v5.4s +ldr q5, [x0, #768] +sqrdmulh v2.4S, v27.4S, v28.s[0] +ldr q30, [x0, #784] +mul v27.4S, v27.4S,v1.s[0] +str q16, [x0, #704] +str q17, [x0, #720] +str q3, [x0, #736] +str q21, [x0, #752] +mla v11.4S, v20.4S, v31.s[0] +sub v20.4s, v5.4s, v18.4s +add v5.4s, v5.4s, v18.4s +sqrdmulh v18.4S, v9.4S, v28.s[0] +ldr q21, [x0, #928] +mul v9.4S, v9.4S,v1.s[0] +ldr q3, [x0, #944] +mla v27.4S, v2.4S, v31.s[0] +ldr q2, [x0, #832] +sub v17.4s, v30.4s, v11.4s +add v30.4s, v30.4s, v11.4s +sqrdmulh v11.4S, v30.4S, v13.s[1] +ldr q16, [x0, #896] +mul v30.4S, v30.4S,v4.s[1] +ldr q26, [x0, #912] +mla v9.4S, v18.4S, v31.s[0] +ldr q18, [x0, #848] +sub v7.4s, v2.4s, v27.4s +add v2.4s, v2.4s, v27.4s +sqrdmulh v27.4S, v17.4S, v13.s[2] +ldr q10, [x17, #+1920] +mul v17.4S, v17.4S,v4.s[2] +ldr q8, [x17, #+1936] +mla v30.4S, v11.4S, v31.s[0] +sub v11.4s, v18.4s, v9.4s +add v18.4s, v18.4s, v9.4s +sqrdmulh v9.4S, v18.4S, v28.s[1] +ldr q6, [x0, #992] +mul v18.4S, v18.4S,v1.s[1] +ldr q15, [x0, #1008] +mla v17.4S, v27.4S, v31.s[0] +sub v27.4s, v5.4s, v30.4s +add v5.4s, v5.4s, v30.4s +sqrdmulh v13.4S, v11.4S, v28.s[2] +ldr q4, [x0, #960] +mul v11.4S, v11.4S,v1.s[2] +ldr q30, [x0, #976] +mla v18.4S, v9.4S, v31.s[0] +sub v9.4s, v20.4s, v17.4s +add v20.4s, v20.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v8.s[0] +ldr q22, [x17, #+2048] +mul v21.4S, v21.4S,v10.s[0] +ldr q0, [x17, #+2064] +trn1 v29.4S, v5.4S, v27.4S +trn2 v12.4S, v5.4S, v27.4S +mla v11.4S, v13.4S, v31.s[0] +sub v13.4s, v2.4s, v18.4s +add v2.4s, v2.4s, v18.4s +sqrdmulh v28.4S, v3.4S, v8.s[0] +mul v3.4S, v3.4S,v10.s[0] +trn1 v1.4S, v20.4S, v9.4S +trn2 v18.4S, v20.4S, v9.4S +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v7.4s, v11.4s +add v7.4s, v7.4s, v11.4s +sqrdmulh v11.4S, v6.4S, v0.s[0] +mul v6.4S, v6.4S,v22.s[0] +trn2 v20.2D, v29.2D, v1.2D +trn2 v9.2D, v12.2D, v18.2D +mla v3.4S, v28.4S, v31.s[0] +sub v28.4s, v16.4s, v21.4s +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v0.s[0] +mul v15.4S, v15.4S,v22.s[0] +trn1 v5.2D, 
v29.2D, v1.2D +trn1 v27.2D, v12.2D, v18.2D +mla v6.4S, v11.4S, v31.s[0] +sub v11.4s, v26.4s, v3.4s +add v26.4s, v26.4s, v3.4s +sqrdmulh v3.4S, v26.4S, v8.s[1] +mul v26.4S, v26.4S,v10.s[1] +trn1 v18.4S, v2.4S, v13.4S +trn2 v12.4S, v2.4S, v13.4S +mla v15.4S, v21.4S, v31.s[0] +sub v21.4s, v4.4s, v6.4s +add v4.4s, v4.4s, v6.4s +sqrdmulh v6.4S, v11.4S, v8.s[2] +mul v11.4S, v11.4S,v10.s[2] +trn1 v1.4S, v7.4S, v17.4S +trn2 v29.4S, v7.4S, v17.4S +ldr q19, [x17, #+1696] +ldr q14, [x17, #+1712] +mla v26.4S, v3.4S, v31.s[0] +sub v3.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +sqrdmulh v15.4S, v30.4S, v0.s[1] +mul v30.4S, v30.4S,v22.s[1] +trn2 v7.2D, v18.2D, v1.2D +trn2 v17.2D, v12.2D, v29.2D +mla v11.4S, v6.4S, v31.s[0] +sub v6.4s, v16.4s, v26.4s +add v16.4s, v16.4s, v26.4s +sqrdmulh v8.4S, v3.4S, v0.s[2] +mul v3.4S, v3.4S,v22.s[2] +trn1 v2.2D, v18.2D, v1.2D +trn1 v13.2D, v12.2D, v29.2D +mla v30.4S, v15.4S, v31.s[0] +sub v15.4s, v28.4s, v11.4s +add v28.4s, v28.4s, v11.4s +sqrdmulh v11.4S, v20.4S, v14.4S +mul v20.4S, v20.4S,v19.4S +mla v3.4S, v8.4S, v31.s[0] +sub v8.4s, v4.4s, v30.4s +add v4.4s, v4.4s, v30.4s +ldr q0, [x17, #+1824] +ldr q22, [x17, #+1840] +sqrdmulh v30.4S, v9.4S, v14.4S +mul v9.4S, v9.4S,v19.4S +trn1 v29.4S, v16.4S, v6.4S +trn2 v12.4S, v16.4S, v6.4S +mla v20.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v3.4s +add v21.4s, v21.4s, v3.4s +ldr q14, [x17, #+1728] +ldr q19, [x17, #+1744] +trn1 v3.4S, v28.4S, v15.4S +trn2 v1.4S, v28.4S, v15.4S +sqrdmulh v18.4S, v7.4S, v22.4S +mul v7.4S, v7.4S,v0.4S +trn2 v28.2D, v29.2D, v3.2D +trn2 v15.2D, v12.2D, v1.2D +ldr q10, [x17, #+1760] +ldr q26, [x17, #+1776] +mla v9.4S, v30.4S, v31.s[0] +sub v30.4s, v5.4s, v20.4s +add v5.4s, v5.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v22.4S +mul v17.4S, v17.4S,v0.4S +trn1 v16.2D, v29.2D, v3.2D +trn1 v6.2D, v12.2D, v1.2D +ldr q1, [x17, #+1856] +ldr q12, [x17, #+1872] +mla v7.4S, v18.4S, v31.s[0] +sub v18.4s, v27.4s, v9.4s +add v27.4s, v27.4s, v9.4s +sqrdmulh v9.4S, v27.4S, v19.4S +mul v27.4S, 
v27.4S,v14.4S +trn1 v19.4S, v4.4S, v8.4S +trn2 v14.4S, v4.4S, v8.4S +ldr q3, [x17, #+1888] +ldr q29, [x17, #+1904] +mla v17.4S, v20.4S, v31.s[0] +sub v20.4s, v2.4s, v7.4s +add v2.4s, v2.4s, v7.4s +sqrdmulh v7.4S, v18.4S, v26.4S +mul v18.4S, v18.4S,v10.4S +trn1 v26.4S, v21.4S, v11.4S +trn2 v10.4S, v21.4S, v11.4S +mla v27.4S, v9.4S, v31.s[0] +sub v9.4s, v13.4s, v17.4s +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v13.4S, v12.4S +mul v13.4S, v13.4S,v1.4S +ldr q22, [x17, #+1952] +ldr q0, [x17, #+1968] +trn2 v21.2D, v19.2D, v26.2D +trn2 v11.2D, v14.2D, v10.2D +mla v18.4S, v7.4S, v31.s[0] +sub v7.4s, v5.4s, v27.4s +add v5.4s, v5.4s, v27.4s +sqrdmulh v27.4S, v9.4S, v29.4S +mul v9.4S, v9.4S,v3.4S +trn1 v4.2D, v19.2D, v26.2D +trn1 v8.2D, v14.2D, v10.2D +mla v13.4S, v17.4S, v31.s[0] +sub v17.4s, v30.4s, v18.4s +add v30.4s, v30.4s, v18.4s +sqrdmulh v18.4S, v28.4S, v0.4S +ldr q10, [x17, #+2080] +ldr q14, [x17, #+2096] +mul v28.4S, v28.4S,v22.4S +str q5, [x0, #768] +str q7, [x0, #784] +mla v9.4S, v27.4S, v31.s[0] +sub v27.4s, v2.4s, v13.4s +add v2.4s, v2.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v0.4S +mul v15.4S, v15.4S,v22.4S +str q30, [x0, #800] +mla v28.4S, v18.4S, v31.s[0] +sub v18.4s, v20.4s, v9.4s +add v20.4s, v20.4s, v9.4s +ldr q9, [x17, #+1984] +ldr q30, [x17, #+2000] +sqrdmulh v0.4S, v21.4S, v14.4S +str q2, [x0, #832] +mul v21.4S, v21.4S,v10.4S +str q17, [x0, #816] +mla v15.4S, v13.4S, v31.s[0] +str q27, [x0, #848] +sub v27.4s, v16.4s, v28.4s +add v16.4s, v16.4s, v28.4s +ldr q28, [x17, #+2016] +ldr q13, [x17, #+2032] +sqrdmulh v17.4S, v11.4S, v14.4S +mul v11.4S, v11.4S,v10.4S +str q20, [x0, #864] +mla v21.4S, v0.4S, v31.s[0] +sub v0.4s, v6.4s, v15.4s +add v6.4s, v6.4s, v15.4s +ldr q15, [x17, #+2112] +ldr q20, [x17, #+2128] +sqrdmulh v14.4S, v6.4S, v30.4S +mul v6.4S, v6.4S,v9.4S +ldr q30, [x17, #+2144] +ldr q9, [x17, #+2160] +str q18, [x0, #880] +mla v11.4S, v17.4S, v31.s[0] +sub v17.4s, v4.4s, v21.4s +add v4.4s, v4.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v13.4S +mul v0.4S, 
v0.4S,v28.4S +mla v6.4S, v14.4S, v31.s[0] +sub v14.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +sqrdmulh v11.4S, v8.4S, v20.4S +mul v8.4S, v8.4S,v15.4S +mla v0.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v6.4s +add v16.4s, v16.4s, v6.4s +sqrdmulh v6.4S, v14.4S, v9.4S +mul v14.4S, v14.4S,v30.4S +mla v8.4S, v11.4S, v31.s[0] +sub v11.4s, v27.4s, v0.4s +add v27.4s, v27.4s, v0.4s +mla v14.4S, v6.4S, v31.s[0] +sub v6.4s, v4.4s, v8.4s +add v4.4s, v4.4s, v8.4s +str q16, [x0, #896] +str q21, [x0, #912] +str q27, [x0, #928] +str q11, [x0, #944] +sub v11.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +str q4, [x0, #960] +str q6, [x0, #976] +str q17, [x0, #992] +str q11, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_4_0.s b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_4_0.s new file mode 100644 index 0000000..807dda1 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_4_0.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this 
permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include // NOTE(review): the header name is missing here (possibly lost during extraction); ASM_LOAD, used by the routine below, is not defined in the visible part of this file - confirm and restore before assembling +modulus: // Negated modulus vector: the routine below loads these 16 bytes into v31 and uses lane 0 as the mla multiplier +.word -33556993 // -q, with q = 33556993 +.word 0 // padding up to the 16-byte vector read by ldr q31 +.word 0 // padding +.word 0 // padding +.align 6 // presumably 2^6 = 64-byte alignment for the table (power-of-two form of .align) - confirm for the target assembler +roots_merged: // Twiddle table laid out in 4-word groups: a quad of roots followed by a quad of matching companion constants; the routine feeds roots to mul and companions to sqrdmulh. Companions look like round(root * 2^31 / q), e.g. 17702291 -> 1132860160 - TODO confirm exact rounding +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word
9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 
14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, 
block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 
// Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, 
block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 
+.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer 
None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 +.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 
+.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 
1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // 
Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_4_0 +.global _ntt_u32_full_neon_asm_var_4_4_4_0 +ntt_u32_full_neon_asm_var_4_4_4_0: +_ntt_u32_full_neon_asm_var_4_4_4_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x0, #800] +ldr q29, [x0, #864] +ldr q28, [x0, #928] +ldr q27, [x0, #992] +ldr q26, [x0, #288] +ldr q25, [x0, #352] +ldr q24, [x0, #416] +ldr q23, [x0, #480] +ldr q22, [x0, #544] +ldr q21, [x0, #608] +ldr q20, [x0, #672] +ldr q19, [x0, #736] +ldr q18, 
[x0, #32] +ldr q17, [x0, #96] +ldr q16, [x0, #160] +ldr q3, [x0, #224] +ldr q2, [x17, #+0] +ldr q1, [x17, #+16] +ldr q0, [x17, #+32] +ldr q15, [x17, #+48] +ldr q14, [x17, #+64] +ldr q13, [x17, #+80] +ldr q12, [x17, #+96] +ldr q11, [x17, #+112] +sqrdmulh v10.4S, v30.4S, v1.s[0] +sqrdmulh v9.4S, v29.4S, v1.s[0] +sqrdmulh v8.4S, v28.4S, v1.s[0] +sqrdmulh v7.4S, v27.4S, v1.s[0] +mul v30.4S, v30.4S,v2.s[0] +mul v29.4S, v29.4S,v2.s[0] +mul v28.4S, v28.4S,v2.s[0] +mul v27.4S, v27.4S,v2.s[0] +mla v30.4S, v10.4S, v31.s[0] +mla v29.4S, v9.4S, v31.s[0] +mla v28.4S, v8.4S, v31.s[0] +mla v27.4S, v7.4S, v31.s[0] +sub v7.4s, v26.4s, v30.4s +sub v8.4s, v25.4s, v29.4s +sub v9.4s, v24.4s, v28.4s +sub v10.4s, v23.4s, v27.4s +add v26.4s, v26.4s, v30.4s +add v25.4s, v25.4s, v29.4s +add v24.4s, v24.4s, v28.4s +add v23.4s, v23.4s, v27.4s +sqrdmulh v27.4S, v22.4S, v1.s[0] +sqrdmulh v28.4S, v21.4S, v1.s[0] +sqrdmulh v29.4S, v20.4S, v1.s[0] +sqrdmulh v30.4S, v19.4S, v1.s[0] +mul v22.4S, v22.4S,v2.s[0] +mul v21.4S, v21.4S,v2.s[0] +mul v20.4S, v20.4S,v2.s[0] +mul v19.4S, v19.4S,v2.s[0] +mla v22.4S, v27.4S, v31.s[0] +mla v21.4S, v28.4S, v31.s[0] +mla v20.4S, v29.4S, v31.s[0] +mla v19.4S, v30.4S, v31.s[0] +sub v30.4s, v18.4s, v22.4s +sub v29.4s, v17.4s, v21.4s +sub v28.4s, v16.4s, v20.4s +sub v27.4s, v3.4s, v19.4s +add v18.4s, v18.4s, v22.4s +add v17.4s, v17.4s, v21.4s +add v16.4s, v16.4s, v20.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v24.4S, v1.s[1] +sqrdmulh v20.4S, v23.4S, v1.s[1] +sqrdmulh v21.4S, v26.4S, v1.s[1] +sqrdmulh v22.4S, v25.4S, v1.s[1] +mul v24.4S, v24.4S,v2.s[1] +mul v23.4S, v23.4S,v2.s[1] +mul v26.4S, v26.4S,v2.s[1] +mul v25.4S, v25.4S,v2.s[1] +mla v24.4S, v19.4S, v31.s[0] +mla v23.4S, v20.4S, v31.s[0] +mla v26.4S, v21.4S, v31.s[0] +mla v25.4S, v22.4S, v31.s[0] +sub v22.4s, v16.4s, v24.4s +sub v21.4s, v3.4s, v23.4s +sub v20.4s, v18.4s, v26.4s +sub v19.4s, v17.4s, v25.4s +add v16.4s, v16.4s, v24.4s +add v3.4s, v3.4s, v23.4s +add v18.4s, v18.4s, v26.4s +add v17.4s, v17.4s, 
v25.4s +sqrdmulh v25.4S, v9.4S, v1.s[2] +sqrdmulh v26.4S, v10.4S, v1.s[2] +sqrdmulh v23.4S, v7.4S, v1.s[2] +sqrdmulh v24.4S, v8.4S, v1.s[2] +mul v9.4S, v9.4S,v2.s[2] +mul v10.4S, v10.4S,v2.s[2] +mul v7.4S, v7.4S,v2.s[2] +mul v8.4S, v8.4S,v2.s[2] +mla v9.4S, v25.4S, v31.s[0] +mla v10.4S, v26.4S, v31.s[0] +mla v7.4S, v23.4S, v31.s[0] +mla v8.4S, v24.4S, v31.s[0] +sub v24.4s, v28.4s, v9.4s +sub v23.4s, v27.4s, v10.4s +sub v26.4s, v30.4s, v7.4s +sub v25.4s, v29.4s, v8.4s +add v28.4s, v28.4s, v9.4s +add v27.4s, v27.4s, v10.4s +add v30.4s, v30.4s, v7.4s +add v29.4s, v29.4s, v8.4s +sqrdmulh v8.4S, v16.4S, v15.s[0] +sqrdmulh v7.4S, v3.4S, v15.s[0] +sqrdmulh v10.4S, v22.4S, v15.s[1] +sqrdmulh v9.4S, v21.4S, v15.s[1] +mul v16.4S, v16.4S,v0.s[0] +mul v3.4S, v3.4S,v0.s[0] +mul v22.4S, v22.4S,v0.s[1] +mul v21.4S, v21.4S,v0.s[1] +mla v16.4S, v8.4S, v31.s[0] +mla v3.4S, v7.4S, v31.s[0] +mla v22.4S, v10.4S, v31.s[0] +mla v21.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v16.4s +sub v10.4s, v17.4s, v3.4s +sub v7.4s, v20.4s, v22.4s +sub v8.4s, v19.4s, v21.4s +add v18.4s, v18.4s, v16.4s +add v17.4s, v17.4s, v3.4s +add v20.4s, v20.4s, v22.4s +add v19.4s, v19.4s, v21.4s +sqrdmulh v21.4S, v28.4S, v15.s[2] +sqrdmulh v22.4S, v27.4S, v15.s[2] +sqrdmulh v3.4S, v24.4S, v15.s[3] +sqrdmulh v16.4S, v23.4S, v15.s[3] +mul v28.4S, v28.4S,v0.s[2] +mul v27.4S, v27.4S,v0.s[2] +mul v24.4S, v24.4S,v0.s[3] +mul v23.4S, v23.4S,v0.s[3] +mla v28.4S, v21.4S, v31.s[0] +mla v27.4S, v22.4S, v31.s[0] +mla v24.4S, v3.4S, v31.s[0] +mla v23.4S, v16.4S, v31.s[0] +sub v16.4s, v30.4s, v28.4s +sub v3.4s, v29.4s, v27.4s +sub v22.4s, v26.4s, v24.4s +sub v21.4s, v25.4s, v23.4s +add v30.4s, v30.4s, v28.4s +add v29.4s, v29.4s, v27.4s +add v26.4s, v26.4s, v24.4s +add v25.4s, v25.4s, v23.4s +sqrdmulh v23.4S, v17.4S, v13.s[0] +sqrdmulh v24.4S, v10.4S, v13.s[1] +sqrdmulh v27.4S, v19.4S, v13.s[2] +sqrdmulh v28.4S, v8.4S, v13.s[3] +mul v17.4S, v17.4S,v14.s[0] +mul v10.4S, v10.4S,v14.s[1] +mul v19.4S, v19.4S,v14.s[2] +mul v8.4S, 
v8.4S,v14.s[3] +mla v17.4S, v23.4S, v31.s[0] +mla v10.4S, v24.4S, v31.s[0] +mla v19.4S, v27.4S, v31.s[0] +mla v8.4S, v28.4S, v31.s[0] +sub v28.4s, v18.4s, v17.4s +sub v27.4s, v9.4s, v10.4s +sub v24.4s, v20.4s, v19.4s +sub v23.4s, v7.4s, v8.4s +add v18.4s, v18.4s, v17.4s +add v9.4s, v9.4s, v10.4s +add v20.4s, v20.4s, v19.4s +add v7.4s, v7.4s, v8.4s +sqrdmulh v8.4S, v29.4S, v11.s[0] +sqrdmulh v19.4S, v3.4S, v11.s[1] +sqrdmulh v10.4S, v25.4S, v11.s[2] +sqrdmulh v17.4S, v21.4S, v11.s[3] +mul v29.4S, v29.4S,v12.s[0] +mul v3.4S, v3.4S,v12.s[1] +mul v25.4S, v25.4S,v12.s[2] +mul v21.4S, v21.4S,v12.s[3] +mla v29.4S, v8.4S, v31.s[0] +mla v3.4S, v19.4S, v31.s[0] +mla v25.4S, v10.4S, v31.s[0] +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v30.4s, v29.4s +sub v10.4s, v16.4s, v3.4s +sub v19.4s, v26.4s, v25.4s +sub v8.4s, v22.4s, v21.4s +add v30.4s, v30.4s, v29.4s +add v16.4s, v16.4s, v3.4s +add v26.4s, v26.4s, v25.4s +add v22.4s, v22.4s, v21.4s +str q18, [x0, #32] +str q28, [x0, #96] +str q9, [x0, #160] +str q27, [x0, #224] +str q20, [x0, #288] +str q24, [x0, #352] +str q7, [x0, #416] +str q23, [x0, #480] +str q30, [x0, #544] +str q17, [x0, #608] +str q16, [x0, #672] +str q10, [x0, #736] +str q26, [x0, #800] +str q19, [x0, #864] +str q22, [x0, #928] +str q8, [x0, #992] +ldr q8, [x0, #816] +ldr q22, [x0, #880] +ldr q19, [x0, #944] +ldr q26, [x0, #1008] +ldr q10, [x0, #304] +ldr q16, [x0, #368] +ldr q17, [x0, #432] +ldr q30, [x0, #496] +ldr q23, [x0, #560] +ldr q7, [x0, #624] +ldr q24, [x0, #688] +ldr q20, [x0, #752] +ldr q27, [x0, #48] +ldr q9, [x0, #112] +ldr q28, [x0, #176] +ldr q18, [x0, #240] +sqrdmulh v21.4S, v8.4S, v1.s[0] +sqrdmulh v25.4S, v22.4S, v1.s[0] +sqrdmulh v3.4S, v19.4S, v1.s[0] +sqrdmulh v29.4S, v26.4S, v1.s[0] +mul v8.4S, v8.4S,v2.s[0] +mul v22.4S, v22.4S,v2.s[0] +mul v19.4S, v19.4S,v2.s[0] +mul v26.4S, v26.4S,v2.s[0] +mla v8.4S, v21.4S, v31.s[0] +mla v22.4S, v25.4S, v31.s[0] +mla v19.4S, v3.4S, v31.s[0] +mla v26.4S, v29.4S, v31.s[0] +sub v29.4s, v10.4s, v8.4s +sub 
v3.4s, v16.4s, v22.4s +sub v25.4s, v17.4s, v19.4s +sub v21.4s, v30.4s, v26.4s +add v10.4s, v10.4s, v8.4s +add v16.4s, v16.4s, v22.4s +add v17.4s, v17.4s, v19.4s +add v30.4s, v30.4s, v26.4s +sqrdmulh v26.4S, v23.4S, v1.s[0] +sqrdmulh v19.4S, v7.4S, v1.s[0] +sqrdmulh v22.4S, v24.4S, v1.s[0] +sqrdmulh v8.4S, v20.4S, v1.s[0] +mul v23.4S, v23.4S,v2.s[0] +mul v7.4S, v7.4S,v2.s[0] +mul v24.4S, v24.4S,v2.s[0] +mul v20.4S, v20.4S,v2.s[0] +mla v23.4S, v26.4S, v31.s[0] +mla v7.4S, v19.4S, v31.s[0] +mla v24.4S, v22.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v27.4s, v23.4s +sub v22.4s, v9.4s, v7.4s +sub v19.4s, v28.4s, v24.4s +sub v26.4s, v18.4s, v20.4s +add v27.4s, v27.4s, v23.4s +add v9.4s, v9.4s, v7.4s +add v28.4s, v28.4s, v24.4s +add v18.4s, v18.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v1.s[1] +sqrdmulh v24.4S, v30.4S, v1.s[1] +sqrdmulh v7.4S, v10.4S, v1.s[1] +sqrdmulh v23.4S, v16.4S, v1.s[1] +mul v17.4S, v17.4S,v2.s[1] +mul v30.4S, v30.4S,v2.s[1] +mul v10.4S, v10.4S,v2.s[1] +mul v16.4S, v16.4S,v2.s[1] +mla v17.4S, v20.4S, v31.s[0] +mla v30.4S, v24.4S, v31.s[0] +mla v10.4S, v7.4S, v31.s[0] +mla v16.4S, v23.4S, v31.s[0] +sub v23.4s, v28.4s, v17.4s +sub v7.4s, v18.4s, v30.4s +sub v24.4s, v27.4s, v10.4s +sub v20.4s, v9.4s, v16.4s +add v28.4s, v28.4s, v17.4s +add v18.4s, v18.4s, v30.4s +add v27.4s, v27.4s, v10.4s +add v9.4s, v9.4s, v16.4s +sqrdmulh v16.4S, v25.4S, v1.s[2] +sqrdmulh v10.4S, v21.4S, v1.s[2] +sqrdmulh v30.4S, v29.4S, v1.s[2] +sqrdmulh v17.4S, v3.4S, v1.s[2] +mul v25.4S, v25.4S,v2.s[2] +mul v21.4S, v21.4S,v2.s[2] +mul v29.4S, v29.4S,v2.s[2] +mul v3.4S, v3.4S,v2.s[2] +mla v25.4S, v16.4S, v31.s[0] +mla v21.4S, v10.4S, v31.s[0] +mla v29.4S, v30.4S, v31.s[0] +mla v3.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v25.4s +sub v30.4s, v26.4s, v21.4s +sub v10.4s, v8.4s, v29.4s +sub v16.4s, v22.4s, v3.4s +add v19.4s, v19.4s, v25.4s +add v26.4s, v26.4s, v21.4s +add v8.4s, v8.4s, v29.4s +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v28.4S, v15.s[0] +sqrdmulh v29.4S, v18.4S, 
v15.s[0] +sqrdmulh v21.4S, v23.4S, v15.s[1] +sqrdmulh v25.4S, v7.4S, v15.s[1] +mul v28.4S, v28.4S,v0.s[0] +mul v18.4S, v18.4S,v0.s[0] +mul v23.4S, v23.4S,v0.s[1] +mul v7.4S, v7.4S,v0.s[1] +mla v28.4S, v3.4S, v31.s[0] +mla v18.4S, v29.4S, v31.s[0] +mla v23.4S, v21.4S, v31.s[0] +mla v7.4S, v25.4S, v31.s[0] +sub v25.4s, v27.4s, v28.4s +sub v21.4s, v9.4s, v18.4s +sub v29.4s, v24.4s, v23.4s +sub v3.4s, v20.4s, v7.4s +add v27.4s, v27.4s, v28.4s +add v9.4s, v9.4s, v18.4s +add v24.4s, v24.4s, v23.4s +add v20.4s, v20.4s, v7.4s +sqrdmulh v7.4S, v19.4S, v15.s[2] +sqrdmulh v23.4S, v26.4S, v15.s[2] +sqrdmulh v18.4S, v17.4S, v15.s[3] +sqrdmulh v28.4S, v30.4S, v15.s[3] +mul v19.4S, v19.4S,v0.s[2] +mul v26.4S, v26.4S,v0.s[2] +mul v17.4S, v17.4S,v0.s[3] +mul v30.4S, v30.4S,v0.s[3] +mla v19.4S, v7.4S, v31.s[0] +mla v26.4S, v23.4S, v31.s[0] +mla v17.4S, v18.4S, v31.s[0] +mla v30.4S, v28.4S, v31.s[0] +sub v28.4s, v8.4s, v19.4s +sub v18.4s, v22.4s, v26.4s +sub v23.4s, v10.4s, v17.4s +sub v7.4s, v16.4s, v30.4s +add v8.4s, v8.4s, v19.4s +add v22.4s, v22.4s, v26.4s +add v10.4s, v10.4s, v17.4s +add v16.4s, v16.4s, v30.4s +sqrdmulh v30.4S, v9.4S, v13.s[0] +sqrdmulh v17.4S, v21.4S, v13.s[1] +sqrdmulh v26.4S, v20.4S, v13.s[2] +sqrdmulh v19.4S, v3.4S, v13.s[3] +mul v9.4S, v9.4S,v14.s[0] +mul v21.4S, v21.4S,v14.s[1] +mul v20.4S, v20.4S,v14.s[2] +mul v3.4S, v3.4S,v14.s[3] +mla v9.4S, v30.4S, v31.s[0] +mla v21.4S, v17.4S, v31.s[0] +mla v20.4S, v26.4S, v31.s[0] +mla v3.4S, v19.4S, v31.s[0] +sub v19.4s, v27.4s, v9.4s +sub v26.4s, v25.4s, v21.4s +sub v17.4s, v24.4s, v20.4s +sub v30.4s, v29.4s, v3.4s +add v27.4s, v27.4s, v9.4s +add v25.4s, v25.4s, v21.4s +add v24.4s, v24.4s, v20.4s +add v29.4s, v29.4s, v3.4s +sqrdmulh v3.4S, v22.4S, v11.s[0] +sqrdmulh v20.4S, v18.4S, v11.s[1] +sqrdmulh v21.4S, v16.4S, v11.s[2] +sqrdmulh v9.4S, v7.4S, v11.s[3] +mul v22.4S, v22.4S,v12.s[0] +mul v18.4S, v18.4S,v12.s[1] +mul v16.4S, v16.4S,v12.s[2] +mul v7.4S, v7.4S,v12.s[3] +mla v22.4S, v3.4S, v31.s[0] +mla v18.4S, 
v20.4S, v31.s[0] +mla v16.4S, v21.4S, v31.s[0] +mla v7.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v22.4s +sub v21.4s, v28.4s, v18.4s +sub v20.4s, v10.4s, v16.4s +sub v3.4s, v23.4s, v7.4s +add v8.4s, v8.4s, v22.4s +add v28.4s, v28.4s, v18.4s +add v10.4s, v10.4s, v16.4s +add v23.4s, v23.4s, v7.4s +str q27, [x0, #48] +str q19, [x0, #112] +str q25, [x0, #176] +str q26, [x0, #240] +str q24, [x0, #304] +str q17, [x0, #368] +str q29, [x0, #432] +str q30, [x0, #496] +str q8, [x0, #560] +str q9, [x0, #624] +str q28, [x0, #688] +str q21, [x0, #752] +str q10, [x0, #816] +str q20, [x0, #880] +str q23, [x0, #944] +str q3, [x0, #1008] +ldr q3, [x0, #768] +ldr q23, [x0, #832] +ldr q20, [x0, #896] +ldr q10, [x0, #960] +ldr q21, [x0, #256] +ldr q28, [x0, #320] +ldr q9, [x0, #384] +ldr q8, [x0, #448] +ldr q30, [x0, #512] +ldr q29, [x0, #576] +ldr q17, [x0, #640] +ldr q24, [x0, #704] +ldr q26, [x0, #0] +ldr q25, [x0, #64] +ldr q19, [x0, #128] +ldr q27, [x0, #192] +sqrdmulh v7.4S, v3.4S, v1.s[0] +sqrdmulh v16.4S, v23.4S, v1.s[0] +sqrdmulh v18.4S, v20.4S, v1.s[0] +sqrdmulh v22.4S, v10.4S, v1.s[0] +mul v3.4S, v3.4S,v2.s[0] +mul v23.4S, v23.4S,v2.s[0] +mul v20.4S, v20.4S,v2.s[0] +mul v10.4S, v10.4S,v2.s[0] +mla v3.4S, v7.4S, v31.s[0] +mla v23.4S, v16.4S, v31.s[0] +mla v20.4S, v18.4S, v31.s[0] +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v3.4s +sub v18.4s, v28.4s, v23.4s +sub v16.4s, v9.4s, v20.4s +sub v7.4s, v8.4s, v10.4s +add v21.4s, v21.4s, v3.4s +add v28.4s, v28.4s, v23.4s +add v9.4s, v9.4s, v20.4s +add v8.4s, v8.4s, v10.4s +sqrdmulh v10.4S, v30.4S, v1.s[0] +sqrdmulh v20.4S, v29.4S, v1.s[0] +sqrdmulh v23.4S, v17.4S, v1.s[0] +sqrdmulh v3.4S, v24.4S, v1.s[0] +mul v30.4S, v30.4S,v2.s[0] +mul v29.4S, v29.4S,v2.s[0] +mul v17.4S, v17.4S,v2.s[0] +mul v24.4S, v24.4S,v2.s[0] +mla v30.4S, v10.4S, v31.s[0] +mla v29.4S, v20.4S, v31.s[0] +mla v17.4S, v23.4S, v31.s[0] +mla v24.4S, v3.4S, v31.s[0] +sub v3.4s, v26.4s, v30.4s +sub v23.4s, v25.4s, v29.4s +sub v20.4s, v19.4s, v17.4s +sub v10.4s, 
v27.4s, v24.4s +add v26.4s, v26.4s, v30.4s +add v25.4s, v25.4s, v29.4s +add v19.4s, v19.4s, v17.4s +add v27.4s, v27.4s, v24.4s +sqrdmulh v24.4S, v9.4S, v1.s[1] +sqrdmulh v17.4S, v8.4S, v1.s[1] +sqrdmulh v29.4S, v21.4S, v1.s[1] +sqrdmulh v30.4S, v28.4S, v1.s[1] +mul v9.4S, v9.4S,v2.s[1] +mul v8.4S, v8.4S,v2.s[1] +mul v21.4S, v21.4S,v2.s[1] +mul v28.4S, v28.4S,v2.s[1] +mla v9.4S, v24.4S, v31.s[0] +mla v8.4S, v17.4S, v31.s[0] +mla v21.4S, v29.4S, v31.s[0] +mla v28.4S, v30.4S, v31.s[0] +sub v30.4s, v19.4s, v9.4s +sub v29.4s, v27.4s, v8.4s +sub v17.4s, v26.4s, v21.4s +sub v24.4s, v25.4s, v28.4s +add v19.4s, v19.4s, v9.4s +add v27.4s, v27.4s, v8.4s +add v26.4s, v26.4s, v21.4s +add v25.4s, v25.4s, v28.4s +sqrdmulh v28.4S, v16.4S, v1.s[2] +sqrdmulh v21.4S, v7.4S, v1.s[2] +sqrdmulh v8.4S, v22.4S, v1.s[2] +sqrdmulh v9.4S, v18.4S, v1.s[2] +mul v16.4S, v16.4S,v2.s[2] +mul v7.4S, v7.4S,v2.s[2] +mul v22.4S, v22.4S,v2.s[2] +mul v18.4S, v18.4S,v2.s[2] +mla v16.4S, v28.4S, v31.s[0] +mla v7.4S, v21.4S, v31.s[0] +mla v22.4S, v8.4S, v31.s[0] +mla v18.4S, v9.4S, v31.s[0] +sub v9.4s, v20.4s, v16.4s +sub v8.4s, v10.4s, v7.4s +sub v21.4s, v3.4s, v22.4s +sub v28.4s, v23.4s, v18.4s +add v20.4s, v20.4s, v16.4s +add v10.4s, v10.4s, v7.4s +add v3.4s, v3.4s, v22.4s +add v23.4s, v23.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v15.s[0] +sqrdmulh v22.4S, v27.4S, v15.s[0] +sqrdmulh v7.4S, v30.4S, v15.s[1] +sqrdmulh v16.4S, v29.4S, v15.s[1] +mul v19.4S, v19.4S,v0.s[0] +mul v27.4S, v27.4S,v0.s[0] +mul v30.4S, v30.4S,v0.s[1] +mul v29.4S, v29.4S,v0.s[1] +mla v19.4S, v18.4S, v31.s[0] +mla v27.4S, v22.4S, v31.s[0] +mla v30.4S, v7.4S, v31.s[0] +mla v29.4S, v16.4S, v31.s[0] +sub v16.4s, v26.4s, v19.4s +sub v7.4s, v25.4s, v27.4s +sub v22.4s, v17.4s, v30.4s +sub v18.4s, v24.4s, v29.4s +add v26.4s, v26.4s, v19.4s +add v25.4s, v25.4s, v27.4s +add v17.4s, v17.4s, v30.4s +add v24.4s, v24.4s, v29.4s +sqrdmulh v29.4S, v20.4S, v15.s[2] +sqrdmulh v30.4S, v10.4S, v15.s[2] +sqrdmulh v27.4S, v9.4S, v15.s[3] +sqrdmulh v19.4S, 
v8.4S, v15.s[3] +mul v20.4S, v20.4S,v0.s[2] +mul v10.4S, v10.4S,v0.s[2] +mul v9.4S, v9.4S,v0.s[3] +mul v8.4S, v8.4S,v0.s[3] +mla v20.4S, v29.4S, v31.s[0] +mla v10.4S, v30.4S, v31.s[0] +mla v9.4S, v27.4S, v31.s[0] +mla v8.4S, v19.4S, v31.s[0] +sub v19.4s, v3.4s, v20.4s +sub v27.4s, v23.4s, v10.4s +sub v30.4s, v21.4s, v9.4s +sub v29.4s, v28.4s, v8.4s +add v3.4s, v3.4s, v20.4s +add v23.4s, v23.4s, v10.4s +add v21.4s, v21.4s, v9.4s +add v28.4s, v28.4s, v8.4s +sqrdmulh v8.4S, v25.4S, v13.s[0] +sqrdmulh v9.4S, v7.4S, v13.s[1] +sqrdmulh v10.4S, v24.4S, v13.s[2] +sqrdmulh v20.4S, v18.4S, v13.s[3] +mul v25.4S, v25.4S,v14.s[0] +mul v7.4S, v7.4S,v14.s[1] +mul v24.4S, v24.4S,v14.s[2] +mul v18.4S, v18.4S,v14.s[3] +mla v25.4S, v8.4S, v31.s[0] +mla v7.4S, v9.4S, v31.s[0] +mla v24.4S, v10.4S, v31.s[0] +mla v18.4S, v20.4S, v31.s[0] +sub v20.4s, v26.4s, v25.4s +sub v10.4s, v16.4s, v7.4s +sub v9.4s, v17.4s, v24.4s +sub v8.4s, v22.4s, v18.4s +add v26.4s, v26.4s, v25.4s +add v16.4s, v16.4s, v7.4s +add v17.4s, v17.4s, v24.4s +add v22.4s, v22.4s, v18.4s +sqrdmulh v18.4S, v23.4S, v11.s[0] +sqrdmulh v24.4S, v27.4S, v11.s[1] +sqrdmulh v7.4S, v28.4S, v11.s[2] +sqrdmulh v25.4S, v29.4S, v11.s[3] +mul v23.4S, v23.4S,v12.s[0] +mul v27.4S, v27.4S,v12.s[1] +mul v28.4S, v28.4S,v12.s[2] +mul v29.4S, v29.4S,v12.s[3] +mla v23.4S, v18.4S, v31.s[0] +mla v27.4S, v24.4S, v31.s[0] +mla v28.4S, v7.4S, v31.s[0] +mla v29.4S, v25.4S, v31.s[0] +sub v25.4s, v3.4s, v23.4s +sub v7.4s, v19.4s, v27.4s +sub v24.4s, v21.4s, v28.4s +sub v18.4s, v30.4s, v29.4s +add v3.4s, v3.4s, v23.4s +add v19.4s, v19.4s, v27.4s +add v21.4s, v21.4s, v28.4s +add v30.4s, v30.4s, v29.4s +str q26, [x0, #0] +str q20, [x0, #64] +str q16, [x0, #128] +str q10, [x0, #192] +str q17, [x0, #256] +str q9, [x0, #320] +str q22, [x0, #384] +str q8, [x0, #448] +str q3, [x0, #512] +str q25, [x0, #576] +str q19, [x0, #640] +str q7, [x0, #704] +str q21, [x0, #768] +str q24, [x0, #832] +str q30, [x0, #896] +str q18, [x0, #960] +ldr q18, [x0, #784] +ldr 
q30, [x0, #848] +ldr q24, [x0, #912] +ldr q21, [x0, #976] +ldr q7, [x0, #272] +ldr q19, [x0, #336] +ldr q25, [x0, #400] +ldr q3, [x0, #464] +ldr q8, [x0, #528] +ldr q22, [x0, #592] +ldr q9, [x0, #656] +ldr q17, [x0, #720] +ldr q10, [x0, #16] +ldr q16, [x0, #80] +ldr q20, [x0, #144] +ldr q26, [x0, #208] +sqrdmulh v29.4S, v18.4S, v1.s[0] +sqrdmulh v28.4S, v30.4S, v1.s[0] +sqrdmulh v27.4S, v24.4S, v1.s[0] +sqrdmulh v23.4S, v21.4S, v1.s[0] +mul v18.4S, v18.4S,v2.s[0] +mul v30.4S, v30.4S,v2.s[0] +mul v24.4S, v24.4S,v2.s[0] +mul v21.4S, v21.4S,v2.s[0] +mla v18.4S, v29.4S, v31.s[0] +mla v30.4S, v28.4S, v31.s[0] +mla v24.4S, v27.4S, v31.s[0] +mla v21.4S, v23.4S, v31.s[0] +sub v23.4s, v7.4s, v18.4s +sub v27.4s, v19.4s, v30.4s +sub v28.4s, v25.4s, v24.4s +sub v29.4s, v3.4s, v21.4s +add v7.4s, v7.4s, v18.4s +add v19.4s, v19.4s, v30.4s +add v25.4s, v25.4s, v24.4s +add v3.4s, v3.4s, v21.4s +sqrdmulh v21.4S, v8.4S, v1.s[0] +sqrdmulh v24.4S, v22.4S, v1.s[0] +sqrdmulh v30.4S, v9.4S, v1.s[0] +sqrdmulh v18.4S, v17.4S, v1.s[0] +mul v8.4S, v8.4S,v2.s[0] +mul v22.4S, v22.4S,v2.s[0] +mul v9.4S, v9.4S,v2.s[0] +mul v17.4S, v17.4S,v2.s[0] +mla v8.4S, v21.4S, v31.s[0] +mla v22.4S, v24.4S, v31.s[0] +mla v9.4S, v30.4S, v31.s[0] +mla v17.4S, v18.4S, v31.s[0] +sub v18.4s, v10.4s, v8.4s +sub v30.4s, v16.4s, v22.4s +sub v24.4s, v20.4s, v9.4s +sub v21.4s, v26.4s, v17.4s +add v10.4s, v10.4s, v8.4s +add v16.4s, v16.4s, v22.4s +add v20.4s, v20.4s, v9.4s +add v26.4s, v26.4s, v17.4s +sqrdmulh v17.4S, v25.4S, v1.s[1] +sqrdmulh v9.4S, v3.4S, v1.s[1] +sqrdmulh v22.4S, v7.4S, v1.s[1] +sqrdmulh v8.4S, v19.4S, v1.s[1] +mul v25.4S, v25.4S,v2.s[1] +mul v3.4S, v3.4S,v2.s[1] +mul v7.4S, v7.4S,v2.s[1] +mul v19.4S, v19.4S,v2.s[1] +mla v25.4S, v17.4S, v31.s[0] +mla v3.4S, v9.4S, v31.s[0] +mla v7.4S, v22.4S, v31.s[0] +mla v19.4S, v8.4S, v31.s[0] +sub v8.4s, v20.4s, v25.4s +sub v22.4s, v26.4s, v3.4s +sub v9.4s, v10.4s, v7.4s +sub v17.4s, v16.4s, v19.4s +add v20.4s, v20.4s, v25.4s +add v26.4s, v26.4s, v3.4s +add 
v10.4s, v10.4s, v7.4s +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v28.4S, v1.s[2] +sqrdmulh v7.4S, v29.4S, v1.s[2] +sqrdmulh v3.4S, v23.4S, v1.s[2] +sqrdmulh v25.4S, v27.4S, v1.s[2] +mul v28.4S, v28.4S,v2.s[2] +mul v29.4S, v29.4S,v2.s[2] +mul v23.4S, v23.4S,v2.s[2] +mul v27.4S, v27.4S,v2.s[2] +mla v28.4S, v19.4S, v31.s[0] +mla v29.4S, v7.4S, v31.s[0] +mla v23.4S, v3.4S, v31.s[0] +mla v27.4S, v25.4S, v31.s[0] +sub v25.4s, v24.4s, v28.4s +sub v3.4s, v21.4s, v29.4s +sub v7.4s, v18.4s, v23.4s +sub v19.4s, v30.4s, v27.4s +add v24.4s, v24.4s, v28.4s +add v21.4s, v21.4s, v29.4s +add v18.4s, v18.4s, v23.4s +add v30.4s, v30.4s, v27.4s +sqrdmulh v27.4S, v20.4S, v15.s[0] +sqrdmulh v23.4S, v26.4S, v15.s[0] +sqrdmulh v29.4S, v8.4S, v15.s[1] +sqrdmulh v28.4S, v22.4S, v15.s[1] +mul v20.4S, v20.4S,v0.s[0] +mul v26.4S, v26.4S,v0.s[0] +mul v8.4S, v8.4S,v0.s[1] +mul v22.4S, v22.4S,v0.s[1] +mla v20.4S, v27.4S, v31.s[0] +mla v26.4S, v23.4S, v31.s[0] +mla v8.4S, v29.4S, v31.s[0] +mla v22.4S, v28.4S, v31.s[0] +sub v28.4s, v10.4s, v20.4s +sub v29.4s, v16.4s, v26.4s +sub v23.4s, v9.4s, v8.4s +sub v27.4s, v17.4s, v22.4s +add v10.4s, v10.4s, v20.4s +add v16.4s, v16.4s, v26.4s +add v9.4s, v9.4s, v8.4s +add v17.4s, v17.4s, v22.4s +sqrdmulh v22.4S, v24.4S, v15.s[2] +sqrdmulh v8.4S, v21.4S, v15.s[2] +sqrdmulh v26.4S, v25.4S, v15.s[3] +sqrdmulh v20.4S, v3.4S, v15.s[3] +mul v24.4S, v24.4S,v0.s[2] +mul v21.4S, v21.4S,v0.s[2] +mul v25.4S, v25.4S,v0.s[3] +mul v3.4S, v3.4S,v0.s[3] +mla v24.4S, v22.4S, v31.s[0] +mla v21.4S, v8.4S, v31.s[0] +mla v25.4S, v26.4S, v31.s[0] +mla v3.4S, v20.4S, v31.s[0] +sub v20.4s, v18.4s, v24.4s +sub v26.4s, v30.4s, v21.4s +sub v8.4s, v7.4s, v25.4s +sub v22.4s, v19.4s, v3.4s +add v18.4s, v18.4s, v24.4s +add v30.4s, v30.4s, v21.4s +add v7.4s, v7.4s, v25.4s +add v19.4s, v19.4s, v3.4s +sqrdmulh v3.4S, v16.4S, v13.s[0] +sqrdmulh v25.4S, v29.4S, v13.s[1] +sqrdmulh v21.4S, v17.4S, v13.s[2] +sqrdmulh v24.4S, v27.4S, v13.s[3] +mul v16.4S, v16.4S,v14.s[0] +mul v29.4S, 
v29.4S,v14.s[1] +mul v17.4S, v17.4S,v14.s[2] +mul v27.4S, v27.4S,v14.s[3] +mla v16.4S, v3.4S, v31.s[0] +mla v29.4S, v25.4S, v31.s[0] +mla v17.4S, v21.4S, v31.s[0] +mla v27.4S, v24.4S, v31.s[0] +sub v24.4s, v10.4s, v16.4s +sub v21.4s, v28.4s, v29.4s +sub v25.4s, v9.4s, v17.4s +sub v3.4s, v23.4s, v27.4s +add v10.4s, v10.4s, v16.4s +add v28.4s, v28.4s, v29.4s +add v9.4s, v9.4s, v17.4s +add v23.4s, v23.4s, v27.4s +sqrdmulh v27.4S, v30.4S, v11.s[0] +sqrdmulh v17.4S, v26.4S, v11.s[1] +sqrdmulh v29.4S, v19.4S, v11.s[2] +sqrdmulh v16.4S, v22.4S, v11.s[3] +mul v30.4S, v30.4S,v12.s[0] +mul v26.4S, v26.4S,v12.s[1] +mul v19.4S, v19.4S,v12.s[2] +mul v22.4S, v22.4S,v12.s[3] +mla v30.4S, v27.4S, v31.s[0] +mla v26.4S, v17.4S, v31.s[0] +mla v19.4S, v29.4S, v31.s[0] +mla v22.4S, v16.4S, v31.s[0] +sub v16.4s, v18.4s, v30.4s +sub v29.4s, v20.4s, v26.4s +sub v17.4s, v7.4s, v19.4s +sub v27.4s, v8.4s, v22.4s +add v18.4s, v18.4s, v30.4s +add v20.4s, v20.4s, v26.4s +add v7.4s, v7.4s, v19.4s +add v8.4s, v8.4s, v22.4s +str q10, [x0, #16] +str q24, [x0, #80] +str q28, [x0, #144] +str q21, [x0, #208] +str q9, [x0, #272] +str q25, [x0, #336] +str q23, [x0, #400] +str q3, [x0, #464] +str q18, [x0, #528] +str q16, [x0, #592] +str q20, [x0, #656] +str q29, [x0, #720] +str q7, [x0, #784] +str q17, [x0, #848] +str q8, [x0, #912] +str q27, [x0, #976] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q6, [x17, #+160] +ldr q30, [x17, #+176] +ldr q26, [x17, #+192] +ldr q19, [x17, #+208] +ldr q22, [x17, #+224] +ldr q10, [x17, #+240] +ldr q24, [x0, #32] +ldr q28, [x0, #48] +ldr q21, [x0, #0] +ldr q9, [x0, #16] +sqrdmulh v25.4S, v24.4S, v5.s[0] +mul v24.4S, v24.4S,v4.s[0] +mla v24.4S, v25.4S, v31.s[0] +sub v25.4s, v21.4s, v24.4s +add v21.4s, v21.4s, v24.4s +sqrdmulh v24.4S, v28.4S, v5.s[0] +mul v28.4S, v28.4S,v4.s[0] +mla v28.4S, v24.4S, v31.s[0] +sub v24.4s, v9.4s, v28.4s +add v9.4s, v9.4s, v28.4s +sqrdmulh v28.4S, v9.4S, v5.s[1] +mul v9.4S, v9.4S,v4.s[1] +mla v9.4S, v28.4S, v31.s[0] +sub v28.4s, v21.4s, 
v9.4s +add v21.4s, v21.4s, v9.4s +sqrdmulh v9.4S, v24.4S, v5.s[2] +mul v24.4S, v24.4S,v4.s[2] +mla v24.4S, v9.4S, v31.s[0] +sub v9.4s, v25.4s, v24.4s +add v25.4s, v25.4s, v24.4s +trn1 v24.4S, v21.4S, v28.4S +trn2 v23.4S, v21.4S, v28.4S +trn1 v3.4S, v25.4S, v9.4S +trn2 v18.4S, v25.4S, v9.4S +trn2 v25.2D, v24.2D, v3.2D +trn2 v9.2D, v23.2D, v18.2D +trn1 v21.2D, v24.2D, v3.2D +trn1 v28.2D, v23.2D, v18.2D +sqrdmulh v18.4S, v25.4S, v30.4S +mul v25.4S, v25.4S,v6.4S +mla v25.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v9.4S, v30.4S +mul v9.4S, v9.4S,v6.4S +mla v9.4S, v25.4S, v31.s[0] +sub v25.4s, v28.4s, v9.4s +add v28.4s, v28.4s, v9.4s +sqrdmulh v9.4S, v28.4S, v19.4S +mul v28.4S, v28.4S,v26.4S +mla v28.4S, v9.4S, v31.s[0] +sub v9.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +sqrdmulh v28.4S, v25.4S, v10.4S +mul v25.4S, v25.4S,v22.4S +mla v25.4S, v28.4S, v31.s[0] +sub v28.4s, v18.4s, v25.4s +add v18.4s, v18.4s, v25.4s +str q21, [x0, #0] +str q9, [x0, #16] +str q18, [x0, #32] +str q28, [x0, #48] +ldr q28, [x17, #+256] +ldr q18, [x17, #+272] +ldr q9, [x17, #+288] +ldr q21, [x17, #+304] +ldr q25, [x17, #+320] +ldr q23, [x17, #+336] +ldr q3, [x17, #+352] +ldr q24, [x17, #+368] +ldr q10, [x0, #96] +ldr q22, [x0, #112] +ldr q19, [x0, #64] +ldr q26, [x0, #80] +sqrdmulh v30.4S, v10.4S, v18.s[0] +mul v10.4S, v10.4S,v28.s[0] +mla v10.4S, v30.4S, v31.s[0] +sub v30.4s, v19.4s, v10.4s +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v22.4S, v18.s[0] +mul v22.4S, v22.4S,v28.s[0] +mla v22.4S, v10.4S, v31.s[0] +sub v10.4s, v26.4s, v22.4s +add v26.4s, v26.4s, v22.4s +sqrdmulh v22.4S, v26.4S, v18.s[1] +mul v26.4S, v26.4S,v28.s[1] +mla v26.4S, v22.4S, v31.s[0] +sub v22.4s, v19.4s, v26.4s +add v19.4s, v19.4s, v26.4s +sqrdmulh v26.4S, v10.4S, v18.s[2] +mul v10.4S, v10.4S,v28.s[2] +mla v10.4S, v26.4S, v31.s[0] +sub v26.4s, v30.4s, v10.4s +add v30.4s, v30.4s, v10.4s +trn1 v10.4S, v19.4S, v22.4S +trn2 v6.4S, v19.4S, v22.4S +trn1 v5.4S, v30.4S, 
v26.4S +trn2 v4.4S, v30.4S, v26.4S +trn2 v30.2D, v10.2D, v5.2D +trn2 v26.2D, v6.2D, v4.2D +trn1 v19.2D, v10.2D, v5.2D +trn1 v22.2D, v6.2D, v4.2D +sqrdmulh v4.4S, v30.4S, v21.4S +mul v30.4S, v30.4S,v9.4S +mla v30.4S, v4.4S, v31.s[0] +sub v4.4s, v19.4s, v30.4s +add v19.4s, v19.4s, v30.4s +sqrdmulh v30.4S, v26.4S, v21.4S +mul v26.4S, v26.4S,v9.4S +mla v26.4S, v30.4S, v31.s[0] +sub v30.4s, v22.4s, v26.4s +add v22.4s, v22.4s, v26.4s +sqrdmulh v26.4S, v22.4S, v23.4S +mul v22.4S, v22.4S,v25.4S +mla v22.4S, v26.4S, v31.s[0] +sub v26.4s, v19.4s, v22.4s +add v19.4s, v19.4s, v22.4s +sqrdmulh v22.4S, v30.4S, v24.4S +mul v30.4S, v30.4S,v3.4S +mla v30.4S, v22.4S, v31.s[0] +sub v22.4s, v4.4s, v30.4s +add v4.4s, v4.4s, v30.4s +str q19, [x0, #64] +str q26, [x0, #80] +str q4, [x0, #96] +str q22, [x0, #112] +ldr q22, [x17, #+384] +ldr q4, [x17, #+400] +ldr q26, [x17, #+416] +ldr q19, [x17, #+432] +ldr q30, [x17, #+448] +ldr q6, [x17, #+464] +ldr q5, [x17, #+480] +ldr q10, [x17, #+496] +ldr q24, [x0, #160] +ldr q3, [x0, #176] +ldr q23, [x0, #128] +ldr q25, [x0, #144] +sqrdmulh v21.4S, v24.4S, v4.s[0] +mul v24.4S, v24.4S,v22.s[0] +mla v24.4S, v21.4S, v31.s[0] +sub v21.4s, v23.4s, v24.4s +add v23.4s, v23.4s, v24.4s +sqrdmulh v24.4S, v3.4S, v4.s[0] +mul v3.4S, v3.4S,v22.s[0] +mla v3.4S, v24.4S, v31.s[0] +sub v24.4s, v25.4s, v3.4s +add v25.4s, v25.4s, v3.4s +sqrdmulh v3.4S, v25.4S, v4.s[1] +mul v25.4S, v25.4S,v22.s[1] +mla v25.4S, v3.4S, v31.s[0] +sub v3.4s, v23.4s, v25.4s +add v23.4s, v23.4s, v25.4s +sqrdmulh v25.4S, v24.4S, v4.s[2] +mul v24.4S, v24.4S,v22.s[2] +mla v24.4S, v25.4S, v31.s[0] +sub v25.4s, v21.4s, v24.4s +add v21.4s, v21.4s, v24.4s +trn1 v24.4S, v23.4S, v3.4S +trn2 v9.4S, v23.4S, v3.4S +trn1 v18.4S, v21.4S, v25.4S +trn2 v28.4S, v21.4S, v25.4S +trn2 v21.2D, v24.2D, v18.2D +trn2 v25.2D, v9.2D, v28.2D +trn1 v23.2D, v24.2D, v18.2D +trn1 v3.2D, v9.2D, v28.2D +sqrdmulh v28.4S, v21.4S, v19.4S +mul v21.4S, v21.4S,v26.4S +mla v21.4S, v28.4S, v31.s[0] +sub v28.4s, v23.4s, v21.4s +add 
v23.4s, v23.4s, v21.4s +sqrdmulh v21.4S, v25.4S, v19.4S +mul v25.4S, v25.4S,v26.4S +mla v25.4S, v21.4S, v31.s[0] +sub v21.4s, v3.4s, v25.4s +add v3.4s, v3.4s, v25.4s +sqrdmulh v25.4S, v3.4S, v6.4S +mul v3.4S, v3.4S,v30.4S +mla v3.4S, v25.4S, v31.s[0] +sub v25.4s, v23.4s, v3.4s +add v23.4s, v23.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v10.4S +mul v21.4S, v21.4S,v5.4S +mla v21.4S, v3.4S, v31.s[0] +sub v3.4s, v28.4s, v21.4s +add v28.4s, v28.4s, v21.4s +str q23, [x0, #128] +str q25, [x0, #144] +str q28, [x0, #160] +str q3, [x0, #176] +ldr q3, [x17, #+512] +ldr q28, [x17, #+528] +ldr q25, [x17, #+544] +ldr q23, [x17, #+560] +ldr q21, [x17, #+576] +ldr q9, [x17, #+592] +ldr q18, [x17, #+608] +ldr q24, [x17, #+624] +ldr q10, [x0, #224] +ldr q5, [x0, #240] +ldr q6, [x0, #192] +ldr q30, [x0, #208] +sqrdmulh v19.4S, v10.4S, v28.s[0] +mul v10.4S, v10.4S,v3.s[0] +mla v10.4S, v19.4S, v31.s[0] +sub v19.4s, v6.4s, v10.4s +add v6.4s, v6.4s, v10.4s +sqrdmulh v10.4S, v5.4S, v28.s[0] +mul v5.4S, v5.4S,v3.s[0] +mla v5.4S, v10.4S, v31.s[0] +sub v10.4s, v30.4s, v5.4s +add v30.4s, v30.4s, v5.4s +sqrdmulh v5.4S, v30.4S, v28.s[1] +mul v30.4S, v30.4S,v3.s[1] +mla v30.4S, v5.4S, v31.s[0] +sub v5.4s, v6.4s, v30.4s +add v6.4s, v6.4s, v30.4s +sqrdmulh v30.4S, v10.4S, v28.s[2] +mul v10.4S, v10.4S,v3.s[2] +mla v10.4S, v30.4S, v31.s[0] +sub v30.4s, v19.4s, v10.4s +add v19.4s, v19.4s, v10.4s +trn1 v10.4S, v6.4S, v5.4S +trn2 v26.4S, v6.4S, v5.4S +trn1 v4.4S, v19.4S, v30.4S +trn2 v22.4S, v19.4S, v30.4S +trn2 v19.2D, v10.2D, v4.2D +trn2 v30.2D, v26.2D, v22.2D +trn1 v6.2D, v10.2D, v4.2D +trn1 v5.2D, v26.2D, v22.2D +sqrdmulh v22.4S, v19.4S, v23.4S +mul v19.4S, v19.4S,v25.4S +mla v19.4S, v22.4S, v31.s[0] +sub v22.4s, v6.4s, v19.4s +add v6.4s, v6.4s, v19.4s +sqrdmulh v19.4S, v30.4S, v23.4S +mul v30.4S, v30.4S,v25.4S +mla v30.4S, v19.4S, v31.s[0] +sub v19.4s, v5.4s, v30.4s +add v5.4s, v5.4s, v30.4s +sqrdmulh v30.4S, v5.4S, v9.4S +mul v5.4S, v5.4S,v21.4S +mla v5.4S, v30.4S, v31.s[0] +sub v30.4s, v6.4s, v5.4s +add 
v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v19.4S, v24.4S +mul v19.4S, v19.4S,v18.4S +mla v19.4S, v5.4S, v31.s[0] +sub v5.4s, v22.4s, v19.4s +add v22.4s, v22.4s, v19.4s +str q6, [x0, #192] +str q30, [x0, #208] +str q22, [x0, #224] +str q5, [x0, #240] +ldr q5, [x17, #+640] +ldr q22, [x17, #+656] +ldr q30, [x17, #+672] +ldr q6, [x17, #+688] +ldr q19, [x17, #+704] +ldr q26, [x17, #+720] +ldr q4, [x17, #+736] +ldr q10, [x17, #+752] +ldr q24, [x0, #288] +ldr q18, [x0, #304] +ldr q9, [x0, #256] +ldr q21, [x0, #272] +sqrdmulh v23.4S, v24.4S, v22.s[0] +mul v24.4S, v24.4S,v5.s[0] +mla v24.4S, v23.4S, v31.s[0] +sub v23.4s, v9.4s, v24.4s +add v9.4s, v9.4s, v24.4s +sqrdmulh v24.4S, v18.4S, v22.s[0] +mul v18.4S, v18.4S,v5.s[0] +mla v18.4S, v24.4S, v31.s[0] +sub v24.4s, v21.4s, v18.4s +add v21.4s, v21.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v22.s[1] +mul v21.4S, v21.4S,v5.s[1] +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v9.4s, v21.4s +add v9.4s, v9.4s, v21.4s +sqrdmulh v21.4S, v24.4S, v22.s[2] +mul v24.4S, v24.4S,v5.s[2] +mla v24.4S, v21.4S, v31.s[0] +sub v21.4s, v23.4s, v24.4s +add v23.4s, v23.4s, v24.4s +trn1 v24.4S, v9.4S, v18.4S +trn2 v25.4S, v9.4S, v18.4S +trn1 v28.4S, v23.4S, v21.4S +trn2 v3.4S, v23.4S, v21.4S +trn2 v23.2D, v24.2D, v28.2D +trn2 v21.2D, v25.2D, v3.2D +trn1 v9.2D, v24.2D, v28.2D +trn1 v18.2D, v25.2D, v3.2D +sqrdmulh v3.4S, v23.4S, v6.4S +mul v23.4S, v23.4S,v30.4S +mla v23.4S, v3.4S, v31.s[0] +sub v3.4s, v9.4s, v23.4s +add v9.4s, v9.4s, v23.4s +sqrdmulh v23.4S, v21.4S, v6.4S +mul v21.4S, v21.4S,v30.4S +mla v21.4S, v23.4S, v31.s[0] +sub v23.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v18.4S, v26.4S +mul v18.4S, v18.4S,v19.4S +mla v18.4S, v21.4S, v31.s[0] +sub v21.4s, v9.4s, v18.4s +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v23.4S, v10.4S +mul v23.4S, v23.4S,v4.4S +mla v23.4S, v18.4S, v31.s[0] +sub v18.4s, v3.4s, v23.4s +add v3.4s, v3.4s, v23.4s +str q9, [x0, #256] +str q21, [x0, #272] +str q3, [x0, #288] +str q18, [x0, #304] +ldr q18, [x17, #+768] 
+ldr q3, [x17, #+784] +ldr q21, [x17, #+800] +ldr q9, [x17, #+816] +ldr q23, [x17, #+832] +ldr q25, [x17, #+848] +ldr q28, [x17, #+864] +ldr q24, [x17, #+880] +ldr q10, [x0, #352] +ldr q4, [x0, #368] +ldr q26, [x0, #320] +ldr q19, [x0, #336] +sqrdmulh v6.4S, v10.4S, v3.s[0] +mul v10.4S, v10.4S,v18.s[0] +mla v10.4S, v6.4S, v31.s[0] +sub v6.4s, v26.4s, v10.4s +add v26.4s, v26.4s, v10.4s +sqrdmulh v10.4S, v4.4S, v3.s[0] +mul v4.4S, v4.4S,v18.s[0] +mla v4.4S, v10.4S, v31.s[0] +sub v10.4s, v19.4s, v4.4s +add v19.4s, v19.4s, v4.4s +sqrdmulh v4.4S, v19.4S, v3.s[1] +mul v19.4S, v19.4S,v18.s[1] +mla v19.4S, v4.4S, v31.s[0] +sub v4.4s, v26.4s, v19.4s +add v26.4s, v26.4s, v19.4s +sqrdmulh v19.4S, v10.4S, v3.s[2] +mul v10.4S, v10.4S,v18.s[2] +mla v10.4S, v19.4S, v31.s[0] +sub v19.4s, v6.4s, v10.4s +add v6.4s, v6.4s, v10.4s +trn1 v10.4S, v26.4S, v4.4S +trn2 v30.4S, v26.4S, v4.4S +trn1 v22.4S, v6.4S, v19.4S +trn2 v5.4S, v6.4S, v19.4S +trn2 v6.2D, v10.2D, v22.2D +trn2 v19.2D, v30.2D, v5.2D +trn1 v26.2D, v10.2D, v22.2D +trn1 v4.2D, v30.2D, v5.2D +sqrdmulh v5.4S, v6.4S, v9.4S +mul v6.4S, v6.4S,v21.4S +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v26.4s, v6.4s +add v26.4s, v26.4s, v6.4s +sqrdmulh v6.4S, v19.4S, v9.4S +mul v19.4S, v19.4S,v21.4S +mla v19.4S, v6.4S, v31.s[0] +sub v6.4s, v4.4s, v19.4s +add v4.4s, v4.4s, v19.4s +sqrdmulh v19.4S, v4.4S, v25.4S +mul v4.4S, v4.4S,v23.4S +mla v4.4S, v19.4S, v31.s[0] +sub v19.4s, v26.4s, v4.4s +add v26.4s, v26.4s, v4.4s +sqrdmulh v4.4S, v6.4S, v24.4S +mul v6.4S, v6.4S,v28.4S +mla v6.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v6.4s +add v5.4s, v5.4s, v6.4s +str q26, [x0, #320] +str q19, [x0, #336] +str q5, [x0, #352] +str q4, [x0, #368] +ldr q4, [x17, #+896] +ldr q5, [x17, #+912] +ldr q19, [x17, #+928] +ldr q26, [x17, #+944] +ldr q6, [x17, #+960] +ldr q30, [x17, #+976] +ldr q22, [x17, #+992] +ldr q10, [x17, #+1008] +ldr q24, [x0, #416] +ldr q28, [x0, #432] +ldr q25, [x0, #384] +ldr q23, [x0, #400] +sqrdmulh v9.4S, v24.4S, v5.s[0] +mul v24.4S, 
v24.4S,v4.s[0] +mla v24.4S, v9.4S, v31.s[0] +sub v9.4s, v25.4s, v24.4s +add v25.4s, v25.4s, v24.4s +sqrdmulh v24.4S, v28.4S, v5.s[0] +mul v28.4S, v28.4S,v4.s[0] +mla v28.4S, v24.4S, v31.s[0] +sub v24.4s, v23.4s, v28.4s +add v23.4s, v23.4s, v28.4s +sqrdmulh v28.4S, v23.4S, v5.s[1] +mul v23.4S, v23.4S,v4.s[1] +mla v23.4S, v28.4S, v31.s[0] +sub v28.4s, v25.4s, v23.4s +add v25.4s, v25.4s, v23.4s +sqrdmulh v23.4S, v24.4S, v5.s[2] +mul v24.4S, v24.4S,v4.s[2] +mla v24.4S, v23.4S, v31.s[0] +sub v23.4s, v9.4s, v24.4s +add v9.4s, v9.4s, v24.4s +trn1 v24.4S, v25.4S, v28.4S +trn2 v21.4S, v25.4S, v28.4S +trn1 v3.4S, v9.4S, v23.4S +trn2 v18.4S, v9.4S, v23.4S +trn2 v9.2D, v24.2D, v3.2D +trn2 v23.2D, v21.2D, v18.2D +trn1 v25.2D, v24.2D, v3.2D +trn1 v28.2D, v21.2D, v18.2D +sqrdmulh v18.4S, v9.4S, v26.4S +mul v9.4S, v9.4S,v19.4S +mla v9.4S, v18.4S, v31.s[0] +sub v18.4s, v25.4s, v9.4s +add v25.4s, v25.4s, v9.4s +sqrdmulh v9.4S, v23.4S, v26.4S +mul v23.4S, v23.4S,v19.4S +mla v23.4S, v9.4S, v31.s[0] +sub v9.4s, v28.4s, v23.4s +add v28.4s, v28.4s, v23.4s +sqrdmulh v23.4S, v28.4S, v30.4S +mul v28.4S, v28.4S,v6.4S +mla v28.4S, v23.4S, v31.s[0] +sub v23.4s, v25.4s, v28.4s +add v25.4s, v25.4s, v28.4s +sqrdmulh v28.4S, v9.4S, v10.4S +mul v9.4S, v9.4S,v22.4S +mla v9.4S, v28.4S, v31.s[0] +sub v28.4s, v18.4s, v9.4s +add v18.4s, v18.4s, v9.4s +str q25, [x0, #384] +str q23, [x0, #400] +str q18, [x0, #416] +str q28, [x0, #432] +ldr q28, [x17, #+1024] +ldr q18, [x17, #+1040] +ldr q23, [x17, #+1056] +ldr q25, [x17, #+1072] +ldr q9, [x17, #+1088] +ldr q21, [x17, #+1104] +ldr q3, [x17, #+1120] +ldr q24, [x17, #+1136] +ldr q10, [x0, #480] +ldr q22, [x0, #496] +ldr q30, [x0, #448] +ldr q6, [x0, #464] +sqrdmulh v26.4S, v10.4S, v18.s[0] +mul v10.4S, v10.4S,v28.s[0] +mla v10.4S, v26.4S, v31.s[0] +sub v26.4s, v30.4s, v10.4s +add v30.4s, v30.4s, v10.4s +sqrdmulh v10.4S, v22.4S, v18.s[0] +mul v22.4S, v22.4S,v28.s[0] +mla v22.4S, v10.4S, v31.s[0] +sub v10.4s, v6.4s, v22.4s +add v6.4s, v6.4s, v22.4s +sqrdmulh 
v22.4S, v6.4S, v18.s[1] +mul v6.4S, v6.4S,v28.s[1] +mla v6.4S, v22.4S, v31.s[0] +sub v22.4s, v30.4s, v6.4s +add v30.4s, v30.4s, v6.4s +sqrdmulh v6.4S, v10.4S, v18.s[2] +mul v10.4S, v10.4S,v28.s[2] +mla v10.4S, v6.4S, v31.s[0] +sub v6.4s, v26.4s, v10.4s +add v26.4s, v26.4s, v10.4s +trn1 v10.4S, v30.4S, v22.4S +trn2 v19.4S, v30.4S, v22.4S +trn1 v5.4S, v26.4S, v6.4S +trn2 v4.4S, v26.4S, v6.4S +trn2 v26.2D, v10.2D, v5.2D +trn2 v6.2D, v19.2D, v4.2D +trn1 v30.2D, v10.2D, v5.2D +trn1 v22.2D, v19.2D, v4.2D +sqrdmulh v4.4S, v26.4S, v25.4S +mul v26.4S, v26.4S,v23.4S +mla v26.4S, v4.4S, v31.s[0] +sub v4.4s, v30.4s, v26.4s +add v30.4s, v30.4s, v26.4s +sqrdmulh v26.4S, v6.4S, v25.4S +mul v6.4S, v6.4S,v23.4S +mla v6.4S, v26.4S, v31.s[0] +sub v26.4s, v22.4s, v6.4s +add v22.4s, v22.4s, v6.4s +sqrdmulh v6.4S, v22.4S, v21.4S +mul v22.4S, v22.4S,v9.4S +mla v22.4S, v6.4S, v31.s[0] +sub v6.4s, v30.4s, v22.4s +add v30.4s, v30.4s, v22.4s +sqrdmulh v22.4S, v26.4S, v24.4S +mul v26.4S, v26.4S,v3.4S +mla v26.4S, v22.4S, v31.s[0] +sub v22.4s, v4.4s, v26.4s +add v4.4s, v4.4s, v26.4s +str q30, [x0, #448] +str q6, [x0, #464] +str q4, [x0, #480] +str q22, [x0, #496] +ldr q22, [x17, #+1152] +ldr q4, [x17, #+1168] +ldr q6, [x17, #+1184] +ldr q30, [x17, #+1200] +ldr q26, [x17, #+1216] +ldr q19, [x17, #+1232] +ldr q5, [x17, #+1248] +ldr q10, [x17, #+1264] +ldr q24, [x0, #544] +ldr q3, [x0, #560] +ldr q21, [x0, #512] +ldr q9, [x0, #528] +sqrdmulh v25.4S, v24.4S, v4.s[0] +mul v24.4S, v24.4S,v22.s[0] +mla v24.4S, v25.4S, v31.s[0] +sub v25.4s, v21.4s, v24.4s +add v21.4s, v21.4s, v24.4s +sqrdmulh v24.4S, v3.4S, v4.s[0] +mul v3.4S, v3.4S,v22.s[0] +mla v3.4S, v24.4S, v31.s[0] +sub v24.4s, v9.4s, v3.4s +add v9.4s, v9.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v4.s[1] +mul v9.4S, v9.4S,v22.s[1] +mla v9.4S, v3.4S, v31.s[0] +sub v3.4s, v21.4s, v9.4s +add v21.4s, v21.4s, v9.4s +sqrdmulh v9.4S, v24.4S, v4.s[2] +mul v24.4S, v24.4S,v22.s[2] +mla v24.4S, v9.4S, v31.s[0] +sub v9.4s, v25.4s, v24.4s +add v25.4s, v25.4s, v24.4s 
+trn1 v24.4S, v21.4S, v3.4S +trn2 v23.4S, v21.4S, v3.4S +trn1 v18.4S, v25.4S, v9.4S +trn2 v28.4S, v25.4S, v9.4S +trn2 v25.2D, v24.2D, v18.2D +trn2 v9.2D, v23.2D, v28.2D +trn1 v21.2D, v24.2D, v18.2D +trn1 v3.2D, v23.2D, v28.2D +sqrdmulh v28.4S, v25.4S, v30.4S +mul v25.4S, v25.4S,v6.4S +mla v25.4S, v28.4S, v31.s[0] +sub v28.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v9.4S, v30.4S +mul v9.4S, v9.4S,v6.4S +mla v9.4S, v25.4S, v31.s[0] +sub v25.4s, v3.4s, v9.4s +add v3.4s, v3.4s, v9.4s +sqrdmulh v9.4S, v3.4S, v19.4S +mul v3.4S, v3.4S,v26.4S +mla v3.4S, v9.4S, v31.s[0] +sub v9.4s, v21.4s, v3.4s +add v21.4s, v21.4s, v3.4s +sqrdmulh v3.4S, v25.4S, v10.4S +mul v25.4S, v25.4S,v5.4S +mla v25.4S, v3.4S, v31.s[0] +sub v3.4s, v28.4s, v25.4s +add v28.4s, v28.4s, v25.4s +str q21, [x0, #512] +str q9, [x0, #528] +str q28, [x0, #544] +str q3, [x0, #560] +ldr q3, [x17, #+1280] +ldr q28, [x17, #+1296] +ldr q9, [x17, #+1312] +ldr q21, [x17, #+1328] +ldr q25, [x17, #+1344] +ldr q23, [x17, #+1360] +ldr q18, [x17, #+1376] +ldr q24, [x17, #+1392] +ldr q10, [x0, #608] +ldr q5, [x0, #624] +ldr q19, [x0, #576] +ldr q26, [x0, #592] +sqrdmulh v30.4S, v10.4S, v28.s[0] +mul v10.4S, v10.4S,v3.s[0] +mla v10.4S, v30.4S, v31.s[0] +sub v30.4s, v19.4s, v10.4s +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v5.4S, v28.s[0] +mul v5.4S, v5.4S,v3.s[0] +mla v5.4S, v10.4S, v31.s[0] +sub v10.4s, v26.4s, v5.4s +add v26.4s, v26.4s, v5.4s +sqrdmulh v5.4S, v26.4S, v28.s[1] +mul v26.4S, v26.4S,v3.s[1] +mla v26.4S, v5.4S, v31.s[0] +sub v5.4s, v19.4s, v26.4s +add v19.4s, v19.4s, v26.4s +sqrdmulh v26.4S, v10.4S, v28.s[2] +mul v10.4S, v10.4S,v3.s[2] +mla v10.4S, v26.4S, v31.s[0] +sub v26.4s, v30.4s, v10.4s +add v30.4s, v30.4s, v10.4s +trn1 v10.4S, v19.4S, v5.4S +trn2 v6.4S, v19.4S, v5.4S +trn1 v4.4S, v30.4S, v26.4S +trn2 v22.4S, v30.4S, v26.4S +trn2 v30.2D, v10.2D, v4.2D +trn2 v26.2D, v6.2D, v22.2D +trn1 v19.2D, v10.2D, v4.2D +trn1 v5.2D, v6.2D, v22.2D +sqrdmulh v22.4S, v30.4S, v21.4S +mul v30.4S, 
v30.4S,v9.4S +mla v30.4S, v22.4S, v31.s[0] +sub v22.4s, v19.4s, v30.4s +add v19.4s, v19.4s, v30.4s +sqrdmulh v30.4S, v26.4S, v21.4S +mul v26.4S, v26.4S,v9.4S +mla v26.4S, v30.4S, v31.s[0] +sub v30.4s, v5.4s, v26.4s +add v5.4s, v5.4s, v26.4s +sqrdmulh v26.4S, v5.4S, v23.4S +mul v5.4S, v5.4S,v25.4S +mla v5.4S, v26.4S, v31.s[0] +sub v26.4s, v19.4s, v5.4s +add v19.4s, v19.4s, v5.4s +sqrdmulh v5.4S, v30.4S, v24.4S +mul v30.4S, v30.4S,v18.4S +mla v30.4S, v5.4S, v31.s[0] +sub v5.4s, v22.4s, v30.4s +add v22.4s, v22.4s, v30.4s +str q19, [x0, #576] +str q26, [x0, #592] +str q22, [x0, #608] +str q5, [x0, #624] +ldr q5, [x17, #+1408] +ldr q22, [x17, #+1424] +ldr q26, [x17, #+1440] +ldr q19, [x17, #+1456] +ldr q30, [x17, #+1472] +ldr q6, [x17, #+1488] +ldr q4, [x17, #+1504] +ldr q10, [x17, #+1520] +ldr q24, [x0, #672] +ldr q18, [x0, #688] +ldr q23, [x0, #640] +ldr q25, [x0, #656] +sqrdmulh v21.4S, v24.4S, v22.s[0] +mul v24.4S, v24.4S,v5.s[0] +mla v24.4S, v21.4S, v31.s[0] +sub v21.4s, v23.4s, v24.4s +add v23.4s, v23.4s, v24.4s +sqrdmulh v24.4S, v18.4S, v22.s[0] +mul v18.4S, v18.4S,v5.s[0] +mla v18.4S, v24.4S, v31.s[0] +sub v24.4s, v25.4s, v18.4s +add v25.4s, v25.4s, v18.4s +sqrdmulh v18.4S, v25.4S, v22.s[1] +mul v25.4S, v25.4S,v5.s[1] +mla v25.4S, v18.4S, v31.s[0] +sub v18.4s, v23.4s, v25.4s +add v23.4s, v23.4s, v25.4s +sqrdmulh v25.4S, v24.4S, v22.s[2] +mul v24.4S, v24.4S,v5.s[2] +mla v24.4S, v25.4S, v31.s[0] +sub v25.4s, v21.4s, v24.4s +add v21.4s, v21.4s, v24.4s +trn1 v24.4S, v23.4S, v18.4S +trn2 v9.4S, v23.4S, v18.4S +trn1 v28.4S, v21.4S, v25.4S +trn2 v3.4S, v21.4S, v25.4S +trn2 v21.2D, v24.2D, v28.2D +trn2 v25.2D, v9.2D, v3.2D +trn1 v23.2D, v24.2D, v28.2D +trn1 v18.2D, v9.2D, v3.2D +sqrdmulh v3.4S, v21.4S, v19.4S +mul v21.4S, v21.4S,v26.4S +mla v21.4S, v3.4S, v31.s[0] +sub v3.4s, v23.4s, v21.4s +add v23.4s, v23.4s, v21.4s +sqrdmulh v21.4S, v25.4S, v19.4S +mul v25.4S, v25.4S,v26.4S +mla v25.4S, v21.4S, v31.s[0] +sub v21.4s, v18.4s, v25.4s +add v18.4s, v18.4s, v25.4s 
+sqrdmulh v25.4S, v18.4S, v6.4S +mul v18.4S, v18.4S,v30.4S +mla v18.4S, v25.4S, v31.s[0] +sub v25.4s, v23.4s, v18.4s +add v23.4s, v23.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v10.4S +mul v21.4S, v21.4S,v4.4S +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v3.4s, v21.4s +add v3.4s, v3.4s, v21.4s +str q23, [x0, #640] +str q25, [x0, #656] +str q3, [x0, #672] +str q18, [x0, #688] +ldr q18, [x17, #+1536] +ldr q3, [x17, #+1552] +ldr q25, [x17, #+1568] +ldr q23, [x17, #+1584] +ldr q21, [x17, #+1600] +ldr q9, [x17, #+1616] +ldr q28, [x17, #+1632] +ldr q24, [x17, #+1648] +ldr q10, [x0, #736] +ldr q4, [x0, #752] +ldr q6, [x0, #704] +ldr q30, [x0, #720] +sqrdmulh v19.4S, v10.4S, v3.s[0] +mul v10.4S, v10.4S,v18.s[0] +mla v10.4S, v19.4S, v31.s[0] +sub v19.4s, v6.4s, v10.4s +add v6.4s, v6.4s, v10.4s +sqrdmulh v10.4S, v4.4S, v3.s[0] +mul v4.4S, v4.4S,v18.s[0] +mla v4.4S, v10.4S, v31.s[0] +sub v10.4s, v30.4s, v4.4s +add v30.4s, v30.4s, v4.4s +sqrdmulh v4.4S, v30.4S, v3.s[1] +mul v30.4S, v30.4S,v18.s[1] +mla v30.4S, v4.4S, v31.s[0] +sub v4.4s, v6.4s, v30.4s +add v6.4s, v6.4s, v30.4s +sqrdmulh v30.4S, v10.4S, v3.s[2] +mul v10.4S, v10.4S,v18.s[2] +mla v10.4S, v30.4S, v31.s[0] +sub v30.4s, v19.4s, v10.4s +add v19.4s, v19.4s, v10.4s +trn1 v10.4S, v6.4S, v4.4S +trn2 v26.4S, v6.4S, v4.4S +trn1 v22.4S, v19.4S, v30.4S +trn2 v5.4S, v19.4S, v30.4S +trn2 v19.2D, v10.2D, v22.2D +trn2 v30.2D, v26.2D, v5.2D +trn1 v6.2D, v10.2D, v22.2D +trn1 v4.2D, v26.2D, v5.2D +sqrdmulh v5.4S, v19.4S, v23.4S +mul v19.4S, v19.4S,v25.4S +mla v19.4S, v5.4S, v31.s[0] +sub v5.4s, v6.4s, v19.4s +add v6.4s, v6.4s, v19.4s +sqrdmulh v19.4S, v30.4S, v23.4S +mul v30.4S, v30.4S,v25.4S +mla v30.4S, v19.4S, v31.s[0] +sub v19.4s, v4.4s, v30.4s +add v4.4s, v4.4s, v30.4s +sqrdmulh v30.4S, v4.4S, v9.4S +mul v4.4S, v4.4S,v21.4S +mla v4.4S, v30.4S, v31.s[0] +sub v30.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +sqrdmulh v4.4S, v19.4S, v24.4S +mul v19.4S, v19.4S,v28.4S +mla v19.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v19.4s +add v5.4s, v5.4s, 
v19.4s +str q6, [x0, #704] +str q30, [x0, #720] +str q5, [x0, #736] +str q4, [x0, #752] +ldr q4, [x17, #+1664] +ldr q5, [x17, #+1680] +ldr q30, [x17, #+1696] +ldr q6, [x17, #+1712] +ldr q19, [x17, #+1728] +ldr q26, [x17, #+1744] +ldr q22, [x17, #+1760] +ldr q10, [x17, #+1776] +ldr q24, [x0, #800] +ldr q28, [x0, #816] +ldr q9, [x0, #768] +ldr q21, [x0, #784] +sqrdmulh v23.4S, v24.4S, v5.s[0] +mul v24.4S, v24.4S,v4.s[0] +mla v24.4S, v23.4S, v31.s[0] +sub v23.4s, v9.4s, v24.4s +add v9.4s, v9.4s, v24.4s +sqrdmulh v24.4S, v28.4S, v5.s[0] +mul v28.4S, v28.4S,v4.s[0] +mla v28.4S, v24.4S, v31.s[0] +sub v24.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +sqrdmulh v28.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v28.4S, v31.s[0] +sub v28.4s, v9.4s, v21.4s +add v9.4s, v9.4s, v21.4s +sqrdmulh v21.4S, v24.4S, v5.s[2] +mul v24.4S, v24.4S,v4.s[2] +mla v24.4S, v21.4S, v31.s[0] +sub v21.4s, v23.4s, v24.4s +add v23.4s, v23.4s, v24.4s +trn1 v24.4S, v9.4S, v28.4S +trn2 v25.4S, v9.4S, v28.4S +trn1 v3.4S, v23.4S, v21.4S +trn2 v18.4S, v23.4S, v21.4S +trn2 v23.2D, v24.2D, v3.2D +trn2 v21.2D, v25.2D, v18.2D +trn1 v9.2D, v24.2D, v3.2D +trn1 v28.2D, v25.2D, v18.2D +sqrdmulh v18.4S, v23.4S, v6.4S +mul v23.4S, v23.4S,v30.4S +mla v23.4S, v18.4S, v31.s[0] +sub v18.4s, v9.4s, v23.4s +add v9.4s, v9.4s, v23.4s +sqrdmulh v23.4S, v21.4S, v6.4S +mul v21.4S, v21.4S,v30.4S +mla v21.4S, v23.4S, v31.s[0] +sub v23.4s, v28.4s, v21.4s +add v28.4s, v28.4s, v21.4s +sqrdmulh v21.4S, v28.4S, v26.4S +mul v28.4S, v28.4S,v19.4S +mla v28.4S, v21.4S, v31.s[0] +sub v21.4s, v9.4s, v28.4s +add v9.4s, v9.4s, v28.4s +sqrdmulh v28.4S, v23.4S, v10.4S +mul v23.4S, v23.4S,v22.4S +mla v23.4S, v28.4S, v31.s[0] +sub v28.4s, v18.4s, v23.4s +add v18.4s, v18.4s, v23.4s +str q9, [x0, #768] +str q21, [x0, #784] +str q18, [x0, #800] +str q28, [x0, #816] +ldr q28, [x17, #+1792] +ldr q18, [x17, #+1808] +ldr q21, [x17, #+1824] +ldr q9, [x17, #+1840] +ldr q23, [x17, #+1856] +ldr q25, [x17, #+1872] +ldr q3, [x17, #+1888] 
+ldr q24, [x17, #+1904] +ldr q10, [x0, #864] +ldr q22, [x0, #880] +ldr q26, [x0, #832] +ldr q19, [x0, #848] +sqrdmulh v6.4S, v10.4S, v18.s[0] +mul v10.4S, v10.4S,v28.s[0] +mla v10.4S, v6.4S, v31.s[0] +sub v6.4s, v26.4s, v10.4s +add v26.4s, v26.4s, v10.4s +sqrdmulh v10.4S, v22.4S, v18.s[0] +mul v22.4S, v22.4S,v28.s[0] +mla v22.4S, v10.4S, v31.s[0] +sub v10.4s, v19.4s, v22.4s +add v19.4s, v19.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v18.s[1] +mul v19.4S, v19.4S,v28.s[1] +mla v19.4S, v22.4S, v31.s[0] +sub v22.4s, v26.4s, v19.4s +add v26.4s, v26.4s, v19.4s +sqrdmulh v19.4S, v10.4S, v18.s[2] +mul v10.4S, v10.4S,v28.s[2] +mla v10.4S, v19.4S, v31.s[0] +sub v19.4s, v6.4s, v10.4s +add v6.4s, v6.4s, v10.4s +trn1 v10.4S, v26.4S, v22.4S +trn2 v30.4S, v26.4S, v22.4S +trn1 v5.4S, v6.4S, v19.4S +trn2 v4.4S, v6.4S, v19.4S +trn2 v6.2D, v10.2D, v5.2D +trn2 v19.2D, v30.2D, v4.2D +trn1 v26.2D, v10.2D, v5.2D +trn1 v22.2D, v30.2D, v4.2D +sqrdmulh v4.4S, v6.4S, v9.4S +mul v6.4S, v6.4S,v21.4S +mla v6.4S, v4.4S, v31.s[0] +sub v4.4s, v26.4s, v6.4s +add v26.4s, v26.4s, v6.4s +sqrdmulh v6.4S, v19.4S, v9.4S +mul v19.4S, v19.4S,v21.4S +mla v19.4S, v6.4S, v31.s[0] +sub v6.4s, v22.4s, v19.4s +add v22.4s, v22.4s, v19.4s +sqrdmulh v19.4S, v22.4S, v25.4S +mul v22.4S, v22.4S,v23.4S +mla v22.4S, v19.4S, v31.s[0] +sub v19.4s, v26.4s, v22.4s +add v26.4s, v26.4s, v22.4s +sqrdmulh v22.4S, v6.4S, v24.4S +mul v6.4S, v6.4S,v3.4S +mla v6.4S, v22.4S, v31.s[0] +sub v22.4s, v4.4s, v6.4s +add v4.4s, v4.4s, v6.4s +str q26, [x0, #832] +str q19, [x0, #848] +str q4, [x0, #864] +str q22, [x0, #880] +ldr q22, [x17, #+1920] +ldr q4, [x17, #+1936] +ldr q19, [x17, #+1952] +ldr q26, [x17, #+1968] +ldr q6, [x17, #+1984] +ldr q30, [x17, #+2000] +ldr q5, [x17, #+2016] +ldr q10, [x17, #+2032] +ldr q24, [x0, #928] +ldr q3, [x0, #944] +ldr q25, [x0, #896] +ldr q23, [x0, #912] +sqrdmulh v9.4S, v24.4S, v4.s[0] +mul v24.4S, v24.4S,v22.s[0] +mla v24.4S, v9.4S, v31.s[0] +sub v9.4s, v25.4s, v24.4s +add v25.4s, v25.4s, v24.4s +sqrdmulh 
v24.4S, v3.4S, v4.s[0] +mul v3.4S, v3.4S,v22.s[0] +mla v3.4S, v24.4S, v31.s[0] +sub v24.4s, v23.4s, v3.4s +add v23.4s, v23.4s, v3.4s +sqrdmulh v3.4S, v23.4S, v4.s[1] +mul v23.4S, v23.4S,v22.s[1] +mla v23.4S, v3.4S, v31.s[0] +sub v3.4s, v25.4s, v23.4s +add v25.4s, v25.4s, v23.4s +sqrdmulh v23.4S, v24.4S, v4.s[2] +mul v24.4S, v24.4S,v22.s[2] +mla v24.4S, v23.4S, v31.s[0] +sub v23.4s, v9.4s, v24.4s +add v9.4s, v9.4s, v24.4s +trn1 v24.4S, v25.4S, v3.4S +trn2 v21.4S, v25.4S, v3.4S +trn1 v18.4S, v9.4S, v23.4S +trn2 v28.4S, v9.4S, v23.4S +trn2 v9.2D, v24.2D, v18.2D +trn2 v23.2D, v21.2D, v28.2D +trn1 v25.2D, v24.2D, v18.2D +trn1 v3.2D, v21.2D, v28.2D +sqrdmulh v28.4S, v9.4S, v26.4S +mul v9.4S, v9.4S,v19.4S +mla v9.4S, v28.4S, v31.s[0] +sub v28.4s, v25.4s, v9.4s +add v25.4s, v25.4s, v9.4s +sqrdmulh v9.4S, v23.4S, v26.4S +mul v23.4S, v23.4S,v19.4S +mla v23.4S, v9.4S, v31.s[0] +sub v9.4s, v3.4s, v23.4s +add v3.4s, v3.4s, v23.4s +sqrdmulh v23.4S, v3.4S, v30.4S +mul v3.4S, v3.4S,v6.4S +mla v3.4S, v23.4S, v31.s[0] +sub v23.4s, v25.4s, v3.4s +add v25.4s, v25.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v10.4S +mul v9.4S, v9.4S,v5.4S +mla v9.4S, v3.4S, v31.s[0] +sub v3.4s, v28.4s, v9.4s +add v28.4s, v28.4s, v9.4s +str q25, [x0, #896] +str q23, [x0, #912] +str q28, [x0, #928] +str q3, [x0, #944] +ldr q3, [x17, #+2048] +ldr q28, [x17, #+2064] +ldr q23, [x17, #+2080] +ldr q25, [x17, #+2096] +ldr q9, [x17, #+2112] +ldr q21, [x17, #+2128] +ldr q18, [x17, #+2144] +ldr q24, [x17, #+2160] +ldr q10, [x0, #992] +ldr q5, [x0, #1008] +ldr q30, [x0, #960] +ldr q6, [x0, #976] +sqrdmulh v26.4S, v10.4S, v28.s[0] +mul v10.4S, v10.4S,v3.s[0] +mla v10.4S, v26.4S, v31.s[0] +sub v26.4s, v30.4s, v10.4s +add v30.4s, v30.4s, v10.4s +sqrdmulh v10.4S, v5.4S, v28.s[0] +mul v5.4S, v5.4S,v3.s[0] +mla v5.4S, v10.4S, v31.s[0] +sub v10.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v6.4S, v28.s[1] +mul v6.4S, v6.4S,v3.s[1] +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v30.4s, v6.4s +add v30.4s, v30.4s, v6.4s 
+sqrdmulh v6.4S, v10.4S, v28.s[2] +mul v10.4S, v10.4S,v3.s[2] +mla v10.4S, v6.4S, v31.s[0] +sub v6.4s, v26.4s, v10.4s +add v26.4s, v26.4s, v10.4s +trn1 v10.4S, v30.4S, v5.4S +trn2 v19.4S, v30.4S, v5.4S +trn1 v4.4S, v26.4S, v6.4S +trn2 v22.4S, v26.4S, v6.4S +trn2 v26.2D, v10.2D, v4.2D +trn2 v6.2D, v19.2D, v22.2D +trn1 v30.2D, v10.2D, v4.2D +trn1 v5.2D, v19.2D, v22.2D +sqrdmulh v22.4S, v26.4S, v25.4S +mul v26.4S, v26.4S,v23.4S +mla v26.4S, v22.4S, v31.s[0] +sub v22.4s, v30.4s, v26.4s +add v30.4s, v30.4s, v26.4s +sqrdmulh v26.4S, v6.4S, v25.4S +mul v6.4S, v6.4S,v23.4S +mla v6.4S, v26.4S, v31.s[0] +sub v26.4s, v5.4s, v6.4s +add v5.4s, v5.4s, v6.4s +sqrdmulh v6.4S, v5.4S, v21.4S +mul v5.4S, v5.4S,v9.4S +mla v5.4S, v6.4S, v31.s[0] +sub v6.4s, v30.4s, v5.4s +add v30.4s, v30.4s, v5.4s +sqrdmulh v5.4S, v26.4S, v24.4S +mul v26.4S, v26.4S,v18.4S +mla v26.4S, v5.4S, v31.s[0] +sub v5.4s, v22.4s, v26.4s +add v22.4s, v22.4s, v26.4s +str q30, [x0, #960] +str q6, [x0, #976] +str q22, [x0, #992] +str q5, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_5_0.s b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_5_0.s new file mode 100644 index 0000000..c47ecf1 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_5_0.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in 
the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include // NOTE(review): no header name follows this directive as shown; ASM_LOAD used by the routine below is not defined in this file, so it presumably comes from the intended header. Confirm against upstream. +modulus: // 4-word vector constant: lane 0 holds -33556993, lanes 1-3 are zero. The routine below loads it into q31 and uses v31.s[0] as the mla multiplier. +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 // power-of-two alignment on AArch64 gas, i.e. the table below starts on a 64-byte boundary +roots_merged: // Constant table addressed through x17 by the routine below and read in 16-byte rows. Rows come in pairs: four per-block values, then four companion values for the same blocks. In the first butterflies the first row feeds mul and the second feeds sqrdmulh. Zero words tagged Layer None are padding. NOTE(review): companion values look like round(value * 2^31 / 33556993); confirm against the generator. +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word
1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 
6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 
+.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, 
block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 
31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 
26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 
+.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // 
Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 
28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 
608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_5_0 +.global _ntt_u32_full_neon_asm_var_4_4_5_0 +ntt_u32_full_neon_asm_var_4_4_5_0: +_ntt_u32_full_neon_asm_var_4_4_5_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x0, #800] +ldr q29, [x0, #864] +ldr q28, [x0, #928] +ldr q27, [x0, #992] +ldr q26, [x0, #288] +ldr q25, [x0, #352] +ldr q24, [x0, #416] +ldr q23, [x0, #480] +ldr q22, [x0, #544] +ldr q21, [x0, #608] +ldr q20, [x0, #672] +ldr q19, [x0, #736] +ldr q18, [x0, #32] +ldr q17, [x0, #96] +ldr q16, [x0, #160] +ldr q3, [x0, #224] +ldr q2, [x17, #+0] +ldr q1, [x17, #+16] +ldr q0, [x17, #+32] +ldr q15, [x17, #+48] +ldr q14, [x17, #+64] +ldr q13, [x17, #+80] +ldr q12, [x17, #+96] +ldr q11, [x17, #+112] +sqrdmulh v10.4S, v30.4S, v1.s[0] +mul v30.4S, v30.4S,v2.s[0] +sqrdmulh v9.4S, v29.4S, v1.s[0] +mul v29.4S, v29.4S,v2.s[0] +sqrdmulh v8.4S, v28.4S, v1.s[0] +mul v28.4S, v28.4S,v2.s[0] +sqrdmulh v7.4S, v27.4S, v1.s[0] +mul v27.4S, v27.4S,v2.s[0] +mla v30.4S, v10.4S, v31.s[0] +mla v29.4S, v9.4S, v31.s[0] +mla v28.4S, v8.4S, v31.s[0] +mla v27.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v22.4S, v1.s[0] +mul v22.4S, v22.4S,v2.s[0] +sqrdmulh v8.4S, v21.4S, v1.s[0] 
+mul v21.4S, v21.4S,v2.s[0] +sqrdmulh v9.4S, v20.4S, v1.s[0] +mul v20.4S, v20.4S,v2.s[0] +sqrdmulh v10.4S, v19.4S, v1.s[0] +mul v19.4S, v19.4S,v2.s[0] +mla v22.4S, v7.4S, v31.s[0] +mla v21.4S, v8.4S, v31.s[0] +mla v20.4S, v9.4S, v31.s[0] +mla v19.4S, v10.4S, v31.s[0] +sub v10.4s, v26.4s, v30.4s +add v26.4s, v26.4s, v30.4s +sub v30.4s, v25.4s, v29.4s +add v25.4s, v25.4s, v29.4s +sub v29.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +sub v28.4s, v23.4s, v27.4s +add v23.4s, v23.4s, v27.4s +sub v27.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +sub v22.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sub v21.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +sub v20.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v24.4S, v1.s[1] +mul v24.4S, v24.4S,v2.s[1] +sqrdmulh v9.4S, v23.4S, v1.s[1] +mul v23.4S, v23.4S,v2.s[1] +sqrdmulh v8.4S, v26.4S, v1.s[1] +mul v26.4S, v26.4S,v2.s[1] +sqrdmulh v7.4S, v25.4S, v1.s[1] +mul v25.4S, v25.4S,v2.s[1] +mla v24.4S, v19.4S, v31.s[0] +mla v23.4S, v9.4S, v31.s[0] +mla v26.4S, v8.4S, v31.s[0] +mla v25.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v29.4S, v1.s[2] +mul v29.4S, v29.4S,v2.s[2] +sqrdmulh v8.4S, v28.4S, v1.s[2] +mul v28.4S, v28.4S,v2.s[2] +sqrdmulh v9.4S, v10.4S, v1.s[2] +mul v10.4S, v10.4S,v2.s[2] +sqrdmulh v19.4S, v30.4S, v1.s[2] +mul v30.4S, v30.4S,v2.s[2] +mla v29.4S, v7.4S, v31.s[0] +mla v28.4S, v8.4S, v31.s[0] +mla v10.4S, v9.4S, v31.s[0] +mla v30.4S, v19.4S, v31.s[0] +sub v19.4s, v16.4s, v24.4s +add v16.4s, v16.4s, v24.4s +sub v24.4s, v3.4s, v23.4s +add v3.4s, v3.4s, v23.4s +sub v23.4s, v18.4s, v26.4s +add v18.4s, v18.4s, v26.4s +sub v26.4s, v17.4s, v25.4s +add v17.4s, v17.4s, v25.4s +sub v25.4s, v21.4s, v29.4s +add v21.4s, v21.4s, v29.4s +sub v29.4s, v20.4s, v28.4s +add v20.4s, v20.4s, v28.4s +sub v28.4s, v27.4s, v10.4s +add v27.4s, v27.4s, v10.4s +sub v10.4s, v22.4s, v30.4s +add v22.4s, v22.4s, v30.4s +sqrdmulh v30.4S, v16.4S, v15.s[0] +mul v16.4S, v16.4S,v0.s[0] +sqrdmulh v9.4S, v3.4S, v15.s[0] +mul v3.4S, 
v3.4S,v0.s[0] +sqrdmulh v8.4S, v19.4S, v15.s[1] +mul v19.4S, v19.4S,v0.s[1] +sqrdmulh v7.4S, v24.4S, v15.s[1] +mul v24.4S, v24.4S,v0.s[1] +mla v16.4S, v30.4S, v31.s[0] +mla v3.4S, v9.4S, v31.s[0] +mla v19.4S, v8.4S, v31.s[0] +mla v24.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v21.4S, v15.s[2] +mul v21.4S, v21.4S,v0.s[2] +sqrdmulh v8.4S, v20.4S, v15.s[2] +mul v20.4S, v20.4S,v0.s[2] +sqrdmulh v9.4S, v25.4S, v15.s[3] +mul v25.4S, v25.4S,v0.s[3] +sqrdmulh v30.4S, v29.4S, v15.s[3] +mul v29.4S, v29.4S,v0.s[3] +mla v21.4S, v7.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +mla v25.4S, v9.4S, v31.s[0] +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v18.4s, v16.4s +add v18.4s, v18.4s, v16.4s +sub v16.4s, v17.4s, v3.4s +add v17.4s, v17.4s, v3.4s +sub v3.4s, v23.4s, v19.4s +add v23.4s, v23.4s, v19.4s +sub v19.4s, v26.4s, v24.4s +add v26.4s, v26.4s, v24.4s +sub v24.4s, v27.4s, v21.4s +add v27.4s, v27.4s, v21.4s +sub v21.4s, v22.4s, v20.4s +add v22.4s, v22.4s, v20.4s +sub v20.4s, v28.4s, v25.4s +add v28.4s, v28.4s, v25.4s +sub v25.4s, v10.4s, v29.4s +add v10.4s, v10.4s, v29.4s +sqrdmulh v29.4S, v17.4S, v13.s[0] +mul v17.4S, v17.4S,v14.s[0] +sqrdmulh v9.4S, v16.4S, v13.s[1] +mul v16.4S, v16.4S,v14.s[1] +sqrdmulh v8.4S, v26.4S, v13.s[2] +mul v26.4S, v26.4S,v14.s[2] +sqrdmulh v7.4S, v19.4S, v13.s[3] +mul v19.4S, v19.4S,v14.s[3] +mla v17.4S, v29.4S, v31.s[0] +mla v16.4S, v9.4S, v31.s[0] +mla v26.4S, v8.4S, v31.s[0] +mla v19.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v22.4S, v11.s[0] +mul v22.4S, v22.4S,v12.s[0] +sqrdmulh v8.4S, v21.4S, v11.s[1] +mul v21.4S, v21.4S,v12.s[1] +sqrdmulh v9.4S, v10.4S, v11.s[2] +mul v10.4S, v10.4S,v12.s[2] +sqrdmulh v29.4S, v25.4S, v11.s[3] +mul v25.4S, v25.4S,v12.s[3] +mla v22.4S, v7.4S, v31.s[0] +mla v21.4S, v8.4S, v31.s[0] +mla v10.4S, v9.4S, v31.s[0] +mla v25.4S, v29.4S, v31.s[0] +sub v29.4s, v18.4s, v17.4s +add v18.4s, v18.4s, v17.4s +sub v17.4s, v30.4s, v16.4s +add v30.4s, v30.4s, v16.4s +sub v16.4s, v23.4s, v26.4s +add v23.4s, v23.4s, v26.4s +sub v26.4s, v3.4s, v19.4s 
+add v3.4s, v3.4s, v19.4s +sub v19.4s, v27.4s, v22.4s +add v27.4s, v27.4s, v22.4s +sub v22.4s, v24.4s, v21.4s +add v24.4s, v24.4s, v21.4s +sub v21.4s, v28.4s, v10.4s +add v28.4s, v28.4s, v10.4s +sub v10.4s, v20.4s, v25.4s +add v20.4s, v20.4s, v25.4s +str q18, [x0, #32] +str q29, [x0, #96] +str q30, [x0, #160] +str q17, [x0, #224] +str q23, [x0, #288] +str q16, [x0, #352] +str q3, [x0, #416] +str q26, [x0, #480] +str q27, [x0, #544] +str q19, [x0, #608] +str q24, [x0, #672] +str q22, [x0, #736] +str q28, [x0, #800] +str q21, [x0, #864] +str q20, [x0, #928] +str q10, [x0, #992] +ldr q10, [x0, #816] +ldr q20, [x0, #880] +ldr q21, [x0, #944] +ldr q28, [x0, #1008] +ldr q22, [x0, #304] +ldr q24, [x0, #368] +ldr q19, [x0, #432] +ldr q27, [x0, #496] +ldr q26, [x0, #560] +ldr q3, [x0, #624] +ldr q16, [x0, #688] +ldr q23, [x0, #752] +ldr q17, [x0, #48] +ldr q30, [x0, #112] +ldr q29, [x0, #176] +ldr q18, [x0, #240] +sqrdmulh v25.4S, v10.4S, v1.s[0] +mul v10.4S, v10.4S,v2.s[0] +sqrdmulh v9.4S, v20.4S, v1.s[0] +mul v20.4S, v20.4S,v2.s[0] +sqrdmulh v8.4S, v21.4S, v1.s[0] +mul v21.4S, v21.4S,v2.s[0] +sqrdmulh v7.4S, v28.4S, v1.s[0] +mul v28.4S, v28.4S,v2.s[0] +mla v10.4S, v25.4S, v31.s[0] +mla v20.4S, v9.4S, v31.s[0] +mla v21.4S, v8.4S, v31.s[0] +mla v28.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v26.4S, v1.s[0] +mul v26.4S, v26.4S,v2.s[0] +sqrdmulh v8.4S, v3.4S, v1.s[0] +mul v3.4S, v3.4S,v2.s[0] +sqrdmulh v9.4S, v16.4S, v1.s[0] +mul v16.4S, v16.4S,v2.s[0] +sqrdmulh v25.4S, v23.4S, v1.s[0] +mul v23.4S, v23.4S,v2.s[0] +mla v26.4S, v7.4S, v31.s[0] +mla v3.4S, v8.4S, v31.s[0] +mla v16.4S, v9.4S, v31.s[0] +mla v23.4S, v25.4S, v31.s[0] +sub v25.4s, v22.4s, v10.4s +add v22.4s, v22.4s, v10.4s +sub v10.4s, v24.4s, v20.4s +add v24.4s, v24.4s, v20.4s +sub v20.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +sub v21.4s, v27.4s, v28.4s +add v27.4s, v27.4s, v28.4s +sub v28.4s, v17.4s, v26.4s +add v17.4s, v17.4s, v26.4s +sub v26.4s, v30.4s, v3.4s +add v30.4s, v30.4s, v3.4s +sub v3.4s, v29.4s, v16.4s 
+add v29.4s, v29.4s, v16.4s +sub v16.4s, v18.4s, v23.4s +add v18.4s, v18.4s, v23.4s +sqrdmulh v23.4S, v19.4S, v1.s[1] +mul v19.4S, v19.4S,v2.s[1] +sqrdmulh v9.4S, v27.4S, v1.s[1] +mul v27.4S, v27.4S,v2.s[1] +sqrdmulh v8.4S, v22.4S, v1.s[1] +mul v22.4S, v22.4S,v2.s[1] +sqrdmulh v7.4S, v24.4S, v1.s[1] +mul v24.4S, v24.4S,v2.s[1] +mla v19.4S, v23.4S, v31.s[0] +mla v27.4S, v9.4S, v31.s[0] +mla v22.4S, v8.4S, v31.s[0] +mla v24.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v20.4S, v1.s[2] +mul v20.4S, v20.4S,v2.s[2] +sqrdmulh v8.4S, v21.4S, v1.s[2] +mul v21.4S, v21.4S,v2.s[2] +sqrdmulh v9.4S, v25.4S, v1.s[2] +mul v25.4S, v25.4S,v2.s[2] +sqrdmulh v23.4S, v10.4S, v1.s[2] +mul v10.4S, v10.4S,v2.s[2] +mla v20.4S, v7.4S, v31.s[0] +mla v21.4S, v8.4S, v31.s[0] +mla v25.4S, v9.4S, v31.s[0] +mla v10.4S, v23.4S, v31.s[0] +sub v23.4s, v29.4s, v19.4s +add v29.4s, v29.4s, v19.4s +sub v19.4s, v18.4s, v27.4s +add v18.4s, v18.4s, v27.4s +sub v27.4s, v17.4s, v22.4s +add v17.4s, v17.4s, v22.4s +sub v22.4s, v30.4s, v24.4s +add v30.4s, v30.4s, v24.4s +sub v24.4s, v3.4s, v20.4s +add v3.4s, v3.4s, v20.4s +sub v20.4s, v16.4s, v21.4s +add v16.4s, v16.4s, v21.4s +sub v21.4s, v28.4s, v25.4s +add v28.4s, v28.4s, v25.4s +sub v25.4s, v26.4s, v10.4s +add v26.4s, v26.4s, v10.4s +sqrdmulh v10.4S, v29.4S, v15.s[0] +mul v29.4S, v29.4S,v0.s[0] +sqrdmulh v9.4S, v18.4S, v15.s[0] +mul v18.4S, v18.4S,v0.s[0] +sqrdmulh v8.4S, v23.4S, v15.s[1] +mul v23.4S, v23.4S,v0.s[1] +sqrdmulh v7.4S, v19.4S, v15.s[1] +mul v19.4S, v19.4S,v0.s[1] +mla v29.4S, v10.4S, v31.s[0] +mla v18.4S, v9.4S, v31.s[0] +mla v23.4S, v8.4S, v31.s[0] +mla v19.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v3.4S, v15.s[2] +mul v3.4S, v3.4S,v0.s[2] +sqrdmulh v8.4S, v16.4S, v15.s[2] +mul v16.4S, v16.4S,v0.s[2] +sqrdmulh v9.4S, v24.4S, v15.s[3] +mul v24.4S, v24.4S,v0.s[3] +sqrdmulh v10.4S, v20.4S, v15.s[3] +mul v20.4S, v20.4S,v0.s[3] +mla v3.4S, v7.4S, v31.s[0] +mla v16.4S, v8.4S, v31.s[0] +mla v24.4S, v9.4S, v31.s[0] +mla v20.4S, v10.4S, v31.s[0] +sub v10.4s, 
v17.4s, v29.4s +add v17.4s, v17.4s, v29.4s +sub v29.4s, v30.4s, v18.4s +add v30.4s, v30.4s, v18.4s +sub v18.4s, v27.4s, v23.4s +add v27.4s, v27.4s, v23.4s +sub v23.4s, v22.4s, v19.4s +add v22.4s, v22.4s, v19.4s +sub v19.4s, v28.4s, v3.4s +add v28.4s, v28.4s, v3.4s +sub v3.4s, v26.4s, v16.4s +add v26.4s, v26.4s, v16.4s +sub v16.4s, v21.4s, v24.4s +add v21.4s, v21.4s, v24.4s +sub v24.4s, v25.4s, v20.4s +add v25.4s, v25.4s, v20.4s +sqrdmulh v20.4S, v30.4S, v13.s[0] +mul v30.4S, v30.4S,v14.s[0] +sqrdmulh v9.4S, v29.4S, v13.s[1] +mul v29.4S, v29.4S,v14.s[1] +sqrdmulh v8.4S, v22.4S, v13.s[2] +mul v22.4S, v22.4S,v14.s[2] +sqrdmulh v7.4S, v23.4S, v13.s[3] +mul v23.4S, v23.4S,v14.s[3] +mla v30.4S, v20.4S, v31.s[0] +mla v29.4S, v9.4S, v31.s[0] +mla v22.4S, v8.4S, v31.s[0] +mla v23.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v26.4S, v11.s[0] +mul v26.4S, v26.4S,v12.s[0] +sqrdmulh v8.4S, v3.4S, v11.s[1] +mul v3.4S, v3.4S,v12.s[1] +sqrdmulh v9.4S, v25.4S, v11.s[2] +mul v25.4S, v25.4S,v12.s[2] +sqrdmulh v20.4S, v24.4S, v11.s[3] +mul v24.4S, v24.4S,v12.s[3] +mla v26.4S, v7.4S, v31.s[0] +mla v3.4S, v8.4S, v31.s[0] +mla v25.4S, v9.4S, v31.s[0] +mla v24.4S, v20.4S, v31.s[0] +sub v20.4s, v17.4s, v30.4s +add v17.4s, v17.4s, v30.4s +sub v30.4s, v10.4s, v29.4s +add v10.4s, v10.4s, v29.4s +sub v29.4s, v27.4s, v22.4s +add v27.4s, v27.4s, v22.4s +sub v22.4s, v18.4s, v23.4s +add v18.4s, v18.4s, v23.4s +sub v23.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sub v26.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sub v3.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sub v25.4s, v16.4s, v24.4s +add v16.4s, v16.4s, v24.4s +str q17, [x0, #48] +str q20, [x0, #112] +str q10, [x0, #176] +str q30, [x0, #240] +str q27, [x0, #304] +str q29, [x0, #368] +str q18, [x0, #432] +str q22, [x0, #496] +str q28, [x0, #560] +str q23, [x0, #624] +str q19, [x0, #688] +str q26, [x0, #752] +str q21, [x0, #816] +str q3, [x0, #880] +str q16, [x0, #944] +str q25, [x0, #1008] +ldr q25, [x0, #768] +ldr q16, [x0, #832] +ldr q3, 
[x0, #896] +ldr q21, [x0, #960] +ldr q26, [x0, #256] +ldr q19, [x0, #320] +ldr q23, [x0, #384] +ldr q28, [x0, #448] +ldr q22, [x0, #512] +ldr q18, [x0, #576] +ldr q29, [x0, #640] +ldr q27, [x0, #704] +ldr q30, [x0, #0] +ldr q10, [x0, #64] +ldr q20, [x0, #128] +ldr q17, [x0, #192] +sqrdmulh v24.4S, v25.4S, v1.s[0] +mul v25.4S, v25.4S,v2.s[0] +sqrdmulh v9.4S, v16.4S, v1.s[0] +mul v16.4S, v16.4S,v2.s[0] +sqrdmulh v8.4S, v3.4S, v1.s[0] +mul v3.4S, v3.4S,v2.s[0] +sqrdmulh v7.4S, v21.4S, v1.s[0] +mul v21.4S, v21.4S,v2.s[0] +mla v25.4S, v24.4S, v31.s[0] +mla v16.4S, v9.4S, v31.s[0] +mla v3.4S, v8.4S, v31.s[0] +mla v21.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v22.4S, v1.s[0] +mul v22.4S, v22.4S,v2.s[0] +sqrdmulh v8.4S, v18.4S, v1.s[0] +mul v18.4S, v18.4S,v2.s[0] +sqrdmulh v9.4S, v29.4S, v1.s[0] +mul v29.4S, v29.4S,v2.s[0] +sqrdmulh v24.4S, v27.4S, v1.s[0] +mul v27.4S, v27.4S,v2.s[0] +mla v22.4S, v7.4S, v31.s[0] +mla v18.4S, v8.4S, v31.s[0] +mla v29.4S, v9.4S, v31.s[0] +mla v27.4S, v24.4S, v31.s[0] +sub v24.4s, v26.4s, v25.4s +add v26.4s, v26.4s, v25.4s +sub v25.4s, v19.4s, v16.4s +add v19.4s, v19.4s, v16.4s +sub v16.4s, v23.4s, v3.4s +add v23.4s, v23.4s, v3.4s +sub v3.4s, v28.4s, v21.4s +add v28.4s, v28.4s, v21.4s +sub v21.4s, v30.4s, v22.4s +add v30.4s, v30.4s, v22.4s +sub v22.4s, v10.4s, v18.4s +add v10.4s, v10.4s, v18.4s +sub v18.4s, v20.4s, v29.4s +add v20.4s, v20.4s, v29.4s +sub v29.4s, v17.4s, v27.4s +add v17.4s, v17.4s, v27.4s +sqrdmulh v27.4S, v23.4S, v1.s[1] +mul v23.4S, v23.4S,v2.s[1] +sqrdmulh v9.4S, v28.4S, v1.s[1] +mul v28.4S, v28.4S,v2.s[1] +sqrdmulh v8.4S, v26.4S, v1.s[1] +mul v26.4S, v26.4S,v2.s[1] +sqrdmulh v7.4S, v19.4S, v1.s[1] +mul v19.4S, v19.4S,v2.s[1] +mla v23.4S, v27.4S, v31.s[0] +mla v28.4S, v9.4S, v31.s[0] +mla v26.4S, v8.4S, v31.s[0] +mla v19.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v16.4S, v1.s[2] +mul v16.4S, v16.4S,v2.s[2] +sqrdmulh v8.4S, v3.4S, v1.s[2] +mul v3.4S, v3.4S,v2.s[2] +sqrdmulh v9.4S, v24.4S, v1.s[2] +mul v24.4S, v24.4S,v2.s[2] +sqrdmulh 
v27.4S, v25.4S, v1.s[2] +mul v25.4S, v25.4S,v2.s[2] +mla v16.4S, v7.4S, v31.s[0] +mla v3.4S, v8.4S, v31.s[0] +mla v24.4S, v9.4S, v31.s[0] +mla v25.4S, v27.4S, v31.s[0] +sub v27.4s, v20.4s, v23.4s +add v20.4s, v20.4s, v23.4s +sub v23.4s, v17.4s, v28.4s +add v17.4s, v17.4s, v28.4s +sub v28.4s, v30.4s, v26.4s +add v30.4s, v30.4s, v26.4s +sub v26.4s, v10.4s, v19.4s +add v10.4s, v10.4s, v19.4s +sub v19.4s, v18.4s, v16.4s +add v18.4s, v18.4s, v16.4s +sub v16.4s, v29.4s, v3.4s +add v29.4s, v29.4s, v3.4s +sub v3.4s, v21.4s, v24.4s +add v21.4s, v21.4s, v24.4s +sub v24.4s, v22.4s, v25.4s +add v22.4s, v22.4s, v25.4s +sqrdmulh v25.4S, v20.4S, v15.s[0] +mul v20.4S, v20.4S,v0.s[0] +sqrdmulh v9.4S, v17.4S, v15.s[0] +mul v17.4S, v17.4S,v0.s[0] +sqrdmulh v8.4S, v27.4S, v15.s[1] +mul v27.4S, v27.4S,v0.s[1] +sqrdmulh v7.4S, v23.4S, v15.s[1] +mul v23.4S, v23.4S,v0.s[1] +mla v20.4S, v25.4S, v31.s[0] +mla v17.4S, v9.4S, v31.s[0] +mla v27.4S, v8.4S, v31.s[0] +mla v23.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v18.4S, v15.s[2] +mul v18.4S, v18.4S,v0.s[2] +sqrdmulh v8.4S, v29.4S, v15.s[2] +mul v29.4S, v29.4S,v0.s[2] +sqrdmulh v9.4S, v19.4S, v15.s[3] +mul v19.4S, v19.4S,v0.s[3] +sqrdmulh v25.4S, v16.4S, v15.s[3] +mul v16.4S, v16.4S,v0.s[3] +mla v18.4S, v7.4S, v31.s[0] +mla v29.4S, v8.4S, v31.s[0] +mla v19.4S, v9.4S, v31.s[0] +mla v16.4S, v25.4S, v31.s[0] +sub v25.4s, v30.4s, v20.4s +add v30.4s, v30.4s, v20.4s +sub v20.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sub v17.4s, v28.4s, v27.4s +add v28.4s, v28.4s, v27.4s +sub v27.4s, v26.4s, v23.4s +add v26.4s, v26.4s, v23.4s +sub v23.4s, v21.4s, v18.4s +add v21.4s, v21.4s, v18.4s +sub v18.4s, v22.4s, v29.4s +add v22.4s, v22.4s, v29.4s +sub v29.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sub v19.4s, v24.4s, v16.4s +add v24.4s, v24.4s, v16.4s +sqrdmulh v16.4S, v10.4S, v13.s[0] +mul v10.4S, v10.4S,v14.s[0] +sqrdmulh v9.4S, v20.4S, v13.s[1] +mul v20.4S, v20.4S,v14.s[1] +sqrdmulh v8.4S, v26.4S, v13.s[2] +mul v26.4S, v26.4S,v14.s[2] +sqrdmulh v7.4S, 
v27.4S, v13.s[3] +mul v27.4S, v27.4S,v14.s[3] +mla v10.4S, v16.4S, v31.s[0] +mla v20.4S, v9.4S, v31.s[0] +mla v26.4S, v8.4S, v31.s[0] +mla v27.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v22.4S, v11.s[0] +mul v22.4S, v22.4S,v12.s[0] +sqrdmulh v8.4S, v18.4S, v11.s[1] +mul v18.4S, v18.4S,v12.s[1] +sqrdmulh v9.4S, v24.4S, v11.s[2] +mul v24.4S, v24.4S,v12.s[2] +sqrdmulh v16.4S, v19.4S, v11.s[3] +mul v19.4S, v19.4S,v12.s[3] +mla v22.4S, v7.4S, v31.s[0] +mla v18.4S, v8.4S, v31.s[0] +mla v24.4S, v9.4S, v31.s[0] +mla v19.4S, v16.4S, v31.s[0] +sub v16.4s, v30.4s, v10.4s +add v30.4s, v30.4s, v10.4s +sub v10.4s, v25.4s, v20.4s +add v25.4s, v25.4s, v20.4s +sub v20.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sub v26.4s, v17.4s, v27.4s +add v17.4s, v17.4s, v27.4s +sub v27.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sub v22.4s, v23.4s, v18.4s +add v23.4s, v23.4s, v18.4s +sub v18.4s, v3.4s, v24.4s +add v3.4s, v3.4s, v24.4s +sub v24.4s, v29.4s, v19.4s +add v29.4s, v29.4s, v19.4s +str q30, [x0, #0] +str q16, [x0, #64] +str q25, [x0, #128] +str q10, [x0, #192] +str q28, [x0, #256] +str q20, [x0, #320] +str q17, [x0, #384] +str q26, [x0, #448] +str q21, [x0, #512] +str q27, [x0, #576] +str q23, [x0, #640] +str q22, [x0, #704] +str q3, [x0, #768] +str q18, [x0, #832] +str q29, [x0, #896] +str q24, [x0, #960] +ldr q24, [x0, #784] +ldr q29, [x0, #848] +ldr q18, [x0, #912] +ldr q3, [x0, #976] +ldr q22, [x0, #272] +ldr q23, [x0, #336] +ldr q27, [x0, #400] +ldr q21, [x0, #464] +ldr q26, [x0, #528] +ldr q17, [x0, #592] +ldr q20, [x0, #656] +ldr q28, [x0, #720] +ldr q10, [x0, #16] +ldr q25, [x0, #80] +ldr q16, [x0, #144] +ldr q30, [x0, #208] +sqrdmulh v19.4S, v24.4S, v1.s[0] +mul v24.4S, v24.4S,v2.s[0] +sqrdmulh v9.4S, v29.4S, v1.s[0] +mul v29.4S, v29.4S,v2.s[0] +sqrdmulh v8.4S, v18.4S, v1.s[0] +mul v18.4S, v18.4S,v2.s[0] +sqrdmulh v7.4S, v3.4S, v1.s[0] +mul v3.4S, v3.4S,v2.s[0] +mla v24.4S, v19.4S, v31.s[0] +mla v29.4S, v9.4S, v31.s[0] +mla v18.4S, v8.4S, v31.s[0] +mla v3.4S, v7.4S, 
v31.s[0] +sqrdmulh v7.4S, v26.4S, v1.s[0] +mul v26.4S, v26.4S,v2.s[0] +sqrdmulh v8.4S, v17.4S, v1.s[0] +mul v17.4S, v17.4S,v2.s[0] +sqrdmulh v9.4S, v20.4S, v1.s[0] +mul v20.4S, v20.4S,v2.s[0] +sqrdmulh v19.4S, v28.4S, v1.s[0] +mul v28.4S, v28.4S,v2.s[0] +mla v26.4S, v7.4S, v31.s[0] +mla v17.4S, v8.4S, v31.s[0] +mla v20.4S, v9.4S, v31.s[0] +mla v28.4S, v19.4S, v31.s[0] +sub v19.4s, v22.4s, v24.4s +add v22.4s, v22.4s, v24.4s +sub v24.4s, v23.4s, v29.4s +add v23.4s, v23.4s, v29.4s +sub v29.4s, v27.4s, v18.4s +add v27.4s, v27.4s, v18.4s +sub v18.4s, v21.4s, v3.4s +add v21.4s, v21.4s, v3.4s +sub v3.4s, v10.4s, v26.4s +add v10.4s, v10.4s, v26.4s +sub v26.4s, v25.4s, v17.4s +add v25.4s, v25.4s, v17.4s +sub v17.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +sub v20.4s, v30.4s, v28.4s +add v30.4s, v30.4s, v28.4s +sqrdmulh v28.4S, v27.4S, v1.s[1] +mul v27.4S, v27.4S,v2.s[1] +sqrdmulh v9.4S, v21.4S, v1.s[1] +mul v21.4S, v21.4S,v2.s[1] +sqrdmulh v8.4S, v22.4S, v1.s[1] +mul v22.4S, v22.4S,v2.s[1] +sqrdmulh v7.4S, v23.4S, v1.s[1] +mul v23.4S, v23.4S,v2.s[1] +mla v27.4S, v28.4S, v31.s[0] +mla v21.4S, v9.4S, v31.s[0] +mla v22.4S, v8.4S, v31.s[0] +mla v23.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v29.4S, v1.s[2] +mul v29.4S, v29.4S,v2.s[2] +sqrdmulh v8.4S, v18.4S, v1.s[2] +mul v18.4S, v18.4S,v2.s[2] +sqrdmulh v9.4S, v19.4S, v1.s[2] +mul v19.4S, v19.4S,v2.s[2] +sqrdmulh v28.4S, v24.4S, v1.s[2] +mul v24.4S, v24.4S,v2.s[2] +mla v29.4S, v7.4S, v31.s[0] +mla v18.4S, v8.4S, v31.s[0] +mla v19.4S, v9.4S, v31.s[0] +mla v24.4S, v28.4S, v31.s[0] +sub v28.4s, v16.4s, v27.4s +add v16.4s, v16.4s, v27.4s +sub v27.4s, v30.4s, v21.4s +add v30.4s, v30.4s, v21.4s +sub v21.4s, v10.4s, v22.4s +add v10.4s, v10.4s, v22.4s +sub v22.4s, v25.4s, v23.4s +add v25.4s, v25.4s, v23.4s +sub v23.4s, v17.4s, v29.4s +add v17.4s, v17.4s, v29.4s +sub v29.4s, v20.4s, v18.4s +add v20.4s, v20.4s, v18.4s +sub v18.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sub v19.4s, v26.4s, v24.4s +add v26.4s, v26.4s, v24.4s +sqrdmulh 
v24.4S, v16.4S, v15.s[0] +mul v16.4S, v16.4S,v0.s[0] +sqrdmulh v9.4S, v30.4S, v15.s[0] +mul v30.4S, v30.4S,v0.s[0] +sqrdmulh v8.4S, v28.4S, v15.s[1] +mul v28.4S, v28.4S,v0.s[1] +sqrdmulh v7.4S, v27.4S, v15.s[1] +mul v27.4S, v27.4S,v0.s[1] +mla v16.4S, v24.4S, v31.s[0] +mla v30.4S, v9.4S, v31.s[0] +mla v28.4S, v8.4S, v31.s[0] +mla v27.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v17.4S, v15.s[2] +mul v17.4S, v17.4S,v0.s[2] +sqrdmulh v8.4S, v20.4S, v15.s[2] +mul v20.4S, v20.4S,v0.s[2] +sqrdmulh v9.4S, v23.4S, v15.s[3] +mul v23.4S, v23.4S,v0.s[3] +sqrdmulh v24.4S, v29.4S, v15.s[3] +mul v29.4S, v29.4S,v0.s[3] +mla v17.4S, v7.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +mla v23.4S, v9.4S, v31.s[0] +mla v29.4S, v24.4S, v31.s[0] +sub v24.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sub v16.4s, v25.4s, v30.4s +add v25.4s, v25.4s, v30.4s +sub v30.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +sub v28.4s, v22.4s, v27.4s +add v22.4s, v22.4s, v27.4s +sub v27.4s, v3.4s, v17.4s +add v3.4s, v3.4s, v17.4s +sub v17.4s, v26.4s, v20.4s +add v26.4s, v26.4s, v20.4s +sub v20.4s, v18.4s, v23.4s +add v18.4s, v18.4s, v23.4s +sub v23.4s, v19.4s, v29.4s +add v19.4s, v19.4s, v29.4s +sqrdmulh v29.4S, v25.4S, v13.s[0] +mul v25.4S, v25.4S,v14.s[0] +sqrdmulh v9.4S, v16.4S, v13.s[1] +mul v16.4S, v16.4S,v14.s[1] +sqrdmulh v8.4S, v22.4S, v13.s[2] +mul v22.4S, v22.4S,v14.s[2] +sqrdmulh v7.4S, v28.4S, v13.s[3] +mul v28.4S, v28.4S,v14.s[3] +mla v25.4S, v29.4S, v31.s[0] +mla v16.4S, v9.4S, v31.s[0] +mla v22.4S, v8.4S, v31.s[0] +mla v28.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v26.4S, v11.s[0] +mul v26.4S, v26.4S,v12.s[0] +sqrdmulh v8.4S, v17.4S, v11.s[1] +mul v17.4S, v17.4S,v12.s[1] +sqrdmulh v9.4S, v19.4S, v11.s[2] +mul v19.4S, v19.4S,v12.s[2] +sqrdmulh v29.4S, v23.4S, v11.s[3] +mul v23.4S, v23.4S,v12.s[3] +mla v26.4S, v7.4S, v31.s[0] +mla v17.4S, v8.4S, v31.s[0] +mla v19.4S, v9.4S, v31.s[0] +mla v23.4S, v29.4S, v31.s[0] +sub v29.4s, v10.4s, v25.4s +add v10.4s, v10.4s, v25.4s +sub v25.4s, v24.4s, v16.4s +add 
v24.4s, v24.4s, v16.4s +sub v16.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sub v22.4s, v30.4s, v28.4s +add v30.4s, v30.4s, v28.4s +sub v28.4s, v3.4s, v26.4s +add v3.4s, v3.4s, v26.4s +sub v26.4s, v27.4s, v17.4s +add v27.4s, v27.4s, v17.4s +sub v17.4s, v18.4s, v19.4s +add v18.4s, v18.4s, v19.4s +sub v19.4s, v20.4s, v23.4s +add v20.4s, v20.4s, v23.4s +str q10, [x0, #16] +str q29, [x0, #80] +str q24, [x0, #144] +str q25, [x0, #208] +str q21, [x0, #272] +str q16, [x0, #336] +str q30, [x0, #400] +str q22, [x0, #464] +str q3, [x0, #528] +str q28, [x0, #592] +str q27, [x0, #656] +str q26, [x0, #720] +str q18, [x0, #784] +str q17, [x0, #848] +str q20, [x0, #912] +str q19, [x0, #976] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q6, [x17, #+160] +ldr q7, [x17, #+176] +ldr q8, [x17, #+192] +ldr q9, [x17, #+208] +ldr q23, [x17, #+224] +ldr q10, [x17, #+240] +ldr q29, [x0, #32] +ldr q24, [x0, #48] +ldr q25, [x0, #0] +ldr q21, [x0, #16] +sqrdmulh v16.4S, v29.4S, v5.s[0] +mul v29.4S, v29.4S,v4.s[0] +mla v29.4S, v16.4S, v31.s[0] +sub v16.4s, v25.4s, v29.4s +add v25.4s, v25.4s, v29.4s +sqrdmulh v29.4S, v24.4S, v5.s[0] +mul v24.4S, v24.4S,v4.s[0] +mla v24.4S, v29.4S, v31.s[0] +sub v29.4s, v21.4s, v24.4s +add v21.4s, v21.4s, v24.4s +sqrdmulh v24.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v24.4S, v31.s[0] +sub v24.4s, v25.4s, v21.4s +add v25.4s, v25.4s, v21.4s +sqrdmulh v21.4S, v29.4S, v5.s[2] +mul v29.4S, v29.4S,v4.s[2] +mla v29.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v29.4s +add v16.4s, v16.4s, v29.4s +trn1 v29.4S, v25.4S, v24.4S +trn2 v30.4S, v25.4S, v24.4S +trn1 v22.4S, v16.4S, v21.4S +trn2 v3.4S, v16.4S, v21.4S +trn2 v16.2D, v29.2D, v22.2D +trn2 v21.2D, v30.2D, v3.2D +trn1 v25.2D, v29.2D, v22.2D +trn1 v24.2D, v30.2D, v3.2D +sqrdmulh v3.4S, v16.4S, v7.4S +mul v16.4S, v16.4S,v6.4S +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v25.4s, v16.4s +add v25.4s, v25.4s, v16.4s +sqrdmulh v16.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v16.4S, v31.s[0] 
+sub v16.4s, v24.4s, v21.4s +add v24.4s, v24.4s, v21.4s +sqrdmulh v21.4S, v24.4S, v9.4S +mul v24.4S, v24.4S,v8.4S +mla v24.4S, v21.4S, v31.s[0] +sub v21.4s, v25.4s, v24.4s +add v25.4s, v25.4s, v24.4s +sqrdmulh v24.4S, v16.4S, v10.4S +mul v16.4S, v16.4S,v23.4S +mla v16.4S, v24.4S, v31.s[0] +sub v24.4s, v3.4s, v16.4s +add v3.4s, v3.4s, v16.4s +str q25, [x0, #0] +str q21, [x0, #16] +str q3, [x0, #32] +str q24, [x0, #48] +ldr q24, [x17, #+256] +ldr q3, [x17, #+272] +ldr q21, [x17, #+288] +ldr q25, [x17, #+304] +ldr q16, [x17, #+320] +ldr q30, [x17, #+336] +ldr q22, [x17, #+352] +ldr q29, [x17, #+368] +ldr q10, [x0, #96] +ldr q23, [x0, #112] +ldr q9, [x0, #64] +ldr q8, [x0, #80] +sqrdmulh v7.4S, v10.4S, v3.s[0] +mul v10.4S, v10.4S,v24.s[0] +mla v10.4S, v7.4S, v31.s[0] +sub v7.4s, v9.4s, v10.4s +add v9.4s, v9.4s, v10.4s +sqrdmulh v10.4S, v23.4S, v3.s[0] +mul v23.4S, v23.4S,v24.s[0] +mla v23.4S, v10.4S, v31.s[0] +sub v10.4s, v8.4s, v23.4s +add v8.4s, v8.4s, v23.4s +sqrdmulh v23.4S, v8.4S, v3.s[1] +mul v8.4S, v8.4S,v24.s[1] +mla v8.4S, v23.4S, v31.s[0] +sub v23.4s, v9.4s, v8.4s +add v9.4s, v9.4s, v8.4s +sqrdmulh v8.4S, v10.4S, v3.s[2] +mul v10.4S, v10.4S,v24.s[2] +mla v10.4S, v8.4S, v31.s[0] +sub v8.4s, v7.4s, v10.4s +add v7.4s, v7.4s, v10.4s +trn1 v10.4S, v9.4S, v23.4S +trn2 v6.4S, v9.4S, v23.4S +trn1 v5.4S, v7.4S, v8.4S +trn2 v4.4S, v7.4S, v8.4S +trn2 v7.2D, v10.2D, v5.2D +trn2 v8.2D, v6.2D, v4.2D +trn1 v9.2D, v10.2D, v5.2D +trn1 v23.2D, v6.2D, v4.2D +sqrdmulh v4.4S, v7.4S, v25.4S +mul v7.4S, v7.4S,v21.4S +mla v7.4S, v4.4S, v31.s[0] +sub v4.4s, v9.4s, v7.4s +add v9.4s, v9.4s, v7.4s +sqrdmulh v7.4S, v8.4S, v25.4S +mul v8.4S, v8.4S,v21.4S +mla v8.4S, v7.4S, v31.s[0] +sub v7.4s, v23.4s, v8.4s +add v23.4s, v23.4s, v8.4s +sqrdmulh v8.4S, v23.4S, v30.4S +mul v23.4S, v23.4S,v16.4S +mla v23.4S, v8.4S, v31.s[0] +sub v8.4s, v9.4s, v23.4s +add v9.4s, v9.4s, v23.4s +sqrdmulh v23.4S, v7.4S, v29.4S +mul v7.4S, v7.4S,v22.4S +mla v7.4S, v23.4S, v31.s[0] +sub v23.4s, v4.4s, v7.4s +add 
v4.4s, v4.4s, v7.4s +str q9, [x0, #64] +str q8, [x0, #80] +str q4, [x0, #96] +str q23, [x0, #112] +ldr q23, [x17, #+384] +ldr q4, [x17, #+400] +ldr q8, [x17, #+416] +ldr q9, [x17, #+432] +ldr q7, [x17, #+448] +ldr q6, [x17, #+464] +ldr q5, [x17, #+480] +ldr q10, [x17, #+496] +ldr q29, [x0, #160] +ldr q22, [x0, #176] +ldr q30, [x0, #128] +ldr q16, [x0, #144] +sqrdmulh v25.4S, v29.4S, v4.s[0] +mul v29.4S, v29.4S,v23.s[0] +mla v29.4S, v25.4S, v31.s[0] +sub v25.4s, v30.4s, v29.4s +add v30.4s, v30.4s, v29.4s +sqrdmulh v29.4S, v22.4S, v4.s[0] +mul v22.4S, v22.4S,v23.s[0] +mla v22.4S, v29.4S, v31.s[0] +sub v29.4s, v16.4s, v22.4s +add v16.4s, v16.4s, v22.4s +sqrdmulh v22.4S, v16.4S, v4.s[1] +mul v16.4S, v16.4S,v23.s[1] +mla v16.4S, v22.4S, v31.s[0] +sub v22.4s, v30.4s, v16.4s +add v30.4s, v30.4s, v16.4s +sqrdmulh v16.4S, v29.4S, v4.s[2] +mul v29.4S, v29.4S,v23.s[2] +mla v29.4S, v16.4S, v31.s[0] +sub v16.4s, v25.4s, v29.4s +add v25.4s, v25.4s, v29.4s +trn1 v29.4S, v30.4S, v22.4S +trn2 v21.4S, v30.4S, v22.4S +trn1 v3.4S, v25.4S, v16.4S +trn2 v24.4S, v25.4S, v16.4S +trn2 v25.2D, v29.2D, v3.2D +trn2 v16.2D, v21.2D, v24.2D +trn1 v30.2D, v29.2D, v3.2D +trn1 v22.2D, v21.2D, v24.2D +sqrdmulh v24.4S, v25.4S, v9.4S +mul v25.4S, v25.4S,v8.4S +mla v25.4S, v24.4S, v31.s[0] +sub v24.4s, v30.4s, v25.4s +add v30.4s, v30.4s, v25.4s +sqrdmulh v25.4S, v16.4S, v9.4S +mul v16.4S, v16.4S,v8.4S +mla v16.4S, v25.4S, v31.s[0] +sub v25.4s, v22.4s, v16.4s +add v22.4s, v22.4s, v16.4s +sqrdmulh v16.4S, v22.4S, v6.4S +mul v22.4S, v22.4S,v7.4S +mla v22.4S, v16.4S, v31.s[0] +sub v16.4s, v30.4s, v22.4s +add v30.4s, v30.4s, v22.4s +sqrdmulh v22.4S, v25.4S, v10.4S +mul v25.4S, v25.4S,v5.4S +mla v25.4S, v22.4S, v31.s[0] +sub v22.4s, v24.4s, v25.4s +add v24.4s, v24.4s, v25.4s +str q30, [x0, #128] +str q16, [x0, #144] +str q24, [x0, #160] +str q22, [x0, #176] +ldr q22, [x17, #+512] +ldr q24, [x17, #+528] +ldr q16, [x17, #+544] +ldr q30, [x17, #+560] +ldr q25, [x17, #+576] +ldr q21, [x17, #+592] +ldr q3, [x17, 
#+608] +ldr q29, [x17, #+624] +ldr q10, [x0, #224] +ldr q5, [x0, #240] +ldr q6, [x0, #192] +ldr q7, [x0, #208] +sqrdmulh v9.4S, v10.4S, v24.s[0] +mul v10.4S, v10.4S,v22.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v6.4s, v10.4s +add v6.4s, v6.4s, v10.4s +sqrdmulh v10.4S, v5.4S, v24.s[0] +mul v5.4S, v5.4S,v22.s[0] +mla v5.4S, v10.4S, v31.s[0] +sub v10.4s, v7.4s, v5.4s +add v7.4s, v7.4s, v5.4s +sqrdmulh v5.4S, v7.4S, v24.s[1] +mul v7.4S, v7.4S,v22.s[1] +mla v7.4S, v5.4S, v31.s[0] +sub v5.4s, v6.4s, v7.4s +add v6.4s, v6.4s, v7.4s +sqrdmulh v7.4S, v10.4S, v24.s[2] +mul v10.4S, v10.4S,v22.s[2] +mla v10.4S, v7.4S, v31.s[0] +sub v7.4s, v9.4s, v10.4s +add v9.4s, v9.4s, v10.4s +trn1 v10.4S, v6.4S, v5.4S +trn2 v8.4S, v6.4S, v5.4S +trn1 v4.4S, v9.4S, v7.4S +trn2 v23.4S, v9.4S, v7.4S +trn2 v9.2D, v10.2D, v4.2D +trn2 v7.2D, v8.2D, v23.2D +trn1 v6.2D, v10.2D, v4.2D +trn1 v5.2D, v8.2D, v23.2D +sqrdmulh v23.4S, v9.4S, v30.4S +mul v9.4S, v9.4S,v16.4S +mla v9.4S, v23.4S, v31.s[0] +sub v23.4s, v6.4s, v9.4s +add v6.4s, v6.4s, v9.4s +sqrdmulh v9.4S, v7.4S, v30.4S +mul v7.4S, v7.4S,v16.4S +mla v7.4S, v9.4S, v31.s[0] +sub v9.4s, v5.4s, v7.4s +add v5.4s, v5.4s, v7.4s +sqrdmulh v7.4S, v5.4S, v21.4S +mul v5.4S, v5.4S,v25.4S +mla v5.4S, v7.4S, v31.s[0] +sub v7.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v9.4S, v29.4S +mul v9.4S, v9.4S,v3.4S +mla v9.4S, v5.4S, v31.s[0] +sub v5.4s, v23.4s, v9.4s +add v23.4s, v23.4s, v9.4s +str q6, [x0, #192] +str q7, [x0, #208] +str q23, [x0, #224] +str q5, [x0, #240] +ldr q5, [x17, #+640] +ldr q23, [x17, #+656] +ldr q7, [x17, #+672] +ldr q6, [x17, #+688] +ldr q9, [x17, #+704] +ldr q8, [x17, #+720] +ldr q4, [x17, #+736] +ldr q10, [x17, #+752] +ldr q29, [x0, #288] +ldr q3, [x0, #304] +ldr q21, [x0, #256] +ldr q25, [x0, #272] +sqrdmulh v30.4S, v29.4S, v23.s[0] +mul v29.4S, v29.4S,v5.s[0] +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v21.4s, v29.4s +add v21.4s, v21.4s, v29.4s +sqrdmulh v29.4S, v3.4S, v23.s[0] +mul v3.4S, v3.4S,v5.s[0] +mla v3.4S, 
v29.4S, v31.s[0] +sub v29.4s, v25.4s, v3.4s +add v25.4s, v25.4s, v3.4s +sqrdmulh v3.4S, v25.4S, v23.s[1] +mul v25.4S, v25.4S,v5.s[1] +mla v25.4S, v3.4S, v31.s[0] +sub v3.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v29.4S, v23.s[2] +mul v29.4S, v29.4S,v5.s[2] +mla v29.4S, v25.4S, v31.s[0] +sub v25.4s, v30.4s, v29.4s +add v30.4s, v30.4s, v29.4s +trn1 v29.4S, v21.4S, v3.4S +trn2 v16.4S, v21.4S, v3.4S +trn1 v24.4S, v30.4S, v25.4S +trn2 v22.4S, v30.4S, v25.4S +trn2 v30.2D, v29.2D, v24.2D +trn2 v25.2D, v16.2D, v22.2D +trn1 v21.2D, v29.2D, v24.2D +trn1 v3.2D, v16.2D, v22.2D +sqrdmulh v22.4S, v30.4S, v6.4S +mul v30.4S, v30.4S,v7.4S +mla v30.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v30.4s +add v21.4s, v21.4s, v30.4s +sqrdmulh v30.4S, v25.4S, v6.4S +mul v25.4S, v25.4S,v7.4S +mla v25.4S, v30.4S, v31.s[0] +sub v30.4s, v3.4s, v25.4s +add v3.4s, v3.4s, v25.4s +sqrdmulh v25.4S, v3.4S, v8.4S +mul v3.4S, v3.4S,v9.4S +mla v3.4S, v25.4S, v31.s[0] +sub v25.4s, v21.4s, v3.4s +add v21.4s, v21.4s, v3.4s +sqrdmulh v3.4S, v30.4S, v10.4S +mul v30.4S, v30.4S,v4.4S +mla v30.4S, v3.4S, v31.s[0] +sub v3.4s, v22.4s, v30.4s +add v22.4s, v22.4s, v30.4s +str q21, [x0, #256] +str q25, [x0, #272] +str q22, [x0, #288] +str q3, [x0, #304] +ldr q3, [x17, #+768] +ldr q22, [x17, #+784] +ldr q25, [x17, #+800] +ldr q21, [x17, #+816] +ldr q30, [x17, #+832] +ldr q16, [x17, #+848] +ldr q24, [x17, #+864] +ldr q29, [x17, #+880] +ldr q10, [x0, #352] +ldr q4, [x0, #368] +ldr q8, [x0, #320] +ldr q9, [x0, #336] +sqrdmulh v6.4S, v10.4S, v22.s[0] +mul v10.4S, v10.4S,v3.s[0] +mla v10.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v10.4s +add v8.4s, v8.4s, v10.4s +sqrdmulh v10.4S, v4.4S, v22.s[0] +mul v4.4S, v4.4S,v3.s[0] +mla v4.4S, v10.4S, v31.s[0] +sub v10.4s, v9.4s, v4.4s +add v9.4s, v9.4s, v4.4s +sqrdmulh v4.4S, v9.4S, v22.s[1] +mul v9.4S, v9.4S,v3.s[1] +mla v9.4S, v4.4S, v31.s[0] +sub v4.4s, v8.4s, v9.4s +add v8.4s, v8.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v22.s[2] +mul v10.4S, v10.4S,v3.s[2] +mla 
v10.4S, v9.4S, v31.s[0] +sub v9.4s, v6.4s, v10.4s +add v6.4s, v6.4s, v10.4s +trn1 v10.4S, v8.4S, v4.4S +trn2 v7.4S, v8.4S, v4.4S +trn1 v23.4S, v6.4S, v9.4S +trn2 v5.4S, v6.4S, v9.4S +trn2 v6.2D, v10.2D, v23.2D +trn2 v9.2D, v7.2D, v5.2D +trn1 v8.2D, v10.2D, v23.2D +trn1 v4.2D, v7.2D, v5.2D +sqrdmulh v5.4S, v6.4S, v21.4S +mul v6.4S, v6.4S,v25.4S +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v8.4s, v6.4s +add v8.4s, v8.4s, v6.4s +sqrdmulh v6.4S, v9.4S, v21.4S +mul v9.4S, v9.4S,v25.4S +mla v9.4S, v6.4S, v31.s[0] +sub v6.4s, v4.4s, v9.4s +add v4.4s, v4.4s, v9.4s +sqrdmulh v9.4S, v4.4S, v16.4S +mul v4.4S, v4.4S,v30.4S +mla v4.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v4.4s +add v8.4s, v8.4s, v4.4s +sqrdmulh v4.4S, v6.4S, v29.4S +mul v6.4S, v6.4S,v24.4S +mla v6.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v6.4s +add v5.4s, v5.4s, v6.4s +str q8, [x0, #320] +str q9, [x0, #336] +str q5, [x0, #352] +str q4, [x0, #368] +ldr q4, [x17, #+896] +ldr q5, [x17, #+912] +ldr q9, [x17, #+928] +ldr q8, [x17, #+944] +ldr q6, [x17, #+960] +ldr q7, [x17, #+976] +ldr q23, [x17, #+992] +ldr q10, [x17, #+1008] +ldr q29, [x0, #416] +ldr q24, [x0, #432] +ldr q16, [x0, #384] +ldr q30, [x0, #400] +sqrdmulh v21.4S, v29.4S, v5.s[0] +mul v29.4S, v29.4S,v4.s[0] +mla v29.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v29.4s +add v16.4s, v16.4s, v29.4s +sqrdmulh v29.4S, v24.4S, v5.s[0] +mul v24.4S, v24.4S,v4.s[0] +mla v24.4S, v29.4S, v31.s[0] +sub v29.4s, v30.4s, v24.4s +add v30.4s, v30.4s, v24.4s +sqrdmulh v24.4S, v30.4S, v5.s[1] +mul v30.4S, v30.4S,v4.s[1] +mla v30.4S, v24.4S, v31.s[0] +sub v24.4s, v16.4s, v30.4s +add v16.4s, v16.4s, v30.4s +sqrdmulh v30.4S, v29.4S, v5.s[2] +mul v29.4S, v29.4S,v4.s[2] +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v21.4s, v29.4s +add v21.4s, v21.4s, v29.4s +trn1 v29.4S, v16.4S, v24.4S +trn2 v25.4S, v16.4S, v24.4S +trn1 v22.4S, v21.4S, v30.4S +trn2 v3.4S, v21.4S, v30.4S +trn2 v21.2D, v29.2D, v22.2D +trn2 v30.2D, v25.2D, v3.2D +trn1 v16.2D, v29.2D, v22.2D +trn1 v24.2D, v25.2D, v3.2D +sqrdmulh 
v3.4S, v21.4S, v8.4S +mul v21.4S, v21.4S,v9.4S +mla v21.4S, v3.4S, v31.s[0] +sub v3.4s, v16.4s, v21.4s +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v30.4S, v8.4S +mul v30.4S, v30.4S,v9.4S +mla v30.4S, v21.4S, v31.s[0] +sub v21.4s, v24.4s, v30.4s +add v24.4s, v24.4s, v30.4s +sqrdmulh v30.4S, v24.4S, v7.4S +mul v24.4S, v24.4S,v6.4S +mla v24.4S, v30.4S, v31.s[0] +sub v30.4s, v16.4s, v24.4s +add v16.4s, v16.4s, v24.4s +sqrdmulh v24.4S, v21.4S, v10.4S +mul v21.4S, v21.4S,v23.4S +mla v21.4S, v24.4S, v31.s[0] +sub v24.4s, v3.4s, v21.4s +add v3.4s, v3.4s, v21.4s +str q16, [x0, #384] +str q30, [x0, #400] +str q3, [x0, #416] +str q24, [x0, #432] +ldr q24, [x17, #+1024] +ldr q3, [x17, #+1040] +ldr q30, [x17, #+1056] +ldr q16, [x17, #+1072] +ldr q21, [x17, #+1088] +ldr q25, [x17, #+1104] +ldr q22, [x17, #+1120] +ldr q29, [x17, #+1136] +ldr q10, [x0, #480] +ldr q23, [x0, #496] +ldr q7, [x0, #448] +ldr q6, [x0, #464] +sqrdmulh v8.4S, v10.4S, v3.s[0] +mul v10.4S, v10.4S,v24.s[0] +mla v10.4S, v8.4S, v31.s[0] +sub v8.4s, v7.4s, v10.4s +add v7.4s, v7.4s, v10.4s +sqrdmulh v10.4S, v23.4S, v3.s[0] +mul v23.4S, v23.4S,v24.s[0] +mla v23.4S, v10.4S, v31.s[0] +sub v10.4s, v6.4s, v23.4s +add v6.4s, v6.4s, v23.4s +sqrdmulh v23.4S, v6.4S, v3.s[1] +mul v6.4S, v6.4S,v24.s[1] +mla v6.4S, v23.4S, v31.s[0] +sub v23.4s, v7.4s, v6.4s +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v10.4S, v3.s[2] +mul v10.4S, v10.4S,v24.s[2] +mla v10.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v10.4s +add v8.4s, v8.4s, v10.4s +trn1 v10.4S, v7.4S, v23.4S +trn2 v9.4S, v7.4S, v23.4S +trn1 v5.4S, v8.4S, v6.4S +trn2 v4.4S, v8.4S, v6.4S +trn2 v8.2D, v10.2D, v5.2D +trn2 v6.2D, v9.2D, v4.2D +trn1 v7.2D, v10.2D, v5.2D +trn1 v23.2D, v9.2D, v4.2D +sqrdmulh v4.4S, v8.4S, v16.4S +mul v8.4S, v8.4S,v30.4S +mla v8.4S, v4.4S, v31.s[0] +sub v4.4s, v7.4s, v8.4s +add v7.4s, v7.4s, v8.4s +sqrdmulh v8.4S, v6.4S, v16.4S +mul v6.4S, v6.4S,v30.4S +mla v6.4S, v8.4S, v31.s[0] +sub v8.4s, v23.4s, v6.4s +add v23.4s, v23.4s, v6.4s +sqrdmulh v6.4S, v23.4S, 
v25.4S +mul v23.4S, v23.4S,v21.4S +mla v23.4S, v6.4S, v31.s[0] +sub v6.4s, v7.4s, v23.4s +add v7.4s, v7.4s, v23.4s +sqrdmulh v23.4S, v8.4S, v29.4S +mul v8.4S, v8.4S,v22.4S +mla v8.4S, v23.4S, v31.s[0] +sub v23.4s, v4.4s, v8.4s +add v4.4s, v4.4s, v8.4s +str q7, [x0, #448] +str q6, [x0, #464] +str q4, [x0, #480] +str q23, [x0, #496] +ldr q23, [x17, #+1152] +ldr q4, [x17, #+1168] +ldr q6, [x17, #+1184] +ldr q7, [x17, #+1200] +ldr q8, [x17, #+1216] +ldr q9, [x17, #+1232] +ldr q5, [x17, #+1248] +ldr q10, [x17, #+1264] +ldr q29, [x0, #544] +ldr q22, [x0, #560] +ldr q25, [x0, #512] +ldr q21, [x0, #528] +sqrdmulh v16.4S, v29.4S, v4.s[0] +mul v29.4S, v29.4S,v23.s[0] +mla v29.4S, v16.4S, v31.s[0] +sub v16.4s, v25.4s, v29.4s +add v25.4s, v25.4s, v29.4s +sqrdmulh v29.4S, v22.4S, v4.s[0] +mul v22.4S, v22.4S,v23.s[0] +mla v22.4S, v29.4S, v31.s[0] +sub v29.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v4.s[1] +mul v21.4S, v21.4S,v23.s[1] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v25.4s, v21.4s +add v25.4s, v25.4s, v21.4s +sqrdmulh v21.4S, v29.4S, v4.s[2] +mul v29.4S, v29.4S,v23.s[2] +mla v29.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v29.4s +add v16.4s, v16.4s, v29.4s +trn1 v29.4S, v25.4S, v22.4S +trn2 v30.4S, v25.4S, v22.4S +trn1 v3.4S, v16.4S, v21.4S +trn2 v24.4S, v16.4S, v21.4S +trn2 v16.2D, v29.2D, v3.2D +trn2 v21.2D, v30.2D, v24.2D +trn1 v25.2D, v29.2D, v3.2D +trn1 v22.2D, v30.2D, v24.2D +sqrdmulh v24.4S, v16.4S, v7.4S +mul v16.4S, v16.4S,v6.4S +mla v16.4S, v24.4S, v31.s[0] +sub v24.4s, v25.4s, v16.4s +add v25.4s, v25.4s, v16.4s +sqrdmulh v16.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v16.4S, v31.s[0] +sub v16.4s, v22.4s, v21.4s +add v22.4s, v22.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v9.4S +mul v22.4S, v22.4S,v8.4S +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v25.4s, v22.4s +add v25.4s, v25.4s, v22.4s +sqrdmulh v22.4S, v16.4S, v10.4S +mul v16.4S, v16.4S,v5.4S +mla v16.4S, v22.4S, v31.s[0] +sub v22.4s, v24.4s, v16.4s +add v24.4s, v24.4s, 
v16.4s +str q25, [x0, #512] +str q21, [x0, #528] +str q24, [x0, #544] +str q22, [x0, #560] +ldr q22, [x17, #+1280] +ldr q24, [x17, #+1296] +ldr q21, [x17, #+1312] +ldr q25, [x17, #+1328] +ldr q16, [x17, #+1344] +ldr q30, [x17, #+1360] +ldr q3, [x17, #+1376] +ldr q29, [x17, #+1392] +ldr q10, [x0, #608] +ldr q5, [x0, #624] +ldr q9, [x0, #576] +ldr q8, [x0, #592] +sqrdmulh v7.4S, v10.4S, v24.s[0] +mul v10.4S, v10.4S,v22.s[0] +mla v10.4S, v7.4S, v31.s[0] +sub v7.4s, v9.4s, v10.4s +add v9.4s, v9.4s, v10.4s +sqrdmulh v10.4S, v5.4S, v24.s[0] +mul v5.4S, v5.4S,v22.s[0] +mla v5.4S, v10.4S, v31.s[0] +sub v10.4s, v8.4s, v5.4s +add v8.4s, v8.4s, v5.4s +sqrdmulh v5.4S, v8.4S, v24.s[1] +mul v8.4S, v8.4S,v22.s[1] +mla v8.4S, v5.4S, v31.s[0] +sub v5.4s, v9.4s, v8.4s +add v9.4s, v9.4s, v8.4s +sqrdmulh v8.4S, v10.4S, v24.s[2] +mul v10.4S, v10.4S,v22.s[2] +mla v10.4S, v8.4S, v31.s[0] +sub v8.4s, v7.4s, v10.4s +add v7.4s, v7.4s, v10.4s +trn1 v10.4S, v9.4S, v5.4S +trn2 v6.4S, v9.4S, v5.4S +trn1 v4.4S, v7.4S, v8.4S +trn2 v23.4S, v7.4S, v8.4S +trn2 v7.2D, v10.2D, v4.2D +trn2 v8.2D, v6.2D, v23.2D +trn1 v9.2D, v10.2D, v4.2D +trn1 v5.2D, v6.2D, v23.2D +sqrdmulh v23.4S, v7.4S, v25.4S +mul v7.4S, v7.4S,v21.4S +mla v7.4S, v23.4S, v31.s[0] +sub v23.4s, v9.4s, v7.4s +add v9.4s, v9.4s, v7.4s +sqrdmulh v7.4S, v8.4S, v25.4S +mul v8.4S, v8.4S,v21.4S +mla v8.4S, v7.4S, v31.s[0] +sub v7.4s, v5.4s, v8.4s +add v5.4s, v5.4s, v8.4s +sqrdmulh v8.4S, v5.4S, v30.4S +mul v5.4S, v5.4S,v16.4S +mla v5.4S, v8.4S, v31.s[0] +sub v8.4s, v9.4s, v5.4s +add v9.4s, v9.4s, v5.4s +sqrdmulh v5.4S, v7.4S, v29.4S +mul v7.4S, v7.4S,v3.4S +mla v7.4S, v5.4S, v31.s[0] +sub v5.4s, v23.4s, v7.4s +add v23.4s, v23.4s, v7.4s +str q9, [x0, #576] +str q8, [x0, #592] +str q23, [x0, #608] +str q5, [x0, #624] +ldr q5, [x17, #+1408] +ldr q23, [x17, #+1424] +ldr q8, [x17, #+1440] +ldr q9, [x17, #+1456] +ldr q7, [x17, #+1472] +ldr q6, [x17, #+1488] +ldr q4, [x17, #+1504] +ldr q10, [x17, #+1520] +ldr q29, [x0, #672] +ldr q3, [x0, #688] +ldr 
q30, [x0, #640] +ldr q16, [x0, #656] +sqrdmulh v25.4S, v29.4S, v23.s[0] +mul v29.4S, v29.4S,v5.s[0] +mla v29.4S, v25.4S, v31.s[0] +sub v25.4s, v30.4s, v29.4s +add v30.4s, v30.4s, v29.4s +sqrdmulh v29.4S, v3.4S, v23.s[0] +mul v3.4S, v3.4S,v5.s[0] +mla v3.4S, v29.4S, v31.s[0] +sub v29.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v16.4S, v23.s[1] +mul v16.4S, v16.4S,v5.s[1] +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v30.4s, v16.4s +add v30.4s, v30.4s, v16.4s +sqrdmulh v16.4S, v29.4S, v23.s[2] +mul v29.4S, v29.4S,v5.s[2] +mla v29.4S, v16.4S, v31.s[0] +sub v16.4s, v25.4s, v29.4s +add v25.4s, v25.4s, v29.4s +trn1 v29.4S, v30.4S, v3.4S +trn2 v21.4S, v30.4S, v3.4S +trn1 v24.4S, v25.4S, v16.4S +trn2 v22.4S, v25.4S, v16.4S +trn2 v25.2D, v29.2D, v24.2D +trn2 v16.2D, v21.2D, v22.2D +trn1 v30.2D, v29.2D, v24.2D +trn1 v3.2D, v21.2D, v22.2D +sqrdmulh v22.4S, v25.4S, v9.4S +mul v25.4S, v25.4S,v8.4S +mla v25.4S, v22.4S, v31.s[0] +sub v22.4s, v30.4s, v25.4s +add v30.4s, v30.4s, v25.4s +sqrdmulh v25.4S, v16.4S, v9.4S +mul v16.4S, v16.4S,v8.4S +mla v16.4S, v25.4S, v31.s[0] +sub v25.4s, v3.4s, v16.4s +add v3.4s, v3.4s, v16.4s +sqrdmulh v16.4S, v3.4S, v6.4S +mul v3.4S, v3.4S,v7.4S +mla v3.4S, v16.4S, v31.s[0] +sub v16.4s, v30.4s, v3.4s +add v30.4s, v30.4s, v3.4s +sqrdmulh v3.4S, v25.4S, v10.4S +mul v25.4S, v25.4S,v4.4S +mla v25.4S, v3.4S, v31.s[0] +sub v3.4s, v22.4s, v25.4s +add v22.4s, v22.4s, v25.4s +str q30, [x0, #640] +str q16, [x0, #656] +str q22, [x0, #672] +str q3, [x0, #688] +ldr q3, [x17, #+1536] +ldr q22, [x17, #+1552] +ldr q16, [x17, #+1568] +ldr q30, [x17, #+1584] +ldr q25, [x17, #+1600] +ldr q21, [x17, #+1616] +ldr q24, [x17, #+1632] +ldr q29, [x17, #+1648] +ldr q10, [x0, #736] +ldr q4, [x0, #752] +ldr q6, [x0, #704] +ldr q7, [x0, #720] +sqrdmulh v9.4S, v10.4S, v22.s[0] +mul v10.4S, v10.4S,v3.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v6.4s, v10.4s +add v6.4s, v6.4s, v10.4s +sqrdmulh v10.4S, v4.4S, v22.s[0] +mul v4.4S, v4.4S,v3.s[0] +mla v4.4S, v10.4S, 
v31.s[0] +sub v10.4s, v7.4s, v4.4s +add v7.4s, v7.4s, v4.4s +sqrdmulh v4.4S, v7.4S, v22.s[1] +mul v7.4S, v7.4S,v3.s[1] +mla v7.4S, v4.4S, v31.s[0] +sub v4.4s, v6.4s, v7.4s +add v6.4s, v6.4s, v7.4s +sqrdmulh v7.4S, v10.4S, v22.s[2] +mul v10.4S, v10.4S,v3.s[2] +mla v10.4S, v7.4S, v31.s[0] +sub v7.4s, v9.4s, v10.4s +add v9.4s, v9.4s, v10.4s +trn1 v10.4S, v6.4S, v4.4S +trn2 v8.4S, v6.4S, v4.4S +trn1 v23.4S, v9.4S, v7.4S +trn2 v5.4S, v9.4S, v7.4S +trn2 v9.2D, v10.2D, v23.2D +trn2 v7.2D, v8.2D, v5.2D +trn1 v6.2D, v10.2D, v23.2D +trn1 v4.2D, v8.2D, v5.2D +sqrdmulh v5.4S, v9.4S, v30.4S +mul v9.4S, v9.4S,v16.4S +mla v9.4S, v5.4S, v31.s[0] +sub v5.4s, v6.4s, v9.4s +add v6.4s, v6.4s, v9.4s +sqrdmulh v9.4S, v7.4S, v30.4S +mul v7.4S, v7.4S,v16.4S +mla v7.4S, v9.4S, v31.s[0] +sub v9.4s, v4.4s, v7.4s +add v4.4s, v4.4s, v7.4s +sqrdmulh v7.4S, v4.4S, v21.4S +mul v4.4S, v4.4S,v25.4S +mla v4.4S, v7.4S, v31.s[0] +sub v7.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +sqrdmulh v4.4S, v9.4S, v29.4S +mul v9.4S, v9.4S,v24.4S +mla v9.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v9.4s +add v5.4s, v5.4s, v9.4s +str q6, [x0, #704] +str q7, [x0, #720] +str q5, [x0, #736] +str q4, [x0, #752] +ldr q4, [x17, #+1664] +ldr q5, [x17, #+1680] +ldr q7, [x17, #+1696] +ldr q6, [x17, #+1712] +ldr q9, [x17, #+1728] +ldr q8, [x17, #+1744] +ldr q23, [x17, #+1760] +ldr q10, [x17, #+1776] +ldr q29, [x0, #800] +ldr q24, [x0, #816] +ldr q21, [x0, #768] +ldr q25, [x0, #784] +sqrdmulh v30.4S, v29.4S, v5.s[0] +mul v29.4S, v29.4S,v4.s[0] +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v21.4s, v29.4s +add v21.4s, v21.4s, v29.4s +sqrdmulh v29.4S, v24.4S, v5.s[0] +mul v24.4S, v24.4S,v4.s[0] +mla v24.4S, v29.4S, v31.s[0] +sub v29.4s, v25.4s, v24.4s +add v25.4s, v25.4s, v24.4s +sqrdmulh v24.4S, v25.4S, v5.s[1] +mul v25.4S, v25.4S,v4.s[1] +mla v25.4S, v24.4S, v31.s[0] +sub v24.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v29.4S, v5.s[2] +mul v29.4S, v29.4S,v4.s[2] +mla v29.4S, v25.4S, v31.s[0] +sub v25.4s, v30.4s, 
v29.4s +add v30.4s, v30.4s, v29.4s +trn1 v29.4S, v21.4S, v24.4S +trn2 v16.4S, v21.4S, v24.4S +trn1 v22.4S, v30.4S, v25.4S +trn2 v3.4S, v30.4S, v25.4S +trn2 v30.2D, v29.2D, v22.2D +trn2 v25.2D, v16.2D, v3.2D +trn1 v21.2D, v29.2D, v22.2D +trn1 v24.2D, v16.2D, v3.2D +sqrdmulh v3.4S, v30.4S, v6.4S +mul v30.4S, v30.4S,v7.4S +mla v30.4S, v3.4S, v31.s[0] +sub v3.4s, v21.4s, v30.4s +add v21.4s, v21.4s, v30.4s +sqrdmulh v30.4S, v25.4S, v6.4S +mul v25.4S, v25.4S,v7.4S +mla v25.4S, v30.4S, v31.s[0] +sub v30.4s, v24.4s, v25.4s +add v24.4s, v24.4s, v25.4s +sqrdmulh v25.4S, v24.4S, v8.4S +mul v24.4S, v24.4S,v9.4S +mla v24.4S, v25.4S, v31.s[0] +sub v25.4s, v21.4s, v24.4s +add v21.4s, v21.4s, v24.4s +sqrdmulh v24.4S, v30.4S, v10.4S +mul v30.4S, v30.4S,v23.4S +mla v30.4S, v24.4S, v31.s[0] +sub v24.4s, v3.4s, v30.4s +add v3.4s, v3.4s, v30.4s +str q21, [x0, #768] +str q25, [x0, #784] +str q3, [x0, #800] +str q24, [x0, #816] +ldr q24, [x17, #+1792] +ldr q3, [x17, #+1808] +ldr q25, [x17, #+1824] +ldr q21, [x17, #+1840] +ldr q30, [x17, #+1856] +ldr q16, [x17, #+1872] +ldr q22, [x17, #+1888] +ldr q29, [x17, #+1904] +ldr q10, [x0, #864] +ldr q23, [x0, #880] +ldr q8, [x0, #832] +ldr q9, [x0, #848] +sqrdmulh v6.4S, v10.4S, v3.s[0] +mul v10.4S, v10.4S,v24.s[0] +mla v10.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v10.4s +add v8.4s, v8.4s, v10.4s +sqrdmulh v10.4S, v23.4S, v3.s[0] +mul v23.4S, v23.4S,v24.s[0] +mla v23.4S, v10.4S, v31.s[0] +sub v10.4s, v9.4s, v23.4s +add v9.4s, v9.4s, v23.4s +sqrdmulh v23.4S, v9.4S, v3.s[1] +mul v9.4S, v9.4S,v24.s[1] +mla v9.4S, v23.4S, v31.s[0] +sub v23.4s, v8.4s, v9.4s +add v8.4s, v8.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v3.s[2] +mul v10.4S, v10.4S,v24.s[2] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v6.4s, v10.4s +add v6.4s, v6.4s, v10.4s +trn1 v10.4S, v8.4S, v23.4S +trn2 v7.4S, v8.4S, v23.4S +trn1 v5.4S, v6.4S, v9.4S +trn2 v4.4S, v6.4S, v9.4S +trn2 v6.2D, v10.2D, v5.2D +trn2 v9.2D, v7.2D, v4.2D +trn1 v8.2D, v10.2D, v5.2D +trn1 v23.2D, v7.2D, v4.2D +sqrdmulh v4.4S, 
v6.4S, v21.4S +mul v6.4S, v6.4S,v25.4S +mla v6.4S, v4.4S, v31.s[0] +sub v4.4s, v8.4s, v6.4s +add v8.4s, v8.4s, v6.4s +sqrdmulh v6.4S, v9.4S, v21.4S +mul v9.4S, v9.4S,v25.4S +mla v9.4S, v6.4S, v31.s[0] +sub v6.4s, v23.4s, v9.4s +add v23.4s, v23.4s, v9.4s +sqrdmulh v9.4S, v23.4S, v16.4S +mul v23.4S, v23.4S,v30.4S +mla v23.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v23.4s +add v8.4s, v8.4s, v23.4s +sqrdmulh v23.4S, v6.4S, v29.4S +mul v6.4S, v6.4S,v22.4S +mla v6.4S, v23.4S, v31.s[0] +sub v23.4s, v4.4s, v6.4s +add v4.4s, v4.4s, v6.4s +str q8, [x0, #832] +str q9, [x0, #848] +str q4, [x0, #864] +str q23, [x0, #880] +ldr q23, [x17, #+1920] +ldr q4, [x17, #+1936] +ldr q9, [x17, #+1952] +ldr q8, [x17, #+1968] +ldr q6, [x17, #+1984] +ldr q7, [x17, #+2000] +ldr q5, [x17, #+2016] +ldr q10, [x17, #+2032] +ldr q29, [x0, #928] +ldr q22, [x0, #944] +ldr q16, [x0, #896] +ldr q30, [x0, #912] +sqrdmulh v21.4S, v29.4S, v4.s[0] +mul v29.4S, v29.4S,v23.s[0] +mla v29.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v29.4s +add v16.4s, v16.4s, v29.4s +sqrdmulh v29.4S, v22.4S, v4.s[0] +mul v22.4S, v22.4S,v23.s[0] +mla v22.4S, v29.4S, v31.s[0] +sub v29.4s, v30.4s, v22.4s +add v30.4s, v30.4s, v22.4s +sqrdmulh v22.4S, v30.4S, v4.s[1] +mul v30.4S, v30.4S,v23.s[1] +mla v30.4S, v22.4S, v31.s[0] +sub v22.4s, v16.4s, v30.4s +add v16.4s, v16.4s, v30.4s +sqrdmulh v30.4S, v29.4S, v4.s[2] +mul v29.4S, v29.4S,v23.s[2] +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v21.4s, v29.4s +add v21.4s, v21.4s, v29.4s +trn1 v29.4S, v16.4S, v22.4S +trn2 v25.4S, v16.4S, v22.4S +trn1 v3.4S, v21.4S, v30.4S +trn2 v24.4S, v21.4S, v30.4S +trn2 v21.2D, v29.2D, v3.2D +trn2 v30.2D, v25.2D, v24.2D +trn1 v16.2D, v29.2D, v3.2D +trn1 v22.2D, v25.2D, v24.2D +sqrdmulh v24.4S, v21.4S, v8.4S +mul v21.4S, v21.4S,v9.4S +mla v21.4S, v24.4S, v31.s[0] +sub v24.4s, v16.4s, v21.4s +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v30.4S, v8.4S +mul v30.4S, v30.4S,v9.4S +mla v30.4S, v21.4S, v31.s[0] +sub v21.4s, v22.4s, v30.4s +add v22.4s, v22.4s, v30.4s 
+sqrdmulh v30.4S, v22.4S, v7.4S +mul v22.4S, v22.4S,v6.4S +mla v22.4S, v30.4S, v31.s[0] +sub v30.4s, v16.4s, v22.4s +add v16.4s, v16.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v10.4S +mul v21.4S, v21.4S,v5.4S +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v24.4s, v21.4s +add v24.4s, v24.4s, v21.4s +str q16, [x0, #896] +str q30, [x0, #912] +str q24, [x0, #928] +str q22, [x0, #944] +ldr q22, [x17, #+2048] +ldr q24, [x17, #+2064] +ldr q30, [x17, #+2080] +ldr q16, [x17, #+2096] +ldr q21, [x17, #+2112] +ldr q25, [x17, #+2128] +ldr q3, [x17, #+2144] +ldr q29, [x17, #+2160] +ldr q10, [x0, #992] +ldr q5, [x0, #1008] +ldr q7, [x0, #960] +ldr q6, [x0, #976] +sqrdmulh v8.4S, v10.4S, v24.s[0] +mul v10.4S, v10.4S,v22.s[0] +mla v10.4S, v8.4S, v31.s[0] +sub v8.4s, v7.4s, v10.4s +add v7.4s, v7.4s, v10.4s +sqrdmulh v10.4S, v5.4S, v24.s[0] +mul v5.4S, v5.4S,v22.s[0] +mla v5.4S, v10.4S, v31.s[0] +sub v10.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v6.4S, v24.s[1] +mul v6.4S, v6.4S,v22.s[1] +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v7.4s, v6.4s +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v10.4S, v24.s[2] +mul v10.4S, v10.4S,v22.s[2] +mla v10.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v10.4s +add v8.4s, v8.4s, v10.4s +trn1 v10.4S, v7.4S, v5.4S +trn2 v9.4S, v7.4S, v5.4S +trn1 v4.4S, v8.4S, v6.4S +trn2 v23.4S, v8.4S, v6.4S +trn2 v8.2D, v10.2D, v4.2D +trn2 v6.2D, v9.2D, v23.2D +trn1 v7.2D, v10.2D, v4.2D +trn1 v5.2D, v9.2D, v23.2D +sqrdmulh v23.4S, v8.4S, v16.4S +mul v8.4S, v8.4S,v30.4S +mla v8.4S, v23.4S, v31.s[0] +sub v23.4s, v7.4s, v8.4s +add v7.4s, v7.4s, v8.4s +sqrdmulh v8.4S, v6.4S, v16.4S +mul v6.4S, v6.4S,v30.4S +mla v6.4S, v8.4S, v31.s[0] +sub v8.4s, v5.4s, v6.4s +add v5.4s, v5.4s, v6.4s +sqrdmulh v6.4S, v5.4S, v25.4S +mul v5.4S, v5.4S,v21.4S +mla v5.4S, v6.4S, v31.s[0] +sub v6.4s, v7.4s, v5.4s +add v7.4s, v7.4s, v5.4s +sqrdmulh v5.4S, v8.4S, v29.4S +mul v8.4S, v8.4S,v3.4S +mla v8.4S, v5.4S, v31.s[0] +sub v5.4s, v23.4s, v8.4s +add v23.4s, v23.4s, v8.4s +str q7, [x0, #960] +str q6, 
[x0, #976] +str q23, [x0, #992] +str q5, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_6_0.s b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_6_0.s new file mode 100644 index 0000000..b0b458f --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_6_0.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 
11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // 
Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 
1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 
23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 
19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 
// Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 
1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 +.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // 
Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 
7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // 
Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_6_0 +.global _ntt_u32_full_neon_asm_var_4_4_6_0 +ntt_u32_full_neon_asm_var_4_4_6_0: +_ntt_u32_full_neon_asm_var_4_4_6_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x0, #800] +ldr q29, [x0, #864] +ldr q28, [x0, #928] +ldr q27, [x0, #992] +ldr q26, [x0, #288] +ldr q25, [x0, #352] +ldr q24, [x0, #416] +ldr q23, [x0, #480] +ldr q22, [x17, #+0] +ldr q21, [x17, #+16] +ldr q20, [x17, #+32] +ldr q19, [x17, #+48] +ldr q18, [x17, #+64] +ldr q17, [x17, #+80] +ldr q16, [x17, #+96] +ldr q3, [x17, #+112] +sqrdmulh v2.4S, v30.4S, v21.s[0] +ldr q1, [x0, #544] +ldr q0, [x0, #608] +mul v30.4S, v30.4S,v22.s[0] +ldr q15, [x0, #672] +ldr q14, [x0, #736] +sqrdmulh v13.4S, v29.4S, v21.s[0] +ldr q12, [x0, #32] +mul v29.4S, v29.4S,v22.s[0] +ldr q11, [x0, #96] 
+sqrdmulh v10.4S, v28.4S, v21.s[0] +ldr q9, [x0, #160] +mul v28.4S, v28.4S,v22.s[0] +ldr q8, [x0, #224] +sqrdmulh v7.4S, v27.4S, v21.s[0] +mul v27.4S, v27.4S,v22.s[0] +mla v30.4S, v2.4S, v31.s[0] +mla v29.4S, v13.4S, v31.s[0] +mla v28.4S, v10.4S, v31.s[0] +mla v27.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v1.4S, v21.s[0] +mul v1.4S, v1.4S,v22.s[0] +sub v10.4s, v26.4s, v30.4s +add v26.4s, v26.4s, v30.4s +sqrdmulh v30.4S, v0.4S, v21.s[0] +mul v0.4S, v0.4S,v22.s[0] +sub v13.4s, v25.4s, v29.4s +add v25.4s, v25.4s, v29.4s +sqrdmulh v29.4S, v15.4S, v21.s[0] +mul v15.4S, v15.4S,v22.s[0] +sub v2.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +sqrdmulh v28.4S, v14.4S, v21.s[0] +mul v14.4S, v14.4S,v22.s[0] +mla v1.4S, v7.4S, v31.s[0] +sub v7.4s, v23.4s, v27.4s +mla v0.4S, v30.4S, v31.s[0] +add v23.4s, v23.4s, v27.4s +mla v15.4S, v29.4S, v31.s[0] +mla v14.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v24.4S, v21.s[1] +mul v24.4S, v24.4S,v22.s[1] +sub v29.4s, v12.4s, v1.4s +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v23.4S, v21.s[1] +mul v23.4S, v23.4S,v22.s[1] +sub v27.4s, v11.4s, v0.4s +add v11.4s, v11.4s, v0.4s +sqrdmulh v0.4S, v26.4S, v21.s[1] +mul v26.4S, v26.4S,v22.s[1] +sub v30.4s, v9.4s, v15.4s +add v9.4s, v9.4s, v15.4s +sqrdmulh v15.4S, v25.4S, v21.s[1] +mul v25.4S, v25.4S,v22.s[1] +mla v24.4S, v28.4S, v31.s[0] +sub v28.4s, v8.4s, v14.4s +add v8.4s, v8.4s, v14.4s +mla v23.4S, v1.4S, v31.s[0] +mla v26.4S, v0.4S, v31.s[0] +mla v25.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v2.4S, v21.s[2] +mul v2.4S, v2.4S,v22.s[2] +sub v0.4s, v9.4s, v24.4s +add v9.4s, v9.4s, v24.4s +sqrdmulh v24.4S, v7.4S, v21.s[2] +mul v7.4S, v7.4S,v22.s[2] +sub v1.4s, v8.4s, v23.4s +add v8.4s, v8.4s, v23.4s +sqrdmulh v23.4S, v10.4S, v21.s[2] +mul v10.4S, v10.4S,v22.s[2] +sub v14.4s, v12.4s, v26.4s +add v12.4s, v12.4s, v26.4s +sqrdmulh v26.4S, v13.4S, v21.s[2] +mul v13.4S, v13.4S,v22.s[2] +mla v2.4S, v15.4S, v31.s[0] +sub v15.4s, v11.4s, v25.4s +mla v7.4S, v24.4S, v31.s[0] +add v11.4s, v11.4s, v25.4s +mla v10.4S, 
v23.4S, v31.s[0] +mla v13.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v9.4S, v19.s[0] +mul v9.4S, v9.4S,v20.s[0] +sub v23.4s, v30.4s, v2.4s +add v30.4s, v30.4s, v2.4s +sqrdmulh v2.4S, v8.4S, v19.s[0] +mul v8.4S, v8.4S,v20.s[0] +sub v25.4s, v28.4s, v7.4s +add v28.4s, v28.4s, v7.4s +sqrdmulh v7.4S, v0.4S, v19.s[1] +mul v0.4S, v0.4S,v20.s[1] +sub v24.4s, v29.4s, v10.4s +add v29.4s, v29.4s, v10.4s +sqrdmulh v10.4S, v1.4S, v19.s[1] +mul v1.4S, v1.4S,v20.s[1] +mla v9.4S, v26.4S, v31.s[0] +sub v26.4s, v27.4s, v13.4s +add v27.4s, v27.4s, v13.4s +mla v8.4S, v2.4S, v31.s[0] +mla v0.4S, v7.4S, v31.s[0] +mla v1.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v30.4S, v19.s[2] +mul v30.4S, v30.4S,v20.s[2] +sub v7.4s, v12.4s, v9.4s +add v12.4s, v12.4s, v9.4s +sqrdmulh v9.4S, v28.4S, v19.s[2] +mul v28.4S, v28.4S,v20.s[2] +sub v2.4s, v11.4s, v8.4s +add v11.4s, v11.4s, v8.4s +sqrdmulh v8.4S, v23.4S, v19.s[3] +mul v23.4S, v23.4S,v20.s[3] +sub v13.4s, v14.4s, v0.4s +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v25.4S, v19.s[3] +mul v25.4S, v25.4S,v20.s[3] +mla v30.4S, v10.4S, v31.s[0] +sub v10.4s, v15.4s, v1.4s +mla v28.4S, v9.4S, v31.s[0] +add v15.4s, v15.4s, v1.4s +mla v23.4S, v8.4S, v31.s[0] +mla v25.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v11.4S, v17.s[0] +mul v11.4S, v11.4S,v18.s[0] +sub v8.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +sqrdmulh v30.4S, v2.4S, v17.s[1] +mul v2.4S, v2.4S,v18.s[1] +sub v1.4s, v27.4s, v28.4s +add v27.4s, v27.4s, v28.4s +sqrdmulh v28.4S, v15.4S, v17.s[2] +mul v15.4S, v15.4S,v18.s[2] +sub v9.4s, v24.4s, v23.4s +add v24.4s, v24.4s, v23.4s +sqrdmulh v23.4S, v10.4S, v17.s[3] +mul v10.4S, v10.4S,v18.s[3] +mla v11.4S, v0.4S, v31.s[0] +sub v0.4s, v26.4s, v25.4s +add v26.4s, v26.4s, v25.4s +mla v2.4S, v30.4S, v31.s[0] +mla v15.4S, v28.4S, v31.s[0] +mla v10.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v27.4S, v3.s[0] +mul v27.4S, v27.4S,v16.s[0] +sub v28.4s, v12.4s, v11.4s +add v12.4s, v12.4s, v11.4s +str q12, [x0, #32] +str q28, [x0, #96] +sqrdmulh v28.4S, v1.4S, v3.s[1] +mul 
v1.4S, v1.4S,v16.s[1] +ldr q12, [x0, #816] +ldr q11, [x0, #880] +sub v30.4s, v7.4s, v2.4s +add v7.4s, v7.4s, v2.4s +str q7, [x0, #160] +str q30, [x0, #224] +sqrdmulh v30.4S, v26.4S, v3.s[2] +mul v26.4S, v26.4S,v16.s[2] +ldr q7, [x0, #944] +ldr q2, [x0, #1008] +sub v25.4s, v14.4s, v15.4s +add v14.4s, v14.4s, v15.4s +str q14, [x0, #288] +str q25, [x0, #352] +sqrdmulh v25.4S, v0.4S, v3.s[3] +mul v0.4S, v0.4S,v16.s[3] +ldr q14, [x0, #304] +ldr q15, [x0, #368] +mla v27.4S, v23.4S, v31.s[0] +sub v23.4s, v13.4s, v10.4s +mla v1.4S, v28.4S, v31.s[0] +add v13.4s, v13.4s, v10.4s +str q13, [x0, #416] +str q23, [x0, #480] +mla v26.4S, v30.4S, v31.s[0] +ldr q30, [x0, #432] +ldr q23, [x0, #496] +mla v0.4S, v25.4S, v31.s[0] +sub v25.4s, v29.4s, v27.4s +add v29.4s, v29.4s, v27.4s +sub v27.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +sub v1.4s, v24.4s, v26.4s +add v24.4s, v24.4s, v26.4s +str q29, [x0, #544] +str q25, [x0, #608] +str q8, [x0, #672] +str q27, [x0, #736] +str q24, [x0, #800] +str q1, [x0, #864] +sqrdmulh v1.4S, v12.4S, v21.s[0] +ldr q24, [x0, #560] +ldr q27, [x0, #624] +mul v12.4S, v12.4S,v22.s[0] +sub v8.4s, v9.4s, v0.4s +add v9.4s, v9.4s, v0.4s +str q9, [x0, #928] +str q8, [x0, #992] +ldr q8, [x0, #688] +ldr q9, [x0, #752] +sqrdmulh v0.4S, v11.4S, v21.s[0] +ldr q25, [x0, #48] +mul v11.4S, v11.4S,v22.s[0] +ldr q29, [x0, #112] +sqrdmulh v26.4S, v7.4S, v21.s[0] +ldr q13, [x0, #176] +mul v7.4S, v7.4S,v22.s[0] +ldr q10, [x0, #240] +sqrdmulh v28.4S, v2.4S, v21.s[0] +mul v2.4S, v2.4S,v22.s[0] +mla v12.4S, v1.4S, v31.s[0] +mla v11.4S, v0.4S, v31.s[0] +mla v7.4S, v26.4S, v31.s[0] +mla v2.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v24.4S, v21.s[0] +mul v24.4S, v24.4S,v22.s[0] +sub v26.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v27.4S, v21.s[0] +mul v27.4S, v27.4S,v22.s[0] +sub v0.4s, v15.4s, v11.4s +add v15.4s, v15.4s, v11.4s +sqrdmulh v11.4S, v8.4S, v21.s[0] +mul v8.4S, v8.4S,v22.s[0] +sub v1.4s, v30.4s, v7.4s +add v30.4s, v30.4s, v7.4s +sqrdmulh v7.4S, v9.4S, 
v21.s[0] +mul v9.4S, v9.4S,v22.s[0] +mla v24.4S, v28.4S, v31.s[0] +sub v28.4s, v23.4s, v2.4s +mla v27.4S, v12.4S, v31.s[0] +add v23.4s, v23.4s, v2.4s +mla v8.4S, v11.4S, v31.s[0] +mla v9.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v30.4S, v21.s[1] +mul v30.4S, v30.4S,v22.s[1] +sub v11.4s, v25.4s, v24.4s +add v25.4s, v25.4s, v24.4s +sqrdmulh v24.4S, v23.4S, v21.s[1] +mul v23.4S, v23.4S,v22.s[1] +sub v2.4s, v29.4s, v27.4s +add v29.4s, v29.4s, v27.4s +sqrdmulh v27.4S, v14.4S, v21.s[1] +mul v14.4S, v14.4S,v22.s[1] +sub v12.4s, v13.4s, v8.4s +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v15.4S, v21.s[1] +mul v15.4S, v15.4S,v22.s[1] +mla v30.4S, v7.4S, v31.s[0] +sub v7.4s, v10.4s, v9.4s +add v10.4s, v10.4s, v9.4s +mla v23.4S, v24.4S, v31.s[0] +mla v14.4S, v27.4S, v31.s[0] +mla v15.4S, v8.4S, v31.s[0] +sqrdmulh v8.4S, v1.4S, v21.s[2] +mul v1.4S, v1.4S,v22.s[2] +sub v27.4s, v13.4s, v30.4s +add v13.4s, v13.4s, v30.4s +sqrdmulh v30.4S, v28.4S, v21.s[2] +mul v28.4S, v28.4S,v22.s[2] +sub v24.4s, v10.4s, v23.4s +add v10.4s, v10.4s, v23.4s +sqrdmulh v23.4S, v26.4S, v21.s[2] +mul v26.4S, v26.4S,v22.s[2] +sub v9.4s, v25.4s, v14.4s +add v25.4s, v25.4s, v14.4s +sqrdmulh v14.4S, v0.4S, v21.s[2] +mul v0.4S, v0.4S,v22.s[2] +mla v1.4S, v8.4S, v31.s[0] +sub v8.4s, v29.4s, v15.4s +mla v28.4S, v30.4S, v31.s[0] +add v29.4s, v29.4s, v15.4s +mla v26.4S, v23.4S, v31.s[0] +mla v0.4S, v14.4S, v31.s[0] +sqrdmulh v14.4S, v13.4S, v19.s[0] +mul v13.4S, v13.4S,v20.s[0] +sub v23.4s, v12.4s, v1.4s +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v10.4S, v19.s[0] +mul v10.4S, v10.4S,v20.s[0] +sub v15.4s, v7.4s, v28.4s +add v7.4s, v7.4s, v28.4s +sqrdmulh v28.4S, v27.4S, v19.s[1] +mul v27.4S, v27.4S,v20.s[1] +sub v30.4s, v11.4s, v26.4s +add v11.4s, v11.4s, v26.4s +sqrdmulh v26.4S, v24.4S, v19.s[1] +mul v24.4S, v24.4S,v20.s[1] +mla v13.4S, v14.4S, v31.s[0] +sub v14.4s, v2.4s, v0.4s +add v2.4s, v2.4s, v0.4s +mla v10.4S, v1.4S, v31.s[0] +mla v27.4S, v28.4S, v31.s[0] +mla v24.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v12.4S, 
v19.s[2] +mul v12.4S, v12.4S,v20.s[2] +sub v28.4s, v25.4s, v13.4s +add v25.4s, v25.4s, v13.4s +sqrdmulh v13.4S, v7.4S, v19.s[2] +mul v7.4S, v7.4S,v20.s[2] +sub v1.4s, v29.4s, v10.4s +add v29.4s, v29.4s, v10.4s +sqrdmulh v10.4S, v23.4S, v19.s[3] +mul v23.4S, v23.4S,v20.s[3] +sub v0.4s, v9.4s, v27.4s +add v9.4s, v9.4s, v27.4s +sqrdmulh v27.4S, v15.4S, v19.s[3] +mul v15.4S, v15.4S,v20.s[3] +mla v12.4S, v26.4S, v31.s[0] +sub v26.4s, v8.4s, v24.4s +mla v7.4S, v13.4S, v31.s[0] +add v8.4s, v8.4s, v24.4s +mla v23.4S, v10.4S, v31.s[0] +mla v15.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v29.4S, v17.s[0] +mul v29.4S, v29.4S,v18.s[0] +sub v10.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +sqrdmulh v12.4S, v1.4S, v17.s[1] +mul v1.4S, v1.4S,v18.s[1] +sub v24.4s, v2.4s, v7.4s +add v2.4s, v2.4s, v7.4s +sqrdmulh v7.4S, v8.4S, v17.s[2] +mul v8.4S, v8.4S,v18.s[2] +sub v13.4s, v30.4s, v23.4s +add v30.4s, v30.4s, v23.4s +sqrdmulh v23.4S, v26.4S, v17.s[3] +mul v26.4S, v26.4S,v18.s[3] +mla v29.4S, v27.4S, v31.s[0] +sub v27.4s, v14.4s, v15.4s +add v14.4s, v14.4s, v15.4s +mla v1.4S, v12.4S, v31.s[0] +mla v8.4S, v7.4S, v31.s[0] +mla v26.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v2.4S, v3.s[0] +mul v2.4S, v2.4S,v16.s[0] +sub v7.4s, v25.4s, v29.4s +add v25.4s, v25.4s, v29.4s +str q25, [x0, #48] +str q7, [x0, #112] +sqrdmulh v7.4S, v24.4S, v3.s[1] +mul v24.4S, v24.4S,v16.s[1] +ldr q25, [x0, #768] +ldr q29, [x0, #832] +sub v12.4s, v28.4s, v1.4s +add v28.4s, v28.4s, v1.4s +str q28, [x0, #176] +str q12, [x0, #240] +sqrdmulh v12.4S, v14.4S, v3.s[2] +mul v14.4S, v14.4S,v16.s[2] +ldr q28, [x0, #896] +ldr q1, [x0, #960] +sub v15.4s, v9.4s, v8.4s +add v9.4s, v9.4s, v8.4s +str q9, [x0, #304] +str q15, [x0, #368] +sqrdmulh v15.4S, v27.4S, v3.s[3] +mul v27.4S, v27.4S,v16.s[3] +ldr q9, [x0, #256] +ldr q8, [x0, #320] +mla v2.4S, v23.4S, v31.s[0] +sub v23.4s, v0.4s, v26.4s +mla v24.4S, v7.4S, v31.s[0] +add v0.4s, v0.4s, v26.4s +str q0, [x0, #432] +str q23, [x0, #496] +mla v14.4S, v12.4S, v31.s[0] +ldr q12, [x0, 
#384] +ldr q23, [x0, #448] +mla v27.4S, v15.4S, v31.s[0] +sub v15.4s, v11.4s, v2.4s +add v11.4s, v11.4s, v2.4s +sub v2.4s, v10.4s, v24.4s +add v10.4s, v10.4s, v24.4s +sub v24.4s, v30.4s, v14.4s +add v30.4s, v30.4s, v14.4s +str q11, [x0, #560] +str q15, [x0, #624] +str q10, [x0, #688] +str q2, [x0, #752] +str q30, [x0, #816] +str q24, [x0, #880] +sqrdmulh v24.4S, v25.4S, v21.s[0] +ldr q30, [x0, #512] +ldr q2, [x0, #576] +mul v25.4S, v25.4S,v22.s[0] +sub v10.4s, v13.4s, v27.4s +add v13.4s, v13.4s, v27.4s +str q13, [x0, #944] +str q10, [x0, #1008] +ldr q10, [x0, #640] +ldr q13, [x0, #704] +sqrdmulh v27.4S, v29.4S, v21.s[0] +ldr q15, [x0, #0] +mul v29.4S, v29.4S,v22.s[0] +ldr q11, [x0, #64] +sqrdmulh v14.4S, v28.4S, v21.s[0] +ldr q0, [x0, #128] +mul v28.4S, v28.4S,v22.s[0] +ldr q26, [x0, #192] +sqrdmulh v7.4S, v1.4S, v21.s[0] +mul v1.4S, v1.4S,v22.s[0] +mla v25.4S, v24.4S, v31.s[0] +mla v29.4S, v27.4S, v31.s[0] +mla v28.4S, v14.4S, v31.s[0] +mla v1.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v30.4S, v21.s[0] +mul v30.4S, v30.4S,v22.s[0] +sub v14.4s, v9.4s, v25.4s +add v9.4s, v9.4s, v25.4s +sqrdmulh v25.4S, v2.4S, v21.s[0] +mul v2.4S, v2.4S,v22.s[0] +sub v27.4s, v8.4s, v29.4s +add v8.4s, v8.4s, v29.4s +sqrdmulh v29.4S, v10.4S, v21.s[0] +mul v10.4S, v10.4S,v22.s[0] +sub v24.4s, v12.4s, v28.4s +add v12.4s, v12.4s, v28.4s +sqrdmulh v28.4S, v13.4S, v21.s[0] +mul v13.4S, v13.4S,v22.s[0] +mla v30.4S, v7.4S, v31.s[0] +sub v7.4s, v23.4s, v1.4s +mla v2.4S, v25.4S, v31.s[0] +add v23.4s, v23.4s, v1.4s +mla v10.4S, v29.4S, v31.s[0] +mla v13.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v12.4S, v21.s[1] +mul v12.4S, v12.4S,v22.s[1] +sub v29.4s, v15.4s, v30.4s +add v15.4s, v15.4s, v30.4s +sqrdmulh v30.4S, v23.4S, v21.s[1] +mul v23.4S, v23.4S,v22.s[1] +sub v1.4s, v11.4s, v2.4s +add v11.4s, v11.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v21.s[1] +mul v9.4S, v9.4S,v22.s[1] +sub v25.4s, v0.4s, v10.4s +add v0.4s, v0.4s, v10.4s +sqrdmulh v10.4S, v8.4S, v21.s[1] +mul v8.4S, v8.4S,v22.s[1] +mla v12.4S, v28.4S, 
v31.s[0] +sub v28.4s, v26.4s, v13.4s +add v26.4s, v26.4s, v13.4s +mla v23.4S, v30.4S, v31.s[0] +mla v9.4S, v2.4S, v31.s[0] +mla v8.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v24.4S, v21.s[2] +mul v24.4S, v24.4S,v22.s[2] +sub v2.4s, v0.4s, v12.4s +add v0.4s, v0.4s, v12.4s +sqrdmulh v12.4S, v7.4S, v21.s[2] +mul v7.4S, v7.4S,v22.s[2] +sub v30.4s, v26.4s, v23.4s +add v26.4s, v26.4s, v23.4s +sqrdmulh v23.4S, v14.4S, v21.s[2] +mul v14.4S, v14.4S,v22.s[2] +sub v13.4s, v15.4s, v9.4s +add v15.4s, v15.4s, v9.4s +sqrdmulh v9.4S, v27.4S, v21.s[2] +mul v27.4S, v27.4S,v22.s[2] +mla v24.4S, v10.4S, v31.s[0] +sub v10.4s, v11.4s, v8.4s +mla v7.4S, v12.4S, v31.s[0] +add v11.4s, v11.4s, v8.4s +mla v14.4S, v23.4S, v31.s[0] +mla v27.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v0.4S, v19.s[0] +mul v0.4S, v0.4S,v20.s[0] +sub v23.4s, v25.4s, v24.4s +add v25.4s, v25.4s, v24.4s +sqrdmulh v24.4S, v26.4S, v19.s[0] +mul v26.4S, v26.4S,v20.s[0] +sub v8.4s, v28.4s, v7.4s +add v28.4s, v28.4s, v7.4s +sqrdmulh v7.4S, v2.4S, v19.s[1] +mul v2.4S, v2.4S,v20.s[1] +sub v12.4s, v29.4s, v14.4s +add v29.4s, v29.4s, v14.4s +sqrdmulh v14.4S, v30.4S, v19.s[1] +mul v30.4S, v30.4S,v20.s[1] +mla v0.4S, v9.4S, v31.s[0] +sub v9.4s, v1.4s, v27.4s +add v1.4s, v1.4s, v27.4s +mla v26.4S, v24.4S, v31.s[0] +mla v2.4S, v7.4S, v31.s[0] +mla v30.4S, v14.4S, v31.s[0] +sqrdmulh v14.4S, v25.4S, v19.s[2] +mul v25.4S, v25.4S,v20.s[2] +sub v7.4s, v15.4s, v0.4s +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v28.4S, v19.s[2] +mul v28.4S, v28.4S,v20.s[2] +sub v24.4s, v11.4s, v26.4s +add v11.4s, v11.4s, v26.4s +sqrdmulh v26.4S, v23.4S, v19.s[3] +mul v23.4S, v23.4S,v20.s[3] +sub v27.4s, v13.4s, v2.4s +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v8.4S, v19.s[3] +mul v8.4S, v8.4S,v20.s[3] +mla v25.4S, v14.4S, v31.s[0] +sub v14.4s, v10.4s, v30.4s +mla v28.4S, v0.4S, v31.s[0] +add v10.4s, v10.4s, v30.4s +mla v23.4S, v26.4S, v31.s[0] +mla v8.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v11.4S, v17.s[0] +mul v11.4S, v11.4S,v18.s[0] +sub v26.4s, v29.4s, v25.4s 
+add v29.4s, v29.4s, v25.4s +sqrdmulh v25.4S, v24.4S, v17.s[1] +mul v24.4S, v24.4S,v18.s[1] +sub v30.4s, v1.4s, v28.4s +add v1.4s, v1.4s, v28.4s +sqrdmulh v28.4S, v10.4S, v17.s[2] +mul v10.4S, v10.4S,v18.s[2] +sub v0.4s, v12.4s, v23.4s +add v12.4s, v12.4s, v23.4s +sqrdmulh v23.4S, v14.4S, v17.s[3] +mul v14.4S, v14.4S,v18.s[3] +mla v11.4S, v2.4S, v31.s[0] +sub v2.4s, v9.4s, v8.4s +add v9.4s, v9.4s, v8.4s +mla v24.4S, v25.4S, v31.s[0] +mla v10.4S, v28.4S, v31.s[0] +mla v14.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v1.4S, v3.s[0] +mul v1.4S, v1.4S,v16.s[0] +sub v28.4s, v15.4s, v11.4s +add v15.4s, v15.4s, v11.4s +str q15, [x0, #0] +str q28, [x0, #64] +sqrdmulh v28.4S, v30.4S, v3.s[1] +mul v30.4S, v30.4S,v16.s[1] +ldr q15, [x0, #784] +ldr q11, [x0, #848] +sub v25.4s, v7.4s, v24.4s +add v7.4s, v7.4s, v24.4s +str q7, [x0, #128] +str q25, [x0, #192] +sqrdmulh v25.4S, v9.4S, v3.s[2] +mul v9.4S, v9.4S,v16.s[2] +ldr q7, [x0, #912] +ldr q24, [x0, #976] +sub v8.4s, v13.4s, v10.4s +add v13.4s, v13.4s, v10.4s +str q13, [x0, #256] +str q8, [x0, #320] +sqrdmulh v8.4S, v2.4S, v3.s[3] +mul v2.4S, v2.4S,v16.s[3] +ldr q13, [x0, #272] +ldr q10, [x0, #336] +mla v1.4S, v23.4S, v31.s[0] +sub v23.4s, v27.4s, v14.4s +mla v30.4S, v28.4S, v31.s[0] +add v27.4s, v27.4s, v14.4s +str q27, [x0, #384] +str q23, [x0, #448] +mla v9.4S, v25.4S, v31.s[0] +ldr q25, [x0, #400] +ldr q23, [x0, #464] +mla v2.4S, v8.4S, v31.s[0] +sub v8.4s, v29.4s, v1.4s +add v29.4s, v29.4s, v1.4s +sub v1.4s, v26.4s, v30.4s +add v26.4s, v26.4s, v30.4s +sub v30.4s, v12.4s, v9.4s +add v12.4s, v12.4s, v9.4s +str q29, [x0, #512] +str q8, [x0, #576] +str q26, [x0, #640] +str q1, [x0, #704] +str q12, [x0, #768] +str q30, [x0, #832] +sqrdmulh v30.4S, v15.4S, v21.s[0] +ldr q12, [x0, #528] +ldr q1, [x0, #592] +mul v15.4S, v15.4S,v22.s[0] +sub v26.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +str q0, [x0, #896] +str q26, [x0, #960] +ldr q26, [x0, #656] +ldr q0, [x0, #720] +sqrdmulh v2.4S, v11.4S, v21.s[0] +ldr q8, [x0, #16] +mul v11.4S, 
v11.4S,v22.s[0] +ldr q29, [x0, #80] +sqrdmulh v9.4S, v7.4S, v21.s[0] +ldr q27, [x0, #144] +mul v7.4S, v7.4S,v22.s[0] +ldr q14, [x0, #208] +sqrdmulh v28.4S, v24.4S, v21.s[0] +mul v24.4S, v24.4S,v22.s[0] +mla v15.4S, v30.4S, v31.s[0] +mla v11.4S, v2.4S, v31.s[0] +mla v7.4S, v9.4S, v31.s[0] +mla v24.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v12.4S, v21.s[0] +mul v12.4S, v12.4S,v22.s[0] +sub v9.4s, v13.4s, v15.4s +add v13.4s, v13.4s, v15.4s +sqrdmulh v15.4S, v1.4S, v21.s[0] +mul v1.4S, v1.4S,v22.s[0] +sub v2.4s, v10.4s, v11.4s +add v10.4s, v10.4s, v11.4s +sqrdmulh v11.4S, v26.4S, v21.s[0] +mul v26.4S, v26.4S,v22.s[0] +sub v30.4s, v25.4s, v7.4s +add v25.4s, v25.4s, v7.4s +sqrdmulh v7.4S, v0.4S, v21.s[0] +mul v0.4S, v0.4S,v22.s[0] +mla v12.4S, v28.4S, v31.s[0] +sub v28.4s, v23.4s, v24.4s +mla v1.4S, v15.4S, v31.s[0] +add v23.4s, v23.4s, v24.4s +mla v26.4S, v11.4S, v31.s[0] +mla v0.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v25.4S, v21.s[1] +mul v25.4S, v25.4S,v22.s[1] +sub v11.4s, v8.4s, v12.4s +add v8.4s, v8.4s, v12.4s +sqrdmulh v12.4S, v23.4S, v21.s[1] +mul v23.4S, v23.4S,v22.s[1] +sub v24.4s, v29.4s, v1.4s +add v29.4s, v29.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v21.s[1] +mul v13.4S, v13.4S,v22.s[1] +sub v15.4s, v27.4s, v26.4s +add v27.4s, v27.4s, v26.4s +sqrdmulh v26.4S, v10.4S, v21.s[1] +mul v10.4S, v10.4S,v22.s[1] +mla v25.4S, v7.4S, v31.s[0] +sub v7.4s, v14.4s, v0.4s +add v14.4s, v14.4s, v0.4s +mla v23.4S, v12.4S, v31.s[0] +mla v13.4S, v1.4S, v31.s[0] +mla v10.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v30.4S, v21.s[2] +mul v30.4S, v30.4S,v22.s[2] +sub v1.4s, v27.4s, v25.4s +add v27.4s, v27.4s, v25.4s +sqrdmulh v25.4S, v28.4S, v21.s[2] +mul v28.4S, v28.4S,v22.s[2] +sub v12.4s, v14.4s, v23.4s +add v14.4s, v14.4s, v23.4s +sqrdmulh v23.4S, v9.4S, v21.s[2] +mul v9.4S, v9.4S,v22.s[2] +sub v0.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +sqrdmulh v13.4S, v2.4S, v21.s[2] +mul v2.4S, v2.4S,v22.s[2] +mla v30.4S, v26.4S, v31.s[0] +sub v26.4s, v29.4s, v10.4s +mla v28.4S, v25.4S, v31.s[0] +add 
v29.4s, v29.4s, v10.4s +mla v9.4S, v23.4S, v31.s[0] +mla v2.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v27.4S, v19.s[0] +mul v27.4S, v27.4S,v20.s[0] +sub v23.4s, v15.4s, v30.4s +add v15.4s, v15.4s, v30.4s +sqrdmulh v30.4S, v14.4S, v19.s[0] +mul v14.4S, v14.4S,v20.s[0] +sub v10.4s, v7.4s, v28.4s +add v7.4s, v7.4s, v28.4s +sqrdmulh v28.4S, v1.4S, v19.s[1] +mul v1.4S, v1.4S,v20.s[1] +sub v25.4s, v11.4s, v9.4s +add v11.4s, v11.4s, v9.4s +sqrdmulh v9.4S, v12.4S, v19.s[1] +mul v12.4S, v12.4S,v20.s[1] +mla v27.4S, v13.4S, v31.s[0] +sub v13.4s, v24.4s, v2.4s +add v24.4s, v24.4s, v2.4s +mla v14.4S, v30.4S, v31.s[0] +mla v1.4S, v28.4S, v31.s[0] +mla v12.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v15.4S, v19.s[2] +mul v15.4S, v15.4S,v20.s[2] +sub v28.4s, v8.4s, v27.4s +add v8.4s, v8.4s, v27.4s +sqrdmulh v27.4S, v7.4S, v19.s[2] +mul v7.4S, v7.4S,v20.s[2] +sub v30.4s, v29.4s, v14.4s +add v29.4s, v29.4s, v14.4s +sqrdmulh v14.4S, v23.4S, v19.s[3] +mul v23.4S, v23.4S,v20.s[3] +sub v2.4s, v0.4s, v1.4s +add v0.4s, v0.4s, v1.4s +sqrdmulh v1.4S, v10.4S, v19.s[3] +mul v10.4S, v10.4S,v20.s[3] +mla v15.4S, v9.4S, v31.s[0] +sub v9.4s, v26.4s, v12.4s +mla v7.4S, v27.4S, v31.s[0] +add v26.4s, v26.4s, v12.4s +mla v23.4S, v14.4S, v31.s[0] +mla v10.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v29.4S, v17.s[0] +mul v29.4S, v29.4S,v18.s[0] +sub v14.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v30.4S, v17.s[1] +mul v30.4S, v30.4S,v18.s[1] +sub v12.4s, v24.4s, v7.4s +add v24.4s, v24.4s, v7.4s +sqrdmulh v7.4S, v26.4S, v17.s[2] +mul v26.4S, v26.4S,v18.s[2] +sub v27.4s, v25.4s, v23.4s +add v25.4s, v25.4s, v23.4s +sqrdmulh v23.4S, v9.4S, v17.s[3] +mul v9.4S, v9.4S,v18.s[3] +mla v29.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v10.4s +add v13.4s, v13.4s, v10.4s +mla v30.4S, v15.4S, v31.s[0] +mla v26.4S, v7.4S, v31.s[0] +mla v9.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v24.4S, v3.s[0] +mul v24.4S, v24.4S,v16.s[0] +sub v7.4s, v8.4s, v29.4s +add v8.4s, v8.4s, v29.4s +str q8, [x0, #16] +str q7, [x0, #80] 
+sqrdmulh v7.4S, v12.4S, v3.s[1] +mul v12.4S, v12.4S,v16.s[1] +sub v8.4s, v28.4s, v30.4s +add v28.4s, v28.4s, v30.4s +str q28, [x0, #144] +str q8, [x0, #208] +sqrdmulh v8.4S, v13.4S, v3.s[2] +mul v13.4S, v13.4S,v16.s[2] +sub v28.4s, v0.4s, v26.4s +add v0.4s, v0.4s, v26.4s +str q0, [x0, #272] +str q28, [x0, #336] +sqrdmulh v28.4S, v1.4S, v3.s[3] +mul v1.4S, v1.4S,v16.s[3] +mla v24.4S, v23.4S, v31.s[0] +sub v23.4s, v2.4s, v9.4s +mla v12.4S, v7.4S, v31.s[0] +add v2.4s, v2.4s, v9.4s +str q2, [x0, #400] +str q23, [x0, #464] +mla v13.4S, v8.4S, v31.s[0] +mla v1.4S, v28.4S, v31.s[0] +sub v28.4s, v11.4s, v24.4s +add v11.4s, v11.4s, v24.4s +sub v24.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +sub v12.4s, v25.4s, v13.4s +add v25.4s, v25.4s, v13.4s +str q11, [x0, #528] +str q28, [x0, #592] +str q14, [x0, #656] +str q24, [x0, #720] +str q25, [x0, #784] +str q12, [x0, #848] +sub v3.4s, v27.4s, v1.4s +add v27.4s, v27.4s, v1.4s +str q27, [x0, #912] +str q3, [x0, #976] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q6, [x17, #+160] +ldr q10, [x17, #+176] +ldr q15, [x17, #+192] +ldr q29, [x17, #+208] +ldr q30, [x17, #+224] +ldr q26, [x17, #+240] +ldr q0, [x0, #32] +ldr q7, [x0, #48] +ldr q9, [x0, #0] +ldr q2, [x0, #16] +sqrdmulh v23.4S, v0.4S, v5.s[0] +mul v0.4S, v0.4S,v4.s[0] +mla v0.4S, v23.4S, v31.s[0] +sub v23.4s, v9.4s, v0.4s +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v7.4S, v5.s[0] +mul v7.4S, v7.4S,v4.s[0] +mla v7.4S, v0.4S, v31.s[0] +sub v0.4s, v2.4s, v7.4s +add v2.4s, v2.4s, v7.4s +sqrdmulh v7.4S, v2.4S, v5.s[1] +mul v2.4S, v2.4S,v4.s[1] +mla v2.4S, v7.4S, v31.s[0] +sub v7.4s, v9.4s, v2.4s +add v9.4s, v9.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v5.s[2] +mul v0.4S, v0.4S,v4.s[2] +mla v0.4S, v2.4S, v31.s[0] +sub v2.4s, v23.4s, v0.4s +add v23.4s, v23.4s, v0.4s +trn1 v0.4S, v9.4S, v7.4S +trn2 v8.4S, v9.4S, v7.4S +trn1 v13.4S, v23.4S, v2.4S +trn2 v11.4S, v23.4S, v2.4S +trn2 v23.2D, v0.2D, v13.2D +trn2 v2.2D, v8.2D, v11.2D +trn1 v9.2D, v0.2D, v13.2D +trn1 v7.2D, v8.2D, v11.2D 
+sqrdmulh v11.4S, v23.4S, v10.4S +mul v23.4S, v23.4S,v6.4S +mla v23.4S, v11.4S, v31.s[0] +sub v11.4s, v9.4s, v23.4s +add v9.4s, v9.4s, v23.4s +sqrdmulh v23.4S, v2.4S, v10.4S +mul v2.4S, v2.4S,v6.4S +mla v2.4S, v23.4S, v31.s[0] +sub v23.4s, v7.4s, v2.4s +add v7.4s, v7.4s, v2.4s +sqrdmulh v2.4S, v7.4S, v29.4S +mul v7.4S, v7.4S,v15.4S +mla v7.4S, v2.4S, v31.s[0] +sub v2.4s, v9.4s, v7.4s +add v9.4s, v9.4s, v7.4s +sqrdmulh v7.4S, v23.4S, v26.4S +mul v23.4S, v23.4S,v30.4S +mla v23.4S, v7.4S, v31.s[0] +sub v7.4s, v11.4s, v23.4s +add v11.4s, v11.4s, v23.4s +str q9, [x0, #0] +str q2, [x0, #16] +str q11, [x0, #32] +str q7, [x0, #48] +ldr q7, [x17, #+256] +ldr q11, [x17, #+272] +ldr q2, [x17, #+288] +ldr q9, [x17, #+304] +ldr q23, [x17, #+320] +ldr q8, [x17, #+336] +ldr q13, [x17, #+352] +ldr q0, [x17, #+368] +ldr q26, [x0, #96] +ldr q30, [x0, #112] +ldr q29, [x0, #64] +ldr q15, [x0, #80] +sqrdmulh v10.4S, v26.4S, v11.s[0] +mul v26.4S, v26.4S,v7.s[0] +mla v26.4S, v10.4S, v31.s[0] +sub v10.4s, v29.4s, v26.4s +add v29.4s, v29.4s, v26.4s +sqrdmulh v26.4S, v30.4S, v11.s[0] +mul v30.4S, v30.4S,v7.s[0] +mla v30.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v30.4s +add v15.4s, v15.4s, v30.4s +sqrdmulh v30.4S, v15.4S, v11.s[1] +mul v15.4S, v15.4S,v7.s[1] +mla v15.4S, v30.4S, v31.s[0] +sub v30.4s, v29.4s, v15.4s +add v29.4s, v29.4s, v15.4s +sqrdmulh v15.4S, v26.4S, v11.s[2] +mul v26.4S, v26.4S,v7.s[2] +mla v26.4S, v15.4S, v31.s[0] +sub v15.4s, v10.4s, v26.4s +add v10.4s, v10.4s, v26.4s +trn1 v26.4S, v29.4S, v30.4S +trn2 v6.4S, v29.4S, v30.4S +trn1 v5.4S, v10.4S, v15.4S +trn2 v4.4S, v10.4S, v15.4S +trn2 v10.2D, v26.2D, v5.2D +trn2 v15.2D, v6.2D, v4.2D +trn1 v29.2D, v26.2D, v5.2D +trn1 v30.2D, v6.2D, v4.2D +sqrdmulh v4.4S, v10.4S, v9.4S +mul v10.4S, v10.4S,v2.4S +mla v10.4S, v4.4S, v31.s[0] +sub v4.4s, v29.4s, v10.4s +add v29.4s, v29.4s, v10.4s +sqrdmulh v10.4S, v15.4S, v9.4S +mul v15.4S, v15.4S,v2.4S +mla v15.4S, v10.4S, v31.s[0] +sub v10.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s 
+sqrdmulh v15.4S, v30.4S, v8.4S +mul v30.4S, v30.4S,v23.4S +mla v30.4S, v15.4S, v31.s[0] +sub v15.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +sqrdmulh v30.4S, v10.4S, v0.4S +mul v10.4S, v10.4S,v13.4S +mla v10.4S, v30.4S, v31.s[0] +sub v30.4s, v4.4s, v10.4s +add v4.4s, v4.4s, v10.4s +str q29, [x0, #64] +str q15, [x0, #80] +str q4, [x0, #96] +str q30, [x0, #112] +ldr q30, [x17, #+384] +ldr q4, [x17, #+400] +ldr q15, [x17, #+416] +ldr q29, [x17, #+432] +ldr q10, [x17, #+448] +ldr q6, [x17, #+464] +ldr q5, [x17, #+480] +ldr q26, [x17, #+496] +ldr q0, [x0, #160] +ldr q13, [x0, #176] +ldr q8, [x0, #128] +ldr q23, [x0, #144] +sqrdmulh v9.4S, v0.4S, v4.s[0] +mul v0.4S, v0.4S,v30.s[0] +mla v0.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v0.4s +add v8.4s, v8.4s, v0.4s +sqrdmulh v0.4S, v13.4S, v4.s[0] +mul v13.4S, v13.4S,v30.s[0] +mla v13.4S, v0.4S, v31.s[0] +sub v0.4s, v23.4s, v13.4s +add v23.4s, v23.4s, v13.4s +sqrdmulh v13.4S, v23.4S, v4.s[1] +mul v23.4S, v23.4S,v30.s[1] +mla v23.4S, v13.4S, v31.s[0] +sub v13.4s, v8.4s, v23.4s +add v8.4s, v8.4s, v23.4s +sqrdmulh v23.4S, v0.4S, v4.s[2] +mul v0.4S, v0.4S,v30.s[2] +mla v0.4S, v23.4S, v31.s[0] +sub v23.4s, v9.4s, v0.4s +add v9.4s, v9.4s, v0.4s +trn1 v0.4S, v8.4S, v13.4S +trn2 v2.4S, v8.4S, v13.4S +trn1 v11.4S, v9.4S, v23.4S +trn2 v7.4S, v9.4S, v23.4S +trn2 v9.2D, v0.2D, v11.2D +trn2 v23.2D, v2.2D, v7.2D +trn1 v8.2D, v0.2D, v11.2D +trn1 v13.2D, v2.2D, v7.2D +sqrdmulh v7.4S, v9.4S, v29.4S +mul v9.4S, v9.4S,v15.4S +mla v9.4S, v7.4S, v31.s[0] +sub v7.4s, v8.4s, v9.4s +add v8.4s, v8.4s, v9.4s +sqrdmulh v9.4S, v23.4S, v29.4S +mul v23.4S, v23.4S,v15.4S +mla v23.4S, v9.4S, v31.s[0] +sub v9.4s, v13.4s, v23.4s +add v13.4s, v13.4s, v23.4s +sqrdmulh v23.4S, v13.4S, v6.4S +mul v13.4S, v13.4S,v10.4S +mla v13.4S, v23.4S, v31.s[0] +sub v23.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +sqrdmulh v13.4S, v9.4S, v26.4S +mul v9.4S, v9.4S,v5.4S +mla v9.4S, v13.4S, v31.s[0] +sub v13.4s, v7.4s, v9.4s +add v7.4s, v7.4s, v9.4s +str q8, [x0, #128] +str 
q23, [x0, #144] +str q7, [x0, #160] +str q13, [x0, #176] +ldr q13, [x17, #+512] +ldr q7, [x17, #+528] +ldr q23, [x17, #+544] +ldr q8, [x17, #+560] +ldr q9, [x17, #+576] +ldr q2, [x17, #+592] +ldr q11, [x17, #+608] +ldr q0, [x17, #+624] +ldr q26, [x0, #224] +ldr q5, [x0, #240] +ldr q6, [x0, #192] +ldr q10, [x0, #208] +sqrdmulh v29.4S, v26.4S, v7.s[0] +mul v26.4S, v26.4S,v13.s[0] +mla v26.4S, v29.4S, v31.s[0] +sub v29.4s, v6.4s, v26.4s +add v6.4s, v6.4s, v26.4s +sqrdmulh v26.4S, v5.4S, v7.s[0] +mul v5.4S, v5.4S,v13.s[0] +mla v5.4S, v26.4S, v31.s[0] +sub v26.4s, v10.4s, v5.4s +add v10.4s, v10.4s, v5.4s +sqrdmulh v5.4S, v10.4S, v7.s[1] +mul v10.4S, v10.4S,v13.s[1] +mla v10.4S, v5.4S, v31.s[0] +sub v5.4s, v6.4s, v10.4s +add v6.4s, v6.4s, v10.4s +sqrdmulh v10.4S, v26.4S, v7.s[2] +mul v26.4S, v26.4S,v13.s[2] +mla v26.4S, v10.4S, v31.s[0] +sub v10.4s, v29.4s, v26.4s +add v29.4s, v29.4s, v26.4s +trn1 v26.4S, v6.4S, v5.4S +trn2 v15.4S, v6.4S, v5.4S +trn1 v4.4S, v29.4S, v10.4S +trn2 v30.4S, v29.4S, v10.4S +trn2 v29.2D, v26.2D, v4.2D +trn2 v10.2D, v15.2D, v30.2D +trn1 v6.2D, v26.2D, v4.2D +trn1 v5.2D, v15.2D, v30.2D +sqrdmulh v30.4S, v29.4S, v8.4S +mul v29.4S, v29.4S,v23.4S +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v6.4s, v29.4s +add v6.4s, v6.4s, v29.4s +sqrdmulh v29.4S, v10.4S, v8.4S +mul v10.4S, v10.4S,v23.4S +mla v10.4S, v29.4S, v31.s[0] +sub v29.4s, v5.4s, v10.4s +add v5.4s, v5.4s, v10.4s +sqrdmulh v10.4S, v5.4S, v2.4S +mul v5.4S, v5.4S,v9.4S +mla v5.4S, v10.4S, v31.s[0] +sub v10.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v29.4S, v0.4S +mul v29.4S, v29.4S,v11.4S +mla v29.4S, v5.4S, v31.s[0] +sub v5.4s, v30.4s, v29.4s +add v30.4s, v30.4s, v29.4s +str q6, [x0, #192] +str q10, [x0, #208] +str q30, [x0, #224] +str q5, [x0, #240] +ldr q5, [x17, #+640] +ldr q30, [x17, #+656] +ldr q10, [x17, #+672] +ldr q6, [x17, #+688] +ldr q29, [x17, #+704] +ldr q15, [x17, #+720] +ldr q4, [x17, #+736] +ldr q26, [x17, #+752] +ldr q0, [x0, #288] +ldr q11, [x0, #304] +ldr q2, 
[x0, #256] +ldr q9, [x0, #272] +sqrdmulh v8.4S, v0.4S, v30.s[0] +mul v0.4S, v0.4S,v5.s[0] +mla v0.4S, v8.4S, v31.s[0] +sub v8.4s, v2.4s, v0.4s +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v11.4S, v30.s[0] +mul v11.4S, v11.4S,v5.s[0] +mla v11.4S, v0.4S, v31.s[0] +sub v0.4s, v9.4s, v11.4s +add v9.4s, v9.4s, v11.4s +sqrdmulh v11.4S, v9.4S, v30.s[1] +mul v9.4S, v9.4S,v5.s[1] +mla v9.4S, v11.4S, v31.s[0] +sub v11.4s, v2.4s, v9.4s +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v0.4S, v30.s[2] +mul v0.4S, v0.4S,v5.s[2] +mla v0.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v0.4s +add v8.4s, v8.4s, v0.4s +trn1 v0.4S, v2.4S, v11.4S +trn2 v23.4S, v2.4S, v11.4S +trn1 v7.4S, v8.4S, v9.4S +trn2 v13.4S, v8.4S, v9.4S +trn2 v8.2D, v0.2D, v7.2D +trn2 v9.2D, v23.2D, v13.2D +trn1 v2.2D, v0.2D, v7.2D +trn1 v11.2D, v23.2D, v13.2D +sqrdmulh v13.4S, v8.4S, v6.4S +mul v8.4S, v8.4S,v10.4S +mla v8.4S, v13.4S, v31.s[0] +sub v13.4s, v2.4s, v8.4s +add v2.4s, v2.4s, v8.4s +sqrdmulh v8.4S, v9.4S, v6.4S +mul v9.4S, v9.4S,v10.4S +mla v9.4S, v8.4S, v31.s[0] +sub v8.4s, v11.4s, v9.4s +add v11.4s, v11.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v15.4S +mul v11.4S, v11.4S,v29.4S +mla v11.4S, v9.4S, v31.s[0] +sub v9.4s, v2.4s, v11.4s +add v2.4s, v2.4s, v11.4s +sqrdmulh v11.4S, v8.4S, v26.4S +mul v8.4S, v8.4S,v4.4S +mla v8.4S, v11.4S, v31.s[0] +sub v11.4s, v13.4s, v8.4s +add v13.4s, v13.4s, v8.4s +str q2, [x0, #256] +str q9, [x0, #272] +str q13, [x0, #288] +str q11, [x0, #304] +ldr q11, [x17, #+768] +ldr q13, [x17, #+784] +ldr q9, [x17, #+800] +ldr q2, [x17, #+816] +ldr q8, [x17, #+832] +ldr q23, [x17, #+848] +ldr q7, [x17, #+864] +ldr q0, [x17, #+880] +ldr q26, [x0, #352] +ldr q4, [x0, #368] +ldr q15, [x0, #320] +ldr q29, [x0, #336] +sqrdmulh v6.4S, v26.4S, v13.s[0] +mul v26.4S, v26.4S,v11.s[0] +mla v26.4S, v6.4S, v31.s[0] +sub v6.4s, v15.4s, v26.4s +add v15.4s, v15.4s, v26.4s +sqrdmulh v26.4S, v4.4S, v13.s[0] +mul v4.4S, v4.4S,v11.s[0] +mla v4.4S, v26.4S, v31.s[0] +sub v26.4s, v29.4s, v4.4s +add v29.4s, v29.4s, v4.4s 
+sqrdmulh v4.4S, v29.4S, v13.s[1] +mul v29.4S, v29.4S,v11.s[1] +mla v29.4S, v4.4S, v31.s[0] +sub v4.4s, v15.4s, v29.4s +add v15.4s, v15.4s, v29.4s +sqrdmulh v29.4S, v26.4S, v13.s[2] +mul v26.4S, v26.4S,v11.s[2] +mla v26.4S, v29.4S, v31.s[0] +sub v29.4s, v6.4s, v26.4s +add v6.4s, v6.4s, v26.4s +trn1 v26.4S, v15.4S, v4.4S +trn2 v10.4S, v15.4S, v4.4S +trn1 v30.4S, v6.4S, v29.4S +trn2 v5.4S, v6.4S, v29.4S +trn2 v6.2D, v26.2D, v30.2D +trn2 v29.2D, v10.2D, v5.2D +trn1 v15.2D, v26.2D, v30.2D +trn1 v4.2D, v10.2D, v5.2D +sqrdmulh v5.4S, v6.4S, v2.4S +mul v6.4S, v6.4S,v9.4S +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v15.4s, v6.4s +add v15.4s, v15.4s, v6.4s +sqrdmulh v6.4S, v29.4S, v2.4S +mul v29.4S, v29.4S,v9.4S +mla v29.4S, v6.4S, v31.s[0] +sub v6.4s, v4.4s, v29.4s +add v4.4s, v4.4s, v29.4s +sqrdmulh v29.4S, v4.4S, v23.4S +mul v4.4S, v4.4S,v8.4S +mla v4.4S, v29.4S, v31.s[0] +sub v29.4s, v15.4s, v4.4s +add v15.4s, v15.4s, v4.4s +sqrdmulh v4.4S, v6.4S, v0.4S +mul v6.4S, v6.4S,v7.4S +mla v6.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v6.4s +add v5.4s, v5.4s, v6.4s +str q15, [x0, #320] +str q29, [x0, #336] +str q5, [x0, #352] +str q4, [x0, #368] +ldr q4, [x17, #+896] +ldr q5, [x17, #+912] +ldr q29, [x17, #+928] +ldr q15, [x17, #+944] +ldr q6, [x17, #+960] +ldr q10, [x17, #+976] +ldr q30, [x17, #+992] +ldr q26, [x17, #+1008] +ldr q0, [x0, #416] +ldr q7, [x0, #432] +ldr q23, [x0, #384] +ldr q8, [x0, #400] +sqrdmulh v2.4S, v0.4S, v5.s[0] +mul v0.4S, v0.4S,v4.s[0] +mla v0.4S, v2.4S, v31.s[0] +sub v2.4s, v23.4s, v0.4s +add v23.4s, v23.4s, v0.4s +sqrdmulh v0.4S, v7.4S, v5.s[0] +mul v7.4S, v7.4S,v4.s[0] +mla v7.4S, v0.4S, v31.s[0] +sub v0.4s, v8.4s, v7.4s +add v8.4s, v8.4s, v7.4s +sqrdmulh v7.4S, v8.4S, v5.s[1] +mul v8.4S, v8.4S,v4.s[1] +mla v8.4S, v7.4S, v31.s[0] +sub v7.4s, v23.4s, v8.4s +add v23.4s, v23.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v5.s[2] +mul v0.4S, v0.4S,v4.s[2] +mla v0.4S, v8.4S, v31.s[0] +sub v8.4s, v2.4s, v0.4s +add v2.4s, v2.4s, v0.4s +trn1 v0.4S, v23.4S, v7.4S +trn2 v9.4S, 
v23.4S, v7.4S +trn1 v13.4S, v2.4S, v8.4S +trn2 v11.4S, v2.4S, v8.4S +trn2 v2.2D, v0.2D, v13.2D +trn2 v8.2D, v9.2D, v11.2D +trn1 v23.2D, v0.2D, v13.2D +trn1 v7.2D, v9.2D, v11.2D +sqrdmulh v11.4S, v2.4S, v15.4S +mul v2.4S, v2.4S,v29.4S +mla v2.4S, v11.4S, v31.s[0] +sub v11.4s, v23.4s, v2.4s +add v23.4s, v23.4s, v2.4s +sqrdmulh v2.4S, v8.4S, v15.4S +mul v8.4S, v8.4S,v29.4S +mla v8.4S, v2.4S, v31.s[0] +sub v2.4s, v7.4s, v8.4s +add v7.4s, v7.4s, v8.4s +sqrdmulh v8.4S, v7.4S, v10.4S +mul v7.4S, v7.4S,v6.4S +mla v7.4S, v8.4S, v31.s[0] +sub v8.4s, v23.4s, v7.4s +add v23.4s, v23.4s, v7.4s +sqrdmulh v7.4S, v2.4S, v26.4S +mul v2.4S, v2.4S,v30.4S +mla v2.4S, v7.4S, v31.s[0] +sub v7.4s, v11.4s, v2.4s +add v11.4s, v11.4s, v2.4s +str q23, [x0, #384] +str q8, [x0, #400] +str q11, [x0, #416] +str q7, [x0, #432] +ldr q7, [x17, #+1024] +ldr q11, [x17, #+1040] +ldr q8, [x17, #+1056] +ldr q23, [x17, #+1072] +ldr q2, [x17, #+1088] +ldr q9, [x17, #+1104] +ldr q13, [x17, #+1120] +ldr q0, [x17, #+1136] +ldr q26, [x0, #480] +ldr q30, [x0, #496] +ldr q10, [x0, #448] +ldr q6, [x0, #464] +sqrdmulh v15.4S, v26.4S, v11.s[0] +mul v26.4S, v26.4S,v7.s[0] +mla v26.4S, v15.4S, v31.s[0] +sub v15.4s, v10.4s, v26.4s +add v10.4s, v10.4s, v26.4s +sqrdmulh v26.4S, v30.4S, v11.s[0] +mul v30.4S, v30.4S,v7.s[0] +mla v30.4S, v26.4S, v31.s[0] +sub v26.4s, v6.4s, v30.4s +add v6.4s, v6.4s, v30.4s +sqrdmulh v30.4S, v6.4S, v11.s[1] +mul v6.4S, v6.4S,v7.s[1] +mla v6.4S, v30.4S, v31.s[0] +sub v30.4s, v10.4s, v6.4s +add v10.4s, v10.4s, v6.4s +sqrdmulh v6.4S, v26.4S, v11.s[2] +mul v26.4S, v26.4S,v7.s[2] +mla v26.4S, v6.4S, v31.s[0] +sub v6.4s, v15.4s, v26.4s +add v15.4s, v15.4s, v26.4s +trn1 v26.4S, v10.4S, v30.4S +trn2 v29.4S, v10.4S, v30.4S +trn1 v5.4S, v15.4S, v6.4S +trn2 v4.4S, v15.4S, v6.4S +trn2 v15.2D, v26.2D, v5.2D +trn2 v6.2D, v29.2D, v4.2D +trn1 v10.2D, v26.2D, v5.2D +trn1 v30.2D, v29.2D, v4.2D +sqrdmulh v4.4S, v15.4S, v23.4S +mul v15.4S, v15.4S,v8.4S +mla v15.4S, v4.4S, v31.s[0] +sub v4.4s, v10.4s, v15.4s 
+add v10.4s, v10.4s, v15.4s +sqrdmulh v15.4S, v6.4S, v23.4S +mul v6.4S, v6.4S,v8.4S +mla v6.4S, v15.4S, v31.s[0] +sub v15.4s, v30.4s, v6.4s +add v30.4s, v30.4s, v6.4s +sqrdmulh v6.4S, v30.4S, v9.4S +mul v30.4S, v30.4S,v2.4S +mla v30.4S, v6.4S, v31.s[0] +sub v6.4s, v10.4s, v30.4s +add v10.4s, v10.4s, v30.4s +sqrdmulh v30.4S, v15.4S, v0.4S +mul v15.4S, v15.4S,v13.4S +mla v15.4S, v30.4S, v31.s[0] +sub v30.4s, v4.4s, v15.4s +add v4.4s, v4.4s, v15.4s +str q10, [x0, #448] +str q6, [x0, #464] +str q4, [x0, #480] +str q30, [x0, #496] +ldr q30, [x17, #+1152] +ldr q4, [x17, #+1168] +ldr q6, [x17, #+1184] +ldr q10, [x17, #+1200] +ldr q15, [x17, #+1216] +ldr q29, [x17, #+1232] +ldr q5, [x17, #+1248] +ldr q26, [x17, #+1264] +ldr q0, [x0, #544] +ldr q13, [x0, #560] +ldr q9, [x0, #512] +ldr q2, [x0, #528] +sqrdmulh v23.4S, v0.4S, v4.s[0] +mul v0.4S, v0.4S,v30.s[0] +mla v0.4S, v23.4S, v31.s[0] +sub v23.4s, v9.4s, v0.4s +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v13.4S, v4.s[0] +mul v13.4S, v13.4S,v30.s[0] +mla v13.4S, v0.4S, v31.s[0] +sub v0.4s, v2.4s, v13.4s +add v2.4s, v2.4s, v13.4s +sqrdmulh v13.4S, v2.4S, v4.s[1] +mul v2.4S, v2.4S,v30.s[1] +mla v2.4S, v13.4S, v31.s[0] +sub v13.4s, v9.4s, v2.4s +add v9.4s, v9.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v4.s[2] +mul v0.4S, v0.4S,v30.s[2] +mla v0.4S, v2.4S, v31.s[0] +sub v2.4s, v23.4s, v0.4s +add v23.4s, v23.4s, v0.4s +trn1 v0.4S, v9.4S, v13.4S +trn2 v8.4S, v9.4S, v13.4S +trn1 v11.4S, v23.4S, v2.4S +trn2 v7.4S, v23.4S, v2.4S +trn2 v23.2D, v0.2D, v11.2D +trn2 v2.2D, v8.2D, v7.2D +trn1 v9.2D, v0.2D, v11.2D +trn1 v13.2D, v8.2D, v7.2D +sqrdmulh v7.4S, v23.4S, v10.4S +mul v23.4S, v23.4S,v6.4S +mla v23.4S, v7.4S, v31.s[0] +sub v7.4s, v9.4s, v23.4s +add v9.4s, v9.4s, v23.4s +sqrdmulh v23.4S, v2.4S, v10.4S +mul v2.4S, v2.4S,v6.4S +mla v2.4S, v23.4S, v31.s[0] +sub v23.4s, v13.4s, v2.4s +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v13.4S, v29.4S +mul v13.4S, v13.4S,v15.4S +mla v13.4S, v2.4S, v31.s[0] +sub v2.4s, v9.4s, v13.4s +add v9.4s, v9.4s, v13.4s 
+sqrdmulh v13.4S, v23.4S, v26.4S +mul v23.4S, v23.4S,v5.4S +mla v23.4S, v13.4S, v31.s[0] +sub v13.4s, v7.4s, v23.4s +add v7.4s, v7.4s, v23.4s +str q9, [x0, #512] +str q2, [x0, #528] +str q7, [x0, #544] +str q13, [x0, #560] +ldr q13, [x17, #+1280] +ldr q7, [x17, #+1296] +ldr q2, [x17, #+1312] +ldr q9, [x17, #+1328] +ldr q23, [x17, #+1344] +ldr q8, [x17, #+1360] +ldr q11, [x17, #+1376] +ldr q0, [x17, #+1392] +ldr q26, [x0, #608] +ldr q5, [x0, #624] +ldr q29, [x0, #576] +ldr q15, [x0, #592] +sqrdmulh v10.4S, v26.4S, v7.s[0] +mul v26.4S, v26.4S,v13.s[0] +mla v26.4S, v10.4S, v31.s[0] +sub v10.4s, v29.4s, v26.4s +add v29.4s, v29.4s, v26.4s +sqrdmulh v26.4S, v5.4S, v7.s[0] +mul v5.4S, v5.4S,v13.s[0] +mla v5.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v5.4s +add v15.4s, v15.4s, v5.4s +sqrdmulh v5.4S, v15.4S, v7.s[1] +mul v15.4S, v15.4S,v13.s[1] +mla v15.4S, v5.4S, v31.s[0] +sub v5.4s, v29.4s, v15.4s +add v29.4s, v29.4s, v15.4s +sqrdmulh v15.4S, v26.4S, v7.s[2] +mul v26.4S, v26.4S,v13.s[2] +mla v26.4S, v15.4S, v31.s[0] +sub v15.4s, v10.4s, v26.4s +add v10.4s, v10.4s, v26.4s +trn1 v26.4S, v29.4S, v5.4S +trn2 v6.4S, v29.4S, v5.4S +trn1 v4.4S, v10.4S, v15.4S +trn2 v30.4S, v10.4S, v15.4S +trn2 v10.2D, v26.2D, v4.2D +trn2 v15.2D, v6.2D, v30.2D +trn1 v29.2D, v26.2D, v4.2D +trn1 v5.2D, v6.2D, v30.2D +sqrdmulh v30.4S, v10.4S, v9.4S +mul v10.4S, v10.4S,v2.4S +mla v10.4S, v30.4S, v31.s[0] +sub v30.4s, v29.4s, v10.4s +add v29.4s, v29.4s, v10.4s +sqrdmulh v10.4S, v15.4S, v9.4S +mul v15.4S, v15.4S,v2.4S +mla v15.4S, v10.4S, v31.s[0] +sub v10.4s, v5.4s, v15.4s +add v5.4s, v5.4s, v15.4s +sqrdmulh v15.4S, v5.4S, v8.4S +mul v5.4S, v5.4S,v23.4S +mla v5.4S, v15.4S, v31.s[0] +sub v15.4s, v29.4s, v5.4s +add v29.4s, v29.4s, v5.4s +sqrdmulh v5.4S, v10.4S, v0.4S +mul v10.4S, v10.4S,v11.4S +mla v10.4S, v5.4S, v31.s[0] +sub v5.4s, v30.4s, v10.4s +add v30.4s, v30.4s, v10.4s +str q29, [x0, #576] +str q15, [x0, #592] +str q30, [x0, #608] +str q5, [x0, #624] +ldr q5, [x17, #+1408] +ldr q30, [x17, #+1424] 
+ldr q15, [x17, #+1440] +ldr q29, [x17, #+1456] +ldr q10, [x17, #+1472] +ldr q6, [x17, #+1488] +ldr q4, [x17, #+1504] +ldr q26, [x17, #+1520] +ldr q0, [x0, #672] +ldr q11, [x0, #688] +ldr q8, [x0, #640] +ldr q23, [x0, #656] +sqrdmulh v9.4S, v0.4S, v30.s[0] +mul v0.4S, v0.4S,v5.s[0] +mla v0.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v0.4s +add v8.4s, v8.4s, v0.4s +sqrdmulh v0.4S, v11.4S, v30.s[0] +mul v11.4S, v11.4S,v5.s[0] +mla v11.4S, v0.4S, v31.s[0] +sub v0.4s, v23.4s, v11.4s +add v23.4s, v23.4s, v11.4s +sqrdmulh v11.4S, v23.4S, v30.s[1] +mul v23.4S, v23.4S,v5.s[1] +mla v23.4S, v11.4S, v31.s[0] +sub v11.4s, v8.4s, v23.4s +add v8.4s, v8.4s, v23.4s +sqrdmulh v23.4S, v0.4S, v30.s[2] +mul v0.4S, v0.4S,v5.s[2] +mla v0.4S, v23.4S, v31.s[0] +sub v23.4s, v9.4s, v0.4s +add v9.4s, v9.4s, v0.4s +trn1 v0.4S, v8.4S, v11.4S +trn2 v2.4S, v8.4S, v11.4S +trn1 v7.4S, v9.4S, v23.4S +trn2 v13.4S, v9.4S, v23.4S +trn2 v9.2D, v0.2D, v7.2D +trn2 v23.2D, v2.2D, v13.2D +trn1 v8.2D, v0.2D, v7.2D +trn1 v11.2D, v2.2D, v13.2D +sqrdmulh v13.4S, v9.4S, v29.4S +mul v9.4S, v9.4S,v15.4S +mla v9.4S, v13.4S, v31.s[0] +sub v13.4s, v8.4s, v9.4s +add v8.4s, v8.4s, v9.4s +sqrdmulh v9.4S, v23.4S, v29.4S +mul v23.4S, v23.4S,v15.4S +mla v23.4S, v9.4S, v31.s[0] +sub v9.4s, v11.4s, v23.4s +add v11.4s, v11.4s, v23.4s +sqrdmulh v23.4S, v11.4S, v6.4S +mul v11.4S, v11.4S,v10.4S +mla v11.4S, v23.4S, v31.s[0] +sub v23.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +sqrdmulh v11.4S, v9.4S, v26.4S +mul v9.4S, v9.4S,v4.4S +mla v9.4S, v11.4S, v31.s[0] +sub v11.4s, v13.4s, v9.4s +add v13.4s, v13.4s, v9.4s +str q8, [x0, #640] +str q23, [x0, #656] +str q13, [x0, #672] +str q11, [x0, #688] +ldr q11, [x17, #+1536] +ldr q13, [x17, #+1552] +ldr q23, [x17, #+1568] +ldr q8, [x17, #+1584] +ldr q9, [x17, #+1600] +ldr q2, [x17, #+1616] +ldr q7, [x17, #+1632] +ldr q0, [x17, #+1648] +ldr q26, [x0, #736] +ldr q4, [x0, #752] +ldr q6, [x0, #704] +ldr q10, [x0, #720] +sqrdmulh v29.4S, v26.4S, v13.s[0] +mul v26.4S, v26.4S,v11.s[0] +mla v26.4S, 
v29.4S, v31.s[0] +sub v29.4s, v6.4s, v26.4s +add v6.4s, v6.4s, v26.4s +sqrdmulh v26.4S, v4.4S, v13.s[0] +mul v4.4S, v4.4S,v11.s[0] +mla v4.4S, v26.4S, v31.s[0] +sub v26.4s, v10.4s, v4.4s +add v10.4s, v10.4s, v4.4s +sqrdmulh v4.4S, v10.4S, v13.s[1] +mul v10.4S, v10.4S,v11.s[1] +mla v10.4S, v4.4S, v31.s[0] +sub v4.4s, v6.4s, v10.4s +add v6.4s, v6.4s, v10.4s +sqrdmulh v10.4S, v26.4S, v13.s[2] +mul v26.4S, v26.4S,v11.s[2] +mla v26.4S, v10.4S, v31.s[0] +sub v10.4s, v29.4s, v26.4s +add v29.4s, v29.4s, v26.4s +trn1 v26.4S, v6.4S, v4.4S +trn2 v15.4S, v6.4S, v4.4S +trn1 v30.4S, v29.4S, v10.4S +trn2 v5.4S, v29.4S, v10.4S +trn2 v29.2D, v26.2D, v30.2D +trn2 v10.2D, v15.2D, v5.2D +trn1 v6.2D, v26.2D, v30.2D +trn1 v4.2D, v15.2D, v5.2D +sqrdmulh v5.4S, v29.4S, v8.4S +mul v29.4S, v29.4S,v23.4S +mla v29.4S, v5.4S, v31.s[0] +sub v5.4s, v6.4s, v29.4s +add v6.4s, v6.4s, v29.4s +sqrdmulh v29.4S, v10.4S, v8.4S +mul v10.4S, v10.4S,v23.4S +mla v10.4S, v29.4S, v31.s[0] +sub v29.4s, v4.4s, v10.4s +add v4.4s, v4.4s, v10.4s +sqrdmulh v10.4S, v4.4S, v2.4S +mul v4.4S, v4.4S,v9.4S +mla v4.4S, v10.4S, v31.s[0] +sub v10.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +sqrdmulh v4.4S, v29.4S, v0.4S +mul v29.4S, v29.4S,v7.4S +mla v29.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v29.4s +add v5.4s, v5.4s, v29.4s +str q6, [x0, #704] +str q10, [x0, #720] +str q5, [x0, #736] +str q4, [x0, #752] +ldr q4, [x17, #+1664] +ldr q5, [x17, #+1680] +ldr q10, [x17, #+1696] +ldr q6, [x17, #+1712] +ldr q29, [x17, #+1728] +ldr q15, [x17, #+1744] +ldr q30, [x17, #+1760] +ldr q26, [x17, #+1776] +ldr q0, [x0, #800] +ldr q7, [x0, #816] +ldr q2, [x0, #768] +ldr q9, [x0, #784] +sqrdmulh v8.4S, v0.4S, v5.s[0] +mul v0.4S, v0.4S,v4.s[0] +mla v0.4S, v8.4S, v31.s[0] +sub v8.4s, v2.4s, v0.4s +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v7.4S, v5.s[0] +mul v7.4S, v7.4S,v4.s[0] +mla v7.4S, v0.4S, v31.s[0] +sub v0.4s, v9.4s, v7.4s +add v9.4s, v9.4s, v7.4s +sqrdmulh v7.4S, v9.4S, v5.s[1] +mul v9.4S, v9.4S,v4.s[1] +mla v9.4S, v7.4S, v31.s[0] +sub 
v7.4s, v2.4s, v9.4s +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v0.4S, v5.s[2] +mul v0.4S, v0.4S,v4.s[2] +mla v0.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v0.4s +add v8.4s, v8.4s, v0.4s +trn1 v0.4S, v2.4S, v7.4S +trn2 v23.4S, v2.4S, v7.4S +trn1 v13.4S, v8.4S, v9.4S +trn2 v11.4S, v8.4S, v9.4S +trn2 v8.2D, v0.2D, v13.2D +trn2 v9.2D, v23.2D, v11.2D +trn1 v2.2D, v0.2D, v13.2D +trn1 v7.2D, v23.2D, v11.2D +sqrdmulh v11.4S, v8.4S, v6.4S +mul v8.4S, v8.4S,v10.4S +mla v8.4S, v11.4S, v31.s[0] +sub v11.4s, v2.4s, v8.4s +add v2.4s, v2.4s, v8.4s +sqrdmulh v8.4S, v9.4S, v6.4S +mul v9.4S, v9.4S,v10.4S +mla v9.4S, v8.4S, v31.s[0] +sub v8.4s, v7.4s, v9.4s +add v7.4s, v7.4s, v9.4s +sqrdmulh v9.4S, v7.4S, v15.4S +mul v7.4S, v7.4S,v29.4S +mla v7.4S, v9.4S, v31.s[0] +sub v9.4s, v2.4s, v7.4s +add v2.4s, v2.4s, v7.4s +sqrdmulh v7.4S, v8.4S, v26.4S +mul v8.4S, v8.4S,v30.4S +mla v8.4S, v7.4S, v31.s[0] +sub v7.4s, v11.4s, v8.4s +add v11.4s, v11.4s, v8.4s +str q2, [x0, #768] +str q9, [x0, #784] +str q11, [x0, #800] +str q7, [x0, #816] +ldr q7, [x17, #+1792] +ldr q11, [x17, #+1808] +ldr q9, [x17, #+1824] +ldr q2, [x17, #+1840] +ldr q8, [x17, #+1856] +ldr q23, [x17, #+1872] +ldr q13, [x17, #+1888] +ldr q0, [x17, #+1904] +ldr q26, [x0, #864] +ldr q30, [x0, #880] +ldr q15, [x0, #832] +ldr q29, [x0, #848] +sqrdmulh v6.4S, v26.4S, v11.s[0] +mul v26.4S, v26.4S,v7.s[0] +mla v26.4S, v6.4S, v31.s[0] +sub v6.4s, v15.4s, v26.4s +add v15.4s, v15.4s, v26.4s +sqrdmulh v26.4S, v30.4S, v11.s[0] +mul v30.4S, v30.4S,v7.s[0] +mla v30.4S, v26.4S, v31.s[0] +sub v26.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +sqrdmulh v30.4S, v29.4S, v11.s[1] +mul v29.4S, v29.4S,v7.s[1] +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v15.4s, v29.4s +add v15.4s, v15.4s, v29.4s +sqrdmulh v29.4S, v26.4S, v11.s[2] +mul v26.4S, v26.4S,v7.s[2] +mla v26.4S, v29.4S, v31.s[0] +sub v29.4s, v6.4s, v26.4s +add v6.4s, v6.4s, v26.4s +trn1 v26.4S, v15.4S, v30.4S +trn2 v10.4S, v15.4S, v30.4S +trn1 v5.4S, v6.4S, v29.4S +trn2 v4.4S, v6.4S, v29.4S +trn2 
v6.2D, v26.2D, v5.2D +trn2 v29.2D, v10.2D, v4.2D +trn1 v15.2D, v26.2D, v5.2D +trn1 v30.2D, v10.2D, v4.2D +sqrdmulh v4.4S, v6.4S, v2.4S +mul v6.4S, v6.4S,v9.4S +mla v6.4S, v4.4S, v31.s[0] +sub v4.4s, v15.4s, v6.4s +add v15.4s, v15.4s, v6.4s +sqrdmulh v6.4S, v29.4S, v2.4S +mul v29.4S, v29.4S,v9.4S +mla v29.4S, v6.4S, v31.s[0] +sub v6.4s, v30.4s, v29.4s +add v30.4s, v30.4s, v29.4s +sqrdmulh v29.4S, v30.4S, v23.4S +mul v30.4S, v30.4S,v8.4S +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v15.4s, v30.4s +add v15.4s, v15.4s, v30.4s +sqrdmulh v30.4S, v6.4S, v0.4S +mul v6.4S, v6.4S,v13.4S +mla v6.4S, v30.4S, v31.s[0] +sub v30.4s, v4.4s, v6.4s +add v4.4s, v4.4s, v6.4s +str q15, [x0, #832] +str q29, [x0, #848] +str q4, [x0, #864] +str q30, [x0, #880] +ldr q30, [x17, #+1920] +ldr q4, [x17, #+1936] +ldr q29, [x17, #+1952] +ldr q15, [x17, #+1968] +ldr q6, [x17, #+1984] +ldr q10, [x17, #+2000] +ldr q5, [x17, #+2016] +ldr q26, [x17, #+2032] +ldr q0, [x0, #928] +ldr q13, [x0, #944] +ldr q23, [x0, #896] +ldr q8, [x0, #912] +sqrdmulh v2.4S, v0.4S, v4.s[0] +mul v0.4S, v0.4S,v30.s[0] +mla v0.4S, v2.4S, v31.s[0] +sub v2.4s, v23.4s, v0.4s +add v23.4s, v23.4s, v0.4s +sqrdmulh v0.4S, v13.4S, v4.s[0] +mul v13.4S, v13.4S,v30.s[0] +mla v13.4S, v0.4S, v31.s[0] +sub v0.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v4.s[1] +mul v8.4S, v8.4S,v30.s[1] +mla v8.4S, v13.4S, v31.s[0] +sub v13.4s, v23.4s, v8.4s +add v23.4s, v23.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v4.s[2] +mul v0.4S, v0.4S,v30.s[2] +mla v0.4S, v8.4S, v31.s[0] +sub v8.4s, v2.4s, v0.4s +add v2.4s, v2.4s, v0.4s +trn1 v0.4S, v23.4S, v13.4S +trn2 v9.4S, v23.4S, v13.4S +trn1 v11.4S, v2.4S, v8.4S +trn2 v7.4S, v2.4S, v8.4S +trn2 v2.2D, v0.2D, v11.2D +trn2 v8.2D, v9.2D, v7.2D +trn1 v23.2D, v0.2D, v11.2D +trn1 v13.2D, v9.2D, v7.2D +sqrdmulh v7.4S, v2.4S, v15.4S +mul v2.4S, v2.4S,v29.4S +mla v2.4S, v7.4S, v31.s[0] +sub v7.4s, v23.4s, v2.4s +add v23.4s, v23.4s, v2.4s +sqrdmulh v2.4S, v8.4S, v15.4S +mul v8.4S, v8.4S,v29.4S +mla 
v8.4S, v2.4S, v31.s[0] +sub v2.4s, v13.4s, v8.4s +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v13.4S, v10.4S +mul v13.4S, v13.4S,v6.4S +mla v13.4S, v8.4S, v31.s[0] +sub v8.4s, v23.4s, v13.4s +add v23.4s, v23.4s, v13.4s +sqrdmulh v13.4S, v2.4S, v26.4S +mul v2.4S, v2.4S,v5.4S +mla v2.4S, v13.4S, v31.s[0] +sub v13.4s, v7.4s, v2.4s +add v7.4s, v7.4s, v2.4s +str q23, [x0, #896] +str q8, [x0, #912] +str q7, [x0, #928] +str q13, [x0, #944] +ldr q13, [x17, #+2048] +ldr q7, [x17, #+2064] +ldr q8, [x17, #+2080] +ldr q23, [x17, #+2096] +ldr q2, [x17, #+2112] +ldr q9, [x17, #+2128] +ldr q11, [x17, #+2144] +ldr q0, [x17, #+2160] +ldr q26, [x0, #992] +ldr q5, [x0, #1008] +ldr q10, [x0, #960] +ldr q6, [x0, #976] +sqrdmulh v15.4S, v26.4S, v7.s[0] +mul v26.4S, v26.4S,v13.s[0] +mla v26.4S, v15.4S, v31.s[0] +sub v15.4s, v10.4s, v26.4s +add v10.4s, v10.4s, v26.4s +sqrdmulh v26.4S, v5.4S, v7.s[0] +mul v5.4S, v5.4S,v13.s[0] +mla v5.4S, v26.4S, v31.s[0] +sub v26.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v6.4S, v7.s[1] +mul v6.4S, v6.4S,v13.s[1] +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v10.4s, v6.4s +add v10.4s, v10.4s, v6.4s +sqrdmulh v6.4S, v26.4S, v7.s[2] +mul v26.4S, v26.4S,v13.s[2] +mla v26.4S, v6.4S, v31.s[0] +sub v6.4s, v15.4s, v26.4s +add v15.4s, v15.4s, v26.4s +trn1 v26.4S, v10.4S, v5.4S +trn2 v29.4S, v10.4S, v5.4S +trn1 v4.4S, v15.4S, v6.4S +trn2 v30.4S, v15.4S, v6.4S +trn2 v15.2D, v26.2D, v4.2D +trn2 v6.2D, v29.2D, v30.2D +trn1 v10.2D, v26.2D, v4.2D +trn1 v5.2D, v29.2D, v30.2D +sqrdmulh v30.4S, v15.4S, v23.4S +mul v15.4S, v15.4S,v8.4S +mla v15.4S, v30.4S, v31.s[0] +sub v30.4s, v10.4s, v15.4s +add v10.4s, v10.4s, v15.4s +sqrdmulh v15.4S, v6.4S, v23.4S +mul v6.4S, v6.4S,v8.4S +mla v6.4S, v15.4S, v31.s[0] +sub v15.4s, v5.4s, v6.4s +add v5.4s, v5.4s, v6.4s +sqrdmulh v6.4S, v5.4S, v9.4S +mul v5.4S, v5.4S,v2.4S +mla v5.4S, v6.4S, v31.s[0] +sub v6.4s, v10.4s, v5.4s +add v10.4s, v10.4s, v5.4s +sqrdmulh v5.4S, v15.4S, v0.4S +mul v15.4S, v15.4S,v11.4S +mla v15.4S, v5.4S, 
v31.s[0] +sub v5.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +str q10, [x0, #960] +str q6, [x0, #976] +str q30, [x0, #992] +str q5, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_7_0.s b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_7_0.s new file mode 100644 index 0000000..1628189 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_7_0.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 
11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // 
Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 
1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 
23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 
19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 
// Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 
1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 +.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // 
Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 
7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // 
Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_7_0 +.global _ntt_u32_full_neon_asm_var_4_4_7_0 +ntt_u32_full_neon_asm_var_4_4_7_0: +_ntt_u32_full_neon_asm_var_4_4_7_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #928] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #992] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q18, [x0, #800] +sqrdmulh v17.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +ldr q16, [x0, #864] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v22.4S, v21.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +mla v18.4S, 
v17.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +ldr q3, [x0, #544] +sqrdmulh v17.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q19, [x0, #608] +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q2, [x0, #672] +ldr q1, [x0, #416] +sqrdmulh v0.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v15.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #736] +ldr q14, [x0, #480] +sqrdmulh v13.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v12.4s, v14.4s, v20.4s +add v14.4s, v14.4s, v20.4s +ldr q20, [x0, #288] +mla v3.4S, v17.4S, v31.s[0] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v18.4s +mla v2.4S, v0.4S, v31.s[0] +mla v22.4S, v13.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +ldr q18, [x0, #352] +sqrdmulh v13.4S, v1.4S, v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v0.4s, v18.4s, v16.4s +sqrdmulh v17.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +add v18.4s, v18.4s, v16.4s +ldr q16, [x0, #32] +sqrdmulh v11.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v10.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #96] +sqrdmulh v9.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v8.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #160] +mla v1.4S, v13.4S, v31.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v2.4s +mla v20.4S, v11.4S, v31.s[0] +mla v18.4S, v9.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +ldr q2, [x0, #224] +sqrdmulh v9.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v11.4s, v2.4s, v22.4s +sqrdmulh v13.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v7.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +sub v6.4s, v2.4s, v14.4s +add v2.4s, v2.4s, v14.4s +mla v15.4S, v9.4S, v31.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v20.4s +mla v21.4S, v22.4S, v31.s[0] +mla v0.4S, v1.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh 
v20.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +sub v1.4s, v3.4s, v18.4s +sqrdmulh v22.4S, v6.4S, v27.s[1] +mul v6.4S, v6.4S,v28.s[1] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v27.s[0] +mul v19.4S, v19.4S,v28.s[0] +sub v9.4s, v17.4s, v15.4s +add v17.4s, v17.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v14.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +mla v7.4S, v20.4S, v31.s[0] +mla v6.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v21.4s +mla v19.4S, v18.4S, v31.s[0] +mla v2.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v15.4s, v8.4s, v0.4s +sqrdmulh v18.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +add v8.4s, v8.4s, v0.4s +sqrdmulh v0.4S, v9.4S, v27.s[3] +mul v9.4S, v9.4S,v28.s[3] +sub v20.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[3] +mul v14.4S, v14.4S,v28.s[3] +sub v12.4s, v1.4s, v6.4s +add v1.4s, v1.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v16.4s, v19.4s +mla v9.4S, v0.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v25.s[2] +mul v1.4S, v1.4S,v26.s[2] +sub v7.4s, v3.4s, v2.4s +sqrdmulh v0.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v7.4S, v25.s[1] +mul v7.4S, v7.4S,v26.s[1] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v25.s[0] +mul v3.4S, v3.4S,v26.s[0] +sub v6.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v1.4S, v19.4S, v31.s[0] +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v9.4s +mla v7.4S, v2.4S, v31.s[0] +mla v3.4S, v17.4S, v31.s[0] +add v22.4s, v22.4s, v9.4s +sqrdmulh v9.4S, v8.4S, v23.s[0] +mul v8.4S, v8.4S,v24.s[0] +sub v17.4s, v15.4s, v14.4s +sqrdmulh v2.4S, v6.4S, v23.s[1] +mul v6.4S, v6.4S,v24.s[1] +add v15.4s, v15.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v23.s[2] +mul v15.4S, v15.4S,v24.s[2] +sub v19.4s, v13.4s, v1.4s +add 
v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +sub v11.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +mla v8.4S, v9.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v18.4s, v7.4s +str q13, [x0, #288] +mla v15.4S, v14.4S, v31.s[0] +mla v17.4S, v1.4S, v31.s[0] +add v18.4s, v18.4s, v7.4s +str q19, [x0, #352] +ldr q19, [x0, #944] +sqrdmulh v7.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v1.4s, v16.4s, v3.4s +str q20, [x0, #416] +ldr q20, [x0, #1008] +sqrdmulh v14.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v3.4s +str q11, [x0, #480] +ldr q11, [x0, #816] +sqrdmulh v3.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +sub v13.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +ldr q8, [x0, #880] +sqrdmulh v9.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v12.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +mla v19.4S, v7.4S, v31.s[0] +mla v20.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v15.4s +str q18, [x0, #160] +mla v11.4S, v3.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +str q2, [x0, #224] +ldr q2, [x0, #560] +sqrdmulh v15.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v9.4s, v0.4s, v17.4s +str q16, [x0, #32] +ldr q16, [x0, #624] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +add v0.4s, v0.4s, v17.4s +str q1, [x0, #96] +ldr q1, [x0, #688] +ldr q17, [x0, #432] +sqrdmulh v18.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +sub v7.4s, v17.4s, v19.4s +add v17.4s, v17.4s, v19.4s +ldr q19, [x0, #752] +ldr q6, [x0, #496] +sqrdmulh v5.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v4.4s, v6.4s, v20.4s +add v6.4s, v6.4s, v20.4s +ldr q20, [x0, #304] +mla v2.4S, v15.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v11.4s +str q10, [x0, #544] +mla v1.4S, v18.4S, v31.s[0] +mla v19.4S, v5.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q13, [x0, #608] +ldr q13, [x0, #368] +sqrdmulh v11.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub 
v5.4s, v13.4s, v8.4s +str q21, [x0, #672] +sqrdmulh v21.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +add v13.4s, v13.4s, v8.4s +str q12, [x0, #736] +ldr q12, [x0, #48] +sqrdmulh v8.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v18.4s, v12.4s, v2.4s +add v12.4s, v12.4s, v2.4s +ldr q2, [x0, #112] +sqrdmulh v10.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v15.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +ldr q16, [x0, #176] +mla v17.4S, v11.4S, v31.s[0] +mla v6.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v1.4s +str q22, [x0, #800] +mla v20.4S, v8.4S, v31.s[0] +mla v13.4S, v10.4S, v31.s[0] +add v16.4s, v16.4s, v1.4s +str q14, [x0, #864] +ldr q14, [x0, #240] +sqrdmulh v1.4S, v7.4S, v29.s[2] +mul v7.4S, v7.4S,v30.s[2] +sub v10.4s, v14.4s, v19.4s +str q0, [x0, #928] +sqrdmulh v0.4S, v4.4S, v29.s[2] +mul v4.4S, v4.4S,v30.s[2] +add v14.4s, v14.4s, v19.4s +str q9, [x0, #992] +sqrdmulh v9.4S, v3.4S, v29.s[2] +mul v3.4S, v3.4S,v30.s[2] +sub v19.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v29.s[2] +mul v5.4S, v5.4S,v30.s[2] +sub v8.4s, v14.4s, v6.4s +add v14.4s, v14.4s, v6.4s +mla v7.4S, v1.4S, v31.s[0] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v20.4s +mla v3.4S, v9.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +sub v17.4s, v2.4s, v13.4s +sqrdmulh v9.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +add v2.4s, v2.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v27.s[0] +mul v16.4S, v16.4S,v28.s[0] +sub v1.4s, v21.4s, v7.4s +add v21.4s, v21.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v6.4s, v10.4s, v4.4s +add v10.4s, v10.4s, v4.4s +mla v19.4S, v20.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v3.4s +mla v16.4S, v13.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +sub v7.4s, v15.4s, v5.4s +sqrdmulh v13.4S, v10.4S, v27.s[2] 
+mul v10.4S, v10.4S,v28.s[2] +add v15.4s, v15.4s, v5.4s +sqrdmulh v5.4S, v1.4S, v27.s[3] +mul v1.4S, v1.4S,v28.s[3] +sub v20.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[3] +mul v6.4S, v6.4S,v28.s[3] +sub v4.4s, v17.4s, v8.4s +add v17.4s, v17.4s, v8.4s +mla v21.4S, v3.4S, v31.s[0] +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v16.4s +mla v1.4S, v5.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v25.s[2] +mul v17.4S, v17.4S,v26.s[2] +sub v19.4s, v2.4s, v14.4s +sqrdmulh v5.4S, v4.4S, v25.s[3] +mul v4.4S, v4.4S,v26.s[3] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v3.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +mla v17.4S, v16.4S, v31.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v9.4s, v1.4s +mla v19.4S, v14.4S, v31.s[0] +mla v2.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v23.s[0] +mul v15.4S, v15.4S,v24.s[0] +sub v21.4s, v7.4s, v6.4s +sqrdmulh v14.4S, v8.4S, v23.s[1] +mul v8.4S, v8.4S,v24.s[1] +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v7.4S, v23.s[2] +mul v7.4S, v7.4S,v24.s[2] +sub v16.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v23.s[3] +mul v21.4S, v21.4S,v24.s[3] +sub v10.4s, v20.4s, v4.4s +add v20.4s, v20.4s, v4.4s +mla v15.4S, v1.4S, v31.s[0] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v19.4s +str q0, [x0, #304] +mla v7.4S, v6.4S, v31.s[0] +mla v21.4S, v17.4S, v31.s[0] +add v13.4s, v13.4s, v19.4s +str q16, [x0, #368] +ldr q16, [x0, #896] +sqrdmulh v19.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v17.4s, v12.4s, v2.4s +str q20, [x0, #432] +ldr q20, [x0, #960] +sqrdmulh v6.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v12.4s, v12.4s, v2.4s +str q10, [x0, #496] +ldr q10, [x0, #768] +sqrdmulh v2.4S, v10.4S, v29.s[0] +mul v10.4S, 
v10.4S,v30.s[0] +sub v0.4s, v18.4s, v15.4s +add v18.4s, v18.4s, v15.4s +ldr q15, [x0, #832] +sqrdmulh v1.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +sub v4.4s, v3.4s, v8.4s +add v3.4s, v3.4s, v8.4s +mla v16.4S, v19.4S, v31.s[0] +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v7.4s +str q13, [x0, #176] +mla v10.4S, v2.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v7.4s +str q14, [x0, #240] +ldr q14, [x0, #512] +sqrdmulh v7.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v1.4s, v5.4s, v21.4s +str q12, [x0, #48] +ldr q12, [x0, #576] +sqrdmulh v2.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +add v5.4s, v5.4s, v21.4s +str q17, [x0, #112] +ldr q17, [x0, #640] +ldr q21, [x0, #384] +sqrdmulh v13.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] +sub v19.4s, v21.4s, v16.4s +add v21.4s, v21.4s, v16.4s +ldr q16, [x0, #704] +ldr q8, [x0, #448] +sqrdmulh v22.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v11.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +ldr q20, [x0, #256] +mla v14.4S, v7.4S, v31.s[0] +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v10.4s +str q18, [x0, #560] +mla v17.4S, v13.4S, v31.s[0] +mla v16.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +str q0, [x0, #624] +ldr q0, [x0, #320] +sqrdmulh v10.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v22.4s, v0.4s, v15.4s +str q3, [x0, #688] +sqrdmulh v3.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +add v0.4s, v0.4s, v15.4s +str q4, [x0, #752] +ldr q4, [x0, #0] +sqrdmulh v15.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v13.4s, v4.4s, v14.4s +add v4.4s, v4.4s, v14.4s +ldr q14, [x0, #64] +sqrdmulh v18.4S, v0.4S, v29.s[1] +mul v0.4S, v0.4S,v30.s[1] +sub v7.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +ldr q12, [x0, #128] +mla v21.4S, v10.4S, v31.s[0] +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v12.4s, v17.4s +str q9, [x0, #816] +mla v20.4S, v15.4S, v31.s[0] +mla v0.4S, v18.4S, v31.s[0] +add v12.4s, v12.4s, v17.4s +str q6, [x0, #880] +ldr q6, [x0, #192] 
+sqrdmulh v17.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v18.4s, v6.4s, v16.4s +str q5, [x0, #944] +sqrdmulh v5.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +add v6.4s, v6.4s, v16.4s +str q1, [x0, #1008] +sqrdmulh v1.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v16.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v15.4s, v6.4s, v8.4s +add v6.4s, v6.4s, v8.4s +mla v19.4S, v17.4S, v31.s[0] +mla v11.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v20.4s +mla v2.4S, v1.4S, v31.s[0] +mla v22.4S, v21.4S, v31.s[0] +add v4.4s, v4.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +sub v21.4s, v14.4s, v0.4s +sqrdmulh v1.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v17.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[0] +mul v6.4S, v6.4S,v28.s[0] +sub v8.4s, v18.4s, v11.4s +add v18.4s, v18.4s, v11.4s +mla v16.4S, v20.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v2.4s +mla v12.4S, v0.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v27.s[2] +mul v3.4S, v3.4S,v28.s[2] +sub v19.4s, v7.4s, v22.4s +sqrdmulh v0.4S, v18.4S, v27.s[2] +mul v18.4S, v18.4S,v28.s[2] +add v7.4s, v7.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +sub v20.4s, v5.4s, v16.4s +add v5.4s, v5.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v11.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +mla v3.4S, v2.4S, v31.s[0] +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v4.4s, v12.4s +mla v17.4S, v22.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v4.4s, v4.4s, v12.4s +sqrdmulh v12.4S, v21.4S, v25.s[2] +mul v21.4S, v21.4S,v26.s[2] +sub v16.4s, v14.4s, v6.4s +sqrdmulh v22.4S, v11.4S, v25.s[3] +mul v11.4S, v11.4S,v26.s[3] +add v14.4s, v14.4s, v6.4s +sqrdmulh v6.4S, v16.4S, v25.s[1] 
+mul v16.4S, v16.4S,v26.s[1] +sub v2.4s, v13.4s, v3.4s +add v13.4s, v13.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v25.s[0] +mul v14.4S, v14.4S,v26.s[0] +sub v15.4s, v7.4s, v18.4s +add v7.4s, v7.4s, v18.4s +mla v21.4S, v12.4S, v31.s[0] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v17.4s +mla v16.4S, v6.4S, v31.s[0] +mla v14.4S, v3.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v7.4S, v23.s[0] +mul v7.4S, v7.4S,v24.s[0] +sub v3.4s, v19.4s, v8.4s +sqrdmulh v6.4S, v15.4S, v23.s[1] +mul v15.4S, v15.4S,v24.s[1] +add v19.4s, v19.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v23.s[2] +mul v19.4S, v19.4S,v24.s[2] +sub v12.4s, v5.4s, v21.4s +add v5.4s, v5.4s, v21.4s +sqrdmulh v21.4S, v3.4S, v23.s[3] +mul v3.4S, v3.4S,v24.s[3] +sub v18.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +mla v7.4S, v17.4S, v31.s[0] +mla v15.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v16.4s +str q5, [x0, #256] +mla v19.4S, v8.4S, v31.s[0] +mla v3.4S, v21.4S, v31.s[0] +add v0.4s, v0.4s, v16.4s +str q12, [x0, #320] +ldr q12, [x0, #912] +sqrdmulh v16.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v21.4s, v4.4s, v14.4s +str q20, [x0, #384] +ldr q20, [x0, #976] +sqrdmulh v8.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v4.4s, v4.4s, v14.4s +str q18, [x0, #448] +ldr q18, [x0, #784] +sqrdmulh v14.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +sub v5.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +ldr q7, [x0, #848] +sqrdmulh v17.4S, v7.4S, v29.s[0] +mul v7.4S, v7.4S,v30.s[0] +sub v11.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +mla v12.4S, v16.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v1.4s, v19.4s +str q0, [x0, #128] +mla v18.4S, v14.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v19.4s +str q6, [x0, #192] +ldr q6, [x0, #528] +sqrdmulh v19.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v17.4s, v22.4s, v3.4s +str q4, [x0, #0] +ldr q4, [x0, #592] +sqrdmulh v14.4S, v4.4S, v29.s[0] +mul v4.4S, v4.4S,v30.s[0] +add v22.4s, v22.4s, v3.4s +str q21, [x0, #64] +ldr q21, 
[x0, #656] +ldr q3, [x0, #400] +sqrdmulh v0.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v16.4s, v3.4s, v12.4s +add v3.4s, v3.4s, v12.4s +ldr q12, [x0, #720] +ldr q15, [x0, #464] +sqrdmulh v9.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v10.4s, v15.4s, v20.4s +add v15.4s, v15.4s, v20.4s +ldr q20, [x0, #272] +mla v6.4S, v19.4S, v31.s[0] +mla v4.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v18.4s +str q13, [x0, #512] +mla v21.4S, v0.4S, v31.s[0] +mla v12.4S, v9.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +str q5, [x0, #576] +ldr q5, [x0, #336] +sqrdmulh v18.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v9.4s, v5.4s, v7.4s +str q2, [x0, #640] +sqrdmulh v2.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +add v5.4s, v5.4s, v7.4s +str q11, [x0, #704] +ldr q11, [x0, #16] +sqrdmulh v7.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v0.4s, v11.4s, v6.4s +add v11.4s, v11.4s, v6.4s +ldr q6, [x0, #80] +sqrdmulh v13.4S, v5.4S, v29.s[1] +mul v5.4S, v5.4S,v30.s[1] +sub v19.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +ldr q4, [x0, #144] +mla v3.4S, v18.4S, v31.s[0] +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v4.4s, v21.4s +str q1, [x0, #768] +mla v20.4S, v7.4S, v31.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v4.4s, v4.4s, v21.4s +str q8, [x0, #832] +ldr q8, [x0, #208] +sqrdmulh v21.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +sub v13.4s, v8.4s, v12.4s +str q22, [x0, #896] +sqrdmulh v22.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +add v8.4s, v8.4s, v12.4s +str q17, [x0, #960] +sqrdmulh v17.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v12.4s, v4.4s, v3.4s +add v4.4s, v4.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v7.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +mla v16.4S, v21.4S, v31.s[0] +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v20.4s +mla v14.4S, v17.4S, v31.s[0] +mla v9.4S, v3.4S, v31.s[0] +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v12.4S, v27.s[1] +mul v12.4S, v12.4S,v28.s[1] +sub v3.4s, v6.4s, 
v5.4s +sqrdmulh v17.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v27.s[0] +mul v4.4S, v4.4S,v28.s[0] +sub v21.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[0] +mul v8.4S, v8.4S,v28.s[0] +sub v15.4s, v13.4s, v10.4s +add v13.4s, v13.4s, v10.4s +mla v12.4S, v20.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v14.4s +mla v4.4S, v5.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v16.4s, v19.4s, v9.4s +sqrdmulh v5.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v10.4s, v3.4s, v7.4s +add v3.4s, v3.4s, v7.4s +mla v2.4S, v14.4S, v31.s[0] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v4.4s +mla v21.4S, v9.4S, v31.s[0] +mla v15.4S, v12.4S, v31.s[0] +add v11.4s, v11.4s, v4.4s +sqrdmulh v4.4S, v3.4S, v25.s[2] +mul v3.4S, v3.4S,v26.s[2] +sub v12.4s, v6.4s, v8.4s +sqrdmulh v9.4S, v10.4S, v25.s[3] +mul v10.4S, v10.4S,v26.s[3] +add v6.4s, v6.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +sub v14.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v6.4S, v25.s[0] +mul v6.4S, v6.4S,v26.s[0] +sub v7.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +mla v3.4S, v4.4S, v31.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v21.4s +mla v12.4S, v8.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v23.s[0] +mul v19.4S, v19.4S,v24.s[0] +sub v2.4s, v16.4s, v15.4s +sqrdmulh v8.4S, v7.4S, v23.s[1] +mul v7.4S, v7.4S,v24.s[1] +add v16.4s, v16.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +sub v4.4s, v22.4s, v3.4s +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v2.4S, v23.s[3] +mul v2.4S, 
v2.4S,v24.s[3] +sub v13.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +mla v19.4S, v21.4S, v31.s[0] +mla v7.4S, v8.4S, v31.s[0] +sub v8.4s, v5.4s, v12.4s +str q22, [x0, #272] +mla v16.4S, v15.4S, v31.s[0] +mla v2.4S, v3.4S, v31.s[0] +add v5.4s, v5.4s, v12.4s +str q4, [x0, #336] +sub v23.4s, v11.4s, v6.4s +str q20, [x0, #400] +add v11.4s, v11.4s, v6.4s +str q13, [x0, #464] +sub v13.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sub v19.4s, v14.4s, v7.4s +add v14.4s, v14.4s, v7.4s +sub v7.4s, v17.4s, v16.4s +str q5, [x0, #144] +add v17.4s, v17.4s, v16.4s +str q8, [x0, #208] +sub v8.4s, v9.4s, v2.4s +str q11, [x0, #16] +add v9.4s, v9.4s, v2.4s +str q23, [x0, #80] +str q0, [x0, #528] +str q13, [x0, #592] +str q14, [x0, #656] +str q19, [x0, #720] +str q17, [x0, #784] +str q7, [x0, #848] +str q9, [x0, #912] +str q8, [x0, #976] +ldr q18, [x17, #+128] +ldr q1, [x17, #+144] +ldr q10, [x17, #+160] +ldr q21, [x17, #+176] +ldr q22, [x17, #+192] +ldr q15, [x17, #+208] +ldr q3, [x17, #+224] +ldr q12, [x17, #+240] +ldr q4, [x0, #32] +ldr q30, [x0, #48] +ldr q29, [x0, #0] +ldr q28, [x0, #16] +sqrdmulh v27.4S, v4.4S, v1.s[0] +mul v4.4S, v4.4S,v18.s[0] +mla v4.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v4.4s +add v29.4s, v29.4s, v4.4s +sqrdmulh v4.4S, v30.4S, v1.s[0] +mul v30.4S, v30.4S,v18.s[0] +mla v30.4S, v4.4S, v31.s[0] +sub v4.4s, v28.4s, v30.4s +add v28.4s, v28.4s, v30.4s +sqrdmulh v30.4S, v28.4S, v1.s[1] +mul v28.4S, v28.4S,v18.s[1] +mla v28.4S, v30.4S, v31.s[0] +sub v30.4s, v29.4s, v28.4s +add v29.4s, v29.4s, v28.4s +sqrdmulh v28.4S, v4.4S, v1.s[2] +mul v4.4S, v4.4S,v18.s[2] +mla v4.4S, v28.4S, v31.s[0] +sub v28.4s, v27.4s, v4.4s +add v27.4s, v27.4s, v4.4s +trn1 v4.4S, v29.4S, v30.4S +trn2 v26.4S, v29.4S, v30.4S +trn1 v25.4S, v27.4S, v28.4S +trn2 v24.4S, v27.4S, v28.4S +trn2 v27.2D, v4.2D, v25.2D +trn2 v28.2D, v26.2D, v24.2D +trn1 v29.2D, v4.2D, v25.2D +trn1 v30.2D, v26.2D, v24.2D +sqrdmulh v24.4S, v27.4S, v21.4S +mul v27.4S, v27.4S,v10.4S +mla v27.4S, v24.4S, v31.s[0] +sub 
v24.4s, v29.4s, v27.4s +add v29.4s, v29.4s, v27.4s +sqrdmulh v27.4S, v28.4S, v21.4S +mul v28.4S, v28.4S,v10.4S +mla v28.4S, v27.4S, v31.s[0] +sub v27.4s, v30.4s, v28.4s +add v30.4s, v30.4s, v28.4s +sqrdmulh v28.4S, v30.4S, v15.4S +mul v30.4S, v30.4S,v22.4S +mla v30.4S, v28.4S, v31.s[0] +sub v28.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +sqrdmulh v30.4S, v27.4S, v12.4S +mul v27.4S, v27.4S,v3.4S +mla v27.4S, v30.4S, v31.s[0] +sub v30.4s, v24.4s, v27.4s +add v24.4s, v24.4s, v27.4s +str q29, [x0, #0] +str q28, [x0, #16] +str q24, [x0, #32] +str q30, [x0, #48] +ldr q30, [x17, #+256] +ldr q24, [x17, #+272] +ldr q28, [x17, #+288] +ldr q29, [x17, #+304] +ldr q27, [x17, #+320] +ldr q26, [x17, #+336] +ldr q25, [x17, #+352] +ldr q4, [x17, #+368] +ldr q12, [x0, #96] +ldr q3, [x0, #112] +ldr q15, [x0, #64] +ldr q22, [x0, #80] +sqrdmulh v21.4S, v12.4S, v24.s[0] +mul v12.4S, v12.4S,v30.s[0] +mla v12.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v12.4s +add v15.4s, v15.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v24.s[0] +mul v3.4S, v3.4S,v30.s[0] +mla v3.4S, v12.4S, v31.s[0] +sub v12.4s, v22.4s, v3.4s +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v22.4S, v24.s[1] +mul v22.4S, v22.4S,v30.s[1] +mla v22.4S, v3.4S, v31.s[0] +sub v3.4s, v15.4s, v22.4s +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v12.4S, v24.s[2] +mul v12.4S, v12.4S,v30.s[2] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +trn1 v12.4S, v15.4S, v3.4S +trn2 v10.4S, v15.4S, v3.4S +trn1 v1.4S, v21.4S, v22.4S +trn2 v18.4S, v21.4S, v22.4S +trn2 v21.2D, v12.2D, v1.2D +trn2 v22.2D, v10.2D, v18.2D +trn1 v15.2D, v12.2D, v1.2D +trn1 v3.2D, v10.2D, v18.2D +sqrdmulh v18.4S, v21.4S, v29.4S +mul v21.4S, v21.4S,v28.4S +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v15.4s, v21.4s +add v15.4s, v15.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.4S +mul v22.4S, v22.4S,v28.4S +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v3.4s, v22.4s +add v3.4s, v3.4s, v22.4s +sqrdmulh v22.4S, v3.4S, v26.4S +mul v3.4S, v3.4S,v27.4S +mla 
v3.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v3.4s +add v15.4s, v15.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v4.4S +mul v21.4S, v21.4S,v25.4S +mla v21.4S, v3.4S, v31.s[0] +sub v3.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +str q15, [x0, #64] +str q22, [x0, #80] +str q18, [x0, #96] +str q3, [x0, #112] +ldr q3, [x17, #+384] +ldr q18, [x17, #+400] +ldr q22, [x17, #+416] +ldr q15, [x17, #+432] +ldr q21, [x17, #+448] +ldr q10, [x17, #+464] +ldr q1, [x17, #+480] +ldr q12, [x17, #+496] +ldr q4, [x0, #160] +ldr q25, [x0, #176] +ldr q26, [x0, #128] +ldr q27, [x0, #144] +sqrdmulh v29.4S, v4.4S, v18.s[0] +mul v4.4S, v4.4S,v3.s[0] +mla v4.4S, v29.4S, v31.s[0] +sub v29.4s, v26.4s, v4.4s +add v26.4s, v26.4s, v4.4s +sqrdmulh v4.4S, v25.4S, v18.s[0] +mul v25.4S, v25.4S,v3.s[0] +mla v25.4S, v4.4S, v31.s[0] +sub v4.4s, v27.4s, v25.4s +add v27.4s, v27.4s, v25.4s +sqrdmulh v25.4S, v27.4S, v18.s[1] +mul v27.4S, v27.4S,v3.s[1] +mla v27.4S, v25.4S, v31.s[0] +sub v25.4s, v26.4s, v27.4s +add v26.4s, v26.4s, v27.4s +sqrdmulh v27.4S, v4.4S, v18.s[2] +mul v4.4S, v4.4S,v3.s[2] +mla v4.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v4.4s +add v29.4s, v29.4s, v4.4s +trn1 v4.4S, v26.4S, v25.4S +trn2 v28.4S, v26.4S, v25.4S +trn1 v24.4S, v29.4S, v27.4S +trn2 v30.4S, v29.4S, v27.4S +trn2 v29.2D, v4.2D, v24.2D +trn2 v27.2D, v28.2D, v30.2D +trn1 v26.2D, v4.2D, v24.2D +trn1 v25.2D, v28.2D, v30.2D +sqrdmulh v30.4S, v29.4S, v15.4S +mul v29.4S, v29.4S,v22.4S +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v26.4s, v29.4s +add v26.4s, v26.4s, v29.4s +sqrdmulh v29.4S, v27.4S, v15.4S +mul v27.4S, v27.4S,v22.4S +mla v27.4S, v29.4S, v31.s[0] +sub v29.4s, v25.4s, v27.4s +add v25.4s, v25.4s, v27.4s +sqrdmulh v27.4S, v25.4S, v10.4S +mul v25.4S, v25.4S,v21.4S +mla v25.4S, v27.4S, v31.s[0] +sub v27.4s, v26.4s, v25.4s +add v26.4s, v26.4s, v25.4s +sqrdmulh v25.4S, v29.4S, v12.4S +mul v29.4S, v29.4S,v1.4S +mla v29.4S, v25.4S, v31.s[0] +sub v25.4s, v30.4s, v29.4s +add v30.4s, v30.4s, v29.4s +str q26, [x0, #128] +str q27, [x0, #144] 
+str q30, [x0, #160] +str q25, [x0, #176] +ldr q25, [x17, #+512] +ldr q30, [x17, #+528] +ldr q27, [x17, #+544] +ldr q26, [x17, #+560] +ldr q29, [x17, #+576] +ldr q28, [x17, #+592] +ldr q24, [x17, #+608] +ldr q4, [x17, #+624] +ldr q12, [x0, #224] +ldr q1, [x0, #240] +ldr q10, [x0, #192] +ldr q21, [x0, #208] +sqrdmulh v15.4S, v12.4S, v30.s[0] +mul v12.4S, v12.4S,v25.s[0] +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v1.4S, v30.s[0] +mul v1.4S, v1.4S,v25.s[0] +mla v1.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v1.4s +add v21.4s, v21.4s, v1.4s +sqrdmulh v1.4S, v21.4S, v30.s[1] +mul v21.4S, v21.4S,v25.s[1] +mla v21.4S, v1.4S, v31.s[0] +sub v1.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v12.4S, v30.s[2] +mul v12.4S, v12.4S,v25.s[2] +mla v12.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v12.4s +add v15.4s, v15.4s, v12.4s +trn1 v12.4S, v10.4S, v1.4S +trn2 v22.4S, v10.4S, v1.4S +trn1 v18.4S, v15.4S, v21.4S +trn2 v3.4S, v15.4S, v21.4S +trn2 v15.2D, v12.2D, v18.2D +trn2 v21.2D, v22.2D, v3.2D +trn1 v10.2D, v12.2D, v18.2D +trn1 v1.2D, v22.2D, v3.2D +sqrdmulh v3.4S, v15.4S, v26.4S +mul v15.4S, v15.4S,v27.4S +mla v15.4S, v3.4S, v31.s[0] +sub v3.4s, v10.4s, v15.4s +add v10.4s, v10.4s, v15.4s +sqrdmulh v15.4S, v21.4S, v26.4S +mul v21.4S, v21.4S,v27.4S +mla v21.4S, v15.4S, v31.s[0] +sub v15.4s, v1.4s, v21.4s +add v1.4s, v1.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v28.4S +mul v1.4S, v1.4S,v29.4S +mla v1.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v1.4s +add v10.4s, v10.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v4.4S +mul v15.4S, v15.4S,v24.4S +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v3.4s, v15.4s +add v3.4s, v3.4s, v15.4s +str q10, [x0, #192] +str q21, [x0, #208] +str q3, [x0, #224] +str q1, [x0, #240] +ldr q1, [x17, #+640] +ldr q3, [x17, #+656] +ldr q21, [x17, #+672] +ldr q10, [x17, #+688] +ldr q15, [x17, #+704] +ldr q22, [x17, #+720] +ldr q18, [x17, #+736] +ldr q12, [x17, #+752] +ldr q4, [x0, #288] +ldr q24, [x0, #304] +ldr 
q28, [x0, #256] +ldr q29, [x0, #272] +sqrdmulh v26.4S, v4.4S, v3.s[0] +mul v4.4S, v4.4S,v1.s[0] +mla v4.4S, v26.4S, v31.s[0] +sub v26.4s, v28.4s, v4.4s +add v28.4s, v28.4s, v4.4s +sqrdmulh v4.4S, v24.4S, v3.s[0] +mul v24.4S, v24.4S,v1.s[0] +mla v24.4S, v4.4S, v31.s[0] +sub v4.4s, v29.4s, v24.4s +add v29.4s, v29.4s, v24.4s +sqrdmulh v24.4S, v29.4S, v3.s[1] +mul v29.4S, v29.4S,v1.s[1] +mla v29.4S, v24.4S, v31.s[0] +sub v24.4s, v28.4s, v29.4s +add v28.4s, v28.4s, v29.4s +sqrdmulh v29.4S, v4.4S, v3.s[2] +mul v4.4S, v4.4S,v1.s[2] +mla v4.4S, v29.4S, v31.s[0] +sub v29.4s, v26.4s, v4.4s +add v26.4s, v26.4s, v4.4s +trn1 v4.4S, v28.4S, v24.4S +trn2 v27.4S, v28.4S, v24.4S +trn1 v30.4S, v26.4S, v29.4S +trn2 v25.4S, v26.4S, v29.4S +trn2 v26.2D, v4.2D, v30.2D +trn2 v29.2D, v27.2D, v25.2D +trn1 v28.2D, v4.2D, v30.2D +trn1 v24.2D, v27.2D, v25.2D +sqrdmulh v25.4S, v26.4S, v10.4S +mul v26.4S, v26.4S,v21.4S +mla v26.4S, v25.4S, v31.s[0] +sub v25.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sqrdmulh v26.4S, v29.4S, v10.4S +mul v29.4S, v29.4S,v21.4S +mla v29.4S, v26.4S, v31.s[0] +sub v26.4s, v24.4s, v29.4s +add v24.4s, v24.4s, v29.4s +sqrdmulh v29.4S, v24.4S, v22.4S +mul v24.4S, v24.4S,v15.4S +mla v24.4S, v29.4S, v31.s[0] +sub v29.4s, v28.4s, v24.4s +add v28.4s, v28.4s, v24.4s +sqrdmulh v24.4S, v26.4S, v12.4S +mul v26.4S, v26.4S,v18.4S +mla v26.4S, v24.4S, v31.s[0] +sub v24.4s, v25.4s, v26.4s +add v25.4s, v25.4s, v26.4s +str q28, [x0, #256] +str q29, [x0, #272] +str q25, [x0, #288] +str q24, [x0, #304] +ldr q24, [x17, #+768] +ldr q25, [x17, #+784] +ldr q29, [x17, #+800] +ldr q28, [x17, #+816] +ldr q26, [x17, #+832] +ldr q27, [x17, #+848] +ldr q30, [x17, #+864] +ldr q4, [x17, #+880] +ldr q12, [x0, #352] +ldr q18, [x0, #368] +ldr q22, [x0, #320] +ldr q15, [x0, #336] +sqrdmulh v10.4S, v12.4S, v25.s[0] +mul v12.4S, v12.4S,v24.s[0] +mla v12.4S, v10.4S, v31.s[0] +sub v10.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v18.4S, v25.s[0] +mul v18.4S, v18.4S,v24.s[0] +mla 
v18.4S, v12.4S, v31.s[0] +sub v12.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sqrdmulh v18.4S, v15.4S, v25.s[1] +mul v15.4S, v15.4S,v24.s[1] +mla v15.4S, v18.4S, v31.s[0] +sub v18.4s, v22.4s, v15.4s +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v12.4S, v25.s[2] +mul v12.4S, v12.4S,v24.s[2] +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +trn1 v12.4S, v22.4S, v18.4S +trn2 v21.4S, v22.4S, v18.4S +trn1 v3.4S, v10.4S, v15.4S +trn2 v1.4S, v10.4S, v15.4S +trn2 v10.2D, v12.2D, v3.2D +trn2 v15.2D, v21.2D, v1.2D +trn1 v22.2D, v12.2D, v3.2D +trn1 v18.2D, v21.2D, v1.2D +sqrdmulh v1.4S, v10.4S, v28.4S +mul v10.4S, v10.4S,v29.4S +mla v10.4S, v1.4S, v31.s[0] +sub v1.4s, v22.4s, v10.4s +add v22.4s, v22.4s, v10.4s +sqrdmulh v10.4S, v15.4S, v28.4S +mul v15.4S, v15.4S,v29.4S +mla v15.4S, v10.4S, v31.s[0] +sub v10.4s, v18.4s, v15.4s +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v18.4S, v27.4S +mul v18.4S, v18.4S,v26.4S +mla v18.4S, v15.4S, v31.s[0] +sub v15.4s, v22.4s, v18.4s +add v22.4s, v22.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v4.4S +mul v10.4S, v10.4S,v30.4S +mla v10.4S, v18.4S, v31.s[0] +sub v18.4s, v1.4s, v10.4s +add v1.4s, v1.4s, v10.4s +str q22, [x0, #320] +str q15, [x0, #336] +str q1, [x0, #352] +str q18, [x0, #368] +ldr q18, [x17, #+896] +ldr q1, [x17, #+912] +ldr q15, [x17, #+928] +ldr q22, [x17, #+944] +ldr q10, [x17, #+960] +ldr q21, [x17, #+976] +ldr q3, [x17, #+992] +ldr q12, [x17, #+1008] +ldr q4, [x0, #416] +ldr q30, [x0, #432] +ldr q27, [x0, #384] +ldr q26, [x0, #400] +sqrdmulh v28.4S, v4.4S, v1.s[0] +mul v4.4S, v4.4S,v18.s[0] +mla v4.4S, v28.4S, v31.s[0] +sub v28.4s, v27.4s, v4.4s +add v27.4s, v27.4s, v4.4s +sqrdmulh v4.4S, v30.4S, v1.s[0] +mul v30.4S, v30.4S,v18.s[0] +mla v30.4S, v4.4S, v31.s[0] +sub v4.4s, v26.4s, v30.4s +add v26.4s, v26.4s, v30.4s +sqrdmulh v30.4S, v26.4S, v1.s[1] +mul v26.4S, v26.4S,v18.s[1] +mla v26.4S, v30.4S, v31.s[0] +sub v30.4s, v27.4s, v26.4s +add v27.4s, v27.4s, v26.4s +sqrdmulh v26.4S, v4.4S, 
v1.s[2] +mul v4.4S, v4.4S,v18.s[2] +mla v4.4S, v26.4S, v31.s[0] +sub v26.4s, v28.4s, v4.4s +add v28.4s, v28.4s, v4.4s +trn1 v4.4S, v27.4S, v30.4S +trn2 v29.4S, v27.4S, v30.4S +trn1 v25.4S, v28.4S, v26.4S +trn2 v24.4S, v28.4S, v26.4S +trn2 v28.2D, v4.2D, v25.2D +trn2 v26.2D, v29.2D, v24.2D +trn1 v27.2D, v4.2D, v25.2D +trn1 v30.2D, v29.2D, v24.2D +sqrdmulh v24.4S, v28.4S, v22.4S +mul v28.4S, v28.4S,v15.4S +mla v28.4S, v24.4S, v31.s[0] +sub v24.4s, v27.4s, v28.4s +add v27.4s, v27.4s, v28.4s +sqrdmulh v28.4S, v26.4S, v22.4S +mul v26.4S, v26.4S,v15.4S +mla v26.4S, v28.4S, v31.s[0] +sub v28.4s, v30.4s, v26.4s +add v30.4s, v30.4s, v26.4s +sqrdmulh v26.4S, v30.4S, v21.4S +mul v30.4S, v30.4S,v10.4S +mla v30.4S, v26.4S, v31.s[0] +sub v26.4s, v27.4s, v30.4s +add v27.4s, v27.4s, v30.4s +sqrdmulh v30.4S, v28.4S, v12.4S +mul v28.4S, v28.4S,v3.4S +mla v28.4S, v30.4S, v31.s[0] +sub v30.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +str q27, [x0, #384] +str q26, [x0, #400] +str q24, [x0, #416] +str q30, [x0, #432] +ldr q30, [x17, #+1024] +ldr q24, [x17, #+1040] +ldr q26, [x17, #+1056] +ldr q27, [x17, #+1072] +ldr q28, [x17, #+1088] +ldr q29, [x17, #+1104] +ldr q25, [x17, #+1120] +ldr q4, [x17, #+1136] +ldr q12, [x0, #480] +ldr q3, [x0, #496] +ldr q21, [x0, #448] +ldr q10, [x0, #464] +sqrdmulh v22.4S, v12.4S, v24.s[0] +mul v12.4S, v12.4S,v30.s[0] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v24.s[0] +mul v3.4S, v3.4S,v30.s[0] +mla v3.4S, v12.4S, v31.s[0] +sub v12.4s, v10.4s, v3.4s +add v10.4s, v10.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v24.s[1] +mul v10.4S, v10.4S,v30.s[1] +mla v10.4S, v3.4S, v31.s[0] +sub v3.4s, v21.4s, v10.4s +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v12.4S, v24.s[2] +mul v12.4S, v12.4S,v30.s[2] +mla v12.4S, v10.4S, v31.s[0] +sub v10.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +trn1 v12.4S, v21.4S, v3.4S +trn2 v15.4S, v21.4S, v3.4S +trn1 v1.4S, v22.4S, v10.4S +trn2 v18.4S, v22.4S, v10.4S +trn2 
v22.2D, v12.2D, v1.2D +trn2 v10.2D, v15.2D, v18.2D +trn1 v21.2D, v12.2D, v1.2D +trn1 v3.2D, v15.2D, v18.2D +sqrdmulh v18.4S, v22.4S, v27.4S +mul v22.4S, v22.4S,v26.4S +mla v22.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v10.4S, v27.4S +mul v10.4S, v10.4S,v26.4S +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v3.4s, v10.4s +add v3.4s, v3.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v29.4S +mul v3.4S, v3.4S,v28.4S +mla v3.4S, v10.4S, v31.s[0] +sub v10.4s, v21.4s, v3.4s +add v21.4s, v21.4s, v3.4s +sqrdmulh v3.4S, v22.4S, v4.4S +mul v22.4S, v22.4S,v25.4S +mla v22.4S, v3.4S, v31.s[0] +sub v3.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +str q21, [x0, #448] +str q10, [x0, #464] +str q18, [x0, #480] +str q3, [x0, #496] +ldr q3, [x17, #+1152] +ldr q18, [x17, #+1168] +ldr q10, [x17, #+1184] +ldr q21, [x17, #+1200] +ldr q22, [x17, #+1216] +ldr q15, [x17, #+1232] +ldr q1, [x17, #+1248] +ldr q12, [x17, #+1264] +ldr q4, [x0, #544] +ldr q25, [x0, #560] +ldr q29, [x0, #512] +ldr q28, [x0, #528] +sqrdmulh v27.4S, v4.4S, v18.s[0] +mul v4.4S, v4.4S,v3.s[0] +mla v4.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v4.4s +add v29.4s, v29.4s, v4.4s +sqrdmulh v4.4S, v25.4S, v18.s[0] +mul v25.4S, v25.4S,v3.s[0] +mla v25.4S, v4.4S, v31.s[0] +sub v4.4s, v28.4s, v25.4s +add v28.4s, v28.4s, v25.4s +sqrdmulh v25.4S, v28.4S, v18.s[1] +mul v28.4S, v28.4S,v3.s[1] +mla v28.4S, v25.4S, v31.s[0] +sub v25.4s, v29.4s, v28.4s +add v29.4s, v29.4s, v28.4s +sqrdmulh v28.4S, v4.4S, v18.s[2] +mul v4.4S, v4.4S,v3.s[2] +mla v4.4S, v28.4S, v31.s[0] +sub v28.4s, v27.4s, v4.4s +add v27.4s, v27.4s, v4.4s +trn1 v4.4S, v29.4S, v25.4S +trn2 v26.4S, v29.4S, v25.4S +trn1 v24.4S, v27.4S, v28.4S +trn2 v30.4S, v27.4S, v28.4S +trn2 v27.2D, v4.2D, v24.2D +trn2 v28.2D, v26.2D, v30.2D +trn1 v29.2D, v4.2D, v24.2D +trn1 v25.2D, v26.2D, v30.2D +sqrdmulh v30.4S, v27.4S, v21.4S +mul v27.4S, v27.4S,v10.4S +mla v27.4S, v30.4S, v31.s[0] +sub v30.4s, v29.4s, v27.4s +add v29.4s, v29.4s, v27.4s +sqrdmulh 
v27.4S, v28.4S, v21.4S +mul v28.4S, v28.4S,v10.4S +mla v28.4S, v27.4S, v31.s[0] +sub v27.4s, v25.4s, v28.4s +add v25.4s, v25.4s, v28.4s +sqrdmulh v28.4S, v25.4S, v15.4S +mul v25.4S, v25.4S,v22.4S +mla v25.4S, v28.4S, v31.s[0] +sub v28.4s, v29.4s, v25.4s +add v29.4s, v29.4s, v25.4s +sqrdmulh v25.4S, v27.4S, v12.4S +mul v27.4S, v27.4S,v1.4S +mla v27.4S, v25.4S, v31.s[0] +sub v25.4s, v30.4s, v27.4s +add v30.4s, v30.4s, v27.4s +str q29, [x0, #512] +str q28, [x0, #528] +str q30, [x0, #544] +str q25, [x0, #560] +ldr q25, [x17, #+1280] +ldr q30, [x17, #+1296] +ldr q28, [x17, #+1312] +ldr q29, [x17, #+1328] +ldr q27, [x17, #+1344] +ldr q26, [x17, #+1360] +ldr q24, [x17, #+1376] +ldr q4, [x17, #+1392] +ldr q12, [x0, #608] +ldr q1, [x0, #624] +ldr q15, [x0, #576] +ldr q22, [x0, #592] +sqrdmulh v21.4S, v12.4S, v30.s[0] +mul v12.4S, v12.4S,v25.s[0] +mla v12.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v12.4s +add v15.4s, v15.4s, v12.4s +sqrdmulh v12.4S, v1.4S, v30.s[0] +mul v1.4S, v1.4S,v25.s[0] +mla v1.4S, v12.4S, v31.s[0] +sub v12.4s, v22.4s, v1.4s +add v22.4s, v22.4s, v1.4s +sqrdmulh v1.4S, v22.4S, v30.s[1] +mul v22.4S, v22.4S,v25.s[1] +mla v22.4S, v1.4S, v31.s[0] +sub v1.4s, v15.4s, v22.4s +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v12.4S, v30.s[2] +mul v12.4S, v12.4S,v25.s[2] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +trn1 v12.4S, v15.4S, v1.4S +trn2 v10.4S, v15.4S, v1.4S +trn1 v18.4S, v21.4S, v22.4S +trn2 v3.4S, v21.4S, v22.4S +trn2 v21.2D, v12.2D, v18.2D +trn2 v22.2D, v10.2D, v3.2D +trn1 v15.2D, v12.2D, v18.2D +trn1 v1.2D, v10.2D, v3.2D +sqrdmulh v3.4S, v21.4S, v29.4S +mul v21.4S, v21.4S,v28.4S +mla v21.4S, v3.4S, v31.s[0] +sub v3.4s, v15.4s, v21.4s +add v15.4s, v15.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.4S +mul v22.4S, v22.4S,v28.4S +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v26.4S +mul v1.4S, v1.4S,v27.4S +mla v1.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, 
v1.4s +add v15.4s, v15.4s, v1.4s +sqrdmulh v1.4S, v21.4S, v4.4S +mul v21.4S, v21.4S,v24.4S +mla v21.4S, v1.4S, v31.s[0] +sub v1.4s, v3.4s, v21.4s +add v3.4s, v3.4s, v21.4s +str q15, [x0, #576] +str q22, [x0, #592] +str q3, [x0, #608] +str q1, [x0, #624] +ldr q1, [x17, #+1408] +ldr q3, [x17, #+1424] +ldr q22, [x17, #+1440] +ldr q15, [x17, #+1456] +ldr q21, [x17, #+1472] +ldr q10, [x17, #+1488] +ldr q18, [x17, #+1504] +ldr q12, [x17, #+1520] +ldr q4, [x0, #672] +ldr q24, [x0, #688] +ldr q26, [x0, #640] +ldr q27, [x0, #656] +sqrdmulh v29.4S, v4.4S, v3.s[0] +mul v4.4S, v4.4S,v1.s[0] +mla v4.4S, v29.4S, v31.s[0] +sub v29.4s, v26.4s, v4.4s +add v26.4s, v26.4s, v4.4s +sqrdmulh v4.4S, v24.4S, v3.s[0] +mul v24.4S, v24.4S,v1.s[0] +mla v24.4S, v4.4S, v31.s[0] +sub v4.4s, v27.4s, v24.4s +add v27.4s, v27.4s, v24.4s +sqrdmulh v24.4S, v27.4S, v3.s[1] +mul v27.4S, v27.4S,v1.s[1] +mla v27.4S, v24.4S, v31.s[0] +sub v24.4s, v26.4s, v27.4s +add v26.4s, v26.4s, v27.4s +sqrdmulh v27.4S, v4.4S, v3.s[2] +mul v4.4S, v4.4S,v1.s[2] +mla v4.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v4.4s +add v29.4s, v29.4s, v4.4s +trn1 v4.4S, v26.4S, v24.4S +trn2 v28.4S, v26.4S, v24.4S +trn1 v30.4S, v29.4S, v27.4S +trn2 v25.4S, v29.4S, v27.4S +trn2 v29.2D, v4.2D, v30.2D +trn2 v27.2D, v28.2D, v25.2D +trn1 v26.2D, v4.2D, v30.2D +trn1 v24.2D, v28.2D, v25.2D +sqrdmulh v25.4S, v29.4S, v15.4S +mul v29.4S, v29.4S,v22.4S +mla v29.4S, v25.4S, v31.s[0] +sub v25.4s, v26.4s, v29.4s +add v26.4s, v26.4s, v29.4s +sqrdmulh v29.4S, v27.4S, v15.4S +mul v27.4S, v27.4S,v22.4S +mla v27.4S, v29.4S, v31.s[0] +sub v29.4s, v24.4s, v27.4s +add v24.4s, v24.4s, v27.4s +sqrdmulh v27.4S, v24.4S, v10.4S +mul v24.4S, v24.4S,v21.4S +mla v24.4S, v27.4S, v31.s[0] +sub v27.4s, v26.4s, v24.4s +add v26.4s, v26.4s, v24.4s +sqrdmulh v24.4S, v29.4S, v12.4S +mul v29.4S, v29.4S,v18.4S +mla v29.4S, v24.4S, v31.s[0] +sub v24.4s, v25.4s, v29.4s +add v25.4s, v25.4s, v29.4s +str q26, [x0, #640] +str q27, [x0, #656] +str q25, [x0, #672] +str q24, [x0, 
#688] +ldr q24, [x17, #+1536] +ldr q25, [x17, #+1552] +ldr q27, [x17, #+1568] +ldr q26, [x17, #+1584] +ldr q29, [x17, #+1600] +ldr q28, [x17, #+1616] +ldr q30, [x17, #+1632] +ldr q4, [x17, #+1648] +ldr q12, [x0, #736] +ldr q18, [x0, #752] +ldr q10, [x0, #704] +ldr q21, [x0, #720] +sqrdmulh v15.4S, v12.4S, v25.s[0] +mul v12.4S, v12.4S,v24.s[0] +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v18.4S, v25.s[0] +mul v18.4S, v18.4S,v24.s[0] +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v18.4s +add v21.4s, v21.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v25.s[1] +mul v21.4S, v21.4S,v24.s[1] +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v12.4S, v25.s[2] +mul v12.4S, v12.4S,v24.s[2] +mla v12.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v12.4s +add v15.4s, v15.4s, v12.4s +trn1 v12.4S, v10.4S, v18.4S +trn2 v22.4S, v10.4S, v18.4S +trn1 v3.4S, v15.4S, v21.4S +trn2 v1.4S, v15.4S, v21.4S +trn2 v15.2D, v12.2D, v3.2D +trn2 v21.2D, v22.2D, v1.2D +trn1 v10.2D, v12.2D, v3.2D +trn1 v18.2D, v22.2D, v1.2D +sqrdmulh v1.4S, v15.4S, v26.4S +mul v15.4S, v15.4S,v27.4S +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v10.4s, v15.4s +add v10.4s, v10.4s, v15.4s +sqrdmulh v15.4S, v21.4S, v26.4S +mul v21.4S, v21.4S,v27.4S +mla v21.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v18.4S, v28.4S +mul v18.4S, v18.4S,v29.4S +mla v18.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v18.4s +add v10.4s, v10.4s, v18.4s +sqrdmulh v18.4S, v15.4S, v4.4S +mul v15.4S, v15.4S,v30.4S +mla v15.4S, v18.4S, v31.s[0] +sub v18.4s, v1.4s, v15.4s +add v1.4s, v1.4s, v15.4s +str q10, [x0, #704] +str q21, [x0, #720] +str q1, [x0, #736] +str q18, [x0, #752] +ldr q18, [x17, #+1664] +ldr q1, [x17, #+1680] +ldr q21, [x17, #+1696] +ldr q10, [x17, #+1712] +ldr q15, [x17, #+1728] +ldr q22, [x17, #+1744] +ldr q3, [x17, #+1760] +ldr q12, [x17, #+1776] +ldr q4, [x0, #800] +ldr q30, [x0, #816] 
+ldr q28, [x0, #768] +ldr q29, [x0, #784] +sqrdmulh v26.4S, v4.4S, v1.s[0] +mul v4.4S, v4.4S,v18.s[0] +mla v4.4S, v26.4S, v31.s[0] +sub v26.4s, v28.4s, v4.4s +add v28.4s, v28.4s, v4.4s +sqrdmulh v4.4S, v30.4S, v1.s[0] +mul v30.4S, v30.4S,v18.s[0] +mla v30.4S, v4.4S, v31.s[0] +sub v4.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +sqrdmulh v30.4S, v29.4S, v1.s[1] +mul v29.4S, v29.4S,v18.s[1] +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v28.4s, v29.4s +add v28.4s, v28.4s, v29.4s +sqrdmulh v29.4S, v4.4S, v1.s[2] +mul v4.4S, v4.4S,v18.s[2] +mla v4.4S, v29.4S, v31.s[0] +sub v29.4s, v26.4s, v4.4s +add v26.4s, v26.4s, v4.4s +trn1 v4.4S, v28.4S, v30.4S +trn2 v27.4S, v28.4S, v30.4S +trn1 v25.4S, v26.4S, v29.4S +trn2 v24.4S, v26.4S, v29.4S +trn2 v26.2D, v4.2D, v25.2D +trn2 v29.2D, v27.2D, v24.2D +trn1 v28.2D, v4.2D, v25.2D +trn1 v30.2D, v27.2D, v24.2D +sqrdmulh v24.4S, v26.4S, v10.4S +mul v26.4S, v26.4S,v21.4S +mla v26.4S, v24.4S, v31.s[0] +sub v24.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sqrdmulh v26.4S, v29.4S, v10.4S +mul v29.4S, v29.4S,v21.4S +mla v29.4S, v26.4S, v31.s[0] +sub v26.4s, v30.4s, v29.4s +add v30.4s, v30.4s, v29.4s +sqrdmulh v29.4S, v30.4S, v22.4S +mul v30.4S, v30.4S,v15.4S +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v28.4s, v30.4s +add v28.4s, v28.4s, v30.4s +sqrdmulh v30.4S, v26.4S, v12.4S +mul v26.4S, v26.4S,v3.4S +mla v26.4S, v30.4S, v31.s[0] +sub v30.4s, v24.4s, v26.4s +add v24.4s, v24.4s, v26.4s +str q28, [x0, #768] +str q29, [x0, #784] +str q24, [x0, #800] +str q30, [x0, #816] +ldr q30, [x17, #+1792] +ldr q24, [x17, #+1808] +ldr q29, [x17, #+1824] +ldr q28, [x17, #+1840] +ldr q26, [x17, #+1856] +ldr q27, [x17, #+1872] +ldr q25, [x17, #+1888] +ldr q4, [x17, #+1904] +ldr q12, [x0, #864] +ldr q3, [x0, #880] +ldr q22, [x0, #832] +ldr q15, [x0, #848] +sqrdmulh v10.4S, v12.4S, v24.s[0] +mul v12.4S, v12.4S,v30.s[0] +mla v12.4S, v10.4S, v31.s[0] +sub v10.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v24.s[0] +mul v3.4S, 
v3.4S,v30.s[0] +mla v3.4S, v12.4S, v31.s[0] +sub v12.4s, v15.4s, v3.4s +add v15.4s, v15.4s, v3.4s +sqrdmulh v3.4S, v15.4S, v24.s[1] +mul v15.4S, v15.4S,v30.s[1] +mla v15.4S, v3.4S, v31.s[0] +sub v3.4s, v22.4s, v15.4s +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v12.4S, v24.s[2] +mul v12.4S, v12.4S,v30.s[2] +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +trn1 v12.4S, v22.4S, v3.4S +trn2 v21.4S, v22.4S, v3.4S +trn1 v1.4S, v10.4S, v15.4S +trn2 v18.4S, v10.4S, v15.4S +trn2 v10.2D, v12.2D, v1.2D +trn2 v15.2D, v21.2D, v18.2D +trn1 v22.2D, v12.2D, v1.2D +trn1 v3.2D, v21.2D, v18.2D +sqrdmulh v18.4S, v10.4S, v28.4S +mul v10.4S, v10.4S,v29.4S +mla v10.4S, v18.4S, v31.s[0] +sub v18.4s, v22.4s, v10.4s +add v22.4s, v22.4s, v10.4s +sqrdmulh v10.4S, v15.4S, v28.4S +mul v15.4S, v15.4S,v29.4S +mla v15.4S, v10.4S, v31.s[0] +sub v10.4s, v3.4s, v15.4s +add v3.4s, v3.4s, v15.4s +sqrdmulh v15.4S, v3.4S, v27.4S +mul v3.4S, v3.4S,v26.4S +mla v3.4S, v15.4S, v31.s[0] +sub v15.4s, v22.4s, v3.4s +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v4.4S +mul v10.4S, v10.4S,v25.4S +mla v10.4S, v3.4S, v31.s[0] +sub v3.4s, v18.4s, v10.4s +add v18.4s, v18.4s, v10.4s +str q22, [x0, #832] +str q15, [x0, #848] +str q18, [x0, #864] +str q3, [x0, #880] +ldr q3, [x17, #+1920] +ldr q18, [x17, #+1936] +ldr q15, [x17, #+1952] +ldr q22, [x17, #+1968] +ldr q10, [x17, #+1984] +ldr q21, [x17, #+2000] +ldr q1, [x17, #+2016] +ldr q12, [x17, #+2032] +ldr q4, [x0, #928] +ldr q25, [x0, #944] +ldr q27, [x0, #896] +ldr q26, [x0, #912] +sqrdmulh v28.4S, v4.4S, v18.s[0] +mul v4.4S, v4.4S,v3.s[0] +mla v4.4S, v28.4S, v31.s[0] +sub v28.4s, v27.4s, v4.4s +add v27.4s, v27.4s, v4.4s +sqrdmulh v4.4S, v25.4S, v18.s[0] +mul v25.4S, v25.4S,v3.s[0] +mla v25.4S, v4.4S, v31.s[0] +sub v4.4s, v26.4s, v25.4s +add v26.4s, v26.4s, v25.4s +sqrdmulh v25.4S, v26.4S, v18.s[1] +mul v26.4S, v26.4S,v3.s[1] +mla v26.4S, v25.4S, v31.s[0] +sub v25.4s, v27.4s, v26.4s +add v27.4s, v27.4s, v26.4s +sqrdmulh 
v26.4S, v4.4S, v18.s[2] +mul v4.4S, v4.4S,v3.s[2] +mla v4.4S, v26.4S, v31.s[0] +sub v26.4s, v28.4s, v4.4s +add v28.4s, v28.4s, v4.4s +trn1 v4.4S, v27.4S, v25.4S +trn2 v29.4S, v27.4S, v25.4S +trn1 v24.4S, v28.4S, v26.4S +trn2 v30.4S, v28.4S, v26.4S +trn2 v28.2D, v4.2D, v24.2D +trn2 v26.2D, v29.2D, v30.2D +trn1 v27.2D, v4.2D, v24.2D +trn1 v25.2D, v29.2D, v30.2D +sqrdmulh v30.4S, v28.4S, v22.4S +mul v28.4S, v28.4S,v15.4S +mla v28.4S, v30.4S, v31.s[0] +sub v30.4s, v27.4s, v28.4s +add v27.4s, v27.4s, v28.4s +sqrdmulh v28.4S, v26.4S, v22.4S +mul v26.4S, v26.4S,v15.4S +mla v26.4S, v28.4S, v31.s[0] +sub v28.4s, v25.4s, v26.4s +add v25.4s, v25.4s, v26.4s +sqrdmulh v26.4S, v25.4S, v21.4S +mul v25.4S, v25.4S,v10.4S +mla v25.4S, v26.4S, v31.s[0] +sub v26.4s, v27.4s, v25.4s +add v27.4s, v27.4s, v25.4s +sqrdmulh v25.4S, v28.4S, v12.4S +mul v28.4S, v28.4S,v1.4S +mla v28.4S, v25.4S, v31.s[0] +sub v25.4s, v30.4s, v28.4s +add v30.4s, v30.4s, v28.4s +str q27, [x0, #896] +str q26, [x0, #912] +str q30, [x0, #928] +str q25, [x0, #944] +ldr q25, [x17, #+2048] +ldr q30, [x17, #+2064] +ldr q26, [x17, #+2080] +ldr q27, [x17, #+2096] +ldr q28, [x17, #+2112] +ldr q29, [x17, #+2128] +ldr q24, [x17, #+2144] +ldr q4, [x17, #+2160] +ldr q12, [x0, #992] +ldr q1, [x0, #1008] +ldr q21, [x0, #960] +ldr q10, [x0, #976] +sqrdmulh v22.4S, v12.4S, v30.s[0] +mul v12.4S, v12.4S,v25.s[0] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v1.4S, v30.s[0] +mul v1.4S, v1.4S,v25.s[0] +mla v1.4S, v12.4S, v31.s[0] +sub v12.4s, v10.4s, v1.4s +add v10.4s, v10.4s, v1.4s +sqrdmulh v1.4S, v10.4S, v30.s[1] +mul v10.4S, v10.4S,v25.s[1] +mla v10.4S, v1.4S, v31.s[0] +sub v1.4s, v21.4s, v10.4s +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v12.4S, v30.s[2] +mul v12.4S, v12.4S,v25.s[2] +mla v12.4S, v10.4S, v31.s[0] +sub v10.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +trn1 v12.4S, v21.4S, v1.4S +trn2 v15.4S, v21.4S, v1.4S +trn1 v18.4S, v22.4S, v10.4S +trn2 v3.4S, 
v22.4S, v10.4S +trn2 v22.2D, v12.2D, v18.2D +trn2 v10.2D, v15.2D, v3.2D +trn1 v21.2D, v12.2D, v18.2D +trn1 v1.2D, v15.2D, v3.2D +sqrdmulh v3.4S, v22.4S, v27.4S +mul v22.4S, v22.4S,v26.4S +mla v22.4S, v3.4S, v31.s[0] +sub v3.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v10.4S, v27.4S +mul v10.4S, v10.4S,v26.4S +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v10.4s +add v1.4s, v1.4s, v10.4s +sqrdmulh v10.4S, v1.4S, v29.4S +mul v1.4S, v1.4S,v28.4S +mla v1.4S, v10.4S, v31.s[0] +sub v10.4s, v21.4s, v1.4s +add v21.4s, v21.4s, v1.4s +sqrdmulh v1.4S, v22.4S, v4.4S +mul v22.4S, v22.4S,v24.4S +mla v22.4S, v1.4S, v31.s[0] +sub v1.4s, v3.4s, v22.4s +add v3.4s, v3.4s, v22.4s +str q21, [x0, #960] +str q10, [x0, #976] +str q3, [x0, #992] +str q1, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_8_0.s b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_8_0.s new file mode 100644 index 0000000..761b80d --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_8_0.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// 
furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // 
Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 
1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 
+.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 
19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, 
block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 
+.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 
// Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 +.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 
+.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 
28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 
// Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_8_0 +.global _ntt_u32_full_neon_asm_var_4_4_8_0 +ntt_u32_full_neon_asm_var_4_4_8_0: +_ntt_u32_full_neon_asm_var_4_4_8_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, 
#+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #928] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #992] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q18, [x0, #800] +sqrdmulh v17.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +ldr q16, [x0, #864] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v22.4S, v21.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +mla v18.4S, v17.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +ldr q3, [x0, #544] +sqrdmulh v17.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q19, [x0, #608] +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q2, [x0, #672] +ldr q1, [x0, #416] +sqrdmulh v0.4S, v2.4S, v29.s[0] +sub v15.4s, v1.4s, v22.4s +mul v2.4S, v2.4S,v30.s[0] +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #736] +ldr q14, [x0, #480] +sqrdmulh v13.4S, v22.4S, v29.s[0] +sub v12.4s, v14.4s, v20.4s +mul v22.4S, v22.4S,v30.s[0] +add v14.4s, v14.4s, v20.4s +ldr q20, [x0, #288] +mla v3.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v18.4s +mla v19.4S, v21.4S, v31.s[0] +mla v2.4S, v0.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +mla v22.4S, v13.4S, v31.s[0] +ldr q13, [x0, #352] +sqrdmulh v18.4S, v1.4S, v29.s[1] +sub v0.4s, v13.4s, v16.4s +mul v1.4S, v1.4S,v30.s[1] +sqrdmulh v21.4S, v14.4S, v29.s[1] +add v13.4s, v13.4s, v16.4s +mul v14.4S, v14.4S,v30.s[1] +ldr q16, [x0, #32] +sqrdmulh v11.4S, v20.4S, v29.s[1] +sub v10.4s, v16.4s, v3.4s +mul v20.4S, v20.4S,v30.s[1] +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #96] +sqrdmulh v9.4S, v13.4S, v29.s[1] +sub v8.4s, v3.4s, v19.4s +mul v13.4S, v13.4S,v30.s[1] +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #160] +mla v1.4S, v18.4S, v31.s[0] +sub v18.4s, v19.4s, v2.4s +mla v14.4S, v21.4S, v31.s[0] +mla v20.4S, v11.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +mla v13.4S, v9.4S, v31.s[0] +ldr q9, [x0, #224] +sqrdmulh v2.4S, v15.4S, v29.s[2] +sub v11.4s, v9.4s, v22.4s +mul v15.4S, v15.4S,v30.s[2] +sqrdmulh v21.4S, v12.4S, v29.s[2] +add v9.4s, v9.4s, v22.4s 
+mul v12.4S, v12.4S,v30.s[2] +sqrdmulh v22.4S, v17.4S, v29.s[2] +sub v7.4s, v19.4s, v1.4s +mul v17.4S, v17.4S,v30.s[2] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[2] +sub v6.4s, v9.4s, v14.4s +mul v0.4S, v0.4S,v30.s[2] +add v9.4s, v9.4s, v14.4s +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v16.4s, v20.4s +mla v12.4S, v21.4S, v31.s[0] +mla v17.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +mla v0.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v7.4S, v27.s[1] +sub v20.4s, v3.4s, v13.4s +mul v7.4S, v7.4S,v28.s[1] +sqrdmulh v22.4S, v6.4S, v27.s[1] +add v3.4s, v3.4s, v13.4s +mul v6.4S, v6.4S,v28.s[1] +sqrdmulh v13.4S, v19.4S, v27.s[0] +sub v21.4s, v18.4s, v15.4s +mul v19.4S, v19.4S,v28.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v9.4S, v27.s[0] +sub v14.4s, v11.4s, v12.4s +mul v9.4S, v9.4S,v28.s[0] +add v11.4s, v11.4s, v12.4s +mla v7.4S, v1.4S, v31.s[0] +sub v1.4s, v10.4s, v17.4s +mla v6.4S, v22.4S, v31.s[0] +mla v19.4S, v13.4S, v31.s[0] +add v10.4s, v10.4s, v17.4s +mla v9.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v18.4S, v27.s[2] +sub v17.4s, v8.4s, v0.4s +mul v18.4S, v18.4S,v28.s[2] +sqrdmulh v13.4S, v11.4S, v27.s[2] +add v8.4s, v8.4s, v0.4s +mul v11.4S, v11.4S,v28.s[2] +sqrdmulh v0.4S, v21.4S, v27.s[3] +sub v22.4s, v2.4s, v7.4s +mul v21.4S, v21.4S,v28.s[3] +add v2.4s, v2.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[3] +sub v12.4s, v20.4s, v6.4s +mul v14.4S, v14.4S,v28.s[3] +add v20.4s, v20.4s, v6.4s +mla v18.4S, v15.4S, v31.s[0] +sub v15.4s, v16.4s, v19.4s +mla v11.4S, v13.4S, v31.s[0] +mla v21.4S, v0.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +mla v14.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v20.4S, v25.s[2] +sub v19.4s, v3.4s, v9.4s +mul v20.4S, v20.4S,v26.s[2] +sqrdmulh v0.4S, v12.4S, v25.s[3] +add v3.4s, v3.4s, v9.4s +mul v12.4S, v12.4S,v26.s[3] +sqrdmulh v9.4S, v19.4S, v25.s[1] +sub v13.4s, v10.4s, v18.4s +mul v19.4S, v19.4S,v26.s[1] +add v10.4s, v10.4s, v18.4s +sqrdmulh v18.4S, v3.4S, v25.s[0] +sub v6.4s, v8.4s, v11.4s +mul v3.4S, v3.4S,v26.s[0] +add v8.4s, v8.4s, 
v11.4s +mla v20.4S, v7.4S, v31.s[0] +sub v7.4s, v1.4s, v21.4s +mla v12.4S, v0.4S, v31.s[0] +mla v19.4S, v9.4S, v31.s[0] +add v1.4s, v1.4s, v21.4s +mla v3.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v8.4S, v23.s[0] +sub v21.4s, v17.4s, v14.4s +mul v8.4S, v8.4S,v24.s[0] +sqrdmulh v9.4S, v6.4S, v23.s[1] +add v17.4s, v17.4s, v14.4s +mul v6.4S, v6.4S,v24.s[1] +sqrdmulh v14.4S, v17.4S, v23.s[2] +sub v0.4s, v2.4s, v20.4s +mul v17.4S, v17.4S,v24.s[2] +add v2.4s, v2.4s, v20.4s +sqrdmulh v20.4S, v21.4S, v23.s[3] +sub v11.4s, v22.4s, v12.4s +mul v21.4S, v21.4S,v24.s[3] +add v22.4s, v22.4s, v12.4s +mla v8.4S, v18.4S, v31.s[0] +sub v18.4s, v15.4s, v19.4s +mla v6.4S, v9.4S, v31.s[0] +str q2, [x0, #288] +mla v17.4S, v14.4S, v31.s[0] +add v15.4s, v15.4s, v19.4s +mla v21.4S, v20.4S, v31.s[0] +str q0, [x0, #352] +ldr q0, [x0, #944] +sqrdmulh v20.4S, v0.4S, v29.s[0] +sub v19.4s, v16.4s, v3.4s +mul v0.4S, v0.4S,v30.s[0] +str q22, [x0, #416] +ldr q22, [x0, #1008] +sqrdmulh v14.4S, v22.4S, v29.s[0] +add v16.4s, v16.4s, v3.4s +mul v22.4S, v22.4S,v30.s[0] +str q11, [x0, #480] +ldr q11, [x0, #816] +sqrdmulh v3.4S, v11.4S, v29.s[0] +sub v2.4s, v10.4s, v8.4s +mul v11.4S, v11.4S,v30.s[0] +add v10.4s, v10.4s, v8.4s +ldr q8, [x0, #880] +sqrdmulh v9.4S, v8.4S, v29.s[0] +sub v12.4s, v13.4s, v6.4s +mul v8.4S, v8.4S,v30.s[0] +add v13.4s, v13.4s, v6.4s +mla v0.4S, v20.4S, v31.s[0] +sub v20.4s, v1.4s, v17.4s +mla v22.4S, v14.4S, v31.s[0] +str q15, [x0, #160] +mla v11.4S, v3.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +mla v8.4S, v9.4S, v31.s[0] +str q18, [x0, #224] +ldr q18, [x0, #560] +sqrdmulh v9.4S, v18.4S, v29.s[0] +sub v17.4s, v7.4s, v21.4s +mul v18.4S, v18.4S,v30.s[0] +str q16, [x0, #32] +ldr q16, [x0, #624] +sqrdmulh v3.4S, v16.4S, v29.s[0] +add v7.4s, v7.4s, v21.4s +mul v16.4S, v16.4S,v30.s[0] +str q19, [x0, #96] +ldr q19, [x0, #688] +ldr q21, [x0, #432] +sqrdmulh v15.4S, v19.4S, v29.s[0] +sub v14.4s, v21.4s, v0.4s +mul v19.4S, v19.4S,v30.s[0] +add v21.4s, v21.4s, v0.4s +ldr q0, [x0, #752] +ldr q6, 
[x0, #496] +sqrdmulh v5.4S, v0.4S, v29.s[0] +sub v4.4s, v6.4s, v22.4s +mul v0.4S, v0.4S,v30.s[0] +add v6.4s, v6.4s, v22.4s +ldr q22, [x0, #304] +mla v18.4S, v9.4S, v31.s[0] +sub v9.4s, v22.4s, v11.4s +mla v16.4S, v3.4S, v31.s[0] +str q10, [x0, #544] +mla v19.4S, v15.4S, v31.s[0] +add v22.4s, v22.4s, v11.4s +mla v0.4S, v5.4S, v31.s[0] +str q2, [x0, #608] +ldr q2, [x0, #368] +sqrdmulh v5.4S, v21.4S, v29.s[1] +sub v11.4s, v2.4s, v8.4s +mul v21.4S, v21.4S,v30.s[1] +str q13, [x0, #672] +sqrdmulh v13.4S, v6.4S, v29.s[1] +add v2.4s, v2.4s, v8.4s +mul v6.4S, v6.4S,v30.s[1] +str q12, [x0, #736] +ldr q12, [x0, #48] +sqrdmulh v8.4S, v22.4S, v29.s[1] +sub v15.4s, v12.4s, v18.4s +mul v22.4S, v22.4S,v30.s[1] +add v12.4s, v12.4s, v18.4s +ldr q18, [x0, #112] +sqrdmulh v10.4S, v2.4S, v29.s[1] +sub v3.4s, v18.4s, v16.4s +mul v2.4S, v2.4S,v30.s[1] +add v18.4s, v18.4s, v16.4s +ldr q16, [x0, #176] +mla v21.4S, v5.4S, v31.s[0] +sub v5.4s, v16.4s, v19.4s +mla v6.4S, v13.4S, v31.s[0] +str q1, [x0, #800] +mla v22.4S, v8.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +mla v2.4S, v10.4S, v31.s[0] +str q20, [x0, #864] +ldr q20, [x0, #240] +sqrdmulh v10.4S, v14.4S, v29.s[2] +sub v19.4s, v20.4s, v0.4s +mul v14.4S, v14.4S,v30.s[2] +str q7, [x0, #928] +sqrdmulh v7.4S, v4.4S, v29.s[2] +add v20.4s, v20.4s, v0.4s +mul v4.4S, v4.4S,v30.s[2] +str q17, [x0, #992] +sqrdmulh v17.4S, v9.4S, v29.s[2] +sub v0.4s, v16.4s, v21.4s +mul v9.4S, v9.4S,v30.s[2] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v11.4S, v29.s[2] +sub v8.4s, v20.4s, v6.4s +mul v11.4S, v11.4S,v30.s[2] +add v20.4s, v20.4s, v6.4s +mla v14.4S, v10.4S, v31.s[0] +sub v10.4s, v12.4s, v22.4s +mla v4.4S, v7.4S, v31.s[0] +mla v9.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s +mla v11.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v0.4S, v27.s[1] +sub v22.4s, v18.4s, v2.4s +mul v0.4S, v0.4S,v28.s[1] +sqrdmulh v17.4S, v8.4S, v27.s[1] +add v18.4s, v18.4s, v2.4s +mul v8.4S, v8.4S,v28.s[1] +sqrdmulh v2.4S, v16.4S, v27.s[0] +sub v7.4s, v5.4s, v14.4s +mul v16.4S, 
v16.4S,v28.s[0] +add v5.4s, v5.4s, v14.4s +sqrdmulh v14.4S, v20.4S, v27.s[0] +sub v6.4s, v19.4s, v4.4s +mul v20.4S, v20.4S,v28.s[0] +add v19.4s, v19.4s, v4.4s +mla v0.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v9.4s +mla v8.4S, v17.4S, v31.s[0] +mla v16.4S, v2.4S, v31.s[0] +add v15.4s, v15.4s, v9.4s +mla v20.4S, v14.4S, v31.s[0] +sqrdmulh v14.4S, v5.4S, v27.s[2] +sub v9.4s, v3.4s, v11.4s +mul v5.4S, v5.4S,v28.s[2] +sqrdmulh v2.4S, v19.4S, v27.s[2] +add v3.4s, v3.4s, v11.4s +mul v19.4S, v19.4S,v28.s[2] +sqrdmulh v11.4S, v7.4S, v27.s[3] +sub v17.4s, v10.4s, v0.4s +mul v7.4S, v7.4S,v28.s[3] +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v6.4S, v27.s[3] +sub v4.4s, v22.4s, v8.4s +mul v6.4S, v6.4S,v28.s[3] +add v22.4s, v22.4s, v8.4s +mla v5.4S, v14.4S, v31.s[0] +sub v14.4s, v12.4s, v16.4s +mla v19.4S, v2.4S, v31.s[0] +mla v7.4S, v11.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +mla v6.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v22.4S, v25.s[2] +sub v16.4s, v18.4s, v20.4s +mul v22.4S, v22.4S,v26.s[2] +sqrdmulh v11.4S, v4.4S, v25.s[3] +add v18.4s, v18.4s, v20.4s +mul v4.4S, v4.4S,v26.s[3] +sqrdmulh v20.4S, v16.4S, v25.s[1] +sub v2.4s, v15.4s, v5.4s +mul v16.4S, v16.4S,v26.s[1] +add v15.4s, v15.4s, v5.4s +sqrdmulh v5.4S, v18.4S, v25.s[0] +sub v8.4s, v3.4s, v19.4s +mul v18.4S, v18.4S,v26.s[0] +add v3.4s, v3.4s, v19.4s +mla v22.4S, v0.4S, v31.s[0] +sub v0.4s, v21.4s, v7.4s +mla v4.4S, v11.4S, v31.s[0] +mla v16.4S, v20.4S, v31.s[0] +add v21.4s, v21.4s, v7.4s +mla v18.4S, v5.4S, v31.s[0] +sqrdmulh v5.4S, v3.4S, v23.s[0] +sub v7.4s, v9.4s, v6.4s +mul v3.4S, v3.4S,v24.s[0] +sqrdmulh v20.4S, v8.4S, v23.s[1] +add v9.4s, v9.4s, v6.4s +mul v8.4S, v8.4S,v24.s[1] +sqrdmulh v6.4S, v9.4S, v23.s[2] +sub v11.4s, v10.4s, v22.4s +mul v9.4S, v9.4S,v24.s[2] +add v10.4s, v10.4s, v22.4s +sqrdmulh v22.4S, v7.4S, v23.s[3] +sub v19.4s, v17.4s, v4.4s +mul v7.4S, v7.4S,v24.s[3] +add v17.4s, v17.4s, v4.4s +mla v3.4S, v5.4S, v31.s[0] +sub v5.4s, v14.4s, v16.4s +mla v8.4S, v20.4S, v31.s[0] +str q10, [x0, #304] +mla 
v9.4S, v6.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +mla v7.4S, v22.4S, v31.s[0] +str q11, [x0, #368] +ldr q11, [x0, #896] +sqrdmulh v22.4S, v11.4S, v29.s[0] +sub v16.4s, v12.4s, v18.4s +mul v11.4S, v11.4S,v30.s[0] +str q17, [x0, #432] +ldr q17, [x0, #960] +sqrdmulh v6.4S, v17.4S, v29.s[0] +add v12.4s, v12.4s, v18.4s +mul v17.4S, v17.4S,v30.s[0] +str q19, [x0, #496] +ldr q19, [x0, #768] +sqrdmulh v18.4S, v19.4S, v29.s[0] +sub v10.4s, v15.4s, v3.4s +mul v19.4S, v19.4S,v30.s[0] +add v15.4s, v15.4s, v3.4s +ldr q3, [x0, #832] +sqrdmulh v20.4S, v3.4S, v29.s[0] +sub v4.4s, v2.4s, v8.4s +mul v3.4S, v3.4S,v30.s[0] +add v2.4s, v2.4s, v8.4s +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v9.4s +mla v17.4S, v6.4S, v31.s[0] +str q14, [x0, #176] +mla v19.4S, v18.4S, v31.s[0] +add v21.4s, v21.4s, v9.4s +mla v3.4S, v20.4S, v31.s[0] +str q5, [x0, #240] +ldr q5, [x0, #512] +sqrdmulh v20.4S, v5.4S, v29.s[0] +sub v9.4s, v0.4s, v7.4s +mul v5.4S, v5.4S,v30.s[0] +str q12, [x0, #48] +ldr q12, [x0, #576] +sqrdmulh v18.4S, v12.4S, v29.s[0] +add v0.4s, v0.4s, v7.4s +mul v12.4S, v12.4S,v30.s[0] +str q16, [x0, #112] +ldr q16, [x0, #640] +ldr q7, [x0, #384] +sqrdmulh v14.4S, v16.4S, v29.s[0] +sub v6.4s, v7.4s, v11.4s +mul v16.4S, v16.4S,v30.s[0] +add v7.4s, v7.4s, v11.4s +ldr q11, [x0, #704] +ldr q8, [x0, #448] +sqrdmulh v1.4S, v11.4S, v29.s[0] +sub v13.4s, v8.4s, v17.4s +mul v11.4S, v11.4S,v30.s[0] +add v8.4s, v8.4s, v17.4s +ldr q17, [x0, #256] +mla v5.4S, v20.4S, v31.s[0] +sub v20.4s, v17.4s, v19.4s +mla v12.4S, v18.4S, v31.s[0] +str q15, [x0, #560] +mla v16.4S, v14.4S, v31.s[0] +add v17.4s, v17.4s, v19.4s +mla v11.4S, v1.4S, v31.s[0] +str q10, [x0, #624] +ldr q10, [x0, #320] +sqrdmulh v1.4S, v7.4S, v29.s[1] +sub v19.4s, v10.4s, v3.4s +mul v7.4S, v7.4S,v30.s[1] +str q2, [x0, #688] +sqrdmulh v2.4S, v8.4S, v29.s[1] +add v10.4s, v10.4s, v3.4s +mul v8.4S, v8.4S,v30.s[1] +str q4, [x0, #752] +ldr q4, [x0, #0] +sqrdmulh v3.4S, v17.4S, v29.s[1] +sub v14.4s, v4.4s, v5.4s +mul v17.4S, 
v17.4S,v30.s[1] +add v4.4s, v4.4s, v5.4s +ldr q5, [x0, #64] +sqrdmulh v15.4S, v10.4S, v29.s[1] +sub v18.4s, v5.4s, v12.4s +mul v10.4S, v10.4S,v30.s[1] +add v5.4s, v5.4s, v12.4s +ldr q12, [x0, #128] +mla v7.4S, v1.4S, v31.s[0] +sub v1.4s, v12.4s, v16.4s +mla v8.4S, v2.4S, v31.s[0] +str q21, [x0, #816] +mla v17.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +mla v10.4S, v15.4S, v31.s[0] +str q22, [x0, #880] +ldr q22, [x0, #192] +sqrdmulh v15.4S, v6.4S, v29.s[2] +sub v16.4s, v22.4s, v11.4s +mul v6.4S, v6.4S,v30.s[2] +str q0, [x0, #944] +sqrdmulh v0.4S, v13.4S, v29.s[2] +add v22.4s, v22.4s, v11.4s +mul v13.4S, v13.4S,v30.s[2] +str q9, [x0, #1008] +sqrdmulh v9.4S, v20.4S, v29.s[2] +sub v11.4s, v12.4s, v7.4s +mul v20.4S, v20.4S,v30.s[2] +add v12.4s, v12.4s, v7.4s +sqrdmulh v7.4S, v19.4S, v29.s[2] +sub v3.4s, v22.4s, v8.4s +mul v19.4S, v19.4S,v30.s[2] +add v22.4s, v22.4s, v8.4s +mla v6.4S, v15.4S, v31.s[0] +sub v15.4s, v4.4s, v17.4s +mla v13.4S, v0.4S, v31.s[0] +mla v20.4S, v9.4S, v31.s[0] +add v4.4s, v4.4s, v17.4s +mla v19.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v11.4S, v27.s[1] +sub v17.4s, v5.4s, v10.4s +mul v11.4S, v11.4S,v28.s[1] +sqrdmulh v9.4S, v3.4S, v27.s[1] +add v5.4s, v5.4s, v10.4s +mul v3.4S, v3.4S,v28.s[1] +sqrdmulh v10.4S, v12.4S, v27.s[0] +sub v0.4s, v1.4s, v6.4s +mul v12.4S, v12.4S,v28.s[0] +add v1.4s, v1.4s, v6.4s +sqrdmulh v6.4S, v22.4S, v27.s[0] +sub v8.4s, v16.4s, v13.4s +mul v22.4S, v22.4S,v28.s[0] +add v16.4s, v16.4s, v13.4s +mla v11.4S, v7.4S, v31.s[0] +sub v7.4s, v14.4s, v20.4s +mla v3.4S, v9.4S, v31.s[0] +mla v12.4S, v10.4S, v31.s[0] +add v14.4s, v14.4s, v20.4s +mla v22.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v1.4S, v27.s[2] +sub v20.4s, v18.4s, v19.4s +mul v1.4S, v1.4S,v28.s[2] +sqrdmulh v10.4S, v16.4S, v27.s[2] +add v18.4s, v18.4s, v19.4s +mul v16.4S, v16.4S,v28.s[2] +sqrdmulh v19.4S, v0.4S, v27.s[3] +sub v9.4s, v15.4s, v11.4s +mul v0.4S, v0.4S,v28.s[3] +add v15.4s, v15.4s, v11.4s +sqrdmulh v11.4S, v8.4S, v27.s[3] +sub v13.4s, v17.4s, v3.4s +mul 
v8.4S, v8.4S,v28.s[3] +add v17.4s, v17.4s, v3.4s +mla v1.4S, v6.4S, v31.s[0] +sub v6.4s, v4.4s, v12.4s +mla v16.4S, v10.4S, v31.s[0] +mla v0.4S, v19.4S, v31.s[0] +add v4.4s, v4.4s, v12.4s +mla v8.4S, v11.4S, v31.s[0] +sqrdmulh v11.4S, v17.4S, v25.s[2] +sub v12.4s, v5.4s, v22.4s +mul v17.4S, v17.4S,v26.s[2] +sqrdmulh v19.4S, v13.4S, v25.s[3] +add v5.4s, v5.4s, v22.4s +mul v13.4S, v13.4S,v26.s[3] +sqrdmulh v22.4S, v12.4S, v25.s[1] +sub v10.4s, v14.4s, v1.4s +mul v12.4S, v12.4S,v26.s[1] +add v14.4s, v14.4s, v1.4s +sqrdmulh v1.4S, v5.4S, v25.s[0] +sub v3.4s, v18.4s, v16.4s +mul v5.4S, v5.4S,v26.s[0] +add v18.4s, v18.4s, v16.4s +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v7.4s, v0.4s +mla v13.4S, v19.4S, v31.s[0] +mla v12.4S, v22.4S, v31.s[0] +add v7.4s, v7.4s, v0.4s +mla v5.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v18.4S, v23.s[0] +sub v0.4s, v20.4s, v8.4s +mul v18.4S, v18.4S,v24.s[0] +sqrdmulh v22.4S, v3.4S, v23.s[1] +add v20.4s, v20.4s, v8.4s +mul v3.4S, v3.4S,v24.s[1] +sqrdmulh v8.4S, v20.4S, v23.s[2] +sub v19.4s, v15.4s, v17.4s +mul v20.4S, v20.4S,v24.s[2] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v23.s[3] +sub v16.4s, v9.4s, v13.4s +mul v0.4S, v0.4S,v24.s[3] +add v9.4s, v9.4s, v13.4s +mla v18.4S, v1.4S, v31.s[0] +sub v1.4s, v6.4s, v12.4s +mla v3.4S, v22.4S, v31.s[0] +str q15, [x0, #256] +mla v20.4S, v8.4S, v31.s[0] +add v6.4s, v6.4s, v12.4s +mla v0.4S, v17.4S, v31.s[0] +str q19, [x0, #320] +ldr q19, [x0, #912] +sqrdmulh v17.4S, v19.4S, v29.s[0] +sub v12.4s, v4.4s, v5.4s +mul v19.4S, v19.4S,v30.s[0] +str q9, [x0, #384] +ldr q9, [x0, #976] +sqrdmulh v8.4S, v9.4S, v29.s[0] +add v4.4s, v4.4s, v5.4s +mul v9.4S, v9.4S,v30.s[0] +str q16, [x0, #448] +ldr q16, [x0, #784] +sqrdmulh v5.4S, v16.4S, v29.s[0] +sub v15.4s, v14.4s, v18.4s +mul v16.4S, v16.4S,v30.s[0] +add v14.4s, v14.4s, v18.4s +ldr q18, [x0, #848] +sqrdmulh v22.4S, v18.4S, v29.s[0] +sub v13.4s, v10.4s, v3.4s +mul v18.4S, v18.4S,v30.s[0] +add v10.4s, v10.4s, v3.4s +mla v19.4S, v17.4S, v31.s[0] +sub v17.4s, 
v7.4s, v20.4s +mla v9.4S, v8.4S, v31.s[0] +str q6, [x0, #128] +mla v16.4S, v5.4S, v31.s[0] +add v7.4s, v7.4s, v20.4s +mla v18.4S, v22.4S, v31.s[0] +str q1, [x0, #192] +ldr q1, [x0, #528] +sqrdmulh v22.4S, v1.4S, v29.s[0] +sub v20.4s, v11.4s, v0.4s +mul v1.4S, v1.4S,v30.s[0] +str q4, [x0, #0] +ldr q4, [x0, #592] +sqrdmulh v5.4S, v4.4S, v29.s[0] +add v11.4s, v11.4s, v0.4s +mul v4.4S, v4.4S,v30.s[0] +str q12, [x0, #64] +ldr q12, [x0, #656] +ldr q0, [x0, #400] +sqrdmulh v6.4S, v12.4S, v29.s[0] +sub v8.4s, v0.4s, v19.4s +mul v12.4S, v12.4S,v30.s[0] +add v0.4s, v0.4s, v19.4s +ldr q19, [x0, #720] +ldr q3, [x0, #464] +sqrdmulh v21.4S, v19.4S, v29.s[0] +sub v2.4s, v3.4s, v9.4s +mul v19.4S, v19.4S,v30.s[0] +add v3.4s, v3.4s, v9.4s +ldr q9, [x0, #272] +mla v1.4S, v22.4S, v31.s[0] +sub v22.4s, v9.4s, v16.4s +mla v4.4S, v5.4S, v31.s[0] +str q14, [x0, #512] +mla v12.4S, v6.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +mla v19.4S, v21.4S, v31.s[0] +str q15, [x0, #576] +ldr q15, [x0, #336] +sqrdmulh v21.4S, v0.4S, v29.s[1] +sub v16.4s, v15.4s, v18.4s +mul v0.4S, v0.4S,v30.s[1] +str q10, [x0, #640] +sqrdmulh v10.4S, v3.4S, v29.s[1] +add v15.4s, v15.4s, v18.4s +mul v3.4S, v3.4S,v30.s[1] +str q13, [x0, #704] +ldr q13, [x0, #16] +sqrdmulh v18.4S, v9.4S, v29.s[1] +sub v6.4s, v13.4s, v1.4s +mul v9.4S, v9.4S,v30.s[1] +add v13.4s, v13.4s, v1.4s +ldr q1, [x0, #80] +sqrdmulh v14.4S, v15.4S, v29.s[1] +sub v5.4s, v1.4s, v4.4s +mul v15.4S, v15.4S,v30.s[1] +add v1.4s, v1.4s, v4.4s +ldr q4, [x0, #144] +mla v0.4S, v21.4S, v31.s[0] +sub v21.4s, v4.4s, v12.4s +mla v3.4S, v10.4S, v31.s[0] +str q7, [x0, #768] +mla v9.4S, v18.4S, v31.s[0] +add v4.4s, v4.4s, v12.4s +mla v15.4S, v14.4S, v31.s[0] +str q17, [x0, #832] +ldr q17, [x0, #208] +sqrdmulh v14.4S, v8.4S, v29.s[2] +sub v12.4s, v17.4s, v19.4s +mul v8.4S, v8.4S,v30.s[2] +str q11, [x0, #896] +sqrdmulh v11.4S, v2.4S, v29.s[2] +add v17.4s, v17.4s, v19.4s +mul v2.4S, v2.4S,v30.s[2] +str q20, [x0, #960] +sqrdmulh v20.4S, v22.4S, v29.s[2] +sub v19.4s, v4.4s, 
v0.4s +mul v22.4S, v22.4S,v30.s[2] +add v4.4s, v4.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[2] +sub v18.4s, v17.4s, v3.4s +mul v16.4S, v16.4S,v30.s[2] +add v17.4s, v17.4s, v3.4s +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v9.4s +mla v2.4S, v11.4S, v31.s[0] +mla v22.4S, v20.4S, v31.s[0] +add v13.4s, v13.4s, v9.4s +mla v16.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v19.4S, v27.s[1] +sub v9.4s, v1.4s, v15.4s +mul v19.4S, v19.4S,v28.s[1] +sqrdmulh v20.4S, v18.4S, v27.s[1] +add v1.4s, v1.4s, v15.4s +mul v18.4S, v18.4S,v28.s[1] +sqrdmulh v15.4S, v4.4S, v27.s[0] +sub v11.4s, v21.4s, v8.4s +mul v4.4S, v4.4S,v28.s[0] +add v21.4s, v21.4s, v8.4s +sqrdmulh v8.4S, v17.4S, v27.s[0] +sub v3.4s, v12.4s, v2.4s +mul v17.4S, v17.4S,v28.s[0] +add v12.4s, v12.4s, v2.4s +mla v19.4S, v0.4S, v31.s[0] +sub v0.4s, v6.4s, v22.4s +mla v18.4S, v20.4S, v31.s[0] +mla v4.4S, v15.4S, v31.s[0] +add v6.4s, v6.4s, v22.4s +mla v17.4S, v8.4S, v31.s[0] +sqrdmulh v8.4S, v21.4S, v27.s[2] +sub v22.4s, v5.4s, v16.4s +mul v21.4S, v21.4S,v28.s[2] +sqrdmulh v15.4S, v12.4S, v27.s[2] +add v5.4s, v5.4s, v16.4s +mul v12.4S, v12.4S,v28.s[2] +sqrdmulh v16.4S, v11.4S, v27.s[3] +sub v20.4s, v14.4s, v19.4s +mul v11.4S, v11.4S,v28.s[3] +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v3.4S, v27.s[3] +sub v2.4s, v9.4s, v18.4s +mul v3.4S, v3.4S,v28.s[3] +add v9.4s, v9.4s, v18.4s +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v13.4s, v4.4s +mla v12.4S, v15.4S, v31.s[0] +mla v11.4S, v16.4S, v31.s[0] +add v13.4s, v13.4s, v4.4s +mla v3.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v9.4S, v25.s[2] +sub v4.4s, v1.4s, v17.4s +mul v9.4S, v9.4S,v26.s[2] +sqrdmulh v16.4S, v2.4S, v25.s[3] +add v1.4s, v1.4s, v17.4s +mul v2.4S, v2.4S,v26.s[3] +sqrdmulh v17.4S, v4.4S, v25.s[1] +sub v15.4s, v6.4s, v21.4s +mul v4.4S, v4.4S,v26.s[1] +add v6.4s, v6.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v25.s[0] +sub v18.4s, v5.4s, v12.4s +mul v1.4S, v1.4S,v26.s[0] +add v5.4s, v5.4s, v12.4s +mla v9.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v11.4s +mla v2.4S, v16.4S, v31.s[0] 
+mla v4.4S, v17.4S, v31.s[0] +add v0.4s, v0.4s, v11.4s +mla v1.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v5.4S, v23.s[0] +sub v11.4s, v22.4s, v3.4s +mul v5.4S, v5.4S,v24.s[0] +sqrdmulh v17.4S, v18.4S, v23.s[1] +add v22.4s, v22.4s, v3.4s +mul v18.4S, v18.4S,v24.s[1] +sqrdmulh v3.4S, v22.4S, v23.s[2] +sub v16.4s, v14.4s, v9.4s +mul v22.4S, v22.4S,v24.s[2] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v23.s[3] +sub v12.4s, v20.4s, v2.4s +mul v11.4S, v11.4S,v24.s[3] +add v20.4s, v20.4s, v2.4s +mla v5.4S, v21.4S, v31.s[0] +sub v21.4s, v8.4s, v4.4s +mla v18.4S, v17.4S, v31.s[0] +str q14, [x0, #272] +mla v22.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v4.4s +mla v11.4S, v9.4S, v31.s[0] +str q16, [x0, #336] +sub v23.4s, v13.4s, v1.4s +str q20, [x0, #400] +add v13.4s, v13.4s, v1.4s +str q12, [x0, #464] +sub v12.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sub v5.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sub v18.4s, v0.4s, v22.4s +str q8, [x0, #144] +add v0.4s, v0.4s, v22.4s +str q21, [x0, #208] +sub v21.4s, v19.4s, v11.4s +str q13, [x0, #16] +add v19.4s, v19.4s, v11.4s +str q23, [x0, #80] +str q6, [x0, #528] +str q12, [x0, #592] +str q15, [x0, #656] +str q5, [x0, #720] +str q0, [x0, #784] +str q18, [x0, #848] +str q19, [x0, #912] +str q21, [x0, #976] +ldr q10, [x17, #+128] +ldr q7, [x17, #+144] +ldr q2, [x17, #+160] +ldr q17, [x17, #+176] +ldr q14, [x17, #+192] +ldr q3, [x17, #+208] +ldr q4, [x17, #+224] +ldr q9, [x17, #+240] +ldr q16, [x0, #32] +ldr q30, [x0, #48] +ldr q29, [x0, #0] +ldr q28, [x0, #16] +sqrdmulh v27.4S, v16.4S, v7.s[0] +mul v16.4S, v16.4S,v10.s[0] +mla v16.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v16.4s +add v29.4s, v29.4s, v16.4s +sqrdmulh v16.4S, v30.4S, v7.s[0] +mul v30.4S, v30.4S,v10.s[0] +mla v30.4S, v16.4S, v31.s[0] +sub v16.4s, v28.4s, v30.4s +add v28.4s, v28.4s, v30.4s +sqrdmulh v30.4S, v28.4S, v7.s[1] +mul v28.4S, v28.4S,v10.s[1] +mla v28.4S, v30.4S, v31.s[0] +sub v30.4s, v29.4s, v28.4s +add v29.4s, v29.4s, v28.4s +sqrdmulh v28.4S, v16.4S, 
v7.s[2] +mul v16.4S, v16.4S,v10.s[2] +mla v16.4S, v28.4S, v31.s[0] +sub v28.4s, v27.4s, v16.4s +add v27.4s, v27.4s, v16.4s +trn1 v16.4S, v29.4S, v30.4S +trn2 v26.4S, v29.4S, v30.4S +trn1 v25.4S, v27.4S, v28.4S +trn2 v24.4S, v27.4S, v28.4S +trn2 v27.2D, v16.2D, v25.2D +trn2 v28.2D, v26.2D, v24.2D +trn1 v29.2D, v16.2D, v25.2D +trn1 v30.2D, v26.2D, v24.2D +sqrdmulh v24.4S, v27.4S, v17.4S +mul v27.4S, v27.4S,v2.4S +mla v27.4S, v24.4S, v31.s[0] +sub v24.4s, v29.4s, v27.4s +add v29.4s, v29.4s, v27.4s +sqrdmulh v27.4S, v28.4S, v17.4S +mul v28.4S, v28.4S,v2.4S +mla v28.4S, v27.4S, v31.s[0] +sub v27.4s, v30.4s, v28.4s +add v30.4s, v30.4s, v28.4s +sqrdmulh v28.4S, v30.4S, v3.4S +mul v30.4S, v30.4S,v14.4S +mla v30.4S, v28.4S, v31.s[0] +sub v28.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +sqrdmulh v30.4S, v27.4S, v9.4S +mul v27.4S, v27.4S,v4.4S +mla v27.4S, v30.4S, v31.s[0] +sub v30.4s, v24.4s, v27.4s +add v24.4s, v24.4s, v27.4s +str q29, [x0, #0] +str q28, [x0, #16] +str q24, [x0, #32] +str q30, [x0, #48] +ldr q30, [x17, #+256] +ldr q24, [x17, #+272] +ldr q28, [x17, #+288] +ldr q29, [x17, #+304] +ldr q27, [x17, #+320] +ldr q26, [x17, #+336] +ldr q25, [x17, #+352] +ldr q16, [x17, #+368] +ldr q9, [x0, #96] +ldr q4, [x0, #112] +ldr q3, [x0, #64] +ldr q14, [x0, #80] +sqrdmulh v17.4S, v9.4S, v24.s[0] +mul v9.4S, v9.4S,v30.s[0] +mla v9.4S, v17.4S, v31.s[0] +sub v17.4s, v3.4s, v9.4s +add v3.4s, v3.4s, v9.4s +sqrdmulh v9.4S, v4.4S, v24.s[0] +mul v4.4S, v4.4S,v30.s[0] +mla v4.4S, v9.4S, v31.s[0] +sub v9.4s, v14.4s, v4.4s +add v14.4s, v14.4s, v4.4s +sqrdmulh v4.4S, v14.4S, v24.s[1] +mul v14.4S, v14.4S,v30.s[1] +mla v14.4S, v4.4S, v31.s[0] +sub v4.4s, v3.4s, v14.4s +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v9.4S, v24.s[2] +mul v9.4S, v9.4S,v30.s[2] +mla v9.4S, v14.4S, v31.s[0] +sub v14.4s, v17.4s, v9.4s +add v17.4s, v17.4s, v9.4s +trn1 v9.4S, v3.4S, v4.4S +trn2 v2.4S, v3.4S, v4.4S +trn1 v7.4S, v17.4S, v14.4S +trn2 v10.4S, v17.4S, v14.4S +trn2 v17.2D, v9.2D, v7.2D +trn2 v14.2D, 
v2.2D, v10.2D +trn1 v3.2D, v9.2D, v7.2D +trn1 v4.2D, v2.2D, v10.2D +sqrdmulh v10.4S, v17.4S, v29.4S +mul v17.4S, v17.4S,v28.4S +mla v17.4S, v10.4S, v31.s[0] +sub v10.4s, v3.4s, v17.4s +add v3.4s, v3.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v29.4S +mul v14.4S, v14.4S,v28.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v4.4s, v14.4s +add v4.4s, v4.4s, v14.4s +sqrdmulh v14.4S, v4.4S, v26.4S +mul v4.4S, v4.4S,v27.4S +mla v4.4S, v14.4S, v31.s[0] +sub v14.4s, v3.4s, v4.4s +add v3.4s, v3.4s, v4.4s +sqrdmulh v4.4S, v17.4S, v16.4S +mul v17.4S, v17.4S,v25.4S +mla v17.4S, v4.4S, v31.s[0] +sub v4.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +str q3, [x0, #64] +str q14, [x0, #80] +str q10, [x0, #96] +str q4, [x0, #112] +ldr q4, [x17, #+384] +ldr q10, [x17, #+400] +ldr q14, [x17, #+416] +ldr q3, [x17, #+432] +ldr q17, [x17, #+448] +ldr q2, [x17, #+464] +ldr q7, [x17, #+480] +ldr q9, [x17, #+496] +ldr q16, [x0, #160] +ldr q25, [x0, #176] +ldr q26, [x0, #128] +ldr q27, [x0, #144] +sqrdmulh v29.4S, v16.4S, v10.s[0] +mul v16.4S, v16.4S,v4.s[0] +mla v16.4S, v29.4S, v31.s[0] +sub v29.4s, v26.4s, v16.4s +add v26.4s, v26.4s, v16.4s +sqrdmulh v16.4S, v25.4S, v10.s[0] +mul v25.4S, v25.4S,v4.s[0] +mla v25.4S, v16.4S, v31.s[0] +sub v16.4s, v27.4s, v25.4s +add v27.4s, v27.4s, v25.4s +sqrdmulh v25.4S, v27.4S, v10.s[1] +mul v27.4S, v27.4S,v4.s[1] +mla v27.4S, v25.4S, v31.s[0] +sub v25.4s, v26.4s, v27.4s +add v26.4s, v26.4s, v27.4s +sqrdmulh v27.4S, v16.4S, v10.s[2] +mul v16.4S, v16.4S,v4.s[2] +mla v16.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v16.4s +add v29.4s, v29.4s, v16.4s +trn1 v16.4S, v26.4S, v25.4S +trn2 v28.4S, v26.4S, v25.4S +trn1 v24.4S, v29.4S, v27.4S +trn2 v30.4S, v29.4S, v27.4S +trn2 v29.2D, v16.2D, v24.2D +trn2 v27.2D, v28.2D, v30.2D +trn1 v26.2D, v16.2D, v24.2D +trn1 v25.2D, v28.2D, v30.2D +sqrdmulh v30.4S, v29.4S, v3.4S +mul v29.4S, v29.4S,v14.4S +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v26.4s, v29.4s +add v26.4s, v26.4s, v29.4s +sqrdmulh v29.4S, v27.4S, v3.4S +mul v27.4S, 
v27.4S,v14.4S +mla v27.4S, v29.4S, v31.s[0] +sub v29.4s, v25.4s, v27.4s +add v25.4s, v25.4s, v27.4s +sqrdmulh v27.4S, v25.4S, v2.4S +mul v25.4S, v25.4S,v17.4S +mla v25.4S, v27.4S, v31.s[0] +sub v27.4s, v26.4s, v25.4s +add v26.4s, v26.4s, v25.4s +sqrdmulh v25.4S, v29.4S, v9.4S +mul v29.4S, v29.4S,v7.4S +mla v29.4S, v25.4S, v31.s[0] +sub v25.4s, v30.4s, v29.4s +add v30.4s, v30.4s, v29.4s +str q26, [x0, #128] +str q27, [x0, #144] +str q30, [x0, #160] +str q25, [x0, #176] +ldr q25, [x17, #+512] +ldr q30, [x17, #+528] +ldr q27, [x17, #+544] +ldr q26, [x17, #+560] +ldr q29, [x17, #+576] +ldr q28, [x17, #+592] +ldr q24, [x17, #+608] +ldr q16, [x17, #+624] +ldr q9, [x0, #224] +ldr q7, [x0, #240] +ldr q2, [x0, #192] +ldr q17, [x0, #208] +sqrdmulh v3.4S, v9.4S, v30.s[0] +mul v9.4S, v9.4S,v25.s[0] +mla v9.4S, v3.4S, v31.s[0] +sub v3.4s, v2.4s, v9.4s +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v7.4S, v30.s[0] +mul v7.4S, v7.4S,v25.s[0] +mla v7.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v7.4s +add v17.4s, v17.4s, v7.4s +sqrdmulh v7.4S, v17.4S, v30.s[1] +mul v17.4S, v17.4S,v25.s[1] +mla v17.4S, v7.4S, v31.s[0] +sub v7.4s, v2.4s, v17.4s +add v2.4s, v2.4s, v17.4s +sqrdmulh v17.4S, v9.4S, v30.s[2] +mul v9.4S, v9.4S,v25.s[2] +mla v9.4S, v17.4S, v31.s[0] +sub v17.4s, v3.4s, v9.4s +add v3.4s, v3.4s, v9.4s +trn1 v9.4S, v2.4S, v7.4S +trn2 v14.4S, v2.4S, v7.4S +trn1 v10.4S, v3.4S, v17.4S +trn2 v4.4S, v3.4S, v17.4S +trn2 v3.2D, v9.2D, v10.2D +trn2 v17.2D, v14.2D, v4.2D +trn1 v2.2D, v9.2D, v10.2D +trn1 v7.2D, v14.2D, v4.2D +sqrdmulh v4.4S, v3.4S, v26.4S +mul v3.4S, v3.4S,v27.4S +mla v3.4S, v4.4S, v31.s[0] +sub v4.4s, v2.4s, v3.4s +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v17.4S, v26.4S +mul v17.4S, v17.4S,v27.4S +mla v17.4S, v3.4S, v31.s[0] +sub v3.4s, v7.4s, v17.4s +add v7.4s, v7.4s, v17.4s +sqrdmulh v17.4S, v7.4S, v28.4S +mul v7.4S, v7.4S,v29.4S +mla v7.4S, v17.4S, v31.s[0] +sub v17.4s, v2.4s, v7.4s +add v2.4s, v2.4s, v7.4s +sqrdmulh v7.4S, v3.4S, v16.4S +mul v3.4S, v3.4S,v24.4S +mla v3.4S, 
v7.4S, v31.s[0] +sub v7.4s, v4.4s, v3.4s +add v4.4s, v4.4s, v3.4s +str q2, [x0, #192] +str q17, [x0, #208] +str q4, [x0, #224] +str q7, [x0, #240] +ldr q7, [x17, #+640] +ldr q4, [x17, #+656] +ldr q17, [x17, #+672] +ldr q2, [x17, #+688] +ldr q3, [x17, #+704] +ldr q14, [x17, #+720] +ldr q10, [x17, #+736] +ldr q9, [x17, #+752] +ldr q16, [x0, #288] +ldr q24, [x0, #304] +ldr q28, [x0, #256] +ldr q29, [x0, #272] +sqrdmulh v26.4S, v16.4S, v4.s[0] +mul v16.4S, v16.4S,v7.s[0] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v28.4s, v16.4s +add v28.4s, v28.4s, v16.4s +sqrdmulh v16.4S, v24.4S, v4.s[0] +mul v24.4S, v24.4S,v7.s[0] +mla v24.4S, v16.4S, v31.s[0] +sub v16.4s, v29.4s, v24.4s +add v29.4s, v29.4s, v24.4s +sqrdmulh v24.4S, v29.4S, v4.s[1] +mul v29.4S, v29.4S,v7.s[1] +mla v29.4S, v24.4S, v31.s[0] +sub v24.4s, v28.4s, v29.4s +add v28.4s, v28.4s, v29.4s +sqrdmulh v29.4S, v16.4S, v4.s[2] +mul v16.4S, v16.4S,v7.s[2] +mla v16.4S, v29.4S, v31.s[0] +sub v29.4s, v26.4s, v16.4s +add v26.4s, v26.4s, v16.4s +trn1 v16.4S, v28.4S, v24.4S +trn2 v27.4S, v28.4S, v24.4S +trn1 v30.4S, v26.4S, v29.4S +trn2 v25.4S, v26.4S, v29.4S +trn2 v26.2D, v16.2D, v30.2D +trn2 v29.2D, v27.2D, v25.2D +trn1 v28.2D, v16.2D, v30.2D +trn1 v24.2D, v27.2D, v25.2D +sqrdmulh v25.4S, v26.4S, v2.4S +mul v26.4S, v26.4S,v17.4S +mla v26.4S, v25.4S, v31.s[0] +sub v25.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sqrdmulh v26.4S, v29.4S, v2.4S +mul v29.4S, v29.4S,v17.4S +mla v29.4S, v26.4S, v31.s[0] +sub v26.4s, v24.4s, v29.4s +add v24.4s, v24.4s, v29.4s +sqrdmulh v29.4S, v24.4S, v14.4S +mul v24.4S, v24.4S,v3.4S +mla v24.4S, v29.4S, v31.s[0] +sub v29.4s, v28.4s, v24.4s +add v28.4s, v28.4s, v24.4s +sqrdmulh v24.4S, v26.4S, v9.4S +mul v26.4S, v26.4S,v10.4S +mla v26.4S, v24.4S, v31.s[0] +sub v24.4s, v25.4s, v26.4s +add v25.4s, v25.4s, v26.4s +str q28, [x0, #256] +str q29, [x0, #272] +str q25, [x0, #288] +str q24, [x0, #304] +ldr q24, [x17, #+768] +ldr q25, [x17, #+784] +ldr q29, [x17, #+800] +ldr q28, [x17, #+816] +ldr q26, 
[x17, #+832] +ldr q27, [x17, #+848] +ldr q30, [x17, #+864] +ldr q16, [x17, #+880] +ldr q9, [x0, #352] +ldr q10, [x0, #368] +ldr q14, [x0, #320] +ldr q3, [x0, #336] +sqrdmulh v2.4S, v9.4S, v25.s[0] +mul v9.4S, v9.4S,v24.s[0] +mla v9.4S, v2.4S, v31.s[0] +sub v2.4s, v14.4s, v9.4s +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v24.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v3.4s, v10.4s +add v3.4s, v3.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v25.s[1] +mul v3.4S, v3.4S,v24.s[1] +mla v3.4S, v10.4S, v31.s[0] +sub v10.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v25.s[2] +mul v9.4S, v9.4S,v24.s[2] +mla v9.4S, v3.4S, v31.s[0] +sub v3.4s, v2.4s, v9.4s +add v2.4s, v2.4s, v9.4s +trn1 v9.4S, v14.4S, v10.4S +trn2 v17.4S, v14.4S, v10.4S +trn1 v4.4S, v2.4S, v3.4S +trn2 v7.4S, v2.4S, v3.4S +trn2 v2.2D, v9.2D, v4.2D +trn2 v3.2D, v17.2D, v7.2D +trn1 v14.2D, v9.2D, v4.2D +trn1 v10.2D, v17.2D, v7.2D +sqrdmulh v7.4S, v2.4S, v28.4S +mul v2.4S, v2.4S,v29.4S +mla v2.4S, v7.4S, v31.s[0] +sub v7.4s, v14.4s, v2.4s +add v14.4s, v14.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v28.4S +mul v3.4S, v3.4S,v29.4S +mla v3.4S, v2.4S, v31.s[0] +sub v2.4s, v10.4s, v3.4s +add v10.4s, v10.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v27.4S +mul v10.4S, v10.4S,v26.4S +mla v10.4S, v3.4S, v31.s[0] +sub v3.4s, v14.4s, v10.4s +add v14.4s, v14.4s, v10.4s +sqrdmulh v10.4S, v2.4S, v16.4S +mul v2.4S, v2.4S,v30.4S +mla v2.4S, v10.4S, v31.s[0] +sub v10.4s, v7.4s, v2.4s +add v7.4s, v7.4s, v2.4s +str q14, [x0, #320] +str q3, [x0, #336] +str q7, [x0, #352] +str q10, [x0, #368] +ldr q10, [x17, #+896] +ldr q7, [x17, #+912] +ldr q3, [x17, #+928] +ldr q14, [x17, #+944] +ldr q2, [x17, #+960] +ldr q17, [x17, #+976] +ldr q4, [x17, #+992] +ldr q9, [x17, #+1008] +ldr q16, [x0, #416] +ldr q30, [x0, #432] +ldr q27, [x0, #384] +ldr q26, [x0, #400] +sqrdmulh v28.4S, v16.4S, v7.s[0] +mul v16.4S, v16.4S,v10.s[0] +mla v16.4S, v28.4S, v31.s[0] +sub v28.4s, v27.4s, v16.4s +add v27.4s, v27.4s, v16.4s 
+sqrdmulh v16.4S, v30.4S, v7.s[0] +mul v30.4S, v30.4S,v10.s[0] +mla v30.4S, v16.4S, v31.s[0] +sub v16.4s, v26.4s, v30.4s +add v26.4s, v26.4s, v30.4s +sqrdmulh v30.4S, v26.4S, v7.s[1] +mul v26.4S, v26.4S,v10.s[1] +mla v26.4S, v30.4S, v31.s[0] +sub v30.4s, v27.4s, v26.4s +add v27.4s, v27.4s, v26.4s +sqrdmulh v26.4S, v16.4S, v7.s[2] +mul v16.4S, v16.4S,v10.s[2] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v28.4s, v16.4s +add v28.4s, v28.4s, v16.4s +trn1 v16.4S, v27.4S, v30.4S +trn2 v29.4S, v27.4S, v30.4S +trn1 v25.4S, v28.4S, v26.4S +trn2 v24.4S, v28.4S, v26.4S +trn2 v28.2D, v16.2D, v25.2D +trn2 v26.2D, v29.2D, v24.2D +trn1 v27.2D, v16.2D, v25.2D +trn1 v30.2D, v29.2D, v24.2D +sqrdmulh v24.4S, v28.4S, v14.4S +mul v28.4S, v28.4S,v3.4S +mla v28.4S, v24.4S, v31.s[0] +sub v24.4s, v27.4s, v28.4s +add v27.4s, v27.4s, v28.4s +sqrdmulh v28.4S, v26.4S, v14.4S +mul v26.4S, v26.4S,v3.4S +mla v26.4S, v28.4S, v31.s[0] +sub v28.4s, v30.4s, v26.4s +add v30.4s, v30.4s, v26.4s +sqrdmulh v26.4S, v30.4S, v17.4S +mul v30.4S, v30.4S,v2.4S +mla v30.4S, v26.4S, v31.s[0] +sub v26.4s, v27.4s, v30.4s +add v27.4s, v27.4s, v30.4s +sqrdmulh v30.4S, v28.4S, v9.4S +mul v28.4S, v28.4S,v4.4S +mla v28.4S, v30.4S, v31.s[0] +sub v30.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +str q27, [x0, #384] +str q26, [x0, #400] +str q24, [x0, #416] +str q30, [x0, #432] +ldr q30, [x17, #+1024] +ldr q24, [x17, #+1040] +ldr q26, [x17, #+1056] +ldr q27, [x17, #+1072] +ldr q28, [x17, #+1088] +ldr q29, [x17, #+1104] +ldr q25, [x17, #+1120] +ldr q16, [x17, #+1136] +ldr q9, [x0, #480] +ldr q4, [x0, #496] +ldr q17, [x0, #448] +ldr q2, [x0, #464] +sqrdmulh v14.4S, v9.4S, v24.s[0] +mul v9.4S, v9.4S,v30.s[0] +mla v9.4S, v14.4S, v31.s[0] +sub v14.4s, v17.4s, v9.4s +add v17.4s, v17.4s, v9.4s +sqrdmulh v9.4S, v4.4S, v24.s[0] +mul v4.4S, v4.4S,v30.s[0] +mla v4.4S, v9.4S, v31.s[0] +sub v9.4s, v2.4s, v4.4s +add v2.4s, v2.4s, v4.4s +sqrdmulh v4.4S, v2.4S, v24.s[1] +mul v2.4S, v2.4S,v30.s[1] +mla v2.4S, v4.4S, v31.s[0] +sub v4.4s, 
v17.4s, v2.4s +add v17.4s, v17.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v24.s[2] +mul v9.4S, v9.4S,v30.s[2] +mla v9.4S, v2.4S, v31.s[0] +sub v2.4s, v14.4s, v9.4s +add v14.4s, v14.4s, v9.4s +trn1 v9.4S, v17.4S, v4.4S +trn2 v3.4S, v17.4S, v4.4S +trn1 v7.4S, v14.4S, v2.4S +trn2 v10.4S, v14.4S, v2.4S +trn2 v14.2D, v9.2D, v7.2D +trn2 v2.2D, v3.2D, v10.2D +trn1 v17.2D, v9.2D, v7.2D +trn1 v4.2D, v3.2D, v10.2D +sqrdmulh v10.4S, v14.4S, v27.4S +mul v14.4S, v14.4S,v26.4S +mla v14.4S, v10.4S, v31.s[0] +sub v10.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v27.4S +mul v2.4S, v2.4S,v26.4S +mla v2.4S, v14.4S, v31.s[0] +sub v14.4s, v4.4s, v2.4s +add v4.4s, v4.4s, v2.4s +sqrdmulh v2.4S, v4.4S, v29.4S +mul v4.4S, v4.4S,v28.4S +mla v4.4S, v2.4S, v31.s[0] +sub v2.4s, v17.4s, v4.4s +add v17.4s, v17.4s, v4.4s +sqrdmulh v4.4S, v14.4S, v16.4S +mul v14.4S, v14.4S,v25.4S +mla v14.4S, v4.4S, v31.s[0] +sub v4.4s, v10.4s, v14.4s +add v10.4s, v10.4s, v14.4s +str q17, [x0, #448] +str q2, [x0, #464] +str q10, [x0, #480] +str q4, [x0, #496] +ldr q4, [x17, #+1152] +ldr q10, [x17, #+1168] +ldr q2, [x17, #+1184] +ldr q17, [x17, #+1200] +ldr q14, [x17, #+1216] +ldr q3, [x17, #+1232] +ldr q7, [x17, #+1248] +ldr q9, [x17, #+1264] +ldr q16, [x0, #544] +ldr q25, [x0, #560] +ldr q29, [x0, #512] +ldr q28, [x0, #528] +sqrdmulh v27.4S, v16.4S, v10.s[0] +mul v16.4S, v16.4S,v4.s[0] +mla v16.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v16.4s +add v29.4s, v29.4s, v16.4s +sqrdmulh v16.4S, v25.4S, v10.s[0] +mul v25.4S, v25.4S,v4.s[0] +mla v25.4S, v16.4S, v31.s[0] +sub v16.4s, v28.4s, v25.4s +add v28.4s, v28.4s, v25.4s +sqrdmulh v25.4S, v28.4S, v10.s[1] +mul v28.4S, v28.4S,v4.s[1] +mla v28.4S, v25.4S, v31.s[0] +sub v25.4s, v29.4s, v28.4s +add v29.4s, v29.4s, v28.4s +sqrdmulh v28.4S, v16.4S, v10.s[2] +mul v16.4S, v16.4S,v4.s[2] +mla v16.4S, v28.4S, v31.s[0] +sub v28.4s, v27.4s, v16.4s +add v27.4s, v27.4s, v16.4s +trn1 v16.4S, v29.4S, v25.4S +trn2 v26.4S, v29.4S, v25.4S +trn1 v24.4S, v27.4S, v28.4S 
+trn2 v30.4S, v27.4S, v28.4S +trn2 v27.2D, v16.2D, v24.2D +trn2 v28.2D, v26.2D, v30.2D +trn1 v29.2D, v16.2D, v24.2D +trn1 v25.2D, v26.2D, v30.2D +sqrdmulh v30.4S, v27.4S, v17.4S +mul v27.4S, v27.4S,v2.4S +mla v27.4S, v30.4S, v31.s[0] +sub v30.4s, v29.4s, v27.4s +add v29.4s, v29.4s, v27.4s +sqrdmulh v27.4S, v28.4S, v17.4S +mul v28.4S, v28.4S,v2.4S +mla v28.4S, v27.4S, v31.s[0] +sub v27.4s, v25.4s, v28.4s +add v25.4s, v25.4s, v28.4s +sqrdmulh v28.4S, v25.4S, v3.4S +mul v25.4S, v25.4S,v14.4S +mla v25.4S, v28.4S, v31.s[0] +sub v28.4s, v29.4s, v25.4s +add v29.4s, v29.4s, v25.4s +sqrdmulh v25.4S, v27.4S, v9.4S +mul v27.4S, v27.4S,v7.4S +mla v27.4S, v25.4S, v31.s[0] +sub v25.4s, v30.4s, v27.4s +add v30.4s, v30.4s, v27.4s +str q29, [x0, #512] +str q28, [x0, #528] +str q30, [x0, #544] +str q25, [x0, #560] +ldr q25, [x17, #+1280] +ldr q30, [x17, #+1296] +ldr q28, [x17, #+1312] +ldr q29, [x17, #+1328] +ldr q27, [x17, #+1344] +ldr q26, [x17, #+1360] +ldr q24, [x17, #+1376] +ldr q16, [x17, #+1392] +ldr q9, [x0, #608] +ldr q7, [x0, #624] +ldr q3, [x0, #576] +ldr q14, [x0, #592] +sqrdmulh v17.4S, v9.4S, v30.s[0] +mul v9.4S, v9.4S,v25.s[0] +mla v9.4S, v17.4S, v31.s[0] +sub v17.4s, v3.4s, v9.4s +add v3.4s, v3.4s, v9.4s +sqrdmulh v9.4S, v7.4S, v30.s[0] +mul v7.4S, v7.4S,v25.s[0] +mla v7.4S, v9.4S, v31.s[0] +sub v9.4s, v14.4s, v7.4s +add v14.4s, v14.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v30.s[1] +mul v14.4S, v14.4S,v25.s[1] +mla v14.4S, v7.4S, v31.s[0] +sub v7.4s, v3.4s, v14.4s +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v9.4S, v30.s[2] +mul v9.4S, v9.4S,v25.s[2] +mla v9.4S, v14.4S, v31.s[0] +sub v14.4s, v17.4s, v9.4s +add v17.4s, v17.4s, v9.4s +trn1 v9.4S, v3.4S, v7.4S +trn2 v2.4S, v3.4S, v7.4S +trn1 v10.4S, v17.4S, v14.4S +trn2 v4.4S, v17.4S, v14.4S +trn2 v17.2D, v9.2D, v10.2D +trn2 v14.2D, v2.2D, v4.2D +trn1 v3.2D, v9.2D, v10.2D +trn1 v7.2D, v2.2D, v4.2D +sqrdmulh v4.4S, v17.4S, v29.4S +mul v17.4S, v17.4S,v28.4S +mla v17.4S, v4.4S, v31.s[0] +sub v4.4s, v3.4s, v17.4s +add v3.4s, 
v3.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v29.4S +mul v14.4S, v14.4S,v28.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v7.4s, v14.4s +add v7.4s, v7.4s, v14.4s +sqrdmulh v14.4S, v7.4S, v26.4S +mul v7.4S, v7.4S,v27.4S +mla v7.4S, v14.4S, v31.s[0] +sub v14.4s, v3.4s, v7.4s +add v3.4s, v3.4s, v7.4s +sqrdmulh v7.4S, v17.4S, v16.4S +mul v17.4S, v17.4S,v24.4S +mla v17.4S, v7.4S, v31.s[0] +sub v7.4s, v4.4s, v17.4s +add v4.4s, v4.4s, v17.4s +str q3, [x0, #576] +str q14, [x0, #592] +str q4, [x0, #608] +str q7, [x0, #624] +ldr q7, [x17, #+1408] +ldr q4, [x17, #+1424] +ldr q14, [x17, #+1440] +ldr q3, [x17, #+1456] +ldr q17, [x17, #+1472] +ldr q2, [x17, #+1488] +ldr q10, [x17, #+1504] +ldr q9, [x17, #+1520] +ldr q16, [x0, #672] +ldr q24, [x0, #688] +ldr q26, [x0, #640] +ldr q27, [x0, #656] +sqrdmulh v29.4S, v16.4S, v4.s[0] +mul v16.4S, v16.4S,v7.s[0] +mla v16.4S, v29.4S, v31.s[0] +sub v29.4s, v26.4s, v16.4s +add v26.4s, v26.4s, v16.4s +sqrdmulh v16.4S, v24.4S, v4.s[0] +mul v24.4S, v24.4S,v7.s[0] +mla v24.4S, v16.4S, v31.s[0] +sub v16.4s, v27.4s, v24.4s +add v27.4s, v27.4s, v24.4s +sqrdmulh v24.4S, v27.4S, v4.s[1] +mul v27.4S, v27.4S,v7.s[1] +mla v27.4S, v24.4S, v31.s[0] +sub v24.4s, v26.4s, v27.4s +add v26.4s, v26.4s, v27.4s +sqrdmulh v27.4S, v16.4S, v4.s[2] +mul v16.4S, v16.4S,v7.s[2] +mla v16.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v16.4s +add v29.4s, v29.4s, v16.4s +trn1 v16.4S, v26.4S, v24.4S +trn2 v28.4S, v26.4S, v24.4S +trn1 v30.4S, v29.4S, v27.4S +trn2 v25.4S, v29.4S, v27.4S +trn2 v29.2D, v16.2D, v30.2D +trn2 v27.2D, v28.2D, v25.2D +trn1 v26.2D, v16.2D, v30.2D +trn1 v24.2D, v28.2D, v25.2D +sqrdmulh v25.4S, v29.4S, v3.4S +mul v29.4S, v29.4S,v14.4S +mla v29.4S, v25.4S, v31.s[0] +sub v25.4s, v26.4s, v29.4s +add v26.4s, v26.4s, v29.4s +sqrdmulh v29.4S, v27.4S, v3.4S +mul v27.4S, v27.4S,v14.4S +mla v27.4S, v29.4S, v31.s[0] +sub v29.4s, v24.4s, v27.4s +add v24.4s, v24.4s, v27.4s +sqrdmulh v27.4S, v24.4S, v2.4S +mul v24.4S, v24.4S,v17.4S +mla v24.4S, v27.4S, v31.s[0] +sub 
v27.4s, v26.4s, v24.4s +add v26.4s, v26.4s, v24.4s +sqrdmulh v24.4S, v29.4S, v9.4S +mul v29.4S, v29.4S,v10.4S +mla v29.4S, v24.4S, v31.s[0] +sub v24.4s, v25.4s, v29.4s +add v25.4s, v25.4s, v29.4s +str q26, [x0, #640] +str q27, [x0, #656] +str q25, [x0, #672] +str q24, [x0, #688] +ldr q24, [x17, #+1536] +ldr q25, [x17, #+1552] +ldr q27, [x17, #+1568] +ldr q26, [x17, #+1584] +ldr q29, [x17, #+1600] +ldr q28, [x17, #+1616] +ldr q30, [x17, #+1632] +ldr q16, [x17, #+1648] +ldr q9, [x0, #736] +ldr q10, [x0, #752] +ldr q2, [x0, #704] +ldr q17, [x0, #720] +sqrdmulh v3.4S, v9.4S, v25.s[0] +mul v9.4S, v9.4S,v24.s[0] +mla v9.4S, v3.4S, v31.s[0] +sub v3.4s, v2.4s, v9.4s +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v24.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v10.4s +add v17.4s, v17.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v25.s[1] +mul v17.4S, v17.4S,v24.s[1] +mla v17.4S, v10.4S, v31.s[0] +sub v10.4s, v2.4s, v17.4s +add v2.4s, v2.4s, v17.4s +sqrdmulh v17.4S, v9.4S, v25.s[2] +mul v9.4S, v9.4S,v24.s[2] +mla v9.4S, v17.4S, v31.s[0] +sub v17.4s, v3.4s, v9.4s +add v3.4s, v3.4s, v9.4s +trn1 v9.4S, v2.4S, v10.4S +trn2 v14.4S, v2.4S, v10.4S +trn1 v4.4S, v3.4S, v17.4S +trn2 v7.4S, v3.4S, v17.4S +trn2 v3.2D, v9.2D, v4.2D +trn2 v17.2D, v14.2D, v7.2D +trn1 v2.2D, v9.2D, v4.2D +trn1 v10.2D, v14.2D, v7.2D +sqrdmulh v7.4S, v3.4S, v26.4S +mul v3.4S, v3.4S,v27.4S +mla v3.4S, v7.4S, v31.s[0] +sub v7.4s, v2.4s, v3.4s +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v17.4S, v26.4S +mul v17.4S, v17.4S,v27.4S +mla v17.4S, v3.4S, v31.s[0] +sub v3.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v10.4S, v28.4S +mul v10.4S, v10.4S,v29.4S +mla v10.4S, v17.4S, v31.s[0] +sub v17.4s, v2.4s, v10.4s +add v2.4s, v2.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v16.4S +mul v3.4S, v3.4S,v30.4S +mla v3.4S, v10.4S, v31.s[0] +sub v10.4s, v7.4s, v3.4s +add v7.4s, v7.4s, v3.4s +str q2, [x0, #704] +str q17, [x0, #720] +str q7, [x0, #736] +str q10, [x0, #752] +ldr q10, [x17, 
#+1664] +ldr q7, [x17, #+1680] +ldr q17, [x17, #+1696] +ldr q2, [x17, #+1712] +ldr q3, [x17, #+1728] +ldr q14, [x17, #+1744] +ldr q4, [x17, #+1760] +ldr q9, [x17, #+1776] +ldr q16, [x0, #800] +ldr q30, [x0, #816] +ldr q28, [x0, #768] +ldr q29, [x0, #784] +sqrdmulh v26.4S, v16.4S, v7.s[0] +mul v16.4S, v16.4S,v10.s[0] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v28.4s, v16.4s +add v28.4s, v28.4s, v16.4s +sqrdmulh v16.4S, v30.4S, v7.s[0] +mul v30.4S, v30.4S,v10.s[0] +mla v30.4S, v16.4S, v31.s[0] +sub v16.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +sqrdmulh v30.4S, v29.4S, v7.s[1] +mul v29.4S, v29.4S,v10.s[1] +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v28.4s, v29.4s +add v28.4s, v28.4s, v29.4s +sqrdmulh v29.4S, v16.4S, v7.s[2] +mul v16.4S, v16.4S,v10.s[2] +mla v16.4S, v29.4S, v31.s[0] +sub v29.4s, v26.4s, v16.4s +add v26.4s, v26.4s, v16.4s +trn1 v16.4S, v28.4S, v30.4S +trn2 v27.4S, v28.4S, v30.4S +trn1 v25.4S, v26.4S, v29.4S +trn2 v24.4S, v26.4S, v29.4S +trn2 v26.2D, v16.2D, v25.2D +trn2 v29.2D, v27.2D, v24.2D +trn1 v28.2D, v16.2D, v25.2D +trn1 v30.2D, v27.2D, v24.2D +sqrdmulh v24.4S, v26.4S, v2.4S +mul v26.4S, v26.4S,v17.4S +mla v26.4S, v24.4S, v31.s[0] +sub v24.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sqrdmulh v26.4S, v29.4S, v2.4S +mul v29.4S, v29.4S,v17.4S +mla v29.4S, v26.4S, v31.s[0] +sub v26.4s, v30.4s, v29.4s +add v30.4s, v30.4s, v29.4s +sqrdmulh v29.4S, v30.4S, v14.4S +mul v30.4S, v30.4S,v3.4S +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v28.4s, v30.4s +add v28.4s, v28.4s, v30.4s +sqrdmulh v30.4S, v26.4S, v9.4S +mul v26.4S, v26.4S,v4.4S +mla v26.4S, v30.4S, v31.s[0] +sub v30.4s, v24.4s, v26.4s +add v24.4s, v24.4s, v26.4s +str q28, [x0, #768] +str q29, [x0, #784] +str q24, [x0, #800] +str q30, [x0, #816] +ldr q30, [x17, #+1792] +ldr q24, [x17, #+1808] +ldr q29, [x17, #+1824] +ldr q28, [x17, #+1840] +ldr q26, [x17, #+1856] +ldr q27, [x17, #+1872] +ldr q25, [x17, #+1888] +ldr q16, [x17, #+1904] +ldr q9, [x0, #864] +ldr q4, [x0, #880] +ldr q14, [x0, #832] 
+ldr q3, [x0, #848] +sqrdmulh v2.4S, v9.4S, v24.s[0] +mul v9.4S, v9.4S,v30.s[0] +mla v9.4S, v2.4S, v31.s[0] +sub v2.4s, v14.4s, v9.4s +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v4.4S, v24.s[0] +mul v4.4S, v4.4S,v30.s[0] +mla v4.4S, v9.4S, v31.s[0] +sub v9.4s, v3.4s, v4.4s +add v3.4s, v3.4s, v4.4s +sqrdmulh v4.4S, v3.4S, v24.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v3.4S, v4.4S, v31.s[0] +sub v4.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v24.s[2] +mul v9.4S, v9.4S,v30.s[2] +mla v9.4S, v3.4S, v31.s[0] +sub v3.4s, v2.4s, v9.4s +add v2.4s, v2.4s, v9.4s +trn1 v9.4S, v14.4S, v4.4S +trn2 v17.4S, v14.4S, v4.4S +trn1 v7.4S, v2.4S, v3.4S +trn2 v10.4S, v2.4S, v3.4S +trn2 v2.2D, v9.2D, v7.2D +trn2 v3.2D, v17.2D, v10.2D +trn1 v14.2D, v9.2D, v7.2D +trn1 v4.2D, v17.2D, v10.2D +sqrdmulh v10.4S, v2.4S, v28.4S +mul v2.4S, v2.4S,v29.4S +mla v2.4S, v10.4S, v31.s[0] +sub v10.4s, v14.4s, v2.4s +add v14.4s, v14.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v28.4S +mul v3.4S, v3.4S,v29.4S +mla v3.4S, v2.4S, v31.s[0] +sub v2.4s, v4.4s, v3.4s +add v4.4s, v4.4s, v3.4s +sqrdmulh v3.4S, v4.4S, v27.4S +mul v4.4S, v4.4S,v26.4S +mla v4.4S, v3.4S, v31.s[0] +sub v3.4s, v14.4s, v4.4s +add v14.4s, v14.4s, v4.4s +sqrdmulh v4.4S, v2.4S, v16.4S +mul v2.4S, v2.4S,v25.4S +mla v2.4S, v4.4S, v31.s[0] +sub v4.4s, v10.4s, v2.4s +add v10.4s, v10.4s, v2.4s +str q14, [x0, #832] +str q3, [x0, #848] +str q10, [x0, #864] +str q4, [x0, #880] +ldr q4, [x17, #+1920] +ldr q10, [x17, #+1936] +ldr q3, [x17, #+1952] +ldr q14, [x17, #+1968] +ldr q2, [x17, #+1984] +ldr q17, [x17, #+2000] +ldr q7, [x17, #+2016] +ldr q9, [x17, #+2032] +ldr q16, [x0, #928] +ldr q25, [x0, #944] +ldr q27, [x0, #896] +ldr q26, [x0, #912] +sqrdmulh v28.4S, v16.4S, v10.s[0] +mul v16.4S, v16.4S,v4.s[0] +mla v16.4S, v28.4S, v31.s[0] +sub v28.4s, v27.4s, v16.4s +add v27.4s, v27.4s, v16.4s +sqrdmulh v16.4S, v25.4S, v10.s[0] +mul v25.4S, v25.4S,v4.s[0] +mla v25.4S, v16.4S, v31.s[0] +sub v16.4s, v26.4s, v25.4s +add v26.4s, v26.4s, v25.4s 
+sqrdmulh v25.4S, v26.4S, v10.s[1] +mul v26.4S, v26.4S,v4.s[1] +mla v26.4S, v25.4S, v31.s[0] +sub v25.4s, v27.4s, v26.4s +add v27.4s, v27.4s, v26.4s +sqrdmulh v26.4S, v16.4S, v10.s[2] +mul v16.4S, v16.4S,v4.s[2] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v28.4s, v16.4s +add v28.4s, v28.4s, v16.4s +trn1 v16.4S, v27.4S, v25.4S +trn2 v29.4S, v27.4S, v25.4S +trn1 v24.4S, v28.4S, v26.4S +trn2 v30.4S, v28.4S, v26.4S +trn2 v28.2D, v16.2D, v24.2D +trn2 v26.2D, v29.2D, v30.2D +trn1 v27.2D, v16.2D, v24.2D +trn1 v25.2D, v29.2D, v30.2D +sqrdmulh v30.4S, v28.4S, v14.4S +mul v28.4S, v28.4S,v3.4S +mla v28.4S, v30.4S, v31.s[0] +sub v30.4s, v27.4s, v28.4s +add v27.4s, v27.4s, v28.4s +sqrdmulh v28.4S, v26.4S, v14.4S +mul v26.4S, v26.4S,v3.4S +mla v26.4S, v28.4S, v31.s[0] +sub v28.4s, v25.4s, v26.4s +add v25.4s, v25.4s, v26.4s +sqrdmulh v26.4S, v25.4S, v17.4S +mul v25.4S, v25.4S,v2.4S +mla v25.4S, v26.4S, v31.s[0] +sub v26.4s, v27.4s, v25.4s +add v27.4s, v27.4s, v25.4s +sqrdmulh v25.4S, v28.4S, v9.4S +mul v28.4S, v28.4S,v7.4S +mla v28.4S, v25.4S, v31.s[0] +sub v25.4s, v30.4s, v28.4s +add v30.4s, v30.4s, v28.4s +str q27, [x0, #896] +str q26, [x0, #912] +str q30, [x0, #928] +str q25, [x0, #944] +ldr q25, [x17, #+2048] +ldr q30, [x17, #+2064] +ldr q26, [x17, #+2080] +ldr q27, [x17, #+2096] +ldr q28, [x17, #+2112] +ldr q29, [x17, #+2128] +ldr q24, [x17, #+2144] +ldr q16, [x17, #+2160] +ldr q9, [x0, #992] +ldr q7, [x0, #1008] +ldr q17, [x0, #960] +ldr q2, [x0, #976] +sqrdmulh v14.4S, v9.4S, v30.s[0] +mul v9.4S, v9.4S,v25.s[0] +mla v9.4S, v14.4S, v31.s[0] +sub v14.4s, v17.4s, v9.4s +add v17.4s, v17.4s, v9.4s +sqrdmulh v9.4S, v7.4S, v30.s[0] +mul v7.4S, v7.4S,v25.s[0] +mla v7.4S, v9.4S, v31.s[0] +sub v9.4s, v2.4s, v7.4s +add v2.4s, v2.4s, v7.4s +sqrdmulh v7.4S, v2.4S, v30.s[1] +mul v2.4S, v2.4S,v25.s[1] +mla v2.4S, v7.4S, v31.s[0] +sub v7.4s, v17.4s, v2.4s +add v17.4s, v17.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v30.s[2] +mul v9.4S, v9.4S,v25.s[2] +mla v9.4S, v2.4S, v31.s[0] +sub v2.4s, v14.4s, 
v9.4s +add v14.4s, v14.4s, v9.4s +trn1 v9.4S, v17.4S, v7.4S +trn2 v3.4S, v17.4S, v7.4S +trn1 v10.4S, v14.4S, v2.4S +trn2 v4.4S, v14.4S, v2.4S +trn2 v14.2D, v9.2D, v10.2D +trn2 v2.2D, v3.2D, v4.2D +trn1 v17.2D, v9.2D, v10.2D +trn1 v7.2D, v3.2D, v4.2D +sqrdmulh v4.4S, v14.4S, v27.4S +mul v14.4S, v14.4S,v26.4S +mla v14.4S, v4.4S, v31.s[0] +sub v4.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v27.4S +mul v2.4S, v2.4S,v26.4S +mla v2.4S, v14.4S, v31.s[0] +sub v14.4s, v7.4s, v2.4s +add v7.4s, v7.4s, v2.4s +sqrdmulh v2.4S, v7.4S, v29.4S +mul v7.4S, v7.4S,v28.4S +mla v7.4S, v2.4S, v31.s[0] +sub v2.4s, v17.4s, v7.4s +add v17.4s, v17.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v16.4S +mul v14.4S, v14.4S,v24.4S +mla v14.4S, v7.4S, v31.s[0] +sub v7.4s, v4.4s, v14.4s +add v4.4s, v4.4s, v14.4s +str q17, [x0, #960] +str q2, [x0, #976] +str q4, [x0, #992] +str q7, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_9_0.s b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_9_0.s new file mode 100644 index 0000000..df18890 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_full_33556993_28678040_var_4_4_9_0.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, 
distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 // negation of 33556993, the first number in this file's name; NOTE(review): presumably the negated NTT modulus - confirm where `modulus` is loaded +.word 0 +.word 0 +.word 0 // the three zero words pad this entry to four .word values; NOTE(review): presumably so it fills one 128-bit vector register - confirm +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 
// Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 
6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None 
+.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 
2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, 
block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None 
+.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // 
Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 +.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 
3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 
// Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 
113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_9_0 +.global _ntt_u32_full_neon_asm_var_4_4_9_0 +ntt_u32_full_neon_asm_var_4_4_9_0: +_ntt_u32_full_neon_asm_var_4_4_9_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr 
q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #928] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #992] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q18, [x0, #800] +sqrdmulh v17.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +ldr q16, [x0, #864] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v22.4S, v21.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +mla v18.4S, v17.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +ldr q3, [x0, #544] +sqrdmulh v17.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q19, [x0, #608] +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q2, [x0, #672] +ldr q1, [x0, #416] +sqrdmulh v0.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v15.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #736] +ldr q14, [x0, #480] +sqrdmulh v13.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v12.4s, v14.4s, v20.4s +add v14.4s, v14.4s, v20.4s +ldr q20, [x0, #288] +mla v3.4S, v17.4S, v31.s[0] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v18.4s +mla v2.4S, v0.4S, v31.s[0] +mla v22.4S, v13.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +ldr q18, [x0, #352] +sqrdmulh v13.4S, v1.4S, v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v0.4s, v18.4s, v16.4s +sqrdmulh v17.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +add v18.4s, v18.4s, v16.4s +ldr q16, [x0, #32] +sqrdmulh v11.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v10.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #96] +sqrdmulh v9.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v8.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #160] +mla v1.4S, v13.4S, v31.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v2.4s +mla v20.4S, v11.4S, v31.s[0] +mla v18.4S, v9.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +ldr q2, [x0, #224] +sqrdmulh v9.4S, v15.4S, v29.s[2] +mul v15.4S, 
v15.4S,v30.s[2] +sub v11.4s, v2.4s, v22.4s +sqrdmulh v13.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v7.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +sub v6.4s, v2.4s, v14.4s +add v2.4s, v2.4s, v14.4s +mla v15.4S, v9.4S, v31.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v20.4s +mla v21.4S, v22.4S, v31.s[0] +mla v0.4S, v1.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +sub v1.4s, v3.4s, v18.4s +sqrdmulh v22.4S, v6.4S, v27.s[1] +mul v6.4S, v6.4S,v28.s[1] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v27.s[0] +mul v19.4S, v19.4S,v28.s[0] +sub v9.4s, v17.4s, v15.4s +add v17.4s, v17.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v14.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +mla v7.4S, v20.4S, v31.s[0] +mla v6.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v21.4s +mla v19.4S, v18.4S, v31.s[0] +mla v2.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v15.4s, v8.4s, v0.4s +sqrdmulh v18.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +add v8.4s, v8.4s, v0.4s +sqrdmulh v0.4S, v9.4S, v27.s[3] +mul v9.4S, v9.4S,v28.s[3] +sub v20.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[3] +mul v14.4S, v14.4S,v28.s[3] +sub v12.4s, v1.4s, v6.4s +add v1.4s, v1.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v16.4s, v19.4s +mla v9.4S, v0.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v25.s[2] +mul v1.4S, v1.4S,v26.s[2] +sub v7.4s, v3.4s, v2.4s +sqrdmulh v0.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v7.4S, v25.s[1] +mul v7.4S, v7.4S,v26.s[1] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh 
v17.4S, v3.4S, v25.s[0] +mul v3.4S, v3.4S,v26.s[0] +sub v6.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v1.4S, v19.4S, v31.s[0] +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v9.4s +mla v7.4S, v2.4S, v31.s[0] +mla v3.4S, v17.4S, v31.s[0] +add v22.4s, v22.4s, v9.4s +sqrdmulh v9.4S, v8.4S, v23.s[0] +mul v8.4S, v8.4S,v24.s[0] +sub v17.4s, v15.4s, v14.4s +sqrdmulh v2.4S, v6.4S, v23.s[1] +mul v6.4S, v6.4S,v24.s[1] +add v15.4s, v15.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v23.s[2] +mul v15.4S, v15.4S,v24.s[2] +sub v19.4s, v13.4s, v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +sub v11.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +mla v8.4S, v9.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v18.4s, v7.4s +str q13, [x0, #288] +mla v15.4S, v14.4S, v31.s[0] +mla v17.4S, v1.4S, v31.s[0] +add v18.4s, v18.4s, v7.4s +str q19, [x0, #352] +ldr q19, [x0, #944] +sqrdmulh v7.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +str q20, [x0, #416] +sub v20.4s, v16.4s, v3.4s +ldr q1, [x0, #1008] +sqrdmulh v14.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +str q11, [x0, #480] +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #816] +sqrdmulh v11.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +sub v13.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +ldr q8, [x0, #880] +sqrdmulh v9.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v12.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +mla v19.4S, v7.4S, v31.s[0] +mla v1.4S, v14.4S, v31.s[0] +str q18, [x0, #160] +sub v18.4s, v22.4s, v15.4s +mla v3.4S, v11.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +str q2, [x0, #224] +add v22.4s, v22.4s, v15.4s +ldr q15, [x0, #560] +sqrdmulh v2.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +str q16, [x0, #32] +sub v16.4s, v0.4s, v17.4s +ldr q9, [x0, #624] +sqrdmulh v11.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +str q20, [x0, #96] +add v0.4s, v0.4s, v17.4s +ldr q17, [x0, #688] +ldr q20, [x0, #432] +sqrdmulh v14.4S, v17.4S, v29.s[0] +mul v17.4S, 
v17.4S,v30.s[0] +sub v7.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +ldr q19, [x0, #752] +ldr q6, [x0, #496] +sqrdmulh v5.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v4.4s, v6.4s, v1.4s +add v6.4s, v6.4s, v1.4s +ldr q1, [x0, #304] +mla v15.4S, v2.4S, v31.s[0] +mla v9.4S, v11.4S, v31.s[0] +str q10, [x0, #544] +sub v10.4s, v1.4s, v3.4s +mla v17.4S, v14.4S, v31.s[0] +mla v19.4S, v5.4S, v31.s[0] +str q13, [x0, #608] +add v1.4s, v1.4s, v3.4s +ldr q3, [x0, #368] +sqrdmulh v13.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +str q21, [x0, #672] +sub v21.4s, v3.4s, v8.4s +sqrdmulh v5.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +str q12, [x0, #736] +add v3.4s, v3.4s, v8.4s +ldr q8, [x0, #48] +sqrdmulh v12.4S, v1.4S, v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v14.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +ldr q15, [x0, #112] +sqrdmulh v11.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v2.4s, v15.4s, v9.4s +add v15.4s, v15.4s, v9.4s +ldr q9, [x0, #176] +mla v20.4S, v13.4S, v31.s[0] +mla v6.4S, v5.4S, v31.s[0] +str q22, [x0, #800] +sub v22.4s, v9.4s, v17.4s +mla v1.4S, v12.4S, v31.s[0] +mla v3.4S, v11.4S, v31.s[0] +str q18, [x0, #864] +add v9.4s, v9.4s, v17.4s +ldr q17, [x0, #240] +sqrdmulh v18.4S, v7.4S, v29.s[2] +mul v7.4S, v7.4S,v30.s[2] +str q0, [x0, #928] +sub v0.4s, v17.4s, v19.4s +sqrdmulh v11.4S, v4.4S, v29.s[2] +mul v4.4S, v4.4S,v30.s[2] +str q16, [x0, #992] +add v17.4s, v17.4s, v19.4s +sqrdmulh v19.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v16.4s, v9.4s, v20.4s +add v9.4s, v9.4s, v20.4s +sqrdmulh v20.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v12.4s, v17.4s, v6.4s +add v17.4s, v17.4s, v6.4s +mla v7.4S, v18.4S, v31.s[0] +mla v4.4S, v11.4S, v31.s[0] +sub v11.4s, v8.4s, v1.4s +mla v10.4S, v19.4S, v31.s[0] +mla v21.4S, v20.4S, v31.s[0] +add v8.4s, v8.4s, v1.4s +sqrdmulh v1.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +sub v20.4s, v15.4s, v3.4s +sqrdmulh v19.4S, v12.4S, v27.s[1] +mul v12.4S, v12.4S,v28.s[1] +add 
v15.4s, v15.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v27.s[0] +mul v9.4S, v9.4S,v28.s[0] +sub v18.4s, v22.4s, v7.4s +add v22.4s, v22.4s, v7.4s +sqrdmulh v7.4S, v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +sub v6.4s, v0.4s, v4.4s +add v0.4s, v0.4s, v4.4s +mla v16.4S, v1.4S, v31.s[0] +mla v12.4S, v19.4S, v31.s[0] +sub v19.4s, v14.4s, v10.4s +mla v9.4S, v3.4S, v31.s[0] +mla v17.4S, v7.4S, v31.s[0] +add v14.4s, v14.4s, v10.4s +sqrdmulh v10.4S, v22.4S, v27.s[2] +mul v22.4S, v22.4S,v28.s[2] +sub v7.4s, v2.4s, v21.4s +sqrdmulh v3.4S, v0.4S, v27.s[2] +mul v0.4S, v0.4S,v28.s[2] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v1.4s, v11.4s, v16.4s +add v11.4s, v11.4s, v16.4s +sqrdmulh v16.4S, v6.4S, v27.s[3] +mul v6.4S, v6.4S,v28.s[3] +sub v4.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +mla v22.4S, v10.4S, v31.s[0] +mla v0.4S, v3.4S, v31.s[0] +sub v3.4s, v8.4s, v9.4s +mla v18.4S, v21.4S, v31.s[0] +mla v6.4S, v16.4S, v31.s[0] +add v8.4s, v8.4s, v9.4s +sqrdmulh v9.4S, v20.4S, v25.s[2] +mul v20.4S, v20.4S,v26.s[2] +sub v16.4s, v15.4s, v17.4s +sqrdmulh v21.4S, v4.4S, v25.s[3] +mul v4.4S, v4.4S,v26.s[3] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v16.4S, v25.s[1] +mul v16.4S, v16.4S,v26.s[1] +sub v10.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +sqrdmulh v22.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +sub v12.4s, v2.4s, v0.4s +add v2.4s, v2.4s, v0.4s +mla v20.4S, v9.4S, v31.s[0] +mla v4.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v18.4s +mla v16.4S, v17.4S, v31.s[0] +mla v15.4S, v22.4S, v31.s[0] +add v19.4s, v19.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v23.s[0] +mul v2.4S, v2.4S,v24.s[0] +sub v22.4s, v7.4s, v6.4s +sqrdmulh v17.4S, v12.4S, v23.s[1] +mul v12.4S, v12.4S,v24.s[1] +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v7.4S, v23.s[2] +mul v7.4S, v7.4S,v24.s[2] +sub v9.4s, v11.4s, v20.4s +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v22.4S, v23.s[3] +mul v22.4S, v22.4S,v24.s[3] +sub v0.4s, v1.4s, v4.4s +add v1.4s, v1.4s, v4.4s +mla 
v2.4S, v18.4S, v31.s[0] +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v3.4s, v16.4s +str q11, [x0, #304] +mla v7.4S, v6.4S, v31.s[0] +mla v22.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v16.4s +str q9, [x0, #368] +ldr q9, [x0, #896] +sqrdmulh v16.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +str q1, [x0, #432] +sub v1.4s, v8.4s, v15.4s +ldr q20, [x0, #960] +sqrdmulh v6.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +str q0, [x0, #496] +add v8.4s, v8.4s, v15.4s +ldr q15, [x0, #768] +sqrdmulh v0.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +sub v11.4s, v14.4s, v2.4s +add v14.4s, v14.4s, v2.4s +ldr q2, [x0, #832] +sqrdmulh v18.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v4.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +mla v9.4S, v16.4S, v31.s[0] +mla v20.4S, v6.4S, v31.s[0] +str q3, [x0, #176] +sub v3.4s, v19.4s, v7.4s +mla v15.4S, v0.4S, v31.s[0] +mla v2.4S, v18.4S, v31.s[0] +str q17, [x0, #240] +add v19.4s, v19.4s, v7.4s +ldr q7, [x0, #512] +sqrdmulh v17.4S, v7.4S, v29.s[0] +mul v7.4S, v7.4S,v30.s[0] +str q8, [x0, #48] +sub v8.4s, v21.4s, v22.4s +ldr q18, [x0, #576] +sqrdmulh v0.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +str q1, [x0, #112] +add v21.4s, v21.4s, v22.4s +ldr q22, [x0, #640] +ldr q1, [x0, #384] +sqrdmulh v6.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v16.4s, v1.4s, v9.4s +add v1.4s, v1.4s, v9.4s +ldr q9, [x0, #704] +ldr q12, [x0, #448] +sqrdmulh v5.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +sub v13.4s, v12.4s, v20.4s +add v12.4s, v12.4s, v20.4s +ldr q20, [x0, #256] +mla v7.4S, v17.4S, v31.s[0] +mla v18.4S, v0.4S, v31.s[0] +str q14, [x0, #560] +sub v14.4s, v20.4s, v15.4s +mla v22.4S, v6.4S, v31.s[0] +mla v9.4S, v5.4S, v31.s[0] +str q11, [x0, #624] +add v20.4s, v20.4s, v15.4s +ldr q15, [x0, #320] +sqrdmulh v11.4S, v1.4S, v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +str q10, [x0, #688] +sub v10.4s, v15.4s, v2.4s +sqrdmulh v5.4S, v12.4S, v29.s[1] +mul v12.4S, v12.4S,v30.s[1] +str q4, [x0, #752] +add v15.4s, v15.4s, v2.4s +ldr 
q2, [x0, #0] +sqrdmulh v4.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v6.4s, v2.4s, v7.4s +add v2.4s, v2.4s, v7.4s +ldr q7, [x0, #64] +sqrdmulh v0.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +sub v17.4s, v7.4s, v18.4s +add v7.4s, v7.4s, v18.4s +ldr q18, [x0, #128] +mla v1.4S, v11.4S, v31.s[0] +mla v12.4S, v5.4S, v31.s[0] +str q19, [x0, #816] +sub v19.4s, v18.4s, v22.4s +mla v20.4S, v4.4S, v31.s[0] +mla v15.4S, v0.4S, v31.s[0] +str q3, [x0, #880] +add v18.4s, v18.4s, v22.4s +ldr q22, [x0, #192] +sqrdmulh v3.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +str q21, [x0, #944] +sub v21.4s, v22.4s, v9.4s +sqrdmulh v0.4S, v13.4S, v29.s[2] +mul v13.4S, v13.4S,v30.s[2] +str q8, [x0, #1008] +add v22.4s, v22.4s, v9.4s +sqrdmulh v9.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v8.4s, v18.4s, v1.4s +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v4.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +mla v16.4S, v3.4S, v31.s[0] +mla v13.4S, v0.4S, v31.s[0] +sub v0.4s, v2.4s, v20.4s +mla v14.4S, v9.4S, v31.s[0] +mla v10.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v1.4s, v7.4s, v15.4s +sqrdmulh v9.4S, v4.4S, v27.s[1] +mul v4.4S, v4.4S,v28.s[1] +add v7.4s, v7.4s, v15.4s +sqrdmulh v15.4S, v18.4S, v27.s[0] +mul v18.4S, v18.4S,v28.s[0] +sub v3.4s, v19.4s, v16.4s +add v19.4s, v19.4s, v16.4s +sqrdmulh v16.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +sub v12.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +mla v8.4S, v20.4S, v31.s[0] +mla v4.4S, v9.4S, v31.s[0] +sub v9.4s, v6.4s, v14.4s +mla v18.4S, v15.4S, v31.s[0] +mla v22.4S, v16.4S, v31.s[0] +add v6.4s, v6.4s, v14.4s +sqrdmulh v14.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v28.s[2] +sub v16.4s, v17.4s, v10.4s +sqrdmulh v15.4S, v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +add v17.4s, v17.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +sub v20.4s, v0.4s, v8.4s +add 
v0.4s, v0.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v27.s[3] +mul v12.4S, v12.4S,v28.s[3] +sub v13.4s, v1.4s, v4.4s +add v1.4s, v1.4s, v4.4s +mla v19.4S, v14.4S, v31.s[0] +mla v21.4S, v15.4S, v31.s[0] +sub v15.4s, v2.4s, v18.4s +mla v3.4S, v10.4S, v31.s[0] +mla v12.4S, v8.4S, v31.s[0] +add v2.4s, v2.4s, v18.4s +sqrdmulh v18.4S, v1.4S, v25.s[2] +mul v1.4S, v1.4S,v26.s[2] +sub v8.4s, v7.4s, v22.4s +sqrdmulh v10.4S, v13.4S, v25.s[3] +mul v13.4S, v13.4S,v26.s[3] +add v7.4s, v7.4s, v22.4s +sqrdmulh v22.4S, v8.4S, v25.s[1] +mul v8.4S, v8.4S,v26.s[1] +sub v14.4s, v6.4s, v19.4s +add v6.4s, v6.4s, v19.4s +sqrdmulh v19.4S, v7.4S, v25.s[0] +mul v7.4S, v7.4S,v26.s[0] +sub v4.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +mla v1.4S, v18.4S, v31.s[0] +mla v13.4S, v10.4S, v31.s[0] +sub v10.4s, v9.4s, v3.4s +mla v8.4S, v22.4S, v31.s[0] +mla v7.4S, v19.4S, v31.s[0] +add v9.4s, v9.4s, v3.4s +sqrdmulh v3.4S, v17.4S, v23.s[0] +mul v17.4S, v17.4S,v24.s[0] +sub v19.4s, v16.4s, v12.4s +sqrdmulh v22.4S, v4.4S, v23.s[1] +mul v4.4S, v4.4S,v24.s[1] +add v16.4s, v16.4s, v12.4s +sqrdmulh v12.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +sub v18.4s, v0.4s, v1.4s +add v0.4s, v0.4s, v1.4s +sqrdmulh v1.4S, v19.4S, v23.s[3] +mul v19.4S, v19.4S,v24.s[3] +sub v21.4s, v20.4s, v13.4s +add v20.4s, v20.4s, v13.4s +mla v17.4S, v3.4S, v31.s[0] +mla v4.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v8.4s +str q0, [x0, #256] +mla v16.4S, v12.4S, v31.s[0] +mla v19.4S, v1.4S, v31.s[0] +add v15.4s, v15.4s, v8.4s +str q18, [x0, #320] +ldr q18, [x0, #912] +sqrdmulh v8.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +str q20, [x0, #384] +sub v20.4s, v2.4s, v7.4s +ldr q1, [x0, #976] +sqrdmulh v12.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +str q21, [x0, #448] +add v2.4s, v2.4s, v7.4s +ldr q7, [x0, #784] +sqrdmulh v21.4S, v7.4S, v29.s[0] +mul v7.4S, v7.4S,v30.s[0] +sub v0.4s, v6.4s, v17.4s +add v6.4s, v6.4s, v17.4s +ldr q17, [x0, #848] +sqrdmulh v3.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] +sub v13.4s, 
v14.4s, v4.4s +add v14.4s, v14.4s, v4.4s +mla v18.4S, v8.4S, v31.s[0] +mla v1.4S, v12.4S, v31.s[0] +str q15, [x0, #128] +sub v15.4s, v9.4s, v16.4s +mla v7.4S, v21.4S, v31.s[0] +mla v17.4S, v3.4S, v31.s[0] +str q22, [x0, #192] +add v9.4s, v9.4s, v16.4s +ldr q16, [x0, #528] +sqrdmulh v22.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +str q2, [x0, #0] +sub v2.4s, v10.4s, v19.4s +ldr q3, [x0, #592] +sqrdmulh v21.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +str q20, [x0, #64] +add v10.4s, v10.4s, v19.4s +ldr q19, [x0, #656] +ldr q20, [x0, #400] +sqrdmulh v12.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v8.4s, v20.4s, v18.4s +add v20.4s, v20.4s, v18.4s +ldr q18, [x0, #720] +ldr q4, [x0, #464] +sqrdmulh v5.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +sub v11.4s, v4.4s, v1.4s +add v4.4s, v4.4s, v1.4s +ldr q1, [x0, #272] +mla v16.4S, v22.4S, v31.s[0] +mla v3.4S, v21.4S, v31.s[0] +str q6, [x0, #512] +sub v6.4s, v1.4s, v7.4s +mla v19.4S, v12.4S, v31.s[0] +mla v18.4S, v5.4S, v31.s[0] +str q0, [x0, #576] +add v1.4s, v1.4s, v7.4s +ldr q7, [x0, #336] +sqrdmulh v0.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +str q14, [x0, #640] +sub v14.4s, v7.4s, v17.4s +sqrdmulh v5.4S, v4.4S, v29.s[1] +mul v4.4S, v4.4S,v30.s[1] +str q13, [x0, #704] +add v7.4s, v7.4s, v17.4s +ldr q17, [x0, #16] +sqrdmulh v13.4S, v1.4S, v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v12.4s, v17.4s, v16.4s +add v17.4s, v17.4s, v16.4s +ldr q16, [x0, #80] +sqrdmulh v21.4S, v7.4S, v29.s[1] +mul v7.4S, v7.4S,v30.s[1] +sub v22.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #144] +mla v20.4S, v0.4S, v31.s[0] +mla v4.4S, v5.4S, v31.s[0] +str q9, [x0, #768] +sub v9.4s, v3.4s, v19.4s +mla v1.4S, v13.4S, v31.s[0] +mla v7.4S, v21.4S, v31.s[0] +str q15, [x0, #832] +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #208] +sqrdmulh v15.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +str q10, [x0, #896] +sub v10.4s, v19.4s, v18.4s +sqrdmulh v21.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +str q2, 
[x0, #960] +add v19.4s, v19.4s, v18.4s +sqrdmulh v18.4S, v6.4S, v29.s[2] +mul v6.4S, v6.4S,v30.s[2] +sub v2.4s, v3.4s, v20.4s +add v3.4s, v3.4s, v20.4s +sqrdmulh v20.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v13.4s, v19.4s, v4.4s +add v19.4s, v19.4s, v4.4s +mla v8.4S, v15.4S, v31.s[0] +mla v11.4S, v21.4S, v31.s[0] +sub v21.4s, v17.4s, v1.4s +mla v6.4S, v18.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +add v17.4s, v17.4s, v1.4s +sqrdmulh v1.4S, v2.4S, v27.s[1] +mul v2.4S, v2.4S,v28.s[1] +sub v20.4s, v16.4s, v7.4s +sqrdmulh v18.4S, v13.4S, v27.s[1] +mul v13.4S, v13.4S,v28.s[1] +add v16.4s, v16.4s, v7.4s +sqrdmulh v7.4S, v3.4S, v27.s[0] +mul v3.4S, v3.4S,v28.s[0] +sub v15.4s, v9.4s, v8.4s +add v9.4s, v9.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v27.s[0] +mul v19.4S, v19.4S,v28.s[0] +sub v4.4s, v10.4s, v11.4s +add v10.4s, v10.4s, v11.4s +mla v2.4S, v1.4S, v31.s[0] +mla v13.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v6.4s +mla v3.4S, v7.4S, v31.s[0] +mla v19.4S, v8.4S, v31.s[0] +add v12.4s, v12.4s, v6.4s +sqrdmulh v6.4S, v9.4S, v27.s[2] +mul v9.4S, v9.4S,v28.s[2] +sub v8.4s, v22.4s, v14.4s +sqrdmulh v7.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +add v22.4s, v22.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v1.4s, v21.4s, v2.4s +add v21.4s, v21.4s, v2.4s +sqrdmulh v2.4S, v4.4S, v27.s[3] +mul v4.4S, v4.4S,v28.s[3] +sub v11.4s, v20.4s, v13.4s +add v20.4s, v20.4s, v13.4s +mla v9.4S, v6.4S, v31.s[0] +mla v10.4S, v7.4S, v31.s[0] +sub v7.4s, v17.4s, v3.4s +mla v15.4S, v14.4S, v31.s[0] +mla v4.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v20.4S, v25.s[2] +mul v20.4S, v20.4S,v26.s[2] +sub v2.4s, v16.4s, v19.4s +sqrdmulh v14.4S, v11.4S, v25.s[3] +mul v11.4S, v11.4S,v26.s[3] +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v2.4S, v25.s[1] +mul v2.4S, v2.4S,v26.s[1] +sub v6.4s, v12.4s, v9.4s +add v12.4s, v12.4s, v9.4s +sqrdmulh v9.4S, v16.4S, v25.s[0] +mul v16.4S, v16.4S,v26.s[0] +sub v13.4s, v22.4s, v10.4s +add v22.4s, 
v22.4s, v10.4s +mla v20.4S, v3.4S, v31.s[0] +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v18.4s, v15.4s +mla v2.4S, v19.4S, v31.s[0] +mla v16.4S, v9.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v22.4S, v23.s[0] +mul v22.4S, v22.4S,v24.s[0] +sub v9.4s, v8.4s, v4.4s +sqrdmulh v19.4S, v13.4S, v23.s[1] +mul v13.4S, v13.4S,v24.s[1] +add v8.4s, v8.4s, v4.4s +sqrdmulh v4.4S, v8.4S, v23.s[2] +mul v8.4S, v8.4S,v24.s[2] +sub v3.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v9.4S, v23.s[3] +mul v9.4S, v9.4S,v24.s[3] +sub v10.4s, v1.4s, v11.4s +add v1.4s, v1.4s, v11.4s +mla v22.4S, v15.4S, v31.s[0] +mla v13.4S, v19.4S, v31.s[0] +sub v19.4s, v7.4s, v2.4s +str q21, [x0, #272] +mla v8.4S, v4.4S, v31.s[0] +mla v9.4S, v20.4S, v31.s[0] +add v7.4s, v7.4s, v2.4s +str q3, [x0, #336] +str q1, [x0, #400] +sub v1.4s, v17.4s, v16.4s +str q10, [x0, #464] +add v17.4s, v17.4s, v16.4s +sub v16.4s, v12.4s, v22.4s +add v12.4s, v12.4s, v22.4s +sub v22.4s, v6.4s, v13.4s +add v6.4s, v6.4s, v13.4s +str q7, [x0, #144] +sub v7.4s, v18.4s, v8.4s +str q19, [x0, #208] +add v18.4s, v18.4s, v8.4s +str q17, [x0, #16] +sub v17.4s, v14.4s, v9.4s +str q1, [x0, #80] +add v14.4s, v14.4s, v9.4s +str q12, [x0, #528] +str q16, [x0, #592] +str q6, [x0, #656] +str q22, [x0, #720] +str q18, [x0, #784] +str q7, [x0, #848] +str q14, [x0, #912] +str q17, [x0, #976] +ldr q0, [x17, #+128] +ldr q5, [x17, #+144] +ldr q11, [x17, #+160] +ldr q15, [x17, #+176] +ldr q21, [x17, #+192] +ldr q4, [x17, #+208] +ldr q20, [x17, #+224] +ldr q2, [x17, #+240] +ldr q3, [x0, #32] +ldr q30, [x0, #48] +ldr q29, [x0, #0] +ldr q28, [x0, #16] +sqrdmulh v27.4S, v3.4S, v5.s[0] +mul v3.4S, v3.4S,v0.s[0] +mla v3.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v3.4s +add v29.4s, v29.4s, v3.4s +sqrdmulh v3.4S, v30.4S, v5.s[0] +mul v30.4S, v30.4S,v0.s[0] +mla v30.4S, v3.4S, v31.s[0] +sub v3.4s, v28.4s, v30.4s +add v28.4s, v28.4s, v30.4s +sqrdmulh v30.4S, v28.4S, v5.s[1] +mul v28.4S, v28.4S,v0.s[1] +mla v28.4S, v30.4S, v31.s[0] 
+sub v30.4s, v29.4s, v28.4s +add v29.4s, v29.4s, v28.4s +sqrdmulh v28.4S, v3.4S, v5.s[2] +mul v3.4S, v3.4S,v0.s[2] +mla v3.4S, v28.4S, v31.s[0] +sub v28.4s, v27.4s, v3.4s +add v27.4s, v27.4s, v3.4s +trn1 v3.4S, v29.4S, v30.4S +trn2 v26.4S, v29.4S, v30.4S +trn1 v25.4S, v27.4S, v28.4S +trn2 v24.4S, v27.4S, v28.4S +trn2 v27.2D, v3.2D, v25.2D +trn2 v28.2D, v26.2D, v24.2D +trn1 v29.2D, v3.2D, v25.2D +trn1 v30.2D, v26.2D, v24.2D +sqrdmulh v24.4S, v27.4S, v15.4S +mul v27.4S, v27.4S,v11.4S +mla v27.4S, v24.4S, v31.s[0] +sub v24.4s, v29.4s, v27.4s +add v29.4s, v29.4s, v27.4s +sqrdmulh v27.4S, v28.4S, v15.4S +mul v28.4S, v28.4S,v11.4S +mla v28.4S, v27.4S, v31.s[0] +sub v27.4s, v30.4s, v28.4s +add v30.4s, v30.4s, v28.4s +sqrdmulh v28.4S, v30.4S, v4.4S +mul v30.4S, v30.4S,v21.4S +mla v30.4S, v28.4S, v31.s[0] +sub v28.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +sqrdmulh v30.4S, v27.4S, v2.4S +mul v27.4S, v27.4S,v20.4S +mla v27.4S, v30.4S, v31.s[0] +sub v30.4s, v24.4s, v27.4s +add v24.4s, v24.4s, v27.4s +str q29, [x0, #0] +str q28, [x0, #16] +str q24, [x0, #32] +str q30, [x0, #48] +ldr q30, [x17, #+256] +ldr q24, [x17, #+272] +ldr q28, [x17, #+288] +ldr q29, [x17, #+304] +ldr q27, [x17, #+320] +ldr q26, [x17, #+336] +ldr q25, [x17, #+352] +ldr q3, [x17, #+368] +ldr q2, [x0, #96] +ldr q20, [x0, #112] +ldr q4, [x0, #64] +ldr q21, [x0, #80] +sqrdmulh v15.4S, v2.4S, v24.s[0] +mul v2.4S, v2.4S,v30.s[0] +mla v2.4S, v15.4S, v31.s[0] +sub v15.4s, v4.4s, v2.4s +add v4.4s, v4.4s, v2.4s +sqrdmulh v2.4S, v20.4S, v24.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v20.4S, v2.4S, v31.s[0] +sub v2.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v21.4S, v24.s[1] +mul v21.4S, v21.4S,v30.s[1] +mla v21.4S, v20.4S, v31.s[0] +sub v20.4s, v4.4s, v21.4s +add v4.4s, v4.4s, v21.4s +sqrdmulh v21.4S, v2.4S, v24.s[2] +mul v2.4S, v2.4S,v30.s[2] +mla v2.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v2.4s +add v15.4s, v15.4s, v2.4s +trn1 v2.4S, v4.4S, v20.4S +trn2 v11.4S, v4.4S, v20.4S +trn1 v5.4S, 
v15.4S, v21.4S +trn2 v0.4S, v15.4S, v21.4S +trn2 v15.2D, v2.2D, v5.2D +trn2 v21.2D, v11.2D, v0.2D +trn1 v4.2D, v2.2D, v5.2D +trn1 v20.2D, v11.2D, v0.2D +sqrdmulh v0.4S, v15.4S, v29.4S +mul v15.4S, v15.4S,v28.4S +mla v15.4S, v0.4S, v31.s[0] +sub v0.4s, v4.4s, v15.4s +add v4.4s, v4.4s, v15.4s +sqrdmulh v15.4S, v21.4S, v29.4S +mul v21.4S, v21.4S,v28.4S +mla v21.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v20.4S, v26.4S +mul v20.4S, v20.4S,v27.4S +mla v20.4S, v21.4S, v31.s[0] +sub v21.4s, v4.4s, v20.4s +add v4.4s, v4.4s, v20.4s +sqrdmulh v20.4S, v15.4S, v3.4S +mul v15.4S, v15.4S,v25.4S +mla v15.4S, v20.4S, v31.s[0] +sub v20.4s, v0.4s, v15.4s +add v0.4s, v0.4s, v15.4s +str q4, [x0, #64] +str q21, [x0, #80] +str q0, [x0, #96] +str q20, [x0, #112] +ldr q20, [x17, #+384] +ldr q0, [x17, #+400] +ldr q21, [x17, #+416] +ldr q4, [x17, #+432] +ldr q15, [x17, #+448] +ldr q11, [x17, #+464] +ldr q5, [x17, #+480] +ldr q2, [x17, #+496] +ldr q3, [x0, #160] +ldr q25, [x0, #176] +ldr q26, [x0, #128] +ldr q27, [x0, #144] +sqrdmulh v29.4S, v3.4S, v0.s[0] +mul v3.4S, v3.4S,v20.s[0] +mla v3.4S, v29.4S, v31.s[0] +sub v29.4s, v26.4s, v3.4s +add v26.4s, v26.4s, v3.4s +sqrdmulh v3.4S, v25.4S, v0.s[0] +mul v25.4S, v25.4S,v20.s[0] +mla v25.4S, v3.4S, v31.s[0] +sub v3.4s, v27.4s, v25.4s +add v27.4s, v27.4s, v25.4s +sqrdmulh v25.4S, v27.4S, v0.s[1] +mul v27.4S, v27.4S,v20.s[1] +mla v27.4S, v25.4S, v31.s[0] +sub v25.4s, v26.4s, v27.4s +add v26.4s, v26.4s, v27.4s +sqrdmulh v27.4S, v3.4S, v0.s[2] +mul v3.4S, v3.4S,v20.s[2] +mla v3.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v3.4s +add v29.4s, v29.4s, v3.4s +trn1 v3.4S, v26.4S, v25.4S +trn2 v28.4S, v26.4S, v25.4S +trn1 v24.4S, v29.4S, v27.4S +trn2 v30.4S, v29.4S, v27.4S +trn2 v29.2D, v3.2D, v24.2D +trn2 v27.2D, v28.2D, v30.2D +trn1 v26.2D, v3.2D, v24.2D +trn1 v25.2D, v28.2D, v30.2D +sqrdmulh v30.4S, v29.4S, v4.4S +mul v29.4S, v29.4S,v21.4S +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v26.4s, v29.4s +add 
v26.4s, v26.4s, v29.4s +sqrdmulh v29.4S, v27.4S, v4.4S +mul v27.4S, v27.4S,v21.4S +mla v27.4S, v29.4S, v31.s[0] +sub v29.4s, v25.4s, v27.4s +add v25.4s, v25.4s, v27.4s +sqrdmulh v27.4S, v25.4S, v11.4S +mul v25.4S, v25.4S,v15.4S +mla v25.4S, v27.4S, v31.s[0] +sub v27.4s, v26.4s, v25.4s +add v26.4s, v26.4s, v25.4s +sqrdmulh v25.4S, v29.4S, v2.4S +mul v29.4S, v29.4S,v5.4S +mla v29.4S, v25.4S, v31.s[0] +sub v25.4s, v30.4s, v29.4s +add v30.4s, v30.4s, v29.4s +str q26, [x0, #128] +str q27, [x0, #144] +str q30, [x0, #160] +str q25, [x0, #176] +ldr q25, [x17, #+512] +ldr q30, [x17, #+528] +ldr q27, [x17, #+544] +ldr q26, [x17, #+560] +ldr q29, [x17, #+576] +ldr q28, [x17, #+592] +ldr q24, [x17, #+608] +ldr q3, [x17, #+624] +ldr q2, [x0, #224] +ldr q5, [x0, #240] +ldr q11, [x0, #192] +ldr q15, [x0, #208] +sqrdmulh v4.4S, v2.4S, v30.s[0] +mul v2.4S, v2.4S,v25.s[0] +mla v2.4S, v4.4S, v31.s[0] +sub v4.4s, v11.4s, v2.4s +add v11.4s, v11.4s, v2.4s +sqrdmulh v2.4S, v5.4S, v30.s[0] +mul v5.4S, v5.4S,v25.s[0] +mla v5.4S, v2.4S, v31.s[0] +sub v2.4s, v15.4s, v5.4s +add v15.4s, v15.4s, v5.4s +sqrdmulh v5.4S, v15.4S, v30.s[1] +mul v15.4S, v15.4S,v25.s[1] +mla v15.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v30.s[2] +mul v2.4S, v2.4S,v25.s[2] +mla v2.4S, v15.4S, v31.s[0] +sub v15.4s, v4.4s, v2.4s +add v4.4s, v4.4s, v2.4s +trn1 v2.4S, v11.4S, v5.4S +trn2 v21.4S, v11.4S, v5.4S +trn1 v0.4S, v4.4S, v15.4S +trn2 v20.4S, v4.4S, v15.4S +trn2 v4.2D, v2.2D, v0.2D +trn2 v15.2D, v21.2D, v20.2D +trn1 v11.2D, v2.2D, v0.2D +trn1 v5.2D, v21.2D, v20.2D +sqrdmulh v20.4S, v4.4S, v26.4S +mul v4.4S, v4.4S,v27.4S +mla v4.4S, v20.4S, v31.s[0] +sub v20.4s, v11.4s, v4.4s +add v11.4s, v11.4s, v4.4s +sqrdmulh v4.4S, v15.4S, v26.4S +mul v15.4S, v15.4S,v27.4S +mla v15.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v15.4s +add v5.4s, v5.4s, v15.4s +sqrdmulh v15.4S, v5.4S, v28.4S +mul v5.4S, v5.4S,v29.4S +mla v5.4S, v15.4S, v31.s[0] +sub v15.4s, v11.4s, v5.4s +add 
v11.4s, v11.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v3.4S +mul v4.4S, v4.4S,v24.4S +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v20.4s, v4.4s +add v20.4s, v20.4s, v4.4s +str q11, [x0, #192] +str q15, [x0, #208] +str q20, [x0, #224] +str q5, [x0, #240] +ldr q5, [x17, #+640] +ldr q20, [x17, #+656] +ldr q15, [x17, #+672] +ldr q11, [x17, #+688] +ldr q4, [x17, #+704] +ldr q21, [x17, #+720] +ldr q0, [x17, #+736] +ldr q2, [x17, #+752] +ldr q3, [x0, #288] +ldr q24, [x0, #304] +ldr q28, [x0, #256] +ldr q29, [x0, #272] +sqrdmulh v26.4S, v3.4S, v20.s[0] +mul v3.4S, v3.4S,v5.s[0] +mla v3.4S, v26.4S, v31.s[0] +sub v26.4s, v28.4s, v3.4s +add v28.4s, v28.4s, v3.4s +sqrdmulh v3.4S, v24.4S, v20.s[0] +mul v24.4S, v24.4S,v5.s[0] +mla v24.4S, v3.4S, v31.s[0] +sub v3.4s, v29.4s, v24.4s +add v29.4s, v29.4s, v24.4s +sqrdmulh v24.4S, v29.4S, v20.s[1] +mul v29.4S, v29.4S,v5.s[1] +mla v29.4S, v24.4S, v31.s[0] +sub v24.4s, v28.4s, v29.4s +add v28.4s, v28.4s, v29.4s +sqrdmulh v29.4S, v3.4S, v20.s[2] +mul v3.4S, v3.4S,v5.s[2] +mla v3.4S, v29.4S, v31.s[0] +sub v29.4s, v26.4s, v3.4s +add v26.4s, v26.4s, v3.4s +trn1 v3.4S, v28.4S, v24.4S +trn2 v27.4S, v28.4S, v24.4S +trn1 v30.4S, v26.4S, v29.4S +trn2 v25.4S, v26.4S, v29.4S +trn2 v26.2D, v3.2D, v30.2D +trn2 v29.2D, v27.2D, v25.2D +trn1 v28.2D, v3.2D, v30.2D +trn1 v24.2D, v27.2D, v25.2D +sqrdmulh v25.4S, v26.4S, v11.4S +mul v26.4S, v26.4S,v15.4S +mla v26.4S, v25.4S, v31.s[0] +sub v25.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sqrdmulh v26.4S, v29.4S, v11.4S +mul v29.4S, v29.4S,v15.4S +mla v29.4S, v26.4S, v31.s[0] +sub v26.4s, v24.4s, v29.4s +add v24.4s, v24.4s, v29.4s +sqrdmulh v29.4S, v24.4S, v21.4S +mul v24.4S, v24.4S,v4.4S +mla v24.4S, v29.4S, v31.s[0] +sub v29.4s, v28.4s, v24.4s +add v28.4s, v28.4s, v24.4s +sqrdmulh v24.4S, v26.4S, v2.4S +mul v26.4S, v26.4S,v0.4S +mla v26.4S, v24.4S, v31.s[0] +sub v24.4s, v25.4s, v26.4s +add v25.4s, v25.4s, v26.4s +str q28, [x0, #256] +str q29, [x0, #272] +str q25, [x0, #288] +str q24, [x0, #304] +ldr q24, [x17, 
#+768] +ldr q25, [x17, #+784] +ldr q29, [x17, #+800] +ldr q28, [x17, #+816] +ldr q26, [x17, #+832] +ldr q27, [x17, #+848] +ldr q30, [x17, #+864] +ldr q3, [x17, #+880] +ldr q2, [x0, #352] +ldr q0, [x0, #368] +ldr q21, [x0, #320] +ldr q4, [x0, #336] +sqrdmulh v11.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v24.s[0] +mla v2.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v2.4s +add v21.4s, v21.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v25.s[0] +mul v0.4S, v0.4S,v24.s[0] +mla v0.4S, v2.4S, v31.s[0] +sub v2.4s, v4.4s, v0.4s +add v4.4s, v4.4s, v0.4s +sqrdmulh v0.4S, v4.4S, v25.s[1] +mul v4.4S, v4.4S,v24.s[1] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v21.4s, v4.4s +add v21.4s, v21.4s, v4.4s +sqrdmulh v4.4S, v2.4S, v25.s[2] +mul v2.4S, v2.4S,v24.s[2] +mla v2.4S, v4.4S, v31.s[0] +sub v4.4s, v11.4s, v2.4s +add v11.4s, v11.4s, v2.4s +trn1 v2.4S, v21.4S, v0.4S +trn2 v15.4S, v21.4S, v0.4S +trn1 v20.4S, v11.4S, v4.4S +trn2 v5.4S, v11.4S, v4.4S +trn2 v11.2D, v2.2D, v20.2D +trn2 v4.2D, v15.2D, v5.2D +trn1 v21.2D, v2.2D, v20.2D +trn1 v0.2D, v15.2D, v5.2D +sqrdmulh v5.4S, v11.4S, v28.4S +mul v11.4S, v11.4S,v29.4S +mla v11.4S, v5.4S, v31.s[0] +sub v5.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v4.4S, v28.4S +mul v4.4S, v4.4S,v29.4S +mla v4.4S, v11.4S, v31.s[0] +sub v11.4s, v0.4s, v4.4s +add v0.4s, v0.4s, v4.4s +sqrdmulh v4.4S, v0.4S, v27.4S +mul v0.4S, v0.4S,v26.4S +mla v0.4S, v4.4S, v31.s[0] +sub v4.4s, v21.4s, v0.4s +add v21.4s, v21.4s, v0.4s +sqrdmulh v0.4S, v11.4S, v3.4S +mul v11.4S, v11.4S,v30.4S +mla v11.4S, v0.4S, v31.s[0] +sub v0.4s, v5.4s, v11.4s +add v5.4s, v5.4s, v11.4s +str q21, [x0, #320] +str q4, [x0, #336] +str q5, [x0, #352] +str q0, [x0, #368] +ldr q0, [x17, #+896] +ldr q5, [x17, #+912] +ldr q4, [x17, #+928] +ldr q21, [x17, #+944] +ldr q11, [x17, #+960] +ldr q15, [x17, #+976] +ldr q20, [x17, #+992] +ldr q2, [x17, #+1008] +ldr q3, [x0, #416] +ldr q30, [x0, #432] +ldr q27, [x0, #384] +ldr q26, [x0, #400] +sqrdmulh v28.4S, v3.4S, v5.s[0] +mul v3.4S, v3.4S,v0.s[0] +mla 
v3.4S, v28.4S, v31.s[0] +sub v28.4s, v27.4s, v3.4s +add v27.4s, v27.4s, v3.4s +sqrdmulh v3.4S, v30.4S, v5.s[0] +mul v30.4S, v30.4S,v0.s[0] +mla v30.4S, v3.4S, v31.s[0] +sub v3.4s, v26.4s, v30.4s +add v26.4s, v26.4s, v30.4s +sqrdmulh v30.4S, v26.4S, v5.s[1] +mul v26.4S, v26.4S,v0.s[1] +mla v26.4S, v30.4S, v31.s[0] +sub v30.4s, v27.4s, v26.4s +add v27.4s, v27.4s, v26.4s +sqrdmulh v26.4S, v3.4S, v5.s[2] +mul v3.4S, v3.4S,v0.s[2] +mla v3.4S, v26.4S, v31.s[0] +sub v26.4s, v28.4s, v3.4s +add v28.4s, v28.4s, v3.4s +trn1 v3.4S, v27.4S, v30.4S +trn2 v29.4S, v27.4S, v30.4S +trn1 v25.4S, v28.4S, v26.4S +trn2 v24.4S, v28.4S, v26.4S +trn2 v28.2D, v3.2D, v25.2D +trn2 v26.2D, v29.2D, v24.2D +trn1 v27.2D, v3.2D, v25.2D +trn1 v30.2D, v29.2D, v24.2D +sqrdmulh v24.4S, v28.4S, v21.4S +mul v28.4S, v28.4S,v4.4S +mla v28.4S, v24.4S, v31.s[0] +sub v24.4s, v27.4s, v28.4s +add v27.4s, v27.4s, v28.4s +sqrdmulh v28.4S, v26.4S, v21.4S +mul v26.4S, v26.4S,v4.4S +mla v26.4S, v28.4S, v31.s[0] +sub v28.4s, v30.4s, v26.4s +add v30.4s, v30.4s, v26.4s +sqrdmulh v26.4S, v30.4S, v15.4S +mul v30.4S, v30.4S,v11.4S +mla v30.4S, v26.4S, v31.s[0] +sub v26.4s, v27.4s, v30.4s +add v27.4s, v27.4s, v30.4s +sqrdmulh v30.4S, v28.4S, v2.4S +mul v28.4S, v28.4S,v20.4S +mla v28.4S, v30.4S, v31.s[0] +sub v30.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +str q27, [x0, #384] +str q26, [x0, #400] +str q24, [x0, #416] +str q30, [x0, #432] +ldr q30, [x17, #+1024] +ldr q24, [x17, #+1040] +ldr q26, [x17, #+1056] +ldr q27, [x17, #+1072] +ldr q28, [x17, #+1088] +ldr q29, [x17, #+1104] +ldr q25, [x17, #+1120] +ldr q3, [x17, #+1136] +ldr q2, [x0, #480] +ldr q20, [x0, #496] +ldr q15, [x0, #448] +ldr q11, [x0, #464] +sqrdmulh v21.4S, v2.4S, v24.s[0] +mul v2.4S, v2.4S,v30.s[0] +mla v2.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v2.4s +add v15.4s, v15.4s, v2.4s +sqrdmulh v2.4S, v20.4S, v24.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v20.4S, v2.4S, v31.s[0] +sub v2.4s, v11.4s, v20.4s +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v11.4S, 
v24.s[1] +mul v11.4S, v11.4S,v30.s[1] +mla v11.4S, v20.4S, v31.s[0] +sub v20.4s, v15.4s, v11.4s +add v15.4s, v15.4s, v11.4s +sqrdmulh v11.4S, v2.4S, v24.s[2] +mul v2.4S, v2.4S,v30.s[2] +mla v2.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v2.4s +add v21.4s, v21.4s, v2.4s +trn1 v2.4S, v15.4S, v20.4S +trn2 v4.4S, v15.4S, v20.4S +trn1 v5.4S, v21.4S, v11.4S +trn2 v0.4S, v21.4S, v11.4S +trn2 v21.2D, v2.2D, v5.2D +trn2 v11.2D, v4.2D, v0.2D +trn1 v15.2D, v2.2D, v5.2D +trn1 v20.2D, v4.2D, v0.2D +sqrdmulh v0.4S, v21.4S, v27.4S +mul v21.4S, v21.4S,v26.4S +mla v21.4S, v0.4S, v31.s[0] +sub v0.4s, v15.4s, v21.4s +add v15.4s, v15.4s, v21.4s +sqrdmulh v21.4S, v11.4S, v27.4S +mul v11.4S, v11.4S,v26.4S +mla v11.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +sqrdmulh v11.4S, v20.4S, v29.4S +mul v20.4S, v20.4S,v28.4S +mla v20.4S, v11.4S, v31.s[0] +sub v11.4s, v15.4s, v20.4s +add v15.4s, v15.4s, v20.4s +sqrdmulh v20.4S, v21.4S, v3.4S +mul v21.4S, v21.4S,v25.4S +mla v21.4S, v20.4S, v31.s[0] +sub v20.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +str q15, [x0, #448] +str q11, [x0, #464] +str q0, [x0, #480] +str q20, [x0, #496] +ldr q20, [x17, #+1152] +ldr q0, [x17, #+1168] +ldr q11, [x17, #+1184] +ldr q15, [x17, #+1200] +ldr q21, [x17, #+1216] +ldr q4, [x17, #+1232] +ldr q5, [x17, #+1248] +ldr q2, [x17, #+1264] +ldr q3, [x0, #544] +ldr q25, [x0, #560] +ldr q29, [x0, #512] +ldr q28, [x0, #528] +sqrdmulh v27.4S, v3.4S, v0.s[0] +mul v3.4S, v3.4S,v20.s[0] +mla v3.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v3.4s +add v29.4s, v29.4s, v3.4s +sqrdmulh v3.4S, v25.4S, v0.s[0] +mul v25.4S, v25.4S,v20.s[0] +mla v25.4S, v3.4S, v31.s[0] +sub v3.4s, v28.4s, v25.4s +add v28.4s, v28.4s, v25.4s +sqrdmulh v25.4S, v28.4S, v0.s[1] +mul v28.4S, v28.4S,v20.s[1] +mla v28.4S, v25.4S, v31.s[0] +sub v25.4s, v29.4s, v28.4s +add v29.4s, v29.4s, v28.4s +sqrdmulh v28.4S, v3.4S, v0.s[2] +mul v3.4S, v3.4S,v20.s[2] +mla v3.4S, v28.4S, v31.s[0] +sub v28.4s, v27.4s, v3.4s +add v27.4s, v27.4s, v3.4s 
+trn1 v3.4S, v29.4S, v25.4S +trn2 v26.4S, v29.4S, v25.4S +trn1 v24.4S, v27.4S, v28.4S +trn2 v30.4S, v27.4S, v28.4S +trn2 v27.2D, v3.2D, v24.2D +trn2 v28.2D, v26.2D, v30.2D +trn1 v29.2D, v3.2D, v24.2D +trn1 v25.2D, v26.2D, v30.2D +sqrdmulh v30.4S, v27.4S, v15.4S +mul v27.4S, v27.4S,v11.4S +mla v27.4S, v30.4S, v31.s[0] +sub v30.4s, v29.4s, v27.4s +add v29.4s, v29.4s, v27.4s +sqrdmulh v27.4S, v28.4S, v15.4S +mul v28.4S, v28.4S,v11.4S +mla v28.4S, v27.4S, v31.s[0] +sub v27.4s, v25.4s, v28.4s +add v25.4s, v25.4s, v28.4s +sqrdmulh v28.4S, v25.4S, v4.4S +mul v25.4S, v25.4S,v21.4S +mla v25.4S, v28.4S, v31.s[0] +sub v28.4s, v29.4s, v25.4s +add v29.4s, v29.4s, v25.4s +sqrdmulh v25.4S, v27.4S, v2.4S +mul v27.4S, v27.4S,v5.4S +mla v27.4S, v25.4S, v31.s[0] +sub v25.4s, v30.4s, v27.4s +add v30.4s, v30.4s, v27.4s +str q29, [x0, #512] +str q28, [x0, #528] +str q30, [x0, #544] +str q25, [x0, #560] +ldr q25, [x17, #+1280] +ldr q30, [x17, #+1296] +ldr q28, [x17, #+1312] +ldr q29, [x17, #+1328] +ldr q27, [x17, #+1344] +ldr q26, [x17, #+1360] +ldr q24, [x17, #+1376] +ldr q3, [x17, #+1392] +ldr q2, [x0, #608] +ldr q5, [x0, #624] +ldr q4, [x0, #576] +ldr q21, [x0, #592] +sqrdmulh v15.4S, v2.4S, v30.s[0] +mul v2.4S, v2.4S,v25.s[0] +mla v2.4S, v15.4S, v31.s[0] +sub v15.4s, v4.4s, v2.4s +add v4.4s, v4.4s, v2.4s +sqrdmulh v2.4S, v5.4S, v30.s[0] +mul v5.4S, v5.4S,v25.s[0] +mla v5.4S, v2.4S, v31.s[0] +sub v2.4s, v21.4s, v5.4s +add v21.4s, v21.4s, v5.4s +sqrdmulh v5.4S, v21.4S, v30.s[1] +mul v21.4S, v21.4S,v25.s[1] +mla v21.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v21.4s +add v4.4s, v4.4s, v21.4s +sqrdmulh v21.4S, v2.4S, v30.s[2] +mul v2.4S, v2.4S,v25.s[2] +mla v2.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v2.4s +add v15.4s, v15.4s, v2.4s +trn1 v2.4S, v4.4S, v5.4S +trn2 v11.4S, v4.4S, v5.4S +trn1 v0.4S, v15.4S, v21.4S +trn2 v20.4S, v15.4S, v21.4S +trn2 v15.2D, v2.2D, v0.2D +trn2 v21.2D, v11.2D, v20.2D +trn1 v4.2D, v2.2D, v0.2D +trn1 v5.2D, v11.2D, v20.2D +sqrdmulh v20.4S, v15.4S, v29.4S +mul 
v15.4S, v15.4S,v28.4S +mla v15.4S, v20.4S, v31.s[0] +sub v20.4s, v4.4s, v15.4s +add v4.4s, v4.4s, v15.4s +sqrdmulh v15.4S, v21.4S, v29.4S +mul v21.4S, v21.4S,v28.4S +mla v21.4S, v15.4S, v31.s[0] +sub v15.4s, v5.4s, v21.4s +add v5.4s, v5.4s, v21.4s +sqrdmulh v21.4S, v5.4S, v26.4S +mul v5.4S, v5.4S,v27.4S +mla v5.4S, v21.4S, v31.s[0] +sub v21.4s, v4.4s, v5.4s +add v4.4s, v4.4s, v5.4s +sqrdmulh v5.4S, v15.4S, v3.4S +mul v15.4S, v15.4S,v24.4S +mla v15.4S, v5.4S, v31.s[0] +sub v5.4s, v20.4s, v15.4s +add v20.4s, v20.4s, v15.4s +str q4, [x0, #576] +str q21, [x0, #592] +str q20, [x0, #608] +str q5, [x0, #624] +ldr q5, [x17, #+1408] +ldr q20, [x17, #+1424] +ldr q21, [x17, #+1440] +ldr q4, [x17, #+1456] +ldr q15, [x17, #+1472] +ldr q11, [x17, #+1488] +ldr q0, [x17, #+1504] +ldr q2, [x17, #+1520] +ldr q3, [x0, #672] +ldr q24, [x0, #688] +ldr q26, [x0, #640] +ldr q27, [x0, #656] +sqrdmulh v29.4S, v3.4S, v20.s[0] +mul v3.4S, v3.4S,v5.s[0] +mla v3.4S, v29.4S, v31.s[0] +sub v29.4s, v26.4s, v3.4s +add v26.4s, v26.4s, v3.4s +sqrdmulh v3.4S, v24.4S, v20.s[0] +mul v24.4S, v24.4S,v5.s[0] +mla v24.4S, v3.4S, v31.s[0] +sub v3.4s, v27.4s, v24.4s +add v27.4s, v27.4s, v24.4s +sqrdmulh v24.4S, v27.4S, v20.s[1] +mul v27.4S, v27.4S,v5.s[1] +mla v27.4S, v24.4S, v31.s[0] +sub v24.4s, v26.4s, v27.4s +add v26.4s, v26.4s, v27.4s +sqrdmulh v27.4S, v3.4S, v20.s[2] +mul v3.4S, v3.4S,v5.s[2] +mla v3.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v3.4s +add v29.4s, v29.4s, v3.4s +trn1 v3.4S, v26.4S, v24.4S +trn2 v28.4S, v26.4S, v24.4S +trn1 v30.4S, v29.4S, v27.4S +trn2 v25.4S, v29.4S, v27.4S +trn2 v29.2D, v3.2D, v30.2D +trn2 v27.2D, v28.2D, v25.2D +trn1 v26.2D, v3.2D, v30.2D +trn1 v24.2D, v28.2D, v25.2D +sqrdmulh v25.4S, v29.4S, v4.4S +mul v29.4S, v29.4S,v21.4S +mla v29.4S, v25.4S, v31.s[0] +sub v25.4s, v26.4s, v29.4s +add v26.4s, v26.4s, v29.4s +sqrdmulh v29.4S, v27.4S, v4.4S +mul v27.4S, v27.4S,v21.4S +mla v27.4S, v29.4S, v31.s[0] +sub v29.4s, v24.4s, v27.4s +add v24.4s, v24.4s, v27.4s +sqrdmulh v27.4S, 
v24.4S, v11.4S +mul v24.4S, v24.4S,v15.4S +mla v24.4S, v27.4S, v31.s[0] +sub v27.4s, v26.4s, v24.4s +add v26.4s, v26.4s, v24.4s +sqrdmulh v24.4S, v29.4S, v2.4S +mul v29.4S, v29.4S,v0.4S +mla v29.4S, v24.4S, v31.s[0] +sub v24.4s, v25.4s, v29.4s +add v25.4s, v25.4s, v29.4s +str q26, [x0, #640] +str q27, [x0, #656] +str q25, [x0, #672] +str q24, [x0, #688] +ldr q24, [x17, #+1536] +ldr q25, [x17, #+1552] +ldr q27, [x17, #+1568] +ldr q26, [x17, #+1584] +ldr q29, [x17, #+1600] +ldr q28, [x17, #+1616] +ldr q30, [x17, #+1632] +ldr q3, [x17, #+1648] +ldr q2, [x0, #736] +ldr q0, [x0, #752] +ldr q11, [x0, #704] +ldr q15, [x0, #720] +sqrdmulh v4.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v24.s[0] +mla v2.4S, v4.4S, v31.s[0] +sub v4.4s, v11.4s, v2.4s +add v11.4s, v11.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v25.s[0] +mul v0.4S, v0.4S,v24.s[0] +mla v0.4S, v2.4S, v31.s[0] +sub v2.4s, v15.4s, v0.4s +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v15.4S, v25.s[1] +mul v15.4S, v15.4S,v24.s[1] +mla v15.4S, v0.4S, v31.s[0] +sub v0.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v25.s[2] +mul v2.4S, v2.4S,v24.s[2] +mla v2.4S, v15.4S, v31.s[0] +sub v15.4s, v4.4s, v2.4s +add v4.4s, v4.4s, v2.4s +trn1 v2.4S, v11.4S, v0.4S +trn2 v21.4S, v11.4S, v0.4S +trn1 v20.4S, v4.4S, v15.4S +trn2 v5.4S, v4.4S, v15.4S +trn2 v4.2D, v2.2D, v20.2D +trn2 v15.2D, v21.2D, v5.2D +trn1 v11.2D, v2.2D, v20.2D +trn1 v0.2D, v21.2D, v5.2D +sqrdmulh v5.4S, v4.4S, v26.4S +mul v4.4S, v4.4S,v27.4S +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v4.4s +add v11.4s, v11.4s, v4.4s +sqrdmulh v4.4S, v15.4S, v26.4S +mul v15.4S, v15.4S,v27.4S +mla v15.4S, v4.4S, v31.s[0] +sub v4.4s, v0.4s, v15.4s +add v0.4s, v0.4s, v15.4s +sqrdmulh v15.4S, v0.4S, v28.4S +mul v0.4S, v0.4S,v29.4S +mla v0.4S, v15.4S, v31.s[0] +sub v15.4s, v11.4s, v0.4s +add v11.4s, v11.4s, v0.4s +sqrdmulh v0.4S, v4.4S, v3.4S +mul v4.4S, v4.4S,v30.4S +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v5.4s, v4.4s +add v5.4s, v5.4s, v4.4s +str q11, [x0, #704] +str q15, 
[x0, #720] +str q5, [x0, #736] +str q0, [x0, #752] +ldr q0, [x17, #+1664] +ldr q5, [x17, #+1680] +ldr q15, [x17, #+1696] +ldr q11, [x17, #+1712] +ldr q4, [x17, #+1728] +ldr q21, [x17, #+1744] +ldr q20, [x17, #+1760] +ldr q2, [x17, #+1776] +ldr q3, [x0, #800] +ldr q30, [x0, #816] +ldr q28, [x0, #768] +ldr q29, [x0, #784] +sqrdmulh v26.4S, v3.4S, v5.s[0] +mul v3.4S, v3.4S,v0.s[0] +mla v3.4S, v26.4S, v31.s[0] +sub v26.4s, v28.4s, v3.4s +add v28.4s, v28.4s, v3.4s +sqrdmulh v3.4S, v30.4S, v5.s[0] +mul v30.4S, v30.4S,v0.s[0] +mla v30.4S, v3.4S, v31.s[0] +sub v3.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +sqrdmulh v30.4S, v29.4S, v5.s[1] +mul v29.4S, v29.4S,v0.s[1] +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v28.4s, v29.4s +add v28.4s, v28.4s, v29.4s +sqrdmulh v29.4S, v3.4S, v5.s[2] +mul v3.4S, v3.4S,v0.s[2] +mla v3.4S, v29.4S, v31.s[0] +sub v29.4s, v26.4s, v3.4s +add v26.4s, v26.4s, v3.4s +trn1 v3.4S, v28.4S, v30.4S +trn2 v27.4S, v28.4S, v30.4S +trn1 v25.4S, v26.4S, v29.4S +trn2 v24.4S, v26.4S, v29.4S +trn2 v26.2D, v3.2D, v25.2D +trn2 v29.2D, v27.2D, v24.2D +trn1 v28.2D, v3.2D, v25.2D +trn1 v30.2D, v27.2D, v24.2D +sqrdmulh v24.4S, v26.4S, v11.4S +mul v26.4S, v26.4S,v15.4S +mla v26.4S, v24.4S, v31.s[0] +sub v24.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sqrdmulh v26.4S, v29.4S, v11.4S +mul v29.4S, v29.4S,v15.4S +mla v29.4S, v26.4S, v31.s[0] +sub v26.4s, v30.4s, v29.4s +add v30.4s, v30.4s, v29.4s +sqrdmulh v29.4S, v30.4S, v21.4S +mul v30.4S, v30.4S,v4.4S +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v28.4s, v30.4s +add v28.4s, v28.4s, v30.4s +sqrdmulh v30.4S, v26.4S, v2.4S +mul v26.4S, v26.4S,v20.4S +mla v26.4S, v30.4S, v31.s[0] +sub v30.4s, v24.4s, v26.4s +add v24.4s, v24.4s, v26.4s +str q28, [x0, #768] +str q29, [x0, #784] +str q24, [x0, #800] +str q30, [x0, #816] +ldr q30, [x17, #+1792] +ldr q24, [x17, #+1808] +ldr q29, [x17, #+1824] +ldr q28, [x17, #+1840] +ldr q26, [x17, #+1856] +ldr q27, [x17, #+1872] +ldr q25, [x17, #+1888] +ldr q3, [x17, #+1904] +ldr q2, [x0, 
#864] +ldr q20, [x0, #880] +ldr q21, [x0, #832] +ldr q4, [x0, #848] +sqrdmulh v11.4S, v2.4S, v24.s[0] +mul v2.4S, v2.4S,v30.s[0] +mla v2.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v2.4s +add v21.4s, v21.4s, v2.4s +sqrdmulh v2.4S, v20.4S, v24.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v20.4S, v2.4S, v31.s[0] +sub v2.4s, v4.4s, v20.4s +add v4.4s, v4.4s, v20.4s +sqrdmulh v20.4S, v4.4S, v24.s[1] +mul v4.4S, v4.4S,v30.s[1] +mla v4.4S, v20.4S, v31.s[0] +sub v20.4s, v21.4s, v4.4s +add v21.4s, v21.4s, v4.4s +sqrdmulh v4.4S, v2.4S, v24.s[2] +mul v2.4S, v2.4S,v30.s[2] +mla v2.4S, v4.4S, v31.s[0] +sub v4.4s, v11.4s, v2.4s +add v11.4s, v11.4s, v2.4s +trn1 v2.4S, v21.4S, v20.4S +trn2 v15.4S, v21.4S, v20.4S +trn1 v5.4S, v11.4S, v4.4S +trn2 v0.4S, v11.4S, v4.4S +trn2 v11.2D, v2.2D, v5.2D +trn2 v4.2D, v15.2D, v0.2D +trn1 v21.2D, v2.2D, v5.2D +trn1 v20.2D, v15.2D, v0.2D +sqrdmulh v0.4S, v11.4S, v28.4S +mul v11.4S, v11.4S,v29.4S +mla v11.4S, v0.4S, v31.s[0] +sub v0.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v4.4S, v28.4S +mul v4.4S, v4.4S,v29.4S +mla v4.4S, v11.4S, v31.s[0] +sub v11.4s, v20.4s, v4.4s +add v20.4s, v20.4s, v4.4s +sqrdmulh v4.4S, v20.4S, v27.4S +mul v20.4S, v20.4S,v26.4S +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v11.4S, v3.4S +mul v11.4S, v11.4S,v25.4S +mla v11.4S, v20.4S, v31.s[0] +sub v20.4s, v0.4s, v11.4s +add v0.4s, v0.4s, v11.4s +str q21, [x0, #832] +str q4, [x0, #848] +str q0, [x0, #864] +str q20, [x0, #880] +ldr q20, [x17, #+1920] +ldr q0, [x17, #+1936] +ldr q4, [x17, #+1952] +ldr q21, [x17, #+1968] +ldr q11, [x17, #+1984] +ldr q15, [x17, #+2000] +ldr q5, [x17, #+2016] +ldr q2, [x17, #+2032] +ldr q3, [x0, #928] +ldr q25, [x0, #944] +ldr q27, [x0, #896] +ldr q26, [x0, #912] +sqrdmulh v28.4S, v3.4S, v0.s[0] +mul v3.4S, v3.4S,v20.s[0] +mla v3.4S, v28.4S, v31.s[0] +sub v28.4s, v27.4s, v3.4s +add v27.4s, v27.4s, v3.4s +sqrdmulh v3.4S, v25.4S, v0.s[0] +mul v25.4S, v25.4S,v20.s[0] +mla v25.4S, 
v3.4S, v31.s[0] +sub v3.4s, v26.4s, v25.4s +add v26.4s, v26.4s, v25.4s +sqrdmulh v25.4S, v26.4S, v0.s[1] +mul v26.4S, v26.4S,v20.s[1] +mla v26.4S, v25.4S, v31.s[0] +sub v25.4s, v27.4s, v26.4s +add v27.4s, v27.4s, v26.4s +sqrdmulh v26.4S, v3.4S, v0.s[2] +mul v3.4S, v3.4S,v20.s[2] +mla v3.4S, v26.4S, v31.s[0] +sub v26.4s, v28.4s, v3.4s +add v28.4s, v28.4s, v3.4s +trn1 v3.4S, v27.4S, v25.4S +trn2 v29.4S, v27.4S, v25.4S +trn1 v24.4S, v28.4S, v26.4S +trn2 v30.4S, v28.4S, v26.4S +trn2 v28.2D, v3.2D, v24.2D +trn2 v26.2D, v29.2D, v30.2D +trn1 v27.2D, v3.2D, v24.2D +trn1 v25.2D, v29.2D, v30.2D +sqrdmulh v30.4S, v28.4S, v21.4S +mul v28.4S, v28.4S,v4.4S +mla v28.4S, v30.4S, v31.s[0] +sub v30.4s, v27.4s, v28.4s +add v27.4s, v27.4s, v28.4s +sqrdmulh v28.4S, v26.4S, v21.4S +mul v26.4S, v26.4S,v4.4S +mla v26.4S, v28.4S, v31.s[0] +sub v28.4s, v25.4s, v26.4s +add v25.4s, v25.4s, v26.4s +sqrdmulh v26.4S, v25.4S, v15.4S +mul v25.4S, v25.4S,v11.4S +mla v25.4S, v26.4S, v31.s[0] +sub v26.4s, v27.4s, v25.4s +add v27.4s, v27.4s, v25.4s +sqrdmulh v25.4S, v28.4S, v2.4S +mul v28.4S, v28.4S,v5.4S +mla v28.4S, v25.4S, v31.s[0] +sub v25.4s, v30.4s, v28.4s +add v30.4s, v30.4s, v28.4s +str q27, [x0, #896] +str q26, [x0, #912] +str q30, [x0, #928] +str q25, [x0, #944] +ldr q25, [x17, #+2048] +ldr q30, [x17, #+2064] +ldr q26, [x17, #+2080] +ldr q27, [x17, #+2096] +ldr q28, [x17, #+2112] +ldr q29, [x17, #+2128] +ldr q24, [x17, #+2144] +ldr q3, [x17, #+2160] +ldr q2, [x0, #992] +ldr q5, [x0, #1008] +ldr q15, [x0, #960] +ldr q11, [x0, #976] +sqrdmulh v21.4S, v2.4S, v30.s[0] +mul v2.4S, v2.4S,v25.s[0] +mla v2.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v2.4s +add v15.4s, v15.4s, v2.4s +sqrdmulh v2.4S, v5.4S, v30.s[0] +mul v5.4S, v5.4S,v25.s[0] +mla v5.4S, v2.4S, v31.s[0] +sub v2.4s, v11.4s, v5.4s +add v11.4s, v11.4s, v5.4s +sqrdmulh v5.4S, v11.4S, v30.s[1] +mul v11.4S, v11.4S,v25.s[1] +mla v11.4S, v5.4S, v31.s[0] +sub v5.4s, v15.4s, v11.4s +add v15.4s, v15.4s, v11.4s +sqrdmulh v11.4S, v2.4S, v30.s[2] 
+mul v2.4S, v2.4S,v25.s[2] +mla v2.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v2.4s +add v21.4s, v21.4s, v2.4s +trn1 v2.4S, v15.4S, v5.4S +trn2 v4.4S, v15.4S, v5.4S +trn1 v0.4S, v21.4S, v11.4S +trn2 v20.4S, v21.4S, v11.4S +trn2 v21.2D, v2.2D, v0.2D +trn2 v11.2D, v4.2D, v20.2D +trn1 v15.2D, v2.2D, v0.2D +trn1 v5.2D, v4.2D, v20.2D +sqrdmulh v20.4S, v21.4S, v27.4S +mul v21.4S, v21.4S,v26.4S +mla v21.4S, v20.4S, v31.s[0] +sub v20.4s, v15.4s, v21.4s +add v15.4s, v15.4s, v21.4s +sqrdmulh v21.4S, v11.4S, v27.4S +mul v11.4S, v11.4S,v26.4S +mla v11.4S, v21.4S, v31.s[0] +sub v21.4s, v5.4s, v11.4s +add v5.4s, v5.4s, v11.4s +sqrdmulh v11.4S, v5.4S, v29.4S +mul v5.4S, v5.4S,v28.4S +mla v5.4S, v11.4S, v31.s[0] +sub v11.4s, v15.4s, v5.4s +add v15.4s, v15.4s, v5.4s +sqrdmulh v5.4S, v21.4S, v3.4S +mul v21.4S, v21.4S,v24.4S +mla v21.4S, v5.4S, v31.s[0] +sub v5.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +str q15, [x0, #960] +str q11, [x0, #976] +str q20, [x0, #992] +str q5, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_3_3_0.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_3_3_0.s new file mode 100644 index 0000000..10f383d --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_3_3_0.s @@ -0,0 +1,1474 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software 
without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// +#include +modulus: // One 16-byte vector: lane 0 holds -33556993 and lanes 1-3 are zero. The NTT entry point in this file loads it with ldr q31, [x17] and uses v31.s[0] as the mla multiplier in its butterflies. +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: // Root constants, read as 16-byte vectors at offsets from x17. In the butterflies visible in this file the first vector of each pair (the smaller values) is the mul operand and the second (the larger values) is the sqrdmulh operand. Entries tagged Layer None are zero padding that fills a vector to four words. NOTE(review): the larger values look like precomputed companions of the roots for the sqrdmulh step; confirm against the generator. +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 23825509 // Layer 4, block 0 +.word 27028662 // Layer 4, block 1 +.word 0 // Layer None, block None +.word 1307297022 // Layer 3, block 0 +.word 1524716204 // Layer 4, block 0 +.word 1729702351 // Layer 4, block 1 +.word 0 // Layer None, block None +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 14626653 // Layer 3, block 1 +.word 14833295 // Layer 4, block 2 +.word 2138810 // Layer 4, block 3 +.word 0 // Layer None, block None +.word 936034350 // Layer 3, block 1 +.word 949258429 // Layer 4, block 2 +.word 136873393 // Layer 4, block 3 +.word 0 // Layer None, block None +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 29737761 // Layer 3, block 2 +.word 6490403 // Layer 4, block 4 +.word 19648405 // Layer 4, block 5 +.word 0 // Layer None, block None +.word 1903071454 
// Layer 3, block 2 +.word 415354091 // Layer 4, block 4 +.word 1257401950 // Layer 4, block 5 +.word 0 // Layer None, block None +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 30285189 // Layer 3, block 3 +.word 31254932 // Layer 4, block 6 +.word 26362414 // Layer 4, block 7 +.word 0 // Layer None, block None +.word 1938104173 // Layer 3, block 3 +.word 2000162988 // Layer 4, block 6 +.word 1687065733 // Layer 4, block 7 +.word 0 // Layer None, block None +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 21289485 // Layer 3, block 4 +.word 572895 // Layer 4, block 8 +.word 26691971 // Layer 4, block 9 +.word 0 // Layer None, block None +.word 1362423055 // Layer 3, block 4 +.word 36662482 // Layer 4, block 8 +.word 1708155771 // Layer 4, block 9 +.word 0 // Layer None, block None +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 9914896 // Layer 3, block 5 +.word 9249292 // Layer 4, block 10 +.word 29292862 // Layer 4, block 11 +.word 0 // Layer None, block None +.word 634504916 // Layer 3, block 5 +.word 591909511 // Layer 4, block 10 +.word 1874600091 // Layer 4, block 11 +.word 0 // Layer None, block None +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 25384023 // Layer 
5, block 22 +.word 10905370 // Layer 5, block 23 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 22603682 // Layer 3, block 6 +.word 8247799 // Layer 4, block 12 +.word 5086187 // Layer 4, block 13 +.word 0 // Layer None, block None +.word 1446525244 // Layer 3, block 6 +.word 527818851 // Layer 4, block 12 +.word 325491125 // Layer 4, block 13 +.word 0 // Layer None, block None +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 16204162 // Layer 3, block 7 +.word 28113639 // Layer 4, block 14 +.word 8471290 // Layer 4, block 15 +.word 0 // Layer None, block None +.word 1036987221 // Layer 3, block 7 +.word 1799135579 // Layer 4, block 14 +.word 542121183 // Layer 4, block 15 +.word 0 // Layer None, block None +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.text +.global ntt_u32_incomplete_neon_asm_var_3_3_0 +.global _ntt_u32_incomplete_neon_asm_var_3_3_0 +ntt_u32_incomplete_neon_asm_var_3_3_0: +_ntt_u32_incomplete_neon_asm_var_3_3_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] 
+ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x0, #960] +ldr q25, [x0, #832] +ldr q24, [x0, #576] +ldr q23, [x0, #704] +ldr q22, [x0, #448] +ldr q21, [x0, #320] +ldr q20, [x0, #64] +ldr q19, [x0, #192] +sqrdmulh v18.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +mla v26.4S, v18.4S, v31.s[0] +sub v18.4s, v22.4s, v26.4s +add v22.4s, v22.4s, v26.4s +sqrdmulh v26.4S, v25.4S, v29.s[0] +mul v25.4S, v25.4S,v30.s[0] +mla v25.4S, v26.4S, v31.s[0] +sub v26.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +mla v24.4S, v25.4S, v31.s[0] +sub v25.4s, v20.4s, v24.4s +add v20.4s, v20.4s, v24.4s +sqrdmulh v24.4S, v23.4S, v29.s[0] +mul v23.4S, v23.4S,v30.s[0] +mla v23.4S, v24.4S, v31.s[0] +sub v24.4s, v19.4s, v23.4s +add v19.4s, v19.4s, v23.4s +sqrdmulh v23.4S, v22.4S, v29.s[1] +mul v22.4S, v22.4S,v30.s[1] +mla v22.4S, v23.4S, v31.s[0] +sub v23.4s, v19.4s, v22.4s +add v19.4s, v19.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +mla v18.4S, v21.4S, v31.s[0] +sub v21.4s, v24.4s, v18.4s +add v24.4s, v24.4s, v18.4s +sqrdmulh v18.4S, v26.4S, v29.s[2] +mul v26.4S, v26.4S,v30.s[2] +mla v26.4S, v18.4S, v31.s[0] +sub v18.4s, v25.4s, v26.4s +add v25.4s, v25.4s, v26.4s +sqrdmulh v26.4S, v19.4S, v27.s[0] +mul v19.4S, v19.4S,v28.s[0] +mla v19.4S, v26.4S, v31.s[0] +sub v26.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +str q20, [x0, #64] +str q26, [x0, #192] +sqrdmulh v26.4S, v23.4S, v27.s[1] +mul v23.4S, v23.4S,v28.s[1] +mla v23.4S, v26.4S, v31.s[0] +sub v26.4s, v22.4s, v23.4s +add v22.4s, v22.4s, v23.4s +str q22, [x0, #320] +str q26, [x0, #448] +sqrdmulh v26.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +mla v21.4S, v26.4S, v31.s[0] +sub v26.4s, v18.4s, v21.4s +add 
v18.4s, v18.4s, v21.4s +str q18, [x0, #832] +str q26, [x0, #960] +sqrdmulh v26.4S, v24.4S, v27.s[2] +mul v24.4S, v24.4S,v28.s[2] +mla v24.4S, v26.4S, v31.s[0] +sub v26.4s, v25.4s, v24.4s +add v25.4s, v25.4s, v24.4s +str q25, [x0, #576] +str q26, [x0, #704] +ldr q26, [x0, #976] +ldr q25, [x0, #848] +ldr q24, [x0, #592] +ldr q18, [x0, #720] +ldr q21, [x0, #464] +ldr q22, [x0, #336] +ldr q23, [x0, #80] +ldr q20, [x0, #208] +sqrdmulh v19.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +mla v26.4S, v19.4S, v31.s[0] +sub v19.4s, v21.4s, v26.4s +add v21.4s, v21.4s, v26.4s +sqrdmulh v26.4S, v25.4S, v29.s[0] +mul v25.4S, v25.4S,v30.s[0] +mla v25.4S, v26.4S, v31.s[0] +sub v26.4s, v22.4s, v25.4s +add v22.4s, v22.4s, v25.4s +sqrdmulh v25.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +mla v24.4S, v25.4S, v31.s[0] +sub v25.4s, v23.4s, v24.4s +add v23.4s, v23.4s, v24.4s +sqrdmulh v24.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +mla v18.4S, v24.4S, v31.s[0] +sub v24.4s, v20.4s, v18.4s +add v20.4s, v20.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.s[1] +mul v22.4S, v22.4S,v30.s[1] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v23.4s, v22.4s +add v23.4s, v23.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +mla v19.4S, v22.4S, v31.s[0] +sub v22.4s, v24.4s, v19.4s +add v24.4s, v24.4s, v19.4s +sqrdmulh v19.4S, v26.4S, v29.s[2] +mul v26.4S, v26.4S,v30.s[2] +mla v26.4S, v19.4S, v31.s[0] +sub v19.4s, v25.4s, v26.4s +add v25.4s, v25.4s, v26.4s +sqrdmulh v26.4S, v20.4S, v27.s[0] +mul v20.4S, v20.4S,v28.s[0] +mla v20.4S, v26.4S, v31.s[0] +sub v26.4s, v23.4s, v20.4s +add v23.4s, v23.4s, v20.4s +str q23, [x0, #80] +str q26, [x0, #208] +sqrdmulh v26.4S, v18.4S, v27.s[1] +mul v18.4S, v18.4S,v28.s[1] +mla v18.4S, v26.4S, v31.s[0] +sub v26.4s, v21.4s, v18.4s +add v21.4s, v21.4s, v18.4s +str q21, [x0, #336] +str q26, 
[x0, #464] +sqrdmulh v26.4S, v22.4S, v27.s[3] +mul v22.4S, v22.4S,v28.s[3] +mla v22.4S, v26.4S, v31.s[0] +sub v26.4s, v19.4s, v22.4s +add v19.4s, v19.4s, v22.4s +str q19, [x0, #848] +str q26, [x0, #976] +sqrdmulh v26.4S, v24.4S, v27.s[2] +mul v24.4S, v24.4S,v28.s[2] +mla v24.4S, v26.4S, v31.s[0] +sub v26.4s, v25.4s, v24.4s +add v25.4s, v25.4s, v24.4s +str q25, [x0, #592] +str q26, [x0, #720] +ldr q26, [x0, #992] +ldr q25, [x0, #864] +ldr q24, [x0, #608] +ldr q19, [x0, #736] +ldr q22, [x0, #480] +ldr q21, [x0, #352] +ldr q18, [x0, #96] +ldr q23, [x0, #224] +sqrdmulh v20.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +mla v26.4S, v20.4S, v31.s[0] +sub v20.4s, v22.4s, v26.4s +add v22.4s, v22.4s, v26.4s +sqrdmulh v26.4S, v25.4S, v29.s[0] +mul v25.4S, v25.4S,v30.s[0] +mla v25.4S, v26.4S, v31.s[0] +sub v26.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +mla v24.4S, v25.4S, v31.s[0] +sub v25.4s, v18.4s, v24.4s +add v18.4s, v18.4s, v24.4s +sqrdmulh v24.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +mla v19.4S, v24.4S, v31.s[0] +sub v24.4s, v23.4s, v19.4s +add v23.4s, v23.4s, v19.4s +sqrdmulh v19.4S, v22.4S, v29.s[1] +mul v22.4S, v22.4S,v30.s[1] +mla v22.4S, v19.4S, v31.s[0] +sub v19.4s, v23.4s, v22.4s +add v23.4s, v23.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +mla v20.4S, v21.4S, v31.s[0] +sub v21.4s, v24.4s, v20.4s +add v24.4s, v24.4s, v20.4s +sqrdmulh v20.4S, v26.4S, v29.s[2] +mul v26.4S, v26.4S,v30.s[2] +mla v26.4S, v20.4S, v31.s[0] +sub v20.4s, v25.4s, v26.4s +add v25.4s, v25.4s, v26.4s +sqrdmulh v26.4S, v23.4S, v27.s[0] +mul v23.4S, v23.4S,v28.s[0] +mla v23.4S, v26.4S, v31.s[0] +sub v26.4s, v18.4s, v23.4s +add v18.4s, v18.4s, v23.4s +str q18, [x0, #96] +str q26, [x0, #224] +sqrdmulh v26.4S, v19.4S, v27.s[1] +mul 
v19.4S, v19.4S,v28.s[1] +mla v19.4S, v26.4S, v31.s[0] +sub v26.4s, v22.4s, v19.4s +add v22.4s, v22.4s, v19.4s +str q22, [x0, #352] +str q26, [x0, #480] +sqrdmulh v26.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +mla v21.4S, v26.4S, v31.s[0] +sub v26.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +str q20, [x0, #864] +str q26, [x0, #992] +sqrdmulh v26.4S, v24.4S, v27.s[2] +mul v24.4S, v24.4S,v28.s[2] +mla v24.4S, v26.4S, v31.s[0] +sub v26.4s, v25.4s, v24.4s +add v25.4s, v25.4s, v24.4s +str q25, [x0, #608] +str q26, [x0, #736] +ldr q26, [x0, #1008] +ldr q25, [x0, #880] +ldr q24, [x0, #624] +ldr q20, [x0, #752] +ldr q21, [x0, #496] +ldr q22, [x0, #368] +ldr q19, [x0, #112] +ldr q18, [x0, #240] +sqrdmulh v23.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +mla v26.4S, v23.4S, v31.s[0] +sub v23.4s, v21.4s, v26.4s +add v21.4s, v21.4s, v26.4s +sqrdmulh v26.4S, v25.4S, v29.s[0] +mul v25.4S, v25.4S,v30.s[0] +mla v25.4S, v26.4S, v31.s[0] +sub v26.4s, v22.4s, v25.4s +add v22.4s, v22.4s, v25.4s +sqrdmulh v25.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +mla v24.4S, v25.4S, v31.s[0] +sub v25.4s, v19.4s, v24.4s +add v19.4s, v19.4s, v24.4s +sqrdmulh v24.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v20.4S, v24.4S, v31.s[0] +sub v24.4s, v18.4s, v20.4s +add v18.4s, v18.4s, v20.4s +sqrdmulh v20.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +mla v21.4S, v20.4S, v31.s[0] +sub v20.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.s[1] +mul v22.4S, v22.4S,v30.s[1] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v22.4s +add v19.4s, v19.4s, v22.4s +sqrdmulh v22.4S, v23.4S, v29.s[2] +mul v23.4S, v23.4S,v30.s[2] +mla v23.4S, v22.4S, v31.s[0] +sub v22.4s, v24.4s, v23.4s +add v24.4s, v24.4s, v23.4s +sqrdmulh v23.4S, v26.4S, v29.s[2] +mul v26.4S, v26.4S,v30.s[2] +mla v26.4S, v23.4S, v31.s[0] +sub v23.4s, v25.4s, v26.4s +add v25.4s, v25.4s, v26.4s +sqrdmulh v26.4S, v18.4S, v27.s[0] +mul v18.4S, v18.4S,v28.s[0] +mla v18.4S, v26.4S, v31.s[0] 
+sub v26.4s, v19.4s, v18.4s +add v19.4s, v19.4s, v18.4s +str q19, [x0, #112] +str q26, [x0, #240] +sqrdmulh v26.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +mla v20.4S, v26.4S, v31.s[0] +sub v26.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +str q21, [x0, #368] +str q26, [x0, #496] +sqrdmulh v26.4S, v22.4S, v27.s[3] +mul v22.4S, v22.4S,v28.s[3] +mla v22.4S, v26.4S, v31.s[0] +sub v26.4s, v23.4s, v22.4s +add v23.4s, v23.4s, v22.4s +str q23, [x0, #880] +str q26, [x0, #1008] +sqrdmulh v26.4S, v24.4S, v27.s[2] +mul v24.4S, v24.4S,v28.s[2] +mla v24.4S, v26.4S, v31.s[0] +sub v26.4s, v25.4s, v24.4s +add v25.4s, v25.4s, v24.4s +str q25, [x0, #624] +str q26, [x0, #752] +ldr q26, [x0, #896] +ldr q25, [x0, #768] +ldr q24, [x0, #512] +ldr q23, [x0, #640] +ldr q22, [x0, #384] +ldr q21, [x0, #256] +ldr q20, [x0, #0] +ldr q19, [x0, #128] +sqrdmulh v18.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +mla v26.4S, v18.4S, v31.s[0] +sub v18.4s, v22.4s, v26.4s +add v22.4s, v22.4s, v26.4s +sqrdmulh v26.4S, v25.4S, v29.s[0] +mul v25.4S, v25.4S,v30.s[0] +mla v25.4S, v26.4S, v31.s[0] +sub v26.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +mla v24.4S, v25.4S, v31.s[0] +sub v25.4s, v20.4s, v24.4s +add v20.4s, v20.4s, v24.4s +sqrdmulh v24.4S, v23.4S, v29.s[0] +mul v23.4S, v23.4S,v30.s[0] +mla v23.4S, v24.4S, v31.s[0] +sub v24.4s, v19.4s, v23.4s +add v19.4s, v19.4s, v23.4s +sqrdmulh v23.4S, v22.4S, v29.s[1] +mul v22.4S, v22.4S,v30.s[1] +mla v22.4S, v23.4S, v31.s[0] +sub v23.4s, v19.4s, v22.4s +add v19.4s, v19.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +mla v18.4S, v21.4S, v31.s[0] +sub v21.4s, v24.4s, v18.4s +add v24.4s, v24.4s, v18.4s +sqrdmulh v18.4S, v26.4S, v29.s[2] +mul v26.4S, v26.4S,v30.s[2] +mla v26.4S, v18.4S, v31.s[0] +sub v18.4s, 
v25.4s, v26.4s +add v25.4s, v25.4s, v26.4s +sqrdmulh v26.4S, v19.4S, v27.s[0] +mul v19.4S, v19.4S,v28.s[0] +mla v19.4S, v26.4S, v31.s[0] +sub v26.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +str q20, [x0, #0] +str q26, [x0, #128] +sqrdmulh v26.4S, v23.4S, v27.s[1] +mul v23.4S, v23.4S,v28.s[1] +mla v23.4S, v26.4S, v31.s[0] +sub v26.4s, v22.4s, v23.4s +add v22.4s, v22.4s, v23.4s +str q22, [x0, #256] +str q26, [x0, #384] +sqrdmulh v26.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +mla v21.4S, v26.4S, v31.s[0] +sub v26.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +str q18, [x0, #768] +str q26, [x0, #896] +sqrdmulh v26.4S, v24.4S, v27.s[2] +mul v24.4S, v24.4S,v28.s[2] +mla v24.4S, v26.4S, v31.s[0] +sub v26.4s, v25.4s, v24.4s +add v25.4s, v25.4s, v24.4s +str q25, [x0, #512] +str q26, [x0, #640] +ldr q26, [x0, #912] +ldr q25, [x0, #784] +ldr q24, [x0, #528] +ldr q18, [x0, #656] +ldr q21, [x0, #400] +ldr q22, [x0, #272] +ldr q23, [x0, #16] +ldr q20, [x0, #144] +sqrdmulh v19.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +mla v26.4S, v19.4S, v31.s[0] +sub v19.4s, v21.4s, v26.4s +add v21.4s, v21.4s, v26.4s +sqrdmulh v26.4S, v25.4S, v29.s[0] +mul v25.4S, v25.4S,v30.s[0] +mla v25.4S, v26.4S, v31.s[0] +sub v26.4s, v22.4s, v25.4s +add v22.4s, v22.4s, v25.4s +sqrdmulh v25.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +mla v24.4S, v25.4S, v31.s[0] +sub v25.4s, v23.4s, v24.4s +add v23.4s, v23.4s, v24.4s +sqrdmulh v24.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +mla v18.4S, v24.4S, v31.s[0] +sub v24.4s, v20.4s, v18.4s +add v20.4s, v20.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.s[1] +mul v22.4S, v22.4S,v30.s[1] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v23.4s, v22.4s +add v23.4s, v23.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +mla v19.4S, v22.4S, v31.s[0] +sub v22.4s, v24.4s, v19.4s 
+add v24.4s, v24.4s, v19.4s +sqrdmulh v19.4S, v26.4S, v29.s[2] +mul v26.4S, v26.4S,v30.s[2] +mla v26.4S, v19.4S, v31.s[0] +sub v19.4s, v25.4s, v26.4s +add v25.4s, v25.4s, v26.4s +sqrdmulh v26.4S, v20.4S, v27.s[0] +mul v20.4S, v20.4S,v28.s[0] +mla v20.4S, v26.4S, v31.s[0] +sub v26.4s, v23.4s, v20.4s +add v23.4s, v23.4s, v20.4s +str q23, [x0, #16] +str q26, [x0, #144] +sqrdmulh v26.4S, v18.4S, v27.s[1] +mul v18.4S, v18.4S,v28.s[1] +mla v18.4S, v26.4S, v31.s[0] +sub v26.4s, v21.4s, v18.4s +add v21.4s, v21.4s, v18.4s +str q21, [x0, #272] +str q26, [x0, #400] +sqrdmulh v26.4S, v22.4S, v27.s[3] +mul v22.4S, v22.4S,v28.s[3] +mla v22.4S, v26.4S, v31.s[0] +sub v26.4s, v19.4s, v22.4s +add v19.4s, v19.4s, v22.4s +str q19, [x0, #784] +str q26, [x0, #912] +sqrdmulh v26.4S, v24.4S, v27.s[2] +mul v24.4S, v24.4S,v28.s[2] +mla v24.4S, v26.4S, v31.s[0] +sub v26.4s, v25.4s, v24.4s +add v25.4s, v25.4s, v24.4s +str q25, [x0, #528] +str q26, [x0, #656] +ldr q26, [x0, #928] +ldr q25, [x0, #800] +ldr q24, [x0, #544] +ldr q19, [x0, #672] +ldr q22, [x0, #416] +ldr q21, [x0, #288] +ldr q18, [x0, #32] +ldr q23, [x0, #160] +sqrdmulh v20.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +mla v26.4S, v20.4S, v31.s[0] +sub v20.4s, v22.4s, v26.4s +add v22.4s, v22.4s, v26.4s +sqrdmulh v26.4S, v25.4S, v29.s[0] +mul v25.4S, v25.4S,v30.s[0] +mla v25.4S, v26.4S, v31.s[0] +sub v26.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +mla v24.4S, v25.4S, v31.s[0] +sub v25.4s, v18.4s, v24.4s +add v18.4s, v18.4s, v24.4s +sqrdmulh v24.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +mla v19.4S, v24.4S, v31.s[0] +sub v24.4s, v23.4s, v19.4s +add v23.4s, v23.4s, v19.4s +sqrdmulh v19.4S, v22.4S, v29.s[1] +mul v22.4S, v22.4S,v30.s[1] +mla v22.4S, v19.4S, v31.s[0] +sub v19.4s, v23.4s, v22.4s +add v23.4s, v23.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v21.4s +add v18.4s, 
v18.4s, v21.4s +sqrdmulh v21.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +mla v20.4S, v21.4S, v31.s[0] +sub v21.4s, v24.4s, v20.4s +add v24.4s, v24.4s, v20.4s +sqrdmulh v20.4S, v26.4S, v29.s[2] +mul v26.4S, v26.4S,v30.s[2] +mla v26.4S, v20.4S, v31.s[0] +sub v20.4s, v25.4s, v26.4s +add v25.4s, v25.4s, v26.4s +sqrdmulh v26.4S, v23.4S, v27.s[0] +mul v23.4S, v23.4S,v28.s[0] +mla v23.4S, v26.4S, v31.s[0] +sub v26.4s, v18.4s, v23.4s +add v18.4s, v18.4s, v23.4s +str q18, [x0, #32] +str q26, [x0, #160] +sqrdmulh v26.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +mla v19.4S, v26.4S, v31.s[0] +sub v26.4s, v22.4s, v19.4s +add v22.4s, v22.4s, v19.4s +str q22, [x0, #288] +str q26, [x0, #416] +sqrdmulh v26.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +mla v21.4S, v26.4S, v31.s[0] +sub v26.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +str q20, [x0, #800] +str q26, [x0, #928] +sqrdmulh v26.4S, v24.4S, v27.s[2] +mul v24.4S, v24.4S,v28.s[2] +mla v24.4S, v26.4S, v31.s[0] +sub v26.4s, v25.4s, v24.4s +add v25.4s, v25.4s, v24.4s +str q25, [x0, #544] +str q26, [x0, #672] +ldr q26, [x0, #944] +ldr q25, [x0, #816] +ldr q24, [x0, #560] +ldr q20, [x0, #688] +ldr q21, [x0, #432] +ldr q22, [x0, #304] +ldr q19, [x0, #48] +ldr q18, [x0, #176] +sqrdmulh v23.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +mla v26.4S, v23.4S, v31.s[0] +sub v23.4s, v21.4s, v26.4s +add v21.4s, v21.4s, v26.4s +sqrdmulh v26.4S, v25.4S, v29.s[0] +mul v25.4S, v25.4S,v30.s[0] +mla v25.4S, v26.4S, v31.s[0] +sub v26.4s, v22.4s, v25.4s +add v22.4s, v22.4s, v25.4s +sqrdmulh v25.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +mla v24.4S, v25.4S, v31.s[0] +sub v25.4s, v19.4s, v24.4s +add v19.4s, v19.4s, v24.4s +sqrdmulh v24.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v20.4S, v24.4S, v31.s[0] +sub v24.4s, v18.4s, v20.4s +add v18.4s, v18.4s, v20.4s +sqrdmulh v20.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +mla v21.4S, v20.4S, v31.s[0] +sub v20.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s 
+sqrdmulh v21.4S, v22.4S, v29.s[1] +mul v22.4S, v22.4S,v30.s[1] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v22.4s +add v19.4s, v19.4s, v22.4s +sqrdmulh v22.4S, v23.4S, v29.s[2] +mul v23.4S, v23.4S,v30.s[2] +mla v23.4S, v22.4S, v31.s[0] +sub v22.4s, v24.4s, v23.4s +add v24.4s, v24.4s, v23.4s +sqrdmulh v23.4S, v26.4S, v29.s[2] +mul v26.4S, v26.4S,v30.s[2] +mla v26.4S, v23.4S, v31.s[0] +sub v23.4s, v25.4s, v26.4s +add v25.4s, v25.4s, v26.4s +sqrdmulh v26.4S, v18.4S, v27.s[0] +mul v18.4S, v18.4S,v28.s[0] +mla v18.4S, v26.4S, v31.s[0] +sub v26.4s, v19.4s, v18.4s +add v19.4s, v19.4s, v18.4s +str q19, [x0, #48] +str q26, [x0, #176] +sqrdmulh v26.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +mla v20.4S, v26.4S, v31.s[0] +sub v26.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +str q21, [x0, #304] +str q26, [x0, #432] +sqrdmulh v26.4S, v22.4S, v27.s[3] +mul v22.4S, v22.4S,v28.s[3] +mla v22.4S, v26.4S, v31.s[0] +sub v26.4s, v23.4s, v22.4s +add v23.4s, v23.4s, v22.4s +str q23, [x0, #816] +str q26, [x0, #944] +sqrdmulh v26.4S, v24.4S, v27.s[2] +mul v24.4S, v24.4S,v28.s[2] +mla v24.4S, v26.4S, v31.s[0] +sub v26.4s, v25.4s, v24.4s +add v25.4s, v25.4s, v24.4s +str q25, [x0, #560] +str q26, [x0, #688] +ldr q4, [x17, #+64] +ldr q5, [x17, #+80] +ldr q6, [x17, #+96] +ldr q7, [x17, #+112] +ldr q8, [x0, #112] +ldr q9, [x0, #96] +ldr q10, [x0, #64] +ldr q11, [x0, #80] +ldr q12, [x0, #48] +ldr q13, [x0, #32] +ldr q14, [x0, #0] +ldr q15, [x0, #16] +sqrdmulh v0.4S, v8.4S, v5.s[0] +mul v8.4S, v8.4S,v4.s[0] +mla v8.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v8.4s +add v12.4s, v12.4s, v8.4s +sqrdmulh v8.4S, v9.4S, v5.s[0] +mul v9.4S, v9.4S,v4.s[0] +mla v9.4S, v8.4S, v31.s[0] +sub v8.4s, v13.4s, v9.4s +add v13.4s, v13.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v5.s[0] +mul v10.4S, v10.4S,v4.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v14.4s, v10.4s +add v14.4s, v14.4s, v10.4s +sqrdmulh v10.4S, v11.4S, v5.s[0] +mul v11.4S, v11.4S,v4.s[0] +mla v11.4S, v10.4S, v31.s[0] +sub v10.4s, v15.4s, 
v11.4s +add v15.4s, v15.4s, v11.4s +sqrdmulh v11.4S, v12.4S, v5.s[1] +mul v12.4S, v12.4S,v4.s[1] +mla v12.4S, v11.4S, v31.s[0] +sub v11.4s, v15.4s, v12.4s +add v15.4s, v15.4s, v12.4s +ldr q12, [x17, #+128] +ldr q1, [x17, #+144] +ldr q2, [x17, #+160] +ldr q3, [x17, #+176] +sqrdmulh v16.4S, v13.4S, v5.s[1] +mul v13.4S, v13.4S,v4.s[1] +mla v13.4S, v16.4S, v31.s[0] +sub v16.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +sqrdmulh v13.4S, v0.4S, v5.s[2] +mul v0.4S, v0.4S,v4.s[2] +mla v0.4S, v13.4S, v31.s[0] +sub v13.4s, v10.4s, v0.4s +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v8.4S, v5.s[2] +mul v8.4S, v8.4S,v4.s[2] +mla v8.4S, v0.4S, v31.s[0] +sub v0.4s, v9.4s, v8.4s +add v9.4s, v9.4s, v8.4s +sqrdmulh v8.4S, v15.4S, v7.s[0] +mul v15.4S, v15.4S,v6.s[0] +mla v15.4S, v8.4S, v31.s[0] +sub v8.4s, v14.4s, v15.4s +add v14.4s, v14.4s, v15.4s +str q14, [x0, #0] +str q8, [x0, #16] +sqrdmulh v8.4S, v11.4S, v7.s[1] +mul v11.4S, v11.4S,v6.s[1] +mla v11.4S, v8.4S, v31.s[0] +sub v8.4s, v16.4s, v11.4s +add v16.4s, v16.4s, v11.4s +str q16, [x0, #32] +str q8, [x0, #48] +sqrdmulh v8.4S, v13.4S, v7.s[3] +mul v13.4S, v13.4S,v6.s[3] +mla v13.4S, v8.4S, v31.s[0] +sub v8.4s, v0.4s, v13.4s +add v0.4s, v0.4s, v13.4s +str q0, [x0, #96] +str q8, [x0, #112] +sqrdmulh v8.4S, v10.4S, v7.s[2] +mul v10.4S, v10.4S,v6.s[2] +mla v10.4S, v8.4S, v31.s[0] +sub v8.4s, v9.4s, v10.4s +add v9.4s, v9.4s, v10.4s +str q9, [x0, #64] +str q8, [x0, #80] +ldr q8, [x0, #240] +ldr q9, [x0, #224] +ldr q10, [x0, #192] +ldr q0, [x0, #208] +ldr q13, [x0, #176] +ldr q16, [x0, #160] +ldr q11, [x0, #128] +ldr q14, [x0, #144] +sqrdmulh v15.4S, v8.4S, v1.s[0] +mul v8.4S, v8.4S,v12.s[0] +mla v8.4S, v15.4S, v31.s[0] +sub v15.4s, v13.4s, v8.4s +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v9.4S, v1.s[0] +mul v9.4S, v9.4S,v12.s[0] +mla v9.4S, v8.4S, v31.s[0] +sub v8.4s, v16.4s, v9.4s +add v16.4s, v16.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v1.s[0] +mul v10.4S, v10.4S,v12.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v11.4s, v10.4s +add 
v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v1.s[0] +mul v0.4S, v0.4S,v12.s[0] +mla v0.4S, v10.4S, v31.s[0] +sub v10.4s, v14.4s, v0.4s +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v13.4S, v1.s[1] +mul v13.4S, v13.4S,v12.s[1] +mla v13.4S, v0.4S, v31.s[0] +sub v0.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +ldr q13, [x17, #+192] +ldr q17, [x17, #+208] +ldr q18, [x17, #+224] +ldr q19, [x17, #+240] +sqrdmulh v20.4S, v16.4S, v1.s[1] +mul v16.4S, v16.4S,v12.s[1] +mla v16.4S, v20.4S, v31.s[0] +sub v20.4s, v11.4s, v16.4s +add v11.4s, v11.4s, v16.4s +sqrdmulh v16.4S, v15.4S, v1.s[2] +mul v15.4S, v15.4S,v12.s[2] +mla v15.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v15.4s +add v10.4s, v10.4s, v15.4s +sqrdmulh v15.4S, v8.4S, v1.s[2] +mul v8.4S, v8.4S,v12.s[2] +mla v8.4S, v15.4S, v31.s[0] +sub v15.4s, v9.4s, v8.4s +add v9.4s, v9.4s, v8.4s +sqrdmulh v8.4S, v14.4S, v3.s[0] +mul v14.4S, v14.4S,v2.s[0] +mla v14.4S, v8.4S, v31.s[0] +sub v8.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +str q11, [x0, #128] +str q8, [x0, #144] +sqrdmulh v8.4S, v0.4S, v3.s[1] +mul v0.4S, v0.4S,v2.s[1] +mla v0.4S, v8.4S, v31.s[0] +sub v8.4s, v20.4s, v0.4s +add v20.4s, v20.4s, v0.4s +str q20, [x0, #160] +str q8, [x0, #176] +sqrdmulh v8.4S, v16.4S, v3.s[3] +mul v16.4S, v16.4S,v2.s[3] +mla v16.4S, v8.4S, v31.s[0] +sub v8.4s, v15.4s, v16.4s +add v15.4s, v15.4s, v16.4s +str q15, [x0, #224] +str q8, [x0, #240] +sqrdmulh v8.4S, v10.4S, v3.s[2] +mul v10.4S, v10.4S,v2.s[2] +mla v10.4S, v8.4S, v31.s[0] +sub v8.4s, v9.4s, v10.4s +add v9.4s, v9.4s, v10.4s +str q9, [x0, #192] +str q8, [x0, #208] +ldr q7, [x0, #368] +ldr q6, [x0, #352] +ldr q5, [x0, #320] +ldr q4, [x0, #336] +ldr q8, [x0, #304] +ldr q9, [x0, #288] +ldr q10, [x0, #256] +ldr q15, [x0, #272] +sqrdmulh v16.4S, v7.4S, v17.s[0] +mul v7.4S, v7.4S,v13.s[0] +mla v7.4S, v16.4S, v31.s[0] +sub v16.4s, v8.4s, v7.4s +add v8.4s, v8.4s, v7.4s +sqrdmulh v7.4S, v6.4S, v17.s[0] +mul v6.4S, v6.4S,v13.s[0] +mla v6.4S, v7.4S, v31.s[0] +sub v7.4s, v9.4s, v6.4s +add 
v9.4s, v9.4s, v6.4s +sqrdmulh v6.4S, v5.4S, v17.s[0] +mul v5.4S, v5.4S,v13.s[0] +mla v5.4S, v6.4S, v31.s[0] +sub v6.4s, v10.4s, v5.4s +add v10.4s, v10.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v17.s[0] +mul v4.4S, v4.4S,v13.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v15.4s, v4.4s +add v15.4s, v15.4s, v4.4s +sqrdmulh v4.4S, v8.4S, v17.s[1] +mul v8.4S, v8.4S,v13.s[1] +mla v8.4S, v4.4S, v31.s[0] +sub v4.4s, v15.4s, v8.4s +add v15.4s, v15.4s, v8.4s +ldr q8, [x17, #+256] +ldr q20, [x17, #+272] +ldr q0, [x17, #+288] +ldr q11, [x17, #+304] +sqrdmulh v14.4S, v9.4S, v17.s[1] +mul v9.4S, v9.4S,v13.s[1] +mla v9.4S, v14.4S, v31.s[0] +sub v14.4s, v10.4s, v9.4s +add v10.4s, v10.4s, v9.4s +sqrdmulh v9.4S, v16.4S, v17.s[2] +mul v16.4S, v16.4S,v13.s[2] +mla v16.4S, v9.4S, v31.s[0] +sub v9.4s, v5.4s, v16.4s +add v5.4s, v5.4s, v16.4s +sqrdmulh v16.4S, v7.4S, v17.s[2] +mul v7.4S, v7.4S,v13.s[2] +mla v7.4S, v16.4S, v31.s[0] +sub v16.4s, v6.4s, v7.4s +add v6.4s, v6.4s, v7.4s +sqrdmulh v7.4S, v15.4S, v19.s[0] +mul v15.4S, v15.4S,v18.s[0] +mla v15.4S, v7.4S, v31.s[0] +sub v7.4s, v10.4s, v15.4s +add v10.4s, v10.4s, v15.4s +str q10, [x0, #256] +str q7, [x0, #272] +sqrdmulh v7.4S, v4.4S, v19.s[1] +mul v4.4S, v4.4S,v18.s[1] +mla v4.4S, v7.4S, v31.s[0] +sub v7.4s, v14.4s, v4.4s +add v14.4s, v14.4s, v4.4s +str q14, [x0, #288] +str q7, [x0, #304] +sqrdmulh v7.4S, v9.4S, v19.s[3] +mul v9.4S, v9.4S,v18.s[3] +mla v9.4S, v7.4S, v31.s[0] +sub v7.4s, v16.4s, v9.4s +add v16.4s, v16.4s, v9.4s +str q16, [x0, #352] +str q7, [x0, #368] +sqrdmulh v7.4S, v5.4S, v19.s[2] +mul v5.4S, v5.4S,v18.s[2] +mla v5.4S, v7.4S, v31.s[0] +sub v7.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +str q6, [x0, #320] +str q7, [x0, #336] +ldr q3, [x0, #496] +ldr q2, [x0, #480] +ldr q1, [x0, #448] +ldr q12, [x0, #464] +ldr q7, [x0, #432] +ldr q6, [x0, #416] +ldr q5, [x0, #384] +ldr q16, [x0, #400] +sqrdmulh v9.4S, v3.4S, v20.s[0] +mul v3.4S, v3.4S,v8.s[0] +mla v3.4S, v9.4S, v31.s[0] +sub v9.4s, v7.4s, v3.4s +add v7.4s, v7.4s, v3.4s 
+sqrdmulh v3.4S, v2.4S, v20.s[0] +mul v2.4S, v2.4S,v8.s[0] +mla v2.4S, v3.4S, v31.s[0] +sub v3.4s, v6.4s, v2.4s +add v6.4s, v6.4s, v2.4s +sqrdmulh v2.4S, v1.4S, v20.s[0] +mul v1.4S, v1.4S,v8.s[0] +mla v1.4S, v2.4S, v31.s[0] +sub v2.4s, v5.4s, v1.4s +add v5.4s, v5.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v20.s[0] +mul v12.4S, v12.4S,v8.s[0] +mla v12.4S, v1.4S, v31.s[0] +sub v1.4s, v16.4s, v12.4s +add v16.4s, v16.4s, v12.4s +sqrdmulh v12.4S, v7.4S, v20.s[1] +mul v7.4S, v7.4S,v8.s[1] +mla v7.4S, v12.4S, v31.s[0] +sub v12.4s, v16.4s, v7.4s +add v16.4s, v16.4s, v7.4s +ldr q7, [x17, #+320] +ldr q14, [x17, #+336] +ldr q4, [x17, #+352] +ldr q10, [x17, #+368] +sqrdmulh v15.4S, v6.4S, v20.s[1] +mul v6.4S, v6.4S,v8.s[1] +mla v6.4S, v15.4S, v31.s[0] +sub v15.4s, v5.4s, v6.4s +add v5.4s, v5.4s, v6.4s +sqrdmulh v6.4S, v9.4S, v20.s[2] +mul v9.4S, v9.4S,v8.s[2] +mla v9.4S, v6.4S, v31.s[0] +sub v6.4s, v1.4s, v9.4s +add v1.4s, v1.4s, v9.4s +sqrdmulh v9.4S, v3.4S, v20.s[2] +mul v3.4S, v3.4S,v8.s[2] +mla v3.4S, v9.4S, v31.s[0] +sub v9.4s, v2.4s, v3.4s +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v16.4S, v11.s[0] +mul v16.4S, v16.4S,v0.s[0] +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v5.4s, v16.4s +add v5.4s, v5.4s, v16.4s +str q5, [x0, #384] +str q3, [x0, #400] +sqrdmulh v3.4S, v12.4S, v11.s[1] +mul v12.4S, v12.4S,v0.s[1] +mla v12.4S, v3.4S, v31.s[0] +sub v3.4s, v15.4s, v12.4s +add v15.4s, v15.4s, v12.4s +str q15, [x0, #416] +str q3, [x0, #432] +sqrdmulh v3.4S, v6.4S, v11.s[3] +mul v6.4S, v6.4S,v0.s[3] +mla v6.4S, v3.4S, v31.s[0] +sub v3.4s, v9.4s, v6.4s +add v9.4s, v9.4s, v6.4s +str q9, [x0, #480] +str q3, [x0, #496] +sqrdmulh v3.4S, v1.4S, v11.s[2] +mul v1.4S, v1.4S,v0.s[2] +mla v1.4S, v3.4S, v31.s[0] +sub v3.4s, v2.4s, v1.4s +add v2.4s, v2.4s, v1.4s +str q2, [x0, #448] +str q3, [x0, #464] +ldr q19, [x0, #624] +ldr q18, [x0, #608] +ldr q17, [x0, #576] +ldr q13, [x0, #592] +ldr q3, [x0, #560] +ldr q2, [x0, #544] +ldr q1, [x0, #512] +ldr q9, [x0, #528] +sqrdmulh v6.4S, v19.4S, v14.s[0] +mul 
v19.4S, v19.4S,v7.s[0] +mla v19.4S, v6.4S, v31.s[0] +sub v6.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v14.s[0] +mul v18.4S, v18.4S,v7.s[0] +mla v18.4S, v19.4S, v31.s[0] +sub v19.4s, v2.4s, v18.4s +add v2.4s, v2.4s, v18.4s +sqrdmulh v18.4S, v17.4S, v14.s[0] +mul v17.4S, v17.4S,v7.s[0] +mla v17.4S, v18.4S, v31.s[0] +sub v18.4s, v1.4s, v17.4s +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v13.4S, v14.s[0] +mul v13.4S, v13.4S,v7.s[0] +mla v13.4S, v17.4S, v31.s[0] +sub v17.4s, v9.4s, v13.4s +add v9.4s, v9.4s, v13.4s +sqrdmulh v13.4S, v3.4S, v14.s[1] +mul v3.4S, v3.4S,v7.s[1] +mla v3.4S, v13.4S, v31.s[0] +sub v13.4s, v9.4s, v3.4s +add v9.4s, v9.4s, v3.4s +ldr q3, [x17, #+384] +ldr q15, [x17, #+400] +ldr q12, [x17, #+416] +ldr q5, [x17, #+432] +sqrdmulh v16.4S, v2.4S, v14.s[1] +mul v2.4S, v2.4S,v7.s[1] +mla v2.4S, v16.4S, v31.s[0] +sub v16.4s, v1.4s, v2.4s +add v1.4s, v1.4s, v2.4s +sqrdmulh v2.4S, v6.4S, v14.s[2] +mul v6.4S, v6.4S,v7.s[2] +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v17.4s, v6.4s +add v17.4s, v17.4s, v6.4s +sqrdmulh v6.4S, v19.4S, v14.s[2] +mul v19.4S, v19.4S,v7.s[2] +mla v19.4S, v6.4S, v31.s[0] +sub v6.4s, v18.4s, v19.4s +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v9.4S, v10.s[0] +mul v9.4S, v9.4S,v4.s[0] +mla v9.4S, v19.4S, v31.s[0] +sub v19.4s, v1.4s, v9.4s +add v1.4s, v1.4s, v9.4s +str q1, [x0, #512] +str q19, [x0, #528] +sqrdmulh v19.4S, v13.4S, v10.s[1] +mul v13.4S, v13.4S,v4.s[1] +mla v13.4S, v19.4S, v31.s[0] +sub v19.4s, v16.4s, v13.4s +add v16.4s, v16.4s, v13.4s +str q16, [x0, #544] +str q19, [x0, #560] +sqrdmulh v19.4S, v2.4S, v10.s[3] +mul v2.4S, v2.4S,v4.s[3] +mla v2.4S, v19.4S, v31.s[0] +sub v19.4s, v6.4s, v2.4s +add v6.4s, v6.4s, v2.4s +str q6, [x0, #608] +str q19, [x0, #624] +sqrdmulh v19.4S, v17.4S, v10.s[2] +mul v17.4S, v17.4S,v4.s[2] +mla v17.4S, v19.4S, v31.s[0] +sub v19.4s, v18.4s, v17.4s +add v18.4s, v18.4s, v17.4s +str q18, [x0, #576] +str q19, [x0, #592] +ldr q11, [x0, #752] +ldr q0, [x0, #736] +ldr q20, [x0, 
#704] +ldr q8, [x0, #720] +ldr q19, [x0, #688] +ldr q18, [x0, #672] +ldr q17, [x0, #640] +ldr q6, [x0, #656] +sqrdmulh v2.4S, v11.4S, v15.s[0] +mul v11.4S, v11.4S,v3.s[0] +mla v11.4S, v2.4S, v31.s[0] +sub v2.4s, v19.4s, v11.4s +add v19.4s, v19.4s, v11.4s +sqrdmulh v11.4S, v0.4S, v15.s[0] +mul v0.4S, v0.4S,v3.s[0] +mla v0.4S, v11.4S, v31.s[0] +sub v11.4s, v18.4s, v0.4s +add v18.4s, v18.4s, v0.4s +sqrdmulh v0.4S, v20.4S, v15.s[0] +mul v20.4S, v20.4S,v3.s[0] +mla v20.4S, v0.4S, v31.s[0] +sub v0.4s, v17.4s, v20.4s +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v15.s[0] +mul v8.4S, v8.4S,v3.s[0] +mla v8.4S, v20.4S, v31.s[0] +sub v20.4s, v6.4s, v8.4s +add v6.4s, v6.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v15.s[1] +mul v19.4S, v19.4S,v3.s[1] +mla v19.4S, v8.4S, v31.s[0] +sub v8.4s, v6.4s, v19.4s +add v6.4s, v6.4s, v19.4s +ldr q19, [x17, #+448] +ldr q16, [x17, #+464] +ldr q13, [x17, #+480] +ldr q1, [x17, #+496] +sqrdmulh v9.4S, v18.4S, v15.s[1] +mul v18.4S, v18.4S,v3.s[1] +mla v18.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v18.4s +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v15.s[2] +mul v2.4S, v2.4S,v3.s[2] +mla v2.4S, v18.4S, v31.s[0] +sub v18.4s, v20.4s, v2.4s +add v20.4s, v20.4s, v2.4s +sqrdmulh v2.4S, v11.4S, v15.s[2] +mul v11.4S, v11.4S,v3.s[2] +mla v11.4S, v2.4S, v31.s[0] +sub v2.4s, v0.4s, v11.4s +add v0.4s, v0.4s, v11.4s +sqrdmulh v11.4S, v6.4S, v5.s[0] +mul v6.4S, v6.4S,v12.s[0] +mla v6.4S, v11.4S, v31.s[0] +sub v11.4s, v17.4s, v6.4s +add v17.4s, v17.4s, v6.4s +str q17, [x0, #640] +str q11, [x0, #656] +sqrdmulh v11.4S, v8.4S, v5.s[1] +mul v8.4S, v8.4S,v12.s[1] +mla v8.4S, v11.4S, v31.s[0] +sub v11.4s, v9.4s, v8.4s +add v9.4s, v9.4s, v8.4s +str q9, [x0, #672] +str q11, [x0, #688] +sqrdmulh v11.4S, v18.4S, v5.s[3] +mul v18.4S, v18.4S,v12.s[3] +mla v18.4S, v11.4S, v31.s[0] +sub v11.4s, v2.4s, v18.4s +add v2.4s, v2.4s, v18.4s +str q2, [x0, #736] +str q11, [x0, #752] +sqrdmulh v11.4S, v20.4S, v5.s[2] +mul v20.4S, v20.4S,v12.s[2] +mla v20.4S, v11.4S, v31.s[0] 
+sub v11.4s, v0.4s, v20.4s +add v0.4s, v0.4s, v20.4s +str q0, [x0, #704] +str q11, [x0, #720] +ldr q10, [x0, #880] +ldr q4, [x0, #864] +ldr q14, [x0, #832] +ldr q7, [x0, #848] +ldr q11, [x0, #816] +ldr q0, [x0, #800] +ldr q20, [x0, #768] +ldr q2, [x0, #784] +sqrdmulh v18.4S, v10.4S, v16.s[0] +mul v10.4S, v10.4S,v19.s[0] +mla v10.4S, v18.4S, v31.s[0] +sub v18.4s, v11.4s, v10.4s +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v4.4S, v16.s[0] +mul v4.4S, v4.4S,v19.s[0] +mla v4.4S, v10.4S, v31.s[0] +sub v10.4s, v0.4s, v4.4s +add v0.4s, v0.4s, v4.4s +sqrdmulh v4.4S, v14.4S, v16.s[0] +mul v14.4S, v14.4S,v19.s[0] +mla v14.4S, v4.4S, v31.s[0] +sub v4.4s, v20.4s, v14.4s +add v20.4s, v20.4s, v14.4s +sqrdmulh v14.4S, v7.4S, v16.s[0] +mul v7.4S, v7.4S,v19.s[0] +mla v7.4S, v14.4S, v31.s[0] +sub v14.4s, v2.4s, v7.4s +add v2.4s, v2.4s, v7.4s +sqrdmulh v7.4S, v11.4S, v16.s[1] +mul v11.4S, v11.4S,v19.s[1] +mla v11.4S, v7.4S, v31.s[0] +sub v7.4s, v2.4s, v11.4s +add v2.4s, v2.4s, v11.4s +ldr q11, [x17, #+512] +ldr q9, [x17, #+528] +ldr q8, [x17, #+544] +ldr q17, [x17, #+560] +sqrdmulh v6.4S, v0.4S, v16.s[1] +mul v0.4S, v0.4S,v19.s[1] +mla v0.4S, v6.4S, v31.s[0] +sub v6.4s, v20.4s, v0.4s +add v20.4s, v20.4s, v0.4s +sqrdmulh v0.4S, v18.4S, v16.s[2] +mul v18.4S, v18.4S,v19.s[2] +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v14.4s, v18.4s +add v14.4s, v14.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v16.s[2] +mul v10.4S, v10.4S,v19.s[2] +mla v10.4S, v18.4S, v31.s[0] +sub v18.4s, v4.4s, v10.4s +add v4.4s, v4.4s, v10.4s +sqrdmulh v10.4S, v2.4S, v1.s[0] +mul v2.4S, v2.4S,v13.s[0] +mla v2.4S, v10.4S, v31.s[0] +sub v10.4s, v20.4s, v2.4s +add v20.4s, v20.4s, v2.4s +str q20, [x0, #768] +str q10, [x0, #784] +sqrdmulh v10.4S, v7.4S, v1.s[1] +mul v7.4S, v7.4S,v13.s[1] +mla v7.4S, v10.4S, v31.s[0] +sub v10.4s, v6.4s, v7.4s +add v6.4s, v6.4s, v7.4s +str q6, [x0, #800] +str q10, [x0, #816] +sqrdmulh v10.4S, v0.4S, v1.s[3] +mul v0.4S, v0.4S,v13.s[3] +mla v0.4S, v10.4S, v31.s[0] +sub v10.4s, v18.4s, v0.4s +add 
v18.4s, v18.4s, v0.4s +str q18, [x0, #864] +str q10, [x0, #880] +sqrdmulh v10.4S, v14.4S, v1.s[2] +mul v14.4S, v14.4S,v13.s[2] +mla v14.4S, v10.4S, v31.s[0] +sub v10.4s, v4.4s, v14.4s +add v4.4s, v4.4s, v14.4s +str q4, [x0, #832] +str q10, [x0, #848] +ldr q5, [x0, #1008] +ldr q12, [x0, #992] +ldr q15, [x0, #960] +ldr q3, [x0, #976] +ldr q10, [x0, #944] +ldr q4, [x0, #928] +ldr q14, [x0, #896] +ldr q18, [x0, #912] +sqrdmulh v0.4S, v5.4S, v9.s[0] +mul v5.4S, v5.4S,v11.s[0] +mla v5.4S, v0.4S, v31.s[0] +sub v0.4s, v10.4s, v5.4s +add v10.4s, v10.4s, v5.4s +sqrdmulh v5.4S, v12.4S, v9.s[0] +mul v12.4S, v12.4S,v11.s[0] +mla v12.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v12.4s +add v4.4s, v4.4s, v12.4s +sqrdmulh v12.4S, v15.4S, v9.s[0] +mul v15.4S, v15.4S,v11.s[0] +mla v15.4S, v12.4S, v31.s[0] +sub v12.4s, v14.4s, v15.4s +add v14.4s, v14.4s, v15.4s +sqrdmulh v15.4S, v3.4S, v9.s[0] +mul v3.4S, v3.4S,v11.s[0] +mla v3.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v9.s[1] +mul v10.4S, v10.4S,v11.s[1] +mla v10.4S, v3.4S, v31.s[0] +sub v3.4s, v18.4s, v10.4s +add v18.4s, v18.4s, v10.4s +sqrdmulh v10.4S, v4.4S, v9.s[1] +mul v4.4S, v4.4S,v11.s[1] +mla v4.4S, v10.4S, v31.s[0] +sub v10.4s, v14.4s, v4.4s +add v14.4s, v14.4s, v4.4s +sqrdmulh v4.4S, v0.4S, v9.s[2] +mul v0.4S, v0.4S,v11.s[2] +mla v0.4S, v4.4S, v31.s[0] +sub v4.4s, v15.4s, v0.4s +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v5.4S, v9.s[2] +mul v5.4S, v5.4S,v11.s[2] +mla v5.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v5.4s +add v12.4s, v12.4s, v5.4s +sqrdmulh v5.4S, v18.4S, v17.s[0] +mul v18.4S, v18.4S,v8.s[0] +mla v18.4S, v5.4S, v31.s[0] +sub v5.4s, v14.4s, v18.4s +add v14.4s, v14.4s, v18.4s +str q14, [x0, #896] +str q5, [x0, #912] +sqrdmulh v5.4S, v3.4S, v17.s[1] +mul v3.4S, v3.4S,v8.s[1] +mla v3.4S, v5.4S, v31.s[0] +sub v5.4s, v10.4s, v3.4s +add v10.4s, v10.4s, v3.4s +str q10, [x0, #928] +str q5, [x0, #944] +sqrdmulh v5.4S, v4.4S, v17.s[3] +mul v4.4S, v4.4S,v8.s[3] +mla v4.4S, 
v5.4S, v31.s[0] +sub v5.4s, v0.4s, v4.4s +add v0.4s, v0.4s, v4.4s +str q0, [x0, #992] +str q5, [x0, #1008] +sqrdmulh v5.4S, v15.4S, v17.s[2] +mul v15.4S, v15.4S,v8.s[2] +mla v15.4S, v5.4S, v31.s[0] +sub v5.4s, v12.4s, v15.4s +add v12.4s, v12.4s, v15.4s +str q12, [x0, #960] +str q5, [x0, #976] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1444 +// Instruction count: 1440 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_3_3_1.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_3_3_1.s new file mode 100644 index 0000000..bef74a7 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_3_3_1.s @@ -0,0 +1,1474 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: // One 16-byte row: word 0 holds -33556993, words 1..3 are zero padding. The routine below loads the whole row into q31 and only references lane v31.s[0]. +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 // NOTE(review): power-of-two alignment on AArch64 GNU as, i.e. 64 bytes; confirm for other assemblers. +roots_merged: // Table of 4-word (16-byte) rows. Rows come in pairs carrying the same layer/block tags: first the root values, then their companion constants. The routine below uses rows of this table as by-lane operands, the first row of a pair for mul and the second for sqrdmulh. NOTE(review): companions look like round(root * 2^31 / 33556993); confirm against the generator. +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 23825509 // Layer 4, block 0 +.word 27028662 // Layer 4, block 1 +.word 0 // Layer None, block None +.word 1307297022 // Layer 3, block 0 +.word 1524716204 // Layer 4, block 0 +.word 1729702351 // Layer 4, block 1 +.word 0 // Layer None, block None +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 14626653 // Layer 3, block 1 +.word 14833295 // Layer 4, block 2 +.word 2138810 // Layer 4, block 3 +.word 0 // Layer None, block None +.word 936034350 // Layer 3, block 1 +.word 949258429 // Layer 4, block 2 +.word 136873393 // Layer 4, block 3 +.word 0 // Layer None, block None +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 5705868 
// Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 29737761 // Layer 3, block 2 +.word 6490403 // Layer 4, block 4 +.word 19648405 // Layer 4, block 5 +.word 0 // Layer None, block None +.word 1903071454 // Layer 3, block 2 +.word 415354091 // Layer 4, block 4 +.word 1257401950 // Layer 4, block 5 +.word 0 // Layer None, block None +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 30285189 // Layer 3, block 3 +.word 31254932 // Layer 4, block 6 +.word 26362414 // Layer 4, block 7 +.word 0 // Layer None, block None +.word 1938104173 // Layer 3, block 3 +.word 2000162988 // Layer 4, block 6 +.word 1687065733 // Layer 4, block 7 +.word 0 // Layer None, block None +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 21289485 // Layer 3, block 4 +.word 572895 // Layer 4, block 8 +.word 26691971 // Layer 4, block 9 +.word 0 // Layer None, block None +.word 1362423055 // Layer 3, block 4 +.word 36662482 // Layer 4, block 8 +.word 1708155771 // Layer 4, block 9 +.word 0 // Layer None, block None +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 9914896 // Layer 
3, block 5 +.word 9249292 // Layer 4, block 10 +.word 29292862 // Layer 4, block 11 +.word 0 // Layer None, block None +.word 634504916 // Layer 3, block 5 +.word 591909511 // Layer 4, block 10 +.word 1874600091 // Layer 4, block 11 +.word 0 // Layer None, block None +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 22603682 // Layer 3, block 6 +.word 8247799 // Layer 4, block 12 +.word 5086187 // Layer 4, block 13 +.word 0 // Layer None, block None +.word 1446525244 // Layer 3, block 6 +.word 527818851 // Layer 4, block 12 +.word 325491125 // Layer 4, block 13 +.word 0 // Layer None, block None +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 16204162 // Layer 3, block 7 +.word 28113639 // Layer 4, block 14 +.word 8471290 // Layer 4, block 15 +.word 0 // Layer None, block None +.word 1036987221 // Layer 3, block 7 +.word 1799135579 // Layer 4, block 14 +.word 542121183 // Layer 4, block 15 +.word 0 // Layer None, block None +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.text +.global ntt_u32_incomplete_neon_asm_var_3_3_1 +.global _ntt_u32_incomplete_neon_asm_var_3_3_1 +ntt_u32_incomplete_neon_asm_var_3_3_1: +_ntt_u32_incomplete_neon_asm_var_3_3_1: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] 
+stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x0, #960] +ldr q25, [x0, #832] +sqrdmulh v24.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +ldr q23, [x0, #576] +ldr q22, [x0, #704] +sqrdmulh v21.4S, v25.4S, v29.s[0] +mla v26.4S, v24.4S, v31.s[0] +mul v25.4S, v25.4S,v30.s[0] +ldr q24, [x0, #448] +ldr q20, [x0, #320] +sqrdmulh v19.4S, v23.4S, v29.s[0] +sub v18.4s, v24.4s, v26.4s +mla v25.4S, v21.4S, v31.s[0] +mul v23.4S, v23.4S,v30.s[0] +add v24.4s, v24.4s, v26.4s +ldr q26, [x0, #64] +ldr q21, [x0, #192] +sqrdmulh v17.4S, v22.4S, v29.s[0] +sub v16.4s, v20.4s, v25.4s +mla v23.4S, v19.4S, v31.s[0] +mul v22.4S, v22.4S,v30.s[0] +add v20.4s, v20.4s, v25.4s +sqrdmulh v25.4S, v24.4S, v29.s[1] +sub v19.4s, v26.4s, v23.4s +mla v22.4S, v17.4S, v31.s[0] +mul v24.4S, v24.4S,v30.s[1] +add v26.4s, v26.4s, v23.4s +sqrdmulh v23.4S, v20.4S, v29.s[1] +sub v17.4s, v21.4s, v22.4s +mla v24.4S, v25.4S, v31.s[0] +mul v20.4S, v20.4S,v30.s[1] +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v18.4S, v29.s[2] +sub v25.4s, v21.4s, v24.4s +mla v20.4S, v23.4S, v31.s[0] +mul v18.4S, v18.4S,v30.s[2] +add v21.4s, v21.4s, v24.4s +sqrdmulh v24.4S, v16.4S, v29.s[2] +sub v23.4s, v26.4s, v20.4s +mla v18.4S, v22.4S, v31.s[0] +mul v16.4S, v16.4S,v30.s[2] +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v21.4S, v27.s[0] +sub v22.4s, v17.4s, v18.4s +mla v16.4S, v24.4S, v31.s[0] +mul v21.4S, v21.4S,v28.s[0] +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v25.4S, v27.s[1] +sub v24.4s, v19.4s, v16.4s +mla v21.4S, v20.4S, v31.s[0] +mul v25.4S, v25.4S,v28.s[1] +add v19.4s, v19.4s, v16.4s 
+sqrdmulh v16.4S, v22.4S, v27.s[3] +sub v20.4s, v26.4s, v21.4s +mla v25.4S, v18.4S, v31.s[0] +mul v22.4S, v22.4S,v28.s[3] +add v26.4s, v26.4s, v21.4s +str q26, [x0, #64] +str q20, [x0, #192] +sqrdmulh v20.4S, v17.4S, v27.s[2] +sub v26.4s, v23.4s, v25.4s +mla v22.4S, v16.4S, v31.s[0] +mul v17.4S, v17.4S,v28.s[2] +add v23.4s, v23.4s, v25.4s +str q23, [x0, #320] +str q26, [x0, #448] +ldr q26, [x0, #976] +ldr q23, [x0, #848] +sqrdmulh v25.4S, v26.4S, v29.s[0] +sub v16.4s, v24.4s, v22.4s +mla v17.4S, v20.4S, v31.s[0] +mul v26.4S, v26.4S,v30.s[0] +add v24.4s, v24.4s, v22.4s +str q24, [x0, #832] +str q16, [x0, #960] +ldr q16, [x0, #592] +ldr q24, [x0, #720] +sqrdmulh v22.4S, v23.4S, v29.s[0] +sub v20.4s, v19.4s, v17.4s +mla v26.4S, v25.4S, v31.s[0] +mul v23.4S, v23.4S,v30.s[0] +add v19.4s, v19.4s, v17.4s +str q19, [x0, #576] +str q20, [x0, #704] +ldr q20, [x0, #464] +ldr q19, [x0, #336] +sqrdmulh v17.4S, v16.4S, v29.s[0] +sub v25.4s, v20.4s, v26.4s +mla v23.4S, v22.4S, v31.s[0] +mul v16.4S, v16.4S,v30.s[0] +add v20.4s, v20.4s, v26.4s +ldr q26, [x0, #80] +ldr q22, [x0, #208] +sqrdmulh v21.4S, v24.4S, v29.s[0] +sub v18.4s, v19.4s, v23.4s +mla v16.4S, v17.4S, v31.s[0] +mul v24.4S, v24.4S,v30.s[0] +add v19.4s, v19.4s, v23.4s +sqrdmulh v23.4S, v20.4S, v29.s[1] +sub v17.4s, v26.4s, v16.4s +mla v24.4S, v21.4S, v31.s[0] +mul v20.4S, v20.4S,v30.s[1] +add v26.4s, v26.4s, v16.4s +sqrdmulh v16.4S, v19.4S, v29.s[1] +sub v21.4s, v22.4s, v24.4s +mla v20.4S, v23.4S, v31.s[0] +mul v19.4S, v19.4S,v30.s[1] +add v22.4s, v22.4s, v24.4s +sqrdmulh v24.4S, v25.4S, v29.s[2] +sub v23.4s, v22.4s, v20.4s +mla v19.4S, v16.4S, v31.s[0] +mul v25.4S, v25.4S,v30.s[2] +add v22.4s, v22.4s, v20.4s +sqrdmulh v20.4S, v18.4S, v29.s[2] +sub v16.4s, v26.4s, v19.4s +mla v25.4S, v24.4S, v31.s[0] +mul v18.4S, v18.4S,v30.s[2] +add v26.4s, v26.4s, v19.4s +sqrdmulh v19.4S, v22.4S, v27.s[0] +sub v24.4s, v21.4s, v25.4s +mla v18.4S, v20.4S, v31.s[0] +mul v22.4S, v22.4S,v28.s[0] +add v21.4s, v21.4s, v25.4s +sqrdmulh 
v25.4S, v23.4S, v27.s[1] +sub v20.4s, v17.4s, v18.4s +mla v22.4S, v19.4S, v31.s[0] +mul v23.4S, v23.4S,v28.s[1] +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v24.4S, v27.s[3] +sub v19.4s, v26.4s, v22.4s +mla v23.4S, v25.4S, v31.s[0] +mul v24.4S, v24.4S,v28.s[3] +add v26.4s, v26.4s, v22.4s +str q26, [x0, #80] +str q19, [x0, #208] +sqrdmulh v19.4S, v21.4S, v27.s[2] +sub v26.4s, v16.4s, v23.4s +mla v24.4S, v18.4S, v31.s[0] +mul v21.4S, v21.4S,v28.s[2] +add v16.4s, v16.4s, v23.4s +str q16, [x0, #336] +str q26, [x0, #464] +ldr q26, [x0, #992] +ldr q16, [x0, #864] +sqrdmulh v23.4S, v26.4S, v29.s[0] +sub v18.4s, v20.4s, v24.4s +mla v21.4S, v19.4S, v31.s[0] +mul v26.4S, v26.4S,v30.s[0] +add v20.4s, v20.4s, v24.4s +str q20, [x0, #848] +str q18, [x0, #976] +ldr q18, [x0, #608] +ldr q20, [x0, #736] +sqrdmulh v24.4S, v16.4S, v29.s[0] +sub v19.4s, v17.4s, v21.4s +mla v26.4S, v23.4S, v31.s[0] +mul v16.4S, v16.4S,v30.s[0] +add v17.4s, v17.4s, v21.4s +str q17, [x0, #592] +str q19, [x0, #720] +ldr q19, [x0, #480] +ldr q17, [x0, #352] +sqrdmulh v21.4S, v18.4S, v29.s[0] +sub v23.4s, v19.4s, v26.4s +mla v16.4S, v24.4S, v31.s[0] +mul v18.4S, v18.4S,v30.s[0] +add v19.4s, v19.4s, v26.4s +ldr q26, [x0, #96] +ldr q24, [x0, #224] +sqrdmulh v22.4S, v20.4S, v29.s[0] +sub v25.4s, v17.4s, v16.4s +mla v18.4S, v21.4S, v31.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v19.4S, v29.s[1] +sub v21.4s, v26.4s, v18.4s +mla v20.4S, v22.4S, v31.s[0] +mul v19.4S, v19.4S,v30.s[1] +add v26.4s, v26.4s, v18.4s +sqrdmulh v18.4S, v17.4S, v29.s[1] +sub v22.4s, v24.4s, v20.4s +mla v19.4S, v16.4S, v31.s[0] +mul v17.4S, v17.4S,v30.s[1] +add v24.4s, v24.4s, v20.4s +sqrdmulh v20.4S, v23.4S, v29.s[2] +sub v16.4s, v24.4s, v19.4s +mla v17.4S, v18.4S, v31.s[0] +mul v23.4S, v23.4S,v30.s[2] +add v24.4s, v24.4s, v19.4s +sqrdmulh v19.4S, v25.4S, v29.s[2] +sub v18.4s, v26.4s, v17.4s +mla v23.4S, v20.4S, v31.s[0] +mul v25.4S, v25.4S,v30.s[2] +add v26.4s, v26.4s, v17.4s +sqrdmulh v17.4S, v24.4S, 
v27.s[0] +sub v20.4s, v22.4s, v23.4s +mla v25.4S, v19.4S, v31.s[0] +mul v24.4S, v24.4S,v28.s[0] +add v22.4s, v22.4s, v23.4s +sqrdmulh v23.4S, v16.4S, v27.s[1] +sub v19.4s, v21.4s, v25.4s +mla v24.4S, v17.4S, v31.s[0] +mul v16.4S, v16.4S,v28.s[1] +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v20.4S, v27.s[3] +sub v17.4s, v26.4s, v24.4s +mla v16.4S, v23.4S, v31.s[0] +mul v20.4S, v20.4S,v28.s[3] +add v26.4s, v26.4s, v24.4s +str q26, [x0, #96] +str q17, [x0, #224] +sqrdmulh v17.4S, v22.4S, v27.s[2] +sub v26.4s, v18.4s, v16.4s +mla v20.4S, v25.4S, v31.s[0] +mul v22.4S, v22.4S,v28.s[2] +add v18.4s, v18.4s, v16.4s +str q18, [x0, #352] +str q26, [x0, #480] +ldr q26, [x0, #1008] +ldr q18, [x0, #880] +sqrdmulh v16.4S, v26.4S, v29.s[0] +sub v25.4s, v19.4s, v20.4s +mla v22.4S, v17.4S, v31.s[0] +mul v26.4S, v26.4S,v30.s[0] +add v19.4s, v19.4s, v20.4s +str q19, [x0, #864] +str q25, [x0, #992] +ldr q25, [x0, #624] +ldr q19, [x0, #752] +sqrdmulh v20.4S, v18.4S, v29.s[0] +sub v17.4s, v21.4s, v22.4s +mla v26.4S, v16.4S, v31.s[0] +mul v18.4S, v18.4S,v30.s[0] +add v21.4s, v21.4s, v22.4s +str q21, [x0, #608] +str q17, [x0, #736] +ldr q17, [x0, #496] +ldr q21, [x0, #368] +sqrdmulh v22.4S, v25.4S, v29.s[0] +sub v16.4s, v17.4s, v26.4s +mla v18.4S, v20.4S, v31.s[0] +mul v25.4S, v25.4S,v30.s[0] +add v17.4s, v17.4s, v26.4s +ldr q26, [x0, #112] +ldr q20, [x0, #240] +sqrdmulh v24.4S, v19.4S, v29.s[0] +sub v23.4s, v21.4s, v18.4s +mla v25.4S, v22.4S, v31.s[0] +mul v19.4S, v19.4S,v30.s[0] +add v21.4s, v21.4s, v18.4s +sqrdmulh v18.4S, v17.4S, v29.s[1] +sub v22.4s, v26.4s, v25.4s +mla v19.4S, v24.4S, v31.s[0] +mul v17.4S, v17.4S,v30.s[1] +add v26.4s, v26.4s, v25.4s +sqrdmulh v25.4S, v21.4S, v29.s[1] +sub v24.4s, v20.4s, v19.4s +mla v17.4S, v18.4S, v31.s[0] +mul v21.4S, v21.4S,v30.s[1] +add v20.4s, v20.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v29.s[2] +sub v18.4s, v20.4s, v17.4s +mla v21.4S, v25.4S, v31.s[0] +mul v16.4S, v16.4S,v30.s[2] +add v20.4s, v20.4s, v17.4s +sqrdmulh v17.4S, v23.4S, v29.s[2] +sub 
v25.4s, v26.4s, v21.4s +mla v16.4S, v19.4S, v31.s[0] +mul v23.4S, v23.4S,v30.s[2] +add v26.4s, v26.4s, v21.4s +sqrdmulh v21.4S, v20.4S, v27.s[0] +sub v19.4s, v24.4s, v16.4s +mla v23.4S, v17.4S, v31.s[0] +mul v20.4S, v20.4S,v28.s[0] +add v24.4s, v24.4s, v16.4s +sqrdmulh v16.4S, v18.4S, v27.s[1] +sub v17.4s, v22.4s, v23.4s +mla v20.4S, v21.4S, v31.s[0] +mul v18.4S, v18.4S,v28.s[1] +add v22.4s, v22.4s, v23.4s +sqrdmulh v23.4S, v19.4S, v27.s[3] +sub v21.4s, v26.4s, v20.4s +mla v18.4S, v16.4S, v31.s[0] +mul v19.4S, v19.4S,v28.s[3] +add v26.4s, v26.4s, v20.4s +str q26, [x0, #112] +str q21, [x0, #240] +sqrdmulh v21.4S, v24.4S, v27.s[2] +sub v26.4s, v25.4s, v18.4s +mla v19.4S, v23.4S, v31.s[0] +mul v24.4S, v24.4S,v28.s[2] +add v25.4s, v25.4s, v18.4s +str q25, [x0, #368] +str q26, [x0, #496] +ldr q26, [x0, #896] +ldr q25, [x0, #768] +sqrdmulh v18.4S, v26.4S, v29.s[0] +sub v23.4s, v17.4s, v19.4s +mla v24.4S, v21.4S, v31.s[0] +mul v26.4S, v26.4S,v30.s[0] +add v17.4s, v17.4s, v19.4s +str q17, [x0, #880] +str q23, [x0, #1008] +ldr q23, [x0, #512] +ldr q17, [x0, #640] +sqrdmulh v19.4S, v25.4S, v29.s[0] +sub v21.4s, v22.4s, v24.4s +mla v26.4S, v18.4S, v31.s[0] +mul v25.4S, v25.4S,v30.s[0] +add v22.4s, v22.4s, v24.4s +str q22, [x0, #624] +str q21, [x0, #752] +ldr q21, [x0, #384] +ldr q22, [x0, #256] +sqrdmulh v24.4S, v23.4S, v29.s[0] +sub v18.4s, v21.4s, v26.4s +mla v25.4S, v19.4S, v31.s[0] +mul v23.4S, v23.4S,v30.s[0] +add v21.4s, v21.4s, v26.4s +ldr q26, [x0, #0] +ldr q19, [x0, #128] +sqrdmulh v20.4S, v17.4S, v29.s[0] +sub v16.4s, v22.4s, v25.4s +mla v23.4S, v24.4S, v31.s[0] +mul v17.4S, v17.4S,v30.s[0] +add v22.4s, v22.4s, v25.4s +sqrdmulh v25.4S, v21.4S, v29.s[1] +sub v24.4s, v26.4s, v23.4s +mla v17.4S, v20.4S, v31.s[0] +mul v21.4S, v21.4S,v30.s[1] +add v26.4s, v26.4s, v23.4s +sqrdmulh v23.4S, v22.4S, v29.s[1] +sub v20.4s, v19.4s, v17.4s +mla v21.4S, v25.4S, v31.s[0] +mul v22.4S, v22.4S,v30.s[1] +add v19.4s, v19.4s, v17.4s +sqrdmulh v17.4S, v18.4S, v29.s[2] +sub v25.4s, 
v19.4s, v21.4s +mla v22.4S, v23.4S, v31.s[0] +mul v18.4S, v18.4S,v30.s[2] +add v19.4s, v19.4s, v21.4s +sqrdmulh v21.4S, v16.4S, v29.s[2] +sub v23.4s, v26.4s, v22.4s +mla v18.4S, v17.4S, v31.s[0] +mul v16.4S, v16.4S,v30.s[2] +add v26.4s, v26.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v27.s[0] +sub v17.4s, v20.4s, v18.4s +mla v16.4S, v21.4S, v31.s[0] +mul v19.4S, v19.4S,v28.s[0] +add v20.4s, v20.4s, v18.4s +sqrdmulh v18.4S, v25.4S, v27.s[1] +sub v21.4s, v24.4s, v16.4s +mla v19.4S, v22.4S, v31.s[0] +mul v25.4S, v25.4S,v28.s[1] +add v24.4s, v24.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v27.s[3] +sub v22.4s, v26.4s, v19.4s +mla v25.4S, v18.4S, v31.s[0] +mul v17.4S, v17.4S,v28.s[3] +add v26.4s, v26.4s, v19.4s +str q26, [x0, #0] +str q22, [x0, #128] +sqrdmulh v22.4S, v20.4S, v27.s[2] +sub v26.4s, v23.4s, v25.4s +mla v17.4S, v16.4S, v31.s[0] +mul v20.4S, v20.4S,v28.s[2] +add v23.4s, v23.4s, v25.4s +str q23, [x0, #256] +str q26, [x0, #384] +ldr q26, [x0, #912] +ldr q23, [x0, #784] +sqrdmulh v25.4S, v26.4S, v29.s[0] +sub v16.4s, v21.4s, v17.4s +mla v20.4S, v22.4S, v31.s[0] +mul v26.4S, v26.4S,v30.s[0] +add v21.4s, v21.4s, v17.4s +str q21, [x0, #768] +str q16, [x0, #896] +ldr q16, [x0, #528] +ldr q21, [x0, #656] +sqrdmulh v17.4S, v23.4S, v29.s[0] +sub v22.4s, v24.4s, v20.4s +mla v26.4S, v25.4S, v31.s[0] +mul v23.4S, v23.4S,v30.s[0] +add v24.4s, v24.4s, v20.4s +str q24, [x0, #512] +str q22, [x0, #640] +ldr q22, [x0, #400] +ldr q24, [x0, #272] +sqrdmulh v20.4S, v16.4S, v29.s[0] +sub v25.4s, v22.4s, v26.4s +mla v23.4S, v17.4S, v31.s[0] +mul v16.4S, v16.4S,v30.s[0] +add v22.4s, v22.4s, v26.4s +ldr q26, [x0, #16] +ldr q17, [x0, #144] +sqrdmulh v19.4S, v21.4S, v29.s[0] +sub v18.4s, v24.4s, v23.4s +mla v16.4S, v20.4S, v31.s[0] +mul v21.4S, v21.4S,v30.s[0] +add v24.4s, v24.4s, v23.4s +sqrdmulh v23.4S, v22.4S, v29.s[1] +sub v20.4s, v26.4s, v16.4s +mla v21.4S, v19.4S, v31.s[0] +mul v22.4S, v22.4S,v30.s[1] +add v26.4s, v26.4s, v16.4s +sqrdmulh v16.4S, v24.4S, v29.s[1] +sub v19.4s, v17.4s, v21.4s 
+mla v22.4S, v23.4S, v31.s[0] +mul v24.4S, v24.4S,v30.s[1] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v25.4S, v29.s[2] +sub v23.4s, v17.4s, v22.4s +mla v24.4S, v16.4S, v31.s[0] +mul v25.4S, v25.4S,v30.s[2] +add v17.4s, v17.4s, v22.4s +sqrdmulh v22.4S, v18.4S, v29.s[2] +sub v16.4s, v26.4s, v24.4s +mla v25.4S, v21.4S, v31.s[0] +mul v18.4S, v18.4S,v30.s[2] +add v26.4s, v26.4s, v24.4s +sqrdmulh v24.4S, v17.4S, v27.s[0] +sub v21.4s, v19.4s, v25.4s +mla v18.4S, v22.4S, v31.s[0] +mul v17.4S, v17.4S,v28.s[0] +add v19.4s, v19.4s, v25.4s +sqrdmulh v25.4S, v23.4S, v27.s[1] +sub v22.4s, v20.4s, v18.4s +mla v17.4S, v24.4S, v31.s[0] +mul v23.4S, v23.4S,v28.s[1] +add v20.4s, v20.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v27.s[3] +sub v24.4s, v26.4s, v17.4s +mla v23.4S, v25.4S, v31.s[0] +mul v21.4S, v21.4S,v28.s[3] +add v26.4s, v26.4s, v17.4s +str q26, [x0, #16] +str q24, [x0, #144] +sqrdmulh v24.4S, v19.4S, v27.s[2] +sub v26.4s, v16.4s, v23.4s +mla v21.4S, v18.4S, v31.s[0] +mul v19.4S, v19.4S,v28.s[2] +add v16.4s, v16.4s, v23.4s +str q16, [x0, #272] +str q26, [x0, #400] +ldr q26, [x0, #928] +ldr q16, [x0, #800] +sqrdmulh v23.4S, v26.4S, v29.s[0] +sub v18.4s, v22.4s, v21.4s +mla v19.4S, v24.4S, v31.s[0] +mul v26.4S, v26.4S,v30.s[0] +add v22.4s, v22.4s, v21.4s +str q22, [x0, #784] +str q18, [x0, #912] +ldr q18, [x0, #544] +ldr q22, [x0, #672] +sqrdmulh v21.4S, v16.4S, v29.s[0] +sub v24.4s, v20.4s, v19.4s +mla v26.4S, v23.4S, v31.s[0] +mul v16.4S, v16.4S,v30.s[0] +add v20.4s, v20.4s, v19.4s +str q20, [x0, #528] +str q24, [x0, #656] +ldr q24, [x0, #416] +ldr q20, [x0, #288] +sqrdmulh v19.4S, v18.4S, v29.s[0] +sub v23.4s, v24.4s, v26.4s +mla v16.4S, v21.4S, v31.s[0] +mul v18.4S, v18.4S,v30.s[0] +add v24.4s, v24.4s, v26.4s +ldr q26, [x0, #32] +ldr q21, [x0, #160] +sqrdmulh v17.4S, v22.4S, v29.s[0] +sub v25.4s, v20.4s, v16.4s +mla v18.4S, v19.4S, v31.s[0] +mul v22.4S, v22.4S,v30.s[0] +add v20.4s, v20.4s, v16.4s +sqrdmulh v16.4S, v24.4S, v29.s[1] +sub v19.4s, v26.4s, v18.4s +mla v22.4S, 
v17.4S, v31.s[0] +mul v24.4S, v24.4S,v30.s[1] +add v26.4s, v26.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[1] +sub v17.4s, v21.4s, v22.4s +mla v24.4S, v16.4S, v31.s[0] +mul v20.4S, v20.4S,v30.s[1] +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v23.4S, v29.s[2] +sub v16.4s, v21.4s, v24.4s +mla v20.4S, v18.4S, v31.s[0] +mul v23.4S, v23.4S,v30.s[2] +add v21.4s, v21.4s, v24.4s +sqrdmulh v24.4S, v25.4S, v29.s[2] +sub v18.4s, v26.4s, v20.4s +mla v23.4S, v22.4S, v31.s[0] +mul v25.4S, v25.4S,v30.s[2] +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v21.4S, v27.s[0] +sub v22.4s, v17.4s, v23.4s +mla v25.4S, v24.4S, v31.s[0] +mul v21.4S, v21.4S,v28.s[0] +add v17.4s, v17.4s, v23.4s +sqrdmulh v23.4S, v16.4S, v27.s[1] +sub v24.4s, v19.4s, v25.4s +mla v21.4S, v20.4S, v31.s[0] +mul v16.4S, v16.4S,v28.s[1] +add v19.4s, v19.4s, v25.4s +sqrdmulh v25.4S, v22.4S, v27.s[3] +sub v20.4s, v26.4s, v21.4s +mla v16.4S, v23.4S, v31.s[0] +mul v22.4S, v22.4S,v28.s[3] +add v26.4s, v26.4s, v21.4s +str q26, [x0, #32] +str q20, [x0, #160] +sqrdmulh v20.4S, v17.4S, v27.s[2] +sub v26.4s, v18.4s, v16.4s +mla v22.4S, v25.4S, v31.s[0] +mul v17.4S, v17.4S,v28.s[2] +add v18.4s, v18.4s, v16.4s +str q18, [x0, #288] +str q26, [x0, #416] +ldr q26, [x0, #944] +ldr q18, [x0, #816] +sqrdmulh v16.4S, v26.4S, v29.s[0] +sub v25.4s, v24.4s, v22.4s +mla v17.4S, v20.4S, v31.s[0] +mul v26.4S, v26.4S,v30.s[0] +add v24.4s, v24.4s, v22.4s +str q24, [x0, #800] +str q25, [x0, #928] +ldr q25, [x0, #560] +ldr q24, [x0, #688] +sqrdmulh v22.4S, v18.4S, v29.s[0] +sub v20.4s, v19.4s, v17.4s +mla v26.4S, v16.4S, v31.s[0] +mul v18.4S, v18.4S,v30.s[0] +add v19.4s, v19.4s, v17.4s +str q19, [x0, #544] +str q20, [x0, #672] +ldr q20, [x0, #432] +ldr q19, [x0, #304] +sqrdmulh v17.4S, v25.4S, v29.s[0] +sub v16.4s, v20.4s, v26.4s +mla v18.4S, v22.4S, v31.s[0] +mul v25.4S, v25.4S,v30.s[0] +add v20.4s, v20.4s, v26.4s +ldr q26, [x0, #48] +ldr q22, [x0, #176] +sqrdmulh v21.4S, v24.4S, v29.s[0] +sub v23.4s, v19.4s, v18.4s +mla v25.4S, v17.4S, 
v31.s[0] +mul v24.4S, v24.4S,v30.s[0] +add v19.4s, v19.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[1] +sub v17.4s, v26.4s, v25.4s +mla v24.4S, v21.4S, v31.s[0] +mul v20.4S, v20.4S,v30.s[1] +add v26.4s, v26.4s, v25.4s +sqrdmulh v25.4S, v19.4S, v29.s[1] +sub v21.4s, v22.4s, v24.4s +mla v20.4S, v18.4S, v31.s[0] +mul v19.4S, v19.4S,v30.s[1] +add v22.4s, v22.4s, v24.4s +sqrdmulh v24.4S, v16.4S, v29.s[2] +sub v18.4s, v22.4s, v20.4s +mla v19.4S, v25.4S, v31.s[0] +mul v16.4S, v16.4S,v30.s[2] +add v22.4s, v22.4s, v20.4s +sqrdmulh v20.4S, v23.4S, v29.s[2] +sub v25.4s, v26.4s, v19.4s +mla v16.4S, v24.4S, v31.s[0] +mul v23.4S, v23.4S,v30.s[2] +add v26.4s, v26.4s, v19.4s +sqrdmulh v19.4S, v22.4S, v27.s[0] +sub v24.4s, v21.4s, v16.4s +mla v23.4S, v20.4S, v31.s[0] +mul v22.4S, v22.4S,v28.s[0] +add v21.4s, v21.4s, v16.4s +sqrdmulh v16.4S, v18.4S, v27.s[1] +sub v20.4s, v17.4s, v23.4s +mla v22.4S, v19.4S, v31.s[0] +mul v18.4S, v18.4S,v28.s[1] +add v17.4s, v17.4s, v23.4s +sqrdmulh v23.4S, v24.4S, v27.s[3] +sub v19.4s, v26.4s, v22.4s +mla v18.4S, v16.4S, v31.s[0] +mul v24.4S, v24.4S,v28.s[3] +add v26.4s, v26.4s, v22.4s +str q26, [x0, #48] +str q19, [x0, #176] +sqrdmulh v19.4S, v21.4S, v27.s[2] +sub v26.4s, v25.4s, v18.4s +mla v24.4S, v23.4S, v31.s[0] +mul v21.4S, v21.4S,v28.s[2] +add v25.4s, v25.4s, v18.4s +str q25, [x0, #304] +str q26, [x0, #432] +sub v26.4s, v20.4s, v24.4s +mla v21.4S, v19.4S, v31.s[0] +add v20.4s, v20.4s, v24.4s +str q20, [x0, #816] +str q26, [x0, #944] +sub v26.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +str q17, [x0, #560] +str q26, [x0, #688] +ldr q4, [x17, #+64] +ldr q5, [x17, #+80] +ldr q6, [x17, #+96] +ldr q7, [x17, #+112] +ldr q8, [x0, #112] +ldr q9, [x0, #96] +sqrdmulh v10.4S, v8.4S, v5.s[0] +mul v8.4S, v8.4S,v4.s[0] +ldr q11, [x0, #64] +ldr q12, [x0, #80] +sqrdmulh v13.4S, v9.4S, v5.s[0] +mla v8.4S, v10.4S, v31.s[0] +mul v9.4S, v9.4S,v4.s[0] +ldr q10, [x0, #48] +ldr q14, [x0, #32] +sqrdmulh v15.4S, v11.4S, v5.s[0] +sub v0.4s, v10.4s, v8.4s +mla v9.4S, 
v13.4S, v31.s[0] +mul v11.4S, v11.4S,v4.s[0] +add v10.4s, v10.4s, v8.4s +ldr q8, [x0, #0] +ldr q13, [x0, #16] +sqrdmulh v1.4S, v12.4S, v5.s[0] +sub v2.4s, v14.4s, v9.4s +mla v11.4S, v15.4S, v31.s[0] +mul v12.4S, v12.4S,v4.s[0] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v5.s[1] +sub v15.4s, v8.4s, v11.4s +mla v12.4S, v1.4S, v31.s[0] +mul v10.4S, v10.4S,v4.s[1] +add v8.4s, v8.4s, v11.4s +sqrdmulh v11.4S, v14.4S, v5.s[1] +sub v1.4s, v13.4s, v12.4s +mla v10.4S, v9.4S, v31.s[0] +mul v14.4S, v14.4S,v4.s[1] +add v13.4s, v13.4s, v12.4s +sqrdmulh v12.4S, v0.4S, v5.s[2] +sub v9.4s, v13.4s, v10.4s +mla v14.4S, v11.4S, v31.s[0] +mul v0.4S, v0.4S,v4.s[2] +add v13.4s, v13.4s, v10.4s +ldr q10, [x17, #+128] +ldr q11, [x17, #+144] +ldr q3, [x17, #+160] +ldr q16, [x17, #+176] +sqrdmulh v22.4S, v2.4S, v5.s[2] +sub v23.4s, v8.4s, v14.4s +mla v0.4S, v12.4S, v31.s[0] +mul v2.4S, v2.4S,v4.s[2] +add v8.4s, v8.4s, v14.4s +sqrdmulh v14.4S, v13.4S, v7.s[0] +sub v12.4s, v1.4s, v0.4s +mla v2.4S, v22.4S, v31.s[0] +mul v13.4S, v13.4S,v6.s[0] +add v1.4s, v1.4s, v0.4s +sqrdmulh v0.4S, v9.4S, v7.s[1] +sub v22.4s, v15.4s, v2.4s +mla v13.4S, v14.4S, v31.s[0] +mul v9.4S, v9.4S,v6.s[1] +add v15.4s, v15.4s, v2.4s +sqrdmulh v2.4S, v12.4S, v7.s[3] +sub v14.4s, v8.4s, v13.4s +mla v9.4S, v0.4S, v31.s[0] +mul v12.4S, v12.4S,v6.s[3] +add v8.4s, v8.4s, v13.4s +str q8, [x0, #0] +str q14, [x0, #16] +sqrdmulh v14.4S, v1.4S, v7.s[2] +sub v8.4s, v23.4s, v9.4s +mla v12.4S, v2.4S, v31.s[0] +mul v1.4S, v1.4S,v6.s[2] +add v23.4s, v23.4s, v9.4s +str q23, [x0, #32] +str q8, [x0, #48] +ldr q8, [x0, #240] +ldr q23, [x0, #224] +sqrdmulh v9.4S, v8.4S, v11.s[0] +sub v2.4s, v22.4s, v12.4s +mla v1.4S, v14.4S, v31.s[0] +mul v8.4S, v8.4S,v10.s[0] +add v22.4s, v22.4s, v12.4s +str q22, [x0, #96] +str q2, [x0, #112] +ldr q2, [x0, #192] +ldr q22, [x0, #208] +sqrdmulh v12.4S, v23.4S, v11.s[0] +sub v14.4s, v15.4s, v1.4s +mla v8.4S, v9.4S, v31.s[0] +mul v23.4S, v23.4S,v10.s[0] +add v15.4s, v15.4s, v1.4s +str q15, [x0, #64] +str 
q14, [x0, #80] +ldr q7, [x0, #176] +ldr q6, [x0, #160] +sqrdmulh v5.4S, v2.4S, v11.s[0] +sub v4.4s, v7.4s, v8.4s +mla v23.4S, v12.4S, v31.s[0] +mul v2.4S, v2.4S,v10.s[0] +add v7.4s, v7.4s, v8.4s +ldr q8, [x0, #128] +ldr q12, [x0, #144] +sqrdmulh v14.4S, v22.4S, v11.s[0] +sub v15.4s, v6.4s, v23.4s +mla v2.4S, v5.4S, v31.s[0] +mul v22.4S, v22.4S,v10.s[0] +add v6.4s, v6.4s, v23.4s +sqrdmulh v23.4S, v7.4S, v11.s[1] +sub v5.4s, v8.4s, v2.4s +mla v22.4S, v14.4S, v31.s[0] +mul v7.4S, v7.4S,v10.s[1] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v6.4S, v11.s[1] +sub v14.4s, v12.4s, v22.4s +mla v7.4S, v23.4S, v31.s[0] +mul v6.4S, v6.4S,v10.s[1] +add v12.4s, v12.4s, v22.4s +sqrdmulh v22.4S, v4.4S, v11.s[2] +sub v23.4s, v12.4s, v7.4s +mla v6.4S, v2.4S, v31.s[0] +mul v4.4S, v4.4S,v10.s[2] +add v12.4s, v12.4s, v7.4s +ldr q7, [x17, #+192] +ldr q2, [x17, #+208] +ldr q1, [x17, #+224] +ldr q9, [x17, #+240] +sqrdmulh v13.4S, v15.4S, v11.s[2] +sub v0.4s, v8.4s, v6.4s +mla v4.4S, v22.4S, v31.s[0] +mul v15.4S, v15.4S,v10.s[2] +add v8.4s, v8.4s, v6.4s +sqrdmulh v6.4S, v12.4S, v16.s[0] +sub v22.4s, v14.4s, v4.4s +mla v15.4S, v13.4S, v31.s[0] +mul v12.4S, v12.4S,v3.s[0] +add v14.4s, v14.4s, v4.4s +sqrdmulh v4.4S, v23.4S, v16.s[1] +sub v13.4s, v5.4s, v15.4s +mla v12.4S, v6.4S, v31.s[0] +mul v23.4S, v23.4S,v3.s[1] +add v5.4s, v5.4s, v15.4s +sqrdmulh v15.4S, v22.4S, v16.s[3] +sub v6.4s, v8.4s, v12.4s +mla v23.4S, v4.4S, v31.s[0] +mul v22.4S, v22.4S,v3.s[3] +add v8.4s, v8.4s, v12.4s +str q8, [x0, #128] +str q6, [x0, #144] +sqrdmulh v6.4S, v14.4S, v16.s[2] +sub v8.4s, v0.4s, v23.4s +mla v22.4S, v15.4S, v31.s[0] +mul v14.4S, v14.4S,v3.s[2] +add v0.4s, v0.4s, v23.4s +str q0, [x0, #160] +str q8, [x0, #176] +ldr q8, [x0, #368] +ldr q0, [x0, #352] +sqrdmulh v23.4S, v8.4S, v2.s[0] +sub v15.4s, v13.4s, v22.4s +mla v14.4S, v6.4S, v31.s[0] +mul v8.4S, v8.4S,v7.s[0] +add v13.4s, v13.4s, v22.4s +str q13, [x0, #224] +str q15, [x0, #240] +ldr q15, [x0, #320] +ldr q13, [x0, #336] +sqrdmulh v22.4S, v0.4S, v2.s[0] 
+sub v6.4s, v5.4s, v14.4s +mla v8.4S, v23.4S, v31.s[0] +mul v0.4S, v0.4S,v7.s[0] +add v5.4s, v5.4s, v14.4s +str q5, [x0, #192] +str q6, [x0, #208] +ldr q16, [x0, #304] +ldr q3, [x0, #288] +sqrdmulh v11.4S, v15.4S, v2.s[0] +sub v10.4s, v16.4s, v8.4s +mla v0.4S, v22.4S, v31.s[0] +mul v15.4S, v15.4S,v7.s[0] +add v16.4s, v16.4s, v8.4s +ldr q8, [x0, #256] +ldr q22, [x0, #272] +sqrdmulh v6.4S, v13.4S, v2.s[0] +sub v5.4s, v3.4s, v0.4s +mla v15.4S, v11.4S, v31.s[0] +mul v13.4S, v13.4S,v7.s[0] +add v3.4s, v3.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v2.s[1] +sub v11.4s, v8.4s, v15.4s +mla v13.4S, v6.4S, v31.s[0] +mul v16.4S, v16.4S,v7.s[1] +add v8.4s, v8.4s, v15.4s +sqrdmulh v15.4S, v3.4S, v2.s[1] +sub v6.4s, v22.4s, v13.4s +mla v16.4S, v0.4S, v31.s[0] +mul v3.4S, v3.4S,v7.s[1] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v10.4S, v2.s[2] +sub v0.4s, v22.4s, v16.4s +mla v3.4S, v15.4S, v31.s[0] +mul v10.4S, v10.4S,v7.s[2] +add v22.4s, v22.4s, v16.4s +ldr q16, [x17, #+256] +ldr q15, [x17, #+272] +ldr q14, [x17, #+288] +ldr q23, [x17, #+304] +sqrdmulh v12.4S, v5.4S, v2.s[2] +sub v4.4s, v8.4s, v3.4s +mla v10.4S, v13.4S, v31.s[0] +mul v5.4S, v5.4S,v7.s[2] +add v8.4s, v8.4s, v3.4s +sqrdmulh v3.4S, v22.4S, v9.s[0] +sub v13.4s, v6.4s, v10.4s +mla v5.4S, v12.4S, v31.s[0] +mul v22.4S, v22.4S,v1.s[0] +add v6.4s, v6.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v9.s[1] +sub v12.4s, v11.4s, v5.4s +mla v22.4S, v3.4S, v31.s[0] +mul v0.4S, v0.4S,v1.s[1] +add v11.4s, v11.4s, v5.4s +sqrdmulh v5.4S, v13.4S, v9.s[3] +sub v3.4s, v8.4s, v22.4s +mla v0.4S, v10.4S, v31.s[0] +mul v13.4S, v13.4S,v1.s[3] +add v8.4s, v8.4s, v22.4s +str q8, [x0, #256] +str q3, [x0, #272] +sqrdmulh v3.4S, v6.4S, v9.s[2] +sub v8.4s, v4.4s, v0.4s +mla v13.4S, v5.4S, v31.s[0] +mul v6.4S, v6.4S,v1.s[2] +add v4.4s, v4.4s, v0.4s +str q4, [x0, #288] +str q8, [x0, #304] +ldr q8, [x0, #496] +ldr q4, [x0, #480] +sqrdmulh v0.4S, v8.4S, v15.s[0] +sub v5.4s, v12.4s, v13.4s +mla v6.4S, v3.4S, v31.s[0] +mul v8.4S, v8.4S,v16.s[0] +add v12.4s, v12.4s, 
v13.4s +str q12, [x0, #352] +str q5, [x0, #368] +ldr q5, [x0, #448] +ldr q12, [x0, #464] +sqrdmulh v13.4S, v4.4S, v15.s[0] +sub v3.4s, v11.4s, v6.4s +mla v8.4S, v0.4S, v31.s[0] +mul v4.4S, v4.4S,v16.s[0] +add v11.4s, v11.4s, v6.4s +str q11, [x0, #320] +str q3, [x0, #336] +ldr q9, [x0, #432] +ldr q1, [x0, #416] +sqrdmulh v2.4S, v5.4S, v15.s[0] +sub v7.4s, v9.4s, v8.4s +mla v4.4S, v13.4S, v31.s[0] +mul v5.4S, v5.4S,v16.s[0] +add v9.4s, v9.4s, v8.4s +ldr q8, [x0, #384] +ldr q13, [x0, #400] +sqrdmulh v3.4S, v12.4S, v15.s[0] +sub v11.4s, v1.4s, v4.4s +mla v5.4S, v2.4S, v31.s[0] +mul v12.4S, v12.4S,v16.s[0] +add v1.4s, v1.4s, v4.4s +sqrdmulh v4.4S, v9.4S, v15.s[1] +sub v2.4s, v8.4s, v5.4s +mla v12.4S, v3.4S, v31.s[0] +mul v9.4S, v9.4S,v16.s[1] +add v8.4s, v8.4s, v5.4s +sqrdmulh v5.4S, v1.4S, v15.s[1] +sub v3.4s, v13.4s, v12.4s +mla v9.4S, v4.4S, v31.s[0] +mul v1.4S, v1.4S,v16.s[1] +add v13.4s, v13.4s, v12.4s +sqrdmulh v12.4S, v7.4S, v15.s[2] +sub v4.4s, v13.4s, v9.4s +mla v1.4S, v5.4S, v31.s[0] +mul v7.4S, v7.4S,v16.s[2] +add v13.4s, v13.4s, v9.4s +ldr q9, [x17, #+320] +ldr q5, [x17, #+336] +ldr q6, [x17, #+352] +ldr q0, [x17, #+368] +sqrdmulh v22.4S, v11.4S, v15.s[2] +sub v10.4s, v8.4s, v1.4s +mla v7.4S, v12.4S, v31.s[0] +mul v11.4S, v11.4S,v16.s[2] +add v8.4s, v8.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v23.s[0] +sub v12.4s, v3.4s, v7.4s +mla v11.4S, v22.4S, v31.s[0] +mul v13.4S, v13.4S,v14.s[0] +add v3.4s, v3.4s, v7.4s +sqrdmulh v7.4S, v4.4S, v23.s[1] +sub v22.4s, v2.4s, v11.4s +mla v13.4S, v1.4S, v31.s[0] +mul v4.4S, v4.4S,v14.s[1] +add v2.4s, v2.4s, v11.4s +sqrdmulh v11.4S, v12.4S, v23.s[3] +sub v1.4s, v8.4s, v13.4s +mla v4.4S, v7.4S, v31.s[0] +mul v12.4S, v12.4S,v14.s[3] +add v8.4s, v8.4s, v13.4s +str q8, [x0, #384] +str q1, [x0, #400] +sqrdmulh v1.4S, v3.4S, v23.s[2] +sub v8.4s, v10.4s, v4.4s +mla v12.4S, v11.4S, v31.s[0] +mul v3.4S, v3.4S,v14.s[2] +add v10.4s, v10.4s, v4.4s +str q10, [x0, #416] +str q8, [x0, #432] +ldr q8, [x0, #624] +ldr q10, [x0, #608] +sqrdmulh 
v4.4S, v8.4S, v5.s[0] +sub v11.4s, v22.4s, v12.4s +mla v3.4S, v1.4S, v31.s[0] +mul v8.4S, v8.4S,v9.s[0] +add v22.4s, v22.4s, v12.4s +str q22, [x0, #480] +str q11, [x0, #496] +ldr q11, [x0, #576] +ldr q22, [x0, #592] +sqrdmulh v12.4S, v10.4S, v5.s[0] +sub v1.4s, v2.4s, v3.4s +mla v8.4S, v4.4S, v31.s[0] +mul v10.4S, v10.4S,v9.s[0] +add v2.4s, v2.4s, v3.4s +str q2, [x0, #448] +str q1, [x0, #464] +ldr q23, [x0, #560] +ldr q14, [x0, #544] +sqrdmulh v15.4S, v11.4S, v5.s[0] +sub v16.4s, v23.4s, v8.4s +mla v10.4S, v12.4S, v31.s[0] +mul v11.4S, v11.4S,v9.s[0] +add v23.4s, v23.4s, v8.4s +ldr q8, [x0, #512] +ldr q12, [x0, #528] +sqrdmulh v1.4S, v22.4S, v5.s[0] +sub v2.4s, v14.4s, v10.4s +mla v11.4S, v15.4S, v31.s[0] +mul v22.4S, v22.4S,v9.s[0] +add v14.4s, v14.4s, v10.4s +sqrdmulh v10.4S, v23.4S, v5.s[1] +sub v15.4s, v8.4s, v11.4s +mla v22.4S, v1.4S, v31.s[0] +mul v23.4S, v23.4S,v9.s[1] +add v8.4s, v8.4s, v11.4s +sqrdmulh v11.4S, v14.4S, v5.s[1] +sub v1.4s, v12.4s, v22.4s +mla v23.4S, v10.4S, v31.s[0] +mul v14.4S, v14.4S,v9.s[1] +add v12.4s, v12.4s, v22.4s +sqrdmulh v22.4S, v16.4S, v5.s[2] +sub v10.4s, v12.4s, v23.4s +mla v14.4S, v11.4S, v31.s[0] +mul v16.4S, v16.4S,v9.s[2] +add v12.4s, v12.4s, v23.4s +ldr q23, [x17, #+384] +ldr q11, [x17, #+400] +ldr q3, [x17, #+416] +ldr q4, [x17, #+432] +sqrdmulh v13.4S, v2.4S, v5.s[2] +sub v7.4s, v8.4s, v14.4s +mla v16.4S, v22.4S, v31.s[0] +mul v2.4S, v2.4S,v9.s[2] +add v8.4s, v8.4s, v14.4s +sqrdmulh v14.4S, v12.4S, v0.s[0] +sub v22.4s, v1.4s, v16.4s +mla v2.4S, v13.4S, v31.s[0] +mul v12.4S, v12.4S,v6.s[0] +add v1.4s, v1.4s, v16.4s +sqrdmulh v16.4S, v10.4S, v0.s[1] +sub v13.4s, v15.4s, v2.4s +mla v12.4S, v14.4S, v31.s[0] +mul v10.4S, v10.4S,v6.s[1] +add v15.4s, v15.4s, v2.4s +sqrdmulh v2.4S, v22.4S, v0.s[3] +sub v14.4s, v8.4s, v12.4s +mla v10.4S, v16.4S, v31.s[0] +mul v22.4S, v22.4S,v6.s[3] +add v8.4s, v8.4s, v12.4s +str q8, [x0, #512] +str q14, [x0, #528] +sqrdmulh v14.4S, v1.4S, v0.s[2] +sub v8.4s, v7.4s, v10.4s +mla v22.4S, v2.4S, 
v31.s[0] +mul v1.4S, v1.4S,v6.s[2] +add v7.4s, v7.4s, v10.4s +str q7, [x0, #544] +str q8, [x0, #560] +ldr q8, [x0, #752] +ldr q7, [x0, #736] +sqrdmulh v10.4S, v8.4S, v11.s[0] +sub v2.4s, v13.4s, v22.4s +mla v1.4S, v14.4S, v31.s[0] +mul v8.4S, v8.4S,v23.s[0] +add v13.4s, v13.4s, v22.4s +str q13, [x0, #608] +str q2, [x0, #624] +ldr q2, [x0, #704] +ldr q13, [x0, #720] +sqrdmulh v22.4S, v7.4S, v11.s[0] +sub v14.4s, v15.4s, v1.4s +mla v8.4S, v10.4S, v31.s[0] +mul v7.4S, v7.4S,v23.s[0] +add v15.4s, v15.4s, v1.4s +str q15, [x0, #576] +str q14, [x0, #592] +ldr q0, [x0, #688] +ldr q6, [x0, #672] +sqrdmulh v5.4S, v2.4S, v11.s[0] +sub v9.4s, v0.4s, v8.4s +mla v7.4S, v22.4S, v31.s[0] +mul v2.4S, v2.4S,v23.s[0] +add v0.4s, v0.4s, v8.4s +ldr q8, [x0, #640] +ldr q22, [x0, #656] +sqrdmulh v14.4S, v13.4S, v11.s[0] +sub v15.4s, v6.4s, v7.4s +mla v2.4S, v5.4S, v31.s[0] +mul v13.4S, v13.4S,v23.s[0] +add v6.4s, v6.4s, v7.4s +sqrdmulh v7.4S, v0.4S, v11.s[1] +sub v5.4s, v8.4s, v2.4s +mla v13.4S, v14.4S, v31.s[0] +mul v0.4S, v0.4S,v23.s[1] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v6.4S, v11.s[1] +sub v14.4s, v22.4s, v13.4s +mla v0.4S, v7.4S, v31.s[0] +mul v6.4S, v6.4S,v23.s[1] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v9.4S, v11.s[2] +sub v7.4s, v22.4s, v0.4s +mla v6.4S, v2.4S, v31.s[0] +mul v9.4S, v9.4S,v23.s[2] +add v22.4s, v22.4s, v0.4s +ldr q0, [x17, #+448] +ldr q2, [x17, #+464] +ldr q1, [x17, #+480] +ldr q10, [x17, #+496] +sqrdmulh v12.4S, v15.4S, v11.s[2] +sub v16.4s, v8.4s, v6.4s +mla v9.4S, v13.4S, v31.s[0] +mul v15.4S, v15.4S,v23.s[2] +add v8.4s, v8.4s, v6.4s +sqrdmulh v6.4S, v22.4S, v4.s[0] +sub v13.4s, v14.4s, v9.4s +mla v15.4S, v12.4S, v31.s[0] +mul v22.4S, v22.4S,v3.s[0] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v7.4S, v4.s[1] +sub v12.4s, v5.4s, v15.4s +mla v22.4S, v6.4S, v31.s[0] +mul v7.4S, v7.4S,v3.s[1] +add v5.4s, v5.4s, v15.4s +sqrdmulh v15.4S, v13.4S, v4.s[3] +sub v6.4s, v8.4s, v22.4s +mla v7.4S, v9.4S, v31.s[0] +mul v13.4S, v13.4S,v3.s[3] +add v8.4s, v8.4s, 
v22.4s +str q8, [x0, #640] +str q6, [x0, #656] +sqrdmulh v6.4S, v14.4S, v4.s[2] +sub v8.4s, v16.4s, v7.4s +mla v13.4S, v15.4S, v31.s[0] +mul v14.4S, v14.4S,v3.s[2] +add v16.4s, v16.4s, v7.4s +str q16, [x0, #672] +str q8, [x0, #688] +ldr q8, [x0, #880] +ldr q16, [x0, #864] +sqrdmulh v7.4S, v8.4S, v2.s[0] +sub v15.4s, v12.4s, v13.4s +mla v14.4S, v6.4S, v31.s[0] +mul v8.4S, v8.4S,v0.s[0] +add v12.4s, v12.4s, v13.4s +str q12, [x0, #736] +str q15, [x0, #752] +ldr q15, [x0, #832] +ldr q12, [x0, #848] +sqrdmulh v13.4S, v16.4S, v2.s[0] +sub v6.4s, v5.4s, v14.4s +mla v8.4S, v7.4S, v31.s[0] +mul v16.4S, v16.4S,v0.s[0] +add v5.4s, v5.4s, v14.4s +str q5, [x0, #704] +str q6, [x0, #720] +ldr q4, [x0, #816] +ldr q3, [x0, #800] +sqrdmulh v11.4S, v15.4S, v2.s[0] +sub v23.4s, v4.4s, v8.4s +mla v16.4S, v13.4S, v31.s[0] +mul v15.4S, v15.4S,v0.s[0] +add v4.4s, v4.4s, v8.4s +ldr q8, [x0, #768] +ldr q13, [x0, #784] +sqrdmulh v6.4S, v12.4S, v2.s[0] +sub v5.4s, v3.4s, v16.4s +mla v15.4S, v11.4S, v31.s[0] +mul v12.4S, v12.4S,v0.s[0] +add v3.4s, v3.4s, v16.4s +sqrdmulh v16.4S, v4.4S, v2.s[1] +sub v11.4s, v8.4s, v15.4s +mla v12.4S, v6.4S, v31.s[0] +mul v4.4S, v4.4S,v0.s[1] +add v8.4s, v8.4s, v15.4s +sqrdmulh v15.4S, v3.4S, v2.s[1] +sub v6.4s, v13.4s, v12.4s +mla v4.4S, v16.4S, v31.s[0] +mul v3.4S, v3.4S,v0.s[1] +add v13.4s, v13.4s, v12.4s +sqrdmulh v12.4S, v23.4S, v2.s[2] +sub v16.4s, v13.4s, v4.4s +mla v3.4S, v15.4S, v31.s[0] +mul v23.4S, v23.4S,v0.s[2] +add v13.4s, v13.4s, v4.4s +ldr q4, [x17, #+512] +ldr q15, [x17, #+528] +ldr q14, [x17, #+544] +ldr q7, [x17, #+560] +sqrdmulh v22.4S, v5.4S, v2.s[2] +sub v9.4s, v8.4s, v3.4s +mla v23.4S, v12.4S, v31.s[0] +mul v5.4S, v5.4S,v0.s[2] +add v8.4s, v8.4s, v3.4s +sqrdmulh v3.4S, v13.4S, v10.s[0] +sub v12.4s, v6.4s, v23.4s +mla v5.4S, v22.4S, v31.s[0] +mul v13.4S, v13.4S,v1.s[0] +add v6.4s, v6.4s, v23.4s +sqrdmulh v23.4S, v16.4S, v10.s[1] +sub v22.4s, v11.4s, v5.4s +mla v13.4S, v3.4S, v31.s[0] +mul v16.4S, v16.4S,v1.s[1] +add v11.4s, v11.4s, v5.4s 
+sqrdmulh v5.4S, v12.4S, v10.s[3] +sub v3.4s, v8.4s, v13.4s +mla v16.4S, v23.4S, v31.s[0] +mul v12.4S, v12.4S,v1.s[3] +add v8.4s, v8.4s, v13.4s +str q8, [x0, #768] +str q3, [x0, #784] +sqrdmulh v3.4S, v6.4S, v10.s[2] +sub v8.4s, v9.4s, v16.4s +mla v12.4S, v5.4S, v31.s[0] +mul v6.4S, v6.4S,v1.s[2] +add v9.4s, v9.4s, v16.4s +str q9, [x0, #800] +str q8, [x0, #816] +ldr q8, [x0, #1008] +ldr q9, [x0, #992] +sqrdmulh v16.4S, v8.4S, v15.s[0] +sub v5.4s, v22.4s, v12.4s +mla v6.4S, v3.4S, v31.s[0] +mul v8.4S, v8.4S,v4.s[0] +add v22.4s, v22.4s, v12.4s +str q22, [x0, #864] +str q5, [x0, #880] +ldr q5, [x0, #960] +ldr q22, [x0, #976] +sqrdmulh v12.4S, v9.4S, v15.s[0] +sub v3.4s, v11.4s, v6.4s +mla v8.4S, v16.4S, v31.s[0] +mul v9.4S, v9.4S,v4.s[0] +add v11.4s, v11.4s, v6.4s +str q11, [x0, #832] +str q3, [x0, #848] +ldr q10, [x0, #944] +ldr q1, [x0, #928] +sqrdmulh v2.4S, v5.4S, v15.s[0] +sub v0.4s, v10.4s, v8.4s +mla v9.4S, v12.4S, v31.s[0] +mul v5.4S, v5.4S,v4.s[0] +add v10.4s, v10.4s, v8.4s +ldr q8, [x0, #896] +ldr q12, [x0, #912] +sqrdmulh v3.4S, v22.4S, v15.s[0] +sub v11.4s, v1.4s, v9.4s +mla v5.4S, v2.4S, v31.s[0] +mul v22.4S, v22.4S,v4.s[0] +add v1.4s, v1.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v15.s[1] +sub v2.4s, v8.4s, v5.4s +mla v22.4S, v3.4S, v31.s[0] +mul v10.4S, v10.4S,v4.s[1] +add v8.4s, v8.4s, v5.4s +sqrdmulh v5.4S, v1.4S, v15.s[1] +sub v3.4s, v12.4s, v22.4s +mla v10.4S, v9.4S, v31.s[0] +mul v1.4S, v1.4S,v4.s[1] +add v12.4s, v12.4s, v22.4s +sqrdmulh v22.4S, v0.4S, v15.s[2] +sub v9.4s, v12.4s, v10.4s +mla v1.4S, v5.4S, v31.s[0] +mul v0.4S, v0.4S,v4.s[2] +add v12.4s, v12.4s, v10.4s +sqrdmulh v10.4S, v11.4S, v15.s[2] +sub v5.4s, v8.4s, v1.4s +mla v0.4S, v22.4S, v31.s[0] +mul v11.4S, v11.4S,v4.s[2] +add v8.4s, v8.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v7.s[0] +sub v22.4s, v3.4s, v0.4s +mla v11.4S, v10.4S, v31.s[0] +mul v12.4S, v12.4S,v14.s[0] +add v3.4s, v3.4s, v0.4s +sqrdmulh v0.4S, v9.4S, v7.s[1] +sub v10.4s, v2.4s, v11.4s +mla v12.4S, v1.4S, v31.s[0] +mul v9.4S, 
v9.4S,v14.s[1] +add v2.4s, v2.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v7.s[3] +sub v1.4s, v8.4s, v12.4s +mla v9.4S, v0.4S, v31.s[0] +mul v22.4S, v22.4S,v14.s[3] +add v8.4s, v8.4s, v12.4s +str q8, [x0, #896] +str q1, [x0, #912] +sqrdmulh v1.4S, v3.4S, v7.s[2] +sub v8.4s, v5.4s, v9.4s +mla v22.4S, v11.4S, v31.s[0] +mul v3.4S, v3.4S,v14.s[2] +add v5.4s, v5.4s, v9.4s +str q5, [x0, #928] +str q8, [x0, #944] +sub v8.4s, v10.4s, v22.4s +mla v3.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v22.4s +str q10, [x0, #992] +str q8, [x0, #1008] +sub v8.4s, v2.4s, v3.4s +add v2.4s, v2.4s, v3.4s +str q2, [x0, #960] +str q8, [x0, #976] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1444 +// Instruction count: 1440 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_3_3_2.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_3_3_2.s new file mode 100644 index 0000000..622ba55 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_3_3_2.s @@ -0,0 +1,1474 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// 
copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 23825509 // Layer 4, block 0 +.word 27028662 // Layer 4, block 1 +.word 0 // Layer None, block None +.word 1307297022 // Layer 3, block 0 +.word 1524716204 // Layer 4, block 0 +.word 1729702351 // Layer 4, block 1 +.word 0 // Layer None, block None +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 14626653 // Layer 3, block 1 +.word 14833295 // Layer 4, block 2 +.word 2138810 // Layer 4, block 
3 +.word 0 // Layer None, block None +.word 936034350 // Layer 3, block 1 +.word 949258429 // Layer 4, block 2 +.word 136873393 // Layer 4, block 3 +.word 0 // Layer None, block None +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 29737761 // Layer 3, block 2 +.word 6490403 // Layer 4, block 4 +.word 19648405 // Layer 4, block 5 +.word 0 // Layer None, block None +.word 1903071454 // Layer 3, block 2 +.word 415354091 // Layer 4, block 4 +.word 1257401950 // Layer 4, block 5 +.word 0 // Layer None, block None +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 30285189 // Layer 3, block 3 +.word 31254932 // Layer 4, block 6 +.word 26362414 // Layer 4, block 7 +.word 0 // Layer None, block None +.word 1938104173 // Layer 3, block 3 +.word 2000162988 // Layer 4, block 6 +.word 1687065733 // Layer 4, block 7 +.word 0 // Layer None, block None +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 21289485 // Layer 3, block 4 +.word 572895 // Layer 4, block 8 +.word 26691971 // Layer 4, block 9 +.word 0 // Layer None, block None +.word 1362423055 // Layer 3, block 4 +.word 36662482 // Layer 4, block 8 +.word 1708155771 // Layer 4, block 9 +.word 0 // Layer None, block None +.word 23713020 // Layer 5, block 16 +.word 19537976 // 
Layer 5, block 17 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 9914896 // Layer 3, block 5 +.word 9249292 // Layer 4, block 10 +.word 29292862 // Layer 4, block 11 +.word 0 // Layer None, block None +.word 634504916 // Layer 3, block 5 +.word 591909511 // Layer 4, block 10 +.word 1874600091 // Layer 4, block 11 +.word 0 // Layer None, block None +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 22603682 // Layer 3, block 6 +.word 8247799 // Layer 4, block 12 +.word 5086187 // Layer 4, block 13 +.word 0 // Layer None, block None +.word 1446525244 // Layer 3, block 6 +.word 527818851 // Layer 4, block 12 +.word 325491125 // Layer 4, block 13 +.word 0 // Layer None, block None +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 16204162 // Layer 3, block 7 +.word 28113639 // Layer 4, block 14 +.word 8471290 // Layer 4, block 15 +.word 0 // Layer None, block None +.word 1036987221 // Layer 3, block 7 +.word 1799135579 // Layer 4, block 14 +.word 542121183 // Layer 4, block 15 +.word 0 // Layer None, block None +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 604481480 // Layer 5, block 30 +.word 50865814 
// Layer 5, block 31 +.text +.global ntt_u32_incomplete_neon_asm_var_3_3_2 +.global _ntt_u32_incomplete_neon_asm_var_3_3_2 +ntt_u32_incomplete_neon_asm_var_3_3_2: +_ntt_u32_incomplete_neon_asm_var_3_3_2: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x0, #960] +ldr q25, [x0, #832] +sqrdmulh v24.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +ldr q23, [x0, #576] +sqrdmulh v22.4S, v25.4S, v29.s[0] +mul v25.4S, v25.4S,v30.s[0] +ldr q21, [x0, #704] +mla v26.4S, v24.4S, v31.s[0] +sqrdmulh v24.4S, v23.4S, v29.s[0] +mul v23.4S, v23.4S,v30.s[0] +ldr q20, [x0, #448] +mla v25.4S, v22.4S, v31.s[0] +sub v22.4s, v20.4s, v26.4s +add v20.4s, v20.4s, v26.4s +sqrdmulh v26.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +ldr q19, [x0, #320] +mla v23.4S, v24.4S, v31.s[0] +sub v24.4s, v19.4s, v25.4s +add v19.4s, v19.4s, v25.4s +sqrdmulh v25.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +ldr q18, [x0, #64] +mla v21.4S, v26.4S, v31.s[0] +sub v26.4s, v18.4s, v23.4s +add v18.4s, v18.4s, v23.4s +sqrdmulh v23.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +ldr q17, [x0, #192] +mla v20.4S, v25.4S, v31.s[0] +sub v25.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +mla v19.4S, v23.4S, v31.s[0] +sub v23.4s, v17.4s, v20.4s +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v24.4S, v29.s[2] +mul v24.4S, v24.4S,v30.s[2] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v18.4s, v19.4s +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v17.4S, 
v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +mla v24.4S, v20.4S, v31.s[0] +sub v20.4s, v25.4s, v22.4s +add v25.4s, v25.4s, v22.4s +sqrdmulh v22.4S, v23.4S, v27.s[1] +mul v23.4S, v23.4S,v28.s[1] +mla v17.4S, v19.4S, v31.s[0] +sub v19.4s, v26.4s, v24.4s +add v26.4s, v26.4s, v24.4s +sqrdmulh v24.4S, v20.4S, v27.s[3] +mul v20.4S, v20.4S,v28.s[3] +ldr q16, [x0, #976] +mla v23.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v17.4s +add v18.4s, v18.4s, v17.4s +sqrdmulh v17.4S, v25.4S, v27.s[2] +mul v25.4S, v25.4S,v28.s[2] +ldr q3, [x0, #848] +mla v20.4S, v24.4S, v31.s[0] +sub v24.4s, v21.4s, v23.4s +add v21.4s, v21.4s, v23.4s +sqrdmulh v23.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +ldr q2, [x0, #592] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +sqrdmulh v20.4S, v3.4S, v29.s[0] +str q18, [x0, #64] +mul v3.4S, v3.4S,v30.s[0] +ldr q18, [x0, #720] +mla v16.4S, v23.4S, v31.s[0] +sub v23.4s, v26.4s, v25.4s +add v26.4s, v26.4s, v25.4s +sqrdmulh v25.4S, v2.4S, v29.s[0] +str q22, [x0, #192] +mul v2.4S, v2.4S,v30.s[0] +ldr q22, [x0, #464] +mla v3.4S, v20.4S, v31.s[0] +sub v20.4s, v22.4s, v16.4s +add v22.4s, v22.4s, v16.4s +sqrdmulh v16.4S, v18.4S, v29.s[0] +str q21, [x0, #320] +mul v18.4S, v18.4S,v30.s[0] +ldr q21, [x0, #336] +mla v2.4S, v25.4S, v31.s[0] +sub v25.4s, v21.4s, v3.4s +add v21.4s, v21.4s, v3.4s +sqrdmulh v3.4S, v22.4S, v29.s[1] +str q24, [x0, #448] +mul v22.4S, v22.4S,v30.s[1] +ldr q24, [x0, #80] +mla v18.4S, v16.4S, v31.s[0] +sub v16.4s, v24.4s, v2.4s +add v24.4s, v24.4s, v2.4s +sqrdmulh v2.4S, v21.4S, v29.s[1] +str q19, [x0, #832] +mul v21.4S, v21.4S,v30.s[1] +ldr q19, [x0, #208] +mla v22.4S, v3.4S, v31.s[0] +sub v3.4s, v19.4s, v18.4s +add v19.4s, v19.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[2] +str q17, [x0, #960] +mul v20.4S, v20.4S,v30.s[2] +mla v21.4S, v2.4S, v31.s[0] +sub v2.4s, v19.4s, v22.4s +add v19.4s, v19.4s, v22.4s +sqrdmulh v22.4S, v25.4S, v29.s[2] +str q26, [x0, #576] +mul v25.4S, v25.4S,v30.s[2] +mla v20.4S, 
v18.4S, v31.s[0] +sub v18.4s, v24.4s, v21.4s +add v24.4s, v24.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v27.s[0] +str q23, [x0, #704] +mul v19.4S, v19.4S,v28.s[0] +mla v25.4S, v22.4S, v31.s[0] +sub v22.4s, v3.4s, v20.4s +add v3.4s, v3.4s, v20.4s +sqrdmulh v20.4S, v2.4S, v27.s[1] +mul v2.4S, v2.4S,v28.s[1] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v25.4s +add v16.4s, v16.4s, v25.4s +sqrdmulh v25.4S, v22.4S, v27.s[3] +mul v22.4S, v22.4S,v28.s[3] +ldr q23, [x0, #992] +mla v2.4S, v20.4S, v31.s[0] +sub v20.4s, v24.4s, v19.4s +add v24.4s, v24.4s, v19.4s +sqrdmulh v19.4S, v3.4S, v27.s[2] +mul v3.4S, v3.4S,v28.s[2] +ldr q26, [x0, #864] +mla v22.4S, v25.4S, v31.s[0] +sub v25.4s, v18.4s, v2.4s +add v18.4s, v18.4s, v2.4s +sqrdmulh v2.4S, v23.4S, v29.s[0] +mul v23.4S, v23.4S,v30.s[0] +ldr q17, [x0, #608] +mla v3.4S, v19.4S, v31.s[0] +sub v19.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v26.4S, v29.s[0] +str q24, [x0, #80] +mul v26.4S, v26.4S,v30.s[0] +ldr q24, [x0, #736] +mla v23.4S, v2.4S, v31.s[0] +sub v2.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v17.4S, v29.s[0] +str q20, [x0, #208] +mul v17.4S, v17.4S,v30.s[0] +ldr q20, [x0, #480] +mla v26.4S, v22.4S, v31.s[0] +sub v22.4s, v20.4s, v23.4s +add v20.4s, v20.4s, v23.4s +sqrdmulh v23.4S, v24.4S, v29.s[0] +str q18, [x0, #336] +mul v24.4S, v24.4S,v30.s[0] +ldr q18, [x0, #352] +mla v17.4S, v3.4S, v31.s[0] +sub v3.4s, v18.4s, v26.4s +add v18.4s, v18.4s, v26.4s +sqrdmulh v26.4S, v20.4S, v29.s[1] +str q25, [x0, #464] +mul v20.4S, v20.4S,v30.s[1] +ldr q25, [x0, #96] +mla v24.4S, v23.4S, v31.s[0] +sub v23.4s, v25.4s, v17.4s +add v25.4s, v25.4s, v17.4s +sqrdmulh v17.4S, v18.4S, v29.s[1] +str q21, [x0, #848] +mul v18.4S, v18.4S,v30.s[1] +ldr q21, [x0, #224] +mla v20.4S, v26.4S, v31.s[0] +sub v26.4s, v21.4s, v24.4s +add v21.4s, v21.4s, v24.4s +sqrdmulh v24.4S, v22.4S, v29.s[2] +str q19, [x0, #976] +mul v22.4S, v22.4S,v30.s[2] +mla v18.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v20.4s +add v21.4s, 
v21.4s, v20.4s +sqrdmulh v20.4S, v3.4S, v29.s[2] +str q16, [x0, #592] +mul v3.4S, v3.4S,v30.s[2] +mla v22.4S, v24.4S, v31.s[0] +sub v24.4s, v25.4s, v18.4s +add v25.4s, v25.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v27.s[0] +str q2, [x0, #720] +mul v21.4S, v21.4S,v28.s[0] +mla v3.4S, v20.4S, v31.s[0] +sub v20.4s, v26.4s, v22.4s +add v26.4s, v26.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v27.s[1] +mul v17.4S, v17.4S,v28.s[1] +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v23.4s, v3.4s +add v23.4s, v23.4s, v3.4s +sqrdmulh v3.4S, v20.4S, v27.s[3] +mul v20.4S, v20.4S,v28.s[3] +ldr q2, [x0, #1008] +mla v17.4S, v22.4S, v31.s[0] +sub v22.4s, v25.4s, v21.4s +add v25.4s, v25.4s, v21.4s +sqrdmulh v21.4S, v26.4S, v27.s[2] +mul v26.4S, v26.4S,v28.s[2] +ldr q16, [x0, #880] +mla v20.4S, v3.4S, v31.s[0] +sub v3.4s, v24.4s, v17.4s +add v24.4s, v24.4s, v17.4s +sqrdmulh v17.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +ldr q19, [x0, #624] +mla v26.4S, v21.4S, v31.s[0] +sub v21.4s, v18.4s, v20.4s +add v18.4s, v18.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v29.s[0] +str q25, [x0, #96] +mul v16.4S, v16.4S,v30.s[0] +ldr q25, [x0, #752] +mla v2.4S, v17.4S, v31.s[0] +sub v17.4s, v23.4s, v26.4s +add v23.4s, v23.4s, v26.4s +sqrdmulh v26.4S, v19.4S, v29.s[0] +str q22, [x0, #224] +mul v19.4S, v19.4S,v30.s[0] +ldr q22, [x0, #496] +mla v16.4S, v20.4S, v31.s[0] +sub v20.4s, v22.4s, v2.4s +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v25.4S, v29.s[0] +str q24, [x0, #352] +mul v25.4S, v25.4S,v30.s[0] +ldr q24, [x0, #368] +mla v19.4S, v26.4S, v31.s[0] +sub v26.4s, v24.4s, v16.4s +add v24.4s, v24.4s, v16.4s +sqrdmulh v16.4S, v22.4S, v29.s[1] +str q3, [x0, #480] +mul v22.4S, v22.4S,v30.s[1] +ldr q3, [x0, #112] +mla v25.4S, v2.4S, v31.s[0] +sub v2.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v24.4S, v29.s[1] +str q18, [x0, #864] +mul v24.4S, v24.4S,v30.s[1] +ldr q18, [x0, #240] +mla v22.4S, v16.4S, v31.s[0] +sub v16.4s, v18.4s, v25.4s +add v18.4s, v18.4s, v25.4s +sqrdmulh v25.4S, v20.4S, v29.s[2] +str q21, 
[x0, #992] +mul v20.4S, v20.4S,v30.s[2] +mla v24.4S, v19.4S, v31.s[0] +sub v19.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v26.4S, v29.s[2] +str q23, [x0, #608] +mul v26.4S, v26.4S,v30.s[2] +mla v20.4S, v25.4S, v31.s[0] +sub v25.4s, v3.4s, v24.4s +add v3.4s, v3.4s, v24.4s +sqrdmulh v24.4S, v18.4S, v27.s[0] +str q17, [x0, #736] +mul v18.4S, v18.4S,v28.s[0] +mla v26.4S, v22.4S, v31.s[0] +sub v22.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +mla v18.4S, v24.4S, v31.s[0] +sub v24.4s, v2.4s, v26.4s +add v2.4s, v2.4s, v26.4s +sqrdmulh v26.4S, v22.4S, v27.s[3] +mul v22.4S, v22.4S,v28.s[3] +ldr q17, [x0, #896] +mla v19.4S, v20.4S, v31.s[0] +sub v20.4s, v3.4s, v18.4s +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v27.s[2] +mul v16.4S, v16.4S,v28.s[2] +ldr q23, [x0, #768] +mla v22.4S, v26.4S, v31.s[0] +sub v26.4s, v25.4s, v19.4s +add v25.4s, v25.4s, v19.4s +sqrdmulh v19.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] +ldr q21, [x0, #512] +mla v16.4S, v18.4S, v31.s[0] +sub v18.4s, v24.4s, v22.4s +add v24.4s, v24.4s, v22.4s +sqrdmulh v22.4S, v23.4S, v29.s[0] +str q3, [x0, #112] +mul v23.4S, v23.4S,v30.s[0] +ldr q3, [x0, #640] +mla v17.4S, v19.4S, v31.s[0] +sub v19.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v21.4S, v29.s[0] +str q20, [x0, #240] +mul v21.4S, v21.4S,v30.s[0] +ldr q20, [x0, #384] +mla v23.4S, v22.4S, v31.s[0] +sub v22.4s, v20.4s, v17.4s +add v20.4s, v20.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v29.s[0] +str q25, [x0, #368] +mul v3.4S, v3.4S,v30.s[0] +ldr q25, [x0, #256] +mla v21.4S, v16.4S, v31.s[0] +sub v16.4s, v25.4s, v23.4s +add v25.4s, v25.4s, v23.4s +sqrdmulh v23.4S, v20.4S, v29.s[1] +str q26, [x0, #496] +mul v20.4S, v20.4S,v30.s[1] +ldr q26, [x0, #0] +mla v3.4S, v17.4S, v31.s[0] +sub v17.4s, v26.4s, v21.4s +add v26.4s, v26.4s, v21.4s +sqrdmulh v21.4S, v25.4S, v29.s[1] +str q24, [x0, #880] +mul v25.4S, v25.4S,v30.s[1] +ldr q24, [x0, #128] +mla v20.4S, 
v23.4S, v31.s[0] +sub v23.4s, v24.4s, v3.4s +add v24.4s, v24.4s, v3.4s +sqrdmulh v3.4S, v22.4S, v29.s[2] +str q18, [x0, #1008] +mul v22.4S, v22.4S,v30.s[2] +mla v25.4S, v21.4S, v31.s[0] +sub v21.4s, v24.4s, v20.4s +add v24.4s, v24.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v29.s[2] +str q2, [x0, #624] +mul v16.4S, v16.4S,v30.s[2] +mla v22.4S, v3.4S, v31.s[0] +sub v3.4s, v26.4s, v25.4s +add v26.4s, v26.4s, v25.4s +sqrdmulh v25.4S, v24.4S, v27.s[0] +str q19, [x0, #752] +mul v24.4S, v24.4S,v28.s[0] +mla v16.4S, v20.4S, v31.s[0] +sub v20.4s, v23.4s, v22.4s +add v23.4s, v23.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v27.s[1] +mul v21.4S, v21.4S,v28.s[1] +mla v24.4S, v25.4S, v31.s[0] +sub v25.4s, v17.4s, v16.4s +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v20.4S, v27.s[3] +mul v20.4S, v20.4S,v28.s[3] +ldr q19, [x0, #912] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v26.4s, v24.4s +add v26.4s, v26.4s, v24.4s +sqrdmulh v24.4S, v23.4S, v27.s[2] +mul v23.4S, v23.4S,v28.s[2] +ldr q2, [x0, #784] +mla v20.4S, v16.4S, v31.s[0] +sub v16.4s, v3.4s, v21.4s +add v3.4s, v3.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q18, [x0, #528] +mla v23.4S, v24.4S, v31.s[0] +sub v24.4s, v25.4s, v20.4s +add v25.4s, v25.4s, v20.4s +sqrdmulh v20.4S, v2.4S, v29.s[0] +str q26, [x0, #0] +mul v2.4S, v2.4S,v30.s[0] +ldr q26, [x0, #656] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v17.4s, v23.4s +add v17.4s, v17.4s, v23.4s +sqrdmulh v23.4S, v18.4S, v29.s[0] +str q22, [x0, #128] +mul v18.4S, v18.4S,v30.s[0] +ldr q22, [x0, #400] +mla v2.4S, v20.4S, v31.s[0] +sub v20.4s, v22.4s, v19.4s +add v22.4s, v22.4s, v19.4s +sqrdmulh v19.4S, v26.4S, v29.s[0] +str q3, [x0, #256] +mul v26.4S, v26.4S,v30.s[0] +ldr q3, [x0, #272] +mla v18.4S, v23.4S, v31.s[0] +sub v23.4s, v3.4s, v2.4s +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v22.4S, v29.s[1] +str q16, [x0, #384] +mul v22.4S, v22.4S,v30.s[1] +ldr q16, [x0, #16] +mla v26.4S, v19.4S, v31.s[0] +sub v19.4s, v16.4s, v18.4s +add v16.4s, v16.4s, v18.4s 
+sqrdmulh v18.4S, v3.4S, v29.s[1] +str q25, [x0, #768] +mul v3.4S, v3.4S,v30.s[1] +ldr q25, [x0, #144] +mla v22.4S, v2.4S, v31.s[0] +sub v2.4s, v25.4s, v26.4s +add v25.4s, v25.4s, v26.4s +sqrdmulh v26.4S, v20.4S, v29.s[2] +str q24, [x0, #896] +mul v20.4S, v20.4S,v30.s[2] +mla v3.4S, v18.4S, v31.s[0] +sub v18.4s, v25.4s, v22.4s +add v25.4s, v25.4s, v22.4s +sqrdmulh v22.4S, v23.4S, v29.s[2] +str q17, [x0, #512] +mul v23.4S, v23.4S,v30.s[2] +mla v20.4S, v26.4S, v31.s[0] +sub v26.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v25.4S, v27.s[0] +str q21, [x0, #640] +mul v25.4S, v25.4S,v28.s[0] +mla v23.4S, v22.4S, v31.s[0] +sub v22.4s, v2.4s, v20.4s +add v2.4s, v2.4s, v20.4s +sqrdmulh v20.4S, v18.4S, v27.s[1] +mul v18.4S, v18.4S,v28.s[1] +mla v25.4S, v3.4S, v31.s[0] +sub v3.4s, v19.4s, v23.4s +add v19.4s, v19.4s, v23.4s +sqrdmulh v23.4S, v22.4S, v27.s[3] +mul v22.4S, v22.4S,v28.s[3] +ldr q21, [x0, #928] +mla v18.4S, v20.4S, v31.s[0] +sub v20.4s, v16.4s, v25.4s +add v16.4s, v16.4s, v25.4s +sqrdmulh v25.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +ldr q17, [x0, #800] +mla v22.4S, v23.4S, v31.s[0] +sub v23.4s, v26.4s, v18.4s +add v26.4s, v26.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +ldr q24, [x0, #544] +mla v2.4S, v25.4S, v31.s[0] +sub v25.4s, v3.4s, v22.4s +add v3.4s, v3.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v29.s[0] +str q16, [x0, #16] +mul v17.4S, v17.4S,v30.s[0] +ldr q16, [x0, #672] +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v19.4s, v2.4s +add v19.4s, v19.4s, v2.4s +sqrdmulh v2.4S, v24.4S, v29.s[0] +str q20, [x0, #144] +mul v24.4S, v24.4S,v30.s[0] +ldr q20, [x0, #416] +mla v17.4S, v22.4S, v31.s[0] +sub v22.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v16.4S, v29.s[0] +str q26, [x0, #272] +mul v16.4S, v16.4S,v30.s[0] +ldr q26, [x0, #288] +mla v24.4S, v2.4S, v31.s[0] +sub v2.4s, v26.4s, v17.4s +add v26.4s, v26.4s, v17.4s +sqrdmulh v17.4S, v20.4S, v29.s[1] +str q23, [x0, #400] +mul v20.4S, 
v20.4S,v30.s[1] +ldr q23, [x0, #32] +mla v16.4S, v21.4S, v31.s[0] +sub v21.4s, v23.4s, v24.4s +add v23.4s, v23.4s, v24.4s +sqrdmulh v24.4S, v26.4S, v29.s[1] +str q3, [x0, #784] +mul v26.4S, v26.4S,v30.s[1] +ldr q3, [x0, #160] +mla v20.4S, v17.4S, v31.s[0] +sub v17.4s, v3.4s, v16.4s +add v3.4s, v3.4s, v16.4s +sqrdmulh v16.4S, v22.4S, v29.s[2] +str q25, [x0, #912] +mul v22.4S, v22.4S,v30.s[2] +mla v26.4S, v24.4S, v31.s[0] +sub v24.4s, v3.4s, v20.4s +add v3.4s, v3.4s, v20.4s +sqrdmulh v20.4S, v2.4S, v29.s[2] +str q19, [x0, #528] +mul v2.4S, v2.4S,v30.s[2] +mla v22.4S, v16.4S, v31.s[0] +sub v16.4s, v23.4s, v26.4s +add v23.4s, v23.4s, v26.4s +sqrdmulh v26.4S, v3.4S, v27.s[0] +str q18, [x0, #656] +mul v3.4S, v3.4S,v28.s[0] +mla v2.4S, v20.4S, v31.s[0] +sub v20.4s, v17.4s, v22.4s +add v17.4s, v17.4s, v22.4s +sqrdmulh v22.4S, v24.4S, v27.s[1] +mul v24.4S, v24.4S,v28.s[1] +mla v3.4S, v26.4S, v31.s[0] +sub v26.4s, v21.4s, v2.4s +add v21.4s, v21.4s, v2.4s +sqrdmulh v2.4S, v20.4S, v27.s[3] +mul v20.4S, v20.4S,v28.s[3] +ldr q18, [x0, #944] +mla v24.4S, v22.4S, v31.s[0] +sub v22.4s, v23.4s, v3.4s +add v23.4s, v23.4s, v3.4s +sqrdmulh v3.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +ldr q19, [x0, #816] +mla v20.4S, v2.4S, v31.s[0] +sub v2.4s, v16.4s, v24.4s +add v16.4s, v16.4s, v24.4s +sqrdmulh v24.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +ldr q25, [x0, #560] +mla v17.4S, v3.4S, v31.s[0] +sub v3.4s, v26.4s, v20.4s +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v29.s[0] +str q23, [x0, #32] +mul v19.4S, v19.4S,v30.s[0] +ldr q23, [x0, #688] +mla v18.4S, v24.4S, v31.s[0] +sub v24.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v25.4S, v29.s[0] +str q22, [x0, #160] +mul v25.4S, v25.4S,v30.s[0] +ldr q22, [x0, #432] +mla v19.4S, v20.4S, v31.s[0] +sub v20.4s, v22.4s, v18.4s +add v22.4s, v22.4s, v18.4s +sqrdmulh v18.4S, v23.4S, v29.s[0] +str q16, [x0, #288] +mul v23.4S, v23.4S,v30.s[0] +ldr q16, [x0, #304] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, 
v16.4s, v19.4s +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v22.4S, v29.s[1] +str q2, [x0, #416] +mul v22.4S, v22.4S,v30.s[1] +ldr q2, [x0, #48] +mla v23.4S, v18.4S, v31.s[0] +sub v18.4s, v2.4s, v25.4s +add v2.4s, v2.4s, v25.4s +sqrdmulh v25.4S, v16.4S, v29.s[1] +str q26, [x0, #800] +mul v16.4S, v16.4S,v30.s[1] +ldr q26, [x0, #176] +mla v22.4S, v19.4S, v31.s[0] +sub v19.4s, v26.4s, v23.4s +add v26.4s, v26.4s, v23.4s +sqrdmulh v23.4S, v20.4S, v29.s[2] +str q3, [x0, #928] +mul v20.4S, v20.4S,v30.s[2] +mla v16.4S, v25.4S, v31.s[0] +sub v25.4s, v26.4s, v22.4s +add v26.4s, v26.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v29.s[2] +str q21, [x0, #544] +mul v17.4S, v17.4S,v30.s[2] +mla v20.4S, v23.4S, v31.s[0] +sub v23.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v26.4S, v27.s[0] +str q24, [x0, #672] +mul v26.4S, v26.4S,v28.s[0] +mla v17.4S, v22.4S, v31.s[0] +sub v22.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +sqrdmulh v20.4S, v25.4S, v27.s[1] +mul v25.4S, v25.4S,v28.s[1] +mla v26.4S, v16.4S, v31.s[0] +sub v16.4s, v18.4s, v17.4s +add v18.4s, v18.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v27.s[3] +mul v22.4S, v22.4S,v28.s[3] +mla v25.4S, v20.4S, v31.s[0] +sub v20.4s, v2.4s, v26.4s +add v2.4s, v2.4s, v26.4s +sqrdmulh v26.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v28.s[2] +mla v22.4S, v17.4S, v31.s[0] +sub v17.4s, v23.4s, v25.4s +add v23.4s, v23.4s, v25.4s +mla v19.4S, v26.4S, v31.s[0] +sub v26.4s, v16.4s, v22.4s +add v16.4s, v16.4s, v22.4s +str q2, [x0, #48] +sub v2.4s, v18.4s, v19.4s +add v18.4s, v18.4s, v19.4s +str q20, [x0, #176] +str q23, [x0, #304] +str q17, [x0, #432] +str q16, [x0, #816] +str q26, [x0, #944] +str q18, [x0, #560] +str q2, [x0, #688] +ldr q4, [x17, #+64] +ldr q5, [x17, #+80] +ldr q6, [x17, #+96] +ldr q7, [x17, #+112] +ldr q8, [x0, #112] +ldr q9, [x0, #96] +sqrdmulh v10.4S, v8.4S, v5.s[0] +mul v8.4S, v8.4S,v4.s[0] +ldr q11, [x0, #64] +sqrdmulh v12.4S, v9.4S, v5.s[0] +mul v9.4S, v9.4S,v4.s[0] +ldr q13, [x0, #80] +mla v8.4S, v10.4S, v31.s[0] 
+sqrdmulh v10.4S, v11.4S, v5.s[0] +mul v11.4S, v11.4S,v4.s[0] +ldr q14, [x0, #48] +mla v9.4S, v12.4S, v31.s[0] +sub v12.4s, v14.4s, v8.4s +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +ldr q15, [x0, #32] +mla v11.4S, v10.4S, v31.s[0] +sub v10.4s, v15.4s, v9.4s +add v15.4s, v15.4s, v9.4s +sqrdmulh v9.4S, v14.4S, v5.s[1] +mul v14.4S, v14.4S,v4.s[1] +ldr q0, [x0, #0] +mla v13.4S, v8.4S, v31.s[0] +sub v8.4s, v0.4s, v11.4s +add v0.4s, v0.4s, v11.4s +sqrdmulh v11.4S, v15.4S, v5.s[1] +mul v15.4S, v15.4S,v4.s[1] +ldr q1, [x0, #16] +mla v14.4S, v9.4S, v31.s[0] +sub v9.4s, v1.4s, v13.4s +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v12.4S, v5.s[2] +mul v12.4S, v12.4S,v4.s[2] +mla v15.4S, v11.4S, v31.s[0] +sub v11.4s, v1.4s, v14.4s +add v1.4s, v1.4s, v14.4s +sqrdmulh v14.4S, v10.4S, v5.s[2] +mul v10.4S, v10.4S,v4.s[2] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v0.4s, v15.4s +add v0.4s, v0.4s, v15.4s +ldr q15, [x17, #+128] +sqrdmulh v3.4S, v1.4S, v7.s[0] +mul v1.4S, v1.4S,v6.s[0] +mla v10.4S, v14.4S, v31.s[0] +sub v14.4s, v9.4s, v12.4s +add v9.4s, v9.4s, v12.4s +ldr q12, [x17, #+144] +sqrdmulh v21.4S, v11.4S, v7.s[1] +mul v11.4S, v11.4S,v6.s[1] +mla v1.4S, v3.4S, v31.s[0] +sub v3.4s, v8.4s, v10.4s +add v8.4s, v8.4s, v10.4s +ldr q10, [x17, #+160] +ldr q24, [x17, #+176] +sqrdmulh v25.4S, v14.4S, v7.s[3] +mul v14.4S, v14.4S,v6.s[3] +ldr q22, [x0, #240] +mla v11.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v1.4s +add v0.4s, v0.4s, v1.4s +sqrdmulh v1.4S, v9.4S, v7.s[2] +mul v9.4S, v9.4S,v6.s[2] +ldr q19, [x0, #224] +mla v14.4S, v25.4S, v31.s[0] +sub v25.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v12.s[0] +mul v22.4S, v22.4S,v15.s[0] +ldr q30, [x0, #192] +mla v9.4S, v1.4S, v31.s[0] +sub v1.4s, v3.4s, v14.4s +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v19.4S, v12.s[0] +str q0, [x0, #0] +mul v19.4S, v19.4S,v15.s[0] +ldr q0, [x0, #208] +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v8.4s, v9.4s +add v8.4s, v8.4s, v9.4s 
+sqrdmulh v7.4S, v30.4S, v12.s[0] +str q21, [x0, #16] +mul v30.4S, v30.4S,v15.s[0] +ldr q21, [x0, #176] +mla v19.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v0.4S, v12.s[0] +str q13, [x0, #32] +mul v0.4S, v0.4S,v15.s[0] +ldr q13, [x0, #160] +mla v30.4S, v7.4S, v31.s[0] +sub v7.4s, v13.4s, v19.4s +add v13.4s, v13.4s, v19.4s +sqrdmulh v19.4S, v21.4S, v12.s[1] +str q25, [x0, #48] +mul v21.4S, v21.4S,v15.s[1] +ldr q25, [x0, #128] +mla v0.4S, v22.4S, v31.s[0] +sub v22.4s, v25.4s, v30.4s +add v25.4s, v25.4s, v30.4s +sqrdmulh v30.4S, v13.4S, v12.s[1] +str q3, [x0, #96] +mul v13.4S, v13.4S,v15.s[1] +ldr q3, [x0, #144] +mla v21.4S, v19.4S, v31.s[0] +sub v19.4s, v3.4s, v0.4s +add v3.4s, v3.4s, v0.4s +sqrdmulh v0.4S, v14.4S, v12.s[2] +str q1, [x0, #112] +mul v14.4S, v14.4S,v15.s[2] +mla v13.4S, v30.4S, v31.s[0] +sub v30.4s, v3.4s, v21.4s +add v3.4s, v3.4s, v21.4s +sqrdmulh v21.4S, v7.4S, v12.s[2] +str q8, [x0, #64] +mul v7.4S, v7.4S,v15.s[2] +mla v14.4S, v0.4S, v31.s[0] +sub v0.4s, v25.4s, v13.4s +add v25.4s, v25.4s, v13.4s +ldr q13, [x17, #+192] +sqrdmulh v8.4S, v3.4S, v24.s[0] +str q11, [x0, #80] +mul v3.4S, v3.4S,v10.s[0] +mla v7.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v14.4s +add v19.4s, v19.4s, v14.4s +ldr q14, [x17, #+208] +sqrdmulh v11.4S, v30.4S, v24.s[1] +mul v30.4S, v30.4S,v10.s[1] +mla v3.4S, v8.4S, v31.s[0] +sub v8.4s, v22.4s, v7.4s +add v22.4s, v22.4s, v7.4s +ldr q7, [x17, #+224] +ldr q1, [x17, #+240] +sqrdmulh v6.4S, v21.4S, v24.s[3] +mul v21.4S, v21.4S,v10.s[3] +ldr q5, [x0, #368] +mla v30.4S, v11.4S, v31.s[0] +sub v11.4s, v25.4s, v3.4s +add v25.4s, v25.4s, v3.4s +sqrdmulh v3.4S, v19.4S, v24.s[2] +mul v19.4S, v19.4S,v10.s[2] +ldr q4, [x0, #352] +mla v21.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v30.4s +add v0.4s, v0.4s, v30.4s +sqrdmulh v30.4S, v5.4S, v14.s[0] +mul v5.4S, v5.4S,v13.s[0] +ldr q9, [x0, #320] +mla v19.4S, v3.4S, v31.s[0] +sub v3.4s, v8.4s, v21.4s +add v8.4s, v8.4s, v21.4s +sqrdmulh v21.4S, v4.4S, 
v14.s[0] +str q25, [x0, #128] +mul v4.4S, v4.4S,v13.s[0] +ldr q25, [x0, #336] +mla v5.4S, v30.4S, v31.s[0] +sub v30.4s, v22.4s, v19.4s +add v22.4s, v22.4s, v19.4s +sqrdmulh v24.4S, v9.4S, v14.s[0] +str q11, [x0, #144] +mul v9.4S, v9.4S,v13.s[0] +ldr q11, [x0, #304] +mla v4.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v5.4s +add v11.4s, v11.4s, v5.4s +sqrdmulh v5.4S, v25.4S, v14.s[0] +str q0, [x0, #160] +mul v25.4S, v25.4S,v13.s[0] +ldr q0, [x0, #288] +mla v9.4S, v24.4S, v31.s[0] +sub v24.4s, v0.4s, v4.4s +add v0.4s, v0.4s, v4.4s +sqrdmulh v4.4S, v11.4S, v14.s[1] +str q6, [x0, #176] +mul v11.4S, v11.4S,v13.s[1] +ldr q6, [x0, #256] +mla v25.4S, v5.4S, v31.s[0] +sub v5.4s, v6.4s, v9.4s +add v6.4s, v6.4s, v9.4s +sqrdmulh v9.4S, v0.4S, v14.s[1] +str q8, [x0, #224] +mul v0.4S, v0.4S,v13.s[1] +ldr q8, [x0, #272] +mla v11.4S, v4.4S, v31.s[0] +sub v4.4s, v8.4s, v25.4s +add v8.4s, v8.4s, v25.4s +sqrdmulh v25.4S, v21.4S, v14.s[2] +str q3, [x0, #240] +mul v21.4S, v21.4S,v13.s[2] +mla v0.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +sqrdmulh v11.4S, v24.4S, v14.s[2] +str q22, [x0, #192] +mul v24.4S, v24.4S,v13.s[2] +mla v21.4S, v25.4S, v31.s[0] +sub v25.4s, v6.4s, v0.4s +add v6.4s, v6.4s, v0.4s +ldr q0, [x17, #+256] +sqrdmulh v22.4S, v8.4S, v1.s[0] +str q30, [x0, #208] +mul v8.4S, v8.4S,v7.s[0] +mla v24.4S, v11.4S, v31.s[0] +sub v11.4s, v4.4s, v21.4s +add v4.4s, v4.4s, v21.4s +ldr q21, [x17, #+272] +sqrdmulh v30.4S, v9.4S, v1.s[1] +mul v9.4S, v9.4S,v7.s[1] +mla v8.4S, v22.4S, v31.s[0] +sub v22.4s, v5.4s, v24.4s +add v5.4s, v5.4s, v24.4s +ldr q24, [x17, #+288] +ldr q3, [x17, #+304] +sqrdmulh v10.4S, v11.4S, v1.s[3] +mul v11.4S, v11.4S,v7.s[3] +ldr q12, [x0, #496] +mla v9.4S, v30.4S, v31.s[0] +sub v30.4s, v6.4s, v8.4s +add v6.4s, v6.4s, v8.4s +sqrdmulh v8.4S, v4.4S, v1.s[2] +mul v4.4S, v4.4S,v7.s[2] +ldr q15, [x0, #480] +mla v11.4S, v10.4S, v31.s[0] +sub v10.4s, v25.4s, v9.4s +add v25.4s, v25.4s, v9.4s +sqrdmulh v9.4S, v12.4S, v21.s[0] +mul v12.4S, 
v12.4S,v0.s[0] +ldr q19, [x0, #448] +mla v4.4S, v8.4S, v31.s[0] +sub v8.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v15.4S, v21.s[0] +str q6, [x0, #256] +mul v15.4S, v15.4S,v0.s[0] +ldr q6, [x0, #464] +mla v12.4S, v9.4S, v31.s[0] +sub v9.4s, v5.4s, v4.4s +add v5.4s, v5.4s, v4.4s +sqrdmulh v1.4S, v19.4S, v21.s[0] +str q30, [x0, #272] +mul v19.4S, v19.4S,v0.s[0] +ldr q30, [x0, #432] +mla v15.4S, v11.4S, v31.s[0] +sub v11.4s, v30.4s, v12.4s +add v30.4s, v30.4s, v12.4s +sqrdmulh v12.4S, v6.4S, v21.s[0] +str q25, [x0, #288] +mul v6.4S, v6.4S,v0.s[0] +ldr q25, [x0, #416] +mla v19.4S, v1.4S, v31.s[0] +sub v1.4s, v25.4s, v15.4s +add v25.4s, v25.4s, v15.4s +sqrdmulh v15.4S, v30.4S, v21.s[1] +str q10, [x0, #304] +mul v30.4S, v30.4S,v0.s[1] +ldr q10, [x0, #384] +mla v6.4S, v12.4S, v31.s[0] +sub v12.4s, v10.4s, v19.4s +add v10.4s, v10.4s, v19.4s +sqrdmulh v19.4S, v25.4S, v21.s[1] +str q22, [x0, #352] +mul v25.4S, v25.4S,v0.s[1] +ldr q22, [x0, #400] +mla v30.4S, v15.4S, v31.s[0] +sub v15.4s, v22.4s, v6.4s +add v22.4s, v22.4s, v6.4s +sqrdmulh v6.4S, v11.4S, v21.s[2] +str q8, [x0, #368] +mul v11.4S, v11.4S,v0.s[2] +mla v25.4S, v19.4S, v31.s[0] +sub v19.4s, v22.4s, v30.4s +add v22.4s, v22.4s, v30.4s +sqrdmulh v30.4S, v1.4S, v21.s[2] +str q5, [x0, #320] +mul v1.4S, v1.4S,v0.s[2] +mla v11.4S, v6.4S, v31.s[0] +sub v6.4s, v10.4s, v25.4s +add v10.4s, v10.4s, v25.4s +ldr q25, [x17, #+320] +sqrdmulh v5.4S, v22.4S, v3.s[0] +str q9, [x0, #336] +mul v22.4S, v22.4S,v24.s[0] +mla v1.4S, v30.4S, v31.s[0] +sub v30.4s, v15.4s, v11.4s +add v15.4s, v15.4s, v11.4s +ldr q11, [x17, #+336] +sqrdmulh v9.4S, v19.4S, v3.s[1] +mul v19.4S, v19.4S,v24.s[1] +mla v22.4S, v5.4S, v31.s[0] +sub v5.4s, v12.4s, v1.4s +add v12.4s, v12.4s, v1.4s +ldr q1, [x17, #+352] +ldr q8, [x17, #+368] +sqrdmulh v7.4S, v30.4S, v3.s[3] +mul v30.4S, v30.4S,v24.s[3] +ldr q14, [x0, #624] +mla v19.4S, v9.4S, v31.s[0] +sub v9.4s, v10.4s, v22.4s +add v10.4s, v10.4s, v22.4s +sqrdmulh v22.4S, v15.4S, v3.s[2] +mul 
v15.4S, v15.4S,v24.s[2] +ldr q13, [x0, #608] +mla v30.4S, v7.4S, v31.s[0] +sub v7.4s, v6.4s, v19.4s +add v6.4s, v6.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v11.s[0] +mul v14.4S, v14.4S,v25.s[0] +ldr q4, [x0, #576] +mla v15.4S, v22.4S, v31.s[0] +sub v22.4s, v5.4s, v30.4s +add v5.4s, v5.4s, v30.4s +sqrdmulh v30.4S, v13.4S, v11.s[0] +str q10, [x0, #384] +mul v13.4S, v13.4S,v25.s[0] +ldr q10, [x0, #592] +mla v14.4S, v19.4S, v31.s[0] +sub v19.4s, v12.4s, v15.4s +add v12.4s, v12.4s, v15.4s +sqrdmulh v3.4S, v4.4S, v11.s[0] +str q9, [x0, #400] +mul v4.4S, v4.4S,v25.s[0] +ldr q9, [x0, #560] +mla v13.4S, v30.4S, v31.s[0] +sub v30.4s, v9.4s, v14.4s +add v9.4s, v9.4s, v14.4s +sqrdmulh v14.4S, v10.4S, v11.s[0] +str q6, [x0, #416] +mul v10.4S, v10.4S,v25.s[0] +ldr q6, [x0, #544] +mla v4.4S, v3.4S, v31.s[0] +sub v3.4s, v6.4s, v13.4s +add v6.4s, v6.4s, v13.4s +sqrdmulh v13.4S, v9.4S, v11.s[1] +str q7, [x0, #432] +mul v9.4S, v9.4S,v25.s[1] +ldr q7, [x0, #512] +mla v10.4S, v14.4S, v31.s[0] +sub v14.4s, v7.4s, v4.4s +add v7.4s, v7.4s, v4.4s +sqrdmulh v4.4S, v6.4S, v11.s[1] +str q5, [x0, #480] +mul v6.4S, v6.4S,v25.s[1] +ldr q5, [x0, #528] +mla v9.4S, v13.4S, v31.s[0] +sub v13.4s, v5.4s, v10.4s +add v5.4s, v5.4s, v10.4s +sqrdmulh v10.4S, v30.4S, v11.s[2] +str q22, [x0, #496] +mul v30.4S, v30.4S,v25.s[2] +mla v6.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v9.4s +add v5.4s, v5.4s, v9.4s +sqrdmulh v9.4S, v3.4S, v11.s[2] +str q12, [x0, #448] +mul v3.4S, v3.4S,v25.s[2] +mla v30.4S, v10.4S, v31.s[0] +sub v10.4s, v7.4s, v6.4s +add v7.4s, v7.4s, v6.4s +ldr q6, [x17, #+384] +sqrdmulh v12.4S, v5.4S, v8.s[0] +str q19, [x0, #464] +mul v5.4S, v5.4S,v1.s[0] +mla v3.4S, v9.4S, v31.s[0] +sub v9.4s, v13.4s, v30.4s +add v13.4s, v13.4s, v30.4s +ldr q30, [x17, #+400] +sqrdmulh v19.4S, v4.4S, v8.s[1] +mul v4.4S, v4.4S,v1.s[1] +mla v5.4S, v12.4S, v31.s[0] +sub v12.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +ldr q3, [x17, #+416] +ldr q22, [x17, #+432] +sqrdmulh v24.4S, v9.4S, v8.s[3] +mul v9.4S, v9.4S,v1.s[3] +ldr 
q21, [x0, #752] +mla v4.4S, v19.4S, v31.s[0] +sub v19.4s, v7.4s, v5.4s +add v7.4s, v7.4s, v5.4s +sqrdmulh v5.4S, v13.4S, v8.s[2] +mul v13.4S, v13.4S,v1.s[2] +ldr q0, [x0, #736] +mla v9.4S, v24.4S, v31.s[0] +sub v24.4s, v10.4s, v4.4s +add v10.4s, v10.4s, v4.4s +sqrdmulh v4.4S, v21.4S, v30.s[0] +mul v21.4S, v21.4S,v6.s[0] +ldr q15, [x0, #704] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v12.4s, v9.4s +add v12.4s, v12.4s, v9.4s +sqrdmulh v9.4S, v0.4S, v30.s[0] +str q7, [x0, #512] +mul v0.4S, v0.4S,v6.s[0] +ldr q7, [x0, #720] +mla v21.4S, v4.4S, v31.s[0] +sub v4.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +sqrdmulh v8.4S, v15.4S, v30.s[0] +str q19, [x0, #528] +mul v15.4S, v15.4S,v6.s[0] +ldr q19, [x0, #688] +mla v0.4S, v9.4S, v31.s[0] +sub v9.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +sqrdmulh v21.4S, v7.4S, v30.s[0] +str q10, [x0, #544] +mul v7.4S, v7.4S,v6.s[0] +ldr q10, [x0, #672] +mla v15.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v0.4s +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v19.4S, v30.s[1] +str q24, [x0, #560] +mul v19.4S, v19.4S,v6.s[1] +ldr q24, [x0, #640] +mla v7.4S, v21.4S, v31.s[0] +sub v21.4s, v24.4s, v15.4s +add v24.4s, v24.4s, v15.4s +sqrdmulh v15.4S, v10.4S, v30.s[1] +str q12, [x0, #608] +mul v10.4S, v10.4S,v6.s[1] +ldr q12, [x0, #656] +mla v19.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v7.4s +add v12.4s, v12.4s, v7.4s +sqrdmulh v7.4S, v9.4S, v30.s[2] +str q5, [x0, #624] +mul v9.4S, v9.4S,v6.s[2] +mla v10.4S, v15.4S, v31.s[0] +sub v15.4s, v12.4s, v19.4s +add v12.4s, v12.4s, v19.4s +sqrdmulh v19.4S, v8.4S, v30.s[2] +str q14, [x0, #576] +mul v8.4S, v8.4S,v6.s[2] +mla v9.4S, v7.4S, v31.s[0] +sub v7.4s, v24.4s, v10.4s +add v24.4s, v24.4s, v10.4s +ldr q10, [x17, #+448] +sqrdmulh v14.4S, v12.4S, v22.s[0] +str q4, [x0, #592] +mul v12.4S, v12.4S,v3.s[0] +mla v8.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v9.4s +add v0.4s, v0.4s, v9.4s +ldr q9, [x17, #+464] +sqrdmulh v4.4S, v15.4S, v22.s[1] +mul v15.4S, v15.4S,v3.s[1] +mla v12.4S, v14.4S, v31.s[0] +sub v14.4s, 
v21.4s, v8.4s +add v21.4s, v21.4s, v8.4s +ldr q8, [x17, #+480] +ldr q5, [x17, #+496] +sqrdmulh v1.4S, v19.4S, v22.s[3] +mul v19.4S, v19.4S,v3.s[3] +ldr q11, [x0, #880] +mla v15.4S, v4.4S, v31.s[0] +sub v4.4s, v24.4s, v12.4s +add v24.4s, v24.4s, v12.4s +sqrdmulh v12.4S, v0.4S, v22.s[2] +mul v0.4S, v0.4S,v3.s[2] +ldr q25, [x0, #864] +mla v19.4S, v1.4S, v31.s[0] +sub v1.4s, v7.4s, v15.4s +add v7.4s, v7.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v9.s[0] +mul v11.4S, v11.4S,v10.s[0] +ldr q13, [x0, #832] +mla v0.4S, v12.4S, v31.s[0] +sub v12.4s, v14.4s, v19.4s +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v25.4S, v9.s[0] +str q24, [x0, #640] +mul v25.4S, v25.4S,v10.s[0] +ldr q24, [x0, #848] +mla v11.4S, v15.4S, v31.s[0] +sub v15.4s, v21.4s, v0.4s +add v21.4s, v21.4s, v0.4s +sqrdmulh v22.4S, v13.4S, v9.s[0] +str q4, [x0, #656] +mul v13.4S, v13.4S,v10.s[0] +ldr q4, [x0, #816] +mla v25.4S, v19.4S, v31.s[0] +sub v19.4s, v4.4s, v11.4s +add v4.4s, v4.4s, v11.4s +sqrdmulh v11.4S, v24.4S, v9.s[0] +str q7, [x0, #672] +mul v24.4S, v24.4S,v10.s[0] +ldr q7, [x0, #800] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v7.4s, v25.4s +add v7.4s, v7.4s, v25.4s +sqrdmulh v25.4S, v4.4S, v9.s[1] +str q1, [x0, #688] +mul v4.4S, v4.4S,v10.s[1] +ldr q1, [x0, #768] +mla v24.4S, v11.4S, v31.s[0] +sub v11.4s, v1.4s, v13.4s +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v7.4S, v9.s[1] +str q14, [x0, #736] +mul v7.4S, v7.4S,v10.s[1] +ldr q14, [x0, #784] +mla v4.4S, v25.4S, v31.s[0] +sub v25.4s, v14.4s, v24.4s +add v14.4s, v14.4s, v24.4s +sqrdmulh v24.4S, v19.4S, v9.s[2] +str q12, [x0, #752] +mul v19.4S, v19.4S,v10.s[2] +mla v7.4S, v13.4S, v31.s[0] +sub v13.4s, v14.4s, v4.4s +add v14.4s, v14.4s, v4.4s +sqrdmulh v4.4S, v22.4S, v9.s[2] +str q21, [x0, #704] +mul v22.4S, v22.4S,v10.s[2] +mla v19.4S, v24.4S, v31.s[0] +sub v24.4s, v1.4s, v7.4s +add v1.4s, v1.4s, v7.4s +ldr q7, [x17, #+512] +sqrdmulh v21.4S, v14.4S, v5.s[0] +str q15, [x0, #720] +mul v14.4S, v14.4S,v8.s[0] +mla v22.4S, v4.4S, v31.s[0] +sub v4.4s, v25.4s, 
v19.4s +add v25.4s, v25.4s, v19.4s +ldr q19, [x17, #+528] +sqrdmulh v15.4S, v13.4S, v5.s[1] +mul v13.4S, v13.4S,v8.s[1] +mla v14.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +ldr q22, [x17, #+544] +ldr q12, [x17, #+560] +sqrdmulh v3.4S, v4.4S, v5.s[3] +mul v4.4S, v4.4S,v8.s[3] +ldr q30, [x0, #1008] +mla v13.4S, v15.4S, v31.s[0] +sub v15.4s, v1.4s, v14.4s +add v1.4s, v1.4s, v14.4s +sqrdmulh v14.4S, v25.4S, v5.s[2] +mul v25.4S, v25.4S,v8.s[2] +ldr q6, [x0, #992] +mla v4.4S, v3.4S, v31.s[0] +sub v3.4s, v24.4s, v13.4s +add v24.4s, v24.4s, v13.4s +sqrdmulh v13.4S, v30.4S, v19.s[0] +mul v30.4S, v30.4S,v7.s[0] +ldr q0, [x0, #960] +mla v25.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v4.4s +add v21.4s, v21.4s, v4.4s +sqrdmulh v4.4S, v6.4S, v19.s[0] +str q1, [x0, #768] +mul v6.4S, v6.4S,v7.s[0] +ldr q1, [x0, #976] +mla v30.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v25.4s +add v11.4s, v11.4s, v25.4s +sqrdmulh v5.4S, v0.4S, v19.s[0] +str q15, [x0, #784] +mul v0.4S, v0.4S,v7.s[0] +ldr q15, [x0, #944] +mla v6.4S, v4.4S, v31.s[0] +sub v4.4s, v15.4s, v30.4s +add v15.4s, v15.4s, v30.4s +sqrdmulh v30.4S, v1.4S, v19.s[0] +str q24, [x0, #800] +mul v1.4S, v1.4S,v7.s[0] +ldr q24, [x0, #928] +mla v0.4S, v5.4S, v31.s[0] +sub v5.4s, v24.4s, v6.4s +add v24.4s, v24.4s, v6.4s +sqrdmulh v6.4S, v15.4S, v19.s[1] +str q3, [x0, #816] +mul v15.4S, v15.4S,v7.s[1] +ldr q3, [x0, #896] +mla v1.4S, v30.4S, v31.s[0] +sub v30.4s, v3.4s, v0.4s +add v3.4s, v3.4s, v0.4s +sqrdmulh v0.4S, v24.4S, v19.s[1] +str q21, [x0, #864] +mul v24.4S, v24.4S,v7.s[1] +ldr q21, [x0, #912] +mla v15.4S, v6.4S, v31.s[0] +sub v6.4s, v21.4s, v1.4s +add v21.4s, v21.4s, v1.4s +sqrdmulh v1.4S, v4.4S, v19.s[2] +str q14, [x0, #880] +mul v4.4S, v4.4S,v7.s[2] +mla v24.4S, v0.4S, v31.s[0] +sub v0.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +sqrdmulh v15.4S, v5.4S, v19.s[2] +str q11, [x0, #832] +mul v5.4S, v5.4S,v7.s[2] +mla v4.4S, v1.4S, v31.s[0] +sub v1.4s, v3.4s, v24.4s +add v3.4s, v3.4s, v24.4s 
+sqrdmulh v24.4S, v21.4S, v12.s[0] +str q13, [x0, #848] +mul v21.4S, v21.4S,v22.s[0] +mla v5.4S, v15.4S, v31.s[0] +sub v15.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +sqrdmulh v4.4S, v0.4S, v12.s[1] +mul v0.4S, v0.4S,v22.s[1] +mla v21.4S, v24.4S, v31.s[0] +sub v24.4s, v30.4s, v5.4s +add v30.4s, v30.4s, v5.4s +sqrdmulh v5.4S, v15.4S, v12.s[3] +mul v15.4S, v15.4S,v22.s[3] +mla v0.4S, v4.4S, v31.s[0] +sub v4.4s, v3.4s, v21.4s +add v3.4s, v3.4s, v21.4s +sqrdmulh v21.4S, v6.4S, v12.s[2] +mul v6.4S, v6.4S,v22.s[2] +mla v15.4S, v5.4S, v31.s[0] +sub v5.4s, v1.4s, v0.4s +add v1.4s, v1.4s, v0.4s +mla v6.4S, v21.4S, v31.s[0] +sub v21.4s, v24.4s, v15.4s +add v24.4s, v24.4s, v15.4s +str q3, [x0, #896] +sub v3.4s, v30.4s, v6.4s +add v30.4s, v30.4s, v6.4s +str q4, [x0, #912] +str q1, [x0, #928] +str q5, [x0, #944] +str q24, [x0, #992] +str q21, [x0, #1008] +str q30, [x0, #960] +str q3, [x0, #976] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1444 +// Instruction count: 1440 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_3_3_3.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_3_3_3.s new file mode 100644 index 0000000..eebf2a2 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_3_3_3.s @@ -0,0 +1,1474 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, 
publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include // NOTE(review): no header operand is present on this directive (possibly an angle-bracket name lost in extraction); ASM_LOAD is used by the routine below but is not defined in this file, so it presumably comes from this include - confirm against upstream. +modulus: // One 16-byte vector: lane 0 holds the negated modulus -33556993, lanes 1-3 are zero. The routine below loads it into q31 and uses v31.s[0] as the by-element operand of each mla that follows a sqrdmulh/mul pair. +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 // NOTE(review): on AArch64 assemblers this is the power-of-two form, i.e. 64-byte alignment for the table that follows - confirm for the toolchain in use. +roots_merged: // Twiddle table, read through x17 in 16-byte vectors of four .word lanes. Vectors come in pairs tagged with the same layer/block: the first of a pair is used as the by-element operand of mul, the second as the by-element operand of sqrdmulh. The second value looks like round(root * 2^31 / 33556993) - TODO confirm against the generator. Entries tagged "Layer None, block None" are zero lanes with no root assigned. +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 23825509 // Layer 4, block 0 +.word 27028662 // Layer 4, block 1 +.word 0 // Layer None, block None +.word 1307297022 // Layer 3, block 0 +.word 1524716204 // Layer 4, block 0 +.word 1729702351 // Layer 4, block 1 +.word 0 // Layer None, block None +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 341080 // Layer 5, block 2 +.word
21220783 // Layer 5, block 3 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 14626653 // Layer 3, block 1 +.word 14833295 // Layer 4, block 2 +.word 2138810 // Layer 4, block 3 +.word 0 // Layer None, block None +.word 936034350 // Layer 3, block 1 +.word 949258429 // Layer 4, block 2 +.word 136873393 // Layer 4, block 3 +.word 0 // Layer None, block None +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 29737761 // Layer 3, block 2 +.word 6490403 // Layer 4, block 4 +.word 19648405 // Layer 4, block 5 +.word 0 // Layer None, block None +.word 1903071454 // Layer 3, block 2 +.word 415354091 // Layer 4, block 4 +.word 1257401950 // Layer 4, block 5 +.word 0 // Layer None, block None +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 30285189 // Layer 3, block 3 +.word 31254932 // Layer 4, block 6 +.word 26362414 // Layer 4, block 7 +.word 0 // Layer None, block None +.word 1938104173 // Layer 3, block 3 +.word 2000162988 // Layer 4, block 6 +.word 1687065733 // Layer 4, block 7 +.word 0 // Layer None, block None +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 21289485 // Layer 3, block 4 +.word 572895 // Layer 4, 
block 8 +.word 26691971 // Layer 4, block 9 +.word 0 // Layer None, block None +.word 1362423055 // Layer 3, block 4 +.word 36662482 // Layer 4, block 8 +.word 1708155771 // Layer 4, block 9 +.word 0 // Layer None, block None +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 9914896 // Layer 3, block 5 +.word 9249292 // Layer 4, block 10 +.word 29292862 // Layer 4, block 11 +.word 0 // Layer None, block None +.word 634504916 // Layer 3, block 5 +.word 591909511 // Layer 4, block 10 +.word 1874600091 // Layer 4, block 11 +.word 0 // Layer None, block None +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 22603682 // Layer 3, block 6 +.word 8247799 // Layer 4, block 12 +.word 5086187 // Layer 4, block 13 +.word 0 // Layer None, block None +.word 1446525244 // Layer 3, block 6 +.word 527818851 // Layer 4, block 12 +.word 325491125 // Layer 4, block 13 +.word 0 // Layer None, block None +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 16204162 // Layer 3, block 7 +.word 28113639 // Layer 4, block 14 +.word 8471290 // Layer 4, block 15 +.word 0 // Layer None, block None +.word 1036987221 // Layer 3, block 7 +.word 1799135579 // Layer 4, block 14 +.word 542121183 // Layer 4, block 15 +.word 0 // Layer None, 
block None +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.text +.global ntt_u32_incomplete_neon_asm_var_3_3_3 +.global _ntt_u32_incomplete_neon_asm_var_3_3_3 +ntt_u32_incomplete_neon_asm_var_3_3_3: +_ntt_u32_incomplete_neon_asm_var_3_3_3: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x0, #960] +ldr q29, [x0, #832] +ldr q28, [x0, #576] +ldr q27, [x0, #704] +ldr q26, [x0, #448] +ldr q25, [x17, #+0] +ldr q24, [x17, #+16] +ldr q23, [x17, #+32] +ldr q22, [x17, #+48] +ldr q21, [x0, #320] +ldr q20, [x0, #64] +ldr q19, [x0, #192] +sqrdmulh v18.4S, v30.4S, v24.s[0] +mul v30.4S, v30.4S,v25.s[0] +sqrdmulh v17.4S, v29.4S, v24.s[0] +mul v29.4S, v29.4S,v25.s[0] +mla v30.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v28.4S, v24.s[0] +mul v28.4S, v28.4S,v25.s[0] +ldr q16, [x0, #976] +mla v29.4S, v17.4S, v31.s[0] +sub v17.4s, v26.4s, v30.4s +add v26.4s, v26.4s, v30.4s +sqrdmulh v30.4S, v27.4S, v24.s[0] +mul v27.4S, v27.4S,v25.s[0] +ldr q3, [x0, #848] +mla v28.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v29.4s +add v21.4s, v21.4s, v29.4s +sqrdmulh v29.4S, v26.4S, v24.s[1] +mul v26.4S, v26.4S,v25.s[1] +ldr q2, [x0, #592] +mla v27.4S, v30.4S, v31.s[0] +sub v30.4s, v20.4s, v28.4s +add v20.4s, v20.4s, v28.4s +sqrdmulh v28.4S, v21.4S, v24.s[1] +mul v21.4S, v21.4S,v25.s[1] +ldr q1, [x0, #720] +mla v26.4S, v29.4S, v31.s[0] +sub v29.4s, v19.4s, 
v27.4s +add v19.4s, v19.4s, v27.4s +sqrdmulh v27.4S, v17.4S, v24.s[2] +mul v17.4S, v17.4S,v25.s[2] +ldr q0, [x0, #464] +mla v21.4S, v28.4S, v31.s[0] +sub v28.4s, v19.4s, v26.4s +add v19.4s, v19.4s, v26.4s +sqrdmulh v26.4S, v18.4S, v24.s[2] +mul v18.4S, v18.4S,v25.s[2] +ldr q15, [x0, #336] +mla v17.4S, v27.4S, v31.s[0] +sub v27.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +ldr q14, [x0, #80] +mla v18.4S, v26.4S, v31.s[0] +sub v26.4s, v29.4s, v17.4s +add v29.4s, v29.4s, v17.4s +sqrdmulh v17.4S, v28.4S, v22.s[1] +mul v28.4S, v28.4S,v23.s[1] +ldr q13, [x0, #208] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v30.4s, v18.4s +add v30.4s, v30.4s, v18.4s +sqrdmulh v18.4S, v26.4S, v22.s[3] +mul v26.4S, v26.4S,v23.s[3] +mla v28.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +sqrdmulh v19.4S, v29.4S, v22.s[2] +mul v29.4S, v29.4S,v23.s[2] +mla v26.4S, v18.4S, v31.s[0] +sub v18.4s, v27.4s, v28.4s +add v27.4s, v27.4s, v28.4s +sqrdmulh v28.4S, v16.4S, v24.s[0] +mul v16.4S, v16.4S,v25.s[0] +mla v29.4S, v19.4S, v31.s[0] +sub v19.4s, v21.4s, v26.4s +add v21.4s, v21.4s, v26.4s +sqrdmulh v26.4S, v3.4S, v24.s[0] +mul v3.4S, v3.4S,v25.s[0] +mla v16.4S, v28.4S, v31.s[0] +sub v28.4s, v30.4s, v29.4s +add v30.4s, v30.4s, v29.4s +sqrdmulh v29.4S, v2.4S, v24.s[0] +mul v2.4S, v2.4S,v25.s[0] +ldr q12, [x0, #992] +mla v3.4S, v26.4S, v31.s[0] +sub v26.4s, v0.4s, v16.4s +add v0.4s, v0.4s, v16.4s +sqrdmulh v16.4S, v1.4S, v24.s[0] +mul v1.4S, v1.4S,v25.s[0] +ldr q11, [x0, #864] +mla v2.4S, v29.4S, v31.s[0] +sub v29.4s, v15.4s, v3.4s +add v15.4s, v15.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v24.s[1] +str q20, [x0, #64] +mul v0.4S, v0.4S,v25.s[1] +ldr q20, [x0, #608] +mla v1.4S, v16.4S, v31.s[0] +sub v16.4s, v14.4s, v2.4s +add v14.4s, v14.4s, v2.4s +sqrdmulh v2.4S, v15.4S, v24.s[1] +str q17, [x0, #192] +mul v15.4S, v15.4S,v25.s[1] +ldr q17, [x0, #736] +mla v0.4S, v3.4S, v31.s[0] +sub v3.4s, v13.4s, v1.4s +add v13.4s, 
v13.4s, v1.4s +sqrdmulh v1.4S, v26.4S, v24.s[2] +str q27, [x0, #320] +mul v26.4S, v26.4S,v25.s[2] +ldr q27, [x0, #480] +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v13.4s, v0.4s +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v29.4S, v24.s[2] +str q18, [x0, #448] +mul v29.4S, v29.4S,v25.s[2] +ldr q18, [x0, #352] +mla v26.4S, v1.4S, v31.s[0] +sub v1.4s, v14.4s, v15.4s +add v14.4s, v14.4s, v15.4s +sqrdmulh v15.4S, v13.4S, v22.s[0] +str q21, [x0, #832] +mul v13.4S, v13.4S,v23.s[0] +ldr q21, [x0, #96] +mla v29.4S, v0.4S, v31.s[0] +sub v0.4s, v3.4s, v26.4s +add v3.4s, v3.4s, v26.4s +sqrdmulh v26.4S, v2.4S, v22.s[1] +str q19, [x0, #960] +mul v2.4S, v2.4S,v23.s[1] +ldr q19, [x0, #224] +mla v13.4S, v15.4S, v31.s[0] +sub v15.4s, v16.4s, v29.4s +add v16.4s, v16.4s, v29.4s +sqrdmulh v29.4S, v0.4S, v22.s[3] +str q30, [x0, #576] +mul v0.4S, v0.4S,v23.s[3] +mla v2.4S, v26.4S, v31.s[0] +sub v26.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +sqrdmulh v13.4S, v3.4S, v22.s[2] +str q28, [x0, #704] +mul v3.4S, v3.4S,v23.s[2] +mla v0.4S, v29.4S, v31.s[0] +sub v29.4s, v1.4s, v2.4s +add v1.4s, v1.4s, v2.4s +sqrdmulh v2.4S, v12.4S, v24.s[0] +mul v12.4S, v12.4S,v25.s[0] +mla v3.4S, v13.4S, v31.s[0] +sub v13.4s, v15.4s, v0.4s +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v11.4S, v24.s[0] +mul v11.4S, v11.4S,v25.s[0] +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v20.4S, v24.s[0] +mul v20.4S, v20.4S,v25.s[0] +ldr q28, [x0, #1008] +mla v11.4S, v0.4S, v31.s[0] +sub v0.4s, v27.4s, v12.4s +add v27.4s, v27.4s, v12.4s +sqrdmulh v12.4S, v17.4S, v24.s[0] +mul v17.4S, v17.4S,v25.s[0] +ldr q30, [x0, #880] +mla v20.4S, v3.4S, v31.s[0] +sub v3.4s, v18.4s, v11.4s +add v18.4s, v18.4s, v11.4s +sqrdmulh v11.4S, v27.4S, v24.s[1] +str q14, [x0, #80] +mul v27.4S, v27.4S,v25.s[1] +ldr q14, [x0, #624] +mla v17.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v18.4S, v24.s[1] +str q26, [x0, #208] +mul v18.4S, v18.4S,v25.s[1] 
+ldr q26, [x0, #752] +mla v27.4S, v11.4S, v31.s[0] +sub v11.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v24.s[2] +str q1, [x0, #336] +mul v0.4S, v0.4S,v25.s[2] +ldr q1, [x0, #496] +mla v18.4S, v20.4S, v31.s[0] +sub v20.4s, v19.4s, v27.4s +add v19.4s, v19.4s, v27.4s +sqrdmulh v27.4S, v3.4S, v24.s[2] +str q29, [x0, #464] +mul v3.4S, v3.4S,v25.s[2] +ldr q29, [x0, #368] +mla v0.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v18.4s +add v21.4s, v21.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v22.s[0] +str q15, [x0, #848] +mul v19.4S, v19.4S,v23.s[0] +ldr q15, [x0, #112] +mla v3.4S, v27.4S, v31.s[0] +sub v27.4s, v11.4s, v0.4s +add v11.4s, v11.4s, v0.4s +sqrdmulh v0.4S, v20.4S, v22.s[1] +str q13, [x0, #976] +mul v20.4S, v20.4S,v23.s[1] +ldr q13, [x0, #240] +mla v19.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v3.4s +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v27.4S, v22.s[3] +str q16, [x0, #592] +mul v27.4S, v27.4S,v23.s[3] +mla v20.4S, v0.4S, v31.s[0] +sub v0.4s, v21.4s, v19.4s +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v11.4S, v22.s[2] +str q2, [x0, #720] +mul v11.4S, v11.4S,v23.s[2] +mla v27.4S, v3.4S, v31.s[0] +sub v3.4s, v17.4s, v20.4s +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v28.4S, v24.s[0] +mul v28.4S, v28.4S,v25.s[0] +mla v11.4S, v19.4S, v31.s[0] +sub v19.4s, v18.4s, v27.4s +add v18.4s, v18.4s, v27.4s +sqrdmulh v27.4S, v30.4S, v24.s[0] +mul v30.4S, v30.4S,v25.s[0] +mla v28.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v11.4s +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v14.4S, v24.s[0] +mul v14.4S, v14.4S,v25.s[0] +ldr q2, [x0, #896] +mla v30.4S, v27.4S, v31.s[0] +sub v27.4s, v1.4s, v28.4s +add v1.4s, v1.4s, v28.4s +sqrdmulh v28.4S, v26.4S, v24.s[0] +mul v26.4S, v26.4S,v25.s[0] +ldr q16, [x0, #768] +mla v14.4S, v11.4S, v31.s[0] +sub v11.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +sqrdmulh v30.4S, v1.4S, v24.s[1] +str q21, [x0, #96] +mul v1.4S, v1.4S,v25.s[1] +ldr q21, [x0, #512] +mla v26.4S, v28.4S, v31.s[0] +sub v28.4s, v15.4s, v14.4s +add 
v15.4s, v15.4s, v14.4s +sqrdmulh v14.4S, v29.4S, v24.s[1] +str q0, [x0, #224] +mul v29.4S, v29.4S,v25.s[1] +ldr q0, [x0, #640] +mla v1.4S, v30.4S, v31.s[0] +sub v30.4s, v13.4s, v26.4s +add v13.4s, v13.4s, v26.4s +sqrdmulh v26.4S, v27.4S, v24.s[2] +str q17, [x0, #352] +mul v27.4S, v27.4S,v25.s[2] +ldr q17, [x0, #384] +mla v29.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v11.4S, v24.s[2] +str q3, [x0, #480] +mul v11.4S, v11.4S,v25.s[2] +ldr q3, [x0, #256] +mla v27.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v29.4s +add v15.4s, v15.4s, v29.4s +sqrdmulh v29.4S, v13.4S, v22.s[0] +str q18, [x0, #864] +mul v13.4S, v13.4S,v23.s[0] +ldr q18, [x0, #0] +mla v11.4S, v1.4S, v31.s[0] +sub v1.4s, v30.4s, v27.4s +add v30.4s, v30.4s, v27.4s +sqrdmulh v27.4S, v14.4S, v22.s[1] +str q19, [x0, #992] +mul v14.4S, v14.4S,v23.s[1] +ldr q19, [x0, #128] +mla v13.4S, v29.4S, v31.s[0] +sub v29.4s, v28.4s, v11.4s +add v28.4s, v28.4s, v11.4s +sqrdmulh v11.4S, v1.4S, v22.s[3] +str q12, [x0, #608] +mul v1.4S, v1.4S,v23.s[3] +mla v14.4S, v27.4S, v31.s[0] +sub v27.4s, v15.4s, v13.4s +add v15.4s, v15.4s, v13.4s +sqrdmulh v13.4S, v30.4S, v22.s[2] +str q20, [x0, #736] +mul v30.4S, v30.4S,v23.s[2] +mla v1.4S, v11.4S, v31.s[0] +sub v11.4s, v26.4s, v14.4s +add v26.4s, v26.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v24.s[0] +mul v2.4S, v2.4S,v25.s[0] +mla v30.4S, v13.4S, v31.s[0] +sub v13.4s, v29.4s, v1.4s +add v29.4s, v29.4s, v1.4s +sqrdmulh v1.4S, v16.4S, v24.s[0] +mul v16.4S, v16.4S,v25.s[0] +mla v2.4S, v14.4S, v31.s[0] +sub v14.4s, v28.4s, v30.4s +add v28.4s, v28.4s, v30.4s +sqrdmulh v30.4S, v21.4S, v24.s[0] +mul v21.4S, v21.4S,v25.s[0] +ldr q20, [x0, #912] +mla v16.4S, v1.4S, v31.s[0] +sub v1.4s, v17.4s, v2.4s +add v17.4s, v17.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v24.s[0] +mul v0.4S, v0.4S,v25.s[0] +ldr q12, [x0, #784] +mla v21.4S, v30.4S, v31.s[0] +sub v30.4s, v3.4s, v16.4s +add v3.4s, v3.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v24.s[1] +str q15, [x0, #112] +mul 
v17.4S, v17.4S,v25.s[1] +ldr q15, [x0, #528] +mla v0.4S, v2.4S, v31.s[0] +sub v2.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v3.4S, v24.s[1] +str q27, [x0, #240] +mul v3.4S, v3.4S,v25.s[1] +ldr q27, [x0, #656] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v19.4s, v0.4s +add v19.4s, v19.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v24.s[2] +str q26, [x0, #368] +mul v1.4S, v1.4S,v25.s[2] +ldr q26, [x0, #400] +mla v3.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +sqrdmulh v17.4S, v30.4S, v24.s[2] +str q11, [x0, #496] +mul v30.4S, v30.4S,v25.s[2] +ldr q11, [x0, #272] +mla v1.4S, v0.4S, v31.s[0] +sub v0.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v19.4S, v22.s[0] +str q29, [x0, #880] +mul v19.4S, v19.4S,v23.s[0] +ldr q29, [x0, #16] +mla v30.4S, v17.4S, v31.s[0] +sub v17.4s, v16.4s, v1.4s +add v16.4s, v16.4s, v1.4s +sqrdmulh v1.4S, v21.4S, v22.s[1] +str q13, [x0, #1008] +mul v21.4S, v21.4S,v23.s[1] +ldr q13, [x0, #144] +mla v19.4S, v3.4S, v31.s[0] +sub v3.4s, v2.4s, v30.4s +add v2.4s, v2.4s, v30.4s +sqrdmulh v30.4S, v17.4S, v22.s[3] +str q28, [x0, #624] +mul v17.4S, v17.4S,v23.s[3] +mla v21.4S, v1.4S, v31.s[0] +sub v1.4s, v18.4s, v19.4s +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v22.s[2] +str q14, [x0, #752] +mul v16.4S, v16.4S,v23.s[2] +mla v17.4S, v30.4S, v31.s[0] +sub v30.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +sqrdmulh v21.4S, v20.4S, v24.s[0] +mul v20.4S, v20.4S,v25.s[0] +mla v16.4S, v19.4S, v31.s[0] +sub v19.4s, v3.4s, v17.4s +add v3.4s, v3.4s, v17.4s +sqrdmulh v17.4S, v12.4S, v24.s[0] +mul v12.4S, v12.4S,v25.s[0] +mla v20.4S, v21.4S, v31.s[0] +sub v21.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v15.4S, v24.s[0] +mul v15.4S, v15.4S,v25.s[0] +ldr q14, [x0, #928] +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v26.4s, v20.4s +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v27.4S, v24.s[0] +mul v27.4S, v27.4S,v25.s[0] +ldr q28, [x0, #800] +mla v15.4S, v16.4S, v31.s[0] +sub v16.4s, 
v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +sqrdmulh v12.4S, v26.4S, v24.s[1] +str q18, [x0, #0] +mul v26.4S, v26.4S,v25.s[1] +ldr q18, [x0, #544] +mla v27.4S, v20.4S, v31.s[0] +sub v20.4s, v29.4s, v15.4s +add v29.4s, v29.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v24.s[1] +str q1, [x0, #128] +mul v11.4S, v11.4S,v25.s[1] +ldr q1, [x0, #672] +mla v26.4S, v12.4S, v31.s[0] +sub v12.4s, v13.4s, v27.4s +add v13.4s, v13.4s, v27.4s +sqrdmulh v27.4S, v17.4S, v24.s[2] +str q0, [x0, #256] +mul v17.4S, v17.4S,v25.s[2] +ldr q0, [x0, #416] +mla v11.4S, v15.4S, v31.s[0] +sub v15.4s, v13.4s, v26.4s +add v13.4s, v13.4s, v26.4s +sqrdmulh v26.4S, v16.4S, v24.s[2] +str q30, [x0, #384] +mul v16.4S, v16.4S,v25.s[2] +ldr q30, [x0, #288] +mla v17.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v11.4s +add v29.4s, v29.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v22.s[0] +str q3, [x0, #768] +mul v13.4S, v13.4S,v23.s[0] +ldr q3, [x0, #32] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v12.4s, v17.4s +add v12.4s, v12.4s, v17.4s +sqrdmulh v17.4S, v15.4S, v22.s[1] +str q19, [x0, #896] +mul v15.4S, v15.4S,v23.s[1] +ldr q19, [x0, #160] +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v20.4s, v16.4s +add v20.4s, v20.4s, v16.4s +sqrdmulh v16.4S, v26.4S, v22.s[3] +str q2, [x0, #512] +mul v26.4S, v26.4S,v23.s[3] +mla v15.4S, v17.4S, v31.s[0] +sub v17.4s, v29.4s, v13.4s +add v29.4s, v29.4s, v13.4s +sqrdmulh v13.4S, v12.4S, v22.s[2] +str q21, [x0, #640] +mul v12.4S, v12.4S,v23.s[2] +mla v26.4S, v16.4S, v31.s[0] +sub v16.4s, v27.4s, v15.4s +add v27.4s, v27.4s, v15.4s +sqrdmulh v15.4S, v14.4S, v24.s[0] +mul v14.4S, v14.4S,v25.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v26.4s +add v11.4s, v11.4s, v26.4s +sqrdmulh v26.4S, v28.4S, v24.s[0] +mul v28.4S, v28.4S,v25.s[0] +mla v14.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +sqrdmulh v12.4S, v18.4S, v24.s[0] +mul v18.4S, v18.4S,v25.s[0] +ldr q21, [x0, #944] +mla v28.4S, v26.4S, v31.s[0] +sub v26.4s, v0.4s, v14.4s +add v0.4s, v0.4s, v14.4s 
+sqrdmulh v14.4S, v1.4S, v24.s[0] +mul v1.4S, v1.4S,v25.s[0] +ldr q2, [x0, #816] +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v30.4s, v28.4s +add v30.4s, v30.4s, v28.4s +sqrdmulh v28.4S, v0.4S, v24.s[1] +str q29, [x0, #16] +mul v0.4S, v0.4S,v25.s[1] +ldr q29, [x0, #560] +mla v1.4S, v14.4S, v31.s[0] +sub v14.4s, v3.4s, v18.4s +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v30.4S, v24.s[1] +str q17, [x0, #144] +mul v30.4S, v30.4S,v25.s[1] +ldr q17, [x0, #688] +mla v0.4S, v28.4S, v31.s[0] +sub v28.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v26.4S, v24.s[2] +str q27, [x0, #272] +mul v26.4S, v26.4S,v25.s[2] +ldr q27, [x0, #432] +mla v30.4S, v18.4S, v31.s[0] +sub v18.4s, v19.4s, v0.4s +add v19.4s, v19.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v24.s[2] +str q16, [x0, #400] +mul v12.4S, v12.4S,v25.s[2] +ldr q16, [x0, #304] +mla v26.4S, v1.4S, v31.s[0] +sub v1.4s, v3.4s, v30.4s +add v3.4s, v3.4s, v30.4s +sqrdmulh v30.4S, v19.4S, v22.s[0] +str q11, [x0, #784] +mul v19.4S, v19.4S,v23.s[0] +ldr q11, [x0, #48] +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sqrdmulh v26.4S, v18.4S, v22.s[1] +str q13, [x0, #912] +mul v18.4S, v18.4S,v23.s[1] +ldr q13, [x0, #176] +mla v19.4S, v30.4S, v31.s[0] +sub v30.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v0.4S, v22.s[3] +str q20, [x0, #528] +mul v0.4S, v0.4S,v23.s[3] +mla v18.4S, v26.4S, v31.s[0] +sub v26.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v28.4S, v22.s[2] +str q15, [x0, #656] +mul v28.4S, v28.4S,v23.s[2] +mla v0.4S, v12.4S, v31.s[0] +sub v12.4s, v1.4s, v18.4s +add v1.4s, v1.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v24.s[0] +mul v21.4S, v21.4S,v25.s[0] +mla v28.4S, v19.4S, v31.s[0] +sub v19.4s, v30.4s, v0.4s +add v30.4s, v30.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v24.s[0] +mul v2.4S, v2.4S,v25.s[0] +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v14.4s, v28.4s +add v14.4s, v14.4s, v28.4s +sqrdmulh v28.4S, v29.4S, v24.s[0] +mul v29.4S, v29.4S,v25.s[0] +mla v2.4S, 
v0.4S, v31.s[0] +sub v0.4s, v27.4s, v21.4s +add v27.4s, v27.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v24.s[0] +mul v17.4S, v17.4S,v25.s[0] +mla v29.4S, v28.4S, v31.s[0] +sub v28.4s, v16.4s, v2.4s +add v16.4s, v16.4s, v2.4s +sqrdmulh v2.4S, v27.4S, v24.s[1] +str q3, [x0, #32] +mul v27.4S, v27.4S,v25.s[1] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v29.4s +add v11.4s, v11.4s, v29.4s +sqrdmulh v29.4S, v16.4S, v24.s[1] +str q26, [x0, #160] +mul v16.4S, v16.4S,v25.s[1] +mla v27.4S, v2.4S, v31.s[0] +sub v2.4s, v13.4s, v17.4s +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v24.s[2] +str q1, [x0, #288] +mul v0.4S, v0.4S,v25.s[2] +mla v16.4S, v29.4S, v31.s[0] +sub v29.4s, v13.4s, v27.4s +add v13.4s, v13.4s, v27.4s +sqrdmulh v27.4S, v28.4S, v24.s[2] +str q12, [x0, #416] +mul v28.4S, v28.4S,v25.s[2] +mla v0.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v16.4s +add v11.4s, v11.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v22.s[0] +str q30, [x0, #800] +mul v13.4S, v13.4S,v23.s[0] +mla v28.4S, v27.4S, v31.s[0] +sub v27.4s, v2.4s, v0.4s +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v29.4S, v22.s[1] +str q19, [x0, #928] +mul v29.4S, v29.4S,v23.s[1] +mla v13.4S, v16.4S, v31.s[0] +sub v16.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +sqrdmulh v28.4S, v27.4S, v22.s[3] +str q14, [x0, #544] +mul v27.4S, v27.4S,v23.s[3] +mla v29.4S, v0.4S, v31.s[0] +sub v0.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v2.4S, v22.s[2] +str q18, [x0, #672] +mul v2.4S, v2.4S,v23.s[2] +mla v27.4S, v28.4S, v31.s[0] +sub v28.4s, v17.4s, v29.4s +add v17.4s, v17.4s, v29.4s +mla v2.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v27.4s +add v16.4s, v16.4s, v27.4s +sub v27.4s, v21.4s, v2.4s +add v21.4s, v21.4s, v2.4s +str q11, [x0, #48] +str q0, [x0, #176] +str q17, [x0, #304] +str q28, [x0, #432] +str q16, [x0, #816] +str q13, [x0, #944] +str q21, [x0, #560] +str q27, [x0, #688] +ldr q4, [x0, #112] +ldr q5, [x0, #96] +ldr q6, [x0, #64] +ldr q7, [x0, #80] +ldr q8, [x0, #48] +ldr q9, [x17, #+64] +ldr q10, 
[x17, #+80] +ldr q20, [x17, #+96] +ldr q15, [x17, #+112] +ldr q3, [x0, #32] +ldr q26, [x0, #0] +ldr q1, [x0, #16] +sqrdmulh v12.4S, v4.4S, v10.s[0] +mul v4.4S, v4.4S,v9.s[0] +sqrdmulh v30.4S, v5.4S, v10.s[0] +mul v5.4S, v5.4S,v9.s[0] +mla v4.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v6.4S, v10.s[0] +mul v6.4S, v6.4S,v9.s[0] +ldr q19, [x0, #240] +mla v5.4S, v30.4S, v31.s[0] +sub v30.4s, v8.4s, v4.4s +add v8.4s, v8.4s, v4.4s +sqrdmulh v4.4S, v7.4S, v10.s[0] +mul v7.4S, v7.4S,v9.s[0] +ldr q14, [x0, #224] +mla v6.4S, v12.4S, v31.s[0] +sub v12.4s, v3.4s, v5.4s +add v3.4s, v3.4s, v5.4s +sqrdmulh v5.4S, v8.4S, v10.s[1] +mul v8.4S, v8.4S,v9.s[1] +ldr q18, [x0, #192] +mla v7.4S, v4.4S, v31.s[0] +sub v4.4s, v26.4s, v6.4s +add v26.4s, v26.4s, v6.4s +sqrdmulh v6.4S, v3.4S, v10.s[1] +mul v3.4S, v3.4S,v9.s[1] +ldr q29, [x0, #208] +mla v8.4S, v5.4S, v31.s[0] +sub v5.4s, v1.4s, v7.4s +add v1.4s, v1.4s, v7.4s +sqrdmulh v7.4S, v30.4S, v10.s[2] +mul v30.4S, v30.4S,v9.s[2] +ldr q2, [x0, #176] +mla v3.4S, v6.4S, v31.s[0] +sub v6.4s, v1.4s, v8.4s +add v1.4s, v1.4s, v8.4s +ldr q8, [x17, #+128] +ldr q25, [x17, #+144] +ldr q24, [x17, #+160] +ldr q23, [x17, #+176] +sqrdmulh v22.4S, v12.4S, v10.s[2] +mul v12.4S, v12.4S,v9.s[2] +ldr q11, [x0, #160] +mla v30.4S, v7.4S, v31.s[0] +sub v7.4s, v26.4s, v3.4s +add v26.4s, v26.4s, v3.4s +sqrdmulh v3.4S, v1.4S, v15.s[0] +mul v1.4S, v1.4S,v20.s[0] +ldr q0, [x0, #128] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v5.4s, v30.4s +add v5.4s, v5.4s, v30.4s +sqrdmulh v30.4S, v6.4S, v15.s[1] +mul v6.4S, v6.4S,v20.s[1] +ldr q17, [x0, #144] +mla v1.4S, v3.4S, v31.s[0] +sub v3.4s, v4.4s, v12.4s +add v4.4s, v4.4s, v12.4s +sqrdmulh v12.4S, v22.4S, v15.s[3] +mul v22.4S, v22.4S,v20.s[3] +mla v6.4S, v30.4S, v31.s[0] +sub v30.4s, v26.4s, v1.4s +add v26.4s, v26.4s, v1.4s +sqrdmulh v1.4S, v5.4S, v15.s[2] +mul v5.4S, v5.4S,v20.s[2] +mla v22.4S, v12.4S, v31.s[0] +sub v12.4s, v7.4s, v6.4s +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v19.4S, v25.s[0] +mul v19.4S, v19.4S,v8.s[0] +mla 
v5.4S, v1.4S, v31.s[0] +sub v1.4s, v3.4s, v22.4s +add v3.4s, v3.4s, v22.4s +sqrdmulh v22.4S, v14.4S, v25.s[0] +mul v14.4S, v14.4S,v8.s[0] +mla v19.4S, v6.4S, v31.s[0] +sub v6.4s, v4.4s, v5.4s +add v4.4s, v4.4s, v5.4s +sqrdmulh v15.4S, v18.4S, v25.s[0] +mul v18.4S, v18.4S,v8.s[0] +ldr q20, [x0, #368] +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v2.4s, v19.4s +add v2.4s, v2.4s, v19.4s +sqrdmulh v19.4S, v29.4S, v25.s[0] +mul v29.4S, v29.4S,v8.s[0] +ldr q10, [x0, #352] +mla v18.4S, v15.4S, v31.s[0] +sub v15.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v25.s[1] +str q26, [x0, #0] +mul v2.4S, v2.4S,v8.s[1] +ldr q26, [x0, #320] +mla v29.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v18.4s +add v0.4s, v0.4s, v18.4s +sqrdmulh v18.4S, v11.4S, v25.s[1] +str q30, [x0, #16] +mul v11.4S, v11.4S,v8.s[1] +ldr q30, [x0, #336] +mla v2.4S, v14.4S, v31.s[0] +sub v14.4s, v17.4s, v29.4s +add v17.4s, v17.4s, v29.4s +sqrdmulh v29.4S, v22.4S, v25.s[2] +str q7, [x0, #32] +mul v22.4S, v22.4S,v8.s[2] +ldr q7, [x0, #304] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v17.4s, v2.4s +add v17.4s, v17.4s, v2.4s +ldr q2, [x17, #+192] +ldr q9, [x17, #+208] +ldr q5, [x17, #+224] +ldr q28, [x17, #+240] +sqrdmulh v16.4S, v15.4S, v25.s[2] +str q12, [x0, #48] +mul v15.4S, v15.4S,v8.s[2] +ldr q12, [x0, #288] +mla v22.4S, v29.4S, v31.s[0] +sub v29.4s, v0.4s, v11.4s +add v0.4s, v0.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v23.s[0] +str q3, [x0, #96] +mul v17.4S, v17.4S,v24.s[0] +ldr q3, [x0, #256] +mla v15.4S, v16.4S, v31.s[0] +sub v16.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +sqrdmulh v22.4S, v18.4S, v23.s[1] +str q1, [x0, #112] +mul v18.4S, v18.4S,v24.s[1] +ldr q1, [x0, #272] +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v19.4s, v15.4s +add v19.4s, v19.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v23.s[3] +str q4, [x0, #64] +mul v16.4S, v16.4S,v24.s[3] +mla v18.4S, v22.4S, v31.s[0] +sub v22.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v23.s[2] +str q6, [x0, #80] +mul v14.4S, 
v14.4S,v24.s[2] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v29.4s, v18.4s +add v29.4s, v29.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v9.s[0] +mul v20.4S, v20.4S,v2.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v16.4s +add v11.4s, v11.4s, v16.4s +sqrdmulh v16.4S, v10.4S, v9.s[0] +mul v10.4S, v10.4S,v2.s[0] +mla v20.4S, v18.4S, v31.s[0] +sub v18.4s, v19.4s, v14.4s +add v19.4s, v19.4s, v14.4s +sqrdmulh v23.4S, v26.4S, v9.s[0] +mul v26.4S, v26.4S,v2.s[0] +ldr q24, [x0, #496] +mla v10.4S, v16.4S, v31.s[0] +sub v16.4s, v7.4s, v20.4s +add v7.4s, v7.4s, v20.4s +sqrdmulh v20.4S, v30.4S, v9.s[0] +mul v30.4S, v30.4S,v2.s[0] +ldr q25, [x0, #480] +mla v26.4S, v23.4S, v31.s[0] +sub v23.4s, v12.4s, v10.4s +add v12.4s, v12.4s, v10.4s +sqrdmulh v10.4S, v7.4S, v9.s[1] +str q0, [x0, #128] +mul v7.4S, v7.4S,v2.s[1] +ldr q0, [x0, #448] +mla v30.4S, v20.4S, v31.s[0] +sub v20.4s, v3.4s, v26.4s +add v3.4s, v3.4s, v26.4s +sqrdmulh v26.4S, v12.4S, v9.s[1] +str q22, [x0, #144] +mul v12.4S, v12.4S,v2.s[1] +ldr q22, [x0, #464] +mla v7.4S, v10.4S, v31.s[0] +sub v10.4s, v1.4s, v30.4s +add v1.4s, v1.4s, v30.4s +sqrdmulh v30.4S, v16.4S, v9.s[2] +str q29, [x0, #160] +mul v16.4S, v16.4S,v2.s[2] +ldr q29, [x0, #432] +mla v12.4S, v26.4S, v31.s[0] +sub v26.4s, v1.4s, v7.4s +add v1.4s, v1.4s, v7.4s +ldr q7, [x17, #+256] +ldr q8, [x17, #+272] +ldr q14, [x17, #+288] +ldr q6, [x17, #+304] +sqrdmulh v4.4S, v23.4S, v9.s[2] +str q15, [x0, #176] +mul v23.4S, v23.4S,v2.s[2] +ldr q15, [x0, #416] +mla v16.4S, v30.4S, v31.s[0] +sub v30.4s, v3.4s, v12.4s +add v3.4s, v3.4s, v12.4s +sqrdmulh v12.4S, v1.4S, v28.s[0] +str q11, [x0, #224] +mul v1.4S, v1.4S,v5.s[0] +ldr q11, [x0, #384] +mla v23.4S, v4.4S, v31.s[0] +sub v4.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sqrdmulh v16.4S, v26.4S, v28.s[1] +str q17, [x0, #240] +mul v26.4S, v26.4S,v5.s[1] +ldr q17, [x0, #400] +mla v1.4S, v12.4S, v31.s[0] +sub v12.4s, v20.4s, v23.4s +add v20.4s, v20.4s, v23.4s +sqrdmulh v23.4S, v4.4S, v28.s[3] +str q19, [x0, #192] +mul 
v4.4S, v4.4S,v5.s[3] +mla v26.4S, v16.4S, v31.s[0] +sub v16.4s, v3.4s, v1.4s +add v3.4s, v3.4s, v1.4s +sqrdmulh v1.4S, v10.4S, v28.s[2] +str q18, [x0, #208] +mul v10.4S, v10.4S,v5.s[2] +mla v4.4S, v23.4S, v31.s[0] +sub v23.4s, v30.4s, v26.4s +add v30.4s, v30.4s, v26.4s +sqrdmulh v26.4S, v24.4S, v8.s[0] +mul v24.4S, v24.4S,v7.s[0] +mla v10.4S, v1.4S, v31.s[0] +sub v1.4s, v12.4s, v4.4s +add v12.4s, v12.4s, v4.4s +sqrdmulh v4.4S, v25.4S, v8.s[0] +mul v25.4S, v25.4S,v7.s[0] +mla v24.4S, v26.4S, v31.s[0] +sub v26.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +sqrdmulh v28.4S, v0.4S, v8.s[0] +mul v0.4S, v0.4S,v7.s[0] +ldr q5, [x0, #624] +mla v25.4S, v4.4S, v31.s[0] +sub v4.4s, v29.4s, v24.4s +add v29.4s, v29.4s, v24.4s +sqrdmulh v24.4S, v22.4S, v8.s[0] +mul v22.4S, v22.4S,v7.s[0] +ldr q9, [x0, #608] +mla v0.4S, v28.4S, v31.s[0] +sub v28.4s, v15.4s, v25.4s +add v15.4s, v15.4s, v25.4s +sqrdmulh v25.4S, v29.4S, v8.s[1] +str q3, [x0, #256] +mul v29.4S, v29.4S,v7.s[1] +ldr q3, [x0, #576] +mla v22.4S, v24.4S, v31.s[0] +sub v24.4s, v11.4s, v0.4s +add v11.4s, v11.4s, v0.4s +sqrdmulh v0.4S, v15.4S, v8.s[1] +str q16, [x0, #272] +mul v15.4S, v15.4S,v7.s[1] +ldr q16, [x0, #592] +mla v29.4S, v25.4S, v31.s[0] +sub v25.4s, v17.4s, v22.4s +add v17.4s, v17.4s, v22.4s +sqrdmulh v22.4S, v4.4S, v8.s[2] +str q30, [x0, #288] +mul v4.4S, v4.4S,v7.s[2] +ldr q30, [x0, #560] +mla v15.4S, v0.4S, v31.s[0] +sub v0.4s, v17.4s, v29.4s +add v17.4s, v17.4s, v29.4s +ldr q29, [x17, #+320] +ldr q2, [x17, #+336] +ldr q10, [x17, #+352] +ldr q18, [x17, #+368] +sqrdmulh v19.4S, v28.4S, v8.s[2] +str q23, [x0, #304] +mul v28.4S, v28.4S,v7.s[2] +ldr q23, [x0, #544] +mla v4.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v17.4S, v6.s[0] +str q12, [x0, #352] +mul v17.4S, v17.4S,v14.s[0] +ldr q12, [x0, #512] +mla v28.4S, v19.4S, v31.s[0] +sub v19.4s, v25.4s, v4.4s +add v25.4s, v25.4s, v4.4s +sqrdmulh v4.4S, v0.4S, v6.s[1] +str q1, [x0, #368] +mul v0.4S, v0.4S,v14.s[1] 
+ldr q1, [x0, #528] +mla v17.4S, v15.4S, v31.s[0] +sub v15.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +sqrdmulh v28.4S, v19.4S, v6.s[3] +str q20, [x0, #320] +mul v19.4S, v19.4S,v14.s[3] +mla v0.4S, v4.4S, v31.s[0] +sub v4.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v25.4S, v6.s[2] +str q26, [x0, #336] +mul v25.4S, v25.4S,v14.s[2] +mla v19.4S, v28.4S, v31.s[0] +sub v28.4s, v22.4s, v0.4s +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v5.4S, v2.s[0] +mul v5.4S, v5.4S,v29.s[0] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v15.4s, v19.4s +add v15.4s, v15.4s, v19.4s +sqrdmulh v19.4S, v9.4S, v2.s[0] +mul v9.4S, v9.4S,v29.s[0] +mla v5.4S, v0.4S, v31.s[0] +sub v0.4s, v24.4s, v25.4s +add v24.4s, v24.4s, v25.4s +sqrdmulh v6.4S, v3.4S, v2.s[0] +mul v3.4S, v3.4S,v29.s[0] +ldr q14, [x0, #752] +mla v9.4S, v19.4S, v31.s[0] +sub v19.4s, v30.4s, v5.4s +add v30.4s, v30.4s, v5.4s +sqrdmulh v5.4S, v16.4S, v2.s[0] +mul v16.4S, v16.4S,v29.s[0] +ldr q8, [x0, #736] +mla v3.4S, v6.4S, v31.s[0] +sub v6.4s, v23.4s, v9.4s +add v23.4s, v23.4s, v9.4s +sqrdmulh v9.4S, v30.4S, v2.s[1] +str q11, [x0, #384] +mul v30.4S, v30.4S,v29.s[1] +ldr q11, [x0, #704] +mla v16.4S, v5.4S, v31.s[0] +sub v5.4s, v12.4s, v3.4s +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v23.4S, v2.s[1] +str q4, [x0, #400] +mul v23.4S, v23.4S,v29.s[1] +ldr q4, [x0, #720] +mla v30.4S, v9.4S, v31.s[0] +sub v9.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +sqrdmulh v16.4S, v19.4S, v2.s[2] +str q22, [x0, #416] +mul v19.4S, v19.4S,v29.s[2] +ldr q22, [x0, #688] +mla v23.4S, v3.4S, v31.s[0] +sub v3.4s, v1.4s, v30.4s +add v1.4s, v1.4s, v30.4s +ldr q30, [x17, #+384] +ldr q7, [x17, #+400] +ldr q25, [x17, #+416] +ldr q26, [x17, #+432] +sqrdmulh v20.4S, v6.4S, v2.s[2] +str q28, [x0, #432] +mul v6.4S, v6.4S,v29.s[2] +ldr q28, [x0, #672] +mla v19.4S, v16.4S, v31.s[0] +sub v16.4s, v12.4s, v23.4s +add v12.4s, v12.4s, v23.4s +sqrdmulh v23.4S, v1.4S, v18.s[0] +str q15, [x0, #480] +mul v1.4S, v1.4S,v10.s[0] +ldr q15, [x0, #640] +mla 
v6.4S, v20.4S, v31.s[0] +sub v20.4s, v9.4s, v19.4s +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v3.4S, v18.s[1] +str q17, [x0, #496] +mul v3.4S, v3.4S,v10.s[1] +ldr q17, [x0, #656] +mla v1.4S, v23.4S, v31.s[0] +sub v23.4s, v5.4s, v6.4s +add v5.4s, v5.4s, v6.4s +sqrdmulh v6.4S, v20.4S, v18.s[3] +str q24, [x0, #448] +mul v20.4S, v20.4S,v10.s[3] +mla v3.4S, v19.4S, v31.s[0] +sub v19.4s, v12.4s, v1.4s +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v9.4S, v18.s[2] +str q0, [x0, #464] +mul v9.4S, v9.4S,v10.s[2] +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v7.s[0] +mul v14.4S, v14.4S,v30.s[0] +mla v9.4S, v1.4S, v31.s[0] +sub v1.4s, v23.4s, v20.4s +add v23.4s, v23.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v7.s[0] +mul v8.4S, v8.4S,v30.s[0] +mla v14.4S, v3.4S, v31.s[0] +sub v3.4s, v5.4s, v9.4s +add v5.4s, v5.4s, v9.4s +sqrdmulh v18.4S, v11.4S, v7.s[0] +mul v11.4S, v11.4S,v30.s[0] +ldr q10, [x0, #880] +mla v8.4S, v20.4S, v31.s[0] +sub v20.4s, v22.4s, v14.4s +add v22.4s, v22.4s, v14.4s +sqrdmulh v14.4S, v4.4S, v7.s[0] +mul v4.4S, v4.4S,v30.s[0] +ldr q2, [x0, #864] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v28.4s, v8.4s +add v28.4s, v28.4s, v8.4s +sqrdmulh v8.4S, v22.4S, v7.s[1] +str q12, [x0, #512] +mul v22.4S, v22.4S,v30.s[1] +ldr q12, [x0, #832] +mla v4.4S, v14.4S, v31.s[0] +sub v14.4s, v15.4s, v11.4s +add v15.4s, v15.4s, v11.4s +sqrdmulh v11.4S, v28.4S, v7.s[1] +str q19, [x0, #528] +mul v28.4S, v28.4S,v30.s[1] +ldr q19, [x0, #848] +mla v22.4S, v8.4S, v31.s[0] +sub v8.4s, v17.4s, v4.4s +add v17.4s, v17.4s, v4.4s +sqrdmulh v4.4S, v20.4S, v7.s[2] +str q16, [x0, #544] +mul v20.4S, v20.4S,v30.s[2] +ldr q16, [x0, #816] +mla v28.4S, v11.4S, v31.s[0] +sub v11.4s, v17.4s, v22.4s +add v17.4s, v17.4s, v22.4s +ldr q22, [x17, #+448] +ldr q29, [x17, #+464] +ldr q9, [x17, #+480] +ldr q0, [x17, #+496] +sqrdmulh v24.4S, v18.4S, v7.s[2] +str q6, [x0, #560] +mul v18.4S, v18.4S,v30.s[2] +ldr q6, [x0, #800] +mla v20.4S, v4.4S, v31.s[0] +sub 
v4.4s, v15.4s, v28.4s +add v15.4s, v15.4s, v28.4s +sqrdmulh v28.4S, v17.4S, v26.s[0] +str q23, [x0, #608] +mul v17.4S, v17.4S,v25.s[0] +ldr q23, [x0, #768] +mla v18.4S, v24.4S, v31.s[0] +sub v24.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +sqrdmulh v20.4S, v11.4S, v26.s[1] +str q1, [x0, #624] +mul v11.4S, v11.4S,v25.s[1] +ldr q1, [x0, #784] +mla v17.4S, v28.4S, v31.s[0] +sub v28.4s, v14.4s, v18.4s +add v14.4s, v14.4s, v18.4s +sqrdmulh v18.4S, v24.4S, v26.s[3] +str q5, [x0, #576] +mul v24.4S, v24.4S,v25.s[3] +mla v11.4S, v20.4S, v31.s[0] +sub v20.4s, v15.4s, v17.4s +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v8.4S, v26.s[2] +str q3, [x0, #592] +mul v8.4S, v8.4S,v25.s[2] +mla v24.4S, v18.4S, v31.s[0] +sub v18.4s, v4.4s, v11.4s +add v4.4s, v4.4s, v11.4s +sqrdmulh v11.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v22.s[0] +mla v8.4S, v17.4S, v31.s[0] +sub v17.4s, v28.4s, v24.4s +add v28.4s, v28.4s, v24.4s +sqrdmulh v24.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v22.s[0] +mla v10.4S, v11.4S, v31.s[0] +sub v11.4s, v14.4s, v8.4s +add v14.4s, v14.4s, v8.4s +sqrdmulh v26.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v22.s[0] +ldr q25, [x0, #1008] +mla v2.4S, v24.4S, v31.s[0] +sub v24.4s, v16.4s, v10.4s +add v16.4s, v16.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v22.s[0] +ldr q7, [x0, #992] +mla v12.4S, v26.4S, v31.s[0] +sub v26.4s, v6.4s, v2.4s +add v6.4s, v6.4s, v2.4s +sqrdmulh v2.4S, v16.4S, v29.s[1] +str q15, [x0, #640] +mul v16.4S, v16.4S,v22.s[1] +ldr q15, [x0, #960] +mla v19.4S, v10.4S, v31.s[0] +sub v10.4s, v23.4s, v12.4s +add v23.4s, v23.4s, v12.4s +sqrdmulh v12.4S, v6.4S, v29.s[1] +str q20, [x0, #656] +mul v6.4S, v6.4S,v22.s[1] +ldr q20, [x0, #976] +mla v16.4S, v2.4S, v31.s[0] +sub v2.4s, v1.4s, v19.4s +add v1.4s, v1.4s, v19.4s +sqrdmulh v19.4S, v24.4S, v29.s[2] +str q4, [x0, #672] +mul v24.4S, v24.4S,v22.s[2] +ldr q4, [x0, #944] +mla v6.4S, v12.4S, v31.s[0] +sub v12.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +ldr q16, [x17, #+512] +ldr q30, [x17, #+528] 
+ldr q8, [x17, #+544] +ldr q3, [x17, #+560] +sqrdmulh v5.4S, v26.4S, v29.s[2] +str q18, [x0, #688] +mul v26.4S, v26.4S,v22.s[2] +ldr q18, [x0, #928] +mla v24.4S, v19.4S, v31.s[0] +sub v19.4s, v23.4s, v6.4s +add v23.4s, v23.4s, v6.4s +sqrdmulh v6.4S, v1.4S, v0.s[0] +str q28, [x0, #736] +mul v1.4S, v1.4S,v9.s[0] +ldr q28, [x0, #896] +mla v26.4S, v5.4S, v31.s[0] +sub v5.4s, v2.4s, v24.4s +add v2.4s, v2.4s, v24.4s +sqrdmulh v24.4S, v12.4S, v0.s[1] +str q17, [x0, #752] +mul v12.4S, v12.4S,v9.s[1] +ldr q17, [x0, #912] +mla v1.4S, v6.4S, v31.s[0] +sub v6.4s, v10.4s, v26.4s +add v10.4s, v10.4s, v26.4s +sqrdmulh v26.4S, v5.4S, v0.s[3] +str q14, [x0, #704] +mul v5.4S, v5.4S,v9.s[3] +mla v12.4S, v24.4S, v31.s[0] +sub v24.4s, v23.4s, v1.4s +add v23.4s, v23.4s, v1.4s +sqrdmulh v1.4S, v2.4S, v0.s[2] +str q11, [x0, #720] +mul v2.4S, v2.4S,v9.s[2] +mla v5.4S, v26.4S, v31.s[0] +sub v26.4s, v19.4s, v12.4s +add v19.4s, v19.4s, v12.4s +sqrdmulh v12.4S, v25.4S, v30.s[0] +mul v25.4S, v25.4S,v16.s[0] +mla v2.4S, v1.4S, v31.s[0] +sub v1.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v7.4S, v30.s[0] +mul v7.4S, v7.4S,v16.s[0] +mla v25.4S, v12.4S, v31.s[0] +sub v12.4s, v10.4s, v2.4s +add v10.4s, v10.4s, v2.4s +sqrdmulh v0.4S, v15.4S, v30.s[0] +mul v15.4S, v15.4S,v16.s[0] +mla v7.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v25.4s +add v4.4s, v4.4s, v25.4s +sqrdmulh v25.4S, v20.4S, v30.s[0] +mul v20.4S, v20.4S,v16.s[0] +mla v15.4S, v0.4S, v31.s[0] +sub v0.4s, v18.4s, v7.4s +add v18.4s, v18.4s, v7.4s +sqrdmulh v7.4S, v4.4S, v30.s[1] +str q23, [x0, #768] +mul v4.4S, v4.4S,v16.s[1] +mla v20.4S, v25.4S, v31.s[0] +sub v25.4s, v28.4s, v15.4s +add v28.4s, v28.4s, v15.4s +sqrdmulh v15.4S, v18.4S, v30.s[1] +str q24, [x0, #784] +mul v18.4S, v18.4S,v16.s[1] +mla v4.4S, v7.4S, v31.s[0] +sub v7.4s, v17.4s, v20.4s +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v5.4S, v30.s[2] +str q19, [x0, #800] +mul v5.4S, v5.4S,v16.s[2] +mla v18.4S, v15.4S, v31.s[0] +sub v15.4s, v17.4s, v4.4s +add v17.4s, v17.4s, 
v4.4s +sqrdmulh v4.4S, v0.4S, v30.s[2] +str q26, [x0, #816] +mul v0.4S, v0.4S,v16.s[2] +mla v5.4S, v20.4S, v31.s[0] +sub v20.4s, v28.4s, v18.4s +add v28.4s, v28.4s, v18.4s +sqrdmulh v18.4S, v17.4S, v3.s[0] +str q6, [x0, #864] +mul v17.4S, v17.4S,v8.s[0] +mla v0.4S, v4.4S, v31.s[0] +sub v4.4s, v7.4s, v5.4s +add v7.4s, v7.4s, v5.4s +sqrdmulh v5.4S, v15.4S, v3.s[1] +str q1, [x0, #880] +mul v15.4S, v15.4S,v8.s[1] +mla v17.4S, v18.4S, v31.s[0] +sub v18.4s, v25.4s, v0.4s +add v25.4s, v25.4s, v0.4s +sqrdmulh v0.4S, v4.4S, v3.s[3] +str q10, [x0, #832] +mul v4.4S, v4.4S,v8.s[3] +mla v15.4S, v5.4S, v31.s[0] +sub v5.4s, v28.4s, v17.4s +add v28.4s, v28.4s, v17.4s +sqrdmulh v17.4S, v7.4S, v3.s[2] +str q12, [x0, #848] +mul v7.4S, v7.4S,v8.s[2] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v20.4s, v15.4s +add v20.4s, v20.4s, v15.4s +mla v7.4S, v17.4S, v31.s[0] +sub v17.4s, v18.4s, v4.4s +add v18.4s, v18.4s, v4.4s +sub v4.4s, v25.4s, v7.4s +add v25.4s, v25.4s, v7.4s +str q28, [x0, #896] +str q5, [x0, #912] +str q20, [x0, #928] +str q0, [x0, #944] +str q18, [x0, #992] +str q17, [x0, #1008] +str q25, [x0, #960] +str q4, [x0, #976] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1444 +// Instruction count: 1440 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_3_3_4.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_3_3_4.s new file mode 100644 index 0000000..ca19281 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_3_3_4.s @@ -0,0 +1,1474 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, 
to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include // NOTE(review): the header name is missing after #include; ASM_LOAD, used further down, is not defined in the visible part of this file, so restore the include from the generator output +modulus: // 16-byte constant fetched with ldr q31; the visible code reads only lane 0 (v31.s[0]) as the mla multiplier +.word -33556993 // lane 0: negative of 33556993, the value that appears in this file's name +.word 0 // zero filler +.word 0 // zero filler +.word 0 // zero filler +.align 6 // NOTE(review): presumably 2^6 = 64-byte alignment (power-of-two form of GNU as on AArch64); confirm for the target assembler +roots_merged: // Constants addressed through x17 as 16-byte vectors that come in pairs: the first vector of a pair feeds the by-element mul, the second feeds the matching sqrdmulh; entries tagged Layer None are zero filler so each group stays 4 words wide. NOTE(review): each sqrdmulh constant looks like round(root * 2^31 / 33556993); confirm against the generator +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 23825509 // Layer 4, block 0 +.word 27028662 // Layer 4, block 1 +.word 0 // Layer None, block None +.word 1307297022 // Layer 3, block 0 +.word 1524716204 // Layer 4, block 0 +.word 1729702351 // Layer 4, block 1 +.word 0 // Layer None, block None +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 14626653 // Layer 3, block 1 +.word 14833295 // Layer 4, block 2 +.word 2138810 // Layer 4, block 3 +.word 0 // Layer None, block None +.word 936034350 // Layer 3, block 1 +.word 949258429 // Layer 4, block 2 +.word 136873393 // Layer 4, block 3 +.word 0 // Layer None, block None +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 29737761 // Layer 3, block 2 +.word 6490403 // Layer 4, block 4 +.word 19648405 // Layer 4, block 5 +.word 0 // Layer None, block None +.word 1903071454
// Layer 3, block 2 +.word 415354091 // Layer 4, block 4 +.word 1257401950 // Layer 4, block 5 +.word 0 // Layer None, block None +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 30285189 // Layer 3, block 3 +.word 31254932 // Layer 4, block 6 +.word 26362414 // Layer 4, block 7 +.word 0 // Layer None, block None +.word 1938104173 // Layer 3, block 3 +.word 2000162988 // Layer 4, block 6 +.word 1687065733 // Layer 4, block 7 +.word 0 // Layer None, block None +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 21289485 // Layer 3, block 4 +.word 572895 // Layer 4, block 8 +.word 26691971 // Layer 4, block 9 +.word 0 // Layer None, block None +.word 1362423055 // Layer 3, block 4 +.word 36662482 // Layer 4, block 8 +.word 1708155771 // Layer 4, block 9 +.word 0 // Layer None, block None +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 9914896 // Layer 3, block 5 +.word 9249292 // Layer 4, block 10 +.word 29292862 // Layer 4, block 11 +.word 0 // Layer None, block None +.word 634504916 // Layer 3, block 5 +.word 591909511 // Layer 4, block 10 +.word 1874600091 // Layer 4, block 11 +.word 0 // Layer None, block None +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 25384023 // Layer
5, block 22 +.word 10905370 // Layer 5, block 23 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 22603682 // Layer 3, block 6 +.word 8247799 // Layer 4, block 12 +.word 5086187 // Layer 4, block 13 +.word 0 // Layer None, block None +.word 1446525244 // Layer 3, block 6 +.word 527818851 // Layer 4, block 12 +.word 325491125 // Layer 4, block 13 +.word 0 // Layer None, block None +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 16204162 // Layer 3, block 7 +.word 28113639 // Layer 4, block 14 +.word 8471290 // Layer 4, block 15 +.word 0 // Layer None, block None +.word 1036987221 // Layer 3, block 7 +.word 1799135579 // Layer 4, block 14 +.word 542121183 // Layer 4, block 15 +.word 0 // Layer None, block None +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.text +.global ntt_u32_incomplete_neon_asm_var_3_3_4 +.global _ntt_u32_incomplete_neon_asm_var_3_3_4 +ntt_u32_incomplete_neon_asm_var_3_3_4: +_ntt_u32_incomplete_neon_asm_var_3_3_4: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] // NOTE(review): repeats the previous store exactly (same registers, same stack slot); redundant but harmless, fix in the generator rather than here +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17]
+ASM_LOAD(x17, roots_merged) +ldr q30, [x0, #960] +ldr q29, [x0, #832] +ldr q28, [x0, #576] +ldr q27, [x0, #704] +ldr q26, [x0, #448] +ldr q25, [x17, #+0] +ldr q24, [x17, #+16] +ldr q23, [x17, #+32] +ldr q22, [x17, #+48] +ldr q21, [x0, #320] +ldr q20, [x0, #64] +ldr q19, [x0, #192] +sqrdmulh v18.4S, v30.4S, v24.s[0] +mul v30.4S, v30.4S,v25.s[0] +sqrdmulh v17.4S, v29.4S, v24.s[0] +mla v30.4S, v18.4S, v31.s[0] +mul v29.4S, v29.4S,v25.s[0] +sqrdmulh v18.4S, v28.4S, v24.s[0] +mla v29.4S, v17.4S, v31.s[0] +ldr q17, [x0, #976] +mul v28.4S, v28.4S,v25.s[0] +sub v16.4s, v26.4s, v30.4s +add v26.4s, v26.4s, v30.4s +sqrdmulh v30.4S, v27.4S, v24.s[0] +mla v28.4S, v18.4S, v31.s[0] +ldr q18, [x0, #848] +mul v27.4S, v27.4S,v25.s[0] +sub v3.4s, v21.4s, v29.4s +add v21.4s, v21.4s, v29.4s +sqrdmulh v29.4S, v26.4S, v24.s[1] +mla v27.4S, v30.4S, v31.s[0] +ldr q30, [x0, #592] +mul v26.4S, v26.4S,v25.s[1] +sub v2.4s, v20.4s, v28.4s +add v20.4s, v20.4s, v28.4s +sqrdmulh v28.4S, v21.4S, v24.s[1] +mla v26.4S, v29.4S, v31.s[0] +ldr q29, [x0, #720] +mul v21.4S, v21.4S,v25.s[1] +sub v1.4s, v19.4s, v27.4s +add v19.4s, v19.4s, v27.4s +sqrdmulh v27.4S, v16.4S, v24.s[2] +mla v21.4S, v28.4S, v31.s[0] +ldr q28, [x0, #464] +mul v16.4S, v16.4S,v25.s[2] +sub v0.4s, v19.4s, v26.4s +add v19.4s, v19.4s, v26.4s +sqrdmulh v26.4S, v3.4S, v24.s[2] +mla v16.4S, v27.4S, v31.s[0] +ldr q27, [x0, #336] +mul v3.4S, v3.4S,v25.s[2] +sub v15.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v22.s[0] +mla v3.4S, v26.4S, v31.s[0] +ldr q26, [x0, #80] +mul v19.4S, v19.4S,v23.s[0] +sub v14.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +sqrdmulh v16.4S, v0.4S, v22.s[1] +mla v19.4S, v21.4S, v31.s[0] +ldr q21, [x0, #208] +mul v0.4S, v0.4S,v23.s[1] +sub v13.4s, v2.4s, v3.4s +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v22.s[3] +mla v0.4S, v16.4S, v31.s[0] +mul v14.4S, v14.4S,v23.s[3] +sub v16.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v22.s[2] +mla v14.4S, v3.4S, v31.s[0] 
+mul v1.4S, v1.4S,v23.s[2] +sub v3.4s, v15.4s, v0.4s +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v24.s[0] +mla v1.4S, v19.4S, v31.s[0] +mul v17.4S, v17.4S,v25.s[0] +sub v19.4s, v13.4s, v14.4s +add v13.4s, v13.4s, v14.4s +sqrdmulh v14.4S, v18.4S, v24.s[0] +mla v17.4S, v0.4S, v31.s[0] +mul v18.4S, v18.4S,v25.s[0] +sub v0.4s, v2.4s, v1.4s +add v2.4s, v2.4s, v1.4s +sqrdmulh v1.4S, v30.4S, v24.s[0] +mla v18.4S, v14.4S, v31.s[0] +ldr q14, [x0, #992] +mul v30.4S, v30.4S,v25.s[0] +sub v12.4s, v28.4s, v17.4s +add v28.4s, v28.4s, v17.4s +sqrdmulh v17.4S, v29.4S, v24.s[0] +mla v30.4S, v1.4S, v31.s[0] +ldr q1, [x0, #864] +mul v29.4S, v29.4S,v25.s[0] +sub v11.4s, v27.4s, v18.4s +add v27.4s, v27.4s, v18.4s +sqrdmulh v18.4S, v28.4S, v24.s[1] +str q20, [x0, #64] +mla v29.4S, v17.4S, v31.s[0] +ldr q17, [x0, #608] +mul v28.4S, v28.4S,v25.s[1] +sub v20.4s, v26.4s, v30.4s +add v26.4s, v26.4s, v30.4s +sqrdmulh v30.4S, v27.4S, v24.s[1] +str q16, [x0, #192] +mla v28.4S, v18.4S, v31.s[0] +ldr q18, [x0, #736] +mul v27.4S, v27.4S,v25.s[1] +sub v16.4s, v21.4s, v29.4s +add v21.4s, v21.4s, v29.4s +sqrdmulh v29.4S, v12.4S, v24.s[2] +str q15, [x0, #320] +mla v27.4S, v30.4S, v31.s[0] +ldr q30, [x0, #480] +mul v12.4S, v12.4S,v25.s[2] +sub v15.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +sqrdmulh v28.4S, v11.4S, v24.s[2] +str q3, [x0, #448] +mla v12.4S, v29.4S, v31.s[0] +ldr q29, [x0, #352] +mul v11.4S, v11.4S,v25.s[2] +sub v3.4s, v26.4s, v27.4s +add v26.4s, v26.4s, v27.4s +sqrdmulh v27.4S, v21.4S, v22.s[0] +str q13, [x0, #832] +mla v11.4S, v28.4S, v31.s[0] +ldr q28, [x0, #96] +mul v21.4S, v21.4S,v23.s[0] +sub v13.4s, v16.4s, v12.4s +add v16.4s, v16.4s, v12.4s +sqrdmulh v12.4S, v15.4S, v22.s[1] +str q19, [x0, #960] +mla v21.4S, v27.4S, v31.s[0] +ldr q27, [x0, #224] +mul v15.4S, v15.4S,v23.s[1] +sub v19.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v22.s[3] +str q2, [x0, #576] +mla v15.4S, v12.4S, v31.s[0] +mul v13.4S, v13.4S,v23.s[3] +sub v12.4s, v26.4s, v21.4s 
+add v26.4s, v26.4s, v21.4s +sqrdmulh v21.4S, v16.4S, v22.s[2] +str q0, [x0, #704] +mla v13.4S, v11.4S, v31.s[0] +mul v16.4S, v16.4S,v23.s[2] +sub v11.4s, v3.4s, v15.4s +add v3.4s, v3.4s, v15.4s +sqrdmulh v15.4S, v14.4S, v24.s[0] +mla v16.4S, v21.4S, v31.s[0] +mul v14.4S, v14.4S,v25.s[0] +sub v21.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +sqrdmulh v13.4S, v1.4S, v24.s[0] +mla v14.4S, v15.4S, v31.s[0] +mul v1.4S, v1.4S,v25.s[0] +sub v15.4s, v20.4s, v16.4s +add v20.4s, v20.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v24.s[0] +mla v1.4S, v13.4S, v31.s[0] +ldr q13, [x0, #1008] +mul v17.4S, v17.4S,v25.s[0] +sub v0.4s, v30.4s, v14.4s +add v30.4s, v30.4s, v14.4s +sqrdmulh v14.4S, v18.4S, v24.s[0] +mla v17.4S, v16.4S, v31.s[0] +ldr q16, [x0, #880] +mul v18.4S, v18.4S,v25.s[0] +sub v2.4s, v29.4s, v1.4s +add v29.4s, v29.4s, v1.4s +sqrdmulh v1.4S, v30.4S, v24.s[1] +str q26, [x0, #80] +mla v18.4S, v14.4S, v31.s[0] +ldr q14, [x0, #624] +mul v30.4S, v30.4S,v25.s[1] +sub v26.4s, v28.4s, v17.4s +add v28.4s, v28.4s, v17.4s +sqrdmulh v17.4S, v29.4S, v24.s[1] +str q12, [x0, #208] +mla v30.4S, v1.4S, v31.s[0] +ldr q1, [x0, #752] +mul v29.4S, v29.4S,v25.s[1] +sub v12.4s, v27.4s, v18.4s +add v27.4s, v27.4s, v18.4s +sqrdmulh v18.4S, v0.4S, v24.s[2] +str q3, [x0, #336] +mla v29.4S, v17.4S, v31.s[0] +ldr q17, [x0, #496] +mul v0.4S, v0.4S,v25.s[2] +sub v3.4s, v27.4s, v30.4s +add v27.4s, v27.4s, v30.4s +sqrdmulh v30.4S, v2.4S, v24.s[2] +str q11, [x0, #464] +mla v0.4S, v18.4S, v31.s[0] +ldr q18, [x0, #368] +mul v2.4S, v2.4S,v25.s[2] +sub v11.4s, v28.4s, v29.4s +add v28.4s, v28.4s, v29.4s +sqrdmulh v29.4S, v27.4S, v22.s[0] +str q19, [x0, #848] +mla v2.4S, v30.4S, v31.s[0] +ldr q30, [x0, #112] +mul v27.4S, v27.4S,v23.s[0] +sub v19.4s, v12.4s, v0.4s +add v12.4s, v12.4s, v0.4s +sqrdmulh v0.4S, v3.4S, v22.s[1] +str q21, [x0, #976] +mla v27.4S, v29.4S, v31.s[0] +ldr q29, [x0, #240] +mul v3.4S, v3.4S,v23.s[1] +sub v21.4s, v26.4s, v2.4s +add v26.4s, v26.4s, v2.4s +sqrdmulh v2.4S, v19.4S, v22.s[3] +str 
q20, [x0, #592] +mla v3.4S, v0.4S, v31.s[0] +mul v19.4S, v19.4S,v23.s[3] +sub v0.4s, v28.4s, v27.4s +add v28.4s, v28.4s, v27.4s +sqrdmulh v27.4S, v12.4S, v22.s[2] +str q15, [x0, #720] +mla v19.4S, v2.4S, v31.s[0] +mul v12.4S, v12.4S,v23.s[2] +sub v2.4s, v11.4s, v3.4s +add v11.4s, v11.4s, v3.4s +sqrdmulh v3.4S, v13.4S, v24.s[0] +mla v12.4S, v27.4S, v31.s[0] +mul v13.4S, v13.4S,v25.s[0] +sub v27.4s, v21.4s, v19.4s +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v24.s[0] +mla v13.4S, v3.4S, v31.s[0] +mul v16.4S, v16.4S,v25.s[0] +sub v3.4s, v26.4s, v12.4s +add v26.4s, v26.4s, v12.4s +sqrdmulh v12.4S, v14.4S, v24.s[0] +mla v16.4S, v19.4S, v31.s[0] +ldr q19, [x0, #896] +mul v14.4S, v14.4S,v25.s[0] +sub v15.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +sqrdmulh v13.4S, v1.4S, v24.s[0] +mla v14.4S, v12.4S, v31.s[0] +ldr q12, [x0, #768] +mul v1.4S, v1.4S,v25.s[0] +sub v20.4s, v18.4s, v16.4s +add v18.4s, v18.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v24.s[1] +str q28, [x0, #96] +mla v1.4S, v13.4S, v31.s[0] +ldr q13, [x0, #512] +mul v17.4S, v17.4S,v25.s[1] +sub v28.4s, v30.4s, v14.4s +add v30.4s, v30.4s, v14.4s +sqrdmulh v14.4S, v18.4S, v24.s[1] +str q0, [x0, #224] +mla v17.4S, v16.4S, v31.s[0] +ldr q16, [x0, #640] +mul v18.4S, v18.4S,v25.s[1] +sub v0.4s, v29.4s, v1.4s +add v29.4s, v29.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v24.s[2] +str q11, [x0, #352] +mla v18.4S, v14.4S, v31.s[0] +ldr q14, [x0, #384] +mul v15.4S, v15.4S,v25.s[2] +sub v11.4s, v29.4s, v17.4s +add v29.4s, v29.4s, v17.4s +sqrdmulh v17.4S, v20.4S, v24.s[2] +str q2, [x0, #480] +mla v15.4S, v1.4S, v31.s[0] +ldr q1, [x0, #256] +mul v20.4S, v20.4S,v25.s[2] +sub v2.4s, v30.4s, v18.4s +add v30.4s, v30.4s, v18.4s +sqrdmulh v18.4S, v29.4S, v22.s[0] +str q21, [x0, #864] +mla v20.4S, v17.4S, v31.s[0] +ldr q17, [x0, #0] +mul v29.4S, v29.4S,v23.s[0] +sub v21.4s, v0.4s, v15.4s +add v0.4s, v0.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v22.s[1] +str q27, [x0, #992] +mla v29.4S, v18.4S, v31.s[0] +ldr q18, [x0, #128] +mul v11.4S, 
v11.4S,v23.s[1] +sub v27.4s, v28.4s, v20.4s +add v28.4s, v28.4s, v20.4s +sqrdmulh v20.4S, v21.4S, v22.s[3] +str q26, [x0, #608] +mla v11.4S, v15.4S, v31.s[0] +mul v21.4S, v21.4S,v23.s[3] +sub v15.4s, v30.4s, v29.4s +add v30.4s, v30.4s, v29.4s +sqrdmulh v29.4S, v0.4S, v22.s[2] +str q3, [x0, #736] +mla v21.4S, v20.4S, v31.s[0] +mul v0.4S, v0.4S,v23.s[2] +sub v20.4s, v2.4s, v11.4s +add v2.4s, v2.4s, v11.4s +sqrdmulh v11.4S, v19.4S, v24.s[0] +mla v0.4S, v29.4S, v31.s[0] +mul v19.4S, v19.4S,v25.s[0] +sub v29.4s, v27.4s, v21.4s +add v27.4s, v27.4s, v21.4s +sqrdmulh v21.4S, v12.4S, v24.s[0] +mla v19.4S, v11.4S, v31.s[0] +mul v12.4S, v12.4S,v25.s[0] +sub v11.4s, v28.4s, v0.4s +add v28.4s, v28.4s, v0.4s +sqrdmulh v0.4S, v13.4S, v24.s[0] +mla v12.4S, v21.4S, v31.s[0] +ldr q21, [x0, #912] +mul v13.4S, v13.4S,v25.s[0] +sub v3.4s, v14.4s, v19.4s +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v24.s[0] +mla v13.4S, v0.4S, v31.s[0] +ldr q0, [x0, #784] +mul v16.4S, v16.4S,v25.s[0] +sub v26.4s, v1.4s, v12.4s +add v1.4s, v1.4s, v12.4s +sqrdmulh v12.4S, v14.4S, v24.s[1] +str q30, [x0, #112] +mla v16.4S, v19.4S, v31.s[0] +ldr q19, [x0, #528] +mul v14.4S, v14.4S,v25.s[1] +sub v30.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +sqrdmulh v13.4S, v1.4S, v24.s[1] +str q15, [x0, #240] +mla v14.4S, v12.4S, v31.s[0] +ldr q12, [x0, #656] +mul v1.4S, v1.4S,v25.s[1] +sub v15.4s, v18.4s, v16.4s +add v18.4s, v18.4s, v16.4s +sqrdmulh v16.4S, v3.4S, v24.s[2] +str q2, [x0, #368] +mla v1.4S, v13.4S, v31.s[0] +ldr q13, [x0, #400] +mul v3.4S, v3.4S,v25.s[2] +sub v2.4s, v18.4s, v14.4s +add v18.4s, v18.4s, v14.4s +sqrdmulh v14.4S, v26.4S, v24.s[2] +str q20, [x0, #496] +mla v3.4S, v16.4S, v31.s[0] +ldr q16, [x0, #272] +mul v26.4S, v26.4S,v25.s[2] +sub v20.4s, v17.4s, v1.4s +add v17.4s, v17.4s, v1.4s +sqrdmulh v1.4S, v18.4S, v22.s[0] +str q27, [x0, #880] +mla v26.4S, v14.4S, v31.s[0] +ldr q14, [x0, #16] +mul v18.4S, v18.4S,v23.s[0] +sub v27.4s, v15.4s, v3.4s +add v15.4s, v15.4s, v3.4s +sqrdmulh v3.4S, 
v2.4S, v22.s[1] +str q29, [x0, #1008] +mla v18.4S, v1.4S, v31.s[0] +ldr q1, [x0, #144] +mul v2.4S, v2.4S,v23.s[1] +sub v29.4s, v30.4s, v26.4s +add v30.4s, v30.4s, v26.4s +sqrdmulh v26.4S, v27.4S, v22.s[3] +str q28, [x0, #624] +mla v2.4S, v3.4S, v31.s[0] +mul v27.4S, v27.4S,v23.s[3] +sub v3.4s, v17.4s, v18.4s +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v15.4S, v22.s[2] +str q11, [x0, #752] +mla v27.4S, v26.4S, v31.s[0] +mul v15.4S, v15.4S,v23.s[2] +sub v26.4s, v20.4s, v2.4s +add v20.4s, v20.4s, v2.4s +sqrdmulh v2.4S, v21.4S, v24.s[0] +mla v15.4S, v18.4S, v31.s[0] +mul v21.4S, v21.4S,v25.s[0] +sub v18.4s, v29.4s, v27.4s +add v29.4s, v29.4s, v27.4s +sqrdmulh v27.4S, v0.4S, v24.s[0] +mla v21.4S, v2.4S, v31.s[0] +mul v0.4S, v0.4S,v25.s[0] +sub v2.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +sqrdmulh v15.4S, v19.4S, v24.s[0] +mla v0.4S, v27.4S, v31.s[0] +ldr q27, [x0, #928] +mul v19.4S, v19.4S,v25.s[0] +sub v11.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v12.4S, v24.s[0] +mla v19.4S, v15.4S, v31.s[0] +ldr q15, [x0, #800] +mul v12.4S, v12.4S,v25.s[0] +sub v28.4s, v16.4s, v0.4s +add v16.4s, v16.4s, v0.4s +sqrdmulh v0.4S, v13.4S, v24.s[1] +str q17, [x0, #0] +mla v12.4S, v21.4S, v31.s[0] +ldr q21, [x0, #544] +mul v13.4S, v13.4S,v25.s[1] +sub v17.4s, v14.4s, v19.4s +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v24.s[1] +str q3, [x0, #128] +mla v13.4S, v0.4S, v31.s[0] +ldr q0, [x0, #672] +mul v16.4S, v16.4S,v25.s[1] +sub v3.4s, v1.4s, v12.4s +add v1.4s, v1.4s, v12.4s +sqrdmulh v12.4S, v11.4S, v24.s[2] +str q20, [x0, #256] +mla v16.4S, v19.4S, v31.s[0] +ldr q19, [x0, #416] +mul v11.4S, v11.4S,v25.s[2] +sub v20.4s, v1.4s, v13.4s +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v28.4S, v24.s[2] +str q26, [x0, #384] +mla v11.4S, v12.4S, v31.s[0] +ldr q12, [x0, #288] +mul v28.4S, v28.4S,v25.s[2] +sub v26.4s, v14.4s, v16.4s +add v14.4s, v14.4s, v16.4s +sqrdmulh v16.4S, v1.4S, v22.s[0] +str q29, [x0, #768] +mla v28.4S, v13.4S, v31.s[0] +ldr q13, [x0, #32] 
+mul v1.4S, v1.4S,v23.s[0] +sub v29.4s, v3.4s, v11.4s +add v3.4s, v3.4s, v11.4s +sqrdmulh v11.4S, v20.4S, v22.s[1] +str q18, [x0, #896] +mla v1.4S, v16.4S, v31.s[0] +ldr q16, [x0, #160] +mul v20.4S, v20.4S,v23.s[1] +sub v18.4s, v17.4s, v28.4s +add v17.4s, v17.4s, v28.4s +sqrdmulh v28.4S, v29.4S, v22.s[3] +str q30, [x0, #512] +mla v20.4S, v11.4S, v31.s[0] +mul v29.4S, v29.4S,v23.s[3] +sub v11.4s, v14.4s, v1.4s +add v14.4s, v14.4s, v1.4s +sqrdmulh v1.4S, v3.4S, v22.s[2] +str q2, [x0, #640] +mla v29.4S, v28.4S, v31.s[0] +mul v3.4S, v3.4S,v23.s[2] +sub v28.4s, v26.4s, v20.4s +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v27.4S, v24.s[0] +mla v3.4S, v1.4S, v31.s[0] +mul v27.4S, v27.4S,v25.s[0] +sub v1.4s, v18.4s, v29.4s +add v18.4s, v18.4s, v29.4s +sqrdmulh v29.4S, v15.4S, v24.s[0] +mla v27.4S, v20.4S, v31.s[0] +mul v15.4S, v15.4S,v25.s[0] +sub v20.4s, v17.4s, v3.4s +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v24.s[0] +mla v15.4S, v29.4S, v31.s[0] +ldr q29, [x0, #944] +mul v21.4S, v21.4S,v25.s[0] +sub v2.4s, v19.4s, v27.4s +add v19.4s, v19.4s, v27.4s +sqrdmulh v27.4S, v0.4S, v24.s[0] +mla v21.4S, v3.4S, v31.s[0] +ldr q3, [x0, #816] +mul v0.4S, v0.4S,v25.s[0] +sub v30.4s, v12.4s, v15.4s +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v19.4S, v24.s[1] +str q14, [x0, #16] +mla v0.4S, v27.4S, v31.s[0] +ldr q27, [x0, #560] +mul v19.4S, v19.4S,v25.s[1] +sub v14.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v12.4S, v24.s[1] +str q11, [x0, #144] +mla v19.4S, v15.4S, v31.s[0] +ldr q15, [x0, #688] +mul v12.4S, v12.4S,v25.s[1] +sub v11.4s, v16.4s, v0.4s +add v16.4s, v16.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v24.s[2] +str q26, [x0, #272] +mla v12.4S, v21.4S, v31.s[0] +ldr q21, [x0, #432] +mul v2.4S, v2.4S,v25.s[2] +sub v26.4s, v16.4s, v19.4s +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v30.4S, v24.s[2] +str q28, [x0, #400] +mla v2.4S, v0.4S, v31.s[0] +ldr q0, [x0, #304] +mul v30.4S, v30.4S,v25.s[2] +sub v28.4s, v13.4s, v12.4s +add v13.4s, v13.4s, v12.4s 
+sqrdmulh v12.4S, v16.4S, v22.s[0] +str q18, [x0, #784] +mla v30.4S, v19.4S, v31.s[0] +ldr q19, [x0, #48] +mul v16.4S, v16.4S,v23.s[0] +sub v18.4s, v11.4s, v2.4s +add v11.4s, v11.4s, v2.4s +sqrdmulh v2.4S, v26.4S, v22.s[1] +str q1, [x0, #912] +mla v16.4S, v12.4S, v31.s[0] +ldr q12, [x0, #176] +mul v26.4S, v26.4S,v23.s[1] +sub v1.4s, v14.4s, v30.4s +add v14.4s, v14.4s, v30.4s +sqrdmulh v30.4S, v18.4S, v22.s[3] +str q17, [x0, #528] +mla v26.4S, v2.4S, v31.s[0] +mul v18.4S, v18.4S,v23.s[3] +sub v2.4s, v13.4s, v16.4s +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v11.4S, v22.s[2] +str q20, [x0, #656] +mla v18.4S, v30.4S, v31.s[0] +mul v11.4S, v11.4S,v23.s[2] +sub v30.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sqrdmulh v26.4S, v29.4S, v24.s[0] +mla v11.4S, v16.4S, v31.s[0] +mul v29.4S, v29.4S,v25.s[0] +sub v16.4s, v1.4s, v18.4s +add v1.4s, v1.4s, v18.4s +sqrdmulh v18.4S, v3.4S, v24.s[0] +mla v29.4S, v26.4S, v31.s[0] +mul v3.4S, v3.4S,v25.s[0] +sub v26.4s, v14.4s, v11.4s +add v14.4s, v14.4s, v11.4s +sqrdmulh v11.4S, v27.4S, v24.s[0] +mla v3.4S, v18.4S, v31.s[0] +mul v27.4S, v27.4S,v25.s[0] +sub v18.4s, v21.4s, v29.4s +add v21.4s, v21.4s, v29.4s +sqrdmulh v29.4S, v15.4S, v24.s[0] +mla v27.4S, v11.4S, v31.s[0] +mul v15.4S, v15.4S,v25.s[0] +sub v11.4s, v0.4s, v3.4s +add v0.4s, v0.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v24.s[1] +str q13, [x0, #32] +mla v15.4S, v29.4S, v31.s[0] +mul v21.4S, v21.4S,v25.s[1] +sub v29.4s, v19.4s, v27.4s +add v19.4s, v19.4s, v27.4s +sqrdmulh v27.4S, v0.4S, v24.s[1] +str q2, [x0, #160] +mla v21.4S, v3.4S, v31.s[0] +mul v0.4S, v0.4S,v25.s[1] +sub v3.4s, v12.4s, v15.4s +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v18.4S, v24.s[2] +str q28, [x0, #288] +mla v0.4S, v27.4S, v31.s[0] +mul v18.4S, v18.4S,v25.s[2] +sub v27.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +sqrdmulh v21.4S, v11.4S, v24.s[2] +str q30, [x0, #416] +mla v18.4S, v15.4S, v31.s[0] +mul v11.4S, v11.4S,v25.s[2] +sub v15.4s, v19.4s, v0.4s +add v19.4s, v19.4s, v0.4s +sqrdmulh v0.4S, 
v12.4S, v22.s[0] +str q1, [x0, #800] +mla v11.4S, v21.4S, v31.s[0] +mul v12.4S, v12.4S,v23.s[0] +sub v21.4s, v3.4s, v18.4s +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v27.4S, v22.s[1] +str q16, [x0, #928] +mla v12.4S, v0.4S, v31.s[0] +mul v27.4S, v27.4S,v23.s[1] +sub v0.4s, v29.4s, v11.4s +add v29.4s, v29.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v22.s[3] +str q14, [x0, #544] +mla v27.4S, v18.4S, v31.s[0] +mul v21.4S, v21.4S,v23.s[3] +sub v18.4s, v19.4s, v12.4s +add v19.4s, v19.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v22.s[2] +str q26, [x0, #672] +mla v21.4S, v11.4S, v31.s[0] +mul v3.4S, v3.4S,v23.s[2] +sub v11.4s, v15.4s, v27.4s +add v15.4s, v15.4s, v27.4s +mla v3.4S, v12.4S, v31.s[0] +sub v12.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +sub v21.4s, v29.4s, v3.4s +add v29.4s, v29.4s, v3.4s +str q19, [x0, #48] +str q18, [x0, #176] +str q15, [x0, #304] +str q11, [x0, #432] +str q0, [x0, #816] +str q12, [x0, #944] +str q29, [x0, #560] +str q21, [x0, #688] +ldr q4, [x0, #112] +ldr q5, [x0, #96] +ldr q6, [x0, #64] +ldr q7, [x0, #80] +ldr q8, [x0, #48] +ldr q9, [x17, #+64] +ldr q10, [x17, #+80] +ldr q17, [x17, #+96] +ldr q20, [x17, #+112] +ldr q13, [x0, #32] +ldr q2, [x0, #0] +ldr q28, [x0, #16] +sqrdmulh v30.4S, v4.4S, v10.s[0] +mul v4.4S, v4.4S,v9.s[0] +sqrdmulh v1.4S, v5.4S, v10.s[0] +mla v4.4S, v30.4S, v31.s[0] +mul v5.4S, v5.4S,v9.s[0] +sqrdmulh v30.4S, v6.4S, v10.s[0] +mla v5.4S, v1.4S, v31.s[0] +ldr q1, [x0, #240] +mul v6.4S, v6.4S,v9.s[0] +sub v16.4s, v8.4s, v4.4s +add v8.4s, v8.4s, v4.4s +sqrdmulh v4.4S, v7.4S, v10.s[0] +mla v6.4S, v30.4S, v31.s[0] +ldr q30, [x0, #224] +mul v7.4S, v7.4S,v9.s[0] +sub v14.4s, v13.4s, v5.4s +add v13.4s, v13.4s, v5.4s +sqrdmulh v5.4S, v8.4S, v10.s[1] +mla v7.4S, v4.4S, v31.s[0] +ldr q4, [x0, #192] +mul v8.4S, v8.4S,v9.s[1] +sub v26.4s, v2.4s, v6.4s +add v2.4s, v2.4s, v6.4s +sqrdmulh v6.4S, v13.4S, v10.s[1] +mla v8.4S, v5.4S, v31.s[0] +ldr q5, [x0, #208] +mul v13.4S, v13.4S,v9.s[1] +sub v27.4s, v28.4s, v7.4s +add v28.4s, v28.4s, v7.4s 
+sqrdmulh v7.4S, v16.4S, v10.s[2] +mla v13.4S, v6.4S, v31.s[0] +ldr q6, [x0, #176] +mul v16.4S, v16.4S,v9.s[2] +sub v3.4s, v28.4s, v8.4s +add v28.4s, v28.4s, v8.4s +ldr q8, [x17, #+128] +ldr q25, [x17, #+144] +ldr q24, [x17, #+160] +ldr q23, [x17, #+176] +sqrdmulh v22.4S, v14.4S, v10.s[2] +mla v16.4S, v7.4S, v31.s[0] +ldr q7, [x0, #160] +mul v14.4S, v14.4S,v9.s[2] +sub v19.4s, v2.4s, v13.4s +add v2.4s, v2.4s, v13.4s +sqrdmulh v13.4S, v28.4S, v20.s[0] +mla v14.4S, v22.4S, v31.s[0] +ldr q22, [x0, #128] +mul v28.4S, v28.4S,v17.s[0] +sub v18.4s, v27.4s, v16.4s +add v27.4s, v27.4s, v16.4s +sqrdmulh v16.4S, v3.4S, v20.s[1] +mla v28.4S, v13.4S, v31.s[0] +ldr q13, [x0, #144] +mul v3.4S, v3.4S,v17.s[1] +sub v15.4s, v26.4s, v14.4s +add v26.4s, v26.4s, v14.4s +sqrdmulh v14.4S, v18.4S, v20.s[3] +mla v3.4S, v16.4S, v31.s[0] +mul v18.4S, v18.4S,v17.s[3] +sub v16.4s, v2.4s, v28.4s +add v2.4s, v2.4s, v28.4s +sqrdmulh v28.4S, v27.4S, v20.s[2] +mla v18.4S, v14.4S, v31.s[0] +mul v27.4S, v27.4S,v17.s[2] +sub v14.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sqrdmulh v3.4S, v1.4S, v25.s[0] +mla v27.4S, v28.4S, v31.s[0] +mul v1.4S, v1.4S,v8.s[0] +sub v28.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sqrdmulh v18.4S, v30.4S, v25.s[0] +mla v1.4S, v3.4S, v31.s[0] +mul v30.4S, v30.4S,v8.s[0] +sub v3.4s, v26.4s, v27.4s +add v26.4s, v26.4s, v27.4s +sqrdmulh v20.4S, v4.4S, v25.s[0] +mla v30.4S, v18.4S, v31.s[0] +ldr q18, [x0, #368] +mul v4.4S, v4.4S,v8.s[0] +sub v17.4s, v6.4s, v1.4s +add v6.4s, v6.4s, v1.4s +sqrdmulh v1.4S, v5.4S, v25.s[0] +mla v4.4S, v20.4S, v31.s[0] +ldr q20, [x0, #352] +mul v5.4S, v5.4S,v8.s[0] +sub v10.4s, v7.4s, v30.4s +add v7.4s, v7.4s, v30.4s +sqrdmulh v30.4S, v6.4S, v25.s[1] +str q2, [x0, #0] +mla v5.4S, v1.4S, v31.s[0] +ldr q1, [x0, #320] +mul v6.4S, v6.4S,v8.s[1] +sub v2.4s, v22.4s, v4.4s +add v22.4s, v22.4s, v4.4s +sqrdmulh v4.4S, v7.4S, v25.s[1] +str q16, [x0, #16] +mla v6.4S, v30.4S, v31.s[0] +ldr q30, [x0, #336] +mul v7.4S, v7.4S,v8.s[1] +sub v16.4s, v13.4s, 
v5.4s +add v13.4s, v13.4s, v5.4s +sqrdmulh v5.4S, v17.4S, v25.s[2] +str q19, [x0, #32] +mla v7.4S, v4.4S, v31.s[0] +ldr q4, [x0, #304] +mul v17.4S, v17.4S,v8.s[2] +sub v19.4s, v13.4s, v6.4s +add v13.4s, v13.4s, v6.4s +ldr q6, [x17, #+192] +ldr q9, [x17, #+208] +ldr q27, [x17, #+224] +ldr q11, [x17, #+240] +sqrdmulh v0.4S, v10.4S, v25.s[2] +str q14, [x0, #48] +mla v17.4S, v5.4S, v31.s[0] +ldr q5, [x0, #288] +mul v10.4S, v10.4S,v8.s[2] +sub v14.4s, v22.4s, v7.4s +add v22.4s, v22.4s, v7.4s +sqrdmulh v7.4S, v13.4S, v23.s[0] +str q15, [x0, #96] +mla v10.4S, v0.4S, v31.s[0] +ldr q0, [x0, #256] +mul v13.4S, v13.4S,v24.s[0] +sub v15.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v19.4S, v23.s[1] +str q28, [x0, #112] +mla v13.4S, v7.4S, v31.s[0] +ldr q7, [x0, #272] +mul v19.4S, v19.4S,v24.s[1] +sub v28.4s, v2.4s, v10.4s +add v2.4s, v2.4s, v10.4s +sqrdmulh v10.4S, v15.4S, v23.s[3] +str q26, [x0, #64] +mla v19.4S, v17.4S, v31.s[0] +mul v15.4S, v15.4S,v24.s[3] +sub v17.4s, v22.4s, v13.4s +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v23.s[2] +str q3, [x0, #80] +mla v15.4S, v10.4S, v31.s[0] +mul v16.4S, v16.4S,v24.s[2] +sub v10.4s, v14.4s, v19.4s +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v9.s[0] +mla v16.4S, v13.4S, v31.s[0] +mul v18.4S, v18.4S,v6.s[0] +sub v13.4s, v28.4s, v15.4s +add v28.4s, v28.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v9.s[0] +mla v18.4S, v19.4S, v31.s[0] +mul v20.4S, v20.4S,v6.s[0] +sub v19.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v23.4S, v1.4S, v9.s[0] +mla v20.4S, v15.4S, v31.s[0] +ldr q15, [x0, #496] +mul v1.4S, v1.4S,v6.s[0] +sub v24.4s, v4.4s, v18.4s +add v4.4s, v4.4s, v18.4s +sqrdmulh v18.4S, v30.4S, v9.s[0] +mla v1.4S, v23.4S, v31.s[0] +ldr q23, [x0, #480] +mul v30.4S, v30.4S,v6.s[0] +sub v25.4s, v5.4s, v20.4s +add v5.4s, v5.4s, v20.4s +sqrdmulh v20.4S, v4.4S, v9.s[1] +str q22, [x0, #128] +mla v30.4S, v18.4S, v31.s[0] +ldr q18, [x0, #448] +mul v4.4S, v4.4S,v6.s[1] +sub v22.4s, v0.4s, v1.4s +add v0.4s, 
v0.4s, v1.4s +sqrdmulh v1.4S, v5.4S, v9.s[1] +str q17, [x0, #144] +mla v4.4S, v20.4S, v31.s[0] +ldr q20, [x0, #464] +mul v5.4S, v5.4S,v6.s[1] +sub v17.4s, v7.4s, v30.4s +add v7.4s, v7.4s, v30.4s +sqrdmulh v30.4S, v24.4S, v9.s[2] +str q14, [x0, #160] +mla v5.4S, v1.4S, v31.s[0] +ldr q1, [x0, #432] +mul v24.4S, v24.4S,v6.s[2] +sub v14.4s, v7.4s, v4.4s +add v7.4s, v7.4s, v4.4s +ldr q4, [x17, #+256] +ldr q8, [x17, #+272] +ldr q16, [x17, #+288] +ldr q3, [x17, #+304] +sqrdmulh v26.4S, v25.4S, v9.s[2] +str q10, [x0, #176] +mla v24.4S, v30.4S, v31.s[0] +ldr q30, [x0, #416] +mul v25.4S, v25.4S,v6.s[2] +sub v10.4s, v0.4s, v5.4s +add v0.4s, v0.4s, v5.4s +sqrdmulh v5.4S, v7.4S, v11.s[0] +str q28, [x0, #224] +mla v25.4S, v26.4S, v31.s[0] +ldr q26, [x0, #384] +mul v7.4S, v7.4S,v27.s[0] +sub v28.4s, v17.4s, v24.4s +add v17.4s, v17.4s, v24.4s +sqrdmulh v24.4S, v14.4S, v11.s[1] +str q13, [x0, #240] +mla v7.4S, v5.4S, v31.s[0] +ldr q5, [x0, #400] +mul v14.4S, v14.4S,v27.s[1] +sub v13.4s, v22.4s, v25.4s +add v22.4s, v22.4s, v25.4s +sqrdmulh v25.4S, v28.4S, v11.s[3] +str q2, [x0, #192] +mla v14.4S, v24.4S, v31.s[0] +mul v28.4S, v28.4S,v27.s[3] +sub v24.4s, v0.4s, v7.4s +add v0.4s, v0.4s, v7.4s +sqrdmulh v7.4S, v17.4S, v11.s[2] +str q19, [x0, #208] +mla v28.4S, v25.4S, v31.s[0] +mul v17.4S, v17.4S,v27.s[2] +sub v25.4s, v10.4s, v14.4s +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v8.s[0] +mla v17.4S, v7.4S, v31.s[0] +mul v15.4S, v15.4S,v4.s[0] +sub v7.4s, v13.4s, v28.4s +add v13.4s, v13.4s, v28.4s +sqrdmulh v28.4S, v23.4S, v8.s[0] +mla v15.4S, v14.4S, v31.s[0] +mul v23.4S, v23.4S,v4.s[0] +sub v14.4s, v22.4s, v17.4s +add v22.4s, v22.4s, v17.4s +sqrdmulh v11.4S, v18.4S, v8.s[0] +mla v23.4S, v28.4S, v31.s[0] +ldr q28, [x0, #624] +mul v18.4S, v18.4S,v4.s[0] +sub v27.4s, v1.4s, v15.4s +add v1.4s, v1.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v8.s[0] +mla v18.4S, v11.4S, v31.s[0] +ldr q11, [x0, #608] +mul v20.4S, v20.4S,v4.s[0] +sub v9.4s, v30.4s, v23.4s +add v30.4s, v30.4s, v23.4s 
+sqrdmulh v23.4S, v1.4S, v8.s[1] +str q0, [x0, #256] +mla v20.4S, v15.4S, v31.s[0] +ldr q15, [x0, #576] +mul v1.4S, v1.4S,v4.s[1] +sub v0.4s, v26.4s, v18.4s +add v26.4s, v26.4s, v18.4s +sqrdmulh v18.4S, v30.4S, v8.s[1] +str q24, [x0, #272] +mla v1.4S, v23.4S, v31.s[0] +ldr q23, [x0, #592] +mul v30.4S, v30.4S,v4.s[1] +sub v24.4s, v5.4s, v20.4s +add v5.4s, v5.4s, v20.4s +sqrdmulh v20.4S, v27.4S, v8.s[2] +str q10, [x0, #288] +mla v30.4S, v18.4S, v31.s[0] +ldr q18, [x0, #560] +mul v27.4S, v27.4S,v4.s[2] +sub v10.4s, v5.4s, v1.4s +add v5.4s, v5.4s, v1.4s +ldr q1, [x17, #+320] +ldr q6, [x17, #+336] +ldr q17, [x17, #+352] +ldr q19, [x17, #+368] +sqrdmulh v2.4S, v9.4S, v8.s[2] +str q25, [x0, #304] +mla v27.4S, v20.4S, v31.s[0] +ldr q20, [x0, #544] +mul v9.4S, v9.4S,v4.s[2] +sub v25.4s, v26.4s, v30.4s +add v26.4s, v26.4s, v30.4s +sqrdmulh v30.4S, v5.4S, v3.s[0] +str q13, [x0, #352] +mla v9.4S, v2.4S, v31.s[0] +ldr q2, [x0, #512] +mul v5.4S, v5.4S,v16.s[0] +sub v13.4s, v24.4s, v27.4s +add v24.4s, v24.4s, v27.4s +sqrdmulh v27.4S, v10.4S, v3.s[1] +str q7, [x0, #368] +mla v5.4S, v30.4S, v31.s[0] +ldr q30, [x0, #528] +mul v10.4S, v10.4S,v16.s[1] +sub v7.4s, v0.4s, v9.4s +add v0.4s, v0.4s, v9.4s +sqrdmulh v9.4S, v13.4S, v3.s[3] +str q22, [x0, #320] +mla v10.4S, v27.4S, v31.s[0] +mul v13.4S, v13.4S,v16.s[3] +sub v27.4s, v26.4s, v5.4s +add v26.4s, v26.4s, v5.4s +sqrdmulh v5.4S, v24.4S, v3.s[2] +str q14, [x0, #336] +mla v13.4S, v9.4S, v31.s[0] +mul v24.4S, v24.4S,v16.s[2] +sub v9.4s, v25.4s, v10.4s +add v25.4s, v25.4s, v10.4s +sqrdmulh v10.4S, v28.4S, v6.s[0] +mla v24.4S, v5.4S, v31.s[0] +mul v28.4S, v28.4S,v1.s[0] +sub v5.4s, v7.4s, v13.4s +add v7.4s, v7.4s, v13.4s +sqrdmulh v13.4S, v11.4S, v6.s[0] +mla v28.4S, v10.4S, v31.s[0] +mul v11.4S, v11.4S,v1.s[0] +sub v10.4s, v0.4s, v24.4s +add v0.4s, v0.4s, v24.4s +sqrdmulh v3.4S, v15.4S, v6.s[0] +mla v11.4S, v13.4S, v31.s[0] +ldr q13, [x0, #752] +mul v15.4S, v15.4S,v1.s[0] +sub v16.4s, v18.4s, v28.4s +add v18.4s, v18.4s, v28.4s +sqrdmulh 
v28.4S, v23.4S, v6.s[0] +mla v15.4S, v3.4S, v31.s[0] +ldr q3, [x0, #736] +mul v23.4S, v23.4S,v1.s[0] +sub v8.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +sqrdmulh v11.4S, v18.4S, v6.s[1] +str q26, [x0, #384] +mla v23.4S, v28.4S, v31.s[0] +ldr q28, [x0, #704] +mul v18.4S, v18.4S,v1.s[1] +sub v26.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v6.s[1] +str q27, [x0, #400] +mla v18.4S, v11.4S, v31.s[0] +ldr q11, [x0, #720] +mul v20.4S, v20.4S,v1.s[1] +sub v27.4s, v30.4s, v23.4s +add v30.4s, v30.4s, v23.4s +sqrdmulh v23.4S, v16.4S, v6.s[2] +str q25, [x0, #416] +mla v20.4S, v15.4S, v31.s[0] +ldr q15, [x0, #688] +mul v16.4S, v16.4S,v1.s[2] +sub v25.4s, v30.4s, v18.4s +add v30.4s, v30.4s, v18.4s +ldr q18, [x17, #+384] +ldr q4, [x17, #+400] +ldr q24, [x17, #+416] +ldr q14, [x17, #+432] +sqrdmulh v22.4S, v8.4S, v6.s[2] +str q9, [x0, #432] +mla v16.4S, v23.4S, v31.s[0] +ldr q23, [x0, #672] +mul v8.4S, v8.4S,v1.s[2] +sub v9.4s, v2.4s, v20.4s +add v2.4s, v2.4s, v20.4s +sqrdmulh v20.4S, v30.4S, v19.s[0] +str q7, [x0, #480] +mla v8.4S, v22.4S, v31.s[0] +ldr q22, [x0, #640] +mul v30.4S, v30.4S,v17.s[0] +sub v7.4s, v27.4s, v16.4s +add v27.4s, v27.4s, v16.4s +sqrdmulh v16.4S, v25.4S, v19.s[1] +str q5, [x0, #496] +mla v30.4S, v20.4S, v31.s[0] +ldr q20, [x0, #656] +mul v25.4S, v25.4S,v17.s[1] +sub v5.4s, v26.4s, v8.4s +add v26.4s, v26.4s, v8.4s +sqrdmulh v8.4S, v7.4S, v19.s[3] +str q0, [x0, #448] +mla v25.4S, v16.4S, v31.s[0] +mul v7.4S, v7.4S,v17.s[3] +sub v16.4s, v2.4s, v30.4s +add v2.4s, v2.4s, v30.4s +sqrdmulh v30.4S, v27.4S, v19.s[2] +str q10, [x0, #464] +mla v7.4S, v8.4S, v31.s[0] +mul v27.4S, v27.4S,v17.s[2] +sub v8.4s, v9.4s, v25.4s +add v9.4s, v9.4s, v25.4s +sqrdmulh v25.4S, v13.4S, v4.s[0] +mla v27.4S, v30.4S, v31.s[0] +mul v13.4S, v13.4S,v18.s[0] +sub v30.4s, v5.4s, v7.4s +add v5.4s, v5.4s, v7.4s +sqrdmulh v7.4S, v3.4S, v4.s[0] +mla v13.4S, v25.4S, v31.s[0] +mul v3.4S, v3.4S,v18.s[0] +sub v25.4s, v26.4s, v27.4s +add v26.4s, v26.4s, v27.4s +sqrdmulh 
v19.4S, v28.4S, v4.s[0] +mla v3.4S, v7.4S, v31.s[0] +ldr q7, [x0, #880] +mul v28.4S, v28.4S,v18.s[0] +sub v17.4s, v15.4s, v13.4s +add v15.4s, v15.4s, v13.4s +sqrdmulh v13.4S, v11.4S, v4.s[0] +mla v28.4S, v19.4S, v31.s[0] +ldr q19, [x0, #864] +mul v11.4S, v11.4S,v18.s[0] +sub v6.4s, v23.4s, v3.4s +add v23.4s, v23.4s, v3.4s +sqrdmulh v3.4S, v15.4S, v4.s[1] +str q2, [x0, #512] +mla v11.4S, v13.4S, v31.s[0] +ldr q13, [x0, #832] +mul v15.4S, v15.4S,v18.s[1] +sub v2.4s, v22.4s, v28.4s +add v22.4s, v22.4s, v28.4s +sqrdmulh v28.4S, v23.4S, v4.s[1] +str q16, [x0, #528] +mla v15.4S, v3.4S, v31.s[0] +ldr q3, [x0, #848] +mul v23.4S, v23.4S,v18.s[1] +sub v16.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v4.s[2] +str q9, [x0, #544] +mla v23.4S, v28.4S, v31.s[0] +ldr q28, [x0, #816] +mul v17.4S, v17.4S,v18.s[2] +sub v9.4s, v20.4s, v15.4s +add v20.4s, v20.4s, v15.4s +ldr q15, [x17, #+448] +ldr q1, [x17, #+464] +ldr q27, [x17, #+480] +ldr q10, [x17, #+496] +sqrdmulh v0.4S, v6.4S, v4.s[2] +str q8, [x0, #560] +mla v17.4S, v11.4S, v31.s[0] +ldr q11, [x0, #800] +mul v6.4S, v6.4S,v18.s[2] +sub v8.4s, v22.4s, v23.4s +add v22.4s, v22.4s, v23.4s +sqrdmulh v23.4S, v20.4S, v14.s[0] +str q5, [x0, #608] +mla v6.4S, v0.4S, v31.s[0] +ldr q0, [x0, #768] +mul v20.4S, v20.4S,v24.s[0] +sub v5.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v9.4S, v14.s[1] +str q30, [x0, #624] +mla v20.4S, v23.4S, v31.s[0] +ldr q23, [x0, #784] +mul v9.4S, v9.4S,v24.s[1] +sub v30.4s, v2.4s, v6.4s +add v2.4s, v2.4s, v6.4s +sqrdmulh v6.4S, v5.4S, v14.s[3] +str q26, [x0, #576] +mla v9.4S, v17.4S, v31.s[0] +mul v5.4S, v5.4S,v24.s[3] +sub v17.4s, v22.4s, v20.4s +add v22.4s, v22.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v14.s[2] +str q25, [x0, #592] +mla v5.4S, v6.4S, v31.s[0] +mul v16.4S, v16.4S,v24.s[2] +sub v6.4s, v8.4s, v9.4s +add v8.4s, v8.4s, v9.4s +sqrdmulh v9.4S, v7.4S, v1.s[0] +mla v16.4S, v20.4S, v31.s[0] +mul v7.4S, v7.4S,v15.s[0] +sub v20.4s, v30.4s, v5.4s +add v30.4s, 
v30.4s, v5.4s +sqrdmulh v5.4S, v19.4S, v1.s[0] +mla v7.4S, v9.4S, v31.s[0] +mul v19.4S, v19.4S,v15.s[0] +sub v9.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v14.4S, v13.4S, v1.s[0] +mla v19.4S, v5.4S, v31.s[0] +ldr q5, [x0, #1008] +mul v13.4S, v13.4S,v15.s[0] +sub v24.4s, v28.4s, v7.4s +add v28.4s, v28.4s, v7.4s +sqrdmulh v7.4S, v3.4S, v1.s[0] +mla v13.4S, v14.4S, v31.s[0] +ldr q14, [x0, #992] +mul v3.4S, v3.4S,v15.s[0] +sub v4.4s, v11.4s, v19.4s +add v11.4s, v11.4s, v19.4s +sqrdmulh v19.4S, v28.4S, v1.s[1] +str q22, [x0, #640] +mla v3.4S, v7.4S, v31.s[0] +ldr q7, [x0, #960] +mul v28.4S, v28.4S,v15.s[1] +sub v22.4s, v0.4s, v13.4s +add v0.4s, v0.4s, v13.4s +sqrdmulh v13.4S, v11.4S, v1.s[1] +str q17, [x0, #656] +mla v28.4S, v19.4S, v31.s[0] +ldr q19, [x0, #976] +mul v11.4S, v11.4S,v15.s[1] +sub v17.4s, v23.4s, v3.4s +add v23.4s, v23.4s, v3.4s +sqrdmulh v3.4S, v24.4S, v1.s[2] +str q8, [x0, #672] +mla v11.4S, v13.4S, v31.s[0] +ldr q13, [x0, #944] +mul v24.4S, v24.4S,v15.s[2] +sub v8.4s, v23.4s, v28.4s +add v23.4s, v23.4s, v28.4s +ldr q28, [x17, #+512] +ldr q18, [x17, #+528] +ldr q16, [x17, #+544] +ldr q25, [x17, #+560] +sqrdmulh v26.4S, v4.4S, v1.s[2] +str q6, [x0, #688] +mla v24.4S, v3.4S, v31.s[0] +ldr q3, [x0, #928] +mul v4.4S, v4.4S,v15.s[2] +sub v6.4s, v0.4s, v11.4s +add v0.4s, v0.4s, v11.4s +sqrdmulh v11.4S, v23.4S, v10.s[0] +str q30, [x0, #736] +mla v4.4S, v26.4S, v31.s[0] +ldr q26, [x0, #896] +mul v23.4S, v23.4S,v27.s[0] +sub v30.4s, v17.4s, v24.4s +add v17.4s, v17.4s, v24.4s +sqrdmulh v24.4S, v8.4S, v10.s[1] +str q20, [x0, #752] +mla v23.4S, v11.4S, v31.s[0] +ldr q11, [x0, #912] +mul v8.4S, v8.4S,v27.s[1] +sub v20.4s, v22.4s, v4.4s +add v22.4s, v22.4s, v4.4s +sqrdmulh v4.4S, v30.4S, v10.s[3] +str q2, [x0, #704] +mla v8.4S, v24.4S, v31.s[0] +mul v30.4S, v30.4S,v27.s[3] +sub v24.4s, v0.4s, v23.4s +add v0.4s, v0.4s, v23.4s +sqrdmulh v23.4S, v17.4S, v10.s[2] +str q9, [x0, #720] +mla v30.4S, v4.4S, v31.s[0] +mul v17.4S, v17.4S,v27.s[2] +sub v4.4s, v6.4s, 
v8.4s +add v6.4s, v6.4s, v8.4s +sqrdmulh v8.4S, v5.4S, v18.s[0] +mla v17.4S, v23.4S, v31.s[0] +mul v5.4S, v5.4S,v28.s[0] +sub v23.4s, v20.4s, v30.4s +add v20.4s, v20.4s, v30.4s +sqrdmulh v30.4S, v14.4S, v18.s[0] +mla v5.4S, v8.4S, v31.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v8.4s, v22.4s, v17.4s +add v22.4s, v22.4s, v17.4s +sqrdmulh v10.4S, v7.4S, v18.s[0] +mla v14.4S, v30.4S, v31.s[0] +mul v7.4S, v7.4S,v28.s[0] +sub v30.4s, v13.4s, v5.4s +add v13.4s, v13.4s, v5.4s +sqrdmulh v5.4S, v19.4S, v18.s[0] +mla v7.4S, v10.4S, v31.s[0] +mul v19.4S, v19.4S,v28.s[0] +sub v10.4s, v3.4s, v14.4s +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v13.4S, v18.s[1] +str q0, [x0, #768] +mla v19.4S, v5.4S, v31.s[0] +mul v13.4S, v13.4S,v28.s[1] +sub v5.4s, v26.4s, v7.4s +add v26.4s, v26.4s, v7.4s +sqrdmulh v7.4S, v3.4S, v18.s[1] +str q24, [x0, #784] +mla v13.4S, v14.4S, v31.s[0] +mul v3.4S, v3.4S,v28.s[1] +sub v14.4s, v11.4s, v19.4s +add v11.4s, v11.4s, v19.4s +sqrdmulh v19.4S, v30.4S, v18.s[2] +str q6, [x0, #800] +mla v3.4S, v7.4S, v31.s[0] +mul v30.4S, v30.4S,v28.s[2] +sub v7.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v10.4S, v18.s[2] +str q4, [x0, #816] +mla v30.4S, v19.4S, v31.s[0] +mul v10.4S, v10.4S,v28.s[2] +sub v19.4s, v26.4s, v3.4s +add v26.4s, v26.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v25.s[0] +str q20, [x0, #864] +mla v10.4S, v13.4S, v31.s[0] +mul v11.4S, v11.4S,v16.s[0] +sub v13.4s, v14.4s, v30.4s +add v14.4s, v14.4s, v30.4s +sqrdmulh v30.4S, v7.4S, v25.s[1] +str q23, [x0, #880] +mla v11.4S, v3.4S, v31.s[0] +mul v7.4S, v7.4S,v16.s[1] +sub v3.4s, v5.4s, v10.4s +add v5.4s, v5.4s, v10.4s +sqrdmulh v10.4S, v13.4S, v25.s[3] +str q22, [x0, #832] +mla v7.4S, v30.4S, v31.s[0] +mul v13.4S, v13.4S,v16.s[3] +sub v30.4s, v26.4s, v11.4s +add v26.4s, v26.4s, v11.4s +sqrdmulh v11.4S, v14.4S, v25.s[2] +str q8, [x0, #848] +mla v13.4S, v10.4S, v31.s[0] +mul v14.4S, v14.4S,v16.s[2] +sub v10.4s, v19.4s, v7.4s +add v19.4s, v19.4s, v7.4s +mla v14.4S, v11.4S, v31.s[0] +sub v11.4s, 
v3.4s, v13.4s +add v3.4s, v3.4s, v13.4s +sub v13.4s, v5.4s, v14.4s +add v5.4s, v5.4s, v14.4s +str q26, [x0, #896] +str q30, [x0, #912] +str q19, [x0, #928] +str q10, [x0, #944] +str q3, [x0, #992] +str q11, [x0, #1008] +str q5, [x0, #960] +str q13, [x0, #976] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1444 +// Instruction count: 1440 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_3_3_5.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_3_3_5.s new file mode 100644 index 0000000..2225ee5 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_3_3_5.s @@ -0,0 +1,1474 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include // NOTE(review): the include operand is missing in this copy; ASM_LOAD, used by the routine below, is not defined in the visible part of this file, so it presumably comes from the header that belongs here (possibly hal_env.h, unverified) - restore and confirm +modulus: // 16-byte constant that the routine below loads into q31; the visible instructions only reference lane 0 (v31.s[0]) as the mla multiplier, and it holds the negated value so mla subtracts multiples of 33556993 +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 // power-of-two form: 2^6 = 64-byte boundary for the table that follows +roots_merged: // table of 4-word (16-byte) rows addressed through x17; the routine below loads rows 0..3 into q25, q24, q23, q22 and uses q25/q23 lanes in mul and q24/q22 lanes in sqrdmulh, i.e. each row is followed by a partner row with the same Layer/block annotations; presumably partner = round(w * 2^31 / 33556993) and later rows follow the same pairing - TODO confirm against the generator +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 23825509 // Layer 4, block 0 +.word 27028662 // Layer 4, block 1 +.word 0 // Layer None, block None +.word 1307297022 // Layer 3, block 0 +.word 1524716204 // Layer 4, block 0 +.word 1729702351 // Layer 4, block 1 +.word 0 // Layer None, block None +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 14626653 // Layer 3, block 1 +.word 14833295 // Layer 4, block 2 +.word 2138810 // Layer 4, block 3 +.word 0 // Layer None, block None +.word 936034350 // Layer 3, block 1 +.word 949258429 // Layer 4, block 2 +.word 136873393 // Layer 4, block 3 +.word 0 // Layer None, block None +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 5705868
// Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 29737761 // Layer 3, block 2 +.word 6490403 // Layer 4, block 4 +.word 19648405 // Layer 4, block 5 +.word 0 // Layer None, block None +.word 1903071454 // Layer 3, block 2 +.word 415354091 // Layer 4, block 4 +.word 1257401950 // Layer 4, block 5 +.word 0 // Layer None, block None +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 30285189 // Layer 3, block 3 +.word 31254932 // Layer 4, block 6 +.word 26362414 // Layer 4, block 7 +.word 0 // Layer None, block None +.word 1938104173 // Layer 3, block 3 +.word 2000162988 // Layer 4, block 6 +.word 1687065733 // Layer 4, block 7 +.word 0 // Layer None, block None +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 21289485 // Layer 3, block 4 +.word 572895 // Layer 4, block 8 +.word 26691971 // Layer 4, block 9 +.word 0 // Layer None, block None +.word 1362423055 // Layer 3, block 4 +.word 36662482 // Layer 4, block 8 +.word 1708155771 // Layer 4, block 9 +.word 0 // Layer None, block None +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 9914896 // Layer
3, block 5 +.word 9249292 // Layer 4, block 10 +.word 29292862 // Layer 4, block 11 +.word 0 // Layer None, block None +.word 634504916 // Layer 3, block 5 +.word 591909511 // Layer 4, block 10 +.word 1874600091 // Layer 4, block 11 +.word 0 // Layer None, block None +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 22603682 // Layer 3, block 6 +.word 8247799 // Layer 4, block 12 +.word 5086187 // Layer 4, block 13 +.word 0 // Layer None, block None +.word 1446525244 // Layer 3, block 6 +.word 527818851 // Layer 4, block 12 +.word 325491125 // Layer 4, block 13 +.word 0 // Layer None, block None +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 16204162 // Layer 3, block 7 +.word 28113639 // Layer 4, block 14 +.word 8471290 // Layer 4, block 15 +.word 0 // Layer None, block None +.word 1036987221 // Layer 3, block 7 +.word 1799135579 // Layer 4, block 14 +.word 542121183 // Layer 4, block 15 +.word 0 // Layer None, block None +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.text +.global ntt_u32_incomplete_neon_asm_var_3_3_5 +.global _ntt_u32_incomplete_neon_asm_var_3_3_5 +ntt_u32_incomplete_neon_asm_var_3_3_5: +_ntt_u32_incomplete_neon_asm_var_3_3_5: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] // NOTE(review): the next instruction repeats this exact store (x19/x20 to [sp, #16*0]); the repeat looks redundant - confirm against the generator
+stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x0, #960] +ldr q29, [x0, #832] +ldr q28, [x0, #576] +ldr q27, [x0, #704] +ldr q26, [x0, #448] +ldr q25, [x17, #+0] +ldr q24, [x17, #+16] +ldr q23, [x17, #+32] +ldr q22, [x17, #+48] +ldr q21, [x0, #320] +ldr q20, [x0, #64] +ldr q19, [x0, #192] +sqrdmulh v18.4S, v30.4S, v24.s[0] +sqrdmulh v17.4S, v29.4S, v24.s[0] +mul v30.4S, v30.4S,v25.s[0] +mla v30.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v28.4S, v24.s[0] +ldr q16, [x0, #976] +mul v29.4S, v29.4S,v25.s[0] +mla v29.4S, v17.4S, v31.s[0] +sub v17.4s, v26.4s, v30.4s +add v26.4s, v26.4s, v30.4s +sqrdmulh v30.4S, v27.4S, v24.s[0] +ldr q3, [x0, #848] +mul v28.4S, v28.4S,v25.s[0] +mla v28.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v29.4s +add v21.4s, v21.4s, v29.4s +sqrdmulh v29.4S, v26.4S, v24.s[1] +ldr q2, [x0, #592] +mul v27.4S, v27.4S,v25.s[0] +mla v27.4S, v30.4S, v31.s[0] +sub v30.4s, v20.4s, v28.4s +add v20.4s, v20.4s, v28.4s +sqrdmulh v28.4S, v21.4S, v24.s[1] +ldr q1, [x0, #720] +mul v26.4S, v26.4S,v25.s[1] +mla v26.4S, v29.4S, v31.s[0] +sub v29.4s, v19.4s, v27.4s +add v19.4s, v19.4s, v27.4s +sqrdmulh v27.4S, v17.4S, v24.s[2] +ldr q0, [x0, #464] +mul v21.4S, v21.4S,v25.s[1] +mla v21.4S, v28.4S, v31.s[0] +sub v28.4s, v19.4s, v26.4s +add v19.4s, v19.4s, v26.4s +sqrdmulh v26.4S, v18.4S, v24.s[2] +ldr q15, [x0, #336] +mul v17.4S, v17.4S,v25.s[2] +mla v17.4S, v27.4S, v31.s[0] +sub v27.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v22.s[0] +ldr q14, [x0, #80] +mul v18.4S, v18.4S,v25.s[2] +mla v18.4S, v26.4S, v31.s[0] +sub v26.4s, v29.4s, v17.4s +add v29.4s, v29.4s, v17.4s +sqrdmulh v17.4S, v28.4S, v22.s[1] +ldr 
q13, [x0, #208] +mul v19.4S, v19.4S,v23.s[0] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v30.4s, v18.4s +add v30.4s, v30.4s, v18.4s +sqrdmulh v18.4S, v26.4S, v22.s[3] +mul v28.4S, v28.4S,v23.s[1] +mla v28.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +sqrdmulh v19.4S, v29.4S, v22.s[2] +mul v26.4S, v26.4S,v23.s[3] +mla v26.4S, v18.4S, v31.s[0] +sub v18.4s, v27.4s, v28.4s +add v27.4s, v27.4s, v28.4s +sqrdmulh v28.4S, v16.4S, v24.s[0] +mul v29.4S, v29.4S,v23.s[2] +mla v29.4S, v19.4S, v31.s[0] +sub v19.4s, v21.4s, v26.4s +add v21.4s, v21.4s, v26.4s +sqrdmulh v26.4S, v3.4S, v24.s[0] +mul v16.4S, v16.4S,v25.s[0] +mla v16.4S, v28.4S, v31.s[0] +sub v28.4s, v30.4s, v29.4s +add v30.4s, v30.4s, v29.4s +sqrdmulh v29.4S, v2.4S, v24.s[0] +ldr q12, [x0, #992] +mul v3.4S, v3.4S,v25.s[0] +mla v3.4S, v26.4S, v31.s[0] +sub v26.4s, v0.4s, v16.4s +add v0.4s, v0.4s, v16.4s +sqrdmulh v16.4S, v1.4S, v24.s[0] +ldr q11, [x0, #864] +mul v2.4S, v2.4S,v25.s[0] +mla v2.4S, v29.4S, v31.s[0] +sub v29.4s, v15.4s, v3.4s +add v15.4s, v15.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v24.s[1] +str q20, [x0, #64] +ldr q20, [x0, #608] +mul v1.4S, v1.4S,v25.s[0] +mla v1.4S, v16.4S, v31.s[0] +sub v16.4s, v14.4s, v2.4s +add v14.4s, v14.4s, v2.4s +sqrdmulh v2.4S, v15.4S, v24.s[1] +str q17, [x0, #192] +ldr q17, [x0, #736] +mul v0.4S, v0.4S,v25.s[1] +mla v0.4S, v3.4S, v31.s[0] +sub v3.4s, v13.4s, v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v26.4S, v24.s[2] +str q27, [x0, #320] +ldr q27, [x0, #480] +mul v15.4S, v15.4S,v25.s[1] +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v13.4s, v0.4s +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v29.4S, v24.s[2] +str q18, [x0, #448] +ldr q18, [x0, #352] +mul v26.4S, v26.4S,v25.s[2] +mla v26.4S, v1.4S, v31.s[0] +sub v1.4s, v14.4s, v15.4s +add v14.4s, v14.4s, v15.4s +sqrdmulh v15.4S, v13.4S, v22.s[0] +str q21, [x0, #832] +ldr q21, [x0, #96] +mul v29.4S, v29.4S,v25.s[2] +mla v29.4S, v0.4S, v31.s[0] +sub v0.4s, v3.4s, v26.4s +add v3.4s, v3.4s, v26.4s +sqrdmulh 
v26.4S, v2.4S, v22.s[1] +str q19, [x0, #960] +ldr q19, [x0, #224] +mul v13.4S, v13.4S,v23.s[0] +mla v13.4S, v15.4S, v31.s[0] +sub v15.4s, v16.4s, v29.4s +add v16.4s, v16.4s, v29.4s +sqrdmulh v29.4S, v0.4S, v22.s[3] +str q30, [x0, #576] +mul v2.4S, v2.4S,v23.s[1] +mla v2.4S, v26.4S, v31.s[0] +sub v26.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +sqrdmulh v13.4S, v3.4S, v22.s[2] +str q28, [x0, #704] +mul v0.4S, v0.4S,v23.s[3] +mla v0.4S, v29.4S, v31.s[0] +sub v29.4s, v1.4s, v2.4s +add v1.4s, v1.4s, v2.4s +sqrdmulh v2.4S, v12.4S, v24.s[0] +mul v3.4S, v3.4S,v23.s[2] +mla v3.4S, v13.4S, v31.s[0] +sub v13.4s, v15.4s, v0.4s +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v11.4S, v24.s[0] +mul v12.4S, v12.4S,v25.s[0] +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v20.4S, v24.s[0] +ldr q28, [x0, #1008] +mul v11.4S, v11.4S,v25.s[0] +mla v11.4S, v0.4S, v31.s[0] +sub v0.4s, v27.4s, v12.4s +add v27.4s, v27.4s, v12.4s +sqrdmulh v12.4S, v17.4S, v24.s[0] +ldr q30, [x0, #880] +mul v20.4S, v20.4S,v25.s[0] +mla v20.4S, v3.4S, v31.s[0] +sub v3.4s, v18.4s, v11.4s +add v18.4s, v18.4s, v11.4s +sqrdmulh v11.4S, v27.4S, v24.s[1] +str q14, [x0, #80] +ldr q14, [x0, #624] +mul v17.4S, v17.4S,v25.s[0] +mla v17.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v18.4S, v24.s[1] +str q26, [x0, #208] +ldr q26, [x0, #752] +mul v27.4S, v27.4S,v25.s[1] +mla v27.4S, v11.4S, v31.s[0] +sub v11.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v24.s[2] +str q1, [x0, #336] +ldr q1, [x0, #496] +mul v18.4S, v18.4S,v25.s[1] +mla v18.4S, v20.4S, v31.s[0] +sub v20.4s, v19.4s, v27.4s +add v19.4s, v19.4s, v27.4s +sqrdmulh v27.4S, v3.4S, v24.s[2] +str q29, [x0, #464] +ldr q29, [x0, #368] +mul v0.4S, v0.4S,v25.s[2] +mla v0.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v18.4s +add v21.4s, v21.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v22.s[0] +str q15, [x0, #848] +ldr q15, [x0, #112] +mul v3.4S, 
v3.4S,v25.s[2] +mla v3.4S, v27.4S, v31.s[0] +sub v27.4s, v11.4s, v0.4s +add v11.4s, v11.4s, v0.4s +sqrdmulh v0.4S, v20.4S, v22.s[1] +str q13, [x0, #976] +ldr q13, [x0, #240] +mul v19.4S, v19.4S,v23.s[0] +mla v19.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v3.4s +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v27.4S, v22.s[3] +str q16, [x0, #592] +mul v20.4S, v20.4S,v23.s[1] +mla v20.4S, v0.4S, v31.s[0] +sub v0.4s, v21.4s, v19.4s +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v11.4S, v22.s[2] +str q2, [x0, #720] +mul v27.4S, v27.4S,v23.s[3] +mla v27.4S, v3.4S, v31.s[0] +sub v3.4s, v17.4s, v20.4s +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v28.4S, v24.s[0] +mul v11.4S, v11.4S,v23.s[2] +mla v11.4S, v19.4S, v31.s[0] +sub v19.4s, v18.4s, v27.4s +add v18.4s, v18.4s, v27.4s +sqrdmulh v27.4S, v30.4S, v24.s[0] +mul v28.4S, v28.4S,v25.s[0] +mla v28.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v11.4s +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v14.4S, v24.s[0] +ldr q2, [x0, #896] +mul v30.4S, v30.4S,v25.s[0] +mla v30.4S, v27.4S, v31.s[0] +sub v27.4s, v1.4s, v28.4s +add v1.4s, v1.4s, v28.4s +sqrdmulh v28.4S, v26.4S, v24.s[0] +ldr q16, [x0, #768] +mul v14.4S, v14.4S,v25.s[0] +mla v14.4S, v11.4S, v31.s[0] +sub v11.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +sqrdmulh v30.4S, v1.4S, v24.s[1] +str q21, [x0, #96] +ldr q21, [x0, #512] +mul v26.4S, v26.4S,v25.s[0] +mla v26.4S, v28.4S, v31.s[0] +sub v28.4s, v15.4s, v14.4s +add v15.4s, v15.4s, v14.4s +sqrdmulh v14.4S, v29.4S, v24.s[1] +str q0, [x0, #224] +ldr q0, [x0, #640] +mul v1.4S, v1.4S,v25.s[1] +mla v1.4S, v30.4S, v31.s[0] +sub v30.4s, v13.4s, v26.4s +add v13.4s, v13.4s, v26.4s +sqrdmulh v26.4S, v27.4S, v24.s[2] +str q17, [x0, #352] +ldr q17, [x0, #384] +mul v29.4S, v29.4S,v25.s[1] +mla v29.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v11.4S, v24.s[2] +str q3, [x0, #480] +ldr q3, [x0, #256] +mul v27.4S, v27.4S,v25.s[2] +mla v27.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v29.4s +add 
v15.4s, v15.4s, v29.4s +sqrdmulh v29.4S, v13.4S, v22.s[0] +str q18, [x0, #864] +ldr q18, [x0, #0] +mul v11.4S, v11.4S,v25.s[2] +mla v11.4S, v1.4S, v31.s[0] +sub v1.4s, v30.4s, v27.4s +add v30.4s, v30.4s, v27.4s +sqrdmulh v27.4S, v14.4S, v22.s[1] +str q19, [x0, #992] +ldr q19, [x0, #128] +mul v13.4S, v13.4S,v23.s[0] +mla v13.4S, v29.4S, v31.s[0] +sub v29.4s, v28.4s, v11.4s +add v28.4s, v28.4s, v11.4s +sqrdmulh v11.4S, v1.4S, v22.s[3] +str q12, [x0, #608] +mul v14.4S, v14.4S,v23.s[1] +mla v14.4S, v27.4S, v31.s[0] +sub v27.4s, v15.4s, v13.4s +add v15.4s, v15.4s, v13.4s +sqrdmulh v13.4S, v30.4S, v22.s[2] +str q20, [x0, #736] +mul v1.4S, v1.4S,v23.s[3] +mla v1.4S, v11.4S, v31.s[0] +sub v11.4s, v26.4s, v14.4s +add v26.4s, v26.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v24.s[0] +mul v30.4S, v30.4S,v23.s[2] +mla v30.4S, v13.4S, v31.s[0] +sub v13.4s, v29.4s, v1.4s +add v29.4s, v29.4s, v1.4s +sqrdmulh v1.4S, v16.4S, v24.s[0] +mul v2.4S, v2.4S,v25.s[0] +mla v2.4S, v14.4S, v31.s[0] +sub v14.4s, v28.4s, v30.4s +add v28.4s, v28.4s, v30.4s +sqrdmulh v30.4S, v21.4S, v24.s[0] +ldr q20, [x0, #912] +mul v16.4S, v16.4S,v25.s[0] +mla v16.4S, v1.4S, v31.s[0] +sub v1.4s, v17.4s, v2.4s +add v17.4s, v17.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v24.s[0] +ldr q12, [x0, #784] +mul v21.4S, v21.4S,v25.s[0] +mla v21.4S, v30.4S, v31.s[0] +sub v30.4s, v3.4s, v16.4s +add v3.4s, v3.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v24.s[1] +str q15, [x0, #112] +ldr q15, [x0, #528] +mul v0.4S, v0.4S,v25.s[0] +mla v0.4S, v2.4S, v31.s[0] +sub v2.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v3.4S, v24.s[1] +str q27, [x0, #240] +ldr q27, [x0, #656] +mul v17.4S, v17.4S,v25.s[1] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v19.4s, v0.4s +add v19.4s, v19.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v24.s[2] +str q26, [x0, #368] +ldr q26, [x0, #400] +mul v3.4S, v3.4S,v25.s[1] +mla v3.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +sqrdmulh v17.4S, v30.4S, v24.s[2] +str q11, [x0, #496] +ldr q11, 
[x0, #272] +mul v1.4S, v1.4S,v25.s[2] +mla v1.4S, v0.4S, v31.s[0] +sub v0.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v19.4S, v22.s[0] +str q29, [x0, #880] +ldr q29, [x0, #16] +mul v30.4S, v30.4S,v25.s[2] +mla v30.4S, v17.4S, v31.s[0] +sub v17.4s, v16.4s, v1.4s +add v16.4s, v16.4s, v1.4s +sqrdmulh v1.4S, v21.4S, v22.s[1] +str q13, [x0, #1008] +ldr q13, [x0, #144] +mul v19.4S, v19.4S,v23.s[0] +mla v19.4S, v3.4S, v31.s[0] +sub v3.4s, v2.4s, v30.4s +add v2.4s, v2.4s, v30.4s +sqrdmulh v30.4S, v17.4S, v22.s[3] +str q28, [x0, #624] +mul v21.4S, v21.4S,v23.s[1] +mla v21.4S, v1.4S, v31.s[0] +sub v1.4s, v18.4s, v19.4s +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v22.s[2] +str q14, [x0, #752] +mul v17.4S, v17.4S,v23.s[3] +mla v17.4S, v30.4S, v31.s[0] +sub v30.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +sqrdmulh v21.4S, v20.4S, v24.s[0] +mul v16.4S, v16.4S,v23.s[2] +mla v16.4S, v19.4S, v31.s[0] +sub v19.4s, v3.4s, v17.4s +add v3.4s, v3.4s, v17.4s +sqrdmulh v17.4S, v12.4S, v24.s[0] +mul v20.4S, v20.4S,v25.s[0] +mla v20.4S, v21.4S, v31.s[0] +sub v21.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v15.4S, v24.s[0] +ldr q14, [x0, #928] +mul v12.4S, v12.4S,v25.s[0] +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v26.4s, v20.4s +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v27.4S, v24.s[0] +ldr q28, [x0, #800] +mul v15.4S, v15.4S,v25.s[0] +mla v15.4S, v16.4S, v31.s[0] +sub v16.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +sqrdmulh v12.4S, v26.4S, v24.s[1] +str q18, [x0, #0] +ldr q18, [x0, #544] +mul v27.4S, v27.4S,v25.s[0] +mla v27.4S, v20.4S, v31.s[0] +sub v20.4s, v29.4s, v15.4s +add v29.4s, v29.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v24.s[1] +str q1, [x0, #128] +ldr q1, [x0, #672] +mul v26.4S, v26.4S,v25.s[1] +mla v26.4S, v12.4S, v31.s[0] +sub v12.4s, v13.4s, v27.4s +add v13.4s, v13.4s, v27.4s +sqrdmulh v27.4S, v17.4S, v24.s[2] +str q0, [x0, #256] +ldr q0, [x0, #416] +mul v11.4S, v11.4S,v25.s[1] +mla v11.4S, v15.4S, v31.s[0] +sub v15.4s, 
v13.4s, v26.4s +add v13.4s, v13.4s, v26.4s +sqrdmulh v26.4S, v16.4S, v24.s[2] +str q30, [x0, #384] +ldr q30, [x0, #288] +mul v17.4S, v17.4S,v25.s[2] +mla v17.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v11.4s +add v29.4s, v29.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v22.s[0] +str q3, [x0, #768] +ldr q3, [x0, #32] +mul v16.4S, v16.4S,v25.s[2] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v12.4s, v17.4s +add v12.4s, v12.4s, v17.4s +sqrdmulh v17.4S, v15.4S, v22.s[1] +str q19, [x0, #896] +ldr q19, [x0, #160] +mul v13.4S, v13.4S,v23.s[0] +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v20.4s, v16.4s +add v20.4s, v20.4s, v16.4s +sqrdmulh v16.4S, v26.4S, v22.s[3] +str q2, [x0, #512] +mul v15.4S, v15.4S,v23.s[1] +mla v15.4S, v17.4S, v31.s[0] +sub v17.4s, v29.4s, v13.4s +add v29.4s, v29.4s, v13.4s +sqrdmulh v13.4S, v12.4S, v22.s[2] +str q21, [x0, #640] +mul v26.4S, v26.4S,v23.s[3] +mla v26.4S, v16.4S, v31.s[0] +sub v16.4s, v27.4s, v15.4s +add v27.4s, v27.4s, v15.4s +sqrdmulh v15.4S, v14.4S, v24.s[0] +mul v12.4S, v12.4S,v23.s[2] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v26.4s +add v11.4s, v11.4s, v26.4s +sqrdmulh v26.4S, v28.4S, v24.s[0] +mul v14.4S, v14.4S,v25.s[0] +mla v14.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +sqrdmulh v12.4S, v18.4S, v24.s[0] +ldr q21, [x0, #944] +mul v28.4S, v28.4S,v25.s[0] +mla v28.4S, v26.4S, v31.s[0] +sub v26.4s, v0.4s, v14.4s +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v1.4S, v24.s[0] +ldr q2, [x0, #816] +mul v18.4S, v18.4S,v25.s[0] +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v30.4s, v28.4s +add v30.4s, v30.4s, v28.4s +sqrdmulh v28.4S, v0.4S, v24.s[1] +str q29, [x0, #16] +ldr q29, [x0, #560] +mul v1.4S, v1.4S,v25.s[0] +mla v1.4S, v14.4S, v31.s[0] +sub v14.4s, v3.4s, v18.4s +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v30.4S, v24.s[1] +str q17, [x0, #144] +ldr q17, [x0, #688] +mul v0.4S, v0.4S,v25.s[1] +mla v0.4S, v28.4S, v31.s[0] +sub v28.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v26.4S, v24.s[2] 
+str q27, [x0, #272] +ldr q27, [x0, #432] +mul v30.4S, v30.4S,v25.s[1] +mla v30.4S, v18.4S, v31.s[0] +sub v18.4s, v19.4s, v0.4s +add v19.4s, v19.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v24.s[2] +str q16, [x0, #400] +ldr q16, [x0, #304] +mul v26.4S, v26.4S,v25.s[2] +mla v26.4S, v1.4S, v31.s[0] +sub v1.4s, v3.4s, v30.4s +add v3.4s, v3.4s, v30.4s +sqrdmulh v30.4S, v19.4S, v22.s[0] +str q11, [x0, #784] +ldr q11, [x0, #48] +mul v12.4S, v12.4S,v25.s[2] +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sqrdmulh v26.4S, v18.4S, v22.s[1] +str q13, [x0, #912] +ldr q13, [x0, #176] +mul v19.4S, v19.4S,v23.s[0] +mla v19.4S, v30.4S, v31.s[0] +sub v30.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v0.4S, v22.s[3] +str q20, [x0, #528] +mul v18.4S, v18.4S,v23.s[1] +mla v18.4S, v26.4S, v31.s[0] +sub v26.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v28.4S, v22.s[2] +str q15, [x0, #656] +mul v0.4S, v0.4S,v23.s[3] +mla v0.4S, v12.4S, v31.s[0] +sub v12.4s, v1.4s, v18.4s +add v1.4s, v1.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v24.s[0] +mul v28.4S, v28.4S,v23.s[2] +mla v28.4S, v19.4S, v31.s[0] +sub v19.4s, v30.4s, v0.4s +add v30.4s, v30.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v24.s[0] +mul v21.4S, v21.4S,v25.s[0] +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v14.4s, v28.4s +add v14.4s, v14.4s, v28.4s +sqrdmulh v28.4S, v29.4S, v24.s[0] +mul v2.4S, v2.4S,v25.s[0] +mla v2.4S, v0.4S, v31.s[0] +sub v0.4s, v27.4s, v21.4s +add v27.4s, v27.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v24.s[0] +mul v29.4S, v29.4S,v25.s[0] +mla v29.4S, v28.4S, v31.s[0] +sub v28.4s, v16.4s, v2.4s +add v16.4s, v16.4s, v2.4s +sqrdmulh v2.4S, v27.4S, v24.s[1] +str q3, [x0, #32] +mul v17.4S, v17.4S,v25.s[0] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v29.4s +add v11.4s, v11.4s, v29.4s +sqrdmulh v29.4S, v16.4S, v24.s[1] +str q26, [x0, #160] +mul v27.4S, v27.4S,v25.s[1] +mla v27.4S, v2.4S, v31.s[0] +sub v2.4s, v13.4s, v17.4s +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, 
v0.4S, v24.s[2] +str q1, [x0, #288] +mul v16.4S, v16.4S,v25.s[1] +mla v16.4S, v29.4S, v31.s[0] +sub v29.4s, v13.4s, v27.4s +add v13.4s, v13.4s, v27.4s +sqrdmulh v27.4S, v28.4S, v24.s[2] +str q12, [x0, #416] +mul v0.4S, v0.4S,v25.s[2] +mla v0.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v16.4s +add v11.4s, v11.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v22.s[0] +str q30, [x0, #800] +mul v28.4S, v28.4S,v25.s[2] +mla v28.4S, v27.4S, v31.s[0] +sub v27.4s, v2.4s, v0.4s +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v29.4S, v22.s[1] +str q19, [x0, #928] +mul v13.4S, v13.4S,v23.s[0] +mla v13.4S, v16.4S, v31.s[0] +sub v16.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +sqrdmulh v28.4S, v27.4S, v22.s[3] +str q14, [x0, #544] +mul v29.4S, v29.4S,v23.s[1] +mla v29.4S, v0.4S, v31.s[0] +sub v0.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v2.4S, v22.s[2] +str q18, [x0, #672] +mul v27.4S, v27.4S,v23.s[3] +mla v27.4S, v28.4S, v31.s[0] +sub v28.4s, v17.4s, v29.4s +add v17.4s, v17.4s, v29.4s +mul v2.4S, v2.4S,v23.s[2] +mla v2.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v27.4s +add v16.4s, v16.4s, v27.4s +sub v27.4s, v21.4s, v2.4s +add v21.4s, v21.4s, v2.4s +str q11, [x0, #48] +str q0, [x0, #176] +str q17, [x0, #304] +str q28, [x0, #432] +str q16, [x0, #816] +str q13, [x0, #944] +str q21, [x0, #560] +str q27, [x0, #688] +ldr q4, [x0, #112] +ldr q5, [x0, #96] +ldr q6, [x0, #64] +ldr q7, [x0, #80] +ldr q8, [x0, #48] +ldr q9, [x17, #+64] +ldr q10, [x17, #+80] +ldr q20, [x17, #+96] +ldr q15, [x17, #+112] +ldr q3, [x0, #32] +ldr q26, [x0, #0] +ldr q1, [x0, #16] +sqrdmulh v12.4S, v4.4S, v10.s[0] +sqrdmulh v30.4S, v5.4S, v10.s[0] +mul v4.4S, v4.4S,v9.s[0] +mla v4.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v6.4S, v10.s[0] +ldr q19, [x0, #240] +mul v5.4S, v5.4S,v9.s[0] +mla v5.4S, v30.4S, v31.s[0] +sub v30.4s, v8.4s, v4.4s +add v8.4s, v8.4s, v4.4s +sqrdmulh v4.4S, v7.4S, v10.s[0] +ldr q14, [x0, #224] +mul v6.4S, v6.4S,v9.s[0] +mla v6.4S, v12.4S, v31.s[0] +sub v12.4s, v3.4s, v5.4s +add v3.4s, 
v3.4s, v5.4s +sqrdmulh v5.4S, v8.4S, v10.s[1] +ldr q18, [x0, #192] +mul v7.4S, v7.4S,v9.s[0] +mla v7.4S, v4.4S, v31.s[0] +sub v4.4s, v26.4s, v6.4s +add v26.4s, v26.4s, v6.4s +sqrdmulh v6.4S, v3.4S, v10.s[1] +ldr q29, [x0, #208] +mul v8.4S, v8.4S,v9.s[1] +mla v8.4S, v5.4S, v31.s[0] +sub v5.4s, v1.4s, v7.4s +add v1.4s, v1.4s, v7.4s +sqrdmulh v7.4S, v30.4S, v10.s[2] +ldr q2, [x0, #176] +mul v3.4S, v3.4S,v9.s[1] +mla v3.4S, v6.4S, v31.s[0] +sub v6.4s, v1.4s, v8.4s +add v1.4s, v1.4s, v8.4s +ldr q8, [x17, #+128] +ldr q25, [x17, #+144] +ldr q24, [x17, #+160] +ldr q23, [x17, #+176] +sqrdmulh v22.4S, v12.4S, v10.s[2] +ldr q11, [x0, #160] +mul v30.4S, v30.4S,v9.s[2] +mla v30.4S, v7.4S, v31.s[0] +sub v7.4s, v26.4s, v3.4s +add v26.4s, v26.4s, v3.4s +sqrdmulh v3.4S, v1.4S, v15.s[0] +ldr q0, [x0, #128] +mul v12.4S, v12.4S,v9.s[2] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v5.4s, v30.4s +add v5.4s, v5.4s, v30.4s +sqrdmulh v30.4S, v6.4S, v15.s[1] +ldr q17, [x0, #144] +mul v1.4S, v1.4S,v20.s[0] +mla v1.4S, v3.4S, v31.s[0] +sub v3.4s, v4.4s, v12.4s +add v4.4s, v4.4s, v12.4s +sqrdmulh v12.4S, v22.4S, v15.s[3] +mul v6.4S, v6.4S,v20.s[1] +mla v6.4S, v30.4S, v31.s[0] +sub v30.4s, v26.4s, v1.4s +add v26.4s, v26.4s, v1.4s +sqrdmulh v1.4S, v5.4S, v15.s[2] +mul v22.4S, v22.4S,v20.s[3] +mla v22.4S, v12.4S, v31.s[0] +sub v12.4s, v7.4s, v6.4s +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v19.4S, v25.s[0] +mul v5.4S, v5.4S,v20.s[2] +mla v5.4S, v1.4S, v31.s[0] +sub v1.4s, v3.4s, v22.4s +add v3.4s, v3.4s, v22.4s +sqrdmulh v22.4S, v14.4S, v25.s[0] +mul v19.4S, v19.4S,v8.s[0] +mla v19.4S, v6.4S, v31.s[0] +sub v6.4s, v4.4s, v5.4s +add v4.4s, v4.4s, v5.4s +sqrdmulh v15.4S, v18.4S, v25.s[0] +ldr q20, [x0, #368] +mul v14.4S, v14.4S,v8.s[0] +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v2.4s, v19.4s +add v2.4s, v2.4s, v19.4s +sqrdmulh v19.4S, v29.4S, v25.s[0] +ldr q10, [x0, #352] +mul v18.4S, v18.4S,v8.s[0] +mla v18.4S, v15.4S, v31.s[0] +sub v15.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, 
v2.4S, v25.s[1] +str q26, [x0, #0] +ldr q26, [x0, #320] +mul v29.4S, v29.4S,v8.s[0] +mla v29.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v18.4s +add v0.4s, v0.4s, v18.4s +sqrdmulh v18.4S, v11.4S, v25.s[1] +str q30, [x0, #16] +ldr q30, [x0, #336] +mul v2.4S, v2.4S,v8.s[1] +mla v2.4S, v14.4S, v31.s[0] +sub v14.4s, v17.4s, v29.4s +add v17.4s, v17.4s, v29.4s +sqrdmulh v29.4S, v22.4S, v25.s[2] +str q7, [x0, #32] +ldr q7, [x0, #304] +mul v11.4S, v11.4S,v8.s[1] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v17.4s, v2.4s +add v17.4s, v17.4s, v2.4s +ldr q2, [x17, #+192] +ldr q9, [x17, #+208] +ldr q5, [x17, #+224] +ldr q28, [x17, #+240] +sqrdmulh v16.4S, v15.4S, v25.s[2] +str q12, [x0, #48] +ldr q12, [x0, #288] +mul v22.4S, v22.4S,v8.s[2] +mla v22.4S, v29.4S, v31.s[0] +sub v29.4s, v0.4s, v11.4s +add v0.4s, v0.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v23.s[0] +str q3, [x0, #96] +ldr q3, [x0, #256] +mul v15.4S, v15.4S,v8.s[2] +mla v15.4S, v16.4S, v31.s[0] +sub v16.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +sqrdmulh v22.4S, v18.4S, v23.s[1] +str q1, [x0, #112] +ldr q1, [x0, #272] +mul v17.4S, v17.4S,v24.s[0] +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v19.4s, v15.4s +add v19.4s, v19.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v23.s[3] +str q4, [x0, #64] +mul v18.4S, v18.4S,v24.s[1] +mla v18.4S, v22.4S, v31.s[0] +sub v22.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v23.s[2] +str q6, [x0, #80] +mul v16.4S, v16.4S,v24.s[3] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v29.4s, v18.4s +add v29.4s, v29.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v9.s[0] +mul v14.4S, v14.4S,v24.s[2] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v16.4s +add v11.4s, v11.4s, v16.4s +sqrdmulh v16.4S, v10.4S, v9.s[0] +mul v20.4S, v20.4S,v2.s[0] +mla v20.4S, v18.4S, v31.s[0] +sub v18.4s, v19.4s, v14.4s +add v19.4s, v19.4s, v14.4s +sqrdmulh v23.4S, v26.4S, v9.s[0] +ldr q24, [x0, #496] +mul v10.4S, v10.4S,v2.s[0] +mla v10.4S, v16.4S, v31.s[0] +sub v16.4s, v7.4s, v20.4s +add v7.4s, v7.4s, v20.4s 
+sqrdmulh v20.4S, v30.4S, v9.s[0] +ldr q25, [x0, #480] +mul v26.4S, v26.4S,v2.s[0] +mla v26.4S, v23.4S, v31.s[0] +sub v23.4s, v12.4s, v10.4s +add v12.4s, v12.4s, v10.4s +sqrdmulh v10.4S, v7.4S, v9.s[1] +str q0, [x0, #128] +ldr q0, [x0, #448] +mul v30.4S, v30.4S,v2.s[0] +mla v30.4S, v20.4S, v31.s[0] +sub v20.4s, v3.4s, v26.4s +add v3.4s, v3.4s, v26.4s +sqrdmulh v26.4S, v12.4S, v9.s[1] +str q22, [x0, #144] +ldr q22, [x0, #464] +mul v7.4S, v7.4S,v2.s[1] +mla v7.4S, v10.4S, v31.s[0] +sub v10.4s, v1.4s, v30.4s +add v1.4s, v1.4s, v30.4s +sqrdmulh v30.4S, v16.4S, v9.s[2] +str q29, [x0, #160] +ldr q29, [x0, #432] +mul v12.4S, v12.4S,v2.s[1] +mla v12.4S, v26.4S, v31.s[0] +sub v26.4s, v1.4s, v7.4s +add v1.4s, v1.4s, v7.4s +ldr q7, [x17, #+256] +ldr q8, [x17, #+272] +ldr q14, [x17, #+288] +ldr q6, [x17, #+304] +sqrdmulh v4.4S, v23.4S, v9.s[2] +str q15, [x0, #176] +ldr q15, [x0, #416] +mul v16.4S, v16.4S,v2.s[2] +mla v16.4S, v30.4S, v31.s[0] +sub v30.4s, v3.4s, v12.4s +add v3.4s, v3.4s, v12.4s +sqrdmulh v12.4S, v1.4S, v28.s[0] +str q11, [x0, #224] +ldr q11, [x0, #384] +mul v23.4S, v23.4S,v2.s[2] +mla v23.4S, v4.4S, v31.s[0] +sub v4.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sqrdmulh v16.4S, v26.4S, v28.s[1] +str q17, [x0, #240] +ldr q17, [x0, #400] +mul v1.4S, v1.4S,v5.s[0] +mla v1.4S, v12.4S, v31.s[0] +sub v12.4s, v20.4s, v23.4s +add v20.4s, v20.4s, v23.4s +sqrdmulh v23.4S, v4.4S, v28.s[3] +str q19, [x0, #192] +mul v26.4S, v26.4S,v5.s[1] +mla v26.4S, v16.4S, v31.s[0] +sub v16.4s, v3.4s, v1.4s +add v3.4s, v3.4s, v1.4s +sqrdmulh v1.4S, v10.4S, v28.s[2] +str q18, [x0, #208] +mul v4.4S, v4.4S,v5.s[3] +mla v4.4S, v23.4S, v31.s[0] +sub v23.4s, v30.4s, v26.4s +add v30.4s, v30.4s, v26.4s +sqrdmulh v26.4S, v24.4S, v8.s[0] +mul v10.4S, v10.4S,v5.s[2] +mla v10.4S, v1.4S, v31.s[0] +sub v1.4s, v12.4s, v4.4s +add v12.4s, v12.4s, v4.4s +sqrdmulh v4.4S, v25.4S, v8.s[0] +mul v24.4S, v24.4S,v7.s[0] +mla v24.4S, v26.4S, v31.s[0] +sub v26.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s 
+sqrdmulh v28.4S, v0.4S, v8.s[0] +ldr q5, [x0, #624] +mul v25.4S, v25.4S,v7.s[0] +mla v25.4S, v4.4S, v31.s[0] +sub v4.4s, v29.4s, v24.4s +add v29.4s, v29.4s, v24.4s +sqrdmulh v24.4S, v22.4S, v8.s[0] +ldr q9, [x0, #608] +mul v0.4S, v0.4S,v7.s[0] +mla v0.4S, v28.4S, v31.s[0] +sub v28.4s, v15.4s, v25.4s +add v15.4s, v15.4s, v25.4s +sqrdmulh v25.4S, v29.4S, v8.s[1] +str q3, [x0, #256] +ldr q3, [x0, #576] +mul v22.4S, v22.4S,v7.s[0] +mla v22.4S, v24.4S, v31.s[0] +sub v24.4s, v11.4s, v0.4s +add v11.4s, v11.4s, v0.4s +sqrdmulh v0.4S, v15.4S, v8.s[1] +str q16, [x0, #272] +ldr q16, [x0, #592] +mul v29.4S, v29.4S,v7.s[1] +mla v29.4S, v25.4S, v31.s[0] +sub v25.4s, v17.4s, v22.4s +add v17.4s, v17.4s, v22.4s +sqrdmulh v22.4S, v4.4S, v8.s[2] +str q30, [x0, #288] +ldr q30, [x0, #560] +mul v15.4S, v15.4S,v7.s[1] +mla v15.4S, v0.4S, v31.s[0] +sub v0.4s, v17.4s, v29.4s +add v17.4s, v17.4s, v29.4s +ldr q29, [x17, #+320] +ldr q2, [x17, #+336] +ldr q10, [x17, #+352] +ldr q18, [x17, #+368] +sqrdmulh v19.4S, v28.4S, v8.s[2] +str q23, [x0, #304] +ldr q23, [x0, #544] +mul v4.4S, v4.4S,v7.s[2] +mla v4.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v17.4S, v6.s[0] +str q12, [x0, #352] +ldr q12, [x0, #512] +mul v28.4S, v28.4S,v7.s[2] +mla v28.4S, v19.4S, v31.s[0] +sub v19.4s, v25.4s, v4.4s +add v25.4s, v25.4s, v4.4s +sqrdmulh v4.4S, v0.4S, v6.s[1] +str q1, [x0, #368] +ldr q1, [x0, #528] +mul v17.4S, v17.4S,v14.s[0] +mla v17.4S, v15.4S, v31.s[0] +sub v15.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +sqrdmulh v28.4S, v19.4S, v6.s[3] +str q20, [x0, #320] +mul v0.4S, v0.4S,v14.s[1] +mla v0.4S, v4.4S, v31.s[0] +sub v4.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v25.4S, v6.s[2] +str q26, [x0, #336] +mul v19.4S, v19.4S,v14.s[3] +mla v19.4S, v28.4S, v31.s[0] +sub v28.4s, v22.4s, v0.4s +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v5.4S, v2.s[0] +mul v25.4S, v25.4S,v14.s[2] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v15.4s, v19.4s 
+add v15.4s, v15.4s, v19.4s +sqrdmulh v19.4S, v9.4S, v2.s[0] +mul v5.4S, v5.4S,v29.s[0] +mla v5.4S, v0.4S, v31.s[0] +sub v0.4s, v24.4s, v25.4s +add v24.4s, v24.4s, v25.4s +sqrdmulh v6.4S, v3.4S, v2.s[0] +ldr q14, [x0, #752] +mul v9.4S, v9.4S,v29.s[0] +mla v9.4S, v19.4S, v31.s[0] +sub v19.4s, v30.4s, v5.4s +add v30.4s, v30.4s, v5.4s +sqrdmulh v5.4S, v16.4S, v2.s[0] +ldr q8, [x0, #736] +mul v3.4S, v3.4S,v29.s[0] +mla v3.4S, v6.4S, v31.s[0] +sub v6.4s, v23.4s, v9.4s +add v23.4s, v23.4s, v9.4s +sqrdmulh v9.4S, v30.4S, v2.s[1] +str q11, [x0, #384] +ldr q11, [x0, #704] +mul v16.4S, v16.4S,v29.s[0] +mla v16.4S, v5.4S, v31.s[0] +sub v5.4s, v12.4s, v3.4s +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v23.4S, v2.s[1] +str q4, [x0, #400] +ldr q4, [x0, #720] +mul v30.4S, v30.4S,v29.s[1] +mla v30.4S, v9.4S, v31.s[0] +sub v9.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +sqrdmulh v16.4S, v19.4S, v2.s[2] +str q22, [x0, #416] +ldr q22, [x0, #688] +mul v23.4S, v23.4S,v29.s[1] +mla v23.4S, v3.4S, v31.s[0] +sub v3.4s, v1.4s, v30.4s +add v1.4s, v1.4s, v30.4s +ldr q30, [x17, #+384] +ldr q7, [x17, #+400] +ldr q25, [x17, #+416] +ldr q26, [x17, #+432] +sqrdmulh v20.4S, v6.4S, v2.s[2] +str q28, [x0, #432] +ldr q28, [x0, #672] +mul v19.4S, v19.4S,v29.s[2] +mla v19.4S, v16.4S, v31.s[0] +sub v16.4s, v12.4s, v23.4s +add v12.4s, v12.4s, v23.4s +sqrdmulh v23.4S, v1.4S, v18.s[0] +str q15, [x0, #480] +ldr q15, [x0, #640] +mul v6.4S, v6.4S,v29.s[2] +mla v6.4S, v20.4S, v31.s[0] +sub v20.4s, v9.4s, v19.4s +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v3.4S, v18.s[1] +str q17, [x0, #496] +ldr q17, [x0, #656] +mul v1.4S, v1.4S,v10.s[0] +mla v1.4S, v23.4S, v31.s[0] +sub v23.4s, v5.4s, v6.4s +add v5.4s, v5.4s, v6.4s +sqrdmulh v6.4S, v20.4S, v18.s[3] +str q24, [x0, #448] +mul v3.4S, v3.4S,v10.s[1] +mla v3.4S, v19.4S, v31.s[0] +sub v19.4s, v12.4s, v1.4s +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v9.4S, v18.s[2] +str q0, [x0, #464] +mul v20.4S, v20.4S,v10.s[3] +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v16.4s, 
v3.4s +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v7.s[0] +mul v9.4S, v9.4S,v10.s[2] +mla v9.4S, v1.4S, v31.s[0] +sub v1.4s, v23.4s, v20.4s +add v23.4s, v23.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v7.s[0] +mul v14.4S, v14.4S,v30.s[0] +mla v14.4S, v3.4S, v31.s[0] +sub v3.4s, v5.4s, v9.4s +add v5.4s, v5.4s, v9.4s +sqrdmulh v18.4S, v11.4S, v7.s[0] +ldr q10, [x0, #880] +mul v8.4S, v8.4S,v30.s[0] +mla v8.4S, v20.4S, v31.s[0] +sub v20.4s, v22.4s, v14.4s +add v22.4s, v22.4s, v14.4s +sqrdmulh v14.4S, v4.4S, v7.s[0] +ldr q2, [x0, #864] +mul v11.4S, v11.4S,v30.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v28.4s, v8.4s +add v28.4s, v28.4s, v8.4s +sqrdmulh v8.4S, v22.4S, v7.s[1] +str q12, [x0, #512] +ldr q12, [x0, #832] +mul v4.4S, v4.4S,v30.s[0] +mla v4.4S, v14.4S, v31.s[0] +sub v14.4s, v15.4s, v11.4s +add v15.4s, v15.4s, v11.4s +sqrdmulh v11.4S, v28.4S, v7.s[1] +str q19, [x0, #528] +ldr q19, [x0, #848] +mul v22.4S, v22.4S,v30.s[1] +mla v22.4S, v8.4S, v31.s[0] +sub v8.4s, v17.4s, v4.4s +add v17.4s, v17.4s, v4.4s +sqrdmulh v4.4S, v20.4S, v7.s[2] +str q16, [x0, #544] +ldr q16, [x0, #816] +mul v28.4S, v28.4S,v30.s[1] +mla v28.4S, v11.4S, v31.s[0] +sub v11.4s, v17.4s, v22.4s +add v17.4s, v17.4s, v22.4s +ldr q22, [x17, #+448] +ldr q29, [x17, #+464] +ldr q9, [x17, #+480] +ldr q0, [x17, #+496] +sqrdmulh v24.4S, v18.4S, v7.s[2] +str q6, [x0, #560] +ldr q6, [x0, #800] +mul v20.4S, v20.4S,v30.s[2] +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v15.4s, v28.4s +add v15.4s, v15.4s, v28.4s +sqrdmulh v28.4S, v17.4S, v26.s[0] +str q23, [x0, #608] +ldr q23, [x0, #768] +mul v18.4S, v18.4S,v30.s[2] +mla v18.4S, v24.4S, v31.s[0] +sub v24.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +sqrdmulh v20.4S, v11.4S, v26.s[1] +str q1, [x0, #624] +ldr q1, [x0, #784] +mul v17.4S, v17.4S,v25.s[0] +mla v17.4S, v28.4S, v31.s[0] +sub v28.4s, v14.4s, v18.4s +add v14.4s, v14.4s, v18.4s +sqrdmulh v18.4S, v24.4S, v26.s[3] +str q5, [x0, #576] +mul v11.4S, v11.4S,v25.s[1] +mla v11.4S, v20.4S, v31.s[0] +sub v20.4s, 
v15.4s, v17.4s +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v8.4S, v26.s[2] +str q3, [x0, #592] +mul v24.4S, v24.4S,v25.s[3] +mla v24.4S, v18.4S, v31.s[0] +sub v18.4s, v4.4s, v11.4s +add v4.4s, v4.4s, v11.4s +sqrdmulh v11.4S, v10.4S, v29.s[0] +mul v8.4S, v8.4S,v25.s[2] +mla v8.4S, v17.4S, v31.s[0] +sub v17.4s, v28.4s, v24.4s +add v28.4s, v28.4s, v24.4s +sqrdmulh v24.4S, v2.4S, v29.s[0] +mul v10.4S, v10.4S,v22.s[0] +mla v10.4S, v11.4S, v31.s[0] +sub v11.4s, v14.4s, v8.4s +add v14.4s, v14.4s, v8.4s +sqrdmulh v26.4S, v12.4S, v29.s[0] +ldr q25, [x0, #1008] +mul v2.4S, v2.4S,v22.s[0] +mla v2.4S, v24.4S, v31.s[0] +sub v24.4s, v16.4s, v10.4s +add v16.4s, v16.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v29.s[0] +ldr q7, [x0, #992] +mul v12.4S, v12.4S,v22.s[0] +mla v12.4S, v26.4S, v31.s[0] +sub v26.4s, v6.4s, v2.4s +add v6.4s, v6.4s, v2.4s +sqrdmulh v2.4S, v16.4S, v29.s[1] +str q15, [x0, #640] +ldr q15, [x0, #960] +mul v19.4S, v19.4S,v22.s[0] +mla v19.4S, v10.4S, v31.s[0] +sub v10.4s, v23.4s, v12.4s +add v23.4s, v23.4s, v12.4s +sqrdmulh v12.4S, v6.4S, v29.s[1] +str q20, [x0, #656] +ldr q20, [x0, #976] +mul v16.4S, v16.4S,v22.s[1] +mla v16.4S, v2.4S, v31.s[0] +sub v2.4s, v1.4s, v19.4s +add v1.4s, v1.4s, v19.4s +sqrdmulh v19.4S, v24.4S, v29.s[2] +str q4, [x0, #672] +ldr q4, [x0, #944] +mul v6.4S, v6.4S,v22.s[1] +mla v6.4S, v12.4S, v31.s[0] +sub v12.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +ldr q16, [x17, #+512] +ldr q30, [x17, #+528] +ldr q8, [x17, #+544] +ldr q3, [x17, #+560] +sqrdmulh v5.4S, v26.4S, v29.s[2] +str q18, [x0, #688] +ldr q18, [x0, #928] +mul v24.4S, v24.4S,v22.s[2] +mla v24.4S, v19.4S, v31.s[0] +sub v19.4s, v23.4s, v6.4s +add v23.4s, v23.4s, v6.4s +sqrdmulh v6.4S, v1.4S, v0.s[0] +str q28, [x0, #736] +ldr q28, [x0, #896] +mul v26.4S, v26.4S,v22.s[2] +mla v26.4S, v5.4S, v31.s[0] +sub v5.4s, v2.4s, v24.4s +add v2.4s, v2.4s, v24.4s +sqrdmulh v24.4S, v12.4S, v0.s[1] +str q17, [x0, #752] +ldr q17, [x0, #912] +mul v1.4S, v1.4S,v9.s[0] +mla v1.4S, v6.4S, v31.s[0] +sub 
v6.4s, v10.4s, v26.4s +add v10.4s, v10.4s, v26.4s +sqrdmulh v26.4S, v5.4S, v0.s[3] +str q14, [x0, #704] +mul v12.4S, v12.4S,v9.s[1] +mla v12.4S, v24.4S, v31.s[0] +sub v24.4s, v23.4s, v1.4s +add v23.4s, v23.4s, v1.4s +sqrdmulh v1.4S, v2.4S, v0.s[2] +str q11, [x0, #720] +mul v5.4S, v5.4S,v9.s[3] +mla v5.4S, v26.4S, v31.s[0] +sub v26.4s, v19.4s, v12.4s +add v19.4s, v19.4s, v12.4s +sqrdmulh v12.4S, v25.4S, v30.s[0] +mul v2.4S, v2.4S,v9.s[2] +mla v2.4S, v1.4S, v31.s[0] +sub v1.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v7.4S, v30.s[0] +mul v25.4S, v25.4S,v16.s[0] +mla v25.4S, v12.4S, v31.s[0] +sub v12.4s, v10.4s, v2.4s +add v10.4s, v10.4s, v2.4s +sqrdmulh v0.4S, v15.4S, v30.s[0] +mul v7.4S, v7.4S,v16.s[0] +mla v7.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v25.4s +add v4.4s, v4.4s, v25.4s +sqrdmulh v25.4S, v20.4S, v30.s[0] +mul v15.4S, v15.4S,v16.s[0] +mla v15.4S, v0.4S, v31.s[0] +sub v0.4s, v18.4s, v7.4s +add v18.4s, v18.4s, v7.4s +sqrdmulh v7.4S, v4.4S, v30.s[1] +str q23, [x0, #768] +mul v20.4S, v20.4S,v16.s[0] +mla v20.4S, v25.4S, v31.s[0] +sub v25.4s, v28.4s, v15.4s +add v28.4s, v28.4s, v15.4s +sqrdmulh v15.4S, v18.4S, v30.s[1] +str q24, [x0, #784] +mul v4.4S, v4.4S,v16.s[1] +mla v4.4S, v7.4S, v31.s[0] +sub v7.4s, v17.4s, v20.4s +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v5.4S, v30.s[2] +str q19, [x0, #800] +mul v18.4S, v18.4S,v16.s[1] +mla v18.4S, v15.4S, v31.s[0] +sub v15.4s, v17.4s, v4.4s +add v17.4s, v17.4s, v4.4s +sqrdmulh v4.4S, v0.4S, v30.s[2] +str q26, [x0, #816] +mul v5.4S, v5.4S,v16.s[2] +mla v5.4S, v20.4S, v31.s[0] +sub v20.4s, v28.4s, v18.4s +add v28.4s, v28.4s, v18.4s +sqrdmulh v18.4S, v17.4S, v3.s[0] +str q6, [x0, #864] +mul v0.4S, v0.4S,v16.s[2] +mla v0.4S, v4.4S, v31.s[0] +sub v4.4s, v7.4s, v5.4s +add v7.4s, v7.4s, v5.4s +sqrdmulh v5.4S, v15.4S, v3.s[1] +str q1, [x0, #880] +mul v17.4S, v17.4S,v8.s[0] +mla v17.4S, v18.4S, v31.s[0] +sub v18.4s, v25.4s, v0.4s +add v25.4s, v25.4s, v0.4s +sqrdmulh v0.4S, v4.4S, v3.s[3] +str q10, [x0, #832] 
+mul v15.4S, v15.4S,v8.s[1] +mla v15.4S, v5.4S, v31.s[0] +sub v5.4s, v28.4s, v17.4s +add v28.4s, v28.4s, v17.4s +sqrdmulh v17.4S, v7.4S, v3.s[2] +str q12, [x0, #848] +mul v4.4S, v4.4S,v8.s[3] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v20.4s, v15.4s +add v20.4s, v20.4s, v15.4s +mul v7.4S, v7.4S,v8.s[2] +mla v7.4S, v17.4S, v31.s[0] +sub v17.4s, v18.4s, v4.4s +add v18.4s, v18.4s, v4.4s +sub v4.4s, v25.4s, v7.4s +add v25.4s, v25.4s, v7.4s +str q28, [x0, #896] +str q5, [x0, #912] +str q20, [x0, #928] +str q0, [x0, #944] +str q18, [x0, #992] +str q17, [x0, #1008] +str q25, [x0, #960] +str q4, [x0, #976] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1444 +// Instruction count: 1440 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_0_0.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_0_0.s new file mode 100644 index 0000000..209eccf --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_0_0.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or 
substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include // NOTE(review): the header operand is missing here (possibly lost when this patch text was extracted); the ASM_LOAD macro used by the routine below is presumably defined by it - confirm against upstream. +modulus: // One 16-byte row holding the negated modulus (-33556993) in its first word; the routine below loads the row into q31 and its visible butterflies only read v31.s[0] as the mla multiplier. +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: // Twiddle table laid out in rows of four words, one q-register load per row. Rows come in pairs carrying the same layer/block labels: in the routine below the first row (q30) feeds mul and the second row (q29) feeds sqrdmulh with the same lane index. NOTE(review): the second-row values look like round(root * 2^31 / 33556993) and later row pairs presumably follow the same convention - confirm against the generator. +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1
+.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 
+.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 
28113639 // Layer 4, block 14
+.word 3732072 // Layer 5, block 28
+.word 22126384 // Layer 5, block 29
+.word 0 // Layer None, block None
+.word 1799135579 // Layer 4, block 14
+.word 238834379 // Layer 5, block 28
+.word 1415980503 // Layer 5, block 29
+.word 0 // Layer None, block None
+.word 8471290 // Layer 4, block 15
+.word 9445744 // Layer 5, block 30
+.word 794839 // Layer 5, block 31
+.word 0 // Layer None, block None
+.word 542121183 // Layer 4, block 15
+.word 604481480 // Layer 5, block 30
+.word 50865814 // Layer 5, block 31
+.word 0 // Layer None, block None
+.text
+.global ntt_u32_incomplete_neon_asm_var_4_2_0_0
+.global _ntt_u32_incomplete_neon_asm_var_4_2_0_0
+ntt_u32_incomplete_neon_asm_var_4_2_0_0: // Entry: x0 = buffer transformed in place (bytes 0..1023 are read and written); x0, x17 and sp are the only general-purpose registers referenced
+_ntt_u32_incomplete_neon_asm_var_4_2_0_0:
+// Save GPRs: reserve 96 bytes of stack and spill x19-x29
+sub sp, sp, #(16*5+16)
+stp x19, x20, [sp, #16*0]
+stp x19, x20, [sp, #16*0] // NOTE(review): repeats the previous store verbatim - redundant but harmless; any fix belongs in the generator
+stp x21, x22, [sp, #16*1]
+stp x23, x24, [sp, #16*2]
+stp x25, x26, [sp, #16*3]
+stp x27, x28, [sp, #16*4]
+str x29, [sp, #16*5]
+// Save NEON vector registers: spill d8-d15 (v8-v15 are overwritten below)
+sub sp, sp, #(16*4)
+stp d8, d9, [sp, #16*0]
+stp d10, d11, [sp, #16*1]
+stp d12, d13, [sp, #16*2]
+stp d14, d15, [sp, #16*3]
+ASM_LOAD (x17, modulus) // ASM_LOAD is a macro defined outside this file; presumably it puts the address of `modulus` into x17 - confirm
+ldr q31, [x17] // v31.s[0] is the constant operand of every mla below
+ASM_LOAD(x17, roots_merged) // x17 = base of the twiddle table; it is not modified again
+ldr q30, [x17, #+0] // v30/v28/v26/v24 feed the `mul` and v29/v27/v25/v23 feed the `sqrdmulh` of each twiddle multiplication
+ldr q29, [x17, #+16]
+ldr q28, [x17, #+32]
+ldr q27, [x17, #+48]
+ldr q26, [x17, #+64]
+ldr q25, [x17, #+80]
+ldr q24, [x17, #+96]
+ldr q23, [x17, #+112]
+ldr q22, [x0, #800] // Pass 1 of 4: load the 16 vectors at byte offsets 32 + 64*k, k = 0..15
+ldr q21, [x0, #864]
+ldr q20, [x0, #928]
+ldr q19, [x0, #992]
+ldr q18, [x0, #288]
+ldr q17, [x0, #352]
+ldr q16, [x0, #416]
+ldr q3, [x0, #480]
+ldr q2, [x0, #544]
+ldr q1, [x0, #608]
+ldr q0, [x0, #672]
+ldr q15, [x0, #736]
+ldr q14, [x0, #32]
+ldr q13, [x0, #96]
+ldr q12, [x0, #160]
+ldr q11, [x0, #224]
+sqrdmulh v10.4S, v22.4S, v29.s[0] // First layer: 8 butterflies with twiddle lane [0]. Each butterfly is sqrdmulh/mul/mla (twiddle product; presumably a modular reduction using v31.s[0] - confirm) followed by sub and add against the partner vector
+mul v22.4S, v22.4S,v30.s[0]
+mla v22.4S, v10.4S, v31.s[0]
+sub v10.4s, v18.4s, v22.4s
+add v18.4s, v18.4s, v22.4s
+sqrdmulh v22.4S, v21.4S, v29.s[0]
+mul v21.4S, v21.4S,v30.s[0]
+mla v21.4S, v22.4S, v31.s[0]
+sub v22.4s, v17.4s, v21.4s
+add v17.4s, v17.4s, v21.4s
+sqrdmulh v21.4S, v20.4S, v29.s[0]
+mul v20.4S, v20.4S,v30.s[0]
+mla v20.4S, v21.4S, v31.s[0]
+sub v21.4s, v16.4s, v20.4s
+add v16.4s, v16.4s, v20.4s
+sqrdmulh v20.4S, v19.4S, v29.s[0]
+mul v19.4S, v19.4S,v30.s[0]
+mla v19.4S, v20.4S, v31.s[0]
+sub v20.4s, v3.4s, v19.4s
+add v3.4s, v3.4s, v19.4s
+sqrdmulh v19.4S, v2.4S, v29.s[0]
+mul v2.4S, v2.4S,v30.s[0]
+mla v2.4S, v19.4S, v31.s[0]
+sub v19.4s, v14.4s, v2.4s
+add v14.4s, v14.4s, v2.4s
+sqrdmulh v2.4S, v1.4S, v29.s[0]
+mul v1.4S, v1.4S,v30.s[0]
+mla v1.4S, v2.4S, v31.s[0]
+sub v2.4s, v13.4s, v1.4s
+add v13.4s, v13.4s, v1.4s
+sqrdmulh v1.4S, v0.4S, v29.s[0]
+mul v0.4S, v0.4S,v30.s[0]
+mla v0.4S, v1.4S, v31.s[0]
+sub v1.4s, v12.4s, v0.4s
+add v12.4s, v12.4s, v0.4s
+sqrdmulh v0.4S, v15.4S, v29.s[0]
+mul v15.4S, v15.4S,v30.s[0]
+mla v15.4S, v0.4S, v31.s[0]
+sub v0.4s, v11.4s, v15.4s
+add v11.4s, v11.4s, v15.4s
+sqrdmulh v15.4S, v16.4S, v29.s[1] // Second layer: 4 butterflies with lane [1], then 4 with lane [2]
+mul v16.4S, v16.4S,v30.s[1]
+mla v16.4S, v15.4S, v31.s[0]
+sub v15.4s, v12.4s, v16.4s
+add v12.4s, v12.4s, v16.4s
+sqrdmulh v16.4S, v3.4S, v29.s[1]
+mul v3.4S, v3.4S,v30.s[1]
+mla v3.4S, v16.4S, v31.s[0]
+sub v16.4s, v11.4s, v3.4s
+add v11.4s, v11.4s, v3.4s
+sqrdmulh v3.4S, v18.4S, v29.s[1]
+mul v18.4S, v18.4S,v30.s[1]
+mla v18.4S, v3.4S, v31.s[0]
+sub v3.4s, v14.4s, v18.4s
+add v14.4s, v14.4s, v18.4s
+sqrdmulh v18.4S, v17.4S, v29.s[1]
+mul v17.4S, v17.4S,v30.s[1]
+mla v17.4S, v18.4S, v31.s[0]
+sub v18.4s, v13.4s, v17.4s
+add v13.4s, v13.4s, v17.4s
+sqrdmulh v17.4S, v21.4S, v29.s[2]
+mul v21.4S, v21.4S,v30.s[2]
+mla v21.4S, v17.4S, v31.s[0]
+sub v17.4s, v1.4s, v21.4s
+add v1.4s, v1.4s, v21.4s
+sqrdmulh v21.4S, v20.4S, v29.s[2]
+mul v20.4S, v20.4S,v30.s[2]
+mla v20.4S, v21.4S, v31.s[0]
+sub v21.4s, v0.4s, v20.4s
+add v0.4s, v0.4s, v20.4s
+sqrdmulh v20.4S, v10.4S, v29.s[2]
+mul v10.4S, v10.4S,v30.s[2]
+mla v10.4S, v20.4S, v31.s[0]
+sub v20.4s, v19.4s, v10.4s
+add v19.4s, v19.4s, v10.4s
+sqrdmulh v10.4S, v22.4S, v29.s[2]
+mul v22.4S, v22.4S,v30.s[2]
+mla v22.4S, v10.4S, v31.s[0]
+sub v10.4s, v2.4s, v22.4s
+add v2.4s, v2.4s, v22.4s
+sqrdmulh v22.4S, v12.4S, v27.s[0] // Third layer: 2 butterflies for each of lanes [0]..[3] of v27/v28
+mul v12.4S, v12.4S,v28.s[0]
+mla v12.4S, v22.4S, v31.s[0]
+sub v22.4s, v14.4s, v12.4s
+add v14.4s, v14.4s, v12.4s
+sqrdmulh v12.4S, v11.4S, v27.s[0]
+mul v11.4S, v11.4S,v28.s[0]
+mla v11.4S, v12.4S, v31.s[0]
+sub v12.4s, v13.4s, v11.4s
+add v13.4s, v13.4s, v11.4s
+sqrdmulh v11.4S, v15.4S, v27.s[1]
+mul v15.4S, v15.4S,v28.s[1]
+mla v15.4S, v11.4S, v31.s[0]
+sub v11.4s, v3.4s, v15.4s
+add v3.4s, v3.4s, v15.4s
+sqrdmulh v15.4S, v16.4S, v27.s[1]
+mul v16.4S, v16.4S,v28.s[1]
+mla v16.4S, v15.4S, v31.s[0]
+sub v15.4s, v18.4s, v16.4s
+add v18.4s, v18.4s, v16.4s
+sqrdmulh v16.4S, v1.4S, v27.s[2]
+mul v1.4S, v1.4S,v28.s[2]
+mla v1.4S, v16.4S, v31.s[0]
+sub v16.4s, v19.4s, v1.4s
+add v19.4s, v19.4s, v1.4s
+sqrdmulh v1.4S, v0.4S, v27.s[2]
+mul v0.4S, v0.4S,v28.s[2]
+mla v0.4S, v1.4S, v31.s[0]
+sub v1.4s, v2.4s, v0.4s
+add v2.4s, v2.4s, v0.4s
+sqrdmulh v0.4S, v17.4S, v27.s[3]
+mul v17.4S, v17.4S,v28.s[3]
+mla v17.4S, v0.4S, v31.s[0]
+sub v0.4s, v20.4s, v17.4s
+add v20.4s, v20.4s, v17.4s
+sqrdmulh v17.4S, v21.4S, v27.s[3]
+mul v21.4S, v21.4S,v28.s[3]
+mla v21.4S, v17.4S, v31.s[0]
+sub v17.4s, v10.4s, v21.4s
+add v10.4s, v10.4s, v21.4s
+sqrdmulh v21.4S, v13.4S, v25.s[0] // Fourth layer: 1 butterfly per lane of v25/v26, then 1 per lane of v23/v24
+mul v13.4S, v13.4S,v26.s[0]
+mla v13.4S, v21.4S, v31.s[0]
+sub v21.4s, v14.4s, v13.4s
+add v14.4s, v14.4s, v13.4s
+sqrdmulh v13.4S, v12.4S, v25.s[1]
+mul v12.4S, v12.4S,v26.s[1]
+mla v12.4S, v13.4S, v31.s[0]
+sub v13.4s, v22.4s, v12.4s
+add v22.4s, v22.4s, v12.4s
+sqrdmulh v12.4S, v18.4S, v25.s[2]
+mul v18.4S, v18.4S,v26.s[2]
+mla v18.4S, v12.4S, v31.s[0]
+sub v12.4s, v3.4s, v18.4s
+add v3.4s, v3.4s, v18.4s
+sqrdmulh v18.4S, v15.4S, v25.s[3]
+mul v15.4S, v15.4S,v26.s[3]
+mla v15.4S, v18.4S, v31.s[0]
+sub v18.4s, v11.4s, v15.4s
+add v11.4s, v11.4s, v15.4s
+sqrdmulh v15.4S, v2.4S, v23.s[0]
+mul v2.4S, v2.4S,v24.s[0]
+mla v2.4S, v15.4S, v31.s[0]
+sub v15.4s, v19.4s, v2.4s
+add v19.4s, v19.4s, v2.4s
+sqrdmulh v2.4S, v1.4S, v23.s[1]
+mul v1.4S, v1.4S,v24.s[1]
+mla v1.4S, v2.4S, v31.s[0]
+sub v2.4s, v16.4s, v1.4s
+add v16.4s, v16.4s, v1.4s
+sqrdmulh v1.4S, v10.4S, v23.s[2]
+mul v10.4S, v10.4S,v24.s[2]
+mla v10.4S, v1.4S, v31.s[0]
+sub v1.4s, v20.4s, v10.4s
+add v20.4s, v20.4s, v10.4s
+sqrdmulh v10.4S, v17.4S, v23.s[3]
+mul v17.4S, v17.4S,v24.s[3]
+mla v17.4S, v10.4S, v31.s[0]
+sub v10.4s, v0.4s, v17.4s
+add v0.4s, v0.4s, v17.4s
+str q14, [x0, #32] // Write the 16 results back to the same 16 offsets that were loaded
+str q21, [x0, #96]
+str q22, [x0, #160]
+str q13, [x0, #224]
+str q3, [x0, #288]
+str q12, [x0, #352]
+str q11, [x0, #416]
+str q18, [x0, #480]
+str q19, [x0, #544]
+str q15, [x0, #608]
+str q16, [x0, #672]
+str q2, [x0, #736]
+str q20, [x0, #800]
+str q1, [x0, #864]
+str q0, [x0, #928]
+str q10, [x0, #992]
+ldr q10, [x0, #816] // Pass 2 of 4: same four layers on byte offsets 48 + 64*k
+ldr q0, [x0, #880]
+ldr q1, [x0, #944]
+ldr q20, [x0, #1008]
+ldr q2, [x0, #304]
+ldr q16, [x0, #368]
+ldr q15, [x0, #432]
+ldr q19, [x0, #496]
+ldr q18, [x0, #560]
+ldr q11, [x0, #624]
+ldr q12, [x0, #688]
+ldr q3, [x0, #752]
+ldr q13, [x0, #48]
+ldr q22, [x0, #112]
+ldr q21, [x0, #176]
+ldr q14, [x0, #240]
+sqrdmulh v17.4S, v10.4S, v29.s[0]
+mul v10.4S, v10.4S,v30.s[0]
+mla v10.4S, v17.4S, v31.s[0]
+sub v17.4s, v2.4s, v10.4s
+add v2.4s, v2.4s, v10.4s
+sqrdmulh v10.4S, v0.4S, v29.s[0]
+mul v0.4S, v0.4S,v30.s[0]
+mla v0.4S, v10.4S, v31.s[0]
+sub v10.4s, v16.4s, v0.4s
+add v16.4s, v16.4s, v0.4s
+sqrdmulh v0.4S, v1.4S, v29.s[0]
+mul v1.4S, v1.4S,v30.s[0]
+mla v1.4S, v0.4S, v31.s[0]
+sub v0.4s, v15.4s, v1.4s
+add v15.4s, v15.4s, v1.4s
+sqrdmulh v1.4S, v20.4S, v29.s[0]
+mul v20.4S, v20.4S,v30.s[0]
+mla v20.4S, v1.4S, v31.s[0]
+sub v1.4s, v19.4s, v20.4s
+add v19.4s, v19.4s, v20.4s
+sqrdmulh v20.4S, v18.4S, v29.s[0]
+mul v18.4S, v18.4S,v30.s[0]
+mla v18.4S, v20.4S, v31.s[0]
+sub v20.4s, v13.4s, v18.4s
+add v13.4s, v13.4s, v18.4s
+sqrdmulh v18.4S, v11.4S, v29.s[0]
+mul v11.4S, v11.4S,v30.s[0]
+mla v11.4S, v18.4S, v31.s[0]
+sub v18.4s, v22.4s, v11.4s
+add v22.4s, v22.4s, v11.4s
+sqrdmulh v11.4S, v12.4S, v29.s[0]
+mul v12.4S, v12.4S,v30.s[0]
+mla v12.4S, v11.4S, v31.s[0]
+sub v11.4s, v21.4s, v12.4s
+add v21.4s, v21.4s, v12.4s
+sqrdmulh v12.4S, v3.4S, v29.s[0]
+mul v3.4S, v3.4S,v30.s[0]
+mla v3.4S, v12.4S, v31.s[0]
+sub v12.4s, v14.4s, v3.4s
+add v14.4s, v14.4s, v3.4s
+sqrdmulh v3.4S, v15.4S, v29.s[1]
+mul v15.4S, v15.4S,v30.s[1]
+mla v15.4S, v3.4S, v31.s[0]
+sub v3.4s, v21.4s, v15.4s
+add v21.4s, v21.4s, v15.4s
+sqrdmulh v15.4S, v19.4S, v29.s[1]
+mul v19.4S, v19.4S,v30.s[1]
+mla v19.4S, v15.4S, v31.s[0]
+sub v15.4s, v14.4s, v19.4s
+add v14.4s, v14.4s, v19.4s
+sqrdmulh v19.4S, v2.4S, v29.s[1]
+mul v2.4S, v2.4S,v30.s[1]
+mla v2.4S, v19.4S, v31.s[0]
+sub v19.4s, v13.4s, v2.4s
+add v13.4s, v13.4s, v2.4s
+sqrdmulh v2.4S, v16.4S, v29.s[1]
+mul v16.4S, v16.4S,v30.s[1]
+mla v16.4S, v2.4S, v31.s[0]
+sub v2.4s, v22.4s, v16.4s
+add v22.4s, v22.4s, v16.4s
+sqrdmulh v16.4S, v0.4S, v29.s[2]
+mul v0.4S, v0.4S,v30.s[2]
+mla v0.4S, v16.4S, v31.s[0]
+sub v16.4s, v11.4s, v0.4s
+add v11.4s, v11.4s, v0.4s
+sqrdmulh v0.4S, v1.4S, v29.s[2]
+mul v1.4S, v1.4S,v30.s[2]
+mla v1.4S, v0.4S, v31.s[0]
+sub v0.4s, v12.4s, v1.4s
+add v12.4s, v12.4s, v1.4s
+sqrdmulh v1.4S, v17.4S, v29.s[2]
+mul v17.4S, v17.4S,v30.s[2]
+mla v17.4S, v1.4S, v31.s[0]
+sub v1.4s, v20.4s, v17.4s
+add v20.4s, v20.4s, v17.4s
+sqrdmulh v17.4S, v10.4S, v29.s[2]
+mul v10.4S, v10.4S,v30.s[2]
+mla v10.4S, v17.4S, v31.s[0]
+sub v17.4s, v18.4s, v10.4s
+add v18.4s, v18.4s, v10.4s
+sqrdmulh v10.4S, v21.4S, v27.s[0]
+mul v21.4S, v21.4S,v28.s[0]
+mla v21.4S, v10.4S, v31.s[0]
+sub v10.4s, v13.4s, v21.4s
+add v13.4s, v13.4s, v21.4s
+sqrdmulh v21.4S, v14.4S, v27.s[0]
+mul v14.4S, v14.4S,v28.s[0]
+mla v14.4S, v21.4S, v31.s[0]
+sub v21.4s, v22.4s, v14.4s
+add v22.4s, v22.4s, v14.4s
+sqrdmulh v14.4S, v3.4S, v27.s[1]
+mul v3.4S, v3.4S,v28.s[1]
+mla v3.4S, v14.4S, v31.s[0]
+sub v14.4s, v19.4s, v3.4s
+add v19.4s, v19.4s, v3.4s
+sqrdmulh v3.4S, v15.4S, v27.s[1]
+mul v15.4S, v15.4S,v28.s[1]
+mla v15.4S, v3.4S, v31.s[0]
+sub v3.4s, v2.4s, v15.4s
+add v2.4s, v2.4s, v15.4s
+sqrdmulh v15.4S, v11.4S, v27.s[2]
+mul v11.4S, v11.4S,v28.s[2]
+mla v11.4S, v15.4S, v31.s[0]
+sub v15.4s, v20.4s, v11.4s
+add v20.4s, v20.4s, v11.4s
+sqrdmulh v11.4S, v12.4S, v27.s[2]
+mul v12.4S, v12.4S,v28.s[2]
+mla v12.4S, v11.4S, v31.s[0]
+sub v11.4s, v18.4s, v12.4s
+add v18.4s, v18.4s, v12.4s
+sqrdmulh v12.4S, v16.4S, v27.s[3]
+mul v16.4S, v16.4S,v28.s[3]
+mla v16.4S, v12.4S, v31.s[0]
+sub v12.4s, v1.4s, v16.4s
+add v1.4s, v1.4s, v16.4s
+sqrdmulh v16.4S, v0.4S, v27.s[3]
+mul v0.4S, v0.4S,v28.s[3]
+mla v0.4S, v16.4S, v31.s[0]
+sub v16.4s, v17.4s, v0.4s
+add v17.4s, v17.4s, v0.4s
+sqrdmulh v0.4S, v22.4S, v25.s[0]
+mul v22.4S, v22.4S,v26.s[0]
+mla v22.4S, v0.4S, v31.s[0]
+sub v0.4s, v13.4s, v22.4s
+add v13.4s, v13.4s, v22.4s
+sqrdmulh v22.4S, v21.4S, v25.s[1]
+mul v21.4S, v21.4S,v26.s[1]
+mla v21.4S, v22.4S, v31.s[0]
+sub v22.4s, v10.4s, v21.4s
+add v10.4s, v10.4s, v21.4s
+sqrdmulh v21.4S, v2.4S, v25.s[2]
+mul v2.4S, v2.4S,v26.s[2]
+mla v2.4S, v21.4S, v31.s[0]
+sub v21.4s, v19.4s, v2.4s
+add v19.4s, v19.4s, v2.4s
+sqrdmulh v2.4S, v3.4S, v25.s[3]
+mul v3.4S, v3.4S,v26.s[3]
+mla v3.4S, v2.4S, v31.s[0]
+sub v2.4s, v14.4s, v3.4s
+add v14.4s, v14.4s, v3.4s
+sqrdmulh v3.4S, v18.4S, v23.s[0]
+mul v18.4S, v18.4S,v24.s[0]
+mla v18.4S, v3.4S, v31.s[0]
+sub v3.4s, v20.4s, v18.4s
+add v20.4s, v20.4s, v18.4s
+sqrdmulh v18.4S, v11.4S, v23.s[1]
+mul v11.4S, v11.4S,v24.s[1]
+mla v11.4S, v18.4S, v31.s[0]
+sub v18.4s, v15.4s, v11.4s
+add v15.4s, v15.4s, v11.4s
+sqrdmulh v11.4S, v17.4S, v23.s[2]
+mul v17.4S, v17.4S,v24.s[2]
+mla v17.4S, v11.4S, v31.s[0]
+sub v11.4s, v1.4s, v17.4s
+add v1.4s, v1.4s, v17.4s
+sqrdmulh v17.4S, v16.4S, v23.s[3]
+mul v16.4S, v16.4S,v24.s[3]
+mla v16.4S, v17.4S, v31.s[0]
+sub v17.4s, v12.4s, v16.4s
+add v12.4s, v12.4s, v16.4s
+str q13, [x0, #48]
+str q0, [x0, #112]
+str q10, [x0, #176]
+str q22, [x0, #240]
+str q19, [x0, #304]
+str q21, [x0, #368]
+str q14, [x0, #432]
+str q2, [x0, #496]
+str q20, [x0, #560]
+str q3, [x0, #624]
+str q15, [x0, #688]
+str q18, [x0, #752]
+str q1, [x0, #816]
+str q11, [x0, #880]
+str q12, [x0, #944]
+str q17, [x0, #1008]
+ldr q17, [x0, #768] // Pass 3 of 4: same four layers on byte offsets 0 + 64*k
+ldr q12, [x0, #832]
+ldr q11, [x0, #896]
+ldr q1, [x0, #960]
+ldr q18, [x0, #256]
+ldr q15, [x0, #320]
+ldr q3, [x0, #384]
+ldr q20, [x0, #448]
+ldr q2, [x0, #512]
+ldr q14, [x0, #576]
+ldr q21, [x0, #640]
+ldr q19, [x0, #704]
+ldr q22, [x0, #0]
+ldr q10, [x0, #64]
+ldr q0, [x0, #128]
+ldr q13, [x0, #192]
+sqrdmulh v16.4S, v17.4S, v29.s[0]
+mul v17.4S, v17.4S,v30.s[0]
+mla v17.4S, v16.4S, v31.s[0]
+sub v16.4s, v18.4s, v17.4s
+add v18.4s, v18.4s, v17.4s
+sqrdmulh v17.4S, v12.4S, v29.s[0]
+mul v12.4S, v12.4S,v30.s[0]
+mla v12.4S, v17.4S, v31.s[0]
+sub v17.4s, v15.4s, v12.4s
+add v15.4s, v15.4s, v12.4s
+sqrdmulh v12.4S, v11.4S, v29.s[0]
+mul v11.4S, v11.4S,v30.s[0]
+mla v11.4S, v12.4S, v31.s[0]
+sub v12.4s, v3.4s, v11.4s
+add v3.4s, v3.4s, v11.4s
+sqrdmulh v11.4S, v1.4S, v29.s[0]
+mul v1.4S, v1.4S,v30.s[0]
+mla v1.4S, v11.4S, v31.s[0]
+sub v11.4s, v20.4s, v1.4s
+add v20.4s, v20.4s, v1.4s
+sqrdmulh v1.4S, v2.4S, v29.s[0]
+mul v2.4S, v2.4S,v30.s[0]
+mla v2.4S, v1.4S, v31.s[0]
+sub v1.4s, v22.4s, v2.4s
+add v22.4s, v22.4s, v2.4s
+sqrdmulh v2.4S, v14.4S, v29.s[0]
+mul v14.4S, v14.4S,v30.s[0]
+mla v14.4S, v2.4S, v31.s[0]
+sub v2.4s, v10.4s, v14.4s
+add v10.4s, v10.4s, v14.4s
+sqrdmulh v14.4S, v21.4S, v29.s[0]
+mul v21.4S, v21.4S,v30.s[0]
+mla v21.4S, v14.4S, v31.s[0]
+sub v14.4s, v0.4s, v21.4s
+add v0.4s, v0.4s, v21.4s
+sqrdmulh v21.4S, v19.4S, v29.s[0]
+mul v19.4S, v19.4S,v30.s[0]
+mla v19.4S, v21.4S, v31.s[0]
+sub v21.4s, v13.4s, v19.4s
+add v13.4s, v13.4s, v19.4s
+sqrdmulh v19.4S, v3.4S, v29.s[1]
+mul v3.4S, v3.4S,v30.s[1]
+mla v3.4S, v19.4S, v31.s[0]
+sub v19.4s, v0.4s, v3.4s
+add v0.4s, v0.4s, v3.4s
+sqrdmulh v3.4S, v20.4S, v29.s[1]
+mul v20.4S, v20.4S,v30.s[1]
+mla v20.4S, v3.4S, v31.s[0]
+sub v3.4s, v13.4s, v20.4s
+add v13.4s, v13.4s, v20.4s
+sqrdmulh v20.4S, v18.4S, v29.s[1]
+mul v18.4S, v18.4S,v30.s[1]
+mla v18.4S, v20.4S, v31.s[0]
+sub v20.4s, v22.4s, v18.4s
+add v22.4s, v22.4s, v18.4s
+sqrdmulh v18.4S, v15.4S, v29.s[1]
+mul v15.4S, v15.4S,v30.s[1]
+mla v15.4S, v18.4S, v31.s[0]
+sub v18.4s, v10.4s, v15.4s
+add v10.4s, v10.4s, v15.4s
+sqrdmulh v15.4S, v12.4S, v29.s[2]
+mul v12.4S, v12.4S,v30.s[2]
+mla v12.4S, v15.4S, v31.s[0]
+sub v15.4s, v14.4s, v12.4s
+add v14.4s, v14.4s, v12.4s
+sqrdmulh v12.4S, v11.4S, v29.s[2]
+mul v11.4S, v11.4S,v30.s[2]
+mla v11.4S, v12.4S, v31.s[0]
+sub v12.4s, v21.4s, v11.4s
+add v21.4s, v21.4s, v11.4s
+sqrdmulh v11.4S, v16.4S, v29.s[2]
+mul v16.4S, v16.4S,v30.s[2]
+mla v16.4S, v11.4S, v31.s[0]
+sub v11.4s, v1.4s, v16.4s
+add v1.4s, v1.4s, v16.4s
+sqrdmulh v16.4S, v17.4S, v29.s[2]
+mul v17.4S, v17.4S,v30.s[2]
+mla v17.4S, v16.4S, v31.s[0]
+sub v16.4s, v2.4s, v17.4s
+add v2.4s, v2.4s, v17.4s
+sqrdmulh v17.4S, v0.4S, v27.s[0]
+mul v0.4S, v0.4S,v28.s[0]
+mla v0.4S, v17.4S, v31.s[0]
+sub v17.4s, v22.4s, v0.4s
+add v22.4s, v22.4s, v0.4s
+sqrdmulh v0.4S, v13.4S, v27.s[0]
+mul v13.4S, v13.4S,v28.s[0]
+mla v13.4S, v0.4S, v31.s[0]
+sub v0.4s, v10.4s, v13.4s
+add v10.4s, v10.4s, v13.4s
+sqrdmulh v13.4S, v19.4S, v27.s[1]
+mul v19.4S, v19.4S,v28.s[1]
+mla v19.4S, v13.4S, v31.s[0]
+sub v13.4s, v20.4s, v19.4s
+add v20.4s, v20.4s, v19.4s
+sqrdmulh v19.4S, v3.4S, v27.s[1]
+mul v3.4S, v3.4S,v28.s[1]
+mla v3.4S, v19.4S, v31.s[0]
+sub v19.4s, v18.4s, v3.4s
+add v18.4s, v18.4s, v3.4s
+sqrdmulh v3.4S, v14.4S, v27.s[2]
+mul v14.4S, v14.4S,v28.s[2]
+mla v14.4S, v3.4S, v31.s[0]
+sub v3.4s, v1.4s, v14.4s
+add v1.4s, v1.4s, v14.4s
+sqrdmulh v14.4S, v21.4S, v27.s[2]
+mul v21.4S, v21.4S,v28.s[2]
+mla v21.4S, v14.4S, v31.s[0]
+sub v14.4s, v2.4s, v21.4s
+add v2.4s, v2.4s, v21.4s
+sqrdmulh v21.4S, v15.4S, v27.s[3]
+mul v15.4S, v15.4S,v28.s[3]
+mla v15.4S, v21.4S, v31.s[0]
+sub v21.4s, v11.4s, v15.4s
+add v11.4s, v11.4s, v15.4s
+sqrdmulh v15.4S, v12.4S, v27.s[3]
+mul v12.4S, v12.4S,v28.s[3]
+mla v12.4S, v15.4S, v31.s[0]
+sub v15.4s, v16.4s, v12.4s
+add v16.4s, v16.4s, v12.4s
+sqrdmulh v12.4S, v10.4S, v25.s[0]
+mul v10.4S, v10.4S,v26.s[0]
+mla v10.4S, v12.4S, v31.s[0]
+sub v12.4s, v22.4s, v10.4s
+add v22.4s, v22.4s, v10.4s
+sqrdmulh v10.4S, v0.4S, v25.s[1]
+mul v0.4S, v0.4S,v26.s[1]
+mla v0.4S, v10.4S, v31.s[0]
+sub v10.4s, v17.4s, v0.4s
+add v17.4s, v17.4s, v0.4s
+sqrdmulh v0.4S, v18.4S, v25.s[2]
+mul v18.4S, v18.4S,v26.s[2]
+mla v18.4S, v0.4S, v31.s[0]
+sub v0.4s, v20.4s, v18.4s
+add v20.4s, v20.4s, v18.4s
+sqrdmulh v18.4S, v19.4S, v25.s[3]
+mul v19.4S, v19.4S,v26.s[3]
+mla v19.4S, v18.4S, v31.s[0]
+sub v18.4s, v13.4s, v19.4s
+add v13.4s, v13.4s, v19.4s
+sqrdmulh v19.4S, v2.4S, v23.s[0]
+mul v2.4S, v2.4S,v24.s[0]
+mla v2.4S, v19.4S, v31.s[0]
+sub v19.4s, v1.4s, v2.4s
+add v1.4s, v1.4s, v2.4s
+sqrdmulh v2.4S, v14.4S, v23.s[1]
+mul v14.4S, v14.4S,v24.s[1]
+mla v14.4S, v2.4S, v31.s[0]
+sub v2.4s, v3.4s, v14.4s
+add v3.4s, v3.4s, v14.4s
+sqrdmulh v14.4S, v16.4S, v23.s[2]
+mul v16.4S, v16.4S,v24.s[2]
+mla v16.4S, v14.4S, v31.s[0]
+sub v14.4s, v11.4s, v16.4s
+add v11.4s, v11.4s, v16.4s
+sqrdmulh v16.4S, v15.4S, v23.s[3]
+mul v15.4S, v15.4S,v24.s[3]
+mla v15.4S, v16.4S, v31.s[0]
+sub v16.4s, v21.4s, v15.4s
+add v21.4s, v21.4s, v15.4s
+str q22, [x0, #0]
+str q12, [x0, #64]
+str q17, [x0, #128]
+str q10, [x0, #192]
+str q20, [x0, #256]
+str q0, [x0, #320]
+str q13, [x0, #384]
+str q18, [x0, #448]
+str q1, [x0, #512]
+str q19, [x0, #576]
+str q3, [x0, #640]
+str q2, [x0, #704]
+str q11, [x0, #768]
+str q14, [x0, #832]
+str q21, [x0, #896]
+str q16, [x0, #960]
+ldr q16, [x0, #784] // Pass 4 of 4: same four layers on byte offsets 16 + 64*k
+ldr q21, [x0, #848]
+ldr q14, [x0, #912]
+ldr q11, [x0, #976]
+ldr q2, [x0, #272]
+ldr q3, [x0, #336]
+ldr q19, [x0, #400]
+ldr q1, [x0, #464]
+ldr q18, [x0, #528]
+ldr q13, [x0, #592]
+ldr q0, [x0, #656]
+ldr q20, [x0, #720]
+ldr q10, [x0, #16]
+ldr q17, [x0, #80]
+ldr q12, [x0, #144]
+ldr q22, [x0, #208]
+sqrdmulh v15.4S, v16.4S, v29.s[0]
+mul v16.4S, v16.4S,v30.s[0]
+mla v16.4S, v15.4S, v31.s[0]
+sub v15.4s, v2.4s, v16.4s
+add v2.4s, v2.4s, v16.4s
+sqrdmulh v16.4S, v21.4S, v29.s[0]
+mul v21.4S, v21.4S,v30.s[0]
+mla v21.4S, v16.4S, v31.s[0]
+sub v16.4s, v3.4s, v21.4s
+add v3.4s, v3.4s, v21.4s
+sqrdmulh v21.4S, v14.4S, v29.s[0]
+mul v14.4S, v14.4S,v30.s[0]
+mla v14.4S, v21.4S, v31.s[0]
+sub v21.4s, v19.4s, v14.4s
+add v19.4s, v19.4s, v14.4s
+sqrdmulh v14.4S, v11.4S, v29.s[0]
+mul v11.4S, v11.4S,v30.s[0]
+mla v11.4S, v14.4S, v31.s[0]
+sub v14.4s, v1.4s, v11.4s
+add v1.4s, v1.4s, v11.4s
+sqrdmulh v11.4S, v18.4S, v29.s[0]
+mul v18.4S, v18.4S,v30.s[0]
+mla v18.4S, v11.4S, v31.s[0]
+sub v11.4s, v10.4s, v18.4s
+add v10.4s, v10.4s, v18.4s
+sqrdmulh v18.4S, v13.4S, v29.s[0]
+mul v13.4S, v13.4S,v30.s[0]
+mla v13.4S, v18.4S, v31.s[0]
+sub v18.4s, v17.4s, v13.4s
+add v17.4s, v17.4s, v13.4s
+sqrdmulh v13.4S, v0.4S, v29.s[0]
+mul v0.4S, v0.4S,v30.s[0]
+mla v0.4S, v13.4S, v31.s[0]
+sub v13.4s, v12.4s, v0.4s
+add v12.4s, v12.4s, v0.4s
+sqrdmulh v0.4S, v20.4S, v29.s[0]
+mul v20.4S, v20.4S,v30.s[0]
+mla v20.4S, v0.4S, v31.s[0]
+sub v0.4s, v22.4s, v20.4s
+add v22.4s, v22.4s, v20.4s
+sqrdmulh v20.4S, v19.4S, v29.s[1]
+mul v19.4S, v19.4S,v30.s[1]
+mla v19.4S, v20.4S, v31.s[0]
+sub v20.4s, v12.4s, v19.4s
+add v12.4s, v12.4s, v19.4s
+sqrdmulh v19.4S, v1.4S, v29.s[1]
+mul v1.4S, v1.4S,v30.s[1]
+mla v1.4S, v19.4S, v31.s[0]
+sub v19.4s, v22.4s, v1.4s
+add v22.4s, v22.4s, v1.4s
+sqrdmulh v1.4S, v2.4S, v29.s[1]
+mul v2.4S, v2.4S,v30.s[1]
+mla v2.4S, v1.4S, v31.s[0]
+sub v1.4s, v10.4s, v2.4s
+add v10.4s, v10.4s, v2.4s
+sqrdmulh v2.4S, v3.4S, v29.s[1]
+mul v3.4S, v3.4S,v30.s[1]
+mla v3.4S, v2.4S, v31.s[0]
+sub v2.4s, v17.4s, v3.4s
+add v17.4s, v17.4s, v3.4s
+sqrdmulh v3.4S, v21.4S, v29.s[2]
+mul v21.4S, v21.4S,v30.s[2]
+mla v21.4S, v3.4S, v31.s[0]
+sub v3.4s, v13.4s, v21.4s
+add v13.4s, v13.4s, v21.4s
+sqrdmulh v21.4S, v14.4S, v29.s[2]
+mul v14.4S, v14.4S,v30.s[2]
+mla v14.4S, v21.4S, v31.s[0]
+sub v21.4s, v0.4s, v14.4s
+add v0.4s, v0.4s, v14.4s
+sqrdmulh v14.4S, v15.4S, v29.s[2]
+mul v15.4S, v15.4S,v30.s[2]
+mla v15.4S, v14.4S, v31.s[0]
+sub v14.4s, v11.4s, v15.4s
+add v11.4s, v11.4s, v15.4s
+sqrdmulh v15.4S, v16.4S, v29.s[2]
+mul v16.4S, v16.4S,v30.s[2]
+mla v16.4S, v15.4S, v31.s[0]
+sub v15.4s, v18.4s, v16.4s
+add v18.4s, v18.4s, v16.4s
+sqrdmulh v16.4S, v12.4S, v27.s[0]
+mul v12.4S, v12.4S,v28.s[0]
+mla v12.4S, v16.4S, v31.s[0]
+sub v16.4s, v10.4s, v12.4s
+add v10.4s, v10.4s, v12.4s
+sqrdmulh v12.4S, v22.4S, v27.s[0]
+mul v22.4S, v22.4S,v28.s[0]
+mla v22.4S, v12.4S, v31.s[0]
+sub v12.4s, v17.4s, v22.4s
+add v17.4s, v17.4s, v22.4s
+sqrdmulh v22.4S, v20.4S, v27.s[1]
+mul v20.4S, v20.4S,v28.s[1]
+mla v20.4S, v22.4S, v31.s[0]
+sub v22.4s, v1.4s, v20.4s
+add v1.4s, v1.4s, v20.4s
+sqrdmulh v20.4S, v19.4S, v27.s[1]
+mul v19.4S, v19.4S,v28.s[1]
+mla v19.4S, v20.4S, v31.s[0]
+sub v20.4s, v2.4s, v19.4s
+add v2.4s, v2.4s, v19.4s
+sqrdmulh v19.4S, v13.4S, v27.s[2]
+mul v13.4S, v13.4S,v28.s[2]
+mla v13.4S, v19.4S, v31.s[0]
+sub v19.4s, v11.4s, v13.4s
+add v11.4s, v11.4s, v13.4s
+sqrdmulh v13.4S, v0.4S, v27.s[2]
+mul v0.4S, v0.4S,v28.s[2]
+mla v0.4S, v13.4S, v31.s[0]
+sub v13.4s, v18.4s, v0.4s
+add v18.4s, v18.4s, v0.4s
+sqrdmulh v0.4S, v3.4S, v27.s[3]
+mul v3.4S, v3.4S,v28.s[3]
+mla v3.4S, v0.4S, v31.s[0]
+sub v0.4s, v14.4s, v3.4s
+add v14.4s, v14.4s, v3.4s
+sqrdmulh v3.4S, v21.4S, v27.s[3]
+mul v21.4S, v21.4S,v28.s[3]
+mla v21.4S, v3.4S, v31.s[0]
+sub v3.4s, v15.4s, v21.4s
+add v15.4s, v15.4s, v21.4s
+sqrdmulh v21.4S, v17.4S, v25.s[0]
+mul v17.4S, v17.4S,v26.s[0]
+mla v17.4S, v21.4S, v31.s[0]
+sub v21.4s, v10.4s, v17.4s
+add v10.4s, v10.4s, v17.4s
+sqrdmulh v17.4S, v12.4S, v25.s[1]
+mul v12.4S, v12.4S,v26.s[1]
+mla v12.4S, v17.4S, v31.s[0]
+sub v17.4s, v16.4s, v12.4s
+add v16.4s, v16.4s, v12.4s
+sqrdmulh v12.4S, v2.4S, v25.s[2]
+mul v2.4S, v2.4S,v26.s[2]
+mla v2.4S, v12.4S, v31.s[0]
+sub v12.4s, v1.4s, v2.4s
+add v1.4s, v1.4s, v2.4s
+sqrdmulh v2.4S, v20.4S, v25.s[3]
+mul v20.4S, v20.4S,v26.s[3]
+mla v20.4S, v2.4S, v31.s[0]
+sub v2.4s, v22.4s, v20.4s
+add v22.4s, v22.4s, v20.4s
+sqrdmulh v20.4S, v18.4S, v23.s[0]
+mul v18.4S, v18.4S,v24.s[0]
+mla v18.4S, v20.4S, v31.s[0]
+sub v20.4s, v11.4s, v18.4s
+add v11.4s, v11.4s, v18.4s
+sqrdmulh v18.4S, v13.4S, v23.s[1]
+mul v13.4S, v13.4S,v24.s[1]
+mla v13.4S, v18.4S, v31.s[0]
+sub v18.4s, v19.4s, v13.4s
+add v19.4s, v19.4s, v13.4s
+sqrdmulh v13.4S, v15.4S, v23.s[2]
+mul v15.4S, v15.4S,v24.s[2]
+mla v15.4S, v13.4S, v31.s[0]
+sub v13.4s, v14.4s, v15.4s
+add v14.4s, v14.4s, v15.4s
+sqrdmulh v15.4S, v3.4S, v23.s[3]
+mul v3.4S, v3.4S,v24.s[3]
+mla v3.4S, v15.4S, v31.s[0]
+sub v15.4s, v0.4s, v3.4s
+add v0.4s, v0.4s, v3.4s
+str q10, [x0, #16]
+str q21, [x0, #80]
+str q16, [x0, #144]
+str q17, [x0, #208]
+str q1, [x0, #272]
+str q12, [x0, #336]
+str q22, [x0, #400]
+str q2, [x0, #464]
+str q11, [x0, #528]
+str q20, [x0, #592]
+str q19, [x0, #656]
+str q18, [x0, #720]
+str q14, [x0, #784]
+str q13, [x0, #848]
+str q0, [x0, #912]
+str q15, [x0, #976]
+ldr q4, [x17, #+128] // Second stage: 16 groups of 4 consecutive vectors (64 bytes each), two butterfly layers per group. Group k takes its twiddles from x17 + 128 + 32*k (mul operand) and x17 + 144 + 32*k (sqrdmulh operand)
+ldr q5, [x17, #+144]
+ldr q6, [x0, #32] // Group 0 (bytes 0..63): lane [0] pairs vectors (2,0) and (3,1); then lane [1] pairs (1,0) and lane [2] pairs (3,2). Later groups repeat this shape
+ldr q7, [x0, #48]
+ldr q8, [x0, #0]
+ldr q9, [x0, #16]
+sqrdmulh v3.4S, v6.4S, v5.s[0]
+mul v6.4S, v6.4S,v4.s[0]
+mla v6.4S, v3.4S, v31.s[0]
+sub v3.4s, v8.4s, v6.4s
+add v8.4s, v8.4s, v6.4s
+sqrdmulh v6.4S, v7.4S, v5.s[0]
+mul v7.4S, v7.4S,v4.s[0]
+mla v7.4S, v6.4S, v31.s[0]
+sub v6.4s, v9.4s, v7.4s
+add v9.4s, v9.4s, v7.4s
+ldr q7, [x17, #+160] // The next group's twiddles are fetched before the current group has finished
+ldr q10, [x17, #+176]
+sqrdmulh v21.4S, v9.4S, v5.s[1]
+mul v9.4S, v9.4S,v4.s[1]
+mla v9.4S, v21.4S, v31.s[0]
+sub v21.4s, v8.4s, v9.4s
+add v8.4s, v8.4s, v9.4s
+sqrdmulh v9.4S, v6.4S, v5.s[2]
+mul v6.4S, v6.4S,v4.s[2]
+mla v6.4S, v9.4S, v31.s[0]
+sub v9.4s, v3.4s, v6.4s
+add v3.4s, v3.4s, v6.4s
+str q8, [x0, #0]
+str q21, [x0, #16]
+str q3, [x0, #32]
+str q9, [x0, #48]
+ldr q5, [x0, #96]
+ldr q4, [x0, #112]
+ldr q9, [x0, #64]
+ldr q3, [x0, #80]
+sqrdmulh v21.4S, v5.4S, v10.s[0]
+mul v5.4S, v5.4S,v7.s[0]
+mla v5.4S, v21.4S, v31.s[0]
+sub v21.4s, v9.4s, v5.4s
+add v9.4s, v9.4s, v5.4s
+sqrdmulh v5.4S, v4.4S, v10.s[0]
+mul v4.4S, v4.4S,v7.s[0]
+mla v4.4S, v5.4S, v31.s[0]
+sub v5.4s, v3.4s, v4.4s
+add v3.4s, v3.4s, v4.4s
+ldr q4, [x17, #+192]
+ldr q8, [x17, #+208]
+sqrdmulh v6.4S, v3.4S, v10.s[1]
+mul v3.4S, v3.4S,v7.s[1]
+mla v3.4S, v6.4S, v31.s[0]
+sub v6.4s, v9.4s, v3.4s
+add v9.4s, v9.4s, v3.4s
+sqrdmulh v3.4S, v5.4S, v10.s[2]
+mul v5.4S, v5.4S,v7.s[2]
+mla v5.4S, v3.4S, v31.s[0]
+sub v3.4s, v21.4s, v5.4s
+add v21.4s, v21.4s, v5.4s
+str q9, [x0, #64]
+str q6, [x0, #80]
+str q21, [x0, #96]
+str q3, [x0, #112]
+ldr q10, [x0, #160]
+ldr q7, [x0, #176]
+ldr q3, [x0, #128]
+ldr q21, [x0, #144]
+sqrdmulh v6.4S, v10.4S, v8.s[0]
+mul v10.4S, v10.4S,v4.s[0]
+mla v10.4S, v6.4S, v31.s[0]
+sub v6.4s, v3.4s, v10.4s
+add v3.4s, v3.4s, v10.4s
+sqrdmulh v10.4S, v7.4S, v8.s[0]
+mul v7.4S, v7.4S,v4.s[0]
+mla v7.4S, v10.4S, v31.s[0]
+sub v10.4s, v21.4s, v7.4s
+add v21.4s, v21.4s, v7.4s
+ldr q7, [x17, #+224]
+ldr q9, [x17, #+240]
+sqrdmulh v5.4S, v21.4S, v8.s[1]
+mul v21.4S, v21.4S,v4.s[1]
+mla v21.4S, v5.4S, v31.s[0]
+sub v5.4s, v3.4s, v21.4s
+add v3.4s, v3.4s, v21.4s
+sqrdmulh v21.4S, v10.4S, v8.s[2]
+mul v10.4S, v10.4S,v4.s[2]
+mla v10.4S, v21.4S, v31.s[0]
+sub v21.4s, v6.4s, v10.4s
+add v6.4s, v6.4s, v10.4s
+str q3, [x0, #128]
+str q5, [x0, #144]
+str q6, [x0, #160]
+str q21, [x0, #176]
+ldr q8, [x0, #224]
+ldr q4, [x0, #240]
+ldr q21, [x0, #192]
+ldr q6, [x0, #208]
+sqrdmulh v5.4S, v8.4S, v9.s[0]
+mul v8.4S, v8.4S,v7.s[0]
+mla v8.4S, v5.4S, v31.s[0]
+sub v5.4s, v21.4s, v8.4s
+add v21.4s, v21.4s, v8.4s
+sqrdmulh v8.4S, v4.4S, v9.s[0]
+mul v4.4S, v4.4S,v7.s[0]
+mla v4.4S, v8.4S, v31.s[0]
+sub v8.4s, v6.4s, v4.4s
+add v6.4s, v6.4s, v4.4s
+ldr q4, [x17, #+256]
+ldr q3, [x17, #+272]
+sqrdmulh v10.4S, v6.4S, v9.s[1]
+mul v6.4S, v6.4S,v7.s[1]
+mla v6.4S, v10.4S, v31.s[0]
+sub v10.4s, v21.4s, v6.4s
+add v21.4s, v21.4s, v6.4s
+sqrdmulh v6.4S, v8.4S, v9.s[2]
+mul v8.4S, v8.4S,v7.s[2]
+mla v8.4S, v6.4S, v31.s[0]
+sub v6.4s, v5.4s, v8.4s
+add v5.4s, v5.4s, v8.4s
+str q21, [x0, #192]
+str q10, [x0, #208]
+str q5, [x0, #224]
+str q6, [x0, #240]
+ldr q9, [x0, #288]
+ldr q7, [x0, #304]
+ldr q6, [x0, #256]
+ldr q5, [x0, #272]
+sqrdmulh v10.4S, v9.4S, v3.s[0]
+mul v9.4S, v9.4S,v4.s[0]
+mla v9.4S, v10.4S, v31.s[0]
+sub v10.4s, v6.4s, v9.4s
+add v6.4s, v6.4s, v9.4s
+sqrdmulh v9.4S, v7.4S, v3.s[0]
+mul v7.4S, v7.4S,v4.s[0]
+mla v7.4S, v9.4S, v31.s[0]
+sub v9.4s, v5.4s, v7.4s
+add v5.4s, v5.4s, v7.4s
+ldr q7, [x17, #+288]
+ldr q21, [x17, #+304]
+sqrdmulh v8.4S, v5.4S, v3.s[1]
+mul v5.4S, v5.4S,v4.s[1]
+mla v5.4S, v8.4S, v31.s[0]
+sub v8.4s, v6.4s, v5.4s
+add v6.4s, v6.4s, v5.4s
+sqrdmulh v5.4S, v9.4S, v3.s[2]
+mul v9.4S, v9.4S,v4.s[2]
+mla v9.4S, v5.4S, v31.s[0]
+sub v5.4s, v10.4s, v9.4s
+add v10.4s, v10.4s, v9.4s
+str q6, [x0, #256]
+str q8, [x0, #272]
+str q10, [x0, #288]
+str q5, [x0, #304]
+ldr q3, [x0, #352]
+ldr q4, [x0, #368]
+ldr q5, [x0, #320]
+ldr q10, [x0, #336]
+sqrdmulh v8.4S, v3.4S, v21.s[0]
+mul v3.4S, v3.4S,v7.s[0]
+mla v3.4S, v8.4S, v31.s[0]
+sub v8.4s, v5.4s, v3.4s
+add v5.4s, v5.4s, v3.4s
+sqrdmulh v3.4S, v4.4S, v21.s[0]
+mul v4.4S, v4.4S,v7.s[0]
+mla v4.4S, v3.4S, v31.s[0]
+sub v3.4s, v10.4s, v4.4s
+add v10.4s, v10.4s, v4.4s
+ldr q4, [x17, #+320]
+ldr q6, [x17, #+336]
+sqrdmulh v9.4S, v10.4S, v21.s[1]
+mul v10.4S, v10.4S,v7.s[1]
+mla v10.4S, v9.4S, v31.s[0]
+sub v9.4s, v5.4s, v10.4s
+add v5.4s, v5.4s, v10.4s
+sqrdmulh v10.4S, v3.4S, v21.s[2]
+mul v3.4S, v3.4S,v7.s[2]
+mla v3.4S, v10.4S, v31.s[0]
+sub v10.4s, v8.4s, v3.4s
+add v8.4s, v8.4s, v3.4s
+str q5, [x0, #320]
+str q9, [x0, #336]
+str q8, [x0, #352]
+str q10, [x0, #368]
+ldr q21, [x0, #416]
+ldr q7, [x0, #432]
+ldr q10, [x0, #384]
+ldr q8, [x0, #400]
+sqrdmulh v9.4S, v21.4S, v6.s[0]
+mul v21.4S, v21.4S,v4.s[0]
+mla v21.4S, v9.4S, v31.s[0]
+sub v9.4s, v10.4s, v21.4s
+add v10.4s, v10.4s, v21.4s
+sqrdmulh v21.4S, v7.4S, v6.s[0]
+mul v7.4S, v7.4S,v4.s[0]
+mla v7.4S, v21.4S, v31.s[0]
+sub v21.4s, v8.4s, v7.4s
+add v8.4s, v8.4s, v7.4s
+ldr q7, [x17, #+352]
+ldr q5, [x17, #+368]
+sqrdmulh v3.4S, v8.4S, v6.s[1]
+mul v8.4S, v8.4S,v4.s[1]
+mla v8.4S, v3.4S, v31.s[0]
+sub v3.4s, v10.4s, v8.4s
+add v10.4s, v10.4s, v8.4s
+sqrdmulh v8.4S, v21.4S, v6.s[2]
+mul v21.4S, v21.4S,v4.s[2]
+mla v21.4S, v8.4S, v31.s[0]
+sub v8.4s, v9.4s, v21.4s
+add v9.4s, v9.4s, v21.4s
+str q10, [x0, #384]
+str q3, [x0, #400]
+str q9, [x0, #416]
+str q8, [x0, #432]
+ldr q6, [x0, #480]
+ldr q4, [x0, #496]
+ldr q8, [x0, #448]
+ldr q9, [x0, #464]
+sqrdmulh v3.4S, v6.4S, v5.s[0]
+mul v6.4S, v6.4S,v7.s[0]
+mla v6.4S, v3.4S, v31.s[0]
+sub v3.4s, v8.4s, v6.4s
+add v8.4s, v8.4s, v6.4s
+sqrdmulh v6.4S, v4.4S, v5.s[0]
+mul v4.4S, v4.4S,v7.s[0]
+mla v4.4S, v6.4S, v31.s[0]
+sub v6.4s, v9.4s, v4.4s
+add v9.4s, v9.4s, v4.4s
+ldr q4, [x17, #+384]
+ldr q10, [x17, #+400]
+sqrdmulh v21.4S, v9.4S, v5.s[1]
+mul v9.4S, v9.4S,v7.s[1]
+mla v9.4S, v21.4S, v31.s[0]
+sub v21.4s, v8.4s, v9.4s
+add v8.4s, v8.4s, v9.4s
+sqrdmulh v9.4S, v6.4S, v5.s[2]
+mul v6.4S, v6.4S,v7.s[2]
+mla v6.4S, v9.4S, v31.s[0]
+sub v9.4s, v3.4s, v6.4s
+add v3.4s, v3.4s, v6.4s
+str q8, [x0, #448]
+str q21, [x0, #464]
+str q3, [x0, #480]
+str q9, [x0, #496]
+ldr q5, [x0, #544]
+ldr q7, [x0, #560]
+ldr q9, [x0, #512]
+ldr q3, [x0, #528]
+sqrdmulh v21.4S, v5.4S, v10.s[0]
+mul v5.4S, v5.4S,v4.s[0]
+mla v5.4S, v21.4S, v31.s[0]
+sub v21.4s, v9.4s, v5.4s
+add v9.4s, v9.4s, v5.4s
+sqrdmulh v5.4S, v7.4S, v10.s[0]
+mul v7.4S, v7.4S,v4.s[0]
+mla v7.4S, v5.4S, v31.s[0]
+sub v5.4s, v3.4s, v7.4s
+add v3.4s, v3.4s, v7.4s
+ldr q7, [x17, #+416]
+ldr q8, [x17, #+432]
+sqrdmulh v6.4S, v3.4S, v10.s[1]
+mul v3.4S, v3.4S,v4.s[1]
+mla v3.4S, v6.4S, v31.s[0]
+sub v6.4s, v9.4s, v3.4s
+add v9.4s, v9.4s, v3.4s
+sqrdmulh v3.4S, v5.4S, v10.s[2]
+mul v5.4S, v5.4S,v4.s[2]
+mla v5.4S, v3.4S, v31.s[0]
+sub v3.4s, v21.4s, v5.4s
+add v21.4s, v21.4s, v5.4s
+str q9, [x0, #512]
+str q6, [x0, #528]
+str q21, [x0, #544]
+str q3, [x0, #560]
+ldr q10, [x0, #608]
+ldr q4, [x0, #624]
+ldr q3, [x0, #576]
+ldr q21, [x0, #592]
+sqrdmulh v6.4S, v10.4S, v8.s[0]
+mul v10.4S, v10.4S,v7.s[0]
+mla v10.4S, v6.4S, v31.s[0]
+sub v6.4s, v3.4s, v10.4s
+add v3.4s, v3.4s, v10.4s
+sqrdmulh v10.4S, v4.4S, v8.s[0]
+mul v4.4S, v4.4S,v7.s[0]
+mla v4.4S, v10.4S, v31.s[0]
+sub v10.4s, v21.4s, v4.4s
+add v21.4s, v21.4s, v4.4s
+ldr q4, [x17, #+448]
+ldr q9, [x17, #+464]
+sqrdmulh v5.4S, v21.4S, v8.s[1]
+mul v21.4S, v21.4S,v7.s[1]
+mla v21.4S, v5.4S, v31.s[0]
+sub v5.4s, v3.4s, v21.4s
+add v3.4s, v3.4s, v21.4s
+sqrdmulh v21.4S, v10.4S, v8.s[2]
+mul v10.4S, v10.4S,v7.s[2]
+mla v10.4S, v21.4S, v31.s[0]
+sub v21.4s, v6.4s, v10.4s
+add v6.4s, v6.4s, v10.4s
+str q3, [x0, #576]
+str q5, [x0, #592]
+str q6, [x0, #608]
+str q21, [x0, #624]
+ldr q8, [x0, #672]
+ldr q7, [x0, #688]
+ldr q21, [x0, #640]
+ldr q6, [x0, #656]
+sqrdmulh v5.4S, v8.4S, v9.s[0]
+mul v8.4S, v8.4S,v4.s[0]
+mla v8.4S, v5.4S, v31.s[0]
+sub v5.4s, v21.4s, v8.4s
+add v21.4s, v21.4s, v8.4s
+sqrdmulh v8.4S, v7.4S, v9.s[0]
+mul v7.4S, v7.4S,v4.s[0]
+mla v7.4S, v8.4S, v31.s[0]
+sub v8.4s, v6.4s, v7.4s
+add v6.4s, v6.4s, v7.4s
+ldr q7, [x17, #+480]
+ldr q3, [x17, #+496]
+sqrdmulh v10.4S, v6.4S, v9.s[1]
+mul v6.4S, v6.4S,v4.s[1]
+mla v6.4S, v10.4S, v31.s[0]
+sub v10.4s, v21.4s, v6.4s
+add v21.4s, v21.4s, v6.4s
+sqrdmulh v6.4S, v8.4S, v9.s[2]
+mul v8.4S, v8.4S,v4.s[2]
+mla v8.4S, v6.4S, v31.s[0]
+sub v6.4s, v5.4s, v8.4s
+add v5.4s, v5.4s, v8.4s
+str q21, [x0, #640]
+str q10, [x0, #656]
+str q5, [x0, #672]
+str q6, [x0, #688]
+ldr q9, [x0, #736]
+ldr q4, [x0, #752]
+ldr q6, [x0, #704]
+ldr q5, [x0, #720]
+sqrdmulh v10.4S, v9.4S, v3.s[0]
+mul v9.4S, v9.4S,v7.s[0]
+mla v9.4S, v10.4S, v31.s[0]
+sub v10.4s, v6.4s, v9.4s
+add v6.4s, v6.4s, v9.4s
+sqrdmulh v9.4S, v4.4S, v3.s[0]
+mul v4.4S, v4.4S,v7.s[0]
+mla v4.4S, v9.4S, v31.s[0]
+sub v9.4s, v5.4s, v4.4s
+add v5.4s, v5.4s, v4.4s
+ldr q4, [x17, #+512]
+ldr q21, [x17, #+528]
+sqrdmulh v8.4S, v5.4S, v3.s[1]
+mul v5.4S, v5.4S,v7.s[1]
+mla v5.4S, v8.4S, v31.s[0]
+sub v8.4s, v6.4s, v5.4s
+add v6.4s, v6.4s, v5.4s
+sqrdmulh v5.4S, v9.4S, v3.s[2]
+mul v9.4S, v9.4S,v7.s[2]
+mla v9.4S, v5.4S, v31.s[0]
+sub v5.4s, v10.4s, v9.4s
+add v10.4s, v10.4s, v9.4s
+str q6, [x0, #704]
+str q8, [x0, #720]
+str q10, [x0, #736]
+str q5, [x0, #752]
+ldr q3, [x0, #800]
+ldr q7, [x0, #816]
+ldr q5, [x0, #768]
+ldr q10, [x0, #784]
+sqrdmulh v8.4S, v3.4S, v21.s[0]
+mul v3.4S, v3.4S,v4.s[0]
+mla v3.4S, v8.4S, v31.s[0]
+sub v8.4s, v5.4s, v3.4s
+add v5.4s, v5.4s, v3.4s
+sqrdmulh v3.4S, v7.4S, v21.s[0]
+mul v7.4S, v7.4S,v4.s[0]
+mla v7.4S, v3.4S, v31.s[0]
+sub v3.4s, v10.4s, v7.4s
+add v10.4s, v10.4s, v7.4s
+ldr q7, [x17, #+544]
+ldr q6, [x17, #+560]
+sqrdmulh v9.4S, v10.4S, v21.s[1]
+mul v10.4S, v10.4S,v4.s[1]
+mla v10.4S, v9.4S, v31.s[0]
+sub v9.4s, v5.4s, v10.4s
+add v5.4s, v5.4s, v10.4s
+sqrdmulh v10.4S, v3.4S, v21.s[2]
+mul v3.4S, v3.4S,v4.s[2]
+mla v3.4S, v10.4S, v31.s[0]
+sub v10.4s, v8.4s, v3.4s
+add v8.4s, v8.4s, v3.4s
+str q5, [x0, #768]
+str q9, [x0, #784]
+str q8, [x0, #800]
+str q10, [x0, #816]
+ldr q21, [x0, #864]
+ldr q4, [x0, #880]
+ldr q10, [x0, #832]
+ldr q8, [x0, #848]
+sqrdmulh v9.4S, v21.4S, v6.s[0]
+mul v21.4S, v21.4S,v7.s[0]
+mla v21.4S, v9.4S, v31.s[0]
+sub v9.4s, v10.4s, v21.4s
+add v10.4s, v10.4s, v21.4s
+sqrdmulh v21.4S, v4.4S, v6.s[0]
+mul v4.4S, v4.4S,v7.s[0]
+mla v4.4S, v21.4S, v31.s[0]
+sub v21.4s, v8.4s, v4.4s
+add v8.4s, v8.4s, v4.4s
+ldr q4, [x17, #+576]
+ldr q5, [x17, #+592]
+sqrdmulh v3.4S, v8.4S, v6.s[1]
+mul v8.4S, v8.4S,v7.s[1]
+mla v8.4S, v3.4S, v31.s[0]
+sub v3.4s, v10.4s, v8.4s
+add v10.4s, v10.4s, v8.4s
+sqrdmulh v8.4S, v21.4S, v6.s[2]
+mul v21.4S, v21.4S,v7.s[2]
+mla v21.4S, v8.4S, v31.s[0]
+sub v8.4s, v9.4s, v21.4s
+add v9.4s, v9.4s, v21.4s
+str q10, [x0, #832]
+str q3, [x0, #848]
+str q9, [x0, #864]
+str q8, [x0, #880]
+ldr q6, [x0, #928]
+ldr q7, [x0, #944]
+ldr q8, [x0, #896]
+ldr q9, [x0, #912]
+sqrdmulh v3.4S, v6.4S, v5.s[0]
+mul v6.4S, v6.4S,v4.s[0]
+mla v6.4S, v3.4S, v31.s[0]
+sub v3.4s, v8.4s, v6.4s
+add v8.4s, v8.4s, v6.4s
+sqrdmulh v6.4S, v7.4S, v5.s[0]
+mul v7.4S, v7.4S,v4.s[0]
+mla v7.4S, v6.4S, v31.s[0]
+sub v6.4s, v9.4s, v7.4s
+add v9.4s, v9.4s, v7.4s
+ldr q7, [x17, #+608]
+ldr q10, [x17, #+624]
+sqrdmulh v21.4S, v9.4S, v5.s[1]
+mul v9.4S, v9.4S,v4.s[1]
+mla v9.4S, v21.4S, v31.s[0]
+sub v21.4s, v8.4s, v9.4s
+add v8.4s, v8.4s, v9.4s
+sqrdmulh v9.4S, v6.4S, v5.s[2]
+mul v6.4S, v6.4S,v4.s[2]
+mla v6.4S, v9.4S, v31.s[0]
+sub v9.4s, v3.4s, v6.4s
+add v3.4s, v3.4s, v6.4s
+str q8, [x0, #896]
+str q21, [x0, #912]
+str q3, [x0, #928]
+str q9, [x0, #944]
+ldr q5, [x0, #992]
+ldr q4, [x0, #1008]
+ldr q9, [x0, #960]
+ldr q3, [x0, #976]
+sqrdmulh v21.4S, v5.4S, v10.s[0]
+mul v5.4S, v5.4S,v7.s[0]
+mla v5.4S, v21.4S, v31.s[0]
+sub v21.4s, v9.4s, v5.4s
+add v9.4s, v9.4s, v5.4s
+sqrdmulh v5.4S, v4.4S, v10.s[0]
+mul v4.4S, v4.4S,v7.s[0]
+mla v4.4S, v5.4S, v31.s[0]
+sub v5.4s, v3.4s, v4.4s
+add v3.4s, v3.4s, v4.4s
+sqrdmulh v4.4S, v3.4S, v10.s[1]
+mul v3.4S, v3.4S,v7.s[1]
+mla v3.4S, v4.4S, v31.s[0]
+sub v4.4s, v9.4s, v3.4s
+add v9.4s, v9.4s, v3.4s
+sqrdmulh v3.4S, v5.4S, v10.s[2]
+mul v5.4S, v5.4S,v7.s[2]
+mla v5.4S, v3.4S, v31.s[0]
+sub v3.4s, v21.4s, v5.4s
+add v21.4s, v21.4s, v5.4s
+str q9, [x0, #960]
+str q4, [x0, #976]
+str q21, [x0, #992]
+str q3, [x0, #1008]
+// Restore NEON vector registers (d8-d15, mirroring the prologue) and release their 64-byte area
+ldp d8, d9, [sp, #16*0]
+ldp d10, d11, [sp, #16*1]
+ldp d12, d13, [sp, #16*2]
+ldp d14, d15, [sp, #16*3]
+add sp, sp, #(16*4)
+// Restore GPRs (x19-x29) and release the 96-byte area
+ldp x19, x20, [sp, #16*0]
+ldp x21, x22, [sp, #16*1]
+ldp x23, x24, [sp, #16*2]
+ldp x25, x26, [sp, #16*3]
+ldp x27, x28, [sp, #16*4]
+ldr x29, [sp, #16*5]
+add sp, sp, #(16*5+16)
+ret
+
+// Line count: 1464
+// Instruction count: 1460
\ No newline at end of file
diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_0_z4_0.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_0_z4_0.s
new file mode 100644
index 0000000..846ba45
--- /dev/null
+++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_0_z4_0.s
@@ -0,0 +1,1494 @@
+
+///
+/// Copyright (c) 2021 Arm Limited
+/// SPDX-License-Identifier: MIT
+///
+/// Permission is hereby granted, free of charge, to any person
obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: // Negated modulus (-33556993) in word 0, padded with three zero words to 16 bytes; the routine below loads it with ldr q31 and the butterflies read it as v31.s[0] in their mla step. +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 // Aligns roots_merged. NOTE(review): on AArch64 assemblers the operand is a power-of-two exponent (2^6 = 64 bytes) - confirm for the target toolchain. +roots_merged: // Root table, read by the routine below as 16-byte (4-word) vectors. Vectors come in pairs: the first holds the constants used as the by-element mul operand, the second holds the companion constants used as the sqrdmulh operand at the same lane index. Words tagged "Layer None, block None" are zero filler completing a 4-word vector. NOTE(review): the companion values look like constant * 2^31 / 33556993, rounded - confirm against the generator. +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global 
ntt_u32_incomplete_neon_asm_var_4_2_0_z4_0 +.global _ntt_u32_incomplete_neon_asm_var_4_2_0_z4_0 +ntt_u32_incomplete_neon_asm_var_4_2_0_z4_0: +_ntt_u32_incomplete_neon_asm_var_4_2_0_z4_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +ldr q2, [x0, #544] +ldr q1, [x0, #608] +ldr q0, [x0, #672] +ldr q15, [x0, #736] +ldr q14, [x0, #32] +ldr q13, [x0, #96] +ldr q12, [x0, #160] +ldr q11, [x0, #224] +sqrdmulh v10.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +mla v22.4S, v10.4S, v31.s[0] +sub v10.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v20.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +mla v19.4S, v20.4S, v31.s[0] +sub v20.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +mla v2.4S, v19.4S, v31.s[0] +sub v19.4s, v14.4s, v2.4s +add v14.4s, v14.4s, v2.4s +sqrdmulh v2.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +mla v1.4S, v2.4S, v31.s[0] +sub v2.4s, v13.4s, v1.4s 
+add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[0] +mul v0.4S, v0.4S,v30.s[0] +mla v0.4S, v1.4S, v31.s[0] +sub v1.4s, v12.4s, v0.4s +add v12.4s, v12.4s, v0.4s +sqrdmulh v0.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +mla v15.4S, v0.4S, v31.s[0] +sub v0.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v12.4s, v16.4s +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v3.4S, v16.4S, v31.s[0] +sub v16.4s, v11.4s, v3.4s +add v11.4s, v11.4s, v3.4s +sqrdmulh v3.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +mla v18.4S, v3.4S, v31.s[0] +sub v3.4s, v14.4s, v18.4s +add v14.4s, v14.4s, v18.4s +sqrdmulh v18.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +mla v17.4S, v18.4S, v31.s[0] +sub v18.4s, v13.4s, v17.4s +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v1.4s, v21.4s +add v1.4s, v1.4s, v21.4s +sqrdmulh v21.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +mla v20.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v20.4s +add v0.4s, v0.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +mla v10.4S, v20.4S, v31.s[0] +sub v20.4s, v19.4s, v10.4s +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +mla v22.4S, v10.4S, v31.s[0] +sub v10.4s, v2.4s, v22.4s +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v11.4S, v27.s[0] +mul v11.4S, v11.4S,v28.s[0] +mla v11.4S, v12.4S, v31.s[0] +sub v12.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +mla v15.4S, v11.4S, v31.s[0] +sub v11.4s, v3.4s, v15.4s +add v3.4s, v3.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v27.s[1] +mul v16.4S, 
v16.4S,v28.s[1] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v16.4s +add v18.4s, v18.4s, v16.4s +sqrdmulh v16.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +mla v1.4S, v16.4S, v31.s[0] +sub v16.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v27.s[2] +mul v0.4S, v0.4S,v28.s[2] +mla v0.4S, v1.4S, v31.s[0] +sub v1.4s, v2.4s, v0.4s +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +mla v17.4S, v0.4S, v31.s[0] +sub v0.4s, v20.4s, v17.4s +add v20.4s, v20.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v25.s[0] +mul v13.4S, v13.4S,v26.s[0] +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +sqrdmulh v13.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v18.4S, v25.s[2] +mul v18.4S, v18.4S,v26.s[2] +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v3.4s, v18.4s +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v15.4S, v25.s[3] +mul v15.4S, v15.4S,v26.s[3] +mla v15.4S, v18.4S, v31.s[0] +sub v18.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v23.s[0] +mul v2.4S, v2.4S,v24.s[0] +mla v2.4S, v15.4S, v31.s[0] +sub v15.4s, v19.4s, v2.4s +add v19.4s, v19.4s, v2.4s +sqrdmulh v2.4S, v1.4S, v23.s[1] +mul v1.4S, v1.4S,v24.s[1] +mla v1.4S, v2.4S, v31.s[0] +sub v2.4s, v16.4s, v1.4s +add v16.4s, v16.4s, v1.4s +sqrdmulh v1.4S, v10.4S, v23.s[2] +mul v10.4S, v10.4S,v24.s[2] +mla v10.4S, v1.4S, v31.s[0] +sub v1.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +mla v17.4S, v10.4S, v31.s[0] +sub v10.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +str q14, [x0, #32] +str q21, [x0, #96] +str q22, [x0, #160] +str q13, [x0, #224] +str q3, [x0, #288] +str q12, [x0, #352] +str q11, [x0, 
#416] +str q18, [x0, #480] +str q19, [x0, #544] +str q15, [x0, #608] +str q16, [x0, #672] +str q2, [x0, #736] +str q20, [x0, #800] +str q1, [x0, #864] +str q0, [x0, #928] +str q10, [x0, #992] +ldr q10, [x0, #816] +ldr q0, [x0, #880] +ldr q1, [x0, #944] +ldr q20, [x0, #1008] +ldr q2, [x0, #304] +ldr q16, [x0, #368] +ldr q15, [x0, #432] +ldr q19, [x0, #496] +ldr q18, [x0, #560] +ldr q11, [x0, #624] +ldr q12, [x0, #688] +ldr q3, [x0, #752] +ldr q13, [x0, #48] +ldr q22, [x0, #112] +ldr q21, [x0, #176] +ldr q14, [x0, #240] +sqrdmulh v17.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +mla v10.4S, v17.4S, v31.s[0] +sub v17.4s, v2.4s, v10.4s +add v2.4s, v2.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v29.s[0] +mul v0.4S, v0.4S,v30.s[0] +mla v0.4S, v10.4S, v31.s[0] +sub v10.4s, v16.4s, v0.4s +add v16.4s, v16.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +mla v1.4S, v0.4S, v31.s[0] +sub v0.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s +sqrdmulh v1.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v20.4S, v1.4S, v31.s[0] +sub v1.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +sqrdmulh v20.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +mla v18.4S, v20.4S, v31.s[0] +sub v20.4s, v13.4s, v18.4s +add v13.4s, v13.4s, v18.4s +sqrdmulh v18.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +mla v12.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +mla v3.4S, v12.4S, v31.s[0] +sub v12.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sqrdmulh v3.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +mla v15.4S, v3.4S, v31.s[0] +sub v3.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +sqrdmulh v15.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +mla v19.4S, v15.4S, v31.s[0] +sub v15.4s, v14.4s, v19.4s +add v14.4s, v14.4s, v19.4s 
+sqrdmulh v19.4S, v2.4S, v29.s[1] +mul v2.4S, v2.4S,v30.s[1] +mla v2.4S, v19.4S, v31.s[0] +sub v19.4s, v13.4s, v2.4s +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +mla v16.4S, v2.4S, v31.s[0] +sub v2.4s, v22.4s, v16.4s +add v22.4s, v22.4s, v16.4s +sqrdmulh v16.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +mla v0.4S, v16.4S, v31.s[0] +sub v16.4s, v11.4s, v0.4s +add v11.4s, v11.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +mla v1.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v1.4s +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +mla v17.4S, v1.4S, v31.s[0] +sub v1.4s, v20.4s, v17.4s +add v20.4s, v20.4s, v17.4s +sqrdmulh v17.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +mla v10.4S, v17.4S, v31.s[0] +sub v17.4s, v18.4s, v10.4s +add v18.4s, v18.4s, v10.4s +sqrdmulh v10.4S, v21.4S, v27.s[0] +mul v21.4S, v21.4S,v28.s[0] +mla v21.4S, v10.4S, v31.s[0] +sub v10.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +mla v14.4S, v21.4S, v31.s[0] +sub v21.4s, v22.4s, v14.4s +add v22.4s, v22.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +mla v3.4S, v14.4S, v31.s[0] +sub v14.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sqrdmulh v3.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +mla v15.4S, v3.4S, v31.s[0] +sub v3.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +mla v11.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +sqrdmulh v11.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +mla v12.4S, v11.4S, v31.s[0] +sub v11.4s, v18.4s, v12.4s +add v18.4s, v18.4s, v12.4s +sqrdmulh v12.4S, v16.4S, v27.s[3] +mul v16.4S, v16.4S,v28.s[3] +mla v16.4S, v12.4S, v31.s[0] +sub v12.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +sqrdmulh v16.4S, v0.4S, v27.s[3] +mul v0.4S, v0.4S,v28.s[3] +mla v0.4S, v16.4S, v31.s[0] 
+sub v16.4s, v17.4s, v0.4s +add v17.4s, v17.4s, v0.4s +sqrdmulh v0.4S, v22.4S, v25.s[0] +mul v22.4S, v22.4S,v26.s[0] +mla v22.4S, v0.4S, v31.s[0] +sub v0.4s, v13.4s, v22.4s +add v13.4s, v13.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v25.s[1] +mul v21.4S, v21.4S,v26.s[1] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v2.4S, v25.s[2] +mul v2.4S, v2.4S,v26.s[2] +mla v2.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v2.4s +add v19.4s, v19.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v25.s[3] +mul v3.4S, v3.4S,v26.s[3] +mla v3.4S, v2.4S, v31.s[0] +sub v2.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sqrdmulh v3.4S, v18.4S, v23.s[0] +mul v18.4S, v18.4S,v24.s[0] +mla v18.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v18.4s +add v20.4s, v20.4s, v18.4s +sqrdmulh v18.4S, v11.4S, v23.s[1] +mul v11.4S, v11.4S,v24.s[1] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v15.4s, v11.4s +add v15.4s, v15.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v23.s[2] +mul v17.4S, v17.4S,v24.s[2] +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v1.4s, v17.4s +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v16.4S, v23.s[3] +mul v16.4S, v16.4S,v24.s[3] +mla v16.4S, v17.4S, v31.s[0] +sub v17.4s, v12.4s, v16.4s +add v12.4s, v12.4s, v16.4s +str q13, [x0, #48] +str q0, [x0, #112] +str q10, [x0, #176] +str q22, [x0, #240] +str q19, [x0, #304] +str q21, [x0, #368] +str q14, [x0, #432] +str q2, [x0, #496] +str q20, [x0, #560] +str q3, [x0, #624] +str q15, [x0, #688] +str q18, [x0, #752] +str q1, [x0, #816] +str q11, [x0, #880] +str q12, [x0, #944] +str q17, [x0, #1008] +ldr q17, [x0, #768] +ldr q12, [x0, #832] +ldr q11, [x0, #896] +ldr q1, [x0, #960] +ldr q18, [x0, #256] +ldr q15, [x0, #320] +ldr q3, [x0, #384] +ldr q20, [x0, #448] +ldr q2, [x0, #512] +ldr q14, [x0, #576] +ldr q21, [x0, #640] +ldr q19, [x0, #704] +ldr q22, [x0, #0] +ldr q10, [x0, #64] +ldr q0, [x0, #128] +ldr q13, [x0, #192] +sqrdmulh v16.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, 
v18.4s, v17.4s +add v18.4s, v18.4s, v17.4s +sqrdmulh v17.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v15.4s, v12.4s +add v15.4s, v15.4s, v12.4s +sqrdmulh v12.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +mla v11.4S, v12.4S, v31.s[0] +sub v12.4s, v3.4s, v11.4s +add v3.4s, v3.4s, v11.4s +sqrdmulh v11.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +mla v1.4S, v11.4S, v31.s[0] +sub v11.4s, v20.4s, v1.4s +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +mla v2.4S, v1.4S, v31.s[0] +sub v1.4s, v22.4s, v2.4s +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +mla v14.4S, v2.4S, v31.s[0] +sub v2.4s, v10.4s, v14.4s +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v13.4s, v19.4s +add v13.4s, v13.4s, v19.4s +sqrdmulh v19.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v3.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v3.4s +add v0.4s, v0.4s, v3.4s +sqrdmulh v3.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +mla v20.4S, v3.4S, v31.s[0] +sub v3.4s, v13.4s, v20.4s +add v13.4s, v13.4s, v20.4s +sqrdmulh v20.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +mla v18.4S, v20.4S, v31.s[0] +sub v20.4s, v22.4s, v18.4s +add v22.4s, v22.4s, v18.4s +sqrdmulh v18.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +mla v15.4S, v18.4S, v31.s[0] +sub v18.4s, v10.4s, v15.4s +add v10.4s, v10.4s, v15.4s +sqrdmulh v15.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +mla v11.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v29.s[2] +mul 
v16.4S, v16.4S,v30.s[2] +mla v16.4S, v11.4S, v31.s[0] +sub v11.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v2.4s, v17.4s +add v2.4s, v2.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[0] +mul v0.4S, v0.4S,v28.s[0] +mla v0.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v0.4s +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +mla v13.4S, v0.4S, v31.s[0] +sub v0.4s, v10.4s, v13.4s +add v10.4s, v10.4s, v13.4s +sqrdmulh v13.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +mla v19.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +sqrdmulh v19.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +mla v3.4S, v19.4S, v31.s[0] +sub v19.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v27.s[2] +mul v14.4S, v14.4S,v28.s[2] +mla v14.4S, v3.4S, v31.s[0] +sub v3.4s, v1.4s, v14.4s +add v1.4s, v1.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v2.4s, v21.4s +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +mla v15.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v12.4S, v27.s[3] +mul v12.4S, v12.4S,v28.s[3] +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v16.4s, v12.4s +add v16.4s, v16.4s, v12.4s +sqrdmulh v12.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +mla v10.4S, v12.4S, v31.s[0] +sub v12.4s, v22.4s, v10.4s +add v22.4s, v22.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v25.s[1] +mul v0.4S, v0.4S,v26.s[1] +mla v0.4S, v10.4S, v31.s[0] +sub v10.4s, v17.4s, v0.4s +add v17.4s, v17.4s, v0.4s +sqrdmulh v0.4S, v18.4S, v25.s[2] +mul v18.4S, v18.4S,v26.s[2] +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v20.4s, v18.4s +add v20.4s, v20.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v25.s[3] +mul v19.4S, v19.4S,v26.s[3] +mla v19.4S, v18.4S, v31.s[0] +sub v18.4s, v13.4s, v19.4s +add 
v13.4s, v13.4s, v19.4s +sqrdmulh v19.4S, v2.4S, v23.s[0] +mul v2.4S, v2.4S,v24.s[0] +mla v2.4S, v19.4S, v31.s[0] +sub v19.4s, v1.4s, v2.4s +add v1.4s, v1.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v23.s[1] +mul v14.4S, v14.4S,v24.s[1] +mla v14.4S, v2.4S, v31.s[0] +sub v2.4s, v3.4s, v14.4s +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +mla v16.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v16.4s +add v11.4s, v11.4s, v16.4s +sqrdmulh v16.4S, v15.4S, v23.s[3] +mul v15.4S, v15.4S,v24.s[3] +mla v15.4S, v16.4S, v31.s[0] +sub v16.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +str q22, [x0, #0] +str q12, [x0, #64] +str q17, [x0, #128] +str q10, [x0, #192] +str q20, [x0, #256] +str q0, [x0, #320] +str q13, [x0, #384] +str q18, [x0, #448] +str q1, [x0, #512] +str q19, [x0, #576] +str q3, [x0, #640] +str q2, [x0, #704] +str q11, [x0, #768] +str q14, [x0, #832] +str q21, [x0, #896] +str q16, [x0, #960] +ldr q16, [x0, #784] +ldr q21, [x0, #848] +ldr q14, [x0, #912] +ldr q11, [x0, #976] +ldr q2, [x0, #272] +ldr q3, [x0, #336] +ldr q19, [x0, #400] +ldr q1, [x0, #464] +ldr q18, [x0, #528] +ldr q13, [x0, #592] +ldr q0, [x0, #656] +ldr q20, [x0, #720] +ldr q10, [x0, #16] +ldr q17, [x0, #80] +ldr q12, [x0, #144] +ldr q22, [x0, #208] +sqrdmulh v15.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +mla v21.4S, v16.4S, v31.s[0] +sub v16.4s, v3.4s, v21.4s +add v3.4s, v3.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +mla v14.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v14.4s +add v19.4s, v19.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v1.4s, v11.4s +add v1.4s, v1.4s, v11.4s +sqrdmulh v11.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +mla v18.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v18.4s +add v10.4s, 
v10.4s, v18.4s +sqrdmulh v18.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +mla v13.4S, v18.4S, v31.s[0] +sub v18.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +sqrdmulh v13.4S, v0.4S, v29.s[0] +mul v0.4S, v0.4S,v30.s[0] +mla v0.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v0.4s +add v12.4s, v12.4s, v0.4s +sqrdmulh v0.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v20.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v20.4s +add v22.4s, v22.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +mla v19.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v19.4s +add v12.4s, v12.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +mla v1.4S, v19.4S, v31.s[0] +sub v19.4s, v22.4s, v1.4s +add v22.4s, v22.4s, v1.4s +sqrdmulh v1.4S, v2.4S, v29.s[1] +mul v2.4S, v2.4S,v30.s[1] +mla v2.4S, v1.4S, v31.s[0] +sub v1.4s, v10.4s, v2.4s +add v10.4s, v10.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v3.4S, v2.4S, v31.s[0] +sub v2.4s, v17.4s, v3.4s +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +mla v21.4S, v3.4S, v31.s[0] +sub v3.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +mla v14.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v14.4s +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +mla v15.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v16.4s +add v18.4s, v18.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +mla v12.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +mla v22.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v22.4s +add v17.4s, v17.4s, v22.4s +sqrdmulh v22.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +mla 
v20.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v20.4s +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +mla v19.4S, v20.4S, v31.s[0] +sub v20.4s, v2.4s, v19.4s +add v2.4s, v2.4s, v19.4s +sqrdmulh v19.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +mla v13.4S, v19.4S, v31.s[0] +sub v19.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v0.4S, v27.s[2] +mul v0.4S, v0.4S,v28.s[2] +mla v0.4S, v13.4S, v31.s[0] +sub v13.4s, v18.4s, v0.4s +add v18.4s, v18.4s, v0.4s +sqrdmulh v0.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +mla v3.4S, v0.4S, v31.s[0] +sub v0.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +mla v21.4S, v3.4S, v31.s[0] +sub v3.4s, v15.4s, v21.4s +add v15.4s, v15.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v25.s[0] +mul v17.4S, v17.4S,v26.s[0] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v16.4s, v12.4s +add v16.4s, v16.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[2] +mul v2.4S, v2.4S,v26.s[2] +mla v2.4S, v12.4S, v31.s[0] +sub v12.4s, v1.4s, v2.4s +add v1.4s, v1.4s, v2.4s +sqrdmulh v2.4S, v20.4S, v25.s[3] +mul v20.4S, v20.4S,v26.s[3] +mla v20.4S, v2.4S, v31.s[0] +sub v2.4s, v22.4s, v20.4s +add v22.4s, v22.4s, v20.4s +sqrdmulh v20.4S, v18.4S, v23.s[0] +mul v18.4S, v18.4S,v24.s[0] +mla v18.4S, v20.4S, v31.s[0] +sub v20.4s, v11.4s, v18.4s +add v11.4s, v11.4s, v18.4s +sqrdmulh v18.4S, v13.4S, v23.s[1] +mul v13.4S, v13.4S,v24.s[1] +mla v13.4S, v18.4S, v31.s[0] +sub v18.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v23.s[2] +mul v15.4S, v15.4S,v24.s[2] +mla v15.4S, v13.4S, v31.s[0] +sub v13.4s, v14.4s, v15.4s +add v14.4s, v14.4s, v15.4s +sqrdmulh v15.4S, v3.4S, v23.s[3] +mul v3.4S, v3.4S,v24.s[3] +mla v3.4S, v15.4S, v31.s[0] +sub v15.4s, v0.4s, v3.4s +add v0.4s, v0.4s, v3.4s +str q10, 
[x0, #16] +str q21, [x0, #80] +str q16, [x0, #144] +str q17, [x0, #208] +str q1, [x0, #272] +str q12, [x0, #336] +str q22, [x0, #400] +str q2, [x0, #464] +str q11, [x0, #528] +str q20, [x0, #592] +str q19, [x0, #656] +str q18, [x0, #720] +str q14, [x0, #784] +str q13, [x0, #848] +str q0, [x0, #912] +str q15, [x0, #976] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q6, [x17, #+160] +ldr q7, [x17, #+176] +ldr q8, [x17, #+192] +ldr q9, [x17, #+208] +ldr q3, [x17, #+224] +ldr q10, [x17, #+240] +ldr q21, [x0, #32] +ldr q16, [x0, #48] +ldr q17, [x0, #0] +ldr q1, [x0, #16] +sqrdmulh v12.4S, v21.4S, v5.s[0] +mul v21.4S, v21.4S,v4.s[0] +mla v21.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v16.4S, v5.s[0] +mul v16.4S, v16.4S,v4.s[0] +mla v16.4S, v21.4S, v31.s[0] +sub v21.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +ldr q16, [x17, #+256] +ldr q22, [x17, #+272] +sqrdmulh v2.4S, v1.4S, v5.s[1] +mul v1.4S, v1.4S,v4.s[1] +mla v1.4S, v2.4S, v31.s[0] +sub v2.4s, v17.4s, v1.4s +add v17.4s, v17.4s, v1.4s +sqrdmulh v1.4S, v21.4S, v5.s[2] +mul v21.4S, v21.4S,v4.s[2] +mla v21.4S, v1.4S, v31.s[0] +sub v1.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +str q17, [x0, #0] +str q2, [x0, #16] +str q12, [x0, #32] +str q1, [x0, #48] +ldr q1, [x0, #96] +ldr q12, [x0, #112] +ldr q2, [x0, #64] +ldr q17, [x0, #80] +sqrdmulh v21.4S, v1.4S, v7.s[0] +mul v1.4S, v1.4S,v6.s[0] +mla v1.4S, v21.4S, v31.s[0] +sub v21.4s, v2.4s, v1.4s +add v2.4s, v2.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v7.s[0] +mul v12.4S, v12.4S,v6.s[0] +mla v12.4S, v1.4S, v31.s[0] +sub v1.4s, v17.4s, v12.4s +add v17.4s, v17.4s, v12.4s +ldr q12, [x17, #+288] +ldr q11, [x17, #+304] +sqrdmulh v20.4S, v17.4S, v7.s[1] +mul v17.4S, v17.4S,v6.s[1] +mla v17.4S, v20.4S, v31.s[0] +sub v20.4s, v2.4s, v17.4s +add v2.4s, v2.4s, v17.4s +sqrdmulh v17.4S, v1.4S, v7.s[2] +mul v1.4S, v1.4S,v6.s[2] +mla v1.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v1.4s +add v21.4s, v21.4s, v1.4s +str q2, [x0, #64] +str 
q20, [x0, #80] +str q21, [x0, #96] +str q17, [x0, #112] +ldr q17, [x0, #160] +ldr q21, [x0, #176] +ldr q20, [x0, #128] +ldr q2, [x0, #144] +sqrdmulh v1.4S, v17.4S, v9.s[0] +mul v17.4S, v17.4S,v8.s[0] +mla v17.4S, v1.4S, v31.s[0] +sub v1.4s, v20.4s, v17.4s +add v20.4s, v20.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v9.s[0] +mul v21.4S, v21.4S,v8.s[0] +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v2.4s, v21.4s +add v2.4s, v2.4s, v21.4s +ldr q21, [x17, #+320] +ldr q19, [x17, #+336] +sqrdmulh v18.4S, v2.4S, v9.s[1] +mul v2.4S, v2.4S,v8.s[1] +mla v2.4S, v18.4S, v31.s[0] +sub v18.4s, v20.4s, v2.4s +add v20.4s, v20.4s, v2.4s +sqrdmulh v2.4S, v17.4S, v9.s[2] +mul v17.4S, v17.4S,v8.s[2] +mla v17.4S, v2.4S, v31.s[0] +sub v2.4s, v1.4s, v17.4s +add v1.4s, v1.4s, v17.4s +str q20, [x0, #128] +str q18, [x0, #144] +str q1, [x0, #160] +str q2, [x0, #176] +ldr q2, [x0, #224] +ldr q1, [x0, #240] +ldr q18, [x0, #192] +ldr q20, [x0, #208] +sqrdmulh v17.4S, v2.4S, v10.s[0] +mul v2.4S, v2.4S,v3.s[0] +mla v2.4S, v17.4S, v31.s[0] +sub v17.4s, v18.4s, v2.4s +add v18.4s, v18.4s, v2.4s +sqrdmulh v2.4S, v1.4S, v10.s[0] +mul v1.4S, v1.4S,v3.s[0] +mla v1.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v1.4s +add v20.4s, v20.4s, v1.4s +ldr q1, [x17, #+352] +ldr q14, [x17, #+368] +sqrdmulh v13.4S, v20.4S, v10.s[1] +mul v20.4S, v20.4S,v3.s[1] +mla v20.4S, v13.4S, v31.s[0] +sub v13.4s, v18.4s, v20.4s +add v18.4s, v18.4s, v20.4s +sqrdmulh v20.4S, v2.4S, v10.s[2] +mul v2.4S, v2.4S,v3.s[2] +mla v2.4S, v20.4S, v31.s[0] +sub v20.4s, v17.4s, v2.4s +add v17.4s, v17.4s, v2.4s +str q18, [x0, #192] +str q13, [x0, #208] +str q17, [x0, #224] +str q20, [x0, #240] +ldr q20, [x0, #288] +ldr q17, [x0, #304] +ldr q13, [x0, #256] +ldr q18, [x0, #272] +sqrdmulh v2.4S, v20.4S, v22.s[0] +mul v20.4S, v20.4S,v16.s[0] +mla v20.4S, v2.4S, v31.s[0] +sub v2.4s, v13.4s, v20.4s +add v13.4s, v13.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v22.s[0] +mul v17.4S, v17.4S,v16.s[0] +mla v17.4S, v20.4S, v31.s[0] +sub v20.4s, v18.4s, v17.4s +add v18.4s, 
v18.4s, v17.4s +ldr q17, [x17, #+384] +ldr q0, [x17, #+400] +sqrdmulh v15.4S, v18.4S, v22.s[1] +mul v18.4S, v18.4S,v16.s[1] +mla v18.4S, v15.4S, v31.s[0] +sub v15.4s, v13.4s, v18.4s +add v13.4s, v13.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v22.s[2] +mul v20.4S, v20.4S,v16.s[2] +mla v20.4S, v18.4S, v31.s[0] +sub v18.4s, v2.4s, v20.4s +add v2.4s, v2.4s, v20.4s +str q13, [x0, #256] +str q15, [x0, #272] +str q2, [x0, #288] +str q18, [x0, #304] +ldr q5, [x0, #352] +ldr q4, [x0, #368] +ldr q18, [x0, #320] +ldr q2, [x0, #336] +sqrdmulh v15.4S, v5.4S, v11.s[0] +mul v5.4S, v5.4S,v12.s[0] +mla v5.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v5.4s +add v18.4s, v18.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v11.s[0] +mul v4.4S, v4.4S,v12.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v2.4s, v4.4s +add v2.4s, v2.4s, v4.4s +ldr q4, [x17, #+416] +ldr q13, [x17, #+432] +sqrdmulh v20.4S, v2.4S, v11.s[1] +mul v2.4S, v2.4S,v12.s[1] +mla v2.4S, v20.4S, v31.s[0] +sub v20.4s, v18.4s, v2.4s +add v18.4s, v18.4s, v2.4s +sqrdmulh v2.4S, v5.4S, v11.s[2] +mul v5.4S, v5.4S,v12.s[2] +mla v5.4S, v2.4S, v31.s[0] +sub v2.4s, v15.4s, v5.4s +add v15.4s, v15.4s, v5.4s +str q18, [x0, #320] +str q20, [x0, #336] +str q15, [x0, #352] +str q2, [x0, #368] +ldr q7, [x0, #416] +ldr q6, [x0, #432] +ldr q2, [x0, #384] +ldr q15, [x0, #400] +sqrdmulh v20.4S, v7.4S, v19.s[0] +mul v7.4S, v7.4S,v21.s[0] +mla v7.4S, v20.4S, v31.s[0] +sub v20.4s, v2.4s, v7.4s +add v2.4s, v2.4s, v7.4s +sqrdmulh v7.4S, v6.4S, v19.s[0] +mul v6.4S, v6.4S,v21.s[0] +mla v6.4S, v7.4S, v31.s[0] +sub v7.4s, v15.4s, v6.4s +add v15.4s, v15.4s, v6.4s +ldr q6, [x17, #+448] +ldr q18, [x17, #+464] +sqrdmulh v5.4S, v15.4S, v19.s[1] +mul v15.4S, v15.4S,v21.s[1] +mla v15.4S, v5.4S, v31.s[0] +sub v5.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +sqrdmulh v15.4S, v7.4S, v19.s[2] +mul v7.4S, v7.4S,v21.s[2] +mla v7.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v7.4s +add v20.4s, v20.4s, v7.4s +str q2, [x0, #384] +str q5, [x0, #400] +str q20, [x0, #416] +str q15, [x0, #432] +ldr 
q9, [x0, #480] +ldr q8, [x0, #496] +ldr q15, [x0, #448] +ldr q20, [x0, #464] +sqrdmulh v5.4S, v9.4S, v14.s[0] +mul v9.4S, v9.4S,v1.s[0] +mla v9.4S, v5.4S, v31.s[0] +sub v5.4s, v15.4s, v9.4s +add v15.4s, v15.4s, v9.4s +sqrdmulh v9.4S, v8.4S, v14.s[0] +mul v8.4S, v8.4S,v1.s[0] +mla v8.4S, v9.4S, v31.s[0] +sub v9.4s, v20.4s, v8.4s +add v20.4s, v20.4s, v8.4s +ldr q8, [x17, #+480] +ldr q2, [x17, #+496] +sqrdmulh v7.4S, v20.4S, v14.s[1] +mul v20.4S, v20.4S,v1.s[1] +mla v20.4S, v7.4S, v31.s[0] +sub v7.4s, v15.4s, v20.4s +add v15.4s, v15.4s, v20.4s +sqrdmulh v20.4S, v9.4S, v14.s[2] +mul v9.4S, v9.4S,v1.s[2] +mla v9.4S, v20.4S, v31.s[0] +sub v20.4s, v5.4s, v9.4s +add v5.4s, v5.4s, v9.4s +str q15, [x0, #448] +str q7, [x0, #464] +str q5, [x0, #480] +str q20, [x0, #496] +ldr q10, [x0, #544] +ldr q3, [x0, #560] +ldr q20, [x0, #512] +ldr q5, [x0, #528] +sqrdmulh v7.4S, v10.4S, v0.s[0] +mul v10.4S, v10.4S,v17.s[0] +mla v10.4S, v7.4S, v31.s[0] +sub v7.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v0.s[0] +mul v3.4S, v3.4S,v17.s[0] +mla v3.4S, v10.4S, v31.s[0] +sub v10.4s, v5.4s, v3.4s +add v5.4s, v5.4s, v3.4s +ldr q3, [x17, #+512] +ldr q15, [x17, #+528] +sqrdmulh v9.4S, v5.4S, v0.s[1] +mul v5.4S, v5.4S,v17.s[1] +mla v5.4S, v9.4S, v31.s[0] +sub v9.4s, v20.4s, v5.4s +add v20.4s, v20.4s, v5.4s +sqrdmulh v5.4S, v10.4S, v0.s[2] +mul v10.4S, v10.4S,v17.s[2] +mla v10.4S, v5.4S, v31.s[0] +sub v5.4s, v7.4s, v10.4s +add v7.4s, v7.4s, v10.4s +str q20, [x0, #512] +str q9, [x0, #528] +str q7, [x0, #544] +str q5, [x0, #560] +ldr q22, [x0, #608] +ldr q16, [x0, #624] +ldr q5, [x0, #576] +ldr q7, [x0, #592] +sqrdmulh v9.4S, v22.4S, v13.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v9.4S, v31.s[0] +sub v9.4s, v5.4s, v22.4s +add v5.4s, v5.4s, v22.4s +sqrdmulh v22.4S, v16.4S, v13.s[0] +mul v16.4S, v16.4S,v4.s[0] +mla v16.4S, v22.4S, v31.s[0] +sub v22.4s, v7.4s, v16.4s +add v7.4s, v7.4s, v16.4s +ldr q16, [x17, #+544] +ldr q20, [x17, #+560] +sqrdmulh v10.4S, v7.4S, v13.s[1] 
+mul v7.4S, v7.4S,v4.s[1] +mla v7.4S, v10.4S, v31.s[0] +sub v10.4s, v5.4s, v7.4s +add v5.4s, v5.4s, v7.4s +sqrdmulh v7.4S, v22.4S, v13.s[2] +mul v22.4S, v22.4S,v4.s[2] +mla v22.4S, v7.4S, v31.s[0] +sub v7.4s, v9.4s, v22.4s +add v9.4s, v9.4s, v22.4s +str q5, [x0, #576] +str q10, [x0, #592] +str q9, [x0, #608] +str q7, [x0, #624] +ldr q11, [x0, #672] +ldr q12, [x0, #688] +ldr q7, [x0, #640] +ldr q9, [x0, #656] +sqrdmulh v10.4S, v11.4S, v18.s[0] +mul v11.4S, v11.4S,v6.s[0] +mla v11.4S, v10.4S, v31.s[0] +sub v10.4s, v7.4s, v11.4s +add v7.4s, v7.4s, v11.4s +sqrdmulh v11.4S, v12.4S, v18.s[0] +mul v12.4S, v12.4S,v6.s[0] +mla v12.4S, v11.4S, v31.s[0] +sub v11.4s, v9.4s, v12.4s +add v9.4s, v9.4s, v12.4s +ldr q12, [x17, #+576] +ldr q5, [x17, #+592] +sqrdmulh v22.4S, v9.4S, v18.s[1] +mul v9.4S, v9.4S,v6.s[1] +mla v9.4S, v22.4S, v31.s[0] +sub v22.4s, v7.4s, v9.4s +add v7.4s, v7.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v18.s[2] +mul v11.4S, v11.4S,v6.s[2] +mla v11.4S, v9.4S, v31.s[0] +sub v9.4s, v10.4s, v11.4s +add v10.4s, v10.4s, v11.4s +str q7, [x0, #640] +str q22, [x0, #656] +str q10, [x0, #672] +str q9, [x0, #688] +ldr q19, [x0, #736] +ldr q21, [x0, #752] +ldr q9, [x0, #704] +ldr q10, [x0, #720] +sqrdmulh v22.4S, v19.4S, v2.s[0] +mul v19.4S, v19.4S,v8.s[0] +mla v19.4S, v22.4S, v31.s[0] +sub v22.4s, v9.4s, v19.4s +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v21.4S, v2.s[0] +mul v21.4S, v21.4S,v8.s[0] +mla v21.4S, v19.4S, v31.s[0] +sub v19.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +ldr q21, [x17, #+608] +ldr q7, [x17, #+624] +sqrdmulh v11.4S, v10.4S, v2.s[1] +mul v10.4S, v10.4S,v8.s[1] +mla v10.4S, v11.4S, v31.s[0] +sub v11.4s, v9.4s, v10.4s +add v9.4s, v9.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v2.s[2] +mul v19.4S, v19.4S,v8.s[2] +mla v19.4S, v10.4S, v31.s[0] +sub v10.4s, v22.4s, v19.4s +add v22.4s, v22.4s, v19.4s +str q9, [x0, #704] +str q11, [x0, #720] +str q22, [x0, #736] +str q10, [x0, #752] +ldr q14, [x0, #800] +ldr q1, [x0, #816] +ldr q10, [x0, #768] +ldr q22, [x0, #784] 
+sqrdmulh v11.4S, v14.4S, v15.s[0] +mul v14.4S, v14.4S,v3.s[0] +mla v14.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v14.4s +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v1.4S, v15.s[0] +mul v1.4S, v1.4S,v3.s[0] +mla v1.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v1.4s +add v22.4s, v22.4s, v1.4s +sqrdmulh v1.4S, v22.4S, v15.s[1] +mul v22.4S, v22.4S,v3.s[1] +mla v22.4S, v1.4S, v31.s[0] +sub v1.4s, v10.4s, v22.4s +add v10.4s, v10.4s, v22.4s +sqrdmulh v22.4S, v14.4S, v15.s[2] +mul v14.4S, v14.4S,v3.s[2] +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +str q10, [x0, #768] +str q1, [x0, #784] +str q11, [x0, #800] +str q22, [x0, #816] +ldr q0, [x0, #864] +ldr q17, [x0, #880] +ldr q22, [x0, #832] +ldr q11, [x0, #848] +sqrdmulh v1.4S, v0.4S, v20.s[0] +mul v0.4S, v0.4S,v16.s[0] +mla v0.4S, v1.4S, v31.s[0] +sub v1.4s, v22.4s, v0.4s +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v20.s[0] +mul v17.4S, v17.4S,v16.s[0] +mla v17.4S, v0.4S, v31.s[0] +sub v0.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v20.s[1] +mul v11.4S, v11.4S,v16.s[1] +mla v11.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v0.4S, v20.s[2] +mul v0.4S, v0.4S,v16.s[2] +mla v0.4S, v11.4S, v31.s[0] +sub v11.4s, v1.4s, v0.4s +add v1.4s, v1.4s, v0.4s +str q22, [x0, #832] +str q17, [x0, #848] +str q1, [x0, #864] +str q11, [x0, #880] +ldr q13, [x0, #928] +ldr q4, [x0, #944] +ldr q11, [x0, #896] +ldr q1, [x0, #912] +sqrdmulh v17.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v12.s[0] +mla v13.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v4.4S, v5.s[0] +mul v4.4S, v4.4S,v12.s[0] +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v1.4s, v4.4s +add v1.4s, v1.4s, v4.4s +sqrdmulh v4.4S, v1.4S, v5.s[1] +mul v1.4S, v1.4S,v12.s[1] +mla v1.4S, v4.4S, v31.s[0] +sub v4.4s, v11.4s, v1.4s +add v11.4s, v11.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v5.s[2] +mul v13.4S, v13.4S,v12.s[2] +mla 
v13.4S, v1.4S, v31.s[0] +sub v1.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +str q11, [x0, #896] +str q4, [x0, #912] +str q17, [x0, #928] +str q1, [x0, #944] +ldr q18, [x0, #992] +ldr q6, [x0, #1008] +ldr q1, [x0, #960] +ldr q17, [x0, #976] +sqrdmulh v4.4S, v18.4S, v7.s[0] +mul v18.4S, v18.4S,v21.s[0] +mla v18.4S, v4.4S, v31.s[0] +sub v4.4s, v1.4s, v18.4s +add v1.4s, v1.4s, v18.4s +sqrdmulh v18.4S, v6.4S, v7.s[0] +mul v6.4S, v6.4S,v21.s[0] +mla v6.4S, v18.4S, v31.s[0] +sub v18.4s, v17.4s, v6.4s +add v17.4s, v17.4s, v6.4s +sqrdmulh v6.4S, v17.4S, v7.s[1] +mul v17.4S, v17.4S,v21.s[1] +mla v17.4S, v6.4S, v31.s[0] +sub v6.4s, v1.4s, v17.4s +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v18.4S, v7.s[2] +mul v18.4S, v18.4S,v21.s[2] +mla v18.4S, v17.4S, v31.s[0] +sub v17.4s, v4.4s, v18.4s +add v4.4s, v4.4s, v18.4s +str q1, [x0, #960] +str q6, [x0, #976] +str q4, [x0, #992] +str q17, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_0_z4_16.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_0_z4_16.s new file mode 100644 index 0000000..1443403 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_0_z4_16.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to 
use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // 
Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 
5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block 
None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global ntt_u32_incomplete_neon_asm_var_4_2_0_z4_16 +.global _ntt_u32_incomplete_neon_asm_var_4_2_0_z4_16 +ntt_u32_incomplete_neon_asm_var_4_2_0_z4_16: +_ntt_u32_incomplete_neon_asm_var_4_2_0_z4_16: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +ldr q2, [x0, #544] +ldr q1, [x0, #608] +ldr q0, [x0, #672] +ldr q15, [x0, #736] +ldr q14, [x0, #32] +ldr q13, [x0, 
#96] +ldr q12, [x0, #160] +ldr q11, [x0, #224] +sqrdmulh v10.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +mla v22.4S, v10.4S, v31.s[0] +sub v10.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v20.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +mla v19.4S, v20.4S, v31.s[0] +sub v20.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +mla v2.4S, v19.4S, v31.s[0] +sub v19.4s, v14.4s, v2.4s +add v14.4s, v14.4s, v2.4s +sqrdmulh v2.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +mla v1.4S, v2.4S, v31.s[0] +sub v2.4s, v13.4s, v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[0] +mul v0.4S, v0.4S,v30.s[0] +mla v0.4S, v1.4S, v31.s[0] +sub v1.4s, v12.4s, v0.4s +add v12.4s, v12.4s, v0.4s +sqrdmulh v0.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +mla v15.4S, v0.4S, v31.s[0] +sub v0.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v12.4s, v16.4s +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v3.4S, v16.4S, v31.s[0] +sub v16.4s, v11.4s, v3.4s +add v11.4s, v11.4s, v3.4s +sqrdmulh v3.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +mla v18.4S, v3.4S, v31.s[0] +sub v3.4s, v14.4s, v18.4s +add v14.4s, v14.4s, v18.4s +sqrdmulh v18.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +mla v17.4S, v18.4S, v31.s[0] +sub v18.4s, v13.4s, v17.4s +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v1.4s, v21.4s +add v1.4s, v1.4s, v21.4s +sqrdmulh v21.4S, v20.4S, v29.s[2] +mul 
v20.4S, v20.4S,v30.s[2] +mla v20.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v20.4s +add v0.4s, v0.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +mla v10.4S, v20.4S, v31.s[0] +sub v20.4s, v19.4s, v10.4s +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +mla v22.4S, v10.4S, v31.s[0] +sub v10.4s, v2.4s, v22.4s +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v11.4S, v27.s[0] +mul v11.4S, v11.4S,v28.s[0] +mla v11.4S, v12.4S, v31.s[0] +sub v12.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +mla v15.4S, v11.4S, v31.s[0] +sub v11.4s, v3.4s, v15.4s +add v3.4s, v3.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v16.4s +add v18.4s, v18.4s, v16.4s +sqrdmulh v16.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +mla v1.4S, v16.4S, v31.s[0] +sub v16.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v27.s[2] +mul v0.4S, v0.4S,v28.s[2] +mla v0.4S, v1.4S, v31.s[0] +sub v1.4s, v2.4s, v0.4s +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +mla v17.4S, v0.4S, v31.s[0] +sub v0.4s, v20.4s, v17.4s +add v20.4s, v20.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v25.s[0] +mul v13.4S, v13.4S,v26.s[0] +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +sqrdmulh v13.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v18.4S, v25.s[2] +mul v18.4S, v18.4S,v26.s[2] +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v3.4s, 
v18.4s +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v15.4S, v25.s[3] +mul v15.4S, v15.4S,v26.s[3] +mla v15.4S, v18.4S, v31.s[0] +sub v18.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v23.s[0] +mul v2.4S, v2.4S,v24.s[0] +mla v2.4S, v15.4S, v31.s[0] +sub v15.4s, v19.4s, v2.4s +add v19.4s, v19.4s, v2.4s +sqrdmulh v2.4S, v1.4S, v23.s[1] +mul v1.4S, v1.4S,v24.s[1] +mla v1.4S, v2.4S, v31.s[0] +sub v2.4s, v16.4s, v1.4s +add v16.4s, v16.4s, v1.4s +sqrdmulh v1.4S, v10.4S, v23.s[2] +mul v10.4S, v10.4S,v24.s[2] +mla v10.4S, v1.4S, v31.s[0] +sub v1.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +mla v17.4S, v10.4S, v31.s[0] +sub v10.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +str q14, [x0, #32] +str q21, [x0, #96] +str q22, [x0, #160] +str q13, [x0, #224] +str q3, [x0, #288] +str q12, [x0, #352] +str q11, [x0, #416] +str q18, [x0, #480] +str q19, [x0, #544] +str q15, [x0, #608] +str q16, [x0, #672] +str q2, [x0, #736] +str q20, [x0, #800] +str q1, [x0, #864] +str q0, [x0, #928] +str q10, [x0, #992] +ldr q10, [x0, #816] +ldr q0, [x0, #880] +ldr q1, [x0, #944] +ldr q20, [x0, #1008] +ldr q2, [x0, #304] +ldr q16, [x0, #368] +ldr q15, [x0, #432] +ldr q19, [x0, #496] +ldr q18, [x0, #560] +ldr q11, [x0, #624] +ldr q12, [x0, #688] +ldr q3, [x0, #752] +ldr q13, [x0, #48] +ldr q22, [x0, #112] +ldr q21, [x0, #176] +ldr q14, [x0, #240] +sqrdmulh v17.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +mla v10.4S, v17.4S, v31.s[0] +sub v17.4s, v2.4s, v10.4s +add v2.4s, v2.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v29.s[0] +mul v0.4S, v0.4S,v30.s[0] +mla v0.4S, v10.4S, v31.s[0] +sub v10.4s, v16.4s, v0.4s +add v16.4s, v16.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +mla v1.4S, v0.4S, v31.s[0] +sub v0.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s +sqrdmulh v1.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v20.4S, v1.4S, v31.s[0] +sub v1.4s, v19.4s, v20.4s +add v19.4s, v19.4s, 
v20.4s +sqrdmulh v20.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +mla v18.4S, v20.4S, v31.s[0] +sub v20.4s, v13.4s, v18.4s +add v13.4s, v13.4s, v18.4s +sqrdmulh v18.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +mla v12.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +mla v3.4S, v12.4S, v31.s[0] +sub v12.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sqrdmulh v3.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +mla v15.4S, v3.4S, v31.s[0] +sub v3.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +sqrdmulh v15.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +mla v19.4S, v15.4S, v31.s[0] +sub v15.4s, v14.4s, v19.4s +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v2.4S, v29.s[1] +mul v2.4S, v2.4S,v30.s[1] +mla v2.4S, v19.4S, v31.s[0] +sub v19.4s, v13.4s, v2.4s +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +mla v16.4S, v2.4S, v31.s[0] +sub v2.4s, v22.4s, v16.4s +add v22.4s, v22.4s, v16.4s +sqrdmulh v16.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +mla v0.4S, v16.4S, v31.s[0] +sub v16.4s, v11.4s, v0.4s +add v11.4s, v11.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +mla v1.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v1.4s +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +mla v17.4S, v1.4S, v31.s[0] +sub v1.4s, v20.4s, v17.4s +add v20.4s, v20.4s, v17.4s +sqrdmulh v17.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +mla v10.4S, v17.4S, v31.s[0] +sub v17.4s, v18.4s, v10.4s +add v18.4s, v18.4s, v10.4s +sqrdmulh v10.4S, v21.4S, v27.s[0] +mul v21.4S, v21.4S,v28.s[0] +mla v21.4S, v10.4S, v31.s[0] +sub v10.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +mla v14.4S, 
v21.4S, v31.s[0] +sub v21.4s, v22.4s, v14.4s +add v22.4s, v22.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +mla v3.4S, v14.4S, v31.s[0] +sub v14.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sqrdmulh v3.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +mla v15.4S, v3.4S, v31.s[0] +sub v3.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +mla v11.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +sqrdmulh v11.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +mla v12.4S, v11.4S, v31.s[0] +sub v11.4s, v18.4s, v12.4s +add v18.4s, v18.4s, v12.4s +sqrdmulh v12.4S, v16.4S, v27.s[3] +mul v16.4S, v16.4S,v28.s[3] +mla v16.4S, v12.4S, v31.s[0] +sub v12.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +sqrdmulh v16.4S, v0.4S, v27.s[3] +mul v0.4S, v0.4S,v28.s[3] +mla v0.4S, v16.4S, v31.s[0] +sub v16.4s, v17.4s, v0.4s +add v17.4s, v17.4s, v0.4s +sqrdmulh v0.4S, v22.4S, v25.s[0] +mul v22.4S, v22.4S,v26.s[0] +mla v22.4S, v0.4S, v31.s[0] +sub v0.4s, v13.4s, v22.4s +add v13.4s, v13.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v25.s[1] +mul v21.4S, v21.4S,v26.s[1] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v2.4S, v25.s[2] +mul v2.4S, v2.4S,v26.s[2] +mla v2.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v2.4s +add v19.4s, v19.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v25.s[3] +mul v3.4S, v3.4S,v26.s[3] +mla v3.4S, v2.4S, v31.s[0] +sub v2.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sqrdmulh v3.4S, v18.4S, v23.s[0] +mul v18.4S, v18.4S,v24.s[0] +mla v18.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v18.4s +add v20.4s, v20.4s, v18.4s +sqrdmulh v18.4S, v11.4S, v23.s[1] +mul v11.4S, v11.4S,v24.s[1] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v15.4s, v11.4s +add v15.4s, v15.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v23.s[2] +mul v17.4S, v17.4S,v24.s[2] +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v1.4s, v17.4s +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, 
v16.4S, v23.s[3] +mul v16.4S, v16.4S,v24.s[3] +mla v16.4S, v17.4S, v31.s[0] +sub v17.4s, v12.4s, v16.4s +add v12.4s, v12.4s, v16.4s +str q13, [x0, #48] +str q0, [x0, #112] +str q10, [x0, #176] +str q22, [x0, #240] +str q19, [x0, #304] +str q21, [x0, #368] +str q14, [x0, #432] +str q2, [x0, #496] +str q20, [x0, #560] +str q3, [x0, #624] +str q15, [x0, #688] +str q18, [x0, #752] +str q1, [x0, #816] +str q11, [x0, #880] +str q12, [x0, #944] +str q17, [x0, #1008] +ldr q17, [x0, #768] +ldr q12, [x0, #832] +ldr q11, [x0, #896] +ldr q1, [x0, #960] +ldr q18, [x0, #256] +ldr q15, [x0, #320] +ldr q3, [x0, #384] +ldr q20, [x0, #448] +ldr q2, [x0, #512] +ldr q14, [x0, #576] +ldr q21, [x0, #640] +ldr q19, [x0, #704] +ldr q22, [x0, #0] +ldr q10, [x0, #64] +ldr q0, [x0, #128] +ldr q13, [x0, #192] +sqrdmulh v16.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v18.4s, v17.4s +add v18.4s, v18.4s, v17.4s +sqrdmulh v17.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v15.4s, v12.4s +add v15.4s, v15.4s, v12.4s +sqrdmulh v12.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +mla v11.4S, v12.4S, v31.s[0] +sub v12.4s, v3.4s, v11.4s +add v3.4s, v3.4s, v11.4s +sqrdmulh v11.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +mla v1.4S, v11.4S, v31.s[0] +sub v11.4s, v20.4s, v1.4s +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +mla v2.4S, v1.4S, v31.s[0] +sub v1.4s, v22.4s, v2.4s +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +mla v14.4S, v2.4S, v31.s[0] +sub v2.4s, v10.4s, v14.4s +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v13.4s, v19.4s +add v13.4s, v13.4s, v19.4s +sqrdmulh v19.4S, v3.4S, 
v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v3.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v3.4s +add v0.4s, v0.4s, v3.4s +sqrdmulh v3.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +mla v20.4S, v3.4S, v31.s[0] +sub v3.4s, v13.4s, v20.4s +add v13.4s, v13.4s, v20.4s +sqrdmulh v20.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +mla v18.4S, v20.4S, v31.s[0] +sub v20.4s, v22.4s, v18.4s +add v22.4s, v22.4s, v18.4s +sqrdmulh v18.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +mla v15.4S, v18.4S, v31.s[0] +sub v18.4s, v10.4s, v15.4s +add v10.4s, v10.4s, v15.4s +sqrdmulh v15.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +mla v11.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +mla v16.4S, v11.4S, v31.s[0] +sub v11.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v2.4s, v17.4s +add v2.4s, v2.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[0] +mul v0.4S, v0.4S,v28.s[0] +mla v0.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v0.4s +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +mla v13.4S, v0.4S, v31.s[0] +sub v0.4s, v10.4s, v13.4s +add v10.4s, v10.4s, v13.4s +sqrdmulh v13.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +mla v19.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +sqrdmulh v19.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +mla v3.4S, v19.4S, v31.s[0] +sub v19.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v27.s[2] +mul v14.4S, v14.4S,v28.s[2] +mla v14.4S, v3.4S, v31.s[0] +sub v3.4s, v1.4s, v14.4s +add v1.4s, v1.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v2.4s, 
v21.4s +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +mla v15.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v12.4S, v27.s[3] +mul v12.4S, v12.4S,v28.s[3] +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v16.4s, v12.4s +add v16.4s, v16.4s, v12.4s +sqrdmulh v12.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +mla v10.4S, v12.4S, v31.s[0] +sub v12.4s, v22.4s, v10.4s +add v22.4s, v22.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v25.s[1] +mul v0.4S, v0.4S,v26.s[1] +mla v0.4S, v10.4S, v31.s[0] +sub v10.4s, v17.4s, v0.4s +add v17.4s, v17.4s, v0.4s +sqrdmulh v0.4S, v18.4S, v25.s[2] +mul v18.4S, v18.4S,v26.s[2] +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v20.4s, v18.4s +add v20.4s, v20.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v25.s[3] +mul v19.4S, v19.4S,v26.s[3] +mla v19.4S, v18.4S, v31.s[0] +sub v18.4s, v13.4s, v19.4s +add v13.4s, v13.4s, v19.4s +sqrdmulh v19.4S, v2.4S, v23.s[0] +mul v2.4S, v2.4S,v24.s[0] +mla v2.4S, v19.4S, v31.s[0] +sub v19.4s, v1.4s, v2.4s +add v1.4s, v1.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v23.s[1] +mul v14.4S, v14.4S,v24.s[1] +mla v14.4S, v2.4S, v31.s[0] +sub v2.4s, v3.4s, v14.4s +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +mla v16.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v16.4s +add v11.4s, v11.4s, v16.4s +sqrdmulh v16.4S, v15.4S, v23.s[3] +mul v15.4S, v15.4S,v24.s[3] +mla v15.4S, v16.4S, v31.s[0] +sub v16.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +str q22, [x0, #0] +str q12, [x0, #64] +str q17, [x0, #128] +str q10, [x0, #192] +str q20, [x0, #256] +str q0, [x0, #320] +str q13, [x0, #384] +str q18, [x0, #448] +str q1, [x0, #512] +str q19, [x0, #576] +str q3, [x0, #640] +str q2, [x0, #704] +str q11, [x0, #768] +str q14, [x0, #832] +str q21, [x0, #896] +str q16, [x0, #960] +ldr q16, [x0, #784] +ldr q21, [x0, #848] +ldr q14, [x0, #912] +ldr q11, [x0, #976] +ldr q2, [x0, #272] +ldr q3, [x0, #336] +ldr q19, [x0, #400] +ldr q1, [x0, 
#464] +ldr q18, [x0, #528] +ldr q13, [x0, #592] +ldr q0, [x0, #656] +ldr q20, [x0, #720] +ldr q10, [x0, #16] +ldr q17, [x0, #80] +ldr q12, [x0, #144] +ldr q22, [x0, #208] +sqrdmulh v15.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +mla v21.4S, v16.4S, v31.s[0] +sub v16.4s, v3.4s, v21.4s +add v3.4s, v3.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +mla v14.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v14.4s +add v19.4s, v19.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v1.4s, v11.4s +add v1.4s, v1.4s, v11.4s +sqrdmulh v11.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +mla v18.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v18.4s +add v10.4s, v10.4s, v18.4s +sqrdmulh v18.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +mla v13.4S, v18.4S, v31.s[0] +sub v18.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +sqrdmulh v13.4S, v0.4S, v29.s[0] +mul v0.4S, v0.4S,v30.s[0] +mla v0.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v0.4s +add v12.4s, v12.4s, v0.4s +sqrdmulh v0.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v20.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v20.4s +add v22.4s, v22.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +mla v19.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v19.4s +add v12.4s, v12.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +mla v1.4S, v19.4S, v31.s[0] +sub v19.4s, v22.4s, v1.4s +add v22.4s, v22.4s, v1.4s +sqrdmulh v1.4S, v2.4S, v29.s[1] +mul v2.4S, v2.4S,v30.s[1] +mla v2.4S, v1.4S, v31.s[0] +sub v1.4s, v10.4s, v2.4s +add v10.4s, v10.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v3.4S, v2.4S, v31.s[0] +sub v2.4s, v17.4s, v3.4s +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +mla 
v21.4S, v3.4S, v31.s[0] +sub v3.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +mla v14.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v14.4s +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +mla v15.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v16.4s +add v18.4s, v18.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +mla v12.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +mla v22.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v22.4s +add v17.4s, v17.4s, v22.4s +sqrdmulh v22.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +mla v20.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v20.4s +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +mla v19.4S, v20.4S, v31.s[0] +sub v20.4s, v2.4s, v19.4s +add v2.4s, v2.4s, v19.4s +sqrdmulh v19.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +mla v13.4S, v19.4S, v31.s[0] +sub v19.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v0.4S, v27.s[2] +mul v0.4S, v0.4S,v28.s[2] +mla v0.4S, v13.4S, v31.s[0] +sub v13.4s, v18.4s, v0.4s +add v18.4s, v18.4s, v0.4s +sqrdmulh v0.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +mla v3.4S, v0.4S, v31.s[0] +sub v0.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +mla v21.4S, v3.4S, v31.s[0] +sub v3.4s, v15.4s, v21.4s +add v15.4s, v15.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v25.s[0] +mul v17.4S, v17.4S,v26.s[0] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v16.4s, v12.4s +add v16.4s, v16.4s, 
v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[2] +mul v2.4S, v2.4S,v26.s[2] +mla v2.4S, v12.4S, v31.s[0] +sub v12.4s, v1.4s, v2.4s +add v1.4s, v1.4s, v2.4s +sqrdmulh v2.4S, v20.4S, v25.s[3] +mul v20.4S, v20.4S,v26.s[3] +mla v20.4S, v2.4S, v31.s[0] +sub v2.4s, v22.4s, v20.4s +add v22.4s, v22.4s, v20.4s +sqrdmulh v20.4S, v18.4S, v23.s[0] +mul v18.4S, v18.4S,v24.s[0] +mla v18.4S, v20.4S, v31.s[0] +sub v20.4s, v11.4s, v18.4s +add v11.4s, v11.4s, v18.4s +sqrdmulh v18.4S, v13.4S, v23.s[1] +mul v13.4S, v13.4S,v24.s[1] +mla v13.4S, v18.4S, v31.s[0] +sub v18.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v23.s[2] +mul v15.4S, v15.4S,v24.s[2] +mla v15.4S, v13.4S, v31.s[0] +sub v13.4s, v14.4s, v15.4s +add v14.4s, v14.4s, v15.4s +sqrdmulh v15.4S, v3.4S, v23.s[3] +mul v3.4S, v3.4S,v24.s[3] +mla v3.4S, v15.4S, v31.s[0] +sub v15.4s, v0.4s, v3.4s +add v0.4s, v0.4s, v3.4s +str q10, [x0, #16] +str q21, [x0, #80] +str q16, [x0, #144] +str q17, [x0, #208] +str q1, [x0, #272] +str q12, [x0, #336] +str q22, [x0, #400] +str q2, [x0, #464] +str q11, [x0, #528] +str q20, [x0, #592] +str q19, [x0, #656] +str q18, [x0, #720] +str q14, [x0, #784] +str q13, [x0, #848] +str q0, [x0, #912] +str q15, [x0, #976] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q6, [x17, #+160] +ldr q7, [x17, #+176] +ldr q8, [x17, #+192] +ldr q9, [x17, #+208] +ldr q3, [x17, #+224] +ldr q10, [x17, #+240] +ldr q21, [x0, #32] +ldr q16, [x0, #48] +ldr q17, [x0, #0] +ldr q1, [x0, #16] +ldr q12, [x17, #+256] +ldr q22, [x17, #+272] +sqrdmulh v2.4S, v21.4S, v5.s[0] +mul v21.4S, v21.4S,v4.s[0] +mla v21.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v16.4S, v5.s[0] +mul v16.4S, v16.4S,v4.s[0] +mla v16.4S, v2.4S, v31.s[0] +sub v2.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sub v21.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +sqrdmulh v16.4S, v1.4S, v5.s[1] +mul v1.4S, v1.4S,v4.s[1] +mla v1.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v21.4S, v5.s[2] +mul v21.4S, v21.4S,v4.s[2] +mla v21.4S, v16.4S, v31.s[0] +sub v16.4s, 
v17.4s, v1.4s +add v17.4s, v17.4s, v1.4s +sub v1.4s, v2.4s, v21.4s +add v2.4s, v2.4s, v21.4s +str q17, [x0, #0] +str q16, [x0, #16] +str q2, [x0, #32] +str q1, [x0, #48] +ldr q1, [x0, #96] +ldr q2, [x0, #112] +ldr q16, [x0, #64] +ldr q17, [x0, #80] +ldr q21, [x17, #+288] +ldr q11, [x17, #+304] +sqrdmulh v20.4S, v1.4S, v7.s[0] +mul v1.4S, v1.4S,v6.s[0] +mla v1.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v2.4S, v7.s[0] +mul v2.4S, v2.4S,v6.s[0] +mla v2.4S, v20.4S, v31.s[0] +sub v20.4s, v16.4s, v1.4s +add v16.4s, v16.4s, v1.4s +sub v1.4s, v17.4s, v2.4s +add v17.4s, v17.4s, v2.4s +sqrdmulh v2.4S, v17.4S, v7.s[1] +mul v17.4S, v17.4S,v6.s[1] +mla v17.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v1.4S, v7.s[2] +mul v1.4S, v1.4S,v6.s[2] +mla v1.4S, v2.4S, v31.s[0] +sub v2.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sub v17.4s, v20.4s, v1.4s +add v20.4s, v20.4s, v1.4s +str q16, [x0, #64] +str q2, [x0, #80] +str q20, [x0, #96] +str q17, [x0, #112] +ldr q17, [x0, #160] +ldr q20, [x0, #176] +ldr q2, [x0, #128] +ldr q16, [x0, #144] +ldr q1, [x17, #+320] +ldr q19, [x17, #+336] +sqrdmulh v18.4S, v17.4S, v9.s[0] +mul v17.4S, v17.4S,v8.s[0] +mla v17.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v20.4S, v9.s[0] +mul v20.4S, v20.4S,v8.s[0] +mla v20.4S, v18.4S, v31.s[0] +sub v18.4s, v2.4s, v17.4s +add v2.4s, v2.4s, v17.4s +sub v17.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v9.s[1] +mul v16.4S, v16.4S,v8.s[1] +mla v16.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v17.4S, v9.s[2] +mul v17.4S, v17.4S,v8.s[2] +mla v17.4S, v20.4S, v31.s[0] +sub v20.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sub v16.4s, v18.4s, v17.4s +add v18.4s, v18.4s, v17.4s +str q2, [x0, #128] +str q20, [x0, #144] +str q18, [x0, #160] +str q16, [x0, #176] +ldr q16, [x0, #224] +ldr q18, [x0, #240] +ldr q20, [x0, #192] +ldr q2, [x0, #208] +ldr q17, [x17, #+352] +ldr q14, [x17, #+368] +sqrdmulh v13.4S, v16.4S, v10.s[0] +mul v16.4S, v16.4S,v3.s[0] +mla v16.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v18.4S, v10.s[0] 
+mul v18.4S, v18.4S,v3.s[0] +mla v18.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v16.4s +add v20.4s, v20.4s, v16.4s +sub v16.4s, v2.4s, v18.4s +add v2.4s, v2.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v10.s[1] +mul v2.4S, v2.4S,v3.s[1] +mla v2.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v16.4S, v10.s[2] +mul v16.4S, v16.4S,v3.s[2] +mla v16.4S, v18.4S, v31.s[0] +sub v18.4s, v20.4s, v2.4s +add v20.4s, v20.4s, v2.4s +sub v2.4s, v13.4s, v16.4s +add v13.4s, v13.4s, v16.4s +str q20, [x0, #192] +str q18, [x0, #208] +str q13, [x0, #224] +str q2, [x0, #240] +ldr q2, [x0, #288] +ldr q13, [x0, #304] +ldr q18, [x0, #256] +ldr q20, [x0, #272] +ldr q16, [x17, #+384] +ldr q0, [x17, #+400] +sqrdmulh v15.4S, v2.4S, v22.s[0] +mul v2.4S, v2.4S,v12.s[0] +mla v2.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v13.4S, v22.s[0] +mul v13.4S, v13.4S,v12.s[0] +mla v13.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v2.4s +add v18.4s, v18.4s, v2.4s +sub v2.4s, v20.4s, v13.4s +add v20.4s, v20.4s, v13.4s +sqrdmulh v13.4S, v20.4S, v22.s[1] +mul v20.4S, v20.4S,v12.s[1] +mla v20.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v2.4S, v22.s[2] +mul v2.4S, v2.4S,v12.s[2] +mla v2.4S, v13.4S, v31.s[0] +sub v13.4s, v18.4s, v20.4s +add v18.4s, v18.4s, v20.4s +sub v20.4s, v15.4s, v2.4s +add v15.4s, v15.4s, v2.4s +str q18, [x0, #256] +str q13, [x0, #272] +str q15, [x0, #288] +str q20, [x0, #304] +ldr q5, [x0, #352] +ldr q4, [x0, #368] +ldr q20, [x0, #320] +ldr q15, [x0, #336] +ldr q13, [x17, #+416] +ldr q18, [x17, #+432] +sqrdmulh v2.4S, v5.4S, v11.s[0] +mul v5.4S, v5.4S,v21.s[0] +mla v5.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v4.4S, v11.s[0] +mul v4.4S, v4.4S,v21.s[0] +mla v4.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v5.4s +add v20.4s, v20.4s, v5.4s +sub v5.4s, v15.4s, v4.4s +add v15.4s, v15.4s, v4.4s +sqrdmulh v4.4S, v15.4S, v11.s[1] +mul v15.4S, v15.4S,v21.s[1] +mla v15.4S, v4.4S, v31.s[0] +sqrdmulh v4.4S, v5.4S, v11.s[2] +mul v5.4S, v5.4S,v21.s[2] +mla v5.4S, v4.4S, v31.s[0] +sub v4.4s, v20.4s, v15.4s +add v20.4s, v20.4s, v15.4s +sub v15.4s, 
v2.4s, v5.4s +add v2.4s, v2.4s, v5.4s +str q20, [x0, #320] +str q4, [x0, #336] +str q2, [x0, #352] +str q15, [x0, #368] +ldr q7, [x0, #416] +ldr q6, [x0, #432] +ldr q15, [x0, #384] +ldr q2, [x0, #400] +ldr q4, [x17, #+448] +ldr q20, [x17, #+464] +sqrdmulh v5.4S, v7.4S, v19.s[0] +mul v7.4S, v7.4S,v1.s[0] +mla v7.4S, v5.4S, v31.s[0] +sqrdmulh v5.4S, v6.4S, v19.s[0] +mul v6.4S, v6.4S,v1.s[0] +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v15.4s, v7.4s +add v15.4s, v15.4s, v7.4s +sub v7.4s, v2.4s, v6.4s +add v2.4s, v2.4s, v6.4s +sqrdmulh v6.4S, v2.4S, v19.s[1] +mul v2.4S, v2.4S,v1.s[1] +mla v2.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v7.4S, v19.s[2] +mul v7.4S, v7.4S,v1.s[2] +mla v7.4S, v6.4S, v31.s[0] +sub v6.4s, v15.4s, v2.4s +add v15.4s, v15.4s, v2.4s +sub v2.4s, v5.4s, v7.4s +add v5.4s, v5.4s, v7.4s +str q15, [x0, #384] +str q6, [x0, #400] +str q5, [x0, #416] +str q2, [x0, #432] +ldr q9, [x0, #480] +ldr q8, [x0, #496] +ldr q2, [x0, #448] +ldr q5, [x0, #464] +ldr q6, [x17, #+480] +ldr q15, [x17, #+496] +sqrdmulh v7.4S, v9.4S, v14.s[0] +mul v9.4S, v9.4S,v17.s[0] +mla v9.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v8.4S, v14.s[0] +mul v8.4S, v8.4S,v17.s[0] +mla v8.4S, v7.4S, v31.s[0] +sub v7.4s, v2.4s, v9.4s +add v2.4s, v2.4s, v9.4s +sub v9.4s, v5.4s, v8.4s +add v5.4s, v5.4s, v8.4s +sqrdmulh v8.4S, v5.4S, v14.s[1] +mul v5.4S, v5.4S,v17.s[1] +mla v5.4S, v8.4S, v31.s[0] +sqrdmulh v8.4S, v9.4S, v14.s[2] +mul v9.4S, v9.4S,v17.s[2] +mla v9.4S, v8.4S, v31.s[0] +sub v8.4s, v2.4s, v5.4s +add v2.4s, v2.4s, v5.4s +sub v5.4s, v7.4s, v9.4s +add v7.4s, v7.4s, v9.4s +str q2, [x0, #448] +str q8, [x0, #464] +str q7, [x0, #480] +str q5, [x0, #496] +ldr q10, [x0, #544] +ldr q3, [x0, #560] +ldr q5, [x0, #512] +ldr q7, [x0, #528] +ldr q8, [x17, #+512] +ldr q2, [x17, #+528] +sqrdmulh v9.4S, v10.4S, v0.s[0] +mul v10.4S, v10.4S,v16.s[0] +mla v10.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v3.4S, v0.s[0] +mul v3.4S, v3.4S,v16.s[0] +mla v3.4S, v9.4S, v31.s[0] +sub v9.4s, v5.4s, v10.4s +add v5.4s, v5.4s, v10.4s +sub 
v10.4s, v7.4s, v3.4s +add v7.4s, v7.4s, v3.4s +sqrdmulh v3.4S, v7.4S, v0.s[1] +mul v7.4S, v7.4S,v16.s[1] +mla v7.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v10.4S, v0.s[2] +mul v10.4S, v10.4S,v16.s[2] +mla v10.4S, v3.4S, v31.s[0] +sub v3.4s, v5.4s, v7.4s +add v5.4s, v5.4s, v7.4s +sub v7.4s, v9.4s, v10.4s +add v9.4s, v9.4s, v10.4s +str q5, [x0, #512] +str q3, [x0, #528] +str q9, [x0, #544] +str q7, [x0, #560] +ldr q22, [x0, #608] +ldr q12, [x0, #624] +ldr q7, [x0, #576] +ldr q9, [x0, #592] +ldr q3, [x17, #+544] +ldr q5, [x17, #+560] +sqrdmulh v10.4S, v22.4S, v18.s[0] +mul v22.4S, v22.4S,v13.s[0] +mla v22.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v12.4S, v18.s[0] +mul v12.4S, v12.4S,v13.s[0] +mla v12.4S, v10.4S, v31.s[0] +sub v10.4s, v7.4s, v22.4s +add v7.4s, v7.4s, v22.4s +sub v22.4s, v9.4s, v12.4s +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v9.4S, v18.s[1] +mul v9.4S, v9.4S,v13.s[1] +mla v9.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v22.4S, v18.s[2] +mul v22.4S, v22.4S,v13.s[2] +mla v22.4S, v12.4S, v31.s[0] +sub v12.4s, v7.4s, v9.4s +add v7.4s, v7.4s, v9.4s +sub v9.4s, v10.4s, v22.4s +add v10.4s, v10.4s, v22.4s +str q7, [x0, #576] +str q12, [x0, #592] +str q10, [x0, #608] +str q9, [x0, #624] +ldr q11, [x0, #672] +ldr q21, [x0, #688] +ldr q9, [x0, #640] +ldr q10, [x0, #656] +ldr q12, [x17, #+576] +ldr q7, [x17, #+592] +sqrdmulh v22.4S, v11.4S, v20.s[0] +mul v11.4S, v11.4S,v4.s[0] +mla v11.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v21.4S, v20.s[0] +mul v21.4S, v21.4S,v4.s[0] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v9.4s, v11.4s +add v9.4s, v9.4s, v11.4s +sub v11.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v10.4S, v20.s[1] +mul v10.4S, v10.4S,v4.s[1] +mla v10.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v11.4S, v20.s[2] +mul v11.4S, v11.4S,v4.s[2] +mla v11.4S, v21.4S, v31.s[0] +sub v21.4s, v9.4s, v10.4s +add v9.4s, v9.4s, v10.4s +sub v10.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +str q9, [x0, #640] +str q21, [x0, #656] +str q22, [x0, #672] +str q10, [x0, #688] 
+ldr q19, [x0, #736] +ldr q1, [x0, #752] +ldr q10, [x0, #704] +ldr q22, [x0, #720] +ldr q21, [x17, #+608] +ldr q9, [x17, #+624] +sqrdmulh v11.4S, v19.4S, v15.s[0] +mul v19.4S, v19.4S,v6.s[0] +mla v19.4S, v11.4S, v31.s[0] +sqrdmulh v11.4S, v1.4S, v15.s[0] +mul v1.4S, v1.4S,v6.s[0] +mla v1.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v19.4s +add v10.4s, v10.4s, v19.4s +sub v19.4s, v22.4s, v1.4s +add v22.4s, v22.4s, v1.4s +sqrdmulh v1.4S, v22.4S, v15.s[1] +mul v22.4S, v22.4S,v6.s[1] +mla v22.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v19.4S, v15.s[2] +mul v19.4S, v19.4S,v6.s[2] +mla v19.4S, v1.4S, v31.s[0] +sub v1.4s, v10.4s, v22.4s +add v10.4s, v10.4s, v22.4s +sub v22.4s, v11.4s, v19.4s +add v11.4s, v11.4s, v19.4s +str q10, [x0, #704] +str q1, [x0, #720] +str q11, [x0, #736] +str q22, [x0, #752] +ldr q14, [x0, #800] +ldr q17, [x0, #816] +ldr q22, [x0, #768] +ldr q11, [x0, #784] +sqrdmulh v1.4S, v14.4S, v2.s[0] +mul v14.4S, v14.4S,v8.s[0] +mla v14.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v17.4S, v2.s[0] +mul v17.4S, v17.4S,v8.s[0] +mla v17.4S, v1.4S, v31.s[0] +sub v1.4s, v22.4s, v14.4s +add v22.4s, v22.4s, v14.4s +sub v14.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v8.s[1] +mla v11.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v14.4S, v2.s[2] +mul v14.4S, v14.4S,v8.s[2] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +sub v11.4s, v1.4s, v14.4s +add v1.4s, v1.4s, v14.4s +str q22, [x0, #768] +str q17, [x0, #784] +str q1, [x0, #800] +str q11, [x0, #816] +ldr q0, [x0, #864] +ldr q16, [x0, #880] +ldr q11, [x0, #832] +ldr q1, [x0, #848] +sqrdmulh v17.4S, v0.4S, v5.s[0] +mul v0.4S, v0.4S,v3.s[0] +mla v0.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v16.4S, v5.s[0] +mul v16.4S, v16.4S,v3.s[0] +mla v16.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v0.4s +add v11.4s, v11.4s, v0.4s +sub v0.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +sqrdmulh v16.4S, v1.4S, v5.s[1] +mul v1.4S, v1.4S,v3.s[1] +mla v1.4S, v16.4S, 
v31.s[0] +sqrdmulh v16.4S, v0.4S, v5.s[2] +mul v0.4S, v0.4S,v3.s[2] +mla v0.4S, v16.4S, v31.s[0] +sub v16.4s, v11.4s, v1.4s +add v11.4s, v11.4s, v1.4s +sub v1.4s, v17.4s, v0.4s +add v17.4s, v17.4s, v0.4s +str q11, [x0, #832] +str q16, [x0, #848] +str q17, [x0, #864] +str q1, [x0, #880] +ldr q18, [x0, #928] +ldr q13, [x0, #944] +ldr q1, [x0, #896] +ldr q17, [x0, #912] +sqrdmulh v16.4S, v18.4S, v7.s[0] +mul v18.4S, v18.4S,v12.s[0] +mla v18.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v13.4S, v7.s[0] +mul v13.4S, v13.4S,v12.s[0] +mla v13.4S, v16.4S, v31.s[0] +sub v16.4s, v1.4s, v18.4s +add v1.4s, v1.4s, v18.4s +sub v18.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v7.s[1] +mul v17.4S, v17.4S,v12.s[1] +mla v17.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v18.4S, v7.s[2] +mul v18.4S, v18.4S,v12.s[2] +mla v18.4S, v13.4S, v31.s[0] +sub v13.4s, v1.4s, v17.4s +add v1.4s, v1.4s, v17.4s +sub v17.4s, v16.4s, v18.4s +add v16.4s, v16.4s, v18.4s +str q1, [x0, #896] +str q13, [x0, #912] +str q16, [x0, #928] +str q17, [x0, #944] +ldr q20, [x0, #992] +ldr q4, [x0, #1008] +ldr q17, [x0, #960] +ldr q16, [x0, #976] +sqrdmulh v13.4S, v20.4S, v9.s[0] +mul v20.4S, v20.4S,v21.s[0] +mla v20.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v4.4S, v9.s[0] +mul v4.4S, v4.4S,v21.s[0] +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v20.4s +add v17.4s, v17.4s, v20.4s +sub v20.4s, v16.4s, v4.4s +add v16.4s, v16.4s, v4.4s +sqrdmulh v4.4S, v16.4S, v9.s[1] +mul v16.4S, v16.4S,v21.s[1] +mla v16.4S, v4.4S, v31.s[0] +sqrdmulh v4.4S, v20.4S, v9.s[2] +mul v20.4S, v20.4S,v21.s[2] +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v17.4s, v16.4s +add v17.4s, v17.4s, v16.4s +sub v16.4s, v13.4s, v20.4s +add v13.4s, v13.4s, v20.4s +str q17, [x0, #960] +str q4, [x0, #976] +str q13, [x0, #992] +str q16, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, 
#16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_10_z4_7.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_10_z4_7.s new file mode 100644 index 0000000..7075abc --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_10_z4_7.s @@ -0,0 +1,1558 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// +#include // NOTE(review): the include target is missing here (possibly lost in extraction); ASM_LOAD used below is presumably defined by that header - confirm +modulus: // 16-byte constant fetched with ldr q31 by the routine below; the visible code only reads lane 0 (v31.s[0]) as the mla multiplier +.word -33556993 // lane 0: -q for q = 33556993 +.word 0 // lanes 1-3 are zero; not read in the visible code +.word 0 +.word 0 +.align 6 // NOTE(review): GNU as treats this as a power of two on Arm targets (64-byte alignment) - confirm for the assembler in use +roots_merged: // twiddle table consumed 16 bytes at a time: in the visible code the quadwords at offsets 0/32/64/96 feed mul and those at 16/48/80/112 feed sqrdmulh, i.e. four roots followed by four companion constants (numerically close to root * 2^31 / 33556993 - TODO confirm) +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global 
ntt_u32_incomplete_neon_asm_var_4_2_10_z4_7 +.global _ntt_u32_incomplete_neon_asm_var_4_2_10_z4_7 +ntt_u32_incomplete_neon_asm_var_4_2_10_z4_7: +_ntt_u32_incomplete_neon_asm_var_4_2_10_z4_7: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #928] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #992] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q18, [x0, #800] +sqrdmulh v17.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +ldr q16, [x0, #864] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v22.4S, v21.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +mla v18.4S, v17.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +ldr q3, [x0, #544] +sqrdmulh v17.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q19, [x0, #608] +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q2, [x0, #672] +ldr q1, [x0, #416] +sqrdmulh v0.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v15.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #736] +ldr q14, [x0, #480] +sqrdmulh v13.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v12.4s, v14.4s, v20.4s +add v14.4s, v14.4s, v20.4s +ldr q20, [x0, #288] +mla v3.4S, v17.4S, v31.s[0] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v18.4s +mla v2.4S, v0.4S, v31.s[0] +mla v22.4S, v13.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +ldr q18, [x0, #352] +sqrdmulh v13.4S, v1.4S, 
v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v0.4s, v18.4s, v16.4s +sqrdmulh v17.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +add v18.4s, v18.4s, v16.4s +ldr q16, [x0, #32] +sqrdmulh v11.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v10.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #96] +sqrdmulh v9.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v8.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #160] +mla v1.4S, v13.4S, v31.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v2.4s +mla v20.4S, v11.4S, v31.s[0] +mla v18.4S, v9.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +ldr q2, [x0, #224] +sqrdmulh v9.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v11.4s, v2.4s, v22.4s +sqrdmulh v13.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v7.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +sub v6.4s, v2.4s, v14.4s +add v2.4s, v2.4s, v14.4s +mla v15.4S, v9.4S, v31.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v20.4s +nop +mla v21.4S, v22.4S, v31.s[0] +mla v0.4S, v1.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +nop +sqrdmulh v20.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +sub v1.4s, v3.4s, v18.4s +nop +sqrdmulh v22.4S, v6.4S, v27.s[1] +mul v6.4S, v6.4S,v28.s[1] +add v3.4s, v3.4s, v18.4s +nop +sqrdmulh v18.4S, v19.4S, v27.s[0] +mul v19.4S, v19.4S,v28.s[0] +sub v9.4s, v17.4s, v15.4s +add v17.4s, v17.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v14.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +mla v7.4S, v20.4S, v31.s[0] +mla v6.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v21.4s +nop +mla v19.4S, v18.4S, v31.s[0] +mla v2.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v21.4s +nop +sqrdmulh v21.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v15.4s, v8.4s, v0.4s +nop +sqrdmulh v18.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +add 
v8.4s, v8.4s, v0.4s +nop +sqrdmulh v0.4S, v9.4S, v27.s[3] +mul v9.4S, v9.4S,v28.s[3] +sub v20.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[3] +mul v14.4S, v14.4S,v28.s[3] +sub v12.4s, v1.4s, v6.4s +add v1.4s, v1.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v16.4s, v19.4s +nop +mla v9.4S, v0.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +nop +sqrdmulh v19.4S, v1.4S, v25.s[2] +mul v1.4S, v1.4S,v26.s[2] +sub v7.4s, v3.4s, v2.4s +nop +sqrdmulh v0.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +add v3.4s, v3.4s, v2.4s +nop +sqrdmulh v2.4S, v7.4S, v25.s[1] +mul v7.4S, v7.4S,v26.s[1] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v25.s[0] +mul v3.4S, v3.4S,v26.s[0] +sub v6.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v1.4S, v19.4S, v31.s[0] +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v9.4s +nop +mla v7.4S, v2.4S, v31.s[0] +mla v3.4S, v17.4S, v31.s[0] +add v22.4s, v22.4s, v9.4s +nop +sqrdmulh v9.4S, v8.4S, v23.s[0] +mul v8.4S, v8.4S,v24.s[0] +sub v17.4s, v15.4s, v14.4s +nop +sqrdmulh v2.4S, v6.4S, v23.s[1] +mul v6.4S, v6.4S,v24.s[1] +add v15.4s, v15.4s, v14.4s +nop +sqrdmulh v14.4S, v15.4S, v23.s[2] +mul v15.4S, v15.4S,v24.s[2] +sub v19.4s, v13.4s, v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +sub v11.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +mla v8.4S, v9.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v18.4s, v7.4s +str q13, [x0, #288] +mla v15.4S, v14.4S, v31.s[0] +mla v17.4S, v1.4S, v31.s[0] +add v18.4s, v18.4s, v7.4s +str q19, [x0, #352] +ldr q19, [x0, #944] +sqrdmulh v7.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v1.4s, v16.4s, v3.4s +str q20, [x0, #416] +ldr q20, [x0, #1008] +sqrdmulh v14.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v3.4s +str q11, [x0, #480] +ldr q11, [x0, #816] +sqrdmulh v3.4S, v11.4S, v29.s[0] +mul v11.4S, 
v11.4S,v30.s[0] +sub v13.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +ldr q8, [x0, #880] +sqrdmulh v9.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v12.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +mla v19.4S, v7.4S, v31.s[0] +mla v20.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v15.4s +str q18, [x0, #160] +mla v11.4S, v3.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +str q2, [x0, #224] +ldr q2, [x0, #560] +sqrdmulh v15.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v9.4s, v0.4s, v17.4s +str q16, [x0, #32] +ldr q16, [x0, #624] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +add v0.4s, v0.4s, v17.4s +str q1, [x0, #96] +ldr q1, [x0, #688] +ldr q17, [x0, #432] +sqrdmulh v18.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +sub v7.4s, v17.4s, v19.4s +add v17.4s, v17.4s, v19.4s +ldr q19, [x0, #752] +ldr q6, [x0, #496] +sqrdmulh v5.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v4.4s, v6.4s, v20.4s +add v6.4s, v6.4s, v20.4s +ldr q20, [x0, #304] +mla v2.4S, v15.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v11.4s +str q10, [x0, #544] +mla v1.4S, v18.4S, v31.s[0] +mla v19.4S, v5.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q13, [x0, #608] +ldr q13, [x0, #368] +sqrdmulh v11.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v5.4s, v13.4s, v8.4s +str q21, [x0, #672] +sqrdmulh v21.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +add v13.4s, v13.4s, v8.4s +str q12, [x0, #736] +ldr q12, [x0, #48] +sqrdmulh v8.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v18.4s, v12.4s, v2.4s +add v12.4s, v12.4s, v2.4s +ldr q2, [x0, #112] +sqrdmulh v10.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v15.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +ldr q16, [x0, #176] +mla v17.4S, v11.4S, v31.s[0] +mla v6.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v1.4s +str q22, [x0, #800] +mla v20.4S, v8.4S, v31.s[0] +mla v13.4S, v10.4S, v31.s[0] +add v16.4s, v16.4s, v1.4s +str q14, [x0, #864] +ldr q14, [x0, #240] 
+sqrdmulh v1.4S, v7.4S, v29.s[2] +mul v7.4S, v7.4S,v30.s[2] +sub v10.4s, v14.4s, v19.4s +str q0, [x0, #928] +sqrdmulh v0.4S, v4.4S, v29.s[2] +mul v4.4S, v4.4S,v30.s[2] +add v14.4s, v14.4s, v19.4s +str q9, [x0, #992] +sqrdmulh v9.4S, v3.4S, v29.s[2] +mul v3.4S, v3.4S,v30.s[2] +sub v19.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v29.s[2] +mul v5.4S, v5.4S,v30.s[2] +sub v8.4s, v14.4s, v6.4s +add v14.4s, v14.4s, v6.4s +mla v7.4S, v1.4S, v31.s[0] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v20.4s +nop +mla v3.4S, v9.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v20.4s +nop +sqrdmulh v20.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +sub v17.4s, v2.4s, v13.4s +nop +sqrdmulh v9.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +add v2.4s, v2.4s, v13.4s +nop +sqrdmulh v13.4S, v16.4S, v27.s[0] +mul v16.4S, v16.4S,v28.s[0] +sub v1.4s, v21.4s, v7.4s +add v21.4s, v21.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v6.4s, v10.4s, v4.4s +add v10.4s, v10.4s, v4.4s +mla v19.4S, v20.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v3.4s +nop +mla v16.4S, v13.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +nop +sqrdmulh v3.4S, v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +sub v7.4s, v15.4s, v5.4s +nop +sqrdmulh v13.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +add v15.4s, v15.4s, v5.4s +nop +sqrdmulh v5.4S, v1.4S, v27.s[3] +mul v1.4S, v1.4S,v28.s[3] +sub v20.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[3] +mul v6.4S, v6.4S,v28.s[3] +sub v4.4s, v17.4s, v8.4s +add v17.4s, v17.4s, v8.4s +mla v21.4S, v3.4S, v31.s[0] +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v16.4s +nop +mla v1.4S, v5.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +nop +sqrdmulh v16.4S, v17.4S, v25.s[2] +mul v17.4S, v17.4S,v26.s[2] +sub v19.4s, v2.4s, v14.4s +nop +sqrdmulh v5.4S, v4.4S, v25.s[3] +mul v4.4S, v4.4S,v26.s[3] +add v2.4s, v2.4s, 
v14.4s +nop +sqrdmulh v14.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v3.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +mla v17.4S, v16.4S, v31.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v9.4s, v1.4s +nop +mla v19.4S, v14.4S, v31.s[0] +mla v2.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +nop +sqrdmulh v1.4S, v15.4S, v23.s[0] +mul v15.4S, v15.4S,v24.s[0] +sub v21.4s, v7.4s, v6.4s +nop +sqrdmulh v14.4S, v8.4S, v23.s[1] +mul v8.4S, v8.4S,v24.s[1] +add v7.4s, v7.4s, v6.4s +nop +sqrdmulh v6.4S, v7.4S, v23.s[2] +mul v7.4S, v7.4S,v24.s[2] +sub v16.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v23.s[3] +mul v21.4S, v21.4S,v24.s[3] +sub v10.4s, v20.4s, v4.4s +add v20.4s, v20.4s, v4.4s +mla v15.4S, v1.4S, v31.s[0] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v19.4s +str q0, [x0, #304] +mla v7.4S, v6.4S, v31.s[0] +mla v21.4S, v17.4S, v31.s[0] +add v13.4s, v13.4s, v19.4s +str q16, [x0, #368] +ldr q16, [x0, #896] +sqrdmulh v19.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v17.4s, v12.4s, v2.4s +str q20, [x0, #432] +ldr q20, [x0, #960] +sqrdmulh v6.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v12.4s, v12.4s, v2.4s +str q10, [x0, #496] +ldr q10, [x0, #768] +sqrdmulh v2.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +sub v0.4s, v18.4s, v15.4s +add v18.4s, v18.4s, v15.4s +ldr q15, [x0, #832] +sqrdmulh v1.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +sub v4.4s, v3.4s, v8.4s +add v3.4s, v3.4s, v8.4s +mla v16.4S, v19.4S, v31.s[0] +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v7.4s +str q13, [x0, #176] +mla v10.4S, v2.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v7.4s +str q14, [x0, #240] +ldr q14, [x0, #512] +sqrdmulh v7.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v1.4s, v5.4s, v21.4s +str q12, [x0, #48] +ldr q12, [x0, #576] +sqrdmulh v2.4S, v12.4S, v29.s[0] +mul v12.4S, 
v12.4S,v30.s[0] +add v5.4s, v5.4s, v21.4s +str q17, [x0, #112] +ldr q17, [x0, #640] +ldr q21, [x0, #384] +sqrdmulh v13.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] +sub v19.4s, v21.4s, v16.4s +add v21.4s, v21.4s, v16.4s +ldr q16, [x0, #704] +ldr q8, [x0, #448] +sqrdmulh v22.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v11.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +ldr q20, [x0, #256] +mla v14.4S, v7.4S, v31.s[0] +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v10.4s +str q18, [x0, #560] +mla v17.4S, v13.4S, v31.s[0] +mla v16.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +str q0, [x0, #624] +ldr q0, [x0, #320] +sqrdmulh v10.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v22.4s, v0.4s, v15.4s +str q3, [x0, #688] +sqrdmulh v3.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +add v0.4s, v0.4s, v15.4s +str q4, [x0, #752] +ldr q4, [x0, #0] +sqrdmulh v15.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v13.4s, v4.4s, v14.4s +add v4.4s, v4.4s, v14.4s +ldr q14, [x0, #64] +sqrdmulh v18.4S, v0.4S, v29.s[1] +mul v0.4S, v0.4S,v30.s[1] +sub v7.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +ldr q12, [x0, #128] +mla v21.4S, v10.4S, v31.s[0] +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v12.4s, v17.4s +str q9, [x0, #816] +mla v20.4S, v15.4S, v31.s[0] +mla v0.4S, v18.4S, v31.s[0] +add v12.4s, v12.4s, v17.4s +str q6, [x0, #880] +ldr q6, [x0, #192] +sqrdmulh v17.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v18.4s, v6.4s, v16.4s +str q5, [x0, #944] +sqrdmulh v5.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +add v6.4s, v6.4s, v16.4s +str q1, [x0, #1008] +sqrdmulh v1.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v16.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v15.4s, v6.4s, v8.4s +add v6.4s, v6.4s, v8.4s +mla v19.4S, v17.4S, v31.s[0] +mla v11.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v20.4s +nop +mla v2.4S, v1.4S, v31.s[0] +mla v22.4S, v21.4S, v31.s[0] +add v4.4s, v4.4s, 
v20.4s +nop +sqrdmulh v20.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +sub v21.4s, v14.4s, v0.4s +nop +sqrdmulh v1.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +add v14.4s, v14.4s, v0.4s +nop +sqrdmulh v0.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v17.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[0] +mul v6.4S, v6.4S,v28.s[0] +sub v8.4s, v18.4s, v11.4s +add v18.4s, v18.4s, v11.4s +mla v16.4S, v20.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v2.4s +nop +mla v12.4S, v0.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +nop +sqrdmulh v2.4S, v3.4S, v27.s[2] +mul v3.4S, v3.4S,v28.s[2] +sub v19.4s, v7.4s, v22.4s +nop +sqrdmulh v0.4S, v18.4S, v27.s[2] +mul v18.4S, v18.4S,v28.s[2] +add v7.4s, v7.4s, v22.4s +nop +sqrdmulh v22.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +sub v20.4s, v5.4s, v16.4s +add v5.4s, v5.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v11.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +mla v3.4S, v2.4S, v31.s[0] +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v4.4s, v12.4s +nop +mla v17.4S, v22.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v4.4s, v4.4s, v12.4s +nop +sqrdmulh v12.4S, v21.4S, v25.s[2] +mul v21.4S, v21.4S,v26.s[2] +sub v16.4s, v14.4s, v6.4s +nop +sqrdmulh v22.4S, v11.4S, v25.s[3] +mul v11.4S, v11.4S,v26.s[3] +add v14.4s, v14.4s, v6.4s +nop +sqrdmulh v6.4S, v16.4S, v25.s[1] +mul v16.4S, v16.4S,v26.s[1] +sub v2.4s, v13.4s, v3.4s +add v13.4s, v13.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v25.s[0] +mul v14.4S, v14.4S,v26.s[0] +sub v15.4s, v7.4s, v18.4s +add v7.4s, v7.4s, v18.4s +mla v21.4S, v12.4S, v31.s[0] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v17.4s +nop +mla v16.4S, v6.4S, v31.s[0] +mla v14.4S, v3.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +nop +sqrdmulh v17.4S, v7.4S, v23.s[0] +mul v7.4S, v7.4S,v24.s[0] +sub v3.4s, v19.4s, v8.4s +nop +sqrdmulh v6.4S, v15.4S, v23.s[1] +mul v15.4S, v15.4S,v24.s[1] +add v19.4s, v19.4s, v8.4s 
+nop +sqrdmulh v8.4S, v19.4S, v23.s[2] +mul v19.4S, v19.4S,v24.s[2] +sub v12.4s, v5.4s, v21.4s +add v5.4s, v5.4s, v21.4s +sqrdmulh v21.4S, v3.4S, v23.s[3] +mul v3.4S, v3.4S,v24.s[3] +sub v18.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +mla v7.4S, v17.4S, v31.s[0] +mla v15.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v16.4s +str q5, [x0, #256] +mla v19.4S, v8.4S, v31.s[0] +mla v3.4S, v21.4S, v31.s[0] +add v0.4s, v0.4s, v16.4s +str q12, [x0, #320] +ldr q12, [x0, #912] +sqrdmulh v16.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v21.4s, v4.4s, v14.4s +str q20, [x0, #384] +ldr q20, [x0, #976] +sqrdmulh v8.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v4.4s, v4.4s, v14.4s +str q18, [x0, #448] +ldr q18, [x0, #784] +sqrdmulh v14.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +sub v5.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +ldr q7, [x0, #848] +sqrdmulh v17.4S, v7.4S, v29.s[0] +mul v7.4S, v7.4S,v30.s[0] +sub v11.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +mla v12.4S, v16.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v1.4s, v19.4s +str q0, [x0, #128] +mla v18.4S, v14.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v19.4s +str q6, [x0, #192] +ldr q6, [x0, #528] +sqrdmulh v19.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v17.4s, v22.4s, v3.4s +str q4, [x0, #0] +ldr q4, [x0, #592] +sqrdmulh v14.4S, v4.4S, v29.s[0] +mul v4.4S, v4.4S,v30.s[0] +add v22.4s, v22.4s, v3.4s +str q21, [x0, #64] +ldr q21, [x0, #656] +ldr q3, [x0, #400] +sqrdmulh v0.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v16.4s, v3.4s, v12.4s +add v3.4s, v3.4s, v12.4s +ldr q12, [x0, #720] +ldr q15, [x0, #464] +sqrdmulh v9.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v10.4s, v15.4s, v20.4s +add v15.4s, v15.4s, v20.4s +ldr q20, [x0, #272] +mla v6.4S, v19.4S, v31.s[0] +mla v4.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v18.4s +str q13, [x0, #512] +mla v21.4S, v0.4S, v31.s[0] +mla v12.4S, v9.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +str q5, [x0, #576] 
+ldr q5, [x0, #336] +sqrdmulh v18.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v9.4s, v5.4s, v7.4s +str q2, [x0, #640] +sqrdmulh v2.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +add v5.4s, v5.4s, v7.4s +str q11, [x0, #704] +ldr q11, [x0, #16] +sqrdmulh v7.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v0.4s, v11.4s, v6.4s +add v11.4s, v11.4s, v6.4s +ldr q6, [x0, #80] +sqrdmulh v13.4S, v5.4S, v29.s[1] +mul v5.4S, v5.4S,v30.s[1] +sub v19.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +ldr q4, [x0, #144] +mla v3.4S, v18.4S, v31.s[0] +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v4.4s, v21.4s +str q1, [x0, #768] +mla v20.4S, v7.4S, v31.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v4.4s, v4.4s, v21.4s +str q8, [x0, #832] +ldr q8, [x0, #208] +sqrdmulh v21.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +sub v13.4s, v8.4s, v12.4s +str q22, [x0, #896] +sqrdmulh v22.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +add v8.4s, v8.4s, v12.4s +str q17, [x0, #960] +sqrdmulh v17.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v12.4s, v4.4s, v3.4s +add v4.4s, v4.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v7.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +mla v16.4S, v21.4S, v31.s[0] +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v20.4s +nop +mla v14.4S, v17.4S, v31.s[0] +mla v9.4S, v3.4S, v31.s[0] +add v11.4s, v11.4s, v20.4s +nop +sqrdmulh v20.4S, v12.4S, v27.s[1] +mul v12.4S, v12.4S,v28.s[1] +sub v3.4s, v6.4s, v5.4s +nop +sqrdmulh v17.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +add v6.4s, v6.4s, v5.4s +nop +sqrdmulh v5.4S, v4.4S, v27.s[0] +mul v4.4S, v4.4S,v28.s[0] +sub v21.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[0] +mul v8.4S, v8.4S,v28.s[0] +sub v15.4s, v13.4s, v10.4s +add v13.4s, v13.4s, v10.4s +mla v12.4S, v20.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v14.4s +nop +mla v4.4S, v5.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v0.4s, v0.4s, v14.4s +nop +sqrdmulh v14.4S, 
v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v16.4s, v19.4s, v9.4s +nop +sqrdmulh v5.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +add v19.4s, v19.4s, v9.4s +nop +sqrdmulh v9.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v10.4s, v3.4s, v7.4s +add v3.4s, v3.4s, v7.4s +mla v2.4S, v14.4S, v31.s[0] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v4.4s +nop +mla v21.4S, v9.4S, v31.s[0] +mla v15.4S, v12.4S, v31.s[0] +add v11.4s, v11.4s, v4.4s +nop +sqrdmulh v4.4S, v3.4S, v25.s[2] +mul v3.4S, v3.4S,v26.s[2] +sub v12.4s, v6.4s, v8.4s +nop +sqrdmulh v9.4S, v10.4S, v25.s[3] +mul v10.4S, v10.4S,v26.s[3] +add v6.4s, v6.4s, v8.4s +nop +sqrdmulh v8.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +sub v14.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v6.4S, v25.s[0] +mul v6.4S, v6.4S,v26.s[0] +sub v7.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +mla v3.4S, v4.4S, v31.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v21.4s +nop +mla v12.4S, v8.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +nop +sqrdmulh v21.4S, v19.4S, v23.s[0] +mul v19.4S, v19.4S,v24.s[0] +sub v2.4s, v16.4s, v15.4s +nop +sqrdmulh v8.4S, v7.4S, v23.s[1] +mul v7.4S, v7.4S,v24.s[1] +add v16.4s, v16.4s, v15.4s +nop +sqrdmulh v15.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +sub v4.4s, v22.4s, v3.4s +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v2.4S, v23.s[3] +mul v2.4S, v2.4S,v24.s[3] +sub v13.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +mla v19.4S, v21.4S, v31.s[0] +mla v7.4S, v8.4S, v31.s[0] +sub v8.4s, v5.4s, v12.4s +str q22, [x0, #272] +mla v16.4S, v15.4S, v31.s[0] +mla v2.4S, v3.4S, v31.s[0] +add v5.4s, v5.4s, v12.4s +str q4, [x0, #336] +sub v23.4s, v11.4s, v6.4s +str q20, [x0, #400] +add v11.4s, v11.4s, v6.4s +str q13, [x0, #464] +sub v13.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sub v19.4s, v14.4s, v7.4s +add v14.4s, 
v14.4s, v7.4s +sub v7.4s, v17.4s, v16.4s +str q5, [x0, #144] +add v17.4s, v17.4s, v16.4s +str q8, [x0, #208] +sub v8.4s, v9.4s, v2.4s +str q11, [x0, #16] +add v9.4s, v9.4s, v2.4s +str q23, [x0, #80] +str q0, [x0, #528] +str q13, [x0, #592] +str q14, [x0, #656] +str q19, [x0, #720] +str q17, [x0, #784] +str q7, [x0, #848] +str q9, [x0, #912] +str q8, [x0, #976] +ldr q18, [x0, #224] +ldr q1, [x0, #160] +ldr q10, [x0, #32] +ldr q21, [x17, #+128] +ldr q22, [x17, #+144] +sqrdmulh v15.4S, v10.4S, v22.s[0] +mul v10.4S, v10.4S,v21.s[0] +ldr q3, [x0, #48] +sqrdmulh v12.4S, v3.4S, v22.s[0] +mul v3.4S, v3.4S,v21.s[0] +ldr q4, [x17, #+160] +ldr q30, [x17, #+176] +ldr q29, [x0, #96] +sqrdmulh v28.4S, v29.4S, v30.s[0] +mul v29.4S, v29.4S,v4.s[0] +ldr q27, [x0, #112] +sqrdmulh v26.4S, v27.4S, v30.s[0] +mul v27.4S, v27.4S,v4.s[0] +ldr q25, [x17, #+192] +ldr q24, [x17, #+208] +mla v10.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v1.4S, v24.s[0] +ldr q20, [x0, #176] +mla v3.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v20.4S, v24.s[0] +ldr q6, [x17, #+224] +ldr q5, [x17, #+240] +mla v29.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v18.4S, v5.s[0] +ldr q16, [x0, #240] +mla v27.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v16.4S, v5.s[0] +ldr q11, [x0, #0] +ldr q2, [x0, #128] +mul v1.4S, v1.4S,v25.s[0] +sub v23.4s, v11.4s, v10.4s +ldr q0, [x0, #16] +mul v20.4S, v20.4S,v25.s[0] +add v11.4s, v11.4s, v10.4s +ldr q10, [x0, #144] +mla v1.4S, v15.4S, v31.s[0] +sub v15.4s, v0.4s, v3.4s +ldr q13, [x0, #64] +mla v20.4S, v12.4S, v31.s[0] +add v0.4s, v0.4s, v3.4s +ldr q3, [x0, #192] +mul v18.4S, v18.4S,v6.s[0] +sub v12.4s, v13.4s, v29.4s +ldr q14, [x0, #80] +mul v16.4S, v16.4S,v6.s[0] +add v13.4s, v13.4s, v29.4s +ldr q29, [x0, #208] +mla v18.4S, v28.4S, v31.s[0] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v14.4s, v27.4s +sqrdmulh v28.4S, v0.4S, v22.s[1] +add v14.4s, v14.4s, v27.4s +mul v0.4S, v0.4S,v21.s[1] +sqrdmulh v27.4S, v15.4S, v22.s[2] +sub v19.4s, v2.4s, v1.4s +mul v15.4S, v15.4S,v21.s[2] +add v2.4s, v2.4s, v1.4s 
+sqrdmulh v22.4S, v14.4S, v30.s[1] +sub v21.4s, v10.4s, v20.4s +mul v14.4S, v14.4S,v4.s[1] +add v10.4s, v10.4s, v20.4s +sqrdmulh v20.4S, v26.4S, v30.s[2] +sub v1.4s, v3.4s, v18.4s +mul v26.4S, v26.4S,v4.s[2] +add v3.4s, v3.4s, v18.4s +mla v0.4S, v28.4S, v31.s[0] +sub v28.4s, v29.4s, v16.4s +ldr q30, [x0, #480] +sqrdmulh v4.4S, v10.4S, v24.s[1] +add v29.4s, v29.4s, v16.4s +mla v15.4S, v27.4S, v31.s[0] +ldr q27, [x0, #416] +sqrdmulh v16.4S, v21.4S, v24.s[2] +sub v18.4s, v11.4s, v0.4s +mla v14.4S, v22.4S, v31.s[0] +ldr q22, [x0, #288] +sqrdmulh v17.4S, v29.4S, v5.s[1] +add v11.4s, v11.4s, v0.4s +str q18, [x0, #16] +mla v26.4S, v20.4S, v31.s[0] +ldr q20, [x17, #+256] +ldr q18, [x17, #+272] +sqrdmulh v0.4S, v28.4S, v5.s[2] +sub v7.4s, v23.4s, v15.4s +str q11, [x0, #0] +mul v10.4S, v10.4S,v25.s[1] +add v23.4s, v23.4s, v15.4s +mul v21.4S, v21.4S,v25.s[2] +str q7, [x0, #48] +mla v10.4S, v4.4S, v31.s[0] +sub v4.4s, v13.4s, v14.4s +mla v21.4S, v16.4S, v31.s[0] +str q23, [x0, #32] +mul v29.4S, v29.4S,v6.s[1] +str q4, [x0, #80] +mul v28.4S, v28.4S,v6.s[2] +add v13.4s, v13.4s, v14.4s +str q13, [x0, #64] +mla v29.4S, v17.4S, v31.s[0] +sub v17.4s, v12.4s, v26.4s +str q17, [x0, #112] +mla v28.4S, v0.4S, v31.s[0] +add v12.4s, v12.4s, v26.4s +str q12, [x0, #96] +sqrdmulh v5.4S, v22.4S, v18.s[0] +sub v6.4s, v2.4s, v10.4s +mul v22.4S, v22.4S,v20.s[0] +str q6, [x0, #144] +ldr q6, [x0, #304] +sqrdmulh v12.4S, v6.4S, v18.s[0] +add v2.4s, v2.4s, v10.4s +mul v6.4S, v6.4S,v20.s[0] +str q2, [x0, #128] +ldr q2, [x17, #+288] +ldr q10, [x17, #+304] +ldr q26, [x0, #352] +sqrdmulh v0.4S, v26.4S, v10.s[0] +sub v17.4s, v19.4s, v21.4s +mul v26.4S, v26.4S,v2.s[0] +str q17, [x0, #176] +ldr q17, [x0, #368] +sqrdmulh v13.4S, v17.4S, v10.s[0] +add v19.4s, v19.4s, v21.4s +mul v17.4S, v17.4S,v2.s[0] +str q19, [x0, #160] +ldr q19, [x17, #+320] +ldr q21, [x17, #+336] +mla v22.4S, v5.4S, v31.s[0] +sub v5.4s, v3.4s, v29.4s +sqrdmulh v14.4S, v27.4S, v21.s[0] +str q5, [x0, #208] +ldr q5, [x0, #432] +mla v6.4S, 
v12.4S, v31.s[0] +add v3.4s, v3.4s, v29.4s +sqrdmulh v29.4S, v5.4S, v21.s[0] +str q3, [x0, #192] +ldr q3, [x17, #+352] +ldr q12, [x17, #+368] +mla v26.4S, v0.4S, v31.s[0] +sub v0.4s, v1.4s, v28.4s +sqrdmulh v4.4S, v30.4S, v12.s[0] +str q0, [x0, #240] +ldr q0, [x0, #496] +mla v17.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, v28.4s +sqrdmulh v28.4S, v0.4S, v12.s[0] +str q1, [x0, #224] +ldr q1, [x0, #256] +ldr q13, [x0, #384] +mul v27.4S, v27.4S,v19.s[0] +sub v24.4s, v1.4s, v22.4s +ldr q25, [x0, #272] +mul v5.4S, v5.4S,v19.s[0] +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #400] +mla v27.4S, v14.4S, v31.s[0] +sub v14.4s, v25.4s, v6.4s +ldr q23, [x0, #320] +mla v5.4S, v29.4S, v31.s[0] +add v25.4s, v25.4s, v6.4s +ldr q6, [x0, #448] +mul v30.4S, v30.4S,v3.s[0] +sub v29.4s, v23.4s, v26.4s +ldr q16, [x0, #336] +mul v0.4S, v0.4S,v3.s[0] +add v23.4s, v23.4s, v26.4s +ldr q26, [x0, #464] +mla v30.4S, v4.4S, v31.s[0] +mla v0.4S, v28.4S, v31.s[0] +sub v28.4s, v16.4s, v17.4s +sqrdmulh v4.4S, v25.4S, v18.s[1] +add v16.4s, v16.4s, v17.4s +mul v25.4S, v25.4S,v20.s[1] +sqrdmulh v17.4S, v14.4S, v18.s[2] +sub v7.4s, v13.4s, v27.4s +mul v14.4S, v14.4S,v20.s[2] +add v13.4s, v13.4s, v27.4s +sqrdmulh v18.4S, v16.4S, v10.s[1] +sub v20.4s, v22.4s, v5.4s +mul v16.4S, v16.4S,v2.s[1] +add v22.4s, v22.4s, v5.4s +sqrdmulh v5.4S, v28.4S, v10.s[2] +sub v27.4s, v6.4s, v30.4s +mul v28.4S, v28.4S,v2.s[2] +add v6.4s, v6.4s, v30.4s +mla v25.4S, v4.4S, v31.s[0] +sub v4.4s, v26.4s, v0.4s +ldr q10, [x0, #736] +sqrdmulh v2.4S, v22.4S, v21.s[1] +add v26.4s, v26.4s, v0.4s +mla v14.4S, v17.4S, v31.s[0] +ldr q17, [x0, #672] +sqrdmulh v0.4S, v20.4S, v21.s[2] +sub v30.4s, v1.4s, v25.4s +mla v16.4S, v18.4S, v31.s[0] +ldr q18, [x0, #544] +sqrdmulh v15.4S, v26.4S, v12.s[1] +add v1.4s, v1.4s, v25.4s +str q30, [x0, #272] +mla v28.4S, v5.4S, v31.s[0] +ldr q5, [x17, #+384] +ldr q30, [x17, #+400] +sqrdmulh v25.4S, v4.4S, v12.s[2] +sub v11.4s, v24.4s, v14.4s +str q1, [x0, #256] +mul v22.4S, v22.4S,v19.s[1] +add v24.4s, v24.4s, 
v14.4s +mul v20.4S, v20.4S,v19.s[2] +str q11, [x0, #304] +mla v22.4S, v2.4S, v31.s[0] +sub v2.4s, v23.4s, v16.4s +mla v20.4S, v0.4S, v31.s[0] +str q24, [x0, #288] +mul v26.4S, v26.4S,v3.s[1] +str q2, [x0, #336] +mul v4.4S, v4.4S,v3.s[2] +add v23.4s, v23.4s, v16.4s +str q23, [x0, #320] +mla v26.4S, v15.4S, v31.s[0] +sub v15.4s, v29.4s, v28.4s +str q15, [x0, #368] +mla v4.4S, v25.4S, v31.s[0] +add v29.4s, v29.4s, v28.4s +str q29, [x0, #352] +sqrdmulh v12.4S, v18.4S, v30.s[0] +sub v3.4s, v13.4s, v22.4s +mul v18.4S, v18.4S,v5.s[0] +str q3, [x0, #400] +ldr q3, [x0, #560] +sqrdmulh v29.4S, v3.4S, v30.s[0] +add v13.4s, v13.4s, v22.4s +mul v3.4S, v3.4S,v5.s[0] +str q13, [x0, #384] +ldr q13, [x17, #+416] +ldr q22, [x17, #+432] +ldr q28, [x0, #608] +sqrdmulh v25.4S, v28.4S, v22.s[0] +sub v15.4s, v7.4s, v20.4s +mul v28.4S, v28.4S,v13.s[0] +str q15, [x0, #432] +ldr q15, [x0, #624] +sqrdmulh v23.4S, v15.4S, v22.s[0] +add v7.4s, v7.4s, v20.4s +mul v15.4S, v15.4S,v13.s[0] +str q7, [x0, #416] +ldr q7, [x17, #+448] +ldr q20, [x17, #+464] +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v6.4s, v26.4s +sqrdmulh v16.4S, v17.4S, v20.s[0] +str q12, [x0, #464] +ldr q12, [x0, #688] +mla v3.4S, v29.4S, v31.s[0] +add v6.4s, v6.4s, v26.4s +sqrdmulh v26.4S, v12.4S, v20.s[0] +str q6, [x0, #448] +ldr q6, [x17, #+480] +ldr q29, [x17, #+496] +mla v28.4S, v25.4S, v31.s[0] +sub v25.4s, v27.4s, v4.4s +sqrdmulh v2.4S, v10.4S, v29.s[0] +str q25, [x0, #496] +ldr q25, [x0, #752] +mla v15.4S, v23.4S, v31.s[0] +add v27.4s, v27.4s, v4.4s +sqrdmulh v4.4S, v25.4S, v29.s[0] +str q27, [x0, #480] +ldr q27, [x0, #512] +ldr q23, [x0, #640] +mul v17.4S, v17.4S,v7.s[0] +sub v21.4s, v27.4s, v18.4s +ldr q19, [x0, #528] +mul v12.4S, v12.4S,v7.s[0] +add v27.4s, v27.4s, v18.4s +ldr q18, [x0, #656] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v19.4s, v3.4s +ldr q24, [x0, #576] +mla v12.4S, v26.4S, v31.s[0] +add v19.4s, v19.4s, v3.4s +ldr q3, [x0, #704] +mul v10.4S, v10.4S,v6.s[0] +sub v26.4s, v24.4s, v28.4s +ldr q0, [x0, #592] 
+mul v25.4S, v25.4S,v6.s[0] +add v24.4s, v24.4s, v28.4s +ldr q28, [x0, #720] +mla v10.4S, v2.4S, v31.s[0] +mla v25.4S, v4.4S, v31.s[0] +sub v4.4s, v0.4s, v15.4s +sqrdmulh v2.4S, v19.4S, v30.s[1] +add v0.4s, v0.4s, v15.4s +mul v19.4S, v19.4S,v5.s[1] +sqrdmulh v15.4S, v16.4S, v30.s[2] +sub v11.4s, v23.4s, v17.4s +mul v16.4S, v16.4S,v5.s[2] +add v23.4s, v23.4s, v17.4s +sqrdmulh v30.4S, v0.4S, v22.s[1] +sub v5.4s, v18.4s, v12.4s +mul v0.4S, v0.4S,v13.s[1] +add v18.4s, v18.4s, v12.4s +sqrdmulh v12.4S, v4.4S, v22.s[2] +sub v17.4s, v3.4s, v10.4s +mul v4.4S, v4.4S,v13.s[2] +add v3.4s, v3.4s, v10.4s +mla v19.4S, v2.4S, v31.s[0] +sub v2.4s, v28.4s, v25.4s +ldr q22, [x0, #992] +sqrdmulh v13.4S, v18.4S, v20.s[1] +add v28.4s, v28.4s, v25.4s +mla v16.4S, v15.4S, v31.s[0] +ldr q15, [x0, #928] +sqrdmulh v25.4S, v5.4S, v20.s[2] +sub v10.4s, v27.4s, v19.4s +mla v0.4S, v30.4S, v31.s[0] +ldr q30, [x0, #800] +sqrdmulh v14.4S, v28.4S, v29.s[1] +add v27.4s, v27.4s, v19.4s +str q10, [x0, #528] +mla v4.4S, v12.4S, v31.s[0] +ldr q12, [x17, #+512] +ldr q10, [x17, #+528] +sqrdmulh v19.4S, v2.4S, v29.s[2] +sub v1.4s, v21.4s, v16.4s +str q27, [x0, #512] +mul v18.4S, v18.4S,v7.s[1] +add v21.4s, v21.4s, v16.4s +mul v5.4S, v5.4S,v7.s[2] +str q1, [x0, #560] +mla v18.4S, v13.4S, v31.s[0] +sub v13.4s, v24.4s, v0.4s +mla v5.4S, v25.4S, v31.s[0] +str q21, [x0, #544] +mul v28.4S, v28.4S,v6.s[1] +str q13, [x0, #592] +mul v2.4S, v2.4S,v6.s[2] +add v24.4s, v24.4s, v0.4s +str q24, [x0, #576] +mla v28.4S, v14.4S, v31.s[0] +sub v14.4s, v26.4s, v4.4s +str q14, [x0, #624] +mla v2.4S, v19.4S, v31.s[0] +add v26.4s, v26.4s, v4.4s +str q26, [x0, #608] +sqrdmulh v29.4S, v30.4S, v10.s[0] +sub v6.4s, v23.4s, v18.4s +mul v30.4S, v30.4S,v12.s[0] +str q6, [x0, #656] +ldr q6, [x0, #816] +sqrdmulh v26.4S, v6.4S, v10.s[0] +add v23.4s, v23.4s, v18.4s +mul v6.4S, v6.4S,v12.s[0] +str q23, [x0, #640] +ldr q23, [x17, #+544] +ldr q18, [x17, #+560] +ldr q4, [x0, #864] +sqrdmulh v19.4S, v4.4S, v18.s[0] +sub v14.4s, v11.4s, v5.4s 
+mul v4.4S, v4.4S,v23.s[0] +str q14, [x0, #688] +ldr q14, [x0, #880] +sqrdmulh v24.4S, v14.4S, v18.s[0] +add v11.4s, v11.4s, v5.4s +mul v14.4S, v14.4S,v23.s[0] +str q11, [x0, #672] +ldr q11, [x17, #+576] +ldr q5, [x17, #+592] +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v3.4s, v28.4s +sqrdmulh v0.4S, v15.4S, v5.s[0] +str q29, [x0, #720] +ldr q29, [x0, #944] +mla v6.4S, v26.4S, v31.s[0] +add v3.4s, v3.4s, v28.4s +sqrdmulh v28.4S, v29.4S, v5.s[0] +str q3, [x0, #704] +ldr q3, [x17, #+608] +ldr q26, [x17, #+624] +mla v4.4S, v19.4S, v31.s[0] +sub v19.4s, v17.4s, v2.4s +sqrdmulh v13.4S, v22.4S, v26.s[0] +str q19, [x0, #752] +ldr q19, [x0, #1008] +mla v14.4S, v24.4S, v31.s[0] +add v17.4s, v17.4s, v2.4s +sqrdmulh v2.4S, v19.4S, v26.s[0] +str q17, [x0, #736] +ldr q17, [x0, #768] +ldr q24, [x0, #896] +mul v15.4S, v15.4S,v11.s[0] +sub v20.4s, v17.4s, v30.4s +ldr q7, [x0, #784] +mul v29.4S, v29.4S,v11.s[0] +add v17.4s, v17.4s, v30.4s +ldr q30, [x0, #912] +mla v15.4S, v0.4S, v31.s[0] +sub v0.4s, v7.4s, v6.4s +ldr q21, [x0, #832] +mla v29.4S, v28.4S, v31.s[0] +add v7.4s, v7.4s, v6.4s +ldr q6, [x0, #960] +mul v22.4S, v22.4S,v3.s[0] +sub v28.4s, v21.4s, v4.4s +ldr q25, [x0, #848] +mul v19.4S, v19.4S,v3.s[0] +add v21.4s, v21.4s, v4.4s +ldr q4, [x0, #976] +mla v22.4S, v13.4S, v31.s[0] +mla v19.4S, v2.4S, v31.s[0] +sub v2.4s, v25.4s, v14.4s +sqrdmulh v13.4S, v7.4S, v10.s[1] +add v25.4s, v25.4s, v14.4s +mul v7.4S, v7.4S,v12.s[1] +sqrdmulh v14.4S, v0.4S, v10.s[2] +sub v1.4s, v24.4s, v15.4s +mul v0.4S, v0.4S,v12.s[2] +add v24.4s, v24.4s, v15.4s +sqrdmulh v10.4S, v25.4S, v18.s[1] +sub v12.4s, v30.4s, v29.4s +mul v25.4S, v25.4S,v23.s[1] +add v30.4s, v30.4s, v29.4s +sqrdmulh v29.4S, v2.4S, v18.s[2] +sub v15.4s, v6.4s, v22.4s +mul v2.4S, v2.4S,v23.s[2] +add v6.4s, v6.4s, v22.4s +mla v7.4S, v13.4S, v31.s[0] +sub v13.4s, v4.4s, v19.4s +sqrdmulh v18.4S, v30.4S, v5.s[1] +add v4.4s, v4.4s, v19.4s +mla v0.4S, v14.4S, v31.s[0] +sqrdmulh v14.4S, v12.4S, v5.s[2] +sub v19.4s, v17.4s, v7.4s +mla v25.4S, 
v10.4S, v31.s[0] +sqrdmulh v10.4S, v4.4S, v26.s[1] +add v17.4s, v17.4s, v7.4s +str q19, [x0, #784] +mla v2.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v13.4S, v26.s[2] +sub v19.4s, v20.4s, v0.4s +str q17, [x0, #768] +mul v30.4S, v30.4S,v11.s[1] +add v20.4s, v20.4s, v0.4s +mul v12.4S, v12.4S,v11.s[2] +str q19, [x0, #816] +mla v30.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v25.4s +mla v12.4S, v14.4S, v31.s[0] +str q20, [x0, #800] +mul v4.4S, v4.4S,v3.s[1] +str q18, [x0, #848] +mul v13.4S, v13.4S,v3.s[2] +add v21.4s, v21.4s, v25.4s +str q21, [x0, #832] +mla v4.4S, v10.4S, v31.s[0] +sub v10.4s, v28.4s, v2.4s +str q10, [x0, #880] +mla v13.4S, v29.4S, v31.s[0] +add v28.4s, v28.4s, v2.4s +str q28, [x0, #864] +sub v26.4s, v24.4s, v30.4s +str q26, [x0, #912] +add v24.4s, v24.4s, v30.4s +str q24, [x0, #896] +sub v24.4s, v1.4s, v12.4s +str q24, [x0, #944] +add v1.4s, v1.4s, v12.4s +str q1, [x0, #928] +sub v1.4s, v6.4s, v4.4s +str q1, [x0, #976] +add v6.4s, v6.4s, v4.4s +str q6, [x0, #960] +sub v6.4s, v15.4s, v13.4s +str q6, [x0, #1008] +add v15.4s, v15.4s, v13.4s +str q15, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1528 +// Instruction count: 1524 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_11_z4_7.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_11_z4_7.s new file mode 100644 index 0000000..025fcad --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_11_z4_7.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person 
obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global 
ntt_u32_incomplete_neon_asm_var_4_2_11_z4_7
+.global _ntt_u32_incomplete_neon_asm_var_4_2_11_z4_7
+ntt_u32_incomplete_neon_asm_var_4_2_11_z4_7:
+_ntt_u32_incomplete_neon_asm_var_4_2_11_z4_7: // same entry point, exported with and without a leading underscore
+// Save GPRs: 96-byte frame, x19-x28 stored as pairs, x29 at offset 80 (top 8 bytes unused)
+sub sp, sp, #(16*5+16)
+stp x19, x20, [sp, #16*0]
+stp x19, x20, [sp, #16*0] // NOTE(review): exact repeat of the previous store - redundant but harmless
+stp x21, x22, [sp, #16*1]
+stp x23, x24, [sp, #16*2]
+stp x25, x26, [sp, #16*3]
+stp x27, x28, [sp, #16*4]
+str x29, [sp, #16*5]
+// Save NEON vector registers: d8-d15 (low halves of v8-v15, which the body overwrites) in a further 64 bytes
+sub sp, sp, #(16*4)
+stp d8, d9, [sp, #16*0]
+stp d10, d11, [sp, #16*1]
+stp d12, d13, [sp, #16*2]
+stp d14, d15, [sp, #16*3]
+ASM_LOAD (x17, modulus) // macro defined outside this file; presumably leaves the address of `modulus` in x17 - confirm
+ldr q31, [x17] // v31.s[0] is the by-element operand of every mla in this routine
+ASM_LOAD(x17, roots_merged) // presumably x17 = address of the constant table; every later [x17, #..] load indexes it
+ldr q30, [x0, #928] // x0 is the only data pointer: the routine loads and stores [x0, #0] .. [x0, #1023] in place
+ldr q29, [x17, #+0] // v29 lanes feed the mul instructions below
+ldr q28, [x17, #+16] // v28 lanes feed the matching sqrdmulh instructions
+sqrdmulh v27.4S, v30.4S, v28.s[0] // recurring pattern: sqrdmulh + mul by paired constants, then mla with v31.s[0]
+mul v30.4S, v30.4S,v29.s[0]
+ldr q26, [x0, #992]
+sqrdmulh v25.4S, v26.4S, v28.s[0]
+mul v26.4S, v26.4S,v29.s[0]
+ldr q24, [x0, #800]
+sqrdmulh v23.4S, v24.4S, v28.s[0]
+mul v24.4S, v24.4S,v29.s[0]
+ldr q22, [x0, #864]
+sqrdmulh v21.4S, v22.4S, v28.s[0]
+mul v22.4S, v22.4S,v29.s[0]
+ldr q20, [x0, #544]
+mla v30.4S, v27.4S, v31.s[0]
+sqrdmulh v27.4S, v20.4S, v28.s[0]
+ldr q19, [x0, #608]
+mla v26.4S, v25.4S, v31.s[0]
+sqrdmulh v25.4S, v19.4S, v28.s[0]
+ldr q18, [x0, #672]
+mla v24.4S, v23.4S, v31.s[0]
+sqrdmulh v23.4S, v18.4S, v28.s[0]
+ldr q17, [x0, #736]
+mla v22.4S, v21.4S, v31.s[0]
+sqrdmulh v21.4S, v17.4S, v28.s[0]
+ldr q16, [x0, #416]
+ldr q3, [x0, #480]
+mul v20.4S, v20.4S,v29.s[0]
+sub v2.4s, v16.4s, v30.4s // butterfly: difference and sum of [x0, #416] with the scaled [x0, #928]
+mul v19.4S, v19.4S,v29.s[0]
+add v16.4s, v16.4s, v30.4s
+ldr q30, [x0, #288]
+ldr q1, [x0, #352]
+mla v20.4S, v27.4S, v31.s[0]
+sub v27.4s, v3.4s, v26.4s
+mla v19.4S, v25.4S, v31.s[0]
+add v3.4s, v3.4s, v26.4s
+ldr q26, [x0, #32]
+ldr q25, [x0, #96]
+mul v18.4S, v18.4S,v29.s[0]
+sub v0.4s, v30.4s, v24.4s
+mul v17.4S, v17.4S,v29.s[0]
+add v30.4s, v30.4s, v24.4s
+ldr q24, [x0, #160]
+ldr q15, [x0, #224]
+mla v18.4S, v23.4S, v31.s[0]
+sub v23.4s, v1.4s, v22.4s
+mla v17.4S, v21.4S, v31.s[0]
+add v1.4s, v1.4s, v22.4s
+sqrdmulh
v22.4S, v16.4S, v28.s[1] +mul v16.4S, v16.4S,v29.s[1] +sqrdmulh v21.4S, v3.4S, v28.s[1] +sub v14.4s, v26.4s, v20.4s +mul v3.4S, v3.4S,v29.s[1] +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v30.4S, v28.s[1] +sub v13.4s, v25.4s, v19.4s +mul v30.4S, v30.4S,v29.s[1] +add v25.4s, v25.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v28.s[1] +sub v12.4s, v24.4s, v18.4s +mul v1.4S, v1.4S,v29.s[1] +add v24.4s, v24.4s, v18.4s +mla v16.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v17.4s +sqrdmulh v18.4S, v2.4S, v28.s[2] +add v15.4s, v15.4s, v17.4s +mla v3.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v27.4S, v28.s[2] +mla v30.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v0.4S, v28.s[2] +mla v1.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v23.4S, v28.s[2] +ldr q17, [x17, #+32] +ldr q11, [x17, #+48] +mul v2.4S, v2.4S,v29.s[2] +sub v10.4s, v24.4s, v16.4s +mul v27.4S, v27.4S,v29.s[2] +add v24.4s, v24.4s, v16.4s +mla v2.4S, v18.4S, v31.s[0] +sub v18.4s, v15.4s, v3.4s +mla v27.4S, v21.4S, v31.s[0] +add v15.4s, v15.4s, v3.4s +mul v0.4S, v0.4S,v29.s[2] +sub v3.4s, v26.4s, v30.4s +mul v23.4S, v23.4S,v29.s[2] +add v26.4s, v26.4s, v30.4s +mla v0.4S, v20.4S, v31.s[0] +sub v20.4s, v25.4s, v1.4s +mla v23.4S, v19.4S, v31.s[0] +add v25.4s, v25.4s, v1.4s +sqrdmulh v1.4S, v10.4S, v11.s[1] +mul v10.4S, v10.4S,v17.s[1] +sqrdmulh v19.4S, v18.4S, v11.s[1] +sub v30.4s, v12.4s, v2.4s +mul v18.4S, v18.4S,v17.s[1] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v24.4S, v11.s[0] +sub v21.4s, v22.4s, v27.4s +mul v24.4S, v24.4S,v17.s[0] +add v22.4s, v22.4s, v27.4s +sqrdmulh v27.4S, v15.4S, v11.s[0] +sub v16.4s, v14.4s, v0.4s +mul v15.4S, v15.4S,v17.s[0] +add v14.4s, v14.4s, v0.4s +ldr q0, [x17, #+64] +ldr q9, [x17, #+80] +mla v10.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v23.4s +sqrdmulh v8.4S, v12.4S, v11.s[2] +add v13.4s, v13.4s, v23.4s +mla v18.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v22.4S, v11.s[2] +mla v24.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v30.4S, v11.s[3] +mla v15.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v21.4S, v11.s[3] +ldr q23, [x17, 
#+96] +ldr q7, [x17, #+112] +mul v12.4S, v12.4S,v17.s[2] +sub v6.4s, v3.4s, v10.4s +mul v22.4S, v22.4S,v17.s[2] +add v3.4s, v3.4s, v10.4s +mla v12.4S, v8.4S, v31.s[0] +sub v8.4s, v20.4s, v18.4s +mla v22.4S, v19.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +mul v30.4S, v30.4S,v17.s[3] +sub v18.4s, v26.4s, v24.4s +mul v21.4S, v21.4S,v17.s[3] +add v26.4s, v26.4s, v24.4s +mla v30.4S, v2.4S, v31.s[0] +sub v2.4s, v25.4s, v15.4s +mla v21.4S, v27.4S, v31.s[0] +add v25.4s, v25.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v9.s[2] +mul v20.4S, v20.4S,v0.s[2] +sqrdmulh v27.4S, v8.4S, v9.s[3] +sub v24.4s, v14.4s, v12.4s +mul v8.4S, v8.4S,v0.s[3] +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v9.s[1] +sub v19.4s, v13.4s, v22.4s +mul v2.4S, v2.4S,v0.s[1] +add v13.4s, v13.4s, v22.4s +sqrdmulh v22.4S, v25.4S, v9.s[0] +sub v10.4s, v16.4s, v30.4s +mul v25.4S, v25.4S,v0.s[0] +add v16.4s, v16.4s, v30.4s +mla v20.4S, v15.4S, v31.s[0] +sub v15.4s, v1.4s, v21.4s +sqrdmulh v30.4S, v13.4S, v7.s[0] +add v1.4s, v1.4s, v21.4s +mla v8.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v19.4S, v7.s[1] +mla v2.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v1.4S, v7.s[2] +mla v25.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v15.4S, v7.s[3] +mul v13.4S, v13.4S,v23.s[0] +sub v21.4s, v3.4s, v20.4s +str q21, [x0, #352] +mul v19.4S, v19.4S,v23.s[1] +add v3.4s, v3.4s, v20.4s +str q3, [x0, #288] +mla v13.4S, v30.4S, v31.s[0] +sub v30.4s, v6.4s, v8.4s +str q30, [x0, #480] +mla v19.4S, v27.4S, v31.s[0] +add v6.4s, v6.4s, v8.4s +str q6, [x0, #416] +mul v1.4S, v1.4S,v23.s[2] +sub v6.4s, v18.4s, v2.4s +str q6, [x0, #224] +mul v15.4S, v15.4S,v23.s[3] +add v18.4s, v18.4s, v2.4s +str q18, [x0, #160] +mla v1.4S, v12.4S, v31.s[0] +sub v12.4s, v26.4s, v25.4s +str q12, [x0, #96] +mla v15.4S, v22.4S, v31.s[0] +add v26.4s, v26.4s, v25.4s +str q26, [x0, #32] +ldr q26, [x0, #944] +sqrdmulh v25.4S, v26.4S, v28.s[0] +mul v26.4S, v26.4S,v29.s[0] +ldr q22, [x0, #1008] +sqrdmulh v12.4S, v22.4S, v28.s[0] +sub v18.4s, v14.4s, v13.4s +str q18, [x0, #608] 
+mul v22.4S, v22.4S,v29.s[0] +add v14.4s, v14.4s, v13.4s +str q14, [x0, #544] +ldr q14, [x0, #816] +sqrdmulh v13.4S, v14.4S, v28.s[0] +sub v18.4s, v24.4s, v19.4s +str q18, [x0, #736] +mul v14.4S, v14.4S,v29.s[0] +add v24.4s, v24.4s, v19.4s +str q24, [x0, #672] +ldr q24, [x0, #880] +sqrdmulh v19.4S, v24.4S, v28.s[0] +sub v18.4s, v16.4s, v1.4s +str q18, [x0, #864] +mul v24.4S, v24.4S,v29.s[0] +add v16.4s, v16.4s, v1.4s +str q16, [x0, #800] +ldr q16, [x0, #560] +mla v26.4S, v25.4S, v31.s[0] +sub v25.4s, v10.4s, v15.4s +str q25, [x0, #992] +sqrdmulh v25.4S, v16.4S, v28.s[0] +add v10.4s, v10.4s, v15.4s +str q10, [x0, #928] +ldr q10, [x0, #624] +mla v22.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v10.4S, v28.s[0] +ldr q15, [x0, #688] +mla v14.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v15.4S, v28.s[0] +ldr q1, [x0, #752] +mla v24.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v1.4S, v28.s[0] +ldr q18, [x0, #432] +ldr q2, [x0, #496] +mul v16.4S, v16.4S,v29.s[0] +sub v6.4s, v18.4s, v26.4s +mul v10.4S, v10.4S,v29.s[0] +add v18.4s, v18.4s, v26.4s +ldr q26, [x0, #304] +ldr q8, [x0, #368] +mla v16.4S, v25.4S, v31.s[0] +sub v25.4s, v2.4s, v22.4s +mla v10.4S, v12.4S, v31.s[0] +add v2.4s, v2.4s, v22.4s +ldr q22, [x0, #48] +ldr q12, [x0, #112] +mul v15.4S, v15.4S,v29.s[0] +sub v27.4s, v26.4s, v14.4s +mul v1.4S, v1.4S,v29.s[0] +add v26.4s, v26.4s, v14.4s +ldr q14, [x0, #176] +ldr q30, [x0, #240] +mla v15.4S, v13.4S, v31.s[0] +sub v13.4s, v8.4s, v24.4s +mla v1.4S, v19.4S, v31.s[0] +add v8.4s, v8.4s, v24.4s +sqrdmulh v24.4S, v18.4S, v28.s[1] +mul v18.4S, v18.4S,v29.s[1] +sqrdmulh v19.4S, v2.4S, v28.s[1] +sub v3.4s, v22.4s, v16.4s +mul v2.4S, v2.4S,v29.s[1] +add v22.4s, v22.4s, v16.4s +sqrdmulh v16.4S, v26.4S, v28.s[1] +sub v20.4s, v12.4s, v10.4s +mul v26.4S, v26.4S,v29.s[1] +add v12.4s, v12.4s, v10.4s +sqrdmulh v10.4S, v8.4S, v28.s[1] +sub v21.4s, v14.4s, v15.4s +mul v8.4S, v8.4S,v29.s[1] +add v14.4s, v14.4s, v15.4s +mla v18.4S, v24.4S, v31.s[0] +sub v24.4s, v30.4s, v1.4s +sqrdmulh v15.4S, v6.4S, 
v28.s[2] +add v30.4s, v30.4s, v1.4s +mla v2.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v25.4S, v28.s[2] +mla v26.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v27.4S, v28.s[2] +mla v8.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v13.4S, v28.s[2] +mul v6.4S, v6.4S,v29.s[2] +sub v1.4s, v14.4s, v18.4s +mul v25.4S, v25.4S,v29.s[2] +add v14.4s, v14.4s, v18.4s +mla v6.4S, v15.4S, v31.s[0] +sub v15.4s, v30.4s, v2.4s +mla v25.4S, v19.4S, v31.s[0] +add v30.4s, v30.4s, v2.4s +mul v27.4S, v27.4S,v29.s[2] +sub v2.4s, v22.4s, v26.4s +mul v13.4S, v13.4S,v29.s[2] +add v22.4s, v22.4s, v26.4s +mla v27.4S, v16.4S, v31.s[0] +sub v16.4s, v12.4s, v8.4s +mla v13.4S, v10.4S, v31.s[0] +add v12.4s, v12.4s, v8.4s +sqrdmulh v8.4S, v1.4S, v11.s[1] +mul v1.4S, v1.4S,v17.s[1] +sqrdmulh v10.4S, v15.4S, v11.s[1] +sub v26.4s, v21.4s, v6.4s +mul v15.4S, v15.4S,v17.s[1] +add v21.4s, v21.4s, v6.4s +sqrdmulh v6.4S, v14.4S, v11.s[0] +sub v19.4s, v24.4s, v25.4s +mul v14.4S, v14.4S,v17.s[0] +add v24.4s, v24.4s, v25.4s +sqrdmulh v25.4S, v30.4S, v11.s[0] +sub v18.4s, v3.4s, v27.4s +mul v30.4S, v30.4S,v17.s[0] +add v3.4s, v3.4s, v27.4s +mla v1.4S, v8.4S, v31.s[0] +sub v8.4s, v20.4s, v13.4s +sqrdmulh v27.4S, v21.4S, v11.s[2] +add v20.4s, v20.4s, v13.4s +mla v15.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v24.4S, v11.s[2] +mla v14.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v26.4S, v11.s[3] +mla v30.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v19.4S, v11.s[3] +mul v21.4S, v21.4S,v17.s[2] +sub v13.4s, v2.4s, v1.4s +mul v24.4S, v24.4S,v17.s[2] +add v2.4s, v2.4s, v1.4s +mla v21.4S, v27.4S, v31.s[0] +sub v27.4s, v16.4s, v15.4s +mla v24.4S, v10.4S, v31.s[0] +add v16.4s, v16.4s, v15.4s +mul v26.4S, v26.4S,v17.s[3] +sub v15.4s, v22.4s, v14.4s +mul v19.4S, v19.4S,v17.s[3] +add v22.4s, v22.4s, v14.4s +mla v26.4S, v6.4S, v31.s[0] +sub v6.4s, v12.4s, v30.4s +mla v19.4S, v25.4S, v31.s[0] +add v12.4s, v12.4s, v30.4s +sqrdmulh v30.4S, v16.4S, v9.s[2] +mul v16.4S, v16.4S,v0.s[2] +sqrdmulh v25.4S, v27.4S, v9.s[3] +sub v14.4s, v3.4s, v21.4s +mul v27.4S, 
v27.4S,v0.s[3] +add v3.4s, v3.4s, v21.4s +sqrdmulh v21.4S, v6.4S, v9.s[1] +sub v10.4s, v20.4s, v24.4s +mul v6.4S, v6.4S,v0.s[1] +add v20.4s, v20.4s, v24.4s +sqrdmulh v24.4S, v12.4S, v9.s[0] +sub v1.4s, v18.4s, v26.4s +mul v12.4S, v12.4S,v0.s[0] +add v18.4s, v18.4s, v26.4s +mla v16.4S, v30.4S, v31.s[0] +sub v30.4s, v8.4s, v19.4s +sqrdmulh v26.4S, v20.4S, v7.s[0] +add v8.4s, v8.4s, v19.4s +mla v27.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v10.4S, v7.s[1] +mla v6.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v8.4S, v7.s[2] +mla v12.4S, v24.4S, v31.s[0] +sqrdmulh v24.4S, v30.4S, v7.s[3] +mul v20.4S, v20.4S,v23.s[0] +sub v19.4s, v2.4s, v16.4s +str q19, [x0, #368] +mul v10.4S, v10.4S,v23.s[1] +add v2.4s, v2.4s, v16.4s +str q2, [x0, #304] +mla v20.4S, v26.4S, v31.s[0] +sub v26.4s, v13.4s, v27.4s +str q26, [x0, #496] +mla v10.4S, v25.4S, v31.s[0] +add v13.4s, v13.4s, v27.4s +str q13, [x0, #432] +mul v8.4S, v8.4S,v23.s[2] +sub v13.4s, v15.4s, v6.4s +str q13, [x0, #240] +mul v30.4S, v30.4S,v23.s[3] +add v15.4s, v15.4s, v6.4s +str q15, [x0, #176] +mla v8.4S, v21.4S, v31.s[0] +sub v21.4s, v22.4s, v12.4s +str q21, [x0, #112] +mla v30.4S, v24.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +str q22, [x0, #48] +ldr q22, [x0, #896] +sqrdmulh v12.4S, v22.4S, v28.s[0] +mul v22.4S, v22.4S,v29.s[0] +ldr q24, [x0, #960] +sqrdmulh v21.4S, v24.4S, v28.s[0] +sub v15.4s, v3.4s, v20.4s +str q15, [x0, #624] +mul v24.4S, v24.4S,v29.s[0] +add v3.4s, v3.4s, v20.4s +str q3, [x0, #560] +ldr q3, [x0, #768] +sqrdmulh v20.4S, v3.4S, v28.s[0] +sub v15.4s, v14.4s, v10.4s +str q15, [x0, #752] +mul v3.4S, v3.4S,v29.s[0] +add v14.4s, v14.4s, v10.4s +str q14, [x0, #688] +ldr q14, [x0, #832] +sqrdmulh v10.4S, v14.4S, v28.s[0] +sub v15.4s, v18.4s, v8.4s +str q15, [x0, #880] +mul v14.4S, v14.4S,v29.s[0] +add v18.4s, v18.4s, v8.4s +str q18, [x0, #816] +ldr q18, [x0, #512] +mla v22.4S, v12.4S, v31.s[0] +sub v12.4s, v1.4s, v30.4s +str q12, [x0, #1008] +sqrdmulh v12.4S, v18.4S, v28.s[0] +add v1.4s, v1.4s, v30.4s +str q1, [x0, 
#944] +ldr q1, [x0, #576] +mla v24.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v1.4S, v28.s[0] +ldr q30, [x0, #640] +mla v3.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v30.4S, v28.s[0] +ldr q8, [x0, #704] +mla v14.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v8.4S, v28.s[0] +ldr q15, [x0, #384] +ldr q6, [x0, #448] +mul v18.4S, v18.4S,v29.s[0] +sub v13.4s, v15.4s, v22.4s +mul v1.4S, v1.4S,v29.s[0] +add v15.4s, v15.4s, v22.4s +ldr q22, [x0, #256] +ldr q27, [x0, #320] +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v6.4s, v24.4s +mla v1.4S, v21.4S, v31.s[0] +add v6.4s, v6.4s, v24.4s +ldr q24, [x0, #0] +ldr q21, [x0, #64] +mul v30.4S, v30.4S,v29.s[0] +sub v25.4s, v22.4s, v3.4s +mul v8.4S, v8.4S,v29.s[0] +add v22.4s, v22.4s, v3.4s +ldr q3, [x0, #128] +ldr q26, [x0, #192] +mla v30.4S, v20.4S, v31.s[0] +sub v20.4s, v27.4s, v14.4s +mla v8.4S, v10.4S, v31.s[0] +add v27.4s, v27.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v28.s[1] +mul v15.4S, v15.4S,v29.s[1] +sqrdmulh v10.4S, v6.4S, v28.s[1] +sub v2.4s, v24.4s, v18.4s +mul v6.4S, v6.4S,v29.s[1] +add v24.4s, v24.4s, v18.4s +sqrdmulh v18.4S, v22.4S, v28.s[1] +sub v16.4s, v21.4s, v1.4s +mul v22.4S, v22.4S,v29.s[1] +add v21.4s, v21.4s, v1.4s +sqrdmulh v1.4S, v27.4S, v28.s[1] +sub v19.4s, v3.4s, v30.4s +mul v27.4S, v27.4S,v29.s[1] +add v3.4s, v3.4s, v30.4s +mla v15.4S, v14.4S, v31.s[0] +sub v14.4s, v26.4s, v8.4s +sqrdmulh v30.4S, v13.4S, v28.s[2] +add v26.4s, v26.4s, v8.4s +mla v6.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v12.4S, v28.s[2] +mla v22.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v25.4S, v28.s[2] +mla v27.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v20.4S, v28.s[2] +mul v13.4S, v13.4S,v29.s[2] +sub v8.4s, v3.4s, v15.4s +mul v12.4S, v12.4S,v29.s[2] +add v3.4s, v3.4s, v15.4s +mla v13.4S, v30.4S, v31.s[0] +sub v30.4s, v26.4s, v6.4s +mla v12.4S, v10.4S, v31.s[0] +add v26.4s, v26.4s, v6.4s +mul v25.4S, v25.4S,v29.s[2] +sub v6.4s, v24.4s, v22.4s +mul v20.4S, v20.4S,v29.s[2] +add v24.4s, v24.4s, v22.4s +mla v25.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v27.4s +mla 
v20.4S, v1.4S, v31.s[0] +add v21.4s, v21.4s, v27.4s +sqrdmulh v27.4S, v8.4S, v11.s[1] +mul v8.4S, v8.4S,v17.s[1] +sqrdmulh v1.4S, v30.4S, v11.s[1] +sub v22.4s, v19.4s, v13.4s +mul v30.4S, v30.4S,v17.s[1] +add v19.4s, v19.4s, v13.4s +sqrdmulh v13.4S, v3.4S, v11.s[0] +sub v10.4s, v14.4s, v12.4s +mul v3.4S, v3.4S,v17.s[0] +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v26.4S, v11.s[0] +sub v15.4s, v2.4s, v25.4s +mul v26.4S, v26.4S,v17.s[0] +add v2.4s, v2.4s, v25.4s +mla v8.4S, v27.4S, v31.s[0] +sub v27.4s, v16.4s, v20.4s +sqrdmulh v25.4S, v19.4S, v11.s[2] +add v16.4s, v16.4s, v20.4s +mla v30.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v14.4S, v11.s[2] +mla v3.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v22.4S, v11.s[3] +mla v26.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v10.4S, v11.s[3] +mul v19.4S, v19.4S,v17.s[2] +sub v20.4s, v6.4s, v8.4s +mul v14.4S, v14.4S,v17.s[2] +add v6.4s, v6.4s, v8.4s +mla v19.4S, v25.4S, v31.s[0] +sub v25.4s, v18.4s, v30.4s +mla v14.4S, v1.4S, v31.s[0] +add v18.4s, v18.4s, v30.4s +mul v22.4S, v22.4S,v17.s[3] +sub v30.4s, v24.4s, v3.4s +mul v10.4S, v10.4S,v17.s[3] +add v24.4s, v24.4s, v3.4s +mla v22.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v26.4s +mla v10.4S, v12.4S, v31.s[0] +add v21.4s, v21.4s, v26.4s +sqrdmulh v26.4S, v18.4S, v9.s[2] +mul v18.4S, v18.4S,v0.s[2] +sqrdmulh v12.4S, v25.4S, v9.s[3] +sub v3.4s, v2.4s, v19.4s +mul v25.4S, v25.4S,v0.s[3] +add v2.4s, v2.4s, v19.4s +sqrdmulh v19.4S, v13.4S, v9.s[1] +sub v1.4s, v16.4s, v14.4s +mul v13.4S, v13.4S,v0.s[1] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v9.s[0] +sub v8.4s, v15.4s, v22.4s +mul v21.4S, v21.4S,v0.s[0] +add v15.4s, v15.4s, v22.4s +mla v18.4S, v26.4S, v31.s[0] +sub v26.4s, v27.4s, v10.4s +sqrdmulh v22.4S, v16.4S, v7.s[0] +add v27.4s, v27.4s, v10.4s +mla v25.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v1.4S, v7.s[1] +mla v13.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v27.4S, v7.s[2] +mla v21.4S, v14.4S, v31.s[0] +sqrdmulh v14.4S, v26.4S, v7.s[3] +mul v16.4S, v16.4S,v23.s[0] +sub v10.4s, v6.4s, 
v18.4s +str q10, [x0, #320] +mul v1.4S, v1.4S,v23.s[1] +add v6.4s, v6.4s, v18.4s +str q6, [x0, #256] +mla v16.4S, v22.4S, v31.s[0] +sub v22.4s, v20.4s, v25.4s +str q22, [x0, #448] +mla v1.4S, v12.4S, v31.s[0] +add v20.4s, v20.4s, v25.4s +str q20, [x0, #384] +mul v27.4S, v27.4S,v23.s[2] +sub v20.4s, v30.4s, v13.4s +str q20, [x0, #192] +mul v26.4S, v26.4S,v23.s[3] +add v30.4s, v30.4s, v13.4s +str q30, [x0, #128] +mla v27.4S, v19.4S, v31.s[0] +sub v19.4s, v24.4s, v21.4s +str q19, [x0, #64] +mla v26.4S, v14.4S, v31.s[0] +add v24.4s, v24.4s, v21.4s +str q24, [x0, #0] +ldr q24, [x0, #912] +sqrdmulh v21.4S, v24.4S, v28.s[0] +mul v24.4S, v24.4S,v29.s[0] +ldr q14, [x0, #976] +sqrdmulh v19.4S, v14.4S, v28.s[0] +sub v30.4s, v2.4s, v16.4s +str q30, [x0, #576] +mul v14.4S, v14.4S,v29.s[0] +add v2.4s, v2.4s, v16.4s +str q2, [x0, #512] +ldr q2, [x0, #784] +sqrdmulh v16.4S, v2.4S, v28.s[0] +sub v30.4s, v3.4s, v1.4s +str q30, [x0, #704] +mul v2.4S, v2.4S,v29.s[0] +add v3.4s, v3.4s, v1.4s +str q3, [x0, #640] +ldr q3, [x0, #848] +sqrdmulh v1.4S, v3.4S, v28.s[0] +sub v30.4s, v15.4s, v27.4s +str q30, [x0, #832] +mul v3.4S, v3.4S,v29.s[0] +add v15.4s, v15.4s, v27.4s +str q15, [x0, #768] +ldr q15, [x0, #528] +mla v24.4S, v21.4S, v31.s[0] +sub v21.4s, v8.4s, v26.4s +str q21, [x0, #960] +sqrdmulh v21.4S, v15.4S, v28.s[0] +add v8.4s, v8.4s, v26.4s +str q8, [x0, #896] +ldr q8, [x0, #592] +mla v14.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v8.4S, v28.s[0] +ldr q26, [x0, #656] +mla v2.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v26.4S, v28.s[0] +ldr q27, [x0, #720] +mla v3.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v27.4S, v28.s[0] +ldr q30, [x0, #400] +ldr q13, [x0, #464] +mul v15.4S, v15.4S,v29.s[0] +sub v20.4s, v30.4s, v24.4s +mul v8.4S, v8.4S,v29.s[0] +add v30.4s, v30.4s, v24.4s +ldr q24, [x0, #272] +ldr q25, [x0, #336] +mla v15.4S, v21.4S, v31.s[0] +sub v21.4s, v13.4s, v14.4s +mla v8.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v14.4s +ldr q14, [x0, #16] +ldr q19, [x0, #80] +mul v26.4S, v26.4S,v29.s[0] 
+sub v12.4s, v24.4s, v2.4s +mul v27.4S, v27.4S,v29.s[0] +add v24.4s, v24.4s, v2.4s +ldr q2, [x0, #144] +ldr q22, [x0, #208] +mla v26.4S, v16.4S, v31.s[0] +sub v16.4s, v25.4s, v3.4s +mla v27.4S, v1.4S, v31.s[0] +add v25.4s, v25.4s, v3.4s +sqrdmulh v3.4S, v30.4S, v28.s[1] +mul v30.4S, v30.4S,v29.s[1] +sqrdmulh v1.4S, v13.4S, v28.s[1] +sub v6.4s, v14.4s, v15.4s +mul v13.4S, v13.4S,v29.s[1] +add v14.4s, v14.4s, v15.4s +sqrdmulh v15.4S, v24.4S, v28.s[1] +sub v18.4s, v19.4s, v8.4s +mul v24.4S, v24.4S,v29.s[1] +add v19.4s, v19.4s, v8.4s +sqrdmulh v8.4S, v25.4S, v28.s[1] +sub v10.4s, v2.4s, v26.4s +mul v25.4S, v25.4S,v29.s[1] +add v2.4s, v2.4s, v26.4s +mla v30.4S, v3.4S, v31.s[0] +sub v3.4s, v22.4s, v27.4s +sqrdmulh v26.4S, v20.4S, v28.s[2] +add v22.4s, v22.4s, v27.4s +mla v13.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v21.4S, v28.s[2] +mla v24.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v12.4S, v28.s[2] +mla v25.4S, v8.4S, v31.s[0] +sqrdmulh v8.4S, v16.4S, v28.s[2] +mul v20.4S, v20.4S,v29.s[2] +sub v27.4s, v2.4s, v30.4s +mul v21.4S, v21.4S,v29.s[2] +add v2.4s, v2.4s, v30.4s +mla v20.4S, v26.4S, v31.s[0] +sub v26.4s, v22.4s, v13.4s +mla v21.4S, v1.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +mul v12.4S, v12.4S,v29.s[2] +sub v13.4s, v14.4s, v24.4s +mul v16.4S, v16.4S,v29.s[2] +add v14.4s, v14.4s, v24.4s +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v19.4s, v25.4s +mla v16.4S, v8.4S, v31.s[0] +add v19.4s, v19.4s, v25.4s +sqrdmulh v28.4S, v27.4S, v11.s[1] +mul v27.4S, v27.4S,v17.s[1] +sqrdmulh v29.4S, v26.4S, v11.s[1] +sub v25.4s, v10.4s, v20.4s +mul v26.4S, v26.4S,v17.s[1] +add v10.4s, v10.4s, v20.4s +sqrdmulh v20.4S, v2.4S, v11.s[0] +sub v8.4s, v3.4s, v21.4s +mul v2.4S, v2.4S,v17.s[0] +add v3.4s, v3.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v11.s[0] +sub v24.4s, v6.4s, v12.4s +mul v22.4S, v22.4S,v17.s[0] +add v6.4s, v6.4s, v12.4s +mla v27.4S, v28.4S, v31.s[0] +sub v28.4s, v18.4s, v16.4s +sqrdmulh v12.4S, v10.4S, v11.s[2] +add v18.4s, v18.4s, v16.4s +mla v26.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, 
v3.4S, v11.s[2] +mla v2.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v25.4S, v11.s[3] +mla v22.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v8.4S, v11.s[3] +mul v10.4S, v10.4S,v17.s[2] +sub v16.4s, v13.4s, v27.4s +mul v3.4S, v3.4S,v17.s[2] +add v13.4s, v13.4s, v27.4s +mla v10.4S, v12.4S, v31.s[0] +sub v12.4s, v15.4s, v26.4s +mla v3.4S, v29.4S, v31.s[0] +add v15.4s, v15.4s, v26.4s +mul v25.4S, v25.4S,v17.s[3] +sub v26.4s, v14.4s, v2.4s +mul v8.4S, v8.4S,v17.s[3] +add v14.4s, v14.4s, v2.4s +mla v25.4S, v20.4S, v31.s[0] +sub v20.4s, v19.4s, v22.4s +mla v8.4S, v21.4S, v31.s[0] +add v19.4s, v19.4s, v22.4s +sqrdmulh v11.4S, v15.4S, v9.s[2] +mul v15.4S, v15.4S,v0.s[2] +sqrdmulh v17.4S, v12.4S, v9.s[3] +sub v22.4s, v6.4s, v10.4s +mul v12.4S, v12.4S,v0.s[3] +add v6.4s, v6.4s, v10.4s +sqrdmulh v10.4S, v20.4S, v9.s[1] +sub v21.4s, v18.4s, v3.4s +mul v20.4S, v20.4S,v0.s[1] +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v19.4S, v9.s[0] +sub v2.4s, v24.4s, v25.4s +mul v19.4S, v19.4S,v0.s[0] +add v24.4s, v24.4s, v25.4s +mla v15.4S, v11.4S, v31.s[0] +sub v11.4s, v28.4s, v8.4s +sqrdmulh v25.4S, v18.4S, v7.s[0] +add v28.4s, v28.4s, v8.4s +mla v12.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v21.4S, v7.s[1] +mla v20.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v28.4S, v7.s[2] +mla v19.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v11.4S, v7.s[3] +mul v18.4S, v18.4S,v23.s[0] +sub v8.4s, v13.4s, v15.4s +str q8, [x0, #336] +mul v21.4S, v21.4S,v23.s[1] +add v13.4s, v13.4s, v15.4s +str q13, [x0, #272] +mla v18.4S, v25.4S, v31.s[0] +sub v25.4s, v16.4s, v12.4s +str q25, [x0, #464] +mla v21.4S, v17.4S, v31.s[0] +add v16.4s, v16.4s, v12.4s +str q16, [x0, #400] +mul v28.4S, v28.4S,v23.s[2] +sub v16.4s, v26.4s, v20.4s +str q16, [x0, #208] +mul v11.4S, v11.4S,v23.s[3] +add v26.4s, v26.4s, v20.4s +str q26, [x0, #144] +mla v28.4S, v10.4S, v31.s[0] +sub v10.4s, v14.4s, v19.4s +str q10, [x0, #80] +mla v11.4S, v3.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +str q14, [x0, #16] +sub v7.4s, v6.4s, v18.4s +str q7, [x0, #592] +add v6.4s, 
v6.4s, v18.4s
+str q6, [x0, #528]
+sub v6.4s, v22.4s, v21.4s
+str q6, [x0, #720]
+add v22.4s, v22.4s, v21.4s
+str q22, [x0, #656]
+sub v22.4s, v24.4s, v28.4s
+str q22, [x0, #848]
+add v24.4s, v24.4s, v28.4s
+str q24, [x0, #784]
+sub v24.4s, v2.4s, v11.4s
+str q24, [x0, #976]
+add v2.4s, v2.4s, v11.4s
+str q2, [x0, #912] // last store of the first pass, whose operands were loaded at a 64-byte stride
+ldr q4, [x0, #224] // second pass starts: operands now come from consecutive 16-byte slots
+ldr q5, [x0, #160]
+ldr q30, [x0, #32]
+ldr q1, [x17, #+128] // constants for [x0, #0..#63]: v1 lanes feed mul, v27 lanes feed sqrdmulh
+ldr q27, [x17, #+144]
+sqrdmulh v29.4S, v30.4S, v27.s[0]
+mul v30.4S, v30.4S,v1.s[0]
+ldr q8, [x0, #48]
+sqrdmulh v15.4S, v8.4S, v27.s[0]
+mul v8.4S, v8.4S,v1.s[0]
+ldr q13, [x17, #+160] // constants for [x0, #64..#127]
+ldr q25, [x17, #+176]
+ldr q17, [x0, #96]
+sqrdmulh v12.4S, v17.4S, v25.s[0]
+mul v17.4S, v17.4S,v13.s[0]
+ldr q16, [x0, #112]
+sqrdmulh v20.4S, v16.4S, v25.s[0]
+mul v16.4S, v16.4S,v13.s[0]
+ldr q26, [x17, #+192] // constants for [x0, #128..#191]
+ldr q10, [x17, #+208]
+mla v30.4S, v29.4S, v31.s[0]
+sqrdmulh v29.4S, v5.4S, v10.s[0]
+ldr q3, [x0, #176]
+mla v8.4S, v15.4S, v31.s[0]
+sqrdmulh v15.4S, v3.4S, v10.s[0]
+ldr q19, [x17, #+224] // constants for [x0, #192..#255]
+ldr q14, [x17, #+240]
+mla v17.4S, v12.4S, v31.s[0]
+sqrdmulh v12.4S, v4.4S, v14.s[0]
+ldr q0, [x0, #240]
+mla v16.4S, v20.4S, v31.s[0]
+sqrdmulh v20.4S, v0.4S, v14.s[0]
+ldr q9, [x0, #0]
+ldr q23, [x0, #128]
+mul v5.4S, v5.4S,v26.s[0]
+sub v7.4s, v9.4s, v30.4s // lane-[0] step: [x0, #0] combined with the scaled [x0, #32]
+ldr q18, [x0, #16]
+mul v3.4S, v3.4S,v26.s[0]
+add v9.4s, v9.4s, v30.4s
+ldr q30, [x0, #144]
+mla v5.4S, v29.4S, v31.s[0]
+sub v29.4s, v18.4s, v8.4s
+ldr q6, [x0, #64]
+mla v3.4S, v15.4S, v31.s[0]
+add v18.4s, v18.4s, v8.4s
+ldr q8, [x0, #192]
+mul v4.4S, v4.4S,v19.s[0]
+sub v15.4s, v6.4s, v17.4s
+ldr q21, [x0, #80]
+mul v0.4S, v0.4S,v19.s[0]
+add v6.4s, v6.4s, v17.4s
+ldr q17, [x0, #208]
+mla v4.4S, v12.4S, v31.s[0]
+mla v0.4S, v20.4S, v31.s[0]
+sub v20.4s, v21.4s, v16.4s
+sqrdmulh v12.4S, v18.4S, v27.s[1] // lanes [1] and [2] of the same constant pair drive the following step
+add v21.4s, v21.4s, v16.4s
+mul v18.4S, v18.4S,v1.s[1]
+sqrdmulh v16.4S, v29.4S, v27.s[2]
+sub v22.4s, v23.4s, v5.4s
+mul v29.4S, v29.4S,v1.s[2]
+add v23.4s, v23.4s, v5.4s
+sqrdmulh v27.4S, v21.4S, v25.s[1]
+sub v1.4s,
v30.4s, v3.4s +mul v21.4S, v21.4S,v13.s[1] +add v30.4s, v30.4s, v3.4s +sqrdmulh v3.4S, v20.4S, v25.s[2] +sub v5.4s, v8.4s, v4.4s +mul v20.4S, v20.4S,v13.s[2] +add v8.4s, v8.4s, v4.4s +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v0.4s +ldr q25, [x0, #480] +sqrdmulh v13.4S, v30.4S, v10.s[1] +add v17.4s, v17.4s, v0.4s +mla v29.4S, v16.4S, v31.s[0] +ldr q16, [x0, #416] +sqrdmulh v0.4S, v1.4S, v10.s[2] +sub v4.4s, v9.4s, v18.4s +mla v21.4S, v27.4S, v31.s[0] +ldr q27, [x0, #288] +sqrdmulh v28.4S, v17.4S, v14.s[1] +add v9.4s, v9.4s, v18.4s +str q4, [x0, #16] +mla v20.4S, v3.4S, v31.s[0] +ldr q3, [x17, #+256] +ldr q4, [x17, #+272] +sqrdmulh v18.4S, v12.4S, v14.s[2] +sub v24.4s, v7.4s, v29.4s +str q9, [x0, #0] +mul v30.4S, v30.4S,v26.s[1] +add v7.4s, v7.4s, v29.4s +mul v1.4S, v1.4S,v26.s[2] +str q24, [x0, #48] +mla v30.4S, v13.4S, v31.s[0] +sub v13.4s, v6.4s, v21.4s +mla v1.4S, v0.4S, v31.s[0] +str q7, [x0, #32] +mul v17.4S, v17.4S,v19.s[1] +str q13, [x0, #80] +mul v12.4S, v12.4S,v19.s[2] +add v6.4s, v6.4s, v21.4s +str q6, [x0, #64] +mla v17.4S, v28.4S, v31.s[0] +sub v28.4s, v15.4s, v20.4s +str q28, [x0, #112] +mla v12.4S, v18.4S, v31.s[0] +add v15.4s, v15.4s, v20.4s +str q15, [x0, #96] +sqrdmulh v14.4S, v27.4S, v4.s[0] +sub v19.4s, v23.4s, v30.4s +mul v27.4S, v27.4S,v3.s[0] +str q19, [x0, #144] +ldr q19, [x0, #304] +sqrdmulh v15.4S, v19.4S, v4.s[0] +add v23.4s, v23.4s, v30.4s +mul v19.4S, v19.4S,v3.s[0] +str q23, [x0, #128] +ldr q23, [x17, #+288] +ldr q30, [x17, #+304] +ldr q20, [x0, #352] +sqrdmulh v18.4S, v20.4S, v30.s[0] +sub v28.4s, v22.4s, v1.4s +mul v20.4S, v20.4S,v23.s[0] +str q28, [x0, #176] +ldr q28, [x0, #368] +sqrdmulh v6.4S, v28.4S, v30.s[0] +add v22.4s, v22.4s, v1.4s +mul v28.4S, v28.4S,v23.s[0] +str q22, [x0, #160] +ldr q22, [x17, #+320] +ldr q1, [x17, #+336] +mla v27.4S, v14.4S, v31.s[0] +sub v14.4s, v8.4s, v17.4s +sqrdmulh v21.4S, v16.4S, v1.s[0] +str q14, [x0, #208] +ldr q14, [x0, #432] +mla v19.4S, v15.4S, v31.s[0] +add v8.4s, v8.4s, v17.4s 
+sqrdmulh v17.4S, v14.4S, v1.s[0] +str q8, [x0, #192] +ldr q8, [x17, #+352] +ldr q15, [x17, #+368] +mla v20.4S, v18.4S, v31.s[0] +sub v18.4s, v5.4s, v12.4s +sqrdmulh v13.4S, v25.4S, v15.s[0] +str q18, [x0, #240] +ldr q18, [x0, #496] +mla v28.4S, v6.4S, v31.s[0] +add v5.4s, v5.4s, v12.4s +sqrdmulh v12.4S, v18.4S, v15.s[0] +str q5, [x0, #224] +ldr q5, [x0, #256] +ldr q6, [x0, #384] +mul v16.4S, v16.4S,v22.s[0] +sub v10.4s, v5.4s, v27.4s +ldr q26, [x0, #272] +mul v14.4S, v14.4S,v22.s[0] +add v5.4s, v5.4s, v27.4s +ldr q27, [x0, #400] +mla v16.4S, v21.4S, v31.s[0] +sub v21.4s, v26.4s, v19.4s +ldr q7, [x0, #320] +mla v14.4S, v17.4S, v31.s[0] +add v26.4s, v26.4s, v19.4s +ldr q19, [x0, #448] +mul v25.4S, v25.4S,v8.s[0] +sub v17.4s, v7.4s, v20.4s +ldr q0, [x0, #336] +mul v18.4S, v18.4S,v8.s[0] +add v7.4s, v7.4s, v20.4s +ldr q20, [x0, #464] +mla v25.4S, v13.4S, v31.s[0] +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v0.4s, v28.4s +sqrdmulh v13.4S, v26.4S, v4.s[1] +add v0.4s, v0.4s, v28.4s +mul v26.4S, v26.4S,v3.s[1] +sqrdmulh v28.4S, v21.4S, v4.s[2] +sub v24.4s, v6.4s, v16.4s +mul v21.4S, v21.4S,v3.s[2] +add v6.4s, v6.4s, v16.4s +sqrdmulh v4.4S, v0.4S, v30.s[1] +sub v3.4s, v27.4s, v14.4s +mul v0.4S, v0.4S,v23.s[1] +add v27.4s, v27.4s, v14.4s +sqrdmulh v14.4S, v12.4S, v30.s[2] +sub v16.4s, v19.4s, v25.4s +mul v12.4S, v12.4S,v23.s[2] +add v19.4s, v19.4s, v25.4s +mla v26.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v18.4s +ldr q30, [x0, #736] +sqrdmulh v23.4S, v27.4S, v1.s[1] +add v20.4s, v20.4s, v18.4s +mla v21.4S, v28.4S, v31.s[0] +ldr q28, [x0, #672] +sqrdmulh v18.4S, v3.4S, v1.s[2] +sub v25.4s, v5.4s, v26.4s +mla v0.4S, v4.4S, v31.s[0] +ldr q4, [x0, #544] +sqrdmulh v29.4S, v20.4S, v15.s[1] +add v5.4s, v5.4s, v26.4s +str q25, [x0, #272] +mla v12.4S, v14.4S, v31.s[0] +ldr q14, [x17, #+384] +ldr q25, [x17, #+400] +sqrdmulh v26.4S, v13.4S, v15.s[2] +sub v9.4s, v10.4s, v21.4s +str q5, [x0, #256] +mul v27.4S, v27.4S,v22.s[1] +add v10.4s, v10.4s, v21.4s +mul v3.4S, v3.4S,v22.s[2] +str 
q9, [x0, #304] +mla v27.4S, v23.4S, v31.s[0] +sub v23.4s, v7.4s, v0.4s +mla v3.4S, v18.4S, v31.s[0] +str q10, [x0, #288] +mul v20.4S, v20.4S,v8.s[1] +str q23, [x0, #336] +mul v13.4S, v13.4S,v8.s[2] +add v7.4s, v7.4s, v0.4s +str q7, [x0, #320] +mla v20.4S, v29.4S, v31.s[0] +sub v29.4s, v17.4s, v12.4s +str q29, [x0, #368] +mla v13.4S, v26.4S, v31.s[0] +add v17.4s, v17.4s, v12.4s +str q17, [x0, #352] +sqrdmulh v15.4S, v4.4S, v25.s[0] +sub v8.4s, v6.4s, v27.4s +mul v4.4S, v4.4S,v14.s[0] +str q8, [x0, #400] +ldr q8, [x0, #560] +sqrdmulh v17.4S, v8.4S, v25.s[0] +add v6.4s, v6.4s, v27.4s +mul v8.4S, v8.4S,v14.s[0] +str q6, [x0, #384] +ldr q6, [x17, #+416] +ldr q27, [x17, #+432] +ldr q12, [x0, #608] +sqrdmulh v26.4S, v12.4S, v27.s[0] +sub v29.4s, v24.4s, v3.4s +mul v12.4S, v12.4S,v6.s[0] +str q29, [x0, #432] +ldr q29, [x0, #624] +sqrdmulh v7.4S, v29.4S, v27.s[0] +add v24.4s, v24.4s, v3.4s +mul v29.4S, v29.4S,v6.s[0] +str q24, [x0, #416] +ldr q24, [x17, #+448] +ldr q3, [x17, #+464] +mla v4.4S, v15.4S, v31.s[0] +sub v15.4s, v19.4s, v20.4s +sqrdmulh v0.4S, v28.4S, v3.s[0] +str q15, [x0, #464] +ldr q15, [x0, #688] +mla v8.4S, v17.4S, v31.s[0] +add v19.4s, v19.4s, v20.4s +sqrdmulh v20.4S, v15.4S, v3.s[0] +str q19, [x0, #448] +ldr q19, [x17, #+480] +ldr q17, [x17, #+496] +mla v12.4S, v26.4S, v31.s[0] +sub v26.4s, v16.4s, v13.4s +sqrdmulh v23.4S, v30.4S, v17.s[0] +str q26, [x0, #496] +ldr q26, [x0, #752] +mla v29.4S, v7.4S, v31.s[0] +add v16.4s, v16.4s, v13.4s +sqrdmulh v13.4S, v26.4S, v17.s[0] +str q16, [x0, #480] +ldr q16, [x0, #512] +ldr q7, [x0, #640] +mul v28.4S, v28.4S,v24.s[0] +sub v1.4s, v16.4s, v4.4s +ldr q22, [x0, #528] +mul v15.4S, v15.4S,v24.s[0] +add v16.4s, v16.4s, v4.4s +ldr q4, [x0, #656] +mla v28.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v8.4s +ldr q10, [x0, #576] +mla v15.4S, v20.4S, v31.s[0] +add v22.4s, v22.4s, v8.4s +ldr q8, [x0, #704] +mul v30.4S, v30.4S,v19.s[0] +sub v20.4s, v10.4s, v12.4s +ldr q18, [x0, #592] +mul v26.4S, v26.4S,v19.s[0] +add v10.4s, v10.4s, 
v12.4s +ldr q12, [x0, #720] +mla v30.4S, v23.4S, v31.s[0] +mla v26.4S, v13.4S, v31.s[0] +sub v13.4s, v18.4s, v29.4s +sqrdmulh v23.4S, v22.4S, v25.s[1] +add v18.4s, v18.4s, v29.4s +mul v22.4S, v22.4S,v14.s[1] +sqrdmulh v29.4S, v0.4S, v25.s[2] +sub v9.4s, v7.4s, v28.4s +mul v0.4S, v0.4S,v14.s[2] +add v7.4s, v7.4s, v28.4s +sqrdmulh v25.4S, v18.4S, v27.s[1] +sub v14.4s, v4.4s, v15.4s +mul v18.4S, v18.4S,v6.s[1] +add v4.4s, v4.4s, v15.4s +sqrdmulh v15.4S, v13.4S, v27.s[2] +sub v28.4s, v8.4s, v30.4s +mul v13.4S, v13.4S,v6.s[2] +add v8.4s, v8.4s, v30.4s +mla v22.4S, v23.4S, v31.s[0] +sub v23.4s, v12.4s, v26.4s +ldr q27, [x0, #992] +sqrdmulh v6.4S, v4.4S, v3.s[1] +add v12.4s, v12.4s, v26.4s +mla v0.4S, v29.4S, v31.s[0] +ldr q29, [x0, #928] +sqrdmulh v26.4S, v14.4S, v3.s[2] +sub v30.4s, v16.4s, v22.4s +mla v18.4S, v25.4S, v31.s[0] +ldr q25, [x0, #800] +sqrdmulh v21.4S, v12.4S, v17.s[1] +add v16.4s, v16.4s, v22.4s +str q30, [x0, #528] +mla v13.4S, v15.4S, v31.s[0] +ldr q15, [x17, #+512] +ldr q30, [x17, #+528] +sqrdmulh v22.4S, v23.4S, v17.s[2] +sub v5.4s, v1.4s, v0.4s +str q16, [x0, #512] +mul v4.4S, v4.4S,v24.s[1] +add v1.4s, v1.4s, v0.4s +mul v14.4S, v14.4S,v24.s[2] +str q5, [x0, #560] +mla v4.4S, v6.4S, v31.s[0] +sub v6.4s, v10.4s, v18.4s +mla v14.4S, v26.4S, v31.s[0] +str q1, [x0, #544] +mul v12.4S, v12.4S,v19.s[1] +str q6, [x0, #592] +mul v23.4S, v23.4S,v19.s[2] +add v10.4s, v10.4s, v18.4s +str q10, [x0, #576] +mla v12.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v13.4s +str q21, [x0, #624] +mla v23.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v13.4s +str q20, [x0, #608] +sqrdmulh v17.4S, v25.4S, v30.s[0] +sub v19.4s, v7.4s, v4.4s +mul v25.4S, v25.4S,v15.s[0] +str q19, [x0, #656] +ldr q19, [x0, #816] +sqrdmulh v20.4S, v19.4S, v30.s[0] +add v7.4s, v7.4s, v4.4s +mul v19.4S, v19.4S,v15.s[0] +str q7, [x0, #640] +ldr q7, [x17, #+544] +ldr q4, [x17, #+560] +ldr q13, [x0, #864] +sqrdmulh v22.4S, v13.4S, v4.s[0] +sub v21.4s, v9.4s, v14.4s +mul v13.4S, v13.4S,v7.s[0] +str q21, [x0, 
#688] +ldr q21, [x0, #880] +sqrdmulh v10.4S, v21.4S, v4.s[0] +add v9.4s, v9.4s, v14.4s +mul v21.4S, v21.4S,v7.s[0] +str q9, [x0, #672] +ldr q9, [x17, #+576] +ldr q14, [x17, #+592] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v8.4s, v12.4s +sqrdmulh v18.4S, v29.4S, v14.s[0] +str q17, [x0, #720] +ldr q17, [x0, #944] +mla v19.4S, v20.4S, v31.s[0] +add v8.4s, v8.4s, v12.4s +sqrdmulh v12.4S, v17.4S, v14.s[0] +str q8, [x0, #704] +ldr q8, [x17, #+608] +ldr q20, [x17, #+624] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v28.4s, v23.4s +sqrdmulh v6.4S, v27.4S, v20.s[0] +str q22, [x0, #752] +ldr q22, [x0, #1008] +mla v21.4S, v10.4S, v31.s[0] +add v28.4s, v28.4s, v23.4s +sqrdmulh v23.4S, v22.4S, v20.s[0] +str q28, [x0, #736] +ldr q28, [x0, #768] +ldr q10, [x0, #896] +mul v29.4S, v29.4S,v9.s[0] +sub v3.4s, v28.4s, v25.4s +ldr q24, [x0, #784] +mul v17.4S, v17.4S,v9.s[0] +add v28.4s, v28.4s, v25.4s +ldr q25, [x0, #912] +mla v29.4S, v18.4S, v31.s[0] +sub v18.4s, v24.4s, v19.4s +ldr q1, [x0, #832] +mla v17.4S, v12.4S, v31.s[0] +add v24.4s, v24.4s, v19.4s +ldr q19, [x0, #960] +mul v27.4S, v27.4S,v8.s[0] +sub v12.4s, v1.4s, v13.4s +ldr q26, [x0, #848] +mul v22.4S, v22.4S,v8.s[0] +add v1.4s, v1.4s, v13.4s +ldr q13, [x0, #976] +mla v27.4S, v6.4S, v31.s[0] +mla v22.4S, v23.4S, v31.s[0] +sub v23.4s, v26.4s, v21.4s +sqrdmulh v6.4S, v24.4S, v30.s[1] +add v26.4s, v26.4s, v21.4s +mul v24.4S, v24.4S,v15.s[1] +sqrdmulh v21.4S, v18.4S, v30.s[2] +sub v5.4s, v10.4s, v29.4s +mul v18.4S, v18.4S,v15.s[2] +add v10.4s, v10.4s, v29.4s +sqrdmulh v30.4S, v26.4S, v4.s[1] +sub v15.4s, v25.4s, v17.4s +mul v26.4S, v26.4S,v7.s[1] +add v25.4s, v25.4s, v17.4s +sqrdmulh v17.4S, v23.4S, v4.s[2] +sub v29.4s, v19.4s, v27.4s +mul v23.4S, v23.4S,v7.s[2] +add v19.4s, v19.4s, v27.4s +mla v24.4S, v6.4S, v31.s[0] +sub v6.4s, v13.4s, v22.4s +sqrdmulh v4.4S, v25.4S, v14.s[1] +add v13.4s, v13.4s, v22.4s +mla v18.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v15.4S, v14.s[2] +sub v22.4s, v28.4s, v24.4s +mla v26.4S, v30.4S, v31.s[0] 
+sqrdmulh v30.4S, v13.4S, v20.s[1] +add v28.4s, v28.4s, v24.4s +str q22, [x0, #784] +mla v23.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v6.4S, v20.s[2] +sub v22.4s, v3.4s, v18.4s +str q28, [x0, #768] +mul v25.4S, v25.4S,v9.s[1] +add v3.4s, v3.4s, v18.4s +mul v15.4S, v15.4S,v9.s[2] +str q22, [x0, #816] +mla v25.4S, v4.4S, v31.s[0] +sub v4.4s, v1.4s, v26.4s +mla v15.4S, v21.4S, v31.s[0] +str q3, [x0, #800] +mul v13.4S, v13.4S,v8.s[1] +str q4, [x0, #848] +mul v6.4S, v6.4S,v8.s[2] +add v1.4s, v1.4s, v26.4s +str q1, [x0, #832] +mla v13.4S, v30.4S, v31.s[0] +sub v30.4s, v12.4s, v23.4s +str q30, [x0, #880] +mla v6.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v23.4s +str q12, [x0, #864] +sub v20.4s, v10.4s, v25.4s +str q20, [x0, #912] +add v10.4s, v10.4s, v25.4s +str q10, [x0, #896] +sub v10.4s, v5.4s, v15.4s +str q10, [x0, #944] +add v5.4s, v5.4s, v15.4s +str q5, [x0, #928] +sub v5.4s, v19.4s, v13.4s +str q5, [x0, #976] +add v19.4s, v19.4s, v13.4s +str q19, [x0, #960] +sub v19.4s, v29.4s, v6.4s +str q19, [x0, #1008] +add v29.4s, v29.4s, v6.4s +str q29, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_12_z4_7.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_12_z4_7.s new file mode 100644 index 0000000..b2cda5a --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_12_z4_7.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of 
this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// +#include // NOTE(review): no header name follows this directive in the text as seen here - confirm against the upstream file +modulus: // Four-word constant; the routine below loads it into q31 and uses lane 0 (v31.s[0]) as the multiplier in its mla steps +.word -33556993 // lane 0: 33556993 (the value in the file name), stored negated +.word 0 +.word 0 +.word 0 +.align 6 // NOTE(review): on AArch64 gas .align takes a power of two, so this should be 64-byte alignment - confirm for the assembler in use +roots_merged: // Constant table the routine below addresses through x17 in 16-byte rows; rows come in pairs, the first of each pair (e.g. offset 0 -> q29) being the mul operand and the second (e.g. offset 16 -> q28) the sqrdmulh operand. NOTE(review): the second row of each pair is presumably a precomputed scaled form of the first - confirm against the generator +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global
ntt_u32_incomplete_neon_asm_var_4_2_12_z4_7 +.global _ntt_u32_incomplete_neon_asm_var_4_2_12_z4_7 +ntt_u32_incomplete_neon_asm_var_4_2_12_z4_7: +_ntt_u32_incomplete_neon_asm_var_4_2_12_z4_7: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x0, #928] +ldr q29, [x17, #+0] +ldr q28, [x17, #+16] +sqrdmulh v27.4S, v30.4S, v28.s[0] +mul v30.4S, v30.4S,v29.s[0] +ldr q26, [x0, #992] +sqrdmulh v25.4S, v26.4S, v28.s[0] +mul v26.4S, v26.4S,v29.s[0] +ldr q24, [x0, #800] +sqrdmulh v23.4S, v24.4S, v28.s[0] +mul v24.4S, v24.4S,v29.s[0] +ldr q22, [x0, #864] +sqrdmulh v21.4S, v22.4S, v28.s[0] +mul v22.4S, v22.4S,v29.s[0] +ldr q20, [x0, #544] +mla v30.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v20.4S, v28.s[0] +ldr q19, [x0, #608] +mla v26.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v19.4S, v28.s[0] +ldr q18, [x0, #672] +mla v24.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v18.4S, v28.s[0] +ldr q17, [x0, #736] +mla v22.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v17.4S, v28.s[0] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +mul v20.4S, v20.4S,v29.s[0] +sub v2.4s, v16.4s, v30.4s +mul v19.4S, v19.4S,v29.s[0] +add v16.4s, v16.4s, v30.4s +ldr q30, [x0, #288] +ldr q1, [x0, #352] +mla v20.4S, v27.4S, v31.s[0] +sub v27.4s, v3.4s, v26.4s +mla v19.4S, v25.4S, v31.s[0] +add v3.4s, v3.4s, v26.4s +ldr q26, [x0, #32] +ldr q25, [x0, #96] +mul v18.4S, v18.4S,v29.s[0] +sub v0.4s, v30.4s, v24.4s +mul v17.4S, v17.4S,v29.s[0] +add v30.4s, v30.4s, v24.4s +ldr q24, [x0, #160] +ldr q15, [x0, #224] +mla v18.4S, v23.4S, v31.s[0] +sub v23.4s, v1.4s, v22.4s +mla v17.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v22.4s +sqrdmulh 
v22.4S, v16.4S, v28.s[1] +mul v16.4S, v16.4S,v29.s[1] +sqrdmulh v21.4S, v3.4S, v28.s[1] +sub v14.4s, v26.4s, v20.4s +mul v3.4S, v3.4S,v29.s[1] +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v30.4S, v28.s[1] +sub v13.4s, v25.4s, v19.4s +mul v30.4S, v30.4S,v29.s[1] +add v25.4s, v25.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v28.s[1] +sub v12.4s, v24.4s, v18.4s +mul v1.4S, v1.4S,v29.s[1] +add v24.4s, v24.4s, v18.4s +mla v16.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v17.4s +sqrdmulh v18.4S, v2.4S, v28.s[2] +add v15.4s, v15.4s, v17.4s +mla v3.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v27.4S, v28.s[2] +mla v30.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v0.4S, v28.s[2] +mla v1.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v23.4S, v28.s[2] +ldr q17, [x17, #+32] +ldr q11, [x17, #+48] +mul v2.4S, v2.4S,v29.s[2] +sub v10.4s, v24.4s, v16.4s +mul v27.4S, v27.4S,v29.s[2] +add v24.4s, v24.4s, v16.4s +mla v2.4S, v18.4S, v31.s[0] +sub v18.4s, v15.4s, v3.4s +mla v27.4S, v21.4S, v31.s[0] +add v15.4s, v15.4s, v3.4s +mul v0.4S, v0.4S,v29.s[2] +sub v3.4s, v26.4s, v30.4s +mul v23.4S, v23.4S,v29.s[2] +add v26.4s, v26.4s, v30.4s +mla v0.4S, v20.4S, v31.s[0] +sub v20.4s, v25.4s, v1.4s +mla v23.4S, v19.4S, v31.s[0] +add v25.4s, v25.4s, v1.4s +sqrdmulh v1.4S, v10.4S, v11.s[1] +mul v10.4S, v10.4S,v17.s[1] +sqrdmulh v19.4S, v18.4S, v11.s[1] +sub v30.4s, v12.4s, v2.4s +mul v18.4S, v18.4S,v17.s[1] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v24.4S, v11.s[0] +sub v21.4s, v22.4s, v27.4s +mul v24.4S, v24.4S,v17.s[0] +add v22.4s, v22.4s, v27.4s +sqrdmulh v27.4S, v15.4S, v11.s[0] +sub v16.4s, v14.4s, v0.4s +mul v15.4S, v15.4S,v17.s[0] +add v14.4s, v14.4s, v0.4s +ldr q0, [x17, #+64] +ldr q9, [x17, #+80] +mla v10.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v23.4s +sqrdmulh v8.4S, v12.4S, v11.s[2] +add v13.4s, v13.4s, v23.4s +mla v18.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v22.4S, v11.s[2] +mla v24.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v30.4S, v11.s[3] +mla v15.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v21.4S, v11.s[3] +ldr q23, [x17, 
#+96] +ldr q7, [x17, #+112] +mul v12.4S, v12.4S,v17.s[2] +sub v6.4s, v3.4s, v10.4s +mul v22.4S, v22.4S,v17.s[2] +add v3.4s, v3.4s, v10.4s +mla v12.4S, v8.4S, v31.s[0] +sub v8.4s, v20.4s, v18.4s +mla v22.4S, v19.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +mul v30.4S, v30.4S,v17.s[3] +sub v18.4s, v26.4s, v24.4s +mul v21.4S, v21.4S,v17.s[3] +add v26.4s, v26.4s, v24.4s +mla v30.4S, v2.4S, v31.s[0] +sub v2.4s, v25.4s, v15.4s +mla v21.4S, v27.4S, v31.s[0] +add v25.4s, v25.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v9.s[2] +mul v20.4S, v20.4S,v0.s[2] +sqrdmulh v27.4S, v8.4S, v9.s[3] +sub v24.4s, v14.4s, v12.4s +mul v8.4S, v8.4S,v0.s[3] +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v9.s[1] +sub v19.4s, v13.4s, v22.4s +mul v2.4S, v2.4S,v0.s[1] +add v13.4s, v13.4s, v22.4s +sqrdmulh v22.4S, v25.4S, v9.s[0] +sub v10.4s, v16.4s, v30.4s +mul v25.4S, v25.4S,v0.s[0] +add v16.4s, v16.4s, v30.4s +mla v20.4S, v15.4S, v31.s[0] +sub v15.4s, v1.4s, v21.4s +sqrdmulh v30.4S, v13.4S, v7.s[0] +add v1.4s, v1.4s, v21.4s +mla v8.4S, v27.4S, v31.s[0] +sub v27.4s, v3.4s, v20.4s +sqrdmulh v21.4S, v19.4S, v7.s[1] +add v3.4s, v3.4s, v20.4s +mla v2.4S, v12.4S, v31.s[0] +sub v12.4s, v6.4s, v8.4s +sqrdmulh v20.4S, v1.4S, v7.s[2] +add v6.4s, v6.4s, v8.4s +mla v25.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v2.4s +sqrdmulh v8.4S, v15.4S, v7.s[3] +add v18.4s, v18.4s, v2.4s +mul v13.4S, v13.4S,v23.s[0] +sub v2.4s, v26.4s, v25.4s +mul v19.4S, v19.4S,v23.s[1] +add v26.4s, v26.4s, v25.4s +mla v13.4S, v30.4S, v31.s[0] +str q27, [x0, #352] +mla v19.4S, v21.4S, v31.s[0] +str q3, [x0, #288] +mul v1.4S, v1.4S,v23.s[2] +str q12, [x0, #480] +mul v15.4S, v15.4S,v23.s[3] +str q6, [x0, #416] +mla v1.4S, v20.4S, v31.s[0] +str q22, [x0, #224] +mla v15.4S, v8.4S, v31.s[0] +str q18, [x0, #160] +ldr q18, [x0, #944] +sqrdmulh v8.4S, v18.4S, v28.s[0] +str q2, [x0, #96] +mul v18.4S, v18.4S,v29.s[0] +str q26, [x0, #32] +ldr q26, [x0, #1008] +sqrdmulh v2.4S, v26.4S, v28.s[0] +sub v22.4s, v14.4s, v13.4s +str q22, [x0, #608] +mul 
v26.4S, v26.4S,v29.s[0] +add v14.4s, v14.4s, v13.4s +ldr q13, [x0, #816] +sqrdmulh v22.4S, v13.4S, v28.s[0] +sub v20.4s, v24.4s, v19.4s +str q14, [x0, #544] +mul v13.4S, v13.4S,v29.s[0] +add v24.4s, v24.4s, v19.4s +ldr q19, [x0, #880] +sqrdmulh v14.4S, v19.4S, v28.s[0] +sub v6.4s, v16.4s, v1.4s +str q20, [x0, #736] +mul v19.4S, v19.4S,v29.s[0] +add v16.4s, v16.4s, v1.4s +ldr q1, [x0, #560] +mla v18.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v15.4s +str q24, [x0, #672] +sqrdmulh v24.4S, v1.4S, v28.s[0] +add v10.4s, v10.4s, v15.4s +ldr q15, [x0, #624] +mla v26.4S, v2.4S, v31.s[0] +str q6, [x0, #864] +sqrdmulh v6.4S, v15.4S, v28.s[0] +ldr q2, [x0, #688] +mla v13.4S, v22.4S, v31.s[0] +str q16, [x0, #800] +sqrdmulh v16.4S, v2.4S, v28.s[0] +ldr q22, [x0, #752] +mla v19.4S, v14.4S, v31.s[0] +str q8, [x0, #992] +sqrdmulh v8.4S, v22.4S, v28.s[0] +ldr q14, [x0, #432] +ldr q20, [x0, #496] +mul v1.4S, v1.4S,v29.s[0] +sub v12.4s, v14.4s, v18.4s +str q10, [x0, #928] +mul v15.4S, v15.4S,v29.s[0] +add v14.4s, v14.4s, v18.4s +ldr q18, [x0, #304] +ldr q10, [x0, #368] +mla v1.4S, v24.4S, v31.s[0] +sub v24.4s, v20.4s, v26.4s +mla v15.4S, v6.4S, v31.s[0] +add v20.4s, v20.4s, v26.4s +ldr q26, [x0, #48] +ldr q6, [x0, #112] +mul v2.4S, v2.4S,v29.s[0] +sub v3.4s, v18.4s, v13.4s +mul v22.4S, v22.4S,v29.s[0] +add v18.4s, v18.4s, v13.4s +ldr q13, [x0, #176] +ldr q21, [x0, #240] +mla v2.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v19.4s +mla v22.4S, v8.4S, v31.s[0] +add v10.4s, v10.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v28.s[1] +mul v14.4S, v14.4S,v29.s[1] +sqrdmulh v8.4S, v20.4S, v28.s[1] +sub v27.4s, v26.4s, v1.4s +mul v20.4S, v20.4S,v29.s[1] +add v26.4s, v26.4s, v1.4s +sqrdmulh v1.4S, v18.4S, v28.s[1] +sub v30.4s, v6.4s, v15.4s +mul v18.4S, v18.4S,v29.s[1] +add v6.4s, v6.4s, v15.4s +sqrdmulh v15.4S, v10.4S, v28.s[1] +sub v25.4s, v13.4s, v2.4s +mul v10.4S, v10.4S,v29.s[1] +add v13.4s, v13.4s, v2.4s +mla v14.4S, v19.4S, v31.s[0] +sub v19.4s, v21.4s, v22.4s +sqrdmulh v2.4S, v12.4S, v28.s[2] +add 
v21.4s, v21.4s, v22.4s +mla v20.4S, v8.4S, v31.s[0] +sqrdmulh v8.4S, v24.4S, v28.s[2] +mla v18.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v3.4S, v28.s[2] +mla v10.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v16.4S, v28.s[2] +mul v12.4S, v12.4S,v29.s[2] +sub v22.4s, v13.4s, v14.4s +mul v24.4S, v24.4S,v29.s[2] +add v13.4s, v13.4s, v14.4s +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v21.4s, v20.4s +mla v24.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v20.4s +mul v3.4S, v3.4S,v29.s[2] +sub v20.4s, v26.4s, v18.4s +mul v16.4S, v16.4S,v29.s[2] +add v26.4s, v26.4s, v18.4s +mla v3.4S, v1.4S, v31.s[0] +sub v1.4s, v6.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add v6.4s, v6.4s, v10.4s +sqrdmulh v10.4S, v22.4S, v11.s[1] +mul v22.4S, v22.4S,v17.s[1] +sqrdmulh v15.4S, v2.4S, v11.s[1] +sub v18.4s, v25.4s, v12.4s +mul v2.4S, v2.4S,v17.s[1] +add v25.4s, v25.4s, v12.4s +sqrdmulh v12.4S, v13.4S, v11.s[0] +sub v8.4s, v19.4s, v24.4s +mul v13.4S, v13.4S,v17.s[0] +add v19.4s, v19.4s, v24.4s +sqrdmulh v24.4S, v21.4S, v11.s[0] +sub v14.4s, v27.4s, v3.4s +mul v21.4S, v21.4S,v17.s[0] +add v27.4s, v27.4s, v3.4s +mla v22.4S, v10.4S, v31.s[0] +sub v10.4s, v30.4s, v16.4s +sqrdmulh v3.4S, v25.4S, v11.s[2] +add v30.4s, v30.4s, v16.4s +mla v2.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v19.4S, v11.s[2] +mla v13.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v18.4S, v11.s[3] +mla v21.4S, v24.4S, v31.s[0] +sqrdmulh v24.4S, v8.4S, v11.s[3] +mul v25.4S, v25.4S,v17.s[2] +sub v16.4s, v20.4s, v22.4s +mul v19.4S, v19.4S,v17.s[2] +add v20.4s, v20.4s, v22.4s +mla v25.4S, v3.4S, v31.s[0] +sub v3.4s, v1.4s, v2.4s +mla v19.4S, v15.4S, v31.s[0] +add v1.4s, v1.4s, v2.4s +mul v18.4S, v18.4S,v17.s[3] +sub v2.4s, v26.4s, v13.4s +mul v8.4S, v8.4S,v17.s[3] +add v26.4s, v26.4s, v13.4s +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v6.4s, v21.4s +mla v8.4S, v24.4S, v31.s[0] +add v6.4s, v6.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v9.s[2] +mul v1.4S, v1.4S,v0.s[2] +sqrdmulh v24.4S, v3.4S, v9.s[3] +sub v13.4s, v27.4s, v25.4s +mul v3.4S, v3.4S,v0.s[3] +add v27.4s, 
v27.4s, v25.4s +sqrdmulh v25.4S, v12.4S, v9.s[1] +sub v15.4s, v30.4s, v19.4s +mul v12.4S, v12.4S,v0.s[1] +add v30.4s, v30.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v9.s[0] +sub v22.4s, v14.4s, v18.4s +mul v6.4S, v6.4S,v0.s[0] +add v14.4s, v14.4s, v18.4s +mla v1.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v8.4s +sqrdmulh v18.4S, v30.4S, v7.s[0] +add v10.4s, v10.4s, v8.4s +mla v3.4S, v24.4S, v31.s[0] +sub v24.4s, v20.4s, v1.4s +sqrdmulh v8.4S, v15.4S, v7.s[1] +add v20.4s, v20.4s, v1.4s +mla v12.4S, v25.4S, v31.s[0] +sub v25.4s, v16.4s, v3.4s +sqrdmulh v1.4S, v10.4S, v7.s[2] +add v16.4s, v16.4s, v3.4s +mla v6.4S, v19.4S, v31.s[0] +sub v19.4s, v2.4s, v12.4s +sqrdmulh v3.4S, v21.4S, v7.s[3] +add v2.4s, v2.4s, v12.4s +mul v30.4S, v30.4S,v23.s[0] +sub v12.4s, v26.4s, v6.4s +mul v15.4S, v15.4S,v23.s[1] +add v26.4s, v26.4s, v6.4s +mla v30.4S, v18.4S, v31.s[0] +str q24, [x0, #368] +mla v15.4S, v8.4S, v31.s[0] +str q20, [x0, #304] +mul v10.4S, v10.4S,v23.s[2] +str q25, [x0, #496] +mul v21.4S, v21.4S,v23.s[3] +str q16, [x0, #432] +mla v10.4S, v1.4S, v31.s[0] +str q19, [x0, #240] +mla v21.4S, v3.4S, v31.s[0] +str q2, [x0, #176] +ldr q2, [x0, #896] +sqrdmulh v3.4S, v2.4S, v28.s[0] +str q12, [x0, #112] +mul v2.4S, v2.4S,v29.s[0] +str q26, [x0, #48] +ldr q26, [x0, #960] +sqrdmulh v12.4S, v26.4S, v28.s[0] +sub v19.4s, v27.4s, v30.4s +str q19, [x0, #624] +mul v26.4S, v26.4S,v29.s[0] +add v27.4s, v27.4s, v30.4s +ldr q30, [x0, #768] +sqrdmulh v19.4S, v30.4S, v28.s[0] +sub v1.4s, v13.4s, v15.4s +str q27, [x0, #560] +mul v30.4S, v30.4S,v29.s[0] +add v13.4s, v13.4s, v15.4s +ldr q15, [x0, #832] +sqrdmulh v27.4S, v15.4S, v28.s[0] +sub v16.4s, v14.4s, v10.4s +str q1, [x0, #752] +mul v15.4S, v15.4S,v29.s[0] +add v14.4s, v14.4s, v10.4s +ldr q10, [x0, #512] +mla v2.4S, v3.4S, v31.s[0] +sub v3.4s, v22.4s, v21.4s +str q13, [x0, #688] +sqrdmulh v13.4S, v10.4S, v28.s[0] +add v22.4s, v22.4s, v21.4s +ldr q21, [x0, #576] +mla v26.4S, v12.4S, v31.s[0] +str q16, [x0, #880] +sqrdmulh v16.4S, v21.4S, v28.s[0] 
+ldr q12, [x0, #640] +mla v30.4S, v19.4S, v31.s[0] +str q14, [x0, #816] +sqrdmulh v14.4S, v12.4S, v28.s[0] +ldr q19, [x0, #704] +mla v15.4S, v27.4S, v31.s[0] +str q3, [x0, #1008] +sqrdmulh v3.4S, v19.4S, v28.s[0] +ldr q27, [x0, #384] +ldr q1, [x0, #448] +mul v10.4S, v10.4S,v29.s[0] +sub v25.4s, v27.4s, v2.4s +str q22, [x0, #944] +mul v21.4S, v21.4S,v29.s[0] +add v27.4s, v27.4s, v2.4s +ldr q2, [x0, #256] +ldr q22, [x0, #320] +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v1.4s, v26.4s +mla v21.4S, v16.4S, v31.s[0] +add v1.4s, v1.4s, v26.4s +ldr q26, [x0, #0] +ldr q16, [x0, #64] +mul v12.4S, v12.4S,v29.s[0] +sub v20.4s, v2.4s, v30.4s +mul v19.4S, v19.4S,v29.s[0] +add v2.4s, v2.4s, v30.4s +ldr q30, [x0, #128] +ldr q8, [x0, #192] +mla v12.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v15.4s +mla v19.4S, v3.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v27.4S, v28.s[1] +mul v27.4S, v27.4S,v29.s[1] +sqrdmulh v3.4S, v1.4S, v28.s[1] +sub v24.4s, v26.4s, v10.4s +mul v1.4S, v1.4S,v29.s[1] +add v26.4s, v26.4s, v10.4s +sqrdmulh v10.4S, v2.4S, v28.s[1] +sub v18.4s, v16.4s, v21.4s +mul v2.4S, v2.4S,v29.s[1] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v28.s[1] +sub v6.4s, v30.4s, v12.4s +mul v22.4S, v22.4S,v29.s[1] +add v30.4s, v30.4s, v12.4s +mla v27.4S, v15.4S, v31.s[0] +sub v15.4s, v8.4s, v19.4s +sqrdmulh v12.4S, v25.4S, v28.s[2] +add v8.4s, v8.4s, v19.4s +mla v1.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v13.4S, v28.s[2] +mla v2.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v20.4S, v28.s[2] +mla v22.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v14.4S, v28.s[2] +mul v25.4S, v25.4S,v29.s[2] +sub v19.4s, v30.4s, v27.4s +mul v13.4S, v13.4S,v29.s[2] +add v30.4s, v30.4s, v27.4s +mla v25.4S, v12.4S, v31.s[0] +sub v12.4s, v8.4s, v1.4s +mla v13.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v1.4s +mul v20.4S, v20.4S,v29.s[2] +sub v1.4s, v26.4s, v2.4s +mul v14.4S, v14.4S,v29.s[2] +add v26.4s, v26.4s, v2.4s +mla v20.4S, v10.4S, v31.s[0] +sub v10.4s, v16.4s, v22.4s +mla v14.4S, v21.4S, v31.s[0] +add 
v16.4s, v16.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v11.s[1] +mul v19.4S, v19.4S,v17.s[1] +sqrdmulh v21.4S, v12.4S, v11.s[1] +sub v2.4s, v6.4s, v25.4s +mul v12.4S, v12.4S,v17.s[1] +add v6.4s, v6.4s, v25.4s +sqrdmulh v25.4S, v30.4S, v11.s[0] +sub v3.4s, v15.4s, v13.4s +mul v30.4S, v30.4S,v17.s[0] +add v15.4s, v15.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v11.s[0] +sub v27.4s, v24.4s, v20.4s +mul v8.4S, v8.4S,v17.s[0] +add v24.4s, v24.4s, v20.4s +mla v19.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v14.4s +sqrdmulh v20.4S, v6.4S, v11.s[2] +add v18.4s, v18.4s, v14.4s +mla v12.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v15.4S, v11.s[2] +mla v30.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v2.4S, v11.s[3] +mla v8.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v3.4S, v11.s[3] +mul v6.4S, v6.4S,v17.s[2] +sub v14.4s, v1.4s, v19.4s +mul v15.4S, v15.4S,v17.s[2] +add v1.4s, v1.4s, v19.4s +mla v6.4S, v20.4S, v31.s[0] +sub v20.4s, v10.4s, v12.4s +mla v15.4S, v21.4S, v31.s[0] +add v10.4s, v10.4s, v12.4s +mul v2.4S, v2.4S,v17.s[3] +sub v12.4s, v26.4s, v30.4s +mul v3.4S, v3.4S,v17.s[3] +add v26.4s, v26.4s, v30.4s +mla v2.4S, v25.4S, v31.s[0] +sub v25.4s, v16.4s, v8.4s +mla v3.4S, v13.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v10.4S, v9.s[2] +mul v10.4S, v10.4S,v0.s[2] +sqrdmulh v13.4S, v20.4S, v9.s[3] +sub v30.4s, v24.4s, v6.4s +mul v20.4S, v20.4S,v0.s[3] +add v24.4s, v24.4s, v6.4s +sqrdmulh v6.4S, v25.4S, v9.s[1] +sub v21.4s, v18.4s, v15.4s +mul v25.4S, v25.4S,v0.s[1] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v9.s[0] +sub v19.4s, v27.4s, v2.4s +mul v16.4S, v16.4S,v0.s[0] +add v27.4s, v27.4s, v2.4s +mla v10.4S, v8.4S, v31.s[0] +sub v8.4s, v22.4s, v3.4s +sqrdmulh v2.4S, v18.4S, v7.s[0] +add v22.4s, v22.4s, v3.4s +mla v20.4S, v13.4S, v31.s[0] +sub v13.4s, v1.4s, v10.4s +sqrdmulh v3.4S, v21.4S, v7.s[1] +add v1.4s, v1.4s, v10.4s +mla v25.4S, v6.4S, v31.s[0] +sub v6.4s, v14.4s, v20.4s +sqrdmulh v10.4S, v22.4S, v7.s[2] +add v14.4s, v14.4s, v20.4s +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, 
v12.4s, v25.4s +sqrdmulh v20.4S, v8.4S, v7.s[3] +add v12.4s, v12.4s, v25.4s +mul v18.4S, v18.4S,v23.s[0] +sub v25.4s, v26.4s, v16.4s +mul v21.4S, v21.4S,v23.s[1] +add v26.4s, v26.4s, v16.4s +mla v18.4S, v2.4S, v31.s[0] +str q13, [x0, #320] +mla v21.4S, v3.4S, v31.s[0] +str q1, [x0, #256] +mul v22.4S, v22.4S,v23.s[2] +str q6, [x0, #448] +mul v8.4S, v8.4S,v23.s[3] +str q14, [x0, #384] +mla v22.4S, v10.4S, v31.s[0] +str q15, [x0, #192] +mla v8.4S, v20.4S, v31.s[0] +str q12, [x0, #128] +ldr q12, [x0, #912] +sqrdmulh v20.4S, v12.4S, v28.s[0] +str q25, [x0, #64] +mul v12.4S, v12.4S,v29.s[0] +str q26, [x0, #0] +ldr q26, [x0, #976] +sqrdmulh v25.4S, v26.4S, v28.s[0] +sub v15.4s, v24.4s, v18.4s +str q15, [x0, #576] +mul v26.4S, v26.4S,v29.s[0] +add v24.4s, v24.4s, v18.4s +ldr q18, [x0, #784] +sqrdmulh v15.4S, v18.4S, v28.s[0] +sub v10.4s, v30.4s, v21.4s +str q24, [x0, #512] +mul v18.4S, v18.4S,v29.s[0] +add v30.4s, v30.4s, v21.4s +ldr q21, [x0, #848] +sqrdmulh v24.4S, v21.4S, v28.s[0] +sub v14.4s, v27.4s, v22.4s +str q10, [x0, #704] +mul v21.4S, v21.4S,v29.s[0] +add v27.4s, v27.4s, v22.4s +ldr q22, [x0, #528] +mla v12.4S, v20.4S, v31.s[0] +sub v20.4s, v19.4s, v8.4s +str q30, [x0, #640] +sqrdmulh v30.4S, v22.4S, v28.s[0] +add v19.4s, v19.4s, v8.4s +ldr q8, [x0, #592] +mla v26.4S, v25.4S, v31.s[0] +str q14, [x0, #832] +sqrdmulh v14.4S, v8.4S, v28.s[0] +ldr q25, [x0, #656] +mla v18.4S, v15.4S, v31.s[0] +str q27, [x0, #768] +sqrdmulh v27.4S, v25.4S, v28.s[0] +ldr q15, [x0, #720] +mla v21.4S, v24.4S, v31.s[0] +str q20, [x0, #960] +sqrdmulh v20.4S, v15.4S, v28.s[0] +ldr q24, [x0, #400] +ldr q10, [x0, #464] +mul v22.4S, v22.4S,v29.s[0] +sub v6.4s, v24.4s, v12.4s +str q19, [x0, #896] +mul v8.4S, v8.4S,v29.s[0] +add v24.4s, v24.4s, v12.4s +ldr q12, [x0, #272] +ldr q19, [x0, #336] +mla v22.4S, v30.4S, v31.s[0] +sub v30.4s, v10.4s, v26.4s +mla v8.4S, v14.4S, v31.s[0] +add v10.4s, v10.4s, v26.4s +ldr q26, [x0, #16] +ldr q14, [x0, #80] +mul v25.4S, v25.4S,v29.s[0] +sub v1.4s, v12.4s, 
v18.4s +mul v15.4S, v15.4S,v29.s[0] +add v12.4s, v12.4s, v18.4s +ldr q18, [x0, #144] +ldr q3, [x0, #208] +mla v25.4S, v27.4S, v31.s[0] +sub v27.4s, v19.4s, v21.4s +mla v15.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +sqrdmulh v21.4S, v24.4S, v28.s[1] +mul v24.4S, v24.4S,v29.s[1] +sqrdmulh v20.4S, v10.4S, v28.s[1] +sub v13.4s, v26.4s, v22.4s +mul v10.4S, v10.4S,v29.s[1] +add v26.4s, v26.4s, v22.4s +sqrdmulh v22.4S, v12.4S, v28.s[1] +sub v2.4s, v14.4s, v8.4s +mul v12.4S, v12.4S,v29.s[1] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v28.s[1] +sub v16.4s, v18.4s, v25.4s +mul v19.4S, v19.4S,v29.s[1] +add v18.4s, v18.4s, v25.4s +mla v24.4S, v21.4S, v31.s[0] +sub v21.4s, v3.4s, v15.4s +sqrdmulh v25.4S, v6.4S, v28.s[2] +add v3.4s, v3.4s, v15.4s +mla v10.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v30.4S, v28.s[2] +mla v12.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v1.4S, v28.s[2] +mla v19.4S, v8.4S, v31.s[0] +sqrdmulh v8.4S, v27.4S, v28.s[2] +mul v6.4S, v6.4S,v29.s[2] +sub v15.4s, v18.4s, v24.4s +mul v30.4S, v30.4S,v29.s[2] +add v18.4s, v18.4s, v24.4s +mla v6.4S, v25.4S, v31.s[0] +sub v25.4s, v3.4s, v10.4s +mla v30.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v10.4s +mul v1.4S, v1.4S,v29.s[2] +sub v10.4s, v26.4s, v12.4s +mul v27.4S, v27.4S,v29.s[2] +add v26.4s, v26.4s, v12.4s +mla v1.4S, v22.4S, v31.s[0] +sub v22.4s, v14.4s, v19.4s +mla v27.4S, v8.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v28.4S, v15.4S, v11.s[1] +mul v15.4S, v15.4S,v17.s[1] +sqrdmulh v29.4S, v25.4S, v11.s[1] +sub v19.4s, v16.4s, v6.4s +mul v25.4S, v25.4S,v17.s[1] +add v16.4s, v16.4s, v6.4s +sqrdmulh v6.4S, v18.4S, v11.s[0] +sub v8.4s, v21.4s, v30.4s +mul v18.4S, v18.4S,v17.s[0] +add v21.4s, v21.4s, v30.4s +sqrdmulh v30.4S, v3.4S, v11.s[0] +sub v12.4s, v13.4s, v1.4s +mul v3.4S, v3.4S,v17.s[0] +add v13.4s, v13.4s, v1.4s +mla v15.4S, v28.4S, v31.s[0] +sub v28.4s, v2.4s, v27.4s +sqrdmulh v1.4S, v16.4S, v11.s[2] +add v2.4s, v2.4s, v27.4s +mla v25.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v21.4S, v11.s[2] 
+mla v18.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v19.4S, v11.s[3] +mla v3.4S, v30.4S, v31.s[0] +sqrdmulh v30.4S, v8.4S, v11.s[3] +mul v16.4S, v16.4S,v17.s[2] +sub v27.4s, v10.4s, v15.4s +mul v21.4S, v21.4S,v17.s[2] +add v10.4s, v10.4s, v15.4s +mla v16.4S, v1.4S, v31.s[0] +sub v1.4s, v22.4s, v25.4s +mla v21.4S, v29.4S, v31.s[0] +add v22.4s, v22.4s, v25.4s +mul v19.4S, v19.4S,v17.s[3] +sub v25.4s, v26.4s, v18.4s +mul v8.4S, v8.4S,v17.s[3] +add v26.4s, v26.4s, v18.4s +mla v19.4S, v6.4S, v31.s[0] +sub v6.4s, v14.4s, v3.4s +mla v8.4S, v30.4S, v31.s[0] +add v14.4s, v14.4s, v3.4s +sqrdmulh v11.4S, v22.4S, v9.s[2] +mul v22.4S, v22.4S,v0.s[2] +sqrdmulh v17.4S, v1.4S, v9.s[3] +sub v3.4s, v13.4s, v16.4s +mul v1.4S, v1.4S,v0.s[3] +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v6.4S, v9.s[1] +sub v30.4s, v2.4s, v21.4s +mul v6.4S, v6.4S,v0.s[1] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v9.s[0] +sub v18.4s, v12.4s, v19.4s +mul v14.4S, v14.4S,v0.s[0] +add v12.4s, v12.4s, v19.4s +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v28.4s, v8.4s +sqrdmulh v9.4S, v2.4S, v7.s[0] +add v28.4s, v28.4s, v8.4s +mla v1.4S, v17.4S, v31.s[0] +sub v17.4s, v10.4s, v22.4s +sqrdmulh v8.4S, v30.4S, v7.s[1] +add v10.4s, v10.4s, v22.4s +mla v6.4S, v16.4S, v31.s[0] +sub v16.4s, v27.4s, v1.4s +sqrdmulh v22.4S, v28.4S, v7.s[2] +add v27.4s, v27.4s, v1.4s +mla v14.4S, v21.4S, v31.s[0] +sub v21.4s, v25.4s, v6.4s +sqrdmulh v1.4S, v11.4S, v7.s[3] +add v25.4s, v25.4s, v6.4s +mul v2.4S, v2.4S,v23.s[0] +sub v6.4s, v26.4s, v14.4s +mul v30.4S, v30.4S,v23.s[1] +add v26.4s, v26.4s, v14.4s +mla v2.4S, v9.4S, v31.s[0] +str q17, [x0, #336] +mla v30.4S, v8.4S, v31.s[0] +str q10, [x0, #272] +mul v28.4S, v28.4S,v23.s[2] +str q16, [x0, #464] +mul v11.4S, v11.4S,v23.s[3] +str q27, [x0, #400] +mla v28.4S, v22.4S, v31.s[0] +str q21, [x0, #208] +mla v11.4S, v1.4S, v31.s[0] +str q25, [x0, #144] +str q6, [x0, #80] +str q26, [x0, #16] +sub v26.4s, v13.4s, v2.4s +str q26, [x0, #592] +add v13.4s, v13.4s, v2.4s +sub v2.4s, v3.4s, 
v30.4s +str q13, [x0, #528] +add v3.4s, v3.4s, v30.4s +sub v30.4s, v12.4s, v28.4s +str q2, [x0, #720] +add v12.4s, v12.4s, v28.4s +sub v28.4s, v18.4s, v11.4s +str q3, [x0, #656] +add v18.4s, v18.4s, v11.4s +str q30, [x0, #848] +str q12, [x0, #784] +str q28, [x0, #976] +str q18, [x0, #912] +ldr q4, [x0, #224] +ldr q5, [x0, #160] +ldr q24, [x0, #32] +ldr q20, [x17, #+128] +ldr q15, [x17, #+144] +sqrdmulh v29.4S, v24.4S, v15.s[0] +mul v24.4S, v24.4S,v20.s[0] +ldr q19, [x0, #48] +sqrdmulh v0.4S, v19.4S, v15.s[0] +mul v19.4S, v19.4S,v20.s[0] +ldr q14, [x17, #+160] +ldr q9, [x17, #+176] +ldr q17, [x0, #96] +sqrdmulh v8.4S, v17.4S, v9.s[0] +mul v17.4S, v17.4S,v14.s[0] +ldr q10, [x0, #112] +sqrdmulh v16.4S, v10.4S, v9.s[0] +mul v10.4S, v10.4S,v14.s[0] +ldr q27, [x17, #+192] +ldr q22, [x17, #+208] +mla v24.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v5.4S, v22.s[0] +ldr q21, [x0, #176] +mla v19.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v21.4S, v22.s[0] +ldr q1, [x17, #+224] +ldr q25, [x17, #+240] +mla v17.4S, v8.4S, v31.s[0] +sqrdmulh v8.4S, v4.4S, v25.s[0] +ldr q23, [x0, #240] +mla v10.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v23.4S, v25.s[0] +ldr q7, [x0, #0] +ldr q6, [x0, #128] +mul v5.4S, v5.4S,v27.s[0] +sub v26.4s, v7.4s, v24.4s +ldr q13, [x0, #16] +mul v21.4S, v21.4S,v27.s[0] +add v7.4s, v7.4s, v24.4s +ldr q24, [x0, #144] +mla v5.4S, v29.4S, v31.s[0] +sub v29.4s, v13.4s, v19.4s +ldr q2, [x0, #64] +mla v21.4S, v0.4S, v31.s[0] +add v13.4s, v13.4s, v19.4s +ldr q19, [x0, #192] +mul v4.4S, v4.4S,v1.s[0] +sub v0.4s, v2.4s, v17.4s +ldr q3, [x0, #80] +mul v23.4S, v23.4S,v1.s[0] +add v2.4s, v2.4s, v17.4s +ldr q17, [x0, #208] +mla v4.4S, v8.4S, v31.s[0] +mla v23.4S, v16.4S, v31.s[0] +sub v16.4s, v3.4s, v10.4s +sqrdmulh v8.4S, v13.4S, v15.s[1] +add v3.4s, v3.4s, v10.4s +mul v13.4S, v13.4S,v20.s[1] +sqrdmulh v10.4S, v29.4S, v15.s[2] +sub v11.4s, v6.4s, v5.4s +mul v29.4S, v29.4S,v20.s[2] +add v6.4s, v6.4s, v5.4s +sqrdmulh v15.4S, v3.4S, v9.s[1] +sub v20.4s, v24.4s, v21.4s +mul v3.4S, 
v3.4S,v14.s[1] +add v24.4s, v24.4s, v21.4s +sqrdmulh v21.4S, v16.4S, v9.s[2] +sub v5.4s, v19.4s, v4.4s +mul v16.4S, v16.4S,v14.s[2] +add v19.4s, v19.4s, v4.4s +mla v13.4S, v8.4S, v31.s[0] +sub v8.4s, v17.4s, v23.4s +ldr q9, [x0, #480] +sqrdmulh v14.4S, v24.4S, v22.s[1] +add v17.4s, v17.4s, v23.4s +mla v29.4S, v10.4S, v31.s[0] +ldr q10, [x0, #416] +sqrdmulh v23.4S, v20.4S, v22.s[2] +sub v4.4s, v7.4s, v13.4s +mla v3.4S, v15.4S, v31.s[0] +ldr q15, [x0, #288] +sqrdmulh v30.4S, v17.4S, v25.s[1] +add v7.4s, v7.4s, v13.4s +str q4, [x0, #16] +mla v16.4S, v21.4S, v31.s[0] +ldr q21, [x17, #+256] +ldr q4, [x17, #+272] +sqrdmulh v13.4S, v8.4S, v25.s[2] +sub v12.4s, v26.4s, v29.4s +str q7, [x0, #0] +mul v24.4S, v24.4S,v27.s[1] +add v26.4s, v26.4s, v29.4s +mul v20.4S, v20.4S,v27.s[2] +str q12, [x0, #48] +mla v24.4S, v14.4S, v31.s[0] +sub v14.4s, v2.4s, v3.4s +mla v20.4S, v23.4S, v31.s[0] +str q26, [x0, #32] +mul v17.4S, v17.4S,v1.s[1] +str q14, [x0, #80] +mul v8.4S, v8.4S,v1.s[2] +add v2.4s, v2.4s, v3.4s +str q2, [x0, #64] +mla v17.4S, v30.4S, v31.s[0] +sub v30.4s, v0.4s, v16.4s +str q30, [x0, #112] +mla v8.4S, v13.4S, v31.s[0] +add v0.4s, v0.4s, v16.4s +str q0, [x0, #96] +sqrdmulh v25.4S, v15.4S, v4.s[0] +sub v1.4s, v6.4s, v24.4s +mul v15.4S, v15.4S,v21.s[0] +str q1, [x0, #144] +ldr q1, [x0, #304] +sqrdmulh v0.4S, v1.4S, v4.s[0] +add v6.4s, v6.4s, v24.4s +mul v1.4S, v1.4S,v21.s[0] +str q6, [x0, #128] +ldr q6, [x17, #+288] +ldr q24, [x17, #+304] +ldr q16, [x0, #352] +sqrdmulh v13.4S, v16.4S, v24.s[0] +sub v30.4s, v11.4s, v20.4s +mul v16.4S, v16.4S,v6.s[0] +str q30, [x0, #176] +ldr q30, [x0, #368] +sqrdmulh v2.4S, v30.4S, v24.s[0] +add v11.4s, v11.4s, v20.4s +mul v30.4S, v30.4S,v6.s[0] +str q11, [x0, #160] +ldr q11, [x17, #+320] +ldr q20, [x17, #+336] +mla v15.4S, v25.4S, v31.s[0] +sub v25.4s, v19.4s, v17.4s +sqrdmulh v3.4S, v10.4S, v20.s[0] +str q25, [x0, #208] +ldr q25, [x0, #432] +mla v1.4S, v0.4S, v31.s[0] +add v19.4s, v19.4s, v17.4s +sqrdmulh v17.4S, v25.4S, v20.s[0] +str 
q19, [x0, #192] +ldr q19, [x17, #+352] +ldr q0, [x17, #+368] +mla v16.4S, v13.4S, v31.s[0] +sub v13.4s, v5.4s, v8.4s +sqrdmulh v14.4S, v9.4S, v0.s[0] +str q13, [x0, #240] +ldr q13, [x0, #496] +mla v30.4S, v2.4S, v31.s[0] +add v5.4s, v5.4s, v8.4s +sqrdmulh v8.4S, v13.4S, v0.s[0] +str q5, [x0, #224] +ldr q5, [x0, #256] +ldr q2, [x0, #384] +mul v10.4S, v10.4S,v11.s[0] +sub v22.4s, v5.4s, v15.4s +ldr q27, [x0, #272] +mul v25.4S, v25.4S,v11.s[0] +add v5.4s, v5.4s, v15.4s +ldr q15, [x0, #400] +mla v10.4S, v3.4S, v31.s[0] +sub v3.4s, v27.4s, v1.4s +ldr q26, [x0, #320] +mla v25.4S, v17.4S, v31.s[0] +add v27.4s, v27.4s, v1.4s +ldr q1, [x0, #448] +mul v9.4S, v9.4S,v19.s[0] +sub v17.4s, v26.4s, v16.4s +ldr q23, [x0, #336] +mul v13.4S, v13.4S,v19.s[0] +add v26.4s, v26.4s, v16.4s +ldr q16, [x0, #464] +mla v9.4S, v14.4S, v31.s[0] +mla v13.4S, v8.4S, v31.s[0] +sub v8.4s, v23.4s, v30.4s +sqrdmulh v14.4S, v27.4S, v4.s[1] +add v23.4s, v23.4s, v30.4s +mul v27.4S, v27.4S,v21.s[1] +sqrdmulh v30.4S, v3.4S, v4.s[2] +sub v12.4s, v2.4s, v10.4s +mul v3.4S, v3.4S,v21.s[2] +add v2.4s, v2.4s, v10.4s +sqrdmulh v4.4S, v23.4S, v24.s[1] +sub v21.4s, v15.4s, v25.4s +mul v23.4S, v23.4S,v6.s[1] +add v15.4s, v15.4s, v25.4s +sqrdmulh v25.4S, v8.4S, v24.s[2] +sub v10.4s, v1.4s, v9.4s +mul v8.4S, v8.4S,v6.s[2] +add v1.4s, v1.4s, v9.4s +mla v27.4S, v14.4S, v31.s[0] +sub v14.4s, v16.4s, v13.4s +ldr q24, [x0, #736] +sqrdmulh v6.4S, v15.4S, v20.s[1] +add v16.4s, v16.4s, v13.4s +mla v3.4S, v30.4S, v31.s[0] +ldr q30, [x0, #672] +sqrdmulh v13.4S, v21.4S, v20.s[2] +sub v9.4s, v5.4s, v27.4s +mla v23.4S, v4.4S, v31.s[0] +ldr q4, [x0, #544] +sqrdmulh v29.4S, v16.4S, v0.s[1] +add v5.4s, v5.4s, v27.4s +str q9, [x0, #272] +mla v8.4S, v25.4S, v31.s[0] +ldr q25, [x17, #+384] +ldr q9, [x17, #+400] +sqrdmulh v27.4S, v14.4S, v0.s[2] +sub v7.4s, v22.4s, v3.4s +str q5, [x0, #256] +mul v15.4S, v15.4S,v11.s[1] +add v22.4s, v22.4s, v3.4s +mul v21.4S, v21.4S,v11.s[2] +str q7, [x0, #304] +mla v15.4S, v6.4S, v31.s[0] +sub v6.4s, 
v26.4s, v23.4s +mla v21.4S, v13.4S, v31.s[0] +str q22, [x0, #288] +mul v16.4S, v16.4S,v19.s[1] +str q6, [x0, #336] +mul v14.4S, v14.4S,v19.s[2] +add v26.4s, v26.4s, v23.4s +str q26, [x0, #320] +mla v16.4S, v29.4S, v31.s[0] +sub v29.4s, v17.4s, v8.4s +str q29, [x0, #368] +mla v14.4S, v27.4S, v31.s[0] +add v17.4s, v17.4s, v8.4s +str q17, [x0, #352] +sqrdmulh v0.4S, v4.4S, v9.s[0] +sub v19.4s, v2.4s, v15.4s +mul v4.4S, v4.4S,v25.s[0] +str q19, [x0, #400] +ldr q19, [x0, #560] +sqrdmulh v17.4S, v19.4S, v9.s[0] +add v2.4s, v2.4s, v15.4s +mul v19.4S, v19.4S,v25.s[0] +str q2, [x0, #384] +ldr q2, [x17, #+416] +ldr q15, [x17, #+432] +ldr q8, [x0, #608] +sqrdmulh v27.4S, v8.4S, v15.s[0] +sub v29.4s, v12.4s, v21.4s +mul v8.4S, v8.4S,v2.s[0] +str q29, [x0, #432] +ldr q29, [x0, #624] +sqrdmulh v26.4S, v29.4S, v15.s[0] +add v12.4s, v12.4s, v21.4s +mul v29.4S, v29.4S,v2.s[0] +str q12, [x0, #416] +ldr q12, [x17, #+448] +ldr q21, [x17, #+464] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v1.4s, v16.4s +sqrdmulh v23.4S, v30.4S, v21.s[0] +str q0, [x0, #464] +ldr q0, [x0, #688] +mla v19.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v16.4s +sqrdmulh v16.4S, v0.4S, v21.s[0] +str q1, [x0, #448] +ldr q1, [x17, #+480] +ldr q17, [x17, #+496] +mla v8.4S, v27.4S, v31.s[0] +sub v27.4s, v10.4s, v14.4s +sqrdmulh v6.4S, v24.4S, v17.s[0] +str q27, [x0, #496] +ldr q27, [x0, #752] +mla v29.4S, v26.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v27.4S, v17.s[0] +str q10, [x0, #480] +ldr q10, [x0, #512] +ldr q26, [x0, #640] +mul v30.4S, v30.4S,v12.s[0] +sub v20.4s, v10.4s, v4.4s +ldr q11, [x0, #528] +mul v0.4S, v0.4S,v12.s[0] +add v10.4s, v10.4s, v4.4s +ldr q4, [x0, #656] +mla v30.4S, v23.4S, v31.s[0] +sub v23.4s, v11.4s, v19.4s +ldr q22, [x0, #576] +mla v0.4S, v16.4S, v31.s[0] +add v11.4s, v11.4s, v19.4s +ldr q19, [x0, #704] +mul v24.4S, v24.4S,v1.s[0] +sub v16.4s, v22.4s, v8.4s +ldr q13, [x0, #592] +mul v27.4S, v27.4S,v1.s[0] +add v22.4s, v22.4s, v8.4s +ldr q8, [x0, #720] +mla v24.4S, v6.4S, v31.s[0] 
+mla v27.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v29.4s +sqrdmulh v6.4S, v11.4S, v9.s[1] +add v13.4s, v13.4s, v29.4s +mul v11.4S, v11.4S,v25.s[1] +sqrdmulh v29.4S, v23.4S, v9.s[2] +sub v7.4s, v26.4s, v30.4s +mul v23.4S, v23.4S,v25.s[2] +add v26.4s, v26.4s, v30.4s +sqrdmulh v9.4S, v13.4S, v15.s[1] +sub v25.4s, v4.4s, v0.4s +mul v13.4S, v13.4S,v2.s[1] +add v4.4s, v4.4s, v0.4s +sqrdmulh v0.4S, v14.4S, v15.s[2] +sub v30.4s, v19.4s, v24.4s +mul v14.4S, v14.4S,v2.s[2] +add v19.4s, v19.4s, v24.4s +mla v11.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v27.4s +ldr q15, [x0, #992] +sqrdmulh v2.4S, v4.4S, v21.s[1] +add v8.4s, v8.4s, v27.4s +mla v23.4S, v29.4S, v31.s[0] +ldr q29, [x0, #928] +sqrdmulh v27.4S, v25.4S, v21.s[2] +sub v24.4s, v10.4s, v11.4s +mla v13.4S, v9.4S, v31.s[0] +ldr q9, [x0, #800] +sqrdmulh v3.4S, v8.4S, v17.s[1] +add v10.4s, v10.4s, v11.4s +str q24, [x0, #528] +mla v14.4S, v0.4S, v31.s[0] +ldr q0, [x17, #+512] +ldr q24, [x17, #+528] +sqrdmulh v11.4S, v6.4S, v17.s[2] +sub v5.4s, v20.4s, v23.4s +str q10, [x0, #512] +mul v4.4S, v4.4S,v12.s[1] +add v20.4s, v20.4s, v23.4s +mul v25.4S, v25.4S,v12.s[2] +str q5, [x0, #560] +mla v4.4S, v2.4S, v31.s[0] +sub v2.4s, v22.4s, v13.4s +mla v25.4S, v27.4S, v31.s[0] +str q20, [x0, #544] +mul v8.4S, v8.4S,v1.s[1] +str q2, [x0, #592] +mul v6.4S, v6.4S,v1.s[2] +add v22.4s, v22.4s, v13.4s +str q22, [x0, #576] +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v16.4s, v14.4s +str q3, [x0, #624] +mla v6.4S, v11.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +str q16, [x0, #608] +sqrdmulh v17.4S, v9.4S, v24.s[0] +sub v1.4s, v26.4s, v4.4s +mul v9.4S, v9.4S,v0.s[0] +str q1, [x0, #656] +ldr q1, [x0, #816] +sqrdmulh v16.4S, v1.4S, v24.s[0] +add v26.4s, v26.4s, v4.4s +mul v1.4S, v1.4S,v0.s[0] +str q26, [x0, #640] +ldr q26, [x17, #+544] +ldr q4, [x17, #+560] +ldr q14, [x0, #864] +sqrdmulh v11.4S, v14.4S, v4.s[0] +sub v3.4s, v7.4s, v25.4s +mul v14.4S, v14.4S,v26.s[0] +str q3, [x0, #688] +ldr q3, [x0, #880] +sqrdmulh v22.4S, v3.4S, v4.s[0] +add v7.4s, v7.4s, 
v25.4s +mul v3.4S, v3.4S,v26.s[0] +str q7, [x0, #672] +ldr q7, [x17, #+576] +ldr q25, [x17, #+592] +mla v9.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v8.4s +sqrdmulh v13.4S, v29.4S, v25.s[0] +str q17, [x0, #720] +ldr q17, [x0, #944] +mla v1.4S, v16.4S, v31.s[0] +add v19.4s, v19.4s, v8.4s +sqrdmulh v8.4S, v17.4S, v25.s[0] +str q19, [x0, #704] +ldr q19, [x17, #+608] +ldr q16, [x17, #+624] +mla v14.4S, v11.4S, v31.s[0] +sub v11.4s, v30.4s, v6.4s +sqrdmulh v2.4S, v15.4S, v16.s[0] +str q11, [x0, #752] +ldr q11, [x0, #1008] +mla v3.4S, v22.4S, v31.s[0] +add v30.4s, v30.4s, v6.4s +sqrdmulh v6.4S, v11.4S, v16.s[0] +str q30, [x0, #736] +ldr q30, [x0, #768] +ldr q22, [x0, #896] +mul v29.4S, v29.4S,v7.s[0] +sub v21.4s, v30.4s, v9.4s +ldr q12, [x0, #784] +mul v17.4S, v17.4S,v7.s[0] +add v30.4s, v30.4s, v9.4s +ldr q9, [x0, #912] +mla v29.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v1.4s +ldr q20, [x0, #832] +mla v17.4S, v8.4S, v31.s[0] +add v12.4s, v12.4s, v1.4s +ldr q1, [x0, #960] +mul v15.4S, v15.4S,v19.s[0] +sub v8.4s, v20.4s, v14.4s +ldr q27, [x0, #848] +mul v11.4S, v11.4S,v19.s[0] +add v20.4s, v20.4s, v14.4s +ldr q14, [x0, #976] +mla v15.4S, v2.4S, v31.s[0] +mla v11.4S, v6.4S, v31.s[0] +sub v6.4s, v27.4s, v3.4s +sqrdmulh v2.4S, v12.4S, v24.s[1] +add v27.4s, v27.4s, v3.4s +mul v12.4S, v12.4S,v0.s[1] +sqrdmulh v3.4S, v13.4S, v24.s[2] +sub v5.4s, v22.4s, v29.4s +mul v13.4S, v13.4S,v0.s[2] +add v22.4s, v22.4s, v29.4s +sqrdmulh v24.4S, v27.4S, v4.s[1] +sub v0.4s, v9.4s, v17.4s +mul v27.4S, v27.4S,v26.s[1] +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v6.4S, v4.s[2] +sub v29.4s, v1.4s, v15.4s +mul v6.4S, v6.4S,v26.s[2] +add v1.4s, v1.4s, v15.4s +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v14.4s, v11.4s +sqrdmulh v4.4S, v9.4S, v25.s[1] +add v14.4s, v14.4s, v11.4s +mla v13.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v0.4S, v25.s[2] +sub v11.4s, v30.4s, v12.4s +mla v27.4S, v24.4S, v31.s[0] +sqrdmulh v24.4S, v14.4S, v16.s[1] +add v30.4s, v30.4s, v12.4s +str q11, [x0, #784] +mla v6.4S, v17.4S, 
v31.s[0] +sqrdmulh v17.4S, v2.4S, v16.s[2] +sub v11.4s, v21.4s, v13.4s +str q30, [x0, #768] +mul v9.4S, v9.4S,v7.s[1] +add v21.4s, v21.4s, v13.4s +mul v0.4S, v0.4S,v7.s[2] +str q11, [x0, #816] +mla v9.4S, v4.4S, v31.s[0] +sub v4.4s, v20.4s, v27.4s +mla v0.4S, v3.4S, v31.s[0] +str q21, [x0, #800] +mul v14.4S, v14.4S,v19.s[1] +str q4, [x0, #848] +mul v2.4S, v2.4S,v19.s[2] +add v20.4s, v20.4s, v27.4s +str q20, [x0, #832] +mla v14.4S, v24.4S, v31.s[0] +sub v24.4s, v8.4s, v6.4s +str q24, [x0, #880] +mla v2.4S, v17.4S, v31.s[0] +add v8.4s, v8.4s, v6.4s +str q8, [x0, #864] +sub v16.4s, v22.4s, v9.4s +str q16, [x0, #912] +add v22.4s, v22.4s, v9.4s +str q22, [x0, #896] +sub v22.4s, v5.4s, v0.4s +str q22, [x0, #944] +add v5.4s, v5.4s, v0.4s +str q5, [x0, #928] +sub v5.4s, v1.4s, v14.4s +str q5, [x0, #976] +add v1.4s, v1.4s, v14.4s +str q1, [x0, #960] +sub v1.4s, v29.4s, v2.4s +str q1, [x0, #1008] +add v29.4s, v29.4s, v2.4s +str q29, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_13_z4_7.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_13_z4_7.s new file mode 100644 index 0000000..4533d8c --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_13_z4_7.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, 
including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 
+.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 
31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 
// Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global ntt_u32_incomplete_neon_asm_var_4_2_13_z4_7 +.global _ntt_u32_incomplete_neon_asm_var_4_2_13_z4_7 +ntt_u32_incomplete_neon_asm_var_4_2_13_z4_7: +_ntt_u32_incomplete_neon_asm_var_4_2_13_z4_7: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #928] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #992] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q18, [x0, #800] +sqrdmulh v17.4S, v18.4S, v29.s[0] +mul 
v18.4S, v18.4S,v30.s[0] +ldr q16, [x0, #864] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +ldr q2, [x0, #544] +mla v22.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v2.4S, v29.s[0] +ldr q1, [x0, #608] +mla v20.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v1.4S, v29.s[0] +ldr q0, [x0, #672] +mla v18.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v0.4S, v29.s[0] +ldr q15, [x0, #736] +mla v16.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v15.4S, v29.s[0] +ldr q14, [x0, #416] +ldr q13, [x0, #480] +mul v2.4S, v2.4S,v30.s[0] +sub v12.4s, v14.4s, v22.4s +mul v1.4S, v1.4S,v30.s[0] +add v14.4s, v14.4s, v22.4s +ldr q22, [x0, #288] +ldr q11, [x0, #352] +mla v2.4S, v21.4S, v31.s[0] +sub v21.4s, v13.4s, v20.4s +mla v1.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v20.4s +ldr q20, [x0, #32] +ldr q19, [x0, #96] +mul v0.4S, v0.4S,v30.s[0] +sub v10.4s, v22.4s, v18.4s +mul v15.4S, v15.4S,v30.s[0] +add v22.4s, v22.4s, v18.4s +ldr q18, [x0, #160] +ldr q9, [x0, #224] +mla v0.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v16.4s +mla v15.4S, v3.4S, v31.s[0] +add v11.4s, v11.4s, v16.4s +sqrdmulh v16.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +sqrdmulh v3.4S, v13.4S, v29.s[1] +sub v8.4s, v20.4s, v2.4s +mul v13.4S, v13.4S,v30.s[1] +add v20.4s, v20.4s, v2.4s +sqrdmulh v2.4S, v22.4S, v29.s[1] +sub v7.4s, v19.4s, v1.4s +mul v22.4S, v22.4S,v30.s[1] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v11.4S, v29.s[1] +sub v6.4s, v18.4s, v0.4s +mul v11.4S, v11.4S,v30.s[1] +add v18.4s, v18.4s, v0.4s +mla v14.4S, v16.4S, v31.s[0] +sub v16.4s, v9.4s, v15.4s +sqrdmulh v0.4S, v12.4S, v29.s[2] +add v9.4s, v9.4s, v15.4s +mla v13.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v21.4S, v29.s[2] +mla v22.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v10.4S, v29.s[2] +mla v11.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v17.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +sub v15.4s, v18.4s, v14.4s +mul v21.4S, v21.4S,v30.s[2] +add v18.4s, v18.4s, v14.4s +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v9.4s, v13.4s +mla v21.4S, v3.4S, v31.s[0] +add v9.4s, v9.4s, 
v13.4s +mul v10.4S, v10.4S,v30.s[2] +sub v13.4s, v20.4s, v22.4s +mul v17.4S, v17.4S,v30.s[2] +add v20.4s, v20.4s, v22.4s +mla v10.4S, v2.4S, v31.s[0] +sub v2.4s, v19.4s, v11.4s +mla v17.4S, v1.4S, v31.s[0] +add v19.4s, v19.4s, v11.4s +sqrdmulh v11.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +sqrdmulh v1.4S, v0.4S, v27.s[1] +sub v22.4s, v6.4s, v12.4s +mul v0.4S, v0.4S,v28.s[1] +add v6.4s, v6.4s, v12.4s +sqrdmulh v12.4S, v18.4S, v27.s[0] +sub v3.4s, v16.4s, v21.4s +mul v18.4S, v18.4S,v28.s[0] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v9.4S, v27.s[0] +sub v14.4s, v8.4s, v10.4s +mul v9.4S, v9.4S,v28.s[0] +add v8.4s, v8.4s, v10.4s +mla v15.4S, v11.4S, v31.s[0] +sub v11.4s, v7.4s, v17.4s +sqrdmulh v10.4S, v6.4S, v27.s[2] +add v7.4s, v7.4s, v17.4s +mla v0.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v16.4S, v27.s[2] +mla v18.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v22.4S, v27.s[3] +mla v9.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v3.4S, v27.s[3] +mul v6.4S, v6.4S,v28.s[2] +sub v17.4s, v13.4s, v15.4s +mul v16.4S, v16.4S,v28.s[2] +add v13.4s, v13.4s, v15.4s +mla v6.4S, v10.4S, v31.s[0] +sub v10.4s, v2.4s, v0.4s +mla v16.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v0.4s +mul v22.4S, v22.4S,v28.s[3] +sub v0.4s, v20.4s, v18.4s +mul v3.4S, v3.4S,v28.s[3] +add v20.4s, v20.4s, v18.4s +mla v22.4S, v12.4S, v31.s[0] +sub v12.4s, v19.4s, v9.4s +mla v3.4S, v21.4S, v31.s[0] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v2.4S, v25.s[2] +mul v2.4S, v2.4S,v26.s[2] +sqrdmulh v21.4S, v10.4S, v25.s[3] +sub v18.4s, v8.4s, v6.4s +mul v10.4S, v10.4S,v26.s[3] +add v8.4s, v8.4s, v6.4s +sqrdmulh v6.4S, v12.4S, v25.s[1] +sub v1.4s, v7.4s, v16.4s +mul v12.4S, v12.4S,v26.s[1] +add v7.4s, v7.4s, v16.4s +sqrdmulh v16.4S, v19.4S, v25.s[0] +sub v15.4s, v14.4s, v22.4s +mul v19.4S, v19.4S,v26.s[0] +add v14.4s, v14.4s, v22.4s +mla v2.4S, v9.4S, v31.s[0] +sub v9.4s, v11.4s, v3.4s +sqrdmulh v22.4S, v7.4S, v23.s[0] +add v11.4s, v11.4s, v3.4s +mla v10.4S, v21.4S, v31.s[0] +sub v21.4s, v13.4s, v2.4s +sqrdmulh v3.4S, 
v1.4S, v23.s[1] +add v13.4s, v13.4s, v2.4s +mla v12.4S, v6.4S, v31.s[0] +sub v6.4s, v17.4s, v10.4s +sqrdmulh v2.4S, v11.4S, v23.s[2] +add v17.4s, v17.4s, v10.4s +mla v19.4S, v16.4S, v31.s[0] +sub v16.4s, v0.4s, v12.4s +sqrdmulh v10.4S, v9.4S, v23.s[3] +add v0.4s, v0.4s, v12.4s +mul v7.4S, v7.4S,v24.s[0] +sub v12.4s, v20.4s, v19.4s +mul v1.4S, v1.4S,v24.s[1] +add v20.4s, v20.4s, v19.4s +mla v7.4S, v22.4S, v31.s[0] +str q21, [x0, #352] +mla v1.4S, v3.4S, v31.s[0] +str q13, [x0, #288] +mul v11.4S, v11.4S,v24.s[2] +str q6, [x0, #480] +mul v9.4S, v9.4S,v24.s[3] +str q17, [x0, #416] +mla v11.4S, v2.4S, v31.s[0] +str q16, [x0, #224] +mla v9.4S, v10.4S, v31.s[0] +str q0, [x0, #160] +ldr q0, [x0, #944] +sqrdmulh v10.4S, v0.4S, v29.s[0] +str q12, [x0, #96] +mul v0.4S, v0.4S,v30.s[0] +str q20, [x0, #32] +ldr q20, [x0, #1008] +sqrdmulh v12.4S, v20.4S, v29.s[0] +sub v16.4s, v8.4s, v7.4s +str q16, [x0, #608] +mul v20.4S, v20.4S,v30.s[0] +add v8.4s, v8.4s, v7.4s +ldr q7, [x0, #816] +sqrdmulh v16.4S, v7.4S, v29.s[0] +sub v2.4s, v18.4s, v1.4s +str q8, [x0, #544] +mul v7.4S, v7.4S,v30.s[0] +add v18.4s, v18.4s, v1.4s +ldr q1, [x0, #880] +sqrdmulh v8.4S, v1.4S, v29.s[0] +sub v17.4s, v14.4s, v11.4s +str q2, [x0, #736] +mul v1.4S, v1.4S,v30.s[0] +add v14.4s, v14.4s, v11.4s +ldr q11, [x0, #560] +mla v0.4S, v10.4S, v31.s[0] +sub v10.4s, v15.4s, v9.4s +str q18, [x0, #672] +sqrdmulh v18.4S, v11.4S, v29.s[0] +add v15.4s, v15.4s, v9.4s +ldr q9, [x0, #624] +mla v20.4S, v12.4S, v31.s[0] +str q17, [x0, #864] +sqrdmulh v17.4S, v9.4S, v29.s[0] +ldr q12, [x0, #688] +mla v7.4S, v16.4S, v31.s[0] +str q14, [x0, #800] +sqrdmulh v14.4S, v12.4S, v29.s[0] +ldr q16, [x0, #752] +mla v1.4S, v8.4S, v31.s[0] +str q10, [x0, #992] +sqrdmulh v10.4S, v16.4S, v29.s[0] +ldr q8, [x0, #432] +ldr q2, [x0, #496] +mul v11.4S, v11.4S,v30.s[0] +sub v6.4s, v8.4s, v0.4s +str q15, [x0, #928] +mul v9.4S, v9.4S,v30.s[0] +add v8.4s, v8.4s, v0.4s +ldr q0, [x0, #304] +ldr q15, [x0, #368] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, 
v2.4s, v20.4s +mla v9.4S, v17.4S, v31.s[0] +add v2.4s, v2.4s, v20.4s +ldr q20, [x0, #48] +ldr q17, [x0, #112] +mul v12.4S, v12.4S,v30.s[0] +sub v13.4s, v0.4s, v7.4s +mul v16.4S, v16.4S,v30.s[0] +add v0.4s, v0.4s, v7.4s +ldr q7, [x0, #176] +ldr q3, [x0, #240] +mla v12.4S, v14.4S, v31.s[0] +sub v14.4s, v15.4s, v1.4s +mla v16.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sqrdmulh v10.4S, v2.4S, v29.s[1] +sub v21.4s, v20.4s, v11.4s +mul v2.4S, v2.4S,v30.s[1] +add v20.4s, v20.4s, v11.4s +sqrdmulh v11.4S, v0.4S, v29.s[1] +sub v22.4s, v17.4s, v9.4s +mul v0.4S, v0.4S,v30.s[1] +add v17.4s, v17.4s, v9.4s +sqrdmulh v9.4S, v15.4S, v29.s[1] +sub v19.4s, v7.4s, v12.4s +mul v15.4S, v15.4S,v30.s[1] +add v7.4s, v7.4s, v12.4s +mla v8.4S, v1.4S, v31.s[0] +sub v1.4s, v3.4s, v16.4s +sqrdmulh v12.4S, v6.4S, v29.s[2] +add v3.4s, v3.4s, v16.4s +mla v2.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v18.4S, v29.s[2] +mla v0.4S, v11.4S, v31.s[0] +sqrdmulh v11.4S, v13.4S, v29.s[2] +mla v15.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v14.4S, v29.s[2] +mul v6.4S, v6.4S,v30.s[2] +sub v16.4s, v7.4s, v8.4s +mul v18.4S, v18.4S,v30.s[2] +add v7.4s, v7.4s, v8.4s +mla v6.4S, v12.4S, v31.s[0] +sub v12.4s, v3.4s, v2.4s +mla v18.4S, v10.4S, v31.s[0] +add v3.4s, v3.4s, v2.4s +mul v13.4S, v13.4S,v30.s[2] +sub v2.4s, v20.4s, v0.4s +mul v14.4S, v14.4S,v30.s[2] +add v20.4s, v20.4s, v0.4s +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v17.4s, v15.4s +mla v14.4S, v9.4S, v31.s[0] +add v17.4s, v17.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +sqrdmulh v9.4S, v12.4S, v27.s[1] +sub v0.4s, v19.4s, v6.4s +mul v12.4S, v12.4S,v28.s[1] +add v19.4s, v19.4s, v6.4s +sqrdmulh v6.4S, v7.4S, v27.s[0] +sub v10.4s, v1.4s, v18.4s +mul v7.4S, v7.4S,v28.s[0] +add v1.4s, v1.4s, v18.4s +sqrdmulh v18.4S, v3.4S, v27.s[0] +sub v8.4s, v21.4s, v13.4s +mul v3.4S, v3.4S,v28.s[0] +add v21.4s, v21.4s, v13.4s +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v22.4s, v14.4s 
+sqrdmulh v13.4S, v19.4S, v27.s[2] +add v22.4s, v22.4s, v14.4s +mla v12.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v1.4S, v27.s[2] +mla v7.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v0.4S, v27.s[3] +mla v3.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v10.4S, v27.s[3] +mul v19.4S, v19.4S,v28.s[2] +sub v14.4s, v2.4s, v16.4s +mul v1.4S, v1.4S,v28.s[2] +add v2.4s, v2.4s, v16.4s +mla v19.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v12.4s +mla v1.4S, v9.4S, v31.s[0] +add v11.4s, v11.4s, v12.4s +mul v0.4S, v0.4S,v28.s[3] +sub v12.4s, v20.4s, v7.4s +mul v10.4S, v10.4S,v28.s[3] +add v20.4s, v20.4s, v7.4s +mla v0.4S, v6.4S, v31.s[0] +sub v6.4s, v17.4s, v3.4s +mla v10.4S, v18.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v25.s[2] +mul v11.4S, v11.4S,v26.s[2] +sqrdmulh v18.4S, v13.4S, v25.s[3] +sub v7.4s, v21.4s, v19.4s +mul v13.4S, v13.4S,v26.s[3] +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v25.s[1] +sub v9.4s, v22.4s, v1.4s +mul v6.4S, v6.4S,v26.s[1] +add v22.4s, v22.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v25.s[0] +sub v16.4s, v8.4s, v0.4s +mul v17.4S, v17.4S,v26.s[0] +add v8.4s, v8.4s, v0.4s +mla v11.4S, v3.4S, v31.s[0] +sub v3.4s, v15.4s, v10.4s +sqrdmulh v0.4S, v22.4S, v23.s[0] +add v15.4s, v15.4s, v10.4s +mla v13.4S, v18.4S, v31.s[0] +sub v18.4s, v2.4s, v11.4s +sqrdmulh v10.4S, v9.4S, v23.s[1] +add v2.4s, v2.4s, v11.4s +mla v6.4S, v19.4S, v31.s[0] +sub v19.4s, v14.4s, v13.4s +sqrdmulh v11.4S, v15.4S, v23.s[2] +add v14.4s, v14.4s, v13.4s +mla v17.4S, v1.4S, v31.s[0] +sub v1.4s, v12.4s, v6.4s +sqrdmulh v13.4S, v3.4S, v23.s[3] +add v12.4s, v12.4s, v6.4s +mul v22.4S, v22.4S,v24.s[0] +sub v6.4s, v20.4s, v17.4s +mul v9.4S, v9.4S,v24.s[1] +add v20.4s, v20.4s, v17.4s +mla v22.4S, v0.4S, v31.s[0] +str q18, [x0, #368] +mla v9.4S, v10.4S, v31.s[0] +str q2, [x0, #304] +mul v15.4S, v15.4S,v24.s[2] +str q19, [x0, #496] +mul v3.4S, v3.4S,v24.s[3] +str q14, [x0, #432] +mla v15.4S, v11.4S, v31.s[0] +str q1, [x0, #240] +mla v3.4S, v13.4S, v31.s[0] +str q12, [x0, #176] +ldr q12, [x0, 
#896] +sqrdmulh v13.4S, v12.4S, v29.s[0] +str q6, [x0, #112] +mul v12.4S, v12.4S,v30.s[0] +str q20, [x0, #48] +ldr q20, [x0, #960] +sqrdmulh v6.4S, v20.4S, v29.s[0] +sub v1.4s, v21.4s, v22.4s +str q1, [x0, #624] +mul v20.4S, v20.4S,v30.s[0] +add v21.4s, v21.4s, v22.4s +ldr q22, [x0, #768] +sqrdmulh v1.4S, v22.4S, v29.s[0] +sub v11.4s, v7.4s, v9.4s +str q21, [x0, #560] +mul v22.4S, v22.4S,v30.s[0] +add v7.4s, v7.4s, v9.4s +ldr q9, [x0, #832] +sqrdmulh v21.4S, v9.4S, v29.s[0] +sub v14.4s, v8.4s, v15.4s +str q11, [x0, #752] +mul v9.4S, v9.4S,v30.s[0] +add v8.4s, v8.4s, v15.4s +ldr q15, [x0, #512] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v3.4s +str q7, [x0, #688] +sqrdmulh v7.4S, v15.4S, v29.s[0] +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #576] +mla v20.4S, v6.4S, v31.s[0] +str q14, [x0, #880] +sqrdmulh v14.4S, v3.4S, v29.s[0] +ldr q6, [x0, #640] +mla v22.4S, v1.4S, v31.s[0] +str q8, [x0, #816] +sqrdmulh v8.4S, v6.4S, v29.s[0] +ldr q1, [x0, #704] +mla v9.4S, v21.4S, v31.s[0] +str q13, [x0, #1008] +sqrdmulh v13.4S, v1.4S, v29.s[0] +ldr q21, [x0, #384] +ldr q11, [x0, #448] +mul v15.4S, v15.4S,v30.s[0] +sub v19.4s, v21.4s, v12.4s +str q16, [x0, #944] +mul v3.4S, v3.4S,v30.s[0] +add v21.4s, v21.4s, v12.4s +ldr q12, [x0, #256] +ldr q16, [x0, #320] +mla v15.4S, v7.4S, v31.s[0] +sub v7.4s, v11.4s, v20.4s +mla v3.4S, v14.4S, v31.s[0] +add v11.4s, v11.4s, v20.4s +ldr q20, [x0, #0] +ldr q14, [x0, #64] +mul v6.4S, v6.4S,v30.s[0] +sub v2.4s, v12.4s, v22.4s +mul v1.4S, v1.4S,v30.s[0] +add v12.4s, v12.4s, v22.4s +ldr q22, [x0, #128] +ldr q10, [x0, #192] +mla v6.4S, v8.4S, v31.s[0] +sub v8.4s, v16.4s, v9.4s +mla v1.4S, v13.4S, v31.s[0] +add v16.4s, v16.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sqrdmulh v13.4S, v11.4S, v29.s[1] +sub v18.4s, v20.4s, v15.4s +mul v11.4S, v11.4S,v30.s[1] +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v12.4S, v29.s[1] +sub v0.4s, v14.4s, v3.4s +mul v12.4S, v12.4S,v30.s[1] +add v14.4s, v14.4s, v3.4s +sqrdmulh v3.4S, 
v16.4S, v29.s[1] +sub v17.4s, v22.4s, v6.4s +mul v16.4S, v16.4S,v30.s[1] +add v22.4s, v22.4s, v6.4s +mla v21.4S, v9.4S, v31.s[0] +sub v9.4s, v10.4s, v1.4s +sqrdmulh v6.4S, v19.4S, v29.s[2] +add v10.4s, v10.4s, v1.4s +mla v11.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v7.4S, v29.s[2] +mla v12.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v2.4S, v29.s[2] +mla v16.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v8.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v1.4s, v22.4s, v21.4s +mul v7.4S, v7.4S,v30.s[2] +add v22.4s, v22.4s, v21.4s +mla v19.4S, v6.4S, v31.s[0] +sub v6.4s, v10.4s, v11.4s +mla v7.4S, v13.4S, v31.s[0] +add v10.4s, v10.4s, v11.4s +mul v2.4S, v2.4S,v30.s[2] +sub v11.4s, v20.4s, v12.4s +mul v8.4S, v8.4S,v30.s[2] +add v20.4s, v20.4s, v12.4s +mla v2.4S, v15.4S, v31.s[0] +sub v15.4s, v14.4s, v16.4s +mla v8.4S, v3.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +sqrdmulh v16.4S, v1.4S, v27.s[1] +mul v1.4S, v1.4S,v28.s[1] +sqrdmulh v3.4S, v6.4S, v27.s[1] +sub v12.4s, v17.4s, v19.4s +mul v6.4S, v6.4S,v28.s[1] +add v17.4s, v17.4s, v19.4s +sqrdmulh v19.4S, v22.4S, v27.s[0] +sub v13.4s, v9.4s, v7.4s +mul v22.4S, v22.4S,v28.s[0] +add v9.4s, v9.4s, v7.4s +sqrdmulh v7.4S, v10.4S, v27.s[0] +sub v21.4s, v18.4s, v2.4s +mul v10.4S, v10.4S,v28.s[0] +add v18.4s, v18.4s, v2.4s +mla v1.4S, v16.4S, v31.s[0] +sub v16.4s, v0.4s, v8.4s +sqrdmulh v2.4S, v17.4S, v27.s[2] +add v0.4s, v0.4s, v8.4s +mla v6.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v9.4S, v27.s[2] +mla v22.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v12.4S, v27.s[3] +mla v10.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v13.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[2] +sub v8.4s, v11.4s, v1.4s +mul v9.4S, v9.4S,v28.s[2] +add v11.4s, v11.4s, v1.4s +mla v17.4S, v2.4S, v31.s[0] +sub v2.4s, v15.4s, v6.4s +mla v9.4S, v3.4S, v31.s[0] +add v15.4s, v15.4s, v6.4s +mul v12.4S, v12.4S,v28.s[3] +sub v6.4s, v20.4s, v22.4s +mul v13.4S, v13.4S,v28.s[3] +add v20.4s, v20.4s, v22.4s +mla v12.4S, v19.4S, v31.s[0] +sub v19.4s, v14.4s, v10.4s +mla v13.4S, v7.4S, v31.s[0] +add v14.4s, 
v14.4s, v10.4s +sqrdmulh v10.4S, v15.4S, v25.s[2] +mul v15.4S, v15.4S,v26.s[2] +sqrdmulh v7.4S, v2.4S, v25.s[3] +sub v22.4s, v18.4s, v17.4s +mul v2.4S, v2.4S,v26.s[3] +add v18.4s, v18.4s, v17.4s +sqrdmulh v17.4S, v19.4S, v25.s[1] +sub v3.4s, v0.4s, v9.4s +mul v19.4S, v19.4S,v26.s[1] +add v0.4s, v0.4s, v9.4s +sqrdmulh v9.4S, v14.4S, v25.s[0] +sub v1.4s, v21.4s, v12.4s +mul v14.4S, v14.4S,v26.s[0] +add v21.4s, v21.4s, v12.4s +mla v15.4S, v10.4S, v31.s[0] +sub v10.4s, v16.4s, v13.4s +sqrdmulh v12.4S, v0.4S, v23.s[0] +add v16.4s, v16.4s, v13.4s +mla v2.4S, v7.4S, v31.s[0] +sub v7.4s, v11.4s, v15.4s +sqrdmulh v13.4S, v3.4S, v23.s[1] +add v11.4s, v11.4s, v15.4s +mla v19.4S, v17.4S, v31.s[0] +sub v17.4s, v8.4s, v2.4s +sqrdmulh v15.4S, v16.4S, v23.s[2] +add v8.4s, v8.4s, v2.4s +mla v14.4S, v9.4S, v31.s[0] +sub v9.4s, v6.4s, v19.4s +sqrdmulh v2.4S, v10.4S, v23.s[3] +add v6.4s, v6.4s, v19.4s +mul v0.4S, v0.4S,v24.s[0] +sub v19.4s, v20.4s, v14.4s +mul v3.4S, v3.4S,v24.s[1] +add v20.4s, v20.4s, v14.4s +mla v0.4S, v12.4S, v31.s[0] +str q7, [x0, #320] +mla v3.4S, v13.4S, v31.s[0] +str q11, [x0, #256] +mul v16.4S, v16.4S,v24.s[2] +str q17, [x0, #448] +mul v10.4S, v10.4S,v24.s[3] +str q8, [x0, #384] +mla v16.4S, v15.4S, v31.s[0] +str q9, [x0, #192] +mla v10.4S, v2.4S, v31.s[0] +str q6, [x0, #128] +ldr q6, [x0, #912] +sqrdmulh v2.4S, v6.4S, v29.s[0] +str q19, [x0, #64] +mul v6.4S, v6.4S,v30.s[0] +str q20, [x0, #0] +ldr q20, [x0, #976] +sqrdmulh v19.4S, v20.4S, v29.s[0] +sub v9.4s, v18.4s, v0.4s +str q9, [x0, #576] +mul v20.4S, v20.4S,v30.s[0] +add v18.4s, v18.4s, v0.4s +ldr q0, [x0, #784] +sqrdmulh v9.4S, v0.4S, v29.s[0] +sub v15.4s, v22.4s, v3.4s +str q18, [x0, #512] +mul v0.4S, v0.4S,v30.s[0] +add v22.4s, v22.4s, v3.4s +ldr q3, [x0, #848] +sqrdmulh v18.4S, v3.4S, v29.s[0] +sub v8.4s, v21.4s, v16.4s +str q15, [x0, #704] +mul v3.4S, v3.4S,v30.s[0] +add v21.4s, v21.4s, v16.4s +ldr q16, [x0, #528] +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v1.4s, v10.4s +str q22, [x0, #640] +sqrdmulh 
v22.4S, v16.4S, v29.s[0] +add v1.4s, v1.4s, v10.4s +ldr q10, [x0, #592] +mla v20.4S, v19.4S, v31.s[0] +str q8, [x0, #832] +sqrdmulh v8.4S, v10.4S, v29.s[0] +ldr q19, [x0, #656] +mla v0.4S, v9.4S, v31.s[0] +str q21, [x0, #768] +sqrdmulh v21.4S, v19.4S, v29.s[0] +ldr q9, [x0, #720] +mla v3.4S, v18.4S, v31.s[0] +str q2, [x0, #960] +sqrdmulh v2.4S, v9.4S, v29.s[0] +ldr q18, [x0, #400] +ldr q15, [x0, #464] +mul v16.4S, v16.4S,v30.s[0] +sub v17.4s, v18.4s, v6.4s +str q1, [x0, #896] +mul v10.4S, v10.4S,v30.s[0] +add v18.4s, v18.4s, v6.4s +ldr q6, [x0, #272] +ldr q1, [x0, #336] +mla v16.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v20.4s +mla v10.4S, v8.4S, v31.s[0] +add v15.4s, v15.4s, v20.4s +ldr q20, [x0, #16] +ldr q8, [x0, #80] +mul v19.4S, v19.4S,v30.s[0] +sub v11.4s, v6.4s, v0.4s +mul v9.4S, v9.4S,v30.s[0] +add v6.4s, v6.4s, v0.4s +ldr q0, [x0, #144] +ldr q13, [x0, #208] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v1.4s, v3.4s +mla v9.4S, v2.4S, v31.s[0] +add v1.4s, v1.4s, v3.4s +sqrdmulh v3.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sqrdmulh v2.4S, v15.4S, v29.s[1] +sub v7.4s, v20.4s, v16.4s +mul v15.4S, v15.4S,v30.s[1] +add v20.4s, v20.4s, v16.4s +sqrdmulh v16.4S, v6.4S, v29.s[1] +sub v12.4s, v8.4s, v10.4s +mul v6.4S, v6.4S,v30.s[1] +add v8.4s, v8.4s, v10.4s +sqrdmulh v10.4S, v1.4S, v29.s[1] +sub v14.4s, v0.4s, v19.4s +mul v1.4S, v1.4S,v30.s[1] +add v0.4s, v0.4s, v19.4s +mla v18.4S, v3.4S, v31.s[0] +sub v3.4s, v13.4s, v9.4s +sqrdmulh v19.4S, v17.4S, v29.s[2] +add v13.4s, v13.4s, v9.4s +mla v15.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v22.4S, v29.s[2] +mla v6.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v11.4S, v29.s[2] +mla v1.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v21.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v9.4s, v0.4s, v18.4s +mul v22.4S, v22.4S,v30.s[2] +add v0.4s, v0.4s, v18.4s +mla v17.4S, v19.4S, v31.s[0] +sub v19.4s, v13.4s, v15.4s +mla v22.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v15.4s +mul v11.4S, v11.4S,v30.s[2] +sub v15.4s, v20.4s, v6.4s +mul v21.4S, 
v21.4S,v30.s[2] +add v20.4s, v20.4s, v6.4s +mla v11.4S, v16.4S, v31.s[0] +sub v16.4s, v8.4s, v1.4s +mla v21.4S, v10.4S, v31.s[0] +add v8.4s, v8.4s, v1.4s +sqrdmulh v29.4S, v9.4S, v27.s[1] +mul v9.4S, v9.4S,v28.s[1] +sqrdmulh v30.4S, v19.4S, v27.s[1] +sub v1.4s, v14.4s, v17.4s +mul v19.4S, v19.4S,v28.s[1] +add v14.4s, v14.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[0] +sub v10.4s, v3.4s, v22.4s +mul v0.4S, v0.4S,v28.s[0] +add v3.4s, v3.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v27.s[0] +sub v6.4s, v7.4s, v11.4s +mul v13.4S, v13.4S,v28.s[0] +add v7.4s, v7.4s, v11.4s +mla v9.4S, v29.4S, v31.s[0] +sub v29.4s, v12.4s, v21.4s +sqrdmulh v11.4S, v14.4S, v27.s[2] +add v12.4s, v12.4s, v21.4s +mla v19.4S, v30.4S, v31.s[0] +sqrdmulh v30.4S, v3.4S, v27.s[2] +mla v0.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v1.4S, v27.s[3] +mla v13.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v10.4S, v27.s[3] +mul v14.4S, v14.4S,v28.s[2] +sub v21.4s, v15.4s, v9.4s +mul v3.4S, v3.4S,v28.s[2] +add v15.4s, v15.4s, v9.4s +mla v14.4S, v11.4S, v31.s[0] +sub v11.4s, v16.4s, v19.4s +mla v3.4S, v30.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +mul v1.4S, v1.4S,v28.s[3] +sub v19.4s, v20.4s, v0.4s +mul v10.4S, v10.4S,v28.s[3] +add v20.4s, v20.4s, v0.4s +mla v1.4S, v17.4S, v31.s[0] +sub v17.4s, v8.4s, v13.4s +mla v10.4S, v22.4S, v31.s[0] +add v8.4s, v8.4s, v13.4s +sqrdmulh v27.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sqrdmulh v28.4S, v11.4S, v25.s[3] +sub v13.4s, v7.4s, v14.4s +mul v11.4S, v11.4S,v26.s[3] +add v7.4s, v7.4s, v14.4s +sqrdmulh v14.4S, v17.4S, v25.s[1] +sub v22.4s, v12.4s, v3.4s +mul v17.4S, v17.4S,v26.s[1] +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v8.4S, v25.s[0] +sub v0.4s, v6.4s, v1.4s +mul v8.4S, v8.4S,v26.s[0] +add v6.4s, v6.4s, v1.4s +mla v16.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v10.4s +sqrdmulh v25.4S, v12.4S, v23.s[0] +add v29.4s, v29.4s, v10.4s +mla v11.4S, v28.4S, v31.s[0] +sub v28.4s, v15.4s, v16.4s +sqrdmulh v10.4S, v22.4S, v23.s[1] +add v15.4s, v15.4s, v16.4s +mla v17.4S, v14.4S, 
v31.s[0] +sub v14.4s, v21.4s, v11.4s +sqrdmulh v16.4S, v29.4S, v23.s[2] +add v21.4s, v21.4s, v11.4s +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v19.4s, v17.4s +sqrdmulh v11.4S, v27.4S, v23.s[3] +add v19.4s, v19.4s, v17.4s +mul v12.4S, v12.4S,v24.s[0] +sub v17.4s, v20.4s, v8.4s +mul v22.4S, v22.4S,v24.s[1] +add v20.4s, v20.4s, v8.4s +mla v12.4S, v25.4S, v31.s[0] +str q28, [x0, #336] +mla v22.4S, v10.4S, v31.s[0] +str q15, [x0, #272] +mul v29.4S, v29.4S,v24.s[2] +str q14, [x0, #464] +mul v27.4S, v27.4S,v24.s[3] +str q21, [x0, #400] +mla v29.4S, v16.4S, v31.s[0] +str q3, [x0, #208] +mla v27.4S, v11.4S, v31.s[0] +str q19, [x0, #144] +str q17, [x0, #80] +str q20, [x0, #16] +sub v20.4s, v7.4s, v12.4s +str q20, [x0, #592] +add v7.4s, v7.4s, v12.4s +sub v12.4s, v13.4s, v22.4s +str q7, [x0, #528] +add v13.4s, v13.4s, v22.4s +sub v22.4s, v6.4s, v29.4s +str q12, [x0, #720] +add v6.4s, v6.4s, v29.4s +sub v29.4s, v0.4s, v27.4s +str q13, [x0, #656] +add v0.4s, v0.4s, v27.4s +str q22, [x0, #848] +str q6, [x0, #784] +str q29, [x0, #976] +str q0, [x0, #912] +ldr q4, [x0, #224] +ldr q5, [x0, #160] +ldr q18, [x0, #32] +ldr q2, [x17, #+128] +ldr q9, [x17, #+144] +sqrdmulh v30.4S, v18.4S, v9.s[0] +mul v18.4S, v18.4S,v2.s[0] +ldr q1, [x0, #48] +sqrdmulh v26.4S, v1.4S, v9.s[0] +mul v1.4S, v1.4S,v2.s[0] +ldr q8, [x17, #+160] +ldr q25, [x17, #+176] +ldr q28, [x0, #96] +sqrdmulh v10.4S, v28.4S, v25.s[0] +mul v28.4S, v28.4S,v8.s[0] +ldr q15, [x0, #112] +sqrdmulh v14.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v8.s[0] +ldr q21, [x17, #+192] +ldr q16, [x17, #+208] +mla v18.4S, v30.4S, v31.s[0] +sqrdmulh v30.4S, v5.4S, v16.s[0] +ldr q3, [x0, #176] +mla v1.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v3.4S, v16.s[0] +ldr q11, [x17, #+224] +ldr q19, [x17, #+240] +mla v28.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v4.4S, v19.s[0] +ldr q24, [x0, #240] +mla v15.4S, v14.4S, v31.s[0] +sqrdmulh v14.4S, v24.4S, v19.s[0] +ldr q23, [x0, #0] +ldr q17, [x0, #128] +mul v5.4S, v5.4S,v21.s[0] +sub v20.4s, v23.4s, v18.4s +ldr 
q7, [x0, #16] +mul v3.4S, v3.4S,v21.s[0] +add v23.4s, v23.4s, v18.4s +ldr q18, [x0, #144] +mla v5.4S, v30.4S, v31.s[0] +sub v30.4s, v7.4s, v1.4s +ldr q12, [x0, #64] +mla v3.4S, v26.4S, v31.s[0] +add v7.4s, v7.4s, v1.4s +ldr q1, [x0, #192] +mul v4.4S, v4.4S,v11.s[0] +sub v26.4s, v12.4s, v28.4s +ldr q13, [x0, #80] +mul v24.4S, v24.4S,v11.s[0] +add v12.4s, v12.4s, v28.4s +ldr q28, [x0, #208] +mla v4.4S, v10.4S, v31.s[0] +mla v24.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v15.4s +sqrdmulh v10.4S, v7.4S, v9.s[1] +add v13.4s, v13.4s, v15.4s +mul v7.4S, v7.4S,v2.s[1] +sqrdmulh v15.4S, v30.4S, v9.s[2] +sub v27.4s, v17.4s, v5.4s +mul v30.4S, v30.4S,v2.s[2] +add v17.4s, v17.4s, v5.4s +sqrdmulh v9.4S, v13.4S, v25.s[1] +sub v2.4s, v18.4s, v3.4s +mul v13.4S, v13.4S,v8.s[1] +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v25.s[2] +sub v5.4s, v1.4s, v4.4s +mul v14.4S, v14.4S,v8.s[2] +add v1.4s, v1.4s, v4.4s +mla v7.4S, v10.4S, v31.s[0] +sub v10.4s, v28.4s, v24.4s +ldr q25, [x0, #480] +sqrdmulh v8.4S, v18.4S, v16.s[1] +add v28.4s, v28.4s, v24.4s +mla v30.4S, v15.4S, v31.s[0] +ldr q15, [x0, #416] +sqrdmulh v24.4S, v2.4S, v16.s[2] +sub v4.4s, v23.4s, v7.4s +mla v13.4S, v9.4S, v31.s[0] +ldr q9, [x0, #288] +sqrdmulh v22.4S, v28.4S, v19.s[1] +add v23.4s, v23.4s, v7.4s +str q4, [x0, #16] +mla v14.4S, v3.4S, v31.s[0] +ldr q3, [x17, #+256] +ldr q4, [x17, #+272] +sqrdmulh v7.4S, v10.4S, v19.s[2] +sub v6.4s, v20.4s, v30.4s +str q23, [x0, #0] +mul v18.4S, v18.4S,v21.s[1] +add v20.4s, v20.4s, v30.4s +mul v2.4S, v2.4S,v21.s[2] +str q6, [x0, #48] +mla v18.4S, v8.4S, v31.s[0] +sub v8.4s, v12.4s, v13.4s +mla v2.4S, v24.4S, v31.s[0] +str q20, [x0, #32] +mul v28.4S, v28.4S,v11.s[1] +str q8, [x0, #80] +mul v10.4S, v10.4S,v11.s[2] +add v12.4s, v12.4s, v13.4s +str q12, [x0, #64] +mla v28.4S, v22.4S, v31.s[0] +sub v22.4s, v26.4s, v14.4s +str q22, [x0, #112] +mla v10.4S, v7.4S, v31.s[0] +add v26.4s, v26.4s, v14.4s +str q26, [x0, #96] +sqrdmulh v19.4S, v9.4S, v4.s[0] +sub v11.4s, v17.4s, v18.4s +mul 
v9.4S, v9.4S,v3.s[0] +str q11, [x0, #144] +ldr q11, [x0, #304] +sqrdmulh v26.4S, v11.4S, v4.s[0] +add v17.4s, v17.4s, v18.4s +mul v11.4S, v11.4S,v3.s[0] +str q17, [x0, #128] +ldr q17, [x17, #+288] +ldr q18, [x17, #+304] +ldr q14, [x0, #352] +sqrdmulh v7.4S, v14.4S, v18.s[0] +sub v22.4s, v27.4s, v2.4s +mul v14.4S, v14.4S,v17.s[0] +str q22, [x0, #176] +ldr q22, [x0, #368] +sqrdmulh v12.4S, v22.4S, v18.s[0] +add v27.4s, v27.4s, v2.4s +mul v22.4S, v22.4S,v17.s[0] +str q27, [x0, #160] +ldr q27, [x17, #+320] +ldr q2, [x17, #+336] +mla v9.4S, v19.4S, v31.s[0] +sub v19.4s, v1.4s, v28.4s +sqrdmulh v13.4S, v15.4S, v2.s[0] +str q19, [x0, #208] +ldr q19, [x0, #432] +mla v11.4S, v26.4S, v31.s[0] +add v1.4s, v1.4s, v28.4s +sqrdmulh v28.4S, v19.4S, v2.s[0] +str q1, [x0, #192] +ldr q1, [x17, #+352] +ldr q26, [x17, #+368] +mla v14.4S, v7.4S, v31.s[0] +sub v7.4s, v5.4s, v10.4s +sqrdmulh v8.4S, v25.4S, v26.s[0] +str q7, [x0, #240] +ldr q7, [x0, #496] +mla v22.4S, v12.4S, v31.s[0] +add v5.4s, v5.4s, v10.4s +sqrdmulh v10.4S, v7.4S, v26.s[0] +str q5, [x0, #224] +ldr q5, [x0, #256] +ldr q12, [x0, #384] +mul v15.4S, v15.4S,v27.s[0] +sub v16.4s, v5.4s, v9.4s +ldr q21, [x0, #272] +mul v19.4S, v19.4S,v27.s[0] +add v5.4s, v5.4s, v9.4s +ldr q9, [x0, #400] +mla v15.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v11.4s +ldr q20, [x0, #320] +mla v19.4S, v28.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +ldr q11, [x0, #448] +mul v25.4S, v25.4S,v1.s[0] +sub v28.4s, v20.4s, v14.4s +ldr q24, [x0, #336] +mul v7.4S, v7.4S,v1.s[0] +add v20.4s, v20.4s, v14.4s +ldr q14, [x0, #464] +mla v25.4S, v8.4S, v31.s[0] +mla v7.4S, v10.4S, v31.s[0] +sub v10.4s, v24.4s, v22.4s +sqrdmulh v8.4S, v21.4S, v4.s[1] +add v24.4s, v24.4s, v22.4s +mul v21.4S, v21.4S,v3.s[1] +sqrdmulh v22.4S, v13.4S, v4.s[2] +sub v6.4s, v12.4s, v15.4s +mul v13.4S, v13.4S,v3.s[2] +add v12.4s, v12.4s, v15.4s +sqrdmulh v4.4S, v24.4S, v18.s[1] +sub v3.4s, v9.4s, v19.4s +mul v24.4S, v24.4S,v17.s[1] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v10.4S, v18.s[2] 
+sub v15.4s, v11.4s, v25.4s +mul v10.4S, v10.4S,v17.s[2] +add v11.4s, v11.4s, v25.4s +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v14.4s, v7.4s +ldr q18, [x0, #736] +sqrdmulh v17.4S, v9.4S, v2.s[1] +add v14.4s, v14.4s, v7.4s +mla v13.4S, v22.4S, v31.s[0] +ldr q22, [x0, #672] +sqrdmulh v7.4S, v3.4S, v2.s[2] +sub v25.4s, v5.4s, v21.4s +mla v24.4S, v4.4S, v31.s[0] +ldr q4, [x0, #544] +sqrdmulh v30.4S, v14.4S, v26.s[1] +add v5.4s, v5.4s, v21.4s +str q25, [x0, #272] +mla v10.4S, v19.4S, v31.s[0] +ldr q19, [x17, #+384] +ldr q25, [x17, #+400] +sqrdmulh v21.4S, v8.4S, v26.s[2] +sub v23.4s, v16.4s, v13.4s +str q5, [x0, #256] +mul v9.4S, v9.4S,v27.s[1] +add v16.4s, v16.4s, v13.4s +mul v3.4S, v3.4S,v27.s[2] +str q23, [x0, #304] +mla v9.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v24.4s +mla v3.4S, v7.4S, v31.s[0] +str q16, [x0, #288] +mul v14.4S, v14.4S,v1.s[1] +str q17, [x0, #336] +mul v8.4S, v8.4S,v1.s[2] +add v20.4s, v20.4s, v24.4s +str q20, [x0, #320] +mla v14.4S, v30.4S, v31.s[0] +sub v30.4s, v28.4s, v10.4s +str q30, [x0, #368] +mla v8.4S, v21.4S, v31.s[0] +add v28.4s, v28.4s, v10.4s +str q28, [x0, #352] +sqrdmulh v26.4S, v4.4S, v25.s[0] +sub v1.4s, v12.4s, v9.4s +mul v4.4S, v4.4S,v19.s[0] +str q1, [x0, #400] +ldr q1, [x0, #560] +sqrdmulh v28.4S, v1.4S, v25.s[0] +add v12.4s, v12.4s, v9.4s +mul v1.4S, v1.4S,v19.s[0] +str q12, [x0, #384] +ldr q12, [x17, #+416] +ldr q9, [x17, #+432] +ldr q10, [x0, #608] +sqrdmulh v21.4S, v10.4S, v9.s[0] +sub v30.4s, v6.4s, v3.4s +mul v10.4S, v10.4S,v12.s[0] +str q30, [x0, #432] +ldr q30, [x0, #624] +sqrdmulh v20.4S, v30.4S, v9.s[0] +add v6.4s, v6.4s, v3.4s +mul v30.4S, v30.4S,v12.s[0] +str q6, [x0, #416] +ldr q6, [x17, #+448] +ldr q3, [x17, #+464] +mla v4.4S, v26.4S, v31.s[0] +sub v26.4s, v11.4s, v14.4s +sqrdmulh v24.4S, v22.4S, v3.s[0] +str q26, [x0, #464] +ldr q26, [x0, #688] +mla v1.4S, v28.4S, v31.s[0] +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v26.4S, v3.s[0] +str q11, [x0, #448] +ldr q11, [x17, #+480] +ldr q28, [x17, #+496] +mla v10.4S, 
v21.4S, v31.s[0] +sub v21.4s, v15.4s, v8.4s +sqrdmulh v17.4S, v18.4S, v28.s[0] +str q21, [x0, #496] +ldr q21, [x0, #752] +mla v30.4S, v20.4S, v31.s[0] +add v15.4s, v15.4s, v8.4s +sqrdmulh v8.4S, v21.4S, v28.s[0] +str q15, [x0, #480] +ldr q15, [x0, #512] +ldr q20, [x0, #640] +mul v22.4S, v22.4S,v6.s[0] +sub v2.4s, v15.4s, v4.4s +ldr q27, [x0, #528] +mul v26.4S, v26.4S,v6.s[0] +add v15.4s, v15.4s, v4.4s +ldr q4, [x0, #656] +mla v22.4S, v24.4S, v31.s[0] +sub v24.4s, v27.4s, v1.4s +ldr q16, [x0, #576] +mla v26.4S, v14.4S, v31.s[0] +add v27.4s, v27.4s, v1.4s +ldr q1, [x0, #704] +mul v18.4S, v18.4S,v11.s[0] +sub v14.4s, v16.4s, v10.4s +ldr q7, [x0, #592] +mul v21.4S, v21.4S,v11.s[0] +add v16.4s, v16.4s, v10.4s +ldr q10, [x0, #720] +mla v18.4S, v17.4S, v31.s[0] +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v7.4s, v30.4s +sqrdmulh v17.4S, v27.4S, v25.s[1] +add v7.4s, v7.4s, v30.4s +mul v27.4S, v27.4S,v19.s[1] +sqrdmulh v30.4S, v24.4S, v25.s[2] +sub v23.4s, v20.4s, v22.4s +mul v24.4S, v24.4S,v19.s[2] +add v20.4s, v20.4s, v22.4s +sqrdmulh v25.4S, v7.4S, v9.s[1] +sub v19.4s, v4.4s, v26.4s +mul v7.4S, v7.4S,v12.s[1] +add v4.4s, v4.4s, v26.4s +sqrdmulh v26.4S, v8.4S, v9.s[2] +sub v22.4s, v1.4s, v18.4s +mul v8.4S, v8.4S,v12.s[2] +add v1.4s, v1.4s, v18.4s +mla v27.4S, v17.4S, v31.s[0] +sub v17.4s, v10.4s, v21.4s +ldr q9, [x0, #992] +sqrdmulh v12.4S, v4.4S, v3.s[1] +add v10.4s, v10.4s, v21.4s +mla v24.4S, v30.4S, v31.s[0] +ldr q30, [x0, #928] +sqrdmulh v21.4S, v19.4S, v3.s[2] +sub v18.4s, v15.4s, v27.4s +mla v7.4S, v25.4S, v31.s[0] +ldr q25, [x0, #800] +sqrdmulh v13.4S, v10.4S, v28.s[1] +add v15.4s, v15.4s, v27.4s +str q18, [x0, #528] +mla v8.4S, v26.4S, v31.s[0] +ldr q26, [x17, #+512] +ldr q18, [x17, #+528] +sqrdmulh v27.4S, v17.4S, v28.s[2] +sub v5.4s, v2.4s, v24.4s +str q15, [x0, #512] +mul v4.4S, v4.4S,v6.s[1] +add v2.4s, v2.4s, v24.4s +mul v19.4S, v19.4S,v6.s[2] +str q5, [x0, #560] +mla v4.4S, v12.4S, v31.s[0] +sub v12.4s, v16.4s, v7.4s +mla v19.4S, v21.4S, v31.s[0] +str q2, [x0, 
#544] +mul v10.4S, v10.4S,v11.s[1] +str q12, [x0, #592] +mul v17.4S, v17.4S,v11.s[2] +add v16.4s, v16.4s, v7.4s +str q16, [x0, #576] +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v14.4s, v8.4s +str q13, [x0, #624] +mla v17.4S, v27.4S, v31.s[0] +add v14.4s, v14.4s, v8.4s +str q14, [x0, #608] +sqrdmulh v28.4S, v25.4S, v18.s[0] +sub v11.4s, v20.4s, v4.4s +mul v25.4S, v25.4S,v26.s[0] +str q11, [x0, #656] +ldr q11, [x0, #816] +sqrdmulh v14.4S, v11.4S, v18.s[0] +add v20.4s, v20.4s, v4.4s +mul v11.4S, v11.4S,v26.s[0] +str q20, [x0, #640] +ldr q20, [x17, #+544] +ldr q4, [x17, #+560] +ldr q8, [x0, #864] +sqrdmulh v27.4S, v8.4S, v4.s[0] +sub v13.4s, v23.4s, v19.4s +mul v8.4S, v8.4S,v20.s[0] +str q13, [x0, #688] +ldr q13, [x0, #880] +sqrdmulh v16.4S, v13.4S, v4.s[0] +add v23.4s, v23.4s, v19.4s +mul v13.4S, v13.4S,v20.s[0] +str q23, [x0, #672] +ldr q23, [x17, #+576] +ldr q19, [x17, #+592] +mla v25.4S, v28.4S, v31.s[0] +sub v28.4s, v1.4s, v10.4s +sqrdmulh v7.4S, v30.4S, v19.s[0] +str q28, [x0, #720] +ldr q28, [x0, #944] +mla v11.4S, v14.4S, v31.s[0] +add v1.4s, v1.4s, v10.4s +sqrdmulh v10.4S, v28.4S, v19.s[0] +str q1, [x0, #704] +ldr q1, [x17, #+608] +ldr q14, [x17, #+624] +mla v8.4S, v27.4S, v31.s[0] +sub v27.4s, v22.4s, v17.4s +sqrdmulh v12.4S, v9.4S, v14.s[0] +str q27, [x0, #752] +ldr q27, [x0, #1008] +mla v13.4S, v16.4S, v31.s[0] +add v22.4s, v22.4s, v17.4s +sqrdmulh v17.4S, v27.4S, v14.s[0] +str q22, [x0, #736] +ldr q22, [x0, #768] +ldr q16, [x0, #896] +mul v30.4S, v30.4S,v23.s[0] +sub v3.4s, v22.4s, v25.4s +ldr q6, [x0, #784] +mul v28.4S, v28.4S,v23.s[0] +add v22.4s, v22.4s, v25.4s +ldr q25, [x0, #912] +mla v30.4S, v7.4S, v31.s[0] +sub v7.4s, v6.4s, v11.4s +ldr q2, [x0, #832] +mla v28.4S, v10.4S, v31.s[0] +add v6.4s, v6.4s, v11.4s +ldr q11, [x0, #960] +mul v9.4S, v9.4S,v1.s[0] +sub v10.4s, v2.4s, v8.4s +ldr q21, [x0, #848] +mul v27.4S, v27.4S,v1.s[0] +add v2.4s, v2.4s, v8.4s +ldr q8, [x0, #976] +mla v9.4S, v12.4S, v31.s[0] +mla v27.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, 
v13.4s +sqrdmulh v12.4S, v6.4S, v18.s[1] +add v21.4s, v21.4s, v13.4s +mul v6.4S, v6.4S,v26.s[1] +sqrdmulh v13.4S, v7.4S, v18.s[2] +sub v5.4s, v16.4s, v30.4s +mul v7.4S, v7.4S,v26.s[2] +add v16.4s, v16.4s, v30.4s +sqrdmulh v18.4S, v21.4S, v4.s[1] +sub v26.4s, v25.4s, v28.4s +mul v21.4S, v21.4S,v20.s[1] +add v25.4s, v25.4s, v28.4s +sqrdmulh v28.4S, v17.4S, v4.s[2] +sub v30.4s, v11.4s, v9.4s +mul v17.4S, v17.4S,v20.s[2] +add v11.4s, v11.4s, v9.4s +mla v6.4S, v12.4S, v31.s[0] +sub v12.4s, v8.4s, v27.4s +sqrdmulh v4.4S, v25.4S, v19.s[1] +add v8.4s, v8.4s, v27.4s +mla v7.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v26.4S, v19.s[2] +sub v27.4s, v22.4s, v6.4s +mla v21.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v8.4S, v14.s[1] +add v22.4s, v22.4s, v6.4s +str q27, [x0, #784] +mla v17.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v12.4S, v14.s[2] +sub v27.4s, v3.4s, v7.4s +str q22, [x0, #768] +mul v25.4S, v25.4S,v23.s[1] +add v3.4s, v3.4s, v7.4s +mul v26.4S, v26.4S,v23.s[2] +str q27, [x0, #816] +mla v25.4S, v4.4S, v31.s[0] +sub v4.4s, v2.4s, v21.4s +mla v26.4S, v13.4S, v31.s[0] +str q3, [x0, #800] +mul v8.4S, v8.4S,v1.s[1] +str q4, [x0, #848] +mul v12.4S, v12.4S,v1.s[2] +add v2.4s, v2.4s, v21.4s +str q2, [x0, #832] +mla v8.4S, v18.4S, v31.s[0] +sub v18.4s, v10.4s, v17.4s +str q18, [x0, #880] +mla v12.4S, v28.4S, v31.s[0] +add v10.4s, v10.4s, v17.4s +str q10, [x0, #864] +sub v14.4s, v16.4s, v25.4s +str q14, [x0, #912] +add v16.4s, v16.4s, v25.4s +str q16, [x0, #896] +sub v16.4s, v5.4s, v26.4s +str q16, [x0, #944] +add v5.4s, v5.4s, v26.4s +str q5, [x0, #928] +sub v5.4s, v11.4s, v8.4s +str q5, [x0, #976] +add v11.4s, v11.4s, v8.4s +str q11, [x0, #960] +sub v11.4s, v30.4s, v12.4s +str q11, [x0, #1008] +add v30.4s, v30.4s, v12.4s +str q30, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, 
[sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_14_z4_7.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_14_z4_7.s new file mode 100644 index 0000000..2cf09d4 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_14_z4_7.s @@ -0,0 +1,1578 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include // NOTE(review): the include operand is missing in this copy (an angle-bracketed header name appears to have been stripped); ASM_LOAD used by the routine below is not defined in this file, so it presumably comes from that header - restore from upstream +modulus: // 4-word vector, only lane 0 is nonzero; the routine below loads it into v31 and uses v31.s[0] as the by-element operand of its mla steps +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 // 2^6 = 64-byte alignment for the table that follows (.align takes a power of two on AArch64) +roots_merged: // constant table read through x17 in 16-byte vectors; rows come in groups of eight words: four operands consumed by mul, then the four matching operands consumed by sqrdmulh; rows marked Layer None hold 0 +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global
ntt_u32_incomplete_neon_asm_var_4_2_14_z4_7 +.global _ntt_u32_incomplete_neon_asm_var_4_2_14_z4_7 +ntt_u32_incomplete_neon_asm_var_4_2_14_z4_7: +_ntt_u32_incomplete_neon_asm_var_4_2_14_z4_7: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x0, #928] +ldr q29, [x17, #+0] +ldr q28, [x17, #+16] +sqrdmulh v27.4S, v30.4S, v28.s[0] +mul v30.4S, v30.4S,v29.s[0] +ldr q26, [x0, #992] +sqrdmulh v25.4S, v26.4S, v28.s[0] +mul v26.4S, v26.4S,v29.s[0] +ldr q24, [x0, #800] +sqrdmulh v23.4S, v24.4S, v28.s[0] +mul v24.4S, v24.4S,v29.s[0] +ldr q22, [x0, #864] +sqrdmulh v21.4S, v22.4S, v28.s[0] +mul v22.4S, v22.4S,v29.s[0] +ldr q20, [x0, #544] +mla v30.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v20.4S, v28.s[0] +ldr q19, [x0, #608] +mla v26.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v19.4S, v28.s[0] +nop +ldr q18, [x0, #672] +mla v24.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v18.4S, v28.s[0] +nop +ldr q17, [x0, #736] +mla v22.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v17.4S, v28.s[0] +nop +ldr q16, [x0, #416] +ldr q3, [x0, #480] +mul v20.4S, v20.4S,v29.s[0] +sub v2.4s, v16.4s, v30.4s +mul v19.4S, v19.4S,v29.s[0] +add v16.4s, v16.4s, v30.4s +ldr q30, [x0, #288] +ldr q1, [x0, #352] +mla v20.4S, v27.4S, v31.s[0] +sub v27.4s, v3.4s, v26.4s +mla v19.4S, v25.4S, v31.s[0] +add v3.4s, v3.4s, v26.4s +ldr q26, [x0, #32] +ldr q25, [x0, #96] +mul v18.4S, v18.4S,v29.s[0] +sub v0.4s, v30.4s, v24.4s +mul v17.4S, v17.4S,v29.s[0] +add v30.4s, v30.4s, v24.4s +ldr q24, [x0, #160] +ldr q15, [x0, #224] +mla v18.4S, v23.4S, v31.s[0] +sub v23.4s, v1.4s, v22.4s +mla v17.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v22.4s 
+sqrdmulh v22.4S, v16.4S, v28.s[1] +nop +mul v16.4S, v16.4S,v29.s[1] +nop +sqrdmulh v21.4S, v3.4S, v28.s[1] +sub v14.4s, v26.4s, v20.4s +mul v3.4S, v3.4S,v29.s[1] +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v30.4S, v28.s[1] +sub v13.4s, v25.4s, v19.4s +mul v30.4S, v30.4S,v29.s[1] +add v25.4s, v25.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v28.s[1] +sub v12.4s, v24.4s, v18.4s +mul v1.4S, v1.4S,v29.s[1] +add v24.4s, v24.4s, v18.4s +mla v16.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v17.4s +sqrdmulh v18.4S, v2.4S, v28.s[2] +add v15.4s, v15.4s, v17.4s +mla v3.4S, v21.4S, v31.s[0] +nop +sqrdmulh v21.4S, v27.4S, v28.s[2] +nop +mla v30.4S, v20.4S, v31.s[0] +nop +sqrdmulh v20.4S, v0.4S, v28.s[2] +nop +mla v1.4S, v19.4S, v31.s[0] +nop +sqrdmulh v19.4S, v23.4S, v28.s[2] +nop +ldr q17, [x17, #+32] +ldr q11, [x17, #+48] +mul v2.4S, v2.4S,v29.s[2] +sub v10.4s, v24.4s, v16.4s +mul v27.4S, v27.4S,v29.s[2] +add v24.4s, v24.4s, v16.4s +mla v2.4S, v18.4S, v31.s[0] +sub v18.4s, v15.4s, v3.4s +mla v27.4S, v21.4S, v31.s[0] +add v15.4s, v15.4s, v3.4s +mul v0.4S, v0.4S,v29.s[2] +sub v3.4s, v26.4s, v30.4s +mul v23.4S, v23.4S,v29.s[2] +add v26.4s, v26.4s, v30.4s +mla v0.4S, v20.4S, v31.s[0] +sub v20.4s, v25.4s, v1.4s +mla v23.4S, v19.4S, v31.s[0] +add v25.4s, v25.4s, v1.4s +sqrdmulh v1.4S, v10.4S, v11.s[1] +nop +mul v10.4S, v10.4S,v17.s[1] +nop +sqrdmulh v19.4S, v18.4S, v11.s[1] +sub v30.4s, v12.4s, v2.4s +mul v18.4S, v18.4S,v17.s[1] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v24.4S, v11.s[0] +sub v21.4s, v22.4s, v27.4s +mul v24.4S, v24.4S,v17.s[0] +add v22.4s, v22.4s, v27.4s +sqrdmulh v27.4S, v15.4S, v11.s[0] +sub v16.4s, v14.4s, v0.4s +mul v15.4S, v15.4S,v17.s[0] +add v14.4s, v14.4s, v0.4s +ldr q0, [x17, #+64] +ldr q9, [x17, #+80] +mla v10.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v23.4s +sqrdmulh v8.4S, v12.4S, v11.s[2] +add v13.4s, v13.4s, v23.4s +mla v18.4S, v19.4S, v31.s[0] +nop +sqrdmulh v19.4S, v22.4S, v11.s[2] +nop +mla v24.4S, v2.4S, v31.s[0] +nop +sqrdmulh v2.4S, v30.4S, v11.s[3] +nop 
+mla v15.4S, v27.4S, v31.s[0] +nop +sqrdmulh v27.4S, v21.4S, v11.s[3] +nop +ldr q23, [x17, #+96] +ldr q7, [x17, #+112] +mul v12.4S, v12.4S,v17.s[2] +sub v6.4s, v3.4s, v10.4s +mul v22.4S, v22.4S,v17.s[2] +add v3.4s, v3.4s, v10.4s +mla v12.4S, v8.4S, v31.s[0] +sub v8.4s, v20.4s, v18.4s +mla v22.4S, v19.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +mul v30.4S, v30.4S,v17.s[3] +sub v18.4s, v26.4s, v24.4s +mul v21.4S, v21.4S,v17.s[3] +add v26.4s, v26.4s, v24.4s +mla v30.4S, v2.4S, v31.s[0] +sub v2.4s, v25.4s, v15.4s +mla v21.4S, v27.4S, v31.s[0] +add v25.4s, v25.4s, v15.4s +sqrdmulh v15.4S, v8.4S, v9.s[3] +nop +mul v8.4S, v8.4S,v0.s[3] +nop +sqrdmulh v27.4S, v20.4S, v9.s[2] +sub v24.4s, v14.4s, v12.4s +mul v20.4S, v20.4S,v0.s[2] +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v9.s[1] +sub v19.4s, v13.4s, v22.4s +mul v2.4S, v2.4S,v0.s[1] +add v13.4s, v13.4s, v22.4s +sqrdmulh v22.4S, v25.4S, v9.s[0] +sub v10.4s, v16.4s, v30.4s +mul v25.4S, v25.4S,v0.s[0] +add v16.4s, v16.4s, v30.4s +mla v8.4S, v15.4S, v31.s[0] +sub v15.4s, v1.4s, v21.4s +sqrdmulh v30.4S, v13.4S, v7.s[0] +add v1.4s, v1.4s, v21.4s +mla v20.4S, v27.4S, v31.s[0] +sub v27.4s, v6.4s, v8.4s +sqrdmulh v21.4S, v19.4S, v7.s[1] +add v6.4s, v6.4s, v8.4s +mla v2.4S, v12.4S, v31.s[0] +sub v12.4s, v3.4s, v20.4s +sqrdmulh v8.4S, v1.4S, v7.s[2] +add v3.4s, v3.4s, v20.4s +mla v25.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v2.4s +sqrdmulh v20.4S, v15.4S, v7.s[3] +add v18.4s, v18.4s, v2.4s +mul v13.4S, v13.4S,v23.s[0] +sub v2.4s, v26.4s, v25.4s +mul v19.4S, v19.4S,v23.s[1] +add v26.4s, v26.4s, v25.4s +mla v13.4S, v30.4S, v31.s[0] +str q12, [x0, #352] +mla v19.4S, v21.4S, v31.s[0] +str q3, [x0, #288] +mul v1.4S, v1.4S,v23.s[2] +str q27, [x0, #480] +mul v15.4S, v15.4S,v23.s[3] +str q6, [x0, #416] +mla v1.4S, v8.4S, v31.s[0] +str q22, [x0, #224] +mla v15.4S, v20.4S, v31.s[0] +str q18, [x0, #160] +ldr q18, [x0, #944] +sqrdmulh v20.4S, v18.4S, v28.s[0] +str q2, [x0, #96] +mul v18.4S, v18.4S,v29.s[0] +str q26, [x0, #32] +ldr q26, 
[x0, #1008] +sqrdmulh v2.4S, v26.4S, v28.s[0] +sub v22.4s, v14.4s, v13.4s +str q22, [x0, #608] +mul v26.4S, v26.4S,v29.s[0] +add v14.4s, v14.4s, v13.4s +ldr q13, [x0, #816] +sqrdmulh v22.4S, v13.4S, v28.s[0] +sub v8.4s, v24.4s, v19.4s +str q14, [x0, #544] +mul v13.4S, v13.4S,v29.s[0] +add v24.4s, v24.4s, v19.4s +ldr q19, [x0, #880] +sqrdmulh v14.4S, v19.4S, v28.s[0] +sub v6.4s, v16.4s, v1.4s +str q8, [x0, #736] +mul v19.4S, v19.4S,v29.s[0] +add v16.4s, v16.4s, v1.4s +ldr q1, [x0, #560] +mla v18.4S, v20.4S, v31.s[0] +sub v20.4s, v10.4s, v15.4s +str q24, [x0, #672] +sqrdmulh v24.4S, v1.4S, v28.s[0] +add v10.4s, v10.4s, v15.4s +ldr q15, [x0, #624] +mla v26.4S, v2.4S, v31.s[0] +str q6, [x0, #864] +sqrdmulh v6.4S, v15.4S, v28.s[0] +nop +ldr q2, [x0, #688] +mla v13.4S, v22.4S, v31.s[0] +str q16, [x0, #800] +sqrdmulh v16.4S, v2.4S, v28.s[0] +nop +ldr q22, [x0, #752] +mla v19.4S, v14.4S, v31.s[0] +str q20, [x0, #992] +sqrdmulh v20.4S, v22.4S, v28.s[0] +nop +ldr q14, [x0, #432] +ldr q8, [x0, #496] +mul v1.4S, v1.4S,v29.s[0] +sub v27.4s, v14.4s, v18.4s +str q10, [x0, #928] +mul v15.4S, v15.4S,v29.s[0] +add v14.4s, v14.4s, v18.4s +ldr q18, [x0, #304] +ldr q10, [x0, #368] +mla v1.4S, v24.4S, v31.s[0] +sub v24.4s, v8.4s, v26.4s +mla v15.4S, v6.4S, v31.s[0] +add v8.4s, v8.4s, v26.4s +ldr q26, [x0, #48] +ldr q6, [x0, #112] +mul v2.4S, v2.4S,v29.s[0] +sub v3.4s, v18.4s, v13.4s +mul v22.4S, v22.4S,v29.s[0] +add v18.4s, v18.4s, v13.4s +ldr q13, [x0, #176] +ldr q21, [x0, #240] +mla v2.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v19.4s +mla v22.4S, v20.4S, v31.s[0] +add v10.4s, v10.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v28.s[1] +nop +mul v14.4S, v14.4S,v29.s[1] +nop +sqrdmulh v20.4S, v8.4S, v28.s[1] +sub v12.4s, v26.4s, v1.4s +mul v8.4S, v8.4S,v29.s[1] +add v26.4s, v26.4s, v1.4s +sqrdmulh v1.4S, v18.4S, v28.s[1] +sub v30.4s, v6.4s, v15.4s +mul v18.4S, v18.4S,v29.s[1] +add v6.4s, v6.4s, v15.4s +sqrdmulh v15.4S, v10.4S, v28.s[1] +sub v25.4s, v13.4s, v2.4s +mul v10.4S, v10.4S,v29.s[1] +add 
v13.4s, v13.4s, v2.4s +mla v14.4S, v19.4S, v31.s[0] +sub v19.4s, v21.4s, v22.4s +sqrdmulh v2.4S, v27.4S, v28.s[2] +add v21.4s, v21.4s, v22.4s +mla v8.4S, v20.4S, v31.s[0] +nop +sqrdmulh v20.4S, v24.4S, v28.s[2] +nop +mla v18.4S, v1.4S, v31.s[0] +nop +sqrdmulh v1.4S, v3.4S, v28.s[2] +nop +mla v10.4S, v15.4S, v31.s[0] +nop +sqrdmulh v15.4S, v16.4S, v28.s[2] +nop +mul v27.4S, v27.4S,v29.s[2] +sub v22.4s, v13.4s, v14.4s +mul v24.4S, v24.4S,v29.s[2] +add v13.4s, v13.4s, v14.4s +mla v27.4S, v2.4S, v31.s[0] +sub v2.4s, v21.4s, v8.4s +mla v24.4S, v20.4S, v31.s[0] +add v21.4s, v21.4s, v8.4s +mul v3.4S, v3.4S,v29.s[2] +sub v8.4s, v26.4s, v18.4s +mul v16.4S, v16.4S,v29.s[2] +add v26.4s, v26.4s, v18.4s +mla v3.4S, v1.4S, v31.s[0] +sub v1.4s, v6.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add v6.4s, v6.4s, v10.4s +sqrdmulh v10.4S, v22.4S, v11.s[1] +nop +mul v22.4S, v22.4S,v17.s[1] +nop +sqrdmulh v15.4S, v2.4S, v11.s[1] +sub v18.4s, v25.4s, v27.4s +mul v2.4S, v2.4S,v17.s[1] +add v25.4s, v25.4s, v27.4s +sqrdmulh v27.4S, v13.4S, v11.s[0] +sub v20.4s, v19.4s, v24.4s +mul v13.4S, v13.4S,v17.s[0] +add v19.4s, v19.4s, v24.4s +sqrdmulh v24.4S, v21.4S, v11.s[0] +sub v14.4s, v12.4s, v3.4s +mul v21.4S, v21.4S,v17.s[0] +add v12.4s, v12.4s, v3.4s +mla v22.4S, v10.4S, v31.s[0] +sub v10.4s, v30.4s, v16.4s +sqrdmulh v3.4S, v25.4S, v11.s[2] +add v30.4s, v30.4s, v16.4s +mla v2.4S, v15.4S, v31.s[0] +nop +sqrdmulh v15.4S, v19.4S, v11.s[2] +nop +mla v13.4S, v27.4S, v31.s[0] +nop +sqrdmulh v27.4S, v18.4S, v11.s[3] +nop +mla v21.4S, v24.4S, v31.s[0] +nop +sqrdmulh v24.4S, v20.4S, v11.s[3] +nop +mul v25.4S, v25.4S,v17.s[2] +sub v16.4s, v8.4s, v22.4s +mul v19.4S, v19.4S,v17.s[2] +add v8.4s, v8.4s, v22.4s +mla v25.4S, v3.4S, v31.s[0] +sub v3.4s, v1.4s, v2.4s +mla v19.4S, v15.4S, v31.s[0] +add v1.4s, v1.4s, v2.4s +mul v18.4S, v18.4S,v17.s[3] +sub v2.4s, v26.4s, v13.4s +mul v20.4S, v20.4S,v17.s[3] +add v26.4s, v26.4s, v13.4s +mla v18.4S, v27.4S, v31.s[0] +sub v27.4s, v6.4s, v21.4s +mla v20.4S, v24.4S, 
v31.s[0] +add v6.4s, v6.4s, v21.4s +sqrdmulh v21.4S, v3.4S, v9.s[3] +nop +mul v3.4S, v3.4S,v0.s[3] +nop +sqrdmulh v24.4S, v1.4S, v9.s[2] +sub v13.4s, v12.4s, v25.4s +mul v1.4S, v1.4S,v0.s[2] +add v12.4s, v12.4s, v25.4s +sqrdmulh v25.4S, v27.4S, v9.s[1] +sub v15.4s, v30.4s, v19.4s +mul v27.4S, v27.4S,v0.s[1] +add v30.4s, v30.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v9.s[0] +sub v22.4s, v14.4s, v18.4s +mul v6.4S, v6.4S,v0.s[0] +add v14.4s, v14.4s, v18.4s +mla v3.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v20.4s +sqrdmulh v18.4S, v30.4S, v7.s[0] +add v10.4s, v10.4s, v20.4s +mla v1.4S, v24.4S, v31.s[0] +sub v24.4s, v16.4s, v3.4s +sqrdmulh v20.4S, v15.4S, v7.s[1] +add v16.4s, v16.4s, v3.4s +mla v27.4S, v25.4S, v31.s[0] +sub v25.4s, v8.4s, v1.4s +sqrdmulh v3.4S, v10.4S, v7.s[2] +add v8.4s, v8.4s, v1.4s +mla v6.4S, v19.4S, v31.s[0] +sub v19.4s, v2.4s, v27.4s +sqrdmulh v1.4S, v21.4S, v7.s[3] +add v2.4s, v2.4s, v27.4s +mul v30.4S, v30.4S,v23.s[0] +sub v27.4s, v26.4s, v6.4s +mul v15.4S, v15.4S,v23.s[1] +add v26.4s, v26.4s, v6.4s +mla v30.4S, v18.4S, v31.s[0] +str q25, [x0, #368] +mla v15.4S, v20.4S, v31.s[0] +str q8, [x0, #304] +mul v10.4S, v10.4S,v23.s[2] +str q24, [x0, #496] +mul v21.4S, v21.4S,v23.s[3] +str q16, [x0, #432] +mla v10.4S, v3.4S, v31.s[0] +str q19, [x0, #240] +mla v21.4S, v1.4S, v31.s[0] +str q2, [x0, #176] +ldr q2, [x0, #896] +sqrdmulh v1.4S, v2.4S, v28.s[0] +str q27, [x0, #112] +mul v2.4S, v2.4S,v29.s[0] +str q26, [x0, #48] +ldr q26, [x0, #960] +sqrdmulh v27.4S, v26.4S, v28.s[0] +sub v19.4s, v12.4s, v30.4s +str q19, [x0, #624] +mul v26.4S, v26.4S,v29.s[0] +add v12.4s, v12.4s, v30.4s +ldr q30, [x0, #768] +sqrdmulh v19.4S, v30.4S, v28.s[0] +sub v3.4s, v13.4s, v15.4s +str q12, [x0, #560] +mul v30.4S, v30.4S,v29.s[0] +add v13.4s, v13.4s, v15.4s +ldr q15, [x0, #832] +sqrdmulh v12.4S, v15.4S, v28.s[0] +sub v16.4s, v14.4s, v10.4s +str q3, [x0, #752] +mul v15.4S, v15.4S,v29.s[0] +add v14.4s, v14.4s, v10.4s +ldr q10, [x0, #512] +mla v2.4S, v1.4S, v31.s[0] +sub v1.4s, 
v22.4s, v21.4s +str q13, [x0, #688] +sqrdmulh v13.4S, v10.4S, v28.s[0] +add v22.4s, v22.4s, v21.4s +ldr q21, [x0, #576] +mla v26.4S, v27.4S, v31.s[0] +str q16, [x0, #880] +sqrdmulh v16.4S, v21.4S, v28.s[0] +nop +ldr q27, [x0, #640] +mla v30.4S, v19.4S, v31.s[0] +str q14, [x0, #816] +sqrdmulh v14.4S, v27.4S, v28.s[0] +nop +ldr q19, [x0, #704] +mla v15.4S, v12.4S, v31.s[0] +str q1, [x0, #1008] +sqrdmulh v1.4S, v19.4S, v28.s[0] +nop +ldr q12, [x0, #384] +ldr q3, [x0, #448] +mul v10.4S, v10.4S,v29.s[0] +sub v24.4s, v12.4s, v2.4s +str q22, [x0, #944] +mul v21.4S, v21.4S,v29.s[0] +add v12.4s, v12.4s, v2.4s +ldr q2, [x0, #256] +ldr q22, [x0, #320] +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v3.4s, v26.4s +mla v21.4S, v16.4S, v31.s[0] +add v3.4s, v3.4s, v26.4s +ldr q26, [x0, #0] +ldr q16, [x0, #64] +mul v27.4S, v27.4S,v29.s[0] +sub v8.4s, v2.4s, v30.4s +mul v19.4S, v19.4S,v29.s[0] +add v2.4s, v2.4s, v30.4s +ldr q30, [x0, #128] +ldr q20, [x0, #192] +mla v27.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v15.4s +mla v19.4S, v1.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v12.4S, v28.s[1] +nop +mul v12.4S, v12.4S,v29.s[1] +nop +sqrdmulh v1.4S, v3.4S, v28.s[1] +sub v25.4s, v26.4s, v10.4s +mul v3.4S, v3.4S,v29.s[1] +add v26.4s, v26.4s, v10.4s +sqrdmulh v10.4S, v2.4S, v28.s[1] +sub v18.4s, v16.4s, v21.4s +mul v2.4S, v2.4S,v29.s[1] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v28.s[1] +sub v6.4s, v30.4s, v27.4s +mul v22.4S, v22.4S,v29.s[1] +add v30.4s, v30.4s, v27.4s +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v19.4s +sqrdmulh v27.4S, v24.4S, v28.s[2] +add v20.4s, v20.4s, v19.4s +mla v3.4S, v1.4S, v31.s[0] +nop +sqrdmulh v1.4S, v13.4S, v28.s[2] +nop +mla v2.4S, v10.4S, v31.s[0] +nop +sqrdmulh v10.4S, v8.4S, v28.s[2] +nop +mla v22.4S, v21.4S, v31.s[0] +nop +sqrdmulh v21.4S, v14.4S, v28.s[2] +nop +mul v24.4S, v24.4S,v29.s[2] +sub v19.4s, v30.4s, v12.4s +mul v13.4S, v13.4S,v29.s[2] +add v30.4s, v30.4s, v12.4s +mla v24.4S, v27.4S, v31.s[0] +sub v27.4s, v20.4s, 
v3.4s +mla v13.4S, v1.4S, v31.s[0] +add v20.4s, v20.4s, v3.4s +mul v8.4S, v8.4S,v29.s[2] +sub v3.4s, v26.4s, v2.4s +mul v14.4S, v14.4S,v29.s[2] +add v26.4s, v26.4s, v2.4s +mla v8.4S, v10.4S, v31.s[0] +sub v10.4s, v16.4s, v22.4s +mla v14.4S, v21.4S, v31.s[0] +add v16.4s, v16.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v11.s[1] +nop +mul v19.4S, v19.4S,v17.s[1] +nop +sqrdmulh v21.4S, v27.4S, v11.s[1] +sub v2.4s, v6.4s, v24.4s +mul v27.4S, v27.4S,v17.s[1] +add v6.4s, v6.4s, v24.4s +sqrdmulh v24.4S, v30.4S, v11.s[0] +sub v1.4s, v15.4s, v13.4s +mul v30.4S, v30.4S,v17.s[0] +add v15.4s, v15.4s, v13.4s +sqrdmulh v13.4S, v20.4S, v11.s[0] +sub v12.4s, v25.4s, v8.4s +mul v20.4S, v20.4S,v17.s[0] +add v25.4s, v25.4s, v8.4s +mla v19.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v14.4s +sqrdmulh v8.4S, v6.4S, v11.s[2] +add v18.4s, v18.4s, v14.4s +mla v27.4S, v21.4S, v31.s[0] +nop +sqrdmulh v21.4S, v15.4S, v11.s[2] +nop +mla v30.4S, v24.4S, v31.s[0] +nop +sqrdmulh v24.4S, v2.4S, v11.s[3] +nop +mla v20.4S, v13.4S, v31.s[0] +nop +sqrdmulh v13.4S, v1.4S, v11.s[3] +nop +mul v6.4S, v6.4S,v17.s[2] +sub v14.4s, v3.4s, v19.4s +mul v15.4S, v15.4S,v17.s[2] +add v3.4s, v3.4s, v19.4s +mla v6.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v27.4s +mla v15.4S, v21.4S, v31.s[0] +add v10.4s, v10.4s, v27.4s +mul v2.4S, v2.4S,v17.s[3] +sub v27.4s, v26.4s, v30.4s +mul v1.4S, v1.4S,v17.s[3] +add v26.4s, v26.4s, v30.4s +mla v2.4S, v24.4S, v31.s[0] +sub v24.4s, v16.4s, v20.4s +mla v1.4S, v13.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v9.s[3] +nop +mul v8.4S, v8.4S,v0.s[3] +nop +sqrdmulh v13.4S, v10.4S, v9.s[2] +sub v30.4s, v25.4s, v6.4s +mul v10.4S, v10.4S,v0.s[2] +add v25.4s, v25.4s, v6.4s +sqrdmulh v6.4S, v24.4S, v9.s[1] +sub v21.4s, v18.4s, v15.4s +mul v24.4S, v24.4S,v0.s[1] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v9.s[0] +sub v19.4s, v12.4s, v2.4s +mul v16.4S, v16.4S,v0.s[0] +add v12.4s, v12.4s, v2.4s +mla v8.4S, v20.4S, v31.s[0] +sub v20.4s, v22.4s, v1.4s +sqrdmulh v2.4S, v18.4S, 
v7.s[0] +add v22.4s, v22.4s, v1.4s +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v14.4s, v8.4s +sqrdmulh v1.4S, v21.4S, v7.s[1] +add v14.4s, v14.4s, v8.4s +mla v24.4S, v6.4S, v31.s[0] +sub v6.4s, v3.4s, v10.4s +sqrdmulh v8.4S, v22.4S, v7.s[2] +add v3.4s, v3.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v27.4s, v24.4s +sqrdmulh v10.4S, v20.4S, v7.s[3] +add v27.4s, v27.4s, v24.4s +mul v18.4S, v18.4S,v23.s[0] +sub v24.4s, v26.4s, v16.4s +mul v21.4S, v21.4S,v23.s[1] +add v26.4s, v26.4s, v16.4s +mla v18.4S, v2.4S, v31.s[0] +str q6, [x0, #320] +mla v21.4S, v1.4S, v31.s[0] +str q3, [x0, #256] +mul v22.4S, v22.4S,v23.s[2] +str q13, [x0, #448] +mul v20.4S, v20.4S,v23.s[3] +str q14, [x0, #384] +mla v22.4S, v8.4S, v31.s[0] +str q15, [x0, #192] +mla v20.4S, v10.4S, v31.s[0] +str q27, [x0, #128] +ldr q27, [x0, #912] +sqrdmulh v10.4S, v27.4S, v28.s[0] +str q24, [x0, #64] +mul v27.4S, v27.4S,v29.s[0] +str q26, [x0, #0] +ldr q26, [x0, #976] +sqrdmulh v24.4S, v26.4S, v28.s[0] +sub v15.4s, v25.4s, v18.4s +str q15, [x0, #576] +mul v26.4S, v26.4S,v29.s[0] +add v25.4s, v25.4s, v18.4s +ldr q18, [x0, #784] +sqrdmulh v15.4S, v18.4S, v28.s[0] +sub v8.4s, v30.4s, v21.4s +str q25, [x0, #512] +mul v18.4S, v18.4S,v29.s[0] +add v30.4s, v30.4s, v21.4s +ldr q21, [x0, #848] +sqrdmulh v25.4S, v21.4S, v28.s[0] +sub v14.4s, v12.4s, v22.4s +str q8, [x0, #704] +mul v21.4S, v21.4S,v29.s[0] +add v12.4s, v12.4s, v22.4s +ldr q22, [x0, #528] +mla v27.4S, v10.4S, v31.s[0] +sub v10.4s, v19.4s, v20.4s +str q30, [x0, #640] +sqrdmulh v30.4S, v22.4S, v28.s[0] +add v19.4s, v19.4s, v20.4s +ldr q20, [x0, #592] +mla v26.4S, v24.4S, v31.s[0] +str q14, [x0, #832] +sqrdmulh v14.4S, v20.4S, v28.s[0] +nop +ldr q24, [x0, #656] +mla v18.4S, v15.4S, v31.s[0] +str q12, [x0, #768] +sqrdmulh v12.4S, v24.4S, v28.s[0] +nop +ldr q15, [x0, #720] +mla v21.4S, v25.4S, v31.s[0] +str q10, [x0, #960] +sqrdmulh v10.4S, v15.4S, v28.s[0] +nop +ldr q25, [x0, #400] +ldr q8, [x0, #464] +mul v22.4S, v22.4S,v29.s[0] +sub v13.4s, v25.4s, 
v27.4s +str q19, [x0, #896] +mul v20.4S, v20.4S,v29.s[0] +add v25.4s, v25.4s, v27.4s +ldr q27, [x0, #272] +ldr q19, [x0, #336] +mla v22.4S, v30.4S, v31.s[0] +sub v30.4s, v8.4s, v26.4s +mla v20.4S, v14.4S, v31.s[0] +add v8.4s, v8.4s, v26.4s +ldr q26, [x0, #16] +ldr q14, [x0, #80] +mul v24.4S, v24.4S,v29.s[0] +sub v3.4s, v27.4s, v18.4s +mul v15.4S, v15.4S,v29.s[0] +add v27.4s, v27.4s, v18.4s +ldr q18, [x0, #144] +ldr q1, [x0, #208] +mla v24.4S, v12.4S, v31.s[0] +sub v12.4s, v19.4s, v21.4s +mla v15.4S, v10.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +sqrdmulh v21.4S, v25.4S, v28.s[1] +nop +mul v25.4S, v25.4S,v29.s[1] +nop +sqrdmulh v10.4S, v8.4S, v28.s[1] +sub v6.4s, v26.4s, v22.4s +mul v8.4S, v8.4S,v29.s[1] +add v26.4s, v26.4s, v22.4s +sqrdmulh v22.4S, v27.4S, v28.s[1] +sub v2.4s, v14.4s, v20.4s +mul v27.4S, v27.4S,v29.s[1] +add v14.4s, v14.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v28.s[1] +sub v16.4s, v18.4s, v24.4s +mul v19.4S, v19.4S,v29.s[1] +add v18.4s, v18.4s, v24.4s +mla v25.4S, v21.4S, v31.s[0] +sub v21.4s, v1.4s, v15.4s +sqrdmulh v24.4S, v13.4S, v28.s[2] +add v1.4s, v1.4s, v15.4s +mla v8.4S, v10.4S, v31.s[0] +nop +sqrdmulh v10.4S, v30.4S, v28.s[2] +nop +mla v27.4S, v22.4S, v31.s[0] +nop +sqrdmulh v22.4S, v3.4S, v28.s[2] +nop +mla v19.4S, v20.4S, v31.s[0] +nop +sqrdmulh v20.4S, v12.4S, v28.s[2] +nop +mul v13.4S, v13.4S,v29.s[2] +sub v15.4s, v18.4s, v25.4s +mul v30.4S, v30.4S,v29.s[2] +add v18.4s, v18.4s, v25.4s +mla v13.4S, v24.4S, v31.s[0] +sub v24.4s, v1.4s, v8.4s +mla v30.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +mul v3.4S, v3.4S,v29.s[2] +sub v8.4s, v26.4s, v27.4s +mul v12.4S, v12.4S,v29.s[2] +add v26.4s, v26.4s, v27.4s +mla v3.4S, v22.4S, v31.s[0] +sub v22.4s, v14.4s, v19.4s +mla v12.4S, v20.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v28.4S, v15.4S, v11.s[1] +nop +mul v15.4S, v15.4S,v17.s[1] +nop +sqrdmulh v29.4S, v24.4S, v11.s[1] +sub v19.4s, v16.4s, v13.4s +mul v24.4S, v24.4S,v17.s[1] +add v16.4s, v16.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v11.s[0] 
+sub v20.4s, v21.4s, v30.4s +mul v18.4S, v18.4S,v17.s[0] +add v21.4s, v21.4s, v30.4s +sqrdmulh v30.4S, v1.4S, v11.s[0] +sub v27.4s, v6.4s, v3.4s +mul v1.4S, v1.4S,v17.s[0] +add v6.4s, v6.4s, v3.4s +mla v15.4S, v28.4S, v31.s[0] +sub v28.4s, v2.4s, v12.4s +sqrdmulh v3.4S, v16.4S, v11.s[2] +add v2.4s, v2.4s, v12.4s +mla v24.4S, v29.4S, v31.s[0] +nop +sqrdmulh v29.4S, v21.4S, v11.s[2] +nop +mla v18.4S, v13.4S, v31.s[0] +nop +sqrdmulh v13.4S, v19.4S, v11.s[3] +nop +mla v1.4S, v30.4S, v31.s[0] +nop +sqrdmulh v30.4S, v20.4S, v11.s[3] +nop +mul v16.4S, v16.4S,v17.s[2] +sub v12.4s, v8.4s, v15.4s +mul v21.4S, v21.4S,v17.s[2] +add v8.4s, v8.4s, v15.4s +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v22.4s, v24.4s +mla v21.4S, v29.4S, v31.s[0] +add v22.4s, v22.4s, v24.4s +mul v19.4S, v19.4S,v17.s[3] +sub v24.4s, v26.4s, v18.4s +mul v20.4S, v20.4S,v17.s[3] +add v26.4s, v26.4s, v18.4s +mla v19.4S, v13.4S, v31.s[0] +sub v13.4s, v14.4s, v1.4s +mla v20.4S, v30.4S, v31.s[0] +add v14.4s, v14.4s, v1.4s +sqrdmulh v11.4S, v3.4S, v9.s[3] +nop +mul v3.4S, v3.4S,v0.s[3] +nop +sqrdmulh v17.4S, v22.4S, v9.s[2] +sub v1.4s, v6.4s, v16.4s +mul v22.4S, v22.4S,v0.s[2] +add v6.4s, v6.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v9.s[1] +sub v30.4s, v2.4s, v21.4s +mul v13.4S, v13.4S,v0.s[1] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v9.s[0] +sub v18.4s, v27.4s, v19.4s +mul v14.4S, v14.4S,v0.s[0] +add v27.4s, v27.4s, v19.4s +mla v3.4S, v11.4S, v31.s[0] +sub v11.4s, v28.4s, v20.4s +sqrdmulh v9.4S, v2.4S, v7.s[0] +add v28.4s, v28.4s, v20.4s +mla v22.4S, v17.4S, v31.s[0] +sub v17.4s, v12.4s, v3.4s +sqrdmulh v20.4S, v30.4S, v7.s[1] +add v12.4s, v12.4s, v3.4s +mla v13.4S, v16.4S, v31.s[0] +sub v16.4s, v8.4s, v22.4s +sqrdmulh v3.4S, v28.4S, v7.s[2] +add v8.4s, v8.4s, v22.4s +mla v14.4S, v21.4S, v31.s[0] +sub v21.4s, v24.4s, v13.4s +sqrdmulh v22.4S, v11.4S, v7.s[3] +add v24.4s, v24.4s, v13.4s +mul v2.4S, v2.4S,v23.s[0] +sub v13.4s, v26.4s, v14.4s +mul v30.4S, v30.4S,v23.s[1] +add v26.4s, v26.4s, v14.4s +mla 
v2.4S, v9.4S, v31.s[0] +str q16, [x0, #336] +mla v30.4S, v20.4S, v31.s[0] +str q8, [x0, #272] +mul v28.4S, v28.4S,v23.s[2] +str q17, [x0, #464] +mul v11.4S, v11.4S,v23.s[3] +str q12, [x0, #400] +mla v28.4S, v3.4S, v31.s[0] +str q21, [x0, #208] +mla v11.4S, v22.4S, v31.s[0] +str q24, [x0, #144] +str q13, [x0, #80] +str q26, [x0, #16] +sub v26.4s, v6.4s, v2.4s +str q26, [x0, #592] +add v6.4s, v6.4s, v2.4s +sub v2.4s, v1.4s, v30.4s +str q6, [x0, #528] +add v1.4s, v1.4s, v30.4s +sub v30.4s, v27.4s, v28.4s +str q2, [x0, #720] +add v27.4s, v27.4s, v28.4s +sub v28.4s, v18.4s, v11.4s +str q1, [x0, #656] +add v18.4s, v18.4s, v11.4s +str q30, [x0, #848] +str q27, [x0, #784] +str q28, [x0, #976] +str q18, [x0, #912] +ldr q4, [x0, #224] +ldr q5, [x0, #160] +ldr q25, [x0, #32] +ldr q10, [x17, #+128] +ldr q15, [x17, #+144] +sqrdmulh v29.4S, v25.4S, v15.s[0] +mul v25.4S, v25.4S,v10.s[0] +ldr q19, [x0, #48] +sqrdmulh v0.4S, v19.4S, v15.s[0] +mul v19.4S, v19.4S,v10.s[0] +ldr q14, [x17, #+160] +ldr q9, [x17, #+176] +ldr q16, [x0, #96] +sqrdmulh v20.4S, v16.4S, v9.s[0] +mul v16.4S, v16.4S,v14.s[0] +ldr q8, [x0, #112] +sqrdmulh v17.4S, v8.4S, v9.s[0] +mul v8.4S, v8.4S,v14.s[0] +ldr q12, [x17, #+192] +ldr q3, [x17, #+208] +mla v25.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v5.4S, v3.s[0] +ldr q21, [x0, #176] +mla v19.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v21.4S, v3.s[0] +ldr q22, [x17, #+224] +ldr q24, [x17, #+240] +mla v16.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v4.4S, v24.s[0] +ldr q23, [x0, #240] +mla v8.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v23.4S, v24.s[0] +ldr q7, [x0, #0] +ldr q13, [x0, #128] +mul v5.4S, v5.4S,v12.s[0] +sub v26.4s, v7.4s, v25.4s +ldr q6, [x0, #16] +mul v21.4S, v21.4S,v12.s[0] +add v7.4s, v7.4s, v25.4s +ldr q25, [x0, #144] +mla v5.4S, v29.4S, v31.s[0] +sub v29.4s, v6.4s, v19.4s +ldr q2, [x0, #64] +mla v21.4S, v0.4S, v31.s[0] +add v6.4s, v6.4s, v19.4s +ldr q19, [x0, #192] +mul v4.4S, v4.4S,v22.s[0] +sub v0.4s, v2.4s, v16.4s +ldr q1, [x0, #80] +mul v23.4S, 
v23.4S,v22.s[0] +add v2.4s, v2.4s, v16.4s +ldr q16, [x0, #208] +mla v4.4S, v20.4S, v31.s[0] +mla v23.4S, v17.4S, v31.s[0] +sub v17.4s, v1.4s, v8.4s +sqrdmulh v20.4S, v6.4S, v15.s[1] +add v1.4s, v1.4s, v8.4s +mul v6.4S, v6.4S,v10.s[1] +sqrdmulh v8.4S, v29.4S, v15.s[2] +sub v11.4s, v13.4s, v5.4s +mul v29.4S, v29.4S,v10.s[2] +add v13.4s, v13.4s, v5.4s +sqrdmulh v15.4S, v1.4S, v9.s[1] +sub v10.4s, v25.4s, v21.4s +mul v1.4S, v1.4S,v14.s[1] +add v25.4s, v25.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v9.s[2] +sub v5.4s, v19.4s, v4.4s +mul v17.4S, v17.4S,v14.s[2] +add v19.4s, v19.4s, v4.4s +mla v6.4S, v20.4S, v31.s[0] +sub v20.4s, v16.4s, v23.4s +ldr q9, [x0, #480] +sqrdmulh v14.4S, v25.4S, v3.s[1] +add v16.4s, v16.4s, v23.4s +mla v29.4S, v8.4S, v31.s[0] +ldr q8, [x0, #416] +sqrdmulh v23.4S, v10.4S, v3.s[2] +sub v4.4s, v7.4s, v6.4s +mla v1.4S, v15.4S, v31.s[0] +ldr q15, [x0, #288] +sqrdmulh v30.4S, v16.4S, v24.s[1] +add v7.4s, v7.4s, v6.4s +str q4, [x0, #16] +mla v17.4S, v21.4S, v31.s[0] +ldr q21, [x17, #+256] +ldr q4, [x17, #+272] +sqrdmulh v6.4S, v20.4S, v24.s[2] +sub v27.4s, v26.4s, v29.4s +str q7, [x0, #0] +mul v25.4S, v25.4S,v12.s[1] +add v26.4s, v26.4s, v29.4s +mul v10.4S, v10.4S,v12.s[2] +str q27, [x0, #48] +mla v25.4S, v14.4S, v31.s[0] +sub v14.4s, v2.4s, v1.4s +mla v10.4S, v23.4S, v31.s[0] +str q26, [x0, #32] +mul v16.4S, v16.4S,v22.s[1] +str q14, [x0, #80] +mul v20.4S, v20.4S,v22.s[2] +add v2.4s, v2.4s, v1.4s +str q2, [x0, #64] +mla v16.4S, v30.4S, v31.s[0] +sub v30.4s, v0.4s, v17.4s +str q30, [x0, #112] +mla v20.4S, v6.4S, v31.s[0] +add v0.4s, v0.4s, v17.4s +str q0, [x0, #96] +sqrdmulh v24.4S, v15.4S, v4.s[0] +sub v22.4s, v13.4s, v25.4s +mul v15.4S, v15.4S,v21.s[0] +str q22, [x0, #144] +ldr q22, [x0, #304] +sqrdmulh v0.4S, v22.4S, v4.s[0] +add v13.4s, v13.4s, v25.4s +mul v22.4S, v22.4S,v21.s[0] +str q13, [x0, #128] +ldr q13, [x17, #+288] +ldr q25, [x17, #+304] +ldr q17, [x0, #352] +sqrdmulh v6.4S, v17.4S, v25.s[0] +sub v30.4s, v11.4s, v10.4s +mul v17.4S, 
v17.4S,v13.s[0] +str q30, [x0, #176] +ldr q30, [x0, #368] +sqrdmulh v2.4S, v30.4S, v25.s[0] +add v11.4s, v11.4s, v10.4s +mul v30.4S, v30.4S,v13.s[0] +str q11, [x0, #160] +ldr q11, [x17, #+320] +ldr q10, [x17, #+336] +mla v15.4S, v24.4S, v31.s[0] +sub v24.4s, v19.4s, v16.4s +sqrdmulh v1.4S, v8.4S, v10.s[0] +str q24, [x0, #208] +ldr q24, [x0, #432] +mla v22.4S, v0.4S, v31.s[0] +add v19.4s, v19.4s, v16.4s +sqrdmulh v16.4S, v24.4S, v10.s[0] +str q19, [x0, #192] +ldr q19, [x17, #+352] +ldr q0, [x17, #+368] +mla v17.4S, v6.4S, v31.s[0] +sub v6.4s, v5.4s, v20.4s +sqrdmulh v14.4S, v9.4S, v0.s[0] +str q6, [x0, #240] +ldr q6, [x0, #496] +mla v30.4S, v2.4S, v31.s[0] +add v5.4s, v5.4s, v20.4s +sqrdmulh v20.4S, v6.4S, v0.s[0] +str q5, [x0, #224] +ldr q5, [x0, #256] +ldr q2, [x0, #384] +mul v8.4S, v8.4S,v11.s[0] +sub v3.4s, v5.4s, v15.4s +ldr q12, [x0, #272] +mul v24.4S, v24.4S,v11.s[0] +add v5.4s, v5.4s, v15.4s +ldr q15, [x0, #400] +mla v8.4S, v1.4S, v31.s[0] +sub v1.4s, v12.4s, v22.4s +ldr q26, [x0, #320] +mla v24.4S, v16.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s +ldr q22, [x0, #448] +mul v9.4S, v9.4S,v19.s[0] +sub v16.4s, v26.4s, v17.4s +ldr q23, [x0, #336] +mul v6.4S, v6.4S,v19.s[0] +add v26.4s, v26.4s, v17.4s +ldr q17, [x0, #464] +mla v9.4S, v14.4S, v31.s[0] +mla v6.4S, v20.4S, v31.s[0] +sub v20.4s, v23.4s, v30.4s +sqrdmulh v14.4S, v12.4S, v4.s[1] +add v23.4s, v23.4s, v30.4s +mul v12.4S, v12.4S,v21.s[1] +sqrdmulh v30.4S, v1.4S, v4.s[2] +sub v27.4s, v2.4s, v8.4s +mul v1.4S, v1.4S,v21.s[2] +add v2.4s, v2.4s, v8.4s +sqrdmulh v4.4S, v23.4S, v25.s[1] +sub v21.4s, v15.4s, v24.4s +mul v23.4S, v23.4S,v13.s[1] +add v15.4s, v15.4s, v24.4s +sqrdmulh v24.4S, v20.4S, v25.s[2] +sub v8.4s, v22.4s, v9.4s +mul v20.4S, v20.4S,v13.s[2] +add v22.4s, v22.4s, v9.4s +mla v12.4S, v14.4S, v31.s[0] +sub v14.4s, v17.4s, v6.4s +ldr q25, [x0, #736] +sqrdmulh v13.4S, v15.4S, v10.s[1] +add v17.4s, v17.4s, v6.4s +mla v1.4S, v30.4S, v31.s[0] +ldr q30, [x0, #672] +sqrdmulh v6.4S, v21.4S, v10.s[2] +sub v9.4s, 
v5.4s, v12.4s +mla v23.4S, v4.4S, v31.s[0] +ldr q4, [x0, #544] +sqrdmulh v29.4S, v17.4S, v0.s[1] +add v5.4s, v5.4s, v12.4s +str q9, [x0, #272] +mla v20.4S, v24.4S, v31.s[0] +ldr q24, [x17, #+384] +ldr q9, [x17, #+400] +sqrdmulh v12.4S, v14.4S, v0.s[2] +sub v7.4s, v3.4s, v1.4s +str q5, [x0, #256] +mul v15.4S, v15.4S,v11.s[1] +add v3.4s, v3.4s, v1.4s +mul v21.4S, v21.4S,v11.s[2] +str q7, [x0, #304] +mla v15.4S, v13.4S, v31.s[0] +sub v13.4s, v26.4s, v23.4s +mla v21.4S, v6.4S, v31.s[0] +str q3, [x0, #288] +mul v17.4S, v17.4S,v19.s[1] +str q13, [x0, #336] +mul v14.4S, v14.4S,v19.s[2] +add v26.4s, v26.4s, v23.4s +str q26, [x0, #320] +mla v17.4S, v29.4S, v31.s[0] +sub v29.4s, v16.4s, v20.4s +str q29, [x0, #368] +mla v14.4S, v12.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +str q16, [x0, #352] +sqrdmulh v0.4S, v4.4S, v9.s[0] +sub v19.4s, v2.4s, v15.4s +mul v4.4S, v4.4S,v24.s[0] +str q19, [x0, #400] +ldr q19, [x0, #560] +sqrdmulh v16.4S, v19.4S, v9.s[0] +add v2.4s, v2.4s, v15.4s +mul v19.4S, v19.4S,v24.s[0] +str q2, [x0, #384] +ldr q2, [x17, #+416] +ldr q15, [x17, #+432] +ldr q20, [x0, #608] +sqrdmulh v12.4S, v20.4S, v15.s[0] +sub v29.4s, v27.4s, v21.4s +mul v20.4S, v20.4S,v2.s[0] +str q29, [x0, #432] +ldr q29, [x0, #624] +sqrdmulh v26.4S, v29.4S, v15.s[0] +add v27.4s, v27.4s, v21.4s +mul v29.4S, v29.4S,v2.s[0] +str q27, [x0, #416] +ldr q27, [x17, #+448] +ldr q21, [x17, #+464] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v17.4s +sqrdmulh v23.4S, v30.4S, v21.s[0] +str q0, [x0, #464] +ldr q0, [x0, #688] +mla v19.4S, v16.4S, v31.s[0] +add v22.4s, v22.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v21.s[0] +str q22, [x0, #448] +ldr q22, [x17, #+480] +ldr q16, [x17, #+496] +mla v20.4S, v12.4S, v31.s[0] +sub v12.4s, v8.4s, v14.4s +sqrdmulh v13.4S, v25.4S, v16.s[0] +str q12, [x0, #496] +ldr q12, [x0, #752] +mla v29.4S, v26.4S, v31.s[0] +add v8.4s, v8.4s, v14.4s +sqrdmulh v14.4S, v12.4S, v16.s[0] +str q8, [x0, #480] +ldr q8, [x0, #512] +ldr q26, [x0, #640] +mul v30.4S, v30.4S,v27.s[0] +sub 
v10.4s, v8.4s, v4.4s +ldr q11, [x0, #528] +mul v0.4S, v0.4S,v27.s[0] +add v8.4s, v8.4s, v4.4s +ldr q4, [x0, #656] +mla v30.4S, v23.4S, v31.s[0] +sub v23.4s, v11.4s, v19.4s +ldr q3, [x0, #576] +mla v0.4S, v17.4S, v31.s[0] +add v11.4s, v11.4s, v19.4s +ldr q19, [x0, #704] +mul v25.4S, v25.4S,v22.s[0] +sub v17.4s, v3.4s, v20.4s +ldr q6, [x0, #592] +mul v12.4S, v12.4S,v22.s[0] +add v3.4s, v3.4s, v20.4s +ldr q20, [x0, #720] +mla v25.4S, v13.4S, v31.s[0] +mla v12.4S, v14.4S, v31.s[0] +sub v14.4s, v6.4s, v29.4s +sqrdmulh v13.4S, v11.4S, v9.s[1] +add v6.4s, v6.4s, v29.4s +mul v11.4S, v11.4S,v24.s[1] +sqrdmulh v29.4S, v23.4S, v9.s[2] +sub v7.4s, v26.4s, v30.4s +mul v23.4S, v23.4S,v24.s[2] +add v26.4s, v26.4s, v30.4s +sqrdmulh v9.4S, v6.4S, v15.s[1] +sub v24.4s, v4.4s, v0.4s +mul v6.4S, v6.4S,v2.s[1] +add v4.4s, v4.4s, v0.4s +sqrdmulh v0.4S, v14.4S, v15.s[2] +sub v30.4s, v19.4s, v25.4s +mul v14.4S, v14.4S,v2.s[2] +add v19.4s, v19.4s, v25.4s +mla v11.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v12.4s +ldr q15, [x0, #992] +sqrdmulh v2.4S, v4.4S, v21.s[1] +add v20.4s, v20.4s, v12.4s +mla v23.4S, v29.4S, v31.s[0] +ldr q29, [x0, #928] +sqrdmulh v12.4S, v24.4S, v21.s[2] +sub v25.4s, v8.4s, v11.4s +mla v6.4S, v9.4S, v31.s[0] +ldr q9, [x0, #800] +sqrdmulh v1.4S, v20.4S, v16.s[1] +add v8.4s, v8.4s, v11.4s +str q25, [x0, #528] +mla v14.4S, v0.4S, v31.s[0] +ldr q0, [x17, #+512] +ldr q25, [x17, #+528] +sqrdmulh v11.4S, v13.4S, v16.s[2] +sub v5.4s, v10.4s, v23.4s +str q8, [x0, #512] +mul v4.4S, v4.4S,v27.s[1] +add v10.4s, v10.4s, v23.4s +mul v24.4S, v24.4S,v27.s[2] +str q5, [x0, #560] +mla v4.4S, v2.4S, v31.s[0] +sub v2.4s, v3.4s, v6.4s +mla v24.4S, v12.4S, v31.s[0] +str q10, [x0, #544] +mul v20.4S, v20.4S,v22.s[1] +str q2, [x0, #592] +mul v13.4S, v13.4S,v22.s[2] +add v3.4s, v3.4s, v6.4s +str q3, [x0, #576] +mla v20.4S, v1.4S, v31.s[0] +sub v1.4s, v17.4s, v14.4s +str q1, [x0, #624] +mla v13.4S, v11.4S, v31.s[0] +add v17.4s, v17.4s, v14.4s +str q17, [x0, #608] +sqrdmulh v16.4S, v9.4S, 
v25.s[0] +sub v22.4s, v26.4s, v4.4s +mul v9.4S, v9.4S,v0.s[0] +str q22, [x0, #656] +ldr q22, [x0, #816] +sqrdmulh v17.4S, v22.4S, v25.s[0] +add v26.4s, v26.4s, v4.4s +mul v22.4S, v22.4S,v0.s[0] +str q26, [x0, #640] +ldr q26, [x17, #+544] +ldr q4, [x17, #+560] +ldr q14, [x0, #864] +sqrdmulh v11.4S, v14.4S, v4.s[0] +sub v1.4s, v7.4s, v24.4s +mul v14.4S, v14.4S,v26.s[0] +str q1, [x0, #688] +ldr q1, [x0, #880] +sqrdmulh v3.4S, v1.4S, v4.s[0] +add v7.4s, v7.4s, v24.4s +mul v1.4S, v1.4S,v26.s[0] +str q7, [x0, #672] +ldr q7, [x17, #+576] +ldr q24, [x17, #+592] +mla v9.4S, v16.4S, v31.s[0] +sub v16.4s, v19.4s, v20.4s +sqrdmulh v6.4S, v29.4S, v24.s[0] +str q16, [x0, #720] +ldr q16, [x0, #944] +mla v22.4S, v17.4S, v31.s[0] +add v19.4s, v19.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v24.s[0] +str q19, [x0, #704] +ldr q19, [x17, #+608] +ldr q17, [x17, #+624] +mla v14.4S, v11.4S, v31.s[0] +sub v11.4s, v30.4s, v13.4s +sqrdmulh v2.4S, v15.4S, v17.s[0] +str q11, [x0, #752] +ldr q11, [x0, #1008] +mla v1.4S, v3.4S, v31.s[0] +add v30.4s, v30.4s, v13.4s +sqrdmulh v13.4S, v11.4S, v17.s[0] +str q30, [x0, #736] +ldr q30, [x0, #768] +ldr q3, [x0, #896] +mul v29.4S, v29.4S,v7.s[0] +sub v21.4s, v30.4s, v9.4s +ldr q27, [x0, #784] +mul v16.4S, v16.4S,v7.s[0] +add v30.4s, v30.4s, v9.4s +ldr q9, [x0, #912] +mla v29.4S, v6.4S, v31.s[0] +sub v6.4s, v27.4s, v22.4s +ldr q10, [x0, #832] +mla v16.4S, v20.4S, v31.s[0] +add v27.4s, v27.4s, v22.4s +ldr q22, [x0, #960] +mul v15.4S, v15.4S,v19.s[0] +sub v20.4s, v10.4s, v14.4s +ldr q12, [x0, #848] +mul v11.4S, v11.4S,v19.s[0] +add v10.4s, v10.4s, v14.4s +ldr q14, [x0, #976] +mla v15.4S, v2.4S, v31.s[0] +mla v11.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v1.4s +sqrdmulh v2.4S, v27.4S, v25.s[1] +add v12.4s, v12.4s, v1.4s +mul v27.4S, v27.4S,v0.s[1] +sqrdmulh v1.4S, v6.4S, v25.s[2] +sub v5.4s, v3.4s, v29.4s +mul v6.4S, v6.4S,v0.s[2] +add v3.4s, v3.4s, v29.4s +sqrdmulh v25.4S, v12.4S, v4.s[1] +sub v0.4s, v9.4s, v16.4s +mul v12.4S, v12.4S,v26.s[1] +add v9.4s, v9.4s, 
v16.4s +sqrdmulh v16.4S, v13.4S, v4.s[2] +sub v29.4s, v22.4s, v15.4s +mul v13.4S, v13.4S,v26.s[2] +add v22.4s, v22.4s, v15.4s +mla v27.4S, v2.4S, v31.s[0] +sub v2.4s, v14.4s, v11.4s +sqrdmulh v4.4S, v9.4S, v24.s[1] +add v14.4s, v14.4s, v11.4s +mla v6.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v0.4S, v24.s[2] +sub v11.4s, v30.4s, v27.4s +mla v12.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v14.4S, v17.s[1] +add v30.4s, v30.4s, v27.4s +str q11, [x0, #784] +mla v13.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v2.4S, v17.s[2] +sub v11.4s, v21.4s, v6.4s +str q30, [x0, #768] +mul v9.4S, v9.4S,v7.s[1] +add v21.4s, v21.4s, v6.4s +mul v0.4S, v0.4S,v7.s[2] +str q11, [x0, #816] +mla v9.4S, v4.4S, v31.s[0] +sub v4.4s, v10.4s, v12.4s +mla v0.4S, v1.4S, v31.s[0] +str q21, [x0, #800] +mul v14.4S, v14.4S,v19.s[1] +str q4, [x0, #848] +mul v2.4S, v2.4S,v19.s[2] +add v10.4s, v10.4s, v12.4s +str q10, [x0, #832] +mla v14.4S, v25.4S, v31.s[0] +sub v25.4s, v20.4s, v13.4s +str q25, [x0, #880] +mla v2.4S, v16.4S, v31.s[0] +add v20.4s, v20.4s, v13.4s +str q20, [x0, #864] +sub v17.4s, v3.4s, v9.4s +str q17, [x0, #912] +add v3.4s, v3.4s, v9.4s +str q3, [x0, #896] +sub v3.4s, v5.4s, v0.4s +str q3, [x0, #944] +add v5.4s, v5.4s, v0.4s +str q5, [x0, #928] +sub v5.4s, v22.4s, v14.4s +str q5, [x0, #976] +add v22.4s, v22.4s, v14.4s +str q22, [x0, #960] +sub v22.4s, v29.4s, v2.4s +str q22, [x0, #1008] +add v29.4s, v29.4s, v2.4s +str q29, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1548 +// Instruction count: 1544 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_15_z4_7.s 
b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_15_z4_7.s new file mode 100644 index 0000000..cbb2ab0 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_15_z4_7.s @@ -0,0 +1,1578 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include // NOTE(review): no header name follows this directive in the text under review, while the ASM_LOAD macro used below needs a definition; the include target may have been lost in extraction, confirm against the repository +modulus: // One 16-byte row that the routine below loads into q31; only lane 0 is non-zero +.word -33556993 // lane 0, read as v31.s[0] by the mla steps; presumably the negated modulus 33556993 that appears in the file name, confirm +.word 0 +.word 0 +.word 0 +.align 6 // align the table that follows to 64 bytes (the operand is a power-of-two exponent) +roots_merged: // Constant table addressed through x17 in 16-byte rows. Rows are paired: the first row of a pair supplies the by-element operand of mul, the second row (same lane) supplies the operand of the matching sqrdmulh, and the two products are merged by mla with v31.s[0]. The trailing comment on each word is the generator's layer and block index; zero words tagged Layer None look like padding that keeps every row four words wide. NOTE(review): second-row values look like round(root * 2^31 / 33556993), confirm against the generator +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global 
ntt_u32_incomplete_neon_asm_var_4_2_15_z4_7 +.global _ntt_u32_incomplete_neon_asm_var_4_2_15_z4_7 +ntt_u32_incomplete_neon_asm_var_4_2_15_z4_7: +_ntt_u32_incomplete_neon_asm_var_4_2_15_z4_7: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x0, #992] +ldr q29, [x17, #+0] +ldr q28, [x17, #+16] +sqrdmulh v27.4S, v30.4S, v28.s[0] +mul v30.4S, v30.4S,v29.s[0] +ldr q26, [x0, #928] +sqrdmulh v25.4S, v26.4S, v28.s[0] +mul v26.4S, v26.4S,v29.s[0] +ldr q24, [x0, #864] +sqrdmulh v23.4S, v24.4S, v28.s[0] +mul v24.4S, v24.4S,v29.s[0] +ldr q22, [x0, #800] +sqrdmulh v21.4S, v22.4S, v28.s[0] +mul v22.4S, v22.4S,v29.s[0] +ldr q20, [x0, #736] +mla v30.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v20.4S, v28.s[0] +ldr q19, [x0, #672] +mla v26.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v19.4S, v28.s[0] +nop +ldr q18, [x0, #608] +mla v24.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v18.4S, v28.s[0] +nop +ldr q17, [x0, #544] +mla v22.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v17.4S, v28.s[0] +nop +ldr q16, [x0, #480] +ldr q3, [x0, #416] +mul v20.4S, v20.4S,v29.s[0] +sub v2.4s, v16.4s, v30.4s +mul v19.4S, v19.4S,v29.s[0] +add v16.4s, v16.4s, v30.4s +ldr q30, [x0, #352] +ldr q1, [x0, #288] +mla v20.4S, v27.4S, v31.s[0] +sub v27.4s, v3.4s, v26.4s +mla v19.4S, v25.4S, v31.s[0] +add v3.4s, v3.4s, v26.4s +ldr q26, [x0, #224] +ldr q25, [x0, #160] +mul v18.4S, v18.4S,v29.s[0] +sub v0.4s, v30.4s, v24.4s +mul v17.4S, v17.4S,v29.s[0] +add v30.4s, v30.4s, v24.4s +ldr q24, [x0, #96] +ldr q15, [x0, #32] +mla v18.4S, v23.4S, v31.s[0] +sub v23.4s, v1.4s, v22.4s +mla v17.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v22.4s 
+sqrdmulh v22.4S, v2.4S, v28.s[2] +nop +mul v2.4S, v2.4S,v29.s[2] +nop +sqrdmulh v21.4S, v27.4S, v28.s[2] +sub v14.4s, v26.4s, v20.4s +mul v27.4S, v27.4S,v29.s[2] +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v28.s[2] +sub v13.4s, v25.4s, v19.4s +mul v0.4S, v0.4S,v29.s[2] +add v25.4s, v25.4s, v19.4s +sqrdmulh v19.4S, v23.4S, v28.s[2] +sub v12.4s, v24.4s, v18.4s +mul v23.4S, v23.4S,v29.s[2] +add v24.4s, v24.4s, v18.4s +mla v2.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v17.4s +sqrdmulh v18.4S, v16.4S, v28.s[1] +add v15.4s, v15.4s, v17.4s +mla v27.4S, v21.4S, v31.s[0] +nop +sqrdmulh v21.4S, v3.4S, v28.s[1] +nop +mla v0.4S, v20.4S, v31.s[0] +nop +sqrdmulh v20.4S, v30.4S, v28.s[1] +nop +mla v23.4S, v19.4S, v31.s[0] +nop +sqrdmulh v19.4S, v1.4S, v28.s[1] +nop +ldr q17, [x17, #+32] +ldr q11, [x17, #+48] +mul v16.4S, v16.4S,v29.s[1] +sub v10.4s, v14.4s, v2.4s +mul v3.4S, v3.4S,v29.s[1] +add v14.4s, v14.4s, v2.4s +mla v16.4S, v18.4S, v31.s[0] +sub v18.4s, v13.4s, v27.4s +mla v3.4S, v21.4S, v31.s[0] +add v13.4s, v13.4s, v27.4s +mul v30.4S, v30.4S,v29.s[1] +sub v27.4s, v12.4s, v0.4s +mul v1.4S, v1.4S,v29.s[1] +add v12.4s, v12.4s, v0.4s +mla v30.4S, v20.4S, v31.s[0] +sub v20.4s, v22.4s, v23.4s +mla v1.4S, v19.4S, v31.s[0] +add v22.4s, v22.4s, v23.4s +sqrdmulh v23.4S, v10.4S, v11.s[3] +nop +mul v10.4S, v10.4S,v17.s[3] +nop +sqrdmulh v19.4S, v18.4S, v11.s[3] +sub v0.4s, v26.4s, v16.4s +mul v18.4S, v18.4S,v17.s[3] +add v26.4s, v26.4s, v16.4s +sqrdmulh v16.4S, v14.4S, v11.s[2] +sub v21.4s, v25.4s, v3.4s +mul v14.4S, v14.4S,v17.s[2] +add v25.4s, v25.4s, v3.4s +sqrdmulh v3.4S, v13.4S, v11.s[2] +sub v2.4s, v24.4s, v30.4s +mul v13.4S, v13.4S,v17.s[2] +add v24.4s, v24.4s, v30.4s +ldr q30, [x17, #+96] +ldr q9, [x17, #+112] +mla v10.4S, v23.4S, v31.s[0] +sub v23.4s, v15.4s, v1.4s +sqrdmulh v8.4S, v0.4S, v11.s[1] +add v15.4s, v15.4s, v1.4s +mla v18.4S, v19.4S, v31.s[0] +nop +sqrdmulh v19.4S, v21.4S, v11.s[1] +nop +mla v14.4S, v16.4S, v31.s[0] +nop +sqrdmulh v16.4S, v26.4S, v11.s[0] 
+nop +mla v13.4S, v3.4S, v31.s[0] +nop +sqrdmulh v3.4S, v25.4S, v11.s[0] +nop +ldr q1, [x17, #+64] +ldr q7, [x17, #+80] +mul v0.4S, v0.4S,v17.s[1] +sub v6.4s, v27.4s, v10.4s +mul v21.4S, v21.4S,v17.s[1] +add v27.4s, v27.4s, v10.4s +mla v0.4S, v8.4S, v31.s[0] +sub v8.4s, v20.4s, v18.4s +mla v21.4S, v19.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +mul v26.4S, v26.4S,v17.s[0] +sub v18.4s, v12.4s, v14.4s +mul v25.4S, v25.4S,v17.s[0] +add v12.4s, v12.4s, v14.4s +mla v26.4S, v16.4S, v31.s[0] +sub v16.4s, v22.4s, v13.4s +mla v25.4S, v3.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v6.4S, v9.s[3] +nop +mul v6.4S, v6.4S,v30.s[3] +nop +sqrdmulh v3.4S, v27.4S, v9.s[2] +sub v14.4s, v2.4s, v0.4s +mul v27.4S, v27.4S,v30.s[2] +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v18.4S, v9.s[1] +sub v19.4s, v23.4s, v21.4s +mul v18.4S, v18.4S,v30.s[1] +add v23.4s, v23.4s, v21.4s +sqrdmulh v21.4S, v12.4S, v9.s[0] +sub v10.4s, v24.4s, v26.4s +mul v12.4S, v12.4S,v30.s[0] +add v24.4s, v24.4s, v26.4s +mla v6.4S, v13.4S, v31.s[0] +sub v13.4s, v15.4s, v25.4s +sqrdmulh v26.4S, v14.4S, v7.s[3] +add v15.4s, v15.4s, v25.4s +mla v27.4S, v3.4S, v31.s[0] +sub v3.4s, v8.4s, v6.4s +sqrdmulh v25.4S, v2.4S, v7.s[2] +add v8.4s, v8.4s, v6.4s +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v20.4s, v27.4s +sqrdmulh v6.4S, v10.4S, v7.s[1] +add v20.4s, v20.4s, v27.4s +mla v12.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v18.4s +sqrdmulh v27.4S, v24.4S, v7.s[0] +add v16.4s, v16.4s, v18.4s +mul v14.4S, v14.4S,v1.s[3] +sub v18.4s, v22.4s, v12.4s +mul v2.4S, v2.4S,v1.s[2] +add v22.4s, v22.4s, v12.4s +mla v14.4S, v26.4S, v31.s[0] +str q3, [x0, #992] +mla v2.4S, v25.4S, v31.s[0] +str q8, [x0, #928] +mul v10.4S, v10.4S,v1.s[1] +str q0, [x0, #864] +mul v24.4S, v24.4S,v1.s[0] +str q20, [x0, #800] +mla v10.4S, v6.4S, v31.s[0] +str q21, [x0, #736] +mla v24.4S, v27.4S, v31.s[0] +str q16, [x0, #672] +ldr q16, [x0, #1008] +sqrdmulh v27.4S, v16.4S, v28.s[0] +str q18, [x0, #608] +mul v16.4S, v16.4S,v29.s[0] +str q22, [x0, #544] 
+ldr q22, [x0, #944] +sqrdmulh v18.4S, v22.4S, v28.s[0] +sub v21.4s, v19.4s, v14.4s +str q21, [x0, #480] +mul v22.4S, v22.4S,v29.s[0] +add v19.4s, v19.4s, v14.4s +ldr q14, [x0, #880] +sqrdmulh v21.4S, v14.4S, v28.s[0] +sub v6.4s, v23.4s, v2.4s +str q19, [x0, #416] +mul v14.4S, v14.4S,v29.s[0] +add v23.4s, v23.4s, v2.4s +ldr q2, [x0, #816] +sqrdmulh v19.4S, v2.4S, v28.s[0] +sub v20.4s, v13.4s, v10.4s +str q6, [x0, #352] +mul v2.4S, v2.4S,v29.s[0] +add v13.4s, v13.4s, v10.4s +ldr q10, [x0, #752] +mla v16.4S, v27.4S, v31.s[0] +sub v27.4s, v15.4s, v24.4s +str q23, [x0, #288] +sqrdmulh v23.4S, v10.4S, v28.s[0] +add v15.4s, v15.4s, v24.4s +ldr q24, [x0, #688] +mla v22.4S, v18.4S, v31.s[0] +str q20, [x0, #224] +sqrdmulh v20.4S, v24.4S, v28.s[0] +nop +ldr q18, [x0, #624] +mla v14.4S, v21.4S, v31.s[0] +str q13, [x0, #160] +sqrdmulh v13.4S, v18.4S, v28.s[0] +nop +ldr q21, [x0, #560] +mla v2.4S, v19.4S, v31.s[0] +str q27, [x0, #96] +sqrdmulh v27.4S, v21.4S, v28.s[0] +nop +ldr q19, [x0, #496] +ldr q6, [x0, #432] +mul v10.4S, v10.4S,v29.s[0] +sub v0.4s, v19.4s, v16.4s +str q15, [x0, #32] +mul v24.4S, v24.4S,v29.s[0] +add v19.4s, v19.4s, v16.4s +ldr q16, [x0, #368] +ldr q15, [x0, #304] +mla v10.4S, v23.4S, v31.s[0] +sub v23.4s, v6.4s, v22.4s +mla v24.4S, v20.4S, v31.s[0] +add v6.4s, v6.4s, v22.4s +ldr q22, [x0, #240] +ldr q20, [x0, #176] +mul v18.4S, v18.4S,v29.s[0] +sub v8.4s, v16.4s, v14.4s +mul v21.4S, v21.4S,v29.s[0] +add v16.4s, v16.4s, v14.4s +ldr q14, [x0, #112] +ldr q25, [x0, #48] +mla v18.4S, v13.4S, v31.s[0] +sub v13.4s, v15.4s, v2.4s +mla v21.4S, v27.4S, v31.s[0] +add v15.4s, v15.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v28.s[2] +nop +mul v0.4S, v0.4S,v29.s[2] +nop +sqrdmulh v27.4S, v23.4S, v28.s[2] +sub v3.4s, v22.4s, v10.4s +mul v23.4S, v23.4S,v29.s[2] +add v22.4s, v22.4s, v10.4s +sqrdmulh v10.4S, v8.4S, v28.s[2] +sub v26.4s, v20.4s, v24.4s +mul v8.4S, v8.4S,v29.s[2] +add v20.4s, v20.4s, v24.4s +sqrdmulh v24.4S, v13.4S, v28.s[2] +sub v12.4s, v14.4s, v18.4s +mul v13.4S, 
v13.4S,v29.s[2] +add v14.4s, v14.4s, v18.4s +mla v0.4S, v2.4S, v31.s[0] +sub v2.4s, v25.4s, v21.4s +sqrdmulh v18.4S, v19.4S, v28.s[1] +add v25.4s, v25.4s, v21.4s +mla v23.4S, v27.4S, v31.s[0] +nop +sqrdmulh v27.4S, v6.4S, v28.s[1] +nop +mla v8.4S, v10.4S, v31.s[0] +nop +sqrdmulh v10.4S, v16.4S, v28.s[1] +nop +mla v13.4S, v24.4S, v31.s[0] +nop +sqrdmulh v24.4S, v15.4S, v28.s[1] +nop +mul v19.4S, v19.4S,v29.s[1] +sub v21.4s, v3.4s, v0.4s +mul v6.4S, v6.4S,v29.s[1] +add v3.4s, v3.4s, v0.4s +mla v19.4S, v18.4S, v31.s[0] +sub v18.4s, v26.4s, v23.4s +mla v6.4S, v27.4S, v31.s[0] +add v26.4s, v26.4s, v23.4s +mul v16.4S, v16.4S,v29.s[1] +sub v23.4s, v12.4s, v8.4s +mul v15.4S, v15.4S,v29.s[1] +add v12.4s, v12.4s, v8.4s +mla v16.4S, v10.4S, v31.s[0] +sub v10.4s, v2.4s, v13.4s +mla v15.4S, v24.4S, v31.s[0] +add v2.4s, v2.4s, v13.4s +sqrdmulh v13.4S, v21.4S, v11.s[3] +nop +mul v21.4S, v21.4S,v17.s[3] +nop +sqrdmulh v24.4S, v18.4S, v11.s[3] +sub v8.4s, v22.4s, v19.4s +mul v18.4S, v18.4S,v17.s[3] +add v22.4s, v22.4s, v19.4s +sqrdmulh v19.4S, v3.4S, v11.s[2] +sub v27.4s, v20.4s, v6.4s +mul v3.4S, v3.4S,v17.s[2] +add v20.4s, v20.4s, v6.4s +sqrdmulh v6.4S, v26.4S, v11.s[2] +sub v0.4s, v14.4s, v16.4s +mul v26.4S, v26.4S,v17.s[2] +add v14.4s, v14.4s, v16.4s +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v25.4s, v15.4s +sqrdmulh v16.4S, v8.4S, v11.s[1] +add v25.4s, v25.4s, v15.4s +mla v18.4S, v24.4S, v31.s[0] +nop +sqrdmulh v24.4S, v27.4S, v11.s[1] +nop +mla v3.4S, v19.4S, v31.s[0] +nop +sqrdmulh v19.4S, v22.4S, v11.s[0] +nop +mla v26.4S, v6.4S, v31.s[0] +nop +sqrdmulh v6.4S, v20.4S, v11.s[0] +nop +mul v8.4S, v8.4S,v17.s[1] +sub v15.4s, v23.4s, v21.4s +mul v27.4S, v27.4S,v17.s[1] +add v23.4s, v23.4s, v21.4s +mla v8.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v18.4s +mla v27.4S, v24.4S, v31.s[0] +add v10.4s, v10.4s, v18.4s +mul v22.4S, v22.4S,v17.s[0] +sub v18.4s, v12.4s, v3.4s +mul v20.4S, v20.4S,v17.s[0] +add v12.4s, v12.4s, v3.4s +mla v22.4S, v19.4S, v31.s[0] +sub v19.4s, v2.4s, v26.4s 
+mla v20.4S, v6.4S, v31.s[0] +add v2.4s, v2.4s, v26.4s +sqrdmulh v26.4S, v15.4S, v9.s[3] +nop +mul v15.4S, v15.4S,v30.s[3] +nop +sqrdmulh v6.4S, v23.4S, v9.s[2] +sub v3.4s, v0.4s, v8.4s +mul v23.4S, v23.4S,v30.s[2] +add v0.4s, v0.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v9.s[1] +sub v24.4s, v13.4s, v27.4s +mul v18.4S, v18.4S,v30.s[1] +add v13.4s, v13.4s, v27.4s +sqrdmulh v27.4S, v12.4S, v9.s[0] +sub v21.4s, v14.4s, v22.4s +mul v12.4S, v12.4S,v30.s[0] +add v14.4s, v14.4s, v22.4s +mla v15.4S, v26.4S, v31.s[0] +sub v26.4s, v25.4s, v20.4s +sqrdmulh v22.4S, v3.4S, v7.s[3] +add v25.4s, v25.4s, v20.4s +mla v23.4S, v6.4S, v31.s[0] +sub v6.4s, v16.4s, v15.4s +sqrdmulh v20.4S, v0.4S, v7.s[2] +add v16.4s, v16.4s, v15.4s +mla v18.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v23.4s +sqrdmulh v15.4S, v21.4S, v7.s[1] +add v10.4s, v10.4s, v23.4s +mla v12.4S, v27.4S, v31.s[0] +sub v27.4s, v19.4s, v18.4s +sqrdmulh v23.4S, v14.4S, v7.s[0] +add v19.4s, v19.4s, v18.4s +mul v3.4S, v3.4S,v1.s[3] +sub v18.4s, v2.4s, v12.4s +mul v0.4S, v0.4S,v1.s[2] +add v2.4s, v2.4s, v12.4s +mla v3.4S, v22.4S, v31.s[0] +str q6, [x0, #1008] +mla v0.4S, v20.4S, v31.s[0] +str q16, [x0, #944] +mul v21.4S, v21.4S,v1.s[1] +str q8, [x0, #880] +mul v14.4S, v14.4S,v1.s[0] +str q10, [x0, #816] +mla v21.4S, v15.4S, v31.s[0] +str q27, [x0, #752] +mla v14.4S, v23.4S, v31.s[0] +str q19, [x0, #688] +ldr q19, [x0, #960] +sqrdmulh v23.4S, v19.4S, v28.s[0] +str q18, [x0, #624] +mul v19.4S, v19.4S,v29.s[0] +str q2, [x0, #560] +ldr q2, [x0, #896] +sqrdmulh v18.4S, v2.4S, v28.s[0] +sub v27.4s, v24.4s, v3.4s +str q27, [x0, #496] +mul v2.4S, v2.4S,v29.s[0] +add v24.4s, v24.4s, v3.4s +ldr q3, [x0, #832] +sqrdmulh v27.4S, v3.4S, v28.s[0] +sub v15.4s, v13.4s, v0.4s +str q24, [x0, #432] +mul v3.4S, v3.4S,v29.s[0] +add v13.4s, v13.4s, v0.4s +ldr q0, [x0, #768] +sqrdmulh v24.4S, v0.4S, v28.s[0] +sub v10.4s, v26.4s, v21.4s +str q15, [x0, #368] +mul v0.4S, v0.4S,v29.s[0] +add v26.4s, v26.4s, v21.4s +ldr q21, [x0, #704] +mla v19.4S, v23.4S, 
v31.s[0] +sub v23.4s, v25.4s, v14.4s +str q13, [x0, #304] +sqrdmulh v13.4S, v21.4S, v28.s[0] +add v25.4s, v25.4s, v14.4s +ldr q14, [x0, #640] +mla v2.4S, v18.4S, v31.s[0] +str q10, [x0, #240] +sqrdmulh v10.4S, v14.4S, v28.s[0] +nop +ldr q18, [x0, #576] +mla v3.4S, v27.4S, v31.s[0] +str q26, [x0, #176] +sqrdmulh v26.4S, v18.4S, v28.s[0] +nop +ldr q27, [x0, #512] +mla v0.4S, v24.4S, v31.s[0] +str q23, [x0, #112] +sqrdmulh v23.4S, v27.4S, v28.s[0] +nop +ldr q24, [x0, #448] +ldr q15, [x0, #384] +mul v21.4S, v21.4S,v29.s[0] +sub v8.4s, v24.4s, v19.4s +str q25, [x0, #48] +mul v14.4S, v14.4S,v29.s[0] +add v24.4s, v24.4s, v19.4s +ldr q19, [x0, #320] +ldr q25, [x0, #256] +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v15.4s, v2.4s +mla v14.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v2.4s +ldr q2, [x0, #192] +ldr q10, [x0, #128] +mul v18.4S, v18.4S,v29.s[0] +sub v16.4s, v19.4s, v3.4s +mul v27.4S, v27.4S,v29.s[0] +add v19.4s, v19.4s, v3.4s +ldr q3, [x0, #64] +ldr q20, [x0, #0] +mla v18.4S, v26.4S, v31.s[0] +sub v26.4s, v25.4s, v0.4s +mla v27.4S, v23.4S, v31.s[0] +add v25.4s, v25.4s, v0.4s +sqrdmulh v0.4S, v8.4S, v28.s[2] +nop +mul v8.4S, v8.4S,v29.s[2] +nop +sqrdmulh v23.4S, v13.4S, v28.s[2] +sub v6.4s, v2.4s, v21.4s +mul v13.4S, v13.4S,v29.s[2] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v16.4S, v28.s[2] +sub v22.4s, v10.4s, v14.4s +mul v16.4S, v16.4S,v29.s[2] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v26.4S, v28.s[2] +sub v12.4s, v3.4s, v18.4s +mul v26.4S, v26.4S,v29.s[2] +add v3.4s, v3.4s, v18.4s +mla v8.4S, v0.4S, v31.s[0] +sub v0.4s, v20.4s, v27.4s +sqrdmulh v18.4S, v24.4S, v28.s[1] +add v20.4s, v20.4s, v27.4s +mla v13.4S, v23.4S, v31.s[0] +nop +sqrdmulh v23.4S, v15.4S, v28.s[1] +nop +mla v16.4S, v21.4S, v31.s[0] +nop +sqrdmulh v21.4S, v19.4S, v28.s[1] +nop +mla v26.4S, v14.4S, v31.s[0] +nop +sqrdmulh v14.4S, v25.4S, v28.s[1] +nop +mul v24.4S, v24.4S,v29.s[1] +sub v27.4s, v6.4s, v8.4s +mul v15.4S, v15.4S,v29.s[1] +add v6.4s, v6.4s, v8.4s +mla v24.4S, v18.4S, v31.s[0] +sub 
v18.4s, v22.4s, v13.4s +mla v15.4S, v23.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +mul v19.4S, v19.4S,v29.s[1] +sub v13.4s, v12.4s, v16.4s +mul v25.4S, v25.4S,v29.s[1] +add v12.4s, v12.4s, v16.4s +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v26.4s +mla v25.4S, v14.4S, v31.s[0] +add v0.4s, v0.4s, v26.4s +sqrdmulh v26.4S, v27.4S, v11.s[3] +nop +mul v27.4S, v27.4S,v17.s[3] +nop +sqrdmulh v14.4S, v18.4S, v11.s[3] +sub v16.4s, v2.4s, v24.4s +mul v18.4S, v18.4S,v17.s[3] +add v2.4s, v2.4s, v24.4s +sqrdmulh v24.4S, v6.4S, v11.s[2] +sub v23.4s, v10.4s, v15.4s +mul v6.4S, v6.4S,v17.s[2] +add v10.4s, v10.4s, v15.4s +sqrdmulh v15.4S, v22.4S, v11.s[2] +sub v8.4s, v3.4s, v19.4s +mul v22.4S, v22.4S,v17.s[2] +add v3.4s, v3.4s, v19.4s +mla v27.4S, v26.4S, v31.s[0] +sub v26.4s, v20.4s, v25.4s +sqrdmulh v19.4S, v16.4S, v11.s[1] +add v20.4s, v20.4s, v25.4s +mla v18.4S, v14.4S, v31.s[0] +nop +sqrdmulh v14.4S, v23.4S, v11.s[1] +nop +mla v6.4S, v24.4S, v31.s[0] +nop +sqrdmulh v24.4S, v2.4S, v11.s[0] +nop +mla v22.4S, v15.4S, v31.s[0] +nop +sqrdmulh v15.4S, v10.4S, v11.s[0] +nop +mul v16.4S, v16.4S,v17.s[1] +sub v25.4s, v13.4s, v27.4s +mul v23.4S, v23.4S,v17.s[1] +add v13.4s, v13.4s, v27.4s +mla v16.4S, v19.4S, v31.s[0] +sub v19.4s, v21.4s, v18.4s +mla v23.4S, v14.4S, v31.s[0] +add v21.4s, v21.4s, v18.4s +mul v2.4S, v2.4S,v17.s[0] +sub v18.4s, v12.4s, v6.4s +mul v10.4S, v10.4S,v17.s[0] +add v12.4s, v12.4s, v6.4s +mla v2.4S, v24.4S, v31.s[0] +sub v24.4s, v0.4s, v22.4s +mla v10.4S, v15.4S, v31.s[0] +add v0.4s, v0.4s, v22.4s +sqrdmulh v22.4S, v25.4S, v9.s[3] +nop +mul v25.4S, v25.4S,v30.s[3] +nop +sqrdmulh v15.4S, v13.4S, v9.s[2] +sub v6.4s, v8.4s, v16.4s +mul v13.4S, v13.4S,v30.s[2] +add v8.4s, v8.4s, v16.4s +sqrdmulh v16.4S, v18.4S, v9.s[1] +sub v14.4s, v26.4s, v23.4s +mul v18.4S, v18.4S,v30.s[1] +add v26.4s, v26.4s, v23.4s +sqrdmulh v23.4S, v12.4S, v9.s[0] +sub v27.4s, v3.4s, v2.4s +mul v12.4S, v12.4S,v30.s[0] +add v3.4s, v3.4s, v2.4s +mla v25.4S, v22.4S, v31.s[0] +sub v22.4s, v20.4s, 
v10.4s +sqrdmulh v2.4S, v6.4S, v7.s[3] +add v20.4s, v20.4s, v10.4s +mla v13.4S, v15.4S, v31.s[0] +sub v15.4s, v19.4s, v25.4s +sqrdmulh v10.4S, v8.4S, v7.s[2] +add v19.4s, v19.4s, v25.4s +mla v18.4S, v16.4S, v31.s[0] +sub v16.4s, v21.4s, v13.4s +sqrdmulh v25.4S, v27.4S, v7.s[1] +add v21.4s, v21.4s, v13.4s +mla v12.4S, v23.4S, v31.s[0] +sub v23.4s, v24.4s, v18.4s +sqrdmulh v13.4S, v3.4S, v7.s[0] +add v24.4s, v24.4s, v18.4s +mul v6.4S, v6.4S,v1.s[3] +sub v18.4s, v0.4s, v12.4s +mul v8.4S, v8.4S,v1.s[2] +add v0.4s, v0.4s, v12.4s +mla v6.4S, v2.4S, v31.s[0] +str q15, [x0, #960] +mla v8.4S, v10.4S, v31.s[0] +str q19, [x0, #896] +mul v27.4S, v27.4S,v1.s[1] +str q16, [x0, #832] +mul v3.4S, v3.4S,v1.s[0] +str q21, [x0, #768] +mla v27.4S, v25.4S, v31.s[0] +str q23, [x0, #704] +mla v3.4S, v13.4S, v31.s[0] +str q24, [x0, #640] +ldr q24, [x0, #976] +sqrdmulh v13.4S, v24.4S, v28.s[0] +str q18, [x0, #576] +mul v24.4S, v24.4S,v29.s[0] +str q0, [x0, #512] +ldr q0, [x0, #912] +sqrdmulh v18.4S, v0.4S, v28.s[0] +sub v23.4s, v14.4s, v6.4s +str q23, [x0, #448] +mul v0.4S, v0.4S,v29.s[0] +add v14.4s, v14.4s, v6.4s +ldr q6, [x0, #848] +sqrdmulh v23.4S, v6.4S, v28.s[0] +sub v25.4s, v26.4s, v8.4s +str q14, [x0, #384] +mul v6.4S, v6.4S,v29.s[0] +add v26.4s, v26.4s, v8.4s +ldr q8, [x0, #784] +sqrdmulh v14.4S, v8.4S, v28.s[0] +sub v21.4s, v22.4s, v27.4s +str q25, [x0, #320] +mul v8.4S, v8.4S,v29.s[0] +add v22.4s, v22.4s, v27.4s +ldr q27, [x0, #720] +mla v24.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v3.4s +str q26, [x0, #256] +sqrdmulh v26.4S, v27.4S, v28.s[0] +add v20.4s, v20.4s, v3.4s +ldr q3, [x0, #656] +mla v0.4S, v18.4S, v31.s[0] +str q21, [x0, #192] +sqrdmulh v21.4S, v3.4S, v28.s[0] +nop +ldr q18, [x0, #592] +mla v6.4S, v23.4S, v31.s[0] +str q22, [x0, #128] +sqrdmulh v22.4S, v18.4S, v28.s[0] +nop +ldr q23, [x0, #528] +mla v8.4S, v14.4S, v31.s[0] +str q13, [x0, #64] +sqrdmulh v13.4S, v23.4S, v28.s[0] +nop +ldr q14, [x0, #464] +ldr q25, [x0, #400] +mul v27.4S, v27.4S,v29.s[0] +sub v16.4s, 
v14.4s, v24.4s +str q20, [x0, #0] +mul v3.4S, v3.4S,v29.s[0] +add v14.4s, v14.4s, v24.4s +ldr q24, [x0, #336] +ldr q20, [x0, #272] +mla v27.4S, v26.4S, v31.s[0] +sub v26.4s, v25.4s, v0.4s +mla v3.4S, v21.4S, v31.s[0] +add v25.4s, v25.4s, v0.4s +ldr q0, [x0, #208] +ldr q21, [x0, #144] +mul v18.4S, v18.4S,v29.s[0] +sub v19.4s, v24.4s, v6.4s +mul v23.4S, v23.4S,v29.s[0] +add v24.4s, v24.4s, v6.4s +ldr q6, [x0, #80] +ldr q10, [x0, #16] +mla v18.4S, v22.4S, v31.s[0] +sub v22.4s, v20.4s, v8.4s +mla v23.4S, v13.4S, v31.s[0] +add v20.4s, v20.4s, v8.4s +sqrdmulh v8.4S, v16.4S, v28.s[2] +nop +mul v16.4S, v16.4S,v29.s[2] +nop +sqrdmulh v13.4S, v26.4S, v28.s[2] +sub v15.4s, v0.4s, v27.4s +mul v26.4S, v26.4S,v29.s[2] +add v0.4s, v0.4s, v27.4s +sqrdmulh v27.4S, v19.4S, v28.s[2] +sub v2.4s, v21.4s, v3.4s +mul v19.4S, v19.4S,v29.s[2] +add v21.4s, v21.4s, v3.4s +sqrdmulh v3.4S, v22.4S, v28.s[2] +sub v12.4s, v6.4s, v18.4s +mul v22.4S, v22.4S,v29.s[2] +add v6.4s, v6.4s, v18.4s +mla v16.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v23.4s +sqrdmulh v18.4S, v14.4S, v28.s[1] +add v10.4s, v10.4s, v23.4s +mla v26.4S, v13.4S, v31.s[0] +nop +sqrdmulh v13.4S, v25.4S, v28.s[1] +nop +mla v19.4S, v27.4S, v31.s[0] +nop +sqrdmulh v27.4S, v24.4S, v28.s[1] +nop +mla v22.4S, v3.4S, v31.s[0] +nop +sqrdmulh v3.4S, v20.4S, v28.s[1] +nop +mul v14.4S, v14.4S,v29.s[1] +sub v23.4s, v15.4s, v16.4s +mul v25.4S, v25.4S,v29.s[1] +add v15.4s, v15.4s, v16.4s +mla v14.4S, v18.4S, v31.s[0] +sub v18.4s, v2.4s, v26.4s +mla v25.4S, v13.4S, v31.s[0] +add v2.4s, v2.4s, v26.4s +mul v24.4S, v24.4S,v29.s[1] +sub v26.4s, v12.4s, v19.4s +mul v20.4S, v20.4S,v29.s[1] +add v12.4s, v12.4s, v19.4s +mla v24.4S, v27.4S, v31.s[0] +sub v27.4s, v8.4s, v22.4s +mla v20.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v22.4s +sqrdmulh v28.4S, v23.4S, v11.s[3] +nop +mul v23.4S, v23.4S,v17.s[3] +nop +sqrdmulh v29.4S, v18.4S, v11.s[3] +sub v22.4s, v0.4s, v14.4s +mul v18.4S, v18.4S,v17.s[3] +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v11.s[2] +sub 
v3.4s, v21.4s, v25.4s +mul v15.4S, v15.4S,v17.s[2] +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v2.4S, v11.s[2] +sub v19.4s, v6.4s, v24.4s +mul v2.4S, v2.4S,v17.s[2] +add v6.4s, v6.4s, v24.4s +mla v23.4S, v28.4S, v31.s[0] +sub v28.4s, v10.4s, v20.4s +sqrdmulh v24.4S, v22.4S, v11.s[1] +add v10.4s, v10.4s, v20.4s +mla v18.4S, v29.4S, v31.s[0] +nop +sqrdmulh v29.4S, v3.4S, v11.s[1] +nop +mla v15.4S, v14.4S, v31.s[0] +nop +sqrdmulh v14.4S, v0.4S, v11.s[0] +nop +mla v2.4S, v25.4S, v31.s[0] +nop +sqrdmulh v25.4S, v21.4S, v11.s[0] +nop +mul v22.4S, v22.4S,v17.s[1] +sub v20.4s, v26.4s, v23.4s +mul v3.4S, v3.4S,v17.s[1] +add v26.4s, v26.4s, v23.4s +mla v22.4S, v24.4S, v31.s[0] +sub v24.4s, v27.4s, v18.4s +mla v3.4S, v29.4S, v31.s[0] +add v27.4s, v27.4s, v18.4s +mul v0.4S, v0.4S,v17.s[0] +sub v18.4s, v12.4s, v15.4s +mul v21.4S, v21.4S,v17.s[0] +add v12.4s, v12.4s, v15.4s +mla v0.4S, v14.4S, v31.s[0] +sub v14.4s, v8.4s, v2.4s +mla v21.4S, v25.4S, v31.s[0] +add v8.4s, v8.4s, v2.4s +sqrdmulh v11.4S, v20.4S, v9.s[3] +nop +mul v20.4S, v20.4S,v30.s[3] +nop +sqrdmulh v17.4S, v26.4S, v9.s[2] +sub v2.4s, v19.4s, v22.4s +mul v26.4S, v26.4S,v30.s[2] +add v19.4s, v19.4s, v22.4s +sqrdmulh v22.4S, v18.4S, v9.s[1] +sub v25.4s, v28.4s, v3.4s +mul v18.4S, v18.4S,v30.s[1] +add v28.4s, v28.4s, v3.4s +sqrdmulh v3.4S, v12.4S, v9.s[0] +sub v15.4s, v6.4s, v0.4s +mul v12.4S, v12.4S,v30.s[0] +add v6.4s, v6.4s, v0.4s +mla v20.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v21.4s +sqrdmulh v9.4S, v2.4S, v7.s[3] +add v10.4s, v10.4s, v21.4s +mla v26.4S, v17.4S, v31.s[0] +sub v17.4s, v24.4s, v20.4s +sqrdmulh v21.4S, v19.4S, v7.s[2] +add v24.4s, v24.4s, v20.4s +mla v18.4S, v22.4S, v31.s[0] +sub v22.4s, v27.4s, v26.4s +sqrdmulh v20.4S, v15.4S, v7.s[1] +add v27.4s, v27.4s, v26.4s +mla v12.4S, v3.4S, v31.s[0] +sub v3.4s, v14.4s, v18.4s +sqrdmulh v26.4S, v6.4S, v7.s[0] +add v14.4s, v14.4s, v18.4s +mul v2.4S, v2.4S,v1.s[3] +sub v18.4s, v8.4s, v12.4s +mul v19.4S, v19.4S,v1.s[2] +add v8.4s, v8.4s, v12.4s +mla v2.4S, 
v9.4S, v31.s[0] +str q17, [x0, #976] +mla v19.4S, v21.4S, v31.s[0] +str q24, [x0, #912] +mul v15.4S, v15.4S,v1.s[1] +str q22, [x0, #848] +mul v6.4S, v6.4S,v1.s[0] +str q27, [x0, #784] +mla v15.4S, v20.4S, v31.s[0] +str q3, [x0, #720] +mla v6.4S, v26.4S, v31.s[0] +str q14, [x0, #656] +str q18, [x0, #592] +str q8, [x0, #528] +sub v8.4s, v25.4s, v2.4s +str q8, [x0, #464] +add v25.4s, v25.4s, v2.4s +sub v2.4s, v28.4s, v19.4s +str q25, [x0, #400] +add v28.4s, v28.4s, v19.4s +sub v19.4s, v11.4s, v15.4s +str q2, [x0, #336] +add v11.4s, v11.4s, v15.4s +sub v15.4s, v10.4s, v6.4s +str q28, [x0, #272] +add v10.4s, v10.4s, v6.4s +str q19, [x0, #208] +str q11, [x0, #144] +str q15, [x0, #80] +str q10, [x0, #16] +ldr q4, [x0, #224] +ldr q5, [x0, #160] +ldr q16, [x0, #32] +ldr q13, [x17, #+128] +ldr q23, [x17, #+144] +sqrdmulh v29.4S, v16.4S, v23.s[0] +mul v16.4S, v16.4S,v13.s[0] +ldr q0, [x0, #48] +sqrdmulh v30.4S, v0.4S, v23.s[0] +mul v0.4S, v0.4S,v13.s[0] +ldr q12, [x17, #+160] +ldr q9, [x17, #+176] +ldr q17, [x0, #96] +sqrdmulh v21.4S, v17.4S, v9.s[0] +mul v17.4S, v17.4S,v12.s[0] +ldr q24, [x0, #112] +sqrdmulh v22.4S, v24.4S, v9.s[0] +mul v24.4S, v24.4S,v12.s[0] +ldr q27, [x17, #+192] +ldr q20, [x17, #+208] +mla v16.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v5.4S, v20.s[0] +ldr q3, [x0, #176] +mla v0.4S, v30.4S, v31.s[0] +sqrdmulh v30.4S, v3.4S, v20.s[0] +ldr q26, [x17, #+224] +ldr q14, [x17, #+240] +mla v17.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v4.4S, v14.s[0] +ldr q1, [x0, #240] +mla v24.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v1.4S, v14.s[0] +ldr q7, [x0, #0] +ldr q18, [x0, #128] +mul v5.4S, v5.4S,v27.s[0] +sub v8.4s, v7.4s, v16.4s +ldr q25, [x0, #16] +mul v3.4S, v3.4S,v27.s[0] +add v7.4s, v7.4s, v16.4s +ldr q16, [x0, #144] +mla v5.4S, v29.4S, v31.s[0] +sub v29.4s, v25.4s, v0.4s +ldr q2, [x0, #64] +mla v3.4S, v30.4S, v31.s[0] +add v25.4s, v25.4s, v0.4s +ldr q0, [x0, #192] +mul v4.4S, v4.4S,v26.s[0] +sub v30.4s, v2.4s, v17.4s +ldr q28, [x0, #80] +mul v1.4S, v1.4S,v26.s[0] +add 
v2.4s, v2.4s, v17.4s +ldr q17, [x0, #208] +mla v4.4S, v21.4S, v31.s[0] +mla v1.4S, v22.4S, v31.s[0] +sub v22.4s, v28.4s, v24.4s +sqrdmulh v21.4S, v25.4S, v23.s[1] +add v28.4s, v28.4s, v24.4s +mul v25.4S, v25.4S,v13.s[1] +sqrdmulh v24.4S, v29.4S, v23.s[2] +sub v6.4s, v18.4s, v5.4s +mul v29.4S, v29.4S,v13.s[2] +add v18.4s, v18.4s, v5.4s +sqrdmulh v23.4S, v28.4S, v9.s[1] +sub v13.4s, v16.4s, v3.4s +mul v28.4S, v28.4S,v12.s[1] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v22.4S, v9.s[2] +sub v5.4s, v0.4s, v4.4s +mul v22.4S, v22.4S,v12.s[2] +add v0.4s, v0.4s, v4.4s +mla v25.4S, v21.4S, v31.s[0] +sub v21.4s, v17.4s, v1.4s +ldr q9, [x0, #480] +sqrdmulh v12.4S, v16.4S, v20.s[1] +add v17.4s, v17.4s, v1.4s +mla v29.4S, v24.4S, v31.s[0] +ldr q24, [x0, #416] +sqrdmulh v1.4S, v13.4S, v20.s[2] +sub v4.4s, v7.4s, v25.4s +mla v28.4S, v23.4S, v31.s[0] +ldr q23, [x0, #288] +sqrdmulh v19.4S, v17.4S, v14.s[1] +add v7.4s, v7.4s, v25.4s +str q4, [x0, #16] +mla v22.4S, v3.4S, v31.s[0] +ldr q3, [x17, #+256] +ldr q4, [x17, #+272] +sqrdmulh v25.4S, v21.4S, v14.s[2] +sub v11.4s, v8.4s, v29.4s +str q7, [x0, #0] +mul v16.4S, v16.4S,v27.s[1] +add v8.4s, v8.4s, v29.4s +mul v13.4S, v13.4S,v27.s[2] +str q11, [x0, #48] +mla v16.4S, v12.4S, v31.s[0] +sub v12.4s, v2.4s, v28.4s +mla v13.4S, v1.4S, v31.s[0] +str q8, [x0, #32] +mul v17.4S, v17.4S,v26.s[1] +str q12, [x0, #80] +mul v21.4S, v21.4S,v26.s[2] +add v2.4s, v2.4s, v28.4s +str q2, [x0, #64] +mla v17.4S, v19.4S, v31.s[0] +sub v19.4s, v30.4s, v22.4s +str q19, [x0, #112] +mla v21.4S, v25.4S, v31.s[0] +add v30.4s, v30.4s, v22.4s +str q30, [x0, #96] +sqrdmulh v14.4S, v23.4S, v4.s[0] +sub v26.4s, v18.4s, v16.4s +mul v23.4S, v23.4S,v3.s[0] +str q26, [x0, #144] +ldr q26, [x0, #304] +sqrdmulh v30.4S, v26.4S, v4.s[0] +add v18.4s, v18.4s, v16.4s +mul v26.4S, v26.4S,v3.s[0] +str q18, [x0, #128] +ldr q18, [x17, #+288] +ldr q16, [x17, #+304] +ldr q22, [x0, #352] +sqrdmulh v25.4S, v22.4S, v16.s[0] +sub v19.4s, v6.4s, v13.4s +mul v22.4S, v22.4S,v18.s[0] +str q19, 
[x0, #176] +ldr q19, [x0, #368] +sqrdmulh v2.4S, v19.4S, v16.s[0] +add v6.4s, v6.4s, v13.4s +mul v19.4S, v19.4S,v18.s[0] +str q6, [x0, #160] +ldr q6, [x17, #+320] +ldr q13, [x17, #+336] +mla v23.4S, v14.4S, v31.s[0] +sub v14.4s, v0.4s, v17.4s +sqrdmulh v28.4S, v24.4S, v13.s[0] +str q14, [x0, #208] +ldr q14, [x0, #432] +mla v26.4S, v30.4S, v31.s[0] +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v13.s[0] +str q0, [x0, #192] +ldr q0, [x17, #+352] +ldr q30, [x17, #+368] +mla v22.4S, v25.4S, v31.s[0] +sub v25.4s, v5.4s, v21.4s +sqrdmulh v12.4S, v9.4S, v30.s[0] +str q25, [x0, #240] +ldr q25, [x0, #496] +mla v19.4S, v2.4S, v31.s[0] +add v5.4s, v5.4s, v21.4s +sqrdmulh v21.4S, v25.4S, v30.s[0] +str q5, [x0, #224] +ldr q5, [x0, #256] +ldr q2, [x0, #384] +mul v24.4S, v24.4S,v6.s[0] +sub v20.4s, v5.4s, v23.4s +ldr q27, [x0, #272] +mul v14.4S, v14.4S,v6.s[0] +add v5.4s, v5.4s, v23.4s +ldr q23, [x0, #400] +mla v24.4S, v28.4S, v31.s[0] +sub v28.4s, v27.4s, v26.4s +ldr q8, [x0, #320] +mla v14.4S, v17.4S, v31.s[0] +add v27.4s, v27.4s, v26.4s +ldr q26, [x0, #448] +mul v9.4S, v9.4S,v0.s[0] +sub v17.4s, v8.4s, v22.4s +ldr q1, [x0, #336] +mul v25.4S, v25.4S,v0.s[0] +add v8.4s, v8.4s, v22.4s +ldr q22, [x0, #464] +mla v9.4S, v12.4S, v31.s[0] +mla v25.4S, v21.4S, v31.s[0] +sub v21.4s, v1.4s, v19.4s +sqrdmulh v12.4S, v27.4S, v4.s[1] +add v1.4s, v1.4s, v19.4s +mul v27.4S, v27.4S,v3.s[1] +sqrdmulh v19.4S, v28.4S, v4.s[2] +sub v11.4s, v2.4s, v24.4s +mul v28.4S, v28.4S,v3.s[2] +add v2.4s, v2.4s, v24.4s +sqrdmulh v4.4S, v1.4S, v16.s[1] +sub v3.4s, v23.4s, v14.4s +mul v1.4S, v1.4S,v18.s[1] +add v23.4s, v23.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v16.s[2] +sub v24.4s, v26.4s, v9.4s +mul v21.4S, v21.4S,v18.s[2] +add v26.4s, v26.4s, v9.4s +mla v27.4S, v12.4S, v31.s[0] +sub v12.4s, v22.4s, v25.4s +ldr q16, [x0, #736] +sqrdmulh v18.4S, v23.4S, v13.s[1] +add v22.4s, v22.4s, v25.4s +mla v28.4S, v19.4S, v31.s[0] +ldr q19, [x0, #672] +sqrdmulh v25.4S, v3.4S, v13.s[2] +sub v9.4s, v5.4s, v27.4s +mla 
v1.4S, v4.4S, v31.s[0] +ldr q4, [x0, #544] +sqrdmulh v29.4S, v22.4S, v30.s[1] +add v5.4s, v5.4s, v27.4s +str q9, [x0, #272] +mla v21.4S, v14.4S, v31.s[0] +ldr q14, [x17, #+384] +ldr q9, [x17, #+400] +sqrdmulh v27.4S, v12.4S, v30.s[2] +sub v7.4s, v20.4s, v28.4s +str q5, [x0, #256] +mul v23.4S, v23.4S,v6.s[1] +add v20.4s, v20.4s, v28.4s +mul v3.4S, v3.4S,v6.s[2] +str q7, [x0, #304] +mla v23.4S, v18.4S, v31.s[0] +sub v18.4s, v8.4s, v1.4s +mla v3.4S, v25.4S, v31.s[0] +str q20, [x0, #288] +mul v22.4S, v22.4S,v0.s[1] +str q18, [x0, #336] +mul v12.4S, v12.4S,v0.s[2] +add v8.4s, v8.4s, v1.4s +str q8, [x0, #320] +mla v22.4S, v29.4S, v31.s[0] +sub v29.4s, v17.4s, v21.4s +str q29, [x0, #368] +mla v12.4S, v27.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +str q17, [x0, #352] +sqrdmulh v30.4S, v4.4S, v9.s[0] +sub v0.4s, v2.4s, v23.4s +mul v4.4S, v4.4S,v14.s[0] +str q0, [x0, #400] +ldr q0, [x0, #560] +sqrdmulh v17.4S, v0.4S, v9.s[0] +add v2.4s, v2.4s, v23.4s +mul v0.4S, v0.4S,v14.s[0] +str q2, [x0, #384] +ldr q2, [x17, #+416] +ldr q23, [x17, #+432] +ldr q21, [x0, #608] +sqrdmulh v27.4S, v21.4S, v23.s[0] +sub v29.4s, v11.4s, v3.4s +mul v21.4S, v21.4S,v2.s[0] +str q29, [x0, #432] +ldr q29, [x0, #624] +sqrdmulh v8.4S, v29.4S, v23.s[0] +add v11.4s, v11.4s, v3.4s +mul v29.4S, v29.4S,v2.s[0] +str q11, [x0, #416] +ldr q11, [x17, #+448] +ldr q3, [x17, #+464] +mla v4.4S, v30.4S, v31.s[0] +sub v30.4s, v26.4s, v22.4s +sqrdmulh v1.4S, v19.4S, v3.s[0] +str q30, [x0, #464] +ldr q30, [x0, #688] +mla v0.4S, v17.4S, v31.s[0] +add v26.4s, v26.4s, v22.4s +sqrdmulh v22.4S, v30.4S, v3.s[0] +str q26, [x0, #448] +ldr q26, [x17, #+480] +ldr q17, [x17, #+496] +mla v21.4S, v27.4S, v31.s[0] +sub v27.4s, v24.4s, v12.4s +sqrdmulh v18.4S, v16.4S, v17.s[0] +str q27, [x0, #496] +ldr q27, [x0, #752] +mla v29.4S, v8.4S, v31.s[0] +add v24.4s, v24.4s, v12.4s +sqrdmulh v12.4S, v27.4S, v17.s[0] +str q24, [x0, #480] +ldr q24, [x0, #512] +ldr q8, [x0, #640] +mul v19.4S, v19.4S,v11.s[0] +sub v13.4s, v24.4s, v4.4s +ldr q6, 
[x0, #528] +mul v30.4S, v30.4S,v11.s[0] +add v24.4s, v24.4s, v4.4s +ldr q4, [x0, #656] +mla v19.4S, v1.4S, v31.s[0] +sub v1.4s, v6.4s, v0.4s +ldr q20, [x0, #576] +mla v30.4S, v22.4S, v31.s[0] +add v6.4s, v6.4s, v0.4s +ldr q0, [x0, #704] +mul v16.4S, v16.4S,v26.s[0] +sub v22.4s, v20.4s, v21.4s +ldr q25, [x0, #592] +mul v27.4S, v27.4S,v26.s[0] +add v20.4s, v20.4s, v21.4s +ldr q21, [x0, #720] +mla v16.4S, v18.4S, v31.s[0] +mla v27.4S, v12.4S, v31.s[0] +sub v12.4s, v25.4s, v29.4s +sqrdmulh v18.4S, v6.4S, v9.s[1] +add v25.4s, v25.4s, v29.4s +mul v6.4S, v6.4S,v14.s[1] +sqrdmulh v29.4S, v1.4S, v9.s[2] +sub v7.4s, v8.4s, v19.4s +mul v1.4S, v1.4S,v14.s[2] +add v8.4s, v8.4s, v19.4s +sqrdmulh v9.4S, v25.4S, v23.s[1] +sub v14.4s, v4.4s, v30.4s +mul v25.4S, v25.4S,v2.s[1] +add v4.4s, v4.4s, v30.4s +sqrdmulh v30.4S, v12.4S, v23.s[2] +sub v19.4s, v0.4s, v16.4s +mul v12.4S, v12.4S,v2.s[2] +add v0.4s, v0.4s, v16.4s +mla v6.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v27.4s +ldr q23, [x0, #992] +sqrdmulh v2.4S, v4.4S, v3.s[1] +add v21.4s, v21.4s, v27.4s +mla v1.4S, v29.4S, v31.s[0] +ldr q29, [x0, #928] +sqrdmulh v27.4S, v14.4S, v3.s[2] +sub v16.4s, v24.4s, v6.4s +mla v25.4S, v9.4S, v31.s[0] +ldr q9, [x0, #800] +sqrdmulh v28.4S, v21.4S, v17.s[1] +add v24.4s, v24.4s, v6.4s +str q16, [x0, #528] +mla v12.4S, v30.4S, v31.s[0] +ldr q30, [x17, #+512] +ldr q16, [x17, #+528] +sqrdmulh v6.4S, v18.4S, v17.s[2] +sub v5.4s, v13.4s, v1.4s +str q24, [x0, #512] +mul v4.4S, v4.4S,v11.s[1] +add v13.4s, v13.4s, v1.4s +mul v14.4S, v14.4S,v11.s[2] +str q5, [x0, #560] +mla v4.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v25.4s +mla v14.4S, v27.4S, v31.s[0] +str q13, [x0, #544] +mul v21.4S, v21.4S,v26.s[1] +str q2, [x0, #592] +mul v18.4S, v18.4S,v26.s[2] +add v20.4s, v20.4s, v25.4s +str q20, [x0, #576] +mla v21.4S, v28.4S, v31.s[0] +sub v28.4s, v22.4s, v12.4s +str q28, [x0, #624] +mla v18.4S, v6.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +str q22, [x0, #608] +sqrdmulh v17.4S, v9.4S, v16.s[0] +sub v26.4s, v8.4s, 
v4.4s +mul v9.4S, v9.4S,v30.s[0] +str q26, [x0, #656] +ldr q26, [x0, #816] +sqrdmulh v22.4S, v26.4S, v16.s[0] +add v8.4s, v8.4s, v4.4s +mul v26.4S, v26.4S,v30.s[0] +str q8, [x0, #640] +ldr q8, [x17, #+544] +ldr q4, [x17, #+560] +ldr q12, [x0, #864] +sqrdmulh v6.4S, v12.4S, v4.s[0] +sub v28.4s, v7.4s, v14.4s +mul v12.4S, v12.4S,v8.s[0] +str q28, [x0, #688] +ldr q28, [x0, #880] +sqrdmulh v20.4S, v28.4S, v4.s[0] +add v7.4s, v7.4s, v14.4s +mul v28.4S, v28.4S,v8.s[0] +str q7, [x0, #672] +ldr q7, [x17, #+576] +ldr q14, [x17, #+592] +mla v9.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v21.4s +sqrdmulh v25.4S, v29.4S, v14.s[0] +str q17, [x0, #720] +ldr q17, [x0, #944] +mla v26.4S, v22.4S, v31.s[0] +add v0.4s, v0.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v14.s[0] +str q0, [x0, #704] +ldr q0, [x17, #+608] +ldr q22, [x17, #+624] +mla v12.4S, v6.4S, v31.s[0] +sub v6.4s, v19.4s, v18.4s +sqrdmulh v2.4S, v23.4S, v22.s[0] +str q6, [x0, #752] +ldr q6, [x0, #1008] +mla v28.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v18.4s +sqrdmulh v18.4S, v6.4S, v22.s[0] +str q19, [x0, #736] +ldr q19, [x0, #768] +ldr q20, [x0, #896] +mul v29.4S, v29.4S,v7.s[0] +sub v3.4s, v19.4s, v9.4s +ldr q11, [x0, #784] +mul v17.4S, v17.4S,v7.s[0] +add v19.4s, v19.4s, v9.4s +ldr q9, [x0, #912] +mla v29.4S, v25.4S, v31.s[0] +sub v25.4s, v11.4s, v26.4s +ldr q13, [x0, #832] +mla v17.4S, v21.4S, v31.s[0] +add v11.4s, v11.4s, v26.4s +ldr q26, [x0, #960] +mul v23.4S, v23.4S,v0.s[0] +sub v21.4s, v13.4s, v12.4s +ldr q27, [x0, #848] +mul v6.4S, v6.4S,v0.s[0] +add v13.4s, v13.4s, v12.4s +ldr q12, [x0, #976] +mla v23.4S, v2.4S, v31.s[0] +mla v6.4S, v18.4S, v31.s[0] +sub v18.4s, v27.4s, v28.4s +sqrdmulh v2.4S, v11.4S, v16.s[1] +add v27.4s, v27.4s, v28.4s +mul v11.4S, v11.4S,v30.s[1] +sqrdmulh v28.4S, v25.4S, v16.s[2] +sub v5.4s, v20.4s, v29.4s +mul v25.4S, v25.4S,v30.s[2] +add v20.4s, v20.4s, v29.4s +sqrdmulh v16.4S, v27.4S, v4.s[1] +sub v30.4s, v9.4s, v17.4s +mul v27.4S, v27.4S,v8.s[1] +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, 
v18.4S, v4.s[2] +sub v29.4s, v26.4s, v23.4s +mul v18.4S, v18.4S,v8.s[2] +add v26.4s, v26.4s, v23.4s +mla v11.4S, v2.4S, v31.s[0] +sub v2.4s, v12.4s, v6.4s +sqrdmulh v4.4S, v9.4S, v14.s[1] +add v12.4s, v12.4s, v6.4s +mla v25.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v30.4S, v14.s[2] +sub v6.4s, v19.4s, v11.4s +mla v27.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v12.4S, v22.s[1] +add v19.4s, v19.4s, v11.4s +str q6, [x0, #784] +mla v18.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v2.4S, v22.s[2] +sub v6.4s, v3.4s, v25.4s +str q19, [x0, #768] +mul v9.4S, v9.4S,v7.s[1] +add v3.4s, v3.4s, v25.4s +mul v30.4S, v30.4S,v7.s[2] +str q6, [x0, #816] +mla v9.4S, v4.4S, v31.s[0] +sub v4.4s, v13.4s, v27.4s +mla v30.4S, v28.4S, v31.s[0] +str q3, [x0, #800] +mul v12.4S, v12.4S,v0.s[1] +str q4, [x0, #848] +mul v2.4S, v2.4S,v0.s[2] +add v13.4s, v13.4s, v27.4s +str q13, [x0, #832] +mla v12.4S, v16.4S, v31.s[0] +sub v16.4s, v21.4s, v18.4s +str q16, [x0, #880] +mla v2.4S, v17.4S, v31.s[0] +add v21.4s, v21.4s, v18.4s +str q21, [x0, #864] +sub v22.4s, v20.4s, v9.4s +str q22, [x0, #912] +add v20.4s, v20.4s, v9.4s +str q20, [x0, #896] +sub v20.4s, v5.4s, v30.4s +str q20, [x0, #944] +add v5.4s, v5.4s, v30.4s +str q5, [x0, #928] +sub v5.4s, v26.4s, v12.4s +str q5, [x0, #976] +add v26.4s, v26.4s, v12.4s +str q26, [x0, #960] +sub v26.4s, v29.4s, v2.4s +str q26, [x0, #1008] +add v29.4s, v29.4s, v2.4s +str q29, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1548 +// Instruction count: 1544 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_16_z4_7.s 
b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_16_z4_7.s new file mode 100644 index 0000000..d7db1d0 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_16_z4_7.s @@ -0,0 +1,1578 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// +#include +modulus: +.word -33556993 // negated modulus -q (q = 33556993); loaded into v31, lane 0 is the mla multiplier in the routine below +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: // 4-word groups in pairs: mul multiplicands, then the matching sqrdmulh operands (numerically about root * 2^31 / q -- TODO confirm against the generator) +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Padding (no layer/block assigned) +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Padding (no layer/block assigned) +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Padding (no layer/block assigned) +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Padding (no layer/block assigned) +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Padding (no layer/block assigned) +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Padding (no layer/block assigned) +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Padding (no layer/block assigned) +.word 
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Padding (no layer/block assigned) +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Padding (no layer/block assigned) +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Padding (no layer/block assigned) +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Padding (no layer/block assigned) +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Padding (no layer/block assigned) +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Padding (no layer/block assigned) +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Padding (no layer/block assigned) +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Padding (no layer/block assigned) +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Padding (no layer/block assigned) +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Padding (no layer/block assigned) +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Padding (no layer/block assigned) +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Padding (no layer/block assigned) +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Padding (no layer/block assigned) +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19
+.word 0 // Padding (no layer/block assigned) +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Padding (no layer/block assigned) +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Padding (no layer/block assigned) +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Padding (no layer/block assigned) +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Padding (no layer/block assigned) +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Padding (no layer/block assigned) +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Padding (no layer/block assigned) +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Padding (no layer/block assigned) +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Padding (no layer/block assigned) +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Padding (no layer/block assigned) +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Padding (no layer/block assigned) +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Padding (no layer/block assigned) +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Padding (no layer/block assigned) +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Padding (no layer/block assigned) +.text +.global
ntt_u32_incomplete_neon_asm_var_4_2_16_z4_7 +.global _ntt_u32_incomplete_neon_asm_var_4_2_16_z4_7 +ntt_u32_incomplete_neon_asm_var_4_2_16_z4_7: +_ntt_u32_incomplete_neon_asm_var_4_2_16_z4_7: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x0, #992] +ldr q29, [x17, #+0] +ldr q28, [x17, #+16] +sqrdmulh v27.4S, v30.4S, v28.s[0] +mul v30.4S, v30.4S,v29.s[0] +ldr q26, [x0, #928] +sqrdmulh v25.4S, v26.4S, v28.s[0] +mul v26.4S, v26.4S,v29.s[0] +ldr q24, [x0, #864] +sqrdmulh v23.4S, v24.4S, v28.s[0] +mul v24.4S, v24.4S,v29.s[0] +ldr q22, [x0, #800] +sqrdmulh v21.4S, v22.4S, v28.s[0] +mul v22.4S, v22.4S,v29.s[0] +ldr q20, [x0, #736] +mla v30.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v20.4S, v28.s[0] +ldr q19, [x0, #672] +mla v26.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v19.4S, v28.s[0] +nop +ldr q18, [x0, #608] +mla v24.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v18.4S, v28.s[0] +nop +ldr q17, [x0, #544] +mla v22.4S, v21.4S, v31.s[0] +nop +sqrdmulh v21.4S, v17.4S, v28.s[0] +ldr q16, [x0, #480] +ldr q3, [x0, #416] +mul v20.4S, v20.4S,v29.s[0] +sub v2.4s, v16.4s, v30.4s +mul v19.4S, v19.4S,v29.s[0] +add v16.4s, v16.4s, v30.4s +ldr q30, [x0, #352] +ldr q1, [x0, #288] +mla v20.4S, v27.4S, v31.s[0] +sub v27.4s, v3.4s, v26.4s +mla v19.4S, v25.4S, v31.s[0] +add v3.4s, v3.4s, v26.4s +ldr q26, [x0, #224] +ldr q25, [x0, #160] +mul v18.4S, v18.4S,v29.s[0] +sub v0.4s, v30.4s, v24.4s +mul v17.4S, v17.4S,v29.s[0] +add v30.4s, v30.4s, v24.4s +ldr q24, [x0, #96] +ldr q15, [x0, #32] +mla v18.4S, v23.4S, v31.s[0] +sub v23.4s, v1.4s, v22.4s +mla v17.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v22.4s 
+sqrdmulh v22.4S, v2.4S, v28.s[2] +nop +mul v2.4S, v2.4S,v29.s[2] +nop +sqrdmulh v21.4S, v27.4S, v28.s[2] +sub v14.4s, v26.4s, v20.4s +mul v27.4S, v27.4S,v29.s[2] +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v28.s[1] +sub v13.4s, v25.4s, v19.4s +mul v16.4S, v16.4S,v29.s[1] +add v25.4s, v25.4s, v19.4s +sqrdmulh v19.4S, v3.4S, v28.s[1] +sub v12.4s, v24.4s, v18.4s +mul v3.4S, v3.4S,v29.s[1] +add v24.4s, v24.4s, v18.4s +mla v2.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v17.4s +sqrdmulh v18.4S, v0.4S, v28.s[2] +add v15.4s, v15.4s, v17.4s +mla v27.4S, v21.4S, v31.s[0] +nop +sqrdmulh v21.4S, v23.4S, v28.s[2] +nop +mla v16.4S, v20.4S, v31.s[0] +nop +sqrdmulh v20.4S, v30.4S, v28.s[1] +nop +mla v3.4S, v19.4S, v31.s[0] +nop +sqrdmulh v19.4S, v1.4S, v28.s[1] +nop +ldr q17, [x17, #+32] +ldr q11, [x17, #+48] +mul v0.4S, v0.4S,v29.s[2] +sub v10.4s, v14.4s, v2.4s +mul v23.4S, v23.4S,v29.s[2] +add v14.4s, v14.4s, v2.4s +mla v0.4S, v18.4S, v31.s[0] +sub v18.4s, v13.4s, v27.4s +mla v23.4S, v21.4S, v31.s[0] +add v13.4s, v13.4s, v27.4s +mul v30.4S, v30.4S,v29.s[1] +sub v27.4s, v26.4s, v16.4s +mul v1.4S, v1.4S,v29.s[1] +add v26.4s, v26.4s, v16.4s +mla v30.4S, v20.4S, v31.s[0] +sub v20.4s, v25.4s, v3.4s +mla v1.4S, v19.4S, v31.s[0] +add v25.4s, v25.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v11.s[3] +nop +mul v10.4S, v10.4S,v17.s[3] +nop +sqrdmulh v19.4S, v14.4S, v11.s[2] +sub v16.4s, v12.4s, v0.4s +mul v14.4S, v14.4S,v17.s[2] +add v12.4s, v12.4s, v0.4s +sqrdmulh v0.4S, v27.4S, v11.s[1] +sub v21.4s, v22.4s, v23.4s +mul v27.4S, v27.4S,v17.s[1] +add v22.4s, v22.4s, v23.4s +sqrdmulh v23.4S, v26.4S, v11.s[0] +sub v2.4s, v24.4s, v30.4s +mul v26.4S, v26.4S,v17.s[0] +add v24.4s, v24.4s, v30.4s +ldr q30, [x17, #+96] +ldr q9, [x17, #+112] +mla v10.4S, v3.4S, v31.s[0] +sub v3.4s, v15.4s, v1.4s +sqrdmulh v8.4S, v18.4S, v11.s[3] +add v15.4s, v15.4s, v1.4s +mla v14.4S, v19.4S, v31.s[0] +nop +sqrdmulh v19.4S, v13.4S, v11.s[2] +nop +mla v27.4S, v0.4S, v31.s[0] +nop +sqrdmulh v0.4S, v20.4S, v11.s[1] 
+nop +mla v26.4S, v23.4S, v31.s[0] +nop +sqrdmulh v23.4S, v25.4S, v11.s[0] +nop +ldr q1, [x17, #+64] +ldr q7, [x17, #+80] +mul v18.4S, v18.4S,v17.s[3] +sub v6.4s, v16.4s, v10.4s +mul v13.4S, v13.4S,v17.s[2] +add v16.4s, v16.4s, v10.4s +mla v18.4S, v8.4S, v31.s[0] +sub v8.4s, v12.4s, v14.4s +mla v13.4S, v19.4S, v31.s[0] +add v12.4s, v12.4s, v14.4s +mul v20.4S, v20.4S,v17.s[1] +sub v14.4s, v2.4s, v27.4s +mul v25.4S, v25.4S,v17.s[0] +add v2.4s, v2.4s, v27.4s +mla v20.4S, v0.4S, v31.s[0] +sub v0.4s, v24.4s, v26.4s +mla v25.4S, v23.4S, v31.s[0] +add v24.4s, v24.4s, v26.4s +sqrdmulh v26.4S, v6.4S, v9.s[3] +nop +mul v6.4S, v6.4S,v30.s[3] +nop +sqrdmulh v23.4S, v16.4S, v9.s[2] +sub v27.4s, v21.4s, v18.4s +mul v16.4S, v16.4S,v30.s[2] +add v21.4s, v21.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v9.s[1] +sub v19.4s, v22.4s, v13.4s +mul v8.4S, v8.4S,v30.s[1] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v12.4S, v9.s[0] +sub v10.4s, v3.4s, v20.4s +mul v12.4S, v12.4S,v30.s[0] +add v3.4s, v3.4s, v20.4s +mla v6.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v25.4s +sqrdmulh v20.4S, v14.4S, v7.s[3] +add v15.4s, v15.4s, v25.4s +mla v16.4S, v23.4S, v31.s[0] +sub v23.4s, v27.4s, v6.4s +sqrdmulh v25.4S, v2.4S, v7.s[2] +add v27.4s, v27.4s, v6.4s +mla v8.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v16.4s +sqrdmulh v6.4S, v0.4S, v7.s[1] +add v21.4s, v21.4s, v16.4s +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v19.4s, v8.4s +sqrdmulh v16.4S, v24.4S, v7.s[0] +add v19.4s, v19.4s, v8.4s +mul v14.4S, v14.4S,v1.s[3] +sub v8.4s, v22.4s, v12.4s +mul v2.4S, v2.4S,v1.s[2] +add v22.4s, v22.4s, v12.4s +mla v14.4S, v20.4S, v31.s[0] +str q23, [x0, #992] +mla v2.4S, v25.4S, v31.s[0] +str q27, [x0, #928] +mul v0.4S, v0.4S,v1.s[1] +str q18, [x0, #864] +mul v24.4S, v24.4S,v1.s[0] +str q21, [x0, #800] +mla v0.4S, v6.4S, v31.s[0] +str q13, [x0, #736] +mla v24.4S, v16.4S, v31.s[0] +str q19, [x0, #672] +ldr q19, [x0, #1008] +sqrdmulh v16.4S, v19.4S, v28.s[0] +str q8, [x0, #608] +mul v19.4S, v19.4S,v29.s[0] +str q22, [x0, #544] 
+ldr q22, [x0, #944] +sqrdmulh v8.4S, v22.4S, v28.s[0] +sub v13.4s, v10.4s, v14.4s +str q13, [x0, #480] +mul v22.4S, v22.4S,v29.s[0] +add v10.4s, v10.4s, v14.4s +ldr q14, [x0, #880] +sqrdmulh v13.4S, v14.4S, v28.s[0] +sub v6.4s, v3.4s, v2.4s +str q10, [x0, #416] +mul v14.4S, v14.4S,v29.s[0] +add v3.4s, v3.4s, v2.4s +ldr q2, [x0, #816] +sqrdmulh v10.4S, v2.4S, v28.s[0] +sub v21.4s, v26.4s, v0.4s +str q6, [x0, #352] +mul v2.4S, v2.4S,v29.s[0] +add v26.4s, v26.4s, v0.4s +ldr q0, [x0, #752] +mla v19.4S, v16.4S, v31.s[0] +sub v16.4s, v15.4s, v24.4s +str q3, [x0, #288] +sqrdmulh v3.4S, v0.4S, v28.s[0] +add v15.4s, v15.4s, v24.4s +ldr q24, [x0, #688] +mla v22.4S, v8.4S, v31.s[0] +str q21, [x0, #224] +sqrdmulh v21.4S, v24.4S, v28.s[0] +nop +ldr q8, [x0, #624] +mla v14.4S, v13.4S, v31.s[0] +str q26, [x0, #160] +sqrdmulh v26.4S, v8.4S, v28.s[0] +nop +ldr q13, [x0, #560] +mla v2.4S, v10.4S, v31.s[0] +nop +sqrdmulh v10.4S, v13.4S, v28.s[0] +str q16, [x0, #96] +ldr q16, [x0, #496] +ldr q6, [x0, #432] +mul v0.4S, v0.4S,v29.s[0] +sub v18.4s, v16.4s, v19.4s +str q15, [x0, #32] +mul v24.4S, v24.4S,v29.s[0] +add v16.4s, v16.4s, v19.4s +ldr q19, [x0, #368] +ldr q15, [x0, #304] +mla v0.4S, v3.4S, v31.s[0] +sub v3.4s, v6.4s, v22.4s +mla v24.4S, v21.4S, v31.s[0] +add v6.4s, v6.4s, v22.4s +ldr q22, [x0, #240] +ldr q21, [x0, #176] +mul v8.4S, v8.4S,v29.s[0] +sub v27.4s, v19.4s, v14.4s +mul v13.4S, v13.4S,v29.s[0] +add v19.4s, v19.4s, v14.4s +ldr q14, [x0, #112] +ldr q25, [x0, #48] +mla v8.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v2.4s +mla v13.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v2.4s +sqrdmulh v2.4S, v18.4S, v28.s[2] +nop +mul v18.4S, v18.4S,v29.s[2] +nop +sqrdmulh v10.4S, v3.4S, v28.s[2] +sub v23.4s, v22.4s, v0.4s +mul v3.4S, v3.4S,v29.s[2] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v28.s[1] +sub v20.4s, v21.4s, v24.4s +mul v16.4S, v16.4S,v29.s[1] +add v21.4s, v21.4s, v24.4s +sqrdmulh v24.4S, v6.4S, v28.s[1] +sub v12.4s, v14.4s, v8.4s +mul v6.4S, v6.4S,v29.s[1] +add 
v14.4s, v14.4s, v8.4s +mla v18.4S, v2.4S, v31.s[0] +sub v2.4s, v25.4s, v13.4s +sqrdmulh v8.4S, v27.4S, v28.s[2] +add v25.4s, v25.4s, v13.4s +mla v3.4S, v10.4S, v31.s[0] +nop +sqrdmulh v10.4S, v26.4S, v28.s[2] +nop +mla v16.4S, v0.4S, v31.s[0] +nop +sqrdmulh v0.4S, v19.4S, v28.s[1] +nop +mla v6.4S, v24.4S, v31.s[0] +nop +sqrdmulh v24.4S, v15.4S, v28.s[1] +nop +mul v27.4S, v27.4S,v29.s[2] +sub v13.4s, v23.4s, v18.4s +mul v26.4S, v26.4S,v29.s[2] +add v23.4s, v23.4s, v18.4s +mla v27.4S, v8.4S, v31.s[0] +sub v8.4s, v20.4s, v3.4s +mla v26.4S, v10.4S, v31.s[0] +add v20.4s, v20.4s, v3.4s +mul v19.4S, v19.4S,v29.s[1] +sub v3.4s, v22.4s, v16.4s +mul v15.4S, v15.4S,v29.s[1] +add v22.4s, v22.4s, v16.4s +mla v19.4S, v0.4S, v31.s[0] +sub v0.4s, v21.4s, v6.4s +mla v15.4S, v24.4S, v31.s[0] +add v21.4s, v21.4s, v6.4s +sqrdmulh v6.4S, v13.4S, v11.s[3] +nop +mul v13.4S, v13.4S,v17.s[3] +nop +sqrdmulh v24.4S, v23.4S, v11.s[2] +sub v16.4s, v12.4s, v27.4s +mul v23.4S, v23.4S,v17.s[2] +add v12.4s, v12.4s, v27.4s +sqrdmulh v27.4S, v3.4S, v11.s[1] +sub v10.4s, v2.4s, v26.4s +mul v3.4S, v3.4S,v17.s[1] +add v2.4s, v2.4s, v26.4s +sqrdmulh v26.4S, v22.4S, v11.s[0] +sub v18.4s, v14.4s, v19.4s +mul v22.4S, v22.4S,v17.s[0] +add v14.4s, v14.4s, v19.4s +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v25.4s, v15.4s +sqrdmulh v19.4S, v8.4S, v11.s[3] +add v25.4s, v25.4s, v15.4s +mla v23.4S, v24.4S, v31.s[0] +nop +sqrdmulh v24.4S, v20.4S, v11.s[2] +nop +mla v3.4S, v27.4S, v31.s[0] +nop +sqrdmulh v27.4S, v0.4S, v11.s[1] +nop +mla v22.4S, v26.4S, v31.s[0] +nop +sqrdmulh v26.4S, v21.4S, v11.s[0] +nop +mul v8.4S, v8.4S,v17.s[3] +sub v15.4s, v16.4s, v13.4s +mul v20.4S, v20.4S,v17.s[2] +add v16.4s, v16.4s, v13.4s +mla v8.4S, v19.4S, v31.s[0] +sub v19.4s, v12.4s, v23.4s +mla v20.4S, v24.4S, v31.s[0] +add v12.4s, v12.4s, v23.4s +mul v0.4S, v0.4S,v17.s[1] +sub v23.4s, v18.4s, v3.4s +mul v21.4S, v21.4S,v17.s[0] +add v18.4s, v18.4s, v3.4s +mla v0.4S, v27.4S, v31.s[0] +sub v27.4s, v14.4s, v22.4s +mla v21.4S, v26.4S, 
v31.s[0] +add v14.4s, v14.4s, v22.4s +sqrdmulh v22.4S, v15.4S, v9.s[3] +nop +mul v15.4S, v15.4S,v30.s[3] +nop +sqrdmulh v26.4S, v16.4S, v9.s[2] +sub v3.4s, v10.4s, v8.4s +mul v16.4S, v16.4S,v30.s[2] +add v10.4s, v10.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v9.s[1] +sub v24.4s, v2.4s, v20.4s +mul v19.4S, v19.4S,v30.s[1] +add v2.4s, v2.4s, v20.4s +sqrdmulh v20.4S, v12.4S, v9.s[0] +sub v13.4s, v6.4s, v0.4s +mul v12.4S, v12.4S,v30.s[0] +add v6.4s, v6.4s, v0.4s +mla v15.4S, v22.4S, v31.s[0] +sub v22.4s, v25.4s, v21.4s +sqrdmulh v0.4S, v23.4S, v7.s[3] +add v25.4s, v25.4s, v21.4s +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v3.4s, v15.4s +sqrdmulh v21.4S, v18.4S, v7.s[2] +add v3.4s, v3.4s, v15.4s +mla v19.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v16.4s +sqrdmulh v15.4S, v27.4S, v7.s[1] +add v10.4s, v10.4s, v16.4s +mla v12.4S, v20.4S, v31.s[0] +sub v20.4s, v24.4s, v19.4s +sqrdmulh v16.4S, v14.4S, v7.s[0] +add v24.4s, v24.4s, v19.4s +mul v23.4S, v23.4S,v1.s[3] +sub v19.4s, v2.4s, v12.4s +mul v18.4S, v18.4S,v1.s[2] +add v2.4s, v2.4s, v12.4s +mla v23.4S, v0.4S, v31.s[0] +str q26, [x0, #1008] +mla v18.4S, v21.4S, v31.s[0] +str q3, [x0, #944] +mul v27.4S, v27.4S,v1.s[1] +str q8, [x0, #880] +mul v14.4S, v14.4S,v1.s[0] +str q10, [x0, #816] +mla v27.4S, v15.4S, v31.s[0] +str q20, [x0, #752] +mla v14.4S, v16.4S, v31.s[0] +str q24, [x0, #688] +ldr q24, [x0, #960] +sqrdmulh v16.4S, v24.4S, v28.s[0] +str q19, [x0, #624] +mul v24.4S, v24.4S,v29.s[0] +str q2, [x0, #560] +ldr q2, [x0, #896] +sqrdmulh v19.4S, v2.4S, v28.s[0] +sub v20.4s, v13.4s, v23.4s +str q20, [x0, #496] +mul v2.4S, v2.4S,v29.s[0] +add v13.4s, v13.4s, v23.4s +ldr q23, [x0, #832] +sqrdmulh v20.4S, v23.4S, v28.s[0] +sub v15.4s, v6.4s, v18.4s +str q13, [x0, #432] +mul v23.4S, v23.4S,v29.s[0] +add v6.4s, v6.4s, v18.4s +ldr q18, [x0, #768] +sqrdmulh v13.4S, v18.4S, v28.s[0] +sub v10.4s, v22.4s, v27.4s +str q15, [x0, #368] +mul v18.4S, v18.4S,v29.s[0] +add v22.4s, v22.4s, v27.4s +ldr q27, [x0, #704] +mla v24.4S, v16.4S, v31.s[0] +sub 
v16.4s, v25.4s, v14.4s +str q6, [x0, #304] +sqrdmulh v6.4S, v27.4S, v28.s[0] +add v25.4s, v25.4s, v14.4s +ldr q14, [x0, #640] +mla v2.4S, v19.4S, v31.s[0] +str q10, [x0, #240] +sqrdmulh v10.4S, v14.4S, v28.s[0] +nop +ldr q19, [x0, #576] +mla v23.4S, v20.4S, v31.s[0] +str q22, [x0, #176] +sqrdmulh v22.4S, v19.4S, v28.s[0] +nop +ldr q20, [x0, #512] +mla v18.4S, v13.4S, v31.s[0] +nop +sqrdmulh v13.4S, v20.4S, v28.s[0] +str q16, [x0, #112] +ldr q16, [x0, #448] +ldr q15, [x0, #384] +mul v27.4S, v27.4S,v29.s[0] +sub v8.4s, v16.4s, v24.4s +str q25, [x0, #48] +mul v14.4S, v14.4S,v29.s[0] +add v16.4s, v16.4s, v24.4s +ldr q24, [x0, #320] +ldr q25, [x0, #256] +mla v27.4S, v6.4S, v31.s[0] +sub v6.4s, v15.4s, v2.4s +mla v14.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v2.4s +ldr q2, [x0, #192] +ldr q10, [x0, #128] +mul v19.4S, v19.4S,v29.s[0] +sub v3.4s, v24.4s, v23.4s +mul v20.4S, v20.4S,v29.s[0] +add v24.4s, v24.4s, v23.4s +ldr q23, [x0, #64] +ldr q21, [x0, #0] +mla v19.4S, v22.4S, v31.s[0] +sub v22.4s, v25.4s, v18.4s +mla v20.4S, v13.4S, v31.s[0] +add v25.4s, v25.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v28.s[2] +nop +mul v8.4S, v8.4S,v29.s[2] +nop +sqrdmulh v13.4S, v6.4S, v28.s[2] +sub v26.4s, v2.4s, v27.4s +mul v6.4S, v6.4S,v29.s[2] +add v2.4s, v2.4s, v27.4s +sqrdmulh v27.4S, v16.4S, v28.s[1] +sub v0.4s, v10.4s, v14.4s +mul v16.4S, v16.4S,v29.s[1] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v28.s[1] +sub v12.4s, v23.4s, v19.4s +mul v15.4S, v15.4S,v29.s[1] +add v23.4s, v23.4s, v19.4s +mla v8.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v20.4s +sqrdmulh v19.4S, v3.4S, v28.s[2] +add v21.4s, v21.4s, v20.4s +mla v6.4S, v13.4S, v31.s[0] +nop +sqrdmulh v13.4S, v22.4S, v28.s[2] +nop +mla v16.4S, v27.4S, v31.s[0] +nop +sqrdmulh v27.4S, v24.4S, v28.s[1] +nop +mla v15.4S, v14.4S, v31.s[0] +nop +sqrdmulh v14.4S, v25.4S, v28.s[1] +nop +mul v3.4S, v3.4S,v29.s[2] +sub v20.4s, v26.4s, v8.4s +mul v22.4S, v22.4S,v29.s[2] +add v26.4s, v26.4s, v8.4s +mla v3.4S, v19.4S, v31.s[0] +sub v19.4s, 
v0.4s, v6.4s +mla v22.4S, v13.4S, v31.s[0] +add v0.4s, v0.4s, v6.4s +mul v24.4S, v24.4S,v29.s[1] +sub v6.4s, v2.4s, v16.4s +mul v25.4S, v25.4S,v29.s[1] +add v2.4s, v2.4s, v16.4s +mla v24.4S, v27.4S, v31.s[0] +sub v27.4s, v10.4s, v15.4s +mla v25.4S, v14.4S, v31.s[0] +add v10.4s, v10.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v11.s[3] +nop +mul v20.4S, v20.4S,v17.s[3] +nop +sqrdmulh v14.4S, v26.4S, v11.s[2] +sub v16.4s, v12.4s, v3.4s +mul v26.4S, v26.4S,v17.s[2] +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v6.4S, v11.s[1] +sub v13.4s, v18.4s, v22.4s +mul v6.4S, v6.4S,v17.s[1] +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v2.4S, v11.s[0] +sub v8.4s, v23.4s, v24.4s +mul v2.4S, v2.4S,v17.s[0] +add v23.4s, v23.4s, v24.4s +mla v20.4S, v15.4S, v31.s[0] +sub v15.4s, v21.4s, v25.4s +sqrdmulh v24.4S, v19.4S, v11.s[3] +add v21.4s, v21.4s, v25.4s +mla v26.4S, v14.4S, v31.s[0] +nop +sqrdmulh v14.4S, v0.4S, v11.s[2] +nop +mla v6.4S, v3.4S, v31.s[0] +nop +sqrdmulh v3.4S, v27.4S, v11.s[1] +nop +mla v2.4S, v22.4S, v31.s[0] +nop +sqrdmulh v22.4S, v10.4S, v11.s[0] +nop +mul v19.4S, v19.4S,v17.s[3] +sub v25.4s, v16.4s, v20.4s +mul v0.4S, v0.4S,v17.s[2] +add v16.4s, v16.4s, v20.4s +mla v19.4S, v24.4S, v31.s[0] +sub v24.4s, v12.4s, v26.4s +mla v0.4S, v14.4S, v31.s[0] +add v12.4s, v12.4s, v26.4s +mul v27.4S, v27.4S,v17.s[1] +sub v26.4s, v8.4s, v6.4s +mul v10.4S, v10.4S,v17.s[0] +add v8.4s, v8.4s, v6.4s +mla v27.4S, v3.4S, v31.s[0] +sub v3.4s, v23.4s, v2.4s +mla v10.4S, v22.4S, v31.s[0] +add v23.4s, v23.4s, v2.4s +sqrdmulh v2.4S, v25.4S, v9.s[3] +nop +mul v25.4S, v25.4S,v30.s[3] +nop +sqrdmulh v22.4S, v16.4S, v9.s[2] +sub v6.4s, v13.4s, v19.4s +mul v16.4S, v16.4S,v30.s[2] +add v13.4s, v13.4s, v19.4s +sqrdmulh v19.4S, v24.4S, v9.s[1] +sub v14.4s, v18.4s, v0.4s +mul v24.4S, v24.4S,v30.s[1] +add v18.4s, v18.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v9.s[0] +sub v20.4s, v15.4s, v27.4s +mul v12.4S, v12.4S,v30.s[0] +add v15.4s, v15.4s, v27.4s +mla v25.4S, v2.4S, v31.s[0] +sub v2.4s, v21.4s, v10.4s +sqrdmulh 
v27.4S, v26.4S, v7.s[3] +add v21.4s, v21.4s, v10.4s +mla v16.4S, v22.4S, v31.s[0] +sub v22.4s, v6.4s, v25.4s +sqrdmulh v10.4S, v8.4S, v7.s[2] +add v6.4s, v6.4s, v25.4s +mla v24.4S, v19.4S, v31.s[0] +sub v19.4s, v13.4s, v16.4s +sqrdmulh v25.4S, v3.4S, v7.s[1] +add v13.4s, v13.4s, v16.4s +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v14.4s, v24.4s +sqrdmulh v16.4S, v23.4S, v7.s[0] +add v14.4s, v14.4s, v24.4s +mul v26.4S, v26.4S,v1.s[3] +sub v24.4s, v18.4s, v12.4s +mul v8.4S, v8.4S,v1.s[2] +add v18.4s, v18.4s, v12.4s +mla v26.4S, v27.4S, v31.s[0] +str q22, [x0, #960] +mla v8.4S, v10.4S, v31.s[0] +str q6, [x0, #896] +mul v3.4S, v3.4S,v1.s[1] +str q19, [x0, #832] +mul v23.4S, v23.4S,v1.s[0] +str q13, [x0, #768] +mla v3.4S, v25.4S, v31.s[0] +str q0, [x0, #704] +mla v23.4S, v16.4S, v31.s[0] +str q14, [x0, #640] +ldr q14, [x0, #976] +sqrdmulh v16.4S, v14.4S, v28.s[0] +str q24, [x0, #576] +mul v14.4S, v14.4S,v29.s[0] +str q18, [x0, #512] +ldr q18, [x0, #912] +sqrdmulh v24.4S, v18.4S, v28.s[0] +sub v0.4s, v20.4s, v26.4s +str q0, [x0, #448] +mul v18.4S, v18.4S,v29.s[0] +add v20.4s, v20.4s, v26.4s +ldr q26, [x0, #848] +sqrdmulh v0.4S, v26.4S, v28.s[0] +sub v25.4s, v15.4s, v8.4s +str q20, [x0, #384] +mul v26.4S, v26.4S,v29.s[0] +add v15.4s, v15.4s, v8.4s +ldr q8, [x0, #784] +sqrdmulh v20.4S, v8.4S, v28.s[0] +sub v13.4s, v2.4s, v3.4s +str q25, [x0, #320] +mul v8.4S, v8.4S,v29.s[0] +add v2.4s, v2.4s, v3.4s +ldr q3, [x0, #720] +mla v14.4S, v16.4S, v31.s[0] +sub v16.4s, v21.4s, v23.4s +str q15, [x0, #256] +sqrdmulh v15.4S, v3.4S, v28.s[0] +add v21.4s, v21.4s, v23.4s +ldr q23, [x0, #656] +mla v18.4S, v24.4S, v31.s[0] +str q13, [x0, #192] +sqrdmulh v13.4S, v23.4S, v28.s[0] +nop +ldr q24, [x0, #592] +mla v26.4S, v0.4S, v31.s[0] +str q2, [x0, #128] +sqrdmulh v2.4S, v24.4S, v28.s[0] +nop +ldr q0, [x0, #528] +mla v8.4S, v20.4S, v31.s[0] +nop +sqrdmulh v20.4S, v0.4S, v28.s[0] +str q16, [x0, #64] +ldr q16, [x0, #464] +ldr q25, [x0, #400] +mul v3.4S, v3.4S,v29.s[0] +sub v19.4s, v16.4s, v14.4s 
+str q21, [x0, #0] +mul v23.4S, v23.4S,v29.s[0] +add v16.4s, v16.4s, v14.4s +ldr q14, [x0, #336] +ldr q21, [x0, #272] +mla v3.4S, v15.4S, v31.s[0] +sub v15.4s, v25.4s, v18.4s +mla v23.4S, v13.4S, v31.s[0] +add v25.4s, v25.4s, v18.4s +ldr q18, [x0, #208] +ldr q13, [x0, #144] +mul v24.4S, v24.4S,v29.s[0] +sub v6.4s, v14.4s, v26.4s +mul v0.4S, v0.4S,v29.s[0] +add v14.4s, v14.4s, v26.4s +ldr q26, [x0, #80] +ldr q10, [x0, #16] +mla v24.4S, v2.4S, v31.s[0] +sub v2.4s, v21.4s, v8.4s +mla v0.4S, v20.4S, v31.s[0] +add v21.4s, v21.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v28.s[2] +nop +mul v19.4S, v19.4S,v29.s[2] +nop +sqrdmulh v20.4S, v15.4S, v28.s[2] +sub v22.4s, v18.4s, v3.4s +mul v15.4S, v15.4S,v29.s[2] +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v16.4S, v28.s[1] +sub v27.4s, v13.4s, v23.4s +mul v16.4S, v16.4S,v29.s[1] +add v13.4s, v13.4s, v23.4s +sqrdmulh v23.4S, v25.4S, v28.s[1] +sub v12.4s, v26.4s, v24.4s +mul v25.4S, v25.4S,v29.s[1] +add v26.4s, v26.4s, v24.4s +mla v19.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v0.4s +sqrdmulh v24.4S, v6.4S, v28.s[2] +add v10.4s, v10.4s, v0.4s +mla v15.4S, v20.4S, v31.s[0] +nop +sqrdmulh v20.4S, v2.4S, v28.s[2] +nop +mla v16.4S, v3.4S, v31.s[0] +nop +sqrdmulh v3.4S, v14.4S, v28.s[1] +nop +mla v25.4S, v23.4S, v31.s[0] +nop +sqrdmulh v23.4S, v21.4S, v28.s[1] +nop +mul v6.4S, v6.4S,v29.s[2] +sub v0.4s, v22.4s, v19.4s +mul v2.4S, v2.4S,v29.s[2] +add v22.4s, v22.4s, v19.4s +mla v6.4S, v24.4S, v31.s[0] +sub v24.4s, v27.4s, v15.4s +mla v2.4S, v20.4S, v31.s[0] +add v27.4s, v27.4s, v15.4s +mul v14.4S, v14.4S,v29.s[1] +sub v15.4s, v18.4s, v16.4s +mul v21.4S, v21.4S,v29.s[1] +add v18.4s, v18.4s, v16.4s +mla v14.4S, v3.4S, v31.s[0] +sub v3.4s, v13.4s, v25.4s +mla v21.4S, v23.4S, v31.s[0] +add v13.4s, v13.4s, v25.4s +sqrdmulh v28.4S, v0.4S, v11.s[3] +nop +mul v0.4S, v0.4S,v17.s[3] +nop +sqrdmulh v29.4S, v22.4S, v11.s[2] +sub v25.4s, v12.4s, v6.4s +mul v22.4S, v22.4S,v17.s[2] +add v12.4s, v12.4s, v6.4s +sqrdmulh v6.4S, v15.4S, v11.s[1] +sub v23.4s, v8.4s, 
v2.4s +mul v15.4S, v15.4S,v17.s[1] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v18.4S, v11.s[0] +sub v16.4s, v26.4s, v14.4s +mul v18.4S, v18.4S,v17.s[0] +add v26.4s, v26.4s, v14.4s +mla v0.4S, v28.4S, v31.s[0] +sub v28.4s, v10.4s, v21.4s +sqrdmulh v14.4S, v24.4S, v11.s[3] +add v10.4s, v10.4s, v21.4s +mla v22.4S, v29.4S, v31.s[0] +nop +sqrdmulh v29.4S, v27.4S, v11.s[2] +nop +mla v15.4S, v6.4S, v31.s[0] +nop +sqrdmulh v6.4S, v3.4S, v11.s[1] +nop +mla v18.4S, v2.4S, v31.s[0] +nop +sqrdmulh v2.4S, v13.4S, v11.s[0] +nop +mul v24.4S, v24.4S,v17.s[3] +sub v21.4s, v25.4s, v0.4s +mul v27.4S, v27.4S,v17.s[2] +add v25.4s, v25.4s, v0.4s +mla v24.4S, v14.4S, v31.s[0] +sub v14.4s, v12.4s, v22.4s +mla v27.4S, v29.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s +mul v3.4S, v3.4S,v17.s[1] +sub v22.4s, v16.4s, v15.4s +mul v13.4S, v13.4S,v17.s[0] +add v16.4s, v16.4s, v15.4s +mla v3.4S, v6.4S, v31.s[0] +sub v6.4s, v26.4s, v18.4s +mla v13.4S, v2.4S, v31.s[0] +add v26.4s, v26.4s, v18.4s +sqrdmulh v11.4S, v21.4S, v9.s[3] +nop +mul v21.4S, v21.4S,v30.s[3] +nop +sqrdmulh v17.4S, v25.4S, v9.s[2] +sub v18.4s, v23.4s, v24.4s +mul v25.4S, v25.4S,v30.s[2] +add v23.4s, v23.4s, v24.4s +sqrdmulh v24.4S, v14.4S, v9.s[1] +sub v2.4s, v8.4s, v27.4s +mul v14.4S, v14.4S,v30.s[1] +add v8.4s, v8.4s, v27.4s +sqrdmulh v27.4S, v12.4S, v9.s[0] +sub v15.4s, v28.4s, v3.4s +mul v12.4S, v12.4S,v30.s[0] +add v28.4s, v28.4s, v3.4s +mla v21.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v13.4s +sqrdmulh v9.4S, v22.4S, v7.s[3] +add v10.4s, v10.4s, v13.4s +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v18.4s, v21.4s +sqrdmulh v13.4S, v16.4S, v7.s[2] +add v18.4s, v18.4s, v21.4s +mla v14.4S, v24.4S, v31.s[0] +sub v24.4s, v23.4s, v25.4s +sqrdmulh v21.4S, v6.4S, v7.s[1] +add v23.4s, v23.4s, v25.4s +mla v12.4S, v27.4S, v31.s[0] +sub v27.4s, v2.4s, v14.4s +sqrdmulh v25.4S, v26.4S, v7.s[0] +add v2.4s, v2.4s, v14.4s +mul v22.4S, v22.4S,v1.s[3] +sub v14.4s, v8.4s, v12.4s +mul v16.4S, v16.4S,v1.s[2] +add v8.4s, v8.4s, v12.4s +mla v22.4S, v9.4S, 
v31.s[0] +str q17, [x0, #976] +mla v16.4S, v13.4S, v31.s[0] +str q18, [x0, #912] +mul v6.4S, v6.4S,v1.s[1] +str q24, [x0, #848] +mul v26.4S, v26.4S,v1.s[0] +str q23, [x0, #784] +mla v6.4S, v21.4S, v31.s[0] +str q27, [x0, #720] +mla v26.4S, v25.4S, v31.s[0] +str q2, [x0, #656] +str q14, [x0, #592] +str q8, [x0, #528] +sub v8.4s, v15.4s, v22.4s +str q8, [x0, #464] +add v15.4s, v15.4s, v22.4s +sub v22.4s, v28.4s, v16.4s +str q15, [x0, #400] +add v28.4s, v28.4s, v16.4s +sub v16.4s, v11.4s, v6.4s +str q22, [x0, #336] +add v11.4s, v11.4s, v6.4s +sub v6.4s, v10.4s, v26.4s +str q28, [x0, #272] +add v10.4s, v10.4s, v26.4s +str q16, [x0, #208] +str q11, [x0, #144] +str q6, [x0, #80] +str q10, [x0, #16] +ldr q4, [x0, #224] +ldr q5, [x0, #160] +ldr q19, [x0, #32] +ldr q20, [x17, #+128] +ldr q0, [x17, #+144] +sqrdmulh v29.4S, v19.4S, v0.s[0] +mul v19.4S, v19.4S,v20.s[0] +ldr q3, [x0, #48] +sqrdmulh v30.4S, v3.4S, v0.s[0] +mul v3.4S, v3.4S,v20.s[0] +ldr q12, [x17, #+160] +ldr q9, [x17, #+176] +ldr q17, [x0, #96] +sqrdmulh v13.4S, v17.4S, v9.s[0] +mul v17.4S, v17.4S,v12.s[0] +ldr q18, [x0, #112] +sqrdmulh v24.4S, v18.4S, v9.s[0] +mul v18.4S, v18.4S,v12.s[0] +ldr q23, [x17, #+192] +ldr q21, [x17, #+208] +mla v19.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v5.4S, v21.s[0] +ldr q27, [x0, #176] +mla v3.4S, v30.4S, v31.s[0] +sqrdmulh v30.4S, v27.4S, v21.s[0] +ldr q25, [x17, #+224] +ldr q2, [x17, #+240] +mla v17.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v4.4S, v2.s[0] +ldr q1, [x0, #240] +mla v18.4S, v24.4S, v31.s[0] +sqrdmulh v24.4S, v1.4S, v2.s[0] +ldr q7, [x0, #0] +ldr q14, [x0, #128] +mul v5.4S, v5.4S,v23.s[0] +sub v8.4s, v7.4s, v19.4s +ldr q15, [x0, #16] +mul v27.4S, v27.4S,v23.s[0] +add v7.4s, v7.4s, v19.4s +ldr q19, [x0, #144] +mla v5.4S, v29.4S, v31.s[0] +sub v29.4s, v15.4s, v3.4s +ldr q22, [x0, #64] +mla v27.4S, v30.4S, v31.s[0] +add v15.4s, v15.4s, v3.4s +ldr q3, [x0, #192] +mul v4.4S, v4.4S,v25.s[0] +sub v30.4s, v22.4s, v17.4s +ldr q28, [x0, #80] +mul v1.4S, v1.4S,v25.s[0] +add 
v22.4s, v22.4s, v17.4s +ldr q17, [x0, #208] +mla v4.4S, v13.4S, v31.s[0] +mla v1.4S, v24.4S, v31.s[0] +sub v24.4s, v28.4s, v18.4s +sqrdmulh v13.4S, v15.4S, v0.s[1] +add v28.4s, v28.4s, v18.4s +mul v15.4S, v15.4S,v20.s[1] +sqrdmulh v18.4S, v29.4S, v0.s[2] +sub v26.4s, v14.4s, v5.4s +mul v29.4S, v29.4S,v20.s[2] +add v14.4s, v14.4s, v5.4s +sqrdmulh v0.4S, v28.4S, v9.s[1] +sub v20.4s, v19.4s, v27.4s +mul v28.4S, v28.4S,v12.s[1] +add v19.4s, v19.4s, v27.4s +sqrdmulh v27.4S, v24.4S, v9.s[2] +sub v5.4s, v3.4s, v4.4s +mul v24.4S, v24.4S,v12.s[2] +add v3.4s, v3.4s, v4.4s +mla v15.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v1.4s +ldr q9, [x0, #480] +sqrdmulh v12.4S, v19.4S, v21.s[1] +add v17.4s, v17.4s, v1.4s +mla v29.4S, v18.4S, v31.s[0] +ldr q18, [x0, #416] +sqrdmulh v1.4S, v20.4S, v21.s[2] +sub v4.4s, v7.4s, v15.4s +mla v28.4S, v0.4S, v31.s[0] +ldr q0, [x0, #288] +sqrdmulh v16.4S, v17.4S, v2.s[1] +add v7.4s, v7.4s, v15.4s +str q4, [x0, #16] +mla v24.4S, v27.4S, v31.s[0] +ldr q27, [x17, #+256] +ldr q4, [x17, #+272] +sqrdmulh v15.4S, v13.4S, v2.s[2] +sub v11.4s, v8.4s, v29.4s +str q7, [x0, #0] +mul v19.4S, v19.4S,v23.s[1] +add v8.4s, v8.4s, v29.4s +mul v20.4S, v20.4S,v23.s[2] +str q11, [x0, #48] +mla v19.4S, v12.4S, v31.s[0] +sub v12.4s, v22.4s, v28.4s +mla v20.4S, v1.4S, v31.s[0] +str q8, [x0, #32] +mul v17.4S, v17.4S,v25.s[1] +str q12, [x0, #80] +mul v13.4S, v13.4S,v25.s[2] +add v22.4s, v22.4s, v28.4s +str q22, [x0, #64] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v30.4s, v24.4s +str q16, [x0, #112] +mla v13.4S, v15.4S, v31.s[0] +add v30.4s, v30.4s, v24.4s +str q30, [x0, #96] +sqrdmulh v2.4S, v0.4S, v4.s[0] +sub v25.4s, v14.4s, v19.4s +mul v0.4S, v0.4S,v27.s[0] +str q25, [x0, #144] +ldr q25, [x0, #304] +sqrdmulh v30.4S, v25.4S, v4.s[0] +add v14.4s, v14.4s, v19.4s +mul v25.4S, v25.4S,v27.s[0] +str q14, [x0, #128] +ldr q14, [x17, #+288] +ldr q19, [x17, #+304] +ldr q24, [x0, #352] +sqrdmulh v15.4S, v24.4S, v19.s[0] +sub v16.4s, v26.4s, v20.4s +mul v24.4S, v24.4S,v14.s[0] +str 
q16, [x0, #176] +ldr q16, [x0, #368] +sqrdmulh v22.4S, v16.4S, v19.s[0] +add v26.4s, v26.4s, v20.4s +mul v16.4S, v16.4S,v14.s[0] +str q26, [x0, #160] +ldr q26, [x17, #+320] +ldr q20, [x17, #+336] +mla v0.4S, v2.4S, v31.s[0] +sub v2.4s, v3.4s, v17.4s +sqrdmulh v28.4S, v18.4S, v20.s[0] +str q2, [x0, #208] +ldr q2, [x0, #432] +mla v25.4S, v30.4S, v31.s[0] +add v3.4s, v3.4s, v17.4s +sqrdmulh v17.4S, v2.4S, v20.s[0] +str q3, [x0, #192] +ldr q3, [x17, #+352] +ldr q30, [x17, #+368] +mla v24.4S, v15.4S, v31.s[0] +sub v15.4s, v5.4s, v13.4s +sqrdmulh v12.4S, v9.4S, v30.s[0] +str q15, [x0, #240] +ldr q15, [x0, #496] +mla v16.4S, v22.4S, v31.s[0] +add v5.4s, v5.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v30.s[0] +str q5, [x0, #224] +ldr q5, [x0, #256] +ldr q22, [x0, #384] +mul v18.4S, v18.4S,v26.s[0] +sub v21.4s, v5.4s, v0.4s +ldr q23, [x0, #272] +mul v2.4S, v2.4S,v26.s[0] +add v5.4s, v5.4s, v0.4s +ldr q0, [x0, #400] +mla v18.4S, v28.4S, v31.s[0] +sub v28.4s, v23.4s, v25.4s +ldr q8, [x0, #320] +mla v2.4S, v17.4S, v31.s[0] +add v23.4s, v23.4s, v25.4s +ldr q25, [x0, #448] +mul v9.4S, v9.4S,v3.s[0] +sub v17.4s, v8.4s, v24.4s +ldr q1, [x0, #336] +mul v15.4S, v15.4S,v3.s[0] +add v8.4s, v8.4s, v24.4s +ldr q24, [x0, #464] +mla v9.4S, v12.4S, v31.s[0] +mla v15.4S, v13.4S, v31.s[0] +sub v13.4s, v1.4s, v16.4s +sqrdmulh v12.4S, v23.4S, v4.s[1] +add v1.4s, v1.4s, v16.4s +mul v23.4S, v23.4S,v27.s[1] +sqrdmulh v16.4S, v28.4S, v4.s[2] +sub v11.4s, v22.4s, v18.4s +mul v28.4S, v28.4S,v27.s[2] +add v22.4s, v22.4s, v18.4s +sqrdmulh v4.4S, v1.4S, v19.s[1] +sub v27.4s, v0.4s, v2.4s +mul v1.4S, v1.4S,v14.s[1] +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v13.4S, v19.s[2] +sub v18.4s, v25.4s, v9.4s +mul v13.4S, v13.4S,v14.s[2] +add v25.4s, v25.4s, v9.4s +mla v23.4S, v12.4S, v31.s[0] +sub v12.4s, v24.4s, v15.4s +ldr q19, [x0, #736] +sqrdmulh v14.4S, v0.4S, v20.s[1] +add v24.4s, v24.4s, v15.4s +mla v28.4S, v16.4S, v31.s[0] +ldr q16, [x0, #672] +sqrdmulh v15.4S, v27.4S, v20.s[2] +sub v9.4s, v5.4s, v23.4s +mla 
v1.4S, v4.4S, v31.s[0] +ldr q4, [x0, #544] +sqrdmulh v29.4S, v24.4S, v30.s[1] +add v5.4s, v5.4s, v23.4s +str q9, [x0, #272] +mla v13.4S, v2.4S, v31.s[0] +ldr q2, [x17, #+384] +ldr q9, [x17, #+400] +sqrdmulh v23.4S, v12.4S, v30.s[2] +sub v7.4s, v21.4s, v28.4s +str q5, [x0, #256] +mul v0.4S, v0.4S,v26.s[1] +add v21.4s, v21.4s, v28.4s +mul v27.4S, v27.4S,v26.s[2] +str q7, [x0, #304] +mla v0.4S, v14.4S, v31.s[0] +sub v14.4s, v8.4s, v1.4s +mla v27.4S, v15.4S, v31.s[0] +str q21, [x0, #288] +mul v24.4S, v24.4S,v3.s[1] +str q14, [x0, #336] +mul v12.4S, v12.4S,v3.s[2] +add v8.4s, v8.4s, v1.4s +str q8, [x0, #320] +mla v24.4S, v29.4S, v31.s[0] +sub v29.4s, v17.4s, v13.4s +str q29, [x0, #368] +mla v12.4S, v23.4S, v31.s[0] +add v17.4s, v17.4s, v13.4s +str q17, [x0, #352] +sqrdmulh v30.4S, v4.4S, v9.s[0] +sub v3.4s, v22.4s, v0.4s +mul v4.4S, v4.4S,v2.s[0] +str q3, [x0, #400] +ldr q3, [x0, #560] +sqrdmulh v17.4S, v3.4S, v9.s[0] +add v22.4s, v22.4s, v0.4s +mul v3.4S, v3.4S,v2.s[0] +str q22, [x0, #384] +ldr q22, [x17, #+416] +ldr q0, [x17, #+432] +ldr q13, [x0, #608] +sqrdmulh v23.4S, v13.4S, v0.s[0] +sub v29.4s, v11.4s, v27.4s +mul v13.4S, v13.4S,v22.s[0] +str q29, [x0, #432] +ldr q29, [x0, #624] +sqrdmulh v8.4S, v29.4S, v0.s[0] +add v11.4s, v11.4s, v27.4s +mul v29.4S, v29.4S,v22.s[0] +str q11, [x0, #416] +ldr q11, [x17, #+448] +ldr q27, [x17, #+464] +mla v4.4S, v30.4S, v31.s[0] +sub v30.4s, v25.4s, v24.4s +sqrdmulh v1.4S, v16.4S, v27.s[0] +str q30, [x0, #464] +ldr q30, [x0, #688] +mla v3.4S, v17.4S, v31.s[0] +add v25.4s, v25.4s, v24.4s +sqrdmulh v24.4S, v30.4S, v27.s[0] +str q25, [x0, #448] +ldr q25, [x17, #+480] +ldr q17, [x17, #+496] +mla v13.4S, v23.4S, v31.s[0] +sub v23.4s, v18.4s, v12.4s +sqrdmulh v14.4S, v19.4S, v17.s[0] +str q23, [x0, #496] +ldr q23, [x0, #752] +mla v29.4S, v8.4S, v31.s[0] +add v18.4s, v18.4s, v12.4s +sqrdmulh v12.4S, v23.4S, v17.s[0] +str q18, [x0, #480] +ldr q18, [x0, #512] +ldr q8, [x0, #640] +mul v16.4S, v16.4S,v11.s[0] +sub v20.4s, v18.4s, v4.4s +ldr 
q26, [x0, #528] +mul v30.4S, v30.4S,v11.s[0] +add v18.4s, v18.4s, v4.4s +ldr q4, [x0, #656] +mla v16.4S, v1.4S, v31.s[0] +sub v1.4s, v26.4s, v3.4s +ldr q21, [x0, #576] +mla v30.4S, v24.4S, v31.s[0] +add v26.4s, v26.4s, v3.4s +ldr q3, [x0, #704] +mul v19.4S, v19.4S,v25.s[0] +sub v24.4s, v21.4s, v13.4s +ldr q15, [x0, #592] +mul v23.4S, v23.4S,v25.s[0] +add v21.4s, v21.4s, v13.4s +ldr q13, [x0, #720] +mla v19.4S, v14.4S, v31.s[0] +mla v23.4S, v12.4S, v31.s[0] +sub v12.4s, v15.4s, v29.4s +sqrdmulh v14.4S, v26.4S, v9.s[1] +add v15.4s, v15.4s, v29.4s +mul v26.4S, v26.4S,v2.s[1] +sqrdmulh v29.4S, v1.4S, v9.s[2] +sub v7.4s, v8.4s, v16.4s +mul v1.4S, v1.4S,v2.s[2] +add v8.4s, v8.4s, v16.4s +sqrdmulh v9.4S, v15.4S, v0.s[1] +sub v2.4s, v4.4s, v30.4s +mul v15.4S, v15.4S,v22.s[1] +add v4.4s, v4.4s, v30.4s +sqrdmulh v30.4S, v12.4S, v0.s[2] +sub v16.4s, v3.4s, v19.4s +mul v12.4S, v12.4S,v22.s[2] +add v3.4s, v3.4s, v19.4s +mla v26.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v23.4s +ldr q0, [x0, #992] +sqrdmulh v22.4S, v4.4S, v27.s[1] +add v13.4s, v13.4s, v23.4s +mla v1.4S, v29.4S, v31.s[0] +ldr q29, [x0, #928] +sqrdmulh v23.4S, v2.4S, v27.s[2] +sub v19.4s, v18.4s, v26.4s +mla v15.4S, v9.4S, v31.s[0] +ldr q9, [x0, #800] +sqrdmulh v28.4S, v13.4S, v17.s[1] +add v18.4s, v18.4s, v26.4s +str q19, [x0, #528] +mla v12.4S, v30.4S, v31.s[0] +ldr q30, [x17, #+512] +ldr q19, [x17, #+528] +sqrdmulh v26.4S, v14.4S, v17.s[2] +sub v5.4s, v20.4s, v1.4s +str q18, [x0, #512] +mul v4.4S, v4.4S,v11.s[1] +add v20.4s, v20.4s, v1.4s +mul v2.4S, v2.4S,v11.s[2] +str q5, [x0, #560] +mla v4.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v15.4s +mla v2.4S, v23.4S, v31.s[0] +str q20, [x0, #544] +mul v13.4S, v13.4S,v25.s[1] +str q22, [x0, #592] +mul v14.4S, v14.4S,v25.s[2] +add v21.4s, v21.4s, v15.4s +str q21, [x0, #576] +mla v13.4S, v28.4S, v31.s[0] +sub v28.4s, v24.4s, v12.4s +str q28, [x0, #624] +mla v14.4S, v26.4S, v31.s[0] +add v24.4s, v24.4s, v12.4s +str q24, [x0, #608] +sqrdmulh v17.4S, v9.4S, v19.s[0] +sub 
v25.4s, v8.4s, v4.4s +mul v9.4S, v9.4S,v30.s[0] +str q25, [x0, #656] +ldr q25, [x0, #816] +sqrdmulh v24.4S, v25.4S, v19.s[0] +add v8.4s, v8.4s, v4.4s +mul v25.4S, v25.4S,v30.s[0] +str q8, [x0, #640] +ldr q8, [x17, #+544] +ldr q4, [x17, #+560] +ldr q12, [x0, #864] +sqrdmulh v26.4S, v12.4S, v4.s[0] +sub v28.4s, v7.4s, v2.4s +mul v12.4S, v12.4S,v8.s[0] +str q28, [x0, #688] +ldr q28, [x0, #880] +sqrdmulh v21.4S, v28.4S, v4.s[0] +add v7.4s, v7.4s, v2.4s +mul v28.4S, v28.4S,v8.s[0] +str q7, [x0, #672] +ldr q7, [x17, #+576] +ldr q2, [x17, #+592] +mla v9.4S, v17.4S, v31.s[0] +sub v17.4s, v3.4s, v13.4s +sqrdmulh v15.4S, v29.4S, v2.s[0] +str q17, [x0, #720] +ldr q17, [x0, #944] +mla v25.4S, v24.4S, v31.s[0] +add v3.4s, v3.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v2.s[0] +str q3, [x0, #704] +ldr q3, [x17, #+608] +ldr q24, [x17, #+624] +mla v12.4S, v26.4S, v31.s[0] +sub v26.4s, v16.4s, v14.4s +sqrdmulh v22.4S, v0.4S, v24.s[0] +str q26, [x0, #752] +ldr q26, [x0, #1008] +mla v28.4S, v21.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v26.4S, v24.s[0] +str q16, [x0, #736] +ldr q16, [x0, #768] +ldr q21, [x0, #896] +mul v29.4S, v29.4S,v7.s[0] +sub v27.4s, v16.4s, v9.4s +ldr q11, [x0, #784] +mul v17.4S, v17.4S,v7.s[0] +add v16.4s, v16.4s, v9.4s +ldr q9, [x0, #912] +mla v29.4S, v15.4S, v31.s[0] +sub v15.4s, v11.4s, v25.4s +ldr q20, [x0, #832] +mla v17.4S, v13.4S, v31.s[0] +add v11.4s, v11.4s, v25.4s +ldr q25, [x0, #960] +mul v0.4S, v0.4S,v3.s[0] +sub v13.4s, v20.4s, v12.4s +ldr q23, [x0, #848] +mul v26.4S, v26.4S,v3.s[0] +add v20.4s, v20.4s, v12.4s +ldr q12, [x0, #976] +mla v0.4S, v22.4S, v31.s[0] +mla v26.4S, v14.4S, v31.s[0] +sub v14.4s, v23.4s, v28.4s +sqrdmulh v22.4S, v11.4S, v19.s[1] +add v23.4s, v23.4s, v28.4s +mul v11.4S, v11.4S,v30.s[1] +sqrdmulh v28.4S, v15.4S, v19.s[2] +sub v5.4s, v21.4s, v29.4s +mul v15.4S, v15.4S,v30.s[2] +add v21.4s, v21.4s, v29.4s +sqrdmulh v19.4S, v23.4S, v4.s[1] +sub v30.4s, v9.4s, v17.4s +mul v23.4S, v23.4S,v8.s[1] +add v9.4s, v9.4s, v17.4s 
+sqrdmulh v17.4S, v14.4S, v4.s[2] +sub v29.4s, v25.4s, v0.4s +mul v14.4S, v14.4S,v8.s[2] +add v25.4s, v25.4s, v0.4s +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v12.4s, v26.4s +sqrdmulh v4.4S, v9.4S, v2.s[1] +add v12.4s, v12.4s, v26.4s +mla v15.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v30.4S, v2.s[2] +sub v26.4s, v16.4s, v11.4s +mla v23.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v12.4S, v24.s[1] +add v16.4s, v16.4s, v11.4s +str q26, [x0, #784] +mla v14.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v22.4S, v24.s[2] +sub v26.4s, v27.4s, v15.4s +str q16, [x0, #768] +mul v9.4S, v9.4S,v7.s[1] +add v27.4s, v27.4s, v15.4s +mul v30.4S, v30.4S,v7.s[2] +str q26, [x0, #816] +mla v9.4S, v4.4S, v31.s[0] +sub v4.4s, v20.4s, v23.4s +mla v30.4S, v28.4S, v31.s[0] +str q27, [x0, #800] +mul v12.4S, v12.4S,v3.s[1] +str q4, [x0, #848] +mul v22.4S, v22.4S,v3.s[2] +add v20.4s, v20.4s, v23.4s +str q20, [x0, #832] +mla v12.4S, v19.4S, v31.s[0] +sub v19.4s, v13.4s, v14.4s +str q19, [x0, #880] +mla v22.4S, v17.4S, v31.s[0] +add v13.4s, v13.4s, v14.4s +str q13, [x0, #864] +sub v24.4s, v21.4s, v9.4s +str q24, [x0, #912] +add v21.4s, v21.4s, v9.4s +str q21, [x0, #896] +sub v21.4s, v5.4s, v30.4s +str q21, [x0, #944] +add v5.4s, v5.4s, v30.4s +str q5, [x0, #928] +sub v5.4s, v25.4s, v12.4s +str q5, [x0, #976] +add v25.4s, v25.4s, v12.4s +str q25, [x0, #960] +sub v25.4s, v29.4s, v22.4s +str q25, [x0, #1008] +add v29.4s, v29.4s, v22.4s +str q29, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1548 +// Instruction count: 1544 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_17_z4_7.s 
b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_17_z4_7.s new file mode 100644 index 0000000..83076cb --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_17_z4_7.s @@ -0,0 +1,1558 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global 
ntt_u32_incomplete_neon_asm_var_4_2_17_z4_7 +.global _ntt_u32_incomplete_neon_asm_var_4_2_17_z4_7 +ntt_u32_incomplete_neon_asm_var_4_2_17_z4_7: +_ntt_u32_incomplete_neon_asm_var_4_2_17_z4_7: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x0, #992] +sqrdmulh v27.4S, v28.4S, v29.s[0] +mul v28.4S, v28.4S,v30.s[0] +ldr q26, [x0, #928] +sqrdmulh v25.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +ldr q24, [x0, #864] +sqrdmulh v23.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +ldr q22, [x0, #800] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #736] +mla v28.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v20.4S, v29.s[0] +ldr q19, [x0, #672] +mla v26.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v19.4S, v29.s[0] +ldr q18, [x0, #608] +mla v24.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v18.4S, v29.s[0] +ldr q17, [x0, #544] +mla v22.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v17.4S, v29.s[0] +ldr q16, [x0, #480] +ldr q3, [x0, #416] +mul v20.4S, v20.4S,v30.s[0] +sub v2.4s, v16.4s, v28.4s +mul v19.4S, v19.4S,v30.s[0] +add v16.4s, v16.4s, v28.4s +ldr q28, [x0, #352] +ldr q1, [x0, #288] +mla v20.4S, v27.4S, v31.s[0] +sub v27.4s, v3.4s, v26.4s +mla v19.4S, v25.4S, v31.s[0] +add v3.4s, v3.4s, v26.4s +ldr q26, [x0, #224] +ldr q25, [x0, #160] +mul v18.4S, v18.4S,v30.s[0] +sub v0.4s, v28.4s, v24.4s +mul v17.4S, v17.4S,v30.s[0] +add v28.4s, v28.4s, v24.4s +ldr q24, [x0, #96] +ldr q15, [x0, #32] +mla v18.4S, v23.4S, v31.s[0] +sub v23.4s, v1.4s, v22.4s +mla v17.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v22.4s +sqrdmulh 
v22.4S, v2.4S, v29.s[2] +nop +mul v2.4S, v2.4S,v30.s[2] +nop +sqrdmulh v21.4S, v27.4S, v29.s[2] +sub v14.4s, v26.4s, v20.4s +mul v27.4S, v27.4S,v30.s[2] +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v29.s[1] +sub v13.4s, v25.4s, v19.4s +mul v16.4S, v16.4S,v30.s[1] +add v25.4s, v25.4s, v19.4s +sqrdmulh v19.4S, v3.4S, v29.s[1] +sub v12.4s, v24.4s, v18.4s +mul v3.4S, v3.4S,v30.s[1] +add v24.4s, v24.4s, v18.4s +mla v2.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v17.4s +sqrdmulh v18.4S, v0.4S, v29.s[2] +add v15.4s, v15.4s, v17.4s +mla v27.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v23.4S, v29.s[2] +nop +mla v16.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v28.4S, v29.s[1] +nop +mla v3.4S, v19.4S, v31.s[0] +nop +sqrdmulh v19.4S, v1.4S, v29.s[1] +nop +ldr q17, [x17, #+32] +ldr q11, [x17, #+48] +mul v0.4S, v0.4S,v30.s[2] +sub v10.4s, v14.4s, v2.4s +mul v23.4S, v23.4S,v30.s[2] +add v14.4s, v14.4s, v2.4s +mla v0.4S, v18.4S, v31.s[0] +sub v18.4s, v13.4s, v27.4s +mla v23.4S, v21.4S, v31.s[0] +add v13.4s, v13.4s, v27.4s +mul v28.4S, v28.4S,v30.s[1] +sub v27.4s, v26.4s, v16.4s +mul v1.4S, v1.4S,v30.s[1] +add v26.4s, v26.4s, v16.4s +mla v28.4S, v20.4S, v31.s[0] +sub v20.4s, v25.4s, v3.4s +mla v1.4S, v19.4S, v31.s[0] +add v25.4s, v25.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v11.s[3] +nop +mul v10.4S, v10.4S,v17.s[3] +nop +sqrdmulh v19.4S, v14.4S, v11.s[2] +sub v16.4s, v12.4s, v0.4s +mul v14.4S, v14.4S,v17.s[2] +add v12.4s, v12.4s, v0.4s +sqrdmulh v0.4S, v27.4S, v11.s[1] +sub v21.4s, v22.4s, v23.4s +mul v27.4S, v27.4S,v17.s[1] +add v22.4s, v22.4s, v23.4s +sqrdmulh v23.4S, v26.4S, v11.s[0] +sub v2.4s, v24.4s, v28.4s +mul v26.4S, v26.4S,v17.s[0] +add v24.4s, v24.4s, v28.4s +ldr q28, [x17, #+96] +ldr q9, [x17, #+112] +mla v10.4S, v3.4S, v31.s[0] +sub v3.4s, v15.4s, v1.4s +sqrdmulh v8.4S, v18.4S, v11.s[3] +add v15.4s, v15.4s, v1.4s +mla v14.4S, v19.4S, v31.s[0] +nop +sqrdmulh v19.4S, v13.4S, v11.s[2] +nop +mla v27.4S, v0.4S, v31.s[0] +nop +sqrdmulh v0.4S, v20.4S, v11.s[1] +nop +mla v26.4S, 
v23.4S, v31.s[0] +nop +sqrdmulh v23.4S, v25.4S, v11.s[0] +nop +ldr q1, [x17, #+64] +ldr q7, [x17, #+80] +mul v18.4S, v18.4S,v17.s[3] +sub v6.4s, v16.4s, v10.4s +mul v13.4S, v13.4S,v17.s[2] +add v16.4s, v16.4s, v10.4s +mla v18.4S, v8.4S, v31.s[0] +sub v8.4s, v12.4s, v14.4s +mla v13.4S, v19.4S, v31.s[0] +add v12.4s, v12.4s, v14.4s +mul v20.4S, v20.4S,v17.s[1] +sub v14.4s, v2.4s, v27.4s +mul v25.4S, v25.4S,v17.s[0] +add v2.4s, v2.4s, v27.4s +mla v20.4S, v0.4S, v31.s[0] +sub v0.4s, v24.4s, v26.4s +mla v25.4S, v23.4S, v31.s[0] +add v24.4s, v24.4s, v26.4s +sqrdmulh v26.4S, v6.4S, v9.s[3] +nop +mul v6.4S, v6.4S,v28.s[3] +nop +sqrdmulh v23.4S, v16.4S, v9.s[2] +sub v27.4s, v21.4s, v18.4s +mul v16.4S, v16.4S,v28.s[2] +add v21.4s, v21.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v9.s[1] +sub v19.4s, v22.4s, v13.4s +mul v8.4S, v8.4S,v28.s[1] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v12.4S, v9.s[0] +sub v10.4s, v3.4s, v20.4s +mul v12.4S, v12.4S,v28.s[0] +add v3.4s, v3.4s, v20.4s +mla v6.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v25.4s +sqrdmulh v20.4S, v14.4S, v7.s[3] +add v15.4s, v15.4s, v25.4s +mla v16.4S, v23.4S, v31.s[0] +sub v23.4s, v27.4s, v6.4s +sqrdmulh v25.4S, v2.4S, v7.s[2] +add v27.4s, v27.4s, v6.4s +mla v8.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v16.4s +sqrdmulh v6.4S, v0.4S, v7.s[1] +add v21.4s, v21.4s, v16.4s +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v19.4s, v8.4s +sqrdmulh v16.4S, v24.4S, v7.s[0] +add v19.4s, v19.4s, v8.4s +mul v14.4S, v14.4S,v1.s[3] +sub v8.4s, v22.4s, v12.4s +mul v2.4S, v2.4S,v1.s[2] +add v22.4s, v22.4s, v12.4s +mla v14.4S, v20.4S, v31.s[0] +str q23, [x0, #992] +mla v2.4S, v25.4S, v31.s[0] +str q27, [x0, #928] +mul v0.4S, v0.4S,v1.s[1] +str q18, [x0, #864] +mul v24.4S, v24.4S,v1.s[0] +str q21, [x0, #800] +mla v0.4S, v6.4S, v31.s[0] +str q13, [x0, #736] +mla v24.4S, v16.4S, v31.s[0] +str q19, [x0, #672] +ldr q19, [x0, #1008] +sqrdmulh v16.4S, v19.4S, v29.s[0] +str q8, [x0, #608] +mul v19.4S, v19.4S,v30.s[0] +sub v8.4s, v10.4s, v14.4s +ldr q13, 
[x0, #944] +sqrdmulh v6.4S, v13.4S, v29.s[0] +str q22, [x0, #544] +mul v13.4S, v13.4S,v30.s[0] +add v10.4s, v10.4s, v14.4s +ldr q14, [x0, #880] +sqrdmulh v22.4S, v14.4S, v29.s[0] +str q8, [x0, #480] +mul v14.4S, v14.4S,v30.s[0] +sub v8.4s, v3.4s, v2.4s +ldr q21, [x0, #816] +sqrdmulh v18.4S, v21.4S, v29.s[0] +str q10, [x0, #416] +mul v21.4S, v21.4S,v30.s[0] +add v3.4s, v3.4s, v2.4s +ldr q2, [x0, #752] +mla v19.4S, v16.4S, v31.s[0] +str q8, [x0, #352] +sqrdmulh v8.4S, v2.4S, v29.s[0] +sub v16.4s, v26.4s, v0.4s +ldr q10, [x0, #688] +mla v13.4S, v6.4S, v31.s[0] +str q3, [x0, #288] +sqrdmulh v3.4S, v10.4S, v29.s[0] +add v26.4s, v26.4s, v0.4s +ldr q0, [x0, #624] +mla v14.4S, v22.4S, v31.s[0] +str q16, [x0, #224] +sqrdmulh v16.4S, v0.4S, v29.s[0] +sub v22.4s, v15.4s, v24.4s +ldr q6, [x0, #560] +mla v21.4S, v18.4S, v31.s[0] +str q26, [x0, #160] +sqrdmulh v26.4S, v6.4S, v29.s[0] +add v15.4s, v15.4s, v24.4s +ldr q24, [x0, #496] +ldr q18, [x0, #432] +mul v2.4S, v2.4S,v30.s[0] +sub v27.4s, v24.4s, v19.4s +mul v10.4S, v10.4S,v30.s[0] +add v24.4s, v24.4s, v19.4s +ldr q19, [x0, #368] +ldr q25, [x0, #304] +mla v2.4S, v8.4S, v31.s[0] +sub v8.4s, v18.4s, v13.4s +mla v10.4S, v3.4S, v31.s[0] +add v18.4s, v18.4s, v13.4s +ldr q13, [x0, #240] +ldr q3, [x0, #176] +mul v0.4S, v0.4S,v30.s[0] +sub v23.4s, v19.4s, v14.4s +mul v6.4S, v6.4S,v30.s[0] +add v19.4s, v19.4s, v14.4s +ldr q14, [x0, #112] +ldr q20, [x0, #48] +mla v0.4S, v16.4S, v31.s[0] +sub v16.4s, v25.4s, v21.4s +mla v6.4S, v26.4S, v31.s[0] +add v25.4s, v25.4s, v21.4s +sqrdmulh v21.4S, v27.4S, v29.s[2] +nop +mul v27.4S, v27.4S,v30.s[2] +nop +sqrdmulh v26.4S, v8.4S, v29.s[2] +sub v12.4s, v13.4s, v2.4s +mul v8.4S, v8.4S,v30.s[2] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v24.4S, v29.s[1] +sub v5.4s, v3.4s, v10.4s +mul v24.4S, v24.4S,v30.s[1] +add v3.4s, v3.4s, v10.4s +sqrdmulh v10.4S, v18.4S, v29.s[1] +sub v4.4s, v14.4s, v0.4s +mul v18.4S, v18.4S,v30.s[1] +add v14.4s, v14.4s, v0.4s +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, 
v6.4s +sqrdmulh v0.4S, v23.4S, v29.s[2] +add v20.4s, v20.4s, v6.4s +mla v8.4S, v26.4S, v31.s[0] +str q22, [x0, #96] +sqrdmulh v22.4S, v16.4S, v29.s[2] +nop +mla v24.4S, v2.4S, v31.s[0] +str q15, [x0, #32] +sqrdmulh v15.4S, v19.4S, v29.s[1] +nop +mla v18.4S, v10.4S, v31.s[0] +nop +sqrdmulh v10.4S, v25.4S, v29.s[1] +nop +mul v23.4S, v23.4S,v30.s[2] +sub v2.4s, v12.4s, v27.4s +mul v16.4S, v16.4S,v30.s[2] +add v12.4s, v12.4s, v27.4s +mla v23.4S, v0.4S, v31.s[0] +sub v0.4s, v5.4s, v8.4s +mla v16.4S, v22.4S, v31.s[0] +add v5.4s, v5.4s, v8.4s +mul v19.4S, v19.4S,v30.s[1] +sub v8.4s, v13.4s, v24.4s +mul v25.4S, v25.4S,v30.s[1] +add v13.4s, v13.4s, v24.4s +mla v19.4S, v15.4S, v31.s[0] +sub v15.4s, v3.4s, v18.4s +mla v25.4S, v10.4S, v31.s[0] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v11.s[3] +nop +mul v2.4S, v2.4S,v17.s[3] +nop +sqrdmulh v10.4S, v12.4S, v11.s[2] +sub v24.4s, v4.4s, v23.4s +mul v12.4S, v12.4S,v17.s[2] +add v4.4s, v4.4s, v23.4s +sqrdmulh v23.4S, v8.4S, v11.s[1] +sub v22.4s, v21.4s, v16.4s +mul v8.4S, v8.4S,v17.s[1] +add v21.4s, v21.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v11.s[0] +sub v27.4s, v14.4s, v19.4s +mul v13.4S, v13.4S,v17.s[0] +add v14.4s, v14.4s, v19.4s +mla v2.4S, v18.4S, v31.s[0] +sub v18.4s, v20.4s, v25.4s +sqrdmulh v19.4S, v0.4S, v11.s[3] +add v20.4s, v20.4s, v25.4s +mla v12.4S, v10.4S, v31.s[0] +nop +sqrdmulh v10.4S, v5.4S, v11.s[2] +nop +mla v8.4S, v23.4S, v31.s[0] +nop +sqrdmulh v23.4S, v15.4S, v11.s[1] +nop +mla v13.4S, v16.4S, v31.s[0] +nop +sqrdmulh v16.4S, v3.4S, v11.s[0] +nop +mul v0.4S, v0.4S,v17.s[3] +sub v25.4s, v24.4s, v2.4s +mul v5.4S, v5.4S,v17.s[2] +add v24.4s, v24.4s, v2.4s +mla v0.4S, v19.4S, v31.s[0] +sub v19.4s, v4.4s, v12.4s +mla v5.4S, v10.4S, v31.s[0] +add v4.4s, v4.4s, v12.4s +mul v15.4S, v15.4S,v17.s[1] +sub v12.4s, v27.4s, v8.4s +mul v3.4S, v3.4S,v17.s[0] +add v27.4s, v27.4s, v8.4s +mla v15.4S, v23.4S, v31.s[0] +sub v23.4s, v14.4s, v13.4s +mla v3.4S, v16.4S, v31.s[0] +add v14.4s, v14.4s, v13.4s +sqrdmulh v13.4S, 
v25.4S, v9.s[3] +nop +mul v25.4S, v25.4S,v28.s[3] +nop +sqrdmulh v16.4S, v24.4S, v9.s[2] +sub v8.4s, v22.4s, v0.4s +mul v24.4S, v24.4S,v28.s[2] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v19.4S, v9.s[1] +sub v10.4s, v21.4s, v5.4s +mul v19.4S, v19.4S,v28.s[1] +add v21.4s, v21.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v9.s[0] +sub v2.4s, v18.4s, v15.4s +mul v4.4S, v4.4S,v28.s[0] +add v18.4s, v18.4s, v15.4s +mla v25.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v3.4s +sqrdmulh v15.4S, v12.4S, v7.s[3] +add v20.4s, v20.4s, v3.4s +mla v24.4S, v16.4S, v31.s[0] +sub v16.4s, v8.4s, v25.4s +sqrdmulh v3.4S, v27.4S, v7.s[2] +add v8.4s, v8.4s, v25.4s +mla v19.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v24.4s +sqrdmulh v25.4S, v23.4S, v7.s[1] +add v22.4s, v22.4s, v24.4s +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v10.4s, v19.4s +sqrdmulh v24.4S, v14.4S, v7.s[0] +add v10.4s, v10.4s, v19.4s +mul v12.4S, v12.4S,v1.s[3] +sub v19.4s, v21.4s, v4.4s +mul v27.4S, v27.4S,v1.s[2] +add v21.4s, v21.4s, v4.4s +mla v12.4S, v15.4S, v31.s[0] +str q16, [x0, #1008] +mla v27.4S, v3.4S, v31.s[0] +str q8, [x0, #944] +mul v23.4S, v23.4S,v1.s[1] +str q0, [x0, #880] +mul v14.4S, v14.4S,v1.s[0] +str q22, [x0, #816] +mla v23.4S, v25.4S, v31.s[0] +str q5, [x0, #752] +mla v14.4S, v24.4S, v31.s[0] +str q10, [x0, #688] +ldr q10, [x0, #960] +sqrdmulh v24.4S, v10.4S, v29.s[0] +str q19, [x0, #624] +mul v10.4S, v10.4S,v30.s[0] +sub v19.4s, v2.4s, v12.4s +ldr q5, [x0, #896] +sqrdmulh v25.4S, v5.4S, v29.s[0] +str q21, [x0, #560] +mul v5.4S, v5.4S,v30.s[0] +add v2.4s, v2.4s, v12.4s +ldr q12, [x0, #832] +sqrdmulh v21.4S, v12.4S, v29.s[0] +str q19, [x0, #496] +mul v12.4S, v12.4S,v30.s[0] +sub v19.4s, v18.4s, v27.4s +ldr q22, [x0, #768] +sqrdmulh v0.4S, v22.4S, v29.s[0] +str q2, [x0, #432] +mul v22.4S, v22.4S,v30.s[0] +add v18.4s, v18.4s, v27.4s +ldr q27, [x0, #704] +mla v10.4S, v24.4S, v31.s[0] +str q19, [x0, #368] +sqrdmulh v19.4S, v27.4S, v29.s[0] +sub v24.4s, v13.4s, v23.4s +ldr q2, [x0, #640] +mla v5.4S, v25.4S, v31.s[0] +str q18, 
[x0, #304] +sqrdmulh v18.4S, v2.4S, v29.s[0] +add v13.4s, v13.4s, v23.4s +ldr q23, [x0, #576] +mla v12.4S, v21.4S, v31.s[0] +str q24, [x0, #240] +sqrdmulh v24.4S, v23.4S, v29.s[0] +sub v21.4s, v20.4s, v14.4s +ldr q25, [x0, #512] +mla v22.4S, v0.4S, v31.s[0] +str q13, [x0, #176] +sqrdmulh v13.4S, v25.4S, v29.s[0] +add v20.4s, v20.4s, v14.4s +ldr q14, [x0, #448] +ldr q0, [x0, #384] +mul v27.4S, v27.4S,v30.s[0] +sub v8.4s, v14.4s, v10.4s +mul v2.4S, v2.4S,v30.s[0] +add v14.4s, v14.4s, v10.4s +ldr q10, [x0, #320] +ldr q3, [x0, #256] +mla v27.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v5.4s +mla v2.4S, v18.4S, v31.s[0] +add v0.4s, v0.4s, v5.4s +ldr q5, [x0, #192] +ldr q18, [x0, #128] +mul v23.4S, v23.4S,v30.s[0] +sub v16.4s, v10.4s, v12.4s +mul v25.4S, v25.4S,v30.s[0] +add v10.4s, v10.4s, v12.4s +ldr q12, [x0, #64] +ldr q15, [x0, #0] +mla v23.4S, v24.4S, v31.s[0] +sub v24.4s, v3.4s, v22.4s +mla v25.4S, v13.4S, v31.s[0] +add v3.4s, v3.4s, v22.4s +sqrdmulh v22.4S, v8.4S, v29.s[2] +nop +mul v8.4S, v8.4S,v30.s[2] +nop +sqrdmulh v13.4S, v19.4S, v29.s[2] +sub v4.4s, v5.4s, v27.4s +mul v19.4S, v19.4S,v30.s[2] +add v5.4s, v5.4s, v27.4s +sqrdmulh v27.4S, v14.4S, v29.s[1] +sub v26.4s, v18.4s, v2.4s +mul v14.4S, v14.4S,v30.s[1] +add v18.4s, v18.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v29.s[1] +sub v6.4s, v12.4s, v23.4s +mul v0.4S, v0.4S,v30.s[1] +add v12.4s, v12.4s, v23.4s +mla v8.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v25.4s +sqrdmulh v23.4S, v16.4S, v29.s[2] +add v15.4s, v15.4s, v25.4s +mla v19.4S, v13.4S, v31.s[0] +str q21, [x0, #112] +sqrdmulh v21.4S, v24.4S, v29.s[2] +nop +mla v14.4S, v27.4S, v31.s[0] +str q20, [x0, #48] +sqrdmulh v20.4S, v10.4S, v29.s[1] +nop +mla v0.4S, v2.4S, v31.s[0] +nop +sqrdmulh v2.4S, v3.4S, v29.s[1] +nop +mul v16.4S, v16.4S,v30.s[2] +sub v27.4s, v4.4s, v8.4s +mul v24.4S, v24.4S,v30.s[2] +add v4.4s, v4.4s, v8.4s +mla v16.4S, v23.4S, v31.s[0] +sub v23.4s, v26.4s, v19.4s +mla v24.4S, v21.4S, v31.s[0] +add v26.4s, v26.4s, v19.4s +mul v10.4S, v10.4S,v30.s[1] 
+sub v19.4s, v5.4s, v14.4s +mul v3.4S, v3.4S,v30.s[1] +add v5.4s, v5.4s, v14.4s +mla v10.4S, v20.4S, v31.s[0] +sub v20.4s, v18.4s, v0.4s +mla v3.4S, v2.4S, v31.s[0] +add v18.4s, v18.4s, v0.4s +sqrdmulh v0.4S, v27.4S, v11.s[3] +nop +mul v27.4S, v27.4S,v17.s[3] +nop +sqrdmulh v2.4S, v4.4S, v11.s[2] +sub v14.4s, v6.4s, v16.4s +mul v4.4S, v4.4S,v17.s[2] +add v6.4s, v6.4s, v16.4s +sqrdmulh v16.4S, v19.4S, v11.s[1] +sub v21.4s, v22.4s, v24.4s +mul v19.4S, v19.4S,v17.s[1] +add v22.4s, v22.4s, v24.4s +sqrdmulh v24.4S, v5.4S, v11.s[0] +sub v8.4s, v12.4s, v10.4s +mul v5.4S, v5.4S,v17.s[0] +add v12.4s, v12.4s, v10.4s +mla v27.4S, v0.4S, v31.s[0] +sub v0.4s, v15.4s, v3.4s +sqrdmulh v10.4S, v23.4S, v11.s[3] +add v15.4s, v15.4s, v3.4s +mla v4.4S, v2.4S, v31.s[0] +nop +sqrdmulh v2.4S, v26.4S, v11.s[2] +nop +mla v19.4S, v16.4S, v31.s[0] +nop +sqrdmulh v16.4S, v20.4S, v11.s[1] +nop +mla v5.4S, v24.4S, v31.s[0] +nop +sqrdmulh v24.4S, v18.4S, v11.s[0] +nop +mul v23.4S, v23.4S,v17.s[3] +sub v3.4s, v14.4s, v27.4s +mul v26.4S, v26.4S,v17.s[2] +add v14.4s, v14.4s, v27.4s +mla v23.4S, v10.4S, v31.s[0] +sub v10.4s, v6.4s, v4.4s +mla v26.4S, v2.4S, v31.s[0] +add v6.4s, v6.4s, v4.4s +mul v20.4S, v20.4S,v17.s[1] +sub v4.4s, v8.4s, v19.4s +mul v18.4S, v18.4S,v17.s[0] +add v8.4s, v8.4s, v19.4s +mla v20.4S, v16.4S, v31.s[0] +sub v16.4s, v12.4s, v5.4s +mla v18.4S, v24.4S, v31.s[0] +add v12.4s, v12.4s, v5.4s +sqrdmulh v5.4S, v3.4S, v9.s[3] +nop +mul v3.4S, v3.4S,v28.s[3] +nop +sqrdmulh v24.4S, v14.4S, v9.s[2] +sub v19.4s, v21.4s, v23.4s +mul v14.4S, v14.4S,v28.s[2] +add v21.4s, v21.4s, v23.4s +sqrdmulh v23.4S, v10.4S, v9.s[1] +sub v2.4s, v22.4s, v26.4s +mul v10.4S, v10.4S,v28.s[1] +add v22.4s, v22.4s, v26.4s +sqrdmulh v26.4S, v6.4S, v9.s[0] +sub v27.4s, v0.4s, v20.4s +mul v6.4S, v6.4S,v28.s[0] +add v0.4s, v0.4s, v20.4s +mla v3.4S, v5.4S, v31.s[0] +sub v5.4s, v15.4s, v18.4s +sqrdmulh v20.4S, v4.4S, v7.s[3] +add v15.4s, v15.4s, v18.4s +mla v14.4S, v24.4S, v31.s[0] +sub v24.4s, v19.4s, v3.4s 
+sqrdmulh v18.4S, v8.4S, v7.s[2] +add v19.4s, v19.4s, v3.4s +mla v10.4S, v23.4S, v31.s[0] +sub v23.4s, v21.4s, v14.4s +sqrdmulh v3.4S, v16.4S, v7.s[1] +add v21.4s, v21.4s, v14.4s +mla v6.4S, v26.4S, v31.s[0] +sub v26.4s, v2.4s, v10.4s +sqrdmulh v14.4S, v12.4S, v7.s[0] +add v2.4s, v2.4s, v10.4s +mul v4.4S, v4.4S,v1.s[3] +sub v10.4s, v22.4s, v6.4s +mul v8.4S, v8.4S,v1.s[2] +add v22.4s, v22.4s, v6.4s +mla v4.4S, v20.4S, v31.s[0] +str q24, [x0, #960] +mla v8.4S, v18.4S, v31.s[0] +str q19, [x0, #896] +mul v16.4S, v16.4S,v1.s[1] +str q23, [x0, #832] +mul v12.4S, v12.4S,v1.s[0] +str q21, [x0, #768] +mla v16.4S, v3.4S, v31.s[0] +str q26, [x0, #704] +mla v12.4S, v14.4S, v31.s[0] +str q2, [x0, #640] +ldr q2, [x0, #976] +sqrdmulh v14.4S, v2.4S, v29.s[0] +str q10, [x0, #576] +mul v2.4S, v2.4S,v30.s[0] +sub v10.4s, v27.4s, v4.4s +ldr q26, [x0, #912] +sqrdmulh v3.4S, v26.4S, v29.s[0] +str q22, [x0, #512] +mul v26.4S, v26.4S,v30.s[0] +add v27.4s, v27.4s, v4.4s +ldr q4, [x0, #848] +sqrdmulh v22.4S, v4.4S, v29.s[0] +str q10, [x0, #448] +mul v4.4S, v4.4S,v30.s[0] +sub v10.4s, v0.4s, v8.4s +ldr q21, [x0, #784] +sqrdmulh v23.4S, v21.4S, v29.s[0] +str q27, [x0, #384] +mul v21.4S, v21.4S,v30.s[0] +add v0.4s, v0.4s, v8.4s +ldr q8, [x0, #720] +mla v2.4S, v14.4S, v31.s[0] +str q10, [x0, #320] +sqrdmulh v10.4S, v8.4S, v29.s[0] +sub v14.4s, v5.4s, v16.4s +ldr q27, [x0, #656] +mla v26.4S, v3.4S, v31.s[0] +str q0, [x0, #256] +sqrdmulh v0.4S, v27.4S, v29.s[0] +add v5.4s, v5.4s, v16.4s +ldr q16, [x0, #592] +mla v4.4S, v22.4S, v31.s[0] +str q14, [x0, #192] +sqrdmulh v14.4S, v16.4S, v29.s[0] +sub v22.4s, v15.4s, v12.4s +ldr q3, [x0, #528] +mla v21.4S, v23.4S, v31.s[0] +str q5, [x0, #128] +sqrdmulh v5.4S, v3.4S, v29.s[0] +add v15.4s, v15.4s, v12.4s +ldr q12, [x0, #464] +ldr q23, [x0, #400] +mul v8.4S, v8.4S,v30.s[0] +sub v19.4s, v12.4s, v2.4s +mul v27.4S, v27.4S,v30.s[0] +add v12.4s, v12.4s, v2.4s +ldr q2, [x0, #336] +ldr q18, [x0, #272] +mla v8.4S, v10.4S, v31.s[0] +sub v10.4s, v23.4s, v26.4s +mla 
v27.4S, v0.4S, v31.s[0] +add v23.4s, v23.4s, v26.4s +ldr q26, [x0, #208] +ldr q0, [x0, #144] +mul v16.4S, v16.4S,v30.s[0] +sub v24.4s, v2.4s, v4.4s +mul v3.4S, v3.4S,v30.s[0] +add v2.4s, v2.4s, v4.4s +ldr q4, [x0, #80] +ldr q20, [x0, #16] +mla v16.4S, v14.4S, v31.s[0] +sub v14.4s, v18.4s, v21.4s +mla v3.4S, v5.4S, v31.s[0] +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v29.s[2] +nop +mul v19.4S, v19.4S,v30.s[2] +nop +sqrdmulh v5.4S, v10.4S, v29.s[2] +sub v6.4s, v26.4s, v8.4s +mul v10.4S, v10.4S,v30.s[2] +add v26.4s, v26.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v29.s[1] +sub v13.4s, v0.4s, v27.4s +mul v12.4S, v12.4S,v30.s[1] +add v0.4s, v0.4s, v27.4s +sqrdmulh v27.4S, v23.4S, v29.s[1] +sub v25.4s, v4.4s, v16.4s +mul v23.4S, v23.4S,v30.s[1] +add v4.4s, v4.4s, v16.4s +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v3.4s +sqrdmulh v16.4S, v24.4S, v29.s[2] +add v20.4s, v20.4s, v3.4s +mla v10.4S, v5.4S, v31.s[0] +str q22, [x0, #64] +sqrdmulh v22.4S, v14.4S, v29.s[2] +nop +mla v12.4S, v8.4S, v31.s[0] +str q15, [x0, #0] +sqrdmulh v15.4S, v2.4S, v29.s[1] +nop +mla v23.4S, v27.4S, v31.s[0] +nop +sqrdmulh v27.4S, v18.4S, v29.s[1] +nop +mul v24.4S, v24.4S,v30.s[2] +sub v8.4s, v6.4s, v19.4s +mul v14.4S, v14.4S,v30.s[2] +add v6.4s, v6.4s, v19.4s +mla v24.4S, v16.4S, v31.s[0] +sub v16.4s, v13.4s, v10.4s +mla v14.4S, v22.4S, v31.s[0] +add v13.4s, v13.4s, v10.4s +mul v2.4S, v2.4S,v30.s[1] +sub v10.4s, v26.4s, v12.4s +mul v18.4S, v18.4S,v30.s[1] +add v26.4s, v26.4s, v12.4s +mla v2.4S, v15.4S, v31.s[0] +sub v15.4s, v0.4s, v23.4s +mla v18.4S, v27.4S, v31.s[0] +add v0.4s, v0.4s, v23.4s +sqrdmulh v29.4S, v8.4S, v11.s[3] +nop +mul v8.4S, v8.4S,v17.s[3] +nop +sqrdmulh v30.4S, v6.4S, v11.s[2] +sub v23.4s, v25.4s, v24.4s +mul v6.4S, v6.4S,v17.s[2] +add v25.4s, v25.4s, v24.4s +sqrdmulh v24.4S, v10.4S, v11.s[1] +sub v27.4s, v21.4s, v14.4s +mul v10.4S, v10.4S,v17.s[1] +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v26.4S, v11.s[0] +sub v12.4s, v4.4s, v2.4s +mul v26.4S, v26.4S,v17.s[0] +add 
v4.4s, v4.4s, v2.4s +mla v8.4S, v29.4S, v31.s[0] +sub v29.4s, v20.4s, v18.4s +sqrdmulh v2.4S, v16.4S, v11.s[3] +add v20.4s, v20.4s, v18.4s +mla v6.4S, v30.4S, v31.s[0] +nop +sqrdmulh v30.4S, v13.4S, v11.s[2] +nop +mla v10.4S, v24.4S, v31.s[0] +nop +sqrdmulh v24.4S, v15.4S, v11.s[1] +nop +mla v26.4S, v14.4S, v31.s[0] +nop +sqrdmulh v14.4S, v0.4S, v11.s[0] +nop +mul v16.4S, v16.4S,v17.s[3] +sub v18.4s, v23.4s, v8.4s +mul v13.4S, v13.4S,v17.s[2] +add v23.4s, v23.4s, v8.4s +mla v16.4S, v2.4S, v31.s[0] +sub v2.4s, v25.4s, v6.4s +mla v13.4S, v30.4S, v31.s[0] +add v25.4s, v25.4s, v6.4s +mul v15.4S, v15.4S,v17.s[1] +sub v6.4s, v12.4s, v10.4s +mul v0.4S, v0.4S,v17.s[0] +add v12.4s, v12.4s, v10.4s +mla v15.4S, v24.4S, v31.s[0] +sub v24.4s, v4.4s, v26.4s +mla v0.4S, v14.4S, v31.s[0] +add v4.4s, v4.4s, v26.4s +sqrdmulh v11.4S, v18.4S, v9.s[3] +nop +mul v18.4S, v18.4S,v28.s[3] +nop +sqrdmulh v17.4S, v23.4S, v9.s[2] +sub v26.4s, v27.4s, v16.4s +mul v23.4S, v23.4S,v28.s[2] +add v27.4s, v27.4s, v16.4s +sqrdmulh v16.4S, v2.4S, v9.s[1] +sub v14.4s, v21.4s, v13.4s +mul v2.4S, v2.4S,v28.s[1] +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v25.4S, v9.s[0] +sub v10.4s, v29.4s, v15.4s +mul v25.4S, v25.4S,v28.s[0] +add v29.4s, v29.4s, v15.4s +mla v18.4S, v11.4S, v31.s[0] +sub v11.4s, v20.4s, v0.4s +sqrdmulh v9.4S, v6.4S, v7.s[3] +add v20.4s, v20.4s, v0.4s +mla v23.4S, v17.4S, v31.s[0] +sub v17.4s, v26.4s, v18.4s +sqrdmulh v0.4S, v12.4S, v7.s[2] +add v26.4s, v26.4s, v18.4s +mla v2.4S, v16.4S, v31.s[0] +sub v16.4s, v27.4s, v23.4s +sqrdmulh v18.4S, v24.4S, v7.s[1] +add v27.4s, v27.4s, v23.4s +mla v25.4S, v13.4S, v31.s[0] +sub v13.4s, v14.4s, v2.4s +sqrdmulh v23.4S, v4.4S, v7.s[0] +add v14.4s, v14.4s, v2.4s +mul v6.4S, v6.4S,v1.s[3] +sub v2.4s, v21.4s, v25.4s +mul v12.4S, v12.4S,v1.s[2] +add v21.4s, v21.4s, v25.4s +mla v6.4S, v9.4S, v31.s[0] +str q17, [x0, #976] +mla v12.4S, v0.4S, v31.s[0] +str q26, [x0, #912] +mul v24.4S, v24.4S,v1.s[1] +str q16, [x0, #848] +mul v4.4S, v4.4S,v1.s[0] +str q27, 
[x0, #784] +mla v24.4S, v18.4S, v31.s[0] +str q13, [x0, #720] +mla v4.4S, v23.4S, v31.s[0] +str q14, [x0, #656] +str q2, [x0, #592] +sub v2.4s, v10.4s, v6.4s +str q21, [x0, #528] +add v10.4s, v10.4s, v6.4s +str q2, [x0, #464] +sub v2.4s, v29.4s, v12.4s +str q10, [x0, #400] +add v29.4s, v29.4s, v12.4s +str q2, [x0, #336] +sub v2.4s, v11.4s, v24.4s +str q29, [x0, #272] +add v11.4s, v11.4s, v24.4s +str q2, [x0, #208] +sub v2.4s, v20.4s, v4.4s +str q11, [x0, #144] +add v20.4s, v20.4s, v4.4s +str q2, [x0, #80] +str q20, [x0, #16] +ldr q3, [x0, #224] +ldr q5, [x0, #160] +ldr q19, [x0, #32] +ldr q22, [x17, #+128] +ldr q8, [x17, #+144] +sqrdmulh v30.4S, v19.4S, v8.s[0] +mul v19.4S, v19.4S,v22.s[0] +ldr q15, [x0, #48] +sqrdmulh v28.4S, v15.4S, v8.s[0] +mul v15.4S, v15.4S,v22.s[0] +ldr q25, [x17, #+160] +ldr q9, [x17, #+176] +ldr q17, [x0, #96] +sqrdmulh v0.4S, v17.4S, v9.s[0] +mul v17.4S, v17.4S,v25.s[0] +ldr q26, [x0, #112] +sqrdmulh v16.4S, v26.4S, v9.s[0] +mul v26.4S, v26.4S,v25.s[0] +ldr q27, [x17, #+192] +ldr q18, [x17, #+208] +mla v19.4S, v30.4S, v31.s[0] +sqrdmulh v30.4S, v5.4S, v18.s[0] +ldr q13, [x0, #176] +mla v15.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v13.4S, v18.s[0] +ldr q23, [x17, #+224] +ldr q14, [x17, #+240] +mla v17.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v3.4S, v14.s[0] +ldr q1, [x0, #240] +mla v26.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v1.4S, v14.s[0] +ldr q7, [x0, #0] +ldr q21, [x0, #128] +mul v5.4S, v5.4S,v27.s[0] +sub v6.4s, v7.4s, v19.4s +ldr q10, [x0, #16] +mul v13.4S, v13.4S,v27.s[0] +add v7.4s, v7.4s, v19.4s +ldr q19, [x0, #144] +mla v5.4S, v30.4S, v31.s[0] +sub v30.4s, v10.4s, v15.4s +ldr q12, [x0, #64] +mla v13.4S, v28.4S, v31.s[0] +add v10.4s, v10.4s, v15.4s +ldr q15, [x0, #192] +mul v3.4S, v3.4S,v23.s[0] +sub v28.4s, v12.4s, v17.4s +ldr q29, [x0, #80] +mul v1.4S, v1.4S,v23.s[0] +add v12.4s, v12.4s, v17.4s +ldr q17, [x0, #208] +mla v3.4S, v0.4S, v31.s[0] +mla v1.4S, v16.4S, v31.s[0] +sub v16.4s, v29.4s, v26.4s +sqrdmulh v0.4S, v10.4S, v8.s[1] +add 
v29.4s, v29.4s, v26.4s +mul v10.4S, v10.4S,v22.s[1] +sqrdmulh v26.4S, v30.4S, v8.s[2] +sub v24.4s, v21.4s, v5.4s +mul v30.4S, v30.4S,v22.s[2] +add v21.4s, v21.4s, v5.4s +sqrdmulh v8.4S, v29.4S, v9.s[1] +sub v22.4s, v19.4s, v13.4s +mul v29.4S, v29.4S,v25.s[1] +add v19.4s, v19.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v9.s[2] +sub v5.4s, v15.4s, v3.4s +mul v16.4S, v16.4S,v25.s[2] +add v15.4s, v15.4s, v3.4s +mla v10.4S, v0.4S, v31.s[0] +sub v0.4s, v17.4s, v1.4s +ldr q9, [x0, #480] +sqrdmulh v25.4S, v19.4S, v18.s[1] +add v17.4s, v17.4s, v1.4s +mla v30.4S, v26.4S, v31.s[0] +ldr q26, [x0, #416] +sqrdmulh v1.4S, v22.4S, v18.s[2] +sub v3.4s, v7.4s, v10.4s +mla v29.4S, v8.4S, v31.s[0] +ldr q8, [x0, #288] +sqrdmulh v11.4S, v17.4S, v14.s[1] +add v7.4s, v7.4s, v10.4s +str q3, [x0, #16] +mla v16.4S, v13.4S, v31.s[0] +ldr q13, [x17, #+256] +ldr q3, [x17, #+272] +sqrdmulh v10.4S, v0.4S, v14.s[2] +sub v4.4s, v6.4s, v30.4s +str q7, [x0, #0] +mul v19.4S, v19.4S,v27.s[1] +add v6.4s, v6.4s, v30.4s +mul v22.4S, v22.4S,v27.s[2] +str q4, [x0, #48] +mla v19.4S, v25.4S, v31.s[0] +sub v25.4s, v12.4s, v29.4s +mla v22.4S, v1.4S, v31.s[0] +str q6, [x0, #32] +mul v17.4S, v17.4S,v23.s[1] +str q25, [x0, #80] +mul v0.4S, v0.4S,v23.s[2] +add v12.4s, v12.4s, v29.4s +str q12, [x0, #64] +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v28.4s, v16.4s +str q11, [x0, #112] +mla v0.4S, v10.4S, v31.s[0] +add v28.4s, v28.4s, v16.4s +str q28, [x0, #96] +sqrdmulh v14.4S, v8.4S, v3.s[0] +sub v23.4s, v21.4s, v19.4s +mul v8.4S, v8.4S,v13.s[0] +str q23, [x0, #144] +ldr q23, [x0, #304] +sqrdmulh v28.4S, v23.4S, v3.s[0] +add v21.4s, v21.4s, v19.4s +mul v23.4S, v23.4S,v13.s[0] +str q21, [x0, #128] +ldr q21, [x17, #+288] +ldr q19, [x17, #+304] +ldr q16, [x0, #352] +sqrdmulh v10.4S, v16.4S, v19.s[0] +sub v11.4s, v24.4s, v22.4s +mul v16.4S, v16.4S,v21.s[0] +str q11, [x0, #176] +ldr q11, [x0, #368] +sqrdmulh v12.4S, v11.4S, v19.s[0] +add v24.4s, v24.4s, v22.4s +mul v11.4S, v11.4S,v21.s[0] +str q24, [x0, #160] +ldr q24, [x17, 
#+320] +ldr q22, [x17, #+336] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v15.4s, v17.4s +sqrdmulh v29.4S, v26.4S, v22.s[0] +str q14, [x0, #208] +ldr q14, [x0, #432] +mla v23.4S, v28.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v22.s[0] +str q15, [x0, #192] +ldr q15, [x17, #+352] +ldr q28, [x17, #+368] +mla v16.4S, v10.4S, v31.s[0] +sub v10.4s, v5.4s, v0.4s +sqrdmulh v25.4S, v9.4S, v28.s[0] +str q10, [x0, #240] +ldr q10, [x0, #496] +mla v11.4S, v12.4S, v31.s[0] +add v5.4s, v5.4s, v0.4s +sqrdmulh v0.4S, v10.4S, v28.s[0] +str q5, [x0, #224] +ldr q5, [x0, #256] +ldr q12, [x0, #384] +mul v26.4S, v26.4S,v24.s[0] +sub v18.4s, v5.4s, v8.4s +ldr q27, [x0, #272] +mul v14.4S, v14.4S,v24.s[0] +add v5.4s, v5.4s, v8.4s +ldr q8, [x0, #400] +mla v26.4S, v29.4S, v31.s[0] +sub v29.4s, v27.4s, v23.4s +ldr q6, [x0, #320] +mla v14.4S, v17.4S, v31.s[0] +add v27.4s, v27.4s, v23.4s +ldr q23, [x0, #448] +mul v9.4S, v9.4S,v15.s[0] +sub v17.4s, v6.4s, v16.4s +ldr q1, [x0, #336] +mul v10.4S, v10.4S,v15.s[0] +add v6.4s, v6.4s, v16.4s +ldr q16, [x0, #464] +mla v9.4S, v25.4S, v31.s[0] +mla v10.4S, v0.4S, v31.s[0] +sub v0.4s, v1.4s, v11.4s +sqrdmulh v25.4S, v27.4S, v3.s[1] +add v1.4s, v1.4s, v11.4s +mul v27.4S, v27.4S,v13.s[1] +sqrdmulh v11.4S, v29.4S, v3.s[2] +sub v4.4s, v12.4s, v26.4s +mul v29.4S, v29.4S,v13.s[2] +add v12.4s, v12.4s, v26.4s +sqrdmulh v3.4S, v1.4S, v19.s[1] +sub v13.4s, v8.4s, v14.4s +mul v1.4S, v1.4S,v21.s[1] +add v8.4s, v8.4s, v14.4s +sqrdmulh v14.4S, v0.4S, v19.s[2] +sub v26.4s, v23.4s, v9.4s +mul v0.4S, v0.4S,v21.s[2] +add v23.4s, v23.4s, v9.4s +mla v27.4S, v25.4S, v31.s[0] +sub v25.4s, v16.4s, v10.4s +ldr q19, [x0, #736] +sqrdmulh v21.4S, v8.4S, v22.s[1] +add v16.4s, v16.4s, v10.4s +mla v29.4S, v11.4S, v31.s[0] +ldr q11, [x0, #672] +sqrdmulh v10.4S, v13.4S, v22.s[2] +sub v9.4s, v5.4s, v27.4s +mla v1.4S, v3.4S, v31.s[0] +ldr q3, [x0, #544] +sqrdmulh v30.4S, v16.4S, v28.s[1] +add v5.4s, v5.4s, v27.4s +str q9, [x0, #272] +mla v0.4S, v14.4S, v31.s[0] +ldr 
q14, [x17, #+384] +ldr q9, [x17, #+400] +sqrdmulh v27.4S, v25.4S, v28.s[2] +sub v7.4s, v18.4s, v29.4s +str q5, [x0, #256] +mul v8.4S, v8.4S,v24.s[1] +add v18.4s, v18.4s, v29.4s +mul v13.4S, v13.4S,v24.s[2] +str q7, [x0, #304] +mla v8.4S, v21.4S, v31.s[0] +sub v21.4s, v6.4s, v1.4s +mla v13.4S, v10.4S, v31.s[0] +str q18, [x0, #288] +mul v16.4S, v16.4S,v15.s[1] +str q21, [x0, #336] +mul v25.4S, v25.4S,v15.s[2] +add v6.4s, v6.4s, v1.4s +str q6, [x0, #320] +mla v16.4S, v30.4S, v31.s[0] +sub v30.4s, v17.4s, v0.4s +str q30, [x0, #368] +mla v25.4S, v27.4S, v31.s[0] +add v17.4s, v17.4s, v0.4s +str q17, [x0, #352] +sqrdmulh v28.4S, v3.4S, v9.s[0] +sub v15.4s, v12.4s, v8.4s +mul v3.4S, v3.4S,v14.s[0] +str q15, [x0, #400] +ldr q15, [x0, #560] +sqrdmulh v17.4S, v15.4S, v9.s[0] +add v12.4s, v12.4s, v8.4s +mul v15.4S, v15.4S,v14.s[0] +str q12, [x0, #384] +ldr q12, [x17, #+416] +ldr q8, [x17, #+432] +ldr q0, [x0, #608] +sqrdmulh v27.4S, v0.4S, v8.s[0] +sub v30.4s, v4.4s, v13.4s +mul v0.4S, v0.4S,v12.s[0] +str q30, [x0, #432] +ldr q30, [x0, #624] +sqrdmulh v6.4S, v30.4S, v8.s[0] +add v4.4s, v4.4s, v13.4s +mul v30.4S, v30.4S,v12.s[0] +str q4, [x0, #416] +ldr q4, [x17, #+448] +ldr q13, [x17, #+464] +mla v3.4S, v28.4S, v31.s[0] +sub v28.4s, v23.4s, v16.4s +sqrdmulh v1.4S, v11.4S, v13.s[0] +str q28, [x0, #464] +ldr q28, [x0, #688] +mla v15.4S, v17.4S, v31.s[0] +add v23.4s, v23.4s, v16.4s +sqrdmulh v16.4S, v28.4S, v13.s[0] +str q23, [x0, #448] +ldr q23, [x17, #+480] +ldr q17, [x17, #+496] +mla v0.4S, v27.4S, v31.s[0] +sub v27.4s, v26.4s, v25.4s +sqrdmulh v21.4S, v19.4S, v17.s[0] +str q27, [x0, #496] +ldr q27, [x0, #752] +mla v30.4S, v6.4S, v31.s[0] +add v26.4s, v26.4s, v25.4s +sqrdmulh v25.4S, v27.4S, v17.s[0] +str q26, [x0, #480] +ldr q26, [x0, #512] +ldr q6, [x0, #640] +mul v11.4S, v11.4S,v4.s[0] +sub v22.4s, v26.4s, v3.4s +ldr q24, [x0, #528] +mul v28.4S, v28.4S,v4.s[0] +add v26.4s, v26.4s, v3.4s +ldr q3, [x0, #656] +mla v11.4S, v1.4S, v31.s[0] +sub v1.4s, v24.4s, v15.4s +ldr q18, 
[x0, #576] +mla v28.4S, v16.4S, v31.s[0] +add v24.4s, v24.4s, v15.4s +ldr q15, [x0, #704] +mul v19.4S, v19.4S,v23.s[0] +sub v16.4s, v18.4s, v0.4s +ldr q10, [x0, #592] +mul v27.4S, v27.4S,v23.s[0] +add v18.4s, v18.4s, v0.4s +ldr q0, [x0, #720] +mla v19.4S, v21.4S, v31.s[0] +mla v27.4S, v25.4S, v31.s[0] +sub v25.4s, v10.4s, v30.4s +sqrdmulh v21.4S, v24.4S, v9.s[1] +add v10.4s, v10.4s, v30.4s +mul v24.4S, v24.4S,v14.s[1] +sqrdmulh v30.4S, v1.4S, v9.s[2] +sub v7.4s, v6.4s, v11.4s +mul v1.4S, v1.4S,v14.s[2] +add v6.4s, v6.4s, v11.4s +sqrdmulh v9.4S, v10.4S, v8.s[1] +sub v14.4s, v3.4s, v28.4s +mul v10.4S, v10.4S,v12.s[1] +add v3.4s, v3.4s, v28.4s +sqrdmulh v28.4S, v25.4S, v8.s[2] +sub v11.4s, v15.4s, v19.4s +mul v25.4S, v25.4S,v12.s[2] +add v15.4s, v15.4s, v19.4s +mla v24.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v27.4s +ldr q8, [x0, #992] +sqrdmulh v12.4S, v3.4S, v13.s[1] +add v0.4s, v0.4s, v27.4s +mla v1.4S, v30.4S, v31.s[0] +ldr q30, [x0, #928] +sqrdmulh v27.4S, v14.4S, v13.s[2] +sub v19.4s, v26.4s, v24.4s +mla v10.4S, v9.4S, v31.s[0] +ldr q9, [x0, #800] +sqrdmulh v29.4S, v0.4S, v17.s[1] +add v26.4s, v26.4s, v24.4s +str q19, [x0, #528] +mla v25.4S, v28.4S, v31.s[0] +ldr q28, [x17, #+512] +ldr q19, [x17, #+528] +sqrdmulh v24.4S, v21.4S, v17.s[2] +sub v5.4s, v22.4s, v1.4s +str q26, [x0, #512] +mul v3.4S, v3.4S,v4.s[1] +add v22.4s, v22.4s, v1.4s +mul v14.4S, v14.4S,v4.s[2] +str q5, [x0, #560] +mla v3.4S, v12.4S, v31.s[0] +sub v12.4s, v18.4s, v10.4s +mla v14.4S, v27.4S, v31.s[0] +str q22, [x0, #544] +mul v0.4S, v0.4S,v23.s[1] +str q12, [x0, #592] +mul v21.4S, v21.4S,v23.s[2] +add v18.4s, v18.4s, v10.4s +str q18, [x0, #576] +mla v0.4S, v29.4S, v31.s[0] +sub v29.4s, v16.4s, v25.4s +str q29, [x0, #624] +mla v21.4S, v24.4S, v31.s[0] +add v16.4s, v16.4s, v25.4s +str q16, [x0, #608] +sqrdmulh v17.4S, v9.4S, v19.s[0] +sub v23.4s, v6.4s, v3.4s +mul v9.4S, v9.4S,v28.s[0] +str q23, [x0, #656] +ldr q23, [x0, #816] +sqrdmulh v16.4S, v23.4S, v19.s[0] +add v6.4s, v6.4s, v3.4s +mul 
v23.4S, v23.4S,v28.s[0] +str q6, [x0, #640] +ldr q6, [x17, #+544] +ldr q3, [x17, #+560] +ldr q25, [x0, #864] +sqrdmulh v24.4S, v25.4S, v3.s[0] +sub v29.4s, v7.4s, v14.4s +mul v25.4S, v25.4S,v6.s[0] +str q29, [x0, #688] +ldr q29, [x0, #880] +sqrdmulh v18.4S, v29.4S, v3.s[0] +add v7.4s, v7.4s, v14.4s +mul v29.4S, v29.4S,v6.s[0] +str q7, [x0, #672] +ldr q7, [x17, #+576] +ldr q14, [x17, #+592] +mla v9.4S, v17.4S, v31.s[0] +sub v17.4s, v15.4s, v0.4s +sqrdmulh v10.4S, v30.4S, v14.s[0] +str q17, [x0, #720] +ldr q17, [x0, #944] +mla v23.4S, v16.4S, v31.s[0] +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v14.s[0] +str q15, [x0, #704] +ldr q15, [x17, #+608] +ldr q16, [x17, #+624] +mla v25.4S, v24.4S, v31.s[0] +sub v24.4s, v11.4s, v21.4s +sqrdmulh v12.4S, v8.4S, v16.s[0] +str q24, [x0, #752] +ldr q24, [x0, #1008] +mla v29.4S, v18.4S, v31.s[0] +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v24.4S, v16.s[0] +str q11, [x0, #736] +ldr q11, [x0, #768] +ldr q18, [x0, #896] +mul v30.4S, v30.4S,v7.s[0] +sub v13.4s, v11.4s, v9.4s +ldr q4, [x0, #784] +mul v17.4S, v17.4S,v7.s[0] +add v11.4s, v11.4s, v9.4s +ldr q9, [x0, #912] +mla v30.4S, v10.4S, v31.s[0] +sub v10.4s, v4.4s, v23.4s +ldr q22, [x0, #832] +mla v17.4S, v0.4S, v31.s[0] +add v4.4s, v4.4s, v23.4s +ldr q23, [x0, #960] +mul v8.4S, v8.4S,v15.s[0] +sub v0.4s, v22.4s, v25.4s +ldr q27, [x0, #848] +mul v24.4S, v24.4S,v15.s[0] +add v22.4s, v22.4s, v25.4s +ldr q25, [x0, #976] +mla v8.4S, v12.4S, v31.s[0] +mla v24.4S, v21.4S, v31.s[0] +sub v21.4s, v27.4s, v29.4s +sqrdmulh v12.4S, v4.4S, v19.s[1] +add v27.4s, v27.4s, v29.4s +mul v4.4S, v4.4S,v28.s[1] +sqrdmulh v29.4S, v10.4S, v19.s[2] +sub v5.4s, v18.4s, v30.4s +mul v10.4S, v10.4S,v28.s[2] +add v18.4s, v18.4s, v30.4s +sqrdmulh v19.4S, v27.4S, v3.s[1] +sub v28.4s, v9.4s, v17.4s +mul v27.4S, v27.4S,v6.s[1] +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v3.s[2] +sub v30.4s, v23.4s, v8.4s +mul v21.4S, v21.4S,v6.s[2] +add v23.4s, v23.4s, v8.4s +mla v4.4S, v12.4S, v31.s[0] +sub v12.4s, 
v25.4s, v24.4s +sqrdmulh v3.4S, v9.4S, v14.s[1] +add v25.4s, v25.4s, v24.4s +mla v10.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v28.4S, v14.s[2] +sub v24.4s, v11.4s, v4.4s +mla v27.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v25.4S, v16.s[1] +add v11.4s, v11.4s, v4.4s +str q24, [x0, #784] +mla v21.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v12.4S, v16.s[2] +sub v24.4s, v13.4s, v10.4s +str q11, [x0, #768] +mul v9.4S, v9.4S,v7.s[1] +add v13.4s, v13.4s, v10.4s +mul v28.4S, v28.4S,v7.s[2] +str q24, [x0, #816] +mla v9.4S, v3.4S, v31.s[0] +sub v3.4s, v22.4s, v27.4s +mla v28.4S, v29.4S, v31.s[0] +str q13, [x0, #800] +mul v25.4S, v25.4S,v15.s[1] +str q3, [x0, #848] +mul v12.4S, v12.4S,v15.s[2] +add v22.4s, v22.4s, v27.4s +str q22, [x0, #832] +mla v25.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v21.4s +str q19, [x0, #880] +mla v12.4S, v17.4S, v31.s[0] +add v0.4s, v0.4s, v21.4s +str q0, [x0, #864] +sub v16.4s, v18.4s, v9.4s +str q16, [x0, #912] +add v18.4s, v18.4s, v9.4s +str q18, [x0, #896] +sub v18.4s, v5.4s, v28.4s +str q18, [x0, #944] +add v5.4s, v5.4s, v28.4s +str q5, [x0, #928] +sub v5.4s, v23.4s, v25.4s +str q5, [x0, #976] +add v23.4s, v23.4s, v25.4s +str q23, [x0, #960] +sub v23.4s, v30.4s, v12.4s +str q23, [x0, #1008] +add v30.4s, v30.4s, v12.4s +str q30, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1528 +// Instruction count: 1524 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_18_z4_7.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_18_z4_7.s new file mode 100644 index 0000000..42ecff5 --- /dev/null +++ 
b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_18_z4_7.s @@ -0,0 +1,1558 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: // -33556993 then three zero words (16 bytes). NOTE(review): the routine below seems to load this into q31 via x17 and use only v31.s[0] in its mla steps; confirm ASM_LOAD semantics +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: // Constant table read in 16-byte rows via x17 offsets. In the visible code rows alternate: mul operands, then the matching sqrdmulh operands. Layer None entries are zero. +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global 
ntt_u32_incomplete_neon_asm_var_4_2_18_z4_7 +.global _ntt_u32_incomplete_neon_asm_var_4_2_18_z4_7 +ntt_u32_incomplete_neon_asm_var_4_2_18_z4_7: +_ntt_u32_incomplete_neon_asm_var_4_2_18_z4_7: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x0, #992] +sqrdmulh v27.4S, v28.4S, v29.s[0] +mul v28.4S, v28.4S,v30.s[0] +ldr q26, [x0, #928] +sqrdmulh v25.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +ldr q24, [x0, #864] +sqrdmulh v23.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +ldr q22, [x0, #800] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #736] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mla v28.4S, v27.4S, v31.s[0] +ldr q27, [x0, #672] +sqrdmulh v18.4S, v27.4S, v29.s[0] +mla v26.4S, v25.4S, v31.s[0] +ldr q25, [x0, #608] +sqrdmulh v17.4S, v25.4S, v29.s[0] +mla v24.4S, v23.4S, v31.s[0] +ldr q23, [x0, #544] +sqrdmulh v16.4S, v23.4S, v29.s[0] +mla v22.4S, v21.4S, v31.s[0] +ldr q21, [x0, #480] +ldr q3, [x0, #416] +mul v27.4S, v27.4S,v30.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v2.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +ldr q28, [x0, #352] +ldr q1, [x0, #288] +mla v27.4S, v18.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +sub v19.4s, v3.4s, v26.4s +add v3.4s, v3.4s, v26.4s +ldr q26, [x0, #224] +ldr q18, [x0, #160] +mul v23.4S, v23.4S,v30.s[0] +mul v25.4S, v25.4S,v30.s[0] +sub v0.4s, v28.4s, v24.4s +add v28.4s, v28.4s, v24.4s +ldr q24, [x0, #96] +ldr q15, [x0, #32] +mla v23.4S, v16.4S, v31.s[0] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +sqrdmulh 
v22.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v16.4s, v26.4s, v20.4s +nop +sqrdmulh v14.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +add v26.4s, v26.4s, v20.4s +nop +sqrdmulh v20.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v13.4s, v18.4s, v27.4s +add v18.4s, v18.4s, v27.4s +sqrdmulh v27.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v12.4s, v24.4s, v25.4s +add v24.4s, v24.4s, v25.4s +sqrdmulh v25.4S, v0.4S, v29.s[2] +mla v2.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v23.4s +sqrdmulh v11.4S, v17.4S, v29.s[2] +mla v19.4S, v14.4S, v31.s[0] +add v15.4s, v15.4s, v23.4s +nop +sqrdmulh v23.4S, v28.4S, v29.s[1] +mla v21.4S, v20.4S, v31.s[0] +nop +sqrdmulh v20.4S, v1.4S, v29.s[1] +mla v3.4S, v27.4S, v31.s[0] +nop +nop +ldr q27, [x17, #+32] +ldr q14, [x17, #+48] +mul v17.4S, v17.4S,v30.s[2] +mul v0.4S, v0.4S,v30.s[2] +sub v10.4s, v16.4s, v2.4s +add v16.4s, v16.4s, v2.4s +mla v17.4S, v11.4S, v31.s[0] +mla v0.4S, v25.4S, v31.s[0] +sub v25.4s, v13.4s, v19.4s +add v13.4s, v13.4s, v19.4s +mul v1.4S, v1.4S,v30.s[1] +mul v28.4S, v28.4S,v30.s[1] +sub v19.4s, v26.4s, v21.4s +add v26.4s, v26.4s, v21.4s +mla v1.4S, v20.4S, v31.s[0] +mla v28.4S, v23.4S, v31.s[0] +sub v23.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v14.s[3] +mul v10.4S, v10.4S,v27.s[3] +nop +nop +sqrdmulh v20.4S, v16.4S, v14.s[2] +mul v16.4S, v16.4S,v27.s[2] +sub v21.4s, v12.4s, v0.4s +add v12.4s, v12.4s, v0.4s +sqrdmulh v0.4S, v19.4S, v14.s[1] +mul v19.4S, v19.4S,v27.s[1] +sub v11.4s, v22.4s, v17.4s +add v22.4s, v22.4s, v17.4s +sqrdmulh v17.4S, v26.4S, v14.s[0] +mul v26.4S, v26.4S,v27.s[0] +sub v2.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +ldr q28, [x17, #+96] +ldr q9, [x17, #+112] +sqrdmulh v8.4S, v25.4S, v14.s[3] +mla v10.4S, v3.4S, v31.s[0] +sub v3.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v14.s[2] +mla v16.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v23.4S, v14.s[1] +mla v19.4S, v0.4S, v31.s[0] +nop +nop +sqrdmulh v0.4S, 
v18.4S, v14.s[0] +mla v26.4S, v17.4S, v31.s[0] +nop +nop +ldr q17, [x17, #+64] +ldr q7, [x17, #+80] +mul v13.4S, v13.4S,v27.s[2] +mul v25.4S, v25.4S,v27.s[3] +sub v6.4s, v21.4s, v10.4s +add v21.4s, v21.4s, v10.4s +mla v13.4S, v1.4S, v31.s[0] +mla v25.4S, v8.4S, v31.s[0] +sub v8.4s, v12.4s, v16.4s +add v12.4s, v12.4s, v16.4s +mul v18.4S, v18.4S,v27.s[0] +mul v23.4S, v23.4S,v27.s[1] +sub v16.4s, v2.4s, v19.4s +add v2.4s, v2.4s, v19.4s +mla v18.4S, v0.4S, v31.s[0] +mla v23.4S, v20.4S, v31.s[0] +sub v20.4s, v24.4s, v26.4s +add v24.4s, v24.4s, v26.4s +sqrdmulh v26.4S, v6.4S, v9.s[3] +mul v6.4S, v6.4S,v28.s[3] +nop +nop +sqrdmulh v0.4S, v21.4S, v9.s[2] +mul v21.4S, v21.4S,v28.s[2] +sub v19.4s, v11.4s, v25.4s +add v11.4s, v11.4s, v25.4s +sqrdmulh v25.4S, v8.4S, v9.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v1.4s, v22.4s, v13.4s +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v12.4S, v9.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v10.4s, v3.4s, v23.4s +add v3.4s, v3.4s, v23.4s +sqrdmulh v23.4S, v16.4S, v7.s[3] +mla v6.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v7.s[2] +mla v21.4S, v0.4S, v31.s[0] +sub v0.4s, v19.4s, v6.4s +str q0, [x0, #992] +sqrdmulh v0.4S, v20.4S, v7.s[1] +mla v8.4S, v25.4S, v31.s[0] +add v19.4s, v19.4s, v6.4s +str q19, [x0, #928] +sqrdmulh v19.4S, v24.4S, v7.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v21.4s +str q13, [x0, #864] +mul v2.4S, v2.4S,v17.s[2] +mul v16.4S, v16.4S,v17.s[3] +add v11.4s, v11.4s, v21.4s +sub v21.4s, v1.4s, v8.4s +mla v2.4S, v18.4S, v31.s[0] +mla v16.4S, v23.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +str q11, [x0, #800] +mul v24.4S, v24.4S,v17.s[0] +mul v20.4S, v20.4S,v17.s[1] +sub v11.4s, v22.4s, v12.4s +str q21, [x0, #736] +mla v24.4S, v19.4S, v31.s[0] +mla v20.4S, v0.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +str q1, [x0, #672] +ldr q1, [x0, #1008] +sqrdmulh v12.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +str q11, [x0, #608] +sub v11.4s, v10.4s, v16.4s +ldr q0, [x0, 
#944] +sqrdmulh v19.4S, v0.4S, v29.s[0] +mul v0.4S, v0.4S,v30.s[0] +str q22, [x0, #544] +add v10.4s, v10.4s, v16.4s +ldr q16, [x0, #880] +sqrdmulh v22.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +str q11, [x0, #480] +sub v11.4s, v3.4s, v2.4s +ldr q21, [x0, #816] +sqrdmulh v8.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +str q10, [x0, #416] +add v3.4s, v3.4s, v2.4s +ldr q2, [x0, #752] +sqrdmulh v10.4S, v2.4S, v29.s[0] +mla v1.4S, v12.4S, v31.s[0] +str q11, [x0, #352] +sub v11.4s, v26.4s, v20.4s +ldr q12, [x0, #688] +sqrdmulh v23.4S, v12.4S, v29.s[0] +mla v0.4S, v19.4S, v31.s[0] +str q3, [x0, #288] +add v26.4s, v26.4s, v20.4s +ldr q20, [x0, #624] +sqrdmulh v3.4S, v20.4S, v29.s[0] +mla v16.4S, v22.4S, v31.s[0] +str q11, [x0, #224] +sub v11.4s, v15.4s, v24.4s +ldr q22, [x0, #560] +sqrdmulh v19.4S, v22.4S, v29.s[0] +mla v21.4S, v8.4S, v31.s[0] +str q26, [x0, #160] +add v15.4s, v15.4s, v24.4s +ldr q24, [x0, #496] +ldr q26, [x0, #432] +mul v12.4S, v12.4S,v30.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v8.4s, v24.4s, v1.4s +add v24.4s, v24.4s, v1.4s +ldr q1, [x0, #368] +ldr q18, [x0, #304] +mla v12.4S, v23.4S, v31.s[0] +mla v2.4S, v10.4S, v31.s[0] +sub v10.4s, v26.4s, v0.4s +add v26.4s, v26.4s, v0.4s +ldr q0, [x0, #240] +ldr q23, [x0, #176] +mul v22.4S, v22.4S,v30.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v13.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +ldr q16, [x0, #112] +ldr q6, [x0, #48] +mla v22.4S, v19.4S, v31.s[0] +mla v20.4S, v3.4S, v31.s[0] +sub v3.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v19.4s, v0.4s, v2.4s +nop +sqrdmulh v25.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +add v0.4s, v0.4s, v2.4s +nop +sqrdmulh v2.4S, v24.4S, v29.s[1] +mul v24.4S, v24.4S,v30.s[1] +sub v5.4s, v23.4s, v12.4s +add v23.4s, v23.4s, v12.4s +sqrdmulh v12.4S, v26.4S, v29.s[1] +mul v26.4S, v26.4S,v30.s[1] +sub v4.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v13.4S, v29.s[2] +mla v8.4S, v21.4S, 
v31.s[0] +sub v21.4s, v6.4s, v22.4s +str q11, [x0, #96] +sqrdmulh v11.4S, v3.4S, v29.s[2] +mla v10.4S, v25.4S, v31.s[0] +add v6.4s, v6.4s, v22.4s +nop +sqrdmulh v22.4S, v1.4S, v29.s[1] +mla v24.4S, v2.4S, v31.s[0] +str q15, [x0, #32] +nop +sqrdmulh v15.4S, v18.4S, v29.s[1] +mla v26.4S, v12.4S, v31.s[0] +nop +nop +mul v3.4S, v3.4S,v30.s[2] +mul v13.4S, v13.4S,v30.s[2] +sub v12.4s, v19.4s, v8.4s +add v19.4s, v19.4s, v8.4s +mla v3.4S, v11.4S, v31.s[0] +mla v13.4S, v20.4S, v31.s[0] +sub v20.4s, v5.4s, v10.4s +add v5.4s, v5.4s, v10.4s +mul v18.4S, v18.4S,v30.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v10.4s, v0.4s, v24.4s +add v0.4s, v0.4s, v24.4s +mla v18.4S, v15.4S, v31.s[0] +mla v1.4S, v22.4S, v31.s[0] +sub v22.4s, v23.4s, v26.4s +add v23.4s, v23.4s, v26.4s +sqrdmulh v26.4S, v12.4S, v14.s[3] +mul v12.4S, v12.4S,v27.s[3] +nop +nop +sqrdmulh v15.4S, v19.4S, v14.s[2] +mul v19.4S, v19.4S,v27.s[2] +sub v24.4s, v4.4s, v13.4s +add v4.4s, v4.4s, v13.4s +sqrdmulh v13.4S, v10.4S, v14.s[1] +mul v10.4S, v10.4S,v27.s[1] +sub v11.4s, v21.4s, v3.4s +add v21.4s, v21.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v14.s[0] +mul v0.4S, v0.4S,v27.s[0] +sub v8.4s, v16.4s, v1.4s +add v16.4s, v16.4s, v1.4s +sqrdmulh v1.4S, v20.4S, v14.s[3] +mla v12.4S, v26.4S, v31.s[0] +sub v26.4s, v6.4s, v18.4s +add v6.4s, v6.4s, v18.4s +sqrdmulh v18.4S, v5.4S, v14.s[2] +mla v19.4S, v15.4S, v31.s[0] +nop +nop +sqrdmulh v15.4S, v22.4S, v14.s[1] +mla v10.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v23.4S, v14.s[0] +mla v0.4S, v3.4S, v31.s[0] +nop +nop +mul v5.4S, v5.4S,v27.s[2] +mul v20.4S, v20.4S,v27.s[3] +sub v3.4s, v24.4s, v12.4s +add v24.4s, v24.4s, v12.4s +mla v5.4S, v18.4S, v31.s[0] +mla v20.4S, v1.4S, v31.s[0] +sub v1.4s, v4.4s, v19.4s +add v4.4s, v4.4s, v19.4s +mul v23.4S, v23.4S,v27.s[0] +mul v22.4S, v22.4S,v27.s[1] +sub v19.4s, v8.4s, v10.4s +add v8.4s, v8.4s, v10.4s +mla v23.4S, v13.4S, v31.s[0] +mla v22.4S, v15.4S, v31.s[0] +sub v15.4s, v16.4s, v0.4s +add v16.4s, v16.4s, v0.4s +sqrdmulh v0.4S, v3.4S, v9.s[3] 
+mul v3.4S, v3.4S,v28.s[3] +nop +nop +sqrdmulh v13.4S, v24.4S, v9.s[2] +mul v24.4S, v24.4S,v28.s[2] +sub v10.4s, v11.4s, v20.4s +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v1.4S, v9.s[1] +mul v1.4S, v1.4S,v28.s[1] +sub v18.4s, v21.4s, v5.4s +add v21.4s, v21.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v9.s[0] +mul v4.4S, v4.4S,v28.s[0] +sub v12.4s, v26.4s, v22.4s +add v26.4s, v26.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v7.s[3] +mla v3.4S, v0.4S, v31.s[0] +sub v0.4s, v6.4s, v23.4s +add v6.4s, v6.4s, v23.4s +sqrdmulh v23.4S, v8.4S, v7.s[2] +mla v24.4S, v13.4S, v31.s[0] +sub v13.4s, v10.4s, v3.4s +str q13, [x0, #1008] +sqrdmulh v13.4S, v15.4S, v7.s[1] +mla v1.4S, v20.4S, v31.s[0] +add v10.4s, v10.4s, v3.4s +str q10, [x0, #944] +sqrdmulh v10.4S, v16.4S, v7.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v24.4s +str q5, [x0, #880] +mul v8.4S, v8.4S,v17.s[2] +mul v19.4S, v19.4S,v17.s[3] +add v11.4s, v11.4s, v24.4s +sub v24.4s, v18.4s, v1.4s +mla v8.4S, v23.4S, v31.s[0] +mla v19.4S, v22.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +str q11, [x0, #816] +mul v16.4S, v16.4S,v17.s[0] +mul v15.4S, v15.4S,v17.s[1] +sub v11.4s, v21.4s, v4.4s +str q24, [x0, #752] +mla v16.4S, v10.4S, v31.s[0] +mla v15.4S, v13.4S, v31.s[0] +add v21.4s, v21.4s, v4.4s +str q18, [x0, #688] +ldr q18, [x0, #960] +sqrdmulh v4.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +str q11, [x0, #624] +sub v11.4s, v12.4s, v19.4s +ldr q13, [x0, #896] +sqrdmulh v10.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +str q21, [x0, #560] +add v12.4s, v12.4s, v19.4s +ldr q19, [x0, #832] +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +str q11, [x0, #496] +sub v11.4s, v26.4s, v8.4s +ldr q24, [x0, #768] +sqrdmulh v1.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +str q12, [x0, #432] +add v26.4s, v26.4s, v8.4s +ldr q8, [x0, #704] +sqrdmulh v12.4S, v8.4S, v29.s[0] +mla v18.4S, v4.4S, v31.s[0] +str q11, [x0, #368] +sub v11.4s, v0.4s, v15.4s +ldr q4, [x0, #640] +sqrdmulh v22.4S, v4.4S, v29.s[0] +mla v13.4S, 
v10.4S, v31.s[0] +str q26, [x0, #304] +add v0.4s, v0.4s, v15.4s +ldr q15, [x0, #576] +sqrdmulh v26.4S, v15.4S, v29.s[0] +mla v19.4S, v21.4S, v31.s[0] +str q11, [x0, #240] +sub v11.4s, v6.4s, v16.4s +ldr q21, [x0, #512] +sqrdmulh v10.4S, v21.4S, v29.s[0] +mla v24.4S, v1.4S, v31.s[0] +str q0, [x0, #176] +add v6.4s, v6.4s, v16.4s +ldr q16, [x0, #448] +ldr q0, [x0, #384] +mul v4.4S, v4.4S,v30.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v1.4s, v16.4s, v18.4s +add v16.4s, v16.4s, v18.4s +ldr q18, [x0, #320] +ldr q23, [x0, #256] +mla v4.4S, v22.4S, v31.s[0] +mla v8.4S, v12.4S, v31.s[0] +sub v12.4s, v0.4s, v13.4s +add v0.4s, v0.4s, v13.4s +ldr q13, [x0, #192] +ldr q22, [x0, #128] +mul v21.4S, v21.4S,v30.s[0] +mul v15.4S, v15.4S,v30.s[0] +sub v5.4s, v18.4s, v19.4s +add v18.4s, v18.4s, v19.4s +ldr q19, [x0, #64] +ldr q3, [x0, #0] +mla v21.4S, v10.4S, v31.s[0] +mla v15.4S, v26.4S, v31.s[0] +sub v26.4s, v23.4s, v24.4s +add v23.4s, v23.4s, v24.4s +sqrdmulh v24.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +sub v10.4s, v13.4s, v8.4s +nop +sqrdmulh v20.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +add v13.4s, v13.4s, v8.4s +nop +sqrdmulh v8.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v2.4s, v22.4s, v4.4s +add v22.4s, v22.4s, v4.4s +sqrdmulh v4.4S, v0.4S, v29.s[1] +mul v0.4S, v0.4S,v30.s[1] +sub v25.4s, v19.4s, v15.4s +add v19.4s, v19.4s, v15.4s +sqrdmulh v15.4S, v5.4S, v29.s[2] +mla v1.4S, v24.4S, v31.s[0] +sub v24.4s, v3.4s, v21.4s +str q11, [x0, #112] +sqrdmulh v11.4S, v26.4S, v29.s[2] +mla v12.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v21.4s +nop +sqrdmulh v21.4S, v18.4S, v29.s[1] +mla v16.4S, v8.4S, v31.s[0] +str q6, [x0, #48] +nop +sqrdmulh v6.4S, v23.4S, v29.s[1] +mla v0.4S, v4.4S, v31.s[0] +nop +nop +mul v26.4S, v26.4S,v30.s[2] +mul v5.4S, v5.4S,v30.s[2] +sub v4.4s, v10.4s, v1.4s +add v10.4s, v10.4s, v1.4s +mla v26.4S, v11.4S, v31.s[0] +mla v5.4S, v15.4S, v31.s[0] +sub v15.4s, v2.4s, v12.4s +add v2.4s, v2.4s, v12.4s +mul v23.4S, v23.4S,v30.s[1] +mul v18.4S, 
v18.4S,v30.s[1] +sub v12.4s, v13.4s, v16.4s +add v13.4s, v13.4s, v16.4s +mla v23.4S, v6.4S, v31.s[0] +mla v18.4S, v21.4S, v31.s[0] +sub v21.4s, v22.4s, v0.4s +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v4.4S, v14.s[3] +mul v4.4S, v4.4S,v27.s[3] +nop +nop +sqrdmulh v6.4S, v10.4S, v14.s[2] +mul v10.4S, v10.4S,v27.s[2] +sub v16.4s, v25.4s, v5.4s +add v25.4s, v25.4s, v5.4s +sqrdmulh v5.4S, v12.4S, v14.s[1] +mul v12.4S, v12.4S,v27.s[1] +sub v11.4s, v24.4s, v26.4s +add v24.4s, v24.4s, v26.4s +sqrdmulh v26.4S, v13.4S, v14.s[0] +mul v13.4S, v13.4S,v27.s[0] +sub v1.4s, v19.4s, v18.4s +add v19.4s, v19.4s, v18.4s +sqrdmulh v18.4S, v15.4S, v14.s[3] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v3.4s, v23.4s +add v3.4s, v3.4s, v23.4s +sqrdmulh v23.4S, v2.4S, v14.s[2] +mla v10.4S, v6.4S, v31.s[0] +nop +nop +sqrdmulh v6.4S, v21.4S, v14.s[1] +mla v12.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v22.4S, v14.s[0] +mla v13.4S, v26.4S, v31.s[0] +nop +nop +mul v2.4S, v2.4S,v27.s[2] +mul v15.4S, v15.4S,v27.s[3] +sub v26.4s, v16.4s, v4.4s +add v16.4s, v16.4s, v4.4s +mla v2.4S, v23.4S, v31.s[0] +mla v15.4S, v18.4S, v31.s[0] +sub v18.4s, v25.4s, v10.4s +add v25.4s, v25.4s, v10.4s +mul v22.4S, v22.4S,v27.s[0] +mul v21.4S, v21.4S,v27.s[1] +sub v10.4s, v1.4s, v12.4s +add v1.4s, v1.4s, v12.4s +mla v22.4S, v5.4S, v31.s[0] +mla v21.4S, v6.4S, v31.s[0] +sub v6.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +sqrdmulh v13.4S, v26.4S, v9.s[3] +mul v26.4S, v26.4S,v28.s[3] +nop +nop +sqrdmulh v5.4S, v16.4S, v9.s[2] +mul v16.4S, v16.4S,v28.s[2] +sub v12.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v18.4S, v9.s[1] +mul v18.4S, v18.4S,v28.s[1] +sub v23.4s, v24.4s, v2.4s +add v24.4s, v24.4s, v2.4s +sqrdmulh v2.4S, v25.4S, v9.s[0] +mul v25.4S, v25.4S,v28.s[0] +sub v4.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +sqrdmulh v21.4S, v10.4S, v7.s[3] +mla v26.4S, v13.4S, v31.s[0] +sub v13.4s, v3.4s, v22.4s +add v3.4s, v3.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v7.s[2] +mla v16.4S, v5.4S, v31.s[0] +sub 
v5.4s, v12.4s, v26.4s +str q5, [x0, #960] +sqrdmulh v5.4S, v6.4S, v7.s[1] +mla v18.4S, v15.4S, v31.s[0] +add v12.4s, v12.4s, v26.4s +str q12, [x0, #896] +sqrdmulh v12.4S, v19.4S, v7.s[0] +mla v25.4S, v2.4S, v31.s[0] +sub v2.4s, v11.4s, v16.4s +str q2, [x0, #832] +mul v1.4S, v1.4S,v17.s[2] +mul v10.4S, v10.4S,v17.s[3] +add v11.4s, v11.4s, v16.4s +sub v16.4s, v23.4s, v18.4s +mla v1.4S, v22.4S, v31.s[0] +mla v10.4S, v21.4S, v31.s[0] +add v23.4s, v23.4s, v18.4s +str q11, [x0, #768] +mul v19.4S, v19.4S,v17.s[0] +mul v6.4S, v6.4S,v17.s[1] +sub v11.4s, v24.4s, v25.4s +str q16, [x0, #704] +mla v19.4S, v12.4S, v31.s[0] +mla v6.4S, v5.4S, v31.s[0] +add v24.4s, v24.4s, v25.4s +str q23, [x0, #640] +ldr q23, [x0, #976] +sqrdmulh v25.4S, v23.4S, v29.s[0] +mul v23.4S, v23.4S,v30.s[0] +str q11, [x0, #576] +sub v11.4s, v4.4s, v10.4s +ldr q5, [x0, #912] +sqrdmulh v12.4S, v5.4S, v29.s[0] +mul v5.4S, v5.4S,v30.s[0] +str q24, [x0, #512] +add v4.4s, v4.4s, v10.4s +ldr q10, [x0, #848] +sqrdmulh v24.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +str q11, [x0, #448] +sub v11.4s, v0.4s, v1.4s +ldr q16, [x0, #784] +sqrdmulh v18.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +str q4, [x0, #384] +add v0.4s, v0.4s, v1.4s +ldr q1, [x0, #720] +sqrdmulh v4.4S, v1.4S, v29.s[0] +mla v23.4S, v25.4S, v31.s[0] +str q11, [x0, #320] +sub v11.4s, v13.4s, v6.4s +ldr q25, [x0, #656] +sqrdmulh v21.4S, v25.4S, v29.s[0] +mla v5.4S, v12.4S, v31.s[0] +str q0, [x0, #256] +add v13.4s, v13.4s, v6.4s +ldr q6, [x0, #592] +sqrdmulh v0.4S, v6.4S, v29.s[0] +mla v10.4S, v24.4S, v31.s[0] +str q11, [x0, #192] +sub v11.4s, v3.4s, v19.4s +ldr q24, [x0, #528] +sqrdmulh v12.4S, v24.4S, v29.s[0] +mla v16.4S, v18.4S, v31.s[0] +str q13, [x0, #128] +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #464] +ldr q13, [x0, #400] +mul v25.4S, v25.4S,v30.s[0] +mul v1.4S, v1.4S,v30.s[0] +sub v18.4s, v19.4s, v23.4s +add v19.4s, v19.4s, v23.4s +ldr q23, [x0, #336] +ldr q22, [x0, #272] +mla v25.4S, v21.4S, v31.s[0] +mla v1.4S, v4.4S, v31.s[0] 
+sub v4.4s, v13.4s, v5.4s +add v13.4s, v13.4s, v5.4s +ldr q5, [x0, #208] +ldr q21, [x0, #144] +mul v24.4S, v24.4S,v30.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v2.4s, v23.4s, v10.4s +add v23.4s, v23.4s, v10.4s +ldr q10, [x0, #80] +ldr q26, [x0, #16] +mla v24.4S, v12.4S, v31.s[0] +mla v6.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v16.4s +add v22.4s, v22.4s, v16.4s +sqrdmulh v16.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v12.4s, v5.4s, v1.4s +nop +sqrdmulh v15.4S, v4.4S, v29.s[2] +mul v4.4S, v4.4S,v30.s[2] +add v5.4s, v5.4s, v1.4s +nop +sqrdmulh v1.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +sub v8.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v20.4s, v10.4s, v6.4s +add v10.4s, v10.4s, v6.4s +sqrdmulh v6.4S, v2.4S, v29.s[2] +mla v18.4S, v16.4S, v31.s[0] +sub v16.4s, v26.4s, v24.4s +str q11, [x0, #64] +sqrdmulh v11.4S, v0.4S, v29.s[2] +mla v4.4S, v15.4S, v31.s[0] +add v26.4s, v26.4s, v24.4s +nop +sqrdmulh v24.4S, v23.4S, v29.s[1] +mla v19.4S, v1.4S, v31.s[0] +str q3, [x0, #0] +nop +sqrdmulh v3.4S, v22.4S, v29.s[1] +mla v13.4S, v25.4S, v31.s[0] +nop +nop +mul v0.4S, v0.4S,v30.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v25.4s, v12.4s, v18.4s +add v12.4s, v12.4s, v18.4s +mla v0.4S, v11.4S, v31.s[0] +mla v2.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v4.4s +add v8.4s, v8.4s, v4.4s +mul v22.4S, v22.4S,v30.s[1] +mul v23.4S, v23.4S,v30.s[1] +sub v4.4s, v5.4s, v19.4s +add v5.4s, v5.4s, v19.4s +mla v22.4S, v3.4S, v31.s[0] +mla v23.4S, v24.4S, v31.s[0] +sub v24.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v29.4S, v25.4S, v14.s[3] +mul v25.4S, v25.4S,v27.s[3] +nop +nop +sqrdmulh v30.4S, v12.4S, v14.s[2] +mul v12.4S, v12.4S,v27.s[2] +sub v13.4s, v20.4s, v2.4s +add v20.4s, v20.4s, v2.4s +sqrdmulh v2.4S, v4.4S, v14.s[1] +mul v4.4S, v4.4S,v27.s[1] +sub v3.4s, v16.4s, v0.4s +add v16.4s, v16.4s, v0.4s +sqrdmulh v0.4S, v5.4S, v14.s[0] +mul v5.4S, v5.4S,v27.s[0] +sub v19.4s, v10.4s, v23.4s +add v10.4s, 
v10.4s, v23.4s +sqrdmulh v23.4S, v6.4S, v14.s[3] +mla v25.4S, v29.4S, v31.s[0] +sub v29.4s, v26.4s, v22.4s +add v26.4s, v26.4s, v22.4s +sqrdmulh v22.4S, v8.4S, v14.s[2] +mla v12.4S, v30.4S, v31.s[0] +nop +nop +sqrdmulh v30.4S, v24.4S, v14.s[1] +mla v4.4S, v2.4S, v31.s[0] +nop +nop +sqrdmulh v2.4S, v21.4S, v14.s[0] +mla v5.4S, v0.4S, v31.s[0] +nop +nop +mul v8.4S, v8.4S,v27.s[2] +mul v6.4S, v6.4S,v27.s[3] +sub v0.4s, v13.4s, v25.4s +add v13.4s, v13.4s, v25.4s +mla v8.4S, v22.4S, v31.s[0] +mla v6.4S, v23.4S, v31.s[0] +sub v23.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +mul v21.4S, v21.4S,v27.s[0] +mul v24.4S, v24.4S,v27.s[1] +sub v12.4s, v19.4s, v4.4s +add v19.4s, v19.4s, v4.4s +mla v21.4S, v2.4S, v31.s[0] +mla v24.4S, v30.4S, v31.s[0] +sub v30.4s, v10.4s, v5.4s +add v10.4s, v10.4s, v5.4s +sqrdmulh v14.4S, v0.4S, v9.s[3] +mul v0.4S, v0.4S,v28.s[3] +nop +nop +sqrdmulh v27.4S, v13.4S, v9.s[2] +mul v13.4S, v13.4S,v28.s[2] +sub v5.4s, v3.4s, v6.4s +add v3.4s, v3.4s, v6.4s +sqrdmulh v6.4S, v23.4S, v9.s[1] +mul v23.4S, v23.4S,v28.s[1] +sub v2.4s, v16.4s, v8.4s +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v20.4S, v9.s[0] +mul v20.4S, v20.4S,v28.s[0] +sub v4.4s, v29.4s, v24.4s +add v29.4s, v29.4s, v24.4s +sqrdmulh v9.4S, v12.4S, v7.s[3] +mla v0.4S, v14.4S, v31.s[0] +sub v14.4s, v26.4s, v21.4s +add v26.4s, v26.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v7.s[2] +mla v13.4S, v27.4S, v31.s[0] +sub v27.4s, v5.4s, v0.4s +str q27, [x0, #976] +sqrdmulh v27.4S, v30.4S, v7.s[1] +mla v23.4S, v6.4S, v31.s[0] +add v5.4s, v5.4s, v0.4s +str q5, [x0, #912] +sqrdmulh v5.4S, v10.4S, v7.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v3.4s, v13.4s +str q8, [x0, #848] +mul v19.4S, v19.4S,v17.s[2] +mul v12.4S, v12.4S,v17.s[3] +add v3.4s, v3.4s, v13.4s +sub v13.4s, v2.4s, v23.4s +mla v19.4S, v21.4S, v31.s[0] +mla v12.4S, v9.4S, v31.s[0] +add v2.4s, v2.4s, v23.4s +str q3, [x0, #784] +mul v10.4S, v10.4S,v17.s[0] +mul v30.4S, v30.4S,v17.s[1] +sub v3.4s, v16.4s, v20.4s +str q13, [x0, #720] +mla v10.4S, 
v5.4S, v31.s[0] +mla v30.4S, v27.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +str q2, [x0, #656] +str q3, [x0, #592] +sub v3.4s, v4.4s, v12.4s +str q16, [x0, #528] +add v4.4s, v4.4s, v12.4s +str q3, [x0, #464] +sub v3.4s, v29.4s, v19.4s +str q4, [x0, #400] +add v29.4s, v29.4s, v19.4s +str q3, [x0, #336] +sub v3.4s, v14.4s, v30.4s +str q29, [x0, #272] +add v14.4s, v14.4s, v30.4s +str q3, [x0, #208] +sub v3.4s, v26.4s, v10.4s +str q14, [x0, #144] +add v26.4s, v26.4s, v10.4s +str q3, [x0, #80] +str q26, [x0, #16] +ldr q15, [x0, #224] +ldr q1, [x0, #160] +ldr q18, [x0, #32] +ldr q11, [x17, #+128] +ldr q25, [x17, #+144] +sqrdmulh v22.4S, v18.4S, v25.s[0] +mul v18.4S, v18.4S,v11.s[0] +ldr q24, [x0, #48] +sqrdmulh v28.4S, v24.4S, v25.s[0] +mul v24.4S, v24.4S,v11.s[0] +ldr q6, [x17, #+160] +ldr q0, [x17, #+176] +ldr q8, [x0, #96] +sqrdmulh v21.4S, v8.4S, v0.s[0] +mul v8.4S, v8.4S,v6.s[0] +ldr q9, [x0, #112] +sqrdmulh v23.4S, v9.4S, v0.s[0] +mul v9.4S, v9.4S,v6.s[0] +ldr q13, [x17, #+192] +ldr q5, [x17, #+208] +mla v18.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v1.4S, v5.s[0] +ldr q27, [x0, #176] +mla v24.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v27.4S, v5.s[0] +ldr q20, [x17, #+224] +ldr q2, [x17, #+240] +mla v8.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v15.4S, v2.s[0] +ldr q17, [x0, #240] +mla v9.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v17.4S, v2.s[0] +ldr q7, [x0, #0] +ldr q16, [x0, #128] +mul v1.4S, v1.4S,v13.s[0] +sub v12.4s, v7.4s, v18.4s +ldr q4, [x0, #16] +mul v27.4S, v27.4S,v13.s[0] +add v7.4s, v7.4s, v18.4s +ldr q18, [x0, #144] +mla v1.4S, v22.4S, v31.s[0] +sub v22.4s, v4.4s, v24.4s +ldr q19, [x0, #64] +mla v27.4S, v28.4S, v31.s[0] +add v4.4s, v4.4s, v24.4s +ldr q24, [x0, #192] +mul v15.4S, v15.4S,v20.s[0] +sub v28.4s, v19.4s, v8.4s +ldr q29, [x0, #80] +mul v17.4S, v17.4S,v20.s[0] +add v19.4s, v19.4s, v8.4s +ldr q8, [x0, #208] +mla v15.4S, v21.4S, v31.s[0] +mla v17.4S, v23.4S, v31.s[0] +sub v23.4s, v29.4s, v9.4s +sqrdmulh v21.4S, v4.4S, v25.s[1] +add v29.4s, v29.4s, v9.4s +mul 
v4.4S, v4.4S,v11.s[1] +sqrdmulh v9.4S, v22.4S, v25.s[2] +sub v30.4s, v16.4s, v1.4s +mul v22.4S, v22.4S,v11.s[2] +add v16.4s, v16.4s, v1.4s +sqrdmulh v25.4S, v29.4S, v0.s[1] +sub v11.4s, v18.4s, v27.4s +mul v29.4S, v29.4S,v6.s[1] +add v18.4s, v18.4s, v27.4s +sqrdmulh v27.4S, v23.4S, v0.s[2] +sub v1.4s, v24.4s, v15.4s +mul v23.4S, v23.4S,v6.s[2] +add v24.4s, v24.4s, v15.4s +mla v4.4S, v21.4S, v31.s[0] +sub v21.4s, v8.4s, v17.4s +ldr q0, [x0, #480] +sqrdmulh v6.4S, v18.4S, v5.s[1] +add v8.4s, v8.4s, v17.4s +mla v22.4S, v9.4S, v31.s[0] +ldr q9, [x0, #416] +sqrdmulh v17.4S, v11.4S, v5.s[2] +sub v15.4s, v7.4s, v4.4s +mla v29.4S, v25.4S, v31.s[0] +ldr q25, [x0, #288] +sqrdmulh v14.4S, v8.4S, v2.s[1] +add v7.4s, v7.4s, v4.4s +str q15, [x0, #16] +mla v23.4S, v27.4S, v31.s[0] +ldr q27, [x17, #+256] +ldr q15, [x17, #+272] +sqrdmulh v4.4S, v21.4S, v2.s[2] +sub v10.4s, v12.4s, v22.4s +str q7, [x0, #0] +mul v18.4S, v18.4S,v13.s[1] +add v12.4s, v12.4s, v22.4s +mul v11.4S, v11.4S,v13.s[2] +str q10, [x0, #48] +mla v18.4S, v6.4S, v31.s[0] +sub v6.4s, v19.4s, v29.4s +mla v11.4S, v17.4S, v31.s[0] +str q12, [x0, #32] +mul v8.4S, v8.4S,v20.s[1] +str q6, [x0, #80] +mul v21.4S, v21.4S,v20.s[2] +add v19.4s, v19.4s, v29.4s +str q19, [x0, #64] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v28.4s, v23.4s +str q14, [x0, #112] +mla v21.4S, v4.4S, v31.s[0] +add v28.4s, v28.4s, v23.4s +str q28, [x0, #96] +sqrdmulh v2.4S, v25.4S, v15.s[0] +sub v20.4s, v16.4s, v18.4s +mul v25.4S, v25.4S,v27.s[0] +str q20, [x0, #144] +ldr q20, [x0, #304] +sqrdmulh v28.4S, v20.4S, v15.s[0] +add v16.4s, v16.4s, v18.4s +mul v20.4S, v20.4S,v27.s[0] +str q16, [x0, #128] +ldr q16, [x17, #+288] +ldr q18, [x17, #+304] +ldr q23, [x0, #352] +sqrdmulh v4.4S, v23.4S, v18.s[0] +sub v14.4s, v30.4s, v11.4s +mul v23.4S, v23.4S,v16.s[0] +str q14, [x0, #176] +ldr q14, [x0, #368] +sqrdmulh v19.4S, v14.4S, v18.s[0] +add v30.4s, v30.4s, v11.4s +mul v14.4S, v14.4S,v16.s[0] +str q30, [x0, #160] +ldr q30, [x17, #+320] +ldr q11, [x17, #+336] 
+mla v25.4S, v2.4S, v31.s[0] +sub v2.4s, v24.4s, v8.4s +sqrdmulh v29.4S, v9.4S, v11.s[0] +str q2, [x0, #208] +ldr q2, [x0, #432] +mla v20.4S, v28.4S, v31.s[0] +add v24.4s, v24.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v11.s[0] +str q24, [x0, #192] +ldr q24, [x17, #+352] +ldr q28, [x17, #+368] +mla v23.4S, v4.4S, v31.s[0] +sub v4.4s, v1.4s, v21.4s +sqrdmulh v6.4S, v0.4S, v28.s[0] +str q4, [x0, #240] +ldr q4, [x0, #496] +mla v14.4S, v19.4S, v31.s[0] +add v1.4s, v1.4s, v21.4s +sqrdmulh v21.4S, v4.4S, v28.s[0] +str q1, [x0, #224] +ldr q1, [x0, #256] +ldr q19, [x0, #384] +mul v9.4S, v9.4S,v30.s[0] +sub v5.4s, v1.4s, v25.4s +ldr q13, [x0, #272] +mul v2.4S, v2.4S,v30.s[0] +add v1.4s, v1.4s, v25.4s +ldr q25, [x0, #400] +mla v9.4S, v29.4S, v31.s[0] +sub v29.4s, v13.4s, v20.4s +ldr q12, [x0, #320] +mla v2.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v20.4s +ldr q20, [x0, #448] +mul v0.4S, v0.4S,v24.s[0] +sub v8.4s, v12.4s, v23.4s +ldr q17, [x0, #336] +mul v4.4S, v4.4S,v24.s[0] +add v12.4s, v12.4s, v23.4s +ldr q23, [x0, #464] +mla v0.4S, v6.4S, v31.s[0] +mla v4.4S, v21.4S, v31.s[0] +sub v21.4s, v17.4s, v14.4s +sqrdmulh v6.4S, v13.4S, v15.s[1] +add v17.4s, v17.4s, v14.4s +mul v13.4S, v13.4S,v27.s[1] +sqrdmulh v14.4S, v29.4S, v15.s[2] +sub v10.4s, v19.4s, v9.4s +mul v29.4S, v29.4S,v27.s[2] +add v19.4s, v19.4s, v9.4s +sqrdmulh v15.4S, v17.4S, v18.s[1] +sub v27.4s, v25.4s, v2.4s +mul v17.4S, v17.4S,v16.s[1] +add v25.4s, v25.4s, v2.4s +sqrdmulh v2.4S, v21.4S, v18.s[2] +sub v9.4s, v20.4s, v0.4s +mul v21.4S, v21.4S,v16.s[2] +add v20.4s, v20.4s, v0.4s +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v23.4s, v4.4s +ldr q18, [x0, #736] +sqrdmulh v16.4S, v25.4S, v11.s[1] +add v23.4s, v23.4s, v4.4s +mla v29.4S, v14.4S, v31.s[0] +ldr q14, [x0, #672] +sqrdmulh v4.4S, v27.4S, v11.s[2] +sub v0.4s, v1.4s, v13.4s +mla v17.4S, v15.4S, v31.s[0] +ldr q15, [x0, #544] +sqrdmulh v22.4S, v23.4S, v28.s[1] +add v1.4s, v1.4s, v13.4s +str q0, [x0, #272] +mla v21.4S, v2.4S, v31.s[0] +ldr q2, [x17, #+384] +ldr q0, [x17, 
#+400] +sqrdmulh v13.4S, v6.4S, v28.s[2] +sub v7.4s, v5.4s, v29.4s +str q1, [x0, #256] +mul v25.4S, v25.4S,v30.s[1] +add v5.4s, v5.4s, v29.4s +mul v27.4S, v27.4S,v30.s[2] +str q7, [x0, #304] +mla v25.4S, v16.4S, v31.s[0] +sub v16.4s, v12.4s, v17.4s +mla v27.4S, v4.4S, v31.s[0] +str q5, [x0, #288] +mul v23.4S, v23.4S,v24.s[1] +str q16, [x0, #336] +mul v6.4S, v6.4S,v24.s[2] +add v12.4s, v12.4s, v17.4s +str q12, [x0, #320] +mla v23.4S, v22.4S, v31.s[0] +sub v22.4s, v8.4s, v21.4s +str q22, [x0, #368] +mla v6.4S, v13.4S, v31.s[0] +add v8.4s, v8.4s, v21.4s +str q8, [x0, #352] +sqrdmulh v28.4S, v15.4S, v0.s[0] +sub v24.4s, v19.4s, v25.4s +mul v15.4S, v15.4S,v2.s[0] +str q24, [x0, #400] +ldr q24, [x0, #560] +sqrdmulh v8.4S, v24.4S, v0.s[0] +add v19.4s, v19.4s, v25.4s +mul v24.4S, v24.4S,v2.s[0] +str q19, [x0, #384] +ldr q19, [x17, #+416] +ldr q25, [x17, #+432] +ldr q21, [x0, #608] +sqrdmulh v13.4S, v21.4S, v25.s[0] +sub v22.4s, v10.4s, v27.4s +mul v21.4S, v21.4S,v19.s[0] +str q22, [x0, #432] +ldr q22, [x0, #624] +sqrdmulh v12.4S, v22.4S, v25.s[0] +add v10.4s, v10.4s, v27.4s +mul v22.4S, v22.4S,v19.s[0] +str q10, [x0, #416] +ldr q10, [x17, #+448] +ldr q27, [x17, #+464] +mla v15.4S, v28.4S, v31.s[0] +sub v28.4s, v20.4s, v23.4s +sqrdmulh v17.4S, v14.4S, v27.s[0] +str q28, [x0, #464] +ldr q28, [x0, #688] +mla v24.4S, v8.4S, v31.s[0] +add v20.4s, v20.4s, v23.4s +sqrdmulh v23.4S, v28.4S, v27.s[0] +str q20, [x0, #448] +ldr q20, [x17, #+480] +ldr q8, [x17, #+496] +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v9.4s, v6.4s +sqrdmulh v16.4S, v18.4S, v8.s[0] +str q13, [x0, #496] +ldr q13, [x0, #752] +mla v22.4S, v12.4S, v31.s[0] +add v9.4s, v9.4s, v6.4s +sqrdmulh v6.4S, v13.4S, v8.s[0] +str q9, [x0, #480] +ldr q9, [x0, #512] +ldr q12, [x0, #640] +mul v14.4S, v14.4S,v10.s[0] +sub v11.4s, v9.4s, v15.4s +ldr q30, [x0, #528] +mul v28.4S, v28.4S,v10.s[0] +add v9.4s, v9.4s, v15.4s +ldr q15, [x0, #656] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v30.4s, v24.4s +ldr q5, [x0, #576] +mla v28.4S, 
v23.4S, v31.s[0] +add v30.4s, v30.4s, v24.4s +ldr q24, [x0, #704] +mul v18.4S, v18.4S,v20.s[0] +sub v23.4s, v5.4s, v21.4s +ldr q4, [x0, #592] +mul v13.4S, v13.4S,v20.s[0] +add v5.4s, v5.4s, v21.4s +ldr q21, [x0, #720] +mla v18.4S, v16.4S, v31.s[0] +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v4.4s, v22.4s +sqrdmulh v16.4S, v30.4S, v0.s[1] +add v4.4s, v4.4s, v22.4s +mul v30.4S, v30.4S,v2.s[1] +sqrdmulh v22.4S, v17.4S, v0.s[2] +sub v7.4s, v12.4s, v14.4s +mul v17.4S, v17.4S,v2.s[2] +add v12.4s, v12.4s, v14.4s +sqrdmulh v0.4S, v4.4S, v25.s[1] +sub v2.4s, v15.4s, v28.4s +mul v4.4S, v4.4S,v19.s[1] +add v15.4s, v15.4s, v28.4s +sqrdmulh v28.4S, v6.4S, v25.s[2] +sub v14.4s, v24.4s, v18.4s +mul v6.4S, v6.4S,v19.s[2] +add v24.4s, v24.4s, v18.4s +mla v30.4S, v16.4S, v31.s[0] +sub v16.4s, v21.4s, v13.4s +ldr q25, [x0, #992] +sqrdmulh v19.4S, v15.4S, v27.s[1] +add v21.4s, v21.4s, v13.4s +mla v17.4S, v22.4S, v31.s[0] +ldr q22, [x0, #928] +sqrdmulh v13.4S, v2.4S, v27.s[2] +sub v18.4s, v9.4s, v30.4s +mla v4.4S, v0.4S, v31.s[0] +ldr q0, [x0, #800] +sqrdmulh v29.4S, v21.4S, v8.s[1] +add v9.4s, v9.4s, v30.4s +str q18, [x0, #528] +mla v6.4S, v28.4S, v31.s[0] +ldr q28, [x17, #+512] +ldr q18, [x17, #+528] +sqrdmulh v30.4S, v16.4S, v8.s[2] +sub v1.4s, v11.4s, v17.4s +str q9, [x0, #512] +mul v15.4S, v15.4S,v10.s[1] +add v11.4s, v11.4s, v17.4s +mul v2.4S, v2.4S,v10.s[2] +str q1, [x0, #560] +mla v15.4S, v19.4S, v31.s[0] +sub v19.4s, v5.4s, v4.4s +mla v2.4S, v13.4S, v31.s[0] +str q11, [x0, #544] +mul v21.4S, v21.4S,v20.s[1] +str q19, [x0, #592] +mul v16.4S, v16.4S,v20.s[2] +add v5.4s, v5.4s, v4.4s +str q5, [x0, #576] +mla v21.4S, v29.4S, v31.s[0] +sub v29.4s, v23.4s, v6.4s +str q29, [x0, #624] +mla v16.4S, v30.4S, v31.s[0] +add v23.4s, v23.4s, v6.4s +str q23, [x0, #608] +sqrdmulh v8.4S, v0.4S, v18.s[0] +sub v20.4s, v12.4s, v15.4s +mul v0.4S, v0.4S,v28.s[0] +str q20, [x0, #656] +ldr q20, [x0, #816] +sqrdmulh v23.4S, v20.4S, v18.s[0] +add v12.4s, v12.4s, v15.4s +mul v20.4S, v20.4S,v28.s[0] +str 
q12, [x0, #640] +ldr q12, [x17, #+544] +ldr q15, [x17, #+560] +ldr q6, [x0, #864] +sqrdmulh v30.4S, v6.4S, v15.s[0] +sub v29.4s, v7.4s, v2.4s +mul v6.4S, v6.4S,v12.s[0] +str q29, [x0, #688] +ldr q29, [x0, #880] +sqrdmulh v5.4S, v29.4S, v15.s[0] +add v7.4s, v7.4s, v2.4s +mul v29.4S, v29.4S,v12.s[0] +str q7, [x0, #672] +ldr q7, [x17, #+576] +ldr q2, [x17, #+592] +mla v0.4S, v8.4S, v31.s[0] +sub v8.4s, v24.4s, v21.4s +sqrdmulh v4.4S, v22.4S, v2.s[0] +str q8, [x0, #720] +ldr q8, [x0, #944] +mla v20.4S, v23.4S, v31.s[0] +add v24.4s, v24.4s, v21.4s +sqrdmulh v21.4S, v8.4S, v2.s[0] +str q24, [x0, #704] +ldr q24, [x17, #+608] +ldr q23, [x17, #+624] +mla v6.4S, v30.4S, v31.s[0] +sub v30.4s, v14.4s, v16.4s +sqrdmulh v19.4S, v25.4S, v23.s[0] +str q30, [x0, #752] +ldr q30, [x0, #1008] +mla v29.4S, v5.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +sqrdmulh v16.4S, v30.4S, v23.s[0] +str q14, [x0, #736] +ldr q14, [x0, #768] +ldr q5, [x0, #896] +mul v22.4S, v22.4S,v7.s[0] +sub v27.4s, v14.4s, v0.4s +ldr q10, [x0, #784] +mul v8.4S, v8.4S,v7.s[0] +add v14.4s, v14.4s, v0.4s +ldr q0, [x0, #912] +mla v22.4S, v4.4S, v31.s[0] +sub v4.4s, v10.4s, v20.4s +ldr q11, [x0, #832] +mla v8.4S, v21.4S, v31.s[0] +add v10.4s, v10.4s, v20.4s +ldr q20, [x0, #960] +mul v25.4S, v25.4S,v24.s[0] +sub v21.4s, v11.4s, v6.4s +ldr q13, [x0, #848] +mul v30.4S, v30.4S,v24.s[0] +add v11.4s, v11.4s, v6.4s +ldr q6, [x0, #976] +mla v25.4S, v19.4S, v31.s[0] +mla v30.4S, v16.4S, v31.s[0] +sub v16.4s, v13.4s, v29.4s +sqrdmulh v19.4S, v10.4S, v18.s[1] +add v13.4s, v13.4s, v29.4s +mul v10.4S, v10.4S,v28.s[1] +sqrdmulh v29.4S, v4.4S, v18.s[2] +sub v1.4s, v5.4s, v22.4s +mul v4.4S, v4.4S,v28.s[2] +add v5.4s, v5.4s, v22.4s +sqrdmulh v18.4S, v13.4S, v15.s[1] +sub v28.4s, v0.4s, v8.4s +mul v13.4S, v13.4S,v12.s[1] +add v0.4s, v0.4s, v8.4s +sqrdmulh v8.4S, v16.4S, v15.s[2] +sub v22.4s, v20.4s, v25.4s +mul v16.4S, v16.4S,v12.s[2] +add v20.4s, v20.4s, v25.4s +mla v10.4S, v19.4S, v31.s[0] +sub v19.4s, v6.4s, v30.4s +sqrdmulh v15.4S, 
v0.4S, v2.s[1] +add v6.4s, v6.4s, v30.4s +mla v4.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v28.4S, v2.s[2] +sub v30.4s, v14.4s, v10.4s +mla v13.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v6.4S, v23.s[1] +add v14.4s, v14.4s, v10.4s +str q30, [x0, #784] +mla v16.4S, v8.4S, v31.s[0] +sqrdmulh v8.4S, v19.4S, v23.s[2] +sub v30.4s, v27.4s, v4.4s +str q14, [x0, #768] +mul v0.4S, v0.4S,v7.s[1] +add v27.4s, v27.4s, v4.4s +mul v28.4S, v28.4S,v7.s[2] +str q30, [x0, #816] +mla v0.4S, v15.4S, v31.s[0] +sub v15.4s, v11.4s, v13.4s +mla v28.4S, v29.4S, v31.s[0] +str q27, [x0, #800] +mul v6.4S, v6.4S,v24.s[1] +str q15, [x0, #848] +mul v19.4S, v19.4S,v24.s[2] +add v11.4s, v11.4s, v13.4s +str q11, [x0, #832] +mla v6.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v16.4s +str q18, [x0, #880] +mla v19.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v16.4s +str q21, [x0, #864] +sub v23.4s, v5.4s, v0.4s +str q23, [x0, #912] +add v5.4s, v5.4s, v0.4s +str q5, [x0, #896] +sub v5.4s, v1.4s, v28.4s +str q5, [x0, #944] +add v1.4s, v1.4s, v28.4s +str q1, [x0, #928] +sub v1.4s, v20.4s, v6.4s +str q1, [x0, #976] +add v20.4s, v20.4s, v6.4s +str q20, [x0, #960] +sub v20.4s, v22.4s, v19.4s +str q20, [x0, #1008] +add v22.4s, v22.4s, v19.4s +str q22, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1528 +// Instruction count: 1524 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_19_z4_7.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_19_z4_7.s new file mode 100644 index 0000000..db8d7f4 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_19_z4_7.s @@ -0,0 +1,1558 @@ + +/// +/// 
Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include // NOTE(review): no header operand is present on this line in this copy; ASM_LOAD used by the routine is not defined in this file, so it presumably comes from the missing header - confirm and restore +modulus: // 4-word (16-byte) constant: the routine loads it with ldr q31 and the visible code uses lane 0 (v31.s[0]) as the mla multiplier +.word -33556993 // lane 0; 33556993 also appears in the file name - presumably the negated NTT modulus, TODO confirm +.word 0 // lanes 1 to 3 are zero +.word 0 +.word 0 +.align 6 // power-of-two alignment on AArch64 assemblers, i.e. 64 bytes - confirm for the target toolchain +roots_merged: // 16-byte quads in pairs: in the visible routine the first quad of each pair (offsets 0, 32, 64, ...) feeds mul and the second (16, 48, 80, ...) feeds sqrdmulh; the second holds roughly root*2^31/33556993 for the matching root - NOTE(review): confirm against the generator +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global 
ntt_u32_incomplete_neon_asm_var_4_2_19_z4_7 +.global _ntt_u32_incomplete_neon_asm_var_4_2_19_z4_7 +ntt_u32_incomplete_neon_asm_var_4_2_19_z4_7: +_ntt_u32_incomplete_neon_asm_var_4_2_19_z4_7: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x0, #992] +sqrdmulh v27.4S, v28.4S, v29.s[0] +mul v28.4S, v28.4S,v30.s[0] +ldr q26, [x0, #928] +sqrdmulh v25.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +ldr q24, [x0, #864] +sqrdmulh v23.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +ldr q22, [x0, #800] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #736] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mla v28.4S, v27.4S, v31.s[0] +ldr q27, [x0, #672] +sqrdmulh v18.4S, v27.4S, v29.s[0] +mla v26.4S, v25.4S, v31.s[0] +ldr q25, [x0, #608] +sqrdmulh v17.4S, v25.4S, v29.s[0] +mla v24.4S, v23.4S, v31.s[0] +ldr q23, [x0, #544] +sqrdmulh v16.4S, v23.4S, v29.s[0] +mla v22.4S, v21.4S, v31.s[0] +ldr q21, [x0, #480] +ldr q3, [x0, #416] +mul v27.4S, v27.4S,v30.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v2.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +ldr q28, [x0, #352] +ldr q1, [x0, #288] +mla v27.4S, v18.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +sub v19.4s, v3.4s, v26.4s +add v3.4s, v3.4s, v26.4s +ldr q26, [x0, #224] +ldr q18, [x0, #160] +mul v23.4S, v23.4S,v30.s[0] +mul v25.4S, v25.4S,v30.s[0] +sub v0.4s, v28.4s, v24.4s +add v28.4s, v28.4s, v24.4s +ldr q24, [x0, #96] +ldr q15, [x0, #32] +mla v23.4S, v16.4S, v31.s[0] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +sqrdmulh 
v22.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v16.4s, v26.4s, v20.4s +nop +sqrdmulh v14.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +add v26.4s, v26.4s, v20.4s +nop +sqrdmulh v20.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v13.4s, v18.4s, v27.4s +add v18.4s, v18.4s, v27.4s +sqrdmulh v27.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v12.4s, v24.4s, v25.4s +add v24.4s, v24.4s, v25.4s +sqrdmulh v25.4S, v0.4S, v29.s[2] +mla v2.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v23.4s +sqrdmulh v11.4S, v17.4S, v29.s[2] +mla v19.4S, v14.4S, v31.s[0] +add v15.4s, v15.4s, v23.4s +nop +sqrdmulh v23.4S, v28.4S, v29.s[1] +mla v21.4S, v20.4S, v31.s[0] +nop +sqrdmulh v20.4S, v1.4S, v29.s[1] +mla v3.4S, v27.4S, v31.s[0] +nop +nop +ldr q27, [x17, #+32] +ldr q14, [x17, #+48] +mul v17.4S, v17.4S,v30.s[2] +mul v0.4S, v0.4S,v30.s[2] +sub v10.4s, v16.4s, v2.4s +add v16.4s, v16.4s, v2.4s +mla v17.4S, v11.4S, v31.s[0] +mla v0.4S, v25.4S, v31.s[0] +sub v25.4s, v13.4s, v19.4s +add v13.4s, v13.4s, v19.4s +mul v1.4S, v1.4S,v30.s[1] +mul v28.4S, v28.4S,v30.s[1] +sub v19.4s, v26.4s, v21.4s +add v26.4s, v26.4s, v21.4s +mla v1.4S, v20.4S, v31.s[0] +mla v28.4S, v23.4S, v31.s[0] +sub v23.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v14.s[3] +mul v10.4S, v10.4S,v27.s[3] +sub v20.4s, v12.4s, v0.4s +add v12.4s, v12.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v14.s[2] +mul v16.4S, v16.4S,v27.s[2] +sub v21.4s, v22.4s, v17.4s +add v22.4s, v22.4s, v17.4s +sqrdmulh v17.4S, v19.4S, v14.s[1] +mul v19.4S, v19.4S,v27.s[1] +sub v11.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +sqrdmulh v28.4S, v26.4S, v14.s[0] +mul v26.4S, v26.4S,v27.s[0] +sub v2.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s +ldr q1, [x17, #+96] +ldr q9, [x17, #+112] +sqrdmulh v8.4S, v25.4S, v14.s[3] +mla v10.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v13.4S, v14.s[2] +mla v16.4S, v0.4S, v31.s[0] +nop +nop +sqrdmulh v0.4S, v23.4S, v14.s[1] +mla v19.4S, v17.4S, v31.s[0] +nop +nop +sqrdmulh v17.4S, 
v18.4S, v14.s[0] +mla v26.4S, v28.4S, v31.s[0] +nop +nop +ldr q28, [x17, #+64] +ldr q7, [x17, #+80] +mul v13.4S, v13.4S,v27.s[2] +mul v25.4S, v25.4S,v27.s[3] +sub v6.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +mla v13.4S, v3.4S, v31.s[0] +mla v25.4S, v8.4S, v31.s[0] +sub v8.4s, v12.4s, v16.4s +add v12.4s, v12.4s, v16.4s +mul v18.4S, v18.4S,v27.s[0] +mul v23.4S, v23.4S,v27.s[1] +sub v16.4s, v11.4s, v19.4s +add v11.4s, v11.4s, v19.4s +mla v18.4S, v17.4S, v31.s[0] +mla v23.4S, v0.4S, v31.s[0] +sub v0.4s, v24.4s, v26.4s +add v24.4s, v24.4s, v26.4s +sqrdmulh v26.4S, v6.4S, v9.s[3] +mul v6.4S, v6.4S,v1.s[3] +sub v17.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v20.4S, v9.s[2] +mul v20.4S, v20.4S,v1.s[2] +sub v19.4s, v22.4s, v13.4s +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v9.s[1] +mul v8.4S, v8.4S,v1.s[1] +sub v3.4s, v2.4s, v23.4s +add v2.4s, v2.4s, v23.4s +sqrdmulh v23.4S, v12.4S, v9.s[0] +mul v12.4S, v12.4S,v1.s[0] +sub v10.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v7.s[3] +mla v6.4S, v26.4S, v31.s[0] +nop +nop +sqrdmulh v26.4S, v11.4S, v7.s[2] +mla v20.4S, v25.4S, v31.s[0] +sub v25.4s, v17.4s, v6.4s +str q25, [x0, #992] +sqrdmulh v25.4S, v0.4S, v7.s[1] +mla v8.4S, v13.4S, v31.s[0] +add v17.4s, v17.4s, v6.4s +str q17, [x0, #928] +sqrdmulh v17.4S, v24.4S, v7.s[0] +mla v12.4S, v23.4S, v31.s[0] +sub v23.4s, v21.4s, v20.4s +str q23, [x0, #864] +mul v11.4S, v11.4S,v28.s[2] +mul v16.4S, v16.4S,v28.s[3] +add v21.4s, v21.4s, v20.4s +sub v20.4s, v19.4s, v8.4s +mla v11.4S, v26.4S, v31.s[0] +mla v16.4S, v18.4S, v31.s[0] +add v19.4s, v19.4s, v8.4s +str q21, [x0, #800] +mul v24.4S, v24.4S,v28.s[0] +mul v0.4S, v0.4S,v28.s[1] +sub v21.4s, v22.4s, v12.4s +str q20, [x0, #736] +mla v24.4S, v17.4S, v31.s[0] +mla v0.4S, v25.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +str q19, [x0, #672] +ldr q19, [x0, #1008] +sqrdmulh v12.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +str q21, [x0, #608] +sub v21.4s, v3.4s, v16.4s +ldr 
q25, [x0, #944] +sqrdmulh v17.4S, v25.4S, v29.s[0] +mul v25.4S, v25.4S,v30.s[0] +str q22, [x0, #544] +add v3.4s, v3.4s, v16.4s +ldr q16, [x0, #880] +sqrdmulh v22.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +str q21, [x0, #480] +sub v21.4s, v2.4s, v11.4s +ldr q20, [x0, #816] +sqrdmulh v8.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +str q3, [x0, #416] +add v2.4s, v2.4s, v11.4s +ldr q11, [x0, #752] +sqrdmulh v3.4S, v11.4S, v29.s[0] +mla v19.4S, v12.4S, v31.s[0] +str q21, [x0, #352] +sub v21.4s, v10.4s, v0.4s +ldr q12, [x0, #688] +sqrdmulh v18.4S, v12.4S, v29.s[0] +mla v25.4S, v17.4S, v31.s[0] +str q2, [x0, #288] +add v10.4s, v10.4s, v0.4s +ldr q0, [x0, #624] +sqrdmulh v2.4S, v0.4S, v29.s[0] +mla v16.4S, v22.4S, v31.s[0] +str q21, [x0, #224] +sub v21.4s, v15.4s, v24.4s +ldr q22, [x0, #560] +sqrdmulh v17.4S, v22.4S, v29.s[0] +mla v20.4S, v8.4S, v31.s[0] +str q10, [x0, #160] +add v15.4s, v15.4s, v24.4s +ldr q24, [x0, #496] +ldr q10, [x0, #432] +mul v12.4S, v12.4S,v30.s[0] +mul v11.4S, v11.4S,v30.s[0] +sub v8.4s, v24.4s, v19.4s +add v24.4s, v24.4s, v19.4s +ldr q19, [x0, #368] +ldr q26, [x0, #304] +mla v12.4S, v18.4S, v31.s[0] +mla v11.4S, v3.4S, v31.s[0] +sub v3.4s, v10.4s, v25.4s +add v10.4s, v10.4s, v25.4s +ldr q25, [x0, #240] +ldr q18, [x0, #176] +mul v22.4S, v22.4S,v30.s[0] +mul v0.4S, v0.4S,v30.s[0] +sub v23.4s, v19.4s, v16.4s +add v19.4s, v19.4s, v16.4s +ldr q16, [x0, #112] +ldr q6, [x0, #48] +mla v22.4S, v17.4S, v31.s[0] +mla v0.4S, v2.4S, v31.s[0] +sub v2.4s, v26.4s, v20.4s +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v17.4s, v25.4s, v11.4s +nop +sqrdmulh v13.4S, v3.4S, v29.s[2] +mul v3.4S, v3.4S,v30.s[2] +add v25.4s, v25.4s, v11.4s +nop +sqrdmulh v11.4S, v24.4S, v29.s[1] +mul v24.4S, v24.4S,v30.s[1] +sub v5.4s, v18.4s, v12.4s +add v18.4s, v18.4s, v12.4s +sqrdmulh v12.4S, v10.4S, v29.s[1] +mul v10.4S, v10.4S,v30.s[1] +sub v4.4s, v16.4s, v0.4s +add v16.4s, v16.4s, v0.4s +sqrdmulh v0.4S, v23.4S, v29.s[2] 
+mla v8.4S, v20.4S, v31.s[0] +sub v20.4s, v6.4s, v22.4s +str q21, [x0, #96] +sqrdmulh v21.4S, v2.4S, v29.s[2] +mla v3.4S, v13.4S, v31.s[0] +add v6.4s, v6.4s, v22.4s +nop +sqrdmulh v22.4S, v19.4S, v29.s[1] +mla v24.4S, v11.4S, v31.s[0] +str q15, [x0, #32] +nop +sqrdmulh v15.4S, v26.4S, v29.s[1] +mla v10.4S, v12.4S, v31.s[0] +nop +nop +mul v2.4S, v2.4S,v30.s[2] +mul v23.4S, v23.4S,v30.s[2] +sub v12.4s, v17.4s, v8.4s +add v17.4s, v17.4s, v8.4s +mla v2.4S, v21.4S, v31.s[0] +mla v23.4S, v0.4S, v31.s[0] +sub v0.4s, v5.4s, v3.4s +add v5.4s, v5.4s, v3.4s +mul v26.4S, v26.4S,v30.s[1] +mul v19.4S, v19.4S,v30.s[1] +sub v3.4s, v25.4s, v24.4s +add v25.4s, v25.4s, v24.4s +mla v26.4S, v15.4S, v31.s[0] +mla v19.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v10.4s +add v18.4s, v18.4s, v10.4s +sqrdmulh v10.4S, v12.4S, v14.s[3] +mul v12.4S, v12.4S,v27.s[3] +sub v15.4s, v4.4s, v23.4s +add v4.4s, v4.4s, v23.4s +sqrdmulh v23.4S, v17.4S, v14.s[2] +mul v17.4S, v17.4S,v27.s[2] +sub v24.4s, v20.4s, v2.4s +add v20.4s, v20.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v14.s[1] +mul v3.4S, v3.4S,v27.s[1] +sub v21.4s, v16.4s, v19.4s +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v25.4S, v14.s[0] +mul v25.4S, v25.4S,v27.s[0] +sub v8.4s, v6.4s, v26.4s +add v6.4s, v6.4s, v26.4s +sqrdmulh v26.4S, v0.4S, v14.s[3] +mla v12.4S, v10.4S, v31.s[0] +nop +nop +sqrdmulh v10.4S, v5.4S, v14.s[2] +mla v17.4S, v23.4S, v31.s[0] +nop +nop +sqrdmulh v23.4S, v22.4S, v14.s[1] +mla v3.4S, v2.4S, v31.s[0] +nop +nop +sqrdmulh v2.4S, v18.4S, v14.s[0] +mla v25.4S, v19.4S, v31.s[0] +nop +nop +mul v5.4S, v5.4S,v27.s[2] +mul v0.4S, v0.4S,v27.s[3] +sub v19.4s, v15.4s, v12.4s +add v15.4s, v15.4s, v12.4s +mla v5.4S, v10.4S, v31.s[0] +mla v0.4S, v26.4S, v31.s[0] +sub v26.4s, v4.4s, v17.4s +add v4.4s, v4.4s, v17.4s +mul v18.4S, v18.4S,v27.s[0] +mul v22.4S, v22.4S,v27.s[1] +sub v17.4s, v21.4s, v3.4s +add v21.4s, v21.4s, v3.4s +mla v18.4S, v2.4S, v31.s[0] +mla v22.4S, v23.4S, v31.s[0] +sub v23.4s, v16.4s, v25.4s +add v16.4s, v16.4s, v25.4s 
+sqrdmulh v25.4S, v19.4S, v9.s[3] +mul v19.4S, v19.4S,v1.s[3] +sub v2.4s, v24.4s, v0.4s +add v24.4s, v24.4s, v0.4s +sqrdmulh v0.4S, v15.4S, v9.s[2] +mul v15.4S, v15.4S,v1.s[2] +sub v3.4s, v20.4s, v5.4s +add v20.4s, v20.4s, v5.4s +sqrdmulh v5.4S, v26.4S, v9.s[1] +mul v26.4S, v26.4S,v1.s[1] +sub v10.4s, v8.4s, v22.4s +add v8.4s, v8.4s, v22.4s +sqrdmulh v22.4S, v4.4S, v9.s[0] +mul v4.4S, v4.4S,v1.s[0] +sub v12.4s, v6.4s, v18.4s +add v6.4s, v6.4s, v18.4s +sqrdmulh v18.4S, v17.4S, v7.s[3] +mla v19.4S, v25.4S, v31.s[0] +nop +nop +sqrdmulh v25.4S, v21.4S, v7.s[2] +mla v15.4S, v0.4S, v31.s[0] +sub v0.4s, v2.4s, v19.4s +str q0, [x0, #1008] +sqrdmulh v0.4S, v23.4S, v7.s[1] +mla v26.4S, v5.4S, v31.s[0] +add v2.4s, v2.4s, v19.4s +str q2, [x0, #944] +sqrdmulh v2.4S, v16.4S, v7.s[0] +mla v4.4S, v22.4S, v31.s[0] +sub v22.4s, v24.4s, v15.4s +str q22, [x0, #880] +mul v21.4S, v21.4S,v28.s[2] +mul v17.4S, v17.4S,v28.s[3] +add v24.4s, v24.4s, v15.4s +sub v15.4s, v3.4s, v26.4s +mla v21.4S, v25.4S, v31.s[0] +mla v17.4S, v18.4S, v31.s[0] +add v3.4s, v3.4s, v26.4s +str q24, [x0, #816] +mul v16.4S, v16.4S,v28.s[0] +mul v23.4S, v23.4S,v28.s[1] +sub v24.4s, v20.4s, v4.4s +str q15, [x0, #752] +mla v16.4S, v2.4S, v31.s[0] +mla v23.4S, v0.4S, v31.s[0] +add v20.4s, v20.4s, v4.4s +str q3, [x0, #688] +ldr q3, [x0, #960] +sqrdmulh v4.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +str q24, [x0, #624] +sub v24.4s, v10.4s, v17.4s +ldr q0, [x0, #896] +sqrdmulh v2.4S, v0.4S, v29.s[0] +mul v0.4S, v0.4S,v30.s[0] +str q20, [x0, #560] +add v10.4s, v10.4s, v17.4s +ldr q17, [x0, #832] +sqrdmulh v20.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] +str q24, [x0, #496] +sub v24.4s, v8.4s, v21.4s +ldr q15, [x0, #768] +sqrdmulh v26.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +str q10, [x0, #432] +add v8.4s, v8.4s, v21.4s +ldr q21, [x0, #704] +sqrdmulh v10.4S, v21.4S, v29.s[0] +mla v3.4S, v4.4S, v31.s[0] +str q24, [x0, #368] +sub v24.4s, v12.4s, v23.4s +ldr q4, [x0, #640] +sqrdmulh v18.4S, v4.4S, v29.s[0] 
+mla v0.4S, v2.4S, v31.s[0] +str q8, [x0, #304] +add v12.4s, v12.4s, v23.4s +ldr q23, [x0, #576] +sqrdmulh v8.4S, v23.4S, v29.s[0] +mla v17.4S, v20.4S, v31.s[0] +str q24, [x0, #240] +sub v24.4s, v6.4s, v16.4s +ldr q20, [x0, #512] +sqrdmulh v2.4S, v20.4S, v29.s[0] +mla v15.4S, v26.4S, v31.s[0] +str q12, [x0, #176] +add v6.4s, v6.4s, v16.4s +ldr q16, [x0, #448] +ldr q12, [x0, #384] +mul v4.4S, v4.4S,v30.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v26.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #320] +ldr q25, [x0, #256] +mla v4.4S, v18.4S, v31.s[0] +mla v21.4S, v10.4S, v31.s[0] +sub v10.4s, v12.4s, v0.4s +add v12.4s, v12.4s, v0.4s +ldr q0, [x0, #192] +ldr q18, [x0, #128] +mul v20.4S, v20.4S,v30.s[0] +mul v23.4S, v23.4S,v30.s[0] +sub v22.4s, v3.4s, v17.4s +add v3.4s, v3.4s, v17.4s +ldr q17, [x0, #64] +ldr q19, [x0, #0] +mla v20.4S, v2.4S, v31.s[0] +mla v23.4S, v8.4S, v31.s[0] +sub v8.4s, v25.4s, v15.4s +add v25.4s, v25.4s, v15.4s +sqrdmulh v15.4S, v26.4S, v29.s[2] +mul v26.4S, v26.4S,v30.s[2] +sub v2.4s, v0.4s, v21.4s +nop +sqrdmulh v5.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +add v0.4s, v0.4s, v21.4s +nop +sqrdmulh v21.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v11.4s, v18.4s, v4.4s +add v18.4s, v18.4s, v4.4s +sqrdmulh v4.4S, v12.4S, v29.s[1] +mul v12.4S, v12.4S,v30.s[1] +sub v13.4s, v17.4s, v23.4s +add v17.4s, v17.4s, v23.4s +sqrdmulh v23.4S, v22.4S, v29.s[2] +mla v26.4S, v15.4S, v31.s[0] +sub v15.4s, v19.4s, v20.4s +str q24, [x0, #112] +sqrdmulh v24.4S, v8.4S, v29.s[2] +mla v10.4S, v5.4S, v31.s[0] +add v19.4s, v19.4s, v20.4s +nop +sqrdmulh v20.4S, v3.4S, v29.s[1] +mla v16.4S, v21.4S, v31.s[0] +str q6, [x0, #48] +nop +sqrdmulh v6.4S, v25.4S, v29.s[1] +mla v12.4S, v4.4S, v31.s[0] +nop +nop +mul v8.4S, v8.4S,v30.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v4.4s, v2.4s, v26.4s +add v2.4s, v2.4s, v26.4s +mla v8.4S, v24.4S, v31.s[0] +mla v22.4S, v23.4S, v31.s[0] +sub v23.4s, v11.4s, v10.4s +add v11.4s, v11.4s, v10.4s +mul v25.4S, v25.4S,v30.s[1] 
+mul v3.4S, v3.4S,v30.s[1] +sub v10.4s, v0.4s, v16.4s +add v0.4s, v0.4s, v16.4s +mla v25.4S, v6.4S, v31.s[0] +mla v3.4S, v20.4S, v31.s[0] +sub v20.4s, v18.4s, v12.4s +add v18.4s, v18.4s, v12.4s +sqrdmulh v12.4S, v4.4S, v14.s[3] +mul v4.4S, v4.4S,v27.s[3] +sub v6.4s, v13.4s, v22.4s +add v13.4s, v13.4s, v22.4s +sqrdmulh v22.4S, v2.4S, v14.s[2] +mul v2.4S, v2.4S,v27.s[2] +sub v16.4s, v15.4s, v8.4s +add v15.4s, v15.4s, v8.4s +sqrdmulh v8.4S, v10.4S, v14.s[1] +mul v10.4S, v10.4S,v27.s[1] +sub v24.4s, v17.4s, v3.4s +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v14.s[0] +mul v0.4S, v0.4S,v27.s[0] +sub v26.4s, v19.4s, v25.4s +add v19.4s, v19.4s, v25.4s +sqrdmulh v25.4S, v23.4S, v14.s[3] +mla v4.4S, v12.4S, v31.s[0] +nop +nop +sqrdmulh v12.4S, v11.4S, v14.s[2] +mla v2.4S, v22.4S, v31.s[0] +nop +nop +sqrdmulh v22.4S, v20.4S, v14.s[1] +mla v10.4S, v8.4S, v31.s[0] +nop +nop +sqrdmulh v8.4S, v18.4S, v14.s[0] +mla v0.4S, v3.4S, v31.s[0] +nop +nop +mul v11.4S, v11.4S,v27.s[2] +mul v23.4S, v23.4S,v27.s[3] +sub v3.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +mla v11.4S, v12.4S, v31.s[0] +mla v23.4S, v25.4S, v31.s[0] +sub v25.4s, v13.4s, v2.4s +add v13.4s, v13.4s, v2.4s +mul v18.4S, v18.4S,v27.s[0] +mul v20.4S, v20.4S,v27.s[1] +sub v2.4s, v24.4s, v10.4s +add v24.4s, v24.4s, v10.4s +mla v18.4S, v8.4S, v31.s[0] +mla v20.4S, v22.4S, v31.s[0] +sub v22.4s, v17.4s, v0.4s +add v17.4s, v17.4s, v0.4s +sqrdmulh v0.4S, v3.4S, v9.s[3] +mul v3.4S, v3.4S,v1.s[3] +sub v8.4s, v16.4s, v23.4s +add v16.4s, v16.4s, v23.4s +sqrdmulh v23.4S, v6.4S, v9.s[2] +mul v6.4S, v6.4S,v1.s[2] +sub v10.4s, v15.4s, v11.4s +add v15.4s, v15.4s, v11.4s +sqrdmulh v11.4S, v25.4S, v9.s[1] +mul v25.4S, v25.4S,v1.s[1] +sub v12.4s, v26.4s, v20.4s +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v13.4S, v9.s[0] +mul v13.4S, v13.4S,v1.s[0] +sub v4.4s, v19.4s, v18.4s +add v19.4s, v19.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v7.s[3] +mla v3.4S, v0.4S, v31.s[0] +nop +nop +sqrdmulh v0.4S, v24.4S, v7.s[2] +mla v6.4S, v23.4S, v31.s[0] 
+sub v23.4s, v8.4s, v3.4s +str q23, [x0, #960] +sqrdmulh v23.4S, v22.4S, v7.s[1] +mla v25.4S, v11.4S, v31.s[0] +add v8.4s, v8.4s, v3.4s +str q8, [x0, #896] +sqrdmulh v8.4S, v17.4S, v7.s[0] +mla v13.4S, v20.4S, v31.s[0] +sub v20.4s, v16.4s, v6.4s +str q20, [x0, #832] +mul v24.4S, v24.4S,v28.s[2] +mul v2.4S, v2.4S,v28.s[3] +add v16.4s, v16.4s, v6.4s +sub v6.4s, v10.4s, v25.4s +mla v24.4S, v0.4S, v31.s[0] +mla v2.4S, v18.4S, v31.s[0] +add v10.4s, v10.4s, v25.4s +str q16, [x0, #768] +mul v17.4S, v17.4S,v28.s[0] +mul v22.4S, v22.4S,v28.s[1] +sub v16.4s, v15.4s, v13.4s +str q6, [x0, #704] +mla v17.4S, v8.4S, v31.s[0] +mla v22.4S, v23.4S, v31.s[0] +add v15.4s, v15.4s, v13.4s +str q10, [x0, #640] +ldr q10, [x0, #976] +sqrdmulh v13.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +str q16, [x0, #576] +sub v16.4s, v12.4s, v2.4s +ldr q23, [x0, #912] +sqrdmulh v8.4S, v23.4S, v29.s[0] +mul v23.4S, v23.4S,v30.s[0] +str q15, [x0, #512] +add v12.4s, v12.4s, v2.4s +ldr q2, [x0, #848] +sqrdmulh v15.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +str q16, [x0, #448] +sub v16.4s, v26.4s, v24.4s +ldr q6, [x0, #784] +sqrdmulh v25.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +str q12, [x0, #384] +add v26.4s, v26.4s, v24.4s +ldr q24, [x0, #720] +sqrdmulh v12.4S, v24.4S, v29.s[0] +mla v10.4S, v13.4S, v31.s[0] +str q16, [x0, #320] +sub v16.4s, v4.4s, v22.4s +ldr q13, [x0, #656] +sqrdmulh v18.4S, v13.4S, v29.s[0] +mla v23.4S, v8.4S, v31.s[0] +str q26, [x0, #256] +add v4.4s, v4.4s, v22.4s +ldr q22, [x0, #592] +sqrdmulh v26.4S, v22.4S, v29.s[0] +mla v2.4S, v15.4S, v31.s[0] +str q16, [x0, #192] +sub v16.4s, v19.4s, v17.4s +ldr q15, [x0, #528] +sqrdmulh v8.4S, v15.4S, v29.s[0] +mla v6.4S, v25.4S, v31.s[0] +str q4, [x0, #128] +add v19.4s, v19.4s, v17.4s +ldr q17, [x0, #464] +ldr q4, [x0, #400] +mul v13.4S, v13.4S,v30.s[0] +mul v24.4S, v24.4S,v30.s[0] +sub v25.4s, v17.4s, v10.4s +add v17.4s, v17.4s, v10.4s +ldr q10, [x0, #336] +ldr q0, [x0, #272] +mla v13.4S, v18.4S, v31.s[0] +mla v24.4S, 
v12.4S, v31.s[0] +sub v12.4s, v4.4s, v23.4s +add v4.4s, v4.4s, v23.4s +ldr q23, [x0, #208] +ldr q18, [x0, #144] +mul v15.4S, v15.4S,v30.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v20.4s, v10.4s, v2.4s +add v10.4s, v10.4s, v2.4s +ldr q2, [x0, #80] +ldr q3, [x0, #16] +mla v15.4S, v8.4S, v31.s[0] +mla v22.4S, v26.4S, v31.s[0] +sub v26.4s, v0.4s, v6.4s +add v0.4s, v0.4s, v6.4s +sqrdmulh v6.4S, v25.4S, v29.s[2] +mul v25.4S, v25.4S,v30.s[2] +sub v8.4s, v23.4s, v24.4s +nop +sqrdmulh v11.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +add v23.4s, v23.4s, v24.4s +nop +sqrdmulh v24.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v21.4s, v18.4s, v13.4s +add v18.4s, v18.4s, v13.4s +sqrdmulh v13.4S, v4.4S, v29.s[1] +mul v4.4S, v4.4S,v30.s[1] +sub v5.4s, v2.4s, v22.4s +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v20.4S, v29.s[2] +mla v25.4S, v6.4S, v31.s[0] +sub v6.4s, v3.4s, v15.4s +str q16, [x0, #64] +sqrdmulh v16.4S, v26.4S, v29.s[2] +mla v12.4S, v11.4S, v31.s[0] +add v3.4s, v3.4s, v15.4s +nop +sqrdmulh v15.4S, v10.4S, v29.s[1] +mla v17.4S, v24.4S, v31.s[0] +str q19, [x0, #0] +nop +sqrdmulh v19.4S, v0.4S, v29.s[1] +mla v4.4S, v13.4S, v31.s[0] +nop +nop +mul v26.4S, v26.4S,v30.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v13.4s, v8.4s, v25.4s +add v8.4s, v8.4s, v25.4s +mla v26.4S, v16.4S, v31.s[0] +mla v20.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +mul v0.4S, v0.4S,v30.s[1] +mul v10.4S, v10.4S,v30.s[1] +sub v12.4s, v23.4s, v17.4s +add v23.4s, v23.4s, v17.4s +mla v0.4S, v19.4S, v31.s[0] +mla v10.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v4.4s +add v18.4s, v18.4s, v4.4s +sqrdmulh v29.4S, v13.4S, v14.s[3] +mul v13.4S, v13.4S,v27.s[3] +sub v30.4s, v5.4s, v20.4s +add v5.4s, v5.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v14.s[2] +mul v8.4S, v8.4S,v27.s[2] +sub v4.4s, v6.4s, v26.4s +add v6.4s, v6.4s, v26.4s +sqrdmulh v26.4S, v12.4S, v14.s[1] +mul v12.4S, v12.4S,v27.s[1] +sub v19.4s, v2.4s, v10.4s +add v2.4s, v2.4s, v10.4s +sqrdmulh v10.4S, v23.4S, v14.s[0] 
+mul v23.4S, v23.4S,v27.s[0] +sub v17.4s, v3.4s, v0.4s +add v3.4s, v3.4s, v0.4s +sqrdmulh v0.4S, v22.4S, v14.s[3] +mla v13.4S, v29.4S, v31.s[0] +nop +nop +sqrdmulh v29.4S, v21.4S, v14.s[2] +mla v8.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v15.4S, v14.s[1] +mla v12.4S, v26.4S, v31.s[0] +nop +nop +sqrdmulh v26.4S, v18.4S, v14.s[0] +mla v23.4S, v10.4S, v31.s[0] +nop +nop +mul v21.4S, v21.4S,v27.s[2] +mul v22.4S, v22.4S,v27.s[3] +sub v10.4s, v30.4s, v13.4s +add v30.4s, v30.4s, v13.4s +mla v21.4S, v29.4S, v31.s[0] +mla v22.4S, v0.4S, v31.s[0] +sub v0.4s, v5.4s, v8.4s +add v5.4s, v5.4s, v8.4s +mul v18.4S, v18.4S,v27.s[0] +mul v15.4S, v15.4S,v27.s[1] +sub v8.4s, v19.4s, v12.4s +add v19.4s, v19.4s, v12.4s +mla v18.4S, v26.4S, v31.s[0] +mla v15.4S, v20.4S, v31.s[0] +sub v20.4s, v2.4s, v23.4s +add v2.4s, v2.4s, v23.4s +sqrdmulh v14.4S, v10.4S, v9.s[3] +mul v10.4S, v10.4S,v1.s[3] +sub v27.4s, v4.4s, v22.4s +add v4.4s, v4.4s, v22.4s +sqrdmulh v22.4S, v30.4S, v9.s[2] +mul v30.4S, v30.4S,v1.s[2] +sub v23.4s, v6.4s, v21.4s +add v6.4s, v6.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v9.s[1] +mul v0.4S, v0.4S,v1.s[1] +sub v26.4s, v17.4s, v15.4s +add v17.4s, v17.4s, v15.4s +sqrdmulh v15.4S, v5.4S, v9.s[0] +mul v5.4S, v5.4S,v1.s[0] +sub v12.4s, v3.4s, v18.4s +add v3.4s, v3.4s, v18.4s +sqrdmulh v9.4S, v8.4S, v7.s[3] +mla v10.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v19.4S, v7.s[2] +mla v30.4S, v22.4S, v31.s[0] +sub v22.4s, v27.4s, v10.4s +str q22, [x0, #976] +sqrdmulh v22.4S, v20.4S, v7.s[1] +mla v0.4S, v21.4S, v31.s[0] +add v27.4s, v27.4s, v10.4s +str q27, [x0, #912] +sqrdmulh v27.4S, v2.4S, v7.s[0] +mla v5.4S, v15.4S, v31.s[0] +sub v15.4s, v4.4s, v30.4s +str q15, [x0, #848] +mul v19.4S, v19.4S,v28.s[2] +mul v8.4S, v8.4S,v28.s[3] +add v4.4s, v4.4s, v30.4s +sub v30.4s, v23.4s, v0.4s +mla v19.4S, v14.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +add v23.4s, v23.4s, v0.4s +str q4, [x0, #784] +mul v2.4S, v2.4S,v28.s[0] +mul v20.4S, v20.4S,v28.s[1] +sub v4.4s, v6.4s, v5.4s +str q30, 
[x0, #720] +mla v2.4S, v27.4S, v31.s[0] +mla v20.4S, v22.4S, v31.s[0] +add v6.4s, v6.4s, v5.4s +str q23, [x0, #656] +str q4, [x0, #592] +sub v4.4s, v26.4s, v8.4s +str q6, [x0, #528] +add v26.4s, v26.4s, v8.4s +str q4, [x0, #464] +sub v4.4s, v17.4s, v19.4s +str q26, [x0, #400] +add v17.4s, v17.4s, v19.4s +str q4, [x0, #336] +sub v4.4s, v12.4s, v20.4s +str q17, [x0, #272] +add v12.4s, v12.4s, v20.4s +str q4, [x0, #208] +sub v4.4s, v3.4s, v2.4s +str q12, [x0, #144] +add v3.4s, v3.4s, v2.4s +str q4, [x0, #80] +str q3, [x0, #16] +ldr q11, [x0, #224] +ldr q24, [x0, #160] +ldr q25, [x0, #32] +ldr q16, [x17, #+128] +ldr q13, [x17, #+144] +sqrdmulh v29.4S, v25.4S, v13.s[0] +mul v25.4S, v25.4S,v16.s[0] +ldr q18, [x0, #48] +sqrdmulh v1.4S, v18.4S, v13.s[0] +mul v18.4S, v18.4S,v16.s[0] +ldr q21, [x17, #+160] +ldr q10, [x17, #+176] +ldr q15, [x0, #96] +sqrdmulh v14.4S, v15.4S, v10.s[0] +mul v15.4S, v15.4S,v21.s[0] +ldr q9, [x0, #112] +sqrdmulh v0.4S, v9.4S, v10.s[0] +mul v9.4S, v9.4S,v21.s[0] +ldr q30, [x17, #+192] +ldr q27, [x17, #+208] +mla v25.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v24.4S, v27.s[0] +ldr q22, [x0, #176] +mla v18.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v22.4S, v27.s[0] +ldr q5, [x17, #+224] +ldr q23, [x17, #+240] +mla v15.4S, v14.4S, v31.s[0] +sqrdmulh v14.4S, v11.4S, v23.s[0] +ldr q28, [x0, #240] +mla v9.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v28.4S, v23.s[0] +ldr q7, [x0, #0] +ldr q6, [x0, #128] +mul v24.4S, v24.4S,v30.s[0] +sub v8.4s, v7.4s, v25.4s +ldr q26, [x0, #16] +mul v22.4S, v22.4S,v30.s[0] +add v7.4s, v7.4s, v25.4s +ldr q25, [x0, #144] +mla v24.4S, v29.4S, v31.s[0] +sub v29.4s, v26.4s, v18.4s +ldr q19, [x0, #64] +mla v22.4S, v1.4S, v31.s[0] +add v26.4s, v26.4s, v18.4s +ldr q18, [x0, #192] +mul v11.4S, v11.4S,v5.s[0] +sub v1.4s, v19.4s, v15.4s +ldr q17, [x0, #80] +mul v28.4S, v28.4S,v5.s[0] +add v19.4s, v19.4s, v15.4s +ldr q15, [x0, #208] +mla v11.4S, v14.4S, v31.s[0] +mla v28.4S, v0.4S, v31.s[0] +sub v0.4s, v17.4s, v9.4s +sqrdmulh v14.4S, v26.4S, v13.s[1] 
+add v17.4s, v17.4s, v9.4s +mul v26.4S, v26.4S,v16.s[1] +sqrdmulh v9.4S, v29.4S, v13.s[2] +sub v20.4s, v6.4s, v24.4s +mul v29.4S, v29.4S,v16.s[2] +add v6.4s, v6.4s, v24.4s +sqrdmulh v13.4S, v17.4S, v10.s[1] +sub v16.4s, v25.4s, v22.4s +mul v17.4S, v17.4S,v21.s[1] +add v25.4s, v25.4s, v22.4s +sqrdmulh v22.4S, v0.4S, v10.s[2] +sub v24.4s, v18.4s, v11.4s +mul v0.4S, v0.4S,v21.s[2] +add v18.4s, v18.4s, v11.4s +mla v26.4S, v14.4S, v31.s[0] +sub v14.4s, v15.4s, v28.4s +ldr q10, [x0, #480] +sqrdmulh v21.4S, v25.4S, v27.s[1] +add v15.4s, v15.4s, v28.4s +mla v29.4S, v9.4S, v31.s[0] +ldr q9, [x0, #416] +sqrdmulh v28.4S, v16.4S, v27.s[2] +sub v11.4s, v7.4s, v26.4s +mla v17.4S, v13.4S, v31.s[0] +ldr q13, [x0, #288] +sqrdmulh v12.4S, v15.4S, v23.s[1] +add v7.4s, v7.4s, v26.4s +str q11, [x0, #16] +mla v0.4S, v22.4S, v31.s[0] +ldr q22, [x17, #+256] +ldr q11, [x17, #+272] +sqrdmulh v26.4S, v14.4S, v23.s[2] +sub v2.4s, v8.4s, v29.4s +str q7, [x0, #0] +mul v25.4S, v25.4S,v30.s[1] +add v8.4s, v8.4s, v29.4s +mul v16.4S, v16.4S,v30.s[2] +str q2, [x0, #48] +mla v25.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v17.4s +mla v16.4S, v28.4S, v31.s[0] +str q8, [x0, #32] +mul v15.4S, v15.4S,v5.s[1] +str q21, [x0, #80] +mul v14.4S, v14.4S,v5.s[2] +add v19.4s, v19.4s, v17.4s +str q19, [x0, #64] +mla v15.4S, v12.4S, v31.s[0] +sub v12.4s, v1.4s, v0.4s +str q12, [x0, #112] +mla v14.4S, v26.4S, v31.s[0] +add v1.4s, v1.4s, v0.4s +str q1, [x0, #96] +sqrdmulh v23.4S, v13.4S, v11.s[0] +sub v5.4s, v6.4s, v25.4s +mul v13.4S, v13.4S,v22.s[0] +str q5, [x0, #144] +ldr q5, [x0, #304] +sqrdmulh v1.4S, v5.4S, v11.s[0] +add v6.4s, v6.4s, v25.4s +mul v5.4S, v5.4S,v22.s[0] +str q6, [x0, #128] +ldr q6, [x17, #+288] +ldr q25, [x17, #+304] +ldr q0, [x0, #352] +sqrdmulh v26.4S, v0.4S, v25.s[0] +sub v12.4s, v20.4s, v16.4s +mul v0.4S, v0.4S,v6.s[0] +str q12, [x0, #176] +ldr q12, [x0, #368] +sqrdmulh v19.4S, v12.4S, v25.s[0] +add v20.4s, v20.4s, v16.4s +mul v12.4S, v12.4S,v6.s[0] +str q20, [x0, #160] +ldr q20, [x17, #+320] 
+ldr q16, [x17, #+336] +mla v13.4S, v23.4S, v31.s[0] +sub v23.4s, v18.4s, v15.4s +sqrdmulh v17.4S, v9.4S, v16.s[0] +str q23, [x0, #208] +ldr q23, [x0, #432] +mla v5.4S, v1.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v23.4S, v16.s[0] +str q18, [x0, #192] +ldr q18, [x17, #+352] +ldr q1, [x17, #+368] +mla v0.4S, v26.4S, v31.s[0] +sub v26.4s, v24.4s, v14.4s +sqrdmulh v21.4S, v10.4S, v1.s[0] +str q26, [x0, #240] +ldr q26, [x0, #496] +mla v12.4S, v19.4S, v31.s[0] +add v24.4s, v24.4s, v14.4s +sqrdmulh v14.4S, v26.4S, v1.s[0] +str q24, [x0, #224] +ldr q24, [x0, #256] +ldr q19, [x0, #384] +mul v9.4S, v9.4S,v20.s[0] +sub v27.4s, v24.4s, v13.4s +ldr q30, [x0, #272] +mul v23.4S, v23.4S,v20.s[0] +add v24.4s, v24.4s, v13.4s +ldr q13, [x0, #400] +mla v9.4S, v17.4S, v31.s[0] +sub v17.4s, v30.4s, v5.4s +ldr q8, [x0, #320] +mla v23.4S, v15.4S, v31.s[0] +add v30.4s, v30.4s, v5.4s +ldr q5, [x0, #448] +mul v10.4S, v10.4S,v18.s[0] +sub v15.4s, v8.4s, v0.4s +ldr q28, [x0, #336] +mul v26.4S, v26.4S,v18.s[0] +add v8.4s, v8.4s, v0.4s +ldr q0, [x0, #464] +mla v10.4S, v21.4S, v31.s[0] +mla v26.4S, v14.4S, v31.s[0] +sub v14.4s, v28.4s, v12.4s +sqrdmulh v21.4S, v30.4S, v11.s[1] +add v28.4s, v28.4s, v12.4s +mul v30.4S, v30.4S,v22.s[1] +sqrdmulh v12.4S, v17.4S, v11.s[2] +sub v2.4s, v19.4s, v9.4s +mul v17.4S, v17.4S,v22.s[2] +add v19.4s, v19.4s, v9.4s +sqrdmulh v11.4S, v28.4S, v25.s[1] +sub v22.4s, v13.4s, v23.4s +mul v28.4S, v28.4S,v6.s[1] +add v13.4s, v13.4s, v23.4s +sqrdmulh v23.4S, v14.4S, v25.s[2] +sub v9.4s, v5.4s, v10.4s +mul v14.4S, v14.4S,v6.s[2] +add v5.4s, v5.4s, v10.4s +mla v30.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v26.4s +ldr q25, [x0, #736] +sqrdmulh v6.4S, v13.4S, v16.s[1] +add v0.4s, v0.4s, v26.4s +mla v17.4S, v12.4S, v31.s[0] +ldr q12, [x0, #672] +sqrdmulh v26.4S, v22.4S, v16.s[2] +sub v10.4s, v24.4s, v30.4s +mla v28.4S, v11.4S, v31.s[0] +ldr q11, [x0, #544] +sqrdmulh v29.4S, v0.4S, v1.s[1] +add v24.4s, v24.4s, v30.4s +str q10, [x0, #272] +mla v14.4S, v23.4S, 
v31.s[0] +ldr q23, [x17, #+384] +ldr q10, [x17, #+400] +sqrdmulh v30.4S, v21.4S, v1.s[2] +sub v7.4s, v27.4s, v17.4s +str q24, [x0, #256] +mul v13.4S, v13.4S,v20.s[1] +add v27.4s, v27.4s, v17.4s +mul v22.4S, v22.4S,v20.s[2] +str q7, [x0, #304] +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v28.4s +mla v22.4S, v26.4S, v31.s[0] +str q27, [x0, #288] +mul v0.4S, v0.4S,v18.s[1] +str q6, [x0, #336] +mul v21.4S, v21.4S,v18.s[2] +add v8.4s, v8.4s, v28.4s +str q8, [x0, #320] +mla v0.4S, v29.4S, v31.s[0] +sub v29.4s, v15.4s, v14.4s +str q29, [x0, #368] +mla v21.4S, v30.4S, v31.s[0] +add v15.4s, v15.4s, v14.4s +str q15, [x0, #352] +sqrdmulh v1.4S, v11.4S, v10.s[0] +sub v18.4s, v19.4s, v13.4s +mul v11.4S, v11.4S,v23.s[0] +str q18, [x0, #400] +ldr q18, [x0, #560] +sqrdmulh v15.4S, v18.4S, v10.s[0] +add v19.4s, v19.4s, v13.4s +mul v18.4S, v18.4S,v23.s[0] +str q19, [x0, #384] +ldr q19, [x17, #+416] +ldr q13, [x17, #+432] +ldr q14, [x0, #608] +sqrdmulh v30.4S, v14.4S, v13.s[0] +sub v29.4s, v2.4s, v22.4s +mul v14.4S, v14.4S,v19.s[0] +str q29, [x0, #432] +ldr q29, [x0, #624] +sqrdmulh v8.4S, v29.4S, v13.s[0] +add v2.4s, v2.4s, v22.4s +mul v29.4S, v29.4S,v19.s[0] +str q2, [x0, #416] +ldr q2, [x17, #+448] +ldr q22, [x17, #+464] +mla v11.4S, v1.4S, v31.s[0] +sub v1.4s, v5.4s, v0.4s +sqrdmulh v28.4S, v12.4S, v22.s[0] +str q1, [x0, #464] +ldr q1, [x0, #688] +mla v18.4S, v15.4S, v31.s[0] +add v5.4s, v5.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v22.s[0] +str q5, [x0, #448] +ldr q5, [x17, #+480] +ldr q15, [x17, #+496] +mla v14.4S, v30.4S, v31.s[0] +sub v30.4s, v9.4s, v21.4s +sqrdmulh v6.4S, v25.4S, v15.s[0] +str q30, [x0, #496] +ldr q30, [x0, #752] +mla v29.4S, v8.4S, v31.s[0] +add v9.4s, v9.4s, v21.4s +sqrdmulh v21.4S, v30.4S, v15.s[0] +str q9, [x0, #480] +ldr q9, [x0, #512] +ldr q8, [x0, #640] +mul v12.4S, v12.4S,v2.s[0] +sub v16.4s, v9.4s, v11.4s +ldr q20, [x0, #528] +mul v1.4S, v1.4S,v2.s[0] +add v9.4s, v9.4s, v11.4s +ldr q11, [x0, #656] +mla v12.4S, v28.4S, v31.s[0] +sub v28.4s, v20.4s, v18.4s 
+ldr q27, [x0, #576] +mla v1.4S, v0.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +ldr q18, [x0, #704] +mul v25.4S, v25.4S,v5.s[0] +sub v0.4s, v27.4s, v14.4s +ldr q26, [x0, #592] +mul v30.4S, v30.4S,v5.s[0] +add v27.4s, v27.4s, v14.4s +ldr q14, [x0, #720] +mla v25.4S, v6.4S, v31.s[0] +mla v30.4S, v21.4S, v31.s[0] +sub v21.4s, v26.4s, v29.4s +sqrdmulh v6.4S, v20.4S, v10.s[1] +add v26.4s, v26.4s, v29.4s +mul v20.4S, v20.4S,v23.s[1] +sqrdmulh v29.4S, v28.4S, v10.s[2] +sub v7.4s, v8.4s, v12.4s +mul v28.4S, v28.4S,v23.s[2] +add v8.4s, v8.4s, v12.4s +sqrdmulh v10.4S, v26.4S, v13.s[1] +sub v23.4s, v11.4s, v1.4s +mul v26.4S, v26.4S,v19.s[1] +add v11.4s, v11.4s, v1.4s +sqrdmulh v1.4S, v21.4S, v13.s[2] +sub v12.4s, v18.4s, v25.4s +mul v21.4S, v21.4S,v19.s[2] +add v18.4s, v18.4s, v25.4s +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v14.4s, v30.4s +ldr q13, [x0, #992] +sqrdmulh v19.4S, v11.4S, v22.s[1] +add v14.4s, v14.4s, v30.4s +mla v28.4S, v29.4S, v31.s[0] +ldr q29, [x0, #928] +sqrdmulh v30.4S, v23.4S, v22.s[2] +sub v25.4s, v9.4s, v20.4s +mla v26.4S, v10.4S, v31.s[0] +ldr q10, [x0, #800] +sqrdmulh v17.4S, v14.4S, v15.s[1] +add v9.4s, v9.4s, v20.4s +str q25, [x0, #528] +mla v21.4S, v1.4S, v31.s[0] +ldr q1, [x17, #+512] +ldr q25, [x17, #+528] +sqrdmulh v20.4S, v6.4S, v15.s[2] +sub v24.4s, v16.4s, v28.4s +str q9, [x0, #512] +mul v11.4S, v11.4S,v2.s[1] +add v16.4s, v16.4s, v28.4s +mul v23.4S, v23.4S,v2.s[2] +str q24, [x0, #560] +mla v11.4S, v19.4S, v31.s[0] +sub v19.4s, v27.4s, v26.4s +mla v23.4S, v30.4S, v31.s[0] +str q16, [x0, #544] +mul v14.4S, v14.4S,v5.s[1] +str q19, [x0, #592] +mul v6.4S, v6.4S,v5.s[2] +add v27.4s, v27.4s, v26.4s +str q27, [x0, #576] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v21.4s +str q17, [x0, #624] +mla v6.4S, v20.4S, v31.s[0] +add v0.4s, v0.4s, v21.4s +str q0, [x0, #608] +sqrdmulh v15.4S, v10.4S, v25.s[0] +sub v5.4s, v8.4s, v11.4s +mul v10.4S, v10.4S,v1.s[0] +str q5, [x0, #656] +ldr q5, [x0, #816] +sqrdmulh v0.4S, v5.4S, v25.s[0] +add v8.4s, v8.4s, 
v11.4s +mul v5.4S, v5.4S,v1.s[0] +str q8, [x0, #640] +ldr q8, [x17, #+544] +ldr q11, [x17, #+560] +ldr q21, [x0, #864] +sqrdmulh v20.4S, v21.4S, v11.s[0] +sub v17.4s, v7.4s, v23.4s +mul v21.4S, v21.4S,v8.s[0] +str q17, [x0, #688] +ldr q17, [x0, #880] +sqrdmulh v27.4S, v17.4S, v11.s[0] +add v7.4s, v7.4s, v23.4s +mul v17.4S, v17.4S,v8.s[0] +str q7, [x0, #672] +ldr q7, [x17, #+576] +ldr q23, [x17, #+592] +mla v10.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v14.4s +sqrdmulh v26.4S, v29.4S, v23.s[0] +str q15, [x0, #720] +ldr q15, [x0, #944] +mla v5.4S, v0.4S, v31.s[0] +add v18.4s, v18.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v23.s[0] +str q18, [x0, #704] +ldr q18, [x17, #+608] +ldr q0, [x17, #+624] +mla v21.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v6.4s +sqrdmulh v19.4S, v13.4S, v0.s[0] +str q20, [x0, #752] +ldr q20, [x0, #1008] +mla v17.4S, v27.4S, v31.s[0] +add v12.4s, v12.4s, v6.4s +sqrdmulh v6.4S, v20.4S, v0.s[0] +str q12, [x0, #736] +ldr q12, [x0, #768] +ldr q27, [x0, #896] +mul v29.4S, v29.4S,v7.s[0] +sub v22.4s, v12.4s, v10.4s +ldr q2, [x0, #784] +mul v15.4S, v15.4S,v7.s[0] +add v12.4s, v12.4s, v10.4s +ldr q10, [x0, #912] +mla v29.4S, v26.4S, v31.s[0] +sub v26.4s, v2.4s, v5.4s +ldr q16, [x0, #832] +mla v15.4S, v14.4S, v31.s[0] +add v2.4s, v2.4s, v5.4s +ldr q5, [x0, #960] +mul v13.4S, v13.4S,v18.s[0] +sub v14.4s, v16.4s, v21.4s +ldr q30, [x0, #848] +mul v20.4S, v20.4S,v18.s[0] +add v16.4s, v16.4s, v21.4s +ldr q21, [x0, #976] +mla v13.4S, v19.4S, v31.s[0] +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v30.4s, v17.4s +sqrdmulh v19.4S, v2.4S, v25.s[1] +add v30.4s, v30.4s, v17.4s +mul v2.4S, v2.4S,v1.s[1] +sqrdmulh v17.4S, v26.4S, v25.s[2] +sub v24.4s, v27.4s, v29.4s +mul v26.4S, v26.4S,v1.s[2] +add v27.4s, v27.4s, v29.4s +sqrdmulh v25.4S, v30.4S, v11.s[1] +sub v1.4s, v10.4s, v15.4s +mul v30.4S, v30.4S,v8.s[1] +add v10.4s, v10.4s, v15.4s +sqrdmulh v15.4S, v6.4S, v11.s[2] +sub v29.4s, v5.4s, v13.4s +mul v6.4S, v6.4S,v8.s[2] +add v5.4s, v5.4s, v13.4s +mla v2.4S, v19.4S, v31.s[0] 
+sub v19.4s, v21.4s, v20.4s +sqrdmulh v11.4S, v10.4S, v23.s[1] +add v21.4s, v21.4s, v20.4s +mla v26.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v1.4S, v23.s[2] +sub v20.4s, v12.4s, v2.4s +mla v30.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v21.4S, v0.s[1] +add v12.4s, v12.4s, v2.4s +str q20, [x0, #784] +mla v6.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v19.4S, v0.s[2] +sub v20.4s, v22.4s, v26.4s +str q12, [x0, #768] +mul v10.4S, v10.4S,v7.s[1] +add v22.4s, v22.4s, v26.4s +mul v1.4S, v1.4S,v7.s[2] +str q20, [x0, #816] +mla v10.4S, v11.4S, v31.s[0] +sub v11.4s, v16.4s, v30.4s +mla v1.4S, v17.4S, v31.s[0] +str q22, [x0, #800] +mul v21.4S, v21.4S,v18.s[1] +str q11, [x0, #848] +mul v19.4S, v19.4S,v18.s[2] +add v16.4s, v16.4s, v30.4s +str q16, [x0, #832] +mla v21.4S, v25.4S, v31.s[0] +sub v25.4s, v14.4s, v6.4s +str q25, [x0, #880] +mla v19.4S, v15.4S, v31.s[0] +add v14.4s, v14.4s, v6.4s +str q14, [x0, #864] +sub v0.4s, v27.4s, v10.4s +str q0, [x0, #912] +add v27.4s, v27.4s, v10.4s +str q27, [x0, #896] +sub v27.4s, v24.4s, v1.4s +str q27, [x0, #944] +add v24.4s, v24.4s, v1.4s +str q24, [x0, #928] +sub v24.4s, v5.4s, v21.4s +str q24, [x0, #976] +add v5.4s, v5.4s, v21.4s +str q5, [x0, #960] +sub v5.4s, v29.4s, v19.4s +str q5, [x0, #1008] +add v29.4s, v29.4s, v19.4s +str q29, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1528 +// Instruction count: 1524 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_20_z4_7.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_20_z4_7.s new file mode 100644 index 0000000..203055c --- /dev/null +++ 
b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_20_z4_7.s @@ -0,0 +1,1558 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// +#include +modulus: // 128-bit constant that the routine below loads into q31 and uses as the v31.s[0] multiplier of every mla; word 0 is -33556993 (stored negated), words 1-3 are zero +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 // start the table on a 2^6 = 64-byte boundary +roots_merged: // NTT root table, read through x17 one 16-byte row (four words) at a time. Rows come in pairs: the first row of a pair feeds mul, the second row feeds sqrdmulh for the same layer/block (presumably round(root * 2^31 / 33556993) - TODO confirm against the generator). Words tagged "Layer None" are zero filler in rows that carry only three roots. +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global
ntt_u32_incomplete_neon_asm_var_4_2_20_z4_7 +.global _ntt_u32_incomplete_neon_asm_var_4_2_20_z4_7 +ntt_u32_incomplete_neon_asm_var_4_2_20_z4_7: +_ntt_u32_incomplete_neon_asm_var_4_2_20_z4_7: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x0, #992] +sqrdmulh v27.4S, v28.4S, v29.s[0] +mul v28.4S, v28.4S,v30.s[0] +ldr q26, [x0, #928] +sqrdmulh v25.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +ldr q24, [x0, #864] +sqrdmulh v23.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +ldr q22, [x0, #800] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #736] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mla v28.4S, v27.4S, v31.s[0] +ldr q27, [x0, #672] +sqrdmulh v18.4S, v27.4S, v29.s[0] +mla v26.4S, v25.4S, v31.s[0] +ldr q25, [x0, #608] +sqrdmulh v17.4S, v25.4S, v29.s[0] +mla v24.4S, v23.4S, v31.s[0] +ldr q23, [x0, #544] +sqrdmulh v16.4S, v23.4S, v29.s[0] +mla v22.4S, v21.4S, v31.s[0] +ldr q21, [x0, #480] +ldr q3, [x0, #416] +mul v27.4S, v27.4S,v30.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q2, [x0, #352] +ldr q1, [x0, #288] +mla v27.4S, v18.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +ldr q19, [x0, #224] +ldr q18, [x0, #160] +mul v23.4S, v23.4S,v30.s[0] +mul v25.4S, v25.4S,v30.s[0] +ldr q0, [x0, #96] +ldr q15, [x0, #32] +mla v23.4S, v16.4S, v31.s[0] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +sqrdmulh v28.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v16.4s, v3.4s, v26.4s +add v3.4s, v3.4s, v26.4s +sqrdmulh v26.4S, v16.4S, v29.s[2] +mul v16.4S, 
v16.4S,v30.s[2] +sub v14.4s, v2.4s, v24.4s +add v2.4s, v2.4s, v24.4s +sqrdmulh v24.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v13.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v12.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +sqrdmulh v20.4S, v14.4S, v29.s[2] +mla v17.4S, v28.4S, v31.s[0] +sub v28.4s, v18.4s, v27.4s +add v18.4s, v18.4s, v27.4s +sqrdmulh v27.4S, v13.4S, v29.s[2] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v0.4s, v25.4s +add v0.4s, v0.4s, v25.4s +sqrdmulh v25.4S, v2.4S, v29.s[1] +mla v21.4S, v24.4S, v31.s[0] +sub v24.4s, v15.4s, v23.4s +sqrdmulh v11.4S, v1.4S, v29.s[1] +mla v3.4S, v22.4S, v31.s[0] +add v15.4s, v15.4s, v23.4s +ldr q23, [x17, #+32] +ldr q22, [x17, #+48] +mul v13.4S, v13.4S,v30.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v10.4s, v12.4s, v17.4s +add v12.4s, v12.4s, v17.4s +mla v13.4S, v27.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +sub v20.4s, v28.4s, v16.4s +add v28.4s, v28.4s, v16.4s +mul v1.4S, v1.4S,v30.s[1] +mul v2.4S, v2.4S,v30.s[1] +sub v16.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +mla v1.4S, v11.4S, v31.s[0] +mla v2.4S, v25.4S, v31.s[0] +sub v25.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v22.s[3] +mul v10.4S, v10.4S,v23.s[3] +sub v11.4s, v26.4s, v14.4s +add v26.4s, v26.4s, v14.4s +sqrdmulh v14.4S, v12.4S, v22.s[2] +mul v12.4S, v12.4S,v23.s[2] +sub v21.4s, v24.4s, v13.4s +add v24.4s, v24.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v22.s[1] +mul v16.4S, v16.4S,v23.s[1] +sub v27.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v17.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s +ldr q1, [x17, #+96] +ldr q9, [x17, #+112] +sqrdmulh v8.4S, v20.4S, v22.s[3] +mla v10.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v28.4S, v22.s[2] +mla v12.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v25.4S, v22.s[1] +mla v16.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v18.4S, v22.s[0] +mla 
v19.4S, v2.4S, v31.s[0] +nop +nop +ldr q2, [x17, #+64] +ldr q7, [x17, #+80] +mul v28.4S, v28.4S,v23.s[2] +mul v20.4S, v20.4S,v23.s[3] +sub v6.4s, v11.4s, v10.4s +add v11.4s, v11.4s, v10.4s +mla v28.4S, v3.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v26.4s, v12.4s +add v26.4s, v26.4s, v12.4s +mul v18.4S, v18.4S,v23.s[0] +mul v25.4S, v25.4S,v23.s[1] +sub v12.4s, v27.4s, v16.4s +add v27.4s, v27.4s, v16.4s +mla v18.4S, v13.4S, v31.s[0] +mla v25.4S, v14.4S, v31.s[0] +sub v14.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v9.s[3] +mul v6.4S, v6.4S,v1.s[3] +sub v13.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v11.4S, v9.s[2] +mul v11.4S, v11.4S,v1.s[2] +sub v16.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +sqrdmulh v28.4S, v8.4S, v9.s[1] +mul v8.4S, v8.4S,v1.s[1] +sub v3.4s, v17.4s, v25.4s +add v17.4s, v17.4s, v25.4s +sqrdmulh v25.4S, v26.4S, v9.s[0] +mul v26.4S, v26.4S,v1.s[0] +sub v10.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v7.s[3] +mla v6.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v27.4S, v7.s[2] +mla v11.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v14.4S, v7.s[1] +mla v8.4S, v28.4S, v31.s[0] +nop +nop +sqrdmulh v28.4S, v0.4S, v7.s[0] +mla v26.4S, v25.4S, v31.s[0] +nop +nop +mul v27.4S, v27.4S,v2.s[2] +mul v12.4S, v12.4S,v2.s[3] +sub v25.4s, v13.4s, v6.4s +str q25, [x0, #992] +mla v27.4S, v19.4S, v31.s[0] +mla v12.4S, v18.4S, v31.s[0] +add v13.4s, v13.4s, v6.4s +str q13, [x0, #928] +mul v0.4S, v0.4S,v2.s[0] +mul v14.4S, v14.4S,v2.s[1] +sub v13.4s, v21.4s, v11.4s +str q13, [x0, #864] +mla v0.4S, v28.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sub v11.4s, v16.4s, v8.4s +ldr q20, [x0, #1008] +sqrdmulh v28.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v8.4s +str q21, [x0, #800] +ldr q21, [x0, #944] +sqrdmulh v8.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v13.4s, v24.4s, v26.4s +str q11, [x0, #736] +ldr q11, [x0, 
#880] +sqrdmulh v6.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +add v24.4s, v24.4s, v26.4s +str q16, [x0, #672] +ldr q16, [x0, #816] +sqrdmulh v26.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +str q13, [x0, #608] +sub v13.4s, v3.4s, v12.4s +ldr q18, [x0, #752] +sqrdmulh v19.4S, v18.4S, v29.s[0] +mla v20.4S, v28.4S, v31.s[0] +str q24, [x0, #544] +add v3.4s, v3.4s, v12.4s +ldr q12, [x0, #688] +sqrdmulh v24.4S, v12.4S, v29.s[0] +mla v21.4S, v8.4S, v31.s[0] +str q13, [x0, #480] +sub v13.4s, v17.4s, v27.4s +ldr q8, [x0, #624] +sqrdmulh v28.4S, v8.4S, v29.s[0] +mla v11.4S, v6.4S, v31.s[0] +str q3, [x0, #416] +add v17.4s, v17.4s, v27.4s +ldr q27, [x0, #560] +sqrdmulh v3.4S, v27.4S, v29.s[0] +mla v16.4S, v26.4S, v31.s[0] +str q13, [x0, #352] +sub v13.4s, v10.4s, v14.4s +ldr q26, [x0, #496] +ldr q6, [x0, #432] +mul v12.4S, v12.4S,v30.s[0] +mul v18.4S, v18.4S,v30.s[0] +str q17, [x0, #288] +add v10.4s, v10.4s, v14.4s +ldr q14, [x0, #368] +ldr q17, [x0, #304] +mla v12.4S, v24.4S, v31.s[0] +mla v18.4S, v19.4S, v31.s[0] +str q13, [x0, #224] +sub v13.4s, v15.4s, v0.4s +ldr q19, [x0, #240] +ldr q24, [x0, #176] +mul v27.4S, v27.4S,v30.s[0] +mul v8.4S, v8.4S,v30.s[0] +str q10, [x0, #160] +add v15.4s, v15.4s, v0.4s +ldr q0, [x0, #112] +ldr q10, [x0, #48] +mla v27.4S, v3.4S, v31.s[0] +mla v8.4S, v28.4S, v31.s[0] +sub v28.4s, v26.4s, v20.4s +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v28.4S, v29.s[2] +mul v28.4S, v28.4S,v30.s[2] +sub v3.4s, v6.4s, v21.4s +add v6.4s, v6.4s, v21.4s +sqrdmulh v21.4S, v3.4S, v29.s[2] +mul v3.4S, v3.4S,v30.s[2] +sub v25.4s, v14.4s, v11.4s +add v14.4s, v14.4s, v11.4s +sqrdmulh v11.4S, v26.4S, v29.s[1] +mul v26.4S, v26.4S,v30.s[1] +sub v5.4s, v17.4s, v16.4s +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +sub v4.4s, v19.4s, v18.4s +add v19.4s, v19.4s, v18.4s +sqrdmulh v18.4S, v25.4S, v29.s[2] +mla v28.4S, v20.4S, v31.s[0] +sub v20.4s, v24.4s, v12.4s +add v24.4s, v24.4s, v12.4s +sqrdmulh v12.4S, v5.4S, 
v29.s[2] +mla v3.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v8.4s +add v0.4s, v0.4s, v8.4s +sqrdmulh v8.4S, v14.4S, v29.s[1] +mla v26.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v27.4s +str q13, [x0, #96] +sqrdmulh v13.4S, v17.4S, v29.s[1] +mla v6.4S, v16.4S, v31.s[0] +add v10.4s, v10.4s, v27.4s +str q15, [x0, #32] +mul v5.4S, v5.4S,v30.s[2] +mul v25.4S, v25.4S,v30.s[2] +sub v15.4s, v4.4s, v28.4s +add v4.4s, v4.4s, v28.4s +mla v5.4S, v12.4S, v31.s[0] +mla v25.4S, v18.4S, v31.s[0] +sub v18.4s, v20.4s, v3.4s +add v20.4s, v20.4s, v3.4s +mul v17.4S, v17.4S,v30.s[1] +mul v14.4S, v14.4S,v30.s[1] +sub v3.4s, v19.4s, v26.4s +add v19.4s, v19.4s, v26.4s +mla v17.4S, v13.4S, v31.s[0] +mla v14.4S, v8.4S, v31.s[0] +sub v8.4s, v24.4s, v6.4s +add v24.4s, v24.4s, v6.4s +sqrdmulh v6.4S, v15.4S, v22.s[3] +mul v15.4S, v15.4S,v23.s[3] +sub v13.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v4.4S, v22.s[2] +mul v4.4S, v4.4S,v23.s[2] +sub v26.4s, v11.4s, v5.4s +add v11.4s, v11.4s, v5.4s +sqrdmulh v5.4S, v3.4S, v22.s[1] +mul v3.4S, v3.4S,v23.s[1] +sub v12.4s, v0.4s, v14.4s +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v28.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v18.4S, v22.s[3] +mla v15.4S, v6.4S, v31.s[0] +nop +nop +sqrdmulh v6.4S, v20.4S, v22.s[2] +mla v4.4S, v25.4S, v31.s[0] +nop +nop +sqrdmulh v25.4S, v8.4S, v22.s[1] +mla v3.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v24.4S, v22.s[0] +mla v19.4S, v14.4S, v31.s[0] +nop +nop +mul v20.4S, v20.4S,v23.s[2] +mul v18.4S, v18.4S,v23.s[3] +sub v14.4s, v13.4s, v15.4s +add v13.4s, v13.4s, v15.4s +mla v20.4S, v6.4S, v31.s[0] +mla v18.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v4.4s +add v21.4s, v21.4s, v4.4s +mul v24.4S, v24.4S,v23.s[0] +mul v8.4S, v8.4S,v23.s[1] +sub v4.4s, v12.4s, v3.4s +add v12.4s, v12.4s, v3.4s +mla v24.4S, v5.4S, v31.s[0] +mla v8.4S, v25.4S, v31.s[0] +sub v25.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v14.4S, 
v9.s[3] +mul v14.4S, v14.4S,v1.s[3] +sub v5.4s, v26.4s, v18.4s +add v26.4s, v26.4s, v18.4s +sqrdmulh v18.4S, v13.4S, v9.s[2] +mul v13.4S, v13.4S,v1.s[2] +sub v3.4s, v11.4s, v20.4s +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v9.s[1] +mul v17.4S, v17.4S,v1.s[1] +sub v6.4s, v28.4s, v8.4s +add v28.4s, v28.4s, v8.4s +sqrdmulh v8.4S, v21.4S, v9.s[0] +mul v21.4S, v21.4S,v1.s[0] +sub v15.4s, v10.4s, v24.4s +add v10.4s, v10.4s, v24.4s +sqrdmulh v24.4S, v4.4S, v7.s[3] +mla v14.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v12.4S, v7.s[2] +mla v13.4S, v18.4S, v31.s[0] +nop +nop +sqrdmulh v18.4S, v25.4S, v7.s[1] +mla v17.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v0.4S, v7.s[0] +mla v21.4S, v8.4S, v31.s[0] +nop +nop +mul v12.4S, v12.4S,v2.s[2] +mul v4.4S, v4.4S,v2.s[3] +sub v8.4s, v5.4s, v14.4s +str q8, [x0, #1008] +mla v12.4S, v19.4S, v31.s[0] +mla v4.4S, v24.4S, v31.s[0] +add v5.4s, v5.4s, v14.4s +str q5, [x0, #944] +mul v0.4S, v0.4S,v2.s[0] +mul v25.4S, v25.4S,v2.s[1] +sub v5.4s, v26.4s, v13.4s +str q5, [x0, #880] +mla v0.4S, v20.4S, v31.s[0] +mla v25.4S, v18.4S, v31.s[0] +add v26.4s, v26.4s, v13.4s +sub v13.4s, v3.4s, v17.4s +ldr q18, [x0, #960] +sqrdmulh v20.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +add v3.4s, v3.4s, v17.4s +str q26, [x0, #816] +ldr q26, [x0, #896] +sqrdmulh v17.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +sub v5.4s, v11.4s, v21.4s +str q13, [x0, #752] +ldr q13, [x0, #832] +sqrdmulh v14.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +add v11.4s, v11.4s, v21.4s +str q3, [x0, #688] +ldr q3, [x0, #768] +sqrdmulh v21.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +str q5, [x0, #624] +sub v5.4s, v6.4s, v4.4s +ldr q24, [x0, #704] +sqrdmulh v19.4S, v24.4S, v29.s[0] +mla v18.4S, v20.4S, v31.s[0] +str q11, [x0, #560] +add v6.4s, v6.4s, v4.4s +ldr q4, [x0, #640] +sqrdmulh v11.4S, v4.4S, v29.s[0] +mla v26.4S, v17.4S, v31.s[0] +str q5, [x0, #496] +sub v5.4s, v28.4s, v12.4s +ldr q17, [x0, #576] +sqrdmulh v20.4S, v17.4S, v29.s[0] 
+mla v13.4S, v14.4S, v31.s[0] +str q6, [x0, #432] +add v28.4s, v28.4s, v12.4s +ldr q12, [x0, #512] +sqrdmulh v6.4S, v12.4S, v29.s[0] +mla v3.4S, v21.4S, v31.s[0] +str q5, [x0, #368] +sub v5.4s, v15.4s, v25.4s +ldr q21, [x0, #448] +ldr q14, [x0, #384] +mul v4.4S, v4.4S,v30.s[0] +mul v24.4S, v24.4S,v30.s[0] +str q28, [x0, #304] +add v15.4s, v15.4s, v25.4s +ldr q25, [x0, #320] +ldr q28, [x0, #256] +mla v4.4S, v11.4S, v31.s[0] +mla v24.4S, v19.4S, v31.s[0] +str q5, [x0, #240] +sub v5.4s, v10.4s, v0.4s +ldr q19, [x0, #192] +ldr q11, [x0, #128] +mul v12.4S, v12.4S,v30.s[0] +mul v17.4S, v17.4S,v30.s[0] +str q15, [x0, #176] +add v10.4s, v10.4s, v0.4s +ldr q0, [x0, #64] +ldr q15, [x0, #0] +mla v12.4S, v6.4S, v31.s[0] +mla v17.4S, v20.4S, v31.s[0] +sub v20.4s, v21.4s, v18.4s +add v21.4s, v21.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v6.4s, v14.4s, v26.4s +add v14.4s, v14.4s, v26.4s +sqrdmulh v26.4S, v6.4S, v29.s[2] +mul v6.4S, v6.4S,v30.s[2] +sub v8.4s, v25.4s, v13.4s +add v25.4s, v25.4s, v13.4s +sqrdmulh v13.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v27.4s, v28.4s, v3.4s +add v28.4s, v28.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +sub v16.4s, v19.4s, v24.4s +add v19.4s, v19.4s, v24.4s +sqrdmulh v24.4S, v8.4S, v29.s[2] +mla v20.4S, v18.4S, v31.s[0] +sub v18.4s, v11.4s, v4.4s +add v11.4s, v11.4s, v4.4s +sqrdmulh v4.4S, v27.4S, v29.s[2] +mla v6.4S, v26.4S, v31.s[0] +sub v26.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v25.4S, v29.s[1] +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v15.4s, v12.4s +str q5, [x0, #112] +sqrdmulh v5.4S, v28.4S, v29.s[1] +mla v14.4S, v3.4S, v31.s[0] +add v15.4s, v15.4s, v12.4s +str q10, [x0, #48] +mul v27.4S, v27.4S,v30.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v10.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +mla v27.4S, v4.4S, v31.s[0] +mla v8.4S, v24.4S, v31.s[0] +sub v24.4s, v18.4s, v6.4s +add v18.4s, v18.4s, v6.4s +mul v28.4S, v28.4S,v30.s[1] +mul v25.4S, 
v25.4S,v30.s[1] +sub v6.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +mla v28.4S, v5.4S, v31.s[0] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v10.4S, v22.s[3] +mul v10.4S, v10.4S,v23.s[3] +sub v5.4s, v26.4s, v8.4s +add v26.4s, v26.4s, v8.4s +sqrdmulh v8.4S, v16.4S, v22.s[2] +mul v16.4S, v16.4S,v23.s[2] +sub v21.4s, v13.4s, v27.4s +add v13.4s, v13.4s, v27.4s +sqrdmulh v27.4S, v6.4S, v22.s[1] +mul v6.4S, v6.4S,v23.s[1] +sub v4.4s, v0.4s, v25.4s +add v0.4s, v0.4s, v25.4s +sqrdmulh v25.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v20.4s, v15.4s, v28.4s +add v15.4s, v15.4s, v28.4s +sqrdmulh v28.4S, v24.4S, v22.s[3] +mla v10.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v18.4S, v22.s[2] +mla v16.4S, v8.4S, v31.s[0] +nop +nop +sqrdmulh v8.4S, v17.4S, v22.s[1] +mla v6.4S, v27.4S, v31.s[0] +nop +nop +sqrdmulh v27.4S, v11.4S, v22.s[0] +mla v19.4S, v25.4S, v31.s[0] +nop +nop +mul v18.4S, v18.4S,v23.s[2] +mul v24.4S, v24.4S,v23.s[3] +sub v25.4s, v5.4s, v10.4s +add v5.4s, v5.4s, v10.4s +mla v18.4S, v14.4S, v31.s[0] +mla v24.4S, v28.4S, v31.s[0] +sub v28.4s, v26.4s, v16.4s +add v26.4s, v26.4s, v16.4s +mul v11.4S, v11.4S,v23.s[0] +mul v17.4S, v17.4S,v23.s[1] +sub v16.4s, v4.4s, v6.4s +add v4.4s, v4.4s, v6.4s +mla v11.4S, v27.4S, v31.s[0] +mla v17.4S, v8.4S, v31.s[0] +sub v8.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v25.4S, v9.s[3] +mul v25.4S, v25.4S,v1.s[3] +sub v27.4s, v21.4s, v24.4s +add v21.4s, v21.4s, v24.4s +sqrdmulh v24.4S, v5.4S, v9.s[2] +mul v5.4S, v5.4S,v1.s[2] +sub v6.4s, v13.4s, v18.4s +add v13.4s, v13.4s, v18.4s +sqrdmulh v18.4S, v28.4S, v9.s[1] +mul v28.4S, v28.4S,v1.s[1] +sub v14.4s, v20.4s, v17.4s +add v20.4s, v20.4s, v17.4s +sqrdmulh v17.4S, v26.4S, v9.s[0] +mul v26.4S, v26.4S,v1.s[0] +sub v10.4s, v15.4s, v11.4s +add v15.4s, v15.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v7.s[3] +mla v25.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v4.4S, v7.s[2] +mla v5.4S, v24.4S, 
v31.s[0] +nop +nop +sqrdmulh v24.4S, v8.4S, v7.s[1] +mla v28.4S, v18.4S, v31.s[0] +nop +nop +sqrdmulh v18.4S, v0.4S, v7.s[0] +mla v26.4S, v17.4S, v31.s[0] +nop +nop +mul v4.4S, v4.4S,v2.s[2] +mul v16.4S, v16.4S,v2.s[3] +sub v17.4s, v27.4s, v25.4s +str q17, [x0, #960] +mla v4.4S, v19.4S, v31.s[0] +mla v16.4S, v11.4S, v31.s[0] +add v27.4s, v27.4s, v25.4s +str q27, [x0, #896] +mul v0.4S, v0.4S,v2.s[0] +mul v8.4S, v8.4S,v2.s[1] +sub v27.4s, v21.4s, v5.4s +str q27, [x0, #832] +mla v0.4S, v18.4S, v31.s[0] +mla v8.4S, v24.4S, v31.s[0] +add v21.4s, v21.4s, v5.4s +sub v5.4s, v6.4s, v28.4s +ldr q24, [x0, #976] +sqrdmulh v18.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +add v6.4s, v6.4s, v28.4s +str q21, [x0, #768] +ldr q21, [x0, #912] +sqrdmulh v28.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v27.4s, v13.4s, v26.4s +str q5, [x0, #704] +ldr q5, [x0, #848] +sqrdmulh v25.4S, v5.4S, v29.s[0] +mul v5.4S, v5.4S,v30.s[0] +add v13.4s, v13.4s, v26.4s +str q6, [x0, #640] +ldr q6, [x0, #784] +sqrdmulh v26.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +str q27, [x0, #576] +sub v27.4s, v14.4s, v16.4s +ldr q11, [x0, #720] +sqrdmulh v19.4S, v11.4S, v29.s[0] +mla v24.4S, v18.4S, v31.s[0] +str q13, [x0, #512] +add v14.4s, v14.4s, v16.4s +ldr q16, [x0, #656] +sqrdmulh v13.4S, v16.4S, v29.s[0] +mla v21.4S, v28.4S, v31.s[0] +str q27, [x0, #448] +sub v27.4s, v20.4s, v4.4s +ldr q28, [x0, #592] +sqrdmulh v18.4S, v28.4S, v29.s[0] +mla v5.4S, v25.4S, v31.s[0] +str q14, [x0, #384] +add v20.4s, v20.4s, v4.4s +ldr q4, [x0, #528] +sqrdmulh v14.4S, v4.4S, v29.s[0] +mla v6.4S, v26.4S, v31.s[0] +str q27, [x0, #320] +sub v27.4s, v10.4s, v8.4s +ldr q26, [x0, #464] +ldr q25, [x0, #400] +mul v16.4S, v16.4S,v30.s[0] +mul v11.4S, v11.4S,v30.s[0] +str q20, [x0, #256] +add v10.4s, v10.4s, v8.4s +ldr q8, [x0, #336] +ldr q20, [x0, #272] +mla v16.4S, v13.4S, v31.s[0] +mla v11.4S, v19.4S, v31.s[0] +str q27, [x0, #192] +sub v27.4s, v15.4s, v0.4s +ldr q19, [x0, #208] +ldr q13, [x0, #144] +mul v4.4S, 
v4.4S,v30.s[0] +mul v28.4S, v28.4S,v30.s[0] +str q10, [x0, #128] +add v15.4s, v15.4s, v0.4s +ldr q0, [x0, #80] +ldr q10, [x0, #16] +mla v4.4S, v14.4S, v31.s[0] +mla v28.4S, v18.4S, v31.s[0] +sub v18.4s, v26.4s, v24.4s +add v26.4s, v26.4s, v24.4s +sqrdmulh v24.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v14.4s, v25.4s, v21.4s +add v25.4s, v25.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v17.4s, v8.4s, v5.4s +add v8.4s, v8.4s, v5.4s +sqrdmulh v5.4S, v26.4S, v29.s[1] +mul v26.4S, v26.4S,v30.s[1] +sub v12.4s, v20.4s, v6.4s +add v20.4s, v20.4s, v6.4s +sqrdmulh v6.4S, v25.4S, v29.s[1] +mul v25.4S, v25.4S,v30.s[1] +sub v3.4s, v19.4s, v11.4s +add v19.4s, v19.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v29.s[2] +mla v18.4S, v24.4S, v31.s[0] +sub v24.4s, v13.4s, v16.4s +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v29.s[2] +mla v14.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v28.4s +add v0.4s, v0.4s, v28.4s +sqrdmulh v28.4S, v8.4S, v29.s[1] +mla v26.4S, v5.4S, v31.s[0] +sub v5.4s, v10.4s, v4.4s +str q27, [x0, #64] +sqrdmulh v27.4S, v20.4S, v29.s[1] +mla v25.4S, v6.4S, v31.s[0] +add v10.4s, v10.4s, v4.4s +str q15, [x0, #0] +mul v12.4S, v12.4S,v30.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v15.4s, v3.4s, v18.4s +add v3.4s, v3.4s, v18.4s +mla v12.4S, v16.4S, v31.s[0] +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v24.4s, v14.4s +add v24.4s, v24.4s, v14.4s +mul v20.4S, v20.4S,v30.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v14.4s, v19.4s, v26.4s +add v19.4s, v19.4s, v26.4s +mla v20.4S, v27.4S, v31.s[0] +mla v8.4S, v28.4S, v31.s[0] +sub v28.4s, v13.4s, v25.4s +add v13.4s, v13.4s, v25.4s +sqrdmulh v29.4S, v15.4S, v22.s[3] +mul v15.4S, v15.4S,v23.s[3] +sub v30.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v22.s[2] +mul v3.4S, v3.4S,v23.s[2] +sub v25.4s, v5.4s, v12.4s +add v5.4s, v5.4s, v12.4s +sqrdmulh v12.4S, v14.4S, v22.s[1] +mul v14.4S, v14.4S,v23.s[1] +sub v27.4s, v0.4s, v8.4s +add v0.4s, v0.4s, v8.4s +sqrdmulh v8.4S, 
v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v26.4s, v10.4s, v20.4s +add v10.4s, v10.4s, v20.4s +sqrdmulh v20.4S, v11.4S, v22.s[3] +mla v15.4S, v29.4S, v31.s[0] +nop +nop +sqrdmulh v29.4S, v24.4S, v22.s[2] +mla v3.4S, v17.4S, v31.s[0] +nop +nop +sqrdmulh v17.4S, v28.4S, v22.s[1] +mla v14.4S, v12.4S, v31.s[0] +nop +nop +sqrdmulh v12.4S, v13.4S, v22.s[0] +mla v19.4S, v8.4S, v31.s[0] +nop +nop +mul v24.4S, v24.4S,v23.s[2] +mul v11.4S, v11.4S,v23.s[3] +sub v8.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +mla v24.4S, v29.4S, v31.s[0] +mla v11.4S, v20.4S, v31.s[0] +sub v20.4s, v21.4s, v3.4s +add v21.4s, v21.4s, v3.4s +mul v13.4S, v13.4S,v23.s[0] +mul v28.4S, v28.4S,v23.s[1] +sub v3.4s, v27.4s, v14.4s +add v27.4s, v27.4s, v14.4s +mla v13.4S, v12.4S, v31.s[0] +mla v28.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v22.4S, v8.4S, v9.s[3] +mul v8.4S, v8.4S,v1.s[3] +sub v23.4s, v25.4s, v11.4s +add v25.4s, v25.4s, v11.4s +sqrdmulh v11.4S, v30.4S, v9.s[2] +mul v30.4S, v30.4S,v1.s[2] +sub v19.4s, v5.4s, v24.4s +add v5.4s, v5.4s, v24.4s +sqrdmulh v24.4S, v20.4S, v9.s[1] +mul v20.4S, v20.4S,v1.s[1] +sub v12.4s, v26.4s, v28.4s +add v26.4s, v26.4s, v28.4s +sqrdmulh v28.4S, v21.4S, v9.s[0] +mul v21.4S, v21.4S,v1.s[0] +sub v14.4s, v10.4s, v13.4s +add v10.4s, v10.4s, v13.4s +sqrdmulh v9.4S, v3.4S, v7.s[3] +mla v8.4S, v22.4S, v31.s[0] +nop +nop +sqrdmulh v22.4S, v27.4S, v7.s[2] +mla v30.4S, v11.4S, v31.s[0] +nop +nop +sqrdmulh v11.4S, v17.4S, v7.s[1] +mla v20.4S, v24.4S, v31.s[0] +nop +nop +sqrdmulh v24.4S, v0.4S, v7.s[0] +mla v21.4S, v28.4S, v31.s[0] +nop +nop +mul v27.4S, v27.4S,v2.s[2] +mul v3.4S, v3.4S,v2.s[3] +sub v28.4s, v23.4s, v8.4s +str q28, [x0, #976] +mla v27.4S, v22.4S, v31.s[0] +mla v3.4S, v9.4S, v31.s[0] +add v23.4s, v23.4s, v8.4s +str q23, [x0, #912] +mul v0.4S, v0.4S,v2.s[0] +mul v17.4S, v17.4S,v2.s[1] +sub v23.4s, v25.4s, v30.4s +str q23, [x0, #848] +mla v0.4S, v24.4S, v31.s[0] +mla v17.4S, v11.4S, v31.s[0] +add v25.4s, 
v25.4s, v30.4s +sub v30.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +str q25, [x0, #784] +sub v25.4s, v5.4s, v21.4s +str q30, [x0, #720] +add v5.4s, v5.4s, v21.4s +str q19, [x0, #656] +str q25, [x0, #592] +sub v25.4s, v12.4s, v3.4s +str q5, [x0, #528] +add v12.4s, v12.4s, v3.4s +str q25, [x0, #464] +sub v25.4s, v26.4s, v27.4s +str q12, [x0, #400] +add v26.4s, v26.4s, v27.4s +str q25, [x0, #336] +sub v25.4s, v14.4s, v17.4s +str q26, [x0, #272] +add v14.4s, v14.4s, v17.4s +str q25, [x0, #208] +sub v25.4s, v10.4s, v0.4s +str q14, [x0, #144] +add v10.4s, v10.4s, v0.4s +str q25, [x0, #80] +str q10, [x0, #16] +ldr q6, [x0, #224] +ldr q4, [x0, #160] +ldr q18, [x0, #32] +ldr q16, [x17, #+128] +ldr q15, [x17, #+144] +sqrdmulh v29.4S, v18.4S, v15.s[0] +mul v18.4S, v18.4S,v16.s[0] +ldr q13, [x0, #48] +sqrdmulh v1.4S, v13.4S, v15.s[0] +mul v13.4S, v13.4S,v16.s[0] +ldr q28, [x17, #+160] +ldr q22, [x17, #+176] +ldr q9, [x0, #96] +sqrdmulh v8.4S, v9.4S, v22.s[0] +mul v9.4S, v9.4S,v28.s[0] +ldr q23, [x0, #112] +sqrdmulh v24.4S, v23.4S, v22.s[0] +mul v23.4S, v23.4S,v28.s[0] +ldr q11, [x17, #+192] +ldr q2, [x17, #+208] +mla v18.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v4.4S, v2.s[0] +ldr q7, [x0, #176] +mla v13.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v7.4S, v2.s[0] +ldr q20, [x17, #+224] +ldr q30, [x17, #+240] +mla v9.4S, v8.4S, v31.s[0] +sqrdmulh v8.4S, v6.4S, v30.s[0] +ldr q21, [x0, #240] +mla v23.4S, v24.4S, v31.s[0] +sqrdmulh v24.4S, v21.4S, v30.s[0] +ldr q19, [x0, #0] +ldr q5, [x0, #128] +mul v4.4S, v4.4S,v11.s[0] +sub v3.4s, v19.4s, v18.4s +ldr q12, [x0, #16] +mul v7.4S, v7.4S,v11.s[0] +add v19.4s, v19.4s, v18.4s +ldr q18, [x0, #144] +mla v4.4S, v29.4S, v31.s[0] +sub v29.4s, v12.4s, v13.4s +ldr q27, [x0, #64] +mla v7.4S, v1.4S, v31.s[0] +add v12.4s, v12.4s, v13.4s +ldr q13, [x0, #192] +mul v6.4S, v6.4S,v20.s[0] +sub v1.4s, v27.4s, v9.4s +ldr q26, [x0, #80] +mul v21.4S, v21.4S,v20.s[0] +add v27.4s, v27.4s, v9.4s +ldr q9, [x0, #208] +mla v6.4S, v8.4S, v31.s[0] +mla v21.4S, v24.4S, 
v31.s[0] +sub v24.4s, v26.4s, v23.4s +sqrdmulh v8.4S, v12.4S, v15.s[1] +add v26.4s, v26.4s, v23.4s +mul v12.4S, v12.4S,v16.s[1] +sqrdmulh v23.4S, v29.4S, v15.s[2] +sub v17.4s, v5.4s, v4.4s +mul v29.4S, v29.4S,v16.s[2] +add v5.4s, v5.4s, v4.4s +sqrdmulh v15.4S, v26.4S, v22.s[1] +sub v16.4s, v18.4s, v7.4s +mul v26.4S, v26.4S,v28.s[1] +add v18.4s, v18.4s, v7.4s +sqrdmulh v7.4S, v24.4S, v22.s[2] +sub v4.4s, v13.4s, v6.4s +mul v24.4S, v24.4S,v28.s[2] +add v13.4s, v13.4s, v6.4s +mla v12.4S, v8.4S, v31.s[0] +sub v8.4s, v9.4s, v21.4s +ldr q22, [x0, #480] +sqrdmulh v28.4S, v18.4S, v2.s[1] +add v9.4s, v9.4s, v21.4s +mla v29.4S, v23.4S, v31.s[0] +ldr q23, [x0, #416] +sqrdmulh v21.4S, v16.4S, v2.s[2] +sub v6.4s, v19.4s, v12.4s +mla v26.4S, v15.4S, v31.s[0] +ldr q15, [x0, #288] +sqrdmulh v14.4S, v9.4S, v30.s[1] +add v19.4s, v19.4s, v12.4s +str q6, [x0, #16] +mla v24.4S, v7.4S, v31.s[0] +ldr q7, [x17, #+256] +ldr q6, [x17, #+272] +sqrdmulh v12.4S, v8.4S, v30.s[2] +sub v0.4s, v3.4s, v29.4s +str q19, [x0, #0] +mul v18.4S, v18.4S,v11.s[1] +add v3.4s, v3.4s, v29.4s +mul v16.4S, v16.4S,v11.s[2] +str q0, [x0, #48] +mla v18.4S, v28.4S, v31.s[0] +sub v28.4s, v27.4s, v26.4s +mla v16.4S, v21.4S, v31.s[0] +str q3, [x0, #32] +mul v9.4S, v9.4S,v20.s[1] +str q28, [x0, #80] +mul v8.4S, v8.4S,v20.s[2] +add v27.4s, v27.4s, v26.4s +str q27, [x0, #64] +mla v9.4S, v14.4S, v31.s[0] +sub v14.4s, v1.4s, v24.4s +str q14, [x0, #112] +mla v8.4S, v12.4S, v31.s[0] +add v1.4s, v1.4s, v24.4s +str q1, [x0, #96] +sqrdmulh v30.4S, v15.4S, v6.s[0] +sub v20.4s, v5.4s, v18.4s +mul v15.4S, v15.4S,v7.s[0] +str q20, [x0, #144] +ldr q20, [x0, #304] +sqrdmulh v1.4S, v20.4S, v6.s[0] +add v5.4s, v5.4s, v18.4s +mul v20.4S, v20.4S,v7.s[0] +str q5, [x0, #128] +ldr q5, [x17, #+288] +ldr q18, [x17, #+304] +ldr q24, [x0, #352] +sqrdmulh v12.4S, v24.4S, v18.s[0] +sub v14.4s, v17.4s, v16.4s +mul v24.4S, v24.4S,v5.s[0] +str q14, [x0, #176] +ldr q14, [x0, #368] +sqrdmulh v27.4S, v14.4S, v18.s[0] +add v17.4s, v17.4s, v16.4s +mul 
v14.4S, v14.4S,v5.s[0] +str q17, [x0, #160] +ldr q17, [x17, #+320] +ldr q16, [x17, #+336] +mla v15.4S, v30.4S, v31.s[0] +sub v30.4s, v13.4s, v9.4s +sqrdmulh v26.4S, v23.4S, v16.s[0] +str q30, [x0, #208] +ldr q30, [x0, #432] +mla v20.4S, v1.4S, v31.s[0] +add v13.4s, v13.4s, v9.4s +sqrdmulh v9.4S, v30.4S, v16.s[0] +str q13, [x0, #192] +ldr q13, [x17, #+352] +ldr q1, [x17, #+368] +mla v24.4S, v12.4S, v31.s[0] +sub v12.4s, v4.4s, v8.4s +sqrdmulh v28.4S, v22.4S, v1.s[0] +str q12, [x0, #240] +ldr q12, [x0, #496] +mla v14.4S, v27.4S, v31.s[0] +add v4.4s, v4.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v1.s[0] +str q4, [x0, #224] +ldr q4, [x0, #256] +ldr q27, [x0, #384] +mul v23.4S, v23.4S,v17.s[0] +sub v2.4s, v4.4s, v15.4s +ldr q11, [x0, #272] +mul v30.4S, v30.4S,v17.s[0] +add v4.4s, v4.4s, v15.4s +ldr q15, [x0, #400] +mla v23.4S, v26.4S, v31.s[0] +sub v26.4s, v11.4s, v20.4s +ldr q3, [x0, #320] +mla v30.4S, v9.4S, v31.s[0] +add v11.4s, v11.4s, v20.4s +ldr q20, [x0, #448] +mul v22.4S, v22.4S,v13.s[0] +sub v9.4s, v3.4s, v24.4s +ldr q21, [x0, #336] +mul v12.4S, v12.4S,v13.s[0] +add v3.4s, v3.4s, v24.4s +ldr q24, [x0, #464] +mla v22.4S, v28.4S, v31.s[0] +mla v12.4S, v8.4S, v31.s[0] +sub v8.4s, v21.4s, v14.4s +sqrdmulh v28.4S, v11.4S, v6.s[1] +add v21.4s, v21.4s, v14.4s +mul v11.4S, v11.4S,v7.s[1] +sqrdmulh v14.4S, v26.4S, v6.s[2] +sub v0.4s, v27.4s, v23.4s +mul v26.4S, v26.4S,v7.s[2] +add v27.4s, v27.4s, v23.4s +sqrdmulh v6.4S, v21.4S, v18.s[1] +sub v7.4s, v15.4s, v30.4s +mul v21.4S, v21.4S,v5.s[1] +add v15.4s, v15.4s, v30.4s +sqrdmulh v30.4S, v8.4S, v18.s[2] +sub v23.4s, v20.4s, v22.4s +mul v8.4S, v8.4S,v5.s[2] +add v20.4s, v20.4s, v22.4s +mla v11.4S, v28.4S, v31.s[0] +sub v28.4s, v24.4s, v12.4s +ldr q18, [x0, #736] +sqrdmulh v5.4S, v15.4S, v16.s[1] +add v24.4s, v24.4s, v12.4s +mla v26.4S, v14.4S, v31.s[0] +ldr q14, [x0, #672] +sqrdmulh v12.4S, v7.4S, v16.s[2] +sub v22.4s, v4.4s, v11.4s +mla v21.4S, v6.4S, v31.s[0] +ldr q6, [x0, #544] +sqrdmulh v29.4S, v24.4S, v1.s[1] +add v4.4s, 
v4.4s, v11.4s +str q22, [x0, #272] +mla v8.4S, v30.4S, v31.s[0] +ldr q30, [x17, #+384] +ldr q22, [x17, #+400] +sqrdmulh v11.4S, v28.4S, v1.s[2] +sub v19.4s, v2.4s, v26.4s +str q4, [x0, #256] +mul v15.4S, v15.4S,v17.s[1] +add v2.4s, v2.4s, v26.4s +mul v7.4S, v7.4S,v17.s[2] +str q19, [x0, #304] +mla v15.4S, v5.4S, v31.s[0] +sub v5.4s, v3.4s, v21.4s +mla v7.4S, v12.4S, v31.s[0] +str q2, [x0, #288] +mul v24.4S, v24.4S,v13.s[1] +str q5, [x0, #336] +mul v28.4S, v28.4S,v13.s[2] +add v3.4s, v3.4s, v21.4s +str q3, [x0, #320] +mla v24.4S, v29.4S, v31.s[0] +sub v29.4s, v9.4s, v8.4s +str q29, [x0, #368] +mla v28.4S, v11.4S, v31.s[0] +add v9.4s, v9.4s, v8.4s +str q9, [x0, #352] +sqrdmulh v1.4S, v6.4S, v22.s[0] +sub v13.4s, v27.4s, v15.4s +mul v6.4S, v6.4S,v30.s[0] +str q13, [x0, #400] +ldr q13, [x0, #560] +sqrdmulh v9.4S, v13.4S, v22.s[0] +add v27.4s, v27.4s, v15.4s +mul v13.4S, v13.4S,v30.s[0] +str q27, [x0, #384] +ldr q27, [x17, #+416] +ldr q15, [x17, #+432] +ldr q8, [x0, #608] +sqrdmulh v11.4S, v8.4S, v15.s[0] +sub v29.4s, v0.4s, v7.4s +mul v8.4S, v8.4S,v27.s[0] +str q29, [x0, #432] +ldr q29, [x0, #624] +sqrdmulh v3.4S, v29.4S, v15.s[0] +add v0.4s, v0.4s, v7.4s +mul v29.4S, v29.4S,v27.s[0] +str q0, [x0, #416] +ldr q0, [x17, #+448] +ldr q7, [x17, #+464] +mla v6.4S, v1.4S, v31.s[0] +sub v1.4s, v20.4s, v24.4s +sqrdmulh v21.4S, v14.4S, v7.s[0] +str q1, [x0, #464] +ldr q1, [x0, #688] +mla v13.4S, v9.4S, v31.s[0] +add v20.4s, v20.4s, v24.4s +sqrdmulh v24.4S, v1.4S, v7.s[0] +str q20, [x0, #448] +ldr q20, [x17, #+480] +ldr q9, [x17, #+496] +mla v8.4S, v11.4S, v31.s[0] +sub v11.4s, v23.4s, v28.4s +sqrdmulh v5.4S, v18.4S, v9.s[0] +str q11, [x0, #496] +ldr q11, [x0, #752] +mla v29.4S, v3.4S, v31.s[0] +add v23.4s, v23.4s, v28.4s +sqrdmulh v28.4S, v11.4S, v9.s[0] +str q23, [x0, #480] +ldr q23, [x0, #512] +ldr q3, [x0, #640] +mul v14.4S, v14.4S,v0.s[0] +sub v16.4s, v23.4s, v6.4s +ldr q17, [x0, #528] +mul v1.4S, v1.4S,v0.s[0] +add v23.4s, v23.4s, v6.4s +ldr q6, [x0, #656] +mla v14.4S, 
v21.4S, v31.s[0] +sub v21.4s, v17.4s, v13.4s +ldr q2, [x0, #576] +mla v1.4S, v24.4S, v31.s[0] +add v17.4s, v17.4s, v13.4s +ldr q13, [x0, #704] +mul v18.4S, v18.4S,v20.s[0] +sub v24.4s, v2.4s, v8.4s +ldr q12, [x0, #592] +mul v11.4S, v11.4S,v20.s[0] +add v2.4s, v2.4s, v8.4s +ldr q8, [x0, #720] +mla v18.4S, v5.4S, v31.s[0] +mla v11.4S, v28.4S, v31.s[0] +sub v28.4s, v12.4s, v29.4s +sqrdmulh v5.4S, v17.4S, v22.s[1] +add v12.4s, v12.4s, v29.4s +mul v17.4S, v17.4S,v30.s[1] +sqrdmulh v29.4S, v21.4S, v22.s[2] +sub v19.4s, v3.4s, v14.4s +mul v21.4S, v21.4S,v30.s[2] +add v3.4s, v3.4s, v14.4s +sqrdmulh v22.4S, v12.4S, v15.s[1] +sub v30.4s, v6.4s, v1.4s +mul v12.4S, v12.4S,v27.s[1] +add v6.4s, v6.4s, v1.4s +sqrdmulh v1.4S, v28.4S, v15.s[2] +sub v14.4s, v13.4s, v18.4s +mul v28.4S, v28.4S,v27.s[2] +add v13.4s, v13.4s, v18.4s +mla v17.4S, v5.4S, v31.s[0] +sub v5.4s, v8.4s, v11.4s +ldr q15, [x0, #992] +sqrdmulh v27.4S, v6.4S, v7.s[1] +add v8.4s, v8.4s, v11.4s +mla v21.4S, v29.4S, v31.s[0] +ldr q29, [x0, #928] +sqrdmulh v11.4S, v30.4S, v7.s[2] +sub v18.4s, v23.4s, v17.4s +mla v12.4S, v22.4S, v31.s[0] +ldr q22, [x0, #800] +sqrdmulh v26.4S, v8.4S, v9.s[1] +add v23.4s, v23.4s, v17.4s +str q18, [x0, #528] +mla v28.4S, v1.4S, v31.s[0] +ldr q1, [x17, #+512] +ldr q18, [x17, #+528] +sqrdmulh v17.4S, v5.4S, v9.s[2] +sub v4.4s, v16.4s, v21.4s +str q23, [x0, #512] +mul v6.4S, v6.4S,v0.s[1] +add v16.4s, v16.4s, v21.4s +mul v30.4S, v30.4S,v0.s[2] +str q4, [x0, #560] +mla v6.4S, v27.4S, v31.s[0] +sub v27.4s, v2.4s, v12.4s +mla v30.4S, v11.4S, v31.s[0] +str q16, [x0, #544] +mul v8.4S, v8.4S,v20.s[1] +str q27, [x0, #592] +mul v5.4S, v5.4S,v20.s[2] +add v2.4s, v2.4s, v12.4s +str q2, [x0, #576] +mla v8.4S, v26.4S, v31.s[0] +sub v26.4s, v24.4s, v28.4s +str q26, [x0, #624] +mla v5.4S, v17.4S, v31.s[0] +add v24.4s, v24.4s, v28.4s +str q24, [x0, #608] +sqrdmulh v9.4S, v22.4S, v18.s[0] +sub v20.4s, v3.4s, v6.4s +mul v22.4S, v22.4S,v1.s[0] +str q20, [x0, #656] +ldr q20, [x0, #816] +sqrdmulh v24.4S, v20.4S, 
v18.s[0] +add v3.4s, v3.4s, v6.4s +mul v20.4S, v20.4S,v1.s[0] +str q3, [x0, #640] +ldr q3, [x17, #+544] +ldr q6, [x17, #+560] +ldr q28, [x0, #864] +sqrdmulh v17.4S, v28.4S, v6.s[0] +sub v26.4s, v19.4s, v30.4s +mul v28.4S, v28.4S,v3.s[0] +str q26, [x0, #688] +ldr q26, [x0, #880] +sqrdmulh v2.4S, v26.4S, v6.s[0] +add v19.4s, v19.4s, v30.4s +mul v26.4S, v26.4S,v3.s[0] +str q19, [x0, #672] +ldr q19, [x17, #+576] +ldr q30, [x17, #+592] +mla v22.4S, v9.4S, v31.s[0] +sub v9.4s, v13.4s, v8.4s +sqrdmulh v12.4S, v29.4S, v30.s[0] +str q9, [x0, #720] +ldr q9, [x0, #944] +mla v20.4S, v24.4S, v31.s[0] +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v9.4S, v30.s[0] +str q13, [x0, #704] +ldr q13, [x17, #+608] +ldr q24, [x17, #+624] +mla v28.4S, v17.4S, v31.s[0] +sub v17.4s, v14.4s, v5.4s +sqrdmulh v27.4S, v15.4S, v24.s[0] +str q17, [x0, #752] +ldr q17, [x0, #1008] +mla v26.4S, v2.4S, v31.s[0] +add v14.4s, v14.4s, v5.4s +sqrdmulh v5.4S, v17.4S, v24.s[0] +str q14, [x0, #736] +ldr q14, [x0, #768] +ldr q2, [x0, #896] +mul v29.4S, v29.4S,v19.s[0] +sub v7.4s, v14.4s, v22.4s +ldr q0, [x0, #784] +mul v9.4S, v9.4S,v19.s[0] +add v14.4s, v14.4s, v22.4s +ldr q22, [x0, #912] +mla v29.4S, v12.4S, v31.s[0] +sub v12.4s, v0.4s, v20.4s +ldr q16, [x0, #832] +mla v9.4S, v8.4S, v31.s[0] +add v0.4s, v0.4s, v20.4s +ldr q20, [x0, #960] +mul v15.4S, v15.4S,v13.s[0] +sub v8.4s, v16.4s, v28.4s +ldr q11, [x0, #848] +mul v17.4S, v17.4S,v13.s[0] +add v16.4s, v16.4s, v28.4s +ldr q28, [x0, #976] +mla v15.4S, v27.4S, v31.s[0] +mla v17.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v26.4s +sqrdmulh v27.4S, v0.4S, v18.s[1] +add v11.4s, v11.4s, v26.4s +mul v0.4S, v0.4S,v1.s[1] +sqrdmulh v26.4S, v12.4S, v18.s[2] +sub v4.4s, v2.4s, v29.4s +mul v12.4S, v12.4S,v1.s[2] +add v2.4s, v2.4s, v29.4s +sqrdmulh v18.4S, v11.4S, v6.s[1] +sub v1.4s, v22.4s, v9.4s +mul v11.4S, v11.4S,v3.s[1] +add v22.4s, v22.4s, v9.4s +sqrdmulh v9.4S, v5.4S, v6.s[2] +sub v29.4s, v20.4s, v15.4s +mul v5.4S, v5.4S,v3.s[2] +add v20.4s, v20.4s, v15.4s +mla v0.4S, 
v27.4S, v31.s[0] +sub v27.4s, v28.4s, v17.4s +sqrdmulh v6.4S, v22.4S, v30.s[1] +add v28.4s, v28.4s, v17.4s +mla v12.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v1.4S, v30.s[2] +sub v17.4s, v14.4s, v0.4s +mla v11.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v28.4S, v24.s[1] +add v14.4s, v14.4s, v0.4s +str q17, [x0, #784] +mla v5.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v27.4S, v24.s[2] +sub v17.4s, v7.4s, v12.4s +str q14, [x0, #768] +mul v22.4S, v22.4S,v19.s[1] +add v7.4s, v7.4s, v12.4s +mul v1.4S, v1.4S,v19.s[2] +str q17, [x0, #816] +mla v22.4S, v6.4S, v31.s[0] +sub v6.4s, v16.4s, v11.4s +mla v1.4S, v26.4S, v31.s[0] +str q7, [x0, #800] +mul v28.4S, v28.4S,v13.s[1] +str q6, [x0, #848] +mul v27.4S, v27.4S,v13.s[2] +add v16.4s, v16.4s, v11.4s +str q16, [x0, #832] +mla v28.4S, v18.4S, v31.s[0] +sub v18.4s, v8.4s, v5.4s +str q18, [x0, #880] +mla v27.4S, v9.4S, v31.s[0] +add v8.4s, v8.4s, v5.4s +str q8, [x0, #864] +sub v24.4s, v2.4s, v22.4s +str q24, [x0, #912] +add v2.4s, v2.4s, v22.4s +str q2, [x0, #896] +sub v2.4s, v4.4s, v1.4s +str q2, [x0, #944] +add v4.4s, v4.4s, v1.4s +str q4, [x0, #928] +sub v4.4s, v20.4s, v28.4s +str q4, [x0, #976] +add v20.4s, v20.4s, v28.4s +str q20, [x0, #960] +sub v20.4s, v29.4s, v27.4s +str q20, [x0, #1008] +add v29.4s, v29.4s, v27.4s +str q29, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1528 +// Instruction count: 1524 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_21_z4_7.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_21_z4_7.s new file mode 100644 index 0000000..3e72693 --- /dev/null +++ 
b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_21_z4_7.s @@ -0,0 +1,1558 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include // NOTE(review): the include operand is absent in this view; the routine below uses ASM_LOAD, which is not defined in the visible part of this file, so confirm the intended header +modulus: // 16-byte constant fetched by the routine below with a single `ldr q31`; the visible butterflies only read lane 0 (v31.s[0]) +.word -33556993 // lane 0: -q, where q = 33556993 is the modulus that appears in the file name +.word 0 // this word and the next two: zero fill up to a full 128-bit vector +.word 0 +.word 0 +.align 6 // power-of-two alignment, i.e. the table below starts on a 64-byte boundary (assuming GNU as / clang AArch64 `.align` semantics) +roots_merged: // twiddle constants in 16-byte groups; each Layer/block label occurs twice: the first group is used as the `mul` operand, the group right after it as the `sqrdmulh` operand, and the latter looks like round(root * 2^31 / q), e.g. 17702291 -> 1132860160 (TODO confirm against the generator) +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global
ntt_u32_incomplete_neon_asm_var_4_2_21_z4_7 +.global _ntt_u32_incomplete_neon_asm_var_4_2_21_z4_7 +ntt_u32_incomplete_neon_asm_var_4_2_21_z4_7: +_ntt_u32_incomplete_neon_asm_var_4_2_21_z4_7: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x0, #992] +sqrdmulh v27.4S, v28.4S, v29.s[0] +mul v28.4S, v28.4S,v30.s[0] +ldr q26, [x0, #928] +sqrdmulh v25.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +ldr q24, [x0, #864] +sqrdmulh v23.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +ldr q22, [x0, #800] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #736] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mla v28.4S, v27.4S, v31.s[0] +ldr q27, [x0, #672] +sqrdmulh v18.4S, v27.4S, v29.s[0] +mla v26.4S, v25.4S, v31.s[0] +ldr q25, [x0, #608] +sqrdmulh v17.4S, v25.4S, v29.s[0] +mla v24.4S, v23.4S, v31.s[0] +ldr q23, [x0, #544] +sqrdmulh v16.4S, v23.4S, v29.s[0] +mla v22.4S, v21.4S, v31.s[0] +ldr q21, [x0, #480] +ldr q3, [x0, #416] +mul v27.4S, v27.4S,v30.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q2, [x0, #352] +ldr q1, [x0, #288] +mla v27.4S, v18.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +ldr q19, [x0, #224] +ldr q18, [x0, #160] +mul v23.4S, v23.4S,v30.s[0] +mul v25.4S, v25.4S,v30.s[0] +ldr q0, [x0, #96] +ldr q15, [x0, #32] +mla v23.4S, v16.4S, v31.s[0] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +sqrdmulh v28.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v16.4s, v3.4s, v26.4s +add v3.4s, v3.4s, v26.4s +sqrdmulh v26.4S, v16.4S, v29.s[2] +mul v16.4S, 
v16.4S,v30.s[2] +sub v14.4s, v2.4s, v24.4s +add v2.4s, v2.4s, v24.4s +sqrdmulh v24.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v13.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v12.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +sqrdmulh v20.4S, v14.4S, v29.s[2] +mla v17.4S, v28.4S, v31.s[0] +sub v28.4s, v18.4s, v27.4s +add v18.4s, v18.4s, v27.4s +sqrdmulh v27.4S, v13.4S, v29.s[2] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v0.4s, v25.4s +add v0.4s, v0.4s, v25.4s +sqrdmulh v25.4S, v2.4S, v29.s[1] +mla v21.4S, v24.4S, v31.s[0] +sub v24.4s, v15.4s, v23.4s +sqrdmulh v11.4S, v1.4S, v29.s[1] +mla v3.4S, v22.4S, v31.s[0] +add v15.4s, v15.4s, v23.4s +ldr q23, [x17, #+32] +ldr q22, [x17, #+48] +mul v13.4S, v13.4S,v30.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v10.4s, v12.4s, v17.4s +add v12.4s, v12.4s, v17.4s +mla v13.4S, v27.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +sub v20.4s, v28.4s, v16.4s +add v28.4s, v28.4s, v16.4s +mul v1.4S, v1.4S,v30.s[1] +mul v2.4S, v2.4S,v30.s[1] +sub v16.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +mla v1.4S, v11.4S, v31.s[0] +mla v2.4S, v25.4S, v31.s[0] +sub v25.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v22.s[3] +mul v10.4S, v10.4S,v23.s[3] +sub v11.4s, v26.4s, v14.4s +add v26.4s, v26.4s, v14.4s +sqrdmulh v14.4S, v12.4S, v22.s[2] +mul v12.4S, v12.4S,v23.s[2] +sub v21.4s, v24.4s, v13.4s +add v24.4s, v24.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v22.s[1] +mul v16.4S, v16.4S,v23.s[1] +sub v27.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v17.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s +ldr q1, [x17, #+96] +ldr q9, [x17, #+112] +sqrdmulh v8.4S, v20.4S, v22.s[3] +mla v10.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v28.4S, v22.s[2] +mla v12.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v25.4S, v22.s[1] +mla v16.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v18.4S, v22.s[0] +mla 
v19.4S, v2.4S, v31.s[0] +nop +nop +ldr q2, [x17, #+64] +ldr q7, [x17, #+80] +mul v28.4S, v28.4S,v23.s[2] +mul v20.4S, v20.4S,v23.s[3] +sub v6.4s, v11.4s, v10.4s +add v11.4s, v11.4s, v10.4s +mla v28.4S, v3.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v26.4s, v12.4s +add v26.4s, v26.4s, v12.4s +mul v18.4S, v18.4S,v23.s[0] +mul v25.4S, v25.4S,v23.s[1] +sub v12.4s, v27.4s, v16.4s +add v27.4s, v27.4s, v16.4s +mla v18.4S, v13.4S, v31.s[0] +mla v25.4S, v14.4S, v31.s[0] +sub v14.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v9.s[3] +mul v6.4S, v6.4S,v1.s[3] +sub v13.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v11.4S, v9.s[2] +mul v11.4S, v11.4S,v1.s[2] +sub v16.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +sqrdmulh v28.4S, v8.4S, v9.s[1] +mul v8.4S, v8.4S,v1.s[1] +sub v3.4s, v17.4s, v25.4s +add v17.4s, v17.4s, v25.4s +sqrdmulh v25.4S, v26.4S, v9.s[0] +mul v26.4S, v26.4S,v1.s[0] +sub v10.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v7.s[3] +mla v6.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v27.4S, v7.s[2] +mla v11.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v14.4S, v7.s[1] +mla v8.4S, v28.4S, v31.s[0] +nop +nop +sqrdmulh v28.4S, v0.4S, v7.s[0] +mla v26.4S, v25.4S, v31.s[0] +nop +nop +mul v27.4S, v27.4S,v2.s[2] +mul v12.4S, v12.4S,v2.s[3] +sub v25.4s, v13.4s, v6.4s +str q25, [x0, #992] +mla v27.4S, v19.4S, v31.s[0] +mla v12.4S, v18.4S, v31.s[0] +add v13.4s, v13.4s, v6.4s +str q13, [x0, #928] +mul v0.4S, v0.4S,v2.s[0] +mul v14.4S, v14.4S,v2.s[1] +sub v13.4s, v21.4s, v11.4s +str q13, [x0, #864] +mla v0.4S, v28.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sub v11.4s, v16.4s, v8.4s +ldr q20, [x0, #1008] +sqrdmulh v28.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v8.4s +str q21, [x0, #800] +ldr q21, [x0, #944] +sqrdmulh v8.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v13.4s, v24.4s, v26.4s +str q11, [x0, #736] +ldr q11, [x0, 
#880] +sqrdmulh v6.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +add v24.4s, v24.4s, v26.4s +str q16, [x0, #672] +ldr q16, [x0, #816] +sqrdmulh v26.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +str q13, [x0, #608] +sub v13.4s, v3.4s, v12.4s +ldr q18, [x0, #752] +sqrdmulh v19.4S, v18.4S, v29.s[0] +mla v20.4S, v28.4S, v31.s[0] +add v3.4s, v3.4s, v12.4s +str q24, [x0, #544] +ldr q24, [x0, #688] +sqrdmulh v12.4S, v24.4S, v29.s[0] +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v17.4s, v27.4s +str q13, [x0, #480] +ldr q13, [x0, #624] +sqrdmulh v28.4S, v13.4S, v29.s[0] +mla v11.4S, v6.4S, v31.s[0] +add v17.4s, v17.4s, v27.4s +str q3, [x0, #416] +ldr q3, [x0, #560] +sqrdmulh v27.4S, v3.4S, v29.s[0] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v10.4s, v14.4s +str q8, [x0, #352] +ldr q8, [x0, #496] +ldr q6, [x0, #432] +mul v24.4S, v24.4S,v30.s[0] +mul v18.4S, v18.4S,v30.s[0] +add v10.4s, v10.4s, v14.4s +str q17, [x0, #288] +ldr q17, [x0, #368] +ldr q14, [x0, #304] +mla v24.4S, v12.4S, v31.s[0] +mla v18.4S, v19.4S, v31.s[0] +sub v19.4s, v15.4s, v0.4s +str q26, [x0, #224] +ldr q26, [x0, #240] +ldr q12, [x0, #176] +mul v3.4S, v3.4S,v30.s[0] +mul v13.4S, v13.4S,v30.s[0] +add v15.4s, v15.4s, v0.4s +str q10, [x0, #160] +ldr q10, [x0, #112] +ldr q0, [x0, #48] +mla v3.4S, v27.4S, v31.s[0] +mla v13.4S, v28.4S, v31.s[0] +sub v28.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +sqrdmulh v20.4S, v28.4S, v29.s[2] +mul v28.4S, v28.4S,v30.s[2] +sub v27.4s, v6.4s, v21.4s +add v6.4s, v6.4s, v21.4s +sqrdmulh v21.4S, v27.4S, v29.s[2] +mul v27.4S, v27.4S,v30.s[2] +sub v25.4s, v17.4s, v11.4s +add v17.4s, v17.4s, v11.4s +sqrdmulh v11.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v5.4s, v14.4s, v16.4s +add v14.4s, v14.4s, v16.4s +sqrdmulh v16.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +sub v4.4s, v26.4s, v18.4s +add v26.4s, v26.4s, v18.4s +sqrdmulh v18.4S, v25.4S, v29.s[2] +mla v28.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v24.4s +add v12.4s, v12.4s, v24.4s +sqrdmulh v24.4S, v5.4S, 
v29.s[2] +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v13.4s +add v10.4s, v10.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v29.s[1] +mla v8.4S, v11.4S, v31.s[0] +str q19, [x0, #96] +sub v19.4s, v0.4s, v3.4s +sqrdmulh v11.4S, v14.4S, v29.s[1] +mla v6.4S, v16.4S, v31.s[0] +str q15, [x0, #32] +add v0.4s, v0.4s, v3.4s +mul v5.4S, v5.4S,v30.s[2] +mul v25.4S, v25.4S,v30.s[2] +sub v3.4s, v4.4s, v28.4s +add v4.4s, v4.4s, v28.4s +mla v5.4S, v24.4S, v31.s[0] +mla v25.4S, v18.4S, v31.s[0] +sub v18.4s, v20.4s, v27.4s +add v20.4s, v20.4s, v27.4s +mul v14.4S, v14.4S,v30.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v27.4s, v26.4s, v8.4s +add v26.4s, v26.4s, v8.4s +mla v14.4S, v11.4S, v31.s[0] +mla v17.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v6.4s +add v12.4s, v12.4s, v6.4s +sqrdmulh v6.4S, v3.4S, v22.s[3] +mul v3.4S, v3.4S,v23.s[3] +sub v11.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v4.4S, v22.s[2] +mul v4.4S, v4.4S,v23.s[2] +sub v8.4s, v19.4s, v5.4s +add v19.4s, v19.4s, v5.4s +sqrdmulh v5.4S, v27.4S, v22.s[1] +mul v27.4S, v27.4S,v23.s[1] +sub v24.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v26.4S, v22.s[0] +mul v26.4S, v26.4S,v23.s[0] +sub v28.4s, v0.4s, v14.4s +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v18.4S, v22.s[3] +mla v3.4S, v6.4S, v31.s[0] +nop +nop +sqrdmulh v6.4S, v20.4S, v22.s[2] +mla v4.4S, v25.4S, v31.s[0] +nop +nop +sqrdmulh v25.4S, v13.4S, v22.s[1] +mla v27.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v12.4S, v22.s[0] +mla v26.4S, v17.4S, v31.s[0] +nop +nop +mul v20.4S, v20.4S,v23.s[2] +mul v18.4S, v18.4S,v23.s[3] +sub v17.4s, v11.4s, v3.4s +add v11.4s, v11.4s, v3.4s +mla v20.4S, v6.4S, v31.s[0] +mla v18.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v4.4s +add v21.4s, v21.4s, v4.4s +mul v12.4S, v12.4S,v23.s[0] +mul v13.4S, v13.4S,v23.s[1] +sub v4.4s, v24.4s, v27.4s +add v24.4s, v24.4s, v27.4s +mla v12.4S, v5.4S, v31.s[0] +mla v13.4S, v25.4S, v31.s[0] +sub v25.4s, v10.4s, v26.4s +add v10.4s, v10.4s, v26.4s +sqrdmulh v26.4S, 
v17.4S, v9.s[3] +mul v17.4S, v17.4S,v1.s[3] +sub v5.4s, v8.4s, v18.4s +add v8.4s, v8.4s, v18.4s +sqrdmulh v18.4S, v11.4S, v9.s[2] +mul v11.4S, v11.4S,v1.s[2] +sub v27.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +sqrdmulh v20.4S, v14.4S, v9.s[1] +mul v14.4S, v14.4S,v1.s[1] +sub v6.4s, v28.4s, v13.4s +add v28.4s, v28.4s, v13.4s +sqrdmulh v13.4S, v21.4S, v9.s[0] +mul v21.4S, v21.4S,v1.s[0] +sub v3.4s, v0.4s, v12.4s +add v0.4s, v0.4s, v12.4s +sqrdmulh v12.4S, v4.4S, v7.s[3] +mla v17.4S, v26.4S, v31.s[0] +nop +nop +sqrdmulh v26.4S, v24.4S, v7.s[2] +mla v11.4S, v18.4S, v31.s[0] +nop +nop +sqrdmulh v18.4S, v25.4S, v7.s[1] +mla v14.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v10.4S, v7.s[0] +mla v21.4S, v13.4S, v31.s[0] +nop +nop +mul v24.4S, v24.4S,v2.s[2] +mul v4.4S, v4.4S,v2.s[3] +sub v13.4s, v5.4s, v17.4s +str q13, [x0, #1008] +mla v24.4S, v26.4S, v31.s[0] +mla v4.4S, v12.4S, v31.s[0] +add v5.4s, v5.4s, v17.4s +str q5, [x0, #944] +mul v10.4S, v10.4S,v2.s[0] +mul v25.4S, v25.4S,v2.s[1] +sub v5.4s, v8.4s, v11.4s +str q5, [x0, #880] +mla v10.4S, v20.4S, v31.s[0] +mla v25.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v11.4s +sub v11.4s, v27.4s, v14.4s +ldr q18, [x0, #960] +sqrdmulh v20.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +add v27.4s, v27.4s, v14.4s +str q8, [x0, #816] +ldr q8, [x0, #896] +sqrdmulh v14.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v5.4s, v19.4s, v21.4s +str q11, [x0, #752] +ldr q11, [x0, #832] +sqrdmulh v17.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +add v19.4s, v19.4s, v21.4s +str q27, [x0, #688] +ldr q27, [x0, #768] +sqrdmulh v21.4S, v27.4S, v29.s[0] +mul v27.4S, v27.4S,v30.s[0] +str q5, [x0, #624] +sub v5.4s, v6.4s, v4.4s +ldr q12, [x0, #704] +sqrdmulh v26.4S, v12.4S, v29.s[0] +mla v18.4S, v20.4S, v31.s[0] +add v6.4s, v6.4s, v4.4s +str q19, [x0, #560] +ldr q19, [x0, #640] +sqrdmulh v4.4S, v19.4S, v29.s[0] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v28.4s, v24.4s +str q5, [x0, #496] +ldr q5, [x0, #576] +sqrdmulh v20.4S, v5.4S, 
v29.s[0] +mla v11.4S, v17.4S, v31.s[0] +add v28.4s, v28.4s, v24.4s +str q6, [x0, #432] +ldr q6, [x0, #512] +sqrdmulh v24.4S, v6.4S, v29.s[0] +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v3.4s, v25.4s +str q14, [x0, #368] +ldr q14, [x0, #448] +ldr q17, [x0, #384] +mul v19.4S, v19.4S,v30.s[0] +mul v12.4S, v12.4S,v30.s[0] +add v3.4s, v3.4s, v25.4s +str q28, [x0, #304] +ldr q28, [x0, #320] +ldr q25, [x0, #256] +mla v19.4S, v4.4S, v31.s[0] +mla v12.4S, v26.4S, v31.s[0] +sub v26.4s, v0.4s, v10.4s +str q21, [x0, #240] +ldr q21, [x0, #192] +ldr q4, [x0, #128] +mul v6.4S, v6.4S,v30.s[0] +mul v5.4S, v5.4S,v30.s[0] +add v0.4s, v0.4s, v10.4s +str q3, [x0, #176] +ldr q3, [x0, #64] +ldr q10, [x0, #0] +mla v6.4S, v24.4S, v31.s[0] +mla v5.4S, v20.4S, v31.s[0] +sub v20.4s, v14.4s, v18.4s +add v14.4s, v14.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v24.4s, v17.4s, v8.4s +add v17.4s, v17.4s, v8.4s +sqrdmulh v8.4S, v24.4S, v29.s[2] +mul v24.4S, v24.4S,v30.s[2] +sub v13.4s, v28.4s, v11.4s +add v28.4s, v28.4s, v11.4s +sqrdmulh v11.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +sub v15.4s, v25.4s, v27.4s +add v25.4s, v25.4s, v27.4s +sqrdmulh v27.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v16.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v13.4S, v29.s[2] +mla v20.4S, v18.4S, v31.s[0] +sub v18.4s, v4.4s, v19.4s +add v4.4s, v4.4s, v19.4s +sqrdmulh v19.4S, v15.4S, v29.s[2] +mla v24.4S, v8.4S, v31.s[0] +sub v8.4s, v3.4s, v5.4s +add v3.4s, v3.4s, v5.4s +sqrdmulh v5.4S, v28.4S, v29.s[1] +mla v14.4S, v11.4S, v31.s[0] +str q26, [x0, #112] +sub v26.4s, v10.4s, v6.4s +sqrdmulh v11.4S, v25.4S, v29.s[1] +mla v17.4S, v27.4S, v31.s[0] +str q0, [x0, #48] +add v10.4s, v10.4s, v6.4s +mul v15.4S, v15.4S,v30.s[2] +mul v13.4S, v13.4S,v30.s[2] +sub v6.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +mla v15.4S, v19.4S, v31.s[0] +mla v13.4S, v12.4S, v31.s[0] +sub v12.4s, v18.4s, v24.4s +add v18.4s, v18.4s, v24.4s +mul v25.4S, v25.4S,v30.s[1] 
+mul v28.4S, v28.4S,v30.s[1] +sub v24.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +mla v25.4S, v11.4S, v31.s[0] +mla v28.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v17.4s +add v4.4s, v4.4s, v17.4s +sqrdmulh v17.4S, v6.4S, v22.s[3] +mul v6.4S, v6.4S,v23.s[3] +sub v11.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v22.s[2] +mul v16.4S, v16.4S,v23.s[2] +sub v14.4s, v26.4s, v15.4s +add v26.4s, v26.4s, v15.4s +sqrdmulh v15.4S, v24.4S, v22.s[1] +mul v24.4S, v24.4S,v23.s[1] +sub v19.4s, v3.4s, v28.4s +add v3.4s, v3.4s, v28.4s +sqrdmulh v28.4S, v21.4S, v22.s[0] +mul v21.4S, v21.4S,v23.s[0] +sub v20.4s, v10.4s, v25.4s +add v10.4s, v10.4s, v25.4s +sqrdmulh v25.4S, v12.4S, v22.s[3] +mla v6.4S, v17.4S, v31.s[0] +nop +nop +sqrdmulh v17.4S, v18.4S, v22.s[2] +mla v16.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v5.4S, v22.s[1] +mla v24.4S, v15.4S, v31.s[0] +nop +nop +sqrdmulh v15.4S, v4.4S, v22.s[0] +mla v21.4S, v28.4S, v31.s[0] +nop +nop +mul v18.4S, v18.4S,v23.s[2] +mul v12.4S, v12.4S,v23.s[3] +sub v28.4s, v11.4s, v6.4s +add v11.4s, v11.4s, v6.4s +mla v18.4S, v17.4S, v31.s[0] +mla v12.4S, v25.4S, v31.s[0] +sub v25.4s, v8.4s, v16.4s +add v8.4s, v8.4s, v16.4s +mul v4.4S, v4.4S,v23.s[0] +mul v5.4S, v5.4S,v23.s[1] +sub v16.4s, v19.4s, v24.4s +add v19.4s, v19.4s, v24.4s +mla v4.4S, v15.4S, v31.s[0] +mla v5.4S, v13.4S, v31.s[0] +sub v13.4s, v3.4s, v21.4s +add v3.4s, v3.4s, v21.4s +sqrdmulh v21.4S, v28.4S, v9.s[3] +mul v28.4S, v28.4S,v1.s[3] +sub v15.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v11.4S, v9.s[2] +mul v11.4S, v11.4S,v1.s[2] +sub v24.4s, v26.4s, v18.4s +add v26.4s, v26.4s, v18.4s +sqrdmulh v18.4S, v25.4S, v9.s[1] +mul v25.4S, v25.4S,v1.s[1] +sub v17.4s, v20.4s, v5.4s +add v20.4s, v20.4s, v5.4s +sqrdmulh v5.4S, v8.4S, v9.s[0] +mul v8.4S, v8.4S,v1.s[0] +sub v6.4s, v10.4s, v4.4s +add v10.4s, v10.4s, v4.4s +sqrdmulh v4.4S, v16.4S, v7.s[3] +mla v28.4S, v21.4S, v31.s[0] +nop +nop +sqrdmulh v21.4S, v19.4S, v7.s[2] +mla v11.4S, 
v12.4S, v31.s[0] +nop +nop +sqrdmulh v12.4S, v13.4S, v7.s[1] +mla v25.4S, v18.4S, v31.s[0] +nop +nop +sqrdmulh v18.4S, v3.4S, v7.s[0] +mla v8.4S, v5.4S, v31.s[0] +nop +nop +mul v19.4S, v19.4S,v2.s[2] +mul v16.4S, v16.4S,v2.s[3] +sub v5.4s, v15.4s, v28.4s +str q5, [x0, #960] +mla v19.4S, v21.4S, v31.s[0] +mla v16.4S, v4.4S, v31.s[0] +add v15.4s, v15.4s, v28.4s +str q15, [x0, #896] +mul v3.4S, v3.4S,v2.s[0] +mul v13.4S, v13.4S,v2.s[1] +sub v15.4s, v14.4s, v11.4s +str q15, [x0, #832] +mla v3.4S, v18.4S, v31.s[0] +mla v13.4S, v12.4S, v31.s[0] +add v14.4s, v14.4s, v11.4s +sub v11.4s, v24.4s, v25.4s +ldr q12, [x0, #976] +sqrdmulh v18.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +add v24.4s, v24.4s, v25.4s +str q14, [x0, #768] +ldr q14, [x0, #912] +sqrdmulh v25.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v15.4s, v26.4s, v8.4s +str q11, [x0, #704] +ldr q11, [x0, #848] +sqrdmulh v28.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +add v26.4s, v26.4s, v8.4s +str q24, [x0, #640] +ldr q24, [x0, #784] +sqrdmulh v8.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +str q15, [x0, #576] +sub v15.4s, v17.4s, v16.4s +ldr q4, [x0, #720] +sqrdmulh v21.4S, v4.4S, v29.s[0] +mla v12.4S, v18.4S, v31.s[0] +add v17.4s, v17.4s, v16.4s +str q26, [x0, #512] +ldr q26, [x0, #656] +sqrdmulh v16.4S, v26.4S, v29.s[0] +mla v14.4S, v25.4S, v31.s[0] +sub v25.4s, v20.4s, v19.4s +str q15, [x0, #448] +ldr q15, [x0, #592] +sqrdmulh v18.4S, v15.4S, v29.s[0] +mla v11.4S, v28.4S, v31.s[0] +add v20.4s, v20.4s, v19.4s +str q17, [x0, #384] +ldr q17, [x0, #528] +sqrdmulh v19.4S, v17.4S, v29.s[0] +mla v24.4S, v8.4S, v31.s[0] +sub v8.4s, v6.4s, v13.4s +str q25, [x0, #320] +ldr q25, [x0, #464] +ldr q28, [x0, #400] +mul v26.4S, v26.4S,v30.s[0] +mul v4.4S, v4.4S,v30.s[0] +add v6.4s, v6.4s, v13.4s +str q20, [x0, #256] +ldr q20, [x0, #336] +ldr q13, [x0, #272] +mla v26.4S, v16.4S, v31.s[0] +mla v4.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v3.4s +str q8, [x0, #192] +ldr q8, [x0, #208] +ldr q16, [x0, 
#144] +mul v17.4S, v17.4S,v30.s[0] +mul v15.4S, v15.4S,v30.s[0] +add v10.4s, v10.4s, v3.4s +str q6, [x0, #128] +ldr q6, [x0, #80] +ldr q3, [x0, #16] +mla v17.4S, v19.4S, v31.s[0] +mla v15.4S, v18.4S, v31.s[0] +sub v18.4s, v25.4s, v12.4s +add v25.4s, v25.4s, v12.4s +sqrdmulh v12.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v19.4s, v28.4s, v14.4s +add v28.4s, v28.4s, v14.4s +sqrdmulh v14.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v5.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +sqrdmulh v11.4S, v25.4S, v29.s[1] +mul v25.4S, v25.4S,v30.s[1] +sub v0.4s, v13.4s, v24.4s +add v13.4s, v13.4s, v24.4s +sqrdmulh v24.4S, v28.4S, v29.s[1] +mul v28.4S, v28.4S,v30.s[1] +sub v27.4s, v8.4s, v4.4s +add v8.4s, v8.4s, v4.4s +sqrdmulh v4.4S, v5.4S, v29.s[2] +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v16.4s, v26.4s +add v16.4s, v16.4s, v26.4s +sqrdmulh v26.4S, v0.4S, v29.s[2] +mla v19.4S, v14.4S, v31.s[0] +sub v14.4s, v6.4s, v15.4s +add v6.4s, v6.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v29.s[1] +mla v25.4S, v11.4S, v31.s[0] +str q21, [x0, #64] +sub v21.4s, v3.4s, v17.4s +sqrdmulh v11.4S, v13.4S, v29.s[1] +mla v28.4S, v24.4S, v31.s[0] +str q10, [x0, #0] +add v3.4s, v3.4s, v17.4s +mul v0.4S, v0.4S,v30.s[2] +mul v5.4S, v5.4S,v30.s[2] +sub v17.4s, v27.4s, v18.4s +add v27.4s, v27.4s, v18.4s +mla v0.4S, v26.4S, v31.s[0] +mla v5.4S, v4.4S, v31.s[0] +sub v4.4s, v12.4s, v19.4s +add v12.4s, v12.4s, v19.4s +mul v13.4S, v13.4S,v30.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v19.4s, v8.4s, v25.4s +add v8.4s, v8.4s, v25.4s +mla v13.4S, v11.4S, v31.s[0] +mla v20.4S, v15.4S, v31.s[0] +sub v15.4s, v16.4s, v28.4s +add v16.4s, v16.4s, v28.4s +sqrdmulh v29.4S, v17.4S, v22.s[3] +mul v17.4S, v17.4S,v23.s[3] +sub v30.4s, v14.4s, v5.4s +add v14.4s, v14.4s, v5.4s +sqrdmulh v5.4S, v27.4S, v22.s[2] +mul v27.4S, v27.4S,v23.s[2] +sub v28.4s, v21.4s, v0.4s +add v21.4s, v21.4s, v0.4s +sqrdmulh v0.4S, v19.4S, v22.s[1] +mul v19.4S, v19.4S,v23.s[1] +sub v11.4s, v6.4s, v20.4s +add v6.4s, v6.4s, v20.4s 
+sqrdmulh v20.4S, v8.4S, v22.s[0] +mul v8.4S, v8.4S,v23.s[0] +sub v25.4s, v3.4s, v13.4s +add v3.4s, v3.4s, v13.4s +sqrdmulh v13.4S, v4.4S, v22.s[3] +mla v17.4S, v29.4S, v31.s[0] +nop +nop +sqrdmulh v29.4S, v12.4S, v22.s[2] +mla v27.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v15.4S, v22.s[1] +mla v19.4S, v0.4S, v31.s[0] +nop +nop +sqrdmulh v0.4S, v16.4S, v22.s[0] +mla v8.4S, v20.4S, v31.s[0] +nop +nop +mul v12.4S, v12.4S,v23.s[2] +mul v4.4S, v4.4S,v23.s[3] +sub v20.4s, v30.4s, v17.4s +add v30.4s, v30.4s, v17.4s +mla v12.4S, v29.4S, v31.s[0] +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v14.4s, v27.4s +add v14.4s, v14.4s, v27.4s +mul v16.4S, v16.4S,v23.s[0] +mul v15.4S, v15.4S,v23.s[1] +sub v27.4s, v11.4s, v19.4s +add v11.4s, v11.4s, v19.4s +mla v16.4S, v0.4S, v31.s[0] +mla v15.4S, v5.4S, v31.s[0] +sub v5.4s, v6.4s, v8.4s +add v6.4s, v6.4s, v8.4s +sqrdmulh v22.4S, v20.4S, v9.s[3] +mul v20.4S, v20.4S,v1.s[3] +sub v23.4s, v28.4s, v4.4s +add v28.4s, v28.4s, v4.4s +sqrdmulh v4.4S, v30.4S, v9.s[2] +mul v30.4S, v30.4S,v1.s[2] +sub v8.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v13.4S, v9.s[1] +mul v13.4S, v13.4S,v1.s[1] +sub v0.4s, v25.4s, v15.4s +add v25.4s, v25.4s, v15.4s +sqrdmulh v15.4S, v14.4S, v9.s[0] +mul v14.4S, v14.4S,v1.s[0] +sub v19.4s, v3.4s, v16.4s +add v3.4s, v3.4s, v16.4s +sqrdmulh v9.4S, v27.4S, v7.s[3] +mla v20.4S, v22.4S, v31.s[0] +nop +nop +sqrdmulh v22.4S, v11.4S, v7.s[2] +mla v30.4S, v4.4S, v31.s[0] +nop +nop +sqrdmulh v4.4S, v5.4S, v7.s[1] +mla v13.4S, v12.4S, v31.s[0] +nop +nop +sqrdmulh v12.4S, v6.4S, v7.s[0] +mla v14.4S, v15.4S, v31.s[0] +nop +nop +mul v11.4S, v11.4S,v2.s[2] +mul v27.4S, v27.4S,v2.s[3] +sub v15.4s, v23.4s, v20.4s +str q15, [x0, #976] +mla v11.4S, v22.4S, v31.s[0] +mla v27.4S, v9.4S, v31.s[0] +add v23.4s, v23.4s, v20.4s +str q23, [x0, #912] +mul v6.4S, v6.4S,v2.s[0] +mul v5.4S, v5.4S,v2.s[1] +sub v23.4s, v28.4s, v30.4s +str q23, [x0, #848] +mla v6.4S, v12.4S, v31.s[0] +mla v5.4S, v4.4S, v31.s[0] +add v28.4s, 
v28.4s, v30.4s +sub v30.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +str q28, [x0, #784] +sub v28.4s, v21.4s, v14.4s +str q30, [x0, #720] +add v21.4s, v21.4s, v14.4s +str q8, [x0, #656] +str q28, [x0, #592] +sub v28.4s, v0.4s, v27.4s +add v0.4s, v0.4s, v27.4s +str q21, [x0, #528] +sub v21.4s, v25.4s, v11.4s +str q28, [x0, #464] +add v25.4s, v25.4s, v11.4s +str q0, [x0, #400] +sub v0.4s, v19.4s, v5.4s +str q21, [x0, #336] +add v19.4s, v19.4s, v5.4s +str q25, [x0, #272] +sub v25.4s, v3.4s, v6.4s +str q0, [x0, #208] +add v3.4s, v3.4s, v6.4s +str q19, [x0, #144] +str q25, [x0, #80] +str q3, [x0, #16] +ldr q24, [x0, #224] +ldr q10, [x0, #160] +ldr q18, [x0, #32] +ldr q26, [x17, #+128] +ldr q17, [x17, #+144] +sqrdmulh v29.4S, v18.4S, v17.s[0] +mul v18.4S, v18.4S,v26.s[0] +ldr q16, [x0, #48] +sqrdmulh v1.4S, v16.4S, v17.s[0] +mul v16.4S, v16.4S,v26.s[0] +ldr q15, [x17, #+160] +ldr q22, [x17, #+176] +ldr q9, [x0, #96] +sqrdmulh v20.4S, v9.4S, v22.s[0] +mul v9.4S, v9.4S,v15.s[0] +ldr q23, [x0, #112] +sqrdmulh v12.4S, v23.4S, v22.s[0] +mul v23.4S, v23.4S,v15.s[0] +ldr q4, [x17, #+192] +ldr q2, [x17, #+208] +mla v18.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v10.4S, v2.s[0] +ldr q7, [x0, #176] +mla v16.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v7.4S, v2.s[0] +ldr q13, [x17, #+224] +ldr q30, [x17, #+240] +mla v9.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v24.4S, v30.s[0] +ldr q14, [x0, #240] +mla v23.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v14.4S, v30.s[0] +ldr q8, [x0, #0] +ldr q27, [x0, #128] +mul v10.4S, v10.4S,v4.s[0] +sub v28.4s, v8.4s, v18.4s +ldr q11, [x0, #16] +mul v7.4S, v7.4S,v4.s[0] +add v8.4s, v8.4s, v18.4s +ldr q18, [x0, #144] +mla v10.4S, v29.4S, v31.s[0] +sub v29.4s, v11.4s, v16.4s +ldr q21, [x0, #64] +mla v7.4S, v1.4S, v31.s[0] +add v11.4s, v11.4s, v16.4s +ldr q16, [x0, #192] +mul v24.4S, v24.4S,v13.s[0] +sub v1.4s, v21.4s, v9.4s +ldr q5, [x0, #80] +mul v14.4S, v14.4S,v13.s[0] +add v21.4s, v21.4s, v9.4s +ldr q9, [x0, #208] +mla v24.4S, v20.4S, v31.s[0] +mla v14.4S, v12.4S, 
v31.s[0] +sub v12.4s, v5.4s, v23.4s +sqrdmulh v20.4S, v11.4S, v17.s[1] +add v5.4s, v5.4s, v23.4s +mul v11.4S, v11.4S,v26.s[1] +sqrdmulh v23.4S, v29.4S, v17.s[2] +sub v0.4s, v27.4s, v10.4s +mul v29.4S, v29.4S,v26.s[2] +add v27.4s, v27.4s, v10.4s +sqrdmulh v17.4S, v5.4S, v22.s[1] +sub v26.4s, v18.4s, v7.4s +mul v5.4S, v5.4S,v15.s[1] +add v18.4s, v18.4s, v7.4s +sqrdmulh v7.4S, v12.4S, v22.s[2] +sub v10.4s, v16.4s, v24.4s +mul v12.4S, v12.4S,v15.s[2] +add v16.4s, v16.4s, v24.4s +mla v11.4S, v20.4S, v31.s[0] +sub v20.4s, v9.4s, v14.4s +ldr q22, [x0, #480] +sqrdmulh v15.4S, v18.4S, v2.s[1] +add v9.4s, v9.4s, v14.4s +mla v29.4S, v23.4S, v31.s[0] +ldr q23, [x0, #416] +sqrdmulh v14.4S, v26.4S, v2.s[2] +sub v24.4s, v8.4s, v11.4s +mla v5.4S, v17.4S, v31.s[0] +ldr q17, [x0, #288] +sqrdmulh v6.4S, v9.4S, v30.s[1] +add v8.4s, v8.4s, v11.4s +str q24, [x0, #16] +mla v12.4S, v7.4S, v31.s[0] +ldr q7, [x17, #+256] +ldr q24, [x17, #+272] +sqrdmulh v11.4S, v20.4S, v30.s[2] +sub v19.4s, v28.4s, v29.4s +str q8, [x0, #0] +mul v18.4S, v18.4S,v4.s[1] +add v28.4s, v28.4s, v29.4s +mul v26.4S, v26.4S,v4.s[2] +str q19, [x0, #48] +mla v18.4S, v15.4S, v31.s[0] +sub v15.4s, v21.4s, v5.4s +mla v26.4S, v14.4S, v31.s[0] +str q28, [x0, #32] +mul v9.4S, v9.4S,v13.s[1] +str q15, [x0, #80] +mul v20.4S, v20.4S,v13.s[2] +add v21.4s, v21.4s, v5.4s +str q21, [x0, #64] +mla v9.4S, v6.4S, v31.s[0] +sub v6.4s, v1.4s, v12.4s +str q6, [x0, #112] +mla v20.4S, v11.4S, v31.s[0] +add v1.4s, v1.4s, v12.4s +str q1, [x0, #96] +sqrdmulh v30.4S, v17.4S, v24.s[0] +sub v13.4s, v27.4s, v18.4s +mul v17.4S, v17.4S,v7.s[0] +str q13, [x0, #144] +ldr q13, [x0, #304] +sqrdmulh v1.4S, v13.4S, v24.s[0] +add v27.4s, v27.4s, v18.4s +mul v13.4S, v13.4S,v7.s[0] +str q27, [x0, #128] +ldr q27, [x17, #+288] +ldr q18, [x17, #+304] +ldr q12, [x0, #352] +sqrdmulh v11.4S, v12.4S, v18.s[0] +sub v6.4s, v0.4s, v26.4s +mul v12.4S, v12.4S,v27.s[0] +str q6, [x0, #176] +ldr q6, [x0, #368] +sqrdmulh v21.4S, v6.4S, v18.s[0] +add v0.4s, v0.4s, v26.4s 
+mul v6.4S, v6.4S,v27.s[0] +str q0, [x0, #160] +ldr q0, [x17, #+320] +ldr q26, [x17, #+336] +mla v17.4S, v30.4S, v31.s[0] +sub v30.4s, v16.4s, v9.4s +sqrdmulh v5.4S, v23.4S, v26.s[0] +str q30, [x0, #208] +ldr q30, [x0, #432] +mla v13.4S, v1.4S, v31.s[0] +add v16.4s, v16.4s, v9.4s +sqrdmulh v9.4S, v30.4S, v26.s[0] +str q16, [x0, #192] +ldr q16, [x17, #+352] +ldr q1, [x17, #+368] +mla v12.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v20.4s +sqrdmulh v15.4S, v22.4S, v1.s[0] +str q11, [x0, #240] +ldr q11, [x0, #496] +mla v6.4S, v21.4S, v31.s[0] +add v10.4s, v10.4s, v20.4s +sqrdmulh v20.4S, v11.4S, v1.s[0] +str q10, [x0, #224] +ldr q10, [x0, #256] +ldr q21, [x0, #384] +mul v23.4S, v23.4S,v0.s[0] +sub v2.4s, v10.4s, v17.4s +ldr q4, [x0, #272] +mul v30.4S, v30.4S,v0.s[0] +add v10.4s, v10.4s, v17.4s +ldr q17, [x0, #400] +mla v23.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v13.4s +ldr q28, [x0, #320] +mla v30.4S, v9.4S, v31.s[0] +add v4.4s, v4.4s, v13.4s +ldr q13, [x0, #448] +mul v22.4S, v22.4S,v16.s[0] +sub v9.4s, v28.4s, v12.4s +ldr q14, [x0, #336] +mul v11.4S, v11.4S,v16.s[0] +add v28.4s, v28.4s, v12.4s +ldr q12, [x0, #464] +mla v22.4S, v15.4S, v31.s[0] +mla v11.4S, v20.4S, v31.s[0] +sub v20.4s, v14.4s, v6.4s +sqrdmulh v15.4S, v4.4S, v24.s[1] +add v14.4s, v14.4s, v6.4s +mul v4.4S, v4.4S,v7.s[1] +sqrdmulh v6.4S, v5.4S, v24.s[2] +sub v19.4s, v21.4s, v23.4s +mul v5.4S, v5.4S,v7.s[2] +add v21.4s, v21.4s, v23.4s +sqrdmulh v24.4S, v14.4S, v18.s[1] +sub v7.4s, v17.4s, v30.4s +mul v14.4S, v14.4S,v27.s[1] +add v17.4s, v17.4s, v30.4s +sqrdmulh v30.4S, v20.4S, v18.s[2] +sub v23.4s, v13.4s, v22.4s +mul v20.4S, v20.4S,v27.s[2] +add v13.4s, v13.4s, v22.4s +mla v4.4S, v15.4S, v31.s[0] +sub v15.4s, v12.4s, v11.4s +ldr q18, [x0, #736] +sqrdmulh v27.4S, v17.4S, v26.s[1] +add v12.4s, v12.4s, v11.4s +mla v5.4S, v6.4S, v31.s[0] +ldr q6, [x0, #672] +sqrdmulh v11.4S, v7.4S, v26.s[2] +sub v22.4s, v10.4s, v4.4s +mla v14.4S, v24.4S, v31.s[0] +ldr q24, [x0, #544] +sqrdmulh v29.4S, v12.4S, v1.s[1] +add 
v10.4s, v10.4s, v4.4s +str q22, [x0, #272] +mla v20.4S, v30.4S, v31.s[0] +ldr q30, [x17, #+384] +ldr q22, [x17, #+400] +sqrdmulh v4.4S, v15.4S, v1.s[2] +sub v8.4s, v2.4s, v5.4s +str q10, [x0, #256] +mul v17.4S, v17.4S,v0.s[1] +add v2.4s, v2.4s, v5.4s +mul v7.4S, v7.4S,v0.s[2] +str q8, [x0, #304] +mla v17.4S, v27.4S, v31.s[0] +sub v27.4s, v28.4s, v14.4s +mla v7.4S, v11.4S, v31.s[0] +str q2, [x0, #288] +mul v12.4S, v12.4S,v16.s[1] +str q27, [x0, #336] +mul v15.4S, v15.4S,v16.s[2] +add v28.4s, v28.4s, v14.4s +str q28, [x0, #320] +mla v12.4S, v29.4S, v31.s[0] +sub v29.4s, v9.4s, v20.4s +str q29, [x0, #368] +mla v15.4S, v4.4S, v31.s[0] +add v9.4s, v9.4s, v20.4s +str q9, [x0, #352] +sqrdmulh v1.4S, v24.4S, v22.s[0] +sub v16.4s, v21.4s, v17.4s +mul v24.4S, v24.4S,v30.s[0] +str q16, [x0, #400] +ldr q16, [x0, #560] +sqrdmulh v9.4S, v16.4S, v22.s[0] +add v21.4s, v21.4s, v17.4s +mul v16.4S, v16.4S,v30.s[0] +str q21, [x0, #384] +ldr q21, [x17, #+416] +ldr q17, [x17, #+432] +ldr q20, [x0, #608] +sqrdmulh v4.4S, v20.4S, v17.s[0] +sub v29.4s, v19.4s, v7.4s +mul v20.4S, v20.4S,v21.s[0] +str q29, [x0, #432] +ldr q29, [x0, #624] +sqrdmulh v28.4S, v29.4S, v17.s[0] +add v19.4s, v19.4s, v7.4s +mul v29.4S, v29.4S,v21.s[0] +str q19, [x0, #416] +ldr q19, [x17, #+448] +ldr q7, [x17, #+464] +mla v24.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v12.4s +sqrdmulh v14.4S, v6.4S, v7.s[0] +str q1, [x0, #464] +ldr q1, [x0, #688] +mla v16.4S, v9.4S, v31.s[0] +add v13.4s, v13.4s, v12.4s +sqrdmulh v12.4S, v1.4S, v7.s[0] +str q13, [x0, #448] +ldr q13, [x17, #+480] +ldr q9, [x17, #+496] +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v23.4s, v15.4s +sqrdmulh v27.4S, v18.4S, v9.s[0] +str q4, [x0, #496] +ldr q4, [x0, #752] +mla v29.4S, v28.4S, v31.s[0] +add v23.4s, v23.4s, v15.4s +sqrdmulh v15.4S, v4.4S, v9.s[0] +str q23, [x0, #480] +ldr q23, [x0, #512] +ldr q28, [x0, #640] +mul v6.4S, v6.4S,v19.s[0] +sub v26.4s, v23.4s, v24.4s +ldr q0, [x0, #528] +mul v1.4S, v1.4S,v19.s[0] +add v23.4s, v23.4s, v24.4s +ldr q24, [x0, 
#656] +mla v6.4S, v14.4S, v31.s[0] +sub v14.4s, v0.4s, v16.4s +ldr q2, [x0, #576] +mla v1.4S, v12.4S, v31.s[0] +add v0.4s, v0.4s, v16.4s +ldr q16, [x0, #704] +mul v18.4S, v18.4S,v13.s[0] +sub v12.4s, v2.4s, v20.4s +ldr q11, [x0, #592] +mul v4.4S, v4.4S,v13.s[0] +add v2.4s, v2.4s, v20.4s +ldr q20, [x0, #720] +mla v18.4S, v27.4S, v31.s[0] +mla v4.4S, v15.4S, v31.s[0] +sub v15.4s, v11.4s, v29.4s +sqrdmulh v27.4S, v0.4S, v22.s[1] +add v11.4s, v11.4s, v29.4s +mul v0.4S, v0.4S,v30.s[1] +sqrdmulh v29.4S, v14.4S, v22.s[2] +sub v8.4s, v28.4s, v6.4s +mul v14.4S, v14.4S,v30.s[2] +add v28.4s, v28.4s, v6.4s +sqrdmulh v22.4S, v11.4S, v17.s[1] +sub v30.4s, v24.4s, v1.4s +mul v11.4S, v11.4S,v21.s[1] +add v24.4s, v24.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v17.s[2] +sub v6.4s, v16.4s, v18.4s +mul v15.4S, v15.4S,v21.s[2] +add v16.4s, v16.4s, v18.4s +mla v0.4S, v27.4S, v31.s[0] +sub v27.4s, v20.4s, v4.4s +ldr q17, [x0, #992] +sqrdmulh v21.4S, v24.4S, v7.s[1] +add v20.4s, v20.4s, v4.4s +mla v14.4S, v29.4S, v31.s[0] +ldr q29, [x0, #928] +sqrdmulh v4.4S, v30.4S, v7.s[2] +sub v18.4s, v23.4s, v0.4s +mla v11.4S, v22.4S, v31.s[0] +ldr q22, [x0, #800] +sqrdmulh v5.4S, v20.4S, v9.s[1] +add v23.4s, v23.4s, v0.4s +str q18, [x0, #528] +mla v15.4S, v1.4S, v31.s[0] +ldr q1, [x17, #+512] +ldr q18, [x17, #+528] +sqrdmulh v0.4S, v27.4S, v9.s[2] +sub v10.4s, v26.4s, v14.4s +str q23, [x0, #512] +mul v24.4S, v24.4S,v19.s[1] +add v26.4s, v26.4s, v14.4s +mul v30.4S, v30.4S,v19.s[2] +str q10, [x0, #560] +mla v24.4S, v21.4S, v31.s[0] +sub v21.4s, v2.4s, v11.4s +mla v30.4S, v4.4S, v31.s[0] +str q26, [x0, #544] +mul v20.4S, v20.4S,v13.s[1] +str q21, [x0, #592] +mul v27.4S, v27.4S,v13.s[2] +add v2.4s, v2.4s, v11.4s +str q2, [x0, #576] +mla v20.4S, v5.4S, v31.s[0] +sub v5.4s, v12.4s, v15.4s +str q5, [x0, #624] +mla v27.4S, v0.4S, v31.s[0] +add v12.4s, v12.4s, v15.4s +str q12, [x0, #608] +sqrdmulh v9.4S, v22.4S, v18.s[0] +sub v13.4s, v28.4s, v24.4s +mul v22.4S, v22.4S,v1.s[0] +str q13, [x0, #656] +ldr q13, [x0, #816] 
+sqrdmulh v12.4S, v13.4S, v18.s[0] +add v28.4s, v28.4s, v24.4s +mul v13.4S, v13.4S,v1.s[0] +str q28, [x0, #640] +ldr q28, [x17, #+544] +ldr q24, [x17, #+560] +ldr q15, [x0, #864] +sqrdmulh v0.4S, v15.4S, v24.s[0] +sub v5.4s, v8.4s, v30.4s +mul v15.4S, v15.4S,v28.s[0] +str q5, [x0, #688] +ldr q5, [x0, #880] +sqrdmulh v2.4S, v5.4S, v24.s[0] +add v8.4s, v8.4s, v30.4s +mul v5.4S, v5.4S,v28.s[0] +str q8, [x0, #672] +ldr q8, [x17, #+576] +ldr q30, [x17, #+592] +mla v22.4S, v9.4S, v31.s[0] +sub v9.4s, v16.4s, v20.4s +sqrdmulh v11.4S, v29.4S, v30.s[0] +str q9, [x0, #720] +ldr q9, [x0, #944] +mla v13.4S, v12.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v9.4S, v30.s[0] +str q16, [x0, #704] +ldr q16, [x17, #+608] +ldr q12, [x17, #+624] +mla v15.4S, v0.4S, v31.s[0] +sub v0.4s, v6.4s, v27.4s +sqrdmulh v21.4S, v17.4S, v12.s[0] +str q0, [x0, #752] +ldr q0, [x0, #1008] +mla v5.4S, v2.4S, v31.s[0] +add v6.4s, v6.4s, v27.4s +sqrdmulh v27.4S, v0.4S, v12.s[0] +str q6, [x0, #736] +ldr q6, [x0, #768] +ldr q2, [x0, #896] +mul v29.4S, v29.4S,v8.s[0] +sub v7.4s, v6.4s, v22.4s +ldr q19, [x0, #784] +mul v9.4S, v9.4S,v8.s[0] +add v6.4s, v6.4s, v22.4s +ldr q22, [x0, #912] +mla v29.4S, v11.4S, v31.s[0] +sub v11.4s, v19.4s, v13.4s +ldr q26, [x0, #832] +mla v9.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v13.4s +ldr q13, [x0, #960] +mul v17.4S, v17.4S,v16.s[0] +sub v20.4s, v26.4s, v15.4s +ldr q4, [x0, #848] +mul v0.4S, v0.4S,v16.s[0] +add v26.4s, v26.4s, v15.4s +ldr q15, [x0, #976] +mla v17.4S, v21.4S, v31.s[0] +mla v0.4S, v27.4S, v31.s[0] +sub v27.4s, v4.4s, v5.4s +sqrdmulh v21.4S, v19.4S, v18.s[1] +add v4.4s, v4.4s, v5.4s +mul v19.4S, v19.4S,v1.s[1] +sqrdmulh v5.4S, v11.4S, v18.s[2] +sub v10.4s, v2.4s, v29.4s +mul v11.4S, v11.4S,v1.s[2] +add v2.4s, v2.4s, v29.4s +sqrdmulh v18.4S, v4.4S, v24.s[1] +sub v1.4s, v22.4s, v9.4s +mul v4.4S, v4.4S,v28.s[1] +add v22.4s, v22.4s, v9.4s +sqrdmulh v9.4S, v27.4S, v24.s[2] +sub v29.4s, v13.4s, v17.4s +mul v27.4S, v27.4S,v28.s[2] +add v13.4s, v13.4s, 
v17.4s +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v0.4s +sqrdmulh v24.4S, v22.4S, v30.s[1] +add v15.4s, v15.4s, v0.4s +mla v11.4S, v5.4S, v31.s[0] +sqrdmulh v5.4S, v1.4S, v30.s[2] +sub v0.4s, v6.4s, v19.4s +mla v4.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v15.4S, v12.s[1] +add v6.4s, v6.4s, v19.4s +str q0, [x0, #784] +mla v27.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v21.4S, v12.s[2] +sub v0.4s, v7.4s, v11.4s +str q6, [x0, #768] +mul v22.4S, v22.4S,v8.s[1] +add v7.4s, v7.4s, v11.4s +mul v1.4S, v1.4S,v8.s[2] +str q0, [x0, #816] +mla v22.4S, v24.4S, v31.s[0] +sub v24.4s, v26.4s, v4.4s +mla v1.4S, v5.4S, v31.s[0] +str q7, [x0, #800] +mul v15.4S, v15.4S,v16.s[1] +str q24, [x0, #848] +mul v21.4S, v21.4S,v16.s[2] +add v26.4s, v26.4s, v4.4s +str q26, [x0, #832] +mla v15.4S, v18.4S, v31.s[0] +sub v18.4s, v20.4s, v27.4s +str q18, [x0, #880] +mla v21.4S, v9.4S, v31.s[0] +add v20.4s, v20.4s, v27.4s +str q20, [x0, #864] +sub v12.4s, v2.4s, v22.4s +str q12, [x0, #912] +add v2.4s, v2.4s, v22.4s +str q2, [x0, #896] +sub v2.4s, v10.4s, v1.4s +str q2, [x0, #944] +add v10.4s, v10.4s, v1.4s +str q10, [x0, #928] +sub v10.4s, v13.4s, v15.4s +str q10, [x0, #976] +add v13.4s, v13.4s, v15.4s +str q13, [x0, #960] +sub v13.4s, v29.4s, v21.4s +str q13, [x0, #1008] +add v29.4s, v29.4s, v21.4s +str q29, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1528 +// Instruction count: 1524 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_10.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_10.s new file mode 100644 index 0000000..9f6d143 --- /dev/null +++ 
b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_10.s @@ -0,0 +1,1550 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global 
ntt_u32_incomplete_neon_asm_var_4_2_22_z4_10 +.global _ntt_u32_incomplete_neon_asm_var_4_2_22_z4_10 +ntt_u32_incomplete_neon_asm_var_4_2_22_z4_10: +_ntt_u32_incomplete_neon_asm_var_4_2_22_z4_10: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x0, #992] +sqrdmulh v27.4S, v28.4S, v29.s[0] +mul v28.4S, v28.4S,v30.s[0] +ldr q26, [x0, #928] +sqrdmulh v25.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +ldr q24, [x0, #864] +sqrdmulh v23.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +ldr q22, [x0, #800] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #736] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mla v28.4S, v27.4S, v31.s[0] +ldr q27, [x0, #672] +sqrdmulh v18.4S, v27.4S, v29.s[0] +mla v26.4S, v25.4S, v31.s[0] +ldr q25, [x0, #608] +sqrdmulh v17.4S, v25.4S, v29.s[0] +mla v24.4S, v23.4S, v31.s[0] +ldr q23, [x0, #544] +sqrdmulh v16.4S, v23.4S, v29.s[0] +mla v22.4S, v21.4S, v31.s[0] +ldr q21, [x0, #480] +mul v27.4S, v27.4S,v30.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q3, [x0, #416] +ldr q2, [x0, #352] +ldr q1, [x0, #288] +mla v27.4S, v18.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +ldr q19, [x0, #224] +ldr q18, [x0, #160] +mul v23.4S, v23.4S,v30.s[0] +mul v25.4S, v25.4S,v30.s[0] +ldr q0, [x0, #96] +ldr q15, [x0, #32] +mla v23.4S, v16.4S, v31.s[0] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +sqrdmulh v28.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v16.4s, v3.4s, v26.4s +add v3.4s, v3.4s, v26.4s +sqrdmulh v26.4S, v16.4S, v29.s[2] +mul v16.4S, 
v16.4S,v30.s[2] +sub v14.4s, v2.4s, v24.4s +add v2.4s, v2.4s, v24.4s +sqrdmulh v24.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v13.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v12.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +sqrdmulh v20.4S, v14.4S, v29.s[2] +mla v17.4S, v28.4S, v31.s[0] +sub v28.4s, v18.4s, v27.4s +add v18.4s, v18.4s, v27.4s +sqrdmulh v27.4S, v13.4S, v29.s[2] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v0.4s, v25.4s +add v0.4s, v0.4s, v25.4s +sqrdmulh v25.4S, v2.4S, v29.s[1] +mla v21.4S, v24.4S, v31.s[0] +sub v24.4s, v15.4s, v23.4s +sqrdmulh v11.4S, v1.4S, v29.s[1] +mla v3.4S, v22.4S, v31.s[0] +add v15.4s, v15.4s, v23.4s +ldr q23, [x17, #+32] +ldr q22, [x17, #+48] +mul v13.4S, v13.4S,v30.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v10.4s, v12.4s, v17.4s +add v12.4s, v12.4s, v17.4s +mla v13.4S, v27.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +sub v20.4s, v28.4s, v16.4s +add v28.4s, v28.4s, v16.4s +mul v1.4S, v1.4S,v30.s[1] +mul v2.4S, v2.4S,v30.s[1] +sub v16.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +mla v1.4S, v11.4S, v31.s[0] +mla v2.4S, v25.4S, v31.s[0] +sub v25.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v22.s[3] +mul v10.4S, v10.4S,v23.s[3] +sub v11.4s, v26.4s, v14.4s +add v26.4s, v26.4s, v14.4s +sqrdmulh v14.4S, v12.4S, v22.s[2] +mul v12.4S, v12.4S,v23.s[2] +sub v21.4s, v24.4s, v13.4s +add v24.4s, v24.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v22.s[1] +mul v16.4S, v16.4S,v23.s[1] +sub v27.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v17.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s +ldr q1, [x17, #+96] +ldr q9, [x17, #+112] +sqrdmulh v8.4S, v20.4S, v22.s[3] +mla v10.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v28.4S, v22.s[2] +mla v12.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v25.4S, v22.s[1] +mla v16.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v18.4S, v22.s[0] +mla 
v19.4S, v2.4S, v31.s[0] +nop +nop +ldr q2, [x17, #+64] +ldr q7, [x17, #+80] +mul v28.4S, v28.4S,v23.s[2] +mul v20.4S, v20.4S,v23.s[3] +sub v6.4s, v11.4s, v10.4s +add v11.4s, v11.4s, v10.4s +mla v28.4S, v3.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v26.4s, v12.4s +add v26.4s, v26.4s, v12.4s +mul v18.4S, v18.4S,v23.s[0] +mul v25.4S, v25.4S,v23.s[1] +sub v12.4s, v27.4s, v16.4s +add v27.4s, v27.4s, v16.4s +mla v18.4S, v13.4S, v31.s[0] +mla v25.4S, v14.4S, v31.s[0] +sub v14.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v9.s[3] +mul v6.4S, v6.4S,v1.s[3] +sub v13.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v11.4S, v9.s[2] +mul v11.4S, v11.4S,v1.s[2] +sub v16.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +sqrdmulh v28.4S, v8.4S, v9.s[1] +mul v8.4S, v8.4S,v1.s[1] +sub v3.4s, v17.4s, v25.4s +add v17.4s, v17.4s, v25.4s +sqrdmulh v25.4S, v26.4S, v9.s[0] +mul v26.4S, v26.4S,v1.s[0] +sub v10.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v7.s[3] +mla v6.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v27.4S, v7.s[2] +mla v11.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v14.4S, v7.s[1] +mla v8.4S, v28.4S, v31.s[0] +nop +nop +sqrdmulh v28.4S, v0.4S, v7.s[0] +mla v26.4S, v25.4S, v31.s[0] +nop +nop +mul v27.4S, v27.4S,v2.s[2] +mul v12.4S, v12.4S,v2.s[3] +sub v25.4s, v13.4s, v6.4s +str q25, [x0, #992] +mla v27.4S, v19.4S, v31.s[0] +mla v12.4S, v18.4S, v31.s[0] +add v13.4s, v13.4s, v6.4s +str q13, [x0, #928] +mul v0.4S, v0.4S,v2.s[0] +mul v14.4S, v14.4S,v2.s[1] +sub v13.4s, v21.4s, v11.4s +str q13, [x0, #864] +mla v0.4S, v28.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sub v11.4s, v16.4s, v8.4s +ldr q20, [x0, #1008] +sqrdmulh v28.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v8.4s +str q21, [x0, #800] +ldr q21, [x0, #944] +sqrdmulh v8.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v13.4s, v24.4s, v26.4s +str q11, [x0, #736] +ldr q11, [x0, 
#880] +sqrdmulh v6.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +add v24.4s, v24.4s, v26.4s +str q16, [x0, #672] +ldr q16, [x0, #816] +sqrdmulh v26.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v18.4s, v3.4s, v12.4s +str q13, [x0, #608] +ldr q13, [x0, #752] +sqrdmulh v19.4S, v13.4S, v29.s[0] +mla v20.4S, v28.4S, v31.s[0] +add v3.4s, v3.4s, v12.4s +str q24, [x0, #544] +ldr q24, [x0, #688] +sqrdmulh v12.4S, v24.4S, v29.s[0] +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v17.4s, v27.4s +str q18, [x0, #480] +ldr q18, [x0, #624] +sqrdmulh v28.4S, v18.4S, v29.s[0] +mla v11.4S, v6.4S, v31.s[0] +add v17.4s, v17.4s, v27.4s +str q3, [x0, #416] +ldr q3, [x0, #560] +sqrdmulh v27.4S, v3.4S, v29.s[0] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v10.4s, v14.4s +str q8, [x0, #352] +ldr q8, [x0, #496] +add v10.4s, v10.4s, v14.4s +mul v24.4S, v24.4S,v30.s[0] +mul v13.4S, v13.4S,v30.s[0] +ldr q14, [x0, #432] +str q17, [x0, #288] +ldr q17, [x0, #368] +ldr q6, [x0, #304] +mla v24.4S, v12.4S, v31.s[0] +mla v13.4S, v19.4S, v31.s[0] +str q26, [x0, #224] +sub v26.4s, v15.4s, v0.4s +ldr q19, [x0, #240] +ldr q12, [x0, #176] +mul v3.4S, v3.4S,v30.s[0] +mul v18.4S, v18.4S,v30.s[0] +str q10, [x0, #160] +add v15.4s, v15.4s, v0.4s +ldr q0, [x0, #112] +ldr q10, [x0, #48] +mla v3.4S, v27.4S, v31.s[0] +mla v18.4S, v28.4S, v31.s[0] +sub v28.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +sqrdmulh v20.4S, v28.4S, v29.s[2] +mul v28.4S, v28.4S,v30.s[2] +sub v27.4s, v14.4s, v21.4s +add v14.4s, v14.4s, v21.4s +sqrdmulh v21.4S, v27.4S, v29.s[2] +mul v27.4S, v27.4S,v30.s[2] +sub v25.4s, v17.4s, v11.4s +add v17.4s, v17.4s, v11.4s +sqrdmulh v11.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v5.4s, v6.4s, v16.4s +add v6.4s, v6.4s, v16.4s +sqrdmulh v16.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +sub v4.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +sqrdmulh v13.4S, v25.4S, v29.s[2] +mla v28.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v24.4s +add v12.4s, v12.4s, v24.4s +sqrdmulh v24.4S, v5.4S, 
v29.s[2] +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v18.4s +add v0.4s, v0.4s, v18.4s +sqrdmulh v18.4S, v17.4S, v29.s[1] +mla v8.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v3.4s +str q26, [x0, #96] +sqrdmulh v26.4S, v6.4S, v29.s[1] +mla v14.4S, v16.4S, v31.s[0] +add v10.4s, v10.4s, v3.4s +str q15, [x0, #32] +mul v5.4S, v5.4S,v30.s[2] +mul v25.4S, v25.4S,v30.s[2] +sub v15.4s, v4.4s, v28.4s +add v4.4s, v4.4s, v28.4s +mla v5.4S, v24.4S, v31.s[0] +mla v25.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v27.4s +add v20.4s, v20.4s, v27.4s +mul v6.4S, v6.4S,v30.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v27.4s, v19.4s, v8.4s +add v19.4s, v19.4s, v8.4s +mla v6.4S, v26.4S, v31.s[0] +mla v17.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v14.4s +add v12.4s, v12.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v22.s[3] +mul v15.4S, v15.4S,v23.s[3] +sub v26.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v4.4S, v22.s[2] +mul v4.4S, v4.4S,v23.s[2] +sub v8.4s, v11.4s, v5.4s +add v11.4s, v11.4s, v5.4s +sqrdmulh v5.4S, v27.4S, v22.s[1] +mul v27.4S, v27.4S,v23.s[1] +sub v24.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v28.4s, v10.4s, v6.4s +add v10.4s, v10.4s, v6.4s +sqrdmulh v6.4S, v13.4S, v22.s[3] +mla v15.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v20.4S, v22.s[2] +mla v4.4S, v25.4S, v31.s[0] +nop +nop +sqrdmulh v25.4S, v18.4S, v22.s[1] +mla v27.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v12.4S, v22.s[0] +mla v19.4S, v17.4S, v31.s[0] +nop +nop +mul v20.4S, v20.4S,v23.s[2] +mul v13.4S, v13.4S,v23.s[3] +sub v17.4s, v26.4s, v15.4s +add v26.4s, v26.4s, v15.4s +mla v20.4S, v14.4S, v31.s[0] +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v21.4s, v4.4s +add v21.4s, v21.4s, v4.4s +mul v12.4S, v12.4S,v23.s[0] +mul v18.4S, v18.4S,v23.s[1] +sub v4.4s, v24.4s, v27.4s +add v24.4s, v24.4s, v27.4s +mla v12.4S, v5.4S, v31.s[0] +mla v18.4S, v25.4S, v31.s[0] +sub v25.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, 
v17.4S, v9.s[3] +mul v17.4S, v17.4S,v1.s[3] +sub v5.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +sqrdmulh v13.4S, v26.4S, v9.s[2] +mul v26.4S, v26.4S,v1.s[2] +sub v27.4s, v11.4s, v20.4s +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v6.4S, v9.s[1] +mul v6.4S, v6.4S,v1.s[1] +sub v14.4s, v28.4s, v18.4s +add v28.4s, v28.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v9.s[0] +mul v21.4S, v21.4S,v1.s[0] +sub v15.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v4.4S, v7.s[3] +mla v17.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v24.4S, v7.s[2] +mla v26.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v25.4S, v7.s[1] +mla v6.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v0.4S, v7.s[0] +mla v21.4S, v18.4S, v31.s[0] +nop +nop +mul v24.4S, v24.4S,v2.s[2] +mul v4.4S, v4.4S,v2.s[3] +sub v18.4s, v5.4s, v17.4s +str q18, [x0, #1008] +mla v24.4S, v19.4S, v31.s[0] +mla v4.4S, v12.4S, v31.s[0] +add v5.4s, v5.4s, v17.4s +str q5, [x0, #944] +mul v0.4S, v0.4S,v2.s[0] +mul v25.4S, v25.4S,v2.s[1] +sub v5.4s, v8.4s, v26.4s +str q5, [x0, #880] +mla v0.4S, v20.4S, v31.s[0] +mla v25.4S, v13.4S, v31.s[0] +add v8.4s, v8.4s, v26.4s +sub v26.4s, v27.4s, v6.4s +ldr q13, [x0, #960] +sqrdmulh v20.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +add v27.4s, v27.4s, v6.4s +str q8, [x0, #816] +ldr q8, [x0, #896] +sqrdmulh v6.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v5.4s, v11.4s, v21.4s +str q26, [x0, #752] +ldr q26, [x0, #832] +sqrdmulh v17.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +add v11.4s, v11.4s, v21.4s +str q27, [x0, #688] +ldr q27, [x0, #768] +sqrdmulh v21.4S, v27.4S, v29.s[0] +mul v27.4S, v27.4S,v30.s[0] +sub v12.4s, v14.4s, v4.4s +str q5, [x0, #624] +ldr q5, [x0, #704] +sqrdmulh v19.4S, v5.4S, v29.s[0] +mla v13.4S, v20.4S, v31.s[0] +add v14.4s, v14.4s, v4.4s +str q11, [x0, #560] +ldr q11, [x0, #640] +sqrdmulh v4.4S, v11.4S, v29.s[0] +mla v8.4S, v6.4S, v31.s[0] +sub v6.4s, v28.4s, v24.4s +str q12, [x0, #496] +ldr q12, [x0, #576] +sqrdmulh v20.4S, v12.4S, 
v29.s[0] +mla v26.4S, v17.4S, v31.s[0] +add v28.4s, v28.4s, v24.4s +str q14, [x0, #432] +ldr q14, [x0, #512] +sqrdmulh v24.4S, v14.4S, v29.s[0] +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v25.4s +str q6, [x0, #368] +ldr q6, [x0, #448] +add v15.4s, v15.4s, v25.4s +mul v11.4S, v11.4S,v30.s[0] +mul v5.4S, v5.4S,v30.s[0] +ldr q25, [x0, #384] +str q28, [x0, #304] +ldr q28, [x0, #320] +ldr q17, [x0, #256] +mla v11.4S, v4.4S, v31.s[0] +mla v5.4S, v19.4S, v31.s[0] +str q21, [x0, #240] +sub v21.4s, v10.4s, v0.4s +ldr q19, [x0, #192] +ldr q4, [x0, #128] +mul v14.4S, v14.4S,v30.s[0] +mul v12.4S, v12.4S,v30.s[0] +str q15, [x0, #176] +add v10.4s, v10.4s, v0.4s +ldr q0, [x0, #64] +ldr q15, [x0, #0] +mla v14.4S, v24.4S, v31.s[0] +mla v12.4S, v20.4S, v31.s[0] +sub v20.4s, v6.4s, v13.4s +add v6.4s, v6.4s, v13.4s +sqrdmulh v13.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v24.4s, v25.4s, v8.4s +add v25.4s, v25.4s, v8.4s +sqrdmulh v8.4S, v24.4S, v29.s[2] +mul v24.4S, v24.4S,v30.s[2] +sub v18.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sqrdmulh v26.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +sub v3.4s, v17.4s, v27.4s +add v17.4s, v17.4s, v27.4s +sqrdmulh v27.4S, v25.4S, v29.s[1] +mul v25.4S, v25.4S,v30.s[1] +sub v16.4s, v19.4s, v5.4s +add v19.4s, v19.4s, v5.4s +sqrdmulh v5.4S, v18.4S, v29.s[2] +mla v20.4S, v13.4S, v31.s[0] +sub v13.4s, v4.4s, v11.4s +add v4.4s, v4.4s, v11.4s +sqrdmulh v11.4S, v3.4S, v29.s[2] +mla v24.4S, v8.4S, v31.s[0] +sub v8.4s, v0.4s, v12.4s +add v0.4s, v0.4s, v12.4s +sqrdmulh v12.4S, v28.4S, v29.s[1] +mla v6.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v14.4s +str q21, [x0, #112] +sqrdmulh v21.4S, v17.4S, v29.s[1] +mla v25.4S, v27.4S, v31.s[0] +add v15.4s, v15.4s, v14.4s +str q10, [x0, #48] +mul v3.4S, v3.4S,v30.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v10.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +mla v3.4S, v11.4S, v31.s[0] +mla v18.4S, v5.4S, v31.s[0] +sub v5.4s, v13.4s, v24.4s +add v13.4s, v13.4s, v24.4s +mul v17.4S, v17.4S,v30.s[1] 
+mul v28.4S, v28.4S,v30.s[1] +sub v24.4s, v19.4s, v6.4s +add v19.4s, v19.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v28.4S, v12.4S, v31.s[0] +sub v12.4s, v4.4s, v25.4s +add v4.4s, v4.4s, v25.4s +sqrdmulh v25.4S, v10.4S, v22.s[3] +mul v10.4S, v10.4S,v23.s[3] +sub v21.4s, v8.4s, v18.4s +add v8.4s, v8.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v22.s[2] +mul v16.4S, v16.4S,v23.s[2] +sub v6.4s, v26.4s, v3.4s +add v26.4s, v26.4s, v3.4s +sqrdmulh v3.4S, v24.4S, v22.s[1] +mul v24.4S, v24.4S,v23.s[1] +sub v11.4s, v0.4s, v28.4s +add v0.4s, v0.4s, v28.4s +sqrdmulh v28.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v20.4s, v15.4s, v17.4s +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v22.s[3] +mla v10.4S, v25.4S, v31.s[0] +nop +nop +sqrdmulh v25.4S, v13.4S, v22.s[2] +mla v16.4S, v18.4S, v31.s[0] +nop +nop +sqrdmulh v18.4S, v12.4S, v22.s[1] +mla v24.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v4.4S, v22.s[0] +mla v19.4S, v28.4S, v31.s[0] +nop +nop +mul v13.4S, v13.4S,v23.s[2] +mul v5.4S, v5.4S,v23.s[3] +sub v28.4s, v21.4s, v10.4s +add v21.4s, v21.4s, v10.4s +mla v13.4S, v25.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +sub v17.4s, v8.4s, v16.4s +add v8.4s, v8.4s, v16.4s +mul v4.4S, v4.4S,v23.s[0] +mul v12.4S, v12.4S,v23.s[1] +sub v16.4s, v11.4s, v24.4s +add v11.4s, v11.4s, v24.4s +mla v4.4S, v3.4S, v31.s[0] +mla v12.4S, v18.4S, v31.s[0] +sub v18.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v28.4S, v9.s[3] +mul v28.4S, v28.4S,v1.s[3] +sub v3.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v21.4S, v9.s[2] +mul v21.4S, v21.4S,v1.s[2] +sub v24.4s, v26.4s, v13.4s +add v26.4s, v26.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v9.s[1] +mul v17.4S, v17.4S,v1.s[1] +sub v25.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +sqrdmulh v12.4S, v8.4S, v9.s[0] +mul v8.4S, v8.4S,v1.s[0] +sub v10.4s, v15.4s, v4.4s +add v15.4s, v15.4s, v4.4s +sqrdmulh v4.4S, v16.4S, v7.s[3] +mla v28.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v11.4S, v7.s[2] +mla v21.4S, v5.4S, 
v31.s[0] +nop +nop +sqrdmulh v5.4S, v18.4S, v7.s[1] +mla v17.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v0.4S, v7.s[0] +mla v8.4S, v12.4S, v31.s[0] +nop +nop +mul v11.4S, v11.4S,v2.s[2] +mul v16.4S, v16.4S,v2.s[3] +sub v12.4s, v3.4s, v28.4s +str q12, [x0, #960] +mla v11.4S, v19.4S, v31.s[0] +mla v16.4S, v4.4S, v31.s[0] +add v3.4s, v3.4s, v28.4s +str q3, [x0, #896] +mul v0.4S, v0.4S,v2.s[0] +mul v18.4S, v18.4S,v2.s[1] +sub v3.4s, v6.4s, v21.4s +str q3, [x0, #832] +mla v0.4S, v13.4S, v31.s[0] +mla v18.4S, v5.4S, v31.s[0] +add v6.4s, v6.4s, v21.4s +sub v21.4s, v24.4s, v17.4s +ldr q5, [x0, #976] +sqrdmulh v13.4S, v5.4S, v29.s[0] +mul v5.4S, v5.4S,v30.s[0] +add v24.4s, v24.4s, v17.4s +str q6, [x0, #768] +ldr q6, [x0, #912] +sqrdmulh v17.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v3.4s, v26.4s, v8.4s +str q21, [x0, #704] +ldr q21, [x0, #848] +sqrdmulh v28.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +add v26.4s, v26.4s, v8.4s +str q24, [x0, #640] +ldr q24, [x0, #784] +sqrdmulh v8.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +sub v4.4s, v25.4s, v16.4s +str q3, [x0, #576] +ldr q3, [x0, #720] +sqrdmulh v19.4S, v3.4S, v29.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v25.4s, v25.4s, v16.4s +str q26, [x0, #512] +ldr q26, [x0, #656] +sqrdmulh v16.4S, v26.4S, v29.s[0] +mla v6.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v11.4s +str q4, [x0, #448] +ldr q4, [x0, #592] +sqrdmulh v13.4S, v4.4S, v29.s[0] +mla v21.4S, v28.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q25, [x0, #384] +ldr q25, [x0, #528] +sqrdmulh v11.4S, v25.4S, v29.s[0] +mla v24.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v18.4s +str q17, [x0, #320] +ldr q17, [x0, #464] +add v10.4s, v10.4s, v18.4s +mul v26.4S, v26.4S,v30.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q18, [x0, #400] +str q20, [x0, #256] +ldr q20, [x0, #336] +ldr q28, [x0, #272] +mla v26.4S, v16.4S, v31.s[0] +mla v3.4S, v19.4S, v31.s[0] +str q8, [x0, #192] +sub v8.4s, v15.4s, v0.4s +ldr q19, [x0, #208] +ldr q16, [x0, #144] +mul v25.4S, 
v25.4S,v30.s[0] +mul v4.4S, v4.4S,v30.s[0] +str q10, [x0, #128] +add v15.4s, v15.4s, v0.4s +ldr q0, [x0, #80] +ldr q10, [x0, #16] +mla v25.4S, v11.4S, v31.s[0] +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v5.4s +add v17.4s, v17.4s, v5.4s +sqrdmulh v5.4S, v13.4S, v29.s[2] +mul v13.4S, v13.4S,v30.s[2] +sub v11.4s, v18.4s, v6.4s +add v18.4s, v18.4s, v6.4s +sqrdmulh v6.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v12.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v28.4s, v24.4s +add v28.4s, v28.4s, v24.4s +sqrdmulh v24.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v27.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sqrdmulh v3.4S, v12.4S, v29.s[2] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v16.4s, v26.4s +add v16.4s, v16.4s, v26.4s +sqrdmulh v26.4S, v14.4S, v29.s[2] +mla v11.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v4.4s +add v0.4s, v0.4s, v4.4s +sqrdmulh v4.4S, v20.4S, v29.s[1] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v25.4s +str q8, [x0, #64] +sqrdmulh v8.4S, v28.4S, v29.s[1] +mla v18.4S, v24.4S, v31.s[0] +add v10.4s, v10.4s, v25.4s +str q15, [x0, #0] +mul v14.4S, v14.4S,v30.s[2] +mul v12.4S, v12.4S,v30.s[2] +sub v15.4s, v27.4s, v13.4s +add v27.4s, v27.4s, v13.4s +mla v14.4S, v26.4S, v31.s[0] +mla v12.4S, v3.4S, v31.s[0] +sub v3.4s, v5.4s, v11.4s +add v5.4s, v5.4s, v11.4s +mul v28.4S, v28.4S,v30.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v11.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +mla v28.4S, v8.4S, v31.s[0] +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v16.4s, v18.4s +add v16.4s, v16.4s, v18.4s +sqrdmulh v29.4S, v15.4S, v22.s[3] +mul v15.4S, v15.4S,v23.s[3] +sub v30.4s, v6.4s, v12.4s +add v6.4s, v6.4s, v12.4s +sqrdmulh v12.4S, v27.4S, v22.s[2] +mul v27.4S, v27.4S,v23.s[2] +sub v18.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v22.s[1] +mul v11.4S, v11.4S,v23.s[1] +sub v8.4s, v0.4s, v20.4s +add v0.4s, v0.4s, v20.4s +sqrdmulh v20.4S, 
v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v17.4s, v10.4s, v28.4s +add v10.4s, v10.4s, v28.4s +sqrdmulh v28.4S, v3.4S, v22.s[3] +mla v15.4S, v29.4S, v31.s[0] +nop +nop +sqrdmulh v29.4S, v5.4S, v22.s[2] +mla v27.4S, v12.4S, v31.s[0] +nop +nop +sqrdmulh v12.4S, v4.4S, v22.s[1] +mla v11.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v16.4S, v22.s[0] +mla v19.4S, v20.4S, v31.s[0] +nop +nop +mul v5.4S, v5.4S,v23.s[2] +mul v3.4S, v3.4S,v23.s[3] +sub v20.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +mla v5.4S, v29.4S, v31.s[0] +mla v3.4S, v28.4S, v31.s[0] +sub v28.4s, v6.4s, v27.4s +add v6.4s, v6.4s, v27.4s +mul v16.4S, v16.4S,v23.s[0] +mul v4.4S, v4.4S,v23.s[1] +sub v27.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v16.4S, v14.4S, v31.s[0] +mla v4.4S, v12.4S, v31.s[0] +sub v12.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v22.4S, v20.4S, v9.s[3] +mul v20.4S, v20.4S,v1.s[3] +sub v23.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v30.4S, v9.s[2] +mul v30.4S, v30.4S,v1.s[2] +sub v19.4s, v21.4s, v5.4s +add v21.4s, v21.4s, v5.4s +sqrdmulh v5.4S, v28.4S, v9.s[1] +mul v28.4S, v28.4S,v1.s[1] +sub v14.4s, v17.4s, v4.4s +add v17.4s, v17.4s, v4.4s +sqrdmulh v4.4S, v6.4S, v9.s[0] +mul v6.4S, v6.4S,v1.s[0] +sub v11.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sqrdmulh v9.4S, v27.4S, v7.s[3] +mla v20.4S, v22.4S, v31.s[0] +nop +nop +sqrdmulh v22.4S, v8.4S, v7.s[2] +mla v30.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v12.4S, v7.s[1] +mla v28.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v0.4S, v7.s[0] +mla v6.4S, v4.4S, v31.s[0] +nop +nop +mul v8.4S, v8.4S,v2.s[2] +mul v27.4S, v27.4S,v2.s[3] +sub v4.4s, v23.4s, v20.4s +str q4, [x0, #976] +mla v8.4S, v22.4S, v31.s[0] +mla v27.4S, v9.4S, v31.s[0] +add v23.4s, v23.4s, v20.4s +str q23, [x0, #912] +mul v0.4S, v0.4S,v2.s[0] +mul v12.4S, v12.4S,v2.s[1] +sub v23.4s, v18.4s, v30.4s +str q23, [x0, #848] +mla v0.4S, v5.4S, v31.s[0] +mla v12.4S, v3.4S, v31.s[0] +add v18.4s, v18.4s, v30.4s +sub v30.4s, 
v19.4s, v28.4s +add v19.4s, v19.4s, v28.4s +str q18, [x0, #784] +sub v18.4s, v21.4s, v6.4s +str q30, [x0, #720] +add v21.4s, v21.4s, v6.4s +str q19, [x0, #656] +sub v19.4s, v14.4s, v27.4s +str q18, [x0, #592] +add v14.4s, v14.4s, v27.4s +str q21, [x0, #528] +sub v21.4s, v17.4s, v8.4s +str q19, [x0, #464] +add v17.4s, v17.4s, v8.4s +str q14, [x0, #400] +sub v14.4s, v11.4s, v12.4s +str q21, [x0, #336] +add v11.4s, v11.4s, v12.4s +str q17, [x0, #272] +sub v17.4s, v10.4s, v0.4s +add v10.4s, v10.4s, v0.4s +ldr q24, [x0, #224] +ldr q25, [x0, #160] +ldr q13, [x0, #32] +ldr q26, [x17, #+128] +ldr q15, [x17, #+144] +sqrdmulh v29.4S, v13.4S, v15.s[0] +mul v13.4S, v13.4S,v26.s[0] +ldr q16, [x0, #48] +ldr q1, [x17, #+160] +sqrdmulh v4.4S, v16.4S, v15.s[0] +mul v16.4S, v16.4S,v26.s[0] +ldr q22, [x17, #+176] +ldr q9, [x0, #96] +sqrdmulh v20.4S, v9.4S, v22.s[0] +mul v9.4S, v9.4S,v1.s[0] +ldr q23, [x0, #112] +sqrdmulh v5.4S, v23.4S, v22.s[0] +mul v23.4S, v23.4S,v1.s[0] +ldr q3, [x17, #+192] +ldr q2, [x17, #+208] +mla v13.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v25.4S, v2.s[0] +ldr q7, [x0, #176] +mla v16.4S, v4.4S, v31.s[0] +sqrdmulh v4.4S, v7.4S, v2.s[0] +ldr q28, [x17, #+224] +ldr q30, [x17, #+240] +mla v9.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v24.4S, v30.s[0] +ldr q6, [x0, #240] +mla v23.4S, v5.4S, v31.s[0] +sqrdmulh v5.4S, v6.4S, v30.s[0] +ldr q18, [x0, #0] +ldr q27, [x0, #128] +mul v25.4S, v25.4S,v3.s[0] +mul v7.4S, v7.4S,v3.s[0] +mla v25.4S, v29.4S, v31.s[0] +mla v7.4S, v4.4S, v31.s[0] +sub v4.4s, v18.4s, v13.4s +ldr q29, [x0, #64] +add v18.4s, v18.4s, v13.4s +ldr q13, [x0, #192] +mul v24.4S, v24.4S,v28.s[0] +mul v6.4S, v6.4S,v28.s[0] +sub v19.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +mla v24.4S, v20.4S, v31.s[0] +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v29.4s, v9.4s +add v29.4s, v29.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v15.s[1] +mul v10.4S, v10.4S,v26.s[1] +sub v20.4s, v17.4s, v23.4s +add v17.4s, v17.4s, v23.4s +sqrdmulh v23.4S, v19.4S, v15.s[2] +mul v19.4S, 
v19.4S,v26.s[2] +sub v16.4s, v27.4s, v25.4s +add v27.4s, v27.4s, v25.4s +sqrdmulh v15.4S, v17.4S, v22.s[1] +mul v17.4S, v17.4S,v1.s[1] +sub v25.4s, v11.4s, v7.4s +add v11.4s, v11.4s, v7.4s +sqrdmulh v7.4S, v20.4S, v22.s[2] +mul v20.4S, v20.4S,v1.s[2] +sub v26.4s, v13.4s, v24.4s +add v13.4s, v13.4s, v24.4s +mla v10.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v11.4S, v2.s[1] +sub v22.4s, v14.4s, v6.4s +ldr q24, [x0, #480] +add v14.4s, v14.4s, v6.4s +mla v19.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v25.4S, v2.s[2] +sub v6.4s, v18.4s, v10.4s +ldr q1, [x0, #416] +str q6, [x0, #16] +mla v17.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v14.4S, v30.s[1] +add v18.4s, v18.4s, v10.4s +ldr q10, [x0, #288] +str q18, [x0, #0] +mla v20.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v22.4S, v30.s[2] +sub v18.4s, v4.4s, v19.4s +ldr q6, [x17, #+256] +str q18, [x0, #48] +mul v11.4S, v11.4S,v3.s[1] +mul v25.4S, v25.4S,v3.s[2] +add v4.4s, v4.4s, v19.4s +str q4, [x0, #32] +ldr q4, [x17, #+272] +mla v11.4S, v9.4S, v31.s[0] +mla v25.4S, v23.4S, v31.s[0] +sub v23.4s, v29.4s, v17.4s +str q23, [x0, #80] +mul v14.4S, v14.4S,v28.s[1] +mul v22.4S, v22.4S,v28.s[2] +add v29.4s, v29.4s, v17.4s +str q29, [x0, #64] +mla v14.4S, v15.4S, v31.s[0] +mla v22.4S, v7.4S, v31.s[0] +sub v7.4s, v5.4s, v20.4s +str q7, [x0, #112] +sqrdmulh v30.4S, v10.4S, v4.s[0] +mul v10.4S, v10.4S,v6.s[0] +add v5.4s, v5.4s, v20.4s +ldr q20, [x0, #304] +str q5, [x0, #96] +ldr q5, [x17, #+288] +sqrdmulh v7.4S, v20.4S, v4.s[0] +mul v20.4S, v20.4S,v6.s[0] +sub v28.4s, v27.4s, v11.4s +ldr q15, [x17, #+304] +str q28, [x0, #144] +ldr q28, [x0, #352] +sqrdmulh v29.4S, v28.4S, v15.s[0] +mul v28.4S, v28.4S,v5.s[0] +add v27.4s, v27.4s, v11.4s +str q27, [x0, #128] +ldr q27, [x0, #368] +sqrdmulh v11.4S, v27.4S, v15.s[0] +mul v27.4S, v27.4S,v5.s[0] +sub v17.4s, v16.4s, v25.4s +ldr q2, [x17, #+320] +str q17, [x0, #176] +ldr q17, [x17, #+336] +mla v10.4S, v30.4S, v31.s[0] +sqrdmulh v30.4S, v1.4S, v17.s[0] +add v16.4s, v16.4s, v25.4s +ldr q25, [x0, #432] +str q16, 
[x0, #160] +mla v20.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v25.4S, v17.s[0] +sub v16.4s, v13.4s, v14.4s +ldr q23, [x17, #+352] +str q16, [x0, #208] +ldr q16, [x17, #+368] +mla v28.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v24.4S, v16.s[0] +add v13.4s, v13.4s, v14.4s +str q13, [x0, #192] +ldr q13, [x0, #496] +mla v27.4S, v11.4S, v31.s[0] +sqrdmulh v11.4S, v13.4S, v16.s[0] +sub v14.4s, v26.4s, v22.4s +ldr q3, [x0, #256] +str q14, [x0, #240] +ldr q14, [x0, #384] +mul v1.4S, v1.4S,v2.s[0] +mul v25.4S, v25.4S,v2.s[0] +add v26.4s, v26.4s, v22.4s +ldr q22, [x0, #272] +str q26, [x0, #224] +ldr q26, [x0, #400] +mla v1.4S, v30.4S, v31.s[0] +mla v25.4S, v7.4S, v31.s[0] +sub v7.4s, v3.4s, v10.4s +ldr q30, [x0, #320] +add v3.4s, v3.4s, v10.4s +ldr q10, [x0, #448] +mul v24.4S, v24.4S,v23.s[0] +mul v13.4S, v13.4S,v23.s[0] +sub v9.4s, v22.4s, v20.4s +ldr q19, [x0, #336] +add v22.4s, v22.4s, v20.4s +ldr q20, [x0, #464] +mla v24.4S, v29.4S, v31.s[0] +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v30.4s, v28.4s +add v30.4s, v30.4s, v28.4s +sqrdmulh v28.4S, v22.4S, v4.s[1] +mul v22.4S, v22.4S,v6.s[1] +sub v29.4s, v19.4s, v27.4s +add v19.4s, v19.4s, v27.4s +sqrdmulh v27.4S, v9.4S, v4.s[2] +mul v9.4S, v9.4S,v6.s[2] +sub v18.4s, v14.4s, v1.4s +add v14.4s, v14.4s, v1.4s +sqrdmulh v4.4S, v19.4S, v15.s[1] +mul v19.4S, v19.4S,v5.s[1] +sub v1.4s, v26.4s, v25.4s +add v26.4s, v26.4s, v25.4s +sqrdmulh v25.4S, v29.4S, v15.s[2] +mul v29.4S, v29.4S,v5.s[2] +sub v6.4s, v10.4s, v24.4s +add v10.4s, v10.4s, v24.4s +mla v22.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v26.4S, v17.s[1] +sub v15.4s, v20.4s, v13.4s +ldr q24, [x0, #736] +add v20.4s, v20.4s, v13.4s +mla v9.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v1.4S, v17.s[2] +sub v13.4s, v3.4s, v22.4s +ldr q5, [x0, #672] +str q13, [x0, #272] +mla v19.4S, v4.4S, v31.s[0] +sqrdmulh v4.4S, v20.4S, v16.s[1] +add v3.4s, v3.4s, v22.4s +ldr q22, [x0, #544] +str q3, [x0, #256] +mla v29.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v15.4S, v16.s[2] +sub v3.4s, v7.4s, v9.4s +ldr q13, 
[x17, #+384] +str q3, [x0, #304] +mul v26.4S, v26.4S,v2.s[1] +mul v1.4S, v1.4S,v2.s[2] +add v7.4s, v7.4s, v9.4s +str q7, [x0, #288] +ldr q7, [x17, #+400] +mla v26.4S, v28.4S, v31.s[0] +mla v1.4S, v27.4S, v31.s[0] +sub v27.4s, v30.4s, v19.4s +str q27, [x0, #336] +mul v20.4S, v20.4S,v23.s[1] +mul v15.4S, v15.4S,v23.s[2] +add v30.4s, v30.4s, v19.4s +str q30, [x0, #320] +mla v20.4S, v4.4S, v31.s[0] +mla v15.4S, v25.4S, v31.s[0] +sub v25.4s, v11.4s, v29.4s +str q25, [x0, #368] +sqrdmulh v16.4S, v22.4S, v7.s[0] +mul v22.4S, v22.4S,v13.s[0] +add v11.4s, v11.4s, v29.4s +ldr q29, [x0, #560] +str q11, [x0, #352] +ldr q11, [x17, #+416] +sqrdmulh v25.4S, v29.4S, v7.s[0] +mul v29.4S, v29.4S,v13.s[0] +sub v23.4s, v14.4s, v26.4s +ldr q4, [x17, #+432] +str q23, [x0, #400] +ldr q23, [x0, #608] +sqrdmulh v30.4S, v23.4S, v4.s[0] +mul v23.4S, v23.4S,v11.s[0] +add v14.4s, v14.4s, v26.4s +str q14, [x0, #384] +ldr q14, [x0, #624] +sqrdmulh v26.4S, v14.4S, v4.s[0] +mul v14.4S, v14.4S,v11.s[0] +sub v19.4s, v18.4s, v1.4s +ldr q17, [x17, #+448] +str q19, [x0, #432] +ldr q19, [x17, #+464] +mla v22.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v5.4S, v19.s[0] +add v18.4s, v18.4s, v1.4s +ldr q1, [x0, #688] +str q18, [x0, #416] +mla v29.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v1.4S, v19.s[0] +sub v18.4s, v10.4s, v20.4s +ldr q27, [x17, #+480] +str q18, [x0, #464] +ldr q18, [x17, #+496] +mla v23.4S, v30.4S, v31.s[0] +sqrdmulh v30.4S, v24.4S, v18.s[0] +add v10.4s, v10.4s, v20.4s +str q10, [x0, #448] +ldr q10, [x0, #752] +mla v14.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v10.4S, v18.s[0] +sub v20.4s, v6.4s, v15.4s +ldr q2, [x0, #512] +str q20, [x0, #496] +ldr q20, [x0, #640] +mul v5.4S, v5.4S,v17.s[0] +mul v1.4S, v1.4S,v17.s[0] +add v6.4s, v6.4s, v15.4s +ldr q15, [x0, #528] +str q6, [x0, #480] +ldr q6, [x0, #656] +mla v5.4S, v16.4S, v31.s[0] +mla v1.4S, v25.4S, v31.s[0] +sub v25.4s, v2.4s, v22.4s +ldr q16, [x0, #576] +add v2.4s, v2.4s, v22.4s +ldr q22, [x0, #704] +mul v24.4S, v24.4S,v27.s[0] +mul v10.4S, 
v10.4S,v27.s[0] +sub v28.4s, v15.4s, v29.4s +ldr q9, [x0, #592] +add v15.4s, v15.4s, v29.4s +ldr q29, [x0, #720] +mla v24.4S, v30.4S, v31.s[0] +mla v10.4S, v26.4S, v31.s[0] +sub v26.4s, v16.4s, v23.4s +add v16.4s, v16.4s, v23.4s +sqrdmulh v23.4S, v15.4S, v7.s[1] +mul v15.4S, v15.4S,v13.s[1] +sub v30.4s, v9.4s, v14.4s +add v9.4s, v9.4s, v14.4s +sqrdmulh v14.4S, v28.4S, v7.s[2] +mul v28.4S, v28.4S,v13.s[2] +sub v3.4s, v20.4s, v5.4s +add v20.4s, v20.4s, v5.4s +sqrdmulh v7.4S, v9.4S, v4.s[1] +mul v9.4S, v9.4S,v11.s[1] +sub v5.4s, v6.4s, v1.4s +add v6.4s, v6.4s, v1.4s +sqrdmulh v1.4S, v30.4S, v4.s[2] +mul v30.4S, v30.4S,v11.s[2] +sub v13.4s, v22.4s, v24.4s +add v22.4s, v22.4s, v24.4s +mla v15.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v6.4S, v19.s[1] +sub v4.4s, v29.4s, v10.4s +ldr q24, [x0, #992] +add v29.4s, v29.4s, v10.4s +mla v28.4S, v14.4S, v31.s[0] +sqrdmulh v14.4S, v5.4S, v19.s[2] +sub v10.4s, v2.4s, v15.4s +ldr q11, [x0, #928] +str q10, [x0, #528] +mla v9.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v29.4S, v18.s[1] +add v2.4s, v2.4s, v15.4s +ldr q15, [x0, #800] +str q2, [x0, #512] +mla v30.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v4.4S, v18.s[2] +sub v2.4s, v25.4s, v28.4s +ldr q10, [x17, #+512] +str q2, [x0, #560] +mul v6.4S, v6.4S,v17.s[1] +mul v5.4S, v5.4S,v17.s[2] +add v25.4s, v25.4s, v28.4s +str q25, [x0, #544] +ldr q25, [x17, #+528] +mla v6.4S, v23.4S, v31.s[0] +mla v5.4S, v14.4S, v31.s[0] +sub v14.4s, v16.4s, v9.4s +str q14, [x0, #592] +mul v29.4S, v29.4S,v27.s[1] +mul v4.4S, v4.4S,v27.s[2] +add v16.4s, v16.4s, v9.4s +str q16, [x0, #576] +mla v29.4S, v7.4S, v31.s[0] +mla v4.4S, v1.4S, v31.s[0] +sub v1.4s, v26.4s, v30.4s +str q1, [x0, #624] +sqrdmulh v18.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v10.s[0] +add v26.4s, v26.4s, v30.4s +ldr q30, [x0, #816] +str q26, [x0, #608] +ldr q26, [x17, #+544] +sqrdmulh v1.4S, v30.4S, v25.s[0] +mul v30.4S, v30.4S,v10.s[0] +sub v27.4s, v20.4s, v6.4s +ldr q7, [x17, #+560] +str q27, [x0, #656] +ldr q27, [x0, #864] +sqrdmulh v16.4S, v27.4S, 
v7.s[0] +mul v27.4S, v27.4S,v26.s[0] +add v20.4s, v20.4s, v6.4s +str q20, [x0, #640] +ldr q20, [x0, #880] +sqrdmulh v6.4S, v20.4S, v7.s[0] +mul v20.4S, v20.4S,v26.s[0] +sub v9.4s, v3.4s, v5.4s +ldr q19, [x17, #+576] +str q9, [x0, #688] +ldr q9, [x17, #+592] +mla v15.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v11.4S, v9.s[0] +add v3.4s, v3.4s, v5.4s +ldr q5, [x0, #944] +str q3, [x0, #672] +mla v30.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v5.4S, v9.s[0] +sub v3.4s, v22.4s, v29.4s +ldr q14, [x17, #+608] +str q3, [x0, #720] +ldr q3, [x17, #+624] +mla v27.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v24.4S, v3.s[0] +add v22.4s, v22.4s, v29.4s +str q22, [x0, #704] +ldr q22, [x0, #1008] +mla v20.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v22.4S, v3.s[0] +sub v29.4s, v13.4s, v4.4s +ldr q17, [x0, #768] +str q29, [x0, #752] +ldr q29, [x0, #896] +mul v11.4S, v11.4S,v19.s[0] +mul v5.4S, v5.4S,v19.s[0] +add v13.4s, v13.4s, v4.4s +ldr q4, [x0, #784] +str q13, [x0, #736] +ldr q13, [x0, #912] +mla v11.4S, v18.4S, v31.s[0] +mla v5.4S, v1.4S, v31.s[0] +sub v1.4s, v17.4s, v15.4s +ldr q18, [x0, #832] +add v17.4s, v17.4s, v15.4s +ldr q15, [x0, #960] +mul v24.4S, v24.4S,v14.s[0] +mul v22.4S, v22.4S,v14.s[0] +sub v23.4s, v4.4s, v30.4s +ldr q28, [x0, #848] +add v4.4s, v4.4s, v30.4s +ldr q30, [x0, #976] +mla v24.4S, v16.4S, v31.s[0] +mla v22.4S, v6.4S, v31.s[0] +sub v6.4s, v18.4s, v27.4s +add v18.4s, v18.4s, v27.4s +sqrdmulh v27.4S, v4.4S, v25.s[1] +mul v4.4S, v4.4S,v10.s[1] +sub v16.4s, v28.4s, v20.4s +add v28.4s, v28.4s, v20.4s +sqrdmulh v20.4S, v23.4S, v25.s[2] +mul v23.4S, v23.4S,v10.s[2] +sub v2.4s, v29.4s, v11.4s +add v29.4s, v29.4s, v11.4s +sqrdmulh v25.4S, v28.4S, v7.s[1] +mul v28.4S, v28.4S,v26.s[1] +sub v11.4s, v13.4s, v5.4s +add v13.4s, v13.4s, v5.4s +sqrdmulh v5.4S, v16.4S, v7.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v10.4s, v15.4s, v24.4s +add v15.4s, v15.4s, v24.4s +mla v4.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v13.4S, v9.s[1] +sub v7.4s, v30.4s, v22.4s +add v30.4s, v30.4s, v22.4s +mla v23.4S, 
v20.4S, v31.s[0] +sqrdmulh v20.4S, v11.4S, v9.s[2] +sub v22.4s, v17.4s, v4.4s +str q22, [x0, #784] +mla v28.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v30.4S, v3.s[1] +add v17.4s, v17.4s, v4.4s +str q17, [x0, #768] +mla v16.4S, v5.4S, v31.s[0] +sqrdmulh v5.4S, v7.4S, v3.s[2] +sub v17.4s, v1.4s, v23.4s +str q17, [x0, #816] +mul v13.4S, v13.4S,v19.s[1] +mul v11.4S, v11.4S,v19.s[2] +add v1.4s, v1.4s, v23.4s +str q1, [x0, #800] +mla v13.4S, v27.4S, v31.s[0] +mla v11.4S, v20.4S, v31.s[0] +sub v20.4s, v18.4s, v28.4s +str q20, [x0, #848] +mul v30.4S, v30.4S,v14.s[1] +mul v7.4S, v7.4S,v14.s[2] +add v18.4s, v18.4s, v28.4s +str q18, [x0, #832] +mla v30.4S, v25.4S, v31.s[0] +mla v7.4S, v5.4S, v31.s[0] +sub v5.4s, v6.4s, v16.4s +str q5, [x0, #880] +add v6.4s, v6.4s, v16.4s +str q6, [x0, #864] +sub v6.4s, v29.4s, v13.4s +str q6, [x0, #912] +add v29.4s, v29.4s, v13.4s +str q29, [x0, #896] +sub v29.4s, v2.4s, v11.4s +str q29, [x0, #944] +add v2.4s, v2.4s, v11.4s +str q2, [x0, #928] +sub v2.4s, v15.4s, v30.4s +str q2, [x0, #976] +add v15.4s, v15.4s, v30.4s +str q15, [x0, #960] +sub v15.4s, v10.4s, v7.4s +str q15, [x0, #1008] +add v10.4s, v10.4s, v7.4s +str q10, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1520 +// Instruction count: 1516 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_11.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_11.s new file mode 100644 index 0000000..ba9add4 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_11.s @@ -0,0 +1,1550 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// 
SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// +#include // NOTE(review): no header operand is visible after #include here (possibly lost in extraction); confirm against upstream. ASM_LOAD, used by the routine below, is not defined in the visible part of this file. +modulus: // Loaded as one 16-byte vector into v31 by the routine below; lane 0 holds the negated constant -33556993, which the mla steps consume as v31.s[0]. The other three lanes are zero and are not referenced in the visible code. +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: // Constants table read in 16-byte quads relative to x17. In the visible code a quad of plain values feeds the mul steps (e.g. v30 from offset 0) and the quad that follows it feeds the matching sqrdmulh steps (e.g. v29 from offset 16); the second quad looks like value * 2^31 / 33556993, rounded (TODO confirm against the generator). Words tagged "Layer None, block None" are zeros that fill a quad up to four words. +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global 
ntt_u32_incomplete_neon_asm_var_4_2_22_z4_11 +.global _ntt_u32_incomplete_neon_asm_var_4_2_22_z4_11 +ntt_u32_incomplete_neon_asm_var_4_2_22_z4_11: +_ntt_u32_incomplete_neon_asm_var_4_2_22_z4_11: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x0, #992] +sqrdmulh v27.4S, v28.4S, v29.s[0] +mul v28.4S, v28.4S,v30.s[0] +ldr q26, [x0, #928] +sqrdmulh v25.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +ldr q24, [x0, #864] +sqrdmulh v23.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +ldr q22, [x0, #800] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #736] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mla v28.4S, v27.4S, v31.s[0] +ldr q27, [x0, #672] +sqrdmulh v18.4S, v27.4S, v29.s[0] +mla v26.4S, v25.4S, v31.s[0] +ldr q25, [x0, #608] +sqrdmulh v17.4S, v25.4S, v29.s[0] +mla v24.4S, v23.4S, v31.s[0] +ldr q23, [x0, #544] +sqrdmulh v16.4S, v23.4S, v29.s[0] +mla v22.4S, v21.4S, v31.s[0] +ldr q21, [x0, #480] +mul v27.4S, v27.4S,v30.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q3, [x0, #416] +ldr q2, [x0, #352] +ldr q1, [x0, #288] +mla v27.4S, v18.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +ldr q19, [x0, #224] +ldr q18, [x0, #160] +mul v23.4S, v23.4S,v30.s[0] +mul v25.4S, v25.4S,v30.s[0] +ldr q0, [x0, #96] +ldr q15, [x0, #32] +mla v23.4S, v16.4S, v31.s[0] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +sqrdmulh v28.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v16.4s, v3.4s, v26.4s +add v3.4s, v3.4s, v26.4s +sqrdmulh v26.4S, v16.4S, v29.s[2] +mul v16.4S, 
v16.4S,v30.s[2] +sub v14.4s, v2.4s, v24.4s +add v2.4s, v2.4s, v24.4s +sqrdmulh v24.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v13.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v12.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +sqrdmulh v20.4S, v14.4S, v29.s[2] +mla v17.4S, v28.4S, v31.s[0] +sub v28.4s, v18.4s, v27.4s +add v18.4s, v18.4s, v27.4s +sqrdmulh v27.4S, v13.4S, v29.s[2] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v0.4s, v25.4s +add v0.4s, v0.4s, v25.4s +sqrdmulh v25.4S, v2.4S, v29.s[1] +mla v21.4S, v24.4S, v31.s[0] +sub v24.4s, v15.4s, v23.4s +sqrdmulh v11.4S, v1.4S, v29.s[1] +mla v3.4S, v22.4S, v31.s[0] +add v15.4s, v15.4s, v23.4s +ldr q23, [x17, #+32] +ldr q22, [x17, #+48] +mul v13.4S, v13.4S,v30.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v10.4s, v12.4s, v17.4s +add v12.4s, v12.4s, v17.4s +mla v13.4S, v27.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +sub v20.4s, v28.4s, v16.4s +add v28.4s, v28.4s, v16.4s +mul v1.4S, v1.4S,v30.s[1] +mul v2.4S, v2.4S,v30.s[1] +sub v16.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +mla v1.4S, v11.4S, v31.s[0] +mla v2.4S, v25.4S, v31.s[0] +sub v25.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v22.s[3] +mul v10.4S, v10.4S,v23.s[3] +sub v11.4s, v26.4s, v14.4s +add v26.4s, v26.4s, v14.4s +sqrdmulh v14.4S, v12.4S, v22.s[2] +mul v12.4S, v12.4S,v23.s[2] +sub v21.4s, v24.4s, v13.4s +add v24.4s, v24.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v22.s[1] +mul v16.4S, v16.4S,v23.s[1] +sub v27.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v17.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s +ldr q1, [x17, #+96] +ldr q9, [x17, #+112] +sqrdmulh v8.4S, v20.4S, v22.s[3] +mla v10.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v28.4S, v22.s[2] +mla v12.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v25.4S, v22.s[1] +mla v16.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v18.4S, v22.s[0] +mla 
v19.4S, v2.4S, v31.s[0] +nop +nop +ldr q2, [x17, #+64] +ldr q7, [x17, #+80] +mul v28.4S, v28.4S,v23.s[2] +mul v20.4S, v20.4S,v23.s[3] +sub v6.4s, v11.4s, v10.4s +add v11.4s, v11.4s, v10.4s +mla v28.4S, v3.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v26.4s, v12.4s +add v26.4s, v26.4s, v12.4s +mul v18.4S, v18.4S,v23.s[0] +mul v25.4S, v25.4S,v23.s[1] +sub v12.4s, v27.4s, v16.4s +add v27.4s, v27.4s, v16.4s +mla v18.4S, v13.4S, v31.s[0] +mla v25.4S, v14.4S, v31.s[0] +sub v14.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v9.s[3] +mul v6.4S, v6.4S,v1.s[3] +sub v13.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v11.4S, v9.s[2] +mul v11.4S, v11.4S,v1.s[2] +sub v16.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +sqrdmulh v28.4S, v8.4S, v9.s[1] +mul v8.4S, v8.4S,v1.s[1] +sub v3.4s, v17.4s, v25.4s +add v17.4s, v17.4s, v25.4s +sqrdmulh v25.4S, v26.4S, v9.s[0] +mul v26.4S, v26.4S,v1.s[0] +sub v10.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v7.s[3] +mla v6.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v27.4S, v7.s[2] +mla v11.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v14.4S, v7.s[1] +mla v8.4S, v28.4S, v31.s[0] +nop +nop +sqrdmulh v28.4S, v0.4S, v7.s[0] +mla v26.4S, v25.4S, v31.s[0] +nop +nop +mul v27.4S, v27.4S,v2.s[2] +mul v12.4S, v12.4S,v2.s[3] +sub v25.4s, v13.4s, v6.4s +str q25, [x0, #992] +mla v27.4S, v19.4S, v31.s[0] +mla v12.4S, v18.4S, v31.s[0] +add v13.4s, v13.4s, v6.4s +str q13, [x0, #928] +mul v0.4S, v0.4S,v2.s[0] +mul v14.4S, v14.4S,v2.s[1] +sub v13.4s, v21.4s, v11.4s +str q13, [x0, #864] +mla v0.4S, v28.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sub v11.4s, v16.4s, v8.4s +ldr q20, [x0, #1008] +sqrdmulh v28.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v8.4s +str q21, [x0, #800] +ldr q21, [x0, #944] +sqrdmulh v8.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v13.4s, v24.4s, v26.4s +str q11, [x0, #736] +ldr q11, [x0, 
#880] +sqrdmulh v6.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +add v24.4s, v24.4s, v26.4s +str q16, [x0, #672] +ldr q16, [x0, #816] +sqrdmulh v26.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v18.4s, v3.4s, v12.4s +str q13, [x0, #608] +ldr q13, [x0, #752] +sqrdmulh v19.4S, v13.4S, v29.s[0] +mla v20.4S, v28.4S, v31.s[0] +add v3.4s, v3.4s, v12.4s +str q24, [x0, #544] +ldr q24, [x0, #688] +sqrdmulh v12.4S, v24.4S, v29.s[0] +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v17.4s, v27.4s +str q18, [x0, #480] +ldr q18, [x0, #624] +sqrdmulh v28.4S, v18.4S, v29.s[0] +mla v11.4S, v6.4S, v31.s[0] +add v17.4s, v17.4s, v27.4s +str q3, [x0, #416] +ldr q3, [x0, #560] +sqrdmulh v27.4S, v3.4S, v29.s[0] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v10.4s, v14.4s +str q8, [x0, #352] +ldr q8, [x0, #496] +add v10.4s, v10.4s, v14.4s +mul v24.4S, v24.4S,v30.s[0] +mul v13.4S, v13.4S,v30.s[0] +ldr q14, [x0, #432] +str q17, [x0, #288] +ldr q17, [x0, #368] +ldr q6, [x0, #304] +mla v24.4S, v12.4S, v31.s[0] +mla v13.4S, v19.4S, v31.s[0] +str q26, [x0, #224] +sub v26.4s, v15.4s, v0.4s +ldr q19, [x0, #240] +ldr q12, [x0, #176] +mul v3.4S, v3.4S,v30.s[0] +mul v18.4S, v18.4S,v30.s[0] +str q10, [x0, #160] +add v15.4s, v15.4s, v0.4s +ldr q0, [x0, #112] +ldr q10, [x0, #48] +mla v3.4S, v27.4S, v31.s[0] +mla v18.4S, v28.4S, v31.s[0] +sub v28.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +sqrdmulh v20.4S, v28.4S, v29.s[2] +mul v28.4S, v28.4S,v30.s[2] +sub v27.4s, v14.4s, v21.4s +add v14.4s, v14.4s, v21.4s +sqrdmulh v21.4S, v27.4S, v29.s[2] +mul v27.4S, v27.4S,v30.s[2] +sub v25.4s, v17.4s, v11.4s +add v17.4s, v17.4s, v11.4s +sqrdmulh v11.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v5.4s, v6.4s, v16.4s +add v6.4s, v6.4s, v16.4s +sqrdmulh v16.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +sub v4.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +sqrdmulh v13.4S, v25.4S, v29.s[2] +mla v28.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v24.4s +add v12.4s, v12.4s, v24.4s +sqrdmulh v24.4S, v5.4S, 
v29.s[2] +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v18.4s +add v0.4s, v0.4s, v18.4s +sqrdmulh v18.4S, v17.4S, v29.s[1] +mla v8.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v3.4s +str q26, [x0, #96] +sqrdmulh v26.4S, v6.4S, v29.s[1] +mla v14.4S, v16.4S, v31.s[0] +add v10.4s, v10.4s, v3.4s +str q15, [x0, #32] +mul v5.4S, v5.4S,v30.s[2] +mul v25.4S, v25.4S,v30.s[2] +sub v15.4s, v4.4s, v28.4s +add v4.4s, v4.4s, v28.4s +mla v5.4S, v24.4S, v31.s[0] +mla v25.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v27.4s +add v20.4s, v20.4s, v27.4s +mul v6.4S, v6.4S,v30.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v27.4s, v19.4s, v8.4s +add v19.4s, v19.4s, v8.4s +mla v6.4S, v26.4S, v31.s[0] +mla v17.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v14.4s +add v12.4s, v12.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v22.s[3] +mul v15.4S, v15.4S,v23.s[3] +sub v26.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v4.4S, v22.s[2] +mul v4.4S, v4.4S,v23.s[2] +sub v8.4s, v11.4s, v5.4s +add v11.4s, v11.4s, v5.4s +sqrdmulh v5.4S, v27.4S, v22.s[1] +mul v27.4S, v27.4S,v23.s[1] +sub v24.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v28.4s, v10.4s, v6.4s +add v10.4s, v10.4s, v6.4s +sqrdmulh v6.4S, v13.4S, v22.s[3] +mla v15.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v20.4S, v22.s[2] +mla v4.4S, v25.4S, v31.s[0] +nop +nop +sqrdmulh v25.4S, v18.4S, v22.s[1] +mla v27.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v12.4S, v22.s[0] +mla v19.4S, v17.4S, v31.s[0] +nop +nop +mul v20.4S, v20.4S,v23.s[2] +mul v13.4S, v13.4S,v23.s[3] +sub v17.4s, v26.4s, v15.4s +add v26.4s, v26.4s, v15.4s +mla v20.4S, v14.4S, v31.s[0] +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v21.4s, v4.4s +add v21.4s, v21.4s, v4.4s +mul v12.4S, v12.4S,v23.s[0] +mul v18.4S, v18.4S,v23.s[1] +sub v4.4s, v24.4s, v27.4s +add v24.4s, v24.4s, v27.4s +mla v12.4S, v5.4S, v31.s[0] +mla v18.4S, v25.4S, v31.s[0] +sub v25.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, 
v17.4S, v9.s[3] +mul v17.4S, v17.4S,v1.s[3] +sub v5.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +sqrdmulh v13.4S, v26.4S, v9.s[2] +mul v26.4S, v26.4S,v1.s[2] +sub v27.4s, v11.4s, v20.4s +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v6.4S, v9.s[1] +mul v6.4S, v6.4S,v1.s[1] +sub v14.4s, v28.4s, v18.4s +add v28.4s, v28.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v9.s[0] +mul v21.4S, v21.4S,v1.s[0] +sub v15.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v4.4S, v7.s[3] +mla v17.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v24.4S, v7.s[2] +mla v26.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v25.4S, v7.s[1] +mla v6.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v0.4S, v7.s[0] +mla v21.4S, v18.4S, v31.s[0] +nop +nop +mul v24.4S, v24.4S,v2.s[2] +mul v4.4S, v4.4S,v2.s[3] +sub v18.4s, v5.4s, v17.4s +str q18, [x0, #1008] +mla v24.4S, v19.4S, v31.s[0] +mla v4.4S, v12.4S, v31.s[0] +add v5.4s, v5.4s, v17.4s +str q5, [x0, #944] +mul v0.4S, v0.4S,v2.s[0] +mul v25.4S, v25.4S,v2.s[1] +sub v5.4s, v8.4s, v26.4s +str q5, [x0, #880] +mla v0.4S, v20.4S, v31.s[0] +mla v25.4S, v13.4S, v31.s[0] +add v8.4s, v8.4s, v26.4s +sub v26.4s, v27.4s, v6.4s +ldr q13, [x0, #960] +sqrdmulh v20.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +add v27.4s, v27.4s, v6.4s +str q8, [x0, #816] +ldr q8, [x0, #896] +sqrdmulh v6.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v5.4s, v11.4s, v21.4s +str q26, [x0, #752] +ldr q26, [x0, #832] +sqrdmulh v17.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +add v11.4s, v11.4s, v21.4s +str q27, [x0, #688] +ldr q27, [x0, #768] +sqrdmulh v21.4S, v27.4S, v29.s[0] +mul v27.4S, v27.4S,v30.s[0] +sub v12.4s, v14.4s, v4.4s +str q5, [x0, #624] +ldr q5, [x0, #704] +sqrdmulh v19.4S, v5.4S, v29.s[0] +mla v13.4S, v20.4S, v31.s[0] +add v14.4s, v14.4s, v4.4s +str q11, [x0, #560] +ldr q11, [x0, #640] +sqrdmulh v4.4S, v11.4S, v29.s[0] +mla v8.4S, v6.4S, v31.s[0] +sub v6.4s, v28.4s, v24.4s +str q12, [x0, #496] +ldr q12, [x0, #576] +sqrdmulh v20.4S, v12.4S, 
v29.s[0] +mla v26.4S, v17.4S, v31.s[0] +add v28.4s, v28.4s, v24.4s +str q14, [x0, #432] +ldr q14, [x0, #512] +sqrdmulh v24.4S, v14.4S, v29.s[0] +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v25.4s +str q6, [x0, #368] +ldr q6, [x0, #448] +add v15.4s, v15.4s, v25.4s +mul v11.4S, v11.4S,v30.s[0] +mul v5.4S, v5.4S,v30.s[0] +ldr q25, [x0, #384] +str q28, [x0, #304] +ldr q28, [x0, #320] +ldr q17, [x0, #256] +mla v11.4S, v4.4S, v31.s[0] +mla v5.4S, v19.4S, v31.s[0] +str q21, [x0, #240] +sub v21.4s, v10.4s, v0.4s +ldr q19, [x0, #192] +ldr q4, [x0, #128] +mul v14.4S, v14.4S,v30.s[0] +mul v12.4S, v12.4S,v30.s[0] +str q15, [x0, #176] +add v10.4s, v10.4s, v0.4s +ldr q0, [x0, #64] +ldr q15, [x0, #0] +mla v14.4S, v24.4S, v31.s[0] +mla v12.4S, v20.4S, v31.s[0] +sub v20.4s, v6.4s, v13.4s +add v6.4s, v6.4s, v13.4s +sqrdmulh v13.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v24.4s, v25.4s, v8.4s +add v25.4s, v25.4s, v8.4s +sqrdmulh v8.4S, v24.4S, v29.s[2] +mul v24.4S, v24.4S,v30.s[2] +sub v18.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sqrdmulh v26.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +sub v3.4s, v17.4s, v27.4s +add v17.4s, v17.4s, v27.4s +sqrdmulh v27.4S, v25.4S, v29.s[1] +mul v25.4S, v25.4S,v30.s[1] +sub v16.4s, v19.4s, v5.4s +add v19.4s, v19.4s, v5.4s +sqrdmulh v5.4S, v18.4S, v29.s[2] +mla v20.4S, v13.4S, v31.s[0] +sub v13.4s, v4.4s, v11.4s +add v4.4s, v4.4s, v11.4s +sqrdmulh v11.4S, v3.4S, v29.s[2] +mla v24.4S, v8.4S, v31.s[0] +sub v8.4s, v0.4s, v12.4s +add v0.4s, v0.4s, v12.4s +sqrdmulh v12.4S, v28.4S, v29.s[1] +mla v6.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v14.4s +str q21, [x0, #112] +sqrdmulh v21.4S, v17.4S, v29.s[1] +mla v25.4S, v27.4S, v31.s[0] +add v15.4s, v15.4s, v14.4s +str q10, [x0, #48] +mul v3.4S, v3.4S,v30.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v10.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +mla v3.4S, v11.4S, v31.s[0] +mla v18.4S, v5.4S, v31.s[0] +sub v5.4s, v13.4s, v24.4s +add v13.4s, v13.4s, v24.4s +mul v17.4S, v17.4S,v30.s[1] 
+mul v28.4S, v28.4S,v30.s[1] +sub v24.4s, v19.4s, v6.4s +add v19.4s, v19.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v28.4S, v12.4S, v31.s[0] +sub v12.4s, v4.4s, v25.4s +add v4.4s, v4.4s, v25.4s +sqrdmulh v25.4S, v10.4S, v22.s[3] +mul v10.4S, v10.4S,v23.s[3] +sub v21.4s, v8.4s, v18.4s +add v8.4s, v8.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v22.s[2] +mul v16.4S, v16.4S,v23.s[2] +sub v6.4s, v26.4s, v3.4s +add v26.4s, v26.4s, v3.4s +sqrdmulh v3.4S, v24.4S, v22.s[1] +mul v24.4S, v24.4S,v23.s[1] +sub v11.4s, v0.4s, v28.4s +add v0.4s, v0.4s, v28.4s +sqrdmulh v28.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v20.4s, v15.4s, v17.4s +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v22.s[3] +mla v10.4S, v25.4S, v31.s[0] +nop +nop +sqrdmulh v25.4S, v13.4S, v22.s[2] +mla v16.4S, v18.4S, v31.s[0] +nop +nop +sqrdmulh v18.4S, v12.4S, v22.s[1] +mla v24.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v4.4S, v22.s[0] +mla v19.4S, v28.4S, v31.s[0] +nop +nop +mul v13.4S, v13.4S,v23.s[2] +mul v5.4S, v5.4S,v23.s[3] +sub v28.4s, v21.4s, v10.4s +add v21.4s, v21.4s, v10.4s +mla v13.4S, v25.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +sub v17.4s, v8.4s, v16.4s +add v8.4s, v8.4s, v16.4s +mul v4.4S, v4.4S,v23.s[0] +mul v12.4S, v12.4S,v23.s[1] +sub v16.4s, v11.4s, v24.4s +add v11.4s, v11.4s, v24.4s +mla v4.4S, v3.4S, v31.s[0] +mla v12.4S, v18.4S, v31.s[0] +sub v18.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v28.4S, v9.s[3] +mul v28.4S, v28.4S,v1.s[3] +sub v3.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v21.4S, v9.s[2] +mul v21.4S, v21.4S,v1.s[2] +sub v24.4s, v26.4s, v13.4s +add v26.4s, v26.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v9.s[1] +mul v17.4S, v17.4S,v1.s[1] +sub v25.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +sqrdmulh v12.4S, v8.4S, v9.s[0] +mul v8.4S, v8.4S,v1.s[0] +sub v10.4s, v15.4s, v4.4s +add v15.4s, v15.4s, v4.4s +sqrdmulh v4.4S, v16.4S, v7.s[3] +mla v28.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v11.4S, v7.s[2] +mla v21.4S, v5.4S, 
v31.s[0] +nop +nop +sqrdmulh v5.4S, v18.4S, v7.s[1] +mla v17.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v0.4S, v7.s[0] +mla v8.4S, v12.4S, v31.s[0] +nop +nop +mul v11.4S, v11.4S,v2.s[2] +mul v16.4S, v16.4S,v2.s[3] +sub v12.4s, v3.4s, v28.4s +str q12, [x0, #960] +mla v11.4S, v19.4S, v31.s[0] +mla v16.4S, v4.4S, v31.s[0] +add v3.4s, v3.4s, v28.4s +str q3, [x0, #896] +mul v0.4S, v0.4S,v2.s[0] +mul v18.4S, v18.4S,v2.s[1] +sub v3.4s, v6.4s, v21.4s +str q3, [x0, #832] +mla v0.4S, v13.4S, v31.s[0] +mla v18.4S, v5.4S, v31.s[0] +add v6.4s, v6.4s, v21.4s +sub v21.4s, v24.4s, v17.4s +ldr q5, [x0, #976] +sqrdmulh v13.4S, v5.4S, v29.s[0] +mul v5.4S, v5.4S,v30.s[0] +add v24.4s, v24.4s, v17.4s +str q6, [x0, #768] +ldr q6, [x0, #912] +sqrdmulh v17.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v3.4s, v26.4s, v8.4s +str q21, [x0, #704] +ldr q21, [x0, #848] +sqrdmulh v28.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +add v26.4s, v26.4s, v8.4s +str q24, [x0, #640] +ldr q24, [x0, #784] +sqrdmulh v8.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +sub v4.4s, v25.4s, v16.4s +str q3, [x0, #576] +ldr q3, [x0, #720] +sqrdmulh v19.4S, v3.4S, v29.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v25.4s, v25.4s, v16.4s +str q26, [x0, #512] +ldr q26, [x0, #656] +sqrdmulh v16.4S, v26.4S, v29.s[0] +mla v6.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v11.4s +str q4, [x0, #448] +ldr q4, [x0, #592] +sqrdmulh v13.4S, v4.4S, v29.s[0] +mla v21.4S, v28.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q25, [x0, #384] +ldr q25, [x0, #528] +sqrdmulh v11.4S, v25.4S, v29.s[0] +mla v24.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v18.4s +str q17, [x0, #320] +ldr q17, [x0, #464] +add v10.4s, v10.4s, v18.4s +mul v26.4S, v26.4S,v30.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q18, [x0, #400] +str q20, [x0, #256] +ldr q20, [x0, #336] +ldr q28, [x0, #272] +mla v26.4S, v16.4S, v31.s[0] +mla v3.4S, v19.4S, v31.s[0] +str q8, [x0, #192] +sub v8.4s, v15.4s, v0.4s +ldr q19, [x0, #208] +ldr q16, [x0, #144] +mul v25.4S, 
v25.4S,v30.s[0] +mul v4.4S, v4.4S,v30.s[0] +str q10, [x0, #128] +add v15.4s, v15.4s, v0.4s +ldr q0, [x0, #80] +ldr q10, [x0, #16] +mla v25.4S, v11.4S, v31.s[0] +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v5.4s +add v17.4s, v17.4s, v5.4s +sqrdmulh v5.4S, v13.4S, v29.s[2] +mul v13.4S, v13.4S,v30.s[2] +sub v11.4s, v18.4s, v6.4s +add v18.4s, v18.4s, v6.4s +sqrdmulh v6.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v12.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v28.4s, v24.4s +add v28.4s, v28.4s, v24.4s +sqrdmulh v24.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v27.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sqrdmulh v3.4S, v12.4S, v29.s[2] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v16.4s, v26.4s +add v16.4s, v16.4s, v26.4s +sqrdmulh v26.4S, v14.4S, v29.s[2] +mla v11.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v4.4s +add v0.4s, v0.4s, v4.4s +sqrdmulh v4.4S, v20.4S, v29.s[1] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v25.4s +str q8, [x0, #64] +sqrdmulh v8.4S, v28.4S, v29.s[1] +mla v18.4S, v24.4S, v31.s[0] +add v10.4s, v10.4s, v25.4s +str q15, [x0, #0] +mul v14.4S, v14.4S,v30.s[2] +mul v12.4S, v12.4S,v30.s[2] +sub v15.4s, v27.4s, v13.4s +add v27.4s, v27.4s, v13.4s +mla v14.4S, v26.4S, v31.s[0] +mla v12.4S, v3.4S, v31.s[0] +sub v3.4s, v5.4s, v11.4s +add v5.4s, v5.4s, v11.4s +mul v28.4S, v28.4S,v30.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v11.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +mla v28.4S, v8.4S, v31.s[0] +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v16.4s, v18.4s +add v16.4s, v16.4s, v18.4s +sqrdmulh v29.4S, v15.4S, v22.s[3] +mul v15.4S, v15.4S,v23.s[3] +sub v30.4s, v6.4s, v12.4s +add v6.4s, v6.4s, v12.4s +sqrdmulh v12.4S, v27.4S, v22.s[2] +mul v27.4S, v27.4S,v23.s[2] +sub v18.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v22.s[1] +mul v11.4S, v11.4S,v23.s[1] +sub v8.4s, v0.4s, v20.4s +add v0.4s, v0.4s, v20.4s +sqrdmulh v20.4S, 
v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v17.4s, v10.4s, v28.4s +add v10.4s, v10.4s, v28.4s +sqrdmulh v28.4S, v3.4S, v22.s[3] +mla v15.4S, v29.4S, v31.s[0] +nop +nop +sqrdmulh v29.4S, v5.4S, v22.s[2] +mla v27.4S, v12.4S, v31.s[0] +nop +nop +sqrdmulh v12.4S, v4.4S, v22.s[1] +mla v11.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v16.4S, v22.s[0] +mla v19.4S, v20.4S, v31.s[0] +nop +nop +mul v5.4S, v5.4S,v23.s[2] +mul v3.4S, v3.4S,v23.s[3] +sub v20.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +mla v5.4S, v29.4S, v31.s[0] +mla v3.4S, v28.4S, v31.s[0] +sub v28.4s, v6.4s, v27.4s +add v6.4s, v6.4s, v27.4s +mul v16.4S, v16.4S,v23.s[0] +mul v4.4S, v4.4S,v23.s[1] +sub v27.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v16.4S, v14.4S, v31.s[0] +mla v4.4S, v12.4S, v31.s[0] +sub v12.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v22.4S, v20.4S, v9.s[3] +mul v20.4S, v20.4S,v1.s[3] +sub v23.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v30.4S, v9.s[2] +mul v30.4S, v30.4S,v1.s[2] +sub v19.4s, v21.4s, v5.4s +add v21.4s, v21.4s, v5.4s +sqrdmulh v5.4S, v28.4S, v9.s[1] +mul v28.4S, v28.4S,v1.s[1] +sub v14.4s, v17.4s, v4.4s +add v17.4s, v17.4s, v4.4s +sqrdmulh v4.4S, v6.4S, v9.s[0] +mul v6.4S, v6.4S,v1.s[0] +sub v11.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sqrdmulh v9.4S, v27.4S, v7.s[3] +mla v20.4S, v22.4S, v31.s[0] +nop +nop +sqrdmulh v22.4S, v8.4S, v7.s[2] +mla v30.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v12.4S, v7.s[1] +mla v28.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v0.4S, v7.s[0] +mla v6.4S, v4.4S, v31.s[0] +nop +nop +mul v8.4S, v8.4S,v2.s[2] +mul v27.4S, v27.4S,v2.s[3] +sub v4.4s, v23.4s, v20.4s +str q4, [x0, #976] +mla v8.4S, v22.4S, v31.s[0] +mla v27.4S, v9.4S, v31.s[0] +add v23.4s, v23.4s, v20.4s +str q23, [x0, #912] +mul v0.4S, v0.4S,v2.s[0] +mul v12.4S, v12.4S,v2.s[1] +sub v23.4s, v18.4s, v30.4s +str q23, [x0, #848] +mla v0.4S, v5.4S, v31.s[0] +mla v12.4S, v3.4S, v31.s[0] +add v18.4s, v18.4s, v30.4s +sub v30.4s, 
v19.4s, v28.4s +add v19.4s, v19.4s, v28.4s +str q18, [x0, #784] +sub v18.4s, v21.4s, v6.4s +str q30, [x0, #720] +add v21.4s, v21.4s, v6.4s +str q19, [x0, #656] +sub v19.4s, v14.4s, v27.4s +str q18, [x0, #592] +add v14.4s, v14.4s, v27.4s +str q21, [x0, #528] +sub v21.4s, v17.4s, v8.4s +str q19, [x0, #464] +add v17.4s, v17.4s, v8.4s +str q14, [x0, #400] +sub v14.4s, v11.4s, v12.4s +str q21, [x0, #336] +add v11.4s, v11.4s, v12.4s +str q17, [x0, #272] +sub v17.4s, v10.4s, v0.4s +add v10.4s, v10.4s, v0.4s +ldr q24, [x0, #32] +ldr q25, [x0, #48] +ldr q13, [x0, #96] +ldr q26, [x0, #112] +ldr q15, [x17, #+128] +ldr q29, [x17, #+144] +ldr q16, [x17, #+160] +ldr q1, [x17, #+176] +ldr q4, [x0, #160] +ldr q22, [x0, #176] +sqrdmulh v9.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v15.s[0] +ldr q20, [x0, #224] +sqrdmulh v23.4S, v25.4S, v29.s[0] +mul v25.4S, v25.4S,v15.s[0] +ldr q5, [x0, #240] +sqrdmulh v3.4S, v13.4S, v1.s[0] +mul v13.4S, v13.4S,v16.s[0] +ldr q2, [x17, #+192] +sqrdmulh v7.4S, v26.4S, v1.s[0] +mul v26.4S, v26.4S,v16.s[0] +ldr q28, [x17, #+208] +mla v24.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v4.4S, v28.s[0] +ldr q30, [x17, #+224] +mla v25.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v22.4S, v28.s[0] +ldr q6, [x17, #+240] +mla v13.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v20.4S, v6.s[0] +ldr q18, [x0, #0] +mla v26.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v5.4S, v6.s[0] +mul v4.4S, v4.4S,v2.s[0] +mul v22.4S, v22.4S,v2.s[0] +ldr q27, [x0, #64] +mla v4.4S, v9.4S, v31.s[0] +mla v22.4S, v23.4S, v31.s[0] +sub v23.4s, v18.4s, v24.4s +add v18.4s, v18.4s, v24.4s +mul v20.4S, v20.4S,v30.s[0] +mul v5.4S, v5.4S,v30.s[0] +sub v24.4s, v10.4s, v25.4s +ldr q9, [x0, #128] +add v10.4s, v10.4s, v25.4s +mla v20.4S, v3.4S, v31.s[0] +mla v5.4S, v7.4S, v31.s[0] +sub v7.4s, v27.4s, v13.4s +add v27.4s, v27.4s, v13.4s +sqrdmulh v13.4S, v10.4S, v29.s[1] +mul v10.4S, v10.4S,v15.s[1] +sub v3.4s, v17.4s, v26.4s +ldr q25, [x0, #192] +add v17.4s, v17.4s, v26.4s +sqrdmulh v26.4S, v24.4S, v29.s[2] +mul v24.4S, 
v24.4S,v15.s[2] +sub v19.4s, v9.4s, v4.4s +add v9.4s, v9.4s, v4.4s +sqrdmulh v29.4S, v17.4S, v1.s[1] +mul v17.4S, v17.4S,v16.s[1] +sub v4.4s, v11.4s, v22.4s +ldr q15, [x0, #288] +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v3.4S, v1.s[2] +mul v3.4S, v3.4S,v16.s[2] +sub v8.4s, v25.4s, v20.4s +ldr q21, [x0, #304] +add v25.4s, v25.4s, v20.4s +mla v10.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v11.4S, v28.s[1] +sub v1.4s, v14.4s, v5.4s +ldr q20, [x0, #352] +add v14.4s, v14.4s, v5.4s +mla v24.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v4.4S, v28.s[2] +sub v5.4s, v18.4s, v10.4s +ldr q16, [x0, #368] +str q5, [x0, #16] +mla v17.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v14.4S, v6.s[1] +add v18.4s, v18.4s, v10.4s +ldr q10, [x17, #+256] +str q18, [x0, #0] +mla v3.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v1.4S, v6.s[2] +sub v18.4s, v23.4s, v24.4s +ldr q5, [x17, #+272] +str q18, [x0, #48] +mul v11.4S, v11.4S,v2.s[1] +mul v4.4S, v4.4S,v2.s[2] +add v23.4s, v23.4s, v24.4s +ldr q24, [x17, #+288] +str q23, [x0, #32] +mla v11.4S, v13.4S, v31.s[0] +mla v4.4S, v26.4S, v31.s[0] +sub v26.4s, v27.4s, v17.4s +ldr q13, [x17, #+304] +str q26, [x0, #80] +mul v14.4S, v14.4S,v30.s[1] +mul v1.4S, v1.4S,v30.s[2] +add v27.4s, v27.4s, v17.4s +ldr q17, [x0, #416] +str q27, [x0, #64] +mla v14.4S, v29.4S, v31.s[0] +mla v1.4S, v22.4S, v31.s[0] +sub v22.4s, v7.4s, v3.4s +ldr q29, [x0, #432] +str q22, [x0, #112] +sqrdmulh v6.4S, v15.4S, v5.s[0] +mul v15.4S, v15.4S,v10.s[0] +add v7.4s, v7.4s, v3.4s +ldr q3, [x0, #480] +str q7, [x0, #96] +sqrdmulh v7.4S, v21.4S, v5.s[0] +mul v21.4S, v21.4S,v10.s[0] +sub v22.4s, v9.4s, v11.4s +ldr q30, [x0, #496] +str q22, [x0, #144] +sqrdmulh v22.4S, v20.4S, v13.s[0] +mul v20.4S, v20.4S,v24.s[0] +add v9.4s, v9.4s, v11.4s +ldr q11, [x17, #+320] +str q9, [x0, #128] +sqrdmulh v9.4S, v16.4S, v13.s[0] +mul v16.4S, v16.4S,v24.s[0] +sub v27.4s, v19.4s, v4.4s +ldr q28, [x17, #+336] +str q27, [x0, #176] +mla v15.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v17.4S, v28.s[0] +add v19.4s, v19.4s, v4.4s 
+ldr q4, [x17, #+352] +str q19, [x0, #160] +mla v21.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v29.4S, v28.s[0] +sub v19.4s, v25.4s, v14.4s +ldr q27, [x17, #+368] +str q19, [x0, #208] +mla v20.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v3.4S, v27.s[0] +add v25.4s, v25.4s, v14.4s +ldr q14, [x0, #256] +str q25, [x0, #192] +mla v16.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v30.4S, v27.s[0] +sub v25.4s, v8.4s, v1.4s +ldr q19, [x0, #272] +str q25, [x0, #240] +mul v17.4S, v17.4S,v11.s[0] +mul v29.4S, v29.4S,v11.s[0] +add v8.4s, v8.4s, v1.4s +ldr q1, [x0, #320] +str q8, [x0, #224] +mla v17.4S, v6.4S, v31.s[0] +mla v29.4S, v7.4S, v31.s[0] +sub v7.4s, v14.4s, v15.4s +ldr q6, [x0, #336] +add v14.4s, v14.4s, v15.4s +mul v3.4S, v3.4S,v4.s[0] +mul v30.4S, v30.4S,v4.s[0] +sub v15.4s, v19.4s, v21.4s +ldr q8, [x0, #384] +add v19.4s, v19.4s, v21.4s +mla v3.4S, v22.4S, v31.s[0] +mla v30.4S, v9.4S, v31.s[0] +sub v9.4s, v1.4s, v20.4s +ldr q22, [x0, #400] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v5.s[1] +mul v19.4S, v19.4S,v10.s[1] +sub v21.4s, v6.4s, v16.4s +ldr q25, [x0, #448] +add v6.4s, v6.4s, v16.4s +sqrdmulh v16.4S, v15.4S, v5.s[2] +mul v15.4S, v15.4S,v10.s[2] +sub v26.4s, v8.4s, v17.4s +ldr q2, [x0, #464] +add v8.4s, v8.4s, v17.4s +sqrdmulh v5.4S, v6.4S, v13.s[1] +mul v6.4S, v6.4S,v24.s[1] +sub v17.4s, v22.4s, v29.4s +ldr q10, [x0, #544] +add v22.4s, v22.4s, v29.4s +sqrdmulh v29.4S, v21.4S, v13.s[2] +mul v21.4S, v21.4S,v24.s[2] +sub v23.4s, v25.4s, v3.4s +ldr q18, [x0, #560] +add v25.4s, v25.4s, v3.4s +mla v19.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v22.4S, v28.s[1] +sub v13.4s, v2.4s, v30.4s +ldr q3, [x0, #608] +add v2.4s, v2.4s, v30.4s +mla v15.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v17.4S, v28.s[2] +sub v30.4s, v14.4s, v19.4s +ldr q24, [x0, #624] +str q30, [x0, #272] +mla v6.4S, v5.4S, v31.s[0] +sqrdmulh v5.4S, v2.4S, v27.s[1] +add v14.4s, v14.4s, v19.4s +ldr q19, [x17, #+384] +str q14, [x0, #256] +mla v21.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v13.4S, v27.s[2] +sub v14.4s, v7.4s, 
v15.4s +ldr q30, [x17, #+400] +str q14, [x0, #304] +mul v22.4S, v22.4S,v11.s[1] +mul v17.4S, v17.4S,v11.s[2] +add v7.4s, v7.4s, v15.4s +ldr q15, [x17, #+416] +str q7, [x0, #288] +mla v22.4S, v20.4S, v31.s[0] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v1.4s, v6.4s +ldr q20, [x17, #+432] +str q16, [x0, #336] +mul v2.4S, v2.4S,v4.s[1] +mul v13.4S, v13.4S,v4.s[2] +add v1.4s, v1.4s, v6.4s +ldr q6, [x0, #672] +str q1, [x0, #320] +mla v2.4S, v5.4S, v31.s[0] +mla v13.4S, v29.4S, v31.s[0] +sub v29.4s, v9.4s, v21.4s +ldr q5, [x0, #688] +str q29, [x0, #368] +sqrdmulh v27.4S, v10.4S, v30.s[0] +mul v10.4S, v10.4S,v19.s[0] +add v9.4s, v9.4s, v21.4s +ldr q21, [x0, #736] +str q9, [x0, #352] +sqrdmulh v9.4S, v18.4S, v30.s[0] +mul v18.4S, v18.4S,v19.s[0] +sub v29.4s, v8.4s, v22.4s +ldr q4, [x0, #752] +str q29, [x0, #400] +sqrdmulh v29.4S, v3.4S, v20.s[0] +mul v3.4S, v3.4S,v15.s[0] +add v8.4s, v8.4s, v22.4s +ldr q22, [x17, #+448] +str q8, [x0, #384] +sqrdmulh v8.4S, v24.4S, v20.s[0] +mul v24.4S, v24.4S,v15.s[0] +sub v1.4s, v26.4s, v17.4s +ldr q28, [x17, #+464] +str q1, [x0, #432] +mla v10.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v6.4S, v28.s[0] +add v26.4s, v26.4s, v17.4s +ldr q17, [x17, #+480] +str q26, [x0, #416] +mla v18.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v5.4S, v28.s[0] +sub v26.4s, v25.4s, v2.4s +ldr q1, [x17, #+496] +str q26, [x0, #464] +mla v3.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v21.4S, v1.s[0] +add v25.4s, v25.4s, v2.4s +ldr q2, [x0, #512] +str q25, [x0, #448] +mla v24.4S, v8.4S, v31.s[0] +sqrdmulh v8.4S, v4.4S, v1.s[0] +sub v25.4s, v23.4s, v13.4s +ldr q26, [x0, #528] +str q25, [x0, #496] +mul v6.4S, v6.4S,v22.s[0] +mul v5.4S, v5.4S,v22.s[0] +add v23.4s, v23.4s, v13.4s +ldr q13, [x0, #576] +str q23, [x0, #480] +mla v6.4S, v27.4S, v31.s[0] +mla v5.4S, v9.4S, v31.s[0] +sub v9.4s, v2.4s, v10.4s +ldr q27, [x0, #592] +add v2.4s, v2.4s, v10.4s +mul v21.4S, v21.4S,v17.s[0] +mul v4.4S, v4.4S,v17.s[0] +sub v10.4s, v26.4s, v18.4s +ldr q23, [x0, #640] +add v26.4s, v26.4s, v18.4s +mla 
v21.4S, v29.4S, v31.s[0] +mla v4.4S, v8.4S, v31.s[0] +sub v8.4s, v13.4s, v3.4s +ldr q29, [x0, #656] +add v13.4s, v13.4s, v3.4s +sqrdmulh v3.4S, v26.4S, v30.s[1] +mul v26.4S, v26.4S,v19.s[1] +sub v18.4s, v27.4s, v24.4s +ldr q25, [x0, #704] +add v27.4s, v27.4s, v24.4s +sqrdmulh v24.4S, v10.4S, v30.s[2] +mul v10.4S, v10.4S,v19.s[2] +sub v16.4s, v23.4s, v6.4s +ldr q11, [x0, #720] +add v23.4s, v23.4s, v6.4s +sqrdmulh v30.4S, v27.4S, v20.s[1] +mul v27.4S, v27.4S,v15.s[1] +sub v6.4s, v29.4s, v5.4s +ldr q19, [x0, #800] +add v29.4s, v29.4s, v5.4s +sqrdmulh v5.4S, v18.4S, v20.s[2] +mul v18.4S, v18.4S,v15.s[2] +sub v7.4s, v25.4s, v21.4s +ldr q14, [x0, #816] +add v25.4s, v25.4s, v21.4s +mla v26.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v29.4S, v28.s[1] +sub v20.4s, v11.4s, v4.4s +ldr q21, [x0, #864] +add v11.4s, v11.4s, v4.4s +mla v10.4S, v24.4S, v31.s[0] +sqrdmulh v24.4S, v6.4S, v28.s[2] +sub v4.4s, v2.4s, v26.4s +ldr q15, [x0, #880] +str q4, [x0, #528] +mla v27.4S, v30.4S, v31.s[0] +sqrdmulh v30.4S, v11.4S, v1.s[1] +add v2.4s, v2.4s, v26.4s +ldr q26, [x17, #+512] +str q2, [x0, #512] +mla v18.4S, v5.4S, v31.s[0] +sqrdmulh v5.4S, v20.4S, v1.s[2] +sub v2.4s, v9.4s, v10.4s +ldr q4, [x17, #+528] +str q2, [x0, #560] +mul v29.4S, v29.4S,v22.s[1] +mul v6.4S, v6.4S,v22.s[2] +add v9.4s, v9.4s, v10.4s +ldr q10, [x17, #+544] +str q9, [x0, #544] +mla v29.4S, v3.4S, v31.s[0] +mla v6.4S, v24.4S, v31.s[0] +sub v24.4s, v13.4s, v27.4s +ldr q3, [x17, #+560] +str q24, [x0, #592] +mul v11.4S, v11.4S,v17.s[1] +mul v20.4S, v20.4S,v17.s[2] +add v13.4s, v13.4s, v27.4s +ldr q27, [x0, #928] +str q13, [x0, #576] +mla v11.4S, v30.4S, v31.s[0] +mla v20.4S, v5.4S, v31.s[0] +sub v5.4s, v8.4s, v18.4s +ldr q30, [x0, #944] +str q5, [x0, #624] +sqrdmulh v1.4S, v19.4S, v4.s[0] +mul v19.4S, v19.4S,v26.s[0] +add v8.4s, v8.4s, v18.4s +ldr q18, [x0, #992] +str q8, [x0, #608] +sqrdmulh v8.4S, v14.4S, v4.s[0] +mul v14.4S, v14.4S,v26.s[0] +sub v5.4s, v23.4s, v29.4s +ldr q17, [x0, #1008] +str q5, [x0, #656] +sqrdmulh v5.4S, 
v21.4S, v3.s[0] +mul v21.4S, v21.4S,v10.s[0] +add v23.4s, v23.4s, v29.4s +ldr q29, [x17, #+576] +str q23, [x0, #640] +sqrdmulh v23.4S, v15.4S, v3.s[0] +mul v15.4S, v15.4S,v10.s[0] +sub v13.4s, v16.4s, v6.4s +ldr q28, [x17, #+592] +str q13, [x0, #688] +mla v19.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v27.4S, v28.s[0] +add v16.4s, v16.4s, v6.4s +ldr q6, [x17, #+608] +str q16, [x0, #672] +mla v14.4S, v8.4S, v31.s[0] +sqrdmulh v8.4S, v30.4S, v28.s[0] +sub v16.4s, v25.4s, v11.4s +ldr q13, [x17, #+624] +str q16, [x0, #720] +mla v21.4S, v5.4S, v31.s[0] +sqrdmulh v5.4S, v18.4S, v13.s[0] +add v25.4s, v25.4s, v11.4s +ldr q11, [x0, #768] +str q25, [x0, #704] +mla v15.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v17.4S, v13.s[0] +sub v25.4s, v7.4s, v20.4s +ldr q16, [x0, #784] +str q25, [x0, #752] +mul v27.4S, v27.4S,v29.s[0] +mul v30.4S, v30.4S,v29.s[0] +add v7.4s, v7.4s, v20.4s +ldr q20, [x0, #832] +str q7, [x0, #736] +mla v27.4S, v1.4S, v31.s[0] +mla v30.4S, v8.4S, v31.s[0] +sub v8.4s, v11.4s, v19.4s +ldr q1, [x0, #848] +add v11.4s, v11.4s, v19.4s +mul v18.4S, v18.4S,v6.s[0] +mul v17.4S, v17.4S,v6.s[0] +sub v19.4s, v16.4s, v14.4s +ldr q7, [x0, #896] +add v16.4s, v16.4s, v14.4s +mla v18.4S, v5.4S, v31.s[0] +mla v17.4S, v23.4S, v31.s[0] +sub v23.4s, v20.4s, v21.4s +ldr q5, [x0, #912] +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v16.4S, v4.s[1] +mul v16.4S, v16.4S,v26.s[1] +sub v14.4s, v1.4s, v15.4s +ldr q25, [x0, #960] +add v1.4s, v1.4s, v15.4s +sqrdmulh v15.4S, v19.4S, v4.s[2] +mul v19.4S, v19.4S,v26.s[2] +sub v24.4s, v7.4s, v27.4s +ldr q22, [x0, #976] +add v7.4s, v7.4s, v27.4s +sqrdmulh v4.4S, v1.4S, v3.s[1] +mul v1.4S, v1.4S,v10.s[1] +sub v27.4s, v5.4s, v30.4s +add v5.4s, v5.4s, v30.4s +sqrdmulh v30.4S, v14.4S, v3.s[2] +mul v14.4S, v14.4S,v10.s[2] +sub v26.4s, v25.4s, v18.4s +add v25.4s, v25.4s, v18.4s +mla v16.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v5.4S, v28.s[1] +sub v3.4s, v22.4s, v17.4s +add v22.4s, v22.4s, v17.4s +mla v19.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v27.4S, v28.s[2] 
+sub v17.4s, v11.4s, v16.4s +str q17, [x0, #784] +mla v1.4S, v4.4S, v31.s[0] +sqrdmulh v4.4S, v22.4S, v13.s[1] +add v11.4s, v11.4s, v16.4s +str q11, [x0, #768] +mla v14.4S, v30.4S, v31.s[0] +sqrdmulh v30.4S, v3.4S, v13.s[2] +sub v11.4s, v8.4s, v19.4s +str q11, [x0, #816] +mul v5.4S, v5.4S,v29.s[1] +mul v27.4S, v27.4S,v29.s[2] +add v8.4s, v8.4s, v19.4s +str q8, [x0, #800] +mla v5.4S, v21.4S, v31.s[0] +mla v27.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v1.4s +str q15, [x0, #848] +mul v22.4S, v22.4S,v6.s[1] +mul v3.4S, v3.4S,v6.s[2] +add v20.4s, v20.4s, v1.4s +str q20, [x0, #832] +mla v22.4S, v4.4S, v31.s[0] +mla v3.4S, v30.4S, v31.s[0] +sub v30.4s, v23.4s, v14.4s +str q30, [x0, #880] +add v23.4s, v23.4s, v14.4s +str q23, [x0, #864] +sub v23.4s, v7.4s, v5.4s +str q23, [x0, #912] +add v7.4s, v7.4s, v5.4s +str q7, [x0, #896] +sub v7.4s, v24.4s, v27.4s +str q7, [x0, #944] +add v24.4s, v24.4s, v27.4s +str q24, [x0, #928] +sub v24.4s, v25.4s, v22.4s +str q24, [x0, #976] +add v25.4s, v25.4s, v22.4s +str q25, [x0, #960] +sub v25.4s, v26.4s, v3.4s +str q25, [x0, #1008] +add v26.4s, v26.4s, v3.4s +str q26, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1520 +// Instruction count: 1516 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_12.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_12.s new file mode 100644 index 0000000..68e1c14 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_12.s @@ -0,0 +1,1550 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby 
granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include // NOTE(review): no header name follows the directive in this copy, and ASM_LOAD used by the routine below is not defined in the visible part of this file; confirm the intended include +modulus: // 4-word vector: lane 0 is -33556993, lanes 1-3 are zero. The routine below loads it into q31 (presumably from the address ASM_LOAD leaves in x17) and, in the visible code, only references v31.s[0], as the multiplier of the mla steps +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: // Laid out in 8-word groups: in the visible code the first 4 words of a group are loaded as mul operands and the following 4 words (same layer/block tags) as the matching sqrdmulh operands. Words tagged Layer None are zero fill; their lane does not appear to be referenced in the visible code +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global 
ntt_u32_incomplete_neon_asm_var_4_2_22_z4_12 +.global _ntt_u32_incomplete_neon_asm_var_4_2_22_z4_12 +ntt_u32_incomplete_neon_asm_var_4_2_22_z4_12: +_ntt_u32_incomplete_neon_asm_var_4_2_22_z4_12: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x0, #992] +sqrdmulh v27.4S, v28.4S, v29.s[0] +mul v28.4S, v28.4S,v30.s[0] +ldr q26, [x0, #928] +sqrdmulh v25.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +ldr q24, [x0, #864] +sqrdmulh v23.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +ldr q22, [x0, #800] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #736] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mla v28.4S, v27.4S, v31.s[0] +ldr q27, [x0, #672] +sqrdmulh v18.4S, v27.4S, v29.s[0] +mla v26.4S, v25.4S, v31.s[0] +ldr q25, [x0, #608] +sqrdmulh v17.4S, v25.4S, v29.s[0] +mla v24.4S, v23.4S, v31.s[0] +ldr q23, [x0, #544] +sqrdmulh v16.4S, v23.4S, v29.s[0] +mla v22.4S, v21.4S, v31.s[0] +ldr q21, [x0, #480] +mul v27.4S, v27.4S,v30.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q3, [x0, #416] +ldr q2, [x0, #352] +ldr q1, [x0, #288] +mla v27.4S, v18.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +ldr q19, [x0, #224] +ldr q18, [x0, #160] +mul v23.4S, v23.4S,v30.s[0] +mul v25.4S, v25.4S,v30.s[0] +ldr q0, [x0, #96] +ldr q15, [x0, #32] +mla v23.4S, v16.4S, v31.s[0] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +sqrdmulh v28.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v16.4s, v3.4s, v26.4s +add v3.4s, v3.4s, v26.4s +sqrdmulh v26.4S, v16.4S, v29.s[2] +mul v16.4S, 
v16.4S,v30.s[2] +sub v14.4s, v2.4s, v24.4s +add v2.4s, v2.4s, v24.4s +sqrdmulh v24.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v13.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v12.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +sqrdmulh v20.4S, v14.4S, v29.s[2] +mla v17.4S, v28.4S, v31.s[0] +sub v28.4s, v18.4s, v27.4s +add v18.4s, v18.4s, v27.4s +sqrdmulh v27.4S, v13.4S, v29.s[2] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v0.4s, v25.4s +add v0.4s, v0.4s, v25.4s +sqrdmulh v25.4S, v2.4S, v29.s[1] +mla v21.4S, v24.4S, v31.s[0] +sub v24.4s, v15.4s, v23.4s +sqrdmulh v11.4S, v1.4S, v29.s[1] +mla v3.4S, v22.4S, v31.s[0] +add v15.4s, v15.4s, v23.4s +ldr q23, [x17, #+32] +ldr q22, [x17, #+48] +mul v13.4S, v13.4S,v30.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v10.4s, v12.4s, v17.4s +add v12.4s, v12.4s, v17.4s +mla v13.4S, v27.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +sub v20.4s, v28.4s, v16.4s +add v28.4s, v28.4s, v16.4s +mul v1.4S, v1.4S,v30.s[1] +mul v2.4S, v2.4S,v30.s[1] +sub v16.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +mla v1.4S, v11.4S, v31.s[0] +mla v2.4S, v25.4S, v31.s[0] +sub v25.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v22.s[3] +mul v10.4S, v10.4S,v23.s[3] +sub v11.4s, v26.4s, v14.4s +add v26.4s, v26.4s, v14.4s +sqrdmulh v14.4S, v12.4S, v22.s[2] +mul v12.4S, v12.4S,v23.s[2] +sub v21.4s, v24.4s, v13.4s +add v24.4s, v24.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v22.s[1] +mul v16.4S, v16.4S,v23.s[1] +sub v27.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v17.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s +ldr q1, [x17, #+96] +ldr q9, [x17, #+112] +sqrdmulh v8.4S, v20.4S, v22.s[3] +mla v10.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v28.4S, v22.s[2] +mla v12.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v25.4S, v22.s[1] +mla v16.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v18.4S, v22.s[0] +mla 
v19.4S, v2.4S, v31.s[0] +nop +nop +ldr q2, [x17, #+64] +ldr q7, [x17, #+80] +mul v28.4S, v28.4S,v23.s[2] +mul v20.4S, v20.4S,v23.s[3] +sub v6.4s, v11.4s, v10.4s +add v11.4s, v11.4s, v10.4s +mla v28.4S, v3.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v26.4s, v12.4s +add v26.4s, v26.4s, v12.4s +mul v18.4S, v18.4S,v23.s[0] +mul v25.4S, v25.4S,v23.s[1] +sub v12.4s, v27.4s, v16.4s +add v27.4s, v27.4s, v16.4s +mla v18.4S, v13.4S, v31.s[0] +mla v25.4S, v14.4S, v31.s[0] +sub v14.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v9.s[3] +mul v6.4S, v6.4S,v1.s[3] +sub v13.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v11.4S, v9.s[2] +mul v11.4S, v11.4S,v1.s[2] +sub v16.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +sqrdmulh v28.4S, v8.4S, v9.s[1] +mul v8.4S, v8.4S,v1.s[1] +sub v3.4s, v17.4s, v25.4s +add v17.4s, v17.4s, v25.4s +sqrdmulh v25.4S, v26.4S, v9.s[0] +mul v26.4S, v26.4S,v1.s[0] +sub v10.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v7.s[3] +mla v6.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v27.4S, v7.s[2] +mla v11.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v14.4S, v7.s[1] +mla v8.4S, v28.4S, v31.s[0] +nop +nop +sqrdmulh v28.4S, v0.4S, v7.s[0] +mla v26.4S, v25.4S, v31.s[0] +nop +nop +mul v27.4S, v27.4S,v2.s[2] +mul v12.4S, v12.4S,v2.s[3] +sub v25.4s, v13.4s, v6.4s +str q25, [x0, #992] +mla v27.4S, v19.4S, v31.s[0] +mla v12.4S, v18.4S, v31.s[0] +add v13.4s, v13.4s, v6.4s +str q13, [x0, #928] +mul v0.4S, v0.4S,v2.s[0] +mul v14.4S, v14.4S,v2.s[1] +sub v13.4s, v21.4s, v11.4s +str q13, [x0, #864] +mla v0.4S, v28.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sub v11.4s, v16.4s, v8.4s +ldr q20, [x0, #1008] +sqrdmulh v28.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v8.4s +str q21, [x0, #800] +ldr q21, [x0, #944] +sqrdmulh v8.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v13.4s, v24.4s, v26.4s +str q11, [x0, #736] +ldr q11, [x0, 
#880] +sqrdmulh v6.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +add v24.4s, v24.4s, v26.4s +str q16, [x0, #672] +ldr q16, [x0, #816] +sqrdmulh v26.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v18.4s, v3.4s, v12.4s +str q13, [x0, #608] +ldr q13, [x0, #752] +sqrdmulh v19.4S, v13.4S, v29.s[0] +mla v20.4S, v28.4S, v31.s[0] +add v3.4s, v3.4s, v12.4s +str q24, [x0, #544] +ldr q24, [x0, #688] +sqrdmulh v12.4S, v24.4S, v29.s[0] +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v17.4s, v27.4s +str q18, [x0, #480] +ldr q18, [x0, #624] +sqrdmulh v28.4S, v18.4S, v29.s[0] +mla v11.4S, v6.4S, v31.s[0] +add v17.4s, v17.4s, v27.4s +str q3, [x0, #416] +ldr q3, [x0, #560] +sqrdmulh v27.4S, v3.4S, v29.s[0] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v10.4s, v14.4s +str q8, [x0, #352] +ldr q8, [x0, #496] +add v10.4s, v10.4s, v14.4s +mul v24.4S, v24.4S,v30.s[0] +mul v13.4S, v13.4S,v30.s[0] +ldr q14, [x0, #432] +str q17, [x0, #288] +ldr q17, [x0, #368] +ldr q6, [x0, #304] +mla v24.4S, v12.4S, v31.s[0] +mla v13.4S, v19.4S, v31.s[0] +str q26, [x0, #224] +sub v26.4s, v15.4s, v0.4s +ldr q19, [x0, #240] +ldr q12, [x0, #176] +mul v3.4S, v3.4S,v30.s[0] +mul v18.4S, v18.4S,v30.s[0] +str q10, [x0, #160] +add v15.4s, v15.4s, v0.4s +ldr q0, [x0, #112] +ldr q10, [x0, #48] +mla v3.4S, v27.4S, v31.s[0] +mla v18.4S, v28.4S, v31.s[0] +sub v28.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +sqrdmulh v20.4S, v28.4S, v29.s[2] +mul v28.4S, v28.4S,v30.s[2] +sub v27.4s, v14.4s, v21.4s +add v14.4s, v14.4s, v21.4s +sqrdmulh v21.4S, v27.4S, v29.s[2] +mul v27.4S, v27.4S,v30.s[2] +sub v25.4s, v17.4s, v11.4s +add v17.4s, v17.4s, v11.4s +sqrdmulh v11.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v5.4s, v6.4s, v16.4s +add v6.4s, v6.4s, v16.4s +sqrdmulh v16.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +sub v4.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +sqrdmulh v13.4S, v25.4S, v29.s[2] +mla v28.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v24.4s +add v12.4s, v12.4s, v24.4s +sqrdmulh v24.4S, v5.4S, 
v29.s[2] +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v18.4s +add v0.4s, v0.4s, v18.4s +sqrdmulh v18.4S, v17.4S, v29.s[1] +mla v8.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v3.4s +str q26, [x0, #96] +sqrdmulh v26.4S, v6.4S, v29.s[1] +mla v14.4S, v16.4S, v31.s[0] +add v10.4s, v10.4s, v3.4s +str q15, [x0, #32] +mul v5.4S, v5.4S,v30.s[2] +mul v25.4S, v25.4S,v30.s[2] +sub v15.4s, v4.4s, v28.4s +add v4.4s, v4.4s, v28.4s +mla v5.4S, v24.4S, v31.s[0] +mla v25.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v27.4s +add v20.4s, v20.4s, v27.4s +mul v6.4S, v6.4S,v30.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v27.4s, v19.4s, v8.4s +add v19.4s, v19.4s, v8.4s +mla v6.4S, v26.4S, v31.s[0] +mla v17.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v14.4s +add v12.4s, v12.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v22.s[3] +mul v15.4S, v15.4S,v23.s[3] +sub v26.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v4.4S, v22.s[2] +mul v4.4S, v4.4S,v23.s[2] +sub v8.4s, v11.4s, v5.4s +add v11.4s, v11.4s, v5.4s +sqrdmulh v5.4S, v27.4S, v22.s[1] +mul v27.4S, v27.4S,v23.s[1] +sub v24.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v28.4s, v10.4s, v6.4s +add v10.4s, v10.4s, v6.4s +sqrdmulh v6.4S, v13.4S, v22.s[3] +mla v15.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v20.4S, v22.s[2] +mla v4.4S, v25.4S, v31.s[0] +nop +nop +sqrdmulh v25.4S, v18.4S, v22.s[1] +mla v27.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v12.4S, v22.s[0] +mla v19.4S, v17.4S, v31.s[0] +nop +nop +mul v20.4S, v20.4S,v23.s[2] +mul v13.4S, v13.4S,v23.s[3] +sub v17.4s, v26.4s, v15.4s +add v26.4s, v26.4s, v15.4s +mla v20.4S, v14.4S, v31.s[0] +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v21.4s, v4.4s +add v21.4s, v21.4s, v4.4s +mul v12.4S, v12.4S,v23.s[0] +mul v18.4S, v18.4S,v23.s[1] +sub v4.4s, v24.4s, v27.4s +add v24.4s, v24.4s, v27.4s +mla v12.4S, v5.4S, v31.s[0] +mla v18.4S, v25.4S, v31.s[0] +sub v25.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, 
v17.4S, v9.s[3] +mul v17.4S, v17.4S,v1.s[3] +sub v5.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +sqrdmulh v13.4S, v26.4S, v9.s[2] +mul v26.4S, v26.4S,v1.s[2] +sub v27.4s, v11.4s, v20.4s +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v6.4S, v9.s[1] +mul v6.4S, v6.4S,v1.s[1] +sub v14.4s, v28.4s, v18.4s +add v28.4s, v28.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v9.s[0] +mul v21.4S, v21.4S,v1.s[0] +sub v15.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v4.4S, v7.s[3] +mla v17.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v24.4S, v7.s[2] +mla v26.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v25.4S, v7.s[1] +mla v6.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v0.4S, v7.s[0] +mla v21.4S, v18.4S, v31.s[0] +nop +nop +mul v24.4S, v24.4S,v2.s[2] +mul v4.4S, v4.4S,v2.s[3] +sub v18.4s, v5.4s, v17.4s +str q18, [x0, #1008] +mla v24.4S, v19.4S, v31.s[0] +mla v4.4S, v12.4S, v31.s[0] +add v5.4s, v5.4s, v17.4s +str q5, [x0, #944] +mul v0.4S, v0.4S,v2.s[0] +mul v25.4S, v25.4S,v2.s[1] +sub v5.4s, v8.4s, v26.4s +str q5, [x0, #880] +mla v0.4S, v20.4S, v31.s[0] +mla v25.4S, v13.4S, v31.s[0] +add v8.4s, v8.4s, v26.4s +sub v26.4s, v27.4s, v6.4s +ldr q13, [x0, #960] +sqrdmulh v20.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +add v27.4s, v27.4s, v6.4s +str q8, [x0, #816] +ldr q8, [x0, #896] +sqrdmulh v6.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v5.4s, v11.4s, v21.4s +str q26, [x0, #752] +ldr q26, [x0, #832] +sqrdmulh v17.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +add v11.4s, v11.4s, v21.4s +str q27, [x0, #688] +ldr q27, [x0, #768] +sqrdmulh v21.4S, v27.4S, v29.s[0] +mul v27.4S, v27.4S,v30.s[0] +sub v12.4s, v14.4s, v4.4s +str q5, [x0, #624] +ldr q5, [x0, #704] +sqrdmulh v19.4S, v5.4S, v29.s[0] +mla v13.4S, v20.4S, v31.s[0] +add v14.4s, v14.4s, v4.4s +str q11, [x0, #560] +ldr q11, [x0, #640] +sqrdmulh v4.4S, v11.4S, v29.s[0] +mla v8.4S, v6.4S, v31.s[0] +sub v6.4s, v28.4s, v24.4s +str q12, [x0, #496] +ldr q12, [x0, #576] +sqrdmulh v20.4S, v12.4S, 
v29.s[0] +mla v26.4S, v17.4S, v31.s[0] +add v28.4s, v28.4s, v24.4s +str q14, [x0, #432] +ldr q14, [x0, #512] +sqrdmulh v24.4S, v14.4S, v29.s[0] +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v25.4s +str q6, [x0, #368] +ldr q6, [x0, #448] +add v15.4s, v15.4s, v25.4s +mul v11.4S, v11.4S,v30.s[0] +mul v5.4S, v5.4S,v30.s[0] +ldr q25, [x0, #384] +str q28, [x0, #304] +ldr q28, [x0, #320] +ldr q17, [x0, #256] +mla v11.4S, v4.4S, v31.s[0] +mla v5.4S, v19.4S, v31.s[0] +str q21, [x0, #240] +sub v21.4s, v10.4s, v0.4s +ldr q19, [x0, #192] +ldr q4, [x0, #128] +mul v14.4S, v14.4S,v30.s[0] +mul v12.4S, v12.4S,v30.s[0] +str q15, [x0, #176] +add v10.4s, v10.4s, v0.4s +ldr q0, [x0, #64] +ldr q15, [x0, #0] +mla v14.4S, v24.4S, v31.s[0] +mla v12.4S, v20.4S, v31.s[0] +sub v20.4s, v6.4s, v13.4s +add v6.4s, v6.4s, v13.4s +sqrdmulh v13.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v24.4s, v25.4s, v8.4s +add v25.4s, v25.4s, v8.4s +sqrdmulh v8.4S, v24.4S, v29.s[2] +mul v24.4S, v24.4S,v30.s[2] +sub v18.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sqrdmulh v26.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +sub v3.4s, v17.4s, v27.4s +add v17.4s, v17.4s, v27.4s +sqrdmulh v27.4S, v25.4S, v29.s[1] +mul v25.4S, v25.4S,v30.s[1] +sub v16.4s, v19.4s, v5.4s +add v19.4s, v19.4s, v5.4s +sqrdmulh v5.4S, v18.4S, v29.s[2] +mla v20.4S, v13.4S, v31.s[0] +sub v13.4s, v4.4s, v11.4s +add v4.4s, v4.4s, v11.4s +sqrdmulh v11.4S, v3.4S, v29.s[2] +mla v24.4S, v8.4S, v31.s[0] +sub v8.4s, v0.4s, v12.4s +add v0.4s, v0.4s, v12.4s +sqrdmulh v12.4S, v28.4S, v29.s[1] +mla v6.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v14.4s +str q21, [x0, #112] +sqrdmulh v21.4S, v17.4S, v29.s[1] +mla v25.4S, v27.4S, v31.s[0] +add v15.4s, v15.4s, v14.4s +str q10, [x0, #48] +mul v3.4S, v3.4S,v30.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v10.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +mla v3.4S, v11.4S, v31.s[0] +mla v18.4S, v5.4S, v31.s[0] +sub v5.4s, v13.4s, v24.4s +add v13.4s, v13.4s, v24.4s +mul v17.4S, v17.4S,v30.s[1] 
+mul v28.4S, v28.4S,v30.s[1] +sub v24.4s, v19.4s, v6.4s +add v19.4s, v19.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v28.4S, v12.4S, v31.s[0] +sub v12.4s, v4.4s, v25.4s +add v4.4s, v4.4s, v25.4s +sqrdmulh v25.4S, v10.4S, v22.s[3] +mul v10.4S, v10.4S,v23.s[3] +sub v21.4s, v8.4s, v18.4s +add v8.4s, v8.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v22.s[2] +mul v16.4S, v16.4S,v23.s[2] +sub v6.4s, v26.4s, v3.4s +add v26.4s, v26.4s, v3.4s +sqrdmulh v3.4S, v24.4S, v22.s[1] +mul v24.4S, v24.4S,v23.s[1] +sub v11.4s, v0.4s, v28.4s +add v0.4s, v0.4s, v28.4s +sqrdmulh v28.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v20.4s, v15.4s, v17.4s +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v22.s[3] +mla v10.4S, v25.4S, v31.s[0] +nop +nop +sqrdmulh v25.4S, v13.4S, v22.s[2] +mla v16.4S, v18.4S, v31.s[0] +nop +nop +sqrdmulh v18.4S, v12.4S, v22.s[1] +mla v24.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v4.4S, v22.s[0] +mla v19.4S, v28.4S, v31.s[0] +nop +nop +mul v13.4S, v13.4S,v23.s[2] +mul v5.4S, v5.4S,v23.s[3] +sub v28.4s, v21.4s, v10.4s +add v21.4s, v21.4s, v10.4s +mla v13.4S, v25.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +sub v17.4s, v8.4s, v16.4s +add v8.4s, v8.4s, v16.4s +mul v4.4S, v4.4S,v23.s[0] +mul v12.4S, v12.4S,v23.s[1] +sub v16.4s, v11.4s, v24.4s +add v11.4s, v11.4s, v24.4s +mla v4.4S, v3.4S, v31.s[0] +mla v12.4S, v18.4S, v31.s[0] +sub v18.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v28.4S, v9.s[3] +mul v28.4S, v28.4S,v1.s[3] +sub v3.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v21.4S, v9.s[2] +mul v21.4S, v21.4S,v1.s[2] +sub v24.4s, v26.4s, v13.4s +add v26.4s, v26.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v9.s[1] +mul v17.4S, v17.4S,v1.s[1] +sub v25.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +sqrdmulh v12.4S, v8.4S, v9.s[0] +mul v8.4S, v8.4S,v1.s[0] +sub v10.4s, v15.4s, v4.4s +add v15.4s, v15.4s, v4.4s +sqrdmulh v4.4S, v16.4S, v7.s[3] +mla v28.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v11.4S, v7.s[2] +mla v21.4S, v5.4S, 
v31.s[0] +nop +nop +sqrdmulh v5.4S, v18.4S, v7.s[1] +mla v17.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v0.4S, v7.s[0] +mla v8.4S, v12.4S, v31.s[0] +nop +nop +mul v11.4S, v11.4S,v2.s[2] +mul v16.4S, v16.4S,v2.s[3] +sub v12.4s, v3.4s, v28.4s +str q12, [x0, #960] +mla v11.4S, v19.4S, v31.s[0] +mla v16.4S, v4.4S, v31.s[0] +add v3.4s, v3.4s, v28.4s +str q3, [x0, #896] +mul v0.4S, v0.4S,v2.s[0] +mul v18.4S, v18.4S,v2.s[1] +sub v3.4s, v6.4s, v21.4s +str q3, [x0, #832] +mla v0.4S, v13.4S, v31.s[0] +mla v18.4S, v5.4S, v31.s[0] +add v6.4s, v6.4s, v21.4s +sub v21.4s, v24.4s, v17.4s +ldr q5, [x0, #976] +sqrdmulh v13.4S, v5.4S, v29.s[0] +mul v5.4S, v5.4S,v30.s[0] +add v24.4s, v24.4s, v17.4s +str q6, [x0, #768] +ldr q6, [x0, #912] +sqrdmulh v17.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v3.4s, v26.4s, v8.4s +str q21, [x0, #704] +ldr q21, [x0, #848] +sqrdmulh v28.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +add v26.4s, v26.4s, v8.4s +str q24, [x0, #640] +ldr q24, [x0, #784] +sqrdmulh v8.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +sub v4.4s, v25.4s, v16.4s +str q3, [x0, #576] +ldr q3, [x0, #720] +sqrdmulh v19.4S, v3.4S, v29.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v25.4s, v25.4s, v16.4s +str q26, [x0, #512] +ldr q26, [x0, #656] +sqrdmulh v16.4S, v26.4S, v29.s[0] +mla v6.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v11.4s +str q4, [x0, #448] +ldr q4, [x0, #592] +sqrdmulh v13.4S, v4.4S, v29.s[0] +mla v21.4S, v28.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q25, [x0, #384] +ldr q25, [x0, #528] +sqrdmulh v11.4S, v25.4S, v29.s[0] +mla v24.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v18.4s +str q17, [x0, #320] +ldr q17, [x0, #464] +add v10.4s, v10.4s, v18.4s +mul v26.4S, v26.4S,v30.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q18, [x0, #400] +str q20, [x0, #256] +ldr q20, [x0, #336] +ldr q28, [x0, #272] +mla v26.4S, v16.4S, v31.s[0] +mla v3.4S, v19.4S, v31.s[0] +str q8, [x0, #192] +sub v8.4s, v15.4s, v0.4s +ldr q19, [x0, #208] +ldr q16, [x0, #144] +mul v25.4S, 
v25.4S,v30.s[0] +mul v4.4S, v4.4S,v30.s[0] +str q10, [x0, #128] +add v15.4s, v15.4s, v0.4s +ldr q0, [x0, #80] +ldr q10, [x0, #16] +mla v25.4S, v11.4S, v31.s[0] +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v5.4s +add v17.4s, v17.4s, v5.4s +sqrdmulh v5.4S, v13.4S, v29.s[2] +mul v13.4S, v13.4S,v30.s[2] +sub v11.4s, v18.4s, v6.4s +add v18.4s, v18.4s, v6.4s +sqrdmulh v6.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v12.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v28.4s, v24.4s +add v28.4s, v28.4s, v24.4s +sqrdmulh v24.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v27.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sqrdmulh v3.4S, v12.4S, v29.s[2] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v16.4s, v26.4s +add v16.4s, v16.4s, v26.4s +sqrdmulh v26.4S, v14.4S, v29.s[2] +mla v11.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v4.4s +add v0.4s, v0.4s, v4.4s +sqrdmulh v4.4S, v20.4S, v29.s[1] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v25.4s +str q8, [x0, #64] +sqrdmulh v8.4S, v28.4S, v29.s[1] +mla v18.4S, v24.4S, v31.s[0] +add v10.4s, v10.4s, v25.4s +str q15, [x0, #0] +mul v14.4S, v14.4S,v30.s[2] +mul v12.4S, v12.4S,v30.s[2] +sub v15.4s, v27.4s, v13.4s +add v27.4s, v27.4s, v13.4s +mla v14.4S, v26.4S, v31.s[0] +mla v12.4S, v3.4S, v31.s[0] +sub v3.4s, v5.4s, v11.4s +add v5.4s, v5.4s, v11.4s +mul v28.4S, v28.4S,v30.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v11.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +mla v28.4S, v8.4S, v31.s[0] +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v16.4s, v18.4s +add v16.4s, v16.4s, v18.4s +sqrdmulh v29.4S, v15.4S, v22.s[3] +mul v15.4S, v15.4S,v23.s[3] +sub v30.4s, v6.4s, v12.4s +add v6.4s, v6.4s, v12.4s +sqrdmulh v12.4S, v27.4S, v22.s[2] +mul v27.4S, v27.4S,v23.s[2] +sub v18.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v22.s[1] +mul v11.4S, v11.4S,v23.s[1] +sub v8.4s, v0.4s, v20.4s +add v0.4s, v0.4s, v20.4s +sqrdmulh v20.4S, 
v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v17.4s, v10.4s, v28.4s +add v10.4s, v10.4s, v28.4s +sqrdmulh v28.4S, v3.4S, v22.s[3] +mla v15.4S, v29.4S, v31.s[0] +nop +nop +sqrdmulh v29.4S, v5.4S, v22.s[2] +mla v27.4S, v12.4S, v31.s[0] +nop +nop +sqrdmulh v12.4S, v4.4S, v22.s[1] +mla v11.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v16.4S, v22.s[0] +mla v19.4S, v20.4S, v31.s[0] +nop +nop +mul v5.4S, v5.4S,v23.s[2] +mul v3.4S, v3.4S,v23.s[3] +sub v20.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +mla v5.4S, v29.4S, v31.s[0] +mla v3.4S, v28.4S, v31.s[0] +sub v28.4s, v6.4s, v27.4s +add v6.4s, v6.4s, v27.4s +mul v16.4S, v16.4S,v23.s[0] +mul v4.4S, v4.4S,v23.s[1] +sub v27.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v16.4S, v14.4S, v31.s[0] +mla v4.4S, v12.4S, v31.s[0] +sub v12.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v22.4S, v20.4S, v9.s[3] +mul v20.4S, v20.4S,v1.s[3] +sub v23.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v30.4S, v9.s[2] +mul v30.4S, v30.4S,v1.s[2] +sub v19.4s, v21.4s, v5.4s +add v21.4s, v21.4s, v5.4s +sqrdmulh v5.4S, v28.4S, v9.s[1] +mul v28.4S, v28.4S,v1.s[1] +sub v14.4s, v17.4s, v4.4s +add v17.4s, v17.4s, v4.4s +sqrdmulh v4.4S, v6.4S, v9.s[0] +mul v6.4S, v6.4S,v1.s[0] +sub v11.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sqrdmulh v9.4S, v27.4S, v7.s[3] +mla v20.4S, v22.4S, v31.s[0] +nop +nop +sqrdmulh v22.4S, v8.4S, v7.s[2] +mla v30.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v12.4S, v7.s[1] +mla v28.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v0.4S, v7.s[0] +mla v6.4S, v4.4S, v31.s[0] +nop +nop +mul v8.4S, v8.4S,v2.s[2] +mul v27.4S, v27.4S,v2.s[3] +sub v4.4s, v23.4s, v20.4s +str q4, [x0, #976] +mla v8.4S, v22.4S, v31.s[0] +mla v27.4S, v9.4S, v31.s[0] +add v23.4s, v23.4s, v20.4s +str q23, [x0, #912] +mul v0.4S, v0.4S,v2.s[0] +mul v12.4S, v12.4S,v2.s[1] +sub v23.4s, v18.4s, v30.4s +str q23, [x0, #848] +mla v0.4S, v5.4S, v31.s[0] +mla v12.4S, v3.4S, v31.s[0] +add v18.4s, v18.4s, v30.4s +sub v30.4s, 
v19.4s, v28.4s +add v19.4s, v19.4s, v28.4s +str q18, [x0, #784] +sub v18.4s, v21.4s, v6.4s +str q30, [x0, #720] +add v21.4s, v21.4s, v6.4s +str q19, [x0, #656] +sub v19.4s, v14.4s, v27.4s +str q18, [x0, #592] +add v14.4s, v14.4s, v27.4s +str q21, [x0, #528] +sub v21.4s, v17.4s, v8.4s +str q19, [x0, #464] +add v17.4s, v17.4s, v8.4s +str q14, [x0, #400] +sub v14.4s, v11.4s, v12.4s +str q21, [x0, #336] +add v11.4s, v11.4s, v12.4s +str q17, [x0, #272] +sub v17.4s, v10.4s, v0.4s +add v10.4s, v10.4s, v0.4s +ldr q24, [x0, #48] +ldr q25, [x0, #32] +ldr q13, [x0, #112] +ldr q26, [x0, #96] +ldr q15, [x17, #+128] +ldr q29, [x17, #+144] +ldr q16, [x17, #+160] +ldr q1, [x17, #+176] +ldr q4, [x0, #176] +ldr q22, [x0, #160] +sqrdmulh v9.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v15.s[0] +ldr q20, [x0, #240] +sqrdmulh v23.4S, v25.4S, v29.s[0] +mul v25.4S, v25.4S,v15.s[0] +ldr q5, [x0, #224] +sqrdmulh v3.4S, v13.4S, v1.s[0] +mul v13.4S, v13.4S,v16.s[0] +ldr q2, [x17, #+192] +sqrdmulh v7.4S, v26.4S, v1.s[0] +mul v26.4S, v26.4S,v16.s[0] +ldr q28, [x17, #+208] +mla v24.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v4.4S, v28.s[0] +ldr q30, [x17, #+224] +mla v25.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v22.4S, v28.s[0] +ldr q6, [x17, #+240] +mla v13.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v20.4S, v6.s[0] +mla v26.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v5.4S, v6.s[0] +ldr q18, [x0, #0] +mul v4.4S, v4.4S,v2.s[0] +mul v22.4S, v22.4S,v2.s[0] +mla v4.4S, v9.4S, v31.s[0] +mla v22.4S, v23.4S, v31.s[0] +sub v23.4s, v10.4s, v24.4s +ldr q9, [x0, #64] +add v10.4s, v10.4s, v24.4s +mul v20.4S, v20.4S,v30.s[0] +mul v5.4S, v5.4S,v30.s[0] +sub v24.4s, v18.4s, v25.4s +add v18.4s, v18.4s, v25.4s +mla v20.4S, v3.4S, v31.s[0] +mla v5.4S, v7.4S, v31.s[0] +sub v7.4s, v17.4s, v13.4s +ldr q3, [x0, #128] +add v17.4s, v17.4s, v13.4s +sqrdmulh v13.4S, v23.4S, v29.s[2] +mul v23.4S, v23.4S,v15.s[2] +sub v25.4s, v9.4s, v26.4s +add v9.4s, v9.4s, v26.4s +sqrdmulh v26.4S, v10.4S, v29.s[1] +mul v10.4S, v10.4S,v15.s[1] +sub v27.4s, 
v11.4s, v4.4s +ldr q19, [x0, #192] +add v11.4s, v11.4s, v4.4s +sqrdmulh v29.4S, v7.4S, v1.s[2] +mul v7.4S, v7.4S,v16.s[2] +sub v4.4s, v3.4s, v22.4s +ldr q15, [x0, #304] +add v3.4s, v3.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v1.s[1] +mul v17.4S, v17.4S,v16.s[1] +sub v8.4s, v14.4s, v20.4s +ldr q21, [x0, #288] +add v14.4s, v14.4s, v20.4s +mla v23.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v27.4S, v28.s[2] +sub v1.4s, v19.4s, v5.4s +ldr q20, [x0, #368] +add v19.4s, v19.4s, v5.4s +mla v10.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v11.4S, v28.s[1] +sub v5.4s, v24.4s, v23.4s +ldr q16, [x0, #352] +str q5, [x0, #48] +mla v7.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v8.4S, v6.s[2] +add v24.4s, v24.4s, v23.4s +ldr q23, [x17, #+256] +str q24, [x0, #32] +mla v17.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v14.4S, v6.s[1] +sub v24.4s, v18.4s, v10.4s +ldr q5, [x17, #+272] +str q24, [x0, #16] +mul v27.4S, v27.4S,v2.s[2] +mul v11.4S, v11.4S,v2.s[1] +add v18.4s, v18.4s, v10.4s +ldr q10, [x17, #+288] +str q18, [x0, #0] +mla v27.4S, v13.4S, v31.s[0] +mla v11.4S, v26.4S, v31.s[0] +sub v26.4s, v25.4s, v7.4s +ldr q13, [x17, #+304] +str q26, [x0, #112] +mul v8.4S, v8.4S,v30.s[2] +mul v14.4S, v14.4S,v30.s[1] +add v25.4s, v25.4s, v7.4s +ldr q7, [x0, #432] +str q25, [x0, #96] +mla v8.4S, v29.4S, v31.s[0] +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v9.4s, v17.4s +ldr q29, [x0, #416] +str q22, [x0, #80] +sqrdmulh v6.4S, v15.4S, v5.s[0] +mul v15.4S, v15.4S,v23.s[0] +add v9.4s, v9.4s, v17.4s +ldr q17, [x0, #496] +str q9, [x0, #64] +sqrdmulh v9.4S, v21.4S, v5.s[0] +mul v21.4S, v21.4S,v23.s[0] +sub v22.4s, v4.4s, v27.4s +ldr q30, [x0, #480] +str q22, [x0, #176] +sqrdmulh v22.4S, v20.4S, v13.s[0] +mul v20.4S, v20.4S,v10.s[0] +add v4.4s, v4.4s, v27.4s +ldr q27, [x17, #+320] +str q4, [x0, #160] +sqrdmulh v4.4S, v16.4S, v13.s[0] +mul v16.4S, v16.4S,v10.s[0] +sub v25.4s, v3.4s, v11.4s +ldr q28, [x17, #+336] +str q25, [x0, #144] +mla v15.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v7.4S, v28.s[0] +add v3.4s, v3.4s, v11.4s +ldr 
q11, [x17, #+352] +str q3, [x0, #128] +mla v21.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v29.4S, v28.s[0] +sub v3.4s, v1.4s, v8.4s +ldr q25, [x17, #+368] +str q3, [x0, #240] +mla v20.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v17.4S, v25.s[0] +add v1.4s, v1.4s, v8.4s +ldr q8, [x0, #272] +str q1, [x0, #224] +mla v16.4S, v4.4S, v31.s[0] +sqrdmulh v4.4S, v30.4S, v25.s[0] +sub v1.4s, v19.4s, v14.4s +ldr q3, [x0, #256] +str q1, [x0, #208] +mul v7.4S, v7.4S,v27.s[0] +mul v29.4S, v29.4S,v27.s[0] +add v19.4s, v19.4s, v14.4s +ldr q14, [x0, #336] +str q19, [x0, #192] +mla v7.4S, v6.4S, v31.s[0] +mla v29.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v15.4s +ldr q6, [x0, #320] +add v8.4s, v8.4s, v15.4s +mul v17.4S, v17.4S,v11.s[0] +mul v30.4S, v30.4S,v11.s[0] +sub v15.4s, v3.4s, v21.4s +ldr q19, [x0, #400] +add v3.4s, v3.4s, v21.4s +mla v17.4S, v22.4S, v31.s[0] +mla v30.4S, v4.4S, v31.s[0] +sub v4.4s, v14.4s, v20.4s +ldr q22, [x0, #384] +add v14.4s, v14.4s, v20.4s +sqrdmulh v20.4S, v9.4S, v5.s[2] +mul v9.4S, v9.4S,v23.s[2] +sub v21.4s, v6.4s, v16.4s +ldr q1, [x0, #464] +add v6.4s, v6.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v5.s[1] +mul v8.4S, v8.4S,v23.s[1] +sub v26.4s, v19.4s, v7.4s +ldr q2, [x0, #448] +add v19.4s, v19.4s, v7.4s +sqrdmulh v5.4S, v4.4S, v13.s[2] +mul v4.4S, v4.4S,v10.s[2] +sub v7.4s, v22.4s, v29.4s +ldr q23, [x0, #560] +add v22.4s, v22.4s, v29.4s +sqrdmulh v29.4S, v14.4S, v13.s[1] +mul v14.4S, v14.4S,v10.s[1] +sub v18.4s, v1.4s, v17.4s +ldr q24, [x0, #544] +add v1.4s, v1.4s, v17.4s +mla v9.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v26.4S, v28.s[2] +sub v13.4s, v2.4s, v30.4s +ldr q17, [x0, #624] +add v2.4s, v2.4s, v30.4s +mla v8.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v19.4S, v28.s[1] +sub v30.4s, v15.4s, v9.4s +ldr q10, [x0, #608] +str q30, [x0, #304] +mla v4.4S, v5.4S, v31.s[0] +sqrdmulh v5.4S, v18.4S, v25.s[2] +add v15.4s, v15.4s, v9.4s +ldr q9, [x17, #+384] +str q15, [x0, #288] +mla v14.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v1.4S, v25.s[1] +sub v15.4s, v3.4s, v8.4s +ldr q30, 
[x17, #+400] +str q15, [x0, #272] +mul v26.4S, v26.4S,v27.s[2] +mul v19.4S, v19.4S,v27.s[1] +add v3.4s, v3.4s, v8.4s +ldr q8, [x17, #+416] +str q3, [x0, #256] +mla v26.4S, v20.4S, v31.s[0] +mla v19.4S, v16.4S, v31.s[0] +sub v16.4s, v21.4s, v4.4s +ldr q20, [x17, #+432] +str q16, [x0, #368] +mul v18.4S, v18.4S,v11.s[2] +mul v1.4S, v1.4S,v11.s[1] +add v21.4s, v21.4s, v4.4s +ldr q4, [x0, #688] +str q21, [x0, #352] +mla v18.4S, v5.4S, v31.s[0] +mla v1.4S, v29.4S, v31.s[0] +sub v29.4s, v6.4s, v14.4s +ldr q5, [x0, #672] +str q29, [x0, #336] +sqrdmulh v25.4S, v23.4S, v30.s[0] +mul v23.4S, v23.4S,v9.s[0] +add v6.4s, v6.4s, v14.4s +ldr q14, [x0, #752] +str q6, [x0, #320] +sqrdmulh v6.4S, v24.4S, v30.s[0] +mul v24.4S, v24.4S,v9.s[0] +sub v29.4s, v7.4s, v26.4s +ldr q11, [x0, #736] +str q29, [x0, #432] +sqrdmulh v29.4S, v17.4S, v20.s[0] +mul v17.4S, v17.4S,v8.s[0] +add v7.4s, v7.4s, v26.4s +ldr q26, [x17, #+448] +str q7, [x0, #416] +sqrdmulh v7.4S, v10.4S, v20.s[0] +mul v10.4S, v10.4S,v8.s[0] +sub v21.4s, v22.4s, v19.4s +ldr q28, [x17, #+464] +str q21, [x0, #400] +mla v23.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v4.4S, v28.s[0] +add v22.4s, v22.4s, v19.4s +ldr q19, [x17, #+480] +str q22, [x0, #384] +mla v24.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v5.4S, v28.s[0] +sub v22.4s, v13.4s, v18.4s +ldr q21, [x17, #+496] +str q22, [x0, #496] +mla v17.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v14.4S, v21.s[0] +add v13.4s, v13.4s, v18.4s +ldr q18, [x0, #528] +str q13, [x0, #480] +mla v10.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v11.4S, v21.s[0] +sub v13.4s, v2.4s, v1.4s +ldr q22, [x0, #512] +str q13, [x0, #464] +mul v4.4S, v4.4S,v26.s[0] +mul v5.4S, v5.4S,v26.s[0] +add v2.4s, v2.4s, v1.4s +ldr q1, [x0, #592] +str q2, [x0, #448] +mla v4.4S, v25.4S, v31.s[0] +mla v5.4S, v6.4S, v31.s[0] +sub v6.4s, v18.4s, v23.4s +ldr q25, [x0, #576] +add v18.4s, v18.4s, v23.4s +mul v14.4S, v14.4S,v19.s[0] +mul v11.4S, v11.4S,v19.s[0] +sub v23.4s, v22.4s, v24.4s +ldr q2, [x0, #656] +add v22.4s, v22.4s, v24.4s +mla 
v14.4S, v29.4S, v31.s[0] +mla v11.4S, v7.4S, v31.s[0] +sub v7.4s, v1.4s, v17.4s +ldr q29, [x0, #640] +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v6.4S, v30.s[2] +mul v6.4S, v6.4S,v9.s[2] +sub v24.4s, v25.4s, v10.4s +ldr q13, [x0, #720] +add v25.4s, v25.4s, v10.4s +sqrdmulh v10.4S, v18.4S, v30.s[1] +mul v18.4S, v18.4S,v9.s[1] +sub v16.4s, v2.4s, v4.4s +ldr q27, [x0, #704] +add v2.4s, v2.4s, v4.4s +sqrdmulh v30.4S, v7.4S, v20.s[2] +mul v7.4S, v7.4S,v8.s[2] +sub v4.4s, v29.4s, v5.4s +ldr q9, [x0, #816] +add v29.4s, v29.4s, v5.4s +sqrdmulh v5.4S, v1.4S, v20.s[1] +mul v1.4S, v1.4S,v8.s[1] +sub v3.4s, v13.4s, v14.4s +ldr q15, [x0, #800] +add v13.4s, v13.4s, v14.4s +mla v6.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v16.4S, v28.s[2] +sub v20.4s, v27.4s, v11.4s +ldr q14, [x0, #880] +add v27.4s, v27.4s, v11.4s +mla v18.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v2.4S, v28.s[1] +sub v11.4s, v23.4s, v6.4s +ldr q8, [x0, #864] +str q11, [x0, #560] +mla v7.4S, v30.4S, v31.s[0] +sqrdmulh v30.4S, v3.4S, v21.s[2] +add v23.4s, v23.4s, v6.4s +ldr q6, [x17, #+512] +str q23, [x0, #544] +mla v1.4S, v5.4S, v31.s[0] +sqrdmulh v5.4S, v13.4S, v21.s[1] +sub v23.4s, v22.4s, v18.4s +ldr q11, [x17, #+528] +str q23, [x0, #528] +mul v16.4S, v16.4S,v26.s[2] +mul v2.4S, v2.4S,v26.s[1] +add v22.4s, v22.4s, v18.4s +ldr q18, [x17, #+544] +str q22, [x0, #512] +mla v16.4S, v17.4S, v31.s[0] +mla v2.4S, v10.4S, v31.s[0] +sub v10.4s, v24.4s, v7.4s +ldr q17, [x17, #+560] +str q10, [x0, #624] +mul v3.4S, v3.4S,v19.s[2] +mul v13.4S, v13.4S,v19.s[1] +add v24.4s, v24.4s, v7.4s +ldr q7, [x0, #944] +str q24, [x0, #608] +mla v3.4S, v30.4S, v31.s[0] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v25.4s, v1.4s +ldr q30, [x0, #928] +str q5, [x0, #592] +sqrdmulh v21.4S, v9.4S, v11.s[0] +mul v9.4S, v9.4S,v6.s[0] +add v25.4s, v25.4s, v1.4s +ldr q1, [x0, #1008] +str q25, [x0, #576] +sqrdmulh v25.4S, v15.4S, v11.s[0] +mul v15.4S, v15.4S,v6.s[0] +sub v5.4s, v4.4s, v16.4s +ldr q19, [x0, #992] +str q5, [x0, #688] +sqrdmulh v5.4S, v14.4S, 
v17.s[0] +mul v14.4S, v14.4S,v18.s[0] +add v4.4s, v4.4s, v16.4s +ldr q16, [x17, #+576] +str q4, [x0, #672] +sqrdmulh v4.4S, v8.4S, v17.s[0] +mul v8.4S, v8.4S,v18.s[0] +sub v24.4s, v29.4s, v2.4s +ldr q28, [x17, #+592] +str q24, [x0, #656] +mla v9.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v7.4S, v28.s[0] +add v29.4s, v29.4s, v2.4s +ldr q2, [x17, #+608] +str q29, [x0, #640] +mla v15.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v30.4S, v28.s[0] +sub v29.4s, v20.4s, v3.4s +ldr q24, [x17, #+624] +str q29, [x0, #752] +mla v14.4S, v5.4S, v31.s[0] +sqrdmulh v5.4S, v1.4S, v24.s[0] +add v20.4s, v20.4s, v3.4s +ldr q3, [x0, #784] +str q20, [x0, #736] +mla v8.4S, v4.4S, v31.s[0] +sqrdmulh v4.4S, v19.4S, v24.s[0] +sub v20.4s, v27.4s, v13.4s +ldr q29, [x0, #768] +str q20, [x0, #720] +mul v7.4S, v7.4S,v16.s[0] +mul v30.4S, v30.4S,v16.s[0] +add v27.4s, v27.4s, v13.4s +ldr q13, [x0, #848] +str q27, [x0, #704] +mla v7.4S, v21.4S, v31.s[0] +mla v30.4S, v25.4S, v31.s[0] +sub v25.4s, v3.4s, v9.4s +ldr q21, [x0, #832] +add v3.4s, v3.4s, v9.4s +mul v1.4S, v1.4S,v2.s[0] +mul v19.4S, v19.4S,v2.s[0] +sub v9.4s, v29.4s, v15.4s +ldr q27, [x0, #912] +add v29.4s, v29.4s, v15.4s +mla v1.4S, v5.4S, v31.s[0] +mla v19.4S, v4.4S, v31.s[0] +sub v4.4s, v13.4s, v14.4s +ldr q5, [x0, #896] +add v13.4s, v13.4s, v14.4s +sqrdmulh v14.4S, v25.4S, v11.s[2] +mul v25.4S, v25.4S,v6.s[2] +sub v15.4s, v21.4s, v8.4s +ldr q20, [x0, #976] +add v21.4s, v21.4s, v8.4s +sqrdmulh v8.4S, v3.4S, v11.s[1] +mul v3.4S, v3.4S,v6.s[1] +sub v10.4s, v27.4s, v7.4s +ldr q26, [x0, #960] +add v27.4s, v27.4s, v7.4s +sqrdmulh v11.4S, v4.4S, v17.s[2] +mul v4.4S, v4.4S,v18.s[2] +sub v7.4s, v5.4s, v30.4s +add v5.4s, v5.4s, v30.4s +sqrdmulh v30.4S, v13.4S, v17.s[1] +mul v13.4S, v13.4S,v18.s[1] +sub v6.4s, v20.4s, v1.4s +add v20.4s, v20.4s, v1.4s +mla v25.4S, v14.4S, v31.s[0] +sqrdmulh v14.4S, v10.4S, v28.s[2] +sub v17.4s, v26.4s, v19.4s +add v26.4s, v26.4s, v19.4s +mla v3.4S, v8.4S, v31.s[0] +sqrdmulh v8.4S, v27.4S, v28.s[1] +sub v19.4s, v9.4s, v25.4s 
+str q19, [x0, #816] +mla v4.4S, v11.4S, v31.s[0] +sqrdmulh v11.4S, v6.4S, v24.s[2] +add v9.4s, v9.4s, v25.4s +str q9, [x0, #800] +mla v13.4S, v30.4S, v31.s[0] +sqrdmulh v30.4S, v20.4S, v24.s[1] +sub v9.4s, v29.4s, v3.4s +str q9, [x0, #784] +mul v10.4S, v10.4S,v16.s[2] +mul v27.4S, v27.4S,v16.s[1] +add v29.4s, v29.4s, v3.4s +str q29, [x0, #768] +mla v10.4S, v14.4S, v31.s[0] +mla v27.4S, v8.4S, v31.s[0] +sub v8.4s, v15.4s, v4.4s +str q8, [x0, #880] +mul v6.4S, v6.4S,v2.s[2] +mul v20.4S, v20.4S,v2.s[1] +add v15.4s, v15.4s, v4.4s +str q15, [x0, #864] +mla v6.4S, v11.4S, v31.s[0] +mla v20.4S, v30.4S, v31.s[0] +sub v30.4s, v21.4s, v13.4s +str q30, [x0, #848] +add v21.4s, v21.4s, v13.4s +str q21, [x0, #832] +sub v21.4s, v7.4s, v10.4s +str q21, [x0, #944] +add v7.4s, v7.4s, v10.4s +str q7, [x0, #928] +sub v7.4s, v5.4s, v27.4s +str q7, [x0, #912] +add v5.4s, v5.4s, v27.4s +str q5, [x0, #896] +sub v5.4s, v17.4s, v6.4s +str q5, [x0, #1008] +add v17.4s, v17.4s, v6.4s +str q17, [x0, #992] +sub v17.4s, v26.4s, v20.4s +str q17, [x0, #976] +add v26.4s, v26.4s, v20.4s +str q26, [x0, #960] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1520 +// Instruction count: 1516 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_13.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_13.s new file mode 100644 index 0000000..e12e06a --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_13.s @@ -0,0 +1,1550 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person 
obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: // four words read as one vector: the routine below loads them into q31 and passes v31.s[0] to every mla +.word -33556993 // stored negated; presumably so that mla by v31.s[0] subtracts a multiple of 33556993 (TODO confirm against the generator) +.word 0 +.word 0 +.word 0 +.align 6 // 2^6 = 64-byte alignment for the table that follows (power-of-two form of .align on Arm targets) +roots_merged: // laid out in 32-byte pairs: the first four words are loaded as the mul operand, the next four (same Layer/block tags) as the sqrdmulh operand; the second value looks like round(first * 2^31 / 33556993) (TODO confirm); entries tagged Layer None are zero and appear to be filler lanes +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global
ntt_u32_incomplete_neon_asm_var_4_2_22_z4_13 +.global _ntt_u32_incomplete_neon_asm_var_4_2_22_z4_13 +ntt_u32_incomplete_neon_asm_var_4_2_22_z4_13: +_ntt_u32_incomplete_neon_asm_var_4_2_22_z4_13: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x0, #992] +sqrdmulh v27.4S, v28.4S, v29.s[0] +mul v28.4S, v28.4S,v30.s[0] +ldr q26, [x0, #928] +sqrdmulh v25.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +ldr q24, [x0, #864] +sqrdmulh v23.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +ldr q22, [x0, #800] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #736] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mla v28.4S, v27.4S, v31.s[0] +ldr q27, [x0, #672] +sqrdmulh v18.4S, v27.4S, v29.s[0] +mla v26.4S, v25.4S, v31.s[0] +ldr q25, [x0, #608] +sqrdmulh v17.4S, v25.4S, v29.s[0] +mla v24.4S, v23.4S, v31.s[0] +ldr q23, [x0, #544] +sqrdmulh v16.4S, v23.4S, v29.s[0] +mla v22.4S, v21.4S, v31.s[0] +ldr q21, [x0, #480] +mul v27.4S, v27.4S,v30.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q3, [x0, #416] +ldr q2, [x0, #352] +ldr q1, [x0, #288] +mla v27.4S, v18.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +ldr q19, [x0, #224] +ldr q18, [x0, #160] +mul v23.4S, v23.4S,v30.s[0] +mul v25.4S, v25.4S,v30.s[0] +ldr q0, [x0, #96] +ldr q15, [x0, #32] +mla v23.4S, v16.4S, v31.s[0] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +sqrdmulh v28.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v16.4s, v3.4s, v26.4s +add v3.4s, v3.4s, v26.4s +sqrdmulh v26.4S, v16.4S, v29.s[2] +mul v16.4S, 
v16.4S,v30.s[2] +sub v14.4s, v2.4s, v24.4s +add v2.4s, v2.4s, v24.4s +sqrdmulh v24.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v13.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v12.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +sqrdmulh v20.4S, v14.4S, v29.s[2] +mla v17.4S, v28.4S, v31.s[0] +sub v28.4s, v18.4s, v27.4s +add v18.4s, v18.4s, v27.4s +sqrdmulh v27.4S, v13.4S, v29.s[2] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v0.4s, v25.4s +add v0.4s, v0.4s, v25.4s +sqrdmulh v25.4S, v2.4S, v29.s[1] +mla v21.4S, v24.4S, v31.s[0] +sub v24.4s, v15.4s, v23.4s +sqrdmulh v11.4S, v1.4S, v29.s[1] +mla v3.4S, v22.4S, v31.s[0] +add v15.4s, v15.4s, v23.4s +ldr q23, [x17, #+32] +ldr q22, [x17, #+48] +mul v13.4S, v13.4S,v30.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v10.4s, v12.4s, v17.4s +add v12.4s, v12.4s, v17.4s +mla v13.4S, v27.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +sub v20.4s, v28.4s, v16.4s +add v28.4s, v28.4s, v16.4s +mul v1.4S, v1.4S,v30.s[1] +mul v2.4S, v2.4S,v30.s[1] +sub v16.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +mla v1.4S, v11.4S, v31.s[0] +mla v2.4S, v25.4S, v31.s[0] +sub v25.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v22.s[3] +mul v10.4S, v10.4S,v23.s[3] +sub v11.4s, v26.4s, v14.4s +add v26.4s, v26.4s, v14.4s +sqrdmulh v14.4S, v12.4S, v22.s[2] +mul v12.4S, v12.4S,v23.s[2] +sub v21.4s, v24.4s, v13.4s +add v24.4s, v24.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v22.s[1] +mul v16.4S, v16.4S,v23.s[1] +sub v27.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v17.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s +ldr q1, [x17, #+96] +ldr q9, [x17, #+112] +sqrdmulh v8.4S, v20.4S, v22.s[3] +mla v10.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v28.4S, v22.s[2] +mla v12.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v25.4S, v22.s[1] +mla v16.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v18.4S, v22.s[0] +mla 
v19.4S, v2.4S, v31.s[0] +nop +nop +ldr q2, [x17, #+64] +ldr q7, [x17, #+80] +mul v28.4S, v28.4S,v23.s[2] +mul v20.4S, v20.4S,v23.s[3] +sub v6.4s, v11.4s, v10.4s +add v11.4s, v11.4s, v10.4s +mla v28.4S, v3.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v26.4s, v12.4s +add v26.4s, v26.4s, v12.4s +mul v18.4S, v18.4S,v23.s[0] +mul v25.4S, v25.4S,v23.s[1] +sub v12.4s, v27.4s, v16.4s +add v27.4s, v27.4s, v16.4s +mla v18.4S, v13.4S, v31.s[0] +mla v25.4S, v14.4S, v31.s[0] +sub v14.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v9.s[3] +mul v6.4S, v6.4S,v1.s[3] +sub v13.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v11.4S, v9.s[2] +mul v11.4S, v11.4S,v1.s[2] +sub v16.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +sqrdmulh v28.4S, v8.4S, v9.s[1] +mul v8.4S, v8.4S,v1.s[1] +sub v3.4s, v17.4s, v25.4s +add v17.4s, v17.4s, v25.4s +sqrdmulh v25.4S, v26.4S, v9.s[0] +mul v26.4S, v26.4S,v1.s[0] +sub v10.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v7.s[3] +mla v6.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v27.4S, v7.s[2] +mla v11.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v14.4S, v7.s[1] +mla v8.4S, v28.4S, v31.s[0] +nop +nop +sqrdmulh v28.4S, v0.4S, v7.s[0] +mla v26.4S, v25.4S, v31.s[0] +nop +nop +mul v27.4S, v27.4S,v2.s[2] +mul v12.4S, v12.4S,v2.s[3] +sub v25.4s, v13.4s, v6.4s +str q25, [x0, #992] +mla v27.4S, v19.4S, v31.s[0] +mla v12.4S, v18.4S, v31.s[0] +add v13.4s, v13.4s, v6.4s +str q13, [x0, #928] +mul v0.4S, v0.4S,v2.s[0] +mul v14.4S, v14.4S,v2.s[1] +sub v13.4s, v21.4s, v11.4s +str q13, [x0, #864] +mla v0.4S, v28.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sub v11.4s, v16.4s, v8.4s +ldr q20, [x0, #1008] +sqrdmulh v28.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v8.4s +str q21, [x0, #800] +ldr q21, [x0, #944] +sqrdmulh v8.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v13.4s, v24.4s, v26.4s +str q11, [x0, #736] +ldr q11, [x0, 
#880] +sqrdmulh v6.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +add v24.4s, v24.4s, v26.4s +str q16, [x0, #672] +ldr q16, [x0, #816] +sqrdmulh v26.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v18.4s, v3.4s, v12.4s +str q13, [x0, #608] +ldr q13, [x0, #752] +sqrdmulh v19.4S, v13.4S, v29.s[0] +mla v20.4S, v28.4S, v31.s[0] +add v3.4s, v3.4s, v12.4s +str q24, [x0, #544] +ldr q24, [x0, #688] +sqrdmulh v12.4S, v24.4S, v29.s[0] +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v17.4s, v27.4s +str q18, [x0, #480] +ldr q18, [x0, #624] +sqrdmulh v28.4S, v18.4S, v29.s[0] +mla v11.4S, v6.4S, v31.s[0] +add v17.4s, v17.4s, v27.4s +str q3, [x0, #416] +ldr q3, [x0, #560] +sqrdmulh v27.4S, v3.4S, v29.s[0] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v10.4s, v14.4s +str q8, [x0, #352] +ldr q8, [x0, #496] +add v10.4s, v10.4s, v14.4s +mul v24.4S, v24.4S,v30.s[0] +mul v13.4S, v13.4S,v30.s[0] +ldr q14, [x0, #432] +str q17, [x0, #288] +ldr q17, [x0, #368] +ldr q6, [x0, #304] +mla v24.4S, v12.4S, v31.s[0] +mla v13.4S, v19.4S, v31.s[0] +str q26, [x0, #224] +sub v26.4s, v15.4s, v0.4s +ldr q19, [x0, #240] +ldr q12, [x0, #176] +mul v3.4S, v3.4S,v30.s[0] +mul v18.4S, v18.4S,v30.s[0] +str q10, [x0, #160] +add v15.4s, v15.4s, v0.4s +ldr q0, [x0, #112] +ldr q10, [x0, #48] +mla v3.4S, v27.4S, v31.s[0] +mla v18.4S, v28.4S, v31.s[0] +sub v28.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +sqrdmulh v20.4S, v28.4S, v29.s[2] +mul v28.4S, v28.4S,v30.s[2] +sub v27.4s, v14.4s, v21.4s +add v14.4s, v14.4s, v21.4s +sqrdmulh v21.4S, v27.4S, v29.s[2] +mul v27.4S, v27.4S,v30.s[2] +sub v25.4s, v17.4s, v11.4s +add v17.4s, v17.4s, v11.4s +sqrdmulh v11.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v5.4s, v6.4s, v16.4s +add v6.4s, v6.4s, v16.4s +sqrdmulh v16.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +sub v4.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +sqrdmulh v13.4S, v25.4S, v29.s[2] +mla v28.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v24.4s +add v12.4s, v12.4s, v24.4s +sqrdmulh v24.4S, v5.4S, 
v29.s[2] +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v18.4s +add v0.4s, v0.4s, v18.4s +sqrdmulh v18.4S, v17.4S, v29.s[1] +mla v8.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v3.4s +str q26, [x0, #96] +sqrdmulh v26.4S, v6.4S, v29.s[1] +mla v14.4S, v16.4S, v31.s[0] +add v10.4s, v10.4s, v3.4s +str q15, [x0, #32] +mul v5.4S, v5.4S,v30.s[2] +mul v25.4S, v25.4S,v30.s[2] +sub v15.4s, v4.4s, v28.4s +add v4.4s, v4.4s, v28.4s +mla v5.4S, v24.4S, v31.s[0] +mla v25.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v27.4s +add v20.4s, v20.4s, v27.4s +mul v6.4S, v6.4S,v30.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v27.4s, v19.4s, v8.4s +add v19.4s, v19.4s, v8.4s +mla v6.4S, v26.4S, v31.s[0] +mla v17.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v14.4s +add v12.4s, v12.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v22.s[3] +mul v15.4S, v15.4S,v23.s[3] +sub v26.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v4.4S, v22.s[2] +mul v4.4S, v4.4S,v23.s[2] +sub v8.4s, v11.4s, v5.4s +add v11.4s, v11.4s, v5.4s +sqrdmulh v5.4S, v27.4S, v22.s[1] +mul v27.4S, v27.4S,v23.s[1] +sub v24.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v28.4s, v10.4s, v6.4s +add v10.4s, v10.4s, v6.4s +sqrdmulh v6.4S, v13.4S, v22.s[3] +mla v15.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v20.4S, v22.s[2] +mla v4.4S, v25.4S, v31.s[0] +nop +nop +sqrdmulh v25.4S, v18.4S, v22.s[1] +mla v27.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v12.4S, v22.s[0] +mla v19.4S, v17.4S, v31.s[0] +nop +nop +mul v20.4S, v20.4S,v23.s[2] +mul v13.4S, v13.4S,v23.s[3] +sub v17.4s, v26.4s, v15.4s +add v26.4s, v26.4s, v15.4s +mla v20.4S, v14.4S, v31.s[0] +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v21.4s, v4.4s +add v21.4s, v21.4s, v4.4s +mul v12.4S, v12.4S,v23.s[0] +mul v18.4S, v18.4S,v23.s[1] +sub v4.4s, v24.4s, v27.4s +add v24.4s, v24.4s, v27.4s +mla v12.4S, v5.4S, v31.s[0] +mla v18.4S, v25.4S, v31.s[0] +sub v25.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, 
v17.4S, v9.s[3] +mul v17.4S, v17.4S,v1.s[3] +sub v5.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +sqrdmulh v13.4S, v26.4S, v9.s[2] +mul v26.4S, v26.4S,v1.s[2] +sub v27.4s, v11.4s, v20.4s +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v6.4S, v9.s[1] +mul v6.4S, v6.4S,v1.s[1] +sub v14.4s, v28.4s, v18.4s +add v28.4s, v28.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v9.s[0] +mul v21.4S, v21.4S,v1.s[0] +sub v15.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v4.4S, v7.s[3] +mla v17.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v24.4S, v7.s[2] +mla v26.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v25.4S, v7.s[1] +mla v6.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v0.4S, v7.s[0] +mla v21.4S, v18.4S, v31.s[0] +nop +nop +mul v24.4S, v24.4S,v2.s[2] +mul v4.4S, v4.4S,v2.s[3] +sub v18.4s, v5.4s, v17.4s +str q18, [x0, #1008] +mla v24.4S, v19.4S, v31.s[0] +mla v4.4S, v12.4S, v31.s[0] +add v5.4s, v5.4s, v17.4s +str q5, [x0, #944] +mul v0.4S, v0.4S,v2.s[0] +mul v25.4S, v25.4S,v2.s[1] +sub v5.4s, v8.4s, v26.4s +str q5, [x0, #880] +mla v0.4S, v20.4S, v31.s[0] +mla v25.4S, v13.4S, v31.s[0] +add v8.4s, v8.4s, v26.4s +sub v26.4s, v27.4s, v6.4s +ldr q13, [x0, #960] +sqrdmulh v20.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +add v27.4s, v27.4s, v6.4s +str q8, [x0, #816] +ldr q8, [x0, #896] +sqrdmulh v6.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v5.4s, v11.4s, v21.4s +str q26, [x0, #752] +ldr q26, [x0, #832] +sqrdmulh v17.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +add v11.4s, v11.4s, v21.4s +str q27, [x0, #688] +ldr q27, [x0, #768] +sqrdmulh v21.4S, v27.4S, v29.s[0] +mul v27.4S, v27.4S,v30.s[0] +sub v12.4s, v14.4s, v4.4s +str q5, [x0, #624] +ldr q5, [x0, #704] +sqrdmulh v19.4S, v5.4S, v29.s[0] +mla v13.4S, v20.4S, v31.s[0] +add v14.4s, v14.4s, v4.4s +str q11, [x0, #560] +ldr q11, [x0, #640] +sqrdmulh v4.4S, v11.4S, v29.s[0] +mla v8.4S, v6.4S, v31.s[0] +sub v6.4s, v28.4s, v24.4s +str q12, [x0, #496] +ldr q12, [x0, #576] +sqrdmulh v20.4S, v12.4S, 
v29.s[0] +mla v26.4S, v17.4S, v31.s[0] +add v28.4s, v28.4s, v24.4s +str q14, [x0, #432] +ldr q14, [x0, #512] +sqrdmulh v24.4S, v14.4S, v29.s[0] +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v25.4s +str q6, [x0, #368] +ldr q6, [x0, #448] +add v15.4s, v15.4s, v25.4s +mul v11.4S, v11.4S,v30.s[0] +mul v5.4S, v5.4S,v30.s[0] +ldr q25, [x0, #384] +str q28, [x0, #304] +ldr q28, [x0, #320] +ldr q17, [x0, #256] +mla v11.4S, v4.4S, v31.s[0] +mla v5.4S, v19.4S, v31.s[0] +str q21, [x0, #240] +sub v21.4s, v10.4s, v0.4s +ldr q19, [x0, #192] +ldr q4, [x0, #128] +mul v14.4S, v14.4S,v30.s[0] +mul v12.4S, v12.4S,v30.s[0] +str q15, [x0, #176] +add v10.4s, v10.4s, v0.4s +ldr q0, [x0, #64] +ldr q15, [x0, #0] +mla v14.4S, v24.4S, v31.s[0] +mla v12.4S, v20.4S, v31.s[0] +sub v20.4s, v6.4s, v13.4s +add v6.4s, v6.4s, v13.4s +sqrdmulh v13.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v24.4s, v25.4s, v8.4s +add v25.4s, v25.4s, v8.4s +sqrdmulh v8.4S, v24.4S, v29.s[2] +mul v24.4S, v24.4S,v30.s[2] +sub v18.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sqrdmulh v26.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +sub v3.4s, v17.4s, v27.4s +add v17.4s, v17.4s, v27.4s +sqrdmulh v27.4S, v25.4S, v29.s[1] +mul v25.4S, v25.4S,v30.s[1] +sub v16.4s, v19.4s, v5.4s +add v19.4s, v19.4s, v5.4s +sqrdmulh v5.4S, v18.4S, v29.s[2] +mla v20.4S, v13.4S, v31.s[0] +sub v13.4s, v4.4s, v11.4s +add v4.4s, v4.4s, v11.4s +sqrdmulh v11.4S, v3.4S, v29.s[2] +mla v24.4S, v8.4S, v31.s[0] +sub v8.4s, v0.4s, v12.4s +add v0.4s, v0.4s, v12.4s +sqrdmulh v12.4S, v28.4S, v29.s[1] +mla v6.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v14.4s +str q21, [x0, #112] +sqrdmulh v21.4S, v17.4S, v29.s[1] +mla v25.4S, v27.4S, v31.s[0] +add v15.4s, v15.4s, v14.4s +str q10, [x0, #48] +mul v3.4S, v3.4S,v30.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v10.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +mla v3.4S, v11.4S, v31.s[0] +mla v18.4S, v5.4S, v31.s[0] +sub v5.4s, v13.4s, v24.4s +add v13.4s, v13.4s, v24.4s +mul v17.4S, v17.4S,v30.s[1] 
+mul v28.4S, v28.4S,v30.s[1] +sub v24.4s, v19.4s, v6.4s +add v19.4s, v19.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v28.4S, v12.4S, v31.s[0] +sub v12.4s, v4.4s, v25.4s +add v4.4s, v4.4s, v25.4s +sqrdmulh v25.4S, v10.4S, v22.s[3] +mul v10.4S, v10.4S,v23.s[3] +sub v21.4s, v8.4s, v18.4s +add v8.4s, v8.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v22.s[2] +mul v16.4S, v16.4S,v23.s[2] +sub v6.4s, v26.4s, v3.4s +add v26.4s, v26.4s, v3.4s +sqrdmulh v3.4S, v24.4S, v22.s[1] +mul v24.4S, v24.4S,v23.s[1] +sub v11.4s, v0.4s, v28.4s +add v0.4s, v0.4s, v28.4s +sqrdmulh v28.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v20.4s, v15.4s, v17.4s +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v22.s[3] +mla v10.4S, v25.4S, v31.s[0] +nop +nop +sqrdmulh v25.4S, v13.4S, v22.s[2] +mla v16.4S, v18.4S, v31.s[0] +nop +nop +sqrdmulh v18.4S, v12.4S, v22.s[1] +mla v24.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v4.4S, v22.s[0] +mla v19.4S, v28.4S, v31.s[0] +nop +nop +mul v13.4S, v13.4S,v23.s[2] +mul v5.4S, v5.4S,v23.s[3] +sub v28.4s, v21.4s, v10.4s +add v21.4s, v21.4s, v10.4s +mla v13.4S, v25.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +sub v17.4s, v8.4s, v16.4s +add v8.4s, v8.4s, v16.4s +mul v4.4S, v4.4S,v23.s[0] +mul v12.4S, v12.4S,v23.s[1] +sub v16.4s, v11.4s, v24.4s +add v11.4s, v11.4s, v24.4s +mla v4.4S, v3.4S, v31.s[0] +mla v12.4S, v18.4S, v31.s[0] +sub v18.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v28.4S, v9.s[3] +mul v28.4S, v28.4S,v1.s[3] +sub v3.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v21.4S, v9.s[2] +mul v21.4S, v21.4S,v1.s[2] +sub v24.4s, v26.4s, v13.4s +add v26.4s, v26.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v9.s[1] +mul v17.4S, v17.4S,v1.s[1] +sub v25.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +sqrdmulh v12.4S, v8.4S, v9.s[0] +mul v8.4S, v8.4S,v1.s[0] +sub v10.4s, v15.4s, v4.4s +add v15.4s, v15.4s, v4.4s +sqrdmulh v4.4S, v16.4S, v7.s[3] +mla v28.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v11.4S, v7.s[2] +mla v21.4S, v5.4S, 
v31.s[0] +nop +nop +sqrdmulh v5.4S, v18.4S, v7.s[1] +mla v17.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v0.4S, v7.s[0] +mla v8.4S, v12.4S, v31.s[0] +nop +nop +mul v11.4S, v11.4S,v2.s[2] +mul v16.4S, v16.4S,v2.s[3] +sub v12.4s, v3.4s, v28.4s +str q12, [x0, #960] +mla v11.4S, v19.4S, v31.s[0] +mla v16.4S, v4.4S, v31.s[0] +add v3.4s, v3.4s, v28.4s +str q3, [x0, #896] +mul v0.4S, v0.4S,v2.s[0] +mul v18.4S, v18.4S,v2.s[1] +sub v3.4s, v6.4s, v21.4s +str q3, [x0, #832] +mla v0.4S, v13.4S, v31.s[0] +mla v18.4S, v5.4S, v31.s[0] +add v6.4s, v6.4s, v21.4s +sub v21.4s, v24.4s, v17.4s +ldr q5, [x0, #976] +sqrdmulh v13.4S, v5.4S, v29.s[0] +mul v5.4S, v5.4S,v30.s[0] +add v24.4s, v24.4s, v17.4s +str q6, [x0, #768] +ldr q6, [x0, #912] +sqrdmulh v17.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v3.4s, v26.4s, v8.4s +str q21, [x0, #704] +ldr q21, [x0, #848] +sqrdmulh v28.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +add v26.4s, v26.4s, v8.4s +str q24, [x0, #640] +ldr q24, [x0, #784] +sqrdmulh v8.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +sub v4.4s, v25.4s, v16.4s +str q3, [x0, #576] +ldr q3, [x0, #720] +sqrdmulh v19.4S, v3.4S, v29.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v25.4s, v25.4s, v16.4s +str q26, [x0, #512] +ldr q26, [x0, #656] +sqrdmulh v16.4S, v26.4S, v29.s[0] +mla v6.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v11.4s +str q4, [x0, #448] +ldr q4, [x0, #592] +sqrdmulh v13.4S, v4.4S, v29.s[0] +mla v21.4S, v28.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q25, [x0, #384] +ldr q25, [x0, #528] +sqrdmulh v11.4S, v25.4S, v29.s[0] +mla v24.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v18.4s +str q17, [x0, #320] +ldr q17, [x0, #464] +add v10.4s, v10.4s, v18.4s +mul v26.4S, v26.4S,v30.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q18, [x0, #400] +str q20, [x0, #256] +ldr q20, [x0, #336] +ldr q28, [x0, #272] +mla v26.4S, v16.4S, v31.s[0] +mla v3.4S, v19.4S, v31.s[0] +str q8, [x0, #192] +sub v8.4s, v15.4s, v0.4s +ldr q19, [x0, #208] +ldr q16, [x0, #144] +mul v25.4S, 
v25.4S,v30.s[0] +mul v4.4S, v4.4S,v30.s[0] +str q10, [x0, #128] +add v15.4s, v15.4s, v0.4s +ldr q0, [x0, #80] +ldr q10, [x0, #16] +mla v25.4S, v11.4S, v31.s[0] +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v5.4s +add v17.4s, v17.4s, v5.4s +sqrdmulh v5.4S, v13.4S, v29.s[2] +mul v13.4S, v13.4S,v30.s[2] +sub v11.4s, v18.4s, v6.4s +add v18.4s, v18.4s, v6.4s +sqrdmulh v6.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v12.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v28.4s, v24.4s +add v28.4s, v28.4s, v24.4s +sqrdmulh v24.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v27.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sqrdmulh v3.4S, v12.4S, v29.s[2] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v16.4s, v26.4s +add v16.4s, v16.4s, v26.4s +sqrdmulh v26.4S, v14.4S, v29.s[2] +mla v11.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v4.4s +add v0.4s, v0.4s, v4.4s +sqrdmulh v4.4S, v20.4S, v29.s[1] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v25.4s +str q8, [x0, #64] +sqrdmulh v8.4S, v28.4S, v29.s[1] +mla v18.4S, v24.4S, v31.s[0] +add v10.4s, v10.4s, v25.4s +str q15, [x0, #0] +mul v14.4S, v14.4S,v30.s[2] +mul v12.4S, v12.4S,v30.s[2] +sub v15.4s, v27.4s, v13.4s +add v27.4s, v27.4s, v13.4s +mla v14.4S, v26.4S, v31.s[0] +mla v12.4S, v3.4S, v31.s[0] +sub v3.4s, v5.4s, v11.4s +add v5.4s, v5.4s, v11.4s +mul v28.4S, v28.4S,v30.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v11.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +mla v28.4S, v8.4S, v31.s[0] +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v16.4s, v18.4s +add v16.4s, v16.4s, v18.4s +sqrdmulh v29.4S, v15.4S, v22.s[3] +mul v15.4S, v15.4S,v23.s[3] +sub v30.4s, v6.4s, v12.4s +add v6.4s, v6.4s, v12.4s +sqrdmulh v12.4S, v27.4S, v22.s[2] +mul v27.4S, v27.4S,v23.s[2] +sub v18.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v22.s[1] +mul v11.4S, v11.4S,v23.s[1] +sub v8.4s, v0.4s, v20.4s +add v0.4s, v0.4s, v20.4s +sqrdmulh v20.4S, 
v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v17.4s, v10.4s, v28.4s +add v10.4s, v10.4s, v28.4s +sqrdmulh v28.4S, v3.4S, v22.s[3] +mla v15.4S, v29.4S, v31.s[0] +nop +nop +sqrdmulh v29.4S, v5.4S, v22.s[2] +mla v27.4S, v12.4S, v31.s[0] +nop +nop +sqrdmulh v12.4S, v4.4S, v22.s[1] +mla v11.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v16.4S, v22.s[0] +mla v19.4S, v20.4S, v31.s[0] +nop +nop +mul v5.4S, v5.4S,v23.s[2] +mul v3.4S, v3.4S,v23.s[3] +sub v20.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +mla v5.4S, v29.4S, v31.s[0] +mla v3.4S, v28.4S, v31.s[0] +sub v28.4s, v6.4s, v27.4s +add v6.4s, v6.4s, v27.4s +mul v16.4S, v16.4S,v23.s[0] +mul v4.4S, v4.4S,v23.s[1] +sub v27.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v16.4S, v14.4S, v31.s[0] +mla v4.4S, v12.4S, v31.s[0] +sub v12.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v22.4S, v20.4S, v9.s[3] +mul v20.4S, v20.4S,v1.s[3] +sub v23.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v30.4S, v9.s[2] +mul v30.4S, v30.4S,v1.s[2] +sub v19.4s, v21.4s, v5.4s +add v21.4s, v21.4s, v5.4s +sqrdmulh v5.4S, v28.4S, v9.s[1] +mul v28.4S, v28.4S,v1.s[1] +sub v14.4s, v17.4s, v4.4s +add v17.4s, v17.4s, v4.4s +sqrdmulh v4.4S, v6.4S, v9.s[0] +mul v6.4S, v6.4S,v1.s[0] +sub v11.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sqrdmulh v9.4S, v27.4S, v7.s[3] +mla v20.4S, v22.4S, v31.s[0] +nop +nop +sqrdmulh v22.4S, v8.4S, v7.s[2] +mla v30.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v12.4S, v7.s[1] +mla v28.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v0.4S, v7.s[0] +mla v6.4S, v4.4S, v31.s[0] +nop +nop +mul v8.4S, v8.4S,v2.s[2] +mul v27.4S, v27.4S,v2.s[3] +sub v4.4s, v23.4s, v20.4s +str q4, [x0, #976] +mla v8.4S, v22.4S, v31.s[0] +mla v27.4S, v9.4S, v31.s[0] +add v23.4s, v23.4s, v20.4s +str q23, [x0, #912] +mul v0.4S, v0.4S,v2.s[0] +mul v12.4S, v12.4S,v2.s[1] +sub v23.4s, v18.4s, v30.4s +str q23, [x0, #848] +mla v0.4S, v5.4S, v31.s[0] +mla v12.4S, v3.4S, v31.s[0] +add v18.4s, v18.4s, v30.4s +sub v30.4s, 
v19.4s, v28.4s +add v19.4s, v19.4s, v28.4s +str q18, [x0, #784] +sub v18.4s, v21.4s, v6.4s +str q30, [x0, #720] +add v21.4s, v21.4s, v6.4s +str q19, [x0, #656] +sub v19.4s, v14.4s, v27.4s +str q18, [x0, #592] +add v14.4s, v14.4s, v27.4s +str q21, [x0, #528] +sub v21.4s, v17.4s, v8.4s +str q19, [x0, #464] +add v17.4s, v17.4s, v8.4s +str q14, [x0, #400] +sub v14.4s, v11.4s, v12.4s +str q21, [x0, #336] +add v11.4s, v11.4s, v12.4s +str q17, [x0, #272] +sub v17.4s, v10.4s, v0.4s +add v10.4s, v10.4s, v0.4s +ldr q24, [x0, #32] +ldr q25, [x0, #48] +ldr q13, [x0, #96] +ldr q26, [x0, #112] +ldr q15, [x17, #+128] +ldr q29, [x17, #+144] +ldr q16, [x17, #+160] +ldr q1, [x17, #+176] +ldr q4, [x0, #160] +ldr q22, [x0, #176] +sqrdmulh v9.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v15.s[0] +ldr q20, [x0, #224] +sqrdmulh v23.4S, v25.4S, v29.s[0] +mul v25.4S, v25.4S,v15.s[0] +ldr q5, [x0, #240] +sqrdmulh v3.4S, v13.4S, v1.s[0] +mul v13.4S, v13.4S,v16.s[0] +ldr q2, [x17, #+192] +sqrdmulh v7.4S, v26.4S, v1.s[0] +mul v26.4S, v26.4S,v16.s[0] +ldr q28, [x17, #+208] +mla v24.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v4.4S, v28.s[0] +ldr q30, [x17, #+224] +mla v25.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v22.4S, v28.s[0] +ldr q6, [x17, #+240] +mla v13.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v20.4S, v6.s[0] +ldr q18, [x0, #0] +mla v26.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v5.4S, v6.s[0] +mul v4.4S, v4.4S,v2.s[0] +mul v22.4S, v22.4S,v2.s[0] +sub v27.4s, v18.4s, v24.4s +ldr q19, [x0, #64] +add v18.4s, v18.4s, v24.4s +mla v4.4S, v9.4S, v31.s[0] +mla v22.4S, v23.4S, v31.s[0] +sub v23.4s, v10.4s, v25.4s +add v10.4s, v10.4s, v25.4s +mul v20.4S, v20.4S,v30.s[0] +mul v5.4S, v5.4S,v30.s[0] +sub v25.4s, v19.4s, v13.4s +ldr q9, [x0, #128] +add v19.4s, v19.4s, v13.4s +mla v20.4S, v3.4S, v31.s[0] +mla v5.4S, v7.4S, v31.s[0] +sub v7.4s, v17.4s, v26.4s +add v17.4s, v17.4s, v26.4s +sqrdmulh v26.4S, v10.4S, v29.s[1] +mul v10.4S, v10.4S,v15.s[1] +sub v3.4s, v9.4s, v4.4s +ldr q13, [x0, #192] +add v9.4s, v9.4s, v4.4s 
+sqrdmulh v4.4S, v23.4S, v29.s[2] +mul v23.4S, v23.4S,v15.s[2] +sub v24.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v29.4S, v17.4S, v1.s[1] +mul v17.4S, v17.4S,v16.s[1] +sub v22.4s, v13.4s, v20.4s +ldr q15, [x0, #288] +add v13.4s, v13.4s, v20.4s +sqrdmulh v20.4S, v7.4S, v1.s[2] +mul v7.4S, v7.4S,v16.s[2] +sub v8.4s, v14.4s, v5.4s +ldr q21, [x0, #304] +add v14.4s, v14.4s, v5.4s +mla v10.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v11.4S, v28.s[1] +sub v1.4s, v18.4s, v10.4s +ldr q5, [x0, #352] +str q1, [x0, #16] +mla v23.4S, v4.4S, v31.s[0] +sqrdmulh v4.4S, v24.4S, v28.s[2] +add v18.4s, v18.4s, v10.4s +ldr q10, [x0, #368] +str q18, [x0, #0] +mla v17.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v14.4S, v6.s[1] +sub v18.4s, v27.4s, v23.4s +ldr q1, [x17, #+256] +str q18, [x0, #48] +mla v7.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v8.4S, v6.s[2] +add v27.4s, v27.4s, v23.4s +ldr q23, [x17, #+272] +str q27, [x0, #32] +mul v11.4S, v11.4S,v2.s[1] +mul v24.4S, v24.4S,v2.s[2] +sub v27.4s, v19.4s, v17.4s +ldr q18, [x17, #+288] +str q27, [x0, #80] +mla v11.4S, v26.4S, v31.s[0] +mla v24.4S, v4.4S, v31.s[0] +add v19.4s, v19.4s, v17.4s +ldr q17, [x17, #+304] +str q19, [x0, #64] +mul v14.4S, v14.4S,v30.s[1] +mul v8.4S, v8.4S,v30.s[2] +sub v28.4s, v25.4s, v7.4s +ldr q19, [x0, #416] +str q28, [x0, #112] +mla v14.4S, v29.4S, v31.s[0] +mla v8.4S, v20.4S, v31.s[0] +add v25.4s, v25.4s, v7.4s +ldr q7, [x0, #432] +str q25, [x0, #96] +sqrdmulh v6.4S, v15.4S, v23.s[0] +mul v15.4S, v15.4S,v1.s[0] +sub v25.4s, v9.4s, v11.4s +ldr q30, [x0, #480] +str q25, [x0, #144] +sqrdmulh v25.4S, v21.4S, v23.s[0] +mul v21.4S, v21.4S,v1.s[0] +add v9.4s, v9.4s, v11.4s +ldr q11, [x0, #496] +str q9, [x0, #128] +sqrdmulh v9.4S, v5.4S, v17.s[0] +mul v5.4S, v5.4S,v18.s[0] +sub v20.4s, v3.4s, v24.4s +ldr q29, [x17, #+320] +str q20, [x0, #176] +sqrdmulh v20.4S, v10.4S, v17.s[0] +mul v10.4S, v10.4S,v18.s[0] +add v3.4s, v3.4s, v24.4s +ldr q24, [x17, #+336] +str q3, [x0, #160] +mla v15.4S, v6.4S, v31.s[0] +sqrdmulh 
v6.4S, v19.4S, v24.s[0] +sub v3.4s, v13.4s, v14.4s +ldr q28, [x17, #+352] +str q3, [x0, #208] +mla v21.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v7.4S, v24.s[0] +add v13.4s, v13.4s, v14.4s +ldr q14, [x17, #+368] +str q13, [x0, #192] +mla v5.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v30.4S, v14.s[0] +sub v13.4s, v22.4s, v8.4s +ldr q3, [x0, #256] +str q13, [x0, #240] +mla v10.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v11.4S, v14.s[0] +add v22.4s, v22.4s, v8.4s +ldr q8, [x0, #272] +str q22, [x0, #224] +mul v19.4S, v19.4S,v29.s[0] +mul v7.4S, v7.4S,v29.s[0] +sub v22.4s, v3.4s, v15.4s +ldr q13, [x0, #320] +add v3.4s, v3.4s, v15.4s +mla v19.4S, v6.4S, v31.s[0] +mla v7.4S, v25.4S, v31.s[0] +sub v25.4s, v8.4s, v21.4s +ldr q6, [x0, #336] +add v8.4s, v8.4s, v21.4s +mul v30.4S, v30.4S,v28.s[0] +mul v11.4S, v11.4S,v28.s[0] +sub v21.4s, v13.4s, v5.4s +ldr q15, [x0, #384] +add v13.4s, v13.4s, v5.4s +mla v30.4S, v9.4S, v31.s[0] +mla v11.4S, v20.4S, v31.s[0] +sub v20.4s, v6.4s, v10.4s +ldr q9, [x0, #400] +add v6.4s, v6.4s, v10.4s +sqrdmulh v10.4S, v8.4S, v23.s[1] +mul v8.4S, v8.4S,v1.s[1] +sub v5.4s, v15.4s, v19.4s +ldr q2, [x0, #448] +add v15.4s, v15.4s, v19.4s +sqrdmulh v19.4S, v25.4S, v23.s[2] +mul v25.4S, v25.4S,v1.s[2] +sub v4.4s, v9.4s, v7.4s +ldr q26, [x0, #464] +add v9.4s, v9.4s, v7.4s +sqrdmulh v23.4S, v6.4S, v17.s[1] +mul v6.4S, v6.4S,v18.s[1] +sub v7.4s, v2.4s, v30.4s +ldr q1, [x0, #544] +add v2.4s, v2.4s, v30.4s +sqrdmulh v30.4S, v20.4S, v17.s[2] +mul v20.4S, v20.4S,v18.s[2] +sub v27.4s, v26.4s, v11.4s +ldr q16, [x0, #560] +add v26.4s, v26.4s, v11.4s +mla v8.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v9.4S, v24.s[1] +sub v17.4s, v3.4s, v8.4s +ldr q11, [x0, #608] +str q17, [x0, #272] +mla v25.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v4.4S, v24.s[2] +add v3.4s, v3.4s, v8.4s +ldr q8, [x0, #624] +str q3, [x0, #256] +mla v6.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v26.4S, v14.s[1] +sub v3.4s, v22.4s, v25.4s +ldr q17, [x17, #+384] +str q3, [x0, #304] +mla v20.4S, v30.4S, v31.s[0] +sqrdmulh 
v30.4S, v27.4S, v14.s[2] +add v22.4s, v22.4s, v25.4s +ldr q25, [x17, #+400] +str q22, [x0, #288] +mul v9.4S, v9.4S,v29.s[1] +mul v4.4S, v4.4S,v29.s[2] +sub v22.4s, v13.4s, v6.4s +ldr q3, [x17, #+416] +str q22, [x0, #336] +mla v9.4S, v10.4S, v31.s[0] +mla v4.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v6.4s +ldr q6, [x17, #+432] +str q13, [x0, #320] +mul v26.4S, v26.4S,v28.s[1] +mul v27.4S, v27.4S,v28.s[2] +sub v24.4s, v21.4s, v20.4s +ldr q13, [x0, #672] +str q24, [x0, #368] +mla v26.4S, v23.4S, v31.s[0] +mla v27.4S, v30.4S, v31.s[0] +add v21.4s, v21.4s, v20.4s +ldr q20, [x0, #688] +str q21, [x0, #352] +sqrdmulh v14.4S, v1.4S, v25.s[0] +mul v1.4S, v1.4S,v17.s[0] +sub v21.4s, v15.4s, v9.4s +ldr q28, [x0, #736] +str q21, [x0, #400] +sqrdmulh v21.4S, v16.4S, v25.s[0] +mul v16.4S, v16.4S,v17.s[0] +add v15.4s, v15.4s, v9.4s +ldr q9, [x0, #752] +str q15, [x0, #384] +sqrdmulh v15.4S, v11.4S, v6.s[0] +mul v11.4S, v11.4S,v3.s[0] +sub v30.4s, v5.4s, v4.4s +ldr q23, [x17, #+448] +str q30, [x0, #432] +sqrdmulh v30.4S, v8.4S, v6.s[0] +mul v8.4S, v8.4S,v3.s[0] +add v5.4s, v5.4s, v4.4s +ldr q4, [x17, #+464] +str q5, [x0, #416] +mla v1.4S, v14.4S, v31.s[0] +sqrdmulh v14.4S, v13.4S, v4.s[0] +sub v5.4s, v2.4s, v26.4s +ldr q24, [x17, #+480] +str q5, [x0, #464] +mla v16.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v20.4S, v4.s[0] +add v2.4s, v2.4s, v26.4s +ldr q26, [x17, #+496] +str q2, [x0, #448] +mla v11.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v28.4S, v26.s[0] +sub v2.4s, v7.4s, v27.4s +ldr q5, [x0, #512] +str q2, [x0, #496] +mla v8.4S, v30.4S, v31.s[0] +sqrdmulh v30.4S, v9.4S, v26.s[0] +add v7.4s, v7.4s, v27.4s +ldr q27, [x0, #528] +str q7, [x0, #480] +mul v13.4S, v13.4S,v23.s[0] +mul v20.4S, v20.4S,v23.s[0] +sub v7.4s, v5.4s, v1.4s +ldr q2, [x0, #576] +add v5.4s, v5.4s, v1.4s +mla v13.4S, v14.4S, v31.s[0] +mla v20.4S, v21.4S, v31.s[0] +sub v21.4s, v27.4s, v16.4s +ldr q14, [x0, #592] +add v27.4s, v27.4s, v16.4s +mul v28.4S, v28.4S,v24.s[0] +mul v9.4S, v9.4S,v24.s[0] +sub v16.4s, v2.4s, 
v11.4s +ldr q1, [x0, #640] +add v2.4s, v2.4s, v11.4s +mla v28.4S, v15.4S, v31.s[0] +mla v9.4S, v30.4S, v31.s[0] +sub v30.4s, v14.4s, v8.4s +ldr q15, [x0, #656] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v27.4S, v25.s[1] +mul v27.4S, v27.4S,v17.s[1] +sub v11.4s, v1.4s, v13.4s +ldr q29, [x0, #704] +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v21.4S, v25.s[2] +mul v21.4S, v21.4S,v17.s[2] +sub v19.4s, v15.4s, v20.4s +ldr q10, [x0, #720] +add v15.4s, v15.4s, v20.4s +sqrdmulh v25.4S, v14.4S, v6.s[1] +mul v14.4S, v14.4S,v3.s[1] +sub v20.4s, v29.4s, v28.4s +ldr q17, [x0, #800] +add v29.4s, v29.4s, v28.4s +sqrdmulh v28.4S, v30.4S, v6.s[2] +mul v30.4S, v30.4S,v3.s[2] +sub v22.4s, v10.4s, v9.4s +ldr q18, [x0, #816] +add v10.4s, v10.4s, v9.4s +mla v27.4S, v8.4S, v31.s[0] +sqrdmulh v8.4S, v15.4S, v4.s[1] +sub v6.4s, v5.4s, v27.4s +ldr q9, [x0, #864] +str q6, [x0, #528] +mla v21.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v19.4S, v4.s[2] +add v5.4s, v5.4s, v27.4s +ldr q27, [x0, #880] +str q5, [x0, #512] +mla v14.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v10.4S, v26.s[1] +sub v5.4s, v7.4s, v21.4s +ldr q6, [x17, #+512] +str q5, [x0, #560] +mla v30.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v22.4S, v26.s[2] +add v7.4s, v7.4s, v21.4s +ldr q21, [x17, #+528] +str q7, [x0, #544] +mul v15.4S, v15.4S,v23.s[1] +mul v19.4S, v19.4S,v23.s[2] +sub v7.4s, v2.4s, v14.4s +ldr q5, [x17, #+544] +str q7, [x0, #592] +mla v15.4S, v8.4S, v31.s[0] +mla v19.4S, v13.4S, v31.s[0] +add v2.4s, v2.4s, v14.4s +ldr q14, [x17, #+560] +str q2, [x0, #576] +mul v10.4S, v10.4S,v24.s[1] +mul v22.4S, v22.4S,v24.s[2] +sub v4.4s, v16.4s, v30.4s +ldr q2, [x0, #928] +str q4, [x0, #624] +mla v10.4S, v25.4S, v31.s[0] +mla v22.4S, v28.4S, v31.s[0] +add v16.4s, v16.4s, v30.4s +ldr q30, [x0, #944] +str q16, [x0, #608] +sqrdmulh v26.4S, v17.4S, v21.s[0] +mul v17.4S, v17.4S,v6.s[0] +sub v16.4s, v1.4s, v15.4s +ldr q24, [x0, #992] +str q16, [x0, #656] +sqrdmulh v16.4S, v18.4S, v21.s[0] +mul v18.4S, v18.4S,v6.s[0] +add v1.4s, v1.4s, v15.4s 
+ldr q15, [x0, #1008] +str q1, [x0, #640] +sqrdmulh v1.4S, v9.4S, v14.s[0] +mul v9.4S, v9.4S,v5.s[0] +sub v28.4s, v11.4s, v19.4s +ldr q25, [x17, #+576] +str q28, [x0, #688] +sqrdmulh v28.4S, v27.4S, v14.s[0] +mul v27.4S, v27.4S,v5.s[0] +add v11.4s, v11.4s, v19.4s +ldr q19, [x17, #+592] +str q11, [x0, #672] +mla v17.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v2.4S, v19.s[0] +sub v11.4s, v29.4s, v10.4s +ldr q4, [x17, #+608] +str q11, [x0, #720] +mla v18.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v30.4S, v19.s[0] +add v29.4s, v29.4s, v10.4s +ldr q10, [x17, #+624] +str q29, [x0, #704] +mla v9.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v24.4S, v10.s[0] +sub v29.4s, v20.4s, v22.4s +ldr q11, [x0, #768] +str q29, [x0, #752] +mla v27.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v15.4S, v10.s[0] +add v20.4s, v20.4s, v22.4s +ldr q22, [x0, #784] +str q20, [x0, #736] +mul v2.4S, v2.4S,v25.s[0] +mul v30.4S, v30.4S,v25.s[0] +sub v20.4s, v11.4s, v17.4s +ldr q29, [x0, #832] +add v11.4s, v11.4s, v17.4s +mla v2.4S, v26.4S, v31.s[0] +mla v30.4S, v16.4S, v31.s[0] +sub v16.4s, v22.4s, v18.4s +ldr q26, [x0, #848] +add v22.4s, v22.4s, v18.4s +mul v24.4S, v24.4S,v4.s[0] +mul v15.4S, v15.4S,v4.s[0] +sub v18.4s, v29.4s, v9.4s +ldr q17, [x0, #896] +add v29.4s, v29.4s, v9.4s +mla v24.4S, v1.4S, v31.s[0] +mla v15.4S, v28.4S, v31.s[0] +sub v28.4s, v26.4s, v27.4s +ldr q1, [x0, #912] +add v26.4s, v26.4s, v27.4s +sqrdmulh v27.4S, v22.4S, v21.s[1] +mul v22.4S, v22.4S,v6.s[1] +sub v9.4s, v17.4s, v2.4s +ldr q23, [x0, #960] +add v17.4s, v17.4s, v2.4s +sqrdmulh v2.4S, v16.4S, v21.s[2] +mul v16.4S, v16.4S,v6.s[2] +sub v13.4s, v1.4s, v30.4s +ldr q8, [x0, #976] +add v1.4s, v1.4s, v30.4s +sqrdmulh v21.4S, v26.4S, v14.s[1] +mul v26.4S, v26.4S,v5.s[1] +sub v30.4s, v23.4s, v24.4s +add v23.4s, v23.4s, v24.4s +sqrdmulh v24.4S, v28.4S, v14.s[2] +mul v28.4S, v28.4S,v5.s[2] +sub v6.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +mla v22.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v1.4S, v19.s[1] +sub v14.4s, v11.4s, v22.4s +str q14, [x0, 
#784] +mla v16.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v13.4S, v19.s[2] +add v11.4s, v11.4s, v22.4s +str q11, [x0, #768] +mla v26.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v8.4S, v10.s[1] +sub v11.4s, v20.4s, v16.4s +str q11, [x0, #816] +mla v28.4S, v24.4S, v31.s[0] +sqrdmulh v24.4S, v6.4S, v10.s[2] +add v20.4s, v20.4s, v16.4s +str q20, [x0, #800] +mul v1.4S, v1.4S,v25.s[1] +mul v13.4S, v13.4S,v25.s[2] +sub v20.4s, v29.4s, v26.4s +str q20, [x0, #848] +mla v1.4S, v27.4S, v31.s[0] +mla v13.4S, v2.4S, v31.s[0] +add v29.4s, v29.4s, v26.4s +str q29, [x0, #832] +mul v8.4S, v8.4S,v4.s[1] +mul v6.4S, v6.4S,v4.s[2] +sub v19.4s, v18.4s, v28.4s +str q19, [x0, #880] +mla v8.4S, v21.4S, v31.s[0] +mla v6.4S, v24.4S, v31.s[0] +add v18.4s, v18.4s, v28.4s +str q18, [x0, #864] +sub v10.4s, v17.4s, v1.4s +str q10, [x0, #912] +add v17.4s, v17.4s, v1.4s +str q17, [x0, #896] +sub v17.4s, v9.4s, v13.4s +str q17, [x0, #944] +add v9.4s, v9.4s, v13.4s +str q9, [x0, #928] +sub v9.4s, v23.4s, v8.4s +str q9, [x0, #976] +add v23.4s, v23.4s, v8.4s +str q23, [x0, #960] +sub v23.4s, v30.4s, v6.4s +str q23, [x0, #1008] +add v30.4s, v30.4s, v6.4s +str q30, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1520 +// Instruction count: 1516 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_14.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_14.s new file mode 100644 index 0000000..9604c58 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_14.s @@ -0,0 +1,1550 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// 
Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// +#include // NOTE(review): the include target is missing in this copy (presumably an angle-bracket include lost during extraction); ASM_LOAD used by the routine below is not defined in the visible part of this file and presumably comes from it - restore from upstream +modulus: // first word is -33556993, the remaining three words are zero; the routine below loads these 16 bytes into q31 and uses lane v31.s[0] as the multiplier of its mla steps +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 // NOTE(review): presumably 2^6 = 64-byte alignment (power-of-two form of .align) - confirm for the assembler in use +roots_merged: // root table read by the routine below through x17. Rows are 4 words (16 bytes) and come in pairs carrying the same layer/block labels; in the visible part of the routine the first row of a pair feeds the mul steps and the second row feeds the sqrdmulh steps (q30 from offset 0 with q29 from offset 16, q23 from offset 32 with q22 from offset 48). Words labelled Layer None are zero filler completing a row. NOTE(review): each second-row word looks like round(first-row word * 2^31 / 33556993) - confirm against the generator +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global 
ntt_u32_incomplete_neon_asm_var_4_2_22_z4_14 +.global _ntt_u32_incomplete_neon_asm_var_4_2_22_z4_14 +ntt_u32_incomplete_neon_asm_var_4_2_22_z4_14: +_ntt_u32_incomplete_neon_asm_var_4_2_22_z4_14: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x0, #992] +sqrdmulh v27.4S, v28.4S, v29.s[0] +mul v28.4S, v28.4S,v30.s[0] +ldr q26, [x0, #928] +sqrdmulh v25.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +ldr q24, [x0, #864] +sqrdmulh v23.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +ldr q22, [x0, #800] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #736] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mla v28.4S, v27.4S, v31.s[0] +ldr q27, [x0, #672] +sqrdmulh v18.4S, v27.4S, v29.s[0] +mla v26.4S, v25.4S, v31.s[0] +ldr q25, [x0, #608] +sqrdmulh v17.4S, v25.4S, v29.s[0] +mla v24.4S, v23.4S, v31.s[0] +ldr q23, [x0, #544] +sqrdmulh v16.4S, v23.4S, v29.s[0] +mla v22.4S, v21.4S, v31.s[0] +ldr q21, [x0, #480] +mul v27.4S, v27.4S,v30.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q3, [x0, #416] +ldr q2, [x0, #352] +ldr q1, [x0, #288] +mla v27.4S, v18.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +ldr q19, [x0, #224] +ldr q18, [x0, #160] +mul v23.4S, v23.4S,v30.s[0] +mul v25.4S, v25.4S,v30.s[0] +ldr q0, [x0, #96] +ldr q15, [x0, #32] +mla v23.4S, v16.4S, v31.s[0] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +sqrdmulh v28.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v16.4s, v3.4s, v26.4s +add v3.4s, v3.4s, v26.4s +sqrdmulh v26.4S, v16.4S, v29.s[2] +mul v16.4S, 
v16.4S,v30.s[2] +sub v14.4s, v2.4s, v24.4s +add v2.4s, v2.4s, v24.4s +sqrdmulh v24.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v13.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v12.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +sqrdmulh v20.4S, v14.4S, v29.s[2] +mla v17.4S, v28.4S, v31.s[0] +sub v28.4s, v18.4s, v27.4s +add v18.4s, v18.4s, v27.4s +sqrdmulh v27.4S, v13.4S, v29.s[2] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v0.4s, v25.4s +add v0.4s, v0.4s, v25.4s +sqrdmulh v25.4S, v2.4S, v29.s[1] +mla v21.4S, v24.4S, v31.s[0] +sub v24.4s, v15.4s, v23.4s +sqrdmulh v11.4S, v1.4S, v29.s[1] +mla v3.4S, v22.4S, v31.s[0] +add v15.4s, v15.4s, v23.4s +ldr q23, [x17, #+32] +ldr q22, [x17, #+48] +mul v13.4S, v13.4S,v30.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v10.4s, v12.4s, v17.4s +add v12.4s, v12.4s, v17.4s +mla v13.4S, v27.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +sub v20.4s, v28.4s, v16.4s +add v28.4s, v28.4s, v16.4s +mul v1.4S, v1.4S,v30.s[1] +mul v2.4S, v2.4S,v30.s[1] +sub v16.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +mla v1.4S, v11.4S, v31.s[0] +mla v2.4S, v25.4S, v31.s[0] +sub v25.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v22.s[3] +mul v10.4S, v10.4S,v23.s[3] +sub v11.4s, v26.4s, v14.4s +add v26.4s, v26.4s, v14.4s +sqrdmulh v14.4S, v12.4S, v22.s[2] +mul v12.4S, v12.4S,v23.s[2] +sub v21.4s, v24.4s, v13.4s +add v24.4s, v24.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v22.s[1] +mul v16.4S, v16.4S,v23.s[1] +sub v27.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v17.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s +ldr q1, [x17, #+96] +ldr q9, [x17, #+112] +sqrdmulh v8.4S, v20.4S, v22.s[3] +mla v10.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v28.4S, v22.s[2] +mla v12.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v25.4S, v22.s[1] +mla v16.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v18.4S, v22.s[0] +mla 
v19.4S, v2.4S, v31.s[0] +nop +nop +ldr q2, [x17, #+64] +ldr q7, [x17, #+80] +mul v28.4S, v28.4S,v23.s[2] +mul v20.4S, v20.4S,v23.s[3] +sub v6.4s, v11.4s, v10.4s +add v11.4s, v11.4s, v10.4s +mla v28.4S, v3.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v26.4s, v12.4s +add v26.4s, v26.4s, v12.4s +mul v18.4S, v18.4S,v23.s[0] +mul v25.4S, v25.4S,v23.s[1] +sub v12.4s, v27.4s, v16.4s +add v27.4s, v27.4s, v16.4s +mla v18.4S, v13.4S, v31.s[0] +mla v25.4S, v14.4S, v31.s[0] +sub v14.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v9.s[3] +mul v6.4S, v6.4S,v1.s[3] +sub v13.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v11.4S, v9.s[2] +mul v11.4S, v11.4S,v1.s[2] +sub v16.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +sqrdmulh v28.4S, v8.4S, v9.s[1] +mul v8.4S, v8.4S,v1.s[1] +sub v3.4s, v17.4s, v25.4s +add v17.4s, v17.4s, v25.4s +sqrdmulh v25.4S, v26.4S, v9.s[0] +mul v26.4S, v26.4S,v1.s[0] +sub v10.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v7.s[3] +mla v6.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v27.4S, v7.s[2] +mla v11.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v14.4S, v7.s[1] +mla v8.4S, v28.4S, v31.s[0] +nop +nop +sqrdmulh v28.4S, v0.4S, v7.s[0] +mla v26.4S, v25.4S, v31.s[0] +nop +nop +mul v27.4S, v27.4S,v2.s[2] +mul v12.4S, v12.4S,v2.s[3] +sub v25.4s, v13.4s, v6.4s +str q25, [x0, #992] +mla v27.4S, v19.4S, v31.s[0] +mla v12.4S, v18.4S, v31.s[0] +add v13.4s, v13.4s, v6.4s +str q13, [x0, #928] +mul v0.4S, v0.4S,v2.s[0] +mul v14.4S, v14.4S,v2.s[1] +sub v13.4s, v21.4s, v11.4s +str q13, [x0, #864] +mla v0.4S, v28.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sub v11.4s, v16.4s, v8.4s +ldr q20, [x0, #1008] +sqrdmulh v28.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v8.4s +str q21, [x0, #800] +ldr q21, [x0, #944] +sqrdmulh v8.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v13.4s, v24.4s, v26.4s +str q11, [x0, #736] +ldr q11, [x0, 
#880] +sqrdmulh v6.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +add v24.4s, v24.4s, v26.4s +str q16, [x0, #672] +ldr q16, [x0, #816] +sqrdmulh v26.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v18.4s, v3.4s, v12.4s +str q13, [x0, #608] +ldr q13, [x0, #752] +sqrdmulh v19.4S, v13.4S, v29.s[0] +mla v20.4S, v28.4S, v31.s[0] +add v3.4s, v3.4s, v12.4s +str q24, [x0, #544] +ldr q24, [x0, #688] +sqrdmulh v12.4S, v24.4S, v29.s[0] +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v17.4s, v27.4s +str q18, [x0, #480] +ldr q18, [x0, #624] +sqrdmulh v28.4S, v18.4S, v29.s[0] +mla v11.4S, v6.4S, v31.s[0] +add v17.4s, v17.4s, v27.4s +str q3, [x0, #416] +ldr q3, [x0, #560] +sqrdmulh v27.4S, v3.4S, v29.s[0] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v10.4s, v14.4s +str q8, [x0, #352] +ldr q8, [x0, #496] +add v10.4s, v10.4s, v14.4s +mul v24.4S, v24.4S,v30.s[0] +mul v13.4S, v13.4S,v30.s[0] +ldr q14, [x0, #432] +str q17, [x0, #288] +ldr q17, [x0, #368] +ldr q6, [x0, #304] +mla v24.4S, v12.4S, v31.s[0] +mla v13.4S, v19.4S, v31.s[0] +str q26, [x0, #224] +sub v26.4s, v15.4s, v0.4s +ldr q19, [x0, #240] +ldr q12, [x0, #176] +mul v3.4S, v3.4S,v30.s[0] +mul v18.4S, v18.4S,v30.s[0] +str q10, [x0, #160] +add v15.4s, v15.4s, v0.4s +ldr q0, [x0, #112] +ldr q10, [x0, #48] +mla v3.4S, v27.4S, v31.s[0] +mla v18.4S, v28.4S, v31.s[0] +sub v28.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +sqrdmulh v20.4S, v28.4S, v29.s[2] +mul v28.4S, v28.4S,v30.s[2] +sub v27.4s, v14.4s, v21.4s +add v14.4s, v14.4s, v21.4s +sqrdmulh v21.4S, v27.4S, v29.s[2] +mul v27.4S, v27.4S,v30.s[2] +sub v25.4s, v17.4s, v11.4s +add v17.4s, v17.4s, v11.4s +sqrdmulh v11.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v5.4s, v6.4s, v16.4s +add v6.4s, v6.4s, v16.4s +sqrdmulh v16.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +sub v4.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +sqrdmulh v13.4S, v25.4S, v29.s[2] +mla v28.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v24.4s +add v12.4s, v12.4s, v24.4s +sqrdmulh v24.4S, v5.4S, 
v29.s[2] +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v18.4s +add v0.4s, v0.4s, v18.4s +sqrdmulh v18.4S, v17.4S, v29.s[1] +mla v8.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v3.4s +str q26, [x0, #96] +sqrdmulh v26.4S, v6.4S, v29.s[1] +mla v14.4S, v16.4S, v31.s[0] +add v10.4s, v10.4s, v3.4s +str q15, [x0, #32] +mul v5.4S, v5.4S,v30.s[2] +mul v25.4S, v25.4S,v30.s[2] +sub v15.4s, v4.4s, v28.4s +add v4.4s, v4.4s, v28.4s +mla v5.4S, v24.4S, v31.s[0] +mla v25.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v27.4s +add v20.4s, v20.4s, v27.4s +mul v6.4S, v6.4S,v30.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v27.4s, v19.4s, v8.4s +add v19.4s, v19.4s, v8.4s +mla v6.4S, v26.4S, v31.s[0] +mla v17.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v14.4s +add v12.4s, v12.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v22.s[3] +mul v15.4S, v15.4S,v23.s[3] +sub v26.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v4.4S, v22.s[2] +mul v4.4S, v4.4S,v23.s[2] +sub v8.4s, v11.4s, v5.4s +add v11.4s, v11.4s, v5.4s +sqrdmulh v5.4S, v27.4S, v22.s[1] +mul v27.4S, v27.4S,v23.s[1] +sub v24.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v28.4s, v10.4s, v6.4s +add v10.4s, v10.4s, v6.4s +sqrdmulh v6.4S, v13.4S, v22.s[3] +mla v15.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v20.4S, v22.s[2] +mla v4.4S, v25.4S, v31.s[0] +nop +nop +sqrdmulh v25.4S, v18.4S, v22.s[1] +mla v27.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v12.4S, v22.s[0] +mla v19.4S, v17.4S, v31.s[0] +nop +nop +mul v20.4S, v20.4S,v23.s[2] +mul v13.4S, v13.4S,v23.s[3] +sub v17.4s, v26.4s, v15.4s +add v26.4s, v26.4s, v15.4s +mla v20.4S, v14.4S, v31.s[0] +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v21.4s, v4.4s +add v21.4s, v21.4s, v4.4s +mul v12.4S, v12.4S,v23.s[0] +mul v18.4S, v18.4S,v23.s[1] +sub v4.4s, v24.4s, v27.4s +add v24.4s, v24.4s, v27.4s +mla v12.4S, v5.4S, v31.s[0] +mla v18.4S, v25.4S, v31.s[0] +sub v25.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, 
v17.4S, v9.s[3] +mul v17.4S, v17.4S,v1.s[3] +sub v5.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +sqrdmulh v13.4S, v26.4S, v9.s[2] +mul v26.4S, v26.4S,v1.s[2] +sub v27.4s, v11.4s, v20.4s +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v6.4S, v9.s[1] +mul v6.4S, v6.4S,v1.s[1] +sub v14.4s, v28.4s, v18.4s +add v28.4s, v28.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v9.s[0] +mul v21.4S, v21.4S,v1.s[0] +sub v15.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v4.4S, v7.s[3] +mla v17.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v24.4S, v7.s[2] +mla v26.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v25.4S, v7.s[1] +mla v6.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v0.4S, v7.s[0] +mla v21.4S, v18.4S, v31.s[0] +nop +nop +mul v24.4S, v24.4S,v2.s[2] +mul v4.4S, v4.4S,v2.s[3] +sub v18.4s, v5.4s, v17.4s +str q18, [x0, #1008] +mla v24.4S, v19.4S, v31.s[0] +mla v4.4S, v12.4S, v31.s[0] +add v5.4s, v5.4s, v17.4s +str q5, [x0, #944] +mul v0.4S, v0.4S,v2.s[0] +mul v25.4S, v25.4S,v2.s[1] +sub v5.4s, v8.4s, v26.4s +str q5, [x0, #880] +mla v0.4S, v20.4S, v31.s[0] +mla v25.4S, v13.4S, v31.s[0] +add v8.4s, v8.4s, v26.4s +sub v26.4s, v27.4s, v6.4s +ldr q13, [x0, #960] +sqrdmulh v20.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +add v27.4s, v27.4s, v6.4s +str q8, [x0, #816] +ldr q8, [x0, #896] +sqrdmulh v6.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v5.4s, v11.4s, v21.4s +str q26, [x0, #752] +ldr q26, [x0, #832] +sqrdmulh v17.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +add v11.4s, v11.4s, v21.4s +str q27, [x0, #688] +ldr q27, [x0, #768] +sqrdmulh v21.4S, v27.4S, v29.s[0] +mul v27.4S, v27.4S,v30.s[0] +sub v12.4s, v14.4s, v4.4s +str q5, [x0, #624] +ldr q5, [x0, #704] +sqrdmulh v19.4S, v5.4S, v29.s[0] +mla v13.4S, v20.4S, v31.s[0] +add v14.4s, v14.4s, v4.4s +str q11, [x0, #560] +ldr q11, [x0, #640] +sqrdmulh v4.4S, v11.4S, v29.s[0] +mla v8.4S, v6.4S, v31.s[0] +sub v6.4s, v28.4s, v24.4s +str q12, [x0, #496] +ldr q12, [x0, #576] +sqrdmulh v20.4S, v12.4S, 
v29.s[0] +mla v26.4S, v17.4S, v31.s[0] +add v28.4s, v28.4s, v24.4s +str q14, [x0, #432] +ldr q14, [x0, #512] +sqrdmulh v24.4S, v14.4S, v29.s[0] +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v25.4s +str q6, [x0, #368] +ldr q6, [x0, #448] +add v15.4s, v15.4s, v25.4s +mul v11.4S, v11.4S,v30.s[0] +mul v5.4S, v5.4S,v30.s[0] +ldr q25, [x0, #384] +str q28, [x0, #304] +ldr q28, [x0, #320] +ldr q17, [x0, #256] +mla v11.4S, v4.4S, v31.s[0] +mla v5.4S, v19.4S, v31.s[0] +str q21, [x0, #240] +sub v21.4s, v10.4s, v0.4s +ldr q19, [x0, #192] +ldr q4, [x0, #128] +mul v14.4S, v14.4S,v30.s[0] +mul v12.4S, v12.4S,v30.s[0] +str q15, [x0, #176] +add v10.4s, v10.4s, v0.4s +ldr q0, [x0, #64] +ldr q15, [x0, #0] +mla v14.4S, v24.4S, v31.s[0] +mla v12.4S, v20.4S, v31.s[0] +sub v20.4s, v6.4s, v13.4s +add v6.4s, v6.4s, v13.4s +sqrdmulh v13.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v24.4s, v25.4s, v8.4s +add v25.4s, v25.4s, v8.4s +sqrdmulh v8.4S, v24.4S, v29.s[2] +mul v24.4S, v24.4S,v30.s[2] +sub v18.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sqrdmulh v26.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +sub v3.4s, v17.4s, v27.4s +add v17.4s, v17.4s, v27.4s +sqrdmulh v27.4S, v25.4S, v29.s[1] +mul v25.4S, v25.4S,v30.s[1] +sub v16.4s, v19.4s, v5.4s +add v19.4s, v19.4s, v5.4s +sqrdmulh v5.4S, v18.4S, v29.s[2] +mla v20.4S, v13.4S, v31.s[0] +sub v13.4s, v4.4s, v11.4s +add v4.4s, v4.4s, v11.4s +sqrdmulh v11.4S, v3.4S, v29.s[2] +mla v24.4S, v8.4S, v31.s[0] +sub v8.4s, v0.4s, v12.4s +add v0.4s, v0.4s, v12.4s +sqrdmulh v12.4S, v28.4S, v29.s[1] +mla v6.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v14.4s +str q21, [x0, #112] +sqrdmulh v21.4S, v17.4S, v29.s[1] +mla v25.4S, v27.4S, v31.s[0] +add v15.4s, v15.4s, v14.4s +str q10, [x0, #48] +mul v3.4S, v3.4S,v30.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v10.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +mla v3.4S, v11.4S, v31.s[0] +mla v18.4S, v5.4S, v31.s[0] +sub v5.4s, v13.4s, v24.4s +add v13.4s, v13.4s, v24.4s +mul v17.4S, v17.4S,v30.s[1] 
+mul v28.4S, v28.4S,v30.s[1] +sub v24.4s, v19.4s, v6.4s +add v19.4s, v19.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v28.4S, v12.4S, v31.s[0] +sub v12.4s, v4.4s, v25.4s +add v4.4s, v4.4s, v25.4s +sqrdmulh v25.4S, v10.4S, v22.s[3] +mul v10.4S, v10.4S,v23.s[3] +sub v21.4s, v8.4s, v18.4s +add v8.4s, v8.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v22.s[2] +mul v16.4S, v16.4S,v23.s[2] +sub v6.4s, v26.4s, v3.4s +add v26.4s, v26.4s, v3.4s +sqrdmulh v3.4S, v24.4S, v22.s[1] +mul v24.4S, v24.4S,v23.s[1] +sub v11.4s, v0.4s, v28.4s +add v0.4s, v0.4s, v28.4s +sqrdmulh v28.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v20.4s, v15.4s, v17.4s +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v22.s[3] +mla v10.4S, v25.4S, v31.s[0] +nop +nop +sqrdmulh v25.4S, v13.4S, v22.s[2] +mla v16.4S, v18.4S, v31.s[0] +nop +nop +sqrdmulh v18.4S, v12.4S, v22.s[1] +mla v24.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v4.4S, v22.s[0] +mla v19.4S, v28.4S, v31.s[0] +nop +nop +mul v13.4S, v13.4S,v23.s[2] +mul v5.4S, v5.4S,v23.s[3] +sub v28.4s, v21.4s, v10.4s +add v21.4s, v21.4s, v10.4s +mla v13.4S, v25.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +sub v17.4s, v8.4s, v16.4s +add v8.4s, v8.4s, v16.4s +mul v4.4S, v4.4S,v23.s[0] +mul v12.4S, v12.4S,v23.s[1] +sub v16.4s, v11.4s, v24.4s +add v11.4s, v11.4s, v24.4s +mla v4.4S, v3.4S, v31.s[0] +mla v12.4S, v18.4S, v31.s[0] +sub v18.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v28.4S, v9.s[3] +mul v28.4S, v28.4S,v1.s[3] +sub v3.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v21.4S, v9.s[2] +mul v21.4S, v21.4S,v1.s[2] +sub v24.4s, v26.4s, v13.4s +add v26.4s, v26.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v9.s[1] +mul v17.4S, v17.4S,v1.s[1] +sub v25.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +sqrdmulh v12.4S, v8.4S, v9.s[0] +mul v8.4S, v8.4S,v1.s[0] +sub v10.4s, v15.4s, v4.4s +add v15.4s, v15.4s, v4.4s +sqrdmulh v4.4S, v16.4S, v7.s[3] +mla v28.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v11.4S, v7.s[2] +mla v21.4S, v5.4S, 
v31.s[0] +nop +nop +sqrdmulh v5.4S, v18.4S, v7.s[1] +mla v17.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v0.4S, v7.s[0] +mla v8.4S, v12.4S, v31.s[0] +nop +nop +mul v11.4S, v11.4S,v2.s[2] +mul v16.4S, v16.4S,v2.s[3] +sub v12.4s, v3.4s, v28.4s +str q12, [x0, #960] +mla v11.4S, v19.4S, v31.s[0] +mla v16.4S, v4.4S, v31.s[0] +add v3.4s, v3.4s, v28.4s +str q3, [x0, #896] +mul v0.4S, v0.4S,v2.s[0] +mul v18.4S, v18.4S,v2.s[1] +sub v3.4s, v6.4s, v21.4s +str q3, [x0, #832] +mla v0.4S, v13.4S, v31.s[0] +mla v18.4S, v5.4S, v31.s[0] +add v6.4s, v6.4s, v21.4s +sub v21.4s, v24.4s, v17.4s +ldr q5, [x0, #976] +sqrdmulh v13.4S, v5.4S, v29.s[0] +mul v5.4S, v5.4S,v30.s[0] +add v24.4s, v24.4s, v17.4s +str q6, [x0, #768] +ldr q6, [x0, #912] +sqrdmulh v17.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v3.4s, v26.4s, v8.4s +str q21, [x0, #704] +ldr q21, [x0, #848] +sqrdmulh v28.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +add v26.4s, v26.4s, v8.4s +str q24, [x0, #640] +ldr q24, [x0, #784] +sqrdmulh v8.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +sub v4.4s, v25.4s, v16.4s +str q3, [x0, #576] +ldr q3, [x0, #720] +sqrdmulh v19.4S, v3.4S, v29.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v25.4s, v25.4s, v16.4s +str q26, [x0, #512] +ldr q26, [x0, #656] +sqrdmulh v16.4S, v26.4S, v29.s[0] +mla v6.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v11.4s +str q4, [x0, #448] +ldr q4, [x0, #592] +sqrdmulh v13.4S, v4.4S, v29.s[0] +mla v21.4S, v28.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q25, [x0, #384] +ldr q25, [x0, #528] +sqrdmulh v11.4S, v25.4S, v29.s[0] +mla v24.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v18.4s +str q17, [x0, #320] +ldr q17, [x0, #464] +add v10.4s, v10.4s, v18.4s +mul v26.4S, v26.4S,v30.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q18, [x0, #400] +str q20, [x0, #256] +ldr q20, [x0, #336] +ldr q28, [x0, #272] +mla v26.4S, v16.4S, v31.s[0] +mla v3.4S, v19.4S, v31.s[0] +str q8, [x0, #192] +sub v8.4s, v15.4s, v0.4s +ldr q19, [x0, #208] +ldr q16, [x0, #144] +mul v25.4S, 
v25.4S,v30.s[0] +mul v4.4S, v4.4S,v30.s[0] +str q10, [x0, #128] +add v15.4s, v15.4s, v0.4s +ldr q0, [x0, #80] +ldr q10, [x0, #16] +mla v25.4S, v11.4S, v31.s[0] +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v5.4s +add v17.4s, v17.4s, v5.4s +sqrdmulh v5.4S, v13.4S, v29.s[2] +mul v13.4S, v13.4S,v30.s[2] +sub v11.4s, v18.4s, v6.4s +add v18.4s, v18.4s, v6.4s +sqrdmulh v6.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v12.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v28.4s, v24.4s +add v28.4s, v28.4s, v24.4s +sqrdmulh v24.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v27.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sqrdmulh v3.4S, v12.4S, v29.s[2] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v16.4s, v26.4s +add v16.4s, v16.4s, v26.4s +sqrdmulh v26.4S, v14.4S, v29.s[2] +mla v11.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v4.4s +add v0.4s, v0.4s, v4.4s +sqrdmulh v4.4S, v20.4S, v29.s[1] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v25.4s +str q8, [x0, #64] +sqrdmulh v8.4S, v28.4S, v29.s[1] +mla v18.4S, v24.4S, v31.s[0] +add v10.4s, v10.4s, v25.4s +str q15, [x0, #0] +mul v14.4S, v14.4S,v30.s[2] +mul v12.4S, v12.4S,v30.s[2] +sub v15.4s, v27.4s, v13.4s +add v27.4s, v27.4s, v13.4s +mla v14.4S, v26.4S, v31.s[0] +mla v12.4S, v3.4S, v31.s[0] +sub v3.4s, v5.4s, v11.4s +add v5.4s, v5.4s, v11.4s +mul v28.4S, v28.4S,v30.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v11.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +mla v28.4S, v8.4S, v31.s[0] +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v16.4s, v18.4s +add v16.4s, v16.4s, v18.4s +sqrdmulh v29.4S, v15.4S, v22.s[3] +mul v15.4S, v15.4S,v23.s[3] +sub v30.4s, v6.4s, v12.4s +add v6.4s, v6.4s, v12.4s +sqrdmulh v12.4S, v27.4S, v22.s[2] +mul v27.4S, v27.4S,v23.s[2] +sub v18.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v22.s[1] +mul v11.4S, v11.4S,v23.s[1] +sub v8.4s, v0.4s, v20.4s +add v0.4s, v0.4s, v20.4s +sqrdmulh v20.4S, 
v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v17.4s, v10.4s, v28.4s +add v10.4s, v10.4s, v28.4s +sqrdmulh v28.4S, v3.4S, v22.s[3] +mla v15.4S, v29.4S, v31.s[0] +nop +nop +sqrdmulh v29.4S, v5.4S, v22.s[2] +mla v27.4S, v12.4S, v31.s[0] +nop +nop +sqrdmulh v12.4S, v4.4S, v22.s[1] +mla v11.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v16.4S, v22.s[0] +mla v19.4S, v20.4S, v31.s[0] +nop +nop +mul v5.4S, v5.4S,v23.s[2] +mul v3.4S, v3.4S,v23.s[3] +sub v20.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +mla v5.4S, v29.4S, v31.s[0] +mla v3.4S, v28.4S, v31.s[0] +sub v28.4s, v6.4s, v27.4s +add v6.4s, v6.4s, v27.4s +mul v16.4S, v16.4S,v23.s[0] +mul v4.4S, v4.4S,v23.s[1] +sub v27.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v16.4S, v14.4S, v31.s[0] +mla v4.4S, v12.4S, v31.s[0] +sub v12.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v22.4S, v20.4S, v9.s[3] +mul v20.4S, v20.4S,v1.s[3] +sub v23.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v30.4S, v9.s[2] +mul v30.4S, v30.4S,v1.s[2] +sub v19.4s, v21.4s, v5.4s +add v21.4s, v21.4s, v5.4s +sqrdmulh v5.4S, v28.4S, v9.s[1] +mul v28.4S, v28.4S,v1.s[1] +sub v14.4s, v17.4s, v4.4s +add v17.4s, v17.4s, v4.4s +sqrdmulh v4.4S, v6.4S, v9.s[0] +mul v6.4S, v6.4S,v1.s[0] +sub v11.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sqrdmulh v9.4S, v27.4S, v7.s[3] +mla v20.4S, v22.4S, v31.s[0] +nop +nop +sqrdmulh v22.4S, v8.4S, v7.s[2] +mla v30.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v12.4S, v7.s[1] +mla v28.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v0.4S, v7.s[0] +mla v6.4S, v4.4S, v31.s[0] +nop +nop +mul v8.4S, v8.4S,v2.s[2] +mul v27.4S, v27.4S,v2.s[3] +sub v4.4s, v23.4s, v20.4s +str q4, [x0, #976] +mla v8.4S, v22.4S, v31.s[0] +mla v27.4S, v9.4S, v31.s[0] +add v23.4s, v23.4s, v20.4s +str q23, [x0, #912] +mul v0.4S, v0.4S,v2.s[0] +mul v12.4S, v12.4S,v2.s[1] +sub v23.4s, v18.4s, v30.4s +str q23, [x0, #848] +mla v0.4S, v5.4S, v31.s[0] +mla v12.4S, v3.4S, v31.s[0] +add v18.4s, v18.4s, v30.4s +sub v30.4s, 
v19.4s, v28.4s +add v19.4s, v19.4s, v28.4s +str q18, [x0, #784] +sub v18.4s, v21.4s, v6.4s +str q30, [x0, #720] +add v21.4s, v21.4s, v6.4s +str q19, [x0, #656] +sub v19.4s, v14.4s, v27.4s +str q18, [x0, #592] +add v14.4s, v14.4s, v27.4s +str q21, [x0, #528] +sub v21.4s, v17.4s, v8.4s +str q19, [x0, #464] +add v17.4s, v17.4s, v8.4s +str q14, [x0, #400] +sub v14.4s, v11.4s, v12.4s +str q21, [x0, #336] +add v11.4s, v11.4s, v12.4s +str q17, [x0, #272] +sub v17.4s, v10.4s, v0.4s +add v10.4s, v10.4s, v0.4s +ldr q24, [x0, #48] +ldr q25, [x0, #32] +ldr q13, [x0, #112] +ldr q26, [x0, #96] +ldr q15, [x17, #+128] +ldr q29, [x17, #+144] +ldr q16, [x17, #+160] +ldr q1, [x17, #+176] +ldr q4, [x0, #176] +ldr q22, [x0, #160] +sqrdmulh v9.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v15.s[0] +ldr q20, [x0, #240] +sqrdmulh v23.4S, v25.4S, v29.s[0] +mul v25.4S, v25.4S,v15.s[0] +ldr q5, [x0, #224] +sqrdmulh v3.4S, v13.4S, v1.s[0] +mul v13.4S, v13.4S,v16.s[0] +ldr q2, [x17, #+192] +sqrdmulh v7.4S, v26.4S, v1.s[0] +mul v26.4S, v26.4S,v16.s[0] +ldr q28, [x17, #+208] +mla v24.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v4.4S, v28.s[0] +ldr q30, [x17, #+224] +mla v25.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v22.4S, v28.s[0] +ldr q6, [x17, #+240] +mla v13.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v20.4S, v6.s[0] +mla v26.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v5.4S, v6.s[0] +ldr q18, [x0, #0] +mul v4.4S, v4.4S,v2.s[0] +mul v22.4S, v22.4S,v2.s[0] +sub v27.4s, v10.4s, v24.4s +add v10.4s, v10.4s, v24.4s +mla v4.4S, v9.4S, v31.s[0] +mla v22.4S, v23.4S, v31.s[0] +sub v23.4s, v18.4s, v25.4s +ldr q9, [x0, #64] +add v18.4s, v18.4s, v25.4s +mul v20.4S, v20.4S,v30.s[0] +mul v5.4S, v5.4S,v30.s[0] +sub v25.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +mla v20.4S, v3.4S, v31.s[0] +mla v5.4S, v7.4S, v31.s[0] +sub v7.4s, v9.4s, v26.4s +ldr q3, [x0, #128] +add v9.4s, v9.4s, v26.4s +sqrdmulh v26.4S, v27.4S, v29.s[2] +mul v27.4S, v27.4S,v15.s[2] +sub v13.4s, v11.4s, v4.4s +add v11.4s, v11.4s, v4.4s +sqrdmulh v4.4S, v10.4S, 
v29.s[1] +mul v10.4S, v10.4S,v15.s[1] +sub v24.4s, v3.4s, v22.4s +ldr q19, [x0, #192] +add v3.4s, v3.4s, v22.4s +sqrdmulh v29.4S, v25.4S, v1.s[2] +mul v25.4S, v25.4S,v16.s[2] +sub v22.4s, v14.4s, v20.4s +ldr q15, [x0, #304] +add v14.4s, v14.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v1.s[1] +mul v17.4S, v17.4S,v16.s[1] +sub v8.4s, v19.4s, v5.4s +ldr q21, [x0, #288] +add v19.4s, v19.4s, v5.4s +mla v27.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v13.4S, v28.s[2] +sub v1.4s, v23.4s, v27.4s +ldr q5, [x0, #368] +str q1, [x0, #48] +mla v10.4S, v4.4S, v31.s[0] +sqrdmulh v4.4S, v11.4S, v28.s[1] +add v23.4s, v23.4s, v27.4s +ldr q27, [x0, #352] +str q23, [x0, #32] +mla v25.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v22.4S, v6.s[2] +sub v23.4s, v18.4s, v10.4s +ldr q1, [x17, #+256] +str q23, [x0, #16] +mla v17.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v14.4S, v6.s[1] +add v18.4s, v18.4s, v10.4s +ldr q10, [x17, #+272] +str q18, [x0, #0] +mul v13.4S, v13.4S,v2.s[2] +mul v11.4S, v11.4S,v2.s[1] +sub v18.4s, v7.4s, v25.4s +ldr q23, [x17, #+288] +str q18, [x0, #112] +mla v13.4S, v26.4S, v31.s[0] +mla v11.4S, v4.4S, v31.s[0] +add v7.4s, v7.4s, v25.4s +ldr q25, [x17, #+304] +str q7, [x0, #96] +mul v22.4S, v22.4S,v30.s[2] +mul v14.4S, v14.4S,v30.s[1] +sub v28.4s, v9.4s, v17.4s +ldr q7, [x0, #432] +str q28, [x0, #80] +mla v22.4S, v29.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +add v9.4s, v9.4s, v17.4s +ldr q17, [x0, #416] +str q9, [x0, #64] +sqrdmulh v6.4S, v15.4S, v10.s[0] +mul v15.4S, v15.4S,v1.s[0] +sub v9.4s, v24.4s, v13.4s +ldr q30, [x0, #496] +str q9, [x0, #176] +sqrdmulh v9.4S, v21.4S, v10.s[0] +mul v21.4S, v21.4S,v1.s[0] +add v24.4s, v24.4s, v13.4s +ldr q13, [x0, #480] +str q24, [x0, #160] +sqrdmulh v24.4S, v5.4S, v25.s[0] +mul v5.4S, v5.4S,v23.s[0] +sub v20.4s, v3.4s, v11.4s +ldr q29, [x17, #+320] +str q20, [x0, #144] +sqrdmulh v20.4S, v27.4S, v25.s[0] +mul v27.4S, v27.4S,v23.s[0] +add v3.4s, v3.4s, v11.4s +ldr q11, [x17, #+336] +str q3, [x0, #128] +mla v15.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, 
v7.4S, v11.s[0] +sub v3.4s, v8.4s, v22.4s +ldr q28, [x17, #+352] +str q3, [x0, #240] +mla v21.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v17.4S, v11.s[0] +add v8.4s, v8.4s, v22.4s +ldr q22, [x17, #+368] +str q8, [x0, #224] +mla v5.4S, v24.4S, v31.s[0] +sqrdmulh v24.4S, v30.4S, v22.s[0] +sub v8.4s, v19.4s, v14.4s +ldr q3, [x0, #272] +str q8, [x0, #208] +mla v27.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v13.4S, v22.s[0] +add v19.4s, v19.4s, v14.4s +ldr q14, [x0, #256] +str q19, [x0, #192] +mul v7.4S, v7.4S,v29.s[0] +mul v17.4S, v17.4S,v29.s[0] +sub v19.4s, v3.4s, v15.4s +ldr q8, [x0, #336] +add v3.4s, v3.4s, v15.4s +mla v7.4S, v6.4S, v31.s[0] +mla v17.4S, v9.4S, v31.s[0] +sub v9.4s, v14.4s, v21.4s +ldr q6, [x0, #320] +add v14.4s, v14.4s, v21.4s +mul v30.4S, v30.4S,v28.s[0] +mul v13.4S, v13.4S,v28.s[0] +sub v21.4s, v8.4s, v5.4s +ldr q15, [x0, #400] +add v8.4s, v8.4s, v5.4s +mla v30.4S, v24.4S, v31.s[0] +mla v13.4S, v20.4S, v31.s[0] +sub v20.4s, v6.4s, v27.4s +ldr q24, [x0, #384] +add v6.4s, v6.4s, v27.4s +sqrdmulh v27.4S, v19.4S, v10.s[2] +mul v19.4S, v19.4S,v1.s[2] +sub v5.4s, v15.4s, v7.4s +ldr q2, [x0, #464] +add v15.4s, v15.4s, v7.4s +sqrdmulh v7.4S, v3.4S, v10.s[1] +mul v3.4S, v3.4S,v1.s[1] +sub v4.4s, v24.4s, v17.4s +ldr q26, [x0, #448] +add v24.4s, v24.4s, v17.4s +sqrdmulh v10.4S, v21.4S, v25.s[2] +mul v21.4S, v21.4S,v23.s[2] +sub v17.4s, v2.4s, v30.4s +ldr q1, [x0, #560] +add v2.4s, v2.4s, v30.4s +sqrdmulh v30.4S, v8.4S, v25.s[1] +mul v8.4S, v8.4S,v23.s[1] +sub v18.4s, v26.4s, v13.4s +ldr q16, [x0, #544] +add v26.4s, v26.4s, v13.4s +mla v19.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v5.4S, v11.s[2] +sub v25.4s, v9.4s, v19.4s +ldr q13, [x0, #624] +str q25, [x0, #304] +mla v3.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v15.4S, v11.s[1] +add v9.4s, v9.4s, v19.4s +ldr q19, [x0, #608] +str q9, [x0, #288] +mla v21.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v17.4S, v22.s[2] +sub v9.4s, v14.4s, v3.4s +ldr q25, [x17, #+384] +str q9, [x0, #272] +mla v8.4S, v30.4S, v31.s[0] +sqrdmulh v30.4S, 
v2.4S, v22.s[1] +add v14.4s, v14.4s, v3.4s +ldr q3, [x17, #+400] +str q14, [x0, #256] +mul v5.4S, v5.4S,v29.s[2] +mul v15.4S, v15.4S,v29.s[1] +sub v14.4s, v20.4s, v21.4s +ldr q9, [x17, #+416] +str q14, [x0, #368] +mla v5.4S, v27.4S, v31.s[0] +mla v15.4S, v7.4S, v31.s[0] +add v20.4s, v20.4s, v21.4s +ldr q21, [x17, #+432] +str q20, [x0, #352] +mul v17.4S, v17.4S,v28.s[2] +mul v2.4S, v2.4S,v28.s[1] +sub v11.4s, v6.4s, v8.4s +ldr q20, [x0, #688] +str q11, [x0, #336] +mla v17.4S, v10.4S, v31.s[0] +mla v2.4S, v30.4S, v31.s[0] +add v6.4s, v6.4s, v8.4s +ldr q8, [x0, #672] +str q6, [x0, #320] +sqrdmulh v22.4S, v1.4S, v3.s[0] +mul v1.4S, v1.4S,v25.s[0] +sub v6.4s, v4.4s, v5.4s +ldr q28, [x0, #752] +str q6, [x0, #432] +sqrdmulh v6.4S, v16.4S, v3.s[0] +mul v16.4S, v16.4S,v25.s[0] +add v4.4s, v4.4s, v5.4s +ldr q5, [x0, #736] +str q4, [x0, #416] +sqrdmulh v4.4S, v13.4S, v21.s[0] +mul v13.4S, v13.4S,v9.s[0] +sub v30.4s, v24.4s, v15.4s +ldr q10, [x17, #+448] +str q30, [x0, #400] +sqrdmulh v30.4S, v19.4S, v21.s[0] +mul v19.4S, v19.4S,v9.s[0] +add v24.4s, v24.4s, v15.4s +ldr q15, [x17, #+464] +str q24, [x0, #384] +mla v1.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v20.4S, v15.s[0] +sub v24.4s, v18.4s, v17.4s +ldr q11, [x17, #+480] +str q24, [x0, #496] +mla v16.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v8.4S, v15.s[0] +add v18.4s, v18.4s, v17.4s +ldr q17, [x17, #+496] +str q18, [x0, #480] +mla v13.4S, v4.4S, v31.s[0] +sqrdmulh v4.4S, v28.4S, v17.s[0] +sub v18.4s, v26.4s, v2.4s +ldr q24, [x0, #528] +str q18, [x0, #464] +mla v19.4S, v30.4S, v31.s[0] +sqrdmulh v30.4S, v5.4S, v17.s[0] +add v26.4s, v26.4s, v2.4s +ldr q2, [x0, #512] +str q26, [x0, #448] +mul v20.4S, v20.4S,v10.s[0] +mul v8.4S, v8.4S,v10.s[0] +sub v26.4s, v24.4s, v1.4s +ldr q18, [x0, #592] +add v24.4s, v24.4s, v1.4s +mla v20.4S, v22.4S, v31.s[0] +mla v8.4S, v6.4S, v31.s[0] +sub v6.4s, v2.4s, v16.4s +ldr q22, [x0, #576] +add v2.4s, v2.4s, v16.4s +mul v28.4S, v28.4S,v11.s[0] +mul v5.4S, v5.4S,v11.s[0] +sub v16.4s, v18.4s, v13.4s +ldr 
q1, [x0, #656] +add v18.4s, v18.4s, v13.4s +mla v28.4S, v4.4S, v31.s[0] +mla v5.4S, v30.4S, v31.s[0] +sub v30.4s, v22.4s, v19.4s +ldr q4, [x0, #640] +add v22.4s, v22.4s, v19.4s +sqrdmulh v19.4S, v26.4S, v3.s[2] +mul v26.4S, v26.4S,v25.s[2] +sub v13.4s, v1.4s, v20.4s +ldr q29, [x0, #720] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v24.4S, v3.s[1] +mul v24.4S, v24.4S,v25.s[1] +sub v7.4s, v4.4s, v8.4s +ldr q27, [x0, #704] +add v4.4s, v4.4s, v8.4s +sqrdmulh v3.4S, v16.4S, v21.s[2] +mul v16.4S, v16.4S,v9.s[2] +sub v8.4s, v29.4s, v28.4s +ldr q25, [x0, #816] +add v29.4s, v29.4s, v28.4s +sqrdmulh v28.4S, v18.4S, v21.s[1] +mul v18.4S, v18.4S,v9.s[1] +sub v14.4s, v27.4s, v5.4s +ldr q23, [x0, #800] +add v27.4s, v27.4s, v5.4s +mla v26.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v13.4S, v15.s[2] +sub v21.4s, v6.4s, v26.4s +ldr q5, [x0, #880] +str q21, [x0, #560] +mla v24.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v1.4S, v15.s[1] +add v6.4s, v6.4s, v26.4s +ldr q26, [x0, #864] +str q6, [x0, #544] +mla v16.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v8.4S, v17.s[2] +sub v6.4s, v2.4s, v24.4s +ldr q21, [x17, #+512] +str q6, [x0, #528] +mla v18.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v29.4S, v17.s[1] +add v2.4s, v2.4s, v24.4s +ldr q24, [x17, #+528] +str q2, [x0, #512] +mul v13.4S, v13.4S,v10.s[2] +mul v1.4S, v1.4S,v10.s[1] +sub v2.4s, v30.4s, v16.4s +ldr q6, [x17, #+544] +str q2, [x0, #624] +mla v13.4S, v19.4S, v31.s[0] +mla v1.4S, v20.4S, v31.s[0] +add v30.4s, v30.4s, v16.4s +ldr q16, [x17, #+560] +str q30, [x0, #608] +mul v8.4S, v8.4S,v11.s[2] +mul v29.4S, v29.4S,v11.s[1] +sub v15.4s, v22.4s, v18.4s +ldr q30, [x0, #944] +str q15, [x0, #592] +mla v8.4S, v3.4S, v31.s[0] +mla v29.4S, v28.4S, v31.s[0] +add v22.4s, v22.4s, v18.4s +ldr q18, [x0, #928] +str q22, [x0, #576] +sqrdmulh v17.4S, v25.4S, v24.s[0] +mul v25.4S, v25.4S,v21.s[0] +sub v22.4s, v7.4s, v13.4s +ldr q11, [x0, #1008] +str q22, [x0, #688] +sqrdmulh v22.4S, v23.4S, v24.s[0] +mul v23.4S, v23.4S,v21.s[0] +add v7.4s, v7.4s, v13.4s +ldr q13, 
[x0, #992] +str q7, [x0, #672] +sqrdmulh v7.4S, v5.4S, v16.s[0] +mul v5.4S, v5.4S,v6.s[0] +sub v28.4s, v4.4s, v1.4s +ldr q3, [x17, #+576] +str q28, [x0, #656] +sqrdmulh v28.4S, v26.4S, v16.s[0] +mul v26.4S, v26.4S,v6.s[0] +add v4.4s, v4.4s, v1.4s +ldr q1, [x17, #+592] +str q4, [x0, #640] +mla v25.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v30.4S, v1.s[0] +sub v4.4s, v14.4s, v8.4s +ldr q15, [x17, #+608] +str q4, [x0, #752] +mla v23.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v18.4S, v1.s[0] +add v14.4s, v14.4s, v8.4s +ldr q8, [x17, #+624] +str q14, [x0, #736] +mla v5.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v11.4S, v8.s[0] +sub v14.4s, v27.4s, v29.4s +ldr q4, [x0, #784] +str q14, [x0, #720] +mla v26.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v13.4S, v8.s[0] +add v27.4s, v27.4s, v29.4s +ldr q29, [x0, #768] +str q27, [x0, #704] +mul v30.4S, v30.4S,v3.s[0] +mul v18.4S, v18.4S,v3.s[0] +sub v27.4s, v4.4s, v25.4s +ldr q14, [x0, #848] +add v4.4s, v4.4s, v25.4s +mla v30.4S, v17.4S, v31.s[0] +mla v18.4S, v22.4S, v31.s[0] +sub v22.4s, v29.4s, v23.4s +ldr q17, [x0, #832] +add v29.4s, v29.4s, v23.4s +mul v11.4S, v11.4S,v15.s[0] +mul v13.4S, v13.4S,v15.s[0] +sub v23.4s, v14.4s, v5.4s +ldr q25, [x0, #912] +add v14.4s, v14.4s, v5.4s +mla v11.4S, v7.4S, v31.s[0] +mla v13.4S, v28.4S, v31.s[0] +sub v28.4s, v17.4s, v26.4s +ldr q7, [x0, #896] +add v17.4s, v17.4s, v26.4s +sqrdmulh v26.4S, v27.4S, v24.s[2] +mul v27.4S, v27.4S,v21.s[2] +sub v5.4s, v25.4s, v30.4s +ldr q10, [x0, #976] +add v25.4s, v25.4s, v30.4s +sqrdmulh v30.4S, v4.4S, v24.s[1] +mul v4.4S, v4.4S,v21.s[1] +sub v20.4s, v7.4s, v18.4s +ldr q19, [x0, #960] +add v7.4s, v7.4s, v18.4s +sqrdmulh v24.4S, v23.4S, v16.s[2] +mul v23.4S, v23.4S,v6.s[2] +sub v18.4s, v10.4s, v11.4s +add v10.4s, v10.4s, v11.4s +sqrdmulh v11.4S, v14.4S, v16.s[1] +mul v14.4S, v14.4S,v6.s[1] +sub v21.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +mla v27.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v5.4S, v1.s[2] +sub v16.4s, v22.4s, v27.4s +str q16, [x0, #816] +mla v4.4S, v30.4S, 
v31.s[0] +sqrdmulh v30.4S, v25.4S, v1.s[1] +add v22.4s, v22.4s, v27.4s +str q22, [x0, #800] +mla v23.4S, v24.4S, v31.s[0] +sqrdmulh v24.4S, v18.4S, v8.s[2] +sub v22.4s, v29.4s, v4.4s +str q22, [x0, #784] +mla v14.4S, v11.4S, v31.s[0] +sqrdmulh v11.4S, v10.4S, v8.s[1] +add v29.4s, v29.4s, v4.4s +str q29, [x0, #768] +mul v5.4S, v5.4S,v3.s[2] +mul v25.4S, v25.4S,v3.s[1] +sub v29.4s, v28.4s, v23.4s +str q29, [x0, #880] +mla v5.4S, v26.4S, v31.s[0] +mla v25.4S, v30.4S, v31.s[0] +add v28.4s, v28.4s, v23.4s +str q28, [x0, #864] +mul v18.4S, v18.4S,v15.s[2] +mul v10.4S, v10.4S,v15.s[1] +sub v1.4s, v17.4s, v14.4s +str q1, [x0, #848] +mla v18.4S, v24.4S, v31.s[0] +mla v10.4S, v11.4S, v31.s[0] +add v17.4s, v17.4s, v14.4s +str q17, [x0, #832] +sub v8.4s, v20.4s, v5.4s +str q8, [x0, #944] +add v20.4s, v20.4s, v5.4s +str q20, [x0, #928] +sub v20.4s, v7.4s, v25.4s +str q20, [x0, #912] +add v7.4s, v7.4s, v25.4s +str q7, [x0, #896] +sub v7.4s, v21.4s, v18.4s +str q7, [x0, #1008] +add v21.4s, v21.4s, v18.4s +str q21, [x0, #992] +sub v21.4s, v19.4s, v10.4s +str q21, [x0, #976] +add v19.4s, v19.4s, v10.4s +str q19, [x0, #960] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1520 +// Instruction count: 1516 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_15.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_15.s new file mode 100644 index 0000000..afe097b --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_15.s @@ -0,0 +1,1550 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby 
granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global 
ntt_u32_incomplete_neon_asm_var_4_2_22_z4_15 +.global _ntt_u32_incomplete_neon_asm_var_4_2_22_z4_15 +ntt_u32_incomplete_neon_asm_var_4_2_22_z4_15: +_ntt_u32_incomplete_neon_asm_var_4_2_22_z4_15: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x0, #992] +sqrdmulh v27.4S, v28.4S, v29.s[0] +mul v28.4S, v28.4S,v30.s[0] +ldr q26, [x0, #928] +sqrdmulh v25.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +ldr q24, [x0, #864] +sqrdmulh v23.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +ldr q22, [x0, #800] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #736] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mla v28.4S, v27.4S, v31.s[0] +ldr q27, [x0, #672] +sqrdmulh v18.4S, v27.4S, v29.s[0] +mla v26.4S, v25.4S, v31.s[0] +ldr q25, [x0, #608] +sqrdmulh v17.4S, v25.4S, v29.s[0] +mla v24.4S, v23.4S, v31.s[0] +ldr q23, [x0, #544] +sqrdmulh v16.4S, v23.4S, v29.s[0] +mla v22.4S, v21.4S, v31.s[0] +ldr q21, [x0, #480] +mul v27.4S, v27.4S,v30.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q3, [x0, #416] +ldr q2, [x0, #352] +ldr q1, [x0, #288] +mla v27.4S, v18.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +ldr q19, [x0, #224] +ldr q18, [x0, #160] +mul v23.4S, v23.4S,v30.s[0] +mul v25.4S, v25.4S,v30.s[0] +ldr q0, [x0, #96] +ldr q15, [x0, #32] +mla v23.4S, v16.4S, v31.s[0] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +sqrdmulh v28.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v16.4s, v3.4s, v26.4s +add v3.4s, v3.4s, v26.4s +sqrdmulh v26.4S, v16.4S, v29.s[2] +mul v16.4S, 
v16.4S,v30.s[2] +sub v14.4s, v2.4s, v24.4s +add v2.4s, v2.4s, v24.4s +sqrdmulh v24.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v13.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v12.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +sqrdmulh v20.4S, v14.4S, v29.s[2] +mla v17.4S, v28.4S, v31.s[0] +sub v28.4s, v18.4s, v27.4s +add v18.4s, v18.4s, v27.4s +sqrdmulh v27.4S, v13.4S, v29.s[2] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v0.4s, v25.4s +add v0.4s, v0.4s, v25.4s +sqrdmulh v25.4S, v2.4S, v29.s[1] +mla v21.4S, v24.4S, v31.s[0] +sub v24.4s, v15.4s, v23.4s +sqrdmulh v11.4S, v1.4S, v29.s[1] +mla v3.4S, v22.4S, v31.s[0] +add v15.4s, v15.4s, v23.4s +ldr q23, [x17, #+32] +ldr q22, [x17, #+48] +mul v13.4S, v13.4S,v30.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v10.4s, v12.4s, v17.4s +add v12.4s, v12.4s, v17.4s +mla v13.4S, v27.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +sub v20.4s, v28.4s, v16.4s +add v28.4s, v28.4s, v16.4s +mul v1.4S, v1.4S,v30.s[1] +mul v2.4S, v2.4S,v30.s[1] +sub v16.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +mla v1.4S, v11.4S, v31.s[0] +mla v2.4S, v25.4S, v31.s[0] +sub v25.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v22.s[3] +mul v10.4S, v10.4S,v23.s[3] +sub v11.4s, v26.4s, v14.4s +add v26.4s, v26.4s, v14.4s +sqrdmulh v14.4S, v12.4S, v22.s[2] +mul v12.4S, v12.4S,v23.s[2] +sub v21.4s, v24.4s, v13.4s +add v24.4s, v24.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v22.s[1] +mul v16.4S, v16.4S,v23.s[1] +sub v27.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v17.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s +ldr q1, [x17, #+96] +ldr q9, [x17, #+112] +sqrdmulh v8.4S, v20.4S, v22.s[3] +mla v10.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v28.4S, v22.s[2] +mla v12.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v25.4S, v22.s[1] +mla v16.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v18.4S, v22.s[0] +mla 
v19.4S, v2.4S, v31.s[0] +nop +nop +ldr q2, [x17, #+64] +ldr q7, [x17, #+80] +mul v28.4S, v28.4S,v23.s[2] +mul v20.4S, v20.4S,v23.s[3] +sub v6.4s, v11.4s, v10.4s +add v11.4s, v11.4s, v10.4s +mla v28.4S, v3.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v26.4s, v12.4s +add v26.4s, v26.4s, v12.4s +mul v18.4S, v18.4S,v23.s[0] +mul v25.4S, v25.4S,v23.s[1] +sub v12.4s, v27.4s, v16.4s +add v27.4s, v27.4s, v16.4s +mla v18.4S, v13.4S, v31.s[0] +mla v25.4S, v14.4S, v31.s[0] +sub v14.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v9.s[3] +mul v6.4S, v6.4S,v1.s[3] +sub v13.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v11.4S, v9.s[2] +mul v11.4S, v11.4S,v1.s[2] +sub v16.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +sqrdmulh v28.4S, v8.4S, v9.s[1] +mul v8.4S, v8.4S,v1.s[1] +sub v3.4s, v17.4s, v25.4s +add v17.4s, v17.4s, v25.4s +sqrdmulh v25.4S, v26.4S, v9.s[0] +mul v26.4S, v26.4S,v1.s[0] +sub v10.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v7.s[3] +mla v6.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v27.4S, v7.s[2] +mla v11.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v14.4S, v7.s[1] +mla v8.4S, v28.4S, v31.s[0] +nop +nop +sqrdmulh v28.4S, v0.4S, v7.s[0] +mla v26.4S, v25.4S, v31.s[0] +nop +nop +mul v27.4S, v27.4S,v2.s[2] +mul v12.4S, v12.4S,v2.s[3] +sub v25.4s, v13.4s, v6.4s +str q25, [x0, #992] +mla v27.4S, v19.4S, v31.s[0] +mla v12.4S, v18.4S, v31.s[0] +add v13.4s, v13.4s, v6.4s +str q13, [x0, #928] +mul v0.4S, v0.4S,v2.s[0] +mul v14.4S, v14.4S,v2.s[1] +sub v13.4s, v21.4s, v11.4s +str q13, [x0, #864] +mla v0.4S, v28.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sub v11.4s, v16.4s, v8.4s +ldr q20, [x0, #1008] +sqrdmulh v28.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v8.4s +str q21, [x0, #800] +ldr q21, [x0, #944] +sqrdmulh v8.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v13.4s, v24.4s, v26.4s +str q11, [x0, #736] +ldr q11, [x0, 
#880] +sqrdmulh v6.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +add v24.4s, v24.4s, v26.4s +str q16, [x0, #672] +ldr q16, [x0, #816] +sqrdmulh v26.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v18.4s, v3.4s, v12.4s +str q13, [x0, #608] +ldr q13, [x0, #752] +sqrdmulh v19.4S, v13.4S, v29.s[0] +mla v20.4S, v28.4S, v31.s[0] +add v3.4s, v3.4s, v12.4s +str q24, [x0, #544] +ldr q24, [x0, #688] +sqrdmulh v12.4S, v24.4S, v29.s[0] +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v17.4s, v27.4s +str q18, [x0, #480] +ldr q18, [x0, #624] +sqrdmulh v28.4S, v18.4S, v29.s[0] +mla v11.4S, v6.4S, v31.s[0] +add v17.4s, v17.4s, v27.4s +str q3, [x0, #416] +ldr q3, [x0, #560] +sqrdmulh v27.4S, v3.4S, v29.s[0] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v10.4s, v14.4s +str q8, [x0, #352] +ldr q8, [x0, #496] +add v10.4s, v10.4s, v14.4s +mul v24.4S, v24.4S,v30.s[0] +mul v13.4S, v13.4S,v30.s[0] +ldr q14, [x0, #432] +str q17, [x0, #288] +ldr q17, [x0, #368] +ldr q6, [x0, #304] +mla v24.4S, v12.4S, v31.s[0] +mla v13.4S, v19.4S, v31.s[0] +str q26, [x0, #224] +sub v26.4s, v15.4s, v0.4s +ldr q19, [x0, #240] +ldr q12, [x0, #176] +mul v3.4S, v3.4S,v30.s[0] +mul v18.4S, v18.4S,v30.s[0] +str q10, [x0, #160] +add v15.4s, v15.4s, v0.4s +ldr q0, [x0, #112] +ldr q10, [x0, #48] +mla v3.4S, v27.4S, v31.s[0] +mla v18.4S, v28.4S, v31.s[0] +sub v28.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +sqrdmulh v20.4S, v28.4S, v29.s[2] +mul v28.4S, v28.4S,v30.s[2] +sub v27.4s, v14.4s, v21.4s +add v14.4s, v14.4s, v21.4s +sqrdmulh v21.4S, v27.4S, v29.s[2] +mul v27.4S, v27.4S,v30.s[2] +sub v25.4s, v17.4s, v11.4s +add v17.4s, v17.4s, v11.4s +sqrdmulh v11.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v5.4s, v6.4s, v16.4s +add v6.4s, v6.4s, v16.4s +sqrdmulh v16.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +sub v4.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +sqrdmulh v13.4S, v25.4S, v29.s[2] +mla v28.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v24.4s +add v12.4s, v12.4s, v24.4s +sqrdmulh v24.4S, v5.4S, 
v29.s[2] +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v18.4s +add v0.4s, v0.4s, v18.4s +sqrdmulh v18.4S, v17.4S, v29.s[1] +mla v8.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v3.4s +str q26, [x0, #96] +sqrdmulh v26.4S, v6.4S, v29.s[1] +mla v14.4S, v16.4S, v31.s[0] +add v10.4s, v10.4s, v3.4s +str q15, [x0, #32] +mul v5.4S, v5.4S,v30.s[2] +mul v25.4S, v25.4S,v30.s[2] +sub v15.4s, v4.4s, v28.4s +add v4.4s, v4.4s, v28.4s +mla v5.4S, v24.4S, v31.s[0] +mla v25.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v27.4s +add v20.4s, v20.4s, v27.4s +mul v6.4S, v6.4S,v30.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v27.4s, v19.4s, v8.4s +add v19.4s, v19.4s, v8.4s +mla v6.4S, v26.4S, v31.s[0] +mla v17.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v14.4s +add v12.4s, v12.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v22.s[3] +mul v15.4S, v15.4S,v23.s[3] +sub v26.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v4.4S, v22.s[2] +mul v4.4S, v4.4S,v23.s[2] +sub v8.4s, v11.4s, v5.4s +add v11.4s, v11.4s, v5.4s +sqrdmulh v5.4S, v27.4S, v22.s[1] +mul v27.4S, v27.4S,v23.s[1] +sub v24.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v28.4s, v10.4s, v6.4s +add v10.4s, v10.4s, v6.4s +sqrdmulh v6.4S, v13.4S, v22.s[3] +mla v15.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v20.4S, v22.s[2] +mla v4.4S, v25.4S, v31.s[0] +nop +nop +sqrdmulh v25.4S, v18.4S, v22.s[1] +mla v27.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v12.4S, v22.s[0] +mla v19.4S, v17.4S, v31.s[0] +nop +nop +mul v20.4S, v20.4S,v23.s[2] +mul v13.4S, v13.4S,v23.s[3] +sub v17.4s, v26.4s, v15.4s +add v26.4s, v26.4s, v15.4s +mla v20.4S, v14.4S, v31.s[0] +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v21.4s, v4.4s +add v21.4s, v21.4s, v4.4s +mul v12.4S, v12.4S,v23.s[0] +mul v18.4S, v18.4S,v23.s[1] +sub v4.4s, v24.4s, v27.4s +add v24.4s, v24.4s, v27.4s +mla v12.4S, v5.4S, v31.s[0] +mla v18.4S, v25.4S, v31.s[0] +sub v25.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, 
v17.4S, v9.s[3] +mul v17.4S, v17.4S,v1.s[3] +sub v5.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +sqrdmulh v13.4S, v26.4S, v9.s[2] +mul v26.4S, v26.4S,v1.s[2] +sub v27.4s, v11.4s, v20.4s +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v6.4S, v9.s[1] +mul v6.4S, v6.4S,v1.s[1] +sub v14.4s, v28.4s, v18.4s +add v28.4s, v28.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v9.s[0] +mul v21.4S, v21.4S,v1.s[0] +sub v15.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v4.4S, v7.s[3] +mla v17.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v24.4S, v7.s[2] +mla v26.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v25.4S, v7.s[1] +mla v6.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v0.4S, v7.s[0] +mla v21.4S, v18.4S, v31.s[0] +nop +nop +mul v24.4S, v24.4S,v2.s[2] +mul v4.4S, v4.4S,v2.s[3] +sub v18.4s, v5.4s, v17.4s +str q18, [x0, #1008] +mla v24.4S, v19.4S, v31.s[0] +mla v4.4S, v12.4S, v31.s[0] +add v5.4s, v5.4s, v17.4s +str q5, [x0, #944] +mul v0.4S, v0.4S,v2.s[0] +mul v25.4S, v25.4S,v2.s[1] +sub v5.4s, v8.4s, v26.4s +str q5, [x0, #880] +mla v0.4S, v20.4S, v31.s[0] +mla v25.4S, v13.4S, v31.s[0] +add v8.4s, v8.4s, v26.4s +sub v26.4s, v27.4s, v6.4s +ldr q13, [x0, #960] +sqrdmulh v20.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +add v27.4s, v27.4s, v6.4s +str q8, [x0, #816] +ldr q8, [x0, #896] +sqrdmulh v6.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v5.4s, v11.4s, v21.4s +str q26, [x0, #752] +ldr q26, [x0, #832] +sqrdmulh v17.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +add v11.4s, v11.4s, v21.4s +str q27, [x0, #688] +ldr q27, [x0, #768] +sqrdmulh v21.4S, v27.4S, v29.s[0] +mul v27.4S, v27.4S,v30.s[0] +sub v12.4s, v14.4s, v4.4s +str q5, [x0, #624] +ldr q5, [x0, #704] +sqrdmulh v19.4S, v5.4S, v29.s[0] +mla v13.4S, v20.4S, v31.s[0] +add v14.4s, v14.4s, v4.4s +str q11, [x0, #560] +ldr q11, [x0, #640] +sqrdmulh v4.4S, v11.4S, v29.s[0] +mla v8.4S, v6.4S, v31.s[0] +sub v6.4s, v28.4s, v24.4s +str q12, [x0, #496] +ldr q12, [x0, #576] +sqrdmulh v20.4S, v12.4S, 
v29.s[0] +mla v26.4S, v17.4S, v31.s[0] +add v28.4s, v28.4s, v24.4s +str q14, [x0, #432] +ldr q14, [x0, #512] +sqrdmulh v24.4S, v14.4S, v29.s[0] +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v25.4s +str q6, [x0, #368] +ldr q6, [x0, #448] +add v15.4s, v15.4s, v25.4s +mul v11.4S, v11.4S,v30.s[0] +mul v5.4S, v5.4S,v30.s[0] +ldr q25, [x0, #384] +str q28, [x0, #304] +ldr q28, [x0, #320] +ldr q17, [x0, #256] +mla v11.4S, v4.4S, v31.s[0] +mla v5.4S, v19.4S, v31.s[0] +str q21, [x0, #240] +sub v21.4s, v10.4s, v0.4s +ldr q19, [x0, #192] +ldr q4, [x0, #128] +mul v14.4S, v14.4S,v30.s[0] +mul v12.4S, v12.4S,v30.s[0] +str q15, [x0, #176] +add v10.4s, v10.4s, v0.4s +ldr q0, [x0, #64] +ldr q15, [x0, #0] +mla v14.4S, v24.4S, v31.s[0] +mla v12.4S, v20.4S, v31.s[0] +sub v20.4s, v6.4s, v13.4s +add v6.4s, v6.4s, v13.4s +sqrdmulh v13.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v24.4s, v25.4s, v8.4s +add v25.4s, v25.4s, v8.4s +sqrdmulh v8.4S, v24.4S, v29.s[2] +mul v24.4S, v24.4S,v30.s[2] +sub v18.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sqrdmulh v26.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +sub v3.4s, v17.4s, v27.4s +add v17.4s, v17.4s, v27.4s +sqrdmulh v27.4S, v25.4S, v29.s[1] +mul v25.4S, v25.4S,v30.s[1] +sub v16.4s, v19.4s, v5.4s +add v19.4s, v19.4s, v5.4s +sqrdmulh v5.4S, v18.4S, v29.s[2] +mla v20.4S, v13.4S, v31.s[0] +sub v13.4s, v4.4s, v11.4s +add v4.4s, v4.4s, v11.4s +sqrdmulh v11.4S, v3.4S, v29.s[2] +mla v24.4S, v8.4S, v31.s[0] +sub v8.4s, v0.4s, v12.4s +add v0.4s, v0.4s, v12.4s +sqrdmulh v12.4S, v28.4S, v29.s[1] +mla v6.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v14.4s +str q21, [x0, #112] +sqrdmulh v21.4S, v17.4S, v29.s[1] +mla v25.4S, v27.4S, v31.s[0] +add v15.4s, v15.4s, v14.4s +str q10, [x0, #48] +mul v3.4S, v3.4S,v30.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v10.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +mla v3.4S, v11.4S, v31.s[0] +mla v18.4S, v5.4S, v31.s[0] +sub v5.4s, v13.4s, v24.4s +add v13.4s, v13.4s, v24.4s +mul v17.4S, v17.4S,v30.s[1] 
+mul v28.4S, v28.4S,v30.s[1] +sub v24.4s, v19.4s, v6.4s +add v19.4s, v19.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v28.4S, v12.4S, v31.s[0] +sub v12.4s, v4.4s, v25.4s +add v4.4s, v4.4s, v25.4s +sqrdmulh v25.4S, v10.4S, v22.s[3] +mul v10.4S, v10.4S,v23.s[3] +sub v21.4s, v8.4s, v18.4s +add v8.4s, v8.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v22.s[2] +mul v16.4S, v16.4S,v23.s[2] +sub v6.4s, v26.4s, v3.4s +add v26.4s, v26.4s, v3.4s +sqrdmulh v3.4S, v24.4S, v22.s[1] +mul v24.4S, v24.4S,v23.s[1] +sub v11.4s, v0.4s, v28.4s +add v0.4s, v0.4s, v28.4s +sqrdmulh v28.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v20.4s, v15.4s, v17.4s +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v22.s[3] +mla v10.4S, v25.4S, v31.s[0] +nop +nop +sqrdmulh v25.4S, v13.4S, v22.s[2] +mla v16.4S, v18.4S, v31.s[0] +nop +nop +sqrdmulh v18.4S, v12.4S, v22.s[1] +mla v24.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v4.4S, v22.s[0] +mla v19.4S, v28.4S, v31.s[0] +nop +nop +mul v13.4S, v13.4S,v23.s[2] +mul v5.4S, v5.4S,v23.s[3] +sub v28.4s, v21.4s, v10.4s +add v21.4s, v21.4s, v10.4s +mla v13.4S, v25.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +sub v17.4s, v8.4s, v16.4s +add v8.4s, v8.4s, v16.4s +mul v4.4S, v4.4S,v23.s[0] +mul v12.4S, v12.4S,v23.s[1] +sub v16.4s, v11.4s, v24.4s +add v11.4s, v11.4s, v24.4s +mla v4.4S, v3.4S, v31.s[0] +mla v12.4S, v18.4S, v31.s[0] +sub v18.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v28.4S, v9.s[3] +mul v28.4S, v28.4S,v1.s[3] +sub v3.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v21.4S, v9.s[2] +mul v21.4S, v21.4S,v1.s[2] +sub v24.4s, v26.4s, v13.4s +add v26.4s, v26.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v9.s[1] +mul v17.4S, v17.4S,v1.s[1] +sub v25.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +sqrdmulh v12.4S, v8.4S, v9.s[0] +mul v8.4S, v8.4S,v1.s[0] +sub v10.4s, v15.4s, v4.4s +add v15.4s, v15.4s, v4.4s +sqrdmulh v4.4S, v16.4S, v7.s[3] +mla v28.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v11.4S, v7.s[2] +mla v21.4S, v5.4S, 
v31.s[0] +nop +nop +sqrdmulh v5.4S, v18.4S, v7.s[1] +mla v17.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v0.4S, v7.s[0] +mla v8.4S, v12.4S, v31.s[0] +nop +nop +mul v11.4S, v11.4S,v2.s[2] +mul v16.4S, v16.4S,v2.s[3] +sub v12.4s, v3.4s, v28.4s +str q12, [x0, #960] +mla v11.4S, v19.4S, v31.s[0] +mla v16.4S, v4.4S, v31.s[0] +add v3.4s, v3.4s, v28.4s +str q3, [x0, #896] +mul v0.4S, v0.4S,v2.s[0] +mul v18.4S, v18.4S,v2.s[1] +sub v3.4s, v6.4s, v21.4s +str q3, [x0, #832] +mla v0.4S, v13.4S, v31.s[0] +mla v18.4S, v5.4S, v31.s[0] +add v6.4s, v6.4s, v21.4s +sub v21.4s, v24.4s, v17.4s +ldr q5, [x0, #976] +sqrdmulh v13.4S, v5.4S, v29.s[0] +mul v5.4S, v5.4S,v30.s[0] +add v24.4s, v24.4s, v17.4s +str q6, [x0, #768] +ldr q6, [x0, #912] +sqrdmulh v17.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v3.4s, v26.4s, v8.4s +str q21, [x0, #704] +ldr q21, [x0, #848] +sqrdmulh v28.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +add v26.4s, v26.4s, v8.4s +str q24, [x0, #640] +ldr q24, [x0, #784] +sqrdmulh v8.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +sub v4.4s, v25.4s, v16.4s +str q3, [x0, #576] +ldr q3, [x0, #720] +sqrdmulh v19.4S, v3.4S, v29.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v25.4s, v25.4s, v16.4s +str q26, [x0, #512] +ldr q26, [x0, #656] +sqrdmulh v16.4S, v26.4S, v29.s[0] +mla v6.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v11.4s +str q4, [x0, #448] +ldr q4, [x0, #592] +sqrdmulh v13.4S, v4.4S, v29.s[0] +mla v21.4S, v28.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q25, [x0, #384] +ldr q25, [x0, #528] +sqrdmulh v11.4S, v25.4S, v29.s[0] +mla v24.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v18.4s +str q17, [x0, #320] +ldr q17, [x0, #464] +add v10.4s, v10.4s, v18.4s +mul v26.4S, v26.4S,v30.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q18, [x0, #400] +str q20, [x0, #256] +ldr q20, [x0, #336] +ldr q28, [x0, #272] +mla v26.4S, v16.4S, v31.s[0] +mla v3.4S, v19.4S, v31.s[0] +str q8, [x0, #192] +sub v8.4s, v15.4s, v0.4s +ldr q19, [x0, #208] +ldr q16, [x0, #144] +mul v25.4S, 
v25.4S,v30.s[0] +mul v4.4S, v4.4S,v30.s[0] +str q10, [x0, #128] +add v15.4s, v15.4s, v0.4s +ldr q0, [x0, #80] +ldr q10, [x0, #16] +mla v25.4S, v11.4S, v31.s[0] +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v5.4s +add v17.4s, v17.4s, v5.4s +sqrdmulh v5.4S, v13.4S, v29.s[2] +mul v13.4S, v13.4S,v30.s[2] +sub v11.4s, v18.4s, v6.4s +add v18.4s, v18.4s, v6.4s +sqrdmulh v6.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v12.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v28.4s, v24.4s +add v28.4s, v28.4s, v24.4s +sqrdmulh v24.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v27.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sqrdmulh v3.4S, v12.4S, v29.s[2] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v16.4s, v26.4s +add v16.4s, v16.4s, v26.4s +sqrdmulh v26.4S, v14.4S, v29.s[2] +mla v11.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v4.4s +add v0.4s, v0.4s, v4.4s +sqrdmulh v4.4S, v20.4S, v29.s[1] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v25.4s +str q8, [x0, #64] +sqrdmulh v8.4S, v28.4S, v29.s[1] +mla v18.4S, v24.4S, v31.s[0] +add v10.4s, v10.4s, v25.4s +str q15, [x0, #0] +mul v14.4S, v14.4S,v30.s[2] +mul v12.4S, v12.4S,v30.s[2] +sub v15.4s, v27.4s, v13.4s +add v27.4s, v27.4s, v13.4s +mla v14.4S, v26.4S, v31.s[0] +mla v12.4S, v3.4S, v31.s[0] +sub v3.4s, v5.4s, v11.4s +add v5.4s, v5.4s, v11.4s +mul v28.4S, v28.4S,v30.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v11.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +mla v28.4S, v8.4S, v31.s[0] +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v16.4s, v18.4s +add v16.4s, v16.4s, v18.4s +sqrdmulh v29.4S, v15.4S, v22.s[3] +mul v15.4S, v15.4S,v23.s[3] +sub v30.4s, v6.4s, v12.4s +add v6.4s, v6.4s, v12.4s +sqrdmulh v12.4S, v27.4S, v22.s[2] +mul v27.4S, v27.4S,v23.s[2] +sub v18.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v22.s[1] +mul v11.4S, v11.4S,v23.s[1] +sub v8.4s, v0.4s, v20.4s +add v0.4s, v0.4s, v20.4s +sqrdmulh v20.4S, 
v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v17.4s, v10.4s, v28.4s +add v10.4s, v10.4s, v28.4s +sqrdmulh v28.4S, v3.4S, v22.s[3] +mla v15.4S, v29.4S, v31.s[0] +nop +nop +sqrdmulh v29.4S, v5.4S, v22.s[2] +mla v27.4S, v12.4S, v31.s[0] +nop +nop +sqrdmulh v12.4S, v4.4S, v22.s[1] +mla v11.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v16.4S, v22.s[0] +mla v19.4S, v20.4S, v31.s[0] +nop +nop +mul v5.4S, v5.4S,v23.s[2] +mul v3.4S, v3.4S,v23.s[3] +sub v20.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +mla v5.4S, v29.4S, v31.s[0] +mla v3.4S, v28.4S, v31.s[0] +sub v28.4s, v6.4s, v27.4s +add v6.4s, v6.4s, v27.4s +mul v16.4S, v16.4S,v23.s[0] +mul v4.4S, v4.4S,v23.s[1] +sub v27.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v16.4S, v14.4S, v31.s[0] +mla v4.4S, v12.4S, v31.s[0] +sub v12.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v22.4S, v20.4S, v9.s[3] +mul v20.4S, v20.4S,v1.s[3] +sub v23.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v30.4S, v9.s[2] +mul v30.4S, v30.4S,v1.s[2] +sub v19.4s, v21.4s, v5.4s +add v21.4s, v21.4s, v5.4s +sqrdmulh v5.4S, v28.4S, v9.s[1] +mul v28.4S, v28.4S,v1.s[1] +sub v14.4s, v17.4s, v4.4s +add v17.4s, v17.4s, v4.4s +sqrdmulh v4.4S, v6.4S, v9.s[0] +mul v6.4S, v6.4S,v1.s[0] +sub v11.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sqrdmulh v9.4S, v27.4S, v7.s[3] +mla v20.4S, v22.4S, v31.s[0] +nop +nop +sqrdmulh v22.4S, v8.4S, v7.s[2] +mla v30.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v12.4S, v7.s[1] +mla v28.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v0.4S, v7.s[0] +mla v6.4S, v4.4S, v31.s[0] +nop +nop +mul v8.4S, v8.4S,v2.s[2] +mul v27.4S, v27.4S,v2.s[3] +sub v4.4s, v23.4s, v20.4s +str q4, [x0, #976] +mla v8.4S, v22.4S, v31.s[0] +mla v27.4S, v9.4S, v31.s[0] +add v23.4s, v23.4s, v20.4s +str q23, [x0, #912] +mul v0.4S, v0.4S,v2.s[0] +mul v12.4S, v12.4S,v2.s[1] +sub v23.4s, v18.4s, v30.4s +str q23, [x0, #848] +mla v0.4S, v5.4S, v31.s[0] +mla v12.4S, v3.4S, v31.s[0] +add v18.4s, v18.4s, v30.4s +sub v30.4s, 
v19.4s, v28.4s +add v19.4s, v19.4s, v28.4s +str q18, [x0, #784] +sub v18.4s, v21.4s, v6.4s +str q30, [x0, #720] +add v21.4s, v21.4s, v6.4s +str q19, [x0, #656] +sub v19.4s, v14.4s, v27.4s +str q18, [x0, #592] +add v14.4s, v14.4s, v27.4s +str q21, [x0, #528] +sub v21.4s, v17.4s, v8.4s +str q19, [x0, #464] +add v17.4s, v17.4s, v8.4s +str q14, [x0, #400] +sub v14.4s, v11.4s, v12.4s +str q21, [x0, #336] +add v11.4s, v11.4s, v12.4s +str q17, [x0, #272] +sub v17.4s, v10.4s, v0.4s +add v10.4s, v10.4s, v0.4s +ldr q24, [x0, #48] +ldr q25, [x0, #32] +ldr q13, [x0, #112] +ldr q26, [x0, #96] +ldr q15, [x17, #+128] +ldr q29, [x17, #+144] +ldr q16, [x17, #+160] +ldr q1, [x17, #+176] +ldr q4, [x0, #176] +ldr q22, [x0, #160] +sqrdmulh v9.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v15.s[0] +ldr q20, [x0, #240] +sqrdmulh v23.4S, v25.4S, v29.s[0] +mul v25.4S, v25.4S,v15.s[0] +ldr q5, [x0, #224] +sqrdmulh v3.4S, v13.4S, v1.s[0] +mul v13.4S, v13.4S,v16.s[0] +ldr q2, [x17, #+192] +sqrdmulh v7.4S, v26.4S, v1.s[0] +mul v26.4S, v26.4S,v16.s[0] +ldr q28, [x17, #+208] +mla v24.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v4.4S, v28.s[0] +ldr q30, [x17, #+224] +mla v25.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v22.4S, v28.s[0] +ldr q6, [x17, #+240] +mla v13.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v20.4S, v6.s[0] +mla v26.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v5.4S, v6.s[0] +ldr q18, [x0, #0] +mul v4.4S, v4.4S,v2.s[0] +mul v22.4S, v22.4S,v2.s[0] +mla v4.4S, v9.4S, v31.s[0] +mla v22.4S, v23.4S, v31.s[0] +ldr q23, [x0, #64] +mul v20.4S, v20.4S,v30.s[0] +mul v5.4S, v5.4S,v30.s[0] +sub v9.4s, v10.4s, v24.4s +add v10.4s, v10.4s, v24.4s +mla v20.4S, v3.4S, v31.s[0] +mla v5.4S, v7.4S, v31.s[0] +sub v7.4s, v18.4s, v25.4s +ldr q3, [x0, #128] +add v18.4s, v18.4s, v25.4s +sqrdmulh v25.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v15.s[2] +sub v24.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +sqrdmulh v13.4S, v10.4S, v29.s[1] +mul v10.4S, v10.4S,v15.s[1] +sub v27.4s, v23.4s, v26.4s +ldr q19, [x0, #192] +add v23.4s, v23.4s, 
v26.4s +sqrdmulh v29.4S, v24.4S, v1.s[2] +mul v24.4S, v24.4S,v16.s[2] +sub v26.4s, v11.4s, v4.4s +ldr q15, [x0, #304] +add v11.4s, v11.4s, v4.4s +sqrdmulh v4.4S, v17.4S, v1.s[1] +mul v17.4S, v17.4S,v16.s[1] +sub v8.4s, v3.4s, v22.4s +ldr q21, [x0, #288] +add v3.4s, v3.4s, v22.4s +mla v9.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v26.4S, v28.s[2] +sub v1.4s, v14.4s, v20.4s +ldr q22, [x0, #368] +add v14.4s, v14.4s, v20.4s +mla v10.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v11.4S, v28.s[1] +sub v20.4s, v19.4s, v5.4s +ldr q16, [x0, #352] +add v19.4s, v19.4s, v5.4s +mla v24.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v1.4S, v6.s[2] +sub v5.4s, v7.4s, v9.4s +ldr q12, [x17, #+256] +str q5, [x0, #48] +mla v17.4S, v4.4S, v31.s[0] +sqrdmulh v4.4S, v14.4S, v6.s[1] +add v7.4s, v7.4s, v9.4s +ldr q9, [x17, #+272] +str q7, [x0, #32] +mul v26.4S, v26.4S,v2.s[2] +mul v11.4S, v11.4S,v2.s[1] +sub v7.4s, v18.4s, v10.4s +ldr q5, [x17, #+288] +str q7, [x0, #16] +mla v26.4S, v25.4S, v31.s[0] +mla v11.4S, v13.4S, v31.s[0] +add v18.4s, v18.4s, v10.4s +ldr q10, [x17, #+304] +str q18, [x0, #0] +mul v1.4S, v1.4S,v30.s[2] +mul v14.4S, v14.4S,v30.s[1] +sub v28.4s, v27.4s, v24.4s +ldr q18, [x0, #432] +str q28, [x0, #112] +mla v1.4S, v29.4S, v31.s[0] +mla v14.4S, v4.4S, v31.s[0] +add v27.4s, v27.4s, v24.4s +ldr q24, [x0, #416] +str q27, [x0, #96] +sqrdmulh v6.4S, v15.4S, v9.s[0] +mul v15.4S, v15.4S,v12.s[0] +sub v27.4s, v23.4s, v17.4s +ldr q30, [x0, #496] +str q27, [x0, #80] +sqrdmulh v27.4S, v21.4S, v9.s[0] +mul v21.4S, v21.4S,v12.s[0] +add v23.4s, v23.4s, v17.4s +ldr q17, [x0, #480] +str q23, [x0, #64] +sqrdmulh v23.4S, v22.4S, v10.s[0] +mul v22.4S, v22.4S,v5.s[0] +sub v4.4s, v8.4s, v26.4s +ldr q29, [x17, #+320] +str q4, [x0, #176] +sqrdmulh v4.4S, v16.4S, v10.s[0] +mul v16.4S, v16.4S,v5.s[0] +add v8.4s, v8.4s, v26.4s +ldr q26, [x17, #+336] +str q8, [x0, #160] +mla v15.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v18.4S, v26.s[0] +sub v8.4s, v3.4s, v11.4s +ldr q28, [x17, #+352] +str q8, [x0, #144] +mla v21.4S, 
v27.4S, v31.s[0] +sqrdmulh v27.4S, v24.4S, v26.s[0] +add v3.4s, v3.4s, v11.4s +ldr q11, [x17, #+368] +str q3, [x0, #128] +mla v22.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v30.4S, v11.s[0] +sub v3.4s, v20.4s, v1.4s +ldr q8, [x0, #272] +str q3, [x0, #240] +mla v16.4S, v4.4S, v31.s[0] +sqrdmulh v4.4S, v17.4S, v11.s[0] +add v20.4s, v20.4s, v1.4s +ldr q1, [x0, #256] +str q20, [x0, #224] +mul v18.4S, v18.4S,v29.s[0] +mul v24.4S, v24.4S,v29.s[0] +sub v20.4s, v19.4s, v14.4s +ldr q3, [x0, #336] +str q20, [x0, #208] +mla v18.4S, v6.4S, v31.s[0] +mla v24.4S, v27.4S, v31.s[0] +add v19.4s, v19.4s, v14.4s +ldr q14, [x0, #320] +str q19, [x0, #192] +mul v30.4S, v30.4S,v28.s[0] +mul v17.4S, v17.4S,v28.s[0] +sub v19.4s, v8.4s, v15.4s +ldr q27, [x0, #400] +add v8.4s, v8.4s, v15.4s +mla v30.4S, v23.4S, v31.s[0] +mla v17.4S, v4.4S, v31.s[0] +sub v4.4s, v1.4s, v21.4s +ldr q23, [x0, #384] +add v1.4s, v1.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v9.s[2] +mul v19.4S, v19.4S,v12.s[2] +sub v15.4s, v3.4s, v22.4s +ldr q6, [x0, #464] +add v3.4s, v3.4s, v22.4s +sqrdmulh v22.4S, v8.4S, v9.s[1] +mul v8.4S, v8.4S,v12.s[1] +sub v20.4s, v14.4s, v16.4s +ldr q2, [x0, #448] +add v14.4s, v14.4s, v16.4s +sqrdmulh v9.4S, v15.4S, v10.s[2] +mul v15.4S, v15.4S,v5.s[2] +sub v16.4s, v27.4s, v18.4s +ldr q12, [x0, #560] +add v27.4s, v27.4s, v18.4s +sqrdmulh v18.4S, v3.4S, v10.s[1] +mul v3.4S, v3.4S,v5.s[1] +sub v13.4s, v23.4s, v24.4s +ldr q25, [x0, #544] +add v23.4s, v23.4s, v24.4s +mla v19.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v16.4S, v26.s[2] +sub v10.4s, v6.4s, v30.4s +ldr q24, [x0, #624] +add v6.4s, v6.4s, v30.4s +mla v8.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v27.4S, v26.s[1] +sub v30.4s, v2.4s, v17.4s +ldr q5, [x0, #608] +add v2.4s, v2.4s, v17.4s +mla v15.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v10.4S, v11.s[2] +sub v17.4s, v4.4s, v19.4s +ldr q7, [x17, #+384] +str q17, [x0, #304] +mla v3.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v6.4S, v11.s[1] +add v4.4s, v4.4s, v19.4s +ldr q19, [x17, #+400] +str q4, [x0, #288] +mul 
v16.4S, v16.4S,v29.s[2] +mul v27.4S, v27.4S,v29.s[1] +sub v4.4s, v1.4s, v8.4s +ldr q17, [x17, #+416] +str q4, [x0, #272] +mla v16.4S, v21.4S, v31.s[0] +mla v27.4S, v22.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +ldr q8, [x17, #+432] +str q1, [x0, #256] +mul v10.4S, v10.4S,v28.s[2] +mul v6.4S, v6.4S,v28.s[1] +sub v26.4s, v20.4s, v15.4s +ldr q1, [x0, #688] +str q26, [x0, #368] +mla v10.4S, v9.4S, v31.s[0] +mla v6.4S, v18.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +ldr q15, [x0, #672] +str q20, [x0, #352] +sqrdmulh v11.4S, v12.4S, v19.s[0] +mul v12.4S, v12.4S,v7.s[0] +sub v20.4s, v14.4s, v3.4s +ldr q28, [x0, #752] +str q20, [x0, #336] +sqrdmulh v20.4S, v25.4S, v19.s[0] +mul v25.4S, v25.4S,v7.s[0] +add v14.4s, v14.4s, v3.4s +ldr q3, [x0, #736] +str q14, [x0, #320] +sqrdmulh v14.4S, v24.4S, v8.s[0] +mul v24.4S, v24.4S,v17.s[0] +sub v18.4s, v13.4s, v16.4s +ldr q9, [x17, #+448] +str q18, [x0, #432] +sqrdmulh v18.4S, v5.4S, v8.s[0] +mul v5.4S, v5.4S,v17.s[0] +add v13.4s, v13.4s, v16.4s +ldr q16, [x17, #+464] +str q13, [x0, #416] +mla v12.4S, v11.4S, v31.s[0] +sqrdmulh v11.4S, v1.4S, v16.s[0] +sub v13.4s, v23.4s, v27.4s +ldr q26, [x17, #+480] +str q13, [x0, #400] +mla v25.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v15.4S, v16.s[0] +add v23.4s, v23.4s, v27.4s +ldr q27, [x17, #+496] +str q23, [x0, #384] +mla v24.4S, v14.4S, v31.s[0] +sqrdmulh v14.4S, v28.4S, v27.s[0] +sub v23.4s, v30.4s, v10.4s +ldr q13, [x0, #528] +str q23, [x0, #496] +mla v5.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v3.4S, v27.s[0] +add v30.4s, v30.4s, v10.4s +ldr q10, [x0, #512] +str q30, [x0, #480] +mul v1.4S, v1.4S,v9.s[0] +mul v15.4S, v15.4S,v9.s[0] +sub v30.4s, v2.4s, v6.4s +ldr q23, [x0, #592] +str q30, [x0, #464] +mla v1.4S, v11.4S, v31.s[0] +mla v15.4S, v20.4S, v31.s[0] +add v2.4s, v2.4s, v6.4s +ldr q6, [x0, #576] +str q2, [x0, #448] +mul v28.4S, v28.4S,v26.s[0] +mul v3.4S, v3.4S,v26.s[0] +sub v2.4s, v13.4s, v12.4s +ldr q20, [x0, #656] +add v13.4s, v13.4s, v12.4s +mla v28.4S, v14.4S, v31.s[0] +mla v3.4S, v18.4S, 
v31.s[0] +sub v18.4s, v10.4s, v25.4s +ldr q14, [x0, #640] +add v10.4s, v10.4s, v25.4s +sqrdmulh v25.4S, v2.4S, v19.s[2] +mul v2.4S, v2.4S,v7.s[2] +sub v12.4s, v23.4s, v24.4s +ldr q11, [x0, #720] +add v23.4s, v23.4s, v24.4s +sqrdmulh v24.4S, v13.4S, v19.s[1] +mul v13.4S, v13.4S,v7.s[1] +sub v30.4s, v6.4s, v5.4s +ldr q29, [x0, #704] +add v6.4s, v6.4s, v5.4s +sqrdmulh v19.4S, v12.4S, v8.s[2] +mul v12.4S, v12.4S,v17.s[2] +sub v5.4s, v20.4s, v1.4s +ldr q7, [x0, #816] +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v23.4S, v8.s[1] +mul v23.4S, v23.4S,v17.s[1] +sub v22.4s, v14.4s, v15.4s +ldr q21, [x0, #800] +add v14.4s, v14.4s, v15.4s +mla v2.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v5.4S, v16.s[2] +sub v8.4s, v11.4s, v28.4s +ldr q15, [x0, #880] +add v11.4s, v11.4s, v28.4s +mla v13.4S, v24.4S, v31.s[0] +sqrdmulh v24.4S, v20.4S, v16.s[1] +sub v28.4s, v29.4s, v3.4s +ldr q17, [x0, #864] +add v29.4s, v29.4s, v3.4s +mla v12.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v8.4S, v27.s[2] +sub v3.4s, v18.4s, v2.4s +ldr q4, [x17, #+512] +str q3, [x0, #560] +mla v23.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v11.4S, v27.s[1] +add v18.4s, v18.4s, v2.4s +ldr q2, [x17, #+528] +str q18, [x0, #544] +mul v5.4S, v5.4S,v9.s[2] +mul v20.4S, v20.4S,v9.s[1] +sub v18.4s, v10.4s, v13.4s +ldr q3, [x17, #+544] +str q18, [x0, #528] +mla v5.4S, v25.4S, v31.s[0] +mla v20.4S, v24.4S, v31.s[0] +add v10.4s, v10.4s, v13.4s +ldr q13, [x17, #+560] +str q10, [x0, #512] +mul v8.4S, v8.4S,v26.s[2] +mul v11.4S, v11.4S,v26.s[1] +sub v16.4s, v30.4s, v12.4s +ldr q10, [x0, #944] +str q16, [x0, #624] +mla v8.4S, v19.4S, v31.s[0] +mla v11.4S, v1.4S, v31.s[0] +add v30.4s, v30.4s, v12.4s +ldr q12, [x0, #928] +str q30, [x0, #608] +sqrdmulh v27.4S, v7.4S, v2.s[0] +mul v7.4S, v7.4S,v4.s[0] +sub v30.4s, v6.4s, v23.4s +ldr q26, [x0, #1008] +str q30, [x0, #592] +sqrdmulh v30.4S, v21.4S, v2.s[0] +mul v21.4S, v21.4S,v4.s[0] +add v6.4s, v6.4s, v23.4s +ldr q23, [x0, #992] +str q6, [x0, #576] +sqrdmulh v6.4S, v15.4S, v13.s[0] +mul v15.4S, 
v15.4S,v3.s[0] +sub v1.4s, v22.4s, v5.4s +ldr q19, [x17, #+576] +str q1, [x0, #688] +sqrdmulh v1.4S, v17.4S, v13.s[0] +mul v17.4S, v17.4S,v3.s[0] +add v22.4s, v22.4s, v5.4s +ldr q5, [x17, #+592] +str q22, [x0, #672] +mla v7.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v10.4S, v5.s[0] +sub v22.4s, v14.4s, v20.4s +ldr q16, [x17, #+608] +str q22, [x0, #656] +mla v21.4S, v30.4S, v31.s[0] +sqrdmulh v30.4S, v12.4S, v5.s[0] +add v14.4s, v14.4s, v20.4s +ldr q20, [x17, #+624] +str q14, [x0, #640] +mla v15.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v26.4S, v20.s[0] +sub v14.4s, v28.4s, v8.4s +ldr q22, [x0, #784] +str q14, [x0, #752] +mla v17.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v23.4S, v20.s[0] +add v28.4s, v28.4s, v8.4s +ldr q8, [x0, #768] +str q28, [x0, #736] +mul v10.4S, v10.4S,v19.s[0] +mul v12.4S, v12.4S,v19.s[0] +sub v28.4s, v29.4s, v11.4s +ldr q14, [x0, #848] +str q28, [x0, #720] +mla v10.4S, v27.4S, v31.s[0] +mla v12.4S, v30.4S, v31.s[0] +add v29.4s, v29.4s, v11.4s +ldr q11, [x0, #832] +str q29, [x0, #704] +mul v26.4S, v26.4S,v16.s[0] +mul v23.4S, v23.4S,v16.s[0] +sub v29.4s, v22.4s, v7.4s +ldr q30, [x0, #912] +add v22.4s, v22.4s, v7.4s +mla v26.4S, v6.4S, v31.s[0] +mla v23.4S, v1.4S, v31.s[0] +sub v1.4s, v8.4s, v21.4s +ldr q6, [x0, #896] +add v8.4s, v8.4s, v21.4s +sqrdmulh v21.4S, v29.4S, v2.s[2] +mul v29.4S, v29.4S,v4.s[2] +sub v7.4s, v14.4s, v15.4s +ldr q27, [x0, #976] +add v14.4s, v14.4s, v15.4s +sqrdmulh v15.4S, v22.4S, v2.s[1] +mul v22.4S, v22.4S,v4.s[1] +sub v28.4s, v11.4s, v17.4s +ldr q9, [x0, #960] +add v11.4s, v11.4s, v17.4s +sqrdmulh v2.4S, v7.4S, v13.s[2] +mul v7.4S, v7.4S,v3.s[2] +sub v17.4s, v30.4s, v10.4s +add v30.4s, v30.4s, v10.4s +sqrdmulh v10.4S, v14.4S, v13.s[1] +mul v14.4S, v14.4S,v3.s[1] +sub v4.4s, v6.4s, v12.4s +add v6.4s, v6.4s, v12.4s +mla v29.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v17.4S, v5.s[2] +sub v13.4s, v27.4s, v26.4s +add v27.4s, v27.4s, v26.4s +mla v22.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v30.4S, v5.s[1] +sub v26.4s, v9.4s, v23.4s +add v9.4s, 
v9.4s, v23.4s +mla v7.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v13.4S, v20.s[2] +sub v23.4s, v1.4s, v29.4s +str q23, [x0, #816] +mla v14.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v27.4S, v20.s[1] +add v1.4s, v1.4s, v29.4s +str q1, [x0, #800] +mul v17.4S, v17.4S,v19.s[2] +mul v30.4S, v30.4S,v19.s[1] +sub v1.4s, v8.4s, v22.4s +str q1, [x0, #784] +mla v17.4S, v21.4S, v31.s[0] +mla v30.4S, v15.4S, v31.s[0] +add v8.4s, v8.4s, v22.4s +str q8, [x0, #768] +mul v13.4S, v13.4S,v16.s[2] +mul v27.4S, v27.4S,v16.s[1] +sub v5.4s, v28.4s, v7.4s +str q5, [x0, #880] +mla v13.4S, v2.4S, v31.s[0] +mla v27.4S, v10.4S, v31.s[0] +add v28.4s, v28.4s, v7.4s +str q28, [x0, #864] +sub v20.4s, v11.4s, v14.4s +str q20, [x0, #848] +add v11.4s, v11.4s, v14.4s +str q11, [x0, #832] +sub v11.4s, v4.4s, v17.4s +str q11, [x0, #944] +add v4.4s, v4.4s, v17.4s +str q4, [x0, #928] +sub v4.4s, v6.4s, v30.4s +str q4, [x0, #912] +add v6.4s, v6.4s, v30.4s +str q6, [x0, #896] +sub v6.4s, v26.4s, v13.4s +str q6, [x0, #1008] +add v26.4s, v26.4s, v13.4s +str q26, [x0, #992] +sub v26.4s, v9.4s, v27.4s +str q26, [x0, #976] +add v9.4s, v9.4s, v27.4s +str q9, [x0, #960] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1520 +// Instruction count: 1516 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_7.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_7.s new file mode 100644 index 0000000..e7dd93d --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_7.s @@ -0,0 +1,1550 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby 
granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global 
ntt_u32_incomplete_neon_asm_var_4_2_22_z4_7 +.global _ntt_u32_incomplete_neon_asm_var_4_2_22_z4_7 +ntt_u32_incomplete_neon_asm_var_4_2_22_z4_7: +_ntt_u32_incomplete_neon_asm_var_4_2_22_z4_7: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x0, #992] +sqrdmulh v27.4S, v28.4S, v29.s[0] +mul v28.4S, v28.4S,v30.s[0] +ldr q26, [x0, #928] +sqrdmulh v25.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +ldr q24, [x0, #864] +sqrdmulh v23.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +ldr q22, [x0, #800] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #736] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mla v28.4S, v27.4S, v31.s[0] +ldr q27, [x0, #672] +sqrdmulh v18.4S, v27.4S, v29.s[0] +mla v26.4S, v25.4S, v31.s[0] +ldr q25, [x0, #608] +sqrdmulh v17.4S, v25.4S, v29.s[0] +mla v24.4S, v23.4S, v31.s[0] +ldr q23, [x0, #544] +sqrdmulh v16.4S, v23.4S, v29.s[0] +mla v22.4S, v21.4S, v31.s[0] +ldr q21, [x0, #480] +mul v27.4S, v27.4S,v30.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q3, [x0, #416] +ldr q2, [x0, #352] +ldr q1, [x0, #288] +mla v27.4S, v18.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +ldr q19, [x0, #224] +ldr q18, [x0, #160] +mul v23.4S, v23.4S,v30.s[0] +mul v25.4S, v25.4S,v30.s[0] +ldr q0, [x0, #96] +ldr q15, [x0, #32] +mla v23.4S, v16.4S, v31.s[0] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +sqrdmulh v28.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v16.4s, v3.4s, v26.4s +add v3.4s, v3.4s, v26.4s +sqrdmulh v26.4S, v16.4S, v29.s[2] +mul v16.4S, 
v16.4S,v30.s[2] +sub v14.4s, v2.4s, v24.4s +add v2.4s, v2.4s, v24.4s +sqrdmulh v24.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v13.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v12.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +sqrdmulh v20.4S, v14.4S, v29.s[2] +mla v17.4S, v28.4S, v31.s[0] +sub v28.4s, v18.4s, v27.4s +add v18.4s, v18.4s, v27.4s +sqrdmulh v27.4S, v13.4S, v29.s[2] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v0.4s, v25.4s +add v0.4s, v0.4s, v25.4s +sqrdmulh v25.4S, v2.4S, v29.s[1] +mla v21.4S, v24.4S, v31.s[0] +sub v24.4s, v15.4s, v23.4s +sqrdmulh v11.4S, v1.4S, v29.s[1] +mla v3.4S, v22.4S, v31.s[0] +add v15.4s, v15.4s, v23.4s +ldr q23, [x17, #+32] +ldr q22, [x17, #+48] +mul v13.4S, v13.4S,v30.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v10.4s, v12.4s, v17.4s +add v12.4s, v12.4s, v17.4s +mla v13.4S, v27.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +sub v20.4s, v28.4s, v16.4s +add v28.4s, v28.4s, v16.4s +mul v1.4S, v1.4S,v30.s[1] +mul v2.4S, v2.4S,v30.s[1] +sub v16.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +mla v1.4S, v11.4S, v31.s[0] +mla v2.4S, v25.4S, v31.s[0] +sub v25.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v22.s[3] +mul v10.4S, v10.4S,v23.s[3] +sub v11.4s, v26.4s, v14.4s +add v26.4s, v26.4s, v14.4s +sqrdmulh v14.4S, v12.4S, v22.s[2] +mul v12.4S, v12.4S,v23.s[2] +sub v21.4s, v24.4s, v13.4s +add v24.4s, v24.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v22.s[1] +mul v16.4S, v16.4S,v23.s[1] +sub v27.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v17.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s +ldr q1, [x17, #+96] +ldr q9, [x17, #+112] +sqrdmulh v8.4S, v20.4S, v22.s[3] +mla v10.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v28.4S, v22.s[2] +mla v12.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v25.4S, v22.s[1] +mla v16.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v18.4S, v22.s[0] +mla 
v19.4S, v2.4S, v31.s[0] +nop +nop +ldr q2, [x17, #+64] +ldr q7, [x17, #+80] +mul v28.4S, v28.4S,v23.s[2] +mul v20.4S, v20.4S,v23.s[3] +sub v6.4s, v11.4s, v10.4s +add v11.4s, v11.4s, v10.4s +mla v28.4S, v3.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v26.4s, v12.4s +add v26.4s, v26.4s, v12.4s +mul v18.4S, v18.4S,v23.s[0] +mul v25.4S, v25.4S,v23.s[1] +sub v12.4s, v27.4s, v16.4s +add v27.4s, v27.4s, v16.4s +mla v18.4S, v13.4S, v31.s[0] +mla v25.4S, v14.4S, v31.s[0] +sub v14.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v9.s[3] +mul v6.4S, v6.4S,v1.s[3] +sub v13.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v11.4S, v9.s[2] +mul v11.4S, v11.4S,v1.s[2] +sub v16.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +sqrdmulh v28.4S, v8.4S, v9.s[1] +mul v8.4S, v8.4S,v1.s[1] +sub v3.4s, v17.4s, v25.4s +add v17.4s, v17.4s, v25.4s +sqrdmulh v25.4S, v26.4S, v9.s[0] +mul v26.4S, v26.4S,v1.s[0] +sub v10.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v7.s[3] +mla v6.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v27.4S, v7.s[2] +mla v11.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v14.4S, v7.s[1] +mla v8.4S, v28.4S, v31.s[0] +nop +nop +sqrdmulh v28.4S, v0.4S, v7.s[0] +mla v26.4S, v25.4S, v31.s[0] +nop +nop +mul v27.4S, v27.4S,v2.s[2] +mul v12.4S, v12.4S,v2.s[3] +sub v25.4s, v13.4s, v6.4s +str q25, [x0, #992] +mla v27.4S, v19.4S, v31.s[0] +mla v12.4S, v18.4S, v31.s[0] +add v13.4s, v13.4s, v6.4s +str q13, [x0, #928] +mul v0.4S, v0.4S,v2.s[0] +mul v14.4S, v14.4S,v2.s[1] +sub v13.4s, v21.4s, v11.4s +str q13, [x0, #864] +mla v0.4S, v28.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sub v11.4s, v16.4s, v8.4s +ldr q20, [x0, #1008] +sqrdmulh v28.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v8.4s +str q21, [x0, #800] +ldr q21, [x0, #944] +sqrdmulh v8.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v13.4s, v24.4s, v26.4s +str q11, [x0, #736] +ldr q11, [x0, 
#880] +sqrdmulh v6.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +add v24.4s, v24.4s, v26.4s +str q16, [x0, #672] +ldr q16, [x0, #816] +sqrdmulh v26.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v18.4s, v3.4s, v12.4s +str q13, [x0, #608] +ldr q13, [x0, #752] +sqrdmulh v19.4S, v13.4S, v29.s[0] +mla v20.4S, v28.4S, v31.s[0] +add v3.4s, v3.4s, v12.4s +str q24, [x0, #544] +ldr q24, [x0, #688] +sqrdmulh v12.4S, v24.4S, v29.s[0] +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v17.4s, v27.4s +str q18, [x0, #480] +ldr q18, [x0, #624] +sqrdmulh v28.4S, v18.4S, v29.s[0] +mla v11.4S, v6.4S, v31.s[0] +add v17.4s, v17.4s, v27.4s +str q3, [x0, #416] +ldr q3, [x0, #560] +sqrdmulh v27.4S, v3.4S, v29.s[0] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v10.4s, v14.4s +str q8, [x0, #352] +ldr q8, [x0, #496] +add v10.4s, v10.4s, v14.4s +mul v24.4S, v24.4S,v30.s[0] +mul v13.4S, v13.4S,v30.s[0] +ldr q14, [x0, #432] +str q17, [x0, #288] +ldr q17, [x0, #368] +ldr q6, [x0, #304] +mla v24.4S, v12.4S, v31.s[0] +mla v13.4S, v19.4S, v31.s[0] +str q26, [x0, #224] +sub v26.4s, v15.4s, v0.4s +ldr q19, [x0, #240] +ldr q12, [x0, #176] +mul v3.4S, v3.4S,v30.s[0] +mul v18.4S, v18.4S,v30.s[0] +str q10, [x0, #160] +add v15.4s, v15.4s, v0.4s +ldr q0, [x0, #112] +ldr q10, [x0, #48] +mla v3.4S, v27.4S, v31.s[0] +mla v18.4S, v28.4S, v31.s[0] +sub v28.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +sqrdmulh v20.4S, v28.4S, v29.s[2] +mul v28.4S, v28.4S,v30.s[2] +sub v27.4s, v14.4s, v21.4s +add v14.4s, v14.4s, v21.4s +sqrdmulh v21.4S, v27.4S, v29.s[2] +mul v27.4S, v27.4S,v30.s[2] +sub v25.4s, v17.4s, v11.4s +add v17.4s, v17.4s, v11.4s +sqrdmulh v11.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v5.4s, v6.4s, v16.4s +add v6.4s, v6.4s, v16.4s +sqrdmulh v16.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +sub v4.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +sqrdmulh v13.4S, v25.4S, v29.s[2] +mla v28.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v24.4s +add v12.4s, v12.4s, v24.4s +sqrdmulh v24.4S, v5.4S, 
v29.s[2] +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v18.4s +add v0.4s, v0.4s, v18.4s +sqrdmulh v18.4S, v17.4S, v29.s[1] +mla v8.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v3.4s +str q26, [x0, #96] +sqrdmulh v26.4S, v6.4S, v29.s[1] +mla v14.4S, v16.4S, v31.s[0] +add v10.4s, v10.4s, v3.4s +str q15, [x0, #32] +mul v5.4S, v5.4S,v30.s[2] +mul v25.4S, v25.4S,v30.s[2] +sub v15.4s, v4.4s, v28.4s +add v4.4s, v4.4s, v28.4s +mla v5.4S, v24.4S, v31.s[0] +mla v25.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v27.4s +add v20.4s, v20.4s, v27.4s +mul v6.4S, v6.4S,v30.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v27.4s, v19.4s, v8.4s +add v19.4s, v19.4s, v8.4s +mla v6.4S, v26.4S, v31.s[0] +mla v17.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v14.4s +add v12.4s, v12.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v22.s[3] +mul v15.4S, v15.4S,v23.s[3] +sub v26.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v4.4S, v22.s[2] +mul v4.4S, v4.4S,v23.s[2] +sub v8.4s, v11.4s, v5.4s +add v11.4s, v11.4s, v5.4s +sqrdmulh v5.4S, v27.4S, v22.s[1] +mul v27.4S, v27.4S,v23.s[1] +sub v24.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v28.4s, v10.4s, v6.4s +add v10.4s, v10.4s, v6.4s +sqrdmulh v6.4S, v13.4S, v22.s[3] +mla v15.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v20.4S, v22.s[2] +mla v4.4S, v25.4S, v31.s[0] +nop +nop +sqrdmulh v25.4S, v18.4S, v22.s[1] +mla v27.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v12.4S, v22.s[0] +mla v19.4S, v17.4S, v31.s[0] +nop +nop +mul v20.4S, v20.4S,v23.s[2] +mul v13.4S, v13.4S,v23.s[3] +sub v17.4s, v26.4s, v15.4s +add v26.4s, v26.4s, v15.4s +mla v20.4S, v14.4S, v31.s[0] +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v21.4s, v4.4s +add v21.4s, v21.4s, v4.4s +mul v12.4S, v12.4S,v23.s[0] +mul v18.4S, v18.4S,v23.s[1] +sub v4.4s, v24.4s, v27.4s +add v24.4s, v24.4s, v27.4s +mla v12.4S, v5.4S, v31.s[0] +mla v18.4S, v25.4S, v31.s[0] +sub v25.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, 
v17.4S, v9.s[3] +mul v17.4S, v17.4S,v1.s[3] +sub v5.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +sqrdmulh v13.4S, v26.4S, v9.s[2] +mul v26.4S, v26.4S,v1.s[2] +sub v27.4s, v11.4s, v20.4s +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v6.4S, v9.s[1] +mul v6.4S, v6.4S,v1.s[1] +sub v14.4s, v28.4s, v18.4s +add v28.4s, v28.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v9.s[0] +mul v21.4S, v21.4S,v1.s[0] +sub v15.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v4.4S, v7.s[3] +mla v17.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v24.4S, v7.s[2] +mla v26.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v25.4S, v7.s[1] +mla v6.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v0.4S, v7.s[0] +mla v21.4S, v18.4S, v31.s[0] +nop +nop +mul v24.4S, v24.4S,v2.s[2] +mul v4.4S, v4.4S,v2.s[3] +sub v18.4s, v5.4s, v17.4s +str q18, [x0, #1008] +mla v24.4S, v19.4S, v31.s[0] +mla v4.4S, v12.4S, v31.s[0] +add v5.4s, v5.4s, v17.4s +str q5, [x0, #944] +mul v0.4S, v0.4S,v2.s[0] +mul v25.4S, v25.4S,v2.s[1] +sub v5.4s, v8.4s, v26.4s +str q5, [x0, #880] +mla v0.4S, v20.4S, v31.s[0] +mla v25.4S, v13.4S, v31.s[0] +add v8.4s, v8.4s, v26.4s +sub v26.4s, v27.4s, v6.4s +ldr q13, [x0, #960] +sqrdmulh v20.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +add v27.4s, v27.4s, v6.4s +str q8, [x0, #816] +ldr q8, [x0, #896] +sqrdmulh v6.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v5.4s, v11.4s, v21.4s +str q26, [x0, #752] +ldr q26, [x0, #832] +sqrdmulh v17.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +add v11.4s, v11.4s, v21.4s +str q27, [x0, #688] +ldr q27, [x0, #768] +sqrdmulh v21.4S, v27.4S, v29.s[0] +mul v27.4S, v27.4S,v30.s[0] +sub v12.4s, v14.4s, v4.4s +str q5, [x0, #624] +ldr q5, [x0, #704] +sqrdmulh v19.4S, v5.4S, v29.s[0] +mla v13.4S, v20.4S, v31.s[0] +add v14.4s, v14.4s, v4.4s +str q11, [x0, #560] +ldr q11, [x0, #640] +sqrdmulh v4.4S, v11.4S, v29.s[0] +mla v8.4S, v6.4S, v31.s[0] +sub v6.4s, v28.4s, v24.4s +str q12, [x0, #496] +ldr q12, [x0, #576] +sqrdmulh v20.4S, v12.4S, 
v29.s[0] +mla v26.4S, v17.4S, v31.s[0] +add v28.4s, v28.4s, v24.4s +str q14, [x0, #432] +ldr q14, [x0, #512] +sqrdmulh v24.4S, v14.4S, v29.s[0] +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v25.4s +str q6, [x0, #368] +ldr q6, [x0, #448] +add v15.4s, v15.4s, v25.4s +mul v11.4S, v11.4S,v30.s[0] +mul v5.4S, v5.4S,v30.s[0] +ldr q25, [x0, #384] +str q28, [x0, #304] +ldr q28, [x0, #320] +ldr q17, [x0, #256] +mla v11.4S, v4.4S, v31.s[0] +mla v5.4S, v19.4S, v31.s[0] +str q21, [x0, #240] +sub v21.4s, v10.4s, v0.4s +ldr q19, [x0, #192] +ldr q4, [x0, #128] +mul v14.4S, v14.4S,v30.s[0] +mul v12.4S, v12.4S,v30.s[0] +str q15, [x0, #176] +add v10.4s, v10.4s, v0.4s +ldr q0, [x0, #64] +ldr q15, [x0, #0] +mla v14.4S, v24.4S, v31.s[0] +mla v12.4S, v20.4S, v31.s[0] +sub v20.4s, v6.4s, v13.4s +add v6.4s, v6.4s, v13.4s +sqrdmulh v13.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v24.4s, v25.4s, v8.4s +add v25.4s, v25.4s, v8.4s +sqrdmulh v8.4S, v24.4S, v29.s[2] +mul v24.4S, v24.4S,v30.s[2] +sub v18.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sqrdmulh v26.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +sub v3.4s, v17.4s, v27.4s +add v17.4s, v17.4s, v27.4s +sqrdmulh v27.4S, v25.4S, v29.s[1] +mul v25.4S, v25.4S,v30.s[1] +sub v16.4s, v19.4s, v5.4s +add v19.4s, v19.4s, v5.4s +sqrdmulh v5.4S, v18.4S, v29.s[2] +mla v20.4S, v13.4S, v31.s[0] +sub v13.4s, v4.4s, v11.4s +add v4.4s, v4.4s, v11.4s +sqrdmulh v11.4S, v3.4S, v29.s[2] +mla v24.4S, v8.4S, v31.s[0] +sub v8.4s, v0.4s, v12.4s +add v0.4s, v0.4s, v12.4s +sqrdmulh v12.4S, v28.4S, v29.s[1] +mla v6.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v14.4s +str q21, [x0, #112] +sqrdmulh v21.4S, v17.4S, v29.s[1] +mla v25.4S, v27.4S, v31.s[0] +add v15.4s, v15.4s, v14.4s +str q10, [x0, #48] +mul v3.4S, v3.4S,v30.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v10.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +mla v3.4S, v11.4S, v31.s[0] +mla v18.4S, v5.4S, v31.s[0] +sub v5.4s, v13.4s, v24.4s +add v13.4s, v13.4s, v24.4s +mul v17.4S, v17.4S,v30.s[1] 
+mul v28.4S, v28.4S,v30.s[1] +sub v24.4s, v19.4s, v6.4s +add v19.4s, v19.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v28.4S, v12.4S, v31.s[0] +sub v12.4s, v4.4s, v25.4s +add v4.4s, v4.4s, v25.4s +sqrdmulh v25.4S, v10.4S, v22.s[3] +mul v10.4S, v10.4S,v23.s[3] +sub v21.4s, v8.4s, v18.4s +add v8.4s, v8.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v22.s[2] +mul v16.4S, v16.4S,v23.s[2] +sub v6.4s, v26.4s, v3.4s +add v26.4s, v26.4s, v3.4s +sqrdmulh v3.4S, v24.4S, v22.s[1] +mul v24.4S, v24.4S,v23.s[1] +sub v11.4s, v0.4s, v28.4s +add v0.4s, v0.4s, v28.4s +sqrdmulh v28.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v20.4s, v15.4s, v17.4s +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v22.s[3] +mla v10.4S, v25.4S, v31.s[0] +nop +nop +sqrdmulh v25.4S, v13.4S, v22.s[2] +mla v16.4S, v18.4S, v31.s[0] +nop +nop +sqrdmulh v18.4S, v12.4S, v22.s[1] +mla v24.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v4.4S, v22.s[0] +mla v19.4S, v28.4S, v31.s[0] +nop +nop +mul v13.4S, v13.4S,v23.s[2] +mul v5.4S, v5.4S,v23.s[3] +sub v28.4s, v21.4s, v10.4s +add v21.4s, v21.4s, v10.4s +mla v13.4S, v25.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +sub v17.4s, v8.4s, v16.4s +add v8.4s, v8.4s, v16.4s +mul v4.4S, v4.4S,v23.s[0] +mul v12.4S, v12.4S,v23.s[1] +sub v16.4s, v11.4s, v24.4s +add v11.4s, v11.4s, v24.4s +mla v4.4S, v3.4S, v31.s[0] +mla v12.4S, v18.4S, v31.s[0] +sub v18.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v28.4S, v9.s[3] +mul v28.4S, v28.4S,v1.s[3] +sub v3.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v21.4S, v9.s[2] +mul v21.4S, v21.4S,v1.s[2] +sub v24.4s, v26.4s, v13.4s +add v26.4s, v26.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v9.s[1] +mul v17.4S, v17.4S,v1.s[1] +sub v25.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +sqrdmulh v12.4S, v8.4S, v9.s[0] +mul v8.4S, v8.4S,v1.s[0] +sub v10.4s, v15.4s, v4.4s +add v15.4s, v15.4s, v4.4s +sqrdmulh v4.4S, v16.4S, v7.s[3] +mla v28.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v11.4S, v7.s[2] +mla v21.4S, v5.4S, 
v31.s[0] +nop +nop +sqrdmulh v5.4S, v18.4S, v7.s[1] +mla v17.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v0.4S, v7.s[0] +mla v8.4S, v12.4S, v31.s[0] +nop +nop +mul v11.4S, v11.4S,v2.s[2] +mul v16.4S, v16.4S,v2.s[3] +sub v12.4s, v3.4s, v28.4s +str q12, [x0, #960] +mla v11.4S, v19.4S, v31.s[0] +mla v16.4S, v4.4S, v31.s[0] +add v3.4s, v3.4s, v28.4s +str q3, [x0, #896] +mul v0.4S, v0.4S,v2.s[0] +mul v18.4S, v18.4S,v2.s[1] +sub v3.4s, v6.4s, v21.4s +str q3, [x0, #832] +mla v0.4S, v13.4S, v31.s[0] +mla v18.4S, v5.4S, v31.s[0] +add v6.4s, v6.4s, v21.4s +sub v21.4s, v24.4s, v17.4s +ldr q5, [x0, #976] +sqrdmulh v13.4S, v5.4S, v29.s[0] +mul v5.4S, v5.4S,v30.s[0] +add v24.4s, v24.4s, v17.4s +str q6, [x0, #768] +ldr q6, [x0, #912] +sqrdmulh v17.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v3.4s, v26.4s, v8.4s +str q21, [x0, #704] +ldr q21, [x0, #848] +sqrdmulh v28.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +add v26.4s, v26.4s, v8.4s +str q24, [x0, #640] +ldr q24, [x0, #784] +sqrdmulh v8.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +sub v4.4s, v25.4s, v16.4s +str q3, [x0, #576] +ldr q3, [x0, #720] +sqrdmulh v19.4S, v3.4S, v29.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v25.4s, v25.4s, v16.4s +str q26, [x0, #512] +ldr q26, [x0, #656] +sqrdmulh v16.4S, v26.4S, v29.s[0] +mla v6.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v11.4s +str q4, [x0, #448] +ldr q4, [x0, #592] +sqrdmulh v13.4S, v4.4S, v29.s[0] +mla v21.4S, v28.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q25, [x0, #384] +ldr q25, [x0, #528] +sqrdmulh v11.4S, v25.4S, v29.s[0] +mla v24.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v18.4s +str q17, [x0, #320] +ldr q17, [x0, #464] +add v10.4s, v10.4s, v18.4s +mul v26.4S, v26.4S,v30.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q18, [x0, #400] +str q20, [x0, #256] +ldr q20, [x0, #336] +ldr q28, [x0, #272] +mla v26.4S, v16.4S, v31.s[0] +mla v3.4S, v19.4S, v31.s[0] +str q8, [x0, #192] +sub v8.4s, v15.4s, v0.4s +ldr q19, [x0, #208] +ldr q16, [x0, #144] +mul v25.4S, 
v25.4S,v30.s[0] +mul v4.4S, v4.4S,v30.s[0] +str q10, [x0, #128] +add v15.4s, v15.4s, v0.4s +ldr q0, [x0, #80] +ldr q10, [x0, #16] +mla v25.4S, v11.4S, v31.s[0] +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v5.4s +add v17.4s, v17.4s, v5.4s +sqrdmulh v5.4S, v13.4S, v29.s[2] +mul v13.4S, v13.4S,v30.s[2] +sub v11.4s, v18.4s, v6.4s +add v18.4s, v18.4s, v6.4s +sqrdmulh v6.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v12.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v28.4s, v24.4s +add v28.4s, v28.4s, v24.4s +sqrdmulh v24.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v27.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sqrdmulh v3.4S, v12.4S, v29.s[2] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v16.4s, v26.4s +add v16.4s, v16.4s, v26.4s +sqrdmulh v26.4S, v14.4S, v29.s[2] +mla v11.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v4.4s +add v0.4s, v0.4s, v4.4s +sqrdmulh v4.4S, v20.4S, v29.s[1] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v25.4s +str q8, [x0, #64] +sqrdmulh v8.4S, v28.4S, v29.s[1] +mla v18.4S, v24.4S, v31.s[0] +add v10.4s, v10.4s, v25.4s +str q15, [x0, #0] +mul v14.4S, v14.4S,v30.s[2] +mul v12.4S, v12.4S,v30.s[2] +sub v15.4s, v27.4s, v13.4s +add v27.4s, v27.4s, v13.4s +mla v14.4S, v26.4S, v31.s[0] +mla v12.4S, v3.4S, v31.s[0] +sub v3.4s, v5.4s, v11.4s +add v5.4s, v5.4s, v11.4s +mul v28.4S, v28.4S,v30.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v11.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +mla v28.4S, v8.4S, v31.s[0] +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v16.4s, v18.4s +add v16.4s, v16.4s, v18.4s +sqrdmulh v29.4S, v15.4S, v22.s[3] +mul v15.4S, v15.4S,v23.s[3] +sub v30.4s, v6.4s, v12.4s +add v6.4s, v6.4s, v12.4s +sqrdmulh v12.4S, v27.4S, v22.s[2] +mul v27.4S, v27.4S,v23.s[2] +sub v18.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v22.s[1] +mul v11.4S, v11.4S,v23.s[1] +sub v8.4s, v0.4s, v20.4s +add v0.4s, v0.4s, v20.4s +sqrdmulh v20.4S, 
v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v17.4s, v10.4s, v28.4s +add v10.4s, v10.4s, v28.4s +sqrdmulh v28.4S, v3.4S, v22.s[3] +mla v15.4S, v29.4S, v31.s[0] +nop +nop +sqrdmulh v29.4S, v5.4S, v22.s[2] +mla v27.4S, v12.4S, v31.s[0] +nop +nop +sqrdmulh v12.4S, v4.4S, v22.s[1] +mla v11.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v16.4S, v22.s[0] +mla v19.4S, v20.4S, v31.s[0] +nop +nop +mul v5.4S, v5.4S,v23.s[2] +mul v3.4S, v3.4S,v23.s[3] +sub v20.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +mla v5.4S, v29.4S, v31.s[0] +mla v3.4S, v28.4S, v31.s[0] +sub v28.4s, v6.4s, v27.4s +add v6.4s, v6.4s, v27.4s +mul v16.4S, v16.4S,v23.s[0] +mul v4.4S, v4.4S,v23.s[1] +sub v27.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v16.4S, v14.4S, v31.s[0] +mla v4.4S, v12.4S, v31.s[0] +sub v12.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v22.4S, v20.4S, v9.s[3] +mul v20.4S, v20.4S,v1.s[3] +sub v23.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v30.4S, v9.s[2] +mul v30.4S, v30.4S,v1.s[2] +sub v19.4s, v21.4s, v5.4s +add v21.4s, v21.4s, v5.4s +sqrdmulh v5.4S, v28.4S, v9.s[1] +mul v28.4S, v28.4S,v1.s[1] +sub v14.4s, v17.4s, v4.4s +add v17.4s, v17.4s, v4.4s +sqrdmulh v4.4S, v6.4S, v9.s[0] +mul v6.4S, v6.4S,v1.s[0] +sub v11.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sqrdmulh v9.4S, v27.4S, v7.s[3] +mla v20.4S, v22.4S, v31.s[0] +nop +nop +sqrdmulh v22.4S, v8.4S, v7.s[2] +mla v30.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v12.4S, v7.s[1] +mla v28.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v0.4S, v7.s[0] +mla v6.4S, v4.4S, v31.s[0] +nop +nop +mul v8.4S, v8.4S,v2.s[2] +mul v27.4S, v27.4S,v2.s[3] +sub v4.4s, v23.4s, v20.4s +str q4, [x0, #976] +mla v8.4S, v22.4S, v31.s[0] +mla v27.4S, v9.4S, v31.s[0] +add v23.4s, v23.4s, v20.4s +str q23, [x0, #912] +mul v0.4S, v0.4S,v2.s[0] +mul v12.4S, v12.4S,v2.s[1] +sub v23.4s, v18.4s, v30.4s +str q23, [x0, #848] +mla v0.4S, v5.4S, v31.s[0] +mla v12.4S, v3.4S, v31.s[0] +add v18.4s, v18.4s, v30.4s +sub v30.4s, 
v19.4s, v28.4s +add v19.4s, v19.4s, v28.4s +str q18, [x0, #784] +sub v18.4s, v21.4s, v6.4s +str q30, [x0, #720] +add v21.4s, v21.4s, v6.4s +str q19, [x0, #656] +sub v19.4s, v14.4s, v27.4s +str q18, [x0, #592] +add v14.4s, v14.4s, v27.4s +str q21, [x0, #528] +sub v21.4s, v17.4s, v8.4s +str q19, [x0, #464] +add v17.4s, v17.4s, v8.4s +str q14, [x0, #400] +sub v14.4s, v11.4s, v12.4s +str q21, [x0, #336] +add v11.4s, v11.4s, v12.4s +str q17, [x0, #272] +sub v17.4s, v10.4s, v0.4s +add v10.4s, v10.4s, v0.4s +ldr q24, [x0, #224] +ldr q25, [x0, #160] +ldr q13, [x0, #32] +ldr q26, [x17, #+128] +ldr q15, [x17, #+144] +sqrdmulh v29.4S, v13.4S, v15.s[0] +mul v13.4S, v13.4S,v26.s[0] +ldr q16, [x0, #48] +sqrdmulh v1.4S, v16.4S, v15.s[0] +mul v16.4S, v16.4S,v26.s[0] +ldr q4, [x17, #+160] +ldr q22, [x17, #+176] +ldr q9, [x0, #96] +sqrdmulh v20.4S, v9.4S, v22.s[0] +mul v9.4S, v9.4S,v4.s[0] +ldr q23, [x0, #112] +sqrdmulh v5.4S, v23.4S, v22.s[0] +mul v23.4S, v23.4S,v4.s[0] +ldr q3, [x17, #+192] +ldr q2, [x17, #+208] +mla v13.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v25.4S, v2.s[0] +ldr q7, [x0, #176] +mla v16.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v7.4S, v2.s[0] +ldr q28, [x17, #+224] +ldr q30, [x17, #+240] +mla v9.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v24.4S, v30.s[0] +ldr q6, [x0, #240] +mla v23.4S, v5.4S, v31.s[0] +sqrdmulh v5.4S, v6.4S, v30.s[0] +ldr q18, [x0, #0] +ldr q27, [x0, #128] +mul v25.4S, v25.4S,v3.s[0] +sub v19.4s, v18.4s, v13.4s +mul v7.4S, v7.4S,v3.s[0] +add v18.4s, v18.4s, v13.4s +mla v25.4S, v29.4S, v31.s[0] +sub v29.4s, v10.4s, v16.4s +ldr q13, [x0, #64] +mla v7.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v16.4s +ldr q16, [x0, #192] +mul v24.4S, v24.4S,v28.s[0] +sub v1.4s, v13.4s, v9.4s +mul v6.4S, v6.4S,v28.s[0] +add v13.4s, v13.4s, v9.4s +mla v24.4S, v20.4S, v31.s[0] +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v17.4s, v23.4s +sqrdmulh v20.4S, v10.4S, v15.s[1] +add v17.4s, v17.4s, v23.4s +mul v10.4S, v10.4S,v26.s[1] +sqrdmulh v23.4S, v29.4S, v15.s[2] +sub v9.4s, v27.4s, 
v25.4s +mul v29.4S, v29.4S,v26.s[2] +add v27.4s, v27.4s, v25.4s +sqrdmulh v15.4S, v17.4S, v22.s[1] +sub v26.4s, v11.4s, v7.4s +mul v17.4S, v17.4S,v4.s[1] +add v11.4s, v11.4s, v7.4s +sqrdmulh v7.4S, v5.4S, v22.s[2] +sub v25.4s, v16.4s, v24.4s +mul v5.4S, v5.4S,v4.s[2] +add v16.4s, v16.4s, v24.4s +mla v10.4S, v20.4S, v31.s[0] +sub v20.4s, v14.4s, v6.4s +ldr q22, [x0, #480] +sqrdmulh v4.4S, v11.4S, v2.s[1] +add v14.4s, v14.4s, v6.4s +mla v29.4S, v23.4S, v31.s[0] +ldr q23, [x0, #416] +sqrdmulh v6.4S, v26.4S, v2.s[2] +sub v24.4s, v18.4s, v10.4s +mla v17.4S, v15.4S, v31.s[0] +ldr q15, [x0, #288] +sqrdmulh v8.4S, v14.4S, v30.s[1] +add v18.4s, v18.4s, v10.4s +str q24, [x0, #16] +mla v5.4S, v7.4S, v31.s[0] +ldr q7, [x17, #+256] +ldr q24, [x17, #+272] +sqrdmulh v10.4S, v20.4S, v30.s[2] +sub v21.4s, v19.4s, v29.4s +str q18, [x0, #0] +mul v11.4S, v11.4S,v3.s[1] +add v19.4s, v19.4s, v29.4s +mul v26.4S, v26.4S,v3.s[2] +str q21, [x0, #48] +mla v11.4S, v4.4S, v31.s[0] +sub v4.4s, v13.4s, v17.4s +mla v26.4S, v6.4S, v31.s[0] +str q19, [x0, #32] +mul v14.4S, v14.4S,v28.s[1] +str q4, [x0, #80] +mul v20.4S, v20.4S,v28.s[2] +add v13.4s, v13.4s, v17.4s +str q13, [x0, #64] +mla v14.4S, v8.4S, v31.s[0] +sub v8.4s, v1.4s, v5.4s +str q8, [x0, #112] +mla v20.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v5.4s +str q1, [x0, #96] +sqrdmulh v30.4S, v15.4S, v24.s[0] +sub v28.4s, v27.4s, v11.4s +mul v15.4S, v15.4S,v7.s[0] +str q28, [x0, #144] +ldr q28, [x0, #304] +sqrdmulh v1.4S, v28.4S, v24.s[0] +add v27.4s, v27.4s, v11.4s +mul v28.4S, v28.4S,v7.s[0] +str q27, [x0, #128] +ldr q27, [x17, #+288] +ldr q11, [x17, #+304] +ldr q5, [x0, #352] +sqrdmulh v10.4S, v5.4S, v11.s[0] +sub v8.4s, v9.4s, v26.4s +mul v5.4S, v5.4S,v27.s[0] +str q8, [x0, #176] +ldr q8, [x0, #368] +sqrdmulh v13.4S, v8.4S, v11.s[0] +add v9.4s, v9.4s, v26.4s +mul v8.4S, v8.4S,v27.s[0] +str q9, [x0, #160] +ldr q9, [x17, #+320] +ldr q26, [x17, #+336] +mla v15.4S, v30.4S, v31.s[0] +sub v30.4s, v16.4s, v14.4s +sqrdmulh v17.4S, v23.4S, v26.s[0] 
+str q30, [x0, #208] +ldr q30, [x0, #432] +mla v28.4S, v1.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v30.4S, v26.s[0] +str q16, [x0, #192] +ldr q16, [x17, #+352] +ldr q1, [x17, #+368] +mla v5.4S, v10.4S, v31.s[0] +sub v10.4s, v25.4s, v20.4s +sqrdmulh v4.4S, v22.4S, v1.s[0] +str q10, [x0, #240] +ldr q10, [x0, #496] +mla v8.4S, v13.4S, v31.s[0] +add v25.4s, v25.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v1.s[0] +str q25, [x0, #224] +ldr q25, [x0, #256] +ldr q13, [x0, #384] +mul v23.4S, v23.4S,v9.s[0] +sub v2.4s, v25.4s, v15.4s +ldr q3, [x0, #272] +mul v30.4S, v30.4S,v9.s[0] +add v25.4s, v25.4s, v15.4s +ldr q15, [x0, #400] +mla v23.4S, v17.4S, v31.s[0] +sub v17.4s, v3.4s, v28.4s +ldr q19, [x0, #320] +mla v30.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v28.4s +ldr q28, [x0, #448] +mul v22.4S, v22.4S,v16.s[0] +sub v14.4s, v19.4s, v5.4s +ldr q6, [x0, #336] +mul v10.4S, v10.4S,v16.s[0] +add v19.4s, v19.4s, v5.4s +ldr q5, [x0, #464] +mla v22.4S, v4.4S, v31.s[0] +mla v10.4S, v20.4S, v31.s[0] +sub v20.4s, v6.4s, v8.4s +sqrdmulh v4.4S, v3.4S, v24.s[1] +add v6.4s, v6.4s, v8.4s +mul v3.4S, v3.4S,v7.s[1] +sqrdmulh v8.4S, v17.4S, v24.s[2] +sub v21.4s, v13.4s, v23.4s +mul v17.4S, v17.4S,v7.s[2] +add v13.4s, v13.4s, v23.4s +sqrdmulh v24.4S, v6.4S, v11.s[1] +sub v7.4s, v15.4s, v30.4s +mul v6.4S, v6.4S,v27.s[1] +add v15.4s, v15.4s, v30.4s +sqrdmulh v30.4S, v20.4S, v11.s[2] +sub v23.4s, v28.4s, v22.4s +mul v20.4S, v20.4S,v27.s[2] +add v28.4s, v28.4s, v22.4s +mla v3.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v10.4s +ldr q11, [x0, #736] +sqrdmulh v27.4S, v15.4S, v26.s[1] +add v5.4s, v5.4s, v10.4s +mla v17.4S, v8.4S, v31.s[0] +ldr q8, [x0, #672] +sqrdmulh v10.4S, v7.4S, v26.s[2] +sub v22.4s, v25.4s, v3.4s +mla v6.4S, v24.4S, v31.s[0] +ldr q24, [x0, #544] +sqrdmulh v29.4S, v5.4S, v1.s[1] +add v25.4s, v25.4s, v3.4s +str q22, [x0, #272] +mla v20.4S, v30.4S, v31.s[0] +ldr q30, [x17, #+384] +ldr q22, [x17, #+400] +sqrdmulh v3.4S, v4.4S, v1.s[2] +sub v18.4s, v2.4s, v17.4s +str q25, [x0, 
#256] +mul v15.4S, v15.4S,v9.s[1] +add v2.4s, v2.4s, v17.4s +mul v7.4S, v7.4S,v9.s[2] +str q18, [x0, #304] +mla v15.4S, v27.4S, v31.s[0] +sub v27.4s, v19.4s, v6.4s +mla v7.4S, v10.4S, v31.s[0] +str q2, [x0, #288] +mul v5.4S, v5.4S,v16.s[1] +str q27, [x0, #336] +mul v4.4S, v4.4S,v16.s[2] +add v19.4s, v19.4s, v6.4s +str q19, [x0, #320] +mla v5.4S, v29.4S, v31.s[0] +sub v29.4s, v14.4s, v20.4s +str q29, [x0, #368] +mla v4.4S, v3.4S, v31.s[0] +add v14.4s, v14.4s, v20.4s +str q14, [x0, #352] +sqrdmulh v1.4S, v24.4S, v22.s[0] +sub v16.4s, v13.4s, v15.4s +mul v24.4S, v24.4S,v30.s[0] +str q16, [x0, #400] +ldr q16, [x0, #560] +sqrdmulh v14.4S, v16.4S, v22.s[0] +add v13.4s, v13.4s, v15.4s +mul v16.4S, v16.4S,v30.s[0] +str q13, [x0, #384] +ldr q13, [x17, #+416] +ldr q15, [x17, #+432] +ldr q20, [x0, #608] +sqrdmulh v3.4S, v20.4S, v15.s[0] +sub v29.4s, v21.4s, v7.4s +mul v20.4S, v20.4S,v13.s[0] +str q29, [x0, #432] +ldr q29, [x0, #624] +sqrdmulh v19.4S, v29.4S, v15.s[0] +add v21.4s, v21.4s, v7.4s +mul v29.4S, v29.4S,v13.s[0] +str q21, [x0, #416] +ldr q21, [x17, #+448] +ldr q7, [x17, #+464] +mla v24.4S, v1.4S, v31.s[0] +sub v1.4s, v28.4s, v5.4s +sqrdmulh v6.4S, v8.4S, v7.s[0] +str q1, [x0, #464] +ldr q1, [x0, #688] +mla v16.4S, v14.4S, v31.s[0] +add v28.4s, v28.4s, v5.4s +sqrdmulh v5.4S, v1.4S, v7.s[0] +str q28, [x0, #448] +ldr q28, [x17, #+480] +ldr q14, [x17, #+496] +mla v20.4S, v3.4S, v31.s[0] +sub v3.4s, v23.4s, v4.4s +sqrdmulh v27.4S, v11.4S, v14.s[0] +str q3, [x0, #496] +ldr q3, [x0, #752] +mla v29.4S, v19.4S, v31.s[0] +add v23.4s, v23.4s, v4.4s +sqrdmulh v4.4S, v3.4S, v14.s[0] +str q23, [x0, #480] +ldr q23, [x0, #512] +ldr q19, [x0, #640] +mul v8.4S, v8.4S,v21.s[0] +sub v26.4s, v23.4s, v24.4s +ldr q9, [x0, #528] +mul v1.4S, v1.4S,v21.s[0] +add v23.4s, v23.4s, v24.4s +ldr q24, [x0, #656] +mla v8.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v16.4s +ldr q2, [x0, #576] +mla v1.4S, v5.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +ldr q16, [x0, #704] +mul v11.4S, v11.4S,v28.s[0] +sub v5.4s, 
v2.4s, v20.4s +ldr q10, [x0, #592] +mul v3.4S, v3.4S,v28.s[0] +add v2.4s, v2.4s, v20.4s +ldr q20, [x0, #720] +mla v11.4S, v27.4S, v31.s[0] +mla v3.4S, v4.4S, v31.s[0] +sub v4.4s, v10.4s, v29.4s +sqrdmulh v27.4S, v9.4S, v22.s[1] +add v10.4s, v10.4s, v29.4s +mul v9.4S, v9.4S,v30.s[1] +sqrdmulh v29.4S, v6.4S, v22.s[2] +sub v18.4s, v19.4s, v8.4s +mul v6.4S, v6.4S,v30.s[2] +add v19.4s, v19.4s, v8.4s +sqrdmulh v22.4S, v10.4S, v15.s[1] +sub v30.4s, v24.4s, v1.4s +mul v10.4S, v10.4S,v13.s[1] +add v24.4s, v24.4s, v1.4s +sqrdmulh v1.4S, v4.4S, v15.s[2] +sub v8.4s, v16.4s, v11.4s +mul v4.4S, v4.4S,v13.s[2] +add v16.4s, v16.4s, v11.4s +mla v9.4S, v27.4S, v31.s[0] +sub v27.4s, v20.4s, v3.4s +ldr q15, [x0, #992] +sqrdmulh v13.4S, v24.4S, v7.s[1] +add v20.4s, v20.4s, v3.4s +mla v6.4S, v29.4S, v31.s[0] +ldr q29, [x0, #928] +sqrdmulh v3.4S, v30.4S, v7.s[2] +sub v11.4s, v23.4s, v9.4s +mla v10.4S, v22.4S, v31.s[0] +ldr q22, [x0, #800] +sqrdmulh v17.4S, v20.4S, v14.s[1] +add v23.4s, v23.4s, v9.4s +str q11, [x0, #528] +mla v4.4S, v1.4S, v31.s[0] +ldr q1, [x17, #+512] +ldr q11, [x17, #+528] +sqrdmulh v9.4S, v27.4S, v14.s[2] +sub v25.4s, v26.4s, v6.4s +str q23, [x0, #512] +mul v24.4S, v24.4S,v21.s[1] +add v26.4s, v26.4s, v6.4s +mul v30.4S, v30.4S,v21.s[2] +str q25, [x0, #560] +mla v24.4S, v13.4S, v31.s[0] +sub v13.4s, v2.4s, v10.4s +mla v30.4S, v3.4S, v31.s[0] +str q26, [x0, #544] +mul v20.4S, v20.4S,v28.s[1] +str q13, [x0, #592] +mul v27.4S, v27.4S,v28.s[2] +add v2.4s, v2.4s, v10.4s +str q2, [x0, #576] +mla v20.4S, v17.4S, v31.s[0] +sub v17.4s, v5.4s, v4.4s +str q17, [x0, #624] +mla v27.4S, v9.4S, v31.s[0] +add v5.4s, v5.4s, v4.4s +str q5, [x0, #608] +sqrdmulh v14.4S, v22.4S, v11.s[0] +sub v28.4s, v19.4s, v24.4s +mul v22.4S, v22.4S,v1.s[0] +str q28, [x0, #656] +ldr q28, [x0, #816] +sqrdmulh v5.4S, v28.4S, v11.s[0] +add v19.4s, v19.4s, v24.4s +mul v28.4S, v28.4S,v1.s[0] +str q19, [x0, #640] +ldr q19, [x17, #+544] +ldr q24, [x17, #+560] +ldr q4, [x0, #864] +sqrdmulh v9.4S, v4.4S, v24.s[0] 
+sub v17.4s, v18.4s, v30.4s +mul v4.4S, v4.4S,v19.s[0] +str q17, [x0, #688] +ldr q17, [x0, #880] +sqrdmulh v2.4S, v17.4S, v24.s[0] +add v18.4s, v18.4s, v30.4s +mul v17.4S, v17.4S,v19.s[0] +str q18, [x0, #672] +ldr q18, [x17, #+576] +ldr q30, [x17, #+592] +mla v22.4S, v14.4S, v31.s[0] +sub v14.4s, v16.4s, v20.4s +sqrdmulh v10.4S, v29.4S, v30.s[0] +str q14, [x0, #720] +ldr q14, [x0, #944] +mla v28.4S, v5.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v14.4S, v30.s[0] +str q16, [x0, #704] +ldr q16, [x17, #+608] +ldr q5, [x17, #+624] +mla v4.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v27.4s +sqrdmulh v13.4S, v15.4S, v5.s[0] +str q9, [x0, #752] +ldr q9, [x0, #1008] +mla v17.4S, v2.4S, v31.s[0] +add v8.4s, v8.4s, v27.4s +sqrdmulh v27.4S, v9.4S, v5.s[0] +str q8, [x0, #736] +ldr q8, [x0, #768] +ldr q2, [x0, #896] +mul v29.4S, v29.4S,v18.s[0] +sub v7.4s, v8.4s, v22.4s +ldr q21, [x0, #784] +mul v14.4S, v14.4S,v18.s[0] +add v8.4s, v8.4s, v22.4s +ldr q22, [x0, #912] +mla v29.4S, v10.4S, v31.s[0] +sub v10.4s, v21.4s, v28.4s +ldr q26, [x0, #832] +mla v14.4S, v20.4S, v31.s[0] +add v21.4s, v21.4s, v28.4s +ldr q28, [x0, #960] +mul v15.4S, v15.4S,v16.s[0] +sub v20.4s, v26.4s, v4.4s +ldr q3, [x0, #848] +mul v9.4S, v9.4S,v16.s[0] +add v26.4s, v26.4s, v4.4s +ldr q4, [x0, #976] +mla v15.4S, v13.4S, v31.s[0] +mla v9.4S, v27.4S, v31.s[0] +sub v27.4s, v3.4s, v17.4s +sqrdmulh v13.4S, v21.4S, v11.s[1] +add v3.4s, v3.4s, v17.4s +mul v21.4S, v21.4S,v1.s[1] +sqrdmulh v17.4S, v10.4S, v11.s[2] +sub v25.4s, v2.4s, v29.4s +mul v10.4S, v10.4S,v1.s[2] +add v2.4s, v2.4s, v29.4s +sqrdmulh v11.4S, v3.4S, v24.s[1] +sub v1.4s, v22.4s, v14.4s +mul v3.4S, v3.4S,v19.s[1] +add v22.4s, v22.4s, v14.4s +sqrdmulh v14.4S, v27.4S, v24.s[2] +sub v29.4s, v28.4s, v15.4s +mul v27.4S, v27.4S,v19.s[2] +add v28.4s, v28.4s, v15.4s +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v4.4s, v9.4s +sqrdmulh v24.4S, v22.4S, v30.s[1] +add v4.4s, v4.4s, v9.4s +mla v10.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v1.4S, v30.s[2] +sub 
v9.4s, v8.4s, v21.4s +mla v3.4S, v11.4S, v31.s[0] +sqrdmulh v11.4S, v4.4S, v5.s[1] +add v8.4s, v8.4s, v21.4s +str q9, [x0, #784] +mla v27.4S, v14.4S, v31.s[0] +sqrdmulh v14.4S, v13.4S, v5.s[2] +sub v9.4s, v7.4s, v10.4s +str q8, [x0, #768] +mul v22.4S, v22.4S,v18.s[1] +add v7.4s, v7.4s, v10.4s +mul v1.4S, v1.4S,v18.s[2] +str q9, [x0, #816] +mla v22.4S, v24.4S, v31.s[0] +sub v24.4s, v26.4s, v3.4s +mla v1.4S, v17.4S, v31.s[0] +str q7, [x0, #800] +mul v4.4S, v4.4S,v16.s[1] +str q24, [x0, #848] +mul v13.4S, v13.4S,v16.s[2] +add v26.4s, v26.4s, v3.4s +str q26, [x0, #832] +mla v4.4S, v11.4S, v31.s[0] +sub v11.4s, v20.4s, v27.4s +str q11, [x0, #880] +mla v13.4S, v14.4S, v31.s[0] +add v20.4s, v20.4s, v27.4s +str q20, [x0, #864] +sub v5.4s, v2.4s, v22.4s +str q5, [x0, #912] +add v2.4s, v2.4s, v22.4s +str q2, [x0, #896] +sub v2.4s, v25.4s, v1.4s +str q2, [x0, #944] +add v25.4s, v25.4s, v1.4s +str q25, [x0, #928] +sub v25.4s, v28.4s, v4.4s +str q25, [x0, #976] +add v28.4s, v28.4s, v4.4s +str q28, [x0, #960] +sub v28.4s, v29.4s, v13.4s +str q28, [x0, #1008] +add v29.4s, v29.4s, v13.4s +str q29, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1520 +// Instruction count: 1516 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_8.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_8.s new file mode 100644 index 0000000..a3ac527 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_8.s @@ -0,0 +1,1550 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of 
charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include // NOTE(review): header name missing after #include (possibly lost in extraction) - restore from upstream; presumably it provides ASM_LOAD +modulus: // Read with ldr q31 in the routine below; lane 0 is the by-element operand of every mla there +.word -33556993 // Negated modulus, q = 33556993 +.word 0 +.word 0 +.word 0 +.align 6 // 64-byte boundary (.align takes a power of two on AArch64) +roots_merged: // Groups of 4 words, one q-register load each. In the visible code the groups alternate: mul operands, then sqrdmulh operands for the same layer/block (NOTE(review): presumably true for every group - confirm) +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Zero filler: 4th lane, no layer/block +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Zero filler: 4th lane, no layer/block +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Zero filler: 4th lane, no layer/block +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Zero filler: 4th lane, no layer/block +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Zero filler: 4th lane, no layer/block +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Zero filler: 4th lane, no layer/block +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Zero filler: 4th lane, no layer/block +.word 
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Zero filler: 4th lane, no layer/block +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Zero filler: 4th lane, no layer/block +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Zero filler: 4th lane, no layer/block +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Zero filler: 4th lane, no layer/block +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Zero filler: 4th lane, no layer/block +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Zero filler: 4th lane, no layer/block +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Zero filler: 4th lane, no layer/block +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Zero filler: 4th lane, no layer/block +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Zero filler: 4th lane, no layer/block +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Zero filler: 4th lane, no layer/block +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Zero filler: 4th lane, no layer/block +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Zero filler: 4th lane, no layer/block +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Zero filler: 4th lane, no layer/block +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 
+.word 0 // Zero filler: 4th lane, no layer/block +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Zero filler: 4th lane, no layer/block +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Zero filler: 4th lane, no layer/block +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Zero filler: 4th lane, no layer/block +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Zero filler: 4th lane, no layer/block +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Zero filler: 4th lane, no layer/block +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Zero filler: 4th lane, no layer/block +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Zero filler: 4th lane, no layer/block +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Zero filler: 4th lane, no layer/block +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Zero filler: 4th lane, no layer/block +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Zero filler: 4th lane, no layer/block +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Zero filler: 4th lane, no layer/block +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Zero filler: 4th lane, no layer/block +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Zero filler: 4th lane, no layer/block +.text +.global 
ntt_u32_incomplete_neon_asm_var_4_2_22_z4_8 +.global _ntt_u32_incomplete_neon_asm_var_4_2_22_z4_8 +ntt_u32_incomplete_neon_asm_var_4_2_22_z4_8: +_ntt_u32_incomplete_neon_asm_var_4_2_22_z4_8: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x0, #992] +sqrdmulh v27.4S, v28.4S, v29.s[0] +mul v28.4S, v28.4S,v30.s[0] +ldr q26, [x0, #928] +sqrdmulh v25.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +ldr q24, [x0, #864] +sqrdmulh v23.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +ldr q22, [x0, #800] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #736] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mla v28.4S, v27.4S, v31.s[0] +ldr q27, [x0, #672] +sqrdmulh v18.4S, v27.4S, v29.s[0] +mla v26.4S, v25.4S, v31.s[0] +ldr q25, [x0, #608] +sqrdmulh v17.4S, v25.4S, v29.s[0] +mla v24.4S, v23.4S, v31.s[0] +ldr q23, [x0, #544] +sqrdmulh v16.4S, v23.4S, v29.s[0] +mla v22.4S, v21.4S, v31.s[0] +ldr q21, [x0, #480] +mul v27.4S, v27.4S,v30.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q3, [x0, #416] +ldr q2, [x0, #352] +ldr q1, [x0, #288] +mla v27.4S, v18.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +ldr q19, [x0, #224] +ldr q18, [x0, #160] +mul v23.4S, v23.4S,v30.s[0] +mul v25.4S, v25.4S,v30.s[0] +ldr q0, [x0, #96] +ldr q15, [x0, #32] +mla v23.4S, v16.4S, v31.s[0] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +sqrdmulh v28.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v16.4s, v3.4s, v26.4s +add v3.4s, v3.4s, v26.4s +sqrdmulh v26.4S, v16.4S, v29.s[2] +mul v16.4S, 
v16.4S,v30.s[2] +sub v14.4s, v2.4s, v24.4s +add v2.4s, v2.4s, v24.4s +sqrdmulh v24.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v13.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v12.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +sqrdmulh v20.4S, v14.4S, v29.s[2] +mla v17.4S, v28.4S, v31.s[0] +sub v28.4s, v18.4s, v27.4s +add v18.4s, v18.4s, v27.4s +sqrdmulh v27.4S, v13.4S, v29.s[2] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v0.4s, v25.4s +add v0.4s, v0.4s, v25.4s +sqrdmulh v25.4S, v2.4S, v29.s[1] +mla v21.4S, v24.4S, v31.s[0] +sub v24.4s, v15.4s, v23.4s +sqrdmulh v11.4S, v1.4S, v29.s[1] +mla v3.4S, v22.4S, v31.s[0] +add v15.4s, v15.4s, v23.4s +ldr q23, [x17, #+32] +ldr q22, [x17, #+48] +mul v13.4S, v13.4S,v30.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v10.4s, v12.4s, v17.4s +add v12.4s, v12.4s, v17.4s +mla v13.4S, v27.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +sub v20.4s, v28.4s, v16.4s +add v28.4s, v28.4s, v16.4s +mul v1.4S, v1.4S,v30.s[1] +mul v2.4S, v2.4S,v30.s[1] +sub v16.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +mla v1.4S, v11.4S, v31.s[0] +mla v2.4S, v25.4S, v31.s[0] +sub v25.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v22.s[3] +mul v10.4S, v10.4S,v23.s[3] +sub v11.4s, v26.4s, v14.4s +add v26.4s, v26.4s, v14.4s +sqrdmulh v14.4S, v12.4S, v22.s[2] +mul v12.4S, v12.4S,v23.s[2] +sub v21.4s, v24.4s, v13.4s +add v24.4s, v24.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v22.s[1] +mul v16.4S, v16.4S,v23.s[1] +sub v27.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v17.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s +ldr q1, [x17, #+96] +ldr q9, [x17, #+112] +sqrdmulh v8.4S, v20.4S, v22.s[3] +mla v10.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v28.4S, v22.s[2] +mla v12.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v25.4S, v22.s[1] +mla v16.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v18.4S, v22.s[0] +mla 
v19.4S, v2.4S, v31.s[0] +nop +nop +ldr q2, [x17, #+64] +ldr q7, [x17, #+80] +mul v28.4S, v28.4S,v23.s[2] +mul v20.4S, v20.4S,v23.s[3] +sub v6.4s, v11.4s, v10.4s +add v11.4s, v11.4s, v10.4s +mla v28.4S, v3.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v26.4s, v12.4s +add v26.4s, v26.4s, v12.4s +mul v18.4S, v18.4S,v23.s[0] +mul v25.4S, v25.4S,v23.s[1] +sub v12.4s, v27.4s, v16.4s +add v27.4s, v27.4s, v16.4s +mla v18.4S, v13.4S, v31.s[0] +mla v25.4S, v14.4S, v31.s[0] +sub v14.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v9.s[3] +mul v6.4S, v6.4S,v1.s[3] +sub v13.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v11.4S, v9.s[2] +mul v11.4S, v11.4S,v1.s[2] +sub v16.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +sqrdmulh v28.4S, v8.4S, v9.s[1] +mul v8.4S, v8.4S,v1.s[1] +sub v3.4s, v17.4s, v25.4s +add v17.4s, v17.4s, v25.4s +sqrdmulh v25.4S, v26.4S, v9.s[0] +mul v26.4S, v26.4S,v1.s[0] +sub v10.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v7.s[3] +mla v6.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v27.4S, v7.s[2] +mla v11.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v14.4S, v7.s[1] +mla v8.4S, v28.4S, v31.s[0] +nop +nop +sqrdmulh v28.4S, v0.4S, v7.s[0] +mla v26.4S, v25.4S, v31.s[0] +nop +nop +mul v27.4S, v27.4S,v2.s[2] +mul v12.4S, v12.4S,v2.s[3] +sub v25.4s, v13.4s, v6.4s +str q25, [x0, #992] +mla v27.4S, v19.4S, v31.s[0] +mla v12.4S, v18.4S, v31.s[0] +add v13.4s, v13.4s, v6.4s +str q13, [x0, #928] +mul v0.4S, v0.4S,v2.s[0] +mul v14.4S, v14.4S,v2.s[1] +sub v13.4s, v21.4s, v11.4s +str q13, [x0, #864] +mla v0.4S, v28.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sub v11.4s, v16.4s, v8.4s +ldr q20, [x0, #1008] +sqrdmulh v28.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v8.4s +str q21, [x0, #800] +ldr q21, [x0, #944] +sqrdmulh v8.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v13.4s, v24.4s, v26.4s +str q11, [x0, #736] +ldr q11, [x0, 
#880] +sqrdmulh v6.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +add v24.4s, v24.4s, v26.4s +str q16, [x0, #672] +ldr q16, [x0, #816] +sqrdmulh v26.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v18.4s, v3.4s, v12.4s +str q13, [x0, #608] +ldr q13, [x0, #752] +sqrdmulh v19.4S, v13.4S, v29.s[0] +mla v20.4S, v28.4S, v31.s[0] +add v3.4s, v3.4s, v12.4s +str q24, [x0, #544] +ldr q24, [x0, #688] +sqrdmulh v12.4S, v24.4S, v29.s[0] +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v17.4s, v27.4s +str q18, [x0, #480] +ldr q18, [x0, #624] +sqrdmulh v28.4S, v18.4S, v29.s[0] +mla v11.4S, v6.4S, v31.s[0] +add v17.4s, v17.4s, v27.4s +str q3, [x0, #416] +ldr q3, [x0, #560] +sqrdmulh v27.4S, v3.4S, v29.s[0] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v10.4s, v14.4s +str q8, [x0, #352] +ldr q8, [x0, #496] +add v10.4s, v10.4s, v14.4s +mul v24.4S, v24.4S,v30.s[0] +mul v13.4S, v13.4S,v30.s[0] +ldr q14, [x0, #432] +str q17, [x0, #288] +ldr q17, [x0, #368] +ldr q6, [x0, #304] +mla v24.4S, v12.4S, v31.s[0] +mla v13.4S, v19.4S, v31.s[0] +str q26, [x0, #224] +sub v26.4s, v15.4s, v0.4s +ldr q19, [x0, #240] +ldr q12, [x0, #176] +mul v3.4S, v3.4S,v30.s[0] +mul v18.4S, v18.4S,v30.s[0] +str q10, [x0, #160] +add v15.4s, v15.4s, v0.4s +ldr q0, [x0, #112] +ldr q10, [x0, #48] +mla v3.4S, v27.4S, v31.s[0] +mla v18.4S, v28.4S, v31.s[0] +sub v28.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +sqrdmulh v20.4S, v28.4S, v29.s[2] +mul v28.4S, v28.4S,v30.s[2] +sub v27.4s, v14.4s, v21.4s +add v14.4s, v14.4s, v21.4s +sqrdmulh v21.4S, v27.4S, v29.s[2] +mul v27.4S, v27.4S,v30.s[2] +sub v25.4s, v17.4s, v11.4s +add v17.4s, v17.4s, v11.4s +sqrdmulh v11.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v5.4s, v6.4s, v16.4s +add v6.4s, v6.4s, v16.4s +sqrdmulh v16.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +sub v4.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +sqrdmulh v13.4S, v25.4S, v29.s[2] +mla v28.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v24.4s +add v12.4s, v12.4s, v24.4s +sqrdmulh v24.4S, v5.4S, 
v29.s[2] +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v18.4s +add v0.4s, v0.4s, v18.4s +sqrdmulh v18.4S, v17.4S, v29.s[1] +mla v8.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v3.4s +str q26, [x0, #96] +sqrdmulh v26.4S, v6.4S, v29.s[1] +mla v14.4S, v16.4S, v31.s[0] +add v10.4s, v10.4s, v3.4s +str q15, [x0, #32] +mul v5.4S, v5.4S,v30.s[2] +mul v25.4S, v25.4S,v30.s[2] +sub v15.4s, v4.4s, v28.4s +add v4.4s, v4.4s, v28.4s +mla v5.4S, v24.4S, v31.s[0] +mla v25.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v27.4s +add v20.4s, v20.4s, v27.4s +mul v6.4S, v6.4S,v30.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v27.4s, v19.4s, v8.4s +add v19.4s, v19.4s, v8.4s +mla v6.4S, v26.4S, v31.s[0] +mla v17.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v14.4s +add v12.4s, v12.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v22.s[3] +mul v15.4S, v15.4S,v23.s[3] +sub v26.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v4.4S, v22.s[2] +mul v4.4S, v4.4S,v23.s[2] +sub v8.4s, v11.4s, v5.4s +add v11.4s, v11.4s, v5.4s +sqrdmulh v5.4S, v27.4S, v22.s[1] +mul v27.4S, v27.4S,v23.s[1] +sub v24.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v28.4s, v10.4s, v6.4s +add v10.4s, v10.4s, v6.4s +sqrdmulh v6.4S, v13.4S, v22.s[3] +mla v15.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v20.4S, v22.s[2] +mla v4.4S, v25.4S, v31.s[0] +nop +nop +sqrdmulh v25.4S, v18.4S, v22.s[1] +mla v27.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v12.4S, v22.s[0] +mla v19.4S, v17.4S, v31.s[0] +nop +nop +mul v20.4S, v20.4S,v23.s[2] +mul v13.4S, v13.4S,v23.s[3] +sub v17.4s, v26.4s, v15.4s +add v26.4s, v26.4s, v15.4s +mla v20.4S, v14.4S, v31.s[0] +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v21.4s, v4.4s +add v21.4s, v21.4s, v4.4s +mul v12.4S, v12.4S,v23.s[0] +mul v18.4S, v18.4S,v23.s[1] +sub v4.4s, v24.4s, v27.4s +add v24.4s, v24.4s, v27.4s +mla v12.4S, v5.4S, v31.s[0] +mla v18.4S, v25.4S, v31.s[0] +sub v25.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, 
v17.4S, v9.s[3] +mul v17.4S, v17.4S,v1.s[3] +sub v5.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +sqrdmulh v13.4S, v26.4S, v9.s[2] +mul v26.4S, v26.4S,v1.s[2] +sub v27.4s, v11.4s, v20.4s +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v6.4S, v9.s[1] +mul v6.4S, v6.4S,v1.s[1] +sub v14.4s, v28.4s, v18.4s +add v28.4s, v28.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v9.s[0] +mul v21.4S, v21.4S,v1.s[0] +sub v15.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v4.4S, v7.s[3] +mla v17.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v24.4S, v7.s[2] +mla v26.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v25.4S, v7.s[1] +mla v6.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v0.4S, v7.s[0] +mla v21.4S, v18.4S, v31.s[0] +nop +nop +mul v24.4S, v24.4S,v2.s[2] +mul v4.4S, v4.4S,v2.s[3] +sub v18.4s, v5.4s, v17.4s +str q18, [x0, #1008] +mla v24.4S, v19.4S, v31.s[0] +mla v4.4S, v12.4S, v31.s[0] +add v5.4s, v5.4s, v17.4s +str q5, [x0, #944] +mul v0.4S, v0.4S,v2.s[0] +mul v25.4S, v25.4S,v2.s[1] +sub v5.4s, v8.4s, v26.4s +str q5, [x0, #880] +mla v0.4S, v20.4S, v31.s[0] +mla v25.4S, v13.4S, v31.s[0] +add v8.4s, v8.4s, v26.4s +sub v26.4s, v27.4s, v6.4s +ldr q13, [x0, #960] +sqrdmulh v20.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +add v27.4s, v27.4s, v6.4s +str q8, [x0, #816] +ldr q8, [x0, #896] +sqrdmulh v6.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v5.4s, v11.4s, v21.4s +str q26, [x0, #752] +ldr q26, [x0, #832] +sqrdmulh v17.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +add v11.4s, v11.4s, v21.4s +str q27, [x0, #688] +ldr q27, [x0, #768] +sqrdmulh v21.4S, v27.4S, v29.s[0] +mul v27.4S, v27.4S,v30.s[0] +sub v12.4s, v14.4s, v4.4s +str q5, [x0, #624] +ldr q5, [x0, #704] +sqrdmulh v19.4S, v5.4S, v29.s[0] +mla v13.4S, v20.4S, v31.s[0] +add v14.4s, v14.4s, v4.4s +str q11, [x0, #560] +ldr q11, [x0, #640] +sqrdmulh v4.4S, v11.4S, v29.s[0] +mla v8.4S, v6.4S, v31.s[0] +sub v6.4s, v28.4s, v24.4s +str q12, [x0, #496] +ldr q12, [x0, #576] +sqrdmulh v20.4S, v12.4S, 
v29.s[0] +mla v26.4S, v17.4S, v31.s[0] +add v28.4s, v28.4s, v24.4s +str q14, [x0, #432] +ldr q14, [x0, #512] +sqrdmulh v24.4S, v14.4S, v29.s[0] +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v25.4s +str q6, [x0, #368] +ldr q6, [x0, #448] +add v15.4s, v15.4s, v25.4s +mul v11.4S, v11.4S,v30.s[0] +mul v5.4S, v5.4S,v30.s[0] +ldr q25, [x0, #384] +str q28, [x0, #304] +ldr q28, [x0, #320] +ldr q17, [x0, #256] +mla v11.4S, v4.4S, v31.s[0] +mla v5.4S, v19.4S, v31.s[0] +str q21, [x0, #240] +sub v21.4s, v10.4s, v0.4s +ldr q19, [x0, #192] +ldr q4, [x0, #128] +mul v14.4S, v14.4S,v30.s[0] +mul v12.4S, v12.4S,v30.s[0] +str q15, [x0, #176] +add v10.4s, v10.4s, v0.4s +ldr q0, [x0, #64] +ldr q15, [x0, #0] +mla v14.4S, v24.4S, v31.s[0] +mla v12.4S, v20.4S, v31.s[0] +sub v20.4s, v6.4s, v13.4s +add v6.4s, v6.4s, v13.4s +sqrdmulh v13.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v24.4s, v25.4s, v8.4s +add v25.4s, v25.4s, v8.4s +sqrdmulh v8.4S, v24.4S, v29.s[2] +mul v24.4S, v24.4S,v30.s[2] +sub v18.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sqrdmulh v26.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +sub v3.4s, v17.4s, v27.4s +add v17.4s, v17.4s, v27.4s +sqrdmulh v27.4S, v25.4S, v29.s[1] +mul v25.4S, v25.4S,v30.s[1] +sub v16.4s, v19.4s, v5.4s +add v19.4s, v19.4s, v5.4s +sqrdmulh v5.4S, v18.4S, v29.s[2] +mla v20.4S, v13.4S, v31.s[0] +sub v13.4s, v4.4s, v11.4s +add v4.4s, v4.4s, v11.4s +sqrdmulh v11.4S, v3.4S, v29.s[2] +mla v24.4S, v8.4S, v31.s[0] +sub v8.4s, v0.4s, v12.4s +add v0.4s, v0.4s, v12.4s +sqrdmulh v12.4S, v28.4S, v29.s[1] +mla v6.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v14.4s +str q21, [x0, #112] +sqrdmulh v21.4S, v17.4S, v29.s[1] +mla v25.4S, v27.4S, v31.s[0] +add v15.4s, v15.4s, v14.4s +str q10, [x0, #48] +mul v3.4S, v3.4S,v30.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v10.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +mla v3.4S, v11.4S, v31.s[0] +mla v18.4S, v5.4S, v31.s[0] +sub v5.4s, v13.4s, v24.4s +add v13.4s, v13.4s, v24.4s +mul v17.4S, v17.4S,v30.s[1] 
+mul v28.4S, v28.4S,v30.s[1] +sub v24.4s, v19.4s, v6.4s +add v19.4s, v19.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v28.4S, v12.4S, v31.s[0] +sub v12.4s, v4.4s, v25.4s +add v4.4s, v4.4s, v25.4s +sqrdmulh v25.4S, v10.4S, v22.s[3] +mul v10.4S, v10.4S,v23.s[3] +sub v21.4s, v8.4s, v18.4s +add v8.4s, v8.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v22.s[2] +mul v16.4S, v16.4S,v23.s[2] +sub v6.4s, v26.4s, v3.4s +add v26.4s, v26.4s, v3.4s +sqrdmulh v3.4S, v24.4S, v22.s[1] +mul v24.4S, v24.4S,v23.s[1] +sub v11.4s, v0.4s, v28.4s +add v0.4s, v0.4s, v28.4s +sqrdmulh v28.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v20.4s, v15.4s, v17.4s +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v22.s[3] +mla v10.4S, v25.4S, v31.s[0] +nop +nop +sqrdmulh v25.4S, v13.4S, v22.s[2] +mla v16.4S, v18.4S, v31.s[0] +nop +nop +sqrdmulh v18.4S, v12.4S, v22.s[1] +mla v24.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v4.4S, v22.s[0] +mla v19.4S, v28.4S, v31.s[0] +nop +nop +mul v13.4S, v13.4S,v23.s[2] +mul v5.4S, v5.4S,v23.s[3] +sub v28.4s, v21.4s, v10.4s +add v21.4s, v21.4s, v10.4s +mla v13.4S, v25.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +sub v17.4s, v8.4s, v16.4s +add v8.4s, v8.4s, v16.4s +mul v4.4S, v4.4S,v23.s[0] +mul v12.4S, v12.4S,v23.s[1] +sub v16.4s, v11.4s, v24.4s +add v11.4s, v11.4s, v24.4s +mla v4.4S, v3.4S, v31.s[0] +mla v12.4S, v18.4S, v31.s[0] +sub v18.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v28.4S, v9.s[3] +mul v28.4S, v28.4S,v1.s[3] +sub v3.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v21.4S, v9.s[2] +mul v21.4S, v21.4S,v1.s[2] +sub v24.4s, v26.4s, v13.4s +add v26.4s, v26.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v9.s[1] +mul v17.4S, v17.4S,v1.s[1] +sub v25.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +sqrdmulh v12.4S, v8.4S, v9.s[0] +mul v8.4S, v8.4S,v1.s[0] +sub v10.4s, v15.4s, v4.4s +add v15.4s, v15.4s, v4.4s +sqrdmulh v4.4S, v16.4S, v7.s[3] +mla v28.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v11.4S, v7.s[2] +mla v21.4S, v5.4S, 
v31.s[0] +nop +nop +sqrdmulh v5.4S, v18.4S, v7.s[1] +mla v17.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v0.4S, v7.s[0] +mla v8.4S, v12.4S, v31.s[0] +nop +nop +mul v11.4S, v11.4S,v2.s[2] +mul v16.4S, v16.4S,v2.s[3] +sub v12.4s, v3.4s, v28.4s +str q12, [x0, #960] +mla v11.4S, v19.4S, v31.s[0] +mla v16.4S, v4.4S, v31.s[0] +add v3.4s, v3.4s, v28.4s +str q3, [x0, #896] +mul v0.4S, v0.4S,v2.s[0] +mul v18.4S, v18.4S,v2.s[1] +sub v3.4s, v6.4s, v21.4s +str q3, [x0, #832] +mla v0.4S, v13.4S, v31.s[0] +mla v18.4S, v5.4S, v31.s[0] +add v6.4s, v6.4s, v21.4s +sub v21.4s, v24.4s, v17.4s +ldr q5, [x0, #976] +sqrdmulh v13.4S, v5.4S, v29.s[0] +mul v5.4S, v5.4S,v30.s[0] +add v24.4s, v24.4s, v17.4s +str q6, [x0, #768] +ldr q6, [x0, #912] +sqrdmulh v17.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v3.4s, v26.4s, v8.4s +str q21, [x0, #704] +ldr q21, [x0, #848] +sqrdmulh v28.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +add v26.4s, v26.4s, v8.4s +str q24, [x0, #640] +ldr q24, [x0, #784] +sqrdmulh v8.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +sub v4.4s, v25.4s, v16.4s +str q3, [x0, #576] +ldr q3, [x0, #720] +sqrdmulh v19.4S, v3.4S, v29.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v25.4s, v25.4s, v16.4s +str q26, [x0, #512] +ldr q26, [x0, #656] +sqrdmulh v16.4S, v26.4S, v29.s[0] +mla v6.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v11.4s +str q4, [x0, #448] +ldr q4, [x0, #592] +sqrdmulh v13.4S, v4.4S, v29.s[0] +mla v21.4S, v28.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q25, [x0, #384] +ldr q25, [x0, #528] +sqrdmulh v11.4S, v25.4S, v29.s[0] +mla v24.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v18.4s +str q17, [x0, #320] +ldr q17, [x0, #464] +add v10.4s, v10.4s, v18.4s +mul v26.4S, v26.4S,v30.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q18, [x0, #400] +str q20, [x0, #256] +ldr q20, [x0, #336] +ldr q28, [x0, #272] +mla v26.4S, v16.4S, v31.s[0] +mla v3.4S, v19.4S, v31.s[0] +str q8, [x0, #192] +sub v8.4s, v15.4s, v0.4s +ldr q19, [x0, #208] +ldr q16, [x0, #144] +mul v25.4S, 
v25.4S,v30.s[0] +mul v4.4S, v4.4S,v30.s[0] +str q10, [x0, #128] +add v15.4s, v15.4s, v0.4s +ldr q0, [x0, #80] +ldr q10, [x0, #16] +mla v25.4S, v11.4S, v31.s[0] +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v5.4s +add v17.4s, v17.4s, v5.4s +sqrdmulh v5.4S, v13.4S, v29.s[2] +mul v13.4S, v13.4S,v30.s[2] +sub v11.4s, v18.4s, v6.4s +add v18.4s, v18.4s, v6.4s +sqrdmulh v6.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v12.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v28.4s, v24.4s +add v28.4s, v28.4s, v24.4s +sqrdmulh v24.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v27.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sqrdmulh v3.4S, v12.4S, v29.s[2] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v16.4s, v26.4s +add v16.4s, v16.4s, v26.4s +sqrdmulh v26.4S, v14.4S, v29.s[2] +mla v11.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v4.4s +add v0.4s, v0.4s, v4.4s +sqrdmulh v4.4S, v20.4S, v29.s[1] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v25.4s +str q8, [x0, #64] +sqrdmulh v8.4S, v28.4S, v29.s[1] +mla v18.4S, v24.4S, v31.s[0] +add v10.4s, v10.4s, v25.4s +str q15, [x0, #0] +mul v14.4S, v14.4S,v30.s[2] +mul v12.4S, v12.4S,v30.s[2] +sub v15.4s, v27.4s, v13.4s +add v27.4s, v27.4s, v13.4s +mla v14.4S, v26.4S, v31.s[0] +mla v12.4S, v3.4S, v31.s[0] +sub v3.4s, v5.4s, v11.4s +add v5.4s, v5.4s, v11.4s +mul v28.4S, v28.4S,v30.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v11.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +mla v28.4S, v8.4S, v31.s[0] +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v16.4s, v18.4s +add v16.4s, v16.4s, v18.4s +sqrdmulh v29.4S, v15.4S, v22.s[3] +mul v15.4S, v15.4S,v23.s[3] +sub v30.4s, v6.4s, v12.4s +add v6.4s, v6.4s, v12.4s +sqrdmulh v12.4S, v27.4S, v22.s[2] +mul v27.4S, v27.4S,v23.s[2] +sub v18.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v22.s[1] +mul v11.4S, v11.4S,v23.s[1] +sub v8.4s, v0.4s, v20.4s +add v0.4s, v0.4s, v20.4s +sqrdmulh v20.4S, 
v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v17.4s, v10.4s, v28.4s +add v10.4s, v10.4s, v28.4s +sqrdmulh v28.4S, v3.4S, v22.s[3] +mla v15.4S, v29.4S, v31.s[0] +nop +nop +sqrdmulh v29.4S, v5.4S, v22.s[2] +mla v27.4S, v12.4S, v31.s[0] +nop +nop +sqrdmulh v12.4S, v4.4S, v22.s[1] +mla v11.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v16.4S, v22.s[0] +mla v19.4S, v20.4S, v31.s[0] +nop +nop +mul v5.4S, v5.4S,v23.s[2] +mul v3.4S, v3.4S,v23.s[3] +sub v20.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +mla v5.4S, v29.4S, v31.s[0] +mla v3.4S, v28.4S, v31.s[0] +sub v28.4s, v6.4s, v27.4s +add v6.4s, v6.4s, v27.4s +mul v16.4S, v16.4S,v23.s[0] +mul v4.4S, v4.4S,v23.s[1] +sub v27.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v16.4S, v14.4S, v31.s[0] +mla v4.4S, v12.4S, v31.s[0] +sub v12.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v22.4S, v20.4S, v9.s[3] +mul v20.4S, v20.4S,v1.s[3] +sub v23.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v30.4S, v9.s[2] +mul v30.4S, v30.4S,v1.s[2] +sub v19.4s, v21.4s, v5.4s +add v21.4s, v21.4s, v5.4s +sqrdmulh v5.4S, v28.4S, v9.s[1] +mul v28.4S, v28.4S,v1.s[1] +sub v14.4s, v17.4s, v4.4s +add v17.4s, v17.4s, v4.4s +sqrdmulh v4.4S, v6.4S, v9.s[0] +mul v6.4S, v6.4S,v1.s[0] +sub v11.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sqrdmulh v9.4S, v27.4S, v7.s[3] +mla v20.4S, v22.4S, v31.s[0] +nop +nop +sqrdmulh v22.4S, v8.4S, v7.s[2] +mla v30.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v12.4S, v7.s[1] +mla v28.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v0.4S, v7.s[0] +mla v6.4S, v4.4S, v31.s[0] +nop +nop +mul v8.4S, v8.4S,v2.s[2] +mul v27.4S, v27.4S,v2.s[3] +sub v4.4s, v23.4s, v20.4s +str q4, [x0, #976] +mla v8.4S, v22.4S, v31.s[0] +mla v27.4S, v9.4S, v31.s[0] +add v23.4s, v23.4s, v20.4s +str q23, [x0, #912] +mul v0.4S, v0.4S,v2.s[0] +mul v12.4S, v12.4S,v2.s[1] +sub v23.4s, v18.4s, v30.4s +str q23, [x0, #848] +mla v0.4S, v5.4S, v31.s[0] +mla v12.4S, v3.4S, v31.s[0] +add v18.4s, v18.4s, v30.4s +sub v30.4s, 
v19.4s, v28.4s +add v19.4s, v19.4s, v28.4s +str q18, [x0, #784] +sub v18.4s, v21.4s, v6.4s +str q30, [x0, #720] +add v21.4s, v21.4s, v6.4s +str q19, [x0, #656] +sub v19.4s, v14.4s, v27.4s +str q18, [x0, #592] +add v14.4s, v14.4s, v27.4s +str q21, [x0, #528] +sub v21.4s, v17.4s, v8.4s +str q19, [x0, #464] +add v17.4s, v17.4s, v8.4s +str q14, [x0, #400] +sub v14.4s, v11.4s, v12.4s +str q21, [x0, #336] +add v11.4s, v11.4s, v12.4s +str q17, [x0, #272] +sub v17.4s, v10.4s, v0.4s +add v10.4s, v10.4s, v0.4s +ldr q24, [x0, #224] +ldr q25, [x0, #160] +ldr q13, [x0, #32] +ldr q26, [x17, #+128] +ldr q15, [x17, #+144] +sqrdmulh v29.4S, v13.4S, v15.s[0] +mul v13.4S, v13.4S,v26.s[0] +ldr q16, [x0, #48] +sqrdmulh v1.4S, v16.4S, v15.s[0] +ldr q4, [x17, #+160] +mul v16.4S, v16.4S,v26.s[0] +ldr q22, [x17, #+176] +ldr q9, [x0, #96] +sqrdmulh v20.4S, v9.4S, v22.s[0] +mul v9.4S, v9.4S,v4.s[0] +ldr q23, [x0, #112] +sqrdmulh v5.4S, v23.4S, v22.s[0] +mul v23.4S, v23.4S,v4.s[0] +ldr q3, [x17, #+192] +mla v13.4S, v29.4S, v31.s[0] +ldr q29, [x17, #+208] +sqrdmulh v2.4S, v25.4S, v29.s[0] +ldr q7, [x0, #176] +mla v16.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v7.4S, v29.s[0] +ldr q28, [x17, #+224] +mla v9.4S, v20.4S, v31.s[0] +ldr q20, [x17, #+240] +sqrdmulh v30.4S, v24.4S, v20.s[0] +ldr q6, [x0, #240] +mla v23.4S, v5.4S, v31.s[0] +sqrdmulh v5.4S, v6.4S, v20.s[0] +ldr q18, [x0, #0] +ldr q27, [x0, #128] +mul v25.4S, v25.4S,v3.s[0] +sub v19.4s, v18.4s, v13.4s +mul v7.4S, v7.4S,v3.s[0] +add v18.4s, v18.4s, v13.4s +mla v25.4S, v2.4S, v31.s[0] +sub v2.4s, v10.4s, v16.4s +ldr q13, [x0, #64] +mla v7.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v16.4s +ldr q16, [x0, #192] +mul v24.4S, v24.4S,v28.s[0] +sub v1.4s, v13.4s, v9.4s +mul v6.4S, v6.4S,v28.s[0] +add v13.4s, v13.4s, v9.4s +mla v24.4S, v30.4S, v31.s[0] +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v17.4s, v23.4s +sqrdmulh v30.4S, v10.4S, v15.s[1] +add v17.4s, v17.4s, v23.4s +mul v10.4S, v10.4S,v26.s[1] +sqrdmulh v23.4S, v2.4S, v15.s[2] +sub v9.4s, v27.4s, 
v25.4s +mul v2.4S, v2.4S,v26.s[2] +add v27.4s, v27.4s, v25.4s +sqrdmulh v15.4S, v17.4S, v22.s[1] +sub v26.4s, v11.4s, v7.4s +mul v17.4S, v17.4S,v4.s[1] +add v11.4s, v11.4s, v7.4s +sqrdmulh v7.4S, v5.4S, v22.s[2] +sub v25.4s, v16.4s, v24.4s +mul v5.4S, v5.4S,v4.s[2] +add v16.4s, v16.4s, v24.4s +mla v10.4S, v30.4S, v31.s[0] +sub v30.4s, v14.4s, v6.4s +ldr q22, [x0, #480] +sqrdmulh v4.4S, v11.4S, v29.s[1] +add v14.4s, v14.4s, v6.4s +mla v2.4S, v23.4S, v31.s[0] +ldr q23, [x0, #416] +sqrdmulh v6.4S, v26.4S, v29.s[2] +sub v24.4s, v18.4s, v10.4s +mla v17.4S, v15.4S, v31.s[0] +ldr q15, [x0, #288] +sqrdmulh v8.4S, v14.4S, v20.s[1] +add v18.4s, v18.4s, v10.4s +str q24, [x0, #16] +mla v5.4S, v7.4S, v31.s[0] +ldr q7, [x17, #+256] +sqrdmulh v24.4S, v30.4S, v20.s[2] +sub v10.4s, v19.4s, v2.4s +str q18, [x0, #0] +mul v11.4S, v11.4S,v3.s[1] +add v19.4s, v19.4s, v2.4s +ldr q2, [x17, #+272] +mul v26.4S, v26.4S,v3.s[2] +str q10, [x0, #48] +mla v11.4S, v4.4S, v31.s[0] +sub v4.4s, v13.4s, v17.4s +mla v26.4S, v6.4S, v31.s[0] +str q19, [x0, #32] +mul v14.4S, v14.4S,v28.s[1] +str q4, [x0, #80] +mul v30.4S, v30.4S,v28.s[2] +add v13.4s, v13.4s, v17.4s +str q13, [x0, #64] +mla v14.4S, v8.4S, v31.s[0] +sub v8.4s, v1.4s, v5.4s +str q8, [x0, #112] +mla v30.4S, v24.4S, v31.s[0] +add v1.4s, v1.4s, v5.4s +str q1, [x0, #96] +sqrdmulh v20.4S, v15.4S, v2.s[0] +sub v28.4s, v27.4s, v11.4s +mul v15.4S, v15.4S,v7.s[0] +str q28, [x0, #144] +ldr q28, [x0, #304] +sqrdmulh v1.4S, v28.4S, v2.s[0] +add v27.4s, v27.4s, v11.4s +ldr q11, [x17, #+288] +mul v28.4S, v28.4S,v7.s[0] +str q27, [x0, #128] +ldr q27, [x17, #+304] +ldr q5, [x0, #352] +sqrdmulh v24.4S, v5.4S, v27.s[0] +sub v8.4s, v9.4s, v26.4s +mul v5.4S, v5.4S,v11.s[0] +str q8, [x0, #176] +ldr q8, [x0, #368] +sqrdmulh v13.4S, v8.4S, v27.s[0] +add v9.4s, v9.4s, v26.4s +mul v8.4S, v8.4S,v11.s[0] +str q9, [x0, #160] +ldr q9, [x17, #+320] +mla v15.4S, v20.4S, v31.s[0] +sub v20.4s, v16.4s, v14.4s +ldr q26, [x17, #+336] +sqrdmulh v17.4S, v23.4S, v26.s[0] +str 
q20, [x0, #208] +ldr q20, [x0, #432] +mla v28.4S, v1.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v20.4S, v26.s[0] +str q16, [x0, #192] +ldr q16, [x17, #+352] +mla v5.4S, v24.4S, v31.s[0] +sub v24.4s, v25.4s, v30.4s +ldr q1, [x17, #+368] +sqrdmulh v4.4S, v22.4S, v1.s[0] +str q24, [x0, #240] +ldr q24, [x0, #496] +mla v8.4S, v13.4S, v31.s[0] +add v25.4s, v25.4s, v30.4s +sqrdmulh v30.4S, v24.4S, v1.s[0] +str q25, [x0, #224] +ldr q25, [x0, #256] +ldr q13, [x0, #384] +mul v23.4S, v23.4S,v9.s[0] +sub v29.4s, v25.4s, v15.4s +ldr q3, [x0, #272] +mul v20.4S, v20.4S,v9.s[0] +add v25.4s, v25.4s, v15.4s +ldr q15, [x0, #400] +mla v23.4S, v17.4S, v31.s[0] +sub v17.4s, v3.4s, v28.4s +ldr q19, [x0, #320] +mla v20.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v28.4s +ldr q28, [x0, #448] +mul v22.4S, v22.4S,v16.s[0] +sub v14.4s, v19.4s, v5.4s +ldr q6, [x0, #336] +mul v24.4S, v24.4S,v16.s[0] +add v19.4s, v19.4s, v5.4s +ldr q5, [x0, #464] +mla v22.4S, v4.4S, v31.s[0] +mla v24.4S, v30.4S, v31.s[0] +sub v30.4s, v6.4s, v8.4s +sqrdmulh v4.4S, v3.4S, v2.s[1] +add v6.4s, v6.4s, v8.4s +mul v3.4S, v3.4S,v7.s[1] +sqrdmulh v8.4S, v17.4S, v2.s[2] +sub v10.4s, v13.4s, v23.4s +mul v17.4S, v17.4S,v7.s[2] +add v13.4s, v13.4s, v23.4s +sqrdmulh v2.4S, v6.4S, v27.s[1] +sub v7.4s, v15.4s, v20.4s +mul v6.4S, v6.4S,v11.s[1] +add v15.4s, v15.4s, v20.4s +sqrdmulh v20.4S, v30.4S, v27.s[2] +sub v23.4s, v28.4s, v22.4s +mul v30.4S, v30.4S,v11.s[2] +add v28.4s, v28.4s, v22.4s +mla v3.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v24.4s +ldr q27, [x0, #736] +sqrdmulh v11.4S, v15.4S, v26.s[1] +add v5.4s, v5.4s, v24.4s +mla v17.4S, v8.4S, v31.s[0] +ldr q8, [x0, #672] +sqrdmulh v24.4S, v7.4S, v26.s[2] +sub v22.4s, v25.4s, v3.4s +mla v6.4S, v2.4S, v31.s[0] +ldr q2, [x0, #544] +sqrdmulh v18.4S, v5.4S, v1.s[1] +add v25.4s, v25.4s, v3.4s +str q22, [x0, #272] +mla v30.4S, v20.4S, v31.s[0] +ldr q20, [x17, #+384] +sqrdmulh v22.4S, v4.4S, v1.s[2] +sub v3.4s, v29.4s, v17.4s +str q25, [x0, #256] +mul v15.4S, v15.4S,v9.s[1] 
+add v29.4s, v29.4s, v17.4s +ldr q17, [x17, #+400] +mul v7.4S, v7.4S,v9.s[2] +str q3, [x0, #304] +mla v15.4S, v11.4S, v31.s[0] +sub v11.4s, v19.4s, v6.4s +mla v7.4S, v24.4S, v31.s[0] +str q29, [x0, #288] +mul v5.4S, v5.4S,v16.s[1] +str q11, [x0, #336] +mul v4.4S, v4.4S,v16.s[2] +add v19.4s, v19.4s, v6.4s +str q19, [x0, #320] +mla v5.4S, v18.4S, v31.s[0] +sub v18.4s, v14.4s, v30.4s +str q18, [x0, #368] +mla v4.4S, v22.4S, v31.s[0] +add v14.4s, v14.4s, v30.4s +str q14, [x0, #352] +sqrdmulh v1.4S, v2.4S, v17.s[0] +sub v16.4s, v13.4s, v15.4s +mul v2.4S, v2.4S,v20.s[0] +str q16, [x0, #400] +ldr q16, [x0, #560] +sqrdmulh v14.4S, v16.4S, v17.s[0] +add v13.4s, v13.4s, v15.4s +ldr q15, [x17, #+416] +mul v16.4S, v16.4S,v20.s[0] +str q13, [x0, #384] +ldr q13, [x17, #+432] +ldr q30, [x0, #608] +sqrdmulh v22.4S, v30.4S, v13.s[0] +sub v18.4s, v10.4s, v7.4s +mul v30.4S, v30.4S,v15.s[0] +str q18, [x0, #432] +ldr q18, [x0, #624] +sqrdmulh v19.4S, v18.4S, v13.s[0] +add v10.4s, v10.4s, v7.4s +mul v18.4S, v18.4S,v15.s[0] +str q10, [x0, #416] +ldr q10, [x17, #+448] +mla v2.4S, v1.4S, v31.s[0] +sub v1.4s, v28.4s, v5.4s +ldr q7, [x17, #+464] +sqrdmulh v6.4S, v8.4S, v7.s[0] +str q1, [x0, #464] +ldr q1, [x0, #688] +mla v16.4S, v14.4S, v31.s[0] +add v28.4s, v28.4s, v5.4s +sqrdmulh v5.4S, v1.4S, v7.s[0] +str q28, [x0, #448] +ldr q28, [x17, #+480] +mla v30.4S, v22.4S, v31.s[0] +sub v22.4s, v23.4s, v4.4s +ldr q14, [x17, #+496] +sqrdmulh v11.4S, v27.4S, v14.s[0] +str q22, [x0, #496] +ldr q22, [x0, #752] +mla v18.4S, v19.4S, v31.s[0] +add v23.4s, v23.4s, v4.4s +sqrdmulh v4.4S, v22.4S, v14.s[0] +str q23, [x0, #480] +ldr q23, [x0, #512] +ldr q19, [x0, #640] +mul v8.4S, v8.4S,v10.s[0] +sub v26.4s, v23.4s, v2.4s +ldr q9, [x0, #528] +mul v1.4S, v1.4S,v10.s[0] +add v23.4s, v23.4s, v2.4s +ldr q2, [x0, #656] +mla v8.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v16.4s +ldr q29, [x0, #576] +mla v1.4S, v5.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +ldr q16, [x0, #704] +mul v27.4S, v27.4S,v28.s[0] +sub v5.4s, v29.4s, 
v30.4s +ldr q24, [x0, #592] +mul v22.4S, v22.4S,v28.s[0] +add v29.4s, v29.4s, v30.4s +ldr q30, [x0, #720] +mla v27.4S, v11.4S, v31.s[0] +mla v22.4S, v4.4S, v31.s[0] +sub v4.4s, v24.4s, v18.4s +sqrdmulh v11.4S, v9.4S, v17.s[1] +add v24.4s, v24.4s, v18.4s +mul v9.4S, v9.4S,v20.s[1] +sqrdmulh v18.4S, v6.4S, v17.s[2] +sub v3.4s, v19.4s, v8.4s +mul v6.4S, v6.4S,v20.s[2] +add v19.4s, v19.4s, v8.4s +sqrdmulh v17.4S, v24.4S, v13.s[1] +sub v20.4s, v2.4s, v1.4s +mul v24.4S, v24.4S,v15.s[1] +add v2.4s, v2.4s, v1.4s +sqrdmulh v1.4S, v4.4S, v13.s[2] +sub v8.4s, v16.4s, v27.4s +mul v4.4S, v4.4S,v15.s[2] +add v16.4s, v16.4s, v27.4s +mla v9.4S, v11.4S, v31.s[0] +sub v11.4s, v30.4s, v22.4s +ldr q13, [x0, #992] +sqrdmulh v15.4S, v2.4S, v7.s[1] +add v30.4s, v30.4s, v22.4s +mla v6.4S, v18.4S, v31.s[0] +ldr q18, [x0, #928] +sqrdmulh v22.4S, v20.4S, v7.s[2] +sub v27.4s, v23.4s, v9.4s +mla v24.4S, v17.4S, v31.s[0] +ldr q17, [x0, #800] +sqrdmulh v25.4S, v30.4S, v14.s[1] +add v23.4s, v23.4s, v9.4s +str q27, [x0, #528] +mla v4.4S, v1.4S, v31.s[0] +ldr q1, [x17, #+512] +sqrdmulh v27.4S, v11.4S, v14.s[2] +sub v9.4s, v26.4s, v6.4s +str q23, [x0, #512] +mul v2.4S, v2.4S,v10.s[1] +add v26.4s, v26.4s, v6.4s +ldr q6, [x17, #+528] +mul v20.4S, v20.4S,v10.s[2] +str q9, [x0, #560] +mla v2.4S, v15.4S, v31.s[0] +sub v15.4s, v29.4s, v24.4s +mla v20.4S, v22.4S, v31.s[0] +str q26, [x0, #544] +mul v30.4S, v30.4S,v28.s[1] +str q15, [x0, #592] +mul v11.4S, v11.4S,v28.s[2] +add v29.4s, v29.4s, v24.4s +str q29, [x0, #576] +mla v30.4S, v25.4S, v31.s[0] +sub v25.4s, v5.4s, v4.4s +str q25, [x0, #624] +mla v11.4S, v27.4S, v31.s[0] +add v5.4s, v5.4s, v4.4s +str q5, [x0, #608] +sqrdmulh v14.4S, v17.4S, v6.s[0] +sub v28.4s, v19.4s, v2.4s +mul v17.4S, v17.4S,v1.s[0] +str q28, [x0, #656] +ldr q28, [x0, #816] +sqrdmulh v5.4S, v28.4S, v6.s[0] +add v19.4s, v19.4s, v2.4s +ldr q2, [x17, #+544] +mul v28.4S, v28.4S,v1.s[0] +str q19, [x0, #640] +ldr q19, [x17, #+560] +ldr q4, [x0, #864] +sqrdmulh v27.4S, v4.4S, v19.s[0] +sub 
v25.4s, v3.4s, v20.4s +mul v4.4S, v4.4S,v2.s[0] +str q25, [x0, #688] +ldr q25, [x0, #880] +sqrdmulh v29.4S, v25.4S, v19.s[0] +add v3.4s, v3.4s, v20.4s +mul v25.4S, v25.4S,v2.s[0] +str q3, [x0, #672] +ldr q3, [x17, #+576] +mla v17.4S, v14.4S, v31.s[0] +sub v14.4s, v16.4s, v30.4s +ldr q20, [x17, #+592] +sqrdmulh v24.4S, v18.4S, v20.s[0] +str q14, [x0, #720] +ldr q14, [x0, #944] +mla v28.4S, v5.4S, v31.s[0] +add v16.4s, v16.4s, v30.4s +sqrdmulh v30.4S, v14.4S, v20.s[0] +str q16, [x0, #704] +ldr q16, [x17, #+608] +mla v4.4S, v27.4S, v31.s[0] +sub v27.4s, v8.4s, v11.4s +ldr q5, [x17, #+624] +sqrdmulh v15.4S, v13.4S, v5.s[0] +str q27, [x0, #752] +ldr q27, [x0, #1008] +mla v25.4S, v29.4S, v31.s[0] +add v8.4s, v8.4s, v11.4s +sqrdmulh v11.4S, v27.4S, v5.s[0] +str q8, [x0, #736] +ldr q8, [x0, #768] +ldr q29, [x0, #896] +mul v18.4S, v18.4S,v3.s[0] +sub v7.4s, v8.4s, v17.4s +ldr q10, [x0, #784] +mul v14.4S, v14.4S,v3.s[0] +add v8.4s, v8.4s, v17.4s +ldr q17, [x0, #912] +mla v18.4S, v24.4S, v31.s[0] +sub v24.4s, v10.4s, v28.4s +ldr q26, [x0, #832] +mla v14.4S, v30.4S, v31.s[0] +add v10.4s, v10.4s, v28.4s +ldr q28, [x0, #960] +mul v13.4S, v13.4S,v16.s[0] +sub v30.4s, v26.4s, v4.4s +ldr q22, [x0, #848] +mul v27.4S, v27.4S,v16.s[0] +add v26.4s, v26.4s, v4.4s +ldr q4, [x0, #976] +mla v13.4S, v15.4S, v31.s[0] +mla v27.4S, v11.4S, v31.s[0] +sub v11.4s, v22.4s, v25.4s +sqrdmulh v15.4S, v10.4S, v6.s[1] +add v22.4s, v22.4s, v25.4s +mul v10.4S, v10.4S,v1.s[1] +sqrdmulh v25.4S, v24.4S, v6.s[2] +sub v9.4s, v29.4s, v18.4s +mul v24.4S, v24.4S,v1.s[2] +add v29.4s, v29.4s, v18.4s +sqrdmulh v6.4S, v22.4S, v19.s[1] +sub v1.4s, v17.4s, v14.4s +mul v22.4S, v22.4S,v2.s[1] +add v17.4s, v17.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v19.s[2] +sub v18.4s, v28.4s, v13.4s +mul v11.4S, v11.4S,v2.s[2] +add v28.4s, v28.4s, v13.4s +mla v10.4S, v15.4S, v31.s[0] +sub v15.4s, v4.4s, v27.4s +sqrdmulh v19.4S, v17.4S, v20.s[1] +add v4.4s, v4.4s, v27.4s +mla v24.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v1.4S, v20.s[2] +sub 
v27.4s, v8.4s, v10.4s +mla v22.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v4.4S, v5.s[1] +add v8.4s, v8.4s, v10.4s +str q27, [x0, #784] +mla v11.4S, v14.4S, v31.s[0] +sqrdmulh v14.4S, v15.4S, v5.s[2] +sub v27.4s, v7.4s, v24.4s +str q8, [x0, #768] +mul v17.4S, v17.4S,v3.s[1] +add v7.4s, v7.4s, v24.4s +mul v1.4S, v1.4S,v3.s[2] +str q27, [x0, #816] +mla v17.4S, v19.4S, v31.s[0] +sub v19.4s, v26.4s, v22.4s +mla v1.4S, v25.4S, v31.s[0] +str q7, [x0, #800] +mul v4.4S, v4.4S,v16.s[1] +str q19, [x0, #848] +mul v15.4S, v15.4S,v16.s[2] +add v26.4s, v26.4s, v22.4s +str q26, [x0, #832] +mla v4.4S, v6.4S, v31.s[0] +sub v6.4s, v30.4s, v11.4s +str q6, [x0, #880] +mla v15.4S, v14.4S, v31.s[0] +add v30.4s, v30.4s, v11.4s +str q30, [x0, #864] +sub v5.4s, v29.4s, v17.4s +str q5, [x0, #912] +add v29.4s, v29.4s, v17.4s +str q29, [x0, #896] +sub v29.4s, v9.4s, v1.4s +str q29, [x0, #944] +add v9.4s, v9.4s, v1.4s +str q9, [x0, #928] +sub v9.4s, v28.4s, v4.4s +str q9, [x0, #976] +add v28.4s, v28.4s, v4.4s +str q28, [x0, #960] +sub v28.4s, v18.4s, v15.4s +str q28, [x0, #1008] +add v18.4s, v18.4s, v15.4s +str q18, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1520 +// Instruction count: 1516 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_9.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_9.s new file mode 100644 index 0000000..6c246f1 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_9.s @@ -0,0 +1,1558 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of 
charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly.
+/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global 
ntt_u32_incomplete_neon_asm_var_4_2_22_z4_9 +.global _ntt_u32_incomplete_neon_asm_var_4_2_22_z4_9 +ntt_u32_incomplete_neon_asm_var_4_2_22_z4_9: +_ntt_u32_incomplete_neon_asm_var_4_2_22_z4_9: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x0, #992] +sqrdmulh v27.4S, v28.4S, v29.s[0] +mul v28.4S, v28.4S,v30.s[0] +ldr q26, [x0, #928] +sqrdmulh v25.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +ldr q24, [x0, #864] +sqrdmulh v23.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +ldr q22, [x0, #800] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #736] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mla v28.4S, v27.4S, v31.s[0] +ldr q27, [x0, #672] +sqrdmulh v18.4S, v27.4S, v29.s[0] +mla v26.4S, v25.4S, v31.s[0] +ldr q25, [x0, #608] +sqrdmulh v17.4S, v25.4S, v29.s[0] +mla v24.4S, v23.4S, v31.s[0] +ldr q23, [x0, #544] +sqrdmulh v16.4S, v23.4S, v29.s[0] +mla v22.4S, v21.4S, v31.s[0] +ldr q21, [x0, #480] +mul v27.4S, v27.4S,v30.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q3, [x0, #416] +ldr q2, [x0, #352] +ldr q1, [x0, #288] +mla v27.4S, v18.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +ldr q19, [x0, #224] +ldr q18, [x0, #160] +mul v23.4S, v23.4S,v30.s[0] +mul v25.4S, v25.4S,v30.s[0] +ldr q0, [x0, #96] +ldr q15, [x0, #32] +mla v23.4S, v16.4S, v31.s[0] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +sqrdmulh v28.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v16.4s, v3.4s, v26.4s +add v3.4s, v3.4s, v26.4s +sqrdmulh v26.4S, v16.4S, v29.s[2] +mul v16.4S, 
v16.4S,v30.s[2] +sub v14.4s, v2.4s, v24.4s +add v2.4s, v2.4s, v24.4s +sqrdmulh v24.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v13.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v12.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +sqrdmulh v20.4S, v14.4S, v29.s[2] +mla v17.4S, v28.4S, v31.s[0] +sub v28.4s, v18.4s, v27.4s +add v18.4s, v18.4s, v27.4s +sqrdmulh v27.4S, v13.4S, v29.s[2] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v0.4s, v25.4s +add v0.4s, v0.4s, v25.4s +sqrdmulh v25.4S, v2.4S, v29.s[1] +mla v21.4S, v24.4S, v31.s[0] +sub v24.4s, v15.4s, v23.4s +sqrdmulh v11.4S, v1.4S, v29.s[1] +mla v3.4S, v22.4S, v31.s[0] +add v15.4s, v15.4s, v23.4s +ldr q23, [x17, #+32] +ldr q22, [x17, #+48] +mul v13.4S, v13.4S,v30.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v10.4s, v12.4s, v17.4s +add v12.4s, v12.4s, v17.4s +mla v13.4S, v27.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +sub v20.4s, v28.4s, v16.4s +add v28.4s, v28.4s, v16.4s +mul v1.4S, v1.4S,v30.s[1] +mul v2.4S, v2.4S,v30.s[1] +sub v16.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +mla v1.4S, v11.4S, v31.s[0] +mla v2.4S, v25.4S, v31.s[0] +sub v25.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v22.s[3] +mul v10.4S, v10.4S,v23.s[3] +sub v11.4s, v26.4s, v14.4s +add v26.4s, v26.4s, v14.4s +sqrdmulh v14.4S, v12.4S, v22.s[2] +mul v12.4S, v12.4S,v23.s[2] +sub v21.4s, v24.4s, v13.4s +add v24.4s, v24.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v22.s[1] +mul v16.4S, v16.4S,v23.s[1] +sub v27.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v17.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s +ldr q1, [x17, #+96] +ldr q9, [x17, #+112] +sqrdmulh v8.4S, v20.4S, v22.s[3] +mla v10.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v28.4S, v22.s[2] +mla v12.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v25.4S, v22.s[1] +mla v16.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v18.4S, v22.s[0] +mla 
v19.4S, v2.4S, v31.s[0] +nop +nop +ldr q2, [x17, #+64] +ldr q7, [x17, #+80] +mul v28.4S, v28.4S,v23.s[2] +mul v20.4S, v20.4S,v23.s[3] +sub v6.4s, v11.4s, v10.4s +add v11.4s, v11.4s, v10.4s +mla v28.4S, v3.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v26.4s, v12.4s +add v26.4s, v26.4s, v12.4s +mul v18.4S, v18.4S,v23.s[0] +mul v25.4S, v25.4S,v23.s[1] +sub v12.4s, v27.4s, v16.4s +add v27.4s, v27.4s, v16.4s +mla v18.4S, v13.4S, v31.s[0] +mla v25.4S, v14.4S, v31.s[0] +sub v14.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v9.s[3] +mul v6.4S, v6.4S,v1.s[3] +sub v13.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v11.4S, v9.s[2] +mul v11.4S, v11.4S,v1.s[2] +sub v16.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +sqrdmulh v28.4S, v8.4S, v9.s[1] +mul v8.4S, v8.4S,v1.s[1] +sub v3.4s, v17.4s, v25.4s +add v17.4s, v17.4s, v25.4s +sqrdmulh v25.4S, v26.4S, v9.s[0] +mul v26.4S, v26.4S,v1.s[0] +sub v10.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v7.s[3] +mla v6.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v27.4S, v7.s[2] +mla v11.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v14.4S, v7.s[1] +mla v8.4S, v28.4S, v31.s[0] +nop +nop +sqrdmulh v28.4S, v0.4S, v7.s[0] +mla v26.4S, v25.4S, v31.s[0] +nop +nop +mul v27.4S, v27.4S,v2.s[2] +mul v12.4S, v12.4S,v2.s[3] +sub v25.4s, v13.4s, v6.4s +str q25, [x0, #992] +mla v27.4S, v19.4S, v31.s[0] +mla v12.4S, v18.4S, v31.s[0] +add v13.4s, v13.4s, v6.4s +str q13, [x0, #928] +mul v0.4S, v0.4S,v2.s[0] +mul v14.4S, v14.4S,v2.s[1] +sub v13.4s, v21.4s, v11.4s +str q13, [x0, #864] +mla v0.4S, v28.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sub v11.4s, v16.4s, v8.4s +ldr q20, [x0, #1008] +sqrdmulh v28.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v8.4s +str q21, [x0, #800] +ldr q21, [x0, #944] +sqrdmulh v8.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v13.4s, v24.4s, v26.4s +str q11, [x0, #736] +ldr q11, [x0, 
#880] +sqrdmulh v6.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +add v24.4s, v24.4s, v26.4s +str q16, [x0, #672] +ldr q16, [x0, #816] +sqrdmulh v26.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v18.4s, v3.4s, v12.4s +str q13, [x0, #608] +ldr q13, [x0, #752] +sqrdmulh v19.4S, v13.4S, v29.s[0] +mla v20.4S, v28.4S, v31.s[0] +add v3.4s, v3.4s, v12.4s +str q24, [x0, #544] +ldr q24, [x0, #688] +sqrdmulh v12.4S, v24.4S, v29.s[0] +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v17.4s, v27.4s +str q18, [x0, #480] +ldr q18, [x0, #624] +sqrdmulh v28.4S, v18.4S, v29.s[0] +mla v11.4S, v6.4S, v31.s[0] +add v17.4s, v17.4s, v27.4s +str q3, [x0, #416] +ldr q3, [x0, #560] +sqrdmulh v27.4S, v3.4S, v29.s[0] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v10.4s, v14.4s +str q8, [x0, #352] +ldr q8, [x0, #496] +add v10.4s, v10.4s, v14.4s +mul v24.4S, v24.4S,v30.s[0] +mul v13.4S, v13.4S,v30.s[0] +ldr q14, [x0, #432] +str q17, [x0, #288] +ldr q17, [x0, #368] +ldr q6, [x0, #304] +mla v24.4S, v12.4S, v31.s[0] +mla v13.4S, v19.4S, v31.s[0] +str q26, [x0, #224] +sub v26.4s, v15.4s, v0.4s +ldr q19, [x0, #240] +ldr q12, [x0, #176] +mul v3.4S, v3.4S,v30.s[0] +mul v18.4S, v18.4S,v30.s[0] +str q10, [x0, #160] +add v15.4s, v15.4s, v0.4s +ldr q0, [x0, #112] +ldr q10, [x0, #48] +mla v3.4S, v27.4S, v31.s[0] +mla v18.4S, v28.4S, v31.s[0] +sub v28.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +sqrdmulh v20.4S, v28.4S, v29.s[2] +mul v28.4S, v28.4S,v30.s[2] +sub v27.4s, v14.4s, v21.4s +add v14.4s, v14.4s, v21.4s +sqrdmulh v21.4S, v27.4S, v29.s[2] +mul v27.4S, v27.4S,v30.s[2] +sub v25.4s, v17.4s, v11.4s +add v17.4s, v17.4s, v11.4s +sqrdmulh v11.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v5.4s, v6.4s, v16.4s +add v6.4s, v6.4s, v16.4s +sqrdmulh v16.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +sub v4.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +sqrdmulh v13.4S, v25.4S, v29.s[2] +mla v28.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v24.4s +add v12.4s, v12.4s, v24.4s +sqrdmulh v24.4S, v5.4S, 
v29.s[2] +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v18.4s +add v0.4s, v0.4s, v18.4s +sqrdmulh v18.4S, v17.4S, v29.s[1] +mla v8.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v3.4s +str q26, [x0, #96] +sqrdmulh v26.4S, v6.4S, v29.s[1] +mla v14.4S, v16.4S, v31.s[0] +add v10.4s, v10.4s, v3.4s +str q15, [x0, #32] +mul v5.4S, v5.4S,v30.s[2] +mul v25.4S, v25.4S,v30.s[2] +sub v15.4s, v4.4s, v28.4s +add v4.4s, v4.4s, v28.4s +mla v5.4S, v24.4S, v31.s[0] +mla v25.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v27.4s +add v20.4s, v20.4s, v27.4s +mul v6.4S, v6.4S,v30.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v27.4s, v19.4s, v8.4s +add v19.4s, v19.4s, v8.4s +mla v6.4S, v26.4S, v31.s[0] +mla v17.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v14.4s +add v12.4s, v12.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v22.s[3] +mul v15.4S, v15.4S,v23.s[3] +sub v26.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v4.4S, v22.s[2] +mul v4.4S, v4.4S,v23.s[2] +sub v8.4s, v11.4s, v5.4s +add v11.4s, v11.4s, v5.4s +sqrdmulh v5.4S, v27.4S, v22.s[1] +mul v27.4S, v27.4S,v23.s[1] +sub v24.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v28.4s, v10.4s, v6.4s +add v10.4s, v10.4s, v6.4s +sqrdmulh v6.4S, v13.4S, v22.s[3] +mla v15.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v20.4S, v22.s[2] +mla v4.4S, v25.4S, v31.s[0] +nop +nop +sqrdmulh v25.4S, v18.4S, v22.s[1] +mla v27.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v12.4S, v22.s[0] +mla v19.4S, v17.4S, v31.s[0] +nop +nop +mul v20.4S, v20.4S,v23.s[2] +mul v13.4S, v13.4S,v23.s[3] +sub v17.4s, v26.4s, v15.4s +add v26.4s, v26.4s, v15.4s +mla v20.4S, v14.4S, v31.s[0] +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v21.4s, v4.4s +add v21.4s, v21.4s, v4.4s +mul v12.4S, v12.4S,v23.s[0] +mul v18.4S, v18.4S,v23.s[1] +sub v4.4s, v24.4s, v27.4s +add v24.4s, v24.4s, v27.4s +mla v12.4S, v5.4S, v31.s[0] +mla v18.4S, v25.4S, v31.s[0] +sub v25.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, 
v17.4S, v9.s[3] +mul v17.4S, v17.4S,v1.s[3] +sub v5.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +sqrdmulh v13.4S, v26.4S, v9.s[2] +mul v26.4S, v26.4S,v1.s[2] +sub v27.4s, v11.4s, v20.4s +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v6.4S, v9.s[1] +mul v6.4S, v6.4S,v1.s[1] +sub v14.4s, v28.4s, v18.4s +add v28.4s, v28.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v9.s[0] +mul v21.4S, v21.4S,v1.s[0] +sub v15.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v4.4S, v7.s[3] +mla v17.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v24.4S, v7.s[2] +mla v26.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v25.4S, v7.s[1] +mla v6.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v0.4S, v7.s[0] +mla v21.4S, v18.4S, v31.s[0] +nop +nop +mul v24.4S, v24.4S,v2.s[2] +mul v4.4S, v4.4S,v2.s[3] +sub v18.4s, v5.4s, v17.4s +str q18, [x0, #1008] +mla v24.4S, v19.4S, v31.s[0] +mla v4.4S, v12.4S, v31.s[0] +add v5.4s, v5.4s, v17.4s +str q5, [x0, #944] +mul v0.4S, v0.4S,v2.s[0] +mul v25.4S, v25.4S,v2.s[1] +sub v5.4s, v8.4s, v26.4s +str q5, [x0, #880] +mla v0.4S, v20.4S, v31.s[0] +mla v25.4S, v13.4S, v31.s[0] +add v8.4s, v8.4s, v26.4s +sub v26.4s, v27.4s, v6.4s +ldr q13, [x0, #960] +sqrdmulh v20.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +add v27.4s, v27.4s, v6.4s +str q8, [x0, #816] +ldr q8, [x0, #896] +sqrdmulh v6.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v5.4s, v11.4s, v21.4s +str q26, [x0, #752] +ldr q26, [x0, #832] +sqrdmulh v17.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +add v11.4s, v11.4s, v21.4s +str q27, [x0, #688] +ldr q27, [x0, #768] +sqrdmulh v21.4S, v27.4S, v29.s[0] +mul v27.4S, v27.4S,v30.s[0] +sub v12.4s, v14.4s, v4.4s +str q5, [x0, #624] +ldr q5, [x0, #704] +sqrdmulh v19.4S, v5.4S, v29.s[0] +mla v13.4S, v20.4S, v31.s[0] +add v14.4s, v14.4s, v4.4s +str q11, [x0, #560] +ldr q11, [x0, #640] +sqrdmulh v4.4S, v11.4S, v29.s[0] +mla v8.4S, v6.4S, v31.s[0] +sub v6.4s, v28.4s, v24.4s +str q12, [x0, #496] +ldr q12, [x0, #576] +sqrdmulh v20.4S, v12.4S, 
v29.s[0] +mla v26.4S, v17.4S, v31.s[0] +add v28.4s, v28.4s, v24.4s +str q14, [x0, #432] +ldr q14, [x0, #512] +sqrdmulh v24.4S, v14.4S, v29.s[0] +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v25.4s +str q6, [x0, #368] +ldr q6, [x0, #448] +add v15.4s, v15.4s, v25.4s +mul v11.4S, v11.4S,v30.s[0] +mul v5.4S, v5.4S,v30.s[0] +ldr q25, [x0, #384] +str q28, [x0, #304] +ldr q28, [x0, #320] +ldr q17, [x0, #256] +mla v11.4S, v4.4S, v31.s[0] +mla v5.4S, v19.4S, v31.s[0] +str q21, [x0, #240] +sub v21.4s, v10.4s, v0.4s +ldr q19, [x0, #192] +ldr q4, [x0, #128] +mul v14.4S, v14.4S,v30.s[0] +mul v12.4S, v12.4S,v30.s[0] +str q15, [x0, #176] +add v10.4s, v10.4s, v0.4s +ldr q0, [x0, #64] +ldr q15, [x0, #0] +mla v14.4S, v24.4S, v31.s[0] +mla v12.4S, v20.4S, v31.s[0] +sub v20.4s, v6.4s, v13.4s +add v6.4s, v6.4s, v13.4s +sqrdmulh v13.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v24.4s, v25.4s, v8.4s +add v25.4s, v25.4s, v8.4s +sqrdmulh v8.4S, v24.4S, v29.s[2] +mul v24.4S, v24.4S,v30.s[2] +sub v18.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sqrdmulh v26.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +sub v3.4s, v17.4s, v27.4s +add v17.4s, v17.4s, v27.4s +sqrdmulh v27.4S, v25.4S, v29.s[1] +mul v25.4S, v25.4S,v30.s[1] +sub v16.4s, v19.4s, v5.4s +add v19.4s, v19.4s, v5.4s +sqrdmulh v5.4S, v18.4S, v29.s[2] +mla v20.4S, v13.4S, v31.s[0] +sub v13.4s, v4.4s, v11.4s +add v4.4s, v4.4s, v11.4s +sqrdmulh v11.4S, v3.4S, v29.s[2] +mla v24.4S, v8.4S, v31.s[0] +sub v8.4s, v0.4s, v12.4s +add v0.4s, v0.4s, v12.4s +sqrdmulh v12.4S, v28.4S, v29.s[1] +mla v6.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v14.4s +str q21, [x0, #112] +sqrdmulh v21.4S, v17.4S, v29.s[1] +mla v25.4S, v27.4S, v31.s[0] +add v15.4s, v15.4s, v14.4s +str q10, [x0, #48] +mul v3.4S, v3.4S,v30.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v10.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +mla v3.4S, v11.4S, v31.s[0] +mla v18.4S, v5.4S, v31.s[0] +sub v5.4s, v13.4s, v24.4s +add v13.4s, v13.4s, v24.4s +mul v17.4S, v17.4S,v30.s[1] 
+mul v28.4S, v28.4S,v30.s[1] +sub v24.4s, v19.4s, v6.4s +add v19.4s, v19.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v28.4S, v12.4S, v31.s[0] +sub v12.4s, v4.4s, v25.4s +add v4.4s, v4.4s, v25.4s +sqrdmulh v25.4S, v10.4S, v22.s[3] +mul v10.4S, v10.4S,v23.s[3] +sub v21.4s, v8.4s, v18.4s +add v8.4s, v8.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v22.s[2] +mul v16.4S, v16.4S,v23.s[2] +sub v6.4s, v26.4s, v3.4s +add v26.4s, v26.4s, v3.4s +sqrdmulh v3.4S, v24.4S, v22.s[1] +mul v24.4S, v24.4S,v23.s[1] +sub v11.4s, v0.4s, v28.4s +add v0.4s, v0.4s, v28.4s +sqrdmulh v28.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v20.4s, v15.4s, v17.4s +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v22.s[3] +mla v10.4S, v25.4S, v31.s[0] +nop +nop +sqrdmulh v25.4S, v13.4S, v22.s[2] +mla v16.4S, v18.4S, v31.s[0] +nop +nop +sqrdmulh v18.4S, v12.4S, v22.s[1] +mla v24.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v4.4S, v22.s[0] +mla v19.4S, v28.4S, v31.s[0] +nop +nop +mul v13.4S, v13.4S,v23.s[2] +mul v5.4S, v5.4S,v23.s[3] +sub v28.4s, v21.4s, v10.4s +add v21.4s, v21.4s, v10.4s +mla v13.4S, v25.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +sub v17.4s, v8.4s, v16.4s +add v8.4s, v8.4s, v16.4s +mul v4.4S, v4.4S,v23.s[0] +mul v12.4S, v12.4S,v23.s[1] +sub v16.4s, v11.4s, v24.4s +add v11.4s, v11.4s, v24.4s +mla v4.4S, v3.4S, v31.s[0] +mla v12.4S, v18.4S, v31.s[0] +sub v18.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v28.4S, v9.s[3] +mul v28.4S, v28.4S,v1.s[3] +sub v3.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v21.4S, v9.s[2] +mul v21.4S, v21.4S,v1.s[2] +sub v24.4s, v26.4s, v13.4s +add v26.4s, v26.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v9.s[1] +mul v17.4S, v17.4S,v1.s[1] +sub v25.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +sqrdmulh v12.4S, v8.4S, v9.s[0] +mul v8.4S, v8.4S,v1.s[0] +sub v10.4s, v15.4s, v4.4s +add v15.4s, v15.4s, v4.4s +sqrdmulh v4.4S, v16.4S, v7.s[3] +mla v28.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v11.4S, v7.s[2] +mla v21.4S, v5.4S, 
v31.s[0] +nop +nop +sqrdmulh v5.4S, v18.4S, v7.s[1] +mla v17.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v0.4S, v7.s[0] +mla v8.4S, v12.4S, v31.s[0] +nop +nop +mul v11.4S, v11.4S,v2.s[2] +mul v16.4S, v16.4S,v2.s[3] +sub v12.4s, v3.4s, v28.4s +str q12, [x0, #960] +mla v11.4S, v19.4S, v31.s[0] +mla v16.4S, v4.4S, v31.s[0] +add v3.4s, v3.4s, v28.4s +str q3, [x0, #896] +mul v0.4S, v0.4S,v2.s[0] +mul v18.4S, v18.4S,v2.s[1] +sub v3.4s, v6.4s, v21.4s +str q3, [x0, #832] +mla v0.4S, v13.4S, v31.s[0] +mla v18.4S, v5.4S, v31.s[0] +add v6.4s, v6.4s, v21.4s +sub v21.4s, v24.4s, v17.4s +ldr q5, [x0, #976] +sqrdmulh v13.4S, v5.4S, v29.s[0] +mul v5.4S, v5.4S,v30.s[0] +add v24.4s, v24.4s, v17.4s +str q6, [x0, #768] +ldr q6, [x0, #912] +sqrdmulh v17.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v3.4s, v26.4s, v8.4s +str q21, [x0, #704] +ldr q21, [x0, #848] +sqrdmulh v28.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +add v26.4s, v26.4s, v8.4s +str q24, [x0, #640] +ldr q24, [x0, #784] +sqrdmulh v8.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +sub v4.4s, v25.4s, v16.4s +str q3, [x0, #576] +ldr q3, [x0, #720] +sqrdmulh v19.4S, v3.4S, v29.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v25.4s, v25.4s, v16.4s +str q26, [x0, #512] +ldr q26, [x0, #656] +sqrdmulh v16.4S, v26.4S, v29.s[0] +mla v6.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v11.4s +str q4, [x0, #448] +ldr q4, [x0, #592] +sqrdmulh v13.4S, v4.4S, v29.s[0] +mla v21.4S, v28.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q25, [x0, #384] +ldr q25, [x0, #528] +sqrdmulh v11.4S, v25.4S, v29.s[0] +mla v24.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v18.4s +str q17, [x0, #320] +ldr q17, [x0, #464] +add v10.4s, v10.4s, v18.4s +mul v26.4S, v26.4S,v30.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q18, [x0, #400] +str q20, [x0, #256] +ldr q20, [x0, #336] +ldr q28, [x0, #272] +mla v26.4S, v16.4S, v31.s[0] +mla v3.4S, v19.4S, v31.s[0] +str q8, [x0, #192] +sub v8.4s, v15.4s, v0.4s +ldr q19, [x0, #208] +ldr q16, [x0, #144] +mul v25.4S, 
v25.4S,v30.s[0] +mul v4.4S, v4.4S,v30.s[0] +str q10, [x0, #128] +add v15.4s, v15.4s, v0.4s +ldr q0, [x0, #80] +ldr q10, [x0, #16] +mla v25.4S, v11.4S, v31.s[0] +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v5.4s +add v17.4s, v17.4s, v5.4s +sqrdmulh v5.4S, v13.4S, v29.s[2] +mul v13.4S, v13.4S,v30.s[2] +sub v11.4s, v18.4s, v6.4s +add v18.4s, v18.4s, v6.4s +sqrdmulh v6.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v12.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v28.4s, v24.4s +add v28.4s, v28.4s, v24.4s +sqrdmulh v24.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v27.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sqrdmulh v3.4S, v12.4S, v29.s[2] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v16.4s, v26.4s +add v16.4s, v16.4s, v26.4s +sqrdmulh v26.4S, v14.4S, v29.s[2] +mla v11.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v4.4s +add v0.4s, v0.4s, v4.4s +sqrdmulh v4.4S, v20.4S, v29.s[1] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v25.4s +str q8, [x0, #64] +sqrdmulh v8.4S, v28.4S, v29.s[1] +mla v18.4S, v24.4S, v31.s[0] +add v10.4s, v10.4s, v25.4s +str q15, [x0, #0] +mul v14.4S, v14.4S,v30.s[2] +mul v12.4S, v12.4S,v30.s[2] +sub v15.4s, v27.4s, v13.4s +add v27.4s, v27.4s, v13.4s +mla v14.4S, v26.4S, v31.s[0] +mla v12.4S, v3.4S, v31.s[0] +sub v3.4s, v5.4s, v11.4s +add v5.4s, v5.4s, v11.4s +mul v28.4S, v28.4S,v30.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v11.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +mla v28.4S, v8.4S, v31.s[0] +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v16.4s, v18.4s +add v16.4s, v16.4s, v18.4s +sqrdmulh v29.4S, v15.4S, v22.s[3] +mul v15.4S, v15.4S,v23.s[3] +sub v30.4s, v6.4s, v12.4s +add v6.4s, v6.4s, v12.4s +sqrdmulh v12.4S, v27.4S, v22.s[2] +mul v27.4S, v27.4S,v23.s[2] +sub v18.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v22.s[1] +mul v11.4S, v11.4S,v23.s[1] +sub v8.4s, v0.4s, v20.4s +add v0.4s, v0.4s, v20.4s +sqrdmulh v20.4S, 
v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v17.4s, v10.4s, v28.4s +add v10.4s, v10.4s, v28.4s +sqrdmulh v28.4S, v3.4S, v22.s[3] +mla v15.4S, v29.4S, v31.s[0] +nop +nop +sqrdmulh v29.4S, v5.4S, v22.s[2] +mla v27.4S, v12.4S, v31.s[0] +nop +nop +sqrdmulh v12.4S, v4.4S, v22.s[1] +mla v11.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v16.4S, v22.s[0] +mla v19.4S, v20.4S, v31.s[0] +nop +nop +mul v5.4S, v5.4S,v23.s[2] +mul v3.4S, v3.4S,v23.s[3] +sub v20.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +mla v5.4S, v29.4S, v31.s[0] +mla v3.4S, v28.4S, v31.s[0] +sub v28.4s, v6.4s, v27.4s +add v6.4s, v6.4s, v27.4s +mul v16.4S, v16.4S,v23.s[0] +mul v4.4S, v4.4S,v23.s[1] +sub v27.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v16.4S, v14.4S, v31.s[0] +mla v4.4S, v12.4S, v31.s[0] +sub v12.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v22.4S, v20.4S, v9.s[3] +mul v20.4S, v20.4S,v1.s[3] +sub v23.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v30.4S, v9.s[2] +mul v30.4S, v30.4S,v1.s[2] +sub v19.4s, v21.4s, v5.4s +add v21.4s, v21.4s, v5.4s +sqrdmulh v5.4S, v28.4S, v9.s[1] +mul v28.4S, v28.4S,v1.s[1] +sub v14.4s, v17.4s, v4.4s +add v17.4s, v17.4s, v4.4s +sqrdmulh v4.4S, v6.4S, v9.s[0] +mul v6.4S, v6.4S,v1.s[0] +sub v11.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sqrdmulh v9.4S, v27.4S, v7.s[3] +mla v20.4S, v22.4S, v31.s[0] +nop +nop +sqrdmulh v22.4S, v8.4S, v7.s[2] +mla v30.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v12.4S, v7.s[1] +mla v28.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v0.4S, v7.s[0] +mla v6.4S, v4.4S, v31.s[0] +nop +nop +mul v8.4S, v8.4S,v2.s[2] +mul v27.4S, v27.4S,v2.s[3] +sub v4.4s, v23.4s, v20.4s +str q4, [x0, #976] +mla v8.4S, v22.4S, v31.s[0] +mla v27.4S, v9.4S, v31.s[0] +add v23.4s, v23.4s, v20.4s +str q23, [x0, #912] +mul v0.4S, v0.4S,v2.s[0] +mul v12.4S, v12.4S,v2.s[1] +sub v23.4s, v18.4s, v30.4s +str q23, [x0, #848] +mla v0.4S, v5.4S, v31.s[0] +mla v12.4S, v3.4S, v31.s[0] +add v18.4s, v18.4s, v30.4s +sub v30.4s, 
v19.4s, v28.4s +add v19.4s, v19.4s, v28.4s +str q18, [x0, #784] +sub v18.4s, v21.4s, v6.4s +str q30, [x0, #720] +add v21.4s, v21.4s, v6.4s +str q19, [x0, #656] +sub v19.4s, v14.4s, v27.4s +str q18, [x0, #592] +add v14.4s, v14.4s, v27.4s +str q21, [x0, #528] +sub v21.4s, v17.4s, v8.4s +str q19, [x0, #464] +add v17.4s, v17.4s, v8.4s +str q14, [x0, #400] +sub v14.4s, v11.4s, v12.4s +str q21, [x0, #336] +add v11.4s, v11.4s, v12.4s +str q17, [x0, #272] +sub v17.4s, v10.4s, v0.4s +add v10.4s, v10.4s, v0.4s +ldr q24, [x0, #224] +ldr q25, [x0, #160] +ldr q13, [x0, #32] +ldr q26, [x17, #+128] +ldr q15, [x17, #+144] +sqrdmulh v29.4S, v13.4S, v15.s[0] +mul v13.4S, v13.4S,v26.s[0] +ldr q16, [x0, #48] +sqrdmulh v1.4S, v16.4S, v15.s[0] +mul v16.4S, v16.4S,v26.s[0] +ldr q4, [x17, #+160] +ldr q22, [x17, #+176] +ldr q9, [x0, #96] +sqrdmulh v20.4S, v9.4S, v22.s[0] +mul v9.4S, v9.4S,v4.s[0] +ldr q23, [x0, #112] +sqrdmulh v5.4S, v23.4S, v22.s[0] +mul v23.4S, v23.4S,v4.s[0] +ldr q3, [x17, #+192] +ldr q2, [x17, #+208] +mla v13.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v25.4S, v2.s[0] +ldr q7, [x0, #176] +mla v16.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v7.4S, v2.s[0] +ldr q28, [x17, #+224] +ldr q30, [x17, #+240] +mla v9.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v24.4S, v30.s[0] +ldr q6, [x0, #240] +mla v23.4S, v5.4S, v31.s[0] +sqrdmulh v5.4S, v6.4S, v30.s[0] +ldr q18, [x0, #0] +ldr q27, [x0, #128] +mul v25.4S, v25.4S,v3.s[0] +sub v19.4s, v18.4s, v13.4s +mul v7.4S, v7.4S,v3.s[0] +add v18.4s, v18.4s, v13.4s +mla v25.4S, v29.4S, v31.s[0] +sub v29.4s, v10.4s, v16.4s +ldr q13, [x0, #64] +mla v7.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v16.4s +ldr q16, [x0, #192] +mul v24.4S, v24.4S,v28.s[0] +sub v1.4s, v13.4s, v9.4s +mul v6.4S, v6.4S,v28.s[0] +add v13.4s, v13.4s, v9.4s +mla v24.4S, v20.4S, v31.s[0] +nop +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v17.4s, v23.4s +sqrdmulh v20.4S, v10.4S, v15.s[1] +add v17.4s, v17.4s, v23.4s +mul v10.4S, v10.4S,v26.s[1] +nop +sqrdmulh v23.4S, v29.4S, v15.s[2] +sub v9.4s, 
v27.4s, v25.4s +mul v29.4S, v29.4S,v26.s[2] +add v27.4s, v27.4s, v25.4s +sqrdmulh v15.4S, v17.4S, v22.s[1] +sub v26.4s, v11.4s, v7.4s +mul v17.4S, v17.4S,v4.s[1] +add v11.4s, v11.4s, v7.4s +sqrdmulh v7.4S, v5.4S, v22.s[2] +sub v25.4s, v16.4s, v24.4s +mul v5.4S, v5.4S,v4.s[2] +add v16.4s, v16.4s, v24.4s +mla v10.4S, v20.4S, v31.s[0] +sub v20.4s, v14.4s, v6.4s +ldr q22, [x0, #480] +sqrdmulh v4.4S, v11.4S, v2.s[1] +add v14.4s, v14.4s, v6.4s +mla v29.4S, v23.4S, v31.s[0] +ldr q23, [x0, #416] +sqrdmulh v6.4S, v26.4S, v2.s[2] +sub v24.4s, v18.4s, v10.4s +mla v17.4S, v15.4S, v31.s[0] +ldr q15, [x0, #288] +sqrdmulh v8.4S, v14.4S, v30.s[1] +add v18.4s, v18.4s, v10.4s +str q24, [x0, #16] +mla v5.4S, v7.4S, v31.s[0] +ldr q7, [x17, #+256] +ldr q24, [x17, #+272] +sqrdmulh v10.4S, v20.4S, v30.s[2] +sub v21.4s, v19.4s, v29.4s +str q18, [x0, #0] +mul v11.4S, v11.4S,v3.s[1] +add v19.4s, v19.4s, v29.4s +mul v26.4S, v26.4S,v3.s[2] +str q21, [x0, #48] +mla v11.4S, v4.4S, v31.s[0] +sub v4.4s, v13.4s, v17.4s +mla v26.4S, v6.4S, v31.s[0] +str q19, [x0, #32] +mul v14.4S, v14.4S,v28.s[1] +str q4, [x0, #80] +mul v20.4S, v20.4S,v28.s[2] +add v13.4s, v13.4s, v17.4s +str q13, [x0, #64] +mla v14.4S, v8.4S, v31.s[0] +sub v8.4s, v1.4s, v5.4s +str q8, [x0, #112] +mla v20.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v5.4s +str q1, [x0, #96] +sqrdmulh v30.4S, v15.4S, v24.s[0] +sub v28.4s, v27.4s, v11.4s +mul v15.4S, v15.4S,v7.s[0] +str q28, [x0, #144] +ldr q28, [x0, #304] +sqrdmulh v1.4S, v28.4S, v24.s[0] +add v27.4s, v27.4s, v11.4s +mul v28.4S, v28.4S,v7.s[0] +str q27, [x0, #128] +ldr q27, [x17, #+288] +ldr q11, [x17, #+304] +ldr q5, [x0, #352] +sqrdmulh v10.4S, v5.4S, v11.s[0] +sub v8.4s, v9.4s, v26.4s +mul v5.4S, v5.4S,v27.s[0] +str q8, [x0, #176] +ldr q8, [x0, #368] +sqrdmulh v13.4S, v8.4S, v11.s[0] +add v9.4s, v9.4s, v26.4s +mul v8.4S, v8.4S,v27.s[0] +str q9, [x0, #160] +ldr q9, [x17, #+320] +ldr q26, [x17, #+336] +mla v15.4S, v30.4S, v31.s[0] +sub v30.4s, v16.4s, v14.4s +sqrdmulh v17.4S, v23.4S, 
v26.s[0] +str q30, [x0, #208] +ldr q30, [x0, #432] +mla v28.4S, v1.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v30.4S, v26.s[0] +str q16, [x0, #192] +ldr q16, [x17, #+352] +ldr q1, [x17, #+368] +mla v5.4S, v10.4S, v31.s[0] +sub v10.4s, v25.4s, v20.4s +sqrdmulh v4.4S, v22.4S, v1.s[0] +str q10, [x0, #240] +ldr q10, [x0, #496] +mla v8.4S, v13.4S, v31.s[0] +add v25.4s, v25.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v1.s[0] +str q25, [x0, #224] +ldr q25, [x0, #256] +ldr q13, [x0, #384] +mul v23.4S, v23.4S,v9.s[0] +sub v2.4s, v25.4s, v15.4s +ldr q3, [x0, #272] +mul v30.4S, v30.4S,v9.s[0] +add v25.4s, v25.4s, v15.4s +ldr q15, [x0, #400] +mla v23.4S, v17.4S, v31.s[0] +sub v17.4s, v3.4s, v28.4s +ldr q19, [x0, #320] +mla v30.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v28.4s +ldr q28, [x0, #448] +mul v22.4S, v22.4S,v16.s[0] +sub v14.4s, v19.4s, v5.4s +ldr q6, [x0, #336] +mul v10.4S, v10.4S,v16.s[0] +add v19.4s, v19.4s, v5.4s +ldr q5, [x0, #464] +mla v22.4S, v4.4S, v31.s[0] +nop +mla v10.4S, v20.4S, v31.s[0] +sub v20.4s, v6.4s, v8.4s +sqrdmulh v4.4S, v3.4S, v24.s[1] +add v6.4s, v6.4s, v8.4s +mul v3.4S, v3.4S,v7.s[1] +nop +sqrdmulh v8.4S, v17.4S, v24.s[2] +sub v21.4s, v13.4s, v23.4s +mul v17.4S, v17.4S,v7.s[2] +add v13.4s, v13.4s, v23.4s +sqrdmulh v24.4S, v6.4S, v11.s[1] +sub v7.4s, v15.4s, v30.4s +mul v6.4S, v6.4S,v27.s[1] +add v15.4s, v15.4s, v30.4s +sqrdmulh v30.4S, v20.4S, v11.s[2] +sub v23.4s, v28.4s, v22.4s +mul v20.4S, v20.4S,v27.s[2] +add v28.4s, v28.4s, v22.4s +mla v3.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v10.4s +ldr q11, [x0, #736] +sqrdmulh v27.4S, v15.4S, v26.s[1] +add v5.4s, v5.4s, v10.4s +mla v17.4S, v8.4S, v31.s[0] +ldr q8, [x0, #672] +sqrdmulh v10.4S, v7.4S, v26.s[2] +sub v22.4s, v25.4s, v3.4s +mla v6.4S, v24.4S, v31.s[0] +ldr q24, [x0, #544] +sqrdmulh v29.4S, v5.4S, v1.s[1] +add v25.4s, v25.4s, v3.4s +str q22, [x0, #272] +mla v20.4S, v30.4S, v31.s[0] +ldr q30, [x17, #+384] +ldr q22, [x17, #+400] +sqrdmulh v3.4S, v4.4S, v1.s[2] +sub v18.4s, v2.4s, v17.4s 
+str q25, [x0, #256] +mul v15.4S, v15.4S,v9.s[1] +add v2.4s, v2.4s, v17.4s +mul v7.4S, v7.4S,v9.s[2] +str q18, [x0, #304] +mla v15.4S, v27.4S, v31.s[0] +sub v27.4s, v19.4s, v6.4s +mla v7.4S, v10.4S, v31.s[0] +str q2, [x0, #288] +mul v5.4S, v5.4S,v16.s[1] +str q27, [x0, #336] +mul v4.4S, v4.4S,v16.s[2] +add v19.4s, v19.4s, v6.4s +str q19, [x0, #320] +mla v5.4S, v29.4S, v31.s[0] +sub v29.4s, v14.4s, v20.4s +str q29, [x0, #368] +mla v4.4S, v3.4S, v31.s[0] +add v14.4s, v14.4s, v20.4s +str q14, [x0, #352] +sqrdmulh v1.4S, v24.4S, v22.s[0] +sub v16.4s, v13.4s, v15.4s +mul v24.4S, v24.4S,v30.s[0] +str q16, [x0, #400] +ldr q16, [x0, #560] +sqrdmulh v14.4S, v16.4S, v22.s[0] +add v13.4s, v13.4s, v15.4s +mul v16.4S, v16.4S,v30.s[0] +str q13, [x0, #384] +ldr q13, [x17, #+416] +ldr q15, [x17, #+432] +ldr q20, [x0, #608] +sqrdmulh v3.4S, v20.4S, v15.s[0] +sub v29.4s, v21.4s, v7.4s +mul v20.4S, v20.4S,v13.s[0] +str q29, [x0, #432] +ldr q29, [x0, #624] +sqrdmulh v19.4S, v29.4S, v15.s[0] +add v21.4s, v21.4s, v7.4s +mul v29.4S, v29.4S,v13.s[0] +str q21, [x0, #416] +ldr q21, [x17, #+448] +ldr q7, [x17, #+464] +mla v24.4S, v1.4S, v31.s[0] +sub v1.4s, v28.4s, v5.4s +sqrdmulh v6.4S, v8.4S, v7.s[0] +str q1, [x0, #464] +ldr q1, [x0, #688] +mla v16.4S, v14.4S, v31.s[0] +add v28.4s, v28.4s, v5.4s +sqrdmulh v5.4S, v1.4S, v7.s[0] +str q28, [x0, #448] +ldr q28, [x17, #+480] +ldr q14, [x17, #+496] +mla v20.4S, v3.4S, v31.s[0] +sub v3.4s, v23.4s, v4.4s +sqrdmulh v27.4S, v11.4S, v14.s[0] +str q3, [x0, #496] +ldr q3, [x0, #752] +mla v29.4S, v19.4S, v31.s[0] +add v23.4s, v23.4s, v4.4s +sqrdmulh v4.4S, v3.4S, v14.s[0] +str q23, [x0, #480] +ldr q23, [x0, #512] +ldr q19, [x0, #640] +mul v8.4S, v8.4S,v21.s[0] +sub v26.4s, v23.4s, v24.4s +ldr q9, [x0, #528] +mul v1.4S, v1.4S,v21.s[0] +add v23.4s, v23.4s, v24.4s +ldr q24, [x0, #656] +mla v8.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v16.4s +ldr q2, [x0, #576] +mla v1.4S, v5.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +ldr q16, [x0, #704] +mul v11.4S, 
v11.4S,v28.s[0] +sub v5.4s, v2.4s, v20.4s +ldr q10, [x0, #592] +mul v3.4S, v3.4S,v28.s[0] +add v2.4s, v2.4s, v20.4s +ldr q20, [x0, #720] +mla v11.4S, v27.4S, v31.s[0] +nop +mla v3.4S, v4.4S, v31.s[0] +sub v4.4s, v10.4s, v29.4s +sqrdmulh v27.4S, v9.4S, v22.s[1] +add v10.4s, v10.4s, v29.4s +mul v9.4S, v9.4S,v30.s[1] +nop +sqrdmulh v29.4S, v6.4S, v22.s[2] +sub v18.4s, v19.4s, v8.4s +mul v6.4S, v6.4S,v30.s[2] +add v19.4s, v19.4s, v8.4s +sqrdmulh v22.4S, v10.4S, v15.s[1] +sub v30.4s, v24.4s, v1.4s +mul v10.4S, v10.4S,v13.s[1] +add v24.4s, v24.4s, v1.4s +sqrdmulh v1.4S, v4.4S, v15.s[2] +sub v8.4s, v16.4s, v11.4s +mul v4.4S, v4.4S,v13.s[2] +add v16.4s, v16.4s, v11.4s +mla v9.4S, v27.4S, v31.s[0] +sub v27.4s, v20.4s, v3.4s +ldr q15, [x0, #992] +sqrdmulh v13.4S, v24.4S, v7.s[1] +add v20.4s, v20.4s, v3.4s +mla v6.4S, v29.4S, v31.s[0] +ldr q29, [x0, #928] +sqrdmulh v3.4S, v30.4S, v7.s[2] +sub v11.4s, v23.4s, v9.4s +mla v10.4S, v22.4S, v31.s[0] +ldr q22, [x0, #800] +sqrdmulh v17.4S, v20.4S, v14.s[1] +add v23.4s, v23.4s, v9.4s +str q11, [x0, #528] +mla v4.4S, v1.4S, v31.s[0] +ldr q1, [x17, #+512] +ldr q11, [x17, #+528] +sqrdmulh v9.4S, v27.4S, v14.s[2] +sub v25.4s, v26.4s, v6.4s +str q23, [x0, #512] +mul v24.4S, v24.4S,v21.s[1] +add v26.4s, v26.4s, v6.4s +mul v30.4S, v30.4S,v21.s[2] +str q25, [x0, #560] +mla v24.4S, v13.4S, v31.s[0] +sub v13.4s, v2.4s, v10.4s +mla v30.4S, v3.4S, v31.s[0] +str q26, [x0, #544] +mul v20.4S, v20.4S,v28.s[1] +str q13, [x0, #592] +mul v27.4S, v27.4S,v28.s[2] +add v2.4s, v2.4s, v10.4s +str q2, [x0, #576] +mla v20.4S, v17.4S, v31.s[0] +sub v17.4s, v5.4s, v4.4s +str q17, [x0, #624] +mla v27.4S, v9.4S, v31.s[0] +add v5.4s, v5.4s, v4.4s +str q5, [x0, #608] +sqrdmulh v14.4S, v22.4S, v11.s[0] +sub v28.4s, v19.4s, v24.4s +mul v22.4S, v22.4S,v1.s[0] +str q28, [x0, #656] +ldr q28, [x0, #816] +sqrdmulh v5.4S, v28.4S, v11.s[0] +add v19.4s, v19.4s, v24.4s +mul v28.4S, v28.4S,v1.s[0] +str q19, [x0, #640] +ldr q19, [x17, #+544] +ldr q24, [x17, #+560] +ldr q4, [x0, 
#864] +sqrdmulh v9.4S, v4.4S, v24.s[0] +sub v17.4s, v18.4s, v30.4s +mul v4.4S, v4.4S,v19.s[0] +str q17, [x0, #688] +ldr q17, [x0, #880] +sqrdmulh v2.4S, v17.4S, v24.s[0] +add v18.4s, v18.4s, v30.4s +mul v17.4S, v17.4S,v19.s[0] +str q18, [x0, #672] +ldr q18, [x17, #+576] +ldr q30, [x17, #+592] +mla v22.4S, v14.4S, v31.s[0] +sub v14.4s, v16.4s, v20.4s +sqrdmulh v10.4S, v29.4S, v30.s[0] +str q14, [x0, #720] +ldr q14, [x0, #944] +mla v28.4S, v5.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v14.4S, v30.s[0] +str q16, [x0, #704] +ldr q16, [x17, #+608] +ldr q5, [x17, #+624] +mla v4.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v27.4s +sqrdmulh v13.4S, v15.4S, v5.s[0] +str q9, [x0, #752] +ldr q9, [x0, #1008] +mla v17.4S, v2.4S, v31.s[0] +add v8.4s, v8.4s, v27.4s +sqrdmulh v27.4S, v9.4S, v5.s[0] +str q8, [x0, #736] +ldr q8, [x0, #768] +ldr q2, [x0, #896] +mul v29.4S, v29.4S,v18.s[0] +sub v7.4s, v8.4s, v22.4s +ldr q21, [x0, #784] +mul v14.4S, v14.4S,v18.s[0] +add v8.4s, v8.4s, v22.4s +ldr q22, [x0, #912] +mla v29.4S, v10.4S, v31.s[0] +sub v10.4s, v21.4s, v28.4s +ldr q26, [x0, #832] +mla v14.4S, v20.4S, v31.s[0] +add v21.4s, v21.4s, v28.4s +ldr q28, [x0, #960] +mul v15.4S, v15.4S,v16.s[0] +sub v20.4s, v26.4s, v4.4s +ldr q3, [x0, #848] +mul v9.4S, v9.4S,v16.s[0] +add v26.4s, v26.4s, v4.4s +ldr q4, [x0, #976] +mla v15.4S, v13.4S, v31.s[0] +nop +mla v9.4S, v27.4S, v31.s[0] +sub v27.4s, v3.4s, v17.4s +sqrdmulh v13.4S, v21.4S, v11.s[1] +add v3.4s, v3.4s, v17.4s +mul v21.4S, v21.4S,v1.s[1] +nop +sqrdmulh v17.4S, v10.4S, v11.s[2] +sub v25.4s, v2.4s, v29.4s +mul v10.4S, v10.4S,v1.s[2] +add v2.4s, v2.4s, v29.4s +sqrdmulh v11.4S, v3.4S, v24.s[1] +sub v1.4s, v22.4s, v14.4s +mul v3.4S, v3.4S,v19.s[1] +add v22.4s, v22.4s, v14.4s +sqrdmulh v14.4S, v27.4S, v24.s[2] +sub v29.4s, v28.4s, v15.4s +mul v27.4S, v27.4S,v19.s[2] +add v28.4s, v28.4s, v15.4s +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v4.4s, v9.4s +sqrdmulh v24.4S, v22.4S, v30.s[1] +add v4.4s, v4.4s, v9.4s +mla v10.4S, v17.4S, 
v31.s[0] +sqrdmulh v17.4S, v1.4S, v30.s[2] +sub v9.4s, v8.4s, v21.4s +mla v3.4S, v11.4S, v31.s[0] +sqrdmulh v11.4S, v4.4S, v5.s[1] +add v8.4s, v8.4s, v21.4s +str q9, [x0, #784] +mla v27.4S, v14.4S, v31.s[0] +sqrdmulh v14.4S, v13.4S, v5.s[2] +sub v9.4s, v7.4s, v10.4s +str q8, [x0, #768] +mul v22.4S, v22.4S,v18.s[1] +add v7.4s, v7.4s, v10.4s +mul v1.4S, v1.4S,v18.s[2] +str q9, [x0, #816] +mla v22.4S, v24.4S, v31.s[0] +sub v24.4s, v26.4s, v3.4s +mla v1.4S, v17.4S, v31.s[0] +str q7, [x0, #800] +mul v4.4S, v4.4S,v16.s[1] +str q24, [x0, #848] +mul v13.4S, v13.4S,v16.s[2] +add v26.4s, v26.4s, v3.4s +str q26, [x0, #832] +mla v4.4S, v11.4S, v31.s[0] +sub v11.4s, v20.4s, v27.4s +str q11, [x0, #880] +mla v13.4S, v14.4S, v31.s[0] +add v20.4s, v20.4s, v27.4s +str q20, [x0, #864] +sub v5.4s, v2.4s, v22.4s +str q5, [x0, #912] +add v2.4s, v2.4s, v22.4s +str q2, [x0, #896] +sub v2.4s, v25.4s, v1.4s +str q2, [x0, #944] +add v25.4s, v25.4s, v1.4s +str q25, [x0, #928] +sub v25.4s, v28.4s, v4.4s +str q25, [x0, #976] +add v28.4s, v28.4s, v4.4s +str q28, [x0, #960] +sub v28.4s, v29.4s, v13.4s +str q28, [x0, #1008] +add v29.4s, v29.4s, v13.4s +str q29, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1528 +// Instruction count: 1524 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_24_z4_0.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_24_z4_0.s new file mode 100644 index 0000000..cf16f9a --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_24_z4_0.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// 
+/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: // 4-word constant loaded into q31 by the routine below; only lane 0 (v31.s[0]) is referenced in the visible code, as the by-element multiplier of each mla step +.word -33556993 // lane 0: negated modulus, i.e. -q with q = 33556993 +.word 0 // lanes 1 to 3 are zero fillers that pad the constant to 16 bytes +.word 0 +.word 0 +.align 6 // align the table below to 2^6 = 64 bytes (assumes the power-of-two .align semantics of AArch64 assemblers) +roots_merged: // Table read in 16-byte (4-word) groups via ldr q, [x17, #off]. Groups come in pairs carrying the same layer/block labels: the first group of a pair is used as the mul operand, the second as the sqrdmulh operand. NOTE(review): the second value looks like round(root * 2^31 / q) - confirm against the generator. Entries labelled 'Layer None' are zero fillers. +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global 
ntt_u32_incomplete_neon_asm_var_4_2_24_z4_0 +.global _ntt_u32_incomplete_neon_asm_var_4_2_24_z4_0 +ntt_u32_incomplete_neon_asm_var_4_2_24_z4_0: +_ntt_u32_incomplete_neon_asm_var_4_2_24_z4_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +ldr q2, [x0, #544] +ldr q1, [x0, #608] +ldr q0, [x0, #672] +ldr q15, [x0, #736] +ldr q14, [x0, #32] +ldr q13, [x0, #96] +ldr q12, [x0, #160] +ldr q11, [x0, #224] +sqrdmulh v10.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +mla v22.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +mla v21.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v20.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +mla v19.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +mla v2.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +mla v1.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v0.4S, v29.s[0] +mul v0.4S, v0.4S,v30.s[0] +mla v0.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +mla v15.4S, v10.4S, v31.s[0] +sub v10.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +sub v22.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s 
+sub v21.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +sub v20.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sub v19.4s, v14.4s, v2.4s +add v14.4s, v14.4s, v2.4s +sub v2.4s, v13.4s, v1.4s +add v13.4s, v13.4s, v1.4s +sub v1.4s, v12.4s, v0.4s +add v12.4s, v12.4s, v0.4s +sub v0.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +mla v16.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v3.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +mla v18.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +mla v17.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +mla v21.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +mla v20.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +mla v10.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +mla v22.4S, v15.4S, v31.s[0] +sub v15.4s, v12.4s, v16.4s +add v12.4s, v12.4s, v16.4s +sub v16.4s, v11.4s, v3.4s +add v11.4s, v11.4s, v3.4s +sub v3.4s, v14.4s, v18.4s +add v14.4s, v14.4s, v18.4s +sub v18.4s, v13.4s, v17.4s +add v13.4s, v13.4s, v17.4s +sub v17.4s, v1.4s, v21.4s +add v1.4s, v1.4s, v21.4s +sub v21.4s, v0.4s, v20.4s +add v0.4s, v0.4s, v20.4s +sub v20.4s, v19.4s, v10.4s +add v19.4s, v19.4s, v10.4s +sub v10.4s, v2.4s, v22.4s +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +mla v12.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v11.4S, v27.s[0] +mul v11.4S, v11.4S,v28.s[0] +mla v11.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +mla v15.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +mla v16.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +mla v1.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v0.4S, 
v27.s[2] +mul v0.4S, v0.4S,v28.s[2] +mla v0.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +mla v17.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +sub v12.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sub v11.4s, v3.4s, v15.4s +add v3.4s, v3.4s, v15.4s +sub v15.4s, v18.4s, v16.4s +add v18.4s, v18.4s, v16.4s +sub v16.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sub v1.4s, v2.4s, v0.4s +add v2.4s, v2.4s, v0.4s +sub v0.4s, v20.4s, v17.4s +add v20.4s, v20.4s, v17.4s +sub v17.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v25.s[0] +mul v13.4S, v13.4S,v26.s[0] +mla v13.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +mla v12.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v18.4S, v25.s[2] +mul v18.4S, v18.4S,v26.s[2] +mla v18.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v15.4S, v25.s[3] +mul v15.4S, v15.4S,v26.s[3] +mla v15.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v2.4S, v23.s[0] +mul v2.4S, v2.4S,v24.s[0] +mla v2.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v1.4S, v23.s[1] +mul v1.4S, v1.4S,v24.s[1] +mla v1.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v10.4S, v23.s[2] +mul v10.4S, v10.4S,v24.s[2] +mla v10.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +sub v13.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sub v12.4s, v3.4s, v18.4s +add v3.4s, v3.4s, v18.4s +sub v18.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sub v15.4s, v19.4s, v2.4s +add v19.4s, v19.4s, v2.4s +sub v2.4s, v16.4s, v1.4s +add v16.4s, v16.4s, v1.4s +sub v1.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +sub v10.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +str q14, [x0, #32] +str q21, [x0, #96] +str q22, [x0, #160] +str q13, [x0, #224] +str q3, [x0, #288] +str q12, [x0, #352] +str 
q11, [x0, #416] +str q18, [x0, #480] +str q19, [x0, #544] +str q15, [x0, #608] +str q16, [x0, #672] +str q2, [x0, #736] +str q20, [x0, #800] +str q1, [x0, #864] +str q0, [x0, #928] +str q10, [x0, #992] +ldr q10, [x0, #816] +ldr q0, [x0, #880] +ldr q1, [x0, #944] +ldr q20, [x0, #1008] +ldr q2, [x0, #304] +ldr q16, [x0, #368] +ldr q15, [x0, #432] +ldr q19, [x0, #496] +ldr q18, [x0, #560] +ldr q11, [x0, #624] +ldr q12, [x0, #688] +ldr q3, [x0, #752] +ldr q13, [x0, #48] +ldr q22, [x0, #112] +ldr q21, [x0, #176] +ldr q14, [x0, #240] +sqrdmulh v17.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +mla v10.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v0.4S, v29.s[0] +mul v0.4S, v0.4S,v30.s[0] +mla v0.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +mla v1.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v20.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +mla v18.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +mla v11.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +mla v12.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +mla v3.4S, v17.4S, v31.s[0] +sub v17.4s, v2.4s, v10.4s +add v2.4s, v2.4s, v10.4s +sub v10.4s, v16.4s, v0.4s +add v16.4s, v16.4s, v0.4s +sub v0.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s +sub v1.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +sub v20.4s, v13.4s, v18.4s +add v13.4s, v13.4s, v18.4s +sub v18.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +sub v11.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sub v12.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sqrdmulh v3.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +mla v15.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +mla v19.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v2.4S, v29.s[1] +mul v2.4S, v2.4S,v30.s[1] +mla v2.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, 
v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +mla v16.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +mla v0.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +mla v1.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +mla v17.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +mla v10.4S, v3.4S, v31.s[0] +sub v3.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +sub v15.4s, v14.4s, v19.4s +add v14.4s, v14.4s, v19.4s +sub v19.4s, v13.4s, v2.4s +add v13.4s, v13.4s, v2.4s +sub v2.4s, v22.4s, v16.4s +add v22.4s, v22.4s, v16.4s +sub v16.4s, v11.4s, v0.4s +add v11.4s, v11.4s, v0.4s +sub v0.4s, v12.4s, v1.4s +add v12.4s, v12.4s, v1.4s +sub v1.4s, v20.4s, v17.4s +add v20.4s, v20.4s, v17.4s +sub v17.4s, v18.4s, v10.4s +add v18.4s, v18.4s, v10.4s +sqrdmulh v10.4S, v21.4S, v27.s[0] +mul v21.4S, v21.4S,v28.s[0] +mla v21.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +mla v14.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +mla v3.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +mla v15.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +mla v11.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +mla v12.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v16.4S, v27.s[3] +mul v16.4S, v16.4S,v28.s[3] +mla v16.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v0.4S, v27.s[3] +mul v0.4S, v0.4S,v28.s[3] +mla v0.4S, v10.4S, v31.s[0] +sub v10.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sub v21.4s, v22.4s, v14.4s +add v22.4s, v22.4s, v14.4s +sub v14.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sub v3.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +sub v15.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +sub v11.4s, v18.4s, v12.4s +add v18.4s, v18.4s, v12.4s +sub v12.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s 
+sub v16.4s, v17.4s, v0.4s +add v17.4s, v17.4s, v0.4s +sqrdmulh v0.4S, v22.4S, v25.s[0] +mul v22.4S, v22.4S,v26.s[0] +mla v22.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v21.4S, v25.s[1] +mul v21.4S, v21.4S,v26.s[1] +mla v21.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v2.4S, v25.s[2] +mul v2.4S, v2.4S,v26.s[2] +mla v2.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v3.4S, v25.s[3] +mul v3.4S, v3.4S,v26.s[3] +mla v3.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v18.4S, v23.s[0] +mul v18.4S, v18.4S,v24.s[0] +mla v18.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v11.4S, v23.s[1] +mul v11.4S, v11.4S,v24.s[1] +mla v11.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v17.4S, v23.s[2] +mul v17.4S, v17.4S,v24.s[2] +mla v17.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v16.4S, v23.s[3] +mul v16.4S, v16.4S,v24.s[3] +mla v16.4S, v0.4S, v31.s[0] +sub v0.4s, v13.4s, v22.4s +add v13.4s, v13.4s, v22.4s +sub v22.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sub v21.4s, v19.4s, v2.4s +add v19.4s, v19.4s, v2.4s +sub v2.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sub v3.4s, v20.4s, v18.4s +add v20.4s, v20.4s, v18.4s +sub v18.4s, v15.4s, v11.4s +add v15.4s, v15.4s, v11.4s +sub v11.4s, v1.4s, v17.4s +add v1.4s, v1.4s, v17.4s +sub v17.4s, v12.4s, v16.4s +add v12.4s, v12.4s, v16.4s +str q13, [x0, #48] +str q0, [x0, #112] +str q10, [x0, #176] +str q22, [x0, #240] +str q19, [x0, #304] +str q21, [x0, #368] +str q14, [x0, #432] +str q2, [x0, #496] +str q20, [x0, #560] +str q3, [x0, #624] +str q15, [x0, #688] +str q18, [x0, #752] +str q1, [x0, #816] +str q11, [x0, #880] +str q12, [x0, #944] +str q17, [x0, #1008] +ldr q17, [x0, #768] +ldr q12, [x0, #832] +ldr q11, [x0, #896] +ldr q1, [x0, #960] +ldr q18, [x0, #256] +ldr q15, [x0, #320] +ldr q3, [x0, #384] +ldr q20, [x0, #448] +ldr q2, [x0, #512] +ldr q14, [x0, #576] +ldr q21, [x0, #640] +ldr q19, [x0, #704] +ldr q22, [x0, #0] +ldr q10, [x0, #64] +ldr q0, [x0, #128] +ldr q13, [x0, #192] +sqrdmulh v16.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] +mla v17.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, 
v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +mla v12.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +mla v11.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +mla v1.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +mla v2.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +mla v14.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +mla v21.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +mla v19.4S, v16.4S, v31.s[0] +sub v16.4s, v18.4s, v17.4s +add v18.4s, v18.4s, v17.4s +sub v17.4s, v15.4s, v12.4s +add v15.4s, v15.4s, v12.4s +sub v12.4s, v3.4s, v11.4s +add v3.4s, v3.4s, v11.4s +sub v11.4s, v20.4s, v1.4s +add v20.4s, v20.4s, v1.4s +sub v1.4s, v22.4s, v2.4s +add v22.4s, v22.4s, v2.4s +sub v2.4s, v10.4s, v14.4s +add v10.4s, v10.4s, v14.4s +sub v14.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +sub v21.4s, v13.4s, v19.4s +add v13.4s, v13.4s, v19.4s +sqrdmulh v19.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v3.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +mla v20.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +mla v18.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +mla v15.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +mla v12.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +mla v11.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +mla v16.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +mla v17.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v3.4s +add v0.4s, v0.4s, v3.4s +sub v3.4s, v13.4s, v20.4s +add v13.4s, v13.4s, v20.4s +sub v20.4s, v22.4s, v18.4s +add v22.4s, v22.4s, v18.4s +sub v18.4s, v10.4s, 
v15.4s +add v10.4s, v10.4s, v15.4s +sub v15.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +sub v12.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sub v11.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +sub v16.4s, v2.4s, v17.4s +add v2.4s, v2.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[0] +mul v0.4S, v0.4S,v28.s[0] +mla v0.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +mla v13.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +mla v19.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +mla v3.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v14.4S, v27.s[2] +mul v14.4S, v14.4S,v28.s[2] +mla v14.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +mla v21.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +mla v15.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v12.4S, v27.s[3] +mul v12.4S, v12.4S,v28.s[3] +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v0.4s +add v22.4s, v22.4s, v0.4s +sub v0.4s, v10.4s, v13.4s +add v10.4s, v10.4s, v13.4s +sub v13.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +sub v19.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sub v3.4s, v1.4s, v14.4s +add v1.4s, v1.4s, v14.4s +sub v14.4s, v2.4s, v21.4s +add v2.4s, v2.4s, v21.4s +sub v21.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sub v15.4s, v16.4s, v12.4s +add v16.4s, v16.4s, v12.4s +sqrdmulh v12.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +mla v10.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v0.4S, v25.s[1] +mul v0.4S, v0.4S,v26.s[1] +mla v0.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v18.4S, v25.s[2] +mul v18.4S, v18.4S,v26.s[2] +mla v18.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v19.4S, v25.s[3] +mul v19.4S, v19.4S,v26.s[3] +mla v19.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v2.4S, v23.s[0] +mul v2.4S, v2.4S,v24.s[0] +mla v2.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v14.4S, v23.s[1] +mul v14.4S, v14.4S,v24.s[1] +mla v14.4S, v12.4S, v31.s[0] +sqrdmulh 
v12.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +mla v16.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v15.4S, v23.s[3] +mul v15.4S, v15.4S,v24.s[3] +mla v15.4S, v12.4S, v31.s[0] +sub v12.4s, v22.4s, v10.4s +add v22.4s, v22.4s, v10.4s +sub v10.4s, v17.4s, v0.4s +add v17.4s, v17.4s, v0.4s +sub v0.4s, v20.4s, v18.4s +add v20.4s, v20.4s, v18.4s +sub v18.4s, v13.4s, v19.4s +add v13.4s, v13.4s, v19.4s +sub v19.4s, v1.4s, v2.4s +add v1.4s, v1.4s, v2.4s +sub v2.4s, v3.4s, v14.4s +add v3.4s, v3.4s, v14.4s +sub v14.4s, v11.4s, v16.4s +add v11.4s, v11.4s, v16.4s +sub v16.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +str q22, [x0, #0] +str q12, [x0, #64] +str q17, [x0, #128] +str q10, [x0, #192] +str q20, [x0, #256] +str q0, [x0, #320] +str q13, [x0, #384] +str q18, [x0, #448] +str q1, [x0, #512] +str q19, [x0, #576] +str q3, [x0, #640] +str q2, [x0, #704] +str q11, [x0, #768] +str q14, [x0, #832] +str q21, [x0, #896] +str q16, [x0, #960] +ldr q16, [x0, #784] +ldr q21, [x0, #848] +ldr q14, [x0, #912] +ldr q11, [x0, #976] +ldr q2, [x0, #272] +ldr q3, [x0, #336] +ldr q19, [x0, #400] +ldr q1, [x0, #464] +ldr q18, [x0, #528] +ldr q13, [x0, #592] +ldr q0, [x0, #656] +ldr q20, [x0, #720] +ldr q10, [x0, #16] +ldr q17, [x0, #80] +ldr q12, [x0, #144] +ldr q22, [x0, #208] +sqrdmulh v15.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v16.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +mla v21.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +mla v14.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +mla v11.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +mla v18.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +mla v13.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v0.4S, v29.s[0] +mul v0.4S, v0.4S,v30.s[0] +mla v0.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla 
v20.4S, v15.4S, v31.s[0] +sub v15.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sub v16.4s, v3.4s, v21.4s +add v3.4s, v3.4s, v21.4s +sub v21.4s, v19.4s, v14.4s +add v19.4s, v19.4s, v14.4s +sub v14.4s, v1.4s, v11.4s +add v1.4s, v1.4s, v11.4s +sub v11.4s, v10.4s, v18.4s +add v10.4s, v10.4s, v18.4s +sub v18.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +sub v13.4s, v12.4s, v0.4s +add v12.4s, v12.4s, v0.4s +sub v0.4s, v22.4s, v20.4s +add v22.4s, v22.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +mla v19.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v1.4S, v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +mla v1.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v2.4S, v29.s[1] +mul v2.4S, v2.4S,v30.s[1] +mla v2.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v3.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +mla v21.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +mla v14.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +mla v15.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +mla v16.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v19.4s +add v12.4s, v12.4s, v19.4s +sub v19.4s, v22.4s, v1.4s +add v22.4s, v22.4s, v1.4s +sub v1.4s, v10.4s, v2.4s +add v10.4s, v10.4s, v2.4s +sub v2.4s, v17.4s, v3.4s +add v17.4s, v17.4s, v3.4s +sub v3.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sub v21.4s, v0.4s, v14.4s +add v0.4s, v0.4s, v14.4s +sub v14.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sub v15.4s, v18.4s, v16.4s +add v18.4s, v18.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +mla v12.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +mla v22.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +mla v20.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +mla v19.4S, 
v16.4S, v31.s[0] +sqrdmulh v16.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +mla v13.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v0.4S, v27.s[2] +mul v0.4S, v0.4S,v28.s[2] +mla v0.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +mla v3.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +mla v21.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +sub v12.4s, v17.4s, v22.4s +add v17.4s, v17.4s, v22.4s +sub v22.4s, v1.4s, v20.4s +add v1.4s, v1.4s, v20.4s +sub v20.4s, v2.4s, v19.4s +add v2.4s, v2.4s, v19.4s +sub v19.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sub v13.4s, v18.4s, v0.4s +add v18.4s, v18.4s, v0.4s +sub v0.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sub v3.4s, v15.4s, v21.4s +add v15.4s, v15.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v25.s[0] +mul v17.4S, v17.4S,v26.s[0] +mla v17.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +mla v12.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v2.4S, v25.s[2] +mul v2.4S, v2.4S,v26.s[2] +mla v2.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v20.4S, v25.s[3] +mul v20.4S, v20.4S,v26.s[3] +mla v20.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v18.4S, v23.s[0] +mul v18.4S, v18.4S,v24.s[0] +mla v18.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v13.4S, v23.s[1] +mul v13.4S, v13.4S,v24.s[1] +mla v13.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v15.4S, v23.s[2] +mul v15.4S, v15.4S,v24.s[2] +mla v15.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v3.4S, v23.s[3] +mul v3.4S, v3.4S,v24.s[3] +mla v3.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sub v17.4s, v16.4s, v12.4s +add v16.4s, v16.4s, v12.4s +sub v12.4s, v1.4s, v2.4s +add v1.4s, v1.4s, v2.4s +sub v2.4s, v22.4s, v20.4s +add v22.4s, v22.4s, v20.4s +sub v20.4s, v11.4s, v18.4s +add v11.4s, v11.4s, v18.4s +sub v18.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +sub v13.4s, v14.4s, v15.4s +add v14.4s, v14.4s, v15.4s +sub v15.4s, v0.4s, v3.4s +add v0.4s, v0.4s, 
v3.4s +str q10, [x0, #16] +str q21, [x0, #80] +str q16, [x0, #144] +str q17, [x0, #208] +str q1, [x0, #272] +str q12, [x0, #336] +str q22, [x0, #400] +str q2, [x0, #464] +str q11, [x0, #528] +str q20, [x0, #592] +str q19, [x0, #656] +str q18, [x0, #720] +str q14, [x0, #784] +str q13, [x0, #848] +str q0, [x0, #912] +str q15, [x0, #976] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q6, [x17, #+160] +ldr q7, [x17, #+176] +ldr q8, [x17, #+192] +ldr q9, [x17, #+208] +ldr q3, [x17, #+224] +ldr q10, [x17, #+240] +ldr q21, [x0, #32] +ldr q16, [x0, #48] +ldr q17, [x0, #0] +ldr q1, [x0, #16] +sqrdmulh v12.4S, v21.4S, v5.s[0] +mul v21.4S, v21.4S,v4.s[0] +mla v21.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v16.4S, v5.s[0] +mul v16.4S, v16.4S,v4.s[0] +mla v16.4S, v21.4S, v31.s[0] +sub v21.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +ldr q16, [x17, #+256] +ldr q22, [x17, #+272] +sqrdmulh v2.4S, v1.4S, v5.s[1] +mul v1.4S, v1.4S,v4.s[1] +mla v1.4S, v2.4S, v31.s[0] +sub v2.4s, v17.4s, v1.4s +add v17.4s, v17.4s, v1.4s +sqrdmulh v1.4S, v21.4S, v5.s[2] +mul v21.4S, v21.4S,v4.s[2] +mla v21.4S, v1.4S, v31.s[0] +sub v1.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +str q17, [x0, #0] +str q2, [x0, #16] +str q12, [x0, #32] +str q1, [x0, #48] +ldr q1, [x0, #96] +ldr q12, [x0, #112] +ldr q2, [x0, #64] +ldr q17, [x0, #80] +sqrdmulh v21.4S, v1.4S, v7.s[0] +mul v1.4S, v1.4S,v6.s[0] +mla v1.4S, v21.4S, v31.s[0] +sub v21.4s, v2.4s, v1.4s +add v2.4s, v2.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v7.s[0] +mul v12.4S, v12.4S,v6.s[0] +mla v12.4S, v1.4S, v31.s[0] +sub v1.4s, v17.4s, v12.4s +add v17.4s, v17.4s, v12.4s +ldr q12, [x17, #+288] +ldr q11, [x17, #+304] +sqrdmulh v20.4S, v17.4S, v7.s[1] +mul v17.4S, v17.4S,v6.s[1] +mla v17.4S, v20.4S, v31.s[0] +sub v20.4s, v2.4s, v17.4s +add v2.4s, v2.4s, v17.4s +sqrdmulh v17.4S, v1.4S, v7.s[2] +mul v1.4S, v1.4S,v6.s[2] +mla v1.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v1.4s +add v21.4s, v21.4s, v1.4s +str q2, 
[x0, #64] +str q20, [x0, #80] +str q21, [x0, #96] +str q17, [x0, #112] +ldr q17, [x0, #160] +ldr q21, [x0, #176] +ldr q20, [x0, #128] +ldr q2, [x0, #144] +sqrdmulh v1.4S, v17.4S, v9.s[0] +mul v17.4S, v17.4S,v8.s[0] +mla v17.4S, v1.4S, v31.s[0] +sub v1.4s, v20.4s, v17.4s +add v20.4s, v20.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v9.s[0] +mul v21.4S, v21.4S,v8.s[0] +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v2.4s, v21.4s +add v2.4s, v2.4s, v21.4s +ldr q21, [x17, #+320] +ldr q19, [x17, #+336] +sqrdmulh v18.4S, v2.4S, v9.s[1] +mul v2.4S, v2.4S,v8.s[1] +mla v2.4S, v18.4S, v31.s[0] +sub v18.4s, v20.4s, v2.4s +add v20.4s, v20.4s, v2.4s +sqrdmulh v2.4S, v17.4S, v9.s[2] +mul v17.4S, v17.4S,v8.s[2] +mla v17.4S, v2.4S, v31.s[0] +sub v2.4s, v1.4s, v17.4s +add v1.4s, v1.4s, v17.4s +str q20, [x0, #128] +str q18, [x0, #144] +str q1, [x0, #160] +str q2, [x0, #176] +ldr q2, [x0, #224] +ldr q1, [x0, #240] +ldr q18, [x0, #192] +ldr q20, [x0, #208] +sqrdmulh v17.4S, v2.4S, v10.s[0] +mul v2.4S, v2.4S,v3.s[0] +mla v2.4S, v17.4S, v31.s[0] +sub v17.4s, v18.4s, v2.4s +add v18.4s, v18.4s, v2.4s +sqrdmulh v2.4S, v1.4S, v10.s[0] +mul v1.4S, v1.4S,v3.s[0] +mla v1.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v1.4s +add v20.4s, v20.4s, v1.4s +ldr q1, [x17, #+352] +ldr q14, [x17, #+368] +sqrdmulh v13.4S, v20.4S, v10.s[1] +mul v20.4S, v20.4S,v3.s[1] +mla v20.4S, v13.4S, v31.s[0] +sub v13.4s, v18.4s, v20.4s +add v18.4s, v18.4s, v20.4s +sqrdmulh v20.4S, v2.4S, v10.s[2] +mul v2.4S, v2.4S,v3.s[2] +mla v2.4S, v20.4S, v31.s[0] +sub v20.4s, v17.4s, v2.4s +add v17.4s, v17.4s, v2.4s +str q18, [x0, #192] +str q13, [x0, #208] +str q17, [x0, #224] +str q20, [x0, #240] +ldr q20, [x0, #288] +ldr q17, [x0, #304] +ldr q13, [x0, #256] +ldr q18, [x0, #272] +sqrdmulh v2.4S, v20.4S, v22.s[0] +mul v20.4S, v20.4S,v16.s[0] +mla v20.4S, v2.4S, v31.s[0] +sub v2.4s, v13.4s, v20.4s +add v13.4s, v13.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v22.s[0] +mul v17.4S, v17.4S,v16.s[0] +mla v17.4S, v20.4S, v31.s[0] +sub v20.4s, v18.4s, v17.4s +add 
v18.4s, v18.4s, v17.4s +ldr q17, [x17, #+384] +ldr q0, [x17, #+400] +sqrdmulh v15.4S, v18.4S, v22.s[1] +mul v18.4S, v18.4S,v16.s[1] +mla v18.4S, v15.4S, v31.s[0] +sub v15.4s, v13.4s, v18.4s +add v13.4s, v13.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v22.s[2] +mul v20.4S, v20.4S,v16.s[2] +mla v20.4S, v18.4S, v31.s[0] +sub v18.4s, v2.4s, v20.4s +add v2.4s, v2.4s, v20.4s +str q13, [x0, #256] +str q15, [x0, #272] +str q2, [x0, #288] +str q18, [x0, #304] +ldr q5, [x0, #352] +ldr q4, [x0, #368] +ldr q18, [x0, #320] +ldr q2, [x0, #336] +sqrdmulh v15.4S, v5.4S, v11.s[0] +mul v5.4S, v5.4S,v12.s[0] +mla v5.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v5.4s +add v18.4s, v18.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v11.s[0] +mul v4.4S, v4.4S,v12.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v2.4s, v4.4s +add v2.4s, v2.4s, v4.4s +ldr q4, [x17, #+416] +ldr q13, [x17, #+432] +sqrdmulh v20.4S, v2.4S, v11.s[1] +mul v2.4S, v2.4S,v12.s[1] +mla v2.4S, v20.4S, v31.s[0] +sub v20.4s, v18.4s, v2.4s +add v18.4s, v18.4s, v2.4s +sqrdmulh v2.4S, v5.4S, v11.s[2] +mul v5.4S, v5.4S,v12.s[2] +mla v5.4S, v2.4S, v31.s[0] +sub v2.4s, v15.4s, v5.4s +add v15.4s, v15.4s, v5.4s +str q18, [x0, #320] +str q20, [x0, #336] +str q15, [x0, #352] +str q2, [x0, #368] +ldr q7, [x0, #416] +ldr q6, [x0, #432] +ldr q2, [x0, #384] +ldr q15, [x0, #400] +sqrdmulh v20.4S, v7.4S, v19.s[0] +mul v7.4S, v7.4S,v21.s[0] +mla v7.4S, v20.4S, v31.s[0] +sub v20.4s, v2.4s, v7.4s +add v2.4s, v2.4s, v7.4s +sqrdmulh v7.4S, v6.4S, v19.s[0] +mul v6.4S, v6.4S,v21.s[0] +mla v6.4S, v7.4S, v31.s[0] +sub v7.4s, v15.4s, v6.4s +add v15.4s, v15.4s, v6.4s +ldr q6, [x17, #+448] +ldr q18, [x17, #+464] +sqrdmulh v5.4S, v15.4S, v19.s[1] +mul v15.4S, v15.4S,v21.s[1] +mla v15.4S, v5.4S, v31.s[0] +sub v5.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +sqrdmulh v15.4S, v7.4S, v19.s[2] +mul v7.4S, v7.4S,v21.s[2] +mla v7.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v7.4s +add v20.4s, v20.4s, v7.4s +str q2, [x0, #384] +str q5, [x0, #400] +str q20, [x0, #416] +str q15, [x0, 
#432] +ldr q9, [x0, #480] +ldr q8, [x0, #496] +ldr q15, [x0, #448] +ldr q20, [x0, #464] +sqrdmulh v5.4S, v9.4S, v14.s[0] +mul v9.4S, v9.4S,v1.s[0] +mla v9.4S, v5.4S, v31.s[0] +sub v5.4s, v15.4s, v9.4s +add v15.4s, v15.4s, v9.4s +sqrdmulh v9.4S, v8.4S, v14.s[0] +mul v8.4S, v8.4S,v1.s[0] +mla v8.4S, v9.4S, v31.s[0] +sub v9.4s, v20.4s, v8.4s +add v20.4s, v20.4s, v8.4s +ldr q8, [x17, #+480] +ldr q2, [x17, #+496] +sqrdmulh v7.4S, v20.4S, v14.s[1] +mul v20.4S, v20.4S,v1.s[1] +mla v20.4S, v7.4S, v31.s[0] +sub v7.4s, v15.4s, v20.4s +add v15.4s, v15.4s, v20.4s +sqrdmulh v20.4S, v9.4S, v14.s[2] +mul v9.4S, v9.4S,v1.s[2] +mla v9.4S, v20.4S, v31.s[0] +sub v20.4s, v5.4s, v9.4s +add v5.4s, v5.4s, v9.4s +str q15, [x0, #448] +str q7, [x0, #464] +str q5, [x0, #480] +str q20, [x0, #496] +ldr q10, [x0, #544] +ldr q3, [x0, #560] +ldr q20, [x0, #512] +ldr q5, [x0, #528] +sqrdmulh v7.4S, v10.4S, v0.s[0] +mul v10.4S, v10.4S,v17.s[0] +mla v10.4S, v7.4S, v31.s[0] +sub v7.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v0.s[0] +mul v3.4S, v3.4S,v17.s[0] +mla v3.4S, v10.4S, v31.s[0] +sub v10.4s, v5.4s, v3.4s +add v5.4s, v5.4s, v3.4s +ldr q3, [x17, #+512] +ldr q15, [x17, #+528] +sqrdmulh v9.4S, v5.4S, v0.s[1] +mul v5.4S, v5.4S,v17.s[1] +mla v5.4S, v9.4S, v31.s[0] +sub v9.4s, v20.4s, v5.4s +add v20.4s, v20.4s, v5.4s +sqrdmulh v5.4S, v10.4S, v0.s[2] +mul v10.4S, v10.4S,v17.s[2] +mla v10.4S, v5.4S, v31.s[0] +sub v5.4s, v7.4s, v10.4s +add v7.4s, v7.4s, v10.4s +str q20, [x0, #512] +str q9, [x0, #528] +str q7, [x0, #544] +str q5, [x0, #560] +ldr q22, [x0, #608] +ldr q16, [x0, #624] +ldr q5, [x0, #576] +ldr q7, [x0, #592] +sqrdmulh v9.4S, v22.4S, v13.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v9.4S, v31.s[0] +sub v9.4s, v5.4s, v22.4s +add v5.4s, v5.4s, v22.4s +sqrdmulh v22.4S, v16.4S, v13.s[0] +mul v16.4S, v16.4S,v4.s[0] +mla v16.4S, v22.4S, v31.s[0] +sub v22.4s, v7.4s, v16.4s +add v7.4s, v7.4s, v16.4s +ldr q16, [x17, #+544] +ldr q20, [x17, #+560] +sqrdmulh v10.4S, v7.4S, 
v13.s[1] +mul v7.4S, v7.4S,v4.s[1] +mla v7.4S, v10.4S, v31.s[0] +sub v10.4s, v5.4s, v7.4s +add v5.4s, v5.4s, v7.4s +sqrdmulh v7.4S, v22.4S, v13.s[2] +mul v22.4S, v22.4S,v4.s[2] +mla v22.4S, v7.4S, v31.s[0] +sub v7.4s, v9.4s, v22.4s +add v9.4s, v9.4s, v22.4s +str q5, [x0, #576] +str q10, [x0, #592] +str q9, [x0, #608] +str q7, [x0, #624] +ldr q11, [x0, #672] +ldr q12, [x0, #688] +ldr q7, [x0, #640] +ldr q9, [x0, #656] +sqrdmulh v10.4S, v11.4S, v18.s[0] +mul v11.4S, v11.4S,v6.s[0] +mla v11.4S, v10.4S, v31.s[0] +sub v10.4s, v7.4s, v11.4s +add v7.4s, v7.4s, v11.4s +sqrdmulh v11.4S, v12.4S, v18.s[0] +mul v12.4S, v12.4S,v6.s[0] +mla v12.4S, v11.4S, v31.s[0] +sub v11.4s, v9.4s, v12.4s +add v9.4s, v9.4s, v12.4s +ldr q12, [x17, #+576] +ldr q5, [x17, #+592] +sqrdmulh v22.4S, v9.4S, v18.s[1] +mul v9.4S, v9.4S,v6.s[1] +mla v9.4S, v22.4S, v31.s[0] +sub v22.4s, v7.4s, v9.4s +add v7.4s, v7.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v18.s[2] +mul v11.4S, v11.4S,v6.s[2] +mla v11.4S, v9.4S, v31.s[0] +sub v9.4s, v10.4s, v11.4s +add v10.4s, v10.4s, v11.4s +str q7, [x0, #640] +str q22, [x0, #656] +str q10, [x0, #672] +str q9, [x0, #688] +ldr q19, [x0, #736] +ldr q21, [x0, #752] +ldr q9, [x0, #704] +ldr q10, [x0, #720] +sqrdmulh v22.4S, v19.4S, v2.s[0] +mul v19.4S, v19.4S,v8.s[0] +mla v19.4S, v22.4S, v31.s[0] +sub v22.4s, v9.4s, v19.4s +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v21.4S, v2.s[0] +mul v21.4S, v21.4S,v8.s[0] +mla v21.4S, v19.4S, v31.s[0] +sub v19.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +ldr q21, [x17, #+608] +ldr q7, [x17, #+624] +sqrdmulh v11.4S, v10.4S, v2.s[1] +mul v10.4S, v10.4S,v8.s[1] +mla v10.4S, v11.4S, v31.s[0] +sub v11.4s, v9.4s, v10.4s +add v9.4s, v9.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v2.s[2] +mul v19.4S, v19.4S,v8.s[2] +mla v19.4S, v10.4S, v31.s[0] +sub v10.4s, v22.4s, v19.4s +add v22.4s, v22.4s, v19.4s +str q9, [x0, #704] +str q11, [x0, #720] +str q22, [x0, #736] +str q10, [x0, #752] +ldr q14, [x0, #800] +ldr q1, [x0, #816] +ldr q10, [x0, #768] +ldr q22, [x0, 
#784] +sqrdmulh v11.4S, v14.4S, v15.s[0] +mul v14.4S, v14.4S,v3.s[0] +mla v14.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v14.4s +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v1.4S, v15.s[0] +mul v1.4S, v1.4S,v3.s[0] +mla v1.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v1.4s +add v22.4s, v22.4s, v1.4s +sqrdmulh v1.4S, v22.4S, v15.s[1] +mul v22.4S, v22.4S,v3.s[1] +mla v22.4S, v1.4S, v31.s[0] +sub v1.4s, v10.4s, v22.4s +add v10.4s, v10.4s, v22.4s +sqrdmulh v22.4S, v14.4S, v15.s[2] +mul v14.4S, v14.4S,v3.s[2] +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +str q10, [x0, #768] +str q1, [x0, #784] +str q11, [x0, #800] +str q22, [x0, #816] +ldr q0, [x0, #864] +ldr q17, [x0, #880] +ldr q22, [x0, #832] +ldr q11, [x0, #848] +sqrdmulh v1.4S, v0.4S, v20.s[0] +mul v0.4S, v0.4S,v16.s[0] +mla v0.4S, v1.4S, v31.s[0] +sub v1.4s, v22.4s, v0.4s +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v20.s[0] +mul v17.4S, v17.4S,v16.s[0] +mla v17.4S, v0.4S, v31.s[0] +sub v0.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v20.s[1] +mul v11.4S, v11.4S,v16.s[1] +mla v11.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v0.4S, v20.s[2] +mul v0.4S, v0.4S,v16.s[2] +mla v0.4S, v11.4S, v31.s[0] +sub v11.4s, v1.4s, v0.4s +add v1.4s, v1.4s, v0.4s +str q22, [x0, #832] +str q17, [x0, #848] +str q1, [x0, #864] +str q11, [x0, #880] +ldr q13, [x0, #928] +ldr q4, [x0, #944] +ldr q11, [x0, #896] +ldr q1, [x0, #912] +sqrdmulh v17.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v12.s[0] +mla v13.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v4.4S, v5.s[0] +mul v4.4S, v4.4S,v12.s[0] +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v1.4s, v4.4s +add v1.4s, v1.4s, v4.4s +sqrdmulh v4.4S, v1.4S, v5.s[1] +mul v1.4S, v1.4S,v12.s[1] +mla v1.4S, v4.4S, v31.s[0] +sub v4.4s, v11.4s, v1.4s +add v11.4s, v11.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v5.s[2] +mul v13.4S, v13.4S,v12.s[2] +mla 
v13.4S, v1.4S, v31.s[0] +sub v1.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +str q11, [x0, #896] +str q4, [x0, #912] +str q17, [x0, #928] +str q1, [x0, #944] +ldr q18, [x0, #992] +ldr q6, [x0, #1008] +ldr q1, [x0, #960] +ldr q17, [x0, #976] +sqrdmulh v4.4S, v18.4S, v7.s[0] +mul v18.4S, v18.4S,v21.s[0] +mla v18.4S, v4.4S, v31.s[0] +sub v4.4s, v1.4s, v18.4s +add v1.4s, v1.4s, v18.4s +sqrdmulh v18.4S, v6.4S, v7.s[0] +mul v6.4S, v6.4S,v21.s[0] +mla v6.4S, v18.4S, v31.s[0] +sub v18.4s, v17.4s, v6.4s +add v17.4s, v17.4s, v6.4s +sqrdmulh v6.4S, v17.4S, v7.s[1] +mul v17.4S, v17.4S,v21.s[1] +mla v17.4S, v6.4S, v31.s[0] +sub v6.4s, v1.4s, v17.4s +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v18.4S, v7.s[2] +mul v18.4S, v18.4S,v21.s[2] +mla v18.4S, v17.4S, v31.s[0] +sub v17.4s, v4.4s, v18.4s +add v4.4s, v4.4s, v18.4s +str q1, [x0, #960] +str q6, [x0, #976] +str q4, [x0, #992] +str q17, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_24_z4_16.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_24_z4_16.s new file mode 100644 index 0000000..70b872e --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_24_z4_16.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// 
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 
// Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // 
Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, 
block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global ntt_u32_incomplete_neon_asm_var_4_2_24_z4_16 +.global _ntt_u32_incomplete_neon_asm_var_4_2_24_z4_16 +ntt_u32_incomplete_neon_asm_var_4_2_24_z4_16: +_ntt_u32_incomplete_neon_asm_var_4_2_24_z4_16: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +ldr q2, [x0, #544] +ldr q1, [x0, #608] +ldr q0, [x0, #672] +ldr q15, [x0, #736] +ldr q14, [x0, #32] +ldr 
q13, [x0, #96] +ldr q12, [x0, #160] +ldr q11, [x0, #224] +sqrdmulh v10.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +mla v22.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +mla v21.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v20.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +mla v19.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +mla v2.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +mla v1.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v0.4S, v29.s[0] +mul v0.4S, v0.4S,v30.s[0] +mla v0.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +mla v15.4S, v10.4S, v31.s[0] +sub v10.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +sub v22.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sub v21.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +sub v20.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sub v19.4s, v14.4s, v2.4s +add v14.4s, v14.4s, v2.4s +sub v2.4s, v13.4s, v1.4s +add v13.4s, v13.4s, v1.4s +sub v1.4s, v12.4s, v0.4s +add v12.4s, v12.4s, v0.4s +sub v0.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +mla v16.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v3.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +mla v18.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +mla v17.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +mla v21.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +mla v20.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +mla v10.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +mla v22.4S, v15.4S, v31.s[0] +sub v15.4s, 
v12.4s, v16.4s +add v12.4s, v12.4s, v16.4s +sub v16.4s, v11.4s, v3.4s +add v11.4s, v11.4s, v3.4s +sub v3.4s, v14.4s, v18.4s +add v14.4s, v14.4s, v18.4s +sub v18.4s, v13.4s, v17.4s +add v13.4s, v13.4s, v17.4s +sub v17.4s, v1.4s, v21.4s +add v1.4s, v1.4s, v21.4s +sub v21.4s, v0.4s, v20.4s +add v0.4s, v0.4s, v20.4s +sub v20.4s, v19.4s, v10.4s +add v19.4s, v19.4s, v10.4s +sub v10.4s, v2.4s, v22.4s +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +mla v12.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v11.4S, v27.s[0] +mul v11.4S, v11.4S,v28.s[0] +mla v11.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +mla v15.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +mla v16.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +mla v1.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v0.4S, v27.s[2] +mul v0.4S, v0.4S,v28.s[2] +mla v0.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +mla v17.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +sub v12.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sub v11.4s, v3.4s, v15.4s +add v3.4s, v3.4s, v15.4s +sub v15.4s, v18.4s, v16.4s +add v18.4s, v18.4s, v16.4s +sub v16.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sub v1.4s, v2.4s, v0.4s +add v2.4s, v2.4s, v0.4s +sub v0.4s, v20.4s, v17.4s +add v20.4s, v20.4s, v17.4s +sub v17.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v25.s[0] +mul v13.4S, v13.4S,v26.s[0] +mla v13.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +mla v12.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v18.4S, v25.s[2] +mul v18.4S, v18.4S,v26.s[2] +mla v18.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v15.4S, v25.s[3] +mul v15.4S, v15.4S,v26.s[3] +mla v15.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, 
v2.4S, v23.s[0] +mul v2.4S, v2.4S,v24.s[0] +mla v2.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v1.4S, v23.s[1] +mul v1.4S, v1.4S,v24.s[1] +mla v1.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v10.4S, v23.s[2] +mul v10.4S, v10.4S,v24.s[2] +mla v10.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +sub v13.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sub v12.4s, v3.4s, v18.4s +add v3.4s, v3.4s, v18.4s +sub v18.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sub v15.4s, v19.4s, v2.4s +add v19.4s, v19.4s, v2.4s +sub v2.4s, v16.4s, v1.4s +add v16.4s, v16.4s, v1.4s +sub v1.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +sub v10.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +str q14, [x0, #32] +str q21, [x0, #96] +str q22, [x0, #160] +str q13, [x0, #224] +str q3, [x0, #288] +str q12, [x0, #352] +str q11, [x0, #416] +str q18, [x0, #480] +str q19, [x0, #544] +str q15, [x0, #608] +str q16, [x0, #672] +str q2, [x0, #736] +str q20, [x0, #800] +str q1, [x0, #864] +str q0, [x0, #928] +str q10, [x0, #992] +ldr q10, [x0, #816] +ldr q0, [x0, #880] +ldr q1, [x0, #944] +ldr q20, [x0, #1008] +ldr q2, [x0, #304] +ldr q16, [x0, #368] +ldr q15, [x0, #432] +ldr q19, [x0, #496] +ldr q18, [x0, #560] +ldr q11, [x0, #624] +ldr q12, [x0, #688] +ldr q3, [x0, #752] +ldr q13, [x0, #48] +ldr q22, [x0, #112] +ldr q21, [x0, #176] +ldr q14, [x0, #240] +sqrdmulh v17.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +mla v10.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v0.4S, v29.s[0] +mul v0.4S, v0.4S,v30.s[0] +mla v0.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +mla v1.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v20.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +mla v18.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +mla v11.4S, v17.4S, 
v31.s[0] +sqrdmulh v17.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +mla v12.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +mla v3.4S, v17.4S, v31.s[0] +sub v17.4s, v2.4s, v10.4s +add v2.4s, v2.4s, v10.4s +sub v10.4s, v16.4s, v0.4s +add v16.4s, v16.4s, v0.4s +sub v0.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s +sub v1.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +sub v20.4s, v13.4s, v18.4s +add v13.4s, v13.4s, v18.4s +sub v18.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +sub v11.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sub v12.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sqrdmulh v3.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +mla v15.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +mla v19.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v2.4S, v29.s[1] +mul v2.4S, v2.4S,v30.s[1] +mla v2.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +mla v16.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +mla v0.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +mla v1.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +mla v17.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +mla v10.4S, v3.4S, v31.s[0] +sub v3.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +sub v15.4s, v14.4s, v19.4s +add v14.4s, v14.4s, v19.4s +sub v19.4s, v13.4s, v2.4s +add v13.4s, v13.4s, v2.4s +sub v2.4s, v22.4s, v16.4s +add v22.4s, v22.4s, v16.4s +sub v16.4s, v11.4s, v0.4s +add v11.4s, v11.4s, v0.4s +sub v0.4s, v12.4s, v1.4s +add v12.4s, v12.4s, v1.4s +sub v1.4s, v20.4s, v17.4s +add v20.4s, v20.4s, v17.4s +sub v17.4s, v18.4s, v10.4s +add v18.4s, v18.4s, v10.4s +sqrdmulh v10.4S, v21.4S, v27.s[0] +mul v21.4S, v21.4S,v28.s[0] +mla v21.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +mla v14.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, 
v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +mla v3.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +mla v15.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +mla v11.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +mla v12.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v16.4S, v27.s[3] +mul v16.4S, v16.4S,v28.s[3] +mla v16.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v0.4S, v27.s[3] +mul v0.4S, v0.4S,v28.s[3] +mla v0.4S, v10.4S, v31.s[0] +sub v10.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sub v21.4s, v22.4s, v14.4s +add v22.4s, v22.4s, v14.4s +sub v14.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sub v3.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +sub v15.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +sub v11.4s, v18.4s, v12.4s +add v18.4s, v18.4s, v12.4s +sub v12.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +sub v16.4s, v17.4s, v0.4s +add v17.4s, v17.4s, v0.4s +sqrdmulh v0.4S, v22.4S, v25.s[0] +mul v22.4S, v22.4S,v26.s[0] +mla v22.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v21.4S, v25.s[1] +mul v21.4S, v21.4S,v26.s[1] +mla v21.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v2.4S, v25.s[2] +mul v2.4S, v2.4S,v26.s[2] +mla v2.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v3.4S, v25.s[3] +mul v3.4S, v3.4S,v26.s[3] +mla v3.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v18.4S, v23.s[0] +mul v18.4S, v18.4S,v24.s[0] +mla v18.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v11.4S, v23.s[1] +mul v11.4S, v11.4S,v24.s[1] +mla v11.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v17.4S, v23.s[2] +mul v17.4S, v17.4S,v24.s[2] +mla v17.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v16.4S, v23.s[3] +mul v16.4S, v16.4S,v24.s[3] +mla v16.4S, v0.4S, v31.s[0] +sub v0.4s, v13.4s, v22.4s +add v13.4s, v13.4s, v22.4s +sub v22.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sub v21.4s, v19.4s, v2.4s +add v19.4s, v19.4s, v2.4s +sub v2.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sub v3.4s, v20.4s, v18.4s +add v20.4s, v20.4s, v18.4s +sub v18.4s, v15.4s, 
v11.4s +add v15.4s, v15.4s, v11.4s +sub v11.4s, v1.4s, v17.4s +add v1.4s, v1.4s, v17.4s +sub v17.4s, v12.4s, v16.4s +add v12.4s, v12.4s, v16.4s +str q13, [x0, #48] +str q0, [x0, #112] +str q10, [x0, #176] +str q22, [x0, #240] +str q19, [x0, #304] +str q21, [x0, #368] +str q14, [x0, #432] +str q2, [x0, #496] +str q20, [x0, #560] +str q3, [x0, #624] +str q15, [x0, #688] +str q18, [x0, #752] +str q1, [x0, #816] +str q11, [x0, #880] +str q12, [x0, #944] +str q17, [x0, #1008] +ldr q17, [x0, #768] +ldr q12, [x0, #832] +ldr q11, [x0, #896] +ldr q1, [x0, #960] +ldr q18, [x0, #256] +ldr q15, [x0, #320] +ldr q3, [x0, #384] +ldr q20, [x0, #448] +ldr q2, [x0, #512] +ldr q14, [x0, #576] +ldr q21, [x0, #640] +ldr q19, [x0, #704] +ldr q22, [x0, #0] +ldr q10, [x0, #64] +ldr q0, [x0, #128] +ldr q13, [x0, #192] +sqrdmulh v16.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] +mla v17.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +mla v12.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +mla v11.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +mla v1.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +mla v2.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +mla v14.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +mla v21.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +mla v19.4S, v16.4S, v31.s[0] +sub v16.4s, v18.4s, v17.4s +add v18.4s, v18.4s, v17.4s +sub v17.4s, v15.4s, v12.4s +add v15.4s, v15.4s, v12.4s +sub v12.4s, v3.4s, v11.4s +add v3.4s, v3.4s, v11.4s +sub v11.4s, v20.4s, v1.4s +add v20.4s, v20.4s, v1.4s +sub v1.4s, v22.4s, v2.4s +add v22.4s, v22.4s, v2.4s +sub v2.4s, v10.4s, v14.4s +add v10.4s, v10.4s, v14.4s +sub v14.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +sub v21.4s, v13.4s, v19.4s +add v13.4s, v13.4s, v19.4s +sqrdmulh 
v19.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v3.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +mla v20.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +mla v18.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +mla v15.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +mla v12.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +mla v11.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +mla v16.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +mla v17.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v3.4s +add v0.4s, v0.4s, v3.4s +sub v3.4s, v13.4s, v20.4s +add v13.4s, v13.4s, v20.4s +sub v20.4s, v22.4s, v18.4s +add v22.4s, v22.4s, v18.4s +sub v18.4s, v10.4s, v15.4s +add v10.4s, v10.4s, v15.4s +sub v15.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +sub v12.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sub v11.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +sub v16.4s, v2.4s, v17.4s +add v2.4s, v2.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[0] +mul v0.4S, v0.4S,v28.s[0] +mla v0.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +mla v13.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +mla v19.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +mla v3.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v14.4S, v27.s[2] +mul v14.4S, v14.4S,v28.s[2] +mla v14.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +mla v21.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +mla v15.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v12.4S, v27.s[3] +mul v12.4S, v12.4S,v28.s[3] +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v0.4s +add v22.4s, v22.4s, v0.4s +sub v0.4s, v10.4s, v13.4s 
+add v10.4s, v10.4s, v13.4s +sub v13.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +sub v19.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sub v3.4s, v1.4s, v14.4s +add v1.4s, v1.4s, v14.4s +sub v14.4s, v2.4s, v21.4s +add v2.4s, v2.4s, v21.4s +sub v21.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sub v15.4s, v16.4s, v12.4s +add v16.4s, v16.4s, v12.4s +sqrdmulh v12.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +mla v10.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v0.4S, v25.s[1] +mul v0.4S, v0.4S,v26.s[1] +mla v0.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v18.4S, v25.s[2] +mul v18.4S, v18.4S,v26.s[2] +mla v18.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v19.4S, v25.s[3] +mul v19.4S, v19.4S,v26.s[3] +mla v19.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v2.4S, v23.s[0] +mul v2.4S, v2.4S,v24.s[0] +mla v2.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v14.4S, v23.s[1] +mul v14.4S, v14.4S,v24.s[1] +mla v14.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +mla v16.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v15.4S, v23.s[3] +mul v15.4S, v15.4S,v24.s[3] +mla v15.4S, v12.4S, v31.s[0] +sub v12.4s, v22.4s, v10.4s +add v22.4s, v22.4s, v10.4s +sub v10.4s, v17.4s, v0.4s +add v17.4s, v17.4s, v0.4s +sub v0.4s, v20.4s, v18.4s +add v20.4s, v20.4s, v18.4s +sub v18.4s, v13.4s, v19.4s +add v13.4s, v13.4s, v19.4s +sub v19.4s, v1.4s, v2.4s +add v1.4s, v1.4s, v2.4s +sub v2.4s, v3.4s, v14.4s +add v3.4s, v3.4s, v14.4s +sub v14.4s, v11.4s, v16.4s +add v11.4s, v11.4s, v16.4s +sub v16.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +str q22, [x0, #0] +str q12, [x0, #64] +str q17, [x0, #128] +str q10, [x0, #192] +str q20, [x0, #256] +str q0, [x0, #320] +str q13, [x0, #384] +str q18, [x0, #448] +str q1, [x0, #512] +str q19, [x0, #576] +str q3, [x0, #640] +str q2, [x0, #704] +str q11, [x0, #768] +str q14, [x0, #832] +str q21, [x0, #896] +str q16, [x0, #960] +ldr q16, [x0, #784] +ldr q21, [x0, #848] +ldr q14, [x0, #912] +ldr q11, [x0, #976] +ldr q2, [x0, #272] +ldr q3, [x0, #336] +ldr q19, 
[x0, #400] +ldr q1, [x0, #464] +ldr q18, [x0, #528] +ldr q13, [x0, #592] +ldr q0, [x0, #656] +ldr q20, [x0, #720] +ldr q10, [x0, #16] +ldr q17, [x0, #80] +ldr q12, [x0, #144] +ldr q22, [x0, #208] +sqrdmulh v15.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v16.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +mla v21.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +mla v14.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +mla v11.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +mla v18.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +mla v13.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v0.4S, v29.s[0] +mul v0.4S, v0.4S,v30.s[0] +mla v0.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v20.4S, v15.4S, v31.s[0] +sub v15.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sub v16.4s, v3.4s, v21.4s +add v3.4s, v3.4s, v21.4s +sub v21.4s, v19.4s, v14.4s +add v19.4s, v19.4s, v14.4s +sub v14.4s, v1.4s, v11.4s +add v1.4s, v1.4s, v11.4s +sub v11.4s, v10.4s, v18.4s +add v10.4s, v10.4s, v18.4s +sub v18.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +sub v13.4s, v12.4s, v0.4s +add v12.4s, v12.4s, v0.4s +sub v0.4s, v22.4s, v20.4s +add v22.4s, v22.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +mla v19.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v1.4S, v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +mla v1.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v2.4S, v29.s[1] +mul v2.4S, v2.4S,v30.s[1] +mla v2.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v3.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +mla v21.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +mla v14.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] 
+mla v15.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +mla v16.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v19.4s +add v12.4s, v12.4s, v19.4s +sub v19.4s, v22.4s, v1.4s +add v22.4s, v22.4s, v1.4s +sub v1.4s, v10.4s, v2.4s +add v10.4s, v10.4s, v2.4s +sub v2.4s, v17.4s, v3.4s +add v17.4s, v17.4s, v3.4s +sub v3.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sub v21.4s, v0.4s, v14.4s +add v0.4s, v0.4s, v14.4s +sub v14.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sub v15.4s, v18.4s, v16.4s +add v18.4s, v18.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +mla v12.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +mla v22.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +mla v20.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +mla v19.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +mla v13.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v0.4S, v27.s[2] +mul v0.4S, v0.4S,v28.s[2] +mla v0.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +mla v3.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +mla v21.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +sub v12.4s, v17.4s, v22.4s +add v17.4s, v17.4s, v22.4s +sub v22.4s, v1.4s, v20.4s +add v1.4s, v1.4s, v20.4s +sub v20.4s, v2.4s, v19.4s +add v2.4s, v2.4s, v19.4s +sub v19.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sub v13.4s, v18.4s, v0.4s +add v18.4s, v18.4s, v0.4s +sub v0.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sub v3.4s, v15.4s, v21.4s +add v15.4s, v15.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v25.s[0] +mul v17.4S, v17.4S,v26.s[0] +mla v17.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +mla v12.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v2.4S, v25.s[2] +mul v2.4S, v2.4S,v26.s[2] +mla 
v2.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v20.4S, v25.s[3] +mul v20.4S, v20.4S,v26.s[3] +mla v20.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v18.4S, v23.s[0] +mul v18.4S, v18.4S,v24.s[0] +mla v18.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v13.4S, v23.s[1] +mul v13.4S, v13.4S,v24.s[1] +mla v13.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v15.4S, v23.s[2] +mul v15.4S, v15.4S,v24.s[2] +mla v15.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v3.4S, v23.s[3] +mul v3.4S, v3.4S,v24.s[3] +mla v3.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sub v17.4s, v16.4s, v12.4s +add v16.4s, v16.4s, v12.4s +sub v12.4s, v1.4s, v2.4s +add v1.4s, v1.4s, v2.4s +sub v2.4s, v22.4s, v20.4s +add v22.4s, v22.4s, v20.4s +sub v20.4s, v11.4s, v18.4s +add v11.4s, v11.4s, v18.4s +sub v18.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +sub v13.4s, v14.4s, v15.4s +add v14.4s, v14.4s, v15.4s +sub v15.4s, v0.4s, v3.4s +add v0.4s, v0.4s, v3.4s +str q10, [x0, #16] +str q21, [x0, #80] +str q16, [x0, #144] +str q17, [x0, #208] +str q1, [x0, #272] +str q12, [x0, #336] +str q22, [x0, #400] +str q2, [x0, #464] +str q11, [x0, #528] +str q20, [x0, #592] +str q19, [x0, #656] +str q18, [x0, #720] +str q14, [x0, #784] +str q13, [x0, #848] +str q0, [x0, #912] +str q15, [x0, #976] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q6, [x17, #+160] +ldr q7, [x17, #+176] +ldr q8, [x17, #+192] +ldr q9, [x17, #+208] +ldr q3, [x17, #+224] +ldr q10, [x17, #+240] +ldr q21, [x0, #32] +ldr q16, [x0, #48] +ldr q17, [x0, #0] +ldr q1, [x0, #16] +ldr q12, [x17, #+256] +ldr q22, [x17, #+272] +sqrdmulh v2.4S, v21.4S, v5.s[0] +mul v21.4S, v21.4S,v4.s[0] +mla v21.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v16.4S, v5.s[0] +mul v16.4S, v16.4S,v4.s[0] +mla v16.4S, v2.4S, v31.s[0] +sub v2.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sub v21.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +sqrdmulh v16.4S, v1.4S, v5.s[1] +mul v1.4S, v1.4S,v4.s[1] +mla v1.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v21.4S, v5.s[2] +mul v21.4S, v21.4S,v4.s[2] 
+mla v21.4S, v16.4S, v31.s[0] +sub v16.4s, v17.4s, v1.4s +add v17.4s, v17.4s, v1.4s +sub v1.4s, v2.4s, v21.4s +add v2.4s, v2.4s, v21.4s +str q17, [x0, #0] +str q16, [x0, #16] +str q2, [x0, #32] +str q1, [x0, #48] +ldr q1, [x0, #96] +ldr q2, [x0, #112] +ldr q16, [x0, #64] +ldr q17, [x0, #80] +ldr q21, [x17, #+288] +ldr q11, [x17, #+304] +sqrdmulh v20.4S, v1.4S, v7.s[0] +mul v1.4S, v1.4S,v6.s[0] +mla v1.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v2.4S, v7.s[0] +mul v2.4S, v2.4S,v6.s[0] +mla v2.4S, v20.4S, v31.s[0] +sub v20.4s, v16.4s, v1.4s +add v16.4s, v16.4s, v1.4s +sub v1.4s, v17.4s, v2.4s +add v17.4s, v17.4s, v2.4s +sqrdmulh v2.4S, v17.4S, v7.s[1] +mul v17.4S, v17.4S,v6.s[1] +mla v17.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v1.4S, v7.s[2] +mul v1.4S, v1.4S,v6.s[2] +mla v1.4S, v2.4S, v31.s[0] +sub v2.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sub v17.4s, v20.4s, v1.4s +add v20.4s, v20.4s, v1.4s +str q16, [x0, #64] +str q2, [x0, #80] +str q20, [x0, #96] +str q17, [x0, #112] +ldr q17, [x0, #160] +ldr q20, [x0, #176] +ldr q2, [x0, #128] +ldr q16, [x0, #144] +ldr q1, [x17, #+320] +ldr q19, [x17, #+336] +sqrdmulh v18.4S, v17.4S, v9.s[0] +mul v17.4S, v17.4S,v8.s[0] +mla v17.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v20.4S, v9.s[0] +mul v20.4S, v20.4S,v8.s[0] +mla v20.4S, v18.4S, v31.s[0] +sub v18.4s, v2.4s, v17.4s +add v2.4s, v2.4s, v17.4s +sub v17.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v9.s[1] +mul v16.4S, v16.4S,v8.s[1] +mla v16.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v17.4S, v9.s[2] +mul v17.4S, v17.4S,v8.s[2] +mla v17.4S, v20.4S, v31.s[0] +sub v20.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sub v16.4s, v18.4s, v17.4s +add v18.4s, v18.4s, v17.4s +str q2, [x0, #128] +str q20, [x0, #144] +str q18, [x0, #160] +str q16, [x0, #176] +ldr q16, [x0, #224] +ldr q18, [x0, #240] +ldr q20, [x0, #192] +ldr q2, [x0, #208] +ldr q17, [x17, #+352] +ldr q14, [x17, #+368] +sqrdmulh v13.4S, v16.4S, v10.s[0] +mul v16.4S, v16.4S,v3.s[0] +mla v16.4S, v13.4S, 
v31.s[0] +sqrdmulh v13.4S, v18.4S, v10.s[0] +mul v18.4S, v18.4S,v3.s[0] +mla v18.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v16.4s +add v20.4s, v20.4s, v16.4s +sub v16.4s, v2.4s, v18.4s +add v2.4s, v2.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v10.s[1] +mul v2.4S, v2.4S,v3.s[1] +mla v2.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v16.4S, v10.s[2] +mul v16.4S, v16.4S,v3.s[2] +mla v16.4S, v18.4S, v31.s[0] +sub v18.4s, v20.4s, v2.4s +add v20.4s, v20.4s, v2.4s +sub v2.4s, v13.4s, v16.4s +add v13.4s, v13.4s, v16.4s +str q20, [x0, #192] +str q18, [x0, #208] +str q13, [x0, #224] +str q2, [x0, #240] +ldr q2, [x0, #288] +ldr q13, [x0, #304] +ldr q18, [x0, #256] +ldr q20, [x0, #272] +ldr q16, [x17, #+384] +ldr q0, [x17, #+400] +sqrdmulh v15.4S, v2.4S, v22.s[0] +mul v2.4S, v2.4S,v12.s[0] +mla v2.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v13.4S, v22.s[0] +mul v13.4S, v13.4S,v12.s[0] +mla v13.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v2.4s +add v18.4s, v18.4s, v2.4s +sub v2.4s, v20.4s, v13.4s +add v20.4s, v20.4s, v13.4s +sqrdmulh v13.4S, v20.4S, v22.s[1] +mul v20.4S, v20.4S,v12.s[1] +mla v20.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v2.4S, v22.s[2] +mul v2.4S, v2.4S,v12.s[2] +mla v2.4S, v13.4S, v31.s[0] +sub v13.4s, v18.4s, v20.4s +add v18.4s, v18.4s, v20.4s +sub v20.4s, v15.4s, v2.4s +add v15.4s, v15.4s, v2.4s +str q18, [x0, #256] +str q13, [x0, #272] +str q15, [x0, #288] +str q20, [x0, #304] +ldr q5, [x0, #352] +ldr q4, [x0, #368] +ldr q20, [x0, #320] +ldr q15, [x0, #336] +ldr q13, [x17, #+416] +ldr q18, [x17, #+432] +sqrdmulh v2.4S, v5.4S, v11.s[0] +mul v5.4S, v5.4S,v21.s[0] +mla v5.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v4.4S, v11.s[0] +mul v4.4S, v4.4S,v21.s[0] +mla v4.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v5.4s +add v20.4s, v20.4s, v5.4s +sub v5.4s, v15.4s, v4.4s +add v15.4s, v15.4s, v4.4s +sqrdmulh v4.4S, v15.4S, v11.s[1] +mul v15.4S, v15.4S,v21.s[1] +mla v15.4S, v4.4S, v31.s[0] +sqrdmulh v4.4S, v5.4S, v11.s[2] +mul v5.4S, v5.4S,v21.s[2] +mla v5.4S, v4.4S, v31.s[0] +sub v4.4s, v20.4s, 
v15.4s +add v20.4s, v20.4s, v15.4s +sub v15.4s, v2.4s, v5.4s +add v2.4s, v2.4s, v5.4s +str q20, [x0, #320] +str q4, [x0, #336] +str q2, [x0, #352] +str q15, [x0, #368] +ldr q7, [x0, #416] +ldr q6, [x0, #432] +ldr q15, [x0, #384] +ldr q2, [x0, #400] +ldr q4, [x17, #+448] +ldr q20, [x17, #+464] +sqrdmulh v5.4S, v7.4S, v19.s[0] +mul v7.4S, v7.4S,v1.s[0] +mla v7.4S, v5.4S, v31.s[0] +sqrdmulh v5.4S, v6.4S, v19.s[0] +mul v6.4S, v6.4S,v1.s[0] +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v15.4s, v7.4s +add v15.4s, v15.4s, v7.4s +sub v7.4s, v2.4s, v6.4s +add v2.4s, v2.4s, v6.4s +sqrdmulh v6.4S, v2.4S, v19.s[1] +mul v2.4S, v2.4S,v1.s[1] +mla v2.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v7.4S, v19.s[2] +mul v7.4S, v7.4S,v1.s[2] +mla v7.4S, v6.4S, v31.s[0] +sub v6.4s, v15.4s, v2.4s +add v15.4s, v15.4s, v2.4s +sub v2.4s, v5.4s, v7.4s +add v5.4s, v5.4s, v7.4s +str q15, [x0, #384] +str q6, [x0, #400] +str q5, [x0, #416] +str q2, [x0, #432] +ldr q9, [x0, #480] +ldr q8, [x0, #496] +ldr q2, [x0, #448] +ldr q5, [x0, #464] +ldr q6, [x17, #+480] +ldr q15, [x17, #+496] +sqrdmulh v7.4S, v9.4S, v14.s[0] +mul v9.4S, v9.4S,v17.s[0] +mla v9.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v8.4S, v14.s[0] +mul v8.4S, v8.4S,v17.s[0] +mla v8.4S, v7.4S, v31.s[0] +sub v7.4s, v2.4s, v9.4s +add v2.4s, v2.4s, v9.4s +sub v9.4s, v5.4s, v8.4s +add v5.4s, v5.4s, v8.4s +sqrdmulh v8.4S, v5.4S, v14.s[1] +mul v5.4S, v5.4S,v17.s[1] +mla v5.4S, v8.4S, v31.s[0] +sqrdmulh v8.4S, v9.4S, v14.s[2] +mul v9.4S, v9.4S,v17.s[2] +mla v9.4S, v8.4S, v31.s[0] +sub v8.4s, v2.4s, v5.4s +add v2.4s, v2.4s, v5.4s +sub v5.4s, v7.4s, v9.4s +add v7.4s, v7.4s, v9.4s +str q2, [x0, #448] +str q8, [x0, #464] +str q7, [x0, #480] +str q5, [x0, #496] +ldr q10, [x0, #544] +ldr q3, [x0, #560] +ldr q5, [x0, #512] +ldr q7, [x0, #528] +ldr q8, [x17, #+512] +ldr q2, [x17, #+528] +sqrdmulh v9.4S, v10.4S, v0.s[0] +mul v10.4S, v10.4S,v16.s[0] +mla v10.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v3.4S, v0.s[0] +mul v3.4S, v3.4S,v16.s[0] +mla v3.4S, v9.4S, v31.s[0] +sub 
v9.4s, v5.4s, v10.4s +add v5.4s, v5.4s, v10.4s +sub v10.4s, v7.4s, v3.4s +add v7.4s, v7.4s, v3.4s +sqrdmulh v3.4S, v7.4S, v0.s[1] +mul v7.4S, v7.4S,v16.s[1] +mla v7.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v10.4S, v0.s[2] +mul v10.4S, v10.4S,v16.s[2] +mla v10.4S, v3.4S, v31.s[0] +sub v3.4s, v5.4s, v7.4s +add v5.4s, v5.4s, v7.4s +sub v7.4s, v9.4s, v10.4s +add v9.4s, v9.4s, v10.4s +str q5, [x0, #512] +str q3, [x0, #528] +str q9, [x0, #544] +str q7, [x0, #560] +ldr q22, [x0, #608] +ldr q12, [x0, #624] +ldr q7, [x0, #576] +ldr q9, [x0, #592] +ldr q3, [x17, #+544] +ldr q5, [x17, #+560] +sqrdmulh v10.4S, v22.4S, v18.s[0] +mul v22.4S, v22.4S,v13.s[0] +mla v22.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v12.4S, v18.s[0] +mul v12.4S, v12.4S,v13.s[0] +mla v12.4S, v10.4S, v31.s[0] +sub v10.4s, v7.4s, v22.4s +add v7.4s, v7.4s, v22.4s +sub v22.4s, v9.4s, v12.4s +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v9.4S, v18.s[1] +mul v9.4S, v9.4S,v13.s[1] +mla v9.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v22.4S, v18.s[2] +mul v22.4S, v22.4S,v13.s[2] +mla v22.4S, v12.4S, v31.s[0] +sub v12.4s, v7.4s, v9.4s +add v7.4s, v7.4s, v9.4s +sub v9.4s, v10.4s, v22.4s +add v10.4s, v10.4s, v22.4s +str q7, [x0, #576] +str q12, [x0, #592] +str q10, [x0, #608] +str q9, [x0, #624] +ldr q11, [x0, #672] +ldr q21, [x0, #688] +ldr q9, [x0, #640] +ldr q10, [x0, #656] +ldr q12, [x17, #+576] +ldr q7, [x17, #+592] +sqrdmulh v22.4S, v11.4S, v20.s[0] +mul v11.4S, v11.4S,v4.s[0] +mla v11.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v21.4S, v20.s[0] +mul v21.4S, v21.4S,v4.s[0] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v9.4s, v11.4s +add v9.4s, v9.4s, v11.4s +sub v11.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v10.4S, v20.s[1] +mul v10.4S, v10.4S,v4.s[1] +mla v10.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v11.4S, v20.s[2] +mul v11.4S, v11.4S,v4.s[2] +mla v11.4S, v21.4S, v31.s[0] +sub v21.4s, v9.4s, v10.4s +add v9.4s, v9.4s, v10.4s +sub v10.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +str q9, [x0, #640] +str q21, 
[x0, #656] +str q22, [x0, #672] +str q10, [x0, #688] +ldr q19, [x0, #736] +ldr q1, [x0, #752] +ldr q10, [x0, #704] +ldr q22, [x0, #720] +ldr q21, [x17, #+608] +ldr q9, [x17, #+624] +sqrdmulh v11.4S, v19.4S, v15.s[0] +mul v19.4S, v19.4S,v6.s[0] +mla v19.4S, v11.4S, v31.s[0] +sqrdmulh v11.4S, v1.4S, v15.s[0] +mul v1.4S, v1.4S,v6.s[0] +mla v1.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v19.4s +add v10.4s, v10.4s, v19.4s +sub v19.4s, v22.4s, v1.4s +add v22.4s, v22.4s, v1.4s +sqrdmulh v1.4S, v22.4S, v15.s[1] +mul v22.4S, v22.4S,v6.s[1] +mla v22.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v19.4S, v15.s[2] +mul v19.4S, v19.4S,v6.s[2] +mla v19.4S, v1.4S, v31.s[0] +sub v1.4s, v10.4s, v22.4s +add v10.4s, v10.4s, v22.4s +sub v22.4s, v11.4s, v19.4s +add v11.4s, v11.4s, v19.4s +str q10, [x0, #704] +str q1, [x0, #720] +str q11, [x0, #736] +str q22, [x0, #752] +ldr q14, [x0, #800] +ldr q17, [x0, #816] +ldr q22, [x0, #768] +ldr q11, [x0, #784] +sqrdmulh v1.4S, v14.4S, v2.s[0] +mul v14.4S, v14.4S,v8.s[0] +mla v14.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v17.4S, v2.s[0] +mul v17.4S, v17.4S,v8.s[0] +mla v17.4S, v1.4S, v31.s[0] +sub v1.4s, v22.4s, v14.4s +add v22.4s, v22.4s, v14.4s +sub v14.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v8.s[1] +mla v11.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v14.4S, v2.s[2] +mul v14.4S, v14.4S,v8.s[2] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +sub v11.4s, v1.4s, v14.4s +add v1.4s, v1.4s, v14.4s +str q22, [x0, #768] +str q17, [x0, #784] +str q1, [x0, #800] +str q11, [x0, #816] +ldr q0, [x0, #864] +ldr q16, [x0, #880] +ldr q11, [x0, #832] +ldr q1, [x0, #848] +sqrdmulh v17.4S, v0.4S, v5.s[0] +mul v0.4S, v0.4S,v3.s[0] +mla v0.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v16.4S, v5.s[0] +mul v16.4S, v16.4S,v3.s[0] +mla v16.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v0.4s +add v11.4s, v11.4s, v0.4s +sub v0.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +sqrdmulh v16.4S, v1.4S, 
v5.s[1] +mul v1.4S, v1.4S,v3.s[1] +mla v1.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v0.4S, v5.s[2] +mul v0.4S, v0.4S,v3.s[2] +mla v0.4S, v16.4S, v31.s[0] +sub v16.4s, v11.4s, v1.4s +add v11.4s, v11.4s, v1.4s +sub v1.4s, v17.4s, v0.4s +add v17.4s, v17.4s, v0.4s +str q11, [x0, #832] +str q16, [x0, #848] +str q17, [x0, #864] +str q1, [x0, #880] +ldr q18, [x0, #928] +ldr q13, [x0, #944] +ldr q1, [x0, #896] +ldr q17, [x0, #912] +sqrdmulh v16.4S, v18.4S, v7.s[0] +mul v18.4S, v18.4S,v12.s[0] +mla v18.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v13.4S, v7.s[0] +mul v13.4S, v13.4S,v12.s[0] +mla v13.4S, v16.4S, v31.s[0] +sub v16.4s, v1.4s, v18.4s +add v1.4s, v1.4s, v18.4s +sub v18.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v7.s[1] +mul v17.4S, v17.4S,v12.s[1] +mla v17.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v18.4S, v7.s[2] +mul v18.4S, v18.4S,v12.s[2] +mla v18.4S, v13.4S, v31.s[0] +sub v13.4s, v1.4s, v17.4s +add v1.4s, v1.4s, v17.4s +sub v17.4s, v16.4s, v18.4s +add v16.4s, v16.4s, v18.4s +str q1, [x0, #896] +str q13, [x0, #912] +str q16, [x0, #928] +str q17, [x0, #944] +ldr q20, [x0, #992] +ldr q4, [x0, #1008] +ldr q17, [x0, #960] +ldr q16, [x0, #976] +sqrdmulh v13.4S, v20.4S, v9.s[0] +mul v20.4S, v20.4S,v21.s[0] +mla v20.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v4.4S, v9.s[0] +mul v4.4S, v4.4S,v21.s[0] +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v20.4s +add v17.4s, v17.4s, v20.4s +sub v20.4s, v16.4s, v4.4s +add v16.4s, v16.4s, v4.4s +sqrdmulh v4.4S, v16.4S, v9.s[1] +mul v16.4S, v16.4S,v21.s[1] +mla v16.4S, v4.4S, v31.s[0] +sqrdmulh v4.4S, v20.4S, v9.s[2] +mul v20.4S, v20.4S,v21.s[2] +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v17.4s, v16.4s +add v17.4s, v17.4s, v16.4s +sub v16.4s, v13.4s, v20.4s +add v13.4s, v13.4s, v20.4s +str q17, [x0, #960] +str q4, [x0, #976] +str q13, [x0, #992] +str q16, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, 
sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_0.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_0.s new file mode 100644 index 0000000..b9ef60e --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_0.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// +#include +modulus: // Loaded into q31 by the routine below; word 0 is -q with q = 33556993 (used as v31.s[0] in the mla steps), words 1-3 are zero. +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: // Twiddle table in 16-byte rows: each row of roots (mul operand, e.g. q30) is followed by a companion row (sqrdmulh operand, e.g. q29). Companion values look like round(root * 2^31 / 33556993) - TODO confirm against the generator. Words labelled "Layer None" are zeros that pad a row to four words. +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global 
ntt_u32_incomplete_neon_asm_var_4_2_3_z4_0 +.global _ntt_u32_incomplete_neon_asm_var_4_2_3_z4_0 +ntt_u32_incomplete_neon_asm_var_4_2_3_z4_0: +_ntt_u32_incomplete_neon_asm_var_4_2_3_z4_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +sqrdmulh v2.4S, v22.4S, v29.s[0] +ldr q1, [x0, #544] +mul v22.4S, v22.4S,v30.s[0] +ldr q0, [x0, #608] +sqrdmulh v15.4S, v21.4S, v29.s[0] +ldr q14, [x0, #672] +mul v21.4S, v21.4S,v30.s[0] +ldr q13, [x0, #736] +mla v22.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q12, [x0, #32] +sub v11.4s, v18.4s, v22.4s +mla v21.4S, v15.4S, v31.s[0] +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q15, [x0, #96] +sub v10.4s, v17.4s, v21.4s +mla v20.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v29.s[0] +ldr q2, [x0, #160] +mul v1.4S, v1.4S,v30.s[0] +sub v9.4s, v16.4s, v20.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v29.s[0] +ldr q22, [x0, #224] +mul v0.4S, v0.4S,v30.s[0] +sub v8.4s, v3.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v12.4s, v1.4s +mla v0.4S, v20.4S, 
v31.s[0] +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +sub v20.4s, v15.4s, v0.4s +mla v14.4S, v19.4S, v31.s[0] +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v19.4s, v2.4s, v14.4s +mla v13.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v1.4s, v22.4s, v13.4s +mla v16.4S, v0.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v0.4s, v2.4s, v16.4s +mla v3.4S, v14.4S, v31.s[0] +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v22.4s, v3.4s +mla v18.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v13.4s, v12.4s, v18.4s +mla v17.4S, v16.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v16.4s, v15.4s, v17.4s +mla v9.4S, v3.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v3.4s, v19.4s, v9.4s +mla v8.4S, v18.4S, v31.s[0] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v8.4s +mla v11.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v17.4s, v21.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +sub v9.4s, v20.4s, v10.4s +mla v2.4S, v8.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v8.4s, v12.4s, v2.4s +mla v22.4S, v11.4S, v31.s[0] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v27.s[1] +mul v14.4S, v14.4S,v28.s[1] +sub v11.4s, v15.4s, v22.4s +mla v0.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v28.s[2] +sub 
v10.4s, v13.4s, v0.4s +mla v14.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v2.4s, v16.4s, v14.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +sub v22.4s, v21.4s, v19.4s +mla v1.4S, v0.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v0.4s, v20.4s, v1.4s +mla v3.4S, v14.4S, v31.s[0] +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +sub v14.4s, v17.4s, v3.4s +mla v18.4S, v19.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v25.s[1] +mul v11.4S, v11.4S,v26.s[1] +sub v19.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v1.4s, v12.4s, v15.4s +mla v11.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v25.s[3] +mul v2.4S, v2.4S,v26.s[3] +sub v3.4s, v8.4s, v11.4s +mla v16.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v11.4s +str q12, [x0, #32] +sqrdmulh v12.4S, v20.4S, v23.s[0] +str q1, [x0, #96] +mul v20.4S, v20.4S,v24.s[0] +ldr q1, [x0, #816] +sub v11.4s, v13.4s, v16.4s +ldr q18, [x0, #880] +mla v2.4S, v15.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +str q8, [x0, #160] +sqrdmulh v8.4S, v0.4S, v23.s[1] +str q3, [x0, #224] +mul v0.4S, v0.4S,v24.s[1] +ldr q3, [x0, #944] +sub v16.4s, v10.4s, v2.4s +ldr q15, [x0, #1008] +mla v20.4S, v12.4S, v31.s[0] +add v10.4s, v10.4s, v2.4s +str q13, [x0, #288] +sqrdmulh v13.4S, v9.4S, v23.s[2] +str q11, [x0, #352] +mul v9.4S, v9.4S,v24.s[2] +ldr q11, [x0, #304] +sub v2.4s, v21.4s, v20.4s +ldr q12, [x0, #368] +mla v0.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v20.4s +str q10, [x0, #416] +sqrdmulh v10.4S, v19.4S, v23.s[3] +str q16, [x0, #480] +mul v19.4S, v19.4S,v24.s[3] +ldr q16, [x0, #432] +sub v20.4s, v22.4s, v0.4s +ldr q8, [x0, #496] +mla v9.4S, v13.4S, v31.s[0] +add v22.4s, 
v22.4s, v0.4s +str q21, [x0, #544] +sqrdmulh v21.4S, v1.4S, v29.s[0] +str q2, [x0, #608] +ldr q2, [x0, #560] +mul v1.4S, v1.4S,v30.s[0] +ldr q0, [x0, #624] +sub v13.4s, v17.4s, v9.4s +mla v19.4S, v10.4S, v31.s[0] +add v17.4s, v17.4s, v9.4s +str q22, [x0, #672] +sqrdmulh v22.4S, v18.4S, v29.s[0] +str q20, [x0, #736] +ldr q20, [x0, #688] +mul v18.4S, v18.4S,v30.s[0] +ldr q9, [x0, #752] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +str q17, [x0, #800] +sqrdmulh v17.4S, v3.4S, v29.s[0] +str q13, [x0, #864] +mul v3.4S, v3.4S,v30.s[0] +ldr q13, [x0, #48] +sub v19.4s, v11.4s, v1.4s +mla v18.4S, v22.4S, v31.s[0] +add v11.4s, v11.4s, v1.4s +str q14, [x0, #928] +sqrdmulh v14.4S, v15.4S, v29.s[0] +str q10, [x0, #992] +mul v15.4S, v15.4S,v30.s[0] +ldr q10, [x0, #112] +sub v1.4s, v12.4s, v18.4s +mla v3.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v29.s[0] +ldr q17, [x0, #176] +mul v2.4S, v2.4S,v30.s[0] +sub v22.4s, v16.4s, v3.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v29.s[0] +ldr q14, [x0, #240] +mul v0.4S, v0.4S,v30.s[0] +sub v21.4s, v8.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v18.4s, v13.4s, v2.4s +mla v0.4S, v3.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +sub v3.4s, v10.4s, v0.4s +mla v20.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v15.4s, v17.4s, v20.4s +mla v9.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v2.4s, v14.4s, v9.4s +mla v16.4S, v0.4S, v31.s[0] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v29.s[1] +mul v11.4S, v11.4S,v30.s[1] +sub v0.4s, v17.4s, v16.4s +mla v8.4S, v20.4S, v31.s[0] +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v29.s[1] +mul v12.4S, 
v12.4S,v30.s[1] +sub v20.4s, v14.4s, v8.4s +mla v11.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v9.4s, v13.4s, v11.4s +mla v12.4S, v16.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v16.4s, v10.4s, v12.4s +mla v22.4S, v8.4S, v31.s[0] +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v8.4s, v15.4s, v22.4s +mla v21.4S, v11.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +sub v11.4s, v2.4s, v21.4s +mla v19.4S, v12.4S, v31.s[0] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +sub v12.4s, v18.4s, v19.4s +mla v1.4S, v22.4S, v31.s[0] +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v22.4s, v3.4s, v1.4s +mla v17.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v21.4s, v13.4s, v17.4s +mla v14.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +sub v19.4s, v10.4s, v14.4s +mla v0.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v27.s[2] +mul v15.4S, v15.4S,v28.s[2] +sub v1.4s, v9.4s, v0.4s +mla v20.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v17.4s, v16.4s, v20.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v14.4s, v18.4s, v15.4s +mla v2.4S, v0.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[3] +mul v11.4S, v11.4S,v28.s[3] +sub v0.4s, v3.4s, v2.4s +mla v8.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +sub v20.4s, v12.4s, v8.4s +mla v11.4S, v15.4S, v31.s[0] +add v12.4s, v12.4s, 
v8.4s +sqrdmulh v8.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v15.4s, v22.4s, v11.4s +mla v10.4S, v2.4S, v31.s[0] +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v2.4s, v13.4s, v10.4s +mla v19.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v25.s[3] +mul v17.4S, v17.4S,v26.s[3] +sub v8.4s, v21.4s, v19.4s +mla v16.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +str q13, [x0, #48] +sqrdmulh v13.4S, v3.4S, v23.s[0] +str q2, [x0, #112] +mul v3.4S, v3.4S,v24.s[0] +ldr q2, [x0, #768] +sub v19.4s, v9.4s, v16.4s +ldr q11, [x0, #832] +mla v17.4S, v10.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +str q21, [x0, #176] +sqrdmulh v21.4S, v0.4S, v23.s[1] +str q8, [x0, #240] +mul v0.4S, v0.4S,v24.s[1] +ldr q8, [x0, #896] +sub v16.4s, v1.4s, v17.4s +ldr q10, [x0, #960] +mla v3.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +str q9, [x0, #304] +sqrdmulh v9.4S, v22.4S, v23.s[2] +str q19, [x0, #368] +mul v22.4S, v22.4S,v24.s[2] +ldr q19, [x0, #256] +sub v17.4s, v18.4s, v3.4s +ldr q13, [x0, #320] +mla v0.4S, v21.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +str q1, [x0, #432] +sqrdmulh v1.4S, v15.4S, v23.s[3] +str q16, [x0, #496] +mul v15.4S, v15.4S,v24.s[3] +ldr q16, [x0, #384] +sub v3.4s, v14.4s, v0.4s +ldr q21, [x0, #448] +mla v22.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +str q18, [x0, #560] +sqrdmulh v18.4S, v2.4S, v29.s[0] +str q17, [x0, #624] +ldr q17, [x0, #512] +mul v2.4S, v2.4S,v30.s[0] +ldr q0, [x0, #576] +sub v9.4s, v12.4s, v22.4s +mla v15.4S, v1.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s +str q14, [x0, #688] +sqrdmulh v14.4S, v11.4S, v29.s[0] +str q3, [x0, #752] +ldr q3, [x0, #640] +mul v11.4S, v11.4S,v30.s[0] +ldr q22, [x0, #704] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +str q12, [x0, #816] +sqrdmulh v12.4S, v8.4S, v29.s[0] +str q9, [x0, #880] +mul v8.4S, v8.4S,v30.s[0] +ldr q9, [x0, #0] +sub v15.4s, v19.4s, v2.4s +mla v11.4S, v14.4S, v31.s[0] +add 
v19.4s, v19.4s, v2.4s +str q20, [x0, #944] +sqrdmulh v20.4S, v10.4S, v29.s[0] +str q1, [x0, #1008] +mul v10.4S, v10.4S,v30.s[0] +ldr q1, [x0, #64] +sub v2.4s, v13.4s, v11.4s +mla v8.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v29.s[0] +ldr q12, [x0, #128] +mul v17.4S, v17.4S,v30.s[0] +sub v14.4s, v16.4s, v8.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v29.s[0] +ldr q20, [x0, #192] +mul v0.4S, v0.4S,v30.s[0] +sub v18.4s, v21.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +sub v11.4s, v9.4s, v17.4s +mla v0.4S, v8.4S, v31.s[0] +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v8.4s, v1.4s, v0.4s +mla v3.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v10.4s, v12.4s, v3.4s +mla v22.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v17.4s, v20.4s, v22.4s +mla v16.4S, v0.4S, v31.s[0] +add v20.4s, v20.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +sub v0.4s, v12.4s, v16.4s +mla v21.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v3.4s, v20.4s, v21.4s +mla v19.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v22.4s, v9.4s, v19.4s +mla v13.4S, v16.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v16.4s, v1.4s, v13.4s +mla v14.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v21.4s, v10.4s, v14.4s +mla v18.4S, v19.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v19.4s, v17.4s, v18.4s +mla v15.4S, v13.4S, 
v31.s[0] +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v13.4s, v11.4s, v15.4s +mla v2.4S, v14.4S, v31.s[0] +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v27.s[0] +mul v20.4S, v20.4S,v28.s[0] +sub v14.4s, v8.4s, v2.4s +mla v12.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v18.4s, v9.4s, v12.4s +mla v20.4S, v15.4S, v31.s[0] +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +sub v15.4s, v1.4s, v20.4s +mla v0.4S, v2.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +sub v2.4s, v22.4s, v0.4s +mla v3.4S, v12.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v12.4s, v16.4s, v3.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v11.4s, v10.4s +mla v17.4S, v0.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v27.s[3] +mul v19.4S, v19.4S,v28.s[3] +sub v0.4s, v8.4s, v17.4s +mla v21.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v17.4s +sqrdmulh v17.4S, v1.4S, v25.s[0] +mul v1.4S, v1.4S,v26.s[0] +sub v3.4s, v13.4s, v21.4s +mla v19.4S, v10.4S, v31.s[0] +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v25.s[1] +mul v15.4S, v15.4S,v26.s[1] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v17.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v17.4s, v9.4s, v1.4s +mla v15.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +sub v21.4s, v18.4s, v15.4s +mla v16.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +str q9, [x0, #0] +sqrdmulh v9.4S, v8.4S, v23.s[0] +str q17, [x0, #64] +mul v8.4S, v8.4S,v24.s[0] +ldr q17, [x0, #784] +sub v15.4s, v22.4s, v16.4s +ldr q19, [x0, #848] +mla v12.4S, v1.4S, v31.s[0] +add 
v22.4s, v22.4s, v16.4s +str q18, [x0, #128] +sqrdmulh v18.4S, v0.4S, v23.s[1] +str q21, [x0, #192] +mul v0.4S, v0.4S,v24.s[1] +ldr q21, [x0, #912] +sub v16.4s, v2.4s, v12.4s +ldr q1, [x0, #976] +mla v8.4S, v9.4S, v31.s[0] +add v2.4s, v2.4s, v12.4s +str q22, [x0, #256] +sqrdmulh v22.4S, v14.4S, v23.s[2] +str q15, [x0, #320] +mul v14.4S, v14.4S,v24.s[2] +ldr q15, [x0, #272] +sub v12.4s, v11.4s, v8.4s +ldr q9, [x0, #336] +mla v0.4S, v18.4S, v31.s[0] +add v11.4s, v11.4s, v8.4s +str q2, [x0, #384] +sqrdmulh v2.4S, v10.4S, v23.s[3] +str q16, [x0, #448] +mul v10.4S, v10.4S,v24.s[3] +ldr q16, [x0, #400] +sub v8.4s, v20.4s, v0.4s +ldr q18, [x0, #464] +mla v14.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v0.4s +str q11, [x0, #512] +sqrdmulh v11.4S, v17.4S, v29.s[0] +str q12, [x0, #576] +ldr q12, [x0, #528] +mul v17.4S, v17.4S,v30.s[0] +ldr q0, [x0, #592] +sub v22.4s, v13.4s, v14.4s +mla v10.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v14.4s +str q20, [x0, #640] +sqrdmulh v20.4S, v19.4S, v29.s[0] +str q8, [x0, #704] +ldr q8, [x0, #656] +mul v19.4S, v19.4S,v30.s[0] +ldr q14, [x0, #720] +sub v2.4s, v3.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v3.4s, v3.4s, v10.4s +str q13, [x0, #768] +sqrdmulh v13.4S, v21.4S, v29.s[0] +str q22, [x0, #832] +mul v21.4S, v21.4S,v30.s[0] +ldr q22, [x0, #16] +sub v10.4s, v15.4s, v17.4s +mla v19.4S, v20.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +str q3, [x0, #896] +sqrdmulh v3.4S, v1.4S, v29.s[0] +str q2, [x0, #960] +mul v1.4S, v1.4S,v30.s[0] +ldr q2, [x0, #80] +sub v17.4s, v9.4s, v19.4s +mla v21.4S, v13.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v12.4S, v29.s[0] +ldr q13, [x0, #144] +mul v12.4S, v12.4S,v30.s[0] +sub v20.4s, v16.4s, v21.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v29.s[0] +ldr q3, [x0, #208] +mul v0.4S, v0.4S,v30.s[0] +sub v11.4s, v18.4s, v1.4s +mla v12.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v19.4s, v22.4s, 
v12.4s +mla v0.4S, v21.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v2.4s, v0.4s +mla v8.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v1.4s, v13.4s, v8.4s +mla v14.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v12.4s, v3.4s, v14.4s +mla v16.4S, v0.4S, v31.s[0] +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +sub v0.4s, v13.4s, v16.4s +mla v18.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v9.4S, v29.s[1] +mul v9.4S, v9.4S,v30.s[1] +sub v8.4s, v3.4s, v18.4s +mla v15.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v14.4s, v22.4s, v15.4s +mla v9.4S, v16.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v16.4s, v2.4s, v9.4s +mla v20.4S, v18.4S, v31.s[0] +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v20.4s +mla v11.4S, v15.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v15.4s, v12.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +sub v9.4s, v19.4s, v10.4s +mla v17.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v27.s[0] +mul v3.4S, v3.4S,v28.s[0] +sub v20.4s, v21.4s, v17.4s +mla v13.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v11.4s, v22.4s, v13.4s +mla v3.4S, v10.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v10.4s, v2.4s, v3.4s +mla v0.4S, v17.4S, v31.s[0] +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v1.4S, v27.s[2] +mul 
v1.4S, v1.4S,v28.s[2] +sub v17.4s, v14.4s, v0.4s +mla v8.4S, v13.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +sub v13.4s, v16.4s, v8.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v3.4s, v19.4s, v1.4s +mla v12.4S, v0.4S, v31.s[0] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v0.4s, v21.4s, v12.4s +mla v18.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v25.s[1] +mul v10.4S, v10.4S,v26.s[1] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v12.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v12.4s, v22.4s, v2.4s +mla v10.4S, v18.4S, v31.s[0] +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v13.4S, v25.s[3] +mul v13.4S, v13.4S,v26.s[3] +sub v18.4s, v11.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +str q22, [x0, #16] +sqrdmulh v22.4S, v21.4S, v23.s[0] +str q12, [x0, #80] +mul v21.4S, v21.4S,v24.s[0] +sub v12.4s, v14.4s, v16.4s +mla v13.4S, v2.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +str q11, [x0, #144] +sqrdmulh v11.4S, v0.4S, v23.s[1] +str q18, [x0, #208] +mul v0.4S, v0.4S,v24.s[1] +sub v18.4s, v17.4s, v13.4s +mla v21.4S, v22.4S, v31.s[0] +add v17.4s, v17.4s, v13.4s +str q14, [x0, #272] +sqrdmulh v14.4S, v20.4S, v23.s[2] +str q12, [x0, #336] +mul v20.4S, v20.4S,v24.s[2] +sub v12.4s, v19.4s, v21.4s +mla v0.4S, v11.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +str q17, [x0, #400] +sqrdmulh v17.4S, v1.4S, v23.s[3] +str q18, [x0, #464] +mul v1.4S, v1.4S,v24.s[3] +sub v18.4s, v3.4s, v0.4s +mla v20.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v0.4s +str q19, [x0, #528] +str q12, [x0, #592] +sub v12.4s, v9.4s, v20.4s +mla v1.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, 
v20.4s +str q3, [x0, #656] +str q18, [x0, #720] +sub v18.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +str q9, [x0, #784] +str q12, [x0, #848] +str q8, [x0, #912] +str q18, [x0, #976] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q6, [x17, #+160] +ldr q7, [x17, #+176] +ldr q15, [x17, #+192] +ldr q10, [x17, #+208] +ldr q2, [x17, #+224] +ldr q16, [x17, #+240] +ldr q22, [x0, #32] +ldr q13, [x0, #48] +ldr q11, [x0, #0] +ldr q21, [x0, #16] +sqrdmulh v14.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +ldr q13, [x17, #+256] +ldr q0, [x17, #+272] +sqrdmulh v19.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v19.4S, v31.s[0] +sub v19.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v5.s[2] +mul v22.4S, v22.4S,v4.s[2] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +str q11, [x0, #0] +str q19, [x0, #16] +str q14, [x0, #32] +str q21, [x0, #48] +ldr q21, [x0, #96] +ldr q14, [x0, #112] +ldr q19, [x0, #64] +ldr q11, [x0, #80] +sqrdmulh v22.4S, v21.4S, v7.s[0] +mul v21.4S, v21.4S,v6.s[0] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v7.s[0] +mul v14.4S, v14.4S,v6.s[0] +mla v14.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +ldr q14, [x17, #+288] +ldr q17, [x17, #+304] +sqrdmulh v20.4S, v11.4S, v7.s[1] +mul v11.4S, v11.4S,v6.s[1] +mla v11.4S, v20.4S, v31.s[0] +sub v20.4s, v19.4s, v11.4s +add v19.4s, v19.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v7.s[2] +mul v21.4S, v21.4S,v6.s[2] +mla v21.4S, v11.4S, v31.s[0] +sub v11.4s, v22.4s, v21.4s +add v22.4s, v22.4s, v21.4s +str q19, [x0, #64] +str q20, [x0, #80] +str q22, [x0, #96] +str q11, [x0, #112] +ldr q11, [x0, #160] +ldr q22, [x0, #176] 
+ldr q20, [x0, #128] +ldr q19, [x0, #144] +sqrdmulh v21.4S, v11.4S, v10.s[0] +mul v11.4S, v11.4S,v15.s[0] +mla v11.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v10.s[0] +mul v22.4S, v22.4S,v15.s[0] +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v19.4s, v22.4s +add v19.4s, v19.4s, v22.4s +ldr q22, [x17, #+320] +ldr q3, [x17, #+336] +sqrdmulh v1.4S, v19.4S, v10.s[1] +mul v19.4S, v19.4S,v15.s[1] +mla v19.4S, v1.4S, v31.s[0] +sub v1.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +sqrdmulh v19.4S, v11.4S, v10.s[2] +mul v11.4S, v11.4S,v15.s[2] +mla v11.4S, v19.4S, v31.s[0] +sub v19.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +str q20, [x0, #128] +str q1, [x0, #144] +str q21, [x0, #160] +str q19, [x0, #176] +ldr q19, [x0, #224] +ldr q21, [x0, #240] +ldr q1, [x0, #192] +ldr q20, [x0, #208] +sqrdmulh v11.4S, v19.4S, v16.s[0] +mul v19.4S, v19.4S,v2.s[0] +mla v19.4S, v11.4S, v31.s[0] +sub v11.4s, v1.4s, v19.4s +add v1.4s, v1.4s, v19.4s +sqrdmulh v19.4S, v21.4S, v16.s[0] +mul v21.4S, v21.4S,v2.s[0] +mla v21.4S, v19.4S, v31.s[0] +sub v19.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +ldr q21, [x17, #+352] +ldr q9, [x17, #+368] +sqrdmulh v12.4S, v20.4S, v16.s[1] +mul v20.4S, v20.4S,v2.s[1] +mla v20.4S, v12.4S, v31.s[0] +sub v12.4s, v1.4s, v20.4s +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v16.s[2] +mul v19.4S, v19.4S,v2.s[2] +mla v19.4S, v20.4S, v31.s[0] +sub v20.4s, v11.4s, v19.4s +add v11.4s, v11.4s, v19.4s +str q1, [x0, #192] +str q12, [x0, #208] +str q11, [x0, #224] +str q20, [x0, #240] +ldr q20, [x0, #288] +ldr q11, [x0, #304] +ldr q12, [x0, #256] +ldr q1, [x0, #272] +sqrdmulh v19.4S, v20.4S, v0.s[0] +mul v20.4S, v20.4S,v13.s[0] +mla v20.4S, v19.4S, v31.s[0] +sub v19.4s, v12.4s, v20.4s +add v12.4s, v12.4s, v20.4s +sqrdmulh v20.4S, v11.4S, v0.s[0] +mul v11.4S, v11.4S,v13.s[0] +mla v11.4S, v20.4S, v31.s[0] +sub v20.4s, v1.4s, v11.4s +add v1.4s, v1.4s, v11.4s +ldr q11, [x17, #+384] +ldr q8, [x17, #+400] 
+sqrdmulh v18.4S, v1.4S, v0.s[1] +mul v1.4S, v1.4S,v13.s[1] +mla v1.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v1.4s +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v20.4S, v0.s[2] +mul v20.4S, v20.4S,v13.s[2] +mla v20.4S, v1.4S, v31.s[0] +sub v1.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +str q12, [x0, #256] +str q18, [x0, #272] +str q19, [x0, #288] +str q1, [x0, #304] +ldr q5, [x0, #352] +ldr q4, [x0, #368] +ldr q1, [x0, #320] +ldr q19, [x0, #336] +sqrdmulh v18.4S, v5.4S, v17.s[0] +mul v5.4S, v5.4S,v14.s[0] +mla v5.4S, v18.4S, v31.s[0] +sub v18.4s, v1.4s, v5.4s +add v1.4s, v1.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v17.s[0] +mul v4.4S, v4.4S,v14.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v19.4s, v4.4s +add v19.4s, v19.4s, v4.4s +ldr q4, [x17, #+416] +ldr q12, [x17, #+432] +sqrdmulh v20.4S, v19.4S, v17.s[1] +mul v19.4S, v19.4S,v14.s[1] +mla v19.4S, v20.4S, v31.s[0] +sub v20.4s, v1.4s, v19.4s +add v1.4s, v1.4s, v19.4s +sqrdmulh v19.4S, v5.4S, v17.s[2] +mul v5.4S, v5.4S,v14.s[2] +mla v5.4S, v19.4S, v31.s[0] +sub v19.4s, v18.4s, v5.4s +add v18.4s, v18.4s, v5.4s +str q1, [x0, #320] +str q20, [x0, #336] +str q18, [x0, #352] +str q19, [x0, #368] +ldr q7, [x0, #416] +ldr q6, [x0, #432] +ldr q19, [x0, #384] +ldr q18, [x0, #400] +sqrdmulh v20.4S, v7.4S, v3.s[0] +mul v7.4S, v7.4S,v22.s[0] +mla v7.4S, v20.4S, v31.s[0] +sub v20.4s, v19.4s, v7.4s +add v19.4s, v19.4s, v7.4s +sqrdmulh v7.4S, v6.4S, v3.s[0] +mul v6.4S, v6.4S,v22.s[0] +mla v6.4S, v7.4S, v31.s[0] +sub v7.4s, v18.4s, v6.4s +add v18.4s, v18.4s, v6.4s +ldr q6, [x17, #+448] +ldr q1, [x17, #+464] +sqrdmulh v5.4S, v18.4S, v3.s[1] +mul v18.4S, v18.4S,v22.s[1] +mla v18.4S, v5.4S, v31.s[0] +sub v5.4s, v19.4s, v18.4s +add v19.4s, v19.4s, v18.4s +sqrdmulh v18.4S, v7.4S, v3.s[2] +mul v7.4S, v7.4S,v22.s[2] +mla v7.4S, v18.4S, v31.s[0] +sub v18.4s, v20.4s, v7.4s +add v20.4s, v20.4s, v7.4s +str q19, [x0, #384] +str q5, [x0, #400] +str q20, [x0, #416] +str q18, [x0, #432] +ldr q10, [x0, #480] +ldr q15, [x0, #496] +ldr q18, [x0, #448] 
+ldr q20, [x0, #464] +sqrdmulh v5.4S, v10.4S, v9.s[0] +mul v10.4S, v10.4S,v21.s[0] +mla v10.4S, v5.4S, v31.s[0] +sub v5.4s, v18.4s, v10.4s +add v18.4s, v18.4s, v10.4s +sqrdmulh v10.4S, v15.4S, v9.s[0] +mul v15.4S, v15.4S,v21.s[0] +mla v15.4S, v10.4S, v31.s[0] +sub v10.4s, v20.4s, v15.4s +add v20.4s, v20.4s, v15.4s +ldr q15, [x17, #+480] +ldr q19, [x17, #+496] +sqrdmulh v7.4S, v20.4S, v9.s[1] +mul v20.4S, v20.4S,v21.s[1] +mla v20.4S, v7.4S, v31.s[0] +sub v7.4s, v18.4s, v20.4s +add v18.4s, v18.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v9.s[2] +mul v10.4S, v10.4S,v21.s[2] +mla v10.4S, v20.4S, v31.s[0] +sub v20.4s, v5.4s, v10.4s +add v5.4s, v5.4s, v10.4s +str q18, [x0, #448] +str q7, [x0, #464] +str q5, [x0, #480] +str q20, [x0, #496] +ldr q16, [x0, #544] +ldr q2, [x0, #560] +ldr q20, [x0, #512] +ldr q5, [x0, #528] +sqrdmulh v7.4S, v16.4S, v8.s[0] +mul v16.4S, v16.4S,v11.s[0] +mla v16.4S, v7.4S, v31.s[0] +sub v7.4s, v20.4s, v16.4s +add v20.4s, v20.4s, v16.4s +sqrdmulh v16.4S, v2.4S, v8.s[0] +mul v2.4S, v2.4S,v11.s[0] +mla v2.4S, v16.4S, v31.s[0] +sub v16.4s, v5.4s, v2.4s +add v5.4s, v5.4s, v2.4s +ldr q2, [x17, #+512] +ldr q18, [x17, #+528] +sqrdmulh v10.4S, v5.4S, v8.s[1] +mul v5.4S, v5.4S,v11.s[1] +mla v5.4S, v10.4S, v31.s[0] +sub v10.4s, v20.4s, v5.4s +add v20.4s, v20.4s, v5.4s +sqrdmulh v5.4S, v16.4S, v8.s[2] +mul v16.4S, v16.4S,v11.s[2] +mla v16.4S, v5.4S, v31.s[0] +sub v5.4s, v7.4s, v16.4s +add v7.4s, v7.4s, v16.4s +str q20, [x0, #512] +str q10, [x0, #528] +str q7, [x0, #544] +str q5, [x0, #560] +ldr q0, [x0, #608] +ldr q13, [x0, #624] +ldr q5, [x0, #576] +ldr q7, [x0, #592] +sqrdmulh v10.4S, v0.4S, v12.s[0] +mul v0.4S, v0.4S,v4.s[0] +mla v0.4S, v10.4S, v31.s[0] +sub v10.4s, v5.4s, v0.4s +add v5.4s, v5.4s, v0.4s +sqrdmulh v0.4S, v13.4S, v12.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v0.4S, v31.s[0] +sub v0.4s, v7.4s, v13.4s +add v7.4s, v7.4s, v13.4s +ldr q13, [x17, #+544] +ldr q20, [x17, #+560] +sqrdmulh v16.4S, v7.4S, v12.s[1] +mul v7.4S, v7.4S,v4.s[1] +mla v7.4S, 
v16.4S, v31.s[0] +sub v16.4s, v5.4s, v7.4s +add v5.4s, v5.4s, v7.4s +sqrdmulh v7.4S, v0.4S, v12.s[2] +mul v0.4S, v0.4S,v4.s[2] +mla v0.4S, v7.4S, v31.s[0] +sub v7.4s, v10.4s, v0.4s +add v10.4s, v10.4s, v0.4s +str q5, [x0, #576] +str q16, [x0, #592] +str q10, [x0, #608] +str q7, [x0, #624] +ldr q17, [x0, #672] +ldr q14, [x0, #688] +ldr q7, [x0, #640] +ldr q10, [x0, #656] +sqrdmulh v16.4S, v17.4S, v1.s[0] +mul v17.4S, v17.4S,v6.s[0] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v7.4s, v17.4s +add v7.4s, v7.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v1.s[0] +mul v14.4S, v14.4S,v6.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v10.4s, v14.4s +add v10.4s, v10.4s, v14.4s +ldr q14, [x17, #+576] +ldr q5, [x17, #+592] +sqrdmulh v0.4S, v10.4S, v1.s[1] +mul v10.4S, v10.4S,v6.s[1] +mla v10.4S, v0.4S, v31.s[0] +sub v0.4s, v7.4s, v10.4s +add v7.4s, v7.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v1.s[2] +mul v17.4S, v17.4S,v6.s[2] +mla v17.4S, v10.4S, v31.s[0] +sub v10.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +str q7, [x0, #640] +str q0, [x0, #656] +str q16, [x0, #672] +str q10, [x0, #688] +ldr q3, [x0, #736] +ldr q22, [x0, #752] +ldr q10, [x0, #704] +ldr q16, [x0, #720] +sqrdmulh v0.4S, v3.4S, v19.s[0] +mul v3.4S, v3.4S,v15.s[0] +mla v3.4S, v0.4S, v31.s[0] +sub v0.4s, v10.4s, v3.4s +add v10.4s, v10.4s, v3.4s +sqrdmulh v3.4S, v22.4S, v19.s[0] +mul v22.4S, v22.4S,v15.s[0] +mla v22.4S, v3.4S, v31.s[0] +sub v3.4s, v16.4s, v22.4s +add v16.4s, v16.4s, v22.4s +ldr q22, [x17, #+608] +ldr q7, [x17, #+624] +sqrdmulh v17.4S, v16.4S, v19.s[1] +mul v16.4S, v16.4S,v15.s[1] +mla v16.4S, v17.4S, v31.s[0] +sub v17.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sqrdmulh v16.4S, v3.4S, v19.s[2] +mul v3.4S, v3.4S,v15.s[2] +mla v3.4S, v16.4S, v31.s[0] +sub v16.4s, v0.4s, v3.4s +add v0.4s, v0.4s, v3.4s +str q10, [x0, #704] +str q17, [x0, #720] +str q0, [x0, #736] +str q16, [x0, #752] +ldr q9, [x0, #800] +ldr q21, [x0, #816] +ldr q16, [x0, #768] +ldr q0, [x0, #784] +sqrdmulh v17.4S, v9.4S, v18.s[0] +mul 
v9.4S, v9.4S,v2.s[0] +mla v9.4S, v17.4S, v31.s[0] +sub v17.4s, v16.4s, v9.4s +add v16.4s, v16.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v18.s[0] +mul v21.4S, v21.4S,v2.s[0] +mla v21.4S, v9.4S, v31.s[0] +sub v9.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v18.s[1] +mul v0.4S, v0.4S,v2.s[1] +mla v0.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v0.4s +add v16.4s, v16.4s, v0.4s +sqrdmulh v0.4S, v9.4S, v18.s[2] +mul v9.4S, v9.4S,v2.s[2] +mla v9.4S, v0.4S, v31.s[0] +sub v0.4s, v17.4s, v9.4s +add v17.4s, v17.4s, v9.4s +str q16, [x0, #768] +str q21, [x0, #784] +str q17, [x0, #800] +str q0, [x0, #816] +ldr q8, [x0, #864] +ldr q11, [x0, #880] +ldr q0, [x0, #832] +ldr q17, [x0, #848] +sqrdmulh v21.4S, v8.4S, v20.s[0] +mul v8.4S, v8.4S,v13.s[0] +mla v8.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v8.4s +add v0.4s, v0.4s, v8.4s +sqrdmulh v8.4S, v11.4S, v20.s[0] +mul v11.4S, v11.4S,v13.s[0] +mla v11.4S, v8.4S, v31.s[0] +sub v8.4s, v17.4s, v11.4s +add v17.4s, v17.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v20.s[1] +mul v17.4S, v17.4S,v13.s[1] +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v8.4S, v20.s[2] +mul v8.4S, v8.4S,v13.s[2] +mla v8.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v8.4s +add v21.4s, v21.4s, v8.4s +str q0, [x0, #832] +str q11, [x0, #848] +str q21, [x0, #864] +str q17, [x0, #880] +ldr q12, [x0, #928] +ldr q4, [x0, #944] +ldr q17, [x0, #896] +ldr q21, [x0, #912] +sqrdmulh v11.4S, v12.4S, v5.s[0] +mul v12.4S, v12.4S,v14.s[0] +mla v12.4S, v11.4S, v31.s[0] +sub v11.4s, v17.4s, v12.4s +add v17.4s, v17.4s, v12.4s +sqrdmulh v12.4S, v4.4S, v5.s[0] +mul v4.4S, v4.4S,v14.s[0] +mla v4.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v4.4s +add v21.4s, v21.4s, v4.4s +sqrdmulh v4.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v14.s[1] +mla v21.4S, v4.4S, v31.s[0] +sub v4.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v12.4S, v5.s[2] +mul v12.4S, v12.4S,v14.s[2] +mla v12.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v12.4s 
+add v11.4s, v11.4s, v12.4s +str q17, [x0, #896] +str q4, [x0, #912] +str q11, [x0, #928] +str q21, [x0, #944] +ldr q1, [x0, #992] +ldr q6, [x0, #1008] +ldr q21, [x0, #960] +ldr q11, [x0, #976] +sqrdmulh v4.4S, v1.4S, v7.s[0] +mul v1.4S, v1.4S,v22.s[0] +mla v1.4S, v4.4S, v31.s[0] +sub v4.4s, v21.4s, v1.4s +add v21.4s, v21.4s, v1.4s +sqrdmulh v1.4S, v6.4S, v7.s[0] +mul v6.4S, v6.4S,v22.s[0] +mla v6.4S, v1.4S, v31.s[0] +sub v1.4s, v11.4s, v6.4s +add v11.4s, v11.4s, v6.4s +sqrdmulh v6.4S, v11.4S, v7.s[1] +mul v11.4S, v11.4S,v22.s[1] +mla v11.4S, v6.4S, v31.s[0] +sub v6.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v1.4S, v7.s[2] +mul v1.4S, v1.4S,v22.s[2] +mla v1.4S, v11.4S, v31.s[0] +sub v11.4s, v4.4s, v1.4s +add v4.4s, v4.4s, v1.4s +str q21, [x0, #960] +str q6, [x0, #976] +str q4, [x0, #992] +str q11, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_1.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_1.s new file mode 100644 index 0000000..6da50a5 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_1.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, 
and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// NOTE(review): the #include below carries no header name in this copy, and ASM_LOAD (used further down) is not defined in the visible part of this file - presumably it comes from that header; confirm and restore. + +#include +modulus: // lane 0 is -33556993, lanes 1-3 are 0; the row is loaded into q31 and v31.s[0] is the mla multiplier in the visible code +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: // rows of four .word values, read one q register per row via x17: a row whose lanes feed mul is followed by the row whose lanes feed the paired sqrdmulh; 0 (Layer None) fills lanes that the visible code never references +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7
+.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13
+.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word
656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global ntt_u32_incomplete_neon_asm_var_4_2_3_z4_1 +.global _ntt_u32_incomplete_neon_asm_var_4_2_3_z4_1 +ntt_u32_incomplete_neon_asm_var_4_2_3_z4_1: +_ntt_u32_incomplete_neon_asm_var_4_2_3_z4_1: // the visible code loads from and stores to [x0, #imm] and reads the tables above through x17 +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] // NOTE(review): same store as the line above, so it is redundant; harmless, presumably a generator artifact +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +sqrdmulh v2.4S, v22.4S, v29.s[0] +ldr q1, [x0, #544] +mul v22.4S, v22.4S,v30.s[0] +ldr q0, [x0, #608] +sqrdmulh v15.4S, v21.4S, v29.s[0] +ldr q14, [x0, #672] +mul
v21.4S, v21.4S,v30.s[0] +ldr q13, [x0, #736] +mla v22.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q12, [x0, #32] +sub v11.4s, v18.4s, v22.4s +mla v21.4S, v15.4S, v31.s[0] +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q15, [x0, #96] +sub v10.4s, v17.4s, v21.4s +mla v20.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v29.s[0] +ldr q2, [x0, #160] +mul v1.4S, v1.4S,v30.s[0] +sub v9.4s, v16.4s, v20.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v29.s[0] +ldr q22, [x0, #224] +mul v0.4S, v0.4S,v30.s[0] +sub v8.4s, v3.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v12.4s, v1.4s +mla v0.4S, v20.4S, v31.s[0] +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +sub v20.4s, v15.4s, v0.4s +mla v14.4S, v19.4S, v31.s[0] +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v19.4s, v2.4s, v14.4s +mla v13.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v1.4s, v22.4s, v13.4s +mla v16.4S, v0.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v0.4s, v2.4s, v16.4s +mla v3.4S, v14.4S, v31.s[0] +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v22.4s, v3.4s +mla v18.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v13.4s, v12.4s, v18.4s +mla v17.4S, v16.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v16.4s, v15.4s, v17.4s +mla v9.4S, v3.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v3.4s, v19.4s, v9.4s 
+mla v8.4S, v18.4S, v31.s[0] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v8.4s +mla v11.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v17.4s, v21.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +sub v9.4s, v20.4s, v10.4s +mla v2.4S, v8.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v8.4s, v12.4s, v2.4s +mla v22.4S, v11.4S, v31.s[0] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v27.s[1] +mul v14.4S, v14.4S,v28.s[1] +sub v11.4s, v15.4s, v22.4s +mla v0.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v28.s[2] +sub v10.4s, v13.4s, v0.4s +mla v14.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v2.4s, v16.4s, v14.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +sub v22.4s, v21.4s, v19.4s +mla v1.4S, v0.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v0.4s, v20.4s, v1.4s +mla v3.4S, v14.4S, v31.s[0] +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +sub v14.4s, v17.4s, v3.4s +mla v18.4S, v19.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v25.s[1] +mul v11.4S, v11.4S,v26.s[1] +sub v19.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v1.4s, v12.4s, v15.4s +mla v11.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v25.s[3] +mul v2.4S, v2.4S,v26.s[3] +sub v3.4s, v8.4s, v11.4s +mla v16.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v11.4s +str q12, [x0, #32] +sqrdmulh v12.4S, v20.4S, 
v23.s[0] +str q1, [x0, #96] +mul v20.4S, v20.4S,v24.s[0] +ldr q1, [x0, #816] +sub v11.4s, v13.4s, v16.4s +ldr q18, [x0, #880] +mla v2.4S, v15.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +str q8, [x0, #160] +sqrdmulh v8.4S, v0.4S, v23.s[1] +str q3, [x0, #224] +mul v0.4S, v0.4S,v24.s[1] +ldr q3, [x0, #944] +sub v16.4s, v10.4s, v2.4s +ldr q15, [x0, #1008] +mla v20.4S, v12.4S, v31.s[0] +add v10.4s, v10.4s, v2.4s +str q13, [x0, #288] +sqrdmulh v13.4S, v9.4S, v23.s[2] +str q11, [x0, #352] +mul v9.4S, v9.4S,v24.s[2] +ldr q11, [x0, #304] +sub v2.4s, v21.4s, v20.4s +ldr q12, [x0, #368] +mla v0.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v20.4s +str q10, [x0, #416] +sqrdmulh v10.4S, v19.4S, v23.s[3] +str q16, [x0, #480] +mul v19.4S, v19.4S,v24.s[3] +ldr q16, [x0, #432] +sub v20.4s, v22.4s, v0.4s +ldr q8, [x0, #496] +mla v9.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +str q21, [x0, #544] +sqrdmulh v21.4S, v1.4S, v29.s[0] +str q2, [x0, #608] +ldr q2, [x0, #560] +mul v1.4S, v1.4S,v30.s[0] +ldr q0, [x0, #624] +sub v13.4s, v17.4s, v9.4s +mla v19.4S, v10.4S, v31.s[0] +add v17.4s, v17.4s, v9.4s +str q22, [x0, #672] +sqrdmulh v22.4S, v18.4S, v29.s[0] +str q20, [x0, #736] +ldr q20, [x0, #688] +mul v18.4S, v18.4S,v30.s[0] +ldr q9, [x0, #752] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +str q17, [x0, #800] +sqrdmulh v17.4S, v3.4S, v29.s[0] +str q13, [x0, #864] +mul v3.4S, v3.4S,v30.s[0] +ldr q13, [x0, #48] +sub v19.4s, v11.4s, v1.4s +mla v18.4S, v22.4S, v31.s[0] +add v11.4s, v11.4s, v1.4s +str q14, [x0, #928] +sqrdmulh v14.4S, v15.4S, v29.s[0] +str q10, [x0, #992] +mul v15.4S, v15.4S,v30.s[0] +ldr q10, [x0, #112] +sub v1.4s, v12.4s, v18.4s +mla v3.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v29.s[0] +ldr q17, [x0, #176] +mul v2.4S, v2.4S,v30.s[0] +sub v22.4s, v16.4s, v3.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v29.s[0] +ldr q14, [x0, #240] +mul v0.4S, v0.4S,v30.s[0] +sub 
v21.4s, v8.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v18.4s, v13.4s, v2.4s +mla v0.4S, v3.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +sub v3.4s, v10.4s, v0.4s +mla v20.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v15.4s, v17.4s, v20.4s +mla v9.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v2.4s, v14.4s, v9.4s +mla v16.4S, v0.4S, v31.s[0] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v29.s[1] +mul v11.4S, v11.4S,v30.s[1] +sub v0.4s, v17.4s, v16.4s +mla v8.4S, v20.4S, v31.s[0] +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v29.s[1] +mul v12.4S, v12.4S,v30.s[1] +sub v20.4s, v14.4s, v8.4s +mla v11.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v9.4s, v13.4s, v11.4s +mla v12.4S, v16.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v16.4s, v10.4s, v12.4s +mla v22.4S, v8.4S, v31.s[0] +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v8.4s, v15.4s, v22.4s +mla v21.4S, v11.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +sub v11.4s, v2.4s, v21.4s +mla v19.4S, v12.4S, v31.s[0] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +sub v12.4s, v18.4s, v19.4s +mla v1.4S, v22.4S, v31.s[0] +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v22.4s, v3.4s, v1.4s +mla v17.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v21.4s, v13.4s, v17.4s +mla v14.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, 
v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +sub v19.4s, v10.4s, v14.4s +mla v0.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v27.s[2] +mul v15.4S, v15.4S,v28.s[2] +sub v1.4s, v9.4s, v0.4s +mla v20.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v17.4s, v16.4s, v20.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v14.4s, v18.4s, v15.4s +mla v2.4S, v0.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[3] +mul v11.4S, v11.4S,v28.s[3] +sub v0.4s, v3.4s, v2.4s +mla v8.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +sub v20.4s, v12.4s, v8.4s +mla v11.4S, v15.4S, v31.s[0] +add v12.4s, v12.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v15.4s, v22.4s, v11.4s +mla v10.4S, v2.4S, v31.s[0] +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v2.4s, v13.4s, v10.4s +mla v19.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v25.s[3] +mul v17.4S, v17.4S,v26.s[3] +sub v8.4s, v21.4s, v19.4s +mla v16.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +str q13, [x0, #48] +sqrdmulh v13.4S, v3.4S, v23.s[0] +str q2, [x0, #112] +mul v3.4S, v3.4S,v24.s[0] +ldr q2, [x0, #768] +sub v19.4s, v9.4s, v16.4s +ldr q11, [x0, #832] +mla v17.4S, v10.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +str q21, [x0, #176] +sqrdmulh v21.4S, v0.4S, v23.s[1] +str q8, [x0, #240] +mul v0.4S, v0.4S,v24.s[1] +ldr q8, [x0, #896] +sub v16.4s, v1.4s, v17.4s +ldr q10, [x0, #960] +mla v3.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +str q9, [x0, #304] +sqrdmulh v9.4S, v22.4S, v23.s[2] +str q19, [x0, #368] +mul v22.4S, v22.4S,v24.s[2] +ldr q19, [x0, #256] +sub v17.4s, v18.4s, v3.4s +ldr q13, [x0, #320] +mla v0.4S, v21.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +str q1, [x0, #432] 
+sqrdmulh v1.4S, v15.4S, v23.s[3] +str q16, [x0, #496] +mul v15.4S, v15.4S,v24.s[3] +ldr q16, [x0, #384] +sub v3.4s, v14.4s, v0.4s +ldr q21, [x0, #448] +mla v22.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +str q18, [x0, #560] +sqrdmulh v18.4S, v2.4S, v29.s[0] +str q17, [x0, #624] +ldr q17, [x0, #512] +mul v2.4S, v2.4S,v30.s[0] +ldr q0, [x0, #576] +sub v9.4s, v12.4s, v22.4s +mla v15.4S, v1.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s +str q14, [x0, #688] +sqrdmulh v14.4S, v11.4S, v29.s[0] +str q3, [x0, #752] +ldr q3, [x0, #640] +mul v11.4S, v11.4S,v30.s[0] +ldr q22, [x0, #704] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +str q12, [x0, #816] +sqrdmulh v12.4S, v8.4S, v29.s[0] +str q9, [x0, #880] +mul v8.4S, v8.4S,v30.s[0] +ldr q9, [x0, #0] +sub v15.4s, v19.4s, v2.4s +mla v11.4S, v14.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +str q20, [x0, #944] +sqrdmulh v20.4S, v10.4S, v29.s[0] +str q1, [x0, #1008] +mul v10.4S, v10.4S,v30.s[0] +ldr q1, [x0, #64] +sub v2.4s, v13.4s, v11.4s +mla v8.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v29.s[0] +ldr q12, [x0, #128] +mul v17.4S, v17.4S,v30.s[0] +sub v14.4s, v16.4s, v8.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v29.s[0] +ldr q20, [x0, #192] +mul v0.4S, v0.4S,v30.s[0] +sub v18.4s, v21.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +sub v11.4s, v9.4s, v17.4s +mla v0.4S, v8.4S, v31.s[0] +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v8.4s, v1.4s, v0.4s +mla v3.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v10.4s, v12.4s, v3.4s +mla v22.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v17.4s, v20.4s, v22.4s +mla v16.4S, v0.4S, v31.s[0] +add v20.4s, v20.4s, v22.4s 
+sqrdmulh v22.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +sub v0.4s, v12.4s, v16.4s +mla v21.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v3.4s, v20.4s, v21.4s +mla v19.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v22.4s, v9.4s, v19.4s +mla v13.4S, v16.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v16.4s, v1.4s, v13.4s +mla v14.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v21.4s, v10.4s, v14.4s +mla v18.4S, v19.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v19.4s, v17.4s, v18.4s +mla v15.4S, v13.4S, v31.s[0] +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v13.4s, v11.4s, v15.4s +mla v2.4S, v14.4S, v31.s[0] +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v27.s[0] +mul v20.4S, v20.4S,v28.s[0] +sub v14.4s, v8.4s, v2.4s +mla v12.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v18.4s, v9.4s, v12.4s +mla v20.4S, v15.4S, v31.s[0] +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +sub v15.4s, v1.4s, v20.4s +mla v0.4S, v2.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +sub v2.4s, v22.4s, v0.4s +mla v3.4S, v12.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v12.4s, v16.4s, v3.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v11.4s, v10.4s +mla v17.4S, v0.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v27.s[3] +mul v19.4S, v19.4S,v28.s[3] +sub v0.4s, v8.4s, v17.4s 
+mla v21.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v17.4s +sqrdmulh v17.4S, v1.4S, v25.s[0] +mul v1.4S, v1.4S,v26.s[0] +sub v3.4s, v13.4s, v21.4s +mla v19.4S, v10.4S, v31.s[0] +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v25.s[1] +mul v15.4S, v15.4S,v26.s[1] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v17.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v17.4s, v9.4s, v1.4s +mla v15.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +sub v21.4s, v18.4s, v15.4s +mla v16.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +str q9, [x0, #0] +sqrdmulh v9.4S, v8.4S, v23.s[0] +str q17, [x0, #64] +mul v8.4S, v8.4S,v24.s[0] +ldr q17, [x0, #784] +sub v15.4s, v22.4s, v16.4s +ldr q19, [x0, #848] +mla v12.4S, v1.4S, v31.s[0] +add v22.4s, v22.4s, v16.4s +str q18, [x0, #128] +sqrdmulh v18.4S, v0.4S, v23.s[1] +str q21, [x0, #192] +mul v0.4S, v0.4S,v24.s[1] +ldr q21, [x0, #912] +sub v16.4s, v2.4s, v12.4s +ldr q1, [x0, #976] +mla v8.4S, v9.4S, v31.s[0] +add v2.4s, v2.4s, v12.4s +str q22, [x0, #256] +sqrdmulh v22.4S, v14.4S, v23.s[2] +str q15, [x0, #320] +mul v14.4S, v14.4S,v24.s[2] +ldr q15, [x0, #272] +sub v12.4s, v11.4s, v8.4s +ldr q9, [x0, #336] +mla v0.4S, v18.4S, v31.s[0] +add v11.4s, v11.4s, v8.4s +str q2, [x0, #384] +sqrdmulh v2.4S, v10.4S, v23.s[3] +str q16, [x0, #448] +mul v10.4S, v10.4S,v24.s[3] +ldr q16, [x0, #400] +sub v8.4s, v20.4s, v0.4s +ldr q18, [x0, #464] +mla v14.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v0.4s +str q11, [x0, #512] +sqrdmulh v11.4S, v17.4S, v29.s[0] +str q12, [x0, #576] +ldr q12, [x0, #528] +mul v17.4S, v17.4S,v30.s[0] +ldr q0, [x0, #592] +sub v22.4s, v13.4s, v14.4s +mla v10.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v14.4s +str q20, [x0, #640] +sqrdmulh v20.4S, v19.4S, v29.s[0] +str q8, [x0, #704] +ldr q8, [x0, #656] +mul v19.4S, v19.4S,v30.s[0] +ldr q14, [x0, #720] +sub v2.4s, v3.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v3.4s, 
v3.4s, v10.4s +str q13, [x0, #768] +sqrdmulh v13.4S, v21.4S, v29.s[0] +str q22, [x0, #832] +mul v21.4S, v21.4S,v30.s[0] +ldr q22, [x0, #16] +sub v10.4s, v15.4s, v17.4s +mla v19.4S, v20.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +str q3, [x0, #896] +sqrdmulh v3.4S, v1.4S, v29.s[0] +str q2, [x0, #960] +mul v1.4S, v1.4S,v30.s[0] +ldr q2, [x0, #80] +sub v17.4s, v9.4s, v19.4s +mla v21.4S, v13.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v12.4S, v29.s[0] +ldr q13, [x0, #144] +mul v12.4S, v12.4S,v30.s[0] +sub v20.4s, v16.4s, v21.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v29.s[0] +ldr q3, [x0, #208] +mul v0.4S, v0.4S,v30.s[0] +sub v11.4s, v18.4s, v1.4s +mla v12.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v19.4s, v22.4s, v12.4s +mla v0.4S, v21.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v2.4s, v0.4s +mla v8.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v1.4s, v13.4s, v8.4s +mla v14.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v12.4s, v3.4s, v14.4s +mla v16.4S, v0.4S, v31.s[0] +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +sub v0.4s, v13.4s, v16.4s +mla v18.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v9.4S, v29.s[1] +mul v9.4S, v9.4S,v30.s[1] +sub v8.4s, v3.4s, v18.4s +mla v15.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v14.4s, v22.4s, v15.4s +mla v9.4S, v16.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v16.4s, v2.4s, v9.4s +mla v20.4S, v18.4S, v31.s[0] +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub 
v18.4s, v1.4s, v20.4s +mla v11.4S, v15.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v15.4s, v12.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +sub v9.4s, v19.4s, v10.4s +mla v17.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v27.s[0] +mul v3.4S, v3.4S,v28.s[0] +sub v20.4s, v21.4s, v17.4s +mla v13.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v11.4s, v22.4s, v13.4s +mla v3.4S, v10.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v10.4s, v2.4s, v3.4s +mla v0.4S, v17.4S, v31.s[0] +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v17.4s, v14.4s, v0.4s +mla v8.4S, v13.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +sub v13.4s, v16.4s, v8.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v3.4s, v19.4s, v1.4s +mla v12.4S, v0.4S, v31.s[0] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v0.4s, v21.4s, v12.4s +mla v18.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v25.s[1] +mul v10.4S, v10.4S,v26.s[1] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v12.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v12.4s, v22.4s, v2.4s +mla v10.4S, v18.4S, v31.s[0] +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v13.4S, v25.s[3] +mul v13.4S, v13.4S,v26.s[3] +sub v18.4s, v11.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +str q22, [x0, #16] 
+sqrdmulh v22.4S, v21.4S, v23.s[0] +str q12, [x0, #80] +mul v21.4S, v21.4S,v24.s[0] +sub v12.4s, v14.4s, v16.4s +mla v13.4S, v2.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +str q11, [x0, #144] +sqrdmulh v11.4S, v0.4S, v23.s[1] +str q18, [x0, #208] +mul v0.4S, v0.4S,v24.s[1] +sub v18.4s, v17.4s, v13.4s +mla v21.4S, v22.4S, v31.s[0] +add v17.4s, v17.4s, v13.4s +str q14, [x0, #272] +sqrdmulh v14.4S, v20.4S, v23.s[2] +str q12, [x0, #336] +mul v20.4S, v20.4S,v24.s[2] +sub v12.4s, v19.4s, v21.4s +mla v0.4S, v11.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +str q17, [x0, #400] +sqrdmulh v17.4S, v1.4S, v23.s[3] +str q18, [x0, #464] +mul v1.4S, v1.4S,v24.s[3] +sub v18.4s, v3.4s, v0.4s +mla v20.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v0.4s +str q19, [x0, #528] +str q12, [x0, #592] +sub v12.4s, v9.4s, v20.4s +mla v1.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v20.4s +str q3, [x0, #656] +str q18, [x0, #720] +sub v18.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +str q9, [x0, #784] +str q12, [x0, #848] +str q8, [x0, #912] +str q18, [x0, #976] +ldr q4, [x0, #32] +ldr q5, [x0, #48] +ldr q6, [x0, #0] +ldr q7, [x0, #16] +ldr q15, [x0, #96] +ldr q10, [x0, #112] +ldr q2, [x0, #64] +ldr q16, [x0, #80] +ldr q22, [x0, #160] +ldr q13, [x0, #176] +ldr q11, [x0, #128] +ldr q21, [x0, #144] +ldr q14, [x0, #224] +ldr q0, [x0, #240] +ldr q19, [x0, #192] +ldr q17, [x0, #208] +ldr q20, [x17, #+128] +ldr q3, [x17, #+144] +ldr q1, [x17, #+160] +ldr q9, [x17, #+176] +ldr q12, [x17, #+192] +ldr q8, [x17, #+208] +ldr q18, [x17, #+224] +ldr q30, [x17, #+240] +sqrdmulh v29.4S, v4.4S, v3.s[0] +mul v4.4S, v4.4S,v20.s[0] +sqrdmulh v28.4S, v5.4S, v3.s[0] +mul v5.4S, v5.4S,v20.s[0] +mla v4.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v15.4S, v9.s[0] +mul v15.4S, v15.4S,v1.s[0] +mla v5.4S, v28.4S, v31.s[0] +sub v28.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +sqrdmulh v4.4S, v10.4S, v9.s[0] +mul v10.4S, v10.4S,v1.s[0] +mla v15.4S, v29.4S, v31.s[0] +sub v29.4s, v7.4s, v5.4s +add v7.4s, v7.4s, v5.4s +sqrdmulh v5.4S, v7.4S, v3.s[1] 
+mul v7.4S, v7.4S,v20.s[1] +mla v10.4S, v4.4S, v31.s[0] +sub v4.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +sqrdmulh v15.4S, v29.4S, v3.s[2] +mul v29.4S, v29.4S,v20.s[2] +mla v7.4S, v5.4S, v31.s[0] +sub v5.4s, v16.4s, v10.4s +add v16.4s, v16.4s, v10.4s +sqrdmulh v10.4S, v16.4S, v9.s[1] +mul v16.4S, v16.4S,v1.s[1] +mla v29.4S, v15.4S, v31.s[0] +sub v15.4s, v6.4s, v7.4s +add v6.4s, v6.4s, v7.4s +sqrdmulh v3.4S, v5.4S, v9.s[2] +mul v5.4S, v5.4S,v1.s[2] +mla v16.4S, v10.4S, v31.s[0] +sub v10.4s, v28.4s, v29.4s +add v28.4s, v28.4s, v29.4s +sqrdmulh v29.4S, v22.4S, v8.s[0] +mul v22.4S, v22.4S,v12.s[0] +mla v5.4S, v3.4S, v31.s[0] +sub v3.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v9.4S, v13.4S, v8.s[0] +mul v13.4S, v13.4S,v12.s[0] +mla v22.4S, v29.4S, v31.s[0] +sub v29.4s, v4.4s, v5.4s +add v4.4s, v4.4s, v5.4s +sqrdmulh v5.4S, v14.4S, v30.s[0] +mul v14.4S, v14.4S,v18.s[0] +mla v13.4S, v9.4S, v31.s[0] +sub v9.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v0.4S, v30.s[0] +mul v0.4S, v0.4S,v18.s[0] +mla v14.4S, v5.4S, v31.s[0] +sub v5.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v21.4S, v8.s[1] +mul v21.4S, v21.4S,v12.s[1] +mla v0.4S, v22.4S, v31.s[0] +sub v22.4s, v19.4s, v14.4s +add v19.4s, v19.4s, v14.4s +sqrdmulh v14.4S, v5.4S, v8.s[2] +mul v5.4S, v5.4S,v12.s[2] +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v0.4s +add v17.4s, v17.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v30.s[1] +mul v17.4S, v17.4S,v18.s[1] +mla v5.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v8.4S, v13.4S, v30.s[2] +mul v13.4S, v13.4S,v18.s[2] +mla v17.4S, v0.4S, v31.s[0] +sub v0.4s, v9.4s, v5.4s +add v9.4s, v9.4s, v5.4s +mla v13.4S, v8.4S, v31.s[0] +sub v8.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +sub v30.4s, v22.4s, v13.4s +add v22.4s, v22.4s, v13.4s +str q6, [x0, #0] +str q15, [x0, #16] +str q28, [x0, #32] +str q10, [x0, #48] +str q2, [x0, #64] +str q3, [x0, #80] +str q4, [x0, #96] +str q29, [x0, #112] 
+str q11, [x0, #128] +str q14, [x0, #144] +str q9, [x0, #160] +str q0, [x0, #176] +str q19, [x0, #192] +str q8, [x0, #208] +str q22, [x0, #224] +str q30, [x0, #240] +ldr q30, [x0, #288] +ldr q22, [x0, #304] +ldr q8, [x0, #256] +ldr q19, [x0, #272] +ldr q0, [x0, #352] +ldr q9, [x0, #368] +ldr q14, [x0, #320] +ldr q11, [x0, #336] +ldr q29, [x0, #416] +ldr q4, [x0, #432] +ldr q3, [x0, #384] +ldr q2, [x0, #400] +ldr q10, [x0, #480] +ldr q28, [x0, #496] +ldr q15, [x0, #448] +ldr q6, [x0, #464] +ldr q13, [x17, #+256] +ldr q18, [x17, #+272] +ldr q17, [x17, #+288] +ldr q5, [x17, #+304] +ldr q12, [x17, #+320] +ldr q21, [x17, #+336] +ldr q1, [x17, #+352] +ldr q16, [x17, #+368] +sqrdmulh v20.4S, v30.4S, v18.s[0] +mul v30.4S, v30.4S,v13.s[0] +sqrdmulh v7.4S, v22.4S, v18.s[0] +mul v22.4S, v22.4S,v13.s[0] +mla v30.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v0.4S, v5.s[0] +mul v0.4S, v0.4S,v17.s[0] +mla v22.4S, v7.4S, v31.s[0] +sub v7.4s, v8.4s, v30.4s +add v8.4s, v8.4s, v30.4s +sqrdmulh v30.4S, v9.4S, v5.s[0] +mul v9.4S, v9.4S,v17.s[0] +mla v0.4S, v20.4S, v31.s[0] +sub v20.4s, v19.4s, v22.4s +add v19.4s, v19.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v18.s[1] +mul v19.4S, v19.4S,v13.s[1] +mla v9.4S, v30.4S, v31.s[0] +sub v30.4s, v14.4s, v0.4s +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v20.4S, v18.s[2] +mul v20.4S, v20.4S,v13.s[2] +mla v19.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v9.4s +add v11.4s, v11.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v5.s[1] +mul v11.4S, v11.4S,v17.s[1] +mla v20.4S, v0.4S, v31.s[0] +sub v0.4s, v8.4s, v19.4s +add v8.4s, v8.4s, v19.4s +sqrdmulh v18.4S, v22.4S, v5.s[2] +mul v22.4S, v22.4S,v17.s[2] +mla v11.4S, v9.4S, v31.s[0] +sub v9.4s, v7.4s, v20.4s +add v7.4s, v7.4s, v20.4s +sqrdmulh v20.4S, v29.4S, v21.s[0] +mul v29.4S, v29.4S,v12.s[0] +mla v22.4S, v18.4S, v31.s[0] +sub v18.4s, v14.4s, v11.4s +add v14.4s, v14.4s, v11.4s +sqrdmulh v5.4S, v4.4S, v21.s[0] +mul v4.4S, v4.4S,v12.s[0] +mla v29.4S, v20.4S, v31.s[0] +sub v20.4s, v30.4s, v22.4s +add v30.4s, v30.4s, v22.4s 
+sqrdmulh v22.4S, v10.4S, v16.s[0] +mul v10.4S, v10.4S,v1.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v3.4s, v29.4s +add v3.4s, v3.4s, v29.4s +sqrdmulh v29.4S, v28.4S, v16.s[0] +mul v28.4S, v28.4S,v1.s[0] +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v2.4s, v4.4s +add v2.4s, v2.4s, v4.4s +sqrdmulh v4.4S, v2.4S, v21.s[1] +mul v2.4S, v2.4S,v12.s[1] +mla v28.4S, v29.4S, v31.s[0] +sub v29.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +sqrdmulh v10.4S, v22.4S, v21.s[2] +mul v22.4S, v22.4S,v12.s[2] +mla v2.4S, v4.4S, v31.s[0] +sub v4.4s, v6.4s, v28.4s +add v6.4s, v6.4s, v28.4s +sqrdmulh v28.4S, v6.4S, v16.s[1] +mul v6.4S, v6.4S,v1.s[1] +mla v22.4S, v10.4S, v31.s[0] +sub v10.4s, v3.4s, v2.4s +add v3.4s, v3.4s, v2.4s +sqrdmulh v21.4S, v4.4S, v16.s[2] +mul v4.4S, v4.4S,v1.s[2] +mla v6.4S, v28.4S, v31.s[0] +sub v28.4s, v5.4s, v22.4s +add v5.4s, v5.4s, v22.4s +mla v4.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v6.4s +add v15.4s, v15.4s, v6.4s +sub v16.4s, v29.4s, v4.4s +add v29.4s, v29.4s, v4.4s +str q8, [x0, #256] +str q0, [x0, #272] +str q7, [x0, #288] +str q9, [x0, #304] +str q14, [x0, #320] +str q18, [x0, #336] +str q30, [x0, #352] +str q20, [x0, #368] +str q3, [x0, #384] +str q10, [x0, #400] +str q5, [x0, #416] +str q28, [x0, #432] +str q15, [x0, #448] +str q21, [x0, #464] +str q29, [x0, #480] +str q16, [x0, #496] +ldr q16, [x0, #544] +ldr q29, [x0, #560] +ldr q21, [x0, #512] +ldr q15, [x0, #528] +ldr q28, [x0, #608] +ldr q5, [x0, #624] +ldr q10, [x0, #576] +ldr q3, [x0, #592] +ldr q20, [x0, #672] +ldr q30, [x0, #688] +ldr q18, [x0, #640] +ldr q14, [x0, #656] +ldr q9, [x0, #736] +ldr q7, [x0, #752] +ldr q0, [x0, #704] +ldr q8, [x0, #720] +ldr q4, [x17, #+384] +ldr q1, [x17, #+400] +ldr q6, [x17, #+416] +ldr q22, [x17, #+432] +ldr q12, [x17, #+448] +ldr q2, [x17, #+464] +ldr q17, [x17, #+480] +ldr q11, [x17, #+496] +sqrdmulh v13.4S, v16.4S, v1.s[0] +mul v16.4S, v16.4S,v4.s[0] +sqrdmulh v19.4S, v29.4S, v1.s[0] +mul v29.4S, v29.4S,v4.s[0] +mla v16.4S, v13.4S, v31.s[0] 
+sqrdmulh v13.4S, v28.4S, v22.s[0] +mul v28.4S, v28.4S,v6.s[0] +mla v29.4S, v19.4S, v31.s[0] +sub v19.4s, v21.4s, v16.4s +add v21.4s, v21.4s, v16.4s +sqrdmulh v16.4S, v5.4S, v22.s[0] +mul v5.4S, v5.4S,v6.s[0] +mla v28.4S, v13.4S, v31.s[0] +sub v13.4s, v15.4s, v29.4s +add v15.4s, v15.4s, v29.4s +sqrdmulh v29.4S, v15.4S, v1.s[1] +mul v15.4S, v15.4S,v4.s[1] +mla v5.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v28.4s +add v10.4s, v10.4s, v28.4s +sqrdmulh v28.4S, v13.4S, v1.s[2] +mul v13.4S, v13.4S,v4.s[2] +mla v15.4S, v29.4S, v31.s[0] +sub v29.4s, v3.4s, v5.4s +add v3.4s, v3.4s, v5.4s +sqrdmulh v5.4S, v3.4S, v22.s[1] +mul v3.4S, v3.4S,v6.s[1] +mla v13.4S, v28.4S, v31.s[0] +sub v28.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +sqrdmulh v1.4S, v29.4S, v22.s[2] +mul v29.4S, v29.4S,v6.s[2] +mla v3.4S, v5.4S, v31.s[0] +sub v5.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +sqrdmulh v13.4S, v20.4S, v2.s[0] +mul v20.4S, v20.4S,v12.s[0] +mla v29.4S, v1.4S, v31.s[0] +sub v1.4s, v10.4s, v3.4s +add v10.4s, v10.4s, v3.4s +sqrdmulh v22.4S, v30.4S, v2.s[0] +mul v30.4S, v30.4S,v12.s[0] +mla v20.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v29.4s +add v16.4s, v16.4s, v29.4s +sqrdmulh v29.4S, v9.4S, v11.s[0] +mul v9.4S, v9.4S,v17.s[0] +mla v30.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v20.4s +add v18.4s, v18.4s, v20.4s +sqrdmulh v20.4S, v7.4S, v11.s[0] +mul v7.4S, v7.4S,v17.s[0] +mla v9.4S, v29.4S, v31.s[0] +sub v29.4s, v14.4s, v30.4s +add v14.4s, v14.4s, v30.4s +sqrdmulh v30.4S, v14.4S, v2.s[1] +mul v14.4S, v14.4S,v12.s[1] +mla v7.4S, v20.4S, v31.s[0] +sub v20.4s, v0.4s, v9.4s +add v0.4s, v0.4s, v9.4s +sqrdmulh v9.4S, v29.4S, v2.s[2] +mul v29.4S, v29.4S,v12.s[2] +mla v14.4S, v30.4S, v31.s[0] +sub v30.4s, v8.4s, v7.4s +add v8.4s, v8.4s, v7.4s +sqrdmulh v7.4S, v8.4S, v11.s[1] +mul v8.4S, v8.4S,v17.s[1] +mla v29.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v14.4s +add v18.4s, v18.4s, v14.4s +sqrdmulh v2.4S, v30.4S, v11.s[2] +mul v30.4S, v30.4S,v17.s[2] +mla v8.4S, v7.4S, v31.s[0] +sub v7.4s, 
v22.4s, v29.4s +add v22.4s, v22.4s, v29.4s +mla v30.4S, v2.4S, v31.s[0] +sub v2.4s, v0.4s, v8.4s +add v0.4s, v0.4s, v8.4s +sub v11.4s, v20.4s, v30.4s +add v20.4s, v20.4s, v30.4s +str q21, [x0, #512] +str q28, [x0, #528] +str q19, [x0, #544] +str q5, [x0, #560] +str q10, [x0, #576] +str q1, [x0, #592] +str q16, [x0, #608] +str q13, [x0, #624] +str q18, [x0, #640] +str q9, [x0, #656] +str q22, [x0, #672] +str q7, [x0, #688] +str q0, [x0, #704] +str q2, [x0, #720] +str q20, [x0, #736] +str q11, [x0, #752] +ldr q11, [x0, #800] +ldr q20, [x0, #816] +ldr q2, [x0, #768] +ldr q0, [x0, #784] +ldr q7, [x0, #864] +ldr q22, [x0, #880] +ldr q9, [x0, #832] +ldr q18, [x0, #848] +ldr q13, [x0, #928] +ldr q16, [x0, #944] +ldr q1, [x0, #896] +ldr q10, [x0, #912] +ldr q5, [x0, #992] +ldr q19, [x0, #1008] +ldr q28, [x0, #960] +ldr q21, [x0, #976] +ldr q30, [x17, #+512] +ldr q17, [x17, #+528] +ldr q8, [x17, #+544] +ldr q29, [x17, #+560] +ldr q12, [x17, #+576] +ldr q14, [x17, #+592] +ldr q6, [x17, #+608] +ldr q3, [x17, #+624] +sqrdmulh v4.4S, v11.4S, v17.s[0] +mul v11.4S, v11.4S,v30.s[0] +sqrdmulh v15.4S, v20.4S, v17.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v11.4S, v4.4S, v31.s[0] +sqrdmulh v4.4S, v7.4S, v29.s[0] +mul v7.4S, v7.4S,v8.s[0] +mla v20.4S, v15.4S, v31.s[0] +sub v15.4s, v2.4s, v11.4s +add v2.4s, v2.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v8.s[0] +mla v7.4S, v4.4S, v31.s[0] +sub v4.4s, v0.4s, v20.4s +add v0.4s, v0.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v17.s[1] +mul v0.4S, v0.4S,v30.s[1] +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v9.4s, v7.4s +add v9.4s, v9.4s, v7.4s +sqrdmulh v7.4S, v4.4S, v17.s[2] +mul v4.4S, v4.4S,v30.s[2] +mla v0.4S, v20.4S, v31.s[0] +sub v20.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v8.s[1] +mla v4.4S, v7.4S, v31.s[0] +sub v7.4s, v2.4s, v0.4s +add v2.4s, v2.4s, v0.4s +sqrdmulh v17.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v8.s[2] +mla v18.4S, v22.4S, v31.s[0] +sub v22.4s, 
v15.4s, v4.4s +add v15.4s, v15.4s, v4.4s +sqrdmulh v4.4S, v13.4S, v14.s[0] +mul v13.4S, v13.4S,v12.s[0] +mla v20.4S, v17.4S, v31.s[0] +sub v17.4s, v9.4s, v18.4s +add v9.4s, v9.4s, v18.4s +sqrdmulh v29.4S, v16.4S, v14.s[0] +mul v16.4S, v16.4S,v12.s[0] +mla v13.4S, v4.4S, v31.s[0] +sub v4.4s, v11.4s, v20.4s +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v5.4S, v3.s[0] +mul v5.4S, v5.4S,v6.s[0] +mla v16.4S, v29.4S, v31.s[0] +sub v29.4s, v1.4s, v13.4s +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v19.4S, v3.s[0] +mul v19.4S, v19.4S,v6.s[0] +mla v5.4S, v20.4S, v31.s[0] +sub v20.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sqrdmulh v16.4S, v10.4S, v14.s[1] +mul v10.4S, v10.4S,v12.s[1] +mla v19.4S, v13.4S, v31.s[0] +sub v13.4s, v28.4s, v5.4s +add v28.4s, v28.4s, v5.4s +sqrdmulh v5.4S, v20.4S, v14.s[2] +mul v20.4S, v20.4S,v12.s[2] +mla v10.4S, v16.4S, v31.s[0] +sub v16.4s, v21.4s, v19.4s +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v21.4S, v3.s[1] +mul v21.4S, v21.4S,v6.s[1] +mla v20.4S, v5.4S, v31.s[0] +sub v5.4s, v1.4s, v10.4s +add v1.4s, v1.4s, v10.4s +sqrdmulh v14.4S, v16.4S, v3.s[2] +mul v16.4S, v16.4S,v6.s[2] +mla v21.4S, v19.4S, v31.s[0] +sub v19.4s, v29.4s, v20.4s +add v29.4s, v29.4s, v20.4s +mla v16.4S, v14.4S, v31.s[0] +sub v14.4s, v28.4s, v21.4s +add v28.4s, v28.4s, v21.4s +sub v3.4s, v13.4s, v16.4s +add v13.4s, v13.4s, v16.4s +str q2, [x0, #768] +str q7, [x0, #784] +str q15, [x0, #800] +str q22, [x0, #816] +str q9, [x0, #832] +str q17, [x0, #848] +str q11, [x0, #864] +str q4, [x0, #880] +str q1, [x0, #896] +str q5, [x0, #912] +str q29, [x0, #928] +str q19, [x0, #944] +str q28, [x0, #960] +str q14, [x0, #976] +str q13, [x0, #992] +str q3, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] 
+ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_2.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_2.s new file mode 100644 index 0000000..a72c6cd --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_2.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include // NOTE(review): the header operand is missing on this line (possibly lost in extraction); presumably it provides the ASM_LOAD macro used below - restore before assembling +modulus: // 16 bytes fetched with a single ldr q31; the code addresses lane 0 as v31.s[0] +.word -33556993 // Negated modulus: mla by this lane subtracts a multiple of 33556993 +.word 0 // Zero fill (completes the 128-bit load) +.word 0 // Zero fill (completes the 128-bit load) +.word 0 // Zero fill (completes the 128-bit load) +.align 6 // Power-of-two form: aligns the table below to 2^6 = 64 bytes +roots_merged: // Groups of 8 words: 4 constants used by mul, then the 4 matching constants used by sqrdmulh (values look like round(c * 2^31 / 33556993) - TODO confirm against the generator) +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Padding (no root assigned) +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Padding (no root assigned) +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Padding (no root assigned) +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Padding (no root assigned) +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Padding (no root assigned) +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Padding (no root assigned) +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Padding (no root assigned) +.word
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Padding (no root assigned) +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Padding (no root assigned) +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Padding (no root assigned) +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Padding (no root assigned) +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Padding (no root assigned) +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Padding (no root assigned) +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Padding (no root assigned) +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Padding (no root assigned) +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Padding (no root assigned) +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Padding (no root assigned) +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Padding (no root assigned) +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Padding (no root assigned) +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Padding (no root assigned) +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19
+.word 0 // Padding (no root assigned) +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Padding (no root assigned) +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Padding (no root assigned) +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Padding (no root assigned) +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Padding (no root assigned) +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Padding (no root assigned) +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Padding (no root assigned) +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Padding (no root assigned) +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Padding (no root assigned) +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Padding (no root assigned) +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Padding (no root assigned) +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Padding (no root assigned) +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Padding (no root assigned) +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Padding (no root assigned) +.text +.global
ntt_u32_incomplete_neon_asm_var_4_2_3_z4_2 +.global _ntt_u32_incomplete_neon_asm_var_4_2_3_z4_2 +ntt_u32_incomplete_neon_asm_var_4_2_3_z4_2: +_ntt_u32_incomplete_neon_asm_var_4_2_3_z4_2: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +sqrdmulh v2.4S, v22.4S, v29.s[0] +ldr q1, [x0, #544] +mul v22.4S, v22.4S,v30.s[0] +ldr q0, [x0, #608] +sqrdmulh v15.4S, v21.4S, v29.s[0] +ldr q14, [x0, #672] +mul v21.4S, v21.4S,v30.s[0] +ldr q13, [x0, #736] +mla v22.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q12, [x0, #32] +sub v11.4s, v18.4s, v22.4s +mla v21.4S, v15.4S, v31.s[0] +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q15, [x0, #96] +sub v10.4s, v17.4s, v21.4s +mla v20.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v29.s[0] +ldr q2, [x0, #160] +mul v1.4S, v1.4S,v30.s[0] +sub v9.4s, v16.4s, v20.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v29.s[0] +ldr q22, [x0, #224] +mul v0.4S, v0.4S,v30.s[0] +sub v8.4s, v3.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v12.4s, v1.4s +mla v0.4S, v20.4S, 
v31.s[0] +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +sub v20.4s, v15.4s, v0.4s +mla v14.4S, v19.4S, v31.s[0] +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v19.4s, v2.4s, v14.4s +mla v13.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v1.4s, v22.4s, v13.4s +mla v16.4S, v0.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v0.4s, v2.4s, v16.4s +mla v3.4S, v14.4S, v31.s[0] +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v22.4s, v3.4s +mla v18.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v13.4s, v12.4s, v18.4s +mla v17.4S, v16.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v16.4s, v15.4s, v17.4s +mla v9.4S, v3.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v3.4s, v19.4s, v9.4s +mla v8.4S, v18.4S, v31.s[0] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v8.4s +mla v11.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v17.4s, v21.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +sub v9.4s, v20.4s, v10.4s +mla v2.4S, v8.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v8.4s, v12.4s, v2.4s +mla v22.4S, v11.4S, v31.s[0] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v27.s[1] +mul v14.4S, v14.4S,v28.s[1] +sub v11.4s, v15.4s, v22.4s +mla v0.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v28.s[2] +sub 
v10.4s, v13.4s, v0.4s +mla v14.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v2.4s, v16.4s, v14.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +sub v22.4s, v21.4s, v19.4s +mla v1.4S, v0.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v0.4s, v20.4s, v1.4s +mla v3.4S, v14.4S, v31.s[0] +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +sub v14.4s, v17.4s, v3.4s +mla v18.4S, v19.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v25.s[1] +mul v11.4S, v11.4S,v26.s[1] +sub v19.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v1.4s, v12.4s, v15.4s +mla v11.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v25.s[3] +mul v2.4S, v2.4S,v26.s[3] +sub v3.4s, v8.4s, v11.4s +mla v16.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v11.4s +str q12, [x0, #32] +sqrdmulh v12.4S, v20.4S, v23.s[0] +str q1, [x0, #96] +mul v20.4S, v20.4S,v24.s[0] +ldr q1, [x0, #816] +sub v11.4s, v13.4s, v16.4s +ldr q18, [x0, #880] +mla v2.4S, v15.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +str q8, [x0, #160] +sqrdmulh v8.4S, v0.4S, v23.s[1] +str q3, [x0, #224] +mul v0.4S, v0.4S,v24.s[1] +ldr q3, [x0, #944] +sub v16.4s, v10.4s, v2.4s +ldr q15, [x0, #1008] +mla v20.4S, v12.4S, v31.s[0] +add v10.4s, v10.4s, v2.4s +str q13, [x0, #288] +sqrdmulh v13.4S, v9.4S, v23.s[2] +str q11, [x0, #352] +mul v9.4S, v9.4S,v24.s[2] +ldr q11, [x0, #304] +sub v2.4s, v21.4s, v20.4s +ldr q12, [x0, #368] +mla v0.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v20.4s +str q10, [x0, #416] +sqrdmulh v10.4S, v19.4S, v23.s[3] +str q16, [x0, #480] +mul v19.4S, v19.4S,v24.s[3] +ldr q16, [x0, #432] +sub v20.4s, v22.4s, v0.4s +ldr q8, [x0, #496] +mla v9.4S, v13.4S, v31.s[0] +add v22.4s, 
v22.4s, v0.4s +str q21, [x0, #544] +sqrdmulh v21.4S, v1.4S, v29.s[0] +str q2, [x0, #608] +ldr q2, [x0, #560] +mul v1.4S, v1.4S,v30.s[0] +ldr q0, [x0, #624] +sub v13.4s, v17.4s, v9.4s +mla v19.4S, v10.4S, v31.s[0] +add v17.4s, v17.4s, v9.4s +str q22, [x0, #672] +sqrdmulh v22.4S, v18.4S, v29.s[0] +str q20, [x0, #736] +ldr q20, [x0, #688] +mul v18.4S, v18.4S,v30.s[0] +ldr q9, [x0, #752] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +str q17, [x0, #800] +sqrdmulh v17.4S, v3.4S, v29.s[0] +str q13, [x0, #864] +mul v3.4S, v3.4S,v30.s[0] +ldr q13, [x0, #48] +sub v19.4s, v11.4s, v1.4s +mla v18.4S, v22.4S, v31.s[0] +add v11.4s, v11.4s, v1.4s +str q14, [x0, #928] +sqrdmulh v14.4S, v15.4S, v29.s[0] +str q10, [x0, #992] +mul v15.4S, v15.4S,v30.s[0] +ldr q10, [x0, #112] +sub v1.4s, v12.4s, v18.4s +mla v3.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v29.s[0] +ldr q17, [x0, #176] +mul v2.4S, v2.4S,v30.s[0] +sub v22.4s, v16.4s, v3.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v29.s[0] +ldr q14, [x0, #240] +mul v0.4S, v0.4S,v30.s[0] +sub v21.4s, v8.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v18.4s, v13.4s, v2.4s +mla v0.4S, v3.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +sub v3.4s, v10.4s, v0.4s +mla v20.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v15.4s, v17.4s, v20.4s +mla v9.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v2.4s, v14.4s, v9.4s +mla v16.4S, v0.4S, v31.s[0] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v29.s[1] +mul v11.4S, v11.4S,v30.s[1] +sub v0.4s, v17.4s, v16.4s +mla v8.4S, v20.4S, v31.s[0] +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v29.s[1] +mul v12.4S, 
v12.4S,v30.s[1] +sub v20.4s, v14.4s, v8.4s +mla v11.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v9.4s, v13.4s, v11.4s +mla v12.4S, v16.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v16.4s, v10.4s, v12.4s +mla v22.4S, v8.4S, v31.s[0] +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v8.4s, v15.4s, v22.4s +mla v21.4S, v11.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +sub v11.4s, v2.4s, v21.4s +mla v19.4S, v12.4S, v31.s[0] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +sub v12.4s, v18.4s, v19.4s +mla v1.4S, v22.4S, v31.s[0] +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v22.4s, v3.4s, v1.4s +mla v17.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v21.4s, v13.4s, v17.4s +mla v14.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +sub v19.4s, v10.4s, v14.4s +mla v0.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v27.s[2] +mul v15.4S, v15.4S,v28.s[2] +sub v1.4s, v9.4s, v0.4s +mla v20.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v17.4s, v16.4s, v20.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v14.4s, v18.4s, v15.4s +mla v2.4S, v0.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[3] +mul v11.4S, v11.4S,v28.s[3] +sub v0.4s, v3.4s, v2.4s +mla v8.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +sub v20.4s, v12.4s, v8.4s +mla v11.4S, v15.4S, v31.s[0] +add v12.4s, v12.4s, 
v8.4s +sqrdmulh v8.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v15.4s, v22.4s, v11.4s +mla v10.4S, v2.4S, v31.s[0] +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v2.4s, v13.4s, v10.4s +mla v19.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v25.s[3] +mul v17.4S, v17.4S,v26.s[3] +sub v8.4s, v21.4s, v19.4s +mla v16.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +str q13, [x0, #48] +sqrdmulh v13.4S, v3.4S, v23.s[0] +str q2, [x0, #112] +mul v3.4S, v3.4S,v24.s[0] +ldr q2, [x0, #768] +sub v19.4s, v9.4s, v16.4s +ldr q11, [x0, #832] +mla v17.4S, v10.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +str q21, [x0, #176] +sqrdmulh v21.4S, v0.4S, v23.s[1] +str q8, [x0, #240] +mul v0.4S, v0.4S,v24.s[1] +ldr q8, [x0, #896] +sub v16.4s, v1.4s, v17.4s +ldr q10, [x0, #960] +mla v3.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +str q9, [x0, #304] +sqrdmulh v9.4S, v22.4S, v23.s[2] +str q19, [x0, #368] +mul v22.4S, v22.4S,v24.s[2] +ldr q19, [x0, #256] +sub v17.4s, v18.4s, v3.4s +ldr q13, [x0, #320] +mla v0.4S, v21.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +str q1, [x0, #432] +sqrdmulh v1.4S, v15.4S, v23.s[3] +str q16, [x0, #496] +mul v15.4S, v15.4S,v24.s[3] +ldr q16, [x0, #384] +sub v3.4s, v14.4s, v0.4s +ldr q21, [x0, #448] +mla v22.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +str q18, [x0, #560] +sqrdmulh v18.4S, v2.4S, v29.s[0] +str q17, [x0, #624] +ldr q17, [x0, #512] +mul v2.4S, v2.4S,v30.s[0] +ldr q0, [x0, #576] +sub v9.4s, v12.4s, v22.4s +mla v15.4S, v1.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s +str q14, [x0, #688] +sqrdmulh v14.4S, v11.4S, v29.s[0] +str q3, [x0, #752] +ldr q3, [x0, #640] +mul v11.4S, v11.4S,v30.s[0] +ldr q22, [x0, #704] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +str q12, [x0, #816] +sqrdmulh v12.4S, v8.4S, v29.s[0] +str q9, [x0, #880] +mul v8.4S, v8.4S,v30.s[0] +ldr q9, [x0, #0] +sub v15.4s, v19.4s, v2.4s +mla v11.4S, v14.4S, v31.s[0] +add 
v19.4s, v19.4s, v2.4s +str q20, [x0, #944] +sqrdmulh v20.4S, v10.4S, v29.s[0] +str q1, [x0, #1008] +mul v10.4S, v10.4S,v30.s[0] +ldr q1, [x0, #64] +sub v2.4s, v13.4s, v11.4s +mla v8.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v29.s[0] +ldr q12, [x0, #128] +mul v17.4S, v17.4S,v30.s[0] +sub v14.4s, v16.4s, v8.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v29.s[0] +ldr q20, [x0, #192] +mul v0.4S, v0.4S,v30.s[0] +sub v18.4s, v21.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +sub v11.4s, v9.4s, v17.4s +mla v0.4S, v8.4S, v31.s[0] +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v8.4s, v1.4s, v0.4s +mla v3.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v10.4s, v12.4s, v3.4s +mla v22.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v17.4s, v20.4s, v22.4s +mla v16.4S, v0.4S, v31.s[0] +add v20.4s, v20.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +sub v0.4s, v12.4s, v16.4s +mla v21.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v3.4s, v20.4s, v21.4s +mla v19.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v22.4s, v9.4s, v19.4s +mla v13.4S, v16.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v16.4s, v1.4s, v13.4s +mla v14.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v21.4s, v10.4s, v14.4s +mla v18.4S, v19.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v19.4s, v17.4s, v18.4s +mla v15.4S, v13.4S, 
v31.s[0] +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v13.4s, v11.4s, v15.4s +mla v2.4S, v14.4S, v31.s[0] +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v27.s[0] +mul v20.4S, v20.4S,v28.s[0] +sub v14.4s, v8.4s, v2.4s +mla v12.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v18.4s, v9.4s, v12.4s +mla v20.4S, v15.4S, v31.s[0] +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +sub v15.4s, v1.4s, v20.4s +mla v0.4S, v2.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +sub v2.4s, v22.4s, v0.4s +mla v3.4S, v12.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v12.4s, v16.4s, v3.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v11.4s, v10.4s +mla v17.4S, v0.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v27.s[3] +mul v19.4S, v19.4S,v28.s[3] +sub v0.4s, v8.4s, v17.4s +mla v21.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v17.4s +sqrdmulh v17.4S, v1.4S, v25.s[0] +mul v1.4S, v1.4S,v26.s[0] +sub v3.4s, v13.4s, v21.4s +mla v19.4S, v10.4S, v31.s[0] +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v25.s[1] +mul v15.4S, v15.4S,v26.s[1] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v17.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v17.4s, v9.4s, v1.4s +mla v15.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +sub v21.4s, v18.4s, v15.4s +mla v16.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +str q9, [x0, #0] +sqrdmulh v9.4S, v8.4S, v23.s[0] +str q17, [x0, #64] +mul v8.4S, v8.4S,v24.s[0] +ldr q17, [x0, #784] +sub v15.4s, v22.4s, v16.4s +ldr q19, [x0, #848] +mla v12.4S, v1.4S, v31.s[0] +add 
v22.4s, v22.4s, v16.4s +str q18, [x0, #128] +sqrdmulh v18.4S, v0.4S, v23.s[1] +str q21, [x0, #192] +mul v0.4S, v0.4S,v24.s[1] +ldr q21, [x0, #912] +sub v16.4s, v2.4s, v12.4s +ldr q1, [x0, #976] +mla v8.4S, v9.4S, v31.s[0] +add v2.4s, v2.4s, v12.4s +str q22, [x0, #256] +sqrdmulh v22.4S, v14.4S, v23.s[2] +str q15, [x0, #320] +mul v14.4S, v14.4S,v24.s[2] +ldr q15, [x0, #272] +sub v12.4s, v11.4s, v8.4s +ldr q9, [x0, #336] +mla v0.4S, v18.4S, v31.s[0] +add v11.4s, v11.4s, v8.4s +str q2, [x0, #384] +sqrdmulh v2.4S, v10.4S, v23.s[3] +str q16, [x0, #448] +mul v10.4S, v10.4S,v24.s[3] +ldr q16, [x0, #400] +sub v8.4s, v20.4s, v0.4s +ldr q18, [x0, #464] +mla v14.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v0.4s +str q11, [x0, #512] +sqrdmulh v11.4S, v17.4S, v29.s[0] +str q12, [x0, #576] +ldr q12, [x0, #528] +mul v17.4S, v17.4S,v30.s[0] +ldr q0, [x0, #592] +sub v22.4s, v13.4s, v14.4s +mla v10.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v14.4s +str q20, [x0, #640] +sqrdmulh v20.4S, v19.4S, v29.s[0] +str q8, [x0, #704] +ldr q8, [x0, #656] +mul v19.4S, v19.4S,v30.s[0] +ldr q14, [x0, #720] +sub v2.4s, v3.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v3.4s, v3.4s, v10.4s +str q13, [x0, #768] +sqrdmulh v13.4S, v21.4S, v29.s[0] +str q22, [x0, #832] +mul v21.4S, v21.4S,v30.s[0] +ldr q22, [x0, #16] +sub v10.4s, v15.4s, v17.4s +mla v19.4S, v20.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +str q3, [x0, #896] +sqrdmulh v3.4S, v1.4S, v29.s[0] +str q2, [x0, #960] +mul v1.4S, v1.4S,v30.s[0] +ldr q2, [x0, #80] +sub v17.4s, v9.4s, v19.4s +mla v21.4S, v13.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v12.4S, v29.s[0] +ldr q13, [x0, #144] +mul v12.4S, v12.4S,v30.s[0] +sub v20.4s, v16.4s, v21.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v29.s[0] +ldr q3, [x0, #208] +mul v0.4S, v0.4S,v30.s[0] +sub v11.4s, v18.4s, v1.4s +mla v12.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v19.4s, v22.4s, 
v12.4s +mla v0.4S, v21.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v2.4s, v0.4s +mla v8.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v1.4s, v13.4s, v8.4s +mla v14.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v12.4s, v3.4s, v14.4s +mla v16.4S, v0.4S, v31.s[0] +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +sub v0.4s, v13.4s, v16.4s +mla v18.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v9.4S, v29.s[1] +mul v9.4S, v9.4S,v30.s[1] +sub v8.4s, v3.4s, v18.4s +mla v15.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v14.4s, v22.4s, v15.4s +mla v9.4S, v16.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v16.4s, v2.4s, v9.4s +mla v20.4S, v18.4S, v31.s[0] +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v20.4s +mla v11.4S, v15.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v15.4s, v12.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +sub v9.4s, v19.4s, v10.4s +mla v17.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v27.s[0] +mul v3.4S, v3.4S,v28.s[0] +sub v20.4s, v21.4s, v17.4s +mla v13.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v11.4s, v22.4s, v13.4s +mla v3.4S, v10.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v10.4s, v2.4s, v3.4s +mla v0.4S, v17.4S, v31.s[0] +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v1.4S, v27.s[2] +mul 
v1.4S, v1.4S,v28.s[2] +sub v17.4s, v14.4s, v0.4s +mla v8.4S, v13.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +sub v13.4s, v16.4s, v8.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v3.4s, v19.4s, v1.4s +mla v12.4S, v0.4S, v31.s[0] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v0.4s, v21.4s, v12.4s +mla v18.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v25.s[1] +mul v10.4S, v10.4S,v26.s[1] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v12.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v12.4s, v22.4s, v2.4s +mla v10.4S, v18.4S, v31.s[0] +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v13.4S, v25.s[3] +mul v13.4S, v13.4S,v26.s[3] +sub v18.4s, v11.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +str q22, [x0, #16] +sqrdmulh v22.4S, v21.4S, v23.s[0] +str q12, [x0, #80] +mul v21.4S, v21.4S,v24.s[0] +sub v12.4s, v14.4s, v16.4s +mla v13.4S, v2.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +str q11, [x0, #144] +sqrdmulh v11.4S, v0.4S, v23.s[1] +str q18, [x0, #208] +mul v0.4S, v0.4S,v24.s[1] +sub v18.4s, v17.4s, v13.4s +mla v21.4S, v22.4S, v31.s[0] +add v17.4s, v17.4s, v13.4s +str q14, [x0, #272] +sqrdmulh v14.4S, v20.4S, v23.s[2] +str q12, [x0, #336] +mul v20.4S, v20.4S,v24.s[2] +sub v12.4s, v19.4s, v21.4s +mla v0.4S, v11.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +str q17, [x0, #400] +sqrdmulh v17.4S, v1.4S, v23.s[3] +str q18, [x0, #464] +mul v1.4S, v1.4S,v24.s[3] +sub v18.4s, v3.4s, v0.4s +mla v20.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v0.4s +str q19, [x0, #528] +str q12, [x0, #592] +sub v12.4s, v9.4s, v20.4s +mla v1.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, 
v20.4s +str q3, [x0, #656] +str q18, [x0, #720] +sub v18.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +str q9, [x0, #784] +str q12, [x0, #848] +str q8, [x0, #912] +str q18, [x0, #976] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q6, [x17, #+160] +ldr q7, [x17, #+176] +ldr q15, [x17, #+192] +ldr q10, [x17, #+208] +ldr q2, [x17, #+224] +ldr q16, [x17, #+240] +ldr q22, [x0, #32] +ldr q13, [x0, #48] +ldr q11, [x0, #0] +ldr q21, [x0, #96] +ldr q14, [x0, #112] +ldr q0, [x0, #64] +ldr q19, [x0, #160] +ldr q17, [x0, #176] +ldr q20, [x0, #128] +ldr q3, [x0, #224] +ldr q1, [x0, #240] +ldr q9, [x0, #192] +sqrdmulh v12.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v12.4S, v31.s[0] +sub v12.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +ldr q22, [x0, #16] +sqrdmulh v8.4S, v21.4S, v7.s[0] +mul v21.4S, v21.4S,v6.s[0] +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +ldr q21, [x0, #80] +sqrdmulh v18.4S, v19.4S, v10.s[0] +mul v19.4S, v19.4S,v15.s[0] +mla v19.4S, v18.4S, v31.s[0] +sub v18.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +ldr q19, [x0, #144] +sqrdmulh v30.4S, v3.4S, v16.s[0] +mul v3.4S, v3.4S,v2.s[0] +mla v3.4S, v30.4S, v31.s[0] +sub v30.4s, v9.4s, v3.4s +add v9.4s, v9.4s, v3.4s +ldr q3, [x0, #208] +sqrdmulh v29.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v29.4S, v31.s[0] +sub v29.4s, v22.4s, v13.4s +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v14.4S, v7.s[0] +mul v14.4S, v14.4S,v6.s[0] +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v17.4S, v10.s[0] +mul v17.4S, v17.4S,v15.s[0] +mla v17.4S, v14.4S, v31.s[0] +sub v14.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +sqrdmulh v17.4S, v1.4S, v16.s[0] +mul v1.4S, v1.4S,v2.s[0] +mla v1.4S, v17.4S, v31.s[0] +sub v17.4s, v3.4s, v1.4s +add v3.4s, v3.4s, v1.4s +sqrdmulh v1.4S, v22.4S, v5.s[1] +mul v22.4S, v22.4S,v4.s[1] +mla v22.4S, v1.4S, v31.s[0] +sub v1.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s 
+sqrdmulh v22.4S, v21.4S, v7.s[1] +mul v21.4S, v21.4S,v6.s[1] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +str q11, [x0, #0] +str q1, [x0, #16] +sqrdmulh v1.4S, v19.4S, v10.s[1] +mul v19.4S, v19.4S,v15.s[1] +mla v19.4S, v1.4S, v31.s[0] +sub v1.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +str q0, [x0, #64] +str q22, [x0, #80] +sqrdmulh v22.4S, v3.4S, v16.s[1] +mul v3.4S, v3.4S,v2.s[1] +mla v3.4S, v22.4S, v31.s[0] +sub v22.4s, v9.4s, v3.4s +add v9.4s, v9.4s, v3.4s +str q20, [x0, #128] +str q1, [x0, #144] +sqrdmulh v1.4S, v29.4S, v5.s[2] +mul v29.4S, v29.4S,v4.s[2] +mla v29.4S, v1.4S, v31.s[0] +sub v1.4s, v12.4s, v29.4s +add v12.4s, v12.4s, v29.4s +str q9, [x0, #192] +str q22, [x0, #208] +ldr q5, [x17, #+256] +ldr q4, [x17, #+272] +sqrdmulh v22.4S, v13.4S, v7.s[2] +mul v13.4S, v13.4S,v6.s[2] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +ldr q7, [x17, #+288] +ldr q6, [x17, #+304] +sqrdmulh v13.4S, v14.4S, v10.s[2] +mul v14.4S, v14.4S,v15.s[2] +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v18.4s, v14.4s +add v18.4s, v18.4s, v14.4s +ldr q10, [x17, #+320] +ldr q15, [x17, #+336] +sqrdmulh v14.4S, v17.4S, v16.s[2] +mul v17.4S, v17.4S,v2.s[2] +mla v17.4S, v14.4S, v31.s[0] +sub v14.4s, v30.4s, v17.4s +add v30.4s, v30.4s, v17.4s +ldr q16, [x17, #+352] +ldr q2, [x17, #+368] +str q12, [x0, #32] +str q1, [x0, #48] +str q8, [x0, #96] +str q22, [x0, #112] +str q18, [x0, #160] +str q13, [x0, #176] +str q30, [x0, #224] +str q14, [x0, #240] +ldr q14, [x0, #288] +ldr q30, [x0, #304] +ldr q13, [x0, #256] +ldr q18, [x0, #352] +ldr q22, [x0, #368] +ldr q8, [x0, #320] +ldr q1, [x0, #416] +ldr q12, [x0, #432] +ldr q17, [x0, #384] +ldr q9, [x0, #480] +ldr q29, [x0, #496] +ldr q20, [x0, #448] +sqrdmulh v3.4S, v14.4S, v4.s[0] +mul v14.4S, v14.4S,v5.s[0] +mla v14.4S, v3.4S, v31.s[0] +sub v3.4s, v13.4s, v14.4s +add v13.4s, v13.4s, v14.4s +ldr q14, [x0, #272] +sqrdmulh v0.4S, v18.4S, v6.s[0] +mul v18.4S, v18.4S,v7.s[0] 
+mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v8.4s, v18.4s +add v8.4s, v8.4s, v18.4s +ldr q18, [x0, #336] +sqrdmulh v19.4S, v1.4S, v15.s[0] +mul v1.4S, v1.4S,v10.s[0] +mla v1.4S, v19.4S, v31.s[0] +sub v19.4s, v17.4s, v1.4s +add v17.4s, v17.4s, v1.4s +ldr q1, [x0, #400] +sqrdmulh v11.4S, v9.4S, v2.s[0] +mul v9.4S, v9.4S,v16.s[0] +mla v9.4S, v11.4S, v31.s[0] +sub v11.4s, v20.4s, v9.4s +add v20.4s, v20.4s, v9.4s +ldr q9, [x0, #464] +sqrdmulh v21.4S, v30.4S, v4.s[0] +mul v30.4S, v30.4S,v5.s[0] +mla v30.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v30.4s +add v14.4s, v14.4s, v30.4s +sqrdmulh v30.4S, v22.4S, v6.s[0] +mul v22.4S, v22.4S,v7.s[0] +mla v22.4S, v30.4S, v31.s[0] +sub v30.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v12.4S, v15.s[0] +mul v12.4S, v12.4S,v10.s[0] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v12.4s +add v1.4s, v1.4s, v12.4s +sqrdmulh v12.4S, v29.4S, v2.s[0] +mul v29.4S, v29.4S,v16.s[0] +mla v29.4S, v12.4S, v31.s[0] +sub v12.4s, v9.4s, v29.4s +add v9.4s, v9.4s, v29.4s +sqrdmulh v29.4S, v14.4S, v4.s[1] +mul v14.4S, v14.4S,v5.s[1] +mla v14.4S, v29.4S, v31.s[0] +sub v29.4s, v13.4s, v14.4s +add v13.4s, v13.4s, v14.4s +sqrdmulh v14.4S, v18.4S, v6.s[1] +mul v18.4S, v18.4S,v7.s[1] +mla v18.4S, v14.4S, v31.s[0] +sub v14.4s, v8.4s, v18.4s +add v8.4s, v8.4s, v18.4s +str q13, [x0, #256] +str q29, [x0, #272] +sqrdmulh v29.4S, v1.4S, v15.s[1] +mul v1.4S, v1.4S,v10.s[1] +mla v1.4S, v29.4S, v31.s[0] +sub v29.4s, v17.4s, v1.4s +add v17.4s, v17.4s, v1.4s +str q8, [x0, #320] +str q14, [x0, #336] +sqrdmulh v14.4S, v9.4S, v2.s[1] +mul v9.4S, v9.4S,v16.s[1] +mla v9.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v9.4s +add v20.4s, v20.4s, v9.4s +str q17, [x0, #384] +str q29, [x0, #400] +sqrdmulh v29.4S, v21.4S, v4.s[2] +mul v21.4S, v21.4S,v5.s[2] +mla v21.4S, v29.4S, v31.s[0] +sub v29.4s, v3.4s, v21.4s +add v3.4s, v3.4s, v21.4s +str q20, [x0, #448] +str q14, [x0, #464] +ldr q4, [x17, #+384] +ldr q5, [x17, #+400] +sqrdmulh v14.4S, v30.4S, v6.s[2] +mul 
v30.4S, v30.4S,v7.s[2] +mla v30.4S, v14.4S, v31.s[0] +sub v14.4s, v0.4s, v30.4s +add v0.4s, v0.4s, v30.4s +ldr q6, [x17, #+416] +ldr q7, [x17, #+432] +sqrdmulh v30.4S, v22.4S, v15.s[2] +mul v22.4S, v22.4S,v10.s[2] +mla v22.4S, v30.4S, v31.s[0] +sub v30.4s, v19.4s, v22.4s +add v19.4s, v19.4s, v22.4s +ldr q15, [x17, #+448] +ldr q10, [x17, #+464] +sqrdmulh v22.4S, v12.4S, v2.s[2] +mul v12.4S, v12.4S,v16.s[2] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +ldr q2, [x17, #+480] +ldr q16, [x17, #+496] +str q3, [x0, #288] +str q29, [x0, #304] +str q0, [x0, #352] +str q14, [x0, #368] +str q19, [x0, #416] +str q30, [x0, #432] +str q11, [x0, #480] +str q22, [x0, #496] +ldr q22, [x0, #544] +ldr q11, [x0, #560] +ldr q30, [x0, #512] +ldr q19, [x0, #608] +ldr q14, [x0, #624] +ldr q0, [x0, #576] +ldr q29, [x0, #672] +ldr q3, [x0, #688] +ldr q12, [x0, #640] +ldr q20, [x0, #736] +ldr q21, [x0, #752] +ldr q17, [x0, #704] +sqrdmulh v9.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v9.4S, v31.s[0] +sub v9.4s, v30.4s, v22.4s +add v30.4s, v30.4s, v22.4s +ldr q22, [x0, #528] +sqrdmulh v8.4S, v19.4S, v7.s[0] +mul v19.4S, v19.4S,v6.s[0] +mla v19.4S, v8.4S, v31.s[0] +sub v8.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +ldr q19, [x0, #592] +sqrdmulh v1.4S, v29.4S, v10.s[0] +mul v29.4S, v29.4S,v15.s[0] +mla v29.4S, v1.4S, v31.s[0] +sub v1.4s, v12.4s, v29.4s +add v12.4s, v12.4s, v29.4s +ldr q29, [x0, #656] +sqrdmulh v13.4S, v20.4S, v16.s[0] +mul v20.4S, v20.4S,v2.s[0] +mla v20.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v20.4s +add v17.4s, v17.4s, v20.4s +ldr q20, [x0, #720] +sqrdmulh v18.4S, v11.4S, v5.s[0] +mul v11.4S, v11.4S,v4.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v14.4S, v7.s[0] +mul v14.4S, v14.4S,v6.s[0] +mla v14.4S, v11.4S, v31.s[0] +sub v11.4s, v19.4s, v14.4s +add v19.4s, v19.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v10.s[0] +mul v3.4S, v3.4S,v15.s[0] +mla v3.4S, v14.4S, 
v31.s[0] +sub v14.4s, v29.4s, v3.4s +add v29.4s, v29.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v16.s[0] +mul v21.4S, v21.4S,v2.s[0] +mla v21.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v5.s[1] +mul v22.4S, v22.4S,v4.s[1] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v30.4s, v22.4s +add v30.4s, v30.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v7.s[1] +mul v19.4S, v19.4S,v6.s[1] +mla v19.4S, v22.4S, v31.s[0] +sub v22.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +str q30, [x0, #512] +str q21, [x0, #528] +sqrdmulh v21.4S, v29.4S, v10.s[1] +mul v29.4S, v29.4S,v15.s[1] +mla v29.4S, v21.4S, v31.s[0] +sub v21.4s, v12.4s, v29.4s +add v12.4s, v12.4s, v29.4s +str q0, [x0, #576] +str q22, [x0, #592] +sqrdmulh v22.4S, v20.4S, v16.s[1] +mul v20.4S, v20.4S,v2.s[1] +mla v20.4S, v22.4S, v31.s[0] +sub v22.4s, v17.4s, v20.4s +add v17.4s, v17.4s, v20.4s +str q12, [x0, #640] +str q21, [x0, #656] +sqrdmulh v21.4S, v18.4S, v5.s[2] +mul v18.4S, v18.4S,v4.s[2] +mla v18.4S, v21.4S, v31.s[0] +sub v21.4s, v9.4s, v18.4s +add v9.4s, v9.4s, v18.4s +str q17, [x0, #704] +str q22, [x0, #720] +ldr q5, [x17, #+512] +ldr q4, [x17, #+528] +sqrdmulh v22.4S, v11.4S, v7.s[2] +mul v11.4S, v11.4S,v6.s[2] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +ldr q7, [x17, #+544] +ldr q6, [x17, #+560] +sqrdmulh v11.4S, v14.4S, v10.s[2] +mul v14.4S, v14.4S,v15.s[2] +mla v14.4S, v11.4S, v31.s[0] +sub v11.4s, v1.4s, v14.4s +add v1.4s, v1.4s, v14.4s +ldr q10, [x17, #+576] +ldr q15, [x17, #+592] +sqrdmulh v14.4S, v3.4S, v16.s[2] +mul v3.4S, v3.4S,v2.s[2] +mla v3.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v3.4s +add v13.4s, v13.4s, v3.4s +ldr q16, [x17, #+608] +ldr q2, [x17, #+624] +str q9, [x0, #544] +str q21, [x0, #560] +str q8, [x0, #608] +str q22, [x0, #624] +str q1, [x0, #672] +str q11, [x0, #688] +str q13, [x0, #736] +str q14, [x0, #752] +ldr q14, [x0, #800] +ldr q13, [x0, #816] +ldr q11, [x0, #768] +ldr q1, [x0, #864] +ldr q22, [x0, #880] 
+ldr q8, [x0, #832] +ldr q21, [x0, #928] +ldr q9, [x0, #944] +ldr q3, [x0, #896] +ldr q17, [x0, #992] +ldr q18, [x0, #1008] +ldr q12, [x0, #960] +sqrdmulh v20.4S, v14.4S, v4.s[0] +mul v14.4S, v14.4S,v5.s[0] +mla v14.4S, v20.4S, v31.s[0] +sub v20.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +ldr q14, [x0, #784] +sqrdmulh v0.4S, v1.4S, v6.s[0] +mul v1.4S, v1.4S,v7.s[0] +mla v1.4S, v0.4S, v31.s[0] +sub v0.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +ldr q1, [x0, #848] +sqrdmulh v29.4S, v21.4S, v15.s[0] +mul v21.4S, v21.4S,v10.s[0] +mla v21.4S, v29.4S, v31.s[0] +sub v29.4s, v3.4s, v21.4s +add v3.4s, v3.4s, v21.4s +ldr q21, [x0, #912] +sqrdmulh v30.4S, v17.4S, v2.s[0] +mul v17.4S, v17.4S,v16.s[0] +mla v17.4S, v30.4S, v31.s[0] +sub v30.4s, v12.4s, v17.4s +add v12.4s, v12.4s, v17.4s +ldr q17, [x0, #976] +sqrdmulh v19.4S, v13.4S, v4.s[0] +mul v13.4S, v13.4S,v5.s[0] +mla v13.4S, v19.4S, v31.s[0] +sub v19.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +sqrdmulh v13.4S, v22.4S, v6.s[0] +mul v22.4S, v22.4S,v7.s[0] +mla v22.4S, v13.4S, v31.s[0] +sub v13.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v9.4S, v15.s[0] +mul v9.4S, v9.4S,v10.s[0] +mla v9.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v9.4s +add v21.4s, v21.4s, v9.4s +sqrdmulh v9.4S, v18.4S, v2.s[0] +mul v18.4S, v18.4S,v16.s[0] +mla v18.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v18.4s +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v14.4S, v4.s[1] +mul v14.4S, v14.4S,v5.s[1] +mla v14.4S, v18.4S, v31.s[0] +sub v18.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v1.4S, v6.s[1] +mul v1.4S, v1.4S,v7.s[1] +mla v1.4S, v14.4S, v31.s[0] +sub v14.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +str q11, [x0, #768] +str q18, [x0, #784] +sqrdmulh v18.4S, v21.4S, v15.s[1] +mul v21.4S, v21.4S,v10.s[1] +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v3.4s, v21.4s +add v3.4s, v3.4s, v21.4s +str q8, [x0, #832] +str q14, [x0, #848] +sqrdmulh v14.4S, v17.4S, v2.s[1] +mul v17.4S, v17.4S,v16.s[1] +mla v17.4S, v14.4S, 
v31.s[0] +sub v14.4s, v12.4s, v17.4s +add v12.4s, v12.4s, v17.4s +str q3, [x0, #896] +str q18, [x0, #912] +sqrdmulh v18.4S, v19.4S, v4.s[2] +mul v19.4S, v19.4S,v5.s[2] +mla v19.4S, v18.4S, v31.s[0] +sub v18.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +str q12, [x0, #960] +str q14, [x0, #976] +sqrdmulh v4.4S, v13.4S, v6.s[2] +mul v13.4S, v13.4S,v7.s[2] +mla v13.4S, v4.4S, v31.s[0] +sub v4.4s, v0.4s, v13.4s +add v0.4s, v0.4s, v13.4s +sqrdmulh v6.4S, v22.4S, v15.s[2] +mul v22.4S, v22.4S,v10.s[2] +mla v22.4S, v6.4S, v31.s[0] +sub v6.4s, v29.4s, v22.4s +add v29.4s, v29.4s, v22.4s +sqrdmulh v15.4S, v9.4S, v2.s[2] +mul v9.4S, v9.4S,v16.s[2] +mla v9.4S, v15.4S, v31.s[0] +sub v15.4s, v30.4s, v9.4s +add v30.4s, v30.4s, v9.4s +str q20, [x0, #800] +str q18, [x0, #816] +str q0, [x0, #864] +str q4, [x0, #880] +str q29, [x0, #928] +str q6, [x0, #944] +str q30, [x0, #992] +str q15, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_3.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_3.s new file mode 100644 index 0000000..ed2fb5d --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_3.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, 
modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, 
block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 
12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 
5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global ntt_u32_incomplete_neon_asm_var_4_2_3_z4_3 +.global _ntt_u32_incomplete_neon_asm_var_4_2_3_z4_3 +ntt_u32_incomplete_neon_asm_var_4_2_3_z4_3: +_ntt_u32_incomplete_neon_asm_var_4_2_3_z4_3: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +sqrdmulh v2.4S, v22.4S, v29.s[0] +ldr q1, [x0, #544] +mul v22.4S, v22.4S,v30.s[0] +ldr q0, [x0, #608] +sqrdmulh v15.4S, v21.4S, 
v29.s[0] +ldr q14, [x0, #672] +mul v21.4S, v21.4S,v30.s[0] +ldr q13, [x0, #736] +mla v22.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q12, [x0, #32] +sub v11.4s, v18.4s, v22.4s +mla v21.4S, v15.4S, v31.s[0] +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q15, [x0, #96] +sub v10.4s, v17.4s, v21.4s +mla v20.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v29.s[0] +ldr q2, [x0, #160] +mul v1.4S, v1.4S,v30.s[0] +sub v9.4s, v16.4s, v20.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v29.s[0] +ldr q22, [x0, #224] +mul v0.4S, v0.4S,v30.s[0] +sub v8.4s, v3.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v12.4s, v1.4s +mla v0.4S, v20.4S, v31.s[0] +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +sub v20.4s, v15.4s, v0.4s +mla v14.4S, v19.4S, v31.s[0] +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v19.4s, v2.4s, v14.4s +mla v13.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v1.4s, v22.4s, v13.4s +mla v16.4S, v0.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v0.4s, v2.4s, v16.4s +mla v3.4S, v14.4S, v31.s[0] +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v22.4s, v3.4s +mla v18.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v13.4s, v12.4s, v18.4s +mla v17.4S, v16.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v16.4s, v15.4s, v17.4s +mla v9.4S, v3.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v29.s[2] +mul v11.4S, 
v11.4S,v30.s[2] +sub v3.4s, v19.4s, v9.4s +mla v8.4S, v18.4S, v31.s[0] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v8.4s +mla v11.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v17.4s, v21.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +sub v9.4s, v20.4s, v10.4s +mla v2.4S, v8.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v8.4s, v12.4s, v2.4s +mla v22.4S, v11.4S, v31.s[0] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v27.s[1] +mul v14.4S, v14.4S,v28.s[1] +sub v11.4s, v15.4s, v22.4s +mla v0.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v28.s[2] +sub v10.4s, v13.4s, v0.4s +mla v14.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v2.4s, v16.4s, v14.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +sub v22.4s, v21.4s, v19.4s +mla v1.4S, v0.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v0.4s, v20.4s, v1.4s +mla v3.4S, v14.4S, v31.s[0] +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +sub v14.4s, v17.4s, v3.4s +mla v18.4S, v19.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v25.s[1] +mul v11.4S, v11.4S,v26.s[1] +sub v19.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v1.4s, v12.4s, v15.4s +mla v11.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v25.s[3] +mul v2.4S, v2.4S,v26.s[3] +sub v3.4s, v8.4s, v11.4s +mla v16.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v11.4s +str 
q12, [x0, #32] +sqrdmulh v12.4S, v20.4S, v23.s[0] +str q1, [x0, #96] +mul v20.4S, v20.4S,v24.s[0] +ldr q1, [x0, #816] +sub v11.4s, v13.4s, v16.4s +ldr q18, [x0, #880] +mla v2.4S, v15.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +str q8, [x0, #160] +sqrdmulh v8.4S, v0.4S, v23.s[1] +str q3, [x0, #224] +mul v0.4S, v0.4S,v24.s[1] +ldr q3, [x0, #944] +sub v16.4s, v10.4s, v2.4s +ldr q15, [x0, #1008] +mla v20.4S, v12.4S, v31.s[0] +add v10.4s, v10.4s, v2.4s +str q13, [x0, #288] +sqrdmulh v13.4S, v9.4S, v23.s[2] +str q11, [x0, #352] +mul v9.4S, v9.4S,v24.s[2] +ldr q11, [x0, #304] +sub v2.4s, v21.4s, v20.4s +ldr q12, [x0, #368] +mla v0.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v20.4s +str q10, [x0, #416] +sqrdmulh v10.4S, v19.4S, v23.s[3] +str q16, [x0, #480] +mul v19.4S, v19.4S,v24.s[3] +ldr q16, [x0, #432] +sub v20.4s, v22.4s, v0.4s +ldr q8, [x0, #496] +mla v9.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +str q21, [x0, #544] +sqrdmulh v21.4S, v1.4S, v29.s[0] +str q2, [x0, #608] +ldr q2, [x0, #560] +mul v1.4S, v1.4S,v30.s[0] +ldr q0, [x0, #624] +sub v13.4s, v17.4s, v9.4s +mla v19.4S, v10.4S, v31.s[0] +add v17.4s, v17.4s, v9.4s +str q22, [x0, #672] +sqrdmulh v22.4S, v18.4S, v29.s[0] +str q20, [x0, #736] +ldr q20, [x0, #688] +mul v18.4S, v18.4S,v30.s[0] +ldr q9, [x0, #752] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +str q17, [x0, #800] +sqrdmulh v17.4S, v3.4S, v29.s[0] +str q13, [x0, #864] +mul v3.4S, v3.4S,v30.s[0] +ldr q13, [x0, #48] +sub v19.4s, v11.4s, v1.4s +mla v18.4S, v22.4S, v31.s[0] +add v11.4s, v11.4s, v1.4s +str q14, [x0, #928] +sqrdmulh v14.4S, v15.4S, v29.s[0] +str q10, [x0, #992] +mul v15.4S, v15.4S,v30.s[0] +ldr q10, [x0, #112] +sub v1.4s, v12.4s, v18.4s +mla v3.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v29.s[0] +ldr q17, [x0, #176] +mul v2.4S, v2.4S,v30.s[0] +sub v22.4s, v16.4s, v3.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v29.s[0] +ldr q14, [x0, 
#240] +mul v0.4S, v0.4S,v30.s[0] +sub v21.4s, v8.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v18.4s, v13.4s, v2.4s +mla v0.4S, v3.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +sub v3.4s, v10.4s, v0.4s +mla v20.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v15.4s, v17.4s, v20.4s +mla v9.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v2.4s, v14.4s, v9.4s +mla v16.4S, v0.4S, v31.s[0] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v29.s[1] +mul v11.4S, v11.4S,v30.s[1] +sub v0.4s, v17.4s, v16.4s +mla v8.4S, v20.4S, v31.s[0] +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v29.s[1] +mul v12.4S, v12.4S,v30.s[1] +sub v20.4s, v14.4s, v8.4s +mla v11.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v9.4s, v13.4s, v11.4s +mla v12.4S, v16.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v16.4s, v10.4s, v12.4s +mla v22.4S, v8.4S, v31.s[0] +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v8.4s, v15.4s, v22.4s +mla v21.4S, v11.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +sub v11.4s, v2.4s, v21.4s +mla v19.4S, v12.4S, v31.s[0] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +sub v12.4s, v18.4s, v19.4s +mla v1.4S, v22.4S, v31.s[0] +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v22.4s, v3.4s, v1.4s +mla v17.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v21.4s, v13.4s, v17.4s +mla v14.4S, v19.4S, v31.s[0] +add 
v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +sub v19.4s, v10.4s, v14.4s +mla v0.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v27.s[2] +mul v15.4S, v15.4S,v28.s[2] +sub v1.4s, v9.4s, v0.4s +mla v20.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v17.4s, v16.4s, v20.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v14.4s, v18.4s, v15.4s +mla v2.4S, v0.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[3] +mul v11.4S, v11.4S,v28.s[3] +sub v0.4s, v3.4s, v2.4s +mla v8.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +sub v20.4s, v12.4s, v8.4s +mla v11.4S, v15.4S, v31.s[0] +add v12.4s, v12.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v15.4s, v22.4s, v11.4s +mla v10.4S, v2.4S, v31.s[0] +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v2.4s, v13.4s, v10.4s +mla v19.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v25.s[3] +mul v17.4S, v17.4S,v26.s[3] +sub v8.4s, v21.4s, v19.4s +mla v16.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +str q13, [x0, #48] +sqrdmulh v13.4S, v3.4S, v23.s[0] +str q2, [x0, #112] +mul v3.4S, v3.4S,v24.s[0] +ldr q2, [x0, #768] +sub v19.4s, v9.4s, v16.4s +ldr q11, [x0, #832] +mla v17.4S, v10.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +str q21, [x0, #176] +sqrdmulh v21.4S, v0.4S, v23.s[1] +str q8, [x0, #240] +mul v0.4S, v0.4S,v24.s[1] +ldr q8, [x0, #896] +sub v16.4s, v1.4s, v17.4s +ldr q10, [x0, #960] +mla v3.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +str q9, [x0, #304] +sqrdmulh v9.4S, v22.4S, v23.s[2] +str q19, [x0, #368] +mul v22.4S, v22.4S,v24.s[2] +ldr q19, [x0, #256] +sub v17.4s, v18.4s, v3.4s +ldr q13, [x0, #320] +mla v0.4S, v21.4S, v31.s[0] +add 
v18.4s, v18.4s, v3.4s +str q1, [x0, #432] +sqrdmulh v1.4S, v15.4S, v23.s[3] +str q16, [x0, #496] +mul v15.4S, v15.4S,v24.s[3] +ldr q16, [x0, #384] +sub v3.4s, v14.4s, v0.4s +ldr q21, [x0, #448] +mla v22.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +str q18, [x0, #560] +sqrdmulh v18.4S, v2.4S, v29.s[0] +str q17, [x0, #624] +ldr q17, [x0, #512] +mul v2.4S, v2.4S,v30.s[0] +ldr q0, [x0, #576] +sub v9.4s, v12.4s, v22.4s +mla v15.4S, v1.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s +str q14, [x0, #688] +sqrdmulh v14.4S, v11.4S, v29.s[0] +str q3, [x0, #752] +ldr q3, [x0, #640] +mul v11.4S, v11.4S,v30.s[0] +ldr q22, [x0, #704] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +str q12, [x0, #816] +sqrdmulh v12.4S, v8.4S, v29.s[0] +str q9, [x0, #880] +mul v8.4S, v8.4S,v30.s[0] +ldr q9, [x0, #0] +sub v15.4s, v19.4s, v2.4s +mla v11.4S, v14.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +str q20, [x0, #944] +sqrdmulh v20.4S, v10.4S, v29.s[0] +str q1, [x0, #1008] +mul v10.4S, v10.4S,v30.s[0] +ldr q1, [x0, #64] +sub v2.4s, v13.4s, v11.4s +mla v8.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v29.s[0] +ldr q12, [x0, #128] +mul v17.4S, v17.4S,v30.s[0] +sub v14.4s, v16.4s, v8.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v29.s[0] +ldr q20, [x0, #192] +mul v0.4S, v0.4S,v30.s[0] +sub v18.4s, v21.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +sub v11.4s, v9.4s, v17.4s +mla v0.4S, v8.4S, v31.s[0] +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v8.4s, v1.4s, v0.4s +mla v3.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v10.4s, v12.4s, v3.4s +mla v22.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v17.4s, v20.4s, v22.4s +mla v16.4S, v0.4S, 
v31.s[0] +add v20.4s, v20.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +sub v0.4s, v12.4s, v16.4s +mla v21.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v3.4s, v20.4s, v21.4s +mla v19.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v22.4s, v9.4s, v19.4s +mla v13.4S, v16.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v16.4s, v1.4s, v13.4s +mla v14.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v21.4s, v10.4s, v14.4s +mla v18.4S, v19.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v19.4s, v17.4s, v18.4s +mla v15.4S, v13.4S, v31.s[0] +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v13.4s, v11.4s, v15.4s +mla v2.4S, v14.4S, v31.s[0] +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v27.s[0] +mul v20.4S, v20.4S,v28.s[0] +sub v14.4s, v8.4s, v2.4s +mla v12.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v18.4s, v9.4s, v12.4s +mla v20.4S, v15.4S, v31.s[0] +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +sub v15.4s, v1.4s, v20.4s +mla v0.4S, v2.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +sub v2.4s, v22.4s, v0.4s +mla v3.4S, v12.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v12.4s, v16.4s, v3.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v11.4s, v10.4s +mla v17.4S, v0.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v27.s[3] +mul v19.4S, 
v19.4S,v28.s[3] +sub v0.4s, v8.4s, v17.4s +mla v21.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v17.4s +sqrdmulh v17.4S, v1.4S, v25.s[0] +mul v1.4S, v1.4S,v26.s[0] +sub v3.4s, v13.4s, v21.4s +mla v19.4S, v10.4S, v31.s[0] +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v25.s[1] +mul v15.4S, v15.4S,v26.s[1] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v17.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v17.4s, v9.4s, v1.4s +mla v15.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +sub v21.4s, v18.4s, v15.4s +mla v16.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +str q9, [x0, #0] +sqrdmulh v9.4S, v8.4S, v23.s[0] +str q17, [x0, #64] +mul v8.4S, v8.4S,v24.s[0] +ldr q17, [x0, #784] +sub v15.4s, v22.4s, v16.4s +ldr q19, [x0, #848] +mla v12.4S, v1.4S, v31.s[0] +add v22.4s, v22.4s, v16.4s +str q18, [x0, #128] +sqrdmulh v18.4S, v0.4S, v23.s[1] +str q21, [x0, #192] +mul v0.4S, v0.4S,v24.s[1] +ldr q21, [x0, #912] +sub v16.4s, v2.4s, v12.4s +ldr q1, [x0, #976] +mla v8.4S, v9.4S, v31.s[0] +add v2.4s, v2.4s, v12.4s +str q22, [x0, #256] +sqrdmulh v22.4S, v14.4S, v23.s[2] +str q15, [x0, #320] +mul v14.4S, v14.4S,v24.s[2] +ldr q15, [x0, #272] +sub v12.4s, v11.4s, v8.4s +ldr q9, [x0, #336] +mla v0.4S, v18.4S, v31.s[0] +add v11.4s, v11.4s, v8.4s +str q2, [x0, #384] +sqrdmulh v2.4S, v10.4S, v23.s[3] +str q16, [x0, #448] +mul v10.4S, v10.4S,v24.s[3] +ldr q16, [x0, #400] +sub v8.4s, v20.4s, v0.4s +ldr q18, [x0, #464] +mla v14.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v0.4s +str q11, [x0, #512] +sqrdmulh v11.4S, v17.4S, v29.s[0] +str q12, [x0, #576] +ldr q12, [x0, #528] +mul v17.4S, v17.4S,v30.s[0] +ldr q0, [x0, #592] +sub v22.4s, v13.4s, v14.4s +mla v10.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v14.4s +str q20, [x0, #640] +sqrdmulh v20.4S, v19.4S, v29.s[0] +str q8, [x0, #704] +ldr q8, [x0, #656] +mul v19.4S, v19.4S,v30.s[0] +ldr q14, [x0, #720] +sub v2.4s, v3.4s, v10.4s 
+mla v17.4S, v11.4S, v31.s[0] +add v3.4s, v3.4s, v10.4s +str q13, [x0, #768] +sqrdmulh v13.4S, v21.4S, v29.s[0] +str q22, [x0, #832] +mul v21.4S, v21.4S,v30.s[0] +ldr q22, [x0, #16] +sub v10.4s, v15.4s, v17.4s +mla v19.4S, v20.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +str q3, [x0, #896] +sqrdmulh v3.4S, v1.4S, v29.s[0] +str q2, [x0, #960] +mul v1.4S, v1.4S,v30.s[0] +ldr q2, [x0, #80] +sub v17.4s, v9.4s, v19.4s +mla v21.4S, v13.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v12.4S, v29.s[0] +ldr q13, [x0, #144] +mul v12.4S, v12.4S,v30.s[0] +sub v20.4s, v16.4s, v21.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v29.s[0] +ldr q3, [x0, #208] +mul v0.4S, v0.4S,v30.s[0] +sub v11.4s, v18.4s, v1.4s +mla v12.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v19.4s, v22.4s, v12.4s +mla v0.4S, v21.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v2.4s, v0.4s +mla v8.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v1.4s, v13.4s, v8.4s +mla v14.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v12.4s, v3.4s, v14.4s +mla v16.4S, v0.4S, v31.s[0] +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +sub v0.4s, v13.4s, v16.4s +mla v18.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v9.4S, v29.s[1] +mul v9.4S, v9.4S,v30.s[1] +sub v8.4s, v3.4s, v18.4s +mla v15.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v14.4s, v22.4s, v15.4s +mla v9.4S, v16.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v16.4s, v2.4s, v9.4s +mla v20.4S, v18.4S, v31.s[0] +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v10.4S, 
v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v20.4s +mla v11.4S, v15.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v15.4s, v12.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +sub v9.4s, v19.4s, v10.4s +mla v17.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v27.s[0] +mul v3.4S, v3.4S,v28.s[0] +sub v20.4s, v21.4s, v17.4s +mla v13.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v11.4s, v22.4s, v13.4s +mla v3.4S, v10.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v10.4s, v2.4s, v3.4s +mla v0.4S, v17.4S, v31.s[0] +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v17.4s, v14.4s, v0.4s +mla v8.4S, v13.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +sub v13.4s, v16.4s, v8.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v3.4s, v19.4s, v1.4s +mla v12.4S, v0.4S, v31.s[0] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v0.4s, v21.4s, v12.4s +mla v18.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v25.s[1] +mul v10.4S, v10.4S,v26.s[1] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v12.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v12.4s, v22.4s, v2.4s +mla v10.4S, v18.4S, v31.s[0] +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v13.4S, v25.s[3] +mul v13.4S, v13.4S,v26.s[3] +sub v18.4s, v11.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add 
v11.4s, v11.4s, v10.4s +str q22, [x0, #16] +sqrdmulh v22.4S, v21.4S, v23.s[0] +str q12, [x0, #80] +mul v21.4S, v21.4S,v24.s[0] +sub v12.4s, v14.4s, v16.4s +mla v13.4S, v2.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +str q11, [x0, #144] +sqrdmulh v11.4S, v0.4S, v23.s[1] +str q18, [x0, #208] +mul v0.4S, v0.4S,v24.s[1] +sub v18.4s, v17.4s, v13.4s +mla v21.4S, v22.4S, v31.s[0] +add v17.4s, v17.4s, v13.4s +str q14, [x0, #272] +sqrdmulh v14.4S, v20.4S, v23.s[2] +str q12, [x0, #336] +mul v20.4S, v20.4S,v24.s[2] +sub v12.4s, v19.4s, v21.4s +mla v0.4S, v11.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +str q17, [x0, #400] +sqrdmulh v17.4S, v1.4S, v23.s[3] +str q18, [x0, #464] +mul v1.4S, v1.4S,v24.s[3] +sub v18.4s, v3.4s, v0.4s +mla v20.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v0.4s +str q19, [x0, #528] +str q12, [x0, #592] +sub v12.4s, v9.4s, v20.4s +mla v1.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v20.4s +str q3, [x0, #656] +str q18, [x0, #720] +sub v18.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +str q9, [x0, #784] +str q12, [x0, #848] +str q8, [x0, #912] +str q18, [x0, #976] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q6, [x17, #+160] +ldr q7, [x17, #+176] +ldr q15, [x17, #+192] +ldr q10, [x17, #+208] +ldr q2, [x17, #+224] +ldr q16, [x17, #+240] +ldr q22, [x0, #32] +ldr q13, [x0, #48] +ldr q11, [x0, #0] +ldr q21, [x0, #96] +ldr q14, [x0, #112] +ldr q0, [x0, #64] +ldr q19, [x0, #160] +ldr q17, [x0, #176] +ldr q20, [x0, #128] +ldr q3, [x0, #224] +ldr q1, [x0, #240] +ldr q9, [x0, #192] +sqrdmulh v12.4S, v22.4S, v5.s[0] +sqrdmulh v8.4S, v21.4S, v7.s[0] +sqrdmulh v18.4S, v19.4S, v10.s[0] +sqrdmulh v30.4S, v3.4S, v16.s[0] +mul v22.4S, v22.4S,v4.s[0] +mul v21.4S, v21.4S,v6.s[0] +mul v19.4S, v19.4S,v15.s[0] +mul v3.4S, v3.4S,v2.s[0] +mla v22.4S, v12.4S, v31.s[0] +mla v21.4S, v8.4S, v31.s[0] +mla v19.4S, v18.4S, v31.s[0] +mla v3.4S, v30.4S, v31.s[0] +sub v30.4s, v11.4s, v22.4s +sub v18.4s, v0.4s, v21.4s +sub v8.4s, v20.4s, v19.4s +sub v12.4s, v9.4s, v3.4s +add v11.4s, v11.4s, v22.4s 
+add v0.4s, v0.4s, v21.4s +add v20.4s, v20.4s, v19.4s +add v9.4s, v9.4s, v3.4s +ldr q3, [x0, #16] +ldr q19, [x0, #80] +ldr q21, [x0, #144] +ldr q22, [x0, #208] +sqrdmulh v29.4S, v13.4S, v5.s[0] +sqrdmulh v28.4S, v14.4S, v7.s[0] +sqrdmulh v27.4S, v17.4S, v10.s[0] +sqrdmulh v26.4S, v1.4S, v16.s[0] +mul v13.4S, v13.4S,v4.s[0] +mul v14.4S, v14.4S,v6.s[0] +mul v17.4S, v17.4S,v15.s[0] +mul v1.4S, v1.4S,v2.s[0] +mla v13.4S, v29.4S, v31.s[0] +mla v14.4S, v28.4S, v31.s[0] +mla v17.4S, v27.4S, v31.s[0] +mla v1.4S, v26.4S, v31.s[0] +sub v26.4s, v3.4s, v13.4s +sub v27.4s, v19.4s, v14.4s +sub v28.4s, v21.4s, v17.4s +sub v29.4s, v22.4s, v1.4s +add v3.4s, v3.4s, v13.4s +add v19.4s, v19.4s, v14.4s +add v21.4s, v21.4s, v17.4s +add v22.4s, v22.4s, v1.4s +sqrdmulh v1.4S, v3.4S, v5.s[1] +sqrdmulh v17.4S, v19.4S, v7.s[1] +sqrdmulh v14.4S, v21.4S, v10.s[1] +sqrdmulh v13.4S, v22.4S, v16.s[1] +mul v3.4S, v3.4S,v4.s[1] +mul v19.4S, v19.4S,v6.s[1] +mul v21.4S, v21.4S,v15.s[1] +mul v22.4S, v22.4S,v2.s[1] +mla v3.4S, v1.4S, v31.s[0] +mla v19.4S, v17.4S, v31.s[0] +mla v21.4S, v14.4S, v31.s[0] +mla v22.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v3.4s +sub v14.4s, v0.4s, v19.4s +sub v17.4s, v20.4s, v21.4s +sub v1.4s, v9.4s, v22.4s +add v11.4s, v11.4s, v3.4s +add v0.4s, v0.4s, v19.4s +add v20.4s, v20.4s, v21.4s +add v9.4s, v9.4s, v22.4s +sqrdmulh v22.4S, v26.4S, v5.s[2] +sqrdmulh v21.4S, v27.4S, v7.s[2] +sqrdmulh v19.4S, v28.4S, v10.s[2] +sqrdmulh v3.4S, v29.4S, v16.s[2] +str q11, [x0, #0] +str q13, [x0, #16] +mul v26.4S, v26.4S,v4.s[2] +mul v27.4S, v27.4S,v6.s[2] +mul v28.4S, v28.4S,v15.s[2] +mul v29.4S, v29.4S,v2.s[2] +str q0, [x0, #64] +str q14, [x0, #80] +ldr q16, [x17, #+256] +ldr q2, [x17, #+272] +ldr q10, [x17, #+288] +ldr q15, [x17, #+304] +mla v26.4S, v22.4S, v31.s[0] +mla v27.4S, v21.4S, v31.s[0] +mla v28.4S, v19.4S, v31.s[0] +mla v29.4S, v3.4S, v31.s[0] +str q20, [x0, #128] +str q17, [x0, #144] +ldr q17, [x17, #+320] +ldr q20, [x17, #+336] +sub v3.4s, v30.4s, v26.4s +sub v19.4s, v18.4s, 
v27.4s +sub v21.4s, v8.4s, v28.4s +sub v22.4s, v12.4s, v29.4s +str q9, [x0, #192] +str q1, [x0, #208] +ldr q1, [x17, #+352] +ldr q9, [x17, #+368] +add v30.4s, v30.4s, v26.4s +add v18.4s, v18.4s, v27.4s +add v8.4s, v8.4s, v28.4s +add v12.4s, v12.4s, v29.4s +str q30, [x0, #32] +str q18, [x0, #96] +str q8, [x0, #160] +str q12, [x0, #224] +ldr q12, [x0, #288] +ldr q8, [x0, #304] +ldr q18, [x0, #256] +ldr q30, [x0, #352] +ldr q29, [x0, #368] +ldr q28, [x0, #320] +ldr q27, [x0, #416] +ldr q26, [x0, #432] +ldr q7, [x0, #384] +ldr q6, [x0, #480] +ldr q5, [x0, #496] +ldr q4, [x0, #448] +sqrdmulh v14.4S, v12.4S, v2.s[0] +sqrdmulh v0.4S, v30.4S, v15.s[0] +sqrdmulh v13.4S, v27.4S, v20.s[0] +sqrdmulh v11.4S, v6.4S, v9.s[0] +str q3, [x0, #48] +mul v12.4S, v12.4S,v16.s[0] +mul v30.4S, v30.4S,v10.s[0] +mul v27.4S, v27.4S,v17.s[0] +mul v6.4S, v6.4S,v1.s[0] +str q19, [x0, #112] +mla v12.4S, v14.4S, v31.s[0] +mla v30.4S, v0.4S, v31.s[0] +mla v27.4S, v13.4S, v31.s[0] +mla v6.4S, v11.4S, v31.s[0] +str q21, [x0, #176] +sub v21.4s, v18.4s, v12.4s +sub v11.4s, v28.4s, v30.4s +sub v13.4s, v7.4s, v27.4s +sub v0.4s, v4.4s, v6.4s +str q22, [x0, #240] +add v18.4s, v18.4s, v12.4s +add v28.4s, v28.4s, v30.4s +add v7.4s, v7.4s, v27.4s +add v4.4s, v4.4s, v6.4s +ldr q6, [x0, #272] +ldr q27, [x0, #336] +ldr q30, [x0, #400] +ldr q12, [x0, #464] +sqrdmulh v22.4S, v8.4S, v2.s[0] +sqrdmulh v14.4S, v29.4S, v15.s[0] +sqrdmulh v19.4S, v26.4S, v20.s[0] +sqrdmulh v3.4S, v5.4S, v9.s[0] +mul v8.4S, v8.4S,v16.s[0] +mul v29.4S, v29.4S,v10.s[0] +mul v26.4S, v26.4S,v17.s[0] +mul v5.4S, v5.4S,v1.s[0] +mla v8.4S, v22.4S, v31.s[0] +mla v29.4S, v14.4S, v31.s[0] +mla v26.4S, v19.4S, v31.s[0] +mla v5.4S, v3.4S, v31.s[0] +sub v3.4s, v6.4s, v8.4s +sub v19.4s, v27.4s, v29.4s +sub v14.4s, v30.4s, v26.4s +sub v22.4s, v12.4s, v5.4s +add v6.4s, v6.4s, v8.4s +add v27.4s, v27.4s, v29.4s +add v30.4s, v30.4s, v26.4s +add v12.4s, v12.4s, v5.4s +sqrdmulh v5.4S, v6.4S, v2.s[1] +sqrdmulh v26.4S, v27.4S, v15.s[1] +sqrdmulh v29.4S, 
v30.4S, v20.s[1] +sqrdmulh v8.4S, v12.4S, v9.s[1] +mul v6.4S, v6.4S,v16.s[1] +mul v27.4S, v27.4S,v10.s[1] +mul v30.4S, v30.4S,v17.s[1] +mul v12.4S, v12.4S,v1.s[1] +mla v6.4S, v5.4S, v31.s[0] +mla v27.4S, v26.4S, v31.s[0] +mla v30.4S, v29.4S, v31.s[0] +mla v12.4S, v8.4S, v31.s[0] +sub v8.4s, v18.4s, v6.4s +sub v29.4s, v28.4s, v27.4s +sub v26.4s, v7.4s, v30.4s +sub v5.4s, v4.4s, v12.4s +add v18.4s, v18.4s, v6.4s +add v28.4s, v28.4s, v27.4s +add v7.4s, v7.4s, v30.4s +add v4.4s, v4.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v2.s[2] +sqrdmulh v30.4S, v19.4S, v15.s[2] +sqrdmulh v27.4S, v14.4S, v20.s[2] +sqrdmulh v6.4S, v22.4S, v9.s[2] +str q18, [x0, #256] +str q8, [x0, #272] +mul v3.4S, v3.4S,v16.s[2] +mul v19.4S, v19.4S,v10.s[2] +mul v14.4S, v14.4S,v17.s[2] +mul v22.4S, v22.4S,v1.s[2] +str q28, [x0, #320] +str q29, [x0, #336] +ldr q9, [x17, #+384] +ldr q1, [x17, #+400] +ldr q20, [x17, #+416] +ldr q17, [x17, #+432] +mla v3.4S, v12.4S, v31.s[0] +mla v19.4S, v30.4S, v31.s[0] +mla v14.4S, v27.4S, v31.s[0] +mla v22.4S, v6.4S, v31.s[0] +str q7, [x0, #384] +str q26, [x0, #400] +ldr q26, [x17, #+448] +ldr q7, [x17, #+464] +sub v6.4s, v21.4s, v3.4s +sub v27.4s, v11.4s, v19.4s +sub v30.4s, v13.4s, v14.4s +sub v12.4s, v0.4s, v22.4s +str q4, [x0, #448] +str q5, [x0, #464] +ldr q5, [x17, #+480] +ldr q4, [x17, #+496] +add v21.4s, v21.4s, v3.4s +add v11.4s, v11.4s, v19.4s +add v13.4s, v13.4s, v14.4s +add v0.4s, v0.4s, v22.4s +str q21, [x0, #288] +str q11, [x0, #352] +str q13, [x0, #416] +str q0, [x0, #480] +ldr q0, [x0, #544] +ldr q13, [x0, #560] +ldr q11, [x0, #512] +ldr q21, [x0, #608] +ldr q22, [x0, #624] +ldr q14, [x0, #576] +ldr q19, [x0, #672] +ldr q3, [x0, #688] +ldr q15, [x0, #640] +ldr q10, [x0, #736] +ldr q2, [x0, #752] +ldr q16, [x0, #704] +sqrdmulh v29.4S, v0.4S, v1.s[0] +sqrdmulh v28.4S, v21.4S, v17.s[0] +sqrdmulh v8.4S, v19.4S, v7.s[0] +sqrdmulh v18.4S, v10.4S, v4.s[0] +str q6, [x0, #304] +mul v0.4S, v0.4S,v9.s[0] +mul v21.4S, v21.4S,v20.s[0] +mul v19.4S, v19.4S,v26.s[0] +mul 
v10.4S, v10.4S,v5.s[0] +str q27, [x0, #368] +mla v0.4S, v29.4S, v31.s[0] +mla v21.4S, v28.4S, v31.s[0] +mla v19.4S, v8.4S, v31.s[0] +mla v10.4S, v18.4S, v31.s[0] +str q30, [x0, #432] +sub v30.4s, v11.4s, v0.4s +sub v18.4s, v14.4s, v21.4s +sub v8.4s, v15.4s, v19.4s +sub v28.4s, v16.4s, v10.4s +str q12, [x0, #496] +add v11.4s, v11.4s, v0.4s +add v14.4s, v14.4s, v21.4s +add v15.4s, v15.4s, v19.4s +add v16.4s, v16.4s, v10.4s +ldr q10, [x0, #528] +ldr q19, [x0, #592] +ldr q21, [x0, #656] +ldr q0, [x0, #720] +sqrdmulh v12.4S, v13.4S, v1.s[0] +sqrdmulh v29.4S, v22.4S, v17.s[0] +sqrdmulh v27.4S, v3.4S, v7.s[0] +sqrdmulh v6.4S, v2.4S, v4.s[0] +mul v13.4S, v13.4S,v9.s[0] +mul v22.4S, v22.4S,v20.s[0] +mul v3.4S, v3.4S,v26.s[0] +mul v2.4S, v2.4S,v5.s[0] +mla v13.4S, v12.4S, v31.s[0] +mla v22.4S, v29.4S, v31.s[0] +mla v3.4S, v27.4S, v31.s[0] +mla v2.4S, v6.4S, v31.s[0] +sub v6.4s, v10.4s, v13.4s +sub v27.4s, v19.4s, v22.4s +sub v29.4s, v21.4s, v3.4s +sub v12.4s, v0.4s, v2.4s +add v10.4s, v10.4s, v13.4s +add v19.4s, v19.4s, v22.4s +add v21.4s, v21.4s, v3.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v1.s[1] +sqrdmulh v3.4S, v19.4S, v17.s[1] +sqrdmulh v22.4S, v21.4S, v7.s[1] +sqrdmulh v13.4S, v0.4S, v4.s[1] +mul v10.4S, v10.4S,v9.s[1] +mul v19.4S, v19.4S,v20.s[1] +mul v21.4S, v21.4S,v26.s[1] +mul v0.4S, v0.4S,v5.s[1] +mla v10.4S, v2.4S, v31.s[0] +mla v19.4S, v3.4S, v31.s[0] +mla v21.4S, v22.4S, v31.s[0] +mla v0.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v10.4s +sub v22.4s, v14.4s, v19.4s +sub v3.4s, v15.4s, v21.4s +sub v2.4s, v16.4s, v0.4s +add v11.4s, v11.4s, v10.4s +add v14.4s, v14.4s, v19.4s +add v15.4s, v15.4s, v21.4s +add v16.4s, v16.4s, v0.4s +sqrdmulh v0.4S, v6.4S, v1.s[2] +sqrdmulh v21.4S, v27.4S, v17.s[2] +sqrdmulh v19.4S, v29.4S, v7.s[2] +sqrdmulh v10.4S, v12.4S, v4.s[2] +str q11, [x0, #512] +str q13, [x0, #528] +mul v6.4S, v6.4S,v9.s[2] +mul v27.4S, v27.4S,v20.s[2] +mul v29.4S, v29.4S,v26.s[2] +mul v12.4S, v12.4S,v5.s[2] +str q14, [x0, #576] +str q22, [x0, #592] 
+ldr q4, [x17, #+512] +ldr q5, [x17, #+528] +ldr q7, [x17, #+544] +ldr q26, [x17, #+560] +mla v6.4S, v0.4S, v31.s[0] +mla v27.4S, v21.4S, v31.s[0] +mla v29.4S, v19.4S, v31.s[0] +mla v12.4S, v10.4S, v31.s[0] +str q15, [x0, #640] +str q3, [x0, #656] +ldr q3, [x17, #+576] +ldr q15, [x17, #+592] +sub v10.4s, v30.4s, v6.4s +sub v19.4s, v18.4s, v27.4s +sub v21.4s, v8.4s, v29.4s +sub v0.4s, v28.4s, v12.4s +str q16, [x0, #704] +str q2, [x0, #720] +ldr q2, [x17, #+608] +ldr q16, [x17, #+624] +add v30.4s, v30.4s, v6.4s +add v18.4s, v18.4s, v27.4s +add v8.4s, v8.4s, v29.4s +add v28.4s, v28.4s, v12.4s +str q30, [x0, #544] +str q18, [x0, #608] +str q8, [x0, #672] +str q28, [x0, #736] +ldr q28, [x0, #800] +ldr q8, [x0, #816] +ldr q18, [x0, #768] +ldr q30, [x0, #864] +ldr q12, [x0, #880] +ldr q29, [x0, #832] +ldr q27, [x0, #928] +ldr q6, [x0, #944] +ldr q17, [x0, #896] +ldr q20, [x0, #992] +ldr q1, [x0, #1008] +ldr q9, [x0, #960] +sqrdmulh v22.4S, v28.4S, v5.s[0] +sqrdmulh v14.4S, v30.4S, v26.s[0] +sqrdmulh v13.4S, v27.4S, v15.s[0] +sqrdmulh v11.4S, v20.4S, v16.s[0] +str q10, [x0, #560] +mul v28.4S, v28.4S,v4.s[0] +mul v30.4S, v30.4S,v7.s[0] +mul v27.4S, v27.4S,v3.s[0] +mul v20.4S, v20.4S,v2.s[0] +str q19, [x0, #624] +mla v28.4S, v22.4S, v31.s[0] +mla v30.4S, v14.4S, v31.s[0] +mla v27.4S, v13.4S, v31.s[0] +mla v20.4S, v11.4S, v31.s[0] +str q21, [x0, #688] +sub v21.4s, v18.4s, v28.4s +sub v11.4s, v29.4s, v30.4s +sub v13.4s, v17.4s, v27.4s +sub v14.4s, v9.4s, v20.4s +str q0, [x0, #752] +add v18.4s, v18.4s, v28.4s +add v29.4s, v29.4s, v30.4s +add v17.4s, v17.4s, v27.4s +add v9.4s, v9.4s, v20.4s +ldr q20, [x0, #784] +ldr q27, [x0, #848] +ldr q30, [x0, #912] +ldr q28, [x0, #976] +sqrdmulh v0.4S, v8.4S, v5.s[0] +sqrdmulh v22.4S, v12.4S, v26.s[0] +sqrdmulh v19.4S, v6.4S, v15.s[0] +sqrdmulh v10.4S, v1.4S, v16.s[0] +mul v8.4S, v8.4S,v4.s[0] +mul v12.4S, v12.4S,v7.s[0] +mul v6.4S, v6.4S,v3.s[0] +mul v1.4S, v1.4S,v2.s[0] +mla v8.4S, v0.4S, v31.s[0] +mla v12.4S, v22.4S, v31.s[0] +mla v6.4S, 
v19.4S, v31.s[0] +mla v1.4S, v10.4S, v31.s[0] +sub v10.4s, v20.4s, v8.4s +sub v19.4s, v27.4s, v12.4s +sub v22.4s, v30.4s, v6.4s +sub v0.4s, v28.4s, v1.4s +add v20.4s, v20.4s, v8.4s +add v27.4s, v27.4s, v12.4s +add v30.4s, v30.4s, v6.4s +add v28.4s, v28.4s, v1.4s +sqrdmulh v1.4S, v20.4S, v5.s[1] +sqrdmulh v6.4S, v27.4S, v26.s[1] +sqrdmulh v12.4S, v30.4S, v15.s[1] +sqrdmulh v8.4S, v28.4S, v16.s[1] +mul v20.4S, v20.4S,v4.s[1] +mul v27.4S, v27.4S,v7.s[1] +mul v30.4S, v30.4S,v3.s[1] +mul v28.4S, v28.4S,v2.s[1] +mla v20.4S, v1.4S, v31.s[0] +mla v27.4S, v6.4S, v31.s[0] +mla v30.4S, v12.4S, v31.s[0] +mla v28.4S, v8.4S, v31.s[0] +sub v8.4s, v18.4s, v20.4s +sub v12.4s, v29.4s, v27.4s +sub v6.4s, v17.4s, v30.4s +sub v1.4s, v9.4s, v28.4s +add v18.4s, v18.4s, v20.4s +add v29.4s, v29.4s, v27.4s +add v17.4s, v17.4s, v30.4s +add v9.4s, v9.4s, v28.4s +sqrdmulh v28.4S, v10.4S, v5.s[2] +sqrdmulh v30.4S, v19.4S, v26.s[2] +sqrdmulh v27.4S, v22.4S, v15.s[2] +sqrdmulh v20.4S, v0.4S, v16.s[2] +str q18, [x0, #768] +str q8, [x0, #784] +mul v10.4S, v10.4S,v4.s[2] +mul v19.4S, v19.4S,v7.s[2] +mul v22.4S, v22.4S,v3.s[2] +mul v0.4S, v0.4S,v2.s[2] +str q29, [x0, #832] +str q12, [x0, #848] +mla v10.4S, v28.4S, v31.s[0] +mla v19.4S, v30.4S, v31.s[0] +mla v22.4S, v27.4S, v31.s[0] +mla v0.4S, v20.4S, v31.s[0] +str q17, [x0, #896] +str q6, [x0, #912] +sub v6.4s, v21.4s, v10.4s +sub v17.4s, v11.4s, v19.4s +sub v20.4s, v13.4s, v22.4s +sub v27.4s, v14.4s, v0.4s +str q9, [x0, #960] +str q1, [x0, #976] +add v21.4s, v21.4s, v10.4s +add v11.4s, v11.4s, v19.4s +add v13.4s, v13.4s, v22.4s +add v14.4s, v14.4s, v0.4s +str q21, [x0, #800] +str q11, [x0, #864] +str q13, [x0, #928] +str q14, [x0, #992] +str q6, [x0, #816] +str q17, [x0, #880] +str q20, [x0, #944] +str q27, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] 
+ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_4.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_4.s new file mode 100644 index 0000000..1381d5b --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_4.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global 
ntt_u32_incomplete_neon_asm_var_4_2_3_z4_4 +.global _ntt_u32_incomplete_neon_asm_var_4_2_3_z4_4 +ntt_u32_incomplete_neon_asm_var_4_2_3_z4_4: +_ntt_u32_incomplete_neon_asm_var_4_2_3_z4_4: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +sqrdmulh v2.4S, v22.4S, v29.s[0] +ldr q1, [x0, #544] +mul v22.4S, v22.4S,v30.s[0] +ldr q0, [x0, #608] +sqrdmulh v15.4S, v21.4S, v29.s[0] +ldr q14, [x0, #672] +mul v21.4S, v21.4S,v30.s[0] +ldr q13, [x0, #736] +mla v22.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q12, [x0, #32] +sub v11.4s, v18.4s, v22.4s +mla v21.4S, v15.4S, v31.s[0] +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q15, [x0, #96] +sub v10.4s, v17.4s, v21.4s +mla v20.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v29.s[0] +ldr q2, [x0, #160] +mul v1.4S, v1.4S,v30.s[0] +sub v9.4s, v16.4s, v20.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v29.s[0] +ldr q22, [x0, #224] +mul v0.4S, v0.4S,v30.s[0] +sub v8.4s, v3.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v12.4s, v1.4s +mla v0.4S, v20.4S, 
v31.s[0] +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +sub v20.4s, v15.4s, v0.4s +mla v14.4S, v19.4S, v31.s[0] +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v19.4s, v2.4s, v14.4s +mla v13.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v1.4s, v22.4s, v13.4s +mla v16.4S, v0.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v0.4s, v2.4s, v16.4s +mla v3.4S, v14.4S, v31.s[0] +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v22.4s, v3.4s +mla v18.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v13.4s, v12.4s, v18.4s +mla v17.4S, v16.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v16.4s, v15.4s, v17.4s +mla v9.4S, v3.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v3.4s, v19.4s, v9.4s +mla v8.4S, v18.4S, v31.s[0] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v8.4s +mla v11.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v17.4s, v21.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +sub v9.4s, v20.4s, v10.4s +mla v2.4S, v8.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v8.4s, v12.4s, v2.4s +mla v22.4S, v11.4S, v31.s[0] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v27.s[1] +mul v14.4S, v14.4S,v28.s[1] +sub v11.4s, v15.4s, v22.4s +mla v0.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v28.s[2] +sub 
v10.4s, v13.4s, v0.4s +mla v14.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v2.4s, v16.4s, v14.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +sub v22.4s, v21.4s, v19.4s +mla v1.4S, v0.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v0.4s, v20.4s, v1.4s +mla v3.4S, v14.4S, v31.s[0] +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +sub v14.4s, v17.4s, v3.4s +mla v18.4S, v19.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v25.s[1] +mul v11.4S, v11.4S,v26.s[1] +sub v19.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v1.4s, v12.4s, v15.4s +mla v11.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v25.s[3] +mul v2.4S, v2.4S,v26.s[3] +sub v3.4s, v8.4s, v11.4s +mla v16.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v11.4s +str q12, [x0, #32] +sqrdmulh v12.4S, v20.4S, v23.s[0] +str q1, [x0, #96] +mul v20.4S, v20.4S,v24.s[0] +ldr q1, [x0, #816] +sub v11.4s, v13.4s, v16.4s +ldr q18, [x0, #880] +mla v2.4S, v15.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +str q8, [x0, #160] +sqrdmulh v8.4S, v0.4S, v23.s[1] +str q3, [x0, #224] +mul v0.4S, v0.4S,v24.s[1] +ldr q3, [x0, #944] +sub v16.4s, v10.4s, v2.4s +ldr q15, [x0, #1008] +mla v20.4S, v12.4S, v31.s[0] +add v10.4s, v10.4s, v2.4s +str q13, [x0, #288] +sqrdmulh v13.4S, v9.4S, v23.s[2] +str q11, [x0, #352] +mul v9.4S, v9.4S,v24.s[2] +ldr q11, [x0, #304] +sub v2.4s, v21.4s, v20.4s +ldr q12, [x0, #368] +mla v0.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v20.4s +str q10, [x0, #416] +sqrdmulh v10.4S, v19.4S, v23.s[3] +str q16, [x0, #480] +mul v19.4S, v19.4S,v24.s[3] +ldr q16, [x0, #432] +sub v20.4s, v22.4s, v0.4s +ldr q8, [x0, #496] +mla v9.4S, v13.4S, v31.s[0] +add v22.4s, 
v22.4s, v0.4s +str q21, [x0, #544] +sqrdmulh v21.4S, v1.4S, v29.s[0] +str q2, [x0, #608] +ldr q2, [x0, #560] +mul v1.4S, v1.4S,v30.s[0] +ldr q0, [x0, #624] +sub v13.4s, v17.4s, v9.4s +mla v19.4S, v10.4S, v31.s[0] +add v17.4s, v17.4s, v9.4s +str q22, [x0, #672] +sqrdmulh v22.4S, v18.4S, v29.s[0] +str q20, [x0, #736] +ldr q20, [x0, #688] +mul v18.4S, v18.4S,v30.s[0] +ldr q9, [x0, #752] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +str q17, [x0, #800] +sqrdmulh v17.4S, v3.4S, v29.s[0] +str q13, [x0, #864] +mul v3.4S, v3.4S,v30.s[0] +ldr q13, [x0, #48] +sub v19.4s, v11.4s, v1.4s +mla v18.4S, v22.4S, v31.s[0] +add v11.4s, v11.4s, v1.4s +str q14, [x0, #928] +sqrdmulh v14.4S, v15.4S, v29.s[0] +str q10, [x0, #992] +mul v15.4S, v15.4S,v30.s[0] +ldr q10, [x0, #112] +sub v1.4s, v12.4s, v18.4s +mla v3.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v29.s[0] +ldr q17, [x0, #176] +mul v2.4S, v2.4S,v30.s[0] +sub v22.4s, v16.4s, v3.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v29.s[0] +ldr q14, [x0, #240] +mul v0.4S, v0.4S,v30.s[0] +sub v21.4s, v8.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v18.4s, v13.4s, v2.4s +mla v0.4S, v3.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +sub v3.4s, v10.4s, v0.4s +mla v20.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v15.4s, v17.4s, v20.4s +mla v9.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v2.4s, v14.4s, v9.4s +mla v16.4S, v0.4S, v31.s[0] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v29.s[1] +mul v11.4S, v11.4S,v30.s[1] +sub v0.4s, v17.4s, v16.4s +mla v8.4S, v20.4S, v31.s[0] +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v29.s[1] +mul v12.4S, 
v12.4S,v30.s[1] +sub v20.4s, v14.4s, v8.4s +mla v11.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v9.4s, v13.4s, v11.4s +mla v12.4S, v16.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v16.4s, v10.4s, v12.4s +mla v22.4S, v8.4S, v31.s[0] +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v8.4s, v15.4s, v22.4s +mla v21.4S, v11.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +sub v11.4s, v2.4s, v21.4s +mla v19.4S, v12.4S, v31.s[0] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +sub v12.4s, v18.4s, v19.4s +mla v1.4S, v22.4S, v31.s[0] +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v22.4s, v3.4s, v1.4s +mla v17.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v21.4s, v13.4s, v17.4s +mla v14.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +sub v19.4s, v10.4s, v14.4s +mla v0.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v27.s[2] +mul v15.4S, v15.4S,v28.s[2] +sub v1.4s, v9.4s, v0.4s +mla v20.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v17.4s, v16.4s, v20.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v14.4s, v18.4s, v15.4s +mla v2.4S, v0.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[3] +mul v11.4S, v11.4S,v28.s[3] +sub v0.4s, v3.4s, v2.4s +mla v8.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +sub v20.4s, v12.4s, v8.4s +mla v11.4S, v15.4S, v31.s[0] +add v12.4s, v12.4s, 
v8.4s +sqrdmulh v8.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v15.4s, v22.4s, v11.4s +mla v10.4S, v2.4S, v31.s[0] +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v2.4s, v13.4s, v10.4s +mla v19.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v25.s[3] +mul v17.4S, v17.4S,v26.s[3] +sub v8.4s, v21.4s, v19.4s +mla v16.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +str q13, [x0, #48] +sqrdmulh v13.4S, v3.4S, v23.s[0] +str q2, [x0, #112] +mul v3.4S, v3.4S,v24.s[0] +ldr q2, [x0, #768] +sub v19.4s, v9.4s, v16.4s +ldr q11, [x0, #832] +mla v17.4S, v10.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +str q21, [x0, #176] +sqrdmulh v21.4S, v0.4S, v23.s[1] +str q8, [x0, #240] +mul v0.4S, v0.4S,v24.s[1] +ldr q8, [x0, #896] +sub v16.4s, v1.4s, v17.4s +ldr q10, [x0, #960] +mla v3.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +str q9, [x0, #304] +sqrdmulh v9.4S, v22.4S, v23.s[2] +str q19, [x0, #368] +mul v22.4S, v22.4S,v24.s[2] +ldr q19, [x0, #256] +sub v17.4s, v18.4s, v3.4s +ldr q13, [x0, #320] +mla v0.4S, v21.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +str q1, [x0, #432] +sqrdmulh v1.4S, v15.4S, v23.s[3] +str q16, [x0, #496] +mul v15.4S, v15.4S,v24.s[3] +ldr q16, [x0, #384] +sub v3.4s, v14.4s, v0.4s +ldr q21, [x0, #448] +mla v22.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +str q18, [x0, #560] +sqrdmulh v18.4S, v2.4S, v29.s[0] +str q17, [x0, #624] +ldr q17, [x0, #512] +mul v2.4S, v2.4S,v30.s[0] +ldr q0, [x0, #576] +sub v9.4s, v12.4s, v22.4s +mla v15.4S, v1.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s +str q14, [x0, #688] +sqrdmulh v14.4S, v11.4S, v29.s[0] +str q3, [x0, #752] +ldr q3, [x0, #640] +mul v11.4S, v11.4S,v30.s[0] +ldr q22, [x0, #704] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +str q12, [x0, #816] +sqrdmulh v12.4S, v8.4S, v29.s[0] +str q9, [x0, #880] +mul v8.4S, v8.4S,v30.s[0] +ldr q9, [x0, #0] +sub v15.4s, v19.4s, v2.4s +mla v11.4S, v14.4S, v31.s[0] +add 
v19.4s, v19.4s, v2.4s +str q20, [x0, #944] +sqrdmulh v20.4S, v10.4S, v29.s[0] +str q1, [x0, #1008] +mul v10.4S, v10.4S,v30.s[0] +ldr q1, [x0, #64] +sub v2.4s, v13.4s, v11.4s +mla v8.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v29.s[0] +ldr q12, [x0, #128] +mul v17.4S, v17.4S,v30.s[0] +sub v14.4s, v16.4s, v8.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v29.s[0] +ldr q20, [x0, #192] +mul v0.4S, v0.4S,v30.s[0] +sub v18.4s, v21.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +sub v11.4s, v9.4s, v17.4s +mla v0.4S, v8.4S, v31.s[0] +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v8.4s, v1.4s, v0.4s +mla v3.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v10.4s, v12.4s, v3.4s +mla v22.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v17.4s, v20.4s, v22.4s +mla v16.4S, v0.4S, v31.s[0] +add v20.4s, v20.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +sub v0.4s, v12.4s, v16.4s +mla v21.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v3.4s, v20.4s, v21.4s +mla v19.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v22.4s, v9.4s, v19.4s +mla v13.4S, v16.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v16.4s, v1.4s, v13.4s +mla v14.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v21.4s, v10.4s, v14.4s +mla v18.4S, v19.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v19.4s, v17.4s, v18.4s +mla v15.4S, v13.4S, 
v31.s[0] +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v13.4s, v11.4s, v15.4s +mla v2.4S, v14.4S, v31.s[0] +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v27.s[0] +mul v20.4S, v20.4S,v28.s[0] +sub v14.4s, v8.4s, v2.4s +mla v12.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v18.4s, v9.4s, v12.4s +mla v20.4S, v15.4S, v31.s[0] +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +sub v15.4s, v1.4s, v20.4s +mla v0.4S, v2.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +sub v2.4s, v22.4s, v0.4s +mla v3.4S, v12.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v12.4s, v16.4s, v3.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v11.4s, v10.4s +mla v17.4S, v0.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v27.s[3] +mul v19.4S, v19.4S,v28.s[3] +sub v0.4s, v8.4s, v17.4s +mla v21.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v17.4s +sqrdmulh v17.4S, v1.4S, v25.s[0] +mul v1.4S, v1.4S,v26.s[0] +sub v3.4s, v13.4s, v21.4s +mla v19.4S, v10.4S, v31.s[0] +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v25.s[1] +mul v15.4S, v15.4S,v26.s[1] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v17.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v17.4s, v9.4s, v1.4s +mla v15.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +sub v21.4s, v18.4s, v15.4s +mla v16.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +str q9, [x0, #0] +sqrdmulh v9.4S, v8.4S, v23.s[0] +str q17, [x0, #64] +mul v8.4S, v8.4S,v24.s[0] +ldr q17, [x0, #784] +sub v15.4s, v22.4s, v16.4s +ldr q19, [x0, #848] +mla v12.4S, v1.4S, v31.s[0] +add 
v22.4s, v22.4s, v16.4s +str q18, [x0, #128] +sqrdmulh v18.4S, v0.4S, v23.s[1] +str q21, [x0, #192] +mul v0.4S, v0.4S,v24.s[1] +ldr q21, [x0, #912] +sub v16.4s, v2.4s, v12.4s +ldr q1, [x0, #976] +mla v8.4S, v9.4S, v31.s[0] +add v2.4s, v2.4s, v12.4s +str q22, [x0, #256] +sqrdmulh v22.4S, v14.4S, v23.s[2] +str q15, [x0, #320] +mul v14.4S, v14.4S,v24.s[2] +ldr q15, [x0, #272] +sub v12.4s, v11.4s, v8.4s +ldr q9, [x0, #336] +mla v0.4S, v18.4S, v31.s[0] +add v11.4s, v11.4s, v8.4s +str q2, [x0, #384] +sqrdmulh v2.4S, v10.4S, v23.s[3] +str q16, [x0, #448] +mul v10.4S, v10.4S,v24.s[3] +ldr q16, [x0, #400] +sub v8.4s, v20.4s, v0.4s +ldr q18, [x0, #464] +mla v14.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v0.4s +str q11, [x0, #512] +sqrdmulh v11.4S, v17.4S, v29.s[0] +str q12, [x0, #576] +ldr q12, [x0, #528] +mul v17.4S, v17.4S,v30.s[0] +ldr q0, [x0, #592] +sub v22.4s, v13.4s, v14.4s +mla v10.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v14.4s +str q20, [x0, #640] +sqrdmulh v20.4S, v19.4S, v29.s[0] +str q8, [x0, #704] +ldr q8, [x0, #656] +mul v19.4S, v19.4S,v30.s[0] +ldr q14, [x0, #720] +sub v2.4s, v3.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v3.4s, v3.4s, v10.4s +str q13, [x0, #768] +sqrdmulh v13.4S, v21.4S, v29.s[0] +str q22, [x0, #832] +mul v21.4S, v21.4S,v30.s[0] +ldr q22, [x0, #16] +sub v10.4s, v15.4s, v17.4s +mla v19.4S, v20.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +str q3, [x0, #896] +sqrdmulh v3.4S, v1.4S, v29.s[0] +str q2, [x0, #960] +mul v1.4S, v1.4S,v30.s[0] +ldr q2, [x0, #80] +sub v17.4s, v9.4s, v19.4s +mla v21.4S, v13.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v12.4S, v29.s[0] +ldr q13, [x0, #144] +mul v12.4S, v12.4S,v30.s[0] +sub v20.4s, v16.4s, v21.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v29.s[0] +ldr q3, [x0, #208] +mul v0.4S, v0.4S,v30.s[0] +sub v11.4s, v18.4s, v1.4s +mla v12.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v19.4s, v22.4s, 
v12.4s +mla v0.4S, v21.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v2.4s, v0.4s +mla v8.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v1.4s, v13.4s, v8.4s +mla v14.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v12.4s, v3.4s, v14.4s +mla v16.4S, v0.4S, v31.s[0] +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +sub v0.4s, v13.4s, v16.4s +mla v18.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v9.4S, v29.s[1] +mul v9.4S, v9.4S,v30.s[1] +sub v8.4s, v3.4s, v18.4s +mla v15.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v14.4s, v22.4s, v15.4s +mla v9.4S, v16.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v16.4s, v2.4s, v9.4s +mla v20.4S, v18.4S, v31.s[0] +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v20.4s +mla v11.4S, v15.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v15.4s, v12.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +sub v9.4s, v19.4s, v10.4s +mla v17.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v27.s[0] +mul v3.4S, v3.4S,v28.s[0] +sub v20.4s, v21.4s, v17.4s +mla v13.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v11.4s, v22.4s, v13.4s +mla v3.4S, v10.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v10.4s, v2.4s, v3.4s +mla v0.4S, v17.4S, v31.s[0] +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v1.4S, v27.s[2] +mul 
v1.4S, v1.4S,v28.s[2] +sub v17.4s, v14.4s, v0.4s +mla v8.4S, v13.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +sub v13.4s, v16.4s, v8.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v3.4s, v19.4s, v1.4s +mla v12.4S, v0.4S, v31.s[0] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v0.4s, v21.4s, v12.4s +mla v18.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v25.s[1] +mul v10.4S, v10.4S,v26.s[1] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v12.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v12.4s, v22.4s, v2.4s +mla v10.4S, v18.4S, v31.s[0] +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v13.4S, v25.s[3] +mul v13.4S, v13.4S,v26.s[3] +sub v18.4s, v11.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +str q22, [x0, #16] +sqrdmulh v22.4S, v21.4S, v23.s[0] +str q12, [x0, #80] +mul v21.4S, v21.4S,v24.s[0] +sub v12.4s, v14.4s, v16.4s +mla v13.4S, v2.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +str q11, [x0, #144] +sqrdmulh v11.4S, v0.4S, v23.s[1] +str q18, [x0, #208] +mul v0.4S, v0.4S,v24.s[1] +sub v18.4s, v17.4s, v13.4s +mla v21.4S, v22.4S, v31.s[0] +add v17.4s, v17.4s, v13.4s +str q14, [x0, #272] +sqrdmulh v14.4S, v20.4S, v23.s[2] +str q12, [x0, #336] +mul v20.4S, v20.4S,v24.s[2] +sub v12.4s, v19.4s, v21.4s +mla v0.4S, v11.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +str q17, [x0, #400] +sqrdmulh v17.4S, v1.4S, v23.s[3] +str q18, [x0, #464] +mul v1.4S, v1.4S,v24.s[3] +sub v18.4s, v3.4s, v0.4s +mla v20.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v0.4s +str q19, [x0, #528] +str q12, [x0, #592] +sub v12.4s, v9.4s, v20.4s +mla v1.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, 
v20.4s +str q3, [x0, #656] +str q18, [x0, #720] +sub v18.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +str q9, [x0, #784] +str q12, [x0, #848] +str q8, [x0, #912] +str q18, [x0, #976] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q6, [x0, #32] +sqrdmulh v7.4S, v6.4S, v5.s[0] +mul v6.4S, v6.4S,v4.s[0] +ldr q15, [x0, #48] +sqrdmulh v10.4S, v15.4S, v5.s[0] +mul v15.4S, v15.4S,v4.s[0] +ldr q2, [x17, #+160] +ldr q16, [x17, #+176] +ldr q22, [x0, #96] +sqrdmulh v13.4S, v22.4S, v16.s[0] +mul v22.4S, v22.4S,v2.s[0] +ldr q11, [x0, #112] +sqrdmulh v21.4S, v11.4S, v16.s[0] +mul v11.4S, v11.4S,v2.s[0] +ldr q14, [x0, #160] +ldr q0, [x17, #+192] +ldr q19, [x17, #+208] +mla v6.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v14.4S, v19.s[0] +ldr q17, [x0, #176] +mla v15.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v17.4S, v19.s[0] +ldr q20, [x0, #224] +ldr q3, [x17, #+224] +ldr q1, [x17, #+240] +mla v22.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v20.4S, v1.s[0] +ldr q9, [x0, #240] +mla v11.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v9.4S, v1.s[0] +ldr q12, [x0, #128] +ldr q8, [x0, #0] +mul v14.4S, v14.4S,v0.s[0] +sub v18.4s, v8.4s, v6.4s +mul v17.4S, v17.4S,v0.s[0] +add v8.4s, v8.4s, v6.4s +ldr q6, [x0, #144] +ldr q30, [x0, #16] +mla v14.4S, v7.4S, v31.s[0] +sub v7.4s, v30.4s, v15.4s +mla v17.4S, v10.4S, v31.s[0] +add v30.4s, v30.4s, v15.4s +ldr q15, [x0, #192] +ldr q10, [x0, #64] +mul v20.4S, v20.4S,v3.s[0] +sub v29.4s, v10.4s, v22.4s +mul v9.4S, v9.4S,v3.s[0] +add v10.4s, v10.4s, v22.4s +ldr q22, [x0, #208] +ldr q28, [x0, #80] +mla v20.4S, v13.4S, v31.s[0] +sub v13.4s, v28.4s, v11.4s +mla v9.4S, v21.4S, v31.s[0] +add v28.4s, v28.4s, v11.4s +sqrdmulh v11.4S, v30.4S, v5.s[1] +mul v30.4S, v30.4S,v4.s[1] +sqrdmulh v21.4S, v7.4S, v5.s[2] +sub v27.4s, v12.4s, v14.4s +mul v7.4S, v7.4S,v4.s[2] +add v12.4s, v12.4s, v14.4s +sqrdmulh v5.4S, v28.4S, v16.s[1] +sub v4.4s, v6.4s, v17.4s +mul v28.4S, v28.4S,v2.s[1] +add v6.4s, v6.4s, v17.4s +sqrdmulh v17.4S, v13.4S, v16.s[2] +sub v14.4s, v15.4s, v20.4s +mul v13.4S, 
v13.4S,v2.s[2] +add v15.4s, v15.4s, v20.4s +mla v30.4S, v11.4S, v31.s[0] +sub v11.4s, v22.4s, v9.4s +sqrdmulh v16.4S, v6.4S, v19.s[1] +add v22.4s, v22.4s, v9.4s +mla v7.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v4.4S, v19.s[2] +mla v28.4S, v5.4S, v31.s[0] +sqrdmulh v5.4S, v22.4S, v1.s[1] +mla v13.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v11.4S, v1.s[2] +mul v6.4S, v6.4S,v0.s[1] +sub v9.4s, v8.4s, v30.4s +mul v4.4S, v4.4S,v0.s[2] +add v8.4s, v8.4s, v30.4s +str q9, [x0, #16] +str q8, [x0, #0] +mla v6.4S, v16.4S, v31.s[0] +sub v16.4s, v18.4s, v7.4s +mla v4.4S, v21.4S, v31.s[0] +add v18.4s, v18.4s, v7.4s +str q16, [x0, #48] +str q18, [x0, #32] +mul v22.4S, v22.4S,v3.s[1] +sub v19.4s, v10.4s, v28.4s +mul v11.4S, v11.4S,v3.s[2] +add v10.4s, v10.4s, v28.4s +str q19, [x0, #80] +str q10, [x0, #64] +mla v22.4S, v5.4S, v31.s[0] +sub v5.4s, v29.4s, v13.4s +mla v11.4S, v17.4S, v31.s[0] +add v29.4s, v29.4s, v13.4s +str q5, [x0, #112] +str q29, [x0, #96] +ldr q1, [x17, #+256] +ldr q3, [x17, #+272] +ldr q29, [x0, #288] +sqrdmulh v5.4S, v29.4S, v3.s[0] +sub v13.4s, v12.4s, v6.4s +str q13, [x0, #144] +mul v29.4S, v29.4S,v1.s[0] +add v12.4s, v12.4s, v6.4s +str q12, [x0, #128] +ldr q12, [x0, #304] +sqrdmulh v6.4S, v12.4S, v3.s[0] +sub v13.4s, v27.4s, v4.4s +mul v12.4S, v12.4S,v1.s[0] +add v27.4s, v27.4s, v4.4s +str q13, [x0, #176] +str q27, [x0, #160] +ldr q27, [x17, #+288] +ldr q13, [x17, #+304] +ldr q4, [x0, #352] +sqrdmulh v17.4S, v4.4S, v13.s[0] +sub v10.4s, v15.4s, v22.4s +mul v4.4S, v4.4S,v27.s[0] +add v15.4s, v15.4s, v22.4s +str q10, [x0, #208] +str q15, [x0, #192] +ldr q15, [x0, #368] +sqrdmulh v10.4S, v15.4S, v13.s[0] +sub v22.4s, v14.4s, v11.4s +mul v15.4S, v15.4S,v27.s[0] +add v14.4s, v14.4s, v11.4s +str q22, [x0, #240] +str q14, [x0, #224] +ldr q14, [x0, #416] +ldr q22, [x17, #+320] +ldr q11, [x17, #+336] +mla v29.4S, v5.4S, v31.s[0] +sqrdmulh v5.4S, v14.4S, v11.s[0] +ldr q19, [x0, #432] +mla v12.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v19.4S, v11.s[0] +ldr q28, [x0, #480] +ldr 
q0, [x17, #+352] +ldr q18, [x17, #+368] +mla v4.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v28.4S, v18.s[0] +ldr q16, [x0, #496] +mla v15.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v16.4S, v18.s[0] +ldr q7, [x0, #384] +ldr q21, [x0, #256] +mul v14.4S, v14.4S,v22.s[0] +sub v8.4s, v21.4s, v29.4s +mul v19.4S, v19.4S,v22.s[0] +add v21.4s, v21.4s, v29.4s +ldr q29, [x0, #400] +ldr q9, [x0, #272] +mla v14.4S, v5.4S, v31.s[0] +sub v5.4s, v9.4s, v12.4s +mla v19.4S, v6.4S, v31.s[0] +add v9.4s, v9.4s, v12.4s +ldr q12, [x0, #448] +ldr q6, [x0, #320] +mul v28.4S, v28.4S,v0.s[0] +sub v30.4s, v6.4s, v4.4s +mul v16.4S, v16.4S,v0.s[0] +add v6.4s, v6.4s, v4.4s +ldr q4, [x0, #464] +ldr q2, [x0, #336] +mla v28.4S, v17.4S, v31.s[0] +sub v17.4s, v2.4s, v15.4s +mla v16.4S, v10.4S, v31.s[0] +add v2.4s, v2.4s, v15.4s +sqrdmulh v15.4S, v9.4S, v3.s[1] +mul v9.4S, v9.4S,v1.s[1] +sqrdmulh v10.4S, v5.4S, v3.s[2] +sub v20.4s, v7.4s, v14.4s +mul v5.4S, v5.4S,v1.s[2] +add v7.4s, v7.4s, v14.4s +sqrdmulh v3.4S, v2.4S, v13.s[1] +sub v1.4s, v29.4s, v19.4s +mul v2.4S, v2.4S,v27.s[1] +add v29.4s, v29.4s, v19.4s +sqrdmulh v19.4S, v17.4S, v13.s[2] +sub v14.4s, v12.4s, v28.4s +mul v17.4S, v17.4S,v27.s[2] +add v12.4s, v12.4s, v28.4s +mla v9.4S, v15.4S, v31.s[0] +sub v15.4s, v4.4s, v16.4s +sqrdmulh v13.4S, v29.4S, v11.s[1] +add v4.4s, v4.4s, v16.4s +mla v5.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v1.4S, v11.s[2] +mla v2.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v4.4S, v18.s[1] +mla v17.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v15.4S, v18.s[2] +mul v29.4S, v29.4S,v22.s[1] +sub v16.4s, v21.4s, v9.4s +mul v1.4S, v1.4S,v22.s[2] +add v21.4s, v21.4s, v9.4s +str q16, [x0, #272] +str q21, [x0, #256] +mla v29.4S, v13.4S, v31.s[0] +sub v13.4s, v8.4s, v5.4s +mla v1.4S, v10.4S, v31.s[0] +add v8.4s, v8.4s, v5.4s +str q13, [x0, #304] +str q8, [x0, #288] +mul v4.4S, v4.4S,v0.s[1] +sub v11.4s, v6.4s, v2.4s +mul v15.4S, v15.4S,v0.s[2] +add v6.4s, v6.4s, v2.4s +str q11, [x0, #336] +str q6, [x0, #320] +mla v4.4S, v3.4S, v31.s[0] +sub v3.4s, 
v30.4s, v17.4s +mla v15.4S, v19.4S, v31.s[0] +add v30.4s, v30.4s, v17.4s +str q3, [x0, #368] +str q30, [x0, #352] +ldr q18, [x17, #+384] +ldr q0, [x17, #+400] +ldr q30, [x0, #544] +sqrdmulh v3.4S, v30.4S, v0.s[0] +sub v17.4s, v7.4s, v29.4s +str q17, [x0, #400] +mul v30.4S, v30.4S,v18.s[0] +add v7.4s, v7.4s, v29.4s +str q7, [x0, #384] +ldr q7, [x0, #560] +sqrdmulh v29.4S, v7.4S, v0.s[0] +sub v17.4s, v20.4s, v1.4s +mul v7.4S, v7.4S,v18.s[0] +add v20.4s, v20.4s, v1.4s +str q17, [x0, #432] +str q20, [x0, #416] +ldr q20, [x17, #+416] +ldr q17, [x17, #+432] +ldr q1, [x0, #608] +sqrdmulh v19.4S, v1.4S, v17.s[0] +sub v6.4s, v12.4s, v4.4s +mul v1.4S, v1.4S,v20.s[0] +add v12.4s, v12.4s, v4.4s +str q6, [x0, #464] +str q12, [x0, #448] +ldr q12, [x0, #624] +sqrdmulh v6.4S, v12.4S, v17.s[0] +sub v4.4s, v14.4s, v15.4s +mul v12.4S, v12.4S,v20.s[0] +add v14.4s, v14.4s, v15.4s +str q4, [x0, #496] +str q14, [x0, #480] +ldr q14, [x0, #672] +ldr q4, [x17, #+448] +ldr q15, [x17, #+464] +mla v30.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v14.4S, v15.s[0] +ldr q11, [x0, #688] +mla v7.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v11.4S, v15.s[0] +ldr q2, [x0, #736] +ldr q22, [x17, #+480] +ldr q8, [x17, #+496] +mla v1.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v2.4S, v8.s[0] +ldr q13, [x0, #752] +mla v12.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v13.4S, v8.s[0] +ldr q5, [x0, #640] +ldr q10, [x0, #512] +mul v14.4S, v14.4S,v4.s[0] +sub v21.4s, v10.4s, v30.4s +mul v11.4S, v11.4S,v4.s[0] +add v10.4s, v10.4s, v30.4s +ldr q30, [x0, #656] +ldr q16, [x0, #528] +mla v14.4S, v3.4S, v31.s[0] +sub v3.4s, v16.4s, v7.4s +mla v11.4S, v29.4S, v31.s[0] +add v16.4s, v16.4s, v7.4s +ldr q7, [x0, #704] +ldr q29, [x0, #576] +mul v2.4S, v2.4S,v22.s[0] +sub v9.4s, v29.4s, v1.4s +mul v13.4S, v13.4S,v22.s[0] +add v29.4s, v29.4s, v1.4s +ldr q1, [x0, #720] +ldr q27, [x0, #592] +mla v2.4S, v19.4S, v31.s[0] +sub v19.4s, v27.4s, v12.4s +mla v13.4S, v6.4S, v31.s[0] +add v27.4s, v27.4s, v12.4s +sqrdmulh v12.4S, v16.4S, v0.s[1] +mul v16.4S, 
v16.4S,v18.s[1] +sqrdmulh v6.4S, v3.4S, v0.s[2] +sub v28.4s, v5.4s, v14.4s +mul v3.4S, v3.4S,v18.s[2] +add v5.4s, v5.4s, v14.4s +sqrdmulh v0.4S, v27.4S, v17.s[1] +sub v18.4s, v30.4s, v11.4s +mul v27.4S, v27.4S,v20.s[1] +add v30.4s, v30.4s, v11.4s +sqrdmulh v11.4S, v19.4S, v17.s[2] +sub v14.4s, v7.4s, v2.4s +mul v19.4S, v19.4S,v20.s[2] +add v7.4s, v7.4s, v2.4s +mla v16.4S, v12.4S, v31.s[0] +sub v12.4s, v1.4s, v13.4s +sqrdmulh v17.4S, v30.4S, v15.s[1] +add v1.4s, v1.4s, v13.4s +mla v3.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v18.4S, v15.s[2] +mla v27.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v1.4S, v8.s[1] +mla v19.4S, v11.4S, v31.s[0] +sqrdmulh v11.4S, v12.4S, v8.s[2] +mul v30.4S, v30.4S,v4.s[1] +sub v13.4s, v10.4s, v16.4s +mul v18.4S, v18.4S,v4.s[2] +add v10.4s, v10.4s, v16.4s +str q13, [x0, #528] +str q10, [x0, #512] +mla v30.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v3.4s +mla v18.4S, v6.4S, v31.s[0] +add v21.4s, v21.4s, v3.4s +str q17, [x0, #560] +str q21, [x0, #544] +mul v1.4S, v1.4S,v22.s[1] +sub v15.4s, v29.4s, v27.4s +mul v12.4S, v12.4S,v22.s[2] +add v29.4s, v29.4s, v27.4s +str q15, [x0, #592] +str q29, [x0, #576] +mla v1.4S, v0.4S, v31.s[0] +sub v0.4s, v9.4s, v19.4s +mla v12.4S, v11.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +str q0, [x0, #624] +str q9, [x0, #608] +ldr q8, [x17, #+512] +ldr q22, [x17, #+528] +ldr q9, [x0, #800] +sqrdmulh v0.4S, v9.4S, v22.s[0] +sub v19.4s, v5.4s, v30.4s +str q19, [x0, #656] +mul v9.4S, v9.4S,v8.s[0] +add v5.4s, v5.4s, v30.4s +str q5, [x0, #640] +ldr q5, [x0, #816] +sqrdmulh v30.4S, v5.4S, v22.s[0] +sub v19.4s, v28.4s, v18.4s +mul v5.4S, v5.4S,v8.s[0] +add v28.4s, v28.4s, v18.4s +str q19, [x0, #688] +str q28, [x0, #672] +ldr q28, [x17, #+544] +ldr q19, [x17, #+560] +ldr q18, [x0, #864] +sqrdmulh v11.4S, v18.4S, v19.s[0] +sub v29.4s, v7.4s, v1.4s +mul v18.4S, v18.4S,v28.s[0] +add v7.4s, v7.4s, v1.4s +str q29, [x0, #720] +str q7, [x0, #704] +ldr q7, [x0, #880] +sqrdmulh v29.4S, v7.4S, v19.s[0] +sub v1.4s, v14.4s, v12.4s +mul v7.4S, 
v7.4S,v28.s[0] +add v14.4s, v14.4s, v12.4s +str q1, [x0, #752] +str q14, [x0, #736] +ldr q14, [x0, #928] +ldr q1, [x17, #+576] +ldr q12, [x17, #+592] +mla v9.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v14.4S, v12.s[0] +ldr q15, [x0, #944] +mla v5.4S, v30.4S, v31.s[0] +sqrdmulh v30.4S, v15.4S, v12.s[0] +ldr q27, [x0, #992] +ldr q4, [x17, #+608] +ldr q21, [x17, #+624] +mla v18.4S, v11.4S, v31.s[0] +sqrdmulh v11.4S, v27.4S, v21.s[0] +ldr q17, [x0, #1008] +mla v7.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v17.4S, v21.s[0] +ldr q3, [x0, #896] +ldr q6, [x0, #768] +mul v14.4S, v14.4S,v1.s[0] +sub v10.4s, v6.4s, v9.4s +mul v15.4S, v15.4S,v1.s[0] +add v6.4s, v6.4s, v9.4s +ldr q9, [x0, #912] +ldr q13, [x0, #784] +mla v14.4S, v0.4S, v31.s[0] +sub v0.4s, v13.4s, v5.4s +mla v15.4S, v30.4S, v31.s[0] +add v13.4s, v13.4s, v5.4s +ldr q5, [x0, #960] +ldr q30, [x0, #832] +mul v27.4S, v27.4S,v4.s[0] +sub v16.4s, v30.4s, v18.4s +mul v17.4S, v17.4S,v4.s[0] +add v30.4s, v30.4s, v18.4s +ldr q18, [x0, #976] +ldr q20, [x0, #848] +mla v27.4S, v11.4S, v31.s[0] +sub v11.4s, v20.4s, v7.4s +mla v17.4S, v29.4S, v31.s[0] +add v20.4s, v20.4s, v7.4s +sqrdmulh v7.4S, v13.4S, v22.s[1] +mul v13.4S, v13.4S,v8.s[1] +sqrdmulh v29.4S, v0.4S, v22.s[2] +sub v2.4s, v3.4s, v14.4s +mul v0.4S, v0.4S,v8.s[2] +add v3.4s, v3.4s, v14.4s +sqrdmulh v22.4S, v20.4S, v19.s[1] +sub v8.4s, v9.4s, v15.4s +mul v20.4S, v20.4S,v28.s[1] +add v9.4s, v9.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v19.s[2] +sub v14.4s, v5.4s, v27.4s +mul v11.4S, v11.4S,v28.s[2] +add v5.4s, v5.4s, v27.4s +mla v13.4S, v7.4S, v31.s[0] +sub v7.4s, v18.4s, v17.4s +sqrdmulh v19.4S, v9.4S, v12.s[1] +add v18.4s, v18.4s, v17.4s +mla v0.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v8.4S, v12.s[2] +mla v20.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v18.4S, v21.s[1] +mla v11.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v7.4S, v21.s[2] +mul v9.4S, v9.4S,v1.s[1] +sub v17.4s, v6.4s, v13.4s +mul v8.4S, v8.4S,v1.s[2] +add v6.4s, v6.4s, v13.4s +str q17, [x0, #784] +str q6, [x0, #768] +mla v9.4S, 
v19.4S, v31.s[0] +sub v19.4s, v10.4s, v0.4s +mla v8.4S, v29.4S, v31.s[0] +add v10.4s, v10.4s, v0.4s +str q19, [x0, #816] +str q10, [x0, #800] +mul v18.4S, v18.4S,v4.s[1] +sub v12.4s, v30.4s, v20.4s +mul v7.4S, v7.4S,v4.s[2] +add v30.4s, v30.4s, v20.4s +str q12, [x0, #848] +str q30, [x0, #832] +mla v18.4S, v22.4S, v31.s[0] +sub v22.4s, v16.4s, v11.4s +mla v7.4S, v15.4S, v31.s[0] +add v16.4s, v16.4s, v11.4s +str q22, [x0, #880] +str q16, [x0, #864] +sub v21.4s, v3.4s, v9.4s +str q21, [x0, #912] +add v3.4s, v3.4s, v9.4s +str q3, [x0, #896] +sub v3.4s, v2.4s, v8.4s +add v2.4s, v2.4s, v8.4s +str q3, [x0, #944] +str q2, [x0, #928] +sub v2.4s, v5.4s, v18.4s +add v5.4s, v5.4s, v18.4s +str q2, [x0, #976] +str q5, [x0, #960] +sub v5.4s, v14.4s, v7.4s +add v14.4s, v14.4s, v7.4s +str q5, [x0, #1008] +str q14, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_5.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_5.s new file mode 100644 index 0000000..d9c302a --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_5.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of 
the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include <hal_env.h> +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, 
block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block 
None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 
+.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global ntt_u32_incomplete_neon_asm_var_4_2_3_z4_5 +.global _ntt_u32_incomplete_neon_asm_var_4_2_3_z4_5 +ntt_u32_incomplete_neon_asm_var_4_2_3_z4_5: +_ntt_u32_incomplete_neon_asm_var_4_2_3_z4_5: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +sqrdmulh v2.4S, v22.4S, v29.s[0] +ldr q1, [x0, #544] +mul v22.4S, v22.4S,v30.s[0] +ldr q0, [x0, #608] +sqrdmulh v15.4S, v21.4S, v29.s[0] +ldr q14, [x0, #672] +mul v21.4S, v21.4S,v30.s[0] +ldr 
q13, [x0, #736] +mla v22.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q12, [x0, #32] +sub v11.4s, v18.4s, v22.4s +mla v21.4S, v15.4S, v31.s[0] +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q15, [x0, #96] +sub v10.4s, v17.4s, v21.4s +mla v20.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v29.s[0] +ldr q2, [x0, #160] +mul v1.4S, v1.4S,v30.s[0] +sub v9.4s, v16.4s, v20.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v29.s[0] +ldr q22, [x0, #224] +mul v0.4S, v0.4S,v30.s[0] +sub v8.4s, v3.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v12.4s, v1.4s +mla v0.4S, v20.4S, v31.s[0] +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +sub v20.4s, v15.4s, v0.4s +mla v14.4S, v19.4S, v31.s[0] +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v19.4s, v2.4s, v14.4s +mla v13.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v1.4s, v22.4s, v13.4s +mla v16.4S, v0.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v0.4s, v2.4s, v16.4s +mla v3.4S, v14.4S, v31.s[0] +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v22.4s, v3.4s +mla v18.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v13.4s, v12.4s, v18.4s +mla v17.4S, v16.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v16.4s, v15.4s, v17.4s +mla v9.4S, v3.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v3.4s, v19.4s, v9.4s +mla v8.4S, v18.4S, v31.s[0] 
+add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v8.4s +mla v11.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v17.4s, v21.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +sub v9.4s, v20.4s, v10.4s +mla v2.4S, v8.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v8.4s, v12.4s, v2.4s +mla v22.4S, v11.4S, v31.s[0] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v27.s[1] +mul v14.4S, v14.4S,v28.s[1] +sub v11.4s, v15.4s, v22.4s +mla v0.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v28.s[2] +sub v10.4s, v13.4s, v0.4s +mla v14.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v2.4s, v16.4s, v14.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +sub v22.4s, v21.4s, v19.4s +mla v1.4S, v0.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v0.4s, v20.4s, v1.4s +mla v3.4S, v14.4S, v31.s[0] +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +sub v14.4s, v17.4s, v3.4s +mla v18.4S, v19.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v25.s[1] +mul v11.4S, v11.4S,v26.s[1] +sub v19.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v1.4s, v12.4s, v15.4s +mla v11.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v25.s[3] +mul v2.4S, v2.4S,v26.s[3] +sub v3.4s, v8.4s, v11.4s +mla v16.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v11.4s +str q12, [x0, #32] +sqrdmulh v12.4S, v20.4S, v23.s[0] +str q1, [x0, #96] +mul 
v20.4S, v20.4S,v24.s[0] +ldr q1, [x0, #816] +sub v11.4s, v13.4s, v16.4s +ldr q18, [x0, #880] +mla v2.4S, v15.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +str q8, [x0, #160] +sqrdmulh v8.4S, v0.4S, v23.s[1] +str q3, [x0, #224] +mul v0.4S, v0.4S,v24.s[1] +ldr q3, [x0, #944] +sub v16.4s, v10.4s, v2.4s +ldr q15, [x0, #1008] +mla v20.4S, v12.4S, v31.s[0] +add v10.4s, v10.4s, v2.4s +str q13, [x0, #288] +sqrdmulh v13.4S, v9.4S, v23.s[2] +str q11, [x0, #352] +mul v9.4S, v9.4S,v24.s[2] +ldr q11, [x0, #304] +sub v2.4s, v21.4s, v20.4s +ldr q12, [x0, #368] +mla v0.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v20.4s +str q10, [x0, #416] +sqrdmulh v10.4S, v19.4S, v23.s[3] +str q16, [x0, #480] +mul v19.4S, v19.4S,v24.s[3] +ldr q16, [x0, #432] +sub v20.4s, v22.4s, v0.4s +ldr q8, [x0, #496] +mla v9.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +str q21, [x0, #544] +sqrdmulh v21.4S, v1.4S, v29.s[0] +str q2, [x0, #608] +ldr q2, [x0, #560] +mul v1.4S, v1.4S,v30.s[0] +ldr q0, [x0, #624] +sub v13.4s, v17.4s, v9.4s +mla v19.4S, v10.4S, v31.s[0] +add v17.4s, v17.4s, v9.4s +str q22, [x0, #672] +sqrdmulh v22.4S, v18.4S, v29.s[0] +str q20, [x0, #736] +ldr q20, [x0, #688] +mul v18.4S, v18.4S,v30.s[0] +ldr q9, [x0, #752] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +str q17, [x0, #800] +sqrdmulh v17.4S, v3.4S, v29.s[0] +str q13, [x0, #864] +mul v3.4S, v3.4S,v30.s[0] +ldr q13, [x0, #48] +sub v19.4s, v11.4s, v1.4s +mla v18.4S, v22.4S, v31.s[0] +add v11.4s, v11.4s, v1.4s +str q14, [x0, #928] +sqrdmulh v14.4S, v15.4S, v29.s[0] +str q10, [x0, #992] +mul v15.4S, v15.4S,v30.s[0] +ldr q10, [x0, #112] +sub v1.4s, v12.4s, v18.4s +mla v3.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v29.s[0] +ldr q17, [x0, #176] +mul v2.4S, v2.4S,v30.s[0] +sub v22.4s, v16.4s, v3.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v29.s[0] +ldr q14, [x0, #240] +mul v0.4S, v0.4S,v30.s[0] +sub v21.4s, v8.4s, v15.4s +mla v2.4S, 
v18.4S, v31.s[0] +add v8.4s, v8.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v18.4s, v13.4s, v2.4s +mla v0.4S, v3.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +sub v3.4s, v10.4s, v0.4s +mla v20.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v15.4s, v17.4s, v20.4s +mla v9.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v2.4s, v14.4s, v9.4s +mla v16.4S, v0.4S, v31.s[0] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v29.s[1] +mul v11.4S, v11.4S,v30.s[1] +sub v0.4s, v17.4s, v16.4s +mla v8.4S, v20.4S, v31.s[0] +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v29.s[1] +mul v12.4S, v12.4S,v30.s[1] +sub v20.4s, v14.4s, v8.4s +mla v11.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v9.4s, v13.4s, v11.4s +mla v12.4S, v16.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v16.4s, v10.4s, v12.4s +mla v22.4S, v8.4S, v31.s[0] +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v8.4s, v15.4s, v22.4s +mla v21.4S, v11.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +sub v11.4s, v2.4s, v21.4s +mla v19.4S, v12.4S, v31.s[0] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +sub v12.4s, v18.4s, v19.4s +mla v1.4S, v22.4S, v31.s[0] +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v22.4s, v3.4s, v1.4s +mla v17.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v21.4s, v13.4s, v17.4s +mla v14.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v20.4S, v27.s[1] +mul v20.4S, 
v20.4S,v28.s[1] +sub v19.4s, v10.4s, v14.4s +mla v0.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v27.s[2] +mul v15.4S, v15.4S,v28.s[2] +sub v1.4s, v9.4s, v0.4s +mla v20.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v17.4s, v16.4s, v20.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v14.4s, v18.4s, v15.4s +mla v2.4S, v0.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[3] +mul v11.4S, v11.4S,v28.s[3] +sub v0.4s, v3.4s, v2.4s +mla v8.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +sub v20.4s, v12.4s, v8.4s +mla v11.4S, v15.4S, v31.s[0] +add v12.4s, v12.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v15.4s, v22.4s, v11.4s +mla v10.4S, v2.4S, v31.s[0] +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v2.4s, v13.4s, v10.4s +mla v19.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v25.s[3] +mul v17.4S, v17.4S,v26.s[3] +sub v8.4s, v21.4s, v19.4s +mla v16.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +str q13, [x0, #48] +sqrdmulh v13.4S, v3.4S, v23.s[0] +str q2, [x0, #112] +mul v3.4S, v3.4S,v24.s[0] +ldr q2, [x0, #768] +sub v19.4s, v9.4s, v16.4s +ldr q11, [x0, #832] +mla v17.4S, v10.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +str q21, [x0, #176] +sqrdmulh v21.4S, v0.4S, v23.s[1] +str q8, [x0, #240] +mul v0.4S, v0.4S,v24.s[1] +ldr q8, [x0, #896] +sub v16.4s, v1.4s, v17.4s +ldr q10, [x0, #960] +mla v3.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +str q9, [x0, #304] +sqrdmulh v9.4S, v22.4S, v23.s[2] +str q19, [x0, #368] +mul v22.4S, v22.4S,v24.s[2] +ldr q19, [x0, #256] +sub v17.4s, v18.4s, v3.4s +ldr q13, [x0, #320] +mla v0.4S, v21.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +str q1, [x0, #432] +sqrdmulh v1.4S, v15.4S, 
v23.s[3] +str q16, [x0, #496] +mul v15.4S, v15.4S,v24.s[3] +ldr q16, [x0, #384] +sub v3.4s, v14.4s, v0.4s +ldr q21, [x0, #448] +mla v22.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +str q18, [x0, #560] +sqrdmulh v18.4S, v2.4S, v29.s[0] +str q17, [x0, #624] +ldr q17, [x0, #512] +mul v2.4S, v2.4S,v30.s[0] +ldr q0, [x0, #576] +sub v9.4s, v12.4s, v22.4s +mla v15.4S, v1.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s +str q14, [x0, #688] +sqrdmulh v14.4S, v11.4S, v29.s[0] +str q3, [x0, #752] +ldr q3, [x0, #640] +mul v11.4S, v11.4S,v30.s[0] +ldr q22, [x0, #704] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +str q12, [x0, #816] +sqrdmulh v12.4S, v8.4S, v29.s[0] +str q9, [x0, #880] +mul v8.4S, v8.4S,v30.s[0] +ldr q9, [x0, #0] +sub v15.4s, v19.4s, v2.4s +mla v11.4S, v14.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +str q20, [x0, #944] +sqrdmulh v20.4S, v10.4S, v29.s[0] +str q1, [x0, #1008] +mul v10.4S, v10.4S,v30.s[0] +ldr q1, [x0, #64] +sub v2.4s, v13.4s, v11.4s +mla v8.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v29.s[0] +ldr q12, [x0, #128] +mul v17.4S, v17.4S,v30.s[0] +sub v14.4s, v16.4s, v8.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v29.s[0] +ldr q20, [x0, #192] +mul v0.4S, v0.4S,v30.s[0] +sub v18.4s, v21.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +sub v11.4s, v9.4s, v17.4s +mla v0.4S, v8.4S, v31.s[0] +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v8.4s, v1.4s, v0.4s +mla v3.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v10.4s, v12.4s, v3.4s +mla v22.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v17.4s, v20.4s, v22.4s +mla v16.4S, v0.4S, v31.s[0] +add v20.4s, v20.4s, v22.4s +sqrdmulh v22.4S, v19.4S, 
v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +sub v0.4s, v12.4s, v16.4s +mla v21.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v3.4s, v20.4s, v21.4s +mla v19.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v22.4s, v9.4s, v19.4s +mla v13.4S, v16.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v16.4s, v1.4s, v13.4s +mla v14.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v21.4s, v10.4s, v14.4s +mla v18.4S, v19.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v19.4s, v17.4s, v18.4s +mla v15.4S, v13.4S, v31.s[0] +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v13.4s, v11.4s, v15.4s +mla v2.4S, v14.4S, v31.s[0] +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v27.s[0] +mul v20.4S, v20.4S,v28.s[0] +sub v14.4s, v8.4s, v2.4s +mla v12.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v18.4s, v9.4s, v12.4s +mla v20.4S, v15.4S, v31.s[0] +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +sub v15.4s, v1.4s, v20.4s +mla v0.4S, v2.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +sub v2.4s, v22.4s, v0.4s +mla v3.4S, v12.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v12.4s, v16.4s, v3.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v11.4s, v10.4s +mla v17.4S, v0.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v27.s[3] +mul v19.4S, v19.4S,v28.s[3] +sub v0.4s, v8.4s, v17.4s +mla v21.4S, v3.4S, 
v31.s[0] +add v8.4s, v8.4s, v17.4s +sqrdmulh v17.4S, v1.4S, v25.s[0] +mul v1.4S, v1.4S,v26.s[0] +sub v3.4s, v13.4s, v21.4s +mla v19.4S, v10.4S, v31.s[0] +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v25.s[1] +mul v15.4S, v15.4S,v26.s[1] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v17.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v17.4s, v9.4s, v1.4s +mla v15.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +sub v21.4s, v18.4s, v15.4s +mla v16.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +str q9, [x0, #0] +sqrdmulh v9.4S, v8.4S, v23.s[0] +str q17, [x0, #64] +mul v8.4S, v8.4S,v24.s[0] +ldr q17, [x0, #784] +sub v15.4s, v22.4s, v16.4s +ldr q19, [x0, #848] +mla v12.4S, v1.4S, v31.s[0] +add v22.4s, v22.4s, v16.4s +str q18, [x0, #128] +sqrdmulh v18.4S, v0.4S, v23.s[1] +str q21, [x0, #192] +mul v0.4S, v0.4S,v24.s[1] +ldr q21, [x0, #912] +sub v16.4s, v2.4s, v12.4s +ldr q1, [x0, #976] +mla v8.4S, v9.4S, v31.s[0] +add v2.4s, v2.4s, v12.4s +str q22, [x0, #256] +sqrdmulh v22.4S, v14.4S, v23.s[2] +str q15, [x0, #320] +mul v14.4S, v14.4S,v24.s[2] +ldr q15, [x0, #272] +sub v12.4s, v11.4s, v8.4s +ldr q9, [x0, #336] +mla v0.4S, v18.4S, v31.s[0] +add v11.4s, v11.4s, v8.4s +str q2, [x0, #384] +sqrdmulh v2.4S, v10.4S, v23.s[3] +str q16, [x0, #448] +mul v10.4S, v10.4S,v24.s[3] +ldr q16, [x0, #400] +sub v8.4s, v20.4s, v0.4s +ldr q18, [x0, #464] +mla v14.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v0.4s +str q11, [x0, #512] +sqrdmulh v11.4S, v17.4S, v29.s[0] +str q12, [x0, #576] +ldr q12, [x0, #528] +mul v17.4S, v17.4S,v30.s[0] +ldr q0, [x0, #592] +sub v22.4s, v13.4s, v14.4s +mla v10.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v14.4s +str q20, [x0, #640] +sqrdmulh v20.4S, v19.4S, v29.s[0] +str q8, [x0, #704] +ldr q8, [x0, #656] +mul v19.4S, v19.4S,v30.s[0] +ldr q14, [x0, #720] +sub v2.4s, v3.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v3.4s, v3.4s, v10.4s +str 
q13, [x0, #768] +sqrdmulh v13.4S, v21.4S, v29.s[0] +str q22, [x0, #832] +mul v21.4S, v21.4S,v30.s[0] +ldr q22, [x0, #16] +sub v10.4s, v15.4s, v17.4s +mla v19.4S, v20.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +str q3, [x0, #896] +sqrdmulh v3.4S, v1.4S, v29.s[0] +str q2, [x0, #960] +mul v1.4S, v1.4S,v30.s[0] +ldr q2, [x0, #80] +sub v17.4s, v9.4s, v19.4s +mla v21.4S, v13.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v12.4S, v29.s[0] +ldr q13, [x0, #144] +mul v12.4S, v12.4S,v30.s[0] +sub v20.4s, v16.4s, v21.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v29.s[0] +ldr q3, [x0, #208] +mul v0.4S, v0.4S,v30.s[0] +sub v11.4s, v18.4s, v1.4s +mla v12.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v19.4s, v22.4s, v12.4s +mla v0.4S, v21.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v2.4s, v0.4s +mla v8.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v1.4s, v13.4s, v8.4s +mla v14.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v12.4s, v3.4s, v14.4s +mla v16.4S, v0.4S, v31.s[0] +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +sub v0.4s, v13.4s, v16.4s +mla v18.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v9.4S, v29.s[1] +mul v9.4S, v9.4S,v30.s[1] +sub v8.4s, v3.4s, v18.4s +mla v15.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v14.4s, v22.4s, v15.4s +mla v9.4S, v16.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v16.4s, v2.4s, v9.4s +mla v20.4S, v18.4S, v31.s[0] +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v20.4s 
+mla v11.4S, v15.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v15.4s, v12.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +sub v9.4s, v19.4s, v10.4s +mla v17.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v27.s[0] +mul v3.4S, v3.4S,v28.s[0] +sub v20.4s, v21.4s, v17.4s +mla v13.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v11.4s, v22.4s, v13.4s +mla v3.4S, v10.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v10.4s, v2.4s, v3.4s +mla v0.4S, v17.4S, v31.s[0] +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v17.4s, v14.4s, v0.4s +mla v8.4S, v13.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +sub v13.4s, v16.4s, v8.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v3.4s, v19.4s, v1.4s +mla v12.4S, v0.4S, v31.s[0] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v0.4s, v21.4s, v12.4s +mla v18.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v25.s[1] +mul v10.4S, v10.4S,v26.s[1] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v12.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v12.4s, v22.4s, v2.4s +mla v10.4S, v18.4S, v31.s[0] +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v13.4S, v25.s[3] +mul v13.4S, v13.4S,v26.s[3] +sub v18.4s, v11.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +str q22, [x0, #16] +sqrdmulh v22.4S, 
v21.4S, v23.s[0] +str q12, [x0, #80] +mul v21.4S, v21.4S,v24.s[0] +sub v12.4s, v14.4s, v16.4s +mla v13.4S, v2.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +str q11, [x0, #144] +sqrdmulh v11.4S, v0.4S, v23.s[1] +str q18, [x0, #208] +mul v0.4S, v0.4S,v24.s[1] +sub v18.4s, v17.4s, v13.4s +mla v21.4S, v22.4S, v31.s[0] +add v17.4s, v17.4s, v13.4s +str q14, [x0, #272] +sqrdmulh v14.4S, v20.4S, v23.s[2] +str q12, [x0, #336] +mul v20.4S, v20.4S,v24.s[2] +sub v12.4s, v19.4s, v21.4s +mla v0.4S, v11.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +str q17, [x0, #400] +sqrdmulh v17.4S, v1.4S, v23.s[3] +str q18, [x0, #464] +mul v1.4S, v1.4S,v24.s[3] +sub v18.4s, v3.4s, v0.4s +mla v20.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v0.4s +str q19, [x0, #528] +str q12, [x0, #592] +sub v12.4s, v9.4s, v20.4s +mla v1.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v20.4s +str q3, [x0, #656] +str q18, [x0, #720] +sub v18.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +str q9, [x0, #784] +str q12, [x0, #848] +str q8, [x0, #912] +str q18, [x0, #976] +ldr q4, [x0, #224] +ldr q5, [x0, #160] +ldr q6, [x0, #32] +ldr q7, [x17, #+128] +ldr q15, [x17, #+144] +sqrdmulh v10.4S, v6.4S, v15.s[0] +mul v6.4S, v6.4S,v7.s[0] +ldr q2, [x0, #48] +sqrdmulh v16.4S, v2.4S, v15.s[0] +mul v2.4S, v2.4S,v7.s[0] +ldr q22, [x17, #+160] +ldr q13, [x17, #+176] +ldr q11, [x0, #96] +sqrdmulh v21.4S, v11.4S, v13.s[0] +mul v11.4S, v11.4S,v22.s[0] +ldr q14, [x0, #112] +sqrdmulh v0.4S, v14.4S, v13.s[0] +mul v14.4S, v14.4S,v22.s[0] +ldr q19, [x17, #+192] +ldr q17, [x17, #+208] +mla v6.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v5.4S, v17.s[0] +ldr q20, [x0, #176] +mla v2.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v20.4S, v17.s[0] +ldr q3, [x17, #+224] +ldr q1, [x17, #+240] +mla v11.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v4.4S, v1.s[0] +ldr q9, [x0, #240] +mla v14.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v9.4S, v1.s[0] +ldr q12, [x0, #128] +ldr q8, [x0, #0] +mul v5.4S, v5.4S,v19.s[0] +sub v18.4s, v8.4s, v6.4s +mul v20.4S, v20.4S,v19.s[0] +add v8.4s, v8.4s, v6.4s 
+ldr q6, [x0, #144] +ldr q30, [x0, #16] +mla v5.4S, v10.4S, v31.s[0] +sub v10.4s, v30.4s, v2.4s +mla v20.4S, v16.4S, v31.s[0] +add v30.4s, v30.4s, v2.4s +ldr q2, [x0, #192] +ldr q16, [x0, #64] +mul v4.4S, v4.4S,v3.s[0] +sub v29.4s, v16.4s, v11.4s +mul v9.4S, v9.4S,v3.s[0] +add v16.4s, v16.4s, v11.4s +ldr q11, [x0, #208] +ldr q28, [x0, #80] +mla v4.4S, v21.4S, v31.s[0] +mla v9.4S, v0.4S, v31.s[0] +sub v0.4s, v28.4s, v14.4s +sqrdmulh v21.4S, v30.4S, v15.s[1] +mul v30.4S, v30.4S,v7.s[1] +add v28.4s, v28.4s, v14.4s +sqrdmulh v14.4S, v10.4S, v15.s[2] +sub v27.4s, v12.4s, v5.4s +mul v10.4S, v10.4S,v7.s[2] +add v12.4s, v12.4s, v5.4s +sqrdmulh v15.4S, v28.4S, v13.s[1] +sub v7.4s, v6.4s, v20.4s +mul v28.4S, v28.4S,v22.s[1] +add v6.4s, v6.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v13.s[2] +sub v5.4s, v2.4s, v4.4s +mul v0.4S, v0.4S,v22.s[2] +add v2.4s, v2.4s, v4.4s +mla v30.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v9.4s +ldr q13, [x0, #480] +sqrdmulh v22.4S, v6.4S, v17.s[1] +add v11.4s, v11.4s, v9.4s +mla v10.4S, v14.4S, v31.s[0] +ldr q14, [x0, #416] +sqrdmulh v9.4S, v7.4S, v17.s[2] +mla v28.4S, v15.4S, v31.s[0] +ldr q15, [x0, #288] +sqrdmulh v4.4S, v11.4S, v1.s[1] +mla v0.4S, v20.4S, v31.s[0] +ldr q20, [x17, #+256] +sqrdmulh v26.4S, v21.4S, v1.s[2] +ldr q25, [x17, #+272] +mul v6.4S, v6.4S,v19.s[1] +sub v24.4s, v8.4s, v30.4s +str q24, [x0, #16] +mul v7.4S, v7.4S,v19.s[2] +add v8.4s, v8.4s, v30.4s +str q8, [x0, #0] +mla v6.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v10.4s +str q22, [x0, #48] +mla v7.4S, v9.4S, v31.s[0] +add v18.4s, v18.4s, v10.4s +str q18, [x0, #32] +mul v11.4S, v11.4S,v3.s[1] +sub v17.4s, v16.4s, v28.4s +str q17, [x0, #80] +mul v21.4S, v21.4S,v3.s[2] +add v16.4s, v16.4s, v28.4s +str q16, [x0, #64] +mla v11.4S, v4.4S, v31.s[0] +sub v4.4s, v29.4s, v0.4s +str q4, [x0, #112] +mla v21.4S, v26.4S, v31.s[0] +add v29.4s, v29.4s, v0.4s +str q29, [x0, #96] +sqrdmulh v1.4S, v15.4S, v25.s[0] +sub v3.4s, v12.4s, v6.4s +mul v15.4S, v15.4S,v20.s[0] +str q3, [x0, #144] +ldr q3, 
[x0, #304] +sqrdmulh v29.4S, v3.4S, v25.s[0] +add v12.4s, v12.4s, v6.4s +mul v3.4S, v3.4S,v20.s[0] +str q12, [x0, #128] +ldr q12, [x17, #+288] +ldr q6, [x17, #+304] +ldr q0, [x0, #352] +sqrdmulh v26.4S, v0.4S, v6.s[0] +sub v4.4s, v27.4s, v7.4s +mul v0.4S, v0.4S,v12.s[0] +str q4, [x0, #176] +ldr q4, [x0, #368] +sqrdmulh v16.4S, v4.4S, v6.s[0] +add v27.4s, v27.4s, v7.4s +mul v4.4S, v4.4S,v12.s[0] +str q27, [x0, #160] +ldr q27, [x17, #+320] +ldr q7, [x17, #+336] +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v2.4s, v11.4s +sqrdmulh v28.4S, v14.4S, v7.s[0] +str q1, [x0, #208] +ldr q1, [x0, #432] +mla v3.4S, v29.4S, v31.s[0] +add v2.4s, v2.4s, v11.4s +sqrdmulh v11.4S, v1.4S, v7.s[0] +str q2, [x0, #192] +ldr q2, [x17, #+352] +ldr q29, [x17, #+368] +mla v0.4S, v26.4S, v31.s[0] +sub v26.4s, v5.4s, v21.4s +sqrdmulh v17.4S, v13.4S, v29.s[0] +str q26, [x0, #240] +ldr q26, [x0, #496] +mla v4.4S, v16.4S, v31.s[0] +add v5.4s, v5.4s, v21.4s +sqrdmulh v21.4S, v26.4S, v29.s[0] +str q5, [x0, #224] +ldr q5, [x0, #384] +ldr q16, [x0, #256] +mul v14.4S, v14.4S,v27.s[0] +sub v19.4s, v16.4s, v15.4s +mul v1.4S, v1.4S,v27.s[0] +add v16.4s, v16.4s, v15.4s +ldr q15, [x0, #400] +ldr q18, [x0, #272] +mla v14.4S, v28.4S, v31.s[0] +sub v28.4s, v18.4s, v3.4s +mla v1.4S, v11.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +ldr q3, [x0, #448] +ldr q11, [x0, #320] +mul v13.4S, v13.4S,v2.s[0] +sub v10.4s, v11.4s, v0.4s +mul v26.4S, v26.4S,v2.s[0] +add v11.4s, v11.4s, v0.4s +ldr q0, [x0, #464] +ldr q9, [x0, #336] +mla v13.4S, v17.4S, v31.s[0] +mla v26.4S, v21.4S, v31.s[0] +sub v21.4s, v9.4s, v4.4s +sqrdmulh v17.4S, v18.4S, v25.s[1] +mul v18.4S, v18.4S,v20.s[1] +add v9.4s, v9.4s, v4.4s +sqrdmulh v4.4S, v28.4S, v25.s[2] +sub v22.4s, v5.4s, v14.4s +mul v28.4S, v28.4S,v20.s[2] +add v5.4s, v5.4s, v14.4s +sqrdmulh v25.4S, v9.4S, v6.s[1] +sub v20.4s, v15.4s, v1.4s +mul v9.4S, v9.4S,v12.s[1] +add v15.4s, v15.4s, v1.4s +sqrdmulh v1.4S, v21.4S, v6.s[2] +sub v14.4s, v3.4s, v13.4s +mul v21.4S, v21.4S,v12.s[2] +add v3.4s, 
v3.4s, v13.4s +mla v18.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v26.4s +ldr q6, [x0, #736] +sqrdmulh v12.4S, v15.4S, v7.s[1] +add v0.4s, v0.4s, v26.4s +mla v28.4S, v4.4S, v31.s[0] +ldr q4, [x0, #672] +sqrdmulh v26.4S, v20.4S, v7.s[2] +mla v9.4S, v25.4S, v31.s[0] +ldr q25, [x0, #544] +sqrdmulh v13.4S, v0.4S, v29.s[1] +mla v21.4S, v1.4S, v31.s[0] +ldr q1, [x17, #+384] +sqrdmulh v8.4S, v17.4S, v29.s[2] +ldr q30, [x17, #+400] +mul v15.4S, v15.4S,v27.s[1] +sub v24.4s, v16.4s, v18.4s +str q24, [x0, #272] +mul v20.4S, v20.4S,v27.s[2] +add v16.4s, v16.4s, v18.4s +str q16, [x0, #256] +mla v15.4S, v12.4S, v31.s[0] +sub v12.4s, v19.4s, v28.4s +str q12, [x0, #304] +mla v20.4S, v26.4S, v31.s[0] +add v19.4s, v19.4s, v28.4s +str q19, [x0, #288] +mul v0.4S, v0.4S,v2.s[1] +sub v7.4s, v11.4s, v9.4s +str q7, [x0, #336] +mul v17.4S, v17.4S,v2.s[2] +add v11.4s, v11.4s, v9.4s +str q11, [x0, #320] +mla v0.4S, v13.4S, v31.s[0] +sub v13.4s, v10.4s, v21.4s +str q13, [x0, #368] +mla v17.4S, v8.4S, v31.s[0] +add v10.4s, v10.4s, v21.4s +str q10, [x0, #352] +sqrdmulh v29.4S, v25.4S, v30.s[0] +sub v2.4s, v5.4s, v15.4s +mul v25.4S, v25.4S,v1.s[0] +str q2, [x0, #400] +ldr q2, [x0, #560] +sqrdmulh v10.4S, v2.4S, v30.s[0] +add v5.4s, v5.4s, v15.4s +mul v2.4S, v2.4S,v1.s[0] +str q5, [x0, #384] +ldr q5, [x17, #+416] +ldr q15, [x17, #+432] +ldr q21, [x0, #608] +sqrdmulh v8.4S, v21.4S, v15.s[0] +sub v13.4s, v22.4s, v20.4s +mul v21.4S, v21.4S,v5.s[0] +str q13, [x0, #432] +ldr q13, [x0, #624] +sqrdmulh v11.4S, v13.4S, v15.s[0] +add v22.4s, v22.4s, v20.4s +mul v13.4S, v13.4S,v5.s[0] +str q22, [x0, #416] +ldr q22, [x17, #+448] +ldr q20, [x17, #+464] +mla v25.4S, v29.4S, v31.s[0] +sub v29.4s, v3.4s, v0.4s +sqrdmulh v9.4S, v4.4S, v20.s[0] +str q29, [x0, #464] +ldr q29, [x0, #688] +mla v2.4S, v10.4S, v31.s[0] +add v3.4s, v3.4s, v0.4s +sqrdmulh v0.4S, v29.4S, v20.s[0] +str q3, [x0, #448] +ldr q3, [x17, #+480] +ldr q10, [x17, #+496] +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v14.4s, v17.4s +sqrdmulh v7.4S, v6.4S, 
v10.s[0] +str q8, [x0, #496] +ldr q8, [x0, #752] +mla v13.4S, v11.4S, v31.s[0] +add v14.4s, v14.4s, v17.4s +sqrdmulh v17.4S, v8.4S, v10.s[0] +str q14, [x0, #480] +ldr q14, [x0, #640] +ldr q11, [x0, #512] +mul v4.4S, v4.4S,v22.s[0] +sub v27.4s, v11.4s, v25.4s +mul v29.4S, v29.4S,v22.s[0] +add v11.4s, v11.4s, v25.4s +ldr q25, [x0, #656] +ldr q19, [x0, #528] +mla v4.4S, v9.4S, v31.s[0] +sub v9.4s, v19.4s, v2.4s +mla v29.4S, v0.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +ldr q2, [x0, #704] +ldr q0, [x0, #576] +mul v6.4S, v6.4S,v3.s[0] +sub v28.4s, v0.4s, v21.4s +mul v8.4S, v8.4S,v3.s[0] +add v0.4s, v0.4s, v21.4s +ldr q21, [x0, #720] +ldr q26, [x0, #592] +mla v6.4S, v7.4S, v31.s[0] +mla v8.4S, v17.4S, v31.s[0] +sub v17.4s, v26.4s, v13.4s +sqrdmulh v7.4S, v19.4S, v30.s[1] +mul v19.4S, v19.4S,v1.s[1] +add v26.4s, v26.4s, v13.4s +sqrdmulh v13.4S, v9.4S, v30.s[2] +sub v12.4s, v14.4s, v4.4s +mul v9.4S, v9.4S,v1.s[2] +add v14.4s, v14.4s, v4.4s +sqrdmulh v30.4S, v26.4S, v15.s[1] +sub v1.4s, v25.4s, v29.4s +mul v26.4S, v26.4S,v5.s[1] +add v25.4s, v25.4s, v29.4s +sqrdmulh v29.4S, v17.4S, v15.s[2] +sub v4.4s, v2.4s, v6.4s +mul v17.4S, v17.4S,v5.s[2] +add v2.4s, v2.4s, v6.4s +mla v19.4S, v7.4S, v31.s[0] +sub v7.4s, v21.4s, v8.4s +ldr q15, [x0, #992] +sqrdmulh v5.4S, v25.4S, v20.s[1] +add v21.4s, v21.4s, v8.4s +mla v9.4S, v13.4S, v31.s[0] +ldr q13, [x0, #928] +sqrdmulh v8.4S, v1.4S, v20.s[2] +mla v26.4S, v30.4S, v31.s[0] +ldr q30, [x0, #800] +sqrdmulh v6.4S, v21.4S, v10.s[1] +mla v17.4S, v29.4S, v31.s[0] +ldr q29, [x17, #+512] +sqrdmulh v16.4S, v7.4S, v10.s[2] +ldr q18, [x17, #+528] +mul v25.4S, v25.4S,v22.s[1] +sub v24.4s, v11.4s, v19.4s +str q24, [x0, #528] +mul v1.4S, v1.4S,v22.s[2] +add v11.4s, v11.4s, v19.4s +str q11, [x0, #512] +mla v25.4S, v5.4S, v31.s[0] +sub v5.4s, v27.4s, v9.4s +str q5, [x0, #560] +mla v1.4S, v8.4S, v31.s[0] +add v27.4s, v27.4s, v9.4s +str q27, [x0, #544] +mul v21.4S, v21.4S,v3.s[1] +sub v20.4s, v0.4s, v26.4s +str q20, [x0, #592] +mul v7.4S, v7.4S,v3.s[2] 
+add v0.4s, v0.4s, v26.4s +str q0, [x0, #576] +mla v21.4S, v6.4S, v31.s[0] +sub v6.4s, v28.4s, v17.4s +str q6, [x0, #624] +mla v7.4S, v16.4S, v31.s[0] +add v28.4s, v28.4s, v17.4s +str q28, [x0, #608] +sqrdmulh v10.4S, v30.4S, v18.s[0] +sub v3.4s, v14.4s, v25.4s +mul v30.4S, v30.4S,v29.s[0] +str q3, [x0, #656] +ldr q3, [x0, #816] +sqrdmulh v28.4S, v3.4S, v18.s[0] +add v14.4s, v14.4s, v25.4s +mul v3.4S, v3.4S,v29.s[0] +str q14, [x0, #640] +ldr q14, [x17, #+544] +ldr q25, [x17, #+560] +ldr q17, [x0, #864] +sqrdmulh v16.4S, v17.4S, v25.s[0] +sub v6.4s, v12.4s, v1.4s +mul v17.4S, v17.4S,v14.s[0] +str q6, [x0, #688] +ldr q6, [x0, #880] +sqrdmulh v0.4S, v6.4S, v25.s[0] +add v12.4s, v12.4s, v1.4s +mul v6.4S, v6.4S,v14.s[0] +str q12, [x0, #672] +ldr q12, [x17, #+576] +ldr q1, [x17, #+592] +mla v30.4S, v10.4S, v31.s[0] +sub v10.4s, v2.4s, v21.4s +sqrdmulh v26.4S, v13.4S, v1.s[0] +str q10, [x0, #720] +ldr q10, [x0, #944] +mla v3.4S, v28.4S, v31.s[0] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v10.4S, v1.s[0] +str q2, [x0, #704] +ldr q2, [x17, #+608] +ldr q28, [x17, #+624] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v4.4s, v7.4s +sqrdmulh v20.4S, v15.4S, v28.s[0] +str q16, [x0, #752] +ldr q16, [x0, #1008] +mla v6.4S, v0.4S, v31.s[0] +add v4.4s, v4.4s, v7.4s +sqrdmulh v7.4S, v16.4S, v28.s[0] +str q4, [x0, #736] +ldr q4, [x0, #896] +ldr q0, [x0, #768] +mul v13.4S, v13.4S,v12.s[0] +sub v22.4s, v0.4s, v30.4s +mul v10.4S, v10.4S,v12.s[0] +add v0.4s, v0.4s, v30.4s +ldr q30, [x0, #912] +ldr q27, [x0, #784] +mla v13.4S, v26.4S, v31.s[0] +sub v26.4s, v27.4s, v3.4s +mla v10.4S, v21.4S, v31.s[0] +add v27.4s, v27.4s, v3.4s +ldr q3, [x0, #960] +ldr q21, [x0, #832] +mul v15.4S, v15.4S,v2.s[0] +sub v9.4s, v21.4s, v17.4s +mul v16.4S, v16.4S,v2.s[0] +add v21.4s, v21.4s, v17.4s +ldr q17, [x0, #976] +ldr q8, [x0, #848] +mla v15.4S, v20.4S, v31.s[0] +mla v16.4S, v7.4S, v31.s[0] +sub v7.4s, v8.4s, v6.4s +sqrdmulh v20.4S, v27.4S, v18.s[1] +mul v27.4S, v27.4S,v29.s[1] +add v8.4s, v8.4s, v6.4s 
+sqrdmulh v6.4S, v26.4S, v18.s[2] +sub v5.4s, v4.4s, v13.4s +mul v26.4S, v26.4S,v29.s[2] +add v4.4s, v4.4s, v13.4s +sqrdmulh v18.4S, v8.4S, v25.s[1] +sub v29.4s, v30.4s, v10.4s +mul v8.4S, v8.4S,v14.s[1] +add v30.4s, v30.4s, v10.4s +sqrdmulh v10.4S, v7.4S, v25.s[2] +sub v13.4s, v3.4s, v15.4s +mul v7.4S, v7.4S,v14.s[2] +add v3.4s, v3.4s, v15.4s +mla v27.4S, v20.4S, v31.s[0] +sub v20.4s, v17.4s, v16.4s +sqrdmulh v25.4S, v30.4S, v1.s[1] +add v17.4s, v17.4s, v16.4s +mla v26.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v29.4S, v1.s[2] +mla v8.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v17.4S, v28.s[1] +mla v7.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v20.4S, v28.s[2] +mul v30.4S, v30.4S,v12.s[1] +sub v16.4s, v0.4s, v27.4s +str q16, [x0, #784] +mul v29.4S, v29.4S,v12.s[2] +add v0.4s, v0.4s, v27.4s +str q0, [x0, #768] +mla v30.4S, v25.4S, v31.s[0] +sub v25.4s, v22.4s, v26.4s +str q25, [x0, #816] +mla v29.4S, v6.4S, v31.s[0] +add v22.4s, v22.4s, v26.4s +str q22, [x0, #800] +mul v17.4S, v17.4S,v2.s[1] +sub v1.4s, v21.4s, v8.4s +str q1, [x0, #848] +mul v20.4S, v20.4S,v2.s[2] +add v21.4s, v21.4s, v8.4s +str q21, [x0, #832] +mla v17.4S, v18.4S, v31.s[0] +sub v18.4s, v9.4s, v7.4s +str q18, [x0, #880] +mla v20.4S, v10.4S, v31.s[0] +add v9.4s, v9.4s, v7.4s +str q9, [x0, #864] +sub v28.4s, v4.4s, v30.4s +str q28, [x0, #912] +add v4.4s, v4.4s, v30.4s +str q4, [x0, #896] +sub v4.4s, v5.4s, v29.4s +str q4, [x0, #944] +add v5.4s, v5.4s, v29.4s +str q5, [x0, #928] +sub v5.4s, v3.4s, v17.4s +str q5, [x0, #976] +add v3.4s, v3.4s, v17.4s +str q3, [x0, #960] +sub v3.4s, v13.4s, v20.4s +str q3, [x0, #1008] +add v13.4s, v13.4s, v20.4s +str q13, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, 
#(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_0.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_0.s new file mode 100644 index 0000000..982af55 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_0.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +// NOTE(review): the #include above has no operand in this copy; it looks lost in extraction, confirm against upstream. +// modulus: the negated modulus -33556993 in word 0, followed by three zero words (16 bytes in total). +// The routine below loads these 16 bytes into q31 and uses v31.s[0] as the multiplier in its mla steps. +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +// roots_merged: table of NTT roots, aligned to 2^6 = 64 bytes by the .align directive that follows. +// Layout: groups of four 32-bit words, so each group fills one q register. +// Groups come in pairs: the routine below loads the first group of a pair as the by-element +// operand of mul and the second group as the by-element operand of sqrdmulh +// (visible for byte offsets 0..127; later pairs presumably follow the same scheme, confirm). +// The second-group values look like the first-group roots scaled by 2^31/33556993 +// (e.g. 17702291 -> 1132860160). NOTE(review): confirm against the generator. +// Entries tagged 'Layer None, block None' are zero words padding a three-root group to four words. +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global
ntt_u32_incomplete_neon_asm_var_4_2_7_z4_0 +.global _ntt_u32_incomplete_neon_asm_var_4_2_7_z4_0 +ntt_u32_incomplete_neon_asm_var_4_2_7_z4_0: +_ntt_u32_incomplete_neon_asm_var_4_2_7_z4_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #928] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #992] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q18, [x0, #800] +sqrdmulh v17.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +ldr q16, [x0, #864] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v22.4S, v21.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +mla v18.4S, v17.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +ldr q3, [x0, #544] +sqrdmulh v17.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q19, [x0, #608] +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q2, [x0, #672] +ldr q1, [x0, #416] +sqrdmulh v0.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v15.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #736] +ldr q14, [x0, #480] +sqrdmulh v13.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v12.4s, v14.4s, v20.4s +add v14.4s, v14.4s, v20.4s +ldr q20, [x0, #288] +mla v3.4S, v17.4S, v31.s[0] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v18.4s +mla v2.4S, v0.4S, v31.s[0] +mla v22.4S, v13.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +ldr q18, [x0, #352] +sqrdmulh v13.4S, v1.4S, 
v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v0.4s, v18.4s, v16.4s +sqrdmulh v17.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +add v18.4s, v18.4s, v16.4s +ldr q16, [x0, #32] +sqrdmulh v11.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v10.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #96] +sqrdmulh v9.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v8.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #160] +mla v1.4S, v13.4S, v31.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v2.4s +mla v20.4S, v11.4S, v31.s[0] +mla v18.4S, v9.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +ldr q2, [x0, #224] +sqrdmulh v9.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v11.4s, v2.4s, v22.4s +sqrdmulh v13.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v7.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +sub v6.4s, v2.4s, v14.4s +add v2.4s, v2.4s, v14.4s +mla v15.4S, v9.4S, v31.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v20.4s +mla v21.4S, v22.4S, v31.s[0] +mla v0.4S, v1.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +sub v1.4s, v3.4s, v18.4s +sqrdmulh v22.4S, v6.4S, v27.s[1] +mul v6.4S, v6.4S,v28.s[1] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v27.s[0] +mul v19.4S, v19.4S,v28.s[0] +sub v9.4s, v17.4s, v15.4s +add v17.4s, v17.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v14.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +mla v7.4S, v20.4S, v31.s[0] +mla v6.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v21.4s +mla v19.4S, v18.4S, v31.s[0] +mla v2.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v15.4s, v8.4s, v0.4s +sqrdmulh v18.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +add v8.4s, v8.4s, v0.4s +sqrdmulh 
v0.4S, v9.4S, v27.s[3] +mul v9.4S, v9.4S,v28.s[3] +sub v20.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[3] +mul v14.4S, v14.4S,v28.s[3] +sub v12.4s, v1.4s, v6.4s +add v1.4s, v1.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v16.4s, v19.4s +mla v9.4S, v0.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v25.s[2] +mul v1.4S, v1.4S,v26.s[2] +sub v7.4s, v3.4s, v2.4s +sqrdmulh v0.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v7.4S, v25.s[1] +mul v7.4S, v7.4S,v26.s[1] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v25.s[0] +mul v3.4S, v3.4S,v26.s[0] +sub v6.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v1.4S, v19.4S, v31.s[0] +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v9.4s +mla v7.4S, v2.4S, v31.s[0] +mla v3.4S, v17.4S, v31.s[0] +add v22.4s, v22.4s, v9.4s +sqrdmulh v9.4S, v8.4S, v23.s[0] +mul v8.4S, v8.4S,v24.s[0] +sub v17.4s, v15.4s, v14.4s +sqrdmulh v2.4S, v6.4S, v23.s[1] +mul v6.4S, v6.4S,v24.s[1] +add v15.4s, v15.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v23.s[2] +mul v15.4S, v15.4S,v24.s[2] +sub v19.4s, v13.4s, v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +sub v11.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +mla v8.4S, v9.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v18.4s, v7.4s +str q13, [x0, #288] +mla v15.4S, v14.4S, v31.s[0] +mla v17.4S, v1.4S, v31.s[0] +add v18.4s, v18.4s, v7.4s +str q19, [x0, #352] +ldr q19, [x0, #944] +sqrdmulh v7.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v1.4s, v16.4s, v3.4s +str q20, [x0, #416] +ldr q20, [x0, #1008] +sqrdmulh v14.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v3.4s +str q11, [x0, #480] +ldr q11, [x0, #816] +sqrdmulh v3.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +sub v13.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +ldr q8, 
[x0, #880] +sqrdmulh v9.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v12.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +mla v19.4S, v7.4S, v31.s[0] +mla v20.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v15.4s +str q18, [x0, #160] +mla v11.4S, v3.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +str q2, [x0, #224] +ldr q2, [x0, #560] +sqrdmulh v15.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v9.4s, v0.4s, v17.4s +str q16, [x0, #32] +ldr q16, [x0, #624] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +add v0.4s, v0.4s, v17.4s +str q1, [x0, #96] +ldr q1, [x0, #688] +ldr q17, [x0, #432] +sqrdmulh v18.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +sub v7.4s, v17.4s, v19.4s +add v17.4s, v17.4s, v19.4s +ldr q19, [x0, #752] +ldr q6, [x0, #496] +sqrdmulh v5.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v4.4s, v6.4s, v20.4s +add v6.4s, v6.4s, v20.4s +ldr q20, [x0, #304] +mla v2.4S, v15.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v11.4s +str q10, [x0, #544] +mla v1.4S, v18.4S, v31.s[0] +mla v19.4S, v5.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q13, [x0, #608] +ldr q13, [x0, #368] +sqrdmulh v11.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v5.4s, v13.4s, v8.4s +str q21, [x0, #672] +sqrdmulh v21.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +add v13.4s, v13.4s, v8.4s +str q12, [x0, #736] +ldr q12, [x0, #48] +sqrdmulh v8.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v18.4s, v12.4s, v2.4s +add v12.4s, v12.4s, v2.4s +ldr q2, [x0, #112] +sqrdmulh v10.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v15.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +ldr q16, [x0, #176] +mla v17.4S, v11.4S, v31.s[0] +mla v6.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v1.4s +str q22, [x0, #800] +mla v20.4S, v8.4S, v31.s[0] +mla v13.4S, v10.4S, v31.s[0] +add v16.4s, v16.4s, v1.4s +str q14, [x0, #864] +ldr q14, [x0, #240] +sqrdmulh v1.4S, v7.4S, v29.s[2] +mul v7.4S, v7.4S,v30.s[2] +sub v10.4s, v14.4s, v19.4s 
+str q0, [x0, #928] +sqrdmulh v0.4S, v4.4S, v29.s[2] +mul v4.4S, v4.4S,v30.s[2] +add v14.4s, v14.4s, v19.4s +str q9, [x0, #992] +sqrdmulh v9.4S, v3.4S, v29.s[2] +mul v3.4S, v3.4S,v30.s[2] +sub v19.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v29.s[2] +mul v5.4S, v5.4S,v30.s[2] +sub v8.4s, v14.4s, v6.4s +add v14.4s, v14.4s, v6.4s +mla v7.4S, v1.4S, v31.s[0] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v20.4s +mla v3.4S, v9.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +sub v17.4s, v2.4s, v13.4s +sqrdmulh v9.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +add v2.4s, v2.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v27.s[0] +mul v16.4S, v16.4S,v28.s[0] +sub v1.4s, v21.4s, v7.4s +add v21.4s, v21.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v6.4s, v10.4s, v4.4s +add v10.4s, v10.4s, v4.4s +mla v19.4S, v20.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v3.4s +mla v16.4S, v13.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +sub v7.4s, v15.4s, v5.4s +sqrdmulh v13.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +add v15.4s, v15.4s, v5.4s +sqrdmulh v5.4S, v1.4S, v27.s[3] +mul v1.4S, v1.4S,v28.s[3] +sub v20.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[3] +mul v6.4S, v6.4S,v28.s[3] +sub v4.4s, v17.4s, v8.4s +add v17.4s, v17.4s, v8.4s +mla v21.4S, v3.4S, v31.s[0] +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v16.4s +mla v1.4S, v5.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v25.s[2] +mul v17.4S, v17.4S,v26.s[2] +sub v19.4s, v2.4s, v14.4s +sqrdmulh v5.4S, v4.4S, v25.s[3] +mul v4.4S, v4.4S,v26.s[3] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v3.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, 
v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +mla v17.4S, v16.4S, v31.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v9.4s, v1.4s +mla v19.4S, v14.4S, v31.s[0] +mla v2.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v23.s[0] +mul v15.4S, v15.4S,v24.s[0] +sub v21.4s, v7.4s, v6.4s +sqrdmulh v14.4S, v8.4S, v23.s[1] +mul v8.4S, v8.4S,v24.s[1] +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v7.4S, v23.s[2] +mul v7.4S, v7.4S,v24.s[2] +sub v16.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v23.s[3] +mul v21.4S, v21.4S,v24.s[3] +sub v10.4s, v20.4s, v4.4s +add v20.4s, v20.4s, v4.4s +mla v15.4S, v1.4S, v31.s[0] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v19.4s +str q0, [x0, #304] +mla v7.4S, v6.4S, v31.s[0] +mla v21.4S, v17.4S, v31.s[0] +add v13.4s, v13.4s, v19.4s +str q16, [x0, #368] +ldr q16, [x0, #896] +sqrdmulh v19.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v17.4s, v12.4s, v2.4s +str q20, [x0, #432] +ldr q20, [x0, #960] +sqrdmulh v6.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v12.4s, v12.4s, v2.4s +str q10, [x0, #496] +ldr q10, [x0, #768] +sqrdmulh v2.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +sub v0.4s, v18.4s, v15.4s +add v18.4s, v18.4s, v15.4s +ldr q15, [x0, #832] +sqrdmulh v1.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +sub v4.4s, v3.4s, v8.4s +add v3.4s, v3.4s, v8.4s +mla v16.4S, v19.4S, v31.s[0] +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v7.4s +str q13, [x0, #176] +mla v10.4S, v2.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v7.4s +str q14, [x0, #240] +ldr q14, [x0, #512] +sqrdmulh v7.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v1.4s, v5.4s, v21.4s +str q12, [x0, #48] +ldr q12, [x0, #576] +sqrdmulh v2.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +add v5.4s, v5.4s, v21.4s +str q17, [x0, #112] +ldr q17, [x0, #640] +ldr q21, [x0, #384] +sqrdmulh v13.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] 
+sub v19.4s, v21.4s, v16.4s +add v21.4s, v21.4s, v16.4s +ldr q16, [x0, #704] +ldr q8, [x0, #448] +sqrdmulh v22.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v11.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +ldr q20, [x0, #256] +mla v14.4S, v7.4S, v31.s[0] +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v10.4s +str q18, [x0, #560] +mla v17.4S, v13.4S, v31.s[0] +mla v16.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +str q0, [x0, #624] +ldr q0, [x0, #320] +sqrdmulh v10.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v22.4s, v0.4s, v15.4s +str q3, [x0, #688] +sqrdmulh v3.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +add v0.4s, v0.4s, v15.4s +str q4, [x0, #752] +ldr q4, [x0, #0] +sqrdmulh v15.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v13.4s, v4.4s, v14.4s +add v4.4s, v4.4s, v14.4s +ldr q14, [x0, #64] +sqrdmulh v18.4S, v0.4S, v29.s[1] +mul v0.4S, v0.4S,v30.s[1] +sub v7.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +ldr q12, [x0, #128] +mla v21.4S, v10.4S, v31.s[0] +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v12.4s, v17.4s +str q9, [x0, #816] +mla v20.4S, v15.4S, v31.s[0] +mla v0.4S, v18.4S, v31.s[0] +add v12.4s, v12.4s, v17.4s +str q6, [x0, #880] +ldr q6, [x0, #192] +sqrdmulh v17.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v18.4s, v6.4s, v16.4s +str q5, [x0, #944] +sqrdmulh v5.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +add v6.4s, v6.4s, v16.4s +str q1, [x0, #1008] +sqrdmulh v1.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v16.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v15.4s, v6.4s, v8.4s +add v6.4s, v6.4s, v8.4s +mla v19.4S, v17.4S, v31.s[0] +mla v11.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v20.4s +mla v2.4S, v1.4S, v31.s[0] +mla v22.4S, v21.4S, v31.s[0] +add v4.4s, v4.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +sub v21.4s, v14.4s, v0.4s +sqrdmulh v1.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +add v14.4s, 
v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v17.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[0] +mul v6.4S, v6.4S,v28.s[0] +sub v8.4s, v18.4s, v11.4s +add v18.4s, v18.4s, v11.4s +mla v16.4S, v20.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v2.4s +mla v12.4S, v0.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v27.s[2] +mul v3.4S, v3.4S,v28.s[2] +sub v19.4s, v7.4s, v22.4s +sqrdmulh v0.4S, v18.4S, v27.s[2] +mul v18.4S, v18.4S,v28.s[2] +add v7.4s, v7.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +sub v20.4s, v5.4s, v16.4s +add v5.4s, v5.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v11.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +mla v3.4S, v2.4S, v31.s[0] +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v4.4s, v12.4s +mla v17.4S, v22.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v4.4s, v4.4s, v12.4s +sqrdmulh v12.4S, v21.4S, v25.s[2] +mul v21.4S, v21.4S,v26.s[2] +sub v16.4s, v14.4s, v6.4s +sqrdmulh v22.4S, v11.4S, v25.s[3] +mul v11.4S, v11.4S,v26.s[3] +add v14.4s, v14.4s, v6.4s +sqrdmulh v6.4S, v16.4S, v25.s[1] +mul v16.4S, v16.4S,v26.s[1] +sub v2.4s, v13.4s, v3.4s +add v13.4s, v13.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v25.s[0] +mul v14.4S, v14.4S,v26.s[0] +sub v15.4s, v7.4s, v18.4s +add v7.4s, v7.4s, v18.4s +mla v21.4S, v12.4S, v31.s[0] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v17.4s +mla v16.4S, v6.4S, v31.s[0] +mla v14.4S, v3.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v7.4S, v23.s[0] +mul v7.4S, v7.4S,v24.s[0] +sub v3.4s, v19.4s, v8.4s +sqrdmulh v6.4S, v15.4S, v23.s[1] +mul v15.4S, v15.4S,v24.s[1] +add v19.4s, v19.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v23.s[2] +mul v19.4S, v19.4S,v24.s[2] +sub v12.4s, v5.4s, v21.4s +add v5.4s, v5.4s, v21.4s +sqrdmulh v21.4S, v3.4S, v23.s[3] +mul v3.4S, v3.4S,v24.s[3] +sub v18.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +mla 
v7.4S, v17.4S, v31.s[0] +mla v15.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v16.4s +str q5, [x0, #256] +mla v19.4S, v8.4S, v31.s[0] +mla v3.4S, v21.4S, v31.s[0] +add v0.4s, v0.4s, v16.4s +str q12, [x0, #320] +ldr q12, [x0, #912] +sqrdmulh v16.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v21.4s, v4.4s, v14.4s +str q20, [x0, #384] +ldr q20, [x0, #976] +sqrdmulh v8.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v4.4s, v4.4s, v14.4s +str q18, [x0, #448] +ldr q18, [x0, #784] +sqrdmulh v14.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +sub v5.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +ldr q7, [x0, #848] +sqrdmulh v17.4S, v7.4S, v29.s[0] +mul v7.4S, v7.4S,v30.s[0] +sub v11.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +mla v12.4S, v16.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v1.4s, v19.4s +str q0, [x0, #128] +mla v18.4S, v14.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v19.4s +str q6, [x0, #192] +ldr q6, [x0, #528] +sqrdmulh v19.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v17.4s, v22.4s, v3.4s +str q4, [x0, #0] +ldr q4, [x0, #592] +sqrdmulh v14.4S, v4.4S, v29.s[0] +mul v4.4S, v4.4S,v30.s[0] +add v22.4s, v22.4s, v3.4s +str q21, [x0, #64] +ldr q21, [x0, #656] +ldr q3, [x0, #400] +sqrdmulh v0.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v16.4s, v3.4s, v12.4s +add v3.4s, v3.4s, v12.4s +ldr q12, [x0, #720] +ldr q15, [x0, #464] +sqrdmulh v9.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v10.4s, v15.4s, v20.4s +add v15.4s, v15.4s, v20.4s +ldr q20, [x0, #272] +mla v6.4S, v19.4S, v31.s[0] +mla v4.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v18.4s +str q13, [x0, #512] +mla v21.4S, v0.4S, v31.s[0] +mla v12.4S, v9.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +str q5, [x0, #576] +ldr q5, [x0, #336] +sqrdmulh v18.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v9.4s, v5.4s, v7.4s +str q2, [x0, #640] +sqrdmulh v2.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +add v5.4s, v5.4s, v7.4s +str q11, [x0, #704] +ldr 
q11, [x0, #16] +sqrdmulh v7.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v0.4s, v11.4s, v6.4s +add v11.4s, v11.4s, v6.4s +ldr q6, [x0, #80] +sqrdmulh v13.4S, v5.4S, v29.s[1] +mul v5.4S, v5.4S,v30.s[1] +sub v19.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +ldr q4, [x0, #144] +mla v3.4S, v18.4S, v31.s[0] +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v4.4s, v21.4s +str q1, [x0, #768] +mla v20.4S, v7.4S, v31.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v4.4s, v4.4s, v21.4s +str q8, [x0, #832] +ldr q8, [x0, #208] +sqrdmulh v21.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +sub v13.4s, v8.4s, v12.4s +str q22, [x0, #896] +sqrdmulh v22.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +add v8.4s, v8.4s, v12.4s +str q17, [x0, #960] +sqrdmulh v17.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v12.4s, v4.4s, v3.4s +add v4.4s, v4.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v7.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +mla v16.4S, v21.4S, v31.s[0] +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v20.4s +mla v14.4S, v17.4S, v31.s[0] +mla v9.4S, v3.4S, v31.s[0] +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v12.4S, v27.s[1] +mul v12.4S, v12.4S,v28.s[1] +sub v3.4s, v6.4s, v5.4s +sqrdmulh v17.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v27.s[0] +mul v4.4S, v4.4S,v28.s[0] +sub v21.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[0] +mul v8.4S, v8.4S,v28.s[0] +sub v15.4s, v13.4s, v10.4s +add v13.4s, v13.4s, v10.4s +mla v12.4S, v20.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v14.4s +mla v4.4S, v5.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v16.4s, v19.4s, v9.4s +sqrdmulh v5.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v22.4s, v12.4s +add v22.4s, v22.4s, 
v12.4s +sqrdmulh v12.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v10.4s, v3.4s, v7.4s +add v3.4s, v3.4s, v7.4s +mla v2.4S, v14.4S, v31.s[0] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v4.4s +mla v21.4S, v9.4S, v31.s[0] +mla v15.4S, v12.4S, v31.s[0] +add v11.4s, v11.4s, v4.4s +sqrdmulh v4.4S, v3.4S, v25.s[2] +mul v3.4S, v3.4S,v26.s[2] +sub v12.4s, v6.4s, v8.4s +sqrdmulh v9.4S, v10.4S, v25.s[3] +mul v10.4S, v10.4S,v26.s[3] +add v6.4s, v6.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +sub v14.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v6.4S, v25.s[0] +mul v6.4S, v6.4S,v26.s[0] +sub v7.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +mla v3.4S, v4.4S, v31.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v21.4s +mla v12.4S, v8.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v23.s[0] +mul v19.4S, v19.4S,v24.s[0] +sub v2.4s, v16.4s, v15.4s +sqrdmulh v8.4S, v7.4S, v23.s[1] +mul v7.4S, v7.4S,v24.s[1] +add v16.4s, v16.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +sub v4.4s, v22.4s, v3.4s +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v2.4S, v23.s[3] +mul v2.4S, v2.4S,v24.s[3] +sub v13.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +mla v19.4S, v21.4S, v31.s[0] +mla v7.4S, v8.4S, v31.s[0] +sub v8.4s, v5.4s, v12.4s +str q22, [x0, #272] +mla v16.4S, v15.4S, v31.s[0] +mla v2.4S, v3.4S, v31.s[0] +add v5.4s, v5.4s, v12.4s +str q4, [x0, #336] +sub v23.4s, v11.4s, v6.4s +str q20, [x0, #400] +add v11.4s, v11.4s, v6.4s +str q13, [x0, #464] +sub v13.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sub v19.4s, v14.4s, v7.4s +add v14.4s, v14.4s, v7.4s +sub v7.4s, v17.4s, v16.4s +str q5, [x0, #144] +add v17.4s, v17.4s, v16.4s +str q8, [x0, #208] +sub v8.4s, v9.4s, v2.4s +str q11, [x0, #16] +add v9.4s, v9.4s, v2.4s +str q23, [x0, #80] +str q0, [x0, #528] +str q13, [x0, #592] +str q14, [x0, #656] +str q19, [x0, #720] +str q17, [x0, #784] +str q7, [x0, #848] 
+str q9, [x0, #912] +str q8, [x0, #976] +ldr q18, [x17, #+128] +ldr q1, [x17, #+144] +ldr q10, [x17, #+160] +ldr q21, [x17, #+176] +ldr q22, [x17, #+192] +ldr q15, [x17, #+208] +ldr q3, [x17, #+224] +ldr q12, [x17, #+240] +ldr q4, [x0, #32] +ldr q30, [x0, #48] +ldr q29, [x0, #0] +ldr q28, [x0, #16] +sqrdmulh v27.4S, v4.4S, v1.s[0] +mul v4.4S, v4.4S,v18.s[0] +mla v4.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v4.4s +add v29.4s, v29.4s, v4.4s +sqrdmulh v4.4S, v30.4S, v1.s[0] +mul v30.4S, v30.4S,v18.s[0] +mla v30.4S, v4.4S, v31.s[0] +sub v4.4s, v28.4s, v30.4s +add v28.4s, v28.4s, v30.4s +ldr q30, [x17, #+256] +ldr q26, [x17, #+272] +sqrdmulh v25.4S, v28.4S, v1.s[1] +mul v28.4S, v28.4S,v18.s[1] +mla v28.4S, v25.4S, v31.s[0] +sub v25.4s, v29.4s, v28.4s +add v29.4s, v29.4s, v28.4s +sqrdmulh v28.4S, v4.4S, v1.s[2] +mul v4.4S, v4.4S,v18.s[2] +mla v4.4S, v28.4S, v31.s[0] +sub v28.4s, v27.4s, v4.4s +add v27.4s, v27.4s, v4.4s +str q29, [x0, #0] +str q25, [x0, #16] +str q27, [x0, #32] +str q28, [x0, #48] +ldr q28, [x0, #96] +ldr q27, [x0, #112] +ldr q25, [x0, #64] +ldr q29, [x0, #80] +sqrdmulh v4.4S, v28.4S, v21.s[0] +mul v28.4S, v28.4S,v10.s[0] +mla v28.4S, v4.4S, v31.s[0] +sub v4.4s, v25.4s, v28.4s +add v25.4s, v25.4s, v28.4s +sqrdmulh v28.4S, v27.4S, v21.s[0] +mul v27.4S, v27.4S,v10.s[0] +mla v27.4S, v28.4S, v31.s[0] +sub v28.4s, v29.4s, v27.4s +add v29.4s, v29.4s, v27.4s +ldr q27, [x17, #+288] +ldr q24, [x17, #+304] +sqrdmulh v20.4S, v29.4S, v21.s[1] +mul v29.4S, v29.4S,v10.s[1] +mla v29.4S, v20.4S, v31.s[0] +sub v20.4s, v25.4s, v29.4s +add v25.4s, v25.4s, v29.4s +sqrdmulh v29.4S, v28.4S, v21.s[2] +mul v28.4S, v28.4S,v10.s[2] +mla v28.4S, v29.4S, v31.s[0] +sub v29.4s, v4.4s, v28.4s +add v4.4s, v4.4s, v28.4s +str q25, [x0, #64] +str q20, [x0, #80] +str q4, [x0, #96] +str q29, [x0, #112] +ldr q29, [x0, #160] +ldr q4, [x0, #176] +ldr q20, [x0, #128] +ldr q25, [x0, #144] +sqrdmulh v28.4S, v29.4S, v15.s[0] +mul v29.4S, v29.4S,v22.s[0] +mla v29.4S, v28.4S, v31.s[0] +sub v28.4s, 
v20.4s, v29.4s +add v20.4s, v20.4s, v29.4s +sqrdmulh v29.4S, v4.4S, v15.s[0] +mul v4.4S, v4.4S,v22.s[0] +mla v4.4S, v29.4S, v31.s[0] +sub v29.4s, v25.4s, v4.4s +add v25.4s, v25.4s, v4.4s +ldr q4, [x17, #+320] +ldr q6, [x17, #+336] +sqrdmulh v5.4S, v25.4S, v15.s[1] +mul v25.4S, v25.4S,v22.s[1] +mla v25.4S, v5.4S, v31.s[0] +sub v5.4s, v20.4s, v25.4s +add v20.4s, v20.4s, v25.4s +sqrdmulh v25.4S, v29.4S, v15.s[2] +mul v29.4S, v29.4S,v22.s[2] +mla v29.4S, v25.4S, v31.s[0] +sub v25.4s, v28.4s, v29.4s +add v28.4s, v28.4s, v29.4s +str q20, [x0, #128] +str q5, [x0, #144] +str q28, [x0, #160] +str q25, [x0, #176] +ldr q25, [x0, #224] +ldr q28, [x0, #240] +ldr q5, [x0, #192] +ldr q20, [x0, #208] +sqrdmulh v29.4S, v25.4S, v12.s[0] +mul v25.4S, v25.4S,v3.s[0] +mla v25.4S, v29.4S, v31.s[0] +sub v29.4s, v5.4s, v25.4s +add v5.4s, v5.4s, v25.4s +sqrdmulh v25.4S, v28.4S, v12.s[0] +mul v28.4S, v28.4S,v3.s[0] +mla v28.4S, v25.4S, v31.s[0] +sub v25.4s, v20.4s, v28.4s +add v20.4s, v20.4s, v28.4s +ldr q28, [x17, #+352] +ldr q16, [x17, #+368] +sqrdmulh v11.4S, v20.4S, v12.s[1] +mul v20.4S, v20.4S,v3.s[1] +mla v20.4S, v11.4S, v31.s[0] +sub v11.4s, v5.4s, v20.4s +add v5.4s, v5.4s, v20.4s +sqrdmulh v20.4S, v25.4S, v12.s[2] +mul v25.4S, v25.4S,v3.s[2] +mla v25.4S, v20.4S, v31.s[0] +sub v20.4s, v29.4s, v25.4s +add v29.4s, v29.4s, v25.4s +str q5, [x0, #192] +str q11, [x0, #208] +str q29, [x0, #224] +str q20, [x0, #240] +ldr q20, [x0, #288] +ldr q29, [x0, #304] +ldr q11, [x0, #256] +ldr q5, [x0, #272] +sqrdmulh v25.4S, v20.4S, v26.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v20.4S, v25.4S, v31.s[0] +sub v25.4s, v11.4s, v20.4s +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v29.4S, v26.s[0] +mul v29.4S, v29.4S,v30.s[0] +mla v29.4S, v20.4S, v31.s[0] +sub v20.4s, v5.4s, v29.4s +add v5.4s, v5.4s, v29.4s +ldr q29, [x17, #+384] +ldr q2, [x17, #+400] +sqrdmulh v23.4S, v5.4S, v26.s[1] +mul v5.4S, v5.4S,v30.s[1] +mla v5.4S, v23.4S, v31.s[0] +sub v23.4s, v11.4s, v5.4s +add v11.4s, v11.4s, v5.4s +sqrdmulh v5.4S, 
v20.4S, v26.s[2] +mul v20.4S, v20.4S,v30.s[2] +mla v20.4S, v5.4S, v31.s[0] +sub v5.4s, v25.4s, v20.4s +add v25.4s, v25.4s, v20.4s +str q11, [x0, #256] +str q23, [x0, #272] +str q25, [x0, #288] +str q5, [x0, #304] +ldr q1, [x0, #352] +ldr q18, [x0, #368] +ldr q5, [x0, #320] +ldr q25, [x0, #336] +sqrdmulh v23.4S, v1.4S, v24.s[0] +mul v1.4S, v1.4S,v27.s[0] +mla v1.4S, v23.4S, v31.s[0] +sub v23.4s, v5.4s, v1.4s +add v5.4s, v5.4s, v1.4s +sqrdmulh v1.4S, v18.4S, v24.s[0] +mul v18.4S, v18.4S,v27.s[0] +mla v18.4S, v1.4S, v31.s[0] +sub v1.4s, v25.4s, v18.4s +add v25.4s, v25.4s, v18.4s +ldr q18, [x17, #+416] +ldr q11, [x17, #+432] +sqrdmulh v20.4S, v25.4S, v24.s[1] +mul v25.4S, v25.4S,v27.s[1] +mla v25.4S, v20.4S, v31.s[0] +sub v20.4s, v5.4s, v25.4s +add v5.4s, v5.4s, v25.4s +sqrdmulh v25.4S, v1.4S, v24.s[2] +mul v1.4S, v1.4S,v27.s[2] +mla v1.4S, v25.4S, v31.s[0] +sub v25.4s, v23.4s, v1.4s +add v23.4s, v23.4s, v1.4s +str q5, [x0, #320] +str q20, [x0, #336] +str q23, [x0, #352] +str q25, [x0, #368] +ldr q21, [x0, #416] +ldr q10, [x0, #432] +ldr q25, [x0, #384] +ldr q23, [x0, #400] +sqrdmulh v20.4S, v21.4S, v6.s[0] +mul v21.4S, v21.4S,v4.s[0] +mla v21.4S, v20.4S, v31.s[0] +sub v20.4s, v25.4s, v21.4s +add v25.4s, v25.4s, v21.4s +sqrdmulh v21.4S, v10.4S, v6.s[0] +mul v10.4S, v10.4S,v4.s[0] +mla v10.4S, v21.4S, v31.s[0] +sub v21.4s, v23.4s, v10.4s +add v23.4s, v23.4s, v10.4s +ldr q10, [x17, #+448] +ldr q5, [x17, #+464] +sqrdmulh v1.4S, v23.4S, v6.s[1] +mul v23.4S, v23.4S,v4.s[1] +mla v23.4S, v1.4S, v31.s[0] +sub v1.4s, v25.4s, v23.4s +add v25.4s, v25.4s, v23.4s +sqrdmulh v23.4S, v21.4S, v6.s[2] +mul v21.4S, v21.4S,v4.s[2] +mla v21.4S, v23.4S, v31.s[0] +sub v23.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +str q25, [x0, #384] +str q1, [x0, #400] +str q20, [x0, #416] +str q23, [x0, #432] +ldr q15, [x0, #480] +ldr q22, [x0, #496] +ldr q23, [x0, #448] +ldr q20, [x0, #464] +sqrdmulh v1.4S, v15.4S, v16.s[0] +mul v15.4S, v15.4S,v28.s[0] +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v23.4s, 
v15.4s +add v23.4s, v23.4s, v15.4s +sqrdmulh v15.4S, v22.4S, v16.s[0] +mul v22.4S, v22.4S,v28.s[0] +mla v22.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v22.4s +add v20.4s, v20.4s, v22.4s +ldr q22, [x17, #+480] +ldr q25, [x17, #+496] +sqrdmulh v21.4S, v20.4S, v16.s[1] +mul v20.4S, v20.4S,v28.s[1] +mla v20.4S, v21.4S, v31.s[0] +sub v21.4s, v23.4s, v20.4s +add v23.4s, v23.4s, v20.4s +sqrdmulh v20.4S, v15.4S, v16.s[2] +mul v15.4S, v15.4S,v28.s[2] +mla v15.4S, v20.4S, v31.s[0] +sub v20.4s, v1.4s, v15.4s +add v1.4s, v1.4s, v15.4s +str q23, [x0, #448] +str q21, [x0, #464] +str q1, [x0, #480] +str q20, [x0, #496] +ldr q12, [x0, #544] +ldr q3, [x0, #560] +ldr q20, [x0, #512] +ldr q1, [x0, #528] +sqrdmulh v21.4S, v12.4S, v2.s[0] +mul v12.4S, v12.4S,v29.s[0] +mla v12.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v2.s[0] +mul v3.4S, v3.4S,v29.s[0] +mla v3.4S, v12.4S, v31.s[0] +sub v12.4s, v1.4s, v3.4s +add v1.4s, v1.4s, v3.4s +ldr q3, [x17, #+512] +ldr q23, [x17, #+528] +sqrdmulh v15.4S, v1.4S, v2.s[1] +mul v1.4S, v1.4S,v29.s[1] +mla v1.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v1.4s +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v2.s[2] +mul v12.4S, v12.4S,v29.s[2] +mla v12.4S, v1.4S, v31.s[0] +sub v1.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +str q20, [x0, #512] +str q15, [x0, #528] +str q21, [x0, #544] +str q1, [x0, #560] +ldr q26, [x0, #608] +ldr q30, [x0, #624] +ldr q1, [x0, #576] +ldr q21, [x0, #592] +sqrdmulh v15.4S, v26.4S, v11.s[0] +mul v26.4S, v26.4S,v18.s[0] +mla v26.4S, v15.4S, v31.s[0] +sub v15.4s, v1.4s, v26.4s +add v1.4s, v1.4s, v26.4s +sqrdmulh v26.4S, v30.4S, v11.s[0] +mul v30.4S, v30.4S,v18.s[0] +mla v30.4S, v26.4S, v31.s[0] +sub v26.4s, v21.4s, v30.4s +add v21.4s, v21.4s, v30.4s +ldr q30, [x17, #+544] +ldr q20, [x17, #+560] +sqrdmulh v12.4S, v21.4S, v11.s[1] +mul v21.4S, v21.4S,v18.s[1] +mla v21.4S, v12.4S, v31.s[0] +sub v12.4s, v1.4s, v21.4s +add v1.4s, v1.4s, v21.4s +sqrdmulh v21.4S, v26.4S, 
v11.s[2] +mul v26.4S, v26.4S,v18.s[2] +mla v26.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v26.4s +add v15.4s, v15.4s, v26.4s +str q1, [x0, #576] +str q12, [x0, #592] +str q15, [x0, #608] +str q21, [x0, #624] +ldr q24, [x0, #672] +ldr q27, [x0, #688] +ldr q21, [x0, #640] +ldr q15, [x0, #656] +sqrdmulh v12.4S, v24.4S, v5.s[0] +mul v24.4S, v24.4S,v10.s[0] +mla v24.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v24.4s +add v21.4s, v21.4s, v24.4s +sqrdmulh v24.4S, v27.4S, v5.s[0] +mul v27.4S, v27.4S,v10.s[0] +mla v27.4S, v24.4S, v31.s[0] +sub v24.4s, v15.4s, v27.4s +add v15.4s, v15.4s, v27.4s +ldr q27, [x17, #+576] +ldr q1, [x17, #+592] +sqrdmulh v26.4S, v15.4S, v5.s[1] +mul v15.4S, v15.4S,v10.s[1] +mla v15.4S, v26.4S, v31.s[0] +sub v26.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +sqrdmulh v15.4S, v24.4S, v5.s[2] +mul v24.4S, v24.4S,v10.s[2] +mla v24.4S, v15.4S, v31.s[0] +sub v15.4s, v12.4s, v24.4s +add v12.4s, v12.4s, v24.4s +str q21, [x0, #640] +str q26, [x0, #656] +str q12, [x0, #672] +str q15, [x0, #688] +ldr q6, [x0, #736] +ldr q4, [x0, #752] +ldr q15, [x0, #704] +ldr q12, [x0, #720] +sqrdmulh v26.4S, v6.4S, v25.s[0] +mul v6.4S, v6.4S,v22.s[0] +mla v6.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v6.4s +add v15.4s, v15.4s, v6.4s +sqrdmulh v6.4S, v4.4S, v25.s[0] +mul v4.4S, v4.4S,v22.s[0] +mla v4.4S, v6.4S, v31.s[0] +sub v6.4s, v12.4s, v4.4s +add v12.4s, v12.4s, v4.4s +ldr q4, [x17, #+608] +ldr q21, [x17, #+624] +sqrdmulh v24.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v22.s[1] +mla v12.4S, v24.4S, v31.s[0] +sub v24.4s, v15.4s, v12.4s +add v15.4s, v15.4s, v12.4s +sqrdmulh v12.4S, v6.4S, v25.s[2] +mul v6.4S, v6.4S,v22.s[2] +mla v6.4S, v12.4S, v31.s[0] +sub v12.4s, v26.4s, v6.4s +add v26.4s, v26.4s, v6.4s +str q15, [x0, #704] +str q24, [x0, #720] +str q26, [x0, #736] +str q12, [x0, #752] +ldr q16, [x0, #800] +ldr q28, [x0, #816] +ldr q12, [x0, #768] +ldr q26, [x0, #784] +sqrdmulh v24.4S, v16.4S, v23.s[0] +mul v16.4S, v16.4S,v3.s[0] +mla v16.4S, v24.4S, v31.s[0] +sub v24.4s, 
v12.4s, v16.4s +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v28.4S, v23.s[0] +mul v28.4S, v28.4S,v3.s[0] +mla v28.4S, v16.4S, v31.s[0] +sub v16.4s, v26.4s, v28.4s +add v26.4s, v26.4s, v28.4s +sqrdmulh v28.4S, v26.4S, v23.s[1] +mul v26.4S, v26.4S,v3.s[1] +mla v26.4S, v28.4S, v31.s[0] +sub v28.4s, v12.4s, v26.4s +add v12.4s, v12.4s, v26.4s +sqrdmulh v26.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v3.s[2] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v24.4s, v16.4s +add v24.4s, v24.4s, v16.4s +str q12, [x0, #768] +str q28, [x0, #784] +str q24, [x0, #800] +str q26, [x0, #816] +ldr q2, [x0, #864] +ldr q29, [x0, #880] +ldr q26, [x0, #832] +ldr q24, [x0, #848] +sqrdmulh v28.4S, v2.4S, v20.s[0] +mul v2.4S, v2.4S,v30.s[0] +mla v2.4S, v28.4S, v31.s[0] +sub v28.4s, v26.4s, v2.4s +add v26.4s, v26.4s, v2.4s +sqrdmulh v2.4S, v29.4S, v20.s[0] +mul v29.4S, v29.4S,v30.s[0] +mla v29.4S, v2.4S, v31.s[0] +sub v2.4s, v24.4s, v29.4s +add v24.4s, v24.4s, v29.4s +sqrdmulh v29.4S, v24.4S, v20.s[1] +mul v24.4S, v24.4S,v30.s[1] +mla v24.4S, v29.4S, v31.s[0] +sub v29.4s, v26.4s, v24.4s +add v26.4s, v26.4s, v24.4s +sqrdmulh v24.4S, v2.4S, v20.s[2] +mul v2.4S, v2.4S,v30.s[2] +mla v2.4S, v24.4S, v31.s[0] +sub v24.4s, v28.4s, v2.4s +add v28.4s, v28.4s, v2.4s +str q26, [x0, #832] +str q29, [x0, #848] +str q28, [x0, #864] +str q24, [x0, #880] +ldr q11, [x0, #928] +ldr q18, [x0, #944] +ldr q24, [x0, #896] +ldr q28, [x0, #912] +sqrdmulh v29.4S, v11.4S, v1.s[0] +mul v11.4S, v11.4S,v27.s[0] +mla v11.4S, v29.4S, v31.s[0] +sub v29.4s, v24.4s, v11.4s +add v24.4s, v24.4s, v11.4s +sqrdmulh v11.4S, v18.4S, v1.s[0] +mul v18.4S, v18.4S,v27.s[0] +mla v18.4S, v11.4S, v31.s[0] +sub v11.4s, v28.4s, v18.4s +add v28.4s, v28.4s, v18.4s +sqrdmulh v18.4S, v28.4S, v1.s[1] +mul v28.4S, v28.4S,v27.s[1] +mla v28.4S, v18.4S, v31.s[0] +sub v18.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +sqrdmulh v28.4S, v11.4S, v1.s[2] +mul v11.4S, v11.4S,v27.s[2] +mla v11.4S, v28.4S, v31.s[0] +sub v28.4s, v29.4s, v11.4s +add v29.4s, v29.4s, 
v11.4s +str q24, [x0, #896] +str q18, [x0, #912] +str q29, [x0, #928] +str q28, [x0, #944] +ldr q5, [x0, #992] +ldr q10, [x0, #1008] +ldr q28, [x0, #960] +ldr q29, [x0, #976] +sqrdmulh v18.4S, v5.4S, v21.s[0] +mul v5.4S, v5.4S,v4.s[0] +mla v5.4S, v18.4S, v31.s[0] +sub v18.4s, v28.4s, v5.4s +add v28.4s, v28.4s, v5.4s +sqrdmulh v5.4S, v10.4S, v21.s[0] +mul v10.4S, v10.4S,v4.s[0] +mla v10.4S, v5.4S, v31.s[0] +sub v5.4s, v29.4s, v10.4s +add v29.4s, v29.4s, v10.4s +sqrdmulh v10.4S, v29.4S, v21.s[1] +mul v29.4S, v29.4S,v4.s[1] +mla v29.4S, v10.4S, v31.s[0] +sub v10.4s, v28.4s, v29.4s +add v28.4s, v28.4s, v29.4s +sqrdmulh v29.4S, v5.4S, v21.s[2] +mul v5.4S, v5.4S,v4.s[2] +mla v5.4S, v29.4S, v31.s[0] +sub v29.4s, v18.4s, v5.4s +add v18.4s, v18.4s, v5.4s +str q28, [x0, #960] +str q10, [x0, #976] +str q18, [x0, #992] +str q29, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_1.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_1.s new file mode 100644 index 0000000..ab592c3 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_1.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, 
and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include // NOTE(review): the include operand is missing in this copy; ASM_LOAD, used by the routine below, is not defined in the visible part of this file and presumably comes from that header - confirm against the generator output +modulus: // one 16-byte row: -33556993 in word 0, zeros in words 1-3; the routine below loads it into q31 and the visible code uses only lane 0, as the mla multiplier +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: // constant table, read by the routine below one 16-byte row (four words) per ldr q. Rows are paired: the first row of a pair feeds mul, the second row holds the companion constants for the same Layer/block entries and feeds sqrdmulh (e.g. 1132860160 is about 17702291 * 2^31 / 33556993); the two products are then combined by mla with the modulus row. Three-entry rows are padded with a zero word tagged Layer None. This pairing is visible for rows 0-7 in the code shown here; later rows presumably follow it - confirm against the generator +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 
+.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 
+.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 
656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global ntt_u32_incomplete_neon_asm_var_4_2_7_z4_1 +.global _ntt_u32_incomplete_neon_asm_var_4_2_7_z4_1 +ntt_u32_incomplete_neon_asm_var_4_2_7_z4_1: +_ntt_u32_incomplete_neon_asm_var_4_2_7_z4_1: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #928] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #992] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q18, [x0, #800] +sqrdmulh v17.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +ldr q16, [x0, #864] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, 
v16.4S,v30.s[0] +mla v22.4S, v21.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +mla v18.4S, v17.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +ldr q3, [x0, #544] +sqrdmulh v17.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q19, [x0, #608] +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q2, [x0, #672] +ldr q1, [x0, #416] +sqrdmulh v0.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v15.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #736] +ldr q14, [x0, #480] +sqrdmulh v13.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v12.4s, v14.4s, v20.4s +add v14.4s, v14.4s, v20.4s +ldr q20, [x0, #288] +mla v3.4S, v17.4S, v31.s[0] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v18.4s +mla v2.4S, v0.4S, v31.s[0] +mla v22.4S, v13.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +ldr q18, [x0, #352] +sqrdmulh v13.4S, v1.4S, v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v0.4s, v18.4s, v16.4s +sqrdmulh v17.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +add v18.4s, v18.4s, v16.4s +ldr q16, [x0, #32] +sqrdmulh v11.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v10.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #96] +sqrdmulh v9.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v8.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #160] +mla v1.4S, v13.4S, v31.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v2.4s +mla v20.4S, v11.4S, v31.s[0] +mla v18.4S, v9.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +ldr q2, [x0, #224] +sqrdmulh v9.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v11.4s, v2.4s, v22.4s +sqrdmulh v13.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v7.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +sub v6.4s, v2.4s, v14.4s +add v2.4s, v2.4s, v14.4s +mla v15.4S, v9.4S, v31.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v20.4s +mla 
v21.4S, v22.4S, v31.s[0] +mla v0.4S, v1.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +sub v1.4s, v3.4s, v18.4s +sqrdmulh v22.4S, v6.4S, v27.s[1] +mul v6.4S, v6.4S,v28.s[1] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v27.s[0] +mul v19.4S, v19.4S,v28.s[0] +sub v9.4s, v17.4s, v15.4s +add v17.4s, v17.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v14.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +mla v7.4S, v20.4S, v31.s[0] +mla v6.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v21.4s +mla v19.4S, v18.4S, v31.s[0] +mla v2.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v15.4s, v8.4s, v0.4s +sqrdmulh v18.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +add v8.4s, v8.4s, v0.4s +sqrdmulh v0.4S, v9.4S, v27.s[3] +mul v9.4S, v9.4S,v28.s[3] +sub v20.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[3] +mul v14.4S, v14.4S,v28.s[3] +sub v12.4s, v1.4s, v6.4s +add v1.4s, v1.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v16.4s, v19.4s +mla v9.4S, v0.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v25.s[2] +mul v1.4S, v1.4S,v26.s[2] +sub v7.4s, v3.4s, v2.4s +sqrdmulh v0.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v7.4S, v25.s[1] +mul v7.4S, v7.4S,v26.s[1] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v25.s[0] +mul v3.4S, v3.4S,v26.s[0] +sub v6.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v1.4S, v19.4S, v31.s[0] +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v9.4s +mla v7.4S, v2.4S, v31.s[0] +mla v3.4S, v17.4S, v31.s[0] +add v22.4s, v22.4s, v9.4s +sqrdmulh v9.4S, v8.4S, v23.s[0] +mul v8.4S, v8.4S,v24.s[0] +sub v17.4s, v15.4s, v14.4s +sqrdmulh v2.4S, v6.4S, v23.s[1] +mul v6.4S, v6.4S,v24.s[1] +add v15.4s, v15.4s, v14.4s 
+sqrdmulh v14.4S, v15.4S, v23.s[2] +mul v15.4S, v15.4S,v24.s[2] +sub v19.4s, v13.4s, v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +sub v11.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +mla v8.4S, v9.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v18.4s, v7.4s +str q13, [x0, #288] +mla v15.4S, v14.4S, v31.s[0] +mla v17.4S, v1.4S, v31.s[0] +add v18.4s, v18.4s, v7.4s +str q19, [x0, #352] +ldr q19, [x0, #944] +sqrdmulh v7.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v1.4s, v16.4s, v3.4s +str q20, [x0, #416] +ldr q20, [x0, #1008] +sqrdmulh v14.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v3.4s +str q11, [x0, #480] +ldr q11, [x0, #816] +sqrdmulh v3.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +sub v13.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +ldr q8, [x0, #880] +sqrdmulh v9.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v12.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +mla v19.4S, v7.4S, v31.s[0] +mla v20.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v15.4s +str q18, [x0, #160] +mla v11.4S, v3.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +str q2, [x0, #224] +ldr q2, [x0, #560] +sqrdmulh v15.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v9.4s, v0.4s, v17.4s +str q16, [x0, #32] +ldr q16, [x0, #624] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +add v0.4s, v0.4s, v17.4s +str q1, [x0, #96] +ldr q1, [x0, #688] +ldr q17, [x0, #432] +sqrdmulh v18.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +sub v7.4s, v17.4s, v19.4s +add v17.4s, v17.4s, v19.4s +ldr q19, [x0, #752] +ldr q6, [x0, #496] +sqrdmulh v5.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v4.4s, v6.4s, v20.4s +add v6.4s, v6.4s, v20.4s +ldr q20, [x0, #304] +mla v2.4S, v15.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v11.4s +str q10, [x0, #544] +mla v1.4S, v18.4S, v31.s[0] +mla v19.4S, v5.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q13, [x0, #608] 
+ldr q13, [x0, #368] +sqrdmulh v11.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v5.4s, v13.4s, v8.4s +str q21, [x0, #672] +sqrdmulh v21.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +add v13.4s, v13.4s, v8.4s +str q12, [x0, #736] +ldr q12, [x0, #48] +sqrdmulh v8.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v18.4s, v12.4s, v2.4s +add v12.4s, v12.4s, v2.4s +ldr q2, [x0, #112] +sqrdmulh v10.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v15.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +ldr q16, [x0, #176] +mla v17.4S, v11.4S, v31.s[0] +mla v6.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v1.4s +str q22, [x0, #800] +mla v20.4S, v8.4S, v31.s[0] +mla v13.4S, v10.4S, v31.s[0] +add v16.4s, v16.4s, v1.4s +str q14, [x0, #864] +ldr q14, [x0, #240] +sqrdmulh v1.4S, v7.4S, v29.s[2] +mul v7.4S, v7.4S,v30.s[2] +sub v10.4s, v14.4s, v19.4s +str q0, [x0, #928] +sqrdmulh v0.4S, v4.4S, v29.s[2] +mul v4.4S, v4.4S,v30.s[2] +add v14.4s, v14.4s, v19.4s +str q9, [x0, #992] +sqrdmulh v9.4S, v3.4S, v29.s[2] +mul v3.4S, v3.4S,v30.s[2] +sub v19.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v29.s[2] +mul v5.4S, v5.4S,v30.s[2] +sub v8.4s, v14.4s, v6.4s +add v14.4s, v14.4s, v6.4s +mla v7.4S, v1.4S, v31.s[0] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v20.4s +mla v3.4S, v9.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +sub v17.4s, v2.4s, v13.4s +sqrdmulh v9.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +add v2.4s, v2.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v27.s[0] +mul v16.4S, v16.4S,v28.s[0] +sub v1.4s, v21.4s, v7.4s +add v21.4s, v21.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v6.4s, v10.4s, v4.4s +add v10.4s, v10.4s, v4.4s +mla v19.4S, v20.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v3.4s +mla v16.4S, v13.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[2] 
+mul v21.4S, v21.4S,v28.s[2] +sub v7.4s, v15.4s, v5.4s +sqrdmulh v13.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +add v15.4s, v15.4s, v5.4s +sqrdmulh v5.4S, v1.4S, v27.s[3] +mul v1.4S, v1.4S,v28.s[3] +sub v20.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[3] +mul v6.4S, v6.4S,v28.s[3] +sub v4.4s, v17.4s, v8.4s +add v17.4s, v17.4s, v8.4s +mla v21.4S, v3.4S, v31.s[0] +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v16.4s +mla v1.4S, v5.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v25.s[2] +mul v17.4S, v17.4S,v26.s[2] +sub v19.4s, v2.4s, v14.4s +sqrdmulh v5.4S, v4.4S, v25.s[3] +mul v4.4S, v4.4S,v26.s[3] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v3.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +mla v17.4S, v16.4S, v31.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v9.4s, v1.4s +mla v19.4S, v14.4S, v31.s[0] +mla v2.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v23.s[0] +mul v15.4S, v15.4S,v24.s[0] +sub v21.4s, v7.4s, v6.4s +sqrdmulh v14.4S, v8.4S, v23.s[1] +mul v8.4S, v8.4S,v24.s[1] +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v7.4S, v23.s[2] +mul v7.4S, v7.4S,v24.s[2] +sub v16.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v23.s[3] +mul v21.4S, v21.4S,v24.s[3] +sub v10.4s, v20.4s, v4.4s +add v20.4s, v20.4s, v4.4s +mla v15.4S, v1.4S, v31.s[0] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v19.4s +str q0, [x0, #304] +mla v7.4S, v6.4S, v31.s[0] +mla v21.4S, v17.4S, v31.s[0] +add v13.4s, v13.4s, v19.4s +str q16, [x0, #368] +ldr q16, [x0, #896] +sqrdmulh v19.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v17.4s, v12.4s, v2.4s +str q20, [x0, #432] +ldr q20, [x0, #960] +sqrdmulh v6.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v12.4s, v12.4s, v2.4s +str q10, 
[x0, #496] +ldr q10, [x0, #768] +sqrdmulh v2.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +sub v0.4s, v18.4s, v15.4s +add v18.4s, v18.4s, v15.4s +ldr q15, [x0, #832] +sqrdmulh v1.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +sub v4.4s, v3.4s, v8.4s +add v3.4s, v3.4s, v8.4s +mla v16.4S, v19.4S, v31.s[0] +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v7.4s +str q13, [x0, #176] +mla v10.4S, v2.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v7.4s +str q14, [x0, #240] +ldr q14, [x0, #512] +sqrdmulh v7.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v1.4s, v5.4s, v21.4s +str q12, [x0, #48] +ldr q12, [x0, #576] +sqrdmulh v2.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +add v5.4s, v5.4s, v21.4s +str q17, [x0, #112] +ldr q17, [x0, #640] +ldr q21, [x0, #384] +sqrdmulh v13.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] +sub v19.4s, v21.4s, v16.4s +add v21.4s, v21.4s, v16.4s +ldr q16, [x0, #704] +ldr q8, [x0, #448] +sqrdmulh v22.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v11.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +ldr q20, [x0, #256] +mla v14.4S, v7.4S, v31.s[0] +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v10.4s +str q18, [x0, #560] +mla v17.4S, v13.4S, v31.s[0] +mla v16.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +str q0, [x0, #624] +ldr q0, [x0, #320] +sqrdmulh v10.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v22.4s, v0.4s, v15.4s +str q3, [x0, #688] +sqrdmulh v3.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +add v0.4s, v0.4s, v15.4s +str q4, [x0, #752] +ldr q4, [x0, #0] +sqrdmulh v15.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v13.4s, v4.4s, v14.4s +add v4.4s, v4.4s, v14.4s +ldr q14, [x0, #64] +sqrdmulh v18.4S, v0.4S, v29.s[1] +mul v0.4S, v0.4S,v30.s[1] +sub v7.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +ldr q12, [x0, #128] +mla v21.4S, v10.4S, v31.s[0] +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v12.4s, v17.4s +str q9, [x0, #816] +mla v20.4S, v15.4S, v31.s[0] +mla v0.4S, v18.4S, 
v31.s[0] +add v12.4s, v12.4s, v17.4s +str q6, [x0, #880] +ldr q6, [x0, #192] +sqrdmulh v17.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v18.4s, v6.4s, v16.4s +str q5, [x0, #944] +sqrdmulh v5.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +add v6.4s, v6.4s, v16.4s +str q1, [x0, #1008] +sqrdmulh v1.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v16.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v15.4s, v6.4s, v8.4s +add v6.4s, v6.4s, v8.4s +mla v19.4S, v17.4S, v31.s[0] +mla v11.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v20.4s +mla v2.4S, v1.4S, v31.s[0] +mla v22.4S, v21.4S, v31.s[0] +add v4.4s, v4.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +sub v21.4s, v14.4s, v0.4s +sqrdmulh v1.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v17.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[0] +mul v6.4S, v6.4S,v28.s[0] +sub v8.4s, v18.4s, v11.4s +add v18.4s, v18.4s, v11.4s +mla v16.4S, v20.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v2.4s +mla v12.4S, v0.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v27.s[2] +mul v3.4S, v3.4S,v28.s[2] +sub v19.4s, v7.4s, v22.4s +sqrdmulh v0.4S, v18.4S, v27.s[2] +mul v18.4S, v18.4S,v28.s[2] +add v7.4s, v7.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +sub v20.4s, v5.4s, v16.4s +add v5.4s, v5.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v11.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +mla v3.4S, v2.4S, v31.s[0] +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v4.4s, v12.4s +mla v17.4S, v22.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v4.4s, v4.4s, v12.4s +sqrdmulh v12.4S, v21.4S, v25.s[2] +mul v21.4S, v21.4S,v26.s[2] +sub v16.4s, v14.4s, v6.4s +sqrdmulh v22.4S, v11.4S, v25.s[3] +mul v11.4S, 
v11.4S,v26.s[3] +add v14.4s, v14.4s, v6.4s +sqrdmulh v6.4S, v16.4S, v25.s[1] +mul v16.4S, v16.4S,v26.s[1] +sub v2.4s, v13.4s, v3.4s +add v13.4s, v13.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v25.s[0] +mul v14.4S, v14.4S,v26.s[0] +sub v15.4s, v7.4s, v18.4s +add v7.4s, v7.4s, v18.4s +mla v21.4S, v12.4S, v31.s[0] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v17.4s +mla v16.4S, v6.4S, v31.s[0] +mla v14.4S, v3.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v7.4S, v23.s[0] +mul v7.4S, v7.4S,v24.s[0] +sub v3.4s, v19.4s, v8.4s +sqrdmulh v6.4S, v15.4S, v23.s[1] +mul v15.4S, v15.4S,v24.s[1] +add v19.4s, v19.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v23.s[2] +mul v19.4S, v19.4S,v24.s[2] +sub v12.4s, v5.4s, v21.4s +add v5.4s, v5.4s, v21.4s +sqrdmulh v21.4S, v3.4S, v23.s[3] +mul v3.4S, v3.4S,v24.s[3] +sub v18.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +mla v7.4S, v17.4S, v31.s[0] +mla v15.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v16.4s +str q5, [x0, #256] +mla v19.4S, v8.4S, v31.s[0] +mla v3.4S, v21.4S, v31.s[0] +add v0.4s, v0.4s, v16.4s +str q12, [x0, #320] +ldr q12, [x0, #912] +sqrdmulh v16.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v21.4s, v4.4s, v14.4s +str q20, [x0, #384] +ldr q20, [x0, #976] +sqrdmulh v8.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v4.4s, v4.4s, v14.4s +str q18, [x0, #448] +ldr q18, [x0, #784] +sqrdmulh v14.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +sub v5.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +ldr q7, [x0, #848] +sqrdmulh v17.4S, v7.4S, v29.s[0] +mul v7.4S, v7.4S,v30.s[0] +sub v11.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +mla v12.4S, v16.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v1.4s, v19.4s +str q0, [x0, #128] +mla v18.4S, v14.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v19.4s +str q6, [x0, #192] +ldr q6, [x0, #528] +sqrdmulh v19.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v17.4s, v22.4s, v3.4s +str q4, [x0, #0] +ldr q4, [x0, #592] +sqrdmulh v14.4S, v4.4S, v29.s[0] +mul 
v4.4S, v4.4S,v30.s[0] +add v22.4s, v22.4s, v3.4s +str q21, [x0, #64] +ldr q21, [x0, #656] +ldr q3, [x0, #400] +sqrdmulh v0.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v16.4s, v3.4s, v12.4s +add v3.4s, v3.4s, v12.4s +ldr q12, [x0, #720] +ldr q15, [x0, #464] +sqrdmulh v9.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v10.4s, v15.4s, v20.4s +add v15.4s, v15.4s, v20.4s +ldr q20, [x0, #272] +mla v6.4S, v19.4S, v31.s[0] +mla v4.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v18.4s +str q13, [x0, #512] +mla v21.4S, v0.4S, v31.s[0] +mla v12.4S, v9.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +str q5, [x0, #576] +ldr q5, [x0, #336] +sqrdmulh v18.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v9.4s, v5.4s, v7.4s +str q2, [x0, #640] +sqrdmulh v2.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +add v5.4s, v5.4s, v7.4s +str q11, [x0, #704] +ldr q11, [x0, #16] +sqrdmulh v7.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v0.4s, v11.4s, v6.4s +add v11.4s, v11.4s, v6.4s +ldr q6, [x0, #80] +sqrdmulh v13.4S, v5.4S, v29.s[1] +mul v5.4S, v5.4S,v30.s[1] +sub v19.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +ldr q4, [x0, #144] +mla v3.4S, v18.4S, v31.s[0] +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v4.4s, v21.4s +str q1, [x0, #768] +mla v20.4S, v7.4S, v31.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v4.4s, v4.4s, v21.4s +str q8, [x0, #832] +ldr q8, [x0, #208] +sqrdmulh v21.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +sub v13.4s, v8.4s, v12.4s +str q22, [x0, #896] +sqrdmulh v22.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +add v8.4s, v8.4s, v12.4s +str q17, [x0, #960] +sqrdmulh v17.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v12.4s, v4.4s, v3.4s +add v4.4s, v4.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v7.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +mla v16.4S, v21.4S, v31.s[0] +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v20.4s +mla v14.4S, v17.4S, v31.s[0] +mla v9.4S, v3.4S, v31.s[0] +add v11.4s, v11.4s, v20.4s 
+sqrdmulh v20.4S, v12.4S, v27.s[1] +mul v12.4S, v12.4S,v28.s[1] +sub v3.4s, v6.4s, v5.4s +sqrdmulh v17.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v27.s[0] +mul v4.4S, v4.4S,v28.s[0] +sub v21.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[0] +mul v8.4S, v8.4S,v28.s[0] +sub v15.4s, v13.4s, v10.4s +add v13.4s, v13.4s, v10.4s +mla v12.4S, v20.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v14.4s +mla v4.4S, v5.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v16.4s, v19.4s, v9.4s +sqrdmulh v5.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v10.4s, v3.4s, v7.4s +add v3.4s, v3.4s, v7.4s +mla v2.4S, v14.4S, v31.s[0] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v4.4s +mla v21.4S, v9.4S, v31.s[0] +mla v15.4S, v12.4S, v31.s[0] +add v11.4s, v11.4s, v4.4s +sqrdmulh v4.4S, v3.4S, v25.s[2] +mul v3.4S, v3.4S,v26.s[2] +sub v12.4s, v6.4s, v8.4s +sqrdmulh v9.4S, v10.4S, v25.s[3] +mul v10.4S, v10.4S,v26.s[3] +add v6.4s, v6.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +sub v14.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v6.4S, v25.s[0] +mul v6.4S, v6.4S,v26.s[0] +sub v7.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +mla v3.4S, v4.4S, v31.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v21.4s +mla v12.4S, v8.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v23.s[0] +mul v19.4S, v19.4S,v24.s[0] +sub v2.4s, v16.4s, v15.4s +sqrdmulh v8.4S, v7.4S, v23.s[1] +mul v7.4S, v7.4S,v24.s[1] +add v16.4s, v16.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +sub v4.4s, v22.4s, v3.4s 
+add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v2.4S, v23.s[3] +mul v2.4S, v2.4S,v24.s[3] +sub v13.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +mla v19.4S, v21.4S, v31.s[0] +mla v7.4S, v8.4S, v31.s[0] +sub v8.4s, v5.4s, v12.4s +str q22, [x0, #272] +mla v16.4S, v15.4S, v31.s[0] +mla v2.4S, v3.4S, v31.s[0] +add v5.4s, v5.4s, v12.4s +str q4, [x0, #336] +sub v23.4s, v11.4s, v6.4s +str q20, [x0, #400] +add v11.4s, v11.4s, v6.4s +str q13, [x0, #464] +sub v13.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sub v19.4s, v14.4s, v7.4s +add v14.4s, v14.4s, v7.4s +sub v7.4s, v17.4s, v16.4s +str q5, [x0, #144] +add v17.4s, v17.4s, v16.4s +str q8, [x0, #208] +sub v8.4s, v9.4s, v2.4s +str q11, [x0, #16] +add v9.4s, v9.4s, v2.4s +str q23, [x0, #80] +str q0, [x0, #528] +str q13, [x0, #592] +str q14, [x0, #656] +str q19, [x0, #720] +str q17, [x0, #784] +str q7, [x0, #848] +str q9, [x0, #912] +str q8, [x0, #976] +ldr q18, [x0, #32] +ldr q1, [x0, #48] +ldr q10, [x0, #0] +ldr q21, [x0, #16] +ldr q22, [x0, #96] +ldr q15, [x0, #112] +ldr q3, [x0, #64] +ldr q12, [x0, #80] +ldr q4, [x0, #160] +ldr q30, [x0, #176] +ldr q29, [x0, #128] +ldr q28, [x0, #144] +ldr q27, [x0, #224] +ldr q26, [x0, #240] +ldr q25, [x0, #192] +ldr q24, [x0, #208] +ldr q20, [x17, #+128] +ldr q6, [x17, #+144] +ldr q5, [x17, #+160] +ldr q16, [x17, #+176] +ldr q11, [x17, #+192] +ldr q2, [x17, #+208] +ldr q23, [x17, #+224] +ldr q0, [x17, #+240] +sqrdmulh v13.4S, v18.4S, v6.s[0] +mul v18.4S, v18.4S,v20.s[0] +sqrdmulh v14.4S, v1.4S, v6.s[0] +mul v1.4S, v1.4S,v20.s[0] +mla v18.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v22.4S, v16.s[0] +mul v22.4S, v22.4S,v5.s[0] +mla v1.4S, v14.4S, v31.s[0] +sub v14.4s, v10.4s, v18.4s +add v10.4s, v10.4s, v18.4s +sqrdmulh v18.4S, v15.4S, v16.s[0] +mul v15.4S, v15.4S,v5.s[0] +mla v22.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v1.4s +add v21.4s, v21.4s, v1.4s +sqrdmulh v1.4S, v21.4S, v6.s[1] +mul v21.4S, v21.4S,v20.s[1] +mla v15.4S, v18.4S, v31.s[0] +sub v18.4s, v3.4s, v22.4s +add v3.4s, v3.4s, 
v22.4s +sqrdmulh v22.4S, v13.4S, v6.s[2] +mul v13.4S, v13.4S,v20.s[2] +mla v21.4S, v1.4S, v31.s[0] +sub v1.4s, v12.4s, v15.4s +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v12.4S, v16.s[1] +mul v12.4S, v12.4S,v5.s[1] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sqrdmulh v6.4S, v1.4S, v16.s[2] +mul v1.4S, v1.4S,v5.s[2] +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +sqrdmulh v13.4S, v4.4S, v2.s[0] +mul v4.4S, v4.4S,v11.s[0] +mla v1.4S, v6.4S, v31.s[0] +sub v6.4s, v3.4s, v12.4s +add v3.4s, v3.4s, v12.4s +sqrdmulh v16.4S, v30.4S, v2.s[0] +mul v30.4S, v30.4S,v11.s[0] +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v18.4s, v1.4s +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v27.4S, v0.s[0] +mul v27.4S, v27.4S,v23.s[0] +mla v30.4S, v16.4S, v31.s[0] +sub v16.4s, v29.4s, v4.4s +add v29.4s, v29.4s, v4.4s +sqrdmulh v4.4S, v26.4S, v0.s[0] +mul v26.4S, v26.4S,v23.s[0] +mla v27.4S, v1.4S, v31.s[0] +sub v1.4s, v28.4s, v30.4s +add v28.4s, v28.4s, v30.4s +sqrdmulh v30.4S, v28.4S, v2.s[1] +mul v28.4S, v28.4S,v11.s[1] +mla v26.4S, v4.4S, v31.s[0] +sub v4.4s, v25.4s, v27.4s +add v25.4s, v25.4s, v27.4s +sqrdmulh v27.4S, v1.4S, v2.s[2] +mul v1.4S, v1.4S,v11.s[2] +mla v28.4S, v30.4S, v31.s[0] +sub v30.4s, v24.4s, v26.4s +add v24.4s, v24.4s, v26.4s +sqrdmulh v26.4S, v24.4S, v0.s[1] +mul v24.4S, v24.4S,v23.s[1] +mla v1.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v28.4s +add v29.4s, v29.4s, v28.4s +sqrdmulh v2.4S, v30.4S, v0.s[2] +mul v30.4S, v30.4S,v23.s[2] +mla v24.4S, v26.4S, v31.s[0] +sub v26.4s, v16.4s, v1.4s +add v16.4s, v16.4s, v1.4s +mla v30.4S, v2.4S, v31.s[0] +sub v2.4s, v25.4s, v24.4s +add v25.4s, v25.4s, v24.4s +sub v0.4s, v4.4s, v30.4s +add v4.4s, v4.4s, v30.4s +str q10, [x0, #0] +str q22, [x0, #16] +str q14, [x0, #32] +str q15, [x0, #48] +str q3, [x0, #64] +str q6, [x0, #80] +str q18, [x0, #96] +str q13, [x0, #112] +str q29, [x0, #128] +str q27, [x0, #144] +str q16, [x0, #160] +str q26, [x0, #176] +str 
q25, [x0, #192] +str q2, [x0, #208] +str q4, [x0, #224] +str q0, [x0, #240] +ldr q0, [x0, #288] +ldr q4, [x0, #304] +ldr q2, [x0, #256] +ldr q25, [x0, #272] +ldr q26, [x0, #352] +ldr q16, [x0, #368] +ldr q27, [x0, #320] +ldr q29, [x0, #336] +ldr q13, [x0, #416] +ldr q18, [x0, #432] +ldr q6, [x0, #384] +ldr q3, [x0, #400] +ldr q15, [x0, #480] +ldr q14, [x0, #496] +ldr q22, [x0, #448] +ldr q10, [x0, #464] +ldr q30, [x17, #+256] +ldr q23, [x17, #+272] +ldr q24, [x17, #+288] +ldr q1, [x17, #+304] +ldr q11, [x17, #+320] +ldr q28, [x17, #+336] +ldr q5, [x17, #+352] +ldr q12, [x17, #+368] +sqrdmulh v20.4S, v0.4S, v23.s[0] +mul v0.4S, v0.4S,v30.s[0] +sqrdmulh v21.4S, v4.4S, v23.s[0] +mul v4.4S, v4.4S,v30.s[0] +mla v0.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v26.4S, v1.s[0] +mul v26.4S, v26.4S,v24.s[0] +mla v4.4S, v21.4S, v31.s[0] +sub v21.4s, v2.4s, v0.4s +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v1.s[0] +mul v16.4S, v16.4S,v24.s[0] +mla v26.4S, v20.4S, v31.s[0] +sub v20.4s, v25.4s, v4.4s +add v25.4s, v25.4s, v4.4s +sqrdmulh v4.4S, v25.4S, v23.s[1] +mul v25.4S, v25.4S,v30.s[1] +mla v16.4S, v0.4S, v31.s[0] +sub v0.4s, v27.4s, v26.4s +add v27.4s, v27.4s, v26.4s +sqrdmulh v26.4S, v20.4S, v23.s[2] +mul v20.4S, v20.4S,v30.s[2] +mla v25.4S, v4.4S, v31.s[0] +sub v4.4s, v29.4s, v16.4s +add v29.4s, v29.4s, v16.4s +sqrdmulh v16.4S, v29.4S, v1.s[1] +mul v29.4S, v29.4S,v24.s[1] +mla v20.4S, v26.4S, v31.s[0] +sub v26.4s, v2.4s, v25.4s +add v2.4s, v2.4s, v25.4s +sqrdmulh v23.4S, v4.4S, v1.s[2] +mul v4.4S, v4.4S,v24.s[2] +mla v29.4S, v16.4S, v31.s[0] +sub v16.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v13.4S, v28.s[0] +mul v13.4S, v13.4S,v11.s[0] +mla v4.4S, v23.4S, v31.s[0] +sub v23.4s, v27.4s, v29.4s +add v27.4s, v27.4s, v29.4s +sqrdmulh v1.4S, v18.4S, v28.s[0] +mul v18.4S, v18.4S,v11.s[0] +mla v13.4S, v20.4S, v31.s[0] +sub v20.4s, v0.4s, v4.4s +add v0.4s, v0.4s, v4.4s +sqrdmulh v4.4S, v15.4S, v12.s[0] +mul v15.4S, v15.4S,v5.s[0] +mla v18.4S, v1.4S, v31.s[0] 
+sub v1.4s, v6.4s, v13.4s +add v6.4s, v6.4s, v13.4s +sqrdmulh v13.4S, v14.4S, v12.s[0] +mul v14.4S, v14.4S,v5.s[0] +mla v15.4S, v4.4S, v31.s[0] +sub v4.4s, v3.4s, v18.4s +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v3.4S, v28.s[1] +mul v3.4S, v3.4S,v11.s[1] +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v22.4s, v15.4s +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v4.4S, v28.s[2] +mul v4.4S, v4.4S,v11.s[2] +mla v3.4S, v18.4S, v31.s[0] +sub v18.4s, v10.4s, v14.4s +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v10.4S, v12.s[1] +mul v10.4S, v10.4S,v5.s[1] +mla v4.4S, v15.4S, v31.s[0] +sub v15.4s, v6.4s, v3.4s +add v6.4s, v6.4s, v3.4s +sqrdmulh v28.4S, v18.4S, v12.s[2] +mul v18.4S, v18.4S,v5.s[2] +mla v10.4S, v14.4S, v31.s[0] +sub v14.4s, v1.4s, v4.4s +add v1.4s, v1.4s, v4.4s +mla v18.4S, v28.4S, v31.s[0] +sub v28.4s, v22.4s, v10.4s +add v22.4s, v22.4s, v10.4s +sub v12.4s, v13.4s, v18.4s +add v13.4s, v13.4s, v18.4s +str q2, [x0, #256] +str q26, [x0, #272] +str q21, [x0, #288] +str q16, [x0, #304] +str q27, [x0, #320] +str q23, [x0, #336] +str q0, [x0, #352] +str q20, [x0, #368] +str q6, [x0, #384] +str q15, [x0, #400] +str q1, [x0, #416] +str q14, [x0, #432] +str q22, [x0, #448] +str q28, [x0, #464] +str q13, [x0, #480] +str q12, [x0, #496] +ldr q12, [x0, #544] +ldr q13, [x0, #560] +ldr q28, [x0, #512] +ldr q22, [x0, #528] +ldr q14, [x0, #608] +ldr q1, [x0, #624] +ldr q15, [x0, #576] +ldr q6, [x0, #592] +ldr q20, [x0, #672] +ldr q0, [x0, #688] +ldr q23, [x0, #640] +ldr q27, [x0, #656] +ldr q16, [x0, #736] +ldr q21, [x0, #752] +ldr q26, [x0, #704] +ldr q2, [x0, #720] +ldr q18, [x17, #+384] +ldr q5, [x17, #+400] +ldr q10, [x17, #+416] +ldr q4, [x17, #+432] +ldr q11, [x17, #+448] +ldr q3, [x17, #+464] +ldr q24, [x17, #+480] +ldr q29, [x17, #+496] +sqrdmulh v30.4S, v12.4S, v5.s[0] +mul v12.4S, v12.4S,v18.s[0] +sqrdmulh v25.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v18.s[0] +mla v12.4S, v30.4S, v31.s[0] +sqrdmulh v30.4S, v14.4S, v4.s[0] +mul v14.4S, v14.4S,v10.s[0] +mla v13.4S, 
v25.4S, v31.s[0] +sub v25.4s, v28.4s, v12.4s +add v28.4s, v28.4s, v12.4s +sqrdmulh v12.4S, v1.4S, v4.s[0] +mul v1.4S, v1.4S,v10.s[0] +mla v14.4S, v30.4S, v31.s[0] +sub v30.4s, v22.4s, v13.4s +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v22.4S, v5.s[1] +mul v22.4S, v22.4S,v18.s[1] +mla v1.4S, v12.4S, v31.s[0] +sub v12.4s, v15.4s, v14.4s +add v15.4s, v15.4s, v14.4s +sqrdmulh v14.4S, v30.4S, v5.s[2] +mul v30.4S, v30.4S,v18.s[2] +mla v22.4S, v13.4S, v31.s[0] +sub v13.4s, v6.4s, v1.4s +add v6.4s, v6.4s, v1.4s +sqrdmulh v1.4S, v6.4S, v4.s[1] +mul v6.4S, v6.4S,v10.s[1] +mla v30.4S, v14.4S, v31.s[0] +sub v14.4s, v28.4s, v22.4s +add v28.4s, v28.4s, v22.4s +sqrdmulh v5.4S, v13.4S, v4.s[2] +mul v13.4S, v13.4S,v10.s[2] +mla v6.4S, v1.4S, v31.s[0] +sub v1.4s, v25.4s, v30.4s +add v25.4s, v25.4s, v30.4s +sqrdmulh v30.4S, v20.4S, v3.s[0] +mul v20.4S, v20.4S,v11.s[0] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v15.4s, v6.4s +add v15.4s, v15.4s, v6.4s +sqrdmulh v4.4S, v0.4S, v3.s[0] +mul v0.4S, v0.4S,v11.s[0] +mla v20.4S, v30.4S, v31.s[0] +sub v30.4s, v12.4s, v13.4s +add v12.4s, v12.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v24.s[0] +mla v0.4S, v4.4S, v31.s[0] +sub v4.4s, v23.4s, v20.4s +add v23.4s, v23.4s, v20.4s +sqrdmulh v20.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v24.s[0] +mla v16.4S, v13.4S, v31.s[0] +sub v13.4s, v27.4s, v0.4s +add v27.4s, v27.4s, v0.4s +sqrdmulh v0.4S, v27.4S, v3.s[1] +mul v27.4S, v27.4S,v11.s[1] +mla v21.4S, v20.4S, v31.s[0] +sub v20.4s, v26.4s, v16.4s +add v26.4s, v26.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v3.s[2] +mul v13.4S, v13.4S,v11.s[2] +mla v27.4S, v0.4S, v31.s[0] +sub v0.4s, v2.4s, v21.4s +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v2.4S, v29.s[1] +mul v2.4S, v2.4S,v24.s[1] +mla v13.4S, v16.4S, v31.s[0] +sub v16.4s, v23.4s, v27.4s +add v23.4s, v23.4s, v27.4s +sqrdmulh v3.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v24.s[2] +mla v2.4S, v21.4S, v31.s[0] +sub v21.4s, v4.4s, v13.4s +add v4.4s, v4.4s, v13.4s +mla v0.4S, v3.4S, v31.s[0] +sub 
v3.4s, v26.4s, v2.4s +add v26.4s, v26.4s, v2.4s +sub v29.4s, v20.4s, v0.4s +add v20.4s, v20.4s, v0.4s +str q28, [x0, #512] +str q14, [x0, #528] +str q25, [x0, #544] +str q1, [x0, #560] +str q15, [x0, #576] +str q5, [x0, #592] +str q12, [x0, #608] +str q30, [x0, #624] +str q23, [x0, #640] +str q16, [x0, #656] +str q4, [x0, #672] +str q21, [x0, #688] +str q26, [x0, #704] +str q3, [x0, #720] +str q20, [x0, #736] +str q29, [x0, #752] +ldr q29, [x0, #800] +ldr q20, [x0, #816] +ldr q3, [x0, #768] +ldr q26, [x0, #784] +ldr q21, [x0, #864] +ldr q4, [x0, #880] +ldr q16, [x0, #832] +ldr q23, [x0, #848] +ldr q30, [x0, #928] +ldr q12, [x0, #944] +ldr q5, [x0, #896] +ldr q15, [x0, #912] +ldr q1, [x0, #992] +ldr q25, [x0, #1008] +ldr q14, [x0, #960] +ldr q28, [x0, #976] +ldr q0, [x17, #+512] +ldr q24, [x17, #+528] +ldr q2, [x17, #+544] +ldr q13, [x17, #+560] +ldr q11, [x17, #+576] +ldr q27, [x17, #+592] +ldr q10, [x17, #+608] +ldr q6, [x17, #+624] +sqrdmulh v18.4S, v29.4S, v24.s[0] +mul v29.4S, v29.4S,v0.s[0] +sqrdmulh v22.4S, v20.4S, v24.s[0] +mul v20.4S, v20.4S,v0.s[0] +mla v29.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v21.4S, v13.s[0] +mul v21.4S, v21.4S,v2.s[0] +mla v20.4S, v22.4S, v31.s[0] +sub v22.4s, v3.4s, v29.4s +add v3.4s, v3.4s, v29.4s +sqrdmulh v29.4S, v4.4S, v13.s[0] +mul v4.4S, v4.4S,v2.s[0] +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v26.4s, v20.4s +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v26.4S, v24.s[1] +mul v26.4S, v26.4S,v0.s[1] +mla v4.4S, v29.4S, v31.s[0] +sub v29.4s, v16.4s, v21.4s +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v18.4S, v24.s[2] +mul v18.4S, v18.4S,v0.s[2] +mla v26.4S, v20.4S, v31.s[0] +sub v20.4s, v23.4s, v4.4s +add v23.4s, v23.4s, v4.4s +sqrdmulh v4.4S, v23.4S, v13.s[1] +mul v23.4S, v23.4S,v2.s[1] +mla v18.4S, v21.4S, v31.s[0] +sub v21.4s, v3.4s, v26.4s +add v3.4s, v3.4s, v26.4s +sqrdmulh v24.4S, v20.4S, v13.s[2] +mul v20.4S, v20.4S,v2.s[2] +mla v23.4S, v4.4S, v31.s[0] +sub v4.4s, v22.4s, v18.4s +add v22.4s, v22.4s, v18.4s +sqrdmulh 
v18.4S, v30.4S, v27.s[0] +mul v30.4S, v30.4S,v11.s[0] +mla v20.4S, v24.4S, v31.s[0] +sub v24.4s, v16.4s, v23.4s +add v16.4s, v16.4s, v23.4s +sqrdmulh v13.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v11.s[0] +mla v30.4S, v18.4S, v31.s[0] +sub v18.4s, v29.4s, v20.4s +add v29.4s, v29.4s, v20.4s +sqrdmulh v20.4S, v1.4S, v6.s[0] +mul v1.4S, v1.4S,v10.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v5.4s, v30.4s +add v5.4s, v5.4s, v30.4s +sqrdmulh v30.4S, v25.4S, v6.s[0] +mul v25.4S, v25.4S,v10.s[0] +mla v1.4S, v20.4S, v31.s[0] +sub v20.4s, v15.4s, v12.4s +add v15.4s, v15.4s, v12.4s +sqrdmulh v12.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v11.s[1] +mla v25.4S, v30.4S, v31.s[0] +sub v30.4s, v14.4s, v1.4s +add v14.4s, v14.4s, v1.4s +sqrdmulh v1.4S, v20.4S, v27.s[2] +mul v20.4S, v20.4S,v11.s[2] +mla v15.4S, v12.4S, v31.s[0] +sub v12.4s, v28.4s, v25.4s +add v28.4s, v28.4s, v25.4s +sqrdmulh v25.4S, v28.4S, v6.s[1] +mul v28.4S, v28.4S,v10.s[1] +mla v20.4S, v1.4S, v31.s[0] +sub v1.4s, v5.4s, v15.4s +add v5.4s, v5.4s, v15.4s +sqrdmulh v27.4S, v12.4S, v6.s[2] +mul v12.4S, v12.4S,v10.s[2] +mla v28.4S, v25.4S, v31.s[0] +sub v25.4s, v13.4s, v20.4s +add v13.4s, v13.4s, v20.4s +mla v12.4S, v27.4S, v31.s[0] +sub v27.4s, v14.4s, v28.4s +add v14.4s, v14.4s, v28.4s +sub v6.4s, v30.4s, v12.4s +add v30.4s, v30.4s, v12.4s +str q3, [x0, #768] +str q21, [x0, #784] +str q22, [x0, #800] +str q4, [x0, #816] +str q16, [x0, #832] +str q24, [x0, #848] +str q29, [x0, #864] +str q18, [x0, #880] +str q5, [x0, #896] +str q1, [x0, #912] +str q13, [x0, #928] +str q25, [x0, #944] +str q14, [x0, #960] +str q27, [x0, #976] +str q30, [x0, #992] +str q6, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, 
#(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_10.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_10.s new file mode 100644 index 0000000..807b044 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_10.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global 
ntt_u32_incomplete_neon_asm_var_4_2_7_z4_10 +.global _ntt_u32_incomplete_neon_asm_var_4_2_7_z4_10 +ntt_u32_incomplete_neon_asm_var_4_2_7_z4_10: +_ntt_u32_incomplete_neon_asm_var_4_2_7_z4_10: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #928] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #992] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q18, [x0, #800] +sqrdmulh v17.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +ldr q16, [x0, #864] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v22.4S, v21.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +mla v18.4S, v17.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +ldr q3, [x0, #544] +sqrdmulh v17.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q19, [x0, #608] +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q2, [x0, #672] +ldr q1, [x0, #416] +sqrdmulh v0.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v15.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #736] +ldr q14, [x0, #480] +sqrdmulh v13.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v12.4s, v14.4s, v20.4s +add v14.4s, v14.4s, v20.4s +ldr q20, [x0, #288] +mla v3.4S, v17.4S, v31.s[0] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v18.4s +mla v2.4S, v0.4S, v31.s[0] +mla v22.4S, v13.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +ldr q18, [x0, #352] +sqrdmulh v13.4S, v1.4S, 
v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v0.4s, v18.4s, v16.4s +sqrdmulh v17.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +add v18.4s, v18.4s, v16.4s +ldr q16, [x0, #32] +sqrdmulh v11.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v10.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #96] +sqrdmulh v9.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v8.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #160] +mla v1.4S, v13.4S, v31.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v2.4s +mla v20.4S, v11.4S, v31.s[0] +mla v18.4S, v9.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +ldr q2, [x0, #224] +sqrdmulh v9.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v11.4s, v2.4s, v22.4s +sqrdmulh v13.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v7.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +sub v6.4s, v2.4s, v14.4s +add v2.4s, v2.4s, v14.4s +mla v15.4S, v9.4S, v31.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v20.4s +mla v21.4S, v22.4S, v31.s[0] +mla v0.4S, v1.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +sub v1.4s, v3.4s, v18.4s +sqrdmulh v22.4S, v6.4S, v27.s[1] +mul v6.4S, v6.4S,v28.s[1] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v27.s[0] +mul v19.4S, v19.4S,v28.s[0] +sub v9.4s, v17.4s, v15.4s +add v17.4s, v17.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v14.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +mla v7.4S, v20.4S, v31.s[0] +mla v6.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v21.4s +mla v19.4S, v18.4S, v31.s[0] +mla v2.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v15.4s, v8.4s, v0.4s +sqrdmulh v18.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +add v8.4s, v8.4s, v0.4s +sqrdmulh 
v0.4S, v9.4S, v27.s[3] +mul v9.4S, v9.4S,v28.s[3] +sub v20.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[3] +mul v14.4S, v14.4S,v28.s[3] +sub v12.4s, v1.4s, v6.4s +add v1.4s, v1.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v16.4s, v19.4s +mla v9.4S, v0.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v25.s[2] +mul v1.4S, v1.4S,v26.s[2] +sub v7.4s, v3.4s, v2.4s +sqrdmulh v0.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v7.4S, v25.s[1] +mul v7.4S, v7.4S,v26.s[1] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v25.s[0] +mul v3.4S, v3.4S,v26.s[0] +sub v6.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v1.4S, v19.4S, v31.s[0] +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v9.4s +mla v7.4S, v2.4S, v31.s[0] +mla v3.4S, v17.4S, v31.s[0] +add v22.4s, v22.4s, v9.4s +sqrdmulh v9.4S, v8.4S, v23.s[0] +mul v8.4S, v8.4S,v24.s[0] +sub v17.4s, v15.4s, v14.4s +sqrdmulh v2.4S, v6.4S, v23.s[1] +mul v6.4S, v6.4S,v24.s[1] +add v15.4s, v15.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v23.s[2] +mul v15.4S, v15.4S,v24.s[2] +sub v19.4s, v13.4s, v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +sub v11.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +mla v8.4S, v9.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v18.4s, v7.4s +str q13, [x0, #288] +mla v15.4S, v14.4S, v31.s[0] +mla v17.4S, v1.4S, v31.s[0] +add v18.4s, v18.4s, v7.4s +str q19, [x0, #352] +ldr q19, [x0, #944] +sqrdmulh v7.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v1.4s, v16.4s, v3.4s +str q20, [x0, #416] +ldr q20, [x0, #1008] +sqrdmulh v14.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v3.4s +str q11, [x0, #480] +ldr q11, [x0, #816] +sqrdmulh v3.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +sub v13.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +ldr q8, 
[x0, #880] +sqrdmulh v9.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v12.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +mla v19.4S, v7.4S, v31.s[0] +mla v20.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v15.4s +str q18, [x0, #160] +mla v11.4S, v3.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +str q2, [x0, #224] +ldr q2, [x0, #560] +sqrdmulh v15.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v9.4s, v0.4s, v17.4s +str q16, [x0, #32] +ldr q16, [x0, #624] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +add v0.4s, v0.4s, v17.4s +str q1, [x0, #96] +ldr q1, [x0, #688] +ldr q17, [x0, #432] +sqrdmulh v18.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +sub v7.4s, v17.4s, v19.4s +add v17.4s, v17.4s, v19.4s +ldr q19, [x0, #752] +ldr q6, [x0, #496] +sqrdmulh v5.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v4.4s, v6.4s, v20.4s +add v6.4s, v6.4s, v20.4s +ldr q20, [x0, #304] +mla v2.4S, v15.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v11.4s +str q10, [x0, #544] +mla v1.4S, v18.4S, v31.s[0] +mla v19.4S, v5.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q13, [x0, #608] +ldr q13, [x0, #368] +sqrdmulh v11.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v5.4s, v13.4s, v8.4s +str q21, [x0, #672] +sqrdmulh v21.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +add v13.4s, v13.4s, v8.4s +str q12, [x0, #736] +ldr q12, [x0, #48] +sqrdmulh v8.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v18.4s, v12.4s, v2.4s +add v12.4s, v12.4s, v2.4s +ldr q2, [x0, #112] +sqrdmulh v10.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v15.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +ldr q16, [x0, #176] +mla v17.4S, v11.4S, v31.s[0] +mla v6.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v1.4s +str q22, [x0, #800] +mla v20.4S, v8.4S, v31.s[0] +mla v13.4S, v10.4S, v31.s[0] +add v16.4s, v16.4s, v1.4s +str q14, [x0, #864] +ldr q14, [x0, #240] +sqrdmulh v1.4S, v7.4S, v29.s[2] +mul v7.4S, v7.4S,v30.s[2] +sub v10.4s, v14.4s, v19.4s 
+str q0, [x0, #928] +sqrdmulh v0.4S, v4.4S, v29.s[2] +mul v4.4S, v4.4S,v30.s[2] +add v14.4s, v14.4s, v19.4s +str q9, [x0, #992] +sqrdmulh v9.4S, v3.4S, v29.s[2] +mul v3.4S, v3.4S,v30.s[2] +sub v19.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v29.s[2] +mul v5.4S, v5.4S,v30.s[2] +sub v8.4s, v14.4s, v6.4s +add v14.4s, v14.4s, v6.4s +mla v7.4S, v1.4S, v31.s[0] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v20.4s +mla v3.4S, v9.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +sub v17.4s, v2.4s, v13.4s +sqrdmulh v9.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +add v2.4s, v2.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v27.s[0] +mul v16.4S, v16.4S,v28.s[0] +sub v1.4s, v21.4s, v7.4s +add v21.4s, v21.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v6.4s, v10.4s, v4.4s +add v10.4s, v10.4s, v4.4s +mla v19.4S, v20.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v3.4s +mla v16.4S, v13.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +sub v7.4s, v15.4s, v5.4s +sqrdmulh v13.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +add v15.4s, v15.4s, v5.4s +sqrdmulh v5.4S, v1.4S, v27.s[3] +mul v1.4S, v1.4S,v28.s[3] +sub v20.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[3] +mul v6.4S, v6.4S,v28.s[3] +sub v4.4s, v17.4s, v8.4s +add v17.4s, v17.4s, v8.4s +mla v21.4S, v3.4S, v31.s[0] +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v16.4s +mla v1.4S, v5.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v25.s[2] +mul v17.4S, v17.4S,v26.s[2] +sub v19.4s, v2.4s, v14.4s +sqrdmulh v5.4S, v4.4S, v25.s[3] +mul v4.4S, v4.4S,v26.s[3] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v3.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, 
v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +mla v17.4S, v16.4S, v31.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v9.4s, v1.4s +mla v19.4S, v14.4S, v31.s[0] +mla v2.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v23.s[0] +mul v15.4S, v15.4S,v24.s[0] +sub v21.4s, v7.4s, v6.4s +sqrdmulh v14.4S, v8.4S, v23.s[1] +mul v8.4S, v8.4S,v24.s[1] +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v7.4S, v23.s[2] +mul v7.4S, v7.4S,v24.s[2] +sub v16.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v23.s[3] +mul v21.4S, v21.4S,v24.s[3] +sub v10.4s, v20.4s, v4.4s +add v20.4s, v20.4s, v4.4s +mla v15.4S, v1.4S, v31.s[0] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v19.4s +str q0, [x0, #304] +mla v7.4S, v6.4S, v31.s[0] +mla v21.4S, v17.4S, v31.s[0] +add v13.4s, v13.4s, v19.4s +str q16, [x0, #368] +ldr q16, [x0, #896] +sqrdmulh v19.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v17.4s, v12.4s, v2.4s +str q20, [x0, #432] +ldr q20, [x0, #960] +sqrdmulh v6.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v12.4s, v12.4s, v2.4s +str q10, [x0, #496] +ldr q10, [x0, #768] +sqrdmulh v2.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +sub v0.4s, v18.4s, v15.4s +add v18.4s, v18.4s, v15.4s +ldr q15, [x0, #832] +sqrdmulh v1.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +sub v4.4s, v3.4s, v8.4s +add v3.4s, v3.4s, v8.4s +mla v16.4S, v19.4S, v31.s[0] +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v7.4s +str q13, [x0, #176] +mla v10.4S, v2.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v7.4s +str q14, [x0, #240] +ldr q14, [x0, #512] +sqrdmulh v7.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v1.4s, v5.4s, v21.4s +str q12, [x0, #48] +ldr q12, [x0, #576] +sqrdmulh v2.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +add v5.4s, v5.4s, v21.4s +str q17, [x0, #112] +ldr q17, [x0, #640] +ldr q21, [x0, #384] +sqrdmulh v13.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] 
+sub v19.4s, v21.4s, v16.4s +add v21.4s, v21.4s, v16.4s +ldr q16, [x0, #704] +ldr q8, [x0, #448] +sqrdmulh v22.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v11.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +ldr q20, [x0, #256] +mla v14.4S, v7.4S, v31.s[0] +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v10.4s +str q18, [x0, #560] +mla v17.4S, v13.4S, v31.s[0] +mla v16.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +str q0, [x0, #624] +ldr q0, [x0, #320] +sqrdmulh v10.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v22.4s, v0.4s, v15.4s +str q3, [x0, #688] +sqrdmulh v3.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +add v0.4s, v0.4s, v15.4s +str q4, [x0, #752] +ldr q4, [x0, #0] +sqrdmulh v15.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v13.4s, v4.4s, v14.4s +add v4.4s, v4.4s, v14.4s +ldr q14, [x0, #64] +sqrdmulh v18.4S, v0.4S, v29.s[1] +mul v0.4S, v0.4S,v30.s[1] +sub v7.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +ldr q12, [x0, #128] +mla v21.4S, v10.4S, v31.s[0] +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v12.4s, v17.4s +str q9, [x0, #816] +mla v20.4S, v15.4S, v31.s[0] +mla v0.4S, v18.4S, v31.s[0] +add v12.4s, v12.4s, v17.4s +str q6, [x0, #880] +ldr q6, [x0, #192] +sqrdmulh v17.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v18.4s, v6.4s, v16.4s +str q5, [x0, #944] +sqrdmulh v5.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +add v6.4s, v6.4s, v16.4s +str q1, [x0, #1008] +sqrdmulh v1.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v16.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v15.4s, v6.4s, v8.4s +add v6.4s, v6.4s, v8.4s +mla v19.4S, v17.4S, v31.s[0] +mla v11.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v20.4s +mla v2.4S, v1.4S, v31.s[0] +mla v22.4S, v21.4S, v31.s[0] +add v4.4s, v4.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +sub v21.4s, v14.4s, v0.4s +sqrdmulh v1.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +add v14.4s, 
v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v17.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[0] +mul v6.4S, v6.4S,v28.s[0] +sub v8.4s, v18.4s, v11.4s +add v18.4s, v18.4s, v11.4s +mla v16.4S, v20.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v2.4s +mla v12.4S, v0.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v27.s[2] +mul v3.4S, v3.4S,v28.s[2] +sub v19.4s, v7.4s, v22.4s +sqrdmulh v0.4S, v18.4S, v27.s[2] +mul v18.4S, v18.4S,v28.s[2] +add v7.4s, v7.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +sub v20.4s, v5.4s, v16.4s +add v5.4s, v5.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v11.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +mla v3.4S, v2.4S, v31.s[0] +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v4.4s, v12.4s +mla v17.4S, v22.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v4.4s, v4.4s, v12.4s +sqrdmulh v12.4S, v21.4S, v25.s[2] +mul v21.4S, v21.4S,v26.s[2] +sub v16.4s, v14.4s, v6.4s +sqrdmulh v22.4S, v11.4S, v25.s[3] +mul v11.4S, v11.4S,v26.s[3] +add v14.4s, v14.4s, v6.4s +sqrdmulh v6.4S, v16.4S, v25.s[1] +mul v16.4S, v16.4S,v26.s[1] +sub v2.4s, v13.4s, v3.4s +add v13.4s, v13.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v25.s[0] +mul v14.4S, v14.4S,v26.s[0] +sub v15.4s, v7.4s, v18.4s +add v7.4s, v7.4s, v18.4s +mla v21.4S, v12.4S, v31.s[0] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v17.4s +mla v16.4S, v6.4S, v31.s[0] +mla v14.4S, v3.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v7.4S, v23.s[0] +mul v7.4S, v7.4S,v24.s[0] +sub v3.4s, v19.4s, v8.4s +sqrdmulh v6.4S, v15.4S, v23.s[1] +mul v15.4S, v15.4S,v24.s[1] +add v19.4s, v19.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v23.s[2] +mul v19.4S, v19.4S,v24.s[2] +sub v12.4s, v5.4s, v21.4s +add v5.4s, v5.4s, v21.4s +sqrdmulh v21.4S, v3.4S, v23.s[3] +mul v3.4S, v3.4S,v24.s[3] +sub v18.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +mla 
v7.4S, v17.4S, v31.s[0] +mla v15.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v16.4s +str q5, [x0, #256] +mla v19.4S, v8.4S, v31.s[0] +mla v3.4S, v21.4S, v31.s[0] +add v0.4s, v0.4s, v16.4s +str q12, [x0, #320] +ldr q12, [x0, #912] +sqrdmulh v16.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v21.4s, v4.4s, v14.4s +str q20, [x0, #384] +ldr q20, [x0, #976] +sqrdmulh v8.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v4.4s, v4.4s, v14.4s +str q18, [x0, #448] +ldr q18, [x0, #784] +sqrdmulh v14.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +sub v5.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +ldr q7, [x0, #848] +sqrdmulh v17.4S, v7.4S, v29.s[0] +mul v7.4S, v7.4S,v30.s[0] +sub v11.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +mla v12.4S, v16.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v1.4s, v19.4s +str q0, [x0, #128] +mla v18.4S, v14.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v19.4s +str q6, [x0, #192] +ldr q6, [x0, #528] +sqrdmulh v19.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v17.4s, v22.4s, v3.4s +str q4, [x0, #0] +ldr q4, [x0, #592] +sqrdmulh v14.4S, v4.4S, v29.s[0] +mul v4.4S, v4.4S,v30.s[0] +add v22.4s, v22.4s, v3.4s +str q21, [x0, #64] +ldr q21, [x0, #656] +ldr q3, [x0, #400] +sqrdmulh v0.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v16.4s, v3.4s, v12.4s +add v3.4s, v3.4s, v12.4s +ldr q12, [x0, #720] +ldr q15, [x0, #464] +sqrdmulh v9.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v10.4s, v15.4s, v20.4s +add v15.4s, v15.4s, v20.4s +ldr q20, [x0, #272] +mla v6.4S, v19.4S, v31.s[0] +mla v4.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v18.4s +str q13, [x0, #512] +mla v21.4S, v0.4S, v31.s[0] +mla v12.4S, v9.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +str q5, [x0, #576] +ldr q5, [x0, #336] +sqrdmulh v18.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v9.4s, v5.4s, v7.4s +str q2, [x0, #640] +sqrdmulh v2.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +add v5.4s, v5.4s, v7.4s +str q11, [x0, #704] +ldr 
q11, [x0, #16] +sqrdmulh v7.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v0.4s, v11.4s, v6.4s +add v11.4s, v11.4s, v6.4s +ldr q6, [x0, #80] +sqrdmulh v13.4S, v5.4S, v29.s[1] +mul v5.4S, v5.4S,v30.s[1] +sub v19.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +ldr q4, [x0, #144] +mla v3.4S, v18.4S, v31.s[0] +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v4.4s, v21.4s +str q1, [x0, #768] +mla v20.4S, v7.4S, v31.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v4.4s, v4.4s, v21.4s +str q8, [x0, #832] +ldr q8, [x0, #208] +sqrdmulh v21.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +sub v13.4s, v8.4s, v12.4s +str q22, [x0, #896] +sqrdmulh v22.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +add v8.4s, v8.4s, v12.4s +str q17, [x0, #960] +sqrdmulh v17.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v12.4s, v4.4s, v3.4s +add v4.4s, v4.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v7.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +mla v16.4S, v21.4S, v31.s[0] +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v20.4s +mla v14.4S, v17.4S, v31.s[0] +mla v9.4S, v3.4S, v31.s[0] +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v12.4S, v27.s[1] +mul v12.4S, v12.4S,v28.s[1] +sub v3.4s, v6.4s, v5.4s +sqrdmulh v17.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v27.s[0] +mul v4.4S, v4.4S,v28.s[0] +sub v21.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[0] +mul v8.4S, v8.4S,v28.s[0] +sub v15.4s, v13.4s, v10.4s +add v13.4s, v13.4s, v10.4s +mla v12.4S, v20.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v14.4s +mla v4.4S, v5.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v16.4s, v19.4s, v9.4s +sqrdmulh v5.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v22.4s, v12.4s +add v22.4s, v22.4s, 
v12.4s +sqrdmulh v12.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v10.4s, v3.4s, v7.4s +add v3.4s, v3.4s, v7.4s +mla v2.4S, v14.4S, v31.s[0] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v4.4s +mla v21.4S, v9.4S, v31.s[0] +mla v15.4S, v12.4S, v31.s[0] +add v11.4s, v11.4s, v4.4s +sqrdmulh v4.4S, v3.4S, v25.s[2] +mul v3.4S, v3.4S,v26.s[2] +sub v12.4s, v6.4s, v8.4s +sqrdmulh v9.4S, v10.4S, v25.s[3] +mul v10.4S, v10.4S,v26.s[3] +add v6.4s, v6.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +sub v14.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v6.4S, v25.s[0] +mul v6.4S, v6.4S,v26.s[0] +sub v7.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +mla v3.4S, v4.4S, v31.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v21.4s +mla v12.4S, v8.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v23.s[0] +mul v19.4S, v19.4S,v24.s[0] +sub v2.4s, v16.4s, v15.4s +sqrdmulh v8.4S, v7.4S, v23.s[1] +mul v7.4S, v7.4S,v24.s[1] +add v16.4s, v16.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +sub v4.4s, v22.4s, v3.4s +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v2.4S, v23.s[3] +mul v2.4S, v2.4S,v24.s[3] +sub v13.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +mla v19.4S, v21.4S, v31.s[0] +mla v7.4S, v8.4S, v31.s[0] +sub v8.4s, v5.4s, v12.4s +str q22, [x0, #272] +mla v16.4S, v15.4S, v31.s[0] +mla v2.4S, v3.4S, v31.s[0] +add v5.4s, v5.4s, v12.4s +str q4, [x0, #336] +sub v23.4s, v11.4s, v6.4s +str q20, [x0, #400] +add v11.4s, v11.4s, v6.4s +str q13, [x0, #464] +sub v13.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sub v19.4s, v14.4s, v7.4s +add v14.4s, v14.4s, v7.4s +sub v7.4s, v17.4s, v16.4s +str q5, [x0, #144] +add v17.4s, v17.4s, v16.4s +str q8, [x0, #208] +sub v8.4s, v9.4s, v2.4s +str q11, [x0, #16] +add v9.4s, v9.4s, v2.4s +str q23, [x0, #80] +str q0, [x0, #528] +str q13, [x0, #592] +str q14, [x0, #656] +str q19, [x0, #720] +str q17, [x0, #784] +str q7, [x0, #848] 
+str q9, [x0, #912] +str q8, [x0, #976] +ldr q18, [x0, #224] +ldr q1, [x0, #160] +ldr q10, [x0, #32] +ldr q21, [x17, #+128] +ldr q22, [x17, #+144] +sqrdmulh v15.4S, v10.4S, v22.s[0] +mul v10.4S, v10.4S,v21.s[0] +ldr q3, [x0, #48] +ldr q12, [x17, #+160] +sqrdmulh v4.4S, v3.4S, v22.s[0] +mul v3.4S, v3.4S,v21.s[0] +ldr q30, [x17, #+176] +ldr q29, [x0, #96] +sqrdmulh v28.4S, v29.4S, v30.s[0] +mul v29.4S, v29.4S,v12.s[0] +ldr q27, [x0, #112] +sqrdmulh v26.4S, v27.4S, v30.s[0] +mul v27.4S, v27.4S,v12.s[0] +ldr q25, [x17, #+192] +ldr q24, [x17, #+208] +mla v10.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v1.4S, v24.s[0] +ldr q20, [x0, #176] +mla v3.4S, v4.4S, v31.s[0] +sqrdmulh v4.4S, v20.4S, v24.s[0] +ldr q6, [x17, #+224] +ldr q5, [x17, #+240] +mla v29.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v18.4S, v5.s[0] +ldr q16, [x0, #240] +mla v27.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v16.4S, v5.s[0] +ldr q11, [x0, #0] +ldr q2, [x0, #128] +mul v1.4S, v1.4S,v25.s[0] +mul v20.4S, v20.4S,v25.s[0] +ldr q23, [x0, #16] +ldr q0, [x0, #144] +mla v1.4S, v15.4S, v31.s[0] +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v11.4s, v10.4s +ldr q15, [x0, #64] +add v11.4s, v11.4s, v10.4s +ldr q10, [x0, #192] +mul v18.4S, v18.4S,v6.s[0] +mul v16.4S, v16.4S,v6.s[0] +sub v13.4s, v23.4s, v3.4s +ldr q14, [x0, #80] +add v23.4s, v23.4s, v3.4s +ldr q3, [x0, #208] +mla v18.4S, v28.4S, v31.s[0] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v29.4s +add v15.4s, v15.4s, v29.4s +sqrdmulh v29.4S, v23.4S, v22.s[1] +mul v23.4S, v23.4S,v21.s[1] +sub v28.4s, v14.4s, v27.4s +add v14.4s, v14.4s, v27.4s +sqrdmulh v27.4S, v13.4S, v22.s[2] +mul v13.4S, v13.4S,v21.s[2] +sub v19.4s, v2.4s, v1.4s +add v2.4s, v2.4s, v1.4s +sqrdmulh v22.4S, v14.4S, v30.s[1] +mul v14.4S, v14.4S,v12.s[1] +sub v1.4s, v0.4s, v20.4s +add v0.4s, v0.4s, v20.4s +sqrdmulh v20.4S, v28.4S, v30.s[2] +mul v28.4S, v28.4S,v12.s[2] +sub v21.4s, v10.4s, v18.4s +add v10.4s, v10.4s, v18.4s +mla v23.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v0.4S, v24.s[1] +sub v30.4s, 
v3.4s, v16.4s +ldr q18, [x0, #480] +add v3.4s, v3.4s, v16.4s +mla v13.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v1.4S, v24.s[2] +sub v16.4s, v11.4s, v23.4s +ldr q12, [x0, #416] +str q16, [x0, #16] +mla v14.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v3.4S, v5.s[1] +add v11.4s, v11.4s, v23.4s +ldr q23, [x0, #288] +str q11, [x0, #0] +mla v28.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v30.4S, v5.s[2] +sub v11.4s, v4.4s, v13.4s +ldr q16, [x17, #+256] +str q11, [x0, #48] +mul v0.4S, v0.4S,v25.s[1] +mul v1.4S, v1.4S,v25.s[2] +add v4.4s, v4.4s, v13.4s +str q4, [x0, #32] +ldr q4, [x17, #+272] +mla v0.4S, v29.4S, v31.s[0] +mla v1.4S, v27.4S, v31.s[0] +sub v27.4s, v15.4s, v14.4s +str q27, [x0, #80] +mul v3.4S, v3.4S,v6.s[1] +mul v30.4S, v30.4S,v6.s[2] +add v15.4s, v15.4s, v14.4s +str q15, [x0, #64] +mla v3.4S, v22.4S, v31.s[0] +mla v30.4S, v20.4S, v31.s[0] +sub v20.4s, v26.4s, v28.4s +str q20, [x0, #112] +sqrdmulh v5.4S, v23.4S, v4.s[0] +mul v23.4S, v23.4S,v16.s[0] +add v26.4s, v26.4s, v28.4s +ldr q28, [x0, #304] +str q26, [x0, #96] +ldr q26, [x17, #+288] +sqrdmulh v20.4S, v28.4S, v4.s[0] +mul v28.4S, v28.4S,v16.s[0] +sub v6.4s, v2.4s, v0.4s +ldr q22, [x17, #+304] +str q6, [x0, #144] +ldr q6, [x0, #352] +sqrdmulh v15.4S, v6.4S, v22.s[0] +mul v6.4S, v6.4S,v26.s[0] +add v2.4s, v2.4s, v0.4s +str q2, [x0, #128] +ldr q2, [x0, #368] +sqrdmulh v0.4S, v2.4S, v22.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v14.4s, v19.4s, v1.4s +ldr q24, [x17, #+320] +str q14, [x0, #176] +ldr q14, [x17, #+336] +mla v23.4S, v5.4S, v31.s[0] +sqrdmulh v5.4S, v12.4S, v14.s[0] +add v19.4s, v19.4s, v1.4s +ldr q1, [x0, #432] +str q19, [x0, #160] +mla v28.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v1.4S, v14.s[0] +sub v19.4s, v10.4s, v3.4s +ldr q27, [x17, #+352] +str q19, [x0, #208] +ldr q19, [x17, #+368] +mla v6.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v18.4S, v19.s[0] +add v10.4s, v10.4s, v3.4s +str q10, [x0, #192] +ldr q10, [x0, #496] +mla v2.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v10.4S, v19.s[0] +sub v3.4s, v21.4s, v30.4s +ldr 
q25, [x0, #256] +str q3, [x0, #240] +ldr q3, [x0, #384] +mul v12.4S, v12.4S,v24.s[0] +mul v1.4S, v1.4S,v24.s[0] +add v21.4s, v21.4s, v30.4s +ldr q30, [x0, #272] +str q21, [x0, #224] +ldr q21, [x0, #400] +mla v12.4S, v5.4S, v31.s[0] +mla v1.4S, v20.4S, v31.s[0] +sub v20.4s, v25.4s, v23.4s +ldr q5, [x0, #320] +add v25.4s, v25.4s, v23.4s +ldr q23, [x0, #448] +mul v18.4S, v18.4S,v27.s[0] +mul v10.4S, v10.4S,v27.s[0] +sub v29.4s, v30.4s, v28.4s +ldr q13, [x0, #336] +add v30.4s, v30.4s, v28.4s +ldr q28, [x0, #464] +mla v18.4S, v15.4S, v31.s[0] +mla v10.4S, v0.4S, v31.s[0] +sub v0.4s, v5.4s, v6.4s +add v5.4s, v5.4s, v6.4s +sqrdmulh v6.4S, v30.4S, v4.s[1] +mul v30.4S, v30.4S,v16.s[1] +sub v15.4s, v13.4s, v2.4s +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v29.4S, v4.s[2] +mul v29.4S, v29.4S,v16.s[2] +sub v11.4s, v3.4s, v12.4s +add v3.4s, v3.4s, v12.4s +sqrdmulh v4.4S, v13.4S, v22.s[1] +mul v13.4S, v13.4S,v26.s[1] +sub v12.4s, v21.4s, v1.4s +add v21.4s, v21.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v22.s[2] +mul v15.4S, v15.4S,v26.s[2] +sub v16.4s, v23.4s, v18.4s +add v23.4s, v23.4s, v18.4s +mla v30.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v21.4S, v14.s[1] +sub v22.4s, v28.4s, v10.4s +ldr q18, [x0, #736] +add v28.4s, v28.4s, v10.4s +mla v29.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v12.4S, v14.s[2] +sub v10.4s, v25.4s, v30.4s +ldr q26, [x0, #672] +str q10, [x0, #272] +mla v13.4S, v4.4S, v31.s[0] +sqrdmulh v4.4S, v28.4S, v19.s[1] +add v25.4s, v25.4s, v30.4s +ldr q30, [x0, #544] +str q25, [x0, #256] +mla v15.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v22.4S, v19.s[2] +sub v25.4s, v20.4s, v29.4s +ldr q10, [x17, #+384] +str q25, [x0, #304] +mul v21.4S, v21.4S,v24.s[1] +mul v12.4S, v12.4S,v24.s[2] +add v20.4s, v20.4s, v29.4s +str q20, [x0, #288] +ldr q20, [x17, #+400] +mla v21.4S, v6.4S, v31.s[0] +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v5.4s, v13.4s +str q2, [x0, #336] +mul v28.4S, v28.4S,v27.s[1] +mul v22.4S, v22.4S,v27.s[2] +add v5.4s, v5.4s, v13.4s +str q5, [x0, #320] +mla v28.4S, v4.4S, v31.s[0] 
+mla v22.4S, v1.4S, v31.s[0] +sub v1.4s, v0.4s, v15.4s +str q1, [x0, #368] +sqrdmulh v19.4S, v30.4S, v20.s[0] +mul v30.4S, v30.4S,v10.s[0] +add v0.4s, v0.4s, v15.4s +ldr q15, [x0, #560] +str q0, [x0, #352] +ldr q0, [x17, #+416] +sqrdmulh v1.4S, v15.4S, v20.s[0] +mul v15.4S, v15.4S,v10.s[0] +sub v27.4s, v3.4s, v21.4s +ldr q4, [x17, #+432] +str q27, [x0, #400] +ldr q27, [x0, #608] +sqrdmulh v5.4S, v27.4S, v4.s[0] +mul v27.4S, v27.4S,v0.s[0] +add v3.4s, v3.4s, v21.4s +str q3, [x0, #384] +ldr q3, [x0, #624] +sqrdmulh v21.4S, v3.4S, v4.s[0] +mul v3.4S, v3.4S,v0.s[0] +sub v13.4s, v11.4s, v12.4s +ldr q14, [x17, #+448] +str q13, [x0, #432] +ldr q13, [x17, #+464] +mla v30.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v26.4S, v13.s[0] +add v11.4s, v11.4s, v12.4s +ldr q12, [x0, #688] +str q11, [x0, #416] +mla v15.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v12.4S, v13.s[0] +sub v11.4s, v23.4s, v28.4s +ldr q2, [x17, #+480] +str q11, [x0, #464] +ldr q11, [x17, #+496] +mla v27.4S, v5.4S, v31.s[0] +sqrdmulh v5.4S, v18.4S, v11.s[0] +add v23.4s, v23.4s, v28.4s +str q23, [x0, #448] +ldr q23, [x0, #752] +mla v3.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v23.4S, v11.s[0] +sub v28.4s, v16.4s, v22.4s +ldr q24, [x0, #512] +str q28, [x0, #496] +ldr q28, [x0, #640] +mul v26.4S, v26.4S,v14.s[0] +mul v12.4S, v12.4S,v14.s[0] +add v16.4s, v16.4s, v22.4s +ldr q22, [x0, #528] +str q16, [x0, #480] +ldr q16, [x0, #656] +mla v26.4S, v19.4S, v31.s[0] +mla v12.4S, v1.4S, v31.s[0] +sub v1.4s, v24.4s, v30.4s +ldr q19, [x0, #576] +add v24.4s, v24.4s, v30.4s +ldr q30, [x0, #704] +mul v18.4S, v18.4S,v2.s[0] +mul v23.4S, v23.4S,v2.s[0] +sub v6.4s, v22.4s, v15.4s +ldr q29, [x0, #592] +add v22.4s, v22.4s, v15.4s +ldr q15, [x0, #720] +mla v18.4S, v5.4S, v31.s[0] +mla v23.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v27.4s +add v19.4s, v19.4s, v27.4s +sqrdmulh v27.4S, v22.4S, v20.s[1] +mul v22.4S, v22.4S,v10.s[1] +sub v5.4s, v29.4s, v3.4s +add v29.4s, v29.4s, v3.4s +sqrdmulh v3.4S, v6.4S, v20.s[2] +mul v6.4S, v6.4S,v10.s[2] +sub 
v25.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sqrdmulh v20.4S, v29.4S, v4.s[1] +mul v29.4S, v29.4S,v0.s[1] +sub v26.4s, v16.4s, v12.4s +add v16.4s, v16.4s, v12.4s +sqrdmulh v12.4S, v5.4S, v4.s[2] +mul v5.4S, v5.4S,v0.s[2] +sub v10.4s, v30.4s, v18.4s +add v30.4s, v30.4s, v18.4s +mla v22.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v16.4S, v13.s[1] +sub v4.4s, v15.4s, v23.4s +ldr q18, [x0, #992] +add v15.4s, v15.4s, v23.4s +mla v6.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v26.4S, v13.s[2] +sub v23.4s, v24.4s, v22.4s +ldr q0, [x0, #928] +str q23, [x0, #528] +mla v29.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v15.4S, v11.s[1] +add v24.4s, v24.4s, v22.4s +ldr q22, [x0, #800] +str q24, [x0, #512] +mla v5.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v4.4S, v11.s[2] +sub v24.4s, v1.4s, v6.4s +ldr q23, [x17, #+512] +str q24, [x0, #560] +mul v16.4S, v16.4S,v14.s[1] +mul v26.4S, v26.4S,v14.s[2] +add v1.4s, v1.4s, v6.4s +str q1, [x0, #544] +ldr q1, [x17, #+528] +mla v16.4S, v27.4S, v31.s[0] +mla v26.4S, v3.4S, v31.s[0] +sub v3.4s, v19.4s, v29.4s +str q3, [x0, #592] +mul v15.4S, v15.4S,v2.s[1] +mul v4.4S, v4.4S,v2.s[2] +add v19.4s, v19.4s, v29.4s +str q19, [x0, #576] +mla v15.4S, v20.4S, v31.s[0] +mla v4.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v5.4s +str q12, [x0, #624] +sqrdmulh v11.4S, v22.4S, v1.s[0] +mul v22.4S, v22.4S,v23.s[0] +add v21.4s, v21.4s, v5.4s +ldr q5, [x0, #816] +str q21, [x0, #608] +ldr q21, [x17, #+544] +sqrdmulh v12.4S, v5.4S, v1.s[0] +mul v5.4S, v5.4S,v23.s[0] +sub v2.4s, v28.4s, v16.4s +ldr q20, [x17, #+560] +str q2, [x0, #656] +ldr q2, [x0, #864] +sqrdmulh v19.4S, v2.4S, v20.s[0] +mul v2.4S, v2.4S,v21.s[0] +add v28.4s, v28.4s, v16.4s +str q28, [x0, #640] +ldr q28, [x0, #880] +sqrdmulh v16.4S, v28.4S, v20.s[0] +mul v28.4S, v28.4S,v21.s[0] +sub v29.4s, v25.4s, v26.4s +ldr q13, [x17, #+576] +str q29, [x0, #688] +ldr q29, [x17, #+592] +mla v22.4S, v11.4S, v31.s[0] +sqrdmulh v11.4S, v0.4S, v29.s[0] +add v25.4s, v25.4s, v26.4s +ldr q26, [x0, #944] +str q25, [x0, #672] +mla 
v5.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v26.4S, v29.s[0] +sub v25.4s, v30.4s, v15.4s +ldr q3, [x17, #+608] +str q25, [x0, #720] +ldr q25, [x17, #+624] +mla v2.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v18.4S, v25.s[0] +add v30.4s, v30.4s, v15.4s +str q30, [x0, #704] +ldr q30, [x0, #1008] +mla v28.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v30.4S, v25.s[0] +sub v15.4s, v10.4s, v4.4s +ldr q14, [x0, #768] +str q15, [x0, #752] +ldr q15, [x0, #896] +mul v0.4S, v0.4S,v13.s[0] +mul v26.4S, v26.4S,v13.s[0] +add v10.4s, v10.4s, v4.4s +ldr q4, [x0, #784] +str q10, [x0, #736] +ldr q10, [x0, #912] +mla v0.4S, v11.4S, v31.s[0] +mla v26.4S, v12.4S, v31.s[0] +sub v12.4s, v14.4s, v22.4s +ldr q11, [x0, #832] +add v14.4s, v14.4s, v22.4s +ldr q22, [x0, #960] +mul v18.4S, v18.4S,v3.s[0] +mul v30.4S, v30.4S,v3.s[0] +sub v27.4s, v4.4s, v5.4s +ldr q6, [x0, #848] +add v4.4s, v4.4s, v5.4s +ldr q5, [x0, #976] +mla v18.4S, v19.4S, v31.s[0] +mla v30.4S, v16.4S, v31.s[0] +sub v16.4s, v11.4s, v2.4s +add v11.4s, v11.4s, v2.4s +sqrdmulh v2.4S, v4.4S, v1.s[1] +mul v4.4S, v4.4S,v23.s[1] +sub v19.4s, v6.4s, v28.4s +add v6.4s, v6.4s, v28.4s +sqrdmulh v28.4S, v27.4S, v1.s[2] +mul v27.4S, v27.4S,v23.s[2] +sub v24.4s, v15.4s, v0.4s +add v15.4s, v15.4s, v0.4s +sqrdmulh v1.4S, v6.4S, v20.s[1] +mul v6.4S, v6.4S,v21.s[1] +sub v0.4s, v10.4s, v26.4s +add v10.4s, v10.4s, v26.4s +sqrdmulh v26.4S, v19.4S, v20.s[2] +mul v19.4S, v19.4S,v21.s[2] +sub v23.4s, v22.4s, v18.4s +add v22.4s, v22.4s, v18.4s +mla v4.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v10.4S, v29.s[1] +sub v20.4s, v5.4s, v30.4s +add v5.4s, v5.4s, v30.4s +mla v27.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v0.4S, v29.s[2] +sub v30.4s, v14.4s, v4.4s +str q30, [x0, #784] +mla v6.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v5.4S, v25.s[1] +add v14.4s, v14.4s, v4.4s +str q14, [x0, #768] +mla v19.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v20.4S, v25.s[2] +sub v14.4s, v12.4s, v27.4s +str q14, [x0, #816] +mul v10.4S, v10.4S,v13.s[1] +mul v0.4S, v0.4S,v13.s[2] +add v12.4s, v12.4s, 
v27.4s +str q12, [x0, #800] +mla v10.4S, v2.4S, v31.s[0] +mla v0.4S, v28.4S, v31.s[0] +sub v28.4s, v11.4s, v6.4s +str q28, [x0, #848] +mul v5.4S, v5.4S,v3.s[1] +mul v20.4S, v20.4S,v3.s[2] +add v11.4s, v11.4s, v6.4s +str q11, [x0, #832] +mla v5.4S, v1.4S, v31.s[0] +mla v20.4S, v26.4S, v31.s[0] +sub v26.4s, v16.4s, v19.4s +str q26, [x0, #880] +add v16.4s, v16.4s, v19.4s +str q16, [x0, #864] +sub v16.4s, v15.4s, v10.4s +str q16, [x0, #912] +add v15.4s, v15.4s, v10.4s +str q15, [x0, #896] +sub v15.4s, v24.4s, v0.4s +str q15, [x0, #944] +add v24.4s, v24.4s, v0.4s +str q24, [x0, #928] +sub v24.4s, v22.4s, v5.4s +str q24, [x0, #976] +add v22.4s, v22.4s, v5.4s +str q22, [x0, #960] +sub v22.4s, v23.4s, v20.4s +str q22, [x0, #1008] +add v23.4s, v23.4s, v20.4s +str q23, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_2.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_2.s new file mode 100644 index 0000000..b48ef69 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_2.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to 
whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: // One quadword; only word 0 is non-zero and it holds -33556993. The routine below reads it into v31 through x17 and uses v31.s[0] as the multiplier in its mla steps. +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 // 2^6 = 64-byte alignment for the table that follows (power-of-two operand on AArch64). +roots_merged: // Laid out in 8-word groups: four constants, then four companion words in the same layer/block order. The routine below feeds the first quadword of a group to mul and the second to sqrdmulh, selecting lanes with .s[i]. NOTE(review): each companion looks like roughly constant * 2^31 / 33556993 (e.g. 17702291 and 1132860160); confirm against the generator. +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, 
block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 
6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 
// Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global ntt_u32_incomplete_neon_asm_var_4_2_7_z4_2 +.global _ntt_u32_incomplete_neon_asm_var_4_2_7_z4_2 +ntt_u32_incomplete_neon_asm_var_4_2_7_z4_2: +_ntt_u32_incomplete_neon_asm_var_4_2_7_z4_2: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #928] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #992] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q18, [x0, #800] +sqrdmulh v17.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +ldr q16, [x0, #864] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v22.4S, v21.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +mla 
v18.4S, v17.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +ldr q3, [x0, #544] +sqrdmulh v17.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q19, [x0, #608] +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q2, [x0, #672] +ldr q1, [x0, #416] +sqrdmulh v0.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v15.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #736] +ldr q14, [x0, #480] +sqrdmulh v13.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v12.4s, v14.4s, v20.4s +add v14.4s, v14.4s, v20.4s +ldr q20, [x0, #288] +mla v3.4S, v17.4S, v31.s[0] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v18.4s +mla v2.4S, v0.4S, v31.s[0] +mla v22.4S, v13.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +ldr q18, [x0, #352] +sqrdmulh v13.4S, v1.4S, v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v0.4s, v18.4s, v16.4s +sqrdmulh v17.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +add v18.4s, v18.4s, v16.4s +ldr q16, [x0, #32] +sqrdmulh v11.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v10.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #96] +sqrdmulh v9.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v8.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #160] +mla v1.4S, v13.4S, v31.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v2.4s +mla v20.4S, v11.4S, v31.s[0] +mla v18.4S, v9.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +ldr q2, [x0, #224] +sqrdmulh v9.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v11.4s, v2.4s, v22.4s +sqrdmulh v13.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v7.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +sub v6.4s, v2.4s, v14.4s +add v2.4s, v2.4s, v14.4s +mla v15.4S, v9.4S, v31.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v20.4s +mla v21.4S, v22.4S, v31.s[0] +mla v0.4S, v1.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s 
+sqrdmulh v20.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +sub v1.4s, v3.4s, v18.4s +sqrdmulh v22.4S, v6.4S, v27.s[1] +mul v6.4S, v6.4S,v28.s[1] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v27.s[0] +mul v19.4S, v19.4S,v28.s[0] +sub v9.4s, v17.4s, v15.4s +add v17.4s, v17.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v14.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +mla v7.4S, v20.4S, v31.s[0] +mla v6.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v21.4s +mla v19.4S, v18.4S, v31.s[0] +mla v2.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v15.4s, v8.4s, v0.4s +sqrdmulh v18.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +add v8.4s, v8.4s, v0.4s +sqrdmulh v0.4S, v9.4S, v27.s[3] +mul v9.4S, v9.4S,v28.s[3] +sub v20.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[3] +mul v14.4S, v14.4S,v28.s[3] +sub v12.4s, v1.4s, v6.4s +add v1.4s, v1.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v16.4s, v19.4s +mla v9.4S, v0.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v25.s[2] +mul v1.4S, v1.4S,v26.s[2] +sub v7.4s, v3.4s, v2.4s +sqrdmulh v0.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v7.4S, v25.s[1] +mul v7.4S, v7.4S,v26.s[1] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v25.s[0] +mul v3.4S, v3.4S,v26.s[0] +sub v6.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v1.4S, v19.4S, v31.s[0] +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v9.4s +mla v7.4S, v2.4S, v31.s[0] +mla v3.4S, v17.4S, v31.s[0] +add v22.4s, v22.4s, v9.4s +sqrdmulh v9.4S, v8.4S, v23.s[0] +mul v8.4S, v8.4S,v24.s[0] +sub v17.4s, v15.4s, v14.4s +sqrdmulh v2.4S, v6.4S, v23.s[1] +mul v6.4S, v6.4S,v24.s[1] +add v15.4s, v15.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v23.s[2] +mul v15.4S, v15.4S,v24.s[2] +sub v19.4s, v13.4s, 
v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +sub v11.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +mla v8.4S, v9.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v18.4s, v7.4s +str q13, [x0, #288] +mla v15.4S, v14.4S, v31.s[0] +mla v17.4S, v1.4S, v31.s[0] +add v18.4s, v18.4s, v7.4s +str q19, [x0, #352] +ldr q19, [x0, #944] +sqrdmulh v7.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v1.4s, v16.4s, v3.4s +str q20, [x0, #416] +ldr q20, [x0, #1008] +sqrdmulh v14.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v3.4s +str q11, [x0, #480] +ldr q11, [x0, #816] +sqrdmulh v3.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +sub v13.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +ldr q8, [x0, #880] +sqrdmulh v9.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v12.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +mla v19.4S, v7.4S, v31.s[0] +mla v20.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v15.4s +str q18, [x0, #160] +mla v11.4S, v3.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +str q2, [x0, #224] +ldr q2, [x0, #560] +sqrdmulh v15.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v9.4s, v0.4s, v17.4s +str q16, [x0, #32] +ldr q16, [x0, #624] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +add v0.4s, v0.4s, v17.4s +str q1, [x0, #96] +ldr q1, [x0, #688] +ldr q17, [x0, #432] +sqrdmulh v18.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +sub v7.4s, v17.4s, v19.4s +add v17.4s, v17.4s, v19.4s +ldr q19, [x0, #752] +ldr q6, [x0, #496] +sqrdmulh v5.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v4.4s, v6.4s, v20.4s +add v6.4s, v6.4s, v20.4s +ldr q20, [x0, #304] +mla v2.4S, v15.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v11.4s +str q10, [x0, #544] +mla v1.4S, v18.4S, v31.s[0] +mla v19.4S, v5.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q13, [x0, #608] +ldr q13, [x0, #368] +sqrdmulh v11.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] 
+sub v5.4s, v13.4s, v8.4s +str q21, [x0, #672] +sqrdmulh v21.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +add v13.4s, v13.4s, v8.4s +str q12, [x0, #736] +ldr q12, [x0, #48] +sqrdmulh v8.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v18.4s, v12.4s, v2.4s +add v12.4s, v12.4s, v2.4s +ldr q2, [x0, #112] +sqrdmulh v10.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v15.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +ldr q16, [x0, #176] +mla v17.4S, v11.4S, v31.s[0] +mla v6.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v1.4s +str q22, [x0, #800] +mla v20.4S, v8.4S, v31.s[0] +mla v13.4S, v10.4S, v31.s[0] +add v16.4s, v16.4s, v1.4s +str q14, [x0, #864] +ldr q14, [x0, #240] +sqrdmulh v1.4S, v7.4S, v29.s[2] +mul v7.4S, v7.4S,v30.s[2] +sub v10.4s, v14.4s, v19.4s +str q0, [x0, #928] +sqrdmulh v0.4S, v4.4S, v29.s[2] +mul v4.4S, v4.4S,v30.s[2] +add v14.4s, v14.4s, v19.4s +str q9, [x0, #992] +sqrdmulh v9.4S, v3.4S, v29.s[2] +mul v3.4S, v3.4S,v30.s[2] +sub v19.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v29.s[2] +mul v5.4S, v5.4S,v30.s[2] +sub v8.4s, v14.4s, v6.4s +add v14.4s, v14.4s, v6.4s +mla v7.4S, v1.4S, v31.s[0] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v20.4s +mla v3.4S, v9.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +sub v17.4s, v2.4s, v13.4s +sqrdmulh v9.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +add v2.4s, v2.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v27.s[0] +mul v16.4S, v16.4S,v28.s[0] +sub v1.4s, v21.4s, v7.4s +add v21.4s, v21.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v6.4s, v10.4s, v4.4s +add v10.4s, v10.4s, v4.4s +mla v19.4S, v20.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v3.4s +mla v16.4S, v13.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +sub v7.4s, v15.4s, v5.4s +sqrdmulh v13.4S, v10.4S, 
v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +add v15.4s, v15.4s, v5.4s +sqrdmulh v5.4S, v1.4S, v27.s[3] +mul v1.4S, v1.4S,v28.s[3] +sub v20.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[3] +mul v6.4S, v6.4S,v28.s[3] +sub v4.4s, v17.4s, v8.4s +add v17.4s, v17.4s, v8.4s +mla v21.4S, v3.4S, v31.s[0] +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v16.4s +mla v1.4S, v5.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v25.s[2] +mul v17.4S, v17.4S,v26.s[2] +sub v19.4s, v2.4s, v14.4s +sqrdmulh v5.4S, v4.4S, v25.s[3] +mul v4.4S, v4.4S,v26.s[3] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v3.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +mla v17.4S, v16.4S, v31.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v9.4s, v1.4s +mla v19.4S, v14.4S, v31.s[0] +mla v2.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v23.s[0] +mul v15.4S, v15.4S,v24.s[0] +sub v21.4s, v7.4s, v6.4s +sqrdmulh v14.4S, v8.4S, v23.s[1] +mul v8.4S, v8.4S,v24.s[1] +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v7.4S, v23.s[2] +mul v7.4S, v7.4S,v24.s[2] +sub v16.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v23.s[3] +mul v21.4S, v21.4S,v24.s[3] +sub v10.4s, v20.4s, v4.4s +add v20.4s, v20.4s, v4.4s +mla v15.4S, v1.4S, v31.s[0] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v19.4s +str q0, [x0, #304] +mla v7.4S, v6.4S, v31.s[0] +mla v21.4S, v17.4S, v31.s[0] +add v13.4s, v13.4s, v19.4s +str q16, [x0, #368] +ldr q16, [x0, #896] +sqrdmulh v19.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v17.4s, v12.4s, v2.4s +str q20, [x0, #432] +ldr q20, [x0, #960] +sqrdmulh v6.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v12.4s, v12.4s, v2.4s +str q10, [x0, #496] +ldr q10, [x0, #768] +sqrdmulh v2.4S, v10.4S, v29.s[0] +mul v10.4S, 
v10.4S,v30.s[0] +sub v0.4s, v18.4s, v15.4s +add v18.4s, v18.4s, v15.4s +ldr q15, [x0, #832] +sqrdmulh v1.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +sub v4.4s, v3.4s, v8.4s +add v3.4s, v3.4s, v8.4s +mla v16.4S, v19.4S, v31.s[0] +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v7.4s +str q13, [x0, #176] +mla v10.4S, v2.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v7.4s +str q14, [x0, #240] +ldr q14, [x0, #512] +sqrdmulh v7.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v1.4s, v5.4s, v21.4s +str q12, [x0, #48] +ldr q12, [x0, #576] +sqrdmulh v2.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +add v5.4s, v5.4s, v21.4s +str q17, [x0, #112] +ldr q17, [x0, #640] +ldr q21, [x0, #384] +sqrdmulh v13.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] +sub v19.4s, v21.4s, v16.4s +add v21.4s, v21.4s, v16.4s +ldr q16, [x0, #704] +ldr q8, [x0, #448] +sqrdmulh v22.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v11.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +ldr q20, [x0, #256] +mla v14.4S, v7.4S, v31.s[0] +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v10.4s +str q18, [x0, #560] +mla v17.4S, v13.4S, v31.s[0] +mla v16.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +str q0, [x0, #624] +ldr q0, [x0, #320] +sqrdmulh v10.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v22.4s, v0.4s, v15.4s +str q3, [x0, #688] +sqrdmulh v3.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +add v0.4s, v0.4s, v15.4s +str q4, [x0, #752] +ldr q4, [x0, #0] +sqrdmulh v15.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v13.4s, v4.4s, v14.4s +add v4.4s, v4.4s, v14.4s +ldr q14, [x0, #64] +sqrdmulh v18.4S, v0.4S, v29.s[1] +mul v0.4S, v0.4S,v30.s[1] +sub v7.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +ldr q12, [x0, #128] +mla v21.4S, v10.4S, v31.s[0] +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v12.4s, v17.4s +str q9, [x0, #816] +mla v20.4S, v15.4S, v31.s[0] +mla v0.4S, v18.4S, v31.s[0] +add v12.4s, v12.4s, v17.4s +str q6, [x0, #880] +ldr q6, [x0, #192] 
+sqrdmulh v17.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v18.4s, v6.4s, v16.4s +str q5, [x0, #944] +sqrdmulh v5.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +add v6.4s, v6.4s, v16.4s +str q1, [x0, #1008] +sqrdmulh v1.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v16.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v15.4s, v6.4s, v8.4s +add v6.4s, v6.4s, v8.4s +mla v19.4S, v17.4S, v31.s[0] +mla v11.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v20.4s +mla v2.4S, v1.4S, v31.s[0] +mla v22.4S, v21.4S, v31.s[0] +add v4.4s, v4.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +sub v21.4s, v14.4s, v0.4s +sqrdmulh v1.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v17.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[0] +mul v6.4S, v6.4S,v28.s[0] +sub v8.4s, v18.4s, v11.4s +add v18.4s, v18.4s, v11.4s +mla v16.4S, v20.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v2.4s +mla v12.4S, v0.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v27.s[2] +mul v3.4S, v3.4S,v28.s[2] +sub v19.4s, v7.4s, v22.4s +sqrdmulh v0.4S, v18.4S, v27.s[2] +mul v18.4S, v18.4S,v28.s[2] +add v7.4s, v7.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +sub v20.4s, v5.4s, v16.4s +add v5.4s, v5.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v11.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +mla v3.4S, v2.4S, v31.s[0] +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v4.4s, v12.4s +mla v17.4S, v22.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v4.4s, v4.4s, v12.4s +sqrdmulh v12.4S, v21.4S, v25.s[2] +mul v21.4S, v21.4S,v26.s[2] +sub v16.4s, v14.4s, v6.4s +sqrdmulh v22.4S, v11.4S, v25.s[3] +mul v11.4S, v11.4S,v26.s[3] +add v14.4s, v14.4s, v6.4s +sqrdmulh v6.4S, v16.4S, v25.s[1] 
+mul v16.4S, v16.4S,v26.s[1] +sub v2.4s, v13.4s, v3.4s +add v13.4s, v13.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v25.s[0] +mul v14.4S, v14.4S,v26.s[0] +sub v15.4s, v7.4s, v18.4s +add v7.4s, v7.4s, v18.4s +mla v21.4S, v12.4S, v31.s[0] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v17.4s +mla v16.4S, v6.4S, v31.s[0] +mla v14.4S, v3.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v7.4S, v23.s[0] +mul v7.4S, v7.4S,v24.s[0] +sub v3.4s, v19.4s, v8.4s +sqrdmulh v6.4S, v15.4S, v23.s[1] +mul v15.4S, v15.4S,v24.s[1] +add v19.4s, v19.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v23.s[2] +mul v19.4S, v19.4S,v24.s[2] +sub v12.4s, v5.4s, v21.4s +add v5.4s, v5.4s, v21.4s +sqrdmulh v21.4S, v3.4S, v23.s[3] +mul v3.4S, v3.4S,v24.s[3] +sub v18.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +mla v7.4S, v17.4S, v31.s[0] +mla v15.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v16.4s +str q5, [x0, #256] +mla v19.4S, v8.4S, v31.s[0] +mla v3.4S, v21.4S, v31.s[0] +add v0.4s, v0.4s, v16.4s +str q12, [x0, #320] +ldr q12, [x0, #912] +sqrdmulh v16.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v21.4s, v4.4s, v14.4s +str q20, [x0, #384] +ldr q20, [x0, #976] +sqrdmulh v8.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v4.4s, v4.4s, v14.4s +str q18, [x0, #448] +ldr q18, [x0, #784] +sqrdmulh v14.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +sub v5.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +ldr q7, [x0, #848] +sqrdmulh v17.4S, v7.4S, v29.s[0] +mul v7.4S, v7.4S,v30.s[0] +sub v11.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +mla v12.4S, v16.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v1.4s, v19.4s +str q0, [x0, #128] +mla v18.4S, v14.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v19.4s +str q6, [x0, #192] +ldr q6, [x0, #528] +sqrdmulh v19.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v17.4s, v22.4s, v3.4s +str q4, [x0, #0] +ldr q4, [x0, #592] +sqrdmulh v14.4S, v4.4S, v29.s[0] +mul v4.4S, v4.4S,v30.s[0] +add v22.4s, v22.4s, v3.4s +str q21, [x0, #64] +ldr q21, 
[x0, #656] +ldr q3, [x0, #400] +sqrdmulh v0.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v16.4s, v3.4s, v12.4s +add v3.4s, v3.4s, v12.4s +ldr q12, [x0, #720] +ldr q15, [x0, #464] +sqrdmulh v9.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v10.4s, v15.4s, v20.4s +add v15.4s, v15.4s, v20.4s +ldr q20, [x0, #272] +mla v6.4S, v19.4S, v31.s[0] +mla v4.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v18.4s +str q13, [x0, #512] +mla v21.4S, v0.4S, v31.s[0] +mla v12.4S, v9.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +str q5, [x0, #576] +ldr q5, [x0, #336] +sqrdmulh v18.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v9.4s, v5.4s, v7.4s +str q2, [x0, #640] +sqrdmulh v2.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +add v5.4s, v5.4s, v7.4s +str q11, [x0, #704] +ldr q11, [x0, #16] +sqrdmulh v7.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v0.4s, v11.4s, v6.4s +add v11.4s, v11.4s, v6.4s +ldr q6, [x0, #80] +sqrdmulh v13.4S, v5.4S, v29.s[1] +mul v5.4S, v5.4S,v30.s[1] +sub v19.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +ldr q4, [x0, #144] +mla v3.4S, v18.4S, v31.s[0] +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v4.4s, v21.4s +str q1, [x0, #768] +mla v20.4S, v7.4S, v31.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v4.4s, v4.4s, v21.4s +str q8, [x0, #832] +ldr q8, [x0, #208] +sqrdmulh v21.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +sub v13.4s, v8.4s, v12.4s +str q22, [x0, #896] +sqrdmulh v22.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +add v8.4s, v8.4s, v12.4s +str q17, [x0, #960] +sqrdmulh v17.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v12.4s, v4.4s, v3.4s +add v4.4s, v4.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v7.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +mla v16.4S, v21.4S, v31.s[0] +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v20.4s +mla v14.4S, v17.4S, v31.s[0] +mla v9.4S, v3.4S, v31.s[0] +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v12.4S, v27.s[1] +mul v12.4S, v12.4S,v28.s[1] +sub v3.4s, v6.4s, 
v5.4s +sqrdmulh v17.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v27.s[0] +mul v4.4S, v4.4S,v28.s[0] +sub v21.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[0] +mul v8.4S, v8.4S,v28.s[0] +sub v15.4s, v13.4s, v10.4s +add v13.4s, v13.4s, v10.4s +mla v12.4S, v20.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v14.4s +mla v4.4S, v5.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v16.4s, v19.4s, v9.4s +sqrdmulh v5.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v10.4s, v3.4s, v7.4s +add v3.4s, v3.4s, v7.4s +mla v2.4S, v14.4S, v31.s[0] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v4.4s +mla v21.4S, v9.4S, v31.s[0] +mla v15.4S, v12.4S, v31.s[0] +add v11.4s, v11.4s, v4.4s +sqrdmulh v4.4S, v3.4S, v25.s[2] +mul v3.4S, v3.4S,v26.s[2] +sub v12.4s, v6.4s, v8.4s +sqrdmulh v9.4S, v10.4S, v25.s[3] +mul v10.4S, v10.4S,v26.s[3] +add v6.4s, v6.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +sub v14.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v6.4S, v25.s[0] +mul v6.4S, v6.4S,v26.s[0] +sub v7.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +mla v3.4S, v4.4S, v31.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v21.4s +mla v12.4S, v8.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v23.s[0] +mul v19.4S, v19.4S,v24.s[0] +sub v2.4s, v16.4s, v15.4s +sqrdmulh v8.4S, v7.4S, v23.s[1] +mul v7.4S, v7.4S,v24.s[1] +add v16.4s, v16.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +sub v4.4s, v22.4s, v3.4s +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v2.4S, v23.s[3] +mul v2.4S, 
v2.4S,v24.s[3] +sub v13.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +mla v19.4S, v21.4S, v31.s[0] +mla v7.4S, v8.4S, v31.s[0] +sub v8.4s, v5.4s, v12.4s +str q22, [x0, #272] +mla v16.4S, v15.4S, v31.s[0] +mla v2.4S, v3.4S, v31.s[0] +add v5.4s, v5.4s, v12.4s +str q4, [x0, #336] +sub v23.4s, v11.4s, v6.4s +str q20, [x0, #400] +add v11.4s, v11.4s, v6.4s +str q13, [x0, #464] +sub v13.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sub v19.4s, v14.4s, v7.4s +add v14.4s, v14.4s, v7.4s +sub v7.4s, v17.4s, v16.4s +str q5, [x0, #144] +add v17.4s, v17.4s, v16.4s +str q8, [x0, #208] +sub v8.4s, v9.4s, v2.4s +str q11, [x0, #16] +add v9.4s, v9.4s, v2.4s +str q23, [x0, #80] +str q0, [x0, #528] +str q13, [x0, #592] +str q14, [x0, #656] +str q19, [x0, #720] +str q17, [x0, #784] +str q7, [x0, #848] +str q9, [x0, #912] +str q8, [x0, #976] +ldr q18, [x17, #+128] +ldr q1, [x17, #+144] +ldr q10, [x17, #+160] +ldr q21, [x17, #+176] +ldr q22, [x17, #+192] +ldr q15, [x17, #+208] +ldr q3, [x17, #+224] +ldr q12, [x17, #+240] +ldr q4, [x0, #32] +ldr q30, [x0, #48] +ldr q29, [x0, #0] +ldr q28, [x0, #96] +ldr q27, [x0, #112] +ldr q26, [x0, #64] +ldr q25, [x0, #160] +ldr q24, [x0, #176] +ldr q20, [x0, #128] +ldr q6, [x0, #224] +ldr q5, [x0, #240] +ldr q16, [x0, #192] +sqrdmulh v11.4S, v4.4S, v1.s[0] +mul v4.4S, v4.4S,v18.s[0] +mla v4.4S, v11.4S, v31.s[0] +sub v11.4s, v29.4s, v4.4s +add v29.4s, v29.4s, v4.4s +ldr q4, [x0, #16] +sqrdmulh v2.4S, v28.4S, v21.s[0] +mul v28.4S, v28.4S,v10.s[0] +mla v28.4S, v2.4S, v31.s[0] +sub v2.4s, v26.4s, v28.4s +add v26.4s, v26.4s, v28.4s +ldr q28, [x0, #80] +sqrdmulh v23.4S, v25.4S, v15.s[0] +mul v25.4S, v25.4S,v22.s[0] +mla v25.4S, v23.4S, v31.s[0] +sub v23.4s, v20.4s, v25.4s +add v20.4s, v20.4s, v25.4s +ldr q25, [x0, #144] +sqrdmulh v0.4S, v6.4S, v12.s[0] +mul v6.4S, v6.4S,v3.s[0] +mla v6.4S, v0.4S, v31.s[0] +sub v0.4s, v16.4s, v6.4s +add v16.4s, v16.4s, v6.4s +ldr q6, [x0, #208] +sqrdmulh v13.4S, v30.4S, v1.s[0] +mul v30.4S, v30.4S,v18.s[0] +mla v30.4S, 
v13.4S, v31.s[0] +sub v13.4s, v4.4s, v30.4s +add v4.4s, v4.4s, v30.4s +sqrdmulh v30.4S, v27.4S, v21.s[0] +mul v27.4S, v27.4S,v10.s[0] +mla v27.4S, v30.4S, v31.s[0] +sub v30.4s, v28.4s, v27.4s +add v28.4s, v28.4s, v27.4s +sqrdmulh v27.4S, v24.4S, v15.s[0] +mul v24.4S, v24.4S,v22.s[0] +mla v24.4S, v27.4S, v31.s[0] +sub v27.4s, v25.4s, v24.4s +add v25.4s, v25.4s, v24.4s +sqrdmulh v24.4S, v5.4S, v12.s[0] +mul v5.4S, v5.4S,v3.s[0] +mla v5.4S, v24.4S, v31.s[0] +sub v24.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v1.s[1] +mul v4.4S, v4.4S,v18.s[1] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v29.4s, v4.4s +add v29.4s, v29.4s, v4.4s +sqrdmulh v4.4S, v28.4S, v21.s[1] +mul v28.4S, v28.4S,v10.s[1] +mla v28.4S, v4.4S, v31.s[0] +sub v4.4s, v26.4s, v28.4s +add v26.4s, v26.4s, v28.4s +str q29, [x0, #0] +str q5, [x0, #16] +sqrdmulh v5.4S, v25.4S, v15.s[1] +mul v25.4S, v25.4S,v22.s[1] +mla v25.4S, v5.4S, v31.s[0] +sub v5.4s, v20.4s, v25.4s +add v20.4s, v20.4s, v25.4s +str q26, [x0, #64] +str q4, [x0, #80] +sqrdmulh v4.4S, v6.4S, v12.s[1] +mul v6.4S, v6.4S,v3.s[1] +mla v6.4S, v4.4S, v31.s[0] +sub v4.4s, v16.4s, v6.4s +add v16.4s, v16.4s, v6.4s +str q20, [x0, #128] +str q5, [x0, #144] +sqrdmulh v5.4S, v13.4S, v1.s[2] +mul v13.4S, v13.4S,v18.s[2] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +str q16, [x0, #192] +str q4, [x0, #208] +ldr q1, [x17, #+256] +ldr q18, [x17, #+272] +sqrdmulh v4.4S, v30.4S, v21.s[2] +mul v30.4S, v30.4S,v10.s[2] +mla v30.4S, v4.4S, v31.s[0] +sub v4.4s, v2.4s, v30.4s +add v2.4s, v2.4s, v30.4s +ldr q21, [x17, #+288] +ldr q10, [x17, #+304] +sqrdmulh v30.4S, v27.4S, v15.s[2] +mul v27.4S, v27.4S,v22.s[2] +mla v27.4S, v30.4S, v31.s[0] +sub v30.4s, v23.4s, v27.4s +add v23.4s, v23.4s, v27.4s +ldr q15, [x17, #+320] +ldr q22, [x17, #+336] +sqrdmulh v27.4S, v24.4S, v12.s[2] +mul v24.4S, v24.4S,v3.s[2] +mla v24.4S, v27.4S, v31.s[0] +sub v27.4s, v0.4s, v24.4s +add v0.4s, v0.4s, v24.4s +ldr q12, [x17, #+352] +ldr q3, 
[x17, #+368] +str q11, [x0, #32] +str q5, [x0, #48] +str q2, [x0, #96] +str q4, [x0, #112] +str q23, [x0, #160] +str q30, [x0, #176] +str q0, [x0, #224] +str q27, [x0, #240] +ldr q27, [x0, #288] +ldr q0, [x0, #304] +ldr q30, [x0, #256] +ldr q23, [x0, #352] +ldr q4, [x0, #368] +ldr q2, [x0, #320] +ldr q5, [x0, #416] +ldr q11, [x0, #432] +ldr q24, [x0, #384] +ldr q16, [x0, #480] +ldr q13, [x0, #496] +ldr q20, [x0, #448] +sqrdmulh v6.4S, v27.4S, v18.s[0] +mul v27.4S, v27.4S,v1.s[0] +mla v27.4S, v6.4S, v31.s[0] +sub v6.4s, v30.4s, v27.4s +add v30.4s, v30.4s, v27.4s +ldr q27, [x0, #272] +sqrdmulh v26.4S, v23.4S, v10.s[0] +mul v23.4S, v23.4S,v21.s[0] +mla v23.4S, v26.4S, v31.s[0] +sub v26.4s, v2.4s, v23.4s +add v2.4s, v2.4s, v23.4s +ldr q23, [x0, #336] +sqrdmulh v25.4S, v5.4S, v22.s[0] +mul v5.4S, v5.4S,v15.s[0] +mla v5.4S, v25.4S, v31.s[0] +sub v25.4s, v24.4s, v5.4s +add v24.4s, v24.4s, v5.4s +ldr q5, [x0, #400] +sqrdmulh v29.4S, v16.4S, v3.s[0] +mul v16.4S, v16.4S,v12.s[0] +mla v16.4S, v29.4S, v31.s[0] +sub v29.4s, v20.4s, v16.4s +add v20.4s, v20.4s, v16.4s +ldr q16, [x0, #464] +sqrdmulh v28.4S, v0.4S, v18.s[0] +mul v0.4S, v0.4S,v1.s[0] +mla v0.4S, v28.4S, v31.s[0] +sub v28.4s, v27.4s, v0.4s +add v27.4s, v27.4s, v0.4s +sqrdmulh v0.4S, v4.4S, v10.s[0] +mul v4.4S, v4.4S,v21.s[0] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v23.4s, v4.4s +add v23.4s, v23.4s, v4.4s +sqrdmulh v4.4S, v11.4S, v22.s[0] +mul v11.4S, v11.4S,v15.s[0] +mla v11.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v11.4s +add v5.4s, v5.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v3.s[0] +mul v13.4S, v13.4S,v12.s[0] +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v16.4s, v13.4s +add v16.4s, v16.4s, v13.4s +sqrdmulh v13.4S, v27.4S, v18.s[1] +mul v27.4S, v27.4S,v1.s[1] +mla v27.4S, v13.4S, v31.s[0] +sub v13.4s, v30.4s, v27.4s +add v30.4s, v30.4s, v27.4s +sqrdmulh v27.4S, v23.4S, v10.s[1] +mul v23.4S, v23.4S,v21.s[1] +mla v23.4S, v27.4S, v31.s[0] +sub v27.4s, v2.4s, v23.4s +add v2.4s, v2.4s, v23.4s +str q30, [x0, #256] +str q13, [x0, 
#272] +sqrdmulh v13.4S, v5.4S, v22.s[1] +mul v5.4S, v5.4S,v15.s[1] +mla v5.4S, v13.4S, v31.s[0] +sub v13.4s, v24.4s, v5.4s +add v24.4s, v24.4s, v5.4s +str q2, [x0, #320] +str q27, [x0, #336] +sqrdmulh v27.4S, v16.4S, v3.s[1] +mul v16.4S, v16.4S,v12.s[1] +mla v16.4S, v27.4S, v31.s[0] +sub v27.4s, v20.4s, v16.4s +add v20.4s, v20.4s, v16.4s +str q24, [x0, #384] +str q13, [x0, #400] +sqrdmulh v13.4S, v28.4S, v18.s[2] +mul v28.4S, v28.4S,v1.s[2] +mla v28.4S, v13.4S, v31.s[0] +sub v13.4s, v6.4s, v28.4s +add v6.4s, v6.4s, v28.4s +str q20, [x0, #448] +str q27, [x0, #464] +ldr q18, [x17, #+384] +ldr q1, [x17, #+400] +sqrdmulh v27.4S, v0.4S, v10.s[2] +mul v0.4S, v0.4S,v21.s[2] +mla v0.4S, v27.4S, v31.s[0] +sub v27.4s, v26.4s, v0.4s +add v26.4s, v26.4s, v0.4s +ldr q10, [x17, #+416] +ldr q21, [x17, #+432] +sqrdmulh v0.4S, v4.4S, v22.s[2] +mul v4.4S, v4.4S,v15.s[2] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v25.4s, v4.4s +add v25.4s, v25.4s, v4.4s +ldr q22, [x17, #+448] +ldr q15, [x17, #+464] +sqrdmulh v4.4S, v11.4S, v3.s[2] +mul v11.4S, v11.4S,v12.s[2] +mla v11.4S, v4.4S, v31.s[0] +sub v4.4s, v29.4s, v11.4s +add v29.4s, v29.4s, v11.4s +ldr q3, [x17, #+480] +ldr q12, [x17, #+496] +str q6, [x0, #288] +str q13, [x0, #304] +str q26, [x0, #352] +str q27, [x0, #368] +str q25, [x0, #416] +str q0, [x0, #432] +str q29, [x0, #480] +str q4, [x0, #496] +ldr q4, [x0, #544] +ldr q29, [x0, #560] +ldr q0, [x0, #512] +ldr q25, [x0, #608] +ldr q27, [x0, #624] +ldr q26, [x0, #576] +ldr q13, [x0, #672] +ldr q6, [x0, #688] +ldr q11, [x0, #640] +ldr q20, [x0, #736] +ldr q28, [x0, #752] +ldr q24, [x0, #704] +sqrdmulh v16.4S, v4.4S, v1.s[0] +mul v4.4S, v4.4S,v18.s[0] +mla v4.4S, v16.4S, v31.s[0] +sub v16.4s, v0.4s, v4.4s +add v0.4s, v0.4s, v4.4s +ldr q4, [x0, #528] +sqrdmulh v2.4S, v25.4S, v21.s[0] +mul v25.4S, v25.4S,v10.s[0] +mla v25.4S, v2.4S, v31.s[0] +sub v2.4s, v26.4s, v25.4s +add v26.4s, v26.4s, v25.4s +ldr q25, [x0, #592] +sqrdmulh v5.4S, v13.4S, v15.s[0] +mul v13.4S, v13.4S,v22.s[0] +mla 
v13.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +ldr q13, [x0, #656] +sqrdmulh v30.4S, v20.4S, v12.s[0] +mul v20.4S, v20.4S,v3.s[0] +mla v20.4S, v30.4S, v31.s[0] +sub v30.4s, v24.4s, v20.4s +add v24.4s, v24.4s, v20.4s +ldr q20, [x0, #720] +sqrdmulh v23.4S, v29.4S, v1.s[0] +mul v29.4S, v29.4S,v18.s[0] +mla v29.4S, v23.4S, v31.s[0] +sub v23.4s, v4.4s, v29.4s +add v4.4s, v4.4s, v29.4s +sqrdmulh v29.4S, v27.4S, v21.s[0] +mul v27.4S, v27.4S,v10.s[0] +mla v27.4S, v29.4S, v31.s[0] +sub v29.4s, v25.4s, v27.4s +add v25.4s, v25.4s, v27.4s +sqrdmulh v27.4S, v6.4S, v15.s[0] +mul v6.4S, v6.4S,v22.s[0] +mla v6.4S, v27.4S, v31.s[0] +sub v27.4s, v13.4s, v6.4s +add v13.4s, v13.4s, v6.4s +sqrdmulh v6.4S, v28.4S, v12.s[0] +mul v28.4S, v28.4S,v3.s[0] +mla v28.4S, v6.4S, v31.s[0] +sub v6.4s, v20.4s, v28.4s +add v20.4s, v20.4s, v28.4s +sqrdmulh v28.4S, v4.4S, v1.s[1] +mul v4.4S, v4.4S,v18.s[1] +mla v4.4S, v28.4S, v31.s[0] +sub v28.4s, v0.4s, v4.4s +add v0.4s, v0.4s, v4.4s +sqrdmulh v4.4S, v25.4S, v21.s[1] +mul v25.4S, v25.4S,v10.s[1] +mla v25.4S, v4.4S, v31.s[0] +sub v4.4s, v26.4s, v25.4s +add v26.4s, v26.4s, v25.4s +str q0, [x0, #512] +str q28, [x0, #528] +sqrdmulh v28.4S, v13.4S, v15.s[1] +mul v13.4S, v13.4S,v22.s[1] +mla v13.4S, v28.4S, v31.s[0] +sub v28.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +str q26, [x0, #576] +str q4, [x0, #592] +sqrdmulh v4.4S, v20.4S, v12.s[1] +mul v20.4S, v20.4S,v3.s[1] +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v24.4s, v20.4s +add v24.4s, v24.4s, v20.4s +str q11, [x0, #640] +str q28, [x0, #656] +sqrdmulh v28.4S, v23.4S, v1.s[2] +mul v23.4S, v23.4S,v18.s[2] +mla v23.4S, v28.4S, v31.s[0] +sub v28.4s, v16.4s, v23.4s +add v16.4s, v16.4s, v23.4s +str q24, [x0, #704] +str q4, [x0, #720] +ldr q1, [x17, #+512] +ldr q18, [x17, #+528] +sqrdmulh v4.4S, v29.4S, v21.s[2] +mul v29.4S, v29.4S,v10.s[2] +mla v29.4S, v4.4S, v31.s[0] +sub v4.4s, v2.4s, v29.4s +add v2.4s, v2.4s, v29.4s +ldr q21, [x17, #+544] +ldr q10, [x17, #+560] +sqrdmulh 
v29.4S, v27.4S, v15.s[2] +mul v27.4S, v27.4S,v22.s[2] +mla v27.4S, v29.4S, v31.s[0] +sub v29.4s, v5.4s, v27.4s +add v5.4s, v5.4s, v27.4s +ldr q15, [x17, #+576] +ldr q22, [x17, #+592] +sqrdmulh v27.4S, v6.4S, v12.s[2] +mul v6.4S, v6.4S,v3.s[2] +mla v6.4S, v27.4S, v31.s[0] +sub v27.4s, v30.4s, v6.4s +add v30.4s, v30.4s, v6.4s +ldr q12, [x17, #+608] +ldr q3, [x17, #+624] +str q16, [x0, #544] +str q28, [x0, #560] +str q2, [x0, #608] +str q4, [x0, #624] +str q5, [x0, #672] +str q29, [x0, #688] +str q30, [x0, #736] +str q27, [x0, #752] +ldr q27, [x0, #800] +ldr q30, [x0, #816] +ldr q29, [x0, #768] +ldr q5, [x0, #864] +ldr q4, [x0, #880] +ldr q2, [x0, #832] +ldr q28, [x0, #928] +ldr q16, [x0, #944] +ldr q6, [x0, #896] +ldr q24, [x0, #992] +ldr q23, [x0, #1008] +ldr q11, [x0, #960] +sqrdmulh v20.4S, v27.4S, v18.s[0] +mul v27.4S, v27.4S,v1.s[0] +mla v27.4S, v20.4S, v31.s[0] +sub v20.4s, v29.4s, v27.4s +add v29.4s, v29.4s, v27.4s +ldr q27, [x0, #784] +sqrdmulh v26.4S, v5.4S, v10.s[0] +mul v5.4S, v5.4S,v21.s[0] +mla v5.4S, v26.4S, v31.s[0] +sub v26.4s, v2.4s, v5.4s +add v2.4s, v2.4s, v5.4s +ldr q5, [x0, #848] +sqrdmulh v13.4S, v28.4S, v22.s[0] +mul v28.4S, v28.4S,v15.s[0] +mla v28.4S, v13.4S, v31.s[0] +sub v13.4s, v6.4s, v28.4s +add v6.4s, v6.4s, v28.4s +ldr q28, [x0, #912] +sqrdmulh v0.4S, v24.4S, v3.s[0] +mul v24.4S, v24.4S,v12.s[0] +mla v24.4S, v0.4S, v31.s[0] +sub v0.4s, v11.4s, v24.4s +add v11.4s, v11.4s, v24.4s +ldr q24, [x0, #976] +sqrdmulh v25.4S, v30.4S, v18.s[0] +mul v30.4S, v30.4S,v1.s[0] +mla v30.4S, v25.4S, v31.s[0] +sub v25.4s, v27.4s, v30.4s +add v27.4s, v27.4s, v30.4s +sqrdmulh v30.4S, v4.4S, v10.s[0] +mul v4.4S, v4.4S,v21.s[0] +mla v4.4S, v30.4S, v31.s[0] +sub v30.4s, v5.4s, v4.4s +add v5.4s, v5.4s, v4.4s +sqrdmulh v4.4S, v16.4S, v22.s[0] +mul v16.4S, v16.4S,v15.s[0] +mla v16.4S, v4.4S, v31.s[0] +sub v4.4s, v28.4s, v16.4s +add v28.4s, v28.4s, v16.4s +sqrdmulh v16.4S, v23.4S, v3.s[0] +mul v23.4S, v23.4S,v12.s[0] +mla v23.4S, v16.4S, v31.s[0] +sub v16.4s, 
v24.4s, v23.4s +add v24.4s, v24.4s, v23.4s +sqrdmulh v23.4S, v27.4S, v18.s[1] +mul v27.4S, v27.4S,v1.s[1] +mla v27.4S, v23.4S, v31.s[0] +sub v23.4s, v29.4s, v27.4s +add v29.4s, v29.4s, v27.4s +sqrdmulh v27.4S, v5.4S, v10.s[1] +mul v5.4S, v5.4S,v21.s[1] +mla v5.4S, v27.4S, v31.s[0] +sub v27.4s, v2.4s, v5.4s +add v2.4s, v2.4s, v5.4s +str q29, [x0, #768] +str q23, [x0, #784] +sqrdmulh v23.4S, v28.4S, v22.s[1] +mul v28.4S, v28.4S,v15.s[1] +mla v28.4S, v23.4S, v31.s[0] +sub v23.4s, v6.4s, v28.4s +add v6.4s, v6.4s, v28.4s +str q2, [x0, #832] +str q27, [x0, #848] +sqrdmulh v27.4S, v24.4S, v3.s[1] +mul v24.4S, v24.4S,v12.s[1] +mla v24.4S, v27.4S, v31.s[0] +sub v27.4s, v11.4s, v24.4s +add v11.4s, v11.4s, v24.4s +str q6, [x0, #896] +str q23, [x0, #912] +sqrdmulh v23.4S, v25.4S, v18.s[2] +mul v25.4S, v25.4S,v1.s[2] +mla v25.4S, v23.4S, v31.s[0] +sub v23.4s, v20.4s, v25.4s +add v20.4s, v20.4s, v25.4s +str q11, [x0, #960] +str q27, [x0, #976] +sqrdmulh v18.4S, v30.4S, v10.s[2] +mul v30.4S, v30.4S,v21.s[2] +mla v30.4S, v18.4S, v31.s[0] +sub v18.4s, v26.4s, v30.4s +add v26.4s, v26.4s, v30.4s +sqrdmulh v10.4S, v4.4S, v22.s[2] +mul v4.4S, v4.4S,v15.s[2] +mla v4.4S, v10.4S, v31.s[0] +sub v10.4s, v13.4s, v4.4s +add v13.4s, v13.4s, v4.4s +sqrdmulh v22.4S, v16.4S, v3.s[2] +mul v16.4S, v16.4S,v12.s[2] +mla v16.4S, v22.4S, v31.s[0] +sub v22.4s, v0.4s, v16.4s +add v0.4s, v0.4s, v16.4s +str q20, [x0, #800] +str q23, [x0, #816] +str q26, [x0, #864] +str q18, [x0, #880] +str q13, [x0, #928] +str q10, [x0, #944] +str q0, [x0, #992] +str q22, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff 
--git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_3.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_3.s new file mode 100644 index 0000000..11b1cd1 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_3.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include // NOTE(review): header operand is missing here; ASM_LOAD used below must come from it (presumably hal_env.h) - confirm +modulus: // lane 0 = -33556993, lanes 1-3 = 0; loaded into q31 and consumed as v31.s[0] by the mla steps +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: // Twiddle table in 4-word rows; for the first 8 rows the routine loads them in pairs: first row is the mul operand, second row the sqrdmulh operand (later rows presumably follow the same pairing - confirm) +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Padding (no layer/block) +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Padding (no layer/block) +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Padding (no layer/block) +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Padding (no layer/block) +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Padding (no layer/block) +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Padding (no layer/block) +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Padding (no layer/block) +.word
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Padding (no layer/block) +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Padding (no layer/block) +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Padding (no layer/block) +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Padding (no layer/block) +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Padding (no layer/block) +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Padding (no layer/block) +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Padding (no layer/block) +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Padding (no layer/block) +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Padding (no layer/block) +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Padding (no layer/block) +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Padding (no layer/block) +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Padding (no layer/block) +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Padding (no layer/block) +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19
+.word 0 // Padding (no layer/block) +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Padding (no layer/block) +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Padding (no layer/block) +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Padding (no layer/block) +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Padding (no layer/block) +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Padding (no layer/block) +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Padding (no layer/block) +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Padding (no layer/block) +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Padding (no layer/block) +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Padding (no layer/block) +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Padding (no layer/block) +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Padding (no layer/block) +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Padding (no layer/block) +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Padding (no layer/block) +.text +.global
ntt_u32_incomplete_neon_asm_var_4_2_7_z4_3 +.global _ntt_u32_incomplete_neon_asm_var_4_2_7_z4_3 +ntt_u32_incomplete_neon_asm_var_4_2_7_z4_3: +_ntt_u32_incomplete_neon_asm_var_4_2_7_z4_3: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #928] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #992] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q18, [x0, #800] +sqrdmulh v17.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +ldr q16, [x0, #864] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v22.4S, v21.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +mla v18.4S, v17.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +ldr q3, [x0, #544] +sqrdmulh v17.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q19, [x0, #608] +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q2, [x0, #672] +ldr q1, [x0, #416] +sqrdmulh v0.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v15.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #736] +ldr q14, [x0, #480] +sqrdmulh v13.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v12.4s, v14.4s, v20.4s +add v14.4s, v14.4s, v20.4s +ldr q20, [x0, #288] +mla v3.4S, v17.4S, v31.s[0] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v18.4s +mla v2.4S, v0.4S, v31.s[0] +mla v22.4S, v13.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +ldr q18, [x0, #352] +sqrdmulh v13.4S, v1.4S, 
v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v0.4s, v18.4s, v16.4s +sqrdmulh v17.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +add v18.4s, v18.4s, v16.4s +ldr q16, [x0, #32] +sqrdmulh v11.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v10.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #96] +sqrdmulh v9.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v8.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #160] +mla v1.4S, v13.4S, v31.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v2.4s +mla v20.4S, v11.4S, v31.s[0] +mla v18.4S, v9.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +ldr q2, [x0, #224] +sqrdmulh v9.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v11.4s, v2.4s, v22.4s +sqrdmulh v13.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v7.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +sub v6.4s, v2.4s, v14.4s +add v2.4s, v2.4s, v14.4s +mla v15.4S, v9.4S, v31.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v20.4s +mla v21.4S, v22.4S, v31.s[0] +mla v0.4S, v1.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +sub v1.4s, v3.4s, v18.4s +sqrdmulh v22.4S, v6.4S, v27.s[1] +mul v6.4S, v6.4S,v28.s[1] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v27.s[0] +mul v19.4S, v19.4S,v28.s[0] +sub v9.4s, v17.4s, v15.4s +add v17.4s, v17.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v14.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +mla v7.4S, v20.4S, v31.s[0] +mla v6.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v21.4s +mla v19.4S, v18.4S, v31.s[0] +mla v2.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v15.4s, v8.4s, v0.4s +sqrdmulh v18.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +add v8.4s, v8.4s, v0.4s +sqrdmulh 
v0.4S, v9.4S, v27.s[3] +mul v9.4S, v9.4S,v28.s[3] +sub v20.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[3] +mul v14.4S, v14.4S,v28.s[3] +sub v12.4s, v1.4s, v6.4s +add v1.4s, v1.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v16.4s, v19.4s +mla v9.4S, v0.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v25.s[2] +mul v1.4S, v1.4S,v26.s[2] +sub v7.4s, v3.4s, v2.4s +sqrdmulh v0.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v7.4S, v25.s[1] +mul v7.4S, v7.4S,v26.s[1] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v25.s[0] +mul v3.4S, v3.4S,v26.s[0] +sub v6.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v1.4S, v19.4S, v31.s[0] +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v9.4s +mla v7.4S, v2.4S, v31.s[0] +mla v3.4S, v17.4S, v31.s[0] +add v22.4s, v22.4s, v9.4s +sqrdmulh v9.4S, v8.4S, v23.s[0] +mul v8.4S, v8.4S,v24.s[0] +sub v17.4s, v15.4s, v14.4s +sqrdmulh v2.4S, v6.4S, v23.s[1] +mul v6.4S, v6.4S,v24.s[1] +add v15.4s, v15.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v23.s[2] +mul v15.4S, v15.4S,v24.s[2] +sub v19.4s, v13.4s, v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +sub v11.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +mla v8.4S, v9.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v18.4s, v7.4s +str q13, [x0, #288] +mla v15.4S, v14.4S, v31.s[0] +mla v17.4S, v1.4S, v31.s[0] +add v18.4s, v18.4s, v7.4s +str q19, [x0, #352] +ldr q19, [x0, #944] +sqrdmulh v7.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v1.4s, v16.4s, v3.4s +str q20, [x0, #416] +ldr q20, [x0, #1008] +sqrdmulh v14.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v3.4s +str q11, [x0, #480] +ldr q11, [x0, #816] +sqrdmulh v3.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +sub v13.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +ldr q8, 
[x0, #880] +sqrdmulh v9.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v12.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +mla v19.4S, v7.4S, v31.s[0] +mla v20.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v15.4s +str q18, [x0, #160] +mla v11.4S, v3.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +str q2, [x0, #224] +ldr q2, [x0, #560] +sqrdmulh v15.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v9.4s, v0.4s, v17.4s +str q16, [x0, #32] +ldr q16, [x0, #624] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +add v0.4s, v0.4s, v17.4s +str q1, [x0, #96] +ldr q1, [x0, #688] +ldr q17, [x0, #432] +sqrdmulh v18.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +sub v7.4s, v17.4s, v19.4s +add v17.4s, v17.4s, v19.4s +ldr q19, [x0, #752] +ldr q6, [x0, #496] +sqrdmulh v5.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v4.4s, v6.4s, v20.4s +add v6.4s, v6.4s, v20.4s +ldr q20, [x0, #304] +mla v2.4S, v15.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v11.4s +str q10, [x0, #544] +mla v1.4S, v18.4S, v31.s[0] +mla v19.4S, v5.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q13, [x0, #608] +ldr q13, [x0, #368] +sqrdmulh v11.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v5.4s, v13.4s, v8.4s +str q21, [x0, #672] +sqrdmulh v21.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +add v13.4s, v13.4s, v8.4s +str q12, [x0, #736] +ldr q12, [x0, #48] +sqrdmulh v8.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v18.4s, v12.4s, v2.4s +add v12.4s, v12.4s, v2.4s +ldr q2, [x0, #112] +sqrdmulh v10.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v15.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +ldr q16, [x0, #176] +mla v17.4S, v11.4S, v31.s[0] +mla v6.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v1.4s +str q22, [x0, #800] +mla v20.4S, v8.4S, v31.s[0] +mla v13.4S, v10.4S, v31.s[0] +add v16.4s, v16.4s, v1.4s +str q14, [x0, #864] +ldr q14, [x0, #240] +sqrdmulh v1.4S, v7.4S, v29.s[2] +mul v7.4S, v7.4S,v30.s[2] +sub v10.4s, v14.4s, v19.4s 
+str q0, [x0, #928] +sqrdmulh v0.4S, v4.4S, v29.s[2] +mul v4.4S, v4.4S,v30.s[2] +add v14.4s, v14.4s, v19.4s +str q9, [x0, #992] +sqrdmulh v9.4S, v3.4S, v29.s[2] +mul v3.4S, v3.4S,v30.s[2] +sub v19.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v29.s[2] +mul v5.4S, v5.4S,v30.s[2] +sub v8.4s, v14.4s, v6.4s +add v14.4s, v14.4s, v6.4s +mla v7.4S, v1.4S, v31.s[0] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v20.4s +mla v3.4S, v9.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +sub v17.4s, v2.4s, v13.4s +sqrdmulh v9.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +add v2.4s, v2.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v27.s[0] +mul v16.4S, v16.4S,v28.s[0] +sub v1.4s, v21.4s, v7.4s +add v21.4s, v21.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v6.4s, v10.4s, v4.4s +add v10.4s, v10.4s, v4.4s +mla v19.4S, v20.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v3.4s +mla v16.4S, v13.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +sub v7.4s, v15.4s, v5.4s +sqrdmulh v13.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +add v15.4s, v15.4s, v5.4s +sqrdmulh v5.4S, v1.4S, v27.s[3] +mul v1.4S, v1.4S,v28.s[3] +sub v20.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[3] +mul v6.4S, v6.4S,v28.s[3] +sub v4.4s, v17.4s, v8.4s +add v17.4s, v17.4s, v8.4s +mla v21.4S, v3.4S, v31.s[0] +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v16.4s +mla v1.4S, v5.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v25.s[2] +mul v17.4S, v17.4S,v26.s[2] +sub v19.4s, v2.4s, v14.4s +sqrdmulh v5.4S, v4.4S, v25.s[3] +mul v4.4S, v4.4S,v26.s[3] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v3.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, 
v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +mla v17.4S, v16.4S, v31.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v9.4s, v1.4s +mla v19.4S, v14.4S, v31.s[0] +mla v2.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v23.s[0] +mul v15.4S, v15.4S,v24.s[0] +sub v21.4s, v7.4s, v6.4s +sqrdmulh v14.4S, v8.4S, v23.s[1] +mul v8.4S, v8.4S,v24.s[1] +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v7.4S, v23.s[2] +mul v7.4S, v7.4S,v24.s[2] +sub v16.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v23.s[3] +mul v21.4S, v21.4S,v24.s[3] +sub v10.4s, v20.4s, v4.4s +add v20.4s, v20.4s, v4.4s +mla v15.4S, v1.4S, v31.s[0] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v19.4s +str q0, [x0, #304] +mla v7.4S, v6.4S, v31.s[0] +mla v21.4S, v17.4S, v31.s[0] +add v13.4s, v13.4s, v19.4s +str q16, [x0, #368] +ldr q16, [x0, #896] +sqrdmulh v19.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v17.4s, v12.4s, v2.4s +str q20, [x0, #432] +ldr q20, [x0, #960] +sqrdmulh v6.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v12.4s, v12.4s, v2.4s +str q10, [x0, #496] +ldr q10, [x0, #768] +sqrdmulh v2.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +sub v0.4s, v18.4s, v15.4s +add v18.4s, v18.4s, v15.4s +ldr q15, [x0, #832] +sqrdmulh v1.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +sub v4.4s, v3.4s, v8.4s +add v3.4s, v3.4s, v8.4s +mla v16.4S, v19.4S, v31.s[0] +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v7.4s +str q13, [x0, #176] +mla v10.4S, v2.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v7.4s +str q14, [x0, #240] +ldr q14, [x0, #512] +sqrdmulh v7.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v1.4s, v5.4s, v21.4s +str q12, [x0, #48] +ldr q12, [x0, #576] +sqrdmulh v2.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +add v5.4s, v5.4s, v21.4s +str q17, [x0, #112] +ldr q17, [x0, #640] +ldr q21, [x0, #384] +sqrdmulh v13.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] 
+sub v19.4s, v21.4s, v16.4s +add v21.4s, v21.4s, v16.4s +ldr q16, [x0, #704] +ldr q8, [x0, #448] +sqrdmulh v22.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v11.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +ldr q20, [x0, #256] +mla v14.4S, v7.4S, v31.s[0] +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v10.4s +str q18, [x0, #560] +mla v17.4S, v13.4S, v31.s[0] +mla v16.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +str q0, [x0, #624] +ldr q0, [x0, #320] +sqrdmulh v10.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v22.4s, v0.4s, v15.4s +str q3, [x0, #688] +sqrdmulh v3.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +add v0.4s, v0.4s, v15.4s +str q4, [x0, #752] +ldr q4, [x0, #0] +sqrdmulh v15.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v13.4s, v4.4s, v14.4s +add v4.4s, v4.4s, v14.4s +ldr q14, [x0, #64] +sqrdmulh v18.4S, v0.4S, v29.s[1] +mul v0.4S, v0.4S,v30.s[1] +sub v7.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +ldr q12, [x0, #128] +mla v21.4S, v10.4S, v31.s[0] +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v12.4s, v17.4s +str q9, [x0, #816] +mla v20.4S, v15.4S, v31.s[0] +mla v0.4S, v18.4S, v31.s[0] +add v12.4s, v12.4s, v17.4s +str q6, [x0, #880] +ldr q6, [x0, #192] +sqrdmulh v17.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v18.4s, v6.4s, v16.4s +str q5, [x0, #944] +sqrdmulh v5.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +add v6.4s, v6.4s, v16.4s +str q1, [x0, #1008] +sqrdmulh v1.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v16.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v15.4s, v6.4s, v8.4s +add v6.4s, v6.4s, v8.4s +mla v19.4S, v17.4S, v31.s[0] +mla v11.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v20.4s +mla v2.4S, v1.4S, v31.s[0] +mla v22.4S, v21.4S, v31.s[0] +add v4.4s, v4.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +sub v21.4s, v14.4s, v0.4s +sqrdmulh v1.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +add v14.4s, 
v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v17.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[0] +mul v6.4S, v6.4S,v28.s[0] +sub v8.4s, v18.4s, v11.4s +add v18.4s, v18.4s, v11.4s +mla v16.4S, v20.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v2.4s +mla v12.4S, v0.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v27.s[2] +mul v3.4S, v3.4S,v28.s[2] +sub v19.4s, v7.4s, v22.4s +sqrdmulh v0.4S, v18.4S, v27.s[2] +mul v18.4S, v18.4S,v28.s[2] +add v7.4s, v7.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +sub v20.4s, v5.4s, v16.4s +add v5.4s, v5.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v11.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +mla v3.4S, v2.4S, v31.s[0] +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v4.4s, v12.4s +mla v17.4S, v22.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v4.4s, v4.4s, v12.4s +sqrdmulh v12.4S, v21.4S, v25.s[2] +mul v21.4S, v21.4S,v26.s[2] +sub v16.4s, v14.4s, v6.4s +sqrdmulh v22.4S, v11.4S, v25.s[3] +mul v11.4S, v11.4S,v26.s[3] +add v14.4s, v14.4s, v6.4s +sqrdmulh v6.4S, v16.4S, v25.s[1] +mul v16.4S, v16.4S,v26.s[1] +sub v2.4s, v13.4s, v3.4s +add v13.4s, v13.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v25.s[0] +mul v14.4S, v14.4S,v26.s[0] +sub v15.4s, v7.4s, v18.4s +add v7.4s, v7.4s, v18.4s +mla v21.4S, v12.4S, v31.s[0] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v17.4s +mla v16.4S, v6.4S, v31.s[0] +mla v14.4S, v3.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v7.4S, v23.s[0] +mul v7.4S, v7.4S,v24.s[0] +sub v3.4s, v19.4s, v8.4s +sqrdmulh v6.4S, v15.4S, v23.s[1] +mul v15.4S, v15.4S,v24.s[1] +add v19.4s, v19.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v23.s[2] +mul v19.4S, v19.4S,v24.s[2] +sub v12.4s, v5.4s, v21.4s +add v5.4s, v5.4s, v21.4s +sqrdmulh v21.4S, v3.4S, v23.s[3] +mul v3.4S, v3.4S,v24.s[3] +sub v18.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +mla 
v7.4S, v17.4S, v31.s[0] +mla v15.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v16.4s +str q5, [x0, #256] +mla v19.4S, v8.4S, v31.s[0] +mla v3.4S, v21.4S, v31.s[0] +add v0.4s, v0.4s, v16.4s +str q12, [x0, #320] +ldr q12, [x0, #912] +sqrdmulh v16.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v21.4s, v4.4s, v14.4s +str q20, [x0, #384] +ldr q20, [x0, #976] +sqrdmulh v8.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v4.4s, v4.4s, v14.4s +str q18, [x0, #448] +ldr q18, [x0, #784] +sqrdmulh v14.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +sub v5.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +ldr q7, [x0, #848] +sqrdmulh v17.4S, v7.4S, v29.s[0] +mul v7.4S, v7.4S,v30.s[0] +sub v11.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +mla v12.4S, v16.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v1.4s, v19.4s +str q0, [x0, #128] +mla v18.4S, v14.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v19.4s +str q6, [x0, #192] +ldr q6, [x0, #528] +sqrdmulh v19.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v17.4s, v22.4s, v3.4s +str q4, [x0, #0] +ldr q4, [x0, #592] +sqrdmulh v14.4S, v4.4S, v29.s[0] +mul v4.4S, v4.4S,v30.s[0] +add v22.4s, v22.4s, v3.4s +str q21, [x0, #64] +ldr q21, [x0, #656] +ldr q3, [x0, #400] +sqrdmulh v0.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v16.4s, v3.4s, v12.4s +add v3.4s, v3.4s, v12.4s +ldr q12, [x0, #720] +ldr q15, [x0, #464] +sqrdmulh v9.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v10.4s, v15.4s, v20.4s +add v15.4s, v15.4s, v20.4s +ldr q20, [x0, #272] +mla v6.4S, v19.4S, v31.s[0] +mla v4.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v18.4s +str q13, [x0, #512] +mla v21.4S, v0.4S, v31.s[0] +mla v12.4S, v9.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +str q5, [x0, #576] +ldr q5, [x0, #336] +sqrdmulh v18.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v9.4s, v5.4s, v7.4s +str q2, [x0, #640] +sqrdmulh v2.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +add v5.4s, v5.4s, v7.4s +str q11, [x0, #704] +ldr 
q11, [x0, #16] +sqrdmulh v7.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v0.4s, v11.4s, v6.4s +add v11.4s, v11.4s, v6.4s +ldr q6, [x0, #80] +sqrdmulh v13.4S, v5.4S, v29.s[1] +mul v5.4S, v5.4S,v30.s[1] +sub v19.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +ldr q4, [x0, #144] +mla v3.4S, v18.4S, v31.s[0] +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v4.4s, v21.4s +str q1, [x0, #768] +mla v20.4S, v7.4S, v31.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v4.4s, v4.4s, v21.4s +str q8, [x0, #832] +ldr q8, [x0, #208] +sqrdmulh v21.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +sub v13.4s, v8.4s, v12.4s +str q22, [x0, #896] +sqrdmulh v22.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +add v8.4s, v8.4s, v12.4s +str q17, [x0, #960] +sqrdmulh v17.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v12.4s, v4.4s, v3.4s +add v4.4s, v4.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v7.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +mla v16.4S, v21.4S, v31.s[0] +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v20.4s +mla v14.4S, v17.4S, v31.s[0] +mla v9.4S, v3.4S, v31.s[0] +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v12.4S, v27.s[1] +mul v12.4S, v12.4S,v28.s[1] +sub v3.4s, v6.4s, v5.4s +sqrdmulh v17.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v27.s[0] +mul v4.4S, v4.4S,v28.s[0] +sub v21.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[0] +mul v8.4S, v8.4S,v28.s[0] +sub v15.4s, v13.4s, v10.4s +add v13.4s, v13.4s, v10.4s +mla v12.4S, v20.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v14.4s +mla v4.4S, v5.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v16.4s, v19.4s, v9.4s +sqrdmulh v5.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v22.4s, v12.4s +add v22.4s, v22.4s, 
v12.4s +sqrdmulh v12.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v10.4s, v3.4s, v7.4s +add v3.4s, v3.4s, v7.4s +mla v2.4S, v14.4S, v31.s[0] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v4.4s +mla v21.4S, v9.4S, v31.s[0] +mla v15.4S, v12.4S, v31.s[0] +add v11.4s, v11.4s, v4.4s +sqrdmulh v4.4S, v3.4S, v25.s[2] +mul v3.4S, v3.4S,v26.s[2] +sub v12.4s, v6.4s, v8.4s +sqrdmulh v9.4S, v10.4S, v25.s[3] +mul v10.4S, v10.4S,v26.s[3] +add v6.4s, v6.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +sub v14.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v6.4S, v25.s[0] +mul v6.4S, v6.4S,v26.s[0] +sub v7.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +mla v3.4S, v4.4S, v31.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v21.4s +mla v12.4S, v8.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v23.s[0] +mul v19.4S, v19.4S,v24.s[0] +sub v2.4s, v16.4s, v15.4s +sqrdmulh v8.4S, v7.4S, v23.s[1] +mul v7.4S, v7.4S,v24.s[1] +add v16.4s, v16.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +sub v4.4s, v22.4s, v3.4s +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v2.4S, v23.s[3] +mul v2.4S, v2.4S,v24.s[3] +sub v13.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +mla v19.4S, v21.4S, v31.s[0] +mla v7.4S, v8.4S, v31.s[0] +sub v8.4s, v5.4s, v12.4s +str q22, [x0, #272] +mla v16.4S, v15.4S, v31.s[0] +mla v2.4S, v3.4S, v31.s[0] +add v5.4s, v5.4s, v12.4s +str q4, [x0, #336] +sub v23.4s, v11.4s, v6.4s +str q20, [x0, #400] +add v11.4s, v11.4s, v6.4s +str q13, [x0, #464] +sub v13.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sub v19.4s, v14.4s, v7.4s +add v14.4s, v14.4s, v7.4s +sub v7.4s, v17.4s, v16.4s +str q5, [x0, #144] +add v17.4s, v17.4s, v16.4s +str q8, [x0, #208] +sub v8.4s, v9.4s, v2.4s +str q11, [x0, #16] +add v9.4s, v9.4s, v2.4s +str q23, [x0, #80] +str q0, [x0, #528] +str q13, [x0, #592] +str q14, [x0, #656] +str q19, [x0, #720] +str q17, [x0, #784] +str q7, [x0, #848] 
+str q9, [x0, #912] +str q8, [x0, #976] +ldr q18, [x17, #+128] +ldr q1, [x17, #+144] +ldr q10, [x17, #+160] +ldr q21, [x17, #+176] +ldr q22, [x17, #+192] +ldr q15, [x17, #+208] +ldr q3, [x17, #+224] +ldr q12, [x17, #+240] +ldr q4, [x0, #32] +ldr q30, [x0, #48] +ldr q29, [x0, #0] +ldr q28, [x0, #96] +ldr q27, [x0, #112] +ldr q26, [x0, #64] +ldr q25, [x0, #160] +ldr q24, [x0, #176] +ldr q20, [x0, #128] +ldr q6, [x0, #224] +ldr q5, [x0, #240] +ldr q16, [x0, #192] +sqrdmulh v11.4S, v4.4S, v1.s[0] +sqrdmulh v2.4S, v28.4S, v21.s[0] +sqrdmulh v23.4S, v25.4S, v15.s[0] +sqrdmulh v0.4S, v6.4S, v12.s[0] +mul v4.4S, v4.4S,v18.s[0] +mul v28.4S, v28.4S,v10.s[0] +mul v25.4S, v25.4S,v22.s[0] +mul v6.4S, v6.4S,v3.s[0] +mla v4.4S, v11.4S, v31.s[0] +mla v28.4S, v2.4S, v31.s[0] +mla v25.4S, v23.4S, v31.s[0] +mla v6.4S, v0.4S, v31.s[0] +sub v0.4s, v29.4s, v4.4s +sub v23.4s, v26.4s, v28.4s +sub v2.4s, v20.4s, v25.4s +sub v11.4s, v16.4s, v6.4s +add v29.4s, v29.4s, v4.4s +add v26.4s, v26.4s, v28.4s +add v20.4s, v20.4s, v25.4s +add v16.4s, v16.4s, v6.4s +ldr q6, [x0, #16] +ldr q25, [x0, #80] +ldr q28, [x0, #144] +ldr q4, [x0, #208] +sqrdmulh v13.4S, v30.4S, v1.s[0] +sqrdmulh v14.4S, v27.4S, v21.s[0] +sqrdmulh v19.4S, v24.4S, v15.s[0] +sqrdmulh v17.4S, v5.4S, v12.s[0] +mul v30.4S, v30.4S,v18.s[0] +mul v27.4S, v27.4S,v10.s[0] +mul v24.4S, v24.4S,v22.s[0] +mul v5.4S, v5.4S,v3.s[0] +mla v30.4S, v13.4S, v31.s[0] +mla v27.4S, v14.4S, v31.s[0] +mla v24.4S, v19.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +sub v17.4s, v6.4s, v30.4s +sub v19.4s, v25.4s, v27.4s +sub v14.4s, v28.4s, v24.4s +sub v13.4s, v4.4s, v5.4s +add v6.4s, v6.4s, v30.4s +add v25.4s, v25.4s, v27.4s +add v28.4s, v28.4s, v24.4s +add v4.4s, v4.4s, v5.4s +sqrdmulh v5.4S, v6.4S, v1.s[1] +sqrdmulh v24.4S, v25.4S, v21.s[1] +sqrdmulh v27.4S, v28.4S, v15.s[1] +sqrdmulh v30.4S, v4.4S, v12.s[1] +mul v6.4S, v6.4S,v18.s[1] +mul v25.4S, v25.4S,v10.s[1] +mul v28.4S, v28.4S,v22.s[1] +mul v4.4S, v4.4S,v3.s[1] +mla v6.4S, v5.4S, v31.s[0] +mla v25.4S, 
v24.4S, v31.s[0] +mla v28.4S, v27.4S, v31.s[0] +mla v4.4S, v30.4S, v31.s[0] +sub v30.4s, v29.4s, v6.4s +sub v27.4s, v26.4s, v25.4s +sub v24.4s, v20.4s, v28.4s +sub v5.4s, v16.4s, v4.4s +add v29.4s, v29.4s, v6.4s +add v26.4s, v26.4s, v25.4s +add v20.4s, v20.4s, v28.4s +add v16.4s, v16.4s, v4.4s +sqrdmulh v4.4S, v17.4S, v1.s[2] +sqrdmulh v28.4S, v19.4S, v21.s[2] +sqrdmulh v25.4S, v14.4S, v15.s[2] +sqrdmulh v6.4S, v13.4S, v12.s[2] +str q29, [x0, #0] +str q30, [x0, #16] +mul v17.4S, v17.4S,v18.s[2] +mul v19.4S, v19.4S,v10.s[2] +mul v14.4S, v14.4S,v22.s[2] +mul v13.4S, v13.4S,v3.s[2] +str q26, [x0, #64] +str q27, [x0, #80] +ldr q12, [x17, #+256] +ldr q3, [x17, #+272] +ldr q15, [x17, #+288] +ldr q22, [x17, #+304] +mla v17.4S, v4.4S, v31.s[0] +mla v19.4S, v28.4S, v31.s[0] +mla v14.4S, v25.4S, v31.s[0] +mla v13.4S, v6.4S, v31.s[0] +str q20, [x0, #128] +str q24, [x0, #144] +ldr q24, [x17, #+320] +ldr q20, [x17, #+336] +sub v6.4s, v0.4s, v17.4s +sub v25.4s, v23.4s, v19.4s +sub v28.4s, v2.4s, v14.4s +sub v4.4s, v11.4s, v13.4s +str q16, [x0, #192] +str q5, [x0, #208] +ldr q5, [x17, #+352] +ldr q16, [x17, #+368] +add v0.4s, v0.4s, v17.4s +add v23.4s, v23.4s, v19.4s +add v2.4s, v2.4s, v14.4s +add v11.4s, v11.4s, v13.4s +str q0, [x0, #32] +str q23, [x0, #96] +str q2, [x0, #160] +str q11, [x0, #224] +ldr q11, [x0, #288] +ldr q2, [x0, #304] +ldr q23, [x0, #256] +ldr q0, [x0, #352] +ldr q13, [x0, #368] +ldr q14, [x0, #320] +ldr q19, [x0, #416] +ldr q17, [x0, #432] +ldr q21, [x0, #384] +ldr q10, [x0, #480] +ldr q1, [x0, #496] +ldr q18, [x0, #448] +sqrdmulh v27.4S, v11.4S, v3.s[0] +sqrdmulh v26.4S, v0.4S, v22.s[0] +sqrdmulh v30.4S, v19.4S, v20.s[0] +sqrdmulh v29.4S, v10.4S, v16.s[0] +str q6, [x0, #48] +mul v11.4S, v11.4S,v12.s[0] +mul v0.4S, v0.4S,v15.s[0] +mul v19.4S, v19.4S,v24.s[0] +mul v10.4S, v10.4S,v5.s[0] +str q25, [x0, #112] +mla v11.4S, v27.4S, v31.s[0] +mla v0.4S, v26.4S, v31.s[0] +mla v19.4S, v30.4S, v31.s[0] +mla v10.4S, v29.4S, v31.s[0] +str q28, [x0, #176] +sub v28.4s, 
v23.4s, v11.4s +sub v29.4s, v14.4s, v0.4s +sub v30.4s, v21.4s, v19.4s +sub v26.4s, v18.4s, v10.4s +str q4, [x0, #240] +add v23.4s, v23.4s, v11.4s +add v14.4s, v14.4s, v0.4s +add v21.4s, v21.4s, v19.4s +add v18.4s, v18.4s, v10.4s +ldr q10, [x0, #272] +ldr q19, [x0, #336] +ldr q0, [x0, #400] +ldr q11, [x0, #464] +sqrdmulh v4.4S, v2.4S, v3.s[0] +sqrdmulh v27.4S, v13.4S, v22.s[0] +sqrdmulh v25.4S, v17.4S, v20.s[0] +sqrdmulh v6.4S, v1.4S, v16.s[0] +mul v2.4S, v2.4S,v12.s[0] +mul v13.4S, v13.4S,v15.s[0] +mul v17.4S, v17.4S,v24.s[0] +mul v1.4S, v1.4S,v5.s[0] +mla v2.4S, v4.4S, v31.s[0] +mla v13.4S, v27.4S, v31.s[0] +mla v17.4S, v25.4S, v31.s[0] +mla v1.4S, v6.4S, v31.s[0] +sub v6.4s, v10.4s, v2.4s +sub v25.4s, v19.4s, v13.4s +sub v27.4s, v0.4s, v17.4s +sub v4.4s, v11.4s, v1.4s +add v10.4s, v10.4s, v2.4s +add v19.4s, v19.4s, v13.4s +add v0.4s, v0.4s, v17.4s +add v11.4s, v11.4s, v1.4s +sqrdmulh v1.4S, v10.4S, v3.s[1] +sqrdmulh v17.4S, v19.4S, v22.s[1] +sqrdmulh v13.4S, v0.4S, v20.s[1] +sqrdmulh v2.4S, v11.4S, v16.s[1] +mul v10.4S, v10.4S,v12.s[1] +mul v19.4S, v19.4S,v15.s[1] +mul v0.4S, v0.4S,v24.s[1] +mul v11.4S, v11.4S,v5.s[1] +mla v10.4S, v1.4S, v31.s[0] +mla v19.4S, v17.4S, v31.s[0] +mla v0.4S, v13.4S, v31.s[0] +mla v11.4S, v2.4S, v31.s[0] +sub v2.4s, v23.4s, v10.4s +sub v13.4s, v14.4s, v19.4s +sub v17.4s, v21.4s, v0.4s +sub v1.4s, v18.4s, v11.4s +add v23.4s, v23.4s, v10.4s +add v14.4s, v14.4s, v19.4s +add v21.4s, v21.4s, v0.4s +add v18.4s, v18.4s, v11.4s +sqrdmulh v11.4S, v6.4S, v3.s[2] +sqrdmulh v0.4S, v25.4S, v22.s[2] +sqrdmulh v19.4S, v27.4S, v20.s[2] +sqrdmulh v10.4S, v4.4S, v16.s[2] +str q23, [x0, #256] +str q2, [x0, #272] +mul v6.4S, v6.4S,v12.s[2] +mul v25.4S, v25.4S,v15.s[2] +mul v27.4S, v27.4S,v24.s[2] +mul v4.4S, v4.4S,v5.s[2] +str q14, [x0, #320] +str q13, [x0, #336] +ldr q16, [x17, #+384] +ldr q5, [x17, #+400] +ldr q20, [x17, #+416] +ldr q24, [x17, #+432] +mla v6.4S, v11.4S, v31.s[0] +mla v25.4S, v0.4S, v31.s[0] +mla v27.4S, v19.4S, v31.s[0] +mla v4.4S, 
v10.4S, v31.s[0] +str q21, [x0, #384] +str q17, [x0, #400] +ldr q17, [x17, #+448] +ldr q21, [x17, #+464] +sub v10.4s, v28.4s, v6.4s +sub v19.4s, v29.4s, v25.4s +sub v0.4s, v30.4s, v27.4s +sub v11.4s, v26.4s, v4.4s +str q18, [x0, #448] +str q1, [x0, #464] +ldr q1, [x17, #+480] +ldr q18, [x17, #+496] +add v28.4s, v28.4s, v6.4s +add v29.4s, v29.4s, v25.4s +add v30.4s, v30.4s, v27.4s +add v26.4s, v26.4s, v4.4s +str q28, [x0, #288] +str q29, [x0, #352] +str q30, [x0, #416] +str q26, [x0, #480] +ldr q26, [x0, #544] +ldr q30, [x0, #560] +ldr q29, [x0, #512] +ldr q28, [x0, #608] +ldr q4, [x0, #624] +ldr q27, [x0, #576] +ldr q25, [x0, #672] +ldr q6, [x0, #688] +ldr q22, [x0, #640] +ldr q15, [x0, #736] +ldr q3, [x0, #752] +ldr q12, [x0, #704] +sqrdmulh v13.4S, v26.4S, v5.s[0] +sqrdmulh v14.4S, v28.4S, v24.s[0] +sqrdmulh v2.4S, v25.4S, v21.s[0] +sqrdmulh v23.4S, v15.4S, v18.s[0] +str q10, [x0, #304] +mul v26.4S, v26.4S,v16.s[0] +mul v28.4S, v28.4S,v20.s[0] +mul v25.4S, v25.4S,v17.s[0] +mul v15.4S, v15.4S,v1.s[0] +str q19, [x0, #368] +mla v26.4S, v13.4S, v31.s[0] +mla v28.4S, v14.4S, v31.s[0] +mla v25.4S, v2.4S, v31.s[0] +mla v15.4S, v23.4S, v31.s[0] +str q0, [x0, #432] +sub v0.4s, v29.4s, v26.4s +sub v23.4s, v27.4s, v28.4s +sub v2.4s, v22.4s, v25.4s +sub v14.4s, v12.4s, v15.4s +str q11, [x0, #496] +add v29.4s, v29.4s, v26.4s +add v27.4s, v27.4s, v28.4s +add v22.4s, v22.4s, v25.4s +add v12.4s, v12.4s, v15.4s +ldr q15, [x0, #528] +ldr q25, [x0, #592] +ldr q28, [x0, #656] +ldr q26, [x0, #720] +sqrdmulh v11.4S, v30.4S, v5.s[0] +sqrdmulh v13.4S, v4.4S, v24.s[0] +sqrdmulh v19.4S, v6.4S, v21.s[0] +sqrdmulh v10.4S, v3.4S, v18.s[0] +mul v30.4S, v30.4S,v16.s[0] +mul v4.4S, v4.4S,v20.s[0] +mul v6.4S, v6.4S,v17.s[0] +mul v3.4S, v3.4S,v1.s[0] +mla v30.4S, v11.4S, v31.s[0] +mla v4.4S, v13.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +mla v3.4S, v10.4S, v31.s[0] +sub v10.4s, v15.4s, v30.4s +sub v19.4s, v25.4s, v4.4s +sub v13.4s, v28.4s, v6.4s +sub v11.4s, v26.4s, v3.4s +add v15.4s, v15.4s, 
v30.4s +add v25.4s, v25.4s, v4.4s +add v28.4s, v28.4s, v6.4s +add v26.4s, v26.4s, v3.4s +sqrdmulh v3.4S, v15.4S, v5.s[1] +sqrdmulh v6.4S, v25.4S, v24.s[1] +sqrdmulh v4.4S, v28.4S, v21.s[1] +sqrdmulh v30.4S, v26.4S, v18.s[1] +mul v15.4S, v15.4S,v16.s[1] +mul v25.4S, v25.4S,v20.s[1] +mul v28.4S, v28.4S,v17.s[1] +mul v26.4S, v26.4S,v1.s[1] +mla v15.4S, v3.4S, v31.s[0] +mla v25.4S, v6.4S, v31.s[0] +mla v28.4S, v4.4S, v31.s[0] +mla v26.4S, v30.4S, v31.s[0] +sub v30.4s, v29.4s, v15.4s +sub v4.4s, v27.4s, v25.4s +sub v6.4s, v22.4s, v28.4s +sub v3.4s, v12.4s, v26.4s +add v29.4s, v29.4s, v15.4s +add v27.4s, v27.4s, v25.4s +add v22.4s, v22.4s, v28.4s +add v12.4s, v12.4s, v26.4s +sqrdmulh v26.4S, v10.4S, v5.s[2] +sqrdmulh v28.4S, v19.4S, v24.s[2] +sqrdmulh v25.4S, v13.4S, v21.s[2] +sqrdmulh v15.4S, v11.4S, v18.s[2] +str q29, [x0, #512] +str q30, [x0, #528] +mul v10.4S, v10.4S,v16.s[2] +mul v19.4S, v19.4S,v20.s[2] +mul v13.4S, v13.4S,v17.s[2] +mul v11.4S, v11.4S,v1.s[2] +str q27, [x0, #576] +str q4, [x0, #592] +ldr q18, [x17, #+512] +ldr q1, [x17, #+528] +ldr q21, [x17, #+544] +ldr q17, [x17, #+560] +mla v10.4S, v26.4S, v31.s[0] +mla v19.4S, v28.4S, v31.s[0] +mla v13.4S, v25.4S, v31.s[0] +mla v11.4S, v15.4S, v31.s[0] +str q22, [x0, #640] +str q6, [x0, #656] +ldr q6, [x17, #+576] +ldr q22, [x17, #+592] +sub v15.4s, v0.4s, v10.4s +sub v25.4s, v23.4s, v19.4s +sub v28.4s, v2.4s, v13.4s +sub v26.4s, v14.4s, v11.4s +str q12, [x0, #704] +str q3, [x0, #720] +ldr q3, [x17, #+608] +ldr q12, [x17, #+624] +add v0.4s, v0.4s, v10.4s +add v23.4s, v23.4s, v19.4s +add v2.4s, v2.4s, v13.4s +add v14.4s, v14.4s, v11.4s +str q0, [x0, #544] +str q23, [x0, #608] +str q2, [x0, #672] +str q14, [x0, #736] +ldr q14, [x0, #800] +ldr q2, [x0, #816] +ldr q23, [x0, #768] +ldr q0, [x0, #864] +ldr q11, [x0, #880] +ldr q13, [x0, #832] +ldr q19, [x0, #928] +ldr q10, [x0, #944] +ldr q24, [x0, #896] +ldr q20, [x0, #992] +ldr q5, [x0, #1008] +ldr q16, [x0, #960] +sqrdmulh v4.4S, v14.4S, v1.s[0] +sqrdmulh v27.4S, 
v0.4S, v17.s[0] +sqrdmulh v30.4S, v19.4S, v22.s[0] +sqrdmulh v29.4S, v20.4S, v12.s[0] +str q15, [x0, #560] +mul v14.4S, v14.4S,v18.s[0] +mul v0.4S, v0.4S,v21.s[0] +mul v19.4S, v19.4S,v6.s[0] +mul v20.4S, v20.4S,v3.s[0] +str q25, [x0, #624] +mla v14.4S, v4.4S, v31.s[0] +mla v0.4S, v27.4S, v31.s[0] +mla v19.4S, v30.4S, v31.s[0] +mla v20.4S, v29.4S, v31.s[0] +str q28, [x0, #688] +sub v28.4s, v23.4s, v14.4s +sub v29.4s, v13.4s, v0.4s +sub v30.4s, v24.4s, v19.4s +sub v27.4s, v16.4s, v20.4s +str q26, [x0, #752] +add v23.4s, v23.4s, v14.4s +add v13.4s, v13.4s, v0.4s +add v24.4s, v24.4s, v19.4s +add v16.4s, v16.4s, v20.4s +ldr q20, [x0, #784] +ldr q19, [x0, #848] +ldr q0, [x0, #912] +ldr q14, [x0, #976] +sqrdmulh v26.4S, v2.4S, v1.s[0] +sqrdmulh v4.4S, v11.4S, v17.s[0] +sqrdmulh v25.4S, v10.4S, v22.s[0] +sqrdmulh v15.4S, v5.4S, v12.s[0] +mul v2.4S, v2.4S,v18.s[0] +mul v11.4S, v11.4S,v21.s[0] +mul v10.4S, v10.4S,v6.s[0] +mul v5.4S, v5.4S,v3.s[0] +mla v2.4S, v26.4S, v31.s[0] +mla v11.4S, v4.4S, v31.s[0] +mla v10.4S, v25.4S, v31.s[0] +mla v5.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v2.4s +sub v25.4s, v19.4s, v11.4s +sub v4.4s, v0.4s, v10.4s +sub v26.4s, v14.4s, v5.4s +add v20.4s, v20.4s, v2.4s +add v19.4s, v19.4s, v11.4s +add v0.4s, v0.4s, v10.4s +add v14.4s, v14.4s, v5.4s +sqrdmulh v5.4S, v20.4S, v1.s[1] +sqrdmulh v10.4S, v19.4S, v17.s[1] +sqrdmulh v11.4S, v0.4S, v22.s[1] +sqrdmulh v2.4S, v14.4S, v12.s[1] +mul v20.4S, v20.4S,v18.s[1] +mul v19.4S, v19.4S,v21.s[1] +mul v0.4S, v0.4S,v6.s[1] +mul v14.4S, v14.4S,v3.s[1] +mla v20.4S, v5.4S, v31.s[0] +mla v19.4S, v10.4S, v31.s[0] +mla v0.4S, v11.4S, v31.s[0] +mla v14.4S, v2.4S, v31.s[0] +sub v2.4s, v23.4s, v20.4s +sub v11.4s, v13.4s, v19.4s +sub v10.4s, v24.4s, v0.4s +sub v5.4s, v16.4s, v14.4s +add v23.4s, v23.4s, v20.4s +add v13.4s, v13.4s, v19.4s +add v24.4s, v24.4s, v0.4s +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v1.s[2] +sqrdmulh v0.4S, v25.4S, v17.s[2] +sqrdmulh v19.4S, v4.4S, v22.s[2] +sqrdmulh v20.4S, v26.4S, 
v12.s[2] +str q23, [x0, #768] +str q2, [x0, #784] +mul v15.4S, v15.4S,v18.s[2] +mul v25.4S, v25.4S,v21.s[2] +mul v4.4S, v4.4S,v6.s[2] +mul v26.4S, v26.4S,v3.s[2] +str q13, [x0, #832] +str q11, [x0, #848] +mla v15.4S, v14.4S, v31.s[0] +mla v25.4S, v0.4S, v31.s[0] +mla v4.4S, v19.4S, v31.s[0] +mla v26.4S, v20.4S, v31.s[0] +str q24, [x0, #896] +str q10, [x0, #912] +sub v10.4s, v28.4s, v15.4s +sub v24.4s, v29.4s, v25.4s +sub v20.4s, v30.4s, v4.4s +sub v19.4s, v27.4s, v26.4s +str q16, [x0, #960] +str q5, [x0, #976] +add v28.4s, v28.4s, v15.4s +add v29.4s, v29.4s, v25.4s +add v30.4s, v30.4s, v4.4s +add v27.4s, v27.4s, v26.4s +str q28, [x0, #800] +str q29, [x0, #864] +str q30, [x0, #928] +str q27, [x0, #992] +str q10, [x0, #816] +str q24, [x0, #880] +str q20, [x0, #944] +str q19, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_4.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_4.s new file mode 100644 index 0000000..dbd61ed --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_4.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to 
permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 
634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 
2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 
// Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global ntt_u32_incomplete_neon_asm_var_4_2_7_z4_4 +.global _ntt_u32_incomplete_neon_asm_var_4_2_7_z4_4 +ntt_u32_incomplete_neon_asm_var_4_2_7_z4_4: +_ntt_u32_incomplete_neon_asm_var_4_2_7_z4_4: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #928] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #992] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q18, [x0, #800] +sqrdmulh v17.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +ldr q16, [x0, #864] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v22.4S, v21.4S, v31.s[0] +mla 
v20.4S, v19.4S, v31.s[0] +mla v18.4S, v17.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +ldr q3, [x0, #544] +sqrdmulh v17.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q19, [x0, #608] +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q2, [x0, #672] +ldr q1, [x0, #416] +sqrdmulh v0.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v15.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #736] +ldr q14, [x0, #480] +sqrdmulh v13.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v12.4s, v14.4s, v20.4s +add v14.4s, v14.4s, v20.4s +ldr q20, [x0, #288] +mla v3.4S, v17.4S, v31.s[0] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v18.4s +mla v2.4S, v0.4S, v31.s[0] +mla v22.4S, v13.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +ldr q18, [x0, #352] +sqrdmulh v13.4S, v1.4S, v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v0.4s, v18.4s, v16.4s +sqrdmulh v17.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +add v18.4s, v18.4s, v16.4s +ldr q16, [x0, #32] +sqrdmulh v11.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v10.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #96] +sqrdmulh v9.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v8.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #160] +mla v1.4S, v13.4S, v31.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v2.4s +mla v20.4S, v11.4S, v31.s[0] +mla v18.4S, v9.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +ldr q2, [x0, #224] +sqrdmulh v9.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v11.4s, v2.4s, v22.4s +sqrdmulh v13.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v7.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +sub v6.4s, v2.4s, v14.4s +add v2.4s, v2.4s, v14.4s +mla v15.4S, v9.4S, v31.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v20.4s +mla v21.4S, v22.4S, v31.s[0] +mla v0.4S, v1.4S, v31.s[0] 
+add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +sub v1.4s, v3.4s, v18.4s +sqrdmulh v22.4S, v6.4S, v27.s[1] +mul v6.4S, v6.4S,v28.s[1] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v27.s[0] +mul v19.4S, v19.4S,v28.s[0] +sub v9.4s, v17.4s, v15.4s +add v17.4s, v17.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v14.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +mla v7.4S, v20.4S, v31.s[0] +mla v6.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v21.4s +mla v19.4S, v18.4S, v31.s[0] +mla v2.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v15.4s, v8.4s, v0.4s +sqrdmulh v18.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +add v8.4s, v8.4s, v0.4s +sqrdmulh v0.4S, v9.4S, v27.s[3] +mul v9.4S, v9.4S,v28.s[3] +sub v20.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[3] +mul v14.4S, v14.4S,v28.s[3] +sub v12.4s, v1.4s, v6.4s +add v1.4s, v1.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v16.4s, v19.4s +mla v9.4S, v0.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v25.s[2] +mul v1.4S, v1.4S,v26.s[2] +sub v7.4s, v3.4s, v2.4s +sqrdmulh v0.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v7.4S, v25.s[1] +mul v7.4S, v7.4S,v26.s[1] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v25.s[0] +mul v3.4S, v3.4S,v26.s[0] +sub v6.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v1.4S, v19.4S, v31.s[0] +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v9.4s +mla v7.4S, v2.4S, v31.s[0] +mla v3.4S, v17.4S, v31.s[0] +add v22.4s, v22.4s, v9.4s +sqrdmulh v9.4S, v8.4S, v23.s[0] +mul v8.4S, v8.4S,v24.s[0] +sub v17.4s, v15.4s, v14.4s +sqrdmulh v2.4S, v6.4S, v23.s[1] +mul v6.4S, v6.4S,v24.s[1] +add v15.4s, v15.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v23.s[2] +mul v15.4S, 
v15.4S,v24.s[2] +sub v19.4s, v13.4s, v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +sub v11.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +mla v8.4S, v9.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v18.4s, v7.4s +str q13, [x0, #288] +mla v15.4S, v14.4S, v31.s[0] +mla v17.4S, v1.4S, v31.s[0] +add v18.4s, v18.4s, v7.4s +str q19, [x0, #352] +ldr q19, [x0, #944] +sqrdmulh v7.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v1.4s, v16.4s, v3.4s +str q20, [x0, #416] +ldr q20, [x0, #1008] +sqrdmulh v14.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v3.4s +str q11, [x0, #480] +ldr q11, [x0, #816] +sqrdmulh v3.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +sub v13.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +ldr q8, [x0, #880] +sqrdmulh v9.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v12.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +mla v19.4S, v7.4S, v31.s[0] +mla v20.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v15.4s +str q18, [x0, #160] +mla v11.4S, v3.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +str q2, [x0, #224] +ldr q2, [x0, #560] +sqrdmulh v15.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v9.4s, v0.4s, v17.4s +str q16, [x0, #32] +ldr q16, [x0, #624] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +add v0.4s, v0.4s, v17.4s +str q1, [x0, #96] +ldr q1, [x0, #688] +ldr q17, [x0, #432] +sqrdmulh v18.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +sub v7.4s, v17.4s, v19.4s +add v17.4s, v17.4s, v19.4s +ldr q19, [x0, #752] +ldr q6, [x0, #496] +sqrdmulh v5.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v4.4s, v6.4s, v20.4s +add v6.4s, v6.4s, v20.4s +ldr q20, [x0, #304] +mla v2.4S, v15.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v11.4s +str q10, [x0, #544] +mla v1.4S, v18.4S, v31.s[0] +mla v19.4S, v5.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q13, [x0, #608] +ldr q13, [x0, #368] +sqrdmulh v11.4S, v17.4S, 
v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v5.4s, v13.4s, v8.4s +str q21, [x0, #672] +sqrdmulh v21.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +add v13.4s, v13.4s, v8.4s +str q12, [x0, #736] +ldr q12, [x0, #48] +sqrdmulh v8.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v18.4s, v12.4s, v2.4s +add v12.4s, v12.4s, v2.4s +ldr q2, [x0, #112] +sqrdmulh v10.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v15.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +ldr q16, [x0, #176] +mla v17.4S, v11.4S, v31.s[0] +mla v6.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v1.4s +str q22, [x0, #800] +mla v20.4S, v8.4S, v31.s[0] +mla v13.4S, v10.4S, v31.s[0] +add v16.4s, v16.4s, v1.4s +str q14, [x0, #864] +ldr q14, [x0, #240] +sqrdmulh v1.4S, v7.4S, v29.s[2] +mul v7.4S, v7.4S,v30.s[2] +sub v10.4s, v14.4s, v19.4s +str q0, [x0, #928] +sqrdmulh v0.4S, v4.4S, v29.s[2] +mul v4.4S, v4.4S,v30.s[2] +add v14.4s, v14.4s, v19.4s +str q9, [x0, #992] +sqrdmulh v9.4S, v3.4S, v29.s[2] +mul v3.4S, v3.4S,v30.s[2] +sub v19.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v29.s[2] +mul v5.4S, v5.4S,v30.s[2] +sub v8.4s, v14.4s, v6.4s +add v14.4s, v14.4s, v6.4s +mla v7.4S, v1.4S, v31.s[0] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v20.4s +mla v3.4S, v9.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +sub v17.4s, v2.4s, v13.4s +sqrdmulh v9.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +add v2.4s, v2.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v27.s[0] +mul v16.4S, v16.4S,v28.s[0] +sub v1.4s, v21.4s, v7.4s +add v21.4s, v21.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v6.4s, v10.4s, v4.4s +add v10.4s, v10.4s, v4.4s +mla v19.4S, v20.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v3.4s +mla v16.4S, v13.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +sub v7.4s, 
v15.4s, v5.4s +sqrdmulh v13.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +add v15.4s, v15.4s, v5.4s +sqrdmulh v5.4S, v1.4S, v27.s[3] +mul v1.4S, v1.4S,v28.s[3] +sub v20.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[3] +mul v6.4S, v6.4S,v28.s[3] +sub v4.4s, v17.4s, v8.4s +add v17.4s, v17.4s, v8.4s +mla v21.4S, v3.4S, v31.s[0] +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v16.4s +mla v1.4S, v5.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v25.s[2] +mul v17.4S, v17.4S,v26.s[2] +sub v19.4s, v2.4s, v14.4s +sqrdmulh v5.4S, v4.4S, v25.s[3] +mul v4.4S, v4.4S,v26.s[3] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v3.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +mla v17.4S, v16.4S, v31.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v9.4s, v1.4s +mla v19.4S, v14.4S, v31.s[0] +mla v2.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v23.s[0] +mul v15.4S, v15.4S,v24.s[0] +sub v21.4s, v7.4s, v6.4s +sqrdmulh v14.4S, v8.4S, v23.s[1] +mul v8.4S, v8.4S,v24.s[1] +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v7.4S, v23.s[2] +mul v7.4S, v7.4S,v24.s[2] +sub v16.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v23.s[3] +mul v21.4S, v21.4S,v24.s[3] +sub v10.4s, v20.4s, v4.4s +add v20.4s, v20.4s, v4.4s +mla v15.4S, v1.4S, v31.s[0] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v19.4s +str q0, [x0, #304] +mla v7.4S, v6.4S, v31.s[0] +mla v21.4S, v17.4S, v31.s[0] +add v13.4s, v13.4s, v19.4s +str q16, [x0, #368] +ldr q16, [x0, #896] +sqrdmulh v19.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v17.4s, v12.4s, v2.4s +str q20, [x0, #432] +ldr q20, [x0, #960] +sqrdmulh v6.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v12.4s, v12.4s, v2.4s +str q10, [x0, #496] +ldr q10, [x0, #768] +sqrdmulh 
v2.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +sub v0.4s, v18.4s, v15.4s +add v18.4s, v18.4s, v15.4s +ldr q15, [x0, #832] +sqrdmulh v1.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +sub v4.4s, v3.4s, v8.4s +add v3.4s, v3.4s, v8.4s +mla v16.4S, v19.4S, v31.s[0] +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v7.4s +str q13, [x0, #176] +mla v10.4S, v2.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v7.4s +str q14, [x0, #240] +ldr q14, [x0, #512] +sqrdmulh v7.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v1.4s, v5.4s, v21.4s +str q12, [x0, #48] +ldr q12, [x0, #576] +sqrdmulh v2.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +add v5.4s, v5.4s, v21.4s +str q17, [x0, #112] +ldr q17, [x0, #640] +ldr q21, [x0, #384] +sqrdmulh v13.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] +sub v19.4s, v21.4s, v16.4s +add v21.4s, v21.4s, v16.4s +ldr q16, [x0, #704] +ldr q8, [x0, #448] +sqrdmulh v22.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v11.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +ldr q20, [x0, #256] +mla v14.4S, v7.4S, v31.s[0] +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v10.4s +str q18, [x0, #560] +mla v17.4S, v13.4S, v31.s[0] +mla v16.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +str q0, [x0, #624] +ldr q0, [x0, #320] +sqrdmulh v10.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v22.4s, v0.4s, v15.4s +str q3, [x0, #688] +sqrdmulh v3.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +add v0.4s, v0.4s, v15.4s +str q4, [x0, #752] +ldr q4, [x0, #0] +sqrdmulh v15.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v13.4s, v4.4s, v14.4s +add v4.4s, v4.4s, v14.4s +ldr q14, [x0, #64] +sqrdmulh v18.4S, v0.4S, v29.s[1] +mul v0.4S, v0.4S,v30.s[1] +sub v7.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +ldr q12, [x0, #128] +mla v21.4S, v10.4S, v31.s[0] +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v12.4s, v17.4s +str q9, [x0, #816] +mla v20.4S, v15.4S, v31.s[0] +mla v0.4S, v18.4S, v31.s[0] +add v12.4s, v12.4s, v17.4s +str q6, 
[x0, #880] +ldr q6, [x0, #192] +sqrdmulh v17.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v18.4s, v6.4s, v16.4s +str q5, [x0, #944] +sqrdmulh v5.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +add v6.4s, v6.4s, v16.4s +str q1, [x0, #1008] +sqrdmulh v1.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v16.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v15.4s, v6.4s, v8.4s +add v6.4s, v6.4s, v8.4s +mla v19.4S, v17.4S, v31.s[0] +mla v11.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v20.4s +mla v2.4S, v1.4S, v31.s[0] +mla v22.4S, v21.4S, v31.s[0] +add v4.4s, v4.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +sub v21.4s, v14.4s, v0.4s +sqrdmulh v1.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v17.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[0] +mul v6.4S, v6.4S,v28.s[0] +sub v8.4s, v18.4s, v11.4s +add v18.4s, v18.4s, v11.4s +mla v16.4S, v20.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v2.4s +mla v12.4S, v0.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v27.s[2] +mul v3.4S, v3.4S,v28.s[2] +sub v19.4s, v7.4s, v22.4s +sqrdmulh v0.4S, v18.4S, v27.s[2] +mul v18.4S, v18.4S,v28.s[2] +add v7.4s, v7.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +sub v20.4s, v5.4s, v16.4s +add v5.4s, v5.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v11.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +mla v3.4S, v2.4S, v31.s[0] +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v4.4s, v12.4s +mla v17.4S, v22.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v4.4s, v4.4s, v12.4s +sqrdmulh v12.4S, v21.4S, v25.s[2] +mul v21.4S, v21.4S,v26.s[2] +sub v16.4s, v14.4s, v6.4s +sqrdmulh v22.4S, v11.4S, v25.s[3] +mul v11.4S, v11.4S,v26.s[3] +add v14.4s, v14.4s, v6.4s 
+sqrdmulh v6.4S, v16.4S, v25.s[1] +mul v16.4S, v16.4S,v26.s[1] +sub v2.4s, v13.4s, v3.4s +add v13.4s, v13.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v25.s[0] +mul v14.4S, v14.4S,v26.s[0] +sub v15.4s, v7.4s, v18.4s +add v7.4s, v7.4s, v18.4s +mla v21.4S, v12.4S, v31.s[0] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v17.4s +mla v16.4S, v6.4S, v31.s[0] +mla v14.4S, v3.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v7.4S, v23.s[0] +mul v7.4S, v7.4S,v24.s[0] +sub v3.4s, v19.4s, v8.4s +sqrdmulh v6.4S, v15.4S, v23.s[1] +mul v15.4S, v15.4S,v24.s[1] +add v19.4s, v19.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v23.s[2] +mul v19.4S, v19.4S,v24.s[2] +sub v12.4s, v5.4s, v21.4s +add v5.4s, v5.4s, v21.4s +sqrdmulh v21.4S, v3.4S, v23.s[3] +mul v3.4S, v3.4S,v24.s[3] +sub v18.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +mla v7.4S, v17.4S, v31.s[0] +mla v15.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v16.4s +str q5, [x0, #256] +mla v19.4S, v8.4S, v31.s[0] +mla v3.4S, v21.4S, v31.s[0] +add v0.4s, v0.4s, v16.4s +str q12, [x0, #320] +ldr q12, [x0, #912] +sqrdmulh v16.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v21.4s, v4.4s, v14.4s +str q20, [x0, #384] +ldr q20, [x0, #976] +sqrdmulh v8.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v4.4s, v4.4s, v14.4s +str q18, [x0, #448] +ldr q18, [x0, #784] +sqrdmulh v14.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +sub v5.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +ldr q7, [x0, #848] +sqrdmulh v17.4S, v7.4S, v29.s[0] +mul v7.4S, v7.4S,v30.s[0] +sub v11.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +mla v12.4S, v16.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v1.4s, v19.4s +str q0, [x0, #128] +mla v18.4S, v14.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v19.4s +str q6, [x0, #192] +ldr q6, [x0, #528] +sqrdmulh v19.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v17.4s, v22.4s, v3.4s +str q4, [x0, #0] +ldr q4, [x0, #592] +sqrdmulh v14.4S, v4.4S, v29.s[0] +mul v4.4S, v4.4S,v30.s[0] +add v22.4s, v22.4s, 
v3.4s +str q21, [x0, #64] +ldr q21, [x0, #656] +ldr q3, [x0, #400] +sqrdmulh v0.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v16.4s, v3.4s, v12.4s +add v3.4s, v3.4s, v12.4s +ldr q12, [x0, #720] +ldr q15, [x0, #464] +sqrdmulh v9.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v10.4s, v15.4s, v20.4s +add v15.4s, v15.4s, v20.4s +ldr q20, [x0, #272] +mla v6.4S, v19.4S, v31.s[0] +mla v4.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v18.4s +str q13, [x0, #512] +mla v21.4S, v0.4S, v31.s[0] +mla v12.4S, v9.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +str q5, [x0, #576] +ldr q5, [x0, #336] +sqrdmulh v18.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v9.4s, v5.4s, v7.4s +str q2, [x0, #640] +sqrdmulh v2.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +add v5.4s, v5.4s, v7.4s +str q11, [x0, #704] +ldr q11, [x0, #16] +sqrdmulh v7.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v0.4s, v11.4s, v6.4s +add v11.4s, v11.4s, v6.4s +ldr q6, [x0, #80] +sqrdmulh v13.4S, v5.4S, v29.s[1] +mul v5.4S, v5.4S,v30.s[1] +sub v19.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +ldr q4, [x0, #144] +mla v3.4S, v18.4S, v31.s[0] +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v4.4s, v21.4s +str q1, [x0, #768] +mla v20.4S, v7.4S, v31.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v4.4s, v4.4s, v21.4s +str q8, [x0, #832] +ldr q8, [x0, #208] +sqrdmulh v21.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +sub v13.4s, v8.4s, v12.4s +str q22, [x0, #896] +sqrdmulh v22.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +add v8.4s, v8.4s, v12.4s +str q17, [x0, #960] +sqrdmulh v17.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v12.4s, v4.4s, v3.4s +add v4.4s, v4.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v7.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +mla v16.4S, v21.4S, v31.s[0] +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v20.4s +mla v14.4S, v17.4S, v31.s[0] +mla v9.4S, v3.4S, v31.s[0] +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v12.4S, v27.s[1] +mul v12.4S, 
v12.4S,v28.s[1] +sub v3.4s, v6.4s, v5.4s +sqrdmulh v17.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v27.s[0] +mul v4.4S, v4.4S,v28.s[0] +sub v21.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[0] +mul v8.4S, v8.4S,v28.s[0] +sub v15.4s, v13.4s, v10.4s +add v13.4s, v13.4s, v10.4s +mla v12.4S, v20.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v14.4s +mla v4.4S, v5.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v16.4s, v19.4s, v9.4s +sqrdmulh v5.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v10.4s, v3.4s, v7.4s +add v3.4s, v3.4s, v7.4s +mla v2.4S, v14.4S, v31.s[0] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v4.4s +mla v21.4S, v9.4S, v31.s[0] +mla v15.4S, v12.4S, v31.s[0] +add v11.4s, v11.4s, v4.4s +sqrdmulh v4.4S, v3.4S, v25.s[2] +mul v3.4S, v3.4S,v26.s[2] +sub v12.4s, v6.4s, v8.4s +sqrdmulh v9.4S, v10.4S, v25.s[3] +mul v10.4S, v10.4S,v26.s[3] +add v6.4s, v6.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +sub v14.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v6.4S, v25.s[0] +mul v6.4S, v6.4S,v26.s[0] +sub v7.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +mla v3.4S, v4.4S, v31.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v21.4s +mla v12.4S, v8.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v23.s[0] +mul v19.4S, v19.4S,v24.s[0] +sub v2.4s, v16.4s, v15.4s +sqrdmulh v8.4S, v7.4S, v23.s[1] +mul v7.4S, v7.4S,v24.s[1] +add v16.4s, v16.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +sub v4.4s, v22.4s, v3.4s +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v2.4S, 
v23.s[3] +mul v2.4S, v2.4S,v24.s[3] +sub v13.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +mla v19.4S, v21.4S, v31.s[0] +mla v7.4S, v8.4S, v31.s[0] +sub v8.4s, v5.4s, v12.4s +str q22, [x0, #272] +mla v16.4S, v15.4S, v31.s[0] +mla v2.4S, v3.4S, v31.s[0] +add v5.4s, v5.4s, v12.4s +str q4, [x0, #336] +sub v23.4s, v11.4s, v6.4s +str q20, [x0, #400] +add v11.4s, v11.4s, v6.4s +str q13, [x0, #464] +sub v13.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sub v19.4s, v14.4s, v7.4s +add v14.4s, v14.4s, v7.4s +sub v7.4s, v17.4s, v16.4s +str q5, [x0, #144] +add v17.4s, v17.4s, v16.4s +str q8, [x0, #208] +sub v8.4s, v9.4s, v2.4s +str q11, [x0, #16] +add v9.4s, v9.4s, v2.4s +str q23, [x0, #80] +str q0, [x0, #528] +str q13, [x0, #592] +str q14, [x0, #656] +str q19, [x0, #720] +str q17, [x0, #784] +str q7, [x0, #848] +str q9, [x0, #912] +str q8, [x0, #976] +ldr q18, [x17, #+128] +ldr q1, [x17, #+144] +ldr q10, [x0, #32] +sqrdmulh v21.4S, v10.4S, v1.s[0] +mul v10.4S, v10.4S,v18.s[0] +ldr q22, [x0, #48] +sqrdmulh v15.4S, v22.4S, v1.s[0] +mul v22.4S, v22.4S,v18.s[0] +ldr q3, [x17, #+160] +ldr q12, [x17, #+176] +ldr q4, [x0, #96] +sqrdmulh v30.4S, v4.4S, v12.s[0] +mul v4.4S, v4.4S,v3.s[0] +ldr q29, [x0, #112] +sqrdmulh v28.4S, v29.4S, v12.s[0] +mul v29.4S, v29.4S,v3.s[0] +ldr q27, [x0, #160] +ldr q26, [x17, #+192] +ldr q25, [x17, #+208] +mla v10.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v27.4S, v25.s[0] +ldr q24, [x0, #176] +mla v22.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v24.4S, v25.s[0] +ldr q20, [x0, #224] +ldr q6, [x17, #+224] +ldr q5, [x17, #+240] +mla v4.4S, v30.4S, v31.s[0] +sqrdmulh v30.4S, v20.4S, v5.s[0] +ldr q16, [x0, #240] +mla v29.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v16.4S, v5.s[0] +ldr q11, [x0, #128] +ldr q2, [x0, #0] +mul v27.4S, v27.4S,v26.s[0] +sub v23.4s, v2.4s, v10.4s +mul v24.4S, v24.4S,v26.s[0] +add v2.4s, v2.4s, v10.4s +ldr q10, [x0, #144] +ldr q0, [x0, #16] +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v22.4s +mla v24.4S, v15.4S, v31.s[0] +add v0.4s, v0.4s, 
v22.4s +ldr q22, [x0, #192] +ldr q15, [x0, #64] +mul v20.4S, v20.4S,v6.s[0] +sub v13.4s, v15.4s, v4.4s +mul v16.4S, v16.4S,v6.s[0] +add v15.4s, v15.4s, v4.4s +ldr q4, [x0, #208] +ldr q14, [x0, #80] +mla v20.4S, v30.4S, v31.s[0] +sub v30.4s, v14.4s, v29.4s +mla v16.4S, v28.4S, v31.s[0] +add v14.4s, v14.4s, v29.4s +sqrdmulh v29.4S, v0.4S, v1.s[1] +mul v0.4S, v0.4S,v18.s[1] +sqrdmulh v28.4S, v21.4S, v1.s[2] +sub v19.4s, v11.4s, v27.4s +mul v21.4S, v21.4S,v18.s[2] +add v11.4s, v11.4s, v27.4s +sqrdmulh v1.4S, v14.4S, v12.s[1] +sub v18.4s, v10.4s, v24.4s +mul v14.4S, v14.4S,v3.s[1] +add v10.4s, v10.4s, v24.4s +sqrdmulh v24.4S, v30.4S, v12.s[2] +sub v27.4s, v22.4s, v20.4s +mul v30.4S, v30.4S,v3.s[2] +add v22.4s, v22.4s, v20.4s +mla v0.4S, v29.4S, v31.s[0] +sub v29.4s, v4.4s, v16.4s +sqrdmulh v12.4S, v10.4S, v25.s[1] +add v4.4s, v4.4s, v16.4s +mla v21.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v18.4S, v25.s[2] +mla v14.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v4.4S, v5.s[1] +mla v30.4S, v24.4S, v31.s[0] +sqrdmulh v24.4S, v29.4S, v5.s[2] +mul v10.4S, v10.4S,v26.s[1] +sub v16.4s, v2.4s, v0.4s +mul v18.4S, v18.4S,v26.s[2] +add v2.4s, v2.4s, v0.4s +str q16, [x0, #16] +str q2, [x0, #0] +mla v10.4S, v12.4S, v31.s[0] +sub v12.4s, v23.4s, v21.4s +mla v18.4S, v28.4S, v31.s[0] +add v23.4s, v23.4s, v21.4s +str q12, [x0, #48] +str q23, [x0, #32] +mul v4.4S, v4.4S,v6.s[1] +sub v25.4s, v15.4s, v14.4s +mul v29.4S, v29.4S,v6.s[2] +add v15.4s, v15.4s, v14.4s +str q25, [x0, #80] +str q15, [x0, #64] +mla v4.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v30.4s +mla v29.4S, v24.4S, v31.s[0] +add v13.4s, v13.4s, v30.4s +str q1, [x0, #112] +str q13, [x0, #96] +ldr q5, [x17, #+256] +ldr q6, [x17, #+272] +ldr q13, [x0, #288] +sqrdmulh v1.4S, v13.4S, v6.s[0] +sub v30.4s, v11.4s, v10.4s +str q30, [x0, #144] +mul v13.4S, v13.4S,v5.s[0] +add v11.4s, v11.4s, v10.4s +str q11, [x0, #128] +ldr q11, [x0, #304] +sqrdmulh v10.4S, v11.4S, v6.s[0] +sub v30.4s, v19.4s, v18.4s +mul v11.4S, v11.4S,v5.s[0] +add v19.4s, v19.4s, 
v18.4s +str q30, [x0, #176] +str q19, [x0, #160] +ldr q19, [x17, #+288] +ldr q30, [x17, #+304] +ldr q18, [x0, #352] +sqrdmulh v24.4S, v18.4S, v30.s[0] +sub v15.4s, v22.4s, v4.4s +mul v18.4S, v18.4S,v19.s[0] +add v22.4s, v22.4s, v4.4s +str q15, [x0, #208] +str q22, [x0, #192] +ldr q22, [x0, #368] +sqrdmulh v15.4S, v22.4S, v30.s[0] +sub v4.4s, v27.4s, v29.4s +mul v22.4S, v22.4S,v19.s[0] +add v27.4s, v27.4s, v29.4s +str q4, [x0, #240] +str q27, [x0, #224] +ldr q27, [x0, #416] +ldr q4, [x17, #+320] +ldr q29, [x17, #+336] +mla v13.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v27.4S, v29.s[0] +ldr q25, [x0, #432] +mla v11.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v25.4S, v29.s[0] +ldr q14, [x0, #480] +ldr q26, [x17, #+352] +ldr q23, [x17, #+368] +mla v18.4S, v24.4S, v31.s[0] +sqrdmulh v24.4S, v14.4S, v23.s[0] +ldr q12, [x0, #496] +mla v22.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v12.4S, v23.s[0] +ldr q21, [x0, #384] +ldr q28, [x0, #256] +mul v27.4S, v27.4S,v4.s[0] +sub v2.4s, v28.4s, v13.4s +mul v25.4S, v25.4S,v4.s[0] +add v28.4s, v28.4s, v13.4s +ldr q13, [x0, #400] +ldr q16, [x0, #272] +mla v27.4S, v1.4S, v31.s[0] +sub v1.4s, v16.4s, v11.4s +mla v25.4S, v10.4S, v31.s[0] +add v16.4s, v16.4s, v11.4s +ldr q11, [x0, #448] +ldr q10, [x0, #320] +mul v14.4S, v14.4S,v26.s[0] +sub v0.4s, v10.4s, v18.4s +mul v12.4S, v12.4S,v26.s[0] +add v10.4s, v10.4s, v18.4s +ldr q18, [x0, #464] +ldr q3, [x0, #336] +mla v14.4S, v24.4S, v31.s[0] +sub v24.4s, v3.4s, v22.4s +mla v12.4S, v15.4S, v31.s[0] +add v3.4s, v3.4s, v22.4s +sqrdmulh v22.4S, v16.4S, v6.s[1] +mul v16.4S, v16.4S,v5.s[1] +sqrdmulh v15.4S, v1.4S, v6.s[2] +sub v20.4s, v21.4s, v27.4s +mul v1.4S, v1.4S,v5.s[2] +add v21.4s, v21.4s, v27.4s +sqrdmulh v6.4S, v3.4S, v30.s[1] +sub v5.4s, v13.4s, v25.4s +mul v3.4S, v3.4S,v19.s[1] +add v13.4s, v13.4s, v25.4s +sqrdmulh v25.4S, v24.4S, v30.s[2] +sub v27.4s, v11.4s, v14.4s +mul v24.4S, v24.4S,v19.s[2] +add v11.4s, v11.4s, v14.4s +mla v16.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v12.4s +sqrdmulh v30.4S, 
v13.4S, v29.s[1] +add v18.4s, v18.4s, v12.4s +mla v1.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v5.4S, v29.s[2] +mla v3.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v18.4S, v23.s[1] +mla v24.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v22.4S, v23.s[2] +mul v13.4S, v13.4S,v4.s[1] +sub v12.4s, v28.4s, v16.4s +mul v5.4S, v5.4S,v4.s[2] +add v28.4s, v28.4s, v16.4s +str q12, [x0, #272] +str q28, [x0, #256] +mla v13.4S, v30.4S, v31.s[0] +sub v30.4s, v2.4s, v1.4s +mla v5.4S, v15.4S, v31.s[0] +add v2.4s, v2.4s, v1.4s +str q30, [x0, #304] +str q2, [x0, #288] +mul v18.4S, v18.4S,v26.s[1] +sub v29.4s, v10.4s, v3.4s +mul v22.4S, v22.4S,v26.s[2] +add v10.4s, v10.4s, v3.4s +str q29, [x0, #336] +str q10, [x0, #320] +mla v18.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v24.4s +mla v22.4S, v25.4S, v31.s[0] +add v0.4s, v0.4s, v24.4s +str q6, [x0, #368] +str q0, [x0, #352] +ldr q23, [x17, #+384] +ldr q26, [x17, #+400] +ldr q0, [x0, #544] +sqrdmulh v6.4S, v0.4S, v26.s[0] +sub v24.4s, v21.4s, v13.4s +str q24, [x0, #400] +mul v0.4S, v0.4S,v23.s[0] +add v21.4s, v21.4s, v13.4s +str q21, [x0, #384] +ldr q21, [x0, #560] +sqrdmulh v13.4S, v21.4S, v26.s[0] +sub v24.4s, v20.4s, v5.4s +mul v21.4S, v21.4S,v23.s[0] +add v20.4s, v20.4s, v5.4s +str q24, [x0, #432] +str q20, [x0, #416] +ldr q20, [x17, #+416] +ldr q24, [x17, #+432] +ldr q5, [x0, #608] +sqrdmulh v25.4S, v5.4S, v24.s[0] +sub v10.4s, v11.4s, v18.4s +mul v5.4S, v5.4S,v20.s[0] +add v11.4s, v11.4s, v18.4s +str q10, [x0, #464] +str q11, [x0, #448] +ldr q11, [x0, #624] +sqrdmulh v10.4S, v11.4S, v24.s[0] +sub v18.4s, v27.4s, v22.4s +mul v11.4S, v11.4S,v20.s[0] +add v27.4s, v27.4s, v22.4s +str q18, [x0, #496] +str q27, [x0, #480] +ldr q27, [x0, #672] +ldr q18, [x17, #+448] +ldr q22, [x17, #+464] +mla v0.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v27.4S, v22.s[0] +ldr q29, [x0, #688] +mla v21.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v29.4S, v22.s[0] +ldr q3, [x0, #736] +ldr q4, [x17, #+480] +ldr q2, [x17, #+496] +mla v5.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v3.4S, v2.s[0] 
+ldr q30, [x0, #752] +mla v11.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v30.4S, v2.s[0] +ldr q1, [x0, #640] +ldr q15, [x0, #512] +mul v27.4S, v27.4S,v18.s[0] +sub v28.4s, v15.4s, v0.4s +mul v29.4S, v29.4S,v18.s[0] +add v15.4s, v15.4s, v0.4s +ldr q0, [x0, #656] +ldr q12, [x0, #528] +mla v27.4S, v6.4S, v31.s[0] +sub v6.4s, v12.4s, v21.4s +mla v29.4S, v13.4S, v31.s[0] +add v12.4s, v12.4s, v21.4s +ldr q21, [x0, #704] +ldr q13, [x0, #576] +mul v3.4S, v3.4S,v4.s[0] +sub v16.4s, v13.4s, v5.4s +mul v30.4S, v30.4S,v4.s[0] +add v13.4s, v13.4s, v5.4s +ldr q5, [x0, #720] +ldr q19, [x0, #592] +mla v3.4S, v25.4S, v31.s[0] +sub v25.4s, v19.4s, v11.4s +mla v30.4S, v10.4S, v31.s[0] +add v19.4s, v19.4s, v11.4s +sqrdmulh v11.4S, v12.4S, v26.s[1] +mul v12.4S, v12.4S,v23.s[1] +sqrdmulh v10.4S, v6.4S, v26.s[2] +sub v14.4s, v1.4s, v27.4s +mul v6.4S, v6.4S,v23.s[2] +add v1.4s, v1.4s, v27.4s +sqrdmulh v26.4S, v19.4S, v24.s[1] +sub v23.4s, v0.4s, v29.4s +mul v19.4S, v19.4S,v20.s[1] +add v0.4s, v0.4s, v29.4s +sqrdmulh v29.4S, v25.4S, v24.s[2] +sub v27.4s, v21.4s, v3.4s +mul v25.4S, v25.4S,v20.s[2] +add v21.4s, v21.4s, v3.4s +mla v12.4S, v11.4S, v31.s[0] +sub v11.4s, v5.4s, v30.4s +sqrdmulh v24.4S, v0.4S, v22.s[1] +add v5.4s, v5.4s, v30.4s +mla v6.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v23.4S, v22.s[2] +mla v19.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v5.4S, v2.s[1] +mla v25.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v11.4S, v2.s[2] +mul v0.4S, v0.4S,v18.s[1] +sub v30.4s, v15.4s, v12.4s +mul v23.4S, v23.4S,v18.s[2] +add v15.4s, v15.4s, v12.4s +str q30, [x0, #528] +str q15, [x0, #512] +mla v0.4S, v24.4S, v31.s[0] +sub v24.4s, v28.4s, v6.4s +mla v23.4S, v10.4S, v31.s[0] +add v28.4s, v28.4s, v6.4s +str q24, [x0, #560] +str q28, [x0, #544] +mul v5.4S, v5.4S,v4.s[1] +sub v22.4s, v13.4s, v19.4s +mul v11.4S, v11.4S,v4.s[2] +add v13.4s, v13.4s, v19.4s +str q22, [x0, #592] +str q13, [x0, #576] +mla v5.4S, v26.4S, v31.s[0] +sub v26.4s, v16.4s, v25.4s +mla v11.4S, v29.4S, v31.s[0] +add v16.4s, v16.4s, v25.4s 
+str q26, [x0, #624] +str q16, [x0, #608] +ldr q2, [x17, #+512] +ldr q4, [x17, #+528] +ldr q16, [x0, #800] +sqrdmulh v26.4S, v16.4S, v4.s[0] +sub v25.4s, v1.4s, v0.4s +str q25, [x0, #656] +mul v16.4S, v16.4S,v2.s[0] +add v1.4s, v1.4s, v0.4s +str q1, [x0, #640] +ldr q1, [x0, #816] +sqrdmulh v0.4S, v1.4S, v4.s[0] +sub v25.4s, v14.4s, v23.4s +mul v1.4S, v1.4S,v2.s[0] +add v14.4s, v14.4s, v23.4s +str q25, [x0, #688] +str q14, [x0, #672] +ldr q14, [x17, #+544] +ldr q25, [x17, #+560] +ldr q23, [x0, #864] +sqrdmulh v29.4S, v23.4S, v25.s[0] +sub v13.4s, v21.4s, v5.4s +mul v23.4S, v23.4S,v14.s[0] +add v21.4s, v21.4s, v5.4s +str q13, [x0, #720] +str q21, [x0, #704] +ldr q21, [x0, #880] +sqrdmulh v13.4S, v21.4S, v25.s[0] +sub v5.4s, v27.4s, v11.4s +mul v21.4S, v21.4S,v14.s[0] +add v27.4s, v27.4s, v11.4s +str q5, [x0, #752] +str q27, [x0, #736] +ldr q27, [x0, #928] +ldr q5, [x17, #+576] +ldr q11, [x17, #+592] +mla v16.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v27.4S, v11.s[0] +ldr q22, [x0, #944] +mla v1.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v22.4S, v11.s[0] +ldr q19, [x0, #992] +ldr q18, [x17, #+608] +ldr q28, [x17, #+624] +mla v23.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v19.4S, v28.s[0] +ldr q24, [x0, #1008] +mla v21.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v24.4S, v28.s[0] +ldr q6, [x0, #896] +ldr q10, [x0, #768] +mul v27.4S, v27.4S,v5.s[0] +sub v15.4s, v10.4s, v16.4s +mul v22.4S, v22.4S,v5.s[0] +add v10.4s, v10.4s, v16.4s +ldr q16, [x0, #912] +ldr q30, [x0, #784] +mla v27.4S, v26.4S, v31.s[0] +sub v26.4s, v30.4s, v1.4s +mla v22.4S, v0.4S, v31.s[0] +add v30.4s, v30.4s, v1.4s +ldr q1, [x0, #960] +ldr q0, [x0, #832] +mul v19.4S, v19.4S,v18.s[0] +sub v12.4s, v0.4s, v23.4s +mul v24.4S, v24.4S,v18.s[0] +add v0.4s, v0.4s, v23.4s +ldr q23, [x0, #976] +ldr q20, [x0, #848] +mla v19.4S, v29.4S, v31.s[0] +sub v29.4s, v20.4s, v21.4s +mla v24.4S, v13.4S, v31.s[0] +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v30.4S, v4.s[1] +mul v30.4S, v30.4S,v2.s[1] +sqrdmulh v13.4S, v26.4S, v4.s[2] +sub 
v3.4s, v6.4s, v27.4s +mul v26.4S, v26.4S,v2.s[2] +add v6.4s, v6.4s, v27.4s +sqrdmulh v4.4S, v20.4S, v25.s[1] +sub v2.4s, v16.4s, v22.4s +mul v20.4S, v20.4S,v14.s[1] +add v16.4s, v16.4s, v22.4s +sqrdmulh v22.4S, v29.4S, v25.s[2] +sub v27.4s, v1.4s, v19.4s +mul v29.4S, v29.4S,v14.s[2] +add v1.4s, v1.4s, v19.4s +mla v30.4S, v21.4S, v31.s[0] +sub v21.4s, v23.4s, v24.4s +sqrdmulh v25.4S, v16.4S, v11.s[1] +add v23.4s, v23.4s, v24.4s +mla v26.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v2.4S, v11.s[2] +mla v20.4S, v4.4S, v31.s[0] +sqrdmulh v4.4S, v23.4S, v28.s[1] +mla v29.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v21.4S, v28.s[2] +mul v16.4S, v16.4S,v5.s[1] +sub v24.4s, v10.4s, v30.4s +mul v2.4S, v2.4S,v5.s[2] +add v10.4s, v10.4s, v30.4s +str q24, [x0, #784] +str q10, [x0, #768] +mla v16.4S, v25.4S, v31.s[0] +sub v25.4s, v15.4s, v26.4s +mla v2.4S, v13.4S, v31.s[0] +add v15.4s, v15.4s, v26.4s +str q25, [x0, #816] +str q15, [x0, #800] +mul v23.4S, v23.4S,v18.s[1] +sub v11.4s, v0.4s, v20.4s +mul v21.4S, v21.4S,v18.s[2] +add v0.4s, v0.4s, v20.4s +str q11, [x0, #848] +str q0, [x0, #832] +mla v23.4S, v4.4S, v31.s[0] +sub v4.4s, v12.4s, v29.4s +mla v21.4S, v22.4S, v31.s[0] +add v12.4s, v12.4s, v29.4s +str q4, [x0, #880] +str q12, [x0, #864] +sub v28.4s, v6.4s, v16.4s +str q28, [x0, #912] +add v6.4s, v6.4s, v16.4s +str q6, [x0, #896] +sub v6.4s, v3.4s, v2.4s +add v3.4s, v3.4s, v2.4s +str q6, [x0, #944] +str q3, [x0, #928] +sub v3.4s, v1.4s, v23.4s +add v1.4s, v1.4s, v23.4s +str q3, [x0, #976] +str q1, [x0, #960] +sub v1.4s, v27.4s, v21.4s +add v27.4s, v27.4s, v21.4s +str q1, [x0, #1008] +str q27, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 
+// Instruction count: 1460 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_5.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_5.s new file mode 100644 index 0000000..2182562 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_5.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include // NOTE(review): the include operand is absent in this copy; ASM_LOAD used below is not defined in this file and presumably comes from that header - restore from upstream +modulus: // 16-byte constant that the routine below loads whole into q31; the visible code only references lane 0 (v31.s[0]), as the multiplier of every mla +.word -33556993 // lane 0: negated modulus, -q with q = 33556993 +.word 0 // lane 1: zero fill +.word 0 // lane 2: zero fill +.word 0 // lane 3: zero fill +.align 6 // start the table on a 2^6 = 64-byte boundary (power-of-two .align semantics) +roots_merged: // constant table read 16 bytes (one q register) at a time through x17. Each group of four labelled entries appears twice: first the plain values, used as the mul operand in the visible part of the routine, then a companion group used as the sqrdmulh operand - presumably round(value * 2^31 / q), TODO confirm against the generator. "Layer None" entries are zero fill. +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global 
ntt_u32_incomplete_neon_asm_var_4_2_7_z4_5 +.global _ntt_u32_incomplete_neon_asm_var_4_2_7_z4_5 +ntt_u32_incomplete_neon_asm_var_4_2_7_z4_5: +_ntt_u32_incomplete_neon_asm_var_4_2_7_z4_5: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #928] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #992] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q18, [x0, #800] +sqrdmulh v17.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +ldr q16, [x0, #864] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v22.4S, v21.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +mla v18.4S, v17.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +ldr q3, [x0, #544] +sqrdmulh v17.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q19, [x0, #608] +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q2, [x0, #672] +ldr q1, [x0, #416] +sqrdmulh v0.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v15.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #736] +ldr q14, [x0, #480] +sqrdmulh v13.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v12.4s, v14.4s, v20.4s +add v14.4s, v14.4s, v20.4s +ldr q20, [x0, #288] +mla v3.4S, v17.4S, v31.s[0] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v18.4s +mla v2.4S, v0.4S, v31.s[0] +mla v22.4S, v13.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +ldr q18, [x0, #352] +sqrdmulh v13.4S, v1.4S, 
v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v0.4s, v18.4s, v16.4s +sqrdmulh v17.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +add v18.4s, v18.4s, v16.4s +ldr q16, [x0, #32] +sqrdmulh v11.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v10.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #96] +sqrdmulh v9.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v8.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #160] +mla v1.4S, v13.4S, v31.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v2.4s +mla v20.4S, v11.4S, v31.s[0] +mla v18.4S, v9.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +ldr q2, [x0, #224] +sqrdmulh v9.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v11.4s, v2.4s, v22.4s +sqrdmulh v13.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v7.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +sub v6.4s, v2.4s, v14.4s +add v2.4s, v2.4s, v14.4s +mla v15.4S, v9.4S, v31.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v20.4s +mla v21.4S, v22.4S, v31.s[0] +mla v0.4S, v1.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +sub v1.4s, v3.4s, v18.4s +sqrdmulh v22.4S, v6.4S, v27.s[1] +mul v6.4S, v6.4S,v28.s[1] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v27.s[0] +mul v19.4S, v19.4S,v28.s[0] +sub v9.4s, v17.4s, v15.4s +add v17.4s, v17.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v14.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +mla v7.4S, v20.4S, v31.s[0] +mla v6.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v21.4s +mla v19.4S, v18.4S, v31.s[0] +mla v2.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v15.4s, v8.4s, v0.4s +sqrdmulh v18.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +add v8.4s, v8.4s, v0.4s +sqrdmulh 
v0.4S, v9.4S, v27.s[3] +mul v9.4S, v9.4S,v28.s[3] +sub v20.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[3] +mul v14.4S, v14.4S,v28.s[3] +sub v12.4s, v1.4s, v6.4s +add v1.4s, v1.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v16.4s, v19.4s +mla v9.4S, v0.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v25.s[2] +mul v1.4S, v1.4S,v26.s[2] +sub v7.4s, v3.4s, v2.4s +sqrdmulh v0.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v7.4S, v25.s[1] +mul v7.4S, v7.4S,v26.s[1] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v25.s[0] +mul v3.4S, v3.4S,v26.s[0] +sub v6.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v1.4S, v19.4S, v31.s[0] +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v9.4s +mla v7.4S, v2.4S, v31.s[0] +mla v3.4S, v17.4S, v31.s[0] +add v22.4s, v22.4s, v9.4s +sqrdmulh v9.4S, v8.4S, v23.s[0] +mul v8.4S, v8.4S,v24.s[0] +sub v17.4s, v15.4s, v14.4s +sqrdmulh v2.4S, v6.4S, v23.s[1] +mul v6.4S, v6.4S,v24.s[1] +add v15.4s, v15.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v23.s[2] +mul v15.4S, v15.4S,v24.s[2] +sub v19.4s, v13.4s, v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +sub v11.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +mla v8.4S, v9.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v18.4s, v7.4s +str q13, [x0, #288] +mla v15.4S, v14.4S, v31.s[0] +mla v17.4S, v1.4S, v31.s[0] +add v18.4s, v18.4s, v7.4s +str q19, [x0, #352] +ldr q19, [x0, #944] +sqrdmulh v7.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v1.4s, v16.4s, v3.4s +str q20, [x0, #416] +ldr q20, [x0, #1008] +sqrdmulh v14.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v3.4s +str q11, [x0, #480] +ldr q11, [x0, #816] +sqrdmulh v3.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +sub v13.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +ldr q8, 
[x0, #880] +sqrdmulh v9.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v12.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +mla v19.4S, v7.4S, v31.s[0] +mla v20.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v15.4s +str q18, [x0, #160] +mla v11.4S, v3.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +str q2, [x0, #224] +ldr q2, [x0, #560] +sqrdmulh v15.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v9.4s, v0.4s, v17.4s +str q16, [x0, #32] +ldr q16, [x0, #624] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +add v0.4s, v0.4s, v17.4s +str q1, [x0, #96] +ldr q1, [x0, #688] +ldr q17, [x0, #432] +sqrdmulh v18.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +sub v7.4s, v17.4s, v19.4s +add v17.4s, v17.4s, v19.4s +ldr q19, [x0, #752] +ldr q6, [x0, #496] +sqrdmulh v5.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v4.4s, v6.4s, v20.4s +add v6.4s, v6.4s, v20.4s +ldr q20, [x0, #304] +mla v2.4S, v15.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v11.4s +str q10, [x0, #544] +mla v1.4S, v18.4S, v31.s[0] +mla v19.4S, v5.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q13, [x0, #608] +ldr q13, [x0, #368] +sqrdmulh v11.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v5.4s, v13.4s, v8.4s +str q21, [x0, #672] +sqrdmulh v21.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +add v13.4s, v13.4s, v8.4s +str q12, [x0, #736] +ldr q12, [x0, #48] +sqrdmulh v8.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v18.4s, v12.4s, v2.4s +add v12.4s, v12.4s, v2.4s +ldr q2, [x0, #112] +sqrdmulh v10.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v15.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +ldr q16, [x0, #176] +mla v17.4S, v11.4S, v31.s[0] +mla v6.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v1.4s +str q22, [x0, #800] +mla v20.4S, v8.4S, v31.s[0] +mla v13.4S, v10.4S, v31.s[0] +add v16.4s, v16.4s, v1.4s +str q14, [x0, #864] +ldr q14, [x0, #240] +sqrdmulh v1.4S, v7.4S, v29.s[2] +mul v7.4S, v7.4S,v30.s[2] +sub v10.4s, v14.4s, v19.4s 
+str q0, [x0, #928] +sqrdmulh v0.4S, v4.4S, v29.s[2] +mul v4.4S, v4.4S,v30.s[2] +add v14.4s, v14.4s, v19.4s +str q9, [x0, #992] +sqrdmulh v9.4S, v3.4S, v29.s[2] +mul v3.4S, v3.4S,v30.s[2] +sub v19.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v29.s[2] +mul v5.4S, v5.4S,v30.s[2] +sub v8.4s, v14.4s, v6.4s +add v14.4s, v14.4s, v6.4s +mla v7.4S, v1.4S, v31.s[0] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v20.4s +mla v3.4S, v9.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +sub v17.4s, v2.4s, v13.4s +sqrdmulh v9.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +add v2.4s, v2.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v27.s[0] +mul v16.4S, v16.4S,v28.s[0] +sub v1.4s, v21.4s, v7.4s +add v21.4s, v21.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v6.4s, v10.4s, v4.4s +add v10.4s, v10.4s, v4.4s +mla v19.4S, v20.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v3.4s +mla v16.4S, v13.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +sub v7.4s, v15.4s, v5.4s +sqrdmulh v13.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +add v15.4s, v15.4s, v5.4s +sqrdmulh v5.4S, v1.4S, v27.s[3] +mul v1.4S, v1.4S,v28.s[3] +sub v20.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[3] +mul v6.4S, v6.4S,v28.s[3] +sub v4.4s, v17.4s, v8.4s +add v17.4s, v17.4s, v8.4s +mla v21.4S, v3.4S, v31.s[0] +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v16.4s +mla v1.4S, v5.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v25.s[2] +mul v17.4S, v17.4S,v26.s[2] +sub v19.4s, v2.4s, v14.4s +sqrdmulh v5.4S, v4.4S, v25.s[3] +mul v4.4S, v4.4S,v26.s[3] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v3.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, 
v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +mla v17.4S, v16.4S, v31.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v9.4s, v1.4s +mla v19.4S, v14.4S, v31.s[0] +mla v2.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v23.s[0] +mul v15.4S, v15.4S,v24.s[0] +sub v21.4s, v7.4s, v6.4s +sqrdmulh v14.4S, v8.4S, v23.s[1] +mul v8.4S, v8.4S,v24.s[1] +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v7.4S, v23.s[2] +mul v7.4S, v7.4S,v24.s[2] +sub v16.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v23.s[3] +mul v21.4S, v21.4S,v24.s[3] +sub v10.4s, v20.4s, v4.4s +add v20.4s, v20.4s, v4.4s +mla v15.4S, v1.4S, v31.s[0] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v19.4s +str q0, [x0, #304] +mla v7.4S, v6.4S, v31.s[0] +mla v21.4S, v17.4S, v31.s[0] +add v13.4s, v13.4s, v19.4s +str q16, [x0, #368] +ldr q16, [x0, #896] +sqrdmulh v19.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v17.4s, v12.4s, v2.4s +str q20, [x0, #432] +ldr q20, [x0, #960] +sqrdmulh v6.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v12.4s, v12.4s, v2.4s +str q10, [x0, #496] +ldr q10, [x0, #768] +sqrdmulh v2.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +sub v0.4s, v18.4s, v15.4s +add v18.4s, v18.4s, v15.4s +ldr q15, [x0, #832] +sqrdmulh v1.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +sub v4.4s, v3.4s, v8.4s +add v3.4s, v3.4s, v8.4s +mla v16.4S, v19.4S, v31.s[0] +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v7.4s +str q13, [x0, #176] +mla v10.4S, v2.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v7.4s +str q14, [x0, #240] +ldr q14, [x0, #512] +sqrdmulh v7.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v1.4s, v5.4s, v21.4s +str q12, [x0, #48] +ldr q12, [x0, #576] +sqrdmulh v2.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +add v5.4s, v5.4s, v21.4s +str q17, [x0, #112] +ldr q17, [x0, #640] +ldr q21, [x0, #384] +sqrdmulh v13.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] 
+sub v19.4s, v21.4s, v16.4s +add v21.4s, v21.4s, v16.4s +ldr q16, [x0, #704] +ldr q8, [x0, #448] +sqrdmulh v22.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v11.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +ldr q20, [x0, #256] +mla v14.4S, v7.4S, v31.s[0] +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v10.4s +str q18, [x0, #560] +mla v17.4S, v13.4S, v31.s[0] +mla v16.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +str q0, [x0, #624] +ldr q0, [x0, #320] +sqrdmulh v10.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v22.4s, v0.4s, v15.4s +str q3, [x0, #688] +sqrdmulh v3.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +add v0.4s, v0.4s, v15.4s +str q4, [x0, #752] +ldr q4, [x0, #0] +sqrdmulh v15.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v13.4s, v4.4s, v14.4s +add v4.4s, v4.4s, v14.4s +ldr q14, [x0, #64] +sqrdmulh v18.4S, v0.4S, v29.s[1] +mul v0.4S, v0.4S,v30.s[1] +sub v7.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +ldr q12, [x0, #128] +mla v21.4S, v10.4S, v31.s[0] +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v12.4s, v17.4s +str q9, [x0, #816] +mla v20.4S, v15.4S, v31.s[0] +mla v0.4S, v18.4S, v31.s[0] +add v12.4s, v12.4s, v17.4s +str q6, [x0, #880] +ldr q6, [x0, #192] +sqrdmulh v17.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v18.4s, v6.4s, v16.4s +str q5, [x0, #944] +sqrdmulh v5.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +add v6.4s, v6.4s, v16.4s +str q1, [x0, #1008] +sqrdmulh v1.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v16.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v15.4s, v6.4s, v8.4s +add v6.4s, v6.4s, v8.4s +mla v19.4S, v17.4S, v31.s[0] +mla v11.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v20.4s +mla v2.4S, v1.4S, v31.s[0] +mla v22.4S, v21.4S, v31.s[0] +add v4.4s, v4.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +sub v21.4s, v14.4s, v0.4s +sqrdmulh v1.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +add v14.4s, 
v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v17.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[0] +mul v6.4S, v6.4S,v28.s[0] +sub v8.4s, v18.4s, v11.4s +add v18.4s, v18.4s, v11.4s +mla v16.4S, v20.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v2.4s +mla v12.4S, v0.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v27.s[2] +mul v3.4S, v3.4S,v28.s[2] +sub v19.4s, v7.4s, v22.4s +sqrdmulh v0.4S, v18.4S, v27.s[2] +mul v18.4S, v18.4S,v28.s[2] +add v7.4s, v7.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +sub v20.4s, v5.4s, v16.4s +add v5.4s, v5.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v11.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +mla v3.4S, v2.4S, v31.s[0] +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v4.4s, v12.4s +mla v17.4S, v22.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v4.4s, v4.4s, v12.4s +sqrdmulh v12.4S, v21.4S, v25.s[2] +mul v21.4S, v21.4S,v26.s[2] +sub v16.4s, v14.4s, v6.4s +sqrdmulh v22.4S, v11.4S, v25.s[3] +mul v11.4S, v11.4S,v26.s[3] +add v14.4s, v14.4s, v6.4s +sqrdmulh v6.4S, v16.4S, v25.s[1] +mul v16.4S, v16.4S,v26.s[1] +sub v2.4s, v13.4s, v3.4s +add v13.4s, v13.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v25.s[0] +mul v14.4S, v14.4S,v26.s[0] +sub v15.4s, v7.4s, v18.4s +add v7.4s, v7.4s, v18.4s +mla v21.4S, v12.4S, v31.s[0] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v17.4s +mla v16.4S, v6.4S, v31.s[0] +mla v14.4S, v3.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v7.4S, v23.s[0] +mul v7.4S, v7.4S,v24.s[0] +sub v3.4s, v19.4s, v8.4s +sqrdmulh v6.4S, v15.4S, v23.s[1] +mul v15.4S, v15.4S,v24.s[1] +add v19.4s, v19.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v23.s[2] +mul v19.4S, v19.4S,v24.s[2] +sub v12.4s, v5.4s, v21.4s +add v5.4s, v5.4s, v21.4s +sqrdmulh v21.4S, v3.4S, v23.s[3] +mul v3.4S, v3.4S,v24.s[3] +sub v18.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +mla 
v7.4S, v17.4S, v31.s[0] +mla v15.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v16.4s +str q5, [x0, #256] +mla v19.4S, v8.4S, v31.s[0] +mla v3.4S, v21.4S, v31.s[0] +add v0.4s, v0.4s, v16.4s +str q12, [x0, #320] +ldr q12, [x0, #912] +sqrdmulh v16.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v21.4s, v4.4s, v14.4s +str q20, [x0, #384] +ldr q20, [x0, #976] +sqrdmulh v8.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v4.4s, v4.4s, v14.4s +str q18, [x0, #448] +ldr q18, [x0, #784] +sqrdmulh v14.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +sub v5.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +ldr q7, [x0, #848] +sqrdmulh v17.4S, v7.4S, v29.s[0] +mul v7.4S, v7.4S,v30.s[0] +sub v11.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +mla v12.4S, v16.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v1.4s, v19.4s +str q0, [x0, #128] +mla v18.4S, v14.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v19.4s +str q6, [x0, #192] +ldr q6, [x0, #528] +sqrdmulh v19.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v17.4s, v22.4s, v3.4s +str q4, [x0, #0] +ldr q4, [x0, #592] +sqrdmulh v14.4S, v4.4S, v29.s[0] +mul v4.4S, v4.4S,v30.s[0] +add v22.4s, v22.4s, v3.4s +str q21, [x0, #64] +ldr q21, [x0, #656] +ldr q3, [x0, #400] +sqrdmulh v0.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v16.4s, v3.4s, v12.4s +add v3.4s, v3.4s, v12.4s +ldr q12, [x0, #720] +ldr q15, [x0, #464] +sqrdmulh v9.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v10.4s, v15.4s, v20.4s +add v15.4s, v15.4s, v20.4s +ldr q20, [x0, #272] +mla v6.4S, v19.4S, v31.s[0] +mla v4.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v18.4s +str q13, [x0, #512] +mla v21.4S, v0.4S, v31.s[0] +mla v12.4S, v9.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +str q5, [x0, #576] +ldr q5, [x0, #336] +sqrdmulh v18.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v9.4s, v5.4s, v7.4s +str q2, [x0, #640] +sqrdmulh v2.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +add v5.4s, v5.4s, v7.4s +str q11, [x0, #704] +ldr 
q11, [x0, #16] +sqrdmulh v7.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v0.4s, v11.4s, v6.4s +add v11.4s, v11.4s, v6.4s +ldr q6, [x0, #80] +sqrdmulh v13.4S, v5.4S, v29.s[1] +mul v5.4S, v5.4S,v30.s[1] +sub v19.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +ldr q4, [x0, #144] +mla v3.4S, v18.4S, v31.s[0] +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v4.4s, v21.4s +str q1, [x0, #768] +mla v20.4S, v7.4S, v31.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v4.4s, v4.4s, v21.4s +str q8, [x0, #832] +ldr q8, [x0, #208] +sqrdmulh v21.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +sub v13.4s, v8.4s, v12.4s +str q22, [x0, #896] +sqrdmulh v22.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +add v8.4s, v8.4s, v12.4s +str q17, [x0, #960] +sqrdmulh v17.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v12.4s, v4.4s, v3.4s +add v4.4s, v4.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v7.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +mla v16.4S, v21.4S, v31.s[0] +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v20.4s +mla v14.4S, v17.4S, v31.s[0] +mla v9.4S, v3.4S, v31.s[0] +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v12.4S, v27.s[1] +mul v12.4S, v12.4S,v28.s[1] +sub v3.4s, v6.4s, v5.4s +sqrdmulh v17.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v27.s[0] +mul v4.4S, v4.4S,v28.s[0] +sub v21.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[0] +mul v8.4S, v8.4S,v28.s[0] +sub v15.4s, v13.4s, v10.4s +add v13.4s, v13.4s, v10.4s +mla v12.4S, v20.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v14.4s +mla v4.4S, v5.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v16.4s, v19.4s, v9.4s +sqrdmulh v5.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v22.4s, v12.4s +add v22.4s, v22.4s, 
v12.4s +sqrdmulh v12.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v10.4s, v3.4s, v7.4s +add v3.4s, v3.4s, v7.4s +mla v2.4S, v14.4S, v31.s[0] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v4.4s +mla v21.4S, v9.4S, v31.s[0] +mla v15.4S, v12.4S, v31.s[0] +add v11.4s, v11.4s, v4.4s +sqrdmulh v4.4S, v3.4S, v25.s[2] +mul v3.4S, v3.4S,v26.s[2] +sub v12.4s, v6.4s, v8.4s +sqrdmulh v9.4S, v10.4S, v25.s[3] +mul v10.4S, v10.4S,v26.s[3] +add v6.4s, v6.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +sub v14.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v6.4S, v25.s[0] +mul v6.4S, v6.4S,v26.s[0] +sub v7.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +mla v3.4S, v4.4S, v31.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v21.4s +mla v12.4S, v8.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v23.s[0] +mul v19.4S, v19.4S,v24.s[0] +sub v2.4s, v16.4s, v15.4s +sqrdmulh v8.4S, v7.4S, v23.s[1] +mul v7.4S, v7.4S,v24.s[1] +add v16.4s, v16.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +sub v4.4s, v22.4s, v3.4s +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v2.4S, v23.s[3] +mul v2.4S, v2.4S,v24.s[3] +sub v13.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +mla v19.4S, v21.4S, v31.s[0] +mla v7.4S, v8.4S, v31.s[0] +sub v8.4s, v5.4s, v12.4s +str q22, [x0, #272] +mla v16.4S, v15.4S, v31.s[0] +mla v2.4S, v3.4S, v31.s[0] +add v5.4s, v5.4s, v12.4s +str q4, [x0, #336] +sub v23.4s, v11.4s, v6.4s +str q20, [x0, #400] +add v11.4s, v11.4s, v6.4s +str q13, [x0, #464] +sub v13.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sub v19.4s, v14.4s, v7.4s +add v14.4s, v14.4s, v7.4s +sub v7.4s, v17.4s, v16.4s +str q5, [x0, #144] +add v17.4s, v17.4s, v16.4s +str q8, [x0, #208] +sub v8.4s, v9.4s, v2.4s +str q11, [x0, #16] +add v9.4s, v9.4s, v2.4s +str q23, [x0, #80] +str q0, [x0, #528] +str q13, [x0, #592] +str q14, [x0, #656] +str q19, [x0, #720] +str q17, [x0, #784] +str q7, [x0, #848] 
+str q9, [x0, #912] +str q8, [x0, #976] +ldr q18, [x0, #224] +ldr q1, [x0, #160] +ldr q10, [x0, #32] +ldr q21, [x17, #+128] +ldr q22, [x17, #+144] +sqrdmulh v15.4S, v10.4S, v22.s[0] +mul v10.4S, v10.4S,v21.s[0] +ldr q3, [x0, #48] +sqrdmulh v12.4S, v3.4S, v22.s[0] +mul v3.4S, v3.4S,v21.s[0] +ldr q4, [x17, #+160] +ldr q30, [x17, #+176] +ldr q29, [x0, #96] +sqrdmulh v28.4S, v29.4S, v30.s[0] +mul v29.4S, v29.4S,v4.s[0] +ldr q27, [x0, #112] +sqrdmulh v26.4S, v27.4S, v30.s[0] +mul v27.4S, v27.4S,v4.s[0] +ldr q25, [x17, #+192] +ldr q24, [x17, #+208] +mla v10.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v1.4S, v24.s[0] +ldr q20, [x0, #176] +mla v3.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v20.4S, v24.s[0] +ldr q6, [x17, #+224] +ldr q5, [x17, #+240] +mla v29.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v18.4S, v5.s[0] +ldr q16, [x0, #240] +mla v27.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v16.4S, v5.s[0] +ldr q11, [x0, #128] +ldr q2, [x0, #0] +mul v1.4S, v1.4S,v25.s[0] +sub v23.4s, v2.4s, v10.4s +mul v20.4S, v20.4S,v25.s[0] +add v2.4s, v2.4s, v10.4s +ldr q10, [x0, #144] +ldr q0, [x0, #16] +mla v1.4S, v15.4S, v31.s[0] +sub v15.4s, v0.4s, v3.4s +mla v20.4S, v12.4S, v31.s[0] +add v0.4s, v0.4s, v3.4s +ldr q3, [x0, #192] +ldr q12, [x0, #64] +mul v18.4S, v18.4S,v6.s[0] +sub v13.4s, v12.4s, v29.4s +mul v16.4S, v16.4S,v6.s[0] +add v12.4s, v12.4s, v29.4s +ldr q29, [x0, #208] +ldr q14, [x0, #80] +mla v18.4S, v28.4S, v31.s[0] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v14.4s, v27.4s +sqrdmulh v28.4S, v0.4S, v22.s[1] +mul v0.4S, v0.4S,v21.s[1] +add v14.4s, v14.4s, v27.4s +sqrdmulh v27.4S, v15.4S, v22.s[2] +sub v19.4s, v11.4s, v1.4s +mul v15.4S, v15.4S,v21.s[2] +add v11.4s, v11.4s, v1.4s +sqrdmulh v22.4S, v14.4S, v30.s[1] +sub v21.4s, v10.4s, v20.4s +mul v14.4S, v14.4S,v4.s[1] +add v10.4s, v10.4s, v20.4s +sqrdmulh v20.4S, v26.4S, v30.s[2] +sub v1.4s, v3.4s, v18.4s +mul v26.4S, v26.4S,v4.s[2] +add v3.4s, v3.4s, v18.4s +mla v0.4S, v28.4S, v31.s[0] +sub v28.4s, v29.4s, v16.4s +ldr q30, [x0, #480] +sqrdmulh 
v4.4S, v10.4S, v24.s[1] +add v29.4s, v29.4s, v16.4s +mla v15.4S, v27.4S, v31.s[0] +ldr q27, [x0, #416] +sqrdmulh v16.4S, v21.4S, v24.s[2] +mla v14.4S, v22.4S, v31.s[0] +ldr q22, [x0, #288] +sqrdmulh v18.4S, v29.4S, v5.s[1] +mla v26.4S, v20.4S, v31.s[0] +ldr q20, [x17, #+256] +sqrdmulh v17.4S, v28.4S, v5.s[2] +ldr q7, [x17, #+272] +mul v10.4S, v10.4S,v25.s[1] +sub v9.4s, v2.4s, v0.4s +str q9, [x0, #16] +mul v21.4S, v21.4S,v25.s[2] +add v2.4s, v2.4s, v0.4s +str q2, [x0, #0] +mla v10.4S, v4.4S, v31.s[0] +sub v4.4s, v23.4s, v15.4s +str q4, [x0, #48] +mla v21.4S, v16.4S, v31.s[0] +add v23.4s, v23.4s, v15.4s +str q23, [x0, #32] +mul v29.4S, v29.4S,v6.s[1] +sub v24.4s, v12.4s, v14.4s +str q24, [x0, #80] +mul v28.4S, v28.4S,v6.s[2] +add v12.4s, v12.4s, v14.4s +str q12, [x0, #64] +mla v29.4S, v18.4S, v31.s[0] +sub v18.4s, v13.4s, v26.4s +str q18, [x0, #112] +mla v28.4S, v17.4S, v31.s[0] +add v13.4s, v13.4s, v26.4s +str q13, [x0, #96] +sqrdmulh v5.4S, v22.4S, v7.s[0] +sub v6.4s, v11.4s, v10.4s +mul v22.4S, v22.4S,v20.s[0] +str q6, [x0, #144] +ldr q6, [x0, #304] +sqrdmulh v13.4S, v6.4S, v7.s[0] +add v11.4s, v11.4s, v10.4s +mul v6.4S, v6.4S,v20.s[0] +str q11, [x0, #128] +ldr q11, [x17, #+288] +ldr q10, [x17, #+304] +ldr q26, [x0, #352] +sqrdmulh v17.4S, v26.4S, v10.s[0] +sub v18.4s, v19.4s, v21.4s +mul v26.4S, v26.4S,v11.s[0] +str q18, [x0, #176] +ldr q18, [x0, #368] +sqrdmulh v12.4S, v18.4S, v10.s[0] +add v19.4s, v19.4s, v21.4s +mul v18.4S, v18.4S,v11.s[0] +str q19, [x0, #160] +ldr q19, [x17, #+320] +ldr q21, [x17, #+336] +mla v22.4S, v5.4S, v31.s[0] +sub v5.4s, v3.4s, v29.4s +sqrdmulh v14.4S, v27.4S, v21.s[0] +str q5, [x0, #208] +ldr q5, [x0, #432] +mla v6.4S, v13.4S, v31.s[0] +add v3.4s, v3.4s, v29.4s +sqrdmulh v29.4S, v5.4S, v21.s[0] +str q3, [x0, #192] +ldr q3, [x17, #+352] +ldr q13, [x17, #+368] +mla v26.4S, v17.4S, v31.s[0] +sub v17.4s, v1.4s, v28.4s +sqrdmulh v24.4S, v30.4S, v13.s[0] +str q17, [x0, #240] +ldr q17, [x0, #496] +mla v18.4S, v12.4S, v31.s[0] +add v1.4s, 
v1.4s, v28.4s +sqrdmulh v28.4S, v17.4S, v13.s[0] +str q1, [x0, #224] +ldr q1, [x0, #384] +ldr q12, [x0, #256] +mul v27.4S, v27.4S,v19.s[0] +sub v25.4s, v12.4s, v22.4s +mul v5.4S, v5.4S,v19.s[0] +add v12.4s, v12.4s, v22.4s +ldr q22, [x0, #400] +ldr q23, [x0, #272] +mla v27.4S, v14.4S, v31.s[0] +sub v14.4s, v23.4s, v6.4s +mla v5.4S, v29.4S, v31.s[0] +add v23.4s, v23.4s, v6.4s +ldr q6, [x0, #448] +ldr q29, [x0, #320] +mul v30.4S, v30.4S,v3.s[0] +sub v15.4s, v29.4s, v26.4s +mul v17.4S, v17.4S,v3.s[0] +add v29.4s, v29.4s, v26.4s +ldr q26, [x0, #464] +ldr q16, [x0, #336] +mla v30.4S, v24.4S, v31.s[0] +mla v17.4S, v28.4S, v31.s[0] +sub v28.4s, v16.4s, v18.4s +sqrdmulh v24.4S, v23.4S, v7.s[1] +mul v23.4S, v23.4S,v20.s[1] +add v16.4s, v16.4s, v18.4s +sqrdmulh v18.4S, v14.4S, v7.s[2] +sub v4.4s, v1.4s, v27.4s +mul v14.4S, v14.4S,v20.s[2] +add v1.4s, v1.4s, v27.4s +sqrdmulh v7.4S, v16.4S, v10.s[1] +sub v20.4s, v22.4s, v5.4s +mul v16.4S, v16.4S,v11.s[1] +add v22.4s, v22.4s, v5.4s +sqrdmulh v5.4S, v28.4S, v10.s[2] +sub v27.4s, v6.4s, v30.4s +mul v28.4S, v28.4S,v11.s[2] +add v6.4s, v6.4s, v30.4s +mla v23.4S, v24.4S, v31.s[0] +sub v24.4s, v26.4s, v17.4s +ldr q10, [x0, #736] +sqrdmulh v11.4S, v22.4S, v21.s[1] +add v26.4s, v26.4s, v17.4s +mla v14.4S, v18.4S, v31.s[0] +ldr q18, [x0, #672] +sqrdmulh v17.4S, v20.4S, v21.s[2] +mla v16.4S, v7.4S, v31.s[0] +ldr q7, [x0, #544] +sqrdmulh v30.4S, v26.4S, v13.s[1] +mla v28.4S, v5.4S, v31.s[0] +ldr q5, [x17, #+384] +sqrdmulh v2.4S, v24.4S, v13.s[2] +ldr q0, [x17, #+400] +mul v22.4S, v22.4S,v19.s[1] +sub v9.4s, v12.4s, v23.4s +str q9, [x0, #272] +mul v20.4S, v20.4S,v19.s[2] +add v12.4s, v12.4s, v23.4s +str q12, [x0, #256] +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v25.4s, v14.4s +str q11, [x0, #304] +mla v20.4S, v17.4S, v31.s[0] +add v25.4s, v25.4s, v14.4s +str q25, [x0, #288] +mul v26.4S, v26.4S,v3.s[1] +sub v21.4s, v29.4s, v16.4s +str q21, [x0, #336] +mul v24.4S, v24.4S,v3.s[2] +add v29.4s, v29.4s, v16.4s +str q29, [x0, #320] +mla v26.4S, 
v30.4S, v31.s[0] +sub v30.4s, v15.4s, v28.4s +str q30, [x0, #368] +mla v24.4S, v2.4S, v31.s[0] +add v15.4s, v15.4s, v28.4s +str q15, [x0, #352] +sqrdmulh v13.4S, v7.4S, v0.s[0] +sub v3.4s, v1.4s, v22.4s +mul v7.4S, v7.4S,v5.s[0] +str q3, [x0, #400] +ldr q3, [x0, #560] +sqrdmulh v15.4S, v3.4S, v0.s[0] +add v1.4s, v1.4s, v22.4s +mul v3.4S, v3.4S,v5.s[0] +str q1, [x0, #384] +ldr q1, [x17, #+416] +ldr q22, [x17, #+432] +ldr q28, [x0, #608] +sqrdmulh v2.4S, v28.4S, v22.s[0] +sub v30.4s, v4.4s, v20.4s +mul v28.4S, v28.4S,v1.s[0] +str q30, [x0, #432] +ldr q30, [x0, #624] +sqrdmulh v29.4S, v30.4S, v22.s[0] +add v4.4s, v4.4s, v20.4s +mul v30.4S, v30.4S,v1.s[0] +str q4, [x0, #416] +ldr q4, [x17, #+448] +ldr q20, [x17, #+464] +mla v7.4S, v13.4S, v31.s[0] +sub v13.4s, v6.4s, v26.4s +sqrdmulh v16.4S, v18.4S, v20.s[0] +str q13, [x0, #464] +ldr q13, [x0, #688] +mla v3.4S, v15.4S, v31.s[0] +add v6.4s, v6.4s, v26.4s +sqrdmulh v26.4S, v13.4S, v20.s[0] +str q6, [x0, #448] +ldr q6, [x17, #+480] +ldr q15, [x17, #+496] +mla v28.4S, v2.4S, v31.s[0] +sub v2.4s, v27.4s, v24.4s +sqrdmulh v21.4S, v10.4S, v15.s[0] +str q2, [x0, #496] +ldr q2, [x0, #752] +mla v30.4S, v29.4S, v31.s[0] +add v27.4s, v27.4s, v24.4s +sqrdmulh v24.4S, v2.4S, v15.s[0] +str q27, [x0, #480] +ldr q27, [x0, #640] +ldr q29, [x0, #512] +mul v18.4S, v18.4S,v4.s[0] +sub v19.4s, v29.4s, v7.4s +mul v13.4S, v13.4S,v4.s[0] +add v29.4s, v29.4s, v7.4s +ldr q7, [x0, #656] +ldr q25, [x0, #528] +mla v18.4S, v16.4S, v31.s[0] +sub v16.4s, v25.4s, v3.4s +mla v13.4S, v26.4S, v31.s[0] +add v25.4s, v25.4s, v3.4s +ldr q3, [x0, #704] +ldr q26, [x0, #576] +mul v10.4S, v10.4S,v6.s[0] +sub v14.4s, v26.4s, v28.4s +mul v2.4S, v2.4S,v6.s[0] +add v26.4s, v26.4s, v28.4s +ldr q28, [x0, #720] +ldr q17, [x0, #592] +mla v10.4S, v21.4S, v31.s[0] +mla v2.4S, v24.4S, v31.s[0] +sub v24.4s, v17.4s, v30.4s +sqrdmulh v21.4S, v25.4S, v0.s[1] +mul v25.4S, v25.4S,v5.s[1] +add v17.4s, v17.4s, v30.4s +sqrdmulh v30.4S, v16.4S, v0.s[2] +sub v11.4s, v27.4s, v18.4s 
+mul v16.4S, v16.4S,v5.s[2] +add v27.4s, v27.4s, v18.4s +sqrdmulh v0.4S, v17.4S, v22.s[1] +sub v5.4s, v7.4s, v13.4s +mul v17.4S, v17.4S,v1.s[1] +add v7.4s, v7.4s, v13.4s +sqrdmulh v13.4S, v24.4S, v22.s[2] +sub v18.4s, v3.4s, v10.4s +mul v24.4S, v24.4S,v1.s[2] +add v3.4s, v3.4s, v10.4s +mla v25.4S, v21.4S, v31.s[0] +sub v21.4s, v28.4s, v2.4s +ldr q22, [x0, #992] +sqrdmulh v1.4S, v7.4S, v20.s[1] +add v28.4s, v28.4s, v2.4s +mla v16.4S, v30.4S, v31.s[0] +ldr q30, [x0, #928] +sqrdmulh v2.4S, v5.4S, v20.s[2] +mla v17.4S, v0.4S, v31.s[0] +ldr q0, [x0, #800] +sqrdmulh v10.4S, v28.4S, v15.s[1] +mla v24.4S, v13.4S, v31.s[0] +ldr q13, [x17, #+512] +sqrdmulh v12.4S, v21.4S, v15.s[2] +ldr q23, [x17, #+528] +mul v7.4S, v7.4S,v4.s[1] +sub v9.4s, v29.4s, v25.4s +str q9, [x0, #528] +mul v5.4S, v5.4S,v4.s[2] +add v29.4s, v29.4s, v25.4s +str q29, [x0, #512] +mla v7.4S, v1.4S, v31.s[0] +sub v1.4s, v19.4s, v16.4s +str q1, [x0, #560] +mla v5.4S, v2.4S, v31.s[0] +add v19.4s, v19.4s, v16.4s +str q19, [x0, #544] +mul v28.4S, v28.4S,v6.s[1] +sub v20.4s, v26.4s, v17.4s +str q20, [x0, #592] +mul v21.4S, v21.4S,v6.s[2] +add v26.4s, v26.4s, v17.4s +str q26, [x0, #576] +mla v28.4S, v10.4S, v31.s[0] +sub v10.4s, v14.4s, v24.4s +str q10, [x0, #624] +mla v21.4S, v12.4S, v31.s[0] +add v14.4s, v14.4s, v24.4s +str q14, [x0, #608] +sqrdmulh v15.4S, v0.4S, v23.s[0] +sub v6.4s, v27.4s, v7.4s +mul v0.4S, v0.4S,v13.s[0] +str q6, [x0, #656] +ldr q6, [x0, #816] +sqrdmulh v14.4S, v6.4S, v23.s[0] +add v27.4s, v27.4s, v7.4s +mul v6.4S, v6.4S,v13.s[0] +str q27, [x0, #640] +ldr q27, [x17, #+544] +ldr q7, [x17, #+560] +ldr q24, [x0, #864] +sqrdmulh v12.4S, v24.4S, v7.s[0] +sub v10.4s, v11.4s, v5.4s +mul v24.4S, v24.4S,v27.s[0] +str q10, [x0, #688] +ldr q10, [x0, #880] +sqrdmulh v26.4S, v10.4S, v7.s[0] +add v11.4s, v11.4s, v5.4s +mul v10.4S, v10.4S,v27.s[0] +str q11, [x0, #672] +ldr q11, [x17, #+576] +ldr q5, [x17, #+592] +mla v0.4S, v15.4S, v31.s[0] +sub v15.4s, v3.4s, v28.4s +sqrdmulh v17.4S, v30.4S, v5.s[0] +str 
q15, [x0, #720] +ldr q15, [x0, #944] +mla v6.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v28.4s +sqrdmulh v28.4S, v15.4S, v5.s[0] +str q3, [x0, #704] +ldr q3, [x17, #+608] +ldr q14, [x17, #+624] +mla v24.4S, v12.4S, v31.s[0] +sub v12.4s, v18.4s, v21.4s +sqrdmulh v20.4S, v22.4S, v14.s[0] +str q12, [x0, #752] +ldr q12, [x0, #1008] +mla v10.4S, v26.4S, v31.s[0] +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v12.4S, v14.s[0] +str q18, [x0, #736] +ldr q18, [x0, #896] +ldr q26, [x0, #768] +mul v30.4S, v30.4S,v11.s[0] +sub v4.4s, v26.4s, v0.4s +mul v15.4S, v15.4S,v11.s[0] +add v26.4s, v26.4s, v0.4s +ldr q0, [x0, #912] +ldr q19, [x0, #784] +mla v30.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v6.4s +mla v15.4S, v28.4S, v31.s[0] +add v19.4s, v19.4s, v6.4s +ldr q6, [x0, #960] +ldr q28, [x0, #832] +mul v22.4S, v22.4S,v3.s[0] +sub v16.4s, v28.4s, v24.4s +mul v12.4S, v12.4S,v3.s[0] +add v28.4s, v28.4s, v24.4s +ldr q24, [x0, #976] +ldr q2, [x0, #848] +mla v22.4S, v20.4S, v31.s[0] +mla v12.4S, v21.4S, v31.s[0] +sub v21.4s, v2.4s, v10.4s +sqrdmulh v20.4S, v19.4S, v23.s[1] +mul v19.4S, v19.4S,v13.s[1] +add v2.4s, v2.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v23.s[2] +sub v1.4s, v18.4s, v30.4s +mul v17.4S, v17.4S,v13.s[2] +add v18.4s, v18.4s, v30.4s +sqrdmulh v23.4S, v2.4S, v7.s[1] +sub v13.4s, v0.4s, v15.4s +mul v2.4S, v2.4S,v27.s[1] +add v0.4s, v0.4s, v15.4s +sqrdmulh v15.4S, v21.4S, v7.s[2] +sub v30.4s, v6.4s, v22.4s +mul v21.4S, v21.4S,v27.s[2] +add v6.4s, v6.4s, v22.4s +mla v19.4S, v20.4S, v31.s[0] +sub v20.4s, v24.4s, v12.4s +sqrdmulh v7.4S, v0.4S, v5.s[1] +add v24.4s, v24.4s, v12.4s +mla v17.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v13.4S, v5.s[2] +mla v2.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v24.4S, v14.s[1] +mla v21.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v20.4S, v14.s[2] +mul v0.4S, v0.4S,v11.s[1] +sub v12.4s, v26.4s, v19.4s +str q12, [x0, #784] +mul v13.4S, v13.4S,v11.s[2] +add v26.4s, v26.4s, v19.4s +str q26, [x0, #768] +mla v0.4S, v7.4S, v31.s[0] +sub v7.4s, v4.4s, v17.4s +str q7, 
[x0, #816] +mla v13.4S, v10.4S, v31.s[0] +add v4.4s, v4.4s, v17.4s +str q4, [x0, #800] +mul v24.4S, v24.4S,v3.s[1] +sub v5.4s, v28.4s, v2.4s +str q5, [x0, #848] +mul v20.4S, v20.4S,v3.s[2] +add v28.4s, v28.4s, v2.4s +str q28, [x0, #832] +mla v24.4S, v23.4S, v31.s[0] +sub v23.4s, v16.4s, v21.4s +str q23, [x0, #880] +mla v20.4S, v15.4S, v31.s[0] +add v16.4s, v16.4s, v21.4s +str q16, [x0, #864] +sub v14.4s, v18.4s, v0.4s +str q14, [x0, #912] +add v18.4s, v18.4s, v0.4s +str q18, [x0, #896] +sub v18.4s, v1.4s, v13.4s +str q18, [x0, #944] +add v1.4s, v1.4s, v13.4s +str q1, [x0, #928] +sub v1.4s, v6.4s, v24.4s +str q1, [x0, #976] +add v6.4s, v6.4s, v24.4s +str q6, [x0, #960] +sub v6.4s, v30.4s, v20.4s +str q6, [x0, #1008] +add v30.4s, v30.4s, v20.4s +str q30, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_6.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_6.s new file mode 100644 index 0000000..d7c7c98 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_6.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom 
the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +// NOTE(review): the #include below carries no header name; it was presumably +// lost when this text was extracted (angle-bracket content stripped). Restore +// it from the upstream file. ASM_LOAD, used by the routine below, is not +// defined in the visible part of this file and presumably comes from it. +#include +// modulus: four 32-bit words. Word 0 is the negated modulus (-33556993); +// words 1-3 are zero. The routine below loads this quad into q31 and uses +// lane v31.s[0] as the multiplier in its mla instructions. +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +// roots_merged: constant table read by the routine below through x17 in +// 16-byte groups (ldr q<n>, [x17, #+off]). Groups come in pairs covering the +// same layer/block positions (see the per-entry comments): in the visible part +// of the routine, lanes of the first group of a pair are used as mul operands +// and the matching lanes of the second group as sqrdmulh operands. Entries +// tagged 'Layer None' are zero filler. +// NOTE(review): the second group of each pair is presumably the first one +// pre-scaled for the sqrdmulh/mul/mla reduction sequence; confirm against the +// generator. +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5
+.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6
+.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 //
Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global ntt_u32_incomplete_neon_asm_var_4_2_7_z4_6 +.global _ntt_u32_incomplete_neon_asm_var_4_2_7_z4_6 +ntt_u32_incomplete_neon_asm_var_4_2_7_z4_6: +_ntt_u32_incomplete_neon_asm_var_4_2_7_z4_6: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #928] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #992] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q18, [x0, #800] +sqrdmulh v17.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +ldr q16, [x0, #864] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v22.4S, v21.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +mla v18.4S,
v17.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +ldr q3, [x0, #544] +sqrdmulh v17.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q19, [x0, #608] +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q2, [x0, #672] +ldr q1, [x0, #416] +sqrdmulh v0.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v15.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #736] +ldr q14, [x0, #480] +sqrdmulh v13.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v12.4s, v14.4s, v20.4s +add v14.4s, v14.4s, v20.4s +ldr q20, [x0, #288] +mla v3.4S, v17.4S, v31.s[0] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v18.4s +mla v2.4S, v0.4S, v31.s[0] +mla v22.4S, v13.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +ldr q18, [x0, #352] +sqrdmulh v13.4S, v1.4S, v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v0.4s, v18.4s, v16.4s +sqrdmulh v17.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +add v18.4s, v18.4s, v16.4s +ldr q16, [x0, #32] +sqrdmulh v11.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v10.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #96] +sqrdmulh v9.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v8.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #160] +mla v1.4S, v13.4S, v31.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v2.4s +mla v20.4S, v11.4S, v31.s[0] +mla v18.4S, v9.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +ldr q2, [x0, #224] +sqrdmulh v9.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v11.4s, v2.4s, v22.4s +sqrdmulh v13.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v7.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +sub v6.4s, v2.4s, v14.4s +add v2.4s, v2.4s, v14.4s +mla v15.4S, v9.4S, v31.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v20.4s +mla v21.4S, v22.4S, v31.s[0] +mla v0.4S, v1.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh 
v20.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +sub v1.4s, v3.4s, v18.4s +sqrdmulh v22.4S, v6.4S, v27.s[1] +mul v6.4S, v6.4S,v28.s[1] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v27.s[0] +mul v19.4S, v19.4S,v28.s[0] +sub v9.4s, v17.4s, v15.4s +add v17.4s, v17.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v14.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +mla v7.4S, v20.4S, v31.s[0] +mla v6.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v21.4s +mla v19.4S, v18.4S, v31.s[0] +mla v2.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v15.4s, v8.4s, v0.4s +sqrdmulh v18.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +add v8.4s, v8.4s, v0.4s +sqrdmulh v0.4S, v9.4S, v27.s[3] +mul v9.4S, v9.4S,v28.s[3] +sub v20.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[3] +mul v14.4S, v14.4S,v28.s[3] +sub v12.4s, v1.4s, v6.4s +add v1.4s, v1.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v16.4s, v19.4s +mla v9.4S, v0.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v25.s[2] +mul v1.4S, v1.4S,v26.s[2] +sub v7.4s, v3.4s, v2.4s +sqrdmulh v0.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v7.4S, v25.s[1] +mul v7.4S, v7.4S,v26.s[1] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v25.s[0] +mul v3.4S, v3.4S,v26.s[0] +sub v6.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v1.4S, v19.4S, v31.s[0] +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v9.4s +mla v7.4S, v2.4S, v31.s[0] +mla v3.4S, v17.4S, v31.s[0] +add v22.4s, v22.4s, v9.4s +sqrdmulh v9.4S, v8.4S, v23.s[0] +mul v8.4S, v8.4S,v24.s[0] +sub v17.4s, v15.4s, v14.4s +sqrdmulh v2.4S, v6.4S, v23.s[1] +mul v6.4S, v6.4S,v24.s[1] +add v15.4s, v15.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v23.s[2] +mul v15.4S, v15.4S,v24.s[2] +sub v19.4s, v13.4s, v1.4s +add 
v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +sub v11.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +mla v8.4S, v9.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v18.4s, v7.4s +str q13, [x0, #288] +mla v15.4S, v14.4S, v31.s[0] +mla v17.4S, v1.4S, v31.s[0] +add v18.4s, v18.4s, v7.4s +str q19, [x0, #352] +ldr q19, [x0, #944] +sqrdmulh v7.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v1.4s, v16.4s, v3.4s +str q20, [x0, #416] +ldr q20, [x0, #1008] +sqrdmulh v14.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v3.4s +str q11, [x0, #480] +ldr q11, [x0, #816] +sqrdmulh v3.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +sub v13.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +ldr q8, [x0, #880] +sqrdmulh v9.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v12.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +mla v19.4S, v7.4S, v31.s[0] +mla v20.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v15.4s +str q18, [x0, #160] +mla v11.4S, v3.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +str q2, [x0, #224] +ldr q2, [x0, #560] +sqrdmulh v15.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v9.4s, v0.4s, v17.4s +str q16, [x0, #32] +ldr q16, [x0, #624] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +add v0.4s, v0.4s, v17.4s +str q1, [x0, #96] +ldr q1, [x0, #688] +ldr q17, [x0, #432] +sqrdmulh v18.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +sub v7.4s, v17.4s, v19.4s +add v17.4s, v17.4s, v19.4s +ldr q19, [x0, #752] +ldr q6, [x0, #496] +sqrdmulh v5.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v4.4s, v6.4s, v20.4s +add v6.4s, v6.4s, v20.4s +ldr q20, [x0, #304] +mla v2.4S, v15.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v11.4s +str q10, [x0, #544] +mla v1.4S, v18.4S, v31.s[0] +mla v19.4S, v5.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q13, [x0, #608] +ldr q13, [x0, #368] +sqrdmulh v11.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub 
v5.4s, v13.4s, v8.4s +str q21, [x0, #672] +sqrdmulh v21.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +add v13.4s, v13.4s, v8.4s +str q12, [x0, #736] +ldr q12, [x0, #48] +sqrdmulh v8.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v18.4s, v12.4s, v2.4s +add v12.4s, v12.4s, v2.4s +ldr q2, [x0, #112] +sqrdmulh v10.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v15.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +ldr q16, [x0, #176] +mla v17.4S, v11.4S, v31.s[0] +mla v6.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v1.4s +str q22, [x0, #800] +mla v20.4S, v8.4S, v31.s[0] +mla v13.4S, v10.4S, v31.s[0] +add v16.4s, v16.4s, v1.4s +str q14, [x0, #864] +ldr q14, [x0, #240] +sqrdmulh v1.4S, v7.4S, v29.s[2] +mul v7.4S, v7.4S,v30.s[2] +sub v10.4s, v14.4s, v19.4s +str q0, [x0, #928] +sqrdmulh v0.4S, v4.4S, v29.s[2] +mul v4.4S, v4.4S,v30.s[2] +add v14.4s, v14.4s, v19.4s +str q9, [x0, #992] +sqrdmulh v9.4S, v3.4S, v29.s[2] +mul v3.4S, v3.4S,v30.s[2] +sub v19.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v29.s[2] +mul v5.4S, v5.4S,v30.s[2] +sub v8.4s, v14.4s, v6.4s +add v14.4s, v14.4s, v6.4s +mla v7.4S, v1.4S, v31.s[0] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v20.4s +mla v3.4S, v9.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +sub v17.4s, v2.4s, v13.4s +sqrdmulh v9.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +add v2.4s, v2.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v27.s[0] +mul v16.4S, v16.4S,v28.s[0] +sub v1.4s, v21.4s, v7.4s +add v21.4s, v21.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v6.4s, v10.4s, v4.4s +add v10.4s, v10.4s, v4.4s +mla v19.4S, v20.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v3.4s +mla v16.4S, v13.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +sub v7.4s, v15.4s, v5.4s +sqrdmulh v13.4S, v10.4S, v27.s[2] 
+mul v10.4S, v10.4S,v28.s[2] +add v15.4s, v15.4s, v5.4s +sqrdmulh v5.4S, v1.4S, v27.s[3] +mul v1.4S, v1.4S,v28.s[3] +sub v20.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[3] +mul v6.4S, v6.4S,v28.s[3] +sub v4.4s, v17.4s, v8.4s +add v17.4s, v17.4s, v8.4s +mla v21.4S, v3.4S, v31.s[0] +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v16.4s +mla v1.4S, v5.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v25.s[2] +mul v17.4S, v17.4S,v26.s[2] +sub v19.4s, v2.4s, v14.4s +sqrdmulh v5.4S, v4.4S, v25.s[3] +mul v4.4S, v4.4S,v26.s[3] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v3.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +mla v17.4S, v16.4S, v31.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v9.4s, v1.4s +mla v19.4S, v14.4S, v31.s[0] +mla v2.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v23.s[0] +mul v15.4S, v15.4S,v24.s[0] +sub v21.4s, v7.4s, v6.4s +sqrdmulh v14.4S, v8.4S, v23.s[1] +mul v8.4S, v8.4S,v24.s[1] +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v7.4S, v23.s[2] +mul v7.4S, v7.4S,v24.s[2] +sub v16.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v23.s[3] +mul v21.4S, v21.4S,v24.s[3] +sub v10.4s, v20.4s, v4.4s +add v20.4s, v20.4s, v4.4s +mla v15.4S, v1.4S, v31.s[0] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v19.4s +str q0, [x0, #304] +mla v7.4S, v6.4S, v31.s[0] +mla v21.4S, v17.4S, v31.s[0] +add v13.4s, v13.4s, v19.4s +str q16, [x0, #368] +ldr q16, [x0, #896] +sqrdmulh v19.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v17.4s, v12.4s, v2.4s +str q20, [x0, #432] +ldr q20, [x0, #960] +sqrdmulh v6.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v12.4s, v12.4s, v2.4s +str q10, [x0, #496] +ldr q10, [x0, #768] +sqrdmulh v2.4S, v10.4S, v29.s[0] +mul v10.4S, 
v10.4S,v30.s[0] +sub v0.4s, v18.4s, v15.4s +add v18.4s, v18.4s, v15.4s +ldr q15, [x0, #832] +sqrdmulh v1.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +sub v4.4s, v3.4s, v8.4s +add v3.4s, v3.4s, v8.4s +mla v16.4S, v19.4S, v31.s[0] +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v7.4s +str q13, [x0, #176] +mla v10.4S, v2.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v7.4s +str q14, [x0, #240] +ldr q14, [x0, #512] +sqrdmulh v7.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v1.4s, v5.4s, v21.4s +str q12, [x0, #48] +ldr q12, [x0, #576] +sqrdmulh v2.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +add v5.4s, v5.4s, v21.4s +str q17, [x0, #112] +ldr q17, [x0, #640] +ldr q21, [x0, #384] +sqrdmulh v13.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] +sub v19.4s, v21.4s, v16.4s +add v21.4s, v21.4s, v16.4s +ldr q16, [x0, #704] +ldr q8, [x0, #448] +sqrdmulh v22.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v11.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +ldr q20, [x0, #256] +mla v14.4S, v7.4S, v31.s[0] +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v10.4s +str q18, [x0, #560] +mla v17.4S, v13.4S, v31.s[0] +mla v16.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +str q0, [x0, #624] +ldr q0, [x0, #320] +sqrdmulh v10.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v22.4s, v0.4s, v15.4s +str q3, [x0, #688] +sqrdmulh v3.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +add v0.4s, v0.4s, v15.4s +str q4, [x0, #752] +ldr q4, [x0, #0] +sqrdmulh v15.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v13.4s, v4.4s, v14.4s +add v4.4s, v4.4s, v14.4s +ldr q14, [x0, #64] +sqrdmulh v18.4S, v0.4S, v29.s[1] +mul v0.4S, v0.4S,v30.s[1] +sub v7.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +ldr q12, [x0, #128] +mla v21.4S, v10.4S, v31.s[0] +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v12.4s, v17.4s +str q9, [x0, #816] +mla v20.4S, v15.4S, v31.s[0] +mla v0.4S, v18.4S, v31.s[0] +add v12.4s, v12.4s, v17.4s +str q6, [x0, #880] +ldr q6, [x0, #192] 
+sqrdmulh v17.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v18.4s, v6.4s, v16.4s +str q5, [x0, #944] +sqrdmulh v5.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +add v6.4s, v6.4s, v16.4s +str q1, [x0, #1008] +sqrdmulh v1.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v16.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v15.4s, v6.4s, v8.4s +add v6.4s, v6.4s, v8.4s +mla v19.4S, v17.4S, v31.s[0] +mla v11.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v20.4s +mla v2.4S, v1.4S, v31.s[0] +mla v22.4S, v21.4S, v31.s[0] +add v4.4s, v4.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +sub v21.4s, v14.4s, v0.4s +sqrdmulh v1.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v17.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[0] +mul v6.4S, v6.4S,v28.s[0] +sub v8.4s, v18.4s, v11.4s +add v18.4s, v18.4s, v11.4s +mla v16.4S, v20.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v2.4s +mla v12.4S, v0.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v27.s[2] +mul v3.4S, v3.4S,v28.s[2] +sub v19.4s, v7.4s, v22.4s +sqrdmulh v0.4S, v18.4S, v27.s[2] +mul v18.4S, v18.4S,v28.s[2] +add v7.4s, v7.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +sub v20.4s, v5.4s, v16.4s +add v5.4s, v5.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v11.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +mla v3.4S, v2.4S, v31.s[0] +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v4.4s, v12.4s +mla v17.4S, v22.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v4.4s, v4.4s, v12.4s +sqrdmulh v12.4S, v21.4S, v25.s[2] +mul v21.4S, v21.4S,v26.s[2] +sub v16.4s, v14.4s, v6.4s +sqrdmulh v22.4S, v11.4S, v25.s[3] +mul v11.4S, v11.4S,v26.s[3] +add v14.4s, v14.4s, v6.4s +sqrdmulh v6.4S, v16.4S, v25.s[1] 
+mul v16.4S, v16.4S,v26.s[1] +sub v2.4s, v13.4s, v3.4s +add v13.4s, v13.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v25.s[0] +mul v14.4S, v14.4S,v26.s[0] +sub v15.4s, v7.4s, v18.4s +add v7.4s, v7.4s, v18.4s +mla v21.4S, v12.4S, v31.s[0] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v17.4s +mla v16.4S, v6.4S, v31.s[0] +mla v14.4S, v3.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v7.4S, v23.s[0] +mul v7.4S, v7.4S,v24.s[0] +sub v3.4s, v19.4s, v8.4s +sqrdmulh v6.4S, v15.4S, v23.s[1] +mul v15.4S, v15.4S,v24.s[1] +add v19.4s, v19.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v23.s[2] +mul v19.4S, v19.4S,v24.s[2] +sub v12.4s, v5.4s, v21.4s +add v5.4s, v5.4s, v21.4s +sqrdmulh v21.4S, v3.4S, v23.s[3] +mul v3.4S, v3.4S,v24.s[3] +sub v18.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +mla v7.4S, v17.4S, v31.s[0] +mla v15.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v16.4s +str q5, [x0, #256] +mla v19.4S, v8.4S, v31.s[0] +mla v3.4S, v21.4S, v31.s[0] +add v0.4s, v0.4s, v16.4s +str q12, [x0, #320] +ldr q12, [x0, #912] +sqrdmulh v16.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v21.4s, v4.4s, v14.4s +str q20, [x0, #384] +ldr q20, [x0, #976] +sqrdmulh v8.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v4.4s, v4.4s, v14.4s +str q18, [x0, #448] +ldr q18, [x0, #784] +sqrdmulh v14.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +sub v5.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +ldr q7, [x0, #848] +sqrdmulh v17.4S, v7.4S, v29.s[0] +mul v7.4S, v7.4S,v30.s[0] +sub v11.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +mla v12.4S, v16.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v1.4s, v19.4s +str q0, [x0, #128] +mla v18.4S, v14.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v19.4s +str q6, [x0, #192] +ldr q6, [x0, #528] +sqrdmulh v19.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v17.4s, v22.4s, v3.4s +str q4, [x0, #0] +ldr q4, [x0, #592] +sqrdmulh v14.4S, v4.4S, v29.s[0] +mul v4.4S, v4.4S,v30.s[0] +add v22.4s, v22.4s, v3.4s +str q21, [x0, #64] +ldr q21, 
[x0, #656] +ldr q3, [x0, #400] +sqrdmulh v0.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v16.4s, v3.4s, v12.4s +add v3.4s, v3.4s, v12.4s +ldr q12, [x0, #720] +ldr q15, [x0, #464] +sqrdmulh v9.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v10.4s, v15.4s, v20.4s +add v15.4s, v15.4s, v20.4s +ldr q20, [x0, #272] +mla v6.4S, v19.4S, v31.s[0] +mla v4.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v18.4s +str q13, [x0, #512] +mla v21.4S, v0.4S, v31.s[0] +mla v12.4S, v9.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +str q5, [x0, #576] +ldr q5, [x0, #336] +sqrdmulh v18.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v9.4s, v5.4s, v7.4s +str q2, [x0, #640] +sqrdmulh v2.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +add v5.4s, v5.4s, v7.4s +str q11, [x0, #704] +ldr q11, [x0, #16] +sqrdmulh v7.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v0.4s, v11.4s, v6.4s +add v11.4s, v11.4s, v6.4s +ldr q6, [x0, #80] +sqrdmulh v13.4S, v5.4S, v29.s[1] +mul v5.4S, v5.4S,v30.s[1] +sub v19.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +ldr q4, [x0, #144] +mla v3.4S, v18.4S, v31.s[0] +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v4.4s, v21.4s +str q1, [x0, #768] +mla v20.4S, v7.4S, v31.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v4.4s, v4.4s, v21.4s +str q8, [x0, #832] +ldr q8, [x0, #208] +sqrdmulh v21.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +sub v13.4s, v8.4s, v12.4s +str q22, [x0, #896] +sqrdmulh v22.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +add v8.4s, v8.4s, v12.4s +str q17, [x0, #960] +sqrdmulh v17.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v12.4s, v4.4s, v3.4s +add v4.4s, v4.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v7.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +mla v16.4S, v21.4S, v31.s[0] +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v20.4s +mla v14.4S, v17.4S, v31.s[0] +mla v9.4S, v3.4S, v31.s[0] +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v12.4S, v27.s[1] +mul v12.4S, v12.4S,v28.s[1] +sub v3.4s, v6.4s, 
v5.4s +sqrdmulh v17.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v27.s[0] +mul v4.4S, v4.4S,v28.s[0] +sub v21.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[0] +mul v8.4S, v8.4S,v28.s[0] +sub v15.4s, v13.4s, v10.4s +add v13.4s, v13.4s, v10.4s +mla v12.4S, v20.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v14.4s +mla v4.4S, v5.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v16.4s, v19.4s, v9.4s +sqrdmulh v5.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v10.4s, v3.4s, v7.4s +add v3.4s, v3.4s, v7.4s +mla v2.4S, v14.4S, v31.s[0] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v4.4s +mla v21.4S, v9.4S, v31.s[0] +mla v15.4S, v12.4S, v31.s[0] +add v11.4s, v11.4s, v4.4s +sqrdmulh v4.4S, v3.4S, v25.s[2] +mul v3.4S, v3.4S,v26.s[2] +sub v12.4s, v6.4s, v8.4s +sqrdmulh v9.4S, v10.4S, v25.s[3] +mul v10.4S, v10.4S,v26.s[3] +add v6.4s, v6.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +sub v14.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v6.4S, v25.s[0] +mul v6.4S, v6.4S,v26.s[0] +sub v7.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +mla v3.4S, v4.4S, v31.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v21.4s +mla v12.4S, v8.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v23.s[0] +mul v19.4S, v19.4S,v24.s[0] +sub v2.4s, v16.4s, v15.4s +sqrdmulh v8.4S, v7.4S, v23.s[1] +mul v7.4S, v7.4S,v24.s[1] +add v16.4s, v16.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +sub v4.4s, v22.4s, v3.4s +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v2.4S, v23.s[3] +mul v2.4S, 
v2.4S,v24.s[3] +sub v13.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +mla v19.4S, v21.4S, v31.s[0] +mla v7.4S, v8.4S, v31.s[0] +sub v8.4s, v5.4s, v12.4s +str q22, [x0, #272] +mla v16.4S, v15.4S, v31.s[0] +mla v2.4S, v3.4S, v31.s[0] +add v5.4s, v5.4s, v12.4s +str q4, [x0, #336] +sub v23.4s, v11.4s, v6.4s +str q20, [x0, #400] +add v11.4s, v11.4s, v6.4s +str q13, [x0, #464] +sub v13.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sub v19.4s, v14.4s, v7.4s +add v14.4s, v14.4s, v7.4s +sub v7.4s, v17.4s, v16.4s +str q5, [x0, #144] +add v17.4s, v17.4s, v16.4s +str q8, [x0, #208] +sub v8.4s, v9.4s, v2.4s +str q11, [x0, #16] +add v9.4s, v9.4s, v2.4s +str q23, [x0, #80] +str q0, [x0, #528] +str q13, [x0, #592] +str q14, [x0, #656] +str q19, [x0, #720] +str q17, [x0, #784] +str q7, [x0, #848] +str q9, [x0, #912] +str q8, [x0, #976] +ldr q18, [x0, #224] +ldr q1, [x0, #160] +ldr q10, [x0, #32] +ldr q21, [x17, #+128] +ldr q22, [x17, #+144] +sqrdmulh v15.4S, v10.4S, v22.s[0] +mul v10.4S, v10.4S,v21.s[0] +ldr q3, [x0, #48] +sqrdmulh v12.4S, v3.4S, v22.s[0] +mul v3.4S, v3.4S,v21.s[0] +ldr q4, [x17, #+160] +ldr q30, [x17, #+176] +ldr q29, [x0, #96] +sqrdmulh v28.4S, v29.4S, v30.s[0] +mul v29.4S, v29.4S,v4.s[0] +ldr q27, [x0, #112] +sqrdmulh v26.4S, v27.4S, v30.s[0] +mul v27.4S, v27.4S,v4.s[0] +ldr q25, [x17, #+192] +ldr q24, [x17, #+208] +mla v10.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v1.4S, v24.s[0] +ldr q20, [x0, #176] +mla v3.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v20.4S, v24.s[0] +ldr q6, [x17, #+224] +ldr q5, [x17, #+240] +mla v29.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v18.4S, v5.s[0] +ldr q16, [x0, #240] +mla v27.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v16.4S, v5.s[0] +ldr q11, [x0, #0] +ldr q2, [x0, #128] +mul v1.4S, v1.4S,v25.s[0] +sub v23.4s, v11.4s, v10.4s +ldr q0, [x0, #16] +mul v20.4S, v20.4S,v25.s[0] +add v11.4s, v11.4s, v10.4s +ldr q10, [x0, #144] +mla v1.4S, v15.4S, v31.s[0] +sub v15.4s, v0.4s, v3.4s +ldr q13, [x0, #64] +mla v20.4S, v12.4S, v31.s[0] +add v0.4s, v0.4s, 
v3.4s +ldr q3, [x0, #192] +mul v18.4S, v18.4S,v6.s[0] +sub v12.4s, v13.4s, v29.4s +ldr q14, [x0, #80] +mul v16.4S, v16.4S,v6.s[0] +add v13.4s, v13.4s, v29.4s +ldr q29, [x0, #208] +mla v18.4S, v28.4S, v31.s[0] +sub v28.4s, v14.4s, v27.4s +mla v16.4S, v26.4S, v31.s[0] +add v14.4s, v14.4s, v27.4s +sqrdmulh v27.4S, v0.4S, v22.s[1] +mul v0.4S, v0.4S,v21.s[1] +sqrdmulh v26.4S, v15.4S, v22.s[2] +sub v19.4s, v2.4s, v1.4s +mul v15.4S, v15.4S,v21.s[2] +add v2.4s, v2.4s, v1.4s +sqrdmulh v22.4S, v14.4S, v30.s[1] +sub v21.4s, v10.4s, v20.4s +mul v14.4S, v14.4S,v4.s[1] +add v10.4s, v10.4s, v20.4s +sqrdmulh v20.4S, v28.4S, v30.s[2] +sub v1.4s, v3.4s, v18.4s +mul v28.4S, v28.4S,v4.s[2] +add v3.4s, v3.4s, v18.4s +mla v0.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v16.4s +ldr q30, [x0, #480] +sqrdmulh v4.4S, v10.4S, v24.s[1] +add v29.4s, v29.4s, v16.4s +mla v15.4S, v26.4S, v31.s[0] +ldr q26, [x0, #416] +sqrdmulh v16.4S, v21.4S, v24.s[2] +sub v18.4s, v11.4s, v0.4s +mla v14.4S, v22.4S, v31.s[0] +ldr q22, [x0, #288] +sqrdmulh v17.4S, v29.4S, v5.s[1] +add v11.4s, v11.4s, v0.4s +str q18, [x0, #16] +mla v28.4S, v20.4S, v31.s[0] +ldr q20, [x17, #+256] +ldr q18, [x17, #+272] +sqrdmulh v0.4S, v27.4S, v5.s[2] +sub v7.4s, v23.4s, v15.4s +str q11, [x0, #0] +mul v10.4S, v10.4S,v25.s[1] +add v23.4s, v23.4s, v15.4s +mul v21.4S, v21.4S,v25.s[2] +str q7, [x0, #48] +mla v10.4S, v4.4S, v31.s[0] +sub v4.4s, v13.4s, v14.4s +mla v21.4S, v16.4S, v31.s[0] +str q23, [x0, #32] +mul v29.4S, v29.4S,v6.s[1] +str q4, [x0, #80] +mul v27.4S, v27.4S,v6.s[2] +add v13.4s, v13.4s, v14.4s +str q13, [x0, #64] +mla v29.4S, v17.4S, v31.s[0] +sub v17.4s, v12.4s, v28.4s +str q17, [x0, #112] +mla v27.4S, v0.4S, v31.s[0] +add v12.4s, v12.4s, v28.4s +str q12, [x0, #96] +sqrdmulh v5.4S, v22.4S, v18.s[0] +sub v6.4s, v2.4s, v10.4s +mul v22.4S, v22.4S,v20.s[0] +str q6, [x0, #144] +ldr q6, [x0, #304] +sqrdmulh v12.4S, v6.4S, v18.s[0] +add v2.4s, v2.4s, v10.4s +mul v6.4S, v6.4S,v20.s[0] +str q2, [x0, #128] +ldr q2, [x17, #+288] +ldr 
q10, [x17, #+304] +ldr q28, [x0, #352] +sqrdmulh v0.4S, v28.4S, v10.s[0] +sub v17.4s, v19.4s, v21.4s +mul v28.4S, v28.4S,v2.s[0] +str q17, [x0, #176] +ldr q17, [x0, #368] +sqrdmulh v13.4S, v17.4S, v10.s[0] +add v19.4s, v19.4s, v21.4s +mul v17.4S, v17.4S,v2.s[0] +str q19, [x0, #160] +ldr q19, [x17, #+320] +ldr q21, [x17, #+336] +mla v22.4S, v5.4S, v31.s[0] +sub v5.4s, v3.4s, v29.4s +sqrdmulh v14.4S, v26.4S, v21.s[0] +str q5, [x0, #208] +ldr q5, [x0, #432] +mla v6.4S, v12.4S, v31.s[0] +add v3.4s, v3.4s, v29.4s +sqrdmulh v29.4S, v5.4S, v21.s[0] +str q3, [x0, #192] +ldr q3, [x17, #+352] +ldr q12, [x17, #+368] +mla v28.4S, v0.4S, v31.s[0] +sub v0.4s, v1.4s, v27.4s +sqrdmulh v4.4S, v30.4S, v12.s[0] +str q0, [x0, #240] +ldr q0, [x0, #496] +mla v17.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, v27.4s +sqrdmulh v27.4S, v0.4S, v12.s[0] +str q1, [x0, #224] +ldr q1, [x0, #256] +ldr q13, [x0, #384] +mul v26.4S, v26.4S,v19.s[0] +sub v24.4s, v1.4s, v22.4s +ldr q25, [x0, #272] +mul v5.4S, v5.4S,v19.s[0] +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #400] +mla v26.4S, v14.4S, v31.s[0] +sub v14.4s, v25.4s, v6.4s +ldr q23, [x0, #320] +mla v5.4S, v29.4S, v31.s[0] +add v25.4s, v25.4s, v6.4s +ldr q6, [x0, #448] +mul v30.4S, v30.4S,v3.s[0] +sub v29.4s, v23.4s, v28.4s +ldr q16, [x0, #336] +mul v0.4S, v0.4S,v3.s[0] +add v23.4s, v23.4s, v28.4s +ldr q28, [x0, #464] +mla v30.4S, v4.4S, v31.s[0] +sub v4.4s, v16.4s, v17.4s +mla v0.4S, v27.4S, v31.s[0] +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v25.4S, v18.s[1] +mul v25.4S, v25.4S,v20.s[1] +sqrdmulh v27.4S, v14.4S, v18.s[2] +sub v7.4s, v13.4s, v26.4s +mul v14.4S, v14.4S,v20.s[2] +add v13.4s, v13.4s, v26.4s +sqrdmulh v18.4S, v16.4S, v10.s[1] +sub v20.4s, v22.4s, v5.4s +mul v16.4S, v16.4S,v2.s[1] +add v22.4s, v22.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v10.s[2] +sub v26.4s, v6.4s, v30.4s +mul v4.4S, v4.4S,v2.s[2] +add v6.4s, v6.4s, v30.4s +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v28.4s, v0.4s +ldr q10, [x0, #736] +sqrdmulh v2.4S, v22.4S, v21.s[1] +add v28.4s, 
v28.4s, v0.4s +mla v14.4S, v27.4S, v31.s[0] +ldr q27, [x0, #672] +sqrdmulh v0.4S, v20.4S, v21.s[2] +sub v30.4s, v1.4s, v25.4s +mla v16.4S, v18.4S, v31.s[0] +ldr q18, [x0, #544] +sqrdmulh v15.4S, v28.4S, v12.s[1] +add v1.4s, v1.4s, v25.4s +str q30, [x0, #272] +mla v4.4S, v5.4S, v31.s[0] +ldr q5, [x17, #+384] +ldr q30, [x17, #+400] +sqrdmulh v25.4S, v17.4S, v12.s[2] +sub v11.4s, v24.4s, v14.4s +str q1, [x0, #256] +mul v22.4S, v22.4S,v19.s[1] +add v24.4s, v24.4s, v14.4s +mul v20.4S, v20.4S,v19.s[2] +str q11, [x0, #304] +mla v22.4S, v2.4S, v31.s[0] +sub v2.4s, v23.4s, v16.4s +mla v20.4S, v0.4S, v31.s[0] +str q24, [x0, #288] +mul v28.4S, v28.4S,v3.s[1] +str q2, [x0, #336] +mul v17.4S, v17.4S,v3.s[2] +add v23.4s, v23.4s, v16.4s +str q23, [x0, #320] +mla v28.4S, v15.4S, v31.s[0] +sub v15.4s, v29.4s, v4.4s +str q15, [x0, #368] +mla v17.4S, v25.4S, v31.s[0] +add v29.4s, v29.4s, v4.4s +str q29, [x0, #352] +sqrdmulh v12.4S, v18.4S, v30.s[0] +sub v3.4s, v13.4s, v22.4s +mul v18.4S, v18.4S,v5.s[0] +str q3, [x0, #400] +ldr q3, [x0, #560] +sqrdmulh v29.4S, v3.4S, v30.s[0] +add v13.4s, v13.4s, v22.4s +mul v3.4S, v3.4S,v5.s[0] +str q13, [x0, #384] +ldr q13, [x17, #+416] +ldr q22, [x17, #+432] +ldr q4, [x0, #608] +sqrdmulh v25.4S, v4.4S, v22.s[0] +sub v15.4s, v7.4s, v20.4s +mul v4.4S, v4.4S,v13.s[0] +str q15, [x0, #432] +ldr q15, [x0, #624] +sqrdmulh v23.4S, v15.4S, v22.s[0] +add v7.4s, v7.4s, v20.4s +mul v15.4S, v15.4S,v13.s[0] +str q7, [x0, #416] +ldr q7, [x17, #+448] +ldr q20, [x17, #+464] +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v6.4s, v28.4s +sqrdmulh v16.4S, v27.4S, v20.s[0] +str q12, [x0, #464] +ldr q12, [x0, #688] +mla v3.4S, v29.4S, v31.s[0] +add v6.4s, v6.4s, v28.4s +sqrdmulh v28.4S, v12.4S, v20.s[0] +str q6, [x0, #448] +ldr q6, [x17, #+480] +ldr q29, [x17, #+496] +mla v4.4S, v25.4S, v31.s[0] +sub v25.4s, v26.4s, v17.4s +sqrdmulh v2.4S, v10.4S, v29.s[0] +str q25, [x0, #496] +ldr q25, [x0, #752] +mla v15.4S, v23.4S, v31.s[0] +add v26.4s, v26.4s, v17.4s +sqrdmulh v17.4S, 
v25.4S, v29.s[0] +str q26, [x0, #480] +ldr q26, [x0, #512] +ldr q23, [x0, #640] +mul v27.4S, v27.4S,v7.s[0] +sub v21.4s, v26.4s, v18.4s +ldr q19, [x0, #528] +mul v12.4S, v12.4S,v7.s[0] +add v26.4s, v26.4s, v18.4s +ldr q18, [x0, #656] +mla v27.4S, v16.4S, v31.s[0] +sub v16.4s, v19.4s, v3.4s +ldr q24, [x0, #576] +mla v12.4S, v28.4S, v31.s[0] +add v19.4s, v19.4s, v3.4s +ldr q3, [x0, #704] +mul v10.4S, v10.4S,v6.s[0] +sub v28.4s, v24.4s, v4.4s +ldr q0, [x0, #592] +mul v25.4S, v25.4S,v6.s[0] +add v24.4s, v24.4s, v4.4s +ldr q4, [x0, #720] +mla v10.4S, v2.4S, v31.s[0] +sub v2.4s, v0.4s, v15.4s +mla v25.4S, v17.4S, v31.s[0] +add v0.4s, v0.4s, v15.4s +sqrdmulh v15.4S, v19.4S, v30.s[1] +mul v19.4S, v19.4S,v5.s[1] +sqrdmulh v17.4S, v16.4S, v30.s[2] +sub v11.4s, v23.4s, v27.4s +mul v16.4S, v16.4S,v5.s[2] +add v23.4s, v23.4s, v27.4s +sqrdmulh v30.4S, v0.4S, v22.s[1] +sub v5.4s, v18.4s, v12.4s +mul v0.4S, v0.4S,v13.s[1] +add v18.4s, v18.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v22.s[2] +sub v27.4s, v3.4s, v10.4s +mul v2.4S, v2.4S,v13.s[2] +add v3.4s, v3.4s, v10.4s +mla v19.4S, v15.4S, v31.s[0] +sub v15.4s, v4.4s, v25.4s +ldr q22, [x0, #992] +sqrdmulh v13.4S, v18.4S, v20.s[1] +add v4.4s, v4.4s, v25.4s +mla v16.4S, v17.4S, v31.s[0] +ldr q17, [x0, #928] +sqrdmulh v25.4S, v5.4S, v20.s[2] +sub v10.4s, v26.4s, v19.4s +mla v0.4S, v30.4S, v31.s[0] +ldr q30, [x0, #800] +sqrdmulh v14.4S, v4.4S, v29.s[1] +add v26.4s, v26.4s, v19.4s +str q10, [x0, #528] +mla v2.4S, v12.4S, v31.s[0] +ldr q12, [x17, #+512] +ldr q10, [x17, #+528] +sqrdmulh v19.4S, v15.4S, v29.s[2] +sub v1.4s, v21.4s, v16.4s +str q26, [x0, #512] +mul v18.4S, v18.4S,v7.s[1] +add v21.4s, v21.4s, v16.4s +mul v5.4S, v5.4S,v7.s[2] +str q1, [x0, #560] +mla v18.4S, v13.4S, v31.s[0] +sub v13.4s, v24.4s, v0.4s +mla v5.4S, v25.4S, v31.s[0] +str q21, [x0, #544] +mul v4.4S, v4.4S,v6.s[1] +str q13, [x0, #592] +mul v15.4S, v15.4S,v6.s[2] +add v24.4s, v24.4s, v0.4s +str q24, [x0, #576] +mla v4.4S, v14.4S, v31.s[0] +sub v14.4s, v28.4s, v2.4s +str 
q14, [x0, #624] +mla v15.4S, v19.4S, v31.s[0] +add v28.4s, v28.4s, v2.4s +str q28, [x0, #608] +sqrdmulh v29.4S, v30.4S, v10.s[0] +sub v6.4s, v23.4s, v18.4s +mul v30.4S, v30.4S,v12.s[0] +str q6, [x0, #656] +ldr q6, [x0, #816] +sqrdmulh v28.4S, v6.4S, v10.s[0] +add v23.4s, v23.4s, v18.4s +mul v6.4S, v6.4S,v12.s[0] +str q23, [x0, #640] +ldr q23, [x17, #+544] +ldr q18, [x17, #+560] +ldr q2, [x0, #864] +sqrdmulh v19.4S, v2.4S, v18.s[0] +sub v14.4s, v11.4s, v5.4s +mul v2.4S, v2.4S,v23.s[0] +str q14, [x0, #688] +ldr q14, [x0, #880] +sqrdmulh v24.4S, v14.4S, v18.s[0] +add v11.4s, v11.4s, v5.4s +mul v14.4S, v14.4S,v23.s[0] +str q11, [x0, #672] +ldr q11, [x17, #+576] +ldr q5, [x17, #+592] +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v3.4s, v4.4s +sqrdmulh v0.4S, v17.4S, v5.s[0] +str q29, [x0, #720] +ldr q29, [x0, #944] +mla v6.4S, v28.4S, v31.s[0] +add v3.4s, v3.4s, v4.4s +sqrdmulh v4.4S, v29.4S, v5.s[0] +str q3, [x0, #704] +ldr q3, [x17, #+608] +ldr q28, [x17, #+624] +mla v2.4S, v19.4S, v31.s[0] +sub v19.4s, v27.4s, v15.4s +sqrdmulh v13.4S, v22.4S, v28.s[0] +str q19, [x0, #752] +ldr q19, [x0, #1008] +mla v14.4S, v24.4S, v31.s[0] +add v27.4s, v27.4s, v15.4s +sqrdmulh v15.4S, v19.4S, v28.s[0] +str q27, [x0, #736] +ldr q27, [x0, #768] +ldr q24, [x0, #896] +mul v17.4S, v17.4S,v11.s[0] +sub v20.4s, v27.4s, v30.4s +ldr q7, [x0, #784] +mul v29.4S, v29.4S,v11.s[0] +add v27.4s, v27.4s, v30.4s +ldr q30, [x0, #912] +mla v17.4S, v0.4S, v31.s[0] +sub v0.4s, v7.4s, v6.4s +ldr q21, [x0, #832] +mla v29.4S, v4.4S, v31.s[0] +add v7.4s, v7.4s, v6.4s +ldr q6, [x0, #960] +mul v22.4S, v22.4S,v3.s[0] +sub v4.4s, v21.4s, v2.4s +ldr q25, [x0, #848] +mul v19.4S, v19.4S,v3.s[0] +add v21.4s, v21.4s, v2.4s +ldr q2, [x0, #976] +mla v22.4S, v13.4S, v31.s[0] +sub v13.4s, v25.4s, v14.4s +mla v19.4S, v15.4S, v31.s[0] +add v25.4s, v25.4s, v14.4s +sqrdmulh v14.4S, v7.4S, v10.s[1] +mul v7.4S, v7.4S,v12.s[1] +sqrdmulh v15.4S, v0.4S, v10.s[2] +sub v1.4s, v24.4s, v17.4s +mul v0.4S, v0.4S,v12.s[2] +add v24.4s, 
v24.4s, v17.4s +sqrdmulh v10.4S, v25.4S, v18.s[1] +sub v12.4s, v30.4s, v29.4s +mul v25.4S, v25.4S,v23.s[1] +add v30.4s, v30.4s, v29.4s +sqrdmulh v29.4S, v13.4S, v18.s[2] +sub v17.4s, v6.4s, v22.4s +mul v13.4S, v13.4S,v23.s[2] +add v6.4s, v6.4s, v22.4s +mla v7.4S, v14.4S, v31.s[0] +sub v14.4s, v2.4s, v19.4s +sqrdmulh v18.4S, v30.4S, v5.s[1] +add v2.4s, v2.4s, v19.4s +mla v0.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v12.4S, v5.s[2] +sub v19.4s, v27.4s, v7.4s +mla v25.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v2.4S, v28.s[1] +add v27.4s, v27.4s, v7.4s +str q19, [x0, #784] +mla v13.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v14.4S, v28.s[2] +sub v19.4s, v20.4s, v0.4s +str q27, [x0, #768] +mul v30.4S, v30.4S,v11.s[1] +add v20.4s, v20.4s, v0.4s +mul v12.4S, v12.4S,v11.s[2] +str q19, [x0, #816] +mla v30.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v25.4s +mla v12.4S, v15.4S, v31.s[0] +str q20, [x0, #800] +mul v2.4S, v2.4S,v3.s[1] +str q18, [x0, #848] +mul v14.4S, v14.4S,v3.s[2] +add v21.4s, v21.4s, v25.4s +str q21, [x0, #832] +mla v2.4S, v10.4S, v31.s[0] +sub v10.4s, v4.4s, v13.4s +str q10, [x0, #880] +mla v14.4S, v29.4S, v31.s[0] +add v4.4s, v4.4s, v13.4s +str q4, [x0, #864] +sub v28.4s, v24.4s, v30.4s +str q28, [x0, #912] +add v24.4s, v24.4s, v30.4s +str q24, [x0, #896] +sub v24.4s, v1.4s, v12.4s +str q24, [x0, #944] +add v1.4s, v1.4s, v12.4s +str q1, [x0, #928] +sub v1.4s, v6.4s, v2.4s +str q1, [x0, #976] +add v6.4s, v6.4s, v2.4s +str q6, [x0, #960] +sub v6.4s, v17.4s, v14.4s +str q6, [x0, #1008] +add v17.4s, v17.4s, v14.4s +str q17, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file 
diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_7.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_7.s new file mode 100644 index 0000000..a96a052 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_7.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: // One 16-byte row: the routine below does ASM_LOAD (x17, modulus) then ldr q31, [x17]; its visible mla steps read v31.s[0] +.word -33556993 // Lane 0: negated modulus (33556993 is the modulus named in the file name) +.word 0 // Lane 1: zero fill +.word 0 // Lane 2: zero fill +.word 0 // Lane 3: zero fill +.align 6 +roots_merged: // Twiddle table in 16-byte rows of four words. The visible code loads rows #0/#32/#64/#96 as mul operands and rows #16/#48/#80/#112 as the matching sqrdmulh operands; the companion values look like round(root * 2^31 / 33556993) - TODO confirm against the generator +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Padding (no root for this lane) +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Padding (no root for this lane) +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Padding (no root for this lane) +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Padding (no root for this lane) +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Padding (no root for this lane) +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Padding (no root for this lane) +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Padding (no root for this lane) +.word
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Padding (no root for this lane) +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Padding (no root for this lane) +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Padding (no root for this lane) +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Padding (no root for this lane) +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Padding (no root for this lane) +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Padding (no root for this lane) +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Padding (no root for this lane) +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Padding (no root for this lane) +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Padding (no root for this lane) +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Padding (no root for this lane) +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Padding (no root for this lane) +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Padding (no root for this lane) +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Padding (no root for this lane) +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19
+.word 0 // Padding (no root for this lane) +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Padding (no root for this lane) +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Padding (no root for this lane) +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Padding (no root for this lane) +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Padding (no root for this lane) +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Padding (no root for this lane) +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Padding (no root for this lane) +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Padding (no root for this lane) +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Padding (no root for this lane) +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Padding (no root for this lane) +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Padding (no root for this lane) +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Padding (no root for this lane) +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Padding (no root for this lane) +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Padding (no root for this lane) +.text +.global
ntt_u32_incomplete_neon_asm_var_4_2_7_z4_7 +.global _ntt_u32_incomplete_neon_asm_var_4_2_7_z4_7 +ntt_u32_incomplete_neon_asm_var_4_2_7_z4_7: +_ntt_u32_incomplete_neon_asm_var_4_2_7_z4_7: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #928] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #992] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q18, [x0, #800] +sqrdmulh v17.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +ldr q16, [x0, #864] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v22.4S, v21.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +mla v18.4S, v17.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +ldr q3, [x0, #544] +sqrdmulh v17.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q19, [x0, #608] +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q2, [x0, #672] +ldr q1, [x0, #416] +sqrdmulh v0.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v15.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #736] +ldr q14, [x0, #480] +sqrdmulh v13.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v12.4s, v14.4s, v20.4s +add v14.4s, v14.4s, v20.4s +ldr q20, [x0, #288] +mla v3.4S, v17.4S, v31.s[0] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v18.4s +mla v2.4S, v0.4S, v31.s[0] +mla v22.4S, v13.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +ldr q18, [x0, #352] +sqrdmulh v13.4S, v1.4S, 
v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v0.4s, v18.4s, v16.4s +sqrdmulh v17.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +add v18.4s, v18.4s, v16.4s +ldr q16, [x0, #32] +sqrdmulh v11.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v10.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #96] +sqrdmulh v9.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v8.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #160] +mla v1.4S, v13.4S, v31.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v2.4s +mla v20.4S, v11.4S, v31.s[0] +mla v18.4S, v9.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +ldr q2, [x0, #224] +sqrdmulh v9.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v11.4s, v2.4s, v22.4s +sqrdmulh v13.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v7.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +sub v6.4s, v2.4s, v14.4s +add v2.4s, v2.4s, v14.4s +mla v15.4S, v9.4S, v31.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v20.4s +mla v21.4S, v22.4S, v31.s[0] +mla v0.4S, v1.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +sub v1.4s, v3.4s, v18.4s +sqrdmulh v22.4S, v6.4S, v27.s[1] +mul v6.4S, v6.4S,v28.s[1] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v27.s[0] +mul v19.4S, v19.4S,v28.s[0] +sub v9.4s, v17.4s, v15.4s +add v17.4s, v17.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v14.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +mla v7.4S, v20.4S, v31.s[0] +mla v6.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v21.4s +mla v19.4S, v18.4S, v31.s[0] +mla v2.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v15.4s, v8.4s, v0.4s +sqrdmulh v18.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +add v8.4s, v8.4s, v0.4s +sqrdmulh 
v0.4S, v9.4S, v27.s[3] +mul v9.4S, v9.4S,v28.s[3] +sub v20.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[3] +mul v14.4S, v14.4S,v28.s[3] +sub v12.4s, v1.4s, v6.4s +add v1.4s, v1.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v16.4s, v19.4s +mla v9.4S, v0.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v25.s[2] +mul v1.4S, v1.4S,v26.s[2] +sub v7.4s, v3.4s, v2.4s +sqrdmulh v0.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v7.4S, v25.s[1] +mul v7.4S, v7.4S,v26.s[1] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v25.s[0] +mul v3.4S, v3.4S,v26.s[0] +sub v6.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v1.4S, v19.4S, v31.s[0] +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v9.4s +mla v7.4S, v2.4S, v31.s[0] +mla v3.4S, v17.4S, v31.s[0] +add v22.4s, v22.4s, v9.4s +sqrdmulh v9.4S, v8.4S, v23.s[0] +mul v8.4S, v8.4S,v24.s[0] +sub v17.4s, v15.4s, v14.4s +sqrdmulh v2.4S, v6.4S, v23.s[1] +mul v6.4S, v6.4S,v24.s[1] +add v15.4s, v15.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v23.s[2] +mul v15.4S, v15.4S,v24.s[2] +sub v19.4s, v13.4s, v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +sub v11.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +mla v8.4S, v9.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v18.4s, v7.4s +str q13, [x0, #288] +mla v15.4S, v14.4S, v31.s[0] +mla v17.4S, v1.4S, v31.s[0] +add v18.4s, v18.4s, v7.4s +str q19, [x0, #352] +ldr q19, [x0, #944] +sqrdmulh v7.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v1.4s, v16.4s, v3.4s +str q20, [x0, #416] +ldr q20, [x0, #1008] +sqrdmulh v14.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v3.4s +str q11, [x0, #480] +ldr q11, [x0, #816] +sqrdmulh v3.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +sub v13.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +ldr q8, 
[x0, #880] +sqrdmulh v9.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v12.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +mla v19.4S, v7.4S, v31.s[0] +mla v20.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v15.4s +str q18, [x0, #160] +mla v11.4S, v3.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +str q2, [x0, #224] +ldr q2, [x0, #560] +sqrdmulh v15.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v9.4s, v0.4s, v17.4s +str q16, [x0, #32] +ldr q16, [x0, #624] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +add v0.4s, v0.4s, v17.4s +str q1, [x0, #96] +ldr q1, [x0, #688] +ldr q17, [x0, #432] +sqrdmulh v18.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +sub v7.4s, v17.4s, v19.4s +add v17.4s, v17.4s, v19.4s +ldr q19, [x0, #752] +ldr q6, [x0, #496] +sqrdmulh v5.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v4.4s, v6.4s, v20.4s +add v6.4s, v6.4s, v20.4s +ldr q20, [x0, #304] +mla v2.4S, v15.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v11.4s +str q10, [x0, #544] +mla v1.4S, v18.4S, v31.s[0] +mla v19.4S, v5.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q13, [x0, #608] +ldr q13, [x0, #368] +sqrdmulh v11.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v5.4s, v13.4s, v8.4s +str q21, [x0, #672] +sqrdmulh v21.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +add v13.4s, v13.4s, v8.4s +str q12, [x0, #736] +ldr q12, [x0, #48] +sqrdmulh v8.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v18.4s, v12.4s, v2.4s +add v12.4s, v12.4s, v2.4s +ldr q2, [x0, #112] +sqrdmulh v10.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v15.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +ldr q16, [x0, #176] +mla v17.4S, v11.4S, v31.s[0] +mla v6.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v1.4s +str q22, [x0, #800] +mla v20.4S, v8.4S, v31.s[0] +mla v13.4S, v10.4S, v31.s[0] +add v16.4s, v16.4s, v1.4s +str q14, [x0, #864] +ldr q14, [x0, #240] +sqrdmulh v1.4S, v7.4S, v29.s[2] +mul v7.4S, v7.4S,v30.s[2] +sub v10.4s, v14.4s, v19.4s 
+str q0, [x0, #928] +sqrdmulh v0.4S, v4.4S, v29.s[2] +mul v4.4S, v4.4S,v30.s[2] +add v14.4s, v14.4s, v19.4s +str q9, [x0, #992] +sqrdmulh v9.4S, v3.4S, v29.s[2] +mul v3.4S, v3.4S,v30.s[2] +sub v19.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v29.s[2] +mul v5.4S, v5.4S,v30.s[2] +sub v8.4s, v14.4s, v6.4s +add v14.4s, v14.4s, v6.4s +mla v7.4S, v1.4S, v31.s[0] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v20.4s +mla v3.4S, v9.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +sub v17.4s, v2.4s, v13.4s +sqrdmulh v9.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +add v2.4s, v2.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v27.s[0] +mul v16.4S, v16.4S,v28.s[0] +sub v1.4s, v21.4s, v7.4s +add v21.4s, v21.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v6.4s, v10.4s, v4.4s +add v10.4s, v10.4s, v4.4s +mla v19.4S, v20.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v3.4s +mla v16.4S, v13.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +sub v7.4s, v15.4s, v5.4s +sqrdmulh v13.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +add v15.4s, v15.4s, v5.4s +sqrdmulh v5.4S, v1.4S, v27.s[3] +mul v1.4S, v1.4S,v28.s[3] +sub v20.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[3] +mul v6.4S, v6.4S,v28.s[3] +sub v4.4s, v17.4s, v8.4s +add v17.4s, v17.4s, v8.4s +mla v21.4S, v3.4S, v31.s[0] +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v16.4s +mla v1.4S, v5.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v25.s[2] +mul v17.4S, v17.4S,v26.s[2] +sub v19.4s, v2.4s, v14.4s +sqrdmulh v5.4S, v4.4S, v25.s[3] +mul v4.4S, v4.4S,v26.s[3] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v3.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, 
v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +mla v17.4S, v16.4S, v31.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v9.4s, v1.4s +mla v19.4S, v14.4S, v31.s[0] +mla v2.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v23.s[0] +mul v15.4S, v15.4S,v24.s[0] +sub v21.4s, v7.4s, v6.4s +sqrdmulh v14.4S, v8.4S, v23.s[1] +mul v8.4S, v8.4S,v24.s[1] +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v7.4S, v23.s[2] +mul v7.4S, v7.4S,v24.s[2] +sub v16.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v23.s[3] +mul v21.4S, v21.4S,v24.s[3] +sub v10.4s, v20.4s, v4.4s +add v20.4s, v20.4s, v4.4s +mla v15.4S, v1.4S, v31.s[0] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v19.4s +str q0, [x0, #304] +mla v7.4S, v6.4S, v31.s[0] +mla v21.4S, v17.4S, v31.s[0] +add v13.4s, v13.4s, v19.4s +str q16, [x0, #368] +ldr q16, [x0, #896] +sqrdmulh v19.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v17.4s, v12.4s, v2.4s +str q20, [x0, #432] +ldr q20, [x0, #960] +sqrdmulh v6.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v12.4s, v12.4s, v2.4s +str q10, [x0, #496] +ldr q10, [x0, #768] +sqrdmulh v2.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +sub v0.4s, v18.4s, v15.4s +add v18.4s, v18.4s, v15.4s +ldr q15, [x0, #832] +sqrdmulh v1.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +sub v4.4s, v3.4s, v8.4s +add v3.4s, v3.4s, v8.4s +mla v16.4S, v19.4S, v31.s[0] +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v7.4s +str q13, [x0, #176] +mla v10.4S, v2.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v7.4s +str q14, [x0, #240] +ldr q14, [x0, #512] +sqrdmulh v7.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v1.4s, v5.4s, v21.4s +str q12, [x0, #48] +ldr q12, [x0, #576] +sqrdmulh v2.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +add v5.4s, v5.4s, v21.4s +str q17, [x0, #112] +ldr q17, [x0, #640] +ldr q21, [x0, #384] +sqrdmulh v13.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] 
+sub v19.4s, v21.4s, v16.4s +add v21.4s, v21.4s, v16.4s +ldr q16, [x0, #704] +ldr q8, [x0, #448] +sqrdmulh v22.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v11.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +ldr q20, [x0, #256] +mla v14.4S, v7.4S, v31.s[0] +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v10.4s +str q18, [x0, #560] +mla v17.4S, v13.4S, v31.s[0] +mla v16.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +str q0, [x0, #624] +ldr q0, [x0, #320] +sqrdmulh v10.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v22.4s, v0.4s, v15.4s +str q3, [x0, #688] +sqrdmulh v3.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +add v0.4s, v0.4s, v15.4s +str q4, [x0, #752] +ldr q4, [x0, #0] +sqrdmulh v15.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v13.4s, v4.4s, v14.4s +add v4.4s, v4.4s, v14.4s +ldr q14, [x0, #64] +sqrdmulh v18.4S, v0.4S, v29.s[1] +mul v0.4S, v0.4S,v30.s[1] +sub v7.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +ldr q12, [x0, #128] +mla v21.4S, v10.4S, v31.s[0] +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v12.4s, v17.4s +str q9, [x0, #816] +mla v20.4S, v15.4S, v31.s[0] +mla v0.4S, v18.4S, v31.s[0] +add v12.4s, v12.4s, v17.4s +str q6, [x0, #880] +ldr q6, [x0, #192] +sqrdmulh v17.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v18.4s, v6.4s, v16.4s +str q5, [x0, #944] +sqrdmulh v5.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +add v6.4s, v6.4s, v16.4s +str q1, [x0, #1008] +sqrdmulh v1.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v16.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v15.4s, v6.4s, v8.4s +add v6.4s, v6.4s, v8.4s +mla v19.4S, v17.4S, v31.s[0] +mla v11.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v20.4s +mla v2.4S, v1.4S, v31.s[0] +mla v22.4S, v21.4S, v31.s[0] +add v4.4s, v4.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +sub v21.4s, v14.4s, v0.4s +sqrdmulh v1.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +add v14.4s, 
v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v17.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[0] +mul v6.4S, v6.4S,v28.s[0] +sub v8.4s, v18.4s, v11.4s +add v18.4s, v18.4s, v11.4s +mla v16.4S, v20.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v2.4s +mla v12.4S, v0.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v27.s[2] +mul v3.4S, v3.4S,v28.s[2] +sub v19.4s, v7.4s, v22.4s +sqrdmulh v0.4S, v18.4S, v27.s[2] +mul v18.4S, v18.4S,v28.s[2] +add v7.4s, v7.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +sub v20.4s, v5.4s, v16.4s +add v5.4s, v5.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v11.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +mla v3.4S, v2.4S, v31.s[0] +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v4.4s, v12.4s +mla v17.4S, v22.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v4.4s, v4.4s, v12.4s +sqrdmulh v12.4S, v21.4S, v25.s[2] +mul v21.4S, v21.4S,v26.s[2] +sub v16.4s, v14.4s, v6.4s +sqrdmulh v22.4S, v11.4S, v25.s[3] +mul v11.4S, v11.4S,v26.s[3] +add v14.4s, v14.4s, v6.4s +sqrdmulh v6.4S, v16.4S, v25.s[1] +mul v16.4S, v16.4S,v26.s[1] +sub v2.4s, v13.4s, v3.4s +add v13.4s, v13.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v25.s[0] +mul v14.4S, v14.4S,v26.s[0] +sub v15.4s, v7.4s, v18.4s +add v7.4s, v7.4s, v18.4s +mla v21.4S, v12.4S, v31.s[0] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v17.4s +mla v16.4S, v6.4S, v31.s[0] +mla v14.4S, v3.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v7.4S, v23.s[0] +mul v7.4S, v7.4S,v24.s[0] +sub v3.4s, v19.4s, v8.4s +sqrdmulh v6.4S, v15.4S, v23.s[1] +mul v15.4S, v15.4S,v24.s[1] +add v19.4s, v19.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v23.s[2] +mul v19.4S, v19.4S,v24.s[2] +sub v12.4s, v5.4s, v21.4s +add v5.4s, v5.4s, v21.4s +sqrdmulh v21.4S, v3.4S, v23.s[3] +mul v3.4S, v3.4S,v24.s[3] +sub v18.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +mla 
v7.4S, v17.4S, v31.s[0] +mla v15.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v16.4s +str q5, [x0, #256] +mla v19.4S, v8.4S, v31.s[0] +mla v3.4S, v21.4S, v31.s[0] +add v0.4s, v0.4s, v16.4s +str q12, [x0, #320] +ldr q12, [x0, #912] +sqrdmulh v16.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v21.4s, v4.4s, v14.4s +str q20, [x0, #384] +ldr q20, [x0, #976] +sqrdmulh v8.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v4.4s, v4.4s, v14.4s +str q18, [x0, #448] +ldr q18, [x0, #784] +sqrdmulh v14.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +sub v5.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +ldr q7, [x0, #848] +sqrdmulh v17.4S, v7.4S, v29.s[0] +mul v7.4S, v7.4S,v30.s[0] +sub v11.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +mla v12.4S, v16.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v1.4s, v19.4s +str q0, [x0, #128] +mla v18.4S, v14.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v19.4s +str q6, [x0, #192] +ldr q6, [x0, #528] +sqrdmulh v19.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v17.4s, v22.4s, v3.4s +str q4, [x0, #0] +ldr q4, [x0, #592] +sqrdmulh v14.4S, v4.4S, v29.s[0] +mul v4.4S, v4.4S,v30.s[0] +add v22.4s, v22.4s, v3.4s +str q21, [x0, #64] +ldr q21, [x0, #656] +ldr q3, [x0, #400] +sqrdmulh v0.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v16.4s, v3.4s, v12.4s +add v3.4s, v3.4s, v12.4s +ldr q12, [x0, #720] +ldr q15, [x0, #464] +sqrdmulh v9.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v10.4s, v15.4s, v20.4s +add v15.4s, v15.4s, v20.4s +ldr q20, [x0, #272] +mla v6.4S, v19.4S, v31.s[0] +mla v4.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v18.4s +str q13, [x0, #512] +mla v21.4S, v0.4S, v31.s[0] +mla v12.4S, v9.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +str q5, [x0, #576] +ldr q5, [x0, #336] +sqrdmulh v18.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v9.4s, v5.4s, v7.4s +str q2, [x0, #640] +sqrdmulh v2.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +add v5.4s, v5.4s, v7.4s +str q11, [x0, #704] +ldr 
q11, [x0, #16] +sqrdmulh v7.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v0.4s, v11.4s, v6.4s +add v11.4s, v11.4s, v6.4s +ldr q6, [x0, #80] +sqrdmulh v13.4S, v5.4S, v29.s[1] +mul v5.4S, v5.4S,v30.s[1] +sub v19.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +ldr q4, [x0, #144] +mla v3.4S, v18.4S, v31.s[0] +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v4.4s, v21.4s +str q1, [x0, #768] +mla v20.4S, v7.4S, v31.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v4.4s, v4.4s, v21.4s +str q8, [x0, #832] +ldr q8, [x0, #208] +sqrdmulh v21.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +sub v13.4s, v8.4s, v12.4s +str q22, [x0, #896] +sqrdmulh v22.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +add v8.4s, v8.4s, v12.4s +str q17, [x0, #960] +sqrdmulh v17.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v12.4s, v4.4s, v3.4s +add v4.4s, v4.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v7.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +mla v16.4S, v21.4S, v31.s[0] +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v20.4s +mla v14.4S, v17.4S, v31.s[0] +mla v9.4S, v3.4S, v31.s[0] +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v12.4S, v27.s[1] +mul v12.4S, v12.4S,v28.s[1] +sub v3.4s, v6.4s, v5.4s +sqrdmulh v17.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v27.s[0] +mul v4.4S, v4.4S,v28.s[0] +sub v21.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[0] +mul v8.4S, v8.4S,v28.s[0] +sub v15.4s, v13.4s, v10.4s +add v13.4s, v13.4s, v10.4s +mla v12.4S, v20.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v14.4s +mla v4.4S, v5.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v16.4s, v19.4s, v9.4s +sqrdmulh v5.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v22.4s, v12.4s +add v22.4s, v22.4s, 
v12.4s +sqrdmulh v12.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v10.4s, v3.4s, v7.4s +add v3.4s, v3.4s, v7.4s +mla v2.4S, v14.4S, v31.s[0] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v4.4s +mla v21.4S, v9.4S, v31.s[0] +mla v15.4S, v12.4S, v31.s[0] +add v11.4s, v11.4s, v4.4s +sqrdmulh v4.4S, v3.4S, v25.s[2] +mul v3.4S, v3.4S,v26.s[2] +sub v12.4s, v6.4s, v8.4s +sqrdmulh v9.4S, v10.4S, v25.s[3] +mul v10.4S, v10.4S,v26.s[3] +add v6.4s, v6.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +sub v14.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v6.4S, v25.s[0] +mul v6.4S, v6.4S,v26.s[0] +sub v7.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +mla v3.4S, v4.4S, v31.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v21.4s +mla v12.4S, v8.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v23.s[0] +mul v19.4S, v19.4S,v24.s[0] +sub v2.4s, v16.4s, v15.4s +sqrdmulh v8.4S, v7.4S, v23.s[1] +mul v7.4S, v7.4S,v24.s[1] +add v16.4s, v16.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +sub v4.4s, v22.4s, v3.4s +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v2.4S, v23.s[3] +mul v2.4S, v2.4S,v24.s[3] +sub v13.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +mla v19.4S, v21.4S, v31.s[0] +mla v7.4S, v8.4S, v31.s[0] +sub v8.4s, v5.4s, v12.4s +str q22, [x0, #272] +mla v16.4S, v15.4S, v31.s[0] +mla v2.4S, v3.4S, v31.s[0] +add v5.4s, v5.4s, v12.4s +str q4, [x0, #336] +sub v23.4s, v11.4s, v6.4s +str q20, [x0, #400] +add v11.4s, v11.4s, v6.4s +str q13, [x0, #464] +sub v13.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sub v19.4s, v14.4s, v7.4s +add v14.4s, v14.4s, v7.4s +sub v7.4s, v17.4s, v16.4s +str q5, [x0, #144] +add v17.4s, v17.4s, v16.4s +str q8, [x0, #208] +sub v8.4s, v9.4s, v2.4s +str q11, [x0, #16] +add v9.4s, v9.4s, v2.4s +str q23, [x0, #80] +str q0, [x0, #528] +str q13, [x0, #592] +str q14, [x0, #656] +str q19, [x0, #720] +str q17, [x0, #784] +str q7, [x0, #848] 
+str q9, [x0, #912] +str q8, [x0, #976] +ldr q18, [x0, #224] +ldr q1, [x0, #160] +ldr q10, [x0, #32] +ldr q21, [x17, #+128] +ldr q22, [x17, #+144] +sqrdmulh v15.4S, v10.4S, v22.s[0] +mul v10.4S, v10.4S,v21.s[0] +ldr q3, [x0, #48] +sqrdmulh v12.4S, v3.4S, v22.s[0] +mul v3.4S, v3.4S,v21.s[0] +ldr q4, [x17, #+160] +ldr q30, [x17, #+176] +ldr q29, [x0, #96] +sqrdmulh v28.4S, v29.4S, v30.s[0] +mul v29.4S, v29.4S,v4.s[0] +ldr q27, [x0, #112] +sqrdmulh v26.4S, v27.4S, v30.s[0] +mul v27.4S, v27.4S,v4.s[0] +ldr q25, [x17, #+192] +ldr q24, [x17, #+208] +mla v10.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v1.4S, v24.s[0] +ldr q20, [x0, #176] +mla v3.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v20.4S, v24.s[0] +ldr q6, [x17, #+224] +ldr q5, [x17, #+240] +mla v29.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v18.4S, v5.s[0] +ldr q16, [x0, #240] +mla v27.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v16.4S, v5.s[0] +ldr q11, [x0, #0] +ldr q2, [x0, #128] +mul v1.4S, v1.4S,v25.s[0] +sub v23.4s, v11.4s, v10.4s +ldr q0, [x0, #16] +mul v20.4S, v20.4S,v25.s[0] +add v11.4s, v11.4s, v10.4s +ldr q10, [x0, #144] +mla v1.4S, v15.4S, v31.s[0] +sub v15.4s, v0.4s, v3.4s +ldr q13, [x0, #64] +mla v20.4S, v12.4S, v31.s[0] +add v0.4s, v0.4s, v3.4s +ldr q3, [x0, #192] +mul v18.4S, v18.4S,v6.s[0] +sub v12.4s, v13.4s, v29.4s +ldr q14, [x0, #80] +mul v16.4S, v16.4S,v6.s[0] +add v13.4s, v13.4s, v29.4s +ldr q29, [x0, #208] +mla v18.4S, v28.4S, v31.s[0] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v14.4s, v27.4s +sqrdmulh v28.4S, v0.4S, v22.s[1] +add v14.4s, v14.4s, v27.4s +mul v0.4S, v0.4S,v21.s[1] +sqrdmulh v27.4S, v15.4S, v22.s[2] +sub v19.4s, v2.4s, v1.4s +mul v15.4S, v15.4S,v21.s[2] +add v2.4s, v2.4s, v1.4s +sqrdmulh v22.4S, v14.4S, v30.s[1] +sub v21.4s, v10.4s, v20.4s +mul v14.4S, v14.4S,v4.s[1] +add v10.4s, v10.4s, v20.4s +sqrdmulh v20.4S, v26.4S, v30.s[2] +sub v1.4s, v3.4s, v18.4s +mul v26.4S, v26.4S,v4.s[2] +add v3.4s, v3.4s, v18.4s +mla v0.4S, v28.4S, v31.s[0] +sub v28.4s, v29.4s, v16.4s +ldr q30, [x0, #480] +sqrdmulh 
v4.4S, v10.4S, v24.s[1] +add v29.4s, v29.4s, v16.4s +mla v15.4S, v27.4S, v31.s[0] +ldr q27, [x0, #416] +sqrdmulh v16.4S, v21.4S, v24.s[2] +sub v18.4s, v11.4s, v0.4s +mla v14.4S, v22.4S, v31.s[0] +ldr q22, [x0, #288] +sqrdmulh v17.4S, v29.4S, v5.s[1] +add v11.4s, v11.4s, v0.4s +str q18, [x0, #16] +mla v26.4S, v20.4S, v31.s[0] +ldr q20, [x17, #+256] +ldr q18, [x17, #+272] +sqrdmulh v0.4S, v28.4S, v5.s[2] +sub v7.4s, v23.4s, v15.4s +str q11, [x0, #0] +mul v10.4S, v10.4S,v25.s[1] +add v23.4s, v23.4s, v15.4s +mul v21.4S, v21.4S,v25.s[2] +str q7, [x0, #48] +mla v10.4S, v4.4S, v31.s[0] +sub v4.4s, v13.4s, v14.4s +mla v21.4S, v16.4S, v31.s[0] +str q23, [x0, #32] +mul v29.4S, v29.4S,v6.s[1] +str q4, [x0, #80] +mul v28.4S, v28.4S,v6.s[2] +add v13.4s, v13.4s, v14.4s +str q13, [x0, #64] +mla v29.4S, v17.4S, v31.s[0] +sub v17.4s, v12.4s, v26.4s +str q17, [x0, #112] +mla v28.4S, v0.4S, v31.s[0] +add v12.4s, v12.4s, v26.4s +str q12, [x0, #96] +sqrdmulh v5.4S, v22.4S, v18.s[0] +sub v6.4s, v2.4s, v10.4s +mul v22.4S, v22.4S,v20.s[0] +str q6, [x0, #144] +ldr q6, [x0, #304] +sqrdmulh v12.4S, v6.4S, v18.s[0] +add v2.4s, v2.4s, v10.4s +mul v6.4S, v6.4S,v20.s[0] +str q2, [x0, #128] +ldr q2, [x17, #+288] +ldr q10, [x17, #+304] +ldr q26, [x0, #352] +sqrdmulh v0.4S, v26.4S, v10.s[0] +sub v17.4s, v19.4s, v21.4s +mul v26.4S, v26.4S,v2.s[0] +str q17, [x0, #176] +ldr q17, [x0, #368] +sqrdmulh v13.4S, v17.4S, v10.s[0] +add v19.4s, v19.4s, v21.4s +mul v17.4S, v17.4S,v2.s[0] +str q19, [x0, #160] +ldr q19, [x17, #+320] +ldr q21, [x17, #+336] +mla v22.4S, v5.4S, v31.s[0] +sub v5.4s, v3.4s, v29.4s +sqrdmulh v14.4S, v27.4S, v21.s[0] +str q5, [x0, #208] +ldr q5, [x0, #432] +mla v6.4S, v12.4S, v31.s[0] +add v3.4s, v3.4s, v29.4s +sqrdmulh v29.4S, v5.4S, v21.s[0] +str q3, [x0, #192] +ldr q3, [x17, #+352] +ldr q12, [x17, #+368] +mla v26.4S, v0.4S, v31.s[0] +sub v0.4s, v1.4s, v28.4s +sqrdmulh v4.4S, v30.4S, v12.s[0] +str q0, [x0, #240] +ldr q0, [x0, #496] +mla v17.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, 
v28.4s +sqrdmulh v28.4S, v0.4S, v12.s[0] +str q1, [x0, #224] +ldr q1, [x0, #256] +ldr q13, [x0, #384] +mul v27.4S, v27.4S,v19.s[0] +sub v24.4s, v1.4s, v22.4s +ldr q25, [x0, #272] +mul v5.4S, v5.4S,v19.s[0] +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #400] +mla v27.4S, v14.4S, v31.s[0] +sub v14.4s, v25.4s, v6.4s +ldr q23, [x0, #320] +mla v5.4S, v29.4S, v31.s[0] +add v25.4s, v25.4s, v6.4s +ldr q6, [x0, #448] +mul v30.4S, v30.4S,v3.s[0] +sub v29.4s, v23.4s, v26.4s +ldr q16, [x0, #336] +mul v0.4S, v0.4S,v3.s[0] +add v23.4s, v23.4s, v26.4s +ldr q26, [x0, #464] +mla v30.4S, v4.4S, v31.s[0] +mla v0.4S, v28.4S, v31.s[0] +sub v28.4s, v16.4s, v17.4s +sqrdmulh v4.4S, v25.4S, v18.s[1] +add v16.4s, v16.4s, v17.4s +mul v25.4S, v25.4S,v20.s[1] +sqrdmulh v17.4S, v14.4S, v18.s[2] +sub v7.4s, v13.4s, v27.4s +mul v14.4S, v14.4S,v20.s[2] +add v13.4s, v13.4s, v27.4s +sqrdmulh v18.4S, v16.4S, v10.s[1] +sub v20.4s, v22.4s, v5.4s +mul v16.4S, v16.4S,v2.s[1] +add v22.4s, v22.4s, v5.4s +sqrdmulh v5.4S, v28.4S, v10.s[2] +sub v27.4s, v6.4s, v30.4s +mul v28.4S, v28.4S,v2.s[2] +add v6.4s, v6.4s, v30.4s +mla v25.4S, v4.4S, v31.s[0] +sub v4.4s, v26.4s, v0.4s +ldr q10, [x0, #736] +sqrdmulh v2.4S, v22.4S, v21.s[1] +add v26.4s, v26.4s, v0.4s +mla v14.4S, v17.4S, v31.s[0] +ldr q17, [x0, #672] +sqrdmulh v0.4S, v20.4S, v21.s[2] +sub v30.4s, v1.4s, v25.4s +mla v16.4S, v18.4S, v31.s[0] +ldr q18, [x0, #544] +sqrdmulh v15.4S, v26.4S, v12.s[1] +add v1.4s, v1.4s, v25.4s +str q30, [x0, #272] +mla v28.4S, v5.4S, v31.s[0] +ldr q5, [x17, #+384] +ldr q30, [x17, #+400] +sqrdmulh v25.4S, v4.4S, v12.s[2] +sub v11.4s, v24.4s, v14.4s +str q1, [x0, #256] +mul v22.4S, v22.4S,v19.s[1] +add v24.4s, v24.4s, v14.4s +mul v20.4S, v20.4S,v19.s[2] +str q11, [x0, #304] +mla v22.4S, v2.4S, v31.s[0] +sub v2.4s, v23.4s, v16.4s +mla v20.4S, v0.4S, v31.s[0] +str q24, [x0, #288] +mul v26.4S, v26.4S,v3.s[1] +str q2, [x0, #336] +mul v4.4S, v4.4S,v3.s[2] +add v23.4s, v23.4s, v16.4s +str q23, [x0, #320] +mla v26.4S, v15.4S, v31.s[0] +sub 
v15.4s, v29.4s, v28.4s +str q15, [x0, #368] +mla v4.4S, v25.4S, v31.s[0] +add v29.4s, v29.4s, v28.4s +str q29, [x0, #352] +sqrdmulh v12.4S, v18.4S, v30.s[0] +sub v3.4s, v13.4s, v22.4s +mul v18.4S, v18.4S,v5.s[0] +str q3, [x0, #400] +ldr q3, [x0, #560] +sqrdmulh v29.4S, v3.4S, v30.s[0] +add v13.4s, v13.4s, v22.4s +mul v3.4S, v3.4S,v5.s[0] +str q13, [x0, #384] +ldr q13, [x17, #+416] +ldr q22, [x17, #+432] +ldr q28, [x0, #608] +sqrdmulh v25.4S, v28.4S, v22.s[0] +sub v15.4s, v7.4s, v20.4s +mul v28.4S, v28.4S,v13.s[0] +str q15, [x0, #432] +ldr q15, [x0, #624] +sqrdmulh v23.4S, v15.4S, v22.s[0] +add v7.4s, v7.4s, v20.4s +mul v15.4S, v15.4S,v13.s[0] +str q7, [x0, #416] +ldr q7, [x17, #+448] +ldr q20, [x17, #+464] +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v6.4s, v26.4s +sqrdmulh v16.4S, v17.4S, v20.s[0] +str q12, [x0, #464] +ldr q12, [x0, #688] +mla v3.4S, v29.4S, v31.s[0] +add v6.4s, v6.4s, v26.4s +sqrdmulh v26.4S, v12.4S, v20.s[0] +str q6, [x0, #448] +ldr q6, [x17, #+480] +ldr q29, [x17, #+496] +mla v28.4S, v25.4S, v31.s[0] +sub v25.4s, v27.4s, v4.4s +sqrdmulh v2.4S, v10.4S, v29.s[0] +str q25, [x0, #496] +ldr q25, [x0, #752] +mla v15.4S, v23.4S, v31.s[0] +add v27.4s, v27.4s, v4.4s +sqrdmulh v4.4S, v25.4S, v29.s[0] +str q27, [x0, #480] +ldr q27, [x0, #512] +ldr q23, [x0, #640] +mul v17.4S, v17.4S,v7.s[0] +sub v21.4s, v27.4s, v18.4s +ldr q19, [x0, #528] +mul v12.4S, v12.4S,v7.s[0] +add v27.4s, v27.4s, v18.4s +ldr q18, [x0, #656] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v19.4s, v3.4s +ldr q24, [x0, #576] +mla v12.4S, v26.4S, v31.s[0] +add v19.4s, v19.4s, v3.4s +ldr q3, [x0, #704] +mul v10.4S, v10.4S,v6.s[0] +sub v26.4s, v24.4s, v28.4s +ldr q0, [x0, #592] +mul v25.4S, v25.4S,v6.s[0] +add v24.4s, v24.4s, v28.4s +ldr q28, [x0, #720] +mla v10.4S, v2.4S, v31.s[0] +mla v25.4S, v4.4S, v31.s[0] +sub v4.4s, v0.4s, v15.4s +sqrdmulh v2.4S, v19.4S, v30.s[1] +add v0.4s, v0.4s, v15.4s +mul v19.4S, v19.4S,v5.s[1] +sqrdmulh v15.4S, v16.4S, v30.s[2] +sub v11.4s, v23.4s, v17.4s +mul 
v16.4S, v16.4S,v5.s[2] +add v23.4s, v23.4s, v17.4s +sqrdmulh v30.4S, v0.4S, v22.s[1] +sub v5.4s, v18.4s, v12.4s +mul v0.4S, v0.4S,v13.s[1] +add v18.4s, v18.4s, v12.4s +sqrdmulh v12.4S, v4.4S, v22.s[2] +sub v17.4s, v3.4s, v10.4s +mul v4.4S, v4.4S,v13.s[2] +add v3.4s, v3.4s, v10.4s +mla v19.4S, v2.4S, v31.s[0] +sub v2.4s, v28.4s, v25.4s +ldr q22, [x0, #992] +sqrdmulh v13.4S, v18.4S, v20.s[1] +add v28.4s, v28.4s, v25.4s +mla v16.4S, v15.4S, v31.s[0] +ldr q15, [x0, #928] +sqrdmulh v25.4S, v5.4S, v20.s[2] +sub v10.4s, v27.4s, v19.4s +mla v0.4S, v30.4S, v31.s[0] +ldr q30, [x0, #800] +sqrdmulh v14.4S, v28.4S, v29.s[1] +add v27.4s, v27.4s, v19.4s +str q10, [x0, #528] +mla v4.4S, v12.4S, v31.s[0] +ldr q12, [x17, #+512] +ldr q10, [x17, #+528] +sqrdmulh v19.4S, v2.4S, v29.s[2] +sub v1.4s, v21.4s, v16.4s +str q27, [x0, #512] +mul v18.4S, v18.4S,v7.s[1] +add v21.4s, v21.4s, v16.4s +mul v5.4S, v5.4S,v7.s[2] +str q1, [x0, #560] +mla v18.4S, v13.4S, v31.s[0] +sub v13.4s, v24.4s, v0.4s +mla v5.4S, v25.4S, v31.s[0] +str q21, [x0, #544] +mul v28.4S, v28.4S,v6.s[1] +str q13, [x0, #592] +mul v2.4S, v2.4S,v6.s[2] +add v24.4s, v24.4s, v0.4s +str q24, [x0, #576] +mla v28.4S, v14.4S, v31.s[0] +sub v14.4s, v26.4s, v4.4s +str q14, [x0, #624] +mla v2.4S, v19.4S, v31.s[0] +add v26.4s, v26.4s, v4.4s +str q26, [x0, #608] +sqrdmulh v29.4S, v30.4S, v10.s[0] +sub v6.4s, v23.4s, v18.4s +mul v30.4S, v30.4S,v12.s[0] +str q6, [x0, #656] +ldr q6, [x0, #816] +sqrdmulh v26.4S, v6.4S, v10.s[0] +add v23.4s, v23.4s, v18.4s +mul v6.4S, v6.4S,v12.s[0] +str q23, [x0, #640] +ldr q23, [x17, #+544] +ldr q18, [x17, #+560] +ldr q4, [x0, #864] +sqrdmulh v19.4S, v4.4S, v18.s[0] +sub v14.4s, v11.4s, v5.4s +mul v4.4S, v4.4S,v23.s[0] +str q14, [x0, #688] +ldr q14, [x0, #880] +sqrdmulh v24.4S, v14.4S, v18.s[0] +add v11.4s, v11.4s, v5.4s +mul v14.4S, v14.4S,v23.s[0] +str q11, [x0, #672] +ldr q11, [x17, #+576] +ldr q5, [x17, #+592] +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v3.4s, v28.4s +sqrdmulh v0.4S, v15.4S, v5.s[0] 
+str q29, [x0, #720] +ldr q29, [x0, #944] +mla v6.4S, v26.4S, v31.s[0] +add v3.4s, v3.4s, v28.4s +sqrdmulh v28.4S, v29.4S, v5.s[0] +str q3, [x0, #704] +ldr q3, [x17, #+608] +ldr q26, [x17, #+624] +mla v4.4S, v19.4S, v31.s[0] +sub v19.4s, v17.4s, v2.4s +sqrdmulh v13.4S, v22.4S, v26.s[0] +str q19, [x0, #752] +ldr q19, [x0, #1008] +mla v14.4S, v24.4S, v31.s[0] +add v17.4s, v17.4s, v2.4s +sqrdmulh v2.4S, v19.4S, v26.s[0] +str q17, [x0, #736] +ldr q17, [x0, #768] +ldr q24, [x0, #896] +mul v15.4S, v15.4S,v11.s[0] +sub v20.4s, v17.4s, v30.4s +ldr q7, [x0, #784] +mul v29.4S, v29.4S,v11.s[0] +add v17.4s, v17.4s, v30.4s +ldr q30, [x0, #912] +mla v15.4S, v0.4S, v31.s[0] +sub v0.4s, v7.4s, v6.4s +ldr q21, [x0, #832] +mla v29.4S, v28.4S, v31.s[0] +add v7.4s, v7.4s, v6.4s +ldr q6, [x0, #960] +mul v22.4S, v22.4S,v3.s[0] +sub v28.4s, v21.4s, v4.4s +ldr q25, [x0, #848] +mul v19.4S, v19.4S,v3.s[0] +add v21.4s, v21.4s, v4.4s +ldr q4, [x0, #976] +mla v22.4S, v13.4S, v31.s[0] +mla v19.4S, v2.4S, v31.s[0] +sub v2.4s, v25.4s, v14.4s +sqrdmulh v13.4S, v7.4S, v10.s[1] +add v25.4s, v25.4s, v14.4s +mul v7.4S, v7.4S,v12.s[1] +sqrdmulh v14.4S, v0.4S, v10.s[2] +sub v1.4s, v24.4s, v15.4s +mul v0.4S, v0.4S,v12.s[2] +add v24.4s, v24.4s, v15.4s +sqrdmulh v10.4S, v25.4S, v18.s[1] +sub v12.4s, v30.4s, v29.4s +mul v25.4S, v25.4S,v23.s[1] +add v30.4s, v30.4s, v29.4s +sqrdmulh v29.4S, v2.4S, v18.s[2] +sub v15.4s, v6.4s, v22.4s +mul v2.4S, v2.4S,v23.s[2] +add v6.4s, v6.4s, v22.4s +mla v7.4S, v13.4S, v31.s[0] +sub v13.4s, v4.4s, v19.4s +sqrdmulh v18.4S, v30.4S, v5.s[1] +add v4.4s, v4.4s, v19.4s +mla v0.4S, v14.4S, v31.s[0] +sqrdmulh v14.4S, v12.4S, v5.s[2] +sub v19.4s, v17.4s, v7.4s +mla v25.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v4.4S, v26.s[1] +add v17.4s, v17.4s, v7.4s +str q19, [x0, #784] +mla v2.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v13.4S, v26.s[2] +sub v19.4s, v20.4s, v0.4s +str q17, [x0, #768] +mul v30.4S, v30.4S,v11.s[1] +add v20.4s, v20.4s, v0.4s +mul v12.4S, v12.4S,v11.s[2] +str q19, [x0, 
#816] +mla v30.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v25.4s +mla v12.4S, v14.4S, v31.s[0] +str q20, [x0, #800] +mul v4.4S, v4.4S,v3.s[1] +str q18, [x0, #848] +mul v13.4S, v13.4S,v3.s[2] +add v21.4s, v21.4s, v25.4s +str q21, [x0, #832] +mla v4.4S, v10.4S, v31.s[0] +sub v10.4s, v28.4s, v2.4s +str q10, [x0, #880] +mla v13.4S, v29.4S, v31.s[0] +add v28.4s, v28.4s, v2.4s +str q28, [x0, #864] +sub v26.4s, v24.4s, v30.4s +str q26, [x0, #912] +add v24.4s, v24.4s, v30.4s +str q24, [x0, #896] +sub v24.4s, v1.4s, v12.4s +str q24, [x0, #944] +add v1.4s, v1.4s, v12.4s +str q1, [x0, #928] +sub v1.4s, v6.4s, v4.4s +str q1, [x0, #976] +add v6.4s, v6.4s, v4.4s +str q6, [x0, #960] +sub v6.4s, v15.4s, v13.4s +str q6, [x0, #1008] +add v15.4s, v15.4s, v13.4s +str q15, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_8.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_8.s new file mode 100644 index 0000000..70d520f --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_8.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom 
the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 
+.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 
+.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // 
Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global ntt_u32_incomplete_neon_asm_var_4_2_7_z4_8 +.global _ntt_u32_incomplete_neon_asm_var_4_2_7_z4_8 +ntt_u32_incomplete_neon_asm_var_4_2_7_z4_8: +_ntt_u32_incomplete_neon_asm_var_4_2_7_z4_8: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #928] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #992] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q18, [x0, #800] +sqrdmulh v17.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +ldr q16, [x0, #864] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v22.4S, v21.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +mla v18.4S, 
v17.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +ldr q3, [x0, #544] +sqrdmulh v17.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q19, [x0, #608] +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q2, [x0, #672] +ldr q1, [x0, #416] +sqrdmulh v0.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v15.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #736] +ldr q14, [x0, #480] +sqrdmulh v13.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v12.4s, v14.4s, v20.4s +add v14.4s, v14.4s, v20.4s +ldr q20, [x0, #288] +mla v3.4S, v17.4S, v31.s[0] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v18.4s +mla v2.4S, v0.4S, v31.s[0] +mla v22.4S, v13.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +ldr q18, [x0, #352] +sqrdmulh v13.4S, v1.4S, v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v0.4s, v18.4s, v16.4s +sqrdmulh v17.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +add v18.4s, v18.4s, v16.4s +ldr q16, [x0, #32] +sqrdmulh v11.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v10.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #96] +sqrdmulh v9.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v8.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #160] +mla v1.4S, v13.4S, v31.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v2.4s +mla v20.4S, v11.4S, v31.s[0] +mla v18.4S, v9.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +ldr q2, [x0, #224] +sqrdmulh v9.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v11.4s, v2.4s, v22.4s +sqrdmulh v13.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v7.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +sub v6.4s, v2.4s, v14.4s +add v2.4s, v2.4s, v14.4s +mla v15.4S, v9.4S, v31.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v20.4s +mla v21.4S, v22.4S, v31.s[0] +mla v0.4S, v1.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh 
v20.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +sub v1.4s, v3.4s, v18.4s +sqrdmulh v22.4S, v6.4S, v27.s[1] +mul v6.4S, v6.4S,v28.s[1] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v27.s[0] +mul v19.4S, v19.4S,v28.s[0] +sub v9.4s, v17.4s, v15.4s +add v17.4s, v17.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v14.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +mla v7.4S, v20.4S, v31.s[0] +mla v6.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v21.4s +mla v19.4S, v18.4S, v31.s[0] +mla v2.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v15.4s, v8.4s, v0.4s +sqrdmulh v18.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +add v8.4s, v8.4s, v0.4s +sqrdmulh v0.4S, v9.4S, v27.s[3] +mul v9.4S, v9.4S,v28.s[3] +sub v20.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[3] +mul v14.4S, v14.4S,v28.s[3] +sub v12.4s, v1.4s, v6.4s +add v1.4s, v1.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v16.4s, v19.4s +mla v9.4S, v0.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v25.s[2] +mul v1.4S, v1.4S,v26.s[2] +sub v7.4s, v3.4s, v2.4s +sqrdmulh v0.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v7.4S, v25.s[1] +mul v7.4S, v7.4S,v26.s[1] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v25.s[0] +mul v3.4S, v3.4S,v26.s[0] +sub v6.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v1.4S, v19.4S, v31.s[0] +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v9.4s +mla v7.4S, v2.4S, v31.s[0] +mla v3.4S, v17.4S, v31.s[0] +add v22.4s, v22.4s, v9.4s +sqrdmulh v9.4S, v8.4S, v23.s[0] +mul v8.4S, v8.4S,v24.s[0] +sub v17.4s, v15.4s, v14.4s +sqrdmulh v2.4S, v6.4S, v23.s[1] +mul v6.4S, v6.4S,v24.s[1] +add v15.4s, v15.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v23.s[2] +mul v15.4S, v15.4S,v24.s[2] +sub v19.4s, v13.4s, v1.4s +add 
v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +sub v11.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +mla v8.4S, v9.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v18.4s, v7.4s +str q13, [x0, #288] +mla v15.4S, v14.4S, v31.s[0] +mla v17.4S, v1.4S, v31.s[0] +add v18.4s, v18.4s, v7.4s +str q19, [x0, #352] +ldr q19, [x0, #944] +sqrdmulh v7.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v1.4s, v16.4s, v3.4s +str q20, [x0, #416] +ldr q20, [x0, #1008] +sqrdmulh v14.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v3.4s +str q11, [x0, #480] +ldr q11, [x0, #816] +sqrdmulh v3.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +sub v13.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +ldr q8, [x0, #880] +sqrdmulh v9.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v12.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +mla v19.4S, v7.4S, v31.s[0] +mla v20.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v15.4s +str q18, [x0, #160] +mla v11.4S, v3.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +str q2, [x0, #224] +ldr q2, [x0, #560] +sqrdmulh v15.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v9.4s, v0.4s, v17.4s +str q16, [x0, #32] +ldr q16, [x0, #624] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +add v0.4s, v0.4s, v17.4s +str q1, [x0, #96] +ldr q1, [x0, #688] +ldr q17, [x0, #432] +sqrdmulh v18.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +sub v7.4s, v17.4s, v19.4s +add v17.4s, v17.4s, v19.4s +ldr q19, [x0, #752] +ldr q6, [x0, #496] +sqrdmulh v5.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v4.4s, v6.4s, v20.4s +add v6.4s, v6.4s, v20.4s +ldr q20, [x0, #304] +mla v2.4S, v15.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v11.4s +str q10, [x0, #544] +mla v1.4S, v18.4S, v31.s[0] +mla v19.4S, v5.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q13, [x0, #608] +ldr q13, [x0, #368] +sqrdmulh v11.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub 
v5.4s, v13.4s, v8.4s +str q21, [x0, #672] +sqrdmulh v21.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +add v13.4s, v13.4s, v8.4s +str q12, [x0, #736] +ldr q12, [x0, #48] +sqrdmulh v8.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v18.4s, v12.4s, v2.4s +add v12.4s, v12.4s, v2.4s +ldr q2, [x0, #112] +sqrdmulh v10.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v15.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +ldr q16, [x0, #176] +mla v17.4S, v11.4S, v31.s[0] +mla v6.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v1.4s +str q22, [x0, #800] +mla v20.4S, v8.4S, v31.s[0] +mla v13.4S, v10.4S, v31.s[0] +add v16.4s, v16.4s, v1.4s +str q14, [x0, #864] +ldr q14, [x0, #240] +sqrdmulh v1.4S, v7.4S, v29.s[2] +mul v7.4S, v7.4S,v30.s[2] +sub v10.4s, v14.4s, v19.4s +str q0, [x0, #928] +sqrdmulh v0.4S, v4.4S, v29.s[2] +mul v4.4S, v4.4S,v30.s[2] +add v14.4s, v14.4s, v19.4s +str q9, [x0, #992] +sqrdmulh v9.4S, v3.4S, v29.s[2] +mul v3.4S, v3.4S,v30.s[2] +sub v19.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v29.s[2] +mul v5.4S, v5.4S,v30.s[2] +sub v8.4s, v14.4s, v6.4s +add v14.4s, v14.4s, v6.4s +mla v7.4S, v1.4S, v31.s[0] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v20.4s +mla v3.4S, v9.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +sub v17.4s, v2.4s, v13.4s +sqrdmulh v9.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +add v2.4s, v2.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v27.s[0] +mul v16.4S, v16.4S,v28.s[0] +sub v1.4s, v21.4s, v7.4s +add v21.4s, v21.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v6.4s, v10.4s, v4.4s +add v10.4s, v10.4s, v4.4s +mla v19.4S, v20.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v3.4s +mla v16.4S, v13.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +sub v7.4s, v15.4s, v5.4s +sqrdmulh v13.4S, v10.4S, v27.s[2] 
+mul v10.4S, v10.4S,v28.s[2] +add v15.4s, v15.4s, v5.4s +sqrdmulh v5.4S, v1.4S, v27.s[3] +mul v1.4S, v1.4S,v28.s[3] +sub v20.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[3] +mul v6.4S, v6.4S,v28.s[3] +sub v4.4s, v17.4s, v8.4s +add v17.4s, v17.4s, v8.4s +mla v21.4S, v3.4S, v31.s[0] +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v16.4s +mla v1.4S, v5.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v25.s[2] +mul v17.4S, v17.4S,v26.s[2] +sub v19.4s, v2.4s, v14.4s +sqrdmulh v5.4S, v4.4S, v25.s[3] +mul v4.4S, v4.4S,v26.s[3] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v3.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +mla v17.4S, v16.4S, v31.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v9.4s, v1.4s +mla v19.4S, v14.4S, v31.s[0] +mla v2.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v23.s[0] +mul v15.4S, v15.4S,v24.s[0] +sub v21.4s, v7.4s, v6.4s +sqrdmulh v14.4S, v8.4S, v23.s[1] +mul v8.4S, v8.4S,v24.s[1] +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v7.4S, v23.s[2] +mul v7.4S, v7.4S,v24.s[2] +sub v16.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v23.s[3] +mul v21.4S, v21.4S,v24.s[3] +sub v10.4s, v20.4s, v4.4s +add v20.4s, v20.4s, v4.4s +mla v15.4S, v1.4S, v31.s[0] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v19.4s +str q0, [x0, #304] +mla v7.4S, v6.4S, v31.s[0] +mla v21.4S, v17.4S, v31.s[0] +add v13.4s, v13.4s, v19.4s +str q16, [x0, #368] +ldr q16, [x0, #896] +sqrdmulh v19.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v17.4s, v12.4s, v2.4s +str q20, [x0, #432] +ldr q20, [x0, #960] +sqrdmulh v6.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v12.4s, v12.4s, v2.4s +str q10, [x0, #496] +ldr q10, [x0, #768] +sqrdmulh v2.4S, v10.4S, v29.s[0] +mul v10.4S, 
v10.4S,v30.s[0] +sub v0.4s, v18.4s, v15.4s +add v18.4s, v18.4s, v15.4s +ldr q15, [x0, #832] +sqrdmulh v1.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +sub v4.4s, v3.4s, v8.4s +add v3.4s, v3.4s, v8.4s +mla v16.4S, v19.4S, v31.s[0] +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v7.4s +str q13, [x0, #176] +mla v10.4S, v2.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v7.4s +str q14, [x0, #240] +ldr q14, [x0, #512] +sqrdmulh v7.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v1.4s, v5.4s, v21.4s +str q12, [x0, #48] +ldr q12, [x0, #576] +sqrdmulh v2.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +add v5.4s, v5.4s, v21.4s +str q17, [x0, #112] +ldr q17, [x0, #640] +ldr q21, [x0, #384] +sqrdmulh v13.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] +sub v19.4s, v21.4s, v16.4s +add v21.4s, v21.4s, v16.4s +ldr q16, [x0, #704] +ldr q8, [x0, #448] +sqrdmulh v22.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v11.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +ldr q20, [x0, #256] +mla v14.4S, v7.4S, v31.s[0] +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v10.4s +str q18, [x0, #560] +mla v17.4S, v13.4S, v31.s[0] +mla v16.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +str q0, [x0, #624] +ldr q0, [x0, #320] +sqrdmulh v10.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v22.4s, v0.4s, v15.4s +str q3, [x0, #688] +sqrdmulh v3.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +add v0.4s, v0.4s, v15.4s +str q4, [x0, #752] +ldr q4, [x0, #0] +sqrdmulh v15.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v13.4s, v4.4s, v14.4s +add v4.4s, v4.4s, v14.4s +ldr q14, [x0, #64] +sqrdmulh v18.4S, v0.4S, v29.s[1] +mul v0.4S, v0.4S,v30.s[1] +sub v7.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +ldr q12, [x0, #128] +mla v21.4S, v10.4S, v31.s[0] +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v12.4s, v17.4s +str q9, [x0, #816] +mla v20.4S, v15.4S, v31.s[0] +mla v0.4S, v18.4S, v31.s[0] +add v12.4s, v12.4s, v17.4s +str q6, [x0, #880] +ldr q6, [x0, #192] 
+sqrdmulh v17.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v18.4s, v6.4s, v16.4s +str q5, [x0, #944] +sqrdmulh v5.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +add v6.4s, v6.4s, v16.4s +str q1, [x0, #1008] +sqrdmulh v1.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v16.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v15.4s, v6.4s, v8.4s +add v6.4s, v6.4s, v8.4s +mla v19.4S, v17.4S, v31.s[0] +mla v11.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v20.4s +mla v2.4S, v1.4S, v31.s[0] +mla v22.4S, v21.4S, v31.s[0] +add v4.4s, v4.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +sub v21.4s, v14.4s, v0.4s +sqrdmulh v1.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v17.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[0] +mul v6.4S, v6.4S,v28.s[0] +sub v8.4s, v18.4s, v11.4s +add v18.4s, v18.4s, v11.4s +mla v16.4S, v20.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v2.4s +mla v12.4S, v0.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v27.s[2] +mul v3.4S, v3.4S,v28.s[2] +sub v19.4s, v7.4s, v22.4s +sqrdmulh v0.4S, v18.4S, v27.s[2] +mul v18.4S, v18.4S,v28.s[2] +add v7.4s, v7.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +sub v20.4s, v5.4s, v16.4s +add v5.4s, v5.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v11.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +mla v3.4S, v2.4S, v31.s[0] +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v4.4s, v12.4s +mla v17.4S, v22.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v4.4s, v4.4s, v12.4s +sqrdmulh v12.4S, v21.4S, v25.s[2] +mul v21.4S, v21.4S,v26.s[2] +sub v16.4s, v14.4s, v6.4s +sqrdmulh v22.4S, v11.4S, v25.s[3] +mul v11.4S, v11.4S,v26.s[3] +add v14.4s, v14.4s, v6.4s +sqrdmulh v6.4S, v16.4S, v25.s[1] 
+mul v16.4S, v16.4S,v26.s[1] +sub v2.4s, v13.4s, v3.4s +add v13.4s, v13.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v25.s[0] +mul v14.4S, v14.4S,v26.s[0] +sub v15.4s, v7.4s, v18.4s +add v7.4s, v7.4s, v18.4s +mla v21.4S, v12.4S, v31.s[0] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v17.4s +mla v16.4S, v6.4S, v31.s[0] +mla v14.4S, v3.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v7.4S, v23.s[0] +mul v7.4S, v7.4S,v24.s[0] +sub v3.4s, v19.4s, v8.4s +sqrdmulh v6.4S, v15.4S, v23.s[1] +mul v15.4S, v15.4S,v24.s[1] +add v19.4s, v19.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v23.s[2] +mul v19.4S, v19.4S,v24.s[2] +sub v12.4s, v5.4s, v21.4s +add v5.4s, v5.4s, v21.4s +sqrdmulh v21.4S, v3.4S, v23.s[3] +mul v3.4S, v3.4S,v24.s[3] +sub v18.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +mla v7.4S, v17.4S, v31.s[0] +mla v15.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v16.4s +str q5, [x0, #256] +mla v19.4S, v8.4S, v31.s[0] +mla v3.4S, v21.4S, v31.s[0] +add v0.4s, v0.4s, v16.4s +str q12, [x0, #320] +ldr q12, [x0, #912] +sqrdmulh v16.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v21.4s, v4.4s, v14.4s +str q20, [x0, #384] +ldr q20, [x0, #976] +sqrdmulh v8.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v4.4s, v4.4s, v14.4s +str q18, [x0, #448] +ldr q18, [x0, #784] +sqrdmulh v14.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +sub v5.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +ldr q7, [x0, #848] +sqrdmulh v17.4S, v7.4S, v29.s[0] +mul v7.4S, v7.4S,v30.s[0] +sub v11.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +mla v12.4S, v16.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v1.4s, v19.4s +str q0, [x0, #128] +mla v18.4S, v14.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v19.4s +str q6, [x0, #192] +ldr q6, [x0, #528] +sqrdmulh v19.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v17.4s, v22.4s, v3.4s +str q4, [x0, #0] +ldr q4, [x0, #592] +sqrdmulh v14.4S, v4.4S, v29.s[0] +mul v4.4S, v4.4S,v30.s[0] +add v22.4s, v22.4s, v3.4s +str q21, [x0, #64] +ldr q21, 
[x0, #656] +ldr q3, [x0, #400] +sqrdmulh v0.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v16.4s, v3.4s, v12.4s +add v3.4s, v3.4s, v12.4s +ldr q12, [x0, #720] +ldr q15, [x0, #464] +sqrdmulh v9.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v10.4s, v15.4s, v20.4s +add v15.4s, v15.4s, v20.4s +ldr q20, [x0, #272] +mla v6.4S, v19.4S, v31.s[0] +mla v4.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v18.4s +str q13, [x0, #512] +mla v21.4S, v0.4S, v31.s[0] +mla v12.4S, v9.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +str q5, [x0, #576] +ldr q5, [x0, #336] +sqrdmulh v18.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v9.4s, v5.4s, v7.4s +str q2, [x0, #640] +sqrdmulh v2.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +add v5.4s, v5.4s, v7.4s +str q11, [x0, #704] +ldr q11, [x0, #16] +sqrdmulh v7.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v0.4s, v11.4s, v6.4s +add v11.4s, v11.4s, v6.4s +ldr q6, [x0, #80] +sqrdmulh v13.4S, v5.4S, v29.s[1] +mul v5.4S, v5.4S,v30.s[1] +sub v19.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +ldr q4, [x0, #144] +mla v3.4S, v18.4S, v31.s[0] +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v4.4s, v21.4s +str q1, [x0, #768] +mla v20.4S, v7.4S, v31.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v4.4s, v4.4s, v21.4s +str q8, [x0, #832] +ldr q8, [x0, #208] +sqrdmulh v21.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +sub v13.4s, v8.4s, v12.4s +str q22, [x0, #896] +sqrdmulh v22.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +add v8.4s, v8.4s, v12.4s +str q17, [x0, #960] +sqrdmulh v17.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v12.4s, v4.4s, v3.4s +add v4.4s, v4.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v7.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +mla v16.4S, v21.4S, v31.s[0] +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v20.4s +mla v14.4S, v17.4S, v31.s[0] +mla v9.4S, v3.4S, v31.s[0] +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v12.4S, v27.s[1] +mul v12.4S, v12.4S,v28.s[1] +sub v3.4s, v6.4s, 
v5.4s +sqrdmulh v17.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v27.s[0] +mul v4.4S, v4.4S,v28.s[0] +sub v21.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[0] +mul v8.4S, v8.4S,v28.s[0] +sub v15.4s, v13.4s, v10.4s +add v13.4s, v13.4s, v10.4s +mla v12.4S, v20.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v14.4s +mla v4.4S, v5.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v16.4s, v19.4s, v9.4s +sqrdmulh v5.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v10.4s, v3.4s, v7.4s +add v3.4s, v3.4s, v7.4s +mla v2.4S, v14.4S, v31.s[0] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v4.4s +mla v21.4S, v9.4S, v31.s[0] +mla v15.4S, v12.4S, v31.s[0] +add v11.4s, v11.4s, v4.4s +sqrdmulh v4.4S, v3.4S, v25.s[2] +mul v3.4S, v3.4S,v26.s[2] +sub v12.4s, v6.4s, v8.4s +sqrdmulh v9.4S, v10.4S, v25.s[3] +mul v10.4S, v10.4S,v26.s[3] +add v6.4s, v6.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +sub v14.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v6.4S, v25.s[0] +mul v6.4S, v6.4S,v26.s[0] +sub v7.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +mla v3.4S, v4.4S, v31.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v21.4s +mla v12.4S, v8.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v23.s[0] +mul v19.4S, v19.4S,v24.s[0] +sub v2.4s, v16.4s, v15.4s +sqrdmulh v8.4S, v7.4S, v23.s[1] +mul v7.4S, v7.4S,v24.s[1] +add v16.4s, v16.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +sub v4.4s, v22.4s, v3.4s +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v2.4S, v23.s[3] +mul v2.4S, 
v2.4S,v24.s[3] +sub v13.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +mla v19.4S, v21.4S, v31.s[0] +mla v7.4S, v8.4S, v31.s[0] +sub v8.4s, v5.4s, v12.4s +str q22, [x0, #272] +mla v16.4S, v15.4S, v31.s[0] +mla v2.4S, v3.4S, v31.s[0] +add v5.4s, v5.4s, v12.4s +str q4, [x0, #336] +sub v23.4s, v11.4s, v6.4s +str q20, [x0, #400] +add v11.4s, v11.4s, v6.4s +str q13, [x0, #464] +sub v13.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sub v19.4s, v14.4s, v7.4s +add v14.4s, v14.4s, v7.4s +sub v7.4s, v17.4s, v16.4s +str q5, [x0, #144] +add v17.4s, v17.4s, v16.4s +str q8, [x0, #208] +sub v8.4s, v9.4s, v2.4s +str q11, [x0, #16] +add v9.4s, v9.4s, v2.4s +str q23, [x0, #80] +str q0, [x0, #528] +str q13, [x0, #592] +str q14, [x0, #656] +str q19, [x0, #720] +str q17, [x0, #784] +str q7, [x0, #848] +str q9, [x0, #912] +str q8, [x0, #976] +ldr q18, [x0, #224] +ldr q1, [x0, #160] +ldr q10, [x0, #32] +ldr q21, [x17, #+128] +ldr q22, [x17, #+144] +sqrdmulh v15.4S, v10.4S, v22.s[0] +mul v10.4S, v10.4S,v21.s[0] +ldr q3, [x0, #48] +sqrdmulh v12.4S, v3.4S, v22.s[0] +ldr q4, [x17, #+160] +mul v3.4S, v3.4S,v21.s[0] +ldr q30, [x17, #+176] +ldr q29, [x0, #96] +sqrdmulh v28.4S, v29.4S, v30.s[0] +mul v29.4S, v29.4S,v4.s[0] +ldr q27, [x0, #112] +sqrdmulh v26.4S, v27.4S, v30.s[0] +mul v27.4S, v27.4S,v4.s[0] +ldr q25, [x17, #+192] +mla v10.4S, v15.4S, v31.s[0] +ldr q15, [x17, #+208] +sqrdmulh v24.4S, v1.4S, v15.s[0] +ldr q20, [x0, #176] +mla v3.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v20.4S, v15.s[0] +ldr q6, [x17, #+224] +mla v29.4S, v28.4S, v31.s[0] +ldr q28, [x17, #+240] +sqrdmulh v5.4S, v18.4S, v28.s[0] +ldr q16, [x0, #240] +mla v27.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v16.4S, v28.s[0] +ldr q11, [x0, #0] +ldr q2, [x0, #128] +mul v1.4S, v1.4S,v25.s[0] +sub v23.4s, v11.4s, v10.4s +ldr q0, [x0, #16] +mul v20.4S, v20.4S,v25.s[0] +add v11.4s, v11.4s, v10.4s +ldr q10, [x0, #144] +mla v1.4S, v24.4S, v31.s[0] +sub v24.4s, v0.4s, v3.4s +ldr q13, [x0, #64] +mla v20.4S, v12.4S, v31.s[0] +add v0.4s, v0.4s, 
v3.4s +ldr q3, [x0, #192] +mul v18.4S, v18.4S,v6.s[0] +sub v12.4s, v13.4s, v29.4s +ldr q14, [x0, #80] +mul v16.4S, v16.4S,v6.s[0] +add v13.4s, v13.4s, v29.4s +ldr q29, [x0, #208] +mla v18.4S, v5.4S, v31.s[0] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v14.4s, v27.4s +sqrdmulh v5.4S, v0.4S, v22.s[1] +add v14.4s, v14.4s, v27.4s +mul v0.4S, v0.4S,v21.s[1] +sqrdmulh v27.4S, v24.4S, v22.s[2] +sub v19.4s, v2.4s, v1.4s +mul v24.4S, v24.4S,v21.s[2] +add v2.4s, v2.4s, v1.4s +sqrdmulh v22.4S, v14.4S, v30.s[1] +sub v21.4s, v10.4s, v20.4s +mul v14.4S, v14.4S,v4.s[1] +add v10.4s, v10.4s, v20.4s +sqrdmulh v20.4S, v26.4S, v30.s[2] +sub v1.4s, v3.4s, v18.4s +mul v26.4S, v26.4S,v4.s[2] +add v3.4s, v3.4s, v18.4s +mla v0.4S, v5.4S, v31.s[0] +sub v5.4s, v29.4s, v16.4s +ldr q30, [x0, #480] +sqrdmulh v4.4S, v10.4S, v15.s[1] +add v29.4s, v29.4s, v16.4s +mla v24.4S, v27.4S, v31.s[0] +ldr q27, [x0, #416] +sqrdmulh v16.4S, v21.4S, v15.s[2] +sub v18.4s, v11.4s, v0.4s +mla v14.4S, v22.4S, v31.s[0] +ldr q22, [x0, #288] +sqrdmulh v17.4S, v29.4S, v28.s[1] +add v11.4s, v11.4s, v0.4s +str q18, [x0, #16] +mla v26.4S, v20.4S, v31.s[0] +ldr q20, [x17, #+256] +sqrdmulh v18.4S, v5.4S, v28.s[2] +sub v0.4s, v23.4s, v24.4s +str q11, [x0, #0] +mul v10.4S, v10.4S,v25.s[1] +add v23.4s, v23.4s, v24.4s +ldr q24, [x17, #+272] +mul v21.4S, v21.4S,v25.s[2] +str q0, [x0, #48] +mla v10.4S, v4.4S, v31.s[0] +sub v4.4s, v13.4s, v14.4s +mla v21.4S, v16.4S, v31.s[0] +str q23, [x0, #32] +mul v29.4S, v29.4S,v6.s[1] +str q4, [x0, #80] +mul v5.4S, v5.4S,v6.s[2] +add v13.4s, v13.4s, v14.4s +str q13, [x0, #64] +mla v29.4S, v17.4S, v31.s[0] +sub v17.4s, v12.4s, v26.4s +str q17, [x0, #112] +mla v5.4S, v18.4S, v31.s[0] +add v12.4s, v12.4s, v26.4s +str q12, [x0, #96] +sqrdmulh v28.4S, v22.4S, v24.s[0] +sub v6.4s, v2.4s, v10.4s +mul v22.4S, v22.4S,v20.s[0] +str q6, [x0, #144] +ldr q6, [x0, #304] +sqrdmulh v12.4S, v6.4S, v24.s[0] +add v2.4s, v2.4s, v10.4s +ldr q10, [x17, #+288] +mul v6.4S, v6.4S,v20.s[0] +str q2, [x0, #128] +ldr q2, 
[x17, #+304] +ldr q26, [x0, #352] +sqrdmulh v18.4S, v26.4S, v2.s[0] +sub v17.4s, v19.4s, v21.4s +mul v26.4S, v26.4S,v10.s[0] +str q17, [x0, #176] +ldr q17, [x0, #368] +sqrdmulh v13.4S, v17.4S, v2.s[0] +add v19.4s, v19.4s, v21.4s +mul v17.4S, v17.4S,v10.s[0] +str q19, [x0, #160] +ldr q19, [x17, #+320] +mla v22.4S, v28.4S, v31.s[0] +sub v28.4s, v3.4s, v29.4s +ldr q21, [x17, #+336] +sqrdmulh v14.4S, v27.4S, v21.s[0] +str q28, [x0, #208] +ldr q28, [x0, #432] +mla v6.4S, v12.4S, v31.s[0] +add v3.4s, v3.4s, v29.4s +sqrdmulh v29.4S, v28.4S, v21.s[0] +str q3, [x0, #192] +ldr q3, [x17, #+352] +mla v26.4S, v18.4S, v31.s[0] +sub v18.4s, v1.4s, v5.4s +ldr q12, [x17, #+368] +sqrdmulh v4.4S, v30.4S, v12.s[0] +str q18, [x0, #240] +ldr q18, [x0, #496] +mla v17.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, v5.4s +sqrdmulh v5.4S, v18.4S, v12.s[0] +str q1, [x0, #224] +ldr q1, [x0, #256] +ldr q13, [x0, #384] +mul v27.4S, v27.4S,v19.s[0] +sub v15.4s, v1.4s, v22.4s +ldr q25, [x0, #272] +mul v28.4S, v28.4S,v19.s[0] +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #400] +mla v27.4S, v14.4S, v31.s[0] +sub v14.4s, v25.4s, v6.4s +ldr q23, [x0, #320] +mla v28.4S, v29.4S, v31.s[0] +add v25.4s, v25.4s, v6.4s +ldr q6, [x0, #448] +mul v30.4S, v30.4S,v3.s[0] +sub v29.4s, v23.4s, v26.4s +ldr q16, [x0, #336] +mul v18.4S, v18.4S,v3.s[0] +add v23.4s, v23.4s, v26.4s +ldr q26, [x0, #464] +mla v30.4S, v4.4S, v31.s[0] +mla v18.4S, v5.4S, v31.s[0] +sub v5.4s, v16.4s, v17.4s +sqrdmulh v4.4S, v25.4S, v24.s[1] +add v16.4s, v16.4s, v17.4s +mul v25.4S, v25.4S,v20.s[1] +sqrdmulh v17.4S, v14.4S, v24.s[2] +sub v0.4s, v13.4s, v27.4s +mul v14.4S, v14.4S,v20.s[2] +add v13.4s, v13.4s, v27.4s +sqrdmulh v24.4S, v16.4S, v2.s[1] +sub v20.4s, v22.4s, v28.4s +mul v16.4S, v16.4S,v10.s[1] +add v22.4s, v22.4s, v28.4s +sqrdmulh v28.4S, v5.4S, v2.s[2] +sub v27.4s, v6.4s, v30.4s +mul v5.4S, v5.4S,v10.s[2] +add v6.4s, v6.4s, v30.4s +mla v25.4S, v4.4S, v31.s[0] +sub v4.4s, v26.4s, v18.4s +ldr q2, [x0, #736] +sqrdmulh v10.4S, v22.4S, v21.s[1] 
+add v26.4s, v26.4s, v18.4s +mla v14.4S, v17.4S, v31.s[0] +ldr q17, [x0, #672] +sqrdmulh v18.4S, v20.4S, v21.s[2] +sub v30.4s, v1.4s, v25.4s +mla v16.4S, v24.4S, v31.s[0] +ldr q24, [x0, #544] +sqrdmulh v11.4S, v26.4S, v12.s[1] +add v1.4s, v1.4s, v25.4s +str q30, [x0, #272] +mla v5.4S, v28.4S, v31.s[0] +ldr q28, [x17, #+384] +sqrdmulh v30.4S, v4.4S, v12.s[2] +sub v25.4s, v15.4s, v14.4s +str q1, [x0, #256] +mul v22.4S, v22.4S,v19.s[1] +add v15.4s, v15.4s, v14.4s +ldr q14, [x17, #+400] +mul v20.4S, v20.4S,v19.s[2] +str q25, [x0, #304] +mla v22.4S, v10.4S, v31.s[0] +sub v10.4s, v23.4s, v16.4s +mla v20.4S, v18.4S, v31.s[0] +str q15, [x0, #288] +mul v26.4S, v26.4S,v3.s[1] +str q10, [x0, #336] +mul v4.4S, v4.4S,v3.s[2] +add v23.4s, v23.4s, v16.4s +str q23, [x0, #320] +mla v26.4S, v11.4S, v31.s[0] +sub v11.4s, v29.4s, v5.4s +str q11, [x0, #368] +mla v4.4S, v30.4S, v31.s[0] +add v29.4s, v29.4s, v5.4s +str q29, [x0, #352] +sqrdmulh v12.4S, v24.4S, v14.s[0] +sub v3.4s, v13.4s, v22.4s +mul v24.4S, v24.4S,v28.s[0] +str q3, [x0, #400] +ldr q3, [x0, #560] +sqrdmulh v29.4S, v3.4S, v14.s[0] +add v13.4s, v13.4s, v22.4s +ldr q22, [x17, #+416] +mul v3.4S, v3.4S,v28.s[0] +str q13, [x0, #384] +ldr q13, [x17, #+432] +ldr q5, [x0, #608] +sqrdmulh v30.4S, v5.4S, v13.s[0] +sub v11.4s, v0.4s, v20.4s +mul v5.4S, v5.4S,v22.s[0] +str q11, [x0, #432] +ldr q11, [x0, #624] +sqrdmulh v23.4S, v11.4S, v13.s[0] +add v0.4s, v0.4s, v20.4s +mul v11.4S, v11.4S,v22.s[0] +str q0, [x0, #416] +ldr q0, [x17, #+448] +mla v24.4S, v12.4S, v31.s[0] +sub v12.4s, v6.4s, v26.4s +ldr q20, [x17, #+464] +sqrdmulh v16.4S, v17.4S, v20.s[0] +str q12, [x0, #464] +ldr q12, [x0, #688] +mla v3.4S, v29.4S, v31.s[0] +add v6.4s, v6.4s, v26.4s +sqrdmulh v26.4S, v12.4S, v20.s[0] +str q6, [x0, #448] +ldr q6, [x17, #+480] +mla v5.4S, v30.4S, v31.s[0] +sub v30.4s, v27.4s, v4.4s +ldr q29, [x17, #+496] +sqrdmulh v10.4S, v2.4S, v29.s[0] +str q30, [x0, #496] +ldr q30, [x0, #752] +mla v11.4S, v23.4S, v31.s[0] +add v27.4s, v27.4s, v4.4s 
+sqrdmulh v4.4S, v30.4S, v29.s[0] +str q27, [x0, #480] +ldr q27, [x0, #512] +ldr q23, [x0, #640] +mul v17.4S, v17.4S,v0.s[0] +sub v21.4s, v27.4s, v24.4s +ldr q19, [x0, #528] +mul v12.4S, v12.4S,v0.s[0] +add v27.4s, v27.4s, v24.4s +ldr q24, [x0, #656] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v19.4s, v3.4s +ldr q15, [x0, #576] +mla v12.4S, v26.4S, v31.s[0] +add v19.4s, v19.4s, v3.4s +ldr q3, [x0, #704] +mul v2.4S, v2.4S,v6.s[0] +sub v26.4s, v15.4s, v5.4s +ldr q18, [x0, #592] +mul v30.4S, v30.4S,v6.s[0] +add v15.4s, v15.4s, v5.4s +ldr q5, [x0, #720] +mla v2.4S, v10.4S, v31.s[0] +mla v30.4S, v4.4S, v31.s[0] +sub v4.4s, v18.4s, v11.4s +sqrdmulh v10.4S, v19.4S, v14.s[1] +add v18.4s, v18.4s, v11.4s +mul v19.4S, v19.4S,v28.s[1] +sqrdmulh v11.4S, v16.4S, v14.s[2] +sub v25.4s, v23.4s, v17.4s +mul v16.4S, v16.4S,v28.s[2] +add v23.4s, v23.4s, v17.4s +sqrdmulh v14.4S, v18.4S, v13.s[1] +sub v28.4s, v24.4s, v12.4s +mul v18.4S, v18.4S,v22.s[1] +add v24.4s, v24.4s, v12.4s +sqrdmulh v12.4S, v4.4S, v13.s[2] +sub v17.4s, v3.4s, v2.4s +mul v4.4S, v4.4S,v22.s[2] +add v3.4s, v3.4s, v2.4s +mla v19.4S, v10.4S, v31.s[0] +sub v10.4s, v5.4s, v30.4s +ldr q13, [x0, #992] +sqrdmulh v22.4S, v24.4S, v20.s[1] +add v5.4s, v5.4s, v30.4s +mla v16.4S, v11.4S, v31.s[0] +ldr q11, [x0, #928] +sqrdmulh v30.4S, v28.4S, v20.s[2] +sub v2.4s, v27.4s, v19.4s +mla v18.4S, v14.4S, v31.s[0] +ldr q14, [x0, #800] +sqrdmulh v1.4S, v5.4S, v29.s[1] +add v27.4s, v27.4s, v19.4s +str q2, [x0, #528] +mla v4.4S, v12.4S, v31.s[0] +ldr q12, [x17, #+512] +sqrdmulh v2.4S, v10.4S, v29.s[2] +sub v19.4s, v21.4s, v16.4s +str q27, [x0, #512] +mul v24.4S, v24.4S,v0.s[1] +add v21.4s, v21.4s, v16.4s +ldr q16, [x17, #+528] +mul v28.4S, v28.4S,v0.s[2] +str q19, [x0, #560] +mla v24.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v18.4s +mla v28.4S, v30.4S, v31.s[0] +str q21, [x0, #544] +mul v5.4S, v5.4S,v6.s[1] +str q22, [x0, #592] +mul v10.4S, v10.4S,v6.s[2] +add v15.4s, v15.4s, v18.4s +str q15, [x0, #576] +mla v5.4S, v1.4S, v31.s[0] +sub 
v1.4s, v26.4s, v4.4s +str q1, [x0, #624] +mla v10.4S, v2.4S, v31.s[0] +add v26.4s, v26.4s, v4.4s +str q26, [x0, #608] +sqrdmulh v29.4S, v14.4S, v16.s[0] +sub v6.4s, v23.4s, v24.4s +mul v14.4S, v14.4S,v12.s[0] +str q6, [x0, #656] +ldr q6, [x0, #816] +sqrdmulh v26.4S, v6.4S, v16.s[0] +add v23.4s, v23.4s, v24.4s +ldr q24, [x17, #+544] +mul v6.4S, v6.4S,v12.s[0] +str q23, [x0, #640] +ldr q23, [x17, #+560] +ldr q4, [x0, #864] +sqrdmulh v2.4S, v4.4S, v23.s[0] +sub v1.4s, v25.4s, v28.4s +mul v4.4S, v4.4S,v24.s[0] +str q1, [x0, #688] +ldr q1, [x0, #880] +sqrdmulh v15.4S, v1.4S, v23.s[0] +add v25.4s, v25.4s, v28.4s +mul v1.4S, v1.4S,v24.s[0] +str q25, [x0, #672] +ldr q25, [x17, #+576] +mla v14.4S, v29.4S, v31.s[0] +sub v29.4s, v3.4s, v5.4s +ldr q28, [x17, #+592] +sqrdmulh v18.4S, v11.4S, v28.s[0] +str q29, [x0, #720] +ldr q29, [x0, #944] +mla v6.4S, v26.4S, v31.s[0] +add v3.4s, v3.4s, v5.4s +sqrdmulh v5.4S, v29.4S, v28.s[0] +str q3, [x0, #704] +ldr q3, [x17, #+608] +mla v4.4S, v2.4S, v31.s[0] +sub v2.4s, v17.4s, v10.4s +ldr q26, [x17, #+624] +sqrdmulh v22.4S, v13.4S, v26.s[0] +str q2, [x0, #752] +ldr q2, [x0, #1008] +mla v1.4S, v15.4S, v31.s[0] +add v17.4s, v17.4s, v10.4s +sqrdmulh v10.4S, v2.4S, v26.s[0] +str q17, [x0, #736] +ldr q17, [x0, #768] +ldr q15, [x0, #896] +mul v11.4S, v11.4S,v25.s[0] +sub v20.4s, v17.4s, v14.4s +ldr q0, [x0, #784] +mul v29.4S, v29.4S,v25.s[0] +add v17.4s, v17.4s, v14.4s +ldr q14, [x0, #912] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v0.4s, v6.4s +ldr q21, [x0, #832] +mla v29.4S, v5.4S, v31.s[0] +add v0.4s, v0.4s, v6.4s +ldr q6, [x0, #960] +mul v13.4S, v13.4S,v3.s[0] +sub v5.4s, v21.4s, v4.4s +ldr q30, [x0, #848] +mul v2.4S, v2.4S,v3.s[0] +add v21.4s, v21.4s, v4.4s +ldr q4, [x0, #976] +mla v13.4S, v22.4S, v31.s[0] +mla v2.4S, v10.4S, v31.s[0] +sub v10.4s, v30.4s, v1.4s +sqrdmulh v22.4S, v0.4S, v16.s[1] +add v30.4s, v30.4s, v1.4s +mul v0.4S, v0.4S,v12.s[1] +sqrdmulh v1.4S, v18.4S, v16.s[2] +sub v19.4s, v15.4s, v11.4s +mul v18.4S, v18.4S,v12.s[2] 
+add v15.4s, v15.4s, v11.4s +sqrdmulh v16.4S, v30.4S, v23.s[1] +sub v12.4s, v14.4s, v29.4s +mul v30.4S, v30.4S,v24.s[1] +add v14.4s, v14.4s, v29.4s +sqrdmulh v29.4S, v10.4S, v23.s[2] +sub v11.4s, v6.4s, v13.4s +mul v10.4S, v10.4S,v24.s[2] +add v6.4s, v6.4s, v13.4s +mla v0.4S, v22.4S, v31.s[0] +sub v22.4s, v4.4s, v2.4s +sqrdmulh v23.4S, v14.4S, v28.s[1] +add v4.4s, v4.4s, v2.4s +mla v18.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v12.4S, v28.s[2] +sub v2.4s, v17.4s, v0.4s +mla v30.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v4.4S, v26.s[1] +add v17.4s, v17.4s, v0.4s +str q2, [x0, #784] +mla v10.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v22.4S, v26.s[2] +sub v2.4s, v20.4s, v18.4s +str q17, [x0, #768] +mul v14.4S, v14.4S,v25.s[1] +add v20.4s, v20.4s, v18.4s +mul v12.4S, v12.4S,v25.s[2] +str q2, [x0, #816] +mla v14.4S, v23.4S, v31.s[0] +sub v23.4s, v21.4s, v30.4s +mla v12.4S, v1.4S, v31.s[0] +str q20, [x0, #800] +mul v4.4S, v4.4S,v3.s[1] +str q23, [x0, #848] +mul v22.4S, v22.4S,v3.s[2] +add v21.4s, v21.4s, v30.4s +str q21, [x0, #832] +mla v4.4S, v16.4S, v31.s[0] +sub v16.4s, v5.4s, v10.4s +str q16, [x0, #880] +mla v22.4S, v29.4S, v31.s[0] +add v5.4s, v5.4s, v10.4s +str q5, [x0, #864] +sub v26.4s, v15.4s, v14.4s +str q26, [x0, #912] +add v15.4s, v15.4s, v14.4s +str q15, [x0, #896] +sub v15.4s, v19.4s, v12.4s +str q15, [x0, #944] +add v19.4s, v19.4s, v12.4s +str q19, [x0, #928] +sub v19.4s, v6.4s, v4.4s +str q19, [x0, #976] +add v6.4s, v6.4s, v4.4s +str q6, [x0, #960] +sub v6.4s, v11.4s, v22.4s +str q6, [x0, #1008] +add v11.4s, v11.4s, v22.4s +str q11, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at 
end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_9.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_9.s new file mode 100644 index 0000000..24f5a8a --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_9.s @@ -0,0 +1,1502 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global 
ntt_u32_incomplete_neon_asm_var_4_2_7_z4_9 +.global _ntt_u32_incomplete_neon_asm_var_4_2_7_z4_9 +ntt_u32_incomplete_neon_asm_var_4_2_7_z4_9: +_ntt_u32_incomplete_neon_asm_var_4_2_7_z4_9: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #928] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #992] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q18, [x0, #800] +sqrdmulh v17.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +ldr q16, [x0, #864] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v22.4S, v21.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +mla v18.4S, v17.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +ldr q3, [x0, #544] +sqrdmulh v17.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q19, [x0, #608] +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q2, [x0, #672] +ldr q1, [x0, #416] +sqrdmulh v0.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v15.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #736] +ldr q14, [x0, #480] +sqrdmulh v13.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v12.4s, v14.4s, v20.4s +add v14.4s, v14.4s, v20.4s +ldr q20, [x0, #288] +mla v3.4S, v17.4S, v31.s[0] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v18.4s +mla v2.4S, v0.4S, v31.s[0] +mla v22.4S, v13.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +ldr q18, [x0, #352] +sqrdmulh v13.4S, v1.4S, 
v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v0.4s, v18.4s, v16.4s +sqrdmulh v17.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +add v18.4s, v18.4s, v16.4s +ldr q16, [x0, #32] +sqrdmulh v11.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v10.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #96] +sqrdmulh v9.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v8.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #160] +mla v1.4S, v13.4S, v31.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v2.4s +mla v20.4S, v11.4S, v31.s[0] +mla v18.4S, v9.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +ldr q2, [x0, #224] +sqrdmulh v9.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v11.4s, v2.4s, v22.4s +sqrdmulh v13.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v7.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +sub v6.4s, v2.4s, v14.4s +add v2.4s, v2.4s, v14.4s +mla v15.4S, v9.4S, v31.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v20.4s +mla v21.4S, v22.4S, v31.s[0] +mla v0.4S, v1.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +sub v1.4s, v3.4s, v18.4s +sqrdmulh v22.4S, v6.4S, v27.s[1] +mul v6.4S, v6.4S,v28.s[1] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v27.s[0] +mul v19.4S, v19.4S,v28.s[0] +sub v9.4s, v17.4s, v15.4s +add v17.4s, v17.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v14.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +mla v7.4S, v20.4S, v31.s[0] +mla v6.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v21.4s +mla v19.4S, v18.4S, v31.s[0] +mla v2.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v15.4s, v8.4s, v0.4s +sqrdmulh v18.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +add v8.4s, v8.4s, v0.4s +sqrdmulh 
v0.4S, v9.4S, v27.s[3] +mul v9.4S, v9.4S,v28.s[3] +sub v20.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[3] +mul v14.4S, v14.4S,v28.s[3] +sub v12.4s, v1.4s, v6.4s +add v1.4s, v1.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v16.4s, v19.4s +mla v9.4S, v0.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v25.s[2] +mul v1.4S, v1.4S,v26.s[2] +sub v7.4s, v3.4s, v2.4s +sqrdmulh v0.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v7.4S, v25.s[1] +mul v7.4S, v7.4S,v26.s[1] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v25.s[0] +mul v3.4S, v3.4S,v26.s[0] +sub v6.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v1.4S, v19.4S, v31.s[0] +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v9.4s +mla v7.4S, v2.4S, v31.s[0] +mla v3.4S, v17.4S, v31.s[0] +add v22.4s, v22.4s, v9.4s +sqrdmulh v9.4S, v8.4S, v23.s[0] +mul v8.4S, v8.4S,v24.s[0] +sub v17.4s, v15.4s, v14.4s +sqrdmulh v2.4S, v6.4S, v23.s[1] +mul v6.4S, v6.4S,v24.s[1] +add v15.4s, v15.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v23.s[2] +mul v15.4S, v15.4S,v24.s[2] +sub v19.4s, v13.4s, v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +sub v11.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +mla v8.4S, v9.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v18.4s, v7.4s +str q13, [x0, #288] +mla v15.4S, v14.4S, v31.s[0] +mla v17.4S, v1.4S, v31.s[0] +add v18.4s, v18.4s, v7.4s +str q19, [x0, #352] +ldr q19, [x0, #944] +sqrdmulh v7.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v1.4s, v16.4s, v3.4s +str q20, [x0, #416] +ldr q20, [x0, #1008] +sqrdmulh v14.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v3.4s +str q11, [x0, #480] +ldr q11, [x0, #816] +sqrdmulh v3.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +sub v13.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +ldr q8, 
[x0, #880] +sqrdmulh v9.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v12.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +mla v19.4S, v7.4S, v31.s[0] +mla v20.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v15.4s +str q18, [x0, #160] +mla v11.4S, v3.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +str q2, [x0, #224] +ldr q2, [x0, #560] +sqrdmulh v15.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v9.4s, v0.4s, v17.4s +str q16, [x0, #32] +ldr q16, [x0, #624] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +add v0.4s, v0.4s, v17.4s +str q1, [x0, #96] +ldr q1, [x0, #688] +ldr q17, [x0, #432] +sqrdmulh v18.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +sub v7.4s, v17.4s, v19.4s +add v17.4s, v17.4s, v19.4s +ldr q19, [x0, #752] +ldr q6, [x0, #496] +sqrdmulh v5.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v4.4s, v6.4s, v20.4s +add v6.4s, v6.4s, v20.4s +ldr q20, [x0, #304] +mla v2.4S, v15.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v11.4s +str q10, [x0, #544] +mla v1.4S, v18.4S, v31.s[0] +mla v19.4S, v5.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q13, [x0, #608] +ldr q13, [x0, #368] +sqrdmulh v11.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v5.4s, v13.4s, v8.4s +str q21, [x0, #672] +sqrdmulh v21.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +add v13.4s, v13.4s, v8.4s +str q12, [x0, #736] +ldr q12, [x0, #48] +sqrdmulh v8.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v18.4s, v12.4s, v2.4s +add v12.4s, v12.4s, v2.4s +ldr q2, [x0, #112] +sqrdmulh v10.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v15.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +ldr q16, [x0, #176] +mla v17.4S, v11.4S, v31.s[0] +mla v6.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v1.4s +str q22, [x0, #800] +mla v20.4S, v8.4S, v31.s[0] +mla v13.4S, v10.4S, v31.s[0] +add v16.4s, v16.4s, v1.4s +str q14, [x0, #864] +ldr q14, [x0, #240] +sqrdmulh v1.4S, v7.4S, v29.s[2] +mul v7.4S, v7.4S,v30.s[2] +sub v10.4s, v14.4s, v19.4s 
+str q0, [x0, #928] +sqrdmulh v0.4S, v4.4S, v29.s[2] +mul v4.4S, v4.4S,v30.s[2] +add v14.4s, v14.4s, v19.4s +str q9, [x0, #992] +sqrdmulh v9.4S, v3.4S, v29.s[2] +mul v3.4S, v3.4S,v30.s[2] +sub v19.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v29.s[2] +mul v5.4S, v5.4S,v30.s[2] +sub v8.4s, v14.4s, v6.4s +add v14.4s, v14.4s, v6.4s +mla v7.4S, v1.4S, v31.s[0] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v20.4s +mla v3.4S, v9.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +sub v17.4s, v2.4s, v13.4s +sqrdmulh v9.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +add v2.4s, v2.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v27.s[0] +mul v16.4S, v16.4S,v28.s[0] +sub v1.4s, v21.4s, v7.4s +add v21.4s, v21.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v6.4s, v10.4s, v4.4s +add v10.4s, v10.4s, v4.4s +mla v19.4S, v20.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v3.4s +mla v16.4S, v13.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +sub v7.4s, v15.4s, v5.4s +sqrdmulh v13.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +add v15.4s, v15.4s, v5.4s +sqrdmulh v5.4S, v1.4S, v27.s[3] +mul v1.4S, v1.4S,v28.s[3] +sub v20.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[3] +mul v6.4S, v6.4S,v28.s[3] +sub v4.4s, v17.4s, v8.4s +add v17.4s, v17.4s, v8.4s +mla v21.4S, v3.4S, v31.s[0] +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v16.4s +mla v1.4S, v5.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v25.s[2] +mul v17.4S, v17.4S,v26.s[2] +sub v19.4s, v2.4s, v14.4s +sqrdmulh v5.4S, v4.4S, v25.s[3] +mul v4.4S, v4.4S,v26.s[3] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v3.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, 
v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +mla v17.4S, v16.4S, v31.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v9.4s, v1.4s +mla v19.4S, v14.4S, v31.s[0] +mla v2.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v23.s[0] +mul v15.4S, v15.4S,v24.s[0] +sub v21.4s, v7.4s, v6.4s +sqrdmulh v14.4S, v8.4S, v23.s[1] +mul v8.4S, v8.4S,v24.s[1] +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v7.4S, v23.s[2] +mul v7.4S, v7.4S,v24.s[2] +sub v16.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v23.s[3] +mul v21.4S, v21.4S,v24.s[3] +sub v10.4s, v20.4s, v4.4s +add v20.4s, v20.4s, v4.4s +mla v15.4S, v1.4S, v31.s[0] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v19.4s +str q0, [x0, #304] +mla v7.4S, v6.4S, v31.s[0] +mla v21.4S, v17.4S, v31.s[0] +add v13.4s, v13.4s, v19.4s +str q16, [x0, #368] +ldr q16, [x0, #896] +sqrdmulh v19.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v17.4s, v12.4s, v2.4s +str q20, [x0, #432] +ldr q20, [x0, #960] +sqrdmulh v6.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v12.4s, v12.4s, v2.4s +str q10, [x0, #496] +ldr q10, [x0, #768] +sqrdmulh v2.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +sub v0.4s, v18.4s, v15.4s +add v18.4s, v18.4s, v15.4s +ldr q15, [x0, #832] +sqrdmulh v1.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +sub v4.4s, v3.4s, v8.4s +add v3.4s, v3.4s, v8.4s +mla v16.4S, v19.4S, v31.s[0] +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v7.4s +str q13, [x0, #176] +mla v10.4S, v2.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v7.4s +str q14, [x0, #240] +ldr q14, [x0, #512] +sqrdmulh v7.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v1.4s, v5.4s, v21.4s +str q12, [x0, #48] +ldr q12, [x0, #576] +sqrdmulh v2.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +add v5.4s, v5.4s, v21.4s +str q17, [x0, #112] +ldr q17, [x0, #640] +ldr q21, [x0, #384] +sqrdmulh v13.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] 
+sub v19.4s, v21.4s, v16.4s +add v21.4s, v21.4s, v16.4s +ldr q16, [x0, #704] +ldr q8, [x0, #448] +sqrdmulh v22.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v11.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +ldr q20, [x0, #256] +mla v14.4S, v7.4S, v31.s[0] +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v10.4s +str q18, [x0, #560] +mla v17.4S, v13.4S, v31.s[0] +mla v16.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +str q0, [x0, #624] +ldr q0, [x0, #320] +sqrdmulh v10.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v22.4s, v0.4s, v15.4s +str q3, [x0, #688] +sqrdmulh v3.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +add v0.4s, v0.4s, v15.4s +str q4, [x0, #752] +ldr q4, [x0, #0] +sqrdmulh v15.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v13.4s, v4.4s, v14.4s +add v4.4s, v4.4s, v14.4s +ldr q14, [x0, #64] +sqrdmulh v18.4S, v0.4S, v29.s[1] +mul v0.4S, v0.4S,v30.s[1] +sub v7.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +ldr q12, [x0, #128] +mla v21.4S, v10.4S, v31.s[0] +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v12.4s, v17.4s +str q9, [x0, #816] +mla v20.4S, v15.4S, v31.s[0] +mla v0.4S, v18.4S, v31.s[0] +add v12.4s, v12.4s, v17.4s +str q6, [x0, #880] +ldr q6, [x0, #192] +sqrdmulh v17.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v18.4s, v6.4s, v16.4s +str q5, [x0, #944] +sqrdmulh v5.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +add v6.4s, v6.4s, v16.4s +str q1, [x0, #1008] +sqrdmulh v1.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v16.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v15.4s, v6.4s, v8.4s +add v6.4s, v6.4s, v8.4s +mla v19.4S, v17.4S, v31.s[0] +mla v11.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v20.4s +mla v2.4S, v1.4S, v31.s[0] +mla v22.4S, v21.4S, v31.s[0] +add v4.4s, v4.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +sub v21.4s, v14.4s, v0.4s +sqrdmulh v1.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +add v14.4s, 
v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v17.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[0] +mul v6.4S, v6.4S,v28.s[0] +sub v8.4s, v18.4s, v11.4s +add v18.4s, v18.4s, v11.4s +mla v16.4S, v20.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v2.4s +mla v12.4S, v0.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v27.s[2] +mul v3.4S, v3.4S,v28.s[2] +sub v19.4s, v7.4s, v22.4s +sqrdmulh v0.4S, v18.4S, v27.s[2] +mul v18.4S, v18.4S,v28.s[2] +add v7.4s, v7.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +sub v20.4s, v5.4s, v16.4s +add v5.4s, v5.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v11.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +mla v3.4S, v2.4S, v31.s[0] +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v4.4s, v12.4s +mla v17.4S, v22.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v4.4s, v4.4s, v12.4s +sqrdmulh v12.4S, v21.4S, v25.s[2] +mul v21.4S, v21.4S,v26.s[2] +sub v16.4s, v14.4s, v6.4s +sqrdmulh v22.4S, v11.4S, v25.s[3] +mul v11.4S, v11.4S,v26.s[3] +add v14.4s, v14.4s, v6.4s +sqrdmulh v6.4S, v16.4S, v25.s[1] +mul v16.4S, v16.4S,v26.s[1] +sub v2.4s, v13.4s, v3.4s +add v13.4s, v13.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v25.s[0] +mul v14.4S, v14.4S,v26.s[0] +sub v15.4s, v7.4s, v18.4s +add v7.4s, v7.4s, v18.4s +mla v21.4S, v12.4S, v31.s[0] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v17.4s +mla v16.4S, v6.4S, v31.s[0] +mla v14.4S, v3.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v7.4S, v23.s[0] +mul v7.4S, v7.4S,v24.s[0] +sub v3.4s, v19.4s, v8.4s +sqrdmulh v6.4S, v15.4S, v23.s[1] +mul v15.4S, v15.4S,v24.s[1] +add v19.4s, v19.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v23.s[2] +mul v19.4S, v19.4S,v24.s[2] +sub v12.4s, v5.4s, v21.4s +add v5.4s, v5.4s, v21.4s +sqrdmulh v21.4S, v3.4S, v23.s[3] +mul v3.4S, v3.4S,v24.s[3] +sub v18.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +mla 
v7.4S, v17.4S, v31.s[0] +mla v15.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v16.4s +str q5, [x0, #256] +mla v19.4S, v8.4S, v31.s[0] +mla v3.4S, v21.4S, v31.s[0] +add v0.4s, v0.4s, v16.4s +str q12, [x0, #320] +ldr q12, [x0, #912] +sqrdmulh v16.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v21.4s, v4.4s, v14.4s +str q20, [x0, #384] +ldr q20, [x0, #976] +sqrdmulh v8.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v4.4s, v4.4s, v14.4s +str q18, [x0, #448] +ldr q18, [x0, #784] +sqrdmulh v14.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +sub v5.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +ldr q7, [x0, #848] +sqrdmulh v17.4S, v7.4S, v29.s[0] +mul v7.4S, v7.4S,v30.s[0] +sub v11.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +mla v12.4S, v16.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v1.4s, v19.4s +str q0, [x0, #128] +mla v18.4S, v14.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v19.4s +str q6, [x0, #192] +ldr q6, [x0, #528] +sqrdmulh v19.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v17.4s, v22.4s, v3.4s +str q4, [x0, #0] +ldr q4, [x0, #592] +sqrdmulh v14.4S, v4.4S, v29.s[0] +mul v4.4S, v4.4S,v30.s[0] +add v22.4s, v22.4s, v3.4s +str q21, [x0, #64] +ldr q21, [x0, #656] +ldr q3, [x0, #400] +sqrdmulh v0.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v16.4s, v3.4s, v12.4s +add v3.4s, v3.4s, v12.4s +ldr q12, [x0, #720] +ldr q15, [x0, #464] +sqrdmulh v9.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v10.4s, v15.4s, v20.4s +add v15.4s, v15.4s, v20.4s +ldr q20, [x0, #272] +mla v6.4S, v19.4S, v31.s[0] +mla v4.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v18.4s +str q13, [x0, #512] +mla v21.4S, v0.4S, v31.s[0] +mla v12.4S, v9.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +str q5, [x0, #576] +ldr q5, [x0, #336] +sqrdmulh v18.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v9.4s, v5.4s, v7.4s +str q2, [x0, #640] +sqrdmulh v2.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +add v5.4s, v5.4s, v7.4s +str q11, [x0, #704] +ldr 
q11, [x0, #16] +sqrdmulh v7.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v0.4s, v11.4s, v6.4s +add v11.4s, v11.4s, v6.4s +ldr q6, [x0, #80] +sqrdmulh v13.4S, v5.4S, v29.s[1] +mul v5.4S, v5.4S,v30.s[1] +sub v19.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +ldr q4, [x0, #144] +mla v3.4S, v18.4S, v31.s[0] +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v4.4s, v21.4s +str q1, [x0, #768] +mla v20.4S, v7.4S, v31.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v4.4s, v4.4s, v21.4s +str q8, [x0, #832] +ldr q8, [x0, #208] +sqrdmulh v21.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +sub v13.4s, v8.4s, v12.4s +str q22, [x0, #896] +sqrdmulh v22.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +add v8.4s, v8.4s, v12.4s +str q17, [x0, #960] +sqrdmulh v17.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v12.4s, v4.4s, v3.4s +add v4.4s, v4.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v7.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +mla v16.4S, v21.4S, v31.s[0] +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v20.4s +mla v14.4S, v17.4S, v31.s[0] +mla v9.4S, v3.4S, v31.s[0] +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v12.4S, v27.s[1] +mul v12.4S, v12.4S,v28.s[1] +sub v3.4s, v6.4s, v5.4s +sqrdmulh v17.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v27.s[0] +mul v4.4S, v4.4S,v28.s[0] +sub v21.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[0] +mul v8.4S, v8.4S,v28.s[0] +sub v15.4s, v13.4s, v10.4s +add v13.4s, v13.4s, v10.4s +mla v12.4S, v20.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v14.4s +mla v4.4S, v5.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v16.4s, v19.4s, v9.4s +sqrdmulh v5.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v22.4s, v12.4s +add v22.4s, v22.4s, 
v12.4s +sqrdmulh v12.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v10.4s, v3.4s, v7.4s +add v3.4s, v3.4s, v7.4s +mla v2.4S, v14.4S, v31.s[0] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v4.4s +mla v21.4S, v9.4S, v31.s[0] +mla v15.4S, v12.4S, v31.s[0] +add v11.4s, v11.4s, v4.4s +sqrdmulh v4.4S, v3.4S, v25.s[2] +mul v3.4S, v3.4S,v26.s[2] +sub v12.4s, v6.4s, v8.4s +sqrdmulh v9.4S, v10.4S, v25.s[3] +mul v10.4S, v10.4S,v26.s[3] +add v6.4s, v6.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +sub v14.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v6.4S, v25.s[0] +mul v6.4S, v6.4S,v26.s[0] +sub v7.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +mla v3.4S, v4.4S, v31.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v21.4s +mla v12.4S, v8.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v23.s[0] +mul v19.4S, v19.4S,v24.s[0] +sub v2.4s, v16.4s, v15.4s +sqrdmulh v8.4S, v7.4S, v23.s[1] +mul v7.4S, v7.4S,v24.s[1] +add v16.4s, v16.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +sub v4.4s, v22.4s, v3.4s +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v2.4S, v23.s[3] +mul v2.4S, v2.4S,v24.s[3] +sub v13.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +mla v19.4S, v21.4S, v31.s[0] +mla v7.4S, v8.4S, v31.s[0] +sub v8.4s, v5.4s, v12.4s +str q22, [x0, #272] +mla v16.4S, v15.4S, v31.s[0] +mla v2.4S, v3.4S, v31.s[0] +add v5.4s, v5.4s, v12.4s +str q4, [x0, #336] +sub v23.4s, v11.4s, v6.4s +str q20, [x0, #400] +add v11.4s, v11.4s, v6.4s +str q13, [x0, #464] +sub v13.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sub v19.4s, v14.4s, v7.4s +add v14.4s, v14.4s, v7.4s +sub v7.4s, v17.4s, v16.4s +str q5, [x0, #144] +add v17.4s, v17.4s, v16.4s +str q8, [x0, #208] +sub v8.4s, v9.4s, v2.4s +str q11, [x0, #16] +add v9.4s, v9.4s, v2.4s +str q23, [x0, #80] +str q0, [x0, #528] +str q13, [x0, #592] +str q14, [x0, #656] +str q19, [x0, #720] +str q17, [x0, #784] +str q7, [x0, #848] 
+str q9, [x0, #912] +str q8, [x0, #976] +ldr q18, [x0, #224] +ldr q1, [x0, #160] +ldr q10, [x0, #32] +ldr q21, [x17, #+128] +ldr q22, [x17, #+144] +sqrdmulh v15.4S, v10.4S, v22.s[0] +mul v10.4S, v10.4S,v21.s[0] +ldr q3, [x0, #48] +sqrdmulh v12.4S, v3.4S, v22.s[0] +mul v3.4S, v3.4S,v21.s[0] +ldr q4, [x17, #+160] +ldr q30, [x17, #+176] +ldr q29, [x0, #96] +sqrdmulh v28.4S, v29.4S, v30.s[0] +mul v29.4S, v29.4S,v4.s[0] +ldr q27, [x0, #112] +sqrdmulh v26.4S, v27.4S, v30.s[0] +mul v27.4S, v27.4S,v4.s[0] +ldr q25, [x17, #+192] +ldr q24, [x17, #+208] +mla v10.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v1.4S, v24.s[0] +ldr q20, [x0, #176] +mla v3.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v20.4S, v24.s[0] +ldr q6, [x17, #+224] +ldr q5, [x17, #+240] +mla v29.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v18.4S, v5.s[0] +ldr q16, [x0, #240] +mla v27.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v16.4S, v5.s[0] +ldr q11, [x0, #0] +ldr q2, [x0, #128] +mul v1.4S, v1.4S,v25.s[0] +sub v23.4s, v11.4s, v10.4s +ldr q0, [x0, #16] +mul v20.4S, v20.4S,v25.s[0] +add v11.4s, v11.4s, v10.4s +ldr q10, [x0, #144] +mla v1.4S, v15.4S, v31.s[0] +sub v15.4s, v0.4s, v3.4s +ldr q13, [x0, #64] +mla v20.4S, v12.4S, v31.s[0] +add v0.4s, v0.4s, v3.4s +ldr q3, [x0, #192] +mul v18.4S, v18.4S,v6.s[0] +sub v12.4s, v13.4s, v29.4s +ldr q14, [x0, #80] +mul v16.4S, v16.4S,v6.s[0] +add v13.4s, v13.4s, v29.4s +ldr q29, [x0, #208] +mla v18.4S, v28.4S, v31.s[0] +nop +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v14.4s, v27.4s +sqrdmulh v28.4S, v0.4S, v22.s[1] +add v14.4s, v14.4s, v27.4s +mul v0.4S, v0.4S,v21.s[1] +nop +sqrdmulh v27.4S, v15.4S, v22.s[2] +sub v19.4s, v2.4s, v1.4s +mul v15.4S, v15.4S,v21.s[2] +add v2.4s, v2.4s, v1.4s +sqrdmulh v22.4S, v14.4S, v30.s[1] +sub v21.4s, v10.4s, v20.4s +mul v14.4S, v14.4S,v4.s[1] +add v10.4s, v10.4s, v20.4s +sqrdmulh v20.4S, v26.4S, v30.s[2] +sub v1.4s, v3.4s, v18.4s +mul v26.4S, v26.4S,v4.s[2] +add v3.4s, v3.4s, v18.4s +mla v0.4S, v28.4S, v31.s[0] +sub v28.4s, v29.4s, v16.4s +ldr q30, [x0, #480] 
+sqrdmulh v4.4S, v10.4S, v24.s[1] +add v29.4s, v29.4s, v16.4s +mla v15.4S, v27.4S, v31.s[0] +ldr q27, [x0, #416] +sqrdmulh v16.4S, v21.4S, v24.s[2] +sub v18.4s, v11.4s, v0.4s +mla v14.4S, v22.4S, v31.s[0] +ldr q22, [x0, #288] +sqrdmulh v17.4S, v29.4S, v5.s[1] +add v11.4s, v11.4s, v0.4s +str q18, [x0, #16] +mla v26.4S, v20.4S, v31.s[0] +ldr q20, [x17, #+256] +ldr q18, [x17, #+272] +sqrdmulh v0.4S, v28.4S, v5.s[2] +sub v7.4s, v23.4s, v15.4s +str q11, [x0, #0] +mul v10.4S, v10.4S,v25.s[1] +add v23.4s, v23.4s, v15.4s +mul v21.4S, v21.4S,v25.s[2] +str q7, [x0, #48] +mla v10.4S, v4.4S, v31.s[0] +sub v4.4s, v13.4s, v14.4s +mla v21.4S, v16.4S, v31.s[0] +str q23, [x0, #32] +mul v29.4S, v29.4S,v6.s[1] +str q4, [x0, #80] +mul v28.4S, v28.4S,v6.s[2] +add v13.4s, v13.4s, v14.4s +str q13, [x0, #64] +mla v29.4S, v17.4S, v31.s[0] +sub v17.4s, v12.4s, v26.4s +str q17, [x0, #112] +mla v28.4S, v0.4S, v31.s[0] +add v12.4s, v12.4s, v26.4s +str q12, [x0, #96] +sqrdmulh v5.4S, v22.4S, v18.s[0] +sub v6.4s, v2.4s, v10.4s +mul v22.4S, v22.4S,v20.s[0] +str q6, [x0, #144] +ldr q6, [x0, #304] +sqrdmulh v12.4S, v6.4S, v18.s[0] +add v2.4s, v2.4s, v10.4s +mul v6.4S, v6.4S,v20.s[0] +str q2, [x0, #128] +ldr q2, [x17, #+288] +ldr q10, [x17, #+304] +ldr q26, [x0, #352] +sqrdmulh v0.4S, v26.4S, v10.s[0] +sub v17.4s, v19.4s, v21.4s +mul v26.4S, v26.4S,v2.s[0] +str q17, [x0, #176] +ldr q17, [x0, #368] +sqrdmulh v13.4S, v17.4S, v10.s[0] +add v19.4s, v19.4s, v21.4s +mul v17.4S, v17.4S,v2.s[0] +str q19, [x0, #160] +ldr q19, [x17, #+320] +ldr q21, [x17, #+336] +mla v22.4S, v5.4S, v31.s[0] +sub v5.4s, v3.4s, v29.4s +sqrdmulh v14.4S, v27.4S, v21.s[0] +str q5, [x0, #208] +ldr q5, [x0, #432] +mla v6.4S, v12.4S, v31.s[0] +add v3.4s, v3.4s, v29.4s +sqrdmulh v29.4S, v5.4S, v21.s[0] +str q3, [x0, #192] +ldr q3, [x17, #+352] +ldr q12, [x17, #+368] +mla v26.4S, v0.4S, v31.s[0] +sub v0.4s, v1.4s, v28.4s +sqrdmulh v4.4S, v30.4S, v12.s[0] +str q0, [x0, #240] +ldr q0, [x0, #496] +mla v17.4S, v13.4S, v31.s[0] +add v1.4s, 
v1.4s, v28.4s +sqrdmulh v28.4S, v0.4S, v12.s[0] +str q1, [x0, #224] +ldr q1, [x0, #256] +ldr q13, [x0, #384] +mul v27.4S, v27.4S,v19.s[0] +sub v24.4s, v1.4s, v22.4s +ldr q25, [x0, #272] +mul v5.4S, v5.4S,v19.s[0] +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #400] +mla v27.4S, v14.4S, v31.s[0] +sub v14.4s, v25.4s, v6.4s +ldr q23, [x0, #320] +mla v5.4S, v29.4S, v31.s[0] +add v25.4s, v25.4s, v6.4s +ldr q6, [x0, #448] +mul v30.4S, v30.4S,v3.s[0] +sub v29.4s, v23.4s, v26.4s +ldr q16, [x0, #336] +mul v0.4S, v0.4S,v3.s[0] +add v23.4s, v23.4s, v26.4s +ldr q26, [x0, #464] +mla v30.4S, v4.4S, v31.s[0] +nop +mla v0.4S, v28.4S, v31.s[0] +sub v28.4s, v16.4s, v17.4s +sqrdmulh v4.4S, v25.4S, v18.s[1] +add v16.4s, v16.4s, v17.4s +mul v25.4S, v25.4S,v20.s[1] +nop +sqrdmulh v17.4S, v14.4S, v18.s[2] +sub v7.4s, v13.4s, v27.4s +mul v14.4S, v14.4S,v20.s[2] +add v13.4s, v13.4s, v27.4s +sqrdmulh v18.4S, v16.4S, v10.s[1] +sub v20.4s, v22.4s, v5.4s +mul v16.4S, v16.4S,v2.s[1] +add v22.4s, v22.4s, v5.4s +sqrdmulh v5.4S, v28.4S, v10.s[2] +sub v27.4s, v6.4s, v30.4s +mul v28.4S, v28.4S,v2.s[2] +add v6.4s, v6.4s, v30.4s +mla v25.4S, v4.4S, v31.s[0] +sub v4.4s, v26.4s, v0.4s +ldr q10, [x0, #736] +sqrdmulh v2.4S, v22.4S, v21.s[1] +add v26.4s, v26.4s, v0.4s +mla v14.4S, v17.4S, v31.s[0] +ldr q17, [x0, #672] +sqrdmulh v0.4S, v20.4S, v21.s[2] +sub v30.4s, v1.4s, v25.4s +mla v16.4S, v18.4S, v31.s[0] +ldr q18, [x0, #544] +sqrdmulh v15.4S, v26.4S, v12.s[1] +add v1.4s, v1.4s, v25.4s +str q30, [x0, #272] +mla v28.4S, v5.4S, v31.s[0] +ldr q5, [x17, #+384] +ldr q30, [x17, #+400] +sqrdmulh v25.4S, v4.4S, v12.s[2] +sub v11.4s, v24.4s, v14.4s +str q1, [x0, #256] +mul v22.4S, v22.4S,v19.s[1] +add v24.4s, v24.4s, v14.4s +mul v20.4S, v20.4S,v19.s[2] +str q11, [x0, #304] +mla v22.4S, v2.4S, v31.s[0] +sub v2.4s, v23.4s, v16.4s +mla v20.4S, v0.4S, v31.s[0] +str q24, [x0, #288] +mul v26.4S, v26.4S,v3.s[1] +str q2, [x0, #336] +mul v4.4S, v4.4S,v3.s[2] +add v23.4s, v23.4s, v16.4s +str q23, [x0, #320] +mla v26.4S, v15.4S, 
v31.s[0] +sub v15.4s, v29.4s, v28.4s +str q15, [x0, #368] +mla v4.4S, v25.4S, v31.s[0] +add v29.4s, v29.4s, v28.4s +str q29, [x0, #352] +sqrdmulh v12.4S, v18.4S, v30.s[0] +sub v3.4s, v13.4s, v22.4s +mul v18.4S, v18.4S,v5.s[0] +str q3, [x0, #400] +ldr q3, [x0, #560] +sqrdmulh v29.4S, v3.4S, v30.s[0] +add v13.4s, v13.4s, v22.4s +mul v3.4S, v3.4S,v5.s[0] +str q13, [x0, #384] +ldr q13, [x17, #+416] +ldr q22, [x17, #+432] +ldr q28, [x0, #608] +sqrdmulh v25.4S, v28.4S, v22.s[0] +sub v15.4s, v7.4s, v20.4s +mul v28.4S, v28.4S,v13.s[0] +str q15, [x0, #432] +ldr q15, [x0, #624] +sqrdmulh v23.4S, v15.4S, v22.s[0] +add v7.4s, v7.4s, v20.4s +mul v15.4S, v15.4S,v13.s[0] +str q7, [x0, #416] +ldr q7, [x17, #+448] +ldr q20, [x17, #+464] +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v6.4s, v26.4s +sqrdmulh v16.4S, v17.4S, v20.s[0] +str q12, [x0, #464] +ldr q12, [x0, #688] +mla v3.4S, v29.4S, v31.s[0] +add v6.4s, v6.4s, v26.4s +sqrdmulh v26.4S, v12.4S, v20.s[0] +str q6, [x0, #448] +ldr q6, [x17, #+480] +ldr q29, [x17, #+496] +mla v28.4S, v25.4S, v31.s[0] +sub v25.4s, v27.4s, v4.4s +sqrdmulh v2.4S, v10.4S, v29.s[0] +str q25, [x0, #496] +ldr q25, [x0, #752] +mla v15.4S, v23.4S, v31.s[0] +add v27.4s, v27.4s, v4.4s +sqrdmulh v4.4S, v25.4S, v29.s[0] +str q27, [x0, #480] +ldr q27, [x0, #512] +ldr q23, [x0, #640] +mul v17.4S, v17.4S,v7.s[0] +sub v21.4s, v27.4s, v18.4s +ldr q19, [x0, #528] +mul v12.4S, v12.4S,v7.s[0] +add v27.4s, v27.4s, v18.4s +ldr q18, [x0, #656] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v19.4s, v3.4s +ldr q24, [x0, #576] +mla v12.4S, v26.4S, v31.s[0] +add v19.4s, v19.4s, v3.4s +ldr q3, [x0, #704] +mul v10.4S, v10.4S,v6.s[0] +sub v26.4s, v24.4s, v28.4s +ldr q0, [x0, #592] +mul v25.4S, v25.4S,v6.s[0] +add v24.4s, v24.4s, v28.4s +ldr q28, [x0, #720] +mla v10.4S, v2.4S, v31.s[0] +nop +mla v25.4S, v4.4S, v31.s[0] +sub v4.4s, v0.4s, v15.4s +sqrdmulh v2.4S, v19.4S, v30.s[1] +add v0.4s, v0.4s, v15.4s +mul v19.4S, v19.4S,v5.s[1] +nop +sqrdmulh v15.4S, v16.4S, v30.s[2] +sub v11.4s, 
v23.4s, v17.4s +mul v16.4S, v16.4S,v5.s[2] +add v23.4s, v23.4s, v17.4s +sqrdmulh v30.4S, v0.4S, v22.s[1] +sub v5.4s, v18.4s, v12.4s +mul v0.4S, v0.4S,v13.s[1] +add v18.4s, v18.4s, v12.4s +sqrdmulh v12.4S, v4.4S, v22.s[2] +sub v17.4s, v3.4s, v10.4s +mul v4.4S, v4.4S,v13.s[2] +add v3.4s, v3.4s, v10.4s +mla v19.4S, v2.4S, v31.s[0] +sub v2.4s, v28.4s, v25.4s +ldr q22, [x0, #992] +sqrdmulh v13.4S, v18.4S, v20.s[1] +add v28.4s, v28.4s, v25.4s +mla v16.4S, v15.4S, v31.s[0] +ldr q15, [x0, #928] +sqrdmulh v25.4S, v5.4S, v20.s[2] +sub v10.4s, v27.4s, v19.4s +mla v0.4S, v30.4S, v31.s[0] +ldr q30, [x0, #800] +sqrdmulh v14.4S, v28.4S, v29.s[1] +add v27.4s, v27.4s, v19.4s +str q10, [x0, #528] +mla v4.4S, v12.4S, v31.s[0] +ldr q12, [x17, #+512] +ldr q10, [x17, #+528] +sqrdmulh v19.4S, v2.4S, v29.s[2] +sub v1.4s, v21.4s, v16.4s +str q27, [x0, #512] +mul v18.4S, v18.4S,v7.s[1] +add v21.4s, v21.4s, v16.4s +mul v5.4S, v5.4S,v7.s[2] +str q1, [x0, #560] +mla v18.4S, v13.4S, v31.s[0] +sub v13.4s, v24.4s, v0.4s +mla v5.4S, v25.4S, v31.s[0] +str q21, [x0, #544] +mul v28.4S, v28.4S,v6.s[1] +str q13, [x0, #592] +mul v2.4S, v2.4S,v6.s[2] +add v24.4s, v24.4s, v0.4s +str q24, [x0, #576] +mla v28.4S, v14.4S, v31.s[0] +sub v14.4s, v26.4s, v4.4s +str q14, [x0, #624] +mla v2.4S, v19.4S, v31.s[0] +add v26.4s, v26.4s, v4.4s +str q26, [x0, #608] +sqrdmulh v29.4S, v30.4S, v10.s[0] +sub v6.4s, v23.4s, v18.4s +mul v30.4S, v30.4S,v12.s[0] +str q6, [x0, #656] +ldr q6, [x0, #816] +sqrdmulh v26.4S, v6.4S, v10.s[0] +add v23.4s, v23.4s, v18.4s +mul v6.4S, v6.4S,v12.s[0] +str q23, [x0, #640] +ldr q23, [x17, #+544] +ldr q18, [x17, #+560] +ldr q4, [x0, #864] +sqrdmulh v19.4S, v4.4S, v18.s[0] +sub v14.4s, v11.4s, v5.4s +mul v4.4S, v4.4S,v23.s[0] +str q14, [x0, #688] +ldr q14, [x0, #880] +sqrdmulh v24.4S, v14.4S, v18.s[0] +add v11.4s, v11.4s, v5.4s +mul v14.4S, v14.4S,v23.s[0] +str q11, [x0, #672] +ldr q11, [x17, #+576] +ldr q5, [x17, #+592] +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v3.4s, v28.4s +sqrdmulh v0.4S, 
v15.4S, v5.s[0] +str q29, [x0, #720] +ldr q29, [x0, #944] +mla v6.4S, v26.4S, v31.s[0] +add v3.4s, v3.4s, v28.4s +sqrdmulh v28.4S, v29.4S, v5.s[0] +str q3, [x0, #704] +ldr q3, [x17, #+608] +ldr q26, [x17, #+624] +mla v4.4S, v19.4S, v31.s[0] +sub v19.4s, v17.4s, v2.4s +sqrdmulh v13.4S, v22.4S, v26.s[0] +str q19, [x0, #752] +ldr q19, [x0, #1008] +mla v14.4S, v24.4S, v31.s[0] +add v17.4s, v17.4s, v2.4s +sqrdmulh v2.4S, v19.4S, v26.s[0] +str q17, [x0, #736] +ldr q17, [x0, #768] +ldr q24, [x0, #896] +mul v15.4S, v15.4S,v11.s[0] +sub v20.4s, v17.4s, v30.4s +ldr q7, [x0, #784] +mul v29.4S, v29.4S,v11.s[0] +add v17.4s, v17.4s, v30.4s +ldr q30, [x0, #912] +mla v15.4S, v0.4S, v31.s[0] +sub v0.4s, v7.4s, v6.4s +ldr q21, [x0, #832] +mla v29.4S, v28.4S, v31.s[0] +add v7.4s, v7.4s, v6.4s +ldr q6, [x0, #960] +mul v22.4S, v22.4S,v3.s[0] +sub v28.4s, v21.4s, v4.4s +ldr q25, [x0, #848] +mul v19.4S, v19.4S,v3.s[0] +add v21.4s, v21.4s, v4.4s +ldr q4, [x0, #976] +mla v22.4S, v13.4S, v31.s[0] +nop +mla v19.4S, v2.4S, v31.s[0] +sub v2.4s, v25.4s, v14.4s +sqrdmulh v13.4S, v7.4S, v10.s[1] +add v25.4s, v25.4s, v14.4s +mul v7.4S, v7.4S,v12.s[1] +nop +sqrdmulh v14.4S, v0.4S, v10.s[2] +sub v1.4s, v24.4s, v15.4s +mul v0.4S, v0.4S,v12.s[2] +add v24.4s, v24.4s, v15.4s +sqrdmulh v10.4S, v25.4S, v18.s[1] +sub v12.4s, v30.4s, v29.4s +mul v25.4S, v25.4S,v23.s[1] +add v30.4s, v30.4s, v29.4s +sqrdmulh v29.4S, v2.4S, v18.s[2] +sub v15.4s, v6.4s, v22.4s +mul v2.4S, v2.4S,v23.s[2] +add v6.4s, v6.4s, v22.4s +mla v7.4S, v13.4S, v31.s[0] +sub v13.4s, v4.4s, v19.4s +sqrdmulh v18.4S, v30.4S, v5.s[1] +add v4.4s, v4.4s, v19.4s +mla v0.4S, v14.4S, v31.s[0] +sqrdmulh v14.4S, v12.4S, v5.s[2] +sub v19.4s, v17.4s, v7.4s +mla v25.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v4.4S, v26.s[1] +add v17.4s, v17.4s, v7.4s +str q19, [x0, #784] +mla v2.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v13.4S, v26.s[2] +sub v19.4s, v20.4s, v0.4s +str q17, [x0, #768] +mul v30.4S, v30.4S,v11.s[1] +add v20.4s, v20.4s, v0.4s +mul v12.4S, 
v12.4S,v11.s[2] +str q19, [x0, #816] +mla v30.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v25.4s +mla v12.4S, v14.4S, v31.s[0] +str q20, [x0, #800] +mul v4.4S, v4.4S,v3.s[1] +str q18, [x0, #848] +mul v13.4S, v13.4S,v3.s[2] +add v21.4s, v21.4s, v25.4s +str q21, [x0, #832] +mla v4.4S, v10.4S, v31.s[0] +sub v10.4s, v28.4s, v2.4s +str q10, [x0, #880] +mla v13.4S, v29.4S, v31.s[0] +add v28.4s, v28.4s, v2.4s +str q28, [x0, #864] +sub v26.4s, v24.4s, v30.4s +str q26, [x0, #912] +add v24.4s, v24.4s, v30.4s +str q24, [x0, #896] +sub v24.4s, v1.4s, v12.4s +str q24, [x0, #944] +add v1.4s, v1.4s, v12.4s +str q1, [x0, #928] +sub v1.4s, v6.4s, v4.4s +str q1, [x0, #976] +add v6.4s, v6.4s, v4.4s +str q6, [x0, #960] +sub v6.4s, v15.4s, v13.4s +str q6, [x0, #1008] +add v15.4s, v15.4s, v13.4s +str q15, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1472 +// Instruction count: 1468 \ No newline at end of file diff --git a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_8_z4_7.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_8_z4_7.s new file mode 100644 index 0000000..5cdf1c8 --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_8_z4_7.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, 
and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: // Negated modulus vector: lane 0 holds -33556993, lanes 1-3 are zero; the routine below loads it into q31 and uses v31.s[0] as the multiplier of every mla reduction step +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: // Twiddle table read 16 bytes at a time through x17: four root words, then four companion words in the same lane order; the routine below feeds the roots to mul and the companions to sqrdmulh, then combines both with mla by v31.s[0]. NOTE(review): companions look like round(root * 2^31 / 33556993) and the zero words tagged 'Layer None' look like lane padding - confirm against the generator +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word
634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 
2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 
// Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global ntt_u32_incomplete_neon_asm_var_4_2_8_z4_7 +.global _ntt_u32_incomplete_neon_asm_var_4_2_8_z4_7 +ntt_u32_incomplete_neon_asm_var_4_2_8_z4_7: +_ntt_u32_incomplete_neon_asm_var_4_2_8_z4_7: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #928] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #992] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q18, [x0, #800] +sqrdmulh v17.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +ldr q16, [x0, #864] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v22.4S, v21.4S, v31.s[0] +mla 
v20.4S, v19.4S, v31.s[0] +mla v18.4S, v17.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +ldr q3, [x0, #544] +sqrdmulh v17.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q19, [x0, #608] +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q2, [x0, #672] +ldr q1, [x0, #416] +sqrdmulh v0.4S, v2.4S, v29.s[0] +sub v15.4s, v1.4s, v22.4s +mul v2.4S, v2.4S,v30.s[0] +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #736] +ldr q14, [x0, #480] +sqrdmulh v13.4S, v22.4S, v29.s[0] +sub v12.4s, v14.4s, v20.4s +mul v22.4S, v22.4S,v30.s[0] +add v14.4s, v14.4s, v20.4s +ldr q20, [x0, #288] +mla v3.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v18.4s +mla v19.4S, v21.4S, v31.s[0] +mla v2.4S, v0.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +mla v22.4S, v13.4S, v31.s[0] +ldr q13, [x0, #352] +sqrdmulh v18.4S, v1.4S, v29.s[1] +sub v0.4s, v13.4s, v16.4s +mul v1.4S, v1.4S,v30.s[1] +sqrdmulh v21.4S, v14.4S, v29.s[1] +add v13.4s, v13.4s, v16.4s +mul v14.4S, v14.4S,v30.s[1] +ldr q16, [x0, #32] +sqrdmulh v11.4S, v20.4S, v29.s[1] +sub v10.4s, v16.4s, v3.4s +mul v20.4S, v20.4S,v30.s[1] +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #96] +sqrdmulh v9.4S, v13.4S, v29.s[1] +sub v8.4s, v3.4s, v19.4s +mul v13.4S, v13.4S,v30.s[1] +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #160] +mla v1.4S, v18.4S, v31.s[0] +sub v18.4s, v19.4s, v2.4s +mla v14.4S, v21.4S, v31.s[0] +mla v20.4S, v11.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +mla v13.4S, v9.4S, v31.s[0] +ldr q9, [x0, #224] +sqrdmulh v2.4S, v15.4S, v29.s[2] +sub v11.4s, v9.4s, v22.4s +mul v15.4S, v15.4S,v30.s[2] +sqrdmulh v21.4S, v12.4S, v29.s[2] +add v9.4s, v9.4s, v22.4s +mul v12.4S, v12.4S,v30.s[2] +sqrdmulh v22.4S, v17.4S, v29.s[2] +sub v7.4s, v19.4s, v1.4s +mul v17.4S, v17.4S,v30.s[2] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[2] +sub v6.4s, v9.4s, v14.4s +mul v0.4S, v0.4S,v30.s[2] +add v9.4s, v9.4s, v14.4s +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v16.4s, v20.4s +mla v12.4S, v21.4S, v31.s[0] +mla v17.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s 
+mla v0.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v7.4S, v27.s[1] +sub v20.4s, v3.4s, v13.4s +mul v7.4S, v7.4S,v28.s[1] +sqrdmulh v22.4S, v6.4S, v27.s[1] +add v3.4s, v3.4s, v13.4s +mul v6.4S, v6.4S,v28.s[1] +sqrdmulh v13.4S, v19.4S, v27.s[0] +sub v21.4s, v18.4s, v15.4s +mul v19.4S, v19.4S,v28.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v9.4S, v27.s[0] +sub v14.4s, v11.4s, v12.4s +mul v9.4S, v9.4S,v28.s[0] +add v11.4s, v11.4s, v12.4s +mla v7.4S, v1.4S, v31.s[0] +sub v1.4s, v10.4s, v17.4s +mla v6.4S, v22.4S, v31.s[0] +mla v19.4S, v13.4S, v31.s[0] +add v10.4s, v10.4s, v17.4s +mla v9.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v18.4S, v27.s[2] +sub v17.4s, v8.4s, v0.4s +mul v18.4S, v18.4S,v28.s[2] +sqrdmulh v13.4S, v11.4S, v27.s[2] +add v8.4s, v8.4s, v0.4s +mul v11.4S, v11.4S,v28.s[2] +sqrdmulh v0.4S, v21.4S, v27.s[3] +sub v22.4s, v2.4s, v7.4s +mul v21.4S, v21.4S,v28.s[3] +add v2.4s, v2.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[3] +sub v12.4s, v20.4s, v6.4s +mul v14.4S, v14.4S,v28.s[3] +add v20.4s, v20.4s, v6.4s +mla v18.4S, v15.4S, v31.s[0] +sub v15.4s, v16.4s, v19.4s +mla v11.4S, v13.4S, v31.s[0] +mla v21.4S, v0.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +mla v14.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v20.4S, v25.s[2] +sub v19.4s, v3.4s, v9.4s +mul v20.4S, v20.4S,v26.s[2] +sqrdmulh v0.4S, v12.4S, v25.s[3] +add v3.4s, v3.4s, v9.4s +mul v12.4S, v12.4S,v26.s[3] +sqrdmulh v9.4S, v19.4S, v25.s[1] +sub v13.4s, v10.4s, v18.4s +mul v19.4S, v19.4S,v26.s[1] +add v10.4s, v10.4s, v18.4s +sqrdmulh v18.4S, v3.4S, v25.s[0] +sub v6.4s, v8.4s, v11.4s +mul v3.4S, v3.4S,v26.s[0] +add v8.4s, v8.4s, v11.4s +mla v20.4S, v7.4S, v31.s[0] +sub v7.4s, v1.4s, v21.4s +mla v12.4S, v0.4S, v31.s[0] +mla v19.4S, v9.4S, v31.s[0] +add v1.4s, v1.4s, v21.4s +mla v3.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v8.4S, v23.s[0] +sub v21.4s, v17.4s, v14.4s +mul v8.4S, v8.4S,v24.s[0] +sqrdmulh v9.4S, v6.4S, v23.s[1] +add v17.4s, v17.4s, v14.4s +mul v6.4S, v6.4S,v24.s[1] +sqrdmulh v14.4S, v17.4S, v23.s[2] +sub v0.4s, 
v2.4s, v20.4s +mul v17.4S, v17.4S,v24.s[2] +add v2.4s, v2.4s, v20.4s +sqrdmulh v20.4S, v21.4S, v23.s[3] +sub v11.4s, v22.4s, v12.4s +mul v21.4S, v21.4S,v24.s[3] +add v22.4s, v22.4s, v12.4s +mla v8.4S, v18.4S, v31.s[0] +sub v18.4s, v15.4s, v19.4s +mla v6.4S, v9.4S, v31.s[0] +str q2, [x0, #288] +mla v17.4S, v14.4S, v31.s[0] +add v15.4s, v15.4s, v19.4s +mla v21.4S, v20.4S, v31.s[0] +str q0, [x0, #352] +ldr q0, [x0, #944] +sqrdmulh v20.4S, v0.4S, v29.s[0] +sub v19.4s, v16.4s, v3.4s +mul v0.4S, v0.4S,v30.s[0] +str q22, [x0, #416] +ldr q22, [x0, #1008] +sqrdmulh v14.4S, v22.4S, v29.s[0] +add v16.4s, v16.4s, v3.4s +mul v22.4S, v22.4S,v30.s[0] +str q11, [x0, #480] +ldr q11, [x0, #816] +sqrdmulh v3.4S, v11.4S, v29.s[0] +sub v2.4s, v10.4s, v8.4s +mul v11.4S, v11.4S,v30.s[0] +add v10.4s, v10.4s, v8.4s +ldr q8, [x0, #880] +sqrdmulh v9.4S, v8.4S, v29.s[0] +sub v12.4s, v13.4s, v6.4s +mul v8.4S, v8.4S,v30.s[0] +add v13.4s, v13.4s, v6.4s +mla v0.4S, v20.4S, v31.s[0] +sub v20.4s, v1.4s, v17.4s +mla v22.4S, v14.4S, v31.s[0] +str q15, [x0, #160] +mla v11.4S, v3.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +mla v8.4S, v9.4S, v31.s[0] +str q18, [x0, #224] +ldr q18, [x0, #560] +sqrdmulh v9.4S, v18.4S, v29.s[0] +sub v17.4s, v7.4s, v21.4s +mul v18.4S, v18.4S,v30.s[0] +str q16, [x0, #32] +ldr q16, [x0, #624] +sqrdmulh v3.4S, v16.4S, v29.s[0] +add v7.4s, v7.4s, v21.4s +mul v16.4S, v16.4S,v30.s[0] +str q19, [x0, #96] +ldr q19, [x0, #688] +ldr q21, [x0, #432] +sqrdmulh v15.4S, v19.4S, v29.s[0] +sub v14.4s, v21.4s, v0.4s +mul v19.4S, v19.4S,v30.s[0] +add v21.4s, v21.4s, v0.4s +ldr q0, [x0, #752] +ldr q6, [x0, #496] +sqrdmulh v5.4S, v0.4S, v29.s[0] +sub v4.4s, v6.4s, v22.4s +mul v0.4S, v0.4S,v30.s[0] +add v6.4s, v6.4s, v22.4s +ldr q22, [x0, #304] +mla v18.4S, v9.4S, v31.s[0] +sub v9.4s, v22.4s, v11.4s +mla v16.4S, v3.4S, v31.s[0] +str q10, [x0, #544] +mla v19.4S, v15.4S, v31.s[0] +add v22.4s, v22.4s, v11.4s +mla v0.4S, v5.4S, v31.s[0] +str q2, [x0, #608] +ldr q2, [x0, #368] +sqrdmulh v5.4S, v21.4S, 
v29.s[1] +sub v11.4s, v2.4s, v8.4s +mul v21.4S, v21.4S,v30.s[1] +str q13, [x0, #672] +sqrdmulh v13.4S, v6.4S, v29.s[1] +add v2.4s, v2.4s, v8.4s +mul v6.4S, v6.4S,v30.s[1] +str q12, [x0, #736] +ldr q12, [x0, #48] +sqrdmulh v8.4S, v22.4S, v29.s[1] +sub v15.4s, v12.4s, v18.4s +mul v22.4S, v22.4S,v30.s[1] +add v12.4s, v12.4s, v18.4s +ldr q18, [x0, #112] +sqrdmulh v10.4S, v2.4S, v29.s[1] +sub v3.4s, v18.4s, v16.4s +mul v2.4S, v2.4S,v30.s[1] +add v18.4s, v18.4s, v16.4s +ldr q16, [x0, #176] +mla v21.4S, v5.4S, v31.s[0] +sub v5.4s, v16.4s, v19.4s +mla v6.4S, v13.4S, v31.s[0] +str q1, [x0, #800] +mla v22.4S, v8.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +mla v2.4S, v10.4S, v31.s[0] +str q20, [x0, #864] +ldr q20, [x0, #240] +sqrdmulh v10.4S, v14.4S, v29.s[2] +sub v19.4s, v20.4s, v0.4s +mul v14.4S, v14.4S,v30.s[2] +str q7, [x0, #928] +sqrdmulh v7.4S, v4.4S, v29.s[2] +add v20.4s, v20.4s, v0.4s +mul v4.4S, v4.4S,v30.s[2] +str q17, [x0, #992] +sqrdmulh v17.4S, v9.4S, v29.s[2] +sub v0.4s, v16.4s, v21.4s +mul v9.4S, v9.4S,v30.s[2] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v11.4S, v29.s[2] +sub v8.4s, v20.4s, v6.4s +mul v11.4S, v11.4S,v30.s[2] +add v20.4s, v20.4s, v6.4s +mla v14.4S, v10.4S, v31.s[0] +sub v10.4s, v12.4s, v22.4s +mla v4.4S, v7.4S, v31.s[0] +mla v9.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s +mla v11.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v0.4S, v27.s[1] +sub v22.4s, v18.4s, v2.4s +mul v0.4S, v0.4S,v28.s[1] +sqrdmulh v17.4S, v8.4S, v27.s[1] +add v18.4s, v18.4s, v2.4s +mul v8.4S, v8.4S,v28.s[1] +sqrdmulh v2.4S, v16.4S, v27.s[0] +sub v7.4s, v5.4s, v14.4s +mul v16.4S, v16.4S,v28.s[0] +add v5.4s, v5.4s, v14.4s +sqrdmulh v14.4S, v20.4S, v27.s[0] +sub v6.4s, v19.4s, v4.4s +mul v20.4S, v20.4S,v28.s[0] +add v19.4s, v19.4s, v4.4s +mla v0.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v9.4s +mla v8.4S, v17.4S, v31.s[0] +mla v16.4S, v2.4S, v31.s[0] +add v15.4s, v15.4s, v9.4s +mla v20.4S, v14.4S, v31.s[0] +sqrdmulh v14.4S, v5.4S, v27.s[2] +sub v9.4s, v3.4s, v11.4s +mul v5.4S, 
v5.4S,v28.s[2] +sqrdmulh v2.4S, v19.4S, v27.s[2] +add v3.4s, v3.4s, v11.4s +mul v19.4S, v19.4S,v28.s[2] +sqrdmulh v11.4S, v7.4S, v27.s[3] +sub v17.4s, v10.4s, v0.4s +mul v7.4S, v7.4S,v28.s[3] +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v6.4S, v27.s[3] +sub v4.4s, v22.4s, v8.4s +mul v6.4S, v6.4S,v28.s[3] +add v22.4s, v22.4s, v8.4s +mla v5.4S, v14.4S, v31.s[0] +sub v14.4s, v12.4s, v16.4s +mla v19.4S, v2.4S, v31.s[0] +mla v7.4S, v11.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +mla v6.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v22.4S, v25.s[2] +sub v16.4s, v18.4s, v20.4s +mul v22.4S, v22.4S,v26.s[2] +sqrdmulh v11.4S, v4.4S, v25.s[3] +add v18.4s, v18.4s, v20.4s +mul v4.4S, v4.4S,v26.s[3] +sqrdmulh v20.4S, v16.4S, v25.s[1] +sub v2.4s, v15.4s, v5.4s +mul v16.4S, v16.4S,v26.s[1] +add v15.4s, v15.4s, v5.4s +sqrdmulh v5.4S, v18.4S, v25.s[0] +sub v8.4s, v3.4s, v19.4s +mul v18.4S, v18.4S,v26.s[0] +add v3.4s, v3.4s, v19.4s +mla v22.4S, v0.4S, v31.s[0] +sub v0.4s, v21.4s, v7.4s +mla v4.4S, v11.4S, v31.s[0] +mla v16.4S, v20.4S, v31.s[0] +add v21.4s, v21.4s, v7.4s +mla v18.4S, v5.4S, v31.s[0] +sqrdmulh v5.4S, v3.4S, v23.s[0] +sub v7.4s, v9.4s, v6.4s +mul v3.4S, v3.4S,v24.s[0] +sqrdmulh v20.4S, v8.4S, v23.s[1] +add v9.4s, v9.4s, v6.4s +mul v8.4S, v8.4S,v24.s[1] +sqrdmulh v6.4S, v9.4S, v23.s[2] +sub v11.4s, v10.4s, v22.4s +mul v9.4S, v9.4S,v24.s[2] +add v10.4s, v10.4s, v22.4s +sqrdmulh v22.4S, v7.4S, v23.s[3] +sub v19.4s, v17.4s, v4.4s +mul v7.4S, v7.4S,v24.s[3] +add v17.4s, v17.4s, v4.4s +mla v3.4S, v5.4S, v31.s[0] +sub v5.4s, v14.4s, v16.4s +mla v8.4S, v20.4S, v31.s[0] +str q10, [x0, #304] +mla v9.4S, v6.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +mla v7.4S, v22.4S, v31.s[0] +str q11, [x0, #368] +ldr q11, [x0, #896] +sqrdmulh v22.4S, v11.4S, v29.s[0] +sub v16.4s, v12.4s, v18.4s +mul v11.4S, v11.4S,v30.s[0] +str q17, [x0, #432] +ldr q17, [x0, #960] +sqrdmulh v6.4S, v17.4S, v29.s[0] +add v12.4s, v12.4s, v18.4s +mul v17.4S, v17.4S,v30.s[0] +str q19, [x0, #496] +ldr q19, [x0, #768] +sqrdmulh 
v18.4S, v19.4S, v29.s[0] +sub v10.4s, v15.4s, v3.4s +mul v19.4S, v19.4S,v30.s[0] +add v15.4s, v15.4s, v3.4s +ldr q3, [x0, #832] +sqrdmulh v20.4S, v3.4S, v29.s[0] +sub v4.4s, v2.4s, v8.4s +mul v3.4S, v3.4S,v30.s[0] +add v2.4s, v2.4s, v8.4s +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v9.4s +mla v17.4S, v6.4S, v31.s[0] +str q14, [x0, #176] +mla v19.4S, v18.4S, v31.s[0] +add v21.4s, v21.4s, v9.4s +mla v3.4S, v20.4S, v31.s[0] +str q5, [x0, #240] +ldr q5, [x0, #512] +sqrdmulh v20.4S, v5.4S, v29.s[0] +sub v9.4s, v0.4s, v7.4s +mul v5.4S, v5.4S,v30.s[0] +str q12, [x0, #48] +ldr q12, [x0, #576] +sqrdmulh v18.4S, v12.4S, v29.s[0] +add v0.4s, v0.4s, v7.4s +mul v12.4S, v12.4S,v30.s[0] +str q16, [x0, #112] +ldr q16, [x0, #640] +ldr q7, [x0, #384] +sqrdmulh v14.4S, v16.4S, v29.s[0] +sub v6.4s, v7.4s, v11.4s +mul v16.4S, v16.4S,v30.s[0] +add v7.4s, v7.4s, v11.4s +ldr q11, [x0, #704] +ldr q8, [x0, #448] +sqrdmulh v1.4S, v11.4S, v29.s[0] +sub v13.4s, v8.4s, v17.4s +mul v11.4S, v11.4S,v30.s[0] +add v8.4s, v8.4s, v17.4s +ldr q17, [x0, #256] +mla v5.4S, v20.4S, v31.s[0] +sub v20.4s, v17.4s, v19.4s +mla v12.4S, v18.4S, v31.s[0] +str q15, [x0, #560] +mla v16.4S, v14.4S, v31.s[0] +add v17.4s, v17.4s, v19.4s +mla v11.4S, v1.4S, v31.s[0] +str q10, [x0, #624] +ldr q10, [x0, #320] +sqrdmulh v1.4S, v7.4S, v29.s[1] +sub v19.4s, v10.4s, v3.4s +mul v7.4S, v7.4S,v30.s[1] +str q2, [x0, #688] +sqrdmulh v2.4S, v8.4S, v29.s[1] +add v10.4s, v10.4s, v3.4s +mul v8.4S, v8.4S,v30.s[1] +str q4, [x0, #752] +ldr q4, [x0, #0] +sqrdmulh v3.4S, v17.4S, v29.s[1] +sub v14.4s, v4.4s, v5.4s +mul v17.4S, v17.4S,v30.s[1] +add v4.4s, v4.4s, v5.4s +ldr q5, [x0, #64] +sqrdmulh v15.4S, v10.4S, v29.s[1] +sub v18.4s, v5.4s, v12.4s +mul v10.4S, v10.4S,v30.s[1] +add v5.4s, v5.4s, v12.4s +ldr q12, [x0, #128] +mla v7.4S, v1.4S, v31.s[0] +sub v1.4s, v12.4s, v16.4s +mla v8.4S, v2.4S, v31.s[0] +str q21, [x0, #816] +mla v17.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +mla v10.4S, v15.4S, v31.s[0] +str q22, [x0, #880] 
+ldr q22, [x0, #192] +sqrdmulh v15.4S, v6.4S, v29.s[2] +sub v16.4s, v22.4s, v11.4s +mul v6.4S, v6.4S,v30.s[2] +str q0, [x0, #944] +sqrdmulh v0.4S, v13.4S, v29.s[2] +add v22.4s, v22.4s, v11.4s +mul v13.4S, v13.4S,v30.s[2] +str q9, [x0, #1008] +sqrdmulh v9.4S, v20.4S, v29.s[2] +sub v11.4s, v12.4s, v7.4s +mul v20.4S, v20.4S,v30.s[2] +add v12.4s, v12.4s, v7.4s +sqrdmulh v7.4S, v19.4S, v29.s[2] +sub v3.4s, v22.4s, v8.4s +mul v19.4S, v19.4S,v30.s[2] +add v22.4s, v22.4s, v8.4s +mla v6.4S, v15.4S, v31.s[0] +sub v15.4s, v4.4s, v17.4s +mla v13.4S, v0.4S, v31.s[0] +mla v20.4S, v9.4S, v31.s[0] +add v4.4s, v4.4s, v17.4s +mla v19.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v11.4S, v27.s[1] +sub v17.4s, v5.4s, v10.4s +mul v11.4S, v11.4S,v28.s[1] +sqrdmulh v9.4S, v3.4S, v27.s[1] +add v5.4s, v5.4s, v10.4s +mul v3.4S, v3.4S,v28.s[1] +sqrdmulh v10.4S, v12.4S, v27.s[0] +sub v0.4s, v1.4s, v6.4s +mul v12.4S, v12.4S,v28.s[0] +add v1.4s, v1.4s, v6.4s +sqrdmulh v6.4S, v22.4S, v27.s[0] +sub v8.4s, v16.4s, v13.4s +mul v22.4S, v22.4S,v28.s[0] +add v16.4s, v16.4s, v13.4s +mla v11.4S, v7.4S, v31.s[0] +sub v7.4s, v14.4s, v20.4s +mla v3.4S, v9.4S, v31.s[0] +mla v12.4S, v10.4S, v31.s[0] +add v14.4s, v14.4s, v20.4s +mla v22.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v1.4S, v27.s[2] +sub v20.4s, v18.4s, v19.4s +mul v1.4S, v1.4S,v28.s[2] +sqrdmulh v10.4S, v16.4S, v27.s[2] +add v18.4s, v18.4s, v19.4s +mul v16.4S, v16.4S,v28.s[2] +sqrdmulh v19.4S, v0.4S, v27.s[3] +sub v9.4s, v15.4s, v11.4s +mul v0.4S, v0.4S,v28.s[3] +add v15.4s, v15.4s, v11.4s +sqrdmulh v11.4S, v8.4S, v27.s[3] +sub v13.4s, v17.4s, v3.4s +mul v8.4S, v8.4S,v28.s[3] +add v17.4s, v17.4s, v3.4s +mla v1.4S, v6.4S, v31.s[0] +sub v6.4s, v4.4s, v12.4s +mla v16.4S, v10.4S, v31.s[0] +mla v0.4S, v19.4S, v31.s[0] +add v4.4s, v4.4s, v12.4s +mla v8.4S, v11.4S, v31.s[0] +sqrdmulh v11.4S, v17.4S, v25.s[2] +sub v12.4s, v5.4s, v22.4s +mul v17.4S, v17.4S,v26.s[2] +sqrdmulh v19.4S, v13.4S, v25.s[3] +add v5.4s, v5.4s, v22.4s +mul v13.4S, v13.4S,v26.s[3] +sqrdmulh 
v22.4S, v12.4S, v25.s[1] +sub v10.4s, v14.4s, v1.4s +mul v12.4S, v12.4S,v26.s[1] +add v14.4s, v14.4s, v1.4s +sqrdmulh v1.4S, v5.4S, v25.s[0] +sub v3.4s, v18.4s, v16.4s +mul v5.4S, v5.4S,v26.s[0] +add v18.4s, v18.4s, v16.4s +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v7.4s, v0.4s +mla v13.4S, v19.4S, v31.s[0] +mla v12.4S, v22.4S, v31.s[0] +add v7.4s, v7.4s, v0.4s +mla v5.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v18.4S, v23.s[0] +sub v0.4s, v20.4s, v8.4s +mul v18.4S, v18.4S,v24.s[0] +sqrdmulh v22.4S, v3.4S, v23.s[1] +add v20.4s, v20.4s, v8.4s +mul v3.4S, v3.4S,v24.s[1] +sqrdmulh v8.4S, v20.4S, v23.s[2] +sub v19.4s, v15.4s, v17.4s +mul v20.4S, v20.4S,v24.s[2] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v23.s[3] +sub v16.4s, v9.4s, v13.4s +mul v0.4S, v0.4S,v24.s[3] +add v9.4s, v9.4s, v13.4s +mla v18.4S, v1.4S, v31.s[0] +sub v1.4s, v6.4s, v12.4s +mla v3.4S, v22.4S, v31.s[0] +str q15, [x0, #256] +mla v20.4S, v8.4S, v31.s[0] +add v6.4s, v6.4s, v12.4s +mla v0.4S, v17.4S, v31.s[0] +str q19, [x0, #320] +ldr q19, [x0, #912] +sqrdmulh v17.4S, v19.4S, v29.s[0] +sub v12.4s, v4.4s, v5.4s +mul v19.4S, v19.4S,v30.s[0] +str q9, [x0, #384] +ldr q9, [x0, #976] +sqrdmulh v8.4S, v9.4S, v29.s[0] +add v4.4s, v4.4s, v5.4s +mul v9.4S, v9.4S,v30.s[0] +str q16, [x0, #448] +ldr q16, [x0, #784] +sqrdmulh v5.4S, v16.4S, v29.s[0] +sub v15.4s, v14.4s, v18.4s +mul v16.4S, v16.4S,v30.s[0] +add v14.4s, v14.4s, v18.4s +ldr q18, [x0, #848] +sqrdmulh v22.4S, v18.4S, v29.s[0] +sub v13.4s, v10.4s, v3.4s +mul v18.4S, v18.4S,v30.s[0] +add v10.4s, v10.4s, v3.4s +mla v19.4S, v17.4S, v31.s[0] +sub v17.4s, v7.4s, v20.4s +mla v9.4S, v8.4S, v31.s[0] +str q6, [x0, #128] +mla v16.4S, v5.4S, v31.s[0] +add v7.4s, v7.4s, v20.4s +mla v18.4S, v22.4S, v31.s[0] +str q1, [x0, #192] +ldr q1, [x0, #528] +sqrdmulh v22.4S, v1.4S, v29.s[0] +sub v20.4s, v11.4s, v0.4s +mul v1.4S, v1.4S,v30.s[0] +str q4, [x0, #0] +ldr q4, [x0, #592] +sqrdmulh v5.4S, v4.4S, v29.s[0] +add v11.4s, v11.4s, v0.4s +mul v4.4S, v4.4S,v30.s[0] +str 
q12, [x0, #64] +ldr q12, [x0, #656] +ldr q0, [x0, #400] +sqrdmulh v6.4S, v12.4S, v29.s[0] +sub v8.4s, v0.4s, v19.4s +mul v12.4S, v12.4S,v30.s[0] +add v0.4s, v0.4s, v19.4s +ldr q19, [x0, #720] +ldr q3, [x0, #464] +sqrdmulh v21.4S, v19.4S, v29.s[0] +sub v2.4s, v3.4s, v9.4s +mul v19.4S, v19.4S,v30.s[0] +add v3.4s, v3.4s, v9.4s +ldr q9, [x0, #272] +mla v1.4S, v22.4S, v31.s[0] +sub v22.4s, v9.4s, v16.4s +mla v4.4S, v5.4S, v31.s[0] +str q14, [x0, #512] +mla v12.4S, v6.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +mla v19.4S, v21.4S, v31.s[0] +str q15, [x0, #576] +ldr q15, [x0, #336] +sqrdmulh v21.4S, v0.4S, v29.s[1] +sub v16.4s, v15.4s, v18.4s +mul v0.4S, v0.4S,v30.s[1] +str q10, [x0, #640] +sqrdmulh v10.4S, v3.4S, v29.s[1] +add v15.4s, v15.4s, v18.4s +mul v3.4S, v3.4S,v30.s[1] +str q13, [x0, #704] +ldr q13, [x0, #16] +sqrdmulh v18.4S, v9.4S, v29.s[1] +sub v6.4s, v13.4s, v1.4s +mul v9.4S, v9.4S,v30.s[1] +add v13.4s, v13.4s, v1.4s +ldr q1, [x0, #80] +sqrdmulh v14.4S, v15.4S, v29.s[1] +sub v5.4s, v1.4s, v4.4s +mul v15.4S, v15.4S,v30.s[1] +add v1.4s, v1.4s, v4.4s +ldr q4, [x0, #144] +mla v0.4S, v21.4S, v31.s[0] +sub v21.4s, v4.4s, v12.4s +mla v3.4S, v10.4S, v31.s[0] +str q7, [x0, #768] +mla v9.4S, v18.4S, v31.s[0] +add v4.4s, v4.4s, v12.4s +mla v15.4S, v14.4S, v31.s[0] +str q17, [x0, #832] +ldr q17, [x0, #208] +sqrdmulh v14.4S, v8.4S, v29.s[2] +sub v12.4s, v17.4s, v19.4s +mul v8.4S, v8.4S,v30.s[2] +str q11, [x0, #896] +sqrdmulh v11.4S, v2.4S, v29.s[2] +add v17.4s, v17.4s, v19.4s +mul v2.4S, v2.4S,v30.s[2] +str q20, [x0, #960] +sqrdmulh v20.4S, v22.4S, v29.s[2] +sub v19.4s, v4.4s, v0.4s +mul v22.4S, v22.4S,v30.s[2] +add v4.4s, v4.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[2] +sub v18.4s, v17.4s, v3.4s +mul v16.4S, v16.4S,v30.s[2] +add v17.4s, v17.4s, v3.4s +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v9.4s +mla v2.4S, v11.4S, v31.s[0] +mla v22.4S, v20.4S, v31.s[0] +add v13.4s, v13.4s, v9.4s +mla v16.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v19.4S, v27.s[1] +sub v9.4s, v1.4s, v15.4s 
+mul v19.4S, v19.4S,v28.s[1] +sqrdmulh v20.4S, v18.4S, v27.s[1] +add v1.4s, v1.4s, v15.4s +mul v18.4S, v18.4S,v28.s[1] +sqrdmulh v15.4S, v4.4S, v27.s[0] +sub v11.4s, v21.4s, v8.4s +mul v4.4S, v4.4S,v28.s[0] +add v21.4s, v21.4s, v8.4s +sqrdmulh v8.4S, v17.4S, v27.s[0] +sub v3.4s, v12.4s, v2.4s +mul v17.4S, v17.4S,v28.s[0] +add v12.4s, v12.4s, v2.4s +mla v19.4S, v0.4S, v31.s[0] +sub v0.4s, v6.4s, v22.4s +mla v18.4S, v20.4S, v31.s[0] +mla v4.4S, v15.4S, v31.s[0] +add v6.4s, v6.4s, v22.4s +mla v17.4S, v8.4S, v31.s[0] +sqrdmulh v8.4S, v21.4S, v27.s[2] +sub v22.4s, v5.4s, v16.4s +mul v21.4S, v21.4S,v28.s[2] +sqrdmulh v15.4S, v12.4S, v27.s[2] +add v5.4s, v5.4s, v16.4s +mul v12.4S, v12.4S,v28.s[2] +sqrdmulh v16.4S, v11.4S, v27.s[3] +sub v20.4s, v14.4s, v19.4s +mul v11.4S, v11.4S,v28.s[3] +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v3.4S, v27.s[3] +sub v2.4s, v9.4s, v18.4s +mul v3.4S, v3.4S,v28.s[3] +add v9.4s, v9.4s, v18.4s +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v13.4s, v4.4s +mla v12.4S, v15.4S, v31.s[0] +mla v11.4S, v16.4S, v31.s[0] +add v13.4s, v13.4s, v4.4s +mla v3.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v9.4S, v25.s[2] +sub v4.4s, v1.4s, v17.4s +mul v9.4S, v9.4S,v26.s[2] +sqrdmulh v16.4S, v2.4S, v25.s[3] +add v1.4s, v1.4s, v17.4s +mul v2.4S, v2.4S,v26.s[3] +sqrdmulh v17.4S, v4.4S, v25.s[1] +sub v15.4s, v6.4s, v21.4s +mul v4.4S, v4.4S,v26.s[1] +add v6.4s, v6.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v25.s[0] +sub v18.4s, v5.4s, v12.4s +mul v1.4S, v1.4S,v26.s[0] +add v5.4s, v5.4s, v12.4s +mla v9.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v11.4s +mla v2.4S, v16.4S, v31.s[0] +mla v4.4S, v17.4S, v31.s[0] +add v0.4s, v0.4s, v11.4s +mla v1.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v5.4S, v23.s[0] +sub v11.4s, v22.4s, v3.4s +mul v5.4S, v5.4S,v24.s[0] +sqrdmulh v17.4S, v18.4S, v23.s[1] +add v22.4s, v22.4s, v3.4s +mul v18.4S, v18.4S,v24.s[1] +sqrdmulh v3.4S, v22.4S, v23.s[2] +sub v16.4s, v14.4s, v9.4s +mul v22.4S, v22.4S,v24.s[2] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v11.4S, 
v23.s[3] +sub v12.4s, v20.4s, v2.4s +mul v11.4S, v11.4S,v24.s[3] +add v20.4s, v20.4s, v2.4s +mla v5.4S, v21.4S, v31.s[0] +sub v21.4s, v8.4s, v4.4s +mla v18.4S, v17.4S, v31.s[0] +str q14, [x0, #272] +mla v22.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v4.4s +mla v11.4S, v9.4S, v31.s[0] +str q16, [x0, #336] +sub v23.4s, v13.4s, v1.4s +str q20, [x0, #400] +add v13.4s, v13.4s, v1.4s +str q12, [x0, #464] +sub v12.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sub v5.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sub v18.4s, v0.4s, v22.4s +str q8, [x0, #144] +add v0.4s, v0.4s, v22.4s +str q21, [x0, #208] +sub v21.4s, v19.4s, v11.4s +str q13, [x0, #16] +add v19.4s, v19.4s, v11.4s +str q23, [x0, #80] +str q6, [x0, #528] +str q12, [x0, #592] +str q15, [x0, #656] +str q5, [x0, #720] +str q0, [x0, #784] +str q18, [x0, #848] +str q19, [x0, #912] +str q21, [x0, #976] +ldr q10, [x0, #224] +ldr q7, [x0, #160] +ldr q2, [x0, #32] +ldr q17, [x17, #+128] +ldr q14, [x17, #+144] +sqrdmulh v3.4S, v2.4S, v14.s[0] +mul v2.4S, v2.4S,v17.s[0] +ldr q4, [x0, #48] +sqrdmulh v9.4S, v4.4S, v14.s[0] +mul v4.4S, v4.4S,v17.s[0] +ldr q16, [x17, #+160] +ldr q30, [x17, #+176] +ldr q29, [x0, #96] +sqrdmulh v28.4S, v29.4S, v30.s[0] +mul v29.4S, v29.4S,v16.s[0] +ldr q27, [x0, #112] +sqrdmulh v26.4S, v27.4S, v30.s[0] +mul v27.4S, v27.4S,v16.s[0] +ldr q25, [x17, #+192] +ldr q24, [x17, #+208] +mla v2.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v7.4S, v24.s[0] +ldr q20, [x0, #176] +mla v4.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v20.4S, v24.s[0] +ldr q1, [x17, #+224] +ldr q8, [x17, #+240] +mla v29.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v10.4S, v8.s[0] +ldr q22, [x0, #240] +mla v27.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v22.4S, v8.s[0] +ldr q13, [x0, #0] +ldr q11, [x0, #128] +mul v7.4S, v7.4S,v25.s[0] +sub v23.4s, v13.4s, v2.4s +ldr q6, [x0, #16] +mul v20.4S, v20.4S,v25.s[0] +add v13.4s, v13.4s, v2.4s +ldr q2, [x0, #144] +mla v7.4S, v3.4S, v31.s[0] +sub v3.4s, v6.4s, v4.4s +ldr q12, [x0, #64] +mla v20.4S, v9.4S, v31.s[0] +add 
v6.4s, v6.4s, v4.4s +ldr q4, [x0, #192] +mul v10.4S, v10.4S,v1.s[0] +sub v9.4s, v12.4s, v29.4s +ldr q15, [x0, #80] +mul v22.4S, v22.4S,v1.s[0] +add v12.4s, v12.4s, v29.4s +ldr q29, [x0, #208] +mla v10.4S, v28.4S, v31.s[0] +mla v22.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v27.4s +sqrdmulh v28.4S, v6.4S, v14.s[1] +add v15.4s, v15.4s, v27.4s +mul v6.4S, v6.4S,v17.s[1] +sqrdmulh v27.4S, v3.4S, v14.s[2] +sub v5.4s, v11.4s, v7.4s +mul v3.4S, v3.4S,v17.s[2] +add v11.4s, v11.4s, v7.4s +sqrdmulh v14.4S, v15.4S, v30.s[1] +sub v17.4s, v2.4s, v20.4s +mul v15.4S, v15.4S,v16.s[1] +add v2.4s, v2.4s, v20.4s +sqrdmulh v20.4S, v26.4S, v30.s[2] +sub v7.4s, v4.4s, v10.4s +mul v26.4S, v26.4S,v16.s[2] +add v4.4s, v4.4s, v10.4s +mla v6.4S, v28.4S, v31.s[0] +sub v28.4s, v29.4s, v22.4s +ldr q30, [x0, #480] +sqrdmulh v16.4S, v2.4S, v24.s[1] +add v29.4s, v29.4s, v22.4s +mla v3.4S, v27.4S, v31.s[0] +ldr q27, [x0, #416] +sqrdmulh v22.4S, v17.4S, v24.s[2] +sub v10.4s, v13.4s, v6.4s +mla v15.4S, v14.4S, v31.s[0] +ldr q14, [x0, #288] +sqrdmulh v0.4S, v29.4S, v8.s[1] +add v13.4s, v13.4s, v6.4s +str q10, [x0, #16] +mla v26.4S, v20.4S, v31.s[0] +ldr q20, [x17, #+256] +ldr q10, [x17, #+272] +sqrdmulh v6.4S, v28.4S, v8.s[2] +sub v18.4s, v23.4s, v3.4s +str q13, [x0, #0] +mul v2.4S, v2.4S,v25.s[1] +add v23.4s, v23.4s, v3.4s +mul v17.4S, v17.4S,v25.s[2] +str q18, [x0, #48] +mla v2.4S, v16.4S, v31.s[0] +sub v16.4s, v12.4s, v15.4s +mla v17.4S, v22.4S, v31.s[0] +str q23, [x0, #32] +mul v29.4S, v29.4S,v1.s[1] +str q16, [x0, #80] +mul v28.4S, v28.4S,v1.s[2] +add v12.4s, v12.4s, v15.4s +str q12, [x0, #64] +mla v29.4S, v0.4S, v31.s[0] +sub v0.4s, v9.4s, v26.4s +str q0, [x0, #112] +mla v28.4S, v6.4S, v31.s[0] +add v9.4s, v9.4s, v26.4s +str q9, [x0, #96] +sqrdmulh v8.4S, v14.4S, v10.s[0] +sub v1.4s, v11.4s, v2.4s +mul v14.4S, v14.4S,v20.s[0] +str q1, [x0, #144] +ldr q1, [x0, #304] +sqrdmulh v9.4S, v1.4S, v10.s[0] +add v11.4s, v11.4s, v2.4s +mul v1.4S, v1.4S,v20.s[0] +str q11, [x0, #128] +ldr q11, [x17, #+288] 
+ldr q2, [x17, #+304] +ldr q26, [x0, #352] +sqrdmulh v6.4S, v26.4S, v2.s[0] +sub v0.4s, v5.4s, v17.4s +mul v26.4S, v26.4S,v11.s[0] +str q0, [x0, #176] +ldr q0, [x0, #368] +sqrdmulh v12.4S, v0.4S, v2.s[0] +add v5.4s, v5.4s, v17.4s +mul v0.4S, v0.4S,v11.s[0] +str q5, [x0, #160] +ldr q5, [x17, #+320] +ldr q17, [x17, #+336] +mla v14.4S, v8.4S, v31.s[0] +sub v8.4s, v4.4s, v29.4s +sqrdmulh v15.4S, v27.4S, v17.s[0] +str q8, [x0, #208] +ldr q8, [x0, #432] +mla v1.4S, v9.4S, v31.s[0] +add v4.4s, v4.4s, v29.4s +sqrdmulh v29.4S, v8.4S, v17.s[0] +str q4, [x0, #192] +ldr q4, [x17, #+352] +ldr q9, [x17, #+368] +mla v26.4S, v6.4S, v31.s[0] +sub v6.4s, v7.4s, v28.4s +sqrdmulh v16.4S, v30.4S, v9.s[0] +str q6, [x0, #240] +ldr q6, [x0, #496] +mla v0.4S, v12.4S, v31.s[0] +add v7.4s, v7.4s, v28.4s +sqrdmulh v28.4S, v6.4S, v9.s[0] +str q7, [x0, #224] +ldr q7, [x0, #256] +ldr q12, [x0, #384] +mul v27.4S, v27.4S,v5.s[0] +sub v24.4s, v7.4s, v14.4s +ldr q25, [x0, #272] +mul v8.4S, v8.4S,v5.s[0] +add v7.4s, v7.4s, v14.4s +ldr q14, [x0, #400] +mla v27.4S, v15.4S, v31.s[0] +sub v15.4s, v25.4s, v1.4s +ldr q23, [x0, #320] +mla v8.4S, v29.4S, v31.s[0] +add v25.4s, v25.4s, v1.4s +ldr q1, [x0, #448] +mul v30.4S, v30.4S,v4.s[0] +sub v29.4s, v23.4s, v26.4s +ldr q22, [x0, #336] +mul v6.4S, v6.4S,v4.s[0] +add v23.4s, v23.4s, v26.4s +ldr q26, [x0, #464] +mla v30.4S, v16.4S, v31.s[0] +mla v6.4S, v28.4S, v31.s[0] +sub v28.4s, v22.4s, v0.4s +sqrdmulh v16.4S, v25.4S, v10.s[1] +add v22.4s, v22.4s, v0.4s +mul v25.4S, v25.4S,v20.s[1] +sqrdmulh v0.4S, v15.4S, v10.s[2] +sub v18.4s, v12.4s, v27.4s +mul v15.4S, v15.4S,v20.s[2] +add v12.4s, v12.4s, v27.4s +sqrdmulh v10.4S, v22.4S, v2.s[1] +sub v20.4s, v14.4s, v8.4s +mul v22.4S, v22.4S,v11.s[1] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v28.4S, v2.s[2] +sub v27.4s, v1.4s, v30.4s +mul v28.4S, v28.4S,v11.s[2] +add v1.4s, v1.4s, v30.4s +mla v25.4S, v16.4S, v31.s[0] +sub v16.4s, v26.4s, v6.4s +ldr q2, [x0, #736] +sqrdmulh v11.4S, v14.4S, v17.s[1] +add v26.4s, v26.4s, 
v6.4s +mla v15.4S, v0.4S, v31.s[0] +ldr q0, [x0, #672] +sqrdmulh v6.4S, v20.4S, v17.s[2] +sub v30.4s, v7.4s, v25.4s +mla v22.4S, v10.4S, v31.s[0] +ldr q10, [x0, #544] +sqrdmulh v3.4S, v26.4S, v9.s[1] +add v7.4s, v7.4s, v25.4s +str q30, [x0, #272] +mla v28.4S, v8.4S, v31.s[0] +ldr q8, [x17, #+384] +ldr q30, [x17, #+400] +sqrdmulh v25.4S, v16.4S, v9.s[2] +sub v13.4s, v24.4s, v15.4s +str q7, [x0, #256] +mul v14.4S, v14.4S,v5.s[1] +add v24.4s, v24.4s, v15.4s +mul v20.4S, v20.4S,v5.s[2] +str q13, [x0, #304] +mla v14.4S, v11.4S, v31.s[0] +sub v11.4s, v23.4s, v22.4s +mla v20.4S, v6.4S, v31.s[0] +str q24, [x0, #288] +mul v26.4S, v26.4S,v4.s[1] +str q11, [x0, #336] +mul v16.4S, v16.4S,v4.s[2] +add v23.4s, v23.4s, v22.4s +str q23, [x0, #320] +mla v26.4S, v3.4S, v31.s[0] +sub v3.4s, v29.4s, v28.4s +str q3, [x0, #368] +mla v16.4S, v25.4S, v31.s[0] +add v29.4s, v29.4s, v28.4s +str q29, [x0, #352] +sqrdmulh v9.4S, v10.4S, v30.s[0] +sub v4.4s, v12.4s, v14.4s +mul v10.4S, v10.4S,v8.s[0] +str q4, [x0, #400] +ldr q4, [x0, #560] +sqrdmulh v29.4S, v4.4S, v30.s[0] +add v12.4s, v12.4s, v14.4s +mul v4.4S, v4.4S,v8.s[0] +str q12, [x0, #384] +ldr q12, [x17, #+416] +ldr q14, [x17, #+432] +ldr q28, [x0, #608] +sqrdmulh v25.4S, v28.4S, v14.s[0] +sub v3.4s, v18.4s, v20.4s +mul v28.4S, v28.4S,v12.s[0] +str q3, [x0, #432] +ldr q3, [x0, #624] +sqrdmulh v23.4S, v3.4S, v14.s[0] +add v18.4s, v18.4s, v20.4s +mul v3.4S, v3.4S,v12.s[0] +str q18, [x0, #416] +ldr q18, [x17, #+448] +ldr q20, [x17, #+464] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v1.4s, v26.4s +sqrdmulh v22.4S, v0.4S, v20.s[0] +str q9, [x0, #464] +ldr q9, [x0, #688] +mla v4.4S, v29.4S, v31.s[0] +add v1.4s, v1.4s, v26.4s +sqrdmulh v26.4S, v9.4S, v20.s[0] +str q1, [x0, #448] +ldr q1, [x17, #+480] +ldr q29, [x17, #+496] +mla v28.4S, v25.4S, v31.s[0] +sub v25.4s, v27.4s, v16.4s +sqrdmulh v11.4S, v2.4S, v29.s[0] +str q25, [x0, #496] +ldr q25, [x0, #752] +mla v3.4S, v23.4S, v31.s[0] +add v27.4s, v27.4s, v16.4s +sqrdmulh v16.4S, v25.4S, v29.s[0] 
+str q27, [x0, #480] +ldr q27, [x0, #512] +ldr q23, [x0, #640] +mul v0.4S, v0.4S,v18.s[0] +sub v17.4s, v27.4s, v10.4s +ldr q5, [x0, #528] +mul v9.4S, v9.4S,v18.s[0] +add v27.4s, v27.4s, v10.4s +ldr q10, [x0, #656] +mla v0.4S, v22.4S, v31.s[0] +sub v22.4s, v5.4s, v4.4s +ldr q24, [x0, #576] +mla v9.4S, v26.4S, v31.s[0] +add v5.4s, v5.4s, v4.4s +ldr q4, [x0, #704] +mul v2.4S, v2.4S,v1.s[0] +sub v26.4s, v24.4s, v28.4s +ldr q6, [x0, #592] +mul v25.4S, v25.4S,v1.s[0] +add v24.4s, v24.4s, v28.4s +ldr q28, [x0, #720] +mla v2.4S, v11.4S, v31.s[0] +mla v25.4S, v16.4S, v31.s[0] +sub v16.4s, v6.4s, v3.4s +sqrdmulh v11.4S, v5.4S, v30.s[1] +add v6.4s, v6.4s, v3.4s +mul v5.4S, v5.4S,v8.s[1] +sqrdmulh v3.4S, v22.4S, v30.s[2] +sub v13.4s, v23.4s, v0.4s +mul v22.4S, v22.4S,v8.s[2] +add v23.4s, v23.4s, v0.4s +sqrdmulh v30.4S, v6.4S, v14.s[1] +sub v8.4s, v10.4s, v9.4s +mul v6.4S, v6.4S,v12.s[1] +add v10.4s, v10.4s, v9.4s +sqrdmulh v9.4S, v16.4S, v14.s[2] +sub v0.4s, v4.4s, v2.4s +mul v16.4S, v16.4S,v12.s[2] +add v4.4s, v4.4s, v2.4s +mla v5.4S, v11.4S, v31.s[0] +sub v11.4s, v28.4s, v25.4s +ldr q14, [x0, #992] +sqrdmulh v12.4S, v10.4S, v20.s[1] +add v28.4s, v28.4s, v25.4s +mla v22.4S, v3.4S, v31.s[0] +ldr q3, [x0, #928] +sqrdmulh v25.4S, v8.4S, v20.s[2] +sub v2.4s, v27.4s, v5.4s +mla v6.4S, v30.4S, v31.s[0] +ldr q30, [x0, #800] +sqrdmulh v15.4S, v28.4S, v29.s[1] +add v27.4s, v27.4s, v5.4s +str q2, [x0, #528] +mla v16.4S, v9.4S, v31.s[0] +ldr q9, [x17, #+512] +ldr q2, [x17, #+528] +sqrdmulh v5.4S, v11.4S, v29.s[2] +sub v7.4s, v17.4s, v22.4s +str q27, [x0, #512] +mul v10.4S, v10.4S,v18.s[1] +add v17.4s, v17.4s, v22.4s +mul v8.4S, v8.4S,v18.s[2] +str q7, [x0, #560] +mla v10.4S, v12.4S, v31.s[0] +sub v12.4s, v24.4s, v6.4s +mla v8.4S, v25.4S, v31.s[0] +str q17, [x0, #544] +mul v28.4S, v28.4S,v1.s[1] +str q12, [x0, #592] +mul v11.4S, v11.4S,v1.s[2] +add v24.4s, v24.4s, v6.4s +str q24, [x0, #576] +mla v28.4S, v15.4S, v31.s[0] +sub v15.4s, v26.4s, v16.4s +str q15, [x0, #624] +mla v11.4S, v5.4S, 
v31.s[0] +add v26.4s, v26.4s, v16.4s +str q26, [x0, #608] +sqrdmulh v29.4S, v30.4S, v2.s[0] +sub v1.4s, v23.4s, v10.4s +mul v30.4S, v30.4S,v9.s[0] +str q1, [x0, #656] +ldr q1, [x0, #816] +sqrdmulh v26.4S, v1.4S, v2.s[0] +add v23.4s, v23.4s, v10.4s +mul v1.4S, v1.4S,v9.s[0] +str q23, [x0, #640] +ldr q23, [x17, #+544] +ldr q10, [x17, #+560] +ldr q16, [x0, #864] +sqrdmulh v5.4S, v16.4S, v10.s[0] +sub v15.4s, v13.4s, v8.4s +mul v16.4S, v16.4S,v23.s[0] +str q15, [x0, #688] +ldr q15, [x0, #880] +sqrdmulh v24.4S, v15.4S, v10.s[0] +add v13.4s, v13.4s, v8.4s +mul v15.4S, v15.4S,v23.s[0] +str q13, [x0, #672] +ldr q13, [x17, #+576] +ldr q8, [x17, #+592] +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v4.4s, v28.4s +sqrdmulh v6.4S, v3.4S, v8.s[0] +str q29, [x0, #720] +ldr q29, [x0, #944] +mla v1.4S, v26.4S, v31.s[0] +add v4.4s, v4.4s, v28.4s +sqrdmulh v28.4S, v29.4S, v8.s[0] +str q4, [x0, #704] +ldr q4, [x17, #+608] +ldr q26, [x17, #+624] +mla v16.4S, v5.4S, v31.s[0] +sub v5.4s, v0.4s, v11.4s +sqrdmulh v12.4S, v14.4S, v26.s[0] +str q5, [x0, #752] +ldr q5, [x0, #1008] +mla v15.4S, v24.4S, v31.s[0] +add v0.4s, v0.4s, v11.4s +sqrdmulh v11.4S, v5.4S, v26.s[0] +str q0, [x0, #736] +ldr q0, [x0, #768] +ldr q24, [x0, #896] +mul v3.4S, v3.4S,v13.s[0] +sub v20.4s, v0.4s, v30.4s +ldr q18, [x0, #784] +mul v29.4S, v29.4S,v13.s[0] +add v0.4s, v0.4s, v30.4s +ldr q30, [x0, #912] +mla v3.4S, v6.4S, v31.s[0] +sub v6.4s, v18.4s, v1.4s +ldr q17, [x0, #832] +mla v29.4S, v28.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +ldr q1, [x0, #960] +mul v14.4S, v14.4S,v4.s[0] +sub v28.4s, v17.4s, v16.4s +ldr q25, [x0, #848] +mul v5.4S, v5.4S,v4.s[0] +add v17.4s, v17.4s, v16.4s +ldr q16, [x0, #976] +mla v14.4S, v12.4S, v31.s[0] +mla v5.4S, v11.4S, v31.s[0] +sub v11.4s, v25.4s, v15.4s +sqrdmulh v12.4S, v18.4S, v2.s[1] +add v25.4s, v25.4s, v15.4s +mul v18.4S, v18.4S,v9.s[1] +sqrdmulh v15.4S, v6.4S, v2.s[2] +sub v7.4s, v24.4s, v3.4s +mul v6.4S, v6.4S,v9.s[2] +add v24.4s, v24.4s, v3.4s +sqrdmulh v2.4S, v25.4S, v10.s[1] 
+sub v9.4s, v30.4s, v29.4s +mul v25.4S, v25.4S,v23.s[1] +add v30.4s, v30.4s, v29.4s +sqrdmulh v29.4S, v11.4S, v10.s[2] +sub v3.4s, v1.4s, v14.4s +mul v11.4S, v11.4S,v23.s[2] +add v1.4s, v1.4s, v14.4s +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v16.4s, v5.4s +sqrdmulh v10.4S, v30.4S, v8.s[1] +add v16.4s, v16.4s, v5.4s +mla v6.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v9.4S, v8.s[2] +sub v5.4s, v0.4s, v18.4s +mla v25.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v16.4S, v26.s[1] +add v0.4s, v0.4s, v18.4s +str q5, [x0, #784] +mla v11.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v12.4S, v26.s[2] +sub v5.4s, v20.4s, v6.4s +str q0, [x0, #768] +mul v30.4S, v30.4S,v13.s[1] +add v20.4s, v20.4s, v6.4s +mul v9.4S, v9.4S,v13.s[2] +str q5, [x0, #816] +mla v30.4S, v10.4S, v31.s[0] +sub v10.4s, v17.4s, v25.4s +mla v9.4S, v15.4S, v31.s[0] +str q20, [x0, #800] +mul v16.4S, v16.4S,v4.s[1] +str q10, [x0, #848] +mul v12.4S, v12.4S,v4.s[2] +add v17.4s, v17.4s, v25.4s +str q17, [x0, #832] +mla v16.4S, v2.4S, v31.s[0] +sub v2.4s, v28.4s, v11.4s +str q2, [x0, #880] +mla v12.4S, v29.4S, v31.s[0] +add v28.4s, v28.4s, v11.4s +str q28, [x0, #864] +sub v26.4s, v24.4s, v30.4s +str q26, [x0, #912] +add v24.4s, v24.4s, v30.4s +str q24, [x0, #896] +sub v24.4s, v7.4s, v9.4s +str q24, [x0, #944] +add v7.4s, v7.4s, v9.4s +str q7, [x0, #928] +sub v7.4s, v1.4s, v16.4s +str q7, [x0, #976] +add v1.4s, v1.4s, v16.4s +str q1, [x0, #960] +sub v1.4s, v3.4s, v12.4s +str q1, [x0, #1008] +add v3.4s, v3.4s, v12.4s +str q3, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git 
a/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_9_z4_7.s b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_9_z4_7.s new file mode 100644 index 0000000..c98a11f --- /dev/null +++ b/asm/auto/ntt_neon/ntt_u32_incomplete_33556993_28678040_var_4_2_9_z4_7.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+///
+
+#include
+modulus: // Reduction constant: lane 0 is -33556993 (the 33556993 from the file name, negated), lanes 1-3 are zero. The routine that follows loads this into q31 and the visible code only references v31.s[0], as the multiplier of the mla steps.
+.word -33556993
+.word 0
+.word 0
+.word 0
+.align 6 // NOTE(review): with GNU as on AArch64 this should request 2^6 = 64-byte alignment for the table below; confirm for the toolchain in use.
+roots_merged: // Twiddle table, read 16 bytes (4 words) at a time through x17. Entries come in groups of 8 words: 4 constants, then 4 companion values tagged with the same layer/block. The routine below uses the first quad (offset 0) as the mul operand and the second quad (offset 16) as the sqrdmulh operand. NOTE(review): each companion looks like round(constant * 2^31 / 33556993); confirm against the generator. Zero words tagged "Layer None" fill unused lanes.
+.word 17702291 // Layer 0, block 0
+.word 3260327 // Layer 1, block 0
+.word 14579576 // Layer 1, block 1
+.word 0 // Layer None, block None
+.word 1132860160 // Layer 0, block 0
+.word 208645003 // Layer 1, block 0
+.word 933021652 // Layer 1, block 1
+.word 0 // Layer None, block None
+.word 6733847 // Layer 2, block 0
+.word 12909577 // Layer 2, block 1
+.word 14745691 // Layer 2, block 2
+.word 13512548 // Layer 2, block 3
+.word 430933318 // Layer 2, block 0
+.word 826149873 // Layer 2, block 1
+.word 943652201 // Layer 2, block 2
+.word 864737072 // Layer 2, block 3
+.word 20428075 // Layer 3, block 0
+.word 14626653 // Layer 3, block 1
+.word 29737761 // Layer 3, block 2
+.word 30285189 // Layer 3, block 3
+.word 1307297022 // Layer 3, block 0
+.word 936034350 // Layer 3, block 1
+.word 1903071454 // Layer 3, block 2
+.word 1938104173 // Layer 3, block 3
+.word 21289485 // Layer 3, block 4
+.word 9914896 // Layer 3, block 5
+.word 22603682 // Layer 3, block 6
+.word 16204162 // Layer 3, block 7
+.word 1362423055 // Layer 3, block 4
+.word 634504916 // Layer 3, block 5
+.word 1446525244 // Layer 3, block 6
+.word 1036987221 // Layer 3, block 7
+.word 23825509 // Layer 4, block 0
+.word 9010590 // Layer 5, block 0
+.word 20699126 // Layer 5, block 1
+.word 0 // Layer None, block None
+.word 1524716204 // Layer 4, block 0
+.word 576633749 // Layer 5, block 0
+.word 1324642962 // Layer 5, block 1
+.word 0 // Layer None, block None
+.word 27028662 // Layer 4, block 1
+.word 341080 // Layer 5, block 2
+.word 21220783 // Layer 5, block 3
+.word 0 // Layer None, block None
+.word 1729702351 // Layer 4, block 1
+.word 21827454 // Layer 5, block 2
+.word 1358026462 // Layer 5, block 3
+.word 0 // Layer None, block None
+.word 14833295 // Layer 4, block 2
+.word 25331745 // Layer 5, block 4
+.word 5289426 // Layer 5, block 5
+.word 0 // Layer None, block None
+.word 949258429 // Layer 4, block 2
+.word 1621107951 // Layer 5, block 4
+.word 338497429 // Layer 5, block 5
+.word 0 // Layer None, block None
+.word 2138810 // Layer 4, block 3
+.word 5705868 // Layer 5, block 6
+.word 17686665 // Layer 5, block 7
+.word 0 // Layer None, block None
+.word 136873393 // Layer 4, block 3
+.word 365147683 // Layer 5, block 6
+.word 1131860172 // Layer 5, block 7
+.word 0 // Layer None, block None
+.word 6490403 // Layer 4, block 4
+.word 9106105 // Layer 5, block 8
+.word 18817700 // Layer 5, block 9
+.word 0 // Layer None, block None
+.word 415354091 // Layer 4, block 4
+.word 582746243 // Layer 5, block 8
+.word 1204240888 // Layer 5, block 9
+.word 0 // Layer None, block None
+.word 19648405 // Layer 4, block 5
+.word 1579445 // Layer 5, block 10
+.word 7769916 // Layer 5, block 11
+.word 0 // Layer None, block None
+.word 1257401950 // Layer 4, block 5
+.word 101076765 // Layer 5, block 10
+.word 497236673 // Layer 5, block 11
+.word 0 // Layer None, block None
+.word 31254932 // Layer 4, block 6
+.word 21843119 // Layer 5, block 12
+.word 11828796 // Layer 5, block 13
+.word 0 // Layer None, block None
+.word 2000162988 // Layer 4, block 6
+.word 1397852927 // Layer 5, block 12
+.word 756985168 // Layer 5, block 13
+.word 0 // Layer None, block None
+.word 26362414 // Layer 4, block 7
+.word 19828530 // Layer 5, block 14
+.word 33201112 // Layer 5, block 15
+.word 0 // Layer None, block None
+.word 1687065733 // Layer 4, block 7
+.word 1268929071 // Layer 5, block 14
+.word 2124709002 // Layer 5, block 15
+.word 0 // Layer None, block None
+.word 572895 // Layer 4, block 8
+.word 23713020 // Layer 5, block 16
+.word 19537976 // Layer 5, block 17
+.word 0 // Layer None, block None
+.word 36662482 // Layer 4, block 8
+.word 1517517457 // Layer 5, block 16
+.word 1250335034 // Layer 5, block 17
+.word 0 // Layer None, block None
+.word 26691971 // Layer 4, block 9
+.word 8285889 // Layer 5, block 18
+.word 24690028 // Layer 5, block 19
+.word 0 // Layer None, block None
+.word 1708155771 // Layer 4, block 9
+.word 530256425 // Layer 5, block 18
+.word 1580041197 // Layer 5, block 19
+.word 0 // Layer None, block None
+.word 9249292 // Layer 4, block 10
+.word 4778209 // Layer 5, block 20
+.word 13113327 // Layer 5, block 21
+.word 0 // Layer None, block None
+.word 591909511 // Layer 4, block 10
+.word 305782038 // Layer 5, block 20
+.word 839188878 // Layer 5, block 21
+.word 0 // Layer None, block None
+.word 29292862 // Layer 4, block 11
+.word 25384023 // Layer 5, block 22
+.word 10905370 // Layer 5, block 23
+.word 0 // Layer None, block None
+.word 1874600091 // Layer 4, block 11
+.word 1624453488 // Layer 5, block 22
+.word 697890414 // Layer 5, block 23
+.word 0 // Layer None, block None
+.word 8247799 // Layer 4, block 12
+.word 16167867 // Layer 5, block 24
+.word 22046437 // Layer 5, block 25
+.word 0 // Layer None, block None
+.word 527818851 // Layer 4, block 12
+.word 1034664519 // Layer 5, block 24
+.word 1410864286 // Layer 5, block 25
+.word 0 // Layer None, block None
+.word 5086187 // Layer 4, block 13
+.word 656361 // Layer 5, block 26
+.word 18153794 // Layer 5, block 27
+.word 0 // Layer None, block None
+.word 325491125 // Layer 4, block 13
+.word 42003898 // Layer 5, block 26
+.word 1161754147 // Layer 5, block 27
+.word 0 // Layer None, block None
+.word 28113639 // Layer 4, block 14
+.word 3732072 // Layer 5, block 28
+.word 22126384 // Layer 5, block 29
+.word 0 // Layer None, block None
+.word 1799135579 // Layer 4, block 14
+.word 238834379 // Layer 5, block 28
+.word 1415980503 // Layer 5, block 29
+.word 0 // Layer None, block None
+.word 8471290 // Layer 4, block 15
+.word 9445744 // Layer 5, block 30
+.word 794839 // Layer 5, block 31
+.word 0 // Layer None, block None
+.word 542121183 // Layer 4, block 15
+.word 604481480 // Layer 5, block 30
+.word 50865814 // Layer 5, block 31
+.word 0 // Layer None, block None
+.text
+.global
ntt_u32_incomplete_neon_asm_var_4_2_9_z4_7 +.global _ntt_u32_incomplete_neon_asm_var_4_2_9_z4_7 +ntt_u32_incomplete_neon_asm_var_4_2_9_z4_7: +_ntt_u32_incomplete_neon_asm_var_4_2_9_z4_7: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #928] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #992] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q18, [x0, #800] +sqrdmulh v17.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +ldr q16, [x0, #864] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v22.4S, v21.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +mla v18.4S, v17.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +ldr q3, [x0, #544] +sqrdmulh v17.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q19, [x0, #608] +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q2, [x0, #672] +ldr q1, [x0, #416] +sqrdmulh v0.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v15.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #736] +ldr q14, [x0, #480] +sqrdmulh v13.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v12.4s, v14.4s, v20.4s +add v14.4s, v14.4s, v20.4s +ldr q20, [x0, #288] +mla v3.4S, v17.4S, v31.s[0] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v18.4s +mla v2.4S, v0.4S, v31.s[0] +mla v22.4S, v13.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +ldr q18, [x0, #352] +sqrdmulh v13.4S, v1.4S, 
v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v0.4s, v18.4s, v16.4s +sqrdmulh v17.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +add v18.4s, v18.4s, v16.4s +ldr q16, [x0, #32] +sqrdmulh v11.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v10.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #96] +sqrdmulh v9.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v8.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #160] +mla v1.4S, v13.4S, v31.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v2.4s +mla v20.4S, v11.4S, v31.s[0] +mla v18.4S, v9.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +ldr q2, [x0, #224] +sqrdmulh v9.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v11.4s, v2.4s, v22.4s +sqrdmulh v13.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v7.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +sub v6.4s, v2.4s, v14.4s +add v2.4s, v2.4s, v14.4s +mla v15.4S, v9.4S, v31.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v20.4s +mla v21.4S, v22.4S, v31.s[0] +mla v0.4S, v1.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +sub v1.4s, v3.4s, v18.4s +sqrdmulh v22.4S, v6.4S, v27.s[1] +mul v6.4S, v6.4S,v28.s[1] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v27.s[0] +mul v19.4S, v19.4S,v28.s[0] +sub v9.4s, v17.4s, v15.4s +add v17.4s, v17.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v14.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +mla v7.4S, v20.4S, v31.s[0] +mla v6.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v21.4s +mla v19.4S, v18.4S, v31.s[0] +mla v2.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v15.4s, v8.4s, v0.4s +sqrdmulh v18.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +add v8.4s, v8.4s, v0.4s +sqrdmulh 
v0.4S, v9.4S, v27.s[3] +mul v9.4S, v9.4S,v28.s[3] +sub v20.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[3] +mul v14.4S, v14.4S,v28.s[3] +sub v12.4s, v1.4s, v6.4s +add v1.4s, v1.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v16.4s, v19.4s +mla v9.4S, v0.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v25.s[2] +mul v1.4S, v1.4S,v26.s[2] +sub v7.4s, v3.4s, v2.4s +sqrdmulh v0.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v7.4S, v25.s[1] +mul v7.4S, v7.4S,v26.s[1] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v25.s[0] +mul v3.4S, v3.4S,v26.s[0] +sub v6.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v1.4S, v19.4S, v31.s[0] +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v9.4s +mla v7.4S, v2.4S, v31.s[0] +mla v3.4S, v17.4S, v31.s[0] +add v22.4s, v22.4s, v9.4s +sqrdmulh v9.4S, v8.4S, v23.s[0] +mul v8.4S, v8.4S,v24.s[0] +sub v17.4s, v15.4s, v14.4s +sqrdmulh v2.4S, v6.4S, v23.s[1] +mul v6.4S, v6.4S,v24.s[1] +add v15.4s, v15.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v23.s[2] +mul v15.4S, v15.4S,v24.s[2] +sub v19.4s, v13.4s, v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +sub v11.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +mla v8.4S, v9.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v18.4s, v7.4s +str q13, [x0, #288] +mla v15.4S, v14.4S, v31.s[0] +mla v17.4S, v1.4S, v31.s[0] +add v18.4s, v18.4s, v7.4s +str q19, [x0, #352] +ldr q19, [x0, #944] +sqrdmulh v7.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +str q20, [x0, #416] +sub v20.4s, v16.4s, v3.4s +ldr q1, [x0, #1008] +sqrdmulh v14.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +str q11, [x0, #480] +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #816] +sqrdmulh v11.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +sub v13.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +ldr q8, [x0, 
#880] +sqrdmulh v9.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v12.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +mla v19.4S, v7.4S, v31.s[0] +mla v1.4S, v14.4S, v31.s[0] +str q18, [x0, #160] +sub v18.4s, v22.4s, v15.4s +mla v3.4S, v11.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +str q2, [x0, #224] +add v22.4s, v22.4s, v15.4s +ldr q15, [x0, #560] +sqrdmulh v2.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +str q16, [x0, #32] +sub v16.4s, v0.4s, v17.4s +ldr q9, [x0, #624] +sqrdmulh v11.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +str q20, [x0, #96] +add v0.4s, v0.4s, v17.4s +ldr q17, [x0, #688] +ldr q20, [x0, #432] +sqrdmulh v14.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] +sub v7.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +ldr q19, [x0, #752] +ldr q6, [x0, #496] +sqrdmulh v5.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v4.4s, v6.4s, v1.4s +add v6.4s, v6.4s, v1.4s +ldr q1, [x0, #304] +mla v15.4S, v2.4S, v31.s[0] +mla v9.4S, v11.4S, v31.s[0] +str q10, [x0, #544] +sub v10.4s, v1.4s, v3.4s +mla v17.4S, v14.4S, v31.s[0] +mla v19.4S, v5.4S, v31.s[0] +str q13, [x0, #608] +add v1.4s, v1.4s, v3.4s +ldr q3, [x0, #368] +sqrdmulh v13.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +str q21, [x0, #672] +sub v21.4s, v3.4s, v8.4s +sqrdmulh v5.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +str q12, [x0, #736] +add v3.4s, v3.4s, v8.4s +ldr q8, [x0, #48] +sqrdmulh v12.4S, v1.4S, v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v14.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +ldr q15, [x0, #112] +sqrdmulh v11.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v2.4s, v15.4s, v9.4s +add v15.4s, v15.4s, v9.4s +ldr q9, [x0, #176] +mla v20.4S, v13.4S, v31.s[0] +mla v6.4S, v5.4S, v31.s[0] +str q22, [x0, #800] +sub v22.4s, v9.4s, v17.4s +mla v1.4S, v12.4S, v31.s[0] +mla v3.4S, v11.4S, v31.s[0] +str q18, [x0, #864] +add v9.4s, v9.4s, v17.4s +ldr q17, [x0, #240] +sqrdmulh v18.4S, v7.4S, v29.s[2] +mul v7.4S, v7.4S,v30.s[2] +str q0, [x0, #928] +sub v0.4s, v17.4s, v19.4s 
+sqrdmulh v11.4S, v4.4S, v29.s[2] +mul v4.4S, v4.4S,v30.s[2] +str q16, [x0, #992] +add v17.4s, v17.4s, v19.4s +sqrdmulh v19.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v16.4s, v9.4s, v20.4s +add v9.4s, v9.4s, v20.4s +sqrdmulh v20.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v12.4s, v17.4s, v6.4s +add v17.4s, v17.4s, v6.4s +mla v7.4S, v18.4S, v31.s[0] +mla v4.4S, v11.4S, v31.s[0] +sub v11.4s, v8.4s, v1.4s +mla v10.4S, v19.4S, v31.s[0] +mla v21.4S, v20.4S, v31.s[0] +add v8.4s, v8.4s, v1.4s +sqrdmulh v1.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +sub v20.4s, v15.4s, v3.4s +sqrdmulh v19.4S, v12.4S, v27.s[1] +mul v12.4S, v12.4S,v28.s[1] +add v15.4s, v15.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v27.s[0] +mul v9.4S, v9.4S,v28.s[0] +sub v18.4s, v22.4s, v7.4s +add v22.4s, v22.4s, v7.4s +sqrdmulh v7.4S, v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +sub v6.4s, v0.4s, v4.4s +add v0.4s, v0.4s, v4.4s +mla v16.4S, v1.4S, v31.s[0] +mla v12.4S, v19.4S, v31.s[0] +sub v19.4s, v14.4s, v10.4s +mla v9.4S, v3.4S, v31.s[0] +mla v17.4S, v7.4S, v31.s[0] +add v14.4s, v14.4s, v10.4s +sqrdmulh v10.4S, v22.4S, v27.s[2] +mul v22.4S, v22.4S,v28.s[2] +sub v7.4s, v2.4s, v21.4s +sqrdmulh v3.4S, v0.4S, v27.s[2] +mul v0.4S, v0.4S,v28.s[2] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v1.4s, v11.4s, v16.4s +add v11.4s, v11.4s, v16.4s +sqrdmulh v16.4S, v6.4S, v27.s[3] +mul v6.4S, v6.4S,v28.s[3] +sub v4.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +mla v22.4S, v10.4S, v31.s[0] +mla v0.4S, v3.4S, v31.s[0] +sub v3.4s, v8.4s, v9.4s +mla v18.4S, v21.4S, v31.s[0] +mla v6.4S, v16.4S, v31.s[0] +add v8.4s, v8.4s, v9.4s +sqrdmulh v9.4S, v20.4S, v25.s[2] +mul v20.4S, v20.4S,v26.s[2] +sub v16.4s, v15.4s, v17.4s +sqrdmulh v21.4S, v4.4S, v25.s[3] +mul v4.4S, v4.4S,v26.s[3] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v16.4S, v25.s[1] +mul v16.4S, v16.4S,v26.s[1] +sub v10.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +sqrdmulh v22.4S, v15.4S, 
v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +sub v12.4s, v2.4s, v0.4s +add v2.4s, v2.4s, v0.4s +mla v20.4S, v9.4S, v31.s[0] +mla v4.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v18.4s +mla v16.4S, v17.4S, v31.s[0] +mla v15.4S, v22.4S, v31.s[0] +add v19.4s, v19.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v23.s[0] +mul v2.4S, v2.4S,v24.s[0] +sub v22.4s, v7.4s, v6.4s +sqrdmulh v17.4S, v12.4S, v23.s[1] +mul v12.4S, v12.4S,v24.s[1] +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v7.4S, v23.s[2] +mul v7.4S, v7.4S,v24.s[2] +sub v9.4s, v11.4s, v20.4s +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v22.4S, v23.s[3] +mul v22.4S, v22.4S,v24.s[3] +sub v0.4s, v1.4s, v4.4s +add v1.4s, v1.4s, v4.4s +mla v2.4S, v18.4S, v31.s[0] +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v3.4s, v16.4s +str q11, [x0, #304] +mla v7.4S, v6.4S, v31.s[0] +mla v22.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v16.4s +str q9, [x0, #368] +ldr q9, [x0, #896] +sqrdmulh v16.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +str q1, [x0, #432] +sub v1.4s, v8.4s, v15.4s +ldr q20, [x0, #960] +sqrdmulh v6.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +str q0, [x0, #496] +add v8.4s, v8.4s, v15.4s +ldr q15, [x0, #768] +sqrdmulh v0.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +sub v11.4s, v14.4s, v2.4s +add v14.4s, v14.4s, v2.4s +ldr q2, [x0, #832] +sqrdmulh v18.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v4.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +mla v9.4S, v16.4S, v31.s[0] +mla v20.4S, v6.4S, v31.s[0] +str q3, [x0, #176] +sub v3.4s, v19.4s, v7.4s +mla v15.4S, v0.4S, v31.s[0] +mla v2.4S, v18.4S, v31.s[0] +str q17, [x0, #240] +add v19.4s, v19.4s, v7.4s +ldr q7, [x0, #512] +sqrdmulh v17.4S, v7.4S, v29.s[0] +mul v7.4S, v7.4S,v30.s[0] +str q8, [x0, #48] +sub v8.4s, v21.4s, v22.4s +ldr q18, [x0, #576] +sqrdmulh v0.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +str q1, [x0, #112] +add v21.4s, v21.4s, v22.4s +ldr q22, [x0, #640] +ldr q1, [x0, #384] +sqrdmulh v6.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v16.4s, 
v1.4s, v9.4s +add v1.4s, v1.4s, v9.4s +ldr q9, [x0, #704] +ldr q12, [x0, #448] +sqrdmulh v5.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +sub v13.4s, v12.4s, v20.4s +add v12.4s, v12.4s, v20.4s +ldr q20, [x0, #256] +mla v7.4S, v17.4S, v31.s[0] +mla v18.4S, v0.4S, v31.s[0] +str q14, [x0, #560] +sub v14.4s, v20.4s, v15.4s +mla v22.4S, v6.4S, v31.s[0] +mla v9.4S, v5.4S, v31.s[0] +str q11, [x0, #624] +add v20.4s, v20.4s, v15.4s +ldr q15, [x0, #320] +sqrdmulh v11.4S, v1.4S, v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +str q10, [x0, #688] +sub v10.4s, v15.4s, v2.4s +sqrdmulh v5.4S, v12.4S, v29.s[1] +mul v12.4S, v12.4S,v30.s[1] +str q4, [x0, #752] +add v15.4s, v15.4s, v2.4s +ldr q2, [x0, #0] +sqrdmulh v4.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v6.4s, v2.4s, v7.4s +add v2.4s, v2.4s, v7.4s +ldr q7, [x0, #64] +sqrdmulh v0.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +sub v17.4s, v7.4s, v18.4s +add v7.4s, v7.4s, v18.4s +ldr q18, [x0, #128] +mla v1.4S, v11.4S, v31.s[0] +mla v12.4S, v5.4S, v31.s[0] +str q19, [x0, #816] +sub v19.4s, v18.4s, v22.4s +mla v20.4S, v4.4S, v31.s[0] +mla v15.4S, v0.4S, v31.s[0] +str q3, [x0, #880] +add v18.4s, v18.4s, v22.4s +ldr q22, [x0, #192] +sqrdmulh v3.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +str q21, [x0, #944] +sub v21.4s, v22.4s, v9.4s +sqrdmulh v0.4S, v13.4S, v29.s[2] +mul v13.4S, v13.4S,v30.s[2] +str q8, [x0, #1008] +add v22.4s, v22.4s, v9.4s +sqrdmulh v9.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v8.4s, v18.4s, v1.4s +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v4.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +mla v16.4S, v3.4S, v31.s[0] +mla v13.4S, v0.4S, v31.s[0] +sub v0.4s, v2.4s, v20.4s +mla v14.4S, v9.4S, v31.s[0] +mla v10.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v1.4s, v7.4s, v15.4s +sqrdmulh v9.4S, v4.4S, v27.s[1] +mul v4.4S, v4.4S,v28.s[1] +add v7.4s, v7.4s, v15.4s +sqrdmulh 
v15.4S, v18.4S, v27.s[0] +mul v18.4S, v18.4S,v28.s[0] +sub v3.4s, v19.4s, v16.4s +add v19.4s, v19.4s, v16.4s +sqrdmulh v16.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +sub v12.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +mla v8.4S, v20.4S, v31.s[0] +mla v4.4S, v9.4S, v31.s[0] +sub v9.4s, v6.4s, v14.4s +mla v18.4S, v15.4S, v31.s[0] +mla v22.4S, v16.4S, v31.s[0] +add v6.4s, v6.4s, v14.4s +sqrdmulh v14.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v28.s[2] +sub v16.4s, v17.4s, v10.4s +sqrdmulh v15.4S, v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +add v17.4s, v17.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +sub v20.4s, v0.4s, v8.4s +add v0.4s, v0.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v27.s[3] +mul v12.4S, v12.4S,v28.s[3] +sub v13.4s, v1.4s, v4.4s +add v1.4s, v1.4s, v4.4s +mla v19.4S, v14.4S, v31.s[0] +mla v21.4S, v15.4S, v31.s[0] +sub v15.4s, v2.4s, v18.4s +mla v3.4S, v10.4S, v31.s[0] +mla v12.4S, v8.4S, v31.s[0] +add v2.4s, v2.4s, v18.4s +sqrdmulh v18.4S, v1.4S, v25.s[2] +mul v1.4S, v1.4S,v26.s[2] +sub v8.4s, v7.4s, v22.4s +sqrdmulh v10.4S, v13.4S, v25.s[3] +mul v13.4S, v13.4S,v26.s[3] +add v7.4s, v7.4s, v22.4s +sqrdmulh v22.4S, v8.4S, v25.s[1] +mul v8.4S, v8.4S,v26.s[1] +sub v14.4s, v6.4s, v19.4s +add v6.4s, v6.4s, v19.4s +sqrdmulh v19.4S, v7.4S, v25.s[0] +mul v7.4S, v7.4S,v26.s[0] +sub v4.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +mla v1.4S, v18.4S, v31.s[0] +mla v13.4S, v10.4S, v31.s[0] +sub v10.4s, v9.4s, v3.4s +mla v8.4S, v22.4S, v31.s[0] +mla v7.4S, v19.4S, v31.s[0] +add v9.4s, v9.4s, v3.4s +sqrdmulh v3.4S, v17.4S, v23.s[0] +mul v17.4S, v17.4S,v24.s[0] +sub v19.4s, v16.4s, v12.4s +sqrdmulh v22.4S, v4.4S, v23.s[1] +mul v4.4S, v4.4S,v24.s[1] +add v16.4s, v16.4s, v12.4s +sqrdmulh v12.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +sub v18.4s, v0.4s, v1.4s +add v0.4s, v0.4s, v1.4s +sqrdmulh v1.4S, v19.4S, v23.s[3] +mul v19.4S, v19.4S,v24.s[3] +sub v21.4s, v20.4s, v13.4s +add v20.4s, v20.4s, v13.4s +mla v17.4S, v3.4S, v31.s[0] 
+mla v4.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v8.4s +str q0, [x0, #256] +mla v16.4S, v12.4S, v31.s[0] +mla v19.4S, v1.4S, v31.s[0] +add v15.4s, v15.4s, v8.4s +str q18, [x0, #320] +ldr q18, [x0, #912] +sqrdmulh v8.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +str q20, [x0, #384] +sub v20.4s, v2.4s, v7.4s +ldr q1, [x0, #976] +sqrdmulh v12.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +str q21, [x0, #448] +add v2.4s, v2.4s, v7.4s +ldr q7, [x0, #784] +sqrdmulh v21.4S, v7.4S, v29.s[0] +mul v7.4S, v7.4S,v30.s[0] +sub v0.4s, v6.4s, v17.4s +add v6.4s, v6.4s, v17.4s +ldr q17, [x0, #848] +sqrdmulh v3.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] +sub v13.4s, v14.4s, v4.4s +add v14.4s, v14.4s, v4.4s +mla v18.4S, v8.4S, v31.s[0] +mla v1.4S, v12.4S, v31.s[0] +str q15, [x0, #128] +sub v15.4s, v9.4s, v16.4s +mla v7.4S, v21.4S, v31.s[0] +mla v17.4S, v3.4S, v31.s[0] +str q22, [x0, #192] +add v9.4s, v9.4s, v16.4s +ldr q16, [x0, #528] +sqrdmulh v22.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +str q2, [x0, #0] +sub v2.4s, v10.4s, v19.4s +ldr q3, [x0, #592] +sqrdmulh v21.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +str q20, [x0, #64] +add v10.4s, v10.4s, v19.4s +ldr q19, [x0, #656] +ldr q20, [x0, #400] +sqrdmulh v12.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v8.4s, v20.4s, v18.4s +add v20.4s, v20.4s, v18.4s +ldr q18, [x0, #720] +ldr q4, [x0, #464] +sqrdmulh v5.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +sub v11.4s, v4.4s, v1.4s +add v4.4s, v4.4s, v1.4s +ldr q1, [x0, #272] +mla v16.4S, v22.4S, v31.s[0] +mla v3.4S, v21.4S, v31.s[0] +str q6, [x0, #512] +sub v6.4s, v1.4s, v7.4s +mla v19.4S, v12.4S, v31.s[0] +mla v18.4S, v5.4S, v31.s[0] +str q0, [x0, #576] +add v1.4s, v1.4s, v7.4s +ldr q7, [x0, #336] +sqrdmulh v0.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +str q14, [x0, #640] +sub v14.4s, v7.4s, v17.4s +sqrdmulh v5.4S, v4.4S, v29.s[1] +mul v4.4S, v4.4S,v30.s[1] +str q13, [x0, #704] +add v7.4s, v7.4s, v17.4s +ldr q17, [x0, #16] +sqrdmulh 
v13.4S, v1.4S, v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v12.4s, v17.4s, v16.4s +add v17.4s, v17.4s, v16.4s +ldr q16, [x0, #80] +sqrdmulh v21.4S, v7.4S, v29.s[1] +mul v7.4S, v7.4S,v30.s[1] +sub v22.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #144] +mla v20.4S, v0.4S, v31.s[0] +mla v4.4S, v5.4S, v31.s[0] +str q9, [x0, #768] +sub v9.4s, v3.4s, v19.4s +mla v1.4S, v13.4S, v31.s[0] +mla v7.4S, v21.4S, v31.s[0] +str q15, [x0, #832] +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #208] +sqrdmulh v15.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +str q10, [x0, #896] +sub v10.4s, v19.4s, v18.4s +sqrdmulh v21.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +str q2, [x0, #960] +add v19.4s, v19.4s, v18.4s +sqrdmulh v18.4S, v6.4S, v29.s[2] +mul v6.4S, v6.4S,v30.s[2] +sub v2.4s, v3.4s, v20.4s +add v3.4s, v3.4s, v20.4s +sqrdmulh v20.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v13.4s, v19.4s, v4.4s +add v19.4s, v19.4s, v4.4s +mla v8.4S, v15.4S, v31.s[0] +mla v11.4S, v21.4S, v31.s[0] +sub v21.4s, v17.4s, v1.4s +mla v6.4S, v18.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +add v17.4s, v17.4s, v1.4s +sqrdmulh v1.4S, v2.4S, v27.s[1] +mul v2.4S, v2.4S,v28.s[1] +sub v20.4s, v16.4s, v7.4s +sqrdmulh v18.4S, v13.4S, v27.s[1] +mul v13.4S, v13.4S,v28.s[1] +add v16.4s, v16.4s, v7.4s +sqrdmulh v7.4S, v3.4S, v27.s[0] +mul v3.4S, v3.4S,v28.s[0] +sub v15.4s, v9.4s, v8.4s +add v9.4s, v9.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v27.s[0] +mul v19.4S, v19.4S,v28.s[0] +sub v4.4s, v10.4s, v11.4s +add v10.4s, v10.4s, v11.4s +mla v2.4S, v1.4S, v31.s[0] +mla v13.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v6.4s +mla v3.4S, v7.4S, v31.s[0] +mla v19.4S, v8.4S, v31.s[0] +add v12.4s, v12.4s, v6.4s +sqrdmulh v6.4S, v9.4S, v27.s[2] +mul v9.4S, v9.4S,v28.s[2] +sub v8.4s, v22.4s, v14.4s +sqrdmulh v7.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +add v22.4s, v22.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v1.4s, v21.4s, v2.4s +add v21.4s, v21.4s, v2.4s +sqrdmulh 
v2.4S, v4.4S, v27.s[3] +mul v4.4S, v4.4S,v28.s[3] +sub v11.4s, v20.4s, v13.4s +add v20.4s, v20.4s, v13.4s +mla v9.4S, v6.4S, v31.s[0] +mla v10.4S, v7.4S, v31.s[0] +sub v7.4s, v17.4s, v3.4s +mla v15.4S, v14.4S, v31.s[0] +mla v4.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v20.4S, v25.s[2] +mul v20.4S, v20.4S,v26.s[2] +sub v2.4s, v16.4s, v19.4s +sqrdmulh v14.4S, v11.4S, v25.s[3] +mul v11.4S, v11.4S,v26.s[3] +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v2.4S, v25.s[1] +mul v2.4S, v2.4S,v26.s[1] +sub v6.4s, v12.4s, v9.4s +add v12.4s, v12.4s, v9.4s +sqrdmulh v9.4S, v16.4S, v25.s[0] +mul v16.4S, v16.4S,v26.s[0] +sub v13.4s, v22.4s, v10.4s +add v22.4s, v22.4s, v10.4s +mla v20.4S, v3.4S, v31.s[0] +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v18.4s, v15.4s +mla v2.4S, v19.4S, v31.s[0] +mla v16.4S, v9.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v22.4S, v23.s[0] +mul v22.4S, v22.4S,v24.s[0] +sub v9.4s, v8.4s, v4.4s +sqrdmulh v19.4S, v13.4S, v23.s[1] +mul v13.4S, v13.4S,v24.s[1] +add v8.4s, v8.4s, v4.4s +sqrdmulh v4.4S, v8.4S, v23.s[2] +mul v8.4S, v8.4S,v24.s[2] +sub v3.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v9.4S, v23.s[3] +mul v9.4S, v9.4S,v24.s[3] +sub v10.4s, v1.4s, v11.4s +add v1.4s, v1.4s, v11.4s +mla v22.4S, v15.4S, v31.s[0] +mla v13.4S, v19.4S, v31.s[0] +sub v19.4s, v7.4s, v2.4s +str q21, [x0, #272] +mla v8.4S, v4.4S, v31.s[0] +mla v9.4S, v20.4S, v31.s[0] +add v7.4s, v7.4s, v2.4s +str q3, [x0, #336] +str q1, [x0, #400] +sub v1.4s, v17.4s, v16.4s +str q10, [x0, #464] +add v17.4s, v17.4s, v16.4s +sub v16.4s, v12.4s, v22.4s +add v12.4s, v12.4s, v22.4s +sub v22.4s, v6.4s, v13.4s +add v6.4s, v6.4s, v13.4s +str q7, [x0, #144] +sub v7.4s, v18.4s, v8.4s +str q19, [x0, #208] +add v18.4s, v18.4s, v8.4s +str q17, [x0, #16] +sub v17.4s, v14.4s, v9.4s +str q1, [x0, #80] +add v14.4s, v14.4s, v9.4s +str q12, [x0, #528] +str q16, [x0, #592] +str q6, [x0, #656] +str q22, [x0, #720] +str q18, [x0, #784] +str q7, [x0, #848] +str 
q14, [x0, #912] +str q17, [x0, #976] +ldr q0, [x0, #224] +ldr q5, [x0, #160] +ldr q11, [x0, #32] +ldr q15, [x17, #+128] +ldr q21, [x17, #+144] +sqrdmulh v4.4S, v11.4S, v21.s[0] +mul v11.4S, v11.4S,v15.s[0] +ldr q20, [x0, #48] +sqrdmulh v2.4S, v20.4S, v21.s[0] +mul v20.4S, v20.4S,v15.s[0] +ldr q3, [x17, #+160] +ldr q30, [x17, #+176] +ldr q29, [x0, #96] +sqrdmulh v28.4S, v29.4S, v30.s[0] +mul v29.4S, v29.4S,v3.s[0] +ldr q27, [x0, #112] +sqrdmulh v26.4S, v27.4S, v30.s[0] +mul v27.4S, v27.4S,v3.s[0] +ldr q25, [x17, #+192] +ldr q24, [x17, #+208] +mla v11.4S, v4.4S, v31.s[0] +sqrdmulh v4.4S, v5.4S, v24.s[0] +ldr q23, [x0, #176] +mla v20.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v23.4S, v24.s[0] +ldr q10, [x17, #+224] +ldr q13, [x17, #+240] +mla v29.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v0.4S, v13.s[0] +ldr q19, [x0, #240] +mla v27.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v19.4S, v13.s[0] +ldr q8, [x0, #0] +ldr q1, [x0, #128] +mul v5.4S, v5.4S,v25.s[0] +sub v9.4s, v8.4s, v11.4s +ldr q12, [x0, #16] +mul v23.4S, v23.4S,v25.s[0] +add v8.4s, v8.4s, v11.4s +ldr q11, [x0, #144] +mla v5.4S, v4.4S, v31.s[0] +sub v4.4s, v12.4s, v20.4s +ldr q16, [x0, #64] +mla v23.4S, v2.4S, v31.s[0] +add v12.4s, v12.4s, v20.4s +ldr q20, [x0, #192] +mul v0.4S, v0.4S,v10.s[0] +sub v2.4s, v16.4s, v29.4s +ldr q6, [x0, #80] +mul v19.4S, v19.4S,v10.s[0] +add v16.4s, v16.4s, v29.4s +ldr q29, [x0, #208] +mla v0.4S, v28.4S, v31.s[0] +mla v19.4S, v26.4S, v31.s[0] +sub v26.4s, v6.4s, v27.4s +sqrdmulh v28.4S, v12.4S, v21.s[1] +add v6.4s, v6.4s, v27.4s +mul v12.4S, v12.4S,v15.s[1] +sqrdmulh v27.4S, v4.4S, v21.s[2] +sub v22.4s, v1.4s, v5.4s +mul v4.4S, v4.4S,v15.s[2] +add v1.4s, v1.4s, v5.4s +sqrdmulh v21.4S, v6.4S, v30.s[1] +sub v15.4s, v11.4s, v23.4s +mul v6.4S, v6.4S,v3.s[1] +add v11.4s, v11.4s, v23.4s +sqrdmulh v23.4S, v26.4S, v30.s[2] +sub v5.4s, v20.4s, v0.4s +mul v26.4S, v26.4S,v3.s[2] +add v20.4s, v20.4s, v0.4s +mla v12.4S, v28.4S, v31.s[0] +sub v28.4s, v29.4s, v19.4s +ldr q30, [x0, #480] +sqrdmulh v3.4S, 
v11.4S, v24.s[1] +add v29.4s, v29.4s, v19.4s +mla v4.4S, v27.4S, v31.s[0] +ldr q27, [x0, #416] +sqrdmulh v19.4S, v15.4S, v24.s[2] +sub v0.4s, v8.4s, v12.4s +mla v6.4S, v21.4S, v31.s[0] +ldr q21, [x0, #288] +sqrdmulh v18.4S, v29.4S, v13.s[1] +add v8.4s, v8.4s, v12.4s +str q0, [x0, #16] +mla v26.4S, v23.4S, v31.s[0] +ldr q23, [x17, #+256] +ldr q0, [x17, #+272] +sqrdmulh v12.4S, v28.4S, v13.s[2] +sub v7.4s, v9.4s, v4.4s +str q8, [x0, #0] +mul v11.4S, v11.4S,v25.s[1] +add v9.4s, v9.4s, v4.4s +mul v15.4S, v15.4S,v25.s[2] +str q7, [x0, #48] +mla v11.4S, v3.4S, v31.s[0] +sub v3.4s, v16.4s, v6.4s +mla v15.4S, v19.4S, v31.s[0] +str q9, [x0, #32] +mul v29.4S, v29.4S,v10.s[1] +str q3, [x0, #80] +mul v28.4S, v28.4S,v10.s[2] +add v16.4s, v16.4s, v6.4s +str q16, [x0, #64] +mla v29.4S, v18.4S, v31.s[0] +sub v18.4s, v2.4s, v26.4s +str q18, [x0, #112] +mla v28.4S, v12.4S, v31.s[0] +add v2.4s, v2.4s, v26.4s +str q2, [x0, #96] +sqrdmulh v13.4S, v21.4S, v0.s[0] +sub v10.4s, v1.4s, v11.4s +mul v21.4S, v21.4S,v23.s[0] +str q10, [x0, #144] +ldr q10, [x0, #304] +sqrdmulh v2.4S, v10.4S, v0.s[0] +add v1.4s, v1.4s, v11.4s +mul v10.4S, v10.4S,v23.s[0] +str q1, [x0, #128] +ldr q1, [x17, #+288] +ldr q11, [x17, #+304] +ldr q26, [x0, #352] +sqrdmulh v12.4S, v26.4S, v11.s[0] +sub v18.4s, v22.4s, v15.4s +mul v26.4S, v26.4S,v1.s[0] +str q18, [x0, #176] +ldr q18, [x0, #368] +sqrdmulh v16.4S, v18.4S, v11.s[0] +add v22.4s, v22.4s, v15.4s +mul v18.4S, v18.4S,v1.s[0] +str q22, [x0, #160] +ldr q22, [x17, #+320] +ldr q15, [x17, #+336] +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v29.4s +sqrdmulh v6.4S, v27.4S, v15.s[0] +str q13, [x0, #208] +ldr q13, [x0, #432] +mla v10.4S, v2.4S, v31.s[0] +add v20.4s, v20.4s, v29.4s +sqrdmulh v29.4S, v13.4S, v15.s[0] +str q20, [x0, #192] +ldr q20, [x17, #+352] +ldr q2, [x17, #+368] +mla v26.4S, v12.4S, v31.s[0] +sub v12.4s, v5.4s, v28.4s +sqrdmulh v3.4S, v30.4S, v2.s[0] +str q12, [x0, #240] +ldr q12, [x0, #496] +mla v18.4S, v16.4S, v31.s[0] +add v5.4s, v5.4s, v28.4s 
+sqrdmulh v28.4S, v12.4S, v2.s[0] +str q5, [x0, #224] +ldr q5, [x0, #256] +ldr q16, [x0, #384] +mul v27.4S, v27.4S,v22.s[0] +sub v24.4s, v5.4s, v21.4s +ldr q25, [x0, #272] +mul v13.4S, v13.4S,v22.s[0] +add v5.4s, v5.4s, v21.4s +ldr q21, [x0, #400] +mla v27.4S, v6.4S, v31.s[0] +sub v6.4s, v25.4s, v10.4s +ldr q9, [x0, #320] +mla v13.4S, v29.4S, v31.s[0] +add v25.4s, v25.4s, v10.4s +ldr q10, [x0, #448] +mul v30.4S, v30.4S,v20.s[0] +sub v29.4s, v9.4s, v26.4s +ldr q19, [x0, #336] +mul v12.4S, v12.4S,v20.s[0] +add v9.4s, v9.4s, v26.4s +ldr q26, [x0, #464] +mla v30.4S, v3.4S, v31.s[0] +mla v12.4S, v28.4S, v31.s[0] +sub v28.4s, v19.4s, v18.4s +sqrdmulh v3.4S, v25.4S, v0.s[1] +add v19.4s, v19.4s, v18.4s +mul v25.4S, v25.4S,v23.s[1] +sqrdmulh v18.4S, v6.4S, v0.s[2] +sub v7.4s, v16.4s, v27.4s +mul v6.4S, v6.4S,v23.s[2] +add v16.4s, v16.4s, v27.4s +sqrdmulh v0.4S, v19.4S, v11.s[1] +sub v23.4s, v21.4s, v13.4s +mul v19.4S, v19.4S,v1.s[1] +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v28.4S, v11.s[2] +sub v27.4s, v10.4s, v30.4s +mul v28.4S, v28.4S,v1.s[2] +add v10.4s, v10.4s, v30.4s +mla v25.4S, v3.4S, v31.s[0] +sub v3.4s, v26.4s, v12.4s +ldr q11, [x0, #736] +sqrdmulh v1.4S, v21.4S, v15.s[1] +add v26.4s, v26.4s, v12.4s +mla v6.4S, v18.4S, v31.s[0] +ldr q18, [x0, #672] +sqrdmulh v12.4S, v23.4S, v15.s[2] +sub v30.4s, v5.4s, v25.4s +mla v19.4S, v0.4S, v31.s[0] +ldr q0, [x0, #544] +sqrdmulh v4.4S, v26.4S, v2.s[1] +add v5.4s, v5.4s, v25.4s +str q30, [x0, #272] +mla v28.4S, v13.4S, v31.s[0] +ldr q13, [x17, #+384] +ldr q30, [x17, #+400] +sqrdmulh v25.4S, v3.4S, v2.s[2] +sub v8.4s, v24.4s, v6.4s +str q5, [x0, #256] +mul v21.4S, v21.4S,v22.s[1] +add v24.4s, v24.4s, v6.4s +mul v23.4S, v23.4S,v22.s[2] +str q8, [x0, #304] +mla v21.4S, v1.4S, v31.s[0] +sub v1.4s, v9.4s, v19.4s +mla v23.4S, v12.4S, v31.s[0] +str q24, [x0, #288] +mul v26.4S, v26.4S,v20.s[1] +str q1, [x0, #336] +mul v3.4S, v3.4S,v20.s[2] +add v9.4s, v9.4s, v19.4s +str q9, [x0, #320] +mla v26.4S, v4.4S, v31.s[0] +sub v4.4s, 
v29.4s, v28.4s +str q4, [x0, #368] +mla v3.4S, v25.4S, v31.s[0] +add v29.4s, v29.4s, v28.4s +str q29, [x0, #352] +sqrdmulh v2.4S, v0.4S, v30.s[0] +sub v20.4s, v16.4s, v21.4s +mul v0.4S, v0.4S,v13.s[0] +str q20, [x0, #400] +ldr q20, [x0, #560] +sqrdmulh v29.4S, v20.4S, v30.s[0] +add v16.4s, v16.4s, v21.4s +mul v20.4S, v20.4S,v13.s[0] +str q16, [x0, #384] +ldr q16, [x17, #+416] +ldr q21, [x17, #+432] +ldr q28, [x0, #608] +sqrdmulh v25.4S, v28.4S, v21.s[0] +sub v4.4s, v7.4s, v23.4s +mul v28.4S, v28.4S,v16.s[0] +str q4, [x0, #432] +ldr q4, [x0, #624] +sqrdmulh v9.4S, v4.4S, v21.s[0] +add v7.4s, v7.4s, v23.4s +mul v4.4S, v4.4S,v16.s[0] +str q7, [x0, #416] +ldr q7, [x17, #+448] +ldr q23, [x17, #+464] +mla v0.4S, v2.4S, v31.s[0] +sub v2.4s, v10.4s, v26.4s +sqrdmulh v19.4S, v18.4S, v23.s[0] +str q2, [x0, #464] +ldr q2, [x0, #688] +mla v20.4S, v29.4S, v31.s[0] +add v10.4s, v10.4s, v26.4s +sqrdmulh v26.4S, v2.4S, v23.s[0] +str q10, [x0, #448] +ldr q10, [x17, #+480] +ldr q29, [x17, #+496] +mla v28.4S, v25.4S, v31.s[0] +sub v25.4s, v27.4s, v3.4s +sqrdmulh v1.4S, v11.4S, v29.s[0] +str q25, [x0, #496] +ldr q25, [x0, #752] +mla v4.4S, v9.4S, v31.s[0] +add v27.4s, v27.4s, v3.4s +sqrdmulh v3.4S, v25.4S, v29.s[0] +str q27, [x0, #480] +ldr q27, [x0, #512] +ldr q9, [x0, #640] +mul v18.4S, v18.4S,v7.s[0] +sub v15.4s, v27.4s, v0.4s +ldr q22, [x0, #528] +mul v2.4S, v2.4S,v7.s[0] +add v27.4s, v27.4s, v0.4s +ldr q0, [x0, #656] +mla v18.4S, v19.4S, v31.s[0] +sub v19.4s, v22.4s, v20.4s +ldr q24, [x0, #576] +mla v2.4S, v26.4S, v31.s[0] +add v22.4s, v22.4s, v20.4s +ldr q20, [x0, #704] +mul v11.4S, v11.4S,v10.s[0] +sub v26.4s, v24.4s, v28.4s +ldr q12, [x0, #592] +mul v25.4S, v25.4S,v10.s[0] +add v24.4s, v24.4s, v28.4s +ldr q28, [x0, #720] +mla v11.4S, v1.4S, v31.s[0] +mla v25.4S, v3.4S, v31.s[0] +sub v3.4s, v12.4s, v4.4s +sqrdmulh v1.4S, v22.4S, v30.s[1] +add v12.4s, v12.4s, v4.4s +mul v22.4S, v22.4S,v13.s[1] +sqrdmulh v4.4S, v19.4S, v30.s[2] +sub v8.4s, v9.4s, v18.4s +mul v19.4S, 
v19.4S,v13.s[2] +add v9.4s, v9.4s, v18.4s +sqrdmulh v30.4S, v12.4S, v21.s[1] +sub v13.4s, v0.4s, v2.4s +mul v12.4S, v12.4S,v16.s[1] +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v21.s[2] +sub v18.4s, v20.4s, v11.4s +mul v3.4S, v3.4S,v16.s[2] +add v20.4s, v20.4s, v11.4s +mla v22.4S, v1.4S, v31.s[0] +sub v1.4s, v28.4s, v25.4s +ldr q21, [x0, #992] +sqrdmulh v16.4S, v0.4S, v23.s[1] +add v28.4s, v28.4s, v25.4s +mla v19.4S, v4.4S, v31.s[0] +ldr q4, [x0, #928] +sqrdmulh v25.4S, v13.4S, v23.s[2] +sub v11.4s, v27.4s, v22.4s +mla v12.4S, v30.4S, v31.s[0] +ldr q30, [x0, #800] +sqrdmulh v6.4S, v28.4S, v29.s[1] +add v27.4s, v27.4s, v22.4s +str q11, [x0, #528] +mla v3.4S, v2.4S, v31.s[0] +ldr q2, [x17, #+512] +ldr q11, [x17, #+528] +sqrdmulh v22.4S, v1.4S, v29.s[2] +sub v5.4s, v15.4s, v19.4s +str q27, [x0, #512] +mul v0.4S, v0.4S,v7.s[1] +add v15.4s, v15.4s, v19.4s +mul v13.4S, v13.4S,v7.s[2] +str q5, [x0, #560] +mla v0.4S, v16.4S, v31.s[0] +sub v16.4s, v24.4s, v12.4s +mla v13.4S, v25.4S, v31.s[0] +str q15, [x0, #544] +mul v28.4S, v28.4S,v10.s[1] +str q16, [x0, #592] +mul v1.4S, v1.4S,v10.s[2] +add v24.4s, v24.4s, v12.4s +str q24, [x0, #576] +mla v28.4S, v6.4S, v31.s[0] +sub v6.4s, v26.4s, v3.4s +str q6, [x0, #624] +mla v1.4S, v22.4S, v31.s[0] +add v26.4s, v26.4s, v3.4s +str q26, [x0, #608] +sqrdmulh v29.4S, v30.4S, v11.s[0] +sub v10.4s, v9.4s, v0.4s +mul v30.4S, v30.4S,v2.s[0] +str q10, [x0, #656] +ldr q10, [x0, #816] +sqrdmulh v26.4S, v10.4S, v11.s[0] +add v9.4s, v9.4s, v0.4s +mul v10.4S, v10.4S,v2.s[0] +str q9, [x0, #640] +ldr q9, [x17, #+544] +ldr q0, [x17, #+560] +ldr q3, [x0, #864] +sqrdmulh v22.4S, v3.4S, v0.s[0] +sub v6.4s, v8.4s, v13.4s +mul v3.4S, v3.4S,v9.s[0] +str q6, [x0, #688] +ldr q6, [x0, #880] +sqrdmulh v24.4S, v6.4S, v0.s[0] +add v8.4s, v8.4s, v13.4s +mul v6.4S, v6.4S,v9.s[0] +str q8, [x0, #672] +ldr q8, [x17, #+576] +ldr q13, [x17, #+592] +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v20.4s, v28.4s +sqrdmulh v12.4S, v4.4S, v13.s[0] +str q29, [x0, #720] +ldr 
q29, [x0, #944] +mla v10.4S, v26.4S, v31.s[0] +add v20.4s, v20.4s, v28.4s +sqrdmulh v28.4S, v29.4S, v13.s[0] +str q20, [x0, #704] +ldr q20, [x17, #+608] +ldr q26, [x17, #+624] +mla v3.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v1.4s +sqrdmulh v16.4S, v21.4S, v26.s[0] +str q22, [x0, #752] +ldr q22, [x0, #1008] +mla v6.4S, v24.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v22.4S, v26.s[0] +str q18, [x0, #736] +ldr q18, [x0, #768] +ldr q24, [x0, #896] +mul v4.4S, v4.4S,v8.s[0] +sub v23.4s, v18.4s, v30.4s +ldr q7, [x0, #784] +mul v29.4S, v29.4S,v8.s[0] +add v18.4s, v18.4s, v30.4s +ldr q30, [x0, #912] +mla v4.4S, v12.4S, v31.s[0] +sub v12.4s, v7.4s, v10.4s +ldr q15, [x0, #832] +mla v29.4S, v28.4S, v31.s[0] +add v7.4s, v7.4s, v10.4s +ldr q10, [x0, #960] +mul v21.4S, v21.4S,v20.s[0] +sub v28.4s, v15.4s, v3.4s +ldr q25, [x0, #848] +mul v22.4S, v22.4S,v20.s[0] +add v15.4s, v15.4s, v3.4s +ldr q3, [x0, #976] +mla v21.4S, v16.4S, v31.s[0] +mla v22.4S, v1.4S, v31.s[0] +sub v1.4s, v25.4s, v6.4s +sqrdmulh v16.4S, v7.4S, v11.s[1] +add v25.4s, v25.4s, v6.4s +mul v7.4S, v7.4S,v2.s[1] +sqrdmulh v6.4S, v12.4S, v11.s[2] +sub v5.4s, v24.4s, v4.4s +mul v12.4S, v12.4S,v2.s[2] +add v24.4s, v24.4s, v4.4s +sqrdmulh v11.4S, v25.4S, v0.s[1] +sub v2.4s, v30.4s, v29.4s +mul v25.4S, v25.4S,v9.s[1] +add v30.4s, v30.4s, v29.4s +sqrdmulh v29.4S, v1.4S, v0.s[2] +sub v4.4s, v10.4s, v21.4s +mul v1.4S, v1.4S,v9.s[2] +add v10.4s, v10.4s, v21.4s +mla v7.4S, v16.4S, v31.s[0] +sub v16.4s, v3.4s, v22.4s +sqrdmulh v0.4S, v30.4S, v13.s[1] +add v3.4s, v3.4s, v22.4s +mla v12.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v2.4S, v13.s[2] +sub v22.4s, v18.4s, v7.4s +mla v25.4S, v11.4S, v31.s[0] +sqrdmulh v11.4S, v3.4S, v26.s[1] +add v18.4s, v18.4s, v7.4s +str q22, [x0, #784] +mla v1.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v16.4S, v26.s[2] +sub v22.4s, v23.4s, v12.4s +str q18, [x0, #768] +mul v30.4S, v30.4S,v8.s[1] +add v23.4s, v23.4s, v12.4s +mul v2.4S, v2.4S,v8.s[2] +str q22, [x0, #816] +mla v30.4S, v0.4S, 
v31.s[0] +sub v0.4s, v15.4s, v25.4s +mla v2.4S, v6.4S, v31.s[0] +str q23, [x0, #800] +mul v3.4S, v3.4S,v20.s[1] +str q0, [x0, #848] +mul v16.4S, v16.4S,v20.s[2] +add v15.4s, v15.4s, v25.4s +str q15, [x0, #832] +mla v3.4S, v11.4S, v31.s[0] +sub v11.4s, v28.4s, v1.4s +str q11, [x0, #880] +mla v16.4S, v29.4S, v31.s[0] +add v28.4s, v28.4s, v1.4s +str q28, [x0, #864] +sub v26.4s, v24.4s, v30.4s +str q26, [x0, #912] +add v24.4s, v24.4s, v30.4s +str q24, [x0, #896] +sub v24.4s, v5.4s, v2.4s +str q24, [x0, #944] +add v5.4s, v5.4s, v2.4s +str q5, [x0, #928] +sub v5.4s, v10.4s, v3.4s +str q5, [x0, #976] +add v10.4s, v10.4s, v3.4s +str q10, [x0, #960] +sub v10.4s, v4.4s, v16.4s +str q10, [x0, #1008] +add v4.4s, v4.4s, v16.4s +str q4, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git a/asm/auto/ntt_sve2/ntt_u32_incomplete_33556993_28678040_var_3_3_0.s b/asm/auto/ntt_sve2/ntt_u32_incomplete_33556993_28678040_var_3_3_0.s new file mode 100644 index 0000000..1a3b890 --- /dev/null +++ b/asm/auto/ntt_sve2/ntt_u32_incomplete_33556993_28678040_var_3_3_0.s @@ -0,0 +1,1475 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, 
subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +modulus: // Four identical 32-bit lanes of -33556993; the routine in this file loads this vector into q31 and, in the visible part, uses it as the multiplier of its mla instructions (presumably the negated NTT modulus - TODO confirm against the generator) +.word -33556993 +.word -33556993 +.word -33556993 +.word -33556993 +.align 6 +roots_merged: // Constant table the routine in this file addresses through x17, 16 bytes per load. Each run of eight words is four values used as mul lane operands followed by four companions used as sqrdmulh lane operands; the companions look like round(value * 2^31 / 33556993) - TODO confirm against the generator +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 23825509 // Layer 4, block 0 +.word 27028662 // Layer 4, block 1 +.word 0 // Layer None, block None +.word 1307297022 // Layer 3, block 0 +.word 1524716204 // Layer 4, block 0 +.word 1729702351 // Layer 4, block 1 +.word 0 // Layer None, block None +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 21827454 // Layer 5, block
2 +.word 1358026462 // Layer 5, block 3 +.word 14626653 // Layer 3, block 1 +.word 14833295 // Layer 4, block 2 +.word 2138810 // Layer 4, block 3 +.word 0 // Layer None, block None +.word 936034350 // Layer 3, block 1 +.word 949258429 // Layer 4, block 2 +.word 136873393 // Layer 4, block 3 +.word 0 // Layer None, block None +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 29737761 // Layer 3, block 2 +.word 6490403 // Layer 4, block 4 +.word 19648405 // Layer 4, block 5 +.word 0 // Layer None, block None +.word 1903071454 // Layer 3, block 2 +.word 415354091 // Layer 4, block 4 +.word 1257401950 // Layer 4, block 5 +.word 0 // Layer None, block None +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 30285189 // Layer 3, block 3 +.word 31254932 // Layer 4, block 6 +.word 26362414 // Layer 4, block 7 +.word 0 // Layer None, block None +.word 1938104173 // Layer 3, block 3 +.word 2000162988 // Layer 4, block 6 +.word 1687065733 // Layer 4, block 7 +.word 0 // Layer None, block None +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 21289485 // Layer 3, block 4 +.word 572895 // Layer 4, block 8 +.word 26691971 // Layer 4, block 9 +.word 0 // Layer None, block None +.word 1362423055 // Layer 3, block 4 +.word 36662482 // 
Layer 4, block 8 +.word 1708155771 // Layer 4, block 9 +.word 0 // Layer None, block None +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 9914896 // Layer 3, block 5 +.word 9249292 // Layer 4, block 10 +.word 29292862 // Layer 4, block 11 +.word 0 // Layer None, block None +.word 634504916 // Layer 3, block 5 +.word 591909511 // Layer 4, block 10 +.word 1874600091 // Layer 4, block 11 +.word 0 // Layer None, block None +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 22603682 // Layer 3, block 6 +.word 8247799 // Layer 4, block 12 +.word 5086187 // Layer 4, block 13 +.word 0 // Layer None, block None +.word 1446525244 // Layer 3, block 6 +.word 527818851 // Layer 4, block 12 +.word 325491125 // Layer 4, block 13 +.word 0 // Layer None, block None +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 16204162 // Layer 3, block 7 +.word 28113639 // Layer 4, block 14 +.word 8471290 // Layer 4, block 15 +.word 0 // Layer None, block None +.word 1036987221 // Layer 3, block 7 +.word 1799135579 // Layer 4, block 14 +.word 542121183 // Layer 4, block 15 +.word 0 // Layer None, block None +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 9445744 // Layer 5, block 30 +.word 794839 // 
Layer 5, block 31 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.text +.type ntt_u32_incomplete_sve2_asm_var_3_3_0, %function +.global ntt_u32_incomplete_sve2_asm_var_3_3_0 +modulus_addr: .quad modulus +roots_merged_addr: .quad roots_merged +ntt_u32_incomplete_sve2_asm_var_3_3_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save SVE2 vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ldr x17, modulus_addr +ldr q31, [x17] +ptrue P0.s +ldr x17, roots_merged_addr +ldr q3, [x17, #+0] +ldr q2, [x17, #+16] +ldr q1, [x17, #+32] +ldr q0, [x17, #+48] +ldr q30, [x0, #960] +ldr q29, [x0, #832] +ldr q28, [x0, #576] +ldr q27, [x0, #704] +ldr q26, [x0, #448] +ldr q25, [x0, #320] +ldr q24, [x0, #64] +ldr q23, [x0, #192] +sqrdmulh z22.s, z30.s, z2.s[0] +mul z30.s, z30.s,z3.s[0] +mla z30.s, P0/M, z22.s, z31.s +sub z22.s, z26.s, z30.s +add z26.s, z26.s, z30.s +sqrdmulh z30.s, z29.s, z2.s[0] +mul z29.s, z29.s,z3.s[0] +mla z29.s, P0/M, z30.s, z31.s +sub z30.s, z25.s, z29.s +add z25.s, z25.s, z29.s +sqrdmulh z29.s, z28.s, z2.s[0] +mul z28.s, z28.s,z3.s[0] +mla z28.s, P0/M, z29.s, z31.s +sub z29.s, z24.s, z28.s +add z24.s, z24.s, z28.s +sqrdmulh z28.s, z27.s, z2.s[0] +mul z27.s, z27.s,z3.s[0] +mla z27.s, P0/M, z28.s, z31.s +sub z28.s, z23.s, z27.s +add z23.s, z23.s, z27.s +sqrdmulh z27.s, z26.s, z2.s[1] +mul z26.s, z26.s,z3.s[1] +mla z26.s, P0/M, z27.s, z31.s +sub z27.s, z23.s, z26.s +add z23.s, z23.s, z26.s +sqrdmulh z26.s, z25.s, z2.s[1] +mul z25.s, z25.s,z3.s[1] +mla z25.s, P0/M, z26.s, z31.s +sub z26.s, z24.s, z25.s +add z24.s, z24.s, z25.s +sqrdmulh z25.s, z22.s, z2.s[2] +mul z22.s, z22.s,z3.s[2] +mla 
z22.s, P0/M, z25.s, z31.s +sub z25.s, z28.s, z22.s +add z28.s, z28.s, z22.s +sqrdmulh z22.s, z30.s, z2.s[2] +mul z30.s, z30.s,z3.s[2] +mla z30.s, P0/M, z22.s, z31.s +sub z22.s, z29.s, z30.s +add z29.s, z29.s, z30.s +sqrdmulh z30.s, z23.s, z0.s[0] +mul z23.s, z23.s,z1.s[0] +mla z23.s, P0/M, z30.s, z31.s +sub z30.s, z24.s, z23.s +add z24.s, z24.s, z23.s +str q24, [x0, #64] +str q30, [x0, #192] +sqrdmulh z30.s, z27.s, z0.s[1] +mul z27.s, z27.s,z1.s[1] +mla z27.s, P0/M, z30.s, z31.s +sub z30.s, z26.s, z27.s +add z26.s, z26.s, z27.s +str q26, [x0, #320] +str q30, [x0, #448] +sqrdmulh z30.s, z25.s, z0.s[3] +mul z25.s, z25.s,z1.s[3] +mla z25.s, P0/M, z30.s, z31.s +sub z30.s, z22.s, z25.s +add z22.s, z22.s, z25.s +str q22, [x0, #832] +str q30, [x0, #960] +sqrdmulh z30.s, z28.s, z0.s[2] +mul z28.s, z28.s,z1.s[2] +mla z28.s, P0/M, z30.s, z31.s +sub z30.s, z29.s, z28.s +add z29.s, z29.s, z28.s +str q29, [x0, #576] +str q30, [x0, #704] +ldr q30, [x0, #976] +ldr q29, [x0, #848] +ldr q28, [x0, #592] +ldr q22, [x0, #720] +ldr q25, [x0, #464] +ldr q26, [x0, #336] +ldr q27, [x0, #80] +ldr q24, [x0, #208] +sqrdmulh z23.s, z30.s, z2.s[0] +mul z30.s, z30.s,z3.s[0] +mla z30.s, P0/M, z23.s, z31.s +sub z23.s, z25.s, z30.s +add z25.s, z25.s, z30.s +sqrdmulh z30.s, z29.s, z2.s[0] +mul z29.s, z29.s,z3.s[0] +mla z29.s, P0/M, z30.s, z31.s +sub z30.s, z26.s, z29.s +add z26.s, z26.s, z29.s +sqrdmulh z29.s, z28.s, z2.s[0] +mul z28.s, z28.s,z3.s[0] +mla z28.s, P0/M, z29.s, z31.s +sub z29.s, z27.s, z28.s +add z27.s, z27.s, z28.s +sqrdmulh z28.s, z22.s, z2.s[0] +mul z22.s, z22.s,z3.s[0] +mla z22.s, P0/M, z28.s, z31.s +sub z28.s, z24.s, z22.s +add z24.s, z24.s, z22.s +sqrdmulh z22.s, z25.s, z2.s[1] +mul z25.s, z25.s,z3.s[1] +mla z25.s, P0/M, z22.s, z31.s +sub z22.s, z24.s, z25.s +add z24.s, z24.s, z25.s +sqrdmulh z25.s, z26.s, z2.s[1] +mul z26.s, z26.s,z3.s[1] +mla z26.s, P0/M, z25.s, z31.s +sub z25.s, z27.s, z26.s +add z27.s, z27.s, z26.s +sqrdmulh z26.s, z23.s, z2.s[2] +mul z23.s, z23.s,z3.s[2] 
+mla z23.s, P0/M, z26.s, z31.s +sub z26.s, z28.s, z23.s +add z28.s, z28.s, z23.s +sqrdmulh z23.s, z30.s, z2.s[2] +mul z30.s, z30.s,z3.s[2] +mla z30.s, P0/M, z23.s, z31.s +sub z23.s, z29.s, z30.s +add z29.s, z29.s, z30.s +sqrdmulh z30.s, z24.s, z0.s[0] +mul z24.s, z24.s,z1.s[0] +mla z24.s, P0/M, z30.s, z31.s +sub z30.s, z27.s, z24.s +add z27.s, z27.s, z24.s +str q27, [x0, #80] +str q30, [x0, #208] +sqrdmulh z30.s, z22.s, z0.s[1] +mul z22.s, z22.s,z1.s[1] +mla z22.s, P0/M, z30.s, z31.s +sub z30.s, z25.s, z22.s +add z25.s, z25.s, z22.s +str q25, [x0, #336] +str q30, [x0, #464] +sqrdmulh z30.s, z26.s, z0.s[3] +mul z26.s, z26.s,z1.s[3] +mla z26.s, P0/M, z30.s, z31.s +sub z30.s, z23.s, z26.s +add z23.s, z23.s, z26.s +str q23, [x0, #848] +str q30, [x0, #976] +sqrdmulh z30.s, z28.s, z0.s[2] +mul z28.s, z28.s,z1.s[2] +mla z28.s, P0/M, z30.s, z31.s +sub z30.s, z29.s, z28.s +add z29.s, z29.s, z28.s +str q29, [x0, #592] +str q30, [x0, #720] +ldr q30, [x0, #992] +ldr q29, [x0, #864] +ldr q28, [x0, #608] +ldr q23, [x0, #736] +ldr q26, [x0, #480] +ldr q25, [x0, #352] +ldr q22, [x0, #96] +ldr q27, [x0, #224] +sqrdmulh z24.s, z30.s, z2.s[0] +mul z30.s, z30.s,z3.s[0] +mla z30.s, P0/M, z24.s, z31.s +sub z24.s, z26.s, z30.s +add z26.s, z26.s, z30.s +sqrdmulh z30.s, z29.s, z2.s[0] +mul z29.s, z29.s,z3.s[0] +mla z29.s, P0/M, z30.s, z31.s +sub z30.s, z25.s, z29.s +add z25.s, z25.s, z29.s +sqrdmulh z29.s, z28.s, z2.s[0] +mul z28.s, z28.s,z3.s[0] +mla z28.s, P0/M, z29.s, z31.s +sub z29.s, z22.s, z28.s +add z22.s, z22.s, z28.s +sqrdmulh z28.s, z23.s, z2.s[0] +mul z23.s, z23.s,z3.s[0] +mla z23.s, P0/M, z28.s, z31.s +sub z28.s, z27.s, z23.s +add z27.s, z27.s, z23.s +sqrdmulh z23.s, z26.s, z2.s[1] +mul z26.s, z26.s,z3.s[1] +mla z26.s, P0/M, z23.s, z31.s +sub z23.s, z27.s, z26.s +add z27.s, z27.s, z26.s +sqrdmulh z26.s, z25.s, z2.s[1] +mul z25.s, z25.s,z3.s[1] +mla z25.s, P0/M, z26.s, z31.s +sub z26.s, z22.s, z25.s +add z22.s, z22.s, z25.s +sqrdmulh z25.s, z24.s, z2.s[2] +mul z24.s, 
z24.s,z3.s[2] +mla z24.s, P0/M, z25.s, z31.s +sub z25.s, z28.s, z24.s +add z28.s, z28.s, z24.s +sqrdmulh z24.s, z30.s, z2.s[2] +mul z30.s, z30.s,z3.s[2] +mla z30.s, P0/M, z24.s, z31.s +sub z24.s, z29.s, z30.s +add z29.s, z29.s, z30.s +sqrdmulh z30.s, z27.s, z0.s[0] +mul z27.s, z27.s,z1.s[0] +mla z27.s, P0/M, z30.s, z31.s +sub z30.s, z22.s, z27.s +add z22.s, z22.s, z27.s +str q22, [x0, #96] +str q30, [x0, #224] +sqrdmulh z30.s, z23.s, z0.s[1] +mul z23.s, z23.s,z1.s[1] +mla z23.s, P0/M, z30.s, z31.s +sub z30.s, z26.s, z23.s +add z26.s, z26.s, z23.s +str q26, [x0, #352] +str q30, [x0, #480] +sqrdmulh z30.s, z25.s, z0.s[3] +mul z25.s, z25.s,z1.s[3] +mla z25.s, P0/M, z30.s, z31.s +sub z30.s, z24.s, z25.s +add z24.s, z24.s, z25.s +str q24, [x0, #864] +str q30, [x0, #992] +sqrdmulh z30.s, z28.s, z0.s[2] +mul z28.s, z28.s,z1.s[2] +mla z28.s, P0/M, z30.s, z31.s +sub z30.s, z29.s, z28.s +add z29.s, z29.s, z28.s +str q29, [x0, #608] +str q30, [x0, #736] +ldr q30, [x0, #1008] +ldr q29, [x0, #880] +ldr q28, [x0, #624] +ldr q24, [x0, #752] +ldr q25, [x0, #496] +ldr q26, [x0, #368] +ldr q23, [x0, #112] +ldr q22, [x0, #240] +sqrdmulh z27.s, z30.s, z2.s[0] +mul z30.s, z30.s,z3.s[0] +mla z30.s, P0/M, z27.s, z31.s +sub z27.s, z25.s, z30.s +add z25.s, z25.s, z30.s +sqrdmulh z30.s, z29.s, z2.s[0] +mul z29.s, z29.s,z3.s[0] +mla z29.s, P0/M, z30.s, z31.s +sub z30.s, z26.s, z29.s +add z26.s, z26.s, z29.s +sqrdmulh z29.s, z28.s, z2.s[0] +mul z28.s, z28.s,z3.s[0] +mla z28.s, P0/M, z29.s, z31.s +sub z29.s, z23.s, z28.s +add z23.s, z23.s, z28.s +sqrdmulh z28.s, z24.s, z2.s[0] +mul z24.s, z24.s,z3.s[0] +mla z24.s, P0/M, z28.s, z31.s +sub z28.s, z22.s, z24.s +add z22.s, z22.s, z24.s +sqrdmulh z24.s, z25.s, z2.s[1] +mul z25.s, z25.s,z3.s[1] +mla z25.s, P0/M, z24.s, z31.s +sub z24.s, z22.s, z25.s +add z22.s, z22.s, z25.s +sqrdmulh z25.s, z26.s, z2.s[1] +mul z26.s, z26.s,z3.s[1] +mla z26.s, P0/M, z25.s, z31.s +sub z25.s, z23.s, z26.s +add z23.s, z23.s, z26.s +sqrdmulh z26.s, z27.s, z2.s[2] +mul 
z27.s, z27.s,z3.s[2] +mla z27.s, P0/M, z26.s, z31.s +sub z26.s, z28.s, z27.s +add z28.s, z28.s, z27.s +sqrdmulh z27.s, z30.s, z2.s[2] +mul z30.s, z30.s,z3.s[2] +mla z30.s, P0/M, z27.s, z31.s +sub z27.s, z29.s, z30.s +add z29.s, z29.s, z30.s +sqrdmulh z30.s, z22.s, z0.s[0] +mul z22.s, z22.s,z1.s[0] +mla z22.s, P0/M, z30.s, z31.s +sub z30.s, z23.s, z22.s +add z23.s, z23.s, z22.s +str q23, [x0, #112] +str q30, [x0, #240] +sqrdmulh z30.s, z24.s, z0.s[1] +mul z24.s, z24.s,z1.s[1] +mla z24.s, P0/M, z30.s, z31.s +sub z30.s, z25.s, z24.s +add z25.s, z25.s, z24.s +str q25, [x0, #368] +str q30, [x0, #496] +sqrdmulh z30.s, z26.s, z0.s[3] +mul z26.s, z26.s,z1.s[3] +mla z26.s, P0/M, z30.s, z31.s +sub z30.s, z27.s, z26.s +add z27.s, z27.s, z26.s +str q27, [x0, #880] +str q30, [x0, #1008] +sqrdmulh z30.s, z28.s, z0.s[2] +mul z28.s, z28.s,z1.s[2] +mla z28.s, P0/M, z30.s, z31.s +sub z30.s, z29.s, z28.s +add z29.s, z29.s, z28.s +str q29, [x0, #624] +str q30, [x0, #752] +ldr q30, [x0, #896] +ldr q29, [x0, #768] +ldr q28, [x0, #512] +ldr q27, [x0, #640] +ldr q26, [x0, #384] +ldr q25, [x0, #256] +ldr q24, [x0, #0] +ldr q23, [x0, #128] +sqrdmulh z22.s, z30.s, z2.s[0] +mul z30.s, z30.s,z3.s[0] +mla z30.s, P0/M, z22.s, z31.s +sub z22.s, z26.s, z30.s +add z26.s, z26.s, z30.s +sqrdmulh z30.s, z29.s, z2.s[0] +mul z29.s, z29.s,z3.s[0] +mla z29.s, P0/M, z30.s, z31.s +sub z30.s, z25.s, z29.s +add z25.s, z25.s, z29.s +sqrdmulh z29.s, z28.s, z2.s[0] +mul z28.s, z28.s,z3.s[0] +mla z28.s, P0/M, z29.s, z31.s +sub z29.s, z24.s, z28.s +add z24.s, z24.s, z28.s +sqrdmulh z28.s, z27.s, z2.s[0] +mul z27.s, z27.s,z3.s[0] +mla z27.s, P0/M, z28.s, z31.s +sub z28.s, z23.s, z27.s +add z23.s, z23.s, z27.s +sqrdmulh z27.s, z26.s, z2.s[1] +mul z26.s, z26.s,z3.s[1] +mla z26.s, P0/M, z27.s, z31.s +sub z27.s, z23.s, z26.s +add z23.s, z23.s, z26.s +sqrdmulh z26.s, z25.s, z2.s[1] +mul z25.s, z25.s,z3.s[1] +mla z25.s, P0/M, z26.s, z31.s +sub z26.s, z24.s, z25.s +add z24.s, z24.s, z25.s +sqrdmulh z25.s, z22.s, z2.s[2] 
+mul z22.s, z22.s,z3.s[2] +mla z22.s, P0/M, z25.s, z31.s +sub z25.s, z28.s, z22.s +add z28.s, z28.s, z22.s +sqrdmulh z22.s, z30.s, z2.s[2] +mul z30.s, z30.s,z3.s[2] +mla z30.s, P0/M, z22.s, z31.s +sub z22.s, z29.s, z30.s +add z29.s, z29.s, z30.s +sqrdmulh z30.s, z23.s, z0.s[0] +mul z23.s, z23.s,z1.s[0] +mla z23.s, P0/M, z30.s, z31.s +sub z30.s, z24.s, z23.s +add z24.s, z24.s, z23.s +str q24, [x0, #0] +str q30, [x0, #128] +sqrdmulh z30.s, z27.s, z0.s[1] +mul z27.s, z27.s,z1.s[1] +mla z27.s, P0/M, z30.s, z31.s +sub z30.s, z26.s, z27.s +add z26.s, z26.s, z27.s +str q26, [x0, #256] +str q30, [x0, #384] +sqrdmulh z30.s, z25.s, z0.s[3] +mul z25.s, z25.s,z1.s[3] +mla z25.s, P0/M, z30.s, z31.s +sub z30.s, z22.s, z25.s +add z22.s, z22.s, z25.s +str q22, [x0, #768] +str q30, [x0, #896] +sqrdmulh z30.s, z28.s, z0.s[2] +mul z28.s, z28.s,z1.s[2] +mla z28.s, P0/M, z30.s, z31.s +sub z30.s, z29.s, z28.s +add z29.s, z29.s, z28.s +str q29, [x0, #512] +str q30, [x0, #640] +ldr q30, [x0, #912] +ldr q29, [x0, #784] +ldr q28, [x0, #528] +ldr q22, [x0, #656] +ldr q25, [x0, #400] +ldr q26, [x0, #272] +ldr q27, [x0, #16] +ldr q24, [x0, #144] +sqrdmulh z23.s, z30.s, z2.s[0] +mul z30.s, z30.s,z3.s[0] +mla z30.s, P0/M, z23.s, z31.s +sub z23.s, z25.s, z30.s +add z25.s, z25.s, z30.s +sqrdmulh z30.s, z29.s, z2.s[0] +mul z29.s, z29.s,z3.s[0] +mla z29.s, P0/M, z30.s, z31.s +sub z30.s, z26.s, z29.s +add z26.s, z26.s, z29.s +sqrdmulh z29.s, z28.s, z2.s[0] +mul z28.s, z28.s,z3.s[0] +mla z28.s, P0/M, z29.s, z31.s +sub z29.s, z27.s, z28.s +add z27.s, z27.s, z28.s +sqrdmulh z28.s, z22.s, z2.s[0] +mul z22.s, z22.s,z3.s[0] +mla z22.s, P0/M, z28.s, z31.s +sub z28.s, z24.s, z22.s +add z24.s, z24.s, z22.s +sqrdmulh z22.s, z25.s, z2.s[1] +mul z25.s, z25.s,z3.s[1] +mla z25.s, P0/M, z22.s, z31.s +sub z22.s, z24.s, z25.s +add z24.s, z24.s, z25.s +sqrdmulh z25.s, z26.s, z2.s[1] +mul z26.s, z26.s,z3.s[1] +mla z26.s, P0/M, z25.s, z31.s +sub z25.s, z27.s, z26.s +add z27.s, z27.s, z26.s +sqrdmulh z26.s, z23.s, 
z2.s[2] +mul z23.s, z23.s,z3.s[2] +mla z23.s, P0/M, z26.s, z31.s +sub z26.s, z28.s, z23.s +add z28.s, z28.s, z23.s +sqrdmulh z23.s, z30.s, z2.s[2] +mul z30.s, z30.s,z3.s[2] +mla z30.s, P0/M, z23.s, z31.s +sub z23.s, z29.s, z30.s +add z29.s, z29.s, z30.s +sqrdmulh z30.s, z24.s, z0.s[0] +mul z24.s, z24.s,z1.s[0] +mla z24.s, P0/M, z30.s, z31.s +sub z30.s, z27.s, z24.s +add z27.s, z27.s, z24.s +str q27, [x0, #16] +str q30, [x0, #144] +sqrdmulh z30.s, z22.s, z0.s[1] +mul z22.s, z22.s,z1.s[1] +mla z22.s, P0/M, z30.s, z31.s +sub z30.s, z25.s, z22.s +add z25.s, z25.s, z22.s +str q25, [x0, #272] +str q30, [x0, #400] +sqrdmulh z30.s, z26.s, z0.s[3] +mul z26.s, z26.s,z1.s[3] +mla z26.s, P0/M, z30.s, z31.s +sub z30.s, z23.s, z26.s +add z23.s, z23.s, z26.s +str q23, [x0, #784] +str q30, [x0, #912] +sqrdmulh z30.s, z28.s, z0.s[2] +mul z28.s, z28.s,z1.s[2] +mla z28.s, P0/M, z30.s, z31.s +sub z30.s, z29.s, z28.s +add z29.s, z29.s, z28.s +str q29, [x0, #528] +str q30, [x0, #656] +ldr q30, [x0, #928] +ldr q29, [x0, #800] +ldr q28, [x0, #544] +ldr q23, [x0, #672] +ldr q26, [x0, #416] +ldr q25, [x0, #288] +ldr q22, [x0, #32] +ldr q27, [x0, #160] +sqrdmulh z24.s, z30.s, z2.s[0] +mul z30.s, z30.s,z3.s[0] +mla z30.s, P0/M, z24.s, z31.s +sub z24.s, z26.s, z30.s +add z26.s, z26.s, z30.s +sqrdmulh z30.s, z29.s, z2.s[0] +mul z29.s, z29.s,z3.s[0] +mla z29.s, P0/M, z30.s, z31.s +sub z30.s, z25.s, z29.s +add z25.s, z25.s, z29.s +sqrdmulh z29.s, z28.s, z2.s[0] +mul z28.s, z28.s,z3.s[0] +mla z28.s, P0/M, z29.s, z31.s +sub z29.s, z22.s, z28.s +add z22.s, z22.s, z28.s +sqrdmulh z28.s, z23.s, z2.s[0] +mul z23.s, z23.s,z3.s[0] +mla z23.s, P0/M, z28.s, z31.s +sub z28.s, z27.s, z23.s +add z27.s, z27.s, z23.s +sqrdmulh z23.s, z26.s, z2.s[1] +mul z26.s, z26.s,z3.s[1] +mla z26.s, P0/M, z23.s, z31.s +sub z23.s, z27.s, z26.s +add z27.s, z27.s, z26.s +sqrdmulh z26.s, z25.s, z2.s[1] +mul z25.s, z25.s,z3.s[1] +mla z25.s, P0/M, z26.s, z31.s +sub z26.s, z22.s, z25.s +add z22.s, z22.s, z25.s +sqrdmulh z25.s, 
z24.s, z2.s[2] +mul z24.s, z24.s,z3.s[2] +mla z24.s, P0/M, z25.s, z31.s +sub z25.s, z28.s, z24.s +add z28.s, z28.s, z24.s +sqrdmulh z24.s, z30.s, z2.s[2] +mul z30.s, z30.s,z3.s[2] +mla z30.s, P0/M, z24.s, z31.s +sub z24.s, z29.s, z30.s +add z29.s, z29.s, z30.s +sqrdmulh z30.s, z27.s, z0.s[0] +mul z27.s, z27.s,z1.s[0] +mla z27.s, P0/M, z30.s, z31.s +sub z30.s, z22.s, z27.s +add z22.s, z22.s, z27.s +str q22, [x0, #32] +str q30, [x0, #160] +sqrdmulh z30.s, z23.s, z0.s[1] +mul z23.s, z23.s,z1.s[1] +mla z23.s, P0/M, z30.s, z31.s +sub z30.s, z26.s, z23.s +add z26.s, z26.s, z23.s +str q26, [x0, #288] +str q30, [x0, #416] +sqrdmulh z30.s, z25.s, z0.s[3] +mul z25.s, z25.s,z1.s[3] +mla z25.s, P0/M, z30.s, z31.s +sub z30.s, z24.s, z25.s +add z24.s, z24.s, z25.s +str q24, [x0, #800] +str q30, [x0, #928] +sqrdmulh z30.s, z28.s, z0.s[2] +mul z28.s, z28.s,z1.s[2] +mla z28.s, P0/M, z30.s, z31.s +sub z30.s, z29.s, z28.s +add z29.s, z29.s, z28.s +str q29, [x0, #544] +str q30, [x0, #672] +ldr q30, [x0, #944] +ldr q29, [x0, #816] +ldr q28, [x0, #560] +ldr q24, [x0, #688] +ldr q25, [x0, #432] +ldr q26, [x0, #304] +ldr q23, [x0, #48] +ldr q22, [x0, #176] +sqrdmulh z27.s, z30.s, z2.s[0] +mul z30.s, z30.s,z3.s[0] +mla z30.s, P0/M, z27.s, z31.s +sub z27.s, z25.s, z30.s +add z25.s, z25.s, z30.s +sqrdmulh z30.s, z29.s, z2.s[0] +mul z29.s, z29.s,z3.s[0] +mla z29.s, P0/M, z30.s, z31.s +sub z30.s, z26.s, z29.s +add z26.s, z26.s, z29.s +sqrdmulh z29.s, z28.s, z2.s[0] +mul z28.s, z28.s,z3.s[0] +mla z28.s, P0/M, z29.s, z31.s +sub z29.s, z23.s, z28.s +add z23.s, z23.s, z28.s +sqrdmulh z28.s, z24.s, z2.s[0] +mul z24.s, z24.s,z3.s[0] +mla z24.s, P0/M, z28.s, z31.s +sub z28.s, z22.s, z24.s +add z22.s, z22.s, z24.s +sqrdmulh z24.s, z25.s, z2.s[1] +mul z25.s, z25.s,z3.s[1] +mla z25.s, P0/M, z24.s, z31.s +sub z24.s, z22.s, z25.s +add z22.s, z22.s, z25.s +sqrdmulh z25.s, z26.s, z2.s[1] +mul z26.s, z26.s,z3.s[1] +mla z26.s, P0/M, z25.s, z31.s +sub z25.s, z23.s, z26.s +add z23.s, z23.s, z26.s +sqrdmulh 
z26.s, z27.s, z2.s[2] +mul z27.s, z27.s,z3.s[2] +mla z27.s, P0/M, z26.s, z31.s +sub z26.s, z28.s, z27.s +add z28.s, z28.s, z27.s +sqrdmulh z27.s, z30.s, z2.s[2] +mul z30.s, z30.s,z3.s[2] +mla z30.s, P0/M, z27.s, z31.s +sub z27.s, z29.s, z30.s +add z29.s, z29.s, z30.s +sqrdmulh z30.s, z22.s, z0.s[0] +mul z22.s, z22.s,z1.s[0] +mla z22.s, P0/M, z30.s, z31.s +sub z30.s, z23.s, z22.s +add z23.s, z23.s, z22.s +str q23, [x0, #48] +str q30, [x0, #176] +sqrdmulh z30.s, z24.s, z0.s[1] +mul z24.s, z24.s,z1.s[1] +mla z24.s, P0/M, z30.s, z31.s +sub z30.s, z25.s, z24.s +add z25.s, z25.s, z24.s +str q25, [x0, #304] +str q30, [x0, #432] +sqrdmulh z30.s, z26.s, z0.s[3] +mul z26.s, z26.s,z1.s[3] +mla z26.s, P0/M, z30.s, z31.s +sub z30.s, z27.s, z26.s +add z27.s, z27.s, z26.s +str q27, [x0, #816] +str q30, [x0, #944] +sqrdmulh z30.s, z28.s, z0.s[2] +mul z28.s, z28.s,z1.s[2] +mla z28.s, P0/M, z30.s, z31.s +sub z30.s, z29.s, z28.s +add z29.s, z29.s, z28.s +str q29, [x0, #560] +str q30, [x0, #688] +ldr q4, [x17, #+64] +ldr q5, [x17, #+80] +ldr q6, [x17, #+96] +ldr q7, [x17, #+112] +ldr q8, [x0, #112] +ldr q9, [x0, #96] +ldr q10, [x0, #64] +ldr q11, [x0, #80] +ldr q12, [x0, #48] +ldr q13, [x0, #32] +ldr q14, [x0, #0] +ldr q15, [x0, #16] +sqrdmulh z16.s, z8.s, z5.s[0] +mul z8.s, z8.s,z4.s[0] +mla z8.s, P0/M, z16.s, z31.s +sub z16.s, z12.s, z8.s +add z12.s, z12.s, z8.s +sqrdmulh z8.s, z9.s, z5.s[0] +mul z9.s, z9.s,z4.s[0] +mla z9.s, P0/M, z8.s, z31.s +sub z8.s, z13.s, z9.s +add z13.s, z13.s, z9.s +sqrdmulh z9.s, z10.s, z5.s[0] +mul z10.s, z10.s,z4.s[0] +mla z10.s, P0/M, z9.s, z31.s +sub z9.s, z14.s, z10.s +add z14.s, z14.s, z10.s +sqrdmulh z10.s, z11.s, z5.s[0] +mul z11.s, z11.s,z4.s[0] +mla z11.s, P0/M, z10.s, z31.s +sub z10.s, z15.s, z11.s +add z15.s, z15.s, z11.s +sqrdmulh z11.s, z12.s, z5.s[1] +mul z12.s, z12.s,z4.s[1] +mla z12.s, P0/M, z11.s, z31.s +sub z11.s, z15.s, z12.s +add z15.s, z15.s, z12.s +ldr q3, [x17, #+128] +ldr q2, [x17, #+144] +ldr q1, [x17, #+160] +ldr q0, [x17, #+176] 
+sqrdmulh z12.s, z13.s, z5.s[1] +mul z13.s, z13.s,z4.s[1] +mla z13.s, P0/M, z12.s, z31.s +sub z12.s, z14.s, z13.s +add z14.s, z14.s, z13.s +sqrdmulh z13.s, z16.s, z5.s[2] +mul z16.s, z16.s,z4.s[2] +mla z16.s, P0/M, z13.s, z31.s +sub z13.s, z10.s, z16.s +add z10.s, z10.s, z16.s +sqrdmulh z16.s, z8.s, z5.s[2] +mul z8.s, z8.s,z4.s[2] +mla z8.s, P0/M, z16.s, z31.s +sub z16.s, z9.s, z8.s +add z9.s, z9.s, z8.s +sqrdmulh z8.s, z15.s, z7.s[0] +mul z15.s, z15.s,z6.s[0] +mla z15.s, P0/M, z8.s, z31.s +sub z8.s, z14.s, z15.s +add z14.s, z14.s, z15.s +str q14, [x0, #0] +str q8, [x0, #16] +sqrdmulh z8.s, z11.s, z7.s[1] +mul z11.s, z11.s,z6.s[1] +mla z11.s, P0/M, z8.s, z31.s +sub z8.s, z12.s, z11.s +add z12.s, z12.s, z11.s +str q12, [x0, #32] +str q8, [x0, #48] +sqrdmulh z8.s, z13.s, z7.s[3] +mul z13.s, z13.s,z6.s[3] +mla z13.s, P0/M, z8.s, z31.s +sub z8.s, z16.s, z13.s +add z16.s, z16.s, z13.s +str q16, [x0, #96] +str q8, [x0, #112] +sqrdmulh z8.s, z10.s, z7.s[2] +mul z10.s, z10.s,z6.s[2] +mla z10.s, P0/M, z8.s, z31.s +sub z8.s, z9.s, z10.s +add z9.s, z9.s, z10.s +str q9, [x0, #64] +str q8, [x0, #80] +ldr q8, [x0, #240] +ldr q9, [x0, #224] +ldr q10, [x0, #192] +ldr q16, [x0, #208] +ldr q13, [x0, #176] +ldr q12, [x0, #160] +ldr q11, [x0, #128] +ldr q14, [x0, #144] +sqrdmulh z15.s, z8.s, z2.s[0] +mul z8.s, z8.s,z3.s[0] +mla z8.s, P0/M, z15.s, z31.s +sub z15.s, z13.s, z8.s +add z13.s, z13.s, z8.s +sqrdmulh z8.s, z9.s, z2.s[0] +mul z9.s, z9.s,z3.s[0] +mla z9.s, P0/M, z8.s, z31.s +sub z8.s, z12.s, z9.s +add z12.s, z12.s, z9.s +sqrdmulh z9.s, z10.s, z2.s[0] +mul z10.s, z10.s,z3.s[0] +mla z10.s, P0/M, z9.s, z31.s +sub z9.s, z11.s, z10.s +add z11.s, z11.s, z10.s +sqrdmulh z10.s, z16.s, z2.s[0] +mul z16.s, z16.s,z3.s[0] +mla z16.s, P0/M, z10.s, z31.s +sub z10.s, z14.s, z16.s +add z14.s, z14.s, z16.s +sqrdmulh z16.s, z13.s, z2.s[1] +mul z13.s, z13.s,z3.s[1] +mla z13.s, P0/M, z16.s, z31.s +sub z16.s, z14.s, z13.s +add z14.s, z14.s, z13.s +ldr q7, [x17, #+192] +ldr q6, [x17, #+208] +ldr q5, 
[x17, #+224] +ldr q4, [x17, #+240] +sqrdmulh z13.s, z12.s, z2.s[1] +mul z12.s, z12.s,z3.s[1] +mla z12.s, P0/M, z13.s, z31.s +sub z13.s, z11.s, z12.s +add z11.s, z11.s, z12.s +sqrdmulh z12.s, z15.s, z2.s[2] +mul z15.s, z15.s,z3.s[2] +mla z15.s, P0/M, z12.s, z31.s +sub z12.s, z10.s, z15.s +add z10.s, z10.s, z15.s +sqrdmulh z15.s, z8.s, z2.s[2] +mul z8.s, z8.s,z3.s[2] +mla z8.s, P0/M, z15.s, z31.s +sub z15.s, z9.s, z8.s +add z9.s, z9.s, z8.s +sqrdmulh z8.s, z14.s, z0.s[0] +mul z14.s, z14.s,z1.s[0] +mla z14.s, P0/M, z8.s, z31.s +sub z8.s, z11.s, z14.s +add z11.s, z11.s, z14.s +str q11, [x0, #128] +str q8, [x0, #144] +sqrdmulh z8.s, z16.s, z0.s[1] +mul z16.s, z16.s,z1.s[1] +mla z16.s, P0/M, z8.s, z31.s +sub z8.s, z13.s, z16.s +add z13.s, z13.s, z16.s +str q13, [x0, #160] +str q8, [x0, #176] +sqrdmulh z8.s, z12.s, z0.s[3] +mul z12.s, z12.s,z1.s[3] +mla z12.s, P0/M, z8.s, z31.s +sub z8.s, z15.s, z12.s +add z15.s, z15.s, z12.s +str q15, [x0, #224] +str q8, [x0, #240] +sqrdmulh z8.s, z10.s, z0.s[2] +mul z10.s, z10.s,z1.s[2] +mla z10.s, P0/M, z8.s, z31.s +sub z8.s, z9.s, z10.s +add z9.s, z9.s, z10.s +str q9, [x0, #192] +str q8, [x0, #208] +ldr q8, [x0, #368] +ldr q9, [x0, #352] +ldr q10, [x0, #320] +ldr q15, [x0, #336] +ldr q12, [x0, #304] +ldr q13, [x0, #288] +ldr q16, [x0, #256] +ldr q11, [x0, #272] +sqrdmulh z14.s, z8.s, z6.s[0] +mul z8.s, z8.s,z7.s[0] +mla z8.s, P0/M, z14.s, z31.s +sub z14.s, z12.s, z8.s +add z12.s, z12.s, z8.s +sqrdmulh z8.s, z9.s, z6.s[0] +mul z9.s, z9.s,z7.s[0] +mla z9.s, P0/M, z8.s, z31.s +sub z8.s, z13.s, z9.s +add z13.s, z13.s, z9.s +sqrdmulh z9.s, z10.s, z6.s[0] +mul z10.s, z10.s,z7.s[0] +mla z10.s, P0/M, z9.s, z31.s +sub z9.s, z16.s, z10.s +add z16.s, z16.s, z10.s +sqrdmulh z10.s, z15.s, z6.s[0] +mul z15.s, z15.s,z7.s[0] +mla z15.s, P0/M, z10.s, z31.s +sub z10.s, z11.s, z15.s +add z11.s, z11.s, z15.s +sqrdmulh z15.s, z12.s, z6.s[1] +mul z12.s, z12.s,z7.s[1] +mla z12.s, P0/M, z15.s, z31.s +sub z15.s, z11.s, z12.s +add z11.s, z11.s, z12.s +ldr q0, 
[x17, #+256] +ldr q1, [x17, #+272] +ldr q2, [x17, #+288] +ldr q3, [x17, #+304] +sqrdmulh z12.s, z13.s, z6.s[1] +mul z13.s, z13.s,z7.s[1] +mla z13.s, P0/M, z12.s, z31.s +sub z12.s, z16.s, z13.s +add z16.s, z16.s, z13.s +sqrdmulh z13.s, z14.s, z6.s[2] +mul z14.s, z14.s,z7.s[2] +mla z14.s, P0/M, z13.s, z31.s +sub z13.s, z10.s, z14.s +add z10.s, z10.s, z14.s +sqrdmulh z14.s, z8.s, z6.s[2] +mul z8.s, z8.s,z7.s[2] +mla z8.s, P0/M, z14.s, z31.s +sub z14.s, z9.s, z8.s +add z9.s, z9.s, z8.s +sqrdmulh z8.s, z11.s, z4.s[0] +mul z11.s, z11.s,z5.s[0] +mla z11.s, P0/M, z8.s, z31.s +sub z8.s, z16.s, z11.s +add z16.s, z16.s, z11.s +str q16, [x0, #256] +str q8, [x0, #272] +sqrdmulh z8.s, z15.s, z4.s[1] +mul z15.s, z15.s,z5.s[1] +mla z15.s, P0/M, z8.s, z31.s +sub z8.s, z12.s, z15.s +add z12.s, z12.s, z15.s +str q12, [x0, #288] +str q8, [x0, #304] +sqrdmulh z8.s, z13.s, z4.s[3] +mul z13.s, z13.s,z5.s[3] +mla z13.s, P0/M, z8.s, z31.s +sub z8.s, z14.s, z13.s +add z14.s, z14.s, z13.s +str q14, [x0, #352] +str q8, [x0, #368] +sqrdmulh z8.s, z10.s, z4.s[2] +mul z10.s, z10.s,z5.s[2] +mla z10.s, P0/M, z8.s, z31.s +sub z8.s, z9.s, z10.s +add z9.s, z9.s, z10.s +str q9, [x0, #320] +str q8, [x0, #336] +ldr q8, [x0, #496] +ldr q9, [x0, #480] +ldr q10, [x0, #448] +ldr q14, [x0, #464] +ldr q13, [x0, #432] +ldr q12, [x0, #416] +ldr q15, [x0, #384] +ldr q16, [x0, #400] +sqrdmulh z11.s, z8.s, z1.s[0] +mul z8.s, z8.s,z0.s[0] +mla z8.s, P0/M, z11.s, z31.s +sub z11.s, z13.s, z8.s +add z13.s, z13.s, z8.s +sqrdmulh z8.s, z9.s, z1.s[0] +mul z9.s, z9.s,z0.s[0] +mla z9.s, P0/M, z8.s, z31.s +sub z8.s, z12.s, z9.s +add z12.s, z12.s, z9.s +sqrdmulh z9.s, z10.s, z1.s[0] +mul z10.s, z10.s,z0.s[0] +mla z10.s, P0/M, z9.s, z31.s +sub z9.s, z15.s, z10.s +add z15.s, z15.s, z10.s +sqrdmulh z10.s, z14.s, z1.s[0] +mul z14.s, z14.s,z0.s[0] +mla z14.s, P0/M, z10.s, z31.s +sub z10.s, z16.s, z14.s +add z16.s, z16.s, z14.s +sqrdmulh z14.s, z13.s, z1.s[1] +mul z13.s, z13.s,z0.s[1] +mla z13.s, P0/M, z14.s, z31.s +sub z14.s, 
z16.s, z13.s +add z16.s, z16.s, z13.s +ldr q4, [x17, #+320] +ldr q5, [x17, #+336] +ldr q6, [x17, #+352] +ldr q7, [x17, #+368] +sqrdmulh z13.s, z12.s, z1.s[1] +mul z12.s, z12.s,z0.s[1] +mla z12.s, P0/M, z13.s, z31.s +sub z13.s, z15.s, z12.s +add z15.s, z15.s, z12.s +sqrdmulh z12.s, z11.s, z1.s[2] +mul z11.s, z11.s,z0.s[2] +mla z11.s, P0/M, z12.s, z31.s +sub z12.s, z10.s, z11.s +add z10.s, z10.s, z11.s +sqrdmulh z11.s, z8.s, z1.s[2] +mul z8.s, z8.s,z0.s[2] +mla z8.s, P0/M, z11.s, z31.s +sub z11.s, z9.s, z8.s +add z9.s, z9.s, z8.s +sqrdmulh z8.s, z16.s, z3.s[0] +mul z16.s, z16.s,z2.s[0] +mla z16.s, P0/M, z8.s, z31.s +sub z8.s, z15.s, z16.s +add z15.s, z15.s, z16.s +str q15, [x0, #384] +str q8, [x0, #400] +sqrdmulh z8.s, z14.s, z3.s[1] +mul z14.s, z14.s,z2.s[1] +mla z14.s, P0/M, z8.s, z31.s +sub z8.s, z13.s, z14.s +add z13.s, z13.s, z14.s +str q13, [x0, #416] +str q8, [x0, #432] +sqrdmulh z8.s, z12.s, z3.s[3] +mul z12.s, z12.s,z2.s[3] +mla z12.s, P0/M, z8.s, z31.s +sub z8.s, z11.s, z12.s +add z11.s, z11.s, z12.s +str q11, [x0, #480] +str q8, [x0, #496] +sqrdmulh z8.s, z10.s, z3.s[2] +mul z10.s, z10.s,z2.s[2] +mla z10.s, P0/M, z8.s, z31.s +sub z8.s, z9.s, z10.s +add z9.s, z9.s, z10.s +str q9, [x0, #448] +str q8, [x0, #464] +ldr q8, [x0, #624] +ldr q9, [x0, #608] +ldr q10, [x0, #576] +ldr q11, [x0, #592] +ldr q12, [x0, #560] +ldr q13, [x0, #544] +ldr q14, [x0, #512] +ldr q15, [x0, #528] +sqrdmulh z16.s, z8.s, z5.s[0] +mul z8.s, z8.s,z4.s[0] +mla z8.s, P0/M, z16.s, z31.s +sub z16.s, z12.s, z8.s +add z12.s, z12.s, z8.s +sqrdmulh z8.s, z9.s, z5.s[0] +mul z9.s, z9.s,z4.s[0] +mla z9.s, P0/M, z8.s, z31.s +sub z8.s, z13.s, z9.s +add z13.s, z13.s, z9.s +sqrdmulh z9.s, z10.s, z5.s[0] +mul z10.s, z10.s,z4.s[0] +mla z10.s, P0/M, z9.s, z31.s +sub z9.s, z14.s, z10.s +add z14.s, z14.s, z10.s +sqrdmulh z10.s, z11.s, z5.s[0] +mul z11.s, z11.s,z4.s[0] +mla z11.s, P0/M, z10.s, z31.s +sub z10.s, z15.s, z11.s +add z15.s, z15.s, z11.s +sqrdmulh z11.s, z12.s, z5.s[1] +mul z12.s, z12.s,z4.s[1] 
+mla z12.s, P0/M, z11.s, z31.s +sub z11.s, z15.s, z12.s +add z15.s, z15.s, z12.s +ldr q3, [x17, #+384] +ldr q2, [x17, #+400] +ldr q1, [x17, #+416] +ldr q0, [x17, #+432] +sqrdmulh z12.s, z13.s, z5.s[1] +mul z13.s, z13.s,z4.s[1] +mla z13.s, P0/M, z12.s, z31.s +sub z12.s, z14.s, z13.s +add z14.s, z14.s, z13.s +sqrdmulh z13.s, z16.s, z5.s[2] +mul z16.s, z16.s,z4.s[2] +mla z16.s, P0/M, z13.s, z31.s +sub z13.s, z10.s, z16.s +add z10.s, z10.s, z16.s +sqrdmulh z16.s, z8.s, z5.s[2] +mul z8.s, z8.s,z4.s[2] +mla z8.s, P0/M, z16.s, z31.s +sub z16.s, z9.s, z8.s +add z9.s, z9.s, z8.s +sqrdmulh z8.s, z15.s, z7.s[0] +mul z15.s, z15.s,z6.s[0] +mla z15.s, P0/M, z8.s, z31.s +sub z8.s, z14.s, z15.s +add z14.s, z14.s, z15.s +str q14, [x0, #512] +str q8, [x0, #528] +sqrdmulh z8.s, z11.s, z7.s[1] +mul z11.s, z11.s,z6.s[1] +mla z11.s, P0/M, z8.s, z31.s +sub z8.s, z12.s, z11.s +add z12.s, z12.s, z11.s +str q12, [x0, #544] +str q8, [x0, #560] +sqrdmulh z8.s, z13.s, z7.s[3] +mul z13.s, z13.s,z6.s[3] +mla z13.s, P0/M, z8.s, z31.s +sub z8.s, z16.s, z13.s +add z16.s, z16.s, z13.s +str q16, [x0, #608] +str q8, [x0, #624] +sqrdmulh z8.s, z10.s, z7.s[2] +mul z10.s, z10.s,z6.s[2] +mla z10.s, P0/M, z8.s, z31.s +sub z8.s, z9.s, z10.s +add z9.s, z9.s, z10.s +str q9, [x0, #576] +str q8, [x0, #592] +ldr q8, [x0, #752] +ldr q9, [x0, #736] +ldr q10, [x0, #704] +ldr q16, [x0, #720] +ldr q13, [x0, #688] +ldr q12, [x0, #672] +ldr q11, [x0, #640] +ldr q14, [x0, #656] +sqrdmulh z15.s, z8.s, z2.s[0] +mul z8.s, z8.s,z3.s[0] +mla z8.s, P0/M, z15.s, z31.s +sub z15.s, z13.s, z8.s +add z13.s, z13.s, z8.s +sqrdmulh z8.s, z9.s, z2.s[0] +mul z9.s, z9.s,z3.s[0] +mla z9.s, P0/M, z8.s, z31.s +sub z8.s, z12.s, z9.s +add z12.s, z12.s, z9.s +sqrdmulh z9.s, z10.s, z2.s[0] +mul z10.s, z10.s,z3.s[0] +mla z10.s, P0/M, z9.s, z31.s +sub z9.s, z11.s, z10.s +add z11.s, z11.s, z10.s +sqrdmulh z10.s, z16.s, z2.s[0] +mul z16.s, z16.s,z3.s[0] +mla z16.s, P0/M, z10.s, z31.s +sub z10.s, z14.s, z16.s +add z14.s, z14.s, z16.s +sqrdmulh 
z16.s, z13.s, z2.s[1] +mul z13.s, z13.s,z3.s[1] +mla z13.s, P0/M, z16.s, z31.s +sub z16.s, z14.s, z13.s +add z14.s, z14.s, z13.s +ldr q7, [x17, #+448] +ldr q6, [x17, #+464] +ldr q5, [x17, #+480] +ldr q4, [x17, #+496] +sqrdmulh z13.s, z12.s, z2.s[1] +mul z12.s, z12.s,z3.s[1] +mla z12.s, P0/M, z13.s, z31.s +sub z13.s, z11.s, z12.s +add z11.s, z11.s, z12.s +sqrdmulh z12.s, z15.s, z2.s[2] +mul z15.s, z15.s,z3.s[2] +mla z15.s, P0/M, z12.s, z31.s +sub z12.s, z10.s, z15.s +add z10.s, z10.s, z15.s +sqrdmulh z15.s, z8.s, z2.s[2] +mul z8.s, z8.s,z3.s[2] +mla z8.s, P0/M, z15.s, z31.s +sub z15.s, z9.s, z8.s +add z9.s, z9.s, z8.s +sqrdmulh z8.s, z14.s, z0.s[0] +mul z14.s, z14.s,z1.s[0] +mla z14.s, P0/M, z8.s, z31.s +sub z8.s, z11.s, z14.s +add z11.s, z11.s, z14.s +str q11, [x0, #640] +str q8, [x0, #656] +sqrdmulh z8.s, z16.s, z0.s[1] +mul z16.s, z16.s,z1.s[1] +mla z16.s, P0/M, z8.s, z31.s +sub z8.s, z13.s, z16.s +add z13.s, z13.s, z16.s +str q13, [x0, #672] +str q8, [x0, #688] +sqrdmulh z8.s, z12.s, z0.s[3] +mul z12.s, z12.s,z1.s[3] +mla z12.s, P0/M, z8.s, z31.s +sub z8.s, z15.s, z12.s +add z15.s, z15.s, z12.s +str q15, [x0, #736] +str q8, [x0, #752] +sqrdmulh z8.s, z10.s, z0.s[2] +mul z10.s, z10.s,z1.s[2] +mla z10.s, P0/M, z8.s, z31.s +sub z8.s, z9.s, z10.s +add z9.s, z9.s, z10.s +str q9, [x0, #704] +str q8, [x0, #720] +ldr q8, [x0, #880] +ldr q9, [x0, #864] +ldr q10, [x0, #832] +ldr q15, [x0, #848] +ldr q12, [x0, #816] +ldr q13, [x0, #800] +ldr q16, [x0, #768] +ldr q11, [x0, #784] +sqrdmulh z14.s, z8.s, z6.s[0] +mul z8.s, z8.s,z7.s[0] +mla z8.s, P0/M, z14.s, z31.s +sub z14.s, z12.s, z8.s +add z12.s, z12.s, z8.s +sqrdmulh z8.s, z9.s, z6.s[0] +mul z9.s, z9.s,z7.s[0] +mla z9.s, P0/M, z8.s, z31.s +sub z8.s, z13.s, z9.s +add z13.s, z13.s, z9.s +sqrdmulh z9.s, z10.s, z6.s[0] +mul z10.s, z10.s,z7.s[0] +mla z10.s, P0/M, z9.s, z31.s +sub z9.s, z16.s, z10.s +add z16.s, z16.s, z10.s +sqrdmulh z10.s, z15.s, z6.s[0] +mul z15.s, z15.s,z7.s[0] +mla z15.s, P0/M, z10.s, z31.s +sub z10.s, 
z11.s, z15.s +add z11.s, z11.s, z15.s +sqrdmulh z15.s, z12.s, z6.s[1] +mul z12.s, z12.s,z7.s[1] +mla z12.s, P0/M, z15.s, z31.s +sub z15.s, z11.s, z12.s +add z11.s, z11.s, z12.s +ldr q0, [x17, #+512] +ldr q1, [x17, #+528] +ldr q2, [x17, #+544] +ldr q3, [x17, #+560] +sqrdmulh z12.s, z13.s, z6.s[1] +mul z13.s, z13.s,z7.s[1] +mla z13.s, P0/M, z12.s, z31.s +sub z12.s, z16.s, z13.s +add z16.s, z16.s, z13.s +sqrdmulh z13.s, z14.s, z6.s[2] +mul z14.s, z14.s,z7.s[2] +mla z14.s, P0/M, z13.s, z31.s +sub z13.s, z10.s, z14.s +add z10.s, z10.s, z14.s +sqrdmulh z14.s, z8.s, z6.s[2] +mul z8.s, z8.s,z7.s[2] +mla z8.s, P0/M, z14.s, z31.s +sub z14.s, z9.s, z8.s +add z9.s, z9.s, z8.s +sqrdmulh z8.s, z11.s, z4.s[0] +mul z11.s, z11.s,z5.s[0] +mla z11.s, P0/M, z8.s, z31.s +sub z8.s, z16.s, z11.s +add z16.s, z16.s, z11.s +str q16, [x0, #768] +str q8, [x0, #784] +sqrdmulh z8.s, z15.s, z4.s[1] +mul z15.s, z15.s,z5.s[1] +mla z15.s, P0/M, z8.s, z31.s +sub z8.s, z12.s, z15.s +add z12.s, z12.s, z15.s +str q12, [x0, #800] +str q8, [x0, #816] +sqrdmulh z8.s, z13.s, z4.s[3] +mul z13.s, z13.s,z5.s[3] +mla z13.s, P0/M, z8.s, z31.s +sub z8.s, z14.s, z13.s +add z14.s, z14.s, z13.s +str q14, [x0, #864] +str q8, [x0, #880] +sqrdmulh z8.s, z10.s, z4.s[2] +mul z10.s, z10.s,z5.s[2] +mla z10.s, P0/M, z8.s, z31.s +sub z8.s, z9.s, z10.s +add z9.s, z9.s, z10.s +str q9, [x0, #832] +str q8, [x0, #848] +ldr q8, [x0, #1008] +ldr q9, [x0, #992] +ldr q10, [x0, #960] +ldr q14, [x0, #976] +ldr q13, [x0, #944] +ldr q12, [x0, #928] +ldr q15, [x0, #896] +ldr q16, [x0, #912] +sqrdmulh z11.s, z8.s, z1.s[0] +mul z8.s, z8.s,z0.s[0] +mla z8.s, P0/M, z11.s, z31.s +sub z11.s, z13.s, z8.s +add z13.s, z13.s, z8.s +sqrdmulh z8.s, z9.s, z1.s[0] +mul z9.s, z9.s,z0.s[0] +mla z9.s, P0/M, z8.s, z31.s +sub z8.s, z12.s, z9.s +add z12.s, z12.s, z9.s +sqrdmulh z9.s, z10.s, z1.s[0] +mul z10.s, z10.s,z0.s[0] +mla z10.s, P0/M, z9.s, z31.s +sub z9.s, z15.s, z10.s +add z15.s, z15.s, z10.s +sqrdmulh z10.s, z14.s, z1.s[0] +mul z14.s, 
z14.s,z0.s[0] +mla z14.s, P0/M, z10.s, z31.s +sub z10.s, z16.s, z14.s +add z16.s, z16.s, z14.s +sqrdmulh z14.s, z13.s, z1.s[1] +mul z13.s, z13.s,z0.s[1] +mla z13.s, P0/M, z14.s, z31.s +sub z14.s, z16.s, z13.s +add z16.s, z16.s, z13.s +sqrdmulh z13.s, z12.s, z1.s[1] +mul z12.s, z12.s,z0.s[1] +mla z12.s, P0/M, z13.s, z31.s +sub z13.s, z15.s, z12.s +add z15.s, z15.s, z12.s +sqrdmulh z12.s, z11.s, z1.s[2] +mul z11.s, z11.s,z0.s[2] +mla z11.s, P0/M, z12.s, z31.s +sub z12.s, z10.s, z11.s +add z10.s, z10.s, z11.s +sqrdmulh z11.s, z8.s, z1.s[2] +mul z8.s, z8.s,z0.s[2] +mla z8.s, P0/M, z11.s, z31.s +sub z11.s, z9.s, z8.s +add z9.s, z9.s, z8.s +sqrdmulh z8.s, z16.s, z3.s[0] +mul z16.s, z16.s,z2.s[0] +mla z16.s, P0/M, z8.s, z31.s +sub z8.s, z15.s, z16.s +add z15.s, z15.s, z16.s +str q15, [x0, #896] +str q8, [x0, #912] +sqrdmulh z8.s, z14.s, z3.s[1] +mul z14.s, z14.s,z2.s[1] +mla z14.s, P0/M, z8.s, z31.s +sub z8.s, z13.s, z14.s +add z13.s, z13.s, z14.s +str q13, [x0, #928] +str q8, [x0, #944] +sqrdmulh z8.s, z12.s, z3.s[3] +mul z12.s, z12.s,z2.s[3] +mla z12.s, P0/M, z8.s, z31.s +sub z8.s, z11.s, z12.s +add z11.s, z11.s, z12.s +str q11, [x0, #992] +str q8, [x0, #1008] +sqrdmulh z8.s, z10.s, z3.s[2] +mul z10.s, z10.s,z2.s[2] +mla z10.s, P0/M, z8.s, z31.s +sub z8.s, z9.s, z10.s +add z9.s, z9.s, z10.s +str q9, [x0, #960] +str q8, [x0, #976] +// Restore SVE2 vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1445 +// Instruction count: 1441 \ No newline at end of file diff --git a/asm/auto/ntt_sve2/ntt_u64_incomplete_72057594067788289_60277548896192635_var_3_3_0.s b/asm/auto/ntt_sve2/ntt_u64_incomplete_72057594067788289_60277548896192635_var_3_3_0.s new file mode 100644 
index 0000000..aa5cd18 --- /dev/null +++ b/asm/auto/ntt_sve2/ntt_u64_incomplete_72057594067788289_60277548896192635_var_3_3_0.s @@ -0,0 +1,2727 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +modulus: +.dword -72057594067788289 +.dword -72057594067788289 +.dword -72057594067788289 +.dword -72057594067788289 +.align 6 +roots_merged: +.dword 25792053496987399 // Layer 0, block 0 +.dword 0 // Layer None, block None +.dword 3301382846246308405 // Layer 0, block 0 +.dword 0 // Layer None, block None +.dword 36678763444893001 // Layer 1, block 0 +.dword 12009493193917617 // Layer 1, block 1 +.dword 4694881719000765600 // Layer 1, block 0 +.dword 1537215128184439725 // Layer 1, block 1 +.dword 57226611787624233 // Layer 2, block 0 +.dword 39665359539540334 // Layer 2, block 1 +.dword 7325006305780451127 // Layer 2, block 0 +.dword 5077166018957207276 // Layer 2, block 1 +.dword 14359056949694594 // Layer 2, block 2 +.dword 63449028357011879 // Layer 2, block 3 +.dword 1837959288799265711 // Layer 2, block 2 +.dword 8121475626332016399 // Layer 2, block 3 +.dword 56437370284897879 // Layer 3, block 0 +.dword 0 // Layer None, block None +.dword 7223983393473341270 // Layer 3, block 0 +.dword 0 // Layer None, block None +.dword 15519149204003269 // Layer 4, block 0 +.dword 18945631884663455 // Layer 4, block 1 +.dword 1986451097289241753 // Layer 4, block 0 +.dword 2425040880231995866 // Layer 4, block 1 +.dword 21843809513296019 // Layer 5, block 0 +.dword 52861630939350015 // Layer 5, block 1 +.dword 2796007616543237058 // Layer 5, block 0 +.dword 6766288757432881341 // Layer 5, block 1 +.dword 58200436133340777 // Layer 5, block 2 +.dword 45581265709396633 // Layer 5, block 3 +.dword 7449655821980514543 // Layer 5, block 2 +.dword 5834402008385018253 // Layer 5, block 3 +.dword 7801853795705237 // Layer 3, block 1 +.dword 0 // Layer None, block None +.dword 998637285436439396 // Layer 3, block 1 +.dword 0 // Layer None, block None +.dword 72057409685042741 // Layer 4, block 2 +.dword 67813594624550994 // Layer 4, block 3 +.dword 9223348435863355444 // Layer 4, block 2 +.dword 8680140108345514992 // Layer 4, block 3 +.dword 16444438478993771 // Layer 5, 
block 4 +.dword 44738633871916757 // Layer 5, block 5 +.dword 2104888124438946221 // Layer 5, block 4 +.dword 5726545133232289544 // Layer 5, block 5 +.dword 14998888047589537 // Layer 5, block 6 +.dword 1367715298619054 // Layer 5, block 7 +.dword 1919857669295880083 // Layer 5, block 6 +.dword 175067558150691679 // Layer 5, block 7 +.dword 50810289212278368 // Layer 3, block 2 +.dword 0 // Layer None, block None +.dword 6503717016476519110 // Layer 3, block 2 +.dword 0 // Layer None, block None +.dword 38922220208018571 // Layer 4, block 4 +.dword 7966052600948377 // Layer 4, block 5 +.dword 4982044184561839686 // Layer 4, block 4 +.dword 1019654732498851778 // Layer 4, block 5 +.dword 45879272116084567 // Layer 5, block 8 +.dword 66654388400258382 // Layer 5, block 9 +.dword 5872546828425266758 // Layer 5, block 8 +.dword 8531761711697548017 // Layer 5, block 9 +.dword 8930087962801744 // Layer 5, block 10 +.dword 61848588213223279 // Layer 5, block 11 +.dword 1143051258764947771 // Layer 5, block 10 +.dword 7916619288011967173 // Layer 5, block 11 +.dword 31977682183549777 // Layer 3, block 3 +.dword 0 // Layer None, block None +.dword 4093143317798190700 // Layer 3, block 3 +.dword 0 // Layer None, block None +.dword 66070897124800871 // Layer 4, block 6 +.dword 953067252694683 // Layer 4, block 7 +.dword 8457074828469936528 // Layer 4, block 6 +.dword 121992608294366219 // Layer 4, block 7 +.dword 33801610235026337 // Layer 5, block 12 +.dword 32122784433286747 // Layer 5, block 13 +.dword 4326606108290444417 // Layer 5, block 12 +.dword 4111716405756826253 // Layer 5, block 13 +.dword 67688369535326483 // Layer 5, block 14 +.dword 45021686719473556 // Layer 5, block 15 +.dword 8664111296931419854 // Layer 5, block 14 +.dword 5762775897704545946 // Layer 5, block 15 +.dword 66662168904752601 // Layer 3, block 4 +.dword 0 // Layer None, block None +.dword 8532757616272395351 // Layer 3, block 4 +.dword 0 // Layer None, block None +.dword 23961218891132444 // 
Layer 4, block 8 +.dword 59012643726482518 // Layer 4, block 9 +.dword 3067036016793986470 // Layer 4, block 8 +.dword 7553618393859575754 // Layer 4, block 9 +.dword 52812533586708198 // Layer 5, block 16 +.dword 27994290036168371 // Layer 5, block 17 +.dword 6760004296297333018 // Layer 5, block 16 +.dword 3583269123144660376 // Layer 5, block 17 +.dword 45890717144660134 // Layer 5, block 18 +.dword 39684773913748863 // Layer 5, block 19 +.dword 5874011792082332260 // Layer 5, block 18 +.dword 5079651058854869198 // Layer 5, block 19 +.dword 50149898471788096 // Layer 3, block 5 +.dword 0 // Layer None, block None +.dword 6419187001728793164 // Layer 3, block 5 +.dword 0 // Layer None, block None +.dword 65714767972465509 // Layer 4, block 10 +.dword 51421828010275652 // Layer 4, block 11 +.dword 8411490296989900223 // Layer 4, block 10 +.dword 6581993982587733829 // Layer 4, block 11 +.dword 18683690578478417 // Layer 5, block 20 +.dword 3282356803714609 // Layer 5, block 21 +.dword 2391512393054205061 // Layer 5, block 20 +.dword 420141670701365074 // Layer 5, block 21 +.dword 67884452950503047 // Layer 5, block 22 +.dword 10335338564031418 // Layer 5, block 23 +.dword 8689209974063619263 // Layer 5, block 22 +.dword 1322923335647807838 // Layer 5, block 23 +.dword 30932683335866672 // Layer 3, block 6 +.dword 0 // Layer None, block None +.dword 3959383465350182760 // Layer 3, block 6 +.dword 0 // Layer None, block None +.dword 27050097608373352 // Layer 4, block 12 +.dword 67454821565758121 // Layer 4, block 13 +.dword 3462412492436980406 // Layer 4, block 12 +.dword 8634217156839057519 // Layer 4, block 13 +.dword 32828920539599153 // Layer 5, block 24 +.dword 8624332566875856 // Layer 5, block 25 +.dword 4202101827327358896 // Layer 5, block 24 +.dword 1103914568102652181 // Layer 5, block 25 +.dword 56732837753533829 // Layer 5, block 26 +.dword 14816466027490539 // Layer 5, block 27 +.dword 7261803229443070495 // Layer 5, block 26 +.dword 
1896507650732884485 // Layer 5, block 27 +.dword 54968319742463037 // Layer 3, block 7 +.dword 0 // Layer None, block None +.dword 7035944924119603816 // Layer 3, block 7 +.dword 0 // Layer None, block None +.dword 55666925166425210 // Layer 4, block 14 +.dword 34241587306439298 // Layer 4, block 15 +.dword 7125366418349706083 // Layer 4, block 14 +.dword 4382923173407965878 // Layer 4, block 15 +.dword 8550051130607768 // Layer 5, block 28 +.dword 14420141705316589 // Layer 5, block 29 +.dword 1094406544264277001 // Layer 5, block 28 +.dword 1845778137515640974 // Layer 5, block 29 +.dword 55622715926092387 // Layer 5, block 30 +.dword 3405033449209397 // Layer 5, block 31 +.dword 7119707635589449714 // Layer 5, block 30 +.dword 435844281318190845 // Layer 5, block 31 +.text +.type ntt_u64_incomplete_sve2_asm_var_3_3_0, %function +.global ntt_u64_incomplete_sve2_asm_var_3_3_0 +modulus_addr: .quad modulus +roots_merged_addr: .quad roots_merged +ntt_u64_incomplete_sve2_asm_var_3_3_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save SVE2 vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ldr x17, modulus_addr +ldr q31, [x17] +ptrue P0.d +ldr x17, roots_merged_addr +ldr q3, [x17, #+0] +ldr q2, [x17, #+16] +ldr q1, [x17, #+32] +ldr q0, [x17, #+48] +ldr q15, [x17, #+64] +ldr q14, [x17, #+80] +ldr q13, [x17, #+96] +ldr q12, [x17, #+112] +ldr q30, [x0, #1920] +ldr q29, [x0, #1664] +ldr q28, [x0, #1152] +ldr q27, [x0, #1408] +ldr q26, [x0, #896] +ldr q25, [x0, #640] +ldr q24, [x0, #128] +ldr q23, [x0, #384] +sqrdmulh z22.d, z30.d, z2.d[0] +mul z30.d, z30.d,z3.d[0] +mla z30.d, P0/M, z22.d, z31.d +sub z22.d, z26.d, z30.d +add z26.d, z26.d, z30.d +sqrdmulh z30.d, z29.d, z2.d[0] +mul z29.d, z29.d,z3.d[0] 
+mla z29.d, P0/M, z30.d, z31.d +sub z30.d, z25.d, z29.d +add z25.d, z25.d, z29.d +sqrdmulh z29.d, z28.d, z2.d[0] +mul z28.d, z28.d,z3.d[0] +mla z28.d, P0/M, z29.d, z31.d +sub z29.d, z24.d, z28.d +add z24.d, z24.d, z28.d +sqrdmulh z28.d, z27.d, z2.d[0] +mul z27.d, z27.d,z3.d[0] +mla z27.d, P0/M, z28.d, z31.d +sub z28.d, z23.d, z27.d +add z23.d, z23.d, z27.d +sqrdmulh z27.d, z26.d, z0.d[0] +mul z26.d, z26.d,z1.d[0] +mla z26.d, P0/M, z27.d, z31.d +sub z27.d, z23.d, z26.d +add z23.d, z23.d, z26.d +sqrdmulh z26.d, z25.d, z0.d[0] +mul z25.d, z25.d,z1.d[0] +mla z25.d, P0/M, z26.d, z31.d +sub z26.d, z24.d, z25.d +add z24.d, z24.d, z25.d +sqrdmulh z25.d, z22.d, z0.d[1] +mul z22.d, z22.d,z1.d[1] +mla z22.d, P0/M, z25.d, z31.d +sub z25.d, z28.d, z22.d +add z28.d, z28.d, z22.d +sqrdmulh z22.d, z30.d, z0.d[1] +mul z30.d, z30.d,z1.d[1] +mla z30.d, P0/M, z22.d, z31.d +sub z22.d, z29.d, z30.d +add z29.d, z29.d, z30.d +sqrdmulh z30.d, z23.d, z14.d[0] +mul z23.d, z23.d,z15.d[0] +mla z23.d, P0/M, z30.d, z31.d +sub z30.d, z24.d, z23.d +add z24.d, z24.d, z23.d +str q24, [x0, #128] +str q30, [x0, #384] +sqrdmulh z30.d, z27.d, z14.d[1] +mul z27.d, z27.d,z15.d[1] +mla z27.d, P0/M, z30.d, z31.d +sub z30.d, z26.d, z27.d +add z26.d, z26.d, z27.d +str q26, [x0, #640] +str q30, [x0, #896] +sqrdmulh z30.d, z25.d, z12.d[1] +mul z25.d, z25.d,z13.d[1] +mla z25.d, P0/M, z30.d, z31.d +sub z30.d, z22.d, z25.d +add z22.d, z22.d, z25.d +str q22, [x0, #1664] +str q30, [x0, #1920] +sqrdmulh z30.d, z28.d, z12.d[0] +mul z28.d, z28.d,z13.d[0] +mla z28.d, P0/M, z30.d, z31.d +sub z30.d, z29.d, z28.d +add z29.d, z29.d, z28.d +str q29, [x0, #1152] +str q30, [x0, #1408] +ldr q30, [x0, #1936] +ldr q29, [x0, #1680] +ldr q28, [x0, #1168] +ldr q22, [x0, #1424] +ldr q25, [x0, #912] +ldr q26, [x0, #656] +ldr q27, [x0, #144] +ldr q24, [x0, #400] +sqrdmulh z23.d, z30.d, z2.d[0] +mul z30.d, z30.d,z3.d[0] +mla z30.d, P0/M, z23.d, z31.d +sub z23.d, z25.d, z30.d +add z25.d, z25.d, z30.d +sqrdmulh z30.d, z29.d, z2.d[0] +mul 
z29.d, z29.d,z3.d[0] +mla z29.d, P0/M, z30.d, z31.d +sub z30.d, z26.d, z29.d +add z26.d, z26.d, z29.d +sqrdmulh z29.d, z28.d, z2.d[0] +mul z28.d, z28.d,z3.d[0] +mla z28.d, P0/M, z29.d, z31.d +sub z29.d, z27.d, z28.d +add z27.d, z27.d, z28.d +sqrdmulh z28.d, z22.d, z2.d[0] +mul z22.d, z22.d,z3.d[0] +mla z22.d, P0/M, z28.d, z31.d +sub z28.d, z24.d, z22.d +add z24.d, z24.d, z22.d +sqrdmulh z22.d, z25.d, z0.d[0] +mul z25.d, z25.d,z1.d[0] +mla z25.d, P0/M, z22.d, z31.d +sub z22.d, z24.d, z25.d +add z24.d, z24.d, z25.d +sqrdmulh z25.d, z26.d, z0.d[0] +mul z26.d, z26.d,z1.d[0] +mla z26.d, P0/M, z25.d, z31.d +sub z25.d, z27.d, z26.d +add z27.d, z27.d, z26.d +sqrdmulh z26.d, z23.d, z0.d[1] +mul z23.d, z23.d,z1.d[1] +mla z23.d, P0/M, z26.d, z31.d +sub z26.d, z28.d, z23.d +add z28.d, z28.d, z23.d +sqrdmulh z23.d, z30.d, z0.d[1] +mul z30.d, z30.d,z1.d[1] +mla z30.d, P0/M, z23.d, z31.d +sub z23.d, z29.d, z30.d +add z29.d, z29.d, z30.d +sqrdmulh z30.d, z24.d, z14.d[0] +mul z24.d, z24.d,z15.d[0] +mla z24.d, P0/M, z30.d, z31.d +sub z30.d, z27.d, z24.d +add z27.d, z27.d, z24.d +str q27, [x0, #144] +str q30, [x0, #400] +sqrdmulh z30.d, z22.d, z14.d[1] +mul z22.d, z22.d,z15.d[1] +mla z22.d, P0/M, z30.d, z31.d +sub z30.d, z25.d, z22.d +add z25.d, z25.d, z22.d +str q25, [x0, #656] +str q30, [x0, #912] +sqrdmulh z30.d, z26.d, z12.d[1] +mul z26.d, z26.d,z13.d[1] +mla z26.d, P0/M, z30.d, z31.d +sub z30.d, z23.d, z26.d +add z23.d, z23.d, z26.d +str q23, [x0, #1680] +str q30, [x0, #1936] +sqrdmulh z30.d, z28.d, z12.d[0] +mul z28.d, z28.d,z13.d[0] +mla z28.d, P0/M, z30.d, z31.d +sub z30.d, z29.d, z28.d +add z29.d, z29.d, z28.d +str q29, [x0, #1168] +str q30, [x0, #1424] +ldr q30, [x0, #1952] +ldr q29, [x0, #1696] +ldr q28, [x0, #1184] +ldr q23, [x0, #1440] +ldr q26, [x0, #928] +ldr q25, [x0, #672] +ldr q22, [x0, #160] +ldr q27, [x0, #416] +sqrdmulh z24.d, z30.d, z2.d[0] +mul z30.d, z30.d,z3.d[0] +mla z30.d, P0/M, z24.d, z31.d +sub z24.d, z26.d, z30.d +add z26.d, z26.d, z30.d +sqrdmulh z30.d, 
z29.d, z2.d[0] +mul z29.d, z29.d,z3.d[0] +mla z29.d, P0/M, z30.d, z31.d +sub z30.d, z25.d, z29.d +add z25.d, z25.d, z29.d +sqrdmulh z29.d, z28.d, z2.d[0] +mul z28.d, z28.d,z3.d[0] +mla z28.d, P0/M, z29.d, z31.d +sub z29.d, z22.d, z28.d +add z22.d, z22.d, z28.d +sqrdmulh z28.d, z23.d, z2.d[0] +mul z23.d, z23.d,z3.d[0] +mla z23.d, P0/M, z28.d, z31.d +sub z28.d, z27.d, z23.d +add z27.d, z27.d, z23.d +sqrdmulh z23.d, z26.d, z0.d[0] +mul z26.d, z26.d,z1.d[0] +mla z26.d, P0/M, z23.d, z31.d +sub z23.d, z27.d, z26.d +add z27.d, z27.d, z26.d +sqrdmulh z26.d, z25.d, z0.d[0] +mul z25.d, z25.d,z1.d[0] +mla z25.d, P0/M, z26.d, z31.d +sub z26.d, z22.d, z25.d +add z22.d, z22.d, z25.d +sqrdmulh z25.d, z24.d, z0.d[1] +mul z24.d, z24.d,z1.d[1] +mla z24.d, P0/M, z25.d, z31.d +sub z25.d, z28.d, z24.d +add z28.d, z28.d, z24.d +sqrdmulh z24.d, z30.d, z0.d[1] +mul z30.d, z30.d,z1.d[1] +mla z30.d, P0/M, z24.d, z31.d +sub z24.d, z29.d, z30.d +add z29.d, z29.d, z30.d +sqrdmulh z30.d, z27.d, z14.d[0] +mul z27.d, z27.d,z15.d[0] +mla z27.d, P0/M, z30.d, z31.d +sub z30.d, z22.d, z27.d +add z22.d, z22.d, z27.d +str q22, [x0, #160] +str q30, [x0, #416] +sqrdmulh z30.d, z23.d, z14.d[1] +mul z23.d, z23.d,z15.d[1] +mla z23.d, P0/M, z30.d, z31.d +sub z30.d, z26.d, z23.d +add z26.d, z26.d, z23.d +str q26, [x0, #672] +str q30, [x0, #928] +sqrdmulh z30.d, z25.d, z12.d[1] +mul z25.d, z25.d,z13.d[1] +mla z25.d, P0/M, z30.d, z31.d +sub z30.d, z24.d, z25.d +add z24.d, z24.d, z25.d +str q24, [x0, #1696] +str q30, [x0, #1952] +sqrdmulh z30.d, z28.d, z12.d[0] +mul z28.d, z28.d,z13.d[0] +mla z28.d, P0/M, z30.d, z31.d +sub z30.d, z29.d, z28.d +add z29.d, z29.d, z28.d +str q29, [x0, #1184] +str q30, [x0, #1440] +ldr q30, [x0, #1968] +ldr q29, [x0, #1712] +ldr q28, [x0, #1200] +ldr q24, [x0, #1456] +ldr q25, [x0, #944] +ldr q26, [x0, #688] +ldr q23, [x0, #176] +ldr q22, [x0, #432] +sqrdmulh z27.d, z30.d, z2.d[0] +mul z30.d, z30.d,z3.d[0] +mla z30.d, P0/M, z27.d, z31.d +sub z27.d, z25.d, z30.d +add z25.d, z25.d, 
z30.d +sqrdmulh z30.d, z29.d, z2.d[0] +mul z29.d, z29.d,z3.d[0] +mla z29.d, P0/M, z30.d, z31.d +sub z30.d, z26.d, z29.d +add z26.d, z26.d, z29.d +sqrdmulh z29.d, z28.d, z2.d[0] +mul z28.d, z28.d,z3.d[0] +mla z28.d, P0/M, z29.d, z31.d +sub z29.d, z23.d, z28.d +add z23.d, z23.d, z28.d +sqrdmulh z28.d, z24.d, z2.d[0] +mul z24.d, z24.d,z3.d[0] +mla z24.d, P0/M, z28.d, z31.d +sub z28.d, z22.d, z24.d +add z22.d, z22.d, z24.d +sqrdmulh z24.d, z25.d, z0.d[0] +mul z25.d, z25.d,z1.d[0] +mla z25.d, P0/M, z24.d, z31.d +sub z24.d, z22.d, z25.d +add z22.d, z22.d, z25.d +sqrdmulh z25.d, z26.d, z0.d[0] +mul z26.d, z26.d,z1.d[0] +mla z26.d, P0/M, z25.d, z31.d +sub z25.d, z23.d, z26.d +add z23.d, z23.d, z26.d +sqrdmulh z26.d, z27.d, z0.d[1] +mul z27.d, z27.d,z1.d[1] +mla z27.d, P0/M, z26.d, z31.d +sub z26.d, z28.d, z27.d +add z28.d, z28.d, z27.d +sqrdmulh z27.d, z30.d, z0.d[1] +mul z30.d, z30.d,z1.d[1] +mla z30.d, P0/M, z27.d, z31.d +sub z27.d, z29.d, z30.d +add z29.d, z29.d, z30.d +sqrdmulh z30.d, z22.d, z14.d[0] +mul z22.d, z22.d,z15.d[0] +mla z22.d, P0/M, z30.d, z31.d +sub z30.d, z23.d, z22.d +add z23.d, z23.d, z22.d +str q23, [x0, #176] +str q30, [x0, #432] +sqrdmulh z30.d, z24.d, z14.d[1] +mul z24.d, z24.d,z15.d[1] +mla z24.d, P0/M, z30.d, z31.d +sub z30.d, z25.d, z24.d +add z25.d, z25.d, z24.d +str q25, [x0, #688] +str q30, [x0, #944] +sqrdmulh z30.d, z26.d, z12.d[1] +mul z26.d, z26.d,z13.d[1] +mla z26.d, P0/M, z30.d, z31.d +sub z30.d, z27.d, z26.d +add z27.d, z27.d, z26.d +str q27, [x0, #1712] +str q30, [x0, #1968] +sqrdmulh z30.d, z28.d, z12.d[0] +mul z28.d, z28.d,z13.d[0] +mla z28.d, P0/M, z30.d, z31.d +sub z30.d, z29.d, z28.d +add z29.d, z29.d, z28.d +str q29, [x0, #1200] +str q30, [x0, #1456] +ldr q30, [x0, #1984] +ldr q29, [x0, #1728] +ldr q28, [x0, #1216] +ldr q27, [x0, #1472] +ldr q26, [x0, #960] +ldr q25, [x0, #704] +ldr q24, [x0, #192] +ldr q23, [x0, #448] +sqrdmulh z22.d, z30.d, z2.d[0] +mul z30.d, z30.d,z3.d[0] +mla z30.d, P0/M, z22.d, z31.d +sub z22.d, z26.d, 
z30.d +add z26.d, z26.d, z30.d +sqrdmulh z30.d, z29.d, z2.d[0] +mul z29.d, z29.d,z3.d[0] +mla z29.d, P0/M, z30.d, z31.d +sub z30.d, z25.d, z29.d +add z25.d, z25.d, z29.d +sqrdmulh z29.d, z28.d, z2.d[0] +mul z28.d, z28.d,z3.d[0] +mla z28.d, P0/M, z29.d, z31.d +sub z29.d, z24.d, z28.d +add z24.d, z24.d, z28.d +sqrdmulh z28.d, z27.d, z2.d[0] +mul z27.d, z27.d,z3.d[0] +mla z27.d, P0/M, z28.d, z31.d +sub z28.d, z23.d, z27.d +add z23.d, z23.d, z27.d +sqrdmulh z27.d, z26.d, z0.d[0] +mul z26.d, z26.d,z1.d[0] +mla z26.d, P0/M, z27.d, z31.d +sub z27.d, z23.d, z26.d +add z23.d, z23.d, z26.d +sqrdmulh z26.d, z25.d, z0.d[0] +mul z25.d, z25.d,z1.d[0] +mla z25.d, P0/M, z26.d, z31.d +sub z26.d, z24.d, z25.d +add z24.d, z24.d, z25.d +sqrdmulh z25.d, z22.d, z0.d[1] +mul z22.d, z22.d,z1.d[1] +mla z22.d, P0/M, z25.d, z31.d +sub z25.d, z28.d, z22.d +add z28.d, z28.d, z22.d +sqrdmulh z22.d, z30.d, z0.d[1] +mul z30.d, z30.d,z1.d[1] +mla z30.d, P0/M, z22.d, z31.d +sub z22.d, z29.d, z30.d +add z29.d, z29.d, z30.d +sqrdmulh z30.d, z23.d, z14.d[0] +mul z23.d, z23.d,z15.d[0] +mla z23.d, P0/M, z30.d, z31.d +sub z30.d, z24.d, z23.d +add z24.d, z24.d, z23.d +str q24, [x0, #192] +str q30, [x0, #448] +sqrdmulh z30.d, z27.d, z14.d[1] +mul z27.d, z27.d,z15.d[1] +mla z27.d, P0/M, z30.d, z31.d +sub z30.d, z26.d, z27.d +add z26.d, z26.d, z27.d +str q26, [x0, #704] +str q30, [x0, #960] +sqrdmulh z30.d, z25.d, z12.d[1] +mul z25.d, z25.d,z13.d[1] +mla z25.d, P0/M, z30.d, z31.d +sub z30.d, z22.d, z25.d +add z22.d, z22.d, z25.d +str q22, [x0, #1728] +str q30, [x0, #1984] +sqrdmulh z30.d, z28.d, z12.d[0] +mul z28.d, z28.d,z13.d[0] +mla z28.d, P0/M, z30.d, z31.d +sub z30.d, z29.d, z28.d +add z29.d, z29.d, z28.d +str q29, [x0, #1216] +str q30, [x0, #1472] +ldr q30, [x0, #2000] +ldr q29, [x0, #1744] +ldr q28, [x0, #1232] +ldr q22, [x0, #1488] +ldr q25, [x0, #976] +ldr q26, [x0, #720] +ldr q27, [x0, #208] +ldr q24, [x0, #464] +sqrdmulh z23.d, z30.d, z2.d[0] +mul z30.d, z30.d,z3.d[0] +mla z30.d, P0/M, z23.d, 
z31.d +sub z23.d, z25.d, z30.d +add z25.d, z25.d, z30.d +sqrdmulh z30.d, z29.d, z2.d[0] +mul z29.d, z29.d,z3.d[0] +mla z29.d, P0/M, z30.d, z31.d +sub z30.d, z26.d, z29.d +add z26.d, z26.d, z29.d +sqrdmulh z29.d, z28.d, z2.d[0] +mul z28.d, z28.d,z3.d[0] +mla z28.d, P0/M, z29.d, z31.d +sub z29.d, z27.d, z28.d +add z27.d, z27.d, z28.d +sqrdmulh z28.d, z22.d, z2.d[0] +mul z22.d, z22.d,z3.d[0] +mla z22.d, P0/M, z28.d, z31.d +sub z28.d, z24.d, z22.d +add z24.d, z24.d, z22.d +sqrdmulh z22.d, z25.d, z0.d[0] +mul z25.d, z25.d,z1.d[0] +mla z25.d, P0/M, z22.d, z31.d +sub z22.d, z24.d, z25.d +add z24.d, z24.d, z25.d +sqrdmulh z25.d, z26.d, z0.d[0] +mul z26.d, z26.d,z1.d[0] +mla z26.d, P0/M, z25.d, z31.d +sub z25.d, z27.d, z26.d +add z27.d, z27.d, z26.d +sqrdmulh z26.d, z23.d, z0.d[1] +mul z23.d, z23.d,z1.d[1] +mla z23.d, P0/M, z26.d, z31.d +sub z26.d, z28.d, z23.d +add z28.d, z28.d, z23.d +sqrdmulh z23.d, z30.d, z0.d[1] +mul z30.d, z30.d,z1.d[1] +mla z30.d, P0/M, z23.d, z31.d +sub z23.d, z29.d, z30.d +add z29.d, z29.d, z30.d +sqrdmulh z30.d, z24.d, z14.d[0] +mul z24.d, z24.d,z15.d[0] +mla z24.d, P0/M, z30.d, z31.d +sub z30.d, z27.d, z24.d +add z27.d, z27.d, z24.d +str q27, [x0, #208] +str q30, [x0, #464] +sqrdmulh z30.d, z22.d, z14.d[1] +mul z22.d, z22.d,z15.d[1] +mla z22.d, P0/M, z30.d, z31.d +sub z30.d, z25.d, z22.d +add z25.d, z25.d, z22.d +str q25, [x0, #720] +str q30, [x0, #976] +sqrdmulh z30.d, z26.d, z12.d[1] +mul z26.d, z26.d,z13.d[1] +mla z26.d, P0/M, z30.d, z31.d +sub z30.d, z23.d, z26.d +add z23.d, z23.d, z26.d +str q23, [x0, #1744] +str q30, [x0, #2000] +sqrdmulh z30.d, z28.d, z12.d[0] +mul z28.d, z28.d,z13.d[0] +mla z28.d, P0/M, z30.d, z31.d +sub z30.d, z29.d, z28.d +add z29.d, z29.d, z28.d +str q29, [x0, #1232] +str q30, [x0, #1488] +ldr q30, [x0, #2016] +ldr q29, [x0, #1760] +ldr q28, [x0, #1248] +ldr q23, [x0, #1504] +ldr q26, [x0, #992] +ldr q25, [x0, #736] +ldr q22, [x0, #224] +ldr q27, [x0, #480] +sqrdmulh z24.d, z30.d, z2.d[0] +mul z30.d, z30.d,z3.d[0] +mla 
z30.d, P0/M, z24.d, z31.d +sub z24.d, z26.d, z30.d +add z26.d, z26.d, z30.d +sqrdmulh z30.d, z29.d, z2.d[0] +mul z29.d, z29.d,z3.d[0] +mla z29.d, P0/M, z30.d, z31.d +sub z30.d, z25.d, z29.d +add z25.d, z25.d, z29.d +sqrdmulh z29.d, z28.d, z2.d[0] +mul z28.d, z28.d,z3.d[0] +mla z28.d, P0/M, z29.d, z31.d +sub z29.d, z22.d, z28.d +add z22.d, z22.d, z28.d +sqrdmulh z28.d, z23.d, z2.d[0] +mul z23.d, z23.d,z3.d[0] +mla z23.d, P0/M, z28.d, z31.d +sub z28.d, z27.d, z23.d +add z27.d, z27.d, z23.d +sqrdmulh z23.d, z26.d, z0.d[0] +mul z26.d, z26.d,z1.d[0] +mla z26.d, P0/M, z23.d, z31.d +sub z23.d, z27.d, z26.d +add z27.d, z27.d, z26.d +sqrdmulh z26.d, z25.d, z0.d[0] +mul z25.d, z25.d,z1.d[0] +mla z25.d, P0/M, z26.d, z31.d +sub z26.d, z22.d, z25.d +add z22.d, z22.d, z25.d +sqrdmulh z25.d, z24.d, z0.d[1] +mul z24.d, z24.d,z1.d[1] +mla z24.d, P0/M, z25.d, z31.d +sub z25.d, z28.d, z24.d +add z28.d, z28.d, z24.d +sqrdmulh z24.d, z30.d, z0.d[1] +mul z30.d, z30.d,z1.d[1] +mla z30.d, P0/M, z24.d, z31.d +sub z24.d, z29.d, z30.d +add z29.d, z29.d, z30.d +sqrdmulh z30.d, z27.d, z14.d[0] +mul z27.d, z27.d,z15.d[0] +mla z27.d, P0/M, z30.d, z31.d +sub z30.d, z22.d, z27.d +add z22.d, z22.d, z27.d +str q22, [x0, #224] +str q30, [x0, #480] +sqrdmulh z30.d, z23.d, z14.d[1] +mul z23.d, z23.d,z15.d[1] +mla z23.d, P0/M, z30.d, z31.d +sub z30.d, z26.d, z23.d +add z26.d, z26.d, z23.d +str q26, [x0, #736] +str q30, [x0, #992] +sqrdmulh z30.d, z25.d, z12.d[1] +mul z25.d, z25.d,z13.d[1] +mla z25.d, P0/M, z30.d, z31.d +sub z30.d, z24.d, z25.d +add z24.d, z24.d, z25.d +str q24, [x0, #1760] +str q30, [x0, #2016] +sqrdmulh z30.d, z28.d, z12.d[0] +mul z28.d, z28.d,z13.d[0] +mla z28.d, P0/M, z30.d, z31.d +sub z30.d, z29.d, z28.d +add z29.d, z29.d, z28.d +str q29, [x0, #1248] +str q30, [x0, #1504] +ldr q30, [x0, #2032] +ldr q29, [x0, #1776] +ldr q28, [x0, #1264] +ldr q24, [x0, #1520] +ldr q25, [x0, #1008] +ldr q26, [x0, #752] +ldr q23, [x0, #240] +ldr q22, [x0, #496] +sqrdmulh z27.d, z30.d, z2.d[0] +mul 
z30.d, z30.d,z3.d[0] +mla z30.d, P0/M, z27.d, z31.d +sub z27.d, z25.d, z30.d +add z25.d, z25.d, z30.d +sqrdmulh z30.d, z29.d, z2.d[0] +mul z29.d, z29.d,z3.d[0] +mla z29.d, P0/M, z30.d, z31.d +sub z30.d, z26.d, z29.d +add z26.d, z26.d, z29.d +sqrdmulh z29.d, z28.d, z2.d[0] +mul z28.d, z28.d,z3.d[0] +mla z28.d, P0/M, z29.d, z31.d +sub z29.d, z23.d, z28.d +add z23.d, z23.d, z28.d +sqrdmulh z28.d, z24.d, z2.d[0] +mul z24.d, z24.d,z3.d[0] +mla z24.d, P0/M, z28.d, z31.d +sub z28.d, z22.d, z24.d +add z22.d, z22.d, z24.d +sqrdmulh z24.d, z25.d, z0.d[0] +mul z25.d, z25.d,z1.d[0] +mla z25.d, P0/M, z24.d, z31.d +sub z24.d, z22.d, z25.d +add z22.d, z22.d, z25.d +sqrdmulh z25.d, z26.d, z0.d[0] +mul z26.d, z26.d,z1.d[0] +mla z26.d, P0/M, z25.d, z31.d +sub z25.d, z23.d, z26.d +add z23.d, z23.d, z26.d +sqrdmulh z26.d, z27.d, z0.d[1] +mul z27.d, z27.d,z1.d[1] +mla z27.d, P0/M, z26.d, z31.d +sub z26.d, z28.d, z27.d +add z28.d, z28.d, z27.d +sqrdmulh z27.d, z30.d, z0.d[1] +mul z30.d, z30.d,z1.d[1] +mla z30.d, P0/M, z27.d, z31.d +sub z27.d, z29.d, z30.d +add z29.d, z29.d, z30.d +sqrdmulh z30.d, z22.d, z14.d[0] +mul z22.d, z22.d,z15.d[0] +mla z22.d, P0/M, z30.d, z31.d +sub z30.d, z23.d, z22.d +add z23.d, z23.d, z22.d +str q23, [x0, #240] +str q30, [x0, #496] +sqrdmulh z30.d, z24.d, z14.d[1] +mul z24.d, z24.d,z15.d[1] +mla z24.d, P0/M, z30.d, z31.d +sub z30.d, z25.d, z24.d +add z25.d, z25.d, z24.d +str q25, [x0, #752] +str q30, [x0, #1008] +sqrdmulh z30.d, z26.d, z12.d[1] +mul z26.d, z26.d,z13.d[1] +mla z26.d, P0/M, z30.d, z31.d +sub z30.d, z27.d, z26.d +add z27.d, z27.d, z26.d +str q27, [x0, #1776] +str q30, [x0, #2032] +sqrdmulh z30.d, z28.d, z12.d[0] +mul z28.d, z28.d,z13.d[0] +mla z28.d, P0/M, z30.d, z31.d +sub z30.d, z29.d, z28.d +add z29.d, z29.d, z28.d +str q29, [x0, #1264] +str q30, [x0, #1520] +ldr q30, [x0, #1792] +ldr q29, [x0, #1536] +ldr q28, [x0, #1024] +ldr q27, [x0, #1280] +ldr q26, [x0, #768] +ldr q25, [x0, #512] +ldr q24, [x0, #0] +ldr q23, [x0, #256] +sqrdmulh z22.d, 
z30.d, z2.d[0] +mul z30.d, z30.d,z3.d[0] +mla z30.d, P0/M, z22.d, z31.d +sub z22.d, z26.d, z30.d +add z26.d, z26.d, z30.d +sqrdmulh z30.d, z29.d, z2.d[0] +mul z29.d, z29.d,z3.d[0] +mla z29.d, P0/M, z30.d, z31.d +sub z30.d, z25.d, z29.d +add z25.d, z25.d, z29.d +sqrdmulh z29.d, z28.d, z2.d[0] +mul z28.d, z28.d,z3.d[0] +mla z28.d, P0/M, z29.d, z31.d +sub z29.d, z24.d, z28.d +add z24.d, z24.d, z28.d +sqrdmulh z28.d, z27.d, z2.d[0] +mul z27.d, z27.d,z3.d[0] +mla z27.d, P0/M, z28.d, z31.d +sub z28.d, z23.d, z27.d +add z23.d, z23.d, z27.d +sqrdmulh z27.d, z26.d, z0.d[0] +mul z26.d, z26.d,z1.d[0] +mla z26.d, P0/M, z27.d, z31.d +sub z27.d, z23.d, z26.d +add z23.d, z23.d, z26.d +sqrdmulh z26.d, z25.d, z0.d[0] +mul z25.d, z25.d,z1.d[0] +mla z25.d, P0/M, z26.d, z31.d +sub z26.d, z24.d, z25.d +add z24.d, z24.d, z25.d +sqrdmulh z25.d, z22.d, z0.d[1] +mul z22.d, z22.d,z1.d[1] +mla z22.d, P0/M, z25.d, z31.d +sub z25.d, z28.d, z22.d +add z28.d, z28.d, z22.d +sqrdmulh z22.d, z30.d, z0.d[1] +mul z30.d, z30.d,z1.d[1] +mla z30.d, P0/M, z22.d, z31.d +sub z22.d, z29.d, z30.d +add z29.d, z29.d, z30.d +sqrdmulh z30.d, z23.d, z14.d[0] +mul z23.d, z23.d,z15.d[0] +mla z23.d, P0/M, z30.d, z31.d +sub z30.d, z24.d, z23.d +add z24.d, z24.d, z23.d +str q24, [x0, #0] +str q30, [x0, #256] +sqrdmulh z30.d, z27.d, z14.d[1] +mul z27.d, z27.d,z15.d[1] +mla z27.d, P0/M, z30.d, z31.d +sub z30.d, z26.d, z27.d +add z26.d, z26.d, z27.d +str q26, [x0, #512] +str q30, [x0, #768] +sqrdmulh z30.d, z25.d, z12.d[1] +mul z25.d, z25.d,z13.d[1] +mla z25.d, P0/M, z30.d, z31.d +sub z30.d, z22.d, z25.d +add z22.d, z22.d, z25.d +str q22, [x0, #1536] +str q30, [x0, #1792] +sqrdmulh z30.d, z28.d, z12.d[0] +mul z28.d, z28.d,z13.d[0] +mla z28.d, P0/M, z30.d, z31.d +sub z30.d, z29.d, z28.d +add z29.d, z29.d, z28.d +str q29, [x0, #1024] +str q30, [x0, #1280] +ldr q30, [x0, #1808] +ldr q29, [x0, #1552] +ldr q28, [x0, #1040] +ldr q22, [x0, #1296] +ldr q25, [x0, #784] +ldr q26, [x0, #528] +ldr q27, [x0, #16] +ldr q24, [x0, #272] 
+sqrdmulh z23.d, z30.d, z2.d[0] +mul z30.d, z30.d,z3.d[0] +mla z30.d, P0/M, z23.d, z31.d +sub z23.d, z25.d, z30.d +add z25.d, z25.d, z30.d +sqrdmulh z30.d, z29.d, z2.d[0] +mul z29.d, z29.d,z3.d[0] +mla z29.d, P0/M, z30.d, z31.d +sub z30.d, z26.d, z29.d +add z26.d, z26.d, z29.d +sqrdmulh z29.d, z28.d, z2.d[0] +mul z28.d, z28.d,z3.d[0] +mla z28.d, P0/M, z29.d, z31.d +sub z29.d, z27.d, z28.d +add z27.d, z27.d, z28.d +sqrdmulh z28.d, z22.d, z2.d[0] +mul z22.d, z22.d,z3.d[0] +mla z22.d, P0/M, z28.d, z31.d +sub z28.d, z24.d, z22.d +add z24.d, z24.d, z22.d +sqrdmulh z22.d, z25.d, z0.d[0] +mul z25.d, z25.d,z1.d[0] +mla z25.d, P0/M, z22.d, z31.d +sub z22.d, z24.d, z25.d +add z24.d, z24.d, z25.d +sqrdmulh z25.d, z26.d, z0.d[0] +mul z26.d, z26.d,z1.d[0] +mla z26.d, P0/M, z25.d, z31.d +sub z25.d, z27.d, z26.d +add z27.d, z27.d, z26.d +sqrdmulh z26.d, z23.d, z0.d[1] +mul z23.d, z23.d,z1.d[1] +mla z23.d, P0/M, z26.d, z31.d +sub z26.d, z28.d, z23.d +add z28.d, z28.d, z23.d +sqrdmulh z23.d, z30.d, z0.d[1] +mul z30.d, z30.d,z1.d[1] +mla z30.d, P0/M, z23.d, z31.d +sub z23.d, z29.d, z30.d +add z29.d, z29.d, z30.d +sqrdmulh z30.d, z24.d, z14.d[0] +mul z24.d, z24.d,z15.d[0] +mla z24.d, P0/M, z30.d, z31.d +sub z30.d, z27.d, z24.d +add z27.d, z27.d, z24.d +str q27, [x0, #16] +str q30, [x0, #272] +sqrdmulh z30.d, z22.d, z14.d[1] +mul z22.d, z22.d,z15.d[1] +mla z22.d, P0/M, z30.d, z31.d +sub z30.d, z25.d, z22.d +add z25.d, z25.d, z22.d +str q25, [x0, #528] +str q30, [x0, #784] +sqrdmulh z30.d, z26.d, z12.d[1] +mul z26.d, z26.d,z13.d[1] +mla z26.d, P0/M, z30.d, z31.d +sub z30.d, z23.d, z26.d +add z23.d, z23.d, z26.d +str q23, [x0, #1552] +str q30, [x0, #1808] +sqrdmulh z30.d, z28.d, z12.d[0] +mul z28.d, z28.d,z13.d[0] +mla z28.d, P0/M, z30.d, z31.d +sub z30.d, z29.d, z28.d +add z29.d, z29.d, z28.d +str q29, [x0, #1040] +str q30, [x0, #1296] +ldr q30, [x0, #1824] +ldr q29, [x0, #1568] +ldr q28, [x0, #1056] +ldr q23, [x0, #1312] +ldr q26, [x0, #800] +ldr q25, [x0, #544] +ldr q22, [x0, #32] 
+ldr q27, [x0, #288] +sqrdmulh z24.d, z30.d, z2.d[0] +mul z30.d, z30.d,z3.d[0] +mla z30.d, P0/M, z24.d, z31.d +sub z24.d, z26.d, z30.d +add z26.d, z26.d, z30.d +sqrdmulh z30.d, z29.d, z2.d[0] +mul z29.d, z29.d,z3.d[0] +mla z29.d, P0/M, z30.d, z31.d +sub z30.d, z25.d, z29.d +add z25.d, z25.d, z29.d +sqrdmulh z29.d, z28.d, z2.d[0] +mul z28.d, z28.d,z3.d[0] +mla z28.d, P0/M, z29.d, z31.d +sub z29.d, z22.d, z28.d +add z22.d, z22.d, z28.d +sqrdmulh z28.d, z23.d, z2.d[0] +mul z23.d, z23.d,z3.d[0] +mla z23.d, P0/M, z28.d, z31.d +sub z28.d, z27.d, z23.d +add z27.d, z27.d, z23.d +sqrdmulh z23.d, z26.d, z0.d[0] +mul z26.d, z26.d,z1.d[0] +mla z26.d, P0/M, z23.d, z31.d +sub z23.d, z27.d, z26.d +add z27.d, z27.d, z26.d +sqrdmulh z26.d, z25.d, z0.d[0] +mul z25.d, z25.d,z1.d[0] +mla z25.d, P0/M, z26.d, z31.d +sub z26.d, z22.d, z25.d +add z22.d, z22.d, z25.d +sqrdmulh z25.d, z24.d, z0.d[1] +mul z24.d, z24.d,z1.d[1] +mla z24.d, P0/M, z25.d, z31.d +sub z25.d, z28.d, z24.d +add z28.d, z28.d, z24.d +sqrdmulh z24.d, z30.d, z0.d[1] +mul z30.d, z30.d,z1.d[1] +mla z30.d, P0/M, z24.d, z31.d +sub z24.d, z29.d, z30.d +add z29.d, z29.d, z30.d +sqrdmulh z30.d, z27.d, z14.d[0] +mul z27.d, z27.d,z15.d[0] +mla z27.d, P0/M, z30.d, z31.d +sub z30.d, z22.d, z27.d +add z22.d, z22.d, z27.d +str q22, [x0, #32] +str q30, [x0, #288] +sqrdmulh z30.d, z23.d, z14.d[1] +mul z23.d, z23.d,z15.d[1] +mla z23.d, P0/M, z30.d, z31.d +sub z30.d, z26.d, z23.d +add z26.d, z26.d, z23.d +str q26, [x0, #544] +str q30, [x0, #800] +sqrdmulh z30.d, z25.d, z12.d[1] +mul z25.d, z25.d,z13.d[1] +mla z25.d, P0/M, z30.d, z31.d +sub z30.d, z24.d, z25.d +add z24.d, z24.d, z25.d +str q24, [x0, #1568] +str q30, [x0, #1824] +sqrdmulh z30.d, z28.d, z12.d[0] +mul z28.d, z28.d,z13.d[0] +mla z28.d, P0/M, z30.d, z31.d +sub z30.d, z29.d, z28.d +add z29.d, z29.d, z28.d +str q29, [x0, #1056] +str q30, [x0, #1312] +ldr q30, [x0, #1840] +ldr q29, [x0, #1584] +ldr q28, [x0, #1072] +ldr q24, [x0, #1328] +ldr q25, [x0, #816] +ldr q26, [x0, #560] 
+ldr q23, [x0, #48] +ldr q22, [x0, #304] +sqrdmulh z27.d, z30.d, z2.d[0] +mul z30.d, z30.d,z3.d[0] +mla z30.d, P0/M, z27.d, z31.d +sub z27.d, z25.d, z30.d +add z25.d, z25.d, z30.d +sqrdmulh z30.d, z29.d, z2.d[0] +mul z29.d, z29.d,z3.d[0] +mla z29.d, P0/M, z30.d, z31.d +sub z30.d, z26.d, z29.d +add z26.d, z26.d, z29.d +sqrdmulh z29.d, z28.d, z2.d[0] +mul z28.d, z28.d,z3.d[0] +mla z28.d, P0/M, z29.d, z31.d +sub z29.d, z23.d, z28.d +add z23.d, z23.d, z28.d +sqrdmulh z28.d, z24.d, z2.d[0] +mul z24.d, z24.d,z3.d[0] +mla z24.d, P0/M, z28.d, z31.d +sub z28.d, z22.d, z24.d +add z22.d, z22.d, z24.d +sqrdmulh z24.d, z25.d, z0.d[0] +mul z25.d, z25.d,z1.d[0] +mla z25.d, P0/M, z24.d, z31.d +sub z24.d, z22.d, z25.d +add z22.d, z22.d, z25.d +sqrdmulh z25.d, z26.d, z0.d[0] +mul z26.d, z26.d,z1.d[0] +mla z26.d, P0/M, z25.d, z31.d +sub z25.d, z23.d, z26.d +add z23.d, z23.d, z26.d +sqrdmulh z26.d, z27.d, z0.d[1] +mul z27.d, z27.d,z1.d[1] +mla z27.d, P0/M, z26.d, z31.d +sub z26.d, z28.d, z27.d +add z28.d, z28.d, z27.d +sqrdmulh z27.d, z30.d, z0.d[1] +mul z30.d, z30.d,z1.d[1] +mla z30.d, P0/M, z27.d, z31.d +sub z27.d, z29.d, z30.d +add z29.d, z29.d, z30.d +sqrdmulh z30.d, z22.d, z14.d[0] +mul z22.d, z22.d,z15.d[0] +mla z22.d, P0/M, z30.d, z31.d +sub z30.d, z23.d, z22.d +add z23.d, z23.d, z22.d +str q23, [x0, #48] +str q30, [x0, #304] +sqrdmulh z30.d, z24.d, z14.d[1] +mul z24.d, z24.d,z15.d[1] +mla z24.d, P0/M, z30.d, z31.d +sub z30.d, z25.d, z24.d +add z25.d, z25.d, z24.d +str q25, [x0, #560] +str q30, [x0, #816] +sqrdmulh z30.d, z26.d, z12.d[1] +mul z26.d, z26.d,z13.d[1] +mla z26.d, P0/M, z30.d, z31.d +sub z30.d, z27.d, z26.d +add z27.d, z27.d, z26.d +str q27, [x0, #1584] +str q30, [x0, #1840] +sqrdmulh z30.d, z28.d, z12.d[0] +mul z28.d, z28.d,z13.d[0] +mla z28.d, P0/M, z30.d, z31.d +sub z30.d, z29.d, z28.d +add z29.d, z29.d, z28.d +str q29, [x0, #1072] +str q30, [x0, #1328] +ldr q30, [x0, #1856] +ldr q29, [x0, #1600] +ldr q28, [x0, #1088] +ldr q27, [x0, #1344] +ldr q26, [x0, #832] 
+ldr q25, [x0, #576] +ldr q24, [x0, #64] +ldr q23, [x0, #320] +sqrdmulh z22.d, z30.d, z2.d[0] +mul z30.d, z30.d,z3.d[0] +mla z30.d, P0/M, z22.d, z31.d +sub z22.d, z26.d, z30.d +add z26.d, z26.d, z30.d +sqrdmulh z30.d, z29.d, z2.d[0] +mul z29.d, z29.d,z3.d[0] +mla z29.d, P0/M, z30.d, z31.d +sub z30.d, z25.d, z29.d +add z25.d, z25.d, z29.d +sqrdmulh z29.d, z28.d, z2.d[0] +mul z28.d, z28.d,z3.d[0] +mla z28.d, P0/M, z29.d, z31.d +sub z29.d, z24.d, z28.d +add z24.d, z24.d, z28.d +sqrdmulh z28.d, z27.d, z2.d[0] +mul z27.d, z27.d,z3.d[0] +mla z27.d, P0/M, z28.d, z31.d +sub z28.d, z23.d, z27.d +add z23.d, z23.d, z27.d +sqrdmulh z27.d, z26.d, z0.d[0] +mul z26.d, z26.d,z1.d[0] +mla z26.d, P0/M, z27.d, z31.d +sub z27.d, z23.d, z26.d +add z23.d, z23.d, z26.d +sqrdmulh z26.d, z25.d, z0.d[0] +mul z25.d, z25.d,z1.d[0] +mla z25.d, P0/M, z26.d, z31.d +sub z26.d, z24.d, z25.d +add z24.d, z24.d, z25.d +sqrdmulh z25.d, z22.d, z0.d[1] +mul z22.d, z22.d,z1.d[1] +mla z22.d, P0/M, z25.d, z31.d +sub z25.d, z28.d, z22.d +add z28.d, z28.d, z22.d +sqrdmulh z22.d, z30.d, z0.d[1] +mul z30.d, z30.d,z1.d[1] +mla z30.d, P0/M, z22.d, z31.d +sub z22.d, z29.d, z30.d +add z29.d, z29.d, z30.d +sqrdmulh z30.d, z23.d, z14.d[0] +mul z23.d, z23.d,z15.d[0] +mla z23.d, P0/M, z30.d, z31.d +sub z30.d, z24.d, z23.d +add z24.d, z24.d, z23.d +str q24, [x0, #64] +str q30, [x0, #320] +sqrdmulh z30.d, z27.d, z14.d[1] +mul z27.d, z27.d,z15.d[1] +mla z27.d, P0/M, z30.d, z31.d +sub z30.d, z26.d, z27.d +add z26.d, z26.d, z27.d +str q26, [x0, #576] +str q30, [x0, #832] +sqrdmulh z30.d, z25.d, z12.d[1] +mul z25.d, z25.d,z13.d[1] +mla z25.d, P0/M, z30.d, z31.d +sub z30.d, z22.d, z25.d +add z22.d, z22.d, z25.d +str q22, [x0, #1600] +str q30, [x0, #1856] +sqrdmulh z30.d, z28.d, z12.d[0] +mul z28.d, z28.d,z13.d[0] +mla z28.d, P0/M, z30.d, z31.d +sub z30.d, z29.d, z28.d +add z29.d, z29.d, z28.d +str q29, [x0, #1088] +str q30, [x0, #1344] +ldr q30, [x0, #1872] +ldr q29, [x0, #1616] +ldr q28, [x0, #1104] +ldr q22, [x0, #1360] 
+ldr q25, [x0, #848] +ldr q26, [x0, #592] +ldr q27, [x0, #80] +ldr q24, [x0, #336] +sqrdmulh z23.d, z30.d, z2.d[0] +mul z30.d, z30.d,z3.d[0] +mla z30.d, P0/M, z23.d, z31.d +sub z23.d, z25.d, z30.d +add z25.d, z25.d, z30.d +sqrdmulh z30.d, z29.d, z2.d[0] +mul z29.d, z29.d,z3.d[0] +mla z29.d, P0/M, z30.d, z31.d +sub z30.d, z26.d, z29.d +add z26.d, z26.d, z29.d +sqrdmulh z29.d, z28.d, z2.d[0] +mul z28.d, z28.d,z3.d[0] +mla z28.d, P0/M, z29.d, z31.d +sub z29.d, z27.d, z28.d +add z27.d, z27.d, z28.d +sqrdmulh z28.d, z22.d, z2.d[0] +mul z22.d, z22.d,z3.d[0] +mla z22.d, P0/M, z28.d, z31.d +sub z28.d, z24.d, z22.d +add z24.d, z24.d, z22.d +sqrdmulh z22.d, z25.d, z0.d[0] +mul z25.d, z25.d,z1.d[0] +mla z25.d, P0/M, z22.d, z31.d +sub z22.d, z24.d, z25.d +add z24.d, z24.d, z25.d +sqrdmulh z25.d, z26.d, z0.d[0] +mul z26.d, z26.d,z1.d[0] +mla z26.d, P0/M, z25.d, z31.d +sub z25.d, z27.d, z26.d +add z27.d, z27.d, z26.d +sqrdmulh z26.d, z23.d, z0.d[1] +mul z23.d, z23.d,z1.d[1] +mla z23.d, P0/M, z26.d, z31.d +sub z26.d, z28.d, z23.d +add z28.d, z28.d, z23.d +sqrdmulh z23.d, z30.d, z0.d[1] +mul z30.d, z30.d,z1.d[1] +mla z30.d, P0/M, z23.d, z31.d +sub z23.d, z29.d, z30.d +add z29.d, z29.d, z30.d +sqrdmulh z30.d, z24.d, z14.d[0] +mul z24.d, z24.d,z15.d[0] +mla z24.d, P0/M, z30.d, z31.d +sub z30.d, z27.d, z24.d +add z27.d, z27.d, z24.d +str q27, [x0, #80] +str q30, [x0, #336] +sqrdmulh z30.d, z22.d, z14.d[1] +mul z22.d, z22.d,z15.d[1] +mla z22.d, P0/M, z30.d, z31.d +sub z30.d, z25.d, z22.d +add z25.d, z25.d, z22.d +str q25, [x0, #592] +str q30, [x0, #848] +sqrdmulh z30.d, z26.d, z12.d[1] +mul z26.d, z26.d,z13.d[1] +mla z26.d, P0/M, z30.d, z31.d +sub z30.d, z23.d, z26.d +add z23.d, z23.d, z26.d +str q23, [x0, #1616] +str q30, [x0, #1872] +sqrdmulh z30.d, z28.d, z12.d[0] +mul z28.d, z28.d,z13.d[0] +mla z28.d, P0/M, z30.d, z31.d +sub z30.d, z29.d, z28.d +add z29.d, z29.d, z28.d +str q29, [x0, #1104] +str q30, [x0, #1360] +ldr q30, [x0, #1888] +ldr q29, [x0, #1632] +ldr q28, [x0, #1120] 
+ldr q23, [x0, #1376] +ldr q26, [x0, #864] +ldr q25, [x0, #608] +ldr q22, [x0, #96] +ldr q27, [x0, #352] +sqrdmulh z24.d, z30.d, z2.d[0] +mul z30.d, z30.d,z3.d[0] +mla z30.d, P0/M, z24.d, z31.d +sub z24.d, z26.d, z30.d +add z26.d, z26.d, z30.d +sqrdmulh z30.d, z29.d, z2.d[0] +mul z29.d, z29.d,z3.d[0] +mla z29.d, P0/M, z30.d, z31.d +sub z30.d, z25.d, z29.d +add z25.d, z25.d, z29.d +sqrdmulh z29.d, z28.d, z2.d[0] +mul z28.d, z28.d,z3.d[0] +mla z28.d, P0/M, z29.d, z31.d +sub z29.d, z22.d, z28.d +add z22.d, z22.d, z28.d +sqrdmulh z28.d, z23.d, z2.d[0] +mul z23.d, z23.d,z3.d[0] +mla z23.d, P0/M, z28.d, z31.d +sub z28.d, z27.d, z23.d +add z27.d, z27.d, z23.d +sqrdmulh z23.d, z26.d, z0.d[0] +mul z26.d, z26.d,z1.d[0] +mla z26.d, P0/M, z23.d, z31.d +sub z23.d, z27.d, z26.d +add z27.d, z27.d, z26.d +sqrdmulh z26.d, z25.d, z0.d[0] +mul z25.d, z25.d,z1.d[0] +mla z25.d, P0/M, z26.d, z31.d +sub z26.d, z22.d, z25.d +add z22.d, z22.d, z25.d +sqrdmulh z25.d, z24.d, z0.d[1] +mul z24.d, z24.d,z1.d[1] +mla z24.d, P0/M, z25.d, z31.d +sub z25.d, z28.d, z24.d +add z28.d, z28.d, z24.d +sqrdmulh z24.d, z30.d, z0.d[1] +mul z30.d, z30.d,z1.d[1] +mla z30.d, P0/M, z24.d, z31.d +sub z24.d, z29.d, z30.d +add z29.d, z29.d, z30.d +sqrdmulh z30.d, z27.d, z14.d[0] +mul z27.d, z27.d,z15.d[0] +mla z27.d, P0/M, z30.d, z31.d +sub z30.d, z22.d, z27.d +add z22.d, z22.d, z27.d +str q22, [x0, #96] +str q30, [x0, #352] +sqrdmulh z30.d, z23.d, z14.d[1] +mul z23.d, z23.d,z15.d[1] +mla z23.d, P0/M, z30.d, z31.d +sub z30.d, z26.d, z23.d +add z26.d, z26.d, z23.d +str q26, [x0, #608] +str q30, [x0, #864] +sqrdmulh z30.d, z25.d, z12.d[1] +mul z25.d, z25.d,z13.d[1] +mla z25.d, P0/M, z30.d, z31.d +sub z30.d, z24.d, z25.d +add z24.d, z24.d, z25.d +str q24, [x0, #1632] +str q30, [x0, #1888] +sqrdmulh z30.d, z28.d, z12.d[0] +mul z28.d, z28.d,z13.d[0] +mla z28.d, P0/M, z30.d, z31.d +sub z30.d, z29.d, z28.d +add z29.d, z29.d, z28.d +str q29, [x0, #1120] +str q30, [x0, #1376] +ldr q30, [x0, #1904] +ldr q29, [x0, #1648] 
+ldr q28, [x0, #1136] +ldr q24, [x0, #1392] +ldr q25, [x0, #880] +ldr q26, [x0, #624] +ldr q23, [x0, #112] +ldr q22, [x0, #368] +sqrdmulh z27.d, z30.d, z2.d[0] +mul z30.d, z30.d,z3.d[0] +mla z30.d, P0/M, z27.d, z31.d +sub z27.d, z25.d, z30.d +add z25.d, z25.d, z30.d +sqrdmulh z30.d, z29.d, z2.d[0] +mul z29.d, z29.d,z3.d[0] +mla z29.d, P0/M, z30.d, z31.d +sub z30.d, z26.d, z29.d +add z26.d, z26.d, z29.d +sqrdmulh z29.d, z28.d, z2.d[0] +mul z28.d, z28.d,z3.d[0] +mla z28.d, P0/M, z29.d, z31.d +sub z29.d, z23.d, z28.d +add z23.d, z23.d, z28.d +sqrdmulh z28.d, z24.d, z2.d[0] +mul z24.d, z24.d,z3.d[0] +mla z24.d, P0/M, z28.d, z31.d +sub z28.d, z22.d, z24.d +add z22.d, z22.d, z24.d +sqrdmulh z24.d, z25.d, z0.d[0] +mul z25.d, z25.d,z1.d[0] +mla z25.d, P0/M, z24.d, z31.d +sub z24.d, z22.d, z25.d +add z22.d, z22.d, z25.d +sqrdmulh z25.d, z26.d, z0.d[0] +mul z26.d, z26.d,z1.d[0] +mla z26.d, P0/M, z25.d, z31.d +sub z25.d, z23.d, z26.d +add z23.d, z23.d, z26.d +sqrdmulh z26.d, z27.d, z0.d[1] +mul z27.d, z27.d,z1.d[1] +mla z27.d, P0/M, z26.d, z31.d +sub z26.d, z28.d, z27.d +add z28.d, z28.d, z27.d +sqrdmulh z27.d, z30.d, z0.d[1] +mul z30.d, z30.d,z1.d[1] +mla z30.d, P0/M, z27.d, z31.d +sub z27.d, z29.d, z30.d +add z29.d, z29.d, z30.d +sqrdmulh z30.d, z22.d, z14.d[0] +mul z22.d, z22.d,z15.d[0] +mla z22.d, P0/M, z30.d, z31.d +sub z30.d, z23.d, z22.d +add z23.d, z23.d, z22.d +str q23, [x0, #112] +str q30, [x0, #368] +sqrdmulh z30.d, z24.d, z14.d[1] +mul z24.d, z24.d,z15.d[1] +mla z24.d, P0/M, z30.d, z31.d +sub z30.d, z25.d, z24.d +add z25.d, z25.d, z24.d +str q25, [x0, #624] +str q30, [x0, #880] +sqrdmulh z30.d, z26.d, z12.d[1] +mul z26.d, z26.d,z13.d[1] +mla z26.d, P0/M, z30.d, z31.d +sub z30.d, z27.d, z26.d +add z27.d, z27.d, z26.d +str q27, [x0, #1648] +str q30, [x0, #1904] +sqrdmulh z30.d, z28.d, z12.d[0] +mul z28.d, z28.d,z13.d[0] +mla z28.d, P0/M, z30.d, z31.d +sub z30.d, z29.d, z28.d +add z29.d, z29.d, z28.d +str q29, [x0, #1136] +str q30, [x0, #1392] +ldr q4, [x17, #+128] 
+ldr q5, [x17, #+144] +ldr q6, [x17, #+160] +ldr q7, [x17, #+176] +ldr q8, [x17, #+192] +ldr q9, [x17, #+208] +ldr q10, [x17, #+224] +ldr q11, [x17, #+240] +ldr q16, [x0, #240] +ldr q17, [x0, #208] +ldr q18, [x0, #144] +ldr q19, [x0, #176] +ldr q20, [x0, #112] +ldr q21, [x0, #80] +ldr q22, [x0, #16] +ldr q23, [x0, #48] +sqrdmulh z24.d, z16.d, z5.d[0] +mul z16.d, z16.d,z4.d[0] +mla z16.d, P0/M, z24.d, z31.d +sub z24.d, z20.d, z16.d +add z20.d, z20.d, z16.d +sqrdmulh z16.d, z17.d, z5.d[0] +mul z17.d, z17.d,z4.d[0] +mla z17.d, P0/M, z16.d, z31.d +sub z16.d, z21.d, z17.d +add z21.d, z21.d, z17.d +sqrdmulh z17.d, z18.d, z5.d[0] +mul z18.d, z18.d,z4.d[0] +mla z18.d, P0/M, z17.d, z31.d +sub z17.d, z22.d, z18.d +add z22.d, z22.d, z18.d +sqrdmulh z18.d, z19.d, z5.d[0] +mul z19.d, z19.d,z4.d[0] +mla z19.d, P0/M, z18.d, z31.d +sub z18.d, z23.d, z19.d +add z23.d, z23.d, z19.d +sqrdmulh z19.d, z20.d, z7.d[0] +mul z20.d, z20.d,z6.d[0] +mla z20.d, P0/M, z19.d, z31.d +sub z19.d, z23.d, z20.d +add z23.d, z23.d, z20.d +sqrdmulh z20.d, z21.d, z7.d[0] +mul z21.d, z21.d,z6.d[0] +mla z21.d, P0/M, z20.d, z31.d +sub z20.d, z22.d, z21.d +add z22.d, z22.d, z21.d +sqrdmulh z21.d, z24.d, z7.d[1] +mul z24.d, z24.d,z6.d[1] +mla z24.d, P0/M, z21.d, z31.d +sub z21.d, z18.d, z24.d +add z18.d, z18.d, z24.d +sqrdmulh z24.d, z16.d, z7.d[1] +mul z16.d, z16.d,z6.d[1] +mla z16.d, P0/M, z24.d, z31.d +sub z24.d, z17.d, z16.d +add z17.d, z17.d, z16.d +sqrdmulh z16.d, z23.d, z9.d[0] +mul z23.d, z23.d,z8.d[0] +mla z23.d, P0/M, z16.d, z31.d +sub z16.d, z22.d, z23.d +add z22.d, z22.d, z23.d +str q22, [x0, #16] +str q16, [x0, #48] +sqrdmulh z16.d, z19.d, z9.d[1] +mul z19.d, z19.d,z8.d[1] +mla z19.d, P0/M, z16.d, z31.d +sub z16.d, z20.d, z19.d +add z20.d, z20.d, z19.d +str q20, [x0, #80] +str q16, [x0, #112] +sqrdmulh z16.d, z21.d, z11.d[1] +mul z21.d, z21.d,z10.d[1] +mla z21.d, P0/M, z16.d, z31.d +sub z16.d, z24.d, z21.d +add z24.d, z24.d, z21.d +str q24, [x0, #208] +str q16, [x0, #240] +sqrdmulh z16.d, z18.d, 
z11.d[0] +mul z18.d, z18.d,z10.d[0] +mla z18.d, P0/M, z16.d, z31.d +sub z16.d, z17.d, z18.d +add z17.d, z17.d, z18.d +str q17, [x0, #144] +str q16, [x0, #176] +ldr q16, [x0, #224] +ldr q17, [x0, #192] +ldr q18, [x0, #128] +ldr q24, [x0, #160] +ldr q21, [x0, #96] +ldr q20, [x0, #64] +ldr q19, [x0, #0] +ldr q22, [x0, #32] +sqrdmulh z23.d, z16.d, z5.d[0] +mul z16.d, z16.d,z4.d[0] +mla z16.d, P0/M, z23.d, z31.d +sub z23.d, z21.d, z16.d +add z21.d, z21.d, z16.d +sqrdmulh z16.d, z17.d, z5.d[0] +mul z17.d, z17.d,z4.d[0] +mla z17.d, P0/M, z16.d, z31.d +sub z16.d, z20.d, z17.d +add z20.d, z20.d, z17.d +sqrdmulh z17.d, z18.d, z5.d[0] +mul z18.d, z18.d,z4.d[0] +mla z18.d, P0/M, z17.d, z31.d +sub z17.d, z19.d, z18.d +add z19.d, z19.d, z18.d +sqrdmulh z18.d, z24.d, z5.d[0] +mul z24.d, z24.d,z4.d[0] +mla z24.d, P0/M, z18.d, z31.d +sub z18.d, z22.d, z24.d +add z22.d, z22.d, z24.d +sqrdmulh z24.d, z21.d, z7.d[0] +mul z21.d, z21.d,z6.d[0] +mla z21.d, P0/M, z24.d, z31.d +sub z24.d, z22.d, z21.d +add z22.d, z22.d, z21.d +ldr q3, [x17, #+256] +ldr q2, [x17, #+272] +ldr q1, [x17, #+288] +ldr q0, [x17, #+304] +ldr q15, [x17, #+320] +ldr q14, [x17, #+336] +ldr q13, [x17, #+352] +ldr q12, [x17, #+368] +sqrdmulh z21.d, z20.d, z7.d[0] +mul z20.d, z20.d,z6.d[0] +mla z20.d, P0/M, z21.d, z31.d +sub z21.d, z19.d, z20.d +add z19.d, z19.d, z20.d +sqrdmulh z20.d, z23.d, z7.d[1] +mul z23.d, z23.d,z6.d[1] +mla z23.d, P0/M, z20.d, z31.d +sub z20.d, z18.d, z23.d +add z18.d, z18.d, z23.d +sqrdmulh z23.d, z16.d, z7.d[1] +mul z16.d, z16.d,z6.d[1] +mla z16.d, P0/M, z23.d, z31.d +sub z23.d, z17.d, z16.d +add z17.d, z17.d, z16.d +sqrdmulh z16.d, z22.d, z9.d[0] +mul z22.d, z22.d,z8.d[0] +mla z22.d, P0/M, z16.d, z31.d +sub z16.d, z19.d, z22.d +add z19.d, z19.d, z22.d +str q19, [x0, #0] +str q16, [x0, #32] +sqrdmulh z16.d, z24.d, z9.d[1] +mul z24.d, z24.d,z8.d[1] +mla z24.d, P0/M, z16.d, z31.d +sub z16.d, z21.d, z24.d +add z21.d, z21.d, z24.d +str q21, [x0, #64] +str q16, [x0, #96] +sqrdmulh z16.d, z20.d, 
z11.d[1] +mul z20.d, z20.d,z10.d[1] +mla z20.d, P0/M, z16.d, z31.d +sub z16.d, z23.d, z20.d +add z23.d, z23.d, z20.d +str q23, [x0, #192] +str q16, [x0, #224] +sqrdmulh z16.d, z18.d, z11.d[0] +mul z18.d, z18.d,z10.d[0] +mla z18.d, P0/M, z16.d, z31.d +sub z16.d, z17.d, z18.d +add z17.d, z17.d, z18.d +str q17, [x0, #128] +str q16, [x0, #160] +ldr q16, [x0, #496] +ldr q17, [x0, #464] +ldr q18, [x0, #400] +ldr q23, [x0, #432] +ldr q20, [x0, #368] +ldr q21, [x0, #336] +ldr q24, [x0, #272] +ldr q19, [x0, #304] +sqrdmulh z22.d, z16.d, z2.d[0] +mul z16.d, z16.d,z3.d[0] +mla z16.d, P0/M, z22.d, z31.d +sub z22.d, z20.d, z16.d +add z20.d, z20.d, z16.d +sqrdmulh z16.d, z17.d, z2.d[0] +mul z17.d, z17.d,z3.d[0] +mla z17.d, P0/M, z16.d, z31.d +sub z16.d, z21.d, z17.d +add z21.d, z21.d, z17.d +sqrdmulh z17.d, z18.d, z2.d[0] +mul z18.d, z18.d,z3.d[0] +mla z18.d, P0/M, z17.d, z31.d +sub z17.d, z24.d, z18.d +add z24.d, z24.d, z18.d +sqrdmulh z18.d, z23.d, z2.d[0] +mul z23.d, z23.d,z3.d[0] +mla z23.d, P0/M, z18.d, z31.d +sub z18.d, z19.d, z23.d +add z19.d, z19.d, z23.d +sqrdmulh z23.d, z20.d, z0.d[0] +mul z20.d, z20.d,z1.d[0] +mla z20.d, P0/M, z23.d, z31.d +sub z23.d, z19.d, z20.d +add z19.d, z19.d, z20.d +sqrdmulh z20.d, z21.d, z0.d[0] +mul z21.d, z21.d,z1.d[0] +mla z21.d, P0/M, z20.d, z31.d +sub z20.d, z24.d, z21.d +add z24.d, z24.d, z21.d +sqrdmulh z21.d, z22.d, z0.d[1] +mul z22.d, z22.d,z1.d[1] +mla z22.d, P0/M, z21.d, z31.d +sub z21.d, z18.d, z22.d +add z18.d, z18.d, z22.d +sqrdmulh z22.d, z16.d, z0.d[1] +mul z16.d, z16.d,z1.d[1] +mla z16.d, P0/M, z22.d, z31.d +sub z22.d, z17.d, z16.d +add z17.d, z17.d, z16.d +sqrdmulh z16.d, z19.d, z14.d[0] +mul z19.d, z19.d,z15.d[0] +mla z19.d, P0/M, z16.d, z31.d +sub z16.d, z24.d, z19.d +add z24.d, z24.d, z19.d +str q24, [x0, #272] +str q16, [x0, #304] +sqrdmulh z16.d, z23.d, z14.d[1] +mul z23.d, z23.d,z15.d[1] +mla z23.d, P0/M, z16.d, z31.d +sub z16.d, z20.d, z23.d +add z20.d, z20.d, z23.d +str q20, [x0, #336] +str q16, [x0, #368] +sqrdmulh 
z16.d, z21.d, z12.d[1] +mul z21.d, z21.d,z13.d[1] +mla z21.d, P0/M, z16.d, z31.d +sub z16.d, z22.d, z21.d +add z22.d, z22.d, z21.d +str q22, [x0, #464] +str q16, [x0, #496] +sqrdmulh z16.d, z18.d, z12.d[0] +mul z18.d, z18.d,z13.d[0] +mla z18.d, P0/M, z16.d, z31.d +sub z16.d, z17.d, z18.d +add z17.d, z17.d, z18.d +str q17, [x0, #400] +str q16, [x0, #432] +ldr q16, [x0, #480] +ldr q17, [x0, #448] +ldr q18, [x0, #384] +ldr q22, [x0, #416] +ldr q21, [x0, #352] +ldr q20, [x0, #320] +ldr q23, [x0, #256] +ldr q24, [x0, #288] +sqrdmulh z19.d, z16.d, z2.d[0] +mul z16.d, z16.d,z3.d[0] +mla z16.d, P0/M, z19.d, z31.d +sub z19.d, z21.d, z16.d +add z21.d, z21.d, z16.d +sqrdmulh z16.d, z17.d, z2.d[0] +mul z17.d, z17.d,z3.d[0] +mla z17.d, P0/M, z16.d, z31.d +sub z16.d, z20.d, z17.d +add z20.d, z20.d, z17.d +sqrdmulh z17.d, z18.d, z2.d[0] +mul z18.d, z18.d,z3.d[0] +mla z18.d, P0/M, z17.d, z31.d +sub z17.d, z23.d, z18.d +add z23.d, z23.d, z18.d +sqrdmulh z18.d, z22.d, z2.d[0] +mul z22.d, z22.d,z3.d[0] +mla z22.d, P0/M, z18.d, z31.d +sub z18.d, z24.d, z22.d +add z24.d, z24.d, z22.d +sqrdmulh z22.d, z21.d, z0.d[0] +mul z21.d, z21.d,z1.d[0] +mla z21.d, P0/M, z22.d, z31.d +sub z22.d, z24.d, z21.d +add z24.d, z24.d, z21.d +ldr q11, [x17, #+384] +ldr q10, [x17, #+400] +ldr q9, [x17, #+416] +ldr q8, [x17, #+432] +ldr q7, [x17, #+448] +ldr q6, [x17, #+464] +ldr q5, [x17, #+480] +ldr q4, [x17, #+496] +sqrdmulh z21.d, z20.d, z0.d[0] +mul z20.d, z20.d,z1.d[0] +mla z20.d, P0/M, z21.d, z31.d +sub z21.d, z23.d, z20.d +add z23.d, z23.d, z20.d +sqrdmulh z20.d, z19.d, z0.d[1] +mul z19.d, z19.d,z1.d[1] +mla z19.d, P0/M, z20.d, z31.d +sub z20.d, z18.d, z19.d +add z18.d, z18.d, z19.d +sqrdmulh z19.d, z16.d, z0.d[1] +mul z16.d, z16.d,z1.d[1] +mla z16.d, P0/M, z19.d, z31.d +sub z19.d, z17.d, z16.d +add z17.d, z17.d, z16.d +sqrdmulh z16.d, z24.d, z14.d[0] +mul z24.d, z24.d,z15.d[0] +mla z24.d, P0/M, z16.d, z31.d +sub z16.d, z23.d, z24.d +add z23.d, z23.d, z24.d +str q23, [x0, #256] +str q16, [x0, #288] 
+sqrdmulh z16.d, z22.d, z14.d[1] +mul z22.d, z22.d,z15.d[1] +mla z22.d, P0/M, z16.d, z31.d +sub z16.d, z21.d, z22.d +add z21.d, z21.d, z22.d +str q21, [x0, #320] +str q16, [x0, #352] +sqrdmulh z16.d, z20.d, z12.d[1] +mul z20.d, z20.d,z13.d[1] +mla z20.d, P0/M, z16.d, z31.d +sub z16.d, z19.d, z20.d +add z19.d, z19.d, z20.d +str q19, [x0, #448] +str q16, [x0, #480] +sqrdmulh z16.d, z18.d, z12.d[0] +mul z18.d, z18.d,z13.d[0] +mla z18.d, P0/M, z16.d, z31.d +sub z16.d, z17.d, z18.d +add z17.d, z17.d, z18.d +str q17, [x0, #384] +str q16, [x0, #416] +ldr q16, [x0, #752] +ldr q17, [x0, #720] +ldr q18, [x0, #656] +ldr q19, [x0, #688] +ldr q20, [x0, #624] +ldr q21, [x0, #592] +ldr q22, [x0, #528] +ldr q23, [x0, #560] +sqrdmulh z24.d, z16.d, z10.d[0] +mul z16.d, z16.d,z11.d[0] +mla z16.d, P0/M, z24.d, z31.d +sub z24.d, z20.d, z16.d +add z20.d, z20.d, z16.d +sqrdmulh z16.d, z17.d, z10.d[0] +mul z17.d, z17.d,z11.d[0] +mla z17.d, P0/M, z16.d, z31.d +sub z16.d, z21.d, z17.d +add z21.d, z21.d, z17.d +sqrdmulh z17.d, z18.d, z10.d[0] +mul z18.d, z18.d,z11.d[0] +mla z18.d, P0/M, z17.d, z31.d +sub z17.d, z22.d, z18.d +add z22.d, z22.d, z18.d +sqrdmulh z18.d, z19.d, z10.d[0] +mul z19.d, z19.d,z11.d[0] +mla z19.d, P0/M, z18.d, z31.d +sub z18.d, z23.d, z19.d +add z23.d, z23.d, z19.d +sqrdmulh z19.d, z20.d, z8.d[0] +mul z20.d, z20.d,z9.d[0] +mla z20.d, P0/M, z19.d, z31.d +sub z19.d, z23.d, z20.d +add z23.d, z23.d, z20.d +sqrdmulh z20.d, z21.d, z8.d[0] +mul z21.d, z21.d,z9.d[0] +mla z21.d, P0/M, z20.d, z31.d +sub z20.d, z22.d, z21.d +add z22.d, z22.d, z21.d +sqrdmulh z21.d, z24.d, z8.d[1] +mul z24.d, z24.d,z9.d[1] +mla z24.d, P0/M, z21.d, z31.d +sub z21.d, z18.d, z24.d +add z18.d, z18.d, z24.d +sqrdmulh z24.d, z16.d, z8.d[1] +mul z16.d, z16.d,z9.d[1] +mla z16.d, P0/M, z24.d, z31.d +sub z24.d, z17.d, z16.d +add z17.d, z17.d, z16.d +sqrdmulh z16.d, z23.d, z6.d[0] +mul z23.d, z23.d,z7.d[0] +mla z23.d, P0/M, z16.d, z31.d +sub z16.d, z22.d, z23.d +add z22.d, z22.d, z23.d +str q22, [x0, #528] 
+str q16, [x0, #560] +sqrdmulh z16.d, z19.d, z6.d[1] +mul z19.d, z19.d,z7.d[1] +mla z19.d, P0/M, z16.d, z31.d +sub z16.d, z20.d, z19.d +add z20.d, z20.d, z19.d +str q20, [x0, #592] +str q16, [x0, #624] +sqrdmulh z16.d, z21.d, z4.d[1] +mul z21.d, z21.d,z5.d[1] +mla z21.d, P0/M, z16.d, z31.d +sub z16.d, z24.d, z21.d +add z24.d, z24.d, z21.d +str q24, [x0, #720] +str q16, [x0, #752] +sqrdmulh z16.d, z18.d, z4.d[0] +mul z18.d, z18.d,z5.d[0] +mla z18.d, P0/M, z16.d, z31.d +sub z16.d, z17.d, z18.d +add z17.d, z17.d, z18.d +str q17, [x0, #656] +str q16, [x0, #688] +ldr q16, [x0, #736] +ldr q17, [x0, #704] +ldr q18, [x0, #640] +ldr q24, [x0, #672] +ldr q21, [x0, #608] +ldr q20, [x0, #576] +ldr q19, [x0, #512] +ldr q22, [x0, #544] +sqrdmulh z23.d, z16.d, z10.d[0] +mul z16.d, z16.d,z11.d[0] +mla z16.d, P0/M, z23.d, z31.d +sub z23.d, z21.d, z16.d +add z21.d, z21.d, z16.d +sqrdmulh z16.d, z17.d, z10.d[0] +mul z17.d, z17.d,z11.d[0] +mla z17.d, P0/M, z16.d, z31.d +sub z16.d, z20.d, z17.d +add z20.d, z20.d, z17.d +sqrdmulh z17.d, z18.d, z10.d[0] +mul z18.d, z18.d,z11.d[0] +mla z18.d, P0/M, z17.d, z31.d +sub z17.d, z19.d, z18.d +add z19.d, z19.d, z18.d +sqrdmulh z18.d, z24.d, z10.d[0] +mul z24.d, z24.d,z11.d[0] +mla z24.d, P0/M, z18.d, z31.d +sub z18.d, z22.d, z24.d +add z22.d, z22.d, z24.d +sqrdmulh z24.d, z21.d, z8.d[0] +mul z21.d, z21.d,z9.d[0] +mla z21.d, P0/M, z24.d, z31.d +sub z24.d, z22.d, z21.d +add z22.d, z22.d, z21.d +ldr q12, [x17, #+512] +ldr q13, [x17, #+528] +ldr q14, [x17, #+544] +ldr q15, [x17, #+560] +ldr q0, [x17, #+576] +ldr q1, [x17, #+592] +ldr q2, [x17, #+608] +ldr q3, [x17, #+624] +sqrdmulh z21.d, z20.d, z8.d[0] +mul z20.d, z20.d,z9.d[0] +mla z20.d, P0/M, z21.d, z31.d +sub z21.d, z19.d, z20.d +add z19.d, z19.d, z20.d +sqrdmulh z20.d, z23.d, z8.d[1] +mul z23.d, z23.d,z9.d[1] +mla z23.d, P0/M, z20.d, z31.d +sub z20.d, z18.d, z23.d +add z18.d, z18.d, z23.d +sqrdmulh z23.d, z16.d, z8.d[1] +mul z16.d, z16.d,z9.d[1] +mla z16.d, P0/M, z23.d, z31.d +sub z23.d, 
z17.d, z16.d +add z17.d, z17.d, z16.d +sqrdmulh z16.d, z22.d, z6.d[0] +mul z22.d, z22.d,z7.d[0] +mla z22.d, P0/M, z16.d, z31.d +sub z16.d, z19.d, z22.d +add z19.d, z19.d, z22.d +str q19, [x0, #512] +str q16, [x0, #544] +sqrdmulh z16.d, z24.d, z6.d[1] +mul z24.d, z24.d,z7.d[1] +mla z24.d, P0/M, z16.d, z31.d +sub z16.d, z21.d, z24.d +add z21.d, z21.d, z24.d +str q21, [x0, #576] +str q16, [x0, #608] +sqrdmulh z16.d, z20.d, z4.d[1] +mul z20.d, z20.d,z5.d[1] +mla z20.d, P0/M, z16.d, z31.d +sub z16.d, z23.d, z20.d +add z23.d, z23.d, z20.d +str q23, [x0, #704] +str q16, [x0, #736] +sqrdmulh z16.d, z18.d, z4.d[0] +mul z18.d, z18.d,z5.d[0] +mla z18.d, P0/M, z16.d, z31.d +sub z16.d, z17.d, z18.d +add z17.d, z17.d, z18.d +str q17, [x0, #640] +str q16, [x0, #672] +ldr q16, [x0, #1008] +ldr q17, [x0, #976] +ldr q18, [x0, #912] +ldr q23, [x0, #944] +ldr q20, [x0, #880] +ldr q21, [x0, #848] +ldr q24, [x0, #784] +ldr q19, [x0, #816] +sqrdmulh z22.d, z16.d, z13.d[0] +mul z16.d, z16.d,z12.d[0] +mla z16.d, P0/M, z22.d, z31.d +sub z22.d, z20.d, z16.d +add z20.d, z20.d, z16.d +sqrdmulh z16.d, z17.d, z13.d[0] +mul z17.d, z17.d,z12.d[0] +mla z17.d, P0/M, z16.d, z31.d +sub z16.d, z21.d, z17.d +add z21.d, z21.d, z17.d +sqrdmulh z17.d, z18.d, z13.d[0] +mul z18.d, z18.d,z12.d[0] +mla z18.d, P0/M, z17.d, z31.d +sub z17.d, z24.d, z18.d +add z24.d, z24.d, z18.d +sqrdmulh z18.d, z23.d, z13.d[0] +mul z23.d, z23.d,z12.d[0] +mla z23.d, P0/M, z18.d, z31.d +sub z18.d, z19.d, z23.d +add z19.d, z19.d, z23.d +sqrdmulh z23.d, z20.d, z15.d[0] +mul z20.d, z20.d,z14.d[0] +mla z20.d, P0/M, z23.d, z31.d +sub z23.d, z19.d, z20.d +add z19.d, z19.d, z20.d +sqrdmulh z20.d, z21.d, z15.d[0] +mul z21.d, z21.d,z14.d[0] +mla z21.d, P0/M, z20.d, z31.d +sub z20.d, z24.d, z21.d +add z24.d, z24.d, z21.d +sqrdmulh z21.d, z22.d, z15.d[1] +mul z22.d, z22.d,z14.d[1] +mla z22.d, P0/M, z21.d, z31.d +sub z21.d, z18.d, z22.d +add z18.d, z18.d, z22.d +sqrdmulh z22.d, z16.d, z15.d[1] +mul z16.d, z16.d,z14.d[1] +mla z16.d, P0/M, 
z22.d, z31.d +sub z22.d, z17.d, z16.d +add z17.d, z17.d, z16.d +sqrdmulh z16.d, z19.d, z1.d[0] +mul z19.d, z19.d,z0.d[0] +mla z19.d, P0/M, z16.d, z31.d +sub z16.d, z24.d, z19.d +add z24.d, z24.d, z19.d +str q24, [x0, #784] +str q16, [x0, #816] +sqrdmulh z16.d, z23.d, z1.d[1] +mul z23.d, z23.d,z0.d[1] +mla z23.d, P0/M, z16.d, z31.d +sub z16.d, z20.d, z23.d +add z20.d, z20.d, z23.d +str q20, [x0, #848] +str q16, [x0, #880] +sqrdmulh z16.d, z21.d, z3.d[1] +mul z21.d, z21.d,z2.d[1] +mla z21.d, P0/M, z16.d, z31.d +sub z16.d, z22.d, z21.d +add z22.d, z22.d, z21.d +str q22, [x0, #976] +str q16, [x0, #1008] +sqrdmulh z16.d, z18.d, z3.d[0] +mul z18.d, z18.d,z2.d[0] +mla z18.d, P0/M, z16.d, z31.d +sub z16.d, z17.d, z18.d +add z17.d, z17.d, z18.d +str q17, [x0, #912] +str q16, [x0, #944] +ldr q16, [x0, #992] +ldr q17, [x0, #960] +ldr q18, [x0, #896] +ldr q22, [x0, #928] +ldr q21, [x0, #864] +ldr q20, [x0, #832] +ldr q23, [x0, #768] +ldr q24, [x0, #800] +sqrdmulh z19.d, z16.d, z13.d[0] +mul z16.d, z16.d,z12.d[0] +mla z16.d, P0/M, z19.d, z31.d +sub z19.d, z21.d, z16.d +add z21.d, z21.d, z16.d +sqrdmulh z16.d, z17.d, z13.d[0] +mul z17.d, z17.d,z12.d[0] +mla z17.d, P0/M, z16.d, z31.d +sub z16.d, z20.d, z17.d +add z20.d, z20.d, z17.d +sqrdmulh z17.d, z18.d, z13.d[0] +mul z18.d, z18.d,z12.d[0] +mla z18.d, P0/M, z17.d, z31.d +sub z17.d, z23.d, z18.d +add z23.d, z23.d, z18.d +sqrdmulh z18.d, z22.d, z13.d[0] +mul z22.d, z22.d,z12.d[0] +mla z22.d, P0/M, z18.d, z31.d +sub z18.d, z24.d, z22.d +add z24.d, z24.d, z22.d +sqrdmulh z22.d, z21.d, z15.d[0] +mul z21.d, z21.d,z14.d[0] +mla z21.d, P0/M, z22.d, z31.d +sub z22.d, z24.d, z21.d +add z24.d, z24.d, z21.d +ldr q4, [x17, #+640] +ldr q5, [x17, #+656] +ldr q6, [x17, #+672] +ldr q7, [x17, #+688] +ldr q8, [x17, #+704] +ldr q9, [x17, #+720] +ldr q10, [x17, #+736] +ldr q11, [x17, #+752] +sqrdmulh z21.d, z20.d, z15.d[0] +mul z20.d, z20.d,z14.d[0] +mla z20.d, P0/M, z21.d, z31.d +sub z21.d, z23.d, z20.d +add z23.d, z23.d, z20.d +sqrdmulh z20.d, 
z19.d, z15.d[1] +mul z19.d, z19.d,z14.d[1] +mla z19.d, P0/M, z20.d, z31.d +sub z20.d, z18.d, z19.d +add z18.d, z18.d, z19.d +sqrdmulh z19.d, z16.d, z15.d[1] +mul z16.d, z16.d,z14.d[1] +mla z16.d, P0/M, z19.d, z31.d +sub z19.d, z17.d, z16.d +add z17.d, z17.d, z16.d +sqrdmulh z16.d, z24.d, z1.d[0] +mul z24.d, z24.d,z0.d[0] +mla z24.d, P0/M, z16.d, z31.d +sub z16.d, z23.d, z24.d +add z23.d, z23.d, z24.d +str q23, [x0, #768] +str q16, [x0, #800] +sqrdmulh z16.d, z22.d, z1.d[1] +mul z22.d, z22.d,z0.d[1] +mla z22.d, P0/M, z16.d, z31.d +sub z16.d, z21.d, z22.d +add z21.d, z21.d, z22.d +str q21, [x0, #832] +str q16, [x0, #864] +sqrdmulh z16.d, z20.d, z3.d[1] +mul z20.d, z20.d,z2.d[1] +mla z20.d, P0/M, z16.d, z31.d +sub z16.d, z19.d, z20.d +add z19.d, z19.d, z20.d +str q19, [x0, #960] +str q16, [x0, #992] +sqrdmulh z16.d, z18.d, z3.d[0] +mul z18.d, z18.d,z2.d[0] +mla z18.d, P0/M, z16.d, z31.d +sub z16.d, z17.d, z18.d +add z17.d, z17.d, z18.d +str q17, [x0, #896] +str q16, [x0, #928] +ldr q16, [x0, #1264] +ldr q17, [x0, #1232] +ldr q18, [x0, #1168] +ldr q19, [x0, #1200] +ldr q20, [x0, #1136] +ldr q21, [x0, #1104] +ldr q22, [x0, #1040] +ldr q23, [x0, #1072] +sqrdmulh z24.d, z16.d, z5.d[0] +mul z16.d, z16.d,z4.d[0] +mla z16.d, P0/M, z24.d, z31.d +sub z24.d, z20.d, z16.d +add z20.d, z20.d, z16.d +sqrdmulh z16.d, z17.d, z5.d[0] +mul z17.d, z17.d,z4.d[0] +mla z17.d, P0/M, z16.d, z31.d +sub z16.d, z21.d, z17.d +add z21.d, z21.d, z17.d +sqrdmulh z17.d, z18.d, z5.d[0] +mul z18.d, z18.d,z4.d[0] +mla z18.d, P0/M, z17.d, z31.d +sub z17.d, z22.d, z18.d +add z22.d, z22.d, z18.d +sqrdmulh z18.d, z19.d, z5.d[0] +mul z19.d, z19.d,z4.d[0] +mla z19.d, P0/M, z18.d, z31.d +sub z18.d, z23.d, z19.d +add z23.d, z23.d, z19.d +sqrdmulh z19.d, z20.d, z7.d[0] +mul z20.d, z20.d,z6.d[0] +mla z20.d, P0/M, z19.d, z31.d +sub z19.d, z23.d, z20.d +add z23.d, z23.d, z20.d +sqrdmulh z20.d, z21.d, z7.d[0] +mul z21.d, z21.d,z6.d[0] +mla z21.d, P0/M, z20.d, z31.d +sub z20.d, z22.d, z21.d +add z22.d, z22.d, z21.d 
+sqrdmulh z21.d, z24.d, z7.d[1] +mul z24.d, z24.d,z6.d[1] +mla z24.d, P0/M, z21.d, z31.d +sub z21.d, z18.d, z24.d +add z18.d, z18.d, z24.d +sqrdmulh z24.d, z16.d, z7.d[1] +mul z16.d, z16.d,z6.d[1] +mla z16.d, P0/M, z24.d, z31.d +sub z24.d, z17.d, z16.d +add z17.d, z17.d, z16.d +sqrdmulh z16.d, z23.d, z9.d[0] +mul z23.d, z23.d,z8.d[0] +mla z23.d, P0/M, z16.d, z31.d +sub z16.d, z22.d, z23.d +add z22.d, z22.d, z23.d +str q22, [x0, #1040] +str q16, [x0, #1072] +sqrdmulh z16.d, z19.d, z9.d[1] +mul z19.d, z19.d,z8.d[1] +mla z19.d, P0/M, z16.d, z31.d +sub z16.d, z20.d, z19.d +add z20.d, z20.d, z19.d +str q20, [x0, #1104] +str q16, [x0, #1136] +sqrdmulh z16.d, z21.d, z11.d[1] +mul z21.d, z21.d,z10.d[1] +mla z21.d, P0/M, z16.d, z31.d +sub z16.d, z24.d, z21.d +add z24.d, z24.d, z21.d +str q24, [x0, #1232] +str q16, [x0, #1264] +sqrdmulh z16.d, z18.d, z11.d[0] +mul z18.d, z18.d,z10.d[0] +mla z18.d, P0/M, z16.d, z31.d +sub z16.d, z17.d, z18.d +add z17.d, z17.d, z18.d +str q17, [x0, #1168] +str q16, [x0, #1200] +ldr q16, [x0, #1248] +ldr q17, [x0, #1216] +ldr q18, [x0, #1152] +ldr q24, [x0, #1184] +ldr q21, [x0, #1120] +ldr q20, [x0, #1088] +ldr q19, [x0, #1024] +ldr q22, [x0, #1056] +sqrdmulh z23.d, z16.d, z5.d[0] +mul z16.d, z16.d,z4.d[0] +mla z16.d, P0/M, z23.d, z31.d +sub z23.d, z21.d, z16.d +add z21.d, z21.d, z16.d +sqrdmulh z16.d, z17.d, z5.d[0] +mul z17.d, z17.d,z4.d[0] +mla z17.d, P0/M, z16.d, z31.d +sub z16.d, z20.d, z17.d +add z20.d, z20.d, z17.d +sqrdmulh z17.d, z18.d, z5.d[0] +mul z18.d, z18.d,z4.d[0] +mla z18.d, P0/M, z17.d, z31.d +sub z17.d, z19.d, z18.d +add z19.d, z19.d, z18.d +sqrdmulh z18.d, z24.d, z5.d[0] +mul z24.d, z24.d,z4.d[0] +mla z24.d, P0/M, z18.d, z31.d +sub z18.d, z22.d, z24.d +add z22.d, z22.d, z24.d +sqrdmulh z24.d, z21.d, z7.d[0] +mul z21.d, z21.d,z6.d[0] +mla z21.d, P0/M, z24.d, z31.d +sub z24.d, z22.d, z21.d +add z22.d, z22.d, z21.d +ldr q3, [x17, #+768] +ldr q2, [x17, #+784] +ldr q1, [x17, #+800] +ldr q0, [x17, #+816] +ldr q15, [x17, #+832] 
+ldr q14, [x17, #+848] +ldr q13, [x17, #+864] +ldr q12, [x17, #+880] +sqrdmulh z21.d, z20.d, z7.d[0] +mul z20.d, z20.d,z6.d[0] +mla z20.d, P0/M, z21.d, z31.d +sub z21.d, z19.d, z20.d +add z19.d, z19.d, z20.d +sqrdmulh z20.d, z23.d, z7.d[1] +mul z23.d, z23.d,z6.d[1] +mla z23.d, P0/M, z20.d, z31.d +sub z20.d, z18.d, z23.d +add z18.d, z18.d, z23.d +sqrdmulh z23.d, z16.d, z7.d[1] +mul z16.d, z16.d,z6.d[1] +mla z16.d, P0/M, z23.d, z31.d +sub z23.d, z17.d, z16.d +add z17.d, z17.d, z16.d +sqrdmulh z16.d, z22.d, z9.d[0] +mul z22.d, z22.d,z8.d[0] +mla z22.d, P0/M, z16.d, z31.d +sub z16.d, z19.d, z22.d +add z19.d, z19.d, z22.d +str q19, [x0, #1024] +str q16, [x0, #1056] +sqrdmulh z16.d, z24.d, z9.d[1] +mul z24.d, z24.d,z8.d[1] +mla z24.d, P0/M, z16.d, z31.d +sub z16.d, z21.d, z24.d +add z21.d, z21.d, z24.d +str q21, [x0, #1088] +str q16, [x0, #1120] +sqrdmulh z16.d, z20.d, z11.d[1] +mul z20.d, z20.d,z10.d[1] +mla z20.d, P0/M, z16.d, z31.d +sub z16.d, z23.d, z20.d +add z23.d, z23.d, z20.d +str q23, [x0, #1216] +str q16, [x0, #1248] +sqrdmulh z16.d, z18.d, z11.d[0] +mul z18.d, z18.d,z10.d[0] +mla z18.d, P0/M, z16.d, z31.d +sub z16.d, z17.d, z18.d +add z17.d, z17.d, z18.d +str q17, [x0, #1152] +str q16, [x0, #1184] +ldr q16, [x0, #1520] +ldr q17, [x0, #1488] +ldr q18, [x0, #1424] +ldr q23, [x0, #1456] +ldr q20, [x0, #1392] +ldr q21, [x0, #1360] +ldr q24, [x0, #1296] +ldr q19, [x0, #1328] +sqrdmulh z22.d, z16.d, z2.d[0] +mul z16.d, z16.d,z3.d[0] +mla z16.d, P0/M, z22.d, z31.d +sub z22.d, z20.d, z16.d +add z20.d, z20.d, z16.d +sqrdmulh z16.d, z17.d, z2.d[0] +mul z17.d, z17.d,z3.d[0] +mla z17.d, P0/M, z16.d, z31.d +sub z16.d, z21.d, z17.d +add z21.d, z21.d, z17.d +sqrdmulh z17.d, z18.d, z2.d[0] +mul z18.d, z18.d,z3.d[0] +mla z18.d, P0/M, z17.d, z31.d +sub z17.d, z24.d, z18.d +add z24.d, z24.d, z18.d +sqrdmulh z18.d, z23.d, z2.d[0] +mul z23.d, z23.d,z3.d[0] +mla z23.d, P0/M, z18.d, z31.d +sub z18.d, z19.d, z23.d +add z19.d, z19.d, z23.d +sqrdmulh z23.d, z20.d, z0.d[0] +mul z20.d, 
z20.d,z1.d[0] +mla z20.d, P0/M, z23.d, z31.d +sub z23.d, z19.d, z20.d +add z19.d, z19.d, z20.d +sqrdmulh z20.d, z21.d, z0.d[0] +mul z21.d, z21.d,z1.d[0] +mla z21.d, P0/M, z20.d, z31.d +sub z20.d, z24.d, z21.d +add z24.d, z24.d, z21.d +sqrdmulh z21.d, z22.d, z0.d[1] +mul z22.d, z22.d,z1.d[1] +mla z22.d, P0/M, z21.d, z31.d +sub z21.d, z18.d, z22.d +add z18.d, z18.d, z22.d +sqrdmulh z22.d, z16.d, z0.d[1] +mul z16.d, z16.d,z1.d[1] +mla z16.d, P0/M, z22.d, z31.d +sub z22.d, z17.d, z16.d +add z17.d, z17.d, z16.d +sqrdmulh z16.d, z19.d, z14.d[0] +mul z19.d, z19.d,z15.d[0] +mla z19.d, P0/M, z16.d, z31.d +sub z16.d, z24.d, z19.d +add z24.d, z24.d, z19.d +str q24, [x0, #1296] +str q16, [x0, #1328] +sqrdmulh z16.d, z23.d, z14.d[1] +mul z23.d, z23.d,z15.d[1] +mla z23.d, P0/M, z16.d, z31.d +sub z16.d, z20.d, z23.d +add z20.d, z20.d, z23.d +str q20, [x0, #1360] +str q16, [x0, #1392] +sqrdmulh z16.d, z21.d, z12.d[1] +mul z21.d, z21.d,z13.d[1] +mla z21.d, P0/M, z16.d, z31.d +sub z16.d, z22.d, z21.d +add z22.d, z22.d, z21.d +str q22, [x0, #1488] +str q16, [x0, #1520] +sqrdmulh z16.d, z18.d, z12.d[0] +mul z18.d, z18.d,z13.d[0] +mla z18.d, P0/M, z16.d, z31.d +sub z16.d, z17.d, z18.d +add z17.d, z17.d, z18.d +str q17, [x0, #1424] +str q16, [x0, #1456] +ldr q16, [x0, #1504] +ldr q17, [x0, #1472] +ldr q18, [x0, #1408] +ldr q22, [x0, #1440] +ldr q21, [x0, #1376] +ldr q20, [x0, #1344] +ldr q23, [x0, #1280] +ldr q24, [x0, #1312] +sqrdmulh z19.d, z16.d, z2.d[0] +mul z16.d, z16.d,z3.d[0] +mla z16.d, P0/M, z19.d, z31.d +sub z19.d, z21.d, z16.d +add z21.d, z21.d, z16.d +sqrdmulh z16.d, z17.d, z2.d[0] +mul z17.d, z17.d,z3.d[0] +mla z17.d, P0/M, z16.d, z31.d +sub z16.d, z20.d, z17.d +add z20.d, z20.d, z17.d +sqrdmulh z17.d, z18.d, z2.d[0] +mul z18.d, z18.d,z3.d[0] +mla z18.d, P0/M, z17.d, z31.d +sub z17.d, z23.d, z18.d +add z23.d, z23.d, z18.d +sqrdmulh z18.d, z22.d, z2.d[0] +mul z22.d, z22.d,z3.d[0] +mla z22.d, P0/M, z18.d, z31.d +sub z18.d, z24.d, z22.d +add z24.d, z24.d, z22.d +sqrdmulh 
z22.d, z21.d, z0.d[0] +mul z21.d, z21.d,z1.d[0] +mla z21.d, P0/M, z22.d, z31.d +sub z22.d, z24.d, z21.d +add z24.d, z24.d, z21.d +ldr q11, [x17, #+896] +ldr q10, [x17, #+912] +ldr q9, [x17, #+928] +ldr q8, [x17, #+944] +ldr q7, [x17, #+960] +ldr q6, [x17, #+976] +ldr q5, [x17, #+992] +ldr q4, [x17, #+1008] +sqrdmulh z21.d, z20.d, z0.d[0] +mul z20.d, z20.d,z1.d[0] +mla z20.d, P0/M, z21.d, z31.d +sub z21.d, z23.d, z20.d +add z23.d, z23.d, z20.d +sqrdmulh z20.d, z19.d, z0.d[1] +mul z19.d, z19.d,z1.d[1] +mla z19.d, P0/M, z20.d, z31.d +sub z20.d, z18.d, z19.d +add z18.d, z18.d, z19.d +sqrdmulh z19.d, z16.d, z0.d[1] +mul z16.d, z16.d,z1.d[1] +mla z16.d, P0/M, z19.d, z31.d +sub z19.d, z17.d, z16.d +add z17.d, z17.d, z16.d +sqrdmulh z16.d, z24.d, z14.d[0] +mul z24.d, z24.d,z15.d[0] +mla z24.d, P0/M, z16.d, z31.d +sub z16.d, z23.d, z24.d +add z23.d, z23.d, z24.d +str q23, [x0, #1280] +str q16, [x0, #1312] +sqrdmulh z16.d, z22.d, z14.d[1] +mul z22.d, z22.d,z15.d[1] +mla z22.d, P0/M, z16.d, z31.d +sub z16.d, z21.d, z22.d +add z21.d, z21.d, z22.d +str q21, [x0, #1344] +str q16, [x0, #1376] +sqrdmulh z16.d, z20.d, z12.d[1] +mul z20.d, z20.d,z13.d[1] +mla z20.d, P0/M, z16.d, z31.d +sub z16.d, z19.d, z20.d +add z19.d, z19.d, z20.d +str q19, [x0, #1472] +str q16, [x0, #1504] +sqrdmulh z16.d, z18.d, z12.d[0] +mul z18.d, z18.d,z13.d[0] +mla z18.d, P0/M, z16.d, z31.d +sub z16.d, z17.d, z18.d +add z17.d, z17.d, z18.d +str q17, [x0, #1408] +str q16, [x0, #1440] +ldr q16, [x0, #1776] +ldr q17, [x0, #1744] +ldr q18, [x0, #1680] +ldr q19, [x0, #1712] +ldr q20, [x0, #1648] +ldr q21, [x0, #1616] +ldr q22, [x0, #1552] +ldr q23, [x0, #1584] +sqrdmulh z24.d, z16.d, z10.d[0] +mul z16.d, z16.d,z11.d[0] +mla z16.d, P0/M, z24.d, z31.d +sub z24.d, z20.d, z16.d +add z20.d, z20.d, z16.d +sqrdmulh z16.d, z17.d, z10.d[0] +mul z17.d, z17.d,z11.d[0] +mla z17.d, P0/M, z16.d, z31.d +sub z16.d, z21.d, z17.d +add z21.d, z21.d, z17.d +sqrdmulh z17.d, z18.d, z10.d[0] +mul z18.d, z18.d,z11.d[0] +mla z18.d, 
P0/M, z17.d, z31.d +sub z17.d, z22.d, z18.d +add z22.d, z22.d, z18.d +sqrdmulh z18.d, z19.d, z10.d[0] +mul z19.d, z19.d,z11.d[0] +mla z19.d, P0/M, z18.d, z31.d +sub z18.d, z23.d, z19.d +add z23.d, z23.d, z19.d +sqrdmulh z19.d, z20.d, z8.d[0] +mul z20.d, z20.d,z9.d[0] +mla z20.d, P0/M, z19.d, z31.d +sub z19.d, z23.d, z20.d +add z23.d, z23.d, z20.d +sqrdmulh z20.d, z21.d, z8.d[0] +mul z21.d, z21.d,z9.d[0] +mla z21.d, P0/M, z20.d, z31.d +sub z20.d, z22.d, z21.d +add z22.d, z22.d, z21.d +sqrdmulh z21.d, z24.d, z8.d[1] +mul z24.d, z24.d,z9.d[1] +mla z24.d, P0/M, z21.d, z31.d +sub z21.d, z18.d, z24.d +add z18.d, z18.d, z24.d +sqrdmulh z24.d, z16.d, z8.d[1] +mul z16.d, z16.d,z9.d[1] +mla z16.d, P0/M, z24.d, z31.d +sub z24.d, z17.d, z16.d +add z17.d, z17.d, z16.d +sqrdmulh z16.d, z23.d, z6.d[0] +mul z23.d, z23.d,z7.d[0] +mla z23.d, P0/M, z16.d, z31.d +sub z16.d, z22.d, z23.d +add z22.d, z22.d, z23.d +str q22, [x0, #1552] +str q16, [x0, #1584] +sqrdmulh z16.d, z19.d, z6.d[1] +mul z19.d, z19.d,z7.d[1] +mla z19.d, P0/M, z16.d, z31.d +sub z16.d, z20.d, z19.d +add z20.d, z20.d, z19.d +str q20, [x0, #1616] +str q16, [x0, #1648] +sqrdmulh z16.d, z21.d, z4.d[1] +mul z21.d, z21.d,z5.d[1] +mla z21.d, P0/M, z16.d, z31.d +sub z16.d, z24.d, z21.d +add z24.d, z24.d, z21.d +str q24, [x0, #1744] +str q16, [x0, #1776] +sqrdmulh z16.d, z18.d, z4.d[0] +mul z18.d, z18.d,z5.d[0] +mla z18.d, P0/M, z16.d, z31.d +sub z16.d, z17.d, z18.d +add z17.d, z17.d, z18.d +str q17, [x0, #1680] +str q16, [x0, #1712] +ldr q16, [x0, #1760] +ldr q17, [x0, #1728] +ldr q18, [x0, #1664] +ldr q24, [x0, #1696] +ldr q21, [x0, #1632] +ldr q20, [x0, #1600] +ldr q19, [x0, #1536] +ldr q22, [x0, #1568] +sqrdmulh z23.d, z16.d, z10.d[0] +mul z16.d, z16.d,z11.d[0] +mla z16.d, P0/M, z23.d, z31.d +sub z23.d, z21.d, z16.d +add z21.d, z21.d, z16.d +sqrdmulh z16.d, z17.d, z10.d[0] +mul z17.d, z17.d,z11.d[0] +mla z17.d, P0/M, z16.d, z31.d +sub z16.d, z20.d, z17.d +add z20.d, z20.d, z17.d +sqrdmulh z17.d, z18.d, z10.d[0] +mul 
z18.d, z18.d,z11.d[0] +mla z18.d, P0/M, z17.d, z31.d +sub z17.d, z19.d, z18.d +add z19.d, z19.d, z18.d +sqrdmulh z18.d, z24.d, z10.d[0] +mul z24.d, z24.d,z11.d[0] +mla z24.d, P0/M, z18.d, z31.d +sub z18.d, z22.d, z24.d +add z22.d, z22.d, z24.d +sqrdmulh z24.d, z21.d, z8.d[0] +mul z21.d, z21.d,z9.d[0] +mla z21.d, P0/M, z24.d, z31.d +sub z24.d, z22.d, z21.d +add z22.d, z22.d, z21.d +ldr q12, [x17, #+1024] +ldr q13, [x17, #+1040] +ldr q14, [x17, #+1056] +ldr q15, [x17, #+1072] +ldr q0, [x17, #+1088] +ldr q1, [x17, #+1104] +ldr q2, [x17, #+1120] +ldr q3, [x17, #+1136] +sqrdmulh z21.d, z20.d, z8.d[0] +mul z20.d, z20.d,z9.d[0] +mla z20.d, P0/M, z21.d, z31.d +sub z21.d, z19.d, z20.d +add z19.d, z19.d, z20.d +sqrdmulh z20.d, z23.d, z8.d[1] +mul z23.d, z23.d,z9.d[1] +mla z23.d, P0/M, z20.d, z31.d +sub z20.d, z18.d, z23.d +add z18.d, z18.d, z23.d +sqrdmulh z23.d, z16.d, z8.d[1] +mul z16.d, z16.d,z9.d[1] +mla z16.d, P0/M, z23.d, z31.d +sub z23.d, z17.d, z16.d +add z17.d, z17.d, z16.d +sqrdmulh z16.d, z22.d, z6.d[0] +mul z22.d, z22.d,z7.d[0] +mla z22.d, P0/M, z16.d, z31.d +sub z16.d, z19.d, z22.d +add z19.d, z19.d, z22.d +str q19, [x0, #1536] +str q16, [x0, #1568] +sqrdmulh z16.d, z24.d, z6.d[1] +mul z24.d, z24.d,z7.d[1] +mla z24.d, P0/M, z16.d, z31.d +sub z16.d, z21.d, z24.d +add z21.d, z21.d, z24.d +str q21, [x0, #1600] +str q16, [x0, #1632] +sqrdmulh z16.d, z20.d, z4.d[1] +mul z20.d, z20.d,z5.d[1] +mla z20.d, P0/M, z16.d, z31.d +sub z16.d, z23.d, z20.d +add z23.d, z23.d, z20.d +str q23, [x0, #1728] +str q16, [x0, #1760] +sqrdmulh z16.d, z18.d, z4.d[0] +mul z18.d, z18.d,z5.d[0] +mla z18.d, P0/M, z16.d, z31.d +sub z16.d, z17.d, z18.d +add z17.d, z17.d, z18.d +str q17, [x0, #1664] +str q16, [x0, #1696] +ldr q16, [x0, #2032] +ldr q17, [x0, #2000] +ldr q18, [x0, #1936] +ldr q23, [x0, #1968] +ldr q20, [x0, #1904] +ldr q21, [x0, #1872] +ldr q24, [x0, #1808] +ldr q19, [x0, #1840] +sqrdmulh z22.d, z16.d, z13.d[0] +mul z16.d, z16.d,z12.d[0] +mla z16.d, P0/M, z22.d, z31.d +sub z22.d, 
z20.d, z16.d +add z20.d, z20.d, z16.d +sqrdmulh z16.d, z17.d, z13.d[0] +mul z17.d, z17.d,z12.d[0] +mla z17.d, P0/M, z16.d, z31.d +sub z16.d, z21.d, z17.d +add z21.d, z21.d, z17.d +sqrdmulh z17.d, z18.d, z13.d[0] +mul z18.d, z18.d,z12.d[0] +mla z18.d, P0/M, z17.d, z31.d +sub z17.d, z24.d, z18.d +add z24.d, z24.d, z18.d +sqrdmulh z18.d, z23.d, z13.d[0] +mul z23.d, z23.d,z12.d[0] +mla z23.d, P0/M, z18.d, z31.d +sub z18.d, z19.d, z23.d +add z19.d, z19.d, z23.d +sqrdmulh z23.d, z20.d, z15.d[0] +mul z20.d, z20.d,z14.d[0] +mla z20.d, P0/M, z23.d, z31.d +sub z23.d, z19.d, z20.d +add z19.d, z19.d, z20.d +sqrdmulh z20.d, z21.d, z15.d[0] +mul z21.d, z21.d,z14.d[0] +mla z21.d, P0/M, z20.d, z31.d +sub z20.d, z24.d, z21.d +add z24.d, z24.d, z21.d +sqrdmulh z21.d, z22.d, z15.d[1] +mul z22.d, z22.d,z14.d[1] +mla z22.d, P0/M, z21.d, z31.d +sub z21.d, z18.d, z22.d +add z18.d, z18.d, z22.d +sqrdmulh z22.d, z16.d, z15.d[1] +mul z16.d, z16.d,z14.d[1] +mla z16.d, P0/M, z22.d, z31.d +sub z22.d, z17.d, z16.d +add z17.d, z17.d, z16.d +sqrdmulh z16.d, z19.d, z1.d[0] +mul z19.d, z19.d,z0.d[0] +mla z19.d, P0/M, z16.d, z31.d +sub z16.d, z24.d, z19.d +add z24.d, z24.d, z19.d +str q24, [x0, #1808] +str q16, [x0, #1840] +sqrdmulh z16.d, z23.d, z1.d[1] +mul z23.d, z23.d,z0.d[1] +mla z23.d, P0/M, z16.d, z31.d +sub z16.d, z20.d, z23.d +add z20.d, z20.d, z23.d +str q20, [x0, #1872] +str q16, [x0, #1904] +sqrdmulh z16.d, z21.d, z3.d[1] +mul z21.d, z21.d,z2.d[1] +mla z21.d, P0/M, z16.d, z31.d +sub z16.d, z22.d, z21.d +add z22.d, z22.d, z21.d +str q22, [x0, #2000] +str q16, [x0, #2032] +sqrdmulh z16.d, z18.d, z3.d[0] +mul z18.d, z18.d,z2.d[0] +mla z18.d, P0/M, z16.d, z31.d +sub z16.d, z17.d, z18.d +add z17.d, z17.d, z18.d +str q17, [x0, #1936] +str q16, [x0, #1968] +ldr q16, [x0, #2016] +ldr q17, [x0, #1984] +ldr q18, [x0, #1920] +ldr q22, [x0, #1952] +ldr q21, [x0, #1888] +ldr q20, [x0, #1856] +ldr q23, [x0, #1792] +ldr q24, [x0, #1824] +sqrdmulh z19.d, z16.d, z13.d[0] +mul z16.d, z16.d,z12.d[0] +mla 
z16.d, P0/M, z19.d, z31.d +sub z19.d, z21.d, z16.d +add z21.d, z21.d, z16.d +sqrdmulh z16.d, z17.d, z13.d[0] +mul z17.d, z17.d,z12.d[0] +mla z17.d, P0/M, z16.d, z31.d +sub z16.d, z20.d, z17.d +add z20.d, z20.d, z17.d +sqrdmulh z17.d, z18.d, z13.d[0] +mul z18.d, z18.d,z12.d[0] +mla z18.d, P0/M, z17.d, z31.d +sub z17.d, z23.d, z18.d +add z23.d, z23.d, z18.d +sqrdmulh z18.d, z22.d, z13.d[0] +mul z22.d, z22.d,z12.d[0] +mla z22.d, P0/M, z18.d, z31.d +sub z18.d, z24.d, z22.d +add z24.d, z24.d, z22.d +sqrdmulh z22.d, z21.d, z15.d[0] +mul z21.d, z21.d,z14.d[0] +mla z21.d, P0/M, z22.d, z31.d +sub z22.d, z24.d, z21.d +add z24.d, z24.d, z21.d +sqrdmulh z21.d, z20.d, z15.d[0] +mul z20.d, z20.d,z14.d[0] +mla z20.d, P0/M, z21.d, z31.d +sub z21.d, z23.d, z20.d +add z23.d, z23.d, z20.d +sqrdmulh z20.d, z19.d, z15.d[1] +mul z19.d, z19.d,z14.d[1] +mla z19.d, P0/M, z20.d, z31.d +sub z20.d, z18.d, z19.d +add z18.d, z18.d, z19.d +sqrdmulh z19.d, z16.d, z15.d[1] +mul z16.d, z16.d,z14.d[1] +mla z16.d, P0/M, z19.d, z31.d +sub z19.d, z17.d, z16.d +add z17.d, z17.d, z16.d +sqrdmulh z16.d, z24.d, z1.d[0] +mul z24.d, z24.d,z0.d[0] +mla z24.d, P0/M, z16.d, z31.d +sub z16.d, z23.d, z24.d +add z23.d, z23.d, z24.d +str q23, [x0, #1792] +str q16, [x0, #1824] +sqrdmulh z16.d, z22.d, z1.d[1] +mul z22.d, z22.d,z0.d[1] +mla z22.d, P0/M, z16.d, z31.d +sub z16.d, z21.d, z22.d +add z21.d, z21.d, z22.d +str q21, [x0, #1856] +str q16, [x0, #1888] +sqrdmulh z16.d, z20.d, z3.d[1] +mul z20.d, z20.d,z2.d[1] +mla z20.d, P0/M, z16.d, z31.d +sub z16.d, z19.d, z20.d +add z19.d, z19.d, z20.d +str q19, [x0, #1984] +str q16, [x0, #2016] +sqrdmulh z16.d, z18.d, z3.d[0] +mul z18.d, z18.d,z2.d[0] +mla z18.d, P0/M, z16.d, z31.d +sub z16.d, z17.d, z18.d +add z17.d, z17.d, z18.d +str q17, [x0, #1920] +str q16, [x0, #1952] +// Restore SVE2 vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] 
+ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2697 +// Instruction count: 2693 \ No newline at end of file diff --git a/asm/auto/ntt_sve2/ntt_u64_incomplete_72057594067788289_60277548896192635_var_3_3_1.s b/asm/auto/ntt_sve2/ntt_u64_incomplete_72057594067788289_60277548896192635_var_3_3_1.s new file mode 100644 index 0000000..9cf22e1 --- /dev/null +++ b/asm/auto/ntt_sve2/ntt_u64_incomplete_72057594067788289_60277548896192635_var_3_3_1.s @@ -0,0 +1,2727 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +modulus: +.dword -72057594067788289 +.dword -72057594067788289 +.dword -72057594067788289 +.dword -72057594067788289 +.align 6 +roots_merged: +.dword 25792053496987399 // Layer 0, block 0 +.dword 0 // Layer None, block None +.dword 3301382846246308405 // Layer 0, block 0 +.dword 0 // Layer None, block None +.dword 36678763444893001 // Layer 1, block 0 +.dword 12009493193917617 // Layer 1, block 1 +.dword 4694881719000765600 // Layer 1, block 0 +.dword 1537215128184439725 // Layer 1, block 1 +.dword 57226611787624233 // Layer 2, block 0 +.dword 39665359539540334 // Layer 2, block 1 +.dword 7325006305780451127 // Layer 2, block 0 +.dword 5077166018957207276 // Layer 2, block 1 +.dword 14359056949694594 // Layer 2, block 2 +.dword 63449028357011879 // Layer 2, block 3 +.dword 1837959288799265711 // Layer 2, block 2 +.dword 8121475626332016399 // Layer 2, block 3 +.dword 56437370284897879 // Layer 3, block 0 +.dword 0 // Layer None, block None +.dword 7223983393473341270 // Layer 3, block 0 +.dword 0 // Layer None, block None +.dword 15519149204003269 // Layer 4, block 0 +.dword 18945631884663455 // Layer 4, block 1 +.dword 1986451097289241753 // Layer 4, block 0 +.dword 2425040880231995866 // Layer 4, block 1 +.dword 21843809513296019 // Layer 5, block 0 +.dword 52861630939350015 // Layer 5, block 1 +.dword 2796007616543237058 // Layer 5, block 0 +.dword 6766288757432881341 // Layer 5, block 1 +.dword 58200436133340777 // Layer 5, block 2 +.dword 45581265709396633 // Layer 5, block 3 +.dword 7449655821980514543 // Layer 5, block 2 +.dword 5834402008385018253 // Layer 5, block 3 +.dword 7801853795705237 // Layer 3, block 1 +.dword 0 // Layer None, block None +.dword 998637285436439396 // Layer 3, block 1 +.dword 0 // Layer None, block None +.dword 72057409685042741 // Layer 4, block 2 +.dword 67813594624550994 // Layer 4, block 3 +.dword 9223348435863355444 // Layer 4, block 2 +.dword 8680140108345514992 // Layer 4, block 3 +.dword 16444438478993771 // Layer 5, 
block 4 +.dword 44738633871916757 // Layer 5, block 5 +.dword 2104888124438946221 // Layer 5, block 4 +.dword 5726545133232289544 // Layer 5, block 5 +.dword 14998888047589537 // Layer 5, block 6 +.dword 1367715298619054 // Layer 5, block 7 +.dword 1919857669295880083 // Layer 5, block 6 +.dword 175067558150691679 // Layer 5, block 7 +.dword 50810289212278368 // Layer 3, block 2 +.dword 0 // Layer None, block None +.dword 6503717016476519110 // Layer 3, block 2 +.dword 0 // Layer None, block None +.dword 38922220208018571 // Layer 4, block 4 +.dword 7966052600948377 // Layer 4, block 5 +.dword 4982044184561839686 // Layer 4, block 4 +.dword 1019654732498851778 // Layer 4, block 5 +.dword 45879272116084567 // Layer 5, block 8 +.dword 66654388400258382 // Layer 5, block 9 +.dword 5872546828425266758 // Layer 5, block 8 +.dword 8531761711697548017 // Layer 5, block 9 +.dword 8930087962801744 // Layer 5, block 10 +.dword 61848588213223279 // Layer 5, block 11 +.dword 1143051258764947771 // Layer 5, block 10 +.dword 7916619288011967173 // Layer 5, block 11 +.dword 31977682183549777 // Layer 3, block 3 +.dword 0 // Layer None, block None +.dword 4093143317798190700 // Layer 3, block 3 +.dword 0 // Layer None, block None +.dword 66070897124800871 // Layer 4, block 6 +.dword 953067252694683 // Layer 4, block 7 +.dword 8457074828469936528 // Layer 4, block 6 +.dword 121992608294366219 // Layer 4, block 7 +.dword 33801610235026337 // Layer 5, block 12 +.dword 32122784433286747 // Layer 5, block 13 +.dword 4326606108290444417 // Layer 5, block 12 +.dword 4111716405756826253 // Layer 5, block 13 +.dword 67688369535326483 // Layer 5, block 14 +.dword 45021686719473556 // Layer 5, block 15 +.dword 8664111296931419854 // Layer 5, block 14 +.dword 5762775897704545946 // Layer 5, block 15 +.dword 66662168904752601 // Layer 3, block 4 +.dword 0 // Layer None, block None +.dword 8532757616272395351 // Layer 3, block 4 +.dword 0 // Layer None, block None +.dword 23961218891132444 // 
Layer 4, block 8 +.dword 59012643726482518 // Layer 4, block 9 +.dword 3067036016793986470 // Layer 4, block 8 +.dword 7553618393859575754 // Layer 4, block 9 +.dword 52812533586708198 // Layer 5, block 16 +.dword 27994290036168371 // Layer 5, block 17 +.dword 6760004296297333018 // Layer 5, block 16 +.dword 3583269123144660376 // Layer 5, block 17 +.dword 45890717144660134 // Layer 5, block 18 +.dword 39684773913748863 // Layer 5, block 19 +.dword 5874011792082332260 // Layer 5, block 18 +.dword 5079651058854869198 // Layer 5, block 19 +.dword 50149898471788096 // Layer 3, block 5 +.dword 0 // Layer None, block None +.dword 6419187001728793164 // Layer 3, block 5 +.dword 0 // Layer None, block None +.dword 65714767972465509 // Layer 4, block 10 +.dword 51421828010275652 // Layer 4, block 11 +.dword 8411490296989900223 // Layer 4, block 10 +.dword 6581993982587733829 // Layer 4, block 11 +.dword 18683690578478417 // Layer 5, block 20 +.dword 3282356803714609 // Layer 5, block 21 +.dword 2391512393054205061 // Layer 5, block 20 +.dword 420141670701365074 // Layer 5, block 21 +.dword 67884452950503047 // Layer 5, block 22 +.dword 10335338564031418 // Layer 5, block 23 +.dword 8689209974063619263 // Layer 5, block 22 +.dword 1322923335647807838 // Layer 5, block 23 +.dword 30932683335866672 // Layer 3, block 6 +.dword 0 // Layer None, block None +.dword 3959383465350182760 // Layer 3, block 6 +.dword 0 // Layer None, block None +.dword 27050097608373352 // Layer 4, block 12 +.dword 67454821565758121 // Layer 4, block 13 +.dword 3462412492436980406 // Layer 4, block 12 +.dword 8634217156839057519 // Layer 4, block 13 +.dword 32828920539599153 // Layer 5, block 24 +.dword 8624332566875856 // Layer 5, block 25 +.dword 4202101827327358896 // Layer 5, block 24 +.dword 1103914568102652181 // Layer 5, block 25 +.dword 56732837753533829 // Layer 5, block 26 +.dword 14816466027490539 // Layer 5, block 27 +.dword 7261803229443070495 // Layer 5, block 26 +.dword 
1896507650732884485 // Layer 5, block 27 +.dword 54968319742463037 // Layer 3, block 7 +.dword 0 // Layer None, block None +.dword 7035944924119603816 // Layer 3, block 7 +.dword 0 // Layer None, block None +.dword 55666925166425210 // Layer 4, block 14 +.dword 34241587306439298 // Layer 4, block 15 +.dword 7125366418349706083 // Layer 4, block 14 +.dword 4382923173407965878 // Layer 4, block 15 +.dword 8550051130607768 // Layer 5, block 28 +.dword 14420141705316589 // Layer 5, block 29 +.dword 1094406544264277001 // Layer 5, block 28 +.dword 1845778137515640974 // Layer 5, block 29 +.dword 55622715926092387 // Layer 5, block 30 +.dword 3405033449209397 // Layer 5, block 31 +.dword 7119707635589449714 // Layer 5, block 30 +.dword 435844281318190845 // Layer 5, block 31 +.text +.type ntt_u64_incomplete_sve2_asm_var_3_3_1, %function +.global ntt_u64_incomplete_sve2_asm_var_3_3_1 +modulus_addr: .quad modulus +roots_merged_addr: .quad roots_merged +ntt_u64_incomplete_sve2_asm_var_3_3_1: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save SVE2 vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ldr x17, modulus_addr +ldr q31, [x17] +ptrue P0.d +ldr x17, roots_merged_addr +ldr q3, [x17, #+0] +ldr q2, [x17, #+16] +ldr q1, [x17, #+32] +ldr q0, [x17, #+48] +ldr q15, [x17, #+64] +ldr q14, [x17, #+80] +ldr q13, [x17, #+96] +ldr q12, [x17, #+112] +ldr q30, [x0, #1920] +ldr q29, [x0, #1664] +sqrdmulh z28.d, z30.d, z2.d[0] +mul z30.d, z30.d,z3.d[0] +ldr q27, [x0, #1152] +ldr q26, [x0, #1408] +sqrdmulh z25.d, z29.d, z2.d[0] +mla z30.d, P0/M, z28.d, z31.d +mul z29.d, z29.d,z3.d[0] +ldr q28, [x0, #896] +ldr q24, [x0, #640] +sqrdmulh z23.d, z27.d, z2.d[0] +sub z22.d, z28.d, z30.d +mla z29.d, P0/M, z25.d, z31.d +mul 
z27.d, z27.d,z3.d[0] +add z28.d, z28.d, z30.d +ldr q30, [x0, #128] +ldr q25, [x0, #384] +sqrdmulh z21.d, z26.d, z2.d[0] +sub z20.d, z24.d, z29.d +mla z27.d, P0/M, z23.d, z31.d +mul z26.d, z26.d,z3.d[0] +add z24.d, z24.d, z29.d +sqrdmulh z29.d, z28.d, z0.d[0] +sub z23.d, z30.d, z27.d +mla z26.d, P0/M, z21.d, z31.d +mul z28.d, z28.d,z1.d[0] +add z30.d, z30.d, z27.d +sqrdmulh z27.d, z24.d, z0.d[0] +sub z21.d, z25.d, z26.d +mla z28.d, P0/M, z29.d, z31.d +mul z24.d, z24.d,z1.d[0] +add z25.d, z25.d, z26.d +sqrdmulh z26.d, z22.d, z0.d[1] +sub z29.d, z25.d, z28.d +mla z24.d, P0/M, z27.d, z31.d +mul z22.d, z22.d,z1.d[1] +add z25.d, z25.d, z28.d +sqrdmulh z28.d, z20.d, z0.d[1] +sub z27.d, z30.d, z24.d +mla z22.d, P0/M, z26.d, z31.d +mul z20.d, z20.d,z1.d[1] +add z30.d, z30.d, z24.d +sqrdmulh z24.d, z25.d, z14.d[0] +sub z26.d, z21.d, z22.d +mla z20.d, P0/M, z28.d, z31.d +mul z25.d, z25.d,z15.d[0] +add z21.d, z21.d, z22.d +sqrdmulh z22.d, z29.d, z14.d[1] +sub z28.d, z23.d, z20.d +mla z25.d, P0/M, z24.d, z31.d +mul z29.d, z29.d,z15.d[1] +add z23.d, z23.d, z20.d +sqrdmulh z20.d, z26.d, z12.d[1] +sub z24.d, z30.d, z25.d +mla z29.d, P0/M, z22.d, z31.d +mul z26.d, z26.d,z13.d[1] +add z30.d, z30.d, z25.d +str q30, [x0, #128] +str q24, [x0, #384] +sqrdmulh z24.d, z21.d, z12.d[0] +sub z30.d, z27.d, z29.d +mla z26.d, P0/M, z20.d, z31.d +mul z21.d, z21.d,z13.d[0] +add z27.d, z27.d, z29.d +str q27, [x0, #640] +str q30, [x0, #896] +ldr q30, [x0, #1936] +ldr q27, [x0, #1680] +sqrdmulh z29.d, z30.d, z2.d[0] +sub z20.d, z28.d, z26.d +mla z21.d, P0/M, z24.d, z31.d +mul z30.d, z30.d,z3.d[0] +add z28.d, z28.d, z26.d +str q28, [x0, #1664] +str q20, [x0, #1920] +ldr q20, [x0, #1168] +ldr q28, [x0, #1424] +sqrdmulh z26.d, z27.d, z2.d[0] +sub z24.d, z23.d, z21.d +mla z30.d, P0/M, z29.d, z31.d +mul z27.d, z27.d,z3.d[0] +add z23.d, z23.d, z21.d +str q23, [x0, #1152] +str q24, [x0, #1408] +ldr q24, [x0, #912] +ldr q23, [x0, #656] +sqrdmulh z21.d, z20.d, z2.d[0] +sub z29.d, z24.d, z30.d +mla z27.d, 
P0/M, z26.d, z31.d +mul z20.d, z20.d,z3.d[0] +add z24.d, z24.d, z30.d +ldr q30, [x0, #144] +ldr q26, [x0, #400] +sqrdmulh z25.d, z28.d, z2.d[0] +sub z22.d, z23.d, z27.d +mla z20.d, P0/M, z21.d, z31.d +mul z28.d, z28.d,z3.d[0] +add z23.d, z23.d, z27.d +sqrdmulh z27.d, z24.d, z0.d[0] +sub z21.d, z30.d, z20.d +mla z28.d, P0/M, z25.d, z31.d +mul z24.d, z24.d,z1.d[0] +add z30.d, z30.d, z20.d +sqrdmulh z20.d, z23.d, z0.d[0] +sub z25.d, z26.d, z28.d +mla z24.d, P0/M, z27.d, z31.d +mul z23.d, z23.d,z1.d[0] +add z26.d, z26.d, z28.d +sqrdmulh z28.d, z29.d, z0.d[1] +sub z27.d, z26.d, z24.d +mla z23.d, P0/M, z20.d, z31.d +mul z29.d, z29.d,z1.d[1] +add z26.d, z26.d, z24.d +sqrdmulh z24.d, z22.d, z0.d[1] +sub z20.d, z30.d, z23.d +mla z29.d, P0/M, z28.d, z31.d +mul z22.d, z22.d,z1.d[1] +add z30.d, z30.d, z23.d +sqrdmulh z23.d, z26.d, z14.d[0] +sub z28.d, z25.d, z29.d +mla z22.d, P0/M, z24.d, z31.d +mul z26.d, z26.d,z15.d[0] +add z25.d, z25.d, z29.d +sqrdmulh z29.d, z27.d, z14.d[1] +sub z24.d, z21.d, z22.d +mla z26.d, P0/M, z23.d, z31.d +mul z27.d, z27.d,z15.d[1] +add z21.d, z21.d, z22.d +sqrdmulh z22.d, z28.d, z12.d[1] +sub z23.d, z30.d, z26.d +mla z27.d, P0/M, z29.d, z31.d +mul z28.d, z28.d,z13.d[1] +add z30.d, z30.d, z26.d +str q30, [x0, #144] +str q23, [x0, #400] +sqrdmulh z23.d, z25.d, z12.d[0] +sub z30.d, z20.d, z27.d +mla z28.d, P0/M, z22.d, z31.d +mul z25.d, z25.d,z13.d[0] +add z20.d, z20.d, z27.d +str q20, [x0, #656] +str q30, [x0, #912] +ldr q30, [x0, #1952] +ldr q20, [x0, #1696] +sqrdmulh z27.d, z30.d, z2.d[0] +sub z22.d, z24.d, z28.d +mla z25.d, P0/M, z23.d, z31.d +mul z30.d, z30.d,z3.d[0] +add z24.d, z24.d, z28.d +str q24, [x0, #1680] +str q22, [x0, #1936] +ldr q22, [x0, #1184] +ldr q24, [x0, #1440] +sqrdmulh z28.d, z20.d, z2.d[0] +sub z23.d, z21.d, z25.d +mla z30.d, P0/M, z27.d, z31.d +mul z20.d, z20.d,z3.d[0] +add z21.d, z21.d, z25.d +str q21, [x0, #1168] +str q23, [x0, #1424] +ldr q23, [x0, #928] +ldr q21, [x0, #672] +sqrdmulh z25.d, z22.d, z2.d[0] +sub z27.d, 
z23.d, z30.d +mla z20.d, P0/M, z28.d, z31.d +mul z22.d, z22.d,z3.d[0] +add z23.d, z23.d, z30.d +ldr q30, [x0, #160] +ldr q28, [x0, #416] +sqrdmulh z26.d, z24.d, z2.d[0] +sub z29.d, z21.d, z20.d +mla z22.d, P0/M, z25.d, z31.d +mul z24.d, z24.d,z3.d[0] +add z21.d, z21.d, z20.d +sqrdmulh z20.d, z23.d, z0.d[0] +sub z25.d, z30.d, z22.d +mla z24.d, P0/M, z26.d, z31.d +mul z23.d, z23.d,z1.d[0] +add z30.d, z30.d, z22.d +sqrdmulh z22.d, z21.d, z0.d[0] +sub z26.d, z28.d, z24.d +mla z23.d, P0/M, z20.d, z31.d +mul z21.d, z21.d,z1.d[0] +add z28.d, z28.d, z24.d +sqrdmulh z24.d, z27.d, z0.d[1] +sub z20.d, z28.d, z23.d +mla z21.d, P0/M, z22.d, z31.d +mul z27.d, z27.d,z1.d[1] +add z28.d, z28.d, z23.d +sqrdmulh z23.d, z29.d, z0.d[1] +sub z22.d, z30.d, z21.d +mla z27.d, P0/M, z24.d, z31.d +mul z29.d, z29.d,z1.d[1] +add z30.d, z30.d, z21.d +sqrdmulh z21.d, z28.d, z14.d[0] +sub z24.d, z26.d, z27.d +mla z29.d, P0/M, z23.d, z31.d +mul z28.d, z28.d,z15.d[0] +add z26.d, z26.d, z27.d +sqrdmulh z27.d, z20.d, z14.d[1] +sub z23.d, z25.d, z29.d +mla z28.d, P0/M, z21.d, z31.d +mul z20.d, z20.d,z15.d[1] +add z25.d, z25.d, z29.d +sqrdmulh z29.d, z24.d, z12.d[1] +sub z21.d, z30.d, z28.d +mla z20.d, P0/M, z27.d, z31.d +mul z24.d, z24.d,z13.d[1] +add z30.d, z30.d, z28.d +str q30, [x0, #160] +str q21, [x0, #416] +sqrdmulh z21.d, z26.d, z12.d[0] +sub z30.d, z22.d, z20.d +mla z24.d, P0/M, z29.d, z31.d +mul z26.d, z26.d,z13.d[0] +add z22.d, z22.d, z20.d +str q22, [x0, #672] +str q30, [x0, #928] +ldr q30, [x0, #1968] +ldr q22, [x0, #1712] +sqrdmulh z20.d, z30.d, z2.d[0] +sub z29.d, z23.d, z24.d +mla z26.d, P0/M, z21.d, z31.d +mul z30.d, z30.d,z3.d[0] +add z23.d, z23.d, z24.d +str q23, [x0, #1696] +str q29, [x0, #1952] +ldr q29, [x0, #1200] +ldr q23, [x0, #1456] +sqrdmulh z24.d, z22.d, z2.d[0] +sub z21.d, z25.d, z26.d +mla z30.d, P0/M, z20.d, z31.d +mul z22.d, z22.d,z3.d[0] +add z25.d, z25.d, z26.d +str q25, [x0, #1184] +str q21, [x0, #1440] +ldr q21, [x0, #944] +ldr q25, [x0, #688] +sqrdmulh z26.d, z29.d, 
z2.d[0] +sub z20.d, z21.d, z30.d +mla z22.d, P0/M, z24.d, z31.d +mul z29.d, z29.d,z3.d[0] +add z21.d, z21.d, z30.d +ldr q30, [x0, #176] +ldr q24, [x0, #432] +sqrdmulh z28.d, z23.d, z2.d[0] +sub z27.d, z25.d, z22.d +mla z29.d, P0/M, z26.d, z31.d +mul z23.d, z23.d,z3.d[0] +add z25.d, z25.d, z22.d +sqrdmulh z22.d, z21.d, z0.d[0] +sub z26.d, z30.d, z29.d +mla z23.d, P0/M, z28.d, z31.d +mul z21.d, z21.d,z1.d[0] +add z30.d, z30.d, z29.d +sqrdmulh z29.d, z25.d, z0.d[0] +sub z28.d, z24.d, z23.d +mla z21.d, P0/M, z22.d, z31.d +mul z25.d, z25.d,z1.d[0] +add z24.d, z24.d, z23.d +sqrdmulh z23.d, z20.d, z0.d[1] +sub z22.d, z24.d, z21.d +mla z25.d, P0/M, z29.d, z31.d +mul z20.d, z20.d,z1.d[1] +add z24.d, z24.d, z21.d +sqrdmulh z21.d, z27.d, z0.d[1] +sub z29.d, z30.d, z25.d +mla z20.d, P0/M, z23.d, z31.d +mul z27.d, z27.d,z1.d[1] +add z30.d, z30.d, z25.d +sqrdmulh z25.d, z24.d, z14.d[0] +sub z23.d, z28.d, z20.d +mla z27.d, P0/M, z21.d, z31.d +mul z24.d, z24.d,z15.d[0] +add z28.d, z28.d, z20.d +sqrdmulh z20.d, z22.d, z14.d[1] +sub z21.d, z26.d, z27.d +mla z24.d, P0/M, z25.d, z31.d +mul z22.d, z22.d,z15.d[1] +add z26.d, z26.d, z27.d +sqrdmulh z27.d, z23.d, z12.d[1] +sub z25.d, z30.d, z24.d +mla z22.d, P0/M, z20.d, z31.d +mul z23.d, z23.d,z13.d[1] +add z30.d, z30.d, z24.d +str q30, [x0, #176] +str q25, [x0, #432] +sqrdmulh z25.d, z28.d, z12.d[0] +sub z30.d, z29.d, z22.d +mla z23.d, P0/M, z27.d, z31.d +mul z28.d, z28.d,z13.d[0] +add z29.d, z29.d, z22.d +str q29, [x0, #688] +str q30, [x0, #944] +ldr q30, [x0, #1984] +ldr q29, [x0, #1728] +sqrdmulh z22.d, z30.d, z2.d[0] +sub z27.d, z21.d, z23.d +mla z28.d, P0/M, z25.d, z31.d +mul z30.d, z30.d,z3.d[0] +add z21.d, z21.d, z23.d +str q21, [x0, #1712] +str q27, [x0, #1968] +ldr q27, [x0, #1216] +ldr q21, [x0, #1472] +sqrdmulh z23.d, z29.d, z2.d[0] +sub z25.d, z26.d, z28.d +mla z30.d, P0/M, z22.d, z31.d +mul z29.d, z29.d,z3.d[0] +add z26.d, z26.d, z28.d +str q26, [x0, #1200] +str q25, [x0, #1456] +ldr q25, [x0, #960] +ldr q26, [x0, #704] 
+sqrdmulh z28.d, z27.d, z2.d[0] +sub z22.d, z25.d, z30.d +mla z29.d, P0/M, z23.d, z31.d +mul z27.d, z27.d,z3.d[0] +add z25.d, z25.d, z30.d +ldr q30, [x0, #192] +ldr q23, [x0, #448] +sqrdmulh z24.d, z21.d, z2.d[0] +sub z20.d, z26.d, z29.d +mla z27.d, P0/M, z28.d, z31.d +mul z21.d, z21.d,z3.d[0] +add z26.d, z26.d, z29.d +sqrdmulh z29.d, z25.d, z0.d[0] +sub z28.d, z30.d, z27.d +mla z21.d, P0/M, z24.d, z31.d +mul z25.d, z25.d,z1.d[0] +add z30.d, z30.d, z27.d +sqrdmulh z27.d, z26.d, z0.d[0] +sub z24.d, z23.d, z21.d +mla z25.d, P0/M, z29.d, z31.d +mul z26.d, z26.d,z1.d[0] +add z23.d, z23.d, z21.d +sqrdmulh z21.d, z22.d, z0.d[1] +sub z29.d, z23.d, z25.d +mla z26.d, P0/M, z27.d, z31.d +mul z22.d, z22.d,z1.d[1] +add z23.d, z23.d, z25.d +sqrdmulh z25.d, z20.d, z0.d[1] +sub z27.d, z30.d, z26.d +mla z22.d, P0/M, z21.d, z31.d +mul z20.d, z20.d,z1.d[1] +add z30.d, z30.d, z26.d +sqrdmulh z26.d, z23.d, z14.d[0] +sub z21.d, z24.d, z22.d +mla z20.d, P0/M, z25.d, z31.d +mul z23.d, z23.d,z15.d[0] +add z24.d, z24.d, z22.d +sqrdmulh z22.d, z29.d, z14.d[1] +sub z25.d, z28.d, z20.d +mla z23.d, P0/M, z26.d, z31.d +mul z29.d, z29.d,z15.d[1] +add z28.d, z28.d, z20.d +sqrdmulh z20.d, z21.d, z12.d[1] +sub z26.d, z30.d, z23.d +mla z29.d, P0/M, z22.d, z31.d +mul z21.d, z21.d,z13.d[1] +add z30.d, z30.d, z23.d +str q30, [x0, #192] +str q26, [x0, #448] +sqrdmulh z26.d, z24.d, z12.d[0] +sub z30.d, z27.d, z29.d +mla z21.d, P0/M, z20.d, z31.d +mul z24.d, z24.d,z13.d[0] +add z27.d, z27.d, z29.d +str q27, [x0, #704] +str q30, [x0, #960] +ldr q30, [x0, #2000] +ldr q27, [x0, #1744] +sqrdmulh z29.d, z30.d, z2.d[0] +sub z20.d, z25.d, z21.d +mla z24.d, P0/M, z26.d, z31.d +mul z30.d, z30.d,z3.d[0] +add z25.d, z25.d, z21.d +str q25, [x0, #1728] +str q20, [x0, #1984] +ldr q20, [x0, #1232] +ldr q25, [x0, #1488] +sqrdmulh z21.d, z27.d, z2.d[0] +sub z26.d, z28.d, z24.d +mla z30.d, P0/M, z29.d, z31.d +mul z27.d, z27.d,z3.d[0] +add z28.d, z28.d, z24.d +str q28, [x0, #1216] +str q26, [x0, #1472] +ldr q26, [x0, #976] 
+ldr q28, [x0, #720] +sqrdmulh z24.d, z20.d, z2.d[0] +sub z29.d, z26.d, z30.d +mla z27.d, P0/M, z21.d, z31.d +mul z20.d, z20.d,z3.d[0] +add z26.d, z26.d, z30.d +ldr q30, [x0, #208] +ldr q21, [x0, #464] +sqrdmulh z23.d, z25.d, z2.d[0] +sub z22.d, z28.d, z27.d +mla z20.d, P0/M, z24.d, z31.d +mul z25.d, z25.d,z3.d[0] +add z28.d, z28.d, z27.d +sqrdmulh z27.d, z26.d, z0.d[0] +sub z24.d, z30.d, z20.d +mla z25.d, P0/M, z23.d, z31.d +mul z26.d, z26.d,z1.d[0] +add z30.d, z30.d, z20.d +sqrdmulh z20.d, z28.d, z0.d[0] +sub z23.d, z21.d, z25.d +mla z26.d, P0/M, z27.d, z31.d +mul z28.d, z28.d,z1.d[0] +add z21.d, z21.d, z25.d +sqrdmulh z25.d, z29.d, z0.d[1] +sub z27.d, z21.d, z26.d +mla z28.d, P0/M, z20.d, z31.d +mul z29.d, z29.d,z1.d[1] +add z21.d, z21.d, z26.d +sqrdmulh z26.d, z22.d, z0.d[1] +sub z20.d, z30.d, z28.d +mla z29.d, P0/M, z25.d, z31.d +mul z22.d, z22.d,z1.d[1] +add z30.d, z30.d, z28.d +sqrdmulh z28.d, z21.d, z14.d[0] +sub z25.d, z23.d, z29.d +mla z22.d, P0/M, z26.d, z31.d +mul z21.d, z21.d,z15.d[0] +add z23.d, z23.d, z29.d +sqrdmulh z29.d, z27.d, z14.d[1] +sub z26.d, z24.d, z22.d +mla z21.d, P0/M, z28.d, z31.d +mul z27.d, z27.d,z15.d[1] +add z24.d, z24.d, z22.d +sqrdmulh z22.d, z25.d, z12.d[1] +sub z28.d, z30.d, z21.d +mla z27.d, P0/M, z29.d, z31.d +mul z25.d, z25.d,z13.d[1] +add z30.d, z30.d, z21.d +str q30, [x0, #208] +str q28, [x0, #464] +sqrdmulh z28.d, z23.d, z12.d[0] +sub z30.d, z20.d, z27.d +mla z25.d, P0/M, z22.d, z31.d +mul z23.d, z23.d,z13.d[0] +add z20.d, z20.d, z27.d +str q20, [x0, #720] +str q30, [x0, #976] +ldr q30, [x0, #2016] +ldr q20, [x0, #1760] +sqrdmulh z27.d, z30.d, z2.d[0] +sub z22.d, z26.d, z25.d +mla z23.d, P0/M, z28.d, z31.d +mul z30.d, z30.d,z3.d[0] +add z26.d, z26.d, z25.d +str q26, [x0, #1744] +str q22, [x0, #2000] +ldr q22, [x0, #1248] +ldr q26, [x0, #1504] +sqrdmulh z25.d, z20.d, z2.d[0] +sub z28.d, z24.d, z23.d +mla z30.d, P0/M, z27.d, z31.d +mul z20.d, z20.d,z3.d[0] +add z24.d, z24.d, z23.d +str q24, [x0, #1232] +str q28, [x0, #1488] 
+ldr q28, [x0, #992] +ldr q24, [x0, #736] +sqrdmulh z23.d, z22.d, z2.d[0] +sub z27.d, z28.d, z30.d +mla z20.d, P0/M, z25.d, z31.d +mul z22.d, z22.d,z3.d[0] +add z28.d, z28.d, z30.d +ldr q30, [x0, #224] +ldr q25, [x0, #480] +sqrdmulh z21.d, z26.d, z2.d[0] +sub z29.d, z24.d, z20.d +mla z22.d, P0/M, z23.d, z31.d +mul z26.d, z26.d,z3.d[0] +add z24.d, z24.d, z20.d +sqrdmulh z20.d, z28.d, z0.d[0] +sub z23.d, z30.d, z22.d +mla z26.d, P0/M, z21.d, z31.d +mul z28.d, z28.d,z1.d[0] +add z30.d, z30.d, z22.d +sqrdmulh z22.d, z24.d, z0.d[0] +sub z21.d, z25.d, z26.d +mla z28.d, P0/M, z20.d, z31.d +mul z24.d, z24.d,z1.d[0] +add z25.d, z25.d, z26.d +sqrdmulh z26.d, z27.d, z0.d[1] +sub z20.d, z25.d, z28.d +mla z24.d, P0/M, z22.d, z31.d +mul z27.d, z27.d,z1.d[1] +add z25.d, z25.d, z28.d +sqrdmulh z28.d, z29.d, z0.d[1] +sub z22.d, z30.d, z24.d +mla z27.d, P0/M, z26.d, z31.d +mul z29.d, z29.d,z1.d[1] +add z30.d, z30.d, z24.d +sqrdmulh z24.d, z25.d, z14.d[0] +sub z26.d, z21.d, z27.d +mla z29.d, P0/M, z28.d, z31.d +mul z25.d, z25.d,z15.d[0] +add z21.d, z21.d, z27.d +sqrdmulh z27.d, z20.d, z14.d[1] +sub z28.d, z23.d, z29.d +mla z25.d, P0/M, z24.d, z31.d +mul z20.d, z20.d,z15.d[1] +add z23.d, z23.d, z29.d +sqrdmulh z29.d, z26.d, z12.d[1] +sub z24.d, z30.d, z25.d +mla z20.d, P0/M, z27.d, z31.d +mul z26.d, z26.d,z13.d[1] +add z30.d, z30.d, z25.d +str q30, [x0, #224] +str q24, [x0, #480] +sqrdmulh z24.d, z21.d, z12.d[0] +sub z30.d, z22.d, z20.d +mla z26.d, P0/M, z29.d, z31.d +mul z21.d, z21.d,z13.d[0] +add z22.d, z22.d, z20.d +str q22, [x0, #736] +str q30, [x0, #992] +ldr q30, [x0, #2032] +ldr q22, [x0, #1776] +sqrdmulh z20.d, z30.d, z2.d[0] +sub z29.d, z28.d, z26.d +mla z21.d, P0/M, z24.d, z31.d +mul z30.d, z30.d,z3.d[0] +add z28.d, z28.d, z26.d +str q28, [x0, #1760] +str q29, [x0, #2016] +ldr q29, [x0, #1264] +ldr q28, [x0, #1520] +sqrdmulh z26.d, z22.d, z2.d[0] +sub z24.d, z23.d, z21.d +mla z30.d, P0/M, z20.d, z31.d +mul z22.d, z22.d,z3.d[0] +add z23.d, z23.d, z21.d +str q23, [x0, #1248] 
+str q24, [x0, #1504] +ldr q24, [x0, #1008] +ldr q23, [x0, #752] +sqrdmulh z21.d, z29.d, z2.d[0] +sub z20.d, z24.d, z30.d +mla z22.d, P0/M, z26.d, z31.d +mul z29.d, z29.d,z3.d[0] +add z24.d, z24.d, z30.d +ldr q30, [x0, #240] +ldr q26, [x0, #496] +sqrdmulh z25.d, z28.d, z2.d[0] +sub z27.d, z23.d, z22.d +mla z29.d, P0/M, z21.d, z31.d +mul z28.d, z28.d,z3.d[0] +add z23.d, z23.d, z22.d +sqrdmulh z22.d, z24.d, z0.d[0] +sub z21.d, z30.d, z29.d +mla z28.d, P0/M, z25.d, z31.d +mul z24.d, z24.d,z1.d[0] +add z30.d, z30.d, z29.d +sqrdmulh z29.d, z23.d, z0.d[0] +sub z25.d, z26.d, z28.d +mla z24.d, P0/M, z22.d, z31.d +mul z23.d, z23.d,z1.d[0] +add z26.d, z26.d, z28.d +sqrdmulh z28.d, z20.d, z0.d[1] +sub z22.d, z26.d, z24.d +mla z23.d, P0/M, z29.d, z31.d +mul z20.d, z20.d,z1.d[1] +add z26.d, z26.d, z24.d +sqrdmulh z24.d, z27.d, z0.d[1] +sub z29.d, z30.d, z23.d +mla z20.d, P0/M, z28.d, z31.d +mul z27.d, z27.d,z1.d[1] +add z30.d, z30.d, z23.d +sqrdmulh z23.d, z26.d, z14.d[0] +sub z28.d, z25.d, z20.d +mla z27.d, P0/M, z24.d, z31.d +mul z26.d, z26.d,z15.d[0] +add z25.d, z25.d, z20.d +sqrdmulh z20.d, z22.d, z14.d[1] +sub z24.d, z21.d, z27.d +mla z26.d, P0/M, z23.d, z31.d +mul z22.d, z22.d,z15.d[1] +add z21.d, z21.d, z27.d +sqrdmulh z27.d, z28.d, z12.d[1] +sub z23.d, z30.d, z26.d +mla z22.d, P0/M, z20.d, z31.d +mul z28.d, z28.d,z13.d[1] +add z30.d, z30.d, z26.d +str q30, [x0, #240] +str q23, [x0, #496] +sqrdmulh z23.d, z25.d, z12.d[0] +sub z30.d, z29.d, z22.d +mla z28.d, P0/M, z27.d, z31.d +mul z25.d, z25.d,z13.d[0] +add z29.d, z29.d, z22.d +str q29, [x0, #752] +str q30, [x0, #1008] +ldr q30, [x0, #1792] +ldr q29, [x0, #1536] +sqrdmulh z22.d, z30.d, z2.d[0] +sub z27.d, z24.d, z28.d +mla z25.d, P0/M, z23.d, z31.d +mul z30.d, z30.d,z3.d[0] +add z24.d, z24.d, z28.d +str q24, [x0, #1776] +str q27, [x0, #2032] +ldr q27, [x0, #1024] +ldr q24, [x0, #1280] +sqrdmulh z28.d, z29.d, z2.d[0] +sub z23.d, z21.d, z25.d +mla z30.d, P0/M, z22.d, z31.d +mul z29.d, z29.d,z3.d[0] +add z21.d, z21.d, z25.d 
+str q21, [x0, #1264] +str q23, [x0, #1520] +ldr q23, [x0, #768] +ldr q21, [x0, #512] +sqrdmulh z25.d, z27.d, z2.d[0] +sub z22.d, z23.d, z30.d +mla z29.d, P0/M, z28.d, z31.d +mul z27.d, z27.d,z3.d[0] +add z23.d, z23.d, z30.d +ldr q30, [x0, #0] +ldr q28, [x0, #256] +sqrdmulh z26.d, z24.d, z2.d[0] +sub z20.d, z21.d, z29.d +mla z27.d, P0/M, z25.d, z31.d +mul z24.d, z24.d,z3.d[0] +add z21.d, z21.d, z29.d +sqrdmulh z29.d, z23.d, z0.d[0] +sub z25.d, z30.d, z27.d +mla z24.d, P0/M, z26.d, z31.d +mul z23.d, z23.d,z1.d[0] +add z30.d, z30.d, z27.d +sqrdmulh z27.d, z21.d, z0.d[0] +sub z26.d, z28.d, z24.d +mla z23.d, P0/M, z29.d, z31.d +mul z21.d, z21.d,z1.d[0] +add z28.d, z28.d, z24.d +sqrdmulh z24.d, z22.d, z0.d[1] +sub z29.d, z28.d, z23.d +mla z21.d, P0/M, z27.d, z31.d +mul z22.d, z22.d,z1.d[1] +add z28.d, z28.d, z23.d +sqrdmulh z23.d, z20.d, z0.d[1] +sub z27.d, z30.d, z21.d +mla z22.d, P0/M, z24.d, z31.d +mul z20.d, z20.d,z1.d[1] +add z30.d, z30.d, z21.d +sqrdmulh z21.d, z28.d, z14.d[0] +sub z24.d, z26.d, z22.d +mla z20.d, P0/M, z23.d, z31.d +mul z28.d, z28.d,z15.d[0] +add z26.d, z26.d, z22.d +sqrdmulh z22.d, z29.d, z14.d[1] +sub z23.d, z25.d, z20.d +mla z28.d, P0/M, z21.d, z31.d +mul z29.d, z29.d,z15.d[1] +add z25.d, z25.d, z20.d +sqrdmulh z20.d, z24.d, z12.d[1] +sub z21.d, z30.d, z28.d +mla z29.d, P0/M, z22.d, z31.d +mul z24.d, z24.d,z13.d[1] +add z30.d, z30.d, z28.d +str q30, [x0, #0] +str q21, [x0, #256] +sqrdmulh z21.d, z26.d, z12.d[0] +sub z30.d, z27.d, z29.d +mla z24.d, P0/M, z20.d, z31.d +mul z26.d, z26.d,z13.d[0] +add z27.d, z27.d, z29.d +str q27, [x0, #512] +str q30, [x0, #768] +ldr q30, [x0, #1808] +ldr q27, [x0, #1552] +sqrdmulh z29.d, z30.d, z2.d[0] +sub z20.d, z23.d, z24.d +mla z26.d, P0/M, z21.d, z31.d +mul z30.d, z30.d,z3.d[0] +add z23.d, z23.d, z24.d +str q23, [x0, #1536] +str q20, [x0, #1792] +ldr q20, [x0, #1040] +ldr q23, [x0, #1296] +sqrdmulh z24.d, z27.d, z2.d[0] +sub z21.d, z25.d, z26.d +mla z30.d, P0/M, z29.d, z31.d +mul z27.d, z27.d,z3.d[0] +add 
z25.d, z25.d, z26.d +str q25, [x0, #1024] +str q21, [x0, #1280] +ldr q21, [x0, #784] +ldr q25, [x0, #528] +sqrdmulh z26.d, z20.d, z2.d[0] +sub z29.d, z21.d, z30.d +mla z27.d, P0/M, z24.d, z31.d +mul z20.d, z20.d,z3.d[0] +add z21.d, z21.d, z30.d +ldr q30, [x0, #16] +ldr q24, [x0, #272] +sqrdmulh z28.d, z23.d, z2.d[0] +sub z22.d, z25.d, z27.d +mla z20.d, P0/M, z26.d, z31.d +mul z23.d, z23.d,z3.d[0] +add z25.d, z25.d, z27.d +sqrdmulh z27.d, z21.d, z0.d[0] +sub z26.d, z30.d, z20.d +mla z23.d, P0/M, z28.d, z31.d +mul z21.d, z21.d,z1.d[0] +add z30.d, z30.d, z20.d +sqrdmulh z20.d, z25.d, z0.d[0] +sub z28.d, z24.d, z23.d +mla z21.d, P0/M, z27.d, z31.d +mul z25.d, z25.d,z1.d[0] +add z24.d, z24.d, z23.d +sqrdmulh z23.d, z29.d, z0.d[1] +sub z27.d, z24.d, z21.d +mla z25.d, P0/M, z20.d, z31.d +mul z29.d, z29.d,z1.d[1] +add z24.d, z24.d, z21.d +sqrdmulh z21.d, z22.d, z0.d[1] +sub z20.d, z30.d, z25.d +mla z29.d, P0/M, z23.d, z31.d +mul z22.d, z22.d,z1.d[1] +add z30.d, z30.d, z25.d +sqrdmulh z25.d, z24.d, z14.d[0] +sub z23.d, z28.d, z29.d +mla z22.d, P0/M, z21.d, z31.d +mul z24.d, z24.d,z15.d[0] +add z28.d, z28.d, z29.d +sqrdmulh z29.d, z27.d, z14.d[1] +sub z21.d, z26.d, z22.d +mla z24.d, P0/M, z25.d, z31.d +mul z27.d, z27.d,z15.d[1] +add z26.d, z26.d, z22.d +sqrdmulh z22.d, z23.d, z12.d[1] +sub z25.d, z30.d, z24.d +mla z27.d, P0/M, z29.d, z31.d +mul z23.d, z23.d,z13.d[1] +add z30.d, z30.d, z24.d +str q30, [x0, #16] +str q25, [x0, #272] +sqrdmulh z25.d, z28.d, z12.d[0] +sub z30.d, z20.d, z27.d +mla z23.d, P0/M, z22.d, z31.d +mul z28.d, z28.d,z13.d[0] +add z20.d, z20.d, z27.d +str q20, [x0, #528] +str q30, [x0, #784] +ldr q30, [x0, #1824] +ldr q20, [x0, #1568] +sqrdmulh z27.d, z30.d, z2.d[0] +sub z22.d, z21.d, z23.d +mla z28.d, P0/M, z25.d, z31.d +mul z30.d, z30.d,z3.d[0] +add z21.d, z21.d, z23.d +str q21, [x0, #1552] +str q22, [x0, #1808] +ldr q22, [x0, #1056] +ldr q21, [x0, #1312] +sqrdmulh z23.d, z20.d, z2.d[0] +sub z25.d, z26.d, z28.d +mla z30.d, P0/M, z27.d, z31.d +mul z20.d, 
z20.d,z3.d[0] +add z26.d, z26.d, z28.d +str q26, [x0, #1040] +str q25, [x0, #1296] +ldr q25, [x0, #800] +ldr q26, [x0, #544] +sqrdmulh z28.d, z22.d, z2.d[0] +sub z27.d, z25.d, z30.d +mla z20.d, P0/M, z23.d, z31.d +mul z22.d, z22.d,z3.d[0] +add z25.d, z25.d, z30.d +ldr q30, [x0, #32] +ldr q23, [x0, #288] +sqrdmulh z24.d, z21.d, z2.d[0] +sub z29.d, z26.d, z20.d +mla z22.d, P0/M, z28.d, z31.d +mul z21.d, z21.d,z3.d[0] +add z26.d, z26.d, z20.d +sqrdmulh z20.d, z25.d, z0.d[0] +sub z28.d, z30.d, z22.d +mla z21.d, P0/M, z24.d, z31.d +mul z25.d, z25.d,z1.d[0] +add z30.d, z30.d, z22.d +sqrdmulh z22.d, z26.d, z0.d[0] +sub z24.d, z23.d, z21.d +mla z25.d, P0/M, z20.d, z31.d +mul z26.d, z26.d,z1.d[0] +add z23.d, z23.d, z21.d +sqrdmulh z21.d, z27.d, z0.d[1] +sub z20.d, z23.d, z25.d +mla z26.d, P0/M, z22.d, z31.d +mul z27.d, z27.d,z1.d[1] +add z23.d, z23.d, z25.d +sqrdmulh z25.d, z29.d, z0.d[1] +sub z22.d, z30.d, z26.d +mla z27.d, P0/M, z21.d, z31.d +mul z29.d, z29.d,z1.d[1] +add z30.d, z30.d, z26.d +sqrdmulh z26.d, z23.d, z14.d[0] +sub z21.d, z24.d, z27.d +mla z29.d, P0/M, z25.d, z31.d +mul z23.d, z23.d,z15.d[0] +add z24.d, z24.d, z27.d +sqrdmulh z27.d, z20.d, z14.d[1] +sub z25.d, z28.d, z29.d +mla z23.d, P0/M, z26.d, z31.d +mul z20.d, z20.d,z15.d[1] +add z28.d, z28.d, z29.d +sqrdmulh z29.d, z21.d, z12.d[1] +sub z26.d, z30.d, z23.d +mla z20.d, P0/M, z27.d, z31.d +mul z21.d, z21.d,z13.d[1] +add z30.d, z30.d, z23.d +str q30, [x0, #32] +str q26, [x0, #288] +sqrdmulh z26.d, z24.d, z12.d[0] +sub z30.d, z22.d, z20.d +mla z21.d, P0/M, z29.d, z31.d +mul z24.d, z24.d,z13.d[0] +add z22.d, z22.d, z20.d +str q22, [x0, #544] +str q30, [x0, #800] +ldr q30, [x0, #1840] +ldr q22, [x0, #1584] +sqrdmulh z20.d, z30.d, z2.d[0] +sub z29.d, z25.d, z21.d +mla z24.d, P0/M, z26.d, z31.d +mul z30.d, z30.d,z3.d[0] +add z25.d, z25.d, z21.d +str q25, [x0, #1568] +str q29, [x0, #1824] +ldr q29, [x0, #1072] +ldr q25, [x0, #1328] +sqrdmulh z21.d, z22.d, z2.d[0] +sub z26.d, z28.d, z24.d +mla z30.d, P0/M, z20.d, 
z31.d +mul z22.d, z22.d,z3.d[0] +add z28.d, z28.d, z24.d +str q28, [x0, #1056] +str q26, [x0, #1312] +ldr q26, [x0, #816] +ldr q28, [x0, #560] +sqrdmulh z24.d, z29.d, z2.d[0] +sub z20.d, z26.d, z30.d +mla z22.d, P0/M, z21.d, z31.d +mul z29.d, z29.d,z3.d[0] +add z26.d, z26.d, z30.d +ldr q30, [x0, #48] +ldr q21, [x0, #304] +sqrdmulh z23.d, z25.d, z2.d[0] +sub z27.d, z28.d, z22.d +mla z29.d, P0/M, z24.d, z31.d +mul z25.d, z25.d,z3.d[0] +add z28.d, z28.d, z22.d +sqrdmulh z22.d, z26.d, z0.d[0] +sub z24.d, z30.d, z29.d +mla z25.d, P0/M, z23.d, z31.d +mul z26.d, z26.d,z1.d[0] +add z30.d, z30.d, z29.d +sqrdmulh z29.d, z28.d, z0.d[0] +sub z23.d, z21.d, z25.d +mla z26.d, P0/M, z22.d, z31.d +mul z28.d, z28.d,z1.d[0] +add z21.d, z21.d, z25.d +sqrdmulh z25.d, z20.d, z0.d[1] +sub z22.d, z21.d, z26.d +mla z28.d, P0/M, z29.d, z31.d +mul z20.d, z20.d,z1.d[1] +add z21.d, z21.d, z26.d +sqrdmulh z26.d, z27.d, z0.d[1] +sub z29.d, z30.d, z28.d +mla z20.d, P0/M, z25.d, z31.d +mul z27.d, z27.d,z1.d[1] +add z30.d, z30.d, z28.d +sqrdmulh z28.d, z21.d, z14.d[0] +sub z25.d, z23.d, z20.d +mla z27.d, P0/M, z26.d, z31.d +mul z21.d, z21.d,z15.d[0] +add z23.d, z23.d, z20.d +sqrdmulh z20.d, z22.d, z14.d[1] +sub z26.d, z24.d, z27.d +mla z21.d, P0/M, z28.d, z31.d +mul z22.d, z22.d,z15.d[1] +add z24.d, z24.d, z27.d +sqrdmulh z27.d, z25.d, z12.d[1] +sub z28.d, z30.d, z21.d +mla z22.d, P0/M, z20.d, z31.d +mul z25.d, z25.d,z13.d[1] +add z30.d, z30.d, z21.d +str q30, [x0, #48] +str q28, [x0, #304] +sqrdmulh z28.d, z23.d, z12.d[0] +sub z30.d, z29.d, z22.d +mla z25.d, P0/M, z27.d, z31.d +mul z23.d, z23.d,z13.d[0] +add z29.d, z29.d, z22.d +str q29, [x0, #560] +str q30, [x0, #816] +ldr q30, [x0, #1856] +ldr q29, [x0, #1600] +sqrdmulh z22.d, z30.d, z2.d[0] +sub z27.d, z26.d, z25.d +mla z23.d, P0/M, z28.d, z31.d +mul z30.d, z30.d,z3.d[0] +add z26.d, z26.d, z25.d +str q26, [x0, #1584] +str q27, [x0, #1840] +ldr q27, [x0, #1088] +ldr q26, [x0, #1344] +sqrdmulh z25.d, z29.d, z2.d[0] +sub z28.d, z24.d, z23.d +mla 
z30.d, P0/M, z22.d, z31.d +mul z29.d, z29.d,z3.d[0] +add z24.d, z24.d, z23.d +str q24, [x0, #1072] +str q28, [x0, #1328] +ldr q28, [x0, #832] +ldr q24, [x0, #576] +sqrdmulh z23.d, z27.d, z2.d[0] +sub z22.d, z28.d, z30.d +mla z29.d, P0/M, z25.d, z31.d +mul z27.d, z27.d,z3.d[0] +add z28.d, z28.d, z30.d +ldr q30, [x0, #64] +ldr q25, [x0, #320] +sqrdmulh z21.d, z26.d, z2.d[0] +sub z20.d, z24.d, z29.d +mla z27.d, P0/M, z23.d, z31.d +mul z26.d, z26.d,z3.d[0] +add z24.d, z24.d, z29.d +sqrdmulh z29.d, z28.d, z0.d[0] +sub z23.d, z30.d, z27.d +mla z26.d, P0/M, z21.d, z31.d +mul z28.d, z28.d,z1.d[0] +add z30.d, z30.d, z27.d +sqrdmulh z27.d, z24.d, z0.d[0] +sub z21.d, z25.d, z26.d +mla z28.d, P0/M, z29.d, z31.d +mul z24.d, z24.d,z1.d[0] +add z25.d, z25.d, z26.d +sqrdmulh z26.d, z22.d, z0.d[1] +sub z29.d, z25.d, z28.d +mla z24.d, P0/M, z27.d, z31.d +mul z22.d, z22.d,z1.d[1] +add z25.d, z25.d, z28.d +sqrdmulh z28.d, z20.d, z0.d[1] +sub z27.d, z30.d, z24.d +mla z22.d, P0/M, z26.d, z31.d +mul z20.d, z20.d,z1.d[1] +add z30.d, z30.d, z24.d +sqrdmulh z24.d, z25.d, z14.d[0] +sub z26.d, z21.d, z22.d +mla z20.d, P0/M, z28.d, z31.d +mul z25.d, z25.d,z15.d[0] +add z21.d, z21.d, z22.d +sqrdmulh z22.d, z29.d, z14.d[1] +sub z28.d, z23.d, z20.d +mla z25.d, P0/M, z24.d, z31.d +mul z29.d, z29.d,z15.d[1] +add z23.d, z23.d, z20.d +sqrdmulh z20.d, z26.d, z12.d[1] +sub z24.d, z30.d, z25.d +mla z29.d, P0/M, z22.d, z31.d +mul z26.d, z26.d,z13.d[1] +add z30.d, z30.d, z25.d +str q30, [x0, #64] +str q24, [x0, #320] +sqrdmulh z24.d, z21.d, z12.d[0] +sub z30.d, z27.d, z29.d +mla z26.d, P0/M, z20.d, z31.d +mul z21.d, z21.d,z13.d[0] +add z27.d, z27.d, z29.d +str q27, [x0, #576] +str q30, [x0, #832] +ldr q30, [x0, #1872] +ldr q27, [x0, #1616] +sqrdmulh z29.d, z30.d, z2.d[0] +sub z20.d, z28.d, z26.d +mla z21.d, P0/M, z24.d, z31.d +mul z30.d, z30.d,z3.d[0] +add z28.d, z28.d, z26.d +str q28, [x0, #1600] +str q20, [x0, #1856] +ldr q20, [x0, #1104] +ldr q28, [x0, #1360] +sqrdmulh z26.d, z27.d, z2.d[0] +sub z24.d, 
z23.d, z21.d +mla z30.d, P0/M, z29.d, z31.d +mul z27.d, z27.d,z3.d[0] +add z23.d, z23.d, z21.d +str q23, [x0, #1088] +str q24, [x0, #1344] +ldr q24, [x0, #848] +ldr q23, [x0, #592] +sqrdmulh z21.d, z20.d, z2.d[0] +sub z29.d, z24.d, z30.d +mla z27.d, P0/M, z26.d, z31.d +mul z20.d, z20.d,z3.d[0] +add z24.d, z24.d, z30.d +ldr q30, [x0, #80] +ldr q26, [x0, #336] +sqrdmulh z25.d, z28.d, z2.d[0] +sub z22.d, z23.d, z27.d +mla z20.d, P0/M, z21.d, z31.d +mul z28.d, z28.d,z3.d[0] +add z23.d, z23.d, z27.d +sqrdmulh z27.d, z24.d, z0.d[0] +sub z21.d, z30.d, z20.d +mla z28.d, P0/M, z25.d, z31.d +mul z24.d, z24.d,z1.d[0] +add z30.d, z30.d, z20.d +sqrdmulh z20.d, z23.d, z0.d[0] +sub z25.d, z26.d, z28.d +mla z24.d, P0/M, z27.d, z31.d +mul z23.d, z23.d,z1.d[0] +add z26.d, z26.d, z28.d +sqrdmulh z28.d, z29.d, z0.d[1] +sub z27.d, z26.d, z24.d +mla z23.d, P0/M, z20.d, z31.d +mul z29.d, z29.d,z1.d[1] +add z26.d, z26.d, z24.d +sqrdmulh z24.d, z22.d, z0.d[1] +sub z20.d, z30.d, z23.d +mla z29.d, P0/M, z28.d, z31.d +mul z22.d, z22.d,z1.d[1] +add z30.d, z30.d, z23.d +sqrdmulh z23.d, z26.d, z14.d[0] +sub z28.d, z25.d, z29.d +mla z22.d, P0/M, z24.d, z31.d +mul z26.d, z26.d,z15.d[0] +add z25.d, z25.d, z29.d +sqrdmulh z29.d, z27.d, z14.d[1] +sub z24.d, z21.d, z22.d +mla z26.d, P0/M, z23.d, z31.d +mul z27.d, z27.d,z15.d[1] +add z21.d, z21.d, z22.d +sqrdmulh z22.d, z28.d, z12.d[1] +sub z23.d, z30.d, z26.d +mla z27.d, P0/M, z29.d, z31.d +mul z28.d, z28.d,z13.d[1] +add z30.d, z30.d, z26.d +str q30, [x0, #80] +str q23, [x0, #336] +sqrdmulh z23.d, z25.d, z12.d[0] +sub z30.d, z20.d, z27.d +mla z28.d, P0/M, z22.d, z31.d +mul z25.d, z25.d,z13.d[0] +add z20.d, z20.d, z27.d +str q20, [x0, #592] +str q30, [x0, #848] +ldr q30, [x0, #1888] +ldr q20, [x0, #1632] +sqrdmulh z27.d, z30.d, z2.d[0] +sub z22.d, z24.d, z28.d +mla z25.d, P0/M, z23.d, z31.d +mul z30.d, z30.d,z3.d[0] +add z24.d, z24.d, z28.d +str q24, [x0, #1616] +str q22, [x0, #1872] +ldr q22, [x0, #1120] +ldr q24, [x0, #1376] +sqrdmulh z28.d, z20.d, 
z2.d[0] +sub z23.d, z21.d, z25.d +mla z30.d, P0/M, z27.d, z31.d +mul z20.d, z20.d,z3.d[0] +add z21.d, z21.d, z25.d +str q21, [x0, #1104] +str q23, [x0, #1360] +ldr q23, [x0, #864] +ldr q21, [x0, #608] +sqrdmulh z25.d, z22.d, z2.d[0] +sub z27.d, z23.d, z30.d +mla z20.d, P0/M, z28.d, z31.d +mul z22.d, z22.d,z3.d[0] +add z23.d, z23.d, z30.d +ldr q30, [x0, #96] +ldr q28, [x0, #352] +sqrdmulh z26.d, z24.d, z2.d[0] +sub z29.d, z21.d, z20.d +mla z22.d, P0/M, z25.d, z31.d +mul z24.d, z24.d,z3.d[0] +add z21.d, z21.d, z20.d +sqrdmulh z20.d, z23.d, z0.d[0] +sub z25.d, z30.d, z22.d +mla z24.d, P0/M, z26.d, z31.d +mul z23.d, z23.d,z1.d[0] +add z30.d, z30.d, z22.d +sqrdmulh z22.d, z21.d, z0.d[0] +sub z26.d, z28.d, z24.d +mla z23.d, P0/M, z20.d, z31.d +mul z21.d, z21.d,z1.d[0] +add z28.d, z28.d, z24.d +sqrdmulh z24.d, z27.d, z0.d[1] +sub z20.d, z28.d, z23.d +mla z21.d, P0/M, z22.d, z31.d +mul z27.d, z27.d,z1.d[1] +add z28.d, z28.d, z23.d +sqrdmulh z23.d, z29.d, z0.d[1] +sub z22.d, z30.d, z21.d +mla z27.d, P0/M, z24.d, z31.d +mul z29.d, z29.d,z1.d[1] +add z30.d, z30.d, z21.d +sqrdmulh z21.d, z28.d, z14.d[0] +sub z24.d, z26.d, z27.d +mla z29.d, P0/M, z23.d, z31.d +mul z28.d, z28.d,z15.d[0] +add z26.d, z26.d, z27.d +sqrdmulh z27.d, z20.d, z14.d[1] +sub z23.d, z25.d, z29.d +mla z28.d, P0/M, z21.d, z31.d +mul z20.d, z20.d,z15.d[1] +add z25.d, z25.d, z29.d +sqrdmulh z29.d, z24.d, z12.d[1] +sub z21.d, z30.d, z28.d +mla z20.d, P0/M, z27.d, z31.d +mul z24.d, z24.d,z13.d[1] +add z30.d, z30.d, z28.d +str q30, [x0, #96] +str q21, [x0, #352] +sqrdmulh z21.d, z26.d, z12.d[0] +sub z30.d, z22.d, z20.d +mla z24.d, P0/M, z29.d, z31.d +mul z26.d, z26.d,z13.d[0] +add z22.d, z22.d, z20.d +str q22, [x0, #608] +str q30, [x0, #864] +ldr q30, [x0, #1904] +ldr q22, [x0, #1648] +sqrdmulh z20.d, z30.d, z2.d[0] +sub z29.d, z23.d, z24.d +mla z26.d, P0/M, z21.d, z31.d +mul z30.d, z30.d,z3.d[0] +add z23.d, z23.d, z24.d +str q23, [x0, #1632] +str q29, [x0, #1888] +ldr q29, [x0, #1136] +ldr q23, [x0, #1392] 
+sqrdmulh z24.d, z22.d, z2.d[0] +sub z21.d, z25.d, z26.d +mla z30.d, P0/M, z20.d, z31.d +mul z22.d, z22.d,z3.d[0] +add z25.d, z25.d, z26.d +str q25, [x0, #1120] +str q21, [x0, #1376] +ldr q21, [x0, #880] +ldr q25, [x0, #624] +sqrdmulh z26.d, z29.d, z2.d[0] +sub z20.d, z21.d, z30.d +mla z22.d, P0/M, z24.d, z31.d +mul z29.d, z29.d,z3.d[0] +add z21.d, z21.d, z30.d +ldr q30, [x0, #112] +ldr q24, [x0, #368] +sqrdmulh z28.d, z23.d, z2.d[0] +sub z27.d, z25.d, z22.d +mla z29.d, P0/M, z26.d, z31.d +mul z23.d, z23.d,z3.d[0] +add z25.d, z25.d, z22.d +sqrdmulh z22.d, z21.d, z0.d[0] +sub z26.d, z30.d, z29.d +mla z23.d, P0/M, z28.d, z31.d +mul z21.d, z21.d,z1.d[0] +add z30.d, z30.d, z29.d +sqrdmulh z29.d, z25.d, z0.d[0] +sub z28.d, z24.d, z23.d +mla z21.d, P0/M, z22.d, z31.d +mul z25.d, z25.d,z1.d[0] +add z24.d, z24.d, z23.d +sqrdmulh z23.d, z20.d, z0.d[1] +sub z22.d, z24.d, z21.d +mla z25.d, P0/M, z29.d, z31.d +mul z20.d, z20.d,z1.d[1] +add z24.d, z24.d, z21.d +sqrdmulh z21.d, z27.d, z0.d[1] +sub z29.d, z30.d, z25.d +mla z20.d, P0/M, z23.d, z31.d +mul z27.d, z27.d,z1.d[1] +add z30.d, z30.d, z25.d +sqrdmulh z25.d, z24.d, z14.d[0] +sub z23.d, z28.d, z20.d +mla z27.d, P0/M, z21.d, z31.d +mul z24.d, z24.d,z15.d[0] +add z28.d, z28.d, z20.d +sqrdmulh z20.d, z22.d, z14.d[1] +sub z21.d, z26.d, z27.d +mla z24.d, P0/M, z25.d, z31.d +mul z22.d, z22.d,z15.d[1] +add z26.d, z26.d, z27.d +sqrdmulh z27.d, z23.d, z12.d[1] +sub z25.d, z30.d, z24.d +mla z22.d, P0/M, z20.d, z31.d +mul z23.d, z23.d,z13.d[1] +add z30.d, z30.d, z24.d +str q30, [x0, #112] +str q25, [x0, #368] +sqrdmulh z25.d, z28.d, z12.d[0] +sub z30.d, z29.d, z22.d +mla z23.d, P0/M, z27.d, z31.d +mul z28.d, z28.d,z13.d[0] +add z29.d, z29.d, z22.d +str q29, [x0, #624] +str q30, [x0, #880] +sub z30.d, z21.d, z23.d +mla z28.d, P0/M, z25.d, z31.d +add z21.d, z21.d, z23.d +str q21, [x0, #1648] +str q30, [x0, #1904] +sub z30.d, z26.d, z28.d +add z26.d, z26.d, z28.d +str q26, [x0, #1136] +str q30, [x0, #1392] +ldr q4, [x17, #+128] +ldr q5, 
[x17, #+144] +ldr q6, [x17, #+160] +ldr q7, [x17, #+176] +ldr q8, [x17, #+192] +ldr q9, [x17, #+208] +ldr q10, [x17, #+224] +ldr q11, [x17, #+240] +ldr q16, [x0, #240] +ldr q17, [x0, #208] +sqrdmulh z18.d, z16.d, z5.d[0] +mul z16.d, z16.d,z4.d[0] +ldr q19, [x0, #144] +ldr q20, [x0, #176] +sqrdmulh z24.d, z17.d, z5.d[0] +mul z17.d, z17.d,z4.d[0] +mla z16.d, P0/M, z18.d, z31.d +ldr q18, [x0, #112] +ldr q27, [x0, #80] +sqrdmulh z22.d, z19.d, z5.d[0] +mul z19.d, z19.d,z4.d[0] +mla z17.d, P0/M, z24.d, z31.d +ldr q24, [x0, #16] +sub z29.d, z18.d, z16.d +ldr q25, [x0, #48] +add z18.d, z18.d, z16.d +sqrdmulh z16.d, z20.d, z5.d[0] +mul z20.d, z20.d,z4.d[0] +sub z23.d, z27.d, z17.d +mla z19.d, P0/M, z22.d, z31.d +add z27.d, z27.d, z17.d +sqrdmulh z17.d, z18.d, z7.d[0] +mul z18.d, z18.d,z6.d[0] +sub z22.d, z24.d, z19.d +mla z20.d, P0/M, z16.d, z31.d +add z24.d, z24.d, z19.d +sqrdmulh z19.d, z27.d, z7.d[0] +mul z27.d, z27.d,z6.d[0] +sub z16.d, z25.d, z20.d +mla z18.d, P0/M, z17.d, z31.d +add z25.d, z25.d, z20.d +sqrdmulh z20.d, z29.d, z7.d[1] +mul z29.d, z29.d,z6.d[1] +sub z17.d, z25.d, z18.d +mla z27.d, P0/M, z19.d, z31.d +add z25.d, z25.d, z18.d +sqrdmulh z18.d, z23.d, z7.d[1] +mul z23.d, z23.d,z6.d[1] +sub z19.d, z24.d, z27.d +mla z29.d, P0/M, z20.d, z31.d +add z24.d, z24.d, z27.d +sqrdmulh z27.d, z25.d, z9.d[0] +mul z25.d, z25.d,z8.d[0] +sub z20.d, z16.d, z29.d +mla z23.d, P0/M, z18.d, z31.d +add z16.d, z16.d, z29.d +sqrdmulh z29.d, z17.d, z9.d[1] +mul z17.d, z17.d,z8.d[1] +sub z18.d, z22.d, z23.d +mla z25.d, P0/M, z27.d, z31.d +add z22.d, z22.d, z23.d +sqrdmulh z23.d, z20.d, z11.d[1] +mul z20.d, z20.d,z10.d[1] +sub z27.d, z24.d, z25.d +mla z17.d, P0/M, z29.d, z31.d +add z24.d, z24.d, z25.d +sqrdmulh z25.d, z16.d, z11.d[0] +str q24, [x0, #16] +mul z16.d, z16.d,z10.d[0] +str q27, [x0, #48] +mla z20.d, P0/M, z23.d, z31.d +sub z23.d, z19.d, z17.d +ldr q27, [x0, #224] +ldr q24, [x0, #192] +add z19.d, z19.d, z17.d +sqrdmulh z17.d, z27.d, z5.d[0] +str q19, [x0, #80] +mul z27.d, 
z27.d,z4.d[0] +str q23, [x0, #112] +mla z16.d, P0/M, z25.d, z31.d +ldr q25, [x0, #128] +sub z23.d, z18.d, z20.d +ldr q19, [x0, #160] +add z18.d, z18.d, z20.d +sqrdmulh z20.d, z24.d, z5.d[0] +str q18, [x0, #208] +mul z24.d, z24.d,z4.d[0] +str q23, [x0, #240] +mla z27.d, P0/M, z17.d, z31.d +ldr q17, [x0, #96] +sub z23.d, z22.d, z16.d +ldr q18, [x0, #64] +add z22.d, z22.d, z16.d +sqrdmulh z16.d, z25.d, z5.d[0] +str q22, [x0, #144] +mul z25.d, z25.d,z4.d[0] +str q23, [x0, #176] +mla z24.d, P0/M, z20.d, z31.d +ldr q20, [x0, #0] +sub z23.d, z17.d, z27.d +ldr q22, [x0, #32] +add z17.d, z17.d, z27.d +sqrdmulh z27.d, z19.d, z5.d[0] +mul z19.d, z19.d,z4.d[0] +sub z29.d, z18.d, z24.d +mla z25.d, P0/M, z16.d, z31.d +add z18.d, z18.d, z24.d +sqrdmulh z24.d, z17.d, z7.d[0] +mul z17.d, z17.d,z6.d[0] +sub z16.d, z20.d, z25.d +mla z19.d, P0/M, z27.d, z31.d +add z20.d, z20.d, z25.d +sqrdmulh z25.d, z18.d, z7.d[0] +mul z18.d, z18.d,z6.d[0] +sub z27.d, z22.d, z19.d +mla z17.d, P0/M, z24.d, z31.d +add z22.d, z22.d, z19.d +sqrdmulh z19.d, z23.d, z7.d[1] +mul z23.d, z23.d,z6.d[1] +sub z24.d, z22.d, z17.d +mla z18.d, P0/M, z25.d, z31.d +add z22.d, z22.d, z17.d +ldr q3, [x17, #+256] +ldr q2, [x17, #+272] +ldr q1, [x17, #+288] +ldr q0, [x17, #+304] +ldr q15, [x17, #+320] +ldr q14, [x17, #+336] +ldr q13, [x17, #+352] +ldr q12, [x17, #+368] +sqrdmulh z17.d, z29.d, z7.d[1] +mul z29.d, z29.d,z6.d[1] +sub z25.d, z20.d, z18.d +mla z23.d, P0/M, z19.d, z31.d +add z20.d, z20.d, z18.d +sqrdmulh z18.d, z22.d, z9.d[0] +mul z22.d, z22.d,z8.d[0] +sub z19.d, z27.d, z23.d +mla z29.d, P0/M, z17.d, z31.d +add z27.d, z27.d, z23.d +sqrdmulh z23.d, z24.d, z9.d[1] +mul z24.d, z24.d,z8.d[1] +sub z17.d, z16.d, z29.d +mla z22.d, P0/M, z18.d, z31.d +add z16.d, z16.d, z29.d +sqrdmulh z29.d, z19.d, z11.d[1] +mul z19.d, z19.d,z10.d[1] +sub z18.d, z20.d, z22.d +mla z24.d, P0/M, z23.d, z31.d +add z20.d, z20.d, z22.d +sqrdmulh z22.d, z27.d, z11.d[0] +str q20, [x0, #0] +mul z27.d, z27.d,z10.d[0] +str q18, [x0, #32] +mla 
z19.d, P0/M, z29.d, z31.d +sub z29.d, z25.d, z24.d +ldr q18, [x0, #496] +ldr q20, [x0, #464] +add z25.d, z25.d, z24.d +sqrdmulh z24.d, z18.d, z2.d[0] +str q25, [x0, #64] +mul z18.d, z18.d,z3.d[0] +str q29, [x0, #96] +mla z27.d, P0/M, z22.d, z31.d +ldr q22, [x0, #400] +sub z29.d, z17.d, z19.d +ldr q25, [x0, #432] +add z17.d, z17.d, z19.d +sqrdmulh z19.d, z20.d, z2.d[0] +str q17, [x0, #192] +mul z20.d, z20.d,z3.d[0] +str q29, [x0, #224] +mla z18.d, P0/M, z24.d, z31.d +ldr q24, [x0, #368] +sub z29.d, z16.d, z27.d +ldr q17, [x0, #336] +add z16.d, z16.d, z27.d +sqrdmulh z27.d, z22.d, z2.d[0] +str q16, [x0, #128] +mul z22.d, z22.d,z3.d[0] +str q29, [x0, #160] +mla z20.d, P0/M, z19.d, z31.d +ldr q19, [x0, #272] +sub z29.d, z24.d, z18.d +ldr q16, [x0, #304] +add z24.d, z24.d, z18.d +sqrdmulh z18.d, z25.d, z2.d[0] +mul z25.d, z25.d,z3.d[0] +sub z23.d, z17.d, z20.d +mla z22.d, P0/M, z27.d, z31.d +add z17.d, z17.d, z20.d +sqrdmulh z20.d, z24.d, z0.d[0] +mul z24.d, z24.d,z1.d[0] +sub z27.d, z19.d, z22.d +mla z25.d, P0/M, z18.d, z31.d +add z19.d, z19.d, z22.d +sqrdmulh z22.d, z17.d, z0.d[0] +mul z17.d, z17.d,z1.d[0] +sub z18.d, z16.d, z25.d +mla z24.d, P0/M, z20.d, z31.d +add z16.d, z16.d, z25.d +sqrdmulh z25.d, z29.d, z0.d[1] +mul z29.d, z29.d,z1.d[1] +sub z20.d, z16.d, z24.d +mla z17.d, P0/M, z22.d, z31.d +add z16.d, z16.d, z24.d +sqrdmulh z24.d, z23.d, z0.d[1] +mul z23.d, z23.d,z1.d[1] +sub z22.d, z19.d, z17.d +mla z29.d, P0/M, z25.d, z31.d +add z19.d, z19.d, z17.d +sqrdmulh z17.d, z16.d, z14.d[0] +mul z16.d, z16.d,z15.d[0] +sub z25.d, z18.d, z29.d +mla z23.d, P0/M, z24.d, z31.d +add z18.d, z18.d, z29.d +sqrdmulh z29.d, z20.d, z14.d[1] +mul z20.d, z20.d,z15.d[1] +sub z24.d, z27.d, z23.d +mla z16.d, P0/M, z17.d, z31.d +add z27.d, z27.d, z23.d +sqrdmulh z23.d, z25.d, z12.d[1] +mul z25.d, z25.d,z13.d[1] +sub z17.d, z19.d, z16.d +mla z20.d, P0/M, z29.d, z31.d +add z19.d, z19.d, z16.d +sqrdmulh z16.d, z18.d, z12.d[0] +str q19, [x0, #272] +mul z18.d, z18.d,z13.d[0] +str q17, [x0, 
#304] +mla z25.d, P0/M, z23.d, z31.d +sub z23.d, z22.d, z20.d +ldr q17, [x0, #480] +ldr q19, [x0, #448] +add z22.d, z22.d, z20.d +sqrdmulh z20.d, z17.d, z2.d[0] +str q22, [x0, #336] +mul z17.d, z17.d,z3.d[0] +str q23, [x0, #368] +mla z18.d, P0/M, z16.d, z31.d +ldr q16, [x0, #384] +sub z23.d, z24.d, z25.d +ldr q22, [x0, #416] +add z24.d, z24.d, z25.d +sqrdmulh z25.d, z19.d, z2.d[0] +str q24, [x0, #464] +mul z19.d, z19.d,z3.d[0] +str q23, [x0, #496] +mla z17.d, P0/M, z20.d, z31.d +ldr q20, [x0, #352] +sub z23.d, z27.d, z18.d +ldr q24, [x0, #320] +add z27.d, z27.d, z18.d +sqrdmulh z18.d, z16.d, z2.d[0] +str q27, [x0, #400] +mul z16.d, z16.d,z3.d[0] +str q23, [x0, #432] +mla z19.d, P0/M, z25.d, z31.d +ldr q25, [x0, #256] +sub z23.d, z20.d, z17.d +ldr q27, [x0, #288] +add z20.d, z20.d, z17.d +sqrdmulh z17.d, z22.d, z2.d[0] +mul z22.d, z22.d,z3.d[0] +sub z29.d, z24.d, z19.d +mla z16.d, P0/M, z18.d, z31.d +add z24.d, z24.d, z19.d +sqrdmulh z19.d, z20.d, z0.d[0] +mul z20.d, z20.d,z1.d[0] +sub z18.d, z25.d, z16.d +mla z22.d, P0/M, z17.d, z31.d +add z25.d, z25.d, z16.d +sqrdmulh z16.d, z24.d, z0.d[0] +mul z24.d, z24.d,z1.d[0] +sub z17.d, z27.d, z22.d +mla z20.d, P0/M, z19.d, z31.d +add z27.d, z27.d, z22.d +sqrdmulh z22.d, z23.d, z0.d[1] +mul z23.d, z23.d,z1.d[1] +sub z19.d, z27.d, z20.d +mla z24.d, P0/M, z16.d, z31.d +add z27.d, z27.d, z20.d +ldr q11, [x17, #+384] +ldr q10, [x17, #+400] +ldr q9, [x17, #+416] +ldr q8, [x17, #+432] +ldr q7, [x17, #+448] +ldr q6, [x17, #+464] +ldr q5, [x17, #+480] +ldr q4, [x17, #+496] +sqrdmulh z20.d, z29.d, z0.d[1] +mul z29.d, z29.d,z1.d[1] +sub z16.d, z25.d, z24.d +mla z23.d, P0/M, z22.d, z31.d +add z25.d, z25.d, z24.d +sqrdmulh z24.d, z27.d, z14.d[0] +mul z27.d, z27.d,z15.d[0] +sub z22.d, z17.d, z23.d +mla z29.d, P0/M, z20.d, z31.d +add z17.d, z17.d, z23.d +sqrdmulh z23.d, z19.d, z14.d[1] +mul z19.d, z19.d,z15.d[1] +sub z20.d, z18.d, z29.d +mla z27.d, P0/M, z24.d, z31.d +add z18.d, z18.d, z29.d +sqrdmulh z29.d, z22.d, z12.d[1] +mul z22.d, 
z22.d,z13.d[1] +sub z24.d, z25.d, z27.d +mla z19.d, P0/M, z23.d, z31.d +add z25.d, z25.d, z27.d +sqrdmulh z27.d, z17.d, z12.d[0] +str q25, [x0, #256] +mul z17.d, z17.d,z13.d[0] +str q24, [x0, #288] +mla z22.d, P0/M, z29.d, z31.d +sub z29.d, z16.d, z19.d +ldr q24, [x0, #752] +ldr q25, [x0, #720] +add z16.d, z16.d, z19.d +sqrdmulh z19.d, z24.d, z10.d[0] +str q16, [x0, #320] +mul z24.d, z24.d,z11.d[0] +str q29, [x0, #352] +mla z17.d, P0/M, z27.d, z31.d +ldr q27, [x0, #656] +sub z29.d, z20.d, z22.d +ldr q16, [x0, #688] +add z20.d, z20.d, z22.d +sqrdmulh z22.d, z25.d, z10.d[0] +str q20, [x0, #448] +mul z25.d, z25.d,z11.d[0] +str q29, [x0, #480] +mla z24.d, P0/M, z19.d, z31.d +ldr q19, [x0, #624] +sub z29.d, z18.d, z17.d +ldr q20, [x0, #592] +add z18.d, z18.d, z17.d +sqrdmulh z17.d, z27.d, z10.d[0] +str q18, [x0, #384] +mul z27.d, z27.d,z11.d[0] +str q29, [x0, #416] +mla z25.d, P0/M, z22.d, z31.d +ldr q22, [x0, #528] +sub z29.d, z19.d, z24.d +ldr q18, [x0, #560] +add z19.d, z19.d, z24.d +sqrdmulh z24.d, z16.d, z10.d[0] +mul z16.d, z16.d,z11.d[0] +sub z23.d, z20.d, z25.d +mla z27.d, P0/M, z17.d, z31.d +add z20.d, z20.d, z25.d +sqrdmulh z25.d, z19.d, z8.d[0] +mul z19.d, z19.d,z9.d[0] +sub z17.d, z22.d, z27.d +mla z16.d, P0/M, z24.d, z31.d +add z22.d, z22.d, z27.d +sqrdmulh z27.d, z20.d, z8.d[0] +mul z20.d, z20.d,z9.d[0] +sub z24.d, z18.d, z16.d +mla z19.d, P0/M, z25.d, z31.d +add z18.d, z18.d, z16.d +sqrdmulh z16.d, z29.d, z8.d[1] +mul z29.d, z29.d,z9.d[1] +sub z25.d, z18.d, z19.d +mla z20.d, P0/M, z27.d, z31.d +add z18.d, z18.d, z19.d +sqrdmulh z19.d, z23.d, z8.d[1] +mul z23.d, z23.d,z9.d[1] +sub z27.d, z22.d, z20.d +mla z29.d, P0/M, z16.d, z31.d +add z22.d, z22.d, z20.d +sqrdmulh z20.d, z18.d, z6.d[0] +mul z18.d, z18.d,z7.d[0] +sub z16.d, z24.d, z29.d +mla z23.d, P0/M, z19.d, z31.d +add z24.d, z24.d, z29.d +sqrdmulh z29.d, z25.d, z6.d[1] +mul z25.d, z25.d,z7.d[1] +sub z19.d, z17.d, z23.d +mla z18.d, P0/M, z20.d, z31.d +add z17.d, z17.d, z23.d +sqrdmulh z23.d, z16.d, 
z4.d[1] +mul z16.d, z16.d,z5.d[1] +sub z20.d, z22.d, z18.d +mla z25.d, P0/M, z29.d, z31.d +add z22.d, z22.d, z18.d +sqrdmulh z18.d, z24.d, z4.d[0] +str q22, [x0, #528] +mul z24.d, z24.d,z5.d[0] +str q20, [x0, #560] +mla z16.d, P0/M, z23.d, z31.d +sub z23.d, z27.d, z25.d +ldr q20, [x0, #736] +ldr q22, [x0, #704] +add z27.d, z27.d, z25.d +sqrdmulh z25.d, z20.d, z10.d[0] +str q27, [x0, #592] +mul z20.d, z20.d,z11.d[0] +str q23, [x0, #624] +mla z24.d, P0/M, z18.d, z31.d +ldr q18, [x0, #640] +sub z23.d, z19.d, z16.d +ldr q27, [x0, #672] +add z19.d, z19.d, z16.d +sqrdmulh z16.d, z22.d, z10.d[0] +str q19, [x0, #720] +mul z22.d, z22.d,z11.d[0] +str q23, [x0, #752] +mla z20.d, P0/M, z25.d, z31.d +ldr q25, [x0, #608] +sub z23.d, z17.d, z24.d +ldr q19, [x0, #576] +add z17.d, z17.d, z24.d +sqrdmulh z24.d, z18.d, z10.d[0] +str q17, [x0, #656] +mul z18.d, z18.d,z11.d[0] +str q23, [x0, #688] +mla z22.d, P0/M, z16.d, z31.d +ldr q16, [x0, #512] +sub z23.d, z25.d, z20.d +ldr q17, [x0, #544] +add z25.d, z25.d, z20.d +sqrdmulh z20.d, z27.d, z10.d[0] +mul z27.d, z27.d,z11.d[0] +sub z29.d, z19.d, z22.d +mla z18.d, P0/M, z24.d, z31.d +add z19.d, z19.d, z22.d +sqrdmulh z22.d, z25.d, z8.d[0] +mul z25.d, z25.d,z9.d[0] +sub z24.d, z16.d, z18.d +mla z27.d, P0/M, z20.d, z31.d +add z16.d, z16.d, z18.d +sqrdmulh z18.d, z19.d, z8.d[0] +mul z19.d, z19.d,z9.d[0] +sub z20.d, z17.d, z27.d +mla z25.d, P0/M, z22.d, z31.d +add z17.d, z17.d, z27.d +sqrdmulh z27.d, z23.d, z8.d[1] +mul z23.d, z23.d,z9.d[1] +sub z22.d, z17.d, z25.d +mla z19.d, P0/M, z18.d, z31.d +add z17.d, z17.d, z25.d +ldr q12, [x17, #+512] +ldr q13, [x17, #+528] +ldr q14, [x17, #+544] +ldr q15, [x17, #+560] +ldr q0, [x17, #+576] +ldr q1, [x17, #+592] +ldr q2, [x17, #+608] +ldr q3, [x17, #+624] +sqrdmulh z25.d, z29.d, z8.d[1] +mul z29.d, z29.d,z9.d[1] +sub z18.d, z16.d, z19.d +mla z23.d, P0/M, z27.d, z31.d +add z16.d, z16.d, z19.d +sqrdmulh z19.d, z17.d, z6.d[0] +mul z17.d, z17.d,z7.d[0] +sub z27.d, z20.d, z23.d +mla z29.d, P0/M, z25.d, 
z31.d +add z20.d, z20.d, z23.d +sqrdmulh z23.d, z22.d, z6.d[1] +mul z22.d, z22.d,z7.d[1] +sub z25.d, z24.d, z29.d +mla z17.d, P0/M, z19.d, z31.d +add z24.d, z24.d, z29.d +sqrdmulh z29.d, z27.d, z4.d[1] +mul z27.d, z27.d,z5.d[1] +sub z19.d, z16.d, z17.d +mla z22.d, P0/M, z23.d, z31.d +add z16.d, z16.d, z17.d +sqrdmulh z17.d, z20.d, z4.d[0] +str q16, [x0, #512] +mul z20.d, z20.d,z5.d[0] +str q19, [x0, #544] +mla z27.d, P0/M, z29.d, z31.d +sub z29.d, z18.d, z22.d +ldr q19, [x0, #1008] +ldr q16, [x0, #976] +add z18.d, z18.d, z22.d +sqrdmulh z22.d, z19.d, z13.d[0] +str q18, [x0, #576] +mul z19.d, z19.d,z12.d[0] +str q29, [x0, #608] +mla z20.d, P0/M, z17.d, z31.d +ldr q17, [x0, #912] +sub z29.d, z25.d, z27.d +ldr q18, [x0, #944] +add z25.d, z25.d, z27.d +sqrdmulh z27.d, z16.d, z13.d[0] +str q25, [x0, #704] +mul z16.d, z16.d,z12.d[0] +str q29, [x0, #736] +mla z19.d, P0/M, z22.d, z31.d +ldr q22, [x0, #880] +sub z29.d, z24.d, z20.d +ldr q25, [x0, #848] +add z24.d, z24.d, z20.d +sqrdmulh z20.d, z17.d, z13.d[0] +str q24, [x0, #640] +mul z17.d, z17.d,z12.d[0] +str q29, [x0, #672] +mla z16.d, P0/M, z27.d, z31.d +ldr q27, [x0, #784] +sub z29.d, z22.d, z19.d +ldr q24, [x0, #816] +add z22.d, z22.d, z19.d +sqrdmulh z19.d, z18.d, z13.d[0] +mul z18.d, z18.d,z12.d[0] +sub z23.d, z25.d, z16.d +mla z17.d, P0/M, z20.d, z31.d +add z25.d, z25.d, z16.d +sqrdmulh z16.d, z22.d, z15.d[0] +mul z22.d, z22.d,z14.d[0] +sub z20.d, z27.d, z17.d +mla z18.d, P0/M, z19.d, z31.d +add z27.d, z27.d, z17.d +sqrdmulh z17.d, z25.d, z15.d[0] +mul z25.d, z25.d,z14.d[0] +sub z19.d, z24.d, z18.d +mla z22.d, P0/M, z16.d, z31.d +add z24.d, z24.d, z18.d +sqrdmulh z18.d, z29.d, z15.d[1] +mul z29.d, z29.d,z14.d[1] +sub z16.d, z24.d, z22.d +mla z25.d, P0/M, z17.d, z31.d +add z24.d, z24.d, z22.d +sqrdmulh z22.d, z23.d, z15.d[1] +mul z23.d, z23.d,z14.d[1] +sub z17.d, z27.d, z25.d +mla z29.d, P0/M, z18.d, z31.d +add z27.d, z27.d, z25.d +sqrdmulh z25.d, z24.d, z1.d[0] +mul z24.d, z24.d,z0.d[0] +sub z18.d, z19.d, z29.d 
+mla z23.d, P0/M, z22.d, z31.d +add z19.d, z19.d, z29.d +sqrdmulh z29.d, z16.d, z1.d[1] +mul z16.d, z16.d,z0.d[1] +sub z22.d, z20.d, z23.d +mla z24.d, P0/M, z25.d, z31.d +add z20.d, z20.d, z23.d +sqrdmulh z23.d, z18.d, z3.d[1] +mul z18.d, z18.d,z2.d[1] +sub z25.d, z27.d, z24.d +mla z16.d, P0/M, z29.d, z31.d +add z27.d, z27.d, z24.d +sqrdmulh z24.d, z19.d, z3.d[0] +str q27, [x0, #784] +mul z19.d, z19.d,z2.d[0] +str q25, [x0, #816] +mla z18.d, P0/M, z23.d, z31.d +sub z23.d, z17.d, z16.d +ldr q25, [x0, #992] +ldr q27, [x0, #960] +add z17.d, z17.d, z16.d +sqrdmulh z16.d, z25.d, z13.d[0] +str q17, [x0, #848] +mul z25.d, z25.d,z12.d[0] +str q23, [x0, #880] +mla z19.d, P0/M, z24.d, z31.d +ldr q24, [x0, #896] +sub z23.d, z22.d, z18.d +ldr q17, [x0, #928] +add z22.d, z22.d, z18.d +sqrdmulh z18.d, z27.d, z13.d[0] +str q22, [x0, #976] +mul z27.d, z27.d,z12.d[0] +str q23, [x0, #1008] +mla z25.d, P0/M, z16.d, z31.d +ldr q16, [x0, #864] +sub z23.d, z20.d, z19.d +ldr q22, [x0, #832] +add z20.d, z20.d, z19.d +sqrdmulh z19.d, z24.d, z13.d[0] +str q20, [x0, #912] +mul z24.d, z24.d,z12.d[0] +str q23, [x0, #944] +mla z27.d, P0/M, z18.d, z31.d +ldr q18, [x0, #768] +sub z23.d, z16.d, z25.d +ldr q20, [x0, #800] +add z16.d, z16.d, z25.d +sqrdmulh z25.d, z17.d, z13.d[0] +mul z17.d, z17.d,z12.d[0] +sub z29.d, z22.d, z27.d +mla z24.d, P0/M, z19.d, z31.d +add z22.d, z22.d, z27.d +sqrdmulh z27.d, z16.d, z15.d[0] +mul z16.d, z16.d,z14.d[0] +sub z19.d, z18.d, z24.d +mla z17.d, P0/M, z25.d, z31.d +add z18.d, z18.d, z24.d +sqrdmulh z24.d, z22.d, z15.d[0] +mul z22.d, z22.d,z14.d[0] +sub z25.d, z20.d, z17.d +mla z16.d, P0/M, z27.d, z31.d +add z20.d, z20.d, z17.d +sqrdmulh z17.d, z23.d, z15.d[1] +mul z23.d, z23.d,z14.d[1] +sub z27.d, z20.d, z16.d +mla z22.d, P0/M, z24.d, z31.d +add z20.d, z20.d, z16.d +ldr q4, [x17, #+640] +ldr q5, [x17, #+656] +ldr q6, [x17, #+672] +ldr q7, [x17, #+688] +ldr q8, [x17, #+704] +ldr q9, [x17, #+720] +ldr q10, [x17, #+736] +ldr q11, [x17, #+752] +sqrdmulh z16.d, z29.d, 
z15.d[1] +mul z29.d, z29.d,z14.d[1] +sub z24.d, z18.d, z22.d +mla z23.d, P0/M, z17.d, z31.d +add z18.d, z18.d, z22.d +sqrdmulh z22.d, z20.d, z1.d[0] +mul z20.d, z20.d,z0.d[0] +sub z17.d, z25.d, z23.d +mla z29.d, P0/M, z16.d, z31.d +add z25.d, z25.d, z23.d +sqrdmulh z23.d, z27.d, z1.d[1] +mul z27.d, z27.d,z0.d[1] +sub z16.d, z19.d, z29.d +mla z20.d, P0/M, z22.d, z31.d +add z19.d, z19.d, z29.d +sqrdmulh z29.d, z17.d, z3.d[1] +mul z17.d, z17.d,z2.d[1] +sub z22.d, z18.d, z20.d +mla z27.d, P0/M, z23.d, z31.d +add z18.d, z18.d, z20.d +sqrdmulh z20.d, z25.d, z3.d[0] +str q18, [x0, #768] +mul z25.d, z25.d,z2.d[0] +str q22, [x0, #800] +mla z17.d, P0/M, z29.d, z31.d +sub z29.d, z24.d, z27.d +ldr q22, [x0, #1264] +ldr q18, [x0, #1232] +add z24.d, z24.d, z27.d +sqrdmulh z27.d, z22.d, z5.d[0] +str q24, [x0, #832] +mul z22.d, z22.d,z4.d[0] +str q29, [x0, #864] +mla z25.d, P0/M, z20.d, z31.d +ldr q20, [x0, #1168] +sub z29.d, z16.d, z17.d +ldr q24, [x0, #1200] +add z16.d, z16.d, z17.d +sqrdmulh z17.d, z18.d, z5.d[0] +str q16, [x0, #960] +mul z18.d, z18.d,z4.d[0] +str q29, [x0, #992] +mla z22.d, P0/M, z27.d, z31.d +ldr q27, [x0, #1136] +sub z29.d, z19.d, z25.d +ldr q16, [x0, #1104] +add z19.d, z19.d, z25.d +sqrdmulh z25.d, z20.d, z5.d[0] +str q19, [x0, #896] +mul z20.d, z20.d,z4.d[0] +str q29, [x0, #928] +mla z18.d, P0/M, z17.d, z31.d +ldr q17, [x0, #1040] +sub z29.d, z27.d, z22.d +ldr q19, [x0, #1072] +add z27.d, z27.d, z22.d +sqrdmulh z22.d, z24.d, z5.d[0] +mul z24.d, z24.d,z4.d[0] +sub z23.d, z16.d, z18.d +mla z20.d, P0/M, z25.d, z31.d +add z16.d, z16.d, z18.d +sqrdmulh z18.d, z27.d, z7.d[0] +mul z27.d, z27.d,z6.d[0] +sub z25.d, z17.d, z20.d +mla z24.d, P0/M, z22.d, z31.d +add z17.d, z17.d, z20.d +sqrdmulh z20.d, z16.d, z7.d[0] +mul z16.d, z16.d,z6.d[0] +sub z22.d, z19.d, z24.d +mla z27.d, P0/M, z18.d, z31.d +add z19.d, z19.d, z24.d +sqrdmulh z24.d, z29.d, z7.d[1] +mul z29.d, z29.d,z6.d[1] +sub z18.d, z19.d, z27.d +mla z16.d, P0/M, z20.d, z31.d +add z19.d, z19.d, z27.d +sqrdmulh 
z27.d, z23.d, z7.d[1] +mul z23.d, z23.d,z6.d[1] +sub z20.d, z17.d, z16.d +mla z29.d, P0/M, z24.d, z31.d +add z17.d, z17.d, z16.d +sqrdmulh z16.d, z19.d, z9.d[0] +mul z19.d, z19.d,z8.d[0] +sub z24.d, z22.d, z29.d +mla z23.d, P0/M, z27.d, z31.d +add z22.d, z22.d, z29.d +sqrdmulh z29.d, z18.d, z9.d[1] +mul z18.d, z18.d,z8.d[1] +sub z27.d, z25.d, z23.d +mla z19.d, P0/M, z16.d, z31.d +add z25.d, z25.d, z23.d +sqrdmulh z23.d, z24.d, z11.d[1] +mul z24.d, z24.d,z10.d[1] +sub z16.d, z17.d, z19.d +mla z18.d, P0/M, z29.d, z31.d +add z17.d, z17.d, z19.d +sqrdmulh z19.d, z22.d, z11.d[0] +str q17, [x0, #1040] +mul z22.d, z22.d,z10.d[0] +str q16, [x0, #1072] +mla z24.d, P0/M, z23.d, z31.d +sub z23.d, z20.d, z18.d +ldr q16, [x0, #1248] +ldr q17, [x0, #1216] +add z20.d, z20.d, z18.d +sqrdmulh z18.d, z16.d, z5.d[0] +str q20, [x0, #1104] +mul z16.d, z16.d,z4.d[0] +str q23, [x0, #1136] +mla z22.d, P0/M, z19.d, z31.d +ldr q19, [x0, #1152] +sub z23.d, z27.d, z24.d +ldr q20, [x0, #1184] +add z27.d, z27.d, z24.d +sqrdmulh z24.d, z17.d, z5.d[0] +str q27, [x0, #1232] +mul z17.d, z17.d,z4.d[0] +str q23, [x0, #1264] +mla z16.d, P0/M, z18.d, z31.d +ldr q18, [x0, #1120] +sub z23.d, z25.d, z22.d +ldr q27, [x0, #1088] +add z25.d, z25.d, z22.d +sqrdmulh z22.d, z19.d, z5.d[0] +str q25, [x0, #1168] +mul z19.d, z19.d,z4.d[0] +str q23, [x0, #1200] +mla z17.d, P0/M, z24.d, z31.d +ldr q24, [x0, #1024] +sub z23.d, z18.d, z16.d +ldr q25, [x0, #1056] +add z18.d, z18.d, z16.d +sqrdmulh z16.d, z20.d, z5.d[0] +mul z20.d, z20.d,z4.d[0] +sub z29.d, z27.d, z17.d +mla z19.d, P0/M, z22.d, z31.d +add z27.d, z27.d, z17.d +sqrdmulh z17.d, z18.d, z7.d[0] +mul z18.d, z18.d,z6.d[0] +sub z22.d, z24.d, z19.d +mla z20.d, P0/M, z16.d, z31.d +add z24.d, z24.d, z19.d +sqrdmulh z19.d, z27.d, z7.d[0] +mul z27.d, z27.d,z6.d[0] +sub z16.d, z25.d, z20.d +mla z18.d, P0/M, z17.d, z31.d +add z25.d, z25.d, z20.d +sqrdmulh z20.d, z23.d, z7.d[1] +mul z23.d, z23.d,z6.d[1] +sub z17.d, z25.d, z18.d +mla z27.d, P0/M, z19.d, z31.d +add 
z25.d, z25.d, z18.d +ldr q3, [x17, #+768] +ldr q2, [x17, #+784] +ldr q1, [x17, #+800] +ldr q0, [x17, #+816] +ldr q15, [x17, #+832] +ldr q14, [x17, #+848] +ldr q13, [x17, #+864] +ldr q12, [x17, #+880] +sqrdmulh z18.d, z29.d, z7.d[1] +mul z29.d, z29.d,z6.d[1] +sub z19.d, z24.d, z27.d +mla z23.d, P0/M, z20.d, z31.d +add z24.d, z24.d, z27.d +sqrdmulh z27.d, z25.d, z9.d[0] +mul z25.d, z25.d,z8.d[0] +sub z20.d, z16.d, z23.d +mla z29.d, P0/M, z18.d, z31.d +add z16.d, z16.d, z23.d +sqrdmulh z23.d, z17.d, z9.d[1] +mul z17.d, z17.d,z8.d[1] +sub z18.d, z22.d, z29.d +mla z25.d, P0/M, z27.d, z31.d +add z22.d, z22.d, z29.d +sqrdmulh z29.d, z20.d, z11.d[1] +mul z20.d, z20.d,z10.d[1] +sub z27.d, z24.d, z25.d +mla z17.d, P0/M, z23.d, z31.d +add z24.d, z24.d, z25.d +sqrdmulh z25.d, z16.d, z11.d[0] +str q24, [x0, #1024] +mul z16.d, z16.d,z10.d[0] +str q27, [x0, #1056] +mla z20.d, P0/M, z29.d, z31.d +sub z29.d, z19.d, z17.d +ldr q27, [x0, #1520] +ldr q24, [x0, #1488] +add z19.d, z19.d, z17.d +sqrdmulh z17.d, z27.d, z2.d[0] +str q19, [x0, #1088] +mul z27.d, z27.d,z3.d[0] +str q29, [x0, #1120] +mla z16.d, P0/M, z25.d, z31.d +ldr q25, [x0, #1424] +sub z29.d, z18.d, z20.d +ldr q19, [x0, #1456] +add z18.d, z18.d, z20.d +sqrdmulh z20.d, z24.d, z2.d[0] +str q18, [x0, #1216] +mul z24.d, z24.d,z3.d[0] +str q29, [x0, #1248] +mla z27.d, P0/M, z17.d, z31.d +ldr q17, [x0, #1392] +sub z29.d, z22.d, z16.d +ldr q18, [x0, #1360] +add z22.d, z22.d, z16.d +sqrdmulh z16.d, z25.d, z2.d[0] +str q22, [x0, #1152] +mul z25.d, z25.d,z3.d[0] +str q29, [x0, #1184] +mla z24.d, P0/M, z20.d, z31.d +ldr q20, [x0, #1296] +sub z29.d, z17.d, z27.d +ldr q22, [x0, #1328] +add z17.d, z17.d, z27.d +sqrdmulh z27.d, z19.d, z2.d[0] +mul z19.d, z19.d,z3.d[0] +sub z23.d, z18.d, z24.d +mla z25.d, P0/M, z16.d, z31.d +add z18.d, z18.d, z24.d +sqrdmulh z24.d, z17.d, z0.d[0] +mul z17.d, z17.d,z1.d[0] +sub z16.d, z20.d, z25.d +mla z19.d, P0/M, z27.d, z31.d +add z20.d, z20.d, z25.d +sqrdmulh z25.d, z18.d, z0.d[0] +mul z18.d, 
z18.d,z1.d[0] +sub z27.d, z22.d, z19.d +mla z17.d, P0/M, z24.d, z31.d +add z22.d, z22.d, z19.d +sqrdmulh z19.d, z29.d, z0.d[1] +mul z29.d, z29.d,z1.d[1] +sub z24.d, z22.d, z17.d +mla z18.d, P0/M, z25.d, z31.d +add z22.d, z22.d, z17.d +sqrdmulh z17.d, z23.d, z0.d[1] +mul z23.d, z23.d,z1.d[1] +sub z25.d, z20.d, z18.d +mla z29.d, P0/M, z19.d, z31.d +add z20.d, z20.d, z18.d +sqrdmulh z18.d, z22.d, z14.d[0] +mul z22.d, z22.d,z15.d[0] +sub z19.d, z27.d, z29.d +mla z23.d, P0/M, z17.d, z31.d +add z27.d, z27.d, z29.d +sqrdmulh z29.d, z24.d, z14.d[1] +mul z24.d, z24.d,z15.d[1] +sub z17.d, z16.d, z23.d +mla z22.d, P0/M, z18.d, z31.d +add z16.d, z16.d, z23.d +sqrdmulh z23.d, z19.d, z12.d[1] +mul z19.d, z19.d,z13.d[1] +sub z18.d, z20.d, z22.d +mla z24.d, P0/M, z29.d, z31.d +add z20.d, z20.d, z22.d +sqrdmulh z22.d, z27.d, z12.d[0] +str q20, [x0, #1296] +mul z27.d, z27.d,z13.d[0] +str q18, [x0, #1328] +mla z19.d, P0/M, z23.d, z31.d +sub z23.d, z25.d, z24.d +ldr q18, [x0, #1504] +ldr q20, [x0, #1472] +add z25.d, z25.d, z24.d +sqrdmulh z24.d, z18.d, z2.d[0] +str q25, [x0, #1360] +mul z18.d, z18.d,z3.d[0] +str q23, [x0, #1392] +mla z27.d, P0/M, z22.d, z31.d +ldr q22, [x0, #1408] +sub z23.d, z17.d, z19.d +ldr q25, [x0, #1440] +add z17.d, z17.d, z19.d +sqrdmulh z19.d, z20.d, z2.d[0] +str q17, [x0, #1488] +mul z20.d, z20.d,z3.d[0] +str q23, [x0, #1520] +mla z18.d, P0/M, z24.d, z31.d +ldr q24, [x0, #1376] +sub z23.d, z16.d, z27.d +ldr q17, [x0, #1344] +add z16.d, z16.d, z27.d +sqrdmulh z27.d, z22.d, z2.d[0] +str q16, [x0, #1424] +mul z22.d, z22.d,z3.d[0] +str q23, [x0, #1456] +mla z20.d, P0/M, z19.d, z31.d +ldr q19, [x0, #1280] +sub z23.d, z24.d, z18.d +ldr q16, [x0, #1312] +add z24.d, z24.d, z18.d +sqrdmulh z18.d, z25.d, z2.d[0] +mul z25.d, z25.d,z3.d[0] +sub z29.d, z17.d, z20.d +mla z22.d, P0/M, z27.d, z31.d +add z17.d, z17.d, z20.d +sqrdmulh z20.d, z24.d, z0.d[0] +mul z24.d, z24.d,z1.d[0] +sub z27.d, z19.d, z22.d +mla z25.d, P0/M, z18.d, z31.d +add z19.d, z19.d, z22.d +sqrdmulh 
z22.d, z17.d, z0.d[0] +mul z17.d, z17.d,z1.d[0] +sub z18.d, z16.d, z25.d +mla z24.d, P0/M, z20.d, z31.d +add z16.d, z16.d, z25.d +sqrdmulh z25.d, z23.d, z0.d[1] +mul z23.d, z23.d,z1.d[1] +sub z20.d, z16.d, z24.d +mla z17.d, P0/M, z22.d, z31.d +add z16.d, z16.d, z24.d +ldr q11, [x17, #+896] +ldr q10, [x17, #+912] +ldr q9, [x17, #+928] +ldr q8, [x17, #+944] +ldr q7, [x17, #+960] +ldr q6, [x17, #+976] +ldr q5, [x17, #+992] +ldr q4, [x17, #+1008] +sqrdmulh z24.d, z29.d, z0.d[1] +mul z29.d, z29.d,z1.d[1] +sub z22.d, z19.d, z17.d +mla z23.d, P0/M, z25.d, z31.d +add z19.d, z19.d, z17.d +sqrdmulh z17.d, z16.d, z14.d[0] +mul z16.d, z16.d,z15.d[0] +sub z25.d, z18.d, z23.d +mla z29.d, P0/M, z24.d, z31.d +add z18.d, z18.d, z23.d +sqrdmulh z23.d, z20.d, z14.d[1] +mul z20.d, z20.d,z15.d[1] +sub z24.d, z27.d, z29.d +mla z16.d, P0/M, z17.d, z31.d +add z27.d, z27.d, z29.d +sqrdmulh z29.d, z25.d, z12.d[1] +mul z25.d, z25.d,z13.d[1] +sub z17.d, z19.d, z16.d +mla z20.d, P0/M, z23.d, z31.d +add z19.d, z19.d, z16.d +sqrdmulh z16.d, z18.d, z12.d[0] +str q19, [x0, #1280] +mul z18.d, z18.d,z13.d[0] +str q17, [x0, #1312] +mla z25.d, P0/M, z29.d, z31.d +sub z29.d, z22.d, z20.d +ldr q17, [x0, #1776] +ldr q19, [x0, #1744] +add z22.d, z22.d, z20.d +sqrdmulh z20.d, z17.d, z10.d[0] +str q22, [x0, #1344] +mul z17.d, z17.d,z11.d[0] +str q29, [x0, #1376] +mla z18.d, P0/M, z16.d, z31.d +ldr q16, [x0, #1680] +sub z29.d, z24.d, z25.d +ldr q22, [x0, #1712] +add z24.d, z24.d, z25.d +sqrdmulh z25.d, z19.d, z10.d[0] +str q24, [x0, #1472] +mul z19.d, z19.d,z11.d[0] +str q29, [x0, #1504] +mla z17.d, P0/M, z20.d, z31.d +ldr q20, [x0, #1648] +sub z29.d, z27.d, z18.d +ldr q24, [x0, #1616] +add z27.d, z27.d, z18.d +sqrdmulh z18.d, z16.d, z10.d[0] +str q27, [x0, #1408] +mul z16.d, z16.d,z11.d[0] +str q29, [x0, #1440] +mla z19.d, P0/M, z25.d, z31.d +ldr q25, [x0, #1552] +sub z29.d, z20.d, z17.d +ldr q27, [x0, #1584] +add z20.d, z20.d, z17.d +sqrdmulh z17.d, z22.d, z10.d[0] +mul z22.d, z22.d,z11.d[0] +sub z23.d, 
z24.d, z19.d +mla z16.d, P0/M, z18.d, z31.d +add z24.d, z24.d, z19.d +sqrdmulh z19.d, z20.d, z8.d[0] +mul z20.d, z20.d,z9.d[0] +sub z18.d, z25.d, z16.d +mla z22.d, P0/M, z17.d, z31.d +add z25.d, z25.d, z16.d +sqrdmulh z16.d, z24.d, z8.d[0] +mul z24.d, z24.d,z9.d[0] +sub z17.d, z27.d, z22.d +mla z20.d, P0/M, z19.d, z31.d +add z27.d, z27.d, z22.d +sqrdmulh z22.d, z29.d, z8.d[1] +mul z29.d, z29.d,z9.d[1] +sub z19.d, z27.d, z20.d +mla z24.d, P0/M, z16.d, z31.d +add z27.d, z27.d, z20.d +sqrdmulh z20.d, z23.d, z8.d[1] +mul z23.d, z23.d,z9.d[1] +sub z16.d, z25.d, z24.d +mla z29.d, P0/M, z22.d, z31.d +add z25.d, z25.d, z24.d +sqrdmulh z24.d, z27.d, z6.d[0] +mul z27.d, z27.d,z7.d[0] +sub z22.d, z17.d, z29.d +mla z23.d, P0/M, z20.d, z31.d +add z17.d, z17.d, z29.d +sqrdmulh z29.d, z19.d, z6.d[1] +mul z19.d, z19.d,z7.d[1] +sub z20.d, z18.d, z23.d +mla z27.d, P0/M, z24.d, z31.d +add z18.d, z18.d, z23.d +sqrdmulh z23.d, z22.d, z4.d[1] +mul z22.d, z22.d,z5.d[1] +sub z24.d, z25.d, z27.d +mla z19.d, P0/M, z29.d, z31.d +add z25.d, z25.d, z27.d +sqrdmulh z27.d, z17.d, z4.d[0] +str q25, [x0, #1552] +mul z17.d, z17.d,z5.d[0] +str q24, [x0, #1584] +mla z22.d, P0/M, z23.d, z31.d +sub z23.d, z16.d, z19.d +ldr q24, [x0, #1760] +ldr q25, [x0, #1728] +add z16.d, z16.d, z19.d +sqrdmulh z19.d, z24.d, z10.d[0] +str q16, [x0, #1616] +mul z24.d, z24.d,z11.d[0] +str q23, [x0, #1648] +mla z17.d, P0/M, z27.d, z31.d +ldr q27, [x0, #1664] +sub z23.d, z20.d, z22.d +ldr q16, [x0, #1696] +add z20.d, z20.d, z22.d +sqrdmulh z22.d, z25.d, z10.d[0] +str q20, [x0, #1744] +mul z25.d, z25.d,z11.d[0] +str q23, [x0, #1776] +mla z24.d, P0/M, z19.d, z31.d +ldr q19, [x0, #1632] +sub z23.d, z18.d, z17.d +ldr q20, [x0, #1600] +add z18.d, z18.d, z17.d +sqrdmulh z17.d, z27.d, z10.d[0] +str q18, [x0, #1680] +mul z27.d, z27.d,z11.d[0] +str q23, [x0, #1712] +mla z25.d, P0/M, z22.d, z31.d +ldr q22, [x0, #1536] +sub z23.d, z19.d, z24.d +ldr q18, [x0, #1568] +add z19.d, z19.d, z24.d +sqrdmulh z24.d, z16.d, z10.d[0] +mul 
z16.d, z16.d,z11.d[0] +sub z29.d, z20.d, z25.d +mla z27.d, P0/M, z17.d, z31.d +add z20.d, z20.d, z25.d +sqrdmulh z25.d, z19.d, z8.d[0] +mul z19.d, z19.d,z9.d[0] +sub z17.d, z22.d, z27.d +mla z16.d, P0/M, z24.d, z31.d +add z22.d, z22.d, z27.d +sqrdmulh z27.d, z20.d, z8.d[0] +mul z20.d, z20.d,z9.d[0] +sub z24.d, z18.d, z16.d +mla z19.d, P0/M, z25.d, z31.d +add z18.d, z18.d, z16.d +sqrdmulh z16.d, z23.d, z8.d[1] +mul z23.d, z23.d,z9.d[1] +sub z25.d, z18.d, z19.d +mla z20.d, P0/M, z27.d, z31.d +add z18.d, z18.d, z19.d +ldr q12, [x17, #+1024] +ldr q13, [x17, #+1040] +ldr q14, [x17, #+1056] +ldr q15, [x17, #+1072] +ldr q0, [x17, #+1088] +ldr q1, [x17, #+1104] +ldr q2, [x17, #+1120] +ldr q3, [x17, #+1136] +sqrdmulh z19.d, z29.d, z8.d[1] +mul z29.d, z29.d,z9.d[1] +sub z27.d, z22.d, z20.d +mla z23.d, P0/M, z16.d, z31.d +add z22.d, z22.d, z20.d +sqrdmulh z20.d, z18.d, z6.d[0] +mul z18.d, z18.d,z7.d[0] +sub z16.d, z24.d, z23.d +mla z29.d, P0/M, z19.d, z31.d +add z24.d, z24.d, z23.d +sqrdmulh z23.d, z25.d, z6.d[1] +mul z25.d, z25.d,z7.d[1] +sub z19.d, z17.d, z29.d +mla z18.d, P0/M, z20.d, z31.d +add z17.d, z17.d, z29.d +sqrdmulh z29.d, z16.d, z4.d[1] +mul z16.d, z16.d,z5.d[1] +sub z20.d, z22.d, z18.d +mla z25.d, P0/M, z23.d, z31.d +add z22.d, z22.d, z18.d +sqrdmulh z18.d, z24.d, z4.d[0] +str q22, [x0, #1536] +mul z24.d, z24.d,z5.d[0] +str q20, [x0, #1568] +mla z16.d, P0/M, z29.d, z31.d +sub z29.d, z27.d, z25.d +ldr q20, [x0, #2032] +ldr q22, [x0, #2000] +add z27.d, z27.d, z25.d +sqrdmulh z25.d, z20.d, z13.d[0] +str q27, [x0, #1600] +mul z20.d, z20.d,z12.d[0] +str q29, [x0, #1632] +mla z24.d, P0/M, z18.d, z31.d +ldr q18, [x0, #1936] +sub z29.d, z19.d, z16.d +ldr q27, [x0, #1968] +add z19.d, z19.d, z16.d +sqrdmulh z16.d, z22.d, z13.d[0] +str q19, [x0, #1728] +mul z22.d, z22.d,z12.d[0] +str q29, [x0, #1760] +mla z20.d, P0/M, z25.d, z31.d +ldr q25, [x0, #1904] +sub z29.d, z17.d, z24.d +ldr q19, [x0, #1872] +add z17.d, z17.d, z24.d +sqrdmulh z24.d, z18.d, z13.d[0] +str q17, [x0, 
#1664] +mul z18.d, z18.d,z12.d[0] +str q29, [x0, #1696] +mla z22.d, P0/M, z16.d, z31.d +ldr q16, [x0, #1808] +sub z29.d, z25.d, z20.d +ldr q17, [x0, #1840] +add z25.d, z25.d, z20.d +sqrdmulh z20.d, z27.d, z13.d[0] +mul z27.d, z27.d,z12.d[0] +sub z23.d, z19.d, z22.d +mla z18.d, P0/M, z24.d, z31.d +add z19.d, z19.d, z22.d +sqrdmulh z22.d, z25.d, z15.d[0] +mul z25.d, z25.d,z14.d[0] +sub z24.d, z16.d, z18.d +mla z27.d, P0/M, z20.d, z31.d +add z16.d, z16.d, z18.d +sqrdmulh z18.d, z19.d, z15.d[0] +mul z19.d, z19.d,z14.d[0] +sub z20.d, z17.d, z27.d +mla z25.d, P0/M, z22.d, z31.d +add z17.d, z17.d, z27.d +sqrdmulh z27.d, z29.d, z15.d[1] +mul z29.d, z29.d,z14.d[1] +sub z22.d, z17.d, z25.d +mla z19.d, P0/M, z18.d, z31.d +add z17.d, z17.d, z25.d +sqrdmulh z25.d, z23.d, z15.d[1] +mul z23.d, z23.d,z14.d[1] +sub z18.d, z16.d, z19.d +mla z29.d, P0/M, z27.d, z31.d +add z16.d, z16.d, z19.d +sqrdmulh z19.d, z17.d, z1.d[0] +mul z17.d, z17.d,z0.d[0] +sub z27.d, z20.d, z29.d +mla z23.d, P0/M, z25.d, z31.d +add z20.d, z20.d, z29.d +sqrdmulh z29.d, z22.d, z1.d[1] +mul z22.d, z22.d,z0.d[1] +sub z25.d, z24.d, z23.d +mla z17.d, P0/M, z19.d, z31.d +add z24.d, z24.d, z23.d +sqrdmulh z23.d, z27.d, z3.d[1] +mul z27.d, z27.d,z2.d[1] +sub z19.d, z16.d, z17.d +mla z22.d, P0/M, z29.d, z31.d +add z16.d, z16.d, z17.d +sqrdmulh z17.d, z20.d, z3.d[0] +str q16, [x0, #1808] +mul z20.d, z20.d,z2.d[0] +str q19, [x0, #1840] +mla z27.d, P0/M, z23.d, z31.d +sub z23.d, z18.d, z22.d +ldr q19, [x0, #2016] +ldr q16, [x0, #1984] +add z18.d, z18.d, z22.d +sqrdmulh z22.d, z19.d, z13.d[0] +str q18, [x0, #1872] +mul z19.d, z19.d,z12.d[0] +str q23, [x0, #1904] +mla z20.d, P0/M, z17.d, z31.d +ldr q17, [x0, #1920] +sub z23.d, z25.d, z27.d +ldr q18, [x0, #1952] +add z25.d, z25.d, z27.d +sqrdmulh z27.d, z16.d, z13.d[0] +str q25, [x0, #2000] +mul z16.d, z16.d,z12.d[0] +str q23, [x0, #2032] +mla z19.d, P0/M, z22.d, z31.d +ldr q22, [x0, #1888] +sub z23.d, z24.d, z20.d +ldr q25, [x0, #1856] +add z24.d, z24.d, z20.d +sqrdmulh 
z20.d, z17.d, z13.d[0] +str q24, [x0, #1936] +mul z17.d, z17.d,z12.d[0] +str q23, [x0, #1968] +mla z16.d, P0/M, z27.d, z31.d +ldr q27, [x0, #1792] +sub z23.d, z22.d, z19.d +ldr q24, [x0, #1824] +add z22.d, z22.d, z19.d +sqrdmulh z19.d, z18.d, z13.d[0] +mul z18.d, z18.d,z12.d[0] +sub z29.d, z25.d, z16.d +mla z17.d, P0/M, z20.d, z31.d +add z25.d, z25.d, z16.d +sqrdmulh z16.d, z22.d, z15.d[0] +mul z22.d, z22.d,z14.d[0] +sub z20.d, z27.d, z17.d +mla z18.d, P0/M, z19.d, z31.d +add z27.d, z27.d, z17.d +sqrdmulh z17.d, z25.d, z15.d[0] +mul z25.d, z25.d,z14.d[0] +sub z19.d, z24.d, z18.d +mla z22.d, P0/M, z16.d, z31.d +add z24.d, z24.d, z18.d +sqrdmulh z18.d, z23.d, z15.d[1] +mul z23.d, z23.d,z14.d[1] +sub z16.d, z24.d, z22.d +mla z25.d, P0/M, z17.d, z31.d +add z24.d, z24.d, z22.d +sqrdmulh z22.d, z29.d, z15.d[1] +mul z29.d, z29.d,z14.d[1] +sub z17.d, z27.d, z25.d +mla z23.d, P0/M, z18.d, z31.d +add z27.d, z27.d, z25.d +sqrdmulh z25.d, z24.d, z1.d[0] +mul z24.d, z24.d,z0.d[0] +sub z18.d, z19.d, z23.d +mla z29.d, P0/M, z22.d, z31.d +add z19.d, z19.d, z23.d +sqrdmulh z23.d, z16.d, z1.d[1] +mul z16.d, z16.d,z0.d[1] +sub z22.d, z20.d, z29.d +mla z24.d, P0/M, z25.d, z31.d +add z20.d, z20.d, z29.d +sqrdmulh z29.d, z18.d, z3.d[1] +mul z18.d, z18.d,z2.d[1] +sub z25.d, z27.d, z24.d +mla z16.d, P0/M, z23.d, z31.d +add z27.d, z27.d, z24.d +sqrdmulh z24.d, z19.d, z3.d[0] +str q27, [x0, #1792] +mul z19.d, z19.d,z2.d[0] +str q25, [x0, #1824] +mla z18.d, P0/M, z29.d, z31.d +sub z29.d, z17.d, z16.d +add z17.d, z17.d, z16.d +str q17, [x0, #1856] +str q29, [x0, #1888] +mla z19.d, P0/M, z24.d, z31.d +sub z24.d, z22.d, z18.d +add z22.d, z22.d, z18.d +str q22, [x0, #1984] +str q24, [x0, #2016] +sub z24.d, z20.d, z19.d +add z20.d, z20.d, z19.d +str q20, [x0, #1920] +str q24, [x0, #1952] +// Restore SVE2 vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp 
x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2697 +// Instruction count: 2693 \ No newline at end of file diff --git a/asm/auto/ntt_sve2/ntt_u64_incomplete_72057594067788289_60277548896192635_var_3_3_2.s b/asm/auto/ntt_sve2/ntt_u64_incomplete_72057594067788289_60277548896192635_var_3_3_2.s new file mode 100644 index 0000000..cb5189a --- /dev/null +++ b/asm/auto/ntt_sve2/ntt_u64_incomplete_72057594067788289_60277548896192635_var_3_3_2.s @@ -0,0 +1,2727 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +modulus: // Four copies of one negative 64-bit constant (presumably -q for the NTT modulus q - confirm against the generator); the routine below loads the first 16 bytes into q31 and uses z31 as the mla multiplier +.dword -72057594067788289 +.dword -72057594067788289 +.dword -72057594067788289 +.dword -72057594067788289 +.align 6 +roots_merged: // Constant table read through x17 in 16-byte rows of two .dwords: each row of per-block constants (used as the mul operand) is followed by a row of companion constants (used as the sqrdmulh operand); rows holding a single constant are padded with 0 and tagged Layer None +.dword 25792053496987399 // Layer 0, block 0 +.dword 0 // Layer None, block None +.dword 3301382846246308405 // Layer 0, block 0 +.dword 0 // Layer None, block None +.dword 36678763444893001 // Layer 1, block 0 +.dword 12009493193917617 // Layer 1, block 1 +.dword 4694881719000765600 // Layer 1, block 0 +.dword 1537215128184439725 // Layer 1, block 1 +.dword 57226611787624233 // Layer 2, block 0 +.dword 39665359539540334 // Layer 2, block 1 +.dword 7325006305780451127 // Layer 2, block 0 +.dword 5077166018957207276 // Layer 2, block 1 +.dword 14359056949694594 // Layer 2, block 2 +.dword 63449028357011879 // Layer 2, block 3 +.dword 1837959288799265711 // Layer 2, block 2 +.dword 8121475626332016399 // Layer 2, block 3 +.dword 56437370284897879 // Layer 3, block 0 +.dword 0 // Layer None, block None +.dword 7223983393473341270 // Layer 3, block 0 +.dword 0 // Layer None, block None +.dword 15519149204003269 // Layer 4, block 0 +.dword 18945631884663455 // Layer 4, block 1 +.dword 1986451097289241753 // Layer 4, block 0 +.dword 2425040880231995866 // Layer 4, block 1 +.dword 21843809513296019 // Layer 5, block 0 +.dword 52861630939350015 // Layer 5, block 1 +.dword 2796007616543237058 // Layer 5, block 0 +.dword 6766288757432881341 // Layer 5, block 1 +.dword 58200436133340777 // Layer 5, block 2 +.dword 45581265709396633 // Layer 5, block 3 +.dword 7449655821980514543 // Layer 5, block 2 +.dword 5834402008385018253 // Layer 5, block 3 +.dword 7801853795705237 // Layer 3, block 1 +.dword 0 // Layer None, block None +.dword 998637285436439396 // Layer 3, block 1 +.dword 0 // Layer None, block None +.dword 72057409685042741 // Layer 4, block 2 +.dword 67813594624550994 // Layer 4, block 3 +.dword 9223348435863355444 // Layer 4, block 2 +.dword 8680140108345514992 // Layer 4, block 3 +.dword 16444438478993771 // Layer 5, 
block 4 +.dword 44738633871916757 // Layer 5, block 5 +.dword 2104888124438946221 // Layer 5, block 4 +.dword 5726545133232289544 // Layer 5, block 5 +.dword 14998888047589537 // Layer 5, block 6 +.dword 1367715298619054 // Layer 5, block 7 +.dword 1919857669295880083 // Layer 5, block 6 +.dword 175067558150691679 // Layer 5, block 7 +.dword 50810289212278368 // Layer 3, block 2 +.dword 0 // Layer None, block None +.dword 6503717016476519110 // Layer 3, block 2 +.dword 0 // Layer None, block None +.dword 38922220208018571 // Layer 4, block 4 +.dword 7966052600948377 // Layer 4, block 5 +.dword 4982044184561839686 // Layer 4, block 4 +.dword 1019654732498851778 // Layer 4, block 5 +.dword 45879272116084567 // Layer 5, block 8 +.dword 66654388400258382 // Layer 5, block 9 +.dword 5872546828425266758 // Layer 5, block 8 +.dword 8531761711697548017 // Layer 5, block 9 +.dword 8930087962801744 // Layer 5, block 10 +.dword 61848588213223279 // Layer 5, block 11 +.dword 1143051258764947771 // Layer 5, block 10 +.dword 7916619288011967173 // Layer 5, block 11 +.dword 31977682183549777 // Layer 3, block 3 +.dword 0 // Layer None, block None +.dword 4093143317798190700 // Layer 3, block 3 +.dword 0 // Layer None, block None +.dword 66070897124800871 // Layer 4, block 6 +.dword 953067252694683 // Layer 4, block 7 +.dword 8457074828469936528 // Layer 4, block 6 +.dword 121992608294366219 // Layer 4, block 7 +.dword 33801610235026337 // Layer 5, block 12 +.dword 32122784433286747 // Layer 5, block 13 +.dword 4326606108290444417 // Layer 5, block 12 +.dword 4111716405756826253 // Layer 5, block 13 +.dword 67688369535326483 // Layer 5, block 14 +.dword 45021686719473556 // Layer 5, block 15 +.dword 8664111296931419854 // Layer 5, block 14 +.dword 5762775897704545946 // Layer 5, block 15 +.dword 66662168904752601 // Layer 3, block 4 +.dword 0 // Layer None, block None +.dword 8532757616272395351 // Layer 3, block 4 +.dword 0 // Layer None, block None +.dword 23961218891132444 // 
Layer 4, block 8 +.dword 59012643726482518 // Layer 4, block 9 +.dword 3067036016793986470 // Layer 4, block 8 +.dword 7553618393859575754 // Layer 4, block 9 +.dword 52812533586708198 // Layer 5, block 16 +.dword 27994290036168371 // Layer 5, block 17 +.dword 6760004296297333018 // Layer 5, block 16 +.dword 3583269123144660376 // Layer 5, block 17 +.dword 45890717144660134 // Layer 5, block 18 +.dword 39684773913748863 // Layer 5, block 19 +.dword 5874011792082332260 // Layer 5, block 18 +.dword 5079651058854869198 // Layer 5, block 19 +.dword 50149898471788096 // Layer 3, block 5 +.dword 0 // Layer None, block None +.dword 6419187001728793164 // Layer 3, block 5 +.dword 0 // Layer None, block None +.dword 65714767972465509 // Layer 4, block 10 +.dword 51421828010275652 // Layer 4, block 11 +.dword 8411490296989900223 // Layer 4, block 10 +.dword 6581993982587733829 // Layer 4, block 11 +.dword 18683690578478417 // Layer 5, block 20 +.dword 3282356803714609 // Layer 5, block 21 +.dword 2391512393054205061 // Layer 5, block 20 +.dword 420141670701365074 // Layer 5, block 21 +.dword 67884452950503047 // Layer 5, block 22 +.dword 10335338564031418 // Layer 5, block 23 +.dword 8689209974063619263 // Layer 5, block 22 +.dword 1322923335647807838 // Layer 5, block 23 +.dword 30932683335866672 // Layer 3, block 6 +.dword 0 // Layer None, block None +.dword 3959383465350182760 // Layer 3, block 6 +.dword 0 // Layer None, block None +.dword 27050097608373352 // Layer 4, block 12 +.dword 67454821565758121 // Layer 4, block 13 +.dword 3462412492436980406 // Layer 4, block 12 +.dword 8634217156839057519 // Layer 4, block 13 +.dword 32828920539599153 // Layer 5, block 24 +.dword 8624332566875856 // Layer 5, block 25 +.dword 4202101827327358896 // Layer 5, block 24 +.dword 1103914568102652181 // Layer 5, block 25 +.dword 56732837753533829 // Layer 5, block 26 +.dword 14816466027490539 // Layer 5, block 27 +.dword 7261803229443070495 // Layer 5, block 26 +.dword 
1896507650732884485 // Layer 5, block 27 +.dword 54968319742463037 // Layer 3, block 7 +.dword 0 // Layer None, block None +.dword 7035944924119603816 // Layer 3, block 7 +.dword 0 // Layer None, block None +.dword 55666925166425210 // Layer 4, block 14 +.dword 34241587306439298 // Layer 4, block 15 +.dword 7125366418349706083 // Layer 4, block 14 +.dword 4382923173407965878 // Layer 4, block 15 +.dword 8550051130607768 // Layer 5, block 28 +.dword 14420141705316589 // Layer 5, block 29 +.dword 1094406544264277001 // Layer 5, block 28 +.dword 1845778137515640974 // Layer 5, block 29 +.dword 55622715926092387 // Layer 5, block 30 +.dword 3405033449209397 // Layer 5, block 31 +.dword 7119707635589449714 // Layer 5, block 30 +.dword 435844281318190845 // Layer 5, block 31 +.text +.type ntt_u64_incomplete_sve2_asm_var_3_3_2, %function +.global ntt_u64_incomplete_sve2_asm_var_3_3_2 +modulus_addr: .quad modulus +roots_merged_addr: .quad roots_merged +ntt_u64_incomplete_sve2_asm_var_3_3_2: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save SVE2 vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ldr x17, modulus_addr +ldr q31, [x17] +ptrue P0.d +ldr x17, roots_merged_addr +ldr q3, [x17, #+0] +ldr q2, [x17, #+16] +ldr q1, [x17, #+32] +ldr q0, [x17, #+48] +ldr q15, [x17, #+64] +ldr q14, [x17, #+80] +ldr q13, [x17, #+96] +ldr q12, [x17, #+112] +ldr q30, [x0, #1920] +ldr q29, [x0, #1664] +sqrdmulh z28.d, z30.d, z2.d[0] +mul z30.d, z30.d,z3.d[0] +ldr q27, [x0, #1152] +sqrdmulh z26.d, z29.d, z2.d[0] +mul z29.d, z29.d,z3.d[0] +ldr q25, [x0, #1408] +mla z30.d, P0/M, z28.d, z31.d +sqrdmulh z28.d, z27.d, z2.d[0] +mul z27.d, z27.d,z3.d[0] +ldr q24, [x0, #896] +mla z29.d, P0/M, z26.d, z31.d +sub z26.d, z24.d, z30.d 
+add z24.d, z24.d, z30.d +sqrdmulh z30.d, z25.d, z2.d[0] +mul z25.d, z25.d,z3.d[0] +ldr q23, [x0, #640] +mla z27.d, P0/M, z28.d, z31.d +sub z28.d, z23.d, z29.d +add z23.d, z23.d, z29.d +sqrdmulh z29.d, z24.d, z0.d[0] +mul z24.d, z24.d,z1.d[0] +ldr q22, [x0, #128] +mla z25.d, P0/M, z30.d, z31.d +sub z30.d, z22.d, z27.d +add z22.d, z22.d, z27.d +sqrdmulh z27.d, z23.d, z0.d[0] +mul z23.d, z23.d,z1.d[0] +ldr q21, [x0, #384] +mla z24.d, P0/M, z29.d, z31.d +sub z29.d, z21.d, z25.d +add z21.d, z21.d, z25.d +sqrdmulh z25.d, z26.d, z0.d[1] +mul z26.d, z26.d,z1.d[1] +mla z23.d, P0/M, z27.d, z31.d +sub z27.d, z21.d, z24.d +add z21.d, z21.d, z24.d +sqrdmulh z24.d, z28.d, z0.d[1] +mul z28.d, z28.d,z1.d[1] +mla z26.d, P0/M, z25.d, z31.d +sub z25.d, z22.d, z23.d +add z22.d, z22.d, z23.d +sqrdmulh z23.d, z21.d, z14.d[0] +mul z21.d, z21.d,z15.d[0] +mla z28.d, P0/M, z24.d, z31.d +sub z24.d, z29.d, z26.d +add z29.d, z29.d, z26.d +sqrdmulh z26.d, z27.d, z14.d[1] +mul z27.d, z27.d,z15.d[1] +mla z21.d, P0/M, z23.d, z31.d +sub z23.d, z30.d, z28.d +add z30.d, z30.d, z28.d +sqrdmulh z28.d, z24.d, z12.d[1] +mul z24.d, z24.d,z13.d[1] +ldr q20, [x0, #1936] +mla z27.d, P0/M, z26.d, z31.d +sub z26.d, z22.d, z21.d +add z22.d, z22.d, z21.d +sqrdmulh z21.d, z29.d, z12.d[0] +mul z29.d, z29.d,z13.d[0] +ldr q19, [x0, #1680] +mla z24.d, P0/M, z28.d, z31.d +sub z28.d, z25.d, z27.d +add z25.d, z25.d, z27.d +sqrdmulh z27.d, z20.d, z2.d[0] +mul z20.d, z20.d,z3.d[0] +ldr q18, [x0, #1168] +mla z29.d, P0/M, z21.d, z31.d +sub z21.d, z23.d, z24.d +add z23.d, z23.d, z24.d +sqrdmulh z24.d, z19.d, z2.d[0] +str q22, [x0, #128] +mul z19.d, z19.d,z3.d[0] +ldr q22, [x0, #1424] +mla z20.d, P0/M, z27.d, z31.d +sub z27.d, z30.d, z29.d +add z30.d, z30.d, z29.d +sqrdmulh z29.d, z18.d, z2.d[0] +str q26, [x0, #384] +mul z18.d, z18.d,z3.d[0] +ldr q26, [x0, #912] +mla z19.d, P0/M, z24.d, z31.d +sub z24.d, z26.d, z20.d +add z26.d, z26.d, z20.d +sqrdmulh z20.d, z22.d, z2.d[0] +str q25, [x0, #640] +mul z22.d, z22.d,z3.d[0] +ldr 
q25, [x0, #656] +mla z18.d, P0/M, z29.d, z31.d +sub z29.d, z25.d, z19.d +add z25.d, z25.d, z19.d +sqrdmulh z19.d, z26.d, z0.d[0] +str q28, [x0, #896] +mul z26.d, z26.d,z1.d[0] +ldr q28, [x0, #144] +mla z22.d, P0/M, z20.d, z31.d +sub z20.d, z28.d, z18.d +add z28.d, z28.d, z18.d +sqrdmulh z18.d, z25.d, z0.d[0] +str q23, [x0, #1664] +mul z25.d, z25.d,z1.d[0] +ldr q23, [x0, #400] +mla z26.d, P0/M, z19.d, z31.d +sub z19.d, z23.d, z22.d +add z23.d, z23.d, z22.d +sqrdmulh z22.d, z24.d, z0.d[1] +str q21, [x0, #1920] +mul z24.d, z24.d,z1.d[1] +mla z25.d, P0/M, z18.d, z31.d +sub z18.d, z23.d, z26.d +add z23.d, z23.d, z26.d +sqrdmulh z26.d, z29.d, z0.d[1] +str q30, [x0, #1152] +mul z29.d, z29.d,z1.d[1] +mla z24.d, P0/M, z22.d, z31.d +sub z22.d, z28.d, z25.d +add z28.d, z28.d, z25.d +sqrdmulh z25.d, z23.d, z14.d[0] +str q27, [x0, #1408] +mul z23.d, z23.d,z15.d[0] +mla z29.d, P0/M, z26.d, z31.d +sub z26.d, z19.d, z24.d +add z19.d, z19.d, z24.d +sqrdmulh z24.d, z18.d, z14.d[1] +mul z18.d, z18.d,z15.d[1] +mla z23.d, P0/M, z25.d, z31.d +sub z25.d, z20.d, z29.d +add z20.d, z20.d, z29.d +sqrdmulh z29.d, z26.d, z12.d[1] +mul z26.d, z26.d,z13.d[1] +ldr q27, [x0, #1952] +mla z18.d, P0/M, z24.d, z31.d +sub z24.d, z28.d, z23.d +add z28.d, z28.d, z23.d +sqrdmulh z23.d, z19.d, z12.d[0] +mul z19.d, z19.d,z13.d[0] +ldr q30, [x0, #1696] +mla z26.d, P0/M, z29.d, z31.d +sub z29.d, z22.d, z18.d +add z22.d, z22.d, z18.d +sqrdmulh z18.d, z27.d, z2.d[0] +mul z27.d, z27.d,z3.d[0] +ldr q21, [x0, #1184] +mla z19.d, P0/M, z23.d, z31.d +sub z23.d, z25.d, z26.d +add z25.d, z25.d, z26.d +sqrdmulh z26.d, z30.d, z2.d[0] +str q28, [x0, #144] +mul z30.d, z30.d,z3.d[0] +ldr q28, [x0, #1440] +mla z27.d, P0/M, z18.d, z31.d +sub z18.d, z20.d, z19.d +add z20.d, z20.d, z19.d +sqrdmulh z19.d, z21.d, z2.d[0] +str q24, [x0, #400] +mul z21.d, z21.d,z3.d[0] +ldr q24, [x0, #928] +mla z30.d, P0/M, z26.d, z31.d +sub z26.d, z24.d, z27.d +add z24.d, z24.d, z27.d +sqrdmulh z27.d, z28.d, z2.d[0] +str q22, [x0, #656] +mul 
z28.d, z28.d,z3.d[0] +ldr q22, [x0, #672] +mla z21.d, P0/M, z19.d, z31.d +sub z19.d, z22.d, z30.d +add z22.d, z22.d, z30.d +sqrdmulh z30.d, z24.d, z0.d[0] +str q29, [x0, #912] +mul z24.d, z24.d,z1.d[0] +ldr q29, [x0, #160] +mla z28.d, P0/M, z27.d, z31.d +sub z27.d, z29.d, z21.d +add z29.d, z29.d, z21.d +sqrdmulh z21.d, z22.d, z0.d[0] +str q25, [x0, #1680] +mul z22.d, z22.d,z1.d[0] +ldr q25, [x0, #416] +mla z24.d, P0/M, z30.d, z31.d +sub z30.d, z25.d, z28.d +add z25.d, z25.d, z28.d +sqrdmulh z28.d, z26.d, z0.d[1] +str q23, [x0, #1936] +mul z26.d, z26.d,z1.d[1] +mla z22.d, P0/M, z21.d, z31.d +sub z21.d, z25.d, z24.d +add z25.d, z25.d, z24.d +sqrdmulh z24.d, z19.d, z0.d[1] +str q20, [x0, #1168] +mul z19.d, z19.d,z1.d[1] +mla z26.d, P0/M, z28.d, z31.d +sub z28.d, z29.d, z22.d +add z29.d, z29.d, z22.d +sqrdmulh z22.d, z25.d, z14.d[0] +str q18, [x0, #1424] +mul z25.d, z25.d,z15.d[0] +mla z19.d, P0/M, z24.d, z31.d +sub z24.d, z30.d, z26.d +add z30.d, z30.d, z26.d +sqrdmulh z26.d, z21.d, z14.d[1] +mul z21.d, z21.d,z15.d[1] +mla z25.d, P0/M, z22.d, z31.d +sub z22.d, z27.d, z19.d +add z27.d, z27.d, z19.d +sqrdmulh z19.d, z24.d, z12.d[1] +mul z24.d, z24.d,z13.d[1] +ldr q18, [x0, #1968] +mla z21.d, P0/M, z26.d, z31.d +sub z26.d, z29.d, z25.d +add z29.d, z29.d, z25.d +sqrdmulh z25.d, z30.d, z12.d[0] +mul z30.d, z30.d,z13.d[0] +ldr q20, [x0, #1712] +mla z24.d, P0/M, z19.d, z31.d +sub z19.d, z28.d, z21.d +add z28.d, z28.d, z21.d +sqrdmulh z21.d, z18.d, z2.d[0] +mul z18.d, z18.d,z3.d[0] +ldr q23, [x0, #1200] +mla z30.d, P0/M, z25.d, z31.d +sub z25.d, z22.d, z24.d +add z22.d, z22.d, z24.d +sqrdmulh z24.d, z20.d, z2.d[0] +str q29, [x0, #160] +mul z20.d, z20.d,z3.d[0] +ldr q29, [x0, #1456] +mla z18.d, P0/M, z21.d, z31.d +sub z21.d, z27.d, z30.d +add z27.d, z27.d, z30.d +sqrdmulh z30.d, z23.d, z2.d[0] +str q26, [x0, #416] +mul z23.d, z23.d,z3.d[0] +ldr q26, [x0, #944] +mla z20.d, P0/M, z24.d, z31.d +sub z24.d, z26.d, z18.d +add z26.d, z26.d, z18.d +sqrdmulh z18.d, z29.d, z2.d[0] +str 
q28, [x0, #672] +mul z29.d, z29.d,z3.d[0] +ldr q28, [x0, #688] +mla z23.d, P0/M, z30.d, z31.d +sub z30.d, z28.d, z20.d +add z28.d, z28.d, z20.d +sqrdmulh z20.d, z26.d, z0.d[0] +str q19, [x0, #928] +mul z26.d, z26.d,z1.d[0] +ldr q19, [x0, #176] +mla z29.d, P0/M, z18.d, z31.d +sub z18.d, z19.d, z23.d +add z19.d, z19.d, z23.d +sqrdmulh z23.d, z28.d, z0.d[0] +str q22, [x0, #1696] +mul z28.d, z28.d,z1.d[0] +ldr q22, [x0, #432] +mla z26.d, P0/M, z20.d, z31.d +sub z20.d, z22.d, z29.d +add z22.d, z22.d, z29.d +sqrdmulh z29.d, z24.d, z0.d[1] +str q25, [x0, #1952] +mul z24.d, z24.d,z1.d[1] +mla z28.d, P0/M, z23.d, z31.d +sub z23.d, z22.d, z26.d +add z22.d, z22.d, z26.d +sqrdmulh z26.d, z30.d, z0.d[1] +str q27, [x0, #1184] +mul z30.d, z30.d,z1.d[1] +mla z24.d, P0/M, z29.d, z31.d +sub z29.d, z19.d, z28.d +add z19.d, z19.d, z28.d +sqrdmulh z28.d, z22.d, z14.d[0] +str q21, [x0, #1440] +mul z22.d, z22.d,z15.d[0] +mla z30.d, P0/M, z26.d, z31.d +sub z26.d, z20.d, z24.d +add z20.d, z20.d, z24.d +sqrdmulh z24.d, z23.d, z14.d[1] +mul z23.d, z23.d,z15.d[1] +mla z22.d, P0/M, z28.d, z31.d +sub z28.d, z18.d, z30.d +add z18.d, z18.d, z30.d +sqrdmulh z30.d, z26.d, z12.d[1] +mul z26.d, z26.d,z13.d[1] +ldr q21, [x0, #1984] +mla z23.d, P0/M, z24.d, z31.d +sub z24.d, z19.d, z22.d +add z19.d, z19.d, z22.d +sqrdmulh z22.d, z20.d, z12.d[0] +mul z20.d, z20.d,z13.d[0] +ldr q27, [x0, #1728] +mla z26.d, P0/M, z30.d, z31.d +sub z30.d, z29.d, z23.d +add z29.d, z29.d, z23.d +sqrdmulh z23.d, z21.d, z2.d[0] +mul z21.d, z21.d,z3.d[0] +ldr q25, [x0, #1216] +mla z20.d, P0/M, z22.d, z31.d +sub z22.d, z28.d, z26.d +add z28.d, z28.d, z26.d +sqrdmulh z26.d, z27.d, z2.d[0] +str q19, [x0, #176] +mul z27.d, z27.d,z3.d[0] +ldr q19, [x0, #1472] +mla z21.d, P0/M, z23.d, z31.d +sub z23.d, z18.d, z20.d +add z18.d, z18.d, z20.d +sqrdmulh z20.d, z25.d, z2.d[0] +str q24, [x0, #432] +mul z25.d, z25.d,z3.d[0] +ldr q24, [x0, #960] +mla z27.d, P0/M, z26.d, z31.d +sub z26.d, z24.d, z21.d +add z24.d, z24.d, z21.d +sqrdmulh z21.d, 
z19.d, z2.d[0] +str q29, [x0, #688] +mul z19.d, z19.d,z3.d[0] +ldr q29, [x0, #704] +mla z25.d, P0/M, z20.d, z31.d +sub z20.d, z29.d, z27.d +add z29.d, z29.d, z27.d +sqrdmulh z27.d, z24.d, z0.d[0] +str q30, [x0, #944] +mul z24.d, z24.d,z1.d[0] +ldr q30, [x0, #192] +mla z19.d, P0/M, z21.d, z31.d +sub z21.d, z30.d, z25.d +add z30.d, z30.d, z25.d +sqrdmulh z25.d, z29.d, z0.d[0] +str q28, [x0, #1712] +mul z29.d, z29.d,z1.d[0] +ldr q28, [x0, #448] +mla z24.d, P0/M, z27.d, z31.d +sub z27.d, z28.d, z19.d +add z28.d, z28.d, z19.d +sqrdmulh z19.d, z26.d, z0.d[1] +str q22, [x0, #1968] +mul z26.d, z26.d,z1.d[1] +mla z29.d, P0/M, z25.d, z31.d +sub z25.d, z28.d, z24.d +add z28.d, z28.d, z24.d +sqrdmulh z24.d, z20.d, z0.d[1] +str q18, [x0, #1200] +mul z20.d, z20.d,z1.d[1] +mla z26.d, P0/M, z19.d, z31.d +sub z19.d, z30.d, z29.d +add z30.d, z30.d, z29.d +sqrdmulh z29.d, z28.d, z14.d[0] +str q23, [x0, #1456] +mul z28.d, z28.d,z15.d[0] +mla z20.d, P0/M, z24.d, z31.d +sub z24.d, z27.d, z26.d +add z27.d, z27.d, z26.d +sqrdmulh z26.d, z25.d, z14.d[1] +mul z25.d, z25.d,z15.d[1] +mla z28.d, P0/M, z29.d, z31.d +sub z29.d, z21.d, z20.d +add z21.d, z21.d, z20.d +sqrdmulh z20.d, z24.d, z12.d[1] +mul z24.d, z24.d,z13.d[1] +ldr q23, [x0, #2000] +mla z25.d, P0/M, z26.d, z31.d +sub z26.d, z30.d, z28.d +add z30.d, z30.d, z28.d +sqrdmulh z28.d, z27.d, z12.d[0] +mul z27.d, z27.d,z13.d[0] +ldr q18, [x0, #1744] +mla z24.d, P0/M, z20.d, z31.d +sub z20.d, z19.d, z25.d +add z19.d, z19.d, z25.d +sqrdmulh z25.d, z23.d, z2.d[0] +mul z23.d, z23.d,z3.d[0] +ldr q22, [x0, #1232] +mla z27.d, P0/M, z28.d, z31.d +sub z28.d, z29.d, z24.d +add z29.d, z29.d, z24.d +sqrdmulh z24.d, z18.d, z2.d[0] +str q30, [x0, #192] +mul z18.d, z18.d,z3.d[0] +ldr q30, [x0, #1488] +mla z23.d, P0/M, z25.d, z31.d +sub z25.d, z21.d, z27.d +add z21.d, z21.d, z27.d +sqrdmulh z27.d, z22.d, z2.d[0] +str q26, [x0, #448] +mul z22.d, z22.d,z3.d[0] +ldr q26, [x0, #976] +mla z18.d, P0/M, z24.d, z31.d +sub z24.d, z26.d, z23.d +add z26.d, z26.d, 
z23.d +sqrdmulh z23.d, z30.d, z2.d[0] +str q19, [x0, #704] +mul z30.d, z30.d,z3.d[0] +ldr q19, [x0, #720] +mla z22.d, P0/M, z27.d, z31.d +sub z27.d, z19.d, z18.d +add z19.d, z19.d, z18.d +sqrdmulh z18.d, z26.d, z0.d[0] +str q20, [x0, #960] +mul z26.d, z26.d,z1.d[0] +ldr q20, [x0, #208] +mla z30.d, P0/M, z23.d, z31.d +sub z23.d, z20.d, z22.d +add z20.d, z20.d, z22.d +sqrdmulh z22.d, z19.d, z0.d[0] +str q29, [x0, #1728] +mul z19.d, z19.d,z1.d[0] +ldr q29, [x0, #464] +mla z26.d, P0/M, z18.d, z31.d +sub z18.d, z29.d, z30.d +add z29.d, z29.d, z30.d +sqrdmulh z30.d, z24.d, z0.d[1] +str q28, [x0, #1984] +mul z24.d, z24.d,z1.d[1] +mla z19.d, P0/M, z22.d, z31.d +sub z22.d, z29.d, z26.d +add z29.d, z29.d, z26.d +sqrdmulh z26.d, z27.d, z0.d[1] +str q21, [x0, #1216] +mul z27.d, z27.d,z1.d[1] +mla z24.d, P0/M, z30.d, z31.d +sub z30.d, z20.d, z19.d +add z20.d, z20.d, z19.d +sqrdmulh z19.d, z29.d, z14.d[0] +str q25, [x0, #1472] +mul z29.d, z29.d,z15.d[0] +mla z27.d, P0/M, z26.d, z31.d +sub z26.d, z18.d, z24.d +add z18.d, z18.d, z24.d +sqrdmulh z24.d, z22.d, z14.d[1] +mul z22.d, z22.d,z15.d[1] +mla z29.d, P0/M, z19.d, z31.d +sub z19.d, z23.d, z27.d +add z23.d, z23.d, z27.d +sqrdmulh z27.d, z26.d, z12.d[1] +mul z26.d, z26.d,z13.d[1] +ldr q25, [x0, #2016] +mla z22.d, P0/M, z24.d, z31.d +sub z24.d, z20.d, z29.d +add z20.d, z20.d, z29.d +sqrdmulh z29.d, z18.d, z12.d[0] +mul z18.d, z18.d,z13.d[0] +ldr q21, [x0, #1760] +mla z26.d, P0/M, z27.d, z31.d +sub z27.d, z30.d, z22.d +add z30.d, z30.d, z22.d +sqrdmulh z22.d, z25.d, z2.d[0] +mul z25.d, z25.d,z3.d[0] +ldr q28, [x0, #1248] +mla z18.d, P0/M, z29.d, z31.d +sub z29.d, z19.d, z26.d +add z19.d, z19.d, z26.d +sqrdmulh z26.d, z21.d, z2.d[0] +str q20, [x0, #208] +mul z21.d, z21.d,z3.d[0] +ldr q20, [x0, #1504] +mla z25.d, P0/M, z22.d, z31.d +sub z22.d, z23.d, z18.d +add z23.d, z23.d, z18.d +sqrdmulh z18.d, z28.d, z2.d[0] +str q24, [x0, #464] +mul z28.d, z28.d,z3.d[0] +ldr q24, [x0, #992] +mla z21.d, P0/M, z26.d, z31.d +sub z26.d, z24.d, 
z25.d +add z24.d, z24.d, z25.d +sqrdmulh z25.d, z20.d, z2.d[0] +str q30, [x0, #720] +mul z20.d, z20.d,z3.d[0] +ldr q30, [x0, #736] +mla z28.d, P0/M, z18.d, z31.d +sub z18.d, z30.d, z21.d +add z30.d, z30.d, z21.d +sqrdmulh z21.d, z24.d, z0.d[0] +str q27, [x0, #976] +mul z24.d, z24.d,z1.d[0] +ldr q27, [x0, #224] +mla z20.d, P0/M, z25.d, z31.d +sub z25.d, z27.d, z28.d +add z27.d, z27.d, z28.d +sqrdmulh z28.d, z30.d, z0.d[0] +str q19, [x0, #1744] +mul z30.d, z30.d,z1.d[0] +ldr q19, [x0, #480] +mla z24.d, P0/M, z21.d, z31.d +sub z21.d, z19.d, z20.d +add z19.d, z19.d, z20.d +sqrdmulh z20.d, z26.d, z0.d[1] +str q29, [x0, #2000] +mul z26.d, z26.d,z1.d[1] +mla z30.d, P0/M, z28.d, z31.d +sub z28.d, z19.d, z24.d +add z19.d, z19.d, z24.d +sqrdmulh z24.d, z18.d, z0.d[1] +str q23, [x0, #1232] +mul z18.d, z18.d,z1.d[1] +mla z26.d, P0/M, z20.d, z31.d +sub z20.d, z27.d, z30.d +add z27.d, z27.d, z30.d +sqrdmulh z30.d, z19.d, z14.d[0] +str q22, [x0, #1488] +mul z19.d, z19.d,z15.d[0] +mla z18.d, P0/M, z24.d, z31.d +sub z24.d, z21.d, z26.d +add z21.d, z21.d, z26.d +sqrdmulh z26.d, z28.d, z14.d[1] +mul z28.d, z28.d,z15.d[1] +mla z19.d, P0/M, z30.d, z31.d +sub z30.d, z25.d, z18.d +add z25.d, z25.d, z18.d +sqrdmulh z18.d, z24.d, z12.d[1] +mul z24.d, z24.d,z13.d[1] +ldr q22, [x0, #2032] +mla z28.d, P0/M, z26.d, z31.d +sub z26.d, z27.d, z19.d +add z27.d, z27.d, z19.d +sqrdmulh z19.d, z21.d, z12.d[0] +mul z21.d, z21.d,z13.d[0] +ldr q23, [x0, #1776] +mla z24.d, P0/M, z18.d, z31.d +sub z18.d, z20.d, z28.d +add z20.d, z20.d, z28.d +sqrdmulh z28.d, z22.d, z2.d[0] +mul z22.d, z22.d,z3.d[0] +ldr q29, [x0, #1264] +mla z21.d, P0/M, z19.d, z31.d +sub z19.d, z30.d, z24.d +add z30.d, z30.d, z24.d +sqrdmulh z24.d, z23.d, z2.d[0] +str q27, [x0, #224] +mul z23.d, z23.d,z3.d[0] +ldr q27, [x0, #1520] +mla z22.d, P0/M, z28.d, z31.d +sub z28.d, z25.d, z21.d +add z25.d, z25.d, z21.d +sqrdmulh z21.d, z29.d, z2.d[0] +str q26, [x0, #480] +mul z29.d, z29.d,z3.d[0] +ldr q26, [x0, #1008] +mla z23.d, P0/M, z24.d, 
z31.d +sub z24.d, z26.d, z22.d +add z26.d, z26.d, z22.d +sqrdmulh z22.d, z27.d, z2.d[0] +str q20, [x0, #736] +mul z27.d, z27.d,z3.d[0] +ldr q20, [x0, #752] +mla z29.d, P0/M, z21.d, z31.d +sub z21.d, z20.d, z23.d +add z20.d, z20.d, z23.d +sqrdmulh z23.d, z26.d, z0.d[0] +str q18, [x0, #992] +mul z26.d, z26.d,z1.d[0] +ldr q18, [x0, #240] +mla z27.d, P0/M, z22.d, z31.d +sub z22.d, z18.d, z29.d +add z18.d, z18.d, z29.d +sqrdmulh z29.d, z20.d, z0.d[0] +str q30, [x0, #1760] +mul z20.d, z20.d,z1.d[0] +ldr q30, [x0, #496] +mla z26.d, P0/M, z23.d, z31.d +sub z23.d, z30.d, z27.d +add z30.d, z30.d, z27.d +sqrdmulh z27.d, z24.d, z0.d[1] +str q19, [x0, #2016] +mul z24.d, z24.d,z1.d[1] +mla z20.d, P0/M, z29.d, z31.d +sub z29.d, z30.d, z26.d +add z30.d, z30.d, z26.d +sqrdmulh z26.d, z21.d, z0.d[1] +str q25, [x0, #1248] +mul z21.d, z21.d,z1.d[1] +mla z24.d, P0/M, z27.d, z31.d +sub z27.d, z18.d, z20.d +add z18.d, z18.d, z20.d +sqrdmulh z20.d, z30.d, z14.d[0] +str q28, [x0, #1504] +mul z30.d, z30.d,z15.d[0] +mla z21.d, P0/M, z26.d, z31.d +sub z26.d, z23.d, z24.d +add z23.d, z23.d, z24.d +sqrdmulh z24.d, z29.d, z14.d[1] +mul z29.d, z29.d,z15.d[1] +mla z30.d, P0/M, z20.d, z31.d +sub z20.d, z22.d, z21.d +add z22.d, z22.d, z21.d +sqrdmulh z21.d, z26.d, z12.d[1] +mul z26.d, z26.d,z13.d[1] +ldr q28, [x0, #1792] +mla z29.d, P0/M, z24.d, z31.d +sub z24.d, z18.d, z30.d +add z18.d, z18.d, z30.d +sqrdmulh z30.d, z23.d, z12.d[0] +mul z23.d, z23.d,z13.d[0] +ldr q25, [x0, #1536] +mla z26.d, P0/M, z21.d, z31.d +sub z21.d, z27.d, z29.d +add z27.d, z27.d, z29.d +sqrdmulh z29.d, z28.d, z2.d[0] +mul z28.d, z28.d,z3.d[0] +ldr q19, [x0, #1024] +mla z23.d, P0/M, z30.d, z31.d +sub z30.d, z20.d, z26.d +add z20.d, z20.d, z26.d +sqrdmulh z26.d, z25.d, z2.d[0] +str q18, [x0, #240] +mul z25.d, z25.d,z3.d[0] +ldr q18, [x0, #1280] +mla z28.d, P0/M, z29.d, z31.d +sub z29.d, z22.d, z23.d +add z22.d, z22.d, z23.d +sqrdmulh z23.d, z19.d, z2.d[0] +str q24, [x0, #496] +mul z19.d, z19.d,z3.d[0] +ldr q24, [x0, #768] +mla 
z25.d, P0/M, z26.d, z31.d +sub z26.d, z24.d, z28.d +add z24.d, z24.d, z28.d +sqrdmulh z28.d, z18.d, z2.d[0] +str q27, [x0, #752] +mul z18.d, z18.d,z3.d[0] +ldr q27, [x0, #512] +mla z19.d, P0/M, z23.d, z31.d +sub z23.d, z27.d, z25.d +add z27.d, z27.d, z25.d +sqrdmulh z25.d, z24.d, z0.d[0] +str q21, [x0, #1008] +mul z24.d, z24.d,z1.d[0] +ldr q21, [x0, #0] +mla z18.d, P0/M, z28.d, z31.d +sub z28.d, z21.d, z19.d +add z21.d, z21.d, z19.d +sqrdmulh z19.d, z27.d, z0.d[0] +str q20, [x0, #1776] +mul z27.d, z27.d,z1.d[0] +ldr q20, [x0, #256] +mla z24.d, P0/M, z25.d, z31.d +sub z25.d, z20.d, z18.d +add z20.d, z20.d, z18.d +sqrdmulh z18.d, z26.d, z0.d[1] +str q30, [x0, #2032] +mul z26.d, z26.d,z1.d[1] +mla z27.d, P0/M, z19.d, z31.d +sub z19.d, z20.d, z24.d +add z20.d, z20.d, z24.d +sqrdmulh z24.d, z23.d, z0.d[1] +str q22, [x0, #1264] +mul z23.d, z23.d,z1.d[1] +mla z26.d, P0/M, z18.d, z31.d +sub z18.d, z21.d, z27.d +add z21.d, z21.d, z27.d +sqrdmulh z27.d, z20.d, z14.d[0] +str q29, [x0, #1520] +mul z20.d, z20.d,z15.d[0] +mla z23.d, P0/M, z24.d, z31.d +sub z24.d, z25.d, z26.d +add z25.d, z25.d, z26.d +sqrdmulh z26.d, z19.d, z14.d[1] +mul z19.d, z19.d,z15.d[1] +mla z20.d, P0/M, z27.d, z31.d +sub z27.d, z28.d, z23.d +add z28.d, z28.d, z23.d +sqrdmulh z23.d, z24.d, z12.d[1] +mul z24.d, z24.d,z13.d[1] +ldr q29, [x0, #1808] +mla z19.d, P0/M, z26.d, z31.d +sub z26.d, z21.d, z20.d +add z21.d, z21.d, z20.d +sqrdmulh z20.d, z25.d, z12.d[0] +mul z25.d, z25.d,z13.d[0] +ldr q22, [x0, #1552] +mla z24.d, P0/M, z23.d, z31.d +sub z23.d, z18.d, z19.d +add z18.d, z18.d, z19.d +sqrdmulh z19.d, z29.d, z2.d[0] +mul z29.d, z29.d,z3.d[0] +ldr q30, [x0, #1040] +mla z25.d, P0/M, z20.d, z31.d +sub z20.d, z27.d, z24.d +add z27.d, z27.d, z24.d +sqrdmulh z24.d, z22.d, z2.d[0] +str q21, [x0, #0] +mul z22.d, z22.d,z3.d[0] +ldr q21, [x0, #1296] +mla z29.d, P0/M, z19.d, z31.d +sub z19.d, z28.d, z25.d +add z28.d, z28.d, z25.d +sqrdmulh z25.d, z30.d, z2.d[0] +str q26, [x0, #256] +mul z30.d, z30.d,z3.d[0] +ldr 
q26, [x0, #784] +mla z22.d, P0/M, z24.d, z31.d +sub z24.d, z26.d, z29.d +add z26.d, z26.d, z29.d +sqrdmulh z29.d, z21.d, z2.d[0] +str q18, [x0, #512] +mul z21.d, z21.d,z3.d[0] +ldr q18, [x0, #528] +mla z30.d, P0/M, z25.d, z31.d +sub z25.d, z18.d, z22.d +add z18.d, z18.d, z22.d +sqrdmulh z22.d, z26.d, z0.d[0] +str q23, [x0, #768] +mul z26.d, z26.d,z1.d[0] +ldr q23, [x0, #16] +mla z21.d, P0/M, z29.d, z31.d +sub z29.d, z23.d, z30.d +add z23.d, z23.d, z30.d +sqrdmulh z30.d, z18.d, z0.d[0] +str q27, [x0, #1536] +mul z18.d, z18.d,z1.d[0] +ldr q27, [x0, #272] +mla z26.d, P0/M, z22.d, z31.d +sub z22.d, z27.d, z21.d +add z27.d, z27.d, z21.d +sqrdmulh z21.d, z24.d, z0.d[1] +str q20, [x0, #1792] +mul z24.d, z24.d,z1.d[1] +mla z18.d, P0/M, z30.d, z31.d +sub z30.d, z27.d, z26.d +add z27.d, z27.d, z26.d +sqrdmulh z26.d, z25.d, z0.d[1] +str q28, [x0, #1024] +mul z25.d, z25.d,z1.d[1] +mla z24.d, P0/M, z21.d, z31.d +sub z21.d, z23.d, z18.d +add z23.d, z23.d, z18.d +sqrdmulh z18.d, z27.d, z14.d[0] +str q19, [x0, #1280] +mul z27.d, z27.d,z15.d[0] +mla z25.d, P0/M, z26.d, z31.d +sub z26.d, z22.d, z24.d +add z22.d, z22.d, z24.d +sqrdmulh z24.d, z30.d, z14.d[1] +mul z30.d, z30.d,z15.d[1] +mla z27.d, P0/M, z18.d, z31.d +sub z18.d, z29.d, z25.d +add z29.d, z29.d, z25.d +sqrdmulh z25.d, z26.d, z12.d[1] +mul z26.d, z26.d,z13.d[1] +ldr q19, [x0, #1824] +mla z30.d, P0/M, z24.d, z31.d +sub z24.d, z23.d, z27.d +add z23.d, z23.d, z27.d +sqrdmulh z27.d, z22.d, z12.d[0] +mul z22.d, z22.d,z13.d[0] +ldr q28, [x0, #1568] +mla z26.d, P0/M, z25.d, z31.d +sub z25.d, z21.d, z30.d +add z21.d, z21.d, z30.d +sqrdmulh z30.d, z19.d, z2.d[0] +mul z19.d, z19.d,z3.d[0] +ldr q20, [x0, #1056] +mla z22.d, P0/M, z27.d, z31.d +sub z27.d, z18.d, z26.d +add z18.d, z18.d, z26.d +sqrdmulh z26.d, z28.d, z2.d[0] +str q23, [x0, #16] +mul z28.d, z28.d,z3.d[0] +ldr q23, [x0, #1312] +mla z19.d, P0/M, z30.d, z31.d +sub z30.d, z29.d, z22.d +add z29.d, z29.d, z22.d +sqrdmulh z22.d, z20.d, z2.d[0] +str q24, [x0, #272] +mul z20.d, 
z20.d,z3.d[0] +ldr q24, [x0, #800] +mla z28.d, P0/M, z26.d, z31.d +sub z26.d, z24.d, z19.d +add z24.d, z24.d, z19.d +sqrdmulh z19.d, z23.d, z2.d[0] +str q21, [x0, #528] +mul z23.d, z23.d,z3.d[0] +ldr q21, [x0, #544] +mla z20.d, P0/M, z22.d, z31.d +sub z22.d, z21.d, z28.d +add z21.d, z21.d, z28.d +sqrdmulh z28.d, z24.d, z0.d[0] +str q25, [x0, #784] +mul z24.d, z24.d,z1.d[0] +ldr q25, [x0, #32] +mla z23.d, P0/M, z19.d, z31.d +sub z19.d, z25.d, z20.d +add z25.d, z25.d, z20.d +sqrdmulh z20.d, z21.d, z0.d[0] +str q18, [x0, #1552] +mul z21.d, z21.d,z1.d[0] +ldr q18, [x0, #288] +mla z24.d, P0/M, z28.d, z31.d +sub z28.d, z18.d, z23.d +add z18.d, z18.d, z23.d +sqrdmulh z23.d, z26.d, z0.d[1] +str q27, [x0, #1808] +mul z26.d, z26.d,z1.d[1] +mla z21.d, P0/M, z20.d, z31.d +sub z20.d, z18.d, z24.d +add z18.d, z18.d, z24.d +sqrdmulh z24.d, z22.d, z0.d[1] +str q29, [x0, #1040] +mul z22.d, z22.d,z1.d[1] +mla z26.d, P0/M, z23.d, z31.d +sub z23.d, z25.d, z21.d +add z25.d, z25.d, z21.d +sqrdmulh z21.d, z18.d, z14.d[0] +str q30, [x0, #1296] +mul z18.d, z18.d,z15.d[0] +mla z22.d, P0/M, z24.d, z31.d +sub z24.d, z28.d, z26.d +add z28.d, z28.d, z26.d +sqrdmulh z26.d, z20.d, z14.d[1] +mul z20.d, z20.d,z15.d[1] +mla z18.d, P0/M, z21.d, z31.d +sub z21.d, z19.d, z22.d +add z19.d, z19.d, z22.d +sqrdmulh z22.d, z24.d, z12.d[1] +mul z24.d, z24.d,z13.d[1] +ldr q30, [x0, #1840] +mla z20.d, P0/M, z26.d, z31.d +sub z26.d, z25.d, z18.d +add z25.d, z25.d, z18.d +sqrdmulh z18.d, z28.d, z12.d[0] +mul z28.d, z28.d,z13.d[0] +ldr q29, [x0, #1584] +mla z24.d, P0/M, z22.d, z31.d +sub z22.d, z23.d, z20.d +add z23.d, z23.d, z20.d +sqrdmulh z20.d, z30.d, z2.d[0] +mul z30.d, z30.d,z3.d[0] +ldr q27, [x0, #1072] +mla z28.d, P0/M, z18.d, z31.d +sub z18.d, z21.d, z24.d +add z21.d, z21.d, z24.d +sqrdmulh z24.d, z29.d, z2.d[0] +str q25, [x0, #32] +mul z29.d, z29.d,z3.d[0] +ldr q25, [x0, #1328] +mla z30.d, P0/M, z20.d, z31.d +sub z20.d, z19.d, z28.d +add z19.d, z19.d, z28.d +sqrdmulh z28.d, z27.d, z2.d[0] +str q26, [x0, 
#288] +mul z27.d, z27.d,z3.d[0] +ldr q26, [x0, #816] +mla z29.d, P0/M, z24.d, z31.d +sub z24.d, z26.d, z30.d +add z26.d, z26.d, z30.d +sqrdmulh z30.d, z25.d, z2.d[0] +str q23, [x0, #544] +mul z25.d, z25.d,z3.d[0] +ldr q23, [x0, #560] +mla z27.d, P0/M, z28.d, z31.d +sub z28.d, z23.d, z29.d +add z23.d, z23.d, z29.d +sqrdmulh z29.d, z26.d, z0.d[0] +str q22, [x0, #800] +mul z26.d, z26.d,z1.d[0] +ldr q22, [x0, #48] +mla z25.d, P0/M, z30.d, z31.d +sub z30.d, z22.d, z27.d +add z22.d, z22.d, z27.d +sqrdmulh z27.d, z23.d, z0.d[0] +str q21, [x0, #1568] +mul z23.d, z23.d,z1.d[0] +ldr q21, [x0, #304] +mla z26.d, P0/M, z29.d, z31.d +sub z29.d, z21.d, z25.d +add z21.d, z21.d, z25.d +sqrdmulh z25.d, z24.d, z0.d[1] +str q18, [x0, #1824] +mul z24.d, z24.d,z1.d[1] +mla z23.d, P0/M, z27.d, z31.d +sub z27.d, z21.d, z26.d +add z21.d, z21.d, z26.d +sqrdmulh z26.d, z28.d, z0.d[1] +str q19, [x0, #1056] +mul z28.d, z28.d,z1.d[1] +mla z24.d, P0/M, z25.d, z31.d +sub z25.d, z22.d, z23.d +add z22.d, z22.d, z23.d +sqrdmulh z23.d, z21.d, z14.d[0] +str q20, [x0, #1312] +mul z21.d, z21.d,z15.d[0] +mla z28.d, P0/M, z26.d, z31.d +sub z26.d, z29.d, z24.d +add z29.d, z29.d, z24.d +sqrdmulh z24.d, z27.d, z14.d[1] +mul z27.d, z27.d,z15.d[1] +mla z21.d, P0/M, z23.d, z31.d +sub z23.d, z30.d, z28.d +add z30.d, z30.d, z28.d +sqrdmulh z28.d, z26.d, z12.d[1] +mul z26.d, z26.d,z13.d[1] +ldr q20, [x0, #1856] +mla z27.d, P0/M, z24.d, z31.d +sub z24.d, z22.d, z21.d +add z22.d, z22.d, z21.d +sqrdmulh z21.d, z29.d, z12.d[0] +mul z29.d, z29.d,z13.d[0] +ldr q19, [x0, #1600] +mla z26.d, P0/M, z28.d, z31.d +sub z28.d, z25.d, z27.d +add z25.d, z25.d, z27.d +sqrdmulh z27.d, z20.d, z2.d[0] +mul z20.d, z20.d,z3.d[0] +ldr q18, [x0, #1088] +mla z29.d, P0/M, z21.d, z31.d +sub z21.d, z23.d, z26.d +add z23.d, z23.d, z26.d +sqrdmulh z26.d, z19.d, z2.d[0] +str q22, [x0, #48] +mul z19.d, z19.d,z3.d[0] +ldr q22, [x0, #1344] +mla z20.d, P0/M, z27.d, z31.d +sub z27.d, z30.d, z29.d +add z30.d, z30.d, z29.d +sqrdmulh z29.d, z18.d, 
z2.d[0] +str q24, [x0, #304] +mul z18.d, z18.d,z3.d[0] +ldr q24, [x0, #832] +mla z19.d, P0/M, z26.d, z31.d +sub z26.d, z24.d, z20.d +add z24.d, z24.d, z20.d +sqrdmulh z20.d, z22.d, z2.d[0] +str q25, [x0, #560] +mul z22.d, z22.d,z3.d[0] +ldr q25, [x0, #576] +mla z18.d, P0/M, z29.d, z31.d +sub z29.d, z25.d, z19.d +add z25.d, z25.d, z19.d +sqrdmulh z19.d, z24.d, z0.d[0] +str q28, [x0, #816] +mul z24.d, z24.d,z1.d[0] +ldr q28, [x0, #64] +mla z22.d, P0/M, z20.d, z31.d +sub z20.d, z28.d, z18.d +add z28.d, z28.d, z18.d +sqrdmulh z18.d, z25.d, z0.d[0] +str q23, [x0, #1584] +mul z25.d, z25.d,z1.d[0] +ldr q23, [x0, #320] +mla z24.d, P0/M, z19.d, z31.d +sub z19.d, z23.d, z22.d +add z23.d, z23.d, z22.d +sqrdmulh z22.d, z26.d, z0.d[1] +str q21, [x0, #1840] +mul z26.d, z26.d,z1.d[1] +mla z25.d, P0/M, z18.d, z31.d +sub z18.d, z23.d, z24.d +add z23.d, z23.d, z24.d +sqrdmulh z24.d, z29.d, z0.d[1] +str q30, [x0, #1072] +mul z29.d, z29.d,z1.d[1] +mla z26.d, P0/M, z22.d, z31.d +sub z22.d, z28.d, z25.d +add z28.d, z28.d, z25.d +sqrdmulh z25.d, z23.d, z14.d[0] +str q27, [x0, #1328] +mul z23.d, z23.d,z15.d[0] +mla z29.d, P0/M, z24.d, z31.d +sub z24.d, z19.d, z26.d +add z19.d, z19.d, z26.d +sqrdmulh z26.d, z18.d, z14.d[1] +mul z18.d, z18.d,z15.d[1] +mla z23.d, P0/M, z25.d, z31.d +sub z25.d, z20.d, z29.d +add z20.d, z20.d, z29.d +sqrdmulh z29.d, z24.d, z12.d[1] +mul z24.d, z24.d,z13.d[1] +ldr q27, [x0, #1872] +mla z18.d, P0/M, z26.d, z31.d +sub z26.d, z28.d, z23.d +add z28.d, z28.d, z23.d +sqrdmulh z23.d, z19.d, z12.d[0] +mul z19.d, z19.d,z13.d[0] +ldr q30, [x0, #1616] +mla z24.d, P0/M, z29.d, z31.d +sub z29.d, z22.d, z18.d +add z22.d, z22.d, z18.d +sqrdmulh z18.d, z27.d, z2.d[0] +mul z27.d, z27.d,z3.d[0] +ldr q21, [x0, #1104] +mla z19.d, P0/M, z23.d, z31.d +sub z23.d, z25.d, z24.d +add z25.d, z25.d, z24.d +sqrdmulh z24.d, z30.d, z2.d[0] +str q28, [x0, #64] +mul z30.d, z30.d,z3.d[0] +ldr q28, [x0, #1360] +mla z27.d, P0/M, z18.d, z31.d +sub z18.d, z20.d, z19.d +add z20.d, z20.d, z19.d 
+sqrdmulh z19.d, z21.d, z2.d[0] +str q26, [x0, #320] +mul z21.d, z21.d,z3.d[0] +ldr q26, [x0, #848] +mla z30.d, P0/M, z24.d, z31.d +sub z24.d, z26.d, z27.d +add z26.d, z26.d, z27.d +sqrdmulh z27.d, z28.d, z2.d[0] +str q22, [x0, #576] +mul z28.d, z28.d,z3.d[0] +ldr q22, [x0, #592] +mla z21.d, P0/M, z19.d, z31.d +sub z19.d, z22.d, z30.d +add z22.d, z22.d, z30.d +sqrdmulh z30.d, z26.d, z0.d[0] +str q29, [x0, #832] +mul z26.d, z26.d,z1.d[0] +ldr q29, [x0, #80] +mla z28.d, P0/M, z27.d, z31.d +sub z27.d, z29.d, z21.d +add z29.d, z29.d, z21.d +sqrdmulh z21.d, z22.d, z0.d[0] +str q25, [x0, #1600] +mul z22.d, z22.d,z1.d[0] +ldr q25, [x0, #336] +mla z26.d, P0/M, z30.d, z31.d +sub z30.d, z25.d, z28.d +add z25.d, z25.d, z28.d +sqrdmulh z28.d, z24.d, z0.d[1] +str q23, [x0, #1856] +mul z24.d, z24.d,z1.d[1] +mla z22.d, P0/M, z21.d, z31.d +sub z21.d, z25.d, z26.d +add z25.d, z25.d, z26.d +sqrdmulh z26.d, z19.d, z0.d[1] +str q20, [x0, #1088] +mul z19.d, z19.d,z1.d[1] +mla z24.d, P0/M, z28.d, z31.d +sub z28.d, z29.d, z22.d +add z29.d, z29.d, z22.d +sqrdmulh z22.d, z25.d, z14.d[0] +str q18, [x0, #1344] +mul z25.d, z25.d,z15.d[0] +mla z19.d, P0/M, z26.d, z31.d +sub z26.d, z30.d, z24.d +add z30.d, z30.d, z24.d +sqrdmulh z24.d, z21.d, z14.d[1] +mul z21.d, z21.d,z15.d[1] +mla z25.d, P0/M, z22.d, z31.d +sub z22.d, z27.d, z19.d +add z27.d, z27.d, z19.d +sqrdmulh z19.d, z26.d, z12.d[1] +mul z26.d, z26.d,z13.d[1] +ldr q18, [x0, #1888] +mla z21.d, P0/M, z24.d, z31.d +sub z24.d, z29.d, z25.d +add z29.d, z29.d, z25.d +sqrdmulh z25.d, z30.d, z12.d[0] +mul z30.d, z30.d,z13.d[0] +ldr q20, [x0, #1632] +mla z26.d, P0/M, z19.d, z31.d +sub z19.d, z28.d, z21.d +add z28.d, z28.d, z21.d +sqrdmulh z21.d, z18.d, z2.d[0] +mul z18.d, z18.d,z3.d[0] +ldr q23, [x0, #1120] +mla z30.d, P0/M, z25.d, z31.d +sub z25.d, z22.d, z26.d +add z22.d, z22.d, z26.d +sqrdmulh z26.d, z20.d, z2.d[0] +str q29, [x0, #80] +mul z20.d, z20.d,z3.d[0] +ldr q29, [x0, #1376] +mla z18.d, P0/M, z21.d, z31.d +sub z21.d, z27.d, z30.d +add 
z27.d, z27.d, z30.d +sqrdmulh z30.d, z23.d, z2.d[0] +str q24, [x0, #336] +mul z23.d, z23.d,z3.d[0] +ldr q24, [x0, #864] +mla z20.d, P0/M, z26.d, z31.d +sub z26.d, z24.d, z18.d +add z24.d, z24.d, z18.d +sqrdmulh z18.d, z29.d, z2.d[0] +str q28, [x0, #592] +mul z29.d, z29.d,z3.d[0] +ldr q28, [x0, #608] +mla z23.d, P0/M, z30.d, z31.d +sub z30.d, z28.d, z20.d +add z28.d, z28.d, z20.d +sqrdmulh z20.d, z24.d, z0.d[0] +str q19, [x0, #848] +mul z24.d, z24.d,z1.d[0] +ldr q19, [x0, #96] +mla z29.d, P0/M, z18.d, z31.d +sub z18.d, z19.d, z23.d +add z19.d, z19.d, z23.d +sqrdmulh z23.d, z28.d, z0.d[0] +str q22, [x0, #1616] +mul z28.d, z28.d,z1.d[0] +ldr q22, [x0, #352] +mla z24.d, P0/M, z20.d, z31.d +sub z20.d, z22.d, z29.d +add z22.d, z22.d, z29.d +sqrdmulh z29.d, z26.d, z0.d[1] +str q25, [x0, #1872] +mul z26.d, z26.d,z1.d[1] +mla z28.d, P0/M, z23.d, z31.d +sub z23.d, z22.d, z24.d +add z22.d, z22.d, z24.d +sqrdmulh z24.d, z30.d, z0.d[1] +str q27, [x0, #1104] +mul z30.d, z30.d,z1.d[1] +mla z26.d, P0/M, z29.d, z31.d +sub z29.d, z19.d, z28.d +add z19.d, z19.d, z28.d +sqrdmulh z28.d, z22.d, z14.d[0] +str q21, [x0, #1360] +mul z22.d, z22.d,z15.d[0] +mla z30.d, P0/M, z24.d, z31.d +sub z24.d, z20.d, z26.d +add z20.d, z20.d, z26.d +sqrdmulh z26.d, z23.d, z14.d[1] +mul z23.d, z23.d,z15.d[1] +mla z22.d, P0/M, z28.d, z31.d +sub z28.d, z18.d, z30.d +add z18.d, z18.d, z30.d +sqrdmulh z30.d, z24.d, z12.d[1] +mul z24.d, z24.d,z13.d[1] +ldr q21, [x0, #1904] +mla z23.d, P0/M, z26.d, z31.d +sub z26.d, z19.d, z22.d +add z19.d, z19.d, z22.d +sqrdmulh z22.d, z20.d, z12.d[0] +mul z20.d, z20.d,z13.d[0] +ldr q27, [x0, #1648] +mla z24.d, P0/M, z30.d, z31.d +sub z30.d, z29.d, z23.d +add z29.d, z29.d, z23.d +sqrdmulh z23.d, z21.d, z2.d[0] +mul z21.d, z21.d,z3.d[0] +ldr q25, [x0, #1136] +mla z20.d, P0/M, z22.d, z31.d +sub z22.d, z28.d, z24.d +add z28.d, z28.d, z24.d +sqrdmulh z24.d, z27.d, z2.d[0] +str q19, [x0, #96] +mul z27.d, z27.d,z3.d[0] +ldr q19, [x0, #1392] +mla z21.d, P0/M, z23.d, z31.d +sub z23.d, 
z18.d, z20.d +add z18.d, z18.d, z20.d +sqrdmulh z20.d, z25.d, z2.d[0] +str q26, [x0, #352] +mul z25.d, z25.d,z3.d[0] +ldr q26, [x0, #880] +mla z27.d, P0/M, z24.d, z31.d +sub z24.d, z26.d, z21.d +add z26.d, z26.d, z21.d +sqrdmulh z21.d, z19.d, z2.d[0] +str q29, [x0, #608] +mul z19.d, z19.d,z3.d[0] +ldr q29, [x0, #624] +mla z25.d, P0/M, z20.d, z31.d +sub z20.d, z29.d, z27.d +add z29.d, z29.d, z27.d +sqrdmulh z27.d, z26.d, z0.d[0] +str q30, [x0, #864] +mul z26.d, z26.d,z1.d[0] +ldr q30, [x0, #112] +mla z19.d, P0/M, z21.d, z31.d +sub z21.d, z30.d, z25.d +add z30.d, z30.d, z25.d +sqrdmulh z25.d, z29.d, z0.d[0] +str q28, [x0, #1632] +mul z29.d, z29.d,z1.d[0] +ldr q28, [x0, #368] +mla z26.d, P0/M, z27.d, z31.d +sub z27.d, z28.d, z19.d +add z28.d, z28.d, z19.d +sqrdmulh z19.d, z24.d, z0.d[1] +str q22, [x0, #1888] +mul z24.d, z24.d,z1.d[1] +mla z29.d, P0/M, z25.d, z31.d +sub z25.d, z28.d, z26.d +add z28.d, z28.d, z26.d +sqrdmulh z26.d, z20.d, z0.d[1] +str q18, [x0, #1120] +mul z20.d, z20.d,z1.d[1] +mla z24.d, P0/M, z19.d, z31.d +sub z19.d, z30.d, z29.d +add z30.d, z30.d, z29.d +sqrdmulh z29.d, z28.d, z14.d[0] +str q23, [x0, #1376] +mul z28.d, z28.d,z15.d[0] +mla z20.d, P0/M, z26.d, z31.d +sub z26.d, z27.d, z24.d +add z27.d, z27.d, z24.d +sqrdmulh z24.d, z25.d, z14.d[1] +mul z25.d, z25.d,z15.d[1] +mla z28.d, P0/M, z29.d, z31.d +sub z29.d, z21.d, z20.d +add z21.d, z21.d, z20.d +sqrdmulh z20.d, z26.d, z12.d[1] +mul z26.d, z26.d,z13.d[1] +mla z25.d, P0/M, z24.d, z31.d +sub z24.d, z30.d, z28.d +add z30.d, z30.d, z28.d +sqrdmulh z28.d, z27.d, z12.d[0] +mul z27.d, z27.d,z13.d[0] +mla z26.d, P0/M, z20.d, z31.d +sub z20.d, z19.d, z25.d +add z19.d, z19.d, z25.d +mla z27.d, P0/M, z28.d, z31.d +sub z28.d, z29.d, z26.d +add z29.d, z29.d, z26.d +str q30, [x0, #112] +sub z30.d, z21.d, z27.d +add z21.d, z21.d, z27.d +str q24, [x0, #368] +str q19, [x0, #624] +str q20, [x0, #880] +str q29, [x0, #1648] +str q28, [x0, #1904] +str q21, [x0, #1136] +str q30, [x0, #1392] +ldr q4, [x17, #+128] 
+ldr q5, [x17, #+144] +ldr q6, [x17, #+160] +ldr q7, [x17, #+176] +ldr q8, [x17, #+192] +ldr q9, [x17, #+208] +ldr q10, [x17, #+224] +ldr q11, [x17, #+240] +ldr q16, [x0, #240] +ldr q17, [x0, #208] +sqrdmulh z22.d, z16.d, z5.d[0] +mul z16.d, z16.d,z4.d[0] +ldr q18, [x0, #144] +sqrdmulh z23.d, z17.d, z5.d[0] +mul z17.d, z17.d,z4.d[0] +ldr q25, [x0, #176] +mla z16.d, P0/M, z22.d, z31.d +sqrdmulh z22.d, z18.d, z5.d[0] +mul z18.d, z18.d,z4.d[0] +ldr q26, [x0, #112] +mla z17.d, P0/M, z23.d, z31.d +sub z23.d, z26.d, z16.d +add z26.d, z26.d, z16.d +sqrdmulh z16.d, z25.d, z5.d[0] +mul z25.d, z25.d,z4.d[0] +ldr q27, [x0, #80] +mla z18.d, P0/M, z22.d, z31.d +sub z22.d, z27.d, z17.d +add z27.d, z27.d, z17.d +sqrdmulh z17.d, z26.d, z7.d[0] +mul z26.d, z26.d,z6.d[0] +ldr q24, [x0, #16] +mla z25.d, P0/M, z16.d, z31.d +sub z16.d, z24.d, z18.d +add z24.d, z24.d, z18.d +sqrdmulh z18.d, z27.d, z7.d[0] +mul z27.d, z27.d,z6.d[0] +ldr q19, [x0, #48] +mla z26.d, P0/M, z17.d, z31.d +sub z17.d, z19.d, z25.d +add z19.d, z19.d, z25.d +sqrdmulh z25.d, z23.d, z7.d[1] +mul z23.d, z23.d,z6.d[1] +mla z27.d, P0/M, z18.d, z31.d +sub z18.d, z19.d, z26.d +add z19.d, z19.d, z26.d +sqrdmulh z26.d, z22.d, z7.d[1] +mul z22.d, z22.d,z6.d[1] +mla z23.d, P0/M, z25.d, z31.d +sub z25.d, z24.d, z27.d +add z24.d, z24.d, z27.d +sqrdmulh z27.d, z19.d, z9.d[0] +mul z19.d, z19.d,z8.d[0] +mla z22.d, P0/M, z26.d, z31.d +sub z26.d, z17.d, z23.d +add z17.d, z17.d, z23.d +sqrdmulh z23.d, z18.d, z9.d[1] +mul z18.d, z18.d,z8.d[1] +mla z19.d, P0/M, z27.d, z31.d +sub z27.d, z16.d, z22.d +add z16.d, z16.d, z22.d +sqrdmulh z22.d, z26.d, z11.d[1] +mul z26.d, z26.d,z10.d[1] +ldr q20, [x0, #224] +mla z18.d, P0/M, z23.d, z31.d +sub z23.d, z24.d, z19.d +add z24.d, z24.d, z19.d +sqrdmulh z19.d, z17.d, z11.d[0] +mul z17.d, z17.d,z10.d[0] +ldr q29, [x0, #192] +mla z26.d, P0/M, z22.d, z31.d +sub z22.d, z25.d, z18.d +add z25.d, z25.d, z18.d +sqrdmulh z18.d, z20.d, z5.d[0] +mul z20.d, z20.d,z4.d[0] +ldr q28, [x0, #128] +mla z17.d, 
P0/M, z19.d, z31.d +sub z19.d, z27.d, z26.d +add z27.d, z27.d, z26.d +sqrdmulh z26.d, z29.d, z5.d[0] +str q24, [x0, #16] +mul z29.d, z29.d,z4.d[0] +ldr q24, [x0, #160] +mla z20.d, P0/M, z18.d, z31.d +sub z18.d, z16.d, z17.d +add z16.d, z16.d, z17.d +sqrdmulh z17.d, z28.d, z5.d[0] +str q23, [x0, #48] +mul z28.d, z28.d,z4.d[0] +ldr q23, [x0, #96] +mla z29.d, P0/M, z26.d, z31.d +sub z26.d, z23.d, z20.d +add z23.d, z23.d, z20.d +sqrdmulh z20.d, z24.d, z5.d[0] +str q25, [x0, #80] +mul z24.d, z24.d,z4.d[0] +ldr q25, [x0, #64] +mla z28.d, P0/M, z17.d, z31.d +sub z17.d, z25.d, z29.d +add z25.d, z25.d, z29.d +sqrdmulh z29.d, z23.d, z7.d[0] +str q22, [x0, #112] +mul z23.d, z23.d,z6.d[0] +ldr q22, [x0, #0] +mla z24.d, P0/M, z20.d, z31.d +sub z20.d, z22.d, z28.d +add z22.d, z22.d, z28.d +sqrdmulh z28.d, z25.d, z7.d[0] +str q27, [x0, #208] +mul z25.d, z25.d,z6.d[0] +ldr q27, [x0, #32] +mla z23.d, P0/M, z29.d, z31.d +sub z29.d, z27.d, z24.d +add z27.d, z27.d, z24.d +sqrdmulh z24.d, z26.d, z7.d[1] +str q19, [x0, #240] +mul z26.d, z26.d,z6.d[1] +mla z25.d, P0/M, z28.d, z31.d +sub z28.d, z27.d, z23.d +add z27.d, z27.d, z23.d +sqrdmulh z23.d, z17.d, z7.d[1] +str q16, [x0, #144] +mul z17.d, z17.d,z6.d[1] +mla z26.d, P0/M, z24.d, z31.d +sub z24.d, z22.d, z25.d +add z22.d, z22.d, z25.d +ldr q3, [x17, #+256] +sqrdmulh z25.d, z27.d, z9.d[0] +str q18, [x0, #176] +mul z27.d, z27.d,z8.d[0] +mla z17.d, P0/M, z23.d, z31.d +sub z23.d, z29.d, z26.d +add z29.d, z29.d, z26.d +ldr q2, [x17, #+272] +sqrdmulh z26.d, z28.d, z9.d[1] +mul z28.d, z28.d,z8.d[1] +mla z27.d, P0/M, z25.d, z31.d +sub z25.d, z20.d, z17.d +add z20.d, z20.d, z17.d +ldr q1, [x17, #+288] +ldr q0, [x17, #+304] +ldr q15, [x17, #+320] +ldr q14, [x17, #+336] +ldr q13, [x17, #+352] +ldr q12, [x17, #+368] +sqrdmulh z17.d, z23.d, z11.d[1] +mul z23.d, z23.d,z10.d[1] +ldr q18, [x0, #496] +mla z28.d, P0/M, z26.d, z31.d +sub z26.d, z22.d, z27.d +add z22.d, z22.d, z27.d +sqrdmulh z27.d, z29.d, z11.d[0] +mul z29.d, z29.d,z10.d[0] +ldr q16, 
[x0, #464] +mla z23.d, P0/M, z17.d, z31.d +sub z17.d, z24.d, z28.d +add z24.d, z24.d, z28.d +sqrdmulh z28.d, z18.d, z2.d[0] +mul z18.d, z18.d,z3.d[0] +ldr q19, [x0, #400] +mla z29.d, P0/M, z27.d, z31.d +sub z27.d, z25.d, z23.d +add z25.d, z25.d, z23.d +sqrdmulh z23.d, z16.d, z2.d[0] +str q22, [x0, #0] +mul z16.d, z16.d,z3.d[0] +ldr q22, [x0, #432] +mla z18.d, P0/M, z28.d, z31.d +sub z28.d, z20.d, z29.d +add z20.d, z20.d, z29.d +sqrdmulh z29.d, z19.d, z2.d[0] +str q26, [x0, #32] +mul z19.d, z19.d,z3.d[0] +ldr q26, [x0, #368] +mla z16.d, P0/M, z23.d, z31.d +sub z23.d, z26.d, z18.d +add z26.d, z26.d, z18.d +sqrdmulh z18.d, z22.d, z2.d[0] +str q24, [x0, #64] +mul z22.d, z22.d,z3.d[0] +ldr q24, [x0, #336] +mla z19.d, P0/M, z29.d, z31.d +sub z29.d, z24.d, z16.d +add z24.d, z24.d, z16.d +sqrdmulh z16.d, z26.d, z0.d[0] +str q17, [x0, #96] +mul z26.d, z26.d,z1.d[0] +ldr q17, [x0, #272] +mla z22.d, P0/M, z18.d, z31.d +sub z18.d, z17.d, z19.d +add z17.d, z17.d, z19.d +sqrdmulh z19.d, z24.d, z0.d[0] +str q25, [x0, #192] +mul z24.d, z24.d,z1.d[0] +ldr q25, [x0, #304] +mla z26.d, P0/M, z16.d, z31.d +sub z16.d, z25.d, z22.d +add z25.d, z25.d, z22.d +sqrdmulh z22.d, z23.d, z0.d[1] +str q27, [x0, #224] +mul z23.d, z23.d,z1.d[1] +mla z24.d, P0/M, z19.d, z31.d +sub z19.d, z25.d, z26.d +add z25.d, z25.d, z26.d +sqrdmulh z26.d, z29.d, z0.d[1] +str q20, [x0, #128] +mul z29.d, z29.d,z1.d[1] +mla z23.d, P0/M, z22.d, z31.d +sub z22.d, z17.d, z24.d +add z17.d, z17.d, z24.d +sqrdmulh z24.d, z25.d, z14.d[0] +str q28, [x0, #160] +mul z25.d, z25.d,z15.d[0] +mla z29.d, P0/M, z26.d, z31.d +sub z26.d, z16.d, z23.d +add z16.d, z16.d, z23.d +sqrdmulh z23.d, z19.d, z14.d[1] +mul z19.d, z19.d,z15.d[1] +mla z25.d, P0/M, z24.d, z31.d +sub z24.d, z18.d, z29.d +add z18.d, z18.d, z29.d +sqrdmulh z29.d, z26.d, z12.d[1] +mul z26.d, z26.d,z13.d[1] +ldr q28, [x0, #480] +mla z19.d, P0/M, z23.d, z31.d +sub z23.d, z17.d, z25.d +add z17.d, z17.d, z25.d +sqrdmulh z25.d, z16.d, z12.d[0] +mul z16.d, z16.d,z13.d[0] 
+ldr q20, [x0, #448] +mla z26.d, P0/M, z29.d, z31.d +sub z29.d, z22.d, z19.d +add z22.d, z22.d, z19.d +sqrdmulh z19.d, z28.d, z2.d[0] +mul z28.d, z28.d,z3.d[0] +ldr q27, [x0, #384] +mla z16.d, P0/M, z25.d, z31.d +sub z25.d, z24.d, z26.d +add z24.d, z24.d, z26.d +sqrdmulh z26.d, z20.d, z2.d[0] +str q17, [x0, #272] +mul z20.d, z20.d,z3.d[0] +ldr q17, [x0, #416] +mla z28.d, P0/M, z19.d, z31.d +sub z19.d, z18.d, z16.d +add z18.d, z18.d, z16.d +sqrdmulh z16.d, z27.d, z2.d[0] +str q23, [x0, #304] +mul z27.d, z27.d,z3.d[0] +ldr q23, [x0, #352] +mla z20.d, P0/M, z26.d, z31.d +sub z26.d, z23.d, z28.d +add z23.d, z23.d, z28.d +sqrdmulh z28.d, z17.d, z2.d[0] +str q22, [x0, #336] +mul z17.d, z17.d,z3.d[0] +ldr q22, [x0, #320] +mla z27.d, P0/M, z16.d, z31.d +sub z16.d, z22.d, z20.d +add z22.d, z22.d, z20.d +sqrdmulh z20.d, z23.d, z0.d[0] +str q29, [x0, #368] +mul z23.d, z23.d,z1.d[0] +ldr q29, [x0, #256] +mla z17.d, P0/M, z28.d, z31.d +sub z28.d, z29.d, z27.d +add z29.d, z29.d, z27.d +sqrdmulh z27.d, z22.d, z0.d[0] +str q24, [x0, #464] +mul z22.d, z22.d,z1.d[0] +ldr q24, [x0, #288] +mla z23.d, P0/M, z20.d, z31.d +sub z20.d, z24.d, z17.d +add z24.d, z24.d, z17.d +sqrdmulh z17.d, z26.d, z0.d[1] +str q25, [x0, #496] +mul z26.d, z26.d,z1.d[1] +mla z22.d, P0/M, z27.d, z31.d +sub z27.d, z24.d, z23.d +add z24.d, z24.d, z23.d +sqrdmulh z23.d, z16.d, z0.d[1] +str q18, [x0, #400] +mul z16.d, z16.d,z1.d[1] +mla z26.d, P0/M, z17.d, z31.d +sub z17.d, z29.d, z22.d +add z29.d, z29.d, z22.d +ldr q11, [x17, #+384] +sqrdmulh z22.d, z24.d, z14.d[0] +str q19, [x0, #432] +mul z24.d, z24.d,z15.d[0] +mla z16.d, P0/M, z23.d, z31.d +sub z23.d, z20.d, z26.d +add z20.d, z20.d, z26.d +ldr q10, [x17, #+400] +sqrdmulh z26.d, z27.d, z14.d[1] +mul z27.d, z27.d,z15.d[1] +mla z24.d, P0/M, z22.d, z31.d +sub z22.d, z28.d, z16.d +add z28.d, z28.d, z16.d +ldr q9, [x17, #+416] +ldr q8, [x17, #+432] +ldr q7, [x17, #+448] +ldr q6, [x17, #+464] +ldr q5, [x17, #+480] +ldr q4, [x17, #+496] +sqrdmulh z16.d, z23.d, 
z12.d[1] +mul z23.d, z23.d,z13.d[1] +ldr q19, [x0, #752] +mla z27.d, P0/M, z26.d, z31.d +sub z26.d, z29.d, z24.d +add z29.d, z29.d, z24.d +sqrdmulh z24.d, z20.d, z12.d[0] +mul z20.d, z20.d,z13.d[0] +ldr q18, [x0, #720] +mla z23.d, P0/M, z16.d, z31.d +sub z16.d, z17.d, z27.d +add z17.d, z17.d, z27.d +sqrdmulh z27.d, z19.d, z10.d[0] +mul z19.d, z19.d,z11.d[0] +ldr q25, [x0, #656] +mla z20.d, P0/M, z24.d, z31.d +sub z24.d, z22.d, z23.d +add z22.d, z22.d, z23.d +sqrdmulh z23.d, z18.d, z10.d[0] +str q29, [x0, #256] +mul z18.d, z18.d,z11.d[0] +ldr q29, [x0, #688] +mla z19.d, P0/M, z27.d, z31.d +sub z27.d, z28.d, z20.d +add z28.d, z28.d, z20.d +sqrdmulh z20.d, z25.d, z10.d[0] +str q26, [x0, #288] +mul z25.d, z25.d,z11.d[0] +ldr q26, [x0, #624] +mla z18.d, P0/M, z23.d, z31.d +sub z23.d, z26.d, z19.d +add z26.d, z26.d, z19.d +sqrdmulh z19.d, z29.d, z10.d[0] +str q17, [x0, #320] +mul z29.d, z29.d,z11.d[0] +ldr q17, [x0, #592] +mla z25.d, P0/M, z20.d, z31.d +sub z20.d, z17.d, z18.d +add z17.d, z17.d, z18.d +sqrdmulh z18.d, z26.d, z8.d[0] +str q16, [x0, #352] +mul z26.d, z26.d,z9.d[0] +ldr q16, [x0, #528] +mla z29.d, P0/M, z19.d, z31.d +sub z19.d, z16.d, z25.d +add z16.d, z16.d, z25.d +sqrdmulh z25.d, z17.d, z8.d[0] +str q22, [x0, #448] +mul z17.d, z17.d,z9.d[0] +ldr q22, [x0, #560] +mla z26.d, P0/M, z18.d, z31.d +sub z18.d, z22.d, z29.d +add z22.d, z22.d, z29.d +sqrdmulh z29.d, z23.d, z8.d[1] +str q24, [x0, #480] +mul z23.d, z23.d,z9.d[1] +mla z17.d, P0/M, z25.d, z31.d +sub z25.d, z22.d, z26.d +add z22.d, z22.d, z26.d +sqrdmulh z26.d, z20.d, z8.d[1] +str q28, [x0, #384] +mul z20.d, z20.d,z9.d[1] +mla z23.d, P0/M, z29.d, z31.d +sub z29.d, z16.d, z17.d +add z16.d, z16.d, z17.d +sqrdmulh z17.d, z22.d, z6.d[0] +str q27, [x0, #416] +mul z22.d, z22.d,z7.d[0] +mla z20.d, P0/M, z26.d, z31.d +sub z26.d, z18.d, z23.d +add z18.d, z18.d, z23.d +sqrdmulh z23.d, z25.d, z6.d[1] +mul z25.d, z25.d,z7.d[1] +mla z22.d, P0/M, z17.d, z31.d +sub z17.d, z19.d, z20.d +add z19.d, z19.d, z20.d 
+sqrdmulh z20.d, z26.d, z4.d[1] +mul z26.d, z26.d,z5.d[1] +ldr q27, [x0, #736] +mla z25.d, P0/M, z23.d, z31.d +sub z23.d, z16.d, z22.d +add z16.d, z16.d, z22.d +sqrdmulh z22.d, z18.d, z4.d[0] +mul z18.d, z18.d,z5.d[0] +ldr q28, [x0, #704] +mla z26.d, P0/M, z20.d, z31.d +sub z20.d, z29.d, z25.d +add z29.d, z29.d, z25.d +sqrdmulh z25.d, z27.d, z10.d[0] +mul z27.d, z27.d,z11.d[0] +ldr q24, [x0, #640] +mla z18.d, P0/M, z22.d, z31.d +sub z22.d, z17.d, z26.d +add z17.d, z17.d, z26.d +sqrdmulh z26.d, z28.d, z10.d[0] +str q16, [x0, #528] +mul z28.d, z28.d,z11.d[0] +ldr q16, [x0, #672] +mla z27.d, P0/M, z25.d, z31.d +sub z25.d, z19.d, z18.d +add z19.d, z19.d, z18.d +sqrdmulh z18.d, z24.d, z10.d[0] +str q23, [x0, #560] +mul z24.d, z24.d,z11.d[0] +ldr q23, [x0, #608] +mla z28.d, P0/M, z26.d, z31.d +sub z26.d, z23.d, z27.d +add z23.d, z23.d, z27.d +sqrdmulh z27.d, z16.d, z10.d[0] +str q29, [x0, #592] +mul z16.d, z16.d,z11.d[0] +ldr q29, [x0, #576] +mla z24.d, P0/M, z18.d, z31.d +sub z18.d, z29.d, z28.d +add z29.d, z29.d, z28.d +sqrdmulh z28.d, z23.d, z8.d[0] +str q20, [x0, #624] +mul z23.d, z23.d,z9.d[0] +ldr q20, [x0, #512] +mla z16.d, P0/M, z27.d, z31.d +sub z27.d, z20.d, z24.d +add z20.d, z20.d, z24.d +sqrdmulh z24.d, z29.d, z8.d[0] +str q17, [x0, #720] +mul z29.d, z29.d,z9.d[0] +ldr q17, [x0, #544] +mla z23.d, P0/M, z28.d, z31.d +sub z28.d, z17.d, z16.d +add z17.d, z17.d, z16.d +sqrdmulh z16.d, z26.d, z8.d[1] +str q22, [x0, #752] +mul z26.d, z26.d,z9.d[1] +mla z29.d, P0/M, z24.d, z31.d +sub z24.d, z17.d, z23.d +add z17.d, z17.d, z23.d +sqrdmulh z23.d, z18.d, z8.d[1] +str q19, [x0, #656] +mul z18.d, z18.d,z9.d[1] +mla z26.d, P0/M, z16.d, z31.d +sub z16.d, z20.d, z29.d +add z20.d, z20.d, z29.d +ldr q12, [x17, #+512] +sqrdmulh z29.d, z17.d, z6.d[0] +str q25, [x0, #688] +mul z17.d, z17.d,z7.d[0] +mla z18.d, P0/M, z23.d, z31.d +sub z23.d, z28.d, z26.d +add z28.d, z28.d, z26.d +ldr q13, [x17, #+528] +sqrdmulh z26.d, z24.d, z6.d[1] +mul z24.d, z24.d,z7.d[1] +mla z17.d, P0/M, 
z29.d, z31.d +sub z29.d, z27.d, z18.d +add z27.d, z27.d, z18.d +ldr q14, [x17, #+544] +ldr q15, [x17, #+560] +ldr q0, [x17, #+576] +ldr q1, [x17, #+592] +ldr q2, [x17, #+608] +ldr q3, [x17, #+624] +sqrdmulh z18.d, z23.d, z4.d[1] +mul z23.d, z23.d,z5.d[1] +ldr q25, [x0, #1008] +mla z24.d, P0/M, z26.d, z31.d +sub z26.d, z20.d, z17.d +add z20.d, z20.d, z17.d +sqrdmulh z17.d, z28.d, z4.d[0] +mul z28.d, z28.d,z5.d[0] +ldr q19, [x0, #976] +mla z23.d, P0/M, z18.d, z31.d +sub z18.d, z16.d, z24.d +add z16.d, z16.d, z24.d +sqrdmulh z24.d, z25.d, z13.d[0] +mul z25.d, z25.d,z12.d[0] +ldr q22, [x0, #912] +mla z28.d, P0/M, z17.d, z31.d +sub z17.d, z29.d, z23.d +add z29.d, z29.d, z23.d +sqrdmulh z23.d, z19.d, z13.d[0] +str q20, [x0, #512] +mul z19.d, z19.d,z12.d[0] +ldr q20, [x0, #944] +mla z25.d, P0/M, z24.d, z31.d +sub z24.d, z27.d, z28.d +add z27.d, z27.d, z28.d +sqrdmulh z28.d, z22.d, z13.d[0] +str q26, [x0, #544] +mul z22.d, z22.d,z12.d[0] +ldr q26, [x0, #880] +mla z19.d, P0/M, z23.d, z31.d +sub z23.d, z26.d, z25.d +add z26.d, z26.d, z25.d +sqrdmulh z25.d, z20.d, z13.d[0] +str q16, [x0, #576] +mul z20.d, z20.d,z12.d[0] +ldr q16, [x0, #848] +mla z22.d, P0/M, z28.d, z31.d +sub z28.d, z16.d, z19.d +add z16.d, z16.d, z19.d +sqrdmulh z19.d, z26.d, z15.d[0] +str q18, [x0, #608] +mul z26.d, z26.d,z14.d[0] +ldr q18, [x0, #784] +mla z20.d, P0/M, z25.d, z31.d +sub z25.d, z18.d, z22.d +add z18.d, z18.d, z22.d +sqrdmulh z22.d, z16.d, z15.d[0] +str q29, [x0, #704] +mul z16.d, z16.d,z14.d[0] +ldr q29, [x0, #816] +mla z26.d, P0/M, z19.d, z31.d +sub z19.d, z29.d, z20.d +add z29.d, z29.d, z20.d +sqrdmulh z20.d, z23.d, z15.d[1] +str q17, [x0, #736] +mul z23.d, z23.d,z14.d[1] +mla z16.d, P0/M, z22.d, z31.d +sub z22.d, z29.d, z26.d +add z29.d, z29.d, z26.d +sqrdmulh z26.d, z28.d, z15.d[1] +str q27, [x0, #640] +mul z28.d, z28.d,z14.d[1] +mla z23.d, P0/M, z20.d, z31.d +sub z20.d, z18.d, z16.d +add z18.d, z18.d, z16.d +sqrdmulh z16.d, z29.d, z1.d[0] +str q24, [x0, #672] +mul z29.d, z29.d,z0.d[0] 
+mla z28.d, P0/M, z26.d, z31.d +sub z26.d, z19.d, z23.d +add z19.d, z19.d, z23.d +sqrdmulh z23.d, z22.d, z1.d[1] +mul z22.d, z22.d,z0.d[1] +mla z29.d, P0/M, z16.d, z31.d +sub z16.d, z25.d, z28.d +add z25.d, z25.d, z28.d +sqrdmulh z28.d, z26.d, z3.d[1] +mul z26.d, z26.d,z2.d[1] +ldr q24, [x0, #992] +mla z22.d, P0/M, z23.d, z31.d +sub z23.d, z18.d, z29.d +add z18.d, z18.d, z29.d +sqrdmulh z29.d, z19.d, z3.d[0] +mul z19.d, z19.d,z2.d[0] +ldr q27, [x0, #960] +mla z26.d, P0/M, z28.d, z31.d +sub z28.d, z20.d, z22.d +add z20.d, z20.d, z22.d +sqrdmulh z22.d, z24.d, z13.d[0] +mul z24.d, z24.d,z12.d[0] +ldr q17, [x0, #896] +mla z19.d, P0/M, z29.d, z31.d +sub z29.d, z16.d, z26.d +add z16.d, z16.d, z26.d +sqrdmulh z26.d, z27.d, z13.d[0] +str q18, [x0, #784] +mul z27.d, z27.d,z12.d[0] +ldr q18, [x0, #928] +mla z24.d, P0/M, z22.d, z31.d +sub z22.d, z25.d, z19.d +add z25.d, z25.d, z19.d +sqrdmulh z19.d, z17.d, z13.d[0] +str q23, [x0, #816] +mul z17.d, z17.d,z12.d[0] +ldr q23, [x0, #864] +mla z27.d, P0/M, z26.d, z31.d +sub z26.d, z23.d, z24.d +add z23.d, z23.d, z24.d +sqrdmulh z24.d, z18.d, z13.d[0] +str q20, [x0, #848] +mul z18.d, z18.d,z12.d[0] +ldr q20, [x0, #832] +mla z17.d, P0/M, z19.d, z31.d +sub z19.d, z20.d, z27.d +add z20.d, z20.d, z27.d +sqrdmulh z27.d, z23.d, z15.d[0] +str q28, [x0, #880] +mul z23.d, z23.d,z14.d[0] +ldr q28, [x0, #768] +mla z18.d, P0/M, z24.d, z31.d +sub z24.d, z28.d, z17.d +add z28.d, z28.d, z17.d +sqrdmulh z17.d, z20.d, z15.d[0] +str q16, [x0, #976] +mul z20.d, z20.d,z14.d[0] +ldr q16, [x0, #800] +mla z23.d, P0/M, z27.d, z31.d +sub z27.d, z16.d, z18.d +add z16.d, z16.d, z18.d +sqrdmulh z18.d, z26.d, z15.d[1] +str q29, [x0, #1008] +mul z26.d, z26.d,z14.d[1] +mla z20.d, P0/M, z17.d, z31.d +sub z17.d, z16.d, z23.d +add z16.d, z16.d, z23.d +sqrdmulh z23.d, z19.d, z15.d[1] +str q25, [x0, #912] +mul z19.d, z19.d,z14.d[1] +mla z26.d, P0/M, z18.d, z31.d +sub z18.d, z28.d, z20.d +add z28.d, z28.d, z20.d +ldr q4, [x17, #+640] +sqrdmulh z20.d, z16.d, z1.d[0] 
+str q22, [x0, #944] +mul z16.d, z16.d,z0.d[0] +mla z19.d, P0/M, z23.d, z31.d +sub z23.d, z27.d, z26.d +add z27.d, z27.d, z26.d +ldr q5, [x17, #+656] +sqrdmulh z26.d, z17.d, z1.d[1] +mul z17.d, z17.d,z0.d[1] +mla z16.d, P0/M, z20.d, z31.d +sub z20.d, z24.d, z19.d +add z24.d, z24.d, z19.d +ldr q6, [x17, #+672] +ldr q7, [x17, #+688] +ldr q8, [x17, #+704] +ldr q9, [x17, #+720] +ldr q10, [x17, #+736] +ldr q11, [x17, #+752] +sqrdmulh z19.d, z23.d, z3.d[1] +mul z23.d, z23.d,z2.d[1] +ldr q22, [x0, #1264] +mla z17.d, P0/M, z26.d, z31.d +sub z26.d, z28.d, z16.d +add z28.d, z28.d, z16.d +sqrdmulh z16.d, z27.d, z3.d[0] +mul z27.d, z27.d,z2.d[0] +ldr q25, [x0, #1232] +mla z23.d, P0/M, z19.d, z31.d +sub z19.d, z18.d, z17.d +add z18.d, z18.d, z17.d +sqrdmulh z17.d, z22.d, z5.d[0] +mul z22.d, z22.d,z4.d[0] +ldr q29, [x0, #1168] +mla z27.d, P0/M, z16.d, z31.d +sub z16.d, z20.d, z23.d +add z20.d, z20.d, z23.d +sqrdmulh z23.d, z25.d, z5.d[0] +str q28, [x0, #768] +mul z25.d, z25.d,z4.d[0] +ldr q28, [x0, #1200] +mla z22.d, P0/M, z17.d, z31.d +sub z17.d, z24.d, z27.d +add z24.d, z24.d, z27.d +sqrdmulh z27.d, z29.d, z5.d[0] +str q26, [x0, #800] +mul z29.d, z29.d,z4.d[0] +ldr q26, [x0, #1136] +mla z25.d, P0/M, z23.d, z31.d +sub z23.d, z26.d, z22.d +add z26.d, z26.d, z22.d +sqrdmulh z22.d, z28.d, z5.d[0] +str q18, [x0, #832] +mul z28.d, z28.d,z4.d[0] +ldr q18, [x0, #1104] +mla z29.d, P0/M, z27.d, z31.d +sub z27.d, z18.d, z25.d +add z18.d, z18.d, z25.d +sqrdmulh z25.d, z26.d, z7.d[0] +str q19, [x0, #864] +mul z26.d, z26.d,z6.d[0] +ldr q19, [x0, #1040] +mla z28.d, P0/M, z22.d, z31.d +sub z22.d, z19.d, z29.d +add z19.d, z19.d, z29.d +sqrdmulh z29.d, z18.d, z7.d[0] +str q20, [x0, #960] +mul z18.d, z18.d,z6.d[0] +ldr q20, [x0, #1072] +mla z26.d, P0/M, z25.d, z31.d +sub z25.d, z20.d, z28.d +add z20.d, z20.d, z28.d +sqrdmulh z28.d, z23.d, z7.d[1] +str q16, [x0, #992] +mul z23.d, z23.d,z6.d[1] +mla z18.d, P0/M, z29.d, z31.d +sub z29.d, z20.d, z26.d +add z20.d, z20.d, z26.d +sqrdmulh z26.d, z27.d, 
z7.d[1] +str q24, [x0, #896] +mul z27.d, z27.d,z6.d[1] +mla z23.d, P0/M, z28.d, z31.d +sub z28.d, z19.d, z18.d +add z19.d, z19.d, z18.d +sqrdmulh z18.d, z20.d, z9.d[0] +str q17, [x0, #928] +mul z20.d, z20.d,z8.d[0] +mla z27.d, P0/M, z26.d, z31.d +sub z26.d, z25.d, z23.d +add z25.d, z25.d, z23.d +sqrdmulh z23.d, z29.d, z9.d[1] +mul z29.d, z29.d,z8.d[1] +mla z20.d, P0/M, z18.d, z31.d +sub z18.d, z22.d, z27.d +add z22.d, z22.d, z27.d +sqrdmulh z27.d, z26.d, z11.d[1] +mul z26.d, z26.d,z10.d[1] +ldr q17, [x0, #1248] +mla z29.d, P0/M, z23.d, z31.d +sub z23.d, z19.d, z20.d +add z19.d, z19.d, z20.d +sqrdmulh z20.d, z25.d, z11.d[0] +mul z25.d, z25.d,z10.d[0] +ldr q24, [x0, #1216] +mla z26.d, P0/M, z27.d, z31.d +sub z27.d, z28.d, z29.d +add z28.d, z28.d, z29.d +sqrdmulh z29.d, z17.d, z5.d[0] +mul z17.d, z17.d,z4.d[0] +ldr q16, [x0, #1152] +mla z25.d, P0/M, z20.d, z31.d +sub z20.d, z18.d, z26.d +add z18.d, z18.d, z26.d +sqrdmulh z26.d, z24.d, z5.d[0] +str q19, [x0, #1040] +mul z24.d, z24.d,z4.d[0] +ldr q19, [x0, #1184] +mla z17.d, P0/M, z29.d, z31.d +sub z29.d, z22.d, z25.d +add z22.d, z22.d, z25.d +sqrdmulh z25.d, z16.d, z5.d[0] +str q23, [x0, #1072] +mul z16.d, z16.d,z4.d[0] +ldr q23, [x0, #1120] +mla z24.d, P0/M, z26.d, z31.d +sub z26.d, z23.d, z17.d +add z23.d, z23.d, z17.d +sqrdmulh z17.d, z19.d, z5.d[0] +str q28, [x0, #1104] +mul z19.d, z19.d,z4.d[0] +ldr q28, [x0, #1088] +mla z16.d, P0/M, z25.d, z31.d +sub z25.d, z28.d, z24.d +add z28.d, z28.d, z24.d +sqrdmulh z24.d, z23.d, z7.d[0] +str q27, [x0, #1136] +mul z23.d, z23.d,z6.d[0] +ldr q27, [x0, #1024] +mla z19.d, P0/M, z17.d, z31.d +sub z17.d, z27.d, z16.d +add z27.d, z27.d, z16.d +sqrdmulh z16.d, z28.d, z7.d[0] +str q18, [x0, #1232] +mul z28.d, z28.d,z6.d[0] +ldr q18, [x0, #1056] +mla z23.d, P0/M, z24.d, z31.d +sub z24.d, z18.d, z19.d +add z18.d, z18.d, z19.d +sqrdmulh z19.d, z26.d, z7.d[1] +str q20, [x0, #1264] +mul z26.d, z26.d,z6.d[1] +mla z28.d, P0/M, z16.d, z31.d +sub z16.d, z18.d, z23.d +add z18.d, z18.d, z23.d 
+sqrdmulh z23.d, z25.d, z7.d[1] +str q22, [x0, #1168] +mul z25.d, z25.d,z6.d[1] +mla z26.d, P0/M, z19.d, z31.d +sub z19.d, z27.d, z28.d +add z27.d, z27.d, z28.d +ldr q3, [x17, #+768] +sqrdmulh z28.d, z18.d, z9.d[0] +str q29, [x0, #1200] +mul z18.d, z18.d,z8.d[0] +mla z25.d, P0/M, z23.d, z31.d +sub z23.d, z24.d, z26.d +add z24.d, z24.d, z26.d +ldr q2, [x17, #+784] +sqrdmulh z26.d, z16.d, z9.d[1] +mul z16.d, z16.d,z8.d[1] +mla z18.d, P0/M, z28.d, z31.d +sub z28.d, z17.d, z25.d +add z17.d, z17.d, z25.d +ldr q1, [x17, #+800] +ldr q0, [x17, #+816] +ldr q15, [x17, #+832] +ldr q14, [x17, #+848] +ldr q13, [x17, #+864] +ldr q12, [x17, #+880] +sqrdmulh z25.d, z23.d, z11.d[1] +mul z23.d, z23.d,z10.d[1] +ldr q29, [x0, #1520] +mla z16.d, P0/M, z26.d, z31.d +sub z26.d, z27.d, z18.d +add z27.d, z27.d, z18.d +sqrdmulh z18.d, z24.d, z11.d[0] +mul z24.d, z24.d,z10.d[0] +ldr q22, [x0, #1488] +mla z23.d, P0/M, z25.d, z31.d +sub z25.d, z19.d, z16.d +add z19.d, z19.d, z16.d +sqrdmulh z16.d, z29.d, z2.d[0] +mul z29.d, z29.d,z3.d[0] +ldr q20, [x0, #1424] +mla z24.d, P0/M, z18.d, z31.d +sub z18.d, z28.d, z23.d +add z28.d, z28.d, z23.d +sqrdmulh z23.d, z22.d, z2.d[0] +str q27, [x0, #1024] +mul z22.d, z22.d,z3.d[0] +ldr q27, [x0, #1456] +mla z29.d, P0/M, z16.d, z31.d +sub z16.d, z17.d, z24.d +add z17.d, z17.d, z24.d +sqrdmulh z24.d, z20.d, z2.d[0] +str q26, [x0, #1056] +mul z20.d, z20.d,z3.d[0] +ldr q26, [x0, #1392] +mla z22.d, P0/M, z23.d, z31.d +sub z23.d, z26.d, z29.d +add z26.d, z26.d, z29.d +sqrdmulh z29.d, z27.d, z2.d[0] +str q19, [x0, #1088] +mul z27.d, z27.d,z3.d[0] +ldr q19, [x0, #1360] +mla z20.d, P0/M, z24.d, z31.d +sub z24.d, z19.d, z22.d +add z19.d, z19.d, z22.d +sqrdmulh z22.d, z26.d, z0.d[0] +str q25, [x0, #1120] +mul z26.d, z26.d,z1.d[0] +ldr q25, [x0, #1296] +mla z27.d, P0/M, z29.d, z31.d +sub z29.d, z25.d, z20.d +add z25.d, z25.d, z20.d +sqrdmulh z20.d, z19.d, z0.d[0] +str q28, [x0, #1216] +mul z19.d, z19.d,z1.d[0] +ldr q28, [x0, #1328] +mla z26.d, P0/M, z22.d, z31.d +sub 
z22.d, z28.d, z27.d +add z28.d, z28.d, z27.d +sqrdmulh z27.d, z23.d, z0.d[1] +str q18, [x0, #1248] +mul z23.d, z23.d,z1.d[1] +mla z19.d, P0/M, z20.d, z31.d +sub z20.d, z28.d, z26.d +add z28.d, z28.d, z26.d +sqrdmulh z26.d, z24.d, z0.d[1] +str q17, [x0, #1152] +mul z24.d, z24.d,z1.d[1] +mla z23.d, P0/M, z27.d, z31.d +sub z27.d, z25.d, z19.d +add z25.d, z25.d, z19.d +sqrdmulh z19.d, z28.d, z14.d[0] +str q16, [x0, #1184] +mul z28.d, z28.d,z15.d[0] +mla z24.d, P0/M, z26.d, z31.d +sub z26.d, z22.d, z23.d +add z22.d, z22.d, z23.d +sqrdmulh z23.d, z20.d, z14.d[1] +mul z20.d, z20.d,z15.d[1] +mla z28.d, P0/M, z19.d, z31.d +sub z19.d, z29.d, z24.d +add z29.d, z29.d, z24.d +sqrdmulh z24.d, z26.d, z12.d[1] +mul z26.d, z26.d,z13.d[1] +ldr q16, [x0, #1504] +mla z20.d, P0/M, z23.d, z31.d +sub z23.d, z25.d, z28.d +add z25.d, z25.d, z28.d +sqrdmulh z28.d, z22.d, z12.d[0] +mul z22.d, z22.d,z13.d[0] +ldr q17, [x0, #1472] +mla z26.d, P0/M, z24.d, z31.d +sub z24.d, z27.d, z20.d +add z27.d, z27.d, z20.d +sqrdmulh z20.d, z16.d, z2.d[0] +mul z16.d, z16.d,z3.d[0] +ldr q18, [x0, #1408] +mla z22.d, P0/M, z28.d, z31.d +sub z28.d, z19.d, z26.d +add z19.d, z19.d, z26.d +sqrdmulh z26.d, z17.d, z2.d[0] +str q25, [x0, #1296] +mul z17.d, z17.d,z3.d[0] +ldr q25, [x0, #1440] +mla z16.d, P0/M, z20.d, z31.d +sub z20.d, z29.d, z22.d +add z29.d, z29.d, z22.d +sqrdmulh z22.d, z18.d, z2.d[0] +str q23, [x0, #1328] +mul z18.d, z18.d,z3.d[0] +ldr q23, [x0, #1376] +mla z17.d, P0/M, z26.d, z31.d +sub z26.d, z23.d, z16.d +add z23.d, z23.d, z16.d +sqrdmulh z16.d, z25.d, z2.d[0] +str q27, [x0, #1360] +mul z25.d, z25.d,z3.d[0] +ldr q27, [x0, #1344] +mla z18.d, P0/M, z22.d, z31.d +sub z22.d, z27.d, z17.d +add z27.d, z27.d, z17.d +sqrdmulh z17.d, z23.d, z0.d[0] +str q24, [x0, #1392] +mul z23.d, z23.d,z1.d[0] +ldr q24, [x0, #1280] +mla z25.d, P0/M, z16.d, z31.d +sub z16.d, z24.d, z18.d +add z24.d, z24.d, z18.d +sqrdmulh z18.d, z27.d, z0.d[0] +str q19, [x0, #1488] +mul z27.d, z27.d,z1.d[0] +ldr q19, [x0, #1312] +mla 
z23.d, P0/M, z17.d, z31.d +sub z17.d, z19.d, z25.d +add z19.d, z19.d, z25.d +sqrdmulh z25.d, z26.d, z0.d[1] +str q28, [x0, #1520] +mul z26.d, z26.d,z1.d[1] +mla z27.d, P0/M, z18.d, z31.d +sub z18.d, z19.d, z23.d +add z19.d, z19.d, z23.d +sqrdmulh z23.d, z22.d, z0.d[1] +str q29, [x0, #1424] +mul z22.d, z22.d,z1.d[1] +mla z26.d, P0/M, z25.d, z31.d +sub z25.d, z24.d, z27.d +add z24.d, z24.d, z27.d +ldr q11, [x17, #+896] +sqrdmulh z27.d, z19.d, z14.d[0] +str q20, [x0, #1456] +mul z19.d, z19.d,z15.d[0] +mla z22.d, P0/M, z23.d, z31.d +sub z23.d, z17.d, z26.d +add z17.d, z17.d, z26.d +ldr q10, [x17, #+912] +sqrdmulh z26.d, z18.d, z14.d[1] +mul z18.d, z18.d,z15.d[1] +mla z19.d, P0/M, z27.d, z31.d +sub z27.d, z16.d, z22.d +add z16.d, z16.d, z22.d +ldr q9, [x17, #+928] +ldr q8, [x17, #+944] +ldr q7, [x17, #+960] +ldr q6, [x17, #+976] +ldr q5, [x17, #+992] +ldr q4, [x17, #+1008] +sqrdmulh z22.d, z23.d, z12.d[1] +mul z23.d, z23.d,z13.d[1] +ldr q20, [x0, #1776] +mla z18.d, P0/M, z26.d, z31.d +sub z26.d, z24.d, z19.d +add z24.d, z24.d, z19.d +sqrdmulh z19.d, z17.d, z12.d[0] +mul z17.d, z17.d,z13.d[0] +ldr q29, [x0, #1744] +mla z23.d, P0/M, z22.d, z31.d +sub z22.d, z25.d, z18.d +add z25.d, z25.d, z18.d +sqrdmulh z18.d, z20.d, z10.d[0] +mul z20.d, z20.d,z11.d[0] +ldr q28, [x0, #1680] +mla z17.d, P0/M, z19.d, z31.d +sub z19.d, z27.d, z23.d +add z27.d, z27.d, z23.d +sqrdmulh z23.d, z29.d, z10.d[0] +str q24, [x0, #1280] +mul z29.d, z29.d,z11.d[0] +ldr q24, [x0, #1712] +mla z20.d, P0/M, z18.d, z31.d +sub z18.d, z16.d, z17.d +add z16.d, z16.d, z17.d +sqrdmulh z17.d, z28.d, z10.d[0] +str q26, [x0, #1312] +mul z28.d, z28.d,z11.d[0] +ldr q26, [x0, #1648] +mla z29.d, P0/M, z23.d, z31.d +sub z23.d, z26.d, z20.d +add z26.d, z26.d, z20.d +sqrdmulh z20.d, z24.d, z10.d[0] +str q25, [x0, #1344] +mul z24.d, z24.d,z11.d[0] +ldr q25, [x0, #1616] +mla z28.d, P0/M, z17.d, z31.d +sub z17.d, z25.d, z29.d +add z25.d, z25.d, z29.d +sqrdmulh z29.d, z26.d, z8.d[0] +str q22, [x0, #1376] +mul z26.d, 
z26.d,z9.d[0] +ldr q22, [x0, #1552] +mla z24.d, P0/M, z20.d, z31.d +sub z20.d, z22.d, z28.d +add z22.d, z22.d, z28.d +sqrdmulh z28.d, z25.d, z8.d[0] +str q27, [x0, #1472] +mul z25.d, z25.d,z9.d[0] +ldr q27, [x0, #1584] +mla z26.d, P0/M, z29.d, z31.d +sub z29.d, z27.d, z24.d +add z27.d, z27.d, z24.d +sqrdmulh z24.d, z23.d, z8.d[1] +str q19, [x0, #1504] +mul z23.d, z23.d,z9.d[1] +mla z25.d, P0/M, z28.d, z31.d +sub z28.d, z27.d, z26.d +add z27.d, z27.d, z26.d +sqrdmulh z26.d, z17.d, z8.d[1] +str q16, [x0, #1408] +mul z17.d, z17.d,z9.d[1] +mla z23.d, P0/M, z24.d, z31.d +sub z24.d, z22.d, z25.d +add z22.d, z22.d, z25.d +sqrdmulh z25.d, z27.d, z6.d[0] +str q18, [x0, #1440] +mul z27.d, z27.d,z7.d[0] +mla z17.d, P0/M, z26.d, z31.d +sub z26.d, z29.d, z23.d +add z29.d, z29.d, z23.d +sqrdmulh z23.d, z28.d, z6.d[1] +mul z28.d, z28.d,z7.d[1] +mla z27.d, P0/M, z25.d, z31.d +sub z25.d, z20.d, z17.d +add z20.d, z20.d, z17.d +sqrdmulh z17.d, z26.d, z4.d[1] +mul z26.d, z26.d,z5.d[1] +ldr q18, [x0, #1760] +mla z28.d, P0/M, z23.d, z31.d +sub z23.d, z22.d, z27.d +add z22.d, z22.d, z27.d +sqrdmulh z27.d, z29.d, z4.d[0] +mul z29.d, z29.d,z5.d[0] +ldr q16, [x0, #1728] +mla z26.d, P0/M, z17.d, z31.d +sub z17.d, z24.d, z28.d +add z24.d, z24.d, z28.d +sqrdmulh z28.d, z18.d, z10.d[0] +mul z18.d, z18.d,z11.d[0] +ldr q19, [x0, #1664] +mla z29.d, P0/M, z27.d, z31.d +sub z27.d, z25.d, z26.d +add z25.d, z25.d, z26.d +sqrdmulh z26.d, z16.d, z10.d[0] +str q22, [x0, #1552] +mul z16.d, z16.d,z11.d[0] +ldr q22, [x0, #1696] +mla z18.d, P0/M, z28.d, z31.d +sub z28.d, z20.d, z29.d +add z20.d, z20.d, z29.d +sqrdmulh z29.d, z19.d, z10.d[0] +str q23, [x0, #1584] +mul z19.d, z19.d,z11.d[0] +ldr q23, [x0, #1632] +mla z16.d, P0/M, z26.d, z31.d +sub z26.d, z23.d, z18.d +add z23.d, z23.d, z18.d +sqrdmulh z18.d, z22.d, z10.d[0] +str q24, [x0, #1616] +mul z22.d, z22.d,z11.d[0] +ldr q24, [x0, #1600] +mla z19.d, P0/M, z29.d, z31.d +sub z29.d, z24.d, z16.d +add z24.d, z24.d, z16.d +sqrdmulh z16.d, z23.d, z8.d[0] +str 
q17, [x0, #1648] +mul z23.d, z23.d,z9.d[0] +ldr q17, [x0, #1536] +mla z22.d, P0/M, z18.d, z31.d +sub z18.d, z17.d, z19.d +add z17.d, z17.d, z19.d +sqrdmulh z19.d, z24.d, z8.d[0] +str q25, [x0, #1744] +mul z24.d, z24.d,z9.d[0] +ldr q25, [x0, #1568] +mla z23.d, P0/M, z16.d, z31.d +sub z16.d, z25.d, z22.d +add z25.d, z25.d, z22.d +sqrdmulh z22.d, z26.d, z8.d[1] +str q27, [x0, #1776] +mul z26.d, z26.d,z9.d[1] +mla z24.d, P0/M, z19.d, z31.d +sub z19.d, z25.d, z23.d +add z25.d, z25.d, z23.d +sqrdmulh z23.d, z29.d, z8.d[1] +str q20, [x0, #1680] +mul z29.d, z29.d,z9.d[1] +mla z26.d, P0/M, z22.d, z31.d +sub z22.d, z17.d, z24.d +add z17.d, z17.d, z24.d +ldr q12, [x17, #+1024] +sqrdmulh z24.d, z25.d, z6.d[0] +str q28, [x0, #1712] +mul z25.d, z25.d,z7.d[0] +mla z29.d, P0/M, z23.d, z31.d +sub z23.d, z16.d, z26.d +add z16.d, z16.d, z26.d +ldr q13, [x17, #+1040] +sqrdmulh z26.d, z19.d, z6.d[1] +mul z19.d, z19.d,z7.d[1] +mla z25.d, P0/M, z24.d, z31.d +sub z24.d, z18.d, z29.d +add z18.d, z18.d, z29.d +ldr q14, [x17, #+1056] +ldr q15, [x17, #+1072] +ldr q0, [x17, #+1088] +ldr q1, [x17, #+1104] +ldr q2, [x17, #+1120] +ldr q3, [x17, #+1136] +sqrdmulh z29.d, z23.d, z4.d[1] +mul z23.d, z23.d,z5.d[1] +ldr q28, [x0, #2032] +mla z19.d, P0/M, z26.d, z31.d +sub z26.d, z17.d, z25.d +add z17.d, z17.d, z25.d +sqrdmulh z25.d, z16.d, z4.d[0] +mul z16.d, z16.d,z5.d[0] +ldr q20, [x0, #2000] +mla z23.d, P0/M, z29.d, z31.d +sub z29.d, z22.d, z19.d +add z22.d, z22.d, z19.d +sqrdmulh z19.d, z28.d, z13.d[0] +mul z28.d, z28.d,z12.d[0] +ldr q27, [x0, #1936] +mla z16.d, P0/M, z25.d, z31.d +sub z25.d, z24.d, z23.d +add z24.d, z24.d, z23.d +sqrdmulh z23.d, z20.d, z13.d[0] +str q17, [x0, #1536] +mul z20.d, z20.d,z12.d[0] +ldr q17, [x0, #1968] +mla z28.d, P0/M, z19.d, z31.d +sub z19.d, z18.d, z16.d +add z18.d, z18.d, z16.d +sqrdmulh z16.d, z27.d, z13.d[0] +str q26, [x0, #1568] +mul z27.d, z27.d,z12.d[0] +ldr q26, [x0, #1904] +mla z20.d, P0/M, z23.d, z31.d +sub z23.d, z26.d, z28.d +add z26.d, z26.d, z28.d 
+sqrdmulh z28.d, z17.d, z13.d[0] +str q22, [x0, #1600] +mul z17.d, z17.d,z12.d[0] +ldr q22, [x0, #1872] +mla z27.d, P0/M, z16.d, z31.d +sub z16.d, z22.d, z20.d +add z22.d, z22.d, z20.d +sqrdmulh z20.d, z26.d, z15.d[0] +str q29, [x0, #1632] +mul z26.d, z26.d,z14.d[0] +ldr q29, [x0, #1808] +mla z17.d, P0/M, z28.d, z31.d +sub z28.d, z29.d, z27.d +add z29.d, z29.d, z27.d +sqrdmulh z27.d, z22.d, z15.d[0] +str q24, [x0, #1728] +mul z22.d, z22.d,z14.d[0] +ldr q24, [x0, #1840] +mla z26.d, P0/M, z20.d, z31.d +sub z20.d, z24.d, z17.d +add z24.d, z24.d, z17.d +sqrdmulh z17.d, z23.d, z15.d[1] +str q25, [x0, #1760] +mul z23.d, z23.d,z14.d[1] +mla z22.d, P0/M, z27.d, z31.d +sub z27.d, z24.d, z26.d +add z24.d, z24.d, z26.d +sqrdmulh z26.d, z16.d, z15.d[1] +str q18, [x0, #1664] +mul z16.d, z16.d,z14.d[1] +mla z23.d, P0/M, z17.d, z31.d +sub z17.d, z29.d, z22.d +add z29.d, z29.d, z22.d +sqrdmulh z22.d, z24.d, z1.d[0] +str q19, [x0, #1696] +mul z24.d, z24.d,z0.d[0] +mla z16.d, P0/M, z26.d, z31.d +sub z26.d, z20.d, z23.d +add z20.d, z20.d, z23.d +sqrdmulh z23.d, z27.d, z1.d[1] +mul z27.d, z27.d,z0.d[1] +mla z24.d, P0/M, z22.d, z31.d +sub z22.d, z28.d, z16.d +add z28.d, z28.d, z16.d +sqrdmulh z16.d, z26.d, z3.d[1] +mul z26.d, z26.d,z2.d[1] +ldr q19, [x0, #2016] +mla z27.d, P0/M, z23.d, z31.d +sub z23.d, z29.d, z24.d +add z29.d, z29.d, z24.d +sqrdmulh z24.d, z20.d, z3.d[0] +mul z20.d, z20.d,z2.d[0] +ldr q18, [x0, #1984] +mla z26.d, P0/M, z16.d, z31.d +sub z16.d, z17.d, z27.d +add z17.d, z17.d, z27.d +sqrdmulh z27.d, z19.d, z13.d[0] +mul z19.d, z19.d,z12.d[0] +ldr q25, [x0, #1920] +mla z20.d, P0/M, z24.d, z31.d +sub z24.d, z22.d, z26.d +add z22.d, z22.d, z26.d +sqrdmulh z26.d, z18.d, z13.d[0] +str q29, [x0, #1808] +mul z18.d, z18.d,z12.d[0] +ldr q29, [x0, #1952] +mla z19.d, P0/M, z27.d, z31.d +sub z27.d, z28.d, z20.d +add z28.d, z28.d, z20.d +sqrdmulh z20.d, z25.d, z13.d[0] +str q23, [x0, #1840] +mul z25.d, z25.d,z12.d[0] +ldr q23, [x0, #1888] +mla z18.d, P0/M, z26.d, z31.d +sub z26.d, 
z23.d, z19.d +add z23.d, z23.d, z19.d +sqrdmulh z19.d, z29.d, z13.d[0] +str q17, [x0, #1872] +mul z29.d, z29.d,z12.d[0] +ldr q17, [x0, #1856] +mla z25.d, P0/M, z20.d, z31.d +sub z20.d, z17.d, z18.d +add z17.d, z17.d, z18.d +sqrdmulh z18.d, z23.d, z15.d[0] +str q16, [x0, #1904] +mul z23.d, z23.d,z14.d[0] +ldr q16, [x0, #1792] +mla z29.d, P0/M, z19.d, z31.d +sub z19.d, z16.d, z25.d +add z16.d, z16.d, z25.d +sqrdmulh z25.d, z17.d, z15.d[0] +str q22, [x0, #2000] +mul z17.d, z17.d,z14.d[0] +ldr q22, [x0, #1824] +mla z23.d, P0/M, z18.d, z31.d +sub z18.d, z22.d, z29.d +add z22.d, z22.d, z29.d +sqrdmulh z29.d, z26.d, z15.d[1] +str q24, [x0, #2032] +mul z26.d, z26.d,z14.d[1] +mla z17.d, P0/M, z25.d, z31.d +sub z25.d, z22.d, z23.d +add z22.d, z22.d, z23.d +sqrdmulh z23.d, z20.d, z15.d[1] +str q28, [x0, #1936] +mul z20.d, z20.d,z14.d[1] +mla z26.d, P0/M, z29.d, z31.d +sub z29.d, z16.d, z17.d +add z16.d, z16.d, z17.d +sqrdmulh z17.d, z22.d, z1.d[0] +str q27, [x0, #1968] +mul z22.d, z22.d,z0.d[0] +mla z20.d, P0/M, z23.d, z31.d +sub z23.d, z18.d, z26.d +add z18.d, z18.d, z26.d +sqrdmulh z26.d, z25.d, z1.d[1] +mul z25.d, z25.d,z0.d[1] +mla z22.d, P0/M, z17.d, z31.d +sub z17.d, z19.d, z20.d +add z19.d, z19.d, z20.d +sqrdmulh z20.d, z23.d, z3.d[1] +mul z23.d, z23.d,z2.d[1] +mla z25.d, P0/M, z26.d, z31.d +sub z26.d, z16.d, z22.d +add z16.d, z16.d, z22.d +sqrdmulh z22.d, z18.d, z3.d[0] +mul z18.d, z18.d,z2.d[0] +mla z23.d, P0/M, z20.d, z31.d +sub z20.d, z29.d, z25.d +add z29.d, z29.d, z25.d +mla z18.d, P0/M, z22.d, z31.d +sub z22.d, z17.d, z23.d +add z17.d, z17.d, z23.d +str q16, [x0, #1792] +sub z16.d, z19.d, z18.d +add z19.d, z19.d, z18.d +str q26, [x0, #1824] +str q29, [x0, #1856] +str q20, [x0, #1888] +str q17, [x0, #1984] +str q22, [x0, #2016] +str q19, [x0, #1920] +str q16, [x0, #1952] +// Restore SVE2 vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, 
[sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2697 +// Instruction count: 2693 \ No newline at end of file diff --git a/asm/manual/basemul_s64/basemul_64_72057594067788289.s b/asm/manual/basemul_s64/basemul_64_72057594067788289.s new file mode 100644 index 0000000..d82d6d2 --- /dev/null +++ b/asm/manual/basemul_s64/basemul_64_72057594067788289.s @@ -0,0 +1,105 @@ + +.macro save_regs + sub sp, sp, #(16*6) + stp x19, x20, [sp, #16*0] + stp x19, x20, [sp, #16*0] + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp x27, x28, [sp, #16*4] + stp x29, x30, [sp, #16*5] + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_regs + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) + ldp x19, x20, [sp, #16*0] + ldp x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldp x29, x30, [sp, #16*5] + add sp, sp, #(16*5+16) +.endm + +.data +modulus: + .dword 72057594067788289 + .dword 249802778572774913 + + .text + .type basemul_u64, %function + .global basemul_u64 + +modulus_addr: + .dword modulus +basemul_u64: + dst .req x0 + src_a .req x1 + src_b .req x2 + count .req x3 + + addr .req x4 + + in_a0 .req z0 + in_a1 .req z1 + in_b0 .req z2 + in_b1 .req z3 + dst_0 .req z4 + dst_1 .req z5 + + in_a0q .req q0 + in_a1q .req q1 + in_b0q .req q2 + in_b1q .req q3 + dst_0q .req q4 + dst_1q .req q5 + + modulus .req z6 + twist .req z7 + + tmp .req z8 + + save_regs + + ptrue P0.d + + ldr addr, modulus_addr + ld1rd {modulus.d}, P0/z, [addr, #0] + ld1rd {twist.d}, P0/z, [addr, #8] + + // # of elements must be divisible by 4 + mov count, count, LSR #2 + cmp count, #0 + b.eq 2f +1: + ldp 
in_a0q, in_a1q, [src_a], #32 + ldp in_b0q, in_b1q, [src_b], #32 + + sqdmulh dst_0.d, in_a0.d, in_b0.d + mul tmp.d, in_a0.d, in_b0.d + mul tmp.d, tmp.d, twist.d + sqdmulh tmp.d, tmp.d, modulus.d + shsub dst_0.d, P0/M, dst_0.d, tmp.d + + sqdmulh dst_1.d, in_a1.d, in_b1.d + mul tmp.d, in_a1.d, in_b1.d + mul tmp.d, tmp.d, twist.d + sqdmulh tmp.d, tmp.d, modulus.d + shsub dst_1.d, P0/M, dst_1.d, tmp.d + + stp dst_0q, dst_1q, [dst], #32 + + subs count, count, 1 + bne 1b +2: + restore_regs + ret diff --git a/asm/manual/keccak_f1600/keccak_f1600.py b/asm/manual/keccak_f1600/keccak_f1600.py new file mode 100644 index 0000000..38c06e1 --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600.py @@ -0,0 +1,508 @@ +## MIT License +## +## Copyright (c) 2021 Arm Limited +## +## Permission is hereby granted, free of charge, to any person obtaining a copy +## of this software and associated documentation files (the "Software"), to deal +## in the Software without restriction, including without limitation the rights +## to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +## copies of the Software, and to permit persons to whom the Software is +## furnished to do so, subject to the following conditions: +## +## The above copyright notice and this permission notice shall be included in all +## copies or substantial portions of the Software. +## +## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +## AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +## LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +## OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +## SOFTWARE. 
+## + +## +## Author: Hanno Becker +## + +#------------------------------------------------# +# Miscellaneous # +#------------------------------------------------# + +def rev_dict(d): + return { v:k for k,v in d.items() } + +#------------------------------------------------# +# Data from the Keccak-f1600 specification # +#------------------------------------------------# + +# Keccak-f1600 state indices +idxs = [(x,y) for x in range(0,5) for y in range(0,5)] + +# Permutation +perm = {} +for x,y in idxs: + xp,yp = y, (2*x+3*y) % 5 + perm[xp,yp] = (x,y) +perm_inv = rev_dict(perm) + +# Rotation offsets +rot = [ + [ 0, 1, 62, 28, 27 ], + [ 36, 44, 6, 55, 20 ], + [ 3, 10, 43, 25, 39 ], + [ 41, 45, 15, 21, 8 ], + [ 18, 2, 61, 56, 14 ] ] + +#------------------------------------------------# +# Helper for register allocations # +#------------------------------------------------# + +v84a = False +delay_rotation = True + +if v84a: + num_registers = 32 +else: + num_registers = 31 + +# Helper to manage the available registers +class RegList(): + def __init__(self, regs): + self._orig_regs = regs + regs.reverse() + self._regs = regs + self._free = [] + for r in regs: + self._free.append(r) + self._alloc = [] + def alloc(self,reg=None): + if reg == None: + assert len(self._free) > 0 + reg = self._free.pop() + else: + assert reg in self._free + self._free.remove(reg) + self._alloc.append(reg) + return reg + def free(self,reg): + assert reg in self._regs + assert reg in self._alloc + self._alloc.remove(reg) + self._free.append(reg) + def reset(self): + self.__init__(self._orig_regs) + +regs = RegList(list(range(0,num_registers))) + +#------------------------------------------------# +# Actual work # +#------------------------------------------------# + +# How to label the Keccak-f1600 state in the code +def label(x,y): + y_label = "bgkms" + x_label = "aeiou" + return f"{y_label[y]}{x_label[x]}" + +def lbl_A(x,y,q=False): + if q == False or not v84a: + return f"A{label(x,y)}" + 
else: + return f"A{label(x,y)}q" +def lbl_B(x,y): + return f"A{label(x,y)}_" +def lbl_C(x): + return f"C{x}" +def lbl_D(x): + return f"E{x}" + +def eor5(d,s0,s1,s2,s3,s4, + s0_rot=0,s1_rot=0,s2_rot=0,s3_rot=0,s4_rot=0): + s = [s0,s1,s2,s3,s4] + srot = [s0_rot,s1_rot,s2_rot,s3_rot,s4_rot] + if v84a: + assert s0_rot == 0 + assert s1_rot == 0 + assert s2_rot == 0 + assert s3_rot == 0 + assert s4_rot == 0 + yield f"eor3 {d}.16b, {s0}.16b, {s1}.16b, {s2}.16b" + yield f"eor3 {d}.16b, {d}.16b, {s3}.16b, {s4}.16b" + else: + rots = [ (i,srot[i]) for i in range(0,5) ] + rots.sort(key=lambda x:x[1]) + print(f"// EOR5: {s}, {rots}") + # cur = s[rots[4][0]] + # for i in [4,3,2,1]: + # print(f"// Current delayed rotations: {rots[i][1]}, {rots[i-1][1]}") + # r = (64 - (rots[i][1] - rots[i-1][1]))%64 + # if r != 0: + # yield f"eor {d}, {s[rots[i-1][0]]}, {cur}, ROR #{r}" + # else: + # yield f"eor {d}, {s[rots[i-1][0]]}, {cur}" + # cur = d + cur = s[rots[0][0]] + for i in [1,2,3,4]: + r = (64 - (rots[i][1] - rots[0][1]))%64 + yield f"eor {d}, {cur}, {s[rots[i][0]]}, ROR #{r}" + cur = d + if rots[0][1] != 0: + yield f"ror {cur}, {cur}, {(64-rots[0][1])%64}" + # yield f"eor {d}, {s0}, {s1}" + # yield f"eor {d}, {d}, {s2}" + # yield f"eor {d}, {d}, {s3}" + # yield f"eor {d}, {d}, {s4}" + +def eor_and_rol(d,s0,s1,imm,rot=0): + if imm == 0: + if v84a: + yield f"eor {d}.16b, {s0}.16b, {s1}.16b" + else: + yield f"eor {d}, {s0}, {s1}" + else: + if v84a: + yield f"xar {d}.2d, {s0}.2d, {s1}.2d, #{(64-imm)%64}" + else: + if not delay_rotation: + yield f"eor {d}, {s0}, {s1}" + yield f"ror {d}, {d}, #{64-imm}" + else: + if rot == 0: + yield f"eor {d}, {s0}, {s1}" + else: + yield f"eor {d}, {s0}, {s1}, ROR #{(64-rot)%64}" + +def bitwise_clear_and_xor(d,s0,s1,s2,tmp=None,bic_rot=0,eor_rot=0, eor_rot2=0): + bic_rot = (64-bic_rot) % 64 + eor_rot = (64-eor_rot) % 64 + eor_rot2 = (64-eor_rot2) % 64 + if v84a: + yield f"bcax {d}.16b, {s0}.16b, {s1}.16b, {s2}.16b" + else: + assert tmp != None + if 
bic_rot == 0: + yield f"bic {tmp}, {s1}, {s2}" + else: + yield f"bic {tmp}, {s1}, {s2}, ROR #{bic_rot}" + if eor_rot != 0: + yield f"eor {d}, {tmp}, {s0}, ROR #{eor_rot}" + elif eor_rot2 != 0: + yield f"eor {d}, {s0}, {tmp}, ROR #{eor_rot2}" + else: + yield f"eor {d}, {tmp}, {s0}" + + +def rax1(d,s0,s1): + if v84a: + yield f"rax1 {d}.2d, {s0}.2d, {s1}.2d" + else: + yield f"eor {d}, {s0}, {s1}, ROR #63" + +def alloc_state(): + global s_stable + global s_stable_rev + + if not v84a: + regs.alloc(0) # Don't use x0 + + # Allocate locations for Keccak-f1600 state + # at the beginning and end of each round + s_stable = {} + for x,y in idxs: + # Not necessary, but fix allocation for ease of reading + # loc = 5*y+x + s_stable[x,y] = regs.alloc() + s_stable_rev = rev_dict(s_stable) + + if not v84a: + regs.free(0) # Don't use x0 + +def load_input(): + if v84a: + simd_width = 2 + else: + simd_width = 1 + + for y,x in idxs: + idx = 5*y+x + yield f"ldr {lbl_A(x,y,q=True)}, [input_addr, #({simd_width}*8*{idx})]" + +def store_input(): + if v84a: + simd_width = 2 + else: + simd_width = 1 + + for y,x in idxs: + idx = 5*y+x + yield f"str {lbl_A(x,y,q=True)}, [input_addr, #({simd_width}*8*{idx})]" + + + +delayed_rotations = {} +delayed_rotations_alt = {} +for x,y in idxs: + delayed_rotations[x,y] = 0 + delayed_rotations_alt[x,y] = 0 + +def generate_round(): + global c + global d + global s_tmp + global delayed_rotations + global delayed_rotations_alt + + # SPECIFICATION: + # C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 + + c = {} + for x in range(0,5): + c[x] = regs.alloc() + yield "" + + for x in range(0,5): + yield from eor5(f"{lbl_C(x)}", + f"{lbl_A(x,0)}", + f"{lbl_A(x,1)}", + f"{lbl_A(x,2)}", + f"{lbl_A(x,3)}", + f"{lbl_A(x,4)}", + s0_rot = delayed_rotations[x,0], + s1_rot = delayed_rotations[x,1], + s2_rot = delayed_rotations[x,2], + s3_rot = delayed_rotations[x,3], + s4_rot = delayed_rotations[x,4]) + + yield "" + + # SPECIFICATION: + # D[x] = C[x-1] 
xor rot(C[x+1],1), for x in 0..4 + + # Overlap D[] and C[] except for one register, to keep + # the total # of registers for C[], D[] down to 6 + # (we already allocate 25 for the state) + d = {} + for x in range(0,5): + if x == 1: + d[x] = regs.alloc() + else: + d[x] = c[(x-1)%5] + yield "" + + x_order = [1,3,0,2,4] + for x in x_order: + xm = (x-1)%5 + xp = (x+1)%5 + yield from rax1(lbl_D(x),lbl_C(xm),lbl_C(xp)) + + regs.free(c[0]) + # SPECIFICATION: + # A[x,y] = A[x,y] xor D[x], for (x,y) in (0..4,0..4) + # B[y,2*x+3*y] = rot(A[x,y], r[x,y]), for (x,y) in (0..4,0..4) + + # Compute rho and pi steps into temporary state, making sure to overwrite + # stable state only after it has been used. Consequently, we start with + # one of the two temporary states which uses a fresh register + + yield "" + + # Order of rows for xi-step + row_order = {0:1,1:2,2:3,3:4,4:0} + # Where to start each row in the xi-step + order_bases = [(0,1) for _ in range(0,5)] + row_order_rev = rev_dict(row_order) + last_row = row_order[4] + + # Assign registers for temporary Keccak-f1600 state + s_tmp = {} + for x,y in idxs: + if x in order_bases[y]: + if x == order_bases[y][0]: + idx=0 + else: + idx=1 + row_idx = row_order_rev[y] + if row_idx == 4: + s_tmp[x,y] = None # Allocation later + else: + next_y = row_order[row_idx+1] + s_tmp[x,y] = s_stable[order_bases[next_y][idx],next_y] + else: + s_tmp[x,y] = s_stable[x,y] + + loc = regs.alloc() + x = order_bases[last_row][0] + s_tmp[x, last_row] = loc + s_tmp_rev = rev_dict(s_tmp) + total = 0 + + while loc in s_tmp_rev.keys(): + xp,yp = s_tmp_rev[loc] + x,y = perm[xp,yp] + yield from eor_and_rol(f"{lbl_B(xp,yp)}", + f"{lbl_D(x)}", + f"{lbl_A(x,y)}",rot[y][x], + rot=delayed_rotations[x,y]) + loc = s_stable[x,y] + total += 1 + # The row order and order base is experimentally chosen in such a way + # that we have only two chains, one of length 24 and one of length 1. 
+ # This means that after processing the length 24 chain, 4 out of 5 d[i] + # temporaries are not needed anymore, so we can use one of them for the + # second temporary state. + # This is only strictly necessary for the scalar case, where we have 31 registers. + assert total == 24 + + yield "" + + xp = order_bases[last_row][1] + yp = last_row + # Confirm again that this is a length 1 chain + x,y=perm[xp,yp] + # We can now free all but one D[x] + assert s_stable[x,y] not in s_tmp_rev.keys() + for i in [ i for i in range(0,5) if i != x ]: + regs.free(d[i]) + loc = regs.alloc() + s_tmp[xp,yp] = loc + s_tmp_rev = rev_dict(s_tmp) + yield from eor_and_rol(f"{lbl_B(xp,yp)}", + f"{lbl_D(x)}", + f"{lbl_A(x,y)}", + rot[y][x], + rot=delayed_rotations[x,y]) + regs.free(d[x]) + + yield "" + yield "// xi step" + + # xi-step + # + # SPECIFICATION: + # A[x,y] = B[x,y] xor ((not B[x+1,y]) and B[x+2,y]), for (x,y) in (0..4,0..4) + # + # We compute this in a specific order of rows, and order within row + + if not v84a: + global tmp + tmp = regs.alloc() + + for row in range(0,5): + y = row_order[row] + yield f"// Row {y}" + base_x = order_bases[y][0] + for offset in range(0,5): + x = (base_x + offset) % 5 + xp = (x+1)%5 + xpp = (x+2)%5 + + if delay_rotation: + xr ,yr = perm[x ,y] + xpr ,ypr = perm[xp ,y] + xppr,yppr = perm[xpp,y] + r = rot[yr] [xr] + rp = rot[ypr] [xpr] + rpp = rot[yppr][xppr] + + # We're looking at an expression of the form + # (A <<< x) XOR (not(B <<< y) AND (C <<< z)) + # and want to write it as a composition of + # XOR-with-ROT, BIC-with-ROT and ROT. + # There are two possibilities: + # 1) (A XOR (not (B <<< (y-z)) AND C) <<< (z-x)) <<< x + # or + # 2) ((not (B <<< (y-z) AND C)) XOR (A <<< (x-z))) <<< z + # + # If z is zero, we go for 2). 
Otherwiswe, we go for 1) + + if r != 0: + yield from bitwise_clear_and_xor(f"{lbl_A(x,y)}", + f"{lbl_B(x,y)}", + f"{lbl_B(xpp,y)}", + f"{lbl_B(xp,y)}", + tmp="tmp", + bic_rot=rp-rpp, + eor_rot=r-rpp) + delayed_rotations[x,y] = rpp + delayed_rotations_alt[x,y] = r + else: + yield from bitwise_clear_and_xor(f"{lbl_A(x,y)}", + f"{lbl_B(x,y)}", + f"{lbl_B(xpp,y)}", + f"{lbl_B(xp,y)}", + tmp="tmp", + bic_rot=rp-rpp, + eor_rot2=rpp) + delayed_rotations[x,y] = r + delayed_rotations_alt[x,y] = rpp + + else: + yield from bitwise_clear_and_xor(f"{lbl_A(x,y)}", + f"{lbl_B(x,y)}", + f"{lbl_B(xpp,y)}", + f"{lbl_B(xp,y)}", + tmp="tmp") + + yield "" + + for x,y in idxs: + yield f"// Shift for {lbl_A(x,y)}: {delayed_rotations[x,y]} (alt {delayed_rotations_alt[x,y]})" + + + if not v84a: + regs.free(tmp) + + # iota step + yield "// iota step" + yield "# FILL IN" + yield f"eor {lbl_A(0,0)}, {lbl_A(0,0)}, CONSTANT" + +def print_allocations(): + + for y,x in idxs: + if v84a: + yield f"{lbl_A(x,y)} .req v{s_stable[x,y]}" + else: + yield f"{lbl_A(x,y)} .req x{s_stable[x,y]}" + for y,x in idxs: + if v84a: + yield f"{lbl_A(x,y,q=True)} .req q{s_stable[x,y]}" + + yield "" + + # Print allocations + for y,x in idxs: + if v84a: + yield f"{lbl_B(x,y)} .req v{s_tmp[x,y]}" + else: + yield f"{lbl_B(x,y)} .req x{s_tmp[x,y]}" + yield "" + + for x in range(0,5): + if v84a: + yield f"{lbl_C(x)} .req v{c[x]}" + yield f"{lbl_D(x)} .req v{d[x]}" + else: + yield f"{lbl_C(x)} .req x{c[x]}" + yield f"{lbl_D(x)} .req x{d[x]}" + + if not v84a: + yield "" + yield f"tmp .req {tmp}" + +def codegen(): + alloc_state() + yield from generate_round() + yield "//////////////////////////////////////////////////////////" + regs.reset() + yield from generate_round() + yield "//////////////////////////////////////////////////////////" + yield from store_input() + yield "//////////////////////////////////////////////////////////" + yield from load_input() + yield "//////////////////////////////////////////////////////////" 
+ yield from print_allocations() + +for line in codegen(): + print(line) diff --git a/asm/manual/keccak_f1600/keccak_f1600_variants.h b/asm/manual/keccak_f1600/keccak_f1600_variants.h new file mode 100644 index 0000000..400e90f --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_variants.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#ifndef KECCAK_F1600_MANUAL_H +#define KECCAK_F1600_MANUAL_H + +#include <stdint.h> + +#define KECCAK_F1600_X1_STATE_SIZE_BITS 1600 +#define KECCAK_F1600_X1_STATE_SIZE_BYTES (KECCAK_F1600_X1_STATE_SIZE_BITS/8) +#define KECCAK_F1600_X1_STATE_SIZE_UINT64 (KECCAK_F1600_X1_STATE_SIZE_BYTES/8) + +#define KECCAK_F1600_X2_STATE_SIZE_BITS (2*1600) +#define KECCAK_F1600_X2_STATE_SIZE_BYTES (KECCAK_F1600_X2_STATE_SIZE_BITS/8) +#define KECCAK_F1600_X2_STATE_SIZE_UINT64 (KECCAK_F1600_X2_STATE_SIZE_BYTES/8) + +/* Third party implementations */ +void keccak_f1600_x1_scalar_C ( uint64_t state[KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_scalar_C ( uint64_t state[KECCAK_F1600_X2_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_bas ( uint64_t state[KECCAK_F1600_X2_STATE_SIZE_UINT64] ); +#include <arm_neon.h> +typedef uint64x2_t v128; +void keccak_f1600_x2_neon_C_cothan( v128 state[25] ); + +/* PQAX implementations */ +void keccak_f1600_x2_v84a_asm_v1( uint64_t state[KECCAK_F1600_X2_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_v84a_asm_v1p0( uint64_t state[KECCAK_F1600_X2_STATE_SIZE_UINT64] ); +void keccak_f1600_x4_v84a_asm_v1p0( uint64_t state[KECCAK_F1600_X2_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_v84a_asm_v2( uint64_t state[KECCAK_F1600_X2_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_v84a_asm_v2p0( uint64_t state[KECCAK_F1600_X2_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_v84a_asm_v2p1( uint64_t state[KECCAK_F1600_X2_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_v84a_asm_v2p2( uint64_t state[KECCAK_F1600_X2_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_v84a_asm_v2p3( uint64_t state[KECCAK_F1600_X2_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_v84a_asm_v2p4( uint64_t state[KECCAK_F1600_X2_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_v84a_asm_v2p5( uint64_t state[KECCAK_F1600_X2_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_v84a_asm_v2p6( uint64_t state[KECCAK_F1600_X2_STATE_SIZE_UINT64] ); +void
keccak_f1600_x2_v84a_asm_v2pp0( uint64_t state[KECCAK_F1600_X2_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_v84a_asm_v2pp1( uint64_t state[KECCAK_F1600_X2_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_v84a_asm_v2pp2( uint64_t state[KECCAK_F1600_X2_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_v84a_asm_v2pp3( uint64_t state[KECCAK_F1600_X2_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_v84a_asm_v2pp4( uint64_t state[KECCAK_F1600_X2_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_v84a_asm_v2pp5( uint64_t state[KECCAK_F1600_X2_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_v84a_asm_v2pp6( uint64_t state[KECCAK_F1600_X2_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_v84a_asm_v2pp7( uint64_t state[KECCAK_F1600_X2_STATE_SIZE_UINT64] ); + +void keccak_f1600_x1_scalar_C_original( uint64_t state[KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x1_scalar_C_v0( uint64_t state[KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x1_scalar_C_v1( uint64_t state[KECCAK_F1600_X1_STATE_SIZE_UINT64] ); + +void keccak_f1600_x1_scalar_asm_v1( uint64_t state[KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x1_scalar_asm_v2( uint64_t state[KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x1_scalar_asm_v3( uint64_t state[KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x1_scalar_asm_v4( uint64_t state[KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x1_scalar_asm_v5( uint64_t state[KECCAK_F1600_X1_STATE_SIZE_UINT64] ); + +void keccak_f1600_x4_scalar_asm_v1( uint64_t state[4*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x4_scalar_asm_v5( uint64_t state[4*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); + +void keccak_f1600_x3_hybrid_asm_v3p( uint64_t state[3*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x3_hybrid_asm_v6( uint64_t state[3*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x3_hybrid_asm_v7( uint64_t state[3*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); + + +void keccak_f1600_x4_hybrid_asm_v1 ( uint64_t 
state[4*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x4_hybrid_asm_v2 ( uint64_t state[4*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x4_hybrid_asm_v2p0( uint64_t state[4*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x4_hybrid_asm_v3 ( uint64_t state[4*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x4_hybrid_asm_v3p( uint64_t state[4*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x4_hybrid_asm_v3pp( uint64_t state[4*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x4_hybrid_asm_v4 ( uint64_t state[4*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x4_hybrid_asm_v4p ( uint64_t state[4*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x4_hybrid_asm_v5 ( uint64_t state[4*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x4_hybrid_asm_v5p ( uint64_t state[4*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x4_hybrid_asm_v6 ( uint64_t state[4*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x4_hybrid_asm_v7 ( uint64_t state[4*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x4_hybrid_asm_v8 ( uint64_t state[4*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); + +void keccak_f1600_x5_hybrid_asm_v8 ( uint64_t state[4*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x5_hybrid_asm_v8p ( uint64_t state[4*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); + +void keccak_f1600_x2_hybrid_asm_v1 ( uint64_t state[2*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_hybrid_asm_v2p0 ( uint64_t state[2*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_hybrid_asm_v2p1 ( uint64_t state[2*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_hybrid_asm_v2p2 ( uint64_t state[2*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_hybrid_asm_v2pp0 ( uint64_t state[2*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_hybrid_asm_v2pp1 ( uint64_t state[2*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_hybrid_asm_v2pp2 ( uint64_t 
state[2*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); + +#endif diff --git a/asm/manual/keccak_f1600/keccak_f1600_x1_scalar_C.c b/asm/manual/keccak_f1600/keccak_f1600_x1_scalar_C.c new file mode 100644 index 0000000..2feca64 --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x1_scalar_C.c @@ -0,0 +1,591 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +// Derived from public domain implementation +// in crypto_hash/keccakc512/simple/ from http://bench.cr.yp.to/supercop.html +// by Ronny Van Keer. 
+ +#include "keccak_f1600_variants.h" + +#define KECCAK_F1600_ROUNDS 24 + +static const uint64_t round_constants[KECCAK_F1600_ROUNDS] = +{ + (uint64_t)0x0000000000000001ULL, + (uint64_t)0x0000000000008082ULL, + (uint64_t)0x800000000000808aULL, + (uint64_t)0x8000000080008000ULL, + (uint64_t)0x000000000000808bULL, + (uint64_t)0x0000000080000001ULL, + (uint64_t)0x8000000080008081ULL, + (uint64_t)0x8000000000008009ULL, + (uint64_t)0x000000000000008aULL, + (uint64_t)0x0000000000000088ULL, + (uint64_t)0x0000000080008009ULL, + (uint64_t)0x000000008000000aULL, + (uint64_t)0x000000008000808bULL, + (uint64_t)0x800000000000008bULL, + (uint64_t)0x8000000000008089ULL, + (uint64_t)0x8000000000008003ULL, + (uint64_t)0x8000000000008002ULL, + (uint64_t)0x8000000000000080ULL, + (uint64_t)0x000000000000800aULL, + (uint64_t)0x800000008000000aULL, + (uint64_t)0x8000000080008081ULL, + (uint64_t)0x8000000000008080ULL, + (uint64_t)0x0000000080000001ULL, + (uint64_t)0x8000000080008008ULL +}; + +/* Note: It should not be necessary to use inline assembly here, but + * compilers don't seem to reliably detect potential uses of + * EOR-with-ROR and BIC-with-ROR at the time of writing. 
*/ + +#if defined(inline) +#undef inline +#endif + +#define inline __attribute__((unused)) inline + +#define GEN_BIC_ROL(imm) \ +static inline uint64_t bic_rol_ ## imm ( uint64_t b, uint64_t a ) \ +{ \ + uint64_t res = 0; \ + __asm ("bic %[result], %[input_a], %[input_b], ROR #(64-" #imm ")" \ + : [result] "=r" (res) \ + : [input_a] "r" (a), [input_b] "r" (b) \ + ); \ + return( res ); \ +} + +#define GEN_XOR_ROL(imm) \ +static inline uint64_t xor_rol_ ## imm ( uint64_t b, uint64_t a ) \ +{ \ + uint64_t res = 0; \ + __asm ("eor %[result], %[input_a], %[input_b], ROR #(64-" #imm ")" \ + : [result] "=r" (res) \ + : [input_a] "r" (a), [input_b] "r" (b) \ + ); \ + return( res ); \ +} + +#define GEN_ROL(imm) \ +static inline uint64_t rol_ ## imm ( uint64_t a ) \ +{ \ + uint64_t res = 0; \ + __asm ("ROR %[result], %[input_a], #(64-" #imm ")" \ + : [result] "=r" (res) \ + : [input_a] "r" (a) \ + ); \ + return( res ); \ +} + +#define GEN_ALL(F) \ + F(0) F(1) F(2) F(3) F(4) F(5) F(6) F(7) \ + F(8) F(9) F(10) F(11) F(12) F(13) F(14) F(15) \ + F(16) F(17) F(18) F(19) F(20) F(21) F(22) F(23) \ + F(24) F(25) F(26) F(27) F(28) F(29) F(30) F(31) \ + F(32) F(33) F(34) F(35) F(36) F(37) F(38) F(39) \ + F(40) F(41) F(42) F(43) F(44) F(45) F(46) F(47) \ + F(48) F(49) F(50) F(51) F(52) F(53) F(54) F(55) \ + F(56) F(57) F(58) F(59) F(60) F(61) F(62) F(63) + +GEN_ALL(GEN_BIC_ROL) +GEN_ALL(GEN_ROL) +GEN_ALL(GEN_XOR_ROL) + +void keccak_f1600_x1_scalar_C_v0( uint64_t state[KECCAK_F1600_X1_STATE_SIZE_UINT64] ) +{ + int round; + + uint64_t Aba, Abe, Abi, Abo, Abu; + uint64_t Aga, Age, Agi, Ago, Agu; + uint64_t Aka, Ake, Aki, Ako, Aku; + uint64_t Ama, Ame, Ami, Amo, Amu; + uint64_t Asa, Ase, Asi, Aso, Asu; + uint64_t BCa, BCe, BCi, BCo, BCu; + uint64_t Da, De, Di, Do, Du; + + uint64_t tmp0, tmp1; + + Aba = state[ 0]; Abe = state[ 1]; Abi = state[ 2]; Abo = state[ 3]; + Abu = state[ 4]; Aga = state[ 5]; Age = state[ 6]; Agi = state[ 7]; + Ago = state[ 8]; Agu = state[ 9]; Aka = state[10]; Ake = 
state[11]; + Aki = state[12]; Ako = state[13]; Aku = state[14]; Ama = state[15]; + Ame = state[16]; Ami = state[17]; Amo = state[18]; Amu = state[19]; + Asa = state[20]; Ase = state[21]; Asi = state[22]; Aso = state[23]; + Asu = state[24]; + + BCa = Aba^Aga^Aka^Ama^Asa; + BCe = Abe^Age^Ake^Ame^Ase; + BCi = Abi^Agi^Aki^Ami^Asi; + BCo = Abo^Ago^Ako^Amo^Aso; + BCu = Abu^Agu^Aku^Amu^Asu; + + Da =xor_rol_1(BCe,BCu); + De =xor_rol_1(BCi,BCa); + Di =xor_rol_1(BCo,BCe); + Do =xor_rol_1(BCu,BCi); + Du =xor_rol_1(BCa,BCo); + + tmp0 = Abe; + Aba = Aba ^ Da; Abe = Age ^ De; Age = Agu ^ Du; Agu = Asi ^ Di; + Asi = Aku ^ Du; Aku = Asa ^ Da; Asa = Abi ^ Di; Abi = Aki ^ Di; + Aki = Ako ^ Do; Ako = Amu ^ Du; Amu = Aso ^ Do; Aso = Ama ^ Da; + Ama = Abu ^ Du; Abu = Asu ^ Du; Asu = Ase ^ De; Ase = Ago ^ Do; + Ago = Ame ^ De; Ame = Aga ^ Da; Aga = Abo ^ Do; Abo = Amo ^ Do; + Amo = Ami ^ Di; Ami = Ake ^ De; Ake = Agi ^ Di; Agi = Aka ^ Da; + Aka = tmp0 ^ De; + + tmp0 = Aba ^ rol_43(bic_rol_1(Abe, Abi)); + tmp1 = xor_rol_23(Abe,bic_rol_22(Abi, Abo)); + Abi = xor_rol_29(Abi,bic_rol_7 (Abo, Abu)); + Abo = xor_rol_21(Abo,bic_rol_14(Abu, Aba)); + Abu = xor_rol_34(Abu,bic_rol_20(Aba, Abe)); + Aba = tmp0; + Abe = tmp1; + + tmp0 = xor_rol_25(Aga,bic_rol_17(Age, Agi)); + tmp1 = xor_rol_39(Age,bic_rol_22(Agi, Ago)); + Agi = xor_rol_6(Agi,bic_rol_48(Ago, Agu)); + Ago = xor_rol_17(Ago,bic_rol_33(Agu, Aga)); + Agu = xor_rol_41(Agu,bic_rol_8 (Aga, Age)); + Aga = tmp0; + Age = tmp1; + + tmp0 = xor_rol_40(Aka,bic_rol_45(Ake, Aki)); + tmp1 = xor_rol_62(Ake,bic_rol_17(Aki, Ako)); + Aki = xor_rol_7(Aki,bic_rol_54(Ako, Aku)); + Ako = xor_rol_7(Ako,bic_rol_17(Aku, Aka)); + Aku = xor_rol_12(Aku,bic_rol_59(Aka, Ake)); + Aka = tmp0; + Ake = tmp1; + + tmp0 = xor_rol_17(Ama,bic_rol_26(Ame, Ami)); + tmp1 = xor_rol_21(Ame,bic_rol_59(Ami, Amo)); + Ami = xor_rol_18(Ami,bic_rol_23(Amo, Amu)); + Amo = xor_rol_52(Amo,bic_rol_29(Amu, Ama)); + Amu = xor_rol_20(Amu,bic_rol_55(Ama, Ame)); + Ama = tmp0; + Ame = tmp1; + + 
tmp0 = xor_rol_23(Asa,bic_rol_16(Ase, Asi)); + tmp1 = xor_rol_14(Ase,bic_rol_62(Asi, Aso)); + Asi = xor_rol_37(Asi,bic_rol_39(Aso, Asu)); + Aso = xor_rol_43(Aso,bic_rol_4 (Asu, Asa)); + Asu = xor_rol_11(Asu,bic_rol_7 (Asa, Ase)); + Asa = tmp0; + Ase = tmp1; + + Aba ^= (uint64_t)round_constants[0]; + + for(round = 1; round < KECCAK_F1600_ROUNDS; round++ ) + { + + BCa = xor_rol_14( Asa, Aka); + BCa = xor_rol_15( BCa, Ama ); + BCa = xor_rol_7 ( BCa, Aga ); + BCa = xor_rol_3 ( BCa, Aba ); + + BCe = xor_rol_4 ( Age, Ase ); + BCe = xor_rol_20( BCe, Abe ); + BCe = xor_rol_6 ( BCe, Ame ); + BCe = xor_rol_7 ( BCe, Ake ); + BCe = rol_8( BCe ); + + BCi = xor_rol_5 ( Agi, Ami ); + BCi = xor_rol_38( BCi, Aki ); + BCi = xor_rol_4 ( BCi, Abi ); + BCi = xor_rol_12( BCi, Asi ); + BCi = rol_2( BCi ); + + BCo = xor_rol_34( Aso, Ago ); + BCo = xor_rol_1 ( BCo, Amo ); + BCo = xor_rol_26( BCo, Ako ); + BCo = xor_rol_1 ( BCo, Abo ); + + BCu = xor_rol_11( Asu, Abu ); + BCu = xor_rol_8 ( BCu, Amu ); + BCu = xor_rol_16( BCu, Agu ); + BCu = xor_rol_14( BCu, Aku ); + BCu = rol_6( BCu ); + + Da =xor_rol_1(BCe,BCu); + De =xor_rol_1(BCi,BCa); + Di =xor_rol_1(BCo,BCe); + Do =xor_rol_1(BCu,BCi); + Du =xor_rol_1(BCa,BCo); + + tmp0 = Abe; + Aba = Aba ^ Da; + + Abe = xor_rol_45(Age,De); + Age = xor_rol_20(Agu,Du); + Agu = xor_rol_2 (Asi,Di); + Asi = xor_rol_6 (Aku,Du); + Aku = xor_rol_39(Asa,Da); + Asa = xor_rol_14(Abi,Di); + Abi = xor_rol_18(Aki,Di); + Aki = xor_rol_1 (Ako,Do); + Ako = xor_rol_36(Amu,Du); + Amu = xor_rol_62(Aso,Do); + Aso = xor_rol_10(Ama,Da); + Ama = xor_rol_44(Abu,Du); + Abu = xor_rol_55(Asu,Du); + Asu = xor_rol_41(Ase,De); + Ase = xor_rol_28(Ago,Do); + Ago = xor_rol_15(Ame,De); + Ame = xor_rol_3(Aga,Da); + Aga = Abo ^ Do; + Abo = xor_rol_27(Amo,Do); + Amo = xor_rol_56(Ami,Di); + Ami = xor_rol_8 (Ake,De); + Ake = xor_rol_61(Agi,Di); + Agi = xor_rol_25(Aka,Da); + Aka = xor_rol_21(tmp0, De); + + tmp0 = xor_rol_43(bic_rol_1(Abe, Abi), Aba ); + tmp1 = xor_rol_23(Abe, bic_rol_22(Abi, 
Abo) ); + Abi = xor_rol_29(Abi, bic_rol_7 (Abo, Abu) ); + Abo = xor_rol_21(Abo, bic_rol_14(Abu, Aba) ); + Abu = xor_rol_34(Abu, bic_rol_20(Aba, Abe) ); + Aba = tmp0; + Abe = tmp1; + + tmp0 = xor_rol_25(Aga, bic_rol_17(Age, Agi) ); + tmp1 = xor_rol_39(Age, bic_rol_22(Agi, Ago) ); + Agi = xor_rol_6 (Agi, bic_rol_48(Ago, Agu) ); + Ago = xor_rol_17(Ago, bic_rol_33(Agu, Aga) ); + Agu = xor_rol_41(Agu, bic_rol_8 (Aga, Age) ); + Aga = tmp0; + Age = tmp1; + + tmp0 = xor_rol_40(Aka, bic_rol_45(Ake, Aki) ); + tmp1 = xor_rol_62(Ake, bic_rol_17(Aki, Ako) ); + Aki = xor_rol_7 (Aki, bic_rol_54(Ako, Aku) ); + Ako = xor_rol_7 (Ako, bic_rol_17(Aku, Aka) ); + Aku = xor_rol_12(Aku, bic_rol_59(Aka, Ake) ); + Aka = tmp0; + Ake = tmp1; + + tmp0 = xor_rol_17(Ama, bic_rol_26(Ame, Ami) ); + tmp1 = xor_rol_21(Ame, bic_rol_59(Ami, Amo) ); + Ami = xor_rol_18(Ami, bic_rol_23(Amo, Amu) ); + Amo = xor_rol_52(Amo, bic_rol_29(Amu, Ama) ); + Amu = xor_rol_20(Amu, bic_rol_55(Ama, Ame) ); + Ama = tmp0; + Ame = tmp1; + + tmp0 = xor_rol_23(Asa, bic_rol_16(Ase, Asi) ); + tmp1 = xor_rol_14(Ase, bic_rol_62(Asi, Aso) ); + Asi = xor_rol_37(Asi, bic_rol_39(Aso, Asu) ); + Aso = xor_rol_43(Aso, bic_rol_4 (Asu, Asa) ); + Asu = xor_rol_11(Asu, bic_rol_7 (Asa, Ase) ); + Asa = tmp0; + Ase = tmp1; + + Aba ^= (uint64_t)round_constants[round]; + + } + + Aga = rol_3 (Aga); Aka = rol_25(Aka); Ama = rol_10(Ama); Asa = rol_39(Asa); + Abe = rol_21(Abe); Age = rol_45(Age); Ake = rol_8 (Ake); Ame = rol_15(Ame); + Ase = rol_41(Ase); Abi = rol_14(Abi); Agi = rol_61(Agi); Aki = rol_18(Aki); + Ami = rol_56(Ami); Asi = rol_2 (Asi); Ago = rol_28(Ago); Ako = rol_1 (Ako); + Amo = rol_27(Amo); Aso = rol_62(Aso); Abu = rol_44(Abu); Agu = rol_20(Agu); + Aku = rol_6 (Aku); Amu = rol_36(Amu); Asu = rol_55(Asu); + + state[ 0] = Aba; state[ 1] = Abe; state[ 2] = Abi; state[ 3] = Abo; + state[ 4] = Abu; state[ 5] = Aga; state[ 6] = Age; state[ 7] = Agi; + state[ 8] = Ago; state[ 9] = Agu; state[10] = Aka; state[11] = Ake; + state[12] = 
Aki; state[13] = Ako; state[14] = Aku; state[15] = Ama; + state[16] = Ame; state[17] = Ami; state[18] = Amo; state[19] = Amu; + state[20] = Asa; state[21] = Ase; state[22] = Asi; state[23] = Aso; + state[24] = Asu; +} + +void keccak_f1600_x1_scalar_C_v1( uint64_t state[KECCAK_F1600_X1_STATE_SIZE_UINT64] ) +{ + int round; + + uint64_t Aba, Abe, Abi, Abo, Abu; + uint64_t Aga, Age, Agi, Ago, Agu; + uint64_t Aka, Ake, Aki, Ako, Aku; + uint64_t Ama, Ame, Ami, Amo, Amu; + uint64_t Asa, Ase, Asi, Aso, Asu; + uint64_t BCa, BCe, BCi, BCo, BCu; + uint64_t Da, De, Di, Do, Du; + + uint64_t tmp0, tmp1; + + Aba = state[ 0]; Abe = state[ 1]; Abi = state[ 2]; Abo = state[ 3]; + Abu = state[ 4]; Aga = state[ 5]; Age = state[ 6]; Agi = state[ 7]; + Ago = state[ 8]; Agu = state[ 9]; Aka = state[10]; Ake = state[11]; + Aki = state[12]; Ako = state[13]; Aku = state[14]; Ama = state[15]; + Ame = state[16]; Ami = state[17]; Amo = state[18]; Amu = state[19]; + Asa = state[20]; Ase = state[21]; Asi = state[22]; Aso = state[23]; + Asu = state[24]; + + BCa = Aba^Aga^Aka^Ama^Asa; + BCe = Abe^Age^Ake^Ame^Ase; + BCi = Abi^Agi^Aki^Ami^Asi; + BCo = Abo^Ago^Ako^Amo^Aso; + BCu = Abu^Agu^Aku^Amu^Asu; + + Da =xor_rol_1(BCe,BCu); + De =xor_rol_1(BCi,BCa); + Di =xor_rol_1(BCo,BCe); + Do =xor_rol_1(BCu,BCi); + Du =xor_rol_1(BCa,BCo); + + tmp0 = Abu; + Agu = Agu ^ Du; Abu = Age ^ De; Age = Ame ^ De; Ame = Ami ^ Di; + Ami = Aso ^ Do; Aso = Abi ^ Di; Abi = Asu ^ Du; Asu = Ago ^ Do; + Ago = Abo ^ Do; Abo = Aba ^ Da; Aba = Aki ^ Di; Aki = Asa ^ Da; + Asa = Aku ^ Du; Aku = Agi ^ Di; Agi = Asi ^ Di; Asi = Ase ^ De; + Ase = Ama ^ Da; Ama = Ake ^ De; Ake = Amu ^ Du; Amu = Aga ^ Da; + Aga = Aka ^ Da; Aka = Ako ^ Do; Ako = Abe ^ De; Abe = Amo ^ Do; + Amo = tmp0 ^ Du; + + tmp0 = bic_rol_1 (Abu, Aba ); + tmp0 = xor_rol_43(tmp0, Abo ); + tmp1 = bic_rol_22(Aba, Abe ); + tmp1 = xor_rol_23(Abu, tmp1); + Abu = bic_rol_20(Abo, Abu ); + Abu = xor_rol_34(Abi, Abu ); + Abo = bic_rol_14(Abi, Abo ); + Abo = xor_rol_21(Abe, Abo 
); + Abi = bic_rol_7 (Abe, Abi ); + Abi = xor_rol_29(Aba, Abi ); + Aba = tmp0; + Abe = tmp1; + + tmp0 = bic_rol_17(Agu, Aga ); + tmp0 = xor_rol_25(Ago, tmp0); + tmp1 = bic_rol_22(Aga, Age ); + tmp1 = xor_rol_39(Agu, tmp1); + Agu = bic_rol_8 (Ago, Agu ); + Agu = xor_rol_41(Agi, Agu ); + Ago = bic_rol_33(Agi, Ago ); + Ago = xor_rol_17(Age, Ago ); + Agi = bic_rol_48(Age, Agi ); + Agi = xor_rol_6 (Aga, Agi ); + Aga = tmp0; + Age = tmp1; + + tmp0 = bic_rol_45(Aku, Aka ); + tmp0 = xor_rol_40(Ako, tmp0); + tmp1 = bic_rol_17(Aka, Ake ); + tmp1 = xor_rol_62(Aku, tmp1); + Aku = bic_rol_59(Ako, Aku ); + Aku = xor_rol_12(Aki, Aku ); + Ako = bic_rol_17(Aki, Ako ); + Ako = xor_rol_7 (Ake, Ako ); + Aki = bic_rol_54(Ake, Aki ); + Aki = xor_rol_7 (Aka, Aki ); + Aka = tmp0; + Ake = tmp1; + + tmp0 = bic_rol_26(Amu, Ama ); + tmp0 = xor_rol_17(Amo, tmp0); + tmp1 = bic_rol_59(Ama, Ame ); + tmp1 = xor_rol_21(Amu, tmp1); + Amu = bic_rol_55(Amo, Amu ); + Amu = xor_rol_20(Ami, Amu ); + Amo = bic_rol_29(Ami, Amo ); + Amo = xor_rol_52(Ame, Amo ); + Ami = bic_rol_23(Ame, Ami ); + Ami = xor_rol_18(Ama, Ami ); + Ama = tmp0; + Ame = tmp1; + + tmp0 = bic_rol_16(Asu, Asa ); + tmp0 = xor_rol_23(Aso, tmp0); + tmp1 = bic_rol_62(Asa, Ase ); + tmp1 = xor_rol_14(Asu, tmp1); + Asu = bic_rol_7 (Aso, Asu ); + Asu = xor_rol_11(Asi, Asu ); + Aso = bic_rol_4 (Asi, Aso ); + Aso = xor_rol_43(Ase, Aso ); + Asi = bic_rol_39(Ase, Asi ); + Asi = xor_rol_37(Asa, Asi ); + Asa = tmp0; + Ase = tmp1; + + Aba ^= (uint64_t)round_constants[0]; + + for(round = 1; round < KECCAK_F1600_ROUNDS; round++ ) { + + BCa = xor_rol_14( Asa, Aka); + BCe = xor_rol_4 ( Age, Ase ); + BCi = xor_rol_5 ( Agi, Ami ); + BCo = xor_rol_34( Aso, Ago ); + BCu = xor_rol_11( Asu, Abu ); + + BCa = xor_rol_15( BCa, Ama ); + BCe = xor_rol_20( BCe, Abe ); + BCi = xor_rol_38( BCi, Aki ); + BCo = xor_rol_1 ( BCo, Amo ); + BCa = xor_rol_7 ( BCa, Aga ); + BCu = xor_rol_8 ( BCu, Amu ); + + BCa = xor_rol_3 ( BCa, Aba ); + BCe = xor_rol_6 ( BCe, Ame ); + BCi = 
xor_rol_4 ( BCi, Abi ); + BCo = xor_rol_26( BCo, Ako ); + BCu = xor_rol_16( BCu, Agu ); + + BCe = xor_rol_7 ( BCe, Ake ); + BCi = xor_rol_12( BCi, Asi ); + BCo = xor_rol_1 ( BCo, Abo ); + BCu = xor_rol_14( BCu, Aku ); + + BCe = rol_8( BCe ); + BCi = rol_2( BCi ); + BCu = rol_6( BCu ); + + Da = xor_rol_1(BCe,BCu); + De = xor_rol_1(BCi,BCa); + Di = xor_rol_1(BCo,BCe); + Do = xor_rol_1(BCu,BCi); + Du = xor_rol_1(BCa,BCo); + + Agu = xor_rol_20(Agu,Du); + tmp0 = Abu; + Abu = xor_rol_45(Age,De); + Age = xor_rol_15(Ame,De); + Ame = xor_rol_56(Ami,Di); + Ami = xor_rol_62(Aso,Do); + Aso = xor_rol_14(Abi,Di); + Abi = xor_rol_55(Asu,Du); + Asu = xor_rol_28(Ago,Do); + Ago = Abo ^ Do; + Abo = Aba ^ Da; + Aba = xor_rol_18(Aki,Di); + Aki = xor_rol_39(Asa,Da); + Asa = xor_rol_6 (Aku,Du); + Aku = xor_rol_61(Agi,Di); + Agi = xor_rol_2 (Asi,Di); + Asi = xor_rol_41(Ase,De); + Ase = xor_rol_10(Ama,Da); + Ama = xor_rol_8 (Ake,De); + Ake = xor_rol_36(Amu,Du); + Amu = xor_rol_3(Aga,Da); + Aga = xor_rol_25(Aka,Da); + Aka = xor_rol_1 (Ako,Do); + Ako = xor_rol_21(Abe,De); + Abe = xor_rol_27(Amo,Do); + Amo = xor_rol_44(tmp0,Du); + + + tmp0 = bic_rol_1 (Abu, Aba ); + tmp0 = xor_rol_43(tmp0, Abo ); + tmp1 = bic_rol_22(Aba, Abe ); + tmp1 = xor_rol_23(Abu, tmp1); + Abu = bic_rol_20(Abo, Abu ); + Abu = xor_rol_34(Abi, Abu ); + Abo = bic_rol_14(Abi, Abo ); + Abo = xor_rol_21(Abe, Abo ); + Abi = bic_rol_7 (Abe, Abi ); + Abi = xor_rol_29(Aba, Abi ); + Aba = tmp0; + Abe = tmp1; + + tmp0 = bic_rol_17(Agu, Aga ); + tmp0 = xor_rol_25(Ago, tmp0); + tmp1 = bic_rol_22(Aga, Age ); + tmp1 = xor_rol_39(Agu, tmp1); + Agu = bic_rol_8 (Ago, Agu ); + Agu = xor_rol_41(Agi, Agu ); + Ago = bic_rol_33(Agi, Ago ); + Ago = xor_rol_17(Age, Ago ); + Agi = bic_rol_48(Age, Agi ); + Agi = xor_rol_6 (Aga, Agi ); + Aga = tmp0; + Age = tmp1; + + tmp0 = bic_rol_45(Aku, Aka ); + tmp0 = xor_rol_40(Ako, tmp0); + tmp1 = bic_rol_17(Aka, Ake ); + tmp1 = xor_rol_62(Aku, tmp1); + Aku = bic_rol_59(Ako, Aku ); + Aku = xor_rol_12(Aki, Aku 
); + Ako = bic_rol_17(Aki, Ako ); + Ako = xor_rol_7 (Ake, Ako ); + Aki = bic_rol_54(Ake, Aki ); + Aki = xor_rol_7 (Aka, Aki ); + Aka = tmp0; + Ake = tmp1; + + tmp0 = bic_rol_26(Amu, Ama ); + tmp0 = xor_rol_17(Amo, tmp0); + tmp1 = bic_rol_59(Ama, Ame ); + tmp1 = xor_rol_21(Amu, tmp1); + Amu = bic_rol_55(Amo, Amu ); + Amu = xor_rol_20(Ami, Amu ); + Amo = bic_rol_29(Ami, Amo ); + Amo = xor_rol_52(Ame, Amo ); + Ami = bic_rol_23(Ame, Ami ); + Ami = xor_rol_18(Ama, Ami ); + Ama = tmp0; + Ame = tmp1; + + tmp0 = bic_rol_16(Asu, Asa ); + tmp0 = xor_rol_23(Aso, tmp0); + tmp1 = bic_rol_62(Asa, Ase ); + tmp1 = xor_rol_14(Asu, tmp1); + Asu = bic_rol_7 (Aso, Asu ); + Asu = xor_rol_11(Asi, Asu ); + Aso = bic_rol_4 (Asi, Aso ); + Aso = xor_rol_43(Ase, Aso ); + Asi = bic_rol_39(Ase, Asi ); + Asi = xor_rol_37(Asa, Asi ); + Asa = tmp0; + Ase = tmp1; + + Aba ^= (uint64_t)round_constants[round]; + + } + + Aga = rol_3 (Aga); Aka = rol_25(Aka); Ama = rol_10(Ama); Asa = rol_39(Asa); + Abe = rol_21(Abe); Age = rol_45(Age); Ake = rol_8 (Ake); Ame = rol_15(Ame); + Ase = rol_41(Ase); Abi = rol_14(Abi); Agi = rol_61(Agi); Aki = rol_18(Aki); + Ami = rol_56(Ami); Asi = rol_2 (Asi); Ago = rol_28(Ago); Ako = rol_1 (Ako); + Amo = rol_27(Amo); Aso = rol_62(Aso); Abu = rol_44(Abu); Agu = rol_20(Agu); + Aku = rol_6 (Aku); Amu = rol_36(Amu); Asu = rol_55(Asu); + + state[ 0] = Aba; state[ 1] = Abe; state[ 2] = Abi; state[ 3] = Abo; + state[ 4] = Abu; state[ 5] = Aga; state[ 6] = Age; state[ 7] = Agi; + state[ 8] = Ago; state[ 9] = Agu; state[10] = Aka; state[11] = Ake; + state[12] = Aki; state[13] = Ako; state[14] = Aku; state[15] = Ama; + state[16] = Ame; state[17] = Ami; state[18] = Amo; state[19] = Amu; + state[20] = Asa; state[21] = Ase; state[22] = Asi; state[23] = Aso; + state[24] = Asu; +} diff --git a/asm/manual/keccak_f1600/keccak_f1600_x1_scalar_asm_v1.s b/asm/manual/keccak_f1600/keccak_f1600_x1_scalar_asm_v1.s new file mode 100644 index 0000000..477272c --- /dev/null +++ 
b/asm/manual/keccak_f1600/keccak_f1600_x1_scalar_asm_v1.s @@ -0,0 +1,413 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .balign 64 +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x28 + count .req x29 + cur_const .req x30 + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round.
*/ + Aba .req x1 + Abe .req x6 + Abi .req x11 + Abo .req x16 + Abu .req x21 + Aga .req x2 + Age .req x7 + Agi .req x12 + Ago .req x17 + Agu .req x22 + Aka .req x3 + Ake .req x8 + Aki .req x13 + Ako .req x18 + Aku .req x23 + Ama .req x4 + Ame .req x9 + Ami .req x14 + Amo .req x19 + Amu .req x24 + Asa .req x5 + Ase .req x10 + Asi .req x15 + Aso .req x20 + Asu .req x25 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + Aba_ .req x0 + Abe_ .req x28 + Abi_ .req x11 + Abo_ .req x16 + Abu_ .req x21 + Aga_ .req x3 + Age_ .req x8 + Agi_ .req x12 + Ago_ .req x17 + Agu_ .req x22 + Aka_ .req x4 + Ake_ .req x9 + Aki_ .req x13 + Ako_ .req x18 + Aku_ .req x23 + Ama_ .req x5 + Ame_ .req x10 + Ami_ .req x14 + Amo_ .req x19 + Amu_ .req x24 + Asa_ .req x1 + Ase_ .req x6 + Asi_ .req x15 + Aso_ .req x20 + Asu_ .req x25 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + C0 .req x0 + E0 .req x29 + C1 .req x26 + E1 .req x30 + C2 .req x27 + E2 .req x26 + C3 .req x28 + E3 .req x27 + C4 .req x29 + E4 .req x28 + + tmp .req x30 + +/************************ MACROS ****************************/ + +.macro load_input + ldr Aba, [input_addr, #(1*8*0)] + ldr Abe, [input_addr, #(1*8*1)] + ldr Abi, [input_addr, #(1*8*2)] + ldr Abo, [input_addr, #(1*8*3)] + ldr Abu, [input_addr, #(1*8*4)] + ldr Aga, [input_addr, #(1*8*5)] + ldr Age, [input_addr, #(1*8*6)] + ldr Agi, [input_addr, #(1*8*7)] + ldr Ago, [input_addr, #(1*8*8)] + ldr Agu, [input_addr, #(1*8*9)] + ldr Aka, [input_addr, #(1*8*10)] + ldr Ake, [input_addr, #(1*8*11)] + ldr Aki, [input_addr, #(1*8*12)] + ldr Ako, [input_addr, #(1*8*13)] + ldr Aku, [input_addr, #(1*8*14)] + ldr Ama, [input_addr, #(1*8*15)] + ldr Ame, [input_addr, #(1*8*16)] + ldr Ami, [input_addr, #(1*8*17)] + ldr Amo, [input_addr, #(1*8*18)] + ldr Amu, [input_addr, #(1*8*19)] + ldr Asa, [input_addr, #(1*8*20)] + ldr Ase, [input_addr, #(1*8*21)] + ldr Asi, [input_addr, #(1*8*22)] + ldr Aso, [input_addr, 
#(1*8*23)] + ldr Asu, [input_addr, #(1*8*24)] +.endm + +.macro store_input + str Aba, [input_addr, #(1*8*0)] + str Abe, [input_addr, #(1*8*1)] + str Abi, [input_addr, #(1*8*2)] + str Abo, [input_addr, #(1*8*3)] + str Abu, [input_addr, #(1*8*4)] + str Aga, [input_addr, #(1*8*5)] + str Age, [input_addr, #(1*8*6)] + str Agi, [input_addr, #(1*8*7)] + str Ago, [input_addr, #(1*8*8)] + str Agu, [input_addr, #(1*8*9)] + str Aka, [input_addr, #(1*8*10)] + str Ake, [input_addr, #(1*8*11)] + str Aki, [input_addr, #(1*8*12)] + str Ako, [input_addr, #(1*8*13)] + str Aku, [input_addr, #(1*8*14)] + str Ama, [input_addr, #(1*8*15)] + str Ame, [input_addr, #(1*8*16)] + str Ami, [input_addr, #(1*8*17)] + str Amo, [input_addr, #(1*8*18)] + str Amu, [input_addr, #(1*8*19)] + str Asa, [input_addr, #(1*8*20)] + str Ase, [input_addr, #(1*8*21)] + str Asi, [input_addr, #(1*8*22)] + str Aso, [input_addr, #(1*8*23)] + str Asu, [input_addr, #(1*8*24)] +.endm + +#define STACK_SIZE (16*6 + 3*8 + 8) // GPRs (16*6), count (8), const (8), input (8), padding (8) +#define STACK_BASE_GPRS (3*8+8) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro save reg, offset + str \reg, [sp, #\offset] +.endm + +.macro restore reg, offset + ldr \reg, [sp, #\offset] +.endm + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + 
ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +/* Keccak-f1600 round */ + +.macro keccak_f1600_round + save count, STACK_OFFSET_COUNT + +eor C0, Aba, Aga +eor C0, C0, Aka +eor C0, C0, Ama +eor C0, C0, Asa +eor C1, Abe, Age +eor C1, C1, Ake +eor C1, C1, Ame +eor C1, C1, Ase +eor C2, Abi, Agi +eor C2, C2, Aki +eor C2, C2, Ami +eor C2, C2, Asi +eor C3, Abo, Ago +eor C3, C3, Ako +eor C3, C3, Amo +eor C3, C3, Aso +eor C4, Abu, Agu +eor C4, C4, Aku +eor C4, C4, Amu +eor C4, C4, Asu + + +eor E1, C0, C2, ROR #63 +eor E3, C2, C4, ROR #63 +eor E0, C4, C1, ROR #63 +eor E2, C1, C3, ROR #63 +eor E4, C3, C0, ROR #63 + +eor Aba_, Aba, E0 +eor Asa_, Abi, E2 +ror Asa_, Asa_, #2 +eor Abi_, Aki, E2 +ror Abi_, Abi_, #21 +eor Aki_, Ako, E3 +ror Aki_, Aki_, #39 +eor Ako_, Amu, E4 +ror Ako_, Ako_, #56 +eor Amu_, Aso, E3 +ror Amu_, Amu_, #8 +eor Aso_, Ama, E0 +ror Aso_, Aso_, #23 +eor Aka_, Abe, E1 +ror Aka_, Aka_, #63 +eor Ase_, Ago, E3 +ror Ase_, Ase_, #9 +eor Ago_, Ame, E1 +ror Ago_, Ago_, #19 +eor Ake_, Agi, E2 +ror Ake_, Ake_, #58 +eor Agi_, Aka, E0 +ror Agi_, Agi_, #61 +eor Aga_, Abo, E3 +ror Aga_, Aga_, #36 +eor Abo_, Amo, E3 +ror Abo_, Abo_, #43 +eor Amo_, Ami, E2 +ror Amo_, Amo_, #49 +eor Ami_, Ake, E1 +ror Ami_, Ami_, #54 +eor Age_, Agu, E4 +ror Age_, Age_, #44 +eor Agu_, Asi, E2 +ror Agu_, Agu_, #3 +eor Asi_, Aku, E4 +ror Asi_, Asi_, #25 +eor Aku_, Asa, E0 +ror Aku_, Aku_, #46 +eor Ama_, Abu, E4 +ror Ama_, Ama_, #37 +eor Abu_, Asu, E4 +ror Abu_, Abu_, #50 +eor Asu_, Ase, E1 +ror Asu_, Asu_, #62 +eor Ame_, Aga, E0 +ror Ame_, Ame_, #28 + +eor Abe_, Age, E1 +ror Abe_, Abe_, #20 + +// xi step +// Row 1 +bic tmp, Agi_, Age_ +eor Aga, tmp, Aga_ +bic tmp, Ago_, Agi_ +eor Age, tmp, Age_ +bic tmp, Agu_, Ago_ +eor Agi, tmp, Agi_ +bic tmp, Aga_, Agu_ +eor Ago, tmp, Ago_ +bic tmp, Age_, Aga_ +eor Agu, tmp, Agu_ +// Row 2 +bic tmp, Aki_, Ake_ +eor Aka, tmp, Aka_ +bic tmp, Ako_, Aki_ +eor Ake, tmp, Ake_ +bic tmp, Aku_, Ako_ +eor Aki, tmp, Aki_ +bic tmp, Aka_, Aku_ +eor Ako, tmp, 
Ako_ +bic tmp, Ake_, Aka_ +eor Aku, tmp, Aku_ +// Row 3 +bic tmp, Ami_, Ame_ +eor Ama, tmp, Ama_ +bic tmp, Amo_, Ami_ +eor Ame, tmp, Ame_ +bic tmp, Amu_, Amo_ +eor Ami, tmp, Ami_ +bic tmp, Ama_, Amu_ +eor Amo, tmp, Amo_ +bic tmp, Ame_, Ama_ +eor Amu, tmp, Amu_ +// Row 4 +bic tmp, Asi_, Ase_ +eor Asa, tmp, Asa_ +bic tmp, Aso_, Asi_ +eor Ase, tmp, Ase_ +bic tmp, Asu_, Aso_ +eor Asi, tmp, Asi_ +bic tmp, Asa_, Asu_ +eor Aso, tmp, Aso_ +bic tmp, Ase_, Asa_ +eor Asu, tmp, Asu_ +// Row 0 +bic tmp, Abi_, Abe_ +eor Aba, tmp, Aba_ +bic tmp, Abo_, Abi_ +eor Abe, tmp, Abe_ +bic tmp, Abu_, Abo_ +eor Abi, tmp, Abi_ +bic tmp, Aba_, Abu_ +eor Abo, tmp, Abo_ +bic tmp, Abe_, Aba_ +eor Abu, tmp, Abu_ + + restore const_addr, STACK_OFFSET_CONST + ldr cur_const, [const_addr], #8 + eor Aba, Aba, cur_const + save const_addr, STACK_OFFSET_CONST + + restore count, STACK_OFFSET_COUNT +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.text +.balign 16 +.global keccak_f1600_x1_scalar_asm_v1 +.global _keccak_f1600_x1_scalar_asm_v1 + +keccak_f1600_x1_scalar_asm_v1: +_keccak_f1600_x1_scalar_asm_v1: + alloc_stack + save_gprs + load_constant_ptr + save const_addr, STACK_OFFSET_CONST + load_input + save input_addr, STACK_OFFSET_INPUT + + mov count, #0 +loop: + keccak_f1600_round + add count, count, #1 + cmp count, #(KECCAK_F1600_ROUNDS-1) + ble loop + + restore input_addr, STACK_OFFSET_INPUT + store_input + restore_gprs + free_stack + ret diff --git a/asm/manual/keccak_f1600/keccak_f1600_x1_scalar_asm_v2.s b/asm/manual/keccak_f1600/keccak_f1600_x1_scalar_asm_v2.s new file mode 100644 index 0000000..68f2c71 --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x1_scalar_asm_v2.s @@ -0,0 +1,505 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software 
without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .balign 64 +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x29 + count .req w27 + cur_const .req x26 + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each
round. */ + Aba .req x1 + Abe .req x6 + Abi .req x11 + Abo .req x16 + Abu .req x21 + Aga .req x2 + Age .req x7 + Agi .req x12 + Ago .req x17 + Agu .req x22 + Aka .req x3 + Ake .req x8 + Aki .req x13 + Ako .req x18 + Aku .req x23 + Ama .req x4 + Ame .req x9 + Ami .req x14 + Amo .req x19 + Amu .req x24 + Asa .req x5 + Ase .req x10 + Asi .req x15 + Aso .req x20 + Asu .req x25 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + Aba_ .req x0 + Abe_ .req x28 + Abi_ .req x11 + Abo_ .req x16 + Abu_ .req x21 + Aga_ .req x3 + Age_ .req x8 + Agi_ .req x12 + Ago_ .req x17 + Agu_ .req x22 + Aka_ .req x4 + Ake_ .req x9 + Aki_ .req x13 + Ako_ .req x18 + Aku_ .req x23 + Ama_ .req x5 + Ame_ .req x10 + Ami_ .req x14 + Amo_ .req x19 + Amu_ .req x24 + Asa_ .req x1 + Ase_ .req x6 + Asi_ .req x15 + Aso_ .req x20 + Asu_ .req x25 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + C0 .req x0 + E0 .req x29 + C1 .req x26 + E1 .req x30 + C2 .req x27 + E2 .req x26 + C3 .req x28 + E3 .req x27 + C4 .req x29 + E4 .req x28 + + tmp .req x30 + +/************************ MACROS ****************************/ + +.macro load_input + ldp Aba, Abe, [input_addr, #(1*8*0)] + ldp Abi, Abo, [input_addr, #(1*8*2)] + ldp Abu, Aga, [input_addr, #(1*8*4)] + ldp Age, Agi, [input_addr, #(1*8*6)] + ldp Ago, Agu, [input_addr, #(1*8*8)] + ldp Aka, Ake, [input_addr, #(1*8*10)] + ldp Aki, Ako, [input_addr, #(1*8*12)] + ldp Aku, Ama, [input_addr, #(1*8*14)] + ldp Ame, Ami, [input_addr, #(1*8*16)] + ldp Amo, Amu, [input_addr, #(1*8*18)] + ldp Asa, Ase, [input_addr, #(1*8*20)] + ldp Asi, Aso, [input_addr, #(1*8*22)] + ldr Asu, [input_addr, #(1*8*24)] +.endm + +.macro store_input + stp Aba, Abe, [input_addr, #(1*8*0)] + stp Abi, Abo, [input_addr, #(1*8*2)] + stp Abu, Aga, [input_addr, #(1*8*4)] + stp Age, Agi, [input_addr, #(1*8*6)] + stp Ago, Agu, [input_addr, #(1*8*8)] + stp Aka, Ake, [input_addr, #(1*8*10)] + stp Aki, Ako, [input_addr, 
#(1*8*12)] + stp Aku, Ama, [input_addr, #(1*8*14)] + stp Ame, Ami, [input_addr, #(1*8*16)] + stp Amo, Amu, [input_addr, #(1*8*18)] + stp Asa, Ase, [input_addr, #(1*8*20)] + stp Asi, Aso, [input_addr, #(1*8*22)] + str Asu, [input_addr, #(1*8*24)] +.endm + +#define STACK_SIZE (16*6 + 3*8 + 8) // GPRs (16*6), count (8), const (8), input (8), padding (8) +#define STACK_BASE_GPRS (3*8+8) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro save reg, offset + str \reg, [sp, #\offset] +.endm + +.macro restore reg, offset + ldr \reg, [sp, #\offset] +.endm + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro keccak_f1600_round_initial + + eor C0, Ama, Asa + eor C1, Ame, Ase + eor C2, Ami, Asi + eor C3, Amo, Aso + eor C4, Amu, Asu + eor C0, Aka, C0 + eor C1, Ake, C1 + eor C2, Aki, C2 + eor C3, Ako, C3 + eor C4, Aku, C4 + eor C0, Aga, C0 + eor C1, Age, C1 + eor C2, Agi, C2 + eor C3, Ago, C3 + eor C4, Agu, C4 + eor C0, Aba, C0 + eor C1, Abe, C1 + eor C2, Abi, C2 + eor C3, Abo, C3 + eor C4, Abu, C4 + + eor E1, C0, C2, ROR #63 + eor E3, C2, C4, ROR #63 + eor E0, C4, C1, ROR #63 + eor E2, C1, C3, ROR #63 + eor E4, C3, C0, ROR #63 + + eor Aba_, Aba, E0 + eor Asa_, Abi, E2 + eor Abi_, Aki, E2 + eor Aki_, Ako, E3 + eor Ako_, Amu, E4 + 
eor Amu_, Aso, E3 + eor Aso_, Ama, E0 + eor Aka_, Abe, E1 + eor Ase_, Ago, E3 + eor Ago_, Ame, E1 + eor Ake_, Agi, E2 + eor Agi_, Aka, E0 + eor Aga_, Abo, E3 + eor Abo_, Amo, E3 + eor Amo_, Ami, E2 + eor Ami_, Ake, E1 + eor Age_, Agu, E4 + eor Agu_, Asi, E2 + eor Asi_, Aku, E4 + eor Aku_, Asa, E0 + eor Ama_, Abu, E4 + eor Abu_, Asu, E4 + eor Asu_, Ase, E1 + eor Ame_, Aga, E0 + eor Abe_, Age, E1 + + load_constant_ptr + + bic tmp, Agi_, Age_, ROR #47 + eor Aga, tmp, Aga_, ROR #39 + bic tmp, Ago_, Agi_, ROR #42 + eor Age, tmp, Age_, ROR #25 + bic tmp, Agu_, Ago_, ROR #16 + eor Agi, tmp, Agi_, ROR #58 + bic tmp, Aga_, Agu_, ROR #31 + eor Ago, tmp, Ago_, ROR #47 + bic tmp, Age_, Aga_, ROR #56 + eor Agu, tmp, Agu_, ROR #23 + bic tmp, Aki_, Ake_, ROR #19 + eor Aka, tmp, Aka_, ROR #24 + bic tmp, Ako_, Aki_, ROR #47 + eor Ake, tmp, Ake_, ROR #2 + bic tmp, Aku_, Ako_, ROR #10 + eor Aki, tmp, Aki_, ROR #57 + bic tmp, Aka_, Aku_, ROR #47 + eor Ako, tmp, Ako_, ROR #57 + bic tmp, Ake_, Aka_, ROR #5 + eor Aku, tmp, Aku_, ROR #52 + bic tmp, Ami_, Ame_, ROR #38 + eor Ama, tmp, Ama_, ROR #47 + bic tmp, Amo_, Ami_, ROR #5 + eor Ame, tmp, Ame_, ROR #43 + bic tmp, Amu_, Amo_, ROR #41 + eor Ami, tmp, Ami_, ROR #46 + + ldr cur_const, [const_addr] + mov count, #1 + + bic tmp, Ama_, Amu_, ROR #35 + eor Amo, tmp, Amo_, ROR #12 + bic tmp, Ame_, Ama_, ROR #9 + eor Amu, tmp, Amu_, ROR #44 + bic tmp, Asi_, Ase_, ROR #48 + eor Asa, tmp, Asa_, ROR #41 + bic tmp, Aso_, Asi_, ROR #2 + eor Ase, tmp, Ase_, ROR #50 + bic tmp, Asu_, Aso_, ROR #25 + eor Asi, tmp, Asi_, ROR #27 + bic tmp, Asa_, Asu_, ROR #60 + eor Aso, tmp, Aso_, ROR #21 + bic tmp, Ase_, Asa_, ROR #57 + eor Asu, tmp, Asu_, ROR #53 + bic tmp, Abi_, Abe_, ROR #63 + eor Aba, Aba_, tmp, ROR #21 + bic tmp, Abo_, Abi_, ROR #42 + eor Abe, tmp, Abe_, ROR #41 + bic tmp, Abu_, Abo_, ROR #57 + eor Abi, tmp, Abi_, ROR #35 + bic tmp, Aba_, Abu_, ROR #50 + eor Abo, tmp, Abo_, ROR #43 + bic tmp, Abe_, Aba_, ROR #44 + eor Abu, tmp, Abu_, ROR #30 + + eor 
Aba, Aba, cur_const + +.endm + + +.macro keccak_f1600_round_noninitial + + save count, STACK_OFFSET_COUNT + + eor C0, Aka, Asa, ROR #50 + eor C1, Ase, Age, ROR #60 + eor C2, Ami, Agi, ROR #59 + eor C3, Ago, Aso, ROR #30 + eor C4, Abu, Asu, ROR #53 + eor C0, Ama, C0, ROR #49 + eor C1, Abe, C1, ROR #44 + eor C2, Aki, C2, ROR #26 + eor C3, Amo, C3, ROR #63 + eor C4, Amu, C4, ROR #56 + eor C0, Aga, C0, ROR #57 + eor C1, Ame, C1, ROR #58 + eor C2, Abi, C2, ROR #60 + eor C3, Ako, C3, ROR #38 + eor C4, Agu, C4, ROR #48 + eor C0, Aba, C0, ROR #61 + eor C1, Ake, C1, ROR #57 + eor C2, Asi, C2, ROR #52 + eor C3, Abo, C3, ROR #63 + eor C4, Aku, C4, ROR #50 + ror C1, C1, 56 + ror C4, C4, 58 + ror C2, C2, 62 + + eor E1, C0, C2, ROR #63 + eor E3, C2, C4, ROR #63 + eor E0, C4, C1, ROR #63 + eor E2, C1, C3, ROR #63 + eor E4, C3, C0, ROR #63 + + eor Aba_, E0, Aba + eor Asa_, E2, Abi, ROR #50 + eor Abi_, E2, Aki, ROR #46 + eor Aki_, E3, Ako, ROR #63 + eor Ako_, E4, Amu, ROR #28 + eor Amu_, E3, Aso, ROR #2 + eor Aso_, E0, Ama, ROR #54 + eor Aka_, E1, Abe, ROR #43 + eor Ase_, E3, Ago, ROR #36 + eor Ago_, E1, Ame, ROR #49 + eor Ake_, E2, Agi, ROR #3 + eor Agi_, E0, Aka, ROR #39 + eor Aga_, E3, Abo + eor Abo_, E3, Amo, ROR #37 + eor Amo_, E2, Ami, ROR #8 + eor Ami_, E1, Ake, ROR #56 + eor Age_, E4, Agu, ROR #44 + eor Agu_, E2, Asi, ROR #62 + eor Asi_, E4, Aku, ROR #58 + eor Aku_, E0, Asa, ROR #25 + eor Ama_, E4, Abu, ROR #20 + eor Abu_, E4, Asu, ROR #9 + eor Asu_, E1, Ase, ROR #23 + eor Ame_, E0, Aga, ROR #61 + eor Abe_, E1, Age, ROR #19 + + load_constant_ptr + restore count, STACK_OFFSET_COUNT + + bic tmp, Agi_, Age_, ROR #47 + eor Aga, tmp, Aga_, ROR #39 + bic tmp, Ago_, Agi_, ROR #42 + eor Age, tmp, Age_, ROR #25 + bic tmp, Agu_, Ago_, ROR #16 + eor Agi, tmp, Agi_, ROR #58 + bic tmp, Aga_, Agu_, ROR #31 + eor Ago, tmp, Ago_, ROR #47 + bic tmp, Age_, Aga_, ROR #56 + eor Agu, tmp, Agu_, ROR #23 + bic tmp, Aki_, Ake_, ROR #19 + eor Aka, tmp, Aka_, ROR #24 + bic tmp, Ako_, Aki_, ROR #47 + 
eor Ake, tmp, Ake_, ROR #2 + bic tmp, Aku_, Ako_, ROR #10 + eor Aki, tmp, Aki_, ROR #57 + bic tmp, Aka_, Aku_, ROR #47 + eor Ako, tmp, Ako_, ROR #57 + bic tmp, Ake_, Aka_, ROR #5 + eor Aku, tmp, Aku_, ROR #52 + bic tmp, Ami_, Ame_, ROR #38 + eor Ama, tmp, Ama_, ROR #47 + bic tmp, Amo_, Ami_, ROR #5 + eor Ame, tmp, Ame_, ROR #43 + bic tmp, Amu_, Amo_, ROR #41 + eor Ami, tmp, Ami_, ROR #46 + bic tmp, Ama_, Amu_, ROR #35 + + ldr cur_const, [const_addr, count, UXTW #3] + add count, count, #1 + + eor Amo, tmp, Amo_, ROR #12 + bic tmp, Ame_, Ama_, ROR #9 + eor Amu, tmp, Amu_, ROR #44 + bic tmp, Asi_, Ase_, ROR #48 + eor Asa, tmp, Asa_, ROR #41 + bic tmp, Aso_, Asi_, ROR #2 + eor Ase, tmp, Ase_, ROR #50 + bic tmp, Asu_, Aso_, ROR #25 + eor Asi, tmp, Asi_, ROR #27 + bic tmp, Asa_, Asu_, ROR #60 + eor Aso, tmp, Aso_, ROR #21 + bic tmp, Ase_, Asa_, ROR #57 + eor Asu, tmp, Asu_, ROR #53 + bic tmp, Abi_, Abe_, ROR #63 + eor Aba, Aba_, tmp, ROR #21 + bic tmp, Abo_, Abi_, ROR #42 + eor Abe, tmp, Abe_, ROR #41 + bic tmp, Abu_, Abo_, ROR #57 + eor Abi, tmp, Abi_, ROR #35 + bic tmp, Aba_, Abu_, ROR #50 + eor Abo, tmp, Abo_, ROR #43 + bic tmp, Abe_, Aba_, ROR #44 + eor Abu, tmp, Abu_, ROR #30 + + eor Aba, Aba, cur_const + +.endm + +.macro final_rotate + ror Aga, Aga,#(64-3) + ror Aka, Aka,#(64-25) + ror Ama, Ama,#(64-10) + ror Asa, Asa,#(64-39) + ror Abe, Abe,#(64-21) + ror Age, Age,#(64-45) + ror Ake, Ake,#(64-8) + ror Ame, Ame,#(64-15) + ror Ase, Ase,#(64-41) + ror Abi, Abi,#(64-14) + ror Agi, Agi,#(64-61) + ror Aki, Aki,#(64-18) + ror Ami, Ami,#(64-56) + ror Asi, Asi,#(64-2) + ror Ago, Ago,#(64-28) + ror Ako, Ako,#(64-1) + ror Amo, Amo,#(64-27) + ror Aso, Aso,#(64-62) + ror Abu, Abu,#(64-44) + ror Agu, Agu,#(64-20) + ror Aku, Aku,#(64-6) + ror Amu, Amu,#(64-36) + ror Asu, Asu,#(64-55) +.endm + + +#define KECCAK_F1600_ROUNDS 24 + +.text +.balign 16 +.global keccak_f1600_x1_scalar_asm_v2 +.global _keccak_f1600_x1_scalar_asm_v2 + +keccak_f1600_x1_scalar_asm_v2: 
+_keccak_f1600_x1_scalar_asm_v2: + alloc_stack + save_gprs + load_input + save input_addr, STACK_OFFSET_INPUT + + keccak_f1600_round_initial +loop: + keccak_f1600_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-1) + ble loop + + final_rotate + + restore input_addr, STACK_OFFSET_INPUT + store_input + restore_gprs + free_stack + ret diff --git a/asm/manual/keccak_f1600/keccak_f1600_x1_scalar_asm_v3.s b/asm/manual/keccak_f1600/keccak_f1600_x1_scalar_asm_v3.s new file mode 100644 index 0000000..9a5d04b --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x1_scalar_asm_v3.s @@ -0,0 +1,494 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .balign 64 +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x29 + count .req w27 + cur_const .req x26 + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round. 
*/ + Aba .req x1 + Abe .req x6 + Abi .req x11 + Abo .req x16 + Abu .req x21 + Aga .req x2 + Age .req x7 + Agi .req x12 + Ago .req x17 + Agu .req x22 + Aka .req x3 + Ake .req x8 + Aki .req x13 + Ako .req x18 + Aku .req x23 + Ama .req x4 + Ame .req x9 + Ami .req x14 + Amo .req x19 + Amu .req x24 + Asa .req x5 + Ase .req x10 + Asi .req x15 + Aso .req x20 + Asu .req x25 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + Aba_ .req x30 + Abe_ .req x28 + Abi_ .req x11 + Abo_ .req x16 + Abu_ .req x21 + Aga_ .req x3 + Age_ .req x8 + Agi_ .req x12 + Ago_ .req x17 + Agu_ .req x22 + Aka_ .req x4 + Ake_ .req x9 + Aki_ .req x13 + Ako_ .req x18 + Aku_ .req x23 + Ama_ .req x5 + Ame_ .req x10 + Ami_ .req x14 + Amo_ .req x19 + Amu_ .req x24 + Asa_ .req x1 + Ase_ .req x6 + Asi_ .req x15 + Aso_ .req x20 + Asu_ .req x25 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + C0 .req x30 + E0 .req x29 + C1 .req x26 + E1 .req x0 + C2 .req x27 + E2 .req x26 + C3 .req x28 + E3 .req x27 + C4 .req x29 + E4 .req x28 + + tmp .req x0 + +/************************ MACROS ****************************/ + +#define STACK_SIZE (16*6 + 3*8 + 8) // GPRs (16*6), count (8), const (8), input (8), padding (8) +#define STACK_BASE_GPRS (3*8+8) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro save reg, offset + str \reg, [sp, #\offset] +.endm + +.macro restore reg, offset + ldr \reg, [sp, #\offset] +.endm + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, 
#(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro keccak_f1600_round_initial + ldp Aku, Ama, [input_addr, #(1*8*14)] + ldp Asa, Ase, [input_addr, #(1*8*20)] + eor C0, Ama, Asa + ldp Ame, Ami, [input_addr, #(1*8*16)] + eor C1, Ame, Ase + ldp Asi, Aso, [input_addr, #(1*8*22)] + eor C2, Ami, Asi + ldp Amo, Amu, [input_addr, #(1*8*18)] + eor C3, Amo, Aso + ldr Asu, [input_addr, #(1*8*24)] + eor C4, Amu, Asu + ldp Aka, Ake, [input_addr, #(1*8*10)] + eor C0, Aka, C0 + eor C1, Ake, C1 + ldp Aki, Ako, [input_addr, #(1*8*12)] + eor C2, Aki, C2 + ldp Abu, Aga, [input_addr, #(1*8*4)] + eor C3, Ako, C3 + eor C4, Aku, C4 + ldp Age, Agi, [input_addr, #(1*8*6)] + eor C0, Aga, C0 + ldp Ago, Agu, [input_addr, #(1*8*8)] + eor C1, Age, C1 + ldp Aba, Abe, [input_addr, #(1*8*0)] + eor C2, Agi, C2 + ldp Abi, Abo, [input_addr, #(1*8*2)] + eor C3, Ago, C3 + save input_addr, STACK_OFFSET_INPUT + eor C4, Agu, C4 + eor C0, Aba, C0 + eor C1, Abe, C1 + eor C2, Abi, C2 + eor C3, Abo, C3 + eor C4, Abu, C4 + + eor E1, C0, C2, ROR #63 + eor E3, C2, C4, ROR #63 + eor E0, C4, C1, ROR #63 + eor E2, C1, C3, ROR #63 + eor E4, C3, C0, ROR #63 + + eor Aba_, Aba, E0 + eor Asa_, Abi, E2 + eor Abi_, Aki, E2 + eor Aki_, Ako, E3 + eor Ako_, Amu, E4 + eor Amu_, Aso, E3 + eor Aso_, Ama, E0 + eor Aka_, Abe, E1 + eor Ase_, Ago, E3 + eor Ago_, Ame, E1 + eor Ake_, Agi, E2 + eor Agi_, Aka, E0 + eor Aga_, Abo, E3 + eor Abo_, Amo, E3 + eor Amo_, Ami, E2 + eor Ami_, Ake, E1 + eor Age_, Agu, E4 + eor Agu_, Asi, E2 + eor Asi_, Aku, E4 + eor Aku_, Asa, E0 + eor Ama_, Abu, E4 + eor Abu_, Asu, E4 + eor Asu_, Ase, E1 + eor Ame_, Aga, E0 + eor Abe_, Age, E1 + + load_constant_ptr + + bic tmp, Agi_, Age_, ROR #47 + eor Aga, tmp, Aga_, ROR #39 + bic tmp, Ago_, Agi_, ROR #42 + eor Age, tmp, Age_, ROR 
#25 + bic tmp, Agu_, Ago_, ROR #16 + eor Agi, tmp, Agi_, ROR #58 + bic tmp, Aga_, Agu_, ROR #31 + eor Ago, tmp, Ago_, ROR #47 + bic tmp, Age_, Aga_, ROR #56 + eor Agu, tmp, Agu_, ROR #23 + bic tmp, Aki_, Ake_, ROR #19 + eor Aka, tmp, Aka_, ROR #24 + bic tmp, Ako_, Aki_, ROR #47 + eor Ake, tmp, Ake_, ROR #2 + bic tmp, Aku_, Ako_, ROR #10 + eor Aki, tmp, Aki_, ROR #57 + bic tmp, Aka_, Aku_, ROR #47 + eor Ako, tmp, Ako_, ROR #57 + bic tmp, Ake_, Aka_, ROR #5 + eor Aku, tmp, Aku_, ROR #52 + bic tmp, Ami_, Ame_, ROR #38 + eor Ama, tmp, Ama_, ROR #47 + bic tmp, Amo_, Ami_, ROR #5 + eor Ame, tmp, Ame_, ROR #43 + bic tmp, Amu_, Amo_, ROR #41 + eor Ami, tmp, Ami_, ROR #46 + + ldr cur_const, [const_addr] + mov count, #1 + + bic tmp, Ama_, Amu_, ROR #35 + eor Amo, tmp, Amo_, ROR #12 + bic tmp, Ame_, Ama_, ROR #9 + eor Amu, tmp, Amu_, ROR #44 + bic tmp, Asi_, Ase_, ROR #48 + eor Asa, tmp, Asa_, ROR #41 + bic tmp, Aso_, Asi_, ROR #2 + eor Ase, tmp, Ase_, ROR #50 + bic tmp, Asu_, Aso_, ROR #25 + eor Asi, tmp, Asi_, ROR #27 + bic tmp, Asa_, Asu_, ROR #60 + eor Aso, tmp, Aso_, ROR #21 + bic tmp, Ase_, Asa_, ROR #57 + eor Asu, tmp, Asu_, ROR #53 + bic tmp, Abi_, Abe_, ROR #63 + eor Aba, Aba_, tmp, ROR #21 + bic tmp, Abo_, Abi_, ROR #42 + eor Abe, tmp, Abe_, ROR #41 + bic tmp, Abu_, Abo_, ROR #57 + eor Abi, tmp, Abi_, ROR #35 + bic tmp, Aba_, Abu_, ROR #50 + eor Abo, tmp, Abo_, ROR #43 + bic tmp, Abe_, Aba_, ROR #44 + eor Abu, tmp, Abu_, ROR #30 + + eor Aba, Aba, cur_const + +.endm + + +.macro keccak_f1600_round_noninitial + + save count, STACK_OFFSET_COUNT + + eor C0, Aka, Asa, ROR #50 + eor C1, Ase, Age, ROR #60 + eor C2, Ami, Agi, ROR #59 + eor C3, Ago, Aso, ROR #30 + eor C4, Abu, Asu, ROR #53 + eor C0, Ama, C0, ROR #49 + eor C1, Abe, C1, ROR #44 + eor C2, Aki, C2, ROR #26 + eor C3, Amo, C3, ROR #63 + eor C4, Amu, C4, ROR #56 + eor C0, Aga, C0, ROR #57 + eor C1, Ame, C1, ROR #58 + eor C2, Abi, C2, ROR #60 + eor C3, Ako, C3, ROR #38 + eor C4, Agu, C4, ROR #48 + eor C0, Aba, C0, 
ROR #61 + eor C1, Ake, C1, ROR #57 + eor C2, Asi, C2, ROR #52 + eor C3, Abo, C3, ROR #63 + eor C4, Aku, C4, ROR #50 + ror C1, C1, 56 + ror C4, C4, 58 + ror C2, C2, 62 + + eor E1, C0, C2, ROR #63 + eor E3, C2, C4, ROR #63 + eor E0, C4, C1, ROR #63 + eor E2, C1, C3, ROR #63 + eor E4, C3, C0, ROR #63 + + eor Aba_, E0, Aba + eor Asa_, E2, Abi, ROR #50 + eor Abi_, E2, Aki, ROR #46 + eor Aki_, E3, Ako, ROR #63 + eor Ako_, E4, Amu, ROR #28 + eor Amu_, E3, Aso, ROR #2 + eor Aso_, E0, Ama, ROR #54 + eor Aka_, E1, Abe, ROR #43 + eor Ase_, E3, Ago, ROR #36 + eor Ago_, E1, Ame, ROR #49 + eor Ake_, E2, Agi, ROR #3 + eor Agi_, E0, Aka, ROR #39 + eor Aga_, E3, Abo + eor Abo_, E3, Amo, ROR #37 + eor Amo_, E2, Ami, ROR #8 + eor Ami_, E1, Ake, ROR #56 + eor Age_, E4, Agu, ROR #44 + eor Agu_, E2, Asi, ROR #62 + eor Asi_, E4, Aku, ROR #58 + eor Aku_, E0, Asa, ROR #25 + eor Ama_, E4, Abu, ROR #20 + eor Abu_, E4, Asu, ROR #9 + eor Asu_, E1, Ase, ROR #23 + eor Ame_, E0, Aga, ROR #61 + eor Abe_, E1, Age, ROR #19 + + load_constant_ptr + restore count, STACK_OFFSET_COUNT + + bic tmp, Agi_, Age_, ROR #47 + eor Aga, tmp, Aga_, ROR #39 + bic tmp, Ago_, Agi_, ROR #42 + eor Age, tmp, Age_, ROR #25 + bic tmp, Agu_, Ago_, ROR #16 + eor Agi, tmp, Agi_, ROR #58 + bic tmp, Aga_, Agu_, ROR #31 + eor Ago, tmp, Ago_, ROR #47 + bic tmp, Age_, Aga_, ROR #56 + eor Agu, tmp, Agu_, ROR #23 + bic tmp, Aki_, Ake_, ROR #19 + eor Aka, tmp, Aka_, ROR #24 + bic tmp, Ako_, Aki_, ROR #47 + eor Ake, tmp, Ake_, ROR #2 + bic tmp, Aku_, Ako_, ROR #10 + eor Aki, tmp, Aki_, ROR #57 + bic tmp, Aka_, Aku_, ROR #47 + eor Ako, tmp, Ako_, ROR #57 + bic tmp, Ake_, Aka_, ROR #5 + eor Aku, tmp, Aku_, ROR #52 + bic tmp, Ami_, Ame_, ROR #38 + eor Ama, tmp, Ama_, ROR #47 + bic tmp, Amo_, Ami_, ROR #5 + eor Ame, tmp, Ame_, ROR #43 + bic tmp, Amu_, Amo_, ROR #41 + eor Ami, tmp, Ami_, ROR #46 + bic tmp, Ama_, Amu_, ROR #35 + + ldr cur_const, [const_addr, count, UXTW #3] + add count, count, #1 + + eor Amo, tmp, Amo_, ROR #12 + bic tmp, 
Ame_, Ama_, ROR #9 + eor Amu, tmp, Amu_, ROR #44 + bic tmp, Asi_, Ase_, ROR #48 + eor Asa, tmp, Asa_, ROR #41 + bic tmp, Aso_, Asi_, ROR #2 + eor Ase, tmp, Ase_, ROR #50 + bic tmp, Asu_, Aso_, ROR #25 + eor Asi, tmp, Asi_, ROR #27 + bic tmp, Asa_, Asu_, ROR #60 + eor Aso, tmp, Aso_, ROR #21 + bic tmp, Ase_, Asa_, ROR #57 + eor Asu, tmp, Asu_, ROR #53 + bic tmp, Abi_, Abe_, ROR #63 + eor Aba, Aba_, tmp, ROR #21 + bic tmp, Abo_, Abi_, ROR #42 + eor Abe, tmp, Abe_, ROR #41 + bic tmp, Abu_, Abo_, ROR #57 + eor Abi, tmp, Abi_, ROR #35 + bic tmp, Aba_, Abu_, ROR #50 + eor Abo, tmp, Abo_, ROR #43 + bic tmp, Abe_, Aba_, ROR #44 + eor Abu, tmp, Abu_, ROR #30 + + eor Aba, Aba, cur_const + +.endm + +.macro final_rotate_store + ror Aga, Aga,#(64-3) + restore input_addr, STACK_OFFSET_INPUT + ror Abu, Abu,#(64-44) + ror Aka, Aka,#(64-25) + ror Ake, Ake,#(64-8) + stp Abu, Aga, [input_addr, #(1*8*4)] + ror Ama, Ama,#(64-10) + ror Aku, Aku,#(64-6) + stp Aka, Ake, [input_addr, #(1*8*10)] + ror Asa, Asa,#(64-39) + ror Ase, Ase,#(64-41) + stp Aku, Ama, [input_addr, #(1*8*14)] + ror Abe, Abe,#(64-21) + ror Age, Age,#(64-45) + stp Asa, Ase, [input_addr, #(1*8*20)] + ror Agi, Agi,#(64-61) + stp Aba, Abe, [input_addr, #(1*8*0)] + ror Ame, Ame,#(64-15) + ror Ami, Ami,#(64-56) + stp Age, Agi, [input_addr, #(1*8*6)] + ror Abi, Abi,#(64-14) + ror Aki, Aki,#(64-18) + stp Ame, Ami, [input_addr, #(1*8*16)] + ror Ako, Ako,#(64-1) + stp Abi, Abo, [input_addr, #(1*8*2)] + ror Asi, Asi,#(64-2) + ror Aso, Aso,#(64-62) + stp Aki, Ako, [input_addr, #(1*8*12)] + ror Ago, Ago,#(64-28) + ror Agu, Agu,#(64-20) + stp Asi, Aso, [input_addr, #(1*8*22)] + ror Amo, Amo,#(64-27) + ror Amu, Amu,#(64-36) + stp Ago, Agu, [input_addr, #(1*8*8)] + ror Asu, Asu,#(64-55) + stp Amo, Amu, [input_addr, #(1*8*18)] + str Asu, [input_addr, #(1*8*24)] +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.text +.balign 16 +.global keccak_f1600_x1_scalar_asm_v3 +.global _keccak_f1600_x1_scalar_asm_v3 + +keccak_f1600_x1_scalar_asm_v3: 
+_keccak_f1600_x1_scalar_asm_v3: + alloc_stack + save_gprs + + keccak_f1600_round_initial +loop: + keccak_f1600_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-1) + ble loop + + final_rotate_store + restore_gprs + free_stack + ret diff --git a/asm/manual/keccak_f1600/keccak_f1600_x1_scalar_asm_v4.s b/asm/manual/keccak_f1600/keccak_f1600_x1_scalar_asm_v4.s new file mode 100644 index 0000000..95f2275 --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x1_scalar_asm_v4.s @@ -0,0 +1,495 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .balign 64 +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x29 + count .req w27 + cur_const .req x26 + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round. 
*/ + Aba .req x1 + Abe .req x6 + Abi .req x11 + Abo .req x16 + Abu .req x21 + Aga .req x2 + Age .req x7 + Agi .req x12 + Ago .req x17 + Agu .req x22 + Aka .req x3 + Ake .req x8 + Aki .req x13 + Ako .req x18 + Aku .req x23 + Ama .req x4 + Ame .req x9 + Ami .req x14 + Amo .req x19 + Amu .req x24 + Asa .req x5 + Ase .req x10 + Asi .req x15 + Aso .req x20 + Asu .req x25 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + Aba_ .req x30 + Abe_ .req x28 + Abi_ .req x11 + Abo_ .req x16 + Abu_ .req x21 + Aga_ .req x3 + Age_ .req x8 + Agi_ .req x12 + Ago_ .req x17 + Agu_ .req x22 + Aka_ .req x4 + Ake_ .req x9 + Aki_ .req x13 + Ako_ .req x18 + Aku_ .req x23 + Ama_ .req x5 + Ame_ .req x10 + Ami_ .req x14 + Amo_ .req x19 + Amu_ .req x24 + Asa_ .req x1 + Ase_ .req x6 + Asi_ .req x15 + Aso_ .req x20 + Asu_ .req x25 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + C0 .req x30 + E0 .req x29 + C1 .req x26 + E1 .req x0 + C2 .req x27 + E2 .req x26 + C3 .req x28 + E3 .req x27 + C4 .req x29 + E4 .req x28 + + tmp .req x0 + tmp0 .req x0 + tmp1 .req x26 + +/************************ MACROS ****************************/ + +#define STACK_SIZE (16*6 + 3*8 + 8) // GPRs (16*6), count (8), const (8), input (8), padding (8) +#define STACK_BASE_GPRS (3*8+8) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro save reg, offset + str \reg, [sp, #\offset] +.endm + +.macro restore reg, offset + ldr \reg, [sp, #\offset] +.endm + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro 
restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro keccak_f1600_round_initial + ldp Aku, Ama, [input_addr, #(1*8*14)] + ldp Asa, Ase, [input_addr, #(1*8*20)] + eor C0, Ama, Asa + ldp Ame, Ami, [input_addr, #(1*8*16)] + eor C1, Ame, Ase + ldp Asi, Aso, [input_addr, #(1*8*22)] + eor C2, Ami, Asi + ldp Amo, Amu, [input_addr, #(1*8*18)] + eor C3, Amo, Aso + ldr Asu, [input_addr, #(1*8*24)] + eor C4, Amu, Asu + ldp Aka, Ake, [input_addr, #(1*8*10)] + eor C0, Aka, C0 + eor C1, Ake, C1 + ldp Aki, Ako, [input_addr, #(1*8*12)] + eor C2, Aki, C2 + ldp Abu, Aga, [input_addr, #(1*8*4)] + eor C3, Ako, C3 + eor C4, Aku, C4 + ldp Age, Agi, [input_addr, #(1*8*6)] + eor C0, Aga, C0 + ldp Ago, Agu, [input_addr, #(1*8*8)] + eor C1, Age, C1 + ldp Aba, Abe, [input_addr, #(1*8*0)] + eor C2, Agi, C2 + ldp Abi, Abo, [input_addr, #(1*8*2)] + eor C3, Ago, C3 + save input_addr, STACK_OFFSET_INPUT + eor C4, Agu, C4 + eor C0, Aba, C0 + eor C1, Abe, C1 + eor C2, Abi, C2 + eor C3, Abo, C3 + eor C4, Abu, C4 + + eor E1, C0, C2, ROR #63 + eor E3, C2, C4, ROR #63 + eor E0, C4, C1, ROR #63 + eor E2, C1, C3, ROR #63 + eor E4, C3, C0, ROR #63 + + eor Aba_, Aba, E0 + eor Asa_, Abi, E2 + eor Abi_, Aki, E2 + eor Aki_, Ako, E3 + eor Ako_, Amu, E4 + eor Amu_, Aso, E3 + eor Aso_, Ama, E0 + eor Aka_, Abe, E1 + eor Ase_, Ago, E3 + eor Ago_, Ame, E1 + eor Ake_, Agi, E2 + eor Agi_, Aka, E0 + eor Aga_, Abo, E3 + eor Abo_, Amo, E3 + eor Amo_, Ami, E2 + eor Ami_, Ake, E1 + eor Age_, Agu, E4 + eor Agu_, Asi, E2 + eor Asi_, Aku, E4 + eor Aku_, Asa, E0 + eor Ama_, Abu, E4 + eor Abu_, Asu, E4 + eor Asu_, Ase, E1 + eor Ame_, Aga, E0 + eor Abe_, Age, E1 + + load_constant_ptr + + bic tmp, Agi_, Age_, ROR #47 + eor Aga, tmp, Aga_, ROR #39 + bic tmp, Ago_, Agi_, 
ROR #42 + eor Age, tmp, Age_, ROR #25 + bic tmp, Agu_, Ago_, ROR #16 + eor Agi, tmp, Agi_, ROR #58 + bic tmp, Aga_, Agu_, ROR #31 + eor Ago, tmp, Ago_, ROR #47 + bic tmp, Age_, Aga_, ROR #56 + eor Agu, tmp, Agu_, ROR #23 + bic tmp, Aki_, Ake_, ROR #19 + eor Aka, tmp, Aka_, ROR #24 + bic tmp, Ako_, Aki_, ROR #47 + eor Ake, tmp, Ake_, ROR #2 + bic tmp, Aku_, Ako_, ROR #10 + eor Aki, tmp, Aki_, ROR #57 + bic tmp, Aka_, Aku_, ROR #47 + eor Ako, tmp, Ako_, ROR #57 + bic tmp, Ake_, Aka_, ROR #5 + eor Aku, tmp, Aku_, ROR #52 + bic tmp, Ami_, Ame_, ROR #38 + eor Ama, tmp, Ama_, ROR #47 + bic tmp, Amo_, Ami_, ROR #5 + eor Ame, tmp, Ame_, ROR #43 + bic tmp, Amu_, Amo_, ROR #41 + eor Ami, tmp, Ami_, ROR #46 + + ldr cur_const, [const_addr] + mov count, #1 + + bic tmp, Ama_, Amu_, ROR #35 + eor Amo, tmp, Amo_, ROR #12 + bic tmp, Ame_, Ama_, ROR #9 + eor Amu, tmp, Amu_, ROR #44 + bic tmp, Asi_, Ase_, ROR #48 + eor Asa, tmp, Asa_, ROR #41 + bic tmp, Aso_, Asi_, ROR #2 + eor Ase, tmp, Ase_, ROR #50 + bic tmp, Asu_, Aso_, ROR #25 + eor Asi, tmp, Asi_, ROR #27 + bic tmp, Asa_, Asu_, ROR #60 + eor Aso, tmp, Aso_, ROR #21 + bic tmp, Ase_, Asa_, ROR #57 + eor Asu, tmp, Asu_, ROR #53 + bic tmp, Abi_, Abe_, ROR #63 + eor Aba, Aba_, tmp, ROR #21 + bic tmp, Abo_, Abi_, ROR #42 + eor Abe, tmp, Abe_, ROR #41 + bic tmp, Abu_, Abo_, ROR #57 + eor Abi, tmp, Abi_, ROR #35 + bic tmp, Aba_, Abu_, ROR #50 + eor Abo, tmp, Abo_, ROR #43 + bic tmp, Abe_, Aba_, ROR #44 + eor Abu, tmp, Abu_, ROR #30 + + eor Aba, Aba, cur_const + +.endm + + +.macro keccak_f1600_round_noninitial + + save count, STACK_OFFSET_COUNT + + eor C0, Aka, Asa, ROR #50 + eor C1, Ase, Age, ROR #60 + eor C2, Ami, Agi, ROR #59 + eor C3, Ago, Aso, ROR #30 + eor C4, Abu, Asu, ROR #53 + eor C0, Ama, C0, ROR #49 + eor C1, Abe, C1, ROR #44 + eor C2, Aki, C2, ROR #26 + eor C3, Amo, C3, ROR #63 + eor C4, Amu, C4, ROR #56 + eor C0, Aga, C0, ROR #57 + eor C1, Ame, C1, ROR #58 + eor C2, Abi, C2, ROR #60 + eor C3, Ako, C3, ROR #38 + eor C4, Agu, 
C4, ROR #48 + eor C0, Aba, C0, ROR #61 + eor C1, Ake, C1, ROR #57 + eor C2, Asi, C2, ROR #52 + eor C3, Abo, C3, ROR #63 + eor C4, Aku, C4, ROR #50 + ror C1, C1, 56 + ror C4, C4, 58 + ror C2, C2, 62 + + eor E1, C0, C2, ROR #63 + eor E3, C2, C4, ROR #63 + eor E0, C4, C1, ROR #63 + eor E2, C1, C3, ROR #63 + eor E4, C3, C0, ROR #63 + + eor Aba_, E0, Aba + eor Asa_, E2, Abi, ROR #50 + eor Abi_, E2, Aki, ROR #46 + eor Aki_, E3, Ako, ROR #63 + eor Ako_, E4, Amu, ROR #28 + eor Amu_, E3, Aso, ROR #2 + eor Aso_, E0, Ama, ROR #54 + eor Aka_, E1, Abe, ROR #43 + eor Ase_, E3, Ago, ROR #36 + eor Ago_, E1, Ame, ROR #49 + eor Ake_, E2, Agi, ROR #3 + eor Agi_, E0, Aka, ROR #39 + eor Aga_, E3, Abo + eor Abo_, E3, Amo, ROR #37 + eor Amo_, E2, Ami, ROR #8 + eor Ami_, E1, Ake, ROR #56 + eor Age_, E4, Agu, ROR #44 + eor Agu_, E2, Asi, ROR #62 + eor Asi_, E4, Aku, ROR #58 + eor Aku_, E0, Asa, ROR #25 + eor Ama_, E4, Abu, ROR #20 + eor Abu_, E4, Asu, ROR #9 + eor Asu_, E1, Ase, ROR #23 + eor Ame_, E0, Aga, ROR #61 + eor Abe_, E1, Age, ROR #19 + + load_constant_ptr + restore count, STACK_OFFSET_COUNT + + bic tmp0, Agi_, Age_, ROR #47 + bic tmp1, Ago_, Agi_, ROR #42 + eor Aga, tmp0, Aga_, ROR #39 + bic tmp0, Agu_, Ago_, ROR #16 + eor Age, tmp1, Age_, ROR #25 + bic tmp1, Aga_, Agu_, ROR #31 + eor Agi, tmp0, Agi_, ROR #58 + bic tmp0, Age_, Aga_, ROR #56 + eor Ago, tmp1, Ago_, ROR #47 + bic tmp1, Aki_, Ake_, ROR #19 + eor Agu, tmp0, Agu_, ROR #23 + bic tmp0, Ako_, Aki_, ROR #47 + eor Aka, tmp1, Aka_, ROR #24 + bic tmp1, Aku_, Ako_, ROR #10 + eor Ake, tmp0, Ake_, ROR #2 + bic tmp0, Aka_, Aku_, ROR #47 + eor Aki, tmp1, Aki_, ROR #57 + bic tmp1, Ake_, Aka_, ROR #5 + eor Ako, tmp0, Ako_, ROR #57 + bic tmp0, Ami_, Ame_, ROR #38 + eor Aku, tmp1, Aku_, ROR #52 + bic tmp1, Amo_, Ami_, ROR #5 + eor Ama, tmp0, Ama_, ROR #47 + bic tmp0, Amu_, Amo_, ROR #41 + eor Ame, tmp1, Ame_, ROR #43 + bic tmp1, Ama_, Amu_, ROR #35 + eor Ami, tmp0, Ami_, ROR #46 + bic tmp0, Ame_, Ama_, ROR #9 + eor Amo, tmp1, Amo_, 
ROR #12 + bic tmp1, Asi_, Ase_, ROR #48 + eor Amu, tmp0, Amu_, ROR #44 + bic tmp0, Aso_, Asi_, ROR #2 + eor Asa, tmp1, Asa_, ROR #41 + bic tmp1, Asu_, Aso_, ROR #25 + eor Ase, tmp0, Ase_, ROR #50 + bic tmp0, Asa_, Asu_, ROR #60 + eor Asi, tmp1, Asi_, ROR #27 + bic tmp1, Ase_, Asa_, ROR #57 + eor Aso, tmp0, Aso_, ROR #21 + bic tmp0, Abi_, Abe_, ROR #63 + eor Asu, tmp1, Asu_, ROR #53 + bic tmp1, Abo_, Abi_, ROR #42 + eor Aba, Aba_, tmp0, ROR #21 + bic tmp0, Abu_, Abo_, ROR #57 + eor Abe, tmp1, Abe_, ROR #41 + bic tmp1, Aba_, Abu_, ROR #50 + eor Abi, tmp0, Abi_, ROR #35 + bic tmp0, Abe_, Aba_, ROR #44 + eor Abo, tmp1, Abo_, ROR #43 + eor Abu, tmp0, Abu_, ROR #30 + + ldr cur_const, [const_addr, count, UXTW #3] + add count, count, #1 + + eor Aba, Aba, cur_const + +.endm + +.macro final_rotate_store + ror Aga, Aga,#(64-3) + restore input_addr, STACK_OFFSET_INPUT + ror Abu, Abu,#(64-44) + ror Aka, Aka,#(64-25) + ror Ake, Ake,#(64-8) + stp Abu, Aga, [input_addr, #(1*8*4)] + ror Ama, Ama,#(64-10) + ror Aku, Aku,#(64-6) + stp Aka, Ake, [input_addr, #(1*8*10)] + ror Asa, Asa,#(64-39) + ror Ase, Ase,#(64-41) + stp Aku, Ama, [input_addr, #(1*8*14)] + ror Abe, Abe,#(64-21) + ror Age, Age,#(64-45) + stp Asa, Ase, [input_addr, #(1*8*20)] + ror Agi, Agi,#(64-61) + stp Aba, Abe, [input_addr, #(1*8*0)] + ror Ame, Ame,#(64-15) + ror Ami, Ami,#(64-56) + stp Age, Agi, [input_addr, #(1*8*6)] + ror Abi, Abi,#(64-14) + ror Aki, Aki,#(64-18) + stp Ame, Ami, [input_addr, #(1*8*16)] + ror Ako, Ako,#(64-1) + stp Abi, Abo, [input_addr, #(1*8*2)] + ror Asi, Asi,#(64-2) + ror Aso, Aso,#(64-62) + stp Aki, Ako, [input_addr, #(1*8*12)] + ror Ago, Ago,#(64-28) + ror Agu, Agu,#(64-20) + stp Asi, Aso, [input_addr, #(1*8*22)] + ror Amo, Amo,#(64-27) + ror Amu, Amu,#(64-36) + stp Ago, Agu, [input_addr, #(1*8*8)] + ror Asu, Asu,#(64-55) + stp Amo, Amu, [input_addr, #(1*8*18)] + str Asu, [input_addr, #(1*8*24)] +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.text +.balign 16 +.global 
keccak_f1600_x1_scalar_asm_v4 +.global _keccak_f1600_x1_scalar_asm_v4 + +keccak_f1600_x1_scalar_asm_v4: +_keccak_f1600_x1_scalar_asm_v4: + alloc_stack + save_gprs + + keccak_f1600_round_initial +loop: + keccak_f1600_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-1) + ble loop + + final_rotate_store + restore_gprs + free_stack + ret diff --git a/asm/manual/keccak_f1600/keccak_f1600_x1_scalar_asm_v5.s b/asm/manual/keccak_f1600/keccak_f1600_x1_scalar_asm_v5.s new file mode 100644 index 0000000..19f1cc2 --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x1_scalar_asm_v5.s @@ -0,0 +1,506 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .balign 64 +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x26 // same register as cur_const; keccak_f1600_round_initial spills it to STACK_OFFSET_CONST before loading cur_const + cur_const .req x26 + count .req w27 + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round. 
*/ + Aba .req x1 + Abe .req x6 + Abi .req x11 + Abo .req x16 + Abu .req x21 + Aga .req x2 + Age .req x7 + Agi .req x12 + Ago .req x17 + Agu .req x22 + Aka .req x3 + Ake .req x8 + Aki .req x13 + Ako .req x18 + Aku .req x23 + Ama .req x4 + Ame .req x9 + Ami .req x14 + Amo .req x19 + Amu .req x24 + Asa .req x5 + Ase .req x10 + Asi .req x15 + Aso .req x20 + Asu .req x25 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + Aba_ .req x30 + Abe_ .req x28 + Abi_ .req x11 + Abo_ .req x16 + Abu_ .req x21 + Aga_ .req x3 + Age_ .req x8 + Agi_ .req x12 + Ago_ .req x17 + Agu_ .req x22 + Aka_ .req x4 + Ake_ .req x9 + Aki_ .req x13 + Ako_ .req x18 + Aku_ .req x23 + Ama_ .req x5 + Ame_ .req x10 + Ami_ .req x14 + Amo_ .req x19 + Amu_ .req x24 + Asa_ .req x1 + Ase_ .req x6 + Asi_ .req x15 + Aso_ .req x20 + Asu_ .req x25 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + C0 .req x30 + E0 .req x29 + C1 .req x26 + E1 .req x0 + C2 .req x27 + E2 .req x26 + C3 .req x28 + E3 .req x27 + C4 .req x29 + E4 .req x28 + + tmp .req x0 + +/************************ MACROS ****************************/ + +#define STACK_SIZE (16*6 + 3*8 + 8) // GPRs (16*6), count (8), const (8), input (8), padding (8) +#define STACK_BASE_GPRS (3*8+8) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro save reg, offset + str \reg, [sp, #\offset] +.endm + +.macro restore reg, offset + ldr \reg, [sp, #\offset] +.endm + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, 
#(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro keccak_f1600_round_initial + ldp Aku, Ama, [input_addr, #(1*8*14)] + ldp Asa, Ase, [input_addr, #(1*8*20)] + eor C0, Ama, Asa + ldp Ame, Ami, [input_addr, #(1*8*16)] + eor C1, Ame, Ase + ldp Asi, Aso, [input_addr, #(1*8*22)] + eor C2, Ami, Asi + ldp Amo, Amu, [input_addr, #(1*8*18)] + eor C3, Amo, Aso + ldr Asu, [input_addr, #(1*8*24)] + eor C4, Amu, Asu + ldp Aka, Ake, [input_addr, #(1*8*10)] + eor C0, Aka, C0 + eor C1, Ake, C1 + ldp Aki, Ako, [input_addr, #(1*8*12)] + eor C2, Aki, C2 + ldp Abu, Aga, [input_addr, #(1*8*4)] + eor C3, Ako, C3 + eor C4, Aku, C4 + ldp Age, Agi, [input_addr, #(1*8*6)] + eor C0, Aga, C0 + ldp Ago, Agu, [input_addr, #(1*8*8)] + eor C1, Age, C1 + ldp Aba, Abe, [input_addr, #(1*8*0)] + eor C2, Agi, C2 + ldp Abi, Abo, [input_addr, #(1*8*2)] + eor C3, Ago, C3 + save input_addr, STACK_OFFSET_INPUT + eor C4, Agu, C4 + eor C0, Aba, C0 + eor C1, Abe, C1 + eor C2, Abi, C2 + eor C3, Abo, C3 + eor C4, Abu, C4 + + eor E1, C0, C2, ROR #63 + eor E3, C2, C4, ROR #63 + eor E0, C4, C1, ROR #63 + eor E2, C1, C3, ROR #63 + eor E4, C3, C0, ROR #63 + + eor Aba_, Aba, E0 + eor Asa_, Abi, E2 + eor Abi_, Aki, E2 + eor Aki_, Ako, E3 + eor Ako_, Amu, E4 + eor Amu_, Aso, E3 + eor Aso_, Ama, E0 + eor Aka_, Abe, E1 + eor Ase_, Ago, E3 + eor Ago_, Ame, E1 + eor Ake_, Agi, E2 + eor Agi_, Aka, E0 + eor Aga_, Abo, E3 + eor Abo_, Amo, E3 + eor Amo_, Ami, E2 + eor Ami_, Ake, E1 + eor Age_, Agu, E4 + eor Agu_, Asi, E2 + eor Asi_, Aku, E4 + eor Aku_, Asa, E0 + eor Ama_, Abu, E4 + eor Abu_, Asu, E4 + eor Asu_, Ase, E1 + eor Ame_, Aga, E0 + eor Abe_, Age, E1 + + load_constant_ptr + + tmp0 .req x0 + tmp1 .req x29 + + bic tmp0, Agi_, Age_, ROR #47 + bic tmp1, Ago_, Agi_, ROR #42 + eor Aga, tmp0, 
Aga_, ROR #39 + bic tmp0, Agu_, Ago_, ROR #16 + eor Age, tmp1, Age_, ROR #25 + bic tmp1, Aga_, Agu_, ROR #31 + eor Agi, tmp0, Agi_, ROR #58 + bic tmp0, Age_, Aga_, ROR #56 + eor Ago, tmp1, Ago_, ROR #47 + bic tmp1, Aki_, Ake_, ROR #19 + eor Agu, tmp0, Agu_, ROR #23 + bic tmp0, Ako_, Aki_, ROR #47 + eor Aka, tmp1, Aka_, ROR #24 + bic tmp1, Aku_, Ako_, ROR #10 + eor Ake, tmp0, Ake_, ROR #2 + bic tmp0, Aka_, Aku_, ROR #47 + eor Aki, tmp1, Aki_, ROR #57 + bic tmp1, Ake_, Aka_, ROR #5 + eor Ako, tmp0, Ako_, ROR #57 + bic tmp0, Ami_, Ame_, ROR #38 + eor Aku, tmp1, Aku_, ROR #52 + bic tmp1, Amo_, Ami_, ROR #5 + eor Ama, tmp0, Ama_, ROR #47 + bic tmp0, Amu_, Amo_, ROR #41 + eor Ame, tmp1, Ame_, ROR #43 + bic tmp1, Ama_, Amu_, ROR #35 + eor Ami, tmp0, Ami_, ROR #46 + bic tmp0, Ame_, Ama_, ROR #9 + + str const_addr, [sp, #(STACK_OFFSET_CONST)] + ldr cur_const, [const_addr] + + eor Amo, tmp1, Amo_, ROR #12 + bic tmp1, Asi_, Ase_, ROR #48 + eor Amu, tmp0, Amu_, ROR #44 + bic tmp0, Aso_, Asi_, ROR #2 + eor Asa, tmp1, Asa_, ROR #41 + bic tmp1, Asu_, Aso_, ROR #25 + eor Ase, tmp0, Ase_, ROR #50 + bic tmp0, Asa_, Asu_, ROR #60 + eor Asi, tmp1, Asi_, ROR #27 + bic tmp1, Ase_, Asa_, ROR #57 + eor Aso, tmp0, Aso_, ROR #21 + + mov count, #1 + + bic tmp0, Abi_, Abe_, ROR #63 + eor Asu, tmp1, Asu_, ROR #53 + bic tmp1, Abo_, Abi_, ROR #42 + eor Aba, Aba_, tmp0, ROR #21 + bic tmp0, Abu_, Abo_, ROR #57 + eor Abe, tmp1, Abe_, ROR #41 + bic tmp1, Aba_, Abu_, ROR #50 + eor Abi, tmp0, Abi_, ROR #35 + bic tmp0, Abe_, Aba_, ROR #44 + eor Abo, tmp1, Abo_, ROR #43 + eor Abu, tmp0, Abu_, ROR #30 + + eor Aba, Aba, cur_const + save count, STACK_OFFSET_COUNT + +.endm + + +.macro keccak_f1600_round_noninitial + + eor C2, Asi, Abi, ROR #52 + eor C0, Aba, Aga, ROR #61 + eor C4, Aku, Agu, ROR #50 + eor C1, Ake, Ame, ROR #57 + eor C3, Abo, Ako, ROR #63 + eor C2, C2, Aki, ROR #48 + eor C0, C0, Ama, ROR #54 + eor C4, C4, Amu, ROR #34 + eor C1, C1, Abe, ROR #51 + eor C3, C3, Amo, ROR #37 + eor C2, C2, Ami, 
ROR #10 + eor C0, C0, Aka, ROR #39 + eor C4, C4, Abu, ROR #26 + eor C1, C1, Ase, ROR #31 + eor C3, C3, Ago, ROR #36 + eor C2, C2, Agi, ROR #5 + eor C0, C0, Asa, ROR #25 + eor C4, C4, Asu, ROR #15 + eor C1, C1, Age, ROR #27 + eor C3, C3, Aso, ROR #2 + + eor E1, C0, C2, ROR #61 + ror C2, C2, 62 + eor E3, C2, C4, ROR #57 + ror C4, C4, 58 + eor E0, C4, C1, ROR #55 + ror C1, C1, 56 + eor E2, C1, C3, ROR #63 + eor E4, C3, C0, ROR #63 + + eor Aba_, E0, Aba + eor Asa_, E2, Abi, ROR #50 + eor Abi_, E2, Aki, ROR #46 + eor Aki_, E3, Ako, ROR #63 + eor Ako_, E4, Amu, ROR #28 + eor Amu_, E3, Aso, ROR #2 + eor Aso_, E0, Ama, ROR #54 + eor Aka_, E1, Abe, ROR #43 + eor Ase_, E3, Ago, ROR #36 + eor Ago_, E1, Ame, ROR #49 + eor Ake_, E2, Agi, ROR #3 + eor Agi_, E0, Aka, ROR #39 + eor Aga_, E3, Abo + eor Abo_, E3, Amo, ROR #37 + eor Amo_, E2, Ami, ROR #8 + eor Ami_, E1, Ake, ROR #56 + eor Age_, E4, Agu, ROR #44 + eor Agu_, E2, Asi, ROR #62 + eor Asi_, E4, Aku, ROR #58 + eor Aku_, E0, Asa, ROR #25 + eor Ama_, E4, Abu, ROR #20 + eor Abu_, E4, Asu, ROR #9 + eor Asu_, E1, Ase, ROR #23 + eor Ame_, E0, Aga, ROR #61 + eor Abe_, E1, Age, ROR #19 + + load_constant_ptr_stack + restore count, STACK_OFFSET_COUNT + + tmp0 .req x0 + tmp1 .req x29 + + bic tmp0, Agi_, Age_, ROR #47 + bic tmp1, Ago_, Agi_, ROR #42 + eor Aga, tmp0, Aga_, ROR #39 + bic tmp0, Agu_, Ago_, ROR #16 + eor Age, tmp1, Age_, ROR #25 + bic tmp1, Aga_, Agu_, ROR #31 + eor Agi, tmp0, Agi_, ROR #58 + bic tmp0, Age_, Aga_, ROR #56 + eor Ago, tmp1, Ago_, ROR #47 + bic tmp1, Aki_, Ake_, ROR #19 + eor Agu, tmp0, Agu_, ROR #23 + bic tmp0, Ako_, Aki_, ROR #47 + eor Aka, tmp1, Aka_, ROR #24 + bic tmp1, Aku_, Ako_, ROR #10 + eor Ake, tmp0, Ake_, ROR #2 + bic tmp0, Aka_, Aku_, ROR #47 + eor Aki, tmp1, Aki_, ROR #57 + bic tmp1, Ake_, Aka_, ROR #5 + eor Ako, tmp0, Ako_, ROR #57 + bic tmp0, Ami_, Ame_, ROR #38 + eor Aku, tmp1, Aku_, ROR #52 + bic tmp1, Amo_, Ami_, ROR #5 + eor Ama, tmp0, Ama_, ROR #47 + bic tmp0, Amu_, Amo_, ROR #41 + eor 
Ame, tmp1, Ame_, ROR #43 + bic tmp1, Ama_, Amu_, ROR #35 + eor Ami, tmp0, Ami_, ROR #46 + bic tmp0, Ame_, Ama_, ROR #9 + + ldr cur_const, [const_addr, count, UXTW #3] + + eor Amo, tmp1, Amo_, ROR #12 + bic tmp1, Asi_, Ase_, ROR #48 + eor Amu, tmp0, Amu_, ROR #44 + bic tmp0, Aso_, Asi_, ROR #2 + eor Asa, tmp1, Asa_, ROR #41 + bic tmp1, Asu_, Aso_, ROR #25 + eor Ase, tmp0, Ase_, ROR #50 + bic tmp0, Asa_, Asu_, ROR #60 + eor Asi, tmp1, Asi_, ROR #27 + bic tmp1, Ase_, Asa_, ROR #57 + eor Aso, tmp0, Aso_, ROR #21 + bic tmp0, Abi_, Abe_, ROR #63 + add count, count, #1 + save count, STACK_OFFSET_COUNT + eor Asu, tmp1, Asu_, ROR #53 + bic tmp1, Abo_, Abi_, ROR #42 + eor Aba, Aba_, tmp0, ROR #21 + bic tmp0, Abu_, Abo_, ROR #57 + eor Abe, tmp1, Abe_, ROR #41 + bic tmp1, Aba_, Abu_, ROR #50 + eor Abi, tmp0, Abi_, ROR #35 + bic tmp0, Abe_, Aba_, ROR #44 + eor Abo, tmp1, Abo_, ROR #43 + eor Abu, tmp0, Abu_, ROR #30 + + eor Aba, Aba, cur_const + +.endm + +.macro final_rotate_store + ror Aga, Aga,#(64-3) + restore input_addr, STACK_OFFSET_INPUT + ror Abu, Abu,#(64-44) + ror Aka, Aka,#(64-25) + ror Ake, Ake,#(64-8) + stp Abu, Aga, [input_addr, #(1*8*4)] + ror Ama, Ama,#(64-10) + ror Aku, Aku,#(64-6) + stp Aka, Ake, [input_addr, #(1*8*10)] + ror Asa, Asa,#(64-39) + ror Ase, Ase,#(64-41) + stp Aku, Ama, [input_addr, #(1*8*14)] + ror Abe, Abe,#(64-21) + ror Age, Age,#(64-45) + stp Asa, Ase, [input_addr, #(1*8*20)] + ror Agi, Agi,#(64-61) + stp Aba, Abe, [input_addr, #(1*8*0)] + ror Ame, Ame,#(64-15) + ror Ami, Ami,#(64-56) + stp Age, Agi, [input_addr, #(1*8*6)] + ror Abi, Abi,#(64-14) + ror Aki, Aki,#(64-18) + stp Ame, Ami, [input_addr, #(1*8*16)] + ror Ako, Ako,#(64-1) + stp Abi, Abo, [input_addr, #(1*8*2)] + ror Asi, Asi,#(64-2) + ror Aso, Aso,#(64-62) + stp Aki, Ako, [input_addr, #(1*8*12)] + ror Ago, Ago,#(64-28) + ror Agu, Agu,#(64-20) + stp Asi, Aso, [input_addr, #(1*8*22)] + ror Amo, Amo,#(64-27) + ror Amu, Amu,#(64-36) + stp Ago, Agu, [input_addr, #(1*8*8)] + ror Asu, 
Asu,#(64-55) + stp Amo, Amu, [input_addr, #(1*8*18)] + str Asu, [input_addr, #(1*8*24)] +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.text +.balign 16 +.global keccak_f1600_x1_scalar_asm_v5 +.global _keccak_f1600_x1_scalar_asm_v5 + +.macro load_constant_ptr_stack + ldr const_addr, [sp, #(STACK_OFFSET_CONST)] +.endm +keccak_f1600_x1_scalar_asm_v5: +_keccak_f1600_x1_scalar_asm_v5: + alloc_stack + save_gprs + + keccak_f1600_round_initial +loop: + keccak_f1600_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-1) + ble loop + + final_rotate_store + restore_gprs + free_stack + ret diff --git a/asm/manual/keccak_f1600/keccak_f1600_x2_hybrid_asm_v1.s b/asm/manual/keccak_f1600/keccak_f1600_x2_hybrid_asm_v1.s new file mode 100644 index 0000000..4073530 --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x2_hybrid_asm_v1.s @@ -0,0 +1,417 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +#if defined(__ARM_FEATURE_SHA3) + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: /* 24 64-bit words, one per round (KECCAK_F1600_ROUNDS is 24); presumably the table that load_constant_ptr from macros.s points const_addr at -- confirm */ + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round.
*/ + Aba .req v0 + Abe .req v1 + Abi .req v2 + Abo .req v3 + Abu .req v4 + Aga .req v5 + Age .req v6 + Agi .req v7 + Ago .req v8 + Agu .req v9 + Aka .req v10 + Ake .req v11 + Aki .req v12 + Ako .req v13 + Aku .req v14 + Ama .req v15 + Ame .req v16 + Ami .req v17 + Amo .req v18 + Amu .req v19 + Asa .req v20 + Ase .req v21 + Asi .req v22 + Aso .req v23 + Asu .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Abiq .req q2 + Aboq .req q3 + Abuq .req q4 + Agaq .req q5 + Ageq .req q6 + Agiq .req q7 + Agoq .req q8 + Aguq .req q9 + Akaq .req q10 + Akeq .req q11 + Akiq .req q12 + Akoq .req q13 + Akuq .req q14 + Amaq .req q15 + Ameq .req q16 + Amiq .req q17 + Amoq .req q18 + Amuq .req q19 + Asaq .req q20 + Aseq .req q21 + Asiq .req q22 + Asoq .req q23 + Asuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v30 + C1 .req v29 + C2 .req v28 + C3 .req v27 + C4 .req v26 + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req v26 + E1 .req v25 + E2 .req v29 + E3 .req v28 + E4 .req v27 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + Abi_ .req v2 + Abo_ .req v3 + Abu_ .req v4 + Aga_ .req v10 + Age_ .req v11 + Agi_ .req v7 + Ago_ .req v8 + Agu_ .req v9 + Aka_ .req v15 + Ake_ .req v16 + Aki_ .req v12 + Ako_ .req v13 + Aku_ .req v14 + Ama_ .req v20 + Ame_ .req v21 + Ami_ .req v17 + Amo_ .req v18 + Amu_ .req v19 + Asa_ .req v0 + Ase_ .req v1 + Asi_ .req v22 + Aso_ .req v23 + Asu_ .req v24 + Aba_ .req v30 + Abe_ .req v27 + + + vtmp .req v31 +/************************ MACROS ****************************/ + +.macro eor3_m1 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b + eor \d\().16b, \d\().16b, \s2\().16b +.endm +.macro rax1_m1 d s0 s1 + add vtmp.2d, \s1\().2d, \s1\().2d + sri vtmp.2d, \s1\().2d, #63 + eor \d\().16b, vtmp.16b, \s0\().16b +.endm +.macro xar_m1 d s0 s1 imm + eor vtmp.16b, \s0\().16b, \s1\().16b + shl \d\().2d, vtmp.2d, #(64-\imm) + sri \d\().2d, vtmp.2d, #(\imm) +.endm + +.macro bcax_m1 
d s0 s1 s2 + bic vtmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, vtmp.16b, \s0\().16b +.endm + + +.macro load_input + ldr Abaq, [input_addr, #(2*8*0)] + ldr Abeq, [input_addr, #(2*8*1)] + ldr Abiq, [input_addr, #(2*8*2)] + ldr Aboq, [input_addr, #(2*8*3)] + ldr Abuq, [input_addr, #(2*8*4)] + ldr Agaq, [input_addr, #(2*8*5)] + ldr Ageq, [input_addr, #(2*8*6)] + ldr Agiq, [input_addr, #(2*8*7)] + ldr Agoq, [input_addr, #(2*8*8)] + ldr Aguq, [input_addr, #(2*8*9)] + ldr Akaq, [input_addr, #(2*8*10)] + ldr Akeq, [input_addr, #(2*8*11)] + ldr Akiq, [input_addr, #(2*8*12)] + ldr Akoq, [input_addr, #(2*8*13)] + ldr Akuq, [input_addr, #(2*8*14)] + ldr Amaq, [input_addr, #(2*8*15)] + ldr Ameq, [input_addr, #(2*8*16)] + ldr Amiq, [input_addr, #(2*8*17)] + ldr Amoq, [input_addr, #(2*8*18)] + ldr Amuq, [input_addr, #(2*8*19)] + ldr Asaq, [input_addr, #(2*8*20)] + ldr Aseq, [input_addr, #(2*8*21)] + ldr Asiq, [input_addr, #(2*8*22)] + ldr Asoq, [input_addr, #(2*8*23)] + ldr Asuq, [input_addr, #(2*8*24)] +.endm + +.macro store_input + str Abaq, [input_addr, #(2*8*0)] + str Abeq, [input_addr, #(2*8*1)] + str Abiq, [input_addr, #(2*8*2)] + str Aboq, [input_addr, #(2*8*3)] + str Abuq, [input_addr, #(2*8*4)] + str Agaq, [input_addr, #(2*8*5)] + str Ageq, [input_addr, #(2*8*6)] + str Agiq, [input_addr, #(2*8*7)] + str Agoq, [input_addr, #(2*8*8)] + str Aguq, [input_addr, #(2*8*9)] + str Akaq, [input_addr, #(2*8*10)] + str Akeq, [input_addr, #(2*8*11)] + str Akiq, [input_addr, #(2*8*12)] + str Akoq, [input_addr, #(2*8*13)] + str Akuq, [input_addr, #(2*8*14)] + str Amaq, [input_addr, #(2*8*15)] + str Ameq, [input_addr, #(2*8*16)] + str Amiq, [input_addr, #(2*8*17)] + str Amoq, [input_addr, #(2*8*18)] + str Amuq, [input_addr, #(2*8*19)] + str Asaq, [input_addr, #(2*8*20)] + str Aseq, [input_addr, #(2*8*21)] + str Asiq, [input_addr, #(2*8*22)] + str Asoq, [input_addr, #(2*8*23)] + str Asuq, [input_addr, #(2*8*24)] +.endm + +#define STACK_SIZE (16*4 + 16*6) // VREGS (16*4) + GPRS (TODO: 
Remove) + +#define STACK_BASE_GPRS (16*4) +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) + .endm + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro save_vregs + stp d8, d9, [sp, #(16*0)] + stp d10, d11, [sp, #(16*1)] + stp d12, d13, [sp, #(16*2)] + stp d14, d15, [sp, #(16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(16*0)] + ldp d10, d11, [sp, #(16*1)] + ldp d12, d13, [sp, #(16*2)] + ldp d14, d15, [sp, #(16*3)] +.endm + +/* Macros using v8.4-A SHA-3 instructions */ + +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm + xar \d\().2d, \s0\().2d, \s1\().2d, #\imm +.endm + +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +/* Keccak-f1600 round */ + +.macro hybrid_round + + eor3_m1 C0, Aba, Aga, Aka + eor3_m0 C0, C0, Ama, Asa + eor3_m1 C1, Abe, Age, Ake + eor3_m0 C1, C1, Ame, Ase + eor3_m1 C2, Abi, Agi, Aki + eor3_m0 C2, C2, Ami, Asi + eor3_m1 C3, Abo, Ago, Ako + eor3_m0 C3, C3, Amo, Aso + eor3_m1 C4, Abu, Agu, Aku + eor3_m0 C4, C4, Amu, Asu + + rax1_m1 E1, C0, C2 + rax1_m0 E3, C2, C4 + rax1_m1 E0, C4, C1 + rax1_m0 E2, C1, C3 + rax1_m1 E4, C3, C0 + + eor Aba_.16b, Aba.16b, E0.16b + xar_m0 Asa_, Abi, E2, 2 + xar_m1 Abi_, Aki, E2, 21 + xar_m0 Aki_, 
Ako, E3, 39 + xar_m1 Ako_, Amu, E4, 56 + xar_m0 Amu_, Aso, E3, 8 + xar_m1 Aso_, Ama, E0, 23 + xar_m0 Aka_, Abe, E1, 63 + xar_m1 Ase_, Ago, E3, 9 + xar_m0 Ago_, Ame, E1, 19 + xar_m1 Ake_, Agi, E2, 58 + xar_m0 Agi_, Aka, E0, 61 + xar_m1 Aga_, Abo, E3, 36 + xar_m0 Abo_, Amo, E3, 43 + xar_m1 Amo_, Ami, E2, 49 + xar_m0 Ami_, Ake, E1, 54 + xar_m1 Age_, Agu, E4, 44 + xar_m0 Agu_, Asi, E2, 3 + xar_m1 Asi_, Aku, E4, 25 + xar_m0 Aku_, Asa, E0, 46 + xar_m1 Ama_, Abu, E4, 37 + xar_m0 Abu_, Asu, E4, 50 + xar_m1 Asu_, Ase, E1, 62 + xar_m0 Ame_, Aga, E0, 28 + xar_m1 Abe_, Age, E1, 20 + + ld1r {v28.2d}, [const_addr], #8 + + bcax_m0 Aga, Aga_, Agi_, Age_ + bcax_m1 Age, Age_, Ago_, Agi_ + bcax_m0 Agi, Agi_, Agu_, Ago_ + bcax_m1 Ago, Ago_, Aga_, Agu_ + bcax_m0 Agu, Agu_, Age_, Aga_ + bcax_m1 Aka, Aka_, Aki_, Ake_ + bcax_m0 Ake, Ake_, Ako_, Aki_ + bcax_m1 Aki, Aki_, Aku_, Ako_ + bcax_m0 Ako, Ako_, Aka_, Aku_ + bcax_m1 Aku, Aku_, Ake_, Aka_ + bcax_m0 Ama, Ama_, Ami_, Ame_ + bcax_m1 Ame, Ame_, Amo_, Ami_ + bcax_m0 Ami, Ami_, Amu_, Amo_ + bcax_m1 Amo, Amo_, Ama_, Amu_ + bcax_m0 Amu, Amu_, Ame_, Ama_ + bcax_m1 Asa, Asa_, Asi_, Ase_ + bcax_m0 Ase, Ase_, Aso_, Asi_ + bcax_m1 Asi, Asi_, Asu_, Aso_ + bcax_m0 Aso, Aso_, Asa_, Asu_ + bcax_m1 Asu, Asu_, Ase_, Asa_ + bcax_m0 Aba, Aba_, Abi_, Abe_ + bcax_m1 Abe, Abe_, Abo_, Abi_ + bcax_m0 Abi, Abi_, Abu_, Abo_ + bcax_m1 Abo, Abo_, Aba_, Abu_ + bcax_m0 Abu, Abu_, Abe_, Aba_ + + // iota step + eor Aba.16b, Aba.16b, v28.16b + +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.text +.align 4 +.global keccak_f1600_x2_hybrid_asm_v1 +.global _keccak_f1600_x2_hybrid_asm_v1 + +keccak_f1600_x2_hybrid_asm_v1: +_keccak_f1600_x2_hybrid_asm_v1: + alloc_stack + save_gprs + save_vregs + load_constant_ptr + load_input + + mov count, #(KECCAK_F1600_ROUNDS) + +loop: + hybrid_round + sub count, count, #1 + cbnz count, loop + + store_input + restore_vregs + restore_gprs + free_stack + ret + +#endif diff --git a/asm/manual/keccak_f1600/keccak_f1600_x2_hybrid_asm_v2p0.s 
b/asm/manual/keccak_f1600/keccak_f1600_x2_hybrid_asm_v2p0.s new file mode 100644 index 0000000..f6985c1 --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x2_hybrid_asm_v2p0.s @@ -0,0 +1,830 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + + +#if defined(__ARM_FEATURE_SHA3) +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: /* 24 64-bit words, one per round (KECCAK_F1600_ROUNDS is 24); presumably the table that load_constant_ptr from macros.s points const_addr at -- confirm */ + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round.
*/ + ASba .req v0 + ASbe .req v1 + ASbi .req v2 + ASbo .req v3 + ASbu .req v4 + ASga .req v5 + ASge .req v6 + ASgi .req v7 + ASgo .req v8 + ASgu .req v9 + ASka .req v10 + ASke .req v11 + ASki .req v12 + ASko .req v13 + ASku .req v14 + ASma .req v15 + ASme .req v16 + ASmi .req v17 + ASmo .req v18 + ASmu .req v19 + ASsa .req v20 + ASse .req v21 + ASsi .req v22 + ASso .req v23 + ASsu .req v24 + + /* q-form of the above mapping */ + ASbaq .req q0 + ASbeq .req q1 + ASbiq .req q2 + ASboq .req q3 + ASbuq .req q4 + ASgaq .req q5 + ASgeq .req q6 + ASgiq .req q7 + ASgoq .req q8 + ASguq .req q9 + ASkaq .req q10 + ASkeq .req q11 + ASkiq .req q12 + ASkoq .req q13 + ASkuq .req q14 + ASmaq .req q15 + ASmeq .req q16 + ASmiq .req q17 + ASmoq .req q18 + ASmuq .req q19 + ASsaq .req q20 + ASseq .req q21 + ASsiq .req q22 + ASsoq .req q23 + ASsuq .req q24 + + Ascratch0 .req v25 + Ascratch1 .req v26 + Ascratch2 .req v27 + Ascratch3 .req v28 + Ascratch4 .req v29 + Ascratch5 .req v30 + Ascratch6 .req v31 + + Ascratch0q .req q25 + Ascratch1q .req q26 + Ascratch2q .req q27 + Ascratch3q .req q28 + Ascratch4q .req q29 + Ascratch5q .req q30 + Ascratch6q .req q31 + +/************************ MACROS ****************************/ + +.macro load_input + ldp ASbaq, ASbeq, [input_addr, #(2*8*0)] + ldp ASbiq, ASboq, [input_addr, #(2*8*2)] + ldp ASbuq, ASgaq, [input_addr, #(2*8*4)] + ldp ASgeq, ASgiq, [input_addr, #(2*8*6)] + ldp ASgoq, ASguq, [input_addr, #(2*8*8)] + ldp ASkaq, ASkeq, [input_addr, #(2*8*10)] + ldp ASkiq, ASkoq, [input_addr, #(2*8*12)] + ldp ASkuq, ASmaq, [input_addr, #(2*8*14)] + ldp ASmeq, ASmiq, [input_addr, #(2*8*16)] + ldp ASmoq, ASmuq, [input_addr, #(2*8*18)] + ldp ASsaq, ASseq, [input_addr, #(2*8*20)] + ldp ASsiq, ASsoq, [input_addr, #(2*8*22)] + ldr ASsuq, [input_addr, #(2*8*24)] +.endm + +.macro store_input + str ASbaq, [input_addr, #(2*8*0)] + str ASbeq, [input_addr, #(2*8*1)] + str ASbiq, [input_addr, #(2*8*2)] + str ASboq, [input_addr, #(2*8*3)] + str ASbuq, [input_addr, 
#(2*8*4)] + str ASgaq, [input_addr, #(2*8*5)] + str ASgeq, [input_addr, #(2*8*6)] + str ASgiq, [input_addr, #(2*8*7)] + str ASgoq, [input_addr, #(2*8*8)] + str ASguq, [input_addr, #(2*8*9)] + str ASkaq, [input_addr, #(2*8*10)] + str ASkeq, [input_addr, #(2*8*11)] + str ASkiq, [input_addr, #(2*8*12)] + str ASkoq, [input_addr, #(2*8*13)] + str ASkuq, [input_addr, #(2*8*14)] + str ASmaq, [input_addr, #(2*8*15)] + str ASmeq, [input_addr, #(2*8*16)] + str ASmiq, [input_addr, #(2*8*17)] + str ASmoq, [input_addr, #(2*8*18)] + str ASmuq, [input_addr, #(2*8*19)] + str ASsaq, [input_addr, #(2*8*20)] + str ASseq, [input_addr, #(2*8*21)] + str ASsiq, [input_addr, #(2*8*22)] + str ASsoq, [input_addr, #(2*8*23)] + str ASsuq, [input_addr, #(2*8*24)] +.endm + +#define STACK_SIZE (16*4 + 16*30) +#define STACK_BASE_VREGS 0 +#define STACK_BASE_TMP 16*4 + +#define E0_offset 0 +#define E1_offset 1 +#define E2_offset 2 +#define E3_offset 3 +#define E4_offset 4 + +#define Aba_offset (5 + 0 ) +#define Abe_offset (5 + 1 ) +#define Abi_offset (5 + 2 ) +#define Abo_offset (5 + 3 ) +#define Abu_offset (5 + 4 ) +#define Aga_offset (5 + 5 ) +#define Age_offset (5 + 6 ) +#define Agi_offset (5 + 7 ) +#define Ago_offset (5 + 8 ) +#define Agu_offset (5 + 9 ) +#define Aka_offset (5 + 10 ) +#define Ake_offset (5 + 11 ) +#define Aki_offset (5 + 12 ) +#define Ako_offset (5 + 13 ) +#define Aku_offset (5 + 14 ) +#define Ama_offset (5 + 15 ) +#define Ame_offset (5 + 16 ) +#define Ami_offset (5 + 17 ) +#define Amo_offset (5 + 18 ) +#define Amu_offset (5 + 19 ) +#define Asa_offset (5 + 20 ) +#define Ase_offset (5 + 21 ) +#define Asi_offset (5 + 22 ) +#define Aso_offset (5 + 23 ) +#define Asu_offset (5 + 24 ) + +#define ba_offset (5 + 0 ) +#define be_offset (5 + 1 ) +#define bi_offset (5 + 2 ) +#define bo_offset (5 + 3 ) +#define bu_offset (5 + 4 ) +#define ga_offset (5 + 5 ) +#define ge_offset (5 + 6 ) +#define gi_offset (5 + 7 ) +#define go_offset (5 + 8 ) +#define gu_offset (5 + 9 ) +#define ka_offset (5 
+ 10 ) +#define ke_offset (5 + 11 ) +#define ki_offset (5 + 12 ) +#define ko_offset (5 + 13 ) +#define ku_offset (5 + 14 ) +#define ma_offset (5 + 15 ) +#define me_offset (5 + 16 ) +#define mi_offset (5 + 17 ) +#define mo_offset (5 + 18 ) +#define mu_offset (5 + 19 ) +#define sa_offset (5 + 20 ) +#define se_offset (5 + 21 ) +#define si_offset (5 + 22 ) +#define so_offset (5 + 23 ) +#define su_offset (5 + 24 ) + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +#define savep(reg, offset_prefix) \ + str reg, [sp, #(STACK_BASE_TMP + 16 * offset_prefix ## _offset)] +#define restorep(reg, offset_prefix) \ + ldr reg, [sp, #(STACK_BASE_TMP + 16 * offset_prefix ## _offset)] +#define save(name) savep(name ## q,name) +#define restore(name) restorep(name ## q,name) + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +/* Helper macros: the _m1 variants are built from plain vector instructions (eor/add/shl/sri/bic) and use `tmp` as scratch; the _m0 variants after them wrap the v8.4-A SHA-3 instructions (eor3/rax1/xar/bcax) */ + +.macro eor3_m1_0 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m1_1 d s0 s1 s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro eor3_m1 d s0 s1 s2 + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +.macro rax1_m1 d s0 s1 + add tmp.2d, \s1\().2d, \s1\().2d + sri tmp.2d, \s1\().2d, #63 + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +.macro xar_m1 d s0 s1 imm + eor tmp.16b, \s0\().16b, \s1\().16b + shl \d\().2d, tmp.2d, #(64-\imm) + sri \d\().2d, tmp.2d, #(\imm) +.endm + +.macro bcax_m1 d s0 s1 s2 + bic tmp.16b, \s1\().16b, \s2\().16b + eor
\d\().16b, tmp.16b, \s0\().16b +.endm + +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm + xar \d\().2d, \s0\().2d, \s1\().2d, #\imm +.endm + +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +#define CONCAT5(a,b,c,d,e) a ## b ## c ## d ## e +#define CONCAT4(a,b,c,d) a ## b ## c ## d + +#define OUT(x) \out\()S##x +#define IN(x) \in\()S##x +#define B(x) \in\()B##x +#define E(x) \in\()E##x +#define C(x) \in\()C##x +#define TMP_IN(x) \in\()scratch ## x +#define TMP_OUT(x) \out\()scratch ## x + +#define OUTq(x) \out\()S##x##q +#define INq(x) \in\()S##x##q +#define Bq(x) \in\()B##x##q +#define Eq(x) \in\()E##x##q +#define Cq(x) \in\()C##x##q +#define TMP_INq(x) \in\()scratch ## x ## q +#define TMP_OUTq(x) \out\()scratch ## x ## q + +.macro declare_mappings out, in + + C(0) .req TMP_IN(0) + C(1) .req TMP_IN(1) + C(2) .req TMP_IN(2) + C(3) .req TMP_IN(3) + C(4) .req TMP_IN(4) + + Cq(0) .req TMP_INq(0) + Cq(1) .req TMP_INq(1) + Cq(2) .req TMP_INq(2) + Cq(3) .req TMP_INq(3) + Cq(4) .req TMP_INq(4) + + E(2) .req TMP_IN(5) + E(4) .req C(3) + E(1) .req C(0) + E(3) .req C(2) + E(0) .req C(4) + + Eq(2) .req TMP_INq(5) + Eq(4) .req Cq(3) + Eq(1) .req Cq(0) + Eq(3) .req Cq(2) + Eq(0) .req Cq(4) + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + B(go) .req IN(me) + B(gi) .req IN(ka) + B(ga) .req IN(bo) + B(ge) .req IN(gu) + B(gu) .req IN(si) + B(ki) .req IN(ko) + B(ko) .req IN(mu) + B(ka) .req IN(be) + B(ke) .req IN(gi) + B(ku) .req IN(sa) + B(mu) .req IN(so) + B(mo) .req IN(mi) + B(mi) .req IN(ke) + B(ma) .req IN(bu) + B(me) .req IN(ga) + B(ba) .req IN(ba) + B(bi) .req IN(ki) + B(bo) .req IN(mo) + B(bu) .req IN(su) + B(be) .req IN(ge) + B(sa) .req IN(bi) + B(so) .req IN(ma) + B(se) .req IN(go) + B(si) .req IN(ku) + B(su) .req IN(se) + + Bq(go) .req INq(me) + Bq(gi) .req INq(ka) + Bq(ga) .req INq(bo) + Bq(ge) .req 
INq(gu) + Bq(gu) .req INq(si) + Bq(ki) .req INq(ko) + Bq(ko) .req INq(mu) + Bq(ka) .req INq(be) + Bq(ke) .req INq(gi) + Bq(ku) .req INq(sa) + Bq(mu) .req INq(so) + Bq(mo) .req INq(mi) + Bq(mi) .req INq(ke) + Bq(ma) .req INq(bu) + Bq(me) .req INq(ga) + Bq(ba) .req INq(ba) + Bq(bi) .req INq(ki) + Bq(bo) .req INq(mo) + Bq(bu) .req INq(su) + Bq(be) .req INq(ge) + Bq(sa) .req INq(bi) + Bq(so) .req INq(ma) + Bq(se) .req INq(go) + Bq(si) .req INq(ku) + Bq(su) .req INq(se) + + OUT(ga) .req TMP_IN(0) + OUT(ge) .req TMP_IN(1) + OUT(gi) .req B(gi) + OUT(go) .req B(go) + OUT(gu) .req B(gu) + OUT(ka) .req B(ga) + OUT(ke) .req B(ge) + OUT(ki) .req B(ki) + OUT(ko) .req B(ko) + OUT(ku) .req B(ku) + OUT(ma) .req B(ka) + OUT(me) .req B(ke) + OUT(mi) .req B(mi) + OUT(mo) .req B(mo) + OUT(mu) .req B(mu) + OUT(ba) .req B(ma) + OUT(be) .req B(me) + OUT(bi) .req B(bi) + OUT(bo) .req B(bo) + OUT(bu) .req B(bu) + OUT(sa) .req B(ba) + OUT(se) .req B(be) + OUT(si) .req B(si) + OUT(so) .req B(so) + OUT(su) .req B(su) + + OUTq(ga) .req TMP_INq(0) + OUTq(ge) .req TMP_INq(1) + OUTq(gi) .req Bq(gi) + OUTq(go) .req Bq(go) + OUTq(gu) .req Bq(gu) + OUTq(ka) .req Bq(ga) + OUTq(ke) .req Bq(ge) + OUTq(ki) .req Bq(ki) + OUTq(ko) .req Bq(ko) + OUTq(ku) .req Bq(ku) + OUTq(ma) .req Bq(ka) + OUTq(me) .req Bq(ke) + OUTq(mi) .req Bq(mi) + OUTq(mo) .req Bq(mo) + OUTq(mu) .req Bq(mu) + OUTq(ba) .req Bq(ma) + OUTq(be) .req Bq(me) + OUTq(bi) .req Bq(bi) + OUTq(bo) .req Bq(bo) + OUTq(bu) .req Bq(bu) + OUTq(sa) .req Bq(ba) + OUTq(se) .req Bq(be) + OUTq(si) .req Bq(si) + OUTq(so) .req Bq(so) + OUTq(su) .req Bq(su) + + TMP_OUT(0) .req B(sa) + TMP_OUT(1) .req B(se) + TMP_OUT(2) .req TMP_IN(2) + TMP_OUT(3) .req TMP_IN(3) + TMP_OUT(4) .req TMP_IN(4) + TMP_OUT(5) .req TMP_IN(5) + TMP_OUT(6) .req TMP_IN(6) + + TMP_OUTq(0) .req Bq(sa) + TMP_OUTq(1) .req Bq(se) + TMP_OUTq(2) .req TMP_INq(2) + TMP_OUTq(3) .req TMP_INq(3) + TMP_OUTq(4) .req TMP_INq(4) + TMP_OUTq(5) .req TMP_INq(5) + TMP_OUTq(6) .req TMP_INq(6) + + tmp .req v0 
+ .unreq tmp + tmp .req TMP_IN(6) +.endm + +.macro undeclare_mappings out, in + + .unreq C(0) + .unreq C(1) + .unreq C(2) + .unreq C(3) + .unreq C(4) + + .unreq Cq(0) + .unreq Cq(1) + .unreq Cq(2) + .unreq Cq(3) + .unreq Cq(4) + + .unreq E(2) + .unreq E(4) + .unreq E(1) + .unreq E(3) + .unreq E(0) + + .unreq Eq(2) + .unreq Eq(4) + .unreq Eq(1) + .unreq Eq(3) + .unreq Eq(0) + + .unreq B(go) + .unreq B(gi) + .unreq B(ga) + .unreq B(ge) + .unreq B(gu) + .unreq B(ki) + .unreq B(ko) + .unreq B(ka) + .unreq B(ke) + .unreq B(ku) + .unreq B(mu) + .unreq B(mo) + .unreq B(mi) + .unreq B(ma) + .unreq B(me) + .unreq B(ba) + .unreq B(bi) + .unreq B(bo) + .unreq B(bu) + .unreq B(be) + .unreq B(sa) + .unreq B(so) + .unreq B(se) + .unreq B(si) + .unreq B(su) + + .unreq Bq(go) + .unreq Bq(gi) + .unreq Bq(ga) + .unreq Bq(ge) + .unreq Bq(gu) + .unreq Bq(ki) + .unreq Bq(ko) + .unreq Bq(ka) + .unreq Bq(ke) + .unreq Bq(ku) + .unreq Bq(mu) + .unreq Bq(mo) + .unreq Bq(mi) + .unreq Bq(ma) + .unreq Bq(me) + .unreq Bq(ba) + .unreq Bq(bi) + .unreq Bq(bo) + .unreq Bq(bu) + .unreq Bq(be) + .unreq Bq(sa) + .unreq Bq(so) + .unreq Bq(se) + .unreq Bq(si) + .unreq Bq(su) + + .unreq OUT(ga) + .unreq OUT(ge) + .unreq OUT(gi) + .unreq OUT(go) + .unreq OUT(gu) + .unreq OUT(ka) + .unreq OUT(ke) + .unreq OUT(ki) + .unreq OUT(ko) + .unreq OUT(ku) + .unreq OUT(ma) + .unreq OUT(me) + .unreq OUT(mi) + .unreq OUT(mo) + .unreq OUT(mu) + .unreq OUT(ba) + .unreq OUT(be) + .unreq OUT(bi) + .unreq OUT(bo) + .unreq OUT(bu) + .unreq OUT(sa) + .unreq OUT(se) + .unreq OUT(si) + .unreq OUT(so) + .unreq OUT(su) + + .unreq OUTq(ga) + .unreq OUTq(ge) + .unreq OUTq(gi) + .unreq OUTq(go) + .unreq OUTq(gu) + .unreq OUTq(ka) + .unreq OUTq(ke) + .unreq OUTq(ki) + .unreq OUTq(ko) + .unreq OUTq(ku) + .unreq OUTq(ma) + .unreq OUTq(me) + .unreq OUTq(mi) + .unreq OUTq(mo) + .unreq OUTq(mu) + .unreq OUTq(ba) + .unreq OUTq(be) + .unreq OUTq(bi) + .unreq OUTq(bo) + .unreq OUTq(bu) + .unreq OUTq(sa) + .unreq OUTq(se) + .unreq OUTq(si) + 
.unreq OUTq(so) + .unreq OUTq(su) + + .unreq TMP_OUT(0) + .unreq TMP_OUT(1) + .unreq TMP_OUT(2) + .unreq TMP_OUT(3) + .unreq TMP_OUT(4) + .unreq TMP_OUT(5) + .unreq TMP_OUT(6) + + .unreq TMP_OUTq(0) + .unreq TMP_OUTq(1) + .unreq TMP_OUTq(2) + .unreq TMP_OUTq(3) + .unreq TMP_OUTq(4) + .unreq TMP_OUTq(5) + .unreq TMP_OUTq(6) + + .unreq tmp +.endm + +.macro keccak_f1600_round out, in + + eor3_m1 C(0), IN(ba), IN(ga), IN(ka) + eor3_m0 C(0), C(0), IN(ma), IN(sa) + eor3_m1 C(1), IN(be), IN(ge), IN(ke) + eor3_m0 C(1), C(1), IN(me), IN(se) + eor3_m1 C(2), IN(bi), IN(gi), IN(ki) + eor3_m0 C(2), C(2), IN(mi), IN(si) + eor3_m1 C(3), IN(bo), IN(go), IN(ko) + eor3_m0 C(3), C(3), IN(mo), IN(so) + eor3_m1 C(4), IN(bu), IN(gu), IN(ku) + eor3_m0 C(4), C(4), IN(mu), IN(su) + + rax1_m0 E(2), C(1), C(3) + rax1_m1 E(4), C(3), C(0) + rax1_m0 E(1), C(0), C(2) + rax1_m1 E(3), C(2), C(4) + rax1_m0 E(0), C(4), C(1) + + xar_m0 B(go), IN(me), E(1), 19 + xar_m1 B(gi), IN(ka), E(0), 61 + xar_m0 B(ga), IN(bo), E(3), 36 + xar_m1 B(ge), IN(gu), E(4), 44 + xar_m0 B(gu), IN(si), E(2), 3 + + xar_m1 B(ki), IN(ko), E(3), 39 + xar_m0 B(ko), IN(mu), E(4), 56 + xar_m1 B(ka), IN(be), E(1), 63 + xar_m0 B(ke), IN(gi), E(2), 58 + xar_m1 B(ku), IN(sa), E(0), 46 + + xar_m0 B(mu), IN(so), E(3), 8 + xar_m1 B(mo), IN(mi), E(2), 49 + xar_m0 B(mi), IN(ke), E(1), 54 + xar_m1 B(ma), IN(bu), E(4), 37 + xar_m0 B(me), IN(ga), E(0), 28 + + eor2 B(ba), IN(ba), E(0) + xar_m1 B(bi), IN(ki), E(2), 21 + xar_m0 B(bo), IN(mo), E(3), 43 + xar_m1 B(bu), IN(su), E(4), 50 + xar_m0 B(be), IN(ge), E(1), 20 + + xar_m1 B(sa), IN(bi), E(2), 2 + xar_m0 B(so), IN(ma), E(0), 23 + xar_m1 B(se), IN(go), E(3), 9 + xar_m0 B(si), IN(ku), E(4), 25 + xar_m1 B(su), IN(se), E(1), 62 + + bcax_m0 OUT(ga), B(ga), B(gi), B(ge) + bcax_m1 OUT(ge), B(ge), B(go), B(gi) + bcax_m0 OUT(gi), B(gi), B(gu), B(go) + bcax_m1 OUT(go), B(go), B(ga), B(gu) + bcax_m0 OUT(gu), B(gu), B(ge), B(ga) + + bcax_m1 OUT(ka), B(ka), B(ki), B(ke) + bcax_m0 OUT(ke), B(ke), B(ko), 
B(ki) + bcax_m1 OUT(ki), B(ki), B(ku), B(ko) + bcax_m0 OUT(ko), B(ko), B(ka), B(ku) + bcax_m1 OUT(ku), B(ku), B(ke), B(ka) + + bcax_m1 OUT(ma), B(ma), B(mi), B(me) + bcax_m0 OUT(me), B(me), B(mo), B(mi) + bcax_m1 OUT(mi), B(mi), B(mu), B(mo) + bcax_m0 OUT(mo), B(mo), B(ma), B(mu) + bcax_m1 OUT(mu), B(mu), B(me), B(ma) + + bcax_m0 OUT(ba), B(ba), B(bi), B(be) + bcax_m1 OUT(be), B(be), B(bo), B(bi) + bcax_m0 OUT(bi), B(bi), B(bu), B(bo) + bcax_m1 OUT(bo), B(bo), B(ba), B(bu) + bcax_m0 OUT(bu), B(bu), B(be), B(ba) + + bcax_m1 OUT(sa), B(sa), B(si), B(se) + bcax_m0 OUT(se), B(se), B(so), B(si) + bcax_m1 OUT(si), B(si), B(su), B(so) + bcax_m0 OUT(so), B(so), B(sa), B(su) + bcax_m1 OUT(su), B(su), B(se), B(sa) + + ld1r {tmp.2d}, [const_addr], #8 + eor OUT(ba).16b, OUT(ba).16b, tmp.16b +.endm + +.macro transfer_state out, in + + savep(INq(ga),ga) + savep(INq(ge),ge) + savep(INq(gi),gi) + savep(INq(go),go) + savep(INq(gu),gu) + savep(INq(ka),ka) + savep(INq(ke),ke) + savep(INq(ki),ki) + savep(INq(ko),ko) + savep(INq(ku),ku) + savep(INq(ma),ma) + savep(INq(me),me) + savep(INq(mi),mi) + savep(INq(mo),mo) + savep(INq(mu),mu) + savep(INq(ba),ba) + savep(INq(be),be) + savep(INq(bi),bi) + savep(INq(bo),bo) + savep(INq(bu),bu) + savep(INq(sa),sa) + savep(INq(se),se) + savep(INq(si),si) + savep(INq(so),so) + savep(INq(su),su) + + restorep(OUTq(ga),ga) + restorep(OUTq(ge),ge) + restorep(OUTq(gi),gi) + restorep(OUTq(go),go) + restorep(OUTq(gu),gu) + restorep(OUTq(ka),ka) + restorep(OUTq(ke),ke) + restorep(OUTq(ki),ki) + restorep(OUTq(ko),ko) + restorep(OUTq(ku),ku) + restorep(OUTq(ma),ma) + restorep(OUTq(me),me) + restorep(OUTq(mi),mi) + restorep(OUTq(mo),mo) + restorep(OUTq(mu),mu) + restorep(OUTq(ba),ba) + restorep(OUTq(be),be) + restorep(OUTq(bi),bi) + restorep(OUTq(bo),bo) + restorep(OUTq(bu),bu) + restorep(OUTq(sa),sa) + restorep(OUTq(se),se) + restorep(OUTq(si),si) + restorep(OUTq(so),so) + restorep(OUTq(su),su) + +.endm + +.text +.align 4 +.global 
keccak_f1600_x2_hybrid_asm_v2p0 +.global _keccak_f1600_x2_hybrid_asm_v2p0 + +#define KECCAK_F1600_ROUNDS 24 + +keccak_f1600_x2_hybrid_asm_v2p0: +_keccak_f1600_x2_hybrid_asm_v2p0: + alloc_stack + save_vregs + load_constant_ptr + load_input + + mov count, #24 /* rounds still to run; same value as KECCAK_F1600_ROUNDS */ + +loop: + declare_mappings A1, A + keccak_f1600_round A1, A + + declare_mappings A2, A1 + keccak_f1600_round A2, A1 + + declare_mappings A3, A2 + keccak_f1600_round A3, A2 + + declare_mappings A4, A3 + keccak_f1600_round A4, A3 + + transfer_state A, A4 /* store the A4-mapped state to the stack scratch area and reload it into the A mapping */ + undeclare_mappings A4, A + + sub count, count, #4 /* four rounds are unrolled per loop iteration */ + cbnz count, loop + + store_input + restore_vregs + free_stack + ret + + #endif diff --git a/asm/manual/keccak_f1600/keccak_f1600_x2_hybrid_asm_v2p1.s b/asm/manual/keccak_f1600/keccak_f1600_x2_hybrid_asm_v2p1.s new file mode 100644 index 0000000..8fbb78c --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x2_hybrid_asm_v2p1.s @@ -0,0 +1,880 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + + +#if defined(__ARM_FEATURE_SHA3) +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: /* the 24 Keccak-f1600 round constants in round order, followed by one zero quad */ + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + .quad 0x0 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round.
*/ + ASba .req v0 + ASbe .req v1 + ASbi .req v2 + ASbo .req v3 + ASbu .req v4 + ASga .req v5 + ASge .req v6 + ASgi .req v7 + ASgo .req v8 + ASgu .req v9 + ASka .req v10 + ASke .req v11 + ASki .req v12 + ASko .req v13 + ASku .req v14 + ASma .req v15 + ASme .req v16 + ASmi .req v17 + ASmo .req v18 + ASmu .req v19 + ASsa .req v20 + ASse .req v21 + ASsi .req v22 + ASso .req v23 + ASsu .req v24 + + /* q-form of the above mapping */ + ASbaq .req q0 + ASbeq .req q1 + ASbiq .req q2 + ASboq .req q3 + ASbuq .req q4 + ASgaq .req q5 + ASgeq .req q6 + ASgiq .req q7 + ASgoq .req q8 + ASguq .req q9 + ASkaq .req q10 + ASkeq .req q11 + ASkiq .req q12 + ASkoq .req q13 + ASkuq .req q14 + ASmaq .req q15 + ASmeq .req q16 + ASmiq .req q17 + ASmoq .req q18 + ASmuq .req q19 + ASsaq .req q20 + ASseq .req q21 + ASsiq .req q22 + ASsoq .req q23 + ASsuq .req q24 + + Ascratch0 .req v25 + Ascratch1 .req v26 + Ascratch2 .req v27 + Ascratch3 .req v28 + Ascratch4 .req v29 + Ascratch5 .req v30 + Ascratch6 .req v31 + + Ascratch0q .req q25 + Ascratch1q .req q26 + Ascratch2q .req q27 + Ascratch3q .req q28 + Ascratch4q .req q29 + Ascratch5q .req q30 + Ascratch6q .req q31 + +/************************ MACROS ****************************/ + +.macro load_input + ldp ASbaq, ASbeq, [input_addr, #(2*8*0)] + ldp ASbiq, ASboq, [input_addr, #(2*8*2)] + ldp ASbuq, ASgaq, [input_addr, #(2*8*4)] + ldp ASgeq, ASgiq, [input_addr, #(2*8*6)] + ldp ASgoq, ASguq, [input_addr, #(2*8*8)] + ldp ASkaq, ASkeq, [input_addr, #(2*8*10)] + ldp ASkiq, ASkoq, [input_addr, #(2*8*12)] + ldp ASkuq, ASmaq, [input_addr, #(2*8*14)] + ldp ASmeq, ASmiq, [input_addr, #(2*8*16)] + ldp ASmoq, ASmuq, [input_addr, #(2*8*18)] + ldp ASsaq, ASseq, [input_addr, #(2*8*20)] + ldp ASsiq, ASsoq, [input_addr, #(2*8*22)] + ldr ASsuq, [input_addr, #(2*8*24)] +.endm + +.macro store_input in + str \in\()Sbaq, [input_addr, #(2*8*0)] + str \in\()Sbeq, [input_addr, #(2*8*1)] + str \in\()Sbiq, [input_addr, #(2*8*2)] + str \in\()Sboq, [input_addr, #(2*8*3)] + str 
\in\()Sbuq, [input_addr, #(2*8*4)] + str \in\()Sgaq, [input_addr, #(2*8*5)] + str \in\()Sgeq, [input_addr, #(2*8*6)] + str \in\()Sgiq, [input_addr, #(2*8*7)] + str \in\()Sgoq, [input_addr, #(2*8*8)] + str \in\()Sguq, [input_addr, #(2*8*9)] + str \in\()Skaq, [input_addr, #(2*8*10)] + str \in\()Skeq, [input_addr, #(2*8*11)] + str \in\()Skiq, [input_addr, #(2*8*12)] + str \in\()Skoq, [input_addr, #(2*8*13)] + str \in\()Skuq, [input_addr, #(2*8*14)] + str \in\()Smaq, [input_addr, #(2*8*15)] + str \in\()Smeq, [input_addr, #(2*8*16)] + str \in\()Smiq, [input_addr, #(2*8*17)] + str \in\()Smoq, [input_addr, #(2*8*18)] + str \in\()Smuq, [input_addr, #(2*8*19)] + str \in\()Ssaq, [input_addr, #(2*8*20)] + str \in\()Sseq, [input_addr, #(2*8*21)] + str \in\()Ssiq, [input_addr, #(2*8*22)] + str \in\()Ssoq, [input_addr, #(2*8*23)] + str \in\()Ssuq, [input_addr, #(2*8*24)] +.endm + +#define STACK_SIZE (16*4 + 16*30) +#define STACK_BASE_VREGS 0 +#define STACK_BASE_TMP 16*4 + +#define E0_offset 0 +#define E1_offset 1 +#define E2_offset 2 +#define E3_offset 3 +#define E4_offset 4 + +#define Aba_offset (5 + 0 ) +#define Abe_offset (5 + 1 ) +#define Abi_offset (5 + 2 ) +#define Abo_offset (5 + 3 ) +#define Abu_offset (5 + 4 ) +#define Aga_offset (5 + 5 ) +#define Age_offset (5 + 6 ) +#define Agi_offset (5 + 7 ) +#define Ago_offset (5 + 8 ) +#define Agu_offset (5 + 9 ) +#define Aka_offset (5 + 10 ) +#define Ake_offset (5 + 11 ) +#define Aki_offset (5 + 12 ) +#define Ako_offset (5 + 13 ) +#define Aku_offset (5 + 14 ) +#define Ama_offset (5 + 15 ) +#define Ame_offset (5 + 16 ) +#define Ami_offset (5 + 17 ) +#define Amo_offset (5 + 18 ) +#define Amu_offset (5 + 19 ) +#define Asa_offset (5 + 20 ) +#define Ase_offset (5 + 21 ) +#define Asi_offset (5 + 22 ) +#define Aso_offset (5 + 23 ) +#define Asu_offset (5 + 24 ) + +#define ba_offset (5 + 0 ) +#define be_offset (5 + 1 ) +#define bi_offset (5 + 2 ) +#define bo_offset (5 + 3 ) +#define bu_offset (5 + 4 ) +#define ga_offset (5 + 5 ) +#define 
ge_offset (5 + 6 ) +#define gi_offset (5 + 7 ) +#define go_offset (5 + 8 ) +#define gu_offset (5 + 9 ) +#define ka_offset (5 + 10 ) +#define ke_offset (5 + 11 ) +#define ki_offset (5 + 12 ) +#define ko_offset (5 + 13 ) +#define ku_offset (5 + 14 ) +#define ma_offset (5 + 15 ) +#define me_offset (5 + 16 ) +#define mi_offset (5 + 17 ) +#define mo_offset (5 + 18 ) +#define mu_offset (5 + 19 ) +#define sa_offset (5 + 20 ) +#define se_offset (5 + 21 ) +#define si_offset (5 + 22 ) +#define so_offset (5 + 23 ) +#define su_offset (5 + 24 ) + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +#define savep(reg, offset_prefix) \ + str reg, [sp, #(STACK_BASE_TMP + 16 * offset_prefix ## _offset)] +#define restorep(reg, offset_prefix) \ + ldr reg, [sp, #(STACK_BASE_TMP + 16 * offset_prefix ## _offset)] +#define save(name) savep(name ## q,name) +#define restore(name) restorep(name ## q,name) + +.macro save_vregs /* spill d8-d15 (callee-saved under AAPCS64) at STACK_BASE_VREGS; v8-v15 hold state lanes */ + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +/* Keccak building blocks. The _m0 macros use the v8.4-A SHA-3 instructions (eor3, rax1, xar, bcax); eor2 and the _m1 macros use only plain Advanced SIMD instructions, and rax1_m1, xar_m1 and bcax_m1 clobber tmp. */ + +.macro eor3_m1_0 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m1_1 d s0 s1 s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro eor3_m1 d s0 s1 s2 + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +.macro rax1_m1 d s0 s1 + add tmp.2d, \s1\().2d, \s1\().2d + sri tmp.2d, \s1\().2d, #63 + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +.macro xar_m1 d s0 s1 imm + eor tmp.16b, \s0\().16b, \s1\().16b + shl \d\().2d, tmp.2d,
#(64-\imm) + sri \d\().2d, tmp.2d, #(\imm) +.endm + +.macro bcax_m1 d s0 s1 s2 + bic tmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm + xar \d\().2d, \s0\().2d, \s1\().2d, #\imm +.endm + +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +#define CONCAT5(a,b,c,d,e) a ## b ## c ## d ## e +#define CONCAT4(a,b,c,d) a ## b ## c ## d + +#define OUT(x) \out\()S##x +#define IN(x) \in\()S##x +#define B(x) \in\()B##x +#define E(x) \in\()E##x +#define C(x) \in\()C##x +#define Cnext(x) \out\()C##x +#define TMP_IN(x) \in\()scratch ## x +#define TMP_OUT(x) \out\()scratch ## x + +#define OUTq(x) \out\()S##x##q +#define INq(x) \in\()S##x##q +#define Bq(x) \in\()B##x##q +#define Eq(x) \in\()E##x##q +#define Cq(x) \in\()C##x##q +#define Cnextq(x) \out\()C##x##q +#define TMP_INq(x) \in\()scratch ## x ## q +#define TMP_OUTq(x) \out\()scratch ## x ## q + +.macro declare_mappings out, in + + C(0) .req TMP_IN(0) + C(1) .req TMP_IN(1) + C(2) .req TMP_IN(2) + C(3) .req TMP_IN(3) + C(4) .req TMP_IN(4) + + Cq(0) .req TMP_INq(0) + Cq(1) .req TMP_INq(1) + Cq(2) .req TMP_INq(2) + Cq(3) .req TMP_INq(3) + Cq(4) .req TMP_INq(4) + + E(1) .req TMP_IN(5) + E(3) .req C(2) + E(0) .req C(4) + E(2) .req C(1) + E(4) .req C(3) + + Eq(1) .req TMP_INq(5) + Eq(3) .req Cq(2) + Eq(0) .req Cq(4) + Eq(2) .req Cq(1) + Eq(4) .req Cq(3) + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + B(go) .req IN(me) + B(gi) .req IN(ka) + B(ga) .req IN(bo) + B(ge) .req IN(gu) + B(gu) .req IN(si) + B(ki) .req IN(ko) + B(ko) .req IN(mu) + B(ka) .req IN(be) + B(ke) .req IN(gi) + B(ku) .req IN(sa) + B(mu) .req IN(so) + B(mo) .req IN(mi) + B(mi) .req IN(ke) + B(ma) .req IN(bu) + B(me) .req IN(ga) + B(ba) .req IN(ba) + B(bi) .req IN(ki) + B(bo) .req IN(mo) + B(bu) .req IN(su) + B(be) .req IN(ge) 
+ B(sa) .req IN(bi) + B(so) .req IN(ma) + B(se) .req IN(go) + B(si) .req IN(ku) + B(su) .req IN(se) + + Bq(go) .req INq(me) + Bq(gi) .req INq(ka) + Bq(ga) .req INq(bo) + Bq(ge) .req INq(gu) + Bq(gu) .req INq(si) + Bq(ki) .req INq(ko) + Bq(ko) .req INq(mu) + Bq(ka) .req INq(be) + Bq(ke) .req INq(gi) + Bq(ku) .req INq(sa) + Bq(mu) .req INq(so) + Bq(mo) .req INq(mi) + Bq(mi) .req INq(ke) + Bq(ma) .req INq(bu) + Bq(me) .req INq(ga) + Bq(ba) .req INq(ba) + Bq(bi) .req INq(ki) + Bq(bo) .req INq(mo) + Bq(bu) .req INq(su) + Bq(be) .req INq(ge) + Bq(sa) .req INq(bi) + Bq(so) .req INq(ma) + Bq(se) .req INq(go) + Bq(si) .req INq(ku) + Bq(su) .req INq(se) + + OUT(ba) .req TMP_IN(0) + OUT(be) .req TMP_IN(5) + OUT(bi) .req B(bi) + OUT(bo) .req B(bo) + OUT(bu) .req B(bu) + OUT(ga) .req B(ba) + OUT(ge) .req B(be) + OUT(gi) .req B(gi) + OUT(go) .req B(go) + OUT(gu) .req B(gu) + OUT(ka) .req B(ga) + OUT(ke) .req B(ge) + OUT(ki) .req B(ki) + OUT(ko) .req B(ko) + OUT(ku) .req B(ku) + OUT(ma) .req B(ka) + OUT(me) .req B(ke) + OUT(mi) .req B(mi) + OUT(mo) .req B(mo) + OUT(mu) .req B(mu) + OUT(sa) .req B(ma) + OUT(se) .req B(me) + OUT(si) .req B(si) + OUT(so) .req B(so) + OUT(su) .req B(su) + + OUTq(ba) .req TMP_INq(0) + OUTq(be) .req TMP_INq(5) + OUTq(bi) .req Bq(bi) + OUTq(bo) .req Bq(bo) + OUTq(bu) .req Bq(bu) + OUTq(ga) .req Bq(ba) + OUTq(ge) .req Bq(be) + OUTq(gi) .req Bq(gi) + OUTq(go) .req Bq(go) + OUTq(gu) .req Bq(gu) + OUTq(ka) .req Bq(ga) + OUTq(ke) .req Bq(ge) + OUTq(ki) .req Bq(ki) + OUTq(ko) .req Bq(ko) + OUTq(ku) .req Bq(ku) + OUTq(ma) .req Bq(ka) + OUTq(me) .req Bq(ke) + OUTq(mi) .req Bq(mi) + OUTq(mo) .req Bq(mo) + OUTq(mu) .req Bq(mu) + OUTq(sa) .req Bq(ma) + OUTq(se) .req Bq(me) + OUTq(si) .req Bq(si) + OUTq(so) .req Bq(so) + OUTq(su) .req Bq(su) + + TMP_OUT(0) .req B(sa) + TMP_OUT(1) .req B(se) + TMP_OUT(2) .req TMP_IN(1) + TMP_OUT(3) .req TMP_IN(2) + TMP_OUT(4) .req TMP_IN(3) + TMP_OUT(5) .req TMP_IN(4) + TMP_OUT(6) .req TMP_IN(6) + + TMP_OUTq(0) .req Bq(sa) + 
TMP_OUTq(1) .req Bq(se) + TMP_OUTq(2) .req TMP_INq(1) + TMP_OUTq(3) .req TMP_INq(2) + TMP_OUTq(4) .req TMP_INq(3) + TMP_OUTq(5) .req TMP_INq(4) + TMP_OUTq(6) .req TMP_INq(6) + + Cnext(0) .req TMP_OUT(0) + Cnext(1) .req TMP_OUT(1) + Cnext(2) .req TMP_OUT(2) + Cnext(3) .req TMP_OUT(3) + Cnext(4) .req TMP_OUT(4) + + Cnextq(0) .req TMP_OUTq(0) + Cnextq(1) .req TMP_OUTq(1) + Cnextq(2) .req TMP_OUTq(2) + Cnextq(3) .req TMP_OUTq(3) + Cnextq(4) .req TMP_OUTq(4) + + tmp .req v0 + .unreq tmp + tmp .req TMP_IN(6) +.endm + +.macro undeclare_mappings out, in + + .unreq C(0) + .unreq C(1) + .unreq C(2) + .unreq C(3) + .unreq C(4) + + .unreq Cq(0) + .unreq Cq(1) + .unreq Cq(2) + .unreq Cq(3) + .unreq Cq(4) + + .unreq E(2) + .unreq E(4) + .unreq E(1) + .unreq E(3) + .unreq E(0) + + .unreq Eq(2) + .unreq Eq(4) + .unreq Eq(1) + .unreq Eq(3) + .unreq Eq(0) + + .unreq B(go) + .unreq B(gi) + .unreq B(ga) + .unreq B(ge) + .unreq B(gu) + .unreq B(ki) + .unreq B(ko) + .unreq B(ka) + .unreq B(ke) + .unreq B(ku) + .unreq B(mu) + .unreq B(mo) + .unreq B(mi) + .unreq B(ma) + .unreq B(me) + .unreq B(ba) + .unreq B(bi) + .unreq B(bo) + .unreq B(bu) + .unreq B(be) + .unreq B(sa) + .unreq B(so) + .unreq B(se) + .unreq B(si) + .unreq B(su) + + .unreq Bq(go) + .unreq Bq(gi) + .unreq Bq(ga) + .unreq Bq(ge) + .unreq Bq(gu) + .unreq Bq(ki) + .unreq Bq(ko) + .unreq Bq(ka) + .unreq Bq(ke) + .unreq Bq(ku) + .unreq Bq(mu) + .unreq Bq(mo) + .unreq Bq(mi) + .unreq Bq(ma) + .unreq Bq(me) + .unreq Bq(ba) + .unreq Bq(bi) + .unreq Bq(bo) + .unreq Bq(bu) + .unreq Bq(be) + .unreq Bq(sa) + .unreq Bq(so) + .unreq Bq(se) + .unreq Bq(si) + .unreq Bq(su) + + .unreq OUT(ga) + .unreq OUT(ge) + .unreq OUT(gi) + .unreq OUT(go) + .unreq OUT(gu) + .unreq OUT(ka) + .unreq OUT(ke) + .unreq OUT(ki) + .unreq OUT(ko) + .unreq OUT(ku) + .unreq OUT(ma) + .unreq OUT(me) + .unreq OUT(mi) + .unreq OUT(mo) + .unreq OUT(mu) + .unreq OUT(ba) + .unreq OUT(be) + .unreq OUT(bi) + .unreq OUT(bo) + .unreq OUT(bu) + .unreq OUT(sa) + .unreq 
OUT(se) + .unreq OUT(si) + .unreq OUT(so) + .unreq OUT(su) + + .unreq OUTq(ga) + .unreq OUTq(ge) + .unreq OUTq(gi) + .unreq OUTq(go) + .unreq OUTq(gu) + .unreq OUTq(ka) + .unreq OUTq(ke) + .unreq OUTq(ki) + .unreq OUTq(ko) + .unreq OUTq(ku) + .unreq OUTq(ma) + .unreq OUTq(me) + .unreq OUTq(mi) + .unreq OUTq(mo) + .unreq OUTq(mu) + .unreq OUTq(ba) + .unreq OUTq(be) + .unreq OUTq(bi) + .unreq OUTq(bo) + .unreq OUTq(bu) + .unreq OUTq(sa) + .unreq OUTq(se) + .unreq OUTq(si) + .unreq OUTq(so) + .unreq OUTq(su) + + .unreq TMP_OUT(0) + .unreq TMP_OUT(1) + .unreq TMP_OUT(2) + .unreq TMP_OUT(3) + .unreq TMP_OUT(4) + .unreq TMP_OUT(5) + .unreq TMP_OUT(6) + + .unreq TMP_OUTq(0) + .unreq TMP_OUTq(1) + .unreq TMP_OUTq(2) + .unreq TMP_OUTq(3) + .unreq TMP_OUTq(4) + .unreq TMP_OUTq(5) + .unreq TMP_OUTq(6) + + .unreq tmp +.endm + +.macro keccak_f1600_round out, in + + eor3_m0 C(0), IN(ba), IN(ga), IN(ka) + eor3_m1 C(3), IN(bo), IN(go), IN(ko) + eor3_m0 C(2), IN(bi), IN(gi), IN(ki) + eor3_m1 C(1), IN(be), IN(ge), IN(ke) + eor3_m0 C(0), C(0), IN(ma), IN(sa) + eor3_m1 C(3), C(3), IN(mo), IN(so) + eor3_m0 C(2), C(2), IN(mi), IN(si) + eor3_m1 C(1), C(1), IN(me), IN(se) + eor3_m0 C(4), IN(bu), IN(gu), IN(ku) + rax1_m0 E(1), C(0), C(2) + xar_m1 B(mi), IN(ke), E(1), 54 + eor3_m0 C(4), C(4), IN(mu), IN(su) + xar_m1 B(go), IN(me), E(1), 19 + rax1_m0 E(3), C(2), C(4) + xar_m1 B(ka), IN(be), E(1), 63 + rax1_m0 E(0), C(4), C(1) + xar_m1 B(be), IN(ge), E(1), 20 + rax1_m0 E(2), C(1), C(3) + xar_m1 B(su), IN(se), E(1), 62 + rax1_m0 E(4), C(3), C(0) + + // TODO: * Interleave (fast) v8.4-A based 5-block with (slow) v8-A based 5-block, + // and then pull forward BCAX for the v8.4-A block + // * Handle XAR's for a fixed E(?) first, so that the remaining E(?)'s + // can be computed in parallel? 
+ + eor2 B(ba), IN(ba), E(0) + xar_m1 B(ga), IN(bo), E(3), 36 + xar_m0 B(bi), IN(ki), E(2), 21 + xar_m1 B(ge), IN(gu), E(4), 44 + xar_m0 B(bo), IN(mo), E(3), 43 + xar_m1 B(gi), IN(ka), E(0), 61 + xar_m0 B(bu), IN(su), E(4), 50 + xar_m1 B(gu), IN(si), E(2), 3 + + xar_m0 B(ke), IN(gi), E(2), 58 + xar_m0 B(ki), IN(ko), E(3), 39 + bcax_m1 OUT(ba), B(ba), B(bi), B(be) + bcax_m1 OUT(be), B(be), B(bo), B(bi) + xar_m0 B(ko), IN(mu), E(4), 56 + xar_m0 B(ku), IN(sa), E(0), 46 + bcax_m1 OUT(bi), B(bi), B(bu), B(bo) + bcax_m1 OUT(bo), B(bo), B(ba), B(bu) + + xar_m0 B(ma), IN(bu), E(4), 37 + xar_m0 B(me), IN(ga), E(0), 28 + bcax_m1 OUT(bu), B(bu), B(be), B(ba) + bcax_m1 OUT(ga), B(ga), B(gi), B(ge) + xar_m0 B(mo), IN(mi), E(2), 49 + xar_m0 B(mu), IN(so), E(3), 8 + bcax_m1 OUT(ge), B(ge), B(go), B(gi) + bcax_m1 OUT(gi), B(gi), B(gu), B(go) + + ld1r {tmp.2d}, [const_addr], #8 + eor OUT(ba).16b, OUT(ba).16b, tmp.16b + + xar_m0 B(sa), IN(bi), E(2), 2 + bcax_m1 OUT(go), B(go), B(ga), B(gu) + xar_m0 B(se), IN(go), E(3), 9 + bcax_m1 OUT(gu), B(gu), B(ge), B(ga) + bcax_m1 OUT(ka), B(ka), B(ki), B(ke) + xar_m0 B(si), IN(ku), E(4), 25 + bcax_m1 OUT(ke), B(ke), B(ko), B(ki) + bcax_m1 OUT(ki), B(ki), B(ku), B(ko) + xar_m0 B(so), IN(ma), E(0), 23 + bcax_m1 OUT(ko), B(ko), B(ka), B(ku) + bcax_m1 OUT(ku), B(ku), B(ke), B(ka) + + bcax_m0 OUT(ma), B(ma), B(mi), B(me) + bcax_m1 OUT(me), B(me), B(mo), B(mi) + bcax_m1 OUT(mi), B(mi), B(mu), B(mo) + bcax_m0 OUT(mo), B(mo), B(ma), B(mu) + bcax_m1 OUT(mu), B(mu), B(me), B(ma) + + bcax_m0 OUT(sa), B(sa), B(si), B(se) + bcax_m1 OUT(se), B(se), B(so), B(si) + bcax_m1 OUT(si), B(si), B(su), B(so) + bcax_m0 OUT(so), B(so), B(sa), B(su) + bcax_m1 OUT(su), B(su), B(se), B(sa) + +.endm + +.macro transfer_state out, in + + savep(INq(ga),ga) + savep(INq(ge),ge) + savep(INq(gi),gi) + savep(INq(go),go) + savep(INq(gu),gu) + savep(INq(ka),ka) + savep(INq(ke),ke) + savep(INq(ki),ki) + savep(INq(ko),ko) + savep(INq(ku),ku) + savep(INq(ma),ma) + savep(INq(me),me) + 
savep(INq(mi),mi) + savep(INq(mo),mo) + savep(INq(mu),mu) + savep(INq(ba),ba) + savep(INq(be),be) + savep(INq(bi),bi) + savep(INq(bo),bo) + savep(INq(bu),bu) + savep(INq(sa),sa) + savep(INq(se),se) + savep(INq(si),si) + savep(INq(so),so) + savep(INq(su),su) + + restorep(OUTq(ga),ga) + restorep(OUTq(ge),ge) + restorep(OUTq(gi),gi) + restorep(OUTq(go),go) + restorep(OUTq(gu),gu) + restorep(OUTq(ka),ka) + restorep(OUTq(ke),ke) + restorep(OUTq(ki),ki) + restorep(OUTq(ko),ko) + restorep(OUTq(ku),ku) + restorep(OUTq(ma),ma) + restorep(OUTq(me),me) + restorep(OUTq(mi),mi) + restorep(OUTq(mo),mo) + restorep(OUTq(mu),mu) + restorep(OUTq(ba),ba) + restorep(OUTq(be),be) + restorep(OUTq(bi),bi) + restorep(OUTq(bo),bo) + restorep(OUTq(bu),bu) + restorep(OUTq(sa),sa) + restorep(OUTq(se),se) + restorep(OUTq(si),si) + restorep(OUTq(so),so) + restorep(OUTq(su),su) + +.endm + +.text +.align 4 +.global keccak_f1600_x2_hybrid_asm_v2p1 +.global _keccak_f1600_x2_hybrid_asm_v2p1 + +#define KECCAK_F1600_ROUNDS 24 + +keccak_f1600_x2_hybrid_asm_v2p1: +_keccak_f1600_x2_hybrid_asm_v2p1: + alloc_stack + save_vregs + load_constant_ptr + load_input + + /* NOTE: Unrolling the whole loop isn't really practical, but for now + * this is just for the sake of understanding the theoretical performance + * uplift of the present approach. 
*/ + + declare_mappings A1, A + keccak_f1600_round A1, A + declare_mappings A2, A1 + keccak_f1600_round A2, A1 + declare_mappings A3, A2 + keccak_f1600_round A3, A2 + declare_mappings A4, A3 + keccak_f1600_round A4, A3 + declare_mappings A5, A4 + keccak_f1600_round A5, A4 + declare_mappings A6, A5 + keccak_f1600_round A6, A5 + declare_mappings A7, A6 + keccak_f1600_round A7, A6 + declare_mappings A8, A7 + keccak_f1600_round A8, A7 + + declare_mappings A9, A8 + keccak_f1600_round A9, A8 + declare_mappings A10, A9 + keccak_f1600_round A10, A9 + declare_mappings A11, A10 + keccak_f1600_round A11, A10 + declare_mappings A12, A11 + keccak_f1600_round A12, A11 + declare_mappings A13, A12 + keccak_f1600_round A13, A12 + declare_mappings A14, A13 + keccak_f1600_round A14, A13 + declare_mappings A15, A14 + keccak_f1600_round A15, A14 + declare_mappings A16, A15 + keccak_f1600_round A16, A15 + + declare_mappings A17, A16 + keccak_f1600_round A17, A16 + declare_mappings A18, A17 + keccak_f1600_round A18, A17 + declare_mappings A19, A18 + keccak_f1600_round A19, A18 + declare_mappings A20, A19 + keccak_f1600_round A20, A19 + declare_mappings A21, A20 + keccak_f1600_round A21, A20 + declare_mappings A22, A21 + keccak_f1600_round A22, A21 + declare_mappings A23, A22 + keccak_f1600_round A23, A22 + declare_mappings A24, A23 + keccak_f1600_round A24, A23 + + store_input A24 + restore_vregs + free_stack + ret + +#endif \ No newline at end of file diff --git a/asm/manual/keccak_f1600/keccak_f1600_x2_hybrid_asm_v2p2.s b/asm/manual/keccak_f1600/keccak_f1600_x2_hybrid_asm_v2p2.s new file mode 100644 index 0000000..fa64540 --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x2_hybrid_asm_v2p2.s @@ -0,0 +1,971 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the 
"Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +#if defined(__ARM_FEATURE_SHA3) + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + .quad 0x0 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* 
Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. */ + ASba .req v0 + ASbe .req v1 + ASbi .req v2 + ASbo .req v3 + ASbu .req v4 + ASga .req v5 + ASge .req v6 + ASgi .req v7 + ASgo .req v8 + ASgu .req v9 + ASka .req v10 + ASke .req v11 + ASki .req v12 + ASko .req v13 + ASku .req v14 + ASma .req v15 + ASme .req v16 + ASmi .req v17 + ASmo .req v18 + ASmu .req v19 + ASsa .req v20 + ASse .req v21 + ASsi .req v22 + ASso .req v23 + ASsu .req v24 + + /* q-form of the above mapping */ + ASbaq .req q0 + ASbeq .req q1 + ASbiq .req q2 + ASboq .req q3 + ASbuq .req q4 + ASgaq .req q5 + ASgeq .req q6 + ASgiq .req q7 + ASgoq .req q8 + ASguq .req q9 + ASkaq .req q10 + ASkeq .req q11 + ASkiq .req q12 + ASkoq .req q13 + ASkuq .req q14 + ASmaq .req q15 + ASmeq .req q16 + ASmiq .req q17 + ASmoq .req q18 + ASmuq .req q19 + ASsaq .req q20 + ASseq .req q21 + ASsiq .req q22 + ASsoq .req q23 + ASsuq .req q24 + + Ascratch0 .req v25 + Ascratch1 .req v26 + Ascratch2 .req v27 + Ascratch3 .req v28 + Ascratch4 .req v29 + Ascratch5 .req v30 + Ascratch6 .req v31 + + Ascratch0q .req q25 + Ascratch1q .req q26 + Ascratch2q .req q27 + Ascratch3q .req q28 + Ascratch4q .req q29 + Ascratch5q .req q30 + Ascratch6q .req q31 + +/************************ MACROS ****************************/ + +.macro load_input + ldp ASbaq, ASbeq, [input_addr, #(2*8*0)] + ldp ASbiq, ASboq, [input_addr, #(2*8*2)] + ldp ASbuq, ASgaq, [input_addr, #(2*8*4)] + ldp ASgeq, ASgiq, [input_addr, #(2*8*6)] + ldp ASgoq, ASguq, [input_addr, #(2*8*8)] + ldp ASkaq, ASkeq, [input_addr, #(2*8*10)] + ldp ASkiq, ASkoq, [input_addr, #(2*8*12)] + ldp ASkuq, ASmaq, [input_addr, #(2*8*14)] + ldp ASmeq, ASmiq, [input_addr, #(2*8*16)] + ldp ASmoq, ASmuq, [input_addr, #(2*8*18)] + ldp ASsaq, ASseq, [input_addr, #(2*8*20)] + ldp ASsiq, ASsoq, [input_addr, #(2*8*22)] + ldr ASsuq, [input_addr, #(2*8*24)] +.endm + +.macro store_input in + str \in\()Sbaq, [input_addr, #(2*8*0)] + str \in\()Sbeq, [input_addr,
#(2*8*1)] + str \in\()Sbiq, [input_addr, #(2*8*2)] + str \in\()Sboq, [input_addr, #(2*8*3)] + str \in\()Sbuq, [input_addr, #(2*8*4)] + str \in\()Sgaq, [input_addr, #(2*8*5)] + str \in\()Sgeq, [input_addr, #(2*8*6)] + str \in\()Sgiq, [input_addr, #(2*8*7)] + str \in\()Sgoq, [input_addr, #(2*8*8)] + str \in\()Sguq, [input_addr, #(2*8*9)] + str \in\()Skaq, [input_addr, #(2*8*10)] + str \in\()Skeq, [input_addr, #(2*8*11)] + str \in\()Skiq, [input_addr, #(2*8*12)] + str \in\()Skoq, [input_addr, #(2*8*13)] + str \in\()Skuq, [input_addr, #(2*8*14)] + str \in\()Smaq, [input_addr, #(2*8*15)] + str \in\()Smeq, [input_addr, #(2*8*16)] + str \in\()Smiq, [input_addr, #(2*8*17)] + str \in\()Smoq, [input_addr, #(2*8*18)] + str \in\()Smuq, [input_addr, #(2*8*19)] + str \in\()Ssaq, [input_addr, #(2*8*20)] + str \in\()Sseq, [input_addr, #(2*8*21)] + str \in\()Ssiq, [input_addr, #(2*8*22)] + str \in\()Ssoq, [input_addr, #(2*8*23)] + str \in\()Ssuq, [input_addr, #(2*8*24)] +.endm + +#define STACK_SIZE (16*4 + 16*30) +#define STACK_BASE_VREGS 0 +#define STACK_BASE_TMP 16*4 + +#define E0_offset 0 +#define E1_offset 1 +#define E2_offset 2 +#define E3_offset 3 +#define E4_offset 4 + +#define Aba_offset (5 + 0 ) +#define Abe_offset (5 + 1 ) +#define Abi_offset (5 + 2 ) +#define Abo_offset (5 + 3 ) +#define Abu_offset (5 + 4 ) +#define Aga_offset (5 + 5 ) +#define Age_offset (5 + 6 ) +#define Agi_offset (5 + 7 ) +#define Ago_offset (5 + 8 ) +#define Agu_offset (5 + 9 ) +#define Aka_offset (5 + 10 ) +#define Ake_offset (5 + 11 ) +#define Aki_offset (5 + 12 ) +#define Ako_offset (5 + 13 ) +#define Aku_offset (5 + 14 ) +#define Ama_offset (5 + 15 ) +#define Ame_offset (5 + 16 ) +#define Ami_offset (5 + 17 ) +#define Amo_offset (5 + 18 ) +#define Amu_offset (5 + 19 ) +#define Asa_offset (5 + 20 ) +#define Ase_offset (5 + 21 ) +#define Asi_offset (5 + 22 ) +#define Aso_offset (5 + 23 ) +#define Asu_offset (5 + 24 ) + +#define ba_offset (5 + 0 ) +#define be_offset (5 + 1 ) +#define bi_offset (5 + 
2 ) +#define bo_offset (5 + 3 ) +#define bu_offset (5 + 4 ) +#define ga_offset (5 + 5 ) +#define ge_offset (5 + 6 ) +#define gi_offset (5 + 7 ) +#define go_offset (5 + 8 ) +#define gu_offset (5 + 9 ) +#define ka_offset (5 + 10 ) +#define ke_offset (5 + 11 ) +#define ki_offset (5 + 12 ) +#define ko_offset (5 + 13 ) +#define ku_offset (5 + 14 ) +#define ma_offset (5 + 15 ) +#define me_offset (5 + 16 ) +#define mi_offset (5 + 17 ) +#define mo_offset (5 + 18 ) +#define mu_offset (5 + 19 ) +#define sa_offset (5 + 20 ) +#define se_offset (5 + 21 ) +#define si_offset (5 + 22 ) +#define so_offset (5 + 23 ) +#define su_offset (5 + 24 ) + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +#define savep(reg, offset_prefix) \ + str reg, [sp, #(STACK_BASE_TMP + 16 * offset_prefix ## _offset)] +#define restorep(reg, offset_prefix) \ + ldr reg, [sp, #(STACK_BASE_TMP + 16 * offset_prefix ## _offset)] +#define save(name) savep(name ## q,name) +#define restore(name) restorep(name ## q,name) + +.macro save_vregs /* spill d8-d15 (callee-saved under AAPCS64) at STACK_BASE_VREGS; v8-v15 hold state lanes */ + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +/* Keccak building blocks. The _m0 macros use the v8.4-A SHA-3 instructions (eor3, rax1, xar, bcax); eor2 and the _m1 macros use only plain Advanced SIMD instructions, and rax1_m1, xar_m1 and bcax_m1 clobber tmp. */ + +.macro eor3_m1_0 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m1_1 d s0 s1 s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro eor3_m1 d s0 s1 s2 + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +.macro rax1_m1 d s0 s1 + add tmp.2d, \s1\().2d, \s1\().2d + sri tmp.2d, \s1\().2d, #63 + eor \d\().16b, tmp.16b, \s0\().16b
+.endm + +.macro xar_m1 d s0 s1 imm + eor tmp.16b, \s0\().16b, \s1\().16b + shl \d\().2d, tmp.2d, #(64-\imm) + sri \d\().2d, tmp.2d, #(\imm) +.endm + +.macro bcax_m1 d s0 s1 s2 + bic tmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm + xar \d\().2d, \s0\().2d, \s1\().2d, #\imm +.endm + +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +#define CONCAT5(a,b,c,d,e) a ## b ## c ## d ## e +#define CONCAT4(a,b,c,d) a ## b ## c ## d + +#define OUT(x) \out\()S##x +#define IN(x) \in\()S##x +#define B(x) \in\()B##x +#define E(x) \in\()E##x +#define C(x) \in\()C##x +#define Cnext(x) \out\()C##x +#define TMP_IN(x) \in\()scratch ## x +#define TMP_OUT(x) \out\()scratch ## x + +#define OUTq(x) \out\()S##x##q +#define INq(x) \in\()S##x##q +#define Bq(x) \in\()B##x##q +#define Eq(x) \in\()E##x##q +#define Cq(x) \in\()C##x##q +#define Cnextq(x) \out\()C##x##q +#define TMP_INq(x) \in\()scratch ## x ## q +#define TMP_OUTq(x) \out\()scratch ## x ## q + +.macro declare_mappings out, in + + C(0) .req TMP_IN(0) + C(1) .req TMP_IN(1) + C(2) .req TMP_IN(2) + C(3) .req TMP_IN(3) + C(4) .req TMP_IN(4) + + Cq(0) .req TMP_INq(0) + Cq(1) .req TMP_INq(1) + Cq(2) .req TMP_INq(2) + Cq(3) .req TMP_INq(3) + Cq(4) .req TMP_INq(4) + + E(1) .req TMP_IN(5) + E(3) .req C(2) + E(0) .req C(4) + E(2) .req C(1) + E(4) .req C(3) + + Eq(1) .req TMP_INq(5) + Eq(3) .req Cq(2) + Eq(0) .req Cq(4) + Eq(2) .req Cq(1) + Eq(4) .req Cq(3) + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + B(go) .req IN(me) + B(gi) .req IN(ka) + B(ga) .req IN(bo) + B(ge) .req IN(gu) + B(gu) .req IN(si) + B(ki) .req IN(ko) + B(ko) .req IN(mu) + B(ka) .req IN(be) + B(ke) .req IN(gi) + B(ku) .req IN(sa) + B(mu) .req IN(so) + B(mo) .req IN(mi) + B(mi) .req IN(ke) + B(ma) .req IN(bu) + B(me) .req IN(ga) + 
B(ba) .req IN(ba) + B(bi) .req IN(ki) + B(bo) .req IN(mo) + B(bu) .req IN(su) + B(be) .req IN(ge) + B(sa) .req IN(bi) + B(so) .req IN(ma) + B(se) .req IN(go) + B(si) .req IN(ku) + B(su) .req IN(se) + + Bq(go) .req INq(me) + Bq(gi) .req INq(ka) + Bq(ga) .req INq(bo) + Bq(ge) .req INq(gu) + Bq(gu) .req INq(si) + Bq(ki) .req INq(ko) + Bq(ko) .req INq(mu) + Bq(ka) .req INq(be) + Bq(ke) .req INq(gi) + Bq(ku) .req INq(sa) + Bq(mu) .req INq(so) + Bq(mo) .req INq(mi) + Bq(mi) .req INq(ke) + Bq(ma) .req INq(bu) + Bq(me) .req INq(ga) + Bq(ba) .req INq(ba) + Bq(bi) .req INq(ki) + Bq(bo) .req INq(mo) + Bq(bu) .req INq(su) + Bq(be) .req INq(ge) + Bq(sa) .req INq(bi) + Bq(so) .req INq(ma) + Bq(se) .req INq(go) + Bq(si) .req INq(ku) + Bq(su) .req INq(se) + + OUT(ba) .req TMP_IN(0) + OUT(be) .req TMP_IN(5) + OUT(bi) .req B(bi) + OUT(bo) .req B(bo) + OUT(bu) .req B(bu) + OUT(ga) .req B(ba) + OUT(ge) .req B(be) + OUT(gi) .req B(gi) + OUT(go) .req B(go) + OUT(gu) .req B(gu) + OUT(ka) .req B(ga) + OUT(ke) .req B(ge) + OUT(ki) .req B(ki) + OUT(ko) .req B(ko) + OUT(ku) .req B(ku) + OUT(ma) .req B(ka) + OUT(me) .req B(ke) + OUT(mi) .req B(mi) + OUT(mo) .req B(mo) + OUT(mu) .req B(mu) + OUT(sa) .req B(ma) + OUT(se) .req B(me) + OUT(si) .req B(si) + OUT(so) .req B(so) + OUT(su) .req B(su) + + OUTq(ba) .req TMP_INq(0) + OUTq(be) .req TMP_INq(5) + OUTq(bi) .req Bq(bi) + OUTq(bo) .req Bq(bo) + OUTq(bu) .req Bq(bu) + OUTq(ga) .req Bq(ba) + OUTq(ge) .req Bq(be) + OUTq(gi) .req Bq(gi) + OUTq(go) .req Bq(go) + OUTq(gu) .req Bq(gu) + OUTq(ka) .req Bq(ga) + OUTq(ke) .req Bq(ge) + OUTq(ki) .req Bq(ki) + OUTq(ko) .req Bq(ko) + OUTq(ku) .req Bq(ku) + OUTq(ma) .req Bq(ka) + OUTq(me) .req Bq(ke) + OUTq(mi) .req Bq(mi) + OUTq(mo) .req Bq(mo) + OUTq(mu) .req Bq(mu) + OUTq(sa) .req Bq(ma) + OUTq(se) .req Bq(me) + OUTq(si) .req Bq(si) + OUTq(so) .req Bq(so) + OUTq(su) .req Bq(su) + + TMP_OUT(0) .req TMP_IN(1) + TMP_OUT(1) .req TMP_IN(2) + TMP_OUT(2) .req TMP_IN(3) + TMP_OUT(3) .req TMP_IN(4) + TMP_OUT(4) 
.req B(sa) + TMP_OUT(5) .req B(se) + TMP_OUT(6) .req TMP_IN(6) + + TMP_OUTq(0) .req TMP_INq(1) + TMP_OUTq(1) .req TMP_INq(2) + TMP_OUTq(2) .req TMP_INq(3) + TMP_OUTq(3) .req TMP_INq(4) + TMP_OUTq(4) .req Bq(sa) + TMP_OUTq(5) .req Bq(se) + TMP_OUTq(6) .req TMP_INq(6) + + Cnext(0) .req TMP_OUT(0) + Cnext(1) .req TMP_OUT(1) + Cnext(2) .req TMP_OUT(2) + Cnext(3) .req TMP_OUT(3) + Cnext(4) .req TMP_OUT(4) + + Cnextq(0) .req TMP_OUTq(0) + Cnextq(1) .req TMP_OUTq(1) + Cnextq(2) .req TMP_OUTq(2) + Cnextq(3) .req TMP_OUTq(3) + Cnextq(4) .req TMP_OUTq(4) + + tmp .req v0 + .unreq tmp + tmp .req TMP_IN(6) +.endm + +.macro undeclare_mappings out, in + + .unreq C(0) + .unreq C(1) + .unreq C(2) + .unreq C(3) + .unreq C(4) + + .unreq Cq(0) + .unreq Cq(1) + .unreq Cq(2) + .unreq Cq(3) + .unreq Cq(4) + + .unreq E(2) + .unreq E(4) + .unreq E(1) + .unreq E(3) + .unreq E(0) + + .unreq Eq(2) + .unreq Eq(4) + .unreq Eq(1) + .unreq Eq(3) + .unreq Eq(0) + + .unreq B(go) + .unreq B(gi) + .unreq B(ga) + .unreq B(ge) + .unreq B(gu) + .unreq B(ki) + .unreq B(ko) + .unreq B(ka) + .unreq B(ke) + .unreq B(ku) + .unreq B(mu) + .unreq B(mo) + .unreq B(mi) + .unreq B(ma) + .unreq B(me) + .unreq B(ba) + .unreq B(bi) + .unreq B(bo) + .unreq B(bu) + .unreq B(be) + .unreq B(sa) + .unreq B(so) + .unreq B(se) + .unreq B(si) + .unreq B(su) + + .unreq Bq(go) + .unreq Bq(gi) + .unreq Bq(ga) + .unreq Bq(ge) + .unreq Bq(gu) + .unreq Bq(ki) + .unreq Bq(ko) + .unreq Bq(ka) + .unreq Bq(ke) + .unreq Bq(ku) + .unreq Bq(mu) + .unreq Bq(mo) + .unreq Bq(mi) + .unreq Bq(ma) + .unreq Bq(me) + .unreq Bq(ba) + .unreq Bq(bi) + .unreq Bq(bo) + .unreq Bq(bu) + .unreq Bq(be) + .unreq Bq(sa) + .unreq Bq(so) + .unreq Bq(se) + .unreq Bq(si) + .unreq Bq(su) + + .unreq OUT(ga) + .unreq OUT(ge) + .unreq OUT(gi) + .unreq OUT(go) + .unreq OUT(gu) + .unreq OUT(ka) + .unreq OUT(ke) + .unreq OUT(ki) + .unreq OUT(ko) + .unreq OUT(ku) + .unreq OUT(ma) + .unreq OUT(me) + .unreq OUT(mi) + .unreq OUT(mo) + .unreq OUT(mu) + .unreq OUT(ba) + 
.unreq OUT(be) + .unreq OUT(bi) + .unreq OUT(bo) + .unreq OUT(bu) + .unreq OUT(sa) + .unreq OUT(se) + .unreq OUT(si) + .unreq OUT(so) + .unreq OUT(su) + + .unreq OUTq(ga) + .unreq OUTq(ge) + .unreq OUTq(gi) + .unreq OUTq(go) + .unreq OUTq(gu) + .unreq OUTq(ka) + .unreq OUTq(ke) + .unreq OUTq(ki) + .unreq OUTq(ko) + .unreq OUTq(ku) + .unreq OUTq(ma) + .unreq OUTq(me) + .unreq OUTq(mi) + .unreq OUTq(mo) + .unreq OUTq(mu) + .unreq OUTq(ba) + .unreq OUTq(be) + .unreq OUTq(bi) + .unreq OUTq(bo) + .unreq OUTq(bu) + .unreq OUTq(sa) + .unreq OUTq(se) + .unreq OUTq(si) + .unreq OUTq(so) + .unreq OUTq(su) + + .unreq TMP_OUT(0) + .unreq TMP_OUT(1) + .unreq TMP_OUT(2) + .unreq TMP_OUT(3) + .unreq TMP_OUT(4) + .unreq TMP_OUT(5) + .unreq TMP_OUT(6) + + .unreq TMP_OUTq(0) + .unreq TMP_OUTq(1) + .unreq TMP_OUTq(2) + .unreq TMP_OUTq(3) + .unreq TMP_OUTq(4) + .unreq TMP_OUTq(5) + .unreq TMP_OUTq(6) + + .unreq tmp +.endm + +.macro keccak_f1600_round_pre out, in + + eor3_m0 C(0), IN(ba), IN(ga), IN(ka) + eor3_m1 C(3), IN(bo), IN(go), IN(ko) + eor3_m0 C(2), IN(bi), IN(gi), IN(ki) + eor3_m1 C(1), IN(be), IN(ge), IN(ke) + eor3_m0 C(0), C(0), IN(ma), IN(sa) + eor3_m1 C(3), C(3), IN(mo), IN(so) + eor3_m0 C(2), C(2), IN(mi), IN(si) + eor3_m1 C(1), C(1), IN(me), IN(se) + eor3_m0 C(4), IN(bu), IN(gu), IN(ku) + +.endm + +.macro keccak_f1600_round_core out, in + + rax1_m0 E(1), C(0), C(2) + xar_m1 B(mi), IN(ke), E(1), 54 + eor3_m0 C(4), C(4), IN(mu), IN(su) + xar_m1 B(go), IN(me), E(1), 19 + rax1_m0 E(3), C(2), C(4) + xar_m1 B(ka), IN(be), E(1), 63 + rax1_m0 E(0), C(4), C(1) + xar_m1 B(be), IN(ge), E(1), 20 + rax1_m0 E(2), C(1), C(3) + xar_m1 B(su), IN(se), E(1), 62 + rax1_m0 E(4), C(3), C(0) + + // TODO: * Interleave (fast) v8.4-A based 5-block with (slow) v8-A based 5-block, + // and then pull forward BCAX for the v8.4-A block + // * Handle XAR's for a fixed E(?) first, so that the remaining E(?)'s + // can be computed in parallel? 
+ + eor2 B(ba), IN(ba), E(0) + xar_m1 B(ga), IN(bo), E(3), 36 + xar_m0 B(bi), IN(ki), E(2), 21 + xar_m1 B(ge), IN(gu), E(4), 44 + xar_m0 B(bo), IN(mo), E(3), 43 + xar_m1 B(gi), IN(ka), E(0), 61 + xar_m0 B(bu), IN(su), E(4), 50 + xar_m1 B(gu), IN(si), E(2), 3 + + xar_m0 B(ke), IN(gi), E(2), 58 + xar_m0 B(ki), IN(ko), E(3), 39 + bcax_m1 OUT(ba), B(ba), B(bi), B(be) + bcax_m1 OUT(be), B(be), B(bo), B(bi) + xar_m0 B(ko), IN(mu), E(4), 56 + xar_m0 B(ku), IN(sa), E(0), 46 + bcax_m1 OUT(bi), B(bi), B(bu), B(bo) + bcax_m1 OUT(bo), B(bo), B(ba), B(bu) + + xar_m0 B(ma), IN(bu), E(4), 37 + xar_m0 B(me), IN(ga), E(0), 28 + bcax_m1 OUT(bu), B(bu), B(be), B(ba) + bcax_m1 OUT(ga), B(ga), B(gi), B(ge) + xar_m0 B(mo), IN(mi), E(2), 49 + xar_m0 B(mu), IN(so), E(3), 8 + bcax_m1 OUT(ge), B(ge), B(go), B(gi) + bcax_m1 OUT(gi), B(gi), B(gu), B(go) + + ld1r {tmp.2d}, [const_addr], #8 + eor OUT(ba).16b, OUT(ba).16b, tmp.16b + + xar_m0 B(sa), IN(bi), E(2), 2 + bcax_m1 OUT(go), B(go), B(ga), B(gu) + xar_m0 B(se), IN(go), E(3), 9 + bcax_m1 OUT(gu), B(gu), B(ge), B(ga) + bcax_m1 OUT(ka), B(ka), B(ki), B(ke) + xar_m0 B(si), IN(ku), E(4), 25 + bcax_m1 OUT(ke), B(ke), B(ko), B(ki) + bcax_m1 OUT(ki), B(ki), B(ku), B(ko) + xar_m0 B(so), IN(ma), E(0), 23 + bcax_m1 OUT(ko), B(ko), B(ka), B(ku) + bcax_m1 OUT(ku), B(ku), B(ke), B(ka) + + bcax_m0 OUT(ma), B(ma), B(mi), B(me) + bcax_m1 OUT(me), B(me), B(mo), B(mi) + bcax_m1 OUT(mi), B(mi), B(mu), B(mo) + bcax_m0 OUT(mo), B(mo), B(ma), B(mu) + bcax_m1 OUT(mu), B(mu), B(me), B(ma) + + bcax_m0 OUT(sa), B(sa), B(si), B(se) + bcax_m1 OUT(se), B(se), B(so), B(si) + bcax_m1 OUT(si), B(si), B(su), B(so) + bcax_m0 OUT(so), B(so), B(sa), B(su) + bcax_m1 OUT(su), B(su), B(se), B(sa) + + eor3_m0 Cnext(0), OUT(ba), OUT(ga), OUT(ka) + eor3_m1 Cnext(3), OUT(bo), OUT(go), OUT(ko) + eor3_m0 Cnext(2), OUT(bi), OUT(gi), OUT(ki) + eor3_m1 Cnext(1), OUT(be), OUT(ge), OUT(ke) + + eor3_m0 Cnext(0), Cnext(0), OUT(ma), OUT(sa) + eor3_m1 Cnext(3), Cnext(3), OUT(mo), OUT(so) + 
eor3_m0 Cnext(2), Cnext(2), OUT(mi), OUT(si) + eor3_m1 Cnext(1), Cnext(1), OUT(me), OUT(se) + eor3_m0 Cnext(4), OUT(bu), OUT(gu), OUT(ku) + +.endm + +.macro keccak_f1600_round_last out, in + + rax1_m0 E(1), C(0), C(2) + xar_m1 B(mi), IN(ke), E(1), 54 + eor3_m0 C(4), C(4), IN(mu), IN(su) + xar_m1 B(go), IN(me), E(1), 19 + rax1_m0 E(3), C(2), C(4) + xar_m1 B(ka), IN(be), E(1), 63 + rax1_m0 E(0), C(4), C(1) + xar_m1 B(be), IN(ge), E(1), 20 + rax1_m0 E(2), C(1), C(3) + xar_m1 B(su), IN(se), E(1), 62 + rax1_m0 E(4), C(3), C(0) + + // TODO: * Interleave (fast) v8.4-A based 5-block with (slow) v8-A based 5-block, + // and then pull forward BCAX for the v8.4-A block + // * Handle XAR's for a fixed E(?) first, so that the remaining E(?)'s + // can be computed in parallel? + + eor2 B(ba), IN(ba), E(0) + xar_m1 B(ga), IN(bo), E(3), 36 + xar_m0 B(bi), IN(ki), E(2), 21 + xar_m1 B(ge), IN(gu), E(4), 44 + xar_m0 B(bo), IN(mo), E(3), 43 + xar_m1 B(gi), IN(ka), E(0), 61 + xar_m0 B(bu), IN(su), E(4), 50 + xar_m1 B(gu), IN(si), E(2), 3 + + xar_m0 B(ke), IN(gi), E(2), 58 + xar_m0 B(ki), IN(ko), E(3), 39 + bcax_m1 OUT(ba), B(ba), B(bi), B(be) + bcax_m1 OUT(be), B(be), B(bo), B(bi) + xar_m0 B(ko), IN(mu), E(4), 56 + xar_m0 B(ku), IN(sa), E(0), 46 + bcax_m1 OUT(bi), B(bi), B(bu), B(bo) + bcax_m1 OUT(bo), B(bo), B(ba), B(bu) + + xar_m0 B(ma), IN(bu), E(4), 37 + xar_m0 B(me), IN(ga), E(0), 28 + bcax_m1 OUT(bu), B(bu), B(be), B(ba) + bcax_m1 OUT(ga), B(ga), B(gi), B(ge) + xar_m0 B(mo), IN(mi), E(2), 49 + xar_m0 B(mu), IN(so), E(3), 8 + bcax_m1 OUT(ge), B(ge), B(go), B(gi) + bcax_m1 OUT(gi), B(gi), B(gu), B(go) + + ld1r {tmp.2d}, [const_addr], #8 + eor OUT(ba).16b, OUT(ba).16b, tmp.16b + + xar_m0 B(sa), IN(bi), E(2), 2 + bcax_m1 OUT(go), B(go), B(ga), B(gu) + xar_m0 B(se), IN(go), E(3), 9 + bcax_m1 OUT(gu), B(gu), B(ge), B(ga) + bcax_m1 OUT(ka), B(ka), B(ki), B(ke) + xar_m0 B(si), IN(ku), E(4), 25 + bcax_m1 OUT(ke), B(ke), B(ko), B(ki) + bcax_m1 OUT(ki), B(ki), B(ku), B(ko) + xar_m0 B(so), 
IN(ma), E(0), 23 + bcax_m1 OUT(ko), B(ko), B(ka), B(ku) + bcax_m1 OUT(ku), B(ku), B(ke), B(ka) + + bcax_m0 OUT(ma), B(ma), B(mi), B(me) + bcax_m1 OUT(me), B(me), B(mo), B(mi) + bcax_m1 OUT(mi), B(mi), B(mu), B(mo) + bcax_m0 OUT(mo), B(mo), B(ma), B(mu) + bcax_m1 OUT(mu), B(mu), B(me), B(ma) + + bcax_m0 OUT(sa), B(sa), B(si), B(se) + bcax_m1 OUT(se), B(se), B(so), B(si) + bcax_m1 OUT(si), B(si), B(su), B(so) + bcax_m0 OUT(so), B(so), B(sa), B(su) + bcax_m1 OUT(su), B(su), B(se), B(sa) +.endm + +.macro transfer_state out, in + + savep(INq(ga),ga) + savep(INq(ge),ge) + savep(INq(gi),gi) + savep(INq(go),go) + savep(INq(gu),gu) + savep(INq(ka),ka) + savep(INq(ke),ke) + savep(INq(ki),ki) + savep(INq(ko),ko) + savep(INq(ku),ku) + savep(INq(ma),ma) + savep(INq(me),me) + savep(INq(mi),mi) + savep(INq(mo),mo) + savep(INq(mu),mu) + savep(INq(ba),ba) + savep(INq(be),be) + savep(INq(bi),bi) + savep(INq(bo),bo) + savep(INq(bu),bu) + savep(INq(sa),sa) + savep(INq(se),se) + savep(INq(si),si) + savep(INq(so),so) + savep(INq(su),su) + + restorep(OUTq(ga),ga) + restorep(OUTq(ge),ge) + restorep(OUTq(gi),gi) + restorep(OUTq(go),go) + restorep(OUTq(gu),gu) + restorep(OUTq(ka),ka) + restorep(OUTq(ke),ke) + restorep(OUTq(ki),ki) + restorep(OUTq(ko),ko) + restorep(OUTq(ku),ku) + restorep(OUTq(ma),ma) + restorep(OUTq(me),me) + restorep(OUTq(mi),mi) + restorep(OUTq(mo),mo) + restorep(OUTq(mu),mu) + restorep(OUTq(ba),ba) + restorep(OUTq(be),be) + restorep(OUTq(bi),bi) + restorep(OUTq(bo),bo) + restorep(OUTq(bu),bu) + restorep(OUTq(sa),sa) + restorep(OUTq(se),se) + restorep(OUTq(si),si) + restorep(OUTq(so),so) + restorep(OUTq(su),su) + +.endm + +.text +.align 4 +.global keccak_f1600_x2_hybrid_asm_v2p2 +.global _keccak_f1600_x2_hybrid_asm_v2p2 + +#define KECCAK_F1600_ROUNDS 24 + +keccak_f1600_x2_hybrid_asm_v2p2: +_keccak_f1600_x2_hybrid_asm_v2p2: + alloc_stack + save_vregs + load_constant_ptr + load_input + + /* NOTE: Unrolling the whole loop isn't really practical, but for now + * this is just 
for the sake of understanding the theoretical performance + * uplift of the present approach. */ + + declare_mappings A1, A + keccak_f1600_round_pre A1, A + keccak_f1600_round_core A1, A + declare_mappings A2, A1 + keccak_f1600_round_core A2, A1 + declare_mappings A3, A2 + keccak_f1600_round_core A3, A2 + declare_mappings A4, A3 + keccak_f1600_round_core A4, A3 + declare_mappings A5, A4 + keccak_f1600_round_core A5, A4 + declare_mappings A6, A5 + keccak_f1600_round_core A6, A5 + declare_mappings A7, A6 + keccak_f1600_round_core A7, A6 + declare_mappings A8, A7 + keccak_f1600_round_core A8, A7 + + declare_mappings A9, A8 + keccak_f1600_round_core A9, A8 + declare_mappings A10, A9 + keccak_f1600_round_core A10, A9 + declare_mappings A11, A10 + keccak_f1600_round_core A11, A10 + declare_mappings A12, A11 + keccak_f1600_round_core A12, A11 + declare_mappings A13, A12 + keccak_f1600_round_core A13, A12 + declare_mappings A14, A13 + keccak_f1600_round_core A14, A13 + declare_mappings A15, A14 + keccak_f1600_round_core A15, A14 + declare_mappings A16, A15 + keccak_f1600_round_core A16, A15 + + declare_mappings A17, A16 + keccak_f1600_round_core A17, A16 + declare_mappings A18, A17 + keccak_f1600_round_core A18, A17 + declare_mappings A19, A18 + keccak_f1600_round_core A19, A18 + declare_mappings A20, A19 + keccak_f1600_round_core A20, A19 + declare_mappings A21, A20 + keccak_f1600_round_core A21, A20 + declare_mappings A22, A21 + keccak_f1600_round_core A22, A21 + declare_mappings A23, A22 + keccak_f1600_round_core A23, A22 + declare_mappings A24, A23 + keccak_f1600_round_last A24, A23 + + store_input A24 + restore_vregs + free_stack + ret + +#endif \ No newline at end of file diff --git a/asm/manual/keccak_f1600/keccak_f1600_x2_hybrid_asm_v2pp0.s b/asm/manual/keccak_f1600/keccak_f1600_x2_hybrid_asm_v2pp0.s new file mode 100644 index 0000000..517338e --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x2_hybrid_asm_v2pp0.s @@ -0,0 +1,804 @@ +/* + * Copyright (c) 
2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +#if defined(__ARM_FEATURE_SHA3) +/********************** CONSTANTS *************************/ + .data + .align(8) +_round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x800000008000000a + .quad 
0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. */ + Aba .req v0 + Abe .req v1 + Abi .req v2 + Abo .req v3 + Abu .req v4 + Aga .req v5 + Age .req v6 + Agi .req v7 + Ago .req v8 + Agu .req v9 + Aka .req v10 + Ake .req v11 + Aki .req v12 + Ako .req v13 + Aku .req v14 + Ama .req v15 + Ame .req v16 + Ami .req v17 + Amo .req v18 + Amu .req v19 + Asa .req v20 + Ase .req v21 + Asi .req v22 + Aso .req v23 + Asu .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Abiq .req q2 + Aboq .req q3 + Abuq .req q4 + Agaq .req q5 + Ageq .req q6 + Agiq .req q7 + Agoq .req q8 + Aguq .req q9 + Akaq .req q10 + Akeq .req q11 + Akiq .req q12 + Akoq .req q13 + Akuq .req q14 + Amaq .req q15 + Ameq .req q16 + Amiq .req q17 + Amoq .req q18 + Amuq .req q19 + Asaq .req q20 + Aseq .req q21 + Asiq .req q22 + Asoq .req q23 + Asuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v27 + C1 .req v28 + C2 .req v29 + C3 .req v30 + C4 .req v31 + + C0q .req q27 + C1q .req q28 + C2q .req q29 + C3q .req q30 + C4q .req q31 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vBba .req v25 // fresh + vBbe .req v26 // fresh + vBbi .req Abi + vBbo .req Abo + vBbu .req Abu + vBga .req Aka + vBge .req Ake + vBgi .req Agi + vBgo .req Ago + vBgu .req Agu + vBka .req Ama + vBke .req Ame + vBki .req Aki + vBko .req Ako + vBku .req Aku + vBma .req Asa + vBme .req Ase + vBmi .req Ami + vBmo .req Amo + vBmu .req Amu + vBsa .req Aba + vBse .req Abe + vBsi .req Asi + vBso .req Aso + vBsu .req Asu + + vBbaq .req q25 // fresh + vBbeq .req q26 // fresh + vBbiq 
.req Abiq + vBboq .req Aboq + vBbuq .req Abuq + vBgaq .req Akaq + vBgeq .req Akeq + vBgiq .req Agiq + vBgoq .req Agoq + vBguq .req Aguq + vBkaq .req Amaq + vBkeq .req Ameq + vBkiq .req Akiq + vBkoq .req Akoq + vBkuq .req Akuq + vBmaq .req Asaq + vBmeq .req Aseq + vBmiq .req Amiq + vBmoq .req Amoq + vBmuq .req Amuq + vBsaq .req Abaq + vBseq .req Abeq + vBsiq .req Asiq + vBsoq .req Asoq + vBsuq .req Asuq + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req C4 + E1 .req C0 + E2 .req vBbe // fresh + E3 .req C2 + E4 .req C3 + + E0q .req C4q + E1q .req C0q + E2q .req vBbeq // fresh + E3q .req C2q + E4q .req C3q + + +/************************ MACROS ****************************/ + +.macro load_input + ldp Abaq, Abeq, [input_addr, #(2*8*0)] + ldp Abiq, Aboq, [input_addr, #(2*8*2)] + ldp Abuq, Agaq, [input_addr, #(2*8*4)] + ldp Ageq, Agiq, [input_addr, #(2*8*6)] + ldp Agoq, Aguq, [input_addr, #(2*8*8)] + ldp Akaq, Akeq, [input_addr, #(2*8*10)] + ldp Akiq, Akoq, [input_addr, #(2*8*12)] + ldp Akuq, Amaq, [input_addr, #(2*8*14)] + ldp Ameq, Amiq, [input_addr, #(2*8*16)] + ldp Amoq, Amuq, [input_addr, #(2*8*18)] + ldp Asaq, Aseq, [input_addr, #(2*8*20)] + ldp Asiq, Asoq, [input_addr, #(2*8*22)] + ldr Asuq, [input_addr, #(2*8*24)] +.endm + +.macro store_input + str Abaq, [input_addr, #(2*8*0)] + str Abeq, [input_addr, #(2*8*1)] + str Abiq, [input_addr, #(2*8*2)] + str Aboq, [input_addr, #(2*8*3)] + str Abuq, [input_addr, #(2*8*4)] + str Agaq, [input_addr, #(2*8*5)] + str Ageq, [input_addr, #(2*8*6)] + str Agiq, [input_addr, #(2*8*7)] + str Agoq, [input_addr, #(2*8*8)] + str Aguq, [input_addr, #(2*8*9)] + str Akaq, [input_addr, #(2*8*10)] + str Akeq, [input_addr, #(2*8*11)] + str Akiq, [input_addr, #(2*8*12)] + str Akoq, [input_addr, #(2*8*13)] + str Akuq, [input_addr, #(2*8*14)] + str Amaq, [input_addr, #(2*8*15)] + str Ameq, [input_addr, #(2*8*16)] + str Amiq, [input_addr, #(2*8*17)] + str Amoq, [input_addr, #(2*8*18)] + str Amuq, [input_addr, #(2*8*19)] + str 
Asaq, [input_addr, #(2*8*20)] + str Aseq, [input_addr, #(2*8*21)] + str Asiq, [input_addr, #(2*8*22)] + str Asoq, [input_addr, #(2*8*23)] + str Asuq, [input_addr, #(2*8*24)] +.endm + +#define STACK_SIZE (16*4 + 16*34) +#define STACK_BASE_VREGS 0 +#define STACK_BASE_TMP 16*4 + +#define Aga_offset 0 +#define E0_offset 1 +#define E1_offset 2 +#define E2_offset 3 +#define E3_offset 4 +#define E4_offset 5 +#define Ame_offset 7 +#define Agi_offset 8 +#define Aka_offset 9 +#define Abo_offset 10 +#define Amo_offset 11 +#define Ami_offset 12 +#define Ake_offset 13 +#define Agu_offset 14 +#define Asi_offset 15 +#define Aku_offset 16 +#define Asa_offset 17 +#define Abu_offset 18 +#define Asu_offset 19 +#define Ase_offset 20 +//#define Aga_offset 21 +#define Age_offset 22 +#define vBgo_offset 23 +#define vBke_offset 24 +#define vBgi_offset 25 +#define vBga_offset 26 +#define vBbo_offset 27 +#define vBmo_offset 28 +#define vBmi_offset 29 +#define vBge_offset 30 + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm + xar \d\().2d, \s0\().2d, \s1\().2d, #\imm +.endm + +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, 
\s1\().16b, \s2\().16b +.endm + +.macro eor3_m1_0 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m1_1 d s0 s1 s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro eor3_m1 d s0 s1 s2 + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +.macro rax1_m1 d s0 s1 + // Use add instead of SHL #1 + add tmp.2d, \s1\().2d, \s1\().2d + sri tmp.2d, \s1\().2d, #63 + eor \d\().16b, tmp.16b, \s0\().16b +.endm + + .macro xar_m1 d s0 s1 imm + // Emulated XAR; overwrites s0 with s0 ^ s1. Special cases where we can replace SHLs by ADDs + .if \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + // .elseif \imm == 62 + // eor \s0\().16b, \s0\().16b, \s1\().16b + // add \d\().2d, \s0\().2d, \s0\().2d + // add \d\().2d, \d\().2d, \d\().2d + // sri \d\().2d, \s0\().2d, #(62) + // .elseif \imm == 61 + // eor \s0\().16b, \s0\().16b, \s1\().16b + // add \d\().2d, \s0\().2d, \s0\().2d + // add \d\().2d, \d\().2d, \d\().2d + // add \d\().2d, \d\().2d, \d\().2d + // sri \d\().2d, \s0\().2d, #(61) + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + + .macro xar_m1_0 d s0 s1 imm + // XOR half of xar_m1: overwrites s0 with s0 ^ s1 (all three branches emit the same EOR) + .if \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + .elseif \imm == 62 + eor \s0\().16b, \s0\().16b, \s1\().16b + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + .endif +.endm + + .macro xar_m1_1 d s0 s1 imm + // Rotate half of xar_m1: d = s0 rotated right by imm; special cases replace SHLs by ADDs + .if \imm == 63 + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + .elseif \imm == 62 + add \d\().2d, \s0\().2d, \s0\().2d + add \d\().2d, \d\().2d, \d\().2d + sri \d\().2d, \s0\().2d, #(62) + .else + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + +.macro bcax_m1 d s0 s1 s2 + bic tmp.16b, \s1\().16b, \s2\().16b 
+ eor \d\().16b, tmp.16b, \s0\().16b +.endm + +/* Keccak-f1600 round */ + +.macro keccak_f1600_round_pre + + /* 10 three-way XORs: 5 native EOR3 (eor3_m0) + 5 emulated with two EORs each (eor3_m1) */ + + eor3_m0 C1, Abe, Age, Ake + eor3_m1 C3, Abo, Ago, Ako + eor3_m0 C0, Aba, Aga, Aka + eor3_m1 C2, Abi, Agi, Aki + eor3_m0 C4, Abu, Agu, Aku + eor3_m1 C1, C1, Ame, Ase + eor3_m0 C3, C3, Amo, Aso + eor3_m1 C0, C0, Ama, Asa + eor3_m0 C2, C2, Ami, Asi + eor3_m1 C4, C4, Amu, Asu + +.endm + +.macro keccak_f1600_round + + /* 10 EOR3, so 20 individual EOR */ + + eor3_m1_0 C0, Aba, Aga, Aka + eor3_m1_0 C1, Abe, Age, Ake + eor3_m1_0 C2, Abi, Agi, Aki + eor3_m1_0 C3, Abo, Ago, Ako + eor3_m1_0 C4, Abu, Agu, Aku + eor3_m1_1 C0, Aba, Aga, Aka + eor3_m1_1 C1, Abe, Age, Ake + eor3_m1_1 C2, Abi, Agi, Aki + eor3_m1_1 C3, Abo, Ago, Ako + eor3_m1_1 C4, Abu, Agu, Aku + eor3_m1_0 C0, C0, Ama, Asa + eor3_m1_0 C1, C1, Ame, Ase + eor3_m1_0 C2, C2, Ami, Asi + eor3_m1_0 C3, C3, Amo, Aso + eor3_m1_0 C4, C4, Amu, Asu + eor3_m1_1 C0, C0, Ama, Asa + eor3_m1_1 C1, C1, Ame, Ase + eor3_m1_1 C2, C2, Ami, Asi + eor3_m1_1 C3, C3, Amo, Aso + eor3_m1_1 C4, C4, Amu, Asu + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + .unreq tmp + + /* 25x XAR, 75 in total */ + + tmp .req C1 + tmpq .req C1q + + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m1 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + xar_m1 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m1 vBme, Aga, E0, 28 + xar_m1 
vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Aga, vBga, vBgi, vBge + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + bcax_m1 Aku, vBku, vBke, vBka + bcax_m1 Ama, vBma, vBmi, vBme + bcax_m1 Ame, vBme, vBmo, vBmi + bcax_m1 Ami, vBmi, vBmu, vBmo + bcax_m1 Amo, vBmo, vBma, vBmu + bcax_m1 Amu, vBmu, vBme, vBma + bcax_m1 Asa, vBsa, vBsi, vBse + bcax_m1 Ase, vBse, vBso, vBsi + bcax_m1 Asi, vBsi, vBsu, vBso + bcax_m1 Aso, vBso, vBsa, vBsu + bcax_m1 Asu, vBsu, vBse, vBsa + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + bcax_m1 Abu, vBbu, vBbe, vBba + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + + .unreq tmp + .unreq tmpq + +.endm + +.macro keccak_f1600_round_core + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m0 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m0 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m0 E0, C4, C1 + + /* 25x XAR, 75 in total */ + + .unreq tmp + tmp .req C1 + tmpq .req C1q + + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m0 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m0 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m0 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m0 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m0 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m0 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m0 vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m0 vBge, Agu, E4, 44 + mov E3.16b, Aga.16b + bcax_m1 Aga, vBga, vBgi, vBge + xar_m0 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m0 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m0 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m0 vBme, E3, E0, 28 + xar_m1 
vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m0 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m0 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m0 Ake, vBke, vBko, vBki + + .unreq tmp + .unreq tmpq + + eor2 C0, Aka, Aga + save(Aga) + + tmp .req Aga + tmpq .req Agaq + bcax_m0 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + eor2 C1, Ake, Age + bcax_m0 Aku, vBku, vBke, vBka + eor2 C2, Aki, Agi + bcax_m1 Ama, vBma, vBmi, vBme + eor2 C3, Ako, Ago + bcax_m0 Ame, vBme, vBmo, vBmi + eor2 C4, Aku, Agu + bcax_m1 Ami, vBmi, vBmu, vBmo + eor2 C0, C0, Ama + bcax_m0 Amo, vBmo, vBma, vBmu + eor2 C1, C1, Ame + bcax_m1 Amu, vBmu, vBme, vBma + eor2 C2, C2, Ami + bcax_m0 Asa, vBsa, vBsi, vBse + eor2 C3, C3, Amo + bcax_m1 Ase, vBse, vBso, vBsi + eor2 C4, C4, Amu + bcax_m0 Asi, vBsi, vBsu, vBso + eor2 C0, C0, Asa + bcax_m1 Aso, vBso, vBsa, vBsu + eor2 C1, C1, Ase + bcax_m0 Asu, vBsu, vBse, vBsa + eor2 C2, C2, Asi + eor2 C3, C3, Aso + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m0 Abe, vBbe, vBbo, vBbi + eor2 C1, C1, Abe + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + eor2 C4, C4, Asu + bcax_m0 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + eor2 C3, C3, Abo + eor2 C2, C2, Abi + eor2 C0, C0, Aba + bcax_m0 Abu, vBbu, vBbe, vBba + eor2 C4, C4, Abu + + restore(Aga) + .unreq tmp + .unreq tmpq + +.endm + +.macro keccak_f1600_round_post + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m0 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m0 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m0 E0, C4, C1 + + /* 25x XAR, 75 in total */ + + .unreq tmp + tmp .req C1 + tmpq .req C1q + + eor vBba.16b, Aba.16b, E0.16b + xar_m0 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m0 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m0 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m0 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m0 vBgo, Ame, E1, 
19 + xar_m1 vBke, Agi, E2, 58 + xar_m0 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m0 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m0 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + mov E3.16b, Aga.16b + bcax_m1 Aga, vBga, vBgi, vBge + xar_m0 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m0 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m0 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m0 vBme, E3, E0, 28 + xar_m1 vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m0 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m0 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m0 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + bcax_m0 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + bcax_m0 Aku, vBku, vBke, vBka + bcax_m1 Ama, vBma, vBmi, vBme + bcax_m0 Ame, vBme, vBmo, vBmi + bcax_m1 Ami, vBmi, vBmu, vBmo + bcax_m0 Amo, vBmo, vBma, vBmu + bcax_m1 Amu, vBmu, vBme, vBma + bcax_m0 Asa, vBsa, vBsi, vBse + bcax_m1 Ase, vBse, vBso, vBsi + bcax_m0 Asi, vBsi, vBsu, vBso + bcax_m1 Aso, vBso, vBsa, vBsu + bcax_m0 Asu, vBsu, vBse, vBsa + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m0 Abe, vBbe, vBbo, vBbi + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m0 Abo, vBbo, vBba, vBbu + bcax_m1 Abu, vBbu, vBbe, vBba + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + + .unreq tmp + .unreq tmpq + +.endm + + +.text +.align 4 +.global keccak_f1600_x2_hybrid_asm_v2pp0 +.global _keccak_f1600_x2_hybrid_asm_v2pp0 + +#define KECCAK_F1600_ROUNDS 24 + +keccak_f1600_x2_hybrid_asm_v2pp0: +_keccak_f1600_x2_hybrid_asm_v2pp0: + alloc_stack + save_vregs + load_constant_ptr + load_input + + // count = (KECCAK_F1600_ROUNDS-2)/2: the loop runs two rounds per pass, two more rounds follow it + mov count, #11 + keccak_f1600_round_pre +loop: + keccak_f1600_round_core + keccak_f1600_round_core + sub count, count, #1 + cbnz count, loop + + keccak_f1600_round_core + keccak_f1600_round_post + store_input + restore_vregs + free_stack + ret + +#endif \ No newline
at end of file diff --git a/asm/manual/keccak_f1600/keccak_f1600_x2_hybrid_asm_v2pp1.s b/asm/manual/keccak_f1600/keccak_f1600_x2_hybrid_asm_v2pp1.s new file mode 100644 index 0000000..cac0bcd --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x2_hybrid_asm_v2pp1.s @@ -0,0 +1,805 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" +#if defined(__ARM_FEATURE_SHA3) + +/********************** CONSTANTS *************************/ + .data + .align(8) +_round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x800000008000000a + .quad 
0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. */ + Aba .req v0 + Abe .req v1 + Abi .req v2 + Abo .req v3 + Abu .req v4 + Aga .req v5 + Age .req v6 + Agi .req v7 + Ago .req v8 + Agu .req v9 + Aka .req v10 + Ake .req v11 + Aki .req v12 + Ako .req v13 + Aku .req v14 + Ama .req v15 + Ame .req v16 + Ami .req v17 + Amo .req v18 + Amu .req v19 + Asa .req v20 + Ase .req v21 + Asi .req v22 + Aso .req v23 + Asu .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Abiq .req q2 + Aboq .req q3 + Abuq .req q4 + Agaq .req q5 + Ageq .req q6 + Agiq .req q7 + Agoq .req q8 + Aguq .req q9 + Akaq .req q10 + Akeq .req q11 + Akiq .req q12 + Akoq .req q13 + Akuq .req q14 + Amaq .req q15 + Ameq .req q16 + Amiq .req q17 + Amoq .req q18 + Amuq .req q19 + Asaq .req q20 + Aseq .req q21 + Asiq .req q22 + Asoq .req q23 + Asuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v27 + C1 .req v28 + C2 .req v29 + C3 .req v30 + C4 .req v31 + + C0q .req q27 + C1q .req q28 + C2q .req q29 + C3q .req q30 + C4q .req q31 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vBba .req v25 // fresh + vBbe .req v26 // fresh + vBbi .req Abi + vBbo .req Abo + vBbu .req Abu + vBga .req Aka + vBge .req Ake + vBgi .req Agi + vBgo .req Ago + vBgu .req Agu + vBka .req Ama + vBke .req Ame + vBki .req Aki + vBko .req Ako + vBku .req Aku + vBma .req Asa + vBme .req Ase + vBmi .req Ami + vBmo .req Amo + vBmu .req Amu + vBsa .req Aba + vBse .req Abe + vBsi .req Asi + vBso .req Aso + vBsu .req Asu + + vBbaq .req q25 // fresh + vBbeq .req q26 // fresh + vBbiq
.req Abiq + vBboq .req Aboq + vBbuq .req Abuq + vBgaq .req Akaq + vBgeq .req Akeq + vBgiq .req Agiq + vBgoq .req Agoq + vBguq .req Aguq + vBkaq .req Amaq + vBkeq .req Ameq + vBkiq .req Akiq + vBkoq .req Akoq + vBkuq .req Akuq + vBmaq .req Asaq + vBmeq .req Aseq + vBmiq .req Amiq + vBmoq .req Amoq + vBmuq .req Amuq + vBsaq .req Abaq + vBseq .req Abeq + vBsiq .req Asiq + vBsoq .req Asoq + vBsuq .req Asuq + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req C4 + E1 .req C0 + E2 .req vBbe // fresh + E3 .req C2 + E4 .req C3 + + E0q .req C4q + E1q .req C0q + E2q .req vBbeq // fresh + E3q .req C2q + E4q .req C3q + + +/************************ MACROS ****************************/ + +.macro load_input + ldp Abaq, Abeq, [input_addr, #(2*8*0)] + ldp Abiq, Aboq, [input_addr, #(2*8*2)] + ldp Abuq, Agaq, [input_addr, #(2*8*4)] + ldp Ageq, Agiq, [input_addr, #(2*8*6)] + ldp Agoq, Aguq, [input_addr, #(2*8*8)] + ldp Akaq, Akeq, [input_addr, #(2*8*10)] + ldp Akiq, Akoq, [input_addr, #(2*8*12)] + ldp Akuq, Amaq, [input_addr, #(2*8*14)] + ldp Ameq, Amiq, [input_addr, #(2*8*16)] + ldp Amoq, Amuq, [input_addr, #(2*8*18)] + ldp Asaq, Aseq, [input_addr, #(2*8*20)] + ldp Asiq, Asoq, [input_addr, #(2*8*22)] + ldr Asuq, [input_addr, #(2*8*24)] +.endm + +.macro store_input + str Abaq, [input_addr, #(2*8*0)] + str Abeq, [input_addr, #(2*8*1)] + str Abiq, [input_addr, #(2*8*2)] + str Aboq, [input_addr, #(2*8*3)] + str Abuq, [input_addr, #(2*8*4)] + str Agaq, [input_addr, #(2*8*5)] + str Ageq, [input_addr, #(2*8*6)] + str Agiq, [input_addr, #(2*8*7)] + str Agoq, [input_addr, #(2*8*8)] + str Aguq, [input_addr, #(2*8*9)] + str Akaq, [input_addr, #(2*8*10)] + str Akeq, [input_addr, #(2*8*11)] + str Akiq, [input_addr, #(2*8*12)] + str Akoq, [input_addr, #(2*8*13)] + str Akuq, [input_addr, #(2*8*14)] + str Amaq, [input_addr, #(2*8*15)] + str Ameq, [input_addr, #(2*8*16)] + str Amiq, [input_addr, #(2*8*17)] + str Amoq, [input_addr, #(2*8*18)] + str Amuq, [input_addr, #(2*8*19)] + str 
Asaq, [input_addr, #(2*8*20)] + str Aseq, [input_addr, #(2*8*21)] + str Asiq, [input_addr, #(2*8*22)] + str Asoq, [input_addr, #(2*8*23)] + str Asuq, [input_addr, #(2*8*24)] +.endm + +#define STACK_SIZE (16*4 + 16*34) +#define STACK_BASE_VREGS 0 +#define STACK_BASE_TMP 16*4 + +#define Aga_offset 0 +#define E0_offset 1 +#define E1_offset 2 +#define E2_offset 3 +#define E3_offset 4 +#define E4_offset 5 +#define Ame_offset 7 +#define Agi_offset 8 +#define Aka_offset 9 +#define Abo_offset 10 +#define Amo_offset 11 +#define Ami_offset 12 +#define Ake_offset 13 +#define Agu_offset 14 +#define Asi_offset 15 +#define Aku_offset 16 +#define Asa_offset 17 +#define Abu_offset 18 +#define Asu_offset 19 +#define Ase_offset 20 +//#define Aga_offset 21 +#define Age_offset 22 +#define vBgo_offset 23 +#define vBke_offset 24 +#define vBgi_offset 25 +#define vBga_offset 26 +#define vBbo_offset 27 +#define vBmo_offset 28 +#define vBmi_offset 29 +#define vBge_offset 30 + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm + xar \d\().2d, \s0\().2d, \s1\().2d, #\imm +.endm + +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, 
\s1\().16b, \s2\().16b +.endm + +.macro eor3_m1_0 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m1_1 d s0 s1 s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro eor3_m1 d s0 s1 s2 + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +.macro rax1_m1 d s0 s1 + // Use add instead of SHL #1 + add tmp.2d, \s1\().2d, \s1\().2d + sri tmp.2d, \s1\().2d, #63 + eor \d\().16b, tmp.16b, \s0\().16b +.endm + + .macro xar_m1 d s0 s1 imm + // The XOR is done in place, so s0 is clobbered; for imm == 63 an ADD replaces the SHL + .if \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + // .elseif \imm == 62 + // eor \s0\().16b, \s0\().16b, \s1\().16b + // add \d\().2d, \s0\().2d, \s0\().2d + // add \d\().2d, \d\().2d, \d\().2d + // sri \d\().2d, \s0\().2d, #(62) + // .elseif \imm == 61 + // eor \s0\().16b, \s0\().16b, \s1\().16b + // add \d\().2d, \s0\().2d, \s0\().2d + // add \d\().2d, \d\().2d, \d\().2d + // add \d\().2d, \d\().2d, \d\().2d + // sri \d\().2d, \s0\().2d, #(61) + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + + .macro xar_m1_0 d s0 s1 imm + // XOR half of xar_m1, done in place in s0; every branch emits the same EOR + .if \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + .elseif \imm == 62 + eor \s0\().16b, \s0\().16b, \s1\().16b + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + .endif +.endm + + .macro xar_m1_1 d s0 s1 imm + // Rotate half of xar_m1 (s0 rotated right by imm into d); ADDs replace SHLs for 63 and 62 + .if \imm == 63 + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + .elseif \imm == 62 + add \d\().2d, \s0\().2d, \s0\().2d + add \d\().2d, \d\().2d, \d\().2d + sri \d\().2d, \s0\().2d, #(62) + .else + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + +.macro bcax_m1 d s0 s1 s2 + bic tmp.16b, \s1\().16b, \s2\().16b
+ eor \d\().16b, tmp.16b, \s0\().16b +.endm + +/* Keccak-f1600 round */ + +.macro keccak_f1600_round_pre + + /* 10 EOR3, so 20 individual EOR */ + + eor3_m0 C1, Abe, Age, Ake + eor3_m1 C3, Abo, Ago, Ako + eor3_m0 C0, Aba, Aga, Aka + eor3_m1 C2, Abi, Agi, Aki + eor3_m0 C4, Abu, Agu, Aku + eor3_m1 C1, C1, Ame, Ase + eor3_m0 C3, C3, Amo, Aso + eor3_m1 C0, C0, Ama, Asa + eor3_m0 C2, C2, Ami, Asi + eor3_m1 C4, C4, Amu, Asu + +.endm + +.macro keccak_f1600_round + + /* 10 EOR3, so 20 individual EOR */ + + eor3_m1_0 C0, Aba, Aga, Aka + eor3_m1_0 C1, Abe, Age, Ake + eor3_m1_0 C2, Abi, Agi, Aki + eor3_m1_0 C3, Abo, Ago, Ako + eor3_m1_0 C4, Abu, Agu, Aku + eor3_m1_1 C0, Aba, Aga, Aka + eor3_m1_1 C1, Abe, Age, Ake + eor3_m1_1 C2, Abi, Agi, Aki + eor3_m1_1 C3, Abo, Ago, Ako + eor3_m1_1 C4, Abu, Agu, Aku + eor3_m1_0 C0, C0, Ama, Asa + eor3_m1_0 C1, C1, Ame, Ase + eor3_m1_0 C2, C2, Ami, Asi + eor3_m1_0 C3, C3, Amo, Aso + eor3_m1_0 C4, C4, Amu, Asu + eor3_m1_1 C0, C0, Ama, Asa + eor3_m1_1 C1, C1, Ame, Ase + eor3_m1_1 C2, C2, Ami, Asi + eor3_m1_1 C3, C3, Amo, Aso + eor3_m1_1 C4, C4, Amu, Asu + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + .unreq tmp + + /* 25x XAR, 75 in total */ + + tmp .req C1 + tmpq .req C1q + + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m1 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + xar_m1 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m1 vBme, Aga, E0, 28 + xar_m1 
vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Aga, vBga, vBgi, vBge + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + bcax_m1 Aku, vBku, vBke, vBka + bcax_m1 Ama, vBma, vBmi, vBme + bcax_m1 Ame, vBme, vBmo, vBmi + bcax_m1 Ami, vBmi, vBmu, vBmo + bcax_m1 Amo, vBmo, vBma, vBmu + bcax_m1 Amu, vBmu, vBme, vBma + bcax_m1 Asa, vBsa, vBsi, vBse + bcax_m1 Ase, vBse, vBso, vBsi + bcax_m1 Asi, vBsi, vBsu, vBso + bcax_m1 Aso, vBso, vBsa, vBsu + bcax_m1 Asu, vBsu, vBse, vBsa + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + bcax_m1 Abu, vBbu, vBbe, vBba + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + + .unreq tmp + .unreq tmpq + +.endm + +.macro keccak_f1600_round_core + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m0 E2, C1, C3 + rax1_m0 E4, C3, C0 + rax1_m0 E1, C0, C2 + rax1_m0 E3, C2, C4 + rax1_m0 E0, C4, C1 + + /* 25x XAR, 75 in total */ + + .unreq tmp + tmp .req C1 + tmpq .req C1q + + eor vBba.16b, Aba.16b, E0.16b + xar_m0 vBsa, Abi, E2, 2 + xar_m0 vBbi, Aki, E2, 21 + xar_m0 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m0 vBmu, Aso, E3, 8 + xar_m0 vBso, Ama, E0, 23 + xar_m0 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m0 vBgo, Ame, E1, 19 + xar_m0 vBke, Agi, E2, 58 + xar_m0 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m0 vBbo, Amo, E3, 43 + xar_m0 vBmo, Ami, E2, 49 + xar_m0 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + mov E3.16b, Aga.16b + bcax_m0 Aga, vBga, vBgi, vBge + xar_m0 vBgu, Asi, E2, 3 + xar_m0 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m0 vBma, Abu, E4, 37 + xar_m0 vBbu, Asu, E4, 50 + xar_m0 vBsu, Ase, E1, 62 + xar_m1 vBme, E3, E0, 28 + xar_m0 
vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m0 Age, vBge, vBgo, vBgi + bcax_m0 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m0 Agu, vBgu, vBge, vBga + bcax_m0 Aka, vBka, vBki, vBke + bcax_m0 Ake, vBke, vBko, vBki + + .unreq tmp + .unreq tmpq + + eor2 C0, Aka, Aga + save(Aga) + + tmp .req Aga + tmpq .req Agaq + bcax_m0 Aki, vBki, vBku, vBko + bcax_m0 Ako, vBko, vBka, vBku + eor2 C1, Ake, Age + bcax_m0 Aku, vBku, vBke, vBka + eor2 C2, Aki, Agi + bcax_m0 Ama, vBma, vBmi, vBme + eor2 C3, Ako, Ago + bcax_m0 Ame, vBme, vBmo, vBmi + eor2 C4, Aku, Agu + bcax_m0 Ami, vBmi, vBmu, vBmo + eor2 C0, C0, Ama + bcax_m0 Amo, vBmo, vBma, vBmu + eor2 C1, C1, Ame + bcax_m0 Amu, vBmu, vBme, vBma + eor2 C2, C2, Ami + bcax_m0 Asa, vBsa, vBsi, vBse + eor2 C3, C3, Amo + bcax_m0 Ase, vBse, vBso, vBsi + eor2 C4, C4, Amu + bcax_m0 Asi, vBsi, vBsu, vBso + eor2 C0, C0, Asa + bcax_m0 Aso, vBso, vBsa, vBsu + eor2 C1, C1, Ase + bcax_m0 Asu, vBsu, vBse, vBsa + eor2 C2, C2, Asi + eor2 C3, C3, Aso + bcax_m0 Aba, vBba, vBbi, vBbe + bcax_m0 Abe, vBbe, vBbo, vBbi + eor2 C1, C1, Abe + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + eor2 C4, C4, Asu + bcax_m0 Abi, vBbi, vBbu, vBbo + bcax_m0 Abo, vBbo, vBba, vBbu + eor2 C3, C3, Abo + eor2 C2, C2, Abi + eor2 C0, C0, Aba + bcax_m0 Abu, vBbu, vBbe, vBba + eor2 C4, C4, Abu + + restore(Aga) + .unreq tmp + .unreq tmpq + +.endm + +.macro keccak_f1600_round_post + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m0 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m0 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m0 E0, C4, C1 + + /* 25x XAR, 75 in total */ + + .unreq tmp + tmp .req C1 + tmpq .req C1q + + eor vBba.16b, Aba.16b, E0.16b + xar_m0 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m0 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m0 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m0 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m0 vBgo, Ame, E1, 
19 + xar_m1 vBke, Agi, E2, 58 + xar_m0 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m0 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m0 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + mov E3.16b, Aga.16b + bcax_m1 Aga, vBga, vBgi, vBge + xar_m0 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m0 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m0 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m0 vBme, E3, E0, 28 + xar_m1 vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m0 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m0 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m0 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + bcax_m0 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + bcax_m0 Aku, vBku, vBke, vBka + bcax_m1 Ama, vBma, vBmi, vBme + bcax_m0 Ame, vBme, vBmo, vBmi + bcax_m1 Ami, vBmi, vBmu, vBmo + bcax_m0 Amo, vBmo, vBma, vBmu + bcax_m1 Amu, vBmu, vBme, vBma + bcax_m0 Asa, vBsa, vBsi, vBse + bcax_m1 Ase, vBse, vBso, vBsi + bcax_m0 Asi, vBsi, vBsu, vBso + bcax_m1 Aso, vBso, vBsa, vBsu + bcax_m0 Asu, vBsu, vBse, vBsa + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m0 Abe, vBbe, vBbo, vBbi + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m0 Abo, vBbo, vBba, vBbu + bcax_m1 Abu, vBbu, vBbe, vBba + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + + .unreq tmp + .unreq tmpq + +.endm + + +.text +.align 4 +.global keccak_f1600_x2_hybrid_asm_v2pp1 +.global _keccak_f1600_x2_hybrid_asm_v2pp1 + +#define KECCAK_F1600_ROUNDS 24 + +keccak_f1600_x2_hybrid_asm_v2pp1: +_keccak_f1600_x2_hybrid_asm_v2pp1: + alloc_stack + save_vregs + load_constant_ptr + load_input + + // count = (KECCAK_F1600_ROUNDS-2)/2: the loop runs two rounds per pass, two more rounds follow it + mov count, #11 + keccak_f1600_round_pre +loop: + keccak_f1600_round_core + keccak_f1600_round_core + sub count, count, #1 + cbnz count, loop + + keccak_f1600_round_core + keccak_f1600_round_post + store_input + restore_vregs + free_stack + ret + + +#endif /* SHA3 */
diff --git a/asm/manual/keccak_f1600/keccak_f1600_x2_hybrid_asm_v2pp2.s b/asm/manual/keccak_f1600/keccak_f1600_x2_hybrid_asm_v2pp2.s new file mode 100644 index 0000000..1c22182 --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x2_hybrid_asm_v2pp2.s @@ -0,0 +1,804 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +#if defined(__ARM_FEATURE_SHA3) +/********************** CONSTANTS *************************/ + .data + .align(8) +_round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x800000008000000a + .quad 
0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. */ + Aba .req v0 + Abe .req v1 + Abi .req v2 + Abo .req v3 + Abu .req v4 + Aga .req v5 + Age .req v6 + Agi .req v7 + Ago .req v8 + Agu .req v9 + Aka .req v10 + Ake .req v11 + Aki .req v12 + Ako .req v13 + Aku .req v14 + Ama .req v15 + Ame .req v16 + Ami .req v17 + Amo .req v18 + Amu .req v19 + Asa .req v20 + Ase .req v21 + Asi .req v22 + Aso .req v23 + Asu .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Abiq .req q2 + Aboq .req q3 + Abuq .req q4 + Agaq .req q5 + Ageq .req q6 + Agiq .req q7 + Agoq .req q8 + Aguq .req q9 + Akaq .req q10 + Akeq .req q11 + Akiq .req q12 + Akoq .req q13 + Akuq .req q14 + Amaq .req q15 + Ameq .req q16 + Amiq .req q17 + Amoq .req q18 + Amuq .req q19 + Asaq .req q20 + Aseq .req q21 + Asiq .req q22 + Asoq .req q23 + Asuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v27 + C1 .req v28 + C2 .req v29 + C3 .req v30 + C4 .req v31 + + C0q .req q27 + C1q .req q28 + C2q .req q29 + C3q .req q30 + C4q .req q31 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vBba .req v25 // fresh + vBbe .req v26 // fresh + vBbi .req Abi + vBbo .req Abo + vBbu .req Abu + vBga .req Aka + vBge .req Ake + vBgi .req Agi + vBgo .req Ago + vBgu .req Agu + vBka .req Ama + vBke .req Ame + vBki .req Aki + vBko .req Ako + vBku .req Aku + vBma .req Asa + vBme .req Ase + vBmi .req Ami + vBmo .req Amo + vBmu .req Amu + vBsa .req Aba + vBse .req Abe + vBsi .req Asi + vBso .req Aso + vBsu .req Asu + + vBbaq .req q25 // fresh + vBbeq .req q26 // fresh + vBbiq
.req Abiq + vBboq .req Aboq + vBbuq .req Abuq + vBgaq .req Akaq + vBgeq .req Akeq + vBgiq .req Agiq + vBgoq .req Agoq + vBguq .req Aguq + vBkaq .req Amaq + vBkeq .req Ameq + vBkiq .req Akiq + vBkoq .req Akoq + vBkuq .req Akuq + vBmaq .req Asaq + vBmeq .req Aseq + vBmiq .req Amiq + vBmoq .req Amoq + vBmuq .req Amuq + vBsaq .req Abaq + vBseq .req Abeq + vBsiq .req Asiq + vBsoq .req Asoq + vBsuq .req Asuq + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req C4 + E1 .req C0 + E2 .req vBbe // fresh + E3 .req C2 + E4 .req C3 + + E0q .req C4q + E1q .req C0q + E2q .req vBbeq // fresh + E3q .req C2q + E4q .req C3q + + +/************************ MACROS ****************************/ + +.macro load_input + ldp Abaq, Abeq, [input_addr, #(2*8*0)] + ldp Abiq, Aboq, [input_addr, #(2*8*2)] + ldp Abuq, Agaq, [input_addr, #(2*8*4)] + ldp Ageq, Agiq, [input_addr, #(2*8*6)] + ldp Agoq, Aguq, [input_addr, #(2*8*8)] + ldp Akaq, Akeq, [input_addr, #(2*8*10)] + ldp Akiq, Akoq, [input_addr, #(2*8*12)] + ldp Akuq, Amaq, [input_addr, #(2*8*14)] + ldp Ameq, Amiq, [input_addr, #(2*8*16)] + ldp Amoq, Amuq, [input_addr, #(2*8*18)] + ldp Asaq, Aseq, [input_addr, #(2*8*20)] + ldp Asiq, Asoq, [input_addr, #(2*8*22)] + ldr Asuq, [input_addr, #(2*8*24)] +.endm + +.macro store_input + str Abaq, [input_addr, #(2*8*0)] + str Abeq, [input_addr, #(2*8*1)] + str Abiq, [input_addr, #(2*8*2)] + str Aboq, [input_addr, #(2*8*3)] + str Abuq, [input_addr, #(2*8*4)] + str Agaq, [input_addr, #(2*8*5)] + str Ageq, [input_addr, #(2*8*6)] + str Agiq, [input_addr, #(2*8*7)] + str Agoq, [input_addr, #(2*8*8)] + str Aguq, [input_addr, #(2*8*9)] + str Akaq, [input_addr, #(2*8*10)] + str Akeq, [input_addr, #(2*8*11)] + str Akiq, [input_addr, #(2*8*12)] + str Akoq, [input_addr, #(2*8*13)] + str Akuq, [input_addr, #(2*8*14)] + str Amaq, [input_addr, #(2*8*15)] + str Ameq, [input_addr, #(2*8*16)] + str Amiq, [input_addr, #(2*8*17)] + str Amoq, [input_addr, #(2*8*18)] + str Amuq, [input_addr, #(2*8*19)] + str 
Asaq, [input_addr, #(2*8*20)] + str Aseq, [input_addr, #(2*8*21)] + str Asiq, [input_addr, #(2*8*22)] + str Asoq, [input_addr, #(2*8*23)] + str Asuq, [input_addr, #(2*8*24)] +.endm + +#define STACK_SIZE (16*4 + 16*34) +#define STACK_BASE_VREGS 0 +#define STACK_BASE_TMP 16*4 + +#define Aga_offset 0 +#define E0_offset 1 +#define E1_offset 2 +#define E2_offset 3 +#define E3_offset 4 +#define E4_offset 5 +#define Ame_offset 7 +#define Agi_offset 8 +#define Aka_offset 9 +#define Abo_offset 10 +#define Amo_offset 11 +#define Ami_offset 12 +#define Ake_offset 13 +#define Agu_offset 14 +#define Asi_offset 15 +#define Aku_offset 16 +#define Asa_offset 17 +#define Abu_offset 18 +#define Asu_offset 19 +#define Ase_offset 20 +//#define Aga_offset 21 +#define Age_offset 22 +#define vBgo_offset 23 +#define vBke_offset 24 +#define vBgi_offset 25 +#define vBga_offset 26 +#define vBbo_offset 27 +#define vBmo_offset 28 +#define vBmi_offset 29 +#define vBge_offset 30 + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm + xar \d\().2d, \s0\().2d, \s1\().2d, #\imm +.endm + +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, 
\s1\().16b, \s2\().16b +.endm + +.macro eor3_m1_0 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m1_1 d s0 s1 s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro eor3_m1 d s0 s1 s2 + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +.macro rax1_m1 d s0 s1 + // Use add instead of SHL #1 + add tmp.2d, \s1\().2d, \s1\().2d + sri tmp.2d, \s1\().2d, #63 + eor \d\().16b, tmp.16b, \s0\().16b +.endm + + .macro xar_m1 d s0 s1 imm + // The XOR is done in place, so s0 is clobbered; for imm == 63 an ADD replaces the SHL + .if \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + // .elseif \imm == 62 + // eor \s0\().16b, \s0\().16b, \s1\().16b + // add \d\().2d, \s0\().2d, \s0\().2d + // add \d\().2d, \d\().2d, \d\().2d + // sri \d\().2d, \s0\().2d, #(62) + // .elseif \imm == 61 + // eor \s0\().16b, \s0\().16b, \s1\().16b + // add \d\().2d, \s0\().2d, \s0\().2d + // add \d\().2d, \d\().2d, \d\().2d + // add \d\().2d, \d\().2d, \d\().2d + // sri \d\().2d, \s0\().2d, #(61) + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + + .macro xar_m1_0 d s0 s1 imm + // XOR half of xar_m1, done in place in s0; every branch emits the same EOR + .if \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + .elseif \imm == 62 + eor \s0\().16b, \s0\().16b, \s1\().16b + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + .endif +.endm + + .macro xar_m1_1 d s0 s1 imm + // Rotate half of xar_m1 (s0 rotated right by imm into d); ADDs replace SHLs for 63 and 62 + .if \imm == 63 + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + .elseif \imm == 62 + add \d\().2d, \s0\().2d, \s0\().2d + add \d\().2d, \d\().2d, \d\().2d + sri \d\().2d, \s0\().2d, #(62) + .else + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + +.macro bcax_m1 d s0 s1 s2 + bic tmp.16b, \s1\().16b, \s2\().16b
+ eor \d\().16b, tmp.16b, \s0\().16b +.endm + +/* Keccak-f1600 round */ + +.macro keccak_f1600_round_pre + + /* 10 EOR3, so 20 individual EOR */ + + eor3_m0 C1, Abe, Age, Ake + eor3_m1 C3, Abo, Ago, Ako + eor3_m0 C0, Aba, Aga, Aka + eor3_m1 C2, Abi, Agi, Aki + eor3_m0 C4, Abu, Agu, Aku + eor3_m1 C1, C1, Ame, Ase + eor3_m0 C3, C3, Amo, Aso + eor3_m1 C0, C0, Ama, Asa + eor3_m0 C2, C2, Ami, Asi + eor3_m1 C4, C4, Amu, Asu + +.endm + +.macro keccak_f1600_round + + /* 10 EOR3, so 20 individual EOR */ + + eor3_m1_0 C0, Aba, Aga, Aka + eor3_m1_0 C1, Abe, Age, Ake + eor3_m1_0 C2, Abi, Agi, Aki + eor3_m1_0 C3, Abo, Ago, Ako + eor3_m1_0 C4, Abu, Agu, Aku + eor3_m1_1 C0, Aba, Aga, Aka + eor3_m1_1 C1, Abe, Age, Ake + eor3_m1_1 C2, Abi, Agi, Aki + eor3_m1_1 C3, Abo, Ago, Ako + eor3_m1_1 C4, Abu, Agu, Aku + eor3_m1_0 C0, C0, Ama, Asa + eor3_m1_0 C1, C1, Ame, Ase + eor3_m1_0 C2, C2, Ami, Asi + eor3_m1_0 C3, C3, Amo, Aso + eor3_m1_0 C4, C4, Amu, Asu + eor3_m1_1 C0, C0, Ama, Asa + eor3_m1_1 C1, C1, Ame, Ase + eor3_m1_1 C2, C2, Ami, Asi + eor3_m1_1 C3, C3, Amo, Aso + eor3_m1_1 C4, C4, Amu, Asu + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + .unreq tmp + + /* 25x XAR, 75 in total */ + + tmp .req C1 + tmpq .req C1q + + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m1 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + xar_m1 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m1 vBme, Aga, E0, 28 + xar_m1 
vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Aga, vBga, vBgi, vBge + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + bcax_m1 Aku, vBku, vBke, vBka + bcax_m1 Ama, vBma, vBmi, vBme + bcax_m1 Ame, vBme, vBmo, vBmi + bcax_m1 Ami, vBmi, vBmu, vBmo + bcax_m1 Amo, vBmo, vBma, vBmu + bcax_m1 Amu, vBmu, vBme, vBma + bcax_m1 Asa, vBsa, vBsi, vBse + bcax_m1 Ase, vBse, vBso, vBsi + bcax_m1 Asi, vBsi, vBsu, vBso + bcax_m1 Aso, vBso, vBsa, vBsu + bcax_m1 Asu, vBsu, vBse, vBsa + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + bcax_m1 Abu, vBbu, vBbe, vBba + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + + .unreq tmp + .unreq tmpq + +.endm + +.macro keccak_f1600_round_core + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m0 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m0 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m0 E0, C4, C1 + + /* 25x XAR, 75 in total */ + + .unreq tmp + tmp .req C1 + tmpq .req C1q + + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m0 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m0 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m0 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m0 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m0 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m0 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m0 vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m0 vBge, Agu, E4, 44 + mov E3.16b, Aga.16b + bcax_m1 Aga, vBga, vBgi, vBge + xar_m0 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m0 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m0 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m0 vBme, E3, E0, 28 + xar_m1 
vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m0 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m0 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m0 Ake, vBke, vBko, vBki + + .unreq tmp + .unreq tmpq + + eor2 C0, Aka, Aga + save(Aga) + + tmp .req Aga + tmpq .req Agaq + bcax_m0 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + eor2 C1, Ake, Age + bcax_m0 Aku, vBku, vBke, vBka + eor2 C2, Aki, Agi + bcax_m1 Ama, vBma, vBmi, vBme + eor2 C3, Ako, Ago + bcax_m0 Ame, vBme, vBmo, vBmi + eor2 C4, Aku, Agu + bcax_m1 Ami, vBmi, vBmu, vBmo + eor2 C0, C0, Ama + bcax_m0 Amo, vBmo, vBma, vBmu + eor2 C1, C1, Ame + bcax_m1 Amu, vBmu, vBme, vBma + eor2 C2, C2, Ami + bcax_m0 Asa, vBsa, vBsi, vBse + eor2 C3, C3, Amo + bcax_m1 Ase, vBse, vBso, vBsi + eor2 C4, C4, Amu + bcax_m0 Asi, vBsi, vBsu, vBso + eor2 C0, C0, Asa + bcax_m1 Aso, vBso, vBsa, vBsu + eor2 C1, C1, Ase + bcax_m0 Asu, vBsu, vBse, vBsa + eor2 C2, C2, Asi + eor2 C3, C3, Aso + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m0 Abe, vBbe, vBbo, vBbi + eor2 C1, C1, Abe + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + eor2 C4, C4, Asu + bcax_m0 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + eor2 C3, C3, Abo + eor2 C2, C2, Abi + eor2 C0, C0, Aba + bcax_m0 Abu, vBbu, vBbe, vBba + eor2 C4, C4, Abu + + restore(Aga) + .unreq tmp + .unreq tmpq + +.endm + +.macro keccak_f1600_round_post + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m0 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m0 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m0 E0, C4, C1 + + /* 25x XAR, 75 in total */ + + .unreq tmp + tmp .req C1 + tmpq .req C1q + + eor vBba.16b, Aba.16b, E0.16b + xar_m0 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m0 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m0 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m0 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m0 vBgo, Ame, E1, 
19 + xar_m1 vBke, Agi, E2, 58 + xar_m0 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m0 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m0 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + mov E3.16b, Aga.16b + bcax_m1 Aga, vBga, vBgi, vBge + xar_m0 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m0 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m0 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m0 vBme, E3, E0, 28 + xar_m1 vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m0 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m0 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m0 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + bcax_m0 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + bcax_m0 Aku, vBku, vBke, vBka + bcax_m1 Ama, vBma, vBmi, vBme + bcax_m0 Ame, vBme, vBmo, vBmi + bcax_m1 Ami, vBmi, vBmu, vBmo + bcax_m0 Amo, vBmo, vBma, vBmu + bcax_m1 Amu, vBmu, vBme, vBma + bcax_m0 Asa, vBsa, vBsi, vBse + bcax_m1 Ase, vBse, vBso, vBsi + bcax_m0 Asi, vBsi, vBsu, vBso + bcax_m1 Aso, vBso, vBsa, vBsu + bcax_m0 Asu, vBsu, vBse, vBsa + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m0 Abe, vBbe, vBbo, vBbi + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m0 Abo, vBbo, vBba, vBbu + bcax_m1 Abu, vBbu, vBbe, vBba + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + + .unreq tmp + .unreq tmpq + +.endm + + +.text +.align 4 +.global keccak_f1600_x2_hybrid_asm_v2pp2 +.global _keccak_f1600_x2_hybrid_asm_v2pp2 + +#define KECCAK_F1600_ROUNDS 24 + +keccak_f1600_x2_hybrid_asm_v2pp2: +_keccak_f1600_x2_hybrid_asm_v2pp2: + alloc_stack + save_vregs + load_constant_ptr + load_input + + //mov count, #(KECCAK_F1600_ROUNDS-2) + mov count, #11 + keccak_f1600_round_pre +loop: + keccak_f1600_round_core + keccak_f1600_round_core + sub count, count, #1 + cbnz count, loop + + keccak_f1600_round_core + keccak_f1600_round_post + store_input + restore_vregs + free_stack + ret + +#endif diff --git 
a/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v1.s b/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v1.s new file mode 100644 index 0000000..3f2635e --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v1.s @@ -0,0 +1,338 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +#if defined(__ARM_FEATURE_SHA3) + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round.
*/ + Aba .req v0 + Abe .req v1 + Abi .req v2 + Abo .req v3 + Abu .req v4 + Aga .req v5 + Age .req v6 + Agi .req v7 + Ago .req v8 + Agu .req v9 + Aka .req v10 + Ake .req v11 + Aki .req v12 + Ako .req v13 + Aku .req v14 + Ama .req v15 + Ame .req v16 + Ami .req v17 + Amo .req v18 + Amu .req v19 + Asa .req v20 + Ase .req v21 + Asi .req v22 + Aso .req v23 + Asu .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Abiq .req q2 + Aboq .req q3 + Abuq .req q4 + Agaq .req q5 + Ageq .req q6 + Agiq .req q7 + Agoq .req q8 + Aguq .req q9 + Akaq .req q10 + Akeq .req q11 + Akiq .req q12 + Akoq .req q13 + Akuq .req q14 + Amaq .req q15 + Ameq .req q16 + Amiq .req q17 + Amoq .req q18 + Amuq .req q19 + Asaq .req q20 + Aseq .req q21 + Asiq .req q22 + Asoq .req q23 + Asuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v30 + C1 .req v29 + C2 .req v28 + C3 .req v27 + C4 .req v26 + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req v26 + E1 .req v25 + E2 .req v29 + E3 .req v28 + E4 .req v27 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + Abi_ .req v2 + Abo_ .req v3 + Abu_ .req v4 + Aga_ .req v10 + Age_ .req v11 + Agi_ .req v7 + Ago_ .req v8 + Agu_ .req v9 + Aka_ .req v15 + Ake_ .req v16 + Aki_ .req v12 + Ako_ .req v13 + Aku_ .req v14 + Ama_ .req v20 + Ame_ .req v21 + Ami_ .req v17 + Amo_ .req v18 + Amu_ .req v19 + Asa_ .req v0 + Ase_ .req v1 + Asi_ .req v22 + Aso_ .req v23 + Asu_ .req v24 + Aba_ .req v30 + Abe_ .req v27 + +/************************ MACROS ****************************/ + +.macro load_input + ld1 {Aba.2d, Abe.2d, Abi.2d, Abo.2d}, [input_addr], #64 + ld1 {Abu.2d, Aga.2d, Age.2d, Agi.2d}, [input_addr], #64 + ld1 {Ago.2d, Agu.2d, Aka.2d, Ake.2d}, [input_addr], #64 + ld1 {Aki.2d, Ako.2d, Aku.2d, Ama.2d}, [input_addr], #64 + ld1 {Ame.2d, Ami.2d, Amo.2d, Amu.2d}, [input_addr], #64 + ld1 {Asa.2d, Ase.2d, Asi.2d, Aso.2d}, [input_addr], #64 + ld1 {Asu.2d}, [input_addr] + sub input_addr, input_addr, 
#(6*64) +.endm + +.macro store_input + st1 {Aba.2d, Abe.2d, Abi.2d, Abo.2d}, [input_addr], #64 + st1 {Abu.2d, Aga.2d, Age.2d, Agi.2d}, [input_addr], #64 + st1 {Ago.2d, Agu.2d, Aka.2d, Ake.2d}, [input_addr], #64 + st1 {Aki.2d, Ako.2d, Aku.2d, Ama.2d}, [input_addr], #64 + st1 {Ame.2d, Ami.2d, Amo.2d, Amu.2d}, [input_addr], #64 + st1 {Asa.2d, Ase.2d, Asi.2d, Aso.2d}, [input_addr], #64 + st1 {Asu.2d}, [input_addr] +.endm + +#define STACK_SIZE (16*4 + 16*6) // VREGS (16*4) + GPRS (TODO: Remove) + +#define STACK_BASE_GPRS (16*4) +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) + .endm + +.macro save_vregs + stp d8, d9, [sp, #(16*0)] + stp d10, d11, [sp, #(16*1)] + stp d12, d13, [sp, #(16*2)] + stp d14, d15, [sp, #(16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(16*0)] + ldp d10, d11, [sp, #(16*1)] + ldp d12, d13, [sp, #(16*2)] + ldp d14, d15, [sp, #(16*3)] +.endm + +/* Macros using v8.4-A SHA-3 instructions */ + +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm + xar \d\().2d, \s0\().2d, \s1\().2d, #\imm +.endm + +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +/* Keccak-f1600 round */ + +.macro keccak_f1600_round + + eor3_m0 C0, Aba, Aga, Aka + eor3_m0 C1, Abe, Age, Ake + eor3_m0 C2, Abi, Agi, Aki + eor3_m0 C3, Abo, Ago, Ako + eor3_m0 C4, Abu, Agu, Aku + eor3_m0 C0, C0, Ama, Asa + eor3_m0 C1, C1, Ame, Ase + eor3_m0 C2, C2, Ami, Asi + eor3_m0 C3, C3, Amo, Aso + eor3_m0 C4, C4, Amu, Asu + + rax1_m0 E1, C0, C2 + rax1_m0 E3, C2, C4 + rax1_m0 E0, C4, C1 + rax1_m0 E2, C1, C3 + rax1_m0 E4, C3, C0 + + eor Aba_.16b, Aba.16b, E0.16b + xar_m0 Asa_, Abi, E2, 2 + xar_m0 Abi_, Aki, E2, 21 + xar_m0 Aki_, Ako, E3, 39 + xar_m0 Ako_, Amu, E4, 56 + xar_m0 Amu_, Aso, E3, 8 + xar_m0 Aso_, Ama, E0, 23 + xar_m0 Aka_, Abe, E1, 63 + xar_m0 Ase_, Ago, E3, 
9 + xar_m0 Ago_, Ame, E1, 19 + xar_m0 Ake_, Agi, E2, 58 + xar_m0 Agi_, Aka, E0, 61 + xar_m0 Aga_, Abo, E3, 36 + xar_m0 Abo_, Amo, E3, 43 + xar_m0 Amo_, Ami, E2, 49 + xar_m0 Ami_, Ake, E1, 54 + xar_m0 Age_, Agu, E4, 44 + xar_m0 Agu_, Asi, E2, 3 + xar_m0 Asi_, Aku, E4, 25 + xar_m0 Aku_, Asa, E0, 46 + xar_m0 Ama_, Abu, E4, 37 + xar_m0 Abu_, Asu, E4, 50 + xar_m0 Asu_, Ase, E1, 62 + xar_m0 Ame_, Aga, E0, 28 + xar_m0 Abe_, Age, E1, 20 + + ld1r {v31.2d}, [const_addr], #8 + + bcax_m0 Aga, Aga_, Agi_, Age_ + bcax_m0 Age, Age_, Ago_, Agi_ + bcax_m0 Agi, Agi_, Agu_, Ago_ + bcax_m0 Ago, Ago_, Aga_, Agu_ + bcax_m0 Agu, Agu_, Age_, Aga_ + bcax_m0 Aka, Aka_, Aki_, Ake_ + bcax_m0 Ake, Ake_, Ako_, Aki_ + bcax_m0 Aki, Aki_, Aku_, Ako_ + bcax_m0 Ako, Ako_, Aka_, Aku_ + bcax_m0 Aku, Aku_, Ake_, Aka_ + bcax_m0 Ama, Ama_, Ami_, Ame_ + bcax_m0 Ame, Ame_, Amo_, Ami_ + bcax_m0 Ami, Ami_, Amu_, Amo_ + bcax_m0 Amo, Amo_, Ama_, Amu_ + bcax_m0 Amu, Amu_, Ame_, Ama_ + bcax_m0 Asa, Asa_, Asi_, Ase_ + bcax_m0 Ase, Ase_, Aso_, Asi_ + bcax_m0 Asi, Asi_, Asu_, Aso_ + bcax_m0 Aso, Aso_, Asa_, Asu_ + bcax_m0 Asu, Asu_, Ase_, Asa_ + bcax_m0 Aba, Aba_, Abi_, Abe_ + bcax_m0 Abe, Abe_, Abo_, Abi_ + bcax_m0 Abi, Abi_, Abu_, Abo_ + bcax_m0 Abo, Abo_, Aba_, Abu_ + bcax_m0 Abu, Abu_, Abe_, Aba_ + + // iota step + eor Aba.16b, Aba.16b, v31.16b + +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.text +.align 4 +.global keccak_f1600_x2_v84a_asm_v1 +.global _keccak_f1600_x2_v84a_asm_v1 + +keccak_f1600_x2_v84a_asm_v1: +_keccak_f1600_x2_v84a_asm_v1: + alloc_stack + save_vregs + load_constant_ptr + load_input + + mov count, #(KECCAK_F1600_ROUNDS) +loop: + keccak_f1600_round + sub count, count, #1 + cbnz count, loop + + store_input + restore_vregs + free_stack + ret + +#endif diff --git a/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v1p0.s b/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v1p0.s new file mode 100644 index 0000000..0287c70 --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v1p0.s @@ 
-0,0 +1,465 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +#if defined(__ARM_FEATURE_SHA3) + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round.
*/ + Aba .req v0 + Abe .req v1 + Abi .req v2 + Abo .req v3 + Abu .req v4 + Aga .req v5 + Age .req v6 + Agi .req v7 + Ago .req v8 + Agu .req v9 + Aka .req v10 + Ake .req v11 + Aki .req v12 + Ako .req v13 + Aku .req v14 + Ama .req v15 + Ame .req v16 + Ami .req v17 + Amo .req v18 + Amu .req v19 + Asa .req v20 + Ase .req v21 + Asi .req v22 + Aso .req v23 + Asu .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Abiq .req q2 + Aboq .req q3 + Abuq .req q4 + Agaq .req q5 + Ageq .req q6 + Agiq .req q7 + Agoq .req q8 + Aguq .req q9 + Akaq .req q10 + Akeq .req q11 + Akiq .req q12 + Akoq .req q13 + Akuq .req q14 + Amaq .req q15 + Ameq .req q16 + Amiq .req q17 + Amoq .req q18 + Amuq .req q19 + Asaq .req q20 + Aseq .req q21 + Asiq .req q22 + Asoq .req q23 + Asuq .req q24 + + Abaz .req z0 + Abez .req z1 + Abiz .req z2 + Aboz .req z3 + Abuz .req z4 + Agaz .req z5 + Agez .req z6 + Agiz .req z7 + Agoz .req z8 + Aguz .req z9 + Akaz .req z10 + Akez .req z11 + Akiz .req z12 + Akoz .req z13 + Akuz .req z14 + Amaz .req z15 + Amez .req z16 + Amiz .req z17 + Amoz .req z18 + Amuz .req z19 + Asaz .req z20 + Asez .req z21 + Asiz .req z22 + Asoz .req z23 + Asuz .req z24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v25 + C1 .req v26 + C2 .req v27 + C3 .req v28 + C4 .req v29 + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req C4 + E1 .req C0 + E2 .req C1 + E3 .req C2 + E4 .req C3 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + Abi_ .req v2 + Abo_ .req v3 + Abu_ .req v4 + Aga_ .req v10 + Age_ .req v11 + Agi_ .req v7 + Ago_ .req v8 + Agu_ .req v9 + Aka_ .req v15 + Ake_ .req v16 + Aki_ .req v12 + Ako_ .req v13 + Aku_ .req v14 + Ama_ .req v20 + Ame_ .req v21 + Ami_ .req v17 + Amo_ .req v18 + Amu_ .req v19 + Asa_ .req v0 + Ase_ .req v1 + Asi_ .req v22 + Aso_ .req v23 + Asu_ .req v24 + Aba_ .req v30 + Abe_ .req E0 + +/************************ MACROS ****************************/ + +.macro load_input + ld1 {Aba.2d, 
Abe.2d, Abi.2d, Abo.2d}, [input_addr], #64 + ld1 {Abu.2d, Aga.2d, Age.2d, Agi.2d}, [input_addr], #64 + ld1 {Ago.2d, Agu.2d, Aka.2d, Ake.2d}, [input_addr], #64 + ld1 {Aki.2d, Ako.2d, Aku.2d, Ama.2d}, [input_addr], #64 + ld1 {Ame.2d, Ami.2d, Amo.2d, Amu.2d}, [input_addr], #64 + ld1 {Asa.2d, Ase.2d, Asi.2d, Aso.2d}, [input_addr], #64 + ld1 {Asu.2d}, [input_addr] + sub input_addr, input_addr, #(6*64) +.endm + +.macro store_input + st1 {Aba.2d, Abe.2d, Abi.2d, Abo.2d}, [input_addr], #64 + st1 {Abu.2d, Aga.2d, Age.2d, Agi.2d}, [input_addr], #64 + st1 {Ago.2d, Agu.2d, Aka.2d, Ake.2d}, [input_addr], #64 + st1 {Aki.2d, Ako.2d, Aku.2d, Ama.2d}, [input_addr], #64 + st1 {Ame.2d, Ami.2d, Amo.2d, Amu.2d}, [input_addr], #64 + st1 {Asa.2d, Ase.2d, Asi.2d, Aso.2d}, [input_addr], #64 + st1 {Asu.2d}, [input_addr] +.endm + +// .macro load_input +// ldr Abaq, [input_addr, #(2*8*0)] +// ldr Abeq, [input_addr, #(2*8*1)] +// ldr Abiq, [input_addr, #(2*8*2)] +// ldr Aboq, [input_addr, #(2*8*3)] +// ldr Abuq, [input_addr, #(2*8*4)] +// ldr Agaq, [input_addr, #(2*8*5)] +// ldr Ageq, [input_addr, #(2*8*6)] +// ldr Agiq, [input_addr, #(2*8*7)] +// ldr Agoq, [input_addr, #(2*8*8)] +// ldr Aguq, [input_addr, #(2*8*9)] +// ldr Akaq, [input_addr, #(2*8*10)] +// ldr Akeq, [input_addr, #(2*8*11)] +// ldr Akiq, [input_addr, #(2*8*12)] +// ldr Akoq, [input_addr, #(2*8*13)] +// ldr Akuq, [input_addr, #(2*8*14)] +// ldr Amaq, [input_addr, #(2*8*15)] +// ldr Ameq, [input_addr, #(2*8*16)] +// ldr Amiq, [input_addr, #(2*8*17)] +// ldr Amoq, [input_addr, #(2*8*18)] +// ldr Amuq, [input_addr, #(2*8*19)] +// ldr Asaq, [input_addr, #(2*8*20)] +// ldr Aseq, [input_addr, #(2*8*21)] +// ldr Asiq, [input_addr, #(2*8*22)] +// ldr Asoq, [input_addr, #(2*8*23)] +// ldr Asuq, [input_addr, #(2*8*24)] +// .endm + +// .macro store_input +// str Abaq, [input_addr, #(2*8*0)] +// str Abeq, [input_addr, #(2*8*1)] +// str Abiq, [input_addr, #(2*8*2)] +// str Aboq, [input_addr, #(2*8*3)] +// str Abuq, [input_addr, #(2*8*4)] 
+// str Agaq, [input_addr, #(2*8*5)] +// str Ageq, [input_addr, #(2*8*6)] +// str Agiq, [input_addr, #(2*8*7)] +// str Agoq, [input_addr, #(2*8*8)] +// str Aguq, [input_addr, #(2*8*9)] +// str Akaq, [input_addr, #(2*8*10)] +// str Akeq, [input_addr, #(2*8*11)] +// str Akiq, [input_addr, #(2*8*12)] +// str Akoq, [input_addr, #(2*8*13)] +// str Akuq, [input_addr, #(2*8*14)] +// str Amaq, [input_addr, #(2*8*15)] +// str Ameq, [input_addr, #(2*8*16)] +// str Amiq, [input_addr, #(2*8*17)] +// str Amoq, [input_addr, #(2*8*18)] +// str Amuq, [input_addr, #(2*8*19)] +// str Asaq, [input_addr, #(2*8*20)] +// str Aseq, [input_addr, #(2*8*21)] +// str Asiq, [input_addr, #(2*8*22)] +// str Asoq, [input_addr, #(2*8*23)] +// str Asuq, [input_addr, #(2*8*24)] +// .endm + +#define STACK_SIZE (16*4 + 16*6 + 16*5) // VREGS (16*4) + GPRS (TODO: Remove) + +#define STACK_BASE_GPRS (16*4) +#define STACK_BASE_VTMP (16*4 + 16*6) + +#define save(name)\ + str name ## q, [sp, #(STACK_BASE_VTMP + 16*(name ## _offset))] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_VTMP + 16*(name ## _offset))] + +#define Aga_offset 0 +#define Age_offset 1 +#define Agi_offset 2 +#define Ago_offset 3 +#define Agu_offset 4 + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro save_vregs + stp d8, d9, [sp, #(16*0)] + stp d10, d11, [sp, #(16*1)] + stp d12, d13, [sp, #(16*2)] + stp d14, d15, [sp, #(16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(16*0)] + ldp d10, d11, [sp, #(16*1)] + ldp d12, d13, [sp, #(16*2)] + ldp d14, d15, [sp, #(16*3)] +.endm + +/* Macros using v8.4-A SHA-3 instructions */ + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm + xar \d\().2d, \s0\().2d, \s1\().2d, #\imm +.endm + +.macro rax1_m1 d s0 
s1 + xar_m0 tmp, vzr, \s1, 63 + eor \d\().16b, \s0\().16b, tmp.16b +.endm + +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro bcax_m2 d s0 s1 s2 + bcax \d\()z.d, \s0\()z.d, \s1\()z.d, \s2\()z.d +.endm + +/* Keccak-f1600 round */ + +.macro keccak_f1600_round + + eor3_m0 C2, Ami, Agi, Aki + eor3_m0 C0, Ama, Aga, Aka + eor3_m0 C1, Ame, Age, Ake + eor3_m0 C3, Amo, Ago, Ako + eor3_m0 C4, Asu, Agu, Aku + + vzr .req v31 + movi vzr.2d, #0 + + eor3_m0 C2, C2, Abi, Asi + save(Agi) SEP C1r .req Agi + eor3_m0 C0, C0, Aba, Asa + eor3_m0 C1, C1, Abe, Ase + save(Agu) SEP C3r .req Agu + eor3_m0 C3, C3, Abo, Aso + eor3_m0 C4, C4, Amu, Abu + + save(Ago) SEP C2r .req Ago + xar_m0 C1r, vzr, C1, 63 + xar_m0 C3r, vzr, C3, 63 + save(Aga) SEP C4r .req Aga + xar_m0 C2r, vzr, C2, 63 + xar_m0 C4r, vzr, C4, 63 + save(Age) SEP C0r .req Age + eor2 E0, C4, C1r + xar_m0 C0r, vzr, C0, 63 + eor2 E2, C1, C3r + eor2 E1, C0, C2r + restore(Agu) // C3r + eor2 E3, C2, C4r + eor2 E4, C3, C0r + restore(Ago) // C2r + restore(Agi) // C1r/Cor + + eor Aba_.16b, Aba.16b, E0.16b + xar_m0 Asa_, Abi, E2, 2 + restore(Aga) // C4r + xar_m0 Abi_, Aki, E2, 21 + xar_m0 Aki_, Ako, E3, 39 + restore(Age) // C0r + xar_m0 Ako_, Amu, E4, 56 + xar_m0 Amu_, Aso, E3, 8 + xar_m0 Aso_, Ama, E0, 23 + xar_m0 Aka_, Abe, E1, 63 + xar_m0 Ase_, Ago, E3, 9 + xar_m0 Ago_, Ame, E1, 19 + xar_m0 Ake_, Agi, E2, 58 + xar_m0 Agi_, Aka, E0, 61 + xar_m0 Aga_, Abo, E3, 36 + xar_m0 Abo_, Amo, E3, 43 + xar_m0 Amo_, Ami, E2, 49 + xar_m0 Ami_, Ake, E1, 54 + xar_m0 Age_, Agu, E4, 44 + xar_m0 Agu_, Asi, E2, 3 + xar_m0 Asi_, Aku, E4, 25 + xar_m0 Aku_, Asa, E0, 46 + xar_m0 Ama_, Abu, E4, 37 + xar_m0 Abu_, Asu, E4, 50 + xar_m0 Asu_, Ase, E1, 62 + xar_m0 Ame_, Aga, E0, 28 + xar_m0 Abe_, Age, E1, 20 + + ld1r {v31.2d}, [const_addr], #8 + + bcax_m0 Aga, Aga_, Agi_, Age_ + bcax_m0 Age, Age_, Ago_, Agi_ + bcax_m0 Agi, Agi_, Agu_, Ago_ + bcax_m0 Ago, Ago_, Aga_, Agu_ + bcax_m0 Agu, Agu_, Age_, Aga_ + bcax_m0 Aka, 
Aka_, Aki_, Ake_ + bcax_m0 Ake, Ake_, Ako_, Aki_ + bcax_m0 Aki, Aki_, Aku_, Ako_ + bcax_m0 Ako, Ako_, Aka_, Aku_ + bcax_m0 Aku, Aku_, Ake_, Aka_ + bcax_m0 Ama, Ama_, Ami_, Ame_ + bcax_m0 Ame, Ame_, Amo_, Ami_ + bcax_m0 Ami, Ami_, Amu_, Amo_ + bcax_m0 Amo, Amo_, Ama_, Amu_ + bcax_m0 Amu, Amu_, Ame_, Ama_ + bcax_m0 Asa, Asa_, Asi_, Ase_ + bcax_m0 Ase, Ase_, Aso_, Asi_ + bcax_m0 Asi, Asi_, Asu_, Aso_ + bcax_m0 Aso, Aso_, Asa_, Asu_ + bcax_m0 Asu, Asu_, Ase_, Asa_ + bcax_m0 Aba, Aba_, Abi_, Abe_ + bcax_m0 Abe, Abe_, Abo_, Abi_ + bcax_m0 Abi, Abi_, Abu_, Abo_ + bcax_m0 Abo, Abo_, Aba_, Abu_ + bcax_m0 Abu, Abu_, Abe_, Aba_ + + // iota step + eor Aba.16b, Aba.16b, v31.16b + +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.text +.align 4 +.global keccak_f1600_x2_v84a_asm_v1p0 +.global _keccak_f1600_x2_v84a_asm_v1p0 + +keccak_f1600_x2_v84a_asm_v1p0: +_keccak_f1600_x2_v84a_asm_v1p0: + alloc_stack + save_vregs + load_constant_ptr + load_input + + mov count, #(KECCAK_F1600_ROUNDS) +loop: + keccak_f1600_round + sub count, count, #1 + cbnz count, loop + + store_input + restore_vregs + free_stack + ret + +#endif diff --git a/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2.s b/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2.s new file mode 100644 index 0000000..698c257 --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2.s @@ -0,0 +1,375 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this 
permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round.
*/ + Aba .req v0 + Abe .req v1 + Abi .req v2 + Abo .req v3 + Abu .req v4 + Aga .req v5 + Age .req v6 + Agi .req v7 + Ago .req v8 + Agu .req v9 + Aka .req v10 + Ake .req v11 + Aki .req v12 + Ako .req v13 + Aku .req v14 + Ama .req v15 + Ame .req v16 + Ami .req v17 + Amo .req v18 + Amu .req v19 + Asa .req v20 + Ase .req v21 + Asi .req v22 + Aso .req v23 + Asu .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Abiq .req q2 + Aboq .req q3 + Abuq .req q4 + Agaq .req q5 + Ageq .req q6 + Agiq .req q7 + Agoq .req q8 + Aguq .req q9 + Akaq .req q10 + Akeq .req q11 + Akiq .req q12 + Akoq .req q13 + Akuq .req q14 + Amaq .req q15 + Ameq .req q16 + Amiq .req q17 + Amoq .req q18 + Amuq .req q19 + Asaq .req q20 + Aseq .req q21 + Asiq .req q22 + Asoq .req q23 + Asuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v30 + C1 .req v29 + C2 .req v28 + C3 .req v27 + C4 .req v26 + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req v26 + E1 .req v25 + E2 .req v29 + E3 .req v28 + E4 .req v27 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + Abi_ .req v2 + Abo_ .req v3 + Abu_ .req v4 + Aga_ .req v10 + Age_ .req v11 + Agi_ .req v7 + Ago_ .req v8 + Agu_ .req v9 + Aka_ .req v15 + Ake_ .req v16 + Aki_ .req v12 + Ako_ .req v13 + Aku_ .req v14 + Ama_ .req v20 + Ame_ .req v21 + Ami_ .req v17 + Amo_ .req v18 + Amu_ .req v19 + Asa_ .req v0 + Ase_ .req v1 + Asi_ .req v22 + Aso_ .req v23 + Asu_ .req v24 + Aba_ .req v30 + Abe_ .req v27 + + /* Unused temporary */ + tmp .req v31 + +/************************ MACROS ****************************/ + +.macro load_input + ldr Abaq, [input_addr, #(2*8*0)] + ldr Abeq, [input_addr, #(2*8*1)] + ldr Abiq, [input_addr, #(2*8*2)] + ldr Aboq, [input_addr, #(2*8*3)] + ldr Abuq, [input_addr, #(2*8*4)] + ldr Agaq, [input_addr, #(2*8*5)] + ldr Ageq, [input_addr, #(2*8*6)] + ldr Agiq, [input_addr, #(2*8*7)] + ldr Agoq, [input_addr, #(2*8*8)] + ldr Aguq, [input_addr, #(2*8*9)] + ldr Akaq, 
[input_addr, #(2*8*10)] + ldr Akeq, [input_addr, #(2*8*11)] + ldr Akiq, [input_addr, #(2*8*12)] + ldr Akoq, [input_addr, #(2*8*13)] + ldr Akuq, [input_addr, #(2*8*14)] + ldr Amaq, [input_addr, #(2*8*15)] + ldr Ameq, [input_addr, #(2*8*16)] + ldr Amiq, [input_addr, #(2*8*17)] + ldr Amoq, [input_addr, #(2*8*18)] + ldr Amuq, [input_addr, #(2*8*19)] + ldr Asaq, [input_addr, #(2*8*20)] + ldr Aseq, [input_addr, #(2*8*21)] + ldr Asiq, [input_addr, #(2*8*22)] + ldr Asoq, [input_addr, #(2*8*23)] + ldr Asuq, [input_addr, #(2*8*24)] +.endm + +.macro store_input + str Abaq, [input_addr, #(2*8*0)] + str Abeq, [input_addr, #(2*8*1)] + str Abiq, [input_addr, #(2*8*2)] + str Aboq, [input_addr, #(2*8*3)] + str Abuq, [input_addr, #(2*8*4)] + str Agaq, [input_addr, #(2*8*5)] + str Ageq, [input_addr, #(2*8*6)] + str Agiq, [input_addr, #(2*8*7)] + str Agoq, [input_addr, #(2*8*8)] + str Aguq, [input_addr, #(2*8*9)] + str Akaq, [input_addr, #(2*8*10)] + str Akeq, [input_addr, #(2*8*11)] + str Akiq, [input_addr, #(2*8*12)] + str Akoq, [input_addr, #(2*8*13)] + str Akuq, [input_addr, #(2*8*14)] + str Amaq, [input_addr, #(2*8*15)] + str Ameq, [input_addr, #(2*8*16)] + str Amiq, [input_addr, #(2*8*17)] + str Amoq, [input_addr, #(2*8*18)] + str Amuq, [input_addr, #(2*8*19)] + str Asaq, [input_addr, #(2*8*20)] + str Aseq, [input_addr, #(2*8*21)] + str Asiq, [input_addr, #(2*8*22)] + str Asoq, [input_addr, #(2*8*23)] + str Asuq, [input_addr, #(2*8*24)] +.endm + +#define STACK_SIZE (16*4) // VREGS (16*4) +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro save_vregs + stp d8, d9, [sp, #(16*0)] + stp d10, d11, [sp, #(16*1)] + stp d12, d13, [sp, #(16*2)] + stp d14, d15, [sp, #(16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(16*0)] + ldp d10, d11, [sp, #(16*1)] + ldp d12, d13, [sp, #(16*2)] + ldp d14, d15, [sp, #(16*3)] +.endm + +/* Macros emulating the v8.4-A SHA-3 instructions (EOR3, RAX1, XAR, BCAX) with base Neon instructions */ + +.macro eor3_m1 d s0 s1 s2 + eor \d\().16b,
\s0\().16b, \s1\().16b + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro rax1_m1 d s0 s1 + shl tmp.2d, \s1\().2d, #1 + sri tmp.2d, \s1\().2d, #63 + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +.macro xar_m1 d s0 s1 imm + eor tmp.16b, \s0\().16b, \s1\().16b + shl \d\().2d, tmp.2d, #(64-\imm) + sri \d\().2d, tmp.2d, #(\imm) +.endm + +.macro bcax_m1 d s0 s1 s2 + bic tmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +/* Keccak-f1600 round */ + +.macro keccak_f1600_round + + eor3_m1 C0, Aba, Aga, Aka + eor3_m1 C0, C0, Ama, Asa + eor3_m1 C1, Abe, Age, Ake + eor3_m1 C1, C1, Ame, Ase + eor3_m1 C2, Abi, Agi, Aki + eor3_m1 C2, C2, Ami, Asi + eor3_m1 C3, Abo, Ago, Ako + eor3_m1 C3, C3, Amo, Aso + eor3_m1 C4, Abu, Agu, Aku + eor3_m1 C4, C4, Amu, Asu + + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + + eor Aba_.16b, Aba.16b, E0.16b + xar_m1 Asa_, Abi, E2, 2 + xar_m1 Abi_, Aki, E2, 21 + xar_m1 Aki_, Ako, E3, 39 + xar_m1 Ako_, Amu, E4, 56 + xar_m1 Amu_, Aso, E3, 8 + xar_m1 Aso_, Ama, E0, 23 + xar_m1 Aka_, Abe, E1, 63 + xar_m1 Ase_, Ago, E3, 9 + xar_m1 Ago_, Ame, E1, 19 + xar_m1 Ake_, Agi, E2, 58 + xar_m1 Agi_, Aka, E0, 61 + xar_m1 Aga_, Abo, E3, 36 + xar_m1 Abo_, Amo, E3, 43 + xar_m1 Amo_, Ami, E2, 49 + xar_m1 Ami_, Ake, E1, 54 + xar_m1 Age_, Agu, E4, 44 + xar_m1 Agu_, Asi, E2, 3 + xar_m1 Asi_, Aku, E4, 25 + xar_m1 Aku_, Asa, E0, 46 + xar_m1 Ama_, Abu, E4, 37 + xar_m1 Abu_, Asu, E4, 50 + xar_m1 Asu_, Ase, E1, 62 + xar_m1 Ame_, Aga, E0, 28 + xar_m1 Abe_, Age, E1, 20 + + bcax_m1 Aga, Aga_, Agi_, Age_ + bcax_m1 Age, Age_, Ago_, Agi_ + bcax_m1 Agi, Agi_, Agu_, Ago_ + bcax_m1 Ago, Ago_, Aga_, Agu_ + bcax_m1 Agu, Agu_, Age_, Aga_ + bcax_m1 Aka, Aka_, Aki_, Ake_ + bcax_m1 Ake, Ake_, Ako_, Aki_ + bcax_m1 Aki, Aki_, Aku_, Ako_ + bcax_m1 Ako, Ako_, Aka_, Aku_ + bcax_m1 Aku, Aku_, Ake_, Aka_ + bcax_m1 Ama, Ama_, Ami_, Ame_ + bcax_m1 Ame, Ame_, Amo_, Ami_ + bcax_m1 Ami, Ami_, Amu_, Amo_ + bcax_m1 Amo, 
Amo_, Ama_, Amu_ + bcax_m1 Amu, Amu_, Ame_, Ama_ + bcax_m1 Asa, Asa_, Asi_, Ase_ + bcax_m1 Ase, Ase_, Aso_, Asi_ + bcax_m1 Asi, Asi_, Asu_, Aso_ + bcax_m1 Aso, Aso_, Asa_, Asu_ + bcax_m1 Asu, Asu_, Ase_, Asa_ + bcax_m1 Aba, Aba_, Abi_, Abe_ + bcax_m1 Abe, Abe_, Abo_, Abi_ + bcax_m1 Abi, Abi_, Abu_, Abo_ + bcax_m1 Abo, Abo_, Aba_, Abu_ + bcax_m1 Abu, Abu_, Abe_, Aba_ + + // iota step + ld1r {tmp.2d}, [const_addr], #8 + eor Aba.16b, Aba.16b, tmp.16b + +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.text +.align 4 +.global keccak_f1600_x2_v84a_asm_v2 +.global _keccak_f1600_x2_v84a_asm_v2 + +keccak_f1600_x2_v84a_asm_v2: +_keccak_f1600_x2_v84a_asm_v2: + alloc_stack + save_vregs + load_constant_ptr + load_input + + mov count, #(KECCAK_F1600_ROUNDS) +loop: + keccak_f1600_round + sub count, count, #1 + cbnz count, loop + + store_input + restore_vregs + free_stack + ret diff --git a/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2p0.s b/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2p0.s new file mode 100644 index 0000000..c9547da --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2p0.s @@ -0,0 +1,596 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round.
*/ + Aba .req v0 + Abe .req v1 + Abi .req v2 + Abo .req v3 + Abu .req v4 + Aga .req v5 + Age .req v6 + Agi .req v7 + Ago .req v8 + Agu .req v9 + Aka .req v10 + Ake .req v11 + Aki .req v12 + Ako .req v13 + Aku .req v14 + Ama .req v15 + Ame .req v16 + Ami .req v17 + Amo .req v18 + Amu .req v19 + Asa .req v20 + Ase .req v21 + Asi .req v22 + Aso .req v23 + Asu .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Abiq .req q2 + Aboq .req q3 + Abuq .req q4 + Agaq .req q5 + Ageq .req q6 + Agiq .req q7 + Agoq .req q8 + Aguq .req q9 + Akaq .req q10 + Akeq .req q11 + Akiq .req q12 + Akoq .req q13 + Akuq .req q14 + Amaq .req q15 + Ameq .req q16 + Amiq .req q17 + Amoq .req q18 + Amuq .req q19 + Asaq .req q20 + Aseq .req q21 + Asiq .req q22 + Asoq .req q23 + Asuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v27 + C1 .req v28 + C2 .req v29 + C3 .req v30 + C4 .req v31 + + C0q .req q27 + C1q .req q28 + C2q .req q29 + C3q .req q30 + C4q .req q31 + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req v26 + E1 .req v26 + E2 .req v26 + E3 .req v26 + E4 .req v26 + + E0q .req q26 + E1q .req q26 + E2q .req q26 + E3q .req q26 + E4q .req q26 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vBgo .req v27 + vBgi .req v28 + vBga .req v29 + vBge .req v30 + vBgu .req v31 + vBki .req v27 + vBko .req v28 + vBka .req v29 + vBke .req v30 + vBku .req v31 + vBmu .req v27 + vBmo .req v28 + vBmi .req v29 + vBma .req v30 + vBme .req v31 + vBba .req v27 + vBbi .req v28 + vBbo .req v29 + vBbu .req v30 + vBbe .req v31 + vBsa .req v27 + vBso .req v28 + vBse .req v29 + vBsi .req v30 + vBsu .req v31 + + vBgoq .req q27 + vBgiq .req q28 + vBgaq .req q29 + vBgeq .req q30 + vBguq .req q31 + vBkiq .req q27 + vBkoq .req q28 + vBkaq .req q29 + vBkeq .req q30 + vBkuq .req q31 + vBmuq .req q27 + vBmoq .req q28 + vBmiq .req q29 + vBmaq .req q30 + vBmeq .req q31 + vBbaq .req q27 + vBbiq .req q28 + vBboq .req q29 + vBbuq .req q30 + 
vBbeq .req q31 + vBsaq .req q27 + vBsoq .req q28 + vBseq .req q29 + vBsiq .req q30 + vBsuq .req q31 + + vEgu .req Agu + vEga .req v26 + vEge .req v26 + vEgi .req v26 + vEgo .req v26 + vEka .req Aka + vEko .req Ako + vEke .req v26 + vEki .req v26 + vEku .req v26 + vEma .req v26 + vEme .req Ame + vEmi .req Ami + vEmo .req v26 + vEmu .req Amu + vEba .req Aba + vEbe .req Abe + vEbi .req v26 + vEbo .req Abo + vEbu .req Abu + vEsa .req Asa + vEse .req Ase + vEsi .req Asi + vEso .req Aso + vEsu .req Asu + + vEguq .req Aguq + vEgaq .req q26 + vEgeq .req q26 + vEgiq .req q26 + vEgoq .req q26 + vEkaq .req Akaq + vEkoq .req Akoq + vEkeq .req q26 + vEkiq .req q26 + vEkuq .req q26 + vEmaq .req q26 + vEmeq .req Ameq + vEmiq .req Amiq + vEmoq .req q26 + vEmuq .req Amuq + vEbaq .req Abaq + vEbeq .req Abeq + vEbiq .req q26 + vEboq .req Aboq + vEbuq .req Abuq + vEsaq .req Asaq + vEseq .req Aseq + vEsiq .req Asiq + vEsoq .req Asoq + vEsuq .req Asuq + +/************************ MACROS ****************************/ + +.macro load_input + ldp Abaq, Abeq, [input_addr, #(2*8*0)] + ldp Abiq, Aboq, [input_addr, #(2*8*2)] + ldp Abuq, Agaq, [input_addr, #(2*8*4)] + ldp Ageq, Agiq, [input_addr, #(2*8*6)] + ldp Agoq, Aguq, [input_addr, #(2*8*8)] + ldp Akaq, Akeq, [input_addr, #(2*8*10)] + ldp Akiq, Akoq, [input_addr, #(2*8*12)] + ldp Akuq, Amaq, [input_addr, #(2*8*14)] + ldp Ameq, Amiq, [input_addr, #(2*8*16)] + ldp Amoq, Amuq, [input_addr, #(2*8*18)] + ldp Asaq, Aseq, [input_addr, #(2*8*20)] + ldp Asiq, Asoq, [input_addr, #(2*8*22)] + ldr Asuq, [input_addr, #(2*8*24)] +.endm + +.macro store_input + str Abaq, [input_addr, #(2*8*0)] + str Abeq, [input_addr, #(2*8*1)] + str Abiq, [input_addr, #(2*8*2)] + str Aboq, [input_addr, #(2*8*3)] + str Abuq, [input_addr, #(2*8*4)] + str Agaq, [input_addr, #(2*8*5)] + str Ageq, [input_addr, #(2*8*6)] + str Agiq, [input_addr, #(2*8*7)] + str Agoq, [input_addr, #(2*8*8)] + str Aguq, [input_addr, #(2*8*9)] + str Akaq, [input_addr, #(2*8*10)] + str Akeq, 
[input_addr, #(2*8*11)] + str Akiq, [input_addr, #(2*8*12)] + str Akoq, [input_addr, #(2*8*13)] + str Akuq, [input_addr, #(2*8*14)] + str Amaq, [input_addr, #(2*8*15)] + str Ameq, [input_addr, #(2*8*16)] + str Amiq, [input_addr, #(2*8*17)] + str Amoq, [input_addr, #(2*8*18)] + str Amuq, [input_addr, #(2*8*19)] + str Asaq, [input_addr, #(2*8*20)] + str Aseq, [input_addr, #(2*8*21)] + str Asiq, [input_addr, #(2*8*22)] + str Asoq, [input_addr, #(2*8*23)] + str Asuq, [input_addr, #(2*8*24)] +.endm + +#define STACK_SIZE (16*4 + 16*30) +#define STACK_BASE_VREGS 0 +#define STACK_BASE_TMP 16*4 + +#define E0_offset 0 +#define E1_offset 1 +#define E2_offset 2 +#define E3_offset 3 +#define E4_offset 4 + +#define Aba_offset (5 + 0 ) +#define Abe_offset (5 + 1 ) +#define Abi_offset (5 + 2 ) +#define Abo_offset (5 + 3 ) +#define Abu_offset (5 + 4 ) +#define Aga_offset (5 + 5 ) +#define Age_offset (5 + 6 ) +#define Agi_offset (5 + 7 ) +#define Ago_offset (5 + 8 ) +#define Agu_offset (5 + 9 ) +#define Aka_offset (5 + 10 ) +#define Ake_offset (5 + 11 ) +#define Aki_offset (5 + 12 ) +#define Ako_offset (5 + 13 ) +#define Aku_offset (5 + 14 ) +#define Ama_offset (5 + 15 ) +#define Ame_offset (5 + 16 ) +#define Ami_offset (5 + 17 ) +#define Amo_offset (5 + 18 ) +#define Amu_offset (5 + 19 ) +#define Asa_offset (5 + 20 ) +#define Ase_offset (5 + 21 ) +#define Asi_offset (5 + 22 ) +#define Aso_offset (5 + 23 ) +#define Asu_offset (5 + 24 ) + +#define vEba_offset (5 + 0 ) +#define vEbe_offset (5 + 1 ) +#define vEbi_offset (5 + 2 ) +#define vEbo_offset (5 + 3 ) +#define vEbu_offset (5 + 4 ) +#define vEga_offset (5 + 5 ) +#define vEge_offset (5 + 6 ) +#define vEgi_offset (5 + 7 ) +#define vEgo_offset (5 + 8 ) +#define vEgu_offset (5 + 9 ) +#define vEka_offset (5 + 10 ) +#define vEke_offset (5 + 11 ) +#define vEki_offset (5 + 12 ) +#define vEko_offset (5 + 13 ) +#define vEku_offset (5 + 14 ) +#define vEma_offset (5 + 15 ) +#define vEme_offset (5 + 16 ) +#define vEmi_offset (5 + 17 ) 
+#define vEmo_offset (5 + 18 ) +#define vEmu_offset (5 + 19 ) +#define vEsa_offset (5 + 20 ) +#define vEse_offset (5 + 21 ) +#define vEsi_offset (5 + 22 ) +#define vEso_offset (5 + 23 ) +#define vEsu_offset (5 + 24 ) + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +/* Macros emulating the v8.4-A SHA-3 instructions (EOR3, RAX1, XAR, BCAX) with base Neon instructions */ + +.macro eor3_m1_0 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m1_1 d s0 s1 s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro eor3_m1 d s0 s1 s2 + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +.macro rax1_m1 d s0 s1 + add tmp.2d, \s1\().2d, \s1\().2d + sri tmp.2d, \s1\().2d, #63 + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +.macro xar_m1 d s0 s1 imm + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) +.endm + +.macro xar_m1_0 d s0 s1 imm tmp + eor \tmp\().16b, \s0\().16b, \s1\().16b +.endm + +.macro xar_m1_1 d s0 s1 imm tmp + shl \d\().2d, \tmp\().2d, #(64-\imm) +.endm + +.macro xar_m1_2 d s0 s1 imm tmp + sri \d\().2d, \tmp\().2d, #(\imm) +.endm + +.macro bcax_m1 d s0 s1 s2 + bic tmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +.macro refresh d + mov \d\().16b,
\d\().16b +.endm +/* Keccak-f1600 round */ + +.macro keccak_f1600_round + + eor2 C0, Aka, Aga + eor2 C1, Ake, Age + eor2 C2, Aki, Agi + eor2 C3, Ako, Ago + eor2 C4, Aku, Agu + eor2 C0, C0, Ama + eor2 C1, C1, Ame + eor2 C2, C2, Ami + eor2 C3, C3, Amo + eor2 C4, C4, Amu + eor2 C0, C0, Asa + eor2 C1, C1, Ase + eor2 C2, C2, Asi + eor2 C3, C3, Aso + eor2 C4, C4, Asu + eor2 C0, C0, Aba + eor2 C1, C1, Abe + eor2 C2, C2, Abi + eor2 C3, C3, Abo + eor2 C4, C4, Abu + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req v25 + rax1_m1 E2, C1, C3 SEP save(E2) + rax1_m1 E4, C3, C0 SEP save(E4) + rax1_m1 E1, C0, C2 SEP save(E1) + rax1_m1 E3, C2, C4 SEP save(E3) + rax1_m1 E0, C4, C1 SEP save(E0) + + restore(E1) + xar_m1 vBgo, Ame, E1, 19 SEP restore(E0) + xar_m1 vBgi, Aka, E0, 61 SEP restore(E3) + xar_m1 vBga, Abo, E3, 36 SEP restore(E4) + xar_m1 vBge, Agu, E4, 44 SEP restore(E2) + xar_m1 vBgu, Asi, E2, 3 SEP + + bcax_m1 vEga, vBga, vBgi, vBge SEP save(vEga) + bcax_m1 vEge, vBge, vBgo, vBgi SEP save(vEge) + bcax_m1 vEgi, vBgi, vBgu, vBgo SEP save(vEgi) + bcax_m1 vEgo, vBgo, vBga, vBgu SEP save(vEgo) + bcax_m1 vEgu, vBgu, vBge, vBga SEP save(vEgu) + + restore(E3) + xar_m1 vBki, Ako, E3, 39 SEP restore(E4) + xar_m1 vBko, Amu, E4, 56 SEP restore(E1) + xar_m1 vBka, Abe, E1, 63 SEP restore(E2) + xar_m1 vBke, Agi, E2, 58 SEP restore(E0) + xar_m1 vBku, Asa, E0, 46 + + bcax_m1 vEka, vBka, vBki, vBke SEP save(vEka) + bcax_m1 vEke, vBke, vBko, vBki SEP save(vEke) + bcax_m1 vEki, vBki, vBku, vBko SEP save(vEki) + bcax_m1 vEko, vBko, vBka, vBku SEP save(vEko) + bcax_m1 vEku, vBku, vBke, vBka SEP save(vEku) + + restore(E3) + xar_m1 vBmu, Aso, E3, 8 SEP restore(E2) + xar_m1 vBmo, Ami, E2, 49 SEP restore(E1) + xar_m1 vBmi, Ake, E1, 54 SEP restore(E4) + xar_m1 vBma, Abu, E4, 37 SEP restore(E0) + xar_m1 vBme, Aga, E0, 28 + + bcax_m1 vEma, vBma, vBmi, vBme SEP save(vEma) + bcax_m1 vEme, vBme, vBmo, vBmi SEP save(vEme) + bcax_m1 vEmi, vBmi, vBmu, vBmo SEP save(vEmi) + bcax_m1 vEmo, vBmo, vBma, 
vBmu SEP save(vEmo) + bcax_m1 vEmu, vBmu, vBme, vBma SEP save(vEmu) + + restore(E0) + eor2 vBba, Aba, E0 SEP restore(E2) + xar_m1 vBbi, Aki, E2, 21 SEP restore(E3) + xar_m1 vBbo, Amo, E3, 43 SEP restore(E4) + xar_m1 vBbu, Asu, E4, 50 SEP restore(E1) + xar_m1 vBbe, Age, E1, 20 + + bcax_m1 vEba, vBba, vBbi, vBbe SEP save(vEba) + bcax_m1 vEbe, vBbe, vBbo, vBbi SEP save(vEbe) + bcax_m1 vEbi, vBbi, vBbu, vBbo SEP save(vEbi) + bcax_m1 vEbo, vBbo, vBba, vBbu SEP save(vEbo) + bcax_m1 vEbu, vBbu, vBbe, vBba SEP save(vEbu) + + restore(E2) + xar_m1 vBsa, Abi, E2, 2 SEP restore(E0) + xar_m1 vBso, Ama, E0, 23 SEP restore(E3) + xar_m1 vBse, Ago, E3, 9 SEP restore(E4) + xar_m1 vBsi, Aku, E4, 25 SEP restore(E1) + xar_m1 vBsu, Ase, E1, 62 + + bcax_m1 vEsa, vBsa, vBsi, vBse SEP save(vEsa) + bcax_m1 vEse, vBse, vBso, vBsi SEP save(vEse) + bcax_m1 vEsi, vBsi, vBsu, vBso SEP save(vEsi) + bcax_m1 vEso, vBso, vBsa, vBsu SEP save(vEso) + bcax_m1 vEsu, vBsu, vBse, vBsa SEP save(vEsu) + + restore(Aba) + restore(Abe) + restore(Abi) + restore(Abo) + restore(Abu) + restore(Aga) + restore(Age) + restore(Agi) + restore(Ago) + restore(Agu) + restore(Aka) + restore(Ake) + restore(Aki) + restore(Ako) + restore(Aku) + restore(Ama) + restore(Ame) + restore(Ami) + restore(Amo) + restore(Amu) + restore(Asa) + restore(Ase) + restore(Asi) + restore(Aso) + restore(Asu) + + ld1r {tmp.2d}, [const_addr], #8 + eor Aba.16b, Aba.16b, tmp.16b + + .unreq tmp +.endm + +.text +.align 4 +.global keccak_f1600_x2_v84a_asm_v2p0 +.global _keccak_f1600_x2_v84a_asm_v2p0 + +#define KECCAK_F1600_ROUNDS 24 + +keccak_f1600_x2_v84a_asm_v2p0: +_keccak_f1600_x2_v84a_asm_v2p0: + alloc_stack + save_vregs + load_constant_ptr + load_input + + mov count, #12 +loop: + keccak_f1600_round + keccak_f1600_round + sub count, count, #1 + cbnz count, loop + + store_input + restore_vregs + free_stack + ret diff --git a/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2p1.s b/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2p1.s new file 
mode 100644 index 0000000..2b24b1a --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2p1.s @@ -0,0 +1,732 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round.
*/ + Aba .req v0 + Abe .req v1 + Abi .req v2 + Abo .req v3 + Abu .req v4 + Aga .req v5 + Age .req v6 + Agi .req v7 + Ago .req v8 + Agu .req v9 + Aka .req v10 + Ake .req v11 + Aki .req v12 + Ako .req v13 + Aku .req v14 + Ama .req v15 + Ame .req v16 + Ami .req v17 + Amo .req v18 + Amu .req v19 + Asa .req v20 + Ase .req v21 + Asi .req v22 + Aso .req v23 + Asu .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Abiq .req q2 + Aboq .req q3 + Abuq .req q4 + Agaq .req q5 + Ageq .req q6 + Agiq .req q7 + Agoq .req q8 + Aguq .req q9 + Akaq .req q10 + Akeq .req q11 + Akiq .req q12 + Akoq .req q13 + Akuq .req q14 + Amaq .req q15 + Ameq .req q16 + Amiq .req q17 + Amoq .req q18 + Amuq .req q19 + Asaq .req q20 + Aseq .req q21 + Asiq .req q22 + Asoq .req q23 + Asuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v27 + C1 .req v28 + C2 .req v29 + C3 .req v30 + C4 .req v31 + + C0q .req q27 + C1q .req q28 + C2q .req q29 + C3q .req q30 + C4q .req q31 + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req v26 + E1 .req v26 + E2 .req v26 + E3 .req v26 + E4 .req v26 + + E0q .req q26 + E1q .req q26 + E2q .req q26 + E3q .req q26 + E4q .req q26 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vBgo .req v27 + vBgi .req Ame + vBga .req Aka + vBge .req Abo + vBgu .req Agu + vBki .req Asi + vBko .req Ako + vBka .req Amu + vBke .req Abe + vBku .req Agi + vBmu .req Asa + vBmo .req Aso + vBmi .req Ami + vBma .req Ake + vBme .req Abu + vBba .req Aga + vBbi .req Aba + vBbo .req Aki + vBbu .req Amo + vBbe .req Asu + vBsa .req Age + vBso .req Abi + vBse .req Ama + vBsi .req Ago + vBsu .req Aku + + vBgoq .req q27 + vBgiq .req Ameq + vBgaq .req Akaq + vBgeq .req Aboq + vBguq .req Aguq + vBkiq .req Asiq + vBkoq .req Akoq + vBkaq .req Amuq + vBkeq .req Abeq + vBkuq .req Agiq + vBmuq .req Asaq + vBmoq .req Asoq + vBmiq .req Amiq + vBmaq .req Akeq + vBmeq .req Abuq + vBbaq .req Agaq + vBbiq .req Abaq + vBboq .req Akiq + 
vBbuq .req Amoq + vBbeq .req Asuq + vBsaq .req Ageq + vBsoq .req Abiq + vBseq .req Amaq + vBsiq .req Agoq + vBsuq .req Akuq + + vEga .req v28 + vEge .req v29 + vEgi .req vBgi + vEgo .req vBgo + vEgu .req vBgu + vEka .req vBga + vEke .req vBge + vEki .req vBki + vEko .req vBko + vEku .req vBku + vEma .req vBka + vEme .req vBke + vEmi .req vBmi + vEmo .req vBmo + vEmu .req vBmu + vEba .req vBma + vEbe .req vBme + vEbi .req vBbi + vEbo .req vBbo + vEbu .req vBbu + vEsa .req vBba + vEse .req vBbe + vEsi .req vBsi + vEso .req vBso + vEsu .req vBsu + + vEgaq .req q28 + vEgeq .req q29 + vEgiq .req vBgiq + vEgoq .req vBgoq + vEguq .req vBguq + vEkaq .req vBgaq + vEkeq .req vBgeq + vEkiq .req vBkiq + vEkoq .req vBkoq + vEkuq .req vBkuq + vEmaq .req vBkaq + vEmeq .req vBkeq + vEmiq .req vBmiq + vEmoq .req vBmoq + vEmuq .req vBmuq + vEbaq .req vBmaq + vEbeq .req vBmeq + vEbiq .req vBbiq + vEboq .req vBboq + vEbuq .req vBbuq + vEsaq .req vBbaq + vEseq .req vBbeq + vEsiq .req vBsiq + vEsoq .req vBsoq + vEsuq .req vBsuq + +/************************ MACROS ****************************/ + +.macro load_input + ldp Abaq, Abeq, [input_addr, #(2*8*0)] + ldp Abiq, Aboq, [input_addr, #(2*8*2)] + ldp Abuq, Agaq, [input_addr, #(2*8*4)] + ldp Ageq, Agiq, [input_addr, #(2*8*6)] + ldp Agoq, Aguq, [input_addr, #(2*8*8)] + ldp Akaq, Akeq, [input_addr, #(2*8*10)] + ldp Akiq, Akoq, [input_addr, #(2*8*12)] + ldp Akuq, Amaq, [input_addr, #(2*8*14)] + ldp Ameq, Amiq, [input_addr, #(2*8*16)] + ldp Amoq, Amuq, [input_addr, #(2*8*18)] + ldp Asaq, Aseq, [input_addr, #(2*8*20)] + ldp Asiq, Asoq, [input_addr, #(2*8*22)] + ldr Asuq, [input_addr, #(2*8*24)] +.endm + +.macro store_input + str Abaq, [input_addr, #(2*8*0)] + str Abeq, [input_addr, #(2*8*1)] + str Abiq, [input_addr, #(2*8*2)] + str Aboq, [input_addr, #(2*8*3)] + str Abuq, [input_addr, #(2*8*4)] + str Agaq, [input_addr, #(2*8*5)] + str Ageq, [input_addr, #(2*8*6)] + str Agiq, [input_addr, #(2*8*7)] + str Agoq, [input_addr, #(2*8*8)] + str Aguq, 
[input_addr, #(2*8*9)] + str Akaq, [input_addr, #(2*8*10)] + str Akeq, [input_addr, #(2*8*11)] + str Akiq, [input_addr, #(2*8*12)] + str Akoq, [input_addr, #(2*8*13)] + str Akuq, [input_addr, #(2*8*14)] + str Amaq, [input_addr, #(2*8*15)] + str Ameq, [input_addr, #(2*8*16)] + str Amiq, [input_addr, #(2*8*17)] + str Amoq, [input_addr, #(2*8*18)] + str Amuq, [input_addr, #(2*8*19)] + str Asaq, [input_addr, #(2*8*20)] + str Aseq, [input_addr, #(2*8*21)] + str Asiq, [input_addr, #(2*8*22)] + str Asoq, [input_addr, #(2*8*23)] + str Asuq, [input_addr, #(2*8*24)] +.endm + +#define STACK_SIZE (16*4 + 16*30) +#define STACK_BASE_VREGS 0 +#define STACK_BASE_TMP 16*4 + +#define E0_offset 0 +#define E1_offset 1 +#define E2_offset 2 +#define E3_offset 3 +#define E4_offset 4 + +#define Aba_offset (5 + 0 ) +#define Abe_offset (5 + 1 ) +#define Abi_offset (5 + 2 ) +#define Abo_offset (5 + 3 ) +#define Abu_offset (5 + 4 ) +#define Aga_offset (5 + 5 ) +#define Age_offset (5 + 6 ) +#define Agi_offset (5 + 7 ) +#define Ago_offset (5 + 8 ) +#define Agu_offset (5 + 9 ) +#define Aka_offset (5 + 10 ) +#define Ake_offset (5 + 11 ) +#define Aki_offset (5 + 12 ) +#define Ako_offset (5 + 13 ) +#define Aku_offset (5 + 14 ) +#define Ama_offset (5 + 15 ) +#define Ame_offset (5 + 16 ) +#define Ami_offset (5 + 17 ) +#define Amo_offset (5 + 18 ) +#define Amu_offset (5 + 19 ) +#define Asa_offset (5 + 20 ) +#define Ase_offset (5 + 21 ) +#define Asi_offset (5 + 22 ) +#define Aso_offset (5 + 23 ) +#define Asu_offset (5 + 24 ) + +#define vEba_offset (5 + 0 ) +#define vEbe_offset (5 + 1 ) +#define vEbi_offset (5 + 2 ) +#define vEbo_offset (5 + 3 ) +#define vEbu_offset (5 + 4 ) +#define vEga_offset (5 + 5 ) +#define vEge_offset (5 + 6 ) +#define vEgi_offset (5 + 7 ) +#define vEgo_offset (5 + 8 ) +#define vEgu_offset (5 + 9 ) +#define vEka_offset (5 + 10 ) +#define vEke_offset (5 + 11 ) +#define vEki_offset (5 + 12 ) +#define vEko_offset (5 + 13 ) +#define vEku_offset (5 + 14 ) +#define vEma_offset (5 + 15 
) +#define vEme_offset (5 + 16 ) +#define vEmi_offset (5 + 17 ) +#define vEmo_offset (5 + 18 ) +#define vEmu_offset (5 + 19 ) +#define vEsa_offset (5 + 20 ) +#define vEse_offset (5 + 21 ) +#define vEsi_offset (5 + 22 ) +#define vEso_offset (5 + 23 ) +#define vEsu_offset (5 + 24 ) + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +/* Macros emulating the v8.4-A SHA-3 instructions (EOR3, RAX1, XAR, BCAX) with base Neon instructions */ + +.macro eor3_m1_0 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m1_1 d s0 s1 s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro eor3_m1 d s0 s1 s2 + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +.macro rax1_m1 d s0 s1 + add tmp.2d, \s1\().2d, \s1\().2d + sri tmp.2d, \s1\().2d, #63 + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +.macro xar_m1 d s0 s1 imm + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) +.endm + +.macro xar_m1_0 d s0 s1 imm tmp + eor \tmp\().16b, \s0\().16b, \s1\().16b +.endm + +.macro xar_m1_1 d s0 s1 imm tmp + shl \d\().2d, \tmp\().2d, #(64-\imm) +.endm + +.macro xar_m1_2 d s0 s1 imm tmp + sri \d\().2d, \tmp\().2d, #(\imm) +.endm + +.macro bcax_m1 d s0 s1 s2 + bic tmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b,
tmp.16b, \s0\().16b +.endm + +.macro refresh d + mov \d\().16b, \d\().16b +.endm +/* Keccak-f1600 round */ + +.macro keccak_f1600_round_pre + eor2 C0, Aka, Aga + eor2 C1, Ake, Age + eor2 C2, Aki, Agi + eor2 C3, Ako, Ago + eor2 C4, Aku, Agu + eor2 C0, C0, Ama + eor2 C1, C1, Ame + eor2 C2, C2, Ami + eor2 C3, C3, Amo + eor2 C4, C4, Amu + eor2 C0, C0, Asa + eor2 C1, C1, Ase + eor2 C2, C2, Asi + eor2 C3, C3, Aso + eor2 C4, C4, Asu + eor2 C0, C0, Aba + eor2 C1, C1, Abe + eor2 C2, C2, Abi + eor2 C3, C3, Abo + eor2 C4, C4, Abu +.endm + +.macro keccak_f1600_round_post + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req v25 + rax1_m1 E2, C1, C3 SEP save(E2) + rax1_m1 E4, C3, C0 SEP save(E4) + rax1_m1 E1, C0, C2 SEP save(E1) + rax1_m1 E3, C2, C4 SEP save(E3) + rax1_m1 E0, C4, C1 SEP save(E0) + + restore(E1) + xar_m1 vBgo, Ame, E1, 19 SEP restore(E0) + xar_m1 vBgi, Aka, E0, 61 SEP restore(E3) + xar_m1 vBga, Abo, E3, 36 SEP restore(E4) + xar_m1 vBge, Agu, E4, 44 SEP restore(E2) + xar_m1 vBgu, Asi, E2, 3 SEP + + bcax_m1 vEga, vBga, vBgi, vBge SEP save(vEga) + bcax_m1 vEge, vBge, vBgo, vBgi SEP save(vEge) + bcax_m1 vEgi, vBgi, vBgu, vBgo SEP save(vEgi) + bcax_m1 vEgo, vBgo, vBga, vBgu SEP save(vEgo) + bcax_m1 vEgu, vBgu, vBge, vBga SEP save(vEgu) + + restore(E3) + xar_m1 vBki, Ako, E3, 39 SEP restore(E4) + xar_m1 vBko, Amu, E4, 56 SEP restore(E1) + xar_m1 vBka, Abe, E1, 63 SEP restore(E2) + xar_m1 vBke, Agi, E2, 58 SEP restore(E0) + xar_m1 vBku, Asa, E0, 46 + + bcax_m1 vEka, vBka, vBki, vBke SEP save(vEka) + bcax_m1 vEke, vBke, vBko, vBki SEP save(vEke) + bcax_m1 vEki, vBki, vBku, vBko SEP save(vEki) + bcax_m1 vEko, vBko, vBka, vBku SEP save(vEko) + bcax_m1 vEku, vBku, vBke, vBka SEP save(vEku) + + restore(E3) + xar_m1 vBmu, Aso, E3, 8 SEP restore(E2) + xar_m1 vBmo, Ami, E2, 49 SEP restore(E1) + xar_m1 vBmi, Ake, E1, 54 SEP restore(E4) + xar_m1 vBma, Abu, E4, 37 SEP restore(E0) + xar_m1 vBme, Aga, E0, 28 + + bcax_m1 vEma, vBma, vBmi, vBme SEP save(vEma) + bcax_m1 vEme, vBme, 
vBmo, vBmi SEP save(vEme) + bcax_m1 vEmi, vBmi, vBmu, vBmo SEP save(vEmi) + bcax_m1 vEmo, vBmo, vBma, vBmu SEP save(vEmo) + bcax_m1 vEmu, vBmu, vBme, vBma SEP save(vEmu) + + restore(E0) + eor2 vBba, Aba, E0 SEP restore(E2) + xar_m1 vBbi, Aki, E2, 21 SEP restore(E3) + xar_m1 vBbo, Amo, E3, 43 SEP restore(E4) + xar_m1 vBbu, Asu, E4, 50 SEP restore(E1) + xar_m1 vBbe, Age, E1, 20 + + bcax_m1 vEba, vBba, vBbi, vBbe SEP save(vEba) + bcax_m1 vEbe, vBbe, vBbo, vBbi SEP save(vEbe) + bcax_m1 vEbi, vBbi, vBbu, vBbo SEP save(vEbi) + bcax_m1 vEbo, vBbo, vBba, vBbu SEP save(vEbo) + bcax_m1 vEbu, vBbu, vBbe, vBba SEP save(vEbu) + + restore(E2) + xar_m1 vBsa, Abi, E2, 2 SEP restore(E0) + xar_m1 vBso, Ama, E0, 23 SEP restore(E3) + xar_m1 vBse, Ago, E3, 9 SEP restore(E4) + xar_m1 vBsi, Aku, E4, 25 SEP restore(E1) + xar_m1 vBsu, Ase, E1, 62 + + bcax_m1 vEsa, vBsa, vBsi, vBse SEP save(vEsa) + bcax_m1 vEse, vBse, vBso, vBsi SEP save(vEse) + bcax_m1 vEsi, vBsi, vBsu, vBso SEP save(vEsi) + bcax_m1 vEso, vBso, vBsa, vBsu SEP save(vEso) + bcax_m1 vEsu, vBsu, vBse, vBsa SEP save(vEsu) + + restore(Aba) + restore(Abe) + restore(Abi) + restore(Abo) + restore(Abu) + restore(Aga) + restore(Age) + restore(Agi) + restore(Ago) + restore(Agu) + restore(Aka) + restore(Ake) + restore(Aki) + restore(Ako) + restore(Aku) + restore(Ama) + restore(Ame) + restore(Ami) + restore(Amo) + restore(Amu) + restore(Asa) + restore(Ase) + restore(Asi) + restore(Aso) + restore(Asu) + + ld1r {tmp.2d}, [const_addr], #8 + eor Aba.16b, Aba.16b, tmp.16b + + .unreq tmp + +.endm + +.macro keccak_f1600_round_core + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req v25 + + rax1_m1 E1, C0, C2 SEP save(E1) + rax1_m1 E0, C4, C1 SEP save(E0) + rax1_m1 E3, C2, C4 SEP save(E3) + rax1_m1 E4, C3, C0 SEP save(E4) + rax1_m1 E2, C1, C3 SEP save(E2) + + restore(E1) + xar_m1 vBgo, Ame, E1, 19 SEP restore(E0) + xar_m1 vBgi, Aka, E0, 61 SEP restore(E3) + xar_m1 vBga, Abo, E3, 36 SEP restore(E4) + xar_m1 vBge, Agu, E4, 44 SEP 
restore(E2) + xar_m1 vBgu, Asi, E2, 3 SEP + + bcax_m1 vEga, vBga, vBgi, vBge SEP save(vEga) + bcax_m1 vEge, vBge, vBgo, vBgi SEP save(vEge) + bcax_m1 vEgi, vBgi, vBgu, vBgo SEP save(vEgi) + bcax_m1 vEgo, vBgo, vBga, vBgu SEP save(vEgo) + bcax_m1 vEgu, vBgu, vBge, vBga SEP save(vEgu) + + restore(E3) + xar_m1 vBki, Ako, E3, 39 SEP restore(E4) + xar_m1 vBko, Amu, E4, 56 SEP restore(E1) + xar_m1 vBka, Abe, E1, 63 SEP restore(E2) + xar_m1 vBke, Agi, E2, 58 SEP restore(E0) + xar_m1 vBku, Asa, E0, 46 + + bcax_m1 vEka, vBka, vBki, vBke SEP save(vEka) + bcax_m1 vEke, vBke, vBko, vBki SEP save(vEke) + bcax_m1 vEki, vBki, vBku, vBko SEP save(vEki) + bcax_m1 vEko, vBko, vBka, vBku SEP save(vEko) + bcax_m1 vEku, vBku, vBke, vBka SEP save(vEku) + + eor2 C3 /* 30 */, vEko, vEgo /* 27 */ + eor2 C0 /* 27 */, vEka, vEga /* 28 */ + eor2 C1 /* 28 */, vEke, vEge /* 29 */ + eor2 C2 /* 29 */, vEki, vEgi + eor2 C4 /* 31 */, vEku, vEgu + restore(E3) + xar_m1 vBmu, Aso, E3, 8 SEP restore(E2) + xar_m1 vBmo, Ami, E2, 49 SEP restore(E1) + xar_m1 vBmi, Ake, E1, 54 SEP restore(E4) + xar_m1 vBma, Abu, E4, 37 SEP restore(E0) + xar_m1 vBme, Aga, E0, 28 + + bcax_m1 vEma, vBma, vBmi, vBme SEP save(vEma) + bcax_m1 vEme, vBme, vBmo, vBmi SEP save(vEme) + bcax_m1 vEmi, vBmi, vBmu, vBmo SEP save(vEmi) + bcax_m1 vEmo, vBmo, vBma, vBmu SEP save(vEmo) + bcax_m1 vEmu, vBmu, vBme, vBma SEP save(vEmu) + + eor2 C0, C0, vEma + eor2 C1, C1, vEme + eor2 C2, C2, vEmi + eor2 C3, C3, vEmo + eor2 C4, C4, vEmu + restore(E0) + eor2 vBba, Aba, E0 SEP restore(E2) + xar_m1 vBbi, Aki, E2, 21 SEP restore(E3) + xar_m1 vBbo, Amo, E3, 43 SEP restore(E4) + xar_m1 vBbu, Asu, E4, 50 SEP restore(E1) + xar_m1 vBbe, Age, E1, 20 + + bcax_m1 vEba, vBba, vBbi, vBbe + ld1r {tmp.2d}, [const_addr], #8 + eor2 vEba, vEba, tmp SEP save(vEba) + bcax_m1 vEbe, vBbe, vBbo, vBbi SEP save(vEbe) + bcax_m1 vEbi, vBbi, vBbu, vBbo SEP save(vEbi) + bcax_m1 vEbo, vBbo, vBba, vBbu SEP save(vEbo) + bcax_m1 vEbu, vBbu, vBbe, vBba SEP save(vEbu) + + eor2 C0, 
C0, vEba + eor2 C1, C1, vEbe + eor2 C2, C2, vEbi + eor2 C3, C3, vEbo + eor2 C4, C4, vEbu + restore(E2) + xar_m1 vBsa, Abi, E2, 2 SEP restore(E0) + xar_m1 vBso, Ama, E0, 23 SEP restore(E3) + xar_m1 vBse, Ago, E3, 9 SEP restore(E4) + xar_m1 vBsi, Aku, E4, 25 SEP restore(E1) + xar_m1 vBsu, Ase, E1, 62 + + bcax_m1 vEsa, vBsa, vBsi, vBse SEP save(vEsa) + bcax_m1 vEse, vBse, vBso, vBsi SEP save(vEse) + bcax_m1 vEsi, vBsi, vBsu, vBso SEP save(vEsi) + bcax_m1 vEso, vBso, vBsa, vBsu SEP save(vEso) + bcax_m1 vEsu, vBsu, vBse, vBsa SEP save(vEsu) + + eor2 C0, C0, vEsa + eor2 C1, C1, vEse + eor2 C2, C2, vEsi + eor2 C3, C3, vEso + eor2 C4, C4, vEsu + + restore(Aba) + restore(Abe) + restore(Abi) + restore(Abo) + restore(Abu) + restore(Aga) + restore(Age) + restore(Agi) + restore(Ago) + restore(Agu) + restore(Aka) + restore(Ake) + restore(Aki) + restore(Ako) + restore(Aku) + restore(Ama) + restore(Ame) + restore(Ami) + restore(Amo) + restore(Amu) + restore(Asa) + restore(Ase) + restore(Asi) + restore(Aso) + restore(Asu) + + .unreq tmp + +.endm + +.text +.align 4 +.global keccak_f1600_x2_v84a_asm_v2p1 +.global _keccak_f1600_x2_v84a_asm_v2p1 + +#define KECCAK_F1600_ROUNDS 24 + +keccak_f1600_x2_v84a_asm_v2p1: +_keccak_f1600_x2_v84a_asm_v2p1: + alloc_stack + save_vregs + load_constant_ptr + load_input + + //mov count, #(KECCAK_F1600_ROUNDS-2) + mov count, #11 + keccak_f1600_round_pre +loop: + keccak_f1600_round_core + keccak_f1600_round_core + sub count, count, #1 + cbnz count, loop + + keccak_f1600_round_core + keccak_f1600_round_post + store_input + restore_vregs + free_stack + ret diff --git a/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2p2.s b/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2p2.s new file mode 100644 index 0000000..c224667 --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2p2.s @@ -0,0 +1,802 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is 
hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round.
*/ + Aba .req v0 + Abe .req v1 + Abi .req v2 + Abo .req v3 + Abu .req v4 + Aga .req v5 + Age .req v6 + Agi .req v7 + Ago .req v8 + Agu .req v9 + Aka .req v10 + Ake .req v11 + Aki .req v12 + Ako .req v13 + Aku .req v14 + Ama .req v15 + Ame .req v16 + Ami .req v17 + Amo .req v18 + Amu .req v19 + Asa .req v20 + Ase .req v21 + Asi .req v22 + Aso .req v23 + Asu .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Abiq .req q2 + Aboq .req q3 + Abuq .req q4 + Agaq .req q5 + Ageq .req q6 + Agiq .req q7 + Agoq .req q8 + Aguq .req q9 + Akaq .req q10 + Akeq .req q11 + Akiq .req q12 + Akoq .req q13 + Akuq .req q14 + Amaq .req q15 + Ameq .req q16 + Amiq .req q17 + Amoq .req q18 + Amuq .req q19 + Asaq .req q20 + Aseq .req q21 + Asiq .req q22 + Asoq .req q23 + Asuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v27 + C1 .req v28 + C2 .req v29 + C3 .req v30 + C4 .req v31 + + C0q .req q27 + C1q .req q28 + C2q .req q29 + C3q .req q30 + C4q .req q31 + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req v26 + E1 .req v26 + E2 .req v26 + E3 .req v26 + E4 .req v26 + + E0q .req q26 + E1q .req q26 + E2q .req q26 + E3q .req q26 + E4q .req q26 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + // vBgi .req v27 + // vBgo .req v28 + // vBga .req v29 + // vBge .req v30 + // vBgu .req v31 + vBki .req v27 + vBko .req v28 + vBka .req v29 + vBke .req v30 + vBku .req v31 + vBmu .req v27 + vBmo .req v28 + vBmi .req v29 + vBma .req v30 + vBme .req v31 + vBba .req v27 + vBbi .req v28 + vBbo .req v29 + vBbu .req v30 + vBbe .req v31 + vBsa .req v27 + vBso .req v28 + vBse .req v29 + vBsi .req v30 + vBsu .req v31 + + // vBgiq .req q27 + // vBgoq .req q28 + // vBgaq .req q29 + // vBgeq .req q30 + // vBguq .req q31 + vBkiq .req q27 + vBkoq .req q28 + vBkaq .req q29 + vBkeq .req q30 + vBkuq .req q31 + vBmuq .req q27 + vBmoq .req q28 + vBmiq .req q29 + vBmaq .req q30 + vBmeq .req q31 + vBbaq .req q27 + vBbiq .req q28 + vBboq 
.req q29 + vBbuq .req q30 + vBbeq .req q31 + vBsaq .req q27 + vBsoq .req q28 + vBseq .req q29 + vBsiq .req q30 + vBsuq .req q31 + + vEgu .req Agu + vEga .req v26 + vEge .req v26 + vEgi .req v26 + vEgo .req v26 + vEka .req Aka + vEko .req Ako + vEke .req v26 + vEki .req v26 + vEku .req v26 + vEma .req v26 + vEme .req Ame + vEmi .req Ami + vEmo .req v26 + vEmu .req Amu + vEba .req Aba + vEbe .req Abe + vEbi .req v26 + vEbo .req Abo + vEbu .req Abu + vEsa .req Asa + vEse .req Ase + vEsi .req Asi + vEso .req Aso + vEsu .req Asu + + vEguq .req Aguq + vEgaq .req q26 + vEgeq .req q26 + vEgiq .req q26 + vEgoq .req q26 + vEkaq .req Akaq + vEkoq .req Akoq + vEkeq .req q26 + vEkiq .req q26 + vEkuq .req q26 + vEmaq .req q26 + vEmeq .req Ameq + vEmiq .req Amiq + vEmoq .req q26 + vEmuq .req Amuq + vEbaq .req Abaq + vEbeq .req Abeq + vEbiq .req q26 + vEboq .req Aboq + vEbuq .req Abuq + vEsaq .req Asaq + vEseq .req Aseq + vEsiq .req Asiq + vEsoq .req Asoq + vEsuq .req Asuq + +/************************ MACROS ****************************/ + +.macro load_input + ldp Abaq, Abeq, [input_addr, #(2*8*0)] + ldp Abiq, Aboq, [input_addr, #(2*8*2)] + ldp Abuq, Agaq, [input_addr, #(2*8*4)] + ldp Ageq, Agiq, [input_addr, #(2*8*6)] + ldp Agoq, Aguq, [input_addr, #(2*8*8)] + ldp Akaq, Akeq, [input_addr, #(2*8*10)] + ldp Akiq, Akoq, [input_addr, #(2*8*12)] + ldp Akuq, Amaq, [input_addr, #(2*8*14)] + ldp Ameq, Amiq, [input_addr, #(2*8*16)] + ldp Amoq, Amuq, [input_addr, #(2*8*18)] + ldp Asaq, Aseq, [input_addr, #(2*8*20)] + ldp Asiq, Asoq, [input_addr, #(2*8*22)] + ldr Asuq, [input_addr, #(2*8*24)] +.endm + +.macro store_input + str Abaq, [input_addr, #(2*8*0)] + str Abeq, [input_addr, #(2*8*1)] + str Abiq, [input_addr, #(2*8*2)] + str Aboq, [input_addr, #(2*8*3)] + str Abuq, [input_addr, #(2*8*4)] + str Agaq, [input_addr, #(2*8*5)] + str Ageq, [input_addr, #(2*8*6)] + str Agiq, [input_addr, #(2*8*7)] + str Agoq, [input_addr, #(2*8*8)] + str Aguq, [input_addr, #(2*8*9)] + str Akaq, [input_addr, 
#(2*8*10)] + str Akeq, [input_addr, #(2*8*11)] + str Akiq, [input_addr, #(2*8*12)] + str Akoq, [input_addr, #(2*8*13)] + str Akuq, [input_addr, #(2*8*14)] + str Amaq, [input_addr, #(2*8*15)] + str Ameq, [input_addr, #(2*8*16)] + str Amiq, [input_addr, #(2*8*17)] + str Amoq, [input_addr, #(2*8*18)] + str Amuq, [input_addr, #(2*8*19)] + str Asaq, [input_addr, #(2*8*20)] + str Aseq, [input_addr, #(2*8*21)] + str Asiq, [input_addr, #(2*8*22)] + str Asoq, [input_addr, #(2*8*23)] + str Asuq, [input_addr, #(2*8*24)] +.endm + +#define STACK_SIZE (16*4 + 16*30) +#define STACK_BASE_VREGS 0 +#define STACK_BASE_TMP 16*4 + +#define E0_offset 0 +#define E1_offset 1 +#define E2_offset 2 +#define E3_offset 3 +#define E4_offset 4 + +#define Aba_offset (5 + 0 ) +#define Abe_offset (5 + 1 ) +#define Abi_offset (5 + 2 ) +#define Abo_offset (5 + 3 ) +#define Abu_offset (5 + 4 ) +#define Aga_offset (5 + 5 ) +#define Age_offset (5 + 6 ) +#define Agi_offset (5 + 7 ) +#define Ago_offset (5 + 8 ) +#define Agu_offset (5 + 9 ) +#define Aka_offset (5 + 10 ) +#define Ake_offset (5 + 11 ) +#define Aki_offset (5 + 12 ) +#define Ako_offset (5 + 13 ) +#define Aku_offset (5 + 14 ) +#define Ama_offset (5 + 15 ) +#define Ame_offset (5 + 16 ) +#define Ami_offset (5 + 17 ) +#define Amo_offset (5 + 18 ) +#define Amu_offset (5 + 19 ) +#define Asa_offset (5 + 20 ) +#define Ase_offset (5 + 21 ) +#define Asi_offset (5 + 22 ) +#define Aso_offset (5 + 23 ) +#define Asu_offset (5 + 24 ) + +#define vEba_offset (5 + 0 ) +#define vEbe_offset (5 + 1 ) +#define vEbi_offset (5 + 2 ) +#define vEbo_offset (5 + 3 ) +#define vEbu_offset (5 + 4 ) +#define vEga_offset (5 + 5 ) +#define vEge_offset (5 + 6 ) +#define vEgi_offset (5 + 7 ) +#define vEgo_offset (5 + 8 ) +#define vEgu_offset (5 + 9 ) +#define vEka_offset (5 + 10 ) +#define vEke_offset (5 + 11 ) +#define vEki_offset (5 + 12 ) +#define vEko_offset (5 + 13 ) +#define vEku_offset (5 + 14 ) +#define vEma_offset (5 + 15 ) +#define vEme_offset (5 + 16 ) +#define 
vEmi_offset (5 + 17 ) +#define vEmo_offset (5 + 18 ) +#define vEmu_offset (5 + 19 ) +#define vEsa_offset (5 + 20 ) +#define vEse_offset (5 + 21 ) +#define vEsi_offset (5 + 22 ) +#define vEso_offset (5 + 23 ) +#define vEsu_offset (5 + 24 ) + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +/* Macros emulating the v8.4-A SHA-3 instructions (EOR3, RAX1, XAR, BCAX) with plain Neon instructions (eor/add/shl/sri/bic); no SHA-3 instruction is emitted here. */ + +.macro eor3_m1_0 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m1_1 d s0 s1 s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro eor3_m1 d s0 s1 s2 /* d = s0 ^ s1 ^ s2, done as two eor steps */ + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +.macro rax1_m1 d s0 s1 /* d = s0 ^ rotl(s1, 1) in each 64-bit lane; clobbers the register aliased as tmp */ + add tmp.2d, \s1\().2d, \s1\().2d + sri tmp.2d, \s1\().2d, #63 + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +.macro xar_m1 d s0 s1 imm /* d = rotr(s0 ^ s1, imm) in each 64-bit lane; s0 is overwritten with s0 ^ s1 */ + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) +.endm + +.macro xar_m1_0 d s0 s1 imm tmp + eor \tmp\().16b, \s0\().16b, \s1\().16b +.endm + +.macro xar_m1_1 d s0 s1 imm tmp + shl \d\().2d, \tmp\().2d, #(64-\imm) +.endm + +.macro xar_m1_2 d s0 s1 imm tmp + sri \d\().2d, \tmp\().2d, #(\imm) +.endm + +.macro bcax_m1 d s0 s1 s2 /* d = s0 ^ (s1 & ~s2); clobbers the register aliased as tmp */ + bic tmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +.macro
refresh d + mov \d\().16b, \d\().16b +.endm +/* Keccak-f1600 round */ + +.macro keccak_f1600_round_pre + eor2 C0, Aka, Aga + eor2 C1, Ake, Age + eor2 C2, Aki, Agi + eor2 C3, Ako, Ago + eor2 C4, Aku, Agu + eor2 C0, C0, Ama + eor2 C1, C1, Ame + eor2 C2, C2, Ami + eor2 C3, C3, Amo + eor2 C4, C4, Amu + eor2 C0, C0, Asa + eor2 C1, C1, Ase + eor2 C2, C2, Asi + eor2 C3, C3, Aso + eor2 C4, C4, Asu + eor2 C0, C0, Aba + eor2 C1, C1, Abe + eor2 C2, C2, Abi + eor2 C3, C3, Abo + eor2 C4, C4, Abu +.endm + +.macro keccak_f1600_round_core + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req v25 + + .unreq E0 + .unreq E1 + .unreq E2 + .unreq E3 + .unreq E4 + .unreq E0q + .unreq E1q + .unreq E2q + .unreq E3q + .unreq E4q + + E1 .req v26 + E3 .req C2 + E0 .req C4 + E2 .req C1 + E4 .req C3 + + E1q .req q26 + E3q .req C2q + E0q .req C4q + E2q .req C1q + E4q .req C3q + + rax1_m1 E1, C0, C2 SEP save(E1) + rax1_m1 E3, C2, C4 SEP save(E3) + rax1_m1 E0, C4, C1 SEP save(E0) + rax1_m1 E2, C1, C3 SEP save(E2) + rax1_m1 E4, C3, C0 SEP save(E4) + + vBgi .req E0 + vBgo .req v27 + vBga .req E3 + vBge .req E4 + vBgu .req E2 + + xar_m1 vBgi, Aka, E0, 61 SEP + xar_m1 vBgo, Ame, E1, 19 SEP + xar_m1 vBga, Abo, E3, 36 SEP + xar_m1 vBge, Agu, E4, 44 SEP + xar_m1 vBgu, Asi, E2, 3 SEP + + bcax_m1 vEga, vBga, vBgi, vBge SEP save(vEga) + bcax_m1 vEge, vBge, vBgo, vBgi SEP save(vEge) + bcax_m1 vEgi, vBgi, vBgu, vBgo SEP save(vEgi) + bcax_m1 vEgo, vBgo, vBga, vBgu SEP save(vEgo) + bcax_m1 vEgu, vBgu, vBge, vBga + + .unreq E0 + .unreq E1 + .unreq E2 + .unreq E3 + .unreq E4 + .unreq E0q + .unreq E1q + .unreq E2q + .unreq E3q + .unreq E4q + + E0 .req v26 + E1 .req v26 + E2 .req v26 + E3 .req v26 + E4 .req v26 + E0q .req q26 + E1q .req q26 + E2q .req q26 + E3q .req q26 + E4q .req q26 + + restore(E3) + xar_m1 vBki, Ako, E3, 39 SEP restore(E4) + xar_m1 vBko, Amu, E4, 56 SEP restore(E1) + xar_m1 vBka, Abe, E1, 63 SEP restore(E2) + xar_m1 vBke, Agi, E2, 58 SEP restore(E0) + xar_m1 vBku, Asa, E0, 46 + + bcax_m1 
vEke, vBke, vBko, vBki SEP save(vEke) + bcax_m1 vEki, vBki, vBku, vBko SEP save(vEki) + bcax_m1 vEku, vBku, vBke, vBka SEP save(vEku) + bcax_m1 vEko, vBko, vBka, vBku + bcax_m1 vEka, vBka, vBki, vBke + + restore(E3) + xar_m1 vBmu, Aso, E3, 8 SEP restore(E2) + xar_m1 vBmo, Ami, E2, 49 SEP restore(E1) + xar_m1 vBmi, Ake, E1, 54 SEP restore(E4) + xar_m1 vBma, Abu, E4, 37 SEP restore(E0) + xar_m1 vBme, Aga, E0, 28 + + bcax_m1 vEma, vBma, vBmi, vBme SEP save(vEma) + bcax_m1 vEmo, vBmo, vBma, vBmu SEP save(vEmo) + bcax_m1 vEme, vBme, vBmo, vBmi + bcax_m1 vEmi, vBmi, vBmu, vBmo + bcax_m1 vEmu, vBmu, vBme, vBma + + restore(E0) + eor2 vBba, Aba, E0 SEP restore(E2) + xar_m1 vBbi, Aki, E2, 21 SEP restore(E3) + xar_m1 vBbo, Amo, E3, 43 SEP restore(E4) + xar_m1 vBbu, Asu, E4, 50 SEP restore(E1) + xar_m1 vBbe, Age, E1, 20 + + bcax_m1 vEbi, vBbi, vBbu, vBbo SEP save(vEbi) + bcax_m1 vEba, vBba, vBbi, vBbe + ld1r {tmp.2d}, [const_addr], #8 + eor2 vEba, vEba, tmp + bcax_m1 vEbe, vBbe, vBbo, vBbi + bcax_m1 vEbo, vBbo, vBba, vBbu + bcax_m1 vEbu, vBbu, vBbe, vBba + + restore(E2) + xar_m1 vBsa, Abi, E2, 2 SEP restore(E0) + xar_m1 vBso, Ama, E0, 23 SEP restore(E3) + xar_m1 vBse, Ago, E3, 9 SEP restore(E4) + xar_m1 vBsi, Aku, E4, 25 SEP restore(E1) + xar_m1 vBsu, Ase, E1, 62 + + bcax_m1 vEsa, vBsa, vBsi, vBse SEP restore(Amo) + bcax_m1 vEse, vBse, vBso, vBsi SEP restore(Agi) + bcax_m1 vEsi, vBsi, vBsu, vBso SEP restore(Abi) + bcax_m1 vEso, vBso, vBsa, vBsu SEP restore(Ake) + bcax_m1 vEsu, vBsu, vBse, vBsa SEP restore(Aki) + + restore(Age) + restore(Aku) + restore(Ama) + restore(Aga) + restore(Ago) + + eor2 C3, Ako, Ago + eor2 C0, Aka, Aga + eor2 C1, Ake, Age + eor2 C2, Aki, Agi + eor2 C4, Aku, Agu + + eor2 C0, C0, Ama + eor2 C1, C1, Ame + eor2 C2, C2, Ami + eor2 C3, C3, Amo + eor2 C4, C4, Amu + + eor2 C0, C0, Aba + eor2 C1, C1, Abe + eor2 C2, C2, Abi + eor2 C3, C3, Abo + eor2 C4, C4, Abu + + eor2 C0, C0, Asa + eor2 C1, C1, Ase + eor2 C2, C2, Asi + eor2 C3, C3, Aso + eor2 C4, C4, Asu + + 
.unreq tmp + +.endm + +.macro keccak_f1600_round_post + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req v25 + + .unreq E0 + .unreq E1 + .unreq E2 + .unreq E3 + .unreq E4 + .unreq E0q + .unreq E1q + .unreq E2q + .unreq E3q + .unreq E4q + + E1 .req v26 + E3 .req C2 + E0 .req C4 + E2 .req C1 + E4 .req C3 + + E1q .req q26 + E3q .req C2q + E0q .req C4q + E2q .req C1q + E4q .req C3q + + rax1_m1 E1, C0, C2 SEP save(E1) + rax1_m1 E3, C2, C4 SEP save(E3) + rax1_m1 E0, C4, C1 SEP save(E0) + rax1_m1 E2, C1, C3 SEP save(E2) + rax1_m1 E4, C3, C0 SEP save(E4) + + .unreq vBgi + .unreq vBgo + .unreq vBga + .unreq vBge + .unreq vBgu + vBgi .req E0 + vBgo .req v27 + vBga .req E3 + vBge .req E4 + vBgu .req E2 + + xar_m1 vBgi, Aka, E0, 61 SEP + xar_m1 vBgo, Ame, E1, 19 SEP + xar_m1 vBga, Abo, E3, 36 SEP + xar_m1 vBge, Agu, E4, 44 SEP + xar_m1 vBgu, Asi, E2, 3 SEP + + bcax_m1 vEga, vBga, vBgi, vBge SEP save(vEga) + bcax_m1 vEge, vBge, vBgo, vBgi SEP save(vEge) + bcax_m1 vEgi, vBgi, vBgu, vBgo SEP save(vEgi) + bcax_m1 vEgo, vBgo, vBga, vBgu SEP save(vEgo) + bcax_m1 vEgu, vBgu, vBge, vBga + + .unreq E0 + .unreq E1 + .unreq E2 + .unreq E3 + .unreq E4 + .unreq E0q + .unreq E1q + .unreq E2q + .unreq E3q + .unreq E4q + + E0 .req v26 + E1 .req v26 + E2 .req v26 + E3 .req v26 + E4 .req v26 + E0q .req q26 + E1q .req q26 + E2q .req q26 + E3q .req q26 + E4q .req q26 + + restore(E3) + xar_m1 vBki, Ako, E3, 39 SEP restore(E4) + xar_m1 vBko, Amu, E4, 56 SEP restore(E1) + xar_m1 vBka, Abe, E1, 63 SEP restore(E2) + xar_m1 vBke, Agi, E2, 58 SEP restore(E0) + xar_m1 vBku, Asa, E0, 46 + + bcax_m1 vEke, vBke, vBko, vBki SEP save(vEke) + bcax_m1 vEki, vBki, vBku, vBko SEP save(vEki) + bcax_m1 vEku, vBku, vBke, vBka SEP save(vEku) + bcax_m1 vEko, vBko, vBka, vBku + bcax_m1 vEka, vBka, vBki, vBke + + restore(E3) + xar_m1 vBmu, Aso, E3, 8 SEP restore(E2) + xar_m1 vBmo, Ami, E2, 49 SEP restore(E1) + xar_m1 vBmi, Ake, E1, 54 SEP restore(E4) + xar_m1 vBma, Abu, E4, 37 SEP restore(E0) + xar_m1 vBme, Aga, 
E0, 28 + + bcax_m1 vEma, vBma, vBmi, vBme SEP save(vEma) + bcax_m1 vEmo, vBmo, vBma, vBmu SEP save(vEmo) + bcax_m1 vEme, vBme, vBmo, vBmi + bcax_m1 vEmi, vBmi, vBmu, vBmo + bcax_m1 vEmu, vBmu, vBme, vBma + + restore(E0) + eor2 vBba, Aba, E0 SEP restore(E2) + xar_m1 vBbi, Aki, E2, 21 SEP restore(E3) + xar_m1 vBbo, Amo, E3, 43 SEP restore(E4) + xar_m1 vBbu, Asu, E4, 50 SEP restore(E1) + xar_m1 vBbe, Age, E1, 20 + + bcax_m1 vEbi, vBbi, vBbu, vBbo SEP save(vEbi) + bcax_m1 vEba, vBba, vBbi, vBbe + ld1r {tmp.2d}, [const_addr], #8 + eor2 vEba, vEba, tmp + bcax_m1 vEbe, vBbe, vBbo, vBbi + bcax_m1 vEbo, vBbo, vBba, vBbu + bcax_m1 vEbu, vBbu, vBbe, vBba + + restore(E2) + xar_m1 vBsa, Abi, E2, 2 SEP restore(E0) + xar_m1 vBso, Ama, E0, 23 SEP restore(E3) + xar_m1 vBse, Ago, E3, 9 SEP restore(E4) + xar_m1 vBsi, Aku, E4, 25 SEP restore(E1) + xar_m1 vBsu, Ase, E1, 62 + + bcax_m1 vEsa, vBsa, vBsi, vBse SEP restore(Amo) + bcax_m1 vEse, vBse, vBso, vBsi SEP restore(Agi) + bcax_m1 vEsi, vBsi, vBsu, vBso SEP restore(Abi) + bcax_m1 vEso, vBso, vBsa, vBsu SEP restore(Ake) + bcax_m1 vEsu, vBsu, vBse, vBsa SEP restore(Aki) + + restore(Age) + restore(Aku) + restore(Ama) + restore(Aga) + restore(Ago) + + .unreq tmp + +.endm + + +.text +.align 4 +.global keccak_f1600_x2_v84a_asm_v2p2 +.global _keccak_f1600_x2_v84a_asm_v2p2 + +#define KECCAK_F1600_ROUNDS 24 + +keccak_f1600_x2_v84a_asm_v2p2: +_keccak_f1600_x2_v84a_asm_v2p2: + alloc_stack + save_vregs + load_constant_ptr + load_input + + //mov count, #(KECCAK_F1600_ROUNDS-2) + mov count, #11 + keccak_f1600_round_pre +loop: + keccak_f1600_round_core + keccak_f1600_round_core + sub count, count, #1 + cbnz count, loop + + keccak_f1600_round_core + keccak_f1600_round_post + store_input + restore_vregs + free_stack + ret diff --git a/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2p3.s b/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2p3.s new file mode 100644 index 0000000..e83d5ce --- /dev/null +++ 
b/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2p3.s @@ -0,0 +1,773 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round.
*/ + Aba .req v0 + Abe .req v1 + Abo .req v2 + Abu .req v3 + Agu .req v4 + Aka .req v5 + Ako .req v6 + Ame .req v7 + Ami .req v8 + Amu .req v9 + Asa .req v10 + Ase .req v11 + Asi .req v12 + Aso .req v13 + Asu .req v14 + + Agi .req v15 + Ake .req v16 + Aga .req v17 + Aki .req v18 + + Abi .req v19 + Ama .req v20 + Ago .req v21 + Aku .req v22 + Age .req v23 + Amo .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Aboq .req q2 + Abuq .req q3 + Aguq .req q4 + Akaq .req q5 + Akoq .req q6 + Ameq .req q7 + Amiq .req q8 + Amuq .req q9 + Asaq .req q10 + Aseq .req q11 + Asiq .req q12 + Asoq .req q13 + Asuq .req q14 + + Agiq .req q15 + Akeq .req q16 + Agaq .req q17 + Akiq .req q18 + + Abiq .req q19 + Amaq .req q20 + Agoq .req q21 + Akuq .req q22 + Ageq .req q23 + Amoq .req q24 + + spare0 .req v25 + spare1 .req v26 + spare2 .req v27 + spare3 .req v28 + spare4 .req v29 + spare5 .req v30 + spare0q .req q25 + spare1q .req q26 + spare2q .req q27 + spare3q .req q28 + spare4q .req q29 + spare5q .req q30 + + vEgu .req Agu /* keep */ + vEga .req spare0 /* out */ + vEge .req spare1 /* out */ + vEgi .req spare2 /* out */ + vEgo .req spare3 /* out */ + + vEka .req Aka /* keep */ + vEko .req Ako /* keep */ + vEke .req spare4 /* out */ + vEki .req spare5 /* out */ + vEku .req Agi /* in */ + + vEma .req Ake /* in */ + vEme .req Ame /* keep */ + vEmi .req Ami /* keep */ + vEmo .req Aga /* in */ + vEmu .req Amu /* keep */ + + vEba .req Aba /* keep */ + vEbe .req Abe /* keep */ + vEbi .req Aki /* in */ + vEbo .req Abo /* keep */ + vEbu .req Abu /* keep */ + + vEsa .req Asa /* keep */ + vEse .req Ase /* keep */ + vEsi .req Asi /* keep */ + vEso .req Aso /* keep */ + vEsu .req Asu /* keep */ + + vEguq .req Aguq + vEgaq .req spare0q + vEgeq .req spare1q + vEgiq .req spare2q + vEgoq .req spare3q + + vEkaq .req Akaq + vEkoq .req Akoq + vEkeq .req spare4q + vEkiq .req spare5q + vEkuq .req Agiq + + vEmaq .req Akeq + vEmeq .req Ameq + vEmiq .req Amiq + vEmoq .req Agaq + 
vEmuq .req Amuq + + vEbaq .req Abaq + vEbeq .req Abeq + vEbiq .req Akiq + vEboq .req Aboq + vEbuq .req Abuq + + vEsaq .req Asaq + vEseq .req Aseq + vEsiq .req Asiq + vEsoq .req Asoq + vEsuq .req Asuq + + tmp .req v31 + tmpq .req q31 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req spare0 + C1 .req spare1 + C2 .req spare2 + C3 .req spare3 + C4 .req spare4 + C0q .req spare0q + C1q .req spare1q + C2q .req spare2q + C3q .req spare3q + C4q .req spare4q + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + + // Registers used during computation time + E1c .req spare5 + E3c .req C2 + E0c .req C4 + E2c .req C1 + E4c .req C3 + + E1cq .req spare5q + E3cq .req C2q + E0cq .req C4q + E2cq .req C1q + E4cq .req C3q + + // Registers during use time + E0u .req tmp + E1u .req tmp + E2u .req tmp + E3u .req tmp + E4u .req tmp + + E0uq .req tmpq + E1uq .req tmpq + E2uq .req tmpq + E3uq .req tmpq + E4uq .req tmpq + + vBgo .req E1c + vBgi .req Ame + vBga .req Aka + vBge .req Abo + vBgu .req Agu + + vBko .req Ame + vBka .req Amu + vBke .req Abe + vBku .req Agi + vBki .req Asa + + vBmu .req Abo + vBmo .req Aso + vBmi .req Abu + vBma .req Asi + vBme .req Abe + + vBba .req Asi + vBbi .req Asa + vBbo .req Aso + vBbu .req Amo + vBbe .req Asu + + vBsa .req Amo + vBso .req Abi + vBse .req Ama + vBsi .req Ago + vBsu .req Aku + +/************************ MACROS ****************************/ + +.macro load_input + ldp Abaq, Abeq, [input_addr, #(2*8*0)] + ldp Abiq, Aboq, [input_addr, #(2*8*2)] + ldp Abuq, Agaq, [input_addr, #(2*8*4)] + ldp Ageq, Agiq, [input_addr, #(2*8*6)] + ldp Agoq, Aguq, [input_addr, #(2*8*8)] + ldp Akaq, Akeq, [input_addr, #(2*8*10)] + ldp Akiq, Akoq, [input_addr, #(2*8*12)] + ldp Akuq, Amaq, [input_addr, #(2*8*14)] + ldp Ameq, Amiq, [input_addr, #(2*8*16)] + ldp Amoq, Amuq, [input_addr, #(2*8*18)] + ldp Asaq, Aseq, [input_addr, #(2*8*20)] + ldp Asiq, Asoq, [input_addr, #(2*8*22)] + ldr Asuq, [input_addr, #(2*8*24)] +.endm + 
+.macro store_input + str Abaq, [input_addr, #(2*8*0)] + str Abeq, [input_addr, #(2*8*1)] + str Abiq, [input_addr, #(2*8*2)] + str Aboq, [input_addr, #(2*8*3)] + str Abuq, [input_addr, #(2*8*4)] + str Agaq, [input_addr, #(2*8*5)] + str Ageq, [input_addr, #(2*8*6)] + str Agiq, [input_addr, #(2*8*7)] + str Agoq, [input_addr, #(2*8*8)] + str Aguq, [input_addr, #(2*8*9)] + str Akaq, [input_addr, #(2*8*10)] + str Akeq, [input_addr, #(2*8*11)] + str Akiq, [input_addr, #(2*8*12)] + str Akoq, [input_addr, #(2*8*13)] + str Akuq, [input_addr, #(2*8*14)] + str Amaq, [input_addr, #(2*8*15)] + str Ameq, [input_addr, #(2*8*16)] + str Amiq, [input_addr, #(2*8*17)] + str Amoq, [input_addr, #(2*8*18)] + str Amuq, [input_addr, #(2*8*19)] + str Asaq, [input_addr, #(2*8*20)] + str Aseq, [input_addr, #(2*8*21)] + str Asiq, [input_addr, #(2*8*22)] + str Asoq, [input_addr, #(2*8*23)] + str Asuq, [input_addr, #(2*8*24)] +.endm + +#define STACK_SIZE (16*4 + 16*30) +#define STACK_BASE_VREGS 0 +#define STACK_BASE_TMP 16*4 + +#define E0c_offset 0 +#define E1c_offset 1 +#define E2c_offset 2 +#define E3c_offset 3 +#define E4c_offset 4 +#define E0u_offset 0 +#define E1u_offset 1 +#define E2u_offset 2 +#define E3u_offset 3 +#define E4u_offset 4 + +#define Aba_offset (5 + 0 ) +#define Abe_offset (5 + 1 ) +#define Abi_offset (5 + 2 ) +#define Abo_offset (5 + 3 ) +#define Abu_offset (5 + 4 ) +#define Aga_offset (5 + 5 ) +#define Age_offset (5 + 6 ) +#define Agi_offset (5 + 7 ) +#define Ago_offset (5 + 8 ) +#define Agu_offset (5 + 9 ) +#define Aka_offset (5 + 10 ) +#define Ake_offset (5 + 11 ) +#define Aki_offset (5 + 12 ) +#define Ako_offset (5 + 13 ) +#define Aku_offset (5 + 14 ) +#define Ama_offset (5 + 15 ) +#define Ame_offset (5 + 16 ) +#define Ami_offset (5 + 17 ) +#define Amo_offset (5 + 18 ) +#define Amu_offset (5 + 19 ) +#define Asa_offset (5 + 20 ) +#define Ase_offset (5 + 21 ) +#define Asi_offset (5 + 22 ) +#define Aso_offset (5 + 23 ) +#define Asu_offset (5 + 24 ) + +#define vEba_offset 
(5 + 0 ) +#define vEbe_offset (5 + 1 ) +#define vEbi_offset (5 + 2 ) +#define vEbo_offset (5 + 3 ) +#define vEbu_offset (5 + 4 ) +#define vEga_offset (5 + 5 ) +#define vEge_offset (5 + 6 ) +#define vEgi_offset (5 + 7 ) +#define vEgo_offset (5 + 8 ) +#define vEgu_offset (5 + 9 ) +#define vEka_offset (5 + 10 ) +#define vEke_offset (5 + 11 ) +#define vEki_offset (5 + 12 ) +#define vEko_offset (5 + 13 ) +#define vEku_offset (5 + 14 ) +#define vEma_offset (5 + 15 ) +#define vEme_offset (5 + 16 ) +#define vEmi_offset (5 + 17 ) +#define vEmo_offset (5 + 18 ) +#define vEmu_offset (5 + 19 ) +#define vEsa_offset (5 + 20 ) +#define vEse_offset (5 + 21 ) +#define vEsi_offset (5 + 22 ) +#define vEso_offset (5 + 23 ) +#define vEsu_offset (5 + 24 ) + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +/* Macros emulating the v8.4-A SHA-3 instructions (EOR3, RAX1, XAR, BCAX) with plain Neon instructions; no SHA-3 instruction is emitted here. */ + +.macro eor3_m1_0 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 /* d = s0 ^ s1 over the full 128-bit register */ + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor5 out i0 i1 i2 i3 i4 tmp /* out = i0 ^ i1 ^ i2 ^ i3 ^ i4; the register passed as tmp is overwritten with i3 ^ i4 */ + eor2 \out, \i0, \i1 + eor2 \tmp, \i3, \i4 + eor2 \out, \out, \i2 + eor2 \out, \out, \tmp +.endm + +.macro move d s /* full 128-bit register copy d = s */ + mov \d\().16b, \s\().16b +.endm + + +.macro eor3_m1_1 d s0 s1 s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro eor3_m1 d s0 s1 s2 + eor3_m1_0 \d, \s0, \s1,
\s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm +/* rax1_m1: d = s0 ^ rol64(s1, 1) in each 64-bit lane; overwrites tmp */ +.macro rax1_m1 d s0 s1 + add tmp.2d, \s1\().2d, \s1\().2d + sri tmp.2d, \s1\().2d, #63 + eor \d\().16b, tmp.16b, \s0\().16b +.endm +/* xar_m1: d = ror64(s0 ^ s1, imm) in each 64-bit lane. s0 is overwritten with s0 ^ s1; d is written by shl before sri reads s0, so d must be a different register from s0. */ +.macro xar_m1 d s0 s1 imm + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) +.endm +/* xar_m1_0 / _1 / _2: the three steps of xar_m1 as separate macros, going through the tmp argument instead of overwriting s0 */ +.macro xar_m1_0 d s0 s1 imm tmp + eor \tmp\().16b, \s0\().16b, \s1\().16b +.endm + +.macro xar_m1_1 d s0 s1 imm tmp + shl \d\().2d, \tmp\().2d, #(64-\imm) +.endm + +.macro xar_m1_2 d s0 s1 imm tmp + sri \d\().2d, \tmp\().2d, #(\imm) +.endm +/* bcax_m1: d = s0 ^ (s1 & ~s2); overwrites tmp */ +.macro bcax_m1 d s0 s1 s2 + bic tmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, tmp.16b, \s0\().16b +.endm +/* refresh: copy register d onto itself */ +.macro refresh d + mov \d\().16b, \d\().16b +.endm +/* Keccak-f1600 round */ +/* keccak_f1600_round_pre: C0..C4 = XOR of the five state registers whose names end in a, e, i, o, u respectively (e.g. C0 = Aka ^ Aga ^ Ama ^ Asa ^ Aba) */ +.macro keccak_f1600_round_pre + eor2 C0, Aka, Aga + eor2 C1, Ake, Age + eor2 C2, Aki, Agi + eor2 C3, Ako, Ago + eor2 C4, Aku, Agu + eor2 C0, C0, Ama + eor2 C1, C1, Ame + eor2 C2, C2, Ami + eor2 C3, C3, Amo + eor2 C4, C4, Amu + eor2 C0, C0, Asa + eor2 C1, C1, Ase + eor2 C2, C2, Asi + eor2 C3, C3, Aso + eor2 C4, C4, Asu + eor2 C0, C0, Aba + eor2 C1, C1, Abe + eor2 C2, C2, Abi + eor2 C3, C3, Abo + eor2 C4, C4, Abu +.endm +/* keccak_f1600_round_core: one round. Expects C0..C4 to be set on entry and ends by recomputing them (eor5) for the following round. */ +.macro keccak_f1600_round_core + + /* 5x RAX1, 15 Neon Instructions total */ + /* each E*c is also stored to its stack slot so it can be reloaded later through restore(E*u) */ + rax1_m1 E1c, C0, C2 SEP save(E1c) + rax1_m1 E3c, C2, C4 SEP save(E3c) + rax1_m1 E0c, C4, C1 SEP save(E0c) + rax1_m1 E2c, C1, C3 SEP save(E2c) + rax1_m1 E4c, C3, C0 SEP save(E4c) + /* xar_m1 overwrites its second operand, so the state registers consumed below no longer hold state afterwards */ + xar_m1 vBgo, Ame /* used at block 3 */, E1c, 19 + xar_m1 vBgi, Aka /* used at block 2 */, E0c, 61 + xar_m1 vBga, Abo /* used at block 4 */, E3c, 36 + xar_m1 vBge, Agu /* used at block 1 */, E4c, 44 + xar_m1 vBgu, Asi /* used at block 5 */, E2c, 3 + + bcax_m1 vEga, vBga, vBgi, vBge SEP save(vEga) /* TEMP */ + bcax_m1 vEge, vBge, vBgo, vBgi + bcax_m1 vEgi, vBgi, vBgu, vBgo SEP save(vEgi) /* TEMP */ + bcax_m1 vEgo, vBgo, vBga, vBgu + bcax_m1 vEgu, vBgu, vBge, vBga + + restore(E4u) + xar_m1 vBko, Amu /* used at block 3 */, E4u, 56 SEP restore(E1u)
+ xar_m1 vBka, Abe /* used at block 4 */, E1u, 63 SEP restore(E2u) + xar_m1 vBke, Agi /* not used */, E2u, 58 SEP restore(E0u) + xar_m1 vBku, Asa /* used at block 5 */, E0u, 46 SEP restore(E3u) + xar_m1 vBki, Ako /* used at block 2 */, E3u, 39 + + bcax_m1 vEke, vBke, vBko, vBki SEP save(vEke) /* TEMP */ + bcax_m1 vEki, vBki, vBku, vBko SEP save(vEki) /* TEMP */ + bcax_m1 vEku, vBku, vBke, vBka + bcax_m1 vEko, vBko, vBka, vBku + bcax_m1 vEka, vBka, vBki, vBke + + // Can use: Abo, Asi, Abe, Asa; Abu, Aso + SEP restore(E3u) + xar_m1 vBmu, Aso /* used at block 5 */, E3u, 8 SEP restore(E4u) + xar_m1 vBma, Abu /* used at block 4 */, E4u, 37 SEP restore(E2u) + xar_m1 vBmo, Ami /* used at block 3 */, E2u, 49 SEP restore(E1u) + xar_m1 vBmi, Ake /* not used */, E1u, 54 SEP restore(E0u) + xar_m1 vBme, Aga /* not used */, E0u, 28 + + bcax_m1 vEma, vBma, vBmi, vBme + bcax_m1 vEmo, vBmo, vBma, vBmu + bcax_m1 vEme, vBme, vBmo, vBmi + bcax_m1 vEmi, vBmi, vBmu, vBmo + bcax_m1 vEmu, vBmu, vBme, vBma + + // Can use: Asi, Asa, Aso, Asu, Amo + restore(E0u) + eor2 vBba, Aba /* used at block 4 */, E0u SEP restore(E2u) + xar_m1 vBbi, Aki /* not used */, E2u, 21 SEP restore(E3u) + xar_m1 vBbo, Amo /* not used */, E3u, 43 SEP restore(E4u) + xar_m1 vBbu, Asu /* used at block 5 */, E4u, 50 SEP restore(E1u) + xar_m1 vBbe, Age /* not used */, E1u, 20 + + bcax_m1 vEba, vBba, vBbi, vBbe + ld1r {tmp.2d}, [const_addr], #8 + eor2 vEba, vEba, tmp + bcax_m1 vEbe, vBbe, vBbo, vBbi + bcax_m1 vEbo, vBbo, vBba, vBbu + bcax_m1 vEbu, vBbu, vBbe, vBba + bcax_m1 vEbi, vBbi, vBbu, vBbo + + // Can use: Amo, Age, Abi, Ama, Ago, Aku + restore(E2u) + xar_m1 vBsa, Abi /* not used */, E2u, 2 SEP restore(E0u) + xar_m1 vBso, Ama /* not used */, E0u, 23 SEP restore(E3u) + xar_m1 vBse, Ago /* not used */, E3u, 9 SEP restore(E4u) + xar_m1 vBsi, Aku /* not used */, E4u, 25 SEP restore(E1u) + xar_m1 vBsu, Ase /* used at block 5 */, E1u, 62 + + bcax_m1 vEsa, vBsa, vBsi, vBse + bcax_m1 vEse, vBse, vBso, vBsi + bcax_m1 vEsi, 
vBsi, vBsu, vBso + bcax_m1 vEso, vBso, vBsa, vBsu + bcax_m1 vEsu, vBsu, vBse, vBsa + + /* TODO: Unroll twice and arrange things so that after two iterations + we end up at the same allocation of state registers? */ + + /* New spare registers: + * - Abi, Ama, Ago, Aku, Age, Amo */ + + move Abi, vEbi + move Ama, vEma + move Ago, vEgo + move Aku, vEku + move Age, vEge + move Amo, vEmo + + /* Overlapping registers + * - Agi, Ake, Aga, Aki */ +// save(vEgi) +// save(vEke) +// save(vEga) +// save(vEki) + + restore(Agi) + restore(Ake) + restore(Aga) + restore(Aki) + + eor5 C0, Aka, Aga, Ama, Aba, Asa, tmp + eor5 C2, Aki, Agi, Ami, Abi, Asi, tmp + eor5 C4, Aku, Agu, Amu, Abu, Asu, tmp + eor5 C1, Ake, Age, Ame, Abe, Ase, tmp + eor5 C3, Ako, Ago, Amo, Abo, Aso, tmp + + // eor2 C3, Ako, Ago + // eor2 C0, Aka, Aga + // eor2 C1, Ake, Age + // eor2 C2, Aki, Agi + // eor2 C4, Aku, Agu + + // eor2 C0, C0, Ama + // eor2 C1, C1, Ame + // eor2 C2, C2, Ami + // eor2 C3, C3, Amo + // eor2 C4, C4, Amu + + // eor2 C0, C0, Aba + // eor2 C1, C1, Abe + // eor2 C2, C2, Abi + // eor2 C3, C3, Abo + // eor2 C4, C4, Abu + + // eor2 C0, C0, Asa + // eor2 C1, C1, Ase + // eor2 C2, C2, Asi + // eor2 C3, C3, Aso + // eor2 C4, C4, Asu + +.endm + +.macro keccak_f1600_round_post + + /* 5x RAX1, 15 Neon Instructions total */ + + rax1_m1 E1c, C0, C2 SEP save(E1c) + rax1_m1 E3c, C2, C4 SEP save(E3c) + rax1_m1 E0c, C4, C1 SEP save(E0c) + rax1_m1 E2c, C1, C3 SEP save(E2c) + rax1_m1 E4c, C3, C0 SEP save(E4c) + + xar_m1 vBgo, Ame /* used at block 3 */, E1c, 19 + xar_m1 vBgi, Aka /* used at block 2 */, E0c, 61 + xar_m1 vBga, Abo /* used at block 4 */, E3c, 36 + xar_m1 vBge, Agu /* used at block 1 */, E4c, 44 + xar_m1 vBgu, Asi /* used at block 5 */, E2c, 3 + + bcax_m1 vEga, vBga, vBgi, vBge SEP save(vEga) /* TEMP */ + bcax_m1 vEge, vBge, vBgo, vBgi + bcax_m1 vEgi, vBgi, vBgu, vBgo SEP save(vEgi) /* TEMP */ + bcax_m1 vEgo, vBgo, vBga, vBgu + bcax_m1 vEgu, vBgu, vBge, vBga + + restore(E4u) + xar_m1 vBko, Amu /* 
used at block 3 */, E4u, 56 SEP restore(E1u) + xar_m1 vBka, Abe /* used at block 4 */, E1u, 63 SEP restore(E2u) + xar_m1 vBke, Agi /* not used */, E2u, 58 SEP restore(E0u) + xar_m1 vBku, Asa /* used at block 5 */, E0u, 46 SEP restore(E3u) + xar_m1 vBki, Ako /* used at block 2 */, E3u, 39 + + bcax_m1 vEke, vBke, vBko, vBki SEP save(vEke) /* TEMP */ + bcax_m1 vEki, vBki, vBku, vBko SEP save(vEki) /* TEMP */ + bcax_m1 vEku, vBku, vBke, vBka + bcax_m1 vEko, vBko, vBka, vBku + bcax_m1 vEka, vBka, vBki, vBke + + // Can use: Abo, Asi, Abe, Asa; Abu, Aso + SEP restore(E3u) + xar_m1 vBmu, Aso /* used at block 5 */, E3u, 8 SEP restore(E4u) + xar_m1 vBma, Abu /* used at block 4 */, E4u, 37 SEP restore(E2u) + xar_m1 vBmo, Ami /* used at block 3 */, E2u, 49 SEP restore(E1u) + xar_m1 vBmi, Ake /* not used */, E1u, 54 SEP restore(E0u) + xar_m1 vBme, Aga /* not used */, E0u, 28 + + bcax_m1 vEma, vBma, vBmi, vBme + bcax_m1 vEmo, vBmo, vBma, vBmu + bcax_m1 vEme, vBme, vBmo, vBmi + bcax_m1 vEmi, vBmi, vBmu, vBmo + bcax_m1 vEmu, vBmu, vBme, vBma + + // Can use: Asi, Asa, Aso, Asu, Amo + restore(E0u) + eor2 vBba, Aba /* used at block 4 */, E0u SEP restore(E2u) + xar_m1 vBbi, Aki /* not used */, E2u, 21 SEP restore(E3u) + xar_m1 vBbo, Amo /* not used */, E3u, 43 SEP restore(E4u) + xar_m1 vBbu, Asu /* used at block 5 */, E4u, 50 SEP restore(E1u) + xar_m1 vBbe, Age /* not used */, E1u, 20 + + bcax_m1 vEba, vBba, vBbi, vBbe + ld1r {tmp.2d}, [const_addr], #8 + eor2 vEba, vEba, tmp + bcax_m1 vEbe, vBbe, vBbo, vBbi + bcax_m1 vEbo, vBbo, vBba, vBbu + bcax_m1 vEbu, vBbu, vBbe, vBba + bcax_m1 vEbi, vBbi, vBbu, vBbo + + // Can use: Amo, Age, Abi, Ama, Ago, Aku + restore(E2u) + xar_m1 vBsa, Abi /* not used */, E2u, 2 SEP restore(E0u) + xar_m1 vBso, Ama /* not used */, E0u, 23 SEP restore(E3u) + xar_m1 vBse, Ago /* not used */, E3u, 9 SEP restore(E4u) + xar_m1 vBsi, Aku /* not used */, E4u, 25 SEP restore(E1u) + xar_m1 vBsu, Ase /* used at block 5 */, E1u, 62 + + bcax_m1 vEsa, vBsa, vBsi, vBse + 
bcax_m1 vEse, vBse, vBso, vBsi + bcax_m1 vEsi, vBsi, vBsu, vBso + bcax_m1 vEso, vBso, vBsa, vBsu + bcax_m1 vEsu, vBsu, vBse, vBsa + + /* TODO: Unroll twice and arrange things so that after two iterations + we end up at the same allocation of state registers? */ + + /* New spare registers: + * - Abi, Ama, Ago, Aku, Age, Amo */ + + move Abi, vEbi + move Ama, vEma + move Ago, vEgo + move Aku, vEku + move Age, vEge + move Amo, vEmo + + /* Overlapping registers + * - Agi, Ake, Aga, Aki */ +// save(vEgi) +// save(vEke) +// save(vEga) +// save(vEki) + + restore(Agi) + restore(Ake) + restore(Aga) + restore(Aki) + +.endm + +/* Entry point, exported under two spellings (with and without a leading underscore; presumably for toolchains that prefix C symbols with '_' -- confirm). */ +.text +.align 4 +.global keccak_f1600_x2_v84a_asm_v2p3 +.global _keccak_f1600_x2_v84a_asm_v2p3 + +#define KECCAK_F1600_ROUNDS 24 + +keccak_f1600_x2_v84a_asm_v2p3: +_keccak_f1600_x2_v84a_asm_v2p3: + alloc_stack + save_vregs + load_constant_ptr + load_input +/* Round count: 11 loop iterations of two rounds each plus the two rounds after the loop, 2*11 + 2 = 24 = KECCAK_F1600_ROUNDS. round_pre supplies C0..C4 for the first round. */ + //mov count, #(KECCAK_F1600_ROUNDS-2) + mov count, #11 + keccak_f1600_round_pre +loop: + keccak_f1600_round_core + keccak_f1600_round_core + sub count, count, #1 + cbnz count, loop +/* Last two rounds. round_post is round_core without the trailing eor5 block, since no further round needs C0..C4. */ + keccak_f1600_round_core + keccak_f1600_round_post + store_input + restore_vregs + free_stack + ret diff --git a/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2p4.s b/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2p4.s new file mode 100644 index 0000000..75fe603 --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2p4.s @@ -0,0 +1,689 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the
following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +#define STACK_SIZE (16*4 + 16*30) +#define STACK_BASE_VREGS 0 +#define STACK_BASE_TMP 16*4 + +#define E0c_offset 0 +#define E1c_offset 1 +#define E2c_offset 2 +#define E3c_offset 3 +#define E4c_offset 4 +#define E0u_offset 0 +#define E1u_offset 1 +#define E2u_offset 2 +#define E3u_offset 3 +#define E4u_offset 4 + +#define ba_offset (5 + 0 ) +#define be_offset (5 + 1 ) +#define bi_offset (5 + 2 ) +#define bo_offset (5 + 3 ) +#define bu_offset (5 + 4 ) +#define ga_offset (5 + 5 ) +#define ge_offset (5 + 6 ) +#define gi_offset (5 + 7 ) +#define go_offset (5 + 8 ) +#define gu_offset (5 + 9 ) +#define ka_offset (5 + 10 ) +#define ke_offset (5 + 11 ) +#define ki_offset (5 + 12 ) +#define ko_offset (5 + 13 ) +#define ku_offset (5 + 14 ) +#define ma_offset (5 + 15 ) +#define me_offset (5 + 16 ) +#define mi_offset (5 + 17 ) +#define mo_offset (5 + 18 ) +#define mu_offset (5 + 19 ) +#define sa_offset (5 + 20 ) +#define se_offset (5 + 21 ) +#define si_offset (5 + 22 ) +#define so_offset (5 + 23 ) +#define su_offset (5 + 24 ) + +#define savep(reg, offset_prefix) \ + str reg ## q, [sp, #(STACK_BASE_TMP + 16 * offset_prefix ## _offset)] +#define restorep(reg, offset_prefix) \ + ldr reg ## q, [sp, #(STACK_BASE_TMP + 16 * offset_prefix ## 
_offset)] +#define save(name) savep(name,name) +#define restore(name) restorep(name,name) +/* save(X) / restore(X): savep / restorep with the register alias and the stack slot both named X. */ +/********************** CONSTANTS *************************/ + .data + .align(8) /* NOTE(review): if .align takes a power-of-two exponent on this target (as in GNU as for AArch64), this requests 256-byte alignment -- confirm that is intended */ +round_constants: /* 24 64-bit entries; the round macro loads one per round with a post-incremented ld1r from const_addr (presumably pointed here by load_constant_ptr -- confirm in macros.s) */ + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ +/* input_addr: base address used by load_input / store_input; const_addr: advanced by 8 bytes per round; count: loop counter of the entry point. */ + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 /* NOTE(review): not referenced anywhere else in this file; may be used by macros.s -- confirm */ + + /* Mapping of Keccak-f1600 state to vector registers + * on entry to and exit from each 4-round loop iteration.
*/ + Aba .req v0 + Abe .req v1 + Abo .req v2 + Abu .req v3 + Agu .req v4 + Aka .req v5 + Ako .req v6 + Ame .req v7 + Ami .req v8 + Amu .req v9 + Asa .req v10 + Ase .req v11 + Asi .req v12 + Aso .req v13 + Asu .req v14 + + Agi .req v15 + Ake .req v16 + Aga .req v17 + Aki .req v18 + + Abi .req v19 + Ama .req v20 + Ago .req v21 + Aku .req v22 + Age .req v23 + Amo .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Aboq .req q2 + Abuq .req q3 + Aguq .req q4 + Akaq .req q5 + Akoq .req q6 + Ameq .req q7 + Amiq .req q8 + Amuq .req q9 + Asaq .req q10 + Aseq .req q11 + Asiq .req q12 + Asoq .req q13 + Asuq .req q14 + + Agiq .req q15 + Akeq .req q16 + Agaq .req q17 + Akiq .req q18 + + Abiq .req q19 + Amaq .req q20 + Agoq .req q21 + Akuq .req q22 + Ageq .req q23 + Amoq .req q24 +/* Registers that hold no state word in the A layout. declare_remappings uses the current spares for C0..C4 (spare0..4), E1c (spare5) and tmp (spare6). */ + Aspare0 .req v25 + Aspare1 .req v26 + Aspare2 .req v27 + Aspare3 .req v28 + Aspare4 .req v29 + Aspare5 .req v30 + Aspare6 .req v31 + Aspare0q .req q25 + Aspare1q .req q26 + Aspare2q .req q27 + Aspare3q .req q28 + Aspare4q .req q29 + Aspare5q .req q30 + Aspare6q .req q31 +/* declare_remappings out, in: define the register aliases for one round. Each state name with prefix "out" is bound to a register named through the "in" prefix: the same state word (tagged keep), one of the in-spares (tagged out), or the register of a different in-word (tagged in). Also binds out's spares, tmp, C0..C4, E*c / E*u and the vB* scratch names. Pair every use with undeclare_remappings. */ +.macro declare_remappings out,in + tmp .req \in\()spare6 + tmpq .req \in\()spare6q + + \out\()gu .req \in\()gu /* keep */ + \out\()ga .req \in\()spare0 /* out */ + \out\()ge .req \in\()spare1 /* out */ + \out\()gi .req \in\()spare2 /* out */ + \out\()go .req \in\()spare3 /* out */ + + \out\()ka .req \in\()ka /* keep */ + \out\()ko .req \in\()ko /* keep */ + \out\()ke .req \in\()spare4 /* out */ + \out\()ki .req \in\()spare5 /* out */ + \out\()ku .req \in\()gi /* in */ + + \out\()ma .req \in\()ke /* in */ + \out\()me .req \in\()me /* keep */ + \out\()mi .req \in\()mi /* keep */ + \out\()mo .req \in\()ga /* in */ + \out\()mu .req \in\()mu /* keep */ + + \out\()ba .req \in\()ba /* keep */ + \out\()be .req \in\()be /* keep */ + \out\()bi .req \in\()ki /* in */ + \out\()bo .req \in\()bo /* keep */ + \out\()bu .req \in\()bu /* keep */ + + \out\()sa .req \in\()sa /* keep */ + \out\()se .req \in\()se /* keep */
+ \out\()si .req \in\()si /* keep */ + \out\()so .req \in\()so /* keep */ + \out\()su .req \in\()su /* keep */ + + \out\()guq .req \in\()guq + \out\()gaq .req \in\()spare0q + \out\()geq .req \in\()spare1q + \out\()giq .req \in\()spare2q + \out\()goq .req \in\()spare3q + + \out\()kaq .req \in\()kaq + \out\()koq .req \in\()koq + \out\()keq .req \in\()spare4q + \out\()kiq .req \in\()spare5q + \out\()kuq .req \in\()giq + + \out\()maq .req \in\()keq + \out\()meq .req \in\()meq + \out\()miq .req \in\()miq + \out\()moq .req \in\()gaq + \out\()muq .req \in\()muq + + \out\()baq .req \in\()baq + \out\()beq .req \in\()beq + \out\()biq .req \in\()kiq + \out\()boq .req \in\()boq + \out\()buq .req \in\()buq + + \out\()saq .req \in\()saq + \out\()seq .req \in\()seq + \out\()siq .req \in\()siq + \out\()soq .req \in\()soq + \out\()suq .req \in\()suq + + \out\()spare0 .req \in\()bi + \out\()spare1 .req \in\()ma + \out\()spare2 .req \in\()go + \out\()spare3 .req \in\()ku + \out\()spare4 .req \in\()ge + \out\()spare5 .req \in\()mo + \out\()spare6 .req \in\()spare6 + \out\()spare0q .req \in\()biq + \out\()spare1q .req \in\()maq + \out\()spare2q .req \in\()goq + \out\()spare3q .req \in\()kuq + \out\()spare4q .req \in\()geq + \out\()spare5q .req \in\()moq + \out\()spare6q .req \in\()spare6q + + C0 .req \in\()spare0 + C1 .req \in\()spare1 + C2 .req \in\()spare2 + C3 .req \in\()spare3 + C4 .req \in\()spare4 + C0q .req \in\()spare0q + C1q .req \in\()spare1q + C2q .req \in\()spare2q + C3q .req \in\()spare3q + C4q .req \in\()spare4q + + E1c .req \in\()spare5 + E3c .req C2 + E0c .req C4 + E2c .req C1 + E4c .req C3 + + E1cq .req \in\()spare5q + E3cq .req C2q + E0cq .req C4q + E2cq .req C1q + E4cq .req C3q + + E0u .req tmp + E1u .req tmp + E2u .req tmp + E3u .req tmp + E4u .req tmp + + E0uq .req tmpq + E1uq .req tmpq + E2uq .req tmpq + E3uq .req tmpq + E4uq .req tmpq + + vBgo .req E1c + vBgi .req \in\()me + vBga .req \in\()ka + vBge .req \in\()bo + vBgu .req \in\()gu + + vBko .req \in\()me + 
vBka .req \in\()mu + vBke .req \in\()be + vBku .req \in\()gi + vBki .req \in\()sa + + vBmu .req \in\()bo + vBmo .req \in\()so + vBmi .req \in\()bu + vBma .req \in\()si + vBme .req \in\()be + + vBba .req \in\()si + vBbi .req \in\()sa + vBbo .req \in\()so + vBbu .req \in\()mo + vBbe .req \in\()su + + vBsa .req \in\()mo + vBso .req \in\()bi + vBse .req \in\()ma + vBsi .req \in\()go + vBsu .req \in\()ku +.endm +/* transfer_uncommon out, in: copy the ten state words ga, gi, ki, ke, bi, ma, go, ku, ge, mo from their in-registers to their out-registers. All ten are stored to their stack slots before any is loaded back. */ +.macro transfer_uncommon out, in + savep(\in\()ga, ga) + savep(\in\()gi, gi) + savep(\in\()ki, ki) + savep(\in\()ke, ke) + savep(\in\()bi, bi) + savep(\in\()ma, ma) + savep(\in\()go, go) + savep(\in\()ku, ku) + savep(\in\()ge, ge) + savep(\in\()mo, mo) + + restorep(\out\()gi, gi) + restorep(\out\()ke, ke) + restorep(\out\()ga, ga) + restorep(\out\()ki, ki) + restorep(\out\()bi, bi) + restorep(\out\()ma, ma) + restorep(\out\()go, go) + restorep(\out\()ku, ku) + restorep(\out\()ge, ge) + restorep(\out\()mo, mo) +.endm +/* undeclare_remappings: drop the vB*, C*, E*u and E*c aliases (v and q forms) so that the next declare_remappings can rebind them. The out / in arguments are not used; tmp, tmpq and the state and spare aliases stay defined. */ +.macro undeclare_remappings out, in + .unreq vBgo + .unreq vBgi + .unreq vBga + .unreq vBge + .unreq vBgu + .unreq vBko + .unreq vBka + .unreq vBke + .unreq vBku + .unreq vBki + .unreq vBmu + .unreq vBmo + .unreq vBmi + .unreq vBma + .unreq vBme + .unreq vBba + .unreq vBbi + .unreq vBbo + .unreq vBbu + .unreq vBbe + .unreq vBsa + .unreq vBso + .unreq vBse + .unreq vBsi + .unreq vBsu + .unreq C0 + .unreq C1 + .unreq C2 + .unreq C3 + .unreq C4 + .unreq C0q + .unreq C1q + .unreq C2q + .unreq C3q + .unreq C4q + .unreq E1u + .unreq E3u + .unreq E0u + .unreq E2u + .unreq E4u + .unreq E1c + .unreq E3c + .unreq E0c + .unreq E2c + .unreq E4c + .unreq E1uq + .unreq E3uq + .unreq E0uq + .unreq E2uq + .unreq E4uq + .unreq E1cq + .unreq E3cq + .unreq E0cq + .unreq E2cq + .unreq E4cq +.endm + +/************************ MACROS ****************************/ +/* load_input: load the 25 state registers of the A layout from input_addr, 16 bytes per state word, in the order ba, be, bi, bo, bu, ga, ..., su. */ +.macro load_input + ldp Abaq, Abeq, [input_addr, #(2*8*0)] + ldp Abiq, Aboq, [input_addr, #(2*8*2)] + ldp Abuq, Agaq, [input_addr, #(2*8*4)] + ldp Ageq, Agiq, [input_addr, #(2*8*6)] + ldp Agoq, Aguq,
[input_addr, #(2*8*8)] + ldp Akaq, Akeq, [input_addr, #(2*8*10)] + ldp Akiq, Akoq, [input_addr, #(2*8*12)] + ldp Akuq, Amaq, [input_addr, #(2*8*14)] + ldp Ameq, Amiq, [input_addr, #(2*8*16)] + ldp Amoq, Amuq, [input_addr, #(2*8*18)] + ldp Asaq, Aseq, [input_addr, #(2*8*20)] + ldp Asiq, Asoq, [input_addr, #(2*8*22)] + ldr Asuq, [input_addr, #(2*8*24)] +.endm +/* store_input: write the 25 state registers of the A layout back to input_addr, 16 bytes per state word, in the order ba, be, bi, bo, bu, ga, ..., su. */ +.macro store_input + str Abaq, [input_addr, #(2*8*0)] + str Abeq, [input_addr, #(2*8*1)] + str Abiq, [input_addr, #(2*8*2)] + str Aboq, [input_addr, #(2*8*3)] + str Abuq, [input_addr, #(2*8*4)] + str Agaq, [input_addr, #(2*8*5)] + str Ageq, [input_addr, #(2*8*6)] + str Agiq, [input_addr, #(2*8*7)] + str Agoq, [input_addr, #(2*8*8)] + str Aguq, [input_addr, #(2*8*9)] + str Akaq, [input_addr, #(2*8*10)] + str Akeq, [input_addr, #(2*8*11)] + str Akiq, [input_addr, #(2*8*12)] + str Akoq, [input_addr, #(2*8*13)] + str Akuq, [input_addr, #(2*8*14)] + str Amaq, [input_addr, #(2*8*15)] + str Ameq, [input_addr, #(2*8*16)] + str Amiq, [input_addr, #(2*8*17)] + str Amoq, [input_addr, #(2*8*18)] + str Amuq, [input_addr, #(2*8*19)] + str Asaq, [input_addr, #(2*8*20)] + str Aseq, [input_addr, #(2*8*21)] + str Asiq, [input_addr, #(2*8*22)] + str Asoq, [input_addr, #(2*8*23)] + str Asuq, [input_addr, #(2*8*24)] +.endm +/* alloc_stack / free_stack: reserve and release STACK_SIZE bytes of stack. */ +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm +/* save_vregs / restore_vregs: spill and reload d8-d15 (the low 64 bits of v8-v15) in the four 16-byte slots at sp + STACK_BASE_VREGS. NOTE(review): presumably the AAPCS64 callee-saved SIMD registers -- confirm. */ +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +/* Macros emulating the Armv8.4-A SHA-3 instructions (EOR3, RAX1, XAR, BCAX) with other Neon instructions; no SHA-3 instruction is emitted */ +/* eor3_m1_0: first step of eor3_m1, d = s0 ^ s1 */ +.macro eor3_m1_0 d, s0, s1, s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm +/* eor2: d = s0 ^ s1 */ +.macro eor2 d, s0, s1 + eor
\d\().16b, \s0\().16b, \s1\().16b +.endm +/* move: d = s */ +.macro move d, s + mov \d\().16b, \s\().16b +.endm + +/* eor3_m1_1: second step of eor3_m1, d ^= s2 */ +.macro eor3_m1_1 d, s0, s1, s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm +/* eor3_m1: d = s0 ^ s1 ^ s2, as two eor instructions */ +.macro eor3_m1 d, s0, s1, s2 + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm +/* rax1_m1: d = s0 ^ rol64(s1, 1) in each 64-bit lane; overwrites tmp */ +.macro rax1_m1 d, s0, s1 + add tmp.2d, \s1\().2d, \s1\().2d + sri tmp.2d, \s1\().2d, #63 + eor \d\().16b, tmp.16b, \s0\().16b +.endm +/* xar_m1: d = ror64(s0 ^ s1, imm) in each 64-bit lane. s0 is overwritten with s0 ^ s1; d is written by shl before sri reads s0, so d must be a different register from s0. */ +.macro xar_m1 d, s0, s1, imm + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) +.endm +/* xar_m1_0 / _1 / _2: the three steps of xar_m1 as separate macros, going through the tmp argument instead of overwriting s0 */ +.macro xar_m1_0 d, s0, s1, imm, tmp + eor \tmp\().16b, \s0\().16b, \s1\().16b +.endm + +.macro xar_m1_1 d, s0, s1, imm, tmp + shl \d\().2d, \tmp\().2d, #(64-\imm) +.endm + +.macro xar_m1_2 d s0 s1 imm tmp + sri \d\().2d, \tmp\().2d, #(\imm) +.endm +/* bcax_m1: d = s0 ^ (s1 & ~s2); overwrites tmp */ +.macro bcax_m1 d s0 s1 s2 + bic tmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, tmp.16b, \s0\().16b +.endm +/* refresh: copy register d onto itself */ +.macro refresh d + mov \d\().16b, \d\().16b +.endm +/* Keccak-f1600 round */ +/* keccak_f1600_round_core out, in: one round. Reads the state through the aliases with prefix "in" and leaves the result in the aliases with prefix "out"; both sets, and the C / E / vB names, come from declare_remappings. */ +.macro keccak_f1600_round_core out in +/* C0..C4 = XOR of the five in-words whose names end in a, e, i, o, u respectively */ + eor2 C3, \in\()ko, \in\()go + eor2 C0, \in\()ka, \in\()ga + eor2 C1, \in\()ke, \in\()ge + eor2 C2, \in\()ki, \in\()gi + eor2 C4, \in\()ku, \in\()gu + + eor2 C0, C0, \in\()ma + eor2 C1, C1, \in\()me + eor2 C2, C2, \in\()mi + eor2 C3, C3, \in\()mo + eor2 C4, C4, \in\()mu + + eor2 C0, C0, \in\()ba + eor2 C1, C1, \in\()be + eor2 C2, C2, \in\()bi + eor2 C3, C3, \in\()bo + eor2 C4, C4, \in\()bu + + eor2 C0, C0, \in\()sa + eor2 C1, C1, \in\()se + eor2 C2, C2, \in\()si + eor2 C3, C3, \in\()so + eor2 C4, C4, \in\()su +/* E3c, E0c, E2c, E4c share registers with C2, C4, C1, C3, so the order below reads every C value before it is overwritten. Each E*c is also stored to its stack slot and reloaded later into tmp through restore(E*u). */ + rax1_m1 E1c, C0, C2 SEP save(E1c) + rax1_m1 E3c, C2, C4 SEP save(E3c) + rax1_m1 E0c, C4, C1 SEP save(E0c) + rax1_m1 E2c, C1, C3 SEP save(E2c) + rax1_m1 E4c, C3, C0 SEP save(E4c) +/* xar_m1 overwrites its second operand, so the in-registers consumed below no longer hold state afterwards */ + xar_m1 vBgo, \in\()me /* used at block 3 */, E1c, 19 + xar_m1 vBgi, \in\()ka /* used at block 2 */, E0c, 61 + xar_m1 vBga, \in\()bo /* used at block 4 */, E3c, 36 + xar_m1 vBge, \in\()gu /* used at block 1 */, E4c, 44 + xar_m1 vBgu, \in\()si /*
used at block 5 */, E2c, 3 + + bcax_m1 \out\()ga, vBga, vBgi, vBge + bcax_m1 \out\()ge, vBge, vBgo, vBgi + bcax_m1 \out\()gi, vBgi, vBgu, vBgo + bcax_m1 \out\()go, vBgo, vBga, vBgu + bcax_m1 \out\()gu, vBgu, vBge, vBga + restore(E4u) + xar_m1 vBko, \in\()mu /* used at block 3 */, E4u, 56 SEP restore(E1u) + xar_m1 vBka, \in\()be /* used at block 4 */, E1u, 63 SEP restore(E2u) + xar_m1 vBke, \in\()gi /* not used */, E2u, 58 SEP restore(E0u) + xar_m1 vBku, \in\()sa /* used at block 5 */, E0u, 46 SEP restore(E3u) + xar_m1 vBki, \in\()ko /* used at block 2 */, E3u, 39 + + bcax_m1 \out\()ke, vBke, vBko, vBki + bcax_m1 \out\()ki, vBki, vBku, vBko + bcax_m1 \out\()ku, vBku, vBke, vBka + bcax_m1 \out\()ko, vBko, vBka, vBku + bcax_m1 \out\()ka, vBka, vBki, vBke + + // Can use: Abo, Asi, Abe, Asa; Abu, Aso + restore(E3u) + xar_m1 vBmu, \in\()so /* used at block 5 */, E3u, 8 SEP restore(E4u) + xar_m1 vBma, \in\()bu /* used at block 4 */, E4u, 37 SEP restore(E2u) + xar_m1 vBmo, \in\()mi /* used at block 3 */, E2u, 49 SEP restore(E1u) + xar_m1 vBmi, \in\()ke /* not used */, E1u, 54 SEP restore(E0u) + xar_m1 vBme, \in\()ga /* not used */, E0u, 28 + + bcax_m1 \out\()ma, vBma, vBmi, vBme + bcax_m1 \out\()mo, vBmo, vBma, vBmu + bcax_m1 \out\()me, vBme, vBmo, vBmi + bcax_m1 \out\()mi, vBmi, vBmu, vBmo + bcax_m1 \out\()mu, vBmu, vBme, vBma + + // Can use: Asi, Asa, Aso, Asu, Amo + restore(E0u) + eor2 vBba, \in\()ba /* used at block 4 */, E0u SEP restore(E2u) + xar_m1 vBbi, \in\()ki /* not used */, E2u, 21 SEP restore(E3u) + xar_m1 vBbo, \in\()mo /* not used */, E3u, 43 SEP restore(E4u) + xar_m1 vBbu, \in\()su /* used at block 5 */, E4u, 50 SEP restore(E1u) + xar_m1 vBbe, \in\()ge /* not used */, E1u, 20 + + bcax_m1 \out\()ba, vBba, vBbi, vBbe + ld1r {tmp.2d}, [const_addr], #8 + eor2 \out\()ba, \out\()ba, tmp + bcax_m1 \out\()be, vBbe, vBbo, vBbi + bcax_m1 \out\()bo, vBbo, vBba, vBbu + bcax_m1 \out\()bu, vBbu, vBbe, vBba + bcax_m1 \out\()bi, vBbi, vBbu, vBbo + + // Can use: Amo, Age, 
Abi, Ama, Ago, Aku + restore(E2u) + xar_m1 vBsa, \in\()bi /* not used */, E2u, 2 SEP restore(E0u) + xar_m1 vBso, \in\()ma /* not used */, E0u, 23 SEP restore(E3u) + xar_m1 vBse, \in\()go /* not used */, E3u, 9 SEP restore(E4u) + xar_m1 vBsi, \in\()ku /* not used */, E4u, 25 SEP restore(E1u) + xar_m1 vBsu, \in\()se /* used at block 5 */, E1u, 62 + + bcax_m1 \out\()sa, vBsa, vBsi, vBse + bcax_m1 \out\()se, vBse, vBso, vBsi + bcax_m1 \out\()si, vBsi, vBsu, vBso + bcax_m1 \out\()so, vBso, vBsa, vBsu + bcax_m1 \out\()su, vBsu, vBse, vBsa + +.endm +/* Entry point, exported under two spellings (with and without a leading underscore; presumably for toolchains that prefix C symbols with '_' -- confirm). */ +.text +.align 4 +.global keccak_f1600_x2_v84a_asm_v2p4 +.global _keccak_f1600_x2_v84a_asm_v2p4 + +#define KECCAK_F1600_ROUNDS 24 + +keccak_f1600_x2_v84a_asm_v2p4: +_keccak_f1600_x2_v84a_asm_v2p4: + alloc_stack + save_vregs + load_constant_ptr + load_input + +/* count = rounds still to run. Each pass through the loop body runs four rounds (A -> A1 -> A2 -> A3 -> A4) and subtracts 4, so 24 gives six passes. KECCAK_F1600_ROUNDS is only referenced by the commented-out line below. */ + //mov count, #(KECCAK_F1600_ROUNDS-2) + mov count, #24 +loop: + declare_remappings A1, A + keccak_f1600_round_core A1, A + undeclare_remappings A1, A + + declare_remappings A2, A1 + keccak_f1600_round_core A2, A1 + undeclare_remappings A2, A1 + + declare_remappings A3, A2 + keccak_f1600_round_core A3, A2 + undeclare_remappings A3, A2 + + declare_remappings A4, A3 + keccak_f1600_round_core A4, A3 + undeclare_remappings A4, A3 +/* copy the ten state words handled by transfer_uncommon from their A4 registers back to their A registers, so the next pass and store_input see the A layout */ + transfer_uncommon A, A4 + + sub count, count, #4 + cbnz count, loop + + + store_input + restore_vregs + free_stack + ret diff --git a/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2p5.s b/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2p5.s new file mode 100644 index 0000000..22e0373 --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2p5.s @@ -0,0 +1,949 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the
rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +#define STACK_SIZE (16*4 + 16*30) +#define STACK_BASE_VREGS 0 +#define STACK_BASE_TMP 16*4 + +#define E0c_offset 0 +#define E1c_offset 1 +#define E2c_offset 2 +#define E3c_offset 3 +#define E4c_offset 4 +#define E0u_offset 0 +#define E1u_offset 1 +#define E2u_offset 2 +#define E3u_offset 3 +#define E4u_offset 4 + +#define ba_offset (5 + 0 ) +#define be_offset (5 + 1 ) +#define bi_offset (5 + 2 ) +#define bo_offset (5 + 3 ) +#define bu_offset (5 + 4 ) +#define ga_offset (5 + 5 ) +#define ge_offset (5 + 6 ) +#define gi_offset (5 + 7 ) +#define go_offset (5 + 8 ) +#define gu_offset (5 + 9 ) +#define ka_offset (5 + 10 ) +#define ke_offset (5 + 11 ) +#define ki_offset (5 + 12 ) +#define ko_offset (5 + 13 ) +#define ku_offset (5 + 14 ) +#define ma_offset (5 + 15 ) +#define me_offset (5 + 16 ) +#define mi_offset (5 + 17 ) +#define mo_offset (5 + 18 ) +#define mu_offset (5 + 19 ) +#define sa_offset (5 + 20 ) +#define se_offset (5 + 21 ) +#define si_offset (5 + 22 ) +#define so_offset (5 + 23 ) +#define su_offset (5 + 24 ) + +#define 
savep(reg, offset_prefix) \ + str reg ## q, [sp, #(STACK_BASE_TMP + 16 * offset_prefix ## _offset)] +#define restorep(reg, offset_prefix) \ + ldr reg ## q, [sp, #(STACK_BASE_TMP + 16 * offset_prefix ## _offset)] +#define save(name) savep(name,name) +#define restore(name) restorep(name,name) +/* savep(reg, p) / restorep(reg, p): store / load the register alias reg with a q suffix to / from stack slot p_offset (16 bytes per slot, based at sp + STACK_BASE_TMP). save / restore use one name for both the register and the slot. */ +/********************** CONSTANTS *************************/ + .data + .align(8) /* NOTE(review): if .align takes a power-of-two exponent on this target (as in GNU as for AArch64), this requests 256-byte alignment -- confirm that is intended */ +round_constants: /* 24 64-bit entries */ + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round.
*/ + Aba .req v0 + Abe .req v1 + Abo .req v2 + Abu .req v3 + Agu .req v4 + Aka .req v5 + Ako .req v6 + Ame .req v7 + Ami .req v8 + Amu .req v9 + Asa .req v10 + Ase .req v11 + Asi .req v12 + Aso .req v13 + Asu .req v14 + + Agi .req v15 + Ake .req v16 + Aga .req v17 + Aki .req v18 + + Abi .req v19 + Ama .req v20 + Ago .req v21 + Aku .req v22 + Age .req v23 + Amo .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Aboq .req q2 + Abuq .req q3 + Aguq .req q4 + Akaq .req q5 + Akoq .req q6 + Ameq .req q7 + Amiq .req q8 + Amuq .req q9 + Asaq .req q10 + Aseq .req q11 + Asiq .req q12 + Asoq .req q13 + Asuq .req q14 + + Agiq .req q15 + Akeq .req q16 + Agaq .req q17 + Akiq .req q18 + + Abiq .req q19 + Amaq .req q20 + Agoq .req q21 + Akuq .req q22 + Ageq .req q23 + Amoq .req q24 + + Aspare0 .req v25 + Aspare1 .req v26 + Aspare2 .req v27 + Aspare3 .req v28 + Aspare4 .req v29 + Aspare5 .req v30 + Aspare6 .req v31 + Aspare0q .req q25 + Aspare1q .req q26 + Aspare2q .req q27 + Aspare3q .req q28 + Aspare4q .req q29 + Aspare5q .req q30 + Aspare6q .req q31 + +.macro declare_remappings out,in + tmp .req \in\()spare6 + tmpq .req \in\()spare6q + + \out\()gu .req \in\()gu /* keep */ + \out\()ga .req \in\()spare0 /* out */ + \out\()ge .req \in\()spare1 /* out */ + \out\()gi .req \in\()spare2 /* out */ + \out\()go .req \in\()spare3 /* out */ + + \out\()ka .req \in\()ka /* keep */ + \out\()ko .req \in\()ko /* keep */ + \out\()ke .req \in\()spare4 /* out */ + \out\()ki .req \in\()spare5 /* out */ + \out\()ku .req \in\()gi /* in */ + + \out\()ma .req \in\()ke /* in */ + \out\()me .req \in\()me /* keep */ + \out\()mi .req \in\()mi /* keep */ + \out\()mo .req \in\()ga /* in */ + \out\()mu .req \in\()mu /* keep */ + + \out\()ba .req \in\()ba /* keep */ + \out\()be .req \in\()be /* keep */ + \out\()bi .req \in\()ki /* in */ + \out\()bo .req \in\()bo /* keep */ + \out\()bu .req \in\()bu /* keep */ + + \out\()sa .req \in\()sa /* keep */ + \out\()se .req \in\()se /* keep */ 
+ \out\()si .req \in\()si /* keep */ + \out\()so .req \in\()so /* keep */ + \out\()su .req \in\()su /* keep */ + + \out\()guq .req \in\()guq + \out\()gaq .req \in\()spare0q + \out\()geq .req \in\()spare1q + \out\()giq .req \in\()spare2q + \out\()goq .req \in\()spare3q + + \out\()kaq .req \in\()kaq + \out\()koq .req \in\()koq + \out\()keq .req \in\()spare4q + \out\()kiq .req \in\()spare5q + \out\()kuq .req \in\()giq + + \out\()maq .req \in\()keq + \out\()meq .req \in\()meq + \out\()miq .req \in\()miq + \out\()moq .req \in\()gaq + \out\()muq .req \in\()muq + + \out\()baq .req \in\()baq + \out\()beq .req \in\()beq + \out\()biq .req \in\()kiq + \out\()boq .req \in\()boq + \out\()buq .req \in\()buq + + \out\()saq .req \in\()saq + \out\()seq .req \in\()seq + \out\()siq .req \in\()siq + \out\()soq .req \in\()soq + \out\()suq .req \in\()suq + + \out\()spare0 .req \in\()bi + \out\()spare1 .req \in\()ma + \out\()spare4 .req \in\()go + \out\()spare2 .req \in\()ku + \out\()spare3 .req \in\()ge + \out\()spare5 .req \in\()mo + \out\()spare6 .req \in\()spare6 + \out\()spare0q .req \in\()biq + \out\()spare1q .req \in\()maq + \out\()spare4q .req \in\()goq + \out\()spare2q .req \in\()kuq + \out\()spare3q .req \in\()geq + \out\()spare5q .req \in\()moq + \out\()spare6q .req \in\()spare6q + + C0 .req \in\()spare3 + C1 .req \in\()spare1 + C2 .req \in\()spare2 + C3 .req \in\()spare0 + C4 .req \in\()spare4 + C0q .req \in\()spare3q + C1q .req \in\()spare1q + C2q .req \in\()spare2q + C3q .req \in\()spare0q + C4q .req \in\()spare4q + + E1c .req \in\()spare5 + E3c .req C2 + E0c .req C4 + E2c .req C1 + E4c .req C3 + + E1cq .req \in\()spare5q + E3cq .req C2q + E0cq .req C4q + E2cq .req C1q + E4cq .req C3q + + E0u .req tmp + E1u .req tmp + E2u .req tmp + E3u .req tmp + E4u .req tmp + + E0uq .req tmpq + E1uq .req tmpq + E2uq .req tmpq + E3uq .req tmpq + E4uq .req tmpq + + vBgo .req E1c + vBgi .req \in\()me + vBga .req \in\()ka + vBge .req \in\()bo + vBgu .req \in\()gu + + vBko .req \in\()me + 
vBka .req \in\()mu + vBke .req \in\()be + vBku .req \in\()gi + vBki .req \in\()sa + + vBmu .req \in\()bo + vBmo .req \in\()so + vBmi .req \in\()bu + vBma .req \in\()si + vBme .req \in\()be + + vBba .req \in\()si + vBbi .req \in\()sa + vBbo .req \in\()so + vBbu .req \in\()mo + vBbe .req \in\()su + + vBsa .req \in\()mo + vBso .req \in\()bi + vBse .req \in\()ma + vBsi .req \in\()go + vBsu .req E1u //\in\()ku +.endm + +.macro transfer_uncommon out, in + savep(\in\()ga, ga) + savep(\in\()gi, gi) + savep(\in\()ki, ki) + savep(\in\()ke, ke) + savep(\in\()bi, bi) + savep(\in\()ma, ma) + savep(\in\()go, go) + savep(\in\()ku, ku) + savep(\in\()ge, ge) + savep(\in\()mo, mo) + + restorep(\out\()gi, gi) + restorep(\out\()ke, ke) + restorep(\out\()ga, ga) + restorep(\out\()ki, ki) + restorep(\out\()bi, bi) + restorep(\out\()ma, ma) + restorep(\out\()go, go) + restorep(\out\()ku, ku) + restorep(\out\()ge, ge) + restorep(\out\()mo, mo) +.endm + +.macro undeclare_remappings out, in + .unreq vBgo + .unreq vBgi + .unreq vBga + .unreq vBge + .unreq vBgu + .unreq vBko + .unreq vBka + .unreq vBke + .unreq vBku + .unreq vBki + .unreq vBmu + .unreq vBmo + .unreq vBmi + .unreq vBma + .unreq vBme + .unreq vBba + .unreq vBbi + .unreq vBbo + .unreq vBbu + .unreq vBbe + .unreq vBsa + .unreq vBso + .unreq vBse + .unreq vBsi + .unreq vBsu + .unreq C0 + .unreq C1 + .unreq C2 + .unreq C3 + .unreq C4 + .unreq C0q + .unreq C1q + .unreq C2q + .unreq C3q + .unreq C4q + .unreq E1u + .unreq E3u + .unreq E0u + .unreq E2u + .unreq E4u + .unreq E1c + .unreq E3c + .unreq E0c + .unreq E2c + .unreq E4c + .unreq E1uq + .unreq E3uq + .unreq E0uq + .unreq E2uq + .unreq E4uq + .unreq E1cq + .unreq E3cq + .unreq E0cq + .unreq E2cq + .unreq E4cq +.endm + +/************************ MACROS ****************************/ + +.macro load_input + ldp Abaq, Abeq, [input_addr, #(2*8*0)] + ldp Abiq, Aboq, [input_addr, #(2*8*2)] + ldp Abuq, Agaq, [input_addr, #(2*8*4)] + ldp Ageq, Agiq, [input_addr, #(2*8*6)] + ldp Agoq, 
Aguq, [input_addr, #(2*8*8)] + ldp Akaq, Akeq, [input_addr, #(2*8*10)] + ldp Akiq, Akoq, [input_addr, #(2*8*12)] + ldp Akuq, Amaq, [input_addr, #(2*8*14)] + ldp Ameq, Amiq, [input_addr, #(2*8*16)] + ldp Amoq, Amuq, [input_addr, #(2*8*18)] + ldp Asaq, Aseq, [input_addr, #(2*8*20)] + ldp Asiq, Asoq, [input_addr, #(2*8*22)] + ldr Asuq, [input_addr, #(2*8*24)] +.endm + +.macro store_input + str Abaq, [input_addr, #(2*8*0)] + str Abeq, [input_addr, #(2*8*1)] + str Abiq, [input_addr, #(2*8*2)] + str Aboq, [input_addr, #(2*8*3)] + str Abuq, [input_addr, #(2*8*4)] + str Agaq, [input_addr, #(2*8*5)] + str Ageq, [input_addr, #(2*8*6)] + str Agiq, [input_addr, #(2*8*7)] + str Agoq, [input_addr, #(2*8*8)] + str Aguq, [input_addr, #(2*8*9)] + str Akaq, [input_addr, #(2*8*10)] + str Akeq, [input_addr, #(2*8*11)] + str Akiq, [input_addr, #(2*8*12)] + str Akoq, [input_addr, #(2*8*13)] + str Akuq, [input_addr, #(2*8*14)] + str Amaq, [input_addr, #(2*8*15)] + str Ameq, [input_addr, #(2*8*16)] + str Amiq, [input_addr, #(2*8*17)] + str Amoq, [input_addr, #(2*8*18)] + str Amuq, [input_addr, #(2*8*19)] + str Asaq, [input_addr, #(2*8*20)] + str Aseq, [input_addr, #(2*8*21)] + str Asiq, [input_addr, #(2*8*22)] + str Asoq, [input_addr, #(2*8*23)] + str Asuq, [input_addr, #(2*8*24)] +.endm + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +/* Macros emulating the v8.4-A SHA-3 instructions (EOR3, RAX1, XAR, BCAX) with baseline Neon instructions */ + +.macro eor3_m1_0 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 + eor 
\d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor5 out i0 i1 i2 i3 i4 tmp + eor2 \out, \i0, \i1 + eor2 \tmp, \i3, \i4 + eor2 \out, \out, \i2 + eor2 \out, \out, \tmp +.endm + +.macro move d s + mov \d\().16b, \s\().16b +.endm + + +.macro eor3_m1_1 d s0 s1 s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro eor3_m1 d s0 s1 s2 + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +.macro rax1_m1 d s0 s1 + add tmp.2d, \s1\().2d, \s1\().2d + sri tmp.2d, \s1\().2d, #63 + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +.macro xar_m1 d s0 s1 imm + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) +.endm + +.macro xar_m1_0 d s0 s1 imm tmp + eor \tmp\().16b, \s0\().16b, \s1\().16b +.endm + +.macro xar_m1_1 d s0 s1 imm tmp + shl \d\().2d, \tmp\().2d, #(64-\imm) +.endm + +.macro xar_m1_2 d s0 s1 imm tmp + sri \d\().2d, \tmp\().2d, #(\imm) +.endm + +.macro bcax_m1 d s0 s1 s2 + bic tmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +.macro bcax_m1_d d s0 s1 s2 + bic \d\().16b, \s1\().16b, \s2\().16b + eor \d\().16b, \d\().16b, \s0\().16b +.endm + +.macro refresh d + mov \d\().16b, \d\().16b +.endm +/* Keccak-f1600 round */ + +.macro keccak_f1600_round_full out in + + eor5 C0, \in\()ka, \in\()ga, \in\()ma, \in\()ba, \in\()sa, tmp + eor5 C2, \in\()ki, \in\()gi, \in\()mi, \in\()bi, \in\()si, tmp + eor5 C4, \in\()ku, \in\()gu, \in\()mu, \in\()bu, \in\()su, tmp + eor5 C1, \in\()ke, \in\()ge, \in\()me, \in\()be, \in\()se, tmp + eor5 C3, \in\()ko, \in\()go, \in\()mo, \in\()bo, \in\()so, tmp + + rax1_m1 E1c, C0, C2 SEP save(E1c) + rax1_m1 E3c, C2, C4 SEP save(E3c) + rax1_m1 E0c, C4, C1 SEP save(E0c) + rax1_m1 E2c, C1, C3 SEP save(E2c) + rax1_m1 E4c, C3, C0 SEP save(E4c) + + xar_m1 vBgo, \in\()me /* used at block 3 */, E1c, 19 + xar_m1 vBgi, \in\()ka /* used at block 2 */, E0c, 61 + xar_m1 vBga, \in\()bo /* used at block 4 */, E3c, 36 + xar_m1 vBge, \in\()gu /* used at block 1 
*/, E4c, 44 + xar_m1 vBgu, \in\()si /* used at block 5 */, E2c, 3 + + bcax_m1 \out\()ga, vBga, vBgi, vBge + bcax_m1 \out\()ge, vBge, vBgo, vBgi + bcax_m1 \out\()gi, vBgi, vBgu, vBgo + bcax_m1 \out\()go, vBgo, vBga, vBgu + bcax_m1 \out\()gu, vBgu, vBge, vBga + restore(E4u) + xar_m1 vBko, \in\()mu /* used at block 3 */, E4u, 56 SEP restore(E1u) + xar_m1 vBka, \in\()be /* used at block 4 */, E1u, 63 SEP restore(E2u) + xar_m1 vBke, \in\()gi /* not used */, E2u, 58 SEP restore(E0u) + xar_m1 vBku, \in\()sa /* used at block 5 */, E0u, 46 SEP restore(E3u) + xar_m1 vBki, \in\()ko /* used at block 2 */, E3u, 39 + + bcax_m1 \out\()ke, vBke, vBko, vBki + bcax_m1 \out\()ki, vBki, vBku, vBko + bcax_m1 \out\()ku, vBku, vBke, vBka + bcax_m1 \out\()ko, vBko, vBka, vBku + bcax_m1 \out\()ka, vBka, vBki, vBke + + // Can use: Abo, Asi, Abe, Asa; Abu, Aso + restore(E3u) + xar_m1 vBmu, \in\()so /* used at block 5 */, E3u, 8 SEP restore(E4u) + xar_m1 vBma, \in\()bu /* used at block 4 */, E4u, 37 SEP restore(E2u) + xar_m1 vBmo, \in\()mi /* used at block 3 */, E2u, 49 SEP restore(E1u) + xar_m1 vBmi, \in\()ke /* not used */, E1u, 54 SEP restore(E0u) + xar_m1 vBme, \in\()ga /* not used */, E0u, 28 + + bcax_m1 \out\()ma, vBma, vBmi, vBme + bcax_m1 \out\()mo, vBmo, vBma, vBmu + bcax_m1 \out\()me, vBme, vBmo, vBmi + bcax_m1 \out\()mi, vBmi, vBmu, vBmo + bcax_m1 \out\()mu, vBmu, vBme, vBma + + // Can use: Asi, Asa, Aso, Asu, Amo + restore(E0u) + eor2 vBba, \in\()ba /* used at block 4 */, E0u SEP restore(E2u) + xar_m1 vBbi, \in\()ki /* not used* */, E2u, 21 SEP restore(E3u) + xar_m1 vBbo, \in\()mo /* not used+ */, E3u, 43 SEP restore(E4u) + xar_m1 vBbu, \in\()su /* used at block 5 */, E4u, 50 SEP restore(E1u) + xar_m1 vBbe, \in\()ge /* not used */, E1u, 20 + + bcax_m1 \out\()ba, vBba, vBbi, vBbe + ld1r {tmp.2d}, [const_addr], #8 + eor2 \out\()ba, \out\()ba, tmp + bcax_m1 \out\()be, vBbe, vBbo, vBbi + bcax_m1 \out\()bo, vBbo, vBba, vBbu + bcax_m1 \out\()bu, vBbu, vBbe, vBba + bcax_m1 \out\()bi, 
vBbi, vBbu, vBbo + + // Can use: Amo, Age, Abi, Ama, Ago, Aku + restore(E2u) + xar_m1 vBsa, \in\()bi /* not used+ */, E2u, 2 SEP restore(E0u) + xar_m1 vBso, \in\()ma /* not used+ */, E0u, 23 SEP restore(E3u) + xar_m1 vBse, \in\()go /* not used+ */, E3u, 9 SEP restore(E4u) + xar_m1 vBsi, \in\()ku /* not used */, E4u, 25 SEP restore(E1u) + xar_m1 vBsu, \in\()se /* used at block 5 */, E1u, 62 + + bcax_m1_d \out\()sa, vBsa, vBsi, vBse + bcax_m1_d \out\()se, vBse, vBso, vBsi + bcax_m1_d \out\()si, vBsi, vBsu, vBso + bcax_m1_d \out\()so, vBso, vBsa, vBsu + bcax_m1_d \out\()su, vBsu, vBse, vBsa + +.endm + +.macro keccak_f1600_round_pre out in + + eor5 C0, \in\()ka, \in\()ga, \in\()ma, \in\()ba, \in\()sa, tmp + eor5 C2, \in\()ki, \in\()gi, \in\()mi, \in\()bi, \in\()si, tmp + eor5 C4, \in\()ku, \in\()gu, \in\()mu, \in\()bu, \in\()su, tmp + eor5 C1, \in\()ke, \in\()ge, \in\()me, \in\()be, \in\()se, tmp + eor5 C3, \in\()ko, \in\()go, \in\()mo, \in\()bo, \in\()so, tmp + +.endm + +.macro keccak_f1600_round_post out in + + .unreq C0 + .unreq C1 + .unreq C2 + .unreq C3 + .unreq C4 + .unreq C0q + .unreq C1q + .unreq C2q + .unreq C3q + .unreq C4q + + C0 .req \out\()spare3 + C1 .req \out\()spare1 + C2 .req \out\()spare2 + C3 .req \out\()spare0 + C4 .req \out\()spare4 + C0q .req \out\()spare3q + C1q .req \out\()spare1q + C2q .req \out\()spare2q + C3q .req \out\()spare0q + C4q .req \out\()spare4q + + eor5 C0, \out\()ka, \out\()ga, \out\()ma, \out\()ba, \out\()sa, tmp + eor5 C2, \out\()ki, \out\()gi, \out\()mi, \out\()bi, \out\()si, tmp + eor5 C4, \out\()ku, \out\()gu, \out\()mu, \out\()bu, \out\()su, tmp + eor5 C1, \out\()ke, \out\()ge, \out\()me, \out\()be, \out\()se, tmp + eor5 C3, \out\()ko, \out\()go, \out\()mo, \out\()bo, \out\()so, tmp + +.endm +.macro keccak_f1600_round_core out in + + rax1_m1 E1c, C0, C2 SEP save(E1c) + rax1_m1 E3c, C2, C4 SEP save(E3c) + rax1_m1 E0c, C4, C1 SEP save(E0c) + rax1_m1 E2c, C1, C3 SEP save(E2c) + rax1_m1 E4c, C3, C0 SEP save(E4c) + + xar_m1 vBgo, 
\in\()me /* used at block 3 */, E1c, 19 + xar_m1 vBgi, \in\()ka /* used at block 2 */, E0c, 61 + xar_m1 vBga, \in\()bo /* used at block 4 */, E3c, 36 + xar_m1 vBge, \in\()gu /* used at block 1 */, E4c, 44 + xar_m1 vBgu, \in\()si /* used at block 5 */, E2c, 3 + + bcax_m1 \out\()ga, vBga, vBgi, vBge + bcax_m1 \out\()ge, vBge, vBgo, vBgi + bcax_m1 \out\()gi, vBgi, vBgu, vBgo + bcax_m1 \out\()go, vBgo, vBga, vBgu + bcax_m1 \out\()gu, vBgu, vBge, vBga + restore(E4u) + xar_m1 vBko, \in\()mu /* used at block 3 */, E4u, 56 SEP restore(E1u) + xar_m1 vBka, \in\()be /* used at block 4 */, E1u, 63 SEP restore(E2u) + xar_m1 vBke, \in\()gi /* not used */, E2u, 58 SEP restore(E0u) + xar_m1 vBku, \in\()sa /* used at block 5 */, E0u, 46 SEP restore(E3u) + xar_m1 vBki, \in\()ko /* used at block 2 */, E3u, 39 + + bcax_m1 \out\()ke, vBke, vBko, vBki + bcax_m1 \out\()ki, vBki, vBku, vBko + bcax_m1 \out\()ku, vBku, vBke, vBka + bcax_m1 \out\()ko, vBko, vBka, vBku + bcax_m1 \out\()ka, vBka, vBki, vBke + + // Can use: Abo, Asi, Abe, Asa; Abu, Aso + restore(E3u) + xar_m1 vBmu, \in\()so /* used at block 5 */, E3u, 8 SEP restore(E4u) + xar_m1 vBma, \in\()bu /* used at block 4 */, E4u, 37 SEP restore(E2u) + xar_m1 vBmo, \in\()mi /* used at block 3 */, E2u, 49 SEP restore(E1u) + xar_m1 vBmi, \in\()ke /* not used */, E1u, 54 SEP restore(E0u) + xar_m1 vBme, \in\()ga /* not used */, E0u, 28 + + bcax_m1 \out\()ma, vBma, vBmi, vBme + bcax_m1 \out\()mo, vBmo, vBma, vBmu + bcax_m1 \out\()me, vBme, vBmo, vBmi + bcax_m1 \out\()mi, vBmi, vBmu, vBmo + bcax_m1 \out\()mu, vBmu, vBme, vBma + + // Can use: Asi, Asa, Aso, Asu, Amo + restore(E0u) + eor2 vBba, \in\()ba /* used at block 4 */, E0u SEP restore(E2u) + xar_m1 vBbi, \in\()ki /* not used* */, E2u, 21 SEP restore(E3u) + xar_m1 vBbo, \in\()mo /* not used+ */, E3u, 43 SEP restore(E4u) + xar_m1 vBbu, \in\()su /* used at block 5 */, E4u, 50 SEP restore(E1u) + xar_m1 vBbe, \in\()ge /* not used */, E1u, 20 + + bcax_m1 \out\()ba, vBba, vBbi, vBbe + ld1r 
{tmp.2d}, [const_addr], #8 + eor2 \out\()ba, \out\()ba, tmp + bcax_m1 \out\()be, vBbe, vBbo, vBbi + bcax_m1 \out\()bo, vBbo, vBba, vBbu + bcax_m1 \out\()bu, vBbu, vBbe, vBba + bcax_m1 \out\()bi, vBbi, vBbu, vBbo + + // Can use: Amo, Age, Abi, Ama, Ago, Aku + restore(E2u) + xar_m1 vBsa, \in\()bi /* not used+ */, E2u, 2 SEP restore(E0u) + xar_m1 vBso, \in\()ma /* not used+ */, E0u, 23 SEP restore(E3u) + xar_m1 vBse, \in\()go /* not used+ */, E3u, 9 SEP restore(E4u) + xar_m1 vBsi, \in\()ku /* not used */, E4u, 25 SEP restore(E1u) + xar_m1 vBsu, \in\()se /* used at block 5 */, E1u, 62 + + bcax_m1_d \out\()sa, vBsa, vBsi, vBse + bcax_m1_d \out\()se, vBse, vBso, vBsi + bcax_m1_d \out\()si, vBsi, vBsu, vBso + bcax_m1_d \out\()so, vBso, vBsa, vBsu + bcax_m1_d \out\()su, vBsu, vBse, vBsa + +.endm + +.macro keccak_f1600_round_first out in + keccak_f1600_round_pre \out, \in + keccak_f1600_round_core \out, \in + keccak_f1600_round_post \out, \in +.endm + +.macro keccak_f1600_round_inner out in + keccak_f1600_round_core \out, \in + keccak_f1600_round_post \out, \in +.endm + +.macro keccak_f1600_round_last out in + keccak_f1600_round_core \out, \in +.endm + +.macro keccak_f1600_round_inner_optim out in + + rax1_m1 E1c, C0, C2 SEP save(E1c) + rax1_m1 E3c, C2, C4 SEP save(E3c) + rax1_m1 E0c, C4, C1 SEP save(E0c) + rax1_m1 E2c, C1, C3 SEP save(E2c) + rax1_m1 E4c, C3, C0 SEP save(E4c) + + xar_m1 vBgo, \in\()me /* used at block 3 */, E1c, 19 + xar_m1 vBgi, \in\()ka /* used at block 2 */, E0c, 61 + xar_m1 vBga, \in\()bo /* used at block 4 */, E3c, 36 + xar_m1 vBge, \in\()gu /* used at block 1 */, E4c, 44 + xar_m1 vBgu, \in\()si /* used at block 5 */, E2c, 3 + + bcax_m1 \out\()ga, vBga, vBgi, vBge + bcax_m1 \out\()ge, vBge, vBgo, vBgi + bcax_m1 \out\()gi, vBgi, vBgu, vBgo + bcax_m1 \out\()go, vBgo, vBga, vBgu + bcax_m1 \out\()gu, vBgu, vBge, vBga + restore(E4u) + xar_m1 vBko, \in\()mu /* used at block 3 */, E4u, 56 SEP restore(E1u) + xar_m1 vBka, \in\()be /* used at block 4 */, E1u, 63 
SEP restore(E2u) + xar_m1 vBke, \in\()gi /* not used */, E2u, 58 SEP restore(E0u) + xar_m1 vBku, \in\()sa /* used at block 5 */, E0u, 46 SEP restore(E3u) + xar_m1 vBki, \in\()ko /* used at block 2 */, E3u, 39 + + bcax_m1 \out\()ke, vBke, vBko, vBki + bcax_m1 \out\()ki, vBki, vBku, vBko + bcax_m1 \out\()ku, vBku, vBke, vBka + bcax_m1 \out\()ko, vBko, vBka, vBku + bcax_m1 \out\()ka, vBka, vBki, vBke + + // Can use: Abo, Asi, Abe, Asa; Abu, Aso + restore(E3u) + xar_m1 vBmu, \in\()so /* used at block 5 */, E3u, 8 SEP restore(E4u) + xar_m1 vBma, \in\()bu /* used at block 4 */, E4u, 37 SEP restore(E2u) + xar_m1 vBmo, \in\()mi /* used at block 3 */, E2u, 49 SEP restore(E1u) + xar_m1 vBmi, \in\()ke /* not used */, E1u, 54 SEP restore(E0u) + xar_m1 vBme, \in\()ga /* not used */, E0u, 28 + + bcax_m1 \out\()ma, vBma, vBmi, vBme + bcax_m1 \out\()mo, vBmo, vBma, vBmu + bcax_m1 \out\()me, vBme, vBmo, vBmi + bcax_m1 \out\()mi, vBmi, vBmu, vBmo + bcax_m1 \out\()mu, vBmu, vBme, vBma + + // Can use: Asi, Asa, Aso, Asu, Amo + restore(E0u) + eor2 vBba, \in\()ba /* used at block 4 */, E0u SEP restore(E2u) + xar_m1 vBbi, \in\()ki /* not used* */, E2u, 21 SEP restore(E3u) + xar_m1 vBbo, \in\()mo /* not used+ */, E3u, 43 SEP restore(E4u) + xar_m1 vBbu, \in\()su /* used at block 5 */, E4u, 50 SEP restore(E1u) + xar_m1 vBbe, \in\()ge /* not used */, E1u, 20 + + .unreq C0 + .unreq C0q + C0 .req \out\()spare3 + C0q .req \out\()spare3q + + eor2 C0, \out\()ka, \out\()ga + eor2 C0, C0, \out\()ma + + bcax_m1 \out\()ba, vBba, vBbi, vBbe + ld1r {tmp.2d}, [const_addr], #8 + eor2 \out\()ba, \out\()ba, tmp + bcax_m1 \out\()be, vBbe, vBbo, vBbi + bcax_m1 \out\()bo, vBbo, vBba, vBbu + bcax_m1 \out\()bu, vBbu, vBbe, vBba + bcax_m1 \out\()bi, vBbi, vBbu, vBbo + + eor2 C0, C0, \out\()ba + + // Can use: Amo, Age, Abi, Ama, Ago, Aku + restore(E2u) + xar_m1 vBsa, \in\()bi /* not used+ */, E2u, 2 SEP restore(E0u) + xar_m1 vBso, \in\()ma /* not used+ */, E0u, 23 SEP restore(E3u) + xar_m1 vBse, \in\()go /* not 
used+ */, E3u, 9 SEP restore(E4u) + xar_m1 vBsi, \in\()ku /* not used */, E4u, 25 SEP restore(E1u) + + .unreq C2 + .unreq C2q + C2 .req \out\()spare2 + C2q .req \out\()spare2q + + eor2 C2, \out\()ki, \out\()gi + eor2 C2, C2, \out\()mi + eor2 C2, C2, \out\()bi + + xar_m1 vBsu, \in\()se /* used at block 5 */, E1u, 62 + + bcax_m1_d \out\()sa, vBsa, vBsi, vBse + eor2 C0, C0, \out\()sa + bcax_m1_d \out\()se, vBse, vBso, vBsi + bcax_m1_d \out\()si, vBsi, vBsu, vBso + eor2 C2, C2, \out\()si + bcax_m1_d \out\()so, vBso, vBsa, vBsu + bcax_m1_d \out\()su, vBsu, vBse, vBsa + + .unreq C1 + .unreq C1q + C1 .req \out\()spare1 + C1q .req \out\()spare1q + + + .unreq C3 + .unreq C4 + .unreq C3q + .unreq C4q + + C3 .req \out\()spare0 + C4 .req \out\()spare4 + C3q .req \out\()spare0q + C4q .req \out\()spare4q + +// eor5 C0, \out\()ka, \out\()ga, \out\()ma, \out\()ba, \out\()sa, tmp +// eor5 C2, \out\()ki, \out\()gi, \out\()mi, \out\()bi, \out\()si, tmp + eor5 C4, \out\()ku, \out\()gu, \out\()mu, \out\()bu, \out\()su, tmp + eor5 C1, \out\()ke, \out\()ge, \out\()me, \out\()be, \out\()se, tmp + eor5 C3, \out\()ko, \out\()go, \out\()mo, \out\()bo, \out\()so, tmp + +.endm + +.text +.align 4 +.global keccak_f1600_x2_v84a_asm_v2p5 +.global _keccak_f1600_x2_v84a_asm_v2p5 + +#define KECCAK_F1600_ROUNDS 24 + +keccak_f1600_x2_v84a_asm_v2p5: +_keccak_f1600_x2_v84a_asm_v2p5: + alloc_stack + save_vregs + load_constant_ptr + load_input + + + //mov count, #(KECCAK_F1600_ROUNDS-2) + mov count, #24 +loop: + declare_remappings A1, A + keccak_f1600_round_first A1, A +// keccak_f1600_round_pre A1 A +// keccak_f1600_round_core A1 A +// keccak_f1600_round_post A1 A + undeclare_remappings A1, A + + declare_remappings A2, A1 +// keccak_f1600_round_pre A2 A1 +// keccak_f1600_round_core A2 A1 +// keccak_f1600_round_post A2 A1 + keccak_f1600_round_inner_optim A2, A1 + undeclare_remappings A2, A1 + + declare_remappings A3, A2 +// keccak_f1600_round_pre A3 A2 +// keccak_f1600_round_core A3 A2 +// 
keccak_f1600_round_post A3 A2 + keccak_f1600_round_inner_optim A3, A2 + undeclare_remappings A3, A2 + + declare_remappings A4, A3 +// keccak_f1600_round_pre A4 A3 + keccak_f1600_round_last A4, A3 +// keccak_f1600_round_post A4 A3 + undeclare_remappings A4, A3 + + transfer_uncommon A, A4 + + sub count, count, #4 + cbnz count, loop + + store_input + restore_vregs + free_stack + ret diff --git a/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2p6.s b/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2p6.s new file mode 100644 index 0000000..856a374 --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2p6.s @@ -0,0 +1,948 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +#define STACK_SIZE (16*4 + 16*30) +#define STACK_BASE_VREGS 0 +#define STACK_BASE_TMP 16*4 + +#define E0c_offset 0 +#define E1c_offset 1 +#define E2c_offset 2 +#define E3c_offset 3 +#define E4c_offset 4 +#define E0u_offset 0 +#define E1u_offset 1 +#define E2u_offset 2 +#define E3u_offset 3 +#define E4u_offset 4 + +#define ba_offset (5 + 0 ) +#define be_offset (5 + 1 ) +#define bi_offset (5 + 2 ) +#define bo_offset (5 + 3 ) +#define bu_offset (5 + 4 ) +#define ga_offset (5 + 5 ) +#define ge_offset (5 + 6 ) +#define gi_offset (5 + 7 ) +#define go_offset (5 + 8 ) +#define gu_offset (5 + 9 ) +#define ka_offset (5 + 10 ) +#define ke_offset (5 + 11 ) +#define ki_offset (5 + 12 ) +#define ko_offset (5 + 13 ) +#define ku_offset (5 + 14 ) +#define ma_offset (5 + 15 ) +#define me_offset (5 + 16 ) +#define mi_offset (5 + 17 ) +#define mo_offset (5 + 18 ) +#define mu_offset (5 + 19 ) +#define sa_offset (5 + 20 ) +#define se_offset (5 + 21 ) +#define si_offset (5 + 22 ) +#define so_offset (5 + 23 ) +#define su_offset (5 + 24 ) + +#define savep(reg, offset_prefix) \ + str reg ## q, [sp, #(STACK_BASE_TMP + 16 * offset_prefix ## _offset)] +#define restorep(reg, offset_prefix) \ + ldr reg ## q, [sp, #(STACK_BASE_TMP + 16 * offset_prefix ## _offset)] +#define save(name) savep(name,name) +#define restore(name) restorep(name,name) + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + 
.quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. */ + Aba .req v0 + Abe .req v1 + Abo .req v2 + Abu .req v3 + Agu .req v4 + Aka .req v5 + Ako .req v6 + Ame .req v7 + Ami .req v8 + Amu .req v9 + Asa .req v10 + Ase .req v11 + Asi .req v12 + Aso .req v13 + Asu .req v14 + + Agi .req v15 + Ake .req v16 + Aga .req v17 + Aki .req v18 + + Abi .req v19 + Ama .req v20 + Ago .req v21 + Aku .req v22 + Age .req v23 + Amo .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Aboq .req q2 + Abuq .req q3 + Aguq .req q4 + Akaq .req q5 + Akoq .req q6 + Ameq .req q7 + Amiq .req q8 + Amuq .req q9 + Asaq .req q10 + Aseq .req q11 + Asiq .req q12 + Asoq .req q13 + Asuq .req q14 + + Agiq .req q15 + Akeq .req q16 + Agaq .req q17 + Akiq .req q18 + + Abiq .req q19 + Amaq .req q20 + Agoq .req q21 + Akuq .req q22 + Ageq .req q23 + Amoq .req q24 + + Aspare0 .req v25 + Aspare1 .req v26 + Aspare2 .req v27 + Aspare3 .req v28 + Aspare4 .req v29 + Aspare5 .req v30 + Aspare6 .req v31 + Aspare0q .req q25 + Aspare1q .req q26 + Aspare2q .req q27 + Aspare3q .req q28 + Aspare4q .req q29 + Aspare5q .req q30 + Aspare6q .req q31 + +.macro declare_remappings out,in + tmp .req \in\()spare6 + tmpq .req \in\()spare6q + + \out\()gu .req \in\()gu /* keep */ + \out\()ga .req \in\()spare0 /* out */ + \out\()ge .req \in\()spare1 /* out */ + \out\()gi .req \in\()spare2 /* out */ + \out\()go .req \in\()spare3 /* out */ + + \out\()ka .req \in\()ka /* keep */ + \out\()ko .req \in\()ko /* keep */ + \out\()ke .req \in\()spare4 /* out */ + \out\()ki .req \in\()spare5 /* out */ + \out\()ku .req \in\()gi /* in */ + 
\out\()ma .req \in\()ke /* in */ + \out\()me .req \in\()me /* keep */ + \out\()mi .req \in\()mi /* keep */ + \out\()mo .req \in\()ga /* in */ + \out\()mu .req \in\()mu /* keep */ + + \out\()ba .req \in\()ba /* keep */ + \out\()be .req \in\()be /* keep */ + \out\()bi .req \in\()ki /* in */ + \out\()bo .req \in\()bo /* keep */ + \out\()bu .req \in\()bu /* keep */ + + \out\()sa .req \in\()sa /* keep */ + \out\()se .req \in\()se /* keep */ + \out\()si .req \in\()si /* keep */ + \out\()so .req \in\()so /* keep */ + \out\()su .req \in\()su /* keep */ + + \out\()guq .req \in\()guq + \out\()gaq .req \in\()spare0q + \out\()geq .req \in\()spare1q + \out\()giq .req \in\()spare2q + \out\()goq .req \in\()spare3q + + \out\()kaq .req \in\()kaq + \out\()koq .req \in\()koq + \out\()keq .req \in\()spare4q + \out\()kiq .req \in\()spare5q + \out\()kuq .req \in\()giq + + \out\()maq .req \in\()keq + \out\()meq .req \in\()meq + \out\()miq .req \in\()miq + \out\()moq .req \in\()gaq + \out\()muq .req \in\()muq + + \out\()baq .req \in\()baq + \out\()beq .req \in\()beq + \out\()biq .req \in\()kiq + \out\()boq .req \in\()boq + \out\()buq .req \in\()buq + + \out\()saq .req \in\()saq + \out\()seq .req \in\()seq + \out\()siq .req \in\()siq + \out\()soq .req \in\()soq + \out\()suq .req \in\()suq + + \out\()spare0 .req \in\()bi + \out\()spare1 .req \in\()ma + \out\()spare4 .req \in\()go + \out\()spare2 .req \in\()ku + \out\()spare3 .req \in\()ge + \out\()spare5 .req \in\()mo + \out\()spare6 .req \in\()spare6 + \out\()spare0q .req \in\()biq + \out\()spare1q .req \in\()maq + \out\()spare4q .req \in\()goq + \out\()spare2q .req \in\()kuq + \out\()spare3q .req \in\()geq + \out\()spare5q .req \in\()moq + \out\()spare6q .req \in\()spare6q + + C0 .req \in\()spare3 + C1 .req \in\()spare1 + C2 .req \in\()spare2 + C3 .req \in\()spare0 + C4 .req \in\()spare4 + C0q .req \in\()spare3q + C1q .req \in\()spare1q + C2q .req \in\()spare2q + C3q .req \in\()spare0q + C4q .req \in\()spare4q + + E1c .req \in\()spare5 + 
E3c .req C2 + E0c .req C4 + E2c .req C1 + E4c .req C3 + + E1cq .req \in\()spare5q + E3cq .req C2q + E0cq .req C4q + E2cq .req C1q + E4cq .req C3q + + E0u .req tmp + E1u .req tmp + E2u .req tmp + E3u .req tmp + E4u .req tmp + + E0uq .req tmpq + E1uq .req tmpq + E2uq .req tmpq + E3uq .req tmpq + E4uq .req tmpq + + vBgo .req E1c + vBgi .req \in\()me + vBga .req \in\()ka + vBge .req \in\()bo + vBgu .req \in\()gu + + vBko .req \in\()me + vBka .req \in\()mu + vBke .req \in\()be + vBku .req \in\()gi + vBki .req \in\()sa + + vBmu .req \in\()bo + vBmo .req \in\()so + vBmi .req \in\()bu + vBma .req \in\()si + vBme .req \in\()be + + vBba .req \in\()si + vBbi .req \in\()sa + vBbo .req \in\()so + vBbu .req \in\()mo + vBbe .req \in\()su + + vBsa .req \in\()mo + vBso .req \in\()bi + vBse .req \in\()ma + vBsi .req \in\()go + vBsu .req E1u //\in\()ku +.endm + +.macro transfer_uncommon out in + savep(\in\()ga, ga) + savep(\in\()gi, gi) + savep(\in\()ki, ki) + savep(\in\()ke, ke) + savep(\in\()bi, bi) + savep(\in\()ma, ma) + savep(\in\()go, go) + savep(\in\()ku, ku) + savep(\in\()ge, ge) + savep(\in\()mo, mo) + + restorep(\out\()gi, gi) + restorep(\out\()ke, ke) + restorep(\out\()ga, ga) + restorep(\out\()ki, ki) + restorep(\out\()bi, bi) + restorep(\out\()ma, ma) + restorep(\out\()go, go) + restorep(\out\()ku, ku) + restorep(\out\()ge, ge) + restorep(\out\()mo, mo) +.endm + +.macro undeclare_remappings out in + .unreq vBgo + .unreq vBgi + .unreq vBga + .unreq vBge + .unreq vBgu + .unreq vBko + .unreq vBka + .unreq vBke + .unreq vBku + .unreq vBki + .unreq vBmu + .unreq vBmo + .unreq vBmi + .unreq vBma + .unreq vBme + .unreq vBba + .unreq vBbi + .unreq vBbo + .unreq vBbu + .unreq vBbe + .unreq vBsa + .unreq vBso + .unreq vBse + .unreq vBsi + .unreq vBsu + .unreq C0 + .unreq C1 + .unreq C2 + .unreq C3 + .unreq C4 + .unreq C0q + .unreq C1q + .unreq C2q + .unreq C3q + .unreq C4q + .unreq E1u + .unreq E3u + .unreq E0u + .unreq E2u + .unreq E4u + .unreq E1c + .unreq E3c + .unreq E0c + 
.unreq E2c + .unreq E4c + .unreq E1uq + .unreq E3uq + .unreq E0uq + .unreq E2uq + .unreq E4uq + .unreq E1cq + .unreq E3cq + .unreq E0cq + .unreq E2cq + .unreq E4cq +.endm + +/************************ MACROS ****************************/ + +.macro load_input + ldp Abaq, Abeq, [input_addr, #(2*8*0)] + ldp Abiq, Aboq, [input_addr, #(2*8*2)] + ldp Abuq, Agaq, [input_addr, #(2*8*4)] + ldp Ageq, Agiq, [input_addr, #(2*8*6)] + ldp Agoq, Aguq, [input_addr, #(2*8*8)] + ldp Akaq, Akeq, [input_addr, #(2*8*10)] + ldp Akiq, Akoq, [input_addr, #(2*8*12)] + ldp Akuq, Amaq, [input_addr, #(2*8*14)] + ldp Ameq, Amiq, [input_addr, #(2*8*16)] + ldp Amoq, Amuq, [input_addr, #(2*8*18)] + ldp Asaq, Aseq, [input_addr, #(2*8*20)] + ldp Asiq, Asoq, [input_addr, #(2*8*22)] + ldr Asuq, [input_addr, #(2*8*24)] +.endm + +.macro store_input + str Abaq, [input_addr, #(2*8*0)] + str Abeq, [input_addr, #(2*8*1)] + str Abiq, [input_addr, #(2*8*2)] + str Aboq, [input_addr, #(2*8*3)] + str Abuq, [input_addr, #(2*8*4)] + str Agaq, [input_addr, #(2*8*5)] + str Ageq, [input_addr, #(2*8*6)] + str Agiq, [input_addr, #(2*8*7)] + str Agoq, [input_addr, #(2*8*8)] + str Aguq, [input_addr, #(2*8*9)] + str Akaq, [input_addr, #(2*8*10)] + str Akeq, [input_addr, #(2*8*11)] + str Akiq, [input_addr, #(2*8*12)] + str Akoq, [input_addr, #(2*8*13)] + str Akuq, [input_addr, #(2*8*14)] + str Amaq, [input_addr, #(2*8*15)] + str Ameq, [input_addr, #(2*8*16)] + str Amiq, [input_addr, #(2*8*17)] + str Amoq, [input_addr, #(2*8*18)] + str Amuq, [input_addr, #(2*8*19)] + str Asaq, [input_addr, #(2*8*20)] + str Aseq, [input_addr, #(2*8*21)] + str Asiq, [input_addr, #(2*8*22)] + str Asoq, [input_addr, #(2*8*23)] + str Asuq, [input_addr, #(2*8*24)] +.endm + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + 
stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +/* Macros emulating the v8.4-A SHA-3 instructions (EOR3, RAX1, XAR, BCAX) with baseline Neon instructions */ + +.macro eor3_m1_0 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor5 out i0 i1 i2 i3 i4 tmp + eor2 \out, \i0, \i1 + eor2 \tmp, \i3, \i4 + eor2 \out, \out, \i2 + eor2 \out, \out, \tmp +.endm + +.macro move d s + mov \d\().16b, \s\().16b +.endm + + +.macro eor3_m1_1 d s0 s1 s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro eor3_m1 d s0 s1 s2 + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +.macro rax1_m1 d s0 s1 + add tmp.2d, \s1\().2d, \s1\().2d + sri tmp.2d, \s1\().2d, #63 + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +.macro xar_m1 d s0 s1 imm + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) +.endm + +.macro xar_m1_0 d s0 s1 imm tmp + eor \tmp\().16b, \s0\().16b, \s1\().16b +.endm + +.macro xar_m1_1 d s0 s1 imm tmp + shl \d\().2d, \tmp\().2d, #(64-\imm) +.endm + +.macro xar_m1_2 d s0 s1 imm tmp + sri \d\().2d, \tmp\().2d, #(\imm) +.endm + +.macro bcax_m1 d s0 s1 s2 + bic tmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +.macro bcax_m1_d d s0 s1 s2 + bic \d\().16b, \s1\().16b, \s2\().16b + eor \d\().16b, \d\().16b, \s0\().16b +.endm + +.macro refresh d + mov \d\().16b, \d\().16b +.endm +/* Keccak-f1600 round */ + +.macro keccak_f1600_round_full out in + + eor5 C0, \in\()ka, \in\()ga, \in\()ma, \in\()ba, \in\()sa, tmp + eor5 C2, \in\()ki, \in\()gi, \in\()mi, \in\()bi, \in\()si, tmp + eor5 C4, \in\()ku, \in\()gu, \in\()mu, \in\()bu, \in\()su, tmp + eor5 C1, \in\()ke, \in\()ge, \in\()me, \in\()be, \in\()se, tmp + eor5 C3, \in\()ko, 
\in\()go, \in\()mo, \in\()bo, \in\()so, tmp + + rax1_m1 E1c, C0, C2 SEP save(E1c) + xar_m1 vBgo, \in\()me /* used at block 3; was missing here although the ge/gi/go bcax steps below read vBgo -- same placement as in keccak_f1600_round_core */, E1c, 19 + rax1_m1 E3c, C2, C4 SEP save(E3c) + rax1_m1 E0c, C4, C1 SEP save(E0c) + rax1_m1 E2c, C1, C3 SEP save(E2c) + rax1_m1 E4c, C3, C0 SEP save(E4c) + + xar_m1 vBgi, \in\()ka /* used at block 2 */, E0c, 61 + xar_m1 vBga, \in\()bo /* used at block 4 */, E3c, 36 + xar_m1 vBge, \in\()gu /* used at block 1 */, E4c, 44 + xar_m1 vBgu, \in\()si /* used at block 5 */, E2c, 3 + + bcax_m1 \out\()ga, vBga, vBgi, vBge + bcax_m1 \out\()ge, vBge, vBgo, vBgi + bcax_m1 \out\()gi, vBgi, vBgu, vBgo + bcax_m1 \out\()go, vBgo, vBga, vBgu + bcax_m1 \out\()gu, vBgu, vBge, vBga + restore(E4u) + xar_m1 vBko, \in\()mu /* used at block 3 */, E4u, 56 SEP restore(E1u) + xar_m1 vBka, \in\()be /* used at block 4 */, E1u, 63 SEP restore(E2u) + xar_m1 vBke, \in\()gi /* not used */, E2u, 58 SEP restore(E0u) + xar_m1 vBku, \in\()sa /* used at block 5 */, E0u, 46 SEP restore(E3u) + xar_m1 vBki, \in\()ko /* used at block 2 */, E3u, 39 + + bcax_m1 \out\()ke, vBke, vBko, vBki + bcax_m1 \out\()ki, vBki, vBku, vBko + bcax_m1 \out\()ku, vBku, vBke, vBka + bcax_m1 \out\()ko, vBko, vBka, vBku + bcax_m1 \out\()ka, vBka, vBki, vBke + + // Can use: Abo, Asi, Abe, Asa; Abu, Aso + restore(E3u) + xar_m1 vBmu, \in\()so /* used at block 5 */, E3u, 8 SEP restore(E4u) + xar_m1 vBma, \in\()bu /* used at block 4 */, E4u, 37 SEP restore(E2u) + xar_m1 vBmo, \in\()mi /* used at block 3 */, E2u, 49 SEP restore(E1u) + xar_m1 vBmi, \in\()ke /* not used */, E1u, 54 SEP restore(E0u) + xar_m1 vBme, \in\()ga /* not used */, E0u, 28 + + bcax_m1 \out\()ma, vBma, vBmi, vBme + bcax_m1 \out\()mo, vBmo, vBma, vBmu + bcax_m1 \out\()me, vBme, vBmo, vBmi + bcax_m1 \out\()mi, vBmi, vBmu, vBmo + bcax_m1 \out\()mu, vBmu, vBme, vBma + + // Can use: Asi, Asa, Aso, Asu, Amo + restore(E0u) + eor2 vBba, \in\()ba /* used at block 4 */, E0u SEP restore(E2u) + xar_m1 vBbi, \in\()ki /* not used* */, E2u, 21 SEP restore(E3u) + xar_m1 vBbo, \in\()mo /* not used+ */, E3u, 43 
SEP restore(E4u) + xar_m1 vBbu, \in\()su /* used at block 5 */, E4u, 50 SEP restore(E1u) + xar_m1 vBbe, \in\()ge /* not used */, E1u, 20 + + bcax_m1 \out\()ba, vBba, vBbi, vBbe + ld1r {tmp.2d}, [const_addr], #8 + eor2 \out\()ba, \out\()ba, tmp + bcax_m1 \out\()be, vBbe, vBbo, vBbi + bcax_m1 \out\()bo, vBbo, vBba, vBbu + bcax_m1 \out\()bu, vBbu, vBbe, vBba + bcax_m1 \out\()bi, vBbi, vBbu, vBbo + + // Can use: Amo, Age, Abi, Ama, Ago, Aku + restore(E2u) + xar_m1 vBsa, \in\()bi /* not used+ */, E2u, 2 SEP restore(E0u) + xar_m1 vBso, \in\()ma /* not used+ */, E0u, 23 SEP restore(E3u) + xar_m1 vBse, \in\()go /* not used+ */, E3u, 9 SEP restore(E4u) + xar_m1 vBsi, \in\()ku /* not used */, E4u, 25 SEP restore(E1u) + xar_m1 vBsu, \in\()se /* used at block 5 */, E1u, 62 + + bcax_m1_d \out\()sa, vBsa, vBsi, vBse + bcax_m1_d \out\()se, vBse, vBso, vBsi + bcax_m1_d \out\()si, vBsi, vBsu, vBso + bcax_m1_d \out\()so, vBso, vBsa, vBsu + bcax_m1_d \out\()su, vBsu, vBse, vBsa + +.endm + +.macro keccak_f1600_round_pre out in + + eor5 C0, \in\()ka, \in\()ga, \in\()ma, \in\()ba, \in\()sa, tmp + eor5 C2, \in\()ki, \in\()gi, \in\()mi, \in\()bi, \in\()si, tmp + eor5 C4, \in\()ku, \in\()gu, \in\()mu, \in\()bu, \in\()su, tmp + eor5 C1, \in\()ke, \in\()ge, \in\()me, \in\()be, \in\()se, tmp + eor5 C3, \in\()ko, \in\()go, \in\()mo, \in\()bo, \in\()so, tmp + +.endm + +.macro keccak_f1600_round_post out in + + .unreq C0 + .unreq C1 + .unreq C2 + .unreq C3 + .unreq C4 + .unreq C0q + .unreq C1q + .unreq C2q + .unreq C3q + .unreq C4q + + C0 .req \out\()spare3 + C1 .req \out\()spare1 + C2 .req \out\()spare2 + C3 .req \out\()spare0 + C4 .req \out\()spare4 + C0q .req \out\()spare3q + C1q .req \out\()spare1q + C2q .req \out\()spare2q + C3q .req \out\()spare0q + C4q .req \out\()spare4q + + eor5 C0, \out\()ka, \out\()ga, \out\()ma, \out\()ba, \out\()sa, tmp + eor5 C2, \out\()ki, \out\()gi, \out\()mi, \out\()bi, \out\()si, tmp + eor5 C4, \out\()ku, \out\()gu, \out\()mu, \out\()bu, \out\()su, tmp + eor5 
C1, \out\()ke, \out\()ge, \out\()me, \out\()be, \out\()se, tmp + eor5 C3, \out\()ko, \out\()go, \out\()mo, \out\()bo, \out\()so, tmp + +.endm +.macro keccak_f1600_round_core out in + + rax1_m1 E1c, C0, C2 SEP save(E1c) + xar_m1 vBgo, \in\()me /* used at block 3 */, E1c, 19 + rax1_m1 E3c, C2, C4 SEP save(E3c) + rax1_m1 E0c, C4, C1 SEP save(E0c) + xar_m1 vBgi, \in\()ka /* used at block 2 */, E0c, 61 + rax1_m1 E2c, C1, C3 SEP save(E2c) + xar_m1 vBga, \in\()bo /* used at block 4 */, E3c, 36 + rax1_m1 E4c, C3, C0 SEP save(E4c) + + xar_m1 vBge, \in\()gu /* used at block 1 */, E4c, 44 + xar_m1 vBgu, \in\()si /* used at block 5 */, E2c, 3 + + bcax_m1 \out\()ga, vBga, vBgi, vBge + bcax_m1 \out\()ge, vBge, vBgo, vBgi + bcax_m1 \out\()gi, vBgi, vBgu, vBgo + bcax_m1 \out\()go, vBgo, vBga, vBgu + bcax_m1 \out\()gu, vBgu, vBge, vBga + restore(E4u) + xar_m1 vBko, \in\()mu /* used at block 3 */, E4u, 56 SEP restore(E1u) + xar_m1 vBka, \in\()be /* used at block 4 */, E1u, 63 SEP restore(E2u) + xar_m1 vBke, \in\()gi /* not used */, E2u, 58 SEP restore(E0u) + xar_m1 vBku, \in\()sa /* used at block 5 */, E0u, 46 SEP restore(E3u) + xar_m1 vBki, \in\()ko /* used at block 2 */, E3u, 39 + + bcax_m1 \out\()ke, vBke, vBko, vBki + bcax_m1 \out\()ki, vBki, vBku, vBko + bcax_m1 \out\()ku, vBku, vBke, vBka + bcax_m1 \out\()ko, vBko, vBka, vBku + bcax_m1 \out\()ka, vBka, vBki, vBke + + // Can use: Abo, Asi, Abe, Asa; Abu, Aso + restore(E3u) + xar_m1 vBmu, \in\()so /* used at block 5 */, E3u, 8 SEP restore(E4u) + xar_m1 vBma, \in\()bu /* used at block 4 */, E4u, 37 SEP restore(E2u) + xar_m1 vBmo, \in\()mi /* used at block 3 */, E2u, 49 SEP restore(E1u) + xar_m1 vBmi, \in\()ke /* not used */, E1u, 54 SEP restore(E0u) + xar_m1 vBme, \in\()ga /* not used */, E0u, 28 + + bcax_m1 \out\()ma, vBma, vBmi, vBme + bcax_m1 \out\()mo, vBmo, vBma, vBmu + bcax_m1 \out\()me, vBme, vBmo, vBmi + bcax_m1 \out\()mi, vBmi, vBmu, vBmo + bcax_m1 \out\()mu, vBmu, vBme, vBma + + // Can use: Asi, Asa, Aso, Asu, Amo + 
restore(E0u) + eor2 vBba, \in\()ba /* used at block 4 */, E0u SEP restore(E2u) + xar_m1 vBbi, \in\()ki /* not used* */, E2u, 21 SEP restore(E3u) + xar_m1 vBbo, \in\()mo /* not used+ */, E3u, 43 SEP restore(E4u) + xar_m1 vBbu, \in\()su /* used at block 5 */, E4u, 50 SEP restore(E1u) + xar_m1 vBbe, \in\()ge /* not used */, E1u, 20 + + bcax_m1 \out\()ba, vBba, vBbi, vBbe + ld1r {tmp.2d}, [const_addr], #8 + eor2 \out\()ba, \out\()ba, tmp + bcax_m1 \out\()be, vBbe, vBbo, vBbi + bcax_m1 \out\()bo, vBbo, vBba, vBbu + bcax_m1 \out\()bu, vBbu, vBbe, vBba + bcax_m1 \out\()bi, vBbi, vBbu, vBbo + + // Can use: Amo, Age, Abi, Ama, Ago, Aku + restore(E2u) + xar_m1 vBsa, \in\()bi /* not used+ */, E2u, 2 SEP restore(E0u) + xar_m1 vBso, \in\()ma /* not used+ */, E0u, 23 SEP restore(E3u) + xar_m1 vBse, \in\()go /* not used+ */, E3u, 9 SEP restore(E4u) + xar_m1 vBsi, \in\()ku /* not used */, E4u, 25 SEP restore(E1u) + xar_m1 vBsu, \in\()se /* used at block 5 */, E1u, 62 + + bcax_m1_d \out\()sa, vBsa, vBsi, vBse + bcax_m1_d \out\()se, vBse, vBso, vBsi + bcax_m1_d \out\()si, vBsi, vBsu, vBso + bcax_m1_d \out\()so, vBso, vBsa, vBsu + bcax_m1_d \out\()su, vBsu, vBse, vBsa + +.endm + +.macro keccak_f1600_round_first out in + keccak_f1600_round_pre \out, \in + keccak_f1600_round_core \out, \in + keccak_f1600_round_post \out, \in +.endm + +.macro keccak_f1600_round_inner out in + keccak_f1600_round_core \out, \in + keccak_f1600_round_post \out, \in +.endm + +.macro keccak_f1600_round_last out in + keccak_f1600_round_core \out, \in +.endm + +.macro keccak_f1600_round_inner_optim out in + + rax1_m1 E1c, C0, C2 SEP save(E1c) + xar_m1 vBgo, \in\()me /* used at block 3 */, E1c, 19 + rax1_m1 E3c, C2, C4 SEP save(E3c) + rax1_m1 E0c, C4, C1 SEP save(E0c) + xar_m1 vBgi, \in\()ka /* used at block 2 */, E0c, 61 + rax1_m1 E2c, C1, C3 SEP save(E2c) + xar_m1 vBga, \in\()bo /* used at block 4 */, E3c, 36 + rax1_m1 E4c, C3, C0 SEP save(E4c) + + xar_m1 vBge, \in\()gu /* used at block 1 */, E4c, 44 + xar_m1 
vBgu, \in\()si /* used at block 5 */, E2c, 3 + + bcax_m1 \out\()ga, vBga, vBgi, vBge + bcax_m1 \out\()ge, vBge, vBgo, vBgi + bcax_m1 \out\()gi, vBgi, vBgu, vBgo + bcax_m1 \out\()go, vBgo, vBga, vBgu + bcax_m1 \out\()gu, vBgu, vBge, vBga + restore(E4u) + xar_m1 vBko, \in\()mu /* used at block 3 */, E4u, 56 SEP restore(E1u) + xar_m1 vBka, \in\()be /* used at block 4 */, E1u, 63 SEP restore(E2u) + xar_m1 vBke, \in\()gi /* not used */, E2u, 58 SEP restore(E0u) + xar_m1 vBku, \in\()sa /* used at block 5 */, E0u, 46 SEP restore(E3u) + xar_m1 vBki, \in\()ko /* used at block 2 */, E3u, 39 + + bcax_m1 \out\()ke, vBke, vBko, vBki + bcax_m1 \out\()ki, vBki, vBku, vBko + bcax_m1 \out\()ku, vBku, vBke, vBka + bcax_m1 \out\()ko, vBko, vBka, vBku + bcax_m1 \out\()ka, vBka, vBki, vBke + + // Can use: Abo, Asi, Abe, Asa; Abu, Aso + restore(E3u) + xar_m1 vBmu, \in\()so /* used at block 5 */, E3u, 8 SEP restore(E4u) + xar_m1 vBma, \in\()bu /* used at block 4 */, E4u, 37 SEP restore(E2u) + xar_m1 vBmo, \in\()mi /* used at block 3 */, E2u, 49 SEP restore(E1u) + xar_m1 vBmi, \in\()ke /* not used */, E1u, 54 SEP restore(E0u) + xar_m1 vBme, \in\()ga /* not used */, E0u, 28 + + bcax_m1 \out\()ma, vBma, vBmi, vBme + bcax_m1 \out\()mo, vBmo, vBma, vBmu + bcax_m1 \out\()me, vBme, vBmo, vBmi + bcax_m1 \out\()mi, vBmi, vBmu, vBmo + bcax_m1 \out\()mu, vBmu, vBme, vBma + + // Can use: Asi, Asa, Aso, Asu, Amo + restore(E0u) + eor2 vBba, \in\()ba /* used at block 4 */, E0u SEP restore(E2u) + xar_m1 vBbi, \in\()ki /* not used* */, E2u, 21 SEP restore(E3u) + xar_m1 vBbo, \in\()mo /* not used+ */, E3u, 43 SEP restore(E4u) + xar_m1 vBbu, \in\()su /* used at block 5 */, E4u, 50 SEP restore(E1u) + xar_m1 vBbe, \in\()ge /* not used */, E1u, 20 + + .unreq C0 + .unreq C0q + C0 .req \out\()spare3 + C0q .req \out\()spare3q + + eor2 C0, \out\()ka, \out\()ga + eor2 C0, C0, \out\()ma + + bcax_m1 \out\()ba, vBba, vBbi, vBbe + ld1r {tmp.2d}, [const_addr], #8 + eor2 \out\()ba, \out\()ba, tmp + bcax_m1 \out\()be, 
vBbe, vBbo, vBbi + bcax_m1 \out\()bo, vBbo, vBba, vBbu + bcax_m1 \out\()bu, vBbu, vBbe, vBba + bcax_m1 \out\()bi, vBbi, vBbu, vBbo + + eor2 C0, C0, \out\()ba + + // Can use: Amo, Age, Abi, Ama, Ago, Aku + restore(E2u) + xar_m1 vBsa, \in\()bi /* not used+ */, E2u, 2 SEP restore(E0u) + xar_m1 vBso, \in\()ma /* not used+ */, E0u, 23 SEP restore(E3u) + xar_m1 vBse, \in\()go /* not used+ */, E3u, 9 SEP restore(E4u) + xar_m1 vBsi, \in\()ku /* not used */, E4u, 25 SEP restore(E1u) + + .unreq C2 + .unreq C2q + C2 .req \out\()spare2 + C2q .req \out\()spare2q + + eor2 C2, \out\()ki, \out\()gi + eor2 C2, C2, \out\()mi + eor2 C2, C2, \out\()bi + + xar_m1 vBsu, \in\()se /* used at block 5 */, E1u, 62 + + bcax_m1_d \out\()sa, vBsa, vBsi, vBse + eor2 C0, C0, \out\()sa + bcax_m1_d \out\()se, vBse, vBso, vBsi + bcax_m1_d \out\()si, vBsi, vBsu, vBso + eor2 C2, C2, \out\()si + bcax_m1_d \out\()so, vBso, vBsa, vBsu + bcax_m1_d \out\()su, vBsu, vBse, vBsa + + .unreq C1 + .unreq C1q + C1 .req \out\()spare1 + C1q .req \out\()spare1q + + + .unreq C3 + .unreq C4 + .unreq C3q + .unreq C4q + + C3 .req \out\()spare0 + C4 .req \out\()spare4 + C3q .req \out\()spare0q + C4q .req \out\()spare4q + +// eor5 C0, \out\()ka, \out\()ga, \out\()ma, \out\()ba, \out\()sa, tmp +// eor5 C2, \out\()ki, \out\()gi, \out\()mi, \out\()bi, \out\()si, tmp + eor5 C4, \out\()ku, \out\()gu, \out\()mu, \out\()bu, \out\()su, tmp + eor5 C1, \out\()ke, \out\()ge, \out\()me, \out\()be, \out\()se, tmp + eor5 C3, \out\()ko, \out\()go, \out\()mo, \out\()bo, \out\()so, tmp + +.endm + +.text +.align 4 +.global keccak_f1600_x2_v84a_asm_v2p6 +.global _keccak_f1600_x2_v84a_asm_v2p6 + +#define KECCAK_F1600_ROUNDS 24 + +keccak_f1600_x2_v84a_asm_v2p6: +_keccak_f1600_x2_v84a_asm_v2p6: + alloc_stack + save_vregs + load_constant_ptr + load_input + + + //mov count, #(KECCAK_F1600_ROUNDS-2) + mov count, #24 +loop: + declare_remappings A1, A + keccak_f1600_round_first A1, A +// keccak_f1600_round_pre A1 A +// keccak_f1600_round_core A1 A 
+// keccak_f1600_round_post A1 A + undeclare_remappings A1, A + + declare_remappings A2, A1 +// keccak_f1600_round_pre A2 A1 +// keccak_f1600_round_core A2 A1 +// keccak_f1600_round_post A2 A1 + keccak_f1600_round_inner_optim A2, A1 + undeclare_remappings A2, A1 + + declare_remappings A3, A2 +// keccak_f1600_round_pre A3 A2 +// keccak_f1600_round_core A3 A2 +// keccak_f1600_round_post A3 A2 + keccak_f1600_round_inner_optim A3, A2 + undeclare_remappings A3, A2 + + declare_remappings A4, A3 +// keccak_f1600_round_pre A4 A3 + keccak_f1600_round_last A4, A3 +// keccak_f1600_round_post A4 A3 + undeclare_remappings A4, A3 + + transfer_uncommon A, A4 + + sub count, count, #4 + cbnz count, loop + + store_input + restore_vregs + free_stack + ret diff --git a/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2pp0.s b/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2pp0.s new file mode 100644 index 0000000..2a994b5 --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2pp0.s @@ -0,0 +1,729 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. 
*/ + Aba .req v0 + Abe .req v1 + Abi .req v2 + Abo .req v3 + Abu .req v4 + Aga .req v5 + Age .req v6 + Agi .req v7 + Ago .req v8 + Agu .req v9 + Aka .req v10 + Ake .req v11 + Aki .req v12 + Ako .req v13 + Aku .req v14 + Ama .req v15 + Ame .req v16 + Ami .req v17 + Amo .req v18 + Amu .req v19 + Asa .req v20 + Ase .req v21 + Asi .req v22 + Aso .req v23 + Asu .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Abiq .req q2 + Aboq .req q3 + Abuq .req q4 + Agaq .req q5 + Ageq .req q6 + Agiq .req q7 + Agoq .req q8 + Aguq .req q9 + Akaq .req q10 + Akeq .req q11 + Akiq .req q12 + Akoq .req q13 + Akuq .req q14 + Amaq .req q15 + Ameq .req q16 + Amiq .req q17 + Amoq .req q18 + Amuq .req q19 + Asaq .req q20 + Aseq .req q21 + Asiq .req q22 + Asoq .req q23 + Asuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v27 + C1 .req v28 + C2 .req v29 + C3 .req v30 + C4 .req v31 + + C0q .req q27 + C1q .req q28 + C2q .req q29 + C3q .req q30 + C4q .req q31 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vBba .req v25 // fresh + vBbe .req v26 // fresh + vBbi .req Abi + vBbo .req Abo + vBbu .req Abu + vBga .req Aka + vBge .req Ake + vBgi .req Agi + vBgo .req Ago + vBgu .req Agu + vBka .req Ama + vBke .req Ame + vBki .req Aki + vBko .req Ako + vBku .req Aku + vBma .req Asa + vBme .req Ase + vBmi .req Ami + vBmo .req Amo + vBmu .req Amu + vBsa .req Aba + vBse .req Abe + vBsi .req Asi + vBso .req Aso + vBsu .req Asu + + vBbaq .req q25 // fresh + vBbeq .req q26 // fresh + vBbiq .req Abiq + vBboq .req Aboq + vBbuq .req Abuq + vBgaq .req Akaq + vBgeq .req Akeq + vBgiq .req Agiq + vBgoq .req Agoq + vBguq .req Aguq + vBkaq .req Amaq + vBkeq .req Ameq + vBkiq .req Akiq + vBkoq .req Akoq + vBkuq .req Akuq + vBmaq .req Asaq + vBmeq .req Aseq + vBmiq .req Amiq + vBmoq .req Amoq + vBmuq .req Amuq + vBsaq .req Abaq + vBseq .req Abeq + vBsiq .req Asiq + vBsoq .req Asoq + vBsuq .req Asuq + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 
0..4 */ + E0 .req C4 + E1 .req C0 + E2 .req vBbe // fresh + E3 .req C2 + E4 .req C3 + + E0q .req C4q + E1q .req C0q + E2q .req vBbeq // fresh + E3q .req C2q + E4q .req C3q + + +/************************ MACROS ****************************/ + +.macro load_input + ldp Abaq, Abeq, [input_addr, #(2*8*0)] + ldp Abiq, Aboq, [input_addr, #(2*8*2)] + ldp Abuq, Agaq, [input_addr, #(2*8*4)] + ldp Ageq, Agiq, [input_addr, #(2*8*6)] + ldp Agoq, Aguq, [input_addr, #(2*8*8)] + ldp Akaq, Akeq, [input_addr, #(2*8*10)] + ldp Akiq, Akoq, [input_addr, #(2*8*12)] + ldp Akuq, Amaq, [input_addr, #(2*8*14)] + ldp Ameq, Amiq, [input_addr, #(2*8*16)] + ldp Amoq, Amuq, [input_addr, #(2*8*18)] + ldp Asaq, Aseq, [input_addr, #(2*8*20)] + ldp Asiq, Asoq, [input_addr, #(2*8*22)] + ldr Asuq, [input_addr, #(2*8*24)] + + // ldr Abaq, [input_addr, #(2*8*0)] + // ldr Abeq, [input_addr, #(2*8*1)] + // ldr Abiq, [input_addr, #(2*8*2)] + // ldr Aboq, [input_addr, #(2*8*3)] + // ldr Abuq, [input_addr, #(2*8*4)] + // ldr Agaq, [input_addr, #(2*8*5)] + // ldr Ageq, [input_addr, #(2*8*6)] + // ldr Agiq, [input_addr, #(2*8*7)] + // ldr Agoq, [input_addr, #(2*8*8)] + // ldr Aguq, [input_addr, #(2*8*9)] + // ldr Akaq, [input_addr, #(2*8*10)] + // ldr Akeq, [input_addr, #(2*8*11)] + // ldr Akiq, [input_addr, #(2*8*12)] + // ldr Akoq, [input_addr, #(2*8*13)] + // ldr Akuq, [input_addr, #(2*8*14)] + // ldr Amaq, [input_addr, #(2*8*15)] + // ldr Ameq, [input_addr, #(2*8*16)] + // ldr Amiq, [input_addr, #(2*8*17)] + // ldr Amoq, [input_addr, #(2*8*18)] + // ldr Amuq, [input_addr, #(2*8*19)] + // ldr Asaq, [input_addr, #(2*8*20)] + // ldr Aseq, [input_addr, #(2*8*21)] + // ldr Asiq, [input_addr, #(2*8*22)] + // ldr Asoq, [input_addr, #(2*8*23)] + // ldr Asuq, [input_addr, #(2*8*24)] +.endm + +.macro store_input + str Abaq, [input_addr, #(2*8*0)] + str Abeq, [input_addr, #(2*8*1)] + str Abiq, [input_addr, #(2*8*2)] + str Aboq, [input_addr, #(2*8*3)] + str Abuq, [input_addr, #(2*8*4)] + str Agaq, [input_addr, 
#(2*8*5)] + str Ageq, [input_addr, #(2*8*6)] + str Agiq, [input_addr, #(2*8*7)] + str Agoq, [input_addr, #(2*8*8)] + str Aguq, [input_addr, #(2*8*9)] + str Akaq, [input_addr, #(2*8*10)] + str Akeq, [input_addr, #(2*8*11)] + str Akiq, [input_addr, #(2*8*12)] + str Akoq, [input_addr, #(2*8*13)] + str Akuq, [input_addr, #(2*8*14)] + str Amaq, [input_addr, #(2*8*15)] + str Ameq, [input_addr, #(2*8*16)] + str Amiq, [input_addr, #(2*8*17)] + str Amoq, [input_addr, #(2*8*18)] + str Amuq, [input_addr, #(2*8*19)] + str Asaq, [input_addr, #(2*8*20)] + str Aseq, [input_addr, #(2*8*21)] + str Asiq, [input_addr, #(2*8*22)] + str Asoq, [input_addr, #(2*8*23)] + str Asuq, [input_addr, #(2*8*24)] +.endm + +#define STACK_SIZE (16*4 + 16*31) +#define STACK_BASE_VREGS 0 +#define STACK_BASE_TMP 16*4 + +#define Aga_offset 0 +#define E0_offset 1 +#define E1_offset 2 +#define E2_offset 3 +#define E3_offset 4 +#define E4_offset 5 +#define Ame_offset 7 +#define Agi_offset 8 +#define Aka_offset 9 +#define Abo_offset 10 +#define Amo_offset 11 +#define Ami_offset 12 +#define Ake_offset 13 +#define Agu_offset 14 +#define Asi_offset 15 +#define Aku_offset 16 +#define Asa_offset 17 +#define Abu_offset 18 +#define Asu_offset 19 +#define Ase_offset 20 +//#define Aga_offset 21 +#define Age_offset 22 +#define vBgo_offset 23 +#define vBke_offset 24 +#define vBgi_offset 25 +#define vBga_offset 26 +#define vBbo_offset 27 +#define vBmo_offset 28 +#define vBmi_offset 29 +#define vBge_offset 30 + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + 
+.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +/* Macros emulating the v8.4-A SHA-3 instructions (EOR3, RAX1, XAR, BCAX) with baseline Neon instructions. rax1_m1 and bcax_m1 use the register aliased as tmp as scratch; xar_m1 overwrites its s0 operand. */ + +.macro eor3_m1_0 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m1_1 d s0 s1 s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro eor3_m1 d s0 s1 s2 + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +.macro rax1_m1 d s0 s1 + add tmp.2d, \s1\().2d, \s1\().2d + sri tmp.2d, \s1\().2d, #63 + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +.macro xar_m1 d s0 s1 imm + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) +.endm + +.macro xar_m1_0 d s0 s1 imm tmp + eor \tmp\().16b, \s0\().16b, \s1\().16b +.endm + +.macro xar_m1_1 d s0 s1 imm tmp + shl \d\().2d, \tmp\().2d, #(64-\imm) +.endm + +.macro xar_m1_2 d s0 s1 imm tmp + sri \d\().2d, \tmp\().2d, #(\imm) +.endm + +.macro bcax_m1 d s0 s1 s2 + bic tmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +.macro refresh d + mov \d\().16b, \d\().16b +.endm +/* Keccak-f1600 round */ + +.macro keccak_f1600_round_pre + + /* 10 EOR3, so 20 individual EOR */ + + eor3_m1_0 C0, Aba, Aga, Aka + eor3_m1_0 C1, Abe, Age, Ake + eor3_m1_0 C2, Abi, Agi, Aki + eor3_m1_0 C3, Abo, Ago, Ako + eor3_m1_0 C4, Abu, Agu, Aku + eor3_m1_1 C0, Aba, Aga, Aka + eor3_m1_1 C1, Abe, Age, Ake + eor3_m1_1 C2, Abi, Agi, Aki + eor3_m1_1 C3, Abo, Ago, Ako + eor3_m1_1 C4, Abu, Agu, Aku + eor3_m1_0 C0, C0, Ama, Asa + eor3_m1_0 C1, C1, Ame, Ase + eor3_m1_0 C2, C2, Ami, Asi + eor3_m1_0 C3, C3, Amo, Aso + eor3_m1_0 C4, C4, Amu, Asu + eor3_m1_1 C0, C0, Ama, Asa + eor3_m1_1 C1, C1, Ame, Ase + eor3_m1_1 C2, C2, Ami, Asi + eor3_m1_1 C3, C3, Amo, Aso + eor3_m1_1 C4, C4, Amu, Asu + +.endm + +.macro 
keccak_f1600_round + + /* 10 EOR3, so 20 individual EOR */ + + eor3_m1_0 C0, Aba, Aga, Aka + eor3_m1_0 C1, Abe, Age, Ake + eor3_m1_0 C2, Abi, Agi, Aki + eor3_m1_0 C3, Abo, Ago, Ako + eor3_m1_0 C4, Abu, Agu, Aku + eor3_m1_1 C0, Aba, Aga, Aka + eor3_m1_1 C1, Abe, Age, Ake + eor3_m1_1 C2, Abi, Agi, Aki + eor3_m1_1 C3, Abo, Ago, Ako + eor3_m1_1 C4, Abu, Agu, Aku + eor3_m1_0 C0, C0, Ama, Asa + eor3_m1_0 C1, C1, Ame, Ase + eor3_m1_0 C2, C2, Ami, Asi + eor3_m1_0 C3, C3, Amo, Aso + eor3_m1_0 C4, C4, Amu, Asu + eor3_m1_1 C0, C0, Ama, Asa + eor3_m1_1 C1, C1, Ame, Ase + eor3_m1_1 C2, C2, Ami, Asi + eor3_m1_1 C3, C3, Amo, Aso + eor3_m1_1 C4, C4, Amu, Asu + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + .unreq tmp + + /* 25x XAR, 75 in total */ + + tmp .req C1 + + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m1 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + xar_m1 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m1 vBme, Aga, E0, 28 + xar_m1 vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Aga, vBga, vBgi, vBge + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + bcax_m1 Aku, vBku, vBke, vBka + bcax_m1 Ama, vBma, vBmi, vBme + bcax_m1 Ame, vBme, vBmo, vBmi + bcax_m1 Ami, vBmi, 
vBmu, vBmo + bcax_m1 Amo, vBmo, vBma, vBmu + bcax_m1 Amu, vBmu, vBme, vBma + bcax_m1 Asa, vBsa, vBsi, vBse + bcax_m1 Ase, vBse, vBso, vBsi + bcax_m1 Asi, vBsi, vBsu, vBso + bcax_m1 Aso, vBso, vBsa, vBsu + bcax_m1 Asu, vBsu, vBse, vBsa + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + bcax_m1 Abu, vBbu, vBbe, vBba + + // iota step + ld1r {tmp.2d}, [const_addr], #8 + eor Aba.16b, Aba.16b, tmp.16b + + .unreq tmp + +.endm + +.macro keccak_f1600_round_core + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + + /* 25x XAR, 75 in total */ + + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m1 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + xar_m1 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m1 vBme, Aga, E0, 28 + xar_m1 vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + .unreq tmp + tmp .req C1 + bcax_m1 Aga, vBga, vBgi, vBge + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + .unreq tmp + + eor2 C0, Aka, Aga + save(Aga) + + tmp .req Aga + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + eor2 C1, Ake, Age + bcax_m1 Aku, vBku, vBke, vBka + eor2 C2, Aki, Agi + bcax_m1 Ama, vBma, vBmi, vBme + eor2 C3, Ako, Ago + bcax_m1 Ame, vBme, vBmo, vBmi + eor2 C4, Aku, 
Agu + bcax_m1 Ami, vBmi, vBmu, vBmo + eor2 C0, C0, Ama + bcax_m1 Amo, vBmo, vBma, vBmu + eor2 C1, C1, Ame + bcax_m1 Amu, vBmu, vBme, vBma + eor2 C2, C2, Ami + bcax_m1 Asa, vBsa, vBsi, vBse + eor2 C3, C3, Amo + bcax_m1 Ase, vBse, vBso, vBsi + eor2 C4, C4, Amu + bcax_m1 Asi, vBsi, vBsu, vBso + eor2 C0, C0, Asa + bcax_m1 Aso, vBso, vBsa, vBsu + eor2 C1, C1, Ase + bcax_m1 Asu, vBsu, vBse, vBsa + eor2 C2, C2, Asi + eor2 C3, C3, Aso + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + + // iota step + ld1r {tmp.2d}, [const_addr], #8 + eor Aba.16b, Aba.16b, tmp.16b + eor2 C4, C4, Asu + + eor2 C0, C0, Aba + bcax_m1 Abi, vBbi, vBbu, vBbo + eor2 C1, C1, Abe + bcax_m1 Abo, vBbo, vBba, vBbu + eor2 C2, C2, Abi + bcax_m1 Abu, vBbu, vBbe, vBba + eor2 C3, C3, Abo + eor2 C4, C4, Abu + + restore(Aga) + .unreq tmp + +.endm + +.macro keccak_f1600_round_post + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + .unreq tmp + + /* 25x XAR, 75 in total */ + + tmp .req C1 + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m1 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + xar_m1 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m1 vBme, Aga, E0, 28 + xar_m1 vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Aga, vBga, vBgi, vBge + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + 
bcax_m1 Ake, vBke, vBko, vBki + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + bcax_m1 Aku, vBku, vBke, vBka + bcax_m1 Ama, vBma, vBmi, vBme + bcax_m1 Ame, vBme, vBmo, vBmi + bcax_m1 Ami, vBmi, vBmu, vBmo + bcax_m1 Amo, vBmo, vBma, vBmu + bcax_m1 Amu, vBmu, vBme, vBma + bcax_m1 Asa, vBsa, vBsi, vBse + bcax_m1 Ase, vBse, vBso, vBsi + bcax_m1 Asi, vBsi, vBsu, vBso + bcax_m1 Aso, vBso, vBsa, vBsu + bcax_m1 Asu, vBsu, vBse, vBsa + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + bcax_m1 Abu, vBbu, vBbe, vBba + + // iota step + ld1r {tmp.2d}, [const_addr], #8 + eor Aba.16b, Aba.16b, tmp.16b + + .unreq tmp + +.endm + + +.text +.align 4 +.global keccak_f1600_x2_v84a_asm_v2pp0 +.global _keccak_f1600_x2_v84a_asm_v2pp0 + +#define KECCAK_F1600_ROUNDS 24 + +keccak_f1600_x2_v84a_asm_v2pp0: +_keccak_f1600_x2_v84a_asm_v2pp0: + alloc_stack + save_vregs + load_constant_ptr + load_input + + //mov count, #(KECCAK_F1600_ROUNDS-2) + mov count, #11 + keccak_f1600_round_pre +loop: + keccak_f1600_round_core + keccak_f1600_round_core + sub count, count, #1 + cbnz count, loop + + keccak_f1600_round_core + keccak_f1600_round_post + store_input + restore_vregs + free_stack + ret diff --git a/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2pp1.s b/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2pp1.s new file mode 100644 index 0000000..f8650ed --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2pp1.s @@ -0,0 +1,755 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the 
Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .align(8) +_round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000080008081 + 
.quad 0x8000000000008009 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round.
*/ + Aba .req v0 + Abe .req v1 + Abi .req v2 + Abo .req v3 + Abu .req v4 + Aga .req v5 + Age .req v6 + Agi .req v7 + Ago .req v8 + Agu .req v9 + Aka .req v10 + Ake .req v11 + Aki .req v12 + Ako .req v13 + Aku .req v14 + Ama .req v15 + Ame .req v16 + Ami .req v17 + Amo .req v18 + Amu .req v19 + Asa .req v20 + Ase .req v21 + Asi .req v22 + Aso .req v23 + Asu .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Abiq .req q2 + Aboq .req q3 + Abuq .req q4 + Agaq .req q5 + Ageq .req q6 + Agiq .req q7 + Agoq .req q8 + Aguq .req q9 + Akaq .req q10 + Akeq .req q11 + Akiq .req q12 + Akoq .req q13 + Akuq .req q14 + Amaq .req q15 + Ameq .req q16 + Amiq .req q17 + Amoq .req q18 + Amuq .req q19 + Asaq .req q20 + Aseq .req q21 + Asiq .req q22 + Asoq .req q23 + Asuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v27 + C1 .req v28 + C2 .req v29 + C3 .req v30 + C4 .req v31 + + C0q .req q27 + C1q .req q28 + C2q .req q29 + C3q .req q30 + C4q .req q31 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vBba .req v25 // fresh + vBbe .req v26 // fresh + vBbi .req Abi + vBbo .req Abo + vBbu .req Abu + vBga .req Aka + vBge .req Ake + vBgi .req Agi + vBgo .req Ago + vBgu .req Agu + vBka .req Ama + vBke .req Ame + vBki .req Aki + vBko .req Ako + vBku .req Aku + vBma .req Asa + vBme .req Ase + vBmi .req Ami + vBmo .req Amo + vBmu .req Amu + vBsa .req Aba + vBse .req Abe + vBsi .req Asi + vBso .req Aso + vBsu .req Asu + + vBbaq .req q25 // fresh + vBbeq .req q26 // fresh + vBbiq .req Abiq + vBboq .req Aboq + vBbuq .req Abuq + vBgaq .req Akaq + vBgeq .req Akeq + vBgiq .req Agiq + vBgoq .req Agoq + vBguq .req Aguq + vBkaq .req Amaq + vBkeq .req Ameq + vBkiq .req Akiq + vBkoq .req Akoq + vBkuq .req Akuq + vBmaq .req Asaq + vBmeq .req Aseq + vBmiq .req Amiq + vBmoq .req Amoq + vBmuq .req Amuq + vBsaq .req Abaq + vBseq .req Abeq + vBsiq .req Asiq + vBsoq .req Asoq + vBsuq .req Asuq + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 
0..4 */ + E0 .req C4 + E1 .req C0 + E2 .req vBbe // fresh + E3 .req C2 + E4 .req C3 + + E0q .req C4q + E1q .req C0q + E2q .req vBbeq // fresh + E3q .req C2q + E4q .req C3q + + +/************************ MACROS ****************************/ + +.macro load_input + ldp Abaq, Abeq, [input_addr, #(2*8*0)] + ldp Abiq, Aboq, [input_addr, #(2*8*2)] + ldp Abuq, Agaq, [input_addr, #(2*8*4)] + ldp Ageq, Agiq, [input_addr, #(2*8*6)] + ldp Agoq, Aguq, [input_addr, #(2*8*8)] + ldp Akaq, Akeq, [input_addr, #(2*8*10)] + ldp Akiq, Akoq, [input_addr, #(2*8*12)] + ldp Akuq, Amaq, [input_addr, #(2*8*14)] + ldp Ameq, Amiq, [input_addr, #(2*8*16)] + ldp Amoq, Amuq, [input_addr, #(2*8*18)] + ldp Asaq, Aseq, [input_addr, #(2*8*20)] + ldp Asiq, Asoq, [input_addr, #(2*8*22)] + ldr Asuq, [input_addr, #(2*8*24)] +.endm + +.macro store_input + str Abaq, [input_addr, #(2*8*0)] + str Abeq, [input_addr, #(2*8*1)] + str Abiq, [input_addr, #(2*8*2)] + str Aboq, [input_addr, #(2*8*3)] + str Abuq, [input_addr, #(2*8*4)] + str Agaq, [input_addr, #(2*8*5)] + str Ageq, [input_addr, #(2*8*6)] + str Agiq, [input_addr, #(2*8*7)] + str Agoq, [input_addr, #(2*8*8)] + str Aguq, [input_addr, #(2*8*9)] + str Akaq, [input_addr, #(2*8*10)] + str Akeq, [input_addr, #(2*8*11)] + str Akiq, [input_addr, #(2*8*12)] + str Akoq, [input_addr, #(2*8*13)] + str Akuq, [input_addr, #(2*8*14)] + str Amaq, [input_addr, #(2*8*15)] + str Ameq, [input_addr, #(2*8*16)] + str Amiq, [input_addr, #(2*8*17)] + str Amoq, [input_addr, #(2*8*18)] + str Amuq, [input_addr, #(2*8*19)] + str Asaq, [input_addr, #(2*8*20)] + str Aseq, [input_addr, #(2*8*21)] + str Asiq, [input_addr, #(2*8*22)] + str Asoq, [input_addr, #(2*8*23)] + str Asuq, [input_addr, #(2*8*24)] +.endm + +#define STACK_SIZE (16*4 + 16*31) +#define STACK_BASE_VREGS 0 +#define STACK_BASE_TMP 16*4 + +#define Aga_offset 0 +#define E0_offset 1 +#define E1_offset 2 +#define E2_offset 3 +#define E3_offset 4 +#define E4_offset 5 +#define Ame_offset 7 +#define Agi_offset 8 +#define 
Aka_offset 9 +#define Abo_offset 10 +#define Amo_offset 11 +#define Ami_offset 12 +#define Ake_offset 13 +#define Agu_offset 14 +#define Asi_offset 15 +#define Aku_offset 16 +#define Asa_offset 17 +#define Abu_offset 18 +#define Asu_offset 19 +#define Ase_offset 20 +//#define Aga_offset 21 +#define Age_offset 22 +#define vBgo_offset 23 +#define vBke_offset 24 +#define vBgi_offset 25 +#define vBga_offset 26 +#define vBbo_offset 27 +#define vBmo_offset 28 +#define vBmi_offset 29 +#define vBge_offset 30 + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +/* Emulation of the v8.4-A SHA-3 instructions (EOR3, RAX1, XAR, BCAX) using plain Neon instructions; rax1_m1 and bcax_m1 clobber the `tmp` alias */ + +.macro eor3_m1_0 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m1_1 d s0 s1 s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro eor3_m1 d s0 s1 s2 + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +.macro rax1_m1 d s0 s1 + // Use add instead of SHL #1 + add tmp.2d, \s1\().2d, \s1\().2d + sri tmp.2d, \s1\().2d, #63 + eor \d\().16b, tmp.16b, \s0\().16b +.endm + + .macro xar_m1 d s0 s1 imm + // Special cases where we can replace SHLs by ADDs + .if \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + .elseif
\imm == 62 + eor \s0\().16b, \s0\().16b, \s1\().16b + add \d\().2d, \s0\().2d, \s0\().2d + add \d\().2d, \d\().2d, \d\().2d + sri \d\().2d, \s0\().2d, #(62) + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + +.macro bcax_m1 d s0 s1 s2 + bic tmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +/* Keccak-f1600 round */ + +.macro keccak_f1600_round_pre + + /* 10 EOR3, so 20 individual EOR */ + + eor3_m1_0 C0, Aba, Aga, Aka + eor3_m1_0 C1, Abe, Age, Ake + eor3_m1_0 C2, Abi, Agi, Aki + eor3_m1_0 C3, Abo, Ago, Ako + eor3_m1_0 C4, Abu, Agu, Aku + eor3_m1_1 C0, Aba, Aga, Aka + eor3_m1_1 C1, Abe, Age, Ake + eor3_m1_1 C2, Abi, Agi, Aki + eor3_m1_1 C3, Abo, Ago, Ako + eor3_m1_1 C4, Abu, Agu, Aku + eor3_m1_0 C0, C0, Ama, Asa + eor3_m1_0 C1, C1, Ame, Ase + eor3_m1_0 C2, C2, Ami, Asi + eor3_m1_0 C3, C3, Amo, Aso + eor3_m1_0 C4, C4, Amu, Asu + eor3_m1_1 C0, C0, Ama, Asa + eor3_m1_1 C1, C1, Ame, Ase + eor3_m1_1 C2, C2, Ami, Asi + eor3_m1_1 C3, C3, Amo, Aso + eor3_m1_1 C4, C4, Amu, Asu + +.endm + +.macro keccak_f1600_round + + /* 10 EOR3, so 20 individual EOR */ + + eor3_m1_0 C0, Aba, Aga, Aka + eor3_m1_0 C1, Abe, Age, Ake + eor3_m1_0 C2, Abi, Agi, Aki + eor3_m1_0 C3, Abo, Ago, Ako + eor3_m1_0 C4, Abu, Agu, Aku + eor3_m1_1 C0, Aba, Aga, Aka + eor3_m1_1 C1, Abe, Age, Ake + eor3_m1_1 C2, Abi, Agi, Aki + eor3_m1_1 C3, Abo, Ago, Ako + eor3_m1_1 C4, Abu, Agu, Aku + eor3_m1_0 C0, C0, Ama, Asa + eor3_m1_0 C1, C1, Ame, Ase + eor3_m1_0 C2, C2, Ami, Asi + eor3_m1_0 C3, C3, Amo, Aso + eor3_m1_0 C4, C4, Amu, Asu + eor3_m1_1 C0, C0, Ama, Asa + eor3_m1_1 C1, C1, Ame, Ase + eor3_m1_1 C2, C2, Ami, Asi + eor3_m1_1 C3, C3, Amo, Aso + eor3_m1_1 C4, C4, Amu, Asu + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + .unreq tmp + + /* 25x XAR, 75 in total */ + + tmp .req C1 + 
tmpq .req C1q + + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m1 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + xar_m1 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m1 vBme, Aga, E0, 28 + xar_m1 vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Aga, vBga, vBgi, vBge + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + bcax_m1 Aku, vBku, vBke, vBka + bcax_m1 Ama, vBma, vBmi, vBme + bcax_m1 Ame, vBme, vBmo, vBmi + bcax_m1 Ami, vBmi, vBmu, vBmo + bcax_m1 Amo, vBmo, vBma, vBmu + bcax_m1 Amu, vBmu, vBme, vBma + bcax_m1 Asa, vBsa, vBsi, vBse + bcax_m1 Ase, vBse, vBso, vBsi + bcax_m1 Asi, vBsi, vBsu, vBso + bcax_m1 Aso, vBso, vBsa, vBsu + bcax_m1 Asu, vBsu, vBse, vBsa + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + bcax_m1 Abu, vBbu, vBbe, vBba + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + + .unreq tmp + .unreq tmpq + +.endm + +.macro keccak_f1600_round_core + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + + /* 25x XAR, 75 in total */ + + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 
+ xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m1 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + xar_m1 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m1 vBme, Aga, E0, 28 + xar_m1 vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + .unreq tmp + tmp .req C1 + bcax_m1 Aga, vBga, vBgi, vBge + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + .unreq tmp + + eor2 C0, Aka, Aga + save(Aga) + + tmp .req Aga + tmpq .req Agaq + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + eor2 C1, Ake, Age + bcax_m1 Aku, vBku, vBke, vBka + eor2 C2, Aki, Agi + bcax_m1 Ama, vBma, vBmi, vBme + eor2 C3, Ako, Ago + bcax_m1 Ame, vBme, vBmo, vBmi + eor2 C4, Aku, Agu + bcax_m1 Ami, vBmi, vBmu, vBmo + eor2 C0, C0, Ama + bcax_m1 Amo, vBmo, vBma, vBmu + eor2 C1, C1, Ame + bcax_m1 Amu, vBmu, vBme, vBma + eor2 C2, C2, Ami + bcax_m1 Asa, vBsa, vBsi, vBse + eor2 C3, C3, Amo + bcax_m1 Ase, vBse, vBso, vBsi + eor2 C4, C4, Amu + bcax_m1 Asi, vBsi, vBsu, vBso + eor2 C0, C0, Asa + bcax_m1 Aso, vBso, vBsa, vBsu + eor2 C1, C1, Ase + bcax_m1 Asu, vBsu, vBse, vBsa + eor2 C2, C2, Asi + eor2 C3, C3, Aso + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + eor2 C1, C1, Abe + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + eor2 C4, C4, Asu + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + eor2 C3, C3, Abo + eor2 C2, C2, Abi + eor2 C0, C0, Aba + bcax_m1 Abu, vBbu, vBbe, vBba + 
eor2 C4, C4, Abu + + restore(Aga) + .unreq tmp + .unreq tmpq + +.endm + +.macro keccak_f1600_round_post + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + .unreq tmp + + /* 25x XAR, 75 in total */ + + tmp .req C1 + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m1 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + xar_m1 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m1 vBme, Aga, E0, 28 + xar_m1 vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Aga, vBga, vBgi, vBge + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + bcax_m1 Aku, vBku, vBke, vBka + bcax_m1 Ama, vBma, vBmi, vBme + bcax_m1 Ame, vBme, vBmo, vBmi + bcax_m1 Ami, vBmi, vBmu, vBmo + bcax_m1 Amo, vBmo, vBma, vBmu + bcax_m1 Amu, vBmu, vBme, vBma + bcax_m1 Asa, vBsa, vBsi, vBse + bcax_m1 Ase, vBse, vBso, vBsi + bcax_m1 Asi, vBsi, vBsu, vBso + bcax_m1 Aso, vBso, vBsa, vBsu + bcax_m1 Asu, vBsu, vBse, vBsa + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + bcax_m1 Abu, vBbu, vBbe, vBba + + // iota step + ld1r {tmp.2d}, [const_addr], #8 + eor Aba.16b, Aba.16b, tmp.16b + + .unreq tmp + +.endm + + +.text +.align 4 +.global 
keccak_f1600_x2_v84a_asm_v2pp1 +.global _keccak_f1600_x2_v84a_asm_v2pp1 + +#define KECCAK_F1600_ROUNDS 24 + +keccak_f1600_x2_v84a_asm_v2pp1: +_keccak_f1600_x2_v84a_asm_v2pp1: + alloc_stack + save_vregs + load_constant_ptr + load_input + + //mov count, #(KECCAK_F1600_ROUNDS-2) + mov count, #11 + keccak_f1600_round_pre +loop: + keccak_f1600_round_core + keccak_f1600_round_core + sub count, count, #1 + cbnz count, loop + + keccak_f1600_round_core + keccak_f1600_round_post + store_input + restore_vregs + free_stack + ret diff --git a/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2pp2.s b/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2pp2.s new file mode 100644 index 0000000..8b76c2b --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2pp2.s @@ -0,0 +1,798 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .align(8) +_round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 
0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. */ + Aba .req v0 + Abe .req v1 + Abi .req v2 + Abo .req v3 + Abu .req v4 + Aga .req v5 + Age .req v6 + Agi .req v7 + Ago .req v8 + Agu .req v9 + Aka .req v10 + Ake .req v11 + Aki .req v12 + Ako .req v13 + Aku .req v14 + Ama .req v15 + Ame .req v16 + Ami .req v17 + Amo .req v18 + Amu .req v19 + Asa .req v20 + Ase .req v21 + Asi .req v22 + Aso .req v23 + Asu .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Abiq .req q2 + Aboq .req q3 + Abuq .req q4 + Agaq .req q5 + Ageq .req q6 + Agiq .req q7 + Agoq .req q8 + Aguq .req q9 + Akaq .req q10 + Akeq .req q11 + Akiq .req q12 + Akoq .req q13 + Akuq .req q14 + Amaq .req q15 + Ameq .req q16 + Amiq .req q17 + Amoq .req q18 + Amuq .req q19 + Asaq .req q20 + Aseq .req q21 + Asiq .req q22 + Asoq .req q23 + Asuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v27 + C1 .req v28 + C2 .req v29 + C3 .req v30 + C4 .req v31 + + C0q .req q27 + C1q .req q28 + C2q .req q29 + C3q .req q30 + C4q .req q31 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vBba .req v25 // fresh + vBbe .req v26 // fresh + vBbi .req Abi + vBbo .req Abo + vBbu .req Abu + vBga .req Aka + vBge .req Ake + vBgi .req Agi + vBgo .req Ago + vBgu .req Agu + vBka .req Ama + vBke .req Ame + vBki .req Aki + vBko .req
Ako + vBku .req Aku + vBma .req Asa + vBme .req Ase + vBmi .req Ami + vBmo .req Amo + vBmu .req Amu + vBsa .req Aba + vBse .req Abe + vBsi .req Asi + vBso .req Aso + vBsu .req Asu + + vBbaq .req q25 // fresh + vBbeq .req q26 // fresh + vBbiq .req Abiq + vBboq .req Aboq + vBbuq .req Abuq + vBgaq .req Akaq + vBgeq .req Akeq + vBgiq .req Agiq + vBgoq .req Agoq + vBguq .req Aguq + vBkaq .req Amaq + vBkeq .req Ameq + vBkiq .req Akiq + vBkoq .req Akoq + vBkuq .req Akuq + vBmaq .req Asaq + vBmeq .req Aseq + vBmiq .req Amiq + vBmoq .req Amoq + vBmuq .req Amuq + vBsaq .req Abaq + vBseq .req Abeq + vBsiq .req Asiq + vBsoq .req Asoq + vBsuq .req Asuq + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req C4 + E1 .req C0 + E2 .req vBbe // fresh + E3 .req C2 + E4 .req C3 + + E0q .req C4q + E1q .req C0q + E2q .req vBbeq // fresh + E3q .req C2q + E4q .req C3q + + +/************************ MACROS ****************************/ + +.macro load_input + ldp Abaq, Abeq, [input_addr, #(2*8*0)] + ldp Abiq, Aboq, [input_addr, #(2*8*2)] + ldp Abuq, Agaq, [input_addr, #(2*8*4)] + ldp Ageq, Agiq, [input_addr, #(2*8*6)] + ldp Agoq, Aguq, [input_addr, #(2*8*8)] + ldp Akaq, Akeq, [input_addr, #(2*8*10)] + ldp Akiq, Akoq, [input_addr, #(2*8*12)] + ldp Akuq, Amaq, [input_addr, #(2*8*14)] + ldp Ameq, Amiq, [input_addr, #(2*8*16)] + ldp Amoq, Amuq, [input_addr, #(2*8*18)] + ldp Asaq, Aseq, [input_addr, #(2*8*20)] + ldp Asiq, Asoq, [input_addr, #(2*8*22)] + ldr Asuq, [input_addr, #(2*8*24)] +.endm + +.macro store_input + str Abaq, [input_addr, #(2*8*0)] + str Abeq, [input_addr, #(2*8*1)] + str Abiq, [input_addr, #(2*8*2)] + str Aboq, [input_addr, #(2*8*3)] + str Abuq, [input_addr, #(2*8*4)] + str Agaq, [input_addr, #(2*8*5)] + str Ageq, [input_addr, #(2*8*6)] + str Agiq, [input_addr, #(2*8*7)] + str Agoq, [input_addr, #(2*8*8)] + str Aguq, [input_addr, #(2*8*9)] + str Akaq, [input_addr, #(2*8*10)] + str Akeq, [input_addr, #(2*8*11)] + str Akiq, [input_addr, #(2*8*12)] + str Akoq, 
[input_addr, #(2*8*13)] + str Akuq, [input_addr, #(2*8*14)] + str Amaq, [input_addr, #(2*8*15)] + str Ameq, [input_addr, #(2*8*16)] + str Amiq, [input_addr, #(2*8*17)] + str Amoq, [input_addr, #(2*8*18)] + str Amuq, [input_addr, #(2*8*19)] + str Asaq, [input_addr, #(2*8*20)] + str Aseq, [input_addr, #(2*8*21)] + str Asiq, [input_addr, #(2*8*22)] + str Asoq, [input_addr, #(2*8*23)] + str Asuq, [input_addr, #(2*8*24)] +.endm + +#define STACK_SIZE (16*4 + 16*34) +#define STACK_BASE_VREGS 0 +#define STACK_BASE_TMP 16*4 + +#define Aga_offset 0 +#define E0_offset 1 +#define E1_offset 2 +#define E2_offset 3 +#define E3_offset 4 +#define E4_offset 5 +#define Ame_offset 7 +#define Agi_offset 8 +#define Aka_offset 9 +#define Abo_offset 10 +#define Amo_offset 11 +#define Ami_offset 12 +#define Ake_offset 13 +#define Agu_offset 14 +#define Asi_offset 15 +#define Aku_offset 16 +#define Asa_offset 17 +#define Abu_offset 18 +#define Asu_offset 19 +#define Ase_offset 20 +//#define Aga_offset 21 +#define Age_offset 22 +#define vBgo_offset 23 +#define vBke_offset 24 +#define vBgi_offset 25 +#define vBga_offset 26 +#define vBbo_offset 27 +#define vBmo_offset 28 +#define vBmi_offset 29 +#define vBge_offset 30 + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +/* Emulation of the v8.4-A SHA-3 instructions (EOR3, RAX1, XAR, BCAX) using plain Neon instructions */ +
+.macro eor3_m1_0 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m1_1 d s0 s1 s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro eor3_m1 d s0 s1 s2 + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +.macro rax1_m1 d s0 s1 + // Use add instead of SHL #1 + add tmp.2d, \s1\().2d, \s1\().2d + sri tmp.2d, \s1\().2d, #63 + eor \d\().16b, tmp.16b, \s0\().16b +.endm + + .macro xar_m1 d s0 s1 imm + // Special cases where we can replace SHLs by ADDs + .if \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + .elseif \imm == 62 + eor \s0\().16b, \s0\().16b, \s1\().16b + add \d\().2d, \s0\().2d, \s0\().2d + add \d\().2d, \d\().2d, \d\().2d + sri \d\().2d, \s0\().2d, #(62) + // .elseif \imm == 61 + // eor \s0\().16b, \s0\().16b, \s1\().16b + // add \d\().2d, \s0\().2d, \s0\().2d + // add \d\().2d, \d\().2d, \d\().2d + // add \d\().2d, \d\().2d, \d\().2d + // sri \d\().2d, \s0\().2d, #(61) + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + + .macro xar_m1_0 d s0 s1 imm + // Special cases where we can replace SHLs by ADDs + .if \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + .elseif \imm == 62 + eor \s0\().16b, \s0\().16b, \s1\().16b + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + .endif +.endm + + .macro xar_m1_1 d s0 s1 imm + // Special cases where we can replace SHLs by ADDs + .if \imm == 63 + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + .elseif \imm == 62 + add \d\().2d, \s0\().2d, \s0\().2d + add \d\().2d, \d\().2d, \d\().2d + sri \d\().2d, \s0\().2d, #(62) + .else + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + +.macro bcax_m1 d s0 s1 s2 + bic tmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, tmp.16b, \s0\().16b +.endm + 
+/* Keccak-f1600 round */ + +.macro keccak_f1600_round_pre + + /* 10 EOR3, so 20 individual EOR */ + + eor3_m1_0 C1, Abe, Age, Ake + eor3_m1_0 C3, Abo, Ago, Ako + eor3_m1_0 C0, Aba, Aga, Aka + eor3_m1_0 C2, Abi, Agi, Aki + eor3_m1_0 C4, Abu, Agu, Aku + eor3_m1_1 C1, Abe, Age, Ake + eor3_m1_1 C3, Abo, Ago, Ako + eor3_m1_1 C0, Aba, Aga, Aka + eor3_m1_1 C2, Abi, Agi, Aki + eor3_m1_1 C4, Abu, Agu, Aku + eor3_m1_0 C1, C1, Ame, Ase + eor3_m1_0 C3, C3, Amo, Aso + eor3_m1_0 C0, C0, Ama, Asa + eor3_m1_0 C2, C2, Ami, Asi + eor3_m1_0 C4, C4, Amu, Asu + eor3_m1_1 C1, C1, Ame, Ase + eor3_m1_1 C3, C3, Amo, Aso + eor3_m1_1 C0, C0, Ama, Asa + eor3_m1_1 C2, C2, Ami, Asi + eor3_m1_1 C4, C4, Amu, Asu + +.endm + +.macro keccak_f1600_round + + /* 10 EOR3, so 20 individual EOR */ + + eor3_m1_0 C0, Aba, Aga, Aka + eor3_m1_0 C1, Abe, Age, Ake + eor3_m1_0 C2, Abi, Agi, Aki + eor3_m1_0 C3, Abo, Ago, Ako + eor3_m1_0 C4, Abu, Agu, Aku + eor3_m1_1 C0, Aba, Aga, Aka + eor3_m1_1 C1, Abe, Age, Ake + eor3_m1_1 C2, Abi, Agi, Aki + eor3_m1_1 C3, Abo, Ago, Ako + eor3_m1_1 C4, Abu, Agu, Aku + eor3_m1_0 C0, C0, Ama, Asa + eor3_m1_0 C1, C1, Ame, Ase + eor3_m1_0 C2, C2, Ami, Asi + eor3_m1_0 C3, C3, Amo, Aso + eor3_m1_0 C4, C4, Amu, Asu + eor3_m1_1 C0, C0, Ama, Asa + eor3_m1_1 C1, C1, Ame, Ase + eor3_m1_1 C2, C2, Ami, Asi + eor3_m1_1 C3, C3, Amo, Aso + eor3_m1_1 C4, C4, Amu, Asu + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + .unreq tmp + + /* 25x XAR, 75 in total */ + + tmp .req C1 + tmpq .req C1q + + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m1 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m1 
vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + xar_m1 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m1 vBme, Aga, E0, 28 + xar_m1 vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Aga, vBga, vBgi, vBge + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + bcax_m1 Aku, vBku, vBke, vBka + bcax_m1 Ama, vBma, vBmi, vBme + bcax_m1 Ame, vBme, vBmo, vBmi + bcax_m1 Ami, vBmi, vBmu, vBmo + bcax_m1 Amo, vBmo, vBma, vBmu + bcax_m1 Amu, vBmu, vBme, vBma + bcax_m1 Asa, vBsa, vBsi, vBse + bcax_m1 Ase, vBse, vBso, vBsi + bcax_m1 Asi, vBsi, vBsu, vBso + bcax_m1 Aso, vBso, vBsa, vBsu + bcax_m1 Asu, vBsu, vBse, vBsa + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + bcax_m1 Abu, vBbu, vBbe, vBba + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + + .unreq tmp + .unreq tmpq + +.endm + +.macro keccak_f1600_round_core + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + str Agaq, [sp, #(STACK_BASE_TMP + 16 * 30)] + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + + /* 25x XAR, 75 in total */ + + .unreq tmp + tmp .req C1 + tmpq .req C1q + + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m1 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, 
Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + bcax_m1 Aga, vBga, vBgi, vBge + xar_m1 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + ldr tmpq, [sp, #(STACK_BASE_TMP + 16*30)] + xar_m1 vBme, tmp, E0, 28 + xar_m1 vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + + .unreq tmp + .unreq tmpq + + eor2 C0, Aka, Aga + save(Aga) + + tmp .req Aga + tmpq .req Agaq + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + eor2 C1, Ake, Age + bcax_m1 Aku, vBku, vBke, vBka + eor2 C2, Aki, Agi + bcax_m1 Ama, vBma, vBmi, vBme + eor2 C3, Ako, Ago + bcax_m1 Ame, vBme, vBmo, vBmi + eor2 C4, Aku, Agu + bcax_m1 Ami, vBmi, vBmu, vBmo + eor2 C0, C0, Ama + bcax_m1 Amo, vBmo, vBma, vBmu + eor2 C1, C1, Ame + bcax_m1 Amu, vBmu, vBme, vBma + eor2 C2, C2, Ami + bcax_m1 Asa, vBsa, vBsi, vBse + eor2 C3, C3, Amo + bcax_m1 Ase, vBse, vBso, vBsi + eor2 C4, C4, Amu + bcax_m1 Asi, vBsi, vBsu, vBso + eor2 C0, C0, Asa + bcax_m1 Aso, vBso, vBsa, vBsu + eor2 C1, C1, Ase + bcax_m1 Asu, vBsu, vBse, vBsa + eor2 C2, C2, Asi + eor2 C3, C3, Aso + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + eor2 C1, C1, Abe + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + eor2 C4, C4, Asu + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + eor2 C3, C3, Abo + eor2 C2, C2, Abi + eor2 C0, C0, Aba + bcax_m1 Abu, vBbu, vBbe, vBba + eor2 C4, C4, Abu + + restore(Aga) + .unreq tmp + .unreq tmpq + +.endm + +.macro keccak_f1600_round_post + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + str Agaq, [sp, #(STACK_BASE_TMP + 16 * 30)] + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + 
rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + + /* 25x XAR, 75 in total */ + + .unreq tmp + tmp .req C1 + tmpq .req C1q + + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m1 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + bcax_m1 Aga, vBga, vBgi, vBge + xar_m1 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + ldr tmpq, [sp, #(STACK_BASE_TMP + 16*30)] + xar_m1 vBme, tmp, E0, 28 + xar_m1 vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + bcax_m1 Aku, vBku, vBke, vBka + bcax_m1 Ama, vBma, vBmi, vBme + bcax_m1 Ame, vBme, vBmo, vBmi + bcax_m1 Ami, vBmi, vBmu, vBmo + bcax_m1 Amo, vBmo, vBma, vBmu + bcax_m1 Amu, vBmu, vBme, vBma + bcax_m1 Asa, vBsa, vBsi, vBse + bcax_m1 Ase, vBse, vBso, vBsi + bcax_m1 Asi, vBsi, vBsu, vBso + bcax_m1 Aso, vBso, vBsa, vBsu + bcax_m1 Asu, vBsu, vBse, vBsa + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + bcax_m1 Abu, vBbu, vBbe, vBba + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + + .unreq tmp + +.endm + + +.text +.align 4 +.global keccak_f1600_x2_v84a_asm_v2pp2 +.global _keccak_f1600_x2_v84a_asm_v2pp2 + +#define KECCAK_F1600_ROUNDS 24 + +keccak_f1600_x2_v84a_asm_v2pp2: 
+_keccak_f1600_x2_v84a_asm_v2pp2: + alloc_stack + save_vregs + load_constant_ptr + load_input + + //mov count, #(KECCAK_F1600_ROUNDS-2) + mov count, #11 + keccak_f1600_round_pre +loop: + keccak_f1600_round_core + keccak_f1600_round_core + sub count, count, #1 + cbnz count, loop + + keccak_f1600_round_core + keccak_f1600_round_post + store_input + restore_vregs + free_stack + ret diff --git a/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2pp3.s b/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2pp3.s new file mode 100644 index 0000000..ff8359e --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2pp3.s @@ -0,0 +1,905 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#if defined(__ARM_FEATURE_SVE2) +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .align(8) +_round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x800000008000000a + .quad 
0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. */ + Aba .req v0 + Abe .req v1 + Abi .req v2 + Abo .req v3 + Abu .req v4 + Aga .req v5 + Age .req v6 + Agi .req v7 + Ago .req v8 + Agu .req v9 + Aka .req v10 + Ake .req v11 + Aki .req v12 + Ako .req v13 + Aku .req v14 + Ama .req v15 + Ame .req v16 + Ami .req v17 + Amo .req v18 + Amu .req v19 + Asa .req v20 + Ase .req v21 + Asi .req v22 + Aso .req v23 + Asu .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Abiq .req q2 + Aboq .req q3 + Abuq .req q4 + Agaq .req q5 + Ageq .req q6 + Agiq .req q7 + Agoq .req q8 + Aguq .req q9 + Akaq .req q10 + Akeq .req q11 + Akiq .req q12 + Akoq .req q13 + Akuq .req q14 + Amaq .req q15 + Ameq .req q16 + Amiq .req q17 + Amoq .req q18 + Amuq .req q19 + Asaq .req q20 + Aseq .req q21 + Asiq .req q22 + Asoq .req q23 + Asuq .req q24 + + /* z-form of the above mapping */ + Abaz .req z0 + Abez .req z1 + Abiz .req z2 + Aboz .req z3 + Abuz .req z4 + Agaz .req z5 + Agez .req z6 + Agiz .req z7 + Agoz .req z8 + Aguz .req z9 + Akaz .req z10 + Akez .req z11 + Akiz .req z12 + Akoz .req z13 + Akuz .req z14 + Amaz .req z15 + Amez .req z16 + Amiz .req z17 + Amoz .req z18 + Amuz .req z19 + Asaz .req z20 + Asez .req z21 + Asiz .req z22 + Asoz .req z23 + Asuz .req z24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v27 + C1 .req v28 + C2 .req v29 + C3 .req v30 + C4 .req v31 + + C0q .req q27 + C1q .req q28 + C2q .req q29 + C3q .req q30 + C4q .req q31 + + C0z .req z27 + C1z .req z28 + C2z .req z29 + C3z .req z30 + C4z .req z31 + + /*
A_[y,2*x+3*y] = rot(A[x,y]) */ + vBba .req v25 // fresh + vBbe .req v26 // fresh + vBbi .req Abi + vBbo .req Abo + vBbu .req Abu + vBga .req Aka + vBge .req Ake + vBgi .req Agi + vBgo .req Ago + vBgu .req Agu + vBka .req Ama + vBke .req Ame + vBki .req Aki + vBko .req Ako + vBku .req Aku + vBma .req Asa + vBme .req Ase + vBmi .req Ami + vBmo .req Amo + vBmu .req Amu + vBsa .req Aba + vBse .req Abe + vBsi .req Asi + vBso .req Aso + vBsu .req Asu + + vBbaq .req q25 // fresh + vBbeq .req q26 // fresh + vBbiq .req Abiq + vBboq .req Aboq + vBbuq .req Abuq + vBgaq .req Akaq + vBgeq .req Akeq + vBgiq .req Agiq + vBgoq .req Agoq + vBguq .req Aguq + vBkaq .req Amaq + vBkeq .req Ameq + vBkiq .req Akiq + vBkoq .req Akoq + vBkuq .req Akuq + vBmaq .req Asaq + vBmeq .req Aseq + vBmiq .req Amiq + vBmoq .req Amoq + vBmuq .req Amuq + vBsaq .req Abaq + vBseq .req Abeq + vBsiq .req Asiq + vBsoq .req Asoq + vBsuq .req Asuq + + vBbaz .req z25 // fresh + vBbez .req z26 // fresh + vBbiz .req Abiz + vBboz .req Aboz + vBbuz .req Abuz + vBgaz .req Akaz + vBgez .req Akez + vBgiz .req Agiz + vBgoz .req Agoz + vBguz .req Aguz + vBkaz .req Amaz + vBkez .req Amez + vBkiz .req Akiz + vBkoz .req Akoz + vBkuz .req Akuz + vBmaz .req Asaz + vBmez .req Asez + vBmiz .req Amiz + vBmoz .req Amoz + vBmuz .req Amuz + vBsaz .req Abaz + vBsez .req Abez + vBsiz .req Asiz + vBsoz .req Asoz + vBsuz .req Asuz + + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req C4 + E1 .req C0 + E2 .req vBbe // fresh + E3 .req C2 + E4 .req C3 + + E0q .req C4q + E1q .req C0q + E2q .req vBbeq // fresh + E3q .req C2q + E4q .req C3q + + E0z .req C4z + E1z .req C0z + E2z .req vBbez // fresh + E3z .req C2z + E4z .req C3z + + + +/************************ MACROS ****************************/ + +.macro load_input + ldp Abaq, Abeq, [input_addr, #(2*8*0)] + ldp Abiq, Aboq, [input_addr, #(2*8*2)] + ldp Abuq, Agaq, [input_addr, #(2*8*4)] + ldp Ageq, Agiq, [input_addr, #(2*8*6)] + ldp Agoq, Aguq, [input_addr, #(2*8*8)] + ldp 
Akaq, Akeq, [input_addr, #(2*8*10)] + ldp Akiq, Akoq, [input_addr, #(2*8*12)] + ldp Akuq, Amaq, [input_addr, #(2*8*14)] + ldp Ameq, Amiq, [input_addr, #(2*8*16)] + ldp Amoq, Amuq, [input_addr, #(2*8*18)] + ldp Asaq, Aseq, [input_addr, #(2*8*20)] + ldp Asiq, Asoq, [input_addr, #(2*8*22)] + ldr Asuq, [input_addr, #(2*8*24)] +.endm + +.macro store_input + str Abaq, [input_addr, #(2*8*0)] + str Abeq, [input_addr, #(2*8*1)] + str Abiq, [input_addr, #(2*8*2)] + str Aboq, [input_addr, #(2*8*3)] + str Abuq, [input_addr, #(2*8*4)] + str Agaq, [input_addr, #(2*8*5)] + str Ageq, [input_addr, #(2*8*6)] + str Agiq, [input_addr, #(2*8*7)] + str Agoq, [input_addr, #(2*8*8)] + str Aguq, [input_addr, #(2*8*9)] + str Akaq, [input_addr, #(2*8*10)] + str Akeq, [input_addr, #(2*8*11)] + str Akiq, [input_addr, #(2*8*12)] + str Akoq, [input_addr, #(2*8*13)] + str Akuq, [input_addr, #(2*8*14)] + str Amaq, [input_addr, #(2*8*15)] + str Ameq, [input_addr, #(2*8*16)] + str Amiq, [input_addr, #(2*8*17)] + str Amoq, [input_addr, #(2*8*18)] + str Amuq, [input_addr, #(2*8*19)] + str Asaq, [input_addr, #(2*8*20)] + str Aseq, [input_addr, #(2*8*21)] + str Asiq, [input_addr, #(2*8*22)] + str Asoq, [input_addr, #(2*8*23)] + str Asuq, [input_addr, #(2*8*24)] +.endm + +#define STACK_SIZE (16*4 + 16*34) +#define STACK_BASE_VREGS 0 +#define STACK_BASE_TMP 16*4 + +#define Aga_offset 0 +#define E0_offset 1 +#define E1_offset 2 +#define E2_offset 3 +#define E3_offset 4 +#define E4_offset 5 +#define Ame_offset 7 +#define Agi_offset 8 +#define Aka_offset 9 +#define Abo_offset 10 +#define Amo_offset 11 +#define Ami_offset 12 +#define Ake_offset 13 +#define Agu_offset 14 +#define Asi_offset 15 +#define Aku_offset 16 +#define Asa_offset 17 +#define Abu_offset 18 +#define Asu_offset 19 +#define Ase_offset 20 +//#define Aga_offset 21 +#define Age_offset 22 +#define vBgo_offset 23 +#define vBke_offset 24 +#define vBgi_offset 25 +#define vBga_offset 26 +#define vBbo_offset 27 +#define vBmo_offset 28 +#define 
vBmi_offset 29 +#define vBge_offset 30 + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +/* Macros using v8.4-A SHA-3 instructions */ + +.macro eor3_m1_0 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m1_1 d s0 s1 s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro eor3_m1 d s0 s1 s2 + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +.macro rax1_m1 d s0 s1 + // Use add instead of SHL #1 + add tmp.2d, \s1\().2d, \s1\().2d + sri tmp.2d, \s1\().2d, #63 + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +xar_m1_const: + .quad (1ULL<<(64-61)) + .quad (1ULL<<(64-56)) + .quad (1ULL<<(64-50)) + .quad (1ULL<<(64-46)) + .quad (1ULL<<(64-44)) + .quad (1ULL<<(64-43)) + .quad (1ULL<<(64-39)) + .quad (1ULL<<(64-36)) + .quad (1ULL<<(64-21)) + .quad (1ULL<<(64-19)) + .quad (1ULL<<(64-9)) + .quad (1ULL<<(64-3)) + + +xar_m1_const_addr: .quad xar_m1_const + + .macro xar_m1 d s0 s1 imm + // Special cases where we can replace SHLs by ADDs + .if \imm == 21 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #64] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[0] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 39 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #48] + mul 
\d\()z\().d, \s0\()z\().d, \d\()z\().d[0] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 56 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[1] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + .elseif \imm == 9 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #80] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[0] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 19 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #64] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[1] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 61 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[0] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 36 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #48] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[1] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 43 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #32] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[1] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 44 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #32] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[0] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 3 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #80] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[1] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 46 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #16] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[1] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 50 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #16] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[0] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 62 + eor \s0\().16b, \s0\().16b, \s1\().16b + add \d\().2d, \s0\().2d, \s0\().2d + 
add \d\().2d, \d\().2d, \d\().2d + sri \d\().2d, \s0\().2d, #(62) + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + +.macro bcax_m1 d s0 s1 s2 + bic tmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +/* Keccak-f1600 round */ + +.macro keccak_f1600_round_pre + + /* 10 EOR3, so 20 individual EOR */ + + eor3_m1_0 C1, Abe, Age, Ake + eor3_m1_0 C3, Abo, Ago, Ako + eor3_m1_0 C0, Aba, Aga, Aka + eor3_m1_0 C2, Abi, Agi, Aki + eor3_m1_0 C4, Abu, Agu, Aku + eor3_m1_1 C1, Abe, Age, Ake + eor3_m1_1 C3, Abo, Ago, Ako + eor3_m1_1 C0, Aba, Aga, Aka + eor3_m1_1 C2, Abi, Agi, Aki + eor3_m1_1 C4, Abu, Agu, Aku + eor3_m1_0 C1, C1, Ame, Ase + eor3_m1_0 C3, C3, Amo, Aso + eor3_m1_0 C0, C0, Ama, Asa + eor3_m1_0 C2, C2, Ami, Asi + eor3_m1_0 C4, C4, Amu, Asu + eor3_m1_1 C1, C1, Ame, Ase + eor3_m1_1 C3, C3, Amo, Aso + eor3_m1_1 C0, C0, Ama, Asa + eor3_m1_1 C2, C2, Ami, Asi + eor3_m1_1 C4, C4, Amu, Asu + +.endm + +.macro keccak_f1600_round + + /* 10 EOR3, so 20 individual EOR */ + + eor3_m1_0 C0, Aba, Aga, Aka + eor3_m1_0 C1, Abe, Age, Ake + eor3_m1_0 C2, Abi, Agi, Aki + eor3_m1_0 C3, Abo, Ago, Ako + eor3_m1_0 C4, Abu, Agu, Aku + eor3_m1_1 C0, Aba, Aga, Aka + eor3_m1_1 C1, Abe, Age, Ake + eor3_m1_1 C2, Abi, Agi, Aki + eor3_m1_1 C3, Abo, Ago, Ako + eor3_m1_1 C4, Abu, Agu, Aku + eor3_m1_0 C0, C0, Ama, Asa + eor3_m1_0 C1, C1, Ame, Ase + eor3_m1_0 C2, C2, Ami, Asi + eor3_m1_0 C3, C3, Amo, Aso + eor3_m1_0 C4, C4, Amu, Asu + eor3_m1_1 C0, C0, Ama, Asa + eor3_m1_1 C1, C1, Ame, Ase + eor3_m1_1 C2, C2, Ami, Asi + eor3_m1_1 C3, C3, Amo, Aso + eor3_m1_1 C4, C4, Amu, Asu + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + .unreq tmp + + /* 25x XAR, 75 in total */ + + tmp .req C1 + tmpq .req C1q + + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, 
Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m1 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + xar_m1 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m1 vBme, Aga, E0, 28 + xar_m1 vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Aga, vBga, vBgi, vBge + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + bcax_m1 Aku, vBku, vBke, vBka + bcax_m1 Ama, vBma, vBmi, vBme + bcax_m1 Ame, vBme, vBmo, vBmi + bcax_m1 Ami, vBmi, vBmu, vBmo + bcax_m1 Amo, vBmo, vBma, vBmu + bcax_m1 Amu, vBmu, vBme, vBma + bcax_m1 Asa, vBsa, vBsi, vBse + bcax_m1 Ase, vBse, vBso, vBsi + bcax_m1 Asi, vBsi, vBsu, vBso + bcax_m1 Aso, vBso, vBsa, vBsu + bcax_m1 Asu, vBsu, vBse, vBsa + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + bcax_m1 Abu, vBbu, vBbe, vBba + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + + .unreq tmp + .unreq tmpq + +.endm + +.macro keccak_f1600_round_core + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + + /* 25x XAR, 75 in total */ + + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 
vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m1 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + xar_m1 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m1 vBme, Aga, E0, 28 + xar_m1 vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + .unreq tmp + tmp .req C1 + bcax_m1 Aga, vBga, vBgi, vBge + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + .unreq tmp + + eor2 C0, Aka, Aga + save(Aga) + + tmp .req Aga + tmpq .req Agaq + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + eor2 C1, Ake, Age + bcax_m1 Aku, vBku, vBke, vBka + eor2 C2, Aki, Agi + bcax_m1 Ama, vBma, vBmi, vBme + eor2 C3, Ako, Ago + bcax_m1 Ame, vBme, vBmo, vBmi + eor2 C4, Aku, Agu + bcax_m1 Ami, vBmi, vBmu, vBmo + eor2 C0, C0, Ama + bcax_m1 Amo, vBmo, vBma, vBmu + eor2 C1, C1, Ame + bcax_m1 Amu, vBmu, vBme, vBma + eor2 C2, C2, Ami + bcax_m1 Asa, vBsa, vBsi, vBse + eor2 C3, C3, Amo + bcax_m1 Ase, vBse, vBso, vBsi + eor2 C4, C4, Amu + bcax_m1 Asi, vBsi, vBsu, vBso + eor2 C0, C0, Asa + bcax_m1 Aso, vBso, vBsa, vBsu + eor2 C1, C1, Ase + bcax_m1 Asu, vBsu, vBse, vBsa + eor2 C2, C2, Asi + eor2 C3, C3, Aso + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + eor2 C1, C1, Abe + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + eor2 C4, C4, Asu + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + eor2 C3, C3, Abo + eor2 C2, C2, Abi + eor2 C0, C0, Aba + bcax_m1 Abu, vBbu, vBbe, vBba + eor2 C4, C4, Abu + + restore(Aga) + .unreq tmp + .unreq tmpq + +.endm + +.macro 
keccak_f1600_round_post + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + .unreq tmp + + /* 25x XAR, 75 in total */ + + tmp .req C1 + tmpq .req C1q + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m1 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + xar_m1 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m1 vBme, Aga, E0, 28 + xar_m1 vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Aga, vBga, vBgi, vBge + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + bcax_m1 Aku, vBku, vBke, vBka + bcax_m1 Ama, vBma, vBmi, vBme + bcax_m1 Ame, vBme, vBmo, vBmi + bcax_m1 Ami, vBmi, vBmu, vBmo + bcax_m1 Amo, vBmo, vBma, vBmu + bcax_m1 Amu, vBmu, vBme, vBma + bcax_m1 Asa, vBsa, vBsi, vBse + bcax_m1 Ase, vBse, vBso, vBsi + bcax_m1 Asi, vBsi, vBsu, vBso + bcax_m1 Aso, vBso, vBsa, vBsu + bcax_m1 Asu, vBsu, vBse, vBsa + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + bcax_m1 Abu, vBbu, vBbe, vBba + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + + .unreq tmp + +.endm + + +.text +.align 4 +.global keccak_f1600_x2_v84a_asm_v2pp3 +.global 
_keccak_f1600_x2_v84a_asm_v2pp3 + +#define KECCAK_F1600_ROUNDS 24 + +keccak_f1600_x2_v84a_asm_v2pp3: +_keccak_f1600_x2_v84a_asm_v2pp3: + alloc_stack + save_vregs + load_constant_ptr + load_input + + ldr x17, xar_m1_const_addr + + //mov count, #(KECCAK_F1600_ROUNDS-2) + mov count, #11 + keccak_f1600_round_pre +loop: + keccak_f1600_round_core + keccak_f1600_round_core + sub count, count, #1 + cbnz count, loop + + keccak_f1600_round_core + keccak_f1600_round_post + store_input + restore_vregs + free_stack + ret +#endif diff --git a/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2pp4.s b/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2pp4.s new file mode 100644 index 0000000..60a859e --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2pp4.s @@ -0,0 +1,797 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .align(8) +_round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 
0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. */ + Aba .req v0 + Abe .req v1 + Abi .req v2 + Abo .req v3 + Abu .req v4 + Aga .req v5 + Age .req v6 + Agi .req v7 + Ago .req v8 + Agu .req v9 + Aka .req v10 + Ake .req v11 + Aki .req v12 + Ako .req v13 + Aku .req v14 + Ama .req v15 + Ame .req v16 + Ami .req v17 + Amo .req v18 + Amu .req v19 + Asa .req v20 + Ase .req v21 + Asi .req v22 + Aso .req v23 + Asu .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Abiq .req q2 + Aboq .req q3 + Abuq .req q4 + Agaq .req q5 + Ageq .req q6 + Agiq .req q7 + Agoq .req q8 + Aguq .req q9 + Akaq .req q10 + Akeq .req q11 + Akiq .req q12 + Akoq .req q13 + Akuq .req q14 + Amaq .req q15 + Ameq .req q16 + Amiq .req q17 + Amoq .req q18 + Amuq .req q19 + Asaq .req q20 + Aseq .req q21 + Asiq .req q22 + Asoq .req q23 + Asuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v27 + C1 .req v28 + C2 .req v29 + C3 .req v30 + C4 .req v31 + + C0q .req q27 + C1q .req q28 + C2q .req q29 + C3q .req q30 + C4q .req q31 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vBba .req v25 // fresh + vBbe .req v26 // fresh + vBbi .req Abi + vBbo .req Abo + vBbu .req Abu + vBga .req Aka + vBge .req Ake + vBgi .req Agi + vBgo .req Ago + vBgu .req Agu + vBka .req Ama + vBke .req Ame + vBki .req Aki + vBko .req
Ako + vBku .req Aku + vBma .req Asa + vBme .req Ase + vBmi .req Ami + vBmo .req Amo + vBmu .req Amu + vBsa .req Aba + vBse .req Abe + vBsi .req Asi + vBso .req Aso + vBsu .req Asu + + vBbaq .req q25 // fresh + vBbeq .req q26 // fresh + vBbiq .req Abiq + vBboq .req Aboq + vBbuq .req Abuq + vBgaq .req Akaq + vBgeq .req Akeq + vBgiq .req Agiq + vBgoq .req Agoq + vBguq .req Aguq + vBkaq .req Amaq + vBkeq .req Ameq + vBkiq .req Akiq + vBkoq .req Akoq + vBkuq .req Akuq + vBmaq .req Asaq + vBmeq .req Aseq + vBmiq .req Amiq + vBmoq .req Amoq + vBmuq .req Amuq + vBsaq .req Abaq + vBseq .req Abeq + vBsiq .req Asiq + vBsoq .req Asoq + vBsuq .req Asuq + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req C4 + E1 .req C0 + E2 .req vBbe // fresh + E3 .req C2 + E4 .req C3 + + E0q .req C4q + E1q .req C0q + E2q .req vBbeq // fresh + E3q .req C2q + E4q .req C3q + + +/************************ MACROS ****************************/ + +.macro load_input + ldp Abaq, Abeq, [input_addr, #(2*8*0)] + ldp Abiq, Aboq, [input_addr, #(2*8*2)] + ldp Abuq, Agaq, [input_addr, #(2*8*4)] + ldp Ageq, Agiq, [input_addr, #(2*8*6)] + ldp Agoq, Aguq, [input_addr, #(2*8*8)] + ldp Akaq, Akeq, [input_addr, #(2*8*10)] + ldp Akiq, Akoq, [input_addr, #(2*8*12)] + ldp Akuq, Amaq, [input_addr, #(2*8*14)] + ldp Ameq, Amiq, [input_addr, #(2*8*16)] + ldp Amoq, Amuq, [input_addr, #(2*8*18)] + ldp Asaq, Aseq, [input_addr, #(2*8*20)] + ldp Asiq, Asoq, [input_addr, #(2*8*22)] + ldr Asuq, [input_addr, #(2*8*24)] +.endm + +.macro store_input + str Abaq, [input_addr, #(2*8*0)] + str Abeq, [input_addr, #(2*8*1)] + str Abiq, [input_addr, #(2*8*2)] + str Aboq, [input_addr, #(2*8*3)] + str Abuq, [input_addr, #(2*8*4)] + str Agaq, [input_addr, #(2*8*5)] + str Ageq, [input_addr, #(2*8*6)] + str Agiq, [input_addr, #(2*8*7)] + str Agoq, [input_addr, #(2*8*8)] + str Aguq, [input_addr, #(2*8*9)] + str Akaq, [input_addr, #(2*8*10)] + str Akeq, [input_addr, #(2*8*11)] + str Akiq, [input_addr, #(2*8*12)] + str Akoq, 
[input_addr, #(2*8*13)] + str Akuq, [input_addr, #(2*8*14)] + str Amaq, [input_addr, #(2*8*15)] + str Ameq, [input_addr, #(2*8*16)] + str Amiq, [input_addr, #(2*8*17)] + str Amoq, [input_addr, #(2*8*18)] + str Amuq, [input_addr, #(2*8*19)] + str Asaq, [input_addr, #(2*8*20)] + str Aseq, [input_addr, #(2*8*21)] + str Asiq, [input_addr, #(2*8*22)] + str Asoq, [input_addr, #(2*8*23)] + str Asuq, [input_addr, #(2*8*24)] +.endm + +#define STACK_SIZE (16*4 + 16*34) +#define STACK_BASE_VREGS 0 +#define STACK_BASE_TMP 16*4 + +#define Aga_offset 0 +#define E0_offset 1 +#define E1_offset 2 +#define E2_offset 3 +#define E3_offset 4 +#define E4_offset 5 +#define Ame_offset 7 +#define Agi_offset 8 +#define Aka_offset 9 +#define Abo_offset 10 +#define Amo_offset 11 +#define Ami_offset 12 +#define Ake_offset 13 +#define Agu_offset 14 +#define Asi_offset 15 +#define Aku_offset 16 +#define Asa_offset 17 +#define Abu_offset 18 +#define Asu_offset 19 +#define Ase_offset 20 +//#define Aga_offset 21 +#define Age_offset 22 +#define vBgo_offset 23 +#define vBke_offset 24 +#define vBgi_offset 25 +#define vBga_offset 26 +#define vBbo_offset 27 +#define vBmo_offset 28 +#define vBmi_offset 29 +#define vBge_offset 30 + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +/* Macros using v8.4-A SHA-3 instructions */ + 
+/*
+ * Plain-Neon stand-ins for the Armv8.4-A SHA-3 operations EOR3, RAX1, XAR
+ * and BCAX. Every macro works lane-wise on the full 128-bit register.
+ *
+ * rax1_m1 and bcax_m1 use the register alias `tmp` as scratch: the code
+ * invoking them must have bound `tmp` with .req, and `tmp` must not alias
+ * an operand that is still needed.
+ *
+ * The *_0 / *_1 variants are the two halves of the corresponding operation
+ * so that callers can schedule them separately. They keep the full operand
+ * list of the combined macro even where an operand is unused.
+ */
+
+/* d = s0 ^ s1 */
+.macro eor2 d s0 s1
+    eor \d\().16b, \s0\().16b, \s1\().16b
+.endm
+
+/* First half of EOR3: d = s0 ^ s1 (s2 is unused here). */
+.macro eor3_m1_0 d s0 s1 s2
+    eor \d\().16b, \s0\().16b, \s1\().16b
+.endm
+
+/* Second half of EOR3: d ^= s2 (s0 and s1 are unused here). */
+.macro eor3_m1_1 d s0 s1 s2
+    eor \d\().16b, \d\().16b, \s2\().16b
+.endm
+
+/* d = s0 ^ s1 ^ s2 */
+.macro eor3_m1 d s0 s1 s2
+    eor3_m1_0 \d, \s0, \s1, \s2
+    eor3_m1_1 \d, \s0, \s1, \s2
+.endm
+
+/*
+ * d = s0 ^ rol64(s1, 1) per 64-bit lane. Clobbers tmp.
+ * d may be the same register as s0; s1 is left unchanged.
+ */
+.macro rax1_m1 d s0 s1
+    // Use add instead of SHL #1
+    add tmp.2d, \s1\().2d, \s1\().2d
+    // Insert the former top bit as the new bottom bit
+    sri tmp.2d, \s1\().2d, #63
+    eor \d\().16b, tmp.16b, \s0\().16b
+.endm
+
+/*
+ * First half of XAR: s0 ^= s1. The rotation amount plays no role in this
+ * step, so there is nothing to special-case (d and imm are unused here).
+ */
+.macro xar_m1_0 d s0 s1 imm
+    eor \s0\().16b, \s0\().16b, \s1\().16b
+.endm
+
+/*
+ * Second half of XAR: d = ror64(s0, imm) per 64-bit lane (s1 is unused).
+ * d must be a different register from s0, because d is written before s0
+ * is read for the last time.
+ */
+.macro xar_m1_1 d s0 s1 imm
+    // Special cases where we can replace SHLs by ADDs
+    .if \imm == 63
+    // Left shift by 1 as a single doubling
+    add \d\().2d, \s0\().2d, \s0\().2d
+    sri \d\().2d, \s0\().2d, #(63)
+    .elseif \imm == 62
+    // Left shift by 2 as two doublings
+    add \d\().2d, \s0\().2d, \s0\().2d
+    add \d\().2d, \d\().2d, \d\().2d
+    sri \d\().2d, \s0\().2d, #(62)
+    .else
+    shl \d\().2d, \s0\().2d, #(64-\imm)
+    sri \d\().2d, \s0\().2d, #(\imm)
+    .endif
+.endm
+
+/*
+ * d = ror64(s0 ^ s1, imm) per 64-bit lane.
+ * Side effect: s0 is overwritten with s0 ^ s1. d must differ from s0.
+ */
+.macro xar_m1 d s0 s1 imm
+    xar_m1_0 \d, \s0, \s1, \imm
+    xar_m1_1 \d, \s0, \s1, \imm
+.endm
+
+/*
+ * d = s0 ^ (s1 & ~s2). Clobbers tmp.
+ * d may be the same register as any of the sources.
+ */
+.macro bcax_m1 d s0 s1 s2
+    bic tmp.16b, \s1\().16b, \s2\().16b
+    eor \d\().16b, tmp.16b, \s0\().16b
+.endm
+
+/* Keccak-f1600 round */
+
+/*
+ * Prologue for the pipelined rounds: computes the five column parities
+ * C0..C4, each the XOR of the five state registers sharing the same last
+ * letter (a, e, i, o, u). The entry point issues it once; every later round
+ * receives C0..C4 from keccak_f1600_round_core.
+ */
+.macro keccak_f1600_round_pre
+
+    /* 10 EOR3, so 20 individual EOR */
+
+    eor3_m1_0 C1, Abe, Age, Ake
+    eor3_m1_0 C3, Abo, Ago, Ako
+    eor3_m1_0 C0, Aba, Aga, Aka
+    eor3_m1_0 C2, Abi, Agi, Aki
+    eor3_m1_0 C4, Abu, Agu, Aku
+    eor3_m1_1 C1, Abe, Age, Ake
+    eor3_m1_1 C3, Abo, Ago, Ako
+    eor3_m1_1 C0, Aba, Aga, Aka
+    eor3_m1_1 C2, Abi, Agi, Aki
+    eor3_m1_1 C4, Abu, Agu, Aku
+    eor3_m1_0 C1, C1, Ame, Ase
+    eor3_m1_0 C3, C3, Amo, Aso
+    eor3_m1_0 C0, C0, Ama, Asa
+    eor3_m1_0 C2, C2, Ami, Asi
+    eor3_m1_0 C4, C4, Amu, Asu
+    eor3_m1_1 C1, C1, Ame, Ase
+    eor3_m1_1 C3, C3, Amo, Aso
+    eor3_m1_1 C0, C0, Ama, Asa
+    eor3_m1_1 C2, C2, Ami, Asi
+    eor3_m1_1 C4, C4, Amu, Asu
+
+.endm
+
+/*
+ * One complete, self-contained round: column parities, the E values, the
+ * 24 XOR-and-rotate steps (plus a plain EOR for Aba), the 25 BCAX steps and
+ * the round constant. The entry point below does not invoke this macro; it
+ * uses the pre/core/post split instead.
+ */
+.macro keccak_f1600_round
+
+    /* 10 EOR3, so 20 individual EOR */
+
+    eor3_m1_0 C0, Aba, Aga, Aka
+    eor3_m1_0 C1, Abe, Age, Ake
+    eor3_m1_0 C2, Abi, Agi, Aki
+    eor3_m1_0 C3, Abo, Ago, Ako
+    eor3_m1_0 C4, Abu, Agu, Aku
+    eor3_m1_1 C0, Aba, Aga, Aka
+    eor3_m1_1 C1, Abe, Age, Ake
+    eor3_m1_1 C2, Abi, Agi, Aki
+    eor3_m1_1 C3, Abo, Ago, Ako
+    eor3_m1_1 C4, Abu, Agu, Aku
+    eor3_m1_0 C0, C0, Ama, Asa
+    eor3_m1_0 C1, C1, Ame, Ase
+    eor3_m1_0 C2, C2, Ami, Asi
+    eor3_m1_0 C3, C3, Amo, Aso
+    eor3_m1_0 C4, C4, Amu, Asu
+    eor3_m1_1 C0, C0, Ama, Asa
+    eor3_m1_1 C1, C1, Ame, Ase
+    eor3_m1_1 C2, C2, Ami, Asi
+    eor3_m1_1 C3, C3, Amo, Aso
+    eor3_m1_1 C4, C4, Amu, Asu
+
+    /* 5x RAX1, 15 Neon Instructions total */
+
+    // E0, E1, E3 and E4 alias C4, C0, C2 and C3, so each of those C values
+    // is overwritten in place; the order below reads every C before it is
+    // replaced. The scratch register for rax1_m1 is vBba.
+    tmp .req vBba
+    rax1_m1 E2, C1, C3
+    rax1_m1 E4, C3, C0
+    rax1_m1 E1, C0, C2
+    rax1_m1 E3, C2, C4
+    rax1_m1 E0, C4, C1
+    .unreq tmp
+
+    /* 25x XAR, 75 in total */
+
+    // C1 has been read for the last time above, so it becomes the scratch
+    // register for bcax_m1 and for the round constant load.
+    tmp .req C1
+    tmpq .req C1q
+
+    // Most vB* names alias state registers; each xar_m1 writes a register
+    // whose previous content was consumed by an earlier line.
+    eor vBba.16b, Aba.16b, E0.16b
+    xar_m1 vBsa, Abi, E2, 2
+    xar_m1 vBbi, Aki, E2, 21
+    xar_m1 vBki, Ako, E3, 39
+    xar_m1 vBko, Amu, E4, 56
+    xar_m1 vBmu, Aso, E3, 8
+    xar_m1 vBso, Ama, E0, 23
+    xar_m1 vBka, Abe, E1, 63
+    xar_m1 vBse, Ago, E3, 9
+    xar_m1 vBgo, Ame, E1, 19
+    xar_m1 vBke, Agi, E2, 58
+    xar_m1 vBgi, Aka, E0, 61
+    xar_m1 vBga, Abo, E3, 36
+    xar_m1 vBbo, Amo, E3, 43
+    xar_m1 vBmo, Ami, E2, 49
+    xar_m1 vBmi, Ake, E1, 54
+    xar_m1 vBge, Agu, E4, 44
+    xar_m1 vBgu, Asi, E2, 3
+    xar_m1 vBsi, Aku, E4, 25
+    xar_m1 vBku, Asa, E0, 46
+    xar_m1 vBma, Abu, E4, 37
+    xar_m1 vBbu, Asu, E4, 50
+    xar_m1 vBsu, Ase, E1, 62
+    xar_m1 vBme, Aga, E0, 28
+    xar_m1 vBbe, Age, E1, 20
+
+    /* 25x BCAX, 50 in total */
+
+    bcax_m1 Aga, vBga, vBgi, vBge
+    bcax_m1 Age, vBge, vBgo, vBgi
+    bcax_m1 Agi, vBgi, vBgu, vBgo
+    bcax_m1 Ago, vBgo, vBga, vBgu
+    bcax_m1 Agu, vBgu, vBge, vBga
+    bcax_m1 Aka, vBka, vBki, vBke
+    bcax_m1 Ake, vBke, vBko, vBki
+    bcax_m1 Aki, vBki, vBku, vBko
+    bcax_m1 Ako, vBko, vBka, vBku
+    bcax_m1 Aku, vBku, vBke, vBka
+    bcax_m1 Ama, vBma, vBmi, vBme
+    bcax_m1 Ame, vBme, vBmo, vBmi
+    bcax_m1 Ami, vBmi, vBmu, vBmo
+    bcax_m1 Amo, vBmo, vBma, vBmu
+    bcax_m1 Amu, vBmu, vBme, vBma
+    bcax_m1 Asa, vBsa, vBsi, vBse
+    bcax_m1 Ase, vBse, vBso, vBsi
+    bcax_m1 Asi, vBsi, vBsu, vBso
+    bcax_m1 Aso, vBso, vBsa, vBsu
+    bcax_m1 Asu, vBsu, vBse, vBsa
+    bcax_m1 Aba, vBba, vBbi, vBbe
+    bcax_m1 Abe, vBbe, vBbo, vBbi
+    bcax_m1 Abi, vBbi, vBbu, vBbo
+    bcax_m1 Abo, vBbo, vBba, vBbu
+    bcax_m1 Abu, vBbu, vBbe, vBba
+
+    // iota step
+    // Loads 16 bytes of round constant and advances const_addr by 16.
+    //ld1r {tmp.2d}, [const_addr], #8
+    ldr tmpq, [const_addr], #16
+    eor Aba.16b, Aba.16b, tmp.16b
+
+    .unreq tmp
+    .unreq tmpq
+
+.endm
+
+/*
+ * One round in pipelined form. On entry C0..C4 must hold the column
+ * parities of the current state; on exit they hold the column parities of
+ * the updated state, accumulated with eor2 between the BCAX steps as soon
+ * as each new state register is available.
+ *
+ * Uses the stack slot of save(Aga)/restore(Aga), so the stack frame must be
+ * allocated. Advances const_addr by 16.
+ */
+.macro keccak_f1600_round_core
+
+    /* 5x RAX1, 15 Neon Instructions total */
+
+    tmp .req vBba
+    rax1_m1 E2, C1, C3
+    rax1_m1 E4, C3, C0
+    rax1_m1 E1, C0, C2
+    rax1_m1 E3, C2, C4
+    rax1_m1 E0, C4, C1
+
+    /* 25x XAR, 75 in total */
+
+    .unreq tmp
+    tmp .req C1
+    tmpq .req C1q
+
+    eor vBba.16b, Aba.16b, E0.16b
+    xar_m1 vBsa, Abi, E2, 2
+    xar_m1 vBbi, Aki, E2, 21
+    xar_m1 vBki, Ako, E3, 39
+    xar_m1 vBko, Amu, E4, 56
+    xar_m1 vBmu, Aso, E3, 8
+    xar_m1 vBso, Ama, E0, 23
+    xar_m1 vBka, Abe, E1, 63
+    xar_m1 vBse, Ago, E3, 9
+    xar_m1 vBgo, Ame, E1, 19
+    xar_m1 vBke, Agi, E2, 58
+    xar_m1 vBgi, Aka, E0, 61
+    xar_m1 vBga, Abo, E3, 36
+    xar_m1 vBbo, Amo, E3, 43
+    xar_m1 vBmo, Ami, E2, 49
+    xar_m1 vBmi, Ake, E1, 54
+    xar_m1 vBge, Agu, E4, 44
+    // The line above is the last one in this macro that reads E3 as an E
+    // value, so its register now keeps a copy of the old Aga. This lets the
+    // new Aga be produced right away while the old one is still available
+    // for vBme below.
+    mov E3.16b, Aga.16b
+    bcax_m1 Aga, vBga, vBgi, vBge
+    xar_m1 vBgu, Asi, E2, 3
+    xar_m1 vBsi, Aku, E4, 25
+    xar_m1 vBku, Asa, E0, 46
+    xar_m1 vBma, Abu, E4, 37
+    xar_m1 vBbu, Asu, E4, 50
+    xar_m1 vBsu, Ase, E1, 62
+    xar_m1 vBme, E3, E0, 28
+    xar_m1 vBbe, Age, E1, 20
+
+    /* 25x BCAX, 50 in total */
+
+    bcax_m1 Age, vBge, vBgo, vBgi
+    bcax_m1 Agi, vBgi, vBgu, vBgo
+    bcax_m1 Ago, vBgo, vBga, vBgu
+    bcax_m1 Agu, vBgu, vBge, vBga
+    bcax_m1 Aka, vBka, vBki, vBke
+    bcax_m1 Ake, vBke, vBko, vBki
+
+    .unreq tmp
+    .unreq tmpq
+
+    // From here on C0..C4 are rebuilt for the next round, so C1 can no
+    // longer be the scratch register. The new Aga is folded into C0 first,
+    // then spilled so that its register can take over as tmp.
+    eor2 C0, Aka, Aga
+    save(Aga)
+
+    tmp .req Aga
+    tmpq .req Agaq
+    bcax_m1 Aki, vBki, vBku, vBko
+    bcax_m1 Ako, vBko, vBka, vBku
+    eor2 C1, Ake, Age
+    bcax_m1 Aku, vBku, vBke, vBka
+    eor2 C2, Aki, Agi
+    bcax_m1 Ama, vBma, vBmi, vBme
+    eor2 C3, Ako, Ago
+    bcax_m1 Ame, vBme, vBmo, vBmi
+    eor2 C4, Aku, Agu
+    bcax_m1 Ami, vBmi, vBmu, vBmo
+    eor2 C0, C0, Ama
+    bcax_m1 Amo, vBmo, vBma, vBmu
+    eor2 C1, C1, Ame
+    bcax_m1 Amu, vBmu, vBme, vBma
+    eor2 C2, C2, Ami
+    bcax_m1 Asa, vBsa, vBsi, vBse
+    eor2 C3, C3, Amo
+    bcax_m1 Ase, vBse, vBso, vBsi
+    eor2 C4, C4, Amu
+    bcax_m1 Asi, vBsi, vBsu, vBso
+    eor2 C0, C0, Asa
+    bcax_m1 Aso, vBso, vBsa, vBsu
+    eor2 C1, C1, Ase
+    bcax_m1 Asu, vBsu, vBse, vBsa
+    eor2 C2, C2, Asi
+    eor2 C3, C3, Aso
+    bcax_m1 Aba, vBba, vBbi, vBbe
+    bcax_m1 Abe, vBbe, vBbo, vBbi
+    eor2 C1, C1, Abe
+
+    // iota step
+    // tmp is the Aga register here; Aba is folded into C0 only after the
+    // round constant has been applied to it.
+    //ld1r {tmp.2d}, [const_addr], #8
+    ldr tmpq, [const_addr], #16
+    eor Aba.16b, Aba.16b, tmp.16b
+    eor2 C4, C4, Asu
+    bcax_m1 Abi, vBbi, vBbu, vBbo
+    bcax_m1 Abo, vBbo, vBba, vBbu
+    eor2 C3, C3, Abo
+    eor2 C2, C2, Abi
+    eor2 C0, C0, Aba
+    bcax_m1 Abu, vBbu, vBbe, vBba
+    eor2 C4, C4, Abu
+
+    // Bring the new Aga back from its stack slot.
+    restore(Aga)
+    .unreq tmp
+    .unreq tmpq
+
+.endm
+
+/*
+ * Final round of the pipelined sequence. Same E / XAR / BCAX / round
+ * constant steps as keccak_f1600_round_core and the same requirement that
+ * C0..C4 hold the column parities on entry, but no parities are computed
+ * for a following round and nothing is spilled to the stack.
+ */
+.macro keccak_f1600_round_post
+
+    /* 5x RAX1, 15 Neon Instructions total */
+
+    tmp .req vBba
+    rax1_m1 E2, C1, C3
+    rax1_m1 E4, C3, C0
+    rax1_m1 E1, C0, C2
+    rax1_m1 E3, C2, C4
+    rax1_m1 E0, C4, C1
+
+    /* 25x XAR, 75 in total */
+
+    .unreq tmp
+    tmp .req C1
+    tmpq .req C1q
+
+    eor vBba.16b, Aba.16b, E0.16b
+    xar_m1 vBsa, Abi, E2, 2
+    xar_m1 vBbi, Aki, E2, 21
+    xar_m1 vBki, Ako, E3, 39
+    xar_m1 vBko, Amu, E4, 56
+    xar_m1 vBmu, Aso, E3, 8
+    xar_m1 vBso, Ama, E0, 23
+    xar_m1 vBka, Abe, E1, 63
+    xar_m1 vBse, Ago, E3, 9
+    xar_m1 vBgo, Ame, E1, 19
+    xar_m1 vBke, Agi, E2, 58
+    xar_m1 vBgi, Aka, E0, 61
+    xar_m1 vBga, Abo, E3, 36
+    xar_m1 vBbo, Amo, E3, 43
+    xar_m1 vBmo, Ami, E2, 49
+    xar_m1 vBmi, Ake, E1, 54
+    xar_m1 vBge, Agu, E4, 44
+    // Same early-Aga arrangement as in keccak_f1600_round_core: the E3
+    // register carries the old Aga until vBme has been formed.
+    mov E3.16b, Aga.16b
+    bcax_m1 Aga, vBga, vBgi, vBge
+    xar_m1 vBgu, Asi, E2, 3
+    xar_m1 vBsi, Aku, E4, 25
+    xar_m1 vBku, Asa, E0, 46
+    xar_m1 vBma, Abu, E4, 37
+    xar_m1 vBbu, Asu, E4, 50
+    xar_m1 vBsu, Ase, E1, 62
+    xar_m1 vBme, E3, E0, 28
+    xar_m1 vBbe, Age, E1, 20
+
+    /* 25x BCAX, 50 in total */
+
+    bcax_m1 Age, vBge, vBgo, vBgi
+    bcax_m1 Agi, vBgi, vBgu, vBgo
+    bcax_m1 Ago, vBgo, vBga, vBgu
+    bcax_m1 Agu, vBgu, vBge, vBga
+    bcax_m1 Aka, vBka, vBki, vBke
+    bcax_m1 Ake, vBke, vBko, vBki
+    bcax_m1 Aki, vBki, vBku, vBko
+    bcax_m1 Ako, vBko, vBka, vBku
+    bcax_m1 Aku, vBku, vBke, vBka
+    bcax_m1 Ama, vBma, vBmi, vBme
+    bcax_m1 Ame, vBme, vBmo, vBmi
+    bcax_m1 Ami, vBmi, vBmu, vBmo
+    bcax_m1 Amo, vBmo, vBma, vBmu
+    bcax_m1 Amu, vBmu, vBme, vBma
+    bcax_m1 Asa, vBsa, vBsi, vBse
+    bcax_m1 Ase, vBse, vBso, vBsi
+    bcax_m1 Asi, vBsi, vBsu, vBso
+    bcax_m1 Aso, vBso, vBsa, vBsu
+    bcax_m1 Asu, vBsu, vBse, vBsa
+    bcax_m1 Aba, vBba, vBbi, vBbe
+    bcax_m1 Abe, vBbe, vBbo, vBbi
+    bcax_m1 Abi, vBbi, vBbu, vBbo
+    bcax_m1 Abo, vBbo, vBba, vBbu
+    bcax_m1 Abu, vBbu, vBbe, vBba
+
+    // iota step
+    //ld1r {tmp.2d}, [const_addr], #8
+    ldr tmpq, [const_addr], #16
+    eor Aba.16b, Aba.16b, tmp.16b
+
+    .unreq tmp
+    .unreq tmpq
+
+.endm
+
+
+.text
+.align 4
+.global keccak_f1600_x2_v84a_asm_v2pp4
+.global _keccak_f1600_x2_v84a_asm_v2pp4
+
+#define KECCAK_F1600_ROUNDS 24
+
+/*
+ * Entry point (exported with and without a leading underscore).
+ *
+ * Applies KECCAK_F1600_ROUNDS rounds in place to the 25 x 16-byte state at
+ * input_addr (x0). Every vector instruction used is lane-wise, so the two
+ * 64-bit halves of each 16-byte element are transformed independently.
+ *
+ * Round schedule: keccak_f1600_round_pre, then 11 loop iterations of two
+ * keccak_f1600_round_core each (22 rounds), then one more core round and
+ * the closing keccak_f1600_round_post, i.e. 24 rounds in total. Each round
+ * consumes 16 bytes from const_addr (x1).
+ *
+ * Modifies x1 and x2 (const_addr, count). d8-d15 are saved by save_vregs and
+ * reloaded by restore_vregs.
+ * NOTE(review): load_constant_ptr is not defined in this part of the file
+ * (presumably it comes from macros.s and points const_addr at the round
+ * constant table) - confirm.
+ */
+keccak_f1600_x2_v84a_asm_v2pp4:
+_keccak_f1600_x2_v84a_asm_v2pp4:
+    alloc_stack
+    save_vregs
+    load_constant_ptr
+    load_input
+
+    // Two rounds per loop iteration, with the last two rounds peeled off
+    // below the loop: 11 = (KECCAK_F1600_ROUNDS - 2) / 2.
+    //mov count, #(KECCAK_F1600_ROUNDS-2)
+    mov count, #11
+    keccak_f1600_round_pre
+loop:
+    keccak_f1600_round_core
+    keccak_f1600_round_core
+    sub count, count, #1
+    cbnz count, loop
+
+    keccak_f1600_round_core
+    keccak_f1600_round_post
+    store_input
+    restore_vregs
+    free_stack
+    ret
diff --git a/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2pp5.s b/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2pp5.s
new file mode 100644
index 0000000..89571de
--- /dev/null
+++ b/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2pp5.s
@@ -0,0 +1,806 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited
+ * Copyright (c) 2022 Matthias Kannwischer
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .align(8) +_round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 
0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Kecck-f1600 state to vector registers + * at the beginning and end of each round. */ + Aba .req v0 + Abe .req v1 + Abi .req v2 + Abo .req v3 + Abu .req v4 + Aga .req v5 + Age .req v6 + Agi .req v7 + Ago .req v8 + Agu .req v9 + Aka .req v10 + Ake .req v11 + Aki .req v12 + Ako .req v13 + Aku .req v14 + Ama .req v15 + Ame .req v16 + Ami .req v17 + Amo .req v18 + Amu .req v19 + Asa .req v20 + Ase .req v21 + Asi .req v22 + Aso .req v23 + Asu .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Abiq .req q2 + Aboq .req q3 + Abuq .req q4 + Agaq .req q5 + Ageq .req q6 + Agiq .req q7 + Agoq .req q8 + Aguq .req q9 + Akaq .req q10 + Akeq .req q11 + Akiq .req q12 + Akoq .req q13 + Akuq .req q14 + Amaq .req q15 + Ameq .req q16 + Amiq .req q17 + Amoq .req q18 + Amuq .req q19 + Asaq .req q20 + Aseq .req q21 + Asiq .req q22 + Asoq .req q23 + Asuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v27 + C1 .req v28 + C2 .req v29 + C3 .req v30 + C4 .req v31 + + C0q .req q27 + C1q .req q28 + C2q .req q29 + C3q .req q30 + C4q .req q31 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vBba .req v25 // fresh + vBbe .req v26 // fresh + vBbi .req Abi + vBbo .req Abo + vBbu .req Abu + vBga .req Aka + vBge .req Ake + vBgi .req Agi + vBgo .req Ago + vBgu .req Agu + vBka .req Ama + vBke .req Ame + vBki .req Aki + vBko .req Ako + vBku .req Aku + vBma .req Asa + vBme .req Ase + vBmi .req Ami + vBmo .req Amo + vBmu .req Amu + vBsa .req Aba + vBse .req Abe + vBsi .req Asi + vBso .req Aso + vBsu .req Asu + + vBbaq .req q25 // fresh + vBbeq .req q26 // fresh + vBbiq .req Abiq + vBboq .req Aboq + 
vBbuq .req Abuq + vBgaq .req Akaq + vBgeq .req Akeq + vBgiq .req Agiq + vBgoq .req Agoq + vBguq .req Aguq + vBkaq .req Amaq + vBkeq .req Ameq + vBkiq .req Akiq + vBkoq .req Akoq + vBkuq .req Akuq + vBmaq .req Asaq + vBmeq .req Aseq + vBmiq .req Amiq + vBmoq .req Amoq + vBmuq .req Amuq + vBsaq .req Abaq + vBseq .req Abeq + vBsiq .req Asiq + vBsoq .req Asoq + vBsuq .req Asuq + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req C4 + E1 .req C0 + E2 .req vBbe // fresh + E3 .req C2 + E4 .req C3 + + E0q .req C4q + E1q .req C0q + E2q .req vBbeq // fresh + E3q .req C2q + E4q .req C3q + + +/************************ MACROS ****************************/ + +.macro load_input + ldp Abaq, Abeq, [input_addr, #(2*8*0)] + ldp Abiq, Aboq, [input_addr, #(2*8*2)] + ldp Abuq, Agaq, [input_addr, #(2*8*4)] + ldp Ageq, Agiq, [input_addr, #(2*8*6)] + ldp Agoq, Aguq, [input_addr, #(2*8*8)] + ldp Akaq, Akeq, [input_addr, #(2*8*10)] + ldp Akiq, Akoq, [input_addr, #(2*8*12)] + ldp Akuq, Amaq, [input_addr, #(2*8*14)] + ldp Ameq, Amiq, [input_addr, #(2*8*16)] + ldp Amoq, Amuq, [input_addr, #(2*8*18)] + ldp Asaq, Aseq, [input_addr, #(2*8*20)] + ldp Asiq, Asoq, [input_addr, #(2*8*22)] + ldr Asuq, [input_addr, #(2*8*24)] +.endm + +.macro store_input + str Abaq, [input_addr, #(2*8*0)] + str Abeq, [input_addr, #(2*8*1)] + str Abiq, [input_addr, #(2*8*2)] + str Aboq, [input_addr, #(2*8*3)] + str Abuq, [input_addr, #(2*8*4)] + str Agaq, [input_addr, #(2*8*5)] + str Ageq, [input_addr, #(2*8*6)] + str Agiq, [input_addr, #(2*8*7)] + str Agoq, [input_addr, #(2*8*8)] + str Aguq, [input_addr, #(2*8*9)] + str Akaq, [input_addr, #(2*8*10)] + str Akeq, [input_addr, #(2*8*11)] + str Akiq, [input_addr, #(2*8*12)] + str Akoq, [input_addr, #(2*8*13)] + str Akuq, [input_addr, #(2*8*14)] + str Amaq, [input_addr, #(2*8*15)] + str Ameq, [input_addr, #(2*8*16)] + str Amiq, [input_addr, #(2*8*17)] + str Amoq, [input_addr, #(2*8*18)] + str Amuq, [input_addr, #(2*8*19)] + str Asaq, [input_addr, #(2*8*20)] + 
str Aseq, [input_addr, #(2*8*21)] + str Asiq, [input_addr, #(2*8*22)] + str Asoq, [input_addr, #(2*8*23)] + str Asuq, [input_addr, #(2*8*24)] +.endm + +#define STACK_SIZE (16*4 + 16*34) +#define STACK_BASE_VREGS 0 +#define STACK_BASE_TMP 16*4 + +#define Aga_offset 0 +#define E0_offset 1 +#define E1_offset 2 +#define E2_offset 3 +#define E3_offset 4 +#define E4_offset 5 +#define Ame_offset 7 +#define Agi_offset 8 +#define Aka_offset 9 +#define Abo_offset 10 +#define Amo_offset 11 +#define Ami_offset 12 +#define Ake_offset 13 +#define Agu_offset 14 +#define Asi_offset 15 +#define Aku_offset 16 +#define Asa_offset 17 +#define Abu_offset 18 +#define Asu_offset 19 +#define Ase_offset 20 +//#define Aga_offset 21 +#define Age_offset 22 +#define vBgo_offset 23 +#define vBke_offset 24 +#define vBgi_offset 25 +#define vBga_offset 26 +#define vBbo_offset 27 +#define vBmo_offset 28 +#define vBmi_offset 29 +#define vBge_offset 30 + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +/* Macros using v8.4-A SHA-3 instructions */ + +.macro eor3_m1_0 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m1_1 d s0 s1 s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm + + +.macro eor3_m1 d s0 s1 s2 + eor3_m1_0 \d, \s0, \s1, 
\s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +.macro rax1_m1 d s0 s1 + // Use add instead of SHL #1 + add tmp.2d, \s1\().2d, \s1\().2d + sri tmp.2d, \s1\().2d, #63 + eor \d\().16b, tmp.16b, \s0\().16b +.endm + + .macro xar_m1 d s0 s1 imm + // Special cases where we can replace SHLs by ADDs + .if \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + .elseif \imm == 62 + eor \s0\().16b, \s0\().16b, \s1\().16b + add \d\().2d, \s0\().2d, \s0\().2d + add \d\().2d, \d\().2d, \d\().2d + sri \d\().2d, \s0\().2d, #(62) + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + + .macro xar_m1_0 d s0 s1 imm + // Special cases where we can replace SHLs by ADDs + .if \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + .elseif \imm == 62 + eor \s0\().16b, \s0\().16b, \s1\().16b + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + .endif +.endm + + .macro xar_m1_1 d s0 s1 imm + // Special cases where we can replace SHLs by ADDs + .if \imm == 63 + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + .elseif \imm == 62 + add \d\().2d, \s0\().2d, \s0\().2d + add \d\().2d, \d\().2d, \d\().2d + sri \d\().2d, \s0\().2d, #(62) + .else + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + +.macro bcax_m1 d s0 s1 s2 + bic tmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +/* Keccak-f1600 round */ + +.macro keccak_f1600_round_pre + + /* 10 EOR3, so 20 individual EOR */ + + eor3_m1_0 C1, Abe, Age, Ake + eor3_m1_0 C3, Abo, Ago, Ako + eor3_m1_0 C0, Aba, Aga, Aka + eor3_m1_0 C2, Abi, Agi, Aki + eor3_m1_0 C4, Abu, Agu, Aku + eor3_m1_1 C1, Abe, Age, Ake + eor3_m1_1 C3, Abo, Ago, Ako + eor3_m1_1 C0, Aba, Aga, Aka + eor3_m1_1 C2, Abi, Agi, Aki + eor3_m1_1 C4, Abu, Agu, Aku + eor3_m1_0 C1, C1, Ame, Ase + eor3_m1_0 C3, C3, Amo, Aso + eor3_m1_0 C0, C0, Ama, Asa + eor3_m1_0 
C2, C2, Ami, Asi + eor3_m1_0 C4, C4, Amu, Asu + eor3_m1_1 C1, C1, Ame, Ase + eor3_m1_1 C3, C3, Amo, Aso + eor3_m1_1 C0, C0, Ama, Asa + eor3_m1_1 C2, C2, Ami, Asi + eor3_m1_1 C4, C4, Amu, Asu + +.endm + +.macro keccak_f1600_round + + /* 10 EOR3, so 20 individual EOR */ + + eor3_m1_0 C0, Aba, Aga, Aka + eor3_m1_0 C1, Abe, Age, Ake + eor3_m1_0 C2, Abi, Agi, Aki + eor3_m1_0 C3, Abo, Ago, Ako + eor3_m1_0 C4, Abu, Agu, Aku + eor3_m1_1 C0, Aba, Aga, Aka + eor3_m1_1 C1, Abe, Age, Ake + eor3_m1_1 C2, Abi, Agi, Aki + eor3_m1_1 C3, Abo, Ago, Ako + eor3_m1_1 C4, Abu, Agu, Aku + eor3_m1_0 C0, C0, Ama, Asa + eor3_m1_0 C1, C1, Ame, Ase + eor3_m1_0 C2, C2, Ami, Asi + eor3_m1_0 C3, C3, Amo, Aso + eor3_m1_0 C4, C4, Amu, Asu + eor3_m1_1 C0, C0, Ama, Asa + eor3_m1_1 C1, C1, Ame, Ase + eor3_m1_1 C2, C2, Ami, Asi + eor3_m1_1 C3, C3, Amo, Aso + eor3_m1_1 C4, C4, Amu, Asu + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + .unreq tmp + + /* 25x XAR, 75 in total */ + + tmp .req C1 + tmpq .req C1q + + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m1 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + xar_m1 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m1 vBme, Aga, E0, 28 + xar_m1 vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Aga, vBga, vBgi, vBge + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, 
vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + bcax_m1 Aku, vBku, vBke, vBka + bcax_m1 Ama, vBma, vBmi, vBme + bcax_m1 Ame, vBme, vBmo, vBmi + bcax_m1 Ami, vBmi, vBmu, vBmo + bcax_m1 Amo, vBmo, vBma, vBmu + bcax_m1 Amu, vBmu, vBme, vBma + bcax_m1 Asa, vBsa, vBsi, vBse + bcax_m1 Ase, vBse, vBso, vBsi + bcax_m1 Asi, vBsi, vBsu, vBso + bcax_m1 Aso, vBso, vBsa, vBsu + bcax_m1 Asu, vBsu, vBse, vBsa + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + bcax_m1 Abu, vBbu, vBbe, vBba + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + + .unreq tmp + .unreq tmpq + +.endm + +.macro keccak_f1600_round_core + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + str Agiq, [sp, #(STACK_BASE_TMP + 16*32)] + rax1_m1 E0, C4, C1 + + /* 25x XAR, 75 in total */ + + .unreq tmp + tmp .req C1 + tmpq .req C1q + + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + str Agaq, [sp, #(STACK_BASE_TMP + 16 * 30)] + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + str Ageq, [sp, #(STACK_BASE_TMP + 16 * 31)] + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + bcax_m1 Aga, vBga, vBgi, vBge + + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + bcax_m1 Age, vBge, vBgo, vBgi + + ldr tmpq, [sp, #(STACK_BASE_TMP + 16*32)] + xar_m1 vBke, tmp, E2, 58 + + xar_m1 vBgu, Asi, E2, 3 + bcax_m1 Agi, vBgi, vBgu, vBgo + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + ldp tmpq, E3q, [sp, #(STACK_BASE_TMP + 
16*30)] + xar_m1 vBme, tmp, E0, 28 + xar_m1 vBbe, E3, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + + .unreq tmp + .unreq tmpq + + eor2 C0, Aka, Aga + save(Aga) + + tmp .req Aga + tmpq .req Agaq + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + eor2 C1, Ake, Age + bcax_m1 Aku, vBku, vBke, vBka + eor2 C2, Aki, Agi + bcax_m1 Ama, vBma, vBmi, vBme + eor2 C3, Ako, Ago + bcax_m1 Ame, vBme, vBmo, vBmi + eor2 C4, Aku, Agu + bcax_m1 Ami, vBmi, vBmu, vBmo + eor2 C0, C0, Ama + bcax_m1 Amo, vBmo, vBma, vBmu + eor2 C1, C1, Ame + bcax_m1 Amu, vBmu, vBme, vBma + eor2 C2, C2, Ami + bcax_m1 Asa, vBsa, vBsi, vBse + eor2 C3, C3, Amo + bcax_m1 Ase, vBse, vBso, vBsi + eor2 C4, C4, Amu + bcax_m1 Asi, vBsi, vBsu, vBso + eor2 C0, C0, Asa + bcax_m1 Aso, vBso, vBsa, vBsu + eor2 C1, C1, Ase + bcax_m1 Asu, vBsu, vBse, vBsa + eor2 C2, C2, Asi + eor2 C3, C3, Aso + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + eor2 C1, C1, Abe + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + eor2 C4, C4, Asu + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + eor2 C3, C3, Abo + eor2 C2, C2, Abi + eor2 C0, C0, Aba + bcax_m1 Abu, vBbu, vBbe, vBba + eor2 C4, C4, Abu + + restore(Aga) + .unreq tmp + .unreq tmpq + +.endm + +.macro keccak_f1600_round_post + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + str Agiq, [sp, #(STACK_BASE_TMP + 16*32)] + rax1_m1 E0, C4, C1 + + /* 25x XAR, 75 in total */ + + .unreq tmp + tmp .req C1 + tmpq .req C1q + + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + str Agaq, [sp, #(STACK_BASE_TMP + 16 * 30)] + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + str Ageq, [sp, #(STACK_BASE_TMP + 16 * 31)] + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, 
Agu, E4, 44 + bcax_m1 Aga, vBga, vBgi, vBge + + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + bcax_m1 Age, vBge, vBgo, vBgi + + ldr tmpq, [sp, #(STACK_BASE_TMP + 16*32)] + xar_m1 vBke, tmp, E2, 58 + + xar_m1 vBgu, Asi, E2, 3 + bcax_m1 Agi, vBgi, vBgu, vBgo + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + ldp tmpq, E3q, [sp, #(STACK_BASE_TMP + 16*30)] + xar_m1 vBme, tmp, E0, 28 + xar_m1 vBbe, E3, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + bcax_m1 Aku, vBku, vBke, vBka + bcax_m1 Ama, vBma, vBmi, vBme + bcax_m1 Ame, vBme, vBmo, vBmi + bcax_m1 Ami, vBmi, vBmu, vBmo + bcax_m1 Amo, vBmo, vBma, vBmu + bcax_m1 Amu, vBmu, vBme, vBma + bcax_m1 Asa, vBsa, vBsi, vBse + bcax_m1 Ase, vBse, vBso, vBsi + bcax_m1 Asi, vBsi, vBsu, vBso + bcax_m1 Aso, vBso, vBsa, vBsu + bcax_m1 Asu, vBsu, vBse, vBsa + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + bcax_m1 Abu, vBbu, vBbe, vBba + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + + .unreq tmp + .unreq tmpq + +.endm + + +.text +.align 4 +.global keccak_f1600_x2_v84a_asm_v2pp5 +.global _keccak_f1600_x2_v84a_asm_v2pp5 + +#define KECCAK_F1600_ROUNDS 24 + +keccak_f1600_x2_v84a_asm_v2pp5: +_keccak_f1600_x2_v84a_asm_v2pp5: + alloc_stack + save_vregs + load_constant_ptr + load_input + + //mov count, #(KECCAK_F1600_ROUNDS-2) + mov count, #11 + keccak_f1600_round_pre +loop: + keccak_f1600_round_core + 
keccak_f1600_round_core + sub count, count, #1 + cbnz count, loop + + keccak_f1600_round_core + keccak_f1600_round_post + store_input + restore_vregs + free_stack + ret diff --git a/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2pp6.s b/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2pp6.s new file mode 100644 index 0000000..213f214 --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2pp6.s @@ -0,0 +1,917 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#if defined(__ARM_FEATURE_SVE2) +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .align(8) +_round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x800000008000000a + .quad 
0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. */ + Aba .req v0 + Abe .req v1 + Abi .req v2 + Abo .req v3 + Abu .req v4 + Aga .req v5 + Age .req v6 + Agi .req v7 + Ago .req v8 + Agu .req v9 + Aka .req v10 + Ake .req v11 + Aki .req v12 + Ako .req v13 + Aku .req v14 + Ama .req v15 + Ame .req v16 + Ami .req v17 + Amo .req v18 + Amu .req v19 + Asa .req v20 + Ase .req v21 + Asi .req v22 + Aso .req v23 + Asu .req v24 + + /* q-form (128-bit view used by ldr/str/ldp) of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Abiq .req q2 + Aboq .req q3 + Abuq .req q4 + Agaq .req q5 + Ageq .req q6 + Agiq .req q7 + Agoq .req q8 + Aguq .req q9 + Akaq .req q10 + Akeq .req q11 + Akiq .req q12 + Akoq .req q13 + Akuq .req q14 + Amaq .req q15 + Ameq .req q16 + Amiq .req q17 + Amoq .req q18 + Amuq .req q19 + Asaq .req q20 + Aseq .req q21 + Asiq .req q22 + Asoq .req q23 + Asuq .req q24 + + /* z-form (SVE view used by the MUL in xar_m1) of the above mapping */ + Abaz .req z0 + Abez .req z1 + Abiz .req z2 + Aboz .req z3 + Abuz .req z4 + Agaz .req z5 + Agez .req z6 + Agiz .req z7 + Agoz .req z8 + Aguz .req z9 + Akaz .req z10 + Akez .req z11 + Akiz .req z12 + Akoz .req z13 + Akuz .req z14 + Amaz .req z15 + Amez .req z16 + Amiz .req z17 + Amoz .req z18 + Amuz .req z19 + Asaz .req z20 + Asez .req z21 + Asiz .req z22 + Asoz .req z23 + Asuz .req z24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v27 + C1 .req v28 + C2 .req v29 + C3 .req v30 + C4 .req v31 + + C0q .req q27 + C1q .req q28 + C2q .req q29 + C3q .req q30 + C4q .req q31 + + C0z .req z27 + C1z .req z28 + C2z .req z29 + C3z .req z30 + C4z .req z31 + + /*
A_[y,2*x+3*y] = rot(A[x,y]) */ + vBba .req v25 // fresh + vBbe .req v26 // fresh + vBbi .req Abi + vBbo .req Abo + vBbu .req Abu + vBga .req Aka + vBge .req Ake + vBgi .req Agi + vBgo .req Ago + vBgu .req Agu + vBka .req Ama + vBke .req Ame + vBki .req Aki + vBko .req Ako + vBku .req Aku + vBma .req Asa + vBme .req Ase + vBmi .req Ami + vBmo .req Amo + vBmu .req Amu + vBsa .req Aba + vBse .req Abe + vBsi .req Asi + vBso .req Aso + vBsu .req Asu + + vBbaq .req q25 // fresh + vBbeq .req q26 // fresh + vBbiq .req Abiq + vBboq .req Aboq + vBbuq .req Abuq + vBgaq .req Akaq + vBgeq .req Akeq + vBgiq .req Agiq + vBgoq .req Agoq + vBguq .req Aguq + vBkaq .req Amaq + vBkeq .req Ameq + vBkiq .req Akiq + vBkoq .req Akoq + vBkuq .req Akuq + vBmaq .req Asaq + vBmeq .req Aseq + vBmiq .req Amiq + vBmoq .req Amoq + vBmuq .req Amuq + vBsaq .req Abaq + vBseq .req Abeq + vBsiq .req Asiq + vBsoq .req Asoq + vBsuq .req Asuq + + vBbaz .req z25 // fresh + vBbez .req z26 // fresh + vBbiz .req Abiz + vBboz .req Aboz + vBbuz .req Abuz + vBgaz .req Akaz + vBgez .req Akez + vBgiz .req Agiz + vBgoz .req Agoz + vBguz .req Aguz + vBkaz .req Amaz + vBkez .req Amez + vBkiz .req Akiz + vBkoz .req Akoz + vBkuz .req Akuz + vBmaz .req Asaz + vBmez .req Asez + vBmiz .req Amiz + vBmoz .req Amoz + vBmuz .req Amuz + vBsaz .req Abaz + vBsez .req Abez + vBsiz .req Asiz + vBsoz .req Asoz + vBsuz .req Asuz + + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req C4 + E1 .req C0 + E2 .req vBbe // fresh + E3 .req C2 + E4 .req C3 + + E0q .req C4q + E1q .req C0q + E2q .req vBbeq // fresh + E3q .req C2q + E4q .req C3q + + E0z .req C4z + E1z .req C0z + E2z .req vBbez // fresh + E3z .req C2z + E4z .req C3z + + + +/************************ MACROS ****************************/ + +.macro load_input + ldp Abaq, Abeq, [input_addr, #(2*8*0)] + ldp Abiq, Aboq, [input_addr, #(2*8*2)] + ldp Abuq, Agaq, [input_addr, #(2*8*4)] + ldp Ageq, Agiq, [input_addr, #(2*8*6)] + ldp Agoq, Aguq, [input_addr, #(2*8*8)] + ldp 
Akaq, Akeq, [input_addr, #(2*8*10)] + ldp Akiq, Akoq, [input_addr, #(2*8*12)] + ldp Akuq, Amaq, [input_addr, #(2*8*14)] + ldp Ameq, Amiq, [input_addr, #(2*8*16)] + ldp Amoq, Amuq, [input_addr, #(2*8*18)] + ldp Asaq, Aseq, [input_addr, #(2*8*20)] + ldp Asiq, Asoq, [input_addr, #(2*8*22)] + ldr Asuq, [input_addr, #(2*8*24)] +.endm + +.macro store_input + str Abaq, [input_addr, #(2*8*0)] + str Abeq, [input_addr, #(2*8*1)] + str Abiq, [input_addr, #(2*8*2)] + str Aboq, [input_addr, #(2*8*3)] + str Abuq, [input_addr, #(2*8*4)] + str Agaq, [input_addr, #(2*8*5)] + str Ageq, [input_addr, #(2*8*6)] + str Agiq, [input_addr, #(2*8*7)] + str Agoq, [input_addr, #(2*8*8)] + str Aguq, [input_addr, #(2*8*9)] + str Akaq, [input_addr, #(2*8*10)] + str Akeq, [input_addr, #(2*8*11)] + str Akiq, [input_addr, #(2*8*12)] + str Akoq, [input_addr, #(2*8*13)] + str Akuq, [input_addr, #(2*8*14)] + str Amaq, [input_addr, #(2*8*15)] + str Ameq, [input_addr, #(2*8*16)] + str Amiq, [input_addr, #(2*8*17)] + str Amoq, [input_addr, #(2*8*18)] + str Amuq, [input_addr, #(2*8*19)] + str Asaq, [input_addr, #(2*8*20)] + str Aseq, [input_addr, #(2*8*21)] + str Asiq, [input_addr, #(2*8*22)] + str Asoq, [input_addr, #(2*8*23)] + str Asuq, [input_addr, #(2*8*24)] +.endm + +#define STACK_SIZE (16*4 + 16*34) +#define STACK_BASE_VREGS 0 +#define STACK_BASE_TMP 16*4 + +#define Aga_offset 0 +#define E0_offset 1 +#define E1_offset 2 +#define E2_offset 3 +#define E3_offset 4 +#define E4_offset 5 +#define Ame_offset 7 +#define Agi_offset 8 +#define Aka_offset 9 +#define Abo_offset 10 +#define Amo_offset 11 +#define Ami_offset 12 +#define Ake_offset 13 +#define Agu_offset 14 +#define Asi_offset 15 +#define Aku_offset 16 +#define Asa_offset 17 +#define Abu_offset 18 +#define Asu_offset 19 +#define Ase_offset 20 +//#define Aga_offset 21 +#define Age_offset 22 +#define vBgo_offset 23 +#define vBke_offset 24 +#define vBgi_offset 25 +#define vBga_offset 26 +#define vBbo_offset 27 +#define vBmo_offset 28 +#define 
vBmi_offset 29 +#define vBge_offset 30 + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +/* Macros emulating the v8.4-A SHA-3 instructions (EOR3, RAX1, XAR, BCAX) with eor/add/sri/mul/shl/bic */ + +.macro eor3_m1_0 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m1_1 d s0 s1 s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro eor3_m1 d s0 s1 s2 + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +.macro rax1_m1 d s0 s1 + // Use add instead of SHL #1 + add tmp.2d, \s1\().2d, \s1\().2d + sri tmp.2d, \s1\().2d, #63 + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +xar_m1_const: + .quad (1ULL<<(64-61)) + .quad (1ULL<<(64-56)) + .quad (1ULL<<(64-50)) + .quad (1ULL<<(64-46)) + .quad (1ULL<<(64-44)) + .quad (1ULL<<(64-43)) + .quad (1ULL<<(64-39)) + .quad (1ULL<<(64-36)) + .quad (1ULL<<(64-21)) + .quad (1ULL<<(64-19)) + .quad (1ULL<<(64-9)) + .quad (1ULL<<(64-3)) + + +xar_m1_const_addr: .quad xar_m1_const + + .macro xar_m1 d s0 s1 imm + // Special cases: SHL is replaced by a MUL with a power of two from xar_m1_const (base in x17), or by ADDs + .if \imm == 21 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #64] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[0] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 39 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #48] + mul
\d\()z\().d, \s0\()z\().d, \d\()z\().d[0] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 56 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[1] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + .elseif \imm == 9 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #80] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[0] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 19 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #64] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[1] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 61 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[0] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 36 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #48] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[1] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 43 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #32] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[1] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 44 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #32] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[0] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 3 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #80] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[1] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 46 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #16] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[1] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 50 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #16] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[0] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 62 + eor \s0\().16b, \s0\().16b, \s1\().16b + add \d\().2d, \s0\().2d, \s0\().2d + 
add \d\().2d, \d\().2d, \d\().2d + sri \d\().2d, \s0\().2d, #(62) + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + +.macro bcax_m1 d s0 s1 s2 + bic tmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +/* Keccak-f1600 round */ + +.macro keccak_f1600_round_pre + + /* 10 EOR3, so 20 individual EOR */ + + eor3_m1_0 C1, Abe, Age, Ake + eor3_m1_0 C3, Abo, Ago, Ako + eor3_m1_0 C0, Aba, Aga, Aka + eor3_m1_0 C2, Abi, Agi, Aki + eor3_m1_0 C4, Abu, Agu, Aku + eor3_m1_1 C1, Abe, Age, Ake + eor3_m1_1 C3, Abo, Ago, Ako + eor3_m1_1 C0, Aba, Aga, Aka + eor3_m1_1 C2, Abi, Agi, Aki + eor3_m1_1 C4, Abu, Agu, Aku + eor3_m1_0 C1, C1, Ame, Ase + eor3_m1_0 C3, C3, Amo, Aso + eor3_m1_0 C0, C0, Ama, Asa + eor3_m1_0 C2, C2, Ami, Asi + eor3_m1_0 C4, C4, Amu, Asu + eor3_m1_1 C1, C1, Ame, Ase + eor3_m1_1 C3, C3, Amo, Aso + eor3_m1_1 C0, C0, Ama, Asa + eor3_m1_1 C2, C2, Ami, Asi + eor3_m1_1 C4, C4, Amu, Asu + +.endm + +.macro keccak_f1600_round + + /* 10 EOR3, so 20 individual EOR */ + + eor3_m1_0 C0, Aba, Aga, Aka + eor3_m1_0 C1, Abe, Age, Ake + eor3_m1_0 C2, Abi, Agi, Aki + eor3_m1_0 C3, Abo, Ago, Ako + eor3_m1_0 C4, Abu, Agu, Aku + eor3_m1_1 C0, Aba, Aga, Aka + eor3_m1_1 C1, Abe, Age, Ake + eor3_m1_1 C2, Abi, Agi, Aki + eor3_m1_1 C3, Abo, Ago, Ako + eor3_m1_1 C4, Abu, Agu, Aku + eor3_m1_0 C0, C0, Ama, Asa + eor3_m1_0 C1, C1, Ame, Ase + eor3_m1_0 C2, C2, Ami, Asi + eor3_m1_0 C3, C3, Amo, Aso + eor3_m1_0 C4, C4, Amu, Asu + eor3_m1_1 C0, C0, Ama, Asa + eor3_m1_1 C1, C1, Ame, Ase + eor3_m1_1 C2, C2, Ami, Asi + eor3_m1_1 C3, C3, Amo, Aso + eor3_m1_1 C4, C4, Amu, Asu + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + .unreq tmp + + /* 25x XAR, 75 in total */ + + tmp .req C1 + tmpq .req C1q + + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, 
Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m1 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + xar_m1 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m1 vBme, Aga, E0, 28 + xar_m1 vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Aga, vBga, vBgi, vBge + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + bcax_m1 Aku, vBku, vBke, vBka + bcax_m1 Ama, vBma, vBmi, vBme + bcax_m1 Ame, vBme, vBmo, vBmi + bcax_m1 Ami, vBmi, vBmu, vBmo + bcax_m1 Amo, vBmo, vBma, vBmu + bcax_m1 Amu, vBmu, vBme, vBma + bcax_m1 Asa, vBsa, vBsi, vBse + bcax_m1 Ase, vBse, vBso, vBsi + bcax_m1 Asi, vBsi, vBsu, vBso + bcax_m1 Aso, vBso, vBsa, vBsu + bcax_m1 Asu, vBsu, vBse, vBsa + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + bcax_m1 Abu, vBbu, vBbe, vBba + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + + .unreq tmp + .unreq tmpq + +.endm + +.macro keccak_f1600_round_core + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + str Agiq, [sp, #(STACK_BASE_TMP + 16*32)] + rax1_m1 E0, C4, C1 + + /* 25x XAR, 75 in total */ + + .unreq tmp + tmp .req C1 + tmpq .req C1q + + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + str Agaq, [sp, 
#(STACK_BASE_TMP + 16 * 30)] + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + str Ageq, [sp, #(STACK_BASE_TMP + 16 * 31)] + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + bcax_m1 Aga, vBga, vBgi, vBge + + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + bcax_m1 Age, vBge, vBgo, vBgi + + ldr tmpq, [sp, #(STACK_BASE_TMP + 16*32)] + xar_m1 vBke, tmp, E2, 58 + + xar_m1 vBgu, Asi, E2, 3 + bcax_m1 Agi, vBgi, vBgu, vBgo + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + ldp tmpq, E3q, [sp, #(STACK_BASE_TMP + 16*30)] + xar_m1 vBme, tmp, E0, 28 + xar_m1 vBbe, E3, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + + .unreq tmp + .unreq tmpq + + eor2 C0, Aka, Aga + save(Aga) + + tmp .req Aga + tmpq .req Agaq + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + eor2 C1, Ake, Age + bcax_m1 Aku, vBku, vBke, vBka + eor2 C2, Aki, Agi + bcax_m1 Ama, vBma, vBmi, vBme + eor2 C3, Ako, Ago + bcax_m1 Ame, vBme, vBmo, vBmi + eor2 C4, Aku, Agu + bcax_m1 Ami, vBmi, vBmu, vBmo + eor2 C0, C0, Ama + bcax_m1 Amo, vBmo, vBma, vBmu + eor2 C1, C1, Ame + bcax_m1 Amu, vBmu, vBme, vBma + eor2 C2, C2, Ami + bcax_m1 Asa, vBsa, vBsi, vBse + eor2 C3, C3, Amo + bcax_m1 Ase, vBse, vBso, vBsi + eor2 C4, C4, Amu + bcax_m1 Asi, vBsi, vBsu, vBso + eor2 C0, C0, Asa + bcax_m1 Aso, vBso, vBsa, vBsu + eor2 C1, C1, Ase + bcax_m1 Asu, vBsu, vBse, vBsa + eor2 C2, C2, Asi + eor2 C3, C3, Aso + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + eor2 C1, C1, Abe + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, 
Aba.16b, tmp.16b + eor2 C4, C4, Asu + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + eor2 C3, C3, Abo + eor2 C2, C2, Abi + eor2 C0, C0, Aba + bcax_m1 Abu, vBbu, vBbe, vBba + eor2 C4, C4, Abu + + restore(Aga) + .unreq tmp + .unreq tmpq + +.endm + +.macro keccak_f1600_round_post + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + .unreq tmp + + /* 25x XAR, 75 in total */ + + tmp .req C1 + tmpq .req C1q + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m1 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + xar_m1 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m1 vBme, Aga, E0, 28 + xar_m1 vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Aga, vBga, vBgi, vBge + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + bcax_m1 Aku, vBku, vBke, vBka + bcax_m1 Ama, vBma, vBmi, vBme + bcax_m1 Ame, vBme, vBmo, vBmi + bcax_m1 Ami, vBmi, vBmu, vBmo + bcax_m1 Amo, vBmo, vBma, vBmu + bcax_m1 Amu, vBmu, vBme, vBma + bcax_m1 Asa, vBsa, vBsi, vBse + bcax_m1 Ase, vBse, vBso, vBsi + bcax_m1 Asi, vBsi, vBsu, vBso + bcax_m1 Aso, vBso, vBsa, vBsu + bcax_m1 Asu, vBsu, vBse, vBsa + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 
Abo, vBbo, vBba, vBbu + bcax_m1 Abu, vBbu, vBbe, vBba + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + + .unreq tmp + .unreq tmpq +.endm + + +.text +.align 4 +.global keccak_f1600_x2_v84a_asm_v2pp6 +.global _keccak_f1600_x2_v84a_asm_v2pp6 + +#define KECCAK_F1600_ROUNDS 24 + +keccak_f1600_x2_v84a_asm_v2pp6: +_keccak_f1600_x2_v84a_asm_v2pp6: + alloc_stack + save_vregs + load_constant_ptr + load_input + + ldr x17, xar_m1_const_addr + + // 11 iterations x 2 rounds = 22 rounds in the loop; the last 2 (core + post) follow it + mov count, #11 + keccak_f1600_round_pre +loop: + keccak_f1600_round_core + keccak_f1600_round_core + sub count, count, #1 + cbnz count, loop + + keccak_f1600_round_core + keccak_f1600_round_post + store_input + restore_vregs + free_stack + ret +#endif diff --git a/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2pp7.s b/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2pp7.s new file mode 100644 index 0000000..ae72584 --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x2_v84a_asm_v2pp7.s @@ -0,0 +1,901 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + + +#if defined(__ARM_FEATURE_SVE2) +/********************** CONSTANTS *************************/ + .data + .align(8) +_round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008089 + .quad 
0x8000000000008003 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. */ + Aba .req v0 + Abe .req v1 + Abi .req v2 + Abo .req v3 + Abu .req v4 + Aga .req v5 + Age .req v6 + Agi .req v7 + Ago .req v8 + Agu .req v9 + Aka .req v10 + Ake .req v11 + Aki .req v12 + Ako .req v13 + Aku .req v14 + Ama .req v15 + Ame .req v16 + Ami .req v17 + Amo .req v18 + Amu .req v19 + Asa .req v20 + Ase .req v21 + Asi .req v22 + Aso .req v23 + Asu .req v24 + + /* q-form (128-bit view used by ldr/str/ldp) of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Abiq .req q2 + Aboq .req q3 + Abuq .req q4 + Agaq .req q5 + Ageq .req q6 + Agiq .req q7 + Agoq .req q8 + Aguq .req q9 + Akaq .req q10 + Akeq .req q11 + Akiq .req q12 + Akoq .req q13 + Akuq .req q14 + Amaq .req q15 + Ameq .req q16 + Amiq .req q17 + Amoq .req q18 + Amuq .req q19 + Asaq .req q20 + Aseq .req q21 + Asiq .req q22 + Asoq .req q23 + Asuq .req q24 + + /* z-form (SVE view used by the MUL in xar_m1) of the above mapping */ + Abaz .req z0 + Abez .req z1 + Abiz .req z2 + Aboz .req z3 + Abuz .req z4 + Agaz .req z5 + Agez .req z6 + Agiz .req z7 + Agoz .req z8 + Aguz .req z9 + Akaz .req z10 + Akez .req z11 + Akiz .req z12 + Akoz .req z13 + Akuz .req z14 + Amaz .req z15 + Amez .req z16 + Amiz .req z17 + Amoz .req z18 + Amuz .req z19 + Asaz .req z20 + Asez .req z21 + Asiz .req z22 + Asoz .req z23 + Asuz .req z24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor
A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v27 + C1 .req v28 + C2 .req v29 + C3 .req v30 + C4 .req v31 + + C0q .req q27 + C1q .req q28 + C2q .req q29 + C3q .req q30 + C4q .req q31 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vBba .req v25 // fresh + vBbe .req v26 // fresh + vBbi .req Abi + vBbo .req Abo + vBbu .req Abu + vBga .req Aka + vBge .req Ake + vBgi .req Agi + vBgo .req Ago + vBgu .req Agu + vBka .req Ama + vBke .req Ame + vBki .req Aki + vBko .req Ako + vBku .req Aku + vBma .req Asa + vBme .req Ase + vBmi .req Ami + vBmo .req Amo + vBmu .req Amu + vBsa .req Aba + vBse .req Abe + vBsi .req Asi + vBso .req Aso + vBsu .req Asu + + vBbaq .req q25 // fresh + vBbeq .req q26 // fresh + vBbiq .req Abiq + vBboq .req Aboq + vBbuq .req Abuq + vBgaq .req Akaq + vBgeq .req Akeq + vBgiq .req Agiq + vBgoq .req Agoq + vBguq .req Aguq + vBkaq .req Amaq + vBkeq .req Ameq + vBkiq .req Akiq + vBkoq .req Akoq + vBkuq .req Akuq + vBmaq .req Asaq + vBmeq .req Aseq + vBmiq .req Amiq + vBmoq .req Amoq + vBmuq .req Amuq + vBsaq .req Abaq + vBseq .req Abeq + vBsiq .req Asiq + vBsoq .req Asoq + vBsuq .req Asuq + + vBbaz .req z25 // fresh + vBbez .req z26 // fresh + vBbiz .req Abiz + vBboz .req Aboz + vBbuz .req Abuz + vBgaz .req Akaz + vBgez .req Akez + vBgiz .req Agiz + vBgoz .req Agoz + vBguz .req Aguz + vBkaz .req Amaz + vBkez .req Amez + vBkiz .req Akiz + vBkoz .req Akoz + vBkuz .req Akuz + vBmaz .req Asaz + vBmez .req Asez + vBmiz .req Amiz + vBmoz .req Amoz + vBmuz .req Amuz + vBsaz .req Abaz + vBsez .req Abez + vBsiz .req Asiz + vBsoz .req Asoz + vBsuz .req Asuz + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req C4 + E1 .req C0 + E2 .req vBbe // fresh + E3 .req C2 + E4 .req C3 + + E0q .req C4q + E1q .req C0q + E2q .req vBbeq // fresh + E3q .req C2q + E4q .req C3q + + +/************************ MACROS ****************************/ + +.macro load_input + ldp Abaq, Abeq, [input_addr, #(2*8*0)] + ldp Abiq, Aboq, [input_addr, #(2*8*2)] + ldp Abuq, Agaq, [input_addr, 
#(2*8*4)] + ldp Ageq, Agiq, [input_addr, #(2*8*6)] + ldp Agoq, Aguq, [input_addr, #(2*8*8)] + ldp Akaq, Akeq, [input_addr, #(2*8*10)] + ldp Akiq, Akoq, [input_addr, #(2*8*12)] + ldp Akuq, Amaq, [input_addr, #(2*8*14)] + ldp Ameq, Amiq, [input_addr, #(2*8*16)] + ldp Amoq, Amuq, [input_addr, #(2*8*18)] + ldp Asaq, Aseq, [input_addr, #(2*8*20)] + ldp Asiq, Asoq, [input_addr, #(2*8*22)] + ldr Asuq, [input_addr, #(2*8*24)] +.endm + +.macro store_input + str Abaq, [input_addr, #(2*8*0)] + str Abeq, [input_addr, #(2*8*1)] + str Abiq, [input_addr, #(2*8*2)] + str Aboq, [input_addr, #(2*8*3)] + str Abuq, [input_addr, #(2*8*4)] + str Agaq, [input_addr, #(2*8*5)] + str Ageq, [input_addr, #(2*8*6)] + str Agiq, [input_addr, #(2*8*7)] + str Agoq, [input_addr, #(2*8*8)] + str Aguq, [input_addr, #(2*8*9)] + str Akaq, [input_addr, #(2*8*10)] + str Akeq, [input_addr, #(2*8*11)] + str Akiq, [input_addr, #(2*8*12)] + str Akoq, [input_addr, #(2*8*13)] + str Akuq, [input_addr, #(2*8*14)] + str Amaq, [input_addr, #(2*8*15)] + str Ameq, [input_addr, #(2*8*16)] + str Amiq, [input_addr, #(2*8*17)] + str Amoq, [input_addr, #(2*8*18)] + str Amuq, [input_addr, #(2*8*19)] + str Asaq, [input_addr, #(2*8*20)] + str Aseq, [input_addr, #(2*8*21)] + str Asiq, [input_addr, #(2*8*22)] + str Asoq, [input_addr, #(2*8*23)] + str Asuq, [input_addr, #(2*8*24)] +.endm + +#define STACK_SIZE (16*4 + 16*34) +#define STACK_BASE_VREGS 0 +#define STACK_BASE_TMP 16*4 + +#define Aga_offset 0 +#define E0_offset 1 +#define E1_offset 2 +#define E2_offset 3 +#define E3_offset 4 +#define E4_offset 5 +#define Ame_offset 7 +#define Agi_offset 8 +#define Aka_offset 9 +#define Abo_offset 10 +#define Amo_offset 11 +#define Ami_offset 12 +#define Ake_offset 13 +#define Agu_offset 14 +#define Asi_offset 15 +#define Aku_offset 16 +#define Asa_offset 17 +#define Abu_offset 18 +#define Asu_offset 19 +#define Ase_offset 20 +//#define Aga_offset 21 +#define Age_offset 22 +#define vBgo_offset 23 +#define vBke_offset 24 +#define 
vBgi_offset 25 +#define vBga_offset 26 +#define vBbo_offset 27 +#define vBmo_offset 28 +#define vBmi_offset 29 +#define vBge_offset 30 + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +/* Macros emulating the v8.4-A SHA-3 instructions (EOR3, RAX1, XAR, BCAX) with eor/add/sri/mul/shl/bic */ + +.macro eor3_m1_0 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m1_1 d s0 s1 s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro eor3_m1 d s0 s1 s2 + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +.macro rax1_m1 d s0 s1 + // Use add instead of SHL #1 + add tmp.2d, \s1\().2d, \s1\().2d + sri tmp.2d, \s1\().2d, #63 + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +xar_m1_const: + .quad (1ULL<<(64-61)) + .quad (1ULL<<(64-56)) + .quad (1ULL<<(64-50)) + .quad (1ULL<<(64-46)) + .quad (1ULL<<(64-44)) + .quad (1ULL<<(64-43)) + .quad (1ULL<<(64-39)) + .quad (1ULL<<(64-36)) + .quad (1ULL<<(64-21)) + .quad (1ULL<<(64-19)) + .quad (1ULL<<(64-9)) + .quad (1ULL<<(64-3)) + + +xar_m1_const_addr: .quad xar_m1_const + + .macro xar_m1 d s0 s1 imm + // Special cases: SHL is replaced by a MUL with a power of two from xar_m1_const (base in x17), or by ADDs + .if \imm == 21 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #64] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[0] + sri \d\().2d, \s0\().2d,
#(\imm) + .elseif \imm == 39 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #48] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[0] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 56 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[1] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + .elseif \imm == 9 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #80] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[0] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 19 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #64] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[1] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 61 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[0] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 36 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #48] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[1] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 43 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #32] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[1] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 44 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #32] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[0] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 3 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #80] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[1] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 46 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #16] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[1] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 50 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #16] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[0] + sri \d\().2d, \s0\().2d, #(\imm) + 
.elseif \imm == 62 + eor \s0\().16b, \s0\().16b, \s1\().16b + add \d\().2d, \s0\().2d, \s0\().2d + add \d\().2d, \d\().2d, \d\().2d + sri \d\().2d, \s0\().2d, #(62) + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + +.macro bcax_m1 d s0 s1 s2 + bic tmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +/* Keccak-f1600 round */ + +.macro keccak_f1600_round_pre + + /* 10 EOR3, so 20 individual EOR */ + + eor3_m1_0 C1, Abe, Age, Ake + eor3_m1_0 C3, Abo, Ago, Ako + eor3_m1_0 C0, Aba, Aga, Aka + eor3_m1_0 C2, Abi, Agi, Aki + eor3_m1_0 C4, Abu, Agu, Aku + eor3_m1_1 C1, Abe, Age, Ake + eor3_m1_1 C3, Abo, Ago, Ako + eor3_m1_1 C0, Aba, Aga, Aka + eor3_m1_1 C2, Abi, Agi, Aki + eor3_m1_1 C4, Abu, Agu, Aku + eor3_m1_0 C1, C1, Ame, Ase + eor3_m1_0 C3, C3, Amo, Aso + eor3_m1_0 C0, C0, Ama, Asa + eor3_m1_0 C2, C2, Ami, Asi + eor3_m1_0 C4, C4, Amu, Asu + eor3_m1_1 C1, C1, Ame, Ase + eor3_m1_1 C3, C3, Amo, Aso + eor3_m1_1 C0, C0, Ama, Asa + eor3_m1_1 C2, C2, Ami, Asi + eor3_m1_1 C4, C4, Amu, Asu + +.endm + +.macro keccak_f1600_round + + /* 10 EOR3, so 20 individual EOR */ + + eor3_m1_0 C0, Aba, Aga, Aka + eor3_m1_0 C1, Abe, Age, Ake + eor3_m1_0 C2, Abi, Agi, Aki + eor3_m1_0 C3, Abo, Ago, Ako + eor3_m1_0 C4, Abu, Agu, Aku + eor3_m1_1 C0, Aba, Aga, Aka + eor3_m1_1 C1, Abe, Age, Ake + eor3_m1_1 C2, Abi, Agi, Aki + eor3_m1_1 C3, Abo, Ago, Ako + eor3_m1_1 C4, Abu, Agu, Aku + eor3_m1_0 C0, C0, Ama, Asa + eor3_m1_0 C1, C1, Ame, Ase + eor3_m1_0 C2, C2, Ami, Asi + eor3_m1_0 C3, C3, Amo, Aso + eor3_m1_0 C4, C4, Amu, Asu + eor3_m1_1 C0, C0, Ama, Asa + eor3_m1_1 C1, C1, Ame, Ase + eor3_m1_1 C2, C2, Ami, Asi + eor3_m1_1 C3, C3, Amo, Aso + eor3_m1_1 C4, C4, Amu, Asu + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + .unreq tmp + + /* 25x XAR, 75 in total */ + + tmp 
.req C1 + tmpq .req C1q + + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m1 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + xar_m1 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m1 vBme, Aga, E0, 28 + xar_m1 vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Aga, vBga, vBgi, vBge + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + bcax_m1 Aku, vBku, vBke, vBka + bcax_m1 Ama, vBma, vBmi, vBme + bcax_m1 Ame, vBme, vBmo, vBmi + bcax_m1 Ami, vBmi, vBmu, vBmo + bcax_m1 Amo, vBmo, vBma, vBmu + bcax_m1 Amu, vBmu, vBme, vBma + bcax_m1 Asa, vBsa, vBsi, vBse + bcax_m1 Ase, vBse, vBso, vBsi + bcax_m1 Asi, vBsi, vBsu, vBso + bcax_m1 Aso, vBso, vBsa, vBsu + bcax_m1 Asu, vBsu, vBse, vBsa + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + bcax_m1 Abu, vBbu, vBbe, vBba + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 /* post-indexed load: const_addr advances by 16 bytes per round */ + eor Aba.16b, Aba.16b, tmp.16b + + .unreq tmp + .unreq tmpq + +.endm +/* keccak_f1600_round_core: round body for the pipelined loop. It consumes the C0..C4 left by the previous step (RAX1 into E0..E4, then XAR, BCAX and the round-constant XOR) and, interleaved with the BCAX block, rebuilds C0..C4 from the updated lanes with eor2 for the following round. eor2 and save()/restore() are defined elsewhere in the file. */ +.macro keccak_f1600_round_core + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + + /* 25x XAR, 75 in total */ + + .unreq tmp + tmp .req C1 + tmpq .req C1q + + eor vBba.16b, Aba.16b, 
E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m1 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + mov E3.16b, Aga.16b + bcax_m1 Aga, vBga, vBgi, vBge + xar_m1 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m1 vBme, E3, E0, 28 + xar_m1 vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + + .unreq tmp + .unreq tmpq + + eor2 C0, Aka, Aga + save(Aga) + + tmp .req Aga + tmpq .req Agaq + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + eor2 C1, Ake, Age + bcax_m1 Aku, vBku, vBke, vBka + eor2 C2, Aki, Agi + bcax_m1 Ama, vBma, vBmi, vBme + eor2 C3, Ako, Ago + bcax_m1 Ame, vBme, vBmo, vBmi + eor2 C4, Aku, Agu + bcax_m1 Ami, vBmi, vBmu, vBmo + eor2 C0, C0, Ama + bcax_m1 Amo, vBmo, vBma, vBmu + eor2 C1, C1, Ame + bcax_m1 Amu, vBmu, vBme, vBma + eor2 C2, C2, Ami + bcax_m1 Asa, vBsa, vBsi, vBse + eor2 C3, C3, Amo + bcax_m1 Ase, vBse, vBso, vBsi + eor2 C4, C4, Amu + bcax_m1 Asi, vBsi, vBsu, vBso + eor2 C0, C0, Asa + bcax_m1 Aso, vBso, vBsa, vBsu + eor2 C1, C1, Ase + bcax_m1 Asu, vBsu, vBse, vBsa + eor2 C2, C2, Asi + eor2 C3, C3, Aso + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + eor2 C1, C1, Abe + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + eor2 C4, C4, Asu + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + eor2 C3, C3, Abo + 
eor2 C2, C2, Abi + eor2 C0, C0, Aba + bcax_m1 Abu, vBbu, vBbe, vBba + eor2 C4, C4, Abu + + restore(Aga) + .unreq tmp + .unreq tmpq + +.endm +/* keccak_f1600_round_post: final round. Same RAX1/XAR/BCAX/round-constant sequence as keccak_f1600_round_core, but without the eor2 accumulation of column parities for a following round. */ +.macro keccak_f1600_round_post + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + + /* 25x XAR, 75 in total */ + + .unreq tmp + tmp .req C1 + tmpq .req C1q + + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m1 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + mov E3.16b, Aga.16b /* the bcax on the next line overwrites Aga; keep its old value in E3 (no longer read as an E value in this macro) for the vBme XAR below */ + bcax_m1 Aga, vBga, vBgi, vBge + xar_m1 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m1 vBme, E3, E0, 28 + xar_m1 vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + bcax_m1 Aku, vBku, vBke, vBka + bcax_m1 Ama, vBma, vBmi, vBme + bcax_m1 Ame, vBme, vBmo, vBmi + bcax_m1 Ami, vBmi, vBmu, vBmo + bcax_m1 Amo, vBmo, vBma, vBmu + bcax_m1 Amu, vBmu, vBme, vBma + bcax_m1 Asa, vBsa, vBsi, vBse + bcax_m1 Ase, vBse, vBso, vBsi + bcax_m1 Asi, vBsi, vBsu, vBso + bcax_m1 Aso, vBso, vBsa, vBsu + bcax_m1 Asu, vBsu, vBse, vBsa + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + bcax_m1 Abu, vBbu, vBbe, vBba + + // iota step + //ld1r {tmp.2d}, 
[const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + + .unreq tmp + .unreq tmpq + +.endm + + +.text +.align 4 +.global keccak_f1600_x2_v84a_asm_v2pp7 +.global _keccak_f1600_x2_v84a_asm_v2pp7 + +#define KECCAK_F1600_ROUNDS 24 +/* keccak_f1600_x2_v84a_asm_v2pp7: entry point, exported with and without a leading underscore. Schedule: keccak_f1600_round_pre (column parities only), 11 loop iterations of two keccak_f1600_round_core each, then one more round_core and keccak_f1600_round_post, i.e. 22 + 1 + 1 = KECCAK_F1600_ROUNDS rounds. x17 is loaded with the address of the constant table that the xar_m1 macro indexes. */ +keccak_f1600_x2_v84a_asm_v2pp7: +_keccak_f1600_x2_v84a_asm_v2pp7: + alloc_stack + save_vregs + load_constant_ptr + load_input + + ldr x17, xar_m1_const_addr + + //mov count, #(KECCAK_F1600_ROUNDS-2) + mov count, #11 + keccak_f1600_round_pre +loop: + keccak_f1600_round_core + keccak_f1600_round_core + sub count, count, #1 + cbnz count, loop + + keccak_f1600_round_core + keccak_f1600_round_post + store_input + restore_vregs + free_stack + ret + +#endif \ No newline at end of file diff --git a/asm/manual/keccak_f1600/keccak_f1600_x3_hybrid_asm_v3p.s b/asm/manual/keccak_f1600/keccak_f1600_x3_hybrid_asm_v3p.s new file mode 100644 index 0000000..cbc282b --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x3_hybrid_asm_v3p.s @@ -0,0 +1,971 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: /* the 24 Keccak-f1600 round constants RC[0..23], one 64-bit word per round */ + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x29 + count .req w27 + cur_const .req x26 + + /* Mapping of Keccak-f1600 SIMD state to vector registers + * at the beginning and end of each round. 
*/ + + vAba .req v0 + vAbe .req v1 + vAbi .req v2 + vAbo .req v3 + vAbu .req v4 + vAga .req v5 + vAge .req v6 + vAgi .req v7 + vAgo .req v8 + vAgu .req v9 + vAka .req v10 + vAke .req v11 + vAki .req v12 + vAko .req v13 + vAku .req v14 + vAma .req v15 + vAme .req v16 + vAmi .req v17 + vAmo .req v18 + vAmu .req v19 + vAsa .req v20 + vAse .req v21 + vAsi .req v22 + vAso .req v23 + vAsu .req v24 + + /* q-form of the above mapping */ + vAbaq .req q0 + vAbeq .req q1 + vAbiq .req q2 + vAboq .req q3 + vAbuq .req q4 + vAgaq .req q5 + vAgeq .req q6 + vAgiq .req q7 + vAgoq .req q8 + vAguq .req q9 + vAkaq .req q10 + vAkeq .req q11 + vAkiq .req q12 + vAkoq .req q13 + vAkuq .req q14 + vAmaq .req q15 + vAmeq .req q16 + vAmiq .req q17 + vAmoq .req q18 + vAmuq .req q19 + vAsaq .req q20 + vAseq .req q21 + vAsiq .req q22 + vAsoq .req q23 + vAsuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v30 + C1 .req v29 + C2 .req v28 + C3 .req v27 + C4 .req v26 + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4. E0/E2/E3/E4 share registers with C4/C1/C2/C3, so the order in which the E values are computed matters. */ + E0 .req v26 + E1 .req v25 + E2 .req v29 + E3 .req v28 + E4 .req v27 + + /* A_[y,2*x+3*y] = rot(A[x,y]). vAba_ and vAbe_ reuse the C0 and C3/E4 registers. */ + vAbi_ .req v2 + vAbo_ .req v3 + vAbu_ .req v4 + vAga_ .req v10 + vAge_ .req v11 + vAgi_ .req v7 + vAgo_ .req v8 + vAgu_ .req v9 + vAka_ .req v15 + vAke_ .req v16 + vAki_ .req v12 + vAko_ .req v13 + vAku_ .req v14 + vAma_ .req v20 + vAme_ .req v21 + vAmi_ .req v17 + vAmo_ .req v18 + vAmu_ .req v19 + vAsa_ .req v0 + vAse_ .req v1 + vAsi_ .req v22 + vAso_ .req v23 + vAsu_ .req v24 + vAba_ .req v30 + vAbe_ .req v27 + + /* Scratch register: holds no state lane, clobbered by the rax1_m1/xar_m1/bcax_m1 macros */ + vtmp .req v31 + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round. 
*/ + s_Aba .req x1 + sAbe .req x6 + sAbi .req x11 + sAbo .req x16 + sAbu .req x21 + sAga .req x2 + sAge .req x7 + sAgi .req x12 + sAgo .req x17 + sAgu .req x22 + sAka .req x3 + sAke .req x8 + sAki .req x13 + sAko .req x18 + sAku .req x23 + sAma .req x4 + sAme .req x9 + sAmi .req x14 + sAmo .req x19 + sAmu .req x24 + sAsa .req x5 + sAse .req x10 + sAsi .req x15 + sAso .req x20 + sAsu .req x25 + + /* sA_[y,2*x+3*y] = rot(A[x,y]) */ + s_Aba_ .req x0 + sAbe_ .req x28 + sAbi_ .req x11 + sAbo_ .req x16 + sAbu_ .req x21 + sAga_ .req x3 + sAge_ .req x8 + sAgi_ .req x12 + sAgo_ .req x17 + sAgu_ .req x22 + sAka_ .req x4 + sAke_ .req x9 + sAki_ .req x13 + sAko_ .req x18 + sAku_ .req x23 + sAma_ .req x5 + sAme_ .req x10 + sAmi_ .req x14 + sAmo_ .req x19 + sAmu_ .req x24 + sAsa_ .req x1 + sAse_ .req x6 + sAsi_ .req x15 + sAso_ .req x20 + sAsu_ .req x25 + + /* sC[x] = sA[x,0] xor sA[x,1] xor sA[x,2] xor sA[x,3] xor sA[x,4], for x in 0..4 */ + /* sE[x] = sC[x-1] xor rot(sC[x+1],1), for x in 0..4 */ + sC0 .req x0 + sE0 .req x29 + sC1 .req x26 + sE1 .req x30 + sC2 .req x27 + sE2 .req x26 + sC3 .req x28 + sE3 .req x27 + sC4 .req x29 + sE4 .req x28 + + tmp .req x30 /* same register as sE1 */ + +/************************ MACROS ****************************/ + +/* Vector macros. The *_m1 variants emulate the v8.4-A SHA-3 instructions (EOR3, RAX1, XAR, BCAX) with baseline Neon instructions, using vtmp as scratch where needed; the *_m0 variants emit the SHA-3 instructions directly. */ + +/* eor3_m1: d = s0 ^ s1 ^ s2 */ +.macro eor3_m1 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b + eor \d\().16b, \d\().16b, \s2\().16b +.endm +/* rax1_m1: d = s0 ^ rol(s1, 1) on each 64-bit lane; clobbers vtmp */ +.macro rax1_m1 d s0 s1 + add vtmp.2d, \s1\().2d, \s1\().2d + sri vtmp.2d, \s1\().2d, #63 + eor \d\().16b, vtmp.16b, \s0\().16b +.endm +/* xar_m1: d = ror(s0 ^ s1, imm) on each 64-bit lane; clobbers vtmp */ +.macro xar_m1 d s0 s1 imm + eor vtmp.16b, \s0\().16b, \s1\().16b + shl \d\().2d, vtmp.2d, #(64-\imm) + sri \d\().2d, vtmp.2d, #(\imm) +.endm +/* bcax_m1: d = s0 ^ (s1 & ~s2); clobbers vtmp */ +.macro bcax_m1 d s0 s1 s2 + bic vtmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, vtmp.16b, \s0\().16b + .endm + +/* Native variants: one v8.4-A SHA-3 instruction each */ +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm + xar \d\().2d, 
\s0\().2d, \s1\().2d, #\imm +.endm + +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +/* load_input_vector num idx: loads the 25 vector lanes; lane i (i = 0..24, in the order ba, be, ..., su) is read as a 16-byte q-register from input_addr + 16*(num*i + idx). */ +.macro load_input_vector num idx + ldr vAbaq, [input_addr, #(16*(\num*0+\idx))] + ldr vAbeq, [input_addr, #(16*(\num*1+\idx))] + ldr vAbiq, [input_addr, #(16*(\num*2+\idx))] + ldr vAboq, [input_addr, #(16*(\num*3+\idx))] + ldr vAbuq, [input_addr, #(16*(\num*4+\idx))] + ldr vAgaq, [input_addr, #(16*(\num*5+\idx))] + ldr vAgeq, [input_addr, #(16*(\num*6+\idx))] + ldr vAgiq, [input_addr, #(16*(\num*7+\idx))] + ldr vAgoq, [input_addr, #(16*(\num*8+\idx))] + ldr vAguq, [input_addr, #(16*(\num*9+\idx))] + ldr vAkaq, [input_addr, #(16*(\num*10+\idx))] + ldr vAkeq, [input_addr, #(16*(\num*11+\idx))] + ldr vAkiq, [input_addr, #(16*(\num*12+\idx))] + ldr vAkoq, [input_addr, #(16*(\num*13+\idx))] + ldr vAkuq, [input_addr, #(16*(\num*14+\idx))] + ldr vAmaq, [input_addr, #(16*(\num*15+\idx))] + ldr vAmeq, [input_addr, #(16*(\num*16+\idx))] + ldr vAmiq, [input_addr, #(16*(\num*17+\idx))] + ldr vAmoq, [input_addr, #(16*(\num*18+\idx))] + ldr vAmuq, [input_addr, #(16*(\num*19+\idx))] + ldr vAsaq, [input_addr, #(16*(\num*20+\idx))] + ldr vAseq, [input_addr, #(16*(\num*21+\idx))] + ldr vAsiq, [input_addr, #(16*(\num*22+\idx))] + ldr vAsoq, [input_addr, #(16*(\num*23+\idx))] + ldr vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm +/* store_input_vector num idx: writes the 25 vector lanes back, lane i to input_addr + 16*(num*i + idx). */ +.macro store_input_vector num idx + str vAbaq, [input_addr, #(16*(\num*0+\idx))] + str vAbeq, [input_addr, #(16*(\num*1+\idx))] + str vAbiq, [input_addr, #(16*(\num*2+\idx))] + str vAboq, [input_addr, #(16*(\num*3+\idx))] + str vAbuq, [input_addr, #(16*(\num*4+\idx))] + str vAgaq, [input_addr, #(16*(\num*5+\idx))] + str vAgeq, [input_addr, #(16*(\num*6+\idx))] + str vAgiq, [input_addr, #(16*(\num*7+\idx))] + str vAgoq, [input_addr, #(16*(\num*8+\idx))] + str vAguq, [input_addr, #(16*(\num*9+\idx))] + str vAkaq, [input_addr, #(16*(\num*10+\idx))] + str vAkeq, [input_addr, #(16*(\num*11+\idx))] + str vAkiq, [input_addr, 
#(16*(\num*12+\idx))] + str vAkoq, [input_addr, #(16*(\num*13+\idx))] + str vAkuq, [input_addr, #(16*(\num*14+\idx))] + str vAmaq, [input_addr, #(16*(\num*15+\idx))] + str vAmeq, [input_addr, #(16*(\num*16+\idx))] + str vAmiq, [input_addr, #(16*(\num*17+\idx))] + str vAmoq, [input_addr, #(16*(\num*18+\idx))] + str vAmuq, [input_addr, #(16*(\num*19+\idx))] + str vAsaq, [input_addr, #(16*(\num*20+\idx))] + str vAseq, [input_addr, #(16*(\num*21+\idx))] + str vAsiq, [input_addr, #(16*(\num*22+\idx))] + str vAsoq, [input_addr, #(16*(\num*23+\idx))] + str vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm +/* store_input_scalar num idx: writes the 25 scalar lanes; lane i (i = 0..24, in the order ba, be, ..., su) goes as a 64-bit word to input_addr + 8*(num*i + idx). The (k+1) factors are just the odd lane indices. */ +.macro store_input_scalar num idx + str s_Aba, [input_addr, 8*(\num*(0) +\idx)] + str sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + str sAbi, [input_addr, 8*(\num*(2)+ \idx)] + str sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + str sAbu, [input_addr, 8*(\num*(4)+ \idx)] + str sAga, [input_addr, 8*(\num*(4+1) +\idx)] + str sAge, [input_addr, 8*(\num*(6)+ \idx)] + str sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + str sAgo, [input_addr, 8*(\num*(8)+ \idx)] + str sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + str sAka, [input_addr, 8*(\num*(10) +\idx)] + str sAke, [input_addr, 8*(\num*(10+1)+\idx)] + str sAki, [input_addr, 8*(\num*(12) +\idx)] + str sAko, [input_addr, 8*(\num*(12+1)+\idx)] + str sAku, [input_addr, 8*(\num*(14) +\idx)] + str sAma, [input_addr, 8*(\num*(14+1)+\idx)] + str sAme, [input_addr, 8*(\num*(16) +\idx)] + str sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + str sAmo, [input_addr, 8*(\num*(18) +\idx)] + str sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + str sAsa, [input_addr, 8*(\num*(20) +\idx)] + str sAse, [input_addr, 8*(\num*(20+1)+\idx)] + str sAsi, [input_addr, 8*(\num*(22) +\idx)] + str sAso, [input_addr, 8*(\num*(22+1)+\idx)] + str sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm +/* load_input_scalar num idx: reads the 25 scalar lanes, lane i from input_addr + 8*(num*i + idx). */ +.macro load_input_scalar num idx + ldr s_Aba, [input_addr, 8*(\num*(0) +\idx)] + ldr sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + ldr sAbi, [input_addr, 8*(\num*(2)+ \idx)] + ldr sAbo, 
[input_addr, 8*(\num*(2+1) +\idx)] + ldr sAbu, [input_addr, 8*(\num*(4)+ \idx)] + ldr sAga, [input_addr, 8*(\num*(4+1) +\idx)] + ldr sAge, [input_addr, 8*(\num*(6)+ \idx)] + ldr sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + ldr sAgo, [input_addr, 8*(\num*(8)+ \idx)] + ldr sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + ldr sAka, [input_addr, 8*(\num*(10) +\idx)] + ldr sAke, [input_addr, 8*(\num*(10+1)+\idx)] + ldr sAki, [input_addr, 8*(\num*(12) +\idx)] + ldr sAko, [input_addr, 8*(\num*(12+1)+\idx)] + ldr sAku, [input_addr, 8*(\num*(14) +\idx)] + ldr sAma, [input_addr, 8*(\num*(14+1)+\idx)] + ldr sAme, [input_addr, 8*(\num*(16) +\idx)] + ldr sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + ldr sAmo, [input_addr, 8*(\num*(18) +\idx)] + ldr sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + ldr sAsa, [input_addr, 8*(\num*(20) +\idx)] + ldr sAse, [input_addr, 8*(\num*(20+1)+\idx)] + ldr sAsi, [input_addr, 8*(\num*(22) +\idx)] + ldr sAso, [input_addr, 8*(\num*(22+1)+\idx)] + ldr sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm +/* Stack frame, 192 bytes: [0,96) x19-x30 (save_gprs), [96,160) d8-d15 (save_vregs), [160,192) four 8-byte temporary slots addressed through save/restore, of which the input, const and count offsets below name the first three. */ +#define STACK_SIZE (4*16 + 8*12 + 4*8) +#define STACK_BASE_GPRS (0) +#define STACK_BASE_VREGS (12*8) +#define STACK_BASE_TMP_GPRS (12*8 + 4*16) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) + +/* save / restore: spill or reload one register at the given offset inside the temporary-slot area */ +.macro save reg, offset + str \reg, [sp, #(STACK_BASE_TMP_GPRS + \offset)] +.endm + +.macro restore reg, offset + ldr \reg, [sp, #(STACK_BASE_TMP_GPRS + \offset)] +.endm +/* save_gprs / restore_gprs: x19-x30 as six register pairs at STACK_BASE_GPRS */ +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, 
#(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm +/* save_vregs / restore_vregs: only the low 64 bits (d8-d15) of v8-v15 are spilled and reloaded */ +.macro save_vregs + stp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] + stp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + stp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + stp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] +.endm + +.macro restore_vregs + ldp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] + ldp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + ldp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + ldp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] +.endm + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm +/* eor5: dst = src0 ^ src1 ^ src2 ^ src3 ^ src4 */ +.macro eor5 dst, src0, src1, src2, src3, src4 + eor \dst, \src0, \src1 + eor \dst, \dst, \src2 + eor \dst, \dst, \src3 + eor \dst, \dst, \src4 +.endm +/* xor_rol: dst = src0 ^ rol(src1, imm), encoded as a ROR by 64-imm */ +.macro xor_rol dst, src1, src0, imm + eor \dst, \src0, \src1, ROR #(64-\imm) +.endm +/* bic_rol: dst = src0 & ~rol(src1, imm) */ +.macro bic_rol dst, src1, src0, imm + bic \dst, \src0, \src1, ROR #(64-\imm) +.endm +/* rotate: dst = rol(src, imm) */ +.macro rotate dst, src, imm + ror \dst, \src, #(64-\imm) +.endm +/* hybrid_round_initial: the first two Keccak-f1600 rounds. Each line pairs one scalar (x-register) instruction with at most one vector macro, separated by SEP (defined outside this section, presumably in macros.s). The first scalar round reads its lanes unrotated; the second applies ROR to its lane operands, which suggests rotations are deferred after the first round -- NOTE(review): confirm. count leaves this macro as 2. */ +.macro hybrid_round_initial +eor sC0, sAma, sAsa SEP +eor sC1, sAme, sAse SEP eor3_m1 C0, vAba, vAga, vAka +eor sC2, sAmi, sAsi SEP eor3_m1 C0, C0, vAma, vAsa +eor sC3, sAmo, sAso SEP +eor sC4, sAmu, sAsu SEP eor3_m1 C1, vAbe, vAge, vAke +eor sC0, sAka, sC0 SEP eor3_m1 C1, C1, vAme, vAse +eor sC1, sAke, sC1 SEP +eor sC2, sAki, sC2 SEP eor3_m1 C2, vAbi, vAgi, vAki +eor sC3, sAko, sC3 SEP eor3_m1 C2, C2, vAmi, vAsi +eor sC4, sAku, sC4 SEP +eor sC0, sAga, sC0 SEP eor3_m1 C3, vAbo, vAgo, vAko +eor sC1, sAge, sC1 SEP eor3_m1 C3, C3, vAmo, vAso +eor sC2, sAgi, sC2 SEP +eor sC3, sAgo, sC3 SEP eor3_m1 C4, vAbu, vAgu, vAku +eor sC4, sAgu, sC4 SEP eor3_m1 C4, C4, vAmu, vAsu +eor sC0, s_Aba, sC0 SEP +eor sC1, sAbe, sC1 SEP rax1_m1 E1, C0, C2 +eor sC2, sAbi, sC2 SEP rax1_m1 E3, C2, C4 +eor sC3, sAbo, sC3 SEP +eor sC4, sAbu, sC4 SEP rax1_m1 E0, C4, C1 +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP rax1_m1 E2, C1, C3 +eor sE0, sC4, sC1, ROR #63 SEP rax1_m1 E4, C3, C0 
+eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP eor vAba_.16b, vAba.16b, E0.16b +eor s_Aba_, s_Aba, sE0 SEP xar_m1 vAsa_, vAbi, E2, 2 +eor sAsa_, sAbi, sE2 SEP +eor sAbi_, sAki, sE2 SEP xar_m1 vAbi_, vAki, E2, 21 +eor sAki_, sAko, sE3 SEP xar_m1 vAki_, vAko, E3, 39 +eor sAko_, sAmu, sE4 SEP +eor sAmu_, sAso, sE3 SEP xar_m1 vAko_, vAmu, E4, 56 +eor sAso_, sAma, sE0 SEP xar_m1 vAmu_, vAso, E3, 8 +eor sAka_, sAbe, sE1 SEP +eor sAse_, sAgo, sE3 SEP xar_m1 vAso_, vAma, E0, 23 +eor sAgo_, sAme, sE1 SEP xar_m1 vAka_, vAbe, E1, 63 +eor sAke_, sAgi, sE2 SEP +eor sAgi_, sAka, sE0 SEP xar_m1 vAse_, vAgo, E3, 9 +eor sAga_, sAbo, sE3 SEP +eor sAbo_, sAmo, sE3 SEP xar_m1 vAgo_, vAme, E1, 19 +eor sAmo_, sAmi, sE2 SEP xar_m1 vAke_, vAgi, E2, 58 +eor sAmi_, sAke, sE1 SEP +eor sAge_, sAgu, sE4 SEP xar_m1 vAgi_, vAka, E0, 61 +eor sAgu_, sAsi, sE2 SEP xar_m1 vAga_, vAbo, E3, 36 +eor sAsi_, sAku, sE4 SEP +eor sAku_, sAsa, sE0 SEP xar_m1 vAbo_, vAmo, E3, 43 +eor sAma_, sAbu, sE4 SEP xar_m1 vAmo_, vAmi, E2, 49 +eor sAbu_, sAsu, sE4 SEP +eor sAsu_, sAse, sE1 SEP xar_m1 vAmi_, vAke, E1, 54 +eor sAme_, sAga, sE0 SEP xar_m1 vAge_, vAgu, E4, 44 +eor sAbe_, sAge, sE1 SEP +load_constant_ptr SEP xar_m1 vAgu_, vAsi, E2, 3 +bic tmp, sAgi_, sAge_, ROR #47 SEP xar_m1 vAsi_, vAku, E4, 25 +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP xar_m1 vAku_, vAsa, E0, 46 +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP xar_m1 vAma_, vAbu, E4, 37 +eor sAgi, tmp, sAgi_, ROR #58 SEP xar_m1 vAbu_, vAsu, E4, 50 +bic tmp, sAga_, sAgu_, ROR #31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP xar_m1 vAsu_, vAse, E1, 62 +bic tmp, sAge_, sAga_, ROR #56 SEP xar_m1 vAme_, vAga, E0, 28 +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP xar_m1 vAbe_, vAge, E1, 20 +eor sAka, tmp, sAka_, ROR #24 SEP bcax_m1 vAga, vAga_, vAgi_, vAge_ +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP bcax_m1 vAge, vAge_, vAgo_, vAgi_ +bic tmp, sAku_, 
sAko_, ROR #10 SEP bcax_m1 vAgi, vAgi_, vAgu_, vAgo_ +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP bcax_m1 vAgo, vAgo_, vAga_, vAgu_ +eor sAko, tmp, sAko_, ROR #57 SEP bcax_m1 vAgu, vAgu_, vAge_, vAga_ +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP bcax_m1 vAka, vAka_, vAki_, vAke_ +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP restore x26, STACK_OFFSET_CONST +bic tmp, sAmo_, sAmi_, ROR #5 SEP ld1r {v28.2d}, [x26], #8 +eor sAme, tmp, sAme_, ROR #43 SEP save x26, STACK_OFFSET_CONST +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP bcax_m1 vAke, vAke_, vAko_, vAki_ +ldr cur_const, [const_addr] SEP bcax_m1 vAki, vAki_, vAku_, vAko_ +mov count, #1 SEP +bic tmp, sAma_, sAmu_, ROR #35 SEP bcax_m1 vAko, vAko_, vAka_, vAku_ +eor sAmo, tmp, sAmo_, ROR #12 SEP bcax_m1 vAku, vAku_, vAke_, vAka_ +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP bcax_m1 vAma, vAma_, vAmi_, vAme_ +eor sAsa, tmp, sAsa_, ROR #41 SEP bcax_m1 vAme, vAme_, vAmo_, vAmi_ +bic tmp, sAso_, sAsi_, ROR #2 SEP bcax_m1 vAmi, vAmi_, vAmu_, vAmo_ +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP bcax_m1 vAmo, vAmo_, vAma_, vAmu_ +eor sAsi, tmp, sAsi_, ROR #27 SEP bcax_m1 vAmu, vAmu_, vAme_, vAma_ +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP bcax_m1 vAsa, vAsa_, vAsi_, vAse_ +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP bcax_m1 vAse, vAse_, vAso_, vAsi_ +bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m1 vAsi, vAsi_, vAsu_, vAso_ +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP bcax_m1 vAso, vAso_, vAsa_, vAsu_ +eor sAbe, tmp, sAbe_, ROR #41 SEP bcax_m1 vAsu, vAsu_, vAse_, vAsa_ +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP bcax_m1 vAba, vAba_, vAbi_, vAbe_ +bic tmp, s_Aba_, sAbu_, ROR #50 SEP bcax_m1 vAbe, vAbe_, vAbo_, vAbi_ +eor sAbo, tmp, sAbo_, ROR 
#43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP bcax_m1 vAbi, vAbi_, vAbu_, vAbo_ +eor sAbu, tmp, sAbu_, ROR #30 SEP bcax_m1 vAbo, vAbo_, vAba_, vAbu_ +eor s_Aba, s_Aba, cur_const SEP +save count, STACK_OFFSET_COUNT SEP bcax_m1 vAbu, vAbu_, vAbe_, vAba_ +eor sC0, sAka, sAsa, ROR #50 SEP eor vAba.16b, vAba.16b, v28.16b +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP eor3_m1 C0, vAba, vAga, vAka +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP eor3_m1 C0, C0, vAma, vAsa +eor sC0, sAma, sC0, ROR #49 SEP eor3_m1 C1, vAbe, vAge, vAke +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP eor3_m1 C1, C1, vAme, vAse +eor sC3, sAmo, sC3, ROR #63 SEP eor3_m1 C2, vAbi, vAgi, vAki +eor sC4, sAmu, sC4, ROR #56 SEP +eor sC0, sAga, sC0, ROR #57 SEP eor3_m1 C2, C2, vAmi, vAsi +eor sC1, sAme, sC1, ROR #58 SEP eor3_m1 C3, vAbo, vAgo, vAko +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP eor3_m1 C3, C3, vAmo, vAso +eor sC4, sAgu, sC4, ROR #48 SEP eor3_m1 C4, vAbu, vAgu, vAku +eor sC0, s_Aba, sC0, ROR #61 SEP +eor sC1, sAke, sC1, ROR #57 SEP eor3_m1 C4, C4, vAmu, vAsu +eor sC2, sAsi, sC2, ROR #52 SEP rax1_m1 E1, C0, C2 +eor sC3, sAbo, sC3, ROR #63 SEP +eor sC4, sAku, sC4, ROR #50 SEP rax1_m1 E3, C2, C4 +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP rax1_m1 E0, C4, C1 +ror sC2, sC2, 62 SEP rax1_m1 E2, C1, C3 +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP rax1_m1 E4, C3, C0 +eor sE0, sC4, sC1, ROR #63 SEP eor vAba_.16b, vAba.16b, E0.16b +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP xar_m1 vAsa_, vAbi, E2, 2 +eor s_Aba_, sE0, s_Aba SEP xar_m1 vAbi_, vAki, E2, 21 +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP xar_m1 vAki_, vAko, E3, 39 +eor sAki_, sE3, sAko, ROR #63 SEP xar_m1 vAko_, vAmu, E4, 56 +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP xar_m1 vAmu_, vAso, E3, 8 +eor sAso_, sE0, sAma, ROR #54 SEP xar_m1 vAso_, vAma, E0, 23 +eor sAka_, sE1, 
sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP xar_m1 vAka_, vAbe, E1, 63 +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP xar_m1 vAse_, vAgo, E3, 9 +eor sAgi_, sE0, sAka, ROR #39 SEP xar_m1 vAgo_, vAme, E1, 19 +eor sAga_, sE3, sAbo SEP +eor sAbo_, sE3, sAmo, ROR #37 SEP xar_m1 vAke_, vAgi, E2, 58 +eor sAmo_, sE2, sAmi, ROR #8 SEP xar_m1 vAgi_, vAka, E0, 61 +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP xar_m1 vAga_, vAbo, E3, 36 +eor sAgu_, sE2, sAsi, ROR #62 SEP xar_m1 vAbo_, vAmo, E3, 43 +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP xar_m1 vAmo_, vAmi, E2, 49 +eor sAma_, sE4, sAbu, ROR #20 SEP xar_m1 vAmi_, vAke, E1, 54 +eor sAbu_, sE4, sAsu, ROR #9 SEP +eor sAsu_, sE1, sAse, ROR #23 SEP xar_m1 vAge_, vAgu, E4, 44 +eor sAme_, sE0, sAga, ROR #61 SEP xar_m1 vAgu_, vAsi, E2, 3 +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP xar_m1 vAsi_, vAku, E4, 25 +restore count, STACK_OFFSET_COUNT SEP xar_m1 vAku_, vAsa, E0, 46 +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP xar_m1 vAma_, vAbu, E4, 37 +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP xar_m1 vAbu_, vAsu, E4, 50 +bic tmp, sAgu_, sAgo_, ROR #16 SEP xar_m1 vAsu_, vAse, E1, 62 +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP xar_m1 vAme_, vAga, E0, 28 +eor sAgo, tmp, sAgo_, ROR #47 SEP xar_m1 vAbe_, vAge, E1, 20 +bic tmp, sAge_, sAga_, ROR #56 SEP +eor sAgu, tmp, sAgu_, ROR #23 SEP bcax_m1 vAga, vAga_, vAgi_, vAge_ +bic tmp, sAki_, sAke_, ROR #19 SEP bcax_m1 vAge, vAge_, vAgo_, vAgi_ +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP bcax_m1 vAgi, vAgi_, vAgu_, vAgo_ +eor sAke, tmp, sAke_, ROR #2 SEP bcax_m1 vAgo, vAgo_, vAga_, vAgu_ +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP bcax_m1 vAgu, vAgu_, vAge_, vAga_ +bic tmp, sAka_, sAku_, ROR #47 SEP bcax_m1 vAka, vAka_, vAki_, vAke_ +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, 
sAke_, sAka_, ROR #5 SEP bcax_m1 vAke, vAke_, vAko_, vAki_ +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP bcax_m1 vAki, vAki_, vAku_, vAko_ +eor sAma, tmp, sAma_, ROR #47 SEP bcax_m1 vAko, vAko_, vAka_, vAku_ +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP bcax_m1 vAku, vAku_, vAke_, vAka_ +bic tmp, sAmu_, sAmo_, ROR #41 SEP restore x26, STACK_OFFSET_CONST +eor sAmi, tmp, sAmi_, ROR #46 SEP ld1r {v28.2d}, [x26], #8 +bic tmp, sAma_, sAmu_, ROR #35 SEP save x26, STACK_OFFSET_CONST +ldr cur_const, [const_addr, count, UXTW #3] SEP bcax_m1 vAme, vAme_, vAmo_, vAmi_ +eor sAmo, tmp, sAmo_, ROR #12 SEP bcax_m1 vAma, vAma_, vAmi_, vAme_ +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP bcax_m1 vAmi, vAmi_, vAmu_, vAmo_ +bic tmp, sAsi_, sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP bcax_m1 vAmo, vAmo_, vAma_, vAmu_ +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP bcax_m1 vAmu, vAmu_, vAme_, vAma_ +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP bcax_m1 vAsa, vAsa_, vAsi_, vAse_ +eor sAso, tmp, sAso_, ROR #21 SEP bcax_m1 vAse, vAse_, vAso_, vAsi_ +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP bcax_m1 vAsi, vAsi_, vAsu_, vAso_ +bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m1 vAso, vAso_, vAsa_, vAsu_ +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP bcax_m1 vAsu, vAsu_, vAse_, vAsa_ +eor sAbe, tmp, sAbe_, ROR #41 SEP bcax_m1 vAba, vAba_, vAbi_, vAbe_ +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP bcax_m1 vAbe, vAbe_, vAbo_, vAbi_ +bic tmp, s_Aba_, sAbu_, ROR #50 SEP bcax_m1 vAbi, vAbi_, vAbu_, vAbo_ +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP bcax_m1 vAbo, vAbo_, vAba_, vAbu_ +eor sAbu, tmp, sAbu_, ROR #30 SEP bcax_m1 vAbu, vAbu_, vAbe_, vAba_ +add count, count, #1 SEP +eor s_Aba, s_Aba, cur_const SEP eor vAba.16b, vAba.16b, v28.16b 
+.endm + +.macro hybrid_round_noninitial /* Interleaves a scalar instruction stream (left of SEP) with a vector stream (right of SEP). The round sequence appears twice in this body and count is incremented twice, so one expansion advances count by two. The scalar round constant is loaded into cur_const from const_addr indexed by count; the vector side reloads a pointer from the STACK_OFFSET_CONST slot into x26, loads v28 with ld1r (post-increment 8) and stores the pointer back. */ +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP eor3_m1 C0, vAba, vAga, vAka +eor sC1, sAse, sAge, ROR #60 SEP eor3_m1 C0, C0, vAma, vAsa +eor sC2, sAmi, sAgi, ROR #59 SEP +eor sC3, sAgo, sAso, ROR #30 SEP eor3_m1 C1, vAbe, vAge, vAke +eor sC4, sAbu, sAsu, ROR #53 SEP eor3_m1 C1, C1, vAme, vAse +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP eor3_m1 C2, vAbi, vAgi, vAki +eor sC2, sAki, sC2, ROR #26 SEP eor3_m1 C2, C2, vAmi, vAsi +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP eor3_m1 C3, vAbo, vAgo, vAko +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP eor3_m1 C3, C3, vAmo, vAso +eor sC2, sAbi, sC2, ROR #60 SEP eor3_m1 C4, vAbu, vAgu, vAku +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP eor3_m1 C4, C4, vAmu, vAsu +eor sC0, s_Aba, sC0, ROR #61 SEP rax1_m1 E1, C0, C2 +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP rax1_m1 E3, C2, C4 +eor sC3, sAbo, sC3, ROR #63 SEP rax1_m1 E0, C4, C1 +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP rax1_m1 E2, C1, C3 +ror sC4, sC4, 58 SEP +ror sC2, sC2, 62 SEP rax1_m1 E4, C3, C0 +eor sE1, sC0, sC2, ROR #63 SEP eor vAba_.16b, vAba.16b, E0.16b +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP xar_m1 vAsa_, vAbi, E2, 2 +eor sE2, sC1, sC3, ROR #63 SEP xar_m1 vAbi_, vAki, E2, 21 +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP xar_m1 vAki_, vAko, E3, 39 +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP xar_m1 vAko_, vAmu, E4, 56 +eor sAki_, sE3, sAko, ROR #63 SEP xar_m1 vAmu_, vAso, E3, 8 +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP xar_m1 vAso_, vAma, E0, 23 +eor sAso_, sE0, sAma, ROR #54 SEP xar_m1 vAka_, vAbe, E1, 63 +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP xar_m1 vAse_, vAgo, E3, 9 +eor sAgo_, sE1, sAme, ROR #49 SEP xar_m1 vAgo_, vAme, E1, 19 +eor sAke_, sE2, sAgi, ROR #3 SEP +eor
sAgi_, sE0, sAka, ROR #39 SEP xar_m1 vAke_, vAgi, E2, 58 +eor sAga_, sE3, sAbo SEP +eor sAbo_, sE3, sAmo, ROR #37 SEP xar_m1 vAgi_, vAka, E0, 61 +eor sAmo_, sE2, sAmi, ROR #8 SEP xar_m1 vAga_, vAbo, E3, 36 +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP xar_m1 vAbo_, vAmo, E3, 43 +eor sAgu_, sE2, sAsi, ROR #62 SEP xar_m1 vAmo_, vAmi, E2, 49 +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP xar_m1 vAmi_, vAke, E1, 54 +eor sAma_, sE4, sAbu, ROR #20 SEP xar_m1 vAge_, vAgu, E4, 44 +eor sAbu_, sE4, sAsu, ROR #9 SEP +eor sAsu_, sE1, sAse, ROR #23 SEP xar_m1 vAgu_, vAsi, E2, 3 +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP xar_m1 vAsi_, vAku, E4, 25 +load_constant_ptr SEP xar_m1 vAku_, vAsa, E0, 46 +restore count, STACK_OFFSET_COUNT SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP xar_m1 vAma_, vAbu, E4, 37 +eor sAga, tmp, sAga_, ROR #39 SEP xar_m1 vAbu_, vAsu, E4, 50 +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP xar_m1 vAsu_, vAse, E1, 62 +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP xar_m1 vAme_, vAga, E0, 28 +bic tmp, sAga_, sAgu_, ROR #31 SEP xar_m1 vAbe_, vAge, E1, 20 +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP bcax_m1 vAga, vAga_, vAgi_, vAge_ +eor sAgu, tmp, sAgu_, ROR #23 SEP bcax_m1 vAge, vAge_, vAgo_, vAgi_ +bic tmp, sAki_, sAke_, ROR #19 SEP +eor sAka, tmp, sAka_, ROR #24 SEP bcax_m1 vAgi, vAgi_, vAgu_, vAgo_ +bic tmp, sAko_, sAki_, ROR #47 SEP bcax_m1 vAgo, vAgo_, vAga_, vAgu_ +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP bcax_m1 vAgu, vAgu_, vAge_, vAga_ +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP bcax_m1 vAka, vAka_, vAki_, vAke_ +eor sAko, tmp, sAko_, ROR #57 SEP bcax_m1 vAke, vAke_, vAko_, vAki_ +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP bcax_m1 vAki, vAki_, vAku_, vAko_ +bic tmp, sAmi_, sAme_, ROR #38 SEP bcax_m1 vAko, vAko_, vAka_, vAku_ +eor
sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP bcax_m1 vAku, vAku_, vAke_, vAka_ +eor sAme, tmp, sAme_, ROR #43 SEP bcax_m1 vAma, vAma_, vAmi_, vAme_ +bic tmp, sAmu_, sAmo_, ROR #41 SEP restore x26, STACK_OFFSET_CONST +eor sAmi, tmp, sAmi_, ROR #46 SEP ld1r {v28.2d}, [x26], #8 +bic tmp, sAma_, sAmu_, ROR #35 SEP save x26, STACK_OFFSET_CONST +ldr cur_const, [const_addr, count, UXTW #3] SEP +add count, count, #1 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP bcax_m1 vAme, vAme_, vAmo_, vAmi_ +bic tmp, sAme_, sAma_, ROR #9 SEP bcax_m1 vAmi, vAmi_, vAmu_, vAmo_ +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP bcax_m1 vAmo, vAmo_, vAma_, vAmu_ +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP bcax_m1 vAmu, vAmu_, vAme_, vAma_ +bic tmp, sAsu_, sAso_, ROR #25 SEP bcax_m1 vAsa, vAsa_, vAsi_, vAse_ +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP bcax_m1 vAse, vAse_, vAso_, vAsi_ +eor sAso, tmp, sAso_, ROR #21 SEP bcax_m1 vAsi, vAsi_, vAsu_, vAso_ +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP bcax_m1 vAso, vAso_, vAsa_, vAsu_ +bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m1 vAsu, vAsu_, vAse_, vAsa_ +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP bcax_m1 vAba, vAba_, vAbi_, vAbe_ +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP bcax_m1 vAbe, vAbe_, vAbo_, vAbi_ +eor sAbi, tmp, sAbi_, ROR #35 SEP bcax_m1 vAbi, vAbi_, vAbu_, vAbo_ +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP bcax_m1 vAbo, vAbo_, vAba_, vAbu_ +bic tmp, sAbe_, s_Aba_, ROR #44 SEP bcax_m1 vAbu, vAbu_, vAbe_, vAba_ +eor sAbu, tmp, sAbu_, ROR #30 SEP +eor s_Aba, s_Aba, cur_const SEP eor vAba.16b, vAba.16b, v28.16b /* end of the first copy of the round sequence; the second copy follows */ +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP eor3_m1 C0, vAba, vAga, vAka +eor sC1, sAse, sAge, ROR #60 SEP eor3_m1 C0, C0, vAma, vAsa +eor sC2, sAmi, sAgi, ROR #59 SEP +eor sC3,
sAgo, sAso, ROR #30 SEP eor3_m1 C1, vAbe, vAge, vAke +eor sC4, sAbu, sAsu, ROR #53 SEP eor3_m1 C1, C1, vAme, vAse +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP eor3_m1 C2, vAbi, vAgi, vAki +eor sC2, sAki, sC2, ROR #26 SEP eor3_m1 C2, C2, vAmi, vAsi +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP eor3_m1 C3, vAbo, vAgo, vAko +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP eor3_m1 C3, C3, vAmo, vAso +eor sC2, sAbi, sC2, ROR #60 SEP eor3_m1 C4, vAbu, vAgu, vAku +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP eor3_m1 C4, C4, vAmu, vAsu +eor sC0, s_Aba, sC0, ROR #61 SEP rax1_m1 E1, C0, C2 +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP rax1_m1 E3, C2, C4 +eor sC3, sAbo, sC3, ROR #63 SEP rax1_m1 E0, C4, C1 +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP rax1_m1 E2, C1, C3 +ror sC4, sC4, 58 SEP +ror sC2, sC2, 62 SEP rax1_m1 E4, C3, C0 +eor sE1, sC0, sC2, ROR #63 SEP eor vAba_.16b, vAba.16b, E0.16b +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP xar_m1 vAsa_, vAbi, E2, 2 +eor sE2, sC1, sC3, ROR #63 SEP xar_m1 vAbi_, vAki, E2, 21 +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP xar_m1 vAki_, vAko, E3, 39 +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP xar_m1 vAko_, vAmu, E4, 56 +eor sAki_, sE3, sAko, ROR #63 SEP xar_m1 vAmu_, vAso, E3, 8 +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP xar_m1 vAso_, vAma, E0, 23 +eor sAso_, sE0, sAma, ROR #54 SEP xar_m1 vAka_, vAbe, E1, 63 +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP xar_m1 vAse_, vAgo, E3, 9 +eor sAgo_, sE1, sAme, ROR #49 SEP xar_m1 vAgo_, vAme, E1, 19 +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP xar_m1 vAke_, vAgi, E2, 58 +eor sAga_, sE3, sAbo SEP +eor sAbo_, sE3, sAmo, ROR #37 SEP xar_m1 vAgi_, vAka, E0, 61 +eor sAmo_, sE2, sAmi, ROR #8 SEP xar_m1 vAga_, vAbo, E3, 36 +eor sAmi_, sE1, sAke, ROR #56 SEP +eor
sAge_, sE4, sAgu, ROR #44 SEP xar_m1 vAbo_, vAmo, E3, 43 +eor sAgu_, sE2, sAsi, ROR #62 SEP xar_m1 vAmo_, vAmi, E2, 49 +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP xar_m1 vAmi_, vAke, E1, 54 +eor sAma_, sE4, sAbu, ROR #20 SEP xar_m1 vAge_, vAgu, E4, 44 +eor sAbu_, sE4, sAsu, ROR #9 SEP +eor sAsu_, sE1, sAse, ROR #23 SEP xar_m1 vAgu_, vAsi, E2, 3 +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP xar_m1 vAsi_, vAku, E4, 25 +load_constant_ptr SEP xar_m1 vAku_, vAsa, E0, 46 +restore count, STACK_OFFSET_COUNT SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP xar_m1 vAma_, vAbu, E4, 37 +eor sAga, tmp, sAga_, ROR #39 SEP xar_m1 vAbu_, vAsu, E4, 50 +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP xar_m1 vAsu_, vAse, E1, 62 +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP xar_m1 vAme_, vAga, E0, 28 +bic tmp, sAga_, sAgu_, ROR #31 SEP xar_m1 vAbe_, vAge, E1, 20 +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP bcax_m1 vAga, vAga_, vAgi_, vAge_ +eor sAgu, tmp, sAgu_, ROR #23 SEP bcax_m1 vAge, vAge_, vAgo_, vAgi_ +bic tmp, sAki_, sAke_, ROR #19 SEP +eor sAka, tmp, sAka_, ROR #24 SEP bcax_m1 vAgi, vAgi_, vAgu_, vAgo_ +bic tmp, sAko_, sAki_, ROR #47 SEP bcax_m1 vAgo, vAgo_, vAga_, vAgu_ +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP bcax_m1 vAgu, vAgu_, vAge_, vAga_ +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP bcax_m1 vAka, vAka_, vAki_, vAke_ +eor sAko, tmp, sAko_, ROR #57 SEP bcax_m1 vAke, vAke_, vAko_, vAki_ +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP bcax_m1 vAki, vAki_, vAku_, vAko_ +bic tmp, sAmi_, sAme_, ROR #38 SEP bcax_m1 vAko, vAko_, vAka_, vAku_ +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP bcax_m1 vAku, vAku_, vAke_, vAka_ +eor sAme, tmp, sAme_, ROR #43 SEP bcax_m1 vAma, vAma_, vAmi_, vAme_ +bic tmp, sAmu_, sAmo_, ROR #41 SEP restore x26, STACK_OFFSET_CONST +eor sAmi,
tmp, sAmi_, ROR #46 SEP ld1r {v28.2d}, [x26], #8 +bic tmp, sAma_, sAmu_, ROR #35 SEP save x26, STACK_OFFSET_CONST +ldr cur_const, [const_addr, count, UXTW #3] SEP +add count, count, #1 SEP bcax_m1 vAme, vAme_, vAmo_, vAmi_ +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP bcax_m1 vAmi, vAmi_, vAmu_, vAmo_ +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP bcax_m1 vAmo, vAmo_, vAma_, vAmu_ +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP bcax_m1 vAmu, vAmu_, vAme_, vAma_ +bic tmp, sAsu_, sAso_, ROR #25 SEP bcax_m1 vAsa, vAsa_, vAsi_, vAse_ +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP bcax_m1 vAse, vAse_, vAso_, vAsi_ +eor sAso, tmp, sAso_, ROR #21 SEP bcax_m1 vAsi, vAsi_, vAsu_, vAso_ +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP bcax_m1 vAso, vAso_, vAsa_, vAsu_ +bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m1 vAsu, vAsu_, vAse_, vAsa_ +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP bcax_m1 vAba, vAba_, vAbi_, vAbe_ +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP bcax_m1 vAbe, vAbe_, vAbo_, vAbi_ +eor sAbi, tmp, sAbi_, ROR #35 SEP bcax_m1 vAbi, vAbi_, vAbu_, vAbo_ +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP bcax_m1 vAbo, vAbo_, vAba_, vAbu_ +bic tmp, sAbe_, s_Aba_, ROR #44 SEP bcax_m1 vAbu, vAbu_, vAbe_, vAba_ +eor sAbu, tmp, sAbu_, ROR #30 SEP +eor s_Aba, s_Aba, cur_const SEP eor vAba.16b, vAba.16b, v28.16b +.endm + +.macro final_rotate /* Rotates 23 scalar lanes in place by fixed amounts written as 64-k; s_Aba and sAbo are not touched. NOTE(review): presumably this undoes rotations the round macros leave pending on the scalar state - confirm. */ + ror sAga, sAga,#(64-3) + ror sAka, sAka,#(64-25) + ror sAma, sAma,#(64-10) + ror sAsa, sAsa,#(64-39) + ror sAbe, sAbe,#(64-21) + ror sAge, sAge,#(64-45) + ror sAke, sAke,#(64-8) + ror sAme, sAme,#(64-15) + ror sAse, sAse,#(64-41) + ror sAbi, sAbi,#(64-14) + ror sAgi, sAgi,#(64-61) + ror sAki, sAki,#(64-18) + ror sAmi, sAmi,#(64-56) + ror sAsi, sAsi,#(64-2) + ror sAgo, sAgo,#(64-28) + ror sAko, sAko,#(64-1) + ror
sAmo, sAmo,#(64-27) + ror sAso, sAso,#(64-62) + ror sAbu, sAbu,#(64-44) + ror sAgu, sAgu,#(64-20) + ror sAku, sAku,#(64-6) + ror sAmu, sAmu,#(64-36) + ror sAsu, sAsu,#(64-55) +.endm + +#define KECCAK_F1600_ROUNDS 24 /* bound that count is compared against in the round loop below */ + +.global keccak_f1600_x3_hybrid_asm_v3p +.global _keccak_f1600_x3_hybrid_asm_v3p +.text +.align 4 + +keccak_f1600_x3_hybrid_asm_v3p: +_keccak_f1600_x3_hybrid_asm_v3p: /* Entry point, exported with and without a leading underscore. Saves input_addr in the STACK_OFFSET_INPUT slot, loads the vector state at input_addr and the scalar state 400 bytes further on, runs hybrid_round_initial once, repeats hybrid_round_noninitial while count is below KECCAK_F1600_ROUNDS, applies final_rotate, then stores both states back to the same two addresses. NOTE(review): 400 presumably equals 25 lanes times 16 bytes of vector state - confirm against load_input_vector. */ + alloc_stack + save_gprs + save_vregs + save input_addr, STACK_OFFSET_INPUT + + load_input_vector 1,0 + + load_constant_ptr + + save const_addr, STACK_OFFSET_CONST + + add input_addr, input_addr, #400 + load_input_scalar 1,0 + hybrid_round_initial + loop_0: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS) + blt loop_0 + final_rotate + restore input_addr, STACK_OFFSET_INPUT + store_input_vector 1,0 + add input_addr, input_addr, #400 + store_input_scalar 1,0 + + restore_vregs + restore_gprs + free_stack + ret diff --git a/asm/manual/keccak_f1600/keccak_f1600_x3_hybrid_asm_v6.s b/asm/manual/keccak_f1600/keccak_f1600_x3_hybrid_asm_v6.s new file mode 100644 index 0000000..5352d31 --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x3_hybrid_asm_v6.s @@ -0,0 +1,1377 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" +#if defined(__ARM_FEATURE_SHA3) + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: /* 24 round constants, one 64-bit word each */ + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +round_constants_vec: /* the same 24 constants, each emitted twice in a row, i.e. 16 bytes per round */ + .quad 0x0000000000000001 + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x0000000080008009 + .quad
0x000000008000000a + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + .quad 0x8000000080008008 +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x29 + count .req w27 + cur_const .req x26 /* NOTE: x0, x29, x27 and x26 are assigned again further down, to s_Aba_ and sC0, sE0 and sC4, sC2 and sE3, sC1 and sE2 respectively, so these four names are only valid while those scalar values are dead */ + + /* Mapping of Keccak-f1600 SIMD state to vector registers + * at the beginning and end of each round. */ + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round.
*/ + vAba .req v0 + vAbe .req v1 + vAbi .req v2 + vAbo .req v3 + vAbu .req v4 + vAga .req v5 + vAge .req v6 + vAgi .req v7 + vAgo .req v8 + vAgu .req v9 + vAka .req v10 + vAke .req v11 + vAki .req v12 + vAko .req v13 + vAku .req v14 + vAma .req v15 + vAme .req v16 + vAmi .req v17 + vAmo .req v18 + vAmu .req v19 + vAsa .req v20 + vAse .req v21 + vAsi .req v22 + vAso .req v23 + vAsu .req v24 + + /* q-form of the above mapping */ + vAbaq .req q0 + vAbeq .req q1 + vAbiq .req q2 + vAboq .req q3 + vAbuq .req q4 + vAgaq .req q5 + vAgeq .req q6 + vAgiq .req q7 + vAgoq .req q8 + vAguq .req q9 + vAkaq .req q10 + vAkeq .req q11 + vAkiq .req q12 + vAkoq .req q13 + vAkuq .req q14 + vAmaq .req q15 + vAmeq .req q16 + vAmiq .req q17 + vAmoq .req q18 + vAmuq .req q19 + vAsaq .req q20 + vAseq .req q21 + vAsiq .req q22 + vAsoq .req q23 + vAsuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v27 + C1 .req v28 + C2 .req v29 + C3 .req v30 + C4 .req v31 + + C0q .req q27 + C1q .req q28 + C2q .req q29 + C3q .req q30 + C4q .req q31 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vBba .req v25 // fresh + vBbe .req v26 // fresh + vBbi .req vAbi + vBbo .req vAbo + vBbu .req vAbu + vBga .req vAka + vBge .req vAke + vBgi .req vAgi + vBgo .req vAgo + vBgu .req vAgu + vBka .req vAma + vBke .req vAme + vBki .req vAki + vBko .req vAko + vBku .req vAku + vBma .req vAsa + vBme .req vAse + vBmi .req vAmi + vBmo .req vAmo + vBmu .req vAmu + vBsa .req vAba + vBse .req vAbe + vBsi .req vAsi + vBso .req vAso + vBsu .req vAsu + + vBbaq .req q25 // fresh + vBbeq .req q26 // fresh + vBbiq .req vAbiq + vBboq .req vAboq + vBbuq .req vAbuq + vBgaq .req vAkaq + vBgeq .req vAkeq + vBgiq .req vAgiq + vBgoq .req vAgoq + vBguq .req vAguq + vBkaq .req vAmaq + vBkeq .req vAmeq + vBkiq .req vAkiq + vBkoq .req vAkoq + vBkuq .req vAkuq + vBmaq .req vAsaq + vBmeq .req vAseq + vBmiq .req vAmiq + vBmoq .req vAmoq + vBmuq .req vAmuq + vBsaq .req vAbaq + vBseq .req vAbeq + vBsiq
.req vAsiq + vBsoq .req vAsoq + vBsuq .req vAsuq + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req C4 + E1 .req C0 + E2 .req vBbe // fresh + E3 .req C2 + E4 .req C3 + + E0q .req C4q + E1q .req C0q + E2q .req vBbeq // fresh + E3q .req C2q + E4q .req C3q + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round. */ + s_Aba .req x1 + sAbe .req x6 + sAbi .req x11 + sAbo .req x16 + sAbu .req x21 + sAga .req x2 + sAge .req x7 + sAgi .req x12 + sAgo .req x17 + sAgu .req x22 + sAka .req x3 + sAke .req x8 + sAki .req x13 + sAko .req x18 + sAku .req x23 + sAma .req x4 + sAme .req x9 + sAmi .req x14 + sAmo .req x19 + sAmu .req x24 + sAsa .req x5 + sAse .req x10 + sAsi .req x15 + sAso .req x20 + sAsu .req x25 + + /* sA_[y,2*x+3*y] = rot(A[x,y]) */ + s_Aba_ .req x0 + sAbe_ .req x28 + sAbi_ .req x11 + sAbo_ .req x16 + sAbu_ .req x21 + sAga_ .req x3 + sAge_ .req x8 + sAgi_ .req x12 + sAgo_ .req x17 + sAgu_ .req x22 + sAka_ .req x4 + sAke_ .req x9 + sAki_ .req x13 + sAko_ .req x18 + sAku_ .req x23 + sAma_ .req x5 + sAme_ .req x10 + sAmi_ .req x14 + sAmo_ .req x19 + sAmu_ .req x24 + sAsa_ .req x1 + sAse_ .req x6 + sAsi_ .req x15 + sAso_ .req x20 + sAsu_ .req x25 + + /* sC[x] = sA[x,0] xor sA[x,1] xor sA[x,2] xor sA[x,3] xor sA[x,4], for x in 0..4 */ + /* sE[x] = sC[x-1] xor rot(C[x+1],1), for x in 0..4 */ + sC0 .req x0 + sE0 .req x29 + sC1 .req x26 + sE1 .req x30 + sC2 .req x27 + sE2 .req x26 + sC3 .req x28 + sE3 .req x27 + sC4 .req x29 + sE4 .req x28 + + tmp .req x30 /* same register as sE1 above */ + +/************************ MACROS ****************************/ + +/* Macros using v8.4-A SHA-3 instructions */ + +.macro eor3_m0 d s0 s1 s2 /* d = s0 ^ s1 ^ s2 with a single EOR3 */ + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m0 d s0 s1 /* single RAX1 on two 64-bit lanes */ + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm /* single XAR on two 64-bit lanes with rotate amount imm */ + xar \d\().2d, \s0\().2d, \s1\().2d, #\imm +.endm + +.macro bcax_m0 d s0 s1 s2 /* single BCAX on the full 128-bit register */ + bcax \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro eor3_m1_0 d
s0 s1 s2 /* first half of eor3_m1: d = s0 ^ s1; s2 is accepted but not used */ + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 /* d = s0 ^ s1 */ + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m1_1 d s0 s1 s2 /* second half of eor3_m1: d ^= s2; s0 and s1 are accepted but not used */ + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro eor3_m1 d s0 s1 s2 /* d = s0 ^ s1 ^ s2 built from two plain EORs */ + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +.macro rax1_m1 d s0 s1 /* d = s0 ^ (s1 rotated left by 1) built from add, sri and eor; overwrites the register currently bound to the vvtmp alias, which the caller sets up with .req */ + // Use add instead of SHL #1 + add vvtmp.2d, \s1\().2d, \s1\().2d + sri vvtmp.2d, \s1\().2d, #63 + eor \d\().16b, vvtmp.16b, \s0\().16b +.endm + + .macro xar_m1 d s0 s1 imm /* xors s1 into s0 in place (s0 is overwritten), then writes s0 rotated right by imm into d using shl and sri, or add and sri when imm is 63 */ + // Special cases where we can replace SHLs by ADDs + .if \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + // .elseif \imm == 62 + // eor \s0\().16b, \s0\().16b, \s1\().16b + // add \d\().2d, \s0\().2d, \s0\().2d + // add \d\().2d, \d\().2d, \d\().2d + // sri \d\().2d, \s0\().2d, #(62) + // .elseif \imm == 61 + // eor \s0\().16b, \s0\().16b, \s1\().16b + // add \d\().2d, \s0\().2d, \s0\().2d + // add \d\().2d, \d\().2d, \d\().2d + // add \d\().2d, \d\().2d, \d\().2d + // sri \d\().2d, \s0\().2d, #(61) + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + + .macro xar_m1_0 d s0 s1 imm /* xor half of xar_m1: s0 ^= s1; all three branches emit the same instruction */ + // Special cases where we can replace SHLs by ADDs + .if \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + .elseif \imm == 62 + eor \s0\().16b, \s0\().16b, \s1\().16b + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + .endif +.endm + + .macro xar_m1_1 d s0 s1 imm /* rotate half of xar_m1: d = s0 rotated right by imm; s1 is accepted but not used */ + // Special cases where we can replace SHLs by ADDs + .if \imm == 63 + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + .elseif \imm == 62 + add \d\().2d, \s0\().2d, \s0\().2d + add \d\().2d, \d\().2d, \d\().2d + sri \d\().2d, \s0\().2d, #(62) + .else + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + +.macro bcax_m1 d s0 s1 s2 /* d = s0 ^ (s1 and-not s2) built from bic and eor; overwrites the register currently bound to vvtmp */ + bic vvtmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, vvtmp.16b, \s0\().16b
+.endm + +.macro load_input_vector num idx /* loads the 25 vector state registers from input_addr; lane i is read as a q register at byte offset 16*(num*i + idx) */ + ldr vAbaq, [input_addr, #(16*(\num*0+\idx))] + ldr vAbeq, [input_addr, #(16*(\num*1+\idx))] + ldr vAbiq, [input_addr, #(16*(\num*2+\idx))] + ldr vAboq, [input_addr, #(16*(\num*3+\idx))] + ldr vAbuq, [input_addr, #(16*(\num*4+\idx))] + ldr vAgaq, [input_addr, #(16*(\num*5+\idx))] + ldr vAgeq, [input_addr, #(16*(\num*6+\idx))] + ldr vAgiq, [input_addr, #(16*(\num*7+\idx))] + ldr vAgoq, [input_addr, #(16*(\num*8+\idx))] + ldr vAguq, [input_addr, #(16*(\num*9+\idx))] + ldr vAkaq, [input_addr, #(16*(\num*10+\idx))] + ldr vAkeq, [input_addr, #(16*(\num*11+\idx))] + ldr vAkiq, [input_addr, #(16*(\num*12+\idx))] + ldr vAkoq, [input_addr, #(16*(\num*13+\idx))] + ldr vAkuq, [input_addr, #(16*(\num*14+\idx))] + ldr vAmaq, [input_addr, #(16*(\num*15+\idx))] + ldr vAmeq, [input_addr, #(16*(\num*16+\idx))] + ldr vAmiq, [input_addr, #(16*(\num*17+\idx))] + ldr vAmoq, [input_addr, #(16*(\num*18+\idx))] + ldr vAmuq, [input_addr, #(16*(\num*19+\idx))] + ldr vAsaq, [input_addr, #(16*(\num*20+\idx))] + ldr vAseq, [input_addr, #(16*(\num*21+\idx))] + ldr vAsiq, [input_addr, #(16*(\num*22+\idx))] + ldr vAsoq, [input_addr, #(16*(\num*23+\idx))] + ldr vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_vector num idx /* mirror of load_input_vector: stores the 25 vector state registers to input_addr using the same offsets */ + str vAbaq, [input_addr, #(16*(\num*0+\idx))] + str vAbeq, [input_addr, #(16*(\num*1+\idx))] + str vAbiq, [input_addr, #(16*(\num*2+\idx))] + str vAboq, [input_addr, #(16*(\num*3+\idx))] + str vAbuq, [input_addr, #(16*(\num*4+\idx))] + str vAgaq, [input_addr, #(16*(\num*5+\idx))] + str vAgeq, [input_addr, #(16*(\num*6+\idx))] + str vAgiq, [input_addr, #(16*(\num*7+\idx))] + str vAgoq, [input_addr, #(16*(\num*8+\idx))] + str vAguq, [input_addr, #(16*(\num*9+\idx))] + str vAkaq, [input_addr, #(16*(\num*10+\idx))] + str vAkeq, [input_addr, #(16*(\num*11+\idx))] + str vAkiq, [input_addr, #(16*(\num*12+\idx))] + str vAkoq, [input_addr, #(16*(\num*13+\idx))] + str vAkuq, [input_addr, #(16*(\num*14+\idx))] + str vAmaq,
[input_addr, #(16*(\num*15+\idx))] + str vAmeq, [input_addr, #(16*(\num*16+\idx))] + str vAmiq, [input_addr, #(16*(\num*17+\idx))] + str vAmoq, [input_addr, #(16*(\num*18+\idx))] + str vAmuq, [input_addr, #(16*(\num*19+\idx))] + str vAsaq, [input_addr, #(16*(\num*20+\idx))] + str vAseq, [input_addr, #(16*(\num*21+\idx))] + str vAsiq, [input_addr, #(16*(\num*22+\idx))] + str vAsoq, [input_addr, #(16*(\num*23+\idx))] + str vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_scalar num idx /* stores the 25 scalar state registers to input_addr; lane i is written as a 64-bit word at byte offset 8*(num*i + idx) */ + str s_Aba, [input_addr, 8*(\num*(0) +\idx)] + str sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + str sAbi, [input_addr, 8*(\num*(2)+ \idx)] + str sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + str sAbu, [input_addr, 8*(\num*(4)+ \idx)] + str sAga, [input_addr, 8*(\num*(4+1) +\idx)] + str sAge, [input_addr, 8*(\num*(6)+ \idx)] + str sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + str sAgo, [input_addr, 8*(\num*(8)+ \idx)] + str sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + str sAka, [input_addr, 8*(\num*(10) +\idx)] + str sAke, [input_addr, 8*(\num*(10+1)+\idx)] + str sAki, [input_addr, 8*(\num*(12) +\idx)] + str sAko, [input_addr, 8*(\num*(12+1)+\idx)] + str sAku, [input_addr, 8*(\num*(14) +\idx)] + str sAma, [input_addr, 8*(\num*(14+1)+\idx)] + str sAme, [input_addr, 8*(\num*(16) +\idx)] + str sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + str sAmo, [input_addr, 8*(\num*(18) +\idx)] + str sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + str sAsa, [input_addr, 8*(\num*(20) +\idx)] + str sAse, [input_addr, 8*(\num*(20+1)+\idx)] + str sAsi, [input_addr, 8*(\num*(22) +\idx)] + str sAso, [input_addr, 8*(\num*(22+1)+\idx)] + str sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +.macro load_input_scalar num idx /* mirror of store_input_scalar: loads the 25 scalar state registers from input_addr using the same offsets; s_Aba is loaded first, so input_addr must not share its register */ + ldr s_Aba, [input_addr, 8*(\num*(0) +\idx)] + ldr sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + ldr sAbi, [input_addr, 8*(\num*(2)+ \idx)] + ldr sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + ldr sAbu, [input_addr, 8*(\num*(4)+ \idx)] + ldr sAga, [input_addr, 8*(\num*(4+1) +\idx)] + ldr
sAge, [input_addr, 8*(\num*(6)+ \idx)] + ldr sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + ldr sAgo, [input_addr, 8*(\num*(8)+ \idx)] + ldr sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + ldr sAka, [input_addr, 8*(\num*(10) +\idx)] + ldr sAke, [input_addr, 8*(\num*(10+1)+\idx)] + ldr sAki, [input_addr, 8*(\num*(12) +\idx)] + ldr sAko, [input_addr, 8*(\num*(12+1)+\idx)] + ldr sAku, [input_addr, 8*(\num*(14) +\idx)] + ldr sAma, [input_addr, 8*(\num*(14+1)+\idx)] + ldr sAme, [input_addr, 8*(\num*(16) +\idx)] + ldr sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + ldr sAmo, [input_addr, 8*(\num*(18) +\idx)] + ldr sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + ldr sAsa, [input_addr, 8*(\num*(20) +\idx)] + ldr sAse, [input_addr, 8*(\num*(20+1)+\idx)] + ldr sAsi, [input_addr, 8*(\num*(22) +\idx)] + ldr sAso, [input_addr, 8*(\num*(22+1)+\idx)] + ldr sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +#define STACK_SIZE (8*8 + 16*6 + 3*8 + 8 + 16*34) // VREGS (8*8), GPRs (16*6), input/const/count slots (3*8), padding (8), 34 16-byte slots addressed from STACK_BASE_TMP (16*34) +#define STACK_BASE_GPRS (3*8+8) +#define STACK_BASE_VREGS (3*8+8+16*6) +#define STACK_BASE_TMP (8*8 + 16*6 + 3*8 + 8) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) /* the NAME_offset values below are slot indices, in units of 16 bytes, into the area starting at STACK_BASE_TMP */ + +#define vAga_offset 0 +#define E0_offset 1 +#define E1_offset 2 +#define E2_offset 3 +#define E3_offset 4 +#define E4_offset 5 +#define Ame_offset 7 +#define Agi_offset 8 +#define Aka_offset 9 +#define Abo_offset 10 +#define Amo_offset 11 +#define Ami_offset 12 +#define Ake_offset 13 +#define Agu_offset 14 +#define Asi_offset 15 +#define Aku_offset 16 +#define Asa_offset 17 +#define Abu_offset 18 +#define Asu_offset 19 +#define Ase_offset 20 +//#define Aga_offset 21 +#define Age_offset 22 +#define vBgo_offset 23 +#define vBke_offset 24 +#define vBgi_offset 25 +#define vBga_offset 26 +#define vBbo_offset 27 +#define vBmo_offset 28 +#define vBmi_offset 29 +#define vBge_offset 30 /* save(name) and restore(name) below are preprocessor macros: they store or load the q-register alias formed as name followed by q at STACK_BASE_TMP plus 16 times the matching NAME_offset */ + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP + 16 * name ##
_offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] + + +.macro save_gprs /* stores x19 through x30 as pairs in the area starting at STACK_BASE_GPRS */ + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs /* reloads x19 through x30 from the area written by save_gprs */ + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro save_vregs /* stores d8 through d15, i.e. only the low 64 bits of v8 through v15, in the area starting at STACK_BASE_VREGS */ + stp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] + stp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + stp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + stp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] +.endm + +.macro restore_vregs /* reloads d8 through d15 from the area written by save_vregs */ + ldp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] + ldp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + ldp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + ldp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] +.endm + +.macro alloc_stack /* reserves STACK_SIZE bytes below sp */ + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack /* releases the STACK_SIZE bytes reserved by alloc_stack */ + add sp, sp, #(STACK_SIZE) +.endm + +.macro eor5 dst, src0, src1, src2, src3, src4 /* dst = xor of the five sources; dst is written by the first eor, so it must not alias src2, src3 or src4 */ + eor \dst, \src0, \src1 + eor \dst, \dst, \src2 + eor \dst, \dst, \src3 + eor \dst, \dst, \src4 +.endm + +.macro xor_rol dst, src1, src0, imm /* dst = src0 ^ (src1 rotated left by imm), encoded as ROR by 64-imm; note src1 comes before src0 in the parameter list */ + eor \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro bic_rol dst, src1, src0, imm /* dst = src0 and-not (src1 rotated left by imm), encoded as ROR by 64-imm */ + bic \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro rotate dst, src, imm /* dst = src rotated left by imm, encoded as ror by 64-imm */ + ror \dst, \src, #(64-\imm) +.endm + +.macro save reg, offset /* stores reg at sp plus offset; separate from the preprocessor macro save(name), which is only expanded when written with parentheses */ + str \reg, [sp, #\offset] +.endm + +.macro restore reg, offset /* loads reg from sp plus offset; separate from the preprocessor macro restore(name) */ + ldr \reg, [sp, #\offset] +.endm + +.macro hybrid_round_initial +eor sC0, sAma, sAsa SEP +eor sC1, sAme, sAse SEP eor3_m0 C1,vAbe,vAge,vAke +eor sC2, sAmi, sAsi SEP eor3_m1 C3,vAbo,vAgo,vAko +eor sC3, sAmo, sAso SEP eor3_m0 C0,vAba,vAga,vAka
+eor sC4, sAmu, sAsu SEP eor3_m1 C2,vAbi,vAgi,vAki +eor sC0, sAka, sC0 SEP eor3_m0 C4,vAbu,vAgu,vAku +eor sC1, sAke, sC1 SEP eor3_m1 C1, C1,vAme, vAse +eor sC2, sAki, sC2 SEP eor3_m0 C3, C3,vAmo, vAso +eor sC3, sAko, sC3 SEP eor3_m1 C0, C0,vAma, vAsa +eor sC4, sAku, sC4 SEP eor3_m0 C2, C2,vAmi, vAsi +eor sC0, sAga, sC0 SEP eor3_m1 C4, C4,vAmu, vAsu +eor sC1, sAge, sC1 SEP vvtmp .req vBba +eor sC2, sAgi, sC2 SEP +eor sC3, sAgo, sC3 SEP rax1_m0 E2, C1, C3 +eor sC4, sAgu, sC4 SEP rax1_m1 E4, C3, C0 +eor sC0, s_Aba, sC0 SEP rax1_m0 E1, C0, C2 +eor sC1, sAbe, sC1 SEP rax1_m1 E3, C2, C4 +eor sC2, sAbi, sC2 SEP rax1_m0 E0, C4, C1 +eor sC3, sAbo, sC3 SEP .unreq vvtmp +eor sC4, sAbu, sC4 SEP vvtmp .req C1 +eor sE1, sC0, sC2, ROR #63 SEP vvtmpq .req C1q +eor sE3, sC2, sC4, ROR #63 SEP eor vBba.16b, vAba.16b, E0.16b +eor sE0, sC4, sC1, ROR #63 SEP xar_m1 vBsa, vAbi, E2, 2 +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP xar_m0 vBbi, vAki, E2, 21 +eor s_Aba_, s_Aba, sE0 SEP xar_m1 vBki, vAko, E3, 39 +eor sAsa_, sAbi, sE2 SEP xar_m0 vBko, vAmu, E4, 56 +eor sAbi_, sAki, sE2 SEP xar_m1 vBmu, vAso, E3, 8 +eor sAki_, sAko, sE3 SEP xar_m0 vBso, vAma, E0, 23 +eor sAko_, sAmu, sE4 SEP xar_m1 vBka, vAbe, E1, 63 +eor sAmu_, sAso, sE3 SEP xar_m0 vBse, vAgo, E3, 9 +eor sAso_, sAma, sE0 SEP xar_m1 vBgo, vAme, E1, 19 +eor sAka_, sAbe, sE1 SEP xar_m0 vBke, vAgi, E2, 58 +eor sAse_, sAgo, sE3 SEP xar_m1 vBgi, vAka, E0, 61 +eor sAgo_, sAme, sE1 SEP +eor sAke_, sAgi, sE2 SEP xar_m0 vBga, vAbo, E3, 36 +eor sAgi_, sAka, sE0 SEP xar_m1 vBbo, vAmo, E3, 43 +eor sAga_, sAbo, sE3 SEP xar_m0 vBmo, vAmi, E2, 49 +eor sAbo_, sAmo, sE3 SEP xar_m1 vBmi, vAke, E1, 54 +eor sAmo_, sAmi, sE2 SEP xar_m0 vBge, vAgu, E4, 44 +eor sAmi_, sAke, sE1 SEP mov E3.16b, vAga.16b +eor sAge_, sAgu, sE4 SEP bcax_m1 vAga, vBga, vBgi, vBge +eor sAgu_, sAsi, sE2 SEP xar_m0 vBgu, vAsi, E2, 3 +eor sAsi_, sAku, sE4 SEP xar_m1 vBsi, vAku, E4, 25 +eor sAku_, sAsa, sE0 SEP xar_m0 vBku, vAsa, E0, 46 +eor sAma_, sAbu, sE4 
SEP +eor sAbu_, sAsu, sE4 SEP xar_m1 vBma, vAbu, E4, 37 +eor sAsu_, sAse, sE1 SEP xar_m0 vBbu, vAsu, E4, 50 +eor sAme_, sAga, sE0 SEP xar_m1 vBsu, vAse, E1, 62 +eor sAbe_, sAge, sE1 SEP xar_m0 vBme, E3, E0, 28 +load_constant_ptr SEP xar_m1 vBbe, vAge, E1, 20 +bic tmp, sAgi_, sAge_, ROR #47 SEP bcax_m1 vAge, vBge, vBgo, vBgi +eor sAga, tmp, sAga_, ROR #39 SEP bcax_m0 vAgi, vBgi, vBgu, vBgo +bic tmp, sAgo_, sAgi_, ROR #42 SEP bcax_m1 vAgo, vBgo, vBga, vBgu +eor sAge, tmp, sAge_, ROR #25 SEP bcax_m0 vAgu, vBgu, vBge, vBga +bic tmp, sAgu_, sAgo_, ROR #16 SEP bcax_m1 vAka, vBka, vBki, vBke +eor sAgi, tmp, sAgi_, ROR #58 SEP bcax_m0 vAke, vBke, vBko, vBki +bic tmp, sAga_, sAgu_, ROR #31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP .unreq vvtmp +bic tmp, sAge_, sAga_, ROR #56 SEP .unreq vvtmpq +eor sAgu, tmp, sAgu_, ROR #23 SEP eor2 C0, vAka, vAga +bic tmp, sAki_, sAke_, ROR #19 SEP save(vAga) +eor sAka, tmp, sAka_, ROR #24 SEP vvtmp .req vAga +bic tmp, sAko_, sAki_, ROR #47 SEP vvtmpq .req vAgaq +eor sAke, tmp, sAke_, ROR #2 SEP bcax_m0 vAki, vBki, vBku, vBko +bic tmp, sAku_, sAko_, ROR #10 SEP bcax_m1 vAko, vBko, vBka, vBku +eor sAki, tmp, sAki_, ROR #57 SEP eor2 C1, vAke, vAge +bic tmp, sAka_, sAku_, ROR #47 SEP bcax_m0 vAku, vBku, vBke, vBka +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP eor2 C2, vAki, vAgi +eor sAku, tmp, sAku_, ROR #52 SEP bcax_m1 vAma, vBma, vBmi, vBme +bic tmp, sAmi_, sAme_, ROR #38 SEP eor2 C3, vAko, vAgo +eor sAma, tmp, sAma_, ROR #47 SEP bcax_m0 vAme, vBme, vBmo, vBmi +bic tmp, sAmo_, sAmi_, ROR #5 SEP eor2 C4, vAku, vAgu +eor sAme, tmp, sAme_, ROR #43 SEP bcax_m1 vAmi, vBmi, vBmu, vBmo +bic tmp, sAmu_, sAmo_, ROR #41 SEP eor2 C0, C0, vAma +eor sAmi, tmp, sAmi_, ROR #46 SEP bcax_m0 vAmo, vBmo, vBma, vBmu +ldr cur_const, [const_addr] SEP eor2 C1, C1, vAme +mov count, #1 SEP bcax_m1 vAmu, vBmu, vBme, vBma +bic tmp, sAma_, sAmu_, ROR #35 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP eor2 C2, C2, vAmi +bic tmp, sAme_, sAma_, ROR #9 SEP 
bcax_m0 vAsa, vBsa, vBsi, vBse +eor sAmu, tmp, sAmu_, ROR #44 SEP eor2 C3, C3, vAmo +bic tmp, sAsi_, sAse_, ROR #48 SEP bcax_m1 vAse, vBse, vBso, vBsi +eor sAsa, tmp, sAsa_, ROR #41 SEP eor2 C4, C4, vAmu +bic tmp, sAso_, sAsi_, ROR #2 SEP bcax_m0 vAsi, vBsi, vBsu, vBso +eor sAse, tmp, sAse_, ROR #50 SEP eor2 C0, C0, vAsa +bic tmp, sAsu_, sAso_, ROR #25 SEP bcax_m1 vAso, vBso, vBsa, vBsu +eor sAsi, tmp, sAsi_, ROR #27 SEP eor2 C1, C1, vAse +bic tmp, sAsa_, sAsu_, ROR #60 SEP bcax_m0 vAsu, vBsu, vBse, vBsa +eor sAso, tmp, sAso_, ROR #21 SEP +save count, STACK_OFFSET_COUNT SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP eor2 C2, C2, vAsi +eor sAsu, tmp, sAsu_, ROR #53 SEP eor2 C3, C3, vAso +bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m1 vAba, vBba, vBbi, vBbe +eor s_Aba, s_Aba_, tmp, ROR #21 SEP bcax_m0 vAbe, vBbe, vBbo, vBbi +bic tmp, sAbo_, sAbi_, ROR #42 SEP eor2 C1, C1, vAbe +eor sAbe, tmp, sAbe_, ROR #41 SEP restore x27, STACK_OFFSET_CONST +bic tmp, sAbu_, sAbo_, ROR #57 SEP ldr vvtmpq, [x27], #16 +eor sAbi, tmp, sAbi_, ROR #35 SEP save x27, STACK_OFFSET_CONST +bic tmp, s_Aba_, sAbu_, ROR #50 SEP eor vAba.16b, vAba.16b, vvtmp.16b +eor sAbo, tmp, sAbo_, ROR #43 SEP eor2 C4, C4, vAsu +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP bcax_m0 vAbi, vBbi, vBbu, vBbo +eor s_Aba, s_Aba, cur_const SEP bcax_m1 vAbo, vBbo, vBba, vBbu + SEP eor2 C3, C3, vAbo +eor sC0, sAka, sAsa, ROR #50 SEP eor2 C2, C2, vAbi +eor sC1, sAse, sAge, ROR #60 SEP eor2 C0, C0, vAba +eor sC2, sAmi, sAgi, ROR #59 SEP bcax_m0 vAbu, vBbu, vBbe, vBba +eor sC3, sAgo, sAso, ROR #30 SEP eor2 C4, C4, vAbu +eor sC4, sAbu, sAsu, ROR #53 SEP restore(vAga) +eor sC0, sAma, sC0, ROR #49 SEP .unreq vvtmp +eor sC1, sAbe, sC1, ROR #44 SEP .unreq vvtmpq +eor sC2, sAki, sC2, ROR #26 SEP vvtmp .req vBba +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP rax1_m0 E2, C1, C3 +eor sC0, sAga, sC0, ROR #57 SEP rax1_m1 E4, C3, C0 +eor sC1, sAme, sC1, ROR #58 SEP rax1_m0 E1, C0, C2 +eor sC2, sAbi, 
sC2, ROR #60 SEP rax1_m1 E3, C2, C4 +eor sC3, sAko, sC3, ROR #38 SEP rax1_m0 E0, C4, C1 +eor sC4, sAgu, sC4, ROR #48 SEP .unreq vvtmp +eor sC0, s_Aba, sC0, ROR #61 SEP vvtmp .req C1 +eor sC1, sAke, sC1, ROR #57 SEP vvtmpq .req C1q +eor sC2, sAsi, sC2, ROR #52 SEP eor vBba.16b, vAba.16b, E0.16b +eor sC3, sAbo, sC3, ROR #63 SEP xar_m1 vBsa, vAbi, E2, 2 +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP xar_m0 vBbi, vAki, E2, 21 +ror sC4, sC4, 58 SEP xar_m1 vBki, vAko, E3, 39 +ror sC2, sC2, 62 SEP xar_m0 vBko, vAmu, E4, 56 +eor sE1, sC0, sC2, ROR #63 SEP xar_m1 vBmu, vAso, E3, 8 +eor sE3, sC2, sC4, ROR #63 SEP xar_m0 vBso, vAma, E0, 23 +eor sE0, sC4, sC1, ROR #63 SEP xar_m1 vBka, vAbe, E1, 63 +eor sE2, sC1, sC3, ROR #63 SEP xar_m0 vBse, vAgo, E3, 9 +eor sE4, sC3, sC0, ROR #63 SEP xar_m1 vBgo, vAme, E1, 19 +eor s_Aba_, sE0, s_Aba SEP xar_m0 vBke, vAgi, E2, 58 +eor sAsa_, sE2, sAbi, ROR #50 SEP xar_m1 vBgi, vAka, E0, 61 +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP xar_m0 vBga, vAbo, E3, 36 +eor sAko_, sE4, sAmu, ROR #28 SEP xar_m1 vBbo, vAmo, E3, 43 +eor sAmu_, sE3, sAso, ROR #2 SEP xar_m0 vBmo, vAmi, E2, 49 +eor sAso_, sE0, sAma, ROR #54 SEP xar_m1 vBmi, vAke, E1, 54 +eor sAka_, sE1, sAbe, ROR #43 SEP xar_m0 vBge, vAgu, E4, 44 +eor sAse_, sE3, sAgo, ROR #36 SEP mov E3.16b, vAga.16b +eor sAgo_, sE1, sAme, ROR #49 SEP bcax_m1 vAga, vBga, vBgi, vBge +eor sAke_, sE2, sAgi, ROR #3 SEP xar_m0 vBgu, vAsi, E2, 3 +eor sAgi_, sE0, sAka, ROR #39 SEP xar_m1 vBsi, vAku, E4, 25 +eor sAga_, sE3, sAbo SEP xar_m0 vBku, vAsa, E0, 46 +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP xar_m1 vBma, vAbu, E4, 37 +eor sAmi_, sE1, sAke, ROR #56 SEP xar_m0 vBbu, vAsu, E4, 50 +eor sAge_, sE4, sAgu, ROR #44 SEP xar_m1 vBsu, vAse, E1, 62 +eor sAgu_, sE2, sAsi, ROR #62 SEP xar_m0 vBme, E3, E0, 28 +eor sAsi_, sE4, sAku, ROR #58 SEP xar_m1 vBbe, vAge, E1, 20 +eor sAku_, sE0, sAsa, ROR #25 SEP bcax_m1 vAge, vBge, vBgo, vBgi +eor sAma_, sE4, sAbu, ROR #20 
SEP bcax_m0 vAgi, vBgi, vBgu, vBgo +eor sAbu_, sE4, sAsu, ROR #9 SEP bcax_m1 vAgo, vBgo, vBga, vBgu +eor sAsu_, sE1, sAse, ROR #23 SEP bcax_m0 vAgu, vBgu, vBge, vBga +eor sAme_, sE0, sAga, ROR #61 SEP bcax_m1 vAka, vBka, vBki, vBke +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP bcax_m0 vAke, vBke, vBko, vBki +restore count, STACK_OFFSET_COUNT SEP .unreq vvtmp +bic tmp, sAgi_, sAge_, ROR #47 SEP .unreq vvtmpq +eor sAga, tmp, sAga_, ROR #39 SEP eor2 C0, vAka, vAga +bic tmp, sAgo_, sAgi_, ROR #42 SEP save(vAga) +eor sAge, tmp, sAge_, ROR #25 SEP vvtmp .req vAga +bic tmp, sAgu_, sAgo_, ROR #16 SEP vvtmpq .req vAgaq +eor sAgi, tmp, sAgi_, ROR #58 SEP bcax_m0 vAki, vBki, vBku, vBko +bic tmp, sAga_, sAgu_, ROR #31 SEP bcax_m1 vAko, vBko, vBka, vBku +eor sAgo, tmp, sAgo_, ROR #47 SEP eor2 C1, vAke, vAge +bic tmp, sAge_, sAga_, ROR #56 SEP bcax_m0 vAku, vBku, vBke, vBka +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP eor2 C2, vAki, vAgi +eor sAka, tmp, sAka_, ROR #24 SEP bcax_m1 vAma, vBma, vBmi, vBme +bic tmp, sAko_, sAki_, ROR #47 SEP eor2 C3, vAko, vAgo +eor sAke, tmp, sAke_, ROR #2 SEP bcax_m0 vAme, vBme, vBmo, vBmi +bic tmp, sAku_, sAko_, ROR #10 SEP eor2 C4, vAku, vAgu +eor sAki, tmp, sAki_, ROR #57 SEP bcax_m1 vAmi, vBmi, vBmu, vBmo +bic tmp, sAka_, sAku_, ROR #47 SEP eor2 C0, C0, vAma +eor sAko, tmp, sAko_, ROR #57 SEP bcax_m0 vAmo, vBmo, vBma, vBmu +bic tmp, sAke_, sAka_, ROR #5 SEP eor2 C1, C1, vAme +eor sAku, tmp, sAku_, ROR #52 SEP bcax_m1 vAmu, vBmu, vBme, vBma +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP eor2 C2, C2, vAmi +bic tmp, sAmo_, sAmi_, ROR #5 SEP bcax_m0 vAsa, vBsa, vBsi, vBse +eor sAme, tmp, sAme_, ROR #43 SEP eor2 C3, C3, vAmo +bic tmp, sAmu_, sAmo_, ROR #41 SEP bcax_m1 vAse, vBse, vBso, vBsi +eor sAmi, tmp, sAmi_, ROR #46 SEP eor2 C4, C4, vAmu +bic tmp, sAma_, sAmu_, ROR #35 SEP bcax_m0 vAsi, vBsi, vBsu, vBso +eor sAmo, tmp, sAmo_, ROR #12 SEP eor2 C0, C0, vAsa +bic tmp, sAme_, sAma_, ROR 
#9 SEP bcax_m1 vAso, vBso, vBsa, vBsu +eor sAmu, tmp, sAmu_, ROR #44 SEP eor2 C1, C1, vAse +bic tmp, sAsi_, sAse_, ROR #48 SEP bcax_m0 vAsu, vBsu, vBse, vBsa + +eor sAsa, tmp, sAsa_, ROR #41 SEP eor2 C2, C2, vAsi +bic tmp, sAso_, sAsi_, ROR #2 SEP eor2 C3, C3, vAso +eor sAse, tmp, sAse_, ROR #50 SEP bcax_m1 vAba, vBba, vBbi, vBbe +bic tmp, sAsu_, sAso_, ROR #25 SEP bcax_m0 vAbe, vBbe, vBbo, vBbi +eor sAsi, tmp, sAsi_, ROR #27 SEP eor2 C1, C1, vAbe +bic tmp, sAsa_, sAsu_, ROR #60 SEP restore x26, STACK_OFFSET_CONST +eor sAso, tmp, sAso_, ROR #21 SEP ldr vvtmpq, [x26], #16 +bic tmp, sAse_, sAsa_, ROR #57 SEP save x26, STACK_OFFSET_CONST +eor sAsu, tmp, sAsu_, ROR #53 SEP eor vAba.16b, vAba.16b, vvtmp.16b +bic tmp, sAbi_, sAbe_, ROR #63 SEP eor2 C4, C4, vAsu +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP bcax_m0 vAbi, vBbi, vBbu, vBbo +eor sAbe, tmp, sAbe_, ROR #41 SEP bcax_m1 vAbo, vBbo, vBba, vBbu +bic tmp, sAbu_, sAbo_, ROR #57 SEP eor2 C3, C3, vAbo +eor sAbi, tmp, sAbi_, ROR #35 SEP eor2 C2, C2, vAbi +bic tmp, s_Aba_, sAbu_, ROR #50 SEP eor2 C0, C0, vAba +eor sAbo, tmp, sAbo_, ROR #43 SEP bcax_m0 vAbu, vBbu, vBbe, vBba +bic tmp, sAbe_, s_Aba_, ROR #44 SEP eor2 C4, C4, vAbu +eor sAbu, tmp, sAbu_, ROR #30 SEP restore(vAga) +add count, count, #1 SEP .unreq vvtmp +eor s_Aba, s_Aba, cur_const SEP .unreq vvtmpq +.endm + + +.macro hybrid_round_noninitial +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP vvtmp .req vBba +eor sC1, sAse, sAge, ROR #60 SEP rax1_m0 E2, C1, C3 +eor sC2, sAmi, sAgi, ROR #59 SEP rax1_m1 E4, C3, C0 +eor sC3, sAgo, sAso, ROR #30 SEP rax1_m0 E1, C0, C2 +eor sC4, sAbu, sAsu, ROR #53 SEP rax1_m1 E3, C2, C4 +eor sC0, sAma, sC0, ROR #49 SEP rax1_m0 E0, C4, C1 +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP .unreq vvtmp +eor sC3, sAmo, sC3, ROR #63 SEP vvtmp .req C1 +eor sC4, sAmu, sC4, ROR #56 SEP vvtmpq .req C1q +eor sC0, sAga, sC0, ROR #57 
SEP eor vBba.16b, vAba.16b, E0.16b +eor sC1, sAme, sC1, ROR #58 SEP xar_m1 vBsa, vAbi, E2, 2 +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP xar_m0 vBbi, vAki, E2, 21 +eor sC4, sAgu, sC4, ROR #48 SEP xar_m1 vBki, vAko, E3, 39 +eor sC0, s_Aba, sC0, ROR #61 SEP xar_m0 vBko, vAmu, E4, 56 +eor sC1, sAke, sC1, ROR #57 SEP xar_m1 vBmu, vAso, E3, 8 +eor sC2, sAsi, sC2, ROR #52 SEP xar_m0 vBso, vAma, E0, 23 +eor sC3, sAbo, sC3, ROR #63 SEP xar_m1 vBka, vAbe, E1, 63 +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP xar_m0 vBse, vAgo, E3, 9 +ror sC4, sC4, 58 SEP xar_m1 vBgo, vAme, E1, 19 +ror sC2, sC2, 62 SEP xar_m0 vBke, vAgi, E2, 58 +eor sE1, sC0, sC2, ROR #63 SEP xar_m1 vBgi, vAka, E0, 61 +eor sE3, sC2, sC4, ROR #63 SEP xar_m0 vBga, vAbo, E3, 36 +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP xar_m1 vBbo, vAmo, E3, 43 +eor sE4, sC3, sC0, ROR #63 SEP xar_m0 vBmo, vAmi, E2, 49 +eor s_Aba_, sE0, s_Aba SEP xar_m1 vBmi, vAke, E1, 54 +eor sAsa_, sE2, sAbi, ROR #50 SEP xar_m0 vBge, vAgu, E4, 44 +eor sAbi_, sE2, sAki, ROR #46 SEP mov E3.16b, vAga.16b +eor sAki_, sE3, sAko, ROR #63 SEP bcax_m1 vAga, vBga, vBgi, vBge +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP xar_m0 vBgu, vAsi, E2, 3 +eor sAso_, sE0, sAma, ROR #54 SEP xar_m1 vBsi, vAku, E4, 25 +eor sAka_, sE1, sAbe, ROR #43 SEP xar_m0 vBku, vAsa, E0, 46 +eor sAse_, sE3, sAgo, ROR #36 SEP xar_m1 vBma, vAbu, E4, 37 +eor sAgo_, sE1, sAme, ROR #49 SEP xar_m0 vBbu, vAsu, E4, 50 +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP xar_m1 vBsu, vAse, E1, 62 +eor sAga_, sE3, sAbo SEP xar_m0 vBme, E3, E0, 28 +eor sAbo_, sE3, sAmo, ROR #37 SEP xar_m1 vBbe, vAge, E1, 20 +eor sAmo_, sE2, sAmi, ROR #8 SEP bcax_m1 vAge, vBge, vBgo, vBgi +eor sAmi_, sE1, sAke, ROR #56 SEP bcax_m0 vAgi, vBgi, vBgu, vBgo +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP bcax_m1 vAgo, vBgo, vBga, vBgu +eor sAsi_, sE4, sAku, ROR #58 SEP bcax_m0 vAgu, vBgu, vBge, vBga 
+eor sAku_, sE0, sAsa, ROR #25 SEP bcax_m1 vAka, vBka, vBki, vBke +eor sAma_, sE4, sAbu, ROR #20 SEP bcax_m0 vAke, vBke, vBko, vBki +eor sAbu_, sE4, sAsu, ROR #9 SEP .unreq vvtmp +eor sAsu_, sE1, sAse, ROR #23 SEP .unreq vvtmpq +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP eor2 C0, vAka, vAga +load_constant_ptr SEP save(vAga) +restore count, STACK_OFFSET_COUNT SEP vvtmp .req vAga +bic tmp, sAgi_, sAge_, ROR #47 SEP vvtmpq .req vAgaq +eor sAga, tmp, sAga_, ROR #39 SEP bcax_m0 vAki, vBki, vBku, vBko +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP bcax_m1 vAko, vBko, vBka, vBku +bic tmp, sAgu_, sAgo_, ROR #16 SEP eor2 C1, vAke, vAge +eor sAgi, tmp, sAgi_, ROR #58 SEP bcax_m0 vAku, vBku, vBke, vBka +bic tmp, sAga_, sAgu_, ROR #31 SEP eor2 C2, vAki, vAgi +eor sAgo, tmp, sAgo_, ROR #47 SEP bcax_m1 vAma, vBma, vBmi, vBme +bic tmp, sAge_, sAga_, ROR #56 SEP eor2 C3, vAko, vAgo +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP bcax_m0 vAme, vBme, vBmo, vBmi +eor sAka, tmp, sAka_, ROR #24 SEP eor2 C4, vAku, vAgu +bic tmp, sAko_, sAki_, ROR #47 SEP bcax_m1 vAmi, vBmi, vBmu, vBmo +eor sAke, tmp, sAke_, ROR #2 SEP eor2 C0, C0, vAma +bic tmp, sAku_, sAko_, ROR #10 SEP bcax_m0 vAmo, vBmo, vBma, vBmu +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP eor2 C1, C1, vAme +eor sAko, tmp, sAko_, ROR #57 SEP bcax_m1 vAmu, vBmu, vBme, vBma +bic tmp, sAke_, sAka_, ROR #5 SEP eor2 C2, C2, vAmi +eor sAku, tmp, sAku_, ROR #52 SEP bcax_m0 vAsa, vBsa, vBsi, vBse +bic tmp, sAmi_, sAme_, ROR #38 SEP eor2 C3, C3, vAmo +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP bcax_m1 vAse, vBse, vBso, vBsi +eor sAme, tmp, sAme_, ROR #43 SEP eor2 C4, C4, vAmu +bic tmp, sAmu_, sAmo_, ROR #41 SEP bcax_m0 vAsi, vBsi, vBsu, vBso +eor sAmi, tmp, sAmi_, ROR #46 SEP eor2 C0, C0, vAsa +bic tmp, sAma_, sAmu_, ROR #35 SEP bcax_m1 vAso, vBso, vBsa, vBsu +ldr cur_const, [const_addr, count, UXTW #3] SEP eor2 C1, 
C1, vAse +add count, count, #1 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP bcax_m0 vAsu, vBsu, vBse, vBsa +bic tmp, sAme_, sAma_, ROR #9 SEP eor2 C2, C2, vAsi +eor sAmu, tmp, sAmu_, ROR #44 SEP eor2 C3, C3, vAso +bic tmp, sAsi_, sAse_, ROR #48 SEP bcax_m1 vAba, vBba, vBbi, vBbe +eor sAsa, tmp, sAsa_, ROR #41 SEP bcax_m0 vAbe, vBbe, vBbo, vBbi +bic tmp, sAso_, sAsi_, ROR #2 SEP +save count, STACK_OFFSET_COUNT SEP +eor sAse, tmp, sAse_, ROR #50 SEP eor2 C1, C1, vAbe +bic tmp, sAsu_, sAso_, ROR #25 SEP restore x27, STACK_OFFSET_CONST +eor sAsi, tmp, sAsi_, ROR #27 SEP ldr vvtmpq, [x27], #16 +bic tmp, sAsa_, sAsu_, ROR #60 SEP save x27, STACK_OFFSET_CONST +eor sAso, tmp, sAso_, ROR #21 SEP eor vAba.16b, vAba.16b, vvtmp.16b +bic tmp, sAse_, sAsa_, ROR #57 SEP eor2 C4, C4, vAsu +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m0 vAbi, vBbi, vBbu, vBbo +eor s_Aba, s_Aba_, tmp, ROR #21 SEP bcax_m1 vAbo, vBbo, vBba, vBbu +bic tmp, sAbo_, sAbi_, ROR #42 SEP eor2 C3, C3, vAbo +eor sAbe, tmp, sAbe_, ROR #41 SEP eor2 C2, C2, vAbi +bic tmp, sAbu_, sAbo_, ROR #57 SEP eor2 C0, C0, vAba +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP bcax_m0 vAbu, vBbu, vBbe, vBba +eor sAbo, tmp, sAbo_, ROR #43 SEP eor2 C4, C4, vAbu +bic tmp, sAbe_, s_Aba_, ROR #44 SEP restore(vAga) +eor sAbu, tmp, sAbu_, ROR #30 SEP .unreq vvtmp +eor s_Aba, s_Aba, cur_const SEP .unreq vvtmpq +eor sC0, sAka, sAsa, ROR #50 SEP vvtmp .req vBba +eor sC1, sAse, sAge, ROR #60 SEP rax1_m0 E2, C1, C3 +eor sC2, sAmi, sAgi, ROR #59 SEP rax1_m1 E4, C3, C0 +eor sC3, sAgo, sAso, ROR #30 SEP rax1_m0 E1, C0, C2 +eor sC4, sAbu, sAsu, ROR #53 SEP rax1_m1 E3, C2, C4 +eor sC0, sAma, sC0, ROR #49 SEP rax1_m0 E0, C4, C1 +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP .unreq vvtmp +eor sC3, sAmo, sC3, ROR #63 SEP vvtmp .req C1 +eor sC4, sAmu, sC4, ROR #56 SEP vvtmpq .req C1q +eor sC0, sAga, sC0, ROR #57 SEP eor vBba.16b, vAba.16b, E0.16b +eor sC1, sAme, sC1, ROR #58 SEP 
xar_m1 vBsa, vAbi, E2, 2 +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP xar_m0 vBbi, vAki, E2, 21 +eor sC4, sAgu, sC4, ROR #48 SEP xar_m1 vBki, vAko, E3, 39 +eor sC0, s_Aba, sC0, ROR #61 SEP xar_m0 vBko, vAmu, E4, 56 +eor sC1, sAke, sC1, ROR #57 SEP xar_m1 vBmu, vAso, E3, 8 +eor sC2, sAsi, sC2, ROR #52 SEP xar_m0 vBso, vAma, E0, 23 +eor sC3, sAbo, sC3, ROR #63 SEP xar_m1 vBka, vAbe, E1, 63 +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP xar_m0 vBse, vAgo, E3, 9 +ror sC4, sC4, 58 SEP xar_m1 vBgo, vAme, E1, 19 +ror sC2, sC2, 62 SEP xar_m0 vBke, vAgi, E2, 58 +eor sE1, sC0, sC2, ROR #63 SEP xar_m1 vBgi, vAka, E0, 61 +eor sE3, sC2, sC4, ROR #63 SEP xar_m0 vBga, vAbo, E3, 36 +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP xar_m1 vBbo, vAmo, E3, 43 +eor sE4, sC3, sC0, ROR #63 SEP xar_m0 vBmo, vAmi, E2, 49 +eor s_Aba_, sE0, s_Aba SEP xar_m1 vBmi, vAke, E1, 54 +eor sAsa_, sE2, sAbi, ROR #50 SEP xar_m0 vBge, vAgu, E4, 44 +eor sAbi_, sE2, sAki, ROR #46 SEP mov E3.16b, vAga.16b +eor sAki_, sE3, sAko, ROR #63 SEP bcax_m1 vAga, vBga, vBgi, vBge +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP xar_m0 vBgu, vAsi, E2, 3 +eor sAso_, sE0, sAma, ROR #54 SEP xar_m1 vBsi, vAku, E4, 25 +eor sAka_, sE1, sAbe, ROR #43 SEP xar_m0 vBku, vAsa, E0, 46 +eor sAse_, sE3, sAgo, ROR #36 SEP xar_m1 vBma, vAbu, E4, 37 +eor sAgo_, sE1, sAme, ROR #49 SEP xar_m0 vBbu, vAsu, E4, 50 +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP xar_m1 vBsu, vAse, E1, 62 +eor sAga_, sE3, sAbo SEP xar_m0 vBme, E3, E0, 28 +eor sAbo_, sE3, sAmo, ROR #37 SEP xar_m1 vBbe, vAge, E1, 20 +eor sAmo_, sE2, sAmi, ROR #8 SEP bcax_m1 vAge, vBge, vBgo, vBgi +eor sAmi_, sE1, sAke, ROR #56 SEP bcax_m0 vAgi, vBgi, vBgu, vBgo +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP bcax_m1 vAgo, vBgo, vBga, vBgu +eor sAsi_, sE4, sAku, ROR #58 SEP bcax_m0 vAgu, vBgu, vBge, vBga +eor sAku_, sE0, sAsa, ROR #25 SEP bcax_m1 vAka, vBka, vBki, vBke 
+eor sAma_, sE4, sAbu, ROR #20 SEP bcax_m0 vAke, vBke, vBko, vBki +eor sAbu_, sE4, sAsu, ROR #9 SEP .unreq vvtmp +eor sAsu_, sE1, sAse, ROR #23 SEP .unreq vvtmpq +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP eor2 C0, vAka, vAga +load_constant_ptr SEP save(vAga) +restore count, STACK_OFFSET_COUNT SEP vvtmp .req vAga +bic tmp, sAgi_, sAge_, ROR #47 SEP vvtmpq .req vAgaq +eor sAga, tmp, sAga_, ROR #39 SEP bcax_m0 vAki, vBki, vBku, vBko +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP bcax_m1 vAko, vBko, vBka, vBku +bic tmp, sAgu_, sAgo_, ROR #16 SEP eor2 C1, vAke, vAge +eor sAgi, tmp, sAgi_, ROR #58 SEP bcax_m0 vAku, vBku, vBke, vBka +bic tmp, sAga_, sAgu_, ROR #31 SEP eor2 C2, vAki, vAgi +eor sAgo, tmp, sAgo_, ROR #47 SEP bcax_m1 vAma, vBma, vBmi, vBme +bic tmp, sAge_, sAga_, ROR #56 SEP eor2 C3, vAko, vAgo +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP bcax_m0 vAme, vBme, vBmo, vBmi +eor sAka, tmp, sAka_, ROR #24 SEP eor2 C4, vAku, vAgu +bic tmp, sAko_, sAki_, ROR #47 SEP bcax_m1 vAmi, vBmi, vBmu, vBmo +eor sAke, tmp, sAke_, ROR #2 SEP eor2 C0, C0, vAma +bic tmp, sAku_, sAko_, ROR #10 SEP bcax_m0 vAmo, vBmo, vBma, vBmu +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP eor2 C1, C1, vAme +eor sAko, tmp, sAko_, ROR #57 SEP bcax_m1 vAmu, vBmu, vBme, vBma +bic tmp, sAke_, sAka_, ROR #5 SEP eor2 C2, C2, vAmi +eor sAku, tmp, sAku_, ROR #52 SEP bcax_m0 vAsa, vBsa, vBsi, vBse +bic tmp, sAmi_, sAme_, ROR #38 SEP eor2 C3, C3, vAmo +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP bcax_m1 vAse, vBse, vBso, vBsi +eor sAme, tmp, sAme_, ROR #43 SEP eor2 C4, C4, vAmu +bic tmp, sAmu_, sAmo_, ROR #41 SEP bcax_m0 vAsi, vBsi, vBsu, vBso +eor sAmi, tmp, sAmi_, ROR #46 SEP eor2 C0, C0, vAsa +bic tmp, sAma_, sAmu_, ROR #35 SEP bcax_m1 vAso, vBso, vBsa, vBsu + SEP eor2 C1, C1, vAse +eor sAmo, tmp, sAmo_, ROR #12 SEP bcax_m0 vAsu, vBsu, vBse, vBsa +bic tmp, sAme_, sAma_, ROR #9 SEP 
eor2 C2, C2, vAsi +eor sAmu, tmp, sAmu_, ROR #44 SEP eor2 C3, C3, vAso +bic tmp, sAsi_, sAse_, ROR #48 SEP bcax_m1 vAba, vBba, vBbi, vBbe +eor sAsa, tmp, sAsa_, ROR #41 SEP bcax_m0 vAbe, vBbe, vBbo, vBbi +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP eor2 C1, C1, vAbe +bic tmp, sAsu_, sAso_, ROR #25 SEP restore x26, STACK_OFFSET_CONST +eor sAsi, tmp, sAsi_, ROR #27 SEP ldr vvtmpq, [x26], #16 +bic tmp, sAsa_, sAsu_, ROR #60 SEP save x26, STACK_OFFSET_CONST +eor sAso, tmp, sAso_, ROR #21 SEP eor vAba.16b, vAba.16b, vvtmp.16b +bic tmp, sAse_, sAsa_, ROR #57 SEP eor2 C4, C4, vAsu +eor sAsu, tmp, sAsu_, ROR #53 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP +add count, count, #1 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m0 vAbi, vBbi, vBbu, vBbo +eor s_Aba, s_Aba_, tmp, ROR #21 SEP bcax_m1 vAbo, vBbo, vBba, vBbu +bic tmp, sAbo_, sAbi_, ROR #42 SEP eor2 C3, C3, vAbo +eor sAbe, tmp, sAbe_, ROR #41 SEP eor2 C2, C2, vAbi +bic tmp, sAbu_, sAbo_, ROR #57 SEP eor2 C0, C0, vAba +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP bcax_m0 vAbu, vBbu, vBbe, vBba +eor sAbo, tmp, sAbo_, ROR #43 SEP eor2 C4, C4, vAbu +bic tmp, sAbe_, s_Aba_, ROR #44 SEP restore(vAga) +eor sAbu, tmp, sAbu_, ROR #30 SEP .unreq vvtmp +eor s_Aba, s_Aba, cur_const SEP .unreq vvtmpq +.endm + + +.macro hybrid_round_final +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP vvtmp .req vBba +eor sC1, sAse, sAge, ROR #60 SEP rax1_m0 E2, C1, C3 +eor sC2, sAmi, sAgi, ROR #59 SEP +eor sC3, sAgo, sAso, ROR #30 SEP rax1_m1 E4, C3, C0 +eor sC4, sAbu, sAsu, ROR #53 SEP rax1_m0 E1, C0, C2 +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP rax1_m1 E3, C2, C4 +eor sC2, sAki, sC2, ROR #26 SEP rax1_m0 E0, C4, C1 +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP .unreq vvtmp +eor sC0, sAga, sC0, ROR #57 SEP vvtmp .req C1 +eor sC1, sAme, sC1, ROR #58 SEP +eor sC2, sAbi, sC2, ROR #60 SEP vvtmpq .req C1q +eor sC3, sAko, sC3, 
ROR #38 SEP eor vBba.16b, vAba.16b, E0.16b +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP xar_m1 vBsa, vAbi, E2, 2 +eor sC1, sAke, sC1, ROR #57 SEP xar_m0 vBbi, vAki, E2, 21 +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP xar_m1 vBki, vAko, E3, 39 +eor sC4, sAku, sC4, ROR #50 SEP xar_m0 vBko, vAmu, E4, 56 +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP xar_m1 vBmu, vAso, E3, 8 +ror sC2, sC2, 62 SEP xar_m0 vBso, vAma, E0, 23 +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP xar_m1 vBka, vAbe, E1, 63 +eor sE0, sC4, sC1, ROR #63 SEP xar_m0 vBse, vAgo, E3, 9 +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP xar_m1 vBgo, vAme, E1, 19 +eor s_Aba_, sE0, s_Aba SEP xar_m0 vBke, vAgi, E2, 58 +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP xar_m1 vBgi, vAka, E0, 61 +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP xar_m0 vBga, vAbo, E3, 36 +eor sAmu_, sE3, sAso, ROR #2 SEP xar_m1 vBbo, vAmo, E3, 43 +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP xar_m0 vBmo, vAmi, E2, 49 +eor sAse_, sE3, sAgo, ROR #36 SEP xar_m1 vBmi, vAke, E1, 54 +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP xar_m0 vBge, vAgu, E4, 44 +eor sAgi_, sE0, sAka, ROR #39 SEP mov E3.16b, vAga.16b +eor sAga_, sE3, sAbo SEP +eor sAbo_, sE3, sAmo, ROR #37 SEP bcax_m1 vAga, vBga, vBgi, vBge +eor sAmo_, sE2, sAmi, ROR #8 SEP xar_m0 vBgu, vAsi, E2, 3 +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP xar_m1 vBsi, vAku, E4, 25 +eor sAgu_, sE2, sAsi, ROR #62 SEP xar_m0 vBku, vAsa, E0, 46 +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP xar_m1 vBma, vAbu, E4, 37 +eor sAma_, sE4, sAbu, ROR #20 SEP xar_m0 vBbu, vAsu, E4, 50 +eor sAbu_, sE4, sAsu, ROR #9 SEP +eor sAsu_, sE1, sAse, ROR #23 SEP xar_m1 vBsu, vAse, E1, 62 +eor sAme_, sE0, sAga, ROR #61 SEP xar_m0 vBme, E3, E0, 28 +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP xar_m1 
vBbe, vAge, E1, 20 +restore count, STACK_OFFSET_COUNT SEP bcax_m1 vAge, vBge, vBgo, vBgi +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP bcax_m0 vAgi, vBgi, vBgu, vBgo +bic tmp, sAgo_, sAgi_, ROR #42 SEP bcax_m1 vAgo, vBgo, vBga, vBgu +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP bcax_m0 vAgu, vBgu, vBge, vBga +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP bcax_m1 vAka, vBka, vBki, vBke +eor sAgo, tmp, sAgo_, ROR #47 SEP bcax_m0 vAke, vBke, vBko, vBki +bic tmp, sAge_, sAga_, ROR #56 SEP +eor sAgu, tmp, sAgu_, ROR #23 SEP .unreq vvtmp +bic tmp, sAki_, sAke_, ROR #19 SEP .unreq vvtmpq +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP eor2 C0, vAka, vAga +eor sAke, tmp, sAke_, ROR #2 SEP save(vAga) +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP vvtmp .req vAga +bic tmp, sAka_, sAku_, ROR #47 SEP vvtmpq .req vAgaq +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP bcax_m0 vAki, vBki, vBku, vBko +eor sAku, tmp, sAku_, ROR #52 SEP bcax_m1 vAko, vBko, vBka, vBku +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP eor2 C1, vAke, vAge +bic tmp, sAmo_, sAmi_, ROR #5 SEP bcax_m0 vAku, vBku, vBke, vBka +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP eor2 C2, vAki, vAgi +eor sAmi, tmp, sAmi_, ROR #46 SEP bcax_m1 vAma, vBma, vBmi, vBme +bic tmp, sAma_, sAmu_, ROR #35 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP eor2 C3, vAko, vAgo +add count, count, #1 SEP bcax_m0 vAme, vBme, vBmo, vBmi +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP eor2 C4, vAku, vAgu +eor sAmu, tmp, sAmu_, ROR #44 SEP bcax_m1 vAmi, vBmi, vBmu, vBmo +bic tmp, sAsi_, sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP eor2 C0, C0, vAma +bic tmp, sAso_, sAsi_, ROR #2 SEP bcax_m0 vAmo, vBmo, vBma, vBmu +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP eor2 C1, C1, vAme +eor sAsi, tmp, 
sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP bcax_m1 vAmu, vBmu, vBme, vBma +eor sAso, tmp, sAso_, ROR #21 SEP eor2 C2, C2, vAmi +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP bcax_m0 vAsa, vBsa, vBsi, vBse +bic tmp, sAbi_, sAbe_, ROR #63 SEP eor2 C3, C3, vAmo +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP bcax_m1 vAse, vBse, vBso, vBsi +eor sAbe, tmp, sAbe_, ROR #41 SEP eor2 C4, C4, vAmu +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP bcax_m0 vAsi, vBsi, vBsu, vBso +bic tmp, s_Aba_, sAbu_, ROR #50 SEP eor2 C0, C0, vAsa +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP bcax_m1 vAso, vBso, vBsa, vBsu +eor sAbu, tmp, sAbu_, ROR #30 SEP eor2 C1, C1, vAse +eor s_Aba, s_Aba, cur_const SEP +save count, STACK_OFFSET_COUNT SEP bcax_m0 vAsu, vBsu, vBse, vBsa +eor sC0, sAka, sAsa, ROR #50 SEP eor2 C2, C2, vAsi +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP eor2 C3, C3, vAso +eor sC3, sAgo, sAso, ROR #30 SEP bcax_m1 vAba, vBba, vBbi, vBbe +eor sC4, sAbu, sAsu, ROR #53 SEP +eor sC0, sAma, sC0, ROR #49 SEP bcax_m0 vAbe, vBbe, vBbo, vBbi +eor sC1, sAbe, sC1, ROR #44 SEP eor2 C1, C1, vAbe +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 SEP restore x30, STACK_OFFSET_CONST +eor sC4, sAmu, sC4, ROR #56 SEP ldr vvtmpq, [x30], #16 +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP save x30, STACK_OFFSET_CONST +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP eor vAba.16b, vAba.16b, vvtmp.16b +eor sC4, sAgu, sC4, ROR #48 SEP eor2 C4, C4, vAsu +eor sC0, s_Aba, sC0, ROR #61 SEP +eor sC1, sAke, sC1, ROR #57 SEP bcax_m0 vAbi, vBbi, vBbu, vBbo +eor sC2, sAsi, sC2, ROR #52 SEP bcax_m1 vAbo, vBbo, vBba, vBbu +eor sC3, sAbo, sC3, ROR #63 SEP +eor sC4, sAku, sC4, ROR #50 SEP eor2 C3, C3, vAbo +ror sC1, sC1, 56 SEP eor2 C2, C2, vAbi +ror sC4, sC4, 58 SEP +ror sC2, sC2, 62 SEP eor2 C0, C0, vAba +eor sE1, sC0, sC2, ROR #63 SEP 
bcax_m0 vAbu, vBbu, vBbe, vBba +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP eor2 C4, C4, vAbu +eor sE2, sC1, sC3, ROR #63 SEP restore(vAga) +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP .unreq vvtmp +eor sAsa_, sE2, sAbi, ROR #50 SEP .unreq vvtmpq +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP vvtmp .req vBba +eor sAko_, sE4, sAmu, ROR #28 SEP rax1_m0 E2, C1, C3 +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP rax1_m1 E4, C3, C0 +eor sAka_, sE1, sAbe, ROR #43 SEP rax1_m0 E1, C0, C2 +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP rax1_m1 E3, C2, C4 +eor sAke_, sE2, sAgi, ROR #3 SEP rax1_m0 E0, C4, C1 +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP .unreq vvtmp +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP vvtmp .req C1 +eor sAmi_, sE1, sAke, ROR #56 SEP vvtmpq .req C1q +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP eor vBba.16b, vAba.16b, E0.16b +eor sAsi_, sE4, sAku, ROR #58 SEP xar_m0 vBsa, vAbi, E2, 2 +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP xar_m1 vBbi, vAki, E2, 21 +eor sAbu_, sE4, sAsu, ROR #9 SEP xar_m0 vBki, vAko, E3, 39 +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP xar_m1 vBko, vAmu, E4, 56 +eor sAbe_, sE1, sAge, ROR #19 SEP xar_m0 vBmu, vAso, E3, 8 +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP xar_m1 vBso, vAma, E0, 23 +bic tmp, sAgi_, sAge_, ROR #47 SEP xar_m0 vBka, vAbe, E1, 63 +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP xar_m1 vBse, vAgo, E3, 9 +eor sAge, tmp, sAge_, ROR #25 SEP xar_m0 vBgo, vAme, E1, 19 +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP xar_m1 vBke, vAgi, E2, 58 +bic tmp, sAga_, sAgu_, ROR #31 SEP xar_m0 vBgi, vAka, E0, 61 +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP xar_m1 vBga, vAbo, E3, 36 +eor sAgu, tmp, sAgu_, ROR #23 SEP xar_m0 vBbo, 
vAmo, E3, 43 +bic tmp, sAki_, sAke_, ROR #19 SEP +eor sAka, tmp, sAka_, ROR #24 SEP xar_m1 vBmo, vAmi, E2, 49 +bic tmp, sAko_, sAki_, ROR #47 SEP xar_m0 vBmi, vAke, E1, 54 +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP xar_m1 vBge, vAgu, E4, 44 +eor sAki, tmp, sAki_, ROR #57 SEP mov E3.16b, vAga.16b +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP bcax_m1 vAga, vBga, vBgi, vBge +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP xar_m0 vBgu, vAsi, E2, 3 +bic tmp, sAmi_, sAme_, ROR #38 SEP xar_m1 vBsi, vAku, E4, 25 +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP xar_m0 vBku, vAsa, E0, 46 +eor sAme, tmp, sAme_, ROR #43 SEP xar_m1 vBma, vAbu, E4, 37 +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP xar_m0 vBbu, vAsu, E4, 50 +bic tmp, sAma_, sAmu_, ROR #35 SEP xar_m1 vBsu, vAse, E1, 62 +ldr cur_const, [const_addr, count, UXTW #3] SEP +add count, count, #1 SEP xar_m0 vBme, E3, E0, 28 +eor sAmo, tmp, sAmo_, ROR #12 SEP xar_m1 vBbe, vAge, E1, 20 +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP bcax_m0 vAge, vBge, vBgo, vBgi +bic tmp, sAsi_, sAse_, ROR #48 SEP bcax_m1 vAgi, vBgi, vBgu, vBgo +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP bcax_m0 vAgo, vBgo, vBga, vBgu +eor sAse, tmp, sAse_, ROR #50 SEP bcax_m1 vAgu, vBgu, vBge, vBga +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP bcax_m0 vAka, vBka, vBki, vBke +bic tmp, sAsa_, sAsu_, ROR #60 SEP bcax_m1 vAke, vBke, vBko, vBki +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP bcax_m0 vAki, vBki, vBku, vBko +eor sAsu, tmp, sAsu_, ROR #53 SEP bcax_m1 vAko, vBko, vBka, vBku +bic tmp, sAbi_, sAbe_, ROR #63 SEP +eor s_Aba, s_Aba_, tmp, ROR #21 SEP bcax_m0 vAku, vBku, vBke, vBka +bic tmp, sAbo_, sAbi_, ROR #42 SEP bcax_m1 vAma, vBma, vBmi, vBme +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP bcax_m0 vAme, vBme, vBmo, 
vBmi +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP bcax_m1 vAmi, vBmi, vBmu, vBmo +eor sAbo, tmp, sAbo_, ROR #43 SEP bcax_m0 vAmo, vBmo, vBma, vBmu +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP bcax_m1 vAmu, vBmu, vBme, vBma +eor s_Aba, s_Aba, cur_const SEP bcax_m0 vAsa, vBsa, vBsi, vBse +ror sAga, sAga,(64-3) SEP +ror sAka, sAka,(64-25) SEP bcax_m1 vAse, vBse, vBso, vBsi +ror sAma, sAma,(64-10) SEP bcax_m0 vAsi, vBsi, vBsu, vBso +ror sAsa, sAsa,(64-39) SEP +ror sAbe, sAbe,(64-21) SEP bcax_m1 vAso, vBso, vBsa, vBsu +ror sAge, sAge,(64-45) SEP bcax_m0 vAsu, vBsu, vBse, vBsa +ror sAke, sAke,(64-8) SEP +ror sAme, sAme,(64-15) SEP bcax_m1 vAba, vBba, vBbi, vBbe +ror sAse, sAse,(64-41) SEP bcax_m0 vAbe, vBbe, vBbo, vBbi +ror sAbi, sAbi,(64-14) SEP +ror sAgi, sAgi,(64-61) SEP bcax_m1 vAbi, vBbi, vBbu, vBbo +ror sAki, sAki,(64-18) SEP bcax_m0 vAbo, vBbo, vBba, vBbu +ror sAmi, sAmi,(64-56) SEP +ror sAsi, sAsi,(64-2) SEP bcax_m1 vAbu, vBbu, vBbe, vBba +ror sAgo, sAgo,(64-28) SEP +ror sAko, sAko,(64-1) SEP +ror sAmo, sAmo,(64-27) SEP restore x26, STACK_OFFSET_CONST +ror sAso, sAso,(64-62) SEP ldr vvtmpq, [x26], #16 +ror sAbu, sAbu,(64-44) SEP +ror sAgu, sAgu,(64-20) SEP save x26, STACK_OFFSET_CONST +ror sAku, sAku,(64-6) SEP eor vAba.16b, vAba.16b, vvtmp.16b +ror sAmu, sAmu,(64-36) SEP .unreq vvtmp +ror sAsu, sAsu,(64-55) SEP .unreq vvtmpq +.endm + + +/* Round count; the main-loop bound further down is derived from it. */ +#define KECCAK_F1600_ROUNDS 24 + +.global keccak_f1600_x3_hybrid_asm_v6 +.global _keccak_f1600_x3_hybrid_asm_v6 +.text +.align 4 +/* keccak_f1600_x3_hybrid_asm_v6 (also exported with a leading underscore). Takes the state buffer address in input_addr. Flow: allocate stack, save registers and input_addr; pass round_constants_vec to ASM_LOAD with const_addr as destination and stash const_addr in the STACK_OFFSET_CONST slot (the round macros reload that slot, read 16 bytes with post-increment and write the pointer back); run load_input_vector at input_addr and load_input_scalar at input_addr+400; expand hybrid_round_initial once, hybrid_round_noninitial in a loop and hybrid_round_final once; store both parts back at the same two offsets; restore registers and return. NOTE(review): the +400 split presumably means two 200-byte states handled in the vector lanes followed by a third, scalar state - confirm against load_input_vector/load_input_scalar in macros.s. */ +keccak_f1600_x3_hybrid_asm_v6: +_keccak_f1600_x3_hybrid_asm_v6: + alloc_stack + save_gprs + save_vregs + save input_addr, STACK_OFFSET_INPUT /* input_addr is advanced below; this copy is reloaded before the stores */ + + + ASM_LOAD(const_addr,round_constants_vec) + + save const_addr, STACK_OFFSET_CONST /* slot the round macros use for the vector-side constant pointer */ + load_input_vector 1,0 + + add input_addr, input_addr, #400 /* scalar-side data starts 400 bytes into the buffer */ + load_input_scalar 1,0 + hybrid_round_initial + loop_0: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-3) + ble loop_0 /* hybrid_round_noninitial increments count twice per expansion; repeat while count <= 21 */ + + hybrid_round_final + +
restore input_addr, STACK_OFFSET_INPUT /* reload the pointer saved at entry */ + store_input_vector 1,0 + add input_addr, input_addr, #400 /* same +400 offset as used for load_input_scalar */ + store_input_scalar 1,0 + + restore_vregs + restore_gprs + free_stack + + + ret +#endif \ No newline at end of file diff --git a/asm/manual/keccak_f1600/keccak_f1600_x3_hybrid_asm_v7.s b/asm/manual/keccak_f1600/keccak_f1600_x3_hybrid_asm_v7.s new file mode 100644 index 0000000..559b9f2 --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x3_hybrid_asm_v7.s @@ -0,0 +1,924 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" +#if defined(__ARM_FEATURE_SHA3) + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +round_constants_vec: + .quad 0x0000000000000001 + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x800000008000000a + .quad 
0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + .quad 0x8000000080008008 +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x26 + cur_const .req x26 + count .req w27 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. */ + vAba .req v0 + vAbe .req v1 + vAbi .req v2 + vAbo .req v3 + vAbu .req v4 + vAga .req v5 + vAge .req v6 + vAgi .req v7 + vAgo .req v8 + vAgu .req v9 + vAka .req v10 + vAke .req v11 + vAki .req v12 + vAko .req v13 + vAku .req v14 + vAma .req v15 + vAme .req v16 + vAmi .req v17 + vAmo .req v18 + vAmu .req v19 + vAsa .req v20 + vAse .req v21 + vAsi .req v22 + vAso .req v23 + vAsu .req v24 + + /* q-form of the above mapping */ + vAbaq .req q0 + vAbeq .req q1 + vAbiq .req q2 + vAboq .req q3 + vAbuq .req q4 + vAgaq .req q5 + vAgeq .req q6 + vAgiq .req q7 + vAgoq .req q8 + vAguq .req q9 + vAkaq .req q10 + vAkeq .req q11 + vAkiq .req q12 + vAkoq .req q13 + vAkuq .req q14 + vAmaq .req q15 + vAmeq .req q16 + vAmiq .req q17 + vAmoq .req q18 + vAmuq .req q19 + vAsaq .req q20 + vAseq .req q21 + vAsiq .req q22 + vAsoq .req q23 + vAsuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v30 + C1 .req v29 + C2 .req v28 + C3 .req v27 + C4 .req v26 + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req v26 + E1 .req v25 + E2 .req v29 + E3 .req v28 + E4 .req v27 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vAbi_ .req v2 + vAbo_ .req v3 + vAbu_ .req v4 + vAga_ .req v10 + vAge_ .req v11 + vAgi_ .req v7 + vAgo_ .req v8 + vAgu_ .req v9 + vAka_ .req v15 + vAke_ .req v16 + vAki_ .req v12 + vAko_ .req v13 + vAku_ .req v14 + vAma_ .req v20 + vAme_ .req v21 + vAmi_ .req v17 + vAmo_ .req v18 + vAmu_ .req v19 + vAsa_ .req v0 + vAse_ .req v1 + vAsi_ .req v22 + vAso_ .req v23 + vAsu_ .req
v24 + vAba_ .req v30 + vAbe_ .req v27 + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round. */ + s_Aba .req x1 + sAbe .req x6 + sAbi .req x11 + sAbo .req x16 + sAbu .req x21 + sAga .req x2 + sAge .req x7 + sAgi .req x12 + sAgo .req x17 + sAgu .req x22 + sAka .req x3 + sAke .req x8 + sAki .req x13 + sAko .req x18 + sAku .req x23 + sAma .req x4 + sAme .req x9 + sAmi .req x14 + sAmo .req x19 + sAmu .req x24 + sAsa .req x5 + sAse .req x10 + sAsi .req x15 + sAso .req x20 + sAsu .req x25 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + s_Aba_ .req x30 + sAbe_ .req x28 + sAbi_ .req x11 + sAbo_ .req x16 + sAbu_ .req x21 + sAga_ .req x3 + sAge_ .req x8 + sAgi_ .req x12 + sAgo_ .req x17 + sAgu_ .req x22 + sAka_ .req x4 + sAke_ .req x9 + sAki_ .req x13 + sAko_ .req x18 + sAku_ .req x23 + sAma_ .req x5 + sAme_ .req x10 + sAmi_ .req x14 + sAmo_ .req x19 + sAmu_ .req x24 + sAsa_ .req x1 + sAse_ .req x6 + sAsi_ .req x15 + sAso_ .req x20 + sAsu_ .req x25 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + sC0 .req x30 + sE0 .req x29 + sC1 .req x26 + sE1 .req x0 + sC2 .req x27 + sE2 .req x26 + sC3 .req x28 + sE3 .req x27 + sC4 .req x29 + sE4 .req x28 + + tmp .req x0 + +/************************ MACROS ****************************/ + +/* Macros using v8.4-A SHA-3 instructions */ + + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm + xar \d\().2d, \s0\().2d, \s1\().2d, #\imm +.endm + +.macro rax1_m1 d s0 s1 + xar_m0 tmp, vzr, \s1, 63 + eor \d\().16b, \s0\().16b, tmp.16b +.endm + +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro load_input_vector num idx + ldr vAbaq, [input_addr, #(16*(\num*0+\idx))] + ldr
vAbeq, [input_addr, #(16*(\num*1+\idx))] + ldr vAbiq, [input_addr, #(16*(\num*2+\idx))] + ldr vAboq, [input_addr, #(16*(\num*3+\idx))] + ldr vAbuq, [input_addr, #(16*(\num*4+\idx))] + ldr vAgaq, [input_addr, #(16*(\num*5+\idx))] + ldr vAgeq, [input_addr, #(16*(\num*6+\idx))] + ldr vAgiq, [input_addr, #(16*(\num*7+\idx))] + ldr vAgoq, [input_addr, #(16*(\num*8+\idx))] + ldr vAguq, [input_addr, #(16*(\num*9+\idx))] + ldr vAkaq, [input_addr, #(16*(\num*10+\idx))] + ldr vAkeq, [input_addr, #(16*(\num*11+\idx))] + ldr vAkiq, [input_addr, #(16*(\num*12+\idx))] + ldr vAkoq, [input_addr, #(16*(\num*13+\idx))] + ldr vAkuq, [input_addr, #(16*(\num*14+\idx))] + ldr vAmaq, [input_addr, #(16*(\num*15+\idx))] + ldr vAmeq, [input_addr, #(16*(\num*16+\idx))] + ldr vAmiq, [input_addr, #(16*(\num*17+\idx))] + ldr vAmoq, [input_addr, #(16*(\num*18+\idx))] + ldr vAmuq, [input_addr, #(16*(\num*19+\idx))] + ldr vAsaq, [input_addr, #(16*(\num*20+\idx))] + ldr vAseq, [input_addr, #(16*(\num*21+\idx))] + ldr vAsiq, [input_addr, #(16*(\num*22+\idx))] + ldr vAsoq, [input_addr, #(16*(\num*23+\idx))] + ldr vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_vector num idx + str vAbaq, [input_addr, #(16*(\num*0+\idx))] + str vAbeq, [input_addr, #(16*(\num*1+\idx))] + str vAbiq, [input_addr, #(16*(\num*2+\idx))] + str vAboq, [input_addr, #(16*(\num*3+\idx))] + str vAbuq, [input_addr, #(16*(\num*4+\idx))] + str vAgaq, [input_addr, #(16*(\num*5+\idx))] + str vAgeq, [input_addr, #(16*(\num*6+\idx))] + str vAgiq, [input_addr, #(16*(\num*7+\idx))] + str vAgoq, [input_addr, #(16*(\num*8+\idx))] + str vAguq, [input_addr, #(16*(\num*9+\idx))] + str vAkaq, [input_addr, #(16*(\num*10+\idx))] + str vAkeq, [input_addr, #(16*(\num*11+\idx))] + str vAkiq, [input_addr, #(16*(\num*12+\idx))] + str vAkoq, [input_addr, #(16*(\num*13+\idx))] + str vAkuq, [input_addr, #(16*(\num*14+\idx))] + str vAmaq, [input_addr, #(16*(\num*15+\idx))] + str vAmeq, [input_addr, #(16*(\num*16+\idx))] + str vAmiq, 
[input_addr, #(16*(\num*17+\idx))] + str vAmoq, [input_addr, #(16*(\num*18+\idx))] + str vAmuq, [input_addr, #(16*(\num*19+\idx))] + str vAsaq, [input_addr, #(16*(\num*20+\idx))] + str vAseq, [input_addr, #(16*(\num*21+\idx))] + str vAsiq, [input_addr, #(16*(\num*22+\idx))] + str vAsoq, [input_addr, #(16*(\num*23+\idx))] + str vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_scalar num idx + str s_Aba, [input_addr, 8*(\num*(0) +\idx)] + str sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + str sAbi, [input_addr, 8*(\num*(2)+ \idx)] + str sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + str sAbu, [input_addr, 8*(\num*(4)+ \idx)] + str sAga, [input_addr, 8*(\num*(4+1) +\idx)] + str sAge, [input_addr, 8*(\num*(6)+ \idx)] + str sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + str sAgo, [input_addr, 8*(\num*(8)+ \idx)] + str sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + str sAka, [input_addr, 8*(\num*(10) +\idx)] + str sAke, [input_addr, 8*(\num*(10+1)+\idx)] + str sAki, [input_addr, 8*(\num*(12) +\idx)] + str sAko, [input_addr, 8*(\num*(12+1)+\idx)] + str sAku, [input_addr, 8*(\num*(14) +\idx)] + str sAma, [input_addr, 8*(\num*(14+1)+\idx)] + str sAme, [input_addr, 8*(\num*(16) +\idx)] + str sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + str sAmo, [input_addr, 8*(\num*(18) +\idx)] + str sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + str sAsa, [input_addr, 8*(\num*(20) +\idx)] + str sAse, [input_addr, 8*(\num*(20+1)+\idx)] + str sAsi, [input_addr, 8*(\num*(22) +\idx)] + str sAso, [input_addr, 8*(\num*(22+1)+\idx)] + str sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +.macro load_input_scalar num idx + ldr s_Aba, [input_addr, 8*(\num*(0) +\idx)] + ldr sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + ldr sAbi, [input_addr, 8*(\num*(2)+ \idx)] + ldr sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + ldr sAbu, [input_addr, 8*(\num*(4)+ \idx)] + ldr sAga, [input_addr, 8*(\num*(4+1) +\idx)] + ldr sAge, [input_addr, 8*(\num*(6)+ \idx)] + ldr sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + ldr sAgo, 
[input_addr, 8*(\num*(8)+ \idx)] + ldr sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + ldr sAka, [input_addr, 8*(\num*(10) +\idx)] + ldr sAke, [input_addr, 8*(\num*(10+1)+\idx)] + ldr sAki, [input_addr, 8*(\num*(12) +\idx)] + ldr sAko, [input_addr, 8*(\num*(12+1)+\idx)] + ldr sAku, [input_addr, 8*(\num*(14) +\idx)] + ldr sAma, [input_addr, 8*(\num*(14+1)+\idx)] + ldr sAme, [input_addr, 8*(\num*(16) +\idx)] + ldr sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + ldr sAmo, [input_addr, 8*(\num*(18) +\idx)] + ldr sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + ldr sAsa, [input_addr, 8*(\num*(20) +\idx)] + ldr sAse, [input_addr, 8*(\num*(20+1)+\idx)] + ldr sAsi, [input_addr, 8*(\num*(22) +\idx)] + ldr sAso, [input_addr, 8*(\num*(22+1)+\idx)] + ldr sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +#define STACK_SIZE (8*8 + 16*6 + 4*8 + 16*5) // VREGS (8*8), GPRs (16*6), input/const/count/input_scalar slots (4*8), vector spill area for save(name)/restore(name) (16*5) +#define STACK_BASE_GPRS (4*8) +#define STACK_BASE_VREGS (4*8+16*6) +#define STACK_BASE_TMP (8*8 + 16*6 + 4*8) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) +#define STACK_OFFSET_INPUT_SCALAR (3*8) + +#define vAga_offset 0 +#define vAge_offset 1 +#define vAgi_offset 2 +#define vAgo_offset 3 +#define vAgu_offset 4 + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] + + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27,
x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro save_vregs + stp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] + stp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + stp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + stp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] +.endm + +.macro restore_vregs + ldp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] + ldp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + ldp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + ldp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] +.endm + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro eor5 dst, src0, src1, src2, src3, src4 + eor \dst, \src0, \src1 + eor \dst, \dst, \src2 + eor \dst, \dst, \src3 + eor \dst, \dst, \src4 +.endm + +.macro xor_rol dst, src1, src0, imm + eor \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro bic_rol dst, src1, src0, imm + bic \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro rotate dst, src, imm + ror \dst, \src, #(64-\imm) +.endm + +.macro save reg, offset + str \reg, [sp, #\offset] +.endm + +.macro restore reg, offset + ldr \reg, [sp, #\offset] +.endm + +.macro hybrid_round_initial +eor sC0, sAma, sAsa SEP +eor sC1, sAme, sAse SEP eor3_m0 C0, vAba, vAga, vAka +eor sC2, sAmi, sAsi SEP eor3_m0 C1, vAbe, vAge, vAke +eor sC3, sAmo, sAso SEP eor3_m0 C2, vAbi, vAgi, vAki +eor sC4, sAmu, sAsu SEP eor3_m0 C3, vAbo, vAgo, vAko +eor sC0, sAka, sC0 SEP eor3_m0 C4, vAbu, vAgu, vAku +eor sC1, sAke, sC1 SEP save(vAga) +eor sC2, sAki, sC2 SEP +eor sC3, sAko, sC3 SEP vzr .req vAga +eor sC4, sAku, sC4 SEP eor vzr.16b, vzr.16b, vzr.16b +eor sC0, sAga, sC0 SEP save(vAge) +eor sC1, sAge, sC1 SEP save(vAgi) +eor sC2, sAgi, sC2 SEP save(vAgo) +eor sC3, sAgo, sC3 SEP save(vAgu) +eor sC4, sAgu, sC4 SEP +eor sC0, s_Aba, sC0 SEP C0r .req vAge +eor sC1, sAbe, sC1 SEP C1r .req vAgi +eor sC2, sAbi, sC2 SEP C2r .req vAgo +eor sC3, sAbo, sC3 SEP C3r .req vAgu +eor sC4, sAbu, sC4 SEP C4r .req v31 +eor sE1, sC0, sC2, 
ROR #63 SEP eor3_m0 C0, C0, vAma, vAsa +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP eor3_m0 C1, C1, vAme, vAse +eor sE2, sC1, sC3, ROR #63 SEP eor3_m0 C2, C2, vAmi, vAsi +eor sE4, sC3, sC0, ROR #63 SEP eor3_m0 C3, C3, vAmo, vAso +eor s_Aba_, s_Aba, sE0 SEP eor3_m0 C4, C4, vAmu, vAsu +eor sAsa_, sAbi, sE2 SEP xar_m0 C2r, vzr, C2, 63 +eor sAbi_, sAki, sE2 SEP +eor sAki_, sAko, sE3 SEP xar_m0 C4r, vzr, C4, 63 +eor sAko_, sAmu, sE4 SEP xar_m0 C1r, vzr, C1, 63 +eor sAmu_, sAso, sE3 SEP xar_m0 C3r, vzr, C3, 63 +eor sAso_, sAma, sE0 SEP xar_m0 C0r, vzr, C0, 63 +eor sAka_, sAbe, sE1 SEP eor2 E1, C0, C2r +eor sAse_, sAgo, sE3 SEP restore(vAgo) +eor sAgo_, sAme, sE1 SEP +eor sAke_, sAgi, sE2 SEP eor2 E3, C2, C4r +eor sAgi_, sAka, sE0 SEP restore(vAga) +eor sAga_, sAbo, sE3 SEP eor2 E0, C4, C1r +eor sAbo_, sAmo, sE3 SEP restore(vAgi) +eor sAmo_, sAmi, sE2 SEP eor2 E2, C1, C3r +eor sAmi_, sAke, sE1 SEP restore(vAgu) +eor sAge_, sAgu, sE4 SEP +eor sAgu_, sAsi, sE2 SEP eor2 E4, C3, C0r +eor sAsi_, sAku, sE4 SEP restore(vAge) +eor sAku_, sAsa, sE0 SEP eor vAba_.16b, vAba.16b, E0.16b +eor sAma_, sAbu, sE4 SEP xar_m0 vAsa_, vAbi, E2, 2 +eor sAbu_, sAsu, sE4 SEP xar_m0 vAbi_, vAki, E2, 21 +eor sAsu_, sAse, sE1 SEP +eor sAme_, sAga, sE0 SEP xar_m0 vAki_, vAko, E3, 39 +eor sAbe_, sAge, sE1 SEP xar_m0 vAko_, vAmu, E4, 56 +load_constant_ptr SEP xar_m0 vAmu_, vAso, E3, 8 +tmp0 .req x0 SEP xar_m0 vAso_, vAma, E0, 23 +tmp1 .req x29 SEP xar_m0 vAka_, vAbe, E1, 63 +bic tmp0, sAgi_, sAge_, ROR #47 SEP xar_m0 vAse_, vAgo, E3, 9 +bic tmp1, sAgo_, sAgi_, ROR #42 SEP +eor sAga, tmp0, sAga_, ROR #39 SEP xar_m0 vAgo_, vAme, E1, 19 +bic tmp0, sAgu_, sAgo_, ROR #16 SEP xar_m0 vAke_, vAgi, E2, 58 +eor sAge, tmp1, sAge_, ROR #25 SEP xar_m0 vAgi_, vAka, E0, 61 +bic tmp1, sAga_, sAgu_, ROR #31 SEP xar_m0 vAga_, vAbo, E3, 36 +eor sAgi, tmp0, sAgi_, ROR #58 SEP xar_m0 vAbo_, vAmo, E3, 43 +bic tmp0, sAge_, sAga_, ROR #56 SEP xar_m0 vAmo_, vAmi, E2, 49 +eor sAgo, tmp1, sAgo_, ROR #47 SEP 
+bic tmp1, sAki_, sAke_, ROR #19 SEP xar_m0 vAmi_, vAke, E1, 54 +eor sAgu, tmp0, sAgu_, ROR #23 SEP xar_m0 vAge_, vAgu, E4, 44 +bic tmp0, sAko_, sAki_, ROR #47 SEP xar_m0 vAgu_, vAsi, E2, 3 +eor sAka, tmp1, sAka_, ROR #24 SEP xar_m0 vAsi_, vAku, E4, 25 +bic tmp1, sAku_, sAko_, ROR #10 SEP xar_m0 vAku_, vAsa, E0, 46 +eor sAke, tmp0, sAke_, ROR #2 SEP +bic tmp0, sAka_, sAku_, ROR #47 SEP xar_m0 vAma_, vAbu, E4, 37 +eor sAki, tmp1, sAki_, ROR #57 SEP xar_m0 vAbu_, vAsu, E4, 50 +bic tmp1, sAke_, sAka_, ROR #5 SEP xar_m0 vAsu_, vAse, E1, 62 +eor sAko, tmp0, sAko_, ROR #57 SEP xar_m0 vAme_, vAga, E0, 28 +bic tmp0, sAmi_, sAme_, ROR #38 SEP xar_m0 vAbe_, vAge, E1, 20 +eor sAku, tmp1, sAku_, ROR #52 SEP restore x27, STACK_OFFSET_CONST +bic tmp1, sAmo_, sAmi_, ROR #5 SEP +eor sAma, tmp0, sAma_, ROR #47 SEP ldr q31, [x27], #16 +bic tmp0, sAmu_, sAmo_, ROR #41 SEP save x27, STACK_OFFSET_CONST +eor sAme, tmp1, sAme_, ROR #43 SEP bcax_m0 vAga, vAga_, vAgi_, vAge_ +bic tmp1, sAma_, sAmu_, ROR #35 SEP bcax_m0 vAge, vAge_, vAgo_, vAgi_ +eor sAmi, tmp0, sAmi_, ROR #46 SEP bcax_m0 vAgi, vAgi_, vAgu_, vAgo_ +bic tmp0, sAme_, sAma_, ROR #9 SEP bcax_m0 vAgo, vAgo_, vAga_, vAgu_ +ldr cur_const, [const_addr] SEP +eor sAmo, tmp1, sAmo_, ROR #12 SEP bcax_m0 vAgu, vAgu_, vAge_, vAga_ +bic tmp1, sAsi_, sAse_, ROR #48 SEP bcax_m0 vAka, vAka_, vAki_, vAke_ +eor sAmu, tmp0, sAmu_, ROR #44 SEP bcax_m0 vAke, vAke_, vAko_, vAki_ +bic tmp0, sAso_, sAsi_, ROR #2 SEP bcax_m0 vAki, vAki_, vAku_, vAko_ +eor sAsa, tmp1, sAsa_, ROR #41 SEP bcax_m0 vAko, vAko_, vAka_, vAku_ +bic tmp1, sAsu_, sAso_, ROR #25 SEP +eor sAse, tmp0, sAse_, ROR #50 SEP bcax_m0 vAku, vAku_, vAke_, vAka_ +bic tmp0, sAsa_, sAsu_, ROR #60 SEP bcax_m0 vAma, vAma_, vAmi_, vAme_ +eor sAsi, tmp1, sAsi_, ROR #27 SEP bcax_m0 vAme, vAme_, vAmo_, vAmi_ +bic tmp1, sAse_, sAsa_, ROR #57 SEP bcax_m0 vAmi, vAmi_, vAmu_, vAmo_ +eor sAso, tmp0, sAso_, ROR #21 SEP bcax_m0 vAmo, vAmo_, vAma_, vAmu_ +mov count, #1 SEP bcax_m0 vAmu, vAmu_, vAme_, 
vAma_ +bic tmp0, sAbi_, sAbe_, ROR #63 SEP +eor sAsu, tmp1, sAsu_, ROR #53 SEP bcax_m0 vAsa, vAsa_, vAsi_, vAse_ +bic tmp1, sAbo_, sAbi_, ROR #42 SEP bcax_m0 vAse, vAse_, vAso_, vAsi_ +eor s_Aba, s_Aba_, tmp0, ROR #21 SEP bcax_m0 vAsi, vAsi_, vAsu_, vAso_ +bic tmp0, sAbu_, sAbo_, ROR #57 SEP bcax_m0 vAso, vAso_, vAsa_, vAsu_ +eor sAbe, tmp1, sAbe_, ROR #41 SEP bcax_m0 vAsu, vAsu_, vAse_, vAsa_ +bic tmp1, s_Aba_, sAbu_, ROR #50 SEP bcax_m0 vAba, vAba_, vAbi_, vAbe_ +eor sAbi, tmp0, sAbi_, ROR #35 SEP +bic tmp0, sAbe_, s_Aba_, ROR #44 SEP bcax_m0 vAbe, vAbe_, vAbo_, vAbi_ +eor sAbo, tmp1, sAbo_, ROR #43 SEP bcax_m0 vAbi, vAbi_, vAbu_, vAbo_ +eor sAbu, tmp0, sAbu_, ROR #30 SEP bcax_m0 vAbo, vAbo_, vAba_, vAbu_ +eor s_Aba, s_Aba, cur_const SEP bcax_m0 vAbu, vAbu_, vAbe_, vAba_ +save count, STACK_OFFSET_COUNT SEP eor vAba.16b, vAba.16b, v31.16b +.endm + +.macro hybrid_round_noninitial +eor sC2, sAsi, sAbi, ROR #52 SEP +eor sC0, s_Aba, sAga, ROR #61 SEP eor3_m0 C0, vAba, vAga, vAka +eor sC4, sAku, sAgu, ROR #50 SEP eor3_m0 C1, vAbe, vAge, vAke +eor sC1, sAke, sAme, ROR #57 SEP eor3_m0 C2, vAbi, vAgi, vAki +eor sC3, sAbo, sAko, ROR #63 SEP eor3_m0 C3, vAbo, vAgo, vAko +eor sC2, sC2, sAki, ROR #48 SEP eor3_m0 C4, vAbu, vAgu, vAku +eor sC0, sC0, sAma, ROR #54 SEP +eor sC4, sC4, sAmu, ROR #34 SEP save(vAga) +eor sC1, sC1, sAbe, ROR #51 SEP vzr .req vAga +eor sC3, sC3, sAmo, ROR #37 SEP eor vzr.16b, vzr.16b, vzr.16b +eor sC2, sC2, sAmi, ROR #10 SEP save(vAge) +eor sC0, sC0, sAka, ROR #39 SEP save(vAgi) +eor sC4, sC4, sAbu, ROR #26 SEP +eor sC1, sC1, sAse, ROR #31 SEP save(vAgo) +eor sC3, sC3, sAgo, ROR #36 SEP save(vAgu) +eor sC2, sC2, sAgi, ROR #5 SEP C0r .req vAge +eor sC0, sC0, sAsa, ROR #25 SEP C1r .req vAgi +eor sC4, sC4, sAsu, ROR #15 SEP +eor sC1, sC1, sAge, ROR #27 SEP C2r .req vAgo +eor sC3, sC3, sAso, ROR #2 SEP C3r .req vAgu +eor sE1, sC0, sC2, ROR #61 SEP C4r .req v31 +ror sC2, sC2, 62 SEP eor3_m0 C0, C0, vAma, vAsa +eor sE3, sC2, sC4, ROR #57 SEP eor3_m0 C1, C1, 
vAme, vAse +ror sC4, sC4, 58 SEP +eor sE0, sC4, sC1, ROR #55 SEP eor3_m0 C2, C2, vAmi, vAsi +ror sC1, sC1, 56 SEP eor3_m0 C3, C3, vAmo, vAso +eor sE2, sC1, sC3, ROR #63 SEP eor3_m0 C4, C4, vAmu, vAsu +eor sE4, sC3, sC0, ROR #63 SEP xar_m0 C2r, vzr, C2, 63 +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP xar_m0 C4r, vzr, C4, 63 +eor sAbi_, sE2, sAki, ROR #46 SEP xar_m0 C1r, vzr, C1, 63 +eor sAki_, sE3, sAko, ROR #63 SEP xar_m0 C3r, vzr, C3, 63 +eor sAko_, sE4, sAmu, ROR #28 SEP xar_m0 C0r, vzr, C0, 63 +eor sAmu_, sE3, sAso, ROR #2 SEP eor2 E1, C0, C2r +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP restore(vAgo) +eor sAse_, sE3, sAgo, ROR #36 SEP eor2 E3, C2, C4r +eor sAgo_, sE1, sAme, ROR #49 SEP restore(vAga) +eor sAke_, sE2, sAgi, ROR #3 SEP eor2 E0, C4, C1r +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP restore(vAgi) +eor sAbo_, sE3, sAmo, ROR #37 SEP eor2 E2, C1, C3r +eor sAmo_, sE2, sAmi, ROR #8 SEP restore(vAgu) +eor sAmi_, sE1, sAke, ROR #56 SEP eor2 E4, C3, C0r +eor sAge_, sE4, sAgu, ROR #44 SEP restore(vAge) +eor sAgu_, sE2, sAsi, ROR #62 SEP +eor sAsi_, sE4, sAku, ROR #58 SEP eor vAba_.16b, vAba.16b, E0.16b +eor sAku_, sE0, sAsa, ROR #25 SEP xar_m0 vAsa_, vAbi, E2, 2 +eor sAma_, sE4, sAbu, ROR #20 SEP xar_m0 vAbi_, vAki, E2, 21 +eor sAbu_, sE4, sAsu, ROR #9 SEP xar_m0 vAki_, vAko, E3, 39 +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP xar_m0 vAko_, vAmu, E4, 56 +eor sAbe_, sE1, sAge, ROR #19 SEP xar_m0 vAmu_, vAso, E3, 8 +load_constant_ptr SEP xar_m0 vAso_, vAma, E0, 23 +restore count, STACK_OFFSET_COUNT SEP xar_m0 vAka_, vAbe, E1, 63 +tmp0 .req x0 SEP xar_m0 vAse_, vAgo, E3, 9 +tmp1 .req x29 SEP +bic tmp0, sAgi_, sAge_, ROR #47 SEP xar_m0 vAgo_, vAme, E1, 19 +bic tmp1, sAgo_, sAgi_, ROR #42 SEP xar_m0 vAke_, vAgi, E2, 58 +eor sAga, tmp0, sAga_, ROR #39 SEP xar_m0 vAgi_, vAka, E0, 61 +bic tmp0, sAgu_, sAgo_, ROR #16 SEP xar_m0 vAga_, vAbo, E3, 36 +eor sAge, tmp1, sAge_, ROR #25 SEP 
xar_m0 vAbo_, vAmo, E3, 43 +bic tmp1, sAga_, sAgu_, ROR #31 SEP +eor sAgi, tmp0, sAgi_, ROR #58 SEP xar_m0 vAmo_, vAmi, E2, 49 +bic tmp0, sAge_, sAga_, ROR #56 SEP xar_m0 vAmi_, vAke, E1, 54 +eor sAgo, tmp1, sAgo_, ROR #47 SEP xar_m0 vAge_, vAgu, E4, 44 +bic tmp1, sAki_, sAke_, ROR #19 SEP xar_m0 vAgu_, vAsi, E2, 3 +eor sAgu, tmp0, sAgu_, ROR #23 SEP +bic tmp0, sAko_, sAki_, ROR #47 SEP xar_m0 vAsi_, vAku, E4, 25 +eor sAka, tmp1, sAka_, ROR #24 SEP xar_m0 vAku_, vAsa, E0, 46 +bic tmp1, sAku_, sAko_, ROR #10 SEP xar_m0 vAma_, vAbu, E4, 37 +eor sAke, tmp0, sAke_, ROR #2 SEP xar_m0 vAbu_, vAsu, E4, 50 +bic tmp0, sAka_, sAku_, ROR #47 SEP xar_m0 vAsu_, vAse, E1, 62 +eor sAki, tmp1, sAki_, ROR #57 SEP +bic tmp1, sAke_, sAka_, ROR #5 SEP xar_m0 vAme_, vAga, E0, 28 +eor sAko, tmp0, sAko_, ROR #57 SEP xar_m0 vAbe_, vAge, E1, 20 +bic tmp0, sAmi_, sAme_, ROR #38 SEP +eor sAku, tmp1, sAku_, ROR #52 SEP +bic tmp1, sAmo_, sAmi_, ROR #5 SEP +eor sAma, tmp0, sAma_, ROR #47 SEP +bic tmp0, sAmu_, sAmo_, ROR #41 SEP bcax_m0 vAga, vAga_, vAgi_, vAge_ +eor sAme, tmp1, sAme_, ROR #43 SEP bcax_m0 vAge, vAge_, vAgo_, vAgi_ +bic tmp1, sAma_, sAmu_, ROR #35 SEP bcax_m0 vAgi, vAgi_, vAgu_, vAgo_ +eor sAmi, tmp0, sAmi_, ROR #46 SEP bcax_m0 vAgo, vAgo_, vAga_, vAgu_ +bic tmp0, sAme_, sAma_, ROR #9 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP bcax_m0 vAgu, vAgu_, vAge_, vAga_ +eor sAmo, tmp1, sAmo_, ROR #12 SEP bcax_m0 vAka, vAka_, vAki_, vAke_ +bic tmp1, sAsi_, sAse_, ROR #48 SEP bcax_m0 vAke, vAke_, vAko_, vAki_ +eor sAmu, tmp0, sAmu_, ROR #44 SEP bcax_m0 vAki, vAki_, vAku_, vAko_ +bic tmp0, sAso_, sAsi_, ROR #2 SEP +eor sAsa, tmp1, sAsa_, ROR #41 SEP bcax_m0 vAko, vAko_, vAka_, vAku_ +bic tmp1, sAsu_, sAso_, ROR #25 SEP bcax_m0 vAku, vAku_, vAke_, vAka_ +eor sAse, tmp0, sAse_, ROR #50 SEP bcax_m0 vAma, vAma_, vAmi_, vAme_ +bic tmp0, sAsa_, sAsu_, ROR #60 SEP bcax_m0 vAme, vAme_, vAmo_, vAmi_ +eor sAsi, tmp1, sAsi_, ROR #27 SEP bcax_m0 vAmi, vAmi_, vAmu_, vAmo_ +bic tmp1, sAse_, 
sAsa_, ROR #57 SEP +eor sAso, tmp0, sAso_, ROR #21 SEP bcax_m0 vAmo, vAmo_, vAma_, vAmu_ +bic tmp0, sAbi_, sAbe_, ROR #63 SEP bcax_m0 vAmu, vAmu_, vAme_, vAma_ +add count, count, #1 SEP bcax_m0 vAsa, vAsa_, vAsi_, vAse_ +save count, STACK_OFFSET_COUNT SEP bcax_m0 vAse, vAse_, vAso_, vAsi_ +//TODO: schedule this better SEP +restore x27, STACK_OFFSET_CONST SEP +ldr q31, [x27], #16 SEP +save x27, STACK_OFFSET_CONST SEP +eor sAsu, tmp1, sAsu_, ROR #53 SEP +bic tmp1, sAbo_, sAbi_, ROR #42 SEP bcax_m0 vAsi, vAsi_, vAsu_, vAso_ +eor s_Aba, s_Aba_, tmp0, ROR #21 SEP bcax_m0 vAso, vAso_, vAsa_, vAsu_ +bic tmp0, sAbu_, sAbo_, ROR #57 SEP bcax_m0 vAsu, vAsu_, vAse_, vAsa_ +eor sAbe, tmp1, sAbe_, ROR #41 SEP bcax_m0 vAba, vAba_, vAbi_, vAbe_ +bic tmp1, s_Aba_, sAbu_, ROR #50 SEP bcax_m0 vAbe, vAbe_, vAbo_, vAbi_ +eor sAbi, tmp0, sAbi_, ROR #35 SEP +bic tmp0, sAbe_, s_Aba_, ROR #44 SEP bcax_m0 vAbi, vAbi_, vAbu_, vAbo_ +eor sAbo, tmp1, sAbo_, ROR #43 SEP bcax_m0 vAbo, vAbo_, vAba_, vAbu_ +eor sAbu, tmp0, sAbu_, ROR #30 SEP bcax_m0 vAbu, vAbu_, vAbe_, vAba_ +eor s_Aba, s_Aba, cur_const SEP eor vAba.16b, vAba.16b, v31.16b +.endm + + +.macro hybrid_round_final +eor sC2, sAsi, sAbi, ROR #52 SEP +eor sC0, s_Aba, sAga, ROR #61 SEP eor3_m0 C0, vAba, vAga, vAka +eor sC4, sAku, sAgu, ROR #50 SEP eor3_m0 C1, vAbe, vAge, vAke +eor sC1, sAke, sAme, ROR #57 SEP eor3_m0 C2, vAbi, vAgi, vAki +eor sC3, sAbo, sAko, ROR #63 SEP +eor sC2, sC2, sAki, ROR #48 SEP eor3_m0 C3, vAbo, vAgo, vAko +eor sC0, sC0, sAma, ROR #54 SEP eor3_m0 C4, vAbu, vAgu, vAku +eor sC4, sC4, sAmu, ROR #34 SEP +eor sC1, sC1, sAbe, ROR #51 SEP save(vAga) +eor sC3, sC3, sAmo, ROR #37 SEP vzr .req vAga +eor sC2, sC2, sAmi, ROR #10 SEP +eor sC0, sC0, sAka, ROR #39 SEP eor vzr.16b, vzr.16b, vzr.16b +eor sC4, sC4, sAbu, ROR #26 SEP save(vAge) +eor sC1, sC1, sAse, ROR #31 SEP +eor sC3, sC3, sAgo, ROR #36 SEP save(vAgi) +eor sC2, sC2, sAgi, ROR #5 SEP save(vAgo) +eor sC0, sC0, sAsa, ROR #25 SEP +eor sC4, sC4, sAsu, ROR #15 SEP 
save(vAgu) +eor sC1, sC1, sAge, ROR #27 SEP C0r .req vAge +eor sC3, sC3, sAso, ROR #2 SEP +eor sE1, sC0, sC2, ROR #61 SEP C1r .req vAgi +ror sC2, sC2, 62 SEP C2r .req vAgo +eor sE3, sC2, sC4, ROR #57 SEP +ror sC4, sC4, 58 SEP C3r .req vAgu +eor sE0, sC4, sC1, ROR #55 SEP C4r .req v31 +ror sC1, sC1, 56 SEP +eor sE2, sC1, sC3, ROR #63 SEP eor3_m0 C0, C0, vAma, vAsa +eor sE4, sC3, sC0, ROR #63 SEP eor3_m0 C1, C1, vAme, vAse +eor s_Aba_, sE0, s_Aba SEP eor3_m0 C2, C2, vAmi, vAsi +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP eor3_m0 C3, C3, vAmo, vAso +eor sAki_, sE3, sAko, ROR #63 SEP eor3_m0 C4, C4, vAmu, vAsu +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP xar_m0 C2r, vzr, C2, 63 +eor sAso_, sE0, sAma, ROR #54 SEP xar_m0 C4r, vzr, C4, 63 +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP xar_m0 C1r, vzr, C1, 63 +eor sAgo_, sE1, sAme, ROR #49 SEP xar_m0 C3r, vzr, C3, 63 +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP xar_m0 C0r, vzr, C0, 63 +eor sAga_, sE3, sAbo SEP eor2 E1, C0, C2r +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP restore(vAgo) +eor sAmi_, sE1, sAke, ROR #56 SEP eor2 E3, C2, C4r +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP restore(vAga) +eor sAsi_, sE4, sAku, ROR #58 SEP eor2 E0, C4, C1r +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP restore(vAgi) +eor sAbu_, sE4, sAsu, ROR #9 SEP eor2 E2, C1, C3r +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP restore(vAgu) +eor sAbe_, sE1, sAge, ROR #19 SEP eor2 E4, C3, C0r +load_constant_ptr SEP +tmp0 .req x0 SEP restore(vAge) +tmp1 .req x29 SEP eor vAba_.16b, vAba.16b, E0.16b +bic tmp0, sAgi_, sAge_, ROR #47 SEP xar_m0 vAsa_, vAbi, E2, 2 +bic tmp1, sAgo_, sAgi_, ROR #42 SEP +eor sAga, tmp0, sAga_, ROR #39 SEP xar_m0 vAbi_, vAki, E2, 21 +bic tmp0, sAgu_, sAgo_, ROR #16 SEP xar_m0 vAki_, vAko, E3, 39 +eor sAge, tmp1, sAge_, ROR #25 SEP +bic 
tmp1, sAga_, sAgu_, ROR #31 SEP xar_m0 vAko_, vAmu, E4, 56 +restore count, STACK_OFFSET_COUNT SEP xar_m0 vAmu_, vAso, E3, 8 +eor sAgi, tmp0, sAgi_, ROR #58 SEP +bic tmp0, sAge_, sAga_, ROR #56 SEP xar_m0 vAso_, vAma, E0, 23 +eor sAgo, tmp1, sAgo_, ROR #47 SEP xar_m0 vAka_, vAbe, E1, 63 +bic tmp1, sAki_, sAke_, ROR #19 SEP +eor sAgu, tmp0, sAgu_, ROR #23 SEP xar_m0 vAse_, vAgo, E3, 9 +bic tmp0, sAko_, sAki_, ROR #47 SEP xar_m0 vAgo_, vAme, E1, 19 +eor sAka, tmp1, sAka_, ROR #24 SEP +bic tmp1, sAku_, sAko_, ROR #10 SEP xar_m0 vAke_, vAgi, E2, 58 +eor sAke, tmp0, sAke_, ROR #2 SEP xar_m0 vAgi_, vAka, E0, 61 +bic tmp0, sAka_, sAku_, ROR #47 SEP +eor sAki, tmp1, sAki_, ROR #57 SEP xar_m0 vAga_, vAbo, E3, 36 +bic tmp1, sAke_, sAka_, ROR #5 SEP xar_m0 vAbo_, vAmo, E3, 43 +eor sAko, tmp0, sAko_, ROR #57 SEP +bic tmp0, sAmi_, sAme_, ROR #38 SEP xar_m0 vAmo_, vAmi, E2, 49 +eor sAku, tmp1, sAku_, ROR #52 SEP xar_m0 vAmi_, vAke, E1, 54 +bic tmp1, sAmo_, sAmi_, ROR #5 SEP +eor sAma, tmp0, sAma_, ROR #47 SEP xar_m0 vAge_, vAgu, E4, 44 +bic tmp0, sAmu_, sAmo_, ROR #41 SEP xar_m0 vAgu_, vAsi, E2, 3 +eor sAme, tmp1, sAme_, ROR #43 SEP xar_m0 vAsi_, vAku, E4, 25 +bic tmp1, sAma_, sAmu_, ROR #35 SEP +eor sAmi, tmp0, sAmi_, ROR #46 SEP xar_m0 vAku_, vAsa, E0, 46 +bic tmp0, sAme_, sAma_, ROR #9 SEP xar_m0 vAma_, vAbu, E4, 37 +ldr cur_const, [const_addr, count, UXTW #3] SEP +eor sAmo, tmp1, sAmo_, ROR #12 SEP xar_m0 vAbu_, vAsu, E4, 50 +bic tmp1, sAsi_, sAse_, ROR #48 SEP xar_m0 vAsu_, vAse, E1, 62 +eor sAmu, tmp0, sAmu_, ROR #44 SEP +bic tmp0, sAso_, sAsi_, ROR #2 SEP xar_m0 vAme_, vAga, E0, 28 +eor sAsa, tmp1, sAsa_, ROR #41 SEP xar_m0 vAbe_, vAge, E1, 20 +bic tmp1, sAsu_, sAso_, ROR #25 SEP +eor sAse, tmp0, sAse_, ROR #50 SEP restore x27, STACK_OFFSET_CONST +bic tmp0, sAsa_, sAsu_, ROR #60 SEP ldr q31, [x27], #16 +eor sAsi, tmp1, sAsi_, ROR #27 SEP +bic tmp1, sAse_, sAsa_, ROR #57 SEP save x27, STACK_OFFSET_CONST +eor sAso, tmp0, sAso_, ROR #21 SEP bcax_m0 vAga, vAga_, vAgi_, vAge_ 
+bic tmp0, sAbi_, sAbe_, ROR #63 SEP +add count, count, #1 SEP bcax_m0 vAge, vAge_, vAgo_, vAgi_ +save count, STACK_OFFSET_COUNT SEP bcax_m0 vAgi, vAgi_, vAgu_, vAgo_ +eor sAsu, tmp1, sAsu_, ROR #53 SEP +bic tmp1, sAbo_, sAbi_, ROR #42 SEP bcax_m0 vAgo, vAgo_, vAga_, vAgu_ +eor s_Aba, s_Aba_, tmp0, ROR #21 SEP bcax_m0 vAgu, vAgu_, vAge_, vAga_ +bic tmp0, sAbu_, sAbo_, ROR #57 SEP +eor sAbe, tmp1, sAbe_, ROR #41 SEP bcax_m0 vAka, vAka_, vAki_, vAke_ +bic tmp1, s_Aba_, sAbu_, ROR #50 SEP bcax_m0 vAke, vAke_, vAko_, vAki_ +eor sAbi, tmp0, sAbi_, ROR #35 SEP +bic tmp0, sAbe_, s_Aba_, ROR #44 SEP bcax_m0 vAki, vAki_, vAku_, vAko_ +eor sAbo, tmp1, sAbo_, ROR #43 SEP bcax_m0 vAko, vAko_, vAka_, vAku_ +eor sAbu, tmp0, sAbu_, ROR #30 SEP bcax_m0 vAku, vAku_, vAke_, vAka_ +eor s_Aba, s_Aba, cur_const SEP +ror sAga, sAga,(64-3) SEP bcax_m0 vAma, vAma_, vAmi_, vAme_ +ror sAbu, sAbu,(64-44) SEP bcax_m0 vAme, vAme_, vAmo_, vAmi_ +ror sAka, sAka,(64-25) SEP +ror sAke, sAke,(64-8) SEP bcax_m0 vAmi, vAmi_, vAmu_, vAmo_ +ror sAma, sAma,(64-10) SEP bcax_m0 vAmo, vAmo_, vAma_, vAmu_ +ror sAku, sAku,(64-6) SEP +ror sAsa, sAsa,(64-39) SEP bcax_m0 vAmu, vAmu_, vAme_, vAma_ +ror sAse, sAse,(64-41) SEP bcax_m0 vAsa, vAsa_, vAsi_, vAse_ +ror sAbe, sAbe,(64-21) SEP +ror sAge, sAge,(64-45) SEP bcax_m0 vAse, vAse_, vAso_, vAsi_ +ror sAgi, sAgi,(64-61) SEP bcax_m0 vAsi, vAsi_, vAsu_, vAso_ +ror sAme, sAme,(64-15) SEP +ror sAmi, sAmi,(64-56) SEP bcax_m0 vAso, vAso_, vAsa_, vAsu_ +ror sAbi, sAbi,(64-14) SEP bcax_m0 vAsu, vAsu_, vAse_, vAsa_ +ror sAki, sAki,(64-18) SEP +ror sAko, sAko,(64-1) SEP bcax_m0 vAba, vAba_, vAbi_, vAbe_ +ror sAsi, sAsi,(64-2) SEP bcax_m0 vAbe, vAbe_, vAbo_, vAbi_ +ror sAso, sAso,(64-62) SEP +ror sAgo, sAgo,(64-28) SEP bcax_m0 vAbi, vAbi_, vAbu_, vAbo_ +ror sAgu, sAgu,(64-20) SEP bcax_m0 vAbo, vAbo_, vAba_, vAbu_ +ror sAmo, sAmo,(64-27) SEP +ror sAmu, sAmu,(64-36) SEP bcax_m0 vAbu, vAbu_, vAbe_, vAba_ +ror sAsu, sAsu,(64-55) SEP eor vAba.16b, vAba.16b, v31.16b +.endm + + + 
+#define KECCAK_F1600_ROUNDS 24 + +.global keccak_f1600_x3_hybrid_asm_v7 +.global _keccak_f1600_x3_hybrid_asm_v7 +.text +.align 4 + +keccak_f1600_x3_hybrid_asm_v7: +_keccak_f1600_x3_hybrid_asm_v7: + alloc_stack + save_gprs + save_vregs + save input_addr, STACK_OFFSET_INPUT + + + ASM_LOAD(const_addr,round_constants_vec) + + save const_addr, STACK_OFFSET_CONST + load_input_vector 1,0 + + add input_addr, input_addr, #400 + load_input_scalar 1,0 + hybrid_round_initial + loop_0: + hybrid_round_noninitial + restore count, STACK_OFFSET_COUNT + cmp count, #(KECCAK_F1600_ROUNDS-2) + ble loop_0 + + hybrid_round_final + + restore input_addr, STACK_OFFSET_INPUT + store_input_vector 1,0 + add input_addr, input_addr, #400 + store_input_scalar 1,0 + + restore_vregs + restore_gprs + free_stack + + + ret +#endif \ No newline at end of file diff --git a/asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v1.s b/asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v1.s new file mode 100644 index 0000000..ae453d6 --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v1.s @@ -0,0 +1,1142 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +#if defined(__ARM_FEATURE_SHA3) + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: /* 24 64-bit words, one per .quad line below */ + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 /* x0 is also aliased as s_Aba_ and sC0 further down */ + const_addr .req x29 /* x29 is also aliased as sE0 and sC4 further down */ + count .req w27 /* 32-bit view of x27, which is also aliased as sC2 and sE3 further down */ + cur_const .req x26 /* x26 is also aliased as sC1 and sE2 further down */ + + /* Mapping of Keccak-f1600 SIMD state to vector registers + * at the beginning and end of each round.
*/ + + vAba .req v0 + vAbe .req v1 + vAbi .req v2 + vAbo .req v3 + vAbu .req v4 + vAga .req v5 + vAge .req v6 + vAgi .req v7 + vAgo .req v8 + vAgu .req v9 + vAka .req v10 + vAke .req v11 + vAki .req v12 + vAko .req v13 + vAku .req v14 + vAma .req v15 + vAme .req v16 + vAmi .req v17 + vAmo .req v18 + vAmu .req v19 + vAsa .req v20 + vAse .req v21 + vAsi .req v22 + vAso .req v23 + vAsu .req v24 + + /* q-form of the above mapping: same register numbers, named as q0..q24 */ + vAbaq .req q0 + vAbeq .req q1 + vAbiq .req q2 + vAboq .req q3 + vAbuq .req q4 + vAgaq .req q5 + vAgeq .req q6 + vAgiq .req q7 + vAgoq .req q8 + vAguq .req q9 + vAkaq .req q10 + vAkeq .req q11 + vAkiq .req q12 + vAkoq .req q13 + vAkuq .req q14 + vAmaq .req q15 + vAmeq .req q16 + vAmiq .req q17 + vAmoq .req q18 + vAmuq .req q19 + vAsaq .req q20 + vAseq .req q21 + vAsiq .req q22 + vAsoq .req q23 + vAsuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v30 + C1 .req v29 + C2 .req v28 + C3 .req v27 + C4 .req v26 + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4; E0, E2, E3 and E4 reuse the registers of C4, C1, C2 and C3, only E1 (v25) has a register of its own */ + E0 .req v26 + E1 .req v25 + E2 .req v29 + E3 .req v28 + E4 .req v27 + + /* A_[y,2*x+3*y] = rot(A[x,y]); the names ending in i_, o_ and u_ keep the register of their unprimed counterpart, the names ending in a_ and e_ are assigned a different one (vAba_ and vAbe_ share v30 and v27 with C0 and C3/E4) */ + vAbi_ .req v2 + vAbo_ .req v3 + vAbu_ .req v4 + vAga_ .req v10 + vAge_ .req v11 + vAgi_ .req v7 + vAgo_ .req v8 + vAgu_ .req v9 + vAka_ .req v15 + vAke_ .req v16 + vAki_ .req v12 + vAko_ .req v13 + vAku_ .req v14 + vAma_ .req v20 + vAme_ .req v21 + vAmi_ .req v17 + vAmo_ .req v18 + vAmu_ .req v19 + vAsa_ .req v0 + vAse_ .req v1 + vAsi_ .req v22 + vAso_ .req v23 + vAsu_ .req v24 + vAba_ .req v30 + vAbe_ .req v27 + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round.
*/ + s_Aba .req x1 + sAbe .req x6 + sAbi .req x11 + sAbo .req x16 + sAbu .req x21 + sAga .req x2 + sAge .req x7 + sAgi .req x12 + sAgo .req x17 + sAgu .req x22 + sAka .req x3 + sAke .req x8 + sAki .req x13 + sAko .req x18 + sAku .req x23 + sAma .req x4 + sAme .req x9 + sAmi .req x14 + sAmo .req x19 + sAmu .req x24 + sAsa .req x5 + sAse .req x10 + sAsi .req x15 + sAso .req x20 + sAsu .req x25 + + /* sA_[y,2*x+3*y] = rot(A[x,y]); as with the vector aliases, the names ending in i_, o_ and u_ keep the register of their unprimed counterpart and the names ending in a_ and e_ do not */ + s_Aba_ .req x0 + sAbe_ .req x28 + sAbi_ .req x11 + sAbo_ .req x16 + sAbu_ .req x21 + sAga_ .req x3 + sAge_ .req x8 + sAgi_ .req x12 + sAgo_ .req x17 + sAgu_ .req x22 + sAka_ .req x4 + sAke_ .req x9 + sAki_ .req x13 + sAko_ .req x18 + sAku_ .req x23 + sAma_ .req x5 + sAme_ .req x10 + sAmi_ .req x14 + sAmo_ .req x19 + sAmu_ .req x24 + sAsa_ .req x1 + sAse_ .req x6 + sAsi_ .req x15 + sAso_ .req x20 + sAsu_ .req x25 + + /* sC[x] = sA[x,0] xor sA[x,1] xor sA[x,2] xor sA[x,3] xor sA[x,4], for x in 0..4 */ + /* sE[x] = sC[x-1] xor rot(sC[x+1],1), for x in 0..4; the sC and sE names overlap on x26..x29, and sC0 shares x0 with s_Aba_ */ + sC0 .req x0 + sE0 .req x29 + sC1 .req x26 + sE1 .req x30 + sC2 .req x27 + sE2 .req x26 + sC3 .req x28 + sE3 .req x27 + sC4 .req x29 + sE4 .req x28 + + tmp .req x30 /* shares x30 with sE1 */ + +/************************ MACROS ****************************/ + +/* Macros using v8.4-A SHA-3 instructions: each wrapper below only appends the .16b or .2d arrangement suffix to its register operands (xar_m0 also prefixes imm with #) */ + +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm + xar \d\().2d, \s0\().2d, \s1\().2d, #\imm +.endm + +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro load_input_vector num idx + ldr vAbaq, [input_addr, #(16*(\num*0+\idx))] + ldr vAbeq, [input_addr, #(16*(\num*1+\idx))] + ldr vAbiq, [input_addr, #(16*(\num*2+\idx))] + ldr vAboq, [input_addr, #(16*(\num*3+\idx))] + ldr vAbuq, [input_addr, #(16*(\num*4+\idx))] + ldr vAgaq, [input_addr, #(16*(\num*5+\idx))] + ldr vAgeq, [input_addr, #(16*(\num*6+\idx))] + ldr vAgiq, [input_addr,
#(16*(\num*7+\idx))] + ldr vAgoq, [input_addr, #(16*(\num*8+\idx))] + ldr vAguq, [input_addr, #(16*(\num*9+\idx))] + ldr vAkaq, [input_addr, #(16*(\num*10+\idx))] + ldr vAkeq, [input_addr, #(16*(\num*11+\idx))] + ldr vAkiq, [input_addr, #(16*(\num*12+\idx))] + ldr vAkoq, [input_addr, #(16*(\num*13+\idx))] + ldr vAkuq, [input_addr, #(16*(\num*14+\idx))] + ldr vAmaq, [input_addr, #(16*(\num*15+\idx))] + ldr vAmeq, [input_addr, #(16*(\num*16+\idx))] + ldr vAmiq, [input_addr, #(16*(\num*17+\idx))] + ldr vAmoq, [input_addr, #(16*(\num*18+\idx))] + ldr vAmuq, [input_addr, #(16*(\num*19+\idx))] + ldr vAsaq, [input_addr, #(16*(\num*20+\idx))] + ldr vAseq, [input_addr, #(16*(\num*21+\idx))] + ldr vAsiq, [input_addr, #(16*(\num*22+\idx))] + ldr vAsoq, [input_addr, #(16*(\num*23+\idx))] + ldr vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_vector num idx + str vAbaq, [input_addr, #(16*(\num*0+\idx))] + str vAbeq, [input_addr, #(16*(\num*1+\idx))] + str vAbiq, [input_addr, #(16*(\num*2+\idx))] + str vAboq, [input_addr, #(16*(\num*3+\idx))] + str vAbuq, [input_addr, #(16*(\num*4+\idx))] + str vAgaq, [input_addr, #(16*(\num*5+\idx))] + str vAgeq, [input_addr, #(16*(\num*6+\idx))] + str vAgiq, [input_addr, #(16*(\num*7+\idx))] + str vAgoq, [input_addr, #(16*(\num*8+\idx))] + str vAguq, [input_addr, #(16*(\num*9+\idx))] + str vAkaq, [input_addr, #(16*(\num*10+\idx))] + str vAkeq, [input_addr, #(16*(\num*11+\idx))] + str vAkiq, [input_addr, #(16*(\num*12+\idx))] + str vAkoq, [input_addr, #(16*(\num*13+\idx))] + str vAkuq, [input_addr, #(16*(\num*14+\idx))] + str vAmaq, [input_addr, #(16*(\num*15+\idx))] + str vAmeq, [input_addr, #(16*(\num*16+\idx))] + str vAmiq, [input_addr, #(16*(\num*17+\idx))] + str vAmoq, [input_addr, #(16*(\num*18+\idx))] + str vAmuq, [input_addr, #(16*(\num*19+\idx))] + str vAsaq, [input_addr, #(16*(\num*20+\idx))] + str vAseq, [input_addr, #(16*(\num*21+\idx))] + str vAsiq, [input_addr, #(16*(\num*22+\idx))] + str vAsoq, [input_addr, 
#(16*(\num*23+\idx))] + str vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_scalar num idx + str s_Aba, [input_addr, 8*(\num*(0) +\idx)] + str sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + str sAbi, [input_addr, 8*(\num*(2)+ \idx)] + str sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + str sAbu, [input_addr, 8*(\num*(4)+ \idx)] + str sAga, [input_addr, 8*(\num*(4+1) +\idx)] + str sAge, [input_addr, 8*(\num*(6)+ \idx)] + str sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + str sAgo, [input_addr, 8*(\num*(8)+ \idx)] + str sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + str sAka, [input_addr, 8*(\num*(10) +\idx)] + str sAke, [input_addr, 8*(\num*(10+1)+\idx)] + str sAki, [input_addr, 8*(\num*(12) +\idx)] + str sAko, [input_addr, 8*(\num*(12+1)+\idx)] + str sAku, [input_addr, 8*(\num*(14) +\idx)] + str sAma, [input_addr, 8*(\num*(14+1)+\idx)] + str sAme, [input_addr, 8*(\num*(16) +\idx)] + str sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + str sAmo, [input_addr, 8*(\num*(18) +\idx)] + str sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + str sAsa, [input_addr, 8*(\num*(20) +\idx)] + str sAse, [input_addr, 8*(\num*(20+1)+\idx)] + str sAsi, [input_addr, 8*(\num*(22) +\idx)] + str sAso, [input_addr, 8*(\num*(22+1)+\idx)] + str sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +.macro load_input_scalar num idx + ldr s_Aba, [input_addr, 8*(\num*(0) +\idx)] + ldr sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + ldr sAbi, [input_addr, 8*(\num*(2)+ \idx)] + ldr sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + ldr sAbu, [input_addr, 8*(\num*(4)+ \idx)] + ldr sAga, [input_addr, 8*(\num*(4+1) +\idx)] + ldr sAge, [input_addr, 8*(\num*(6)+ \idx)] + ldr sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + ldr sAgo, [input_addr, 8*(\num*(8)+ \idx)] + ldr sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + ldr sAka, [input_addr, 8*(\num*(10) +\idx)] + ldr sAke, [input_addr, 8*(\num*(10+1)+\idx)] + ldr sAki, [input_addr, 8*(\num*(12) +\idx)] + ldr sAko, [input_addr, 8*(\num*(12+1)+\idx)] + ldr sAku, [input_addr, 8*(\num*(14) 
+\idx)] + ldr sAma, [input_addr, 8*(\num*(14+1)+\idx)] + ldr sAme, [input_addr, 8*(\num*(16) +\idx)] + ldr sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + ldr sAmo, [input_addr, 8*(\num*(18) +\idx)] + ldr sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + ldr sAsa, [input_addr, 8*(\num*(20) +\idx)] + ldr sAse, [input_addr, 8*(\num*(20+1)+\idx)] + ldr sAsi, [input_addr, 8*(\num*(22) +\idx)] + ldr sAso, [input_addr, 8*(\num*(22+1)+\idx)] + ldr sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +#define STACK_SIZE (8*8 + 16*6 + 3*8 + 8) // VREGS (8*8), GPRs (16*6), count (8), const (8), input (8), padding (8) +#define STACK_BASE_GPRS (3*8+8) +#define STACK_BASE_VREGS (3*8+8+16*6) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro save_vregs + stp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] + stp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + stp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + stp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] +.endm + +.macro restore_vregs + ldp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] + ldp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + ldp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + ldp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] +.endm + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro eor5 dst, src0, src1, src2, src3, src4 + eor 
\dst, \src0, \src1 + eor \dst, \dst, \src2 + eor \dst, \dst, \src3 + eor \dst, \dst, \src4 +.endm + +.macro xor_rol dst, src1, src0, imm + eor \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro bic_rol dst, src1, src0, imm + bic \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro rotate dst, src, imm + ror \dst, \src, #(64-\imm) +.endm + +.macro save reg, offset + str \reg, [sp, #\offset] +.endm + +.macro restore reg, offset + ldr \reg, [sp, #\offset] +.endm + +.macro hybrid_round_initial + + eor sC0, sAma, sAsa SEP + eor sC1, sAme, sAse SEP + eor sC2, sAmi, sAsi SEP + eor sC3, sAmo, sAso SEP + eor sC4, sAmu, sAsu SEP + eor sC0, sAka, sC0 SEP + eor sC1, sAke, sC1 SEP + eor sC2, sAki, sC2 SEP + eor sC3, sAko, sC3 SEP + eor sC4, sAku, sC4 SEP + eor sC0, sAga, sC0 SEP + eor sC1, sAge, sC1 SEP + eor sC2, sAgi, sC2 SEP + eor sC3, sAgo, sC3 SEP + eor sC4, sAgu, sC4 SEP + eor sC0, s_Aba, sC0 SEP + eor sC1, sAbe, sC1 SEP + eor sC2, sAbi, sC2 SEP + eor sC3, sAbo, sC3 SEP + eor sC4, sAbu, sC4 SEP + SEP + eor sE1, sC0, sC2, ROR #63 SEP + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP + eor sE4, sC3, sC0, ROR #63 SEP + SEP + eor s_Aba_, s_Aba, sE0 SEP + eor sAsa_, sAbi, sE2 SEP + eor sAbi_, sAki, sE2 SEP + eor sAki_, sAko, sE3 SEP + eor sAko_, sAmu, sE4 SEP + eor sAmu_, sAso, sE3 SEP + eor sAso_, sAma, sE0 SEP + eor sAka_, sAbe, sE1 SEP + eor sAse_, sAgo, sE3 SEP + eor sAgo_, sAme, sE1 SEP + eor sAke_, sAgi, sE2 SEP + eor sAgi_, sAka, sE0 SEP + eor sAga_, sAbo, sE3 SEP + eor sAbo_, sAmo, sE3 SEP + eor sAmo_, sAmi, sE2 SEP + eor sAmi_, sAke, sE1 SEP + eor sAge_, sAgu, sE4 SEP + eor sAgu_, sAsi, sE2 SEP + eor sAsi_, sAku, sE4 SEP + eor sAku_, sAsa, sE0 SEP + eor sAma_, sAbu, sE4 SEP + eor sAbu_, sAsu, sE4 SEP + eor sAsu_, sAse, sE1 SEP + eor sAme_, sAga, sE0 SEP + eor sAbe_, sAge, sE1 SEP + SEP + load_constant_ptr SEP + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP + bic tmp, sAgo_, sAgi_, ROR #42 
SEP + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP + SEP + ldr cur_const, [const_addr] SEP + mov count, #1 SEP + SEP + bic tmp, sAma_, sAmu_, ROR #35 SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP + SEP + save count, STACK_OFFSET_COUNT SEP + SEP + eor sC0, sAka, sAsa, ROR #50 SEP + eor sC1, sAse, sAge, ROR #60 SEP + eor sC2, sAmi, sAgi, ROR #59 SEP + eor sC3, sAgo, 
sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP + eor sC0, sAma, sC0, ROR #49 SEP + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP + eor sC0, sAga, sC0, ROR #57 SEP + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor sC3, sAko, sC3, ROR #38 SEP + eor sC4, sAgu, sC4, ROR #48 SEP + eor sC0, s_Aba, sC0, ROR #61 SEP + eor sC1, sAke, sC1, ROR #57 SEP + eor sC2, sAsi, sC2, ROR #52 SEP + eor sC3, sAbo, sC3, ROR #63 SEP + eor sC4, sAku, sC4, ROR #50 SEP + ror sC1, sC1, 56 SEP + ror sC4, sC4, 58 SEP + ror sC2, sC2, 62 SEP + SEP + eor sE1, sC0, sC2, ROR #63 SEP + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP + eor sE4, sC3, sC0, ROR #63 SEP + SEP + eor s_Aba_, sE0, s_Aba SEP + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP + eor sAki_, sE3, sAko, ROR #63 SEP + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP + eor sAso_, sE0, sAma, ROR #54 SEP + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP + eor sAgo_, sE1, sAme, ROR #49 SEP + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, ROR #39 SEP + eor sAga_, sE3, sAbo SEP + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP + eor sAmi_, sE1, sAke, ROR #56 SEP + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP + eor sAsi_, sE4, sAku, ROR #58 SEP + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP + eor sAbu_, sE4, sAsu, ROR #9 SEP + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP + eor sAbe_, sE1, sAge, ROR #19 SEP + SEP + load_constant_ptr SEP + restore count, STACK_OFFSET_COUNT SEP + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, 
sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP + bic tmp, sAma_, sAmu_, ROR #35 SEP + SEP + ldr cur_const, [const_addr, count, UXTW #3] SEP + SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + add count, count, #1 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP + SEP + SEP + SEP eor3_m0 C0, vAba, vAga, vAka + SEP eor3_m0 C0, C0, vAma, vAsa + SEP eor3_m0 C1, vAbe, vAge, vAke + SEP eor3_m0 C1, C1, vAme, vAse + SEP eor3_m0 C2, vAbi, vAgi, vAki + SEP eor3_m0 C2, C2, vAmi, vAsi + SEP eor3_m0 C3, vAbo, vAgo, vAko + SEP eor3_m0 C3, 
C3, vAmo, vAso + SEP eor3_m0 C4, vAbu, vAgu, vAku + SEP eor3_m0 C4, C4, vAmu, vAsu + SEP + SEP rax1_m0 E1, C0, C2 + SEP rax1_m0 E3, C2, C4 + SEP rax1_m0 E0, C4, C1 + SEP rax1_m0 E2, C1, C3 + SEP rax1_m0 E4, C3, C0 + SEP + SEP eor vAba_.16b, vAba.16b, E0.16b + SEP xar_m0 vAsa_, vAbi, E2, 2 + SEP xar_m0 vAbi_, vAki, E2, 21 + SEP xar_m0 vAki_, vAko, E3, 39 + SEP xar_m0 vAko_, vAmu, E4, 56 + SEP xar_m0 vAmu_, vAso, E3, 8 + SEP xar_m0 vAso_, vAma, E0, 23 + SEP xar_m0 vAka_, vAbe, E1, 63 + SEP xar_m0 vAse_, vAgo, E3, 9 + SEP xar_m0 vAgo_, vAme, E1, 19 + SEP xar_m0 vAke_, vAgi, E2, 58 + SEP xar_m0 vAgi_, vAka, E0, 61 + SEP xar_m0 vAga_, vAbo, E3, 36 + SEP xar_m0 vAbo_, vAmo, E3, 43 + SEP xar_m0 vAmo_, vAmi, E2, 49 + SEP xar_m0 vAmi_, vAke, E1, 54 + SEP xar_m0 vAge_, vAgu, E4, 44 + SEP xar_m0 vAgu_, vAsi, E2, 3 + SEP xar_m0 vAsi_, vAku, E4, 25 + SEP xar_m0 vAku_, vAsa, E0, 46 + SEP xar_m0 vAma_, vAbu, E4, 37 + SEP xar_m0 vAbu_, vAsu, E4, 50 + SEP xar_m0 vAsu_, vAse, E1, 62 + SEP xar_m0 vAme_, vAga, E0, 28 + SEP xar_m0 vAbe_, vAge, E1, 20 + SEP + SEP restore const_addr, STACK_OFFSET_CONST + SEP ld1r {v31.2d}, [const_addr], #8 + SEP save const_addr, STACK_OFFSET_CONST + SEP + SEP bcax_m0 vAga, vAga_, vAgi_, vAge_ + SEP bcax_m0 vAge, vAge_, vAgo_, vAgi_ + SEP bcax_m0 vAgi, vAgi_, vAgu_, vAgo_ + SEP bcax_m0 vAgo, vAgo_, vAga_, vAgu_ + SEP bcax_m0 vAgu, vAgu_, vAge_, vAga_ + SEP bcax_m0 vAka, vAka_, vAki_, vAke_ + SEP bcax_m0 vAke, vAke_, vAko_, vAki_ + SEP bcax_m0 vAki, vAki_, vAku_, vAko_ + SEP bcax_m0 vAko, vAko_, vAka_, vAku_ + SEP bcax_m0 vAku, vAku_, vAke_, vAka_ + SEP bcax_m0 vAma, vAma_, vAmi_, vAme_ + SEP bcax_m0 vAme, vAme_, vAmo_, vAmi_ + SEP bcax_m0 vAmi, vAmi_, vAmu_, vAmo_ + SEP bcax_m0 vAmo, vAmo_, vAma_, vAmu_ + SEP bcax_m0 vAmu, vAmu_, vAme_, vAma_ + SEP bcax_m0 vAsa, vAsa_, vAsi_, vAse_ + SEP bcax_m0 vAse, vAse_, vAso_, vAsi_ + SEP bcax_m0 vAsi, vAsi_, vAsu_, vAso_ + SEP bcax_m0 vAso, vAso_, vAsa_, vAsu_ + SEP bcax_m0 vAsu, vAsu_, vAse_, vAsa_ + SEP bcax_m0 
vAba, vAba_, vAbi_, vAbe_ + SEP bcax_m0 vAbe, vAbe_, vAbo_, vAbi_ + SEP bcax_m0 vAbi, vAbi_, vAbu_, vAbo_ + SEP bcax_m0 vAbo, vAbo_, vAba_, vAbu_ + SEP bcax_m0 vAbu, vAbu_, vAbe_, vAba_ + SEP + SEP eor vAba.16b, vAba.16b, v31.16b +.endm + +.macro hybrid_round_noninitial + save count, STACK_OFFSET_COUNT SEP + SEP + eor sC0, sAka, sAsa, ROR #50 SEP + eor sC1, sAse, sAge, ROR #60 SEP + eor sC2, sAmi, sAgi, ROR #59 SEP + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP + eor sC0, sAma, sC0, ROR #49 SEP + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP + eor sC0, sAga, sC0, ROR #57 SEP + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor sC3, sAko, sC3, ROR #38 SEP + eor sC4, sAgu, sC4, ROR #48 SEP + eor sC0, s_Aba, sC0, ROR #61 SEP + eor sC1, sAke, sC1, ROR #57 SEP + eor sC2, sAsi, sC2, ROR #52 SEP + eor sC3, sAbo, sC3, ROR #63 SEP + eor sC4, sAku, sC4, ROR #50 SEP + ror sC1, sC1, 56 SEP + ror sC4, sC4, 58 SEP + ror sC2, sC2, 62 SEP + SEP + eor sE1, sC0, sC2, ROR #63 SEP + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP + eor sE4, sC3, sC0, ROR #63 SEP + SEP + eor s_Aba_, sE0, s_Aba SEP + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP + eor sAki_, sE3, sAko, ROR #63 SEP + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP + eor sAso_, sE0, sAma, ROR #54 SEP + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP + eor sAgo_, sE1, sAme, ROR #49 SEP + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, ROR #39 SEP + eor sAga_, sE3, sAbo SEP + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP + eor sAmi_, sE1, sAke, ROR #56 SEP + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP + eor sAsi_, sE4, sAku, ROR #58 SEP + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP + eor sAbu_, 
sE4, sAsu, ROR #9 SEP + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP + eor sAbe_, sE1, sAge, ROR #19 SEP + SEP + load_constant_ptr SEP + restore count, STACK_OFFSET_COUNT SEP + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP + bic tmp, sAma_, sAmu_, ROR #35 SEP + SEP + ldr cur_const, [const_addr, count, UXTW #3] SEP + add count, count, #1 SEP + SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP + bic tmp, s_Aba_, sAbu_, 
ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP + save count, STACK_OFFSET_COUNT SEP + SEP + eor sC0, sAka, sAsa, ROR #50 SEP + eor sC1, sAse, sAge, ROR #60 SEP + eor sC2, sAmi, sAgi, ROR #59 SEP + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP + eor sC0, sAma, sC0, ROR #49 SEP + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP + eor sC0, sAga, sC0, ROR #57 SEP + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor sC3, sAko, sC3, ROR #38 SEP + eor sC4, sAgu, sC4, ROR #48 SEP + eor sC0, s_Aba, sC0, ROR #61 SEP + eor sC1, sAke, sC1, ROR #57 SEP + eor sC2, sAsi, sC2, ROR #52 SEP + eor sC3, sAbo, sC3, ROR #63 SEP + eor sC4, sAku, sC4, ROR #50 SEP + ror sC1, sC1, 56 SEP + ror sC4, sC4, 58 SEP + ror sC2, sC2, 62 SEP + SEP + eor sE1, sC0, sC2, ROR #63 SEP + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP + eor sE4, sC3, sC0, ROR #63 SEP + SEP + eor s_Aba_, sE0, s_Aba SEP + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP + eor sAki_, sE3, sAko, ROR #63 SEP + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP + eor sAso_, sE0, sAma, ROR #54 SEP + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP + eor sAgo_, sE1, sAme, ROR #49 SEP + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, ROR #39 SEP + eor sAga_, sE3, sAbo SEP + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP + eor sAmi_, sE1, sAke, ROR #56 SEP + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP + eor sAsi_, sE4, sAku, ROR #58 SEP + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP + eor sAbu_, sE4, sAsu, ROR #9 SEP + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP + eor sAbe_, sE1, 
sAge, ROR #19 SEP + SEP + load_constant_ptr SEP + restore count, STACK_OFFSET_COUNT SEP + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP + bic tmp, sAma_, sAmu_, ROR #35 SEP + SEP + ldr cur_const, [const_addr, count, UXTW #3] SEP + add count, count, #1 SEP + SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP + eor sAbu, tmp, sAbu_, ROR 
#30 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP + SEP + SEP + SEP eor3_m0 C0, vAba, vAga, vAka + SEP eor3_m0 C0, C0, vAma, vAsa + SEP eor3_m0 C1, vAbe, vAge, vAke + SEP eor3_m0 C1, C1, vAme, vAse + SEP eor3_m0 C2, vAbi, vAgi, vAki + SEP eor3_m0 C2, C2, vAmi, vAsi + SEP eor3_m0 C3, vAbo, vAgo, vAko + SEP eor3_m0 C3, C3, vAmo, vAso + SEP eor3_m0 C4, vAbu, vAgu, vAku + SEP eor3_m0 C4, C4, vAmu, vAsu + SEP + SEP rax1_m0 E1, C0, C2 + SEP rax1_m0 E3, C2, C4 + SEP rax1_m0 E0, C4, C1 + SEP rax1_m0 E2, C1, C3 + SEP rax1_m0 E4, C3, C0 + SEP + SEP eor vAba_.16b, vAba.16b, E0.16b + SEP xar_m0 vAsa_, vAbi, E2, 2 + SEP xar_m0 vAbi_, vAki, E2, 21 + SEP xar_m0 vAki_, vAko, E3, 39 + SEP xar_m0 vAko_, vAmu, E4, 56 + SEP xar_m0 vAmu_, vAso, E3, 8 + SEP xar_m0 vAso_, vAma, E0, 23 + SEP xar_m0 vAka_, vAbe, E1, 63 + SEP xar_m0 vAse_, vAgo, E3, 9 + SEP xar_m0 vAgo_, vAme, E1, 19 + SEP xar_m0 vAke_, vAgi, E2, 58 + SEP xar_m0 vAgi_, vAka, E0, 61 + SEP xar_m0 vAga_, vAbo, E3, 36 + SEP xar_m0 vAbo_, vAmo, E3, 43 + SEP xar_m0 vAmo_, vAmi, E2, 49 + SEP xar_m0 vAmi_, vAke, E1, 54 + SEP xar_m0 vAge_, vAgu, E4, 44 + SEP xar_m0 vAgu_, vAsi, E2, 3 + SEP xar_m0 vAsi_, vAku, E4, 25 + SEP xar_m0 vAku_, vAsa, E0, 46 + SEP xar_m0 vAma_, vAbu, E4, 37 + SEP xar_m0 vAbu_, vAsu, E4, 50 + SEP xar_m0 vAsu_, vAse, E1, 62 + SEP xar_m0 vAme_, vAga, E0, 28 + SEP xar_m0 vAbe_, vAge, E1, 20 + SEP + SEP restore const_addr, STACK_OFFSET_CONST + SEP ld1r {v31.2d}, [const_addr], #8 + SEP save const_addr, STACK_OFFSET_CONST + SEP + SEP bcax_m0 vAga, vAga_, vAgi_, vAge_ + SEP bcax_m0 vAge, vAge_, vAgo_, vAgi_ + SEP bcax_m0 vAgi, vAgi_, vAgu_, vAgo_ + SEP bcax_m0 vAgo, vAgo_, vAga_, vAgu_ + SEP bcax_m0 vAgu, vAgu_, vAge_, vAga_ + SEP bcax_m0 vAka, vAka_, vAki_, vAke_ + SEP bcax_m0 vAke, vAke_, vAko_, vAki_ + SEP bcax_m0 vAki, vAki_, vAku_, vAko_ + SEP bcax_m0 vAko, vAko_, vAka_, vAku_ + SEP bcax_m0 vAku, vAku_, vAke_, vAka_ + SEP bcax_m0 vAma, vAma_, vAmi_, vAme_ + SEP bcax_m0 vAme, vAme_, vAmo_, vAmi_ + SEP bcax_m0 vAmi, 
vAmi_, vAmu_, vAmo_ + SEP bcax_m0 vAmo, vAmo_, vAma_, vAmu_ + SEP bcax_m0 vAmu, vAmu_, vAme_, vAma_ + SEP bcax_m0 vAsa, vAsa_, vAsi_, vAse_ + SEP bcax_m0 vAse, vAse_, vAso_, vAsi_ + SEP bcax_m0 vAsi, vAsi_, vAsu_, vAso_ + SEP bcax_m0 vAso, vAso_, vAsa_, vAsu_ + SEP bcax_m0 vAsu, vAsu_, vAse_, vAsa_ + SEP bcax_m0 vAba, vAba_, vAbi_, vAbe_ + SEP bcax_m0 vAbe, vAbe_, vAbo_, vAbi_ + SEP bcax_m0 vAbi, vAbi_, vAbu_, vAbo_ + SEP bcax_m0 vAbo, vAbo_, vAba_, vAbu_ + SEP bcax_m0 vAbu, vAbu_, vAbe_, vAba_ + SEP + SEP eor vAba.16b, vAba.16b, v31.16b + +.endm + +.macro final_rotate + ror sAga, sAga,#(64-3) + ror sAka, sAka,#(64-25) + ror sAma, sAma,#(64-10) + ror sAsa, sAsa,#(64-39) + ror sAbe, sAbe,#(64-21) + ror sAge, sAge,#(64-45) + ror sAke, sAke,#(64-8) + ror sAme, sAme,#(64-15) + ror sAse, sAse,#(64-41) + ror sAbi, sAbi,#(64-14) + ror sAgi, sAgi,#(64-61) + ror sAki, sAki,#(64-18) + ror sAmi, sAmi,#(64-56) + ror sAsi, sAsi,#(64-2) + ror sAgo, sAgo,#(64-28) + ror sAko, sAko,#(64-1) + ror sAmo, sAmo,#(64-27) + ror sAso, sAso,#(64-62) + ror sAbu, sAbu,#(64-44) + ror sAgu, sAgu,#(64-20) + ror sAku, sAku,#(64-6) + ror sAmu, sAmu,#(64-36) + ror sAsu, sAsu,#(64-55) +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.global keccak_f1600_x4_hybrid_asm_v1 +.global _keccak_f1600_x4_hybrid_asm_v1 +.text +.align 4 + +keccak_f1600_x4_hybrid_asm_v1: +_keccak_f1600_x4_hybrid_asm_v1: + alloc_stack + save_gprs + save_vregs + save input_addr, STACK_OFFSET_INPUT + + load_input_vector 2,1 + + load_constant_ptr + save const_addr, STACK_OFFSET_CONST + + // First scalar Keccak computation alongside first half of SIMD computation + load_input_scalar 4,0 + hybrid_round_initial + loop_0: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-1) + ble loop_0 /* repeat while count <= KECCAK_F1600_ROUNDS-1 (signed compare); count is advanced inside hybrid_round_noninitial */ + final_rotate + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4,0 + + // Second scalar Keccak computation alongside second half of SIMD computation + load_input_scalar 4,1 + hybrid_round_initial + loop_1: + hybrid_round_noninitial +
cmp count, #(KECCAK_F1600_ROUNDS-1) + ble loop_1 + final_rotate + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4, 1 + + store_input_vector 2,1 + + restore_vregs + restore_gprs + free_stack + ret + +#endif diff --git a/asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v2.s b/asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v2.s new file mode 100644 index 0000000..778e1c6 --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v2.s @@ -0,0 +1,991 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +#if defined(__ARM_FEATURE_SHA3) + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: /* 24 64-bit Keccak-f1600 round constants, one .quad per round */ + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 /* x0 is also aliased as s_Aba_ and sC0 below */ + const_addr .req x29 /* x29 is also aliased as sC4 and sE0 below */ + count .req w27 /* low half of x27, which is also aliased as sC2 and sE3 below */ + cur_const .req x26 /* x26 is also aliased as sC1 and sE2 below */ + + /* Mapping of Keccak-f1600 SIMD state to vector registers + * at the beginning and end of each round.
+ */ + + vAba .req v0 + vAbe .req v1 + vAbi .req v2 + vAbo .req v3 + vAbu .req v4 + vAga .req v5 + vAge .req v6 + vAgi .req v7 + vAgo .req v8 + vAgu .req v9 + vAka .req v10 + vAke .req v11 + vAki .req v12 + vAko .req v13 + vAku .req v14 + vAma .req v15 + vAme .req v16 + vAmi .req v17 + vAmo .req v18 + vAmu .req v19 + vAsa .req v20 + vAse .req v21 + vAsi .req v22 + vAso .req v23 + vAsu .req v24 + + /* q-form of the above mapping */ + vAbaq .req q0 + vAbeq .req q1 + vAbiq .req q2 + vAboq .req q3 + vAbuq .req q4 + vAgaq .req q5 + vAgeq .req q6 + vAgiq .req q7 + vAgoq .req q8 + vAguq .req q9 + vAkaq .req q10 + vAkeq .req q11 + vAkiq .req q12 + vAkoq .req q13 + vAkuq .req q14 + vAmaq .req q15 + vAmeq .req q16 + vAmiq .req q17 + vAmoq .req q18 + vAmuq .req q19 + vAsaq .req q20 + vAseq .req q21 + vAsiq .req q22 + vAsoq .req q23 + vAsuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v30 + C1 .req v29 + C2 .req v28 + C3 .req v27 + C4 .req v26 + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req v26 + E1 .req v25 + E2 .req v29 + E3 .req v28 + E4 .req v27 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vAbi_ .req v2 + vAbo_ .req v3 + vAbu_ .req v4 + vAga_ .req v10 + vAge_ .req v11 + vAgi_ .req v7 + vAgo_ .req v8 + vAgu_ .req v9 + vAka_ .req v15 + vAke_ .req v16 + vAki_ .req v12 + vAko_ .req v13 + vAku_ .req v14 + vAma_ .req v20 + vAme_ .req v21 + vAmi_ .req v17 + vAmo_ .req v18 + vAmu_ .req v19 + vAsa_ .req v0 + vAse_ .req v1 + vAsi_ .req v22 + vAso_ .req v23 + vAsu_ .req v24 + vAba_ .req v30 /* same register as C0 */ + vAbe_ .req v27 /* same register as C3 and E4 */ + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round.
+ */ + s_Aba .req x1 + sAbe .req x6 + sAbi .req x11 + sAbo .req x16 + sAbu .req x21 + sAga .req x2 + sAge .req x7 + sAgi .req x12 + sAgo .req x17 + sAgu .req x22 + sAka .req x3 + sAke .req x8 + sAki .req x13 + sAko .req x18 /* NOTE(review): x18 is reserved as a platform register on some AArch64 ABIs (e.g. Apple, Windows) and is not saved by save_gprs - confirm for the targets in use */ + sAku .req x23 + sAma .req x4 + sAme .req x9 + sAmi .req x14 + sAmo .req x19 + sAmu .req x24 + sAsa .req x5 + sAse .req x10 + sAsi .req x15 + sAso .req x20 + sAsu .req x25 + + /* sA_[y,2*x+3*y] = rot(sA[x,y]) */ + s_Aba_ .req x0 + sAbe_ .req x28 + sAbi_ .req x11 + sAbo_ .req x16 + sAbu_ .req x21 + sAga_ .req x3 + sAge_ .req x8 + sAgi_ .req x12 + sAgo_ .req x17 + sAgu_ .req x22 + sAka_ .req x4 + sAke_ .req x9 + sAki_ .req x13 + sAko_ .req x18 + sAku_ .req x23 + sAma_ .req x5 + sAme_ .req x10 + sAmi_ .req x14 + sAmo_ .req x19 + sAmu_ .req x24 + sAsa_ .req x1 + sAse_ .req x6 + sAsi_ .req x15 + sAso_ .req x20 + sAsu_ .req x25 + + /* sC[x] = sA[x,0] xor sA[x,1] xor sA[x,2] xor sA[x,3] xor sA[x,4], for x in 0..4 */ + /* sE[x] = sC[x-1] xor rot(sC[x+1],1), for x in 0..4 */ + sC0 .req x0 + sE0 .req x29 + sC1 .req x26 + sE1 .req x30 + sC2 .req x27 + sE2 .req x26 + sC3 .req x28 + sE3 .req x27 + sC4 .req x29 + sE4 .req x28 + + tmp .req x30 /* same register as sE1 */ + +/************************ MACROS ****************************/ + +/* Macros using v8.4-A SHA-3 instructions */ + +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm + xar \d\().2d, \s0\().2d, \s1\().2d, #\imm +.endm + +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro load_input_vector num idx + ldr vAbaq, [input_addr, #(16*(\num*0+\idx))] + ldr vAbeq, [input_addr, #(16*(\num*1+\idx))] + ldr vAbiq, [input_addr, #(16*(\num*2+\idx))] + ldr vAboq, [input_addr, #(16*(\num*3+\idx))] + ldr vAbuq, [input_addr, #(16*(\num*4+\idx))] + ldr vAgaq, [input_addr, #(16*(\num*5+\idx))] + ldr vAgeq, [input_addr, #(16*(\num*6+\idx))] + ldr vAgiq, [input_addr,
#(16*(\num*7+\idx))] + ldr vAgoq, [input_addr, #(16*(\num*8+\idx))] + ldr vAguq, [input_addr, #(16*(\num*9+\idx))] + ldr vAkaq, [input_addr, #(16*(\num*10+\idx))] + ldr vAkeq, [input_addr, #(16*(\num*11+\idx))] + ldr vAkiq, [input_addr, #(16*(\num*12+\idx))] + ldr vAkoq, [input_addr, #(16*(\num*13+\idx))] + ldr vAkuq, [input_addr, #(16*(\num*14+\idx))] + ldr vAmaq, [input_addr, #(16*(\num*15+\idx))] + ldr vAmeq, [input_addr, #(16*(\num*16+\idx))] + ldr vAmiq, [input_addr, #(16*(\num*17+\idx))] + ldr vAmoq, [input_addr, #(16*(\num*18+\idx))] + ldr vAmuq, [input_addr, #(16*(\num*19+\idx))] + ldr vAsaq, [input_addr, #(16*(\num*20+\idx))] + ldr vAseq, [input_addr, #(16*(\num*21+\idx))] + ldr vAsiq, [input_addr, #(16*(\num*22+\idx))] + ldr vAsoq, [input_addr, #(16*(\num*23+\idx))] + ldr vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_vector num idx + str vAbaq, [input_addr, #(16*(\num*0+\idx))] + str vAbeq, [input_addr, #(16*(\num*1+\idx))] + str vAbiq, [input_addr, #(16*(\num*2+\idx))] + str vAboq, [input_addr, #(16*(\num*3+\idx))] + str vAbuq, [input_addr, #(16*(\num*4+\idx))] + str vAgaq, [input_addr, #(16*(\num*5+\idx))] + str vAgeq, [input_addr, #(16*(\num*6+\idx))] + str vAgiq, [input_addr, #(16*(\num*7+\idx))] + str vAgoq, [input_addr, #(16*(\num*8+\idx))] + str vAguq, [input_addr, #(16*(\num*9+\idx))] + str vAkaq, [input_addr, #(16*(\num*10+\idx))] + str vAkeq, [input_addr, #(16*(\num*11+\idx))] + str vAkiq, [input_addr, #(16*(\num*12+\idx))] + str vAkoq, [input_addr, #(16*(\num*13+\idx))] + str vAkuq, [input_addr, #(16*(\num*14+\idx))] + str vAmaq, [input_addr, #(16*(\num*15+\idx))] + str vAmeq, [input_addr, #(16*(\num*16+\idx))] + str vAmiq, [input_addr, #(16*(\num*17+\idx))] + str vAmoq, [input_addr, #(16*(\num*18+\idx))] + str vAmuq, [input_addr, #(16*(\num*19+\idx))] + str vAsaq, [input_addr, #(16*(\num*20+\idx))] + str vAseq, [input_addr, #(16*(\num*21+\idx))] + str vAsiq, [input_addr, #(16*(\num*22+\idx))] + str vAsoq, [input_addr, 
#(16*(\num*23+\idx))] + str vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_scalar num idx + str s_Aba, [input_addr, 8*(\num*(0) +\idx)] + str sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + str sAbi, [input_addr, 8*(\num*(2)+ \idx)] + str sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + str sAbu, [input_addr, 8*(\num*(4)+ \idx)] + str sAga, [input_addr, 8*(\num*(4+1) +\idx)] + str sAge, [input_addr, 8*(\num*(6)+ \idx)] + str sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + str sAgo, [input_addr, 8*(\num*(8)+ \idx)] + str sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + str sAka, [input_addr, 8*(\num*(10) +\idx)] + str sAke, [input_addr, 8*(\num*(10+1)+\idx)] + str sAki, [input_addr, 8*(\num*(12) +\idx)] + str sAko, [input_addr, 8*(\num*(12+1)+\idx)] + str sAku, [input_addr, 8*(\num*(14) +\idx)] + str sAma, [input_addr, 8*(\num*(14+1)+\idx)] + str sAme, [input_addr, 8*(\num*(16) +\idx)] + str sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + str sAmo, [input_addr, 8*(\num*(18) +\idx)] + str sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + str sAsa, [input_addr, 8*(\num*(20) +\idx)] + str sAse, [input_addr, 8*(\num*(20+1)+\idx)] + str sAsi, [input_addr, 8*(\num*(22) +\idx)] + str sAso, [input_addr, 8*(\num*(22+1)+\idx)] + str sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +.macro load_input_scalar num idx + ldr s_Aba, [input_addr, 8*(\num*(0) +\idx)] + ldr sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + ldr sAbi, [input_addr, 8*(\num*(2)+ \idx)] + ldr sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + ldr sAbu, [input_addr, 8*(\num*(4)+ \idx)] + ldr sAga, [input_addr, 8*(\num*(4+1) +\idx)] + ldr sAge, [input_addr, 8*(\num*(6)+ \idx)] + ldr sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + ldr sAgo, [input_addr, 8*(\num*(8)+ \idx)] + ldr sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + ldr sAka, [input_addr, 8*(\num*(10) +\idx)] + ldr sAke, [input_addr, 8*(\num*(10+1)+\idx)] + ldr sAki, [input_addr, 8*(\num*(12) +\idx)] + ldr sAko, [input_addr, 8*(\num*(12+1)+\idx)] + ldr sAku, [input_addr, 8*(\num*(14) 
+\idx)] + ldr sAma, [input_addr, 8*(\num*(14+1)+\idx)] + ldr sAme, [input_addr, 8*(\num*(16) +\idx)] + ldr sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + ldr sAmo, [input_addr, 8*(\num*(18) +\idx)] + ldr sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + ldr sAsa, [input_addr, 8*(\num*(20) +\idx)] + ldr sAse, [input_addr, 8*(\num*(20+1)+\idx)] + ldr sAsi, [input_addr, 8*(\num*(22) +\idx)] + ldr sAso, [input_addr, 8*(\num*(22+1)+\idx)] + ldr sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +#define STACK_SIZE (8*8 + 16*6 + 3*8 + 8) // VREGS (8*8), GPRs (16*6), count (8), const (8), input (8), padding (8) +#define STACK_BASE_GPRS (3*8+8) +#define STACK_BASE_VREGS (3*8+8+16*6) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro save_vregs + stp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] + stp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + stp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + stp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] +.endm + +.macro restore_vregs + ldp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] + ldp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + ldp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + ldp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] +.endm + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro eor5 dst, src0, src1, src2, src3, src4 + eor 
\dst, \src0, \src1 + eor \dst, \dst, \src2 + eor \dst, \dst, \src3 + eor \dst, \dst, \src4 +.endm + +.macro xor_rol dst, src1, src0, imm + eor \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro bic_rol dst, src1, src0, imm + bic \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro rotate dst, src, imm + ror \dst, \src, #(64-\imm) +.endm + +.macro save reg, offset + str \reg, [sp, #\offset] +.endm + +.macro restore reg, offset + ldr \reg, [sp, #\offset] +.endm + +.macro hybrid_round_initial + + eor sC0, sAma, sAsa SEP eor3_m0 C0, vAba, vAga, vAka + eor sC1, sAme, sAse SEP + eor sC2, sAmi, sAsi SEP + eor sC3, sAmo, sAso SEP eor3_m0 C0, C0, vAma, vAsa + eor sC4, sAmu, sAsu SEP + eor sC0, sAka, sC0 SEP + eor sC1, sAke, sC1 SEP eor3_m0 C1, vAbe, vAge, vAke + eor sC2, sAki, sC2 SEP + eor sC3, sAko, sC3 SEP + eor sC4, sAku, sC4 SEP eor3_m0 C1, C1, vAme, vAse + eor sC0, sAga, sC0 SEP + eor sC1, sAge, sC1 SEP + eor sC2, sAgi, sC2 SEP eor3_m0 C2, vAbi, vAgi, vAki + eor sC3, sAgo, sC3 SEP + eor sC4, sAgu, sC4 SEP + eor sC0, s_Aba, sC0 SEP eor3_m0 C2, C2, vAmi, vAsi + eor sC1, sAbe, sC1 SEP + eor sC2, sAbi, sC2 SEP + eor sC3, sAbo, sC3 SEP eor3_m0 C3, vAbo, vAgo, vAko + eor sC4, sAbu, sC4 SEP + SEP + eor sE1, sC0, sC2, ROR #63 SEP eor3_m0 C3, C3, vAmo, vAso + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP eor3_m0 C4, vAbu, vAgu, vAku + eor sE4, sC3, sC0, ROR #63 SEP + SEP + eor s_Aba_, s_Aba, sE0 SEP eor3_m0 C4, C4, vAmu, vAsu + eor sAsa_, sAbi, sE2 SEP + eor sAbi_, sAki, sE2 SEP + eor sAki_, sAko, sE3 SEP + eor sAko_, sAmu, sE4 SEP rax1_m0 E1, C0, C2 + eor sAmu_, sAso, sE3 SEP + eor sAso_, sAma, sE0 SEP + eor sAka_, sAbe, sE1 SEP rax1_m0 E3, C2, C4 + eor sAse_, sAgo, sE3 SEP + eor sAgo_, sAme, sE1 SEP + eor sAke_, sAgi, sE2 SEP rax1_m0 E0, C4, C1 + eor sAgi_, sAka, sE0 SEP + eor sAga_, sAbo, sE3 SEP + eor sAbo_, sAmo, sE3 SEP rax1_m0 E2, C1, C3 + eor sAmo_, sAmi, sE2 SEP + eor sAmi_, sAke, sE1 SEP + eor sAge_, sAgu, sE4 SEP 
rax1_m0 E4, C3, C0 + eor sAgu_, sAsi, sE2 SEP + eor sAsi_, sAku, sE4 SEP + eor sAku_, sAsa, sE0 SEP + eor sAma_, sAbu, sE4 SEP eor vAba_.16b, vAba.16b, E0.16b + eor sAbu_, sAsu, sE4 SEP + eor sAsu_, sAse, sE1 SEP + eor sAme_, sAga, sE0 SEP xar_m0 vAsa_, vAbi, E2, 2 + eor sAbe_, sAge, sE1 SEP + SEP + load_constant_ptr SEP xar_m0 vAbi_, vAki, E2, 21 + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP xar_m0 vAki_, vAko, E3, 39 + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP xar_m0 vAko_, vAmu, E4, 56 + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP xar_m0 vAmu_, vAso, E3, 8 + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP xar_m0 vAso_, vAma, E0, 23 + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP xar_m0 vAka_, vAbe, E1, 63 + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP xar_m0 vAse_, vAgo, E3, 9 + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP xar_m0 vAgo_, vAme, E1, 19 + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP xar_m0 vAke_, vAgi, E2, 58 + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP xar_m0 vAgi_, vAka, E0, 61 + SEP + ldr cur_const, [const_addr] SEP + mov count, #1 SEP xar_m0 vAga_, vAbo, E3, 36 + SEP + bic tmp, sAma_, sAmu_, ROR #35 SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP xar_m0 vAbo_, vAmo, E3, 43 + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP xar_m0 vAmo_, vAmi, E2, 49 + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP xar_m0 vAmi_, vAke, 
E1, 54 + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP xar_m0 vAge_, vAgu, E4, 44 + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP xar_m0 vAgu_, vAsi, E2, 3 + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP xar_m0 vAsi_, vAku, E4, 25 + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP xar_m0 vAku_, vAsa, E0, 46 + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP xar_m0 vAma_, vAbu, E4, 37 + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP xar_m0 vAbu_, vAsu, E4, 50 + SEP + save count, STACK_OFFSET_COUNT SEP + SEP xar_m0 vAsu_, vAse, E1, 62 + eor sC0, sAka, sAsa, ROR #50 SEP + eor sC1, sAse, sAge, ROR #60 SEP + eor sC2, sAmi, sAgi, ROR #59 SEP xar_m0 vAme_, vAga, E0, 28 + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP + eor sC0, sAma, sC0, ROR #49 SEP xar_m0 vAbe_, vAge, E1, 20 + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP restore sE1, STACK_OFFSET_CONST + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP + eor sC0, sAga, sC0, ROR #57 SEP ld1r {v31.2d}, [sE1], #8 + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor sC3, sAko, sC3, ROR #38 SEP save sE1, STACK_OFFSET_CONST + eor sC4, sAgu, sC4, ROR #48 SEP + eor sC0, s_Aba, sC0, ROR #61 SEP bcax_m0 vAga, vAga_, vAgi_, vAge_ + eor sC1, sAke, sC1, ROR #57 SEP + eor sC2, sAsi, sC2, ROR #52 SEP + eor sC3, sAbo, sC3, ROR #63 SEP bcax_m0 vAge, vAge_, vAgo_, vAgi_ + eor sC4, sAku, sC4, ROR #50 SEP + ror sC1, sC1, 56 SEP + ror sC4, sC4, 58 SEP bcax_m0 vAgi, vAgi_, vAgu_, vAgo_ + ror sC2, sC2, 62 SEP + SEP + eor sE1, sC0, sC2, ROR #63 SEP bcax_m0 vAgo, vAgo_, vAga_, vAgu_ + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR 
#63 SEP + eor sE2, sC1, sC3, ROR #63 SEP bcax_m0 vAgu, vAgu_, vAge_, vAga_ + eor sE4, sC3, sC0, ROR #63 SEP + SEP + eor s_Aba_, sE0, s_Aba SEP bcax_m0 vAka, vAka_, vAki_, vAke_ + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP + eor sAki_, sE3, sAko, ROR #63 SEP bcax_m0 vAke, vAke_, vAko_, vAki_ + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP + eor sAso_, sE0, sAma, ROR #54 SEP bcax_m0 vAki, vAki_, vAku_, vAko_ + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP + eor sAgo_, sE1, sAme, ROR #49 SEP bcax_m0 vAko, vAko_, vAka_, vAku_ + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, ROR #39 SEP + eor sAga_, sE3, sAbo SEP bcax_m0 vAku, vAku_, vAke_, vAka_ + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP + eor sAmi_, sE1, sAke, ROR #56 SEP bcax_m0 vAma, vAma_, vAmi_, vAme_ + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP + eor sAsi_, sE4, sAku, ROR #58 SEP bcax_m0 vAme, vAme_, vAmo_, vAmi_ + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP + eor sAbu_, sE4, sAsu, ROR #9 SEP bcax_m0 vAmi, vAmi_, vAmu_, vAmo_ + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP + eor sAbe_, sE1, sAge, ROR #19 SEP bcax_m0 vAmo, vAmo_, vAma_, vAmu_ + SEP + load_constant_ptr SEP + restore count, STACK_OFFSET_COUNT SEP bcax_m0 vAmu, vAmu_, vAme_, vAma_ + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP bcax_m0 vAsa, vAsa_, vAsi_, vAse_ + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP bcax_m0 vAse, vAse_, vAso_, vAsi_ + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP bcax_m0 vAsi, vAsi_, vAsu_, vAso_ + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP bcax_m0 vAso, vAso_, vAsa_, vAsu_ + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, 
sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP bcax_m0 vAsu, vAsu_, vAse_, vAsa_ + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP bcax_m0 vAba, vAba_, vAbi_, vAbe_ + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP bcax_m0 vAbe, vAbe_, vAbo_, vAbi_ + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP bcax_m0 vAbi, vAbi_, vAbu_, vAbo_ + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP bcax_m0 vAbo, vAbo_, vAba_, vAbu_ + bic tmp, sAma_, sAmu_, ROR #35 SEP + SEP + ldr cur_const, [const_addr, count, UXTW #3] SEP bcax_m0 vAbu, vAbu_, vAbe_, vAba_ + SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP eor vAba.16b, vAba.16b, v31.16b + bic tmp, sAsi_, sAse_, ROR #48 SEP + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + add count, count, #1 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP + SEP +.endm + +.macro hybrid_round_noninitial + save count, STACK_OFFSET_COUNT SEP eor3_m0 C0, vAba, vAga, vAka + SEP + eor sC0, sAka, sAsa, ROR #50 SEP + eor sC1, sAse, sAge, ROR #60 SEP eor3_m0 C0, C0, vAma, vAsa + eor sC2, sAmi, sAgi, ROR #59 SEP + 
eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP eor3_m0 C1, vAbe, vAge, vAke + eor sC0, sAma, sC0, ROR #49 SEP + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP eor3_m0 C1, C1, vAme, vAse + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP + eor sC0, sAga, sC0, ROR #57 SEP eor3_m0 C2, vAbi, vAgi, vAki + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor sC3, sAko, sC3, ROR #38 SEP eor3_m0 C2, C2, vAmi, vAsi + eor sC4, sAgu, sC4, ROR #48 SEP + eor sC0, s_Aba, sC0, ROR #61 SEP + eor sC1, sAke, sC1, ROR #57 SEP eor3_m0 C3, vAbo, vAgo, vAko + eor sC2, sAsi, sC2, ROR #52 SEP + eor sC3, sAbo, sC3, ROR #63 SEP + eor sC4, sAku, sC4, ROR #50 SEP eor3_m0 C3, C3, vAmo, vAso + ror sC1, sC1, 56 SEP + ror sC4, sC4, 58 SEP + ror sC2, sC2, 62 SEP eor3_m0 C4, vAbu, vAgu, vAku + SEP + eor sE1, sC0, sC2, ROR #63 SEP + eor sE3, sC2, sC4, ROR #63 SEP eor3_m0 C4, C4, vAmu, vAsu + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP + eor sE4, sC3, sC0, ROR #63 SEP + SEP rax1_m0 E1, C0, C2 + eor s_Aba_, sE0, s_Aba SEP + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP rax1_m0 E3, C2, C4 + eor sAki_, sE3, sAko, ROR #63 SEP + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP rax1_m0 E0, C4, C1 + eor sAso_, sE0, sAma, ROR #54 SEP + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP rax1_m0 E2, C1, C3 + eor sAgo_, sE1, sAme, ROR #49 SEP + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, ROR #39 SEP rax1_m0 E4, C3, C0 + eor sAga_, sE3, sAbo SEP + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP + eor sAmi_, sE1, sAke, ROR #56 SEP eor vAba_.16b, vAba.16b, E0.16b + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP + eor sAsi_, sE4, sAku, ROR #58 SEP xar_m0 vAsa_, vAbi, E2, 2 + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP + eor sAbu_, sE4, sAsu, ROR #9 SEP xar_m0 vAbi_, vAki, E2, 
21 + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP + eor sAbe_, sE1, sAge, ROR #19 SEP xar_m0 vAki_, vAko, E3, 39 + SEP + load_constant_ptr SEP + restore count, STACK_OFFSET_COUNT SEP xar_m0 vAko_, vAmu, E4, 56 + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP xar_m0 vAmu_, vAso, E3, 8 + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP xar_m0 vAso_, vAma, E0, 23 + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP xar_m0 vAka_, vAbe, E1, 63 + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP xar_m0 vAse_, vAgo, E3, 9 + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP xar_m0 vAgo_, vAme, E1, 19 + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP xar_m0 vAke_, vAgi, E2, 58 + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP xar_m0 vAgi_, vAka, E0, 61 + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP xar_m0 vAga_, vAbo, E3, 36 + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP xar_m0 vAbo_, vAmo, E3, 43 + bic tmp, sAma_, sAmu_, ROR #35 SEP + SEP + ldr cur_const, [const_addr, count, UXTW #3] SEP xar_m0 vAmo_, vAmi, E2, 49 + add count, count, #1 SEP + SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP xar_m0 vAmi_, vAke, E1, 54 + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP xar_m0 vAge_, vAgu, E4, 44 + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP xar_m0 vAgu_, vAsi, E2, 3 + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, 
sAsa_, sAsu_, ROR #60 SEP xar_m0 vAsi_, vAku, E4, 25 + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP xar_m0 vAku_, vAsa, E0, 46 + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP xar_m0 vAma_, vAbu, E4, 37 + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP xar_m0 vAbu_, vAsu, E4, 50 + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP xar_m0 vAsu_, vAse, E1, 62 + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP xar_m0 vAme_, vAga, E0, 28 + save count, STACK_OFFSET_COUNT SEP + SEP + eor sC0, sAka, sAsa, ROR #50 SEP xar_m0 vAbe_, vAge, E1, 20 + eor sC1, sAse, sAge, ROR #60 SEP + eor sC2, sAmi, sAgi, ROR #59 SEP + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP restore sE1, STACK_OFFSET_CONST + eor sC0, sAma, sC0, ROR #49 SEP + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP ld1r {v31.2d}, [sE1], #8 + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP + eor sC0, sAga, sC0, ROR #57 SEP save sE1, STACK_OFFSET_CONST + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor sC3, sAko, sC3, ROR #38 SEP + eor sC4, sAgu, sC4, ROR #48 SEP bcax_m0 vAga, vAga_, vAgi_, vAge_ + eor sC0, s_Aba, sC0, ROR #61 SEP + eor sC1, sAke, sC1, ROR #57 SEP + eor sC2, sAsi, sC2, ROR #52 SEP bcax_m0 vAge, vAge_, vAgo_, vAgi_ + eor sC3, sAbo, sC3, ROR #63 SEP + eor sC4, sAku, sC4, ROR #50 SEP + ror sC1, sC1, 56 SEP bcax_m0 vAgi, vAgi_, vAgu_, vAgo_ + ror sC4, sC4, 58 SEP + ror sC2, sC2, 62 SEP + SEP bcax_m0 vAgo, vAgo_, vAga_, vAgu_ + eor sE1, sC0, sC2, ROR #63 SEP + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP bcax_m0 vAgu, vAgu_, vAge_, vAga_ + eor sE2, sC1, sC3, ROR #63 SEP + eor sE4, sC3, sC0, ROR #63 SEP + SEP bcax_m0 vAka, vAka_, vAki_, vAke_ + 
eor s_Aba_, sE0, s_Aba SEP + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP bcax_m0 vAke, vAke_, vAko_, vAki_ + eor sAki_, sE3, sAko, ROR #63 SEP + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP bcax_m0 vAki, vAki_, vAku_, vAko_ + eor sAso_, sE0, sAma, ROR #54 SEP + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP bcax_m0 vAko, vAko_, vAka_, vAku_ + eor sAgo_, sE1, sAme, ROR #49 SEP + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, ROR #39 SEP bcax_m0 vAku, vAku_, vAke_, vAka_ + eor sAga_, sE3, sAbo SEP + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP bcax_m0 vAma, vAma_, vAmi_, vAme_ + eor sAmi_, sE1, sAke, ROR #56 SEP + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP bcax_m0 vAme, vAme_, vAmo_, vAmi_ + eor sAsi_, sE4, sAku, ROR #58 SEP + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP bcax_m0 vAmi, vAmi_, vAmu_, vAmo_ + eor sAbu_, sE4, sAsu, ROR #9 SEP + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP bcax_m0 vAmo, vAmo_, vAma_, vAmu_ + eor sAbe_, sE1, sAge, ROR #19 SEP + SEP + load_constant_ptr SEP bcax_m0 vAmu, vAmu_, vAme_, vAma_ + restore count, STACK_OFFSET_COUNT SEP + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP bcax_m0 vAsa, vAsa_, vAsi_, vAse_ + eor sAga, tmp, sAga_, ROR #39 SEP + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP bcax_m0 vAse, vAse_, vAso_, vAsi_ + bic tmp, sAgu_, sAgo_, ROR #16 SEP + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP bcax_m0 vAsi, vAsi_, vAsu_, vAso_ + eor sAgo, tmp, sAgo_, ROR #47 SEP + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP bcax_m0 vAso, vAso_, vAsa_, vAsu_ + bic tmp, sAki_, sAke_, ROR #19 SEP + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP bcax_m0 vAsu, vAsu_, vAse_, vAsa_ + eor sAke, tmp, sAke_, ROR #2 SEP + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, 
sAki_, ROR #57 SEP bcax_m0 vAba, vAba_, vAbi_, vAbe_ + bic tmp, sAka_, sAku_, ROR #47 SEP + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP bcax_m0 vAbe, vAbe_, vAbo_, vAbi_ + eor sAku, tmp, sAku_, ROR #52 SEP + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP bcax_m0 vAbi, vAbi_, vAbu_, vAbo_ + bic tmp, sAmo_, sAmi_, ROR #5 SEP + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP bcax_m0 vAbo, vAbo_, vAba_, vAbu_ + eor sAmi, tmp, sAmi_, ROR #46 SEP + bic tmp, sAma_, sAmu_, ROR #35 SEP + SEP bcax_m0 vAbu, vAbu_, vAbe_, vAba_ + ldr cur_const, [const_addr, count, UXTW #3] SEP + add count, count, #1 SEP + SEP eor vAba.16b, vAba.16b, v31.16b + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP + +.endm + +.macro final_rotate + ror sAga, sAga,#(64-3) + ror sAka, sAka,#(64-25) + ror sAma, sAma,#(64-10) + ror sAsa, sAsa,#(64-39) + ror sAbe, sAbe,#(64-21) + ror sAge, sAge,#(64-45) + ror sAke, sAke,#(64-8) + ror sAme, sAme,#(64-15) + ror sAse, sAse,#(64-41) + ror sAbi, sAbi,#(64-14) + ror sAgi, sAgi,#(64-61) + ror sAki, sAki,#(64-18) + ror sAmi, sAmi,#(64-56) + ror sAsi, sAsi,#(64-2) + ror sAgo, 
sAgo,#(64-28) + ror sAko, sAko,#(64-1) + ror sAmo, sAmo,#(64-27) + ror sAso, sAso,#(64-62) + ror sAbu, sAbu,#(64-44) + ror sAgu, sAgu,#(64-20) + ror sAku, sAku,#(64-6) + ror sAmu, sAmu,#(64-36) + ror sAsu, sAsu,#(64-55) +.endm + +#define KECCAK_F1600_ROUNDS 24 +/* keccak_f1600_x4_hybrid_asm_v2: exported under the plain and the underscore-prefixed symbol name. */ +.global keccak_f1600_x4_hybrid_asm_v2 +.global _keccak_f1600_x4_hybrid_asm_v2 +.text +.align 4 +/* Runs the round macros twice, once per scalar state (load_input_scalar 4,0 then 4,1); the SIMD state is loaded once up front and stored once at the end. */ +keccak_f1600_x4_hybrid_asm_v2: +_keccak_f1600_x4_hybrid_asm_v2: + alloc_stack + save_gprs + save_vregs + save input_addr, STACK_OFFSET_INPUT + /* input_addr is spilled here and restored before each store_input_scalar below */ + load_input_vector 2,1 + + load_constant_ptr + save const_addr, STACK_OFFSET_CONST + + // First scalar Keccak computation alongside first half of SIMD computation + load_input_scalar 4,0 + hybrid_round_initial + loop_0: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-1) + ble loop_0 /* repeat while count <= KECCAK_F1600_ROUNDS-1 */ + final_rotate + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4,0 + + // Second scalar Keccak computation alongside second half of SIMD computation + load_input_scalar 4,1 + hybrid_round_initial + loop_1: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-1) + ble loop_1 /* repeat while count <= KECCAK_F1600_ROUNDS-1 */ + final_rotate + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4, 1 + + store_input_vector 2,1 + + restore_vregs + restore_gprs + free_stack + ret + +#endif diff --git a/asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v2p0.s b/asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v2p0.s new file mode 100644 index 0000000..7b5a203 --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v2p0.s @@ -0,0 +1,993 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies 
of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +#if defined(__ARM_FEATURE_SHA3) + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: /* 24 quadwords; presumably the table const_addr points to after load_constant_ptr -- TODO confirm in macros.s */ + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 /* x0 is also s_Aba_ and sC0 */ + const_addr .req x29 /* x29 is also sC4 and sE0 */ + count .req w27 /* x27 is also sC2 and sE3; the round macros spill count to STACK_OFFSET_COUNT */ + cur_const .req x26 /* x26 is also sC1 and sE2 */ + + /* Mapping of Keccak-f1600 SIMD state to vector registers + * at the beginning and end of each round. 
*/ + + vAba .req v0 + vAbe .req v1 + vAbi .req v2 + vAbo .req v3 + vAbu .req v4 + vAga .req v5 + vAge .req v6 + vAgi .req v7 + vAgo .req v8 + vAgu .req v9 + vAka .req v10 + vAke .req v11 + vAki .req v12 + vAko .req v13 + vAku .req v14 + vAma .req v15 + vAme .req v16 + vAmi .req v17 + vAmo .req v18 + vAmu .req v19 + vAsa .req v20 + vAse .req v21 + vAsi .req v22 + vAso .req v23 + vAsu .req v24 + + /* q-form of the above mapping */ + vAbaq .req q0 + vAbeq .req q1 + vAbiq .req q2 + vAboq .req q3 + vAbuq .req q4 + vAgaq .req q5 + vAgeq .req q6 + vAgiq .req q7 + vAgoq .req q8 + vAguq .req q9 + vAkaq .req q10 + vAkeq .req q11 + vAkiq .req q12 + vAkoq .req q13 + vAkuq .req q14 + vAmaq .req q15 + vAmeq .req q16 + vAmiq .req q17 + vAmoq .req q18 + vAmuq .req q19 + vAsaq .req q20 + vAseq .req q21 + vAsiq .req q22 + vAsoq .req q23 + vAsuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v30 + C1 .req v29 + C2 .req v28 + C3 .req v27 + C4 .req v26 + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req v26 /* same register as C4 */ + E1 .req v25 + E2 .req v29 /* same register as C1 */ + E3 .req v28 /* same register as C2 */ + E4 .req v27 /* same register as C3 */ + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vAbi_ .req v2 + vAbo_ .req v3 + vAbu_ .req v4 + vAga_ .req v10 + vAge_ .req v11 + vAgi_ .req v7 + vAgo_ .req v8 + vAgu_ .req v9 + vAka_ .req v15 + vAke_ .req v16 + vAki_ .req v12 + vAko_ .req v13 + vAku_ .req v14 + vAma_ .req v20 + vAme_ .req v21 + vAmi_ .req v17 + vAmo_ .req v18 + vAmu_ .req v19 + vAsa_ .req v0 + vAse_ .req v1 + vAsi_ .req v22 + vAso_ .req v23 + vAsu_ .req v24 + vAba_ .req v30 /* same register as C0 */ + vAbe_ .req v27 /* same register as C3 and E4 */ + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round. 
*/ + s_Aba .req x1 + sAbe .req x6 + sAbi .req x11 + sAbo .req x16 + sAbu .req x21 + sAga .req x2 + sAge .req x7 + sAgi .req x12 + sAgo .req x17 + sAgu .req x22 + sAka .req x3 + sAke .req x8 + sAki .req x13 + sAko .req x18 + sAku .req x23 + sAma .req x4 + sAme .req x9 + sAmi .req x14 + sAmo .req x19 + sAmu .req x24 + sAsa .req x5 + sAse .req x10 + sAsi .req x15 + sAso .req x20 + sAsu .req x25 + + /* sA_[y,2*x+3*y] = rot(A[x,y]) */ + s_Aba_ .req x0 + sAbe_ .req x28 + sAbi_ .req x11 + sAbo_ .req x16 + sAbu_ .req x21 + sAga_ .req x3 + sAge_ .req x8 + sAgi_ .req x12 + sAgo_ .req x17 + sAgu_ .req x22 + sAka_ .req x4 + sAke_ .req x9 + sAki_ .req x13 + sAko_ .req x18 + sAku_ .req x23 + sAma_ .req x5 + sAme_ .req x10 + sAmi_ .req x14 + sAmo_ .req x19 + sAmu_ .req x24 + sAsa_ .req x1 + sAse_ .req x6 + sAsi_ .req x15 + sAso_ .req x20 + sAsu_ .req x25 + + /* sC[x] = sA[x,0] xor sA[x,1] xor sA[x,2] xor sA[x,3] xor sA[x,4], for x in 0..4 */ + /* sE[x] = sC[x-1] xor rot(sC[x+1],1), for x in 0..4 */ + sC0 .req x0 + sE0 .req x29 + sC1 .req x26 + sE1 .req x30 + sC2 .req x27 + sE2 .req x26 + sC3 .req x28 + sE3 .req x27 + sC4 .req x29 + sE4 .req x28 + + tmp .req x30 + +/************************ MACROS ****************************/ + +/* Macros using v8.4-A SHA-3 instructions */ +/* eor3_m0: d = s0 ^ s1 ^ s2 */ +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm +/* rax1_m1: d = s0 ^ rol64(s1, 1) in each 64-bit lane (the theta E value). d may equal s0. */ +.macro rax1_m1 d s0 s1 + /* Native RAX1 needs no scratch register: the former XAR-into-tmpp emulation overwrote tmpp, which callers alias to C0 while rax1_m1 E4, C3, C0 still has to read it. */ + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm +/* xar_m0: d = ror64(s0 ^ s1, imm) in each 64-bit lane */ +.macro xar_m0 d s0 s1 imm + xar \d\().2d, \s0\().2d, \s1\().2d, #\imm +.endm +/* bcax_m0: d = s0 ^ (s1 & ~s2) */ +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm +/* load_input_vector num, idx: load the 25 SIMD state registers; q-register i is read from byte offset 16*(num*i+idx) of input_addr */ +.macro load_input_vector num idx + ldr vAbaq, [input_addr, #(16*(\num*0+\idx))] + ldr vAbeq, [input_addr, #(16*(\num*1+\idx))] + ldr vAbiq, [input_addr, #(16*(\num*2+\idx))] + ldr vAboq, [input_addr, #(16*(\num*3+\idx))] + ldr vAbuq, [input_addr, #(16*(\num*4+\idx))] + ldr vAgaq, [input_addr, #(16*(\num*5+\idx))] + ldr vAgeq, [input_addr, #(16*(\num*6+\idx))] 
+ ldr vAgiq, [input_addr, #(16*(\num*7+\idx))] + ldr vAgoq, [input_addr, #(16*(\num*8+\idx))] + ldr vAguq, [input_addr, #(16*(\num*9+\idx))] + ldr vAkaq, [input_addr, #(16*(\num*10+\idx))] + ldr vAkeq, [input_addr, #(16*(\num*11+\idx))] + ldr vAkiq, [input_addr, #(16*(\num*12+\idx))] + ldr vAkoq, [input_addr, #(16*(\num*13+\idx))] + ldr vAkuq, [input_addr, #(16*(\num*14+\idx))] + ldr vAmaq, [input_addr, #(16*(\num*15+\idx))] + ldr vAmeq, [input_addr, #(16*(\num*16+\idx))] + ldr vAmiq, [input_addr, #(16*(\num*17+\idx))] + ldr vAmoq, [input_addr, #(16*(\num*18+\idx))] + ldr vAmuq, [input_addr, #(16*(\num*19+\idx))] + ldr vAsaq, [input_addr, #(16*(\num*20+\idx))] + ldr vAseq, [input_addr, #(16*(\num*21+\idx))] + ldr vAsiq, [input_addr, #(16*(\num*22+\idx))] + ldr vAsoq, [input_addr, #(16*(\num*23+\idx))] + ldr vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm +/* store_input_vector num, idx: write the 25 SIMD state registers back; q-register i goes to byte offset 16*(num*i+idx) of input_addr */ +.macro store_input_vector num idx + str vAbaq, [input_addr, #(16*(\num*0+\idx))] + str vAbeq, [input_addr, #(16*(\num*1+\idx))] + str vAbiq, [input_addr, #(16*(\num*2+\idx))] + str vAboq, [input_addr, #(16*(\num*3+\idx))] + str vAbuq, [input_addr, #(16*(\num*4+\idx))] + str vAgaq, [input_addr, #(16*(\num*5+\idx))] + str vAgeq, [input_addr, #(16*(\num*6+\idx))] + str vAgiq, [input_addr, #(16*(\num*7+\idx))] + str vAgoq, [input_addr, #(16*(\num*8+\idx))] + str vAguq, [input_addr, #(16*(\num*9+\idx))] + str vAkaq, [input_addr, #(16*(\num*10+\idx))] + str vAkeq, [input_addr, #(16*(\num*11+\idx))] + str vAkiq, [input_addr, #(16*(\num*12+\idx))] + str vAkoq, [input_addr, #(16*(\num*13+\idx))] + str vAkuq, [input_addr, #(16*(\num*14+\idx))] + str vAmaq, [input_addr, #(16*(\num*15+\idx))] + str vAmeq, [input_addr, #(16*(\num*16+\idx))] + str vAmiq, [input_addr, #(16*(\num*17+\idx))] + str vAmoq, [input_addr, #(16*(\num*18+\idx))] + str vAmuq, [input_addr, #(16*(\num*19+\idx))] + str vAsaq, [input_addr, #(16*(\num*20+\idx))] + str vAseq, [input_addr, #(16*(\num*21+\idx))] + str vAsiq, [input_addr, #(16*(\num*22+\idx))] 
+ str vAsoq, [input_addr, #(16*(\num*23+\idx))] + str vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm +/* store_input_scalar num, idx: write the 25 scalar lanes; lane i (Aba=0 ... Asu=24) goes to byte offset 8*(num*i+idx) of input_addr */ +.macro store_input_scalar num idx + str s_Aba, [input_addr, 8*(\num*(0) +\idx)] + str sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + str sAbi, [input_addr, 8*(\num*(2)+ \idx)] + str sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + str sAbu, [input_addr, 8*(\num*(4)+ \idx)] + str sAga, [input_addr, 8*(\num*(4+1) +\idx)] + str sAge, [input_addr, 8*(\num*(6)+ \idx)] + str sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + str sAgo, [input_addr, 8*(\num*(8)+ \idx)] + str sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + str sAka, [input_addr, 8*(\num*(10) +\idx)] + str sAke, [input_addr, 8*(\num*(10+1)+\idx)] + str sAki, [input_addr, 8*(\num*(12) +\idx)] + str sAko, [input_addr, 8*(\num*(12+1)+\idx)] + str sAku, [input_addr, 8*(\num*(14) +\idx)] + str sAma, [input_addr, 8*(\num*(14+1)+\idx)] + str sAme, [input_addr, 8*(\num*(16) +\idx)] + str sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + str sAmo, [input_addr, 8*(\num*(18) +\idx)] + str sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + str sAsa, [input_addr, 8*(\num*(20) +\idx)] + str sAse, [input_addr, 8*(\num*(20+1)+\idx)] + str sAsi, [input_addr, 8*(\num*(22) +\idx)] + str sAso, [input_addr, 8*(\num*(22+1)+\idx)] + str sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm +/* load_input_scalar num, idx: read the 25 scalar lanes from the same offsets store_input_scalar writes */ +.macro load_input_scalar num idx + ldr s_Aba, [input_addr, 8*(\num*(0) +\idx)] + ldr sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + ldr sAbi, [input_addr, 8*(\num*(2)+ \idx)] + ldr sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + ldr sAbu, [input_addr, 8*(\num*(4)+ \idx)] + ldr sAga, [input_addr, 8*(\num*(4+1) +\idx)] + ldr sAge, [input_addr, 8*(\num*(6)+ \idx)] + ldr sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + ldr sAgo, [input_addr, 8*(\num*(8)+ \idx)] + ldr sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + ldr sAka, [input_addr, 8*(\num*(10) +\idx)] + ldr sAke, [input_addr, 8*(\num*(10+1)+\idx)] + ldr sAki, [input_addr, 8*(\num*(12) +\idx)] + ldr sAko, [input_addr, 8*(\num*(12+1)+\idx)] + ldr sAku, 
[input_addr, 8*(\num*(14) +\idx)] + ldr sAma, [input_addr, 8*(\num*(14+1)+\idx)] + ldr sAme, [input_addr, 8*(\num*(16) +\idx)] + ldr sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + ldr sAmo, [input_addr, 8*(\num*(18) +\idx)] + ldr sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + ldr sAsa, [input_addr, 8*(\num*(20) +\idx)] + ldr sAse, [input_addr, 8*(\num*(20+1)+\idx)] + ldr sAsi, [input_addr, 8*(\num*(22) +\idx)] + ldr sAso, [input_addr, 8*(\num*(22+1)+\idx)] + ldr sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm +/* Frame layout from sp: input pointer (0), constant pointer (8), count (16), padding (24), x19-x30 (STACK_BASE_GPRS = 32), d8-d15 (STACK_BASE_VREGS = 128) */ +#define STACK_SIZE (8*8 + 16*6 + 3*8 + 8) // VREGS (8*8), GPRs (16*6), count (8), const (8), input (8), padding (8) +#define STACK_BASE_GPRS (3*8+8) +#define STACK_BASE_VREGS (3*8+8+16*6) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) +/* save_gprs: spill x19-x30 at STACK_BASE_GPRS. NOTE(review): the state also lives in x18 (sAko), which is not spilled -- confirm the target ABIs allow writing x18. */ +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm +/* restore_gprs: reload x19-x30 from the slots written by save_gprs */ +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm +/* save_vregs: spill d8-d15 (the low 64 bits of v8-v15) at STACK_BASE_VREGS */ +.macro save_vregs + stp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] + stp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + stp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + stp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] +.endm +/* restore_vregs: reload d8-d15 from the slots written by save_vregs */ +.macro restore_vregs + ldp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] + ldp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + ldp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + ldp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] +.endm +/* alloc_stack: reserve STACK_SIZE bytes below sp */ +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm +/* free_stack: release the STACK_SIZE bytes reserved by alloc_stack */ +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm +/* eor5: dst = src0 ^ src1 ^ src2 ^ src3 ^ src4 */ +.macro eor5 dst, src0, src1, 
src2, src3, src4 + eor \dst, \src0, \src1 + eor \dst, \dst, \src2 + eor \dst, \dst, \src3 + eor \dst, \dst, \src4 +.endm +/* xor_rol: dst = src0 ^ rol64(src1, imm) */ +.macro xor_rol dst, src1, src0, imm + eor \dst, \src0, \src1, ROR #(64-\imm) +.endm +/* bic_rol: dst = src0 & ~rol64(src1, imm) */ +.macro bic_rol dst, src1, src0, imm + bic \dst, \src0, \src1, ROR #(64-\imm) +.endm +/* rotate: dst = rol64(src, imm) */ +.macro rotate dst, src, imm + ror \dst, \src, #(64-\imm) +.endm +/* save: store reg at [sp + offset] */ +.macro save reg, offset + str \reg, [sp, #\offset] +.endm +/* restore: load reg from [sp + offset] */ +.macro restore reg, offset + ldr \reg, [sp, #\offset] +.endm +/* hybrid_round_initial: two scalar rounds (left of SEP) interleaved with one SIMD round (right of SEP); the first scalar round reads its inputs without ROR operands. */ +.macro hybrid_round_initial + /* theta: scalar column parities sC0..sC4 | SIMD column parities C0..C4 */ + eor sC0, sAma, sAsa SEP eor3_m0 C0, vAba, vAga, vAka + eor sC1, sAme, sAse SEP + eor sC2, sAmi, sAsi SEP + eor sC3, sAmo, sAso SEP eor3_m0 C0, C0, vAma, vAsa + eor sC4, sAmu, sAsu SEP + eor sC0, sAka, sC0 SEP + eor sC1, sAke, sC1 SEP eor3_m0 C1, vAbe, vAge, vAke + eor sC2, sAki, sC2 SEP + eor sC3, sAko, sC3 SEP + eor sC4, sAku, sC4 SEP eor3_m0 C1, C1, vAme, vAse + eor sC0, sAga, sC0 SEP + eor sC1, sAge, sC1 SEP + eor sC2, sAgi, sC2 SEP eor3_m0 C2, vAbi, vAgi, vAki + eor sC3, sAgo, sC3 SEP + eor sC4, sAgu, sC4 SEP + eor sC0, s_Aba, sC0 SEP eor3_m0 C2, C2, vAmi, vAsi + eor sC1, sAbe, sC1 SEP + eor sC2, sAbi, sC2 SEP + eor sC3, sAbo, sC3 SEP eor3_m0 C3, vAbo, vAgo, vAko + eor sC4, sAbu, sC4 SEP + SEP + eor sE1, sC0, sC2, ROR #63 SEP eor3_m0 C3, C3, vAmo, vAso + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP eor3_m0 C4, vAbu, vAgu, vAku + eor sE4, sC3, sC0, ROR #63 SEP + SEP + eor s_Aba_, s_Aba, sE0 SEP eor3_m0 C4, C4, vAmu, vAsu + eor sAsa_, sAbi, sE2 SEP vzr .req v31 + eor sAbi_, sAki, sE2 SEP eor vzr.16b, vzr.16b, vzr.16b // zero + eor sAki_, sAko, sE3 SEP tmpp .req E1 + eor sAko_, sAmu, sE4 SEP rax1_m1 E1, C0, C2 + eor sAmu_, sAso, sE3 SEP .unreq tmpp + eor sAso_, sAma, sE0 SEP tmpp .req C0 /* NOTE(review): C0 is still read by rax1_m1 E4, C3, C0 below, so rax1_m1 must not write tmpp before then -- verify */ + eor sAka_, sAbe, sE1 SEP rax1_m1 E3, C2, C4 + eor sAse_, sAgo, sE3 SEP + eor sAgo_, sAme, sE1 SEP + eor sAke_, sAgi, sE2 SEP rax1_m1 E0, C4, C1 + eor sAgi_, sAka, sE0 SEP + eor sAga_, sAbo, sE3 SEP + eor sAbo_, 
sAmo, sE3 SEP rax1_m1 E2, C1, C3 + eor sAmo_, sAmi, sE2 SEP + eor sAmi_, sAke, sE1 SEP + eor sAge_, sAgu, sE4 SEP rax1_m1 E4, C3, C0 + eor sAgu_, sAsi, sE2 SEP .unreq vzr + eor sAsi_, sAku, sE4 SEP .unreq tmpp + eor sAku_, sAsa, sE0 SEP + eor sAma_, sAbu, sE4 SEP eor vAba_.16b, vAba.16b, E0.16b + eor sAbu_, sAsu, sE4 SEP + eor sAsu_, sAse, sE1 SEP + eor sAme_, sAga, sE0 SEP xar_m0 vAsa_, vAbi, E2, 2 + eor sAbe_, sAge, sE1 SEP + SEP + load_constant_ptr SEP xar_m0 vAbi_, vAki, E2, 21 + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP xar_m0 vAki_, vAko, E3, 39 + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP xar_m0 vAko_, vAmu, E4, 56 + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP xar_m0 vAmu_, vAso, E3, 8 + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP xar_m0 vAso_, vAma, E0, 23 + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP xar_m0 vAka_, vAbe, E1, 63 + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP xar_m0 vAse_, vAgo, E3, 9 + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP xar_m0 vAgo_, vAme, E1, 19 + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP xar_m0 vAke_, vAgi, E2, 58 + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP xar_m0 vAgi_, vAka, E0, 61 + SEP + ldr cur_const, [const_addr] SEP + mov count, #1 SEP xar_m0 vAga_, vAbo, E3, 36 + SEP + bic tmp, sAma_, sAmu_, ROR #35 SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP xar_m0 vAbo_, vAmo, E3, 43 + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP xar_m0 vAmo_, 
vAmi, E2, 49 + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP xar_m0 vAmi_, vAke, E1, 54 + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP xar_m0 vAge_, vAgu, E4, 44 + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP xar_m0 vAgu_, vAsi, E2, 3 + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP xar_m0 vAsi_, vAku, E4, 25 + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP xar_m0 vAku_, vAsa, E0, 46 + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP xar_m0 vAma_, vAbu, E4, 37 + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP xar_m0 vAbu_, vAsu, E4, 50 + SEP + save count, STACK_OFFSET_COUNT SEP + SEP xar_m0 vAsu_, vAse, E1, 62 + eor sC0, sAka, sAsa, ROR #50 SEP + eor sC1, sAse, sAge, ROR #60 SEP + eor sC2, sAmi, sAgi, ROR #59 SEP xar_m0 vAme_, vAga, E0, 28 + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP + eor sC0, sAma, sC0, ROR #49 SEP xar_m0 vAbe_, vAge, E1, 20 + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP restore sE1, STACK_OFFSET_CONST + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP + eor sC0, sAga, sC0, ROR #57 SEP ld1r {v31.2d}, [sE1], #8 + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor sC3, sAko, sC3, ROR #38 SEP save sE1, STACK_OFFSET_CONST + eor sC4, sAgu, sC4, ROR #48 SEP + eor sC0, s_Aba, sC0, ROR #61 SEP bcax_m0 vAga, vAga_, vAgi_, vAge_ + eor sC1, sAke, sC1, ROR #57 SEP + eor sC2, sAsi, sC2, ROR #52 SEP + eor sC3, sAbo, sC3, ROR #63 SEP bcax_m0 vAge, vAge_, vAgo_, vAgi_ + eor sC4, sAku, sC4, ROR #50 SEP + ror sC1, sC1, 56 SEP + ror sC4, sC4, 58 SEP bcax_m0 vAgi, vAgi_, vAgu_, vAgo_ + ror sC2, 
sC2, 62 SEP + SEP + eor sE1, sC0, sC2, ROR #63 SEP bcax_m0 vAgo, vAgo_, vAga_, vAgu_ + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP bcax_m0 vAgu, vAgu_, vAge_, vAga_ + eor sE4, sC3, sC0, ROR #63 SEP + SEP + eor s_Aba_, sE0, s_Aba SEP bcax_m0 vAka, vAka_, vAki_, vAke_ + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP + eor sAki_, sE3, sAko, ROR #63 SEP bcax_m0 vAke, vAke_, vAko_, vAki_ + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP + eor sAso_, sE0, sAma, ROR #54 SEP bcax_m0 vAki, vAki_, vAku_, vAko_ + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP + eor sAgo_, sE1, sAme, ROR #49 SEP bcax_m0 vAko, vAko_, vAka_, vAku_ + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, ROR #39 SEP + eor sAga_, sE3, sAbo SEP bcax_m0 vAku, vAku_, vAke_, vAka_ + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP + eor sAmi_, sE1, sAke, ROR #56 SEP bcax_m0 vAma, vAma_, vAmi_, vAme_ + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP + eor sAsi_, sE4, sAku, ROR #58 SEP bcax_m0 vAme, vAme_, vAmo_, vAmi_ + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP + eor sAbu_, sE4, sAsu, ROR #9 SEP bcax_m0 vAmi, vAmi_, vAmu_, vAmo_ + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP + eor sAbe_, sE1, sAge, ROR #19 SEP bcax_m0 vAmo, vAmo_, vAma_, vAmu_ + SEP + load_constant_ptr SEP + restore count, STACK_OFFSET_COUNT SEP bcax_m0 vAmu, vAmu_, vAme_, vAma_ + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP bcax_m0 vAsa, vAsa_, vAsi_, vAse_ + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP bcax_m0 vAse, vAse_, vAso_, vAsi_ + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP bcax_m0 vAsi, vAsi_, vAsu_, vAso_ + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, 
tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP bcax_m0 vAso, vAso_, vAsa_, vAsu_ + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP bcax_m0 vAsu, vAsu_, vAse_, vAsa_ + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP bcax_m0 vAba, vAba_, vAbi_, vAbe_ + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP bcax_m0 vAbe, vAbe_, vAbo_, vAbi_ + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP bcax_m0 vAbi, vAbi_, vAbu_, vAbo_ + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP bcax_m0 vAbo, vAbo_, vAba_, vAbu_ + bic tmp, sAma_, sAmu_, ROR #35 SEP + SEP + ldr cur_const, [const_addr, count, UXTW #3] SEP bcax_m0 vAbu, vAbu_, vAbe_, vAba_ + SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP eor vAba.16b, vAba.16b, v31.16b + bic tmp, sAsi_, sAse_, ROR #48 SEP + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + add count, count, #1 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP + SEP +.endm + +.macro hybrid_round_noninitial + save count, STACK_OFFSET_COUNT SEP eor3_m0 C0, vAba, vAga, 
vAka + SEP /* scalar theta: column parities sC0..sC4; the ROR operands presumably undo rotations deferred by the previous round */ + eor sC0, sAka, sAsa, ROR #50 SEP + eor sC1, sAse, sAge, ROR #60 SEP eor3_m0 C0, C0, vAma, vAsa + eor sC2, sAmi, sAgi, ROR #59 SEP + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP eor3_m0 C1, vAbe, vAge, vAke + eor sC0, sAma, sC0, ROR #49 SEP + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP eor3_m0 C1, C1, vAme, vAse + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP + eor sC0, sAga, sC0, ROR #57 SEP eor3_m0 C2, vAbi, vAgi, vAki + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor sC3, sAko, sC3, ROR #38 SEP eor3_m0 C2, C2, vAmi, vAsi + eor sC4, sAgu, sC4, ROR #48 SEP + eor sC0, s_Aba, sC0, ROR #61 SEP + eor sC1, sAke, sC1, ROR #57 SEP eor3_m0 C3, vAbo, vAgo, vAko + eor sC2, sAsi, sC2, ROR #52 SEP + eor sC3, sAbo, sC3, ROR #63 SEP + eor sC4, sAku, sC4, ROR #50 SEP eor3_m0 C3, C3, vAmo, vAso + ror sC1, sC1, 56 SEP + ror sC4, sC4, 58 SEP + ror sC2, sC2, 62 SEP eor3_m0 C4, vAbu, vAgu, vAku + SEP + eor sE1, sC0, sC2, ROR #63 SEP + eor sE3, sC2, sC4, ROR #63 SEP eor3_m0 C4, C4, vAmu, vAsu + eor sE0, sC4, sC1, ROR #63 SEP /* C4 is complete here: folding vAmu/vAsu in a second time would cancel them out of the column parity */ + vzr .req v31 + eor sE2, sC1, sC3, ROR #63 SEP eor vzr.16b, vzr.16b, vzr.16b // zero + eor sE4, sC3, sC0, ROR #63 SEP tmpp .req E1 + SEP rax1_m1 E1, C0, C2 + eor s_Aba_, sE0, s_Aba SEP .unreq tmpp + eor sAsa_, sE2, sAbi, ROR #50 SEP tmpp .req C0 /* NOTE(review): C0 is still read by rax1_m1 E4, C3, C0 below, so rax1_m1 must not write tmpp before then -- verify */ + eor sAbi_, sE2, sAki, ROR #46 SEP rax1_m1 E3, C2, C4 + eor sAki_, sE3, sAko, ROR #63 SEP + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP rax1_m1 E0, C4, C1 + eor sAso_, sE0, sAma, ROR #54 SEP + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP rax1_m1 E2, C1, C3 + eor sAgo_, sE1, sAme, ROR #49 SEP + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, ROR #39 SEP rax1_m1 E4, C3, C0 + eor sAga_, sE3, sAbo SEP .unreq vzr + eor sAbo_, sE3, sAmo, ROR #37 SEP .unreq tmpp + eor sAmo_, sE2, sAmi, ROR #8 SEP + eor sAmi_, sE1, sAke, ROR #56 SEP eor 
vAba_.16b, vAba.16b, E0.16b + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP + eor sAsi_, sE4, sAku, ROR #58 SEP xar_m0 vAsa_, vAbi, E2, 2 + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP + eor sAbu_, sE4, sAsu, ROR #9 SEP xar_m0 vAbi_, vAki, E2, 21 + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP + eor sAbe_, sE1, sAge, ROR #19 SEP xar_m0 vAki_, vAko, E3, 39 + SEP + load_constant_ptr SEP + restore count, STACK_OFFSET_COUNT SEP xar_m0 vAko_, vAmu, E4, 56 + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP xar_m0 vAmu_, vAso, E3, 8 + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP xar_m0 vAso_, vAma, E0, 23 + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP xar_m0 vAka_, vAbe, E1, 63 + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP xar_m0 vAse_, vAgo, E3, 9 + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP xar_m0 vAgo_, vAme, E1, 19 + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP xar_m0 vAke_, vAgi, E2, 58 + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP xar_m0 vAgi_, vAka, E0, 61 + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP xar_m0 vAga_, vAbo, E3, 36 + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP xar_m0 vAbo_, vAmo, E3, 43 + bic tmp, sAma_, sAmu_, ROR #35 SEP + SEP + ldr cur_const, [const_addr, count, UXTW #3] SEP xar_m0 vAmo_, vAmi, E2, 49 + add count, count, #1 SEP + SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP xar_m0 vAmi_, vAke, E1, 54 + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, 
ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP xar_m0 vAge_, vAgu, E4, 44 + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP xar_m0 vAgu_, vAsi, E2, 3 + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP xar_m0 vAsi_, vAku, E4, 25 + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP xar_m0 vAku_, vAsa, E0, 46 + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP xar_m0 vAma_, vAbu, E4, 37 + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP xar_m0 vAbu_, vAsu, E4, 50 + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP xar_m0 vAsu_, vAse, E1, 62 + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP xar_m0 vAme_, vAga, E0, 28 + save count, STACK_OFFSET_COUNT SEP + SEP + eor sC0, sAka, sAsa, ROR #50 SEP xar_m0 vAbe_, vAge, E1, 20 + eor sC1, sAse, sAge, ROR #60 SEP + eor sC2, sAmi, sAgi, ROR #59 SEP + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP restore sE1, STACK_OFFSET_CONST + eor sC0, sAma, sC0, ROR #49 SEP + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP ld1r {v31.2d}, [sE1], #8 + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP + eor sC0, sAga, sC0, ROR #57 SEP save sE1, STACK_OFFSET_CONST + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor sC3, sAko, sC3, ROR #38 SEP + eor sC4, sAgu, sC4, ROR #48 SEP bcax_m0 vAga, vAga_, vAgi_, vAge_ + eor sC0, s_Aba, sC0, ROR #61 SEP + eor sC1, sAke, sC1, ROR #57 SEP + eor sC2, sAsi, sC2, ROR #52 SEP bcax_m0 vAge, vAge_, vAgo_, vAgi_ + eor sC3, sAbo, sC3, ROR #63 SEP + eor sC4, sAku, sC4, ROR #50 SEP + ror sC1, sC1, 56 SEP bcax_m0 vAgi, vAgi_, vAgu_, vAgo_ + ror sC4, sC4, 58 SEP + ror 
sC2, sC2, 62 SEP + SEP bcax_m0 vAgo, vAgo_, vAga_, vAgu_ + eor sE1, sC0, sC2, ROR #63 SEP + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP bcax_m0 vAgu, vAgu_, vAge_, vAga_ + eor sE2, sC1, sC3, ROR #63 SEP + eor sE4, sC3, sC0, ROR #63 SEP + SEP bcax_m0 vAka, vAka_, vAki_, vAke_ + eor s_Aba_, sE0, s_Aba SEP + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP bcax_m0 vAke, vAke_, vAko_, vAki_ + eor sAki_, sE3, sAko, ROR #63 SEP + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP bcax_m0 vAki, vAki_, vAku_, vAko_ + eor sAso_, sE0, sAma, ROR #54 SEP + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP bcax_m0 vAko, vAko_, vAka_, vAku_ + eor sAgo_, sE1, sAme, ROR #49 SEP + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, ROR #39 SEP bcax_m0 vAku, vAku_, vAke_, vAka_ + eor sAga_, sE3, sAbo SEP + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP bcax_m0 vAma, vAma_, vAmi_, vAme_ + eor sAmi_, sE1, sAke, ROR #56 SEP + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP bcax_m0 vAme, vAme_, vAmo_, vAmi_ + eor sAsi_, sE4, sAku, ROR #58 SEP + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP bcax_m0 vAmi, vAmi_, vAmu_, vAmo_ + eor sAbu_, sE4, sAsu, ROR #9 SEP + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP bcax_m0 vAmo, vAmo_, vAma_, vAmu_ + eor sAbe_, sE1, sAge, ROR #19 SEP + SEP + load_constant_ptr SEP bcax_m0 vAmu, vAmu_, vAme_, vAma_ + restore count, STACK_OFFSET_COUNT SEP + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP bcax_m0 vAsa, vAsa_, vAsi_, vAse_ + eor sAga, tmp, sAga_, ROR #39 SEP + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP bcax_m0 vAse, vAse_, vAso_, vAsi_ + bic tmp, sAgu_, sAgo_, ROR #16 SEP + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP bcax_m0 vAsi, vAsi_, vAsu_, vAso_ + eor sAgo, tmp, sAgo_, ROR #47 SEP + bic tmp, sAge_, sAga_, ROR #56 SEP + eor 
sAgu, tmp, sAgu_, ROR #23 SEP bcax_m0 vAso, vAso_, vAsa_, vAsu_ + bic tmp, sAki_, sAke_, ROR #19 SEP + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP bcax_m0 vAsu, vAsu_, vAse_, vAsa_ + eor sAke, tmp, sAke_, ROR #2 SEP + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP bcax_m0 vAba, vAba_, vAbi_, vAbe_ + bic tmp, sAka_, sAku_, ROR #47 SEP + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP bcax_m0 vAbe, vAbe_, vAbo_, vAbi_ + eor sAku, tmp, sAku_, ROR #52 SEP + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP bcax_m0 vAbi, vAbi_, vAbu_, vAbo_ + bic tmp, sAmo_, sAmi_, ROR #5 SEP + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP bcax_m0 vAbo, vAbo_, vAba_, vAbu_ + eor sAmi, tmp, sAmi_, ROR #46 SEP + bic tmp, sAma_, sAmu_, ROR #35 SEP + SEP bcax_m0 vAbu, vAbu_, vAbe_, vAba_ + ldr cur_const, [const_addr, count, UXTW #3] SEP + add count, count, #1 SEP + SEP eor vAba.16b, vAba.16b, v31.16b + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP + +.endm + +.macro final_rotate + ror sAga, sAga,#(64-3) + ror sAka, sAka,#(64-25) + ror sAma, sAma,#(64-10) + 
ror sAsa, sAsa,#(64-39) + ror sAbe, sAbe,#(64-21) + ror sAge, sAge,#(64-45) + ror sAke, sAke,#(64-8) + ror sAme, sAme,#(64-15) + ror sAse, sAse,#(64-41) + ror sAbi, sAbi,#(64-14) + ror sAgi, sAgi,#(64-61) + ror sAki, sAki,#(64-18) + ror sAmi, sAmi,#(64-56) + ror sAsi, sAsi,#(64-2) + ror sAgo, sAgo,#(64-28) + ror sAko, sAko,#(64-1) + ror sAmo, sAmo,#(64-27) + ror sAso, sAso,#(64-62) + ror sAbu, sAbu,#(64-44) + ror sAgu, sAgu,#(64-20) + ror sAku, sAku,#(64-6) + ror sAmu, sAmu,#(64-36) + ror sAsu, sAsu,#(64-55) +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.global keccak_f1600_x4_hybrid_asm_v2p0 +.global _keccak_f1600_x4_hybrid_asm_v2p0 +.text +.align 4 + +keccak_f1600_x4_hybrid_asm_v2p0: +_keccak_f1600_x4_hybrid_asm_v2p0: + alloc_stack + save_gprs + save_vregs + save input_addr, STACK_OFFSET_INPUT + + load_input_vector 2,1 + + load_constant_ptr + save const_addr, STACK_OFFSET_CONST + + // First scalar Keccak computation alongside first half of SIMD computation + load_input_scalar 4,0 + hybrid_round_initial + loop_0: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-1) + ble loop_0 + final_rotate + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4,0 + + // Second scalar Keccak computation alongside second half of SIMD computation + load_input_scalar 4,1 + hybrid_round_initial + loop_1: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-1) + ble loop_1 + final_rotate + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4, 1 + + store_input_vector 2,1 + + restore_vregs + restore_gprs + free_stack + ret + +#endif diff --git a/asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v3.s b/asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v3.s new file mode 100644 index 0000000..44795aa --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v3.s @@ -0,0 +1,1015 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted,
free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x29 + count .req w27 + cur_const .req x26 + + /* Mapping of Keccak-f1600 SIMD state to vector registers + * at the beginning and end of each round.
*/ + + vAba .req v0 + vAbe .req v1 + vAbi .req v2 + vAbo .req v3 + vAbu .req v4 + vAga .req v5 + vAge .req v6 + vAgi .req v7 + vAgo .req v8 + vAgu .req v9 + vAka .req v10 + vAke .req v11 + vAki .req v12 + vAko .req v13 + vAku .req v14 + vAma .req v15 + vAme .req v16 + vAmi .req v17 + vAmo .req v18 + vAmu .req v19 + vAsa .req v20 + vAse .req v21 + vAsi .req v22 + vAso .req v23 + vAsu .req v24 + + /* q-form of the above mapping */ + vAbaq .req q0 + vAbeq .req q1 + vAbiq .req q2 + vAboq .req q3 + vAbuq .req q4 + vAgaq .req q5 + vAgeq .req q6 + vAgiq .req q7 + vAgoq .req q8 + vAguq .req q9 + vAkaq .req q10 + vAkeq .req q11 + vAkiq .req q12 + vAkoq .req q13 + vAkuq .req q14 + vAmaq .req q15 + vAmeq .req q16 + vAmiq .req q17 + vAmoq .req q18 + vAmuq .req q19 + vAsaq .req q20 + vAseq .req q21 + vAsiq .req q22 + vAsoq .req q23 + vAsuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v30 + C1 .req v29 + C2 .req v28 + C3 .req v27 + C4 .req v26 + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req v26 + E1 .req v25 + E2 .req v29 + E3 .req v28 + E4 .req v27 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vAbi_ .req v2 + vAbo_ .req v3 + vAbu_ .req v4 + vAga_ .req v10 + vAge_ .req v11 + vAgi_ .req v7 + vAgo_ .req v8 + vAgu_ .req v9 + vAka_ .req v15 + vAke_ .req v16 + vAki_ .req v12 + vAko_ .req v13 + vAku_ .req v14 + vAma_ .req v20 + vAme_ .req v21 + vAmi_ .req v17 + vAmo_ .req v18 + vAmu_ .req v19 + vAsa_ .req v0 + vAse_ .req v1 + vAsi_ .req v22 + vAso_ .req v23 + vAsu_ .req v24 + vAba_ .req v30 + vAbe_ .req v27 + + /* Scratch register clobbered by the rax1_m1, xar_m1 and bcax_m1 macros */ + vtmp .req v31 + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round.
*/ + s_Aba .req x1 + sAbe .req x6 + sAbi .req x11 + sAbo .req x16 + sAbu .req x21 + sAga .req x2 + sAge .req x7 + sAgi .req x12 + sAgo .req x17 + sAgu .req x22 + sAka .req x3 + sAke .req x8 + sAki .req x13 + sAko .req x18 + sAku .req x23 + sAma .req x4 + sAme .req x9 + sAmi .req x14 + sAmo .req x19 + sAmu .req x24 + sAsa .req x5 + sAse .req x10 + sAsi .req x15 + sAso .req x20 + sAsu .req x25 + + /* sA_[y,2*x+3*y] = rot(A[x,y]) */ + s_Aba_ .req x0 + sAbe_ .req x28 + sAbi_ .req x11 + sAbo_ .req x16 + sAbu_ .req x21 + sAga_ .req x3 + sAge_ .req x8 + sAgi_ .req x12 + sAgo_ .req x17 + sAgu_ .req x22 + sAka_ .req x4 + sAke_ .req x9 + sAki_ .req x13 + sAko_ .req x18 + sAku_ .req x23 + sAma_ .req x5 + sAme_ .req x10 + sAmi_ .req x14 + sAmo_ .req x19 + sAmu_ .req x24 + sAsa_ .req x1 + sAse_ .req x6 + sAsi_ .req x15 + sAso_ .req x20 + sAsu_ .req x25 + + /* sC[x] = sA[x,0] xor sA[x,1] xor sA[x,2] xor sA[x,3] xor sA[x,4], for x in 0..4 */ + /* sE[x] = sC[x-1] xor rot(sC[x+1],1), for x in 0..4 */ + sC0 .req x0 + sE0 .req x29 + sC1 .req x26 + sE1 .req x30 + sC2 .req x27 + sE2 .req x26 + sC3 .req x28 + sE3 .req x27 + sC4 .req x29 + sE4 .req x28 + + tmp .req x30 + +/************************ MACROS ****************************/ + +/* SHA-3 vector primitives: the _m1 macros emulate them with base AdvSIMD instructions (using vtmp as scratch), the _m0 macros use the v8.4-A SHA-3 instructions */ + + +.macro eor3_m1 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro rax1_m1 d s0 s1 + add vtmp.2d, \s1\().2d, \s1\().2d + sri vtmp.2d, \s1\().2d, #63 + eor \d\().16b, vtmp.16b, \s0\().16b +.endm + +.macro xar_m1 d s0 s1 imm + eor vtmp.16b, \s0\().16b, \s1\().16b + shl \d\().2d, vtmp.2d, #(64-\imm) + sri \d\().2d, vtmp.2d, #(\imm) +.endm + +.macro bcax_m1 d s0 s1 s2 + bic vtmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, vtmp.16b, \s0\().16b + .endm + + +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm + xar \d\().2d,
\s0\().2d, \s1\().2d, #\imm +.endm + +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + + +.macro load_input_vector num idx + ldr vAbaq, [input_addr, #(16*(\num*0+\idx))] + ldr vAbeq, [input_addr, #(16*(\num*1+\idx))] + ldr vAbiq, [input_addr, #(16*(\num*2+\idx))] + ldr vAboq, [input_addr, #(16*(\num*3+\idx))] + ldr vAbuq, [input_addr, #(16*(\num*4+\idx))] + ldr vAgaq, [input_addr, #(16*(\num*5+\idx))] + ldr vAgeq, [input_addr, #(16*(\num*6+\idx))] + ldr vAgiq, [input_addr, #(16*(\num*7+\idx))] + ldr vAgoq, [input_addr, #(16*(\num*8+\idx))] + ldr vAguq, [input_addr, #(16*(\num*9+\idx))] + ldr vAkaq, [input_addr, #(16*(\num*10+\idx))] + ldr vAkeq, [input_addr, #(16*(\num*11+\idx))] + ldr vAkiq, [input_addr, #(16*(\num*12+\idx))] + ldr vAkoq, [input_addr, #(16*(\num*13+\idx))] + ldr vAkuq, [input_addr, #(16*(\num*14+\idx))] + ldr vAmaq, [input_addr, #(16*(\num*15+\idx))] + ldr vAmeq, [input_addr, #(16*(\num*16+\idx))] + ldr vAmiq, [input_addr, #(16*(\num*17+\idx))] + ldr vAmoq, [input_addr, #(16*(\num*18+\idx))] + ldr vAmuq, [input_addr, #(16*(\num*19+\idx))] + ldr vAsaq, [input_addr, #(16*(\num*20+\idx))] + ldr vAseq, [input_addr, #(16*(\num*21+\idx))] + ldr vAsiq, [input_addr, #(16*(\num*22+\idx))] + ldr vAsoq, [input_addr, #(16*(\num*23+\idx))] + ldr vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_vector num idx + str vAbaq, [input_addr, #(16*(\num*0+\idx))] + str vAbeq, [input_addr, #(16*(\num*1+\idx))] + str vAbiq, [input_addr, #(16*(\num*2+\idx))] + str vAboq, [input_addr, #(16*(\num*3+\idx))] + str vAbuq, [input_addr, #(16*(\num*4+\idx))] + str vAgaq, [input_addr, #(16*(\num*5+\idx))] + str vAgeq, [input_addr, #(16*(\num*6+\idx))] + str vAgiq, [input_addr, #(16*(\num*7+\idx))] + str vAgoq, [input_addr, #(16*(\num*8+\idx))] + str vAguq, [input_addr, #(16*(\num*9+\idx))] + str vAkaq, [input_addr, #(16*(\num*10+\idx))] + str vAkeq, [input_addr, #(16*(\num*11+\idx))] + str vAkiq, [input_addr, 
#(16*(\num*12+\idx))] + str vAkoq, [input_addr, #(16*(\num*13+\idx))] + str vAkuq, [input_addr, #(16*(\num*14+\idx))] + str vAmaq, [input_addr, #(16*(\num*15+\idx))] + str vAmeq, [input_addr, #(16*(\num*16+\idx))] + str vAmiq, [input_addr, #(16*(\num*17+\idx))] + str vAmoq, [input_addr, #(16*(\num*18+\idx))] + str vAmuq, [input_addr, #(16*(\num*19+\idx))] + str vAsaq, [input_addr, #(16*(\num*20+\idx))] + str vAseq, [input_addr, #(16*(\num*21+\idx))] + str vAsiq, [input_addr, #(16*(\num*22+\idx))] + str vAsoq, [input_addr, #(16*(\num*23+\idx))] + str vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_scalar num idx + str s_Aba, [input_addr, 8*(\num*(0) +\idx)] + str sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + str sAbi, [input_addr, 8*(\num*(2)+ \idx)] + str sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + str sAbu, [input_addr, 8*(\num*(4)+ \idx)] + str sAga, [input_addr, 8*(\num*(4+1) +\idx)] + str sAge, [input_addr, 8*(\num*(6)+ \idx)] + str sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + str sAgo, [input_addr, 8*(\num*(8)+ \idx)] + str sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + str sAka, [input_addr, 8*(\num*(10) +\idx)] + str sAke, [input_addr, 8*(\num*(10+1)+\idx)] + str sAki, [input_addr, 8*(\num*(12) +\idx)] + str sAko, [input_addr, 8*(\num*(12+1)+\idx)] + str sAku, [input_addr, 8*(\num*(14) +\idx)] + str sAma, [input_addr, 8*(\num*(14+1)+\idx)] + str sAme, [input_addr, 8*(\num*(16) +\idx)] + str sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + str sAmo, [input_addr, 8*(\num*(18) +\idx)] + str sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + str sAsa, [input_addr, 8*(\num*(20) +\idx)] + str sAse, [input_addr, 8*(\num*(20+1)+\idx)] + str sAsi, [input_addr, 8*(\num*(22) +\idx)] + str sAso, [input_addr, 8*(\num*(22+1)+\idx)] + str sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +.macro load_input_scalar num idx + ldr s_Aba, [input_addr, 8*(\num*(0) +\idx)] + ldr sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + ldr sAbi, [input_addr, 8*(\num*(2)+ \idx)] + ldr sAbo, 
[input_addr, 8*(\num*(2+1) +\idx)] + ldr sAbu, [input_addr, 8*(\num*(4)+ \idx)] + ldr sAga, [input_addr, 8*(\num*(4+1) +\idx)] + ldr sAge, [input_addr, 8*(\num*(6)+ \idx)] + ldr sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + ldr sAgo, [input_addr, 8*(\num*(8)+ \idx)] + ldr sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + ldr sAka, [input_addr, 8*(\num*(10) +\idx)] + ldr sAke, [input_addr, 8*(\num*(10+1)+\idx)] + ldr sAki, [input_addr, 8*(\num*(12) +\idx)] + ldr sAko, [input_addr, 8*(\num*(12+1)+\idx)] + ldr sAku, [input_addr, 8*(\num*(14) +\idx)] + ldr sAma, [input_addr, 8*(\num*(14+1)+\idx)] + ldr sAme, [input_addr, 8*(\num*(16) +\idx)] + ldr sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + ldr sAmo, [input_addr, 8*(\num*(18) +\idx)] + ldr sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + ldr sAsa, [input_addr, 8*(\num*(20) +\idx)] + ldr sAse, [input_addr, 8*(\num*(20+1)+\idx)] + ldr sAsi, [input_addr, 8*(\num*(22) +\idx)] + ldr sAso, [input_addr, 8*(\num*(22+1)+\idx)] + ldr sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +#define STACK_SIZE (8*8 + 16*6 + 3*8 + 8) // VREGS (8*8), GPRs (16*6), count (8), const (8), input (8), padding (8) +#define STACK_BASE_GPRS (3*8+8) +#define STACK_BASE_VREGS (3*8+8+16*6) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro save_vregs + stp d8, d9, 
[sp,#(STACK_BASE_VREGS+0*16)] + stp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + stp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + stp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] +.endm + +.macro restore_vregs + ldp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] + ldp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + ldp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + ldp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] +.endm + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro eor5 dst, src0, src1, src2, src3, src4 + eor \dst, \src0, \src1 + eor \dst, \dst, \src2 + eor \dst, \dst, \src3 + eor \dst, \dst, \src4 +.endm + +.macro xor_rol dst, src1, src0, imm + eor \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro bic_rol dst, src1, src0, imm + bic \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro rotate dst, src, imm + ror \dst, \src, #(64-\imm) +.endm + +.macro save reg, offset + str \reg, [sp, #\offset] +.endm + +.macro restore reg, offset + ldr \reg, [sp, #\offset] +.endm + +.macro hybrid_round_initial + + eor sC0, sAma, sAsa SEP eor3_m1 C0, vAba, vAga, vAka + eor sC1, sAme, sAse SEP + eor sC2, sAmi, sAsi SEP + eor sC3, sAmo, sAso SEP eor3_m1 C0, C0, vAma, vAsa + eor sC4, sAmu, sAsu SEP + eor sC0, sAka, sC0 SEP + eor sC1, sAke, sC1 SEP eor3_m1 C1, vAbe, vAge, vAke + eor sC2, sAki, sC2 SEP + eor sC3, sAko, sC3 SEP + eor sC4, sAku, sC4 SEP eor3_m1 C1, C1, vAme, vAse + eor sC0, sAga, sC0 SEP + eor sC1, sAge, sC1 SEP + eor sC2, sAgi, sC2 SEP eor3_m1 C2, vAbi, vAgi, vAki + eor sC3, sAgo, sC3 SEP + eor sC4, sAgu, sC4 SEP + eor sC0, s_Aba, sC0 SEP eor3_m1 C2, C2, vAmi, vAsi + eor sC1, sAbe, sC1 SEP + eor sC2, sAbi, sC2 SEP + eor sC3, sAbo, sC3 SEP eor3_m1 C3, vAbo, vAgo, vAko + eor sC4, sAbu, sC4 SEP + SEP + eor sE1, sC0, sC2, ROR #63 SEP eor3_m1 C3, C3, vAmo, vAso + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP eor3_m1 C4, vAbu, vAgu, vAku + eor sE4, sC3, sC0, ROR #63 SEP + SEP + eor 
s_Aba_, s_Aba, sE0 SEP eor3_m1 C4, C4, vAmu, vAsu + eor sAsa_, sAbi, sE2 SEP + eor sAbi_, sAki, sE2 SEP + eor sAki_, sAko, sE3 SEP + eor sAko_, sAmu, sE4 SEP rax1_m1 E1, C0, C2 + eor sAmu_, sAso, sE3 SEP + eor sAso_, sAma, sE0 SEP + eor sAka_, sAbe, sE1 SEP rax1_m1 E3, C2, C4 + eor sAse_, sAgo, sE3 SEP + eor sAgo_, sAme, sE1 SEP + eor sAke_, sAgi, sE2 SEP rax1_m1 E0, C4, C1 + eor sAgi_, sAka, sE0 SEP + eor sAga_, sAbo, sE3 SEP + eor sAbo_, sAmo, sE3 SEP rax1_m1 E2, C1, C3 + eor sAmo_, sAmi, sE2 SEP + eor sAmi_, sAke, sE1 SEP + eor sAge_, sAgu, sE4 SEP rax1_m1 E4, C3, C0 + eor sAgu_, sAsi, sE2 SEP + eor sAsi_, sAku, sE4 SEP + eor sAku_, sAsa, sE0 SEP + eor sAma_, sAbu, sE4 SEP eor vAba_.16b, vAba.16b, E0.16b + eor sAbu_, sAsu, sE4 SEP + eor sAsu_, sAse, sE1 SEP + eor sAme_, sAga, sE0 SEP xar_m1 vAsa_, vAbi, E2, 2 + eor sAbe_, sAge, sE1 SEP + SEP + load_constant_ptr SEP xar_m1 vAbi_, vAki, E2, 21 + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP xar_m1 vAki_, vAko, E3, 39 + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP xar_m1 vAko_, vAmu, E4, 56 + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP xar_m1 vAmu_, vAso, E3, 8 + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP xar_m1 vAso_, vAma, E0, 23 + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP xar_m1 vAka_, vAbe, E1, 63 + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP xar_m1 vAse_, vAgo, E3, 9 + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP xar_m1 vAgo_, vAme, E1, 19 + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP xar_m1 vAke_, vAgi, E2, 58 + eor sAme, tmp, sAme_, ROR #43 
SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP xar_m1 vAgi_, vAka, E0, 61 + SEP + ldr cur_const, [const_addr] SEP + mov count, #1 SEP xar_m1 vAga_, vAbo, E3, 36 + SEP + bic tmp, sAma_, sAmu_, ROR #35 SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP xar_m1 vAbo_, vAmo, E3, 43 + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP xar_m1 vAmo_, vAmi, E2, 49 + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP xar_m1 vAmi_, vAke, E1, 54 + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP xar_m1 vAge_, vAgu, E4, 44 + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP xar_m1 vAgu_, vAsi, E2, 3 + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP xar_m1 vAsi_, vAku, E4, 25 + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP xar_m1 vAku_, vAsa, E0, 46 + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP xar_m1 vAma_, vAbu, E4, 37 + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP xar_m1 vAbu_, vAsu, E4, 50 + SEP + save count, STACK_OFFSET_COUNT SEP + SEP xar_m1 vAsu_, vAse, E1, 62 + eor sC0, sAka, sAsa, ROR #50 SEP + eor sC1, sAse, sAge, ROR #60 SEP + eor sC2, sAmi, sAgi, ROR #59 SEP xar_m1 vAme_, vAga, E0, 28 + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP + eor sC0, sAma, sC0, ROR #49 SEP xar_m1 vAbe_, vAge, E1, 20 + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP restore sE1, STACK_OFFSET_CONST + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP + eor sC0, sAga, sC0, ROR #57 SEP ld1r {v28.2d}, [sE1], #8 + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor 
sC3, sAko, sC3, ROR #38 SEP save sE1, STACK_OFFSET_CONST + eor sC4, sAgu, sC4, ROR #48 SEP + eor sC0, s_Aba, sC0, ROR #61 SEP bcax_m1 vAga, vAga_, vAgi_, vAge_ + eor sC1, sAke, sC1, ROR #57 SEP + eor sC2, sAsi, sC2, ROR #52 SEP + eor sC3, sAbo, sC3, ROR #63 SEP bcax_m1 vAge, vAge_, vAgo_, vAgi_ + eor sC4, sAku, sC4, ROR #50 SEP + ror sC1, sC1, 56 SEP + ror sC4, sC4, 58 SEP bcax_m1 vAgi, vAgi_, vAgu_, vAgo_ + ror sC2, sC2, 62 SEP + SEP + eor sE1, sC0, sC2, ROR #63 SEP bcax_m1 vAgo, vAgo_, vAga_, vAgu_ + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP bcax_m1 vAgu, vAgu_, vAge_, vAga_ + eor sE4, sC3, sC0, ROR #63 SEP + SEP + eor s_Aba_, sE0, s_Aba SEP bcax_m1 vAka, vAka_, vAki_, vAke_ + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP + eor sAki_, sE3, sAko, ROR #63 SEP bcax_m1 vAke, vAke_, vAko_, vAki_ + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP + eor sAso_, sE0, sAma, ROR #54 SEP bcax_m1 vAki, vAki_, vAku_, vAko_ + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP + eor sAgo_, sE1, sAme, ROR #49 SEP bcax_m1 vAko, vAko_, vAka_, vAku_ + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, ROR #39 SEP + eor sAga_, sE3, sAbo SEP bcax_m1 vAku, vAku_, vAke_, vAka_ + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP + eor sAmi_, sE1, sAke, ROR #56 SEP bcax_m1 vAma, vAma_, vAmi_, vAme_ + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP + eor sAsi_, sE4, sAku, ROR #58 SEP bcax_m1 vAme, vAme_, vAmo_, vAmi_ + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP + eor sAbu_, sE4, sAsu, ROR #9 SEP bcax_m1 vAmi, vAmi_, vAmu_, vAmo_ + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP + eor sAbe_, sE1, sAge, ROR #19 SEP bcax_m1 vAmo, vAmo_, vAma_, vAmu_ + SEP + load_constant_ptr SEP + restore count, STACK_OFFSET_COUNT SEP bcax_m1 vAmu, vAmu_, vAme_, vAma_ + SEP + bic tmp, sAgi_, sAge_, 
ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP bcax_m1 vAsa, vAsa_, vAsi_, vAse_ + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP bcax_m1 vAse, vAse_, vAso_, vAsi_ + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP bcax_m1 vAsi, vAsi_, vAsu_, vAso_ + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP bcax_m1 vAso, vAso_, vAsa_, vAsu_ + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP bcax_m1 vAsu, vAsu_, vAse_, vAsa_ + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP bcax_m1 vAba, vAba_, vAbi_, vAbe_ + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP bcax_m1 vAbe, vAbe_, vAbo_, vAbi_ + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP bcax_m1 vAbi, vAbi_, vAbu_, vAbo_ + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP bcax_m1 vAbo, vAbo_, vAba_, vAbu_ + bic tmp, sAma_, sAmu_, ROR #35 SEP + SEP + ldr cur_const, [const_addr, count, UXTW #3] SEP bcax_m1 vAbu, vAbu_, vAbe_, vAba_ + SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP eor vAba.16b, vAba.16b, v28.16b + bic tmp, sAsi_, sAse_, ROR #48 SEP + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP + eor sAbe, tmp, sAbe_, 
ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + add count, count, #1 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP + SEP +.endm + +.macro hybrid_round_noninitial + save count, STACK_OFFSET_COUNT SEP eor3_m1 C0, vAba, vAga, vAka + SEP + eor sC0, sAka, sAsa, ROR #50 SEP + eor sC1, sAse, sAge, ROR #60 SEP eor3_m1 C0, C0, vAma, vAsa + eor sC2, sAmi, sAgi, ROR #59 SEP + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP eor3_m1 C1, vAbe, vAge, vAke + eor sC0, sAma, sC0, ROR #49 SEP + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP eor3_m1 C1, C1, vAme, vAse + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP + eor sC0, sAga, sC0, ROR #57 SEP eor3_m1 C2, vAbi, vAgi, vAki + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor sC3, sAko, sC3, ROR #38 SEP eor3_m1 C2, C2, vAmi, vAsi + eor sC4, sAgu, sC4, ROR #48 SEP + eor sC0, s_Aba, sC0, ROR #61 SEP + eor sC1, sAke, sC1, ROR #57 SEP eor3_m1 C3, vAbo, vAgo, vAko + eor sC2, sAsi, sC2, ROR #52 SEP + eor sC3, sAbo, sC3, ROR #63 SEP + eor sC4, sAku, sC4, ROR #50 SEP eor3_m1 C3, C3, vAmo, vAso + ror sC1, sC1, 56 SEP + ror sC4, sC4, 58 SEP + ror sC2, sC2, 62 SEP eor3_m1 C4, vAbu, vAgu, vAku + SEP + eor sE1, sC0, sC2, ROR #63 SEP + eor sE3, sC2, sC4, ROR #63 SEP eor3_m1 C4, C4, vAmu, vAsu + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP + eor sE4, sC3, sC0, ROR #63 SEP + SEP rax1_m1 E1, C0, C2 + eor s_Aba_, sE0, s_Aba SEP + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP rax1_m1 E3, C2, C4 + eor sAki_, sE3, sAko, ROR #63 SEP + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP rax1_m1 E0, C4, C1 + eor sAso_, sE0, sAma, ROR #54 SEP + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP rax1_m1 E2, C1, C3 
+ eor sAgo_, sE1, sAme, ROR #49 SEP + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, ROR #39 SEP rax1_m1 E4, C3, C0 + eor sAga_, sE3, sAbo SEP + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP + eor sAmi_, sE1, sAke, ROR #56 SEP eor vAba_.16b, vAba.16b, E0.16b + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP + eor sAsi_, sE4, sAku, ROR #58 SEP xar_m1 vAsa_, vAbi, E2, 2 + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP + eor sAbu_, sE4, sAsu, ROR #9 SEP xar_m1 vAbi_, vAki, E2, 21 + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP + eor sAbe_, sE1, sAge, ROR #19 SEP xar_m1 vAki_, vAko, E3, 39 + SEP + load_constant_ptr SEP + restore count, STACK_OFFSET_COUNT SEP xar_m1 vAko_, vAmu, E4, 56 + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP xar_m1 vAmu_, vAso, E3, 8 + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP xar_m1 vAso_, vAma, E0, 23 + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP xar_m1 vAka_, vAbe, E1, 63 + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP xar_m1 vAse_, vAgo, E3, 9 + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP xar_m1 vAgo_, vAme, E1, 19 + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP xar_m1 vAke_, vAgi, E2, 58 + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP xar_m1 vAgi_, vAka, E0, 61 + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP xar_m1 vAga_, vAbo, E3, 36 + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP xar_m1 vAbo_, vAmo, E3, 43 + bic tmp, 
sAma_, sAmu_, ROR #35 SEP + SEP + ldr cur_const, [const_addr, count, UXTW #3] SEP xar_m1 vAmo_, vAmi, E2, 49 + add count, count, #1 SEP + SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP xar_m1 vAmi_, vAke, E1, 54 + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP xar_m1 vAge_, vAgu, E4, 44 + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP xar_m1 vAgu_, vAsi, E2, 3 + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP xar_m1 vAsi_, vAku, E4, 25 + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP xar_m1 vAku_, vAsa, E0, 46 + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP xar_m1 vAma_, vAbu, E4, 37 + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP xar_m1 vAbu_, vAsu, E4, 50 + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP xar_m1 vAsu_, vAse, E1, 62 + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP xar_m1 vAme_, vAga, E0, 28 + save count, STACK_OFFSET_COUNT SEP + SEP + eor sC0, sAka, sAsa, ROR #50 SEP xar_m1 vAbe_, vAge, E1, 20 + eor sC1, sAse, sAge, ROR #60 SEP + eor sC2, sAmi, sAgi, ROR #59 SEP + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP restore sE1, STACK_OFFSET_CONST + eor sC0, sAma, sC0, ROR #49 SEP + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP ld1r {v28.2d}, [sE1], #8 + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP + eor sC0, sAga, sC0, ROR #57 SEP save sE1, STACK_OFFSET_CONST + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor sC3, sAko, sC3, ROR #38 SEP + eor sC4, sAgu, sC4, ROR #48 SEP bcax_m1 vAga, vAga_, vAgi_, vAge_ + eor sC0, s_Aba, sC0, ROR 
#61 SEP + eor sC1, sAke, sC1, ROR #57 SEP + eor sC2, sAsi, sC2, ROR #52 SEP bcax_m1 vAge, vAge_, vAgo_, vAgi_ + eor sC3, sAbo, sC3, ROR #63 SEP + eor sC4, sAku, sC4, ROR #50 SEP + ror sC1, sC1, 56 SEP bcax_m1 vAgi, vAgi_, vAgu_, vAgo_ + ror sC4, sC4, 58 SEP + ror sC2, sC2, 62 SEP + SEP bcax_m1 vAgo, vAgo_, vAga_, vAgu_ + eor sE1, sC0, sC2, ROR #63 SEP + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP bcax_m1 vAgu, vAgu_, vAge_, vAga_ + eor sE2, sC1, sC3, ROR #63 SEP + eor sE4, sC3, sC0, ROR #63 SEP + SEP bcax_m1 vAka, vAka_, vAki_, vAke_ + eor s_Aba_, sE0, s_Aba SEP + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP bcax_m1 vAke, vAke_, vAko_, vAki_ + eor sAki_, sE3, sAko, ROR #63 SEP + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP bcax_m1 vAki, vAki_, vAku_, vAko_ + eor sAso_, sE0, sAma, ROR #54 SEP + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP bcax_m1 vAko, vAko_, vAka_, vAku_ + eor sAgo_, sE1, sAme, ROR #49 SEP + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, ROR #39 SEP bcax_m1 vAku, vAku_, vAke_, vAka_ + eor sAga_, sE3, sAbo SEP + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP bcax_m1 vAma, vAma_, vAmi_, vAme_ + eor sAmi_, sE1, sAke, ROR #56 SEP + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP bcax_m1 vAme, vAme_, vAmo_, vAmi_ + eor sAsi_, sE4, sAku, ROR #58 SEP + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP bcax_m1 vAmi, vAmi_, vAmu_, vAmo_ + eor sAbu_, sE4, sAsu, ROR #9 SEP + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP bcax_m1 vAmo, vAmo_, vAma_, vAmu_ + eor sAbe_, sE1, sAge, ROR #19 SEP + SEP + load_constant_ptr SEP bcax_m1 vAmu, vAmu_, vAme_, vAma_ + restore count, STACK_OFFSET_COUNT SEP + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP bcax_m1 vAsa, vAsa_, vAsi_, vAse_ + eor sAga, tmp, sAga_, ROR #39 SEP + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 
SEP bcax_m1 vAse, vAse_, vAso_, vAsi_ + bic tmp, sAgu_, sAgo_, ROR #16 SEP + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP bcax_m1 vAsi, vAsi_, vAsu_, vAso_ + eor sAgo, tmp, sAgo_, ROR #47 SEP + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP bcax_m1 vAso, vAso_, vAsa_, vAsu_ + bic tmp, sAki_, sAke_, ROR #19 SEP + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP bcax_m1 vAsu, vAsu_, vAse_, vAsa_ + eor sAke, tmp, sAke_, ROR #2 SEP + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP bcax_m1 vAba, vAba_, vAbi_, vAbe_ + bic tmp, sAka_, sAku_, ROR #47 SEP + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP bcax_m1 vAbe, vAbe_, vAbo_, vAbi_ + eor sAku, tmp, sAku_, ROR #52 SEP + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP bcax_m1 vAbi, vAbi_, vAbu_, vAbo_ + bic tmp, sAmo_, sAmi_, ROR #5 SEP + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP bcax_m1 vAbo, vAbo_, vAba_, vAbu_ + eor sAmi, tmp, sAmi_, ROR #46 SEP + bic tmp, sAma_, sAmu_, ROR #35 SEP + SEP bcax_m1 vAbu, vAbu_, vAbe_, vAba_ + ldr cur_const, [const_addr, count, UXTW #3] SEP + add count, count, #1 SEP + SEP eor vAba.16b, vAba.16b, v28.16b + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP + bic tmp, s_Aba_, sAbu_, ROR #50 SEP 
+ eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP + +.endm +/* final_rotate: rotate 23 of the 25 scalar lane registers in place, each by its own fixed amount (ror by 64-k); s_Aba and sAbo are not touched. NOTE(review): presumably this applies the rotations that the round macros leave pending on each lane - confirm against the round code. */ +.macro final_rotate + ror sAga, sAga,#(64-3) + ror sAka, sAka,#(64-25) + ror sAma, sAma,#(64-10) + ror sAsa, sAsa,#(64-39) + ror sAbe, sAbe,#(64-21) + ror sAge, sAge,#(64-45) + ror sAke, sAke,#(64-8) + ror sAme, sAme,#(64-15) + ror sAse, sAse,#(64-41) + ror sAbi, sAbi,#(64-14) + ror sAgi, sAgi,#(64-61) + ror sAki, sAki,#(64-18) + ror sAmi, sAmi,#(64-56) + ror sAsi, sAsi,#(64-2) + ror sAgo, sAgo,#(64-28) + ror sAko, sAko,#(64-1) + ror sAmo, sAmo,#(64-27) + ror sAso, sAso,#(64-62) + ror sAbu, sAbu,#(64-44) + ror sAgu, sAgu,#(64-20) + ror sAku, sAku,#(64-6) + ror sAmu, sAmu,#(64-36) + ror sAsu, sAsu,#(64-55) +.endm +/* Round count: the loops below repeat hybrid_round_noninitial while count <= KECCAK_F1600_ROUNDS-1. */ +#define KECCAK_F1600_ROUNDS 24 +/* keccak_f1600_x4_hybrid_asm_v3 (also exported with a leading underscore): saves registers and input_addr on the stack, loads the vector state (load_input_vector 2,1), then makes two scalar passes over state slots (4,0) and (4,1). Each pass is hybrid_round_initial, the hybrid_round_noninitial loop and final_rotate; input_addr is reloaded from the stack before each scalar store. The vector state is stored last (store_input_vector 2,1), then registers and the stack are restored. */ +.global keccak_f1600_x4_hybrid_asm_v3 +.global _keccak_f1600_x4_hybrid_asm_v3 +.text +.align 4 + +keccak_f1600_x4_hybrid_asm_v3: +_keccak_f1600_x4_hybrid_asm_v3: + alloc_stack + save_gprs + save_vregs + save input_addr, STACK_OFFSET_INPUT + + load_input_vector 2,1 + + load_constant_ptr + save const_addr, STACK_OFFSET_CONST + + // First scalar Keccak computation alongside first half of SIMD computation + load_input_scalar 4,0 + hybrid_round_initial + loop_0: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-1) + ble loop_0 + final_rotate + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4,0 + + // Second scalar Keccak computation alongside second half of SIMD computation + load_input_scalar 4,1 + hybrid_round_initial + loop_1: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-1) + ble loop_1 + final_rotate + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4, 1 + + store_input_vector 2,1 + + restore_vregs + restore_gprs + free_stack + ret diff --git a/asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v3p.s b/asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v3p.s new file mode 100644 index 0000000..86f3074 ---
/dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v3p.s @@ -0,0 +1,1016 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: /* 24 64-bit constants; the scalar round code reads entry number `count` through const_addr. NOTE(review): presumably load_constant_ptr (not defined in this part of the file) points const_addr here - confirm. */ + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + /* Control registers. Each one shares its physical register with scalar round temporaries declared further down (x0: s_Aba_/sC0, x29: sE0/sC4, x27: sC2/sE3, x26: sC1/sE2), so their values do not survive the round code; the round macros save/reload count and re-run load_constant_ptr around those uses. */ + input_addr .req x0 + const_addr .req x29 + count .req w27 + cur_const .req x26 + + /* Mapping of Keccak-f1600 SIMD state to vector registers + * at the beginning and end of each round.
+ */ + + vAba .req v0 + vAbe .req v1 + vAbi .req v2 + vAbo .req v3 + vAbu .req v4 + vAga .req v5 + vAge .req v6 + vAgi .req v7 + vAgo .req v8 + vAgu .req v9 + vAka .req v10 + vAke .req v11 + vAki .req v12 + vAko .req v13 + vAku .req v14 + vAma .req v15 + vAme .req v16 + vAmi .req v17 + vAmo .req v18 + vAmu .req v19 + vAsa .req v20 + vAse .req v21 + vAsi .req v22 + vAso .req v23 + vAsu .req v24 + + /* q-form of the above mapping (used by the 128-bit ldr/str in load_input_vector/store_input_vector) */ + vAbaq .req q0 + vAbeq .req q1 + vAbiq .req q2 + vAboq .req q3 + vAbuq .req q4 + vAgaq .req q5 + vAgeq .req q6 + vAgiq .req q7 + vAgoq .req q8 + vAguq .req q9 + vAkaq .req q10 + vAkeq .req q11 + vAkiq .req q12 + vAkoq .req q13 + vAkuq .req q14 + vAmaq .req q15 + vAmeq .req q16 + vAmiq .req q17 + vAmoq .req q18 + vAmuq .req q19 + vAsaq .req q20 + vAseq .req q21 + vAsiq .req q22 + vAsoq .req q23 + vAsuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v30 + C1 .req v29 + C2 .req v28 + C3 .req v27 + C4 .req v26 + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4. E0/E2/E3/E4 reuse the registers of C4/C1/C2/C3; only E1 (v25) is a fresh register. */ + E0 .req v26 + E1 .req v25 + E2 .req v29 + E3 .req v28 + E4 .req v27 + + /* A_[y,2*x+3*y] = rot(A[x,y]). Most names reuse registers of the input mapping above; vAba_ and vAbe_ reuse v30 (C0) and v27 (C3/E4). */ + vAbi_ .req v2 + vAbo_ .req v3 + vAbu_ .req v4 + vAga_ .req v10 + vAge_ .req v11 + vAgi_ .req v7 + vAgo_ .req v8 + vAgu_ .req v9 + vAka_ .req v15 + vAke_ .req v16 + vAki_ .req v12 + vAko_ .req v13 + vAku_ .req v14 + vAma_ .req v20 + vAme_ .req v21 + vAmi_ .req v17 + vAmo_ .req v18 + vAmu_ .req v19 + vAsa_ .req v0 + vAse_ .req v1 + vAsi_ .req v22 + vAso_ .req v23 + vAsu_ .req v24 + vAba_ .req v30 + vAbe_ .req v27 + + /* Scratch register clobbered by the rax1_m1, xar_m1 and bcax_m1 macros */ + vtmp .req v31 + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round.
+ * NOTE(review): sAko/sAko_ live in x18, which AAPCS64 designates as the platform register, and save_gprs only spills x19-x30 - confirm this is acceptable on every target this file is built for. */ + s_Aba .req x1 + sAbe .req x6 + sAbi .req x11 + sAbo .req x16 + sAbu .req x21 + sAga .req x2 + sAge .req x7 + sAgi .req x12 + sAgo .req x17 + sAgu .req x22 + sAka .req x3 + sAke .req x8 + sAki .req x13 + sAko .req x18 + sAku .req x23 + sAma .req x4 + sAme .req x9 + sAmi .req x14 + sAmo .req x19 + sAmu .req x24 + sAsa .req x5 + sAse .req x10 + sAsi .req x15 + sAso .req x20 + sAsu .req x25 + + /* sA_[y,2*x+3*y] = rot(A[x,y]) */ + s_Aba_ .req x0 + sAbe_ .req x28 + sAbi_ .req x11 + sAbo_ .req x16 + sAbu_ .req x21 + sAga_ .req x3 + sAge_ .req x8 + sAgi_ .req x12 + sAgo_ .req x17 + sAgu_ .req x22 + sAka_ .req x4 + sAke_ .req x9 + sAki_ .req x13 + sAko_ .req x18 + sAku_ .req x23 + sAma_ .req x5 + sAme_ .req x10 + sAmi_ .req x14 + sAmo_ .req x19 + sAmu_ .req x24 + sAsa_ .req x1 + sAse_ .req x6 + sAsi_ .req x15 + sAso_ .req x20 + sAsu_ .req x25 + + /* sC[x] = sA[x,0] xor sA[x,1] xor sA[x,2] xor sA[x,3] xor sA[x,4], for x in 0..4 */ + /* sE[x] = sC[x-1] xor rot(sC[x+1],1), for x in 0..4 */ + sC0 .req x0 + sE0 .req x29 + sC1 .req x26 + sE1 .req x30 + sC2 .req x27 + sE2 .req x26 + sC3 .req x28 + sE3 .req x27 + sC4 .req x29 + sE4 .req x28 + /* General scalar scratch register; shares x30 with sE1. */ + tmp .req x30 + +/************************ MACROS ****************************/ + +/* Vector primitives in two flavours: the *_m1 macros build eor3/rax1/xar/bcax out of basic Advanced SIMD instructions (using vtmp as scratch), while the *_m0 macros issue the Armv8.4-A SHA-3 instructions directly. */ + +/* eor3_m1: d = s0 xor s1 xor s2. d is written before s2 is read, so d must not be the same register as s2. */ +.macro eor3_m1 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b + eor \d\().16b, \d\().16b, \s2\().16b +.endm +/* rax1_m1: d = s0 xor (s1 rotated left by 1), per 64-bit lane; clobbers vtmp. */ +.macro rax1_m1 d s0 s1 + add vtmp.2d, \s1\().2d, \s1\().2d + sri vtmp.2d, \s1\().2d, #63 + eor \d\().16b, vtmp.16b, \s0\().16b +.endm +/* xar_m1: d = (s0 xor s1) rotated right by imm, per 64-bit lane; clobbers vtmp. */ +.macro xar_m1 d s0 s1 imm + eor vtmp.16b, \s0\().16b, \s1\().16b + shl \d\().2d, vtmp.2d, #(64-\imm) + sri \d\().2d, vtmp.2d, #(\imm) +.endm +/* bcax_m1: d = s0 xor (s1 and not s2); clobbers vtmp. */ +.macro bcax_m1 d s0 s1 s2 + bic vtmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, vtmp.16b, \s0\().16b + .endm + +/* eor3_m0: same operands as eor3_m1, using the EOR3 instruction. */ +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm +/* rax1_m0: same operands as rax1_m1, using the RAX1 instruction. */ +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm + xar \d\().2d,
\s0\().2d, \s1\().2d, #\imm +.endm +/* bcax_m0: same operands as bcax_m1, using the BCAX instruction. */ +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +/* load_input_vector num idx: load the 25 q-registers vAba..vAsu from input_addr; lane i (0..24, in the order ba,be,bi,bo,bu,ga,...,su) is read from byte offset 16*(num*i+idx). */ +.macro load_input_vector num idx + ldr vAbaq, [input_addr, #(16*(\num*0+\idx))] + ldr vAbeq, [input_addr, #(16*(\num*1+\idx))] + ldr vAbiq, [input_addr, #(16*(\num*2+\idx))] + ldr vAboq, [input_addr, #(16*(\num*3+\idx))] + ldr vAbuq, [input_addr, #(16*(\num*4+\idx))] + ldr vAgaq, [input_addr, #(16*(\num*5+\idx))] + ldr vAgeq, [input_addr, #(16*(\num*6+\idx))] + ldr vAgiq, [input_addr, #(16*(\num*7+\idx))] + ldr vAgoq, [input_addr, #(16*(\num*8+\idx))] + ldr vAguq, [input_addr, #(16*(\num*9+\idx))] + ldr vAkaq, [input_addr, #(16*(\num*10+\idx))] + ldr vAkeq, [input_addr, #(16*(\num*11+\idx))] + ldr vAkiq, [input_addr, #(16*(\num*12+\idx))] + ldr vAkoq, [input_addr, #(16*(\num*13+\idx))] + ldr vAkuq, [input_addr, #(16*(\num*14+\idx))] + ldr vAmaq, [input_addr, #(16*(\num*15+\idx))] + ldr vAmeq, [input_addr, #(16*(\num*16+\idx))] + ldr vAmiq, [input_addr, #(16*(\num*17+\idx))] + ldr vAmoq, [input_addr, #(16*(\num*18+\idx))] + ldr vAmuq, [input_addr, #(16*(\num*19+\idx))] + ldr vAsaq, [input_addr, #(16*(\num*20+\idx))] + ldr vAseq, [input_addr, #(16*(\num*21+\idx))] + ldr vAsiq, [input_addr, #(16*(\num*22+\idx))] + ldr vAsoq, [input_addr, #(16*(\num*23+\idx))] + ldr vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm +/* store_input_vector num idx: inverse of load_input_vector - write vAba..vAsu back to the same byte offsets 16*(num*i+idx) from input_addr. */ +.macro store_input_vector num idx + str vAbaq, [input_addr, #(16*(\num*0+\idx))] + str vAbeq, [input_addr, #(16*(\num*1+\idx))] + str vAbiq, [input_addr, #(16*(\num*2+\idx))] + str vAboq, [input_addr, #(16*(\num*3+\idx))] + str vAbuq, [input_addr, #(16*(\num*4+\idx))] + str vAgaq, [input_addr, #(16*(\num*5+\idx))] + str vAgeq, [input_addr, #(16*(\num*6+\idx))] + str vAgiq, [input_addr, #(16*(\num*7+\idx))] + str vAgoq, [input_addr, #(16*(\num*8+\idx))] + str vAguq, [input_addr, #(16*(\num*9+\idx))] + str vAkaq, [input_addr, #(16*(\num*10+\idx))] + str vAkeq, [input_addr, #(16*(\num*11+\idx))] + str vAkiq, [input_addr,
#(16*(\num*12+\idx))] + str vAkoq, [input_addr, #(16*(\num*13+\idx))] + str vAkuq, [input_addr, #(16*(\num*14+\idx))] + str vAmaq, [input_addr, #(16*(\num*15+\idx))] + str vAmeq, [input_addr, #(16*(\num*16+\idx))] + str vAmiq, [input_addr, #(16*(\num*17+\idx))] + str vAmoq, [input_addr, #(16*(\num*18+\idx))] + str vAmuq, [input_addr, #(16*(\num*19+\idx))] + str vAsaq, [input_addr, #(16*(\num*20+\idx))] + str vAseq, [input_addr, #(16*(\num*21+\idx))] + str vAsiq, [input_addr, #(16*(\num*22+\idx))] + str vAsoq, [input_addr, #(16*(\num*23+\idx))] + str vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm +/* store_input_scalar num idx: write the 25 scalar lane registers s_Aba..sAsu to input_addr; lane i (0..24) goes to byte offset 8*(num*i+idx). The odd lane indices are spelled as (even+1) in the offsets below. */ +.macro store_input_scalar num idx + str s_Aba, [input_addr, 8*(\num*(0) +\idx)] + str sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + str sAbi, [input_addr, 8*(\num*(2)+ \idx)] + str sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + str sAbu, [input_addr, 8*(\num*(4)+ \idx)] + str sAga, [input_addr, 8*(\num*(4+1) +\idx)] + str sAge, [input_addr, 8*(\num*(6)+ \idx)] + str sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + str sAgo, [input_addr, 8*(\num*(8)+ \idx)] + str sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + str sAka, [input_addr, 8*(\num*(10) +\idx)] + str sAke, [input_addr, 8*(\num*(10+1)+\idx)] + str sAki, [input_addr, 8*(\num*(12) +\idx)] + str sAko, [input_addr, 8*(\num*(12+1)+\idx)] + str sAku, [input_addr, 8*(\num*(14) +\idx)] + str sAma, [input_addr, 8*(\num*(14+1)+\idx)] + str sAme, [input_addr, 8*(\num*(16) +\idx)] + str sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + str sAmo, [input_addr, 8*(\num*(18) +\idx)] + str sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + str sAsa, [input_addr, 8*(\num*(20) +\idx)] + str sAse, [input_addr, 8*(\num*(20+1)+\idx)] + str sAsi, [input_addr, 8*(\num*(22) +\idx)] + str sAso, [input_addr, 8*(\num*(22+1)+\idx)] + str sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm +/* load_input_scalar num idx: inverse of store_input_scalar - read lane i (0..24) from byte offset 8*(num*i+idx) of input_addr into s_Aba..sAsu. */ +.macro load_input_scalar num idx + ldr s_Aba, [input_addr, 8*(\num*(0) +\idx)] + ldr sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + ldr sAbi, [input_addr, 8*(\num*(2)+ \idx)] + ldr sAbo,
[input_addr, 8*(\num*(2+1) +\idx)] + ldr sAbu, [input_addr, 8*(\num*(4)+ \idx)] + ldr sAga, [input_addr, 8*(\num*(4+1) +\idx)] + ldr sAge, [input_addr, 8*(\num*(6)+ \idx)] + ldr sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + ldr sAgo, [input_addr, 8*(\num*(8)+ \idx)] + ldr sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + ldr sAka, [input_addr, 8*(\num*(10) +\idx)] + ldr sAke, [input_addr, 8*(\num*(10+1)+\idx)] + ldr sAki, [input_addr, 8*(\num*(12) +\idx)] + ldr sAko, [input_addr, 8*(\num*(12+1)+\idx)] + ldr sAku, [input_addr, 8*(\num*(14) +\idx)] + ldr sAma, [input_addr, 8*(\num*(14+1)+\idx)] + ldr sAme, [input_addr, 8*(\num*(16) +\idx)] + ldr sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + ldr sAmo, [input_addr, 8*(\num*(18) +\idx)] + ldr sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + ldr sAsa, [input_addr, 8*(\num*(20) +\idx)] + ldr sAse, [input_addr, 8*(\num*(20+1)+\idx)] + ldr sAsi, [input_addr, 8*(\num*(22) +\idx)] + ldr sAso, [input_addr, 8*(\num*(22+1)+\idx)] + ldr sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm +/* Stack frame, 192 bytes, offsets from sp: [0,8) input pointer, [8,16) constant pointer, [16,24) round counter, [24,32) padding, [32,128) x19-x30, [128,192) d8-d15. */ +#define STACK_SIZE (8*8 + 16*6 + 3*8 + 8) // VREGS (8*8), GPRs (16*6), count (8), const (8), input (8), padding (8) +#define STACK_BASE_GPRS (3*8+8) +#define STACK_BASE_VREGS (3*8+8+16*6) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) +/* save_gprs: store x19-x30 as six pairs starting at sp+STACK_BASE_GPRS. */ +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm +/* restore_gprs: reload x19-x30 from the slots written by save_gprs. */ +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm +/* save_vregs: store d8-d15 (the low 64 bits of v8-v15) as four pairs starting at sp+STACK_BASE_VREGS. */ +.macro save_vregs + stp d8, d9,
[sp,#(STACK_BASE_VREGS+0*16)] + stp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + stp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + stp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] +.endm +/* restore_vregs: reload d8-d15 from the slots written by save_vregs (pairs are read in reverse order). */ +.macro restore_vregs + ldp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] + ldp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + ldp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + ldp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] +.endm +/* alloc_stack: reserve the STACK_SIZE-byte frame. */ +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm +/* free_stack: release the frame reserved by alloc_stack. */ +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm +/* eor5: dst = src0 xor src1 xor src2 xor src3 xor src4. dst is written first, so it must not be the same register as src2, src3 or src4. */ +.macro eor5 dst, src0, src1, src2, src3, src4 + eor \dst, \src0, \src1 + eor \dst, \dst, \src2 + eor \dst, \dst, \src3 + eor \dst, \dst, \src4 +.endm +/* xor_rol: dst = src0 xor (src1 rotated left by imm), encoded as ROR #(64-imm). */ +.macro xor_rol dst, src1, src0, imm + eor \dst, \src0, \src1, ROR #(64-\imm) +.endm +/* bic_rol: dst = src0 and not (src1 rotated left by imm). */ +.macro bic_rol dst, src1, src0, imm + bic \dst, \src0, \src1, ROR #(64-\imm) +.endm +/* rotate: dst = src rotated left by imm. */ +.macro rotate dst, src, imm + ror \dst, \src, #(64-\imm) +.endm +/* save: store reg at [sp + offset]. */ +.macro save reg, offset + str \reg, [sp, #\offset] +.endm +/* restore: load reg from [sp + offset]. */ +.macro restore reg, offset + ldr \reg, [sp, #\offset] +.endm +/* hybrid_round_initial: opening step of one scalar permutation. Each source line pairs a scalar instruction with an optional vector macro, joined by SEP (defined outside this part of the file - presumably a statement separator). The scalar stream performs two rounds (XORs cur_const into s_Aba twice and leaves count at 2); the vector stream performs one round, XORing in a constant broadcast into v28 from the pointer kept at STACK_OFFSET_CONST, which is advanced by 8. Overwrites x0 and x26-x30 through the sC/sE/tmp aliases. */ +.macro hybrid_round_initial + + eor sC0, sAma, sAsa SEP eor3_m1 C0, vAba, vAga, vAka + eor sC1, sAme, sAse SEP + eor sC2, sAmi, sAsi SEP + eor sC3, sAmo, sAso SEP eor3_m1 C0, C0, vAma, vAsa + eor sC4, sAmu, sAsu SEP + eor sC0, sAka, sC0 SEP + eor sC1, sAke, sC1 SEP eor3_m1 C1, vAbe, vAge, vAke + eor sC2, sAki, sC2 SEP + eor sC3, sAko, sC3 SEP + eor sC4, sAku, sC4 SEP eor3_m1 C1, C1, vAme, vAse + eor sC0, sAga, sC0 SEP + eor sC1, sAge, sC1 SEP + eor sC2, sAgi, sC2 SEP eor3_m1 C2, vAbi, vAgi, vAki + eor sC3, sAgo, sC3 SEP + eor sC4, sAgu, sC4 SEP + eor sC0, s_Aba, sC0 SEP eor3_m1 C2, C2, vAmi, vAsi + eor sC1, sAbe, sC1 SEP + eor sC2, sAbi, sC2 SEP + eor sC3, sAbo, sC3 SEP eor3_m1 C3, vAbo, vAgo, vAko + eor sC4, sAbu, sC4 SEP + SEP + eor sE1, sC0, sC2, ROR #63 SEP eor3_m1 C3, C3, vAmo, vAso + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP eor3_m1 C4, vAbu, vAgu, vAku + eor sE4, sC3, sC0, ROR #63 SEP + SEP + eor
s_Aba_, s_Aba, sE0 SEP eor3_m1 C4, C4, vAmu, vAsu + eor sAsa_, sAbi, sE2 SEP + eor sAbi_, sAki, sE2 SEP + eor sAki_, sAko, sE3 SEP rax1_m1 E1, C0, C2 + eor sAko_, sAmu, sE4 SEP + eor sAmu_, sAso, sE3 SEP + eor sAso_, sAma, sE0 SEP + eor sAka_, sAbe, sE1 SEP rax1_m1 E3, C2, C4 + eor sAse_, sAgo, sE3 SEP + eor sAgo_, sAme, sE1 SEP + eor sAke_, sAgi, sE2 SEP + eor sAgi_, sAka, sE0 SEP rax1_m1 E0, C4, C1 + eor sAga_, sAbo, sE3 SEP + eor sAbo_, sAmo, sE3 SEP + eor sAmo_, sAmi, sE2 SEP + eor sAmi_, sAke, sE1 SEP rax1_m1 E2, C1, C3 + eor sAge_, sAgu, sE4 SEP + eor sAgu_, sAsi, sE2 SEP + eor sAsi_, sAku, sE4 SEP + eor sAku_, sAsa, sE0 SEP rax1_m1 E4, C3, C0 + eor sAma_, sAbu, sE4 SEP + eor sAbu_, sAsu, sE4 SEP + eor sAsu_, sAse, sE1 SEP + eor sAme_, sAga, sE0 SEP eor vAba_.16b, vAba.16b, E0.16b + eor sAbe_, sAge, sE1 SEP + SEP + load_constant_ptr SEP xar_m1 vAsa_, vAbi, E2, 2 + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP + bic tmp, sAgo_, sAgi_, ROR #42 SEP xar_m1 vAbi_, vAki, E2, 21 + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP xar_m1 vAki_, vAko, E3, 39 + eor sAgo, tmp, sAgo_, ROR #47 SEP + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP xar_m1 vAko_, vAmu, E4, 56 + bic tmp, sAki_, sAke_, ROR #19 SEP + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP xar_m1 vAmu_, vAso, E3, 8 + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP xar_m1 vAso_, vAma, E0, 23 + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP + bic tmp, sAmi_, sAme_, ROR #38 SEP xar_m1 vAka_, vAbe, E1, 63 + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP + eor sAme, tmp, sAme_, ROR #43 SEP xar_m1 vAse_, vAgo, E3, 9 + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, 
tmp, sAmi_, ROR #46 SEP + SEP + ldr cur_const, [const_addr] SEP + mov count, #1 SEP xar_m1 vAgo_, vAme, E1, 19 + SEP + bic tmp, sAma_, sAmu_, ROR #35 SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, sAma_, ROR #9 SEP xar_m1 vAke_, vAgi, E2, 58 + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP + eor sAsa, tmp, sAsa_, ROR #41 SEP xar_m1 vAgi_, vAka, E0, 61 + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP xar_m1 vAga_, vAbo, E3, 36 + bic tmp, sAsa_, sAsu_, ROR #60 SEP + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP xar_m1 vAbo_, vAmo, E3, 43 + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP xar_m1 vAmo_, vAmi, E2, 49 + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP xar_m1 vAmi_, vAke, E1, 54 + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP + eor sAbu, tmp, sAbu_, ROR #30 SEP xar_m1 vAge_, vAgu, E4, 44 + SEP + eor s_Aba, s_Aba, cur_const SEP + SEP xar_m1 vAgu_, vAsi, E2, 3 + save count, STACK_OFFSET_COUNT SEP + SEP + eor sC0, sAka, sAsa, ROR #50 SEP + eor sC1, sAse, sAge, ROR #60 SEP + eor sC2, sAmi, sAgi, ROR #59 SEP xar_m1 vAsi_, vAku, E4, 25 + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP + eor sC0, sAma, sC0, ROR #49 SEP xar_m1 vAku_, vAsa, E0, 46 + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP xar_m1 vAma_, vAbu, E4, 37 + eor sC0, sAga, sC0, ROR #57 SEP + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP xar_m1 vAbu_, vAsu, E4, 50 + eor sC3, sAko, sC3, ROR #38 SEP + eor sC4, sAgu, sC4, ROR #48 SEP + eor sC0, s_Aba, sC0, ROR #61 SEP + eor sC1, sAke, sC1, ROR #57 SEP xar_m1 
vAsu_, vAse, E1, 62 + eor sC2, sAsi, sC2, ROR #52 SEP + eor sC3, sAbo, sC3, ROR #63 SEP + eor sC4, sAku, sC4, ROR #50 SEP xar_m1 vAme_, vAga, E0, 28 + ror sC1, sC1, 56 SEP + ror sC4, sC4, 58 SEP + ror sC2, sC2, 62 SEP xar_m1 vAbe_, vAge, E1, 20 + SEP + eor sE1, sC0, sC2, ROR #63 SEP + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP bcax_m1 vAga, vAga_, vAgi_, vAge_ + eor sE2, sC1, sC3, ROR #63 SEP + eor sE4, sC3, sC0, ROR #63 SEP + SEP bcax_m1 vAge, vAge_, vAgo_, vAgi_ + eor s_Aba_, sE0, s_Aba SEP + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP bcax_m1 vAgi, vAgi_, vAgu_, vAgo_ + eor sAki_, sE3, sAko, ROR #63 SEP + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP bcax_m1 vAgo, vAgo_, vAga_, vAgu_ + eor sAso_, sE0, sAma, ROR #54 SEP + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP bcax_m1 vAgu, vAgu_, vAge_, vAga_ + eor sAgo_, sE1, sAme, ROR #49 SEP + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, ROR #39 SEP bcax_m1 vAka, vAka_, vAki_, vAke_ + eor sAga_, sE3, sAbo SEP + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP bcax_m1 vAke, vAke_, vAko_, vAki_ + eor sAmi_, sE1, sAke, ROR #56 SEP + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP bcax_m1 vAki, vAki_, vAku_, vAko_ + eor sAsi_, sE4, sAku, ROR #58 SEP + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP bcax_m1 vAko, vAko_, vAka_, vAku_ + eor sAbu_, sE4, sAsu, ROR #9 SEP + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP bcax_m1 vAku, vAku_, vAke_, vAka_ + eor sAbe_, sE1, sAge, ROR #19 SEP + SEP + load_constant_ptr SEP bcax_m1 vAma, vAma_, vAmi_, vAme_ + restore count, STACK_OFFSET_COUNT SEP + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP restore x26, STACK_OFFSET_CONST + eor sAga, tmp, sAga_, ROR #39 SEP + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP bcax_m1 vAme, vAme_, vAmo_, vAmi_ + bic tmp, sAgu_, sAgo_, ROR #16 
SEP + eor sAgi, tmp, sAgi_, ROR #58 SEP ld1r {v28.2d}, [x26], #8 + bic tmp, sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP + bic tmp, sAge_, sAga_, ROR #56 SEP bcax_m1 vAmi, vAmi_, vAmu_, vAmo_ + eor sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP save x26, STACK_OFFSET_CONST + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP bcax_m1 vAmo, vAmo_, vAma_, vAmu_ + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP bcax_m1 vAmu, vAmu_, vAme_, vAma_ + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP bcax_m1 vAsa, vAsa_, vAsi_, vAse_ + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP bcax_m1 vAse, vAse_, vAso_, vAsi_ + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP bcax_m1 vAsi, vAsi_, vAsu_, vAso_ + bic tmp, sAma_, sAmu_, ROR #35 SEP + SEP + ldr cur_const, [const_addr, count, UXTW #3] SEP bcax_m1 vAso, vAso_, vAsa_, vAsu_ + SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, sAma_, ROR #9 SEP bcax_m1 vAsu, vAsu_, vAse_, vAsa_ + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP + eor sAsa, tmp, sAsa_, ROR #41 SEP bcax_m1 vAba, vAba_, vAbi_, vAbe_ + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP bcax_m1 vAbe, vAbe_, vAbo_, vAbi_ + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP + eor sAso, tmp, sAso_, ROR #21 SEP bcax_m1 vAbi, vAbi_, vAbu_, vAbo_ + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m1 vAbo, vAbo_, vAba_, vAbu_ + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP + eor sAbe, tmp, sAbe_, ROR #41 SEP bcax_m1 vAbu, vAbu_, vAbe_, vAba_ + bic tmp, sAbu_, 
sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP eor vAba.16b, vAba.16b, v28.16b + bic tmp, sAbe_, s_Aba_, ROR #44 SEP + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + add count, count, #1 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP + SEP +.endm + +.macro hybrid_round_noninitial + save count, STACK_OFFSET_COUNT SEP eor3_m1 C0, vAba, vAga, vAka + SEP + eor sC0, sAka, sAsa, ROR #50 SEP + eor sC1, sAse, sAge, ROR #60 SEP eor3_m1 C0, C0, vAma, vAsa + eor sC2, sAmi, sAgi, ROR #59 SEP + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP eor3_m1 C1, vAbe, vAge, vAke + eor sC0, sAma, sC0, ROR #49 SEP + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP eor3_m1 C1, C1, vAme, vAse + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP + eor sC0, sAga, sC0, ROR #57 SEP eor3_m1 C2, vAbi, vAgi, vAki + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor sC3, sAko, sC3, ROR #38 SEP eor3_m1 C2, C2, vAmi, vAsi + eor sC4, sAgu, sC4, ROR #48 SEP + eor sC0, s_Aba, sC0, ROR #61 SEP + eor sC1, sAke, sC1, ROR #57 SEP eor3_m1 C3, vAbo, vAgo, vAko + eor sC2, sAsi, sC2, ROR #52 SEP + eor sC3, sAbo, sC3, ROR #63 SEP + eor sC4, sAku, sC4, ROR #50 SEP eor3_m1 C3, C3, vAmo, vAso + ror sC1, sC1, 56 SEP + ror sC4, sC4, 58 SEP + ror sC2, sC2, 62 SEP eor3_m1 C4, vAbu, vAgu, vAku + SEP + eor sE1, sC0, sC2, ROR #63 SEP + eor sE3, sC2, sC4, ROR #63 SEP eor3_m1 C4, C4, vAmu, vAsu + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP + eor sE4, sC3, sC0, ROR #63 SEP rax1_m1 E1, C0, C2 + SEP + eor s_Aba_, sE0, s_Aba SEP + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP rax1_m1 E3, C2, C4 + eor sAki_, sE3, sAko, ROR #63 SEP + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP + eor sAso_, sE0, sAma, ROR #54 SEP rax1_m1 E0, C4, C1 + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP + eor sAgo_, 
sE1, sAme, ROR #49 SEP + eor sAke_, sE2, sAgi, ROR #3 SEP rax1_m1 E2, C1, C3 + eor sAgi_, sE0, sAka, ROR #39 SEP + eor sAga_, sE3, sAbo SEP + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP rax1_m1 E4, C3, C0 + eor sAmi_, sE1, sAke, ROR #56 SEP + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP + eor sAsi_, sE4, sAku, ROR #58 SEP eor vAba_.16b, vAba.16b, E0.16b + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP + eor sAbu_, sE4, sAsu, ROR #9 SEP xar_m1 vAsa_, vAbi, E2, 2 + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP + eor sAbe_, sE1, sAge, ROR #19 SEP + SEP xar_m1 vAbi_, vAki, E2, 21 + load_constant_ptr SEP + restore count, STACK_OFFSET_COUNT SEP + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP xar_m1 vAki_, vAko, E3, 39 + eor sAga, tmp, sAga_, ROR #39 SEP + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP xar_m1 vAko_, vAmu, E4, 56 + bic tmp, sAgu_, sAgo_, ROR #16 SEP + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP xar_m1 vAmu_, vAso, E3, 8 + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP + eor sAka, tmp, sAka_, ROR #24 SEP xar_m1 vAso_, vAma, E0, 23 + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP xar_m1 vAka_, vAbe, E1, 63 + bic tmp, sAka_, sAku_, ROR #47 SEP + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP xar_m1 vAse_, vAgo, E3, 9 + eor sAku, tmp, sAku_, ROR #52 SEP + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP + eor sAme, tmp, sAme_, ROR #43 SEP xar_m1 vAgo_, vAme, E1, 19 + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP + bic tmp, sAma_, sAmu_, ROR #35 SEP + SEP xar_m1 vAke_, vAgi, E2, 58 + ldr cur_const, [const_addr, count, UXTW #3] 
SEP + add count, count, #1 SEP + SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, sAma_, ROR #9 SEP xar_m1 vAgi_, vAka, E0, 61 + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP xar_m1 vAga_, vAbo, E3, 36 + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP xar_m1 vAbo_, vAmo, E3, 43 + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP xar_m1 vAmo_, vAmi, E2, 49 + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP xar_m1 vAmi_, vAke, E1, 54 + eor sAbi, tmp, sAbi_, ROR #35 SEP + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP xar_m1 vAge_, vAgu, E4, 44 + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP xar_m1 vAgu_, vAsi, E2, 3 + save count, STACK_OFFSET_COUNT SEP + SEP + eor sC0, sAka, sAsa, ROR #50 SEP + eor sC1, sAse, sAge, ROR #60 SEP + eor sC2, sAmi, sAgi, ROR #59 SEP xar_m1 vAsi_, vAku, E4, 25 + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP + eor sC0, sAma, sC0, ROR #49 SEP xar_m1 vAku_, vAsa, E0, 46 + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP xar_m1 vAma_, vAbu, E4, 37 + eor sC0, sAga, sC0, ROR #57 SEP + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor sC3, sAko, sC3, ROR #38 SEP xar_m1 vAbu_, vAsu, E4, 50 + eor sC4, sAgu, sC4, ROR #48 SEP + eor sC0, s_Aba, sC0, ROR #61 SEP + eor sC1, sAke, sC1, ROR #57 SEP + eor sC2, sAsi, sC2, ROR #52 SEP xar_m1 vAsu_, vAse, E1, 62 + eor sC3, sAbo, sC3, ROR #63 SEP + eor sC4, sAku, sC4, ROR #50 SEP + ror sC1, sC1, 56 SEP xar_m1 
vAme_, vAga, E0, 28 + ror sC4, sC4, 58 SEP + ror sC2, sC2, 62 SEP + SEP xar_m1 vAbe_, vAge, E1, 20 + eor sE1, sC0, sC2, ROR #63 SEP + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP bcax_m1 vAga, vAga_, vAgi_, vAge_ + eor sE4, sC3, sC0, ROR #63 SEP + SEP + eor s_Aba_, sE0, s_Aba SEP bcax_m1 vAge, vAge_, vAgo_, vAgi_ + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP + eor sAki_, sE3, sAko, ROR #63 SEP bcax_m1 vAgi, vAgi_, vAgu_, vAgo_ + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP + eor sAso_, sE0, sAma, ROR #54 SEP bcax_m1 vAgo, vAgo_, vAga_, vAgu_ + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP + eor sAgo_, sE1, sAme, ROR #49 SEP bcax_m1 vAgu, vAgu_, vAge_, vAga_ + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, ROR #39 SEP + eor sAga_, sE3, sAbo SEP bcax_m1 vAka, vAka_, vAki_, vAke_ + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP + eor sAmi_, sE1, sAke, ROR #56 SEP bcax_m1 vAke, vAke_, vAko_, vAki_ + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP + eor sAsi_, sE4, sAku, ROR #58 SEP bcax_m1 vAki, vAki_, vAku_, vAko_ + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP + eor sAbu_, sE4, sAsu, ROR #9 SEP bcax_m1 vAko, vAko_, vAka_, vAku_ + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP + eor sAbe_, sE1, sAge, ROR #19 SEP bcax_m1 vAku, vAku_, vAke_, vAka_ + SEP + load_constant_ptr SEP + restore count, STACK_OFFSET_COUNT SEP bcax_m1 vAma, vAma_, vAmi_, vAme_ + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP restore x26, STACK_OFFSET_CONST + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP bcax_m1 vAme, vAme_, vAmo_, vAmi_ + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP ld1r {v28.2d}, [x26], #8 + eor sAgo, tmp, sAgo_, ROR #47 SEP + bic tmp, 
sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP bcax_m1 vAmi, vAmi_, vAmu_, vAmo_ + bic tmp, sAki_, sAke_, ROR #19 SEP + eor sAka, tmp, sAka_, ROR #24 SEP save x26, STACK_OFFSET_CONST + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP + bic tmp, sAku_, sAko_, ROR #10 SEP bcax_m1 vAmo, vAmo_, vAma_, vAmu_ + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP + eor sAko, tmp, sAko_, ROR #57 SEP bcax_m1 vAmu, vAmu_, vAme_, vAma_ + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP + bic tmp, sAmi_, sAme_, ROR #38 SEP bcax_m1 vAsa, vAsa_, vAsi_, vAse_ + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP + eor sAme, tmp, sAme_, ROR #43 SEP bcax_m1 vAse, vAse_, vAso_, vAsi_ + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP + bic tmp, sAma_, sAmu_, ROR #35 SEP bcax_m1 vAsi, vAsi_, vAsu_, vAso_ + SEP + ldr cur_const, [const_addr, count, UXTW #3] SEP + add count, count, #1 SEP bcax_m1 vAso, vAso_, vAsa_, vAsu_ + SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, sAma_, ROR #9 SEP bcax_m1 vAsu, vAsu_, vAse_, vAsa_ + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP + eor sAsa, tmp, sAsa_, ROR #41 SEP bcax_m1 vAba, vAba_, vAbi_, vAbe_ + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP bcax_m1 vAbe, vAbe_, vAbo_, vAbi_ + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP + eor sAso, tmp, sAso_, ROR #21 SEP bcax_m1 vAbi, vAbi_, vAbu_, vAbo_ + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m1 vAbo, vAbo_, vAba_, vAbu_ + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP + eor sAbe, tmp, sAbe_, ROR #41 SEP bcax_m1 vAbu, vAbu_, vAbe_, vAba_ + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR 
#43 SEP eor vAba.16b, vAba.16b, v28.16b + bic tmp, sAbe_, s_Aba_, ROR #44 SEP + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP + +.endm + +.macro final_rotate /* ror 23 of the 25 scalar state registers by (64-k), i.e. rotate each left by its own k; s_Aba and sAbo are not rotated. Presumably cancels rotations that the round macros leave pending -- TODO confirm */ + ror sAga, sAga,#(64-3) + ror sAka, sAka,#(64-25) + ror sAma, sAma,#(64-10) + ror sAsa, sAsa,#(64-39) + ror sAbe, sAbe,#(64-21) + ror sAge, sAge,#(64-45) + ror sAke, sAke,#(64-8) + ror sAme, sAme,#(64-15) + ror sAse, sAse,#(64-41) + ror sAbi, sAbi,#(64-14) + ror sAgi, sAgi,#(64-61) + ror sAki, sAki,#(64-18) + ror sAmi, sAmi,#(64-56) + ror sAsi, sAsi,#(64-2) + ror sAgo, sAgo,#(64-28) + ror sAko, sAko,#(64-1) + ror sAmo, sAmo,#(64-27) + ror sAso, sAso,#(64-62) + ror sAbu, sAbu,#(64-44) + ror sAgu, sAgu,#(64-20) + ror sAku, sAku,#(64-6) + ror sAmu, sAmu,#(64-36) + ror sAsu, sAsu,#(64-55) +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.global keccak_f1600_x4_hybrid_asm_v3p +.global _keccak_f1600_x4_hybrid_asm_v3p +.text +.align 4 + +keccak_f1600_x4_hybrid_asm_v3p: /* the same entry point is also exported with a leading underscore */ +_keccak_f1600_x4_hybrid_asm_v3p: + alloc_stack + save_gprs + save_vregs + save input_addr, STACK_OFFSET_INPUT /* input_addr is reloaded from this slot before each scalar store below */ + + load_input_vector 2,1 + + load_constant_ptr + + save const_addr, STACK_OFFSET_CONST /* the Neon column of the round macros reloads, advances and re-saves this pointer */ + + // First scalar Keccak computation alongside first half of SIMD computation + load_input_scalar 4,0 + hybrid_round_initial + loop_0: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-1) /* count is incremented inside the round macros; repeat while count <= ROUNDS-1 */ + ble loop_0 + final_rotate + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4,0 + + // Second scalar Keccak computation alongside second half of SIMD computation + load_input_scalar 4,1 + hybrid_round_initial + loop_1: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-1) + ble loop_1 + final_rotate + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4, 1 + + store_input_vector 2,1 + + restore_vregs + restore_gprs + free_stack + ret diff --git a/asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v3pp.s b/asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v3pp.s new file mode 100644 index 0000000..3b7e3bc
--- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v3pp.s @@ -0,0 +1,1022 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: /* 24 64-bit constants, one .quad per Keccak-f1600 round */ + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x29 + count .req w27 + out_count .req w27 /* same register as count */ + cur_const .req x26 + + /* Mapping of Keccak-f1600 SIMD state to vector registers + * at the beginning and end of each round.
*/ + + vAba .req v0 + vAbe .req v1 + vAbi .req v2 + vAbo .req v3 + vAbu .req v4 + vAga .req v5 + vAge .req v6 + vAgi .req v7 + vAgo .req v8 + vAgu .req v9 + vAka .req v10 + vAke .req v11 + vAki .req v12 + vAko .req v13 + vAku .req v14 + vAma .req v15 + vAme .req v16 + vAmi .req v17 + vAmo .req v18 + vAmu .req v19 + vAsa .req v20 + vAse .req v21 + vAsi .req v22 + vAso .req v23 + vAsu .req v24 + + /* q-form of the above mapping */ + vAbaq .req q0 + vAbeq .req q1 + vAbiq .req q2 + vAboq .req q3 + vAbuq .req q4 + vAgaq .req q5 + vAgeq .req q6 + vAgiq .req q7 + vAgoq .req q8 + vAguq .req q9 + vAkaq .req q10 + vAkeq .req q11 + vAkiq .req q12 + vAkoq .req q13 + vAkuq .req q14 + vAmaq .req q15 + vAmeq .req q16 + vAmiq .req q17 + vAmoq .req q18 + vAmuq .req q19 + vAsaq .req q20 + vAseq .req q21 + vAsiq .req q22 + vAsoq .req q23 + vAsuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v30 + C1 .req v29 + C2 .req v28 + C3 .req v27 + C4 .req v26 + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req v26 + E1 .req v25 + E2 .req v29 + E3 .req v28 + E4 .req v27 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vAbi_ .req v2 + vAbo_ .req v3 + vAbu_ .req v4 + vAga_ .req v10 + vAge_ .req v11 + vAgi_ .req v7 + vAgo_ .req v8 + vAgu_ .req v9 + vAka_ .req v15 + vAke_ .req v16 + vAki_ .req v12 + vAko_ .req v13 + vAku_ .req v14 + vAma_ .req v20 + vAme_ .req v21 + vAmi_ .req v17 + vAmo_ .req v18 + vAmu_ .req v19 + vAsa_ .req v0 + vAse_ .req v1 + vAsi_ .req v22 + vAso_ .req v23 + vAsu_ .req v24 + vAba_ .req v30 + vAbe_ .req v27 + + /* Scratch register: overwritten by the rax1_m1, xar_m1 and bcax_m1 macros */ + vtmp .req v31 + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round.
*/ + s_Aba .req x1 + sAbe .req x6 + sAbi .req x11 + sAbo .req x16 + sAbu .req x21 + sAga .req x2 + sAge .req x7 + sAgi .req x12 + sAgo .req x17 + sAgu .req x22 + sAka .req x3 + sAke .req x8 + sAki .req x13 + sAko .req x18 + sAku .req x23 + sAma .req x4 + sAme .req x9 + sAmi .req x14 + sAmo .req x19 + sAmu .req x24 + sAsa .req x5 + sAse .req x10 + sAsi .req x15 + sAso .req x20 + sAsu .req x25 + + /* sA_[y,2*x+3*y] = rot(A[x,y]) */ + s_Aba_ .req x0 + sAbe_ .req x28 + sAbi_ .req x11 + sAbo_ .req x16 + sAbu_ .req x21 + sAga_ .req x3 + sAge_ .req x8 + sAgi_ .req x12 + sAgo_ .req x17 + sAgu_ .req x22 + sAka_ .req x4 + sAke_ .req x9 + sAki_ .req x13 + sAko_ .req x18 + sAku_ .req x23 + sAma_ .req x5 + sAme_ .req x10 + sAmi_ .req x14 + sAmo_ .req x19 + sAmu_ .req x24 + sAsa_ .req x1 + sAse_ .req x6 + sAsi_ .req x15 + sAso_ .req x20 + sAsu_ .req x25 + + /* sC[x] = sA[x,0] xor sA[x,1] xor sA[x,2] xor sA[x,3] xor sA[x,4], for x in 0..4 */ + /* sE[x] = sC[x-1] xor rot(sC[x+1],1), for x in 0..4 */ + sC0 .req x0 + sE0 .req x29 + sC1 .req x26 + sE1 .req x30 + sC2 .req x27 + sE2 .req x26 + sC3 .req x28 + sE3 .req x27 + sC4 .req x29 + sE4 .req x28 + + tmp .req x30 + +/************************ MACROS ****************************/ + +/* SHA-3 helper macros: the *_m0 variants use the v8.4-A SHA-3 instructions (eor3, rax1, xar, bcax); the *_m1 variants compute the same results with base Neon instructions */ + + +.macro eor3_m1 d s0 s1 s2 /* d = s0 ^ s1 ^ s2; d is written before s2 is read, so d must not be the same register as s2 */ + eor \d\().16b, \s0\().16b, \s1\().16b + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro rax1_m1 d s0 s1 /* d = s0 ^ rol(s1, 1) per 64-bit lane; overwrites vtmp */ + add vtmp.2d, \s1\().2d, \s1\().2d + sri vtmp.2d, \s1\().2d, #63 + eor \d\().16b, vtmp.16b, \s0\().16b +.endm + +.macro xar_m1 d s0 s1 imm /* d = ror(s0 ^ s1, imm) per 64-bit lane; overwrites vtmp; imm must be nonzero */ + eor vtmp.16b, \s0\().16b, \s1\().16b + shl \d\().2d, vtmp.2d, #(64-\imm) + sri \d\().2d, vtmp.2d, #(\imm) +.endm + +.macro bcax_m1 d s0 s1 s2 /* d = s0 ^ (s1 & ~s2); overwrites vtmp */ + bic vtmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, vtmp.16b, \s0\().16b +.endm + +.macro eor3_m0 d s0 s1 s2 /* single-instruction form of eor3_m1 */ + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m0 d s0 s1 /* single-instruction form of rax1_m1 */ + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm /* single-instruction form of xar_m1 */ + xar \d\().2d,
\s0\().2d, \s1\().2d, #\imm +.endm + +.macro bcax_m0 d s0 s1 s2 /* single-instruction form of bcax_m1 */ + bcax \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + + +.macro load_input_vector /* load 25 q registers (16 bytes each) from input_addr at byte offsets 32*i, i = 0..24 */ + ldr vAbaq, [input_addr, #(32*0)] + ldr vAbeq, [input_addr, #(32*0+32)] + ldr vAbiq, [input_addr, #(32*2)] + ldr vAboq, [input_addr, #(32*2+32)] + ldr vAbuq, [input_addr, #(32*4)] + ldr vAgaq, [input_addr, #(32*4+32)] + ldr vAgeq, [input_addr, #(32*6)] + ldr vAgiq, [input_addr, #(32*6+32)] + ldr vAgoq, [input_addr, #(32*8)] + ldr vAguq, [input_addr, #(32*8+32)] + ldr vAkaq, [input_addr, #(32*10)] + ldr vAkeq, [input_addr, #(32*10+32)] + ldr vAkiq, [input_addr, #(32*12)] + ldr vAkoq, [input_addr, #(32*12+32)] + ldr vAkuq, [input_addr, #(32*14)] + ldr vAmaq, [input_addr, #(32*14+32)] + ldr vAmeq, [input_addr, #(32*16)] + ldr vAmiq, [input_addr, #(32*16+32)] + ldr vAmoq, [input_addr, #(32*18)] + ldr vAmuq, [input_addr, #(32*18+32)] + ldr vAsaq, [input_addr, #(32*20)] + ldr vAseq, [input_addr, #(32*20+32)] + ldr vAsiq, [input_addr, #(32*22)] + ldr vAsoq, [input_addr, #(32*22+32)] + ldr vAsuq, [input_addr, #(32*24)] +.endm + +.macro store_input_vector /* store the 25 q registers back to the offsets used by load_input_vector */ + str vAbaq, [input_addr, #(32*0)] + str vAbeq, [input_addr, #(32*0+32)] + str vAbiq, [input_addr, #(32*2)] + str vAboq, [input_addr, #(32*2+32)] + str vAbuq, [input_addr, #(32*4)] + str vAgaq, [input_addr, #(32*4+32)] + str vAgeq, [input_addr, #(32*6)] + str vAgiq, [input_addr, #(32*6+32)] + str vAgoq, [input_addr, #(32*8)] + str vAguq, [input_addr, #(32*8+32)] + str vAkaq, [input_addr, #(32*10)] + str vAkeq, [input_addr, #(32*10+32)] + str vAkiq, [input_addr, #(32*12)] + str vAkoq, [input_addr, #(32*12+32)] + str vAkuq, [input_addr, #(32*14)] + str vAmaq, [input_addr, #(32*14+32)] + str vAmeq, [input_addr, #(32*16)] + str vAmiq, [input_addr, #(32*16+32)] + str vAmoq, [input_addr, #(32*18)] + str vAmuq, [input_addr, #(32*18+32)] + str vAsaq, [input_addr, #(32*20)] + str vAseq, [input_addr, #(32*20+32)] + str vAsiq, [input_addr, #(32*22)] + str vAsoq, [input_addr,
#(32*22+32)] + str vAsuq, [input_addr, #(32*24)] +.endm + +.macro store_input_scalar /* store the 25 scalar state registers (8 bytes each) to input_addr at byte offsets 32*i, i = 0..24 */ + str s_Aba,[input_addr, 32*0 ] + str sAbe, [input_addr, 32*1 ] + str sAbi, [input_addr, 32*2 ] + str sAbo, [input_addr, 32*3 ] + str sAbu, [input_addr, 32*4 ] + str sAga, [input_addr, 32*5 ] + str sAge, [input_addr, 32*6 ] + str sAgi, [input_addr, 32*7 ] + str sAgo, [input_addr, 32*8 ] + str sAgu, [input_addr, 32*9 ] + str sAka, [input_addr, 32*10] + str sAke, [input_addr, 32*11] + str sAki, [input_addr, 32*12] + str sAko, [input_addr, 32*13] + str sAku, [input_addr, 32*14] + str sAma, [input_addr, 32*15] + str sAme, [input_addr, 32*16] + str sAmi, [input_addr, 32*17] + str sAmo, [input_addr, 32*18] + str sAmu, [input_addr, 32*19] + str sAsa, [input_addr, 32*20] + str sAse, [input_addr, 32*21] + str sAsi, [input_addr, 32*22] + str sAso, [input_addr, 32*23] + str sAsu, [input_addr, 32*24] +.endm + +.macro load_input_scalar /* load the 25 scalar state registers from the offsets used by store_input_scalar */ + ldr s_Aba,[input_addr, 32*0 ] + ldr sAbe, [input_addr, 32*1 ] + ldr sAbi, [input_addr, 32*2 ] + ldr sAbo, [input_addr, 32*3 ] + ldr sAbu, [input_addr, 32*4 ] + ldr sAga, [input_addr, 32*5 ] + ldr sAge, [input_addr, 32*6 ] + ldr sAgi, [input_addr, 32*7 ] + ldr sAgo, [input_addr, 32*8 ] + ldr sAgu, [input_addr, 32*9 ] + ldr sAka, [input_addr, 32*10] + ldr sAke, [input_addr, 32*11] + ldr sAki, [input_addr, 32*12] + ldr sAko, [input_addr, 32*13] + ldr sAku, [input_addr, 32*14] + ldr sAma, [input_addr, 32*15] + ldr sAme, [input_addr, 32*16] + ldr sAmi, [input_addr, 32*17] + ldr sAmo, [input_addr, 32*18] + ldr sAmu, [input_addr, 32*19] + ldr sAsa, [input_addr, 32*20] + ldr sAse, [input_addr, 32*21] + ldr sAsi, [input_addr, 32*22] + ldr sAso, [input_addr, 32*23] + ldr sAsu, [input_addr, 32*24] +.endm + +#define STACK_SIZE (4*16 + 12*8 + 6*8) /* d8-d15 save area + x19-x30 save area + 6 eight-byte scratch slots */ +#define STACK_BASE_VREGS (0) +#define STACK_BASE_GPRS (4*16) +#define STACK_BASE_TMP_GPRS (4*16 + 12*8) /* base of the scratch slots addressed by the save/restore macros */ +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) +#define
STACK_OFFSET_COUNT_OUT (3*8) +#define STACK_OFFSET_CUR_INPUT (4*8) + +.macro save_gprs /* store x19-x30 to the GPR save area */ + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs /* reload x19-x30 from the GPR save area */ + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro save_vregs /* store d8-d15, i.e. only the low 64 bits of v8-v15 */ + stp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] + stp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + stp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + stp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] +.endm + +.macro restore_vregs /* reload d8-d15 from the vector save area */ + ldp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] + ldp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + ldp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + ldp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] +.endm + +.macro alloc_stack /* reserve STACK_SIZE bytes below sp */ + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack /* release the STACK_SIZE bytes reserved by alloc_stack */ + add sp, sp, #(STACK_SIZE) +.endm + +.macro eor5 dst, src0, src1, src2, src3, src4 /* dst = src0 ^ src1 ^ src2 ^ src3 ^ src4 */ + eor \dst, \src0, \src1 + eor \dst, \dst, \src2 + eor \dst, \dst, \src3 + eor \dst, \dst, \src4 +.endm + +.macro xor_rol dst, src1, src0, imm /* dst = src0 ^ rol(src1, imm) */ + eor \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro bic_rol dst, src1, src0, imm /* dst = src0 & ~rol(src1, imm) */ + bic \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro rotate dst, src, imm /* dst = rol(src, imm) */ + ror \dst, \src, #(64-\imm) +.endm + +.macro save reg, offset /* store reg to the scratch slot at STACK_BASE_TMP_GPRS + offset */ + str \reg, [sp, #(STACK_BASE_TMP_GPRS + \offset)] +.endm + +.macro restore reg, offset /* load reg from the scratch slot at STACK_BASE_TMP_GPRS + offset */ + ldr \reg, [sp, #(STACK_BASE_TMP_GPRS + \offset)] +.endm + +.macro hybrid_round_initial /* Each line pairs a scalar instruction (left of SEP) with an optional Neon macro call (right of SEP). SEP is not defined in this file -- presumably it comes from macros.s, TODO confirm */ + + eor sC0, sAma, sAsa SEP eor3_m1 C0, vAba, vAga, vAka + eor sC1, sAme, sAse SEP + eor sC2, sAmi, sAsi SEP + eor sC3, sAmo, sAso SEP eor3_m1 C0, C0, vAma, vAsa +
eor sC4, sAmu, sAsu SEP + eor sC0, sAka, sC0 SEP + eor sC1, sAke, sC1 SEP eor3_m1 C1, vAbe, vAge, vAke + eor sC2, sAki, sC2 SEP + eor sC3, sAko, sC3 SEP + eor sC4, sAku, sC4 SEP eor3_m1 C1, C1, vAme, vAse + eor sC0, sAga, sC0 SEP + eor sC1, sAge, sC1 SEP + eor sC2, sAgi, sC2 SEP eor3_m1 C2, vAbi, vAgi, vAki + eor sC3, sAgo, sC3 SEP + eor sC4, sAgu, sC4 SEP + eor sC0, s_Aba, sC0 SEP eor3_m1 C2, C2, vAmi, vAsi + eor sC1, sAbe, sC1 SEP + eor sC2, sAbi, sC2 SEP + eor sC3, sAbo, sC3 SEP eor3_m1 C3, vAbo, vAgo, vAko + eor sC4, sAbu, sC4 SEP + SEP + eor sE1, sC0, sC2, ROR #63 SEP eor3_m1 C3, C3, vAmo, vAso + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP eor3_m1 C4, vAbu, vAgu, vAku + eor sE4, sC3, sC0, ROR #63 SEP + SEP + eor s_Aba_, s_Aba, sE0 SEP eor3_m1 C4, C4, vAmu, vAsu + eor sAsa_, sAbi, sE2 SEP + eor sAbi_, sAki, sE2 SEP + eor sAki_, sAko, sE3 SEP rax1_m1 E1, C0, C2 + eor sAko_, sAmu, sE4 SEP + eor sAmu_, sAso, sE3 SEP + eor sAso_, sAma, sE0 SEP + eor sAka_, sAbe, sE1 SEP rax1_m1 E3, C2, C4 + eor sAse_, sAgo, sE3 SEP + eor sAgo_, sAme, sE1 SEP + eor sAke_, sAgi, sE2 SEP + eor sAgi_, sAka, sE0 SEP rax1_m1 E0, C4, C1 + eor sAga_, sAbo, sE3 SEP + eor sAbo_, sAmo, sE3 SEP + eor sAmo_, sAmi, sE2 SEP + eor sAmi_, sAke, sE1 SEP rax1_m1 E2, C1, C3 + eor sAge_, sAgu, sE4 SEP + eor sAgu_, sAsi, sE2 SEP + eor sAsi_, sAku, sE4 SEP + eor sAku_, sAsa, sE0 SEP rax1_m1 E4, C3, C0 + eor sAma_, sAbu, sE4 SEP + eor sAbu_, sAsu, sE4 SEP + eor sAsu_, sAse, sE1 SEP + eor sAme_, sAga, sE0 SEP eor vAba_.16b, vAba.16b, E0.16b + eor sAbe_, sAge, sE1 SEP + SEP + load_constant_ptr SEP xar_m1 vAsa_, vAbi, E2, 2 + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP + bic tmp, sAgo_, sAgi_, ROR #42 SEP xar_m1 vAbi_, vAki, E2, 21 + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP xar_m1 vAki_, vAko, E3, 39 + eor sAgo, tmp, 
sAgo_, ROR #47 SEP + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP xar_m1 vAko_, vAmu, E4, 56 + bic tmp, sAki_, sAke_, ROR #19 SEP + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP xar_m1 vAmu_, vAso, E3, 8 + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP xar_m1 vAso_, vAma, E0, 23 + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP + bic tmp, sAmi_, sAme_, ROR #38 SEP xar_m1 vAka_, vAbe, E1, 63 + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP + eor sAme, tmp, sAme_, ROR #43 SEP xar_m1 vAse_, vAgo, E3, 9 + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP + SEP + ldr cur_const, [const_addr] SEP + mov count, #1 SEP xar_m1 vAgo_, vAme, E1, 19 + SEP + bic tmp, sAma_, sAmu_, ROR #35 SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, sAma_, ROR #9 SEP xar_m1 vAke_, vAgi, E2, 58 + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP + eor sAsa, tmp, sAsa_, ROR #41 SEP xar_m1 vAgi_, vAka, E0, 61 + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP xar_m1 vAga_, vAbo, E3, 36 + bic tmp, sAsa_, sAsu_, ROR #60 SEP + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP xar_m1 vAbo_, vAmo, E3, 43 + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP xar_m1 vAmo_, vAmi, E2, 49 + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP xar_m1 vAmi_, vAke, E1, 54 + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP + eor sAbu, tmp, sAbu_, ROR #30 SEP xar_m1 vAge_, vAgu, E4, 44 + SEP + eor s_Aba, s_Aba, 
cur_const SEP + SEP xar_m1 vAgu_, vAsi, E2, 3 + save count, STACK_OFFSET_COUNT SEP + SEP + eor sC0, sAka, sAsa, ROR #50 SEP + eor sC1, sAse, sAge, ROR #60 SEP + eor sC2, sAmi, sAgi, ROR #59 SEP xar_m1 vAsi_, vAku, E4, 25 + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP + eor sC0, sAma, sC0, ROR #49 SEP xar_m1 vAku_, vAsa, E0, 46 + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP xar_m1 vAma_, vAbu, E4, 37 + eor sC0, sAga, sC0, ROR #57 SEP + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP xar_m1 vAbu_, vAsu, E4, 50 + eor sC3, sAko, sC3, ROR #38 SEP + eor sC4, sAgu, sC4, ROR #48 SEP + eor sC0, s_Aba, sC0, ROR #61 SEP + eor sC1, sAke, sC1, ROR #57 SEP xar_m1 vAsu_, vAse, E1, 62 + eor sC2, sAsi, sC2, ROR #52 SEP + eor sC3, sAbo, sC3, ROR #63 SEP + eor sC4, sAku, sC4, ROR #50 SEP xar_m1 vAme_, vAga, E0, 28 + ror sC1, sC1, 56 SEP + ror sC4, sC4, 58 SEP + ror sC2, sC2, 62 SEP xar_m1 vAbe_, vAge, E1, 20 + SEP + eor sE1, sC0, sC2, ROR #63 SEP + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP bcax_m1 vAga, vAga_, vAgi_, vAge_ + eor sE2, sC1, sC3, ROR #63 SEP + eor sE4, sC3, sC0, ROR #63 SEP + SEP bcax_m1 vAge, vAge_, vAgo_, vAgi_ + eor s_Aba_, sE0, s_Aba SEP + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP bcax_m1 vAgi, vAgi_, vAgu_, vAgo_ + eor sAki_, sE3, sAko, ROR #63 SEP + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP bcax_m1 vAgo, vAgo_, vAga_, vAgu_ + eor sAso_, sE0, sAma, ROR #54 SEP + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP bcax_m1 vAgu, vAgu_, vAge_, vAga_ + eor sAgo_, sE1, sAme, ROR #49 SEP + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, ROR #39 SEP bcax_m1 vAka, vAka_, vAki_, vAke_ + eor sAga_, sE3, sAbo SEP + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP bcax_m1 vAke, vAke_, vAko_, vAki_ + eor sAmi_, sE1, sAke, ROR 
#56 SEP + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP bcax_m1 vAki, vAki_, vAku_, vAko_ + eor sAsi_, sE4, sAku, ROR #58 SEP + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP bcax_m1 vAko, vAko_, vAka_, vAku_ + eor sAbu_, sE4, sAsu, ROR #9 SEP + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP bcax_m1 vAku, vAku_, vAke_, vAka_ + eor sAbe_, sE1, sAge, ROR #19 SEP + SEP + load_constant_ptr SEP bcax_m1 vAma, vAma_, vAmi_, vAme_ + restore count, STACK_OFFSET_COUNT SEP + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP restore x26, STACK_OFFSET_CONST + eor sAga, tmp, sAga_, ROR #39 SEP + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP bcax_m1 vAme, vAme_, vAmo_, vAmi_ + bic tmp, sAgu_, sAgo_, ROR #16 SEP + eor sAgi, tmp, sAgi_, ROR #58 SEP ld1r {v28.2d}, [x26], #8 + bic tmp, sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP + bic tmp, sAge_, sAga_, ROR #56 SEP bcax_m1 vAmi, vAmi_, vAmu_, vAmo_ + eor sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP save x26, STACK_OFFSET_CONST + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP bcax_m1 vAmo, vAmo_, vAma_, vAmu_ + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP bcax_m1 vAmu, vAmu_, vAme_, vAma_ + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP bcax_m1 vAsa, vAsa_, vAsi_, vAse_ + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP bcax_m1 vAse, vAse_, vAso_, vAsi_ + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP bcax_m1 vAsi, vAsi_, vAsu_, vAso_ + bic tmp, sAma_, sAmu_, ROR #35 SEP + SEP + ldr cur_const, [const_addr, count, UXTW #3] SEP bcax_m1 vAso, vAso_, vAsa_, vAsu_ + SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, 
sAma_, ROR #9 SEP bcax_m1 vAsu, vAsu_, vAse_, vAsa_ + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP + eor sAsa, tmp, sAsa_, ROR #41 SEP bcax_m1 vAba, vAba_, vAbi_, vAbe_ + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP bcax_m1 vAbe, vAbe_, vAbo_, vAbi_ + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP + eor sAso, tmp, sAso_, ROR #21 SEP bcax_m1 vAbi, vAbi_, vAbu_, vAbo_ + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m1 vAbo, vAbo_, vAba_, vAbu_ + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP + eor sAbe, tmp, sAbe_, ROR #41 SEP bcax_m1 vAbu, vAbu_, vAbe_, vAba_ + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP eor vAba.16b, vAba.16b, v28.16b + bic tmp, sAbe_, s_Aba_, ROR #44 SEP + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + add count, count, #1 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP + SEP +.endm + +.macro hybrid_round_noninitial + save count, STACK_OFFSET_COUNT SEP eor3_m1 C0, vAba, vAga, vAka + SEP + eor sC0, sAka, sAsa, ROR #50 SEP + eor sC1, sAse, sAge, ROR #60 SEP eor3_m1 C0, C0, vAma, vAsa + eor sC2, sAmi, sAgi, ROR #59 SEP + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP eor3_m1 C1, vAbe, vAge, vAke + eor sC0, sAma, sC0, ROR #49 SEP + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP eor3_m1 C1, C1, vAme, vAse + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP + eor sC0, sAga, sC0, ROR #57 SEP eor3_m1 C2, vAbi, vAgi, vAki + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor sC3, sAko, sC3, ROR #38 SEP eor3_m1 C2, C2, vAmi, vAsi + eor sC4, sAgu, sC4, ROR #48 SEP + eor sC0, s_Aba, sC0, ROR #61 SEP + eor sC1, sAke, sC1, ROR #57 SEP eor3_m1 C3, vAbo, vAgo, vAko + eor sC2, sAsi, sC2, ROR #52 
SEP + eor sC3, sAbo, sC3, ROR #63 SEP + eor sC4, sAku, sC4, ROR #50 SEP eor3_m1 C3, C3, vAmo, vAso + ror sC1, sC1, 56 SEP + ror sC4, sC4, 58 SEP + ror sC2, sC2, 62 SEP eor3_m1 C4, vAbu, vAgu, vAku + SEP + eor sE1, sC0, sC2, ROR #63 SEP + eor sE3, sC2, sC4, ROR #63 SEP eor3_m1 C4, C4, vAmu, vAsu + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP + eor sE4, sC3, sC0, ROR #63 SEP rax1_m1 E1, C0, C2 + SEP + eor s_Aba_, sE0, s_Aba SEP + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP rax1_m1 E3, C2, C4 + eor sAki_, sE3, sAko, ROR #63 SEP + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP + eor sAso_, sE0, sAma, ROR #54 SEP rax1_m1 E0, C4, C1 + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP + eor sAgo_, sE1, sAme, ROR #49 SEP + eor sAke_, sE2, sAgi, ROR #3 SEP rax1_m1 E2, C1, C3 + eor sAgi_, sE0, sAka, ROR #39 SEP + eor sAga_, sE3, sAbo SEP + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP rax1_m1 E4, C3, C0 + eor sAmi_, sE1, sAke, ROR #56 SEP + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP + eor sAsi_, sE4, sAku, ROR #58 SEP eor vAba_.16b, vAba.16b, E0.16b + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP + eor sAbu_, sE4, sAsu, ROR #9 SEP xar_m1 vAsa_, vAbi, E2, 2 + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP + eor sAbe_, sE1, sAge, ROR #19 SEP + SEP xar_m1 vAbi_, vAki, E2, 21 + load_constant_ptr SEP + restore count, STACK_OFFSET_COUNT SEP + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP xar_m1 vAki_, vAko, E3, 39 + eor sAga, tmp, sAga_, ROR #39 SEP + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP xar_m1 vAko_, vAmu, E4, 56 + bic tmp, sAgu_, sAgo_, ROR #16 SEP + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP xar_m1 vAmu_, vAso, E3, 8 + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, 
sAke_, ROR #19 SEP + eor sAka, tmp, sAka_, ROR #24 SEP xar_m1 vAso_, vAma, E0, 23 + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP xar_m1 vAka_, vAbe, E1, 63 + bic tmp, sAka_, sAku_, ROR #47 SEP + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP xar_m1 vAse_, vAgo, E3, 9 + eor sAku, tmp, sAku_, ROR #52 SEP + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP + eor sAme, tmp, sAme_, ROR #43 SEP xar_m1 vAgo_, vAme, E1, 19 + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP + bic tmp, sAma_, sAmu_, ROR #35 SEP + SEP xar_m1 vAke_, vAgi, E2, 58 + ldr cur_const, [const_addr, count, UXTW #3] SEP + add count, count, #1 SEP + SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, sAma_, ROR #9 SEP xar_m1 vAgi_, vAka, E0, 61 + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP xar_m1 vAga_, vAbo, E3, 36 + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP xar_m1 vAbo_, vAmo, E3, 43 + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP xar_m1 vAmo_, vAmi, E2, 49 + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP xar_m1 vAmi_, vAke, E1, 54 + eor sAbi, tmp, sAbi_, ROR #35 SEP + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP xar_m1 vAge_, vAgu, E4, 44 + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP xar_m1 vAgu_, vAsi, E2, 3 + save count, STACK_OFFSET_COUNT SEP + SEP + eor sC0, sAka, sAsa, ROR #50 SEP + eor sC1, sAse, sAge, ROR 
#60 SEP + eor sC2, sAmi, sAgi, ROR #59 SEP xar_m1 vAsi_, vAku, E4, 25 + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP + eor sC0, sAma, sC0, ROR #49 SEP xar_m1 vAku_, vAsa, E0, 46 + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP xar_m1 vAma_, vAbu, E4, 37 + eor sC0, sAga, sC0, ROR #57 SEP + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor sC3, sAko, sC3, ROR #38 SEP xar_m1 vAbu_, vAsu, E4, 50 + eor sC4, sAgu, sC4, ROR #48 SEP + eor sC0, s_Aba, sC0, ROR #61 SEP + eor sC1, sAke, sC1, ROR #57 SEP + eor sC2, sAsi, sC2, ROR #52 SEP xar_m1 vAsu_, vAse, E1, 62 + eor sC3, sAbo, sC3, ROR #63 SEP + eor sC4, sAku, sC4, ROR #50 SEP + ror sC1, sC1, 56 SEP xar_m1 vAme_, vAga, E0, 28 + ror sC4, sC4, 58 SEP + ror sC2, sC2, 62 SEP + SEP xar_m1 vAbe_, vAge, E1, 20 + eor sE1, sC0, sC2, ROR #63 SEP + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP bcax_m1 vAga, vAga_, vAgi_, vAge_ + eor sE4, sC3, sC0, ROR #63 SEP + SEP + eor s_Aba_, sE0, s_Aba SEP bcax_m1 vAge, vAge_, vAgo_, vAgi_ + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP + eor sAki_, sE3, sAko, ROR #63 SEP bcax_m1 vAgi, vAgi_, vAgu_, vAgo_ + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP + eor sAso_, sE0, sAma, ROR #54 SEP bcax_m1 vAgo, vAgo_, vAga_, vAgu_ + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP + eor sAgo_, sE1, sAme, ROR #49 SEP bcax_m1 vAgu, vAgu_, vAge_, vAga_ + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, ROR #39 SEP + eor sAga_, sE3, sAbo SEP bcax_m1 vAka, vAka_, vAki_, vAke_ + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP + eor sAmi_, sE1, sAke, ROR #56 SEP bcax_m1 vAke, vAke_, vAko_, vAki_ + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP + eor sAsi_, sE4, sAku, ROR #58 SEP bcax_m1 vAki, vAki_, vAku_, vAko_ + 
eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP + eor sAbu_, sE4, sAsu, ROR #9 SEP bcax_m1 vAko, vAko_, vAka_, vAku_ + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP + eor sAbe_, sE1, sAge, ROR #19 SEP bcax_m1 vAku, vAku_, vAke_, vAka_ + SEP + load_constant_ptr SEP + restore count, STACK_OFFSET_COUNT SEP bcax_m1 vAma, vAma_, vAmi_, vAme_ + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP restore x26, STACK_OFFSET_CONST + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP bcax_m1 vAme, vAme_, vAmo_, vAmi_ + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP ld1r {v28.2d}, [x26], #8 + eor sAgo, tmp, sAgo_, ROR #47 SEP + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP bcax_m1 vAmi, vAmi_, vAmu_, vAmo_ + bic tmp, sAki_, sAke_, ROR #19 SEP + eor sAka, tmp, sAka_, ROR #24 SEP save x26, STACK_OFFSET_CONST + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP + bic tmp, sAku_, sAko_, ROR #10 SEP bcax_m1 vAmo, vAmo_, vAma_, vAmu_ + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP + eor sAko, tmp, sAko_, ROR #57 SEP bcax_m1 vAmu, vAmu_, vAme_, vAma_ + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP + bic tmp, sAmi_, sAme_, ROR #38 SEP bcax_m1 vAsa, vAsa_, vAsi_, vAse_ + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP + eor sAme, tmp, sAme_, ROR #43 SEP bcax_m1 vAse, vAse_, vAso_, vAsi_ + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP + bic tmp, sAma_, sAmu_, ROR #35 SEP bcax_m1 vAsi, vAsi_, vAsu_, vAso_ + SEP + ldr cur_const, [const_addr, count, UXTW #3] SEP + add count, count, #1 SEP bcax_m1 vAso, vAso_, vAsa_, vAsu_ + SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, sAma_, ROR #9 SEP bcax_m1 vAsu, vAsu_, vAse_, vAsa_ + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP 
+ eor sAsa, tmp, sAsa_, ROR #41 SEP bcax_m1 vAba, vAba_, vAbi_, vAbe_ + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP bcax_m1 vAbe, vAbe_, vAbo_, vAbi_ + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP + eor sAso, tmp, sAso_, ROR #21 SEP bcax_m1 vAbi, vAbi_, vAbu_, vAbo_ + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m1 vAbo, vAbo_, vAba_, vAbu_ + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP + eor sAbe, tmp, sAbe_, ROR #41 SEP bcax_m1 vAbu, vAbu_, vAbe_, vAba_ + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP eor vAba.16b, vAba.16b, v28.16b + bic tmp, sAbe_, s_Aba_, ROR #44 SEP + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP + +.endm + +.macro final_rotate + ror sAga, sAga,#(64-3) + ror sAka, sAka,#(64-25) + ror sAma, sAma,#(64-10) + ror sAsa, sAsa,#(64-39) + ror sAbe, sAbe,#(64-21) + ror sAge, sAge,#(64-45) + ror sAke, sAke,#(64-8) + ror sAme, sAme,#(64-15) + ror sAse, sAse,#(64-41) + ror sAbi, sAbi,#(64-14) + ror sAgi, sAgi,#(64-61) + ror sAki, sAki,#(64-18) + ror sAmi, sAmi,#(64-56) + ror sAsi, sAsi,#(64-2) + ror sAgo, sAgo,#(64-28) + ror sAko, sAko,#(64-1) + ror sAmo, sAmo,#(64-27) + ror sAso, sAso,#(64-62) + ror sAbu, sAbu,#(64-44) + ror sAgu, sAgu,#(64-20) + ror sAku, sAku,#(64-6) + ror sAmu, sAmu,#(64-36) + ror sAsu, sAsu,#(64-55) +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.global keccak_f1600_x4_hybrid_asm_v3pp +.global _keccak_f1600_x4_hybrid_asm_v3pp +.text +.align 4 + +keccak_f1600_x4_hybrid_asm_v3pp: +_keccak_f1600_x4_hybrid_asm_v3pp: + alloc_stack + save_gprs + save_vregs + save input_addr, STACK_OFFSET_INPUT + + ASM_LOAD(const_addr,round_constants) + save const_addr, STACK_OFFSET_CONST + + load_input_vector + + add input_addr, input_addr, #16 + + mov 
out_count, #0 +outer_loop: + save out_count, STACK_OFFSET_COUNT_OUT + + load_input_scalar + save input_addr, STACK_OFFSET_CUR_INPUT + + hybrid_round_initial +1: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS) + blt 1b + + final_rotate + restore input_addr, STACK_OFFSET_CUR_INPUT + store_input_scalar + add input_addr, input_addr, #8 + + restore out_count, STACK_OFFSET_COUNT_OUT + add out_count, out_count, #1 + cmp out_count, #2 + blt outer_loop + + restore input_addr, STACK_OFFSET_INPUT + store_input_vector + + restore_vregs + restore_gprs + free_stack + ret diff --git a/asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v4.s b/asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v4.s new file mode 100644 index 0000000..a5aa8cd --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v4.s @@ -0,0 +1,1018 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +#if defined(__ARM_FEATURE_SHA3) + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: /* the 24 Keccak-f[1600] round constants, one 64-bit word per round; the round macros XOR them into lane Aba */ + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x29 + count .req w27 + cur_const .req x26 /* NOTE(review): x26, x27 (w27) and x29 are also assigned to the scalar sC and sE temporaries further down; presumably this is why the round macros spill count to the stack and invoke load_constant_ptr before indexing round_constants */ + + /* Mapping of Keccak-f1600 SIMD state to vector registers + * at the beginning and end of each round.
*/ + + vAba .req v0 + vAbe .req v1 + vAbi .req v2 + vAbo .req v3 + vAbu .req v4 + vAga .req v5 + vAge .req v6 + vAgi .req v7 + vAgo .req v8 + vAgu .req v9 + vAka .req v10 + vAke .req v11 + vAki .req v12 + vAko .req v13 + vAku .req v14 + vAma .req v15 + vAme .req v16 + vAmi .req v17 + vAmo .req v18 + vAmu .req v19 + vAsa .req v20 + vAse .req v21 + vAsi .req v22 + vAso .req v23 + vAsu .req v24 + + /* q-form of the above mapping */ + vAbaq .req q0 + vAbeq .req q1 + vAbiq .req q2 + vAboq .req q3 + vAbuq .req q4 + vAgaq .req q5 + vAgeq .req q6 + vAgiq .req q7 + vAgoq .req q8 + vAguq .req q9 + vAkaq .req q10 + vAkeq .req q11 + vAkiq .req q12 + vAkoq .req q13 + vAkuq .req q14 + vAmaq .req q15 + vAmeq .req q16 + vAmiq .req q17 + vAmoq .req q18 + vAmuq .req q19 + vAsaq .req q20 + vAseq .req q21 + vAsiq .req q22 + vAsoq .req q23 + vAsuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v30 + C1 .req v29 + C2 .req v28 + C3 .req v27 + C4 .req v26 + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req v26 + E1 .req v25 + E2 .req v29 + E3 .req v28 + E4 .req v27 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vAbi_ .req v2 + vAbo_ .req v3 + vAbu_ .req v4 + vAga_ .req v10 + vAge_ .req v11 + vAgi_ .req v7 + vAgo_ .req v8 + vAgu_ .req v9 + vAka_ .req v15 + vAke_ .req v16 + vAki_ .req v12 + vAko_ .req v13 + vAku_ .req v14 + vAma_ .req v20 + vAme_ .req v21 + vAmi_ .req v17 + vAmo_ .req v18 + vAmu_ .req v19 + vAsa_ .req v0 + vAse_ .req v1 + vAsi_ .req v22 + vAso_ .req v23 + vAsu_ .req v24 + vAba_ .req v30 + vAbe_ .req v27 + + /* Scratch register: holds no state lane; clobbered by the rax1_m1, xar_m1 and bcax_m1 macros */ + vtmp .req v31 + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round.
*/ + s_Aba .req x1 + sAbe .req x6 + sAbi .req x11 + sAbo .req x16 + sAbu .req x21 + sAga .req x2 + sAge .req x7 + sAgi .req x12 + sAgo .req x17 + sAgu .req x22 + sAka .req x3 + sAke .req x8 + sAki .req x13 + sAko .req x18 /* NOTE(review): x18 is the platform register in the AArch64 procedure call standard and is reserved on some platforms - confirm the target allows using it */ + sAku .req x23 + sAma .req x4 + sAme .req x9 + sAmi .req x14 + sAmo .req x19 + sAmu .req x24 + sAsa .req x5 + sAse .req x10 + sAsi .req x15 + sAso .req x20 + sAsu .req x25 + + /* sA_[y,2*x+3*y] = rot(A[x,y]) */ + s_Aba_ .req x0 /* same register as input_addr, so input_addr does not survive a round */ + sAbe_ .req x28 + sAbi_ .req x11 + sAbo_ .req x16 + sAbu_ .req x21 + sAga_ .req x3 + sAge_ .req x8 + sAgi_ .req x12 + sAgo_ .req x17 + sAgu_ .req x22 + sAka_ .req x4 + sAke_ .req x9 + sAki_ .req x13 + sAko_ .req x18 + sAku_ .req x23 + sAma_ .req x5 + sAme_ .req x10 + sAmi_ .req x14 + sAmo_ .req x19 + sAmu_ .req x24 + sAsa_ .req x1 + sAse_ .req x6 + sAsi_ .req x15 + sAso_ .req x20 + sAsu_ .req x25 + + /* sC[x] = sA[x,0] xor sA[x,1] xor sA[x,2] xor sA[x,3] xor sA[x,4], for x in 0..4 */ + /* sE[x] = sC[x-1] xor rot(sC[x+1],1), for x in 0..4 */ + sC0 .req x0 + sE0 .req x29 + sC1 .req x26 + sE1 .req x30 + sC2 .req x27 + sE2 .req x26 + sC3 .req x28 + sE3 .req x27 + sC4 .req x29 + sE4 .req x28 + + tmp .req x30 + +/************************ MACROS ****************************/ + +/* Vector helper macros. The *_m0 variants use the v8.4-A SHA-3 instructions (eor3, rax1, xar, bcax); the *_m1 variants compute the same result with plain Neon instructions, using vtmp as scratch. */ + + +.macro eor3_m1 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro rax1_m1 d s0 s1 + add vtmp.2d, \s1\().2d, \s1\().2d + sri vtmp.2d, \s1\().2d, #63 + eor \d\().16b, vtmp.16b, \s0\().16b +.endm + +.macro xar_m1 d s0 s1 imm + eor vtmp.16b, \s0\().16b, \s1\().16b + shl \d\().2d, vtmp.2d, #(64-\imm) + sri \d\().2d, vtmp.2d, #(\imm) +.endm + +.macro bcax_m1 d s0 s1 s2 + bic vtmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, vtmp.16b, \s0\().16b +.endm + +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm + xar \d\().2d,
\s0\().2d, \s1\().2d, #\imm +.endm + +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + + +.macro load_input_vector num idx + ldr vAbaq, [input_addr, #(16*(\num*0+\idx))] + ldr vAbeq, [input_addr, #(16*(\num*1+\idx))] + ldr vAbiq, [input_addr, #(16*(\num*2+\idx))] + ldr vAboq, [input_addr, #(16*(\num*3+\idx))] + ldr vAbuq, [input_addr, #(16*(\num*4+\idx))] + ldr vAgaq, [input_addr, #(16*(\num*5+\idx))] + ldr vAgeq, [input_addr, #(16*(\num*6+\idx))] + ldr vAgiq, [input_addr, #(16*(\num*7+\idx))] + ldr vAgoq, [input_addr, #(16*(\num*8+\idx))] + ldr vAguq, [input_addr, #(16*(\num*9+\idx))] + ldr vAkaq, [input_addr, #(16*(\num*10+\idx))] + ldr vAkeq, [input_addr, #(16*(\num*11+\idx))] + ldr vAkiq, [input_addr, #(16*(\num*12+\idx))] + ldr vAkoq, [input_addr, #(16*(\num*13+\idx))] + ldr vAkuq, [input_addr, #(16*(\num*14+\idx))] + ldr vAmaq, [input_addr, #(16*(\num*15+\idx))] + ldr vAmeq, [input_addr, #(16*(\num*16+\idx))] + ldr vAmiq, [input_addr, #(16*(\num*17+\idx))] + ldr vAmoq, [input_addr, #(16*(\num*18+\idx))] + ldr vAmuq, [input_addr, #(16*(\num*19+\idx))] + ldr vAsaq, [input_addr, #(16*(\num*20+\idx))] + ldr vAseq, [input_addr, #(16*(\num*21+\idx))] + ldr vAsiq, [input_addr, #(16*(\num*22+\idx))] + ldr vAsoq, [input_addr, #(16*(\num*23+\idx))] + ldr vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_vector num idx + str vAbaq, [input_addr, #(16*(\num*0+\idx))] + str vAbeq, [input_addr, #(16*(\num*1+\idx))] + str vAbiq, [input_addr, #(16*(\num*2+\idx))] + str vAboq, [input_addr, #(16*(\num*3+\idx))] + str vAbuq, [input_addr, #(16*(\num*4+\idx))] + str vAgaq, [input_addr, #(16*(\num*5+\idx))] + str vAgeq, [input_addr, #(16*(\num*6+\idx))] + str vAgiq, [input_addr, #(16*(\num*7+\idx))] + str vAgoq, [input_addr, #(16*(\num*8+\idx))] + str vAguq, [input_addr, #(16*(\num*9+\idx))] + str vAkaq, [input_addr, #(16*(\num*10+\idx))] + str vAkeq, [input_addr, #(16*(\num*11+\idx))] + str vAkiq, [input_addr, 
#(16*(\num*12+\idx))] + str vAkoq, [input_addr, #(16*(\num*13+\idx))] + str vAkuq, [input_addr, #(16*(\num*14+\idx))] + str vAmaq, [input_addr, #(16*(\num*15+\idx))] + str vAmeq, [input_addr, #(16*(\num*16+\idx))] + str vAmiq, [input_addr, #(16*(\num*17+\idx))] + str vAmoq, [input_addr, #(16*(\num*18+\idx))] + str vAmuq, [input_addr, #(16*(\num*19+\idx))] + str vAsaq, [input_addr, #(16*(\num*20+\idx))] + str vAseq, [input_addr, #(16*(\num*21+\idx))] + str vAsiq, [input_addr, #(16*(\num*22+\idx))] + str vAsoq, [input_addr, #(16*(\num*23+\idx))] + str vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_scalar num idx + str s_Aba, [input_addr, 8*(\num*(0) +\idx)] + str sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + str sAbi, [input_addr, 8*(\num*(2)+ \idx)] + str sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + str sAbu, [input_addr, 8*(\num*(4)+ \idx)] + str sAga, [input_addr, 8*(\num*(4+1) +\idx)] + str sAge, [input_addr, 8*(\num*(6)+ \idx)] + str sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + str sAgo, [input_addr, 8*(\num*(8)+ \idx)] + str sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + str sAka, [input_addr, 8*(\num*(10) +\idx)] + str sAke, [input_addr, 8*(\num*(10+1)+\idx)] + str sAki, [input_addr, 8*(\num*(12) +\idx)] + str sAko, [input_addr, 8*(\num*(12+1)+\idx)] + str sAku, [input_addr, 8*(\num*(14) +\idx)] + str sAma, [input_addr, 8*(\num*(14+1)+\idx)] + str sAme, [input_addr, 8*(\num*(16) +\idx)] + str sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + str sAmo, [input_addr, 8*(\num*(18) +\idx)] + str sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + str sAsa, [input_addr, 8*(\num*(20) +\idx)] + str sAse, [input_addr, 8*(\num*(20+1)+\idx)] + str sAsi, [input_addr, 8*(\num*(22) +\idx)] + str sAso, [input_addr, 8*(\num*(22+1)+\idx)] + str sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +.macro load_input_scalar num idx + ldr s_Aba, [input_addr, 8*(\num*(0) +\idx)] + ldr sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + ldr sAbi, [input_addr, 8*(\num*(2)+ \idx)] + ldr sAbo, 
[input_addr, 8*(\num*(2+1) +\idx)] + ldr sAbu, [input_addr, 8*(\num*(4)+ \idx)] + ldr sAga, [input_addr, 8*(\num*(4+1) +\idx)] + ldr sAge, [input_addr, 8*(\num*(6)+ \idx)] + ldr sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + ldr sAgo, [input_addr, 8*(\num*(8)+ \idx)] + ldr sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + ldr sAka, [input_addr, 8*(\num*(10) +\idx)] + ldr sAke, [input_addr, 8*(\num*(10+1)+\idx)] + ldr sAki, [input_addr, 8*(\num*(12) +\idx)] + ldr sAko, [input_addr, 8*(\num*(12+1)+\idx)] + ldr sAku, [input_addr, 8*(\num*(14) +\idx)] + ldr sAma, [input_addr, 8*(\num*(14+1)+\idx)] + ldr sAme, [input_addr, 8*(\num*(16) +\idx)] + ldr sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + ldr sAmo, [input_addr, 8*(\num*(18) +\idx)] + ldr sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + ldr sAsa, [input_addr, 8*(\num*(20) +\idx)] + ldr sAse, [input_addr, 8*(\num*(20+1)+\idx)] + ldr sAsi, [input_addr, 8*(\num*(22) +\idx)] + ldr sAso, [input_addr, 8*(\num*(22+1)+\idx)] + ldr sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +#define STACK_SIZE (8*8 + 16*6 + 3*8 + 8) // VREGS (8*8), GPRs (16*6), count (8), const (8), input (8), padding (8) +#define STACK_BASE_GPRS (3*8+8) +#define STACK_BASE_VREGS (3*8+8+16*6) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro save_vregs + stp d8, d9, 
[sp,#(STACK_BASE_VREGS+0*16)] + stp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + stp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + stp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] +.endm + +.macro restore_vregs + ldp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] + ldp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + ldp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + ldp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] +.endm + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro eor5 dst, src0, src1, src2, src3, src4 + eor \dst, \src0, \src1 + eor \dst, \dst, \src2 + eor \dst, \dst, \src3 + eor \dst, \dst, \src4 +.endm + +.macro xor_rol dst, src1, src0, imm + eor \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro bic_rol dst, src1, src0, imm + bic \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro rotate dst, src, imm + ror \dst, \src, #(64-\imm) +.endm + +.macro save reg, offset + str \reg, [sp, #\offset] +.endm + +.macro restore reg, offset + ldr \reg, [sp, #\offset] +.endm + +.macro hybrid_round_initial + + eor sC0, sAma, sAsa SEP eor3_m1 C0, vAba, vAga, vAka + eor sC1, sAme, sAse SEP + eor sC2, sAmi, sAsi SEP + eor sC3, sAmo, sAso SEP eor3_m0 C0, C0, vAma, vAsa + eor sC4, sAmu, sAsu SEP + eor sC0, sAka, sC0 SEP + eor sC1, sAke, sC1 SEP eor3_m1 C1, vAbe, vAge, vAke + eor sC2, sAki, sC2 SEP + eor sC3, sAko, sC3 SEP + eor sC4, sAku, sC4 SEP eor3_m0 C1, C1, vAme, vAse + eor sC0, sAga, sC0 SEP + eor sC1, sAge, sC1 SEP + eor sC2, sAgi, sC2 SEP eor3_m1 C2, vAbi, vAgi, vAki + eor sC3, sAgo, sC3 SEP + eor sC4, sAgu, sC4 SEP + eor sC0, s_Aba, sC0 SEP eor3_m0 C2, C2, vAmi, vAsi + eor sC1, sAbe, sC1 SEP + eor sC2, sAbi, sC2 SEP + eor sC3, sAbo, sC3 SEP eor3_m1 C3, vAbo, vAgo, vAko + eor sC4, sAbu, sC4 SEP + SEP + eor sE1, sC0, sC2, ROR #63 SEP eor3_m0 C3, C3, vAmo, vAso + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP eor3_m1 C4, vAbu, vAgu, vAku + eor sE4, sC3, sC0, ROR #63 SEP + SEP + eor 
s_Aba_, s_Aba, sE0 SEP eor3_m0 C4, C4, vAmu, vAsu + eor sAsa_, sAbi, sE2 SEP + eor sAbi_, sAki, sE2 SEP + eor sAki_, sAko, sE3 SEP + eor sAko_, sAmu, sE4 SEP rax1_m1 E1, C0, C2 + eor sAmu_, sAso, sE3 SEP + eor sAso_, sAma, sE0 SEP + eor sAka_, sAbe, sE1 SEP rax1_m0 E3, C2, C4 + eor sAse_, sAgo, sE3 SEP + eor sAgo_, sAme, sE1 SEP + eor sAke_, sAgi, sE2 SEP rax1_m1 E0, C4, C1 + eor sAgi_, sAka, sE0 SEP + eor sAga_, sAbo, sE3 SEP + eor sAbo_, sAmo, sE3 SEP rax1_m0 E2, C1, C3 + eor sAmo_, sAmi, sE2 SEP + eor sAmi_, sAke, sE1 SEP + eor sAge_, sAgu, sE4 SEP rax1_m1 E4, C3, C0 + eor sAgu_, sAsi, sE2 SEP + eor sAsi_, sAku, sE4 SEP + eor sAku_, sAsa, sE0 SEP + eor sAma_, sAbu, sE4 SEP eor vAba_.16b, vAba.16b, E0.16b + eor sAbu_, sAsu, sE4 SEP + eor sAsu_, sAse, sE1 SEP + eor sAme_, sAga, sE0 SEP xar_m0 vAsa_, vAbi, E2, 2 + eor sAbe_, sAge, sE1 SEP + SEP + load_constant_ptr SEP xar_m1 vAbi_, vAki, E2, 21 + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP xar_m0 vAki_, vAko, E3, 39 + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP xar_m1 vAko_, vAmu, E4, 56 + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP xar_m0 vAmu_, vAso, E3, 8 + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP xar_m1 vAso_, vAma, E0, 23 + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP xar_m0 vAka_, vAbe, E1, 63 + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP xar_m1 vAse_, vAgo, E3, 9 + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP xar_m0 vAgo_, vAme, E1, 19 + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP xar_m1 vAke_, vAgi, E2, 58 + eor sAme, tmp, sAme_, ROR #43 
SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP xar_m0 vAgi_, vAka, E0, 61 + SEP + ldr cur_const, [const_addr] SEP + mov count, #1 SEP xar_m1 vAga_, vAbo, E3, 36 + SEP + bic tmp, sAma_, sAmu_, ROR #35 SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP xar_m0 vAbo_, vAmo, E3, 43 + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP xar_m1 vAmo_, vAmi, E2, 49 + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP xar_m0 vAmi_, vAke, E1, 54 + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP xar_m1 vAge_, vAgu, E4, 44 + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP xar_m0 vAgu_, vAsi, E2, 3 + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP xar_m1 vAsi_, vAku, E4, 25 + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP xar_m0 vAku_, vAsa, E0, 46 + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP xar_m1 vAma_, vAbu, E4, 37 + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP xar_m0 vAbu_, vAsu, E4, 50 + SEP + save count, STACK_OFFSET_COUNT SEP + SEP xar_m1 vAsu_, vAse, E1, 62 + eor sC0, sAka, sAsa, ROR #50 SEP + eor sC1, sAse, sAge, ROR #60 SEP + eor sC2, sAmi, sAgi, ROR #59 SEP xar_m0 vAme_, vAga, E0, 28 + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP + eor sC0, sAma, sC0, ROR #49 SEP xar_m1 vAbe_, vAge, E1, 20 + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP restore sE1, STACK_OFFSET_CONST + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP + eor sC0, sAga, sC0, ROR #57 SEP ld1r {v28.2d}, [sE1], #8 + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor 
sC3, sAko, sC3, ROR #38 SEP save sE1, STACK_OFFSET_CONST + eor sC4, sAgu, sC4, ROR #48 SEP + eor sC0, s_Aba, sC0, ROR #61 SEP bcax_m0 vAga, vAga_, vAgi_, vAge_ + eor sC1, sAke, sC1, ROR #57 SEP + eor sC2, sAsi, sC2, ROR #52 SEP + eor sC3, sAbo, sC3, ROR #63 SEP bcax_m1 vAge, vAge_, vAgo_, vAgi_ + eor sC4, sAku, sC4, ROR #50 SEP + ror sC1, sC1, 56 SEP + ror sC4, sC4, 58 SEP bcax_m0 vAgi, vAgi_, vAgu_, vAgo_ + ror sC2, sC2, 62 SEP + SEP + eor sE1, sC0, sC2, ROR #63 SEP bcax_m1 vAgo, vAgo_, vAga_, vAgu_ + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP bcax_m0 vAgu, vAgu_, vAge_, vAga_ + eor sE4, sC3, sC0, ROR #63 SEP + SEP + eor s_Aba_, sE0, s_Aba SEP bcax_m1 vAka, vAka_, vAki_, vAke_ + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP + eor sAki_, sE3, sAko, ROR #63 SEP bcax_m0 vAke, vAke_, vAko_, vAki_ + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP + eor sAso_, sE0, sAma, ROR #54 SEP bcax_m1 vAki, vAki_, vAku_, vAko_ + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP + eor sAgo_, sE1, sAme, ROR #49 SEP bcax_m0 vAko, vAko_, vAka_, vAku_ + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, ROR #39 SEP + eor sAga_, sE3, sAbo SEP bcax_m1 vAku, vAku_, vAke_, vAka_ + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP + eor sAmi_, sE1, sAke, ROR #56 SEP bcax_m0 vAma, vAma_, vAmi_, vAme_ + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP + eor sAsi_, sE4, sAku, ROR #58 SEP bcax_m1 vAme, vAme_, vAmo_, vAmi_ + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP + eor sAbu_, sE4, sAsu, ROR #9 SEP bcax_m0 vAmi, vAmi_, vAmu_, vAmo_ + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP + eor sAbe_, sE1, sAge, ROR #19 SEP bcax_m1 vAmo, vAmo_, vAma_, vAmu_ + SEP + load_constant_ptr SEP + restore count, STACK_OFFSET_COUNT SEP bcax_m0 vAmu, vAmu_, vAme_, vAma_ + SEP + bic tmp, sAgi_, sAge_, 
ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP bcax_m1 vAsa, vAsa_, vAsi_, vAse_ + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP bcax_m0 vAse, vAse_, vAso_, vAsi_ + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP bcax_m1 vAsi, vAsi_, vAsu_, vAso_ + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP bcax_m0 vAso, vAso_, vAsa_, vAsu_ + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP bcax_m1 vAsu, vAsu_, vAse_, vAsa_ + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP bcax_m0 vAba, vAba_, vAbi_, vAbe_ + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP bcax_m1 vAbe, vAbe_, vAbo_, vAbi_ + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP bcax_m0 vAbi, vAbi_, vAbu_, vAbo_ + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP bcax_m1 vAbo, vAbo_, vAba_, vAbu_ + bic tmp, sAma_, sAmu_, ROR #35 SEP + SEP + ldr cur_const, [const_addr, count, UXTW #3] SEP bcax_m0 vAbu, vAbu_, vAbe_, vAba_ + SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP eor vAba.16b, vAba.16b, v28.16b + bic tmp, sAsi_, sAse_, ROR #48 SEP + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP + eor sAbe, tmp, sAbe_, 
ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + add count, count, #1 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP + SEP +.endm + +.macro hybrid_round_noninitial + save count, STACK_OFFSET_COUNT SEP eor3_m1 C0, vAba, vAga, vAka + SEP + eor sC0, sAka, sAsa, ROR #50 SEP + eor sC1, sAse, sAge, ROR #60 SEP eor3_m0 C0, C0, vAma, vAsa + eor sC2, sAmi, sAgi, ROR #59 SEP + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP eor3_m1 C1, vAbe, vAge, vAke + eor sC0, sAma, sC0, ROR #49 SEP + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP eor3_m0 C1, C1, vAme, vAse + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP + eor sC0, sAga, sC0, ROR #57 SEP eor3_m1 C2, vAbi, vAgi, vAki + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor sC3, sAko, sC3, ROR #38 SEP eor3_m0 C2, C2, vAmi, vAsi + eor sC4, sAgu, sC4, ROR #48 SEP + eor sC0, s_Aba, sC0, ROR #61 SEP + eor sC1, sAke, sC1, ROR #57 SEP eor3_m1 C3, vAbo, vAgo, vAko + eor sC2, sAsi, sC2, ROR #52 SEP + eor sC3, sAbo, sC3, ROR #63 SEP + eor sC4, sAku, sC4, ROR #50 SEP eor3_m0 C3, C3, vAmo, vAso + ror sC1, sC1, 56 SEP + ror sC4, sC4, 58 SEP + ror sC2, sC2, 62 SEP eor3_m1 C4, vAbu, vAgu, vAku + SEP + eor sE1, sC0, sC2, ROR #63 SEP + eor sE3, sC2, sC4, ROR #63 SEP eor3_m0 C4, C4, vAmu, vAsu + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP + eor sE4, sC3, sC0, ROR #63 SEP + SEP rax1_m1 E1, C0, C2 + eor s_Aba_, sE0, s_Aba SEP + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP rax1_m0 E3, C2, C4 + eor sAki_, sE3, sAko, ROR #63 SEP + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP rax1_m1 E0, C4, C1 + eor sAso_, sE0, sAma, ROR #54 SEP + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP rax1_m0 E2, C1, C3 
+ eor sAgo_, sE1, sAme, ROR #49 SEP + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, ROR #39 SEP rax1_m1 E4, C3, C0 + eor sAga_, sE3, sAbo SEP + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP + eor sAmi_, sE1, sAke, ROR #56 SEP eor vAba_.16b, vAba.16b, E0.16b + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP + eor sAsi_, sE4, sAku, ROR #58 SEP xar_m0 vAsa_, vAbi, E2, 2 + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP + eor sAbu_, sE4, sAsu, ROR #9 SEP xar_m1 vAbi_, vAki, E2, 21 + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP + eor sAbe_, sE1, sAge, ROR #19 SEP xar_m0 vAki_, vAko, E3, 39 + SEP + load_constant_ptr SEP + restore count, STACK_OFFSET_COUNT SEP xar_m1 vAko_, vAmu, E4, 56 + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP xar_m0 vAmu_, vAso, E3, 8 + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP xar_m1 vAso_, vAma, E0, 23 + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP xar_m0 vAka_, vAbe, E1, 63 + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP xar_m1 vAse_, vAgo, E3, 9 + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP xar_m0 vAgo_, vAme, E1, 19 + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP xar_m1 vAke_, vAgi, E2, 58 + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP xar_m0 vAgi_, vAka, E0, 61 + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP xar_m1 vAga_, vAbo, E3, 36 + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP xar_m0 vAbo_, vAmo, E3, 43 + bic tmp, 
sAma_, sAmu_, ROR #35 SEP + SEP + ldr cur_const, [const_addr, count, UXTW #3] SEP xar_m1 vAmo_, vAmi, E2, 49 + add count, count, #1 SEP + SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP xar_m0 vAmi_, vAke, E1, 54 + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP xar_m1 vAge_, vAgu, E4, 44 + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP xar_m0 vAgu_, vAsi, E2, 3 + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP xar_m1 vAsi_, vAku, E4, 25 + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP xar_m0 vAku_, vAsa, E0, 46 + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP xar_m1 vAma_, vAbu, E4, 37 + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP xar_m0 vAbu_, vAsu, E4, 50 + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP xar_m1 vAsu_, vAse, E1, 62 + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP xar_m0 vAme_, vAga, E0, 28 + save count, STACK_OFFSET_COUNT SEP + SEP + eor sC0, sAka, sAsa, ROR #50 SEP xar_m1 vAbe_, vAge, E1, 20 + eor sC1, sAse, sAge, ROR #60 SEP + eor sC2, sAmi, sAgi, ROR #59 SEP + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP restore sE1, STACK_OFFSET_CONST + eor sC0, sAma, sC0, ROR #49 SEP + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP ld1r {v28.2d}, [sE1], #8 + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP + eor sC0, sAga, sC0, ROR #57 SEP save sE1, STACK_OFFSET_CONST + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor sC3, sAko, sC3, ROR #38 SEP + eor sC4, sAgu, sC4, ROR #48 SEP bcax_m0 vAga, vAga_, vAgi_, vAge_ + eor sC0, s_Aba, sC0, ROR 
#61 SEP + eor sC1, sAke, sC1, ROR #57 SEP + eor sC2, sAsi, sC2, ROR #52 SEP bcax_m1 vAge, vAge_, vAgo_, vAgi_ + eor sC3, sAbo, sC3, ROR #63 SEP + eor sC4, sAku, sC4, ROR #50 SEP + ror sC1, sC1, 56 SEP bcax_m0 vAgi, vAgi_, vAgu_, vAgo_ + ror sC4, sC4, 58 SEP + ror sC2, sC2, 62 SEP + SEP bcax_m1 vAgo, vAgo_, vAga_, vAgu_ + eor sE1, sC0, sC2, ROR #63 SEP + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP bcax_m0 vAgu, vAgu_, vAge_, vAga_ + eor sE2, sC1, sC3, ROR #63 SEP + eor sE4, sC3, sC0, ROR #63 SEP + SEP bcax_m1 vAka, vAka_, vAki_, vAke_ + eor s_Aba_, sE0, s_Aba SEP + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP bcax_m0 vAke, vAke_, vAko_, vAki_ + eor sAki_, sE3, sAko, ROR #63 SEP + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP bcax_m1 vAki, vAki_, vAku_, vAko_ + eor sAso_, sE0, sAma, ROR #54 SEP + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP bcax_m0 vAko, vAko_, vAka_, vAku_ + eor sAgo_, sE1, sAme, ROR #49 SEP + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, ROR #39 SEP bcax_m1 vAku, vAku_, vAke_, vAka_ + eor sAga_, sE3, sAbo SEP + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP bcax_m0 vAma, vAma_, vAmi_, vAme_ + eor sAmi_, sE1, sAke, ROR #56 SEP + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP bcax_m1 vAme, vAme_, vAmo_, vAmi_ + eor sAsi_, sE4, sAku, ROR #58 SEP + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP bcax_m0 vAmi, vAmi_, vAmu_, vAmo_ + eor sAbu_, sE4, sAsu, ROR #9 SEP + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP bcax_m1 vAmo, vAmo_, vAma_, vAmu_ + eor sAbe_, sE1, sAge, ROR #19 SEP + SEP + load_constant_ptr SEP bcax_m0 vAmu, vAmu_, vAme_, vAma_ + restore count, STACK_OFFSET_COUNT SEP + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP bcax_m1 vAsa, vAsa_, vAsi_, vAse_ + eor sAga, tmp, sAga_, ROR #39 SEP + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 
SEP bcax_m0 vAse, vAse_, vAso_, vAsi_ + bic tmp, sAgu_, sAgo_, ROR #16 SEP + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP bcax_m1 vAsi, vAsi_, vAsu_, vAso_ + eor sAgo, tmp, sAgo_, ROR #47 SEP + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP bcax_m0 vAso, vAso_, vAsa_, vAsu_ + bic tmp, sAki_, sAke_, ROR #19 SEP + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP bcax_m1 vAsu, vAsu_, vAse_, vAsa_ + eor sAke, tmp, sAke_, ROR #2 SEP + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP bcax_m0 vAba, vAba_, vAbi_, vAbe_ + bic tmp, sAka_, sAku_, ROR #47 SEP + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP bcax_m1 vAbe, vAbe_, vAbo_, vAbi_ + eor sAku, tmp, sAku_, ROR #52 SEP + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP bcax_m0 vAbi, vAbi_, vAbu_, vAbo_ + bic tmp, sAmo_, sAmi_, ROR #5 SEP + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP bcax_m1 vAbo, vAbo_, vAba_, vAbu_ + eor sAmi, tmp, sAmi_, ROR #46 SEP + bic tmp, sAma_, sAmu_, ROR #35 SEP + SEP bcax_m0 vAbu, vAbu_, vAbe_, vAba_ + ldr cur_const, [const_addr, count, UXTW #3] SEP + add count, count, #1 SEP + SEP eor vAba.16b, vAba.16b, v28.16b + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP + bic tmp, s_Aba_, sAbu_, ROR #50 SEP 
+ eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP + +.endm +/* final_rotate: rotate 23 of the 25 scalar state registers in place (all except s_Aba and sAbo). "ror x, x, #(64-k)" is a rotate-left by k. NOTE(review): presumably this applies rotations that the scalar round code leaves pending; confirm against the round macros. */ +.macro final_rotate + ror sAga, sAga,#(64-3) + ror sAka, sAka,#(64-25) + ror sAma, sAma,#(64-10) + ror sAsa, sAsa,#(64-39) + ror sAbe, sAbe,#(64-21) + ror sAge, sAge,#(64-45) + ror sAke, sAke,#(64-8) + ror sAme, sAme,#(64-15) + ror sAse, sAse,#(64-41) + ror sAbi, sAbi,#(64-14) + ror sAgi, sAgi,#(64-61) + ror sAki, sAki,#(64-18) + ror sAmi, sAmi,#(64-56) + ror sAsi, sAsi,#(64-2) + ror sAgo, sAgo,#(64-28) + ror sAko, sAko,#(64-1) + ror sAmo, sAmo,#(64-27) + ror sAso, sAso,#(64-62) + ror sAbu, sAbu,#(64-44) + ror sAgu, sAgu,#(64-20) + ror sAku, sAku,#(64-6) + ror sAmu, sAmu,#(64-36) + ror sAsu, sAsu,#(64-55) +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.global keccak_f1600_x4_hybrid_asm_v4 +.global _keccak_f1600_x4_hybrid_asm_v4 +.text +.align 4 +/* Entry point (exported with and without a leading underscore). One load_input_vector / store_input_vector pair brackets two scalar passes (load_input_scalar 4,0 and 4,1). Each pass runs hybrid_round_initial once and then repeats hybrid_round_noninitial while count is at most KECCAK_F1600_ROUNDS-1. input_addr is spilled to the stack on entry and reloaded before each scalar store. */ +keccak_f1600_x4_hybrid_asm_v4: +_keccak_f1600_x4_hybrid_asm_v4: + alloc_stack + save_gprs + save_vregs + save input_addr, STACK_OFFSET_INPUT + + load_input_vector 2,1 + + load_constant_ptr + save const_addr, STACK_OFFSET_CONST + + // First scalar Keccak computation alongside first half of SIMD computation + load_input_scalar 4,0 + hybrid_round_initial + loop_0: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-1) + ble loop_0 + final_rotate + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4,0 + + // Second scalar Keccak computation alongside second half of SIMD computation + load_input_scalar 4,1 + hybrid_round_initial + loop_1: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-1) + ble loop_1 + final_rotate + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4, 1 + + store_input_vector 2,1 + + restore_vregs + restore_gprs + free_stack + ret + +#endif diff --git a/asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v4p.s b/asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v4p.s new file mode 100644 index 0000000..69a8718
--- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v4p.s @@ -0,0 +1,1026 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +#if defined(__ARM_FEATURE_SHA3) +/* round_constants: table of 24 64-bit words, one .quad per round. The round macros read it through const_addr; presumably load_constant_ptr (not defined in this file) points const_addr at this label. */ +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ +/* count and out_count both name w27. x0, x26, x27 and x29 are handed out again further down as scalar temporaries (s_Aba_, sC and sE aliases), which is why the round code spills count and input_addr to the stack. */ + input_addr .req x0 + const_addr .req x29 + count .req w27 + out_count .req w27 + cur_const .req x26 + + /* Mapping of Keccak-f1600 SIMD state to vector registers + * at the beginning and end of each round.
*/ + + vAba .req v0 + vAbe .req v1 + vAbi .req v2 + vAbo .req v3 + vAbu .req v4 + vAga .req v5 + vAge .req v6 + vAgi .req v7 + vAgo .req v8 + vAgu .req v9 + vAka .req v10 + vAke .req v11 + vAki .req v12 + vAko .req v13 + vAku .req v14 + vAma .req v15 + vAme .req v16 + vAmi .req v17 + vAmo .req v18 + vAmu .req v19 + vAsa .req v20 + vAse .req v21 + vAsi .req v22 + vAso .req v23 + vAsu .req v24 + + /* q-form of the above mapping */ + vAbaq .req q0 + vAbeq .req q1 + vAbiq .req q2 + vAboq .req q3 + vAbuq .req q4 + vAgaq .req q5 + vAgeq .req q6 + vAgiq .req q7 + vAgoq .req q8 + vAguq .req q9 + vAkaq .req q10 + vAkeq .req q11 + vAkiq .req q12 + vAkoq .req q13 + vAkuq .req q14 + vAmaq .req q15 + vAmeq .req q16 + vAmiq .req q17 + vAmoq .req q18 + vAmuq .req q19 + vAsaq .req q20 + vAseq .req q21 + vAsiq .req q22 + vAsoq .req q23 + vAsuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v30 + C1 .req v29 + C2 .req v28 + C3 .req v27 + C4 .req v26 + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4; E0, E2, E3 and E4 reuse the registers of C4, C1, C2 and C3, only E1 (v25) is separate */ + E0 .req v26 + E1 .req v25 + E2 .req v29 + E3 .req v28 + E4 .req v27 + + /* A_[y,2*x+3*y] = rot(A[x,y]); vAba_ shares v30 with C0 and vAbe_ shares v27 with C3 and E4 */ + vAbi_ .req v2 + vAbo_ .req v3 + vAbu_ .req v4 + vAga_ .req v10 + vAge_ .req v11 + vAgi_ .req v7 + vAgo_ .req v8 + vAgu_ .req v9 + vAka_ .req v15 + vAke_ .req v16 + vAki_ .req v12 + vAko_ .req v13 + vAku_ .req v14 + vAma_ .req v20 + vAme_ .req v21 + vAmi_ .req v17 + vAmo_ .req v18 + vAmu_ .req v19 + vAsa_ .req v0 + vAse_ .req v1 + vAsi_ .req v22 + vAso_ .req v23 + vAsu_ .req v24 + vAba_ .req v30 + vAbe_ .req v27 + + /* Scratch register; clobbered by rax1_m1, xar_m1 and bcax_m1 */ + vtmp .req v31 + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round.
*/ + s_Aba .req x1 + sAbe .req x6 + sAbi .req x11 + sAbo .req x16 + sAbu .req x21 + sAga .req x2 + sAge .req x7 + sAgi .req x12 + sAgo .req x17 + sAgu .req x22 + sAka .req x3 + sAke .req x8 + sAki .req x13 + sAko .req x18 + sAku .req x23 + sAma .req x4 + sAme .req x9 + sAmi .req x14 + sAmo .req x19 + sAmu .req x24 + sAsa .req x5 + sAse .req x10 + sAsi .req x15 + sAso .req x20 + sAsu .req x25 + + /* sA_[y,2*x+3*y] = rot(A[x,y]) */ + s_Aba_ .req x0 + sAbe_ .req x28 + sAbi_ .req x11 + sAbo_ .req x16 + sAbu_ .req x21 + sAga_ .req x3 + sAge_ .req x8 + sAgi_ .req x12 + sAgo_ .req x17 + sAgu_ .req x22 + sAka_ .req x4 + sAke_ .req x9 + sAki_ .req x13 + sAko_ .req x18 + sAku_ .req x23 + sAma_ .req x5 + sAme_ .req x10 + sAmi_ .req x14 + sAmo_ .req x19 + sAmu_ .req x24 + sAsa_ .req x1 + sAse_ .req x6 + sAsi_ .req x15 + sAso_ .req x20 + sAsu_ .req x25 + + /* sC[x] = sA[x,0] xor sA[x,1] xor sA[x,2] xor sA[x,3] xor sA[x,4], for x in 0..4 */ + /* sE[x] = sC[x-1] xor rot(sC[x+1],1), for x in 0..4 */ + sC0 .req x0 + sE0 .req x29 + sC1 .req x26 + sE1 .req x30 + sC2 .req x27 + sE2 .req x26 + sC3 .req x28 + sE3 .req x27 + sC4 .req x29 + sE4 .req x28 + + tmp .req x30 + +/************************ MACROS ****************************/ + +/* SIMD building blocks in two flavours. The _m0 macros are thin wrappers around the v8.4-A SHA-3 instructions eor3, rax1, xar and bcax. The _m1 macros compute the same results with plain ASIMD instructions (eor, add, sri, shl, bic); rax1_m1, xar_m1 and bcax_m1 overwrite vtmp. */ + + +.macro eor3_m1 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro rax1_m1 d s0 s1 + add vtmp.2d, \s1\().2d, \s1\().2d + sri vtmp.2d, \s1\().2d, #63 + eor \d\().16b, vtmp.16b, \s0\().16b +.endm + +.macro xar_m1 d s0 s1 imm + eor vtmp.16b, \s0\().16b, \s1\().16b + shl \d\().2d, vtmp.2d, #(64-\imm) + sri \d\().2d, vtmp.2d, #(\imm) +.endm + +.macro bcax_m1 d s0 s1 s2 + bic vtmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, vtmp.16b, \s0\().16b +.endm + +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm + xar \d\().2d,
\s0\().2d, \s1\().2d, #\imm +.endm + +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +/* load_input_vector: fill the 25 q-form state registers from memory at input_addr. Register number i is loaded with the 16 bytes at byte offset 32*i. */ +.macro load_input_vector + ldr vAbaq, [input_addr, #(32*0)] + ldr vAbeq, [input_addr, #(32*0+32)] + ldr vAbiq, [input_addr, #(32*2)] + ldr vAboq, [input_addr, #(32*2+32)] + ldr vAbuq, [input_addr, #(32*4)] + ldr vAgaq, [input_addr, #(32*4+32)] + ldr vAgeq, [input_addr, #(32*6)] + ldr vAgiq, [input_addr, #(32*6+32)] + ldr vAgoq, [input_addr, #(32*8)] + ldr vAguq, [input_addr, #(32*8+32)] + ldr vAkaq, [input_addr, #(32*10)] + ldr vAkeq, [input_addr, #(32*10+32)] + ldr vAkiq, [input_addr, #(32*12)] + ldr vAkoq, [input_addr, #(32*12+32)] + ldr vAkuq, [input_addr, #(32*14)] + ldr vAmaq, [input_addr, #(32*14+32)] + ldr vAmeq, [input_addr, #(32*16)] + ldr vAmiq, [input_addr, #(32*16+32)] + ldr vAmoq, [input_addr, #(32*18)] + ldr vAmuq, [input_addr, #(32*18+32)] + ldr vAsaq, [input_addr, #(32*20)] + ldr vAseq, [input_addr, #(32*20+32)] + ldr vAsiq, [input_addr, #(32*22)] + ldr vAsoq, [input_addr, #(32*22+32)] + ldr vAsuq, [input_addr, #(32*24)] +.endm +/* store_input_vector: inverse of load_input_vector; writes the same 25 q registers back to the same offsets. */ +.macro store_input_vector + str vAbaq, [input_addr, #(32*0)] + str vAbeq, [input_addr, #(32*0+32)] + str vAbiq, [input_addr, #(32*2)] + str vAboq, [input_addr, #(32*2+32)] + str vAbuq, [input_addr, #(32*4)] + str vAgaq, [input_addr, #(32*4+32)] + str vAgeq, [input_addr, #(32*6)] + str vAgiq, [input_addr, #(32*6+32)] + str vAgoq, [input_addr, #(32*8)] + str vAguq, [input_addr, #(32*8+32)] + str vAkaq, [input_addr, #(32*10)] + str vAkeq, [input_addr, #(32*10+32)] + str vAkiq, [input_addr, #(32*12)] + str vAkoq, [input_addr, #(32*12+32)] + str vAkuq, [input_addr, #(32*14)] + str vAmaq, [input_addr, #(32*14+32)] + str vAmeq, [input_addr, #(32*16)] + str vAmiq, [input_addr, #(32*16+32)] + str vAmoq, [input_addr, #(32*18)] + str vAmuq, [input_addr, #(32*18+32)] + str vAsaq, [input_addr, #(32*20)] + str vAseq, [input_addr, #(32*20+32)] + str vAsiq, [input_addr, #(32*22)] + str vAsoq, [input_addr,
#(32*22+32)] + str vAsuq, [input_addr, #(32*24)] +.endm +/* store_input_scalar: write the 25 scalar state registers as 64-bit words, word i at byte offset 32*i from input_addr. */ +.macro store_input_scalar + str s_Aba,[input_addr, 32*0 ] + str sAbe, [input_addr, 32*1 ] + str sAbi, [input_addr, 32*2 ] + str sAbo, [input_addr, 32*3 ] + str sAbu, [input_addr, 32*4 ] + str sAga, [input_addr, 32*5 ] + str sAge, [input_addr, 32*6 ] + str sAgi, [input_addr, 32*7 ] + str sAgo, [input_addr, 32*8 ] + str sAgu, [input_addr, 32*9 ] + str sAka, [input_addr, 32*10] + str sAke, [input_addr, 32*11] + str sAki, [input_addr, 32*12] + str sAko, [input_addr, 32*13] + str sAku, [input_addr, 32*14] + str sAma, [input_addr, 32*15] + str sAme, [input_addr, 32*16] + str sAmi, [input_addr, 32*17] + str sAmo, [input_addr, 32*18] + str sAmu, [input_addr, 32*19] + str sAsa, [input_addr, 32*20] + str sAse, [input_addr, 32*21] + str sAsi, [input_addr, 32*22] + str sAso, [input_addr, 32*23] + str sAsu, [input_addr, 32*24] +.endm +/* load_input_scalar: inverse of store_input_scalar; reads the same 25 words from the same offsets. s_Aba_ and sC0 alias input_addr (x0), so the pointer does not survive a round. */ +.macro load_input_scalar + ldr s_Aba,[input_addr, 32*0 ] + ldr sAbe, [input_addr, 32*1 ] + ldr sAbi, [input_addr, 32*2 ] + ldr sAbo, [input_addr, 32*3 ] + ldr sAbu, [input_addr, 32*4 ] + ldr sAga, [input_addr, 32*5 ] + ldr sAge, [input_addr, 32*6 ] + ldr sAgi, [input_addr, 32*7 ] + ldr sAgo, [input_addr, 32*8 ] + ldr sAgu, [input_addr, 32*9 ] + ldr sAka, [input_addr, 32*10] + ldr sAke, [input_addr, 32*11] + ldr sAki, [input_addr, 32*12] + ldr sAko, [input_addr, 32*13] + ldr sAku, [input_addr, 32*14] + ldr sAma, [input_addr, 32*15] + ldr sAme, [input_addr, 32*16] + ldr sAmi, [input_addr, 32*17] + ldr sAmo, [input_addr, 32*18] + ldr sAmu, [input_addr, 32*19] + ldr sAsa, [input_addr, 32*20] + ldr sAse, [input_addr, 32*21] + ldr sAsi, [input_addr, 32*22] + ldr sAso, [input_addr, 32*23] + ldr sAsu, [input_addr, 32*24] +.endm +/* Stack frame, lowest address first: 4*16 bytes for d8-d15 (STACK_BASE_VREGS), 12*8 bytes for x19-x30 (STACK_BASE_GPRS), then 6*8 bytes of scratch slots (STACK_BASE_TMP_GPRS) selected by the STACK_OFFSET_ values. */ +#define STACK_SIZE (4*16 + 12*8 + 6*8) +#define STACK_BASE_VREGS (0) +#define STACK_BASE_GPRS (4*16) +#define STACK_BASE_TMP_GPRS (4*16 + 12*8) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) +#define
STACK_OFFSET_COUNT_OUT (3*8) +#define STACK_OFFSET_CUR_INPUT (4*8) +/* save_gprs and restore_gprs: spill and reload x19-x30 in the frame area at STACK_BASE_GPRS. */ +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm +/* save_vregs and restore_vregs: spill and reload d8-d15 in the frame area at STACK_BASE_VREGS. */ +.macro save_vregs + stp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] + stp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + stp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + stp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] +.endm + +.macro restore_vregs + ldp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] + ldp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + ldp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + ldp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] +.endm +/* alloc_stack and free_stack: move sp down and back up by STACK_SIZE bytes. */ +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm +/* eor5: dst = src0 xor src1 xor src2 xor src3 xor src4. dst is written before src2, src3 and src4 are read, so it must not alias any of those three. */ +.macro eor5 dst, src0, src1, src2, src3, src4 + eor \dst, \src0, \src1 + eor \dst, \dst, \src2 + eor \dst, \dst, \src3 + eor \dst, \dst, \src4 +.endm +/* xor_rol: dst = src0 xor rol(src1, imm), encoded as ROR by 64-imm. Note the parameter order: src1 is listed before src0. */ +.macro xor_rol dst, src1, src0, imm + eor \dst, \src0, \src1, ROR #(64-\imm) +.endm +/* bic_rol: dst = src0 and not rol(src1, imm); same parameter order as xor_rol. */ +.macro bic_rol dst, src1, src0, imm + bic \dst, \src0, \src1, ROR #(64-\imm) +.endm +/* rotate: dst = rol(src, imm). */ +.macro rotate dst, src, imm + ror \dst, \src, #(64-\imm) +.endm +/* save and restore: store or load one register in the scratch slot at STACK_BASE_TMP_GPRS plus offset. */ +.macro save reg, offset + str \reg, [sp, #(STACK_BASE_TMP_GPRS + \offset)] +.endm + +.macro restore reg, offset + ldr \reg, [sp, #(STACK_BASE_TMP_GPRS + \offset)] +.endm + +.macro hybrid_round_initial +/* Each source line carries scalar code before SEP and at most one SIMD-side statement after it. SEP is not defined in this file; presumably it is a statement separator from macros.s. This macro reads the first scalar round constant from [const_addr] and sets count to 1 before later constants are indexed by count. */ + eor sC0, sAma, sAsa SEP eor3_m1 C0, vAba, vAga, vAka + eor sC1, sAme, sAse SEP + eor sC2, sAmi, sAsi SEP + eor sC3, sAmo, sAso SEP eor3_m0 C0, C0, vAma, vAsa +
eor sC4, sAmu, sAsu SEP + eor sC0, sAka, sC0 SEP + eor sC1, sAke, sC1 SEP eor3_m1 C1, vAbe, vAge, vAke + eor sC2, sAki, sC2 SEP + eor sC3, sAko, sC3 SEP + eor sC4, sAku, sC4 SEP eor3_m0 C1, C1, vAme, vAse + eor sC0, sAga, sC0 SEP + eor sC1, sAge, sC1 SEP + eor sC2, sAgi, sC2 SEP eor3_m1 C2, vAbi, vAgi, vAki + eor sC3, sAgo, sC3 SEP + eor sC4, sAgu, sC4 SEP + eor sC0, s_Aba, sC0 SEP eor3_m0 C2, C2, vAmi, vAsi + eor sC1, sAbe, sC1 SEP + eor sC2, sAbi, sC2 SEP + eor sC3, sAbo, sC3 SEP eor3_m1 C3, vAbo, vAgo, vAko + eor sC4, sAbu, sC4 SEP + SEP + eor sE1, sC0, sC2, ROR #63 SEP eor3_m0 C3, C3, vAmo, vAso + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP eor3_m1 C4, vAbu, vAgu, vAku + eor sE4, sC3, sC0, ROR #63 SEP + SEP + eor s_Aba_, s_Aba, sE0 SEP eor3_m0 C4, C4, vAmu, vAsu + eor sAsa_, sAbi, sE2 SEP + eor sAbi_, sAki, sE2 SEP + eor sAki_, sAko, sE3 SEP + eor sAko_, sAmu, sE4 SEP rax1_m1 E1, C0, C2 + eor sAmu_, sAso, sE3 SEP + eor sAso_, sAma, sE0 SEP + eor sAka_, sAbe, sE1 SEP rax1_m0 E3, C2, C4 + eor sAse_, sAgo, sE3 SEP + eor sAgo_, sAme, sE1 SEP + eor sAke_, sAgi, sE2 SEP rax1_m1 E0, C4, C1 + eor sAgi_, sAka, sE0 SEP + eor sAga_, sAbo, sE3 SEP + eor sAbo_, sAmo, sE3 SEP rax1_m0 E2, C1, C3 + eor sAmo_, sAmi, sE2 SEP + eor sAmi_, sAke, sE1 SEP + eor sAge_, sAgu, sE4 SEP rax1_m1 E4, C3, C0 + eor sAgu_, sAsi, sE2 SEP + eor sAsi_, sAku, sE4 SEP + eor sAku_, sAsa, sE0 SEP + eor sAma_, sAbu, sE4 SEP eor vAba_.16b, vAba.16b, E0.16b + eor sAbu_, sAsu, sE4 SEP + eor sAsu_, sAse, sE1 SEP + eor sAme_, sAga, sE0 SEP xar_m0 vAsa_, vAbi, E2, 2 + eor sAbe_, sAge, sE1 SEP + SEP + load_constant_ptr SEP xar_m1 vAbi_, vAki, E2, 21 + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP xar_m0 vAki_, vAko, E3, 39 + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP xar_m1 vAko_, vAmu, E4, 56 + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 
SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP xar_m0 vAmu_, vAso, E3, 8 + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP xar_m1 vAso_, vAma, E0, 23 + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP xar_m0 vAka_, vAbe, E1, 63 + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP xar_m1 vAse_, vAgo, E3, 9 + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP xar_m0 vAgo_, vAme, E1, 19 + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP xar_m1 vAke_, vAgi, E2, 58 + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP xar_m0 vAgi_, vAka, E0, 61 + SEP + ldr cur_const, [const_addr] SEP + mov count, #1 SEP xar_m1 vAga_, vAbo, E3, 36 + SEP + bic tmp, sAma_, sAmu_, ROR #35 SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP xar_m0 vAbo_, vAmo, E3, 43 + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP xar_m1 vAmo_, vAmi, E2, 49 + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP xar_m0 vAmi_, vAke, E1, 54 + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP xar_m1 vAge_, vAgu, E4, 44 + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP xar_m0 vAgu_, vAsi, E2, 3 + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP xar_m1 vAsi_, vAku, E4, 25 + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP xar_m0 vAku_, vAsa, E0, 46 + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 
SEP xar_m1 vAma_, vAbu, E4, 37 + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP xar_m0 vAbu_, vAsu, E4, 50 + SEP + save count, STACK_OFFSET_COUNT SEP + SEP xar_m1 vAsu_, vAse, E1, 62 + eor sC0, sAka, sAsa, ROR #50 SEP + eor sC1, sAse, sAge, ROR #60 SEP + eor sC2, sAmi, sAgi, ROR #59 SEP xar_m0 vAme_, vAga, E0, 28 + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP + eor sC0, sAma, sC0, ROR #49 SEP xar_m1 vAbe_, vAge, E1, 20 + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP restore sE1, STACK_OFFSET_CONST + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP + eor sC0, sAga, sC0, ROR #57 SEP ld1r {v28.2d}, [sE1], #8 + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor sC3, sAko, sC3, ROR #38 SEP save sE1, STACK_OFFSET_CONST + eor sC4, sAgu, sC4, ROR #48 SEP + eor sC0, s_Aba, sC0, ROR #61 SEP bcax_m0 vAga, vAga_, vAgi_, vAge_ + eor sC1, sAke, sC1, ROR #57 SEP + eor sC2, sAsi, sC2, ROR #52 SEP + eor sC3, sAbo, sC3, ROR #63 SEP bcax_m1 vAge, vAge_, vAgo_, vAgi_ + eor sC4, sAku, sC4, ROR #50 SEP + ror sC1, sC1, 56 SEP + ror sC4, sC4, 58 SEP bcax_m0 vAgi, vAgi_, vAgu_, vAgo_ + ror sC2, sC2, 62 SEP + SEP + eor sE1, sC0, sC2, ROR #63 SEP bcax_m1 vAgo, vAgo_, vAga_, vAgu_ + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP bcax_m0 vAgu, vAgu_, vAge_, vAga_ + eor sE4, sC3, sC0, ROR #63 SEP + SEP + eor s_Aba_, sE0, s_Aba SEP bcax_m1 vAka, vAka_, vAki_, vAke_ + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP + eor sAki_, sE3, sAko, ROR #63 SEP bcax_m0 vAke, vAke_, vAko_, vAki_ + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP + eor sAso_, sE0, sAma, ROR #54 SEP bcax_m1 vAki, vAki_, vAku_, vAko_ + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP + eor sAgo_, sE1, sAme, ROR #49 SEP bcax_m0 vAko, vAko_, vAka_, vAku_ + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, 
ROR #39 SEP + eor sAga_, sE3, sAbo SEP bcax_m1 vAku, vAku_, vAke_, vAka_ + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP + eor sAmi_, sE1, sAke, ROR #56 SEP bcax_m0 vAma, vAma_, vAmi_, vAme_ + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP + eor sAsi_, sE4, sAku, ROR #58 SEP bcax_m1 vAme, vAme_, vAmo_, vAmi_ + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP + eor sAbu_, sE4, sAsu, ROR #9 SEP bcax_m0 vAmi, vAmi_, vAmu_, vAmo_ + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP + eor sAbe_, sE1, sAge, ROR #19 SEP bcax_m1 vAmo, vAmo_, vAma_, vAmu_ + SEP + load_constant_ptr SEP + restore count, STACK_OFFSET_COUNT SEP bcax_m0 vAmu, vAmu_, vAme_, vAma_ + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP bcax_m1 vAsa, vAsa_, vAsi_, vAse_ + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP bcax_m0 vAse, vAse_, vAso_, vAsi_ + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP bcax_m1 vAsi, vAsi_, vAsu_, vAso_ + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP bcax_m0 vAso, vAso_, vAsa_, vAsu_ + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP bcax_m1 vAsu, vAsu_, vAse_, vAsa_ + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP bcax_m0 vAba, vAba_, vAbi_, vAbe_ + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP bcax_m1 vAbe, vAbe_, vAbo_, vAbi_ + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP bcax_m0 vAbi, vAbi_, vAbu_, vAbo_ + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP bcax_m1 vAbo, vAbo_, vAba_, 
vAbu_ + bic tmp, sAma_, sAmu_, ROR #35 SEP + SEP + ldr cur_const, [const_addr, count, UXTW #3] SEP bcax_m0 vAbu, vAbu_, vAbe_, vAba_ + SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP eor vAba.16b, vAba.16b, v28.16b + bic tmp, sAsi_, sAse_, ROR #48 SEP + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + add count, count, #1 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP + SEP +.endm + +.macro hybrid_round_noninitial + save count, STACK_OFFSET_COUNT SEP eor3_m1 C0, vAba, vAga, vAka + SEP + eor sC0, sAka, sAsa, ROR #50 SEP + eor sC1, sAse, sAge, ROR #60 SEP eor3_m0 C0, C0, vAma, vAsa + eor sC2, sAmi, sAgi, ROR #59 SEP + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP eor3_m1 C1, vAbe, vAge, vAke + eor sC0, sAma, sC0, ROR #49 SEP + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP eor3_m0 C1, C1, vAme, vAse + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP + eor sC0, sAga, sC0, ROR #57 SEP eor3_m1 C2, vAbi, vAgi, vAki + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor sC3, sAko, sC3, ROR #38 SEP eor3_m0 C2, C2, vAmi, vAsi + eor sC4, sAgu, sC4, ROR #48 SEP + eor sC0, s_Aba, sC0, ROR #61 SEP + eor sC1, sAke, sC1, ROR #57 SEP eor3_m1 C3, vAbo, vAgo, vAko + eor sC2, sAsi, sC2, ROR #52 SEP + eor 
sC3, sAbo, sC3, ROR #63 SEP + eor sC4, sAku, sC4, ROR #50 SEP eor3_m0 C3, C3, vAmo, vAso + ror sC1, sC1, 56 SEP + ror sC4, sC4, 58 SEP + ror sC2, sC2, 62 SEP eor3_m1 C4, vAbu, vAgu, vAku + SEP + eor sE1, sC0, sC2, ROR #63 SEP + eor sE3, sC2, sC4, ROR #63 SEP eor3_m0 C4, C4, vAmu, vAsu + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP + eor sE4, sC3, sC0, ROR #63 SEP + SEP rax1_m1 E1, C0, C2 + eor s_Aba_, sE0, s_Aba SEP + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP rax1_m0 E3, C2, C4 + eor sAki_, sE3, sAko, ROR #63 SEP + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP rax1_m1 E0, C4, C1 + eor sAso_, sE0, sAma, ROR #54 SEP + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP rax1_m0 E2, C1, C3 + eor sAgo_, sE1, sAme, ROR #49 SEP + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, ROR #39 SEP rax1_m1 E4, C3, C0 + eor sAga_, sE3, sAbo SEP + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP + eor sAmi_, sE1, sAke, ROR #56 SEP eor vAba_.16b, vAba.16b, E0.16b + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP + eor sAsi_, sE4, sAku, ROR #58 SEP xar_m0 vAsa_, vAbi, E2, 2 + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP + eor sAbu_, sE4, sAsu, ROR #9 SEP xar_m1 vAbi_, vAki, E2, 21 + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP + eor sAbe_, sE1, sAge, ROR #19 SEP xar_m0 vAki_, vAko, E3, 39 + SEP + load_constant_ptr SEP + restore count, STACK_OFFSET_COUNT SEP xar_m1 vAko_, vAmu, E4, 56 + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP xar_m0 vAmu_, vAso, E3, 8 + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP xar_m1 vAso_, vAma, E0, 23 + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP xar_m0 vAka_, vAbe, E1, 63 + bic tmp, sAge_, sAga_, ROR #56 SEP + eor 
sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP xar_m1 vAse_, vAgo, E3, 9 + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP xar_m0 vAgo_, vAme, E1, 19 + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP xar_m1 vAke_, vAgi, E2, 58 + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP xar_m0 vAgi_, vAka, E0, 61 + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP xar_m1 vAga_, vAbo, E3, 36 + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP xar_m0 vAbo_, vAmo, E3, 43 + bic tmp, sAma_, sAmu_, ROR #35 SEP + SEP + ldr cur_const, [const_addr, count, UXTW #3] SEP xar_m1 vAmo_, vAmi, E2, 49 + add count, count, #1 SEP + SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP xar_m0 vAmi_, vAke, E1, 54 + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP xar_m1 vAge_, vAgu, E4, 44 + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP xar_m0 vAgu_, vAsi, E2, 3 + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP xar_m1 vAsi_, vAku, E4, 25 + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP xar_m0 vAku_, vAsa, E0, 46 + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP xar_m1 vAma_, vAbu, E4, 37 + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP xar_m0 vAbu_, vAsu, E4, 50 + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP xar_m1 vAsu_, vAse, E1, 62 + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + eor s_Aba, 
s_Aba, cur_const SEP xar_m0 vAme_, vAga, E0, 28 + save count, STACK_OFFSET_COUNT SEP + SEP + eor sC0, sAka, sAsa, ROR #50 SEP xar_m1 vAbe_, vAge, E1, 20 + eor sC1, sAse, sAge, ROR #60 SEP + eor sC2, sAmi, sAgi, ROR #59 SEP + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP restore sE1, STACK_OFFSET_CONST + eor sC0, sAma, sC0, ROR #49 SEP + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP ld1r {v28.2d}, [sE1], #8 + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP + eor sC0, sAga, sC0, ROR #57 SEP save sE1, STACK_OFFSET_CONST + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor sC3, sAko, sC3, ROR #38 SEP + eor sC4, sAgu, sC4, ROR #48 SEP bcax_m0 vAga, vAga_, vAgi_, vAge_ + eor sC0, s_Aba, sC0, ROR #61 SEP + eor sC1, sAke, sC1, ROR #57 SEP + eor sC2, sAsi, sC2, ROR #52 SEP bcax_m1 vAge, vAge_, vAgo_, vAgi_ + eor sC3, sAbo, sC3, ROR #63 SEP + eor sC4, sAku, sC4, ROR #50 SEP + ror sC1, sC1, 56 SEP bcax_m0 vAgi, vAgi_, vAgu_, vAgo_ + ror sC4, sC4, 58 SEP + ror sC2, sC2, 62 SEP + SEP bcax_m1 vAgo, vAgo_, vAga_, vAgu_ + eor sE1, sC0, sC2, ROR #63 SEP + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP bcax_m0 vAgu, vAgu_, vAge_, vAga_ + eor sE2, sC1, sC3, ROR #63 SEP + eor sE4, sC3, sC0, ROR #63 SEP + SEP bcax_m1 vAka, vAka_, vAki_, vAke_ + eor s_Aba_, sE0, s_Aba SEP + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP bcax_m0 vAke, vAke_, vAko_, vAki_ + eor sAki_, sE3, sAko, ROR #63 SEP + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP bcax_m1 vAki, vAki_, vAku_, vAko_ + eor sAso_, sE0, sAma, ROR #54 SEP + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP bcax_m0 vAko, vAko_, vAka_, vAku_ + eor sAgo_, sE1, sAme, ROR #49 SEP + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, ROR #39 SEP bcax_m1 vAku, vAku_, vAke_, vAka_ + eor sAga_, sE3, sAbo SEP + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP 
bcax_m0 vAma, vAma_, vAmi_, vAme_ + eor sAmi_, sE1, sAke, ROR #56 SEP + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP bcax_m1 vAme, vAme_, vAmo_, vAmi_ + eor sAsi_, sE4, sAku, ROR #58 SEP + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP bcax_m0 vAmi, vAmi_, vAmu_, vAmo_ + eor sAbu_, sE4, sAsu, ROR #9 SEP + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP bcax_m1 vAmo, vAmo_, vAma_, vAmu_ + eor sAbe_, sE1, sAge, ROR #19 SEP + SEP + load_constant_ptr SEP bcax_m0 vAmu, vAmu_, vAme_, vAma_ + restore count, STACK_OFFSET_COUNT SEP + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP bcax_m1 vAsa, vAsa_, vAsi_, vAse_ + eor sAga, tmp, sAga_, ROR #39 SEP + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP bcax_m0 vAse, vAse_, vAso_, vAsi_ + bic tmp, sAgu_, sAgo_, ROR #16 SEP + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP bcax_m1 vAsi, vAsi_, vAsu_, vAso_ + eor sAgo, tmp, sAgo_, ROR #47 SEP + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP bcax_m0 vAso, vAso_, vAsa_, vAsu_ + bic tmp, sAki_, sAke_, ROR #19 SEP + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP bcax_m1 vAsu, vAsu_, vAse_, vAsa_ + eor sAke, tmp, sAke_, ROR #2 SEP + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP bcax_m0 vAba, vAba_, vAbi_, vAbe_ + bic tmp, sAka_, sAku_, ROR #47 SEP + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP bcax_m1 vAbe, vAbe_, vAbo_, vAbi_ + eor sAku, tmp, sAku_, ROR #52 SEP + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP bcax_m0 vAbi, vAbi_, vAbu_, vAbo_ + bic tmp, sAmo_, sAmi_, ROR #5 SEP + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP bcax_m1 vAbo, vAbo_, vAba_, vAbu_ + eor sAmi, tmp, sAmi_, ROR #46 SEP + bic tmp, sAma_, sAmu_, ROR #35 SEP + SEP bcax_m0 vAbu, vAbu_, vAbe_, vAba_ + ldr cur_const, [const_addr, count, UXTW #3] SEP + add count, 
count, #1 SEP + SEP eor vAba.16b, vAba.16b, v28.16b + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP + +.endm + +.macro final_rotate + ror sAga, sAga,#(64-3) + ror sAka, sAka,#(64-25) + ror sAma, sAma,#(64-10) + ror sAsa, sAsa,#(64-39) + ror sAbe, sAbe,#(64-21) + ror sAge, sAge,#(64-45) + ror sAke, sAke,#(64-8) + ror sAme, sAme,#(64-15) + ror sAse, sAse,#(64-41) + ror sAbi, sAbi,#(64-14) + ror sAgi, sAgi,#(64-61) + ror sAki, sAki,#(64-18) + ror sAmi, sAmi,#(64-56) + ror sAsi, sAsi,#(64-2) + ror sAgo, sAgo,#(64-28) + ror sAko, sAko,#(64-1) + ror sAmo, sAmo,#(64-27) + ror sAso, sAso,#(64-62) + ror sAbu, sAbu,#(64-44) + ror sAgu, sAgu,#(64-20) + ror sAku, sAku,#(64-6) + ror sAmu, sAmu,#(64-36) + ror sAsu, sAsu,#(64-55) +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.global keccak_f1600_x4_hybrid_asm_v4p +.global _keccak_f1600_x4_hybrid_asm_v4p +.text +.align 4 + +keccak_f1600_x4_hybrid_asm_v4p: +_keccak_f1600_x4_hybrid_asm_v4p: + alloc_stack + save_gprs + save_vregs + save input_addr, STACK_OFFSET_INPUT + + ASM_LOAD(const_addr,round_constants) + save const_addr, STACK_OFFSET_CONST + + load_input_vector + + add input_addr, input_addr, #16 + + mov 
out_count, #0 +outer_loop: + save out_count, STACK_OFFSET_COUNT_OUT + + load_input_scalar + save input_addr, STACK_OFFSET_CUR_INPUT + + hybrid_round_initial +1: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS) + blt 1b + + final_rotate + restore input_addr, STACK_OFFSET_CUR_INPUT + store_input_scalar + add input_addr, input_addr, #8 + + restore out_count, STACK_OFFSET_COUNT_OUT + add out_count, out_count, #1 + cmp out_count, #2 + blt outer_loop + + restore input_addr, STACK_OFFSET_INPUT + store_input_vector + + restore_vregs + restore_gprs + free_stack + ret + +#endif diff --git a/asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v5.s b/asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v5.s new file mode 100644 index 0000000..8a16f20 --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v5.s @@ -0,0 +1,1360 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +round_constants_vec: + .quad 0x0000000000000001 + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 
0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + .quad 0x8000000080008008 +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x29 + count .req w27 + cur_const .req x26 + + /* Mapping of Keccak-f1600 SIMD state to vector registers + * at the beginning and end of each round. */ + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. */ + vAba .req v0 + vAbe .req v1 + vAbi .req v2 + vAbo .req v3 + vAbu .req v4 + vAga .req v5 + vAge .req v6 + vAgi .req v7 + vAgo .req v8 + vAgu .req v9 + vAka .req v10 + vAke .req v11 + vAki .req v12 + vAko .req v13 + vAku .req v14 + vAma .req v15 + vAme .req v16 + vAmi .req v17 + vAmo .req v18 + vAmu .req v19 + vAsa .req v20 + vAse .req v21 + vAsi .req v22 + vAso .req v23 + vAsu .req v24 + + /* q-form of the above mapping */ + vAbaq .req q0 + vAbeq .req q1 + vAbiq .req q2 + vAboq .req q3 + vAbuq .req q4 + vAgaq .req q5 + vAgeq .req q6 + vAgiq .req q7 + vAgoq .req q8 + vAguq .req q9 + vAkaq .req q10 + vAkeq .req q11 + vAkiq .req q12 + vAkoq .req q13 + vAkuq .req q14 + vAmaq .req q15 + vAmeq .req q16 + vAmiq .req q17 + vAmoq .req q18 + vAmuq .req q19 + vAsaq .req q20 + vAseq .req q21 + vAsiq .req q22 + vAsoq .req q23 + vAsuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v27 + C1 .req v28 + C2 .req v29 + C3 .req v30 + C4 .req v31 + + C0q .req q27 + C1q .req q28 + C2q .req q29 + C3q .req q30 + C4q .req q31 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vBba .req v25 // fresh + vBbe .req v26 // fresh + vBbi .req vAbi + vBbo .req 
vAbo + vBbu .req vAbu + vBga .req vAka + vBge .req vAke + vBgi .req vAgi + vBgo .req vAgo + vBgu .req vAgu + vBka .req vAma + vBke .req vAme + vBki .req vAki + vBko .req vAko + vBku .req vAku + vBma .req vAsa + vBme .req vAse + vBmi .req vAmi + vBmo .req vAmo + vBmu .req vAmu + vBsa .req vAba + vBse .req vAbe + vBsi .req vAsi + vBso .req vAso + vBsu .req vAsu + + vBbaq .req q25 // fresh + vBbeq .req q26 // fresh + vBbiq .req vAbiq + vBboq .req vAboq + vBbuq .req vAbuq + vBgaq .req vAkaq + vBgeq .req vAkeq + vBgiq .req vAgiq + vBgoq .req vAgoq + vBguq .req vAguq + vBkaq .req vAmaq + vBkeq .req vAmeq + vBkiq .req vAkiq + vBkoq .req vAkoq + vBkuq .req vAkuq + vBmaq .req vAsaq + vBmeq .req vAseq + vBmiq .req vAmiq + vBmoq .req vAmoq + vBmuq .req vAmuq + vBsaq .req vAbaq + vBseq .req vAbeq + vBsiq .req vAsiq + vBsoq .req vAsoq + vBsuq .req vAsuq + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req C4 + E1 .req C0 + E2 .req vBbe // fresh + E3 .req C2 + E4 .req C3 + + E0q .req C4q + E1q .req C0q + E2q .req vBbeq // fresh + E3q .req C2q + E4q .req C3q + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round. 
*/ + s_Aba .req x1 + sAbe .req x6 + sAbi .req x11 + sAbo .req x16 + sAbu .req x21 + sAga .req x2 + sAge .req x7 + sAgi .req x12 + sAgo .req x17 + sAgu .req x22 + sAka .req x3 + sAke .req x8 + sAki .req x13 + sAko .req x18 + sAku .req x23 + sAma .req x4 + sAme .req x9 + sAmi .req x14 + sAmo .req x19 + sAmu .req x24 + sAsa .req x5 + sAse .req x10 + sAsi .req x15 + sAso .req x20 + sAsu .req x25 + + /* sA_[y,2*x+3*y] = rot(A[x,y]) */ + s_Aba_ .req x0 + sAbe_ .req x28 + sAbi_ .req x11 + sAbo_ .req x16 + sAbu_ .req x21 + sAga_ .req x3 + sAge_ .req x8 + sAgi_ .req x12 + sAgo_ .req x17 + sAgu_ .req x22 + sAka_ .req x4 + sAke_ .req x9 + sAki_ .req x13 + sAko_ .req x18 + sAku_ .req x23 + sAma_ .req x5 + sAme_ .req x10 + sAmi_ .req x14 + sAmo_ .req x19 + sAmu_ .req x24 + sAsa_ .req x1 + sAse_ .req x6 + sAsi_ .req x15 + sAso_ .req x20 + sAsu_ .req x25 + + /* sC[x] = sA[x,0] xor sA[x,1] xor sA[x,2] xor sA[x,3] xor sA[x,4], for x in 0..4 */ + /* sE[x] = sC[x-1] xor rot(sC[x+1],1), for x in 0..4 */ + sC0 .req x0 + sE0 .req x29 + sC1 .req x26 + sE1 .req x30 + sC2 .req x27 + sE2 .req x26 + sC3 .req x28 + sE3 .req x27 + sC4 .req x29 + sE4 .req x28 + + tmp .req x30 + +/************************ MACROS ****************************/ + +/* Macros emulating the v8.4-A SHA-3 instructions (EOR3, RAX1, XAR, BCAX) using only EOR/ADD/SHL/SRI/BIC vector instructions */ + +.macro eor3_m1_0 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m1_1 d s0 s1 s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm + + +.macro eor3_m1 d s0 s1 s2 + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +.macro rax1_m1 d s0 s1 + // Use add instead of SHL #1 + add vvtmp.2d, \s1\().2d, \s1\().2d + sri vvtmp.2d, \s1\().2d, #63 + eor \d\().16b, vvtmp.16b, \s0\().16b +.endm + + .macro xar_m1 d s0 s1 imm + // NOTE: s0 is overwritten with s0 ^ s1 before being rotated into d. Special cases where we can replace SHLs by ADDs + .if \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + .elseif \imm 
== 62 + eor \s0\().16b, \s0\().16b, \s1\().16b + add \d\().2d, \s0\().2d, \s0\().2d + add \d\().2d, \d\().2d, \d\().2d + sri \d\().2d, \s0\().2d, #(62) + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + + .macro xar_m1_0 d s0 s1 imm + // First half of xar_m1: the XOR step only (every branch below emits the same EOR into s0) + .if \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + .elseif \imm == 62 + eor \s0\().16b, \s0\().16b, \s1\().16b + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + .endif +.endm + + .macro xar_m1_1 d s0 s1 imm + // Second half of xar_m1: the rotate step. Special cases where we can replace SHLs by ADDs + .if \imm == 63 + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + .elseif \imm == 62 + add \d\().2d, \s0\().2d, \s0\().2d + add \d\().2d, \d\().2d, \d\().2d + sri \d\().2d, \s0\().2d, #(62) + .else + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + +.macro bcax_m1 d s0 s1 s2 + bic vvtmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, vvtmp.16b, \s0\().16b +.endm + +.macro load_input_vector num idx + ldr vAbaq, [input_addr, #(16*(\num*0+\idx))] + ldr vAbeq, [input_addr, #(16*(\num*1+\idx))] + ldr vAbiq, [input_addr, #(16*(\num*2+\idx))] + ldr vAboq, [input_addr, #(16*(\num*3+\idx))] + ldr vAbuq, [input_addr, #(16*(\num*4+\idx))] + ldr vAgaq, [input_addr, #(16*(\num*5+\idx))] + ldr vAgeq, [input_addr, #(16*(\num*6+\idx))] + ldr vAgiq, [input_addr, #(16*(\num*7+\idx))] + ldr vAgoq, [input_addr, #(16*(\num*8+\idx))] + ldr vAguq, [input_addr, #(16*(\num*9+\idx))] + ldr vAkaq, [input_addr, #(16*(\num*10+\idx))] + ldr vAkeq, [input_addr, #(16*(\num*11+\idx))] + ldr vAkiq, [input_addr, #(16*(\num*12+\idx))] + ldr vAkoq, [input_addr, #(16*(\num*13+\idx))] + ldr vAkuq, [input_addr, #(16*(\num*14+\idx))] + ldr vAmaq, [input_addr, #(16*(\num*15+\idx))] + ldr vAmeq, [input_addr, #(16*(\num*16+\idx))] + ldr vAmiq, [input_addr, #(16*(\num*17+\idx))] + ldr vAmoq, [input_addr, 
#(16*(\num*18+\idx))] + ldr vAmuq, [input_addr, #(16*(\num*19+\idx))] + ldr vAsaq, [input_addr, #(16*(\num*20+\idx))] + ldr vAseq, [input_addr, #(16*(\num*21+\idx))] + ldr vAsiq, [input_addr, #(16*(\num*22+\idx))] + ldr vAsoq, [input_addr, #(16*(\num*23+\idx))] + ldr vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_vector num idx + str vAbaq, [input_addr, #(16*(\num*0+\idx))] + str vAbeq, [input_addr, #(16*(\num*1+\idx))] + str vAbiq, [input_addr, #(16*(\num*2+\idx))] + str vAboq, [input_addr, #(16*(\num*3+\idx))] + str vAbuq, [input_addr, #(16*(\num*4+\idx))] + str vAgaq, [input_addr, #(16*(\num*5+\idx))] + str vAgeq, [input_addr, #(16*(\num*6+\idx))] + str vAgiq, [input_addr, #(16*(\num*7+\idx))] + str vAgoq, [input_addr, #(16*(\num*8+\idx))] + str vAguq, [input_addr, #(16*(\num*9+\idx))] + str vAkaq, [input_addr, #(16*(\num*10+\idx))] + str vAkeq, [input_addr, #(16*(\num*11+\idx))] + str vAkiq, [input_addr, #(16*(\num*12+\idx))] + str vAkoq, [input_addr, #(16*(\num*13+\idx))] + str vAkuq, [input_addr, #(16*(\num*14+\idx))] + str vAmaq, [input_addr, #(16*(\num*15+\idx))] + str vAmeq, [input_addr, #(16*(\num*16+\idx))] + str vAmiq, [input_addr, #(16*(\num*17+\idx))] + str vAmoq, [input_addr, #(16*(\num*18+\idx))] + str vAmuq, [input_addr, #(16*(\num*19+\idx))] + str vAsaq, [input_addr, #(16*(\num*20+\idx))] + str vAseq, [input_addr, #(16*(\num*21+\idx))] + str vAsiq, [input_addr, #(16*(\num*22+\idx))] + str vAsoq, [input_addr, #(16*(\num*23+\idx))] + str vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_scalar num idx + str s_Aba, [input_addr, 8*(\num*(0) +\idx)] + str sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + str sAbi, [input_addr, 8*(\num*(2)+ \idx)] + str sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + str sAbu, [input_addr, 8*(\num*(4)+ \idx)] + str sAga, [input_addr, 8*(\num*(4+1) +\idx)] + str sAge, [input_addr, 8*(\num*(6)+ \idx)] + str sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + str sAgo, [input_addr, 8*(\num*(8)+ 
\idx)] + str sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + str sAka, [input_addr, 8*(\num*(10) +\idx)] + str sAke, [input_addr, 8*(\num*(10+1)+\idx)] + str sAki, [input_addr, 8*(\num*(12) +\idx)] + str sAko, [input_addr, 8*(\num*(12+1)+\idx)] + str sAku, [input_addr, 8*(\num*(14) +\idx)] + str sAma, [input_addr, 8*(\num*(14+1)+\idx)] + str sAme, [input_addr, 8*(\num*(16) +\idx)] + str sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + str sAmo, [input_addr, 8*(\num*(18) +\idx)] + str sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + str sAsa, [input_addr, 8*(\num*(20) +\idx)] + str sAse, [input_addr, 8*(\num*(20+1)+\idx)] + str sAsi, [input_addr, 8*(\num*(22) +\idx)] + str sAso, [input_addr, 8*(\num*(22+1)+\idx)] + str sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +.macro load_input_scalar num idx + ldr s_Aba, [input_addr, 8*(\num*(0) +\idx)] + ldr sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + ldr sAbi, [input_addr, 8*(\num*(2)+ \idx)] + ldr sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + ldr sAbu, [input_addr, 8*(\num*(4)+ \idx)] + ldr sAga, [input_addr, 8*(\num*(4+1) +\idx)] + ldr sAge, [input_addr, 8*(\num*(6)+ \idx)] + ldr sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + ldr sAgo, [input_addr, 8*(\num*(8)+ \idx)] + ldr sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + ldr sAka, [input_addr, 8*(\num*(10) +\idx)] + ldr sAke, [input_addr, 8*(\num*(10+1)+\idx)] + ldr sAki, [input_addr, 8*(\num*(12) +\idx)] + ldr sAko, [input_addr, 8*(\num*(12+1)+\idx)] + ldr sAku, [input_addr, 8*(\num*(14) +\idx)] + ldr sAma, [input_addr, 8*(\num*(14+1)+\idx)] + ldr sAme, [input_addr, 8*(\num*(16) +\idx)] + ldr sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + ldr sAmo, [input_addr, 8*(\num*(18) +\idx)] + ldr sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + ldr sAsa, [input_addr, 8*(\num*(20) +\idx)] + ldr sAse, [input_addr, 8*(\num*(20+1)+\idx)] + ldr sAsi, [input_addr, 8*(\num*(22) +\idx)] + ldr sAso, [input_addr, 8*(\num*(22+1)+\idx)] + ldr sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +#define STACK_SIZE (8*8 + 16*6 + 3*8 
+ 8 + 16*34) // VREGS (8*8), GPRs (16*6), count (8), const (8), input (8), padding (8), vector temporaries at STACK_BASE_TMP (16*34) +#define STACK_BASE_GPRS (3*8+8) +#define STACK_BASE_VREGS (3*8+8+16*6) +#define STACK_BASE_TMP (8*8 + 16*6 + 3*8 + 8) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) + +#define vAga_offset 0 +#define E0_offset 1 +#define E1_offset 2 +#define E2_offset 3 +#define E3_offset 4 +#define E4_offset 5 +#define Ame_offset 7 +#define Agi_offset 8 +#define Aka_offset 9 +#define Abo_offset 10 +#define Amo_offset 11 +#define Ami_offset 12 +#define Ake_offset 13 +#define Agu_offset 14 +#define Asi_offset 15 +#define Aku_offset 16 +#define Asa_offset 17 +#define Abu_offset 18 +#define Asu_offset 19 +#define Ase_offset 20 +//#define Aga_offset 21 +#define Age_offset 22 +#define vBgo_offset 23 +#define vBke_offset 24 +#define vBgi_offset 25 +#define vBga_offset 26 +#define vBbo_offset 27 +#define vBmo_offset 28 +#define vBmi_offset 29 +#define vBge_offset 30 + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] + + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro save_vregs + stp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] + stp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + stp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + stp d14, d15, 
[sp,#(STACK_BASE_VREGS+3*16)] +.endm + +.macro restore_vregs + ldp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] + ldp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + ldp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + ldp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] +.endm + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro eor5 dst, src0, src1, src2, src3, src4 + eor \dst, \src0, \src1 + eor \dst, \dst, \src2 + eor \dst, \dst, \src3 + eor \dst, \dst, \src4 +.endm + +.macro xor_rol dst, src1, src0, imm + eor \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro bic_rol dst, src1, src0, imm + bic \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro rotate dst, src, imm + ror \dst, \src, #(64-\imm) +.endm + +.macro save reg, offset + str \reg, [sp, #\offset] +.endm + +.macro restore reg, offset + ldr \reg, [sp, #\offset] +.endm + +.macro hybrid_round_initial +eor sC0, sAma, sAsa SEP eor3_m1_0 C1,vAbe,vAge,vAke +eor sC1, sAme, sAse SEP +eor sC2, sAmi, sAsi SEP eor3_m1_0 C3,vAbo,vAgo,vAko +eor sC3, sAmo, sAso SEP +eor sC4, sAmu, sAsu SEP eor3_m1_0 C0,vAba,vAga,vAka +eor sC0, sAka, sC0 SEP +eor sC1, sAke, sC1 SEP eor3_m1_0 C2,vAbi,vAgi,vAki +eor sC2, sAki, sC2 SEP +eor sC3, sAko, sC3 SEP eor3_m1_0 C4,vAbu,vAgu,vAku +eor sC4, sAku, sC4 SEP +eor sC0, sAga, sC0 SEP eor3_m1_1 C1,vAbe,vAge,vAke +eor sC1, sAge, sC1 SEP eor3_m1_1 C3,vAbo,vAgo,vAko +eor sC2, sAgi, sC2 SEP +eor sC3, sAgo, sC3 SEP eor3_m1_1 C0,vAba,vAga,vAka +eor sC4, sAgu, sC4 SEP +eor sC0, s_Aba, sC0 SEP eor3_m1_1 C2,vAbi,vAgi,vAki +eor sC1, sAbe, sC1 SEP +eor sC2, sAbi, sC2 SEP eor3_m1_1 C4,vAbu,vAgu,vAku +eor sC3, sAbo, sC3 SEP +eor sC4, sAbu, sC4 SEP eor3_m1_0 C1, C1,vAme, vAse +eor sE1, sC0, sC2, ROR #63 SEP eor3_m1_0 C3, C3,vAmo, vAso +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP eor3_m1_0 C0, C0,vAma, vAsa +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP eor3_m1_0 C2, C2,vAmi, vAsi +eor s_Aba_, s_Aba, sE0 SEP +eor sAsa_, sAbi, 
sE2 SEP eor3_m1_0 C4, C4,vAmu, vAsu +eor sAbi_, sAki, sE2 SEP +eor sAki_, sAko, sE3 SEP eor3_m1_1 C1, C1,vAme, vAse +eor sAko_, sAmu, sE4 SEP eor3_m1_1 C3, C3,vAmo, vAso +eor sAmu_, sAso, sE3 SEP +eor sAso_, sAma, sE0 SEP eor3_m1_1 C0, C0,vAma, vAsa +eor sAka_, sAbe, sE1 SEP +eor sAse_, sAgo, sE3 SEP eor3_m1_1 C2, C2,vAmi, vAsi +eor sAgo_, sAme, sE1 SEP +eor sAke_, sAgi, sE2 SEP eor3_m1_1 C4, C4,vAmu, vAsu +eor sAgi_, sAka, sE0 SEP +eor sAga_, sAbo, sE3 SEP vvtmp .req vBba +eor sAbo_, sAmo, sE3 SEP rax1_m1 E2, C1, C3 +eor sAmo_, sAmi, sE2 SEP +eor sAmi_, sAke, sE1 SEP rax1_m1 E4, C3, C0 +eor sAge_, sAgu, sE4 SEP +eor sAgu_, sAsi, sE2 SEP rax1_m1 E1, C0, C2 +eor sAsi_, sAku, sE4 SEP +eor sAku_, sAsa, sE0 SEP rax1_m1 E3, C2, C4 +eor sAma_, sAbu, sE4 SEP +eor sAbu_, sAsu, sE4 SEP str vAgiq, [sp, #(STACK_BASE_TMP + 16*32)] +eor sAsu_, sAse, sE1 SEP rax1_m1 E0, C4, C1 +eor sAme_, sAga, sE0 SEP +eor sAbe_, sAge, sE1 SEP /* 25x XAR, 75 in total */ +load_constant_ptr SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP .unreq vvtmp +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP vvtmp .req C1 +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP vvtmpq .req C1q +eor sAgi, tmp, sAgi_, ROR #58 SEP xar_m1 vBgi, vAka, E0, 61 +bic tmp, sAga_, sAgu_, ROR #31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP xar_m1 vBga, vAbo, E3, 36 +bic tmp, sAge_, sAga_, ROR #56 SEP +eor sAgu, tmp, sAgu_, ROR #23 SEP str vAgaq, [sp, #(STACK_BASE_TMP + 16 * 30)] +bic tmp, sAki_, sAke_, ROR #19 SEP +eor sAka, tmp, sAka_, ROR #24 SEP xar_m1 vBbo, vAmo, E3, 43 +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP xar_m1 vBmo, vAmi, E2, 49 +bic tmp, sAku_, sAko_, ROR #10 SEP str vAgeq, [sp, #(STACK_BASE_TMP + 16 * 31)] +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP xar_m1 vBmi, vAke, E1, 54 +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP xar_m1 vBge, vAgu, E4, 44 +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, 
ROR #38 SEP bcax_m1 vAga, vBga, vBgi, vBge +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP eor vBba.16b, vAba.16b, E0.16b +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP xar_m1 vBsa, vAbi, E2, 2 +eor sAmi, tmp, sAmi_, ROR #46 SEP xar_m1 vBbi, vAki, E2, 21 +ldr cur_const, [const_addr] SEP +mov count, #1 SEP xar_m1 vBki, vAko, E3, 39 +bic tmp, sAma_, sAmu_, ROR #35 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP xar_m1 vBko, vAmu, E4, 56 +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP xar_m1 vBmu, vAso, E3, 8 +bic tmp, sAsi_, sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP xar_m1 vBso, vAma, E0, 23 +bic tmp, sAso_, sAsi_, ROR #2 SEP xar_m1 vBka, vAbe, E1, 63 +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP xar_m1 vBse, vAgo, E3, 9 +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP xar_m1 vBgo, vAme, E1, 19 +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP bcax_m1 vAge, vBge, vBgo, vBgi +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP ldr vvtmpq, [sp, #(STACK_BASE_TMP + 16*32)] +eor s_Aba, s_Aba_, tmp, ROR #21 SEP xar_m1 vBke, vvtmp, E2, 58 +bic tmp, sAbo_, sAbi_, ROR #42 SEP +eor sAbe, tmp, sAbe_, ROR #41 SEP xar_m1 vBgu, vAsi, E2, 3 +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP bcax_m1 vAgi, vBgi, vBgu, vBgo +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP xar_m1 vBsi, vAku, E4, 25 +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP xar_m1 vBku, vAsa, E0, 46 +eor s_Aba, s_Aba, cur_const SEP xar_m1 vBma, vAbu, E4, 37 +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP xar_m1 vBbu, vAsu, E4, 50 +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP xar_m1 vBsu, vAse, E1, 62 +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP ldp vvtmpq, E3q, [sp, #(STACK_BASE_TMP + 16*30)] +eor sC0, sAma, sC0, ROR #49 SEP 
+eor sC1, sAbe, sC1, ROR #44 SEP xar_m1 vBme, vvtmp, E0, 28 +eor sC2, sAki, sC2, ROR #26 SEP xar_m1 vBbe, E3, E1, 20 +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP /* 25x BCAX, 50 in total */ +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP bcax_m1 vAgo, vBgo, vBga, vBgu +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP bcax_m1 vAgu, vBgu, vBge, vBga +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP bcax_m1 vAka, vBka, vBki, vBke +eor sC1, sAke, sC1, ROR #57 SEP bcax_m1 vAke, vBke, vBko, vBki +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP .unreq vvtmp +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP .unreq vvtmpq +ror sC4, sC4, 58 SEP +ror sC2, sC2, 62 SEP eor2 C0, vAka, vAga +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP save(vAga) +eor sE0, sC4, sC1, ROR #63 SEP vvtmp .req vAga +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP vvtmpq .req vAgaq +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP bcax_m1 vAki, vBki, vBku, vBko +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP bcax_m1 vAko, vBko, vBka, vBku +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP eor2 C1, vAke, vAge +eor sAso_, sE0, sAma, ROR #54 SEP bcax_m1 vAku, vBku, vBke, vBka +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP eor2 C2, vAki, vAgi +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP bcax_m1 vAma, vBma, vBmi, vBme +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP eor2 C3, vAko, vAgo +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP bcax_m1 vAme, vBme, vBmo, vBmi +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP eor2 C4, vAku, vAgu +eor sAgu_, sE2, sAsi, ROR #62 SEP bcax_m1 vAmi, vBmi, vBmu, vBmo +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP eor2 C0, C0, vAma +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, 
sE4, sAsu, ROR #9 SEP bcax_m1 vAmo, vBmo, vBma, vBmu +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP eor2 C1, C1, vAme +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP bcax_m1 vAmu, vBmu, vBme, vBma +restore count, STACK_OFFSET_COUNT SEP eor2 C2, C2, vAmi +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP bcax_m1 vAsa, vBsa, vBsi, vBse +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP eor2 C3, C3, vAmo +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP bcax_m1 vAse, vBse, vBso, vBsi +bic tmp, sAga_, sAgu_, ROR #31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP eor2 C4, C4, vAmu +bic tmp, sAge_, sAga_, ROR #56 SEP bcax_m1 vAsi, vBsi, vBsu, vBso +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP eor2 C0, C0, vAsa +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP bcax_m1 vAso, vBso, vBsa, vBsu +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP eor2 C1, C1, vAse +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP bcax_m1 vAsu, vBsu, vBse, vBsa +eor sAko, tmp, sAko_, ROR #57 SEP eor2 C2, C2, vAsi +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP eor2 C3, C3, vAso +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP bcax_m1 vAba, vBba, vBbi, vBbe +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP bcax_m1 vAbe, vBbe, vBbo, vBbi +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP eor2 C1, C1, vAbe +bic tmp, sAma_, sAmu_, ROR #35 SEP restore x26, STACK_OFFSET_CONST +eor sAmo, tmp, sAmo_, ROR #12 SEP ldr vvtmpq, [x26], #16 +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP save x26, STACK_OFFSET_CONST +bic tmp, sAsi_, sAse_, ROR #48 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP eor vAba.16b, vAba.16b, vvtmp.16b +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP 
eor2 C4, C4, vAsu +bic tmp, sAsu_, sAso_, ROR #25 SEP bcax_m1 vAbi, vBbi, vBbu, vBbo +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP bcax_m1 vAbo, vBbo, vBba, vBbu +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP eor2 C3, C3, vAbo +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP eor2 C2, C2, vAbi +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP eor2 C0, C0, vAba +eor sAbe, tmp, sAbe_, ROR #41 SEP bcax_m1 vAbu, vBbu, vBbe, vBba +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP eor2 C4, C4, vAbu +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP restore(vAga) +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP .unreq vvtmp +add count, count, #1 SEP +eor s_Aba, s_Aba, cur_const SEP .unreq vvtmpq +.endm + + +.macro hybrid_round_noninitial +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP vvtmp .req vBba +eor sC1, sAse, sAge, ROR #60 SEP rax1_m1 E2, C1, C3 +eor sC2, sAmi, sAgi, ROR #59 SEP rax1_m1 E4, C3, C0 +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP rax1_m1 E1, C0, C2 +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP rax1_m1 E3, C2, C4 +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP str vAgiq, [sp, #(STACK_BASE_TMP + 16*32)] +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP rax1_m1 E0, C4, C1 +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP .unreq vvtmp +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP vvtmp .req C1 +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP vvtmpq .req C1q +ror sC4, sC4, 58 SEP +ror sC2, sC2, 62 SEP xar_m1 vBgi, vAka, E0, 61 +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP xar_m1 vBga, vAbo, E3, 36 +eor sE0, sC4, sC1, 
ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP str vAgaq, [sp, #(STACK_BASE_TMP + 16 * 30)] +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP xar_m1 vBbo, vAmo, E3, 43 +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP xar_m1 vBmo, vAmi, E2, 49 +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP str vAgeq, [sp, #(STACK_BASE_TMP + 16 * 31)] +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP xar_m1 vBmi, vAke, E1, 54 +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP xar_m1 vBge, vAgu, E4, 44 +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP bcax_m1 vAga, vBga, vBgi, vBge +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP eor vBba.16b, vAba.16b, E0.16b +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP xar_m1 vBsa, vAbi, E2, 2 +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP xar_m1 vBbi, vAki, E2, 21 +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP xar_m1 vBki, vAko, E3, 39 +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP xar_m1 vBko, vAmu, E4, 56 +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP xar_m1 vBmu, vAso, E3, 8 +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP xar_m1 vBso, vAma, E0, 23 +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP xar_m1 vBka, vAbe, E1, 63 +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP xar_m1 vBse, vAgo, E3, 9 +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP xar_m1 vBgo, vAme, E1, 19 +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP bcax_m1 vAge, vBge, vBgo, vBgi +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR 
#2 SEP ldr vvtmpq, [sp, #(STACK_BASE_TMP + 16*32)] +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP xar_m1 vBke, vvtmp, E2, 58 +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP xar_m1 vBgu, vAsi, E2, 3 +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP bcax_m1 vAgi, vBgi, vBgu, vBgo +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP xar_m1 vBsi, vAku, E4, 25 +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP xar_m1 vBku, vAsa, E0, 46 +eor sAmi, tmp, sAmi_, ROR #46 SEP +bic tmp, sAma_, sAmu_, ROR #35 SEP xar_m1 vBma, vAbu, E4, 37 +ldr cur_const, [const_addr, count, UXTW #3] SEP +add count, count, #1 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP xar_m1 vBbu, vAsu, E4, 50 +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP xar_m1 vBsu, vAse, E1, 62 +bic tmp, sAsi_, sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP ldp vvtmpq, E3q, [sp, #(STACK_BASE_TMP + 16*30)] +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP xar_m1 vBme, vvtmp, E0, 28 +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP xar_m1 vBbe, E3, E1, 20 +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP bcax_m1 vAgo, vBgo, vBga, vBgu +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m1 vAgu, vBgu, vBge, vBga +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP bcax_m1 vAka, vBka, vBki, vBke +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP bcax_m1 vAke, vBke, vBko, vBki +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP .unreq vvtmp +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP .unreq vvtmpq +eor s_Aba, s_Aba, cur_const SEP +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP eor2 C0, vAka, vAga +eor 
sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP save(vAga) +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP vvtmp .req vAga +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP vvtmpq .req vAgaq +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP bcax_m1 vAki, vBki, vBku, vBko +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP bcax_m1 vAko, vBko, vBka, vBku +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP eor2 C1, vAke, vAge +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP bcax_m1 vAku, vBku, vBke, vBka +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP eor2 C2, vAki, vAgi +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP bcax_m1 vAma, vBma, vBmi, vBme +ror sC4, sC4, 58 SEP +ror sC2, sC2, 62 SEP eor2 C3, vAko, vAgo +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP bcax_m1 vAme, vBme, vBmo, vBmi +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP eor2 C4, vAku, vAgu +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP bcax_m1 vAmi, vBmi, vBmu, vBmo +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP eor2 C0, C0, vAma +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP bcax_m1 vAmo, vBmo, vBma, vBmu +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP eor2 C1, C1, vAme +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP bcax_m1 vAmu, vBmu, vBme, vBma +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP eor2 C2, C2, vAmi +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP bcax_m1 vAsa, vBsa, vBsi, vBse +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP eor2 C3, C3, vAmo +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, 
ROR #25 SEP bcax_m1 vAse, vBse, vBso, vBsi +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP eor2 C4, C4, vAmu +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP bcax_m1 vAsi, vBsi, vBsu, vBso +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP eor2 C0, C0, vAsa +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP bcax_m1 vAso, vBso, vBsa, vBsu +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP eor2 C1, C1, vAse +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP bcax_m1 vAsu, vBsu, vBse, vBsa +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP eor2 C2, C2, vAsi +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP eor2 C3, C3, vAso +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP bcax_m1 vAba, vBba, vBbi, vBbe +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP bcax_m1 vAbe, vBbe, vBbo, vBbi +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP eor2 C1, C1, vAbe +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP restore x26, STACK_OFFSET_CONST +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP ldr vvtmpq, [x26], #16 +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP save x26, STACK_OFFSET_CONST +eor sAmi, tmp, sAmi_, ROR #46 SEP +bic tmp, sAma_, sAmu_, ROR #35 SEP eor vAba.16b, vAba.16b, vvtmp.16b +ldr cur_const, [const_addr, count, UXTW #3] SEP +add count, count, #1 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP eor2 C4, C4, vAsu +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP bcax_m1 vAbi, vBbi, vBbu, vBbo +bic tmp, sAsi_, sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP bcax_m1 vAbo, vBbo, vBba, vBbu +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, 
sAse_, ROR #50 SEP eor2 C3, C3, vAbo +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP eor2 C2, C2, vAbi +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP eor2 C0, C0, vAba +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m1 vAbu, vBbu, vBbe, vBba +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP eor2 C4, C4, vAbu +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP restore(vAga) +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP .unreq vvtmp +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP .unreq vvtmpq +eor s_Aba, s_Aba, cur_const SEP +.endm +.macro hybrid_round_final + SEP vvtmp .req vBba +save count, STACK_OFFSET_COUNT SEP rax1_m1 E2, C1, C3 +eor sC0, sAka, sAsa, ROR #50 SEP +eor sC1, sAse, sAge, ROR #60 SEP rax1_m1 E4, C3, C0 +eor sC2, sAmi, sAgi, ROR #59 SEP +eor sC3, sAgo, sAso, ROR #30 SEP rax1_m1 E1, C0, C2 +eor sC4, sAbu, sAsu, ROR #53 SEP +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP rax1_m1 E3, C2, C4 +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP str vAgiq, [sp, #(STACK_BASE_TMP + 16*32)] +eor sC3, sAbo, sC3, ROR #63 SEP +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP rax1_m1 E0, C4, C1 +ror sC4, sC4, 58 SEP +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP .unreq vvtmp +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP vvtmp .req C1 +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, 
sAki, ROR #46 SEP vvtmpq .req C1q +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP xar_m1 vBgi, vAka, E0, 61 +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP xar_m1 vBga, vAbo, E3, 36 +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP +eor sAbo_, sE3, sAmo, ROR #37 SEP str vAgaq, [sp, #(STACK_BASE_TMP + 16 * 30)] +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP xar_m1 vBbo, vAmo, E3, 43 +eor sAgu_, sE2, sAsi, ROR #62 SEP +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP xar_m1 vBmo, vAmi, E2, 49 +eor sAbu_, sE4, sAsu, ROR #9 SEP +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP str vAgeq, [sp, #(STACK_BASE_TMP + 16 * 31)] +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP xar_m1 vBmi, vAke, E1, 54 +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP xar_m1 vBge, vAgu, E4, 44 +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP bcax_m1 vAga, vBga, vBgi, vBge +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP eor vBba.16b, vAba.16b, E0.16b +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP xar_m1 vBsa, vAbi, E2, 2 +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP xar_m1 vBbi, vAki, E2, 21 +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP xar_m1 vBki, vAko, E3, 39 +eor sAma, tmp, 
sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP xar_m1 vBko, vAmu, E4, 56 +eor sAmi, tmp, sAmi_, ROR #46 SEP +bic tmp, sAma_, sAmu_, ROR #35 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP xar_m1 vBmu, vAso, E3, 8 +add count, count, #1 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP xar_m1 vBso, vAma, E0, 23 +bic tmp, sAsi_, sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP xar_m1 vBka, vAbe, E1, 63 +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP xar_m1 vBse, vAgo, E3, 9 +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP xar_m1 vBgo, vAme, E1, 19 +bic tmp, sAbi_, sAbe_, ROR #63 SEP +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP bcax_m1 vAge, vBge, vBgo, vBgi +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP ldr vvtmpq, [sp, #(STACK_BASE_TMP + 16*32)] +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP xar_m1 vBke, vvtmp, E2, 58 +eor s_Aba, s_Aba, cur_const SEP +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP +eor sC1, sAse, sAge, ROR #60 SEP xar_m1 vBgu, vAsi, E2, 3 +eor sC2, sAmi, sAgi, ROR #59 SEP +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP bcax_m1 vAgi, vBgi, vBgu, vBgo +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 SEP xar_m1 vBsi, vAku, E4, 25 +eor sC4, sAmu, sC4, ROR #56 SEP +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP xar_m1 vBku, vAsa, E0, 46 +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, 
sAgu, sC4, ROR #48 SEP xar_m1 vBma, vAbu, E4, 37 +eor sC0, s_Aba, sC0, ROR #61 SEP +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP xar_m1 vBbu, vAsu, E4, 50 +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP xar_m1 vBsu, vAse, E1, 62 +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP ldp vvtmpq, E3q, [sp, #(STACK_BASE_TMP + 16*30)] +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP xar_m1 vBme, vvtmp, E0, 28 +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP xar_m1 vBbe, E3, E1, 20 +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP bcax_m1 vAgo, vBgo, vBga, vBgu +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP bcax_m1 vAgu, vBgu, vBge, vBga +eor sAga_, sE3, sAbo SEP +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP bcax_m1 vAka, vBka, vBki, vBke +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP bcax_m1 vAke, vBke, vBko, vBki +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP bcax_m1 vAki, vBki, vBku, vBko +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP bcax_m1 vAko, vBko, vBka, vBku +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP bcax_m1 vAku, vBku, vBke, vBka +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP bcax_m1 vAma, vBma, vBmi, vBme +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR 
#31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP bcax_m1 vAme, vBme, vBmo, vBmi +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP +eor sAka, tmp, sAka_, ROR #24 SEP bcax_m1 vAmi, vBmi, vBmu, vBmo +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP bcax_m1 vAmo, vBmo, vBma, vBmu +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP bcax_m1 vAmu, vBmu, vBme, vBma +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP bcax_m1 vAsa, vBsa, vBsi, vBse +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP bcax_m1 vAse, vBse, vBso, vBsi +bic tmp, sAma_, sAmu_, ROR #35 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP +add count, count, #1 SEP bcax_m1 vAsi, vBsi, vBsu, vBso +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP bcax_m1 vAso, vBso, vBsa, vBsu +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP bcax_m1 vAsu, vBsu, vBse, vBsa +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP bcax_m1 vAba, vBba, vBbi, vBbe +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m1 vAbe, vBbe, vBbo, vBbi +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP bcax_m1 vAbi, vBbi, vBbu, vBbo +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP bcax_m1 vAbo, vBbo, vBba, vBbu +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, 
ROR #30 SEP +eor s_Aba, s_Aba, cur_const SEP bcax_m1 vAbu, vBbu, vBbe, vBba +ror sAga, sAga,(64-3) SEP +ror sAka, sAka,(64-25) SEP +ror sAma, sAma,(64-10) SEP +ror sAsa, sAsa,(64-39) SEP restore x26, STACK_OFFSET_CONST +ror sAbe, sAbe,(64-21) SEP +ror sAge, sAge,(64-45) SEP +ror sAke, sAke,(64-8) SEP ldr vvtmpq, [x26], #16 +ror sAme, sAme,(64-15) SEP +ror sAse, sAse,(64-41) SEP +ror sAbi, sAbi,(64-14) SEP +ror sAgi, sAgi,(64-61) SEP save x26, STACK_OFFSET_CONST +ror sAki, sAki,(64-18) SEP +ror sAmi, sAmi,(64-56) SEP +ror sAsi, sAsi,(64-2) SEP eor vAba.16b, vAba.16b, vvtmp.16b +ror sAgo, sAgo,(64-28) SEP +ror sAko, sAko,(64-1) SEP +ror sAmo, sAmo,(64-27) SEP +ror sAso, sAso,(64-62) SEP .unreq vvtmp +ror sAbu, sAbu,(64-44) SEP +ror sAgu, sAgu,(64-20) SEP +ror sAku, sAku,(64-6) SEP .unreq vvtmpq +ror sAmu, sAmu,(64-36) SEP +ror sAsu, sAsu,(64-55) SEP +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.global keccak_f1600_x4_hybrid_asm_v5 +.global _keccak_f1600_x4_hybrid_asm_v5 +.text +.align 4 + +keccak_f1600_x4_hybrid_asm_v5: +_keccak_f1600_x4_hybrid_asm_v5: + alloc_stack + save_gprs + save_vregs + save input_addr, STACK_OFFSET_INPUT + + + ASM_LOAD(const_addr,round_constants_vec) + + save const_addr, STACK_OFFSET_CONST + load_input_vector 2,1 + + // First scalar Keccak computation alongside first half of SIMD computation + load_input_scalar 4,0 + hybrid_round_initial + loop_0: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-3) + ble loop_0 + + hybrid_round_final + + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4,0 + + // Second scalar Keccak computation alongside second half of SIMD computation + load_input_scalar 4,1 + hybrid_round_initial + loop_1: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-3) + ble loop_1 + + hybrid_round_final + + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4,1 + store_input_vector 2,1 + + restore_vregs + restore_gprs + free_stack + + + ret diff --git
a/asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v5p.s b/asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v5p.s new file mode 100644 index 0000000..960f781 --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v5p.s @@ -0,0 +1,1337 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +round_constants_vec: + .quad 0x0000000000000001 + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 
0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + .quad 0x8000000080008008 +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x29 + count .req w27 + out_count .req w27 + cur_const .req x26 + + /* Mapping of Keccak-f1600 SIMD state to vector registers + * at the beginning and end of each round. */ + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. */ + vAba .req v0 + vAbe .req v1 + vAbi .req v2 + vAbo .req v3 + vAbu .req v4 + vAga .req v5 + vAge .req v6 + vAgi .req v7 + vAgo .req v8 + vAgu .req v9 + vAka .req v10 + vAke .req v11 + vAki .req v12 + vAko .req v13 + vAku .req v14 + vAma .req v15 + vAme .req v16 + vAmi .req v17 + vAmo .req v18 + vAmu .req v19 + vAsa .req v20 + vAse .req v21 + vAsi .req v22 + vAso .req v23 + vAsu .req v24 + + /* q-form of the above mapping */ + vAbaq .req q0 + vAbeq .req q1 + vAbiq .req q2 + vAboq .req q3 + vAbuq .req q4 + vAgaq .req q5 + vAgeq .req q6 + vAgiq .req q7 + vAgoq .req q8 + vAguq .req q9 + vAkaq .req q10 + vAkeq .req q11 + vAkiq .req q12 + vAkoq .req q13 + vAkuq .req q14 + vAmaq .req q15 + vAmeq .req q16 + vAmiq .req q17 + vAmoq .req q18 + vAmuq .req q19 + vAsaq .req q20 + vAseq .req q21 + vAsiq .req q22 + vAsoq .req q23 + vAsuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v27 + C1 .req v28 + C2 .req v29 + C3 .req v30 + C4 .req v31 + + C0q .req q27 + C1q .req q28 + C2q .req q29 + C3q .req q30 + C4q .req q31 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vBba .req v25 // fresh + vBbe .req v26 // fresh + vBbi .req vAbi + vBbo .req vAbo + vBbu .req vAbu + vBga .req vAka + vBge .req vAke + vBgi .req vAgi + vBgo .req vAgo + vBgu .req vAgu + vBka .req vAma + vBke .req vAme + vBki .req vAki + vBko .req vAko + vBku .req vAku + vBma .req vAsa + vBme .req vAse + vBmi .req vAmi + vBmo
.req vAmo + vBmu .req vAmu + vBsa .req vAba + vBse .req vAbe + vBsi .req vAsi + vBso .req vAso + vBsu .req vAsu + + vBbaq .req q25 // fresh + vBbeq .req q26 // fresh + vBbiq .req vAbiq + vBboq .req vAboq + vBbuq .req vAbuq + vBgaq .req vAkaq + vBgeq .req vAkeq + vBgiq .req vAgiq + vBgoq .req vAgoq + vBguq .req vAguq + vBkaq .req vAmaq + vBkeq .req vAmeq + vBkiq .req vAkiq + vBkoq .req vAkoq + vBkuq .req vAkuq + vBmaq .req vAsaq + vBmeq .req vAseq + vBmiq .req vAmiq + vBmoq .req vAmoq + vBmuq .req vAmuq + vBsaq .req vAbaq + vBseq .req vAbeq + vBsiq .req vAsiq + vBsoq .req vAsoq + vBsuq .req vAsuq + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req C4 + E1 .req C0 + E2 .req vBbe // fresh + E3 .req C2 + E4 .req C3 + + E0q .req C4q + E1q .req C0q + E2q .req vBbeq // fresh + E3q .req C2q + E4q .req C3q + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round. */ + s_Aba .req x1 + sAbe .req x6 + sAbi .req x11 + sAbo .req x16 + sAbu .req x21 + sAga .req x2 + sAge .req x7 + sAgi .req x12 + sAgo .req x17 + sAgu .req x22 + sAka .req x3 + sAke .req x8 + sAki .req x13 + sAko .req x18 + sAku .req x23 + sAma .req x4 + sAme .req x9 + sAmi .req x14 + sAmo .req x19 + sAmu .req x24 + sAsa .req x5 + sAse .req x10 + sAsi .req x15 + sAso .req x20 + sAsu .req x25 + + /* sA_[y,2*x+3*y] = rot(A[x,y]) */ + s_Aba_ .req x0 + sAbe_ .req x28 + sAbi_ .req x11 + sAbo_ .req x16 + sAbu_ .req x21 + sAga_ .req x3 + sAge_ .req x8 + sAgi_ .req x12 + sAgo_ .req x17 + sAgu_ .req x22 + sAka_ .req x4 + sAke_ .req x9 + sAki_ .req x13 + sAko_ .req x18 + sAku_ .req x23 + sAma_ .req x5 + sAme_ .req x10 + sAmi_ .req x14 + sAmo_ .req x19 + sAmu_ .req x24 + sAsa_ .req x1 + sAse_ .req x6 + sAsi_ .req x15 + sAso_ .req x20 + sAsu_ .req x25 + + /* sC[x] = sA[x,0] xor sA[x,1] xor sA[x,2] xor sA[x,3] xor sA[x,4], for x in 0..4 */ + /* sE[x] = sC[x-1] xor rot(sC[x+1],1), for x in 0..4 */ + sC0 .req x0 + sE0 .req x29 + sC1 .req x26 + sE1 .req x30 + sC2 .req x27 +
sE2 .req x26 + sC3 .req x28 + sE3 .req x27 + sC4 .req x29 + sE4 .req x28 + + tmp .req x30 + +/************************ MACROS ****************************/ + +/* Macros emulating the v8.4-A SHA-3 instructions (EOR3, RAX1, XAR, BCAX) with base Neon instructions */ + +.macro eor3_m1_0 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m1_1 d s0 s1 s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm + + +.macro eor3_m1 d s0 s1 s2 + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +.macro rax1_m1 d s0 s1 + // Use add instead of SHL #1 + add vvtmp.2d, \s1\().2d, \s1\().2d + sri vvtmp.2d, \s1\().2d, #63 + eor \d\().16b, vvtmp.16b, \s0\().16b +.endm + + .macro xar_m1 d s0 s1 imm + // Special cases where we can replace SHLs by ADDs + .if \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + .elseif \imm == 62 + eor \s0\().16b, \s0\().16b, \s1\().16b + add \d\().2d, \s0\().2d, \s0\().2d + add \d\().2d, \d\().2d, \d\().2d + sri \d\().2d, \s0\().2d, #(62) + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + + .macro xar_m1_0 d s0 s1 imm + // XOR half of xar_m1: the same EOR is emitted in all three cases + .if \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + .elseif \imm == 62 + eor \s0\().16b, \s0\().16b, \s1\().16b + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + .endif +.endm + + .macro xar_m1_1 d s0 s1 imm + // Special cases where we can replace SHLs by ADDs + .if \imm == 63 + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + .elseif \imm == 62 + add \d\().2d, \s0\().2d, \s0\().2d + add \d\().2d, \d\().2d, \d\().2d + sri \d\().2d, \s0\().2d, #(62) + .else + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + +.macro bcax_m1 d s0 s1 s2 + bic vvtmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, vvtmp.16b, \s0\().16b +.endm +
+.macro load_input_vector + ldr vAbaq, [input_addr, #(32*0)] + ldr vAbeq, [input_addr, #(32*0+32)] + ldr vAbiq, [input_addr, #(32*2)] + ldr vAboq, [input_addr, #(32*2+32)] + ldr vAbuq, [input_addr, #(32*4)] + ldr vAgaq, [input_addr, #(32*4+32)] + ldr vAgeq, [input_addr, #(32*6)] + ldr vAgiq, [input_addr, #(32*6+32)] + ldr vAgoq, [input_addr, #(32*8)] + ldr vAguq, [input_addr, #(32*8+32)] + ldr vAkaq, [input_addr, #(32*10)] + ldr vAkeq, [input_addr, #(32*10+32)] + ldr vAkiq, [input_addr, #(32*12)] + ldr vAkoq, [input_addr, #(32*12+32)] + ldr vAkuq, [input_addr, #(32*14)] + ldr vAmaq, [input_addr, #(32*14+32)] + ldr vAmeq, [input_addr, #(32*16)] + ldr vAmiq, [input_addr, #(32*16+32)] + ldr vAmoq, [input_addr, #(32*18)] + ldr vAmuq, [input_addr, #(32*18+32)] + ldr vAsaq, [input_addr, #(32*20)] + ldr vAseq, [input_addr, #(32*20+32)] + ldr vAsiq, [input_addr, #(32*22)] + ldr vAsoq, [input_addr, #(32*22+32)] + ldr vAsuq, [input_addr, #(32*24)] +.endm + +.macro store_input_vector + str vAbaq, [input_addr, #(32*0)] + str vAbeq, [input_addr, #(32*0+32)] + str vAbiq, [input_addr, #(32*2)] + str vAboq, [input_addr, #(32*2+32)] + str vAbuq, [input_addr, #(32*4)] + str vAgaq, [input_addr, #(32*4+32)] + str vAgeq, [input_addr, #(32*6)] + str vAgiq, [input_addr, #(32*6+32)] + str vAgoq, [input_addr, #(32*8)] + str vAguq, [input_addr, #(32*8+32)] + str vAkaq, [input_addr, #(32*10)] + str vAkeq, [input_addr, #(32*10+32)] + str vAkiq, [input_addr, #(32*12)] + str vAkoq, [input_addr, #(32*12+32)] + str vAkuq, [input_addr, #(32*14)] + str vAmaq, [input_addr, #(32*14+32)] + str vAmeq, [input_addr, #(32*16)] + str vAmiq, [input_addr, #(32*16+32)] + str vAmoq, [input_addr, #(32*18)] + str vAmuq, [input_addr, #(32*18+32)] + str vAsaq, [input_addr, #(32*20)] + str vAseq, [input_addr, #(32*20+32)] + str vAsiq, [input_addr, #(32*22)] + str vAsoq, [input_addr, #(32*22+32)] + str vAsuq, [input_addr, #(32*24)] +.endm + +.macro store_input_scalar + str s_Aba,[input_addr, 32*0 ] + str sAbe, 
[input_addr, 32*1 ] + str sAbi, [input_addr, 32*2 ] + str sAbo, [input_addr, 32*3 ] + str sAbu, [input_addr, 32*4 ] + str sAga, [input_addr, 32*5 ] + str sAge, [input_addr, 32*6 ] + str sAgi, [input_addr, 32*7 ] + str sAgo, [input_addr, 32*8 ] + str sAgu, [input_addr, 32*9 ] + str sAka, [input_addr, 32*10] + str sAke, [input_addr, 32*11] + str sAki, [input_addr, 32*12] + str sAko, [input_addr, 32*13] + str sAku, [input_addr, 32*14] + str sAma, [input_addr, 32*15] + str sAme, [input_addr, 32*16] + str sAmi, [input_addr, 32*17] + str sAmo, [input_addr, 32*18] + str sAmu, [input_addr, 32*19] + str sAsa, [input_addr, 32*20] + str sAse, [input_addr, 32*21] + str sAsi, [input_addr, 32*22] + str sAso, [input_addr, 32*23] + str sAsu, [input_addr, 32*24] +.endm + +.macro load_input_scalar + ldr s_Aba,[input_addr, 32*0 ] + ldr sAbe, [input_addr, 32*1 ] + ldr sAbi, [input_addr, 32*2 ] + ldr sAbo, [input_addr, 32*3 ] + ldr sAbu, [input_addr, 32*4 ] + ldr sAga, [input_addr, 32*5 ] + ldr sAge, [input_addr, 32*6 ] + ldr sAgi, [input_addr, 32*7 ] + ldr sAgo, [input_addr, 32*8 ] + ldr sAgu, [input_addr, 32*9 ] + ldr sAka, [input_addr, 32*10] + ldr sAke, [input_addr, 32*11] + ldr sAki, [input_addr, 32*12] + ldr sAko, [input_addr, 32*13] + ldr sAku, [input_addr, 32*14] + ldr sAma, [input_addr, 32*15] + ldr sAme, [input_addr, 32*16] + ldr sAmi, [input_addr, 32*17] + ldr sAmo, [input_addr, 32*18] + ldr sAmu, [input_addr, 32*19] + ldr sAsa, [input_addr, 32*20] + ldr sAse, [input_addr, 32*21] + ldr sAsi, [input_addr, 32*22] + ldr sAso, [input_addr, 32*23] + ldr sAsu, [input_addr, 32*24] +.endm + +#define STACK_SIZE (4*16 + 12*8 + 6*8 + 3*16) +#define STACK_BASE_VREGS (0) +#define STACK_BASE_GPRS (4*16) +#define STACK_BASE_TMP_GPRS (4*16 + 12*8) +#define STACK_BASE_TMP_VREGS (4*16 + 12*8 + 6*8) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) +#define STACK_OFFSET_COUNT_OUT (3*8) +#define STACK_OFFSET_CUR_INPUT (4*8) + +#define 
vAgi_offset 0 +#define vAga_offset 1 +#define vAge_offset 2 + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP_VREGS + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP_VREGS + 16 * name ## _offset)] + +#define restore_as(reg,name) \ + ldr reg, [sp, #(STACK_BASE_TMP_VREGS + 16 * name ## _offset)] + +.macro save reg, offset + str \reg, [sp, #(STACK_BASE_TMP_GPRS + \offset)] +.endm + +.macro restore reg, offset + ldr \reg, [sp, #(STACK_BASE_TMP_GPRS + \offset)] +.endm + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro save_vregs + stp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] + stp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + stp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + stp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] +.endm + +.macro restore_vregs + ldp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] + ldp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + ldp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + ldp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] +.endm + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro eor5 dst, src0, src1, src2, src3, src4 + eor \dst, \src0, \src1 + eor \dst, \dst, \src2 + eor \dst, \dst, \src3 + eor \dst, \dst, \src4 +.endm + +.macro xor_rol dst, src1, src0, imm + eor \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro bic_rol dst, src1, src0, imm + bic \dst, \src0, \src1, ROR 
#(64-\imm) +.endm + +.macro rotate dst, src, imm + ror \dst, \src, #(64-\imm) +.endm + +.macro hybrid_round_initial +eor sC0, sAma, sAsa SEP eor3_m1_0 C1,vAbe,vAge,vAke +eor sC1, sAme, sAse SEP +eor sC2, sAmi, sAsi SEP eor3_m1_0 C3,vAbo,vAgo,vAko +eor sC3, sAmo, sAso SEP +eor sC4, sAmu, sAsu SEP eor3_m1_0 C0,vAba,vAga,vAka +eor sC0, sAka, sC0 SEP +eor sC1, sAke, sC1 SEP eor3_m1_0 C2,vAbi,vAgi,vAki +eor sC2, sAki, sC2 SEP +eor sC3, sAko, sC3 SEP eor3_m1_0 C4,vAbu,vAgu,vAku +eor sC4, sAku, sC4 SEP +eor sC0, sAga, sC0 SEP eor3_m1_1 C1,vAbe,vAge,vAke +eor sC1, sAge, sC1 SEP eor3_m1_1 C3,vAbo,vAgo,vAko +eor sC2, sAgi, sC2 SEP +eor sC3, sAgo, sC3 SEP eor3_m1_1 C0,vAba,vAga,vAka +eor sC4, sAgu, sC4 SEP +eor sC0, s_Aba, sC0 SEP eor3_m1_1 C2,vAbi,vAgi,vAki +eor sC1, sAbe, sC1 SEP +eor sC2, sAbi, sC2 SEP eor3_m1_1 C4,vAbu,vAgu,vAku +eor sC3, sAbo, sC3 SEP +eor sC4, sAbu, sC4 SEP eor3_m1_0 C1, C1,vAme, vAse +eor sE1, sC0, sC2, ROR #63 SEP eor3_m1_0 C3, C3,vAmo, vAso +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP eor3_m1_0 C0, C0,vAma, vAsa +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP eor3_m1_0 C2, C2,vAmi, vAsi +eor s_Aba_, s_Aba, sE0 SEP +eor sAsa_, sAbi, sE2 SEP eor3_m1_0 C4, C4,vAmu, vAsu +eor sAbi_, sAki, sE2 SEP +eor sAki_, sAko, sE3 SEP eor3_m1_1 C1, C1,vAme, vAse +eor sAko_, sAmu, sE4 SEP eor3_m1_1 C3, C3,vAmo, vAso +eor sAmu_, sAso, sE3 SEP +eor sAso_, sAma, sE0 SEP eor3_m1_1 C0, C0,vAma, vAsa +eor sAka_, sAbe, sE1 SEP +eor sAse_, sAgo, sE3 SEP eor3_m1_1 C2, C2,vAmi, vAsi +eor sAgo_, sAme, sE1 SEP +eor sAke_, sAgi, sE2 SEP eor3_m1_1 C4, C4,vAmu, vAsu +eor sAgi_, sAka, sE0 SEP +eor sAga_, sAbo, sE3 SEP vvtmp .req vBba +eor sAbo_, sAmo, sE3 SEP rax1_m1 E2, C1, C3 +eor sAmo_, sAmi, sE2 SEP +eor sAmi_, sAke, sE1 SEP rax1_m1 E4, C3, C0 +eor sAge_, sAgu, sE4 SEP +eor sAgu_, sAsi, sE2 SEP rax1_m1 E1, C0, C2 +eor sAsi_, sAku, sE4 SEP +eor sAku_, sAsa, sE0 SEP rax1_m1 E3, C2, C4 +eor sAma_, sAbu, sE4 SEP +eor sAbu_, sAsu, sE4 SEP save(vAgi) 
+eor sAsu_, sAse, sE1 SEP rax1_m1 E0, C4, C1 +eor sAme_, sAga, sE0 SEP +eor sAbe_, sAge, sE1 SEP /* 25x XAR, 75 in total */ +load_constant_ptr SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP .unreq vvtmp +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP vvtmp .req C1 +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP vvtmpq .req C1q +eor sAgi, tmp, sAgi_, ROR #58 SEP xar_m1 vBgi, vAka, E0, 61 +bic tmp, sAga_, sAgu_, ROR #31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP xar_m1 vBga, vAbo, E3, 36 +bic tmp, sAge_, sAga_, ROR #56 SEP +eor sAgu, tmp, sAgu_, ROR #23 SEP save(vAga) +bic tmp, sAki_, sAke_, ROR #19 SEP +eor sAka, tmp, sAka_, ROR #24 SEP xar_m1 vBbo, vAmo, E3, 43 +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP xar_m1 vBmo, vAmi, E2, 49 +bic tmp, sAku_, sAko_, ROR #10 SEP save(vAge) +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP xar_m1 vBmi, vAke, E1, 54 +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP xar_m1 vBge, vAgu, E4, 44 +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP bcax_m1 vAga, vBga, vBgi, vBge +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP eor vBba.16b, vAba.16b, E0.16b +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP xar_m1 vBsa, vAbi, E2, 2 +eor sAmi, tmp, sAmi_, ROR #46 SEP xar_m1 vBbi, vAki, E2, 21 +ldr cur_const, [const_addr] SEP +mov count, #1 SEP xar_m1 vBki, vAko, E3, 39 +bic tmp, sAma_, sAmu_, ROR #35 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP xar_m1 vBko, vAmu, E4, 56 +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP xar_m1 vBmu, vAso, E3, 8 +bic tmp, sAsi_, sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP xar_m1 vBso, vAma, E0, 23 +bic tmp, sAso_, sAsi_, ROR #2 SEP xar_m1 vBka, vAbe, E1, 63 +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP xar_m1 vBse, vAgo, E3, 9 +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP 
xar_m1 vBgo, vAme, E1, 19 +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP bcax_m1 vAge, vBge, vBgo, vBgi +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP restore_as(vvtmpq, vAgi) +eor s_Aba, s_Aba_, tmp, ROR #21 SEP xar_m1 vBke, vvtmp, E2, 58 +bic tmp, sAbo_, sAbi_, ROR #42 SEP +eor sAbe, tmp, sAbe_, ROR #41 SEP xar_m1 vBgu, vAsi, E2, 3 +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP bcax_m1 vAgi, vBgi, vBgu, vBgo +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP xar_m1 vBsi, vAku, E4, 25 +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP xar_m1 vBku, vAsa, E0, 46 +eor s_Aba, s_Aba, cur_const SEP xar_m1 vBma, vAbu, E4, 37 +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP xar_m1 vBbu, vAsu, E4, 50 +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP xar_m1 vBsu, vAse, E1, 62 +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP ldp vvtmpq, E3q, [sp, #(STACK_BASE_TMP_VREGS + 16*vAga_offset)] +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP xar_m1 vBme, vvtmp, E0, 28 +eor sC2, sAki, sC2, ROR #26 SEP xar_m1 vBbe, E3, E1, 20 +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP /* 25x BCAX, 50 in total */ +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP bcax_m1 vAgo, vBgo, vBga, vBgu +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP bcax_m1 vAgu, vBgu, vBge, vBga +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP bcax_m1 vAka, vBka, vBki, vBke +eor sC1, sAke, sC1, ROR #57 SEP bcax_m1 vAke, vBke, vBko, vBki +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP .unreq vvtmp +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP .unreq vvtmpq +ror sC4, sC4, 58 SEP +ror sC2, sC2, 62 SEP eor2 C0, vAka, vAga +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP save(vAga) +eor sE0, sC4, sC1, ROR #63 SEP vvtmp .req vAga 
+eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP vvtmpq .req vAgaq +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP bcax_m1 vAki, vBki, vBku, vBko +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP bcax_m1 vAko, vBko, vBka, vBku +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP eor2 C1, vAke, vAge +eor sAso_, sE0, sAma, ROR #54 SEP bcax_m1 vAku, vBku, vBke, vBka +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP eor2 C2, vAki, vAgi +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP bcax_m1 vAma, vBma, vBmi, vBme +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP eor2 C3, vAko, vAgo +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP bcax_m1 vAme, vBme, vBmo, vBmi +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP eor2 C4, vAku, vAgu +eor sAgu_, sE2, sAsi, ROR #62 SEP bcax_m1 vAmi, vBmi, vBmu, vBmo +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP eor2 C0, C0, vAma +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP bcax_m1 vAmo, vBmo, vBma, vBmu +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP eor2 C1, C1, vAme +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP bcax_m1 vAmu, vBmu, vBme, vBma +restore count, STACK_OFFSET_COUNT SEP eor2 C2, C2, vAmi +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP bcax_m1 vAsa, vBsa, vBsi, vBse +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP eor2 C3, C3, vAmo +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP bcax_m1 vAse, vBse, vBso, vBsi +bic tmp, sAga_, sAgu_, ROR #31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP eor2 C4, C4, vAmu +bic tmp, sAge_, sAga_, ROR #56 SEP bcax_m1 vAsi, vBsi, vBsu, vBso +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP eor2 C0, C0, vAsa +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP bcax_m1 
vAso, vBso, vBsa, vBsu +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP eor2 C1, C1, vAse +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP bcax_m1 vAsu, vBsu, vBse, vBsa +eor sAko, tmp, sAko_, ROR #57 SEP eor2 C2, C2, vAsi +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP eor2 C3, C3, vAso +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP bcax_m1 vAba, vBba, vBbi, vBbe +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP bcax_m1 vAbe, vBbe, vBbo, vBbi +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP eor2 C1, C1, vAbe +bic tmp, sAma_, sAmu_, ROR #35 SEP restore x26, STACK_OFFSET_CONST +eor sAmo, tmp, sAmo_, ROR #12 SEP ldr vvtmpq, [x26], #16 +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP save x26, STACK_OFFSET_CONST +bic tmp, sAsi_, sAse_, ROR #48 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP eor vAba.16b, vAba.16b, vvtmp.16b +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP eor2 C4, C4, vAsu +bic tmp, sAsu_, sAso_, ROR #25 SEP bcax_m1 vAbi, vBbi, vBbu, vBbo +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP bcax_m1 vAbo, vBbo, vBba, vBbu +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP eor2 C3, C3, vAbo +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP eor2 C2, C2, vAbi +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP eor2 C0, C0, vAba +eor sAbe, tmp, sAbe_, ROR #41 SEP bcax_m1 vAbu, vBbu, vBbe, vBba +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP eor2 C4, C4, vAbu +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP restore(vAga) +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP .unreq vvtmp +add count, count, #1 SEP +eor s_Aba, s_Aba, cur_const SEP .unreq vvtmpq +.endm + + +.macro hybrid_round_noninitial +save 
count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP vvtmp .req vBba +eor sC1, sAse, sAge, ROR #60 SEP rax1_m1 E2, C1, C3 +eor sC2, sAmi, sAgi, ROR #59 SEP rax1_m1 E4, C3, C0 +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP rax1_m1 E1, C0, C2 +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP rax1_m1 E3, C2, C4 +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP save(vAgi) +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP rax1_m1 E0, C4, C1 +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP .unreq vvtmp +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP vvtmp .req C1 +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP vvtmpq .req C1q +ror sC4, sC4, 58 SEP +ror sC2, sC2, 62 SEP xar_m1 vBgi, vAka, E0, 61 +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP xar_m1 vBga, vAbo, E3, 36 +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP save(vAga) +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP xar_m1 vBbo, vAmo, E3, 43 +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP xar_m1 vBmo, vAmi, E2, 49 +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP save(vAge) +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP xar_m1 vBmi, vAke, E1, 54 +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP xar_m1 vBge, vAgu, E4, 44 +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP bcax_m1 vAga, vBga, vBgi, vBge +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP eor vBba.16b, vAba.16b, E0.16b +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP xar_m1 vBsa, vAbi, E2, 2 +eor sAsi_, sE4, sAku, ROR #58 SEP 
+eor sAku_, sE0, sAsa, ROR #25 SEP xar_m1 vBbi, vAki, E2, 21 +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP xar_m1 vBki, vAko, E3, 39 +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP xar_m1 vBko, vAmu, E4, 56 +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP xar_m1 vBmu, vAso, E3, 8 +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP xar_m1 vBso, vAma, E0, 23 +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP xar_m1 vBka, vAbe, E1, 63 +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP xar_m1 vBse, vAgo, E3, 9 +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP xar_m1 vBgo, vAme, E1, 19 +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP bcax_m1 vAge, vBge, vBgo, vBgi +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP restore_as(vvtmpq, vAgi) +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP xar_m1 vBke, vvtmp, E2, 58 +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP xar_m1 vBgu, vAsi, E2, 3 +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP bcax_m1 vAgi, vBgi, vBgu, vBgo +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP xar_m1 vBsi, vAku, E4, 25 +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP xar_m1 vBku, vAsa, E0, 46 +eor sAmi, tmp, sAmi_, ROR #46 SEP +bic tmp, sAma_, sAmu_, ROR #35 SEP xar_m1 vBma, vAbu, E4, 37 +ldr cur_const, [const_addr, count, UXTW #3] SEP +add count, count, #1 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP xar_m1 vBbu, vAsu, E4, 50 +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP xar_m1 vBsu, vAse, E1, 62 +bic tmp, sAsi_, sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP ldp vvtmpq, E3q, [sp, #(STACK_BASE_TMP_VREGS + 
16*vAga_offset)] +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP xar_m1 vBme, vvtmp, E0, 28 +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP xar_m1 vBbe, E3, E1, 20 +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP bcax_m1 vAgo, vBgo, vBga, vBgu +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m1 vAgu, vBgu, vBge, vBga +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP bcax_m1 vAka, vBka, vBki, vBke +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP bcax_m1 vAke, vBke, vBko, vBki +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP .unreq vvtmp +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP .unreq vvtmpq +eor s_Aba, s_Aba, cur_const SEP +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP eor2 C0, vAka, vAga +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP save(vAga) +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP vvtmp .req vAga +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP vvtmpq .req vAgaq +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP bcax_m1 vAki, vBki, vBku, vBko +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP bcax_m1 vAko, vBko, vBka, vBku +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP eor2 C1, vAke, vAge +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP bcax_m1 vAku, vBku, vBke, vBka +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP eor2 C2, vAki, vAgi +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP bcax_m1 vAma, vBma, vBmi, vBme +ror sC4, sC4, 58 SEP +ror sC2, sC2, 62 SEP eor2 C3, vAko, vAgo +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP bcax_m1 vAme, vBme, 
vBmo, vBmi +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP eor2 C4, vAku, vAgu +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP bcax_m1 vAmi, vBmi, vBmu, vBmo +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP eor2 C0, C0, vAma +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP bcax_m1 vAmo, vBmo, vBma, vBmu +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP eor2 C1, C1, vAme +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP bcax_m1 vAmu, vBmu, vBme, vBma +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP eor2 C2, C2, vAmi +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP bcax_m1 vAsa, vBsa, vBsi, vBse +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP eor2 C3, C3, vAmo +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP bcax_m1 vAse, vBse, vBso, vBsi +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP eor2 C4, C4, vAmu +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP bcax_m1 vAsi, vBsi, vBsu, vBso +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP eor2 C0, C0, vAsa +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP bcax_m1 vAso, vBso, vBsa, vBsu +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP eor2 C1, C1, vAse +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP bcax_m1 vAsu, vBsu, vBse, vBsa +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP eor2 C2, C2, vAsi +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP eor2 C3, C3, vAso +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP bcax_m1 vAba, vBba, vBbi, vBbe +bic tmp, sAku_, 
sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP bcax_m1 vAbe, vBbe, vBbo, vBbi +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP eor2 C1, C1, vAbe +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP restore x26, STACK_OFFSET_CONST +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP ldr vvtmpq, [x26], #16 +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP save x26, STACK_OFFSET_CONST +eor sAmi, tmp, sAmi_, ROR #46 SEP +bic tmp, sAma_, sAmu_, ROR #35 SEP eor vAba.16b, vAba.16b, vvtmp.16b +ldr cur_const, [const_addr, count, UXTW #3] SEP +add count, count, #1 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP eor2 C4, C4, vAsu +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP bcax_m1 vAbi, vBbi, vBbu, vBbo +bic tmp, sAsi_, sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP bcax_m1 vAbo, vBbo, vBba, vBbu +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP eor2 C3, C3, vAbo +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP eor2 C2, C2, vAbi +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP eor2 C0, C0, vAba +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m1 vAbu, vBbu, vBbe, vBba +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP eor2 C4, C4, vAbu +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP restore(vAga) +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP .unreq vvtmp +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP .unreq vvtmpq +eor s_Aba, s_Aba, cur_const SEP +.endm +.macro hybrid_round_final + SEP vvtmp .req vBba +save count, STACK_OFFSET_COUNT SEP rax1_m1 E2, C1, C3 +eor sC0, sAka, sAsa, ROR #50 SEP +eor sC1, sAse, sAge, ROR #60 SEP rax1_m1 E4, C3, C0 +eor sC2, sAmi, sAgi, ROR #59 SEP +eor 
sC3, sAgo, sAso, ROR #30 SEP rax1_m1 E1, C0, C2 +eor sC4, sAbu, sAsu, ROR #53 SEP +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP rax1_m1 E3, C2, C4 +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP save(vAgi) +eor sC3, sAbo, sC3, ROR #63 SEP +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP rax1_m1 E0, C4, C1 +ror sC4, sC4, 58 SEP +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP .unreq vvtmp +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP vvtmp .req C1 +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP vvtmpq .req C1q +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP xar_m1 vBgi, vAka, E0, 61 +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP xar_m1 vBga, vAbo, E3, 36 +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP +eor sAbo_, sE3, sAmo, ROR #37 SEP save(vAga) +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP xar_m1 vBbo, vAmo, E3, 43 +eor sAgu_, sE2, sAsi, ROR #62 SEP +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP xar_m1 vBmo, vAmi, E2, 49 +eor sAbu_, sE4, sAsu, ROR #9 SEP +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP save(vAge) +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP xar_m1 vBmi, vAke, E1, 
54 +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP xar_m1 vBge, vAgu, E4, 44 +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP bcax_m1 vAga, vBga, vBgi, vBge +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP eor vBba.16b, vAba.16b, E0.16b +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP xar_m1 vBsa, vAbi, E2, 2 +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP xar_m1 vBbi, vAki, E2, 21 +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP xar_m1 vBki, vAko, E3, 39 +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP xar_m1 vBko, vAmu, E4, 56 +eor sAmi, tmp, sAmi_, ROR #46 SEP +bic tmp, sAma_, sAmu_, ROR #35 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP xar_m1 vBmu, vAso, E3, 8 +add count, count, #1 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP xar_m1 vBso, vAma, E0, 23 +bic tmp, sAsi_, sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP xar_m1 vBka, vAbe, E1, 63 +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP xar_m1 vBse, vAgo, E3, 9 +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP xar_m1 vBgo, vAme, E1, 19 +bic tmp, sAbi_, sAbe_, ROR #63 SEP +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP bcax_m1 vAge, vBge, vBgo, vBgi +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, 
tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP restore_as(vvtmpq, vAgi) +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP xar_m1 vBke, vvtmp, E2, 58 +eor s_Aba, s_Aba, cur_const SEP +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP +eor sC1, sAse, sAge, ROR #60 SEP xar_m1 vBgu, vAsi, E2, 3 +eor sC2, sAmi, sAgi, ROR #59 SEP +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP bcax_m1 vAgi, vBgi, vBgu, vBgo +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 SEP xar_m1 vBsi, vAku, E4, 25 +eor sC4, sAmu, sC4, ROR #56 SEP +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP xar_m1 vBku, vAsa, E0, 46 +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP xar_m1 vBma, vAbu, E4, 37 +eor sC0, s_Aba, sC0, ROR #61 SEP +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP xar_m1 vBbu, vAsu, E4, 50 +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP xar_m1 vBsu, vAse, E1, 62 +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP ldp vvtmpq, E3q, [sp, #(STACK_BASE_TMP_VREGS + 16*vAga_offset)] +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP xar_m1 vBme, vvtmp, E0, 28 +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP xar_m1 vBbe, E3, E1, 20 +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP bcax_m1 vAgo, vBgo, vBga, vBgu +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP bcax_m1 vAgu, vBgu, vBge, vBga +eor sAga_, sE3, sAbo SEP +eor sAbo_, sE3, 
sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP bcax_m1 vAka, vBka, vBki, vBke +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP bcax_m1 vAke, vBke, vBko, vBki +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP bcax_m1 vAki, vBki, vBku, vBko +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP bcax_m1 vAko, vBko, vBka, vBku +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP bcax_m1 vAku, vBku, vBke, vBka +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP bcax_m1 vAma, vBma, vBmi, vBme +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP bcax_m1 vAme, vBme, vBmo, vBmi +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP +eor sAka, tmp, sAka_, ROR #24 SEP bcax_m1 vAmi, vBmi, vBmu, vBmo +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP bcax_m1 vAmo, vBmo, vBma, vBmu +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP bcax_m1 vAmu, vBmu, vBme, vBma +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP bcax_m1 vAsa, vBsa, vBsi, vBse +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP bcax_m1 vAse, vBse, vBso, vBsi +bic tmp, sAma_, sAmu_, ROR #35 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP +add count, count, #1 SEP bcax_m1 vAsi, vBsi, vBsu, vBso +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 
SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP bcax_m1 vAso, vBso, vBsa, vBsu +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP bcax_m1 vAsu, vBsu, vBse, vBsa +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP bcax_m1 vAba, vBba, vBbi, vBbe +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m1 vAbe, vBbe, vBbo, vBbi +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP bcax_m1 vAbi, vBbi, vBbu, vBbo +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP bcax_m1 vAbo, vBbo, vBba, vBbu +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP +eor s_Aba, s_Aba, cur_const SEP bcax_m1 vAbu, vBbu, vBbe, vBba +ror sAga, sAga,(64-3) SEP +ror sAka, sAka,(64-25) SEP +ror sAma, sAma,(64-10) SEP +ror sAsa, sAsa,(64-39) SEP restore x26, STACK_OFFSET_CONST +ror sAbe, sAbe,(64-21) SEP +ror sAge, sAge,(64-45) SEP +ror sAke, sAke,(64-8) SEP ldr vvtmpq, [x26], #16 +ror sAme, sAme,(64-15) SEP +ror sAse, sAse,(64-41) SEP +ror sAbi, sAbi,(64-14) SEP +ror sAgi, sAgi,(64-61) SEP save x26, STACK_OFFSET_CONST +ror sAki, sAki,(64-18) SEP +ror sAmi, sAmi,(64-56) SEP +ror sAsi, sAsi,(64-2) SEP eor vAba.16b, vAba.16b, vvtmp.16b +ror sAgo, sAgo,(64-28) SEP +ror sAko, sAko,(64-1) SEP +ror sAmo, sAmo,(64-27) SEP +ror sAso, sAso,(64-62) SEP .unreq vvtmp +ror sAbu, sAbu,(64-44) SEP +ror sAgu, sAgu,(64-20) SEP +ror sAku, sAku,(64-6) SEP .unreq vvtmpq +ror sAmu, sAmu,(64-36) SEP +ror sAsu, sAsu,(64-55) SEP +.endm + +#define KECCAK_F1600_ROUNDS 24 /* round count; bounds the inner loop of the entry point below */ + +.global keccak_f1600_x4_hybrid_asm_v5p +.global _keccak_f1600_x4_hybrid_asm_v5p +.text +.align 4 + +keccak_f1600_x4_hybrid_asm_v5p: /* entry point; the underscore-prefixed label on the next line is the same address under a second exported name */ +_keccak_f1600_x4_hybrid_asm_v5p: + alloc_stack + save_gprs + save_vregs
+ save input_addr, STACK_OFFSET_INPUT /* kept on the stack; reloaded right before store_input_vector */ + + ASM_LOAD(const_addr,round_constants_vec) + save const_addr, STACK_OFFSET_CONST /* the round macros reload this pointer into x26 and post-increment it by 16 per vector round */ + + load_input_vector + + add input_addr, input_addr, #16 /* the scalar loads below start 16 bytes further on; presumably past the two lanes held in vector registers - TODO confirm */ + + mov out_count, #0 /* outer loop body runs for out_count = 0 and 1 */ +outer_loop: + save out_count, STACK_OFFSET_COUNT_OUT + + load_input_scalar + save input_addr, STACK_OFFSET_CUR_INPUT + + hybrid_round_initial +1: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-3) /* repeat hybrid_round_noninitial while count < ROUNDS-3 */ + blt 1b + hybrid_round_final + + restore input_addr, STACK_OFFSET_CUR_INPUT + store_input_scalar + add input_addr, input_addr, #8 /* the next outer iteration uses a scalar base 8 bytes further on */ + + restore out_count, STACK_OFFSET_COUNT_OUT + add out_count, out_count, #1 + cmp out_count, #2 + blt outer_loop + + restore input_addr, STACK_OFFSET_INPUT + store_input_vector + + restore_vregs + restore_gprs + free_stack + ret diff --git a/asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v6.s b/asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v6.s new file mode 100644 index 0000000..183fa2c --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v6.s @@ -0,0 +1,1385 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" +#if defined(__ARM_FEATURE_SHA3) + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: /* 24 round constants, one 64-bit word each; read by the scalar code with an 8-byte stride */ + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +round_constants_vec: /* the same 24 constants, each stored twice so that one 16-byte load fills both 64-bit lanes of a vector register */ + .quad 0x0000000000000001 + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008089 + .quad
0x8000000000008003 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + .quad 0x8000000080008008 +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x29 + count .req w27 /* x27 is also sC2 and sE3 further down, so count is spilled to STACK_OFFSET_COUNT while those are live */ + cur_const .req x26 + + /* Mapping of Keccak-f1600 SIMD state to vector registers + * at the beginning and end of each round. */ + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. */ + vAba .req v0 + vAbe .req v1 + vAbi .req v2 + vAbo .req v3 + vAbu .req v4 + vAga .req v5 + vAge .req v6 + vAgi .req v7 + vAgo .req v8 + vAgu .req v9 + vAka .req v10 + vAke .req v11 + vAki .req v12 + vAko .req v13 + vAku .req v14 + vAma .req v15 + vAme .req v16 + vAmi .req v17 + vAmo .req v18 + vAmu .req v19 + vAsa .req v20 + vAse .req v21 + vAsi .req v22 + vAso .req v23 + vAsu .req v24 + + /* q-form of the above mapping */ + vAbaq .req q0 + vAbeq .req q1 + vAbiq .req q2 + vAboq .req q3 + vAbuq .req q4 + vAgaq .req q5 + vAgeq .req q6 + vAgiq .req q7 + vAgoq .req q8 + vAguq .req q9 + vAkaq .req q10 + vAkeq .req q11 + vAkiq .req q12 + vAkoq .req q13 + vAkuq .req q14 + vAmaq .req q15 + vAmeq .req q16 + vAmiq .req q17 + vAmoq .req q18 + vAmuq .req q19 + vAsaq .req q20 + vAseq .req q21 + vAsiq .req q22 + vAsoq .req q23 + vAsuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v27 + C1 .req v28 + C2 .req v29 + C3 .req v30 + C4 .req v31 + + C0q .req q27 + C1q .req q28 + C2q .req q29 + C3q .req q30 + C4q .req q31 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vBba .req v25 // fresh + vBbe .req v26 // fresh +
vBbi .req vAbi /* from here on the B names alias A registers; only vBba (v25) and vBbe (v26) are separate */ + vBbo .req vAbo + vBbu .req vAbu + vBga .req vAka + vBge .req vAke + vBgi .req vAgi + vBgo .req vAgo + vBgu .req vAgu + vBka .req vAma + vBke .req vAme + vBki .req vAki + vBko .req vAko + vBku .req vAku + vBma .req vAsa + vBme .req vAse + vBmi .req vAmi + vBmo .req vAmo + vBmu .req vAmu + vBsa .req vAba + vBse .req vAbe + vBsi .req vAsi + vBso .req vAso + vBsu .req vAsu + + vBbaq .req q25 // fresh + vBbeq .req q26 // fresh + vBbiq .req vAbiq + vBboq .req vAboq + vBbuq .req vAbuq + vBgaq .req vAkaq + vBgeq .req vAkeq + vBgiq .req vAgiq + vBgoq .req vAgoq + vBguq .req vAguq + vBkaq .req vAmaq + vBkeq .req vAmeq + vBkiq .req vAkiq + vBkoq .req vAkoq + vBkuq .req vAkuq + vBmaq .req vAsaq + vBmeq .req vAseq + vBmiq .req vAmiq + vBmoq .req vAmoq + vBmuq .req vAmuq + vBsaq .req vAbaq + vBseq .req vAbeq + vBsiq .req vAsiq + vBsoq .req vAsoq + vBsuq .req vAsuq + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req C4 + E1 .req C0 + E2 .req vBbe // fresh + E3 .req C2 + E4 .req C3 + + E0q .req C4q + E1q .req C0q + E2q .req vBbeq // fresh + E3q .req C2q + E4q .req C3q + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round.
*/ + s_Aba .req x1 + sAbe .req x6 + sAbi .req x11 + sAbo .req x16 + sAbu .req x21 + sAga .req x2 + sAge .req x7 + sAgi .req x12 + sAgo .req x17 + sAgu .req x22 + sAka .req x3 + sAke .req x8 + sAki .req x13 + sAko .req x18 + sAku .req x23 + sAma .req x4 + sAme .req x9 + sAmi .req x14 + sAmo .req x19 + sAmu .req x24 + sAsa .req x5 + sAse .req x10 + sAsi .req x15 + sAso .req x20 + sAsu .req x25 + + /* sA_[y,2*x+3*y] = rot(A[x,y]) */ + s_Aba_ .req x0 /* x0 is also input_addr and sC0 */ + sAbe_ .req x28 + sAbi_ .req x11 + sAbo_ .req x16 + sAbu_ .req x21 + sAga_ .req x3 + sAge_ .req x8 + sAgi_ .req x12 + sAgo_ .req x17 + sAgu_ .req x22 + sAka_ .req x4 + sAke_ .req x9 + sAki_ .req x13 + sAko_ .req x18 + sAku_ .req x23 + sAma_ .req x5 + sAme_ .req x10 + sAmi_ .req x14 + sAmo_ .req x19 + sAmu_ .req x24 + sAsa_ .req x1 + sAse_ .req x6 + sAsi_ .req x15 + sAso_ .req x20 + sAsu_ .req x25 + + /* sC[x] = sA[x,0] xor sA[x,1] xor sA[x,2] xor sA[x,3] xor sA[x,4], for x in 0..4 */ + /* sE[x] = sC[x-1] xor rot(sC[x+1],1), for x in 0..4 */ + sC0 .req x0 + sE0 .req x29 + sC1 .req x26 + sE1 .req x30 + sC2 .req x27 + sE2 .req x26 + sC3 .req x28 + sE3 .req x27 + sC4 .req x29 + sE4 .req x28 + + tmp .req x30 /* x30 is also sE1 */ + +/************************ MACROS ****************************/ + +/* Macros using v8.4-A SHA-3 instructions */ + +.macro eor3_m0 d s0 s1 s2 /* d = s0 ^ s1 ^ s2 (EOR3) */ + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m0 d s0 s1 /* d = s0 ^ rol(s1, 1) per 64-bit lane (RAX1) */ + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm /* d = ror(s0 ^ s1, imm) per 64-bit lane (XAR) */ + xar \d\().2d, \s0\().2d, \s1\().2d, #\imm +.endm + +.macro bcax_m0 d s0 s1 s2 /* d = s0 ^ (s1 & ~s2) (BCAX) */ + bcax \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro eor3_m1_0 d s0 s1 s2 /* first half of the plain-Neon three-way xor: d = s0 ^ s1; s2 is unused here */ + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 /* d = s0 ^ s1 */ + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m1_1 d s0 s1 s2 /* second half: d = d ^ s2; s0 and s1 are unused here */ + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro eor3_m1 d s0 s1 s2 /* d = s0 ^ s1 ^ s2 without SHA-3 instructions */ + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +.macro rax1_m1 d s0 s1 /* d = s0 ^ rol(s1, 1) with plain Neon; overwrites vvtmp */ + // Use add instead of SHL #1 + add
vvtmp.2d, \s1\().2d, \s1\().2d + sri vvtmp.2d, \s1\().2d, #63 + eor \d\().16b, vvtmp.16b, \s0\().16b +.endm + + .macro xar_m1 d s0 s1 imm /* d = ror(s0 ^ s1, imm) per 64-bit lane with plain Neon; note that s0 is overwritten with s0 ^ s1 */ + // Special cases where we can replace SHLs by ADDs + .if \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + // .elseif \imm == 62 + // eor \s0\().16b, \s0\().16b, \s1\().16b + // add \d\().2d, \s0\().2d, \s0\().2d + // add \d\().2d, \d\().2d, \d\().2d + // sri \d\().2d, \s0\().2d, #(62) + // .elseif \imm == 61 + // eor \s0\().16b, \s0\().16b, \s1\().16b + // add \d\().2d, \s0\().2d, \s0\().2d + // add \d\().2d, \d\().2d, \d\().2d + // add \d\().2d, \d\().2d, \d\().2d + // sri \d\().2d, \s0\().2d, #(61) + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + + .macro xar_m1_0 d s0 s1 imm /* xor half of xar_m1: s0 = s0 ^ s1; all three branches below emit the same instruction, d and imm do not affect the output */ + // Special cases where we can replace SHLs by ADDs + .if \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + .elseif \imm == 62 + eor \s0\().16b, \s0\().16b, \s1\().16b + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + .endif +.endm + + .macro xar_m1_1 d s0 s1 imm /* rotate half of xar_m1: d = ror(s0, imm); s1 is unused here */ + // Special cases where we can replace SHLs by ADDs + .if \imm == 63 + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + .elseif \imm == 62 + add \d\().2d, \s0\().2d, \s0\().2d + add \d\().2d, \d\().2d, \d\().2d + sri \d\().2d, \s0\().2d, #(62) + .else + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + +.macro bcax_m1 d s0 s1 s2 /* d = s0 ^ (s1 & ~s2) with plain Neon; overwrites vvtmp */ + bic vvtmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, vvtmp.16b, \s0\().16b +.endm + +.macro load_input_vector num idx /* load the 25 state vectors; vector i is read from byte offset 16*(num*i+idx) relative to input_addr */ + ldr vAbaq, [input_addr, #(16*(\num*0+\idx))] + ldr vAbeq, [input_addr, #(16*(\num*1+\idx))] + ldr vAbiq, [input_addr, #(16*(\num*2+\idx))] + ldr vAboq, [input_addr, #(16*(\num*3+\idx))] + ldr vAbuq, [input_addr, #(16*(\num*4+\idx))] + ldr vAgaq, [input_addr, #(16*(\num*5+\idx))] + ldr vAgeq, [input_addr,
#(16*(\num*6+\idx))] + ldr vAgiq, [input_addr, #(16*(\num*7+\idx))] + ldr vAgoq, [input_addr, #(16*(\num*8+\idx))] + ldr vAguq, [input_addr, #(16*(\num*9+\idx))] + ldr vAkaq, [input_addr, #(16*(\num*10+\idx))] + ldr vAkeq, [input_addr, #(16*(\num*11+\idx))] + ldr vAkiq, [input_addr, #(16*(\num*12+\idx))] + ldr vAkoq, [input_addr, #(16*(\num*13+\idx))] + ldr vAkuq, [input_addr, #(16*(\num*14+\idx))] + ldr vAmaq, [input_addr, #(16*(\num*15+\idx))] + ldr vAmeq, [input_addr, #(16*(\num*16+\idx))] + ldr vAmiq, [input_addr, #(16*(\num*17+\idx))] + ldr vAmoq, [input_addr, #(16*(\num*18+\idx))] + ldr vAmuq, [input_addr, #(16*(\num*19+\idx))] + ldr vAsaq, [input_addr, #(16*(\num*20+\idx))] + ldr vAseq, [input_addr, #(16*(\num*21+\idx))] + ldr vAsiq, [input_addr, #(16*(\num*22+\idx))] + ldr vAsoq, [input_addr, #(16*(\num*23+\idx))] + ldr vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_vector num idx /* store counterpart of load_input_vector; vector i is written to byte offset 16*(num*i+idx) relative to input_addr */ + str vAbaq, [input_addr, #(16*(\num*0+\idx))] + str vAbeq, [input_addr, #(16*(\num*1+\idx))] + str vAbiq, [input_addr, #(16*(\num*2+\idx))] + str vAboq, [input_addr, #(16*(\num*3+\idx))] + str vAbuq, [input_addr, #(16*(\num*4+\idx))] + str vAgaq, [input_addr, #(16*(\num*5+\idx))] + str vAgeq, [input_addr, #(16*(\num*6+\idx))] + str vAgiq, [input_addr, #(16*(\num*7+\idx))] + str vAgoq, [input_addr, #(16*(\num*8+\idx))] + str vAguq, [input_addr, #(16*(\num*9+\idx))] + str vAkaq, [input_addr, #(16*(\num*10+\idx))] + str vAkeq, [input_addr, #(16*(\num*11+\idx))] + str vAkiq, [input_addr, #(16*(\num*12+\idx))] + str vAkoq, [input_addr, #(16*(\num*13+\idx))] + str vAkuq, [input_addr, #(16*(\num*14+\idx))] + str vAmaq, [input_addr, #(16*(\num*15+\idx))] + str vAmeq, [input_addr, #(16*(\num*16+\idx))] + str vAmiq, [input_addr, #(16*(\num*17+\idx))] + str vAmoq, [input_addr, #(16*(\num*18+\idx))] + str vAmuq, [input_addr, #(16*(\num*19+\idx))] + str vAsaq, [input_addr, #(16*(\num*20+\idx))] + str vAseq, [input_addr, #(16*(\num*21+\idx))] + str vAsiq, [input_addr,
#(16*(\num*22+\idx))] + str vAsoq, [input_addr, #(16*(\num*23+\idx))] + str vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_scalar num idx /* store the 25 scalar lanes; lane i is written to byte offset 8*(num*i+idx) relative to input_addr */ + str s_Aba, [input_addr, 8*(\num*(0) +\idx)] + str sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + str sAbi, [input_addr, 8*(\num*(2)+ \idx)] + str sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + str sAbu, [input_addr, 8*(\num*(4)+ \idx)] + str sAga, [input_addr, 8*(\num*(4+1) +\idx)] + str sAge, [input_addr, 8*(\num*(6)+ \idx)] + str sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + str sAgo, [input_addr, 8*(\num*(8)+ \idx)] + str sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + str sAka, [input_addr, 8*(\num*(10) +\idx)] + str sAke, [input_addr, 8*(\num*(10+1)+\idx)] + str sAki, [input_addr, 8*(\num*(12) +\idx)] + str sAko, [input_addr, 8*(\num*(12+1)+\idx)] + str sAku, [input_addr, 8*(\num*(14) +\idx)] + str sAma, [input_addr, 8*(\num*(14+1)+\idx)] + str sAme, [input_addr, 8*(\num*(16) +\idx)] + str sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + str sAmo, [input_addr, 8*(\num*(18) +\idx)] + str sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + str sAsa, [input_addr, 8*(\num*(20) +\idx)] + str sAse, [input_addr, 8*(\num*(20+1)+\idx)] + str sAsi, [input_addr, 8*(\num*(22) +\idx)] + str sAso, [input_addr, 8*(\num*(22+1)+\idx)] + str sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +.macro load_input_scalar num idx /* load counterpart of store_input_scalar; lane i is read from byte offset 8*(num*i+idx) relative to input_addr */ + ldr s_Aba, [input_addr, 8*(\num*(0) +\idx)] + ldr sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + ldr sAbi, [input_addr, 8*(\num*(2)+ \idx)] + ldr sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + ldr sAbu, [input_addr, 8*(\num*(4)+ \idx)] + ldr sAga, [input_addr, 8*(\num*(4+1) +\idx)] + ldr sAge, [input_addr, 8*(\num*(6)+ \idx)] + ldr sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + ldr sAgo, [input_addr, 8*(\num*(8)+ \idx)] + ldr sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + ldr sAka, [input_addr, 8*(\num*(10) +\idx)] + ldr sAke, [input_addr, 8*(\num*(10+1)+\idx)] + ldr sAki, [input_addr, 8*(\num*(12) +\idx)] + ldr sAko, [input_addr,
8*(\num*(12+1)+\idx)] + ldr sAku, [input_addr, 8*(\num*(14) +\idx)] + ldr sAma, [input_addr, 8*(\num*(14+1)+\idx)] + ldr sAme, [input_addr, 8*(\num*(16) +\idx)] + ldr sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + ldr sAmo, [input_addr, 8*(\num*(18) +\idx)] + ldr sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + ldr sAsa, [input_addr, 8*(\num*(20) +\idx)] + ldr sAse, [input_addr, 8*(\num*(20+1)+\idx)] + ldr sAsi, [input_addr, 8*(\num*(22) +\idx)] + ldr sAso, [input_addr, 8*(\num*(22+1)+\idx)] + ldr sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +#define STACK_SIZE (8*8 + 16*6 + 3*8 + 8 + 16*34) // VREGS (8*8), GPRs (16*6), count (8), const (8), input (8), padding (8), temporary 16-byte spill slots (16*34) +#define STACK_BASE_GPRS (3*8+8) +#define STACK_BASE_VREGS (3*8+8+16*6) +#define STACK_BASE_TMP (8*8 + 16*6 + 3*8 + 8) /* start of the 16-byte slots addressed by save(name) and restore(name) */ +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) + +#define vAga_offset 0 /* the name_offset values are slot indices in units of 16 bytes relative to STACK_BASE_TMP */ +#define E0_offset 1 +#define E1_offset 2 +#define E2_offset 3 +#define E3_offset 4 +#define E4_offset 5 +#define Ame_offset 7 +#define Agi_offset 8 +#define Aka_offset 9 +#define Abo_offset 10 +#define Amo_offset 11 +#define Ami_offset 12 +#define Ake_offset 13 +#define Agu_offset 14 +#define Asi_offset 15 +#define Aku_offset 16 +#define Asa_offset 17 +#define Abu_offset 18 +#define Asu_offset 19 +#define Ase_offset 20 +//#define Aga_offset 21 +#define Age_offset 22 +#define vBgo_offset 23 +#define vBke_offset 24 +#define vBgi_offset 25 +#define vBga_offset 26 +#define vBbo_offset 27 +#define vBmo_offset 28 +#define vBmi_offset 29 +#define vBge_offset 30 + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] + + +.macro save_gprs /* store x19-x30 in six pairs starting at sp + STACK_BASE_GPRS */ + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp,
#(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs /* reload x19-x30 from the slots written by save_gprs */ + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro save_vregs /* store d8-d15 (the low 64 bits of v8-v15) starting at sp + STACK_BASE_VREGS */ + stp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] + stp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + stp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + stp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] +.endm + +.macro restore_vregs /* reload d8-d15 from the slots written by save_vregs */ + ldp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] + ldp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + ldp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + ldp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] +.endm + +.macro alloc_stack /* reserve STACK_SIZE bytes below sp */ + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack /* release the STACK_SIZE bytes reserved by alloc_stack */ + add sp, sp, #(STACK_SIZE) +.endm + +.macro eor5 dst, src0, src1, src2, src3, src4 /* dst = src0 ^ src1 ^ src2 ^ src3 ^ src4 */ + eor \dst, \src0, \src1 + eor \dst, \dst, \src2 + eor \dst, \dst, \src3 + eor \dst, \dst, \src4 +.endm + +.macro xor_rol dst, src1, src0, imm /* dst = src0 ^ rol(src1, imm), written as ROR by 64-imm */ + eor \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro bic_rol dst, src1, src0, imm /* dst = src0 & ~rol(src1, imm) */ + bic \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro rotate dst, src, imm /* dst = rol(src, imm) */ + ror \dst, \src, #(64-\imm) +.endm + +.macro save reg, offset /* store reg at sp + offset; the cpp macro save(name) is a different thing and only expands when the name is followed by a parenthesis */ + str \reg, [sp, #\offset] +.endm + +.macro restore reg, offset /* load reg from sp + offset; see the note on save about the cpp macro of the same name */ + ldr \reg, [sp, #\offset] +.endm + +.macro hybrid_round_initial /* first hybrid round: scalar instructions left of SEP, Neon instructions right of it (SEP is not defined in this file, presumably in macros.s); the body holds two scalar passes (two cur_const loads) and one vector pass (one 16-byte constant load through x26) */ +eor sC0, sAma, sAsa SEP +eor sC1, sAme, sAse SEP eor3_m0 C1,vAbe,vAge,vAke +eor sC2, sAmi, sAsi SEP +eor sC3, sAmo, sAso SEP eor3_m1 C3,vAbo,vAgo,vAko +eor sC4, sAmu, sAsu SEP +eor sC0, sAka, sC0 SEP eor3_m0 C0,vAba,vAga,vAka +eor sC1, sAke, sC1 SEP +eor sC2, sAki, sC2 SEP eor3_m1 C2,vAbi,vAgi,vAki +eor sC3, sAko, sC3 SEP +eor sC4, sAku, sC4 SEP eor3_m0 C4,vAbu,vAgu,vAku +eor sC0, sAga, sC0 SEP +eor sC1, sAge, sC1 SEP eor3_m1 C1, C1,vAme, vAse +eor sC2, sAgi, sC2 SEP +eor sC3, sAgo, sC3 SEP eor3_m0 C3,
C3,vAmo, vAso +eor sC4, sAgu, sC4 SEP +eor sC0, s_Aba, sC0 SEP eor3_m1 C0, C0,vAma, vAsa +eor sC1, sAbe, sC1 SEP +eor sC2, sAbi, sC2 SEP eor3_m0 C2, C2,vAmi, vAsi +eor sC3, sAbo, sC3 SEP +eor sC4, sAbu, sC4 SEP eor3_m1 C4, C4,vAmu, vAsu +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP vvtmp .req vBba +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP rax1_m0 E2, C1, C3 +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, s_Aba, sE0 SEP +eor sAsa_, sAbi, sE2 SEP rax1_m1 E4, C3, C0 +eor sAbi_, sAki, sE2 SEP +eor sAki_, sAko, sE3 SEP rax1_m0 E1, C0, C2 +eor sAko_, sAmu, sE4 SEP +eor sAmu_, sAso, sE3 SEP rax1_m1 E3, C2, C4 +eor sAso_, sAma, sE0 SEP +eor sAka_, sAbe, sE1 SEP rax1_m0 E0, C4, C1 +eor sAse_, sAgo, sE3 SEP +eor sAgo_, sAme, sE1 SEP .unreq vvtmp +eor sAke_, sAgi, sE2 SEP +eor sAgi_, sAka, sE0 SEP vvtmp .req C1 +eor sAga_, sAbo, sE3 SEP +eor sAbo_, sAmo, sE3 SEP vvtmpq .req C1q +eor sAmo_, sAmi, sE2 SEP +eor sAmi_, sAke, sE1 SEP eor vBba.16b, vAba.16b, E0.16b +eor sAge_, sAgu, sE4 SEP +eor sAgu_, sAsi, sE2 SEP xar_m1 vBsa, vAbi, E2, 2 +eor sAsi_, sAku, sE4 SEP +eor sAku_, sAsa, sE0 SEP xar_m0 vBbi, vAki, E2, 21 +eor sAma_, sAbu, sE4 SEP +eor sAbu_, sAsu, sE4 SEP xar_m1 vBki, vAko, E3, 39 +eor sAsu_, sAse, sE1 SEP +eor sAme_, sAga, sE0 SEP +eor sAbe_, sAge, sE1 SEP xar_m0 vBko, vAmu, E4, 56 +load_constant_ptr SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP xar_m1 vBmu, vAso, E3, 8 +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP xar_m0 vBso, vAma, E0, 23 +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP xar_m1 vBka, vAbe, E1, 63 +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP xar_m0 vBse, vAgo, E3, 9 +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP xar_m1 vBgo, vAme, E1, 19 +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP xar_m0 vBke, vAgi, E2, 58 +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP xar_m1 vBgi, vAka, E0, 
61 +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP xar_m0 vBga, vAbo, E3, 36 +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP xar_m1 vBbo, vAmo, E3, 43 +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP xar_m0 vBmo, vAmi, E2, 49 +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP xar_m1 vBmi, vAke, E1, 54 +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP xar_m0 vBge, vAgu, E4, 44 +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP mov E3.16b, vAga.16b +ldr cur_const, [const_addr] SEP +mov count, #1 SEP bcax_m1 vAga, vBga, vBgi, vBge +bic tmp, sAma_, sAmu_, ROR #35 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP xar_m0 vBgu, vAsi, E2, 3 +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP xar_m1 vBsi, vAku, E4, 25 +bic tmp, sAsi_, sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP xar_m0 vBku, vAsa, E0, 46 +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP xar_m1 vBma, vAbu, E4, 37 +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP xar_m0 vBbu, vAsu, E4, 50 +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP xar_m1 vBsu, vAse, E1, 62 +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP xar_m0 vBme, E3, E0, 28 +bic tmp, sAbi_, sAbe_, ROR #63 SEP +eor s_Aba, s_Aba_, tmp, ROR #21 SEP xar_m1 vBbe, vAge, E1, 20 +bic tmp, sAbo_, sAbi_, ROR #42 SEP +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP bcax_m1 vAge, vBge, vBgo, vBgi +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP bcax_m0 vAgi, vBgi, vBgu, vBgo +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP bcax_m1 vAgo, vBgo, vBga, vBgu +eor sAbu, tmp, sAbu_, ROR #30 SEP +eor s_Aba, s_Aba, cur_const SEP bcax_m0 vAgu, vBgu, vBge, vBga +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP bcax_m1 vAka, 
vBka, vBki, vBke +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP bcax_m0 vAke, vBke, vBko, vBki +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP .unreq vvtmp +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP .unreq vvtmpq +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 SEP eor2 C0, vAka, vAga +eor sC4, sAmu, sC4, ROR #56 SEP +eor sC0, sAga, sC0, ROR #57 SEP save(vAga) +eor sC1, sAme, sC1, ROR #58 SEP +eor sC2, sAbi, sC2, ROR #60 SEP vvtmp .req vAga +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP vvtmpq .req vAgaq +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP bcax_m0 vAki, vBki, vBku, vBko +eor sC3, sAbo, sC3, ROR #63 SEP +eor sC4, sAku, sC4, ROR #50 SEP bcax_m1 vAko, vBko, vBka, vBku +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP eor2 C1, vAke, vAge +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP bcax_m0 vAku, vBku, vBke, vBka +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP eor2 C2, vAki, vAgi +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP bcax_m1 vAma, vBma, vBmi, vBme +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP eor2 C3, vAko, vAgo +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP bcax_m0 vAme, vBme, vBmo, vBmi +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP eor2 C4, vAku, vAgu +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP bcax_m1 vAmi, vBmi, vBmu, vBmo +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP eor2 C0, C0, vAma +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP bcax_m0 vAmo, vBmo, vBma, vBmu +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP eor2 C1, C1, vAme +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP bcax_m1 vAmu, vBmu, vBme, vBma +eor sAgu_, sE2, sAsi, ROR #62 SEP +eor sAsi_, sE4, sAku, ROR 
#58 SEP eor2 C2, C2, vAmi +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP bcax_m0 vAsa, vBsa, vBsi, vBse +eor sAbu_, sE4, sAsu, ROR #9 SEP +eor sAsu_, sE1, sAse, ROR #23 SEP eor2 C3, C3, vAmo +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP bcax_m1 vAse, vBse, vBso, vBsi +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP eor2 C4, C4, vAmu +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP bcax_m0 vAsi, vBsi, vBsu, vBso +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP eor2 C0, C0, vAsa +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP bcax_m1 vAso, vBso, vBsa, vBsu +bic tmp, sAga_, sAgu_, ROR #31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP eor2 C1, C1, vAse +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP bcax_m0 vAsu, vBsu, vBse, vBsa +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP eor2 C2, C2, vAsi +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP eor2 C3, C3, vAso +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP bcax_m1 vAba, vBba, vBbi, vBbe +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP bcax_m0 vAbe, vBbe, vBbo, vBbi +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP eor2 C1, C1, vAbe +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP restore x26, STACK_OFFSET_CONST +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP ldr vvtmpq, [x26], #16 +eor sAmi, tmp, sAmi_, ROR #46 SEP +bic tmp, sAma_, sAmu_, ROR #35 SEP save x26, STACK_OFFSET_CONST +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP eor vAba.16b, vAba.16b, vvtmp.16b +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP eor2 C4, C4, vAsu +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP bcax_m0 
vAbi, vBbi, vBbu, vBbo +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP bcax_m1 vAbo, vBbo, vBba, vBbu +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP eor2 C3, C3, vAbo +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP eor2 C2, C2, vAbi +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP eor2 C0, C0, vAba +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP bcax_m0 vAbu, vBbu, vBbe, vBba +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP eor2 C4, C4, vAbu +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP restore(vAga) +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP .unreq vvtmp +eor sAbu, tmp, sAbu_, ROR #30 SEP +add count, count, #1 SEP .unreq vvtmpq +eor s_Aba, s_Aba, cur_const SEP +.endm + + + +.macro hybrid_round_noninitial +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP vvtmp .req vBba +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP rax1_m0 E2, C1, C3 +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP rax1_m1 E4, C3, C0 +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP rax1_m0 E1, C0, C2 +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP rax1_m1 E3, C2, C4 +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP rax1_m0 E0, C4, C1 +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP .unreq vvtmp +eor sC0, s_Aba, sC0, ROR #61 SEP +eor sC1, sAke, sC1, ROR #57 SEP vvtmp .req C1 +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP vvtmpq .req C1q +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP eor vBba.16b, vAba.16b, E0.16b +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP xar_m1 vBsa, vAbi, E2, 2 +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, 
sC3, ROR #63 SEP xar_m0 vBbi, vAki, E2, 21 +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP xar_m1 vBki, vAko, E3, 39 +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP xar_m0 vBko, vAmu, E4, 56 +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP xar_m1 vBmu, vAso, E3, 8 +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP xar_m0 vBso, vAma, E0, 23 +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP xar_m1 vBka, vAbe, E1, 63 +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP xar_m0 vBse, vAgo, E3, 9 +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP xar_m1 vBgo, vAme, E1, 19 +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP xar_m0 vBke, vAgi, E2, 58 +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP xar_m1 vBgi, vAka, E0, 61 +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP xar_m0 vBga, vAbo, E3, 36 +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP xar_m1 vBbo, vAmo, E3, 43 +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP xar_m0 vBmo, vAmi, E2, 49 +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP xar_m1 vBmi, vAke, E1, 54 +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP xar_m0 vBge, vAgu, E4, 44 +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP mov E3.16b, vAga.16b +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP bcax_m1 vAga, vBga, vBgi, vBge +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP +eor sAka, tmp, sAka_, ROR #24 SEP xar_m0 vBgu, vAsi, E2, 3 +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP xar_m1 vBsi, vAku, E4, 25 +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, 
tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP xar_m0 vBku, vAsa, E0, 46 +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP xar_m1 vBma, vAbu, E4, 37 +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP xar_m0 vBbu, vAsu, E4, 50 +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP xar_m1 vBsu, vAse, E1, 62 +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP xar_m0 vBme, E3, E0, 28 +bic tmp, sAma_, sAmu_, ROR #35 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP xar_m1 vBbe, vAge, E1, 20 +add count, count, #1 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP bcax_m1 vAge, vBge, vBgo, vBgi +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP bcax_m0 vAgi, vBgi, vBgu, vBgo +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP bcax_m1 vAgo, vBgo, vBga, vBgu +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP bcax_m0 vAgu, vBgu, vBge, vBga +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP bcax_m1 vAka, vBka, vBki, vBke +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m0 vAke, vBke, vBko, vBki +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP .unreq vvtmp +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP .unreq vvtmpq +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP eor2 C0, vAka, vAga +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP save(vAga) +eor s_Aba, s_Aba, cur_const SEP +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP vvtmp .req vAga +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP vvtmpq .req vAgaq +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP bcax_m0 vAki, vBki, 
vBku, vBko +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP bcax_m1 vAko, vBko, vBka, vBku +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP eor2 C1, vAke, vAge +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP bcax_m0 vAku, vBku, vBke, vBka +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP eor2 C2, vAki, vAgi +eor sC0, s_Aba, sC0, ROR #61 SEP +eor sC1, sAke, sC1, ROR #57 SEP bcax_m1 vAma, vBma, vBmi, vBme +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP eor2 C3, vAko, vAgo +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP bcax_m0 vAme, vBme, vBmo, vBmi +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP eor2 C4, vAku, vAgu +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP bcax_m1 vAmi, vBmi, vBmu, vBmo +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP eor2 C0, C0, vAma +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP bcax_m0 vAmo, vBmo, vBma, vBmu +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP eor2 C1, C1, vAme +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP bcax_m1 vAmu, vBmu, vBme, vBma +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP eor2 C2, C2, vAmi +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP bcax_m0 vAsa, vBsa, vBsi, vBse +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP eor2 C3, C3, vAmo +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP bcax_m1 vAse, vBse, vBso, vBsi +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP eor2 C4, C4, vAmu +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP bcax_m0 vAsi, vBsi, vBsu, vBso +eor sAsu_, sE1, sAse, ROR #23 SEP 
+eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP eor2 C0, C0, vAsa +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP bcax_m1 vAso, vBso, vBsa, vBsu +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP eor2 C1, C1, vAse +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP bcax_m0 vAsu, vBsu, vBse, vBsa +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP eor2 C2, C2, vAsi +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP eor2 C3, C3, vAso +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP +eor sAka, tmp, sAka_, ROR #24 SEP bcax_m1 vAba, vBba, vBbi, vBbe +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP bcax_m0 vAbe, vBbe, vBbo, vBbi +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP eor2 C1, C1, vAbe +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP restore x26, STACK_OFFSET_CONST +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP ldr vvtmpq, [x26], #16 +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP save x26, STACK_OFFSET_CONST +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP eor vAba.16b, vAba.16b, vvtmp.16b +bic tmp, sAma_, sAmu_, ROR #35 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP eor2 C4, C4, vAsu +add count, count, #1 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP bcax_m0 vAbi, vBbi, vBbu, vBbo +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP bcax_m1 vAbo, vBbo, vBba, vBbu +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP eor2 C3, C3, vAbo +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP eor2 C2, C2, vAbi +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP 
eor2 C0, C0, vAba +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m0 vAbu, vBbu, vBbe, vBba +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP eor2 C4, C4, vAbu +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP restore(vAga) +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP .unreq vvtmp +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP .unreq vvtmpq +eor s_Aba, s_Aba, cur_const SEP +.endm + +.macro hybrid_round_final +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP vvtmp .req vBba +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP +eor sC3, sAgo, sAso, ROR #30 SEP rax1_m0 E2, C1, C3 +eor sC4, sAbu, sAsu, ROR #53 SEP +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP rax1_m1 E4, C3, C0 +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP +eor sC0, sAga, sC0, ROR #57 SEP rax1_m0 E1, C0, C2 +eor sC1, sAme, sC1, ROR #58 SEP +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP rax1_m1 E3, C2, C4 +eor sC0, s_Aba, sC0, ROR #61 SEP +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP rax1_m0 E0, C4, C1 +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP .unreq vvtmp +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP vvtmp .req C1 +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP vvtmpq .req C1q +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP eor vBba.16b, vAba.16b, E0.16b +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP +eor 
sAse_, sE3, sAgo, ROR #36 SEP xar_m0 vBsa, vAbi, E2, 2 +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP xar_m1 vBbi, vAki, E2, 21 +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP xar_m0 vBki, vAko, E3, 39 +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP xar_m1 vBko, vAmu, E4, 56 +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP xar_m0 vBmu, vAso, E3, 8 +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP xar_m1 vBso, vAma, E0, 23 +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP xar_m0 vBka, vAbe, E1, 63 +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP xar_m1 vBse, vAgo, E3, 9 +bic tmp, sAge_, sAga_, ROR #56 SEP +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP xar_m0 vBgo, vAme, E1, 19 +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP xar_m1 vBke, vAgi, E2, 58 +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP xar_m0 vBgi, vAka, E0, 61 +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP xar_m1 vBga, vAbo, E3, 36 +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP xar_m0 vBbo, vAmo, E3, 43 +bic tmp, sAma_, sAmu_, ROR #35 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP +add count, count, 
#1 SEP xar_m1 vBmo, vAmi, E2, 49 +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP xar_m0 vBmi, vAke, E1, 54 +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP xar_m1 vBge, vAgu, E4, 44 +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP mov E3.16b, vAga.16b +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP +eor s_Aba, s_Aba_, tmp, ROR #21 SEP bcax_m1 vAga, vBga, vBgi, vBge +bic tmp, sAbo_, sAbi_, ROR #42 SEP +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP xar_m0 vBgu, vAsi, E2, 3 +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP xar_m1 vBsi, vAku, E4, 25 +eor sAbu, tmp, sAbu_, ROR #30 SEP +eor s_Aba, s_Aba, cur_const SEP +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP xar_m0 vBku, vAsa, E0, 46 +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP xar_m1 vBma, vAbu, E4, 37 +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP xar_m0 vBbu, vAsu, E4, 50 +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP xar_m1 vBsu, vAse, E1, 62 +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP xar_m0 vBme, E3, E0, 28 +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP xar_m1 vBbe, vAge, E1, 20 +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP +ror sC2, sC2, 62 SEP bcax_m0 vAge, vBge, vBgo, vBgi +eor 
sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP bcax_m1 vAgi, vBgi, vBgu, vBgo +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP bcax_m0 vAgo, vBgo, vBga, vBgu +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP bcax_m1 vAgu, vBgu, vBge, vBga +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP bcax_m0 vAka, vBka, vBki, vBke +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP bcax_m1 vAke, vBke, vBko, vBki +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP bcax_m0 vAki, vBki, vBku, vBko +eor sAgu_, sE2, sAsi, ROR #62 SEP +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP bcax_m1 vAko, vBko, vBka, vBku +eor sAbu_, sE4, sAsu, ROR #9 SEP +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP bcax_m0 vAku, vBku, vBke, vBka +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP bcax_m1 vAma, vBma, vBmi, vBme +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP bcax_m0 vAme, vBme, vBmo, vBmi +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP bcax_m1 vAmi, vBmi, vBmu, vBmo +bic tmp, sAge_, sAga_, ROR #56 SEP +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP +eor sAka, tmp, sAka_, ROR #24 SEP bcax_m0 vAmo, vBmo, vBma, vBmu +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP bcax_m1 vAmu, vBmu, 
vBme, vBma +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP bcax_m0 vAsa, vBsa, vBsi, vBse +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP bcax_m1 vAse, vBse, vBso, vBsi +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP bcax_m0 vAsi, vBsi, vBsu, vBso +bic tmp, sAma_, sAmu_, ROR #35 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP +add count, count, #1 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP bcax_m1 vAso, vBso, vBsa, vBsu +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP bcax_m0 vAsu, vBsu, vBse, vBsa +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP bcax_m1 vAba, vBba, vBbi, vBbe +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP bcax_m0 vAbe, vBbe, vBbo, vBbi +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP bcax_m1 vAbi, vBbi, vBbu, vBbo +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP bcax_m0 vAbo, vBbo, vBba, vBbu +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP bcax_m1 vAbu, vBbu, vBbe, vBba +eor s_Aba, s_Aba, cur_const SEP +ror sAga, sAga,(64-3) SEP +ror sAka, sAka,(64-25) SEP +ror sAma, sAma,(64-10) SEP restore x26, STACK_OFFSET_CONST +ror sAsa, sAsa,(64-39) SEP +ror sAbe, sAbe,(64-21) SEP +ror sAge, sAge,(64-45) SEP ldr vvtmpq, [x26], #16 +ror sAke, sAke,(64-8) SEP +ror sAme, sAme,(64-15) SEP +ror sAse, sAse,(64-41) SEP +ror sAbi, sAbi,(64-14) 
SEP save x26, STACK_OFFSET_CONST +ror sAgi, sAgi,(64-61) SEP +ror sAki, sAki,(64-18) SEP +ror sAmi, sAmi,(64-56) SEP +ror sAsi, sAsi,(64-2) SEP eor vAba.16b, vAba.16b, vvtmp.16b +ror sAgo, sAgo,(64-28) SEP +ror sAko, sAko,(64-1) SEP +ror sAmo, sAmo,(64-27) SEP .unreq vvtmp +ror sAso, sAso,(64-62) SEP +ror sAbu, sAbu,(64-44) SEP +ror sAgu, sAgu,(64-20) SEP +ror sAku, sAku,(64-6) SEP .unreq vvtmpq +ror sAmu, sAmu,(64-36) SEP +ror sAsu, sAsu,(64-55) SEP +.endm + + +#define KECCAK_F1600_ROUNDS 24 + +.global keccak_f1600_x4_hybrid_asm_v6 +.global _keccak_f1600_x4_hybrid_asm_v6 +.text +.align 4 + +keccak_f1600_x4_hybrid_asm_v6: +_keccak_f1600_x4_hybrid_asm_v6: + alloc_stack + save_gprs + save_vregs + save input_addr, STACK_OFFSET_INPUT + + + ASM_LOAD(const_addr,round_constants_vec) + + save const_addr, STACK_OFFSET_CONST + load_input_vector 2,1 + + // First scalar Keccak computation alongside first half of SIMD computation + load_input_scalar 4,0 + hybrid_round_initial + loop_0: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-3) + ble loop_0 + + hybrid_round_final + + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4,0 + + // Second scalar Keccak computation alongside second half of SIMD computation + load_input_scalar 4,1 + hybrid_round_initial + loop_1: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-3) + ble loop_1 + + hybrid_round_final + + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4,1 + store_input_vector 2,1 + + restore_vregs + restore_gprs + free_stack + + + ret +#endif \ No newline at end of file diff --git a/asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v7.s b/asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v7.s new file mode 100644 index 0000000..661bda5 --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v7.s @@ -0,0 +1,1266 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby
granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" +#if defined(__ARM_FEATURE_SHA3) + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +round_constants_vec: + .quad 0x0000000000000001 + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x800000008000000a + .quad 
0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + .quad 0x8000000080008008 +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x26 + cur_const .req x26 /* same register as const_addr */ + count .req w27 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. */ + vAba .req v0 + vAbe .req v1 + vAbi .req v2 + vAbo .req v3 + vAbu .req v4 + vAga .req v5 + vAge .req v6 + vAgi .req v7 + vAgo .req v8 + vAgu .req v9 + vAka .req v10 + vAke .req v11 + vAki .req v12 + vAko .req v13 + vAku .req v14 + vAma .req v15 + vAme .req v16 + vAmi .req v17 + vAmo .req v18 + vAmu .req v19 + vAsa .req v20 + vAse .req v21 + vAsi .req v22 + vAso .req v23 + vAsu .req v24 + + /* q-form of the above mapping */ + vAbaq .req q0 + vAbeq .req q1 + vAbiq .req q2 + vAboq .req q3 + vAbuq .req q4 + vAgaq .req q5 + vAgeq .req q6 + vAgiq .req q7 + vAgoq .req q8 + vAguq .req q9 + vAkaq .req q10 + vAkeq .req q11 + vAkiq .req q12 + vAkoq .req q13 + vAkuq .req q14 + vAmaq .req q15 + vAmeq .req q16 + vAmiq .req q17 + vAmoq .req q18 + vAmuq .req q19 + vAsaq .req q20 + vAseq .req q21 + vAsiq .req q22 + vAsoq .req q23 + vAsuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v30 + C1 .req v29 + C2 .req v28 + C3 .req v27 + C4 .req v26 + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4. E0, E2, E3 and E4 reuse the registers of C4, C1, C2 and C3; only E1 (v25) does not overlap a C register. */ + E0 .req v26 + E1 .req v25 + E2 .req v29 + E3 .req v28 + E4 .req v27 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vAbi_ .req v2 + vAbo_ .req v3 + vAbu_ .req v4 + vAga_ .req v10 + vAge_ .req v11 + vAgi_ .req v7 + vAgo_ .req v8 + vAgu_ .req v9 + vAka_ .req v15 + vAke_ .req v16 + vAki_ .req v12 + vAko_ .req v13 + vAku_ .req v14 + vAma_ .req v20 + vAme_ .req v21 + vAmi_ .req v17 + vAmo_ .req v18 + vAmu_ .req v19 + vAsa_ .req v0 + vAse_ .req v1 + vAsi_ .req v22 + vAso_ .req v23 + vAsu_ .req
v24 + vAba_ .req v30 + vAbe_ .req v27 + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round. NOTE(review): sAko/sAko_ use x18, which AAPCS64 designates the platform register and some OSes reserve - confirm this is acceptable on every target this file is built for. */ + s_Aba .req x1 + sAbe .req x6 + sAbi .req x11 + sAbo .req x16 + sAbu .req x21 + sAga .req x2 + sAge .req x7 + sAgi .req x12 + sAgo .req x17 + sAgu .req x22 + sAka .req x3 + sAke .req x8 + sAki .req x13 + sAko .req x18 + sAku .req x23 + sAma .req x4 + sAme .req x9 + sAmi .req x14 + sAmo .req x19 + sAmu .req x24 + sAsa .req x5 + sAse .req x10 + sAsi .req x15 + sAso .req x20 + sAsu .req x25 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + s_Aba_ .req x30 + sAbe_ .req x28 + sAbi_ .req x11 + sAbo_ .req x16 + sAbu_ .req x21 + sAga_ .req x3 + sAge_ .req x8 + sAgi_ .req x12 + sAgo_ .req x17 + sAgu_ .req x22 + sAka_ .req x4 + sAke_ .req x9 + sAki_ .req x13 + sAko_ .req x18 + sAku_ .req x23 + sAma_ .req x5 + sAme_ .req x10 + sAmi_ .req x14 + sAmo_ .req x19 + sAmu_ .req x24 + sAsa_ .req x1 + sAse_ .req x6 + sAsi_ .req x15 + sAso_ .req x20 + sAsu_ .req x25 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + sC0 .req x30 + sE0 .req x29 + sC1 .req x26 + sE1 .req x0 + sC2 .req x27 + sE2 .req x26 + sC3 .req x28 + sE3 .req x27 + sC4 .req x29 + sE4 .req x28 + + tmp .req x0 /* same register as sE1 */ + +/************************ MACROS ****************************/ + +/* Macros using v8.4-A SHA-3 instructions. NOTE(review): rax1_m1 below hands the alias tmp (x0, a scalar register) to xar_m0 and to eor as a vector operand (tmp.2d / tmp.16b), so it presumably cannot assemble if it is ever expanded - confirm it is unused here or give it a vector temporary. */ + + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm + xar \d\().2d, \s0\().2d, \s1\().2d, #\imm +.endm + +.macro rax1_m1 d s0 s1 + xar_m0 tmp, vzr, \s1, 63 + eor \d\().16b, \s0\().16b, tmp.16b +.endm + +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + + +.macro load_input_vector num idx /* loads vAba..vAsu: state word i (0..24) comes from input_addr + 16*(num*i + idx) */ + ldr vAbaq, [input_addr, #(16*(\num*0+\idx))] + ldr
vAbeq, [input_addr, #(16*(\num*1+\idx))] + ldr vAbiq, [input_addr, #(16*(\num*2+\idx))] + ldr vAboq, [input_addr, #(16*(\num*3+\idx))] + ldr vAbuq, [input_addr, #(16*(\num*4+\idx))] + ldr vAgaq, [input_addr, #(16*(\num*5+\idx))] + ldr vAgeq, [input_addr, #(16*(\num*6+\idx))] + ldr vAgiq, [input_addr, #(16*(\num*7+\idx))] + ldr vAgoq, [input_addr, #(16*(\num*8+\idx))] + ldr vAguq, [input_addr, #(16*(\num*9+\idx))] + ldr vAkaq, [input_addr, #(16*(\num*10+\idx))] + ldr vAkeq, [input_addr, #(16*(\num*11+\idx))] + ldr vAkiq, [input_addr, #(16*(\num*12+\idx))] + ldr vAkoq, [input_addr, #(16*(\num*13+\idx))] + ldr vAkuq, [input_addr, #(16*(\num*14+\idx))] + ldr vAmaq, [input_addr, #(16*(\num*15+\idx))] + ldr vAmeq, [input_addr, #(16*(\num*16+\idx))] + ldr vAmiq, [input_addr, #(16*(\num*17+\idx))] + ldr vAmoq, [input_addr, #(16*(\num*18+\idx))] + ldr vAmuq, [input_addr, #(16*(\num*19+\idx))] + ldr vAsaq, [input_addr, #(16*(\num*20+\idx))] + ldr vAseq, [input_addr, #(16*(\num*21+\idx))] + ldr vAsiq, [input_addr, #(16*(\num*22+\idx))] + ldr vAsoq, [input_addr, #(16*(\num*23+\idx))] + ldr vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_vector num idx /* stores vAba..vAsu: state word i (0..24) goes to input_addr + 16*(num*i + idx) */ + str vAbaq, [input_addr, #(16*(\num*0+\idx))] + str vAbeq, [input_addr, #(16*(\num*1+\idx))] + str vAbiq, [input_addr, #(16*(\num*2+\idx))] + str vAboq, [input_addr, #(16*(\num*3+\idx))] + str vAbuq, [input_addr, #(16*(\num*4+\idx))] + str vAgaq, [input_addr, #(16*(\num*5+\idx))] + str vAgeq, [input_addr, #(16*(\num*6+\idx))] + str vAgiq, [input_addr, #(16*(\num*7+\idx))] + str vAgoq, [input_addr, #(16*(\num*8+\idx))] + str vAguq, [input_addr, #(16*(\num*9+\idx))] + str vAkaq, [input_addr, #(16*(\num*10+\idx))] + str vAkeq, [input_addr, #(16*(\num*11+\idx))] + str vAkiq, [input_addr, #(16*(\num*12+\idx))] + str vAkoq, [input_addr, #(16*(\num*13+\idx))] + str vAkuq, [input_addr, #(16*(\num*14+\idx))] + str vAmaq, [input_addr, #(16*(\num*15+\idx))] + str vAmeq, [input_addr, #(16*(\num*16+\idx))] + str vAmiq,
[input_addr, #(16*(\num*17+\idx))] + str vAmoq, [input_addr, #(16*(\num*18+\idx))] + str vAmuq, [input_addr, #(16*(\num*19+\idx))] + str vAsaq, [input_addr, #(16*(\num*20+\idx))] + str vAseq, [input_addr, #(16*(\num*21+\idx))] + str vAsiq, [input_addr, #(16*(\num*22+\idx))] + str vAsoq, [input_addr, #(16*(\num*23+\idx))] + str vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_scalar num idx /* stores s_Aba..sAsu: state word i (0..24) goes to input_addr + 8*(num*i + idx); odd i is spelled (i-1)+1 */ + str s_Aba, [input_addr, 8*(\num*(0) +\idx)] + str sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + str sAbi, [input_addr, 8*(\num*(2)+ \idx)] + str sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + str sAbu, [input_addr, 8*(\num*(4)+ \idx)] + str sAga, [input_addr, 8*(\num*(4+1) +\idx)] + str sAge, [input_addr, 8*(\num*(6)+ \idx)] + str sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + str sAgo, [input_addr, 8*(\num*(8)+ \idx)] + str sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + str sAka, [input_addr, 8*(\num*(10) +\idx)] + str sAke, [input_addr, 8*(\num*(10+1)+\idx)] + str sAki, [input_addr, 8*(\num*(12) +\idx)] + str sAko, [input_addr, 8*(\num*(12+1)+\idx)] + str sAku, [input_addr, 8*(\num*(14) +\idx)] + str sAma, [input_addr, 8*(\num*(14+1)+\idx)] + str sAme, [input_addr, 8*(\num*(16) +\idx)] + str sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + str sAmo, [input_addr, 8*(\num*(18) +\idx)] + str sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + str sAsa, [input_addr, 8*(\num*(20) +\idx)] + str sAse, [input_addr, 8*(\num*(20+1)+\idx)] + str sAsi, [input_addr, 8*(\num*(22) +\idx)] + str sAso, [input_addr, 8*(\num*(22+1)+\idx)] + str sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +.macro load_input_scalar num idx /* loads s_Aba..sAsu: state word i (0..24) comes from input_addr + 8*(num*i + idx) */ + ldr s_Aba, [input_addr, 8*(\num*(0) +\idx)] + ldr sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + ldr sAbi, [input_addr, 8*(\num*(2)+ \idx)] + ldr sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + ldr sAbu, [input_addr, 8*(\num*(4)+ \idx)] + ldr sAga, [input_addr, 8*(\num*(4+1) +\idx)] + ldr sAge, [input_addr, 8*(\num*(6)+ \idx)] + ldr sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + ldr sAgo,
[input_addr, 8*(\num*(8)+ \idx)] + ldr sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + ldr sAka, [input_addr, 8*(\num*(10) +\idx)] + ldr sAke, [input_addr, 8*(\num*(10+1)+\idx)] + ldr sAki, [input_addr, 8*(\num*(12) +\idx)] + ldr sAko, [input_addr, 8*(\num*(12+1)+\idx)] + ldr sAku, [input_addr, 8*(\num*(14) +\idx)] + ldr sAma, [input_addr, 8*(\num*(14+1)+\idx)] + ldr sAme, [input_addr, 8*(\num*(16) +\idx)] + ldr sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + ldr sAmo, [input_addr, 8*(\num*(18) +\idx)] + ldr sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + ldr sAsa, [input_addr, 8*(\num*(20) +\idx)] + ldr sAse, [input_addr, 8*(\num*(20+1)+\idx)] + ldr sAsi, [input_addr, 8*(\num*(22) +\idx)] + ldr sAso, [input_addr, 8*(\num*(22+1)+\idx)] + ldr sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +#define STACK_SIZE (8*8 + 16*6 + 4*8 + 16*5) // VREGS (8*8), GPRs (16*6), four 8-byte slots for input/const/count/input_scalar (4*8), five 16-byte spill slots for vAga..vAgu used by save()/restore() (16*5) +#define STACK_BASE_GPRS (4*8) +#define STACK_BASE_VREGS (4*8+16*6) +#define STACK_BASE_TMP (8*8 + 16*6 + 4*8) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) +#define STACK_OFFSET_INPUT_SCALAR (3*8) + +#define vAga_offset 0 +#define vAge_offset 1 +#define vAgi_offset 2 +#define vAgo_offset 3 +#define vAgu_offset 4 + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] + + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27,
x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro save_vregs + stp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] + stp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + stp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + stp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] +.endm + +.macro restore_vregs + ldp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] + ldp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + ldp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + ldp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] +.endm + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro eor5 dst, src0, src1, src2, src3, src4 + eor \dst, \src0, \src1 + eor \dst, \dst, \src2 + eor \dst, \dst, \src3 + eor \dst, \dst, \src4 +.endm + +.macro xor_rol dst, src1, src0, imm + eor \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro bic_rol dst, src1, src0, imm + bic \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro rotate dst, src, imm + ror \dst, \src, #(64-\imm) +.endm + +.macro save reg, offset + str \reg, [sp, #\offset] +.endm + +.macro restore reg, offset + ldr \reg, [sp, #\offset] +.endm + +.macro hybrid_round_initial +eor sC0, sAma, sAsa SEP +eor sC1, sAme, sAse SEP eor3_m0 C0, vAba, vAga, vAka +eor sC2, sAmi, sAsi SEP +eor sC3, sAmo, sAso SEP eor3_m0 C1, vAbe, vAge, vAke +eor sC4, sAmu, sAsu SEP +eor sC0, sAka, sC0 SEP eor3_m0 C2, vAbi, vAgi, vAki +eor sC1, sAke, sC1 SEP +eor sC2, sAki, sC2 SEP +eor sC3, sAko, sC3 SEP eor3_m0 C3, vAbo, vAgo, vAko +eor sC4, sAku, sC4 SEP +eor sC0, sAga, sC0 SEP eor3_m0 C4, vAbu, vAgu, vAku +eor sC1, sAge, sC1 SEP +eor sC2, sAgi, sC2 SEP save(vAga) +eor sC3, sAgo, sC3 SEP +eor sC4, sAgu, sC4 SEP +eor sC0, s_Aba, sC0 SEP vzr .req vAga +eor sC1, sAbe, sC1 SEP +eor sC2, sAbi, sC2 SEP eor vzr.16b, vzr.16b, vzr.16b +eor sC3, sAbo, sC3 SEP +eor sC4, sAbu, sC4 SEP +eor sE1, sC0, sC2, ROR #63 SEP save(vAge) +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP save(vAgi) +eor sE2, sC1, 
sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP save(vAgo) +eor s_Aba_, s_Aba, sE0 SEP +eor sAsa_, sAbi, sE2 SEP +eor sAbi_, sAki, sE2 SEP save(vAgu) +eor sAki_, sAko, sE3 SEP +eor sAko_, sAmu, sE4 SEP C0r .req vAge +eor sAmu_, sAso, sE3 SEP +eor sAso_, sAma, sE0 SEP +eor sAka_, sAbe, sE1 SEP C1r .req vAgi +eor sAse_, sAgo, sE3 SEP +eor sAgo_, sAme, sE1 SEP C2r .req vAgo +eor sAke_, sAgi, sE2 SEP +eor sAgi_, sAka, sE0 SEP C3r .req vAgu +eor sAga_, sAbo, sE3 SEP +eor sAbo_, sAmo, sE3 SEP +eor sAmo_, sAmi, sE2 SEP C4r .req v31 +eor sAmi_, sAke, sE1 SEP +eor sAge_, sAgu, sE4 SEP eor3_m0 C0, C0, vAma, vAsa +eor sAgu_, sAsi, sE2 SEP +eor sAsi_, sAku, sE4 SEP +eor sAku_, sAsa, sE0 SEP eor3_m0 C1, C1, vAme, vAse +eor sAma_, sAbu, sE4 SEP +eor sAbu_, sAsu, sE4 SEP eor3_m0 C2, C2, vAmi, vAsi +eor sAsu_, sAse, sE1 SEP +eor sAme_, sAga, sE0 SEP eor3_m0 C3, C3, vAmo, vAso +eor sAbe_, sAge, sE1 SEP +load_constant_ptr SEP +tmp0 .req x0 SEP eor3_m0 C4, C4, vAmu, vAsu +tmp1 .req x29 SEP +bic tmp0, sAgi_, sAge_, ROR #47 SEP xar_m0 C2r, vzr, C2, 63 +bic tmp1, sAgo_, sAgi_, ROR #42 SEP +eor sAga, tmp0, sAga_, ROR #39 SEP +bic tmp0, sAgu_, sAgo_, ROR #16 SEP xar_m0 C4r, vzr, C4, 63 +eor sAge, tmp1, sAge_, ROR #25 SEP +bic tmp1, sAga_, sAgu_, ROR #31 SEP xar_m0 C1r, vzr, C1, 63 +eor sAgi, tmp0, sAgi_, ROR #58 SEP +bic tmp0, sAge_, sAga_, ROR #56 SEP xar_m0 C3r, vzr, C3, 63 +eor sAgo, tmp1, sAgo_, ROR #47 SEP +bic tmp1, sAki_, sAke_, ROR #19 SEP +eor sAgu, tmp0, sAgu_, ROR #23 SEP xar_m0 C0r, vzr, C0, 63 +bic tmp0, sAko_, sAki_, ROR #47 SEP +eor sAka, tmp1, sAka_, ROR #24 SEP eor2 E1, C0, C2r +bic tmp1, sAku_, sAko_, ROR #10 SEP +eor sAke, tmp0, sAke_, ROR #2 SEP +bic tmp0, sAka_, sAku_, ROR #47 SEP restore(vAgo) +eor sAki, tmp1, sAki_, ROR #57 SEP +bic tmp1, sAke_, sAka_, ROR #5 SEP eor2 E3, C2, C4r +eor sAko, tmp0, sAko_, ROR #57 SEP +bic tmp0, sAmi_, sAme_, ROR #38 SEP restore(vAga) +eor sAku, tmp1, sAku_, ROR #52 SEP +bic tmp1, sAmo_, sAmi_, ROR #5 SEP +eor sAma, tmp0, sAma_, ROR 
#47 SEP eor2 E0, C4, C1r +bic tmp0, sAmu_, sAmo_, ROR #41 SEP +eor sAme, tmp1, sAme_, ROR #43 SEP restore(vAgi) +bic tmp1, sAma_, sAmu_, ROR #35 SEP +eor sAmi, tmp0, sAmi_, ROR #46 SEP +bic tmp0, sAme_, sAma_, ROR #9 SEP eor2 E2, C1, C3r +ldr cur_const, [const_addr] SEP +eor sAmo, tmp1, sAmo_, ROR #12 SEP restore(vAgu) +bic tmp1, sAsi_, sAse_, ROR #48 SEP +eor sAmu, tmp0, sAmu_, ROR #44 SEP eor2 E4, C3, C0r +bic tmp0, sAso_, sAsi_, ROR #2 SEP +eor sAsa, tmp1, sAsa_, ROR #41 SEP +bic tmp1, sAsu_, sAso_, ROR #25 SEP restore(vAge) +eor sAse, tmp0, sAse_, ROR #50 SEP +bic tmp0, sAsa_, sAsu_, ROR #60 SEP eor vAba_.16b, vAba.16b, E0.16b +eor sAsi, tmp1, sAsi_, ROR #27 SEP +bic tmp1, sAse_, sAsa_, ROR #57 SEP +eor sAso, tmp0, sAso_, ROR #21 SEP xar_m0 vAsa_, vAbi, E2, 2 +mov count, #1 SEP +bic tmp0, sAbi_, sAbe_, ROR #63 SEP xar_m0 vAbi_, vAki, E2, 21 +eor sAsu, tmp1, sAsu_, ROR #53 SEP +bic tmp1, sAbo_, sAbi_, ROR #42 SEP xar_m0 vAki_, vAko, E3, 39 +eor s_Aba, s_Aba_, tmp0, ROR #21 SEP +bic tmp0, sAbu_, sAbo_, ROR #57 SEP +eor sAbe, tmp1, sAbe_, ROR #41 SEP xar_m0 vAko_, vAmu, E4, 56 +bic tmp1, s_Aba_, sAbu_, ROR #50 SEP +eor sAbi, tmp0, sAbi_, ROR #35 SEP xar_m0 vAmu_, vAso, E3, 8 +bic tmp0, sAbe_, s_Aba_, ROR #44 SEP +eor sAbo, tmp1, sAbo_, ROR #43 SEP +eor sAbu, tmp0, sAbu_, ROR #30 SEP xar_m0 vAso_, vAma, E0, 23 +eor s_Aba, s_Aba, cur_const SEP +save count, STACK_OFFSET_COUNT SEP xar_m0 vAka_, vAbe, E1, 63 +eor sC2, sAsi, sAbi, ROR #52 SEP +eor sC0, s_Aba, sAga, ROR #61 SEP xar_m0 vAse_, vAgo, E3, 9 +eor sC4, sAku, sAgu, ROR #50 SEP +eor sC1, sAke, sAme, ROR #57 SEP +eor sC3, sAbo, sAko, ROR #63 SEP xar_m0 vAgo_, vAme, E1, 19 +eor sC2, sC2, sAki, ROR #48 SEP +eor sC0, sC0, sAma, ROR #54 SEP xar_m0 vAke_, vAgi, E2, 58 +eor sC4, sC4, sAmu, ROR #34 SEP +eor sC1, sC1, sAbe, ROR #51 SEP xar_m0 vAgi_, vAka, E0, 61 +eor sC3, sC3, sAmo, ROR #37 SEP +eor sC2, sC2, sAmi, ROR #10 SEP +eor sC0, sC0, sAka, ROR #39 SEP xar_m0 vAga_, vAbo, E3, 36 +eor sC4, sC4, sAbu, ROR #26 SEP 
+eor sC1, sC1, sAse, ROR #31 SEP xar_m0 vAbo_, vAmo, E3, 43 +eor sC3, sC3, sAgo, ROR #36 SEP +eor sC2, sC2, sAgi, ROR #5 SEP +eor sC0, sC0, sAsa, ROR #25 SEP xar_m0 vAmo_, vAmi, E2, 49 +eor sC4, sC4, sAsu, ROR #15 SEP +eor sC1, sC1, sAge, ROR #27 SEP xar_m0 vAmi_, vAke, E1, 54 +eor sC3, sC3, sAso, ROR #2 SEP +eor sE1, sC0, sC2, ROR #61 SEP xar_m0 vAge_, vAgu, E4, 44 +ror sC2, sC2, 62 SEP +eor sE3, sC2, sC4, ROR #57 SEP +ror sC4, sC4, 58 SEP xar_m0 vAgu_, vAsi, E2, 3 +eor sE0, sC4, sC1, ROR #55 SEP +ror sC1, sC1, 56 SEP xar_m0 vAsi_, vAku, E4, 25 +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP xar_m0 vAku_, vAsa, E0, 46 +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP xar_m0 vAma_, vAbu, E4, 37 +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP xar_m0 vAbu_, vAsu, E4, 50 +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP xar_m0 vAsu_, vAse, E1, 62 +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP xar_m0 vAme_, vAga, E0, 28 +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP xar_m0 vAbe_, vAge, E1, 20 +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP restore x27, STACK_OFFSET_CONST +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP ldr q31, [x27], #16 +eor sAgu_, sE2, sAsi, ROR #62 SEP +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP save x27, STACK_OFFSET_CONST +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP bcax_m0 vAga, vAga_, vAgi_, vAge_ +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP bcax_m0 vAge, vAge_, vAgo_, vAgi_ +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP bcax_m0 vAgi, vAgi_, vAgu_, vAgo_ +tmp0 .req x0 SEP +tmp1 .req x29 SEP bcax_m0 vAgo, vAgo_, vAga_, vAgu_ +bic tmp0, sAgi_, sAge_, ROR #47 SEP +bic tmp1, 
sAgo_, sAgi_, ROR #42 SEP +eor sAga, tmp0, sAga_, ROR #39 SEP bcax_m0 vAgu, vAgu_, vAge_, vAga_ +bic tmp0, sAgu_, sAgo_, ROR #16 SEP +eor sAge, tmp1, sAge_, ROR #25 SEP bcax_m0 vAka, vAka_, vAki_, vAke_ +bic tmp1, sAga_, sAgu_, ROR #31 SEP +eor sAgi, tmp0, sAgi_, ROR #58 SEP +bic tmp0, sAge_, sAga_, ROR #56 SEP bcax_m0 vAke, vAke_, vAko_, vAki_ +eor sAgo, tmp1, sAgo_, ROR #47 SEP +bic tmp1, sAki_, sAke_, ROR #19 SEP bcax_m0 vAki, vAki_, vAku_, vAko_ +eor sAgu, tmp0, sAgu_, ROR #23 SEP +bic tmp0, sAko_, sAki_, ROR #47 SEP bcax_m0 vAko, vAko_, vAka_, vAku_ +eor sAka, tmp1, sAka_, ROR #24 SEP +bic tmp1, sAku_, sAko_, ROR #10 SEP +eor sAke, tmp0, sAke_, ROR #2 SEP bcax_m0 vAku, vAku_, vAke_, vAka_ +bic tmp0, sAka_, sAku_, ROR #47 SEP +eor sAki, tmp1, sAki_, ROR #57 SEP bcax_m0 vAma, vAma_, vAmi_, vAme_ +bic tmp1, sAke_, sAka_, ROR #5 SEP +eor sAko, tmp0, sAko_, ROR #57 SEP +bic tmp0, sAmi_, sAme_, ROR #38 SEP bcax_m0 vAme, vAme_, vAmo_, vAmi_ +eor sAku, tmp1, sAku_, ROR #52 SEP +bic tmp1, sAmo_, sAmi_, ROR #5 SEP bcax_m0 vAmi, vAmi_, vAmu_, vAmo_ +eor sAma, tmp0, sAma_, ROR #47 SEP +bic tmp0, sAmu_, sAmo_, ROR #41 SEP bcax_m0 vAmo, vAmo_, vAma_, vAmu_ +eor sAme, tmp1, sAme_, ROR #43 SEP +bic tmp1, sAma_, sAmu_, ROR #35 SEP +eor sAmi, tmp0, sAmi_, ROR #46 SEP bcax_m0 vAmu, vAmu_, vAme_, vAma_ +bic tmp0, sAme_, sAma_, ROR #9 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP bcax_m0 vAsa, vAsa_, vAsi_, vAse_ +eor sAmo, tmp1, sAmo_, ROR #12 SEP +bic tmp1, sAsi_, sAse_, ROR #48 SEP +eor sAmu, tmp0, sAmu_, ROR #44 SEP bcax_m0 vAse, vAse_, vAso_, vAsi_ +bic tmp0, sAso_, sAsi_, ROR #2 SEP +eor sAsa, tmp1, sAsa_, ROR #41 SEP bcax_m0 vAsi, vAsi_, vAsu_, vAso_ +bic tmp1, sAsu_, sAso_, ROR #25 SEP +eor sAse, tmp0, sAse_, ROR #50 SEP bcax_m0 vAso, vAso_, vAsa_, vAsu_ +bic tmp0, sAsa_, sAsu_, ROR #60 SEP +eor sAsi, tmp1, sAsi_, ROR #27 SEP +bic tmp1, sAse_, sAsa_, ROR #57 SEP bcax_m0 vAsu, vAsu_, vAse_, vAsa_ +eor sAso, tmp0, sAso_, ROR #21 SEP +bic tmp0, sAbi_, sAbe_, ROR #63 
SEP bcax_m0 vAba, vAba_, vAbi_, vAbe_ +add count, count, #1 SEP +save count, STACK_OFFSET_COUNT SEP +eor sAsu, tmp1, sAsu_, ROR #53 SEP bcax_m0 vAbe, vAbe_, vAbo_, vAbi_ +bic tmp1, sAbo_, sAbi_, ROR #42 SEP +eor s_Aba, s_Aba_, tmp0, ROR #21 SEP bcax_m0 vAbi, vAbi_, vAbu_, vAbo_ +bic tmp0, sAbu_, sAbo_, ROR #57 SEP +eor sAbe, tmp1, sAbe_, ROR #41 SEP bcax_m0 vAbo, vAbo_, vAba_, vAbu_ +bic tmp1, s_Aba_, sAbu_, ROR #50 SEP +eor sAbi, tmp0, sAbi_, ROR #35 SEP +bic tmp0, sAbe_, s_Aba_, ROR #44 SEP bcax_m0 vAbu, vAbu_, vAbe_, vAba_ +eor sAbo, tmp1, sAbo_, ROR #43 SEP +eor sAbu, tmp0, sAbu_, ROR #30 SEP eor vAba.16b, vAba.16b, v31.16b +eor s_Aba, s_Aba, cur_const SEP +.endm + + +.macro hybrid_round_noninitial +eor sC2, sAsi, sAbi, ROR #52 SEP +eor sC0, s_Aba, sAga, ROR #61 SEP eor3_m0 C0, vAba, vAga, vAka +eor sC4, sAku, sAgu, ROR #50 SEP +eor sC1, sAke, sAme, ROR #57 SEP eor3_m0 C1, vAbe, vAge, vAke +eor sC3, sAbo, sAko, ROR #63 SEP +eor sC2, sC2, sAki, ROR #48 SEP eor3_m0 C2, vAbi, vAgi, vAki +eor sC0, sC0, sAma, ROR #54 SEP +eor sC4, sC4, sAmu, ROR #34 SEP +eor sC1, sC1, sAbe, ROR #51 SEP eor3_m0 C3, vAbo, vAgo, vAko +eor sC3, sC3, sAmo, ROR #37 SEP +eor sC2, sC2, sAmi, ROR #10 SEP eor3_m0 C4, vAbu, vAgu, vAku +eor sC0, sC0, sAka, ROR #39 SEP +eor sC4, sC4, sAbu, ROR #26 SEP +eor sC1, sC1, sAse, ROR #31 SEP save(vAga) +eor sC3, sC3, sAgo, ROR #36 SEP +eor sC2, sC2, sAgi, ROR #5 SEP vzr .req vAga +eor sC0, sC0, sAsa, ROR #25 SEP +eor sC4, sC4, sAsu, ROR #15 SEP +eor sC1, sC1, sAge, ROR #27 SEP eor vzr.16b, vzr.16b, vzr.16b +eor sC3, sC3, sAso, ROR #2 SEP +eor sE1, sC0, sC2, ROR #61 SEP save(vAge) +ror sC2, sC2, 62 SEP +eor sE3, sC2, sC4, ROR #57 SEP save(vAgi) +ror sC4, sC4, 58 SEP +eor sE0, sC4, sC1, ROR #55 SEP +ror sC1, sC1, 56 SEP save(vAgo) +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP save(vAgu) +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP C0r .req vAge +eor sAki_, sE3, sAko, ROR #63 SEP +eor 
sAko_, sE4, sAmu, ROR #28 SEP C1r .req vAgi +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP C2r .req vAgo +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP C3r .req vAgu +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP C4r .req v31 +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP eor3_m0 C0, C0, vAma, vAsa +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP eor3_m0 C1, C1, vAme, vAse +eor sAgu_, sE2, sAsi, ROR #62 SEP +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP eor3_m0 C2, C2, vAmi, vAsi +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP eor3_m0 C3, C3, vAmo, vAso +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP eor3_m0 C4, C4, vAmu, vAsu +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP xar_m0 C2r, vzr, C2, 63 +tmp0 .req x0 SEP +tmp1 .req x29 SEP +bic tmp0, sAgi_, sAge_, ROR #47 SEP xar_m0 C4r, vzr, C4, 63 +bic tmp1, sAgo_, sAgi_, ROR #42 SEP +eor sAga, tmp0, sAga_, ROR #39 SEP xar_m0 C1r, vzr, C1, 63 +bic tmp0, sAgu_, sAgo_, ROR #16 SEP +eor sAge, tmp1, sAge_, ROR #25 SEP xar_m0 C3r, vzr, C3, 63 +bic tmp1, sAga_, sAgu_, ROR #31 SEP +eor sAgi, tmp0, sAgi_, ROR #58 SEP +bic tmp0, sAge_, sAga_, ROR #56 SEP xar_m0 C0r, vzr, C0, 63 +eor sAgo, tmp1, sAgo_, ROR #47 SEP +bic tmp1, sAki_, sAke_, ROR #19 SEP eor2 E1, C0, C2r +eor sAgu, tmp0, sAgu_, ROR #23 SEP +bic tmp0, sAko_, sAki_, ROR #47 SEP +eor sAka, tmp1, sAka_, ROR #24 SEP restore(vAgo) +bic tmp1, sAku_, sAko_, ROR #10 SEP +eor sAke, tmp0, sAke_, ROR #2 SEP eor2 E3, C2, C4r +bic tmp0, sAka_, sAku_, ROR #47 SEP +eor sAki, tmp1, sAki_, ROR #57 SEP +bic tmp1, sAke_, sAka_, ROR #5 SEP restore(vAga) +eor sAko, tmp0, sAko_, ROR #57 SEP +bic tmp0, sAmi_, sAme_, ROR #38 SEP eor2 E0, C4, C1r +eor sAku, tmp1, sAku_, ROR #52 SEP +bic tmp1, sAmo_, sAmi_, ROR 
#5 SEP +eor sAma, tmp0, sAma_, ROR #47 SEP restore(vAgi) +bic tmp0, sAmu_, sAmo_, ROR #41 SEP +eor sAme, tmp1, sAme_, ROR #43 SEP eor2 E2, C1, C3r +bic tmp1, sAma_, sAmu_, ROR #35 SEP +eor sAmi, tmp0, sAmi_, ROR #46 SEP restore(vAgu) +bic tmp0, sAme_, sAma_, ROR #9 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP +eor sAmo, tmp1, sAmo_, ROR #12 SEP eor2 E4, C3, C0r +bic tmp1, sAsi_, sAse_, ROR #48 SEP +eor sAmu, tmp0, sAmu_, ROR #44 SEP restore(vAge) +bic tmp0, sAso_, sAsi_, ROR #2 SEP +eor sAsa, tmp1, sAsa_, ROR #41 SEP +bic tmp1, sAsu_, sAso_, ROR #25 SEP eor vAba_.16b, vAba.16b, E0.16b +eor sAse, tmp0, sAse_, ROR #50 SEP +bic tmp0, sAsa_, sAsu_, ROR #60 SEP xar_m0 vAsa_, vAbi, E2, 2 +eor sAsi, tmp1, sAsi_, ROR #27 SEP +bic tmp1, sAse_, sAsa_, ROR #57 SEP +eor sAso, tmp0, sAso_, ROR #21 SEP xar_m0 vAbi_, vAki, E2, 21 +bic tmp0, sAbi_, sAbe_, ROR #63 SEP +add count, count, #1 SEP xar_m0 vAki_, vAko, E3, 39 +save count, STACK_OFFSET_COUNT SEP +eor sAsu, tmp1, sAsu_, ROR #53 SEP +bic tmp1, sAbo_, sAbi_, ROR #42 SEP xar_m0 vAko_, vAmu, E4, 56 +eor s_Aba, s_Aba_, tmp0, ROR #21 SEP +bic tmp0, sAbu_, sAbo_, ROR #57 SEP xar_m0 vAmu_, vAso, E3, 8 +eor sAbe, tmp1, sAbe_, ROR #41 SEP +bic tmp1, s_Aba_, sAbu_, ROR #50 SEP xar_m0 vAso_, vAma, E0, 23 +eor sAbi, tmp0, sAbi_, ROR #35 SEP +bic tmp0, sAbe_, s_Aba_, ROR #44 SEP +eor sAbo, tmp1, sAbo_, ROR #43 SEP xar_m0 vAka_, vAbe, E1, 63 +eor sAbu, tmp0, sAbu_, ROR #30 SEP +eor s_Aba, s_Aba, cur_const SEP xar_m0 vAse_, vAgo, E3, 9 +eor sC2, sAsi, sAbi, ROR #52 SEP +eor sC0, s_Aba, sAga, ROR #61 SEP +eor sC4, sAku, sAgu, ROR #50 SEP xar_m0 vAgo_, vAme, E1, 19 +eor sC1, sAke, sAme, ROR #57 SEP +eor sC3, sAbo, sAko, ROR #63 SEP xar_m0 vAke_, vAgi, E2, 58 +eor sC2, sC2, sAki, ROR #48 SEP +eor sC0, sC0, sAma, ROR #54 SEP +eor sC4, sC4, sAmu, ROR #34 SEP xar_m0 vAgi_, vAka, E0, 61 +eor sC1, sC1, sAbe, ROR #51 SEP +eor sC3, sC3, sAmo, ROR #37 SEP xar_m0 vAga_, vAbo, E3, 36 +eor sC2, sC2, sAmi, ROR #10 SEP +eor sC0, sC0, sAka, ROR 
#39 SEP xar_m0 vAbo_, vAmo, E3, 43 +eor sC4, sC4, sAbu, ROR #26 SEP +eor sC1, sC1, sAse, ROR #31 SEP +eor sC3, sC3, sAgo, ROR #36 SEP xar_m0 vAmo_, vAmi, E2, 49 +eor sC2, sC2, sAgi, ROR #5 SEP +eor sC0, sC0, sAsa, ROR #25 SEP xar_m0 vAmi_, vAke, E1, 54 +eor sC4, sC4, sAsu, ROR #15 SEP +eor sC1, sC1, sAge, ROR #27 SEP +eor sC3, sC3, sAso, ROR #2 SEP xar_m0 vAge_, vAgu, E4, 44 +eor sE1, sC0, sC2, ROR #61 SEP +ror sC2, sC2, 62 SEP xar_m0 vAgu_, vAsi, E2, 3 +eor sE3, sC2, sC4, ROR #57 SEP +ror sC4, sC4, 58 SEP +eor sE0, sC4, sC1, ROR #55 SEP xar_m0 vAsi_, vAku, E4, 25 +ror sC1, sC1, 56 SEP +eor sE2, sC1, sC3, ROR #63 SEP xar_m0 vAku_, vAsa, E0, 46 +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP xar_m0 vAma_, vAbu, E4, 37 +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP xar_m0 vAbu_, vAsu, E4, 50 +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP xar_m0 vAsu_, vAse, E1, 62 +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP xar_m0 vAme_, vAga, E0, 28 +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP xar_m0 vAbe_, vAge, E1, 20 +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP +eor sAbo_, sE3, sAmo, ROR #37 SEP restore x27, STACK_OFFSET_CONST +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP ldr q31, [x27], #16 +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP +eor sAsi_, sE4, sAku, ROR #58 SEP save x27, STACK_OFFSET_CONST +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP bcax_m0 vAga, vAga_, vAgi_, vAge_ +eor sAbu_, sE4, sAsu, ROR #9 SEP +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP bcax_m0 vAge, vAge_, vAgo_, vAgi_ +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP bcax_m0 vAgi, vAgi_, vAgu_, vAgo_ +restore count, STACK_OFFSET_COUNT SEP +tmp0 .req x0 SEP bcax_m0 vAgo, vAgo_, vAga_, vAgu_ +tmp1 .req x29 SEP +bic 
tmp0, sAgi_, sAge_, ROR #47 SEP +bic tmp1, sAgo_, sAgi_, ROR #42 SEP bcax_m0 vAgu, vAgu_, vAge_, vAga_ +eor sAga, tmp0, sAga_, ROR #39 SEP +bic tmp0, sAgu_, sAgo_, ROR #16 SEP bcax_m0 vAka, vAka_, vAki_, vAke_ +eor sAge, tmp1, sAge_, ROR #25 SEP +bic tmp1, sAga_, sAgu_, ROR #31 SEP +eor sAgi, tmp0, sAgi_, ROR #58 SEP bcax_m0 vAke, vAke_, vAko_, vAki_ +bic tmp0, sAge_, sAga_, ROR #56 SEP +eor sAgo, tmp1, sAgo_, ROR #47 SEP bcax_m0 vAki, vAki_, vAku_, vAko_ +bic tmp1, sAki_, sAke_, ROR #19 SEP +eor sAgu, tmp0, sAgu_, ROR #23 SEP +bic tmp0, sAko_, sAki_, ROR #47 SEP bcax_m0 vAko, vAko_, vAka_, vAku_ +eor sAka, tmp1, sAka_, ROR #24 SEP +bic tmp1, sAku_, sAko_, ROR #10 SEP bcax_m0 vAku, vAku_, vAke_, vAka_ +eor sAke, tmp0, sAke_, ROR #2 SEP +bic tmp0, sAka_, sAku_, ROR #47 SEP bcax_m0 vAma, vAma_, vAmi_, vAme_ +eor sAki, tmp1, sAki_, ROR #57 SEP +bic tmp1, sAke_, sAka_, ROR #5 SEP +eor sAko, tmp0, sAko_, ROR #57 SEP bcax_m0 vAme, vAme_, vAmo_, vAmi_ +bic tmp0, sAmi_, sAme_, ROR #38 SEP +eor sAku, tmp1, sAku_, ROR #52 SEP bcax_m0 vAmi, vAmi_, vAmu_, vAmo_ +bic tmp1, sAmo_, sAmi_, ROR #5 SEP +eor sAma, tmp0, sAma_, ROR #47 SEP +bic tmp0, sAmu_, sAmo_, ROR #41 SEP bcax_m0 vAmo, vAmo_, vAma_, vAmu_ +eor sAme, tmp1, sAme_, ROR #43 SEP +bic tmp1, sAma_, sAmu_, ROR #35 SEP bcax_m0 vAmu, vAmu_, vAme_, vAma_ +eor sAmi, tmp0, sAmi_, ROR #46 SEP +bic tmp0, sAme_, sAma_, ROR #9 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP bcax_m0 vAsa, vAsa_, vAsi_, vAse_ +eor sAmo, tmp1, sAmo_, ROR #12 SEP +bic tmp1, sAsi_, sAse_, ROR #48 SEP bcax_m0 vAse, vAse_, vAso_, vAsi_ +eor sAmu, tmp0, sAmu_, ROR #44 SEP +bic tmp0, sAso_, sAsi_, ROR #2 SEP +eor sAsa, tmp1, sAsa_, ROR #41 SEP bcax_m0 vAsi, vAsi_, vAsu_, vAso_ +bic tmp1, sAsu_, sAso_, ROR #25 SEP +eor sAse, tmp0, sAse_, ROR #50 SEP bcax_m0 vAso, vAso_, vAsa_, vAsu_ +bic tmp0, sAsa_, sAsu_, ROR #60 SEP +eor sAsi, tmp1, sAsi_, ROR #27 SEP bcax_m0 vAsu, vAsu_, vAse_, vAsa_ +bic tmp1, sAse_, sAsa_, ROR #57 SEP +eor sAso, tmp0, sAso_, ROR 
#21 SEP +bic tmp0, sAbi_, sAbe_, ROR #63 SEP bcax_m0 vAba, vAba_, vAbi_, vAbe_ +add count, count, #1 SEP +save count, STACK_OFFSET_COUNT SEP bcax_m0 vAbe, vAbe_, vAbo_, vAbi_ +eor sAsu, tmp1, sAsu_, ROR #53 SEP +bic tmp1, sAbo_, sAbi_, ROR #42 SEP +eor s_Aba, s_Aba_, tmp0, ROR #21 SEP bcax_m0 vAbi, vAbi_, vAbu_, vAbo_ +bic tmp0, sAbu_, sAbo_, ROR #57 SEP +eor sAbe, tmp1, sAbe_, ROR #41 SEP bcax_m0 vAbo, vAbo_, vAba_, vAbu_ +bic tmp1, s_Aba_, sAbu_, ROR #50 SEP +eor sAbi, tmp0, sAbi_, ROR #35 SEP +bic tmp0, sAbe_, s_Aba_, ROR #44 SEP bcax_m0 vAbu, vAbu_, vAbe_, vAba_ +eor sAbo, tmp1, sAbo_, ROR #43 SEP +eor sAbu, tmp0, sAbu_, ROR #30 SEP eor vAba.16b, vAba.16b, v31.16b +eor s_Aba, s_Aba, cur_const SEP +.endm + + +.macro hybrid_round_final +eor sC2, sAsi, sAbi, ROR #52 SEP +eor sC0, s_Aba, sAga, ROR #61 SEP eor3_m0 C0, vAba, vAga, vAka +eor sC4, sAku, sAgu, ROR #50 SEP +eor sC1, sAke, sAme, ROR #57 SEP eor3_m0 C1, vAbe, vAge, vAke +eor sC3, sAbo, sAko, ROR #63 SEP +eor sC2, sC2, sAki, ROR #48 SEP +eor sC0, sC0, sAma, ROR #54 SEP eor3_m0 C2, vAbi, vAgi, vAki +eor sC4, sC4, sAmu, ROR #34 SEP +eor sC1, sC1, sAbe, ROR #51 SEP +eor sC3, sC3, sAmo, ROR #37 SEP eor3_m0 C3, vAbo, vAgo, vAko +eor sC2, sC2, sAmi, ROR #10 SEP +eor sC0, sC0, sAka, ROR #39 SEP eor3_m0 C4, vAbu, vAgu, vAku +eor sC4, sC4, sAbu, ROR #26 SEP +eor sC1, sC1, sAse, ROR #31 SEP +eor sC3, sC3, sAgo, ROR #36 SEP save(vAga) +eor sC2, sC2, sAgi, ROR #5 SEP +eor sC0, sC0, sAsa, ROR #25 SEP +eor sC4, sC4, sAsu, ROR #15 SEP vzr .req vAga +eor sC1, sC1, sAge, ROR #27 SEP +eor sC3, sC3, sAso, ROR #2 SEP eor vzr.16b, vzr.16b, vzr.16b +eor sE1, sC0, sC2, ROR #61 SEP +ror sC2, sC2, 62 SEP +eor sE3, sC2, sC4, ROR #57 SEP save(vAge) +ror sC4, sC4, 58 SEP +eor sE0, sC4, sC1, ROR #55 SEP +ror sC1, sC1, 56 SEP save(vAgi) +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP save(vAgo) +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP save(vAgu) +eor sAki_, sE3, 
sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP C0r .req vAge +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP C1r .req vAgi +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP C2r .req vAgo +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP +eor sAbo_, sE3, sAmo, ROR #37 SEP C3r .req vAgu +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP C4r .req v31 +eor sAgu_, sE2, sAsi, ROR #62 SEP +eor sAsi_, sE4, sAku, ROR #58 SEP eor3_m0 C0, C0, vAma, vAsa +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP eor3_m0 C1, C1, vAme, vAse +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP eor3_m0 C2, C2, vAmi, vAsi +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP eor3_m0 C3, C3, vAmo, vAso +tmp0 .req x0 SEP +tmp1 .req x29 SEP +bic tmp0, sAgi_, sAge_, ROR #47 SEP eor3_m0 C4, C4, vAmu, vAsu +bic tmp1, sAgo_, sAgi_, ROR #42 SEP +eor sAga, tmp0, sAga_, ROR #39 SEP +bic tmp0, sAgu_, sAgo_, ROR #16 SEP xar_m0 C2r, vzr, C2, 63 +eor sAge, tmp1, sAge_, ROR #25 SEP +bic tmp1, sAga_, sAgu_, ROR #31 SEP xar_m0 C4r, vzr, C4, 63 +eor sAgi, tmp0, sAgi_, ROR #58 SEP +bic tmp0, sAge_, sAga_, ROR #56 SEP +eor sAgo, tmp1, sAgo_, ROR #47 SEP xar_m0 C1r, vzr, C1, 63 +bic tmp1, sAki_, sAke_, ROR #19 SEP +eor sAgu, tmp0, sAgu_, ROR #23 SEP +bic tmp0, sAko_, sAki_, ROR #47 SEP xar_m0 C3r, vzr, C3, 63 +eor sAka, tmp1, sAka_, ROR #24 SEP +bic tmp1, sAku_, sAko_, ROR #10 SEP +eor sAke, tmp0, sAke_, ROR #2 SEP xar_m0 C0r, vzr, C0, 63 +bic tmp0, sAka_, sAku_, ROR #47 SEP +eor sAki, tmp1, sAki_, ROR #57 SEP eor2 E1, C0, C2r +bic tmp1, sAke_, sAka_, ROR #5 SEP +eor sAko, tmp0, sAko_, ROR #57 SEP +bic tmp0, sAmi_, sAme_, ROR #38 SEP restore(vAgo) +eor sAku, tmp1, sAku_, ROR #52 SEP +bic tmp1, sAmo_, sAmi_, ROR #5 SEP +eor 
sAma, tmp0, sAma_, ROR #47 SEP eor2 E3, C2, C4r +bic tmp0, sAmu_, sAmo_, ROR #41 SEP +eor sAme, tmp1, sAme_, ROR #43 SEP restore(vAga) +bic tmp1, sAma_, sAmu_, ROR #35 SEP +eor sAmi, tmp0, sAmi_, ROR #46 SEP +bic tmp0, sAme_, sAma_, ROR #9 SEP eor2 E0, C4, C1r +ldr cur_const, [const_addr, count, UXTW #3] SEP +eor sAmo, tmp1, sAmo_, ROR #12 SEP +bic tmp1, sAsi_, sAse_, ROR #48 SEP restore(vAgi) +eor sAmu, tmp0, sAmu_, ROR #44 SEP +bic tmp0, sAso_, sAsi_, ROR #2 SEP eor2 E2, C1, C3r +eor sAsa, tmp1, sAsa_, ROR #41 SEP +bic tmp1, sAsu_, sAso_, ROR #25 SEP +eor sAse, tmp0, sAse_, ROR #50 SEP restore(vAgu) +bic tmp0, sAsa_, sAsu_, ROR #60 SEP +eor sAsi, tmp1, sAsi_, ROR #27 SEP +bic tmp1, sAse_, sAsa_, ROR #57 SEP eor2 E4, C3, C0r +eor sAso, tmp0, sAso_, ROR #21 SEP +bic tmp0, sAbi_, sAbe_, ROR #63 SEP restore(vAge) +add count, count, #1 SEP +save count, STACK_OFFSET_COUNT SEP +eor sAsu, tmp1, sAsu_, ROR #53 SEP eor vAba_.16b, vAba.16b, E0.16b +bic tmp1, sAbo_, sAbi_, ROR #42 SEP +eor s_Aba, s_Aba_, tmp0, ROR #21 SEP +bic tmp0, sAbu_, sAbo_, ROR #57 SEP xar_m0 vAsa_, vAbi, E2, 2 +eor sAbe, tmp1, sAbe_, ROR #41 SEP +bic tmp1, s_Aba_, sAbu_, ROR #50 SEP +eor sAbi, tmp0, sAbi_, ROR #35 SEP xar_m0 vAbi_, vAki, E2, 21 +bic tmp0, sAbe_, s_Aba_, ROR #44 SEP +eor sAbo, tmp1, sAbo_, ROR #43 SEP xar_m0 vAki_, vAko, E3, 39 +eor sAbu, tmp0, sAbu_, ROR #30 SEP +eor s_Aba, s_Aba, cur_const SEP +eor sC2, sAsi, sAbi, ROR #52 SEP xar_m0 vAko_, vAmu, E4, 56 +eor sC0, s_Aba, sAga, ROR #61 SEP +eor sC4, sAku, sAgu, ROR #50 SEP +eor sC1, sAke, sAme, ROR #57 SEP xar_m0 vAmu_, vAso, E3, 8 +eor sC3, sAbo, sAko, ROR #63 SEP +eor sC2, sC2, sAki, ROR #48 SEP xar_m0 vAso_, vAma, E0, 23 +eor sC0, sC0, sAma, ROR #54 SEP +eor sC4, sC4, sAmu, ROR #34 SEP +eor sC1, sC1, sAbe, ROR #51 SEP xar_m0 vAka_, vAbe, E1, 63 +eor sC3, sC3, sAmo, ROR #37 SEP +eor sC2, sC2, sAmi, ROR #10 SEP +eor sC0, sC0, sAka, ROR #39 SEP xar_m0 vAse_, vAgo, E3, 9 +eor sC4, sC4, sAbu, ROR #26 SEP +eor sC1, sC1, sAse, ROR #31 SEP 
xar_m0 vAgo_, vAme, E1, 19 +eor sC3, sC3, sAgo, ROR #36 SEP +eor sC2, sC2, sAgi, ROR #5 SEP +eor sC0, sC0, sAsa, ROR #25 SEP xar_m0 vAke_, vAgi, E2, 58 +eor sC4, sC4, sAsu, ROR #15 SEP +eor sC1, sC1, sAge, ROR #27 SEP +eor sC3, sC3, sAso, ROR #2 SEP xar_m0 vAgi_, vAka, E0, 61 +eor sE1, sC0, sC2, ROR #61 SEP +ror sC2, sC2, 62 SEP xar_m0 vAga_, vAbo, E3, 36 +eor sE3, sC2, sC4, ROR #57 SEP +ror sC4, sC4, 58 SEP +eor sE0, sC4, sC1, ROR #55 SEP xar_m0 vAbo_, vAmo, E3, 43 +ror sC1, sC1, 56 SEP +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP xar_m0 vAmo_, vAmi, E2, 49 +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP xar_m0 vAmi_, vAke, E1, 54 +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP xar_m0 vAge_, vAgu, E4, 44 +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP xar_m0 vAgu_, vAsi, E2, 3 +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP xar_m0 vAsi_, vAku, E4, 25 +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP xar_m0 vAku_, vAsa, E0, 46 +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP xar_m0 vAma_, vAbu, E4, 37 +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP +eor sAsi_, sE4, sAku, ROR #58 SEP xar_m0 vAbu_, vAsu, E4, 50 +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP xar_m0 vAsu_, vAse, E1, 62 +eor sAbu_, sE4, sAsu, ROR #9 SEP +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP xar_m0 vAme_, vAga, E0, 28 +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP xar_m0 vAbe_, vAge, E1, 20 +tmp0 .req x0 SEP +tmp1 .req x29 SEP restore x27, STACK_OFFSET_CONST +bic tmp0, sAgi_, sAge_, ROR #47 SEP +bic tmp1, sAgo_, sAgi_, ROR #42 SEP +eor sAga, tmp0, sAga_, ROR #39 SEP ldr q31, [x27], #16 +bic tmp0, sAgu_, sAgo_, ROR #16 SEP +eor sAge, tmp1, sAge_, ROR #25 
SEP +bic tmp1, sAga_, sAgu_, ROR #31 SEP save x27, STACK_OFFSET_CONST +restore count, STACK_OFFSET_COUNT SEP +eor sAgi, tmp0, sAgi_, ROR #58 SEP +bic tmp0, sAge_, sAga_, ROR #56 SEP +eor sAgo, tmp1, sAgo_, ROR #47 SEP bcax_m0 vAga, vAga_, vAgi_, vAge_ +bic tmp1, sAki_, sAke_, ROR #19 SEP +eor sAgu, tmp0, sAgu_, ROR #23 SEP bcax_m0 vAge, vAge_, vAgo_, vAgi_ +bic tmp0, sAko_, sAki_, ROR #47 SEP +eor sAka, tmp1, sAka_, ROR #24 SEP +bic tmp1, sAku_, sAko_, ROR #10 SEP bcax_m0 vAgi, vAgi_, vAgu_, vAgo_ +eor sAke, tmp0, sAke_, ROR #2 SEP +bic tmp0, sAka_, sAku_, ROR #47 SEP +eor sAki, tmp1, sAki_, ROR #57 SEP bcax_m0 vAgo, vAgo_, vAga_, vAgu_ +bic tmp1, sAke_, sAka_, ROR #5 SEP +eor sAko, tmp0, sAko_, ROR #57 SEP bcax_m0 vAgu, vAgu_, vAge_, vAga_ +bic tmp0, sAmi_, sAme_, ROR #38 SEP +eor sAku, tmp1, sAku_, ROR #52 SEP +bic tmp1, sAmo_, sAmi_, ROR #5 SEP bcax_m0 vAka, vAka_, vAki_, vAke_ +eor sAma, tmp0, sAma_, ROR #47 SEP +bic tmp0, sAmu_, sAmo_, ROR #41 SEP +eor sAme, tmp1, sAme_, ROR #43 SEP bcax_m0 vAke, vAke_, vAko_, vAki_ +bic tmp1, sAma_, sAmu_, ROR #35 SEP +eor sAmi, tmp0, sAmi_, ROR #46 SEP bcax_m0 vAki, vAki_, vAku_, vAko_ +bic tmp0, sAme_, sAma_, ROR #9 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP +eor sAmo, tmp1, sAmo_, ROR #12 SEP bcax_m0 vAko, vAko_, vAka_, vAku_ +bic tmp1, sAsi_, sAse_, ROR #48 SEP +eor sAmu, tmp0, sAmu_, ROR #44 SEP +bic tmp0, sAso_, sAsi_, ROR #2 SEP bcax_m0 vAku, vAku_, vAke_, vAka_ +eor sAsa, tmp1, sAsa_, ROR #41 SEP +bic tmp1, sAsu_, sAso_, ROR #25 SEP bcax_m0 vAma, vAma_, vAmi_, vAme_ +eor sAse, tmp0, sAse_, ROR #50 SEP +bic tmp0, sAsa_, sAsu_, ROR #60 SEP +eor sAsi, tmp1, sAsi_, ROR #27 SEP bcax_m0 vAme, vAme_, vAmo_, vAmi_ +bic tmp1, sAse_, sAsa_, ROR #57 SEP +eor sAso, tmp0, sAso_, ROR #21 SEP +bic tmp0, sAbi_, sAbe_, ROR #63 SEP bcax_m0 vAmi, vAmi_, vAmu_, vAmo_ +add count, count, #1 SEP +save count, STACK_OFFSET_COUNT SEP +eor sAsu, tmp1, sAsu_, ROR #53 SEP bcax_m0 vAmo, vAmo_, vAma_, vAmu_ +bic tmp1, sAbo_, sAbi_, ROR 
#42 SEP +eor s_Aba, s_Aba_, tmp0, ROR #21 SEP bcax_m0 vAmu, vAmu_, vAme_, vAma_ +bic tmp0, sAbu_, sAbo_, ROR #57 SEP +eor sAbe, tmp1, sAbe_, ROR #41 SEP +bic tmp1, s_Aba_, sAbu_, ROR #50 SEP bcax_m0 vAsa, vAsa_, vAsi_, vAse_ +eor sAbi, tmp0, sAbi_, ROR #35 SEP +bic tmp0, sAbe_, s_Aba_, ROR #44 SEP +eor sAbo, tmp1, sAbo_, ROR #43 SEP bcax_m0 vAse, vAse_, vAso_, vAsi_ +eor sAbu, tmp0, sAbu_, ROR #30 SEP +eor s_Aba, s_Aba, cur_const SEP bcax_m0 vAsi, vAsi_, vAsu_, vAso_ +ror sAga, sAga,(64-3) SEP +ror sAbu, sAbu,(64-44) SEP +ror sAka, sAka,(64-25) SEP bcax_m0 vAso, vAso_, vAsa_, vAsu_ +ror sAke, sAke,(64-8) SEP +ror sAma, sAma,(64-10) SEP +ror sAku, sAku,(64-6) SEP bcax_m0 vAsu, vAsu_, vAse_, vAsa_ +ror sAsa, sAsa,(64-39) SEP +ror sAse, sAse,(64-41) SEP bcax_m0 vAba, vAba_, vAbi_, vAbe_ +ror sAbe, sAbe,(64-21) SEP +ror sAge, sAge,(64-45) SEP +ror sAgi, sAgi,(64-61) SEP bcax_m0 vAbe, vAbe_, vAbo_, vAbi_ +ror sAme, sAme,(64-15) SEP +ror sAmi, sAmi,(64-56) SEP +ror sAbi, sAbi,(64-14) SEP bcax_m0 vAbi, vAbi_, vAbu_, vAbo_ +ror sAki, sAki,(64-18) SEP +ror sAko, sAko,(64-1) SEP bcax_m0 vAbo, vAbo_, vAba_, vAbu_ +ror sAsi, sAsi,(64-2) SEP +ror sAso, sAso,(64-62) SEP +ror sAgo, sAgo,(64-28) SEP bcax_m0 vAbu, vAbu_, vAbe_, vAba_ +ror sAgu, sAgu,(64-20) SEP +ror sAmo, sAmo,(64-27) SEP +ror sAmu, sAmu,(64-36) SEP eor vAba.16b, vAba.16b, v31.16b +ror sAsu, sAsu,(64-55) SEP +.endm + + + +#define KECCAK_F1600_ROUNDS 24 + +.global keccak_f1600_x4_hybrid_asm_v7 +.global _keccak_f1600_x4_hybrid_asm_v7 +.text +.align 4 + +keccak_f1600_x4_hybrid_asm_v7: +_keccak_f1600_x4_hybrid_asm_v7: + alloc_stack + save_gprs + save_vregs + save input_addr, STACK_OFFSET_INPUT + + + ASM_LOAD(const_addr,round_constants_vec) + + save const_addr, STACK_OFFSET_CONST + load_input_vector 2,1 + + // First scalar Keccak computation alongside first half of SIMD computation + load_input_scalar 4,0 + hybrid_round_initial + loop_0: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-3) + ble loop_0 + + 
hybrid_round_final + + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4,0 + + // Second scalar Keccak computation alongside second half of SIMD computation + load_input_scalar 4,1 + hybrid_round_initial + loop_1: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-3) + ble loop_1 + + hybrid_round_final + + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4,1 + store_input_vector 2,1 + + restore_vregs + restore_gprs + free_stack + + + ret +#endif \ No newline at end of file diff --git a/asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v8.s b/asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v8.s new file mode 100644 index 0000000..10e3410 --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x4_hybrid_asm_v8.s @@ -0,0 +1,1367 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" +#if defined(__ARM_FEATURE_SHA3) + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +round_constants_vec: + .quad 0x0000000000000001 + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x800000008000000a + .quad 
0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + .quad 0x8000000080008008 +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x29 + count .req w27 + cur_const .req x26 + + /* Mapping of Keccak-f1600 SIMD state to vector registers + * at the beginning and end of each round. */ + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. */ + vAba .req v0 + vAbe .req v1 + vAbi .req v2 + vAbo .req v3 + vAbu .req v4 + vAga .req v5 + vAge .req v6 + vAgi .req v7 + vAgo .req v8 + vAgu .req v9 + vAka .req v10 + vAke .req v11 + vAki .req v12 + vAko .req v13 + vAku .req v14 + vAma .req v15 + vAme .req v16 + vAmi .req v17 + vAmo .req v18 + vAmu .req v19 + vAsa .req v20 + vAse .req v21 + vAsi .req v22 + vAso .req v23 + vAsu .req v24 + + /* q-form of the above mapping */ + vAbaq .req q0 + vAbeq .req q1 + vAbiq .req q2 + vAboq .req q3 + vAbuq .req q4 + vAgaq .req q5 + vAgeq .req q6 + vAgiq .req q7 + vAgoq .req q8 + vAguq .req q9 + vAkaq .req q10 + vAkeq .req q11 + vAkiq .req q12 + vAkoq .req q13 + vAkuq .req q14 + vAmaq .req q15 + vAmeq .req q16 + vAmiq .req q17 + vAmoq .req q18 + vAmuq .req q19 + vAsaq .req q20 + vAseq .req q21 + vAsiq .req q22 + vAsoq .req q23 + vAsuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v27 + C1 .req v28 + C2 .req v29 + C3 .req v30 + C4 .req v31 + + C0q .req q27 + C1q .req q28 + C2q .req q29 + C3q .req q30 + C4q .req q31 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vBba .req v25 // fresh + vBbe .req v26 // fresh + vBbi .req vAbi + vBbo .req vAbo + vBbu .req vAbu + vBga .req vAka + vBge .req vAke + vBgi .req vAgi + vBgo .req vAgo + vBgu .req vAgu + vBka .req vAma + vBke .req vAme + vBki .req vAki + vBko .req vAko + vBku .req vAku + vBma .req vAsa + vBme .req vAse + vBmi .req vAmi +
vBmo .req vAmo + vBmu .req vAmu + vBsa .req vAba + vBse .req vAbe + vBsi .req vAsi + vBso .req vAso + vBsu .req vAsu + + vBbaq .req q25 // fresh + vBbeq .req q26 // fresh + vBbiq .req vAbiq + vBboq .req vAboq + vBbuq .req vAbuq + vBgaq .req vAkaq + vBgeq .req vAkeq + vBgiq .req vAgiq + vBgoq .req vAgoq + vBguq .req vAguq + vBkaq .req vAmaq + vBkeq .req vAmeq + vBkiq .req vAkiq + vBkoq .req vAkoq + vBkuq .req vAkuq + vBmaq .req vAsaq + vBmeq .req vAseq + vBmiq .req vAmiq + vBmoq .req vAmoq + vBmuq .req vAmuq + vBsaq .req vAbaq + vBseq .req vAbeq + vBsiq .req vAsiq + vBsoq .req vAsoq + vBsuq .req vAsuq + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req C4 + E1 .req C0 + E2 .req vBbe // fresh + E3 .req C2 + E4 .req C3 + + E0q .req C4q + E1q .req C0q + E2q .req vBbeq // fresh + E3q .req C2q + E4q .req C3q + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round. */ + s_Aba .req x1 + sAbe .req x6 + sAbi .req x11 + sAbo .req x16 + sAbu .req x21 + sAga .req x2 + sAge .req x7 + sAgi .req x12 + sAgo .req x17 + sAgu .req x22 + sAka .req x3 + sAke .req x8 + sAki .req x13 + sAko .req x18 + sAku .req x23 + sAma .req x4 + sAme .req x9 + sAmi .req x14 + sAmo .req x19 + sAmu .req x24 + sAsa .req x5 + sAse .req x10 + sAsi .req x15 + sAso .req x20 + sAsu .req x25 + + /* sA_[y,2*x+3*y] = rot(A[x,y]) */ + s_Aba_ .req x0 + sAbe_ .req x28 + sAbi_ .req x11 + sAbo_ .req x16 + sAbu_ .req x21 + sAga_ .req x3 + sAge_ .req x8 + sAgi_ .req x12 + sAgo_ .req x17 + sAgu_ .req x22 + sAka_ .req x4 + sAke_ .req x9 + sAki_ .req x13 + sAko_ .req x18 + sAku_ .req x23 + sAma_ .req x5 + sAme_ .req x10 + sAmi_ .req x14 + sAmo_ .req x19 + sAmu_ .req x24 + sAsa_ .req x1 + sAse_ .req x6 + sAsi_ .req x15 + sAso_ .req x20 + sAsu_ .req x25 + + /* sC[x] = sA[x,0] xor sA[x,1] xor sA[x,2] xor sA[x,3] xor sA[x,4], for x in 0..4 */ + /* sE[x] = sC[x-1] xor rot(sC[x+1],1), for x in 0..4 */ + sC0 .req x0 + sE0 .req x29 + sC1 .req x26 + sE1 .req x30 + sC2 .req 
x27 + sE2 .req x26 + sC3 .req x28 + sE3 .req x27 + sC4 .req x29 + sE4 .req x28 + + tmp .req x30 + +/************************ MACROS ****************************/ + +/* Vector helper macros: the *_m0 variants use the v8.4-A SHA-3 instructions (eor3/rax1/xar/bcax); the *_m1 variants are built from eor/shl/sri/add/bic instead. xar_m1 overwrites its s0 operand; rax1_m1 and bcax_m1 use vvtmp as scratch. */ + +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm + xar \d\().2d, \s0\().2d, \s1\().2d, #\imm +.endm + +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro eor3_m1_0 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m1_1 d s0 s1 s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro eor3_m1 d s0 s1 s2 + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +.macro rax1_m1 d s0 s1 + // d = s0 xor rol(s1, 1): the rotate is built from SHL #1 and SRI #63 into vvtmp + shl vvtmp.2d, \s1\().2d, #1 + sri vvtmp.2d, \s1\().2d, #63 + eor \d\().16b, vvtmp.16b, \s0\().16b +.endm + + .macro xar_m1 d s0 s1 imm + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) +.endm + + .macro xar_m1_0 d s0 s1 imm + // EOR half of xar_m1 (overwrites s0); all three imm cases emit the same EOR + .if \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + .elseif \imm == 62 + eor \s0\().16b, \s0\().16b, \s1\().16b + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + .endif +.endm + + .macro xar_m1_1 d s0 s1 imm + // Rotate half of xar_m1; special cases where we can replace SHLs by ADDs + .if \imm == 63 + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + .elseif \imm == 62 + add \d\().2d, \s0\().2d, \s0\().2d + add \d\().2d, \d\().2d, \d\().2d + sri \d\().2d, \s0\().2d, #(62) + .else + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + +.macro bcax_m1 d s0 s1 s2 + bic vvtmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, vvtmp.16b, \s0\().16b +.endm +/* load_input_vector: load the 25 vA*q registers from input_addr; entry i (0..24) is read at byte offset 16*(num*i+idx) */ +.macro load_input_vector num idx + ldr 
vAbaq, [input_addr, #(16*(\num*0+\idx))] + ldr vAbeq, [input_addr, #(16*(\num*1+\idx))] + ldr vAbiq, [input_addr, #(16*(\num*2+\idx))] + ldr vAboq, [input_addr, #(16*(\num*3+\idx))] + ldr vAbuq, [input_addr, #(16*(\num*4+\idx))] + ldr vAgaq, [input_addr, #(16*(\num*5+\idx))] + ldr vAgeq, [input_addr, #(16*(\num*6+\idx))] + ldr vAgiq, [input_addr, #(16*(\num*7+\idx))] + ldr vAgoq, [input_addr, #(16*(\num*8+\idx))] + ldr vAguq, [input_addr, #(16*(\num*9+\idx))] + ldr vAkaq, [input_addr, #(16*(\num*10+\idx))] + ldr vAkeq, [input_addr, #(16*(\num*11+\idx))] + ldr vAkiq, [input_addr, #(16*(\num*12+\idx))] + ldr vAkoq, [input_addr, #(16*(\num*13+\idx))] + ldr vAkuq, [input_addr, #(16*(\num*14+\idx))] + ldr vAmaq, [input_addr, #(16*(\num*15+\idx))] + ldr vAmeq, [input_addr, #(16*(\num*16+\idx))] + ldr vAmiq, [input_addr, #(16*(\num*17+\idx))] + ldr vAmoq, [input_addr, #(16*(\num*18+\idx))] + ldr vAmuq, [input_addr, #(16*(\num*19+\idx))] + ldr vAsaq, [input_addr, #(16*(\num*20+\idx))] + ldr vAseq, [input_addr, #(16*(\num*21+\idx))] + ldr vAsiq, [input_addr, #(16*(\num*22+\idx))] + ldr vAsoq, [input_addr, #(16*(\num*23+\idx))] + ldr vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_vector num idx + str vAbaq, [input_addr, #(16*(\num*0+\idx))] + str vAbeq, [input_addr, #(16*(\num*1+\idx))] + str vAbiq, [input_addr, #(16*(\num*2+\idx))] + str vAboq, [input_addr, #(16*(\num*3+\idx))] + str vAbuq, [input_addr, #(16*(\num*4+\idx))] + str vAgaq, [input_addr, #(16*(\num*5+\idx))] + str vAgeq, [input_addr, #(16*(\num*6+\idx))] + str vAgiq, [input_addr, #(16*(\num*7+\idx))] + str vAgoq, [input_addr, #(16*(\num*8+\idx))] + str vAguq, [input_addr, #(16*(\num*9+\idx))] + str vAkaq, [input_addr, #(16*(\num*10+\idx))] + str vAkeq, [input_addr, #(16*(\num*11+\idx))] + str vAkiq, [input_addr, #(16*(\num*12+\idx))] + str vAkoq, [input_addr, #(16*(\num*13+\idx))] + str vAkuq, [input_addr, #(16*(\num*14+\idx))] + str vAmaq, [input_addr, #(16*(\num*15+\idx))] + str vAmeq, 
[input_addr, #(16*(\num*16+\idx))] + str vAmiq, [input_addr, #(16*(\num*17+\idx))] + str vAmoq, [input_addr, #(16*(\num*18+\idx))] + str vAmuq, [input_addr, #(16*(\num*19+\idx))] + str vAsaq, [input_addr, #(16*(\num*20+\idx))] + str vAseq, [input_addr, #(16*(\num*21+\idx))] + str vAsiq, [input_addr, #(16*(\num*22+\idx))] + str vAsoq, [input_addr, #(16*(\num*23+\idx))] + str vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_scalar num idx + str s_Aba, [input_addr, 8*(\num*(0) +\idx)] + str sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + str sAbi, [input_addr, 8*(\num*(2)+ \idx)] + str sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + str sAbu, [input_addr, 8*(\num*(4)+ \idx)] + str sAga, [input_addr, 8*(\num*(4+1) +\idx)] + str sAge, [input_addr, 8*(\num*(6)+ \idx)] + str sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + str sAgo, [input_addr, 8*(\num*(8)+ \idx)] + str sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + str sAka, [input_addr, 8*(\num*(10) +\idx)] + str sAke, [input_addr, 8*(\num*(10+1)+\idx)] + str sAki, [input_addr, 8*(\num*(12) +\idx)] + str sAko, [input_addr, 8*(\num*(12+1)+\idx)] + str sAku, [input_addr, 8*(\num*(14) +\idx)] + str sAma, [input_addr, 8*(\num*(14+1)+\idx)] + str sAme, [input_addr, 8*(\num*(16) +\idx)] + str sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + str sAmo, [input_addr, 8*(\num*(18) +\idx)] + str sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + str sAsa, [input_addr, 8*(\num*(20) +\idx)] + str sAse, [input_addr, 8*(\num*(20+1)+\idx)] + str sAsi, [input_addr, 8*(\num*(22) +\idx)] + str sAso, [input_addr, 8*(\num*(22+1)+\idx)] + str sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +.macro load_input_scalar num idx + ldr s_Aba, [input_addr, 8*(\num*(0) +\idx)] + ldr sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + ldr sAbi, [input_addr, 8*(\num*(2)+ \idx)] + ldr sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + ldr sAbu, [input_addr, 8*(\num*(4)+ \idx)] + ldr sAga, [input_addr, 8*(\num*(4+1) +\idx)] + ldr sAge, [input_addr, 8*(\num*(6)+ \idx)] + ldr sAgi, 
[input_addr, 8*(\num*(6+1) +\idx)] + ldr sAgo, [input_addr, 8*(\num*(8)+ \idx)] + ldr sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + ldr sAka, [input_addr, 8*(\num*(10) +\idx)] + ldr sAke, [input_addr, 8*(\num*(10+1)+\idx)] + ldr sAki, [input_addr, 8*(\num*(12) +\idx)] + ldr sAko, [input_addr, 8*(\num*(12+1)+\idx)] + ldr sAku, [input_addr, 8*(\num*(14) +\idx)] + ldr sAma, [input_addr, 8*(\num*(14+1)+\idx)] + ldr sAme, [input_addr, 8*(\num*(16) +\idx)] + ldr sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + ldr sAmo, [input_addr, 8*(\num*(18) +\idx)] + ldr sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + ldr sAsa, [input_addr, 8*(\num*(20) +\idx)] + ldr sAse, [input_addr, 8*(\num*(20+1)+\idx)] + ldr sAsi, [input_addr, 8*(\num*(22) +\idx)] + ldr sAso, [input_addr, 8*(\num*(22+1)+\idx)] + ldr sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +#define STACK_SIZE (8*8 + 16*6 + 3*8 + 8 + 16*34) // VREGS (8*8), GPRs (16*6), input/const/count (3*8), padding (8), 16-byte scratch slots addressed via STACK_BASE_TMP by save(name)/restore(name) (16*34) +#define STACK_BASE_GPRS (3*8+8) +#define STACK_BASE_VREGS (3*8+8+16*6) +#define STACK_BASE_TMP (8*8 + 16*6 + 3*8 + 8) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) + +#define vAga_offset 0 +#define E0_offset 1 +#define E1_offset 2 +#define E2_offset 3 +#define E3_offset 4 +#define E4_offset 5 +#define Ame_offset 7 +#define Agi_offset 8 +#define Aka_offset 9 +#define Abo_offset 10 +#define Amo_offset 11 +#define Ami_offset 12 +#define Ake_offset 13 +#define Agu_offset 14 +#define Asi_offset 15 +#define Aku_offset 16 +#define Asa_offset 17 +#define Abu_offset 18 +#define Asu_offset 19 +#define Ase_offset 20 +//#define Aga_offset 21 +#define Age_offset 22 +#define vBgo_offset 23 +#define vBke_offset 24 +#define vBgi_offset 25 +#define vBga_offset 26 +#define vBbo_offset 27 +#define vBmo_offset 28 +#define vBmi_offset 29 +#define vBge_offset 30 + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, 
[sp, #(STACK_BASE_TMP + 16 * name ## _offset)] + + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro save_vregs + stp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] + stp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + stp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + stp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] +.endm + +.macro restore_vregs + ldp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] + ldp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + ldp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + ldp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] +.endm + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro eor5 dst, src0, src1, src2, src3, src4 + eor \dst, \src0, \src1 + eor \dst, \dst, \src2 + eor \dst, \dst, \src3 + eor \dst, \dst, \src4 +.endm + +.macro xor_rol dst, src1, src0, imm + eor \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro bic_rol dst, src1, src0, imm + bic \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro rotate dst, src, imm + ror \dst, \src, #(64-\imm) +.endm + +.macro save reg, offset + str \reg, [sp, #\offset] +.endm + +.macro restore reg, offset + ldr \reg, [sp, #\offset] +.endm + +.macro hybrid_round_initial +eor sC0, sAma, sAsa SEP +eor sC1, sAme, sAse SEP eor3_m0 C1,vAbe,vAge,vAke +eor sC2, sAmi, sAsi SEP +eor sC3, sAmo, sAso SEP eor3_m1 C3,vAbo,vAgo,vAko +eor sC4, sAmu, sAsu SEP +eor sC0, sAka, sC0 SEP eor3_m0 C0,vAba,vAga,vAka +eor 
sC1, sAke, sC1 SEP +eor sC2, sAki, sC2 SEP eor3_m1 C2,vAbi,vAgi,vAki +eor sC3, sAko, sC3 SEP +eor sC4, sAku, sC4 SEP eor3_m0 C4,vAbu,vAgu,vAku +eor sC0, sAga, sC0 SEP +eor sC1, sAge, sC1 SEP eor3_m1 C1, C1,vAme, vAse +eor sC2, sAgi, sC2 SEP +eor sC3, sAgo, sC3 SEP eor3_m0 C3, C3,vAmo, vAso +eor sC4, sAgu, sC4 SEP +eor sC0, s_Aba, sC0 SEP eor3_m1 C0, C0,vAma, vAsa +eor sC1, sAbe, sC1 SEP +eor sC2, sAbi, sC2 SEP eor3_m0 C2, C2,vAmi, vAsi +eor sC3, sAbo, sC3 SEP +eor sC4, sAbu, sC4 SEP eor3_m1 C4, C4,vAmu, vAsu +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP vvtmp .req vBba +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP rax1_m0 E2, C1, C3 +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, s_Aba, sE0 SEP +eor sAsa_, sAbi, sE2 SEP rax1_m1 E4, C3, C0 +eor sAbi_, sAki, sE2 SEP +eor sAki_, sAko, sE3 SEP rax1_m0 E1, C0, C2 +eor sAko_, sAmu, sE4 SEP +eor sAmu_, sAso, sE3 SEP rax1_m1 E3, C2, C4 +eor sAso_, sAma, sE0 SEP +eor sAka_, sAbe, sE1 SEP rax1_m0 E0, C4, C1 +eor sAse_, sAgo, sE3 SEP +eor sAgo_, sAme, sE1 SEP .unreq vvtmp +eor sAke_, sAgi, sE2 SEP +eor sAgi_, sAka, sE0 SEP vvtmp .req C1 +eor sAga_, sAbo, sE3 SEP +eor sAbo_, sAmo, sE3 SEP vvtmpq .req C1q +eor sAmo_, sAmi, sE2 SEP +eor sAmi_, sAke, sE1 SEP eor vBba.16b, vAba.16b, E0.16b +eor sAge_, sAgu, sE4 SEP +eor sAgu_, sAsi, sE2 SEP xar_m1 vBsa, vAbi, E2, 2 +eor sAsi_, sAku, sE4 SEP +eor sAku_, sAsa, sE0 SEP xar_m0 vBbi, vAki, E2, 21 +eor sAma_, sAbu, sE4 SEP +eor sAbu_, sAsu, sE4 SEP xar_m1 vBki, vAko, E3, 39 +eor sAsu_, sAse, sE1 SEP +eor sAme_, sAga, sE0 SEP +eor sAbe_, sAge, sE1 SEP xar_m0 vBko, vAmu, E4, 56 +load_constant_ptr SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP xar_m1 vBmu, vAso, E3, 8 +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP xar_m0 vBso, vAma, E0, 23 +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP xar_m1 vBka, vAbe, E1, 63 +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP xar_m0 vBse, vAgo, E3, 9 +eor sAgo, 
tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP xar_m1 vBgo, vAme, E1, 19 +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP xar_m0 vBke, vAgi, E2, 58 +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP xar_m1 vBgi, vAka, E0, 61 +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP xar_m0 vBga, vAbo, E3, 36 +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP xar_m1 vBbo, vAmo, E3, 43 +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP xar_m0 vBmo, vAmi, E2, 49 +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP xar_m1 vBmi, vAke, E1, 54 +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP xar_m0 vBge, vAgu, E4, 44 +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP mov E3.16b, vAga.16b +ldr cur_const, [const_addr] SEP +mov count, #1 SEP bcax_m1 vAga, vBga, vBgi, vBge +bic tmp, sAma_, sAmu_, ROR #35 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP xar_m0 vBgu, vAsi, E2, 3 +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP xar_m1 vBsi, vAku, E4, 25 +bic tmp, sAsi_, sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP xar_m0 vBku, vAsa, E0, 46 +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP xar_m1 vBma, vAbu, E4, 37 +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP xar_m0 vBbu, vAsu, E4, 50 +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP xar_m1 vBsu, vAse, E1, 62 +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP xar_m0 vBme, E3, E0, 28 +bic tmp, sAbi_, sAbe_, ROR #63 SEP +eor s_Aba, s_Aba_, tmp, ROR #21 SEP xar_m1 vBbe, vAge, E1, 20 +bic tmp, sAbo_, sAbi_, ROR #42 SEP +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP bcax_m1 vAge, vBge, vBgo, vBgi +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP bcax_m0 vAgi, vBgi, vBgu, vBgo +eor sAbo, 
tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP bcax_m1 vAgo, vBgo, vBga, vBgu +eor sAbu, tmp, sAbu_, ROR #30 SEP +eor s_Aba, s_Aba, cur_const SEP bcax_m0 vAgu, vBgu, vBge, vBga +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP bcax_m1 vAka, vBka, vBki, vBke +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP bcax_m0 vAke, vBke, vBko, vBki +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP .unreq vvtmp +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP .unreq vvtmpq +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 SEP eor2 C0, vAka, vAga +eor sC4, sAmu, sC4, ROR #56 SEP +eor sC0, sAga, sC0, ROR #57 SEP save(vAga) +eor sC1, sAme, sC1, ROR #58 SEP +eor sC2, sAbi, sC2, ROR #60 SEP vvtmp .req vAga +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP vvtmpq .req vAgaq +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP bcax_m0 vAki, vBki, vBku, vBko +eor sC3, sAbo, sC3, ROR #63 SEP +eor sC4, sAku, sC4, ROR #50 SEP bcax_m1 vAko, vBko, vBka, vBku +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP eor2 C1, vAke, vAge +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP bcax_m0 vAku, vBku, vBke, vBka +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP eor2 C2, vAki, vAgi +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP bcax_m1 vAma, vBma, vBmi, vBme +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP eor2 C3, vAko, vAgo +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP bcax_m0 vAme, vBme, vBmo, vBmi +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP eor2 C4, vAku, vAgu +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP bcax_m1 vAmi, vBmi, vBmu, vBmo +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP eor2 C0, C0, vAma +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP 
bcax_m0 vAmo, vBmo, vBma, vBmu +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP eor2 C1, C1, vAme +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP bcax_m1 vAmu, vBmu, vBme, vBma +eor sAgu_, sE2, sAsi, ROR #62 SEP +eor sAsi_, sE4, sAku, ROR #58 SEP eor2 C2, C2, vAmi +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP bcax_m0 vAsa, vBsa, vBsi, vBse +eor sAbu_, sE4, sAsu, ROR #9 SEP +eor sAsu_, sE1, sAse, ROR #23 SEP eor2 C3, C3, vAmo +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP bcax_m1 vAse, vBse, vBso, vBsi +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP eor2 C4, C4, vAmu +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP bcax_m0 vAsi, vBsi, vBsu, vBso +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP eor2 C0, C0, vAsa +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP bcax_m1 vAso, vBso, vBsa, vBsu +bic tmp, sAga_, sAgu_, ROR #31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP eor2 C1, C1, vAse +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP bcax_m0 vAsu, vBsu, vBse, vBsa +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP eor2 C2, C2, vAsi +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP eor2 C3, C3, vAso +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP bcax_m1 vAba, vBba, vBbi, vBbe +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP bcax_m0 vAbe, vBbe, vBbo, vBbi +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP eor2 C1, C1, vAbe +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP restore x26, STACK_OFFSET_CONST +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP ldr vvtmpq, [x26], #16 +eor sAmi, tmp, sAmi_, ROR #46 SEP +bic tmp, sAma_, sAmu_, ROR #35 SEP save x26, STACK_OFFSET_CONST +eor sAmo, tmp, sAmo_, ROR #12 SEP 
+bic tmp, sAme_, sAma_, ROR #9 SEP eor vAba.16b, vAba.16b, vvtmp.16b +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP eor2 C4, C4, vAsu +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP bcax_m0 vAbi, vBbi, vBbu, vBbo +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP bcax_m1 vAbo, vBbo, vBba, vBbu +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP eor2 C3, C3, vAbo +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP eor2 C2, C2, vAbi +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP eor2 C0, C0, vAba +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP bcax_m0 vAbu, vBbu, vBbe, vBba +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP eor2 C4, C4, vAbu +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP restore(vAga) +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP .unreq vvtmp +eor sAbu, tmp, sAbu_, ROR #30 SEP +add count, count, #1 SEP .unreq vvtmpq +eor s_Aba, s_Aba, cur_const SEP +.endm + + + +.macro hybrid_round_noninitial +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP vvtmp .req vBba +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP rax1_m0 E2, C1, C3 +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP rax1_m0 E4, C3, C0 +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP rax1_m0 E1, C0, C2 +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP rax1_m0 E3, C2, C4 +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP rax1_m0 E0, C4, C1 +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP .unreq vvtmp +eor sC0, s_Aba, sC0, ROR #61 SEP +eor sC1, sAke, sC1, ROR #57 SEP vvtmp .req C1 +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 
SEP vvtmpq .req C1q +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP eor vBba.16b, vAba.16b, E0.16b +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP xar_m0 vBsa, vAbi, E2, 2 +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP xar_m0 vBbi, vAki, E2, 21 +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP xar_m0 vBki, vAko, E3, 39 +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP xar_m1 vBko, vAmu, E4, 56 +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP xar_m0 vBmu, vAso, E3, 8 +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP xar_m0 vBso, vAma, E0, 23 +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP xar_m0 vBka, vAbe, E1, 63 +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP xar_m1 vBse, vAgo, E3, 9 +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP xar_m0 vBgo, vAme, E1, 19 +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP xar_m0 vBke, vAgi, E2, 58 +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP xar_m0 vBgi, vAka, E0, 61 +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP xar_m1 vBga, vAbo, E3, 36 +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP xar_m0 vBbo, vAmo, E3, 43 +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP xar_m0 vBmo, vAmi, E2, 49 +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP xar_m0 vBmi, vAke, E1, 54 +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP xar_m1 vBge, vAgu, E4, 44 +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP mov E3.16b, vAga.16b +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP bcax_m0 vAga, 
vBga, vBgi, vBge +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP +eor sAka, tmp, sAka_, ROR #24 SEP xar_m0 vBgu, vAsi, E2, 3 +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP xar_m0 vBsi, vAku, E4, 25 +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP xar_m1 vBku, vAsa, E0, 46 +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP xar_m0 vBma, vAbu, E4, 37 +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP xar_m0 vBbu, vAsu, E4, 50 +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP xar_m0 vBsu, vAse, E1, 62 +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP xar_m1 vBme, E3, E0, 28 +bic tmp, sAma_, sAmu_, ROR #35 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP xar_m0 vBbe, vAge, E1, 20 +add count, count, #1 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP bcax_m0 vAge, vBge, vBgo, vBgi +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP bcax_m0 vAgi, vBgi, vBgu, vBgo +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP bcax_m1 vAgo, vBgo, vBga, vBgu +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP bcax_m0 vAgu, vBgu, vBge, vBga +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP bcax_m0 vAka, vBka, vBki, vBke +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m0 vAke, vBke, vBko, vBki +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP .unreq vvtmp +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP .unreq vvtmpq +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP eor2 C0, vAka, vAga +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP save(vAga) +eor 
s_Aba, s_Aba, cur_const SEP +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP vvtmp .req vAga +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP vvtmpq .req vAgaq +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP bcax_m0 vAki, vBki, vBku, vBko +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP bcax_m0 vAko, vBko, vBka, vBku +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP eor2 C1, vAke, vAge +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP bcax_m0 vAku, vBku, vBke, vBka +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP eor2 C2, vAki, vAgi +eor sC0, s_Aba, sC0, ROR #61 SEP +eor sC1, sAke, sC1, ROR #57 SEP bcax_m0 vAma, vBma, vBmi, vBme +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP eor2 C3, vAko, vAgo +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP bcax_m0 vAme, vBme, vBmo, vBmi +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP eor2 C4, vAku, vAgu +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP bcax_m0 vAmi, vBmi, vBmu, vBmo +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP eor2 C0, C0, vAma +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP bcax_m0 vAmo, vBmo, vBma, vBmu +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP eor2 C1, C1, vAme +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP bcax_m1 vAmu, vBmu, vBme, vBma +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP eor2 C2, C2, vAmi +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP bcax_m0 vAsa, vBsa, vBsi, vBse +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP eor2 C3, C3, vAmo +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP 
+eor sAgu_, sE2, sAsi, ROR #62 SEP bcax_m0 vAse, vBse, vBso, vBsi +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP eor2 C4, C4, vAmu +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP bcax_m0 vAsi, vBsi, vBsu, vBso +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP eor2 C0, C0, vAsa +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP bcax_m0 vAso, vBso, vBsa, vBsu +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP eor2 C1, C1, vAse +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP bcax_m0 vAsu, vBsu, vBse, vBsa +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP eor2 C2, C2, vAsi +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP eor2 C3, C3, vAso +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP +eor sAka, tmp, sAka_, ROR #24 SEP bcax_m0 vAba, vBba, vBbi, vBbe +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP bcax_m0 vAbe, vBbe, vBbo, vBbi +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP eor2 C1, C1, vAbe +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP restore x26, STACK_OFFSET_CONST +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP ldr vvtmpq, [x26], #16 +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP save x26, STACK_OFFSET_CONST +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP eor vAba.16b, vAba.16b, vvtmp.16b +bic tmp, sAma_, sAmu_, ROR #35 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP eor2 C4, C4, vAsu +add count, count, #1 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP bcax_m0 vAbi, vBbi, vBbu, vBbo +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP bcax_m0 vAbo, vBbo, vBba, 
vBbu +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP eor2 C3, C3, vAbo +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP eor2 C2, C2, vAbi +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP eor2 C0, C0, vAba +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m0 vAbu, vBbu, vBbe, vBba +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP eor2 C4, C4, vAbu +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP restore(vAga) +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP .unreq vvtmp +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP .unreq vvtmpq +eor s_Aba, s_Aba, cur_const SEP +.endm + +.macro hybrid_round_final +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP vvtmp .req vBba +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP +eor sC3, sAgo, sAso, ROR #30 SEP rax1_m0 E2, C1, C3 +eor sC4, sAbu, sAsu, ROR #53 SEP +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP rax1_m0 E4, C3, C0 +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP +eor sC0, sAga, sC0, ROR #57 SEP rax1_m0 E1, C0, C2 +eor sC1, sAme, sC1, ROR #58 SEP +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP rax1_m0 E3, C2, C4 +eor sC0, s_Aba, sC0, ROR #61 SEP +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP rax1_m0 E0, C4, C1 +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP .unreq vvtmp +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP vvtmp .req C1 +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP +eor 
sAsa_, sE2, sAbi, ROR #50 SEP vvtmpq .req C1q +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP eor vBba.16b, vAba.16b, E0.16b +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP xar_m0 vBsa, vAbi, E2, 2 +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP xar_m0 vBbi, vAki, E2, 21 +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP xar_m0 vBki, vAko, E3, 39 +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP xar_m1 vBko, vAmu, E4, 56 +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP xar_m0 vBmu, vAso, E3, 8 +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP xar_m0 vBso, vAma, E0, 23 +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP xar_m0 vBka, vAbe, E1, 63 +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP xar_m1 vBse, vAgo, E3, 9 +bic tmp, sAge_, sAga_, ROR #56 SEP +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP xar_m0 vBgo, vAme, E1, 19 +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP xar_m0 vBke, vAgi, E2, 58 +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP xar_m0 vBgi, vAka, E0, 61 +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP xar_m1 
vBga, vAbo, E3, 36 +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP xar_m0 vBbo, vAmo, E3, 43 +bic tmp, sAma_, sAmu_, ROR #35 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP +add count, count, #1 SEP xar_m0 vBmo, vAmi, E2, 49 +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP xar_m0 vBmi, vAke, E1, 54 +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP xar_m1 vBge, vAgu, E4, 44 +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP mov E3.16b, vAga.16b +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP +eor s_Aba, s_Aba_, tmp, ROR #21 SEP bcax_m0 vAga, vBga, vBgi, vBge +bic tmp, sAbo_, sAbi_, ROR #42 SEP +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP xar_m0 vBgu, vAsi, E2, 3 +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP xar_m0 vBsi, vAku, E4, 25 +eor sAbu, tmp, sAbu_, ROR #30 SEP +eor s_Aba, s_Aba, cur_const SEP +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP xar_m1 vBku, vAsa, E0, 46 +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP xar_m0 vBma, vAbu, E4, 37 +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP xar_m0 vBbu, vAsu, E4, 50 +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP xar_m0 vBsu, vAse, E1, 62 +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 
SEP xar_m1 vBme, E3, E0, 28 +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP xar_m0 vBbe, vAge, E1, 20 +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP +ror sC2, sC2, 62 SEP bcax_m0 vAge, vBge, vBgo, vBgi +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP bcax_m0 vAgi, vBgi, vBgu, vBgo +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP bcax_m1 vAgo, vBgo, vBga, vBgu +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP bcax_m0 vAgu, vBgu, vBge, vBga +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP bcax_m0 vAka, vBka, vBki, vBke +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP bcax_m0 vAke, vBke, vBko, vBki +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP bcax_m1 vAki, vBki, vBku, vBko +eor sAgu_, sE2, sAsi, ROR #62 SEP +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP bcax_m0 vAko, vBko, vBka, vBku +eor sAbu_, sE4, sAsu, ROR #9 SEP +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP bcax_m0 vAku, vBku, vBke, vBka +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP bcax_m0 vAma, vBma, vBmi, vBme +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP bcax_m1 vAme, vBme, vBmo, vBmi +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP bcax_m0 vAmi, vBmi, vBmu, vBmo +bic tmp, 
sAge_, sAga_, ROR #56 SEP +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP +eor sAka, tmp, sAka_, ROR #24 SEP bcax_m0 vAmo, vBmo, vBma, vBmu +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP bcax_m0 vAmu, vBmu, vBme, vBma +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP bcax_m1 vAsa, vBsa, vBsi, vBse +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP bcax_m0 vAse, vBse, vBso, vBsi +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP bcax_m0 vAsi, vBsi, vBsu, vBso +bic tmp, sAma_, sAmu_, ROR #35 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP +add count, count, #1 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP bcax_m0 vAso, vBso, vBsa, vBsu +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP bcax_m1 vAsu, vBsu, vBse, vBsa +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP bcax_m0 vAba, vBba, vBbi, vBbe +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP bcax_m0 vAbe, vBbe, vBbo, vBbi +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP bcax_m0 vAbi, vBbi, vBbu, vBbo +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP bcax_m0 vAbo, vBbo, vBba, vBbu +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP bcax_m0 vAbu, vBbu, vBbe, vBba +eor s_Aba, s_Aba, cur_const SEP +ror sAga, sAga,(64-3) SEP +ror sAka, 
sAka,(64-25) SEP +ror sAma, sAma,(64-10) SEP restore x26, STACK_OFFSET_CONST +ror sAsa, sAsa,(64-39) SEP +ror sAbe, sAbe,(64-21) SEP +ror sAge, sAge,(64-45) SEP ldr vvtmpq, [x26], #16 +ror sAke, sAke,(64-8) SEP +ror sAme, sAme,(64-15) SEP +ror sAse, sAse,(64-41) SEP +ror sAbi, sAbi,(64-14) SEP save x26, STACK_OFFSET_CONST +ror sAgi, sAgi,(64-61) SEP +ror sAki, sAki,(64-18) SEP +ror sAmi, sAmi,(64-56) SEP +ror sAsi, sAsi,(64-2) SEP eor vAba.16b, vAba.16b, vvtmp.16b +ror sAgo, sAgo,(64-28) SEP +ror sAko, sAko,(64-1) SEP +ror sAmo, sAmo,(64-27) SEP .unreq vvtmp +ror sAso, sAso,(64-62) SEP +ror sAbu, sAbu,(64-44) SEP +ror sAgu, sAgu,(64-20) SEP +ror sAku, sAku,(64-6) SEP .unreq vvtmpq +ror sAmu, sAmu,(64-36) SEP +ror sAsu, sAsu,(64-55) SEP +.endm + + +#define KECCAK_F1600_ROUNDS 24 + +.global keccak_f1600_x4_hybrid_asm_v8 +.global _keccak_f1600_x4_hybrid_asm_v8 +.text +.align 4 + +keccak_f1600_x4_hybrid_asm_v8: +_keccak_f1600_x4_hybrid_asm_v8: /* Entry point: after alloc_stack/save_gprs/save_vregs, runs two scalar Keccak-f1600 passes (scalar inputs 4,0 then 4,1), each interleaved with half of the SIMD computation per the notes below; input_addr is reloaded from its stack slot before each store. */ + alloc_stack + save_gprs + save_vregs + save input_addr, STACK_OFFSET_INPUT + + + ASM_LOAD(const_addr,round_constants_vec) + + save const_addr, STACK_OFFSET_CONST + load_input_vector 2,1 + + // First scalar Keccak computation alongside first half of SIMD computation + load_input_scalar 4,0 + hybrid_round_initial + loop_0: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-3) + ble loop_0 + + hybrid_round_final + + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4,0 + + // Second scalar Keccak computation alongside second half of SIMD computation + load_input_scalar 4,1 + hybrid_round_initial + loop_1: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-3) + ble loop_1 + + hybrid_round_final + + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4,1 + store_input_vector 2,1 + + restore_vregs + restore_gprs + free_stack + + + ret +#endif diff --git a/asm/manual/keccak_f1600/keccak_f1600_x4_scalar_asm_v1.s b/asm/manual/keccak_f1600/keccak_f1600_x4_scalar_asm_v1.s new file mode 100644 index
0000000..7ce0c0d --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x4_scalar_asm_v1.s @@ -0,0 +1,561 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + + input_addr .req x0 + const_addr .req x1 + count .req w0 + cur_const .req x1 + + /* Allocation of GPRs for Keccak-f1600 state */ +#define ABA x2 +#define ABE x3 +#define ABI x4 +#define ABO x5 +#define ABU x6 +#define AGA x7 +#define AGE x8 +#define AGI x9 +#define AGO x10 +#define AGU x11 +#define AKA x12 +#define AKE x13 +#define AKI x14 +#define AKO x15 +#define AKU x16 +#define AMA x17 +#define AME x18 +#define AMI x19 +#define AMO x20 +#define AMU x21 +#define ASA x22 +#define ASE x23 +#define ASI x24 +#define ASO x25 +#define ASU x26 + + Aba .req ABA + Abe .req ABE + Abi .req ABI + Abo .req ABO + Abu .req ABU + Aga .req AGA + Age .req AGE + Agi .req AGI + Ago .req AGO + Agu .req AGU + Aka .req AKA + Ake .req AKE + Aki .req AKI + Ako .req AKO + Aku .req AKU + Ama .req AMA + Ame .req AME + Ami .req AMI + Amo .req AMO + Amu .req AMU + Asa .req ASA + Ase .req ASE + Asi .req ASI + Aso .req ASO + Asu .req ASU + + Aba_tmp .req AGA + Abe_tmp .req AGE + Abi_tmp .req ABI + Abo_tmp .req ABO + Abu_tmp .req ABU + Aga_tmp .req AKA + Age_tmp .req AKE + Agi_tmp .req AGI + Ago_tmp .req AGO + Agu_tmp .req AGU + Aka_tmp .req AMA + Ake_tmp .req AME + Aki_tmp .req AKI + Ako_tmp .req AKO + Aku_tmp .req 
AKU + Ama_tmp .req ASA + Ame_tmp .req ASE + Ami_tmp .req AMI + Amo_tmp .req AMO + Amu_tmp .req AMU + Asa_tmp .req x28 + Ase_tmp .req x27 + Asi_tmp .req ASI + Aso_tmp .req ASO + Asu_tmp .req ASU + +#define STACK_SIZE (16*6 + 3*8 + 8) // slots by offset: input (8), const (8), count (8), padding (8), then GPRs (16*6) +#define STACK_BASE_GPRS (3*8+8) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) + +.macro store_input_scalar num idx /* Store all 25 lanes; lane k is written to byte offset 8*(num*k + idx) from input_addr */ + str Aba, [input_addr, 8*(\num*(0) +\idx)] + str Abe, [input_addr, 8*(\num*(0+1) +\idx)] + str Abi, [input_addr, 8*(\num*(2)+ \idx)] + str Abo, [input_addr, 8*(\num*(2+1) +\idx)] + str Abu, [input_addr, 8*(\num*(4)+ \idx)] + str Aga, [input_addr, 8*(\num*(4+1) +\idx)] + str Age, [input_addr, 8*(\num*(6)+ \idx)] + str Agi, [input_addr, 8*(\num*(6+1) +\idx)] + str Ago, [input_addr, 8*(\num*(8)+ \idx)] + str Agu, [input_addr, 8*(\num*(8+1) +\idx)] + str Aka, [input_addr, 8*(\num*(10) +\idx)] + str Ake, [input_addr, 8*(\num*(10+1)+\idx)] + str Aki, [input_addr, 8*(\num*(12) +\idx)] + str Ako, [input_addr, 8*(\num*(12+1)+\idx)] + str Aku, [input_addr, 8*(\num*(14) +\idx)] + str Ama, [input_addr, 8*(\num*(14+1)+\idx)] + str Ame, [input_addr, 8*(\num*(16) +\idx)] + str Ami, [input_addr, 8*(\num*(16+1)+\idx)] + str Amo, [input_addr, 8*(\num*(18) +\idx)] + str Amu, [input_addr, 8*(\num*(18+1)+\idx)] + str Asa, [input_addr, 8*(\num*(20) +\idx)] + str Ase, [input_addr, 8*(\num*(20+1)+\idx)] + str Asi, [input_addr, 8*(\num*(22) +\idx)] + str Aso, [input_addr, 8*(\num*(22+1)+\idx)] + str Asu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +.macro load_input_scalar num idx /* Load all 25 lanes; lane k is read from byte offset 8*(num*k + idx) from input_addr */ + ldr Aba, [input_addr, 8*(\num*(0) +\idx)] + ldr Abe, [input_addr, 8*(\num*(0+1) +\idx)] + ldr Abi, [input_addr, 8*(\num*(2)+ \idx)] + ldr Abo, [input_addr, 8*(\num*(2+1) +\idx)] + ldr Abu, [input_addr, 8*(\num*(4)+ \idx)] + ldr Aga, [input_addr, 8*(\num*(4+1) +\idx)] + ldr Age, [input_addr, 8*(\num*(6)+ \idx)] + ldr Agi, [input_addr, 8*(\num*(6+1)
+\idx)] + ldr Ago, [input_addr, 8*(\num*(8)+ \idx)] + ldr Agu, [input_addr, 8*(\num*(8+1) +\idx)] + ldr Aka, [input_addr, 8*(\num*(10) +\idx)] + ldr Ake, [input_addr, 8*(\num*(10+1)+\idx)] + ldr Aki, [input_addr, 8*(\num*(12) +\idx)] + ldr Ako, [input_addr, 8*(\num*(12+1)+\idx)] + ldr Aku, [input_addr, 8*(\num*(14) +\idx)] + ldr Ama, [input_addr, 8*(\num*(14+1)+\idx)] + ldr Ame, [input_addr, 8*(\num*(16) +\idx)] + ldr Ami, [input_addr, 8*(\num*(16+1)+\idx)] + ldr Amo, [input_addr, 8*(\num*(18) +\idx)] + ldr Amu, [input_addr, 8*(\num*(18+1)+\idx)] + ldr Asa, [input_addr, 8*(\num*(20) +\idx)] + ldr Ase, [input_addr, 8*(\num*(20+1)+\idx)] + ldr Asi, [input_addr, 8*(\num*(22) +\idx)] + ldr Aso, [input_addr, 8*(\num*(22+1)+\idx)] + ldr Asu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +.macro save_gprs /* Store x19-x30 pairwise at sp + STACK_BASE_GPRS */ + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs /* Reload x19-x30 from the slots written by save_gprs */ + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro alloc_stack /* Reserve STACK_SIZE bytes of stack */ + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack /* Release the STACK_SIZE bytes reserved by alloc_stack */ + add sp, sp, #(STACK_SIZE) +.endm + +.macro eor5 dst, src0, src1, src2, src3, src4 /* dst = src0 ^ src1 ^ src2 ^ src3 ^ src4; dst is written by the first eor, so it must not alias src2..src4 */ + eor \dst, \src0, \src1 + eor \dst, \dst, \src2 + eor \dst, \dst, \src3 + eor \dst, \dst, \src4 +.endm + +.macro xor_rol dst, src1, src0, imm /* dst = src0 ^ rol(src1, imm), encoded as ROR by 64-imm */ + eor \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro bic_rol dst, src1, src0, imm /* dst = src0 & ~rol(src1, imm), encoded as ROR by 64-imm */ + bic \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro rotate dst, src, imm /* dst = rol(src, imm), encoded as ROR by 64-imm */ + ror \dst, \src, #(64-\imm) +.endm + +.macro save reg, offset /* Store reg to [sp + offset] */ + str \reg, [sp, #\offset] +.endm + +.macro
restore reg, offset /* Load reg from [sp + offset] */ + ldr \reg, [sp, #\offset] +.endm + +.macro keccak_f1600_round is_first /* One Keccak-f1600 round over the scalar lanes. is_first=1: plain eor5/eor on the input lanes, round constant read from [const_addr], count set to 1. is_first=0: operands are combined through xor_rol with fixed rotations, count is spilled at entry and reloaded later, the round constant is read at index count, and count is incremented. */ + + .if \is_first == 0 + save count, STACK_OFFSET_COUNT + .endif + +#define BCE x30 +#define BCA x0 +#define BCI x27 +#define BCO x28 +#define BCU x29 + + BCe .req BCE + BCa .req BCA + BCi .req BCI + BCo .req BCO + BCu .req BCU + + .if \is_first == 1 + eor5 BCa, Aba, Aga, Aka, Ama, Asa + eor5 BCe, Abe, Age, Ake, Ame, Ase + eor5 BCi, Abi, Agi, Aki, Ami, Asi + eor5 BCo, Abo, Ago, Ako, Amo, Aso + eor5 BCu, Abu, Agu, Aku, Amu, Asu + .else + xor_rol BCu, Asu, Abu , 11 + xor_rol BCa, Asa, Aka, 14 + xor_rol BCe, Age, Ase , 4 + xor_rol BCi, Agi, Ami , 5 + xor_rol BCu, BCu, Amu , 8 + xor_rol BCo, Aso, Ago , 34 + xor_rol BCe, BCe, Abe , 20 + xor_rol BCa, BCa, Ama , 15 + xor_rol BCi, BCi, Aki , 38 + xor_rol BCu, BCu, Agu , 16 + xor_rol BCe, BCe, Ame , 6 + xor_rol BCo, BCo, Amo , 1 + xor_rol BCi, BCi, Abi , 4 + xor_rol BCu, BCu, Aku , 14 + xor_rol BCe, BCe, Ake , 7 + xor_rol BCo, BCo, Ako , 26 + xor_rol BCa, BCa, Aga , 7 + xor_rol BCi, BCi, Asi , 12 + rotate BCe, BCe, 8 + xor_rol BCo, BCo, Abo , 1 + rotate BCu, BCu, 6 + rotate BCi, BCi, 2 + xor_rol BCa, BCa, Aba , 3 + .endif + + Da .req BCE + Du .req BCA + De .req BCI + Di .req x1 + Do .req BCU + + xor_rol Di,BCo,BCe,1 + xor_rol Da,BCe,BCu,1 + xor_rol Do,BCu,BCi,1 + .unreq BCu + xor_rol De,BCi,BCa,1 + .unreq BCi + xor_rol Du,BCa,BCo,1 + .unreq BCa + .unreq BCo + .unreq BCe + + .if \is_first == 1 + + eor Asa_tmp,Abi,Di + eor Abi_tmp,Aki,Di + eor Aki_tmp,Ako,Do + eor Ako_tmp,Amu,Du + eor Amu_tmp,Aso,Do + eor Aso_tmp,Ama,Da + eor Aka_tmp,Abe,De + + eor Abe_tmp,Age,De + + temp .req ABE + eor temp,Ago,Do + eor Ago_tmp,Ame,De + eor Ake_tmp,Agi,Di + eor Agi_tmp,Aka,Da + eor Aga_tmp,Abo,Do + eor Abo_tmp,Amo,Do + eor Amo_tmp,Ami,Di + eor Ami_tmp,Ake,De + eor Age_tmp,Agu,Du + eor Agu_tmp,Asi,Di + eor Asi_tmp,Aku,Du + eor Aku_tmp,Asa,Da + eor Ama_tmp,Abu,Du + eor Abu_tmp,Asu,Du + eor Asu_tmp,Ase,De + eor Ame_tmp,Aga,Da + eor Aba_tmp,Aba,Da + mov
Ase_tmp,temp + .unreq temp + + .else + + xor_rol Asa_tmp,Abi,Di,14 + xor_rol Abi_tmp,Aki,Di,18 + xor_rol Aki_tmp,Ako,Do,1 + xor_rol Ako_tmp,Amu,Du,36 + xor_rol Amu_tmp,Aso,Do,62 + xor_rol Aso_tmp,Ama,Da,10 + xor_rol Aka_tmp,Abe,De,21 + + xor_rol Abe_tmp,Age,De,45 + + temp .req ABE + xor_rol temp,Ago,Do,28 + xor_rol Ago_tmp,Ame,De,15 + xor_rol Ake_tmp,Agi,Di,61 + xor_rol Agi_tmp,Aka,Da,25 + eor Aga_tmp,Abo,Do + xor_rol Abo_tmp,Amo,Do,27 + xor_rol Amo_tmp,Ami,Di,56 + xor_rol Ami_tmp,Ake,De,8 + xor_rol Age_tmp,Agu,Du,20 + xor_rol Agu_tmp,Asi,Di,2 + xor_rol Asi_tmp,Aku,Du,6 + xor_rol Aku_tmp,Asa,Da,39 + xor_rol Ama_tmp,Abu,Du,44 + xor_rol Abu_tmp,Asu,Du,55 + xor_rol Asu_tmp,Ase,De,41 + xor_rol Ame_tmp,Aga,Da,3 + eor Aba_tmp,Aba,Da + mov Ase_tmp,temp + .unreq temp + + .endif + + .unreq Da + .unreq De + .unreq Di + .unreq Do + .unreq Du + + tmp .req x30 + + bic_rol tmp, Abe_tmp, Abi_tmp,1 + xor_rol Aba, tmp, Aba_tmp,43 + bic_rol tmp, Abi_tmp, Abo_tmp,22 + xor_rol Abe, Abe_tmp, tmp,23 + bic_rol tmp ,Abo_tmp, Abu_tmp,7 + xor_rol Abi ,Abi_tmp, tmp,29 + bic_rol tmp ,Abu_tmp, Aba_tmp,14 + xor_rol Abo ,Abo_tmp, tmp,21 + bic_rol tmp ,Aba_tmp, Abe_tmp,20 + xor_rol Abu ,Abu_tmp, tmp,34 + + bic_rol tmp, Age_tmp, Agi_tmp,17 + xor_rol Aga, Aga_tmp, tmp,25 + bic_rol tmp, Agi_tmp, Ago_tmp,22 + xor_rol Age, Age_tmp, tmp,39 + bic_rol tmp ,Ago_tmp, Agu_tmp,48 + xor_rol Agi ,Agi_tmp, tmp,6 + bic_rol tmp ,Agu_tmp, Aga_tmp,33 + xor_rol Ago ,Ago_tmp, tmp,17 + bic_rol tmp ,Aga_tmp, Age_tmp,8 + xor_rol Agu ,Agu_tmp, tmp,41 + + .if \is_first == 0 + restore count, STACK_OFFSET_COUNT + .endif + + load_constant_ptr + + bic_rol tmp, Ake_tmp, Aki_tmp,45 + xor_rol Aka, Aka_tmp, tmp,40 + bic_rol tmp, Aki_tmp, Ako_tmp,17 + xor_rol Ake, Ake_tmp, tmp,62 + bic_rol tmp ,Ako_tmp, Aku_tmp,54 + xor_rol Aki ,Aki_tmp, tmp,7 + bic_rol tmp ,Aku_tmp, Aka_tmp,17 + xor_rol Ako ,Ako_tmp, tmp,7 + bic_rol tmp ,Aka_tmp, Ake_tmp,59 + xor_rol Aku ,Aku_tmp, tmp,12 + + bic_rol tmp, Ame_tmp, Ami_tmp,26 + xor_rol Ama, 
Ama_tmp, tmp,17 + bic_rol tmp, Ami_tmp, Amo_tmp,59 + xor_rol Ame, Ame_tmp, tmp,21 + bic_rol tmp ,Amo_tmp, Amu_tmp,23 + xor_rol Ami ,Ami_tmp, tmp,18 + bic_rol tmp ,Amu_tmp, Ama_tmp,29 + xor_rol Amo ,Amo_tmp, tmp,52 + bic_rol tmp ,Ama_tmp, Ame_tmp,55 + xor_rol Amu ,Amu_tmp, tmp,20 + + .if \is_first == 0 + ldr cur_const, [const_addr, count, UXTW #3] + add count, count, #1 + .else + ldr cur_const, [const_addr] + mov count, #1 + .endif + + bic_rol tmp, Ase_tmp, Asi_tmp,16 + xor_rol Asa, Asa_tmp, tmp,23 + bic_rol tmp, Asi_tmp, Aso_tmp,62 + xor_rol Ase, Ase_tmp, tmp,14 + bic_rol tmp ,Aso_tmp, Asu_tmp,39 + xor_rol Asi ,Asi_tmp, tmp,37 + bic_rol tmp ,Asu_tmp, Asa_tmp,4 + xor_rol Aso ,Aso_tmp, tmp,43 + bic_rol tmp ,Asa_tmp, Ase_tmp,7 + xor_rol Asu ,Asu_tmp, tmp,11 + + eor Aba, Aba, cur_const + +.endm + +.macro final_rotate /* In-place left-rotation of 23 lanes (all but Aba and Abo) by fixed per-lane amounts; invoked once after each round loop, presumably to complete rotations that the is_first=0 rounds leave pending (confirm against keccak_f1600_round) */ + rotate Aga, Aga,3 + rotate Aka, Aka,25 + rotate Ama, Ama,10 + rotate Asa, Asa,39 + rotate Abe, Abe,21 + rotate Age, Age,45 + rotate Ake, Ake,8 + rotate Ame, Ame,15 + rotate Ase, Ase,41 + rotate Abi, Abi,14 + rotate Agi, Agi,61 + rotate Aki, Aki,18 + rotate Ami, Ami,56 + rotate Asi, Asi,2 + rotate Ago, Ago,28 + rotate Ako, Ako,1 + rotate Amo, Amo,27 + rotate Aso, Aso,62 + rotate Abu, Abu,44 + rotate Agu, Agu,20 + rotate Aku, Aku,6 + rotate Amu, Amu,36 + rotate Asu, Asu,55 +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.global keccak_f1600_x4_scalar_asm_v1 +.global _keccak_f1600_x4_scalar_asm_v1 +.text +.align 4 + +keccak_f1600_x4_scalar_asm_v1: +_keccak_f1600_x4_scalar_asm_v1: /* Entry point: for each of the four scalar inputs (load_input_scalar 4,idx with idx 0..3, one after another) runs one is_first=1 round, then is_first=0 rounds while count <= KECCAK_F1600_ROUNDS-1, then final_rotate, and stores the lanes back through the reloaded input_addr. */ + alloc_stack + save_gprs + save input_addr, STACK_OFFSET_INPUT + + // First scalar Keccak computation + load_input_scalar 4,0 + keccak_f1600_round 1 +loop_0: + keccak_f1600_round 0 + cmp count, #(KECCAK_F1600_ROUNDS-1) + ble loop_0 + final_rotate + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4,0 + + // Second scalar Keccak computation + load_input_scalar 4, 1 + keccak_f1600_round 1 +loop_1: + keccak_f1600_round 0 + cmp count, #(KECCAK_F1600_ROUNDS-1) + ble loop_1 +
final_rotate + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4, 1 + + // Third scalar Keccak computation + load_input_scalar 4, 2 + keccak_f1600_round 1 +loop_2: + keccak_f1600_round 0 + cmp count, #(KECCAK_F1600_ROUNDS-1) + ble loop_2 + final_rotate + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4, 2 + + // Fourth scalar Keccak computation + load_input_scalar 4, 3 + keccak_f1600_round 1 +loop_3: + keccak_f1600_round 0 + cmp count, #(KECCAK_F1600_ROUNDS-1) + ble loop_3 + final_rotate + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4, 3 + + restore_gprs + free_stack + ret diff --git a/asm/manual/keccak_f1600/keccak_f1600_x4_scalar_asm_v5.s b/asm/manual/keccak_f1600/keccak_f1600_x4_scalar_asm_v5.s new file mode 100644 index 0000000..90fc545 --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x4_scalar_asm_v5.s @@ -0,0 +1,543 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .balign 64 +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x26 + cur_const .req x26 + count .req w27 + out_count .req w27 + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round.
*/ + Aba .req x1 + Abe .req x6 + Abi .req x11 + Abo .req x16 + Abu .req x21 + Aga .req x2 + Age .req x7 + Agi .req x12 + Ago .req x17 + Agu .req x22 + Aka .req x3 + Ake .req x8 + Aki .req x13 + Ako .req x18 + Aku .req x23 + Ama .req x4 + Ame .req x9 + Ami .req x14 + Amo .req x19 + Amu .req x24 + Asa .req x5 + Ase .req x10 + Asi .req x15 + Aso .req x20 + Asu .req x25 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + Aba_ .req x30 + Abe_ .req x28 + Abi_ .req x11 + Abo_ .req x16 + Abu_ .req x21 + Aga_ .req x3 + Age_ .req x8 + Agi_ .req x12 + Ago_ .req x17 + Agu_ .req x22 + Aka_ .req x4 + Ake_ .req x9 + Aki_ .req x13 + Ako_ .req x18 + Aku_ .req x23 + Ama_ .req x5 + Ame_ .req x10 + Ami_ .req x14 + Amo_ .req x19 + Amu_ .req x24 + Asa_ .req x1 + Ase_ .req x6 + Asi_ .req x15 + Aso_ .req x20 + Asu_ .req x25 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + C0 .req x30 + E0 .req x29 + C1 .req x26 + E1 .req x0 + C2 .req x27 + E2 .req x26 + C3 .req x28 + E3 .req x27 + C4 .req x29 + E4 .req x28 + + tmp .req x0 + +/************************ MACROS ****************************/ + +#define STACK_SIZE (16*6 + 3*8 + 8) // slots by offset: input (8), const (8), count (8), outcount (8), then GPRs (16*6) +#define STACK_BASE_GPRS (3*8+8) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) +#define STACK_OFFSET_OUTCOUNT (3*8) + +.macro alloc_stack /* Reserve STACK_SIZE bytes of stack */ + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack /* Release the STACK_SIZE bytes reserved by alloc_stack */ + add sp, sp, #(STACK_SIZE) +.endm + +.macro save reg, offset /* Store reg to [sp + offset] */ + str \reg, [sp, #\offset] +.endm + +.macro restore reg, offset /* Load reg from [sp + offset] */ + ldr \reg, [sp, #\offset] +.endm + +.macro save_gprs /* Store x19-x30 pairwise at sp + STACK_BASE_GPRS */ + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro
restore_gprs /* Reload x19-x30 from sp + STACK_BASE_GPRS */ + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro keccak_f1600_round_initial /* First round, fused with the input load: reads the 25 lanes from input_addr (lane k at byte offset 8*4*k) while accumulating C0..C4, spills input_addr, derives E0..E4 and the unrotated A_ values, then stores const_addr to its stack slot, applies the first round constant, sets count to 1 and saves it. */ + ldr Aku, [input_addr, 8*(4*(14) )] + ldr Ama, [input_addr, 8*(4*(14+1))] + ldr Asa, [input_addr, 8*(4*(20) )] + ldr Ase, [input_addr, 8*(4*(20+1))] + eor C0, Ama, Asa + ldr Ame, [input_addr, 8*(4*(16) )] + ldr Ami, [input_addr, 8*(4*(16+1))] + eor C1, Ame, Ase + ldr Asi, [input_addr, 8*(4*(22) )] + ldr Aso, [input_addr, 8*(4*(22+1))] + eor C2, Ami, Asi + ldr Amo, [input_addr, 8*(4*(18) )] + ldr Amu, [input_addr, 8*(4*(18+1))] + eor C3, Amo, Aso + ldr Asu, [input_addr, #(4*8*24)] + eor C4, Amu, Asu + ldr Aka, [input_addr, 8*(4*(10) )] + ldr Ake, [input_addr, 8*(4*(10+1))] + eor C0, Aka, C0 + eor C1, Ake, C1 + ldr Aki, [input_addr, 8*(4*(12) )] + ldr Ako, [input_addr, 8*(4*(12+1))] + eor C2, Aki, C2 + ldr Abu, [input_addr, 8*(4*(4))] + ldr Aga, [input_addr, 8*(4*(4+1) )] + eor C3, Ako, C3 + eor C4, Aku, C4 + ldr Age, [input_addr, 8*(4*(6))] + ldr Agi, [input_addr, 8*(4*(6+1) )] + eor C0, Aga, C0 + ldr Ago, [input_addr, 8*(4*(8))] + ldr Agu, [input_addr, 8*(4*(8+1) )] + eor C1, Age, C1 + ldr Aba, [input_addr, 8*(4*(0) )] + ldr Abe, [input_addr, 8*(4*(0+1) )] + eor C2, Agi, C2 + ldr Abi, [input_addr, 8*(4*(2))] + ldr Abo, [input_addr, 8*(4*(2+1) )] + eor C3, Ago, C3 + save input_addr, STACK_OFFSET_INPUT + eor C4, Agu, C4 + eor C0, Aba, C0 + eor C1, Abe, C1 + eor C2, Abi, C2 + eor C3, Abo, C3 + eor C4, Abu, C4 + + eor E1, C0, C2, ROR #63 + eor E3, C2, C4, ROR #63 + eor E0, C4, C1, ROR #63 + eor E2, C1, C3, ROR #63 + eor E4, C3, C0, ROR #63 + + eor Aba_, Aba, E0 + eor Asa_, Abi, E2 + eor Abi_, Aki, E2 + eor Aki_, Ako, E3 + eor Ako_, Amu, E4 + eor Amu_, Aso, E3 + eor Aso_, Ama, E0 + eor Aka_, Abe, E1 + eor Ase_, Ago, E3 + eor Ago_,
Ame, E1 + eor Ake_, Agi, E2 + eor Agi_, Aka, E0 + eor Aga_, Abo, E3 + eor Abo_, Amo, E3 + eor Amo_, Ami, E2 + eor Ami_, Ake, E1 + eor Age_, Agu, E4 + eor Agu_, Asi, E2 + eor Asi_, Aku, E4 + eor Aku_, Asa, E0 + eor Ama_, Abu, E4 + eor Abu_, Asu, E4 + eor Asu_, Ase, E1 + eor Ame_, Aga, E0 + eor Abe_, Age, E1 + + load_constant_ptr + + tmp0 .req x0 + tmp1 .req x29 + + bic tmp0, Agi_, Age_, ROR #47 + bic tmp1, Ago_, Agi_, ROR #42 + eor Aga, tmp0, Aga_, ROR #39 + bic tmp0, Agu_, Ago_, ROR #16 + eor Age, tmp1, Age_, ROR #25 + bic tmp1, Aga_, Agu_, ROR #31 + eor Agi, tmp0, Agi_, ROR #58 + bic tmp0, Age_, Aga_, ROR #56 + eor Ago, tmp1, Ago_, ROR #47 + bic tmp1, Aki_, Ake_, ROR #19 + eor Agu, tmp0, Agu_, ROR #23 + bic tmp0, Ako_, Aki_, ROR #47 + eor Aka, tmp1, Aka_, ROR #24 + bic tmp1, Aku_, Ako_, ROR #10 + eor Ake, tmp0, Ake_, ROR #2 + bic tmp0, Aka_, Aku_, ROR #47 + eor Aki, tmp1, Aki_, ROR #57 + bic tmp1, Ake_, Aka_, ROR #5 + eor Ako, tmp0, Ako_, ROR #57 + bic tmp0, Ami_, Ame_, ROR #38 + eor Aku, tmp1, Aku_, ROR #52 + bic tmp1, Amo_, Ami_, ROR #5 + eor Ama, tmp0, Ama_, ROR #47 + bic tmp0, Amu_, Amo_, ROR #41 + eor Ame, tmp1, Ame_, ROR #43 + bic tmp1, Ama_, Amu_, ROR #35 + eor Ami, tmp0, Ami_, ROR #46 + bic tmp0, Ame_, Ama_, ROR #9 + + str const_addr, [sp, #(STACK_OFFSET_CONST)] + ldr cur_const, [const_addr] + + eor Amo, tmp1, Amo_, ROR #12 + bic tmp1, Asi_, Ase_, ROR #48 + eor Amu, tmp0, Amu_, ROR #44 + bic tmp0, Aso_, Asi_, ROR #2 + eor Asa, tmp1, Asa_, ROR #41 + bic tmp1, Asu_, Aso_, ROR #25 + eor Ase, tmp0, Ase_, ROR #50 + bic tmp0, Asa_, Asu_, ROR #60 + eor Asi, tmp1, Asi_, ROR #27 + bic tmp1, Ase_, Asa_, ROR #57 + eor Aso, tmp0, Aso_, ROR #21 + + mov count, #1 + + bic tmp0, Abi_, Abe_, ROR #63 + eor Asu, tmp1, Asu_, ROR #53 + bic tmp1, Abo_, Abi_, ROR #42 + eor Aba, Aba_, tmp0, ROR #21 + bic tmp0, Abu_, Abo_, ROR #57 + eor Abe, tmp1, Abe_, ROR #41 + bic tmp1, Aba_, Abu_, ROR #50 + eor Abi, tmp0, Abi_, ROR #35 + bic tmp0, Abe_, Aba_, ROR #44 + eor Abo, tmp1, Abo_, ROR 
#43 + eor Abu, tmp0, Abu_, ROR #30 + + eor Aba, Aba, cur_const + save count, STACK_OFFSET_COUNT + +.endm + + +.macro keccak_f1600_round_noninitial + /* One round for every round after the first: fetches the 64-bit constant at const_addr[count], XORs it into Aba, and increments and saves count. */ + eor C2, Asi, Abi, ROR #52 + eor C0, Aba, Aga, ROR #61 + eor C4, Aku, Agu, ROR #50 + eor C1, Ake, Ame, ROR #57 + eor C3, Abo, Ako, ROR #63 + eor C2, C2, Aki, ROR #48 + eor C0, C0, Ama, ROR #54 + eor C4, C4, Amu, ROR #34 + eor C1, C1, Abe, ROR #51 + eor C3, C3, Amo, ROR #37 + eor C2, C2, Ami, ROR #10 + eor C0, C0, Aka, ROR #39 + eor C4, C4, Abu, ROR #26 + eor C1, C1, Ase, ROR #31 + eor C3, C3, Ago, ROR #36 + eor C2, C2, Agi, ROR #5 + eor C0, C0, Asa, ROR #25 + eor C4, C4, Asu, ROR #15 + eor C1, C1, Age, ROR #27 + eor C3, C3, Aso, ROR #2 + + eor E1, C0, C2, ROR #61 + ror C2, C2, 62 + eor E3, C2, C4, ROR #57 + ror C4, C4, 58 + eor E0, C4, C1, ROR #55 + ror C1, C1, 56 + eor E2, C1, C3, ROR #63 + eor E4, C3, C0, ROR #63 + + eor Aba_, E0, Aba + eor Asa_, E2, Abi, ROR #50 + eor Abi_, E2, Aki, ROR #46 + eor Aki_, E3, Ako, ROR #63 + eor Ako_, E4, Amu, ROR #28 + eor Amu_, E3, Aso, ROR #2 + eor Aso_, E0, Ama, ROR #54 + eor Aka_, E1, Abe, ROR #43 + eor Ase_, E3, Ago, ROR #36 + eor Ago_, E1, Ame, ROR #49 + eor Ake_, E2, Agi, ROR #3 + eor Agi_, E0, Aka, ROR #39 + eor Aga_, E3, Abo + eor Abo_, E3, Amo, ROR #37 + eor Amo_, E2, Ami, ROR #8 + eor Ami_, E1, Ake, ROR #56 + eor Age_, E4, Agu, ROR #44 + eor Agu_, E2, Asi, ROR #62 + eor Asi_, E4, Aku, ROR #58 + eor Aku_, E0, Asa, ROR #25 + eor Ama_, E4, Abu, ROR #20 + eor Abu_, E4, Asu, ROR #9 + eor Asu_, E1, Ase, ROR #23 + eor Ame_, E0, Aga, ROR #61 + eor Abe_, E1, Age, ROR #19 + /* const_addr and count are reloaded from their stack slots before use. */ + load_constant_ptr_stack + restore count, STACK_OFFSET_COUNT + + tmp0 .req x0 + tmp1 .req x29 + + bic tmp0, Agi_, Age_, ROR #47 + bic tmp1, Ago_, Agi_, ROR #42 + eor Aga, tmp0, Aga_, ROR #39 + bic tmp0, Agu_, Ago_, ROR #16 + eor Age, tmp1, Age_, ROR #25 + bic tmp1, Aga_, Agu_, ROR #31 + eor Agi, tmp0, Agi_, ROR #58 + bic tmp0, Age_, Aga_, ROR #56 + eor Ago, tmp1, Ago_, ROR #47 + bic tmp1, Aki_, Ake_, ROR #19 +
eor Agu, tmp0, Agu_, ROR #23 + bic tmp0, Ako_, Aki_, ROR #47 + eor Aka, tmp1, Aka_, ROR #24 + bic tmp1, Aku_, Ako_, ROR #10 + eor Ake, tmp0, Ake_, ROR #2 + bic tmp0, Aka_, Aku_, ROR #47 + eor Aki, tmp1, Aki_, ROR #57 + bic tmp1, Ake_, Aka_, ROR #5 + eor Ako, tmp0, Ako_, ROR #57 + bic tmp0, Ami_, Ame_, ROR #38 + eor Aku, tmp1, Aku_, ROR #52 + bic tmp1, Amo_, Ami_, ROR #5 + eor Ama, tmp0, Ama_, ROR #47 + bic tmp0, Amu_, Amo_, ROR #41 + eor Ame, tmp1, Ame_, ROR #43 + bic tmp1, Ama_, Amu_, ROR #35 + eor Ami, tmp0, Ami_, ROR #46 + bic tmp0, Ame_, Ama_, ROR #9 + /* cur_const = 64-bit table entry number `count` (index scaled by 8) */ + ldr cur_const, [const_addr, count, UXTW #3] + + eor Amo, tmp1, Amo_, ROR #12 + bic tmp1, Asi_, Ase_, ROR #48 + eor Amu, tmp0, Amu_, ROR #44 + bic tmp0, Aso_, Asi_, ROR #2 + eor Asa, tmp1, Asa_, ROR #41 + bic tmp1, Asu_, Aso_, ROR #25 + eor Ase, tmp0, Ase_, ROR #50 + bic tmp0, Asa_, Asu_, ROR #60 + eor Asi, tmp1, Asi_, ROR #27 + bic tmp1, Ase_, Asa_, ROR #57 + eor Aso, tmp0, Aso_, ROR #21 + bic tmp0, Abi_, Abe_, ROR #63 + add count, count, #1 + save count, STACK_OFFSET_COUNT + eor Asu, tmp1, Asu_, ROR #53 + bic tmp1, Abo_, Abi_, ROR #42 + eor Aba, Aba_, tmp0, ROR #21 + bic tmp0, Abu_, Abo_, ROR #57 + eor Abe, tmp1, Abe_, ROR #41 + bic tmp1, Aba_, Abu_, ROR #50 + eor Abi, tmp0, Abi_, ROR #35 + bic tmp0, Abe_, Aba_, ROR #44 + eor Abo, tmp1, Abo_, ROR #43 + eor Abu, tmp0, Abu_, ROR #30 + /* XOR the round constant loaded above into Aba */ + eor Aba, Aba, cur_const + +.endm + +.macro final_rotate_store + ror Aga, Aga,#(64-3) /* final_rotate_store: rotate the state words in place (Aba and Abo are stored unrotated) and store word i at input_addr + 8*4*i; presumably this undoes rotations deferred by the round macros - TODO confirm */ + restore input_addr, STACK_OFFSET_INPUT /* input_addr is reloaded from its stack slot before the first store */ + ror Abu, Abu,#(64-44) + ror Aka, Aka,#(64-25) + ror Ake, Ake,#(64-8) + str Abu, [input_addr, 8*(4*(4))] + str Aga, [input_addr, 8*(4*(4+1) )] + ror Ama, Ama,#(64-10) + ror Aku, Aku,#(64-6) + str Aka, [input_addr, 8*(4*(10) )] + str Ake, [input_addr, 8*(4*(10+1))] + ror Asa, Asa,#(64-39) + ror Ase, Ase,#(64-41) + str Aku, [input_addr, 8*(4*(14) )] + str Ama, [input_addr, 8*(4*(14+1))] + ror Abe, Abe,#(64-21) + ror Age, Age,#(64-45) + str Asa, [input_addr, 8*(4*(20) )] + str Ase, [input_addr, 8*(4*(20+1))]
+ ror Agi, Agi,#(64-61) + str Aba, [input_addr, 8*(4*(0) )] + str Abe, [input_addr, 8*(4*(0+1) )] + ror Ame, Ame,#(64-15) + ror Ami, Ami,#(64-56) + str Age, [input_addr, 8*(4*(6))] + str Agi, [input_addr, 8*(4*(6+1) )] + ror Abi, Abi,#(64-14) + ror Aki, Aki,#(64-18) + str Ame, [input_addr, 8*(4*(16) )] + str Ami, [input_addr, 8*(4*(16+1))] + ror Ako, Ako,#(64-1) + str Abi, [input_addr, 8*(4*(2))] + str Abo, [input_addr, 8*(4*(2+1) )] + ror Asi, Asi,#(64-2) + ror Aso, Aso,#(64-62) + str Aki, [input_addr, 8*(4*(12) )] + str Ako, [input_addr, 8*(4*(12+1))] + ror Ago, Ago,#(64-28) + ror Agu, Agu,#(64-20) + str Asi, [input_addr, 8*(4*(22) )] + str Aso, [input_addr, 8*(4*(22+1))] + ror Amo, Amo,#(64-27) + ror Amu, Amu,#(64-36) + str Ago, [input_addr, 8*(4*(8))] + str Agu, [input_addr, 8*(4*(8+1) )] + ror Asu, Asu,#(64-55) + str Amo, [input_addr, 8*(4*(18) )] + str Amu, [input_addr, 8*(4*(18+1))] + str Asu, [input_addr, #(4*8*24)] +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.text +.balign 16 +.global keccak_f1600_x4_scalar_asm_v5 +.global _keccak_f1600_x4_scalar_asm_v5 +/* Reload const_addr from its stack slot at sp + STACK_OFFSET_CONST. */ +.macro load_constant_ptr_stack + ldr const_addr, [sp, #(STACK_OFFSET_CONST)] +.endm +keccak_f1600_x4_scalar_asm_v5: +_keccak_f1600_x4_scalar_asm_v5: + alloc_stack + save_gprs + /* Outer loop: 4 passes; input_addr advances by 8 bytes after each pass. */ + mov out_count, #4 +1: + save out_count, STACK_OFFSET_OUTCOUNT + /* One initial round, then non-initial rounds for as long as count <= KECCAK_F1600_ROUNDS-1. */ + keccak_f1600_round_initial +loop: + keccak_f1600_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-1) + ble loop + /* Store the state of this pass through final_rotate_store. */ + final_rotate_store + add input_addr, input_addr, #8 + + restore out_count, STACK_OFFSET_OUTCOUNT + sub out_count, out_count, #1 + cbnz out_count, 1b + + + restore_gprs + free_stack + ret diff --git a/asm/manual/keccak_f1600/keccak_f1600_x4_v84a_asm_v1p0.s b/asm/manual/keccak_f1600/keccak_f1600_x4_v84a_asm_v1p0.s new file mode 100644 index 0000000..acce4c7 --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x4_v84a_asm_v1p0.s @@ -0,0 +1,452 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + *
SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +#if defined(__ARM_FEATURE_SHA3) + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +/* 24 entries of 8 bytes: one constant per round (KECCAK_F1600_ROUNDS). */ +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + out_count .req x4 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round.
*/ + Aba .req v0 + Abe .req v1 + Abi .req v2 + Abo .req v3 + Abu .req v4 + Aga .req v5 + Age .req v6 + Agi .req v7 + Ago .req v8 + Agu .req v9 + Aka .req v10 + Ake .req v11 + Aki .req v12 + Ako .req v13 + Aku .req v14 + Ama .req v15 + Ame .req v16 + Ami .req v17 + Amo .req v18 + Amu .req v19 + Asa .req v20 + Ase .req v21 + Asi .req v22 + Aso .req v23 + Asu .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Abiq .req q2 + Aboq .req q3 + Abuq .req q4 + Agaq .req q5 + Ageq .req q6 + Agiq .req q7 + Agoq .req q8 + Aguq .req q9 + Akaq .req q10 + Akeq .req q11 + Akiq .req q12 + Akoq .req q13 + Akuq .req q14 + Amaq .req q15 + Ameq .req q16 + Amiq .req q17 + Amoq .req q18 + Amuq .req q19 + Asaq .req q20 + Aseq .req q21 + Asiq .req q22 + Asoq .req q23 + Asuq .req q24 + + Abaz .req z0 + Abez .req z1 + Abiz .req z2 + Aboz .req z3 + Abuz .req z4 + Agaz .req z5 + Agez .req z6 + Agiz .req z7 + Agoz .req z8 + Aguz .req z9 + Akaz .req z10 + Akez .req z11 + Akiz .req z12 + Akoz .req z13 + Akuz .req z14 + Amaz .req z15 + Amez .req z16 + Amiz .req z17 + Amoz .req z18 + Amuz .req z19 + Asaz .req z20 + Asez .req z21 + Asiz .req z22 + Asoz .req z23 + Asuz .req z24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v25 + C1 .req v26 + C2 .req v27 + C3 .req v28 + C4 .req v29 + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req C4 + E1 .req C0 + E2 .req C1 + E3 .req C2 + E4 .req C3 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + Abi_ .req v2 + Abo_ .req v3 + Abu_ .req v4 + Aga_ .req v10 + Age_ .req v11 + Agi_ .req v7 + Ago_ .req v8 + Agu_ .req v9 + Aka_ .req v15 + Ake_ .req v16 + Aki_ .req v12 + Ako_ .req v13 + Aku_ .req v14 + Ama_ .req v20 + Ame_ .req v21 + Ami_ .req v17 + Amo_ .req v18 + Amu_ .req v19 + Asa_ .req v0 + Ase_ .req v1 + Asi_ .req v22 + Aso_ .req v23 + Asu_ .req v24 + Aba_ .req v30 + Abe_ .req E0 + +/************************ MACROS ****************************/ + +.macro load_input + ldr Abaq, 
[input_addr, #(4*8*0)] + ldr Abeq, [input_addr, #(4*8*1)] + ldr Abiq, [input_addr, #(4*8*2)] + ldr Aboq, [input_addr, #(4*8*3)] + ldr Abuq, [input_addr, #(4*8*4)] + ldr Agaq, [input_addr, #(4*8*5)] + ldr Ageq, [input_addr, #(4*8*6)] + ldr Agiq, [input_addr, #(4*8*7)] + ldr Agoq, [input_addr, #(4*8*8)] + ldr Aguq, [input_addr, #(4*8*9)] + ldr Akaq, [input_addr, #(4*8*10)] + ldr Akeq, [input_addr, #(4*8*11)] + ldr Akiq, [input_addr, #(4*8*12)] + ldr Akoq, [input_addr, #(4*8*13)] + ldr Akuq, [input_addr, #(4*8*14)] + ldr Amaq, [input_addr, #(4*8*15)] + ldr Ameq, [input_addr, #(4*8*16)] + ldr Amiq, [input_addr, #(4*8*17)] + ldr Amoq, [input_addr, #(4*8*18)] + ldr Amuq, [input_addr, #(4*8*19)] + ldr Asaq, [input_addr, #(4*8*20)] + ldr Aseq, [input_addr, #(4*8*21)] + ldr Asiq, [input_addr, #(4*8*22)] + ldr Asoq, [input_addr, #(4*8*23)] + ldr Asuq, [input_addr, #(4*8*24)] +.endm +/* Store the 25 q registers back to input_addr at a 32-byte stride, using the same offsets as load_input. */ +.macro store_input + str Abaq, [input_addr, #(4*8*0)] + str Abeq, [input_addr, #(4*8*1)] + str Abiq, [input_addr, #(4*8*2)] + str Aboq, [input_addr, #(4*8*3)] + str Abuq, [input_addr, #(4*8*4)] + str Agaq, [input_addr, #(4*8*5)] + str Ageq, [input_addr, #(4*8*6)] + str Agiq, [input_addr, #(4*8*7)] + str Agoq, [input_addr, #(4*8*8)] + str Aguq, [input_addr, #(4*8*9)] + str Akaq, [input_addr, #(4*8*10)] + str Akeq, [input_addr, #(4*8*11)] + str Akiq, [input_addr, #(4*8*12)] + str Akoq, [input_addr, #(4*8*13)] + str Akuq, [input_addr, #(4*8*14)] + str Amaq, [input_addr, #(4*8*15)] + str Ameq, [input_addr, #(4*8*16)] + str Amiq, [input_addr, #(4*8*17)] + str Amoq, [input_addr, #(4*8*18)] + str Amuq, [input_addr, #(4*8*19)] + str Asaq, [input_addr, #(4*8*20)] + str Aseq, [input_addr, #(4*8*21)] + str Asiq, [input_addr, #(4*8*22)] + str Asoq, [input_addr, #(4*8*23)] + str Asuq, [input_addr, #(4*8*24)] +.endm + +#define STACK_SIZE (16*4 + 16*6 + 16*5) // VREGS (16*4) + GPRS (16*6, TODO: Remove) + vector spill slots for save()/restore() (16*5) + +#define STACK_BASE_GPRS (16*4) +#define STACK_BASE_VTMP (16*4 + 16*6) + +#define save(name)\ + str name ## q,
[sp, #(STACK_BASE_VTMP + 16*(name ## _offset))] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_VTMP + 16*(name ## _offset))] +/* Spill-slot index per register for save()/restore(): 16 bytes each, counted from STACK_BASE_VTMP. */ +#define Aga_offset 0 +#define Age_offset 1 +#define Agi_offset 2 +#define Ago_offset 3 +#define Agu_offset 4 + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm +/* Store d8-d15 in the first 64 bytes of the frame; restore_vregs reloads them. */ +.macro save_vregs + stp d8, d9, [sp, #(16*0)] + stp d10, d11, [sp, #(16*1)] + stp d12, d13, [sp, #(16*2)] + stp d14, d15, [sp, #(16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(16*0)] + ldp d10, d11, [sp, #(16*1)] + ldp d12, d13, [sp, #(16*2)] + ldp d14, d15, [sp, #(16*3)] +.endm + +/* Macros using v8.4-A SHA-3 instructions */ + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm + xar \d\().2d, \s0\().2d, \s1\().2d, #\imm +.endm + +.macro rax1_m1 d s0 s1 + xar_m0 tmp, vzr, \s1, 63 + eor \d\().16b, \s0\().16b, tmp.16b +.endm + +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro bcax_m2 d s0 s1 s2 + bcax \d\()z.d, \s0\()z.d, \s1\()z.d, \s2\()z.d +.endm + +/* Keccak-f1600 round */ + +.macro keccak_f1600_round + + eor3_m0 C2, Ami, Agi, Aki + eor3_m0 C0, Ama, Aga, Aka + eor3_m0 C1, Ame, Age, Ake + eor3_m0 C3, Amo, Ago, Ako + eor3_m0 C4, Asu, Agu, Aku + /* vzr (v31) is zeroed here in every round and serves as the zero operand of the xar_m0 calls below. */ + vzr .req v31 + movi vzr.2d, #0 + + eor3_m0 C2, C2, Abi, Asi + save(Agi) SEP C1r .req Agi + eor3_m0 C0, C0, Aba, Asa + eor3_m0 C1, C1, Abe, Ase + save(Agu) SEP C3r .req Agu + eor3_m0 C3, C3, Abo, Aso + eor3_m0 C4, C4, Amu, Abu + /* Aga..Agu are spilled with save() so their registers can hold C0r..C4r; restore() reloads them further down. */ + save(Ago) SEP C2r .req Ago + xar_m0 C1r, vzr, C1, 63 + xar_m0 C3r, vzr, C3, 63 + save(Aga) SEP C4r .req Aga + xar_m0 C2r, vzr, C2, 63 + xar_m0 C4r, vzr, C4, 63 + save(Age) SEP C0r .req Age + eor2 E0, C4, C1r + xar_m0 C0r, vzr, C0, 63 + eor2 E2, C1, C3r + eor2 E1, C0, C2r +
restore(Agu) // C3r + eor2 E3, C2, C4r + eor2 E4, C3, C0r + restore(Ago) // C2r + restore(Agi) // C1r/Cor + + eor Aba_.16b, Aba.16b, E0.16b + xar_m0 Asa_, Abi, E2, 2 + restore(Aga) // C4r + xar_m0 Abi_, Aki, E2, 21 + xar_m0 Aki_, Ako, E3, 39 + restore(Age) // C0r + xar_m0 Ako_, Amu, E4, 56 + xar_m0 Amu_, Aso, E3, 8 + xar_m0 Aso_, Ama, E0, 23 + xar_m0 Aka_, Abe, E1, 63 + xar_m0 Ase_, Ago, E3, 9 + xar_m0 Ago_, Ame, E1, 19 + xar_m0 Ake_, Agi, E2, 58 + xar_m0 Agi_, Aka, E0, 61 + xar_m0 Aga_, Abo, E3, 36 + xar_m0 Abo_, Amo, E3, 43 + xar_m0 Amo_, Ami, E2, 49 + xar_m0 Ami_, Ake, E1, 54 + xar_m0 Age_, Agu, E4, 44 + xar_m0 Agu_, Asi, E2, 3 + xar_m0 Asi_, Aku, E4, 25 + xar_m0 Aku_, Asa, E0, 46 + xar_m0 Ama_, Abu, E4, 37 + xar_m0 Abu_, Asu, E4, 50 + xar_m0 Asu_, Ase, E1, 62 + xar_m0 Ame_, Aga, E0, 28 + xar_m0 Abe_, Age, E1, 20 + /* Load the next 64-bit constant into both lanes of v31 (this overwrites vzr) and post-increment const_addr by 8. */ + ld1r {v31.2d}, [const_addr], #8 + + bcax_m0 Aga, Aga_, Agi_, Age_ + bcax_m0 Age, Age_, Ago_, Agi_ + bcax_m0 Agi, Agi_, Agu_, Ago_ + bcax_m0 Ago, Ago_, Aga_, Agu_ + bcax_m0 Agu, Agu_, Age_, Aga_ + bcax_m0 Aka, Aka_, Aki_, Ake_ + bcax_m0 Ake, Ake_, Ako_, Aki_ + bcax_m0 Aki, Aki_, Aku_, Ako_ + bcax_m0 Ako, Ako_, Aka_, Aku_ + bcax_m0 Aku, Aku_, Ake_, Aka_ + bcax_m0 Ama, Ama_, Ami_, Ame_ + bcax_m0 Ame, Ame_, Amo_, Ami_ + bcax_m0 Ami, Ami_, Amu_, Amo_ + bcax_m0 Amo, Amo_, Ama_, Amu_ + bcax_m0 Amu, Amu_, Ame_, Ama_ + bcax_m0 Asa, Asa_, Asi_, Ase_ + bcax_m0 Ase, Ase_, Aso_, Asi_ + bcax_m0 Asi, Asi_, Asu_, Aso_ + bcax_m0 Aso, Aso_, Asa_, Asu_ + bcax_m0 Asu, Asu_, Ase_, Asa_ + bcax_m0 Aba, Aba_, Abi_, Abe_ + bcax_m0 Abe, Abe_, Abo_, Abi_ + bcax_m0 Abi, Abi_, Abu_, Abo_ + bcax_m0 Abo, Abo_, Aba_, Abu_ + bcax_m0 Abu, Abu_, Abe_, Aba_ + + // iota step + eor Aba.16b, Aba.16b, v31.16b + +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.text +.align 4 +.global keccak_f1600_x4_v84a_asm_v1p0 +.global _keccak_f1600_x4_v84a_asm_v1p0 + +keccak_f1600_x4_v84a_asm_v1p0: +_keccak_f1600_x4_v84a_asm_v1p0: + alloc_stack + save_vregs + /* Two passes; each reloads the constant pointer, because the round macro post-increments const_addr. */ + mov out_count, #2 +1: + load_constant_ptr +
load_input + mov count, #(KECCAK_F1600_ROUNDS) +2: + keccak_f1600_round + sub count, count, #1 + cbnz count, 2b + /* count reached 0 after KECCAK_F1600_ROUNDS rounds: write the state back and advance input_addr by 16 bytes. */ + store_input + add input_addr, input_addr, #16 + + sub out_count, out_count, #1 + cbnz out_count, 1b + + restore_vregs + free_stack + ret + +#endif /* __ARM_FEATURE_SHA3 */ diff --git a/asm/manual/keccak_f1600/keccak_f1600_x5_hybrid_asm_v8.s b/asm/manual/keccak_f1600/keccak_f1600_x5_hybrid_asm_v8.s new file mode 100644 index 0000000..b26e3fa --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x5_hybrid_asm_v8.s @@ -0,0 +1,1635 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +round_constants_vec: + .quad 0x0000000000000001 + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 
0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + .quad 0x8000000080008008 +/****************** REGISTER ALLOCATIONS *******************/ + /* NOTE: count and out_count are the same register (w27); x27, x26 (cur_const) and x29 (const_addr) are also used for the scalar sC and sE temporaries defined further down. */ + input_addr .req x0 + const_addr .req x29 + count .req w27 + out_count .req w27 + cur_const .req x26 + + /* Mapping of Keccak-f1600 SIMD state to vector registers + * at the beginning and end of each round. */ + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. */ + vAba .req v0 + vAbe .req v1 + vAbi .req v2 + vAbo .req v3 + vAbu .req v4 + vAga .req v5 + vAge .req v6 + vAgi .req v7 + vAgo .req v8 + vAgu .req v9 + vAka .req v10 + vAke .req v11 + vAki .req v12 + vAko .req v13 + vAku .req v14 + vAma .req v15 + vAme .req v16 + vAmi .req v17 + vAmo .req v18 + vAmu .req v19 + vAsa .req v20 + vAse .req v21 + vAsi .req v22 + vAso .req v23 + vAsu .req v24 + + /* q-form of the above mapping */ + vAbaq .req q0 + vAbeq .req q1 + vAbiq .req q2 + vAboq .req q3 + vAbuq .req q4 + vAgaq .req q5 + vAgeq .req q6 + vAgiq .req q7 + vAgoq .req q8 + vAguq .req q9 + vAkaq .req q10 + vAkeq .req q11 + vAkiq .req q12 + vAkoq .req q13 + vAkuq .req q14 + vAmaq .req q15 + vAmeq .req q16 + vAmiq .req q17 + vAmoq .req q18 + vAmuq .req q19 + vAsaq .req q20 + vAseq .req q21 + vAsiq .req q22 + vAsoq .req q23 + vAsuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v27 + C1 .req v28 + C2 .req v29 + C3 .req v30 + C4 .req v31 + + C0q .req q27 + C1q .req q28 + C2q .req q29 + C3q .req q30 + C4q .req q31 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vBba .req v25 // fresh + vBbe .req v26 // fresh + vBbi .req vAbi + vBbo .req vAbo + vBbu .req vAbu + vBga .req vAka + vBge .req vAke + vBgi .req vAgi + vBgo .req vAgo + vBgu .req vAgu + vBka .req vAma + vBke .req vAme + vBki .req vAki + vBko .req vAko + vBku .req vAku + vBma .req vAsa + vBme .req vAse + vBmi .req vAmi + vBmo
.req vAmo + vBmu .req vAmu + vBsa .req vAba + vBse .req vAbe + vBsi .req vAsi + vBso .req vAso + vBsu .req vAsu + + vBbaq .req q25 // fresh + vBbeq .req q26 // fresh + vBbiq .req vAbiq + vBboq .req vAboq + vBbuq .req vAbuq + vBgaq .req vAkaq + vBgeq .req vAkeq + vBgiq .req vAgiq + vBgoq .req vAgoq + vBguq .req vAguq + vBkaq .req vAmaq + vBkeq .req vAmeq + vBkiq .req vAkiq + vBkoq .req vAkoq + vBkuq .req vAkuq + vBmaq .req vAsaq + vBmeq .req vAseq + vBmiq .req vAmiq + vBmoq .req vAmoq + vBmuq .req vAmuq + vBsaq .req vAbaq + vBseq .req vAbeq + vBsiq .req vAsiq + vBsoq .req vAsoq + vBsuq .req vAsuq + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req C4 + E1 .req C0 + E2 .req vBbe // fresh + E3 .req C2 + E4 .req C3 + + E0q .req C4q + E1q .req C0q + E2q .req vBbeq // fresh + E3q .req C2q + E4q .req C3q + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round. */ + s_Aba .req x1 + sAbe .req x6 + sAbi .req x11 + sAbo .req x16 + sAbu .req x21 + sAga .req x2 + sAge .req x7 + sAgi .req x12 + sAgo .req x17 + sAgu .req x22 + sAka .req x3 + sAke .req x8 + sAki .req x13 + sAko .req x18 + sAku .req x23 + sAma .req x4 + sAme .req x9 + sAmi .req x14 + sAmo .req x19 + sAmu .req x24 + sAsa .req x5 + sAse .req x10 + sAsi .req x15 + sAso .req x20 + sAsu .req x25 + + /* sA_[y,2*x+3*y] = rot(A[x,y]) */ + s_Aba_ .req x0 + sAbe_ .req x28 + sAbi_ .req x11 + sAbo_ .req x16 + sAbu_ .req x21 + sAga_ .req x3 + sAge_ .req x8 + sAgi_ .req x12 + sAgo_ .req x17 + sAgu_ .req x22 + sAka_ .req x4 + sAke_ .req x9 + sAki_ .req x13 + sAko_ .req x18 + sAku_ .req x23 + sAma_ .req x5 + sAme_ .req x10 + sAmi_ .req x14 + sAmo_ .req x19 + sAmu_ .req x24 + sAsa_ .req x1 + sAse_ .req x6 + sAsi_ .req x15 + sAso_ .req x20 + sAsu_ .req x25 + + /* sC[x] = sA[x,0] xor sA[x,1] xor sA[x,2] xor sA[x,3] xor sA[x,4], for x in 0..4 */ + /* sE[x] = sC[x-1] xor rot(sC[x+1],1), for x in 0..4 */ + sC0 .req x0 + sE0 .req x29 + sC1 .req x26 + sE1 .req x30 + sC2 .req x27 +
sE2 .req x26 + sC3 .req x28 + sE3 .req x27 + sC4 .req x29 + sE4 .req x28 + + tmp .req x30 + +/************************ MACROS ****************************/ + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm +/* d = s0 ^ s1 ^ s2, built from two eor2 steps. */ +.macro eor3_m1 d s0 s1 s2 + eor2 \d, \s0, \s1 + eor2 \d, \d, \s2 +.endm +/* d = s0 ^ rol64(s1, 1) per 64-bit lane; clobbers vvtmp, which the caller must have bound with .req. */ +.macro rax1_m1 d s0 s1 + shl vvtmp.2d, \s1\().2d, #1 + sri vvtmp.2d, \s1\().2d, #63 + eor \d\().16b, vvtmp.16b, \s0\().16b +.endm +/* d = ror64(s0 ^ s1, imm) per 64-bit lane. NOTE: s0 is overwritten with s0 ^ s1. */ + .macro xar_m1 d s0 s1 imm + // Special cases where we can replace SHLs by ADDs + .if \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm +/* d = s0 ^ (s1 & ~s2); clobbers vvtmp. */ +.macro bcax_m1 d s0 s1 s2 + bic vvtmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, vvtmp.16b, \s0\().16b +.endm +/* Load 25 consecutive 16-byte words from input_addr into vAba..vAsu (q forms). */ +.macro load_input_vector + ldp vAbaq, vAbeq, [input_addr, #(16*0)] + ldp vAbiq, vAboq, [input_addr, #(16*2)] + ldp vAbuq, vAgaq, [input_addr, #(16*4)] + ldp vAgeq, vAgiq, [input_addr, #(16*6)] + ldp vAgoq, vAguq, [input_addr, #(16*8)] + ldp vAkaq, vAkeq, [input_addr, #(16*10)] + ldp vAkiq, vAkoq, [input_addr, #(16*12)] + ldp vAkuq, vAmaq, [input_addr, #(16*14)] + ldp vAmeq, vAmiq, [input_addr, #(16*16)] + ldp vAmoq, vAmuq, [input_addr, #(16*18)] + ldp vAsaq, vAseq, [input_addr, #(16*20)] + ldp vAsiq, vAsoq, [input_addr, #(16*22)] + ldr vAsuq, [input_addr, #(16*24)] + + // ldr vAbaq, [input_addr, #(16*0)] + // ldr vAbeq, [input_addr, #(16*1)] + // ldr vAbiq, [input_addr, #(16*2)] + // ldr vAboq, [input_addr, #(16*3)] + // ldr vAbuq, [input_addr, #(16*4)] + // ldr vAgaq, [input_addr, #(16*5)] + // ldr vAgeq, [input_addr, #(16*6)] + // ldr vAgiq, [input_addr, #(16*7)] + // ldr vAgoq, [input_addr, #(16*8)] + // ldr vAguq, [input_addr, #(16*9)] + // ldr vAkaq, [input_addr, #(16*10)] + // ldr vAkeq, [input_addr, #(16*11)] + // ldr vAkiq, [input_addr, #(16*12)] + // ldr vAkoq,
[input_addr, #(16*13)] + // ldr vAkuq, [input_addr, #(16*14)] + // ldr vAmaq, [input_addr, #(16*15)] + // ldr vAmeq, [input_addr, #(16*16)] + // ldr vAmiq, [input_addr, #(16*17)] + // ldr vAmoq, [input_addr, #(16*18)] + // ldr vAmuq, [input_addr, #(16*19)] + // ldr vAsaq, [input_addr, #(16*20)] + // ldr vAseq, [input_addr, #(16*21)] + // ldr vAsiq, [input_addr, #(16*22)] + // ldr vAsoq, [input_addr, #(16*23)] + // ldr vAsuq, [input_addr, #(16*24)] +.endm + +.macro store_input_vector + stp vAbaq, vAbeq, [input_addr, #(16*0)] + stp vAbiq, vAboq, [input_addr, #(16*2)] + stp vAbuq, vAgaq, [input_addr, #(16*4)] + stp vAgeq, vAgiq, [input_addr, #(16*6)] + stp vAgoq, vAguq, [input_addr, #(16*8)] + stp vAkaq, vAkeq, [input_addr, #(16*10)] + stp vAkiq, vAkoq, [input_addr, #(16*12)] + stp vAkuq, vAmaq, [input_addr, #(16*14)] + stp vAmeq, vAmiq, [input_addr, #(16*16)] + stp vAmoq, vAmuq, [input_addr, #(16*18)] + stp vAsaq, vAseq, [input_addr, #(16*20)] + stp vAsiq, vAsoq, [input_addr, #(16*22)] + str vAsuq, [input_addr, #(16*24)] + + // str vAbaq, [input_addr, #(16*0)] + // str vAbeq, [input_addr, #(16*1)] + // str vAbiq, [input_addr, #(16*2)] + // str vAboq, [input_addr, #(16*3)] + // str vAbuq, [input_addr, #(16*4)] + // str vAgaq, [input_addr, #(16*5)] + // str vAgeq, [input_addr, #(16*6)] + // str vAgiq, [input_addr, #(16*7)] + // str vAgoq, [input_addr, #(16*8)] + // str vAguq, [input_addr, #(16*9)] + // str vAkaq, [input_addr, #(16*10)] + // str vAkeq, [input_addr, #(16*11)] + // str vAkiq, [input_addr, #(16*12)] + // str vAkoq, [input_addr, #(16*13)] + // str vAkuq, [input_addr, #(16*14)] + // str vAmaq, [input_addr, #(16*15)] + // str vAmeq, [input_addr, #(16*16)] + // str vAmiq, [input_addr, #(16*17)] + // str vAmoq, [input_addr, #(16*18)] + // str vAmuq, [input_addr, #(16*19)] + // str vAsaq, [input_addr, #(16*20)] + // str vAseq, [input_addr, #(16*21)] + // str vAsiq, [input_addr, #(16*22)] + // str vAsoq, [input_addr, #(16*23)] + // str vAsuq, [input_addr, 
#(16*24)] +.endm +/* Load 25 consecutive 64-bit words from input_addr into the scalar state registers s_Aba..sAsu. */ +.macro load_input_scalar + ldp s_Aba, sAbe, [input_addr,8*0 ] + ldp sAbi, sAbo, [input_addr,8*2 ] + ldp sAbu, sAga, [input_addr,8*4 ] + ldp sAge, sAgi, [input_addr,8*6 ] + ldp sAgo, sAgu, [input_addr,8*8 ] + ldp sAka, sAke, [input_addr,8*10] + ldp sAki, sAko, [input_addr,8*12] + ldp sAku, sAma, [input_addr,8*14] + ldp sAme, sAmi, [input_addr,8*16] + ldp sAmo, sAmu, [input_addr,8*18] + ldp sAsa, sAse, [input_addr,8*20] + ldp sAsi, sAso, [input_addr,8*22] + ldr sAsu, [input_addr,8*24] +.endm +/* Store the scalar state registers back to the same 25 offsets. */ +.macro store_input_scalar + stp s_Aba, sAbe, [input_addr,8*0 ] + stp sAbi, sAbo, [input_addr,8*2 ] + stp sAbu, sAga, [input_addr,8*4 ] + stp sAge, sAgi, [input_addr,8*6 ] + stp sAgo, sAgu, [input_addr,8*8 ] + stp sAka, sAke, [input_addr,8*10] + stp sAki, sAko, [input_addr,8*12] + stp sAku, sAma, [input_addr,8*14] + stp sAme, sAmi, [input_addr,8*16] + stp sAmo, sAmu, [input_addr,8*18] + stp sAsa, sAse, [input_addr,8*20] + stp sAsi, sAso, [input_addr,8*22] + str sAsu, [input_addr,8*24] +.endm + /* Frame layout from sp: d8-d15 (4*16), x19-x30 (12*8), scalar scratch slots addressed by the STACK_OFFSET_* values (6*8), one 16-byte vector spill slot (16*1). */ + +#define STACK_SIZE (4*16 + 12*8 + 6*8 + 16*1) +#define STACK_BASE_VREGS (0) +#define STACK_BASE_GPRS (4*16) +#define STACK_BASE_TMP_GPRS (4*16 + 12*8) +#define STACK_BASE_TMP_VREGS (4*16 + 12*8 + 6*8) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) +#define STACK_OFFSET_COUNT_OUT (3*8) +#define STACK_OFFSET_CUR_INPUT (4*8) + +#define vAga_offset 0 + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP_VREGS + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP_VREGS + 16 * name ## _offset)] + /* save_gprs / restore_gprs: spill and reload x19-x30 at STACK_BASE_GPRS. */ + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)]
+ ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro save_vregs + stp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] + stp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + stp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + stp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] +.endm + +.macro restore_vregs + ldp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] + ldp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + ldp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + ldp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] +.endm + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm +/* dst = src0 ^ src1 ^ src2 ^ src3 ^ src4 */ +.macro eor5 dst, src0, src1, src2, src3, src4 + eor \dst, \src0, \src1 + eor \dst, \dst, \src2 + eor \dst, \dst, \src3 + eor \dst, \dst, \src4 +.endm +/* dst = src0 ^ rol64(src1, imm), encoded as ROR #(64-imm) */ +.macro xor_rol dst, src1, src0, imm + eor \dst, \src0, \src1, ROR #(64-\imm) +.endm +/* dst = src0 & ~rol64(src1, imm) */ +.macro bic_rol dst, src1, src0, imm + bic \dst, \src0, \src1, ROR #(64-\imm) +.endm +/* dst = rol64(src, imm) */ +.macro rotate dst, src, imm + ror \dst, \src, #(64-\imm) +.endm +/* save / restore: store or load a GPR at byte offset `offset` inside the scratch area at STACK_BASE_TMP_GPRS. */ +.macro save reg, offset + str \reg, [sp, #(STACK_BASE_TMP_GPRS + \offset)] +.endm + +.macro restore reg, offset + ldr \reg, [sp, #(STACK_BASE_TMP_GPRS + \offset)] +.endm + +.macro hybrid_round_initial +eor sC0, sAma, sAsa SEP +eor sC1, sAme, sAse SEP eor3_m1 C1,vAbe,vAge,vAke +eor sC2, sAmi, sAsi SEP +eor sC3, sAmo, sAso SEP +eor sC4, sAmu, sAsu SEP +eor sC0, sAka, sC0 SEP eor3_m1 C3,vAbo,vAgo,vAko +eor sC1, sAke, sC1 SEP +eor sC2, sAki, sC2 SEP +eor sC3, sAko, sC3 SEP +eor sC4, sAku, sC4 SEP eor3_m1 C0,vAba,vAga,vAka +eor sC0, sAga, sC0 SEP +eor sC1, sAge, sC1 SEP +eor sC2, sAgi, sC2 SEP +eor sC3, sAgo, sC3 SEP eor3_m1 C2,vAbi,vAgi,vAki +eor sC4, sAgu, sC4 SEP +eor sC0, s_Aba, sC0 SEP +eor sC1, sAbe, sC1 SEP +eor sC2, sAbi, sC2 SEP eor3_m1 C4,vAbu,vAgu,vAku +eor sC3, sAbo, sC3 SEP +eor sC4, sAbu, sC4 SEP
+eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP eor3_m1 C1, C1,vAme, vAse +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, s_Aba, sE0 SEP eor3_m1 C3, C3,vAmo, vAso +eor sAsa_, sAbi, sE2 SEP +eor sAbi_, sAki, sE2 SEP +eor sAki_, sAko, sE3 SEP +eor sAko_, sAmu, sE4 SEP eor3_m1 C0, C0,vAma, vAsa +eor sAmu_, sAso, sE3 SEP +eor sAso_, sAma, sE0 SEP +eor sAka_, sAbe, sE1 SEP +eor sAse_, sAgo, sE3 SEP eor3_m1 C2, C2,vAmi, vAsi +eor sAgo_, sAme, sE1 SEP +eor sAke_, sAgi, sE2 SEP +eor sAgi_, sAka, sE0 SEP +eor sAga_, sAbo, sE3 SEP eor3_m1 C4, C4,vAmu, vAsu +eor sAbo_, sAmo, sE3 SEP +eor sAmo_, sAmi, sE2 SEP vvtmp .req vBba +eor sAmi_, sAke, sE1 SEP +eor sAge_, sAgu, sE4 SEP rax1_m1 E2, C1, C3 +eor sAgu_, sAsi, sE2 SEP +eor sAsi_, sAku, sE4 SEP +eor sAku_, sAsa, sE0 SEP +eor sAma_, sAbu, sE4 SEP rax1_m1 E4, C3, C0 +eor sAbu_, sAsu, sE4 SEP +eor sAsu_, sAse, sE1 SEP +eor sAme_, sAga, sE0 SEP +eor sAbe_, sAge, sE1 SEP rax1_m1 E1, C0, C2 +load_constant_ptr SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP rax1_m1 E3, C2, C4 +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP rax1_m1 E0, C4, C1 +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP .unreq vvtmp +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP vvtmp .req C1 +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP vvtmpq .req C1q +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP eor vBba.16b, vAba.16b, E0.16b +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP xar_m1 vBsa, vAbi, E2, 2 +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP xar_m1 vBbi, vAki, E2, 21 +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, 
sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP xar_m1 vBki, vAko, E3, 39 +eor sAmi, tmp, sAmi_, ROR #46 SEP +ldr cur_const, [const_addr] SEP +mov count, #1 SEP +bic tmp, sAma_, sAmu_, ROR #35 SEP xar_m1 vBko, vAmu, E4, 56 +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP xar_m1 vBmu, vAso, E3, 8 +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP xar_m1 vBso, vAma, E0, 23 +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP xar_m1 vBka, vAbe, E1, 63 +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP xar_m1 vBse, vAgo, E3, 9 +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP xar_m1 vBgo, vAme, E1, 19 +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP +eor s_Aba, s_Aba, cur_const SEP xar_m1 vBke, vAgi, E2, 58 +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP xar_m1 vBgi, vAka, E0, 61 +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP xar_m1 vBga, vAbo, E3, 36 +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP +eor sC0, sAga, sC0, ROR #57 SEP xar_m1 vBbo, vAmo, E3, 43 +eor sC1, sAme, sC1, ROR #58 SEP +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP xar_m1 vBmo, vAmi, E2, 49 +eor sC0, s_Aba, sC0, ROR #61 SEP +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, 
ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP +eor sC4, sAku, sC4, ROR #50 SEP xar_m1 vBmi, vAke, E1, 54 +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP xar_m1 vBge, vAgu, E4, 44 +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP mov E3.16b, vAga.16b +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP bcax_m1 vAga, vBga, vBgi, vBge +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP xar_m1 vBgu, vAsi, E2, 3 +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP xar_m1 vBsi, vAku, E4, 25 +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP xar_m1 vBku, vAsa, E0, 46 +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP xar_m1 vBma, vAbu, E4, 37 +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP xar_m1 vBbu, vAsu, E4, 50 +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP xar_m1 vBsu, vAse, E1, 62 +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP +eor sAgu, tmp, sAgu_, ROR #23 SEP xar_m1 vBme, E3, E0, 28 +bic tmp, sAki_, sAke_, ROR #19 SEP +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP 
+bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP xar_m1 vBbe, vAge, E1, 20 +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP bcax_m1 vAge, vBge, vBgo, vBgi +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP bcax_m1 vAgi, vBgi, vBgu, vBgo +bic tmp, sAma_, sAmu_, ROR #35 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP bcax_m1 vAgo, vBgo, vBga, vBgu +bic tmp, sAsi_, sAse_, ROR #48 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP bcax_m1 vAgu, vBgu, vBge, vBga +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP bcax_m1 vAka, vBka, vBki, vBke +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m1 vAke, vBke, vBko, vBki +eor s_Aba, s_Aba_, tmp, ROR #21 SEP .unreq vvtmp +bic tmp, sAbo_, sAbi_, ROR #42 SEP +eor sAbe, tmp, sAbe_, ROR #41 SEP .unreq vvtmpq +bic tmp, sAbu_, sAbo_, ROR #57 SEP eor2 C0, vAka, vAga +eor sAbi, tmp, sAbi_, ROR #35 SEP vvtmp .req vAga +bic tmp, s_Aba_, sAbu_, ROR #50 SEP save(vAga) +eor sAbo, tmp, sAbo_, ROR #43 SEP vvtmpq .req vAgaq +bic tmp, sAbe_, s_Aba_, ROR #44 SEP bcax_m1 vAki, vBki, vBku, vBko +eor sAbu, tmp, sAbu_, ROR #30 SEP +add count, count, #1 SEP +eor s_Aba, s_Aba, cur_const SEP + SEP +save count, STACK_OFFSET_COUNT SEP bcax_m1 vAko, vBko, vBka, vBku +eor sC0, sAka, sAsa, ROR #50 SEP +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP +eor sC3, sAgo, sAso, ROR #30 SEP eor2 C1, vAke, vAge +eor sC4, sAbu, sAsu, ROR #53 SEP +eor sC0, sAma, 
sC0, ROR #49 SEP bcax_m1 vAku, vBku, vBke, vBka +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP eor2 C2, vAki, vAgi +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP bcax_m1 vAma, vBma, vBmi, vBme +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP eor2 C3, vAko, vAgo +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP bcax_m1 vAme, vBme, vBmo, vBmi +eor sC3, sAbo, sC3, ROR #63 SEP +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP eor2 C4, vAku, vAgu +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP bcax_m1 vAmi, vBmi, vBmu, vBmo +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP eor2 C0, C0, vAma +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP bcax_m1 vAmo, vBmo, vBma, vBmu +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP eor2 C1, C1, vAme +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP bcax_m1 vAmu, vBmu, vBme, vBma +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP eor2 C2, C2, vAmi +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP bcax_m1 vAsa, vBsa, vBsi, vBse +eor sAga_, sE3, sAbo SEP +eor sAbo_, sE3, sAmo, ROR #37 SEP eor2 C3, C3, vAmo +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP bcax_m1 vAse, vBse, vBso, vBsi +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP eor2 C4, C4, vAmu +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP bcax_m1 vAsi, vBsi, vBsu, vBso +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP 
+load_constant_ptr SEP eor2 C0, C0, vAsa +restore count, STACK_OFFSET_COUNT SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP bcax_m1 vAso, vBso, vBsa, vBsu +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP eor2 C1, C1, vAse +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP bcax_m1 vAsu, vBsu, vBse, vBsa +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP eor2 C2, C2, vAsi +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP eor2 C3, C3, vAso +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP bcax_m1 vAba, vBba, vBbi, vBbe +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP bcax_m1 vAbe, vBbe, vBbo, vBbi +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP eor2 C1, C1, vAbe +eor sAme, tmp, sAme_, ROR #43 SEP restore x26, STACK_OFFSET_CONST +bic tmp, sAmu_, sAmo_, ROR #41 SEP ldr vvtmpq, [x26], #16 +eor sAmi, tmp, sAmi_, ROR #46 SEP save x26, STACK_OFFSET_CONST +bic tmp, sAma_, sAmu_, ROR #35 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP eor vAba.16b, vAba.16b, vvtmp.16b +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP eor2 C4, C4, vAsu +bic tmp, sAsi_, sAse_, ROR #48 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP bcax_m1 vAbi, vBbi, vBbu, vBbo +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP bcax_m1 vAbo, vBbo, vBba, vBbu +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP eor2 C3, C3, vAbo +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP eor2 
C2, C2, vAbi +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP eor2 C0, C0, vAba +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP bcax_m1 vAbu, vBbu, vBbe, vBba +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP eor2 C4, C4, vAbu +eor sAbu, tmp, sAbu_, ROR #30 SEP +add count, count, #1 SEP restore(vAga) +eor s_Aba, s_Aba, cur_const SEP + .unreq vvtmp + + .unreq vvtmpq +.endm + +.macro hybrid_round_noninitial + SEP vvtmp .req vBba +save count, STACK_OFFSET_COUNT SEP rax1_m1 E2, C1, C3 +eor sC0, sAka, sAsa, ROR #50 SEP +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP +eor sC0, sAma, sC0, ROR #49 SEP rax1_m1 E4, C3, C0 +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP rax1_m1 E1, C0, C2 +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP rax1_m1 E3, C2, C4 +eor sC3, sAbo, sC3, ROR #63 SEP +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP rax1_m1 E0, C4, C1 +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP .unreq vvtmp +eor sE2, sC1, sC3, ROR #63 SEP vvtmp .req C1 +eor sE4, sC3, sC0, ROR #63 SEP vvtmpq .req C1q +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP eor vBba.16b, vAba.16b, E0.16b +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP xar_m1 vBsa, vAbi, E2, 2 +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 
SEP +eor sAgo_, sE1, sAme, ROR #49 SEP xar_m1 vBbi, vAki, E2, 21 +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP xar_m1 vBki, vAko, E3, 39 +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP xar_m1 vBko, vAmu, E4, 56 +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP xar_m1 vBmu, vAso, E3, 8 +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP xar_m1 vBso, vAma, E0, 23 +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP xar_m1 vBka, vAbe, E1, 63 +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP xar_m1 vBse, vAgo, E3, 9 +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP xar_m1 vBgo, vAme, E1, 19 +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP +bic tmp, sAma_, sAmu_, ROR #35 SEP +ldr cur_const, [const_addr, count, UXTW #3] +add count, count, #1 SEP xar_m1 vBke, vAgi, E2, 58 +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, 
sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP xar_m1 vBgi, vAka, E0, 61 +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP xar_m1 vBga, vAbo, E3, 36 +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP +eor sAbe, tmp, sAbe_, ROR #41 SEP xar_m1 vBbo, vAmo, E3, 43 +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP xar_m1 vBmo, vAmi, E2, 49 +eor s_Aba, s_Aba, cur_const SEP +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP +eor sC3, sAgo, sAso, ROR #30 SEP xar_m1 vBmi, vAke, E1, 54 +eor sC4, sAbu, sAsu, ROR #53 SEP +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP +eor sC0, sAga, sC0, ROR #57 SEP xar_m1 vBge, vAgu, E4, 44 +eor sC1, sAme, sC1, ROR #58 SEP +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP +eor sC1, sAke, sC1, ROR #57 SEP mov E3.16b, vAga.16b +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP bcax_m1 vAga, vBga, vBgi, vBge +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP +ror sC2, sC2, 62 SEP xar_m1 vBgu, vAsi, E2, 3 +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP xar_m1 vBsi, vAku, E4, 25 +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, 
sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP xar_m1 vBku, vAsa, E0, 46 +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP xar_m1 vBma, vAbu, E4, 37 +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP xar_m1 vBbu, vAsu, E4, 50 +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP +eor sAsu_, sE1, sAse, ROR #23 SEP xar_m1 vBsu, vAse, E1, 62 +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP xar_m1 vBme, E3, E0, 28 +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP xar_m1 vBbe, vAge, E1, 20 +bic tmp, sAge_, sAga_, ROR #56 SEP +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP bcax_m1 vAge, vBge, vBgo, vBgi +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP bcax_m1 vAgi, vBgi, vBgu, vBgo +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP bcax_m1 vAgo, vBgo, vBga, vBgu +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP bcax_m1 vAgu, vBgu, vBge, vBga 
+eor sAmi, tmp, sAmi_, ROR #46 SEP +bic tmp, sAma_, sAmu_, ROR #35 SEP +ldr cur_const, [const_addr, count, UXTW #3] +add count, count, #1 SEP bcax_m1 vAka, vBka, vBki, vBke +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP bcax_m1 vAke, vBke, vBko, vBki +eor sAsa, tmp, sAsa_, ROR #41 SEP .unreq vvtmp +bic tmp, sAso_, sAsi_, ROR #2 SEP .unreq vvtmpq +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP eor2 C0, vAka, vAga +eor sAsi, tmp, sAsi_, ROR #27 SEP save(vAga) +bic tmp, sAsa_, sAsu_, ROR #60 SEP vvtmp .req vAga +eor sAso, tmp, sAso_, ROR #21 SEP vvtmpq .req vAgaq +bic tmp, sAse_, sAsa_, ROR #57 SEP bcax_m1 vAki, vBki, vBku, vBko +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP bcax_m1 vAko, vBko, vBka, vBku +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP eor2 C1, vAke, vAge +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP bcax_m1 vAku, vBku, vBke, vBka +eor sAbu, tmp, sAbu_, ROR #30 SEP +eor s_Aba, s_Aba, cur_const SEP + SEP +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP +eor sC1, sAse, sAge, ROR #60 SEP eor2 C2, vAki, vAgi +eor sC2, sAmi, sAgi, ROR #59 SEP +eor sC3, sAgo, sAso, ROR #30 SEP bcax_m1 vAma, vBma, vBmi, vBme +eor sC4, sAbu, sAsu, ROR #53 SEP +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP eor2 C3, vAko, vAgo +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP bcax_m1 vAme, vBme, vBmo, vBmi +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP eor2 C4, vAku, vAgu +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP bcax_m1 vAmi, vBmi, vBmu, vBmo +eor sC1, 
sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP +eor sC4, sAku, sC4, ROR #50 SEP eor2 C0, C0, vAma +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP bcax_m1 vAmo, vBmo, vBma, vBmu +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP eor2 C1, C1, vAme +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP bcax_m1 vAmu, vBmu, vBme, vBma +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP eor2 C2, C2, vAmi +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP bcax_m1 vAsa, vBsa, vBsi, vBse +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP eor2 C3, C3, vAmo +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP bcax_m1 vAse, vBse, vBso, vBsi +eor sAga_, sE3, sAbo SEP +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP eor2 C4, C4, vAmu +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP bcax_m1 vAsi, vBsi, vBsu, vBso +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP eor2 C0, C0, vAsa +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP bcax_m1 vAso, vBso, vBsa, vBsu +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP eor2 C1, C1, vAse +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP bcax_m1 vAsu, vBsu, vBse, vBsa +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP eor2 C2, C2, vAsi +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, 
sAke_, ROR #19 SEP eor2 C3, C3, vAso +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP bcax_m1 vAba, vBba, vBbi, vBbe +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP bcax_m1 vAbe, vBbe, vBbo, vBbi +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP eor2 C1, C1, vAbe +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP restore x26, STACK_OFFSET_CONST +eor sAme, tmp, sAme_, ROR #43 SEP ldr vvtmpq, [x26], #16 +bic tmp, sAmu_, sAmo_, ROR #41 SEP save x26, STACK_OFFSET_CONST +eor sAmi, tmp, sAmi_, ROR #46 SEP +bic tmp, sAma_, sAmu_, ROR #35 SEP eor vAba.16b, vAba.16b, vvtmp.16b +ldr cur_const, [const_addr, count, UXTW #3] +add count, count, #1 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP eor2 C4, C4, vAsu +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP bcax_m1 vAbi, vBbi, vBbu, vBbo +bic tmp, sAsi_, sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP bcax_m1 vAbo, vBbo, vBba, vBbu +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP eor2 C3, C3, vAbo +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP eor2 C2, C2, vAbi +bic tmp, sAbi_, sAbe_, ROR #63 SEP +eor s_Aba, s_Aba_, tmp, ROR #21 SEP eor2 C0, C0, vAba +bic tmp, sAbo_, sAbi_, ROR #42 SEP +eor sAbe, tmp, sAbe_, ROR #41 SEP bcax_m1 vAbu, vBbu, vBbe, vBba +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP eor2 C4, C4, vAbu +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP restore(vAga) +eor s_Aba, s_Aba, cur_const SEP .unreq vvtmp + .unreq vvtmpq + +.endm + +.macro hybrid_round_final + +save 
count, STACK_OFFSET_COUNT SEP vvtmp .req vBba +eor sC0, sAka, sAsa, ROR #50 SEP +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP rax1_m1 E2, C1, C3 +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 SEP rax1_m1 E4, C3, C0 +eor sC4, sAmu, sC4, ROR #56 SEP +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP rax1_m1 E1, C0, C2 +eor sC0, s_Aba, sC0, ROR #61 SEP +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP rax1_m1 E3, C2, C4 +ror sC4, sC4, 58 SEP +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP rax1_m1 E0, C4, C1 +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP .unreq vvtmp +eor sAbi_, sE2, sAki, ROR #46 SEP vvtmp .req C1 +eor sAki_, sE3, sAko, ROR #63 SEP vvtmpq .req C1q +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP eor vBba.16b, vAba.16b, E0.16b +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP xar_m1 vBsa, vAbi, E2, 2 +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP +eor sAbo_, sE3, sAmo, ROR #37 SEP xar_m1 vBbi, vAki, E2, 21 +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP xar_m1 vBki, vAko, E3, 39 +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 
SEP +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP xar_m1 vBko, vAmu, E4, 56 +restore count, STACK_OFFSET_COUNT SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP xar_m1 vBmu, vAso, E3, 8 +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP xar_m1 vBso, vAma, E0, 23 +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP xar_m1 vBka, vAbe, E1, 63 +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP xar_m1 vBse, vAgo, E3, 9 +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP +bic tmp, sAma_, sAmu_, ROR #35 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP +add count, count, #1 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP xar_m1 vBgo, vAme, E1, 19 +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP xar_m1 vBke, vAgi, E2, 58 +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP xar_m1 vBgi, vAka, E0, 61 +bic tmp, sAbi_, sAbe_, ROR #63 SEP +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, 
ROR #35 SEP xar_m1 vBga, vAbo, E3, 36 +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP +eor s_Aba, s_Aba, cur_const SEP + SEP xar_m1 vBbo, vAmo, E3, 43 +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP +eor sC0, sAma, sC0, ROR #49 SEP xar_m1 vBmo, vAmi, E2, 49 +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP +eor sC2, sAbi, sC2, ROR #60 SEP xar_m1 vBmi, vAke, E1, 54 +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP xar_m1 vBge, vAgu, E4, 44 +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP mov E3.16b, vAga.16b +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP bcax_m1 vAga, vBga, vBgi, vBge +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP xar_m1 vBgu, vAsi, E2, 3 +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP xar_m1 vBsi, vAku, E4, 25 +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP xar_m1 vBku, vAsa, E0, 46 +eor 
sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP xar_m1 vBma, vAbu, E4, 37 +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP xar_m1 vBbu, vAsu, E4, 50 +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP +eor sAgu, tmp, sAgu_, ROR #23 SEP xar_m1 vBsu, vAse, E1, 62 +bic tmp, sAki_, sAke_, ROR #19 SEP +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP xar_m1 vBme, E3, E0, 28 +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP xar_m1 vBbe, vAge, E1, 20 +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP +bic tmp, sAma_, sAmu_, ROR #35 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP bcax_m1 vAge, vBge, vBgo, vBgi +add count, count, #1 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP bcax_m1 vAgi, vBgi, vBgu, vBgo +bic tmp, sAsi_, sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP bcax_m1 vAgo, vBgo, vBga, vBgu +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP bcax_m1 vAgu, vBgu, vBge, vBga +bic tmp, sAse_, sAsa_, ROR 
#57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP +eor s_Aba, s_Aba_, tmp, ROR #21 SEP bcax_m1 vAka, vBka, vBki, vBke +bic tmp, sAbo_, sAbi_, ROR #42 SEP +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP bcax_m1 vAke, vBke, vBko, vBki +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP bcax_m1 vAki, vBki, vBku, vBko +eor s_Aba, s_Aba, cur_const SEP +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP +eor sC1, sAse, sAge, ROR #60 SEP bcax_m1 vAko, vBko, vBka, vBku +eor sC2, sAmi, sAgi, ROR #59 SEP +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP +eor sC0, sAma, sC0, ROR #49 SEP bcax_m1 vAku, vBku, vBke, vBka +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP bcax_m1 vAma, vBma, vBmi, vBme +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP bcax_m1 vAme, vBme, vBmo, vBmi +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP bcax_m1 vAmi, vBmi, vBmu, vBmo +eor sC3, sAbo, sC3, ROR #63 SEP +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP bcax_m1 vAmo, vBmo, vBma, vBmu +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP bcax_m1 vAmu, vBmu, vBme, vBma +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP bcax_m1 vAsa, vBsa, vBsi, vBse +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP bcax_m1 vAse, vBse, vBso, vBsi +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR 
#43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP bcax_m1 vAsi, vBsi, vBsu, vBso +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP +eor sAbo_, sE3, sAmo, ROR #37 SEP bcax_m1 vAso, vBso, vBsa, vBsu +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP bcax_m1 vAsu, vBsu, vBse, vBsa +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP bcax_m1 vAba, vBba, vBbi, vBbe +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP bcax_m1 vAbe, vBbe, vBbo, vBbi +restore count, STACK_OFFSET_COUNT SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP bcax_m1 vAbi, vBbi, vBbu, vBbo +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP bcax_m1 vAbo, vBbo, vBba, vBbu +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP bcax_m1 vAbu, vBbu, vBbe, vBba +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP restore x26, STACK_OFFSET_CONST +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP ldr vvtmpq, [x26], #16 +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP save x26, STACK_OFFSET_CONST +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP eor vAba.16b, vAba.16b, vvtmp.16b +bic tmp, sAma_, sAmu_, ROR #35 SEP +ldr cur_const, 
[const_addr, count, UXTW #3] SEP +add count, count, #1 SEP .unreq vvtmp +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP .unreq vvtmpq +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP +eor s_Aba, s_Aba, cur_const SEP +ror sAga, sAga,(64-3) SEP +ror sAka, sAka,(64-25) SEP +ror sAma, sAma,(64-10) SEP +ror sAsa, sAsa,(64-39) SEP +ror sAbe, sAbe,(64-21) SEP +ror sAge, sAge,(64-45) SEP +ror sAke, sAke,(64-8) SEP +ror sAme, sAme,(64-15) SEP +ror sAse, sAse,(64-41) SEP +ror sAbi, sAbi,(64-14) SEP +ror sAgi, sAgi,(64-61) SEP +ror sAki, sAki,(64-18) SEP +ror sAmi, sAmi,(64-56) SEP +ror sAsi, sAsi,(64-2) SEP +ror sAgo, sAgo,(64-28) SEP +ror sAko, sAko,(64-1) SEP +ror sAmo, sAmo,(64-27) SEP +ror sAso, sAso,(64-62) SEP +ror sAbu, sAbu,(64-44) SEP +ror sAgu, sAgu,(64-20) SEP +ror sAku, sAku,(64-6) SEP +ror sAmu, sAmu,(64-36) SEP +ror sAsu, sAsu,(64-55) SEP +.endm + + +#define KECCAK_F1600_ROUNDS 24 /* round count; inner_loop below compares count against KECCAK_F1600_ROUNDS-6 */ + +.global keccak_f1600_x5_hybrid_asm_v8 +.global _keccak_f1600_x5_hybrid_asm_v8 /* second export of the same entry point, with a leading underscore */ +.text +.align 4 + /* keccak_f1600_x5_hybrid_asm_v8: on entry input_addr holds the buffer address. load_input_vector is issued at that address; outer_loop then handles three consecutive blocks of 8*25 bytes that start 2*8*25 bytes into the buffer, using load_input_scalar and store_input_scalar; store_input_vector finally writes back at the original address. */ +keccak_f1600_x5_hybrid_asm_v8: +_keccak_f1600_x5_hybrid_asm_v8: + alloc_stack + save_gprs + save_vregs + + save input_addr, STACK_OFFSET_INPUT /* original buffer address, reloaded for store_input_vector at the end */ + + ASM_LOAD(const_addr,round_constants_vec) + save const_addr, STACK_OFFSET_CONST /* the round macro that ends above reloads this slot into x26, loads 16 bytes with post-increment and saves it back */ + + load_input_vector + /* the add below steps over the first 2*8*25 bytes; the blocks handled by outer_loop follow them */ + add input_addr, input_addr,
#(2*8*25) + save input_addr, STACK_OFFSET_CUR_INPUT + + mov out_count, #0 +outer_loop: /* one pass per 8*25-byte block; runs for out_count = 0, 1, 2 */ + save out_count, STACK_OFFSET_COUNT_OUT /* spilled across the rounds, reloaded before the increment below */ + + load_input_scalar + save input_addr, STACK_OFFSET_CUR_INPUT /* address of the block just loaded; restored before store_input_scalar */ + + hybrid_round_initial +inner_loop: /* NOTE(review): count is advanced inside the hybrid_round_* macros (e.g. add count, count, #1 in the macro ending above); confirm how many rounds each macro covers */ + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-6) + ble inner_loop /* repeat hybrid_round_noninitial while count is at most KECCAK_F1600_ROUNDS-6 (signed compare) */ + hybrid_round_final + + restore input_addr, STACK_OFFSET_CUR_INPUT + store_input_scalar /* written back to the address the block was loaded from */ + add input_addr, input_addr, #(8*25) /* advance to the next 8*25-byte block */ + + restore out_count, STACK_OFFSET_COUNT_OUT + add out_count, out_count, #1 + cmp out_count, #3 + blt outer_loop + + restore input_addr, STACK_OFFSET_INPUT + store_input_vector /* issued at the original buffer address */ + + restore_vregs + restore_gprs + free_stack + + ret diff --git a/asm/manual/keccak_f1600/keccak_f1600_x5_hybrid_asm_v8p.s b/asm/manual/keccak_f1600/keccak_f1600_x5_hybrid_asm_v8p.s new file mode 100644 index 0000000..c904df4 --- /dev/null +++ b/asm/manual/keccak_f1600/keccak_f1600_x5_hybrid_asm_v8p.s @@ -0,0 +1,1306 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: /* 24 64-bit round constants, one .quad each; NOTE(review): presumably the table load_constant_ptr points const_addr at - confirm */ + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +round_constants_vec: /* the same 24 constants, each emitted twice in a row: one 16-byte entry per constant, matching the 16-byte ldr vvtmpq, [x26], #16 in hybrid_round_initial */ + .quad 0x0000000000000001 + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad
0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + .quad 0x8000000080008008 +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x29 + count .req w27 + out_count .req w27 /* same register as count; STACK_OFFSET_COUNT and STACK_OFFSET_COUNT_OUT give each its own stack slot */ + cur_const .req x26 + + /* Mapping of Keccak-f1600 SIMD state to vector registers + * at the beginning and end of each round. */ + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. */ + vAba .req v0 + vAbe .req v1 + vAbi .req v2 + vAbo .req v3 + vAbu .req v4 + vAga .req v5 + vAge .req v6 + vAgi .req v7 + vAgo .req v8 + vAgu .req v9 + vAka .req v10 + vAke .req v11 + vAki .req v12 + vAko .req v13 + vAku .req v14 + vAma .req v15 + vAme .req v16 + vAmi .req v17 + vAmo .req v18 + vAmu .req v19 + vAsa .req v20 + vAse .req v21 + vAsi .req v22 + vAso .req v23 + vAsu .req v24 + + /* q-form of the above mapping */ + vAbaq .req q0 + vAbeq .req q1 + vAbiq .req q2 + vAboq .req q3 + vAbuq .req q4 + vAgaq .req q5 + vAgeq .req q6 + vAgiq .req q7 + vAgoq .req q8 + vAguq .req q9 + vAkaq .req q10 + vAkeq .req q11 + vAkiq .req q12 + vAkoq .req q13 + vAkuq .req q14 + vAmaq .req q15 + vAmeq .req q16 + vAmiq .req q17 + vAmoq .req q18 + vAmuq .req q19 + vAsaq .req q20 + vAseq .req q21 + vAsiq .req q22 + vAsoq .req q23 + vAsuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v27 + C1 .req v28 + C2 .req v29 + C3 .req v30 + C4 .req v31 + + C0q .req q27 + C1q .req q28 + C2q .req q29 + C3q .req q30 + C4q .req q31 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vBba .req v25 // fresh + vBbe .req v26 // fresh + vBbi
.req vAbi + vBbo .req vAbo + vBbu .req vAbu + vBga .req vAka + vBge .req vAke + vBgi .req vAgi + vBgo .req vAgo + vBgu .req vAgu + vBka .req vAma + vBke .req vAme + vBki .req vAki + vBko .req vAko + vBku .req vAku + vBma .req vAsa + vBme .req vAse + vBmi .req vAmi + vBmo .req vAmo + vBmu .req vAmu + vBsa .req vAba + vBse .req vAbe + vBsi .req vAsi + vBso .req vAso + vBsu .req vAsu + /* q-form of the vB* aliases above */ + vBbaq .req q25 // fresh + vBbeq .req q26 // fresh + vBbiq .req vAbiq + vBboq .req vAboq + vBbuq .req vAbuq + vBgaq .req vAkaq + vBgeq .req vAkeq + vBgiq .req vAgiq + vBgoq .req vAgoq + vBguq .req vAguq + vBkaq .req vAmaq + vBkeq .req vAmeq + vBkiq .req vAkiq + vBkoq .req vAkoq + vBkuq .req vAkuq + vBmaq .req vAsaq + vBmeq .req vAseq + vBmiq .req vAmiq + vBmoq .req vAmoq + vBmuq .req vAmuq + vBsaq .req vAbaq + vBseq .req vAbeq + vBsiq .req vAsiq + vBsoq .req vAsoq + vBsuq .req vAsuq + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req C4 + E1 .req C0 + E2 .req vBbe // fresh + E3 .req C2 + E4 .req C3 + /* q-form of E[x] */ + E0q .req C4q + E1q .req C0q + E2q .req vBbeq // fresh + E3q .req C2q + E4q .req C3q + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round.
*/ + s_Aba .req x1 + sAbe .req x6 + sAbi .req x11 + sAbo .req x16 + sAbu .req x21 + sAga .req x2 + sAge .req x7 + sAgi .req x12 + sAgo .req x17 + sAgu .req x22 + sAka .req x3 + sAke .req x8 + sAki .req x13 + sAko .req x18 + sAku .req x23 + sAma .req x4 + sAme .req x9 + sAmi .req x14 + sAmo .req x19 + sAmu .req x24 + sAsa .req x5 + sAse .req x10 + sAsi .req x15 + sAso .req x20 + sAsu .req x25 + + /* sA_[y,2*x+3*y] = rot(A[x,y]) */ + s_Aba_ .req x0 + sAbe_ .req x28 + sAbi_ .req x11 + sAbo_ .req x16 + sAbu_ .req x21 + sAga_ .req x3 + sAge_ .req x8 + sAgi_ .req x12 + sAgo_ .req x17 + sAgu_ .req x22 + sAka_ .req x4 + sAke_ .req x9 + sAki_ .req x13 + sAko_ .req x18 + sAku_ .req x23 + sAma_ .req x5 + sAme_ .req x10 + sAmi_ .req x14 + sAmo_ .req x19 + sAmu_ .req x24 + sAsa_ .req x1 + sAse_ .req x6 + sAsi_ .req x15 + sAso_ .req x20 + sAsu_ .req x25 + + /* sC[x] = sA[x,0] xor sA[x,1] xor sA[x,2] xor sA[x,3] xor sA[x,4], for x in 0..4 */ + /* sE[x] = sC[x-1] xor rot(sC[x+1],1), for x in 0..4 */ + sC0 .req x0 + sE0 .req x29 + sC1 .req x26 + sE1 .req x30 + sC2 .req x27 + sE2 .req x26 + sC3 .req x28 + sE3 .req x27 + sC4 .req x29 + sE4 .req x28 + + tmp .req x30 /* same register as sE1 */ + +/************************ MACROS ****************************/ + /* eor2: d = s0 xor s1 over the full 128-bit registers */ +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + /* eor3_m1: d = s0 xor s1 xor s2, built from two eor2; the same operation as the SHA3-extension EOR3 instruction */ +.macro eor3_m1 d s0 s1 s2 + eor2 \d, \s0, \s1 + eor2 \d, \d, \s2 +.endm + /* rax1_m1: d = s0 xor rol64(s1, 1) in each 64-bit lane (shl by 1, then sri by 63 fills in the wrapped bit). Overwrites vvtmp, which the caller must have bound with .req */ +.macro rax1_m1 d s0 s1 + shl vvtmp.2d, \s1\().2d, #1 + sri vvtmp.2d, \s1\().2d, #63 + eor \d\().16b, vvtmp.16b, \s0\().16b +.endm + /* xar_m1: first s0 = s0 xor s1 (s0 is overwritten), then d = ror64(s0, imm) in each 64-bit lane. d is written before s0 is read for the last time, so d must not be the same register as s0 */ + .macro xar_m1 d s0 s1 imm + // Special cases where we can replace SHLs by ADDs + .if \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + /* bcax_m1: d = s0 xor (s1 and not s2). Overwrites vvtmp, which the caller must have bound with .req */ +.macro bcax_m1 d s0 s1 s2 + bic vvtmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, vvtmp.16b, \s0\().16b +.endm + /* load_input_vector: fills vAbaq..vAsuq from 25 consecutive 16-byte slots at input_addr */ +.macro load_input_vector
+ ldp vAbaq, vAbeq, [input_addr, #(16*0)] + ldp vAbiq, vAboq, [input_addr, #(16*2)] + ldp vAbuq, vAgaq, [input_addr, #(16*4)] + ldp vAgeq, vAgiq, [input_addr, #(16*6)] + ldp vAgoq, vAguq, [input_addr, #(16*8)] + ldp vAkaq, vAkeq, [input_addr, #(16*10)] + ldp vAkiq, vAkoq, [input_addr, #(16*12)] + ldp vAkuq, vAmaq, [input_addr, #(16*14)] + ldp vAmeq, vAmiq, [input_addr, #(16*16)] + ldp vAmoq, vAmuq, [input_addr, #(16*18)] + ldp vAsaq, vAseq, [input_addr, #(16*20)] + ldp vAsiq, vAsoq, [input_addr, #(16*22)] + ldr vAsuq, [input_addr, #(16*24)] + /* One-register-per-load form of the same 25 loads, left commented out: */ + // ldr vAbaq, [input_addr, #(16*0)] + // ldr vAbeq, [input_addr, #(16*1)] + // ldr vAbiq, [input_addr, #(16*2)] + // ldr vAboq, [input_addr, #(16*3)] + // ldr vAbuq, [input_addr, #(16*4)] + // ldr vAgaq, [input_addr, #(16*5)] + // ldr vAgeq, [input_addr, #(16*6)] + // ldr vAgiq, [input_addr, #(16*7)] + // ldr vAgoq, [input_addr, #(16*8)] + // ldr vAguq, [input_addr, #(16*9)] + // ldr vAkaq, [input_addr, #(16*10)] + // ldr vAkeq, [input_addr, #(16*11)] + // ldr vAkiq, [input_addr, #(16*12)] + // ldr vAkoq, [input_addr, #(16*13)] + // ldr vAkuq, [input_addr, #(16*14)] + // ldr vAmaq, [input_addr, #(16*15)] + // ldr vAmeq, [input_addr, #(16*16)] + // ldr vAmiq, [input_addr, #(16*17)] + // ldr vAmoq, [input_addr, #(16*18)] + // ldr vAmuq, [input_addr, #(16*19)] + // ldr vAsaq, [input_addr, #(16*20)] + // ldr vAseq, [input_addr, #(16*21)] + // ldr vAsiq, [input_addr, #(16*22)] + // ldr vAsoq, [input_addr, #(16*23)] + // ldr vAsuq, [input_addr, #(16*24)] +.endm + /* store_input_vector: writes vAbaq..vAsuq to 25 consecutive 16-byte slots at input_addr, in the same order load_input_vector reads them */ +.macro store_input_vector + stp vAbaq, vAbeq, [input_addr, #(16*0)] + stp vAbiq, vAboq, [input_addr, #(16*2)] + stp vAbuq, vAgaq, [input_addr, #(16*4)] + stp vAgeq, vAgiq, [input_addr, #(16*6)] + stp vAgoq, vAguq, [input_addr, #(16*8)] + stp vAkaq, vAkeq, [input_addr, #(16*10)] + stp vAkiq, vAkoq, [input_addr, #(16*12)] + stp vAkuq, vAmaq, [input_addr, #(16*14)] + stp vAmeq, vAmiq, [input_addr, #(16*16)] + stp vAmoq, vAmuq, [input_addr, #(16*18)] + stp vAsaq, vAseq,
[input_addr, #(16*20)] + stp vAsiq, vAsoq, [input_addr, #(16*22)] + str vAsuq, [input_addr, #(16*24)] + /* One-register-per-store form of the same 25 stores, left commented out: */ + // str vAbaq, [input_addr, #(16*0)] + // str vAbeq, [input_addr, #(16*1)] + // str vAbiq, [input_addr, #(16*2)] + // str vAboq, [input_addr, #(16*3)] + // str vAbuq, [input_addr, #(16*4)] + // str vAgaq, [input_addr, #(16*5)] + // str vAgeq, [input_addr, #(16*6)] + // str vAgiq, [input_addr, #(16*7)] + // str vAgoq, [input_addr, #(16*8)] + // str vAguq, [input_addr, #(16*9)] + // str vAkaq, [input_addr, #(16*10)] + // str vAkeq, [input_addr, #(16*11)] + // str vAkiq, [input_addr, #(16*12)] + // str vAkoq, [input_addr, #(16*13)] + // str vAkuq, [input_addr, #(16*14)] + // str vAmaq, [input_addr, #(16*15)] + // str vAmeq, [input_addr, #(16*16)] + // str vAmiq, [input_addr, #(16*17)] + // str vAmoq, [input_addr, #(16*18)] + // str vAmuq, [input_addr, #(16*19)] + // str vAsaq, [input_addr, #(16*20)] + // str vAseq, [input_addr, #(16*21)] + // str vAsiq, [input_addr, #(16*22)] + // str vAsoq, [input_addr, #(16*23)] + // str vAsuq, [input_addr, #(16*24)] +.endm + /* load_input_scalar: fills s_Aba..sAsu from 25 consecutive 8-byte slots at input_addr */ +.macro load_input_scalar + ldp s_Aba, sAbe, [input_addr,8*0 ] + ldp sAbi, sAbo, [input_addr,8*2 ] + ldp sAbu, sAga, [input_addr,8*4 ] + ldp sAge, sAgi, [input_addr,8*6 ] + ldp sAgo, sAgu, [input_addr,8*8 ] + ldp sAka, sAke, [input_addr,8*10] + ldp sAki, sAko, [input_addr,8*12] + ldp sAku, sAma, [input_addr,8*14] + ldp sAme, sAmi, [input_addr,8*16] + ldp sAmo, sAmu, [input_addr,8*18] + ldp sAsa, sAse, [input_addr,8*20] + ldp sAsi, sAso, [input_addr,8*22] + ldr sAsu, [input_addr,8*24] +.endm + /* store_input_scalar: writes s_Aba..sAsu to 25 consecutive 8-byte slots at input_addr, in the same order load_input_scalar reads them */ +.macro store_input_scalar + stp s_Aba, sAbe, [input_addr,8*0 ] + stp sAbi, sAbo, [input_addr,8*2 ] + stp sAbu, sAga, [input_addr,8*4 ] + stp sAge, sAgi, [input_addr,8*6 ] + stp sAgo, sAgu, [input_addr,8*8 ] + stp sAka, sAke, [input_addr,8*10] + stp sAki, sAko, [input_addr,8*12] + stp sAku, sAma, [input_addr,8*14] + stp sAme, sAmi, [input_addr,8*16] + stp sAmo, sAmu, [input_addr,8*18] + stp sAsa, sAse, [input_addr,8*20]
+ stp sAsi, sAso, [input_addr,8*22] + str sAsu, [input_addr,8*24] +.endm + /* Stack frame layout, as byte offsets from sp after alloc_stack */ + +#define STACK_SIZE (4*16 + 12*8 + 6*8 + 16*1) /* 224 bytes in total: the four areas below, in this order */ +#define STACK_BASE_VREGS (0) /* 4*16 bytes: d8-d15, written by save_vregs */ +#define STACK_BASE_GPRS (4*16) /* 12*8 bytes: x19-x30, written by save_gprs */ +#define STACK_BASE_TMP_GPRS (4*16 + 12*8) /* 6*8 bytes: the slots named by the STACK_OFFSET_* values */ +#define STACK_BASE_TMP_VREGS (4*16 + 12*8 + 6*8) /* 16*1 bytes: one q-register slot used by save(name) and restore(name) */ +#define STACK_OFFSET_INPUT (0*8) /* STACK_OFFSET_*: byte offsets inside the STACK_BASE_TMP_GPRS area; five of its six 8-byte slots are named */ +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) +#define STACK_OFFSET_COUNT_OUT (3*8) +#define STACK_OFFSET_CUR_INPUT (4*8) + +#define vAga_offset 0 /* index of the 16-byte slot for vAga inside the STACK_BASE_TMP_VREGS area */ + /* save(name) and restore(name): preprocessor macros that store or load the q-form of a vector alias (name pasted with q) at slot name_offset. Only the spelling with parentheses expands to these; save and restore written with two comma-separated operands are not touched by the preprocessor. */ +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP_VREGS + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP_VREGS + 16 * name ## _offset)] + /* save_gprs and restore_gprs: spill and reload x19-x30 (the callee-saved general-purpose registers under AAPCS64, including frame pointer and link register) */ + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + /* save_vregs and restore_vregs: spill and reload d8-d15, i.e. only the low 64 bits of v8-v15 (the part AAPCS64 requires a callee to preserve) */ +.macro save_vregs + stp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] + stp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + stp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + stp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] +.endm + +.macro restore_vregs + ldp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] + ldp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + ldp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + ldp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] +.endm + /* alloc_stack and free_stack: move sp down and back up by STACK_SIZE */ +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + /* eor5: dst = src0 xor src1 xor src2 xor src3 xor src4; operands are passed to eor exactly as written */ +.macro eor5 dst, src0, src1, src2, src3, src4 + eor \dst, \src0, \src1 + eor \dst, \dst, \src2 + eor \dst, \dst, \src3 + eor \dst, \dst, \src4 +.endm +
+.macro xor_rol dst, src1, src0, imm + eor \dst, \src0, \src1, ROR #(64-\imm) /* dst = src0 xor rol64(src1, imm); the shift amount is written as 64-imm, so imm = 0 would ask for ROR #64 */ +.endm + +.macro bic_rol dst, src1, src0, imm + bic \dst, \src0, \src1, ROR #(64-\imm) /* dst = src0 and not rol64(src1, imm) */ +.endm + +.macro rotate dst, src, imm + ror \dst, \src, #(64-\imm) /* dst = rol64(src, imm), expressed as a right rotation by 64-imm */ +.endm + /* save reg, offset and restore reg, offset: store or load one general-purpose register at sp + STACK_BASE_TMP_GPRS + offset. These are assembler macros taking two operands; the preprocessor macros of the same names are only expanded when written with parentheses. */ +.macro save reg, offset + str \reg, [sp, #(STACK_BASE_TMP_GPRS + \offset)] +.endm + +.macro restore reg, offset + ldr \reg, [sp, #(STACK_BASE_TMP_GPRS + \offset)] +.endm + /* hybrid_round_initial: each line holds one scalar instruction and, after SEP, optionally one vector instruction. Over the whole macro the scalar side xors cur_const into s_Aba three times and leaves count at 3 (mov count, #1 followed by two add count, count, #1); the vector side xors one 16-byte round-constant entry into vAba. vvtmp and vvtmpq are bound and released with .req and .unreq inside the body. */ +.macro hybrid_round_initial +eor sC0, sAma, sAsa SEP +eor sC1, sAme, sAse SEP eor3_m1 C1,vAbe,vAge,vAke +eor sC2, sAmi, sAsi SEP +eor sC3, sAmo, sAso SEP +eor sC4, sAmu, sAsu SEP +eor sC0, sAka, sC0 SEP eor3_m1 C3,vAbo,vAgo,vAko +eor sC1, sAke, sC1 SEP +eor sC2, sAki, sC2 SEP +eor sC3, sAko, sC3 SEP +eor sC4, sAku, sC4 SEP eor3_m1 C0,vAba,vAga,vAka +eor sC0, sAga, sC0 SEP +eor sC1, sAge, sC1 SEP +eor sC2, sAgi, sC2 SEP +eor sC3, sAgo, sC3 SEP eor3_m1 C2,vAbi,vAgi,vAki +eor sC4, sAgu, sC4 SEP +eor sC0, s_Aba, sC0 SEP +eor sC1, sAbe, sC1 SEP +eor sC2, sAbi, sC2 SEP eor3_m1 C4,vAbu,vAgu,vAku +eor sC3, sAbo, sC3 SEP +eor sC4, sAbu, sC4 SEP +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP eor3_m1 C1, C1,vAme, vAse +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, s_Aba, sE0 SEP eor3_m1 C3, C3,vAmo, vAso +eor sAsa_, sAbi, sE2 SEP +eor sAbi_, sAki, sE2 SEP +eor sAki_, sAko, sE3 SEP +eor sAko_, sAmu, sE4 SEP eor3_m1 C0, C0,vAma, vAsa +eor sAmu_, sAso, sE3 SEP +eor sAso_, sAma, sE0 SEP +eor sAka_, sAbe, sE1 SEP +eor sAse_, sAgo, sE3 SEP eor3_m1 C2, C2,vAmi, vAsi +eor sAgo_, sAme, sE1 SEP +eor sAke_, sAgi, sE2 SEP +eor sAgi_, sAka, sE0 SEP +eor sAga_, sAbo, sE3 SEP eor3_m1 C4, C4,vAmu, vAsu +eor sAbo_, sAmo, sE3 SEP +eor sAmo_, sAmi, sE2 SEP vvtmp .req vBba +eor sAmi_, sAke, sE1 SEP +eor sAge_, sAgu, sE4 SEP rax1_m1 E2, C1, C3 +eor sAgu_, sAsi, sE2 SEP +eor sAsi_, sAku, sE4 SEP +eor sAku_, sAsa, sE0 SEP +eor sAma_, sAbu, sE4 SEP rax1_m1 E4, C3, C0 +eor sAbu_, sAsu, sE4 SEP +eor sAsu_, sAse, sE1
SEP +eor sAme_, sAga, sE0 SEP +eor sAbe_, sAge, sE1 SEP rax1_m1 E1, C0, C2 +load_constant_ptr SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP rax1_m1 E3, C2, C4 +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP rax1_m1 E0, C4, C1 +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP .unreq vvtmp +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP vvtmp .req C1 +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP vvtmpq .req C1q +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP eor vBba.16b, vAba.16b, E0.16b +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP xar_m1 vBsa, vAbi, E2, 2 +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP xar_m1 vBbi, vAki, E2, 21 +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP xar_m1 vBki, vAko, E3, 39 +eor sAmi, tmp, sAmi_, ROR #46 SEP +ldr cur_const, [const_addr] SEP +mov count, #1 SEP +bic tmp, sAma_, sAmu_, ROR #35 SEP xar_m1 vBko, vAmu, E4, 56 +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP xar_m1 vBmu, vAso, E3, 8 +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP xar_m1 vBso, vAma, E0, 23 +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP xar_m1 vBka, vAbe, E1, 63 +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP xar_m1 vBse, vAgo, E3, 9 +eor 
sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP xar_m1 vBgo, vAme, E1, 19 +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP +eor s_Aba, s_Aba, cur_const SEP xar_m1 vBke, vAgi, E2, 58 +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP xar_m1 vBgi, vAka, E0, 61 +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP xar_m1 vBga, vAbo, E3, 36 +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP +eor sC0, sAga, sC0, ROR #57 SEP xar_m1 vBbo, vAmo, E3, 43 +eor sC1, sAme, sC1, ROR #58 SEP +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP xar_m1 vBmo, vAmi, E2, 49 +eor sC0, s_Aba, sC0, ROR #61 SEP +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP +eor sC4, sAku, sC4, ROR #50 SEP xar_m1 vBmi, vAke, E1, 54 +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP xar_m1 vBge, vAgu, E4, 44 +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP mov E3.16b, vAga.16b +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP bcax_m1 vAga, vBga, vBgi, vBge +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP xar_m1 vBgu, vAsi, E2, 3 +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP xar_m1 vBsi, vAku, E4, 25 +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP xar_m1 vBku, vAsa, E0, 46 +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP +eor sAbo_, sE3, 
sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP xar_m1 vBma, vAbu, E4, 37 +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP xar_m1 vBbu, vAsu, E4, 50 +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP xar_m1 vBsu, vAse, E1, 62 +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP +eor sAgu, tmp, sAgu_, ROR #23 SEP xar_m1 vBme, E3, E0, 28 +bic tmp, sAki_, sAke_, ROR #19 SEP +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP xar_m1 vBbe, vAge, E1, 20 +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP bcax_m1 vAge, vBge, vBgo, vBgi +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP bcax_m1 vAgi, vBgi, vBgu, vBgo +bic tmp, sAma_, sAmu_, ROR #35 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP bcax_m1 vAgo, vBgo, vBga, vBgu +bic tmp, sAsi_, sAse_, ROR #48 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP bcax_m1 vAgu, vBgu, vBge, vBga +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 
SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP bcax_m1 vAka, vBka, vBki, vBke +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m1 vAke, vBke, vBko, vBki +eor s_Aba, s_Aba_, tmp, ROR #21 SEP .unreq vvtmp +bic tmp, sAbo_, sAbi_, ROR #42 SEP +eor sAbe, tmp, sAbe_, ROR #41 SEP .unreq vvtmpq +bic tmp, sAbu_, sAbo_, ROR #57 SEP eor2 C0, vAka, vAga +eor sAbi, tmp, sAbi_, ROR #35 SEP vvtmp .req vAga +bic tmp, s_Aba_, sAbu_, ROR #50 SEP save(vAga) +eor sAbo, tmp, sAbo_, ROR #43 SEP vvtmpq .req vAgaq +bic tmp, sAbe_, s_Aba_, ROR #44 SEP bcax_m1 vAki, vBki, vBku, vBko +eor sAbu, tmp, sAbu_, ROR #30 SEP +add count, count, #1 SEP +eor s_Aba, s_Aba, cur_const SEP + SEP +save count, STACK_OFFSET_COUNT SEP bcax_m1 vAko, vBko, vBka, vBku +eor sC0, sAka, sAsa, ROR #50 SEP +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP +eor sC3, sAgo, sAso, ROR #30 SEP eor2 C1, vAke, vAge +eor sC4, sAbu, sAsu, ROR #53 SEP +eor sC0, sAma, sC0, ROR #49 SEP bcax_m1 vAku, vBku, vBke, vBka +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP eor2 C2, vAki, vAgi +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP bcax_m1 vAma, vBma, vBmi, vBme +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP eor2 C3, vAko, vAgo +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP bcax_m1 vAme, vBme, vBmo, vBmi +eor sC3, sAbo, sC3, ROR #63 SEP +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP eor2 C4, vAku, vAgu +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP bcax_m1 vAmi, vBmi, vBmu, vBmo +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP eor2 C0, C0, vAma +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP 
bcax_m1 vAmo, vBmo, vBma, vBmu +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP eor2 C1, C1, vAme +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP bcax_m1 vAmu, vBmu, vBme, vBma +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP eor2 C2, C2, vAmi +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP bcax_m1 vAsa, vBsa, vBsi, vBse +eor sAga_, sE3, sAbo SEP +eor sAbo_, sE3, sAmo, ROR #37 SEP eor2 C3, C3, vAmo +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP bcax_m1 vAse, vBse, vBso, vBsi +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP eor2 C4, C4, vAmu +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP bcax_m1 vAsi, vBsi, vBsu, vBso +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP eor2 C0, C0, vAsa +restore count, STACK_OFFSET_COUNT SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP bcax_m1 vAso, vBso, vBsa, vBsu +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP eor2 C1, C1, vAse +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP bcax_m1 vAsu, vBsu, vBse, vBsa +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP eor2 C2, C2, vAsi +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP eor2 C3, C3, vAso +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP bcax_m1 vAba, vBba, vBbi, vBbe +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP bcax_m1 vAbe, vBbe, vBbo, vBbi +eor sAku, 
tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP eor2 C1, C1, vAbe +eor sAme, tmp, sAme_, ROR #43 SEP restore x26, STACK_OFFSET_CONST +bic tmp, sAmu_, sAmo_, ROR #41 SEP ldr vvtmpq, [x26], #16 +eor sAmi, tmp, sAmi_, ROR #46 SEP save x26, STACK_OFFSET_CONST +bic tmp, sAma_, sAmu_, ROR #35 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP eor vAba.16b, vAba.16b, vvtmp.16b +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP eor2 C4, C4, vAsu +bic tmp, sAsi_, sAse_, ROR #48 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP bcax_m1 vAbi, vBbi, vBbu, vBbo +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP bcax_m1 vAbo, vBbo, vBba, vBbu +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP eor2 C3, C3, vAbo +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP eor2 C2, C2, vAbi +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP eor2 C0, C0, vAba +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP bcax_m1 vAbu, vBbu, vBbe, vBba +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP eor2 C4, C4, vAbu +eor sAbu, tmp, sAbu_, ROR #30 SEP +add count, count, #1 SEP restore(vAga) +eor s_Aba, s_Aba, cur_const SEP + .unreq vvtmp + + .unreq vvtmpq +.endm + +.macro hybrid_round_noninitial + SEP vvtmp .req vBba +save count, STACK_OFFSET_COUNT SEP rax1_m1 E2, C1, C3 +eor sC0, sAka, sAsa, ROR #50 SEP +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP +eor sC0, sAma, sC0, ROR #49 SEP rax1_m1 E4, C3, C0 +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 
SEP +eor sC4, sAmu, sC4, ROR #56 SEP +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP rax1_m1 E1, C0, C2 +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP rax1_m1 E3, C2, C4 +eor sC3, sAbo, sC3, ROR #63 SEP +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP rax1_m1 E0, C4, C1 +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP .unreq vvtmp +eor sE2, sC1, sC3, ROR #63 SEP vvtmp .req C1 +eor sE4, sC3, sC0, ROR #63 SEP vvtmpq .req C1q +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP eor vBba.16b, vAba.16b, E0.16b +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP xar_m1 vBsa, vAbi, E2, 2 +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP xar_m1 vBbi, vAki, E2, 21 +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP xar_m1 vBki, vAko, E3, 39 +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP xar_m1 vBko, vAmu, E4, 56 +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP xar_m1 vBmu, vAso, E3, 8 +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, 
sAgu_, ROR #31 SEP xar_m1 vBso, vAma, E0, 23 +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP xar_m1 vBka, vAbe, E1, 63 +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP xar_m1 vBse, vAgo, E3, 9 +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP xar_m1 vBgo, vAme, E1, 19 +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP +bic tmp, sAma_, sAmu_, ROR #35 SEP +ldr cur_const, [const_addr, count, UXTW #3] +add count, count, #1 SEP xar_m1 vBke, vAgi, E2, 58 +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP xar_m1 vBgi, vAka, E0, 61 +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP xar_m1 vBga, vAbo, E3, 36 +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP +eor sAbe, tmp, sAbe_, ROR #41 SEP xar_m1 vBbo, vAmo, E3, 43 +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP xar_m1 vBmo, vAmi, E2, 49 +eor s_Aba, s_Aba, cur_const SEP +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, 
sAmi, sAgi, ROR #59 SEP +eor sC3, sAgo, sAso, ROR #30 SEP xar_m1 vBmi, vAke, E1, 54 +eor sC4, sAbu, sAsu, ROR #53 SEP +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP +eor sC0, sAga, sC0, ROR #57 SEP xar_m1 vBge, vAgu, E4, 44 +eor sC1, sAme, sC1, ROR #58 SEP +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP +eor sC1, sAke, sC1, ROR #57 SEP mov E3.16b, vAga.16b +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP bcax_m1 vAga, vBga, vBgi, vBge +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP +ror sC2, sC2, 62 SEP xar_m1 vBgu, vAsi, E2, 3 +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP xar_m1 vBsi, vAku, E4, 25 +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP xar_m1 vBku, vAsa, E0, 46 +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP xar_m1 vBma, vAbu, E4, 37 +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP xar_m1 vBbu, vAsu, E4, 50 +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP +eor sAsu_, sE1, sAse, ROR #23 SEP xar_m1 vBsu, vAse, E1, 62 +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP +bic tmp, 
sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP xar_m1 vBme, E3, E0, 28 +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP xar_m1 vBbe, vAge, E1, 20 +bic tmp, sAge_, sAga_, ROR #56 SEP +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP bcax_m1 vAge, vBge, vBgo, vBgi +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP bcax_m1 vAgi, vBgi, vBgu, vBgo +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP bcax_m1 vAgo, vBgo, vBga, vBgu +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP bcax_m1 vAgu, vBgu, vBge, vBga +eor sAmi, tmp, sAmi_, ROR #46 SEP +bic tmp, sAma_, sAmu_, ROR #35 SEP +ldr cur_const, [const_addr, count, UXTW #3] +add count, count, #1 SEP bcax_m1 vAka, vBka, vBki, vBke +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP bcax_m1 vAke, vBke, vBko, vBki +eor sAsa, tmp, sAsa_, ROR #41 SEP .unreq vvtmp +bic tmp, sAso_, sAsi_, ROR #2 SEP .unreq vvtmpq +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP eor2 C0, vAka, vAga +eor sAsi, tmp, sAsi_, ROR #27 SEP save(vAga) +bic tmp, sAsa_, sAsu_, ROR #60 SEP vvtmp .req vAga +eor sAso, tmp, sAso_, ROR #21 SEP vvtmpq .req vAgaq +bic tmp, sAse_, sAsa_, ROR #57 SEP bcax_m1 vAki, vBki, vBku, vBko +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP bcax_m1 vAko, vBko, vBka, vBku +eor sAbe, tmp, sAbe_, 
ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP eor2 C1, vAke, vAge +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP bcax_m1 vAku, vBku, vBke, vBka +eor sAbu, tmp, sAbu_, ROR #30 SEP +eor s_Aba, s_Aba, cur_const SEP + SEP +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP +eor sC1, sAse, sAge, ROR #60 SEP eor2 C2, vAki, vAgi +eor sC2, sAmi, sAgi, ROR #59 SEP +eor sC3, sAgo, sAso, ROR #30 SEP bcax_m1 vAma, vBma, vBmi, vBme +eor sC4, sAbu, sAsu, ROR #53 SEP +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP eor2 C3, vAko, vAgo +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP bcax_m1 vAme, vBme, vBmo, vBmi +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP eor2 C4, vAku, vAgu +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP bcax_m1 vAmi, vBmi, vBmu, vBmo +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP +eor sC4, sAku, sC4, ROR #50 SEP eor2 C0, C0, vAma +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP bcax_m1 vAmo, vBmo, vBma, vBmu +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP eor2 C1, C1, vAme +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP bcax_m1 vAmu, vBmu, vBme, vBma +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP eor2 C2, C2, vAmi +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP bcax_m1 vAsa, vBsa, vBsi, vBse +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP eor2 C3, C3, vAmo +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP bcax_m1 vAse, vBse, vBso, 
vBsi +eor sAga_, sE3, sAbo SEP +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP eor2 C4, C4, vAmu +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP bcax_m1 vAsi, vBsi, vBsu, vBso +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP eor2 C0, C0, vAsa +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP bcax_m1 vAso, vBso, vBsa, vBsu +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP eor2 C1, C1, vAse +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP bcax_m1 vAsu, vBsu, vBse, vBsa +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP eor2 C2, C2, vAsi +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP eor2 C3, C3, vAso +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP bcax_m1 vAba, vBba, vBbi, vBbe +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP bcax_m1 vAbe, vBbe, vBbo, vBbi +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP eor2 C1, C1, vAbe +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP restore x26, STACK_OFFSET_CONST +eor sAme, tmp, sAme_, ROR #43 SEP ldr vvtmpq, [x26], #16 +bic tmp, sAmu_, sAmo_, ROR #41 SEP save x26, STACK_OFFSET_CONST +eor sAmi, tmp, sAmi_, ROR #46 SEP +bic tmp, sAma_, sAmu_, ROR #35 SEP eor vAba.16b, vAba.16b, vvtmp.16b +ldr cur_const, [const_addr, count, UXTW #3] +add count, count, #1 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP eor2 C4, C4, vAsu +bic tmp, sAme_, sAma_, ROR 
#9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP bcax_m1 vAbi, vBbi, vBbu, vBbo +bic tmp, sAsi_, sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP bcax_m1 vAbo, vBbo, vBba, vBbu +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP eor2 C3, C3, vAbo +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP eor2 C2, C2, vAbi +bic tmp, sAbi_, sAbe_, ROR #63 SEP +eor s_Aba, s_Aba_, tmp, ROR #21 SEP eor2 C0, C0, vAba +bic tmp, sAbo_, sAbi_, ROR #42 SEP +eor sAbe, tmp, sAbe_, ROR #41 SEP bcax_m1 vAbu, vBbu, vBbe, vBba +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP eor2 C4, C4, vAbu +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP restore(vAga) +eor s_Aba, s_Aba, cur_const SEP .unreq vvtmp + .unreq vvtmpq + +.endm + + +.macro final_rotate +ror sAga, sAga,(64-3) SEP +ror sAka, sAka,(64-25) SEP +ror sAma, sAma,(64-10) SEP +ror sAsa, sAsa,(64-39) SEP +ror sAbe, sAbe,(64-21) SEP +ror sAge, sAge,(64-45) SEP +ror sAke, sAke,(64-8) SEP +ror sAme, sAme,(64-15) SEP +ror sAse, sAse,(64-41) SEP +ror sAbi, sAbi,(64-14) SEP +ror sAgi, sAgi,(64-61) SEP +ror sAki, sAki,(64-18) SEP +ror sAmi, sAmi,(64-56) SEP +ror sAsi, sAsi,(64-2) SEP +ror sAgo, sAgo,(64-28) SEP +ror sAko, sAko,(64-1) SEP +ror sAmo, sAmo,(64-27) SEP +ror sAso, sAso,(64-62) SEP +ror sAbu, sAbu,(64-44) SEP +ror sAgu, sAgu,(64-20) SEP +ror sAku, sAku,(64-6) SEP +ror sAmu, sAmu,(64-36) SEP +ror sAsu, sAsu,(64-55) SEP +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.global keccak_f1600_x5_hybrid_asm_v8p +.global _keccak_f1600_x5_hybrid_asm_v8p +.text +.align 4 + +keccak_f1600_x5_hybrid_asm_v8p: +_keccak_f1600_x5_hybrid_asm_v8p: + alloc_stack + save_gprs + save_vregs + + save input_addr, STACK_OFFSET_INPUT + + 
ASM_LOAD(const_addr,round_constants_vec) + save const_addr, STACK_OFFSET_CONST + + load_input_vector + + add input_addr, input_addr, #(2*8*25) + save input_addr, STACK_OFFSET_CUR_INPUT + + mov out_count, #0 +outer_loop: + save out_count, STACK_OFFSET_COUNT_OUT + + load_input_scalar + save input_addr, STACK_OFFSET_CUR_INPUT + + hybrid_round_initial +inner_loop: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-3) + ble inner_loop + final_rotate + + restore input_addr, STACK_OFFSET_CUR_INPUT + store_input_scalar + add input_addr, input_addr, #(8*25) + + restore out_count, STACK_OFFSET_COUNT_OUT + add out_count, out_count, #1 + cmp out_count, #3 + blt outer_loop + + restore input_addr, STACK_OFFSET_INPUT + store_input_vector + + restore_vregs + restore_gprs + free_stack + + ret diff --git a/asm/manual/keccak_f1600/macros.s b/asm/manual/keccak_f1600/macros.s new file mode 100644 index 0000000..77e0bd4 --- /dev/null +++ b/asm/manual/keccak_f1600/macros.s @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include + +.macro load_constant_ptr + ASM_LOAD(const_addr, round_constants) +.endm diff --git a/asm/manual/keccak_f1600/third_party/LICENSE b/asm/manual/keccak_f1600/third_party/LICENSE new file mode 100644 index 0000000..cdde493 --- /dev/null +++ b/asm/manual/keccak_f1600/third_party/LICENSE @@ -0,0 +1 @@ +This directory contains third party implementations of Keccak-f1600. See the individual files for their licenses. \ No newline at end of file diff --git a/asm/manual/keccak_f1600/third_party/keccakx2_C.c b/asm/manual/keccak_f1600/third_party/keccakx2_C.c new file mode 100644 index 0000000..1ed19d9 --- /dev/null +++ b/asm/manual/keccak_f1600/third_party/keccakx2_C.c @@ -0,0 +1,330 @@ + +// Derived, with minor modifications, from public domain implementation +// in crypto_hash/keccakc512/simple/ from http://bench.cr.yp.to/supercop.html +// by Ronny Van Keer. +// +// To the extent possible under law, the implementer has waived all copyright +// and related or neighboring rights to the source code in this file. 
+// http://creativecommons.org/publicdomain/zero/1.0/ + + + +#include "../keccak_f1600_variants.h" + +#include +#include + +#define KECCAK_F1600_ROUNDS 24 + +static const uint64_t round_constants[KECCAK_F1600_ROUNDS] = +{ + (uint64_t)0x0000000000000001ULL, + (uint64_t)0x0000000000008082ULL, + (uint64_t)0x800000000000808aULL, + (uint64_t)0x8000000080008000ULL, + (uint64_t)0x000000000000808bULL, + (uint64_t)0x0000000080000001ULL, + (uint64_t)0x8000000080008081ULL, + (uint64_t)0x8000000000008009ULL, + (uint64_t)0x000000000000008aULL, + (uint64_t)0x0000000000000088ULL, + (uint64_t)0x0000000080008009ULL, + (uint64_t)0x000000008000000aULL, + (uint64_t)0x000000008000808bULL, + (uint64_t)0x800000000000008bULL, + (uint64_t)0x8000000000008089ULL, + (uint64_t)0x8000000000008003ULL, + (uint64_t)0x8000000000008002ULL, + (uint64_t)0x8000000000000080ULL, + (uint64_t)0x000000000000800aULL, + (uint64_t)0x800000008000000aULL, + (uint64_t)0x8000000080008081ULL, + (uint64_t)0x8000000000008080ULL, + (uint64_t)0x0000000080000001ULL, + (uint64_t)0x8000000080008008ULL +}; + +#define ROL(a, offset) (((a) << (offset)) ^ ((a) >> (64-(offset)))) +void keccak_f1600_x1_scalar_C( uint64_t state[KECCAK_F1600_X1_STATE_SIZE_UINT64] ) +{ + uint64_t Aba, Abe, Abi, Abo, Abu; + uint64_t Aga, Age, Agi, Ago, Agu; + uint64_t Aka, Ake, Aki, Ako, Aku; + uint64_t Ama, Ame, Ami, Amo, Amu; + uint64_t Asa, Ase, Asi, Aso, Asu; + uint64_t BCa, BCe, BCi, BCo, BCu; + uint64_t Da, De, Di, Do, Du; + uint64_t Eba, Ebe, Ebi, Ebo, Ebu; + uint64_t Ega, Ege, Egi, Ego, Egu; + uint64_t Eka, Eke, Eki, Eko, Eku; + uint64_t Ema, Eme, Emi, Emo, Emu; + uint64_t Esa, Ese, Esi, Eso, Esu; + + //copyFromState(A, state) + Aba = state[ 0]; + Abe = state[ 1]; + Abi = state[ 2]; + Abo = state[ 3]; + Abu = state[ 4]; + Aga = state[ 5]; + Age = state[ 6]; + Agi = state[ 7]; + Ago = state[ 8]; + Agu = state[ 9]; + Aka = state[10]; + Ake = state[11]; + Aki = state[12]; + Ako = state[13]; + Aku = state[14]; + Ama = state[15]; + Ame = 
state[16]; + Ami = state[17]; + Amo = state[18]; + Amu = state[19]; + Asa = state[20]; + Ase = state[21]; + Asi = state[22]; + Aso = state[23]; + Asu = state[24]; + + for( int round = 0; round < KECCAK_F1600_ROUNDS; round += 2 ) + { + // prepareTheta + BCa = Aba^Aga^Aka^Ama^Asa; + BCe = Abe^Age^Ake^Ame^Ase; + BCi = Abi^Agi^Aki^Ami^Asi; + BCo = Abo^Ago^Ako^Amo^Aso; + BCu = Abu^Agu^Aku^Amu^Asu; + + //thetaRhoPiChiIotaPrepareTheta(round , A, E) + Da = BCu^ROL(BCe, 1); + De = BCa^ROL(BCi, 1); + Di = BCe^ROL(BCo, 1); + Do = BCi^ROL(BCu, 1); + Du = BCo^ROL(BCa, 1); + + Aba ^= Da; + BCa = Aba; + Age ^= De; + BCe = ROL(Age, 44); + Aki ^= Di; + BCi = ROL(Aki, 43); + Amo ^= Do; + BCo = ROL(Amo, 21); + Asu ^= Du; + BCu = ROL(Asu, 14); + Eba = BCa ^((~BCe)& BCi ); + Eba ^= (uint64_t)round_constants[round]; + Ebe = BCe ^((~BCi)& BCo ); + Ebi = BCi ^((~BCo)& BCu ); + Ebo = BCo ^((~BCu)& BCa ); + Ebu = BCu ^((~BCa)& BCe ); + + Abo ^= Do; + BCa = ROL(Abo, 28); + Agu ^= Du; + BCe = ROL(Agu, 20); + Aka ^= Da; + BCi = ROL(Aka, 3); + Ame ^= De; + BCo = ROL(Ame, 45); + Asi ^= Di; + BCu = ROL(Asi, 61); + Ega = BCa ^((~BCe)& BCi ); + Ege = BCe ^((~BCi)& BCo ); + Egi = BCi ^((~BCo)& BCu ); + Ego = BCo ^((~BCu)& BCa ); + Egu = BCu ^((~BCa)& BCe ); + + Abe ^= De; + BCa = ROL(Abe, 1); + Agi ^= Di; + BCe = ROL(Agi, 6); + Ako ^= Do; + BCi = ROL(Ako, 25); + Amu ^= Du; + BCo = ROL(Amu, 8); + Asa ^= Da; + BCu = ROL(Asa, 18); + Eka = BCa ^((~BCe)& BCi ); + Eke = BCe ^((~BCi)& BCo ); + Eki = BCi ^((~BCo)& BCu ); + Eko = BCo ^((~BCu)& BCa ); + Eku = BCu ^((~BCa)& BCe ); + + Abu ^= Du; + BCa = ROL(Abu, 27); + Aga ^= Da; + BCe = ROL(Aga, 36); + Ake ^= De; + BCi = ROL(Ake, 10); + Ami ^= Di; + BCo = ROL(Ami, 15); + Aso ^= Do; + BCu = ROL(Aso, 56); + Ema = BCa ^((~BCe)& BCi ); + Eme = BCe ^((~BCi)& BCo ); + Emi = BCi ^((~BCo)& BCu ); + Emo = BCo ^((~BCu)& BCa ); + Emu = BCu ^((~BCa)& BCe ); + + Abi ^= Di; + BCa = ROL(Abi, 62); + Ago ^= Do; + BCe = ROL(Ago, 55); + Aku ^= Du; + BCi = ROL(Aku, 39); + Ama ^= 
Da; + BCo = ROL(Ama, 41); + Ase ^= De; + BCu = ROL(Ase, 2); + Esa = BCa ^((~BCe)& BCi ); + Ese = BCe ^((~BCi)& BCo ); + Esi = BCi ^((~BCo)& BCu ); + Eso = BCo ^((~BCu)& BCa ); + Esu = BCu ^((~BCa)& BCe ); + + // prepareTheta + BCa = Eba^Ega^Eka^Ema^Esa; + BCe = Ebe^Ege^Eke^Eme^Ese; + BCi = Ebi^Egi^Eki^Emi^Esi; + BCo = Ebo^Ego^Eko^Emo^Eso; + BCu = Ebu^Egu^Eku^Emu^Esu; + + //thetaRhoPiChiIotaPrepareTheta(round+1, E, A) + Da = BCu^ROL(BCe, 1); + De = BCa^ROL(BCi, 1); + Di = BCe^ROL(BCo, 1); + Do = BCi^ROL(BCu, 1); + Du = BCo^ROL(BCa, 1); + + Eba ^= Da; + BCa = Eba; + Ege ^= De; + BCe = ROL(Ege, 44); + Eki ^= Di; + BCi = ROL(Eki, 43); + Emo ^= Do; + BCo = ROL(Emo, 21); + Esu ^= Du; + BCu = ROL(Esu, 14); + Aba = BCa ^((~BCe)& BCi ); + Aba ^= (uint64_t)round_constants[round+1]; + Abe = BCe ^((~BCi)& BCo ); + Abi = BCi ^((~BCo)& BCu ); + Abo = BCo ^((~BCu)& BCa ); + Abu = BCu ^((~BCa)& BCe ); + + Ebo ^= Do; + BCa = ROL(Ebo, 28); + Egu ^= Du; + BCe = ROL(Egu, 20); + Eka ^= Da; + BCi = ROL(Eka, 3); + Eme ^= De; + BCo = ROL(Eme, 45); + Esi ^= Di; + BCu = ROL(Esi, 61); + Aga = BCa ^((~BCe)& BCi ); + Age = BCe ^((~BCi)& BCo ); + Agi = BCi ^((~BCo)& BCu ); + Ago = BCo ^((~BCu)& BCa ); + Agu = BCu ^((~BCa)& BCe ); + + Ebe ^= De; + BCa = ROL(Ebe, 1); + Egi ^= Di; + BCe = ROL(Egi, 6); + Eko ^= Do; + BCi = ROL(Eko, 25); + Emu ^= Du; + BCo = ROL(Emu, 8); + Esa ^= Da; + BCu = ROL(Esa, 18); + Aka = BCa ^((~BCe)& BCi ); + Ake = BCe ^((~BCi)& BCo ); + Aki = BCi ^((~BCo)& BCu ); + Ako = BCo ^((~BCu)& BCa ); + Aku = BCu ^((~BCa)& BCe ); + + Ebu ^= Du; + BCa = ROL(Ebu, 27); + Ega ^= Da; + BCe = ROL(Ega, 36); + Eke ^= De; + BCi = ROL(Eke, 10); + Emi ^= Di; + BCo = ROL(Emi, 15); + Eso ^= Do; + BCu = ROL(Eso, 56); + Ama = BCa ^((~BCe)& BCi ); + Ame = BCe ^((~BCi)& BCo ); + Ami = BCi ^((~BCo)& BCu ); + Amo = BCo ^((~BCu)& BCa ); + Amu = BCu ^((~BCa)& BCe ); + + Ebi ^= Di; + BCa = ROL(Ebi, 62); + Ego ^= Do; + BCe = ROL(Ego, 55); + Eku ^= Du; + BCi = ROL(Eku, 39); + Ema ^= Da; + BCo = ROL(Ema, 
41); + Ese ^= De; + BCu = ROL(Ese, 2); + Asa = BCa ^((~BCe)& BCi ); + Ase = BCe ^((~BCi)& BCo ); + Asi = BCi ^((~BCo)& BCu ); + Aso = BCo ^((~BCu)& BCa ); + Asu = BCu ^((~BCa)& BCe ); + } + + //copyToState(state, A) + state[ 0] = Aba; + state[ 1] = Abe; + state[ 2] = Abi; + state[ 3] = Abo; + state[ 4] = Abu; + state[ 5] = Aga; + state[ 6] = Age; + state[ 7] = Agi; + state[ 8] = Ago; + state[ 9] = Agu; + state[10] = Aka; + state[11] = Ake; + state[12] = Aki; + state[13] = Ako; + state[14] = Aku; + state[15] = Ama; + state[16] = Ame; + state[17] = Ami; + state[18] = Amo; + state[19] = Amu; + state[20] = Asa; + state[21] = Ase; + state[22] = Asi; + state[23] = Aso; + state[24] = Asu; +} + +void keccak_f1600_x2_scalar_C(uint64_t state[2*25]) +{ + uint64_t state1[25]; + uint64_t state2[25]; + + // de-interleave + for(size_t i=0;i<25;i++){ + state1[i] = state[2*i+0]; + state2[i] = state[2*i+1]; + } + + keccak_f1600_x1_scalar_C(state1); + keccak_f1600_x1_scalar_C(state2); + + // interleave + for(size_t i=0;i<25;i++){ + state[2*i+0] = state1[i]; + state[2*i+1] = state2[i]; + } +} diff --git a/asm/manual/keccak_f1600/third_party/keccakx2_bas.s b/asm/manual/keccak_f1600/third_party/keccakx2_bas.s new file mode 100644 index 0000000..ef29c69 --- /dev/null +++ b/asm/manual/keccak_f1600/third_party/keccakx2_bas.s @@ -0,0 +1,203 @@ +// MIT License +// +// Copyright (c) 2020 Bas Westerbaan +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +// +// With trivial modifications for PQAX +// + +#if defined(__ARM_FEATURE_SHA3) + +#include // For ASM_LOAD only + +.macro load_constant_ptr + ASM_LOAD(const_addr, round_constants) +.endm + +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +const_addr .req x1 + +.macro round + // Execute theta, but without xoring into the state yet. + // Compute parities p[i] = a[i] ^ a[5+i] ^ ... ^ a[20+i]. 
+ eor3 v25.16b, v0.16b, v5.16b, v10.16b + eor3 v26.16b, v1.16b, v6.16b, v11.16b + eor3 v27.16b, v2.16b, v7.16b, v12.16b + eor3 v28.16b, v3.16b, v8.16b, v13.16b + eor3 v29.16b, v4.16b, v9.16b, v14.16b + + eor3 v25.16b, v25.16b, v15.16b, v20.16b + eor3 v26.16b, v26.16b, v16.16b, v21.16b + eor3 v27.16b, v27.16b, v17.16b, v22.16b + eor3 v28.16b, v28.16b, v18.16b, v23.16b + eor3 v29.16b, v29.16b, v19.16b, v24.16b + + rax1 v30.2d, v29.2d, v26.2d // d[0] = rotl(p[1], 1) ^ p[4] + rax1 v29.2d, v27.2d, v29.2d // d[3] = rotl(p[4], 1) ^ p[2] + rax1 v27.2d, v25.2d, v27.2d // d[1] = rotl(p[2], 1) ^ p[0] + rax1 v25.2d, v28.2d, v25.2d // d[4] = rotl(p[0], 1) ^ p[3] + rax1 v28.2d, v26.2d, v28.2d // d[2] = rotl(p[3], 1) ^ p[1] + + // Xor parities from step theta into the state at the same time + // as executing rho and pi. + eor v0.16b, v0.16b, v30.16b + mov v31.16b, v1.16b + xar v1.2d, v6.2d, v27.2d, 20 + xar v6.2d, v9.2d, v25.2d, 44 + xar v9.2d, v22.2d, v28.2d, 3 + xar v22.2d, v14.2d, v25.2d, 25 + xar v14.2d, v20.2d, v30.2d, 46 + xar v20.2d, v2.2d, v28.2d, 2 + xar v2.2d, v12.2d, v28.2d, 21 + xar v12.2d, v13.2d, v29.2d, 39 + xar v13.2d, v19.2d, v25.2d, 56 + xar v19.2d, v23.2d, v29.2d, 8 + xar v23.2d, v15.2d, v30.2d, 23 + xar v15.2d, v4.2d, v25.2d, 37 + xar v4.2d, v24.2d, v25.2d, 50 + xar v24.2d, v21.2d, v27.2d, 62 + xar v21.2d, v8.2d, v29.2d, 9 + xar v8.2d, v16.2d, v27.2d, 19 + xar v16.2d, v5.2d, v30.2d, 28 + xar v5.2d, v3.2d, v29.2d, 36 + xar v3.2d, v18.2d, v29.2d, 43 + xar v18.2d, v17.2d, v28.2d, 49 + xar v17.2d, v11.2d, v27.2d, 54 + xar v11.2d, v7.2d, v28.2d, 58 + xar v7.2d, v10.2d, v30.2d, 61 + xar v10.2d, v31.2d, v27.2d, 63 + + // Chi + bcax v25.16b, v0.16b, v2.16b, v1.16b + bcax v26.16b, v1.16b, v3.16b, v2.16b + bcax v2.16b, v2.16b, v4.16b, v3.16b + bcax v3.16b, v3.16b, v0.16b, v4.16b + bcax v4.16b, v4.16b, v1.16b, v0.16b + mov v0.16b, v25.16b + mov v1.16b, v26.16b + + bcax v25.16b, v5.16b, v7.16b, v6.16b + bcax v26.16b, v6.16b, v8.16b, v7.16b + bcax v7.16b, v7.16b, v9.16b, 
v8.16b + bcax v8.16b, v8.16b, v5.16b, v9.16b + bcax v9.16b, v9.16b, v6.16b, v5.16b + mov v5.16b, v25.16b + mov v6.16b, v26.16b + + bcax v25.16b, v10.16b, v12.16b, v11.16b + bcax v26.16b, v11.16b, v13.16b, v12.16b + bcax v12.16b, v12.16b, v14.16b, v13.16b + bcax v13.16b, v13.16b, v10.16b, v14.16b + bcax v14.16b, v14.16b, v11.16b, v10.16b + mov v10.16b, v25.16b + mov v11.16b, v26.16b + + bcax v25.16b, v15.16b, v17.16b, v16.16b + bcax v26.16b, v16.16b, v18.16b, v17.16b + bcax v17.16b, v17.16b, v19.16b, v18.16b + bcax v18.16b, v18.16b, v15.16b, v19.16b + bcax v19.16b, v19.16b, v16.16b, v15.16b + mov v15.16b, v25.16b + mov v16.16b, v26.16b + + bcax v25.16b, v20.16b, v22.16b, v21.16b + bcax v26.16b, v21.16b, v23.16b, v22.16b + bcax v22.16b, v22.16b, v24.16b, v23.16b + bcax v23.16b, v23.16b, v20.16b, v24.16b + bcax v24.16b, v24.16b, v21.16b, v20.16b + mov v20.16b, v25.16b + mov v21.16b, v26.16b + + // iota + ld1r {v25.2d}, [const_addr], #8 + eor v0.16b, v0.16b, v25.16b +.endm + +.align 4 +.global keccak_f1600_x2_bas +.global _keccak_f1600_x2_bas +keccak_f1600_x2_bas: +_keccak_f1600_x2_bas: + stp d8, d9, [sp,#-16]! + stp d10, d11, [sp,#-16]! + stp d12, d13, [sp,#-16]! + stp d14, d15, [sp,#-16]! 
+ + load_constant_ptr + mov x2, x0 + mov x3, #24 + + ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0], #64 + ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x0], #64 + ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64 + ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64 + ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0], #64 + ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [x0], #64 + ld1 {v24.2d}, [x0] + +loop: + round + + subs x3, x3, #1 + cbnz x3, loop + + mov x0, x2 + st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0], #64 + st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x0], #64 + st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64 + st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64 + st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0], #64 + st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [x0], #64 + st1 {v24.2d}, [x0] + + ldp d14, d15, [sp], #16 + ldp d12, d13, [sp], #16 + ldp d10, d11, [sp], #16 + ldp d8, d9, [sp], #16 + + ret lr + +#endif diff --git a/asm/manual/keccak_f1600/third_party/keccakx2_cothan.c b/asm/manual/keccak_f1600/third_party/keccakx2_cothan.c new file mode 100644 index 0000000..42a1433 --- /dev/null +++ b/asm/manual/keccak_f1600/third_party/keccakx2_cothan.c @@ -0,0 +1,404 @@ +/*============================================================================= + * Copyright (c) 2020 by Cryptographic Engineering Research Group (CERG) + * ECE Department, George Mason University + * Fairfax, VA, U.S.A. + * Author: Duc Tri Nguyen +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+=============================================================================*/ +#include +#include + +#include "../keccak_f1600_variants.h" + +#define NROUNDS 24 +#define SHA3 0 + +#define SHAKE128_RATE 168 +#define SHAKE256_RATE 136 +#define SHA3_256_RATE 136 +#define SHA3_512_RATE 72 + +/* + * Using vld1q_u64_x4 is consider harmful + */ +#ifndef MEM +#define MEM 0 +#endif + +// Define NEON operation + +// Bitwise-XOR: c = a ^ b +#define vxor(c, a, b) c = veorq_u64(a, b); + +#define pack(out, a, b, c, d) \ + out.val[0] = a; \ + out.val[1] = b; \ + out.val[2] = c; \ + out.val[3] = d; + +#define unpack(a, b, c, d, out) \ + a = out.val[0]; \ + b = out.val[1]; \ + c = out.val[2]; \ + d = out.val[3]; + +#if SHA3 == 1 + +/* + * At least ARMv8.2-sha3 supported + */ + +// Xor chain: out = a ^ b ^ c ^ d ^ e +#define vXOR5(out, a, b, c, d, e) \ + out = veor3q_u64(a, b, c); \ + out = veor3q_u64(out, d, e); + +// Rotate left by 1 bit, then XOR: a ^ ROL(b) +#define vRXOR(c, a, b) c = vrax1q_u64(a, b); + +// XOR then Rotate by n bit: c = ROL(a^b, n) +#define vXORR(c, a, b, n) c = vxarq_u64(a, b, n); + +// Xor Not And: out = a ^ ( (~b) & c) +#define vXNA(out, a, b, c) out = vbcaxq_u64(a, c, b); + +#else + +// Rotate left by n bit +#define vROL(out, a, offset) \ + out = vshlq_n_u64(a, (offset)); \ + out = vsriq_n_u64(out, a, 64 - (offset)); + +// Xor chain: out = a ^ b ^ c ^ d ^ e +#define vXOR5(out, a, b, c, d, e) \ + out = veorq_u64(a, b); \ + out = veorq_u64(out, c); \ + out = veorq_u64(out, d); \ + out = veorq_u64(out, e); + +// Xor Not And: out = a ^ ( (~b) & c) +#define vXNA(out, a, b, c) \ + out = vbicq_u64(c, b); \ + out = veorq_u64(out, a); + +#define vRXOR(c, a, b) \ + vROL(c, b, 1); \ + vxor(c, c, a); + +#define vXORR(c, a, b, n) \ + a = veorq_u64(a, b); \ + vROL(c, a, 64 - n); + +#endif + +// End + +/* Keccak round constants */ +static const uint64_t neon_KeccakF_RoundConstants[NROUNDS] = { + (uint64_t)0x0000000000000001ULL, + (uint64_t)0x0000000000008082ULL, + 
(uint64_t)0x800000000000808aULL, + (uint64_t)0x8000000080008000ULL, + (uint64_t)0x000000000000808bULL, + (uint64_t)0x0000000080000001ULL, + (uint64_t)0x8000000080008081ULL, + (uint64_t)0x8000000000008009ULL, + (uint64_t)0x000000000000008aULL, + (uint64_t)0x0000000000000088ULL, + (uint64_t)0x0000000080008009ULL, + (uint64_t)0x000000008000000aULL, + (uint64_t)0x000000008000808bULL, + (uint64_t)0x800000000000008bULL, + (uint64_t)0x8000000000008089ULL, + (uint64_t)0x8000000000008003ULL, + (uint64_t)0x8000000000008002ULL, + (uint64_t)0x8000000000000080ULL, + (uint64_t)0x000000000000800aULL, + (uint64_t)0x800000008000000aULL, + (uint64_t)0x8000000080008081ULL, + (uint64_t)0x8000000000008080ULL, + (uint64_t)0x0000000080000001ULL, + (uint64_t)0x8000000080008008ULL}; + +/************************************************* + * Name: KeccakF1600_StatePermutex2 + * + * Description: The Keccak F1600 Permutation + * + * Arguments: - v128 *state: pointer to input/output Keccak state + **************************************************/ +void keccak_f1600_x2_neon_C_cothan(v128 state[25]) +{ + v128 Aba, Abe, Abi, Abo, Abu; + v128 Aga, Age, Agi, Ago, Agu; + v128 Aka, Ake, Aki, Ako, Aku; + v128 Ama, Ame, Ami, Amo, Amu; + v128 Asa, Ase, Asi, Aso, Asu; + v128 BCa, BCe, BCi, BCo, BCu; // tmp + v128 Da, De, Di, Do, Du; // D + v128 Eba, Ebe, Ebi, Ebo, Ebu; + v128 Ega, Ege, Egi, Ego, Egu; + v128 Eka, Eke, Eki, Eko, Eku; + v128 Ema, Eme, Emi, Emo, Emu; + v128 Esa, Ese, Esi, Eso, Esu; + +#if MEM == 1 + uint64x2x4_t holder; + + holder = vld1q_u64_x4((uint64_t *)&state[0]); + unpack(Aba, Abe, Abi, Abo, holder); + + holder = vld1q_u64_x4((uint64_t *)&state[4]); + unpack(Abu, Aga, Age, Agi, holder); + + holder = vld1q_u64_x4((uint64_t *)&state[8]); + unpack(Ago, Agu, Aka, Ake, holder); + + holder = vld1q_u64_x4((uint64_t *)&state[12]); + unpack(Aki, Ako, Aku, Ama, holder); + + holder = vld1q_u64_x4((uint64_t *)&state[16]); + unpack(Ame, Ami, Amo, Amu, holder); + + holder = vld1q_u64_x4((uint64_t 
*)&state[20]); + unpack(Asa, Ase, Asi, Aso, holder); + + Asu = vld1q_u64((uint64_t *)&state[24]); +#else + Aba = state[0]; + Abe = state[1]; + Abi = state[2]; + Abo = state[3]; + Abu = state[4]; + Aga = state[5]; + Age = state[6]; + Agi = state[7]; + Ago = state[8]; + Agu = state[9]; + Aka = state[10]; + Ake = state[11]; + Aki = state[12]; + Ako = state[13]; + Aku = state[14]; + Ama = state[15]; + Ame = state[16]; + Ami = state[17]; + Amo = state[18]; + Amu = state[19]; + Asa = state[20]; + Ase = state[21]; + Asi = state[22]; + Aso = state[23]; + Asu = state[24]; +#endif + + for (int round = 0; round < NROUNDS; round += 2) + { + // prepareTheta + vXOR5(BCa, Aba, Aga, Aka, Ama, Asa); + vXOR5(BCe, Abe, Age, Ake, Ame, Ase); + vXOR5(BCi, Abi, Agi, Aki, Ami, Asi); + vXOR5(BCo, Abo, Ago, Ako, Amo, Aso); + vXOR5(BCu, Abu, Agu, Aku, Amu, Asu); + + vRXOR(Da, BCu, BCe); + vRXOR(De, BCa, BCi); + vRXOR(Di, BCe, BCo); + vRXOR(Do, BCi, BCu); + vRXOR(Du, BCo, BCa); + + vxor(Aba, Aba, Da); + vXORR(BCe, Age, De, 20); + vXORR(BCi, Aki, Di, 21); + vXORR(BCo, Amo, Do, 43); + vXORR(BCu, Asu, Du, 50); + + vXNA(Eba, Aba, BCe, BCi); + vxor(Eba, Eba, vld1q_dup_u64(&neon_KeccakF_RoundConstants[round])); + vXNA(Ebe, BCe, BCi, BCo); + vXNA(Ebi, BCi, BCo, BCu); + vXNA(Ebo, BCo, BCu, Aba); + vXNA(Ebu, BCu, Aba, BCe); + + vXORR(BCa, Abo, Do, 36); + vXORR(BCe, Agu, Du, 44); + vXORR(BCi, Aka, Da, 61); + vXORR(BCo, Ame, De, 19); + vXORR(BCu, Asi, Di, 3); + + vXNA(Ega, BCa, BCe, BCi); + vXNA(Ege, BCe, BCi, BCo); + vXNA(Egi, BCi, BCo, BCu); + vXNA(Ego, BCo, BCu, BCa); + vXNA(Egu, BCu, BCa, BCe); + + vXORR(BCa, Abe, De, 63); + vXORR(BCe, Agi, Di, 58); + vXORR(BCi, Ako, Do, 39); + vXORR(BCo, Amu, Du, 56); + vXORR(BCu, Asa, Da, 46); + + vXNA(Eka, BCa, BCe, BCi); + vXNA(Eke, BCe, BCi, BCo); + vXNA(Eki, BCi, BCo, BCu); + vXNA(Eko, BCo, BCu, BCa); + vXNA(Eku, BCu, BCa, BCe); + + vXORR(BCa, Abu, Du, 37); + vXORR(BCe, Aga, Da, 28); + vXORR(BCi, Ake, De, 54); + vXORR(BCo, Ami, Di, 49); + vXORR(BCu, Aso, Do, 
8); + + vXNA(Ema, BCa, BCe, BCi); + vXNA(Eme, BCe, BCi, BCo); + vXNA(Emi, BCi, BCo, BCu); + vXNA(Emo, BCo, BCu, BCa); + vXNA(Emu, BCu, BCa, BCe); + + vXORR(BCa, Abi, Di, 2); + vXORR(BCe, Ago, Do, 9); + vXORR(BCi, Aku, Du, 25); + vXORR(BCo, Ama, Da, 23); + vXORR(BCu, Ase, De, 62); + + vXNA(Esa, BCa, BCe, BCi); + vXNA(Ese, BCe, BCi, BCo); + vXNA(Esi, BCi, BCo, BCu); + vXNA(Eso, BCo, BCu, BCa); + vXNA(Esu, BCu, BCa, BCe); + + // Next Round + + // prepareTheta + vXOR5(BCa, Eba, Ega, Eka, Ema, Esa); + vXOR5(BCe, Ebe, Ege, Eke, Eme, Ese); + vXOR5(BCi, Ebi, Egi, Eki, Emi, Esi); + vXOR5(BCo, Ebo, Ego, Eko, Emo, Eso); + vXOR5(BCu, Ebu, Egu, Eku, Emu, Esu); + + // thetaRhoPiChiIotaPrepareTheta(round+1, E, A) + vRXOR(Da, BCu, BCe); + vRXOR(De, BCa, BCi); + vRXOR(Di, BCe, BCo); + vRXOR(Do, BCi, BCu); + vRXOR(Du, BCo, BCa); + + vxor(Eba, Eba, Da); + vXORR(BCe, Ege, De, 20); + vXORR(BCi, Eki, Di, 21); + vXORR(BCo, Emo, Do, 43); + vXORR(BCu, Esu, Du, 50); + + vXNA(Aba, Eba, BCe, BCi); + vxor(Aba, Aba, vld1q_dup_u64(&neon_KeccakF_RoundConstants[round + 1])); + vXNA(Abe, BCe, BCi, BCo); + vXNA(Abi, BCi, BCo, BCu); + vXNA(Abo, BCo, BCu, Eba); + vXNA(Abu, BCu, Eba, BCe); + + vXORR(BCa, Ebo, Do, 36); + vXORR(BCe, Egu, Du, 44); + vXORR(BCi, Eka, Da, 61); + vXORR(BCo, Eme, De, 19); + vXORR(BCu, Esi, Di, 3); + + vXNA(Aga, BCa, BCe, BCi); + vXNA(Age, BCe, BCi, BCo); + vXNA(Agi, BCi, BCo, BCu); + vXNA(Ago, BCo, BCu, BCa); + vXNA(Agu, BCu, BCa, BCe); + + vXORR(BCa, Ebe, De, 63); + vXORR(BCe, Egi, Di, 58); + vXORR(BCi, Eko, Do, 39); + vXORR(BCo, Emu, Du, 56); + vXORR(BCu, Esa, Da, 46); + + vXNA(Aka, BCa, BCe, BCi); + vXNA(Ake, BCe, BCi, BCo); + vXNA(Aki, BCi, BCo, BCu); + vXNA(Ako, BCo, BCu, BCa); + vXNA(Aku, BCu, BCa, BCe); + + vXORR(BCa, Ebu, Du, 37); + vXORR(BCe, Ega, Da, 28); + vXORR(BCi, Eke, De, 54); + vXORR(BCo, Emi, Di, 49); + vXORR(BCu, Eso, Do, 8); + + vXNA(Ama, BCa, BCe, BCi); + vXNA(Ame, BCe, BCi, BCo); + vXNA(Ami, BCi, BCo, BCu); + vXNA(Amo, BCo, BCu, BCa); + vXNA(Amu, BCu, BCa, 
BCe); + + vXORR(BCa, Ebi, Di, 2); + vXORR(BCe, Ego, Do, 9); + vXORR(BCi, Eku, Du, 25); + vXORR(BCo, Ema, Da, 23); + vXORR(BCu, Ese, De, 62); + + vXNA(Asa, BCa, BCe, BCi); + vXNA(Ase, BCe, BCi, BCo); + vXNA(Asi, BCi, BCo, BCu); + vXNA(Aso, BCo, BCu, BCa); + vXNA(Asu, BCu, BCa, BCe); + } + +#if MEM == 1 + pack(holder, Aba, Abe, Abi, Abo); + vst1q_u64_x4((uint64_t *)&state[0], holder); + + pack(holder, Abu, Aga, Age, Agi); + vst1q_u64_x4((uint64_t *)&state[4], holder); + + pack(holder, Ago, Agu, Aka, Ake); + vst1q_u64_x4((uint64_t *)&state[8], holder); + + pack(holder, Aki, Ako, Aku, Ama); + vst1q_u64_x4((uint64_t *)&state[12], holder); + + pack(holder, Ame, Ami, Amo, Amu); + vst1q_u64_x4((uint64_t *)&state[16], holder); + + pack(holder, Asa, Ase, Asi, Aso); + vst1q_u64_x4((uint64_t *)&state[20], holder); + + vst1q_u64((uint64_t *)&state[24], Asu); +#else + state[0] = Aba; + state[1] = Abe; + state[2] = Abi; + state[3] = Abo; + state[4] = Abu; + state[5] = Aga; + state[6] = Age; + state[7] = Agi; + state[8] = Ago; + state[9] = Agu; + state[10] = Aka; + state[11] = Ake; + state[12] = Aki; + state[13] = Ako; + state[14] = Aku; + state[15] = Ama; + state[16] = Ame; + state[17] = Ami; + state[18] = Amo; + state[19] = Amu; + state[20] = Asa; + state[21] = Ase; + state[22] = Asi; + state[23] = Aso; + state[24] = Asu; +#endif +} diff --git a/asm/scripts/ntt_neon/ntt_neon.py b/asm/scripts/ntt_neon/ntt_neon.py new file mode 100644 index 0000000..9486e21 --- /dev/null +++ b/asm/scripts/ntt_neon/ntt_neon.py @@ -0,0 +1,6200 @@ +# Copyright (c) 2021 Arm Limited +# SPDX-License-Identifier: MIT + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# 
furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import sys, argparse, math + +class Snippets(): + + def autogen_warning(): + warning = """ +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// +""" + yield warning + + def license(): + yield """ +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE +""" + + def function_decl(func_name): + yield f'.text' + yield f'.global {func_name}' + yield f'.global _{func_name}' + + def function_header(func_name): + yield f"{func_name}:" + yield f"_{func_name}:" + + def function_footer(): + yield 'ret' + + def save_gprs(): + yield '// Save GPRs' + yield "sub sp, sp, #(16*5+16)" + yield "stp x19, x20, [sp, #16*0]" + yield "stp x19, x20, [sp, #16*0]" + yield "stp x21, x22, [sp, #16*1]" + yield "stp x23, x24, [sp, #16*2]" + yield "stp x25, x26, [sp, #16*3]" + yield "stp x27, x28, [sp, #16*4]" + yield "str x29, [sp, #16*5]" + + def restore_gprs(): + + # # TODO: Update + yield '// Restore GPRs' + yield "ldp x19, x20, [sp, #16*0]" + yield "ldp x21, x22, [sp, #16*1]" + yield "ldp x23, x24, [sp, #16*2]" + yield "ldp x25, x26, [sp, #16*3]" + yield "ldp x27, x28, [sp, #16*4]" + yield "ldr x29, [sp, #16*5]" + yield "add sp, sp, #(16*5+16)" + + def save_vregs(): + # TODO: Update + yield '// Save NEON vector registers' + yield "sub sp, sp, #(16*4)" + yield "stp d8, d9, [sp, #16*0]" + yield "stp d10, d11, [sp, #16*1]" + yield "stp d12, d13, [sp, #16*2]" + yield "stp d14, d15, [sp, #16*3]" + + def restore_vregs(): + # TODO: Update + yield '// Restore NEON vector registers' + yield "ldp d8, d9, [sp, #16*0]" + yield "ldp d10, d11, [sp, #16*1]" + yield "ldp d12, d13, [sp, #16*2]" + yield "ldp d14, d15, [sp, #16*3]" + yield "add sp, sp, #(16*4)" + +class RegList(): + + def __init__(self, regs): + + self._regs = regs + + self._free = [] + for r in regs: + self._free.append(r) + + self._alloc = [] + + def alloc(self,reg=None): + + if reg == None: + if len(self._free) == 0: + raise Exception("No more free registers") + reg = self._free.pop() + else: + if not reg in 
self._free: + raise Exception(f"Register {reg} already allocated") + self._free.remove(reg) + + self._alloc.append(reg) + +# print(f"Allocated: {len(self._alloc)}") +# print(f"Free: {len(self._free)}") + + return reg + + def revfree(self): + self._free.reverse() + + def free(self,reg): + if reg not in self._regs: + raise Exception("Invalid register") + if not reg in self._alloc: + raise Exception("Register not allocated") + self._alloc.remove(reg) + self._free.append(reg) + +class Butterfly(): + + def __init__(self,base,stride,block,layer,merged,load_roots=False,shuffle=False): + + self.layer = layer + self.merged = merged + self.block = block + + self.num_gs = merged * pow(2,merged-1) + if shuffle: + self.num_gs *= 2 + + self.base = base + self.stride = stride + self.load_roots = load_roots + + self.load_idx = 0 + self.store_idx = 0 + self.scalar_load = None + self.transpose = None + self.free_root_scalars = None + + def __getitem__(self,idx): + return self.base + idx * self.stride + +class NTT(): + + def __init__(self,size,modulus,root,schedules=[0,0], layers=[3,3]): + + self.size = size + self.bitwidth = 32 + self.vector_bitlen = 128 + self.vector_bytelen = self.vector_bitlen // 8 + self.elements_per_vector = self.vector_bitlen // self.bitwidth + + self.interleave_twiddles = True + + # Determine layer at which NTT requires intra-vector shuffling + self.shuffle_boundary = int(math.log(self.size,2) - math.log(self.elements_per_vector, 2)) + + if self.bitwidth == 32: + self.data_prefix = "word" + self.vector_suffix = "4S" + self.element_size = 4 + elif self.bitwidth == 16: + self.vector_suffix = "8H" + self.data_prefix = "half" + self.element_size = 2 + + self.root = root + self.modulus = modulus + + self.data = {} + self._src = 0 + + # Alignment for arrays of twiddle factors + self.root_align = 64 + self.root_offset = 0 # 32 + + # Layer configuration + last_layer = 0 + self.layers = [] + for l in layers: + self.layers.append((last_layer,l)) + last_layer += l + + # 
Schedule configuration + self.schedules = [] + for s in schedules: + if s[:3] == "z2_": + self.schedules.append((2,int(s[3:]))) + elif s[:3] == "z4_": + self.schedules.append((4,int(s[3:]))) + else: + self.schedules.append((1,int(s))) + + if len(self.schedules) != len(self.layers): + raise Exception("Bad configuration") + + # We only support + # - not crossing the shuffle boundary + # - crossing it by exactly 2 layers + self.check_layer_config() + + # Whether to use growing immediate offsets or + # post-increment loads for the twiddles. + self.increment_root_ptr = False # Immediate offsets + # self.increment_root_ptr = True # Post-increment + # self.multi_access_strategy = 0 # Only relevant if increment_root_ptr == True + + vregs = list(range(4,8)) + list(range(8,16)) + list(range(0,4)) + list(range(16,32)) + # vregs = list(range(0,32)) + self.vregs = RegList(vregs) + self.gprs = RegList(list(range(0,18))) + + if self.modulus % 2 == 0: + raise Exception("Modulus must be odd") + if pow(root, 2*size, modulus) != 1: + raise Exception(f"{root} is not a {size}-th root of unity modulo {modulus}") + + def is_power_of_2(n): + if n == 1: + return True + if n % 2 == 1: + raise False + return is_power_of_2(int(n/2)) + if not is_power_of_2(size) or size <= 4: + raise Exception(f"NTT size must be a power of 2, but {size} isn't") + + # Compute modular inverse of modulus w.r.t 2^32, which is the fixed point of + # the iteration x |-> x^2 * modulus mod 2^32 stabilizing in less than 32 steps. + # [ Reason: If a=b mod p^k, then a^p = b^p mod p^(k+1), + # so for odd p=2 and odd a, we have a^{2^k}=1 mod 2^{k+1} ] + self._inv_mod = self.modulus + for i in range(0,32): + self._inv_mod = (self._inv_mod * self._inv_mod * self.modulus) % 2**32 + # This test should never fail for an odd modulus, but double-check anyway. 
+ if (self._inv_mod * self.modulus - 1) % 2**32 != 0: + raise Exception("Failed to compute modular inverse") + + self.log2size = int(math.log(size,2)) + + def check_layer_config(self): + for (base_layer, num_merge) in self.layers: + end_layer = base_layer + num_merge + if end_layer > self.shuffle_boundary and end_layer != self.shuffle_boundary + 2: + raise Exception("Unsupported layer configuration") + + def prepare_constants(self): + self.modulus_vector = self.vregs.alloc() + + def free_constants(self): + self.vregs.free(self.modulus_vector) + + def root_of_unity_for_block(self,layer,block): + + def reverse_bit(num,width): + result = 0 + while width > 0: + result = (result << 1) + (num & 1) + num >>= 1 + width -= 1 + return result + + log = reverse_bit(pow(2,layer) + block, self.log2size) + root = pow(self.root, log, self.modulus) + + def res_even_frac(c,n): + res = c % n + if res >= n // 2: + res -= n // 2 + if res % 2 != 0: + if res < 0: + res += n + else: + res -= n + return res + + def even_frac(c,n): + res = res_even_frac(c,n) + return (c - res)//n + + root_twisted = even_frac(root * pow(2,32), self.modulus) % pow(2,32) + root_twisted = root_twisted // 2 + + return log, root, root_twisted + + def generate_constants(self): + + prefix = self.data_prefix + + yield "modulus:" + yield f".{prefix} {-self.modulus}" + yield f".{prefix} 0" + yield f".{prefix} 0" + yield f".{prefix} 0" + + root_asm = [] + root_twisted_asm = [] + + def append_root(layer,block): + nonlocal root_asm, root_twisted_asm + if layer == None: + root, root_twisted = 0,0 + else: + _, root, root_twisted = self.root_of_unity_for_block(layer,block) + new_asm_root = f".{prefix} {root} // Layer {layer}, block {block}" + new_asm_twist = f".{prefix} {root_twisted} // Layer {layer}, block {block}" + root_asm.append(new_asm_root) + root_twisted_asm.append(new_asm_twist) + + def roots_for_merged_layers(start_layer, num_layers): + + for block in range(0, pow(2,start_layer)): + + start_len = len(root_asm) + 
+ for layer in range(0,num_layers): + cur_layer = start_layer + layer + + # TODO: Document + if cur_layer >= self.shuffle_boundary and self.bitwidth == 16: + multiply = 2 + else: + multiply = 1 + + roots_in_layer = pow(2,layer) + idx_seq = list(range(0,roots_in_layer)) + + # TODO: Document + if self.shuffle_boundary < cur_layer: + idx_seq = idx_seq[::2] + idx_seq[1::2] + + if roots_in_layer == self.elements_per_vector: + # Add padding + append_root(None,None) + for idx in idx_seq: + for _ in range(0,multiply): + append_root(start_layer + layer, + roots_in_layer * block + idx) + + end_len = len(root_asm) + mod = (end_len - start_len) % self.elements_per_vector + + if mod != 0: + for _ in range(self.elements_per_vector - mod): + append_root(None,None) + + end_len = len(root_asm) + mod = (end_len - start_len) % self.elements_per_vector + if mod != 0: + raise Exception("Something went wrong") + + vectors_emitted = (end_len - start_len)//self.elements_per_vector + self.vector_storage_per_block_at_layer[start_layer] = vectors_emitted + + self.root_offset_for_layer = {} + self.vector_storage_per_block_at_layer = {} + + # Build twiddle factors for given layer configuration + for base,merged in self.layers: + self.root_offset_for_layer[base] = len(root_asm) * self.element_size + roots_for_merged_layers(base, merged) + + align_log2 = int(math.log(self.root_align,2)) + align_offset = self.root_offset // (self.bitwidth//8) + + if not self.interleave_twiddles: + yield f".align {align_log2}" + yield "roots:" + yield from root_asm + yield f".align {align_log2}" + yield "roots_twisted:" + yield from root_twisted_asm + + else: + + def chunks(lst,size): + for i in range(0,len(lst),size): + yield lst[i:i+size] + + root_blocks = list(chunks(root_asm,self.elements_per_vector)) + root_twisted_blocks = list(chunks(root_twisted_asm,self.elements_per_vector)) + + roots = zip(root_blocks,root_twisted_blocks) + roots = [ e for p in roots for b in p for e in b] + + yield f".align 
{align_log2}" + for _ in range(0,align_offset): + yield f".{self.data_prefix} 0" + yield "roots_merged:" + yield from roots + + def init_constants(self): + + modulus_base = self.gprs.alloc() + yield f"ASM_LOAD (x{modulus_base}, modulus)" + yield f"ldr q{self.modulus_vector}, [x{modulus_base}]" + self.modulus_lane = 0 + + self.gprs.free(modulus_base) + + if not self.interleave_twiddles: + self.ptr_roots = self.gprs.alloc() + yield f"ASM_LOAD(x{self.ptr_roots}, roots)" + self.ptr_roots_twisted = self.gprs.alloc() + yield f"ASM_LOAD(x{self.ptr_roots_twisted}, roots_twisted)" + else: + self.ptr_roots_merged = self.gprs.alloc() + yield f"ASM_LOAD(x{self.ptr_roots_merged}, roots_merged)" + + self.roots = None + + def get_data(self,index): + if not index in self.data.keys(): + raise Exception(f"Data at index {index} hasn't been loaded") + return self.data[index] + + def load_data(self,index,reg=None): + if index in self.data.keys(): + # Data has already been loaded + return iter([]) + + self.data[index] = self.vregs.alloc(reg) + yield f"ldr q{self.data[index]}, [x{self._src}, #{self.element_size*index}]" + + def release_data(self,index): + if index not in self.data.keys(): + raise Exception(f"Data at index {index} hasn't been loaded") + + self.vregs.free(self.data[index]) + del self.data[index] + + def store_data(self,index,release=True): + if index not in self.data.keys(): + raise Exception(f"Data at index {index} hasn't been loaded") + + yield f"str q{self.data[index]}, [x{self._src}, #{self.element_size*index}]" + + if release: + self.release_data(index) + + def gs_butterfly_single(self, butterfly, i, j, root_index): + + root = butterfly.root(root_index) + root_lane = butterfly.root_lane(root_index) + root_twisted = butterfly.root_twisted(root_index) + root_twisted_lane = butterfly.root_twisted_lane(root_index) + + if root == None: + raise Exception(f"Invalid root, index {root_index}") + if root_twisted == None: + raise Exception(f"Invalid twisted root, index 
{root_index}") + + modulus = self.modulus_vector + modulus_lane = self.modulus_lane + + suf = self.vector_suffix + + # A lane value of None means that we don't want lane-indexing + if root_lane != None: + root_name = f"{root}.s[{root_lane}]" + else: + root_name = f"{root}.{suf}" + + # A lane value of None means that we don't want lane-indexing + if root_twisted_lane != None: + root_twisted_name = f"{root_twisted}.s[{root_twisted_lane}]" + else: + root_twisted_name = f"{root_twisted}.{suf}" + + tmp = self.vregs.alloc() + yield f"sqrdmulh v{tmp}.{suf}, " \ + f"v{self.get_data(butterfly[j])}.{suf}, " \ + f"v{root_twisted_name}" + + yield f"mul v{self.get_data(butterfly[j])}.{suf}, "\ + f"v{self.get_data(butterfly[j])}.{suf}," \ + f"v{root_name}" + + self.vregs.free(tmp) + yield f"mla v{self.get_data(butterfly[j])}.{suf}, v{tmp}.{suf}, "\ + f"v{modulus}.s[{modulus_lane}]" + + tmp = self.vregs.alloc() + a = self.get_data(butterfly[i]) + b = self.get_data(butterfly[j]) + + self.data[butterfly[j]] = tmp + yield f"sub v{tmp}.4s, v{a}.4s, v{b}.4s" + + # Make sure i is still allocated + assert a == self.get_data(butterfly[i]) + + self.vregs.free(b) + yield f"add v{a}.4s, v{a}.4s, v{b}.4s" + + + def copy_root_scalars(self,dst,src): + dst.root_vecs = src.root_vecs + dst.root_twisted_vecs = src.root_twisted_vecs + dst.root = src.root + dst.root_lane = src.root_lane + dst.root_twisted = src.root_twisted + dst.root_twisted_lane = src.root_twisted_lane + + def load_input(self,butterfly, first=False): + if butterfly == None: + return iter([]) + + if butterfly.load_idx >= pow(2,butterfly.merged): + raise Exception("Too many loads") + + if not first: + load_order = butterfly.load_order + else: + load_order = butterfly.load_order_first + + yield from self.load_data(butterfly[load_order[butterfly.load_idx]]) + butterfly.load_idx += 1 + + def store_input(self,butterfly,last=False): + if butterfly == None: + return iter([]) + + if butterfly.store_idx >= pow(2,butterfly.merged): + raise 
Exception("Too many late stores") + + if not last: + store_order = butterfly.store_order + else: + store_order = butterfly.store_order_last + if butterfly.store_idx >= len(store_order): + return iter([]) + + yield from self.store_data(butterfly[store_order[butterfly.store_idx]]) + butterfly.store_idx += 1 + + def transpose4(self,idx): + + # Need four temporaries for the transposition + t = [ None for _ in range(0,4) ] + + t[0] = self.vregs.alloc() + yield f"trn1 v{t[0]}.4S, v{idx(0)}.4S, v{idx(1)}.4S" + t[1] = self.vregs.alloc() + yield f"trn2 v{t[1]}.4S, v{idx(0)}.4S, v{idx(1)}.4S" + t[2] = self.vregs.alloc() + yield f"trn1 v{t[2]}.4S, v{idx(2)}.4S, v{idx(3)}.4S" + t[3] = self.vregs.alloc() + yield f"trn2 v{t[3]}.4S, v{idx(2)}.4S, v{idx(3)}.4S" + + yield f"trn2 v{idx(2)}.2D, v{t[0]}.2D, v{t[2]}.2D" + yield f"trn2 v{idx(3)}.2D, v{t[1]}.2D, v{t[3]}.2D" + + # Do this here and not after the yield + self.vregs.free(t[0]) + self.vregs.free(t[2]) + yield f"trn1 v{idx(0)}.2D, v{t[0]}.2D, v{t[2]}.2D" + + # Do this here and not after the yield + self.vregs.free(t[1]) + self.vregs.free(t[3]) + yield f"trn1 v{idx(1)}.2D, v{t[1]}.2D, v{t[3]}.2D" + + + def load_root_scalars(self,butterfly): + + if butterfly == None or butterfly.load_roots == False: + return iter([]) + + def gen(): + + root_vec_storage = self.vector_storage_per_block_at_layer[butterfly.layer] + root_storage_byte = self.vector_bytelen * root_vec_storage + + r = [ None for _ in range(0, root_vec_storage) ] + rt = [ None for _ in range(0, root_vec_storage) ] + + butterfly.root_vecs = r + butterfly.root_twisted_vecs = rt + + order = butterfly.root_load_order + + if self.increment_root_ptr: + assert(self.interleave_twiddles == False) + if self.multi_access_strategy == 0: + for i in range(0,root_vec_storage): + r[order[i]] = self.vregs.alloc() + yield f"ldr q{r[order[i]]}, [x{self.ptr_roots}], #+{self.vector_bytelen}" + rt[order[i]] = self.vregs.alloc() + yield f"ldr q{rt[order[i]]}, [x{self.ptr_roots_twisted}], 
#+{self.vector_bytelen}" + elif self.multi_access_strategy == 1: + for i in range(0,root_vec_storage,2): + rt[order[i]] = self.vregs.alloc() + rt[order[i+1]] = self.vregs.alloc() # second element of the pair follows the load order too + yield f"ldp q{rt[order[i]]}, q{rt[order[i+1]]}, [x{self.ptr_roots_twisted}], #+{2*self.vector_bytelen}" + r[order[i]] = self.vregs.alloc() + r[order[i+1]] = self.vregs.alloc() + yield f"ldp q{r[order[i]]}, q{r[order[i+1]]}, [x{self.ptr_roots}], #+{2*self.vector_bytelen}" + else: + + offset_base = self.root_offset_for_layer[butterfly.layer] + offset_base += root_storage_byte * butterfly.block + + for i in range(0,root_vec_storage): + + offset = offset_base + order[i] * self.vector_bytelen + if not self.interleave_twiddles: + r[order[i]] = self.vregs.alloc() + yield f"ldr q{r[order[i]]}, [x{self.ptr_roots}, #+{offset}]" + rt[order[i]] = self.vregs.alloc() + yield f"ldr q{rt[order[i]]}, [x{self.ptr_roots_twisted}, #+{offset}]" + else: + r[order[i]] = self.vregs.alloc() + yield f"ldr q{r[order[i]]}, [x{self.ptr_roots_merged}, #+{2*offset+0}]" + rt[order[i]] = self.vregs.alloc() + yield f"ldr q{rt[order[i]]}, [x{self.ptr_roots_merged}, #+{2*offset+self.vector_bytelen}]" + + if butterfly.scalar_load == None: + butterfly.scalar_load = gen() + + return butterfly.scalar_load + + def get_transpose(self,butterfly): + + def butterfly_accessor(idx): + return self.get_data(butterfly[idx]) + + if butterfly == None: + return iter([]) + + if butterfly.transpose == None: + butterfly.transpose = self.transpose4(butterfly_accessor) + + while True: + n = next(butterfly.transpose,None) + if n == None: + break + else: + yield n + + def progress_arithmetic(self,butterfly,idx): + if butterfly == None: + return iter([]) + + yield next(butterfly.gs[idx]) + + def get_schedule_triple_no_transpose(self, idx): + + default = { "load_order": [7,6,4,5,3,2,0,1], + "store_order": [0,1,2,3,6,7,4,5], + "numbering": list(zip([3,2,0,1, 1,0,5,4, 0,2,6,4], + [7,6,4,5, 3,2,7,6, 1,3,7,5], + [0,0,0,0, 1,1,2,2, 3,4,6,5])), + "twiddles": { 0: (0,0), + 1:
(0,1), + 2: (0,2), + 3: (1,0), + 4: (1,1), + 5: (1,2), + 6: (1,3) }, + "root_load_order": list(range(0,10)), + "schedule": None, + } + + modifications = { + + # INDEX 0 + # Trivial implementation, no interleaving whatsoever + 0: { "schedule": + ["m", "l", "l", "l", "l", "l", "l", "l", "l", + 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, "lre", + 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, + 7, 7, 7, 7, 7, + 8, 8, 8, 8, 8, "s", "s", + 9, 9, 9, 9, 9, "s", "s", + 10, 10, 10, 10, 10, "s", "s", + 11, 11, 11, 11, 11, "s", "s", "frl" ] }, + + # INDEX 1 + # - No early loads, few late stores + # - suitable for two inputs + # - Lots of spacing between all GS components + 1: { "schedule": + ["m", "l", "l", 0, -2, -1, 0, -2, "sl", "sl", + "l", "l", 1, -1, 0, 1, -1, "sl", "sl", "frl", + "l", "l", 2, 0, 1, 2, 0, + "l", "l", 3, 1, 2, 3, 1, + 4, 2, 3, 4, 2, + 5, 3, 4, 5, 3, + 6, 4, 5, 6, 4, "lre", + 7, 5, 6, 7, 5, + 8, 6, 7, 8, 6, + 9, 7, 8, 9, 7, + 10, 8, 9, 10, 8, "s", "s", + 11, 9, 10, 11, 9, "s", "s" ] }, + + # INDEX 2 + # - Late-store, but only few early loads + # - GS blocks (mul,mul),(mul),(add,sub) + 2: { "schedule": + ["m", 0, 0, "l", -1, -2, -2, + 1, "sl", 1, "l", 0, -1, -1, "frl", + 2, "sl", 2, "l", 1, 0, 0, + 3, "sl", 3, "l", 2, 1, 1, + 4, "sl", 4, "l", 3, 2, 2, + 5, "sl", 5, "l", 4, 3, 3, + 6, "sl", 6, 5, 4, 4, + 7, "sl", 7, 6, 5, 5, "lres", + 8, "sl", 8, 7, 6, 6, "lres", + 9, 9, 8, 7, 7, "lre", + 10, 10, "le", 9, 8, 8, + 11, 11, "le", 10, 9, 9 ] }, + + # INDEX 3 + # - Extensive pre-loading and late-storing + # - GS blocks (mul,mul),(mul),(add,sub) + 3 : { "schedule": + ["m", 0, 0, -1, -2, -2, + 1, 1, 0, -1, -1, "frl", + 2, 2, "le", 1, 0, 0, + 3, 3, "le", 2, 1, 1, + 4, "sl", 4, "le", 3, 2, 2, + 5, "sl", 5, "le", 4, 3, 3, + 6, "sl", 6, "le", 5, 4, 4, "lre", + 7, "sl", 7, "le", 6, 5, 5, + 8, "sl", 8, "le", 7, 6, 6, + 9, "sl", 9, "le", 8, 7, 7, + 10, "sl", 10, 9, 8, 8, + 11, "sl", 11, 10, 9, 9 ] }, + + # INDEX 4 + # - Extensive pre-loading and 
late-storing + # - GS blocks (mul), (mul) (mul),(add,sub) + 4 : { "schedule": + ["m", 0, -1, 0, -2, -2, + 1, 0, 1, -1, -1, "frl", + 2, 1, "le", 2, 0, 0, + 3, 2, "le", 3, 1, 1, + 4, "sl", 3, "le", 4, 2, 2, + 5, "sl", 4, "le", 5, 3, 3, + 6, "sl", 5, "le", 6, 4, 4, "lre", + 7, "sl", 6, "le", 7, 5, 5, + 8, "sl", 7, "le", 8, 6, 6, + 9, "sl", 8, "le", 9, 7, 7, + 10, "sl", 9, 10, 8, 8, + 11, "sl", 10, 11, 9, 9 ] }, + + # INDEX 5 + # - Extensive pre-loading and late-storing + # - GS blocks (mul),(mul,mul),(add,sub) + 5 : { "schedule": + ["m", 0, -1, -1, -2, -2, + 1, 0, 0, -1, -1, "frl", + 2, "le", 1, 1, 0, 0, + 3, "le", 2, 2, 1, 1, + 4, "sl", "le", 3, 3, 2, 2, + 5, "sl", "le", 4, 4, 3, 3, + 6, "sl", "le", 5, 5, 4, 4, "lre", + 7, "sl", "le", 6, 6, 5, 5, + 8, "sl", "le", 7, 7, 6, 6, + 9, "sl", "le", 8, 8, 7, 7, + 10, "sl", 9, 9, 8, 8, + 11, "sl", 10, 10, 9, 9 ] }, + + # INDEX 6 + # - No early loads, few late stores + # - suitable for two inputs + # - GS blocks (mul,mul),(mul),(add), (sub) + 6 : { "schedule": + ["m", "l", "l", 0, 0, -2, -1, -2, "sl", "sl", + "l", "l", 1, 1, -1, 0, -1, "sl", "sl", "frl", + "l", "l", 2, 2, 0, 1, 0, + "l", "l", 3, 3, 1, 2, 1, + 4, 4, 2, 3, 2, + 5, 5, 3, 4, 3, + 6, 6, 4, 5, 4, "lre", + 7, 7, 5, 6, 5, + 8, 8, 6, 7, 6, + 9, 9, 7, 8, 7, + 10, 10, 8, 9, 8, "s", "s", + 11, 11, 9, 10, 9, "s", "s" ] }, + + # INDEX 7 + # - No early loads, few late stores + # - suitable for two inputs + # - GS blocks (mul,mul),(mul),(add), (sub) + # - scattered stores + 7 : { "schedule": + ["m", "l", "l", 0, 0, "sl", -2, -1, -2, "sl", + "l", "l", 1, "sl", 1, -1, 0, -1, "sl", "frl", + "l", "l", 2, "sl", 2, 0, 1, 0, + "l", "l", 3, 3, 1, 2, 1, + 4, 4, 2, 3, 2, + 5, 5, 3, 4, 3, + 6, 6, 4, 5, 4, "lre", + 7, 7, 5, 6, 5, + 8, 8, 6, 7, 6, + 9, 9, 7, 8, 7, + 10, 10, 8, 9, 8, "s", + 11, 11, "s", 9, 10, 9, "s" ] }, + + # INDEX 8 + # - No early loads, few late stores + # - suitable for two inputs + # - GS blocks (mul,mul),(mul),(add), (sub) + # - stores after muls only, trying to 
avoid them going + # multiply-capable neon units + 8 : { "schedule": + ["m", "l", "l", 0, "sl", 0, "sl", -2, -1, -2, + "l", "l", 1, "sl", 1, "sl", -1, 0, -1, "frl", + "l", "l", 2, "sl", 2, "sl", 0, 1, 0, + "l", "l", 3, 3, 1, 2, 1, + 4, 4, 2, 3, 2, + 5, 5, 3, 4, 3, + 6, 6, 4, 5, 4, "lre", + 7, 7, 5, 6, 5, + 8, 8, 6, 7, 6, + 9, 9, 7, 8, 7, + 10, 10, 8, 9, 8, + 11, "s", 11, "s", 9, 10, 9 ] }, + + # INDEX 9 + # - No early loads, few late stores + # - suitable for two inputs + # - GS blocks (mul,mul),(mul),(add), (sub) + # - stores after muls only, trying to avoid them going + # multiply-capable neon units + 9 : { "schedule": + ["m", "l", "l", 0, "sl", 0, -2, -1, "sl", -2, + "l", "l", 1, "sl", 1, -1, 0, "sl", -1, "frl", + "l", "l", 2, "sl", 2, 0, 1, "sl", 0, + "l", "l", 3, 3, 1, 2, 1, + 4, 4, 2, 3, 2, + 5, 5, 3, 4, 3, + 6, 6, 4, 5, 4, "lre", + 7, 7, 5, 6, 5, + 8, 8, 6, 7, 6, + 9, 9, 7, 8, 7, + 10, 10, 8, 9, 8, + 11, "s", 11, 9, 10, "s", 9 ] }, + + # INDEX 10 + # - No early loads, few late stores + # - suitable for two inputs + # - GS blocks (mul,mul),(mul),(add), (sub) + # - stores after muls only, trying to avoid them going + # multiply-capable neon units + 10 : { "schedule": + ["m", "l", "l", 0, "sl", 0, "sl", -1, -2, -2, + "l", "l", 1, "sl", 1, "sl", 0, -1, -1, "frl", + "l", "l", 2, "sl", 2, "sl", 1, 0, 0, + "l", "l", 3, 3, 2, 1, 1, + 4, 4, 3, 2, 2, + 5, 5, 4, 3, 3, + 6, 6, 5, 4, 4, "lre", + 7, 7, 6, 5, 5, + 8, 8, 7, 6, 6, + 9, 9, 8, 7, 7, + 10, 10, 9, 8, 8, + 11, "s", 11, "s", 10, 9, 9 ] }, + + # INDEX 11 + # - No early loads, few late stores + # - suitable for two inputs + # - GS blocks (mul,mul),(mul),(add), (sub) + # - stores after muls only, trying to avoid them going + # multiply-capable neon units + 11 : { "schedule": + ["m", 0, "sl", 0, "sl", -1, "l", -2, "l", -2, + 1, "sl", 1, "sl", 0, "l", -1, "l", -1, "frl", + 2, "sl", 2, "sl", 1, "l", 0, "l", 0, + 3, 3, 1, 2, 1, + 4, 4, 2, 3, 2, + 5, 5, 3, 4, 3, + 6, 6, 4, 5, 4, "lre", + 7, 7, 5, 6, 5, + 8, 8, 6, 7, 6, + 
9, 9, 7, 8, 7, + 10, 10, 8, 9, 8, + 11, "s", 11, "s", 10, 9, "le", "le", 9 ] }, + + } + + modification = modifications[idx] + + for k,v in modification.items(): + if not k in default.keys(): + raise Exception(f"Invalid modification: {k}") + + dic = { **default, **modification } + + dic["load_order_first"] = dic.get("load_order_first", dic["load_order"]) + dic["store_order_last"] = dic.get("store_order_last", dic["store_order"]) + + return dic + + def get_schedule_double_no_transpose_zipped(self, idx): + + load_order_default = [2,3,0,1] + store_order_default = [0,1,2,3] + butterfly_numbering_default = list(zip([0,1,0,2], + [2,3,1,3], + [0,0,1,2])) + twiddle_numbering_default = { 0: (0,0), + 1: (0,1), + 2: (0,2) } + root_load_order_default = list(range(0,10)) # Identity + + schedules = [ + + # INDEX 0 + # Trivial implementation, no interleaving whatsoever + [ None, None, None, None, + (0, "m"), (0, "l"), (0, "l"), (0, "l"), (0, "l"), + (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), + (0, 1), (0, 1), (0, 1), (0, 1), (0, 1), (0,"lre"), + (0, 2), (0, 2), (0, 2), (0, 2), (0, 2), + (0, 3), (0, 3), (0, 3), (0, 3), (0, 3), + (0,"s"), (0,"s"), (0,"s"), (0,"s"), (0,"frl"), + (1, "m"), (1, "l"), (1, "l"), (1, "l"), (1, "l"), + (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), + (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1,"lre"), + (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), + (1, 3), (1, 3), (1, 3), (1, 3), (1, 3), + (1,"s"), (1,"s"), (1,"s"), (1,"s"), (1,"frl") ] + ] + + load_order, store_order, numbering, twiddles, root_order, schedule = schedules[idx] + + if load_order == None: + load_order = load_order_default + if store_order == None: + store_order = store_order_default + if numbering == None: + numbering = butterfly_numbering_default + if twiddles == None: + twiddles = twiddle_numbering_default + if root_order == None: + root_order = root_load_order_default + + return load_order, store_order, numbering, twiddles, root_order, schedule + + def get_schedule_double_no_transpose_quad_zipped(self, 
idx): + # Returns a dict: default overlaid with modifications[idx], plus load_order_first and store_order_last entries. + default = { "load_order": [2,3,0,1], + "store_order": [0,1,2,3], + "numbering": list(zip([0,1,0,2], + [2,3,1,3], + [0,0,1,2])), + "twiddles": { 0: (0,0), + 1: (0,1), + 2: (0,2) }, + "root_load_order": list(range(0,10)), + "schedule": None } + # Per-index overrides, keyed by idx; every key used in an entry must also exist in default. + modifications = { + + # INDEX 0 + # Trivial implementation, no interleaving whatsoever + 0 : { "schedule": + [ (0, "m"), (0, "l"), (0, "l"), (0, "l"), (0, "l"), + (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), + (0, 1), (0, 1), (0, 1), (0, 1), (0, 1), (0,"lre"), + (0, 2), (0, 2), (0, 2), (0, 2), (0, 2), + (0, 3), (0, 3), (0, 3), (0, 3), (0, 3), + (0,"s"), (0,"s"), (0,"s"), (0,"s"), (0,"frl"), + + (1, "m"), (1, "l"), (1, "l"), (1, "l"), (1, "l"), + (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), + (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1,"lre"), + (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), + (1, 3), (1, 3), (1, 3), (1, 3), (1, 3), + (1,"s"), (1,"s"), (1,"s"), (1,"s"), (1,"frl"), + + (2, "m"), (2, "l"), (2, "l"), (2, "l"), (2, "l"), + (2, 0), (2, 0), (2, 0), (2, 0), (2, 0), + (2, 1), (2, 1), (2, 1), (2, 1), (2, 1), (2,"lre"), + (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), + (2, 3), (2, 3), (2, 3), (2, 3), (2, 3), + (2,"s"), (2,"s"), (2,"s"), (2,"s"), (2,"frl"), + + (3, "m"), (3, "l"), (3, "l"), (3, "l"), (3, "l"), + (3, 0), (3, 0), (3, 0), (3, 0), (3, 0), + (3, 1), (3, 1), (3, 1), (3, 1), (3, 1), (3,"lre"), + (3, 2), (3, 2), (3, 2), (3, 2), (3, 2), + (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), + (3,"s"), (3,"s"), (3,"s"), (3,"s"), (3,"frl") ] }, + + # INDEX 1 + # Interleaved arithmetic + 1 : { "schedule": [ + (0,"m"), (0,"l"), (0,"l"), (0,"l"), (0,"l"), + (1,"m"), (1,"l"), (1,"l"), (1,"l"), (1,"l"), + (2,"m"), (2,"l"), (2,"l"), (2,"l"), (2,"l"), + (3,"m"), (3,"l"), (3,"l"), (3,"l"), (3,"l"), + + (0,"lrs"), (0,"lrs"), + (1,"lrs"), (1,"lrs"), + (2,"lrs"), (2,"lrs"), + (3,"lrs"), (3,"lrs"), + + (0,0), (0,0), + (0,1), (0,1), (0,0), + (1,0), (1,0), (0,1), (0,0), (0,0), + (1,1), (1,1), (1,0), (0,1), (0,1), + (0,2), (0,2), (1,1), (1,0), (1,0), +
(0,3), (0,3), (0,2), (1,1), (1,1), + (1,2), (1,2), (0,3), (0,2), (0,2), (0, "frs"), (0, "frs"), + (1,3), (1,3), (1,2), (0,3), (0,3), + (2,0), (2,0), (1,3), (1,2), (1,2), (1, "frs"), (1, "frs"), + (2,1), (2,1), (2,0), (1,3), (1,3), + (3,0), (3,0), (2,1), (2,0), (2,0), + (3,1), (3,1), (3,0), (2,1), (2,1), + (2,2), (2,2), (3,1), (3,0), (3,0), + (2,3), (2,3), (2,2), (3,1), (3,1), + (3,2), (3,2), (2,3), (2,2), (2,2), (2, "frs"), (2, "frs"), + (3,3), (3,3), (3,2), (2,3), (2,3), + (3,3), (3,2), (3,2), (3, "frs"), (3, "frs"), + (3,3), (3,3), + + (0, "s"), (0, "s"), (0, "s"), (0, "s"), + (1, "s"), (1, "s"), (1, "s"), (1, "s"), + (2, "s"), (2, "s"), (2, "s"), (2, "s"), + (3, "s"), (3, "s"), (3, "s"), (3, "s") ] }, + + # INDEX 2 + # Butterfly-wise interleaving + 2 : { "schedule": [ + (0, "l"), (0, "l"), (0, "l"), + (1, "l"), (1, "l"), (1, "l"), + (2, "l"), (2, "l"), (2, "l"), + (3, "l"), (3, "l"), (3, "l"), + (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, "l"), + (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, "l"), + (2, 0), (2, 0), (2, 0), (2, 0), (2, 0), (2, "l"), + (3, 0), (3, 0), (3, 0), (3, 0), (3, 0), (3, "l"), + (0, 1), (0, 1), (0, 1), (0, 1), (0, 1), + (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), + (2, 1), (2, 1), (2, 1), (2, 1), (2, 1), + (3, 1), (3, 1), (3, 1), (3, 1), (3, 1), + (0, 2), (0, 2), (0, 2), (0, 2), (0, 2), + (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (0, "s"), (0, "s"), + (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (1, "s"), (1, "s"), + (3, 2), (3, 2), (3, 2), (3, 2), (3, 2), (2, "s"), (2, "s"), + (0, 3), (0, 3), (0, 3), (0, 3), (0, 3), (3, "s"), (3, "s"), + (0,"fr"), (0,"lre"), + (1, 3), (1, 3), (1, 3), (1, 3), (1, 3), (1,"fr"), (1,"lre"), + (2, 3), (2, 3), (2, 3), (2, 3), (2, 3), (2,"fr"), (2,"lre"), + (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3,"fr"), (3,"lre"), + (0,"s"), (0,"s"), + (1,"s"), (1,"s"), + (2,"s"), (2,"s"), + (3,"s"), (3,"s"), + ] }, + + # INDEX 3 + # Butterfly-wise interleaving + 3: { "schedule": [ + (0, "l"), (0, "l"), (0, "l"), + (1, "l"), (1, "l"), (1, 
"l"), + (2, "l"), (2, "l"), (2, "l"), + (3, "l"), (3, "l"), (3, "l"), + + (0, 0), (1, 0), (2, 0), (3, 0), (0, "sl"), + (0, 0), (1, 0), (2, 0), (3, 0), (1, "sl"), + (0, 0), (1, 0), (2, 0), (3, 0), (2, "sl"), + (0, 0), (1, 0), (2, 0), (3, 0), (3, "sl"), + (0, 0), (1, 0), (2, 0), (3, 0), + + (0, "l"), + (1, "l"), + (2, "l"), + (3, "l"), + + (0, 1), (1, 1), (2, 1), (3, 1), + (0, 1), (1, 1), (2, 1), (3, 1), + (0, 1), (1, 1), (2, 1), (3, 1), + (0, 1), (1, 1), (2, 1), (3, 1), + (0, 1), (1, 1), (2, 1), (3, 1), + + (0, 2), (1, 2), (2, 2), (3, 2), + (0, 2), (1, 2), (2, 2), (3, 2), + (0, 2), (1, 2), (2, 2), (3, 2), + (0, 2), (1, 2), (2, 2), (3, 2), + (0, 2), (1, 2), (2, 2), (3, 2), + + (0, 3), (1, 3), (2, 3), (3, 3), (0,"s"), (0,"s"), + (0, 3), (1, 3), (2, 3), (3, 3), (1,"s"), (1,"s"), + (0, "fr"), + (1, "fr"), + (2, "fr"), + (3, "fr"), + (0,"lre"), + (1,"lre"), + (0, 3), (1, 3), (2, 3), (3, 3), (2,"s"), (2,"s"), + (2,"lre"), + (0, 3), (1, 3), (2, 3), (3, 3), (3,"s"), (3,"s"), + (3,"lre"), + (0, 3), (1, 3), (2, 3), (3, 3), + + (0, "s"), + (1, "s"), + (2, "s"), + (3, "s"), + ] }, + + # INDEX 4 + # Careful scheduling of arithmetic instructions, + # tailored to microarchitectures like Cortex-X1: + # - 4 SIMD units + # - 2 of them multiply capable + # - Multiply latency 4, but 1-cycle fwd for MUL-MLA + # Note the asymmetry between 0-3 and 4-7, leveraging + # the fast fwd from MUL to MLA. 
+ 4 : { "store_order": [1,0,3,2], + "schedule": + [ + + (0,"lrs"), (0,"lrs"), + (0,"l"), + (0,0), (2,-2), (2,"sl"), + (0,0), (2,-2), (2,"sl"), + (0,"l"), + (0,1), (2,-1), + (0,1), (2,-1), (2,"sl"), (2,"sl"), + (1,"lrs"), (1,"lrs"), + (1,"l"), + (1,0), (3,-2), + (1,0), (3,-2), (3,"sl"), (3,"sl"), + (1,"l"), + (1,1), (3,-1), + (1,1), (3,-1), (3,"sl"), (3,"sl"), + (2,"l"), + (2,"lrs"), (2,"lrs"), + (0,0), + (2,0), + (2,"l"), + (0,1), + (2,1), + (3,"l"), + (3,"lrs"), (3,"lrs"), + (1,0), + (3,0), + (3,"l"), + (1,1), + (3,1), + + (2,"l"), (0,"l"), + (2,0), (0,0), + (2,1), (0,0), + (2,"l"), (0,"l"), + (2,0), (0,1), + (2,1), (0,1), + (3,"l"), (1,"l"), + (3,0), (1,0), + (3,1), (1,0), + (3,"l"), (1,"l"), + (3,0), (1,1), + (3,1), (1,1), + + (0,2), + (0,2), + (0,3), (2,0), + (0,3), (2,0), + (0,"frs"), + (0,"frs"), + (1,2), (2,1), + (1,2), (2,1), + (1,3), (3,0), + (1,3), (3,0), + (1,"frs"), + (1,"frs"), + (0,2), (3,1), + (2,2), (3,1), + (0,3), + (2,3), + (1,2), + (3,2), + (1,3), + (3,3), + + (2,2), (0,2), + (2,3), (0,2), (0,"s"), (0,"s"), + (2,2), (0,3), + (2,3), (0,3), (0,"s"), (0,"s"), + (2,"frs"), + (2,"frs"), + (3,2), (1,2), + (3,3), (1,2), (1,"s"), (1,"s"), + (3,2), (1,3), + (3,3), (1,3), (1,"s"), (1,"s"), + (3,"frs"), + (3,"frs"), + + ] }, + + # INDEX 5 + # Variation of 4 which tries to have blocks + # 2 multiply + 1 add/sub + 1 str + # This combination can in principle keep all SIMD units busy + 5 : { "store_order": [1,0,3,2], + "schedule": [ + + (0,0), (2,-2), + (0,0), (2,"sl"), + (0,"l"), + (0,1), (2,-2), + (0,1), (2,"sl"), + (1,"lrs"), + (1,"lrs"), + (1,"l"), + (1,0), (2,-1), + (1,0), (2,"sl"), + (1,"l"), + (1,1), (2,-1), + (1,1), (2,"sl"), + (2,"lrs"), + (2,"lrs"), + (0,0), (3,-2), + (2,0), (3,"sl"), + (2,"l"), + (0,1), (3,-2), + (2,1), (3,"sl"), + (3,"lrs"), + (3,"lrs"), + (1,0), (3,-1), + (3,0), (3,"sl"), + (3,"l"), + (1,1), (3,-1), + (3,1), (3,"sl"), + + (2,"l"), (0,"l"), + (2,0), (0,0), + (2,1), (0,0), + (2,"l"), (0,"l"), + (2,0), (0,1), + (2,1), (0,1), + 
(3,"l"), (1,"l"), + (3,0), (1,0), + (3,1), (1,0), + (3,"l"), (1,"l"), + (3,0), + (3,1), (1,1), + + (0,2), + (0,2), (1,1), + (0,3), (2,0), + (0,3), (2,0), + (0,"frs"), + (0,"frs"), + (1,2), (2,1), + (1,2), (2,1), + (1,3), (3,0), + (1,3), (3,0), + (1,"frs"), + (1,"frs"), + (0,2), (3,1), (3,"le"), + (2,2), (3,1), + (0,3), (2,"le"), + (2,3), + (1,2), (0,"le"), + (3,2), + (1,3), (0,"lres"), + (3,3), (0,"lres"), + + (2,2), (0,2), (0,"s"), + (2,3), (0,2), (0,"s"), + (2,2), (0,3), (0,"s"), + (2,3), (0,3), (0,"s"), + (2,"frs"), + (2,"frs"), + (3,2), (1,2), (1,"s"), + (3,3), (1,2), (1,"s"), + (3,2), (1,3), (1,"s"), + (3,3), (1,3), (1,"s"), + (3,"frs"), + (3,"frs"), + ] }, + + # INDEX 6 + # Variation of 5 + 6 : { "store_order": [1,0,3,2], + "schedule": [ + + (0,0), (2,-2), + (0,0), (2,"sl"), + (0,"l"), + (0,1), (2,-2), + (0,1), (2,"sl"), + (1,"lrs"), + (1,"lrs"), + (1,"l"), + (1,0), (2,-1), + (1,0), (2,"sl"), + (1,"l"), + (1,1), (2,-1), + (1,1), (2,"sl"), + (2,"lrs"), + (2,"lrs"), + (0,0), (3,-2), + (2,0), (3,"sl"), + (2,"l"), + (0,1), (3,-2), + (2,1), (3,"sl"), + (3,"lrs"), + (3,"lrs"), + (1,0), (3,-1), + (3,0), (3,"sl"), + (3,"l"), + (1,1), (3,-1), + (3,1), (3,"sl"), (0,"l"), + + (2,"l"), + (2,0), (0,0), (0,"l"), + (2,1), (0,0), + (2,"l"), + (2,0), (0,1), (1,"l"), + (2,1), (0,1), + (3,"l"), + (3,0), (1,0), (1,"l"), + (3,1), (1,0), + (3,"l"), + (3,0), (1,1), + (3,1), (1,1), + + (0,2), + (0,2), + (0,3), (2,0), + (0,3), (2,0), + (0,"frs"), + (0,"frs"), + (1,2), (2,1), + (1,2), (2,1), + (1,3), (3,0), + (1,3), (3,0), + (1,"frs"), + (1,"frs"), + (0,2), (3,1), (3,"le"), + (2,2), (3,1), + (0,3), (2,"le"), + (2,3), (0,2), # This isn't ready yet, but at least + # we keep the balance of mul/add/str + # instructions and don't have a + # bottleneck at the end + (1,2), (0,"le"), + (3,2), (0,2), (0,"s"), + (1,3), (0,"lres"), (0,"lres"), + (3,3), (0,3), (0,"s"), + + (2,2), (0,3), + (2,3), (0,"s"), + (2,2), (1,2), + (2,3), (0,"s"), + (2,"frs"), + (2,"frs"), + (3,2), (1,"s"), + (3,3), (1,2), 
(1,"s"), + (3,2), (1,3), (1,"s"), + (3,3), (1,3), (1,"s"), + (3,"frs"), + (3,"frs"), + + ] }, + + # INDEX 7 + # Variation of 6 + 7 : { "store_order": [1,0,3,2], + "schedule": [ + + (0,0), (2,-2), + (0,0), (2,"sl"), + (0,"l"), + (0,1), (2,-2), + (0,1), (2,"sl"), + (1,"lrs"), + (1,"lrs"), + (1,"l"), + (1,0), (2,-1), + (1,0), (2,"sl"), + (1,"l"), + (1,1), (2,-1), + (1,1), (2,"sl"), + (2,"lrs"), + (2,"lrs"), + (0,0), (3,-2), + (2,0), (3,"sl"), + (2,"l"), + (0,1), (3,-2), + (2,1), (3,"sl"), + (3,"lrs"), + (3,"lrs"), + (1,0), (3,-1), + (3,0), (3,"sl"), + (3,"l"), + (1,1), (3,-1), + (3,1), (3,"sl"), (0,"l"), + + (2,"l"), + (2,0), (0,0), (0,"l"), + (2,1), (0,0), + (2,"l"), + (2,0), (0,1), (1,"l"), + (2,1), (0,1), + (3,"l"), + (3,0), (1,0), (1,"l"), + (3,1), (1,0), + (3,"l"), + (3,0), + (3,1), (1,1), + + (0,2), (1,1), + (0,2), + (0,3), (2,0), + (0,3), (2,0), + (0,"frs"), + (0,"frs"), + (1,2), (2,1), + (1,2), (2,1), + (1,3), (3,0), + (1,3), (3,0), + (1,"frs"), + (1,"frs"), + (0,2), (3,1), (3,"le"), + (2,2), (3,1), + (0,3), (2,"le"), + (2,3), (0,2), # This isn't ready yet, but at least + # we keep the balance of mul/add/str + # instructions and don't have a + # bottleneck at the end + (1,2), (0,"le"), + (3,2), (0,2), (0,"s"), + (1,3), (0,"lres"), (0,"lres"), + (3,3), (0,3), (0,"s"), + + (2,2), (0,3), + (2,3), (0,"s"), + (2,2), (1,2), + (2,3), (0,"s"), + (2,"frs"), + (2,"frs"), + (3,2), (1,"s"), + (3,3), (1,2), (1,"s"), + (3,2), (1,3), (1,"s"), + (3,3), (1,3), (1,"s"), + (3,"frs"), + (3,"frs"), + + ] }, + + # INDEX 8 + # Variation of 7, experimentally removing consecutive loads + 8 : { "store_order": [1,0,3,2], + "schedule": + [ + + (0,0), (2,-2), + (0,0), (2,"sl"), + (0,"l"), + (0,1), (2,-2), (1,"lrs"), + (0,1), (2,"sl"), + (1,"lrs"), + (1,"l"), + (1,0), (2,-1), + (1,0), (2,"sl"), + (1,"l"), + (1,1), (2,-1), + (1,1), (2,"sl"), + (2,"lrs"), + (0,0), (3,-2), (2,"lrs"), + (2,0), (3,"sl"), + (2,"l"), + (0,1), (3,-2), + (2,1), (3,"sl"), + (3,"lrs"), + (1,0), (3,-1), (3,"lrs"), + 
(3,0), (3,"sl"), + (3,"l"), + (1,1), (3,-1), + (3,1), (3,"sl"), (0,"l"), + + (2,"l"), + (2,0), (0,0), (0,"l"), + (2,1), (0,0), + (2,"l"), + (2,0), (0,1), (1,"l"), + (2,1), (0,1), + (3,"l"), + (3,0), (1,0), (1,"l"), + (3,1), (1,0), + (3,"l"), + (3,0), + (3,1), (1,1), + + (0,2), (1,1), + (0,2), + (0,3), (2,0), + (0,3), (2,0), + (0,"frs"), + (0,"frs"), + (1,2), (2,1), + (1,2), (2,1), + (1,3), (3,0), + (1,3), (3,0), + (1,"frs"), + (1,"frs"), + (0,2), (3,1), (3,"le"), + (2,2), (3,1), + (0,3), (2,"le"), + (2,3), (0,2), # This isn't ready yet, but at least + # we keep the balance of mul/add/str + # instructions and don't have a + # bottleneck at the end + (1,2), (0,"le"), + (3,2), (0,2), (0,"s"), + (1,3), (0,"lres"), + (3,3), (0,3), (0,"s"), + + (2,2), (0,3), (0,"lres"), + (2,3), (0,"s"), + (2,2), (1,2), + (2,3), (0,"s"), + (2,"frs"), + (2,"frs"), + (3,2), (1,"s"), + (3,3), (1,2), (1,"s"), + (3,2), (1,3), (1,"s"), + (3,3), (1,3), (1,"s"), + (3,"frs"), + (3,"frs"), + + ] }, + + # INDEX 9 + 9 : { "store_order": [1,0,3,2], + "schedule": [ + + (0,0), (2,-2), + (0,0), (2,"sl"), + (0,"l"), + (0,1), (2,-2), + (0,1), (2,"sl"), + (1,"lrs"), + (1,"lrs"), + (1,"l"), + (1,0), (2,-1), + (1,0), (2,"sl"), + (1,"l"), + (1,1), (2,-1), + (1,1), (2,"sl"), + (2,"lrs"), + (2,"lrs"), + (0,0), (3,-2), + (2,0), (3,"sl"), + (2,"l"), + (0,1), (3,-2), + (2,1), (3,"sl"), + (3,"lrs"), + (3,"lrs"), + (1,0), (3,-1), + (3,0), (3,"sl"), + (3,"l"), + (1,1), (3,-1), + (3,1), (3,"sl"), (0,"l"), + + (2,"l"), + (2,0), (0,0), (0,"l"), + (2,1), (0,0), + (2,"l"), + (2,0), (0,1), (1,"l"), + (2,1), (0,1), + (3,"l"), + (3,0), (1,0), (1,"l"), + (3,1), (1,0), + (3,"l"), + (3,0), "nop", + (3,1), (1,1), + + (0,2), (1,1), + (0,2), "nop", + (0,3), (2,0), + (0,3), (2,0), + (0,"frs"), + (0,"frs"), + (1,2), (2,1), + (1,2), (2,1), + (1,3), (3,0), + (1,3), (3,0), + (1,"frs"), + (1,"frs"), + (0,2), (3,1), (3,"le"), + (2,2), (3,1), + (0,3), (2,"le"), + (2,3), (0,2), # This isn't ready yet, but at least + # we keep the balance 
of mul/add/str + # instructions and don't have a + # bottleneck at the end + (1,2), (0,"le"), + (3,2), (0,2), (0,"s"), + (1,3), (0,"lres"), (0,"lres"), + (3,3), (0,3), (0,"s"), + + (2,2), (0,3), + (2,3), (0,"s"), + (2,2), (1,2), + (2,3), (0,"s"), + (2,"frs"), + (2,"frs"), + (3,2), (1,"s"), + (3,3), (1,2), (1,"s"), + (3,2), (1,3), (1,"s"), + (3,3), (1,3), (1,"s"), + (3,"frs"), + (3,"frs"), + + ] }, + + # INDEX 10 + # Based on 7, pairing mul ops, making sure we never have two add/sub/str + # between blocks of two muls + 10 : { "store_order": [1,0,3,2], + "schedule": + [ + (0,0), (0,0), (1,-1), (0,"l"), + (1,"sl"), (1,"lrs"), + + (0,1), (0,1), (2,-2), (1,"lrs"), + (2,"sl"), (1,"l"), + (1,0), (1,0), (2,-2), + (2,"sl"), (1,"l"), + + (1,1), (1,1), (2,-1), (2,"lrs"), + (2,"sl"), (2,"lrs"), + (0,0), (2,0), (2,-1), (2,"l"), + (2,"sl"), + (0,1), (2,1), (3,-2), (3,"lrs"), + (3,"sl"), (3,"lrs"), + (1,0), (3,0), (3,-2), + (3,"sl"), (3,"l"), + (1,1), (3,1), (3,-1), (0,"l"), + (3,"sl"), + + (2,"l"), + (2,0), (2,1), (3,-1), (0,"l"), + (3,"sl"), (2,"l"), + (2,0), (2,1), (0,0), (1,"l"), + (0,0), (3,"l"), + (3,0), (3,1), (0,1), (1,"l"), + (0,1), (3,"l"), + (3,0), (3,1), (1,0), + (1,0), + + (0,2), (0,2), (1,1), + (1,1), + (0,3), (0,3), (2,0), (0,"frs"), + (2,0), (0,"frs"), + (1,2), (1,2), (2,1), + (2,1), + (1,3), (1,3), (3,0), (1,"frs"), + (3,0), (1,"frs"), + + (0,2), (2,2), (3,1), (3,"le"), + (3,1), + (0,3), (2,3), (0,2), (2,"le"), + (0,"s"), + (1,2), (3,2), (0,2), (0,"le"), + (0,"s"), + (1,3), (3,3), (0,3), (0,"lres"), + (0,"s"), + + (2,2), (2,3), (0,3), + (0,"s"), (0,"lres"), + (2,2), (2,3), (1,2), (2,"frs"), + (1,"s"), (2,"frs"), + (3,2), (3,3), (1,2), + (1,"s"), + (3,2), (3,3), (1,3), (3,"frs"), + (1,"s"), (3,"frs"), + + ] }, + + # INDEX 11 + # Based on 10, trying to find a better spacing for LDRs + # Note: - #LDRs per iteration is 4*(4+2)=24 + # - _Exactly_ matches the number of cycles spent on multiplications + # - So we can arrange code in a way that every mul-block has 
precisely + # one LDR in it. That's what we're experimenting with here... + 11 : { "store_order": [1,0,3,2], + "schedule": + [ + (0,0), (0,0), (1,-1), (3,"l"), + (1,"sl"), + (0,1), (0,1), (2,-2), (3,"l"), + (2,"sl"), + (1,0), (1,0), (2,-2), (2,"lrs"), + (2,"sl"), + (1,1), (1,1), (2,-1), (2,"lrs"), + (2,"sl"), + (0,0), (2,0), (2,-1), (3,"lrs"), + (2,"sl"), + (0,1), (2,1), (3,-2), (3,"lrs"), + (3,"sl"), + (1,0), (3,0), (3,-2), (0,"l"), + (3,"sl"), + (1,1), (3,1), (3,-1), (0,"l"), + (3,"sl"), + (2,0), (2,1), (3,-1), (1,"l"), + (3,"sl"), + (2,0), (2,1), (0,0), (1,"l"), + (0,0), + (3,0), (3,1), (0,1), (2,"l"), + (0,1), + (3,0), (3,1), (1,0), (2,"l"), + (1,0), + + (0,2), (0,2), (1,1), (3,"l"), + (1,1), + (0,3), (0,3), (2,0), (3,"l"), (0,"frs"), + (2,0), (0,"frs"), + (1,2), (1,2), (2,1), (0,"le"), + (2,1), + (1,3), (1,3), (3,0), (0,"le"), (1,"frs"), + (3,0), (1,"frs"), + + (0,2), (2,2), (3,1), (1,"le"), + (3,1), + (0,3), (2,3), (0,2), (1,"le"), + (0,"s"), + (1,2), (3,2), (0,2), (0,"lres"), + (0,"s"), + (1,3), (3,3), (0,3), (0,"lres"), + (0,"s"), + + (2,2), (2,3), (0,3), (1,"lres"), + (0,"s"), + (2,2), (2,3), (1,2), (1,"lres"), (2,"frs"), + (1,"s"), (2,"frs"), + (3,2), (3,3), (1,2), (2,"le"), + (1,"s"), + (3,2), (3,3), (1,3), (2,"le"), (3,"frs"), + (1,"s"), (3,"frs"), + + ] }, + + # INDEX 12 + # Based on 11, but using a different load/store order + 12 : { "load_order": [3,2,1,0], + "store_order": [3,2,1,0], + "numbering": list(zip( + [1,0,2,0], + [3,2,3,1], + [0,0,2,1])), + "schedule": + [ + (0,0), (0,0), (1,-1), (3,"l"), + (1,"sl"), + (0,1), (0,1), (2,-2), (3,"l"), + (2,"sl"), + (1,0), (1,0), (2,-2), (2,"lrs"), + (2,"sl"), + (1,1), (1,1), (2,-1), (2,"lrs"), + (2,"sl"), + (0,0), (2,0), (2,-1), (3,"lrs"), + (2,"sl"), + (0,1), (2,1), (3,-2), (3,"lrs"), + (3,"sl"), + (1,0), (3,0), (3,-2), (0,"l"), + (3,"sl"), + (1,1), (3,1), (3,-1), (0,"l"), + (3,"sl"), + (2,0), (2,1), (3,-1), (1,"l"), + (3,"sl"), + (2,0), (2,1), (0,0), (1,"l"), + (0,0), + (3,0), (3,1), (0,1), (2,"l"), + 
(0,1), + (3,0), (3,1), (1,0), (2,"l"), + (1,0), + + (0,2), (0,2), (1,1), (3,"l"), + (1,1), + (0,3), (0,3), (2,0), (3,"l"), (0,"frs"), + (2,0), (0,"frs"), + (1,2), (1,2), (2,1), (0,"le"), + (2,1), + (1,3), (1,3), (3,0), (0,"le"), (1,"frs"), + (3,0), (1,"frs"), + + (0,2), (2,2), (3,1), (1,"le"), + (3,1), + (0,3), (2,3), (0,2), (1,"le"), + (0,"s"), + (1,2), (3,2), (0,2), (0,"lres"), + (0,"s"), + (1,3), (3,3), (0,3), (0,"lres"), + (0,"s"), + + (2,2), (2,3), (0,3), (1,"lres"), + (0,"s"), + (2,2), (2,3), (1,2), (1,"lres"), (2,"frs"), + (1,"s"), (2,"frs"), + (3,2), (3,3), (1,2), (2,"le"), + (1,"s"), + (3,2), (3,3), (1,3), (2,"le"), (3,"frs"), + (1,"s"), (3,"frs"), + + ] }, + + # INDEX 13 + # Based on 11, shifting the whole add/sub/store block up by two places + 13 : { "store_order": [1,0,3,2], + "schedule": + [ + (0,0), (0,0), (2,-2), (3,"l"), + (2,"sl"), + (0,1), (0,1), (2,-2), (3,"l"), + (2,"sl"), + (1,0), (1,0), (2,-1), (2,"lrs"), + (2,"sl"), + (1,1), (1,1), (2,-1), (2,"lrs"), + (2,"sl"), + + (0,0), (2,0), (3,-2), (3,"lrs"), + (3,"sl"), + (0,1), (2,1), (3,-2), (3,"lrs"), + (3,"sl"), + (1,0), (3,0), (3,-1), (0,"l"), + (3,"sl"), + (1,1), (3,1), (3,-1), (0,"l"), + (3,"sl"), + + (2,0), (2,1), (0,0), (1,"l"), + (0,0), + (2,0), (2,1), (0,1), (1,"l"), + (0,1), + (3,0), (3,1), (1,0), (2,"l"), + (1,0), + (3,0), (3,1), (1,1), (2,"l"), + (1,1), + + (0,2), (0,2), (2,0), (3,"l"), + (2,0), + (0,3), (0,3), (2,1), (3,"l"), (0,"frs"), + (2,1), (0,"frs"), + (1,2), (1,2), (3,0), (0,"le"), + (3,0), + (1,3), (1,3), (3,1), (0,"le"), (1,"frs"), + (3,1), (1,"frs"), + + (0,2), (2,2), (0,2), (1,"le"), + (0,"s"), + (0,3), (2,3), (0,2), (1,"le"), + (0,"s"), + (1,2), (3,2), (0,3), (0,"lres"), + (0,"s"), + (1,3), (3,3), (0,3), (0,"lres"), + (0,"s"), + + (2,2), (2,3), (1,2), (1,"lres"), + (1,"s"), + (2,2), (2,3), (1,2), (1,"lres"), (2,"frs"), + (1,"s"), (2,"frs"), + (3,2), (3,3), (1,3), (2,"le"), + (1,"s"), + (3,2), (3,3), (1,3), (2,"le"), (3,"frs"), + (1,"s"), (3,"frs"), + + ] }, + + # INDEX 14 + # 
Merge of 12+13: Shifted add/sub/str's and modified load/store order + 14 : { "load_order": [3,2,1,0], + "store_order": [3,2,1,0], + "numbering": list(zip( + [1,0,2,0], + [3,2,3,1], + [0,0,2,1])), + "schedule": + [ + (0,0), (0,0), (2,-2), (3,"l"), + (2,"sl"), + (0,1), (0,1), (2,-2), (3,"l"), + (2,"sl"), + (1,0), (1,0), (2,-1), (2,"lrs"), + (2,"sl"), + (1,1), (1,1), (2,-1), (2,"lrs"), + (2,"sl"), + + (0,0), (2,0), (3,-2), (3,"lrs"), + (3,"sl"), + (0,1), (2,1), (3,-2), (3,"lrs"), + (3,"sl"), + (1,0), (3,0), (3,-1), (0,"l"), + (3,"sl"), + (1,1), (3,1), (3,-1), (0,"l"), + (3,"sl"), + + (2,0), (2,1), (0,0), (1,"l"), + (0,0), + (2,0), (2,1), (0,1), (1,"l"), + (0,1), + (3,0), (3,1), (1,0), (2,"l"), + (1,0), + (3,0), (3,1), (1,1), (2,"l"), + (1,1), + + (0,2), (0,2), (2,0), (3,"l"), + (2,0), + (0,3), (0,3), (2,1), (3,"l"), (0,"frs"), + (2,1), (0,"frs"), + (1,2), (1,2), (3,0), (0,"le"), + (3,0), + (1,3), (1,3), (3,1), (0,"le"), (1,"frs"), + (3,1), (1,"frs"), + + (0,2), (2,2), (0,2), (1,"le"), + (0,"s"), + (0,3), (2,3), (0,2), (1,"le"), + (0,"s"), + (1,2), (3,2), (0,3), (0,"lres"), + (0,"s"), + (1,3), (3,3), (0,3), (0,"lres"), + (0,"s"), + + (2,2), (2,3), (1,2), (1,"lres"), + (1,"s"), + (2,2), (2,3), (1,2), (1,"lres"), (2,"frs"), + (1,"s"), (2,"frs"), + (3,2), (3,3), (1,3), (2,"le"), + (1,"s"), + (3,2), (3,3), (1,3), (2,"le"), (3,"frs"), + (1,"s"), (3,"frs"), + + ] }, + + # INDEX 15 + # Based on 11, moving add/sub/str's down by two places, and changing load/store order + # Moving down the add/sub/str's reduces pressure on the corresponding SIMD unit because + # the last add/sub/str's are farther away from their producers. 
+ 15 : { "load_order": [3,2,1,0], + "store_order": [3,2,1,0], + "numbering": list(zip( + [1,0,2,0], + [3,2,3,1], + [0,0,2,1])), + "schedule": + [ + (0,0), (0,0), (1,-1), (3,"l"), + (1,"sl"), + (0,1), (0,1), (1,-1), (3,"l"), + (1,"sl"), + (1,0), (1,0), (2,-2), (2,"lrs"), + (2,"sl"), + (1,1), (1,1), (2,-2), (2,"lrs"), + (2,"sl"), + (0,0), (2,0), (2,-1), (3,"lrs"), + (2,"sl"), + (0,1), (2,1), (2,-1), (3,"lrs"), + (2,"sl"), + (1,0), (3,0), (3,-2), (0,"l"), + (3,"sl"), + (1,1), (3,1), (3,-2), (0,"l"), + (3,"sl"), + (2,0), (2,1), (3,-1), (1,"l"), + (3,"sl"), + (2,0), (2,1), (3,-1), (1,"l"), + (3,"sl"), + (3,0), (3,1), (0,0), (2,"l"), + (0,0), + (3,0), (3,1), (0,1), (2,"l"), + (0,1), + (0,2), (0,2), (1,0), (3,"l"), + (1,0), + (0,3), (0,3), (1,1), (3,"l"), (0,"frs"), + (1,1), (0,"frs"), + (1,2), (1,2), (2,0), (0,"le"), + (2,0), + (1,3), (1,3), (2,1), (0,"le"), (1,"frs"), + (2,1), (1,"frs"), + (0,2), (2,2), (3,0), (1,"le"), + (3,0), + (0,3), (2,3), (3,1), (1,"le"), + (3,1), + (1,2), (3,2), (0,2), (0,"lres"), + (0,"s"), + (1,3), (3,3), (0,2), (0,"lres"), + (0,"s"), + (2,2), (2,3), (0,3), (1,"lres"), + (0,"s"), + (2,2), (2,3), (0,3), (1,"lres"), (2,"frs"), + (0,"s"), (2,"frs"), + (3,2), (3,3), (1,2), (2,"le"), + (1,"s"), + (3,2), (3,3), (1,2), (2,"le"), (3,"frs"), + (1,"s"), (3,"frs"), + + ] }, + + # INDEX 16 + # Deliberately bad, bunch lots of MULs + 16 : { "schedule": + [ (0, "m"), (0, "l"), (0, "l"), (0, "l"), (0, "l"), (0,"lre"), + (0, 0), (0, 0), (0, 0), + (0, 1), (0, 1), (0, 1), + (0, 0), (0, 0), + (0, 1), (0, 1), + (0, 2), (0, 2), (0, 2), + (0, 3), (0, 3), (0, 3), + (0, 2), (0, 2), + (0, 3), (0, 3), + (0,"s"), (0,"s"), (0,"s"), (0,"s"), (0,"frl"), + + (1, "m"), (1, "l"), (1, "l"), (1, "l"), (1, "l"), (1,"lre"), + (1, 0), (1, 0), (1, 0), + (1, 1), (1, 1), (1, 1), + (1, 0), (1, 0), + (1, 1), (1, 1), + (1, 2), (1, 2), (1, 2), + (1, 3), (1, 3), (1, 3), + (1, 2), (1, 2), + (1, 3), (1, 3), + (1,"s"), (1,"s"), (1,"s"), (1,"s"), (1,"frl"), + + (2, "m"), (2, "l"), (2, "l"), (2, 
"l"), (2, "l"), (2,"lre"), + (2, 0), (2, 0), (2, 0), + (2, 1), (2, 1), (2, 1), + (2, 0), (2, 0), + (2, 1), (2, 1), + (2, 2), (2, 2), (2, 2), + (2, 3), (2, 3), (2, 3), + (2, 2), (2, 2), + (2, 3), (2, 3), + (2,"s"), (2,"s"), (2,"s"), (2,"s"), (2,"frl"), + + (3, "m"), (3, "l"), (3, "l"), (3, "l"), (3, "l"), (3,"lre"), + (3, 0), (3, 0), (3, 0), + (3, 1), (3, 1), (3, 1), + (3, 0), (3, 0), + (3, 1), (3, 1), + (3, 2), (3, 2), (3, 2), + (3, 3), (3, 3), (3, 3), + (3, 2), (3, 2), + (3, 3), (3, 3), + (3,"s"), (3,"s"), (3,"s"), (3,"s"), (3,"frl") ] }, + } + + modification = modifications[idx] + + for k,v in modification.items(): + if not k in default.keys(): + raise Exception(f"Invalid modification: {k}") + + dic = { **default, **modification } + + dic["load_order_first"] = dic.get("load_order_first", dic["load_order"]) + dic["store_order_last"] = dic.get("store_order_last", dic["store_order"]) + + return dic + + def get_schedule_double_no_transpose(self, idx): + + load_order_default = [2,3,0,1] + store_order_default = [0,1,2,3] + butterfly_numbering_default = list(zip([0,1,0,2], + [2,3,1,3], + [0,0,1,2])) + twiddle_numbering_default = { 0: (0,0), + 1: (0,1), + 2: (0,2) } + root_load_order_default = list(range(0,10)) # Identity + + schedules = [ + + # INDEX 0 + # Trivial implementation, no interleaving whatsoever + (None, None, None, None, None, + ["m", "frl", + "l", "l", "l", "l", + 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, "lre", + 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, "s", "s", "s", "s" ]) + ] + + load_order, store_order, numbering, twiddles, root_order, schedule = schedules[idx] + + if load_order == None: + load_order = load_order_default + if store_order == None: + store_order = store_order_default + if numbering == None: + numbering = butterfly_numbering_default + if twiddles == None: + twiddles = twiddle_numbering_default + if root_order == None: + root_order = root_load_order_default + + return load_order, store_order, numbering, twiddles, root_order, schedule + + def 
get_schedule_quad_transpose(self, idx): + + load_order_default = [2,3,0,1] + store_order_default = [0,1,2,3] + butterfly_numbering_default = \ + list(zip([0,1,0,2, 0,1,0,2], + [2,3,1,3, 2,3,1,3], + [0,0,1,2, 3,3,4,5])) + twiddle_numbering_default = { 0: (0,0), + 1: (0,1), + 2: (0,2), + 3: (1,None), + 4: (2,None), + 5: (3,None) } + root_load_order_default = list(range(0,10)) # Identity + + schedules = [ + + # INDEX 0 + # Trivial implementation, no interleaving whatsoever + ( None, None, None, None, None, # All defaults + ["m", "frl", "l", "l", "l", "l", + 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, + "t", + 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, + 7, 7, 7, 7, 7, + "s", "s", "s", "s", "lre" ] ) + ] + + load_order, store_order, numbering, twiddles, root_order, schedule = schedules[idx] + + if load_order == None: + load_order = load_order_default + if store_order == None: + store_order = store_order_default + if numbering == None: + numbering = butterfly_numbering_default + if twiddles == None: + twiddles = twiddle_numbering_default + if root_order == None: + root_order = root_load_order_default + + return load_order, store_order, numbering, twiddles, root_order, schedule + + def get_schedule_quad_transpose_zipped(self, idx): + + load_order_default = [2,3,0,1] + store_order_default = [0,1,2,3] + butterfly_numbering_default = \ + list(zip([0,1,0,2, 0,1,0,2], + [2,3,1,3, 2,3,1,3], + [0,0,1,2, 3,3,4,5])) + twiddle_numbering_default = { 0: (0,0), + 1: (0,1), + 2: (0,2), + 3: (1,None), + 4: (2,None), + 5: (3,None) } + root_load_order_default = list(range(0,10)) # Identity + + schedules = [ + + # INDEX 0 + # Trivial implementation, no interleaving whatsoever + ( None, None, None, None, None, # All defaults + [(0,"m"), (0,"lr"), (0,"l"), (0,"l"), (0,"l"), (0,"l"), + (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), + (0, 1), (0, 1), (0, 1), (0, 1), (0, 1), + (0, 2), (0, 2), (0, 2), (0, 2), (0, 2), + (0, 3), (0, 3), (0, 3), (0, 3), (0, 3), + (0, "t"), + 
(0, 4), (0, 4), (0, 4), (0, 4), (0, 4), + (0, 5), (0, 5), (0, 5), (0, 5), (0, 5), + (0, 6), (0, 6), (0, 6), (0, 6), (0, 6), + (0, 7), (0, 7), (0, 7), (0, 7), (0, 7), + (0, "s"), (0,"s"), (0,"s"), (0,"s"), (0,"fr"), + + (1,"m"), (1,"lr"), (1,"l"), (1,"l"), (1,"l"), (1,"l"), + (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), + (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), + (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), + (1, 3), (1, 3), (1, 3), (1, 3), (1, 3), + (1, "t"), + (1, 4), (1, 4), (1, 4), (1, 4), (1, 4), + (1, 5), (1, 5), (1, 5), (1, 5), (1, 5), + (1, 6), (1, 6), (1, 6), (1, 6), (1, 6), + (1, 7), (1, 7), (1, 7), (1, 7), (1, 7), + (1, "s"), (1,"s"), (1,"s"), (1,"s"), (1,"fr")] ), + + # INDEX 1 + # Zipped together two trivial implementations + ( None, None, None, None, None, # All defaults + [(0,"m"), (0,"lr"), (0,"l"), (0,"l"), (0,"l"), (0,"l"), + (1,"m"), (1,"lr"), (1,"l"), (1,"l"), (1,"l"), (1,"l"), + + (0,0),(0,0),(0,0),(0,0),(0,0), + (1,0),(1,0),(1,0),(1,0),(1,0), + + (0,1),(0,1),(0,1),(0,1),(0,1), + (1,1),(1,1),(1,1),(1,1),(1,1), + + (0,2),(0,2),(0,2),(0,2),(0,2), + (1,2),(1,2),(1,2),(1,2),(1,2), + + (0,3),(0,3),(0,3),(0,3),(0,3), + (1,3),(1,3),(1,3),(1,3),(1,3), + + (0,"t"), + (1,"t"), + + (0,4),(0,4),(0,4),(0,4),(0,4), + (1,4),(1,4),(1,4),(1,4),(1,4), + + (0,5),(0,5),(0,5),(0,5),(0,5), + (1,5),(1,5),(1,5),(1,5),(1,5), + + (0,6),(0,6),(0,6),(0,6),(0,6), + (1,6),(1,6),(1,6),(1,6),(1,6), + + (0,7),(0,7),(0,7),(0,7),(0,7), + (1,7),(1,7),(1,7),(1,7),(1,7), + + (0,"s"),(0,"s"),(0,"s"),(0,"s"),(0,"fr"), + (1,"s"),(1,"s"),(1,"s"),(1,"s"),(1,"fr")] ), + + # INDEX 2 + # Zipped together slightly different, but still at butterfly granularity + ( None, None, None, None, None, # All defaults + [(0,"m"), (0,"lr"), (0,"l"), (0,"l"), (0,"l"), (0,"l"), + (1,"m"), (1,"lr"), (1,"l"), (1,"l"), (1,"l"), (1,"l"), + + (0,0),(0,0),(0,0),(0,0),(0,0), + (0,1),(0,1),(0,1),(0,1),(0,1), + + (1,0),(1,0),(1,0),(1,0),(1,0), + (1,1),(1,1),(1,1),(1,1),(1,1), + + (0,2),(0,2),(0,2),(0,2),(0,2), + 
(0,3),(0,3),(0,3),(0,3),(0,3), + + (1,2),(1,2),(1,2),(1,2),(1,2), + (1,3),(1,3),(1,3),(1,3),(1,3), + + (0,"t"), + (1,"t"), + + (0,4),(0,4),(0,4),(0,4),(0,4), + (0,5),(0,5),(0,5),(0,5),(0,5), + + (1,4),(1,4),(1,4),(1,4),(1,4), + (1,5),(1,5),(1,5),(1,5),(1,5), + + (0,6),(0,6),(0,6),(0,6),(0,6), + (0,7),(0,7),(0,7),(0,7),(0,7), + + (1,6),(1,6),(1,6),(1,6),(1,6), + (1,7),(1,7),(1,7),(1,7),(1,7), + + (0,"s"),(0,"s"),(0,"s"),(0,"s"),(0,"fr"), + (1,"s"),(1,"s"),(1,"s"),(1,"s"),(1,"fr")] ), + + # INDEX 3 + # Interleave some loads + ( None, None, None, None, None, # All defaults + [(0,"m"), (0,"lrs"), (0,"lrs"), (0,"l"), (0,"l"), + (1,"m"), (1,"lrs"), (1,"lrs"), + + (0,0), (1,"l"), (0,0), (1,"l"), (0,0),(0,0),(0,0), + (0,1), (1,"l"), (0,1), (1,"l"), (0,1),(0,1),(0,1), + + (1,0),(1,0),(1,0),(1,0),(1,0), + (1,1),(1,1),(1,1),(1,1),(1,1), + + (0,2),(0,2),(0,2),(0,2),(0,2), + (0,3),(0,3),(0,3),(0,3),(0,3), + + (1,2),(1,2),(1,2),(1,2),(1,2), + (1,3),(1,3),(1,3),(1,3),(1,3), + + (0,"t"), (0,"lrs"), (0,"lrs"), + (1,"t"), (1,"lrs"), (1,"lrs"), + + (0,4),(0,4),(0,4),(0,4),(0,4), + (0,5),(0,5),(0,5),(0,5),(0,5), + + (1,4),(1,4),(1,4),(1,4),(1,4), (0,"lrs"), (0,"lrs"), + (1,5),(1,5),(1,5),(1,5),(1,5), (0,"lrs"), (0,"lrs"), + + (0,6),(0,6),(0,6),(0,6),(0,6), (1,"lrs"), (1,"lrs"), + (0,7),(0,7),(0,7),(0,7),(0,7), (1,"lrs"), (1,"lrs"), + + (1,6), (0, "le"), (1,6), (1,6),(1,6),(1,6), + (1,7), (0, "le"), (1,7), (1,7),(1,7),(1,7), + + (0,"s"),(0,"s"), (0,"s"),(0,"s"), (0,"fr"), + (1,"s"),(1,"s"),(1,"s"),(1,"s"),(1,"fr")] ), + + # INDEX 4 + # Interleave loads + transposition + ( None, None, None, None, None, # All defaults + [(0,"m"), (0,"lrs"), (0,"lrs"), (0,"l"), (0,"l"), + (1,"m"), (1,"lrs"), (1,"lrs"), + + (0,0), (1,"l"), (0,0), (1,"l"), (0,0),(0,0),(0,0), + (0,1), (1,"l"), (0,1), (1,"l"), (0,1),(0,1),(0,1), + + (1,0),(1,0),(1,0),(1,0),(1,0), + (1,1),(1,1),(1,1),(1,1),(1,1), + + (0,2),(0,2),(0,2),(0,2),(0,2), + (0,3),(0,3),(0,3),(0,3),(0,3), + + (1,2), (1,2), (1,2), + (1,2), (0,"ts"), 
(0,"ts"), + (1,2), (0,"ts"), (0,"ts"), + (1,3), (0,"lrs"), (1,3), (0,"lrs"), + (1,3), (0,"ts"), (0,"ts"), + (1,3), (0,"ts"), (0,"ts"), + (1,3), + + + (0,4), (0,4), + (0,4), (1, "ts"), (1, "ts"), + (0,4), (1, "ts"), (1, "ts"), + (0,4), (1, "ts"), (1, "ts"), + (0,5), (1, "ts"), (1, "ts"), + (1,"lrs"), (1,"lrs"), (0,5),(0,5),(0,5),(0,5), + + (1,4),(1,4),(1,4),(1,4),(1,4), (0,"lrs"), (0,"lrs"), + (1,5),(1,5),(1,5),(1,5),(1,5), (0,"lrs"), (0,"lrs"), + + (0,6),(0,6),(0,6),(0,6),(0,6), (1,"lrs"), (1,"lrs"), + (0,7),(0,7),(0,7),(0,7),(0,7), (1,"lrs"), (1,"lrs"), + + (1,6), (0, "le"), (1,6), (1,6),(1,6),(1,6), + (1,7), (0, "le"), (1,7), (1,7),(1,7),(1,7), + + (0,"s"),(0,"s"), (0,"s"),(0,"s"), (0,"fr"), + (1,"s"),(1,"s"),(1,"s"),(1,"s"),(1,"fr")] ), + + # INDEX 5 + # Interleave arithmetic only + ( None, None, None, None, None, # All defaults + [(0,"m"), (0,"lrs"), (0,"lrs"), (0,"l"), (0,"l"), + (1,"m"), (1,"lrs"), (1,"lrs"), + + (0,0), (1,"l"), (0,0), (1,"l"), + (0,1), (1,"l"), (0,1), (1,"l"), (0,0), + (1,0), (1,0), (0,1), (0,0), (0,0), + (1,1), (1,1), (1,0), (0,1), (0,1), + (0,2), (0,2), (1,1), (1,0), (1,0), + (0,3), (0,3), (0,2), (1,1), (1,1), + (1,2), (1,2), (0,3), (0,2), (0,2), + (1,3), (1,3), (1,2), (0,3), (0,3), + (0, "t"), (0, "lrs"), (0, "lrs"), + (0,4), (0,4), (1,3), (1,2), (1,2), + (0,5), (0,5), (0,4), (1,3), (1,3), + (1, "t"), (1, "lrs"), (1, "lrs"), + (1,4), + (0,"lrs"), (0,"lrs"), + (1,4), (0,5), (0,4), (0,4), + (1,5), (0,"lrs"), (0,"lrs"), (1,5), (1,4), (0,5), (0,5), + (0,6), (1,"lrs"), (1,"lrs"), (0,6), (1,5), (1,4), (1,4), + (0,7), (1,"lrs"), (1,"lrs"), (0,7), (0,6), (1,5), (1,5), + (1,6), (0, "le"), (1,6), (0,7), (0,6), (0,6), + (1,7), (0, "le"), (1,7), (1,6), (0,7), (0,7), + (1,7), (1,6), (1,6), + (1,7), (1,7), + + (0,"s"),(0,"s"), (0,"s"),(0,"s"), (0,"fr"), + (1,"s"),(1,"s"),(1,"s"),(1,"s"),(1,"fr")] ), + + ] + + load_order, store_order, numbering, twiddles, root_order, schedule = schedules[idx] + + if load_order == None: + load_order = load_order_default 
+ if store_order == None: + store_order = store_order_default + if numbering == None: + numbering = butterfly_numbering_default + if twiddles == None: + twiddles = twiddle_numbering_default + if root_order == None: + root_order = root_load_order_default + + return load_order, store_order, numbering, twiddles, root_order, schedule + + def get_schedule_quad_transpose_quad_zipped(self, idx): + + load_order_default = [2,3,0,1] + store_order_default = [0,1,2,3] + butterfly_numbering_default = \ + list(zip([0,1,0,2, 0,1,0,2], + [2,3,1,3, 2,3,1,3], + [0,0,1,2, 3,3,4,5])) + twiddle_numbering_default = { 0: (0,0), + 1: (0,1), + 2: (0,2), + 3: (1,None), + 4: (2,None), + 5: (3,None) } + root_load_order_default = list(range(0,10)) # Identity + + schedules = [ + + # INDEX 0 + # Trivial implementation, no interleaving whatsoever + ( None, None, None, None, None, # All defaults + [(0,"m"), (0,"lr"), (0,"l"), (0,"l"), (0,"l"), (0,"l"), + (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), + (0, 1), (0, 1), (0, 1), (0, 1), (0, 1), + (0, 2), (0, 2), (0, 2), (0, 2), (0, 2), + (0, 3), (0, 3), (0, 3), (0, 3), (0, 3), + (0, "t"), + (0, 4), (0, 4), (0, 4), (0, 4), (0, 4), + (0, 5), (0, 5), (0, 5), (0, 5), (0, 5), + (0, 6), (0, 6), (0, 6), (0, 6), (0, 6), + (0, 7), (0, 7), (0, 7), (0, 7), (0, 7), + (0, "s"), (0,"s"), (0,"s"), (0,"s"), (0,"fr"), + + (1,"m"), (1,"lr"), (1,"l"), (1,"l"), (1,"l"), (1,"l"), + (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), + (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), + (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), + (1, 3), (1, 3), (1, 3), (1, 3), (1, 3), + (1, "t"), + (1, 4), (1, 4), (1, 4), (1, 4), (1, 4), + (1, 5), (1, 5), (1, 5), (1, 5), (1, 5), + (1, 6), (1, 6), (1, 6), (1, 6), (1, 6), + (1, 7), (1, 7), (1, 7), (1, 7), (1, 7), + (1, "s"), (1,"s"), (1,"s"), (1,"s"), (1,"fr"), + + (2,"m"), (2,"lr"), (2,"l"), (2,"l"), (2,"l"), (2,"l"), + (2, 0), (2, 0), (2, 0), (2, 0), (2, 0), + (2, 1), (2, 1), (2, 1), (2, 1), (2, 1), + (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), + (2, 3), (2, 3), (2, 3), 
(2, 3), (2, 3), + (2, "t"), + (2, 4), (2, 4), (2, 4), (2, 4), (2, 4), + (2, 5), (2, 5), (2, 5), (2, 5), (2, 5), + (2, 6), (2, 6), (2, 6), (2, 6), (2, 6), + (2, 7), (2, 7), (2, 7), (2, 7), (2, 7), + (2, "s"), (2,"s"), (2,"s"), (2,"s"), (2,"fr"), + + (3,"m"), (3,"lr"), (3,"l"), (3,"l"), (3,"l"), (3,"l"), + (3, 0), (3, 0), (3, 0), (3, 0), (3, 0), + (3, 1), (3, 1), (3, 1), (3, 1), (3, 1), + (3, 2), (3, 2), (3, 2), (3, 2), (3, 2), + (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), + (3, "t"), + (3, 4), (3, 4), (3, 4), (3, 4), (3, 4), + (3, 5), (3, 5), (3, 5), (3, 5), (3, 5), + (3, 6), (3, 6), (3, 6), (3, 6), (3, 6), + (3, 7), (3, 7), (3, 7), (3, 7), (3, 7), + (3, "s"), (3,"s"), (3,"s"), (3,"s"), (3,"fr")] ), + + # INDEX 1 + # Interleave pre- and post-transpose arithmetic + ( None, None, None, None, None, # All defaults + [(0,"m"), (0,"l"), (0,"l"), (0,"l"), (0,"l"), + (1,"m"), (1,"l"), (1,"l"), (1,"l"), (1,"l"), + (2,"m"), (2,"l"), (2,"l"), (2,"l"), (2,"l"), + (3,"m"), (3,"l"), (3,"l"), (3,"l"), (3,"l"), + + (0,"lrs"), (0,"lrs"), + (1,"lrs"), (1,"lrs"), + (2,"lrs"), (2,"lrs"), + (3,"lrs"), (3,"lrs"), + + (0,0), (0,0), + (0,1), (0,1), (0,0), + (1,0), (1,0), (0,1), (0,0), (0,0), + (1,1), (1,1), (1,0), (0,1), (0,1), + (0,2), (0,2), (1,1), (1,0), (1,0), + (0,3), (0,3), (0,2), (1,1), (1,1), + (1,2), (1,2), (0,3), (0,2), (0,2), (0, "frs"), (0, "frs"), + (1,3), (1,3), (1,2), (0,3), (0,3), + (2,0), (2,0), (1,3), (1,2), (1,2), (1, "frs"), (1, "frs"), + (2,1), (2,1), (2,0), (1,3), (1,3), + (3,0), (3,0), (2,1), (2,0), (2,0), + (3,1), (3,1), (3,0), (2,1), (2,1), + (2,2), (2,2), (3,1), (3,0), (3,0), + (2,3), (2,3), (2,2), (3,1), (3,1), + (3,2), (3,2), (2,3), (2,2), (2,2), (2, "frs"), (2, "frs"), + (3,3), (3,3), (3,2), (2,3), (2,3), + (3,3), (3,2), (3,2), (3, "frs"), (3, "frs"), + (3,3), (3,3), + + (0, "t"), + (1, "t"), + (2, "t"), + (3, "t"), + + (0, "lrs"), (0, "lrs"), + (0, 4), (0, 4), + (0, 5), (0, 5), + (0, "frs"), (0, "frs"), + (0, 4), + (1, "lrs"), (1, "lrs"), + (1, 4), (1, 4), (0, 5), 
(0, 4), (0, 4), + (1, 5), (1, 5), + (1, "frs"), (1, "frs"), + (1, 4), (0, 5), (0, 5), + (0, "lrs"), (0, "lrs"), + (0, 6), (0, 6), + (0, "frs"), (0, "frs"), + (1, 5), (1, 4), (1, 4), + (0, "lrs"), (0, "lrs"), + (0, 7), (0, 7), + (0, "frs"), (0, "frs"), + (0, 6), (1, 5), (1, 5), + (1, "lrs"), (1, "lrs"), + (1, 6), (1, 6), + (1, "frs"), (1, "frs"), + (0, 7), (0, 6), (0, 6), + (1, "lrs"), (1, "lrs"), + (1, 7), (1, 7), + (1, "frs"), (1, "frs"), + (1, 6), (0, 7), (0, 7), + (1, 7), (1, 6), (1, 6), + (1, 7), (1, 7), + + (0, "s"), (0,"s"), (0,"s"), (0,"s"), + (1, "s"), (1,"s"), (1,"s"), (1,"s"), + + (2, "lrs"), (2, "lrs"), + (2, 4), (2, 4), + (2, 5), (2, 5), + (2, "frs"), (2, "frs"), + (2, 4), + (3, "lrs"), (3, "lrs"), + (3, 4), (3, 4), (2, 5), (2, 4), (2, 4), + (3, 5), (3, 5), + (3, "frs"), (3, "frs"), + (3, 4), (2, 5), (2, 5), + (2, "lrs"), (2, "lrs"), + (2, 6), (2, 6), + (2, "frs"), (2, "frs"), + (3, 5), (3, 4), (3, 4), + (2, "lrs"), (2, "lrs"), + (2, 7), (2, 7), + (2, "frs"), (2, "frs"), + (2, 6), (3, 5), (3, 5), + (3, "lrs"), (3, "lrs"), + (3, 6), (3, 6), + (3, "frs"), (3, "frs"), + (2, 7), (2, 6), (2, 6), + (3, "lrs"), (3, "lrs"), + (3, 7), (3, 7), + (3, "frs"), (3, "frs"), + (3, 6), (2, 7), (2, 7), + (3, 7), (3, 6), (3, 6), + (3, 7), (3, 7), + + (2, "s"), (2,"s"), (2,"s"), (2,"s"), + (3, "s"), (3,"s"), (3,"s"), (3,"s") ] ), + + # INDEX 2 + # Interleave pre- and post-transpose arithmetic, and transpose + ( None, None, None, None, None, # All defaults + [(0,"m"), (0,"l"), (0,"l"), (0,"l"), (0,"l"), + (1,"m"), (1,"l"), (1,"l"), (1,"l"), (1,"l"), + (2,"m"), (2,"l"), (2,"l"), (2,"l"), (2,"l"), + (3,"m"), (3,"l"), (3,"l"), (3,"l"), (3,"l"), + + (0,"lrs"), (0,"lrs"), + (1,"lrs"), (1,"lrs"), + (2,"lrs"), (2,"lrs"), + (3,"lrs"), (3,"lrs"), + + (0,0), (0,0), + (0,1), (0,1), (0,0), + (1,0), (1,0), (0,1), (0,0), (0,0), + (1,1), (1,1), (1,0), (0,1), (0,1), + (0,2), (0,2), (1,1), (1,0), (1,0), + (0,3), (0,3), (0,2), (1,1), (1,1), + (1,2), (1,2), (0,3), (0,2), (0,2), (0, "frs"), 
(0, "frs"), + (1,3), (1,3), (1,2), (0,3), (0,3), + (2,0), (2,0), + (0, "ts"), + (0, "ts"), + (1,3), (1,2), (1,2), (1, "frs"), (1, "frs"), + (2,1), (2,1), + (0, "ts"), + (0, "ts"), + (2,0), (1,3), (1,3), + (3,0), (3,0), + (0, "ts"), + (0, "ts"), + (2,1), (2,0), (2,0), + (3,1), (3,1), + (0, "ts"), + (0, "ts"), + (3,0), (2,1), (2,1), + (2,2), (2,2), + (1, "ts"), + (1, "ts"), + (3,1), (3,0), (3,0), + (2,3), (2,3), + (1, "ts"), + (1, "ts"), + (2,2), (3,1), (3,1), + (3,2), (3,2), + (1, "ts"), + (1, "ts"), + (2,3), (2,2), (2,2), (2, "frs"), (2, "frs"), + (3,3), (3,3), + (1, "ts"), + (1, "ts"), + (3,2), (2,3), (2,3), + (3,3), (3,2), (3,2), (3, "frs"), (3, "frs"), + (3,3), (3,3), + + (0, "lrs"), (0, "lrs"), + (0, 4), (0, 4), + (2, "ts"), + (2, "ts"), + (0, 5), (0, 5), + (2, "ts"), + (2, "ts"), + (0, "frs"), (0, "frs"), + (0, 4), + (1, "lrs"), (1, "lrs"), + (1, 4), (1, 4), + (2, "ts"), + (2, "ts"), + (0, 5), (0, 4), (0, 4), + (1, 5), (1, 5), + (1, "frs"), (1, "frs"), + (2, "ts"), + (2, "ts"), + (1, 4), (0, 5), (0, 5), + (0, "lrs"), (0, "lrs"), + (0, 6), (0, 6), + (0, "frs"), (0, "frs"), + (3, "ts"), + (3, "ts"), + (1, 5), (1, 4), (1, 4), + (0, "lrs"), (0, "lrs"), + (0, 7), (0, 7), + (0, "frs"), (0, "frs"), + (3, "ts"), + (3, "ts"), + (0, 6), (1, 5), (1, 5), + (1, "lrs"), (1, "lrs"), + (1, 6), (1, 6), + (1, "frs"), (1, "frs"), + (3, "ts"), + (3, "ts"), + (0, 7), (0, 6), (0, 6), + (1, "lrs"), (1, "lrs"), + (1, 7), (1, 7), + (1, "frs"), (1, "frs"), + (3, "ts"), + (3, "ts"), + (1, 6), (0, 7), (0, 7), + (1, 7), (1, 6), (1, 6), + (1, 7), (1, 7), + + (0, "s"), (0,"s"), (0,"s"), (0,"s"), + (1, "s"), (1,"s"), (1,"s"), (1,"s"), + + (2, "lrs"), (2, "lrs"), + (2, 4), (2, 4), + (2, 5), (2, 5), + (2, "frs"), (2, "frs"), + (2, 4), + (3, "lrs"), (3, "lrs"), + (3, 4), (3, 4), (2, 5), (2, 4), (2, 4), + (3, 5), (3, 5), + (3, "frs"), (3, "frs"), + (3, 4), (2, 5), (2, 5), + (2, "lrs"), (2, "lrs"), + (2, 6), (2, 6), + (2, "frs"), (2, "frs"), + (3, 5), (3, 4), (3, 4), + (2, "lrs"), (2, "lrs"), + 
(2, 7), (2, 7), + (2, "frs"), (2, "frs"), + (2, 6), (3, 5), (3, 5), + (3, "lrs"), (3, "lrs"), + (3, 6), (3, 6), + (3, "frs"), (3, "frs"), + (2, 7), (2, 6), (2, 6), + (3, "lrs"), (3, "lrs"), + (3, 7), (3, 7), + (3, "frs"), (3, "frs"), + (3, 6), (2, 7), (2, 7), + (3, 7), (3, 6), (3, 6), + (3, 7), (3, 7), + + (2, "s"), (2,"s"), (2,"s"), (2,"s"), + (3, "s"), (3,"s"), (3,"s"), (3,"s") ] ), + + # INDEX 3 + # Interleave pre- and post-transpose arithmetic, and transpose + # And loads + stores + ( None, None, None, None, None, # All defaults + [(0,"m"), + (1,"m"), + + (0,0), (0,0), + (0,1), (0,1), + (2, "sl"), + (2, "sl"), + (2, "sl"), + (2, "sl"), + (0,0), + (0, "l"), + (1,0), + (0, "l"), + (1,0), + (3, "sl"), + (3, "sl"), + (3, "sl"), + (3, "sl"), + (0,1), + (0,0), (0,0), + (1,1), + (2,"l"), + (1,1), + (2,"l"), + (1,0), + (1,"l"), + (0,1), (0,1), + (0,2), + (2,"l"), + (0,2), + (2,"l"), + (1,1), + (1,"l"), + (1,0), (1,0), + (0,3), + (2,"lrs"), + (0,3), + (2,"lrs"), + (0,2), + (1,1), + (1,1), + (1,2), + (3,"l"), + (1,2), + (3,"l"), + (0,3), (0,2), (0,2), + (0, "frs"), + (0, "frs"), + (1,3), + (3,"l"), + (1,3), + (3,"l"), + (1,2), (0,3), (0,3), + (2,0), + (3,"lrs"), + (2,0), + (3,"lrs"), + (0, "ts"), + (0, "ts"), + (1,3), (1,2), (1,2), + (1, "frs"), + (1, "frs"), + (2,1), (2,1), + (0, "ts"), + (0, "ts"), + (2,0), (1,3), (1,3), + (3,0), (3,0), + (0, "ts"), + (0, "ts"), + (2,1), (2,0), (2,0), + (3,1), (3,1), + (0, "ts"), + (0, "ts"), + (3,0), (2,1), (2,1), + (2,2), (2,2), + (1, "ts"), + (1, "ts"), + (3,1), (3,0), (3,0), + (2,3), (2,3), + (1, "ts"), + (1, "ts"), + (2,2), (3,1), (3,1), + (3,2), (3,2), + (1, "ts"), + (1, "ts"), + (2,3), (2,2), (2,2), + (2, "frs"), + (2, "frs"), + (3,3), (3,3), + (1, "ts"), + (1, "ts"), + (3,2), (2,3), (2,3), + (3,3), (3,2), (3,2), + (3, "frs"), + (3, "frs"), + (3,3), (3,3), + + (0, "lrs"), + (0, "lrs"), + (0, 4), (0, 4), + (2, "ts"), + (2, "ts"), + (0, 5), (0, 5), + (0, "frs"), + (0, "frs"), + (2, "ts"), + (2, "ts"), + (0, 4), + (1, "lrs"), + (1, 
"lrs"), + (1, 4), (1, 4), + (2, "ts"), + (2, "ts"), + (0, 5), (0, 4), (0, 4), + (1, 5), (1, 5), + (1, "frs"), + (1, "frs"), + (2, "ts"), + (2, "ts"), + (1, 4), (0, 5), (0, 5), + (0, "lrs"), + (0, "lrs"), + (0, 6), (0, 6), + (0, "frs"), + (0, "frs"), + (3, "ts"), + (3, "ts"), + (1, 5), (1, 4), (1, 4), + (0, "lrs"), + (0, "lrs"), + (0, 7), (0, 7), + (0, "frs"), + (0, "frs"), + (3, "ts"), + (3, "ts"), + (0, 6), (1, 5), (1, 5), + (1, "lrs"), + (1, "lrs"), + (1, 6), (1, 6), + (1, "frs"), + (1, "frs"), + (3, "ts"), + (3, "ts"), + (0, 7), (0, 6), (0, 6), + (1, "lrs"), + (1, "lrs"), + (1, 7), (1, 7), + (1, "frs"), + (1, "frs"), + (3, "ts"), + (3, "ts"), + (1, 6), (0, 7), (0, 7), + (1, 7), (1, 6), (1, 6), + (1, 7), (1, 7), + + ######################################################### + + (2, "lrs"), + (2, "lrs"), + (2, 4), (2, 4), + (0, "s"), + (2, 5), + (0, "s"), + (2, 5), + (2, "frs"), + (2, "frs"), + (0, "s"), + (2, 4), + (3, "lrs"), + (3, "lrs"), + (3, 4), + (1, "s"), + (3, 4), + (0, "s"), + (2, 5), + (1, "s"), + (2, 4), (2, 4), + (3, 5), (3, 5), + (3, "frs"), + (3, "frs"), + (1, "s"), + (3, 4), (2, 5), (2, 5), + (2, "lrs"), + (2, "lrs"), + (2, 6), (2, 6), + (2, "frs"), + (2, "frs"), + (1, "s"), + (3, 5), (3, 4), (3, 4), + (2, "lrs"), + (2, "lrs"), + (2, 7), (2, 7), + (2, "frs"), + (2, "frs"), + (2, 6), (3, 5), (3, 5), + (3, "lrs"), + (3, "lrs"), + (3, 6), (3, 6), + (3, "frs"), + (3, "frs"), + (2, 7), (2, 6), (2, 6), + (3, "lrs"), + (3, "lrs"), + (3, 7), (3, 7), + (3, "frs"), + (3, "frs"), + (0, "le"), + (3, 6), + (0, "le"), + (2, 7), + (0, "lres"), + (2, 7), + (0, "lres"), + (3, 7), + (1,"lres"), + (3, 6), + (1, "le"), + (3, 6), + (1,"lres"), + (3, 7), + (1, "le"), + (3, 7), + + ] ), + + # INDEX 4 + # Interleave pre- and post-transpose arithmetic, and transpose + # And loads + stores + ( None, None, None, None, None, # All defaults + [(0,0), (0,0), (3, -1), (3, -2), (3, -2), + (0,1), (0,1), + (2, "sl"), + (2, "sl"), + (2, "sl"), + (2, "sl"), + (0,0), (3,-1), (3,-1), + 
(0, "l"), + (1,0), + (0, "l"), + (1,0), + (3, "sl"), + (3, "sl"), + (3, "sl"), + (3, "sl"), + (0,1), + (0,0), (0,0), + (1,1), + (2,"l"), + (1,1), + (2,"l"), + (1,0), + (1,"l"), + (0,1), (0,1), + (0,2), + (2,"l"), + (0,2), + (2,"l"), + (1,1), + (1,"l"), + (1,0), (1,0), + (0,3), + (2,"lrs"), + (0,3), + (2,"lrs"), + (0,2), + (1,1), + (1,1), + (1,2), + (3,"l"), + (1,2), + (3,"l"), + (0,3), (0,2), (0,2), + (0, "frs"), + (0, "frs"), + (1,3), + (3,"l"), + (1,3), + (3,"l"), + (1,2), (0,3), (0,3), + (2,0), + (3,"lrs"), + (2,0), + (3,"lrs"), + (0, "ts"), + (0, "ts"), + (1,3), (1,2), (1,2), + (1, "frs"), + (1, "frs"), + (2,1), (2,1), + (0, "ts"), + (0, "ts"), + (2,0), (1,3), (1,3), + (3,0), (3,0), + (0, "ts"), + (0, "ts"), + (2,1), (2,0), (2,0), + (3,1), (3,1), + (0, "ts"), + (0, "ts"), + (3,0), (2,1), (2,1), + (2,2), (2,2), + (1, "ts"), + (1, "ts"), + (3,1), (3,0), (3,0), + (2,3), (2,3), + (1, "ts"), + (1, "ts"), + (0, "lrs"), + (0, "lrs"), + (2,2), (3,1), (3,1), + (3,2), (3,2), + (1, "ts"), + (1, "ts"), + (2,3), (2,2), (2,2), + (2, "frs"), + (2, "frs"), + (3,3), (3,3), + (1, "ts"), + (1, "ts"), + (3,2), (2,3), (2,3), + (0,4), (0,4), (3,3), (3,2), (3,2), + (3, "frs"), + (3, "frs"), + (1, "lrs"), + (1, "lrs"), + (0,5), (0,5), + (2, "ts"), + (2, "ts"), + + (0,4), (3,3), (3,3), + + (0, "frs"), + (0, "frs"), + (0, "lrs"), + (0, "lrs"), + (2, "ts"), + (2, "ts"), + + (1, 4), (1, 4), + (2, "ts"), + (2, "ts"), + (0, "lrs"), + (0, "lrs"), + (0, 5), (0, 4), (0, 4), + (1, 5), (1, 5), + (1, "frs"), + (1, "frs"), + (2, "ts"), + (2, "ts"), + (1, "lrs"), + (1, "lrs"), + (1, 4), (0, 5), (0, 5), + (0, 6), (0, 6), + (0, "frs"), + (0, "frs"), + (3, "ts"), + (3, "ts"), + (1, "lrs"), + (1, "lrs"), + (1, 5), (1, 4), (1, 4), + (0, 7), (0, 7), + (0, "frs"), + (0, "frs"), + (3, "ts"), + (3, "ts"), + (0, 6), (1, 5), (1, 5), + (1, 6), (1, 6), + (2, "lrs"), + (2, "lrs"), + (1, "frs"), + (1, "frs"), + (3, "ts"), + (3, "ts"), + (0, 7), (0, 6), (0, 6), + (1, 7), (1, 7), + (1, "frs"), + (1, "frs"), + (3, 
"ts"), + (3, "ts"), + (1, 6), (0, 7), (0, 7), + (2, 4), + (3, "lrs"), + (3, "lrs"), + (2, 4), + (0, "s"), + (0, "s"), + (1, 7), (1, 6), (1, 6), + (2, 5), (2, 5), + (2, "frs"), + (2, "frs"), + (0, "s"), + (2, 4), (1, 7), (1, 7), + (2, "lrs"), + (2, "lrs"), + (3, 4), + (1, "s"), + (3, 4), + (0, "s"), + (2, 5), + (1, "s"), + (2, 4), (2, 4), + (2, "lrs"), + (2, "lrs"), + (3, 5), (3, 5), + (3, "frs"), + (3, "frs"), + (1, "s"), + (3, 4), (2, 5), (2, 5), + (3, "lrs"), + (3, "lrs"), + (2, 6), (2, 6), + (2, "frs"), + (2, "frs"), + (3, "lrs"), + (3, "lrs"), + (1, "s"), + (3, 5), (3, 4), (3, 4), + (2, 7), (2, 7), + (2, "frs"), + (2, "frs"), + (2, 6), (3, 5), (3, 5), + (3, 6), (3, 6), + (3, "frs"), + (3, "frs"), + (0, "le"), + (2, 7), + (0, "le"), + (2, 6), (2, 6), + (3, 7), + (0, "lres"), + (3, 7), + (3, "frs"), + (3, "frs"), + (0, "lres"), + (3, 6), + (1,"lres"), + (2, 7), + (1, "le"), + (2, 7), + (1,"lres"), + (1, "le"), + + ]), + ] + + load_order, store_order, numbering, twiddles, root_order, schedule = schedules[idx] + + if load_order == None: + load_order = load_order_default + if store_order == None: + store_order = store_order_default + if numbering == None: + numbering = butterfly_numbering_default + if twiddles == None: + twiddles = twiddle_numbering_default + if root_order == None: + root_order = root_load_order_default + + return load_order, store_order, numbering, twiddles, root_order, schedule + + def get_schedule_quad_no_transpose(self, idx): + + def add(n): + def _add(x): + if isinstance(x,int): + return x + n + else: + return x + return _add + + butterfly_numbering_default = \ + list(zip( + [0, 1, 2, 3, 4, 5, 6, 7, 0,1,2,3, 8, 9,10,11, 0,1,4,5, 8, 9,12,13, 0,2,4, 6, 8,10,12,14], + [8, 9,10,11,12,13,14,15, 4,5,6,7,12,13,14,15, 2,3,6,7,10,11,14,15, 1,3,5, 7, 9,11,13,15], + [0, 0, 0, 0, 0, 0, 0, 0, 1,1,1,1, 2, 2, 2, 2, 3,3,4,4, 5, 5, 6, 6, 7,8,9,10,11,12,13,14])) + + + default = { + "load_order": [12,13,14,15,4,5,6,7,8,9,10,11,0,1,2,3], + "store_order": 
list(range(0,16)), + "numbering": butterfly_numbering_default[4:8] + butterfly_numbering_default[0:4] + \ + butterfly_numbering_default[10:12] + butterfly_numbering_default[8:10] + \ + butterfly_numbering_default[14:16] + butterfly_numbering_default[12:14] + \ + butterfly_numbering_default[16:32], + "twiddles": { 0: (0,0), + 1: (0,1), + 2: (0,2), + 3: (1,0), + 4: (1,1), + 5: (1,2), + 6: (1,3), + 7: (2,0), + 8: (2,1), + 9: (2,2), + 10: (2,3), + 11: (3,0), + 12: (3,1), + 13: (3,2), + 14: (3,3) }, + "root_load_order": list(range(0,10)), + "schedule": None } + + modifications = { + + # INDEX 0 + # Trivial implementation, no interleaving whatsoever + 0 : { "schedule": + ["m", "frl", "l", "l", "l", "l", + "l", "l", "l", "l", + "l", "l", "l", "l", + "l", "l", "l", "l", + + 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, + 7, 7, 7, 7, 7, + + 8, 8, 8, 8, 8, + 9, 9, 9, 9, 9, + 10, 10, 10, 10, 10, + 11, 11, 11, 11, 11, + 12, 12, 12, 12, 12, + 13, 13, 13, 13, 13, + 14, 14, 14, 14, 14, + 15, 15, 15, 15, 15, + + 16, 16, 16, 16, 16, + 17, 17, 17, 17, 17, + 18, 18, 18, 18, 18, + 19, 19, 19, 19, 19, + 20, 20, 20, 20, 20, + 21, 21, 21, 21, 21, + 22, 22, 22, 22, 22, + 23, 23, 23, 23, 23, + + 24, 24, 24, 24, 24, + 25, 25, 25, 25, 25, + 26, 26, 26, 26, 26, "lre", + 27, 27, 27, 27, 27, + 28, 28, 28, 28, 28, + 29, 29, 29, 29, 29, + 30, 30, 30, 30, 30, + 31, 31, 31, 31, 31, + + "s", "s", "s", "s", + "s", "s", "s", "s", + "s", "s", "s", "s", + "s", "s", "s", "s" ] }, + + # INDEX 1 + # First interleaving attempt: Arithmetic only + # Space out arithmetic operations to account + # for A72/N1 latencies of multiplications. 
+ 1 : { "schedule": + ["m", "l", "l", "l", "l", + "l", "l", "l", "l", + "l", "l", "l", "l", + "l", "l", "l", "l", + + 0, 0, + 1, 1, 0, + 2, 2, 1, 0, 0, + 3, 3, 2, 1, 1, + 4, 4, 3, 2, 2, + 5, 5, 4, 3, 3, + 6, 6, 5, 4, 4, + 7, 7, 6, 5, 5, + + 8, 8, 7, 6, 6, + 9, 9, 8, 7, 7, + 10, 10, 9, 8, 8, + 11, 11,10, 9, 9, + 12, 12,11,10, 10, + 13, 13,12,11, 11, + + 14, 14,13,12, 12, + 15, 15,14,13, 13, + + 16, 16,15,14, 14, + 17, 17,16,15, 15, + 18, 18,17,16, 16, + 19, 19,18,17, 17, + + 20, 20,19,18, 18, + 21, 21,20,19, 19, + 22, 22,21,20, 20, + 23, 23,22,21, 21, + + 24, 24,23,22, 22, + 25, 25,24,23, 23, "lre", + 26, 26,25,24, 24, + 27, 27,26,25, 25, + + 28, 28,27,26, 26, + 29, 29,28,27, 27, + 30, 30,29,28, 28, + 31, 31,30,29, 29, + 31,30, 30, + 31, 31, + + "s", "s", "s", "s", + "s", "s", "s", "s", + "s", "s", "s", "s", + "s", "s", "s", "s", "frl" ] }, + + # INDEX 2 + # TODO: Document + 2 : { "schedule": + ["m", 0, "sl", "l", 0, "l", -1, -2, -2, "sl", + 1, "sl", "l", 1, "l", 0, -1, -1, "sl", + 2, "sl", 2, "l", 1, 0, 0, "sl", + 3, "sl", 3, "l", 2, 1, 1, + 4, "l", 4, 3, 2, 2, + 5, "l", 5, 4, 3, 3, + 6, 6, 5, 4, 4, + 7, 7, 6, 5, 5, + + 8, 8, 7, 6, 6, + 9, 9, 8, 7, 7, + 10, 10, 9, 8, 8, + 11, 11,10, 9, 9, + 12, 12,11,10, 10, + 13, 13,12,11, 11, + + 14, 14,13,12, 12, + 15, 15,14,13, 13, + + 16, 16,15,14, 14, + 17, 17,16,15, 15, + 18, 18,17,16, 16, + 19, 19,18,17, 17, + + 20, 20,19,18, 18, + 21, 21,20,19, 19, + 22, 22,21,20, 20, + 23, 23,22,21, 21, + + 24, 24,23,22, 22, "lre", + 25, 25,24,23, 23, + 26, 26,25,24, 24, + 27, 27, 26, 25, 25, "s", + 28, "s", 28, "le", 27, "le", 26, 26, "s", + 29, "s", 29, "le", 28, "le", 27, 27, "s", + 30, "s", 30, "le", 29, "le", 28, 28, "s", + 31, "s", 31, "frl", "le", 30, "le", 29, 29, "s" ] }, + + # INDEX 3 + # TODO: Document + 3 : { "schedule": + ["m", 0, "sl", "l", 0, "l", -2, -1, -2, "sl", + 1, "sl", "l", 1, "l", -1, 0, -1, "sl", + 2, "sl", 2, "l", 0, 1, 0, "sl", + 3, "sl", 3, "l", 1, 2, 1, + 4, "l", 4, 2, 3, 2, + 5, "l", 5, 3, 4, 3, + 6, 6, 4, 5, 
4, + 7, 7, 5, 6, 5, + + 8, 8, 6, 7, 6, + 9, 9, 7, 8, 7, + 10, 10, 8, 9, 8, + 11, 11, 9,10, 9, + 12, 12,10,11, 10, + 13, 13,11,12, 11, + + 14, 14,12,13, 12, + 15, 15,13,14, 13, + + 16, 16,14,15, 14, + 17, 17,15,16, 15, + 18, 18,16,17, 16, + 19, 19,17,18, 17, + + 20, 20,18,19, 18, + 21, 21,19,20, 19, + 22, 22,20,21, 20, + 23, 23,21,22, 21, + + 24, 24,22,23, 22, "lre", + 25, 25,23,24, 23, + 26, 26,24,25, 24, + 27, 27, 25, 26, 25, "s", + 28, "s", 28, "le", 26, "le", 27, 26, "s", + 29, "s", 29, "le", 27, "le", 28, 27, "s", + 30, "s", 30, "le", 28, "le", 29, 28, "s", + 31, "s", 31, "frl", "le", 29, "le", 30, 29, "s" ] }, + + # INDEX 4 + 4 : { "schedule": + ["m", "frl", + "l", "l", "l", "l", + "l", "l", "l", "l", + "l", "l", "l", "l", + "l", "l", "l", "l", + + "lr", + + 0, 1, 2, 3, + 0, 1, 2, 3, + 0, 1, 2, 3, + 0, 1, 2, 3, + 0, 1, 2, 3, + + 4, 5, 6, 7, + 4, 5, 6, 7, + 4, 5, 6, 7, + 4, 5, 6, 7, + 4, 5, 6, 7, + + 8+0, 8+1, 8+2, 8+3, + 8+0, 8+1, 8+2, 8+3, + 8+0, 8+1, 8+2, 8+3, + 8+0, 8+1, 8+2, 8+3, + 8+0, 8+1, 8+2, 8+3, + + 8+4, 8+5, 8+6, 8+7, + 8+4, 8+5, 8+6, 8+7, + 8+4, 8+5, 8+6, 8+7, + 8+4, 8+5, 8+6, 8+7, + 8+4, 8+5, 8+6, 8+7, + + 16+0, 16+1, 16+2, 16+3, + 16+0, 16+1, 16+2, 16+3, + 16+0, 16+1, 16+2, 16+3, + 16+0, 16+1, 16+2, 16+3, + 16+0, 16+1, 16+2, 16+3, + + 16+4, 16+5, 16+6, 16+7, + 16+4, 16+5, 16+6, 16+7, + 16+4, 16+5, 16+6, 16+7, + 16+4, 16+5, 16+6, 16+7, + 16+4, 16+5, 16+6, 16+7, + + 24+0, 24+1, 24+2, 24+3, + 24+0, 24+1, 24+2, 24+3, + 24+0, 24+1, 24+2, 24+3, + 24+0, 24+1, 24+2, 24+3, + 24+0, 24+1, 24+2, 24+3, + + 24+4, 24+5, 24+6, 24+7, + 24+4, 24+5, 24+6, 24+7, + 24+4, 24+5, 24+6, 24+7, + 24+4, 24+5, 24+6, 24+7, + 24+4, 24+5, 24+6, 24+7, + + "s", "s", "s", "s", + "s", "s", "s", "s", + "s", "s", "s", "s", + "s", "s", "s", "s" ] }, + + # INDEX 5 + 5 : { "schedule": + ["m", "frl", + "l", "l", "l", "l", + "l", "l", "l", "l", + "l", "l", "l", "l", + "l", "l", "l", "l", + + "lr", + + 0, 0, 1, 1, 2, 2, 3, 3, + 0, 1, 2, 3, + 4, 4, 5, 5, 6, 6, 7, 7, + 4, 5, 6, 7, + 0, 0, 1, 
1, 2, 2, 3, 3, + 4, 4, 5, 5, 6, 6, 7, 7, + + 8+0, 8+0, 8+1, 8+1, 8+2, 8+2, 8+3, 8+3, + 8+0, 8+1, 8+2, 8+3, + 8+4, 8+4, 8+5, 8+5, 8+6, 8+6, 8+7, 8+7, + 8+4, 8+5, 8+6, 8+7, + 8+0, 8+0, 8+1, 8+1, 8+2, 8+2, 8+3, 8+3, + 8+4, 8+4, 8+5, 8+5, 8+6, 8+6, 8+7, 8+7, + + 16+0, 16+0, 16+1, 16+1, 16+2, 16+2, 16+3, 16+3, + 16+0, 16+1, 16+2, 16+3, + 16+4, 16+4, 16+5, 16+5, 16+6, 16+6, 16+7, 16+7, + 16+4, 16+5, 16+6, 16+7, + 16+0, 16+0, 16+1, 16+1, 16+2, 16+2, 16+3, 16+3, + 16+4, 16+4, 16+5, 16+5, 16+6, 16+6, 16+7, 16+7, + + 24+0, 24+0, 24+1, 24+1, 24+2, 24+2, 24+3, 24+3, + 24+0, 24+1, 24+2, 24+3, + 24+4, 24+4, 24+5, 24+5, 24+6, 24+6, 24+7, 24+7, + 24+4, 24+5, 24+6, 24+7, + 24+0, 24+0, 24+1, 24+1, 24+2, 24+2, 24+3, 24+3, + 24+4, 24+4, 24+5, 24+5, 24+6, 24+6, 24+7, 24+7, + + "s", "s", "s", "s", + "s", "s", "s", "s", + "s", "s", "s", "s", + "s", "s", "s", "s" ] }, + + # INDEX 6 + # A totally messy manual attempt to interleave + 6 : { "schedule": + ["m", "frl", + "lr"] + + + [0, "l", "l", 0, -1, -1, "sl", "sl", "l", "l", 1, "l", 1, "l", 2, "l", 2, "l", 3, 3, + 0, 1, 2, 3, + 4, 4, 0, 0, 5, 5, 1, 1, 6, 6, 2, 2, 7, 7, + 4, 3, 5, 3, 6, 7] + + + list(map(add(8), + [0, 0, -4, -4, 1, 1, -3, -3, 2, 2, -2, -2, 3, 3, + 0, -1, -1, 1, 2, 3, + 4, 4, 0, 0, 5, 5, 1, 1, 6, 6, 2, 2, 7, 7, + 4, 3, 5, 3, 6, 7])) + + list(map(add(16), + [0, 0, -4, -4, 1, 1, -3, -3, 2, 2, -2, -2, 3, 3, + 0, -1, -1, 1, 2, 3, + 4, 4, 0, 0, 5, 5, 1, 1, 6, 6, 2, 2, 7, 7, + 4, 3, 5, 3, 6, 7])) + + list(map(add(24), + [0, 0, -4, -4, 1, 1, -3, -3, 2, 2, -2, -2, 3, 3, + 0, -1, -1, 1, 2, 3, + 4, 4, 0, 0, "s", "s", 5, 5, "le", "le", 1, 1, "s", "s", 6, 6, "le", "le", 2, 2, "s", "s", 7, 7, + "le", "le", 4, 3, 5, 3, "s", "s", 6, "le", "le", 7])) + + + [28,28,29,29,30,30] + + + ["s", "s", "s", "s", + "s", "s" ] }, + + # INDEX 7 + # Careful manual interleaving, accounting for latencies + # and usage of vector pipes for vector stores + 7 : { "load_order": [14, 15, 12,13, 8,9,10, 6, 11,7,4,5,0,1,2,3], + "store_order": 
[4,5,6,7,2,3,0,1,8,9,10,11,12,13,14,15], + "numbering": list(zip( + [6, 7, 4, 5, 0,1, 2, 3, 2,3,0,1,10,11, 8, 9, 4,5,0,1, 8, 9,12,13, 4,6,2,0, 8,10,12,14], + [14, 15, 12,13,8,9,10,11, 6,7,4,5,14,15,12,13, 6,7,2,3,10,11,14,15, 5,7,3,1, 9,11,13,15], + [0, 0, 0, 0, 0, 0, 0, 0, 1,1,1,1, 2, 2, 2, 2, 4,4,3,3, 5, 5, 6, 6, 9,10,8,7,11,12,13,14])), + "schedule": + ["m", "frl", + "lr", + + "l", + 0, 0, -5 , "sl", + "l", + 1, 1, -5 , "sl", + "l", + 2, 2, -4 , -4, + "l", + 3, 3, -3 , -3, + 0, 1, -2 , "sl", + 2, 3, -2 , "sl", + "l", + 4, 4, -1 , "sl", + "l", + 5, 5, -1 , "sl", + "l", + "l", + 6, 6, 0 , 0, + "l", + "l", + 7, 7, 1 , 1, + "l", + 4, 5, 2 , "sl", + 6, 7, 2 , "sl", + "l", + 8, 8, 3 , "sl", + 9, 9, 3 , "sl", + "l", + 10, 10, 4 , 4, + "l", + 11, 11, 5 , 5, + "l", + 8, 9, 6 , "sl", + 10, 11, 6 , "sl", + "l", + 12,12, 7 , "sl", + 13,13, 7 , "sl", + 14,14, 8 , 8, + 15,15, 9 , 9, + 12,13, 10, + 14,15, 10, + 16, 16, 11, + 17,17, 11, + 18,18, 12, 12, + 19, 19, 13, 13, + 16, 17, 14, + 18, 19, 14, + 20, 20, 15, + 21,21, 15, + 22,22, 16, 16, + 23,23, 17, 17, + 20,21, 18, + 22,23, 18, + 24,24, 19, + 25,25, 19, + 26,26, 20, 20, + 27,27, 21, 21, + 24,25, 22, + 26,27, 22, + 28,28, 23, + 29,29, 23, + 30,30, 24,24, + 31,31, 25,25, + 28,29, 26, "s", # S(8) # 15 old + 30,31, 26, "s", # S(8) # 14 old + ] }, + + # INDEX 8 + # The same as 7, but exploring a slightly different interleaving within each line + # which avoids consecutive multiplication operations. 
+ 8 : { "load_order": [14, 15, 12,13, 8,9,10, 6, 11,7,4,5,0,1,2,3], # Load order + "store_order": [4,5,6,7,2,3,0,1,8,9,10,11,12,13,14,15], # Default store order + "numbering": list(zip( + [6, 7, 4, 5, 0,1, 2, 3, 2,3,0,1,10,11, 8, 9, 4,5,0,1, 8, 9,12,13, 4,6,2,0, 8,10,12,14], + [14, 15, 12,13,8,9,10,11, 6,7,4,5,14,15,12,13, 6,7,2,3,10,11,14,15, 5,7,3,1, 9,11,13,15], + [0, 0, 0, 0, 0, 0, 0, 0, 1,1,1,1, 2, 2, 2, 2, 4,4,3,3, 5, 5, 6, 6, 9,10,8,7,11,12,13,14])), + "schedule": ["m", "frl", + "lr", + + "l", + 0, -5 , 0, "sl", + "l", + 1, -5 , 1, "sl", + "l", + 2, -4 , 2, -4, + "l", + 3, -3 , 3, -3, + 0, -2 , 1, "sl", + 2, -2 , 3, "sl", + "l", + 4, -1 , 4, "sl", + "l", + 5, -1 , 5, "sl", + "l", + "l", + 6, 0 , 6, 0, + "l", + "l", + 7, 1 , 7, 1, + "l", + 4, 2 , 5, "sl", + 6, 2 , 7, "sl", + "l", + 8, 3 , 8, "sl", + 9, 3 , 9, "sl", + "l", + 10, 4 , 10, 4, + "l", + 11, 5 , 11, 5, + "l", + 8, 6 , 9, "sl", + 10, 6 , 11, "sl", + "l", + 12, 7 , 12, "sl", + 13, 7 , 13, "sl", + 14, 8 , 14, 8, + 15, 9 , 15, 9, + 12, 10, 13, + 14, 10, 15, + 16, 11, 16, + 17, 11, 17, + 18, 12, 18, 12, + 19, 13, 19, 13, + 16, 14, 17, + 18, 14, 19, + 20, 15, 20, + 21, 15, 21, + 22, 16, 22, 16, + 23, 17, 23, 17, + 20, 18, 21, + 22, 18, 23, + 24, 19, 24, + 25, 19, 25, + 26, 20, 26, 20, + 27, 21, 27, 21, + 24, 22, 25, + 26, 22, 27, + 28, 23, 28, + 29, 23, 29, + 30, 24, 30, 24, + 31, 25, 31, 25, + 28, 26, 29, "s", # S(8) # 15 old + 30, 26, 31, "s", # S(8) # 14 old + ] }, + + # INDEX 9 + # The same as 7, but experimenting whether avoiding ST-LD pairs makes + # any tangible difference + 9 : { "load_order": [14, 15, 12,13, 8,9,10, 6, 11,7,4,5,0,1,2,3], # Load order + "store_order": [4,5,6,7,2,3,0,1,8,9,10,11,12,13,14,15], # Default store order + "numbering": list(zip( + [6, 7, 4, 5, 0,1, 2, 3, 2,3,0,1,10,11, 8, 9, 4,5,0,1, 8, 9,12,13, 4,6,2,0, 8,10,12,14], + [14, 15, 12,13,8,9,10,11, 6,7,4,5,14,15,12,13, 6,7,2,3,10,11,14,15, 5,7,3,1, 9,11,13,15], + [0, 0, 0, 0, 0, 0, 0, 0, 1,1,1,1, 2, 2, 2, 2, 4,4,3,3, 5, 5, 6, 
6, 9,10,8,7,11,12,13,14])), + "schedule": + ["m", "frl", + "lr", + + "l", + 0, 0, "sl", -5 , + "l", + 1, 1, "sl", -5 , + "l", + 2, 2, -4 , -4, + "l", + 3, 3, -3 , -3, + 0, 1, "sl", -2 , + 2, 3, "sl", -2 , + "l", + 4, 4, "sl", -1 , + "l", + 5, 5, "sl", -1 , + "l", + "l", + 6, 6, 0 , 0, + "l", + "l", + 7, 7, 1 , 1, + "l", + 4, 5, "sl", 2 , + 6, 7, "sl", 2 , + "l", + 8, 8, "sl", 3 , + 9, 9, "sl", 3 , + "l", + 10, 10, 4 , 4, + "l", + 11, 11, 5 , 5, + "l", + 8, 9, "sl", 6 , + 10, 11, "sl", 6 , + "l", + 12,12, "sl", 7 , + 13,13, "sl", 7 , + 14,14, 8 , 8, + 15,15, 9 , 9, + 12,13, 10, + 14,15, 10, + 16, 16, 11, + 17,17, 11, + 18,18, 12, 12, + 19, 19, 13, 13, + 16, 17, 14, + 18, 19, 14, + 20, 20, 15, + 21,21, 15, + 22,22, 16, 16, + 23,23, 17, 17, + 20,21, 18, + 22,23, 18, + 24,24, 19, + 25,25, 19, + 26,26, 20, 20, + 27,27, 21, 21, + 24,25, 22, + 26,27, 22, + 28,28, 23, + 29,29, 23, + 30,30, 24,24, + 31,31, 25,25, + 28,29, 26, "s", # S(8) # 15 old + 30,31, 26, "s", # S(8) # 14 old + ] }, + + # INDEX 10 + # Same as 7, but inserting some nops to always have blocks of + # four instructions with two multiplies + 10 : { "load_order": [14, 15, 12,13, 8,9,10, 6, 11,7,4,5,0,1,2,3], # Load order + "store_order": [4,5,6,7,2,3,0,1,8,9,10,11,12,13,14,15], # Default store order + "numbering": list(zip( + [6, 7, 4, 5, 0,1, 2, 3, 2,3,0,1,10,11, 8, 9, 4,5,0,1, 8, 9,12,13, 4,6,2,0, 8,10,12,14], + [14, 15, 12,13,8,9,10,11, 6,7,4,5,14,15,12,13, 6,7,2,3,10,11,14,15, 5,7,3,1, 9,11,13,15], + [0, 0, 0, 0, 0, 0, 0, 0, 1,1,1,1, 2, 2, 2, 2, 4,4,3,3, 5, 5, 6, 6, 9,10,8,7,11,12,13,14])), + "schedule": ["m", "frl", + "lr", + + "l", + 0, 0, -5 , "sl", + "l", + 1, 1, -5 , "sl", + "l", + 2, 2, -4 , -4, + "l", + 3, 3, -3 , -3, + 0, 1, -2 , "sl", + 2, 3, -2 , "sl", + "l", + 4, 4, -1 , "sl", + "l", + 5, 5, -1 , "sl", + "l", + "l", + 6, 6, 0 , 0, + "l", + "l", + 7, 7, 1 , 1, + "l", + 4, 5, 2 , "sl", + 6, 7, 2 , "sl", + "l", + 8, 8, 3 , "sl", + 9, 9, 3 , "sl", + "l", + 10, 10, 4 , 4, + "l", + 11, 11, 5 , 5, + 
"l", + 8, 9, 6 , "sl", + 10, 11, 6 , "sl", + "l", + 12,12, 7 , "sl", + 13,13, 7 , "sl", + 14,14, 8 , 8, + 15,15, 9 , 9, + 12,13, 10, "nop", + 14,15, 10, "nop", + 16, 16, 11, "nop", + 17,17, 11, "nop", + 18,18, 12, 12, + 19, 19, 13, 13, + 16, 17, 14, "nop", + 18, 19, 14, "nop", + 20, 20, 15, "nop", + 21,21, 15, "nop", + 22,22, 16, 16, + 23,23, 17, 17, + 20,21, 18, "nop", + 22,23, 18, "nop", + 24,24, 19, "nop", + 25,25, 19, "nop", + 26,26, 20, 20, + 27,27, 21, 21, + 24,25, 22, "nop", + 26,27, 22, "nop", + 28,28, 23, "nop", + 29,29, 23, "nop", + 30,30, 24,24, + 31,31, 25,25, + 28,29, 26, "s", # S(8) # 15 old + 30,31, 26, "s", # S(8) # 14 old + ] }, + + # INDEX 11 + # Careful manual interleaving, accounting for latencies + # and usage of vector pipes for vector stores + 11 : { "load_order": [14, 15, 12,13, 8,9,10, 11, 6,7,4,5,0,1,2,3], # Load order + "store_order": [5,4,7,6,3,2,1,0,9,8,11,10,13,12,15,14], + "numbering": list(zip( + [6, 7, 4, 5, 0,1, 2, 3, 2,3,0,1,10,11, 8, 9, 4,5,0,1, 8, 9,12,13, 4,6,2,0, 8,10,12,14], + [14, 15, 12,13,8,9,10,11, 6,7,4,5,14,15,12,13, 6,7,2,3,10,11,14,15, 5,7,3,1, 9,11,13,15], + [0, 0, 0, 0, 0, 0, 0, 0, 1,1,1,1, 2, 2, 2, 2, 4,4,3,3, 5, 5, 6, 6, 9,10,8,7,11,12,13,14])), + "schedule": + ["m", "frl", + + "l", + "lrs", + "lrs", + 0, + 0, + "l", + 1, -4, "sl", + 1, -4, "sl", + "l", + 2, -3, "sl", + 2, -3, "sl", + "l", + 3, -2, "sl", + 3, -2, "sl", + + "l", + 0, -1, "sl", + 4, -1, "sl", + "l", + 1, + 5, + "l", + 2, + 6, + "l", + 3, + 7, + + "l", "l", + 4, 0, + 5, 0, + "l", "l", + 4, 1, + 5, 1, + "l", "l", + 6, 2, + 7, 2, + "l", "l", + 6, 3, + 7, 3, + 8+0, + 8+0, + 8+1, 4, + 8+1, 4, + 8+2, 5, + 8+2, 5, + 8+3, 6, + 8+3, 6, + + 8+0, 7, + 8+4, 7, + 8+1, + 8+5, + 8+2, + 8+6, + 8+3, + 8+7, + "lrs", + "lrs", + 8+4, 8, + 8+5, 8, + 8+4, 9, + 8+5, 9, + 8+6, 10, + 8+7, 10, + 8+6, 11, + 8+7, 11, + "frs", + "frs", + 16+0, + 16+0, + 16+1, 12, + 16+1, 12, + 16+2, 13, + 16+2, 13, + 16+3, 14, + 16+3, 14, + + "lrs", + "lrs", + 16+0, 15, + 16+4, 15, + 16+1, + 
16+5, + 16+2, + 16+6, + 16+3, + 16+7, + "lrs", + "lrs", + 16+4, 16, + 16+5, 16, + 16+4, 17, + 16+5, 17, + 16+6, 18, + 16+7, 18, + 16+6, 19, + 16+7, 19, + "frs", + "frs", + 24+0, + 24+0, + 24+1, 20, + 24+1, 20, + 24+2, 21, + 24+2, 21, + 24+3, 22, + 24+3, 22, + + 24+0, 23, + 24+4, 23, + 24+1, + 24+5, + 24+2, + 24+6, + 24+3, + 24+7, + 24+4, 24, "s", + 24+5, 24, "s", + 24+4, 25, "s", + 24+5, 25, "s", + 24+6, 26, "s", + 24+7, 26, "s", + 24+6, 27, "s", + 24+7, 27, "s", + ] }, + + # INDEX 12 + # Variant of 11, avoiding blocks with 2x mul, 2x add, 2x str + 12 : { "load_order": [14, 15, 12,13, 8,9,10, 11, 6,7,4,5,0,1,2,3], # Load order + "store_order": [5,4,7,6,3,2,1,0,9,8,11,10,13,12,15,14], + "numbering": list(zip( + [6, 7, 4, 5, 0,1, 2, 3, 2,3,0,1,10,11, 8, 9, 4,5,0,1, 8, 9,12,13, 4,6,2,0, 8,10,12,14], + [14, 15, 12,13,8,9,10,11, 6,7,4,5,14,15,12,13, 6,7,2,3,10,11,14,15, 5,7,3,1, 9,11,13,15], + [0, 0, 0, 0, 0, 0, 0, 0, 1,1,1,1, 2, 2, 2, 2, 4,4,3,3, 5, 5, 6, 6, 9,10,8,7,11,12,13,14])), + "schedule": ["m", "frl", + + "l", + "lrs", + "lrs", + 0, "sl", + 0, "sl", + "l", + 1, -4, "sl", + 1, -4, + "l", + 2, -3, "sl", + 2, -3, + "l", + 3, -2, "sl", + 3, -2, + + "l", + 0, -1, "sl", + 4, -1, + "l", + 1, "sl", + 5, + "l", + 2, "sl", + 6, + "l", + 3, "sl", + 7, + + "l", "l", + 4, 0, "sl", + 5, 0, + "l", "l", + 4, 1, + 5, 1, + "l", "l", + 6, 2, + 7, 2, + "l", "l", + 6, 3, + 7, 3, + 8+0, + 8+0, + 8+1, 4, + 8+1, 4, + 8+2, 5, + 8+2, 5, + 8+3, 6, + 8+3, 6, + + 8+0, 7, + 8+4, 7, + 8+1, + 8+5, + 8+2, + 8+6, + 8+3, + 8+7, + "lrs", + "lrs", + 8+4, 8, + 8+5, 8, + 8+4, 9, + 8+5, 9, + 8+6, 10, + 8+7, 10, + 8+6, 11, + 8+7, 11, + "frs", + "frs", + 16+0, + 16+0, + 16+1, 12, + 16+1, 12, + 16+2, 13, + 16+2, 13, + 16+3, 14, + 16+3, 14, + + "lrs", + "lrs", + 16+0, 15, + 16+4, 15, + 16+1, + 16+5, + 16+2, + 16+6, + 16+3, + 16+7, + "lrs", + "lrs", + 16+4, 16, + 16+5, 16, + 16+4, 17, + 16+5, 17, + 16+6, 18, + 16+7, 18, + 16+6, 19, + 16+7, 19, + "frs", + "frs", + 24+0, + 24+0, + 24+1, 20, + 24+1, 20, + 
24+2, 21, + 24+2, 21, + 24+3, 22, + 24+3, 22, + "frs", + "frs", + + 24+0, 23, + 24+4, 23, + 24+1, 24, + 24+5, 24, + 24+2, 25, + 24+6, 25, + 24+3, 26, + 24+7, 26, + + 24+4, 27, + 24+5, 27, + 24+4, "s", + 24+5, "s", + 24+6, "s", + 24+7, "s", + 24+6, "s", + 24+7, "s", + ] }, + + # INDEX 13 + 13 : { "load_order": [14, 15, 12,13, 8,9,10, 11, 6,7,4,5,0,1,2,3], # Load order + "store_order": [5,4,7,6,3,2,1,0,9,8,11,10,13,12,15,14], + "numbering": list(zip( + [6, 7, 4, 5, 0,1, 2, 3, 2,3,0,1,10,11, 8, 9, 4,5,0,1, 8, 9,12,13, 4,6,2,0, 8,10,12,14], + [14, 15, 12,13,8,9,10,11, 6,7,4,5,14,15,12,13, 6,7,2,3,10,11,14,15, 5,7,3,1, 9,11,13,15], + [0, 0, 0, 0, 0, 0, 0, 0, 1,1,1,1, 2, 2, 2, 2, 4,4,3,3, 5, 5, 6, 6, 9,10,8,7,11,12,13,14])), + "schedule": ["m", "frl", + + "l", + "lrs", + 0, "sl", + 0, "sl", + "l", + 1, -4, "sl", + 1, -4, + "l", + 2, -3, "sl", + 2, -3, + "l", + 3, -2, "sl", + 3, -2, + + "l", + 0, -1, "sl", + 4, -1, + "l", + 1, "sl", + 5, + "l", + 2, "sl", + 6, + "l", + 3, "sl", + 7, + + "l", "l", + 4, 0, "sl", + 5, 0, + "l", "l", + 4, 1, + 5, 1, + "l", "l", + 6, 2, + 7, 2, + "l", "l", + 6, 3, + 7, 3, + 8+0, + 8+0, + 8+1, 4, + 8+1, 4, + 8+2, 5, + 8+2, 5, + 8+3, 6, + 8+3, 6, + + "lrs", + 8+0, 7, + 8+4, 7, + "lrs", + 8+1, + 8+5, + 8+2, + 8+6, + 8+3, + 8+7, + 8+4, 8, + 8+5, 8, + 8+4, 9, + 8+5, 9, + 8+6, 10, + 8+7, 10, + 8+6, 11, + 8+7, 11, + "frs", + "frs", + 16+0, + 16+0, + 16+1, 12, + 16+1, 12, + 16+2, 13, + 16+2, 13, + 16+3, 14, + 16+3, 14, + + "lrs", + 16+0, 15, + 16+4, 15, + 16+1, + 16+5, + "lrs", + 16+2, + 16+6, + "lrs", + 16+3, + 16+7, + "lrs", + 16+4, 16, + 16+5, 16, + 16+4, 17, + 16+5, 17, + 16+6, 18, + 16+7, 18, + 16+6, 19, + 16+7, 19, + "frs", + "frs", + 24+0, + 24+0, + 24+1, 20, + 24+1, 20, + 24+2, 21, + 24+2, 21, + 24+3, 22, + 24+3, 22, + "frs", + "frs", + + 24+0, 23, + 24+4, 23, + 24+1, 24, + 24+5, 24, + 24+2, 25, + 24+6, 25, + 24+3, 26, + 24+7, 26, + + "lre", + 24+4, 27, + 24+5, 27, + 24+4, "s", + 24+5, "s", + 24+6, "s", + 24+7, "s", + 24+6, "s", + 24+7, "s", + 
] }, + + # INDEX 14 + # Variant of 12, insert some NOPs + 14 : { "load_order": [14, 15, 12,13, 8,9,10, 11, 6,7,4,5,0,1,2,3], # Load order + "store_order": [5,4,7,6,3,2,1,0,9,8,11,10,13,12,15,14], + "numbering": list(zip( + [6, 7, 4, 5, 0,1, 2, 3, 2,3,0,1,10,11, 8, 9, 4,5,0,1, 8, 9,12,13, 6,4,2,0, 8,10,12,14], + [14, 15, 12,13,8,9,10,11, 6,7,4,5,14,15,12,13, 6,7,2,3,10,11,14,15, 7,5,3,1, 9,11,13,15], + [0, 0, 0, 0, 0, 0, 0, 0, 1,1,1,1, 2, 2, 2, 2, 4,4,3,3, 5, 5, 6, 6, 10,9,8,7,11,12,13,14])), + "schedule": + ["m", "frl", + "l", + "lrs", + "lrs", + 0, "sl", + 0, "sl", + "l", + 1, -4, "sl", + 1, -4, + "l", + 2, -3, "sl", + 2, -3, + "l", + 3, -2, "sl", + 3, -2, + + "l", + 0, -1, "sl", + 4, -1, + "l", + 1, "sl", + 5, "nop", + "l", + 2, "sl", + 6, "nop", + "l", + 3, "sl", + 7, "nop", + + "l", "l", + 4, 0, "sl", + 5, 0, + "l", "l", + 4, 1, + 5, 1, + "l", "l", + 6, 2, + 7, 2, + "l", "l", + 6, 3, + 7, 3, + 8+0, "nop", + 8+0, "nop", + 8+1, 4, + 8+1, 4, + 8+2, 5, + 8+2, 5, + 8+3, 6, + 8+3, 6, + + 8+0, 7, + 8+4, 7, + 8+1, "nop", + 8+5, "nop", + 8+2, "nop", + 8+6, "nop", + 8+3, "nop", + 8+7, "nop", + "lrs", + "lrs", + 8+4, 8, + 8+5, 8, + 8+4, 9, + 8+5, 9, + 8+6, 10, + 8+7, 10, + 8+6, 11, + 8+7, 11, + "frs", + "frs", + 16+0, "nop", + 16+0, "nop", + 16+1, 12, + 16+1, 12, + 16+2, 13, + 16+2, 13, + 16+3, 14, + 16+3, 14, + + "lrs", + "lrs", + 16+0, 15, + 16+4, 15, + 16+1, "nop", + 16+5, "nop", + 16+2, "nop", + 16+6, "nop", + 16+3, "nop", + 16+7, "nop", + "lrs", + "lrs", + 16+4, 16, + 16+5, 16, + 16+4, 17, + 16+5, 17, + 16+6, 18, + 16+7, 18, + 16+6, 19, + 16+7, 19, + "frs", + "frs", + 24+0, "nop", + 24+0, "nop", + 24+1, 20, + 24+1, 20, + 24+2, 21, + 24+2, 21, + 24+3, 22, + 24+3, 22, + "frs", + "frs", + + 24+0, 23, + 24+4, 23, + 24+1, 24, + 24+5, 24, + 24+2, 25, + 24+6, 25, + 24+3, 26, + 24+7, 26, + + 24+4, 27, + 24+5, 27, + 24+4, "s", + 24+5, "s", + 24+6, "s", + 24+7, "s", + 24+6, "s", + 24+7, "s", + ] }, + + # INDEX 15 + # Different butterfly ordering + 15 : { "load_order": [15, 14, 
13,12, 11, 10, 9, 8, 7,6,5,4,3,2,1,0], + "store_order": [15, 14, 13,12, 11, 10, 9, 8, 7,6,5,4,3,2,1,0], + "numbering": list(zip( + [ 7, 6, 5, 4, 3, 2, 1, 0, 11,10, 9, 8,3,2,1,0, 13,12, 9, 8,5,4,1,0, 14, 12, 10, 8, 6, 4, 2, 0], + [15, 14, 13, 12, 11, 10, 9, 8, 15,14,13,12,7,6,5,4, 15,14,11,10,7,6,3,2, 15, 13, 11, 9, 7, 5, 3, 1], + [ 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2,1,1,1,1, 6, 6, 5, 5,4,4,3,3, 14, 13, 12,11,10, 9, 8, 7])), + "root_load_order": [0,1,3,2], # Root load order + "schedule": + ["m", "frl", + + "l", + "lrs", + "lrs", + 0, "sl", + 0, "sl", + "l", + 1, -4, "sl", + 1, -4, + "l", + 2, -3, "sl", + 2, -3, + "l", + 3, -2, "sl", + 3, -2, + + "l", + 0, -1, "sl", + 4, -1, + "l", + 1, "sl", + 5, "nop", + "l", + 2, "sl", + 6, "nop", + "l", + 3, "sl", + 7, "nop", + + "l", "l", + 4, 0, "sl", + 5, 0, + "l", "l", + 4, 1, + 5, 1, + "l", "l", + 6, 2, + 7, 2, + "l", "l", + 6, 3, + 7, 3, + 8+0, "nop", + 8+0, "nop", + 8+1, 4, + 8+1, 4, + 8+2, 5, + 8+2, 5, + 8+3, 6, + 8+3, 6, + + 8+0, 7, + 8+4, 7, + 8+1, "nop", + 8+5, "nop", + 8+2, "nop", + 8+6, "nop", + 8+3, "nop", + 8+7, "nop", + "lrs", + "lrs", + 8+4, 8, + 8+5, 8, + 8+4, 9, + 8+5, 9, + 8+6, 10, + 8+7, 10, + 8+6, 11, + 8+7, 11, + "frs", + "frs", + 16+0, "nop", + 16+0, "nop", + 16+1, 12, + 16+1, 12, + 16+2, 13, + 16+2, 13, + 16+3, 14, + 16+3, 14, + + "lrs", + "lrs", + 16+0, 15, + 16+4, 15, + 16+1, "nop", + 16+5, "nop", + 16+2, "nop", + 16+6, "nop", + 16+3, "nop", + 16+7, "nop", + "lrs", + "lrs", + 16+4, 16, + 16+5, 16, + 16+4, 17, + 16+5, 17, + 16+6, 18, + 16+7, 18, + 16+6, 19, + 16+7, 19, + "frs", + "frs", + 24+0, "nop", + 24+0, "nop", + 24+1, 20, + 24+1, 20, + 24+2, 21, + 24+2, 21, + 24+3, 22, + 24+3, 22, + "frs", + "frs", + + 24+0, 23, + 24+4, 23, + 24+1, 24, + 24+5, 24, + 24+2, 25, + 24+6, 25, + 24+3, 26, + 24+7, 26, + + 24+4, 27, + 24+5, 27, + 24+4, "s", + 24+5, "s", + 24+6, "s", + 24+7, "s", + 24+6, "s", + 24+7, "s", + ] }, + + # INDEX 16 + # Different butterfly ordering + 16 : { "load_order": [15, 14, 13,12, 11, 10, 
9, 8, 7,6,5,4,3,2,1,0], + "store_order": [15, 14, 13,12, 11, 10, 9, 8, 7,6,5,4,3,2,1,0], + "numbering": list(zip( + [ 7, 6, 5, 4, 3, 2, 1, 0, 11,10,3,2, 9, 8,1,0, 13, 9,5,1,12, 8,4,0, 14, 12, 10, 8, 6, 4, 2, 0], + [15, 14, 13, 12, 11, 10, 9, 8, 15,14,7,6,13,12,5,4, 15,11,7,3,14,10,6,2, 15, 13, 11, 9, 7, 5, 3, 1], + [ 0, 0, 0, 0, 0, 0, 0, 0, 2, 2,1,1, 2, 2,1,1, 6, 5,4,3, 6, 5,4,3, 14, 13, 12,11,10, 9, 8, 7])), + "root_load_order": [0,1,3,2], # Root load order + "schedule": + ["m", "frl", + + "l", + "lrs", + "lrs", + 0, "sl", + 0, "sl", + "l", + 1, -4, "sl", + 1, -4, + "l", + 2, -3, "sl", + 2, -3, + "l", + 3, -2, "sl", + 3, -2, + + "l", + 0, -1, "sl", + 4, -1, + "l", + 1, "sl", + 5, "nop", + "l", + 2, "sl", + 6, "nop", + "l", + 3, "nop", + 7, "sl", + + "l", "l", + 4, 0, "sl", + 5, 0, + "l", "l", + 4, 1, + 5, 1, + "l", "l", + 6, 2, + 7, 2, + "l", "l", + 6, 3, + 7, 3, + 8+0, "nop", + 8+0, "nop", + 8+1, 4, + 8+1, 4, + 8+2, 5, + 8+2, 5, + 8+3, 6, + 8+3, 6, + + 8+0, 7, + 8+4, 7, + 8+1, "nop", + 8+5, "nop", + 8+2, "nop", + 8+6, "nop", + 8+3, "nop", + 8+7, "nop", + "lrs", + "lrs", + 8+4, 8, + 8+5, 8, + 8+4, 9, + 8+5, 9, + 8+6, 10, + 8+7, 10, + 8+6, 11, + 8+7, 11, + "frs", + "frs", + 16+0, "nop", + 16+0, "nop", + 16+1, 12, + 16+1, 12, + 16+2, 13, + 16+2, 13, + 16+3, 14, + 16+3, 14, + + "lrs", + "lrs", + 16+0, 15, + 16+4, 15, + 16+1, "nop", + 16+5, "nop", + 16+2, "nop", + 16+6, "nop", + 16+3, "nop", + 16+7, "nop", + "lrs", + "lrs", + 16+4, 16, + 16+5, 16, + 16+4, 17, + 16+5, 17, + 16+6, 18, + 16+7, 18, + 16+6, 19, + 16+7, 19, + "frs", + "frs", + 24+0, "nop", + 24+0, "nop", + 24+1, 20, + 24+1, 20, + 24+2, 21, + 24+2, 21, + 24+3, 22, + 24+3, 22, + "frs", + "frs", + + 24+0, 23, + 24+4, 23, + 24+1, 24, + 24+5, 24, + 24+2, 25, + 24+6, 25, + 24+3, 26, + 24+7, 26, + + 24+4, 27, + 24+5, 27, + 24+4, "s", + 24+5, "s", + 24+6, "s", + 24+7, "s", + 24+6, "s", + 24+7, "s", + ] }, + + # INDEX 17 + # Different butterfly ordering, space out non-MUL ops + 17 : { "load_order": [15, 14, 13,12, 
11, 10, 9, 8, 7,6,5,4,3,2,1,0], + "store_order": [15, 14, 13,12, 11, 10, 9, 8, 7,6,5,4,3,2,1,0], + "numbering": list(zip( + [ 7, 6, 5, 4, 3, 2, 1, 0, 11,10,3,2, 9, 8,1,0, 13, 9,5,1,12, 8,4,0, 14, 12, 10, 8, 6, 4, 2, 0], + [15, 14, 13, 12, 11, 10, 9, 8, 15,14,7,6,13,12,5,4, 15,11,7,3,14,10,6,2, 15, 13, 11, 9, 7, 5, 3, 1], + [ 0, 0, 0, 0, 0, 0, 0, 0, 2, 2,1,1, 2, 2,1,1, 6, 5,4,3, 6, 5,4,3, 14, 13, 12,11,10, 9, 8, 7])), + "root_load_order": [0,1,3,2], # Root load order + "schedule": ["m", "frl", + + "l", + 0, "sl", + 0, -4, + "l", + 1, "sl", + 1, -4, + "l", + 2, "sl", + 2, -3, + "l", + 3, "sl", + 3, -3, + + "l", + 0, "sl", + 4, -2, + "l", + 1, "sl", + 5, -2, + "l", + 2, "sl", + 6, -1, + "l", + 3, "sl", + 7, -1, + + "l", "l", + 4, 0, + 5, 0, + "l", "l", + 4, 1, + 5, 1, + "l", "l", + 6, 2, + 7, 2, + "l", "l", + 6, 3, + 7, 3, + 8+0, "nop", + 8+0, "nop", + 8+1, 4, + 8+1, 4, + 8+2, 5, + 8+2, 5, + 8+3, 6, + 8+3, 6, + + 8+0, 7, + 8+4, 7, + 8+1, "sl", + 8+5, "nop", + 8+2, "sl", + 8+6, "nop", + 8+3, "nop", + 8+7, "nop", + "lrs", + "lrs", + 8+4, 8, + 8+5, 8, + 8+4, 9, + 8+5, 9, + 8+6, 10, + 8+7, 10, + 8+6, 11, + 8+7, 11, + "frs", + "frs", + 16+0, "nop", + 16+0, "nop", + 16+1, 12, + 16+1, 12, + 16+2, 13, + 16+2, 13, + 16+3, 14, + 16+3, 14, + + "lrs", + "lrs", + 16+0, 15, + 16+4, 15, + 16+1, "nop", + 16+5, "nop", + 16+2, "nop", + 16+6, "nop", + 16+3, "nop", + 16+7, "nop", + "lrs", + "lrs", + 16+4, 16, + 16+5, 16, + 16+4, 17, + 16+5, 17, + 16+6, 18, + 16+7, 18, + 16+6, 19, + 16+7, 19, + "frs", + "frs", + 24+0, "nop", + 24+0, "nop", + 24+1, 20, + 24+1, 20, + 24+2, 21, + 24+2, 21, + 24+3, 22, + 24+3, 22, + "frs", + "frs", + + 24+0, 23, + 24+4, 23, + 24+1, 24, + 24+5, 24, + 24+2, 25, + 24+6, 25, + 24+3, 26, + 24+7, 26, + + 24+4, 27, "lres", + 24+5, 27, "lres", + 24+4, "s", + 24+5, "s", + 24+6, "s", + 24+7, "s", + 24+6, "s", + 24+7, "s", + ] }, + + # INDEX 18 + # Based on 17, change interleaving to always have 2x MULs next to each other + 18 : { "load_order": [15, 14, 13,12, 11, 10, 
9, 8, 7,6,5,4,3,2,1,0], + "store_order": [15, 14, 13,12, 11, 10, 9, 8, 7,6,5,4,3,2,1,0], + "numbering": list(zip( + [ 7, 6, 5, 4, 3, 2, 1, 0, 11,10,3,2, 9, 8,1,0, 13, 9,5,1,12, 8,4,0, 14, 12, 10, 8, 6, 4, 2, 0], + [15, 14, 13, 12, 11, 10, 9, 8, 15,14,7,6,13,12,5,4, 15,11,7,3,14,10,6,2, 15, 13, 11, 9, 7, 5, 3, 1], + [ 0, 0, 0, 0, 0, 0, 0, 0, 2, 2,1,1, 2, 2,1,1, 6, 5,4,3, 6, 5,4,3, 14, 13, 12,11,10, 9, 8, 7])), + "root_load_order": [0,1,3,2], # Root load order + "schedule": ["m", "frl", + + "l", + 0,0, "sl", + -4, + "l", + 1,1, "sl", + -4, + "l", + 2,2, "sl", + -3, + "l", + 3,3, "sl", + -3, + + "l", + 4,0, "sl", + -2, + "l", + 5,1, "sl", + -2, + "l", + 6,2, "sl", + -1, + "l", + 7,3, "sl", + -1, + + "l", "l", + 5,4, 0, + 0, + "l", "l", + 5,4, 1, + 1, + "l", "l", + 7,6, 2, + 2, + "l", "l", + 7,6, 3, + 3, + 8+0,8+0, 4, + "nop", + 8+1,8+1, 4, + "nop", + 8+2,8+2, 5, + 5, + 8+3,8+3, 6, + 6, + + 8+4,8+0, 7, + "sl", + 8+5,8+1, 7, + "nop", + 8+6,8+2, "sl", + "nop", + 8+7,8+3, "nop", + "nop", + "lrs", + "lrs", + 8+5,8+4, 8, + 8, + 8+5,8+4, 9, + 9, + 8+7,8+6, 10, + 10, + 8+7,8+6, 11, + 11, + "frs", + "frs", + 16+0,16+0, "nop", + "nop", + 16+1,16+1, 12, + 12, + 16+2,16+2, 13, + 13, + 16+3,16+3, 14, + 14, + + "lrs", + "lrs", + 16+4,16+0, 15, + 15, + 16+5,16+1, "nop", + "nop", + 16+6,16+2, "nop", + "nop", + 16+7,16+3, "nop", + "nop", + "lrs", + "lrs", + 16+5,16+4, 16, + 16, + 16+5,16+4, 17, + 17, + 16+7,16+6, 18, + 18, + 16+7,16+6, 19, + 19, + "frs", + "frs", + 24+0,24+0, "nop", + "nop", + 24+1,24+1, 20, + 20, + 24+2,24+2, 21, + 21, + 24+3,24+3, 22, + 22, + "frs", + "frs", + + 24+4,24+0, 23, + 23, + 24+5,24+1, 24, + "s", + 24+6,24+2, 24, + "s", + 24+7,24+3, 25, + "s", + + 24+5,24+4, 25, "lres", + 26, "lres", + 24+5,24+4, 26, + "s", + 24+7,24+6, 27, + "s", + 24+7,24+6, 27, + "s", + + # 24+4,24+0, 23, + # 23, + # 24+5,24+1, 24, + # "s", + # 24+6,24+2, 24, + # "s", + # 24+7,24+3, 25, + # "s", + + # 24+5,24+4, 25, "lres", + # "s", "lres", + # 24+5,24+4, 26, + # "s", + # 24+7,24+6, 27, 
+ # "s", + # 24+7,24+6, 27, + # "nop", + + ] }, + + # INDEX 19 + # Based on 18, but minor changes wrt placement of nop's. + 19 : { "load_order": [15, 14, 13,12, 11, 10, 9, 8, 7,6,5,4,3,2,1,0], + "store_order": [15, 14, 13,12, 11, 10, 9, 8, 7,6,5,4,3,2,1,0], + "numbering": list(zip( + [ 7, 6, 5, 4, 3, 2, 1, 0, 11,10,3,2, 9, 8,1,0, 13, 9,5,1,12, 8,4,0, 14, 12, 10, 8, 6, 4, 2, 0], + [15, 14, 13, 12, 11, 10, 9, 8, 15,14,7,6,13,12,5,4, 15,11,7,3,14,10,6,2, 15, 13, 11, 9, 7, 5, 3, 1], + [ 0, 0, 0, 0, 0, 0, 0, 0, 2, 2,1,1, 2, 2,1,1, 6, 5,4,3, 6, 5,4,3, 14, 13, 12,11,10, 9, 8, 7])), + "root_load_order": [0,1,3,2], # Root load order + "schedule": ["m", "frl", + + "l", + 0,0, "sl", + -4, + "l", + 1,1, "sl", + -4, + "l", + 2,2, "sl", + -3, + "l", + 3,3, "sl", + -3, + + "l", + 4,0, "sl", + -2, + "l", + 5,1, "sl", + -2, + "l", + 6,2, "sl", + -1, + "l", + 7,3, "sl", + -1, + + "l", "l", + 5,4, 0, + 0, + "l", "l", + 5,4, 1, + 1, + "l", "l", + 7,6, 2, + 2, + "l", "l", + 7,6, 3, + 3, + 8+0,8+0, 4, + "nop", + 8+1,8+1, 4, + "nop", + 8+2,8+2, 5, + 5, + 8+3,8+3, 6, + 6, + + 8+4,8+0, 7, + "sl", + 8+5,8+1, 7, + "nop", + 8+6,8+2, "sl", + "nop", + 8+7,8+3, "nop", + "nop", + "lrs", + "lrs", + 8+5,8+4, 8, + 8, + 8+5,8+4, 9, + 9, + 8+7,8+6, 10, + 10, + 8+7,8+6, 11, + 11, + "frs", + "frs", + 16+0,16+0, 12, + 12, + 16+1,16+1, 13, + 13, + 16+2,16+2, 14, + 14, + 16+3,16+3, 15, + 15, + + "lrs", + "lrs", + 16+4,16+0, "nop", + "nop", + 16+5,16+1, "nop", + "nop", + 16+6,16+2, "nop", + "nop", + 16+7,16+3, "nop", + "nop", + "lrs", + "lrs", + 16+5,16+4, 16, + 16, + 16+5,16+4, 17, + 17, + 16+7,16+6, 18, + 18, + 16+7,16+6, 19, + 19, + "frs", + "frs", + 24+0,24+0, 20, + 20, + 24+1,24+1, 21, + 21, + 24+2,24+2, 22, + 22, + 24+3,24+3, 23, + 23, + "frs", + "frs", + + 24+4,24+0, "nop", + "nop", + 24+5,24+1, 24, + "s", + 24+6,24+2, 24, + "s", + 24+7,24+3, 25, + "s", + + 24+5,24+4, 25, "lres", + 26, "lres", + 24+5,24+4, 26, + "s", + 24+7,24+6, 27, + "s", + 24+7,24+6, 27, + "s", + + ] }, + + # INDEX 20 + # Based on 
19, trying to balance issue queues a bit better + # by extending the overlapping of iterations. This takes off + # pressure from the add/sub issue queue, but increases load + # load of the mul issue queues + 20 : { "load_order": [15, 14, 13,12, 11, 10, 9, 8, 7,6,5,4,3,2,1,0], + "store_order": [15, 14, 13,12, 11, 10, 9, 8, 7,6,5,4,3,2,1,0], + "numbering": list(zip( + [ 7, 6, 5, 4, 3, 2, 1, 0, 11,10,3,2, 9, 8,1,0, 13, 9,5,1,12, 8,4,0, 14, 12, 10, 8, 6, 4, 2, 0], + [15, 14, 13, 12, 11, 10, 9, 8, 15,14,7,6,13,12,5,4, 15,11,7,3,14,10,6,2, 15, 13, 11, 9, 7, 5, 3, 1], + [ 0, 0, 0, 0, 0, 0, 0, 0, 2, 2,1,1, 2, 2,1,1, 6, 5,4,3, 6, 5,4,3, 14, 13, 12,11,10, 9, 8, 7])), + "root_load_order": [0,1,3,2], # Root load order + "schedule": + ["m", "frl", + + "l", + 0,0, -6, + "sl", + "l", + 1,1, -5, + "sl", + "l", + 2,2, -5, + "sl", + "l", + 3,3, "sl", + -4, + + "l", + 4,0, "sl", + -4, + "l", + 5,1, "sl", + -3, + "l", + 6,2, "sl", + -3, + "l", + 7,3, "sl", + -2, + + "l", "l", + 5,4, "sl", + -2, + "l", "l", + 5,4, "sl", + -1, + "l", "l", + 7,6, "sl", + -1, + "l", "l", + 7,6, 0, + 0, + 8+0,8+0, 1, + 1, + 8+1,8+1, 2, + 2, + 8+2,8+2, 3, + 3, + 8+3,8+3, 4, + 4, + + 8+4,8+0, 5, + 5, + 8+5,8+1, 6, + 6, + 8+6,8+2, 7, + "sl", + 8+7,8+3, 7, + "sl", + "lrs", + "lrs", + 8+5,8+4, 8, + 8, + 8+5,8+4, 9, + 9, + 8+7,8+6, 10, + 10, + 8+7,8+6, 11, + 11, + "frs", + "frs", + 16+0,16+0, 12, + 12, + 16+1,16+1, 13, + 13, + 16+2,16+2, 14, + 14, + 16+3,16+3, 15, + 15, + + "lrs", + "lrs", + 16+4,16+0, "nop", + "nop", + 16+5,16+1, "nop", + "nop", + 16+6,16+2, "nop", + "nop", + 16+7,16+3, "nop", + "nop", + "lrs", + "lrs", + 16+5,16+4, 16, + 16, + 16+5,16+4, 17, + 17, + 16+7,16+6, 18, + 18, + 16+7,16+6, 19, + 19, + "frs", + "frs", + 24+0,24+0, 20, + 20, + 24+1,24+1, 21, + 21, + 24+2,24+2, 22, + 22, + 24+3,24+3, 23, + 23, + "frs", + "frs", + + 24+4,24+0, "nop", + "nop", + 24+5,24+1, "nop", + "nop", + 24+6,24+2, "nop", + "nop", + 24+7,24+3, "nop", + "nop", + 24+5,24+4, 24, + "s", + 24+5,24+4, 24, + "s", + 24+7,24+6, 
25, + "s", + 24+7,24+6, 25, "lres", + 26, "lres", + + ] }, + + # INDEX 21 + # Based on 20, swapping some ADD/SUB and late stores + 21 : { "load_order": [15, 14, 13,12, 11, 10, 9, 8, 7,6,5,4,3,2,1,0], + "store_order": [15, 14, 13,12, 11, 10, 9, 8, 7,6,5,4,3,2,1,0], + "numbering": list(zip( + [ 7, 6, 5, 4, 3, 2, 1, 0, 11,10,3,2, 9, 8,1,0, 13, 9,5,1,12, 8,4,0, 14, 12, 10, 8, 6, 4, 2, 0], + [15, 14, 13, 12, 11, 10, 9, 8, 15,14,7,6,13,12,5,4, 15,11,7,3,14,10,6,2, 15, 13, 11, 9, 7, 5, 3, 1], + [ 0, 0, 0, 0, 0, 0, 0, 0, 2, 2,1,1, 2, 2,1,1, 6, 5,4,3, 6, 5,4,3, 14, 13, 12,11,10, 9, 8, 7])), + "root_load_order": [0,1,3,2], # Root load order + "schedule": + ["m", "frl", + + "l", + 0,0, -6, + "sl", + "l", + 1,1, -5, + "sl", + "l", + 2,2, -5, + "sl", + "l", + 3,3, "sl", + -4, + + "l", + 4,0, -4, + "sl", + "l", + 5,1, -3, + "sl", + "l", + 6,2, -3, + "sl", + "l", + 7,3, -2, + "sl", + + "l", "l", + 5,4, -2, + "sl", + "l", "l", + 5,4, -1, + "sl", + "l", "l", + 7,6, -1, + "sl", + "l", "l", + 7,6, 0, + 0, + 8+0,8+0, 1, + 1, + 8+1,8+1, 2, + 2, + 8+2,8+2, 3, + 3, + 8+3,8+3, 4, + 4, + + 8+4,8+0, 5, + 5, + 8+5,8+1, 6, + 6, + 8+6,8+2, "sl", + 7, + 8+7,8+3, "sl", + 7, + "lrs", + "lrs", + 8+5,8+4, 8, + 8, + 8+5,8+4, 9, + 9, + 8+7,8+6, 10, + 10, + 8+7,8+6, 11, + 11, + "frs", + "frs", + 16+0,16+0, 12, + 12, + 16+1,16+1, 13, + 13, + 16+2,16+2, 14, + 14, + 16+3,16+3, 15, + 15, + + "lrs", + "lrs", + 16+4,16+0, "nop", + "nop", + 16+5,16+1, "nop", + "nop", + 16+6,16+2, "nop", + "nop", + 16+7,16+3, "nop", + "nop", + "lrs", + "lrs", + 16+5,16+4, 16, + 16, + 16+5,16+4, 17, + 17, + 16+7,16+6, 18, + 18, + 16+7,16+6, 19, + 19, + "frs", + "frs", + 24+0,24+0, 20, + 20, + 24+1,24+1, 21, + 21, + 24+2,24+2, 22, + 22, + 24+3,24+3, 23, + 23, + "frs", + "frs", + + 24+4,24+0, "nop", + "nop", + 24+5,24+1, "nop", + "nop", + 24+6,24+2, "nop", + "nop", + 24+7,24+3, "nop", + "nop", + 24+5,24+4, 24, + "s", + 24+5,24+4, 24, + "s", + 24+7,24+6, 25, + "s", + 24+7,24+6, 25, "lres", + 26, "lres", + + ] }, + + # INDEX 22 + 
# Omit some late stores to smoothen transition + # to next layers. + 22 : { + "load_order": [15, 14, 13,12, 11, 10, 9, 8, 7,6,5,4,3,2,1,0], + "store_order": [15, 14, 13,12, 11, 10, 9, 8, 7,6,5,4,3,2,1,0], + "store_order_last": [15, 14, 13,12, 11, 10, 9, 8, 7,6,5,4], + "numbering": list(zip( + [ 7, 6, 5, 4, 3, 2, 1, 0, 11,10,3,2, 9, 8,1,0, 13, 9,5,1,12, 8,4,0, 14, 12, 10, 8, 6, 4, 2, 0], + [15, 14, 13, 12, 11, 10, 9, 8, 15,14,7,6,13,12,5,4, 15,11,7,3,14,10,6,2, 15, 13, 11, 9, 7, 5, 3, 1], + [ 0, 0, 0, 0, 0, 0, 0, 0, 2, 2,1,1, 2, 2,1,1, 6, 5,4,3, 6, 5,4,3, 14, 13, 12,11,10, 9, 8, 7])), + + "root_load_order": [0,1,3,2], # Root load order + + "schedule": + ["m", "frl", + "l", + 0,0, -6, + "sl", + "l", + 1,1, -5, + "sl", + "l", + 2,2, -5, + "sl", + "l", + 3,3, -4, + "sl", + + "l", + 4,0, -4, + "sl", + "l", + 5,1, -3, + "sl", + "l", + 6,2, -3, + "sl", + "l", + 7,3, -2, + "sl", + + "l", -2, + 5,4, "l", + "sl", + "l", "l", + 5,4, "sl", + -1, + "l", "l", + 7,6, "sl", + -1, + "l", "l", + 7,6, 0, + 0, + 8+0,8+0, 1, + 1, + 8+1,8+1, 2, + 2, + 8+2,8+2, 3, + 3, + 8+3,8+3, 4, + 4, + + 8+4,8+0, 5, + 5, + 8+5,8+1, 6, + 6, + 8+6,8+2, 7, + "sl", + 8+7,8+3, 7, + "sl", + "lrs", + "lrs", + 8+5,8+4, 8, + 8, + 8+5,8+4, 9, + 9, + 8+7,8+6, 10, + 10, + 8+7,8+6, 11, + 11, + "frs", + "frs", + 16+0,16+0, 12, + 12, + 16+1,16+1, 13, + 13, + 16+2,16+2, 14, + 14, + 16+3,16+3, 15, + 15, + + "lrs", + "lrs", + 16+4,16+0, "nop", + "nop", + 16+5,16+1, "nop", + "nop", + 16+6,16+2, "nop", + "nop", + 16+7,16+3, "nop", + "nop", + "lrs", + "lrs", + 16+5,16+4, 16, + 16, + 16+5,16+4, 17, + 17, + 16+7,16+6, 18, + 18, + 16+7,16+6, 19, + 19, + "frs", + "frs", + 24+0,24+0, 20, + 20, + 24+1,24+1, 21, + 21, + 24+2,24+2, 22, + 22, + 24+3,24+3, 23, + 23, + "frs", + "frs", + + 24+4,24+0, "nop", + "nop", + 24+5,24+1, "nop", + "nop", + 24+6,24+2, "nop", + "nop", + 24+7,24+3, "nop", + "nop", + 24+5,24+4, 24, + "s", + 24+5,24+4, 24, + "s", + 24+7,24+6, 25, + "s", + 24+7,24+6, 25, "lres", + 26, "lres", + + ] }, + + # INDEX 
23 + # Omit some late stores to smoothen transition + # to next layers. + 23 : { + "load_order": [15, 14, 13,12, 11, 10, 9, 8, 7,6,5,4,3,2,1,0], + "store_order": [15, 14, 13,12, 11, 10, 9, 8, 7,6,5,4,3,2,1,0], + "store_order_last": [15, 14, 13,12, 11, 10, 9, 8, 7,6,5,4], + "numbering": list(zip( + [ 7, 6, 5, 4, 3, 2, 1, 0, 11,10,3,2, 9, 8,1,0, 13, 9,5,1,12, 8,4,0, 14, 12, 10, 8, 6, 4, 2, 0], + [15, 14, 13, 12, 11, 10, 9, 8, 15,14,7,6,13,12,5,4, 15,11,7,3,14,10,6,2, 15, 13, 11, 9, 7, 5, 3, 1], + [ 0, 0, 0, 0, 0, 0, 0, 0, 2, 2,1,1, 2, 2,1,1, 6, 5,4,3, 6, 5,4,3, 14, 13, 12,11,10, 9, 8, 7])), + + "root_load_order": [0,1,3,2], # Root load order + + "schedule": + ["m", "frl", + "l", + 0,0, -6, + "sl", + "l", + 1,1, -5, + "sl", + "l", + 2,2, -5, + "sl", + "l", + 3,3, -4, + "sl", + + "l", + 4,0, -4, + "sl", + "l", + 5,1, -3, + "sl", + "l", + 6,2, -3, + "sl", + "l", + 7,3, -2, + "sl", + + "l", "l", + 5,4, -2, + "sl", + "l", "l", + 5,4, "sl", + -1, + "l", "l", + 7,6, "sl", + -1, + "l", "l", + 7,6, 0, + 0, + 8+0,8+0, 1, + 1, + 8+1,8+1, 2, + 2, + 8+2,8+2, 3, + 3, + 8+3,8+3, 4, + 4, + + 8+4,8+0, 5, + 5, + 8+5,8+1, 6, + 6, + 8+6,8+2, 7, + "sl", + 8+7,8+3, 7, + "sl", + "lrs", + "lrs", + 8+5,8+4, 8, + 8, + 8+5,8+4, 9, + 9, + 8+7,8+6, 10, + 10, + 8+7,8+6, 11, + 11, + "frs", + "frs", + 16+0,16+0, 12, + 12, + 16+1,16+1, 13, + 13, + 16+2,16+2, 14, + 14, + 16+3,16+3, 15, + 15, + + "lrs", + "lrs", + 16+4,16+0, "nop", + "nop", + 16+5,16+1, "nop", + "nop", + 16+6,16+2, "nop", + "nop", + 16+7,16+3, "nop", + "nop", + "lrs", + "lrs", + 16+5,16+4, 16, + 16, + 16+5,16+4, 17, + 17, + 16+7,16+6, 18, + 18, + 16+7,16+6, 19, + 19, + "frs", + "frs", + 24+0,24+0, 20, + 20, + 24+1,24+1, 21, + 21, + 24+2,24+2, 22, + 22, + 24+3,24+3, 23, + 23, + "frs", + "frs", + + 24+4,24+0, "nop", + "nop", + 24+5,24+1, "nop", + "nop", + 24+6,24+2, "nop", + "nop", + 24+7,24+3, "nop", + "nop", + 24+5,24+4, 24, + "s", + 24+5,24+4, 24, + "s", + 24+7,24+6, 25, + "s", + 24+7,24+6, 25, "lres", + 26, "lres", + + ] }, + + # 
INDEX 24 + # Deliberately bad, bunch lots of MULs + 24 : { "schedule": + ["m", "frl", "l", "l", "l", "l", + "l", "l", "l", "l", + "l", "l", "l", "l", + "l", "l", "l", "l", + + 0, 0, 0, + 1, 1, 1, + 2, 2, 2, + 3, 3, 3, + 4, 4, 4, + 5, 5, 5, + 6, 6, 6, + 7, 7, 7, + 0, 0, + 1, 1, + 2, 2, + 3, 3, + 4, 4, + 5, 5, + 6, 6, + 7, 7, + + 8, 8, 8, + 9, 9, 9, + 10, 10, 10, + 11, 11, 11, + 12, 12, 12, + 13, 13, 13, + 14, 14, 14, + 15, 15, 15, + 8, 8, + 9, 9, + 10, 10, + 11, 11, + 12, 12, + 13, 13, + 14, 14, + 15, 15, + + + 16, 16, 16, + 17, 17, 17, + 18, 18, 18, + 19, 19, 19, + 20, 20, 20, + 21, 21, 21, + 22, 22, 22, + 23, 23, 23, + 16, 16, + 17, 17, + 18, 18, + 19, 19, + 20, 20, + 21, 21, + 22, 22, + 23, 23, + + 24, 24, 24, + 25, 25, 25, + 26, 26, 26, + 27, 27, 27, + 28, 28, 28, + 29, 29, 29, + 30, 30, 30, + 31, 31, 31, + 24, 24, + 25, 25, + 26, 26, + 27, 27, + 28, 28, + 29, 29, + 30, 30, + 31, 31, + + "lre", + + "s", "s", "s", "s", + "s", "s", "s", "s", + "s", "s", "s", "s", + "s", "s", "s", "s" ] }, + + } + + modification = modifications[idx] + + # for k,v in modification.items(): + # if not k in default.keys(): + # raise Exception(f"Invalid modification: {k}") + + dic = { **default, **modification } + + dic["load_order_first"] = dic.get("load_order_first", dic["load_order"]) + dic["store_order_last"] = dic.get("store_order_last", dic["store_order"]) + + return dic + + def run_schedule(self, gs_schedule, + last_butterfly_arr, butterfly_arr, next_butterfly_arr): + + # Process the operation array + for op in gs_schedule: + +# print(f"OP: {op}") + + if not isinstance(op,tuple): + op = (0,op) + + idx = op[0] + op = op[1] + + if last_butterfly_arr != None: + last_butterfly = last_butterfly_arr[idx] + else: + last_butterfly = None + + if butterfly_arr != None: + butterfly = butterfly_arr[idx] + else: + butterfly = None + + if next_butterfly_arr != None: + next_butterfly = next_butterfly_arr[idx] + else: + next_butterfly = None + + # Progress one of the GS butterflies + if 
isinstance(op,int): + idx = op + # Operation for current block of butterflies + if butterfly != None and idx >= 0 and idx < butterfly.num_gs: + yield from self.progress_arithmetic(butterfly,idx) + # Operation for last butterfly + if last_butterfly != None and idx < 0: + idx += last_butterfly.num_gs + yield from self.progress_arithmetic(last_butterfly,idx) + # Non GS operations (memory + transpose) + elif isinstance(op,str): + if op == "fnop": + if butterfly != None and last_butterfly == None: + yield "nop" + elif op == "nop": + if butterfly != None: + yield "nop" + elif op == "s": # Store + yield from self.store_input(butterfly, last=(next_butterfly==None)) + elif op == "sl": # Store late + yield from self.store_input(last_butterfly, last=(butterfly==None)) + elif op == "l": # Load + yield from self.load_input(butterfly, first=(last_butterfly==None)) + elif op == "le": # Load early + yield from self.load_input(next_butterfly) + elif op == "t": # Transpose + yield from self.get_transpose(butterfly) + elif op == "ts": # Transpose single + n = next(self.get_transpose(butterfly),None) + if n != None: + yield n + elif op == "lr": # Load root + yield from self.load_root_scalars(butterfly) + elif op == "lrs": # Load root single + n = next(self.load_root_scalars(butterfly),None) + if n != None: + yield n + elif op == "lre": # Load roots early + yield from self.load_root_scalars(next_butterfly) + elif op == "lres": # Load roots early single + n = next(self.load_root_scalars(next_butterfly),None) + if n != None: + yield n + elif op == "frl": # Free roots late + if butterfly == None or butterfly.load_roots: + list(self.free_root_scalars(last_butterfly)) + elif op == "fr": # Free roots + if next_butterfly == None or next_butterfly.load_roots: + list(self.free_root_scalars(butterfly)) + elif op == "frs": # Free roots single + if next_butterfly == None or next_butterfly.load_roots: + next(self.free_root_scalars(butterfly),None) + elif op == "m": # Move roots + if butterfly != 
None and not butterfly.load_roots: + self.copy_root_scalars(butterfly, last_butterfly) + else: + raise Exception("Unknown operation") + + + def free_root_scalars(self,butterfly): + + if butterfly == None: + return iter([]) + + if butterfly.root_vecs == None: + return iter([]) + + def free_roots(): + + l = len(butterfly.root_vecs) + + order = butterfly.root_load_order + + for i in range(0,l): + self.vregs.free(butterfly.root_vecs[order[i]]) + butterfly.root_vecs[order[i]] = None + yield + self.vregs.free(butterfly.root_twisted_vecs[order[i]]) + butterfly.root_twisted_vecs[order[i]] = None + yield + + butterfly.root_vecs = None + butterfly.root_twisted_vecs = None + + butterfly.root = None + butterfly.root_lane = None + butterfly.root_twisted = None + butterfly.root_twisted_lane = None + + if butterfly.free_root_scalars == None: + butterfly.free_root_scalars = free_roots() + + return butterfly.free_root_scalars + + def make_twiddle_accessors(self,butterfly,root_to_vec_idx_lane): + + def find_root(idx): + return butterfly.root_vecs[root_to_vec_idx_lane[idx][0]] + def find_root_twisted(idx): + return butterfly.root_twisted_vecs[root_to_vec_idx_lane[idx][0]] + def find_lane(idx): + return root_to_vec_idx_lane[idx][1] + + butterfly.root = find_root + butterfly.root_twisted = find_root_twisted + butterfly.root_lane = find_lane + butterfly.root_twisted_lane = find_lane + + def get_butterfly_list(self, layer_start, merged_layers): + + shuffle = False + + if layer_start + merged_layers > self.shuffle_boundary: + merged_layers = self.shuffle_boundary - layer_start + shuffle = True + + num_blocks = pow(2,layer_start) + block_size = self.size // num_blocks + vectors_per_butterfly = pow(2,merged_layers) + elements_per_butterfly = self.elements_per_vector * vectors_per_butterfly + butterflies_per_block = block_size // elements_per_butterfly + + block_stride = block_size // vectors_per_butterfly + + for block in range(0,num_blocks): + block_base = block * block_size + idxs = 
list(range(0,butterflies_per_block)) + # idxs = idxs[1::2] + idxs[0::2] + idxs = idxs[len(idxs)//2:] + idxs[:len(idxs)//2] + for i, idx in enumerate(idxs): + butterfly_base = block_base + idx * self.elements_per_vector + yield Butterfly(layer=layer_start, + merged=merged_layers, + block=block, + shuffle=shuffle, + base=butterfly_base, + stride=block_stride, + load_roots=(i==0)) + + def do_butterflies(self,schedule,butterflies,zip_type=1): + + if zip_type == 2: + half = len(butterflies) // 2 + butterflies = list(zip(butterflies[:half],butterflies[half:])) + elif zip_type == 4: + quarter = len(butterflies) // 4 + butterflies = list(zip(butterflies[0::4], + butterflies[1::4], + butterflies[2::4], + butterflies[3::4])) + # butterflies = list(zip(butterflies[0*quarter:1*quarter], + # butterflies[1*quarter:2*quarter], + # butterflies[2*quarter:3*quarter], + # butterflies[3*quarter:4*quarter])) + else: + butterflies = [[b] for b in butterflies] + + def get_butterfly(idx): + if idx < 0: + return None + if idx >= len(butterflies): + return None + return butterflies[idx] + + for i in range(-1,len(butterflies)+1): + cur_butterfly = get_butterfly(i) + last_butterfly = get_butterfly(i-1) + next_butterfly = get_butterfly(i+1) + yield from self.run_schedule(schedule, + last_butterfly, + cur_butterfly, + next_butterfly) + + def attach_butterfly_info_old(self, + butterflies, + load_order, load_order_first, + store_order, store_order_last, + numbering, twiddles, + root_load_order): + + dic = { "load_order": load_order, + "load_order_first": load_order_first, + "store_order": store_order, + "store_order_last": store_order_last, + "numbering": numbering, + "root_load_order": root_load_order, + "twiddles": twiddles } + + self.attach_butterfly_info(butterflies,dic) + + def attach_butterfly_info(self, + butterflies, + dic): + for b in butterflies: + b.load_order = dic["load_order"] + b.load_order_first = dic["load_order_first"] + b.store_order = dic["store_order"] + b.store_order_last = 
dic["store_order_last"] + b.root_load_order = dic["root_load_order"] + gs = [] + for i,j,r in dic["numbering"]: + gs.append(self.gs_butterfly_single(b,i,j,r)) + b.gs = gs + self.make_twiddle_accessors(b, dic["twiddles"]) + + def core(self): + + yield from self.init_constants() + + for (base,merge),(zipped,schedule_idx) in zip(self.layers,self.schedules): + + butterflies = list(self.get_butterfly_list(base,merge)) + + schedules = { + (0,3,1): self.get_schedule_triple_no_transpose, + (3,3,1): self.get_schedule_triple_no_transpose, + (0,4,1): self.get_schedule_quad_no_transpose, + (4,2,1): self.get_schedule_double_no_transpose, + (4,2,2): self.get_schedule_double_no_transpose_zipped, + (4,2,4): self.get_schedule_double_no_transpose_quad_zipped, + (4,4,1): self.get_schedule_quad_transpose, + (4,4,2): self.get_schedule_quad_transpose_zipped, + (4,4,4): self.get_schedule_quad_transpose_quad_zipped } + + sched_func = schedules[(base,merge,zipped)] + + s = sched_func(schedule_idx) + + if not type(s) is dict: + load_order, store_order, numbering, twiddles, root_load_order, schedule = s + load_order_first = load_order + store_order_last = store_order + sched_func(schedule_idx) + self.attach_butterfly_info_old(butterflies, load_order, load_order_first, + store_order, store_order_last, numbering, twiddles, root_load_order) + yield from self.do_butterflies(schedule,butterflies,zip_type=zipped) + else: + self.attach_butterfly_info(butterflies, s) + yield from self.do_butterflies(s["schedule"],butterflies,zip_type=zipped) + + self.vregs.revfree() + + def standalone(self,funcname): + + # Preamble + yield from Snippets.license() + yield from Snippets.autogen_warning() + + yield "#include " + + yield from self.generate_constants() + yield from Snippets.function_decl(funcname) + + yield from Snippets.function_header(funcname) + yield from Snippets.save_gprs() # Not necessary + yield from Snippets.save_vregs() + + self.gprs.alloc(self._src) + + self.prepare_constants() + + # Actual 
code + yield from self.core() + + # Wrapup + self.free_constants() + + self.gprs.free(self._src) + + yield from Snippets.restore_vregs() + yield from Snippets.restore_gprs() # Not necessary + yield from Snippets.function_footer() + + def get_code(self): + gen = self.standalone() + for line in gen: + print(line) + +def main(argv): + + outfile = None + degree = None + + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("--out", type=str, default=None) + parser.add_argument("--schedule", type=str, default="0,0") + parser.add_argument("--layers", type=str, default="3,3") + parser.add_argument("size", type=int) + parser.add_argument("modulus", type=int) + parser.add_argument("root", type=int) + parser.add_argument("symbol", type=str) + + args = parser.parse_args() + + code_all = [] + code_essential = [] + + line_count = 0; + + args.layers = list(map(int,args.layers.split(','))) + args.schedule = list(args.schedule.split(',')) + + ntt = NTT(args.size,args.modulus,args.root,layers=args.layers,schedules=args.schedule) + code_gen = ntt.standalone(args.symbol) + + for line in code_gen: + code_all.append(line) + + def is_code_line(line): + if len(line) < 2: + return False + if line[0:2] == '//': + return False + return True + + code_essential = filter(is_code_line, code_all) + line_count_total = len(list(code_all)) + line_count_essential = len(list(code_essential)) + + code_all.append(f'') + code_all.append(f'// Line count: {line_count_total}') + code_all.append(f'// Instruction count: {line_count_essential}') + + code_all_str = "\n".join(code_all) + + if not args.out == None: + f = open(args.out,"w") + f.write(code_all_str) + f.close() + else: + print(code_all_str) + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/asm/scripts/ntt_sve2/ntt_sve2.py b/asm/scripts/ntt_sve2/ntt_sve2.py new file mode 100644 index 0000000..e141afd --- /dev/null +++ b/asm/scripts/ntt_sve2/ntt_sve2.py @@ -0,0 +1,6249 @@ +# 
Copyright (c) 2021 Arm Limited +# SPDX-License-Identifier: MIT + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import sys, argparse, math, traceback + +class Snippets(): + + def autogen_warning(): + warning = """ +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// +""" + yield warning + + def license(): + yield """ +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE +""" + + def function_decl(func_name): + yield f'.text' + yield f'.type {func_name}, %function' + yield f'.global {func_name}' + + def function_header(func_name): + yield f"{func_name}:" + + def function_footer(): + yield 'ret' + + def save_gprs(): + yield '// Save GPRs' + yield "sub sp, sp, #(16*5+16)" + yield "stp x19, x20, [sp, #16*0]" + yield "stp x19, x20, [sp, #16*0]" + yield "stp x21, x22, [sp, #16*1]" + yield "stp x23, x24, [sp, #16*2]" + yield "stp x25, x26, [sp, #16*3]" + yield "stp x27, x28, [sp, #16*4]" + yield "str x29, [sp, #16*5]" + + def restore_gprs(): + + # # TODO: Update + yield '// Restore GPRs' + yield "ldp x19, x20, [sp, #16*0]" + yield "ldp x21, x22, [sp, #16*1]" + yield "ldp x23, x24, [sp, #16*2]" + yield "ldp x25, x26, 
[sp, #16*3]" + yield "ldp x27, x28, [sp, #16*4]" + yield "ldr x29, [sp, #16*5]" + yield "add sp, sp, #(16*5+16)" + + def save_vregs(): + # TODO: Update + yield '// Save SVE2 vector registers' + yield "sub sp, sp, #(16*4)" + yield "stp d8, d9, [sp, #16*0]" + yield "stp d10, d11, [sp, #16*1]" + yield "stp d12, d13, [sp, #16*2]" + yield "stp d14, d15, [sp, #16*3]" + + def restore_vregs(): + # TODO: Update + yield '// Restore SVE2 vector registers' + yield "ldp d8, d9, [sp, #16*0]" + yield "ldp d10, d11, [sp, #16*1]" + yield "ldp d12, d13, [sp, #16*2]" + yield "ldp d14, d15, [sp, #16*3]" + yield "add sp, sp, #(16*4)" + +class RegList(): + + def __init__(self, regs): + + self._regs = regs + + self._free = [] + for r in regs: + self._free.append(r) + + self._alloc = [] + + def alloc(self,reg=None,constraint=None,lax=False): + + if constraint == None: + constraint = lambda _: True + + if reg == None: + reg_idx = None + for i,r in enumerate(self._free): + if not constraint(r): + continue + reg_idx = i + if reg_idx == None: + if not lax or len(self._free) == 0: + raise Exception("No more free registers") + print("WARNING: Have to disregard preference") + reg_idx = len(self._free)-1 + reg = self._free.pop(reg_idx) + else: + if not reg in self._free: + raise Exception(f"Register {reg} already allocated") + if not constraint(reg): + raise Exception(f"Register {reg} doesn't satisfy constraint") + self._free.remove(reg) + + self._alloc.append(reg) + +# print(f"Allocated: {len(self._alloc)}") +# print(f"Free: {len(self._free)}") + + return reg + + def revfree(self): + self._free.reverse() + + def free(self,reg): + if reg not in self._regs: + raise Exception("Invalid register") + if not reg in self._alloc: + raise Exception("Register not allocated") + self._alloc.remove(reg) + self._free.append(reg) + +class Butterfly(): + + def __init__(self,base,stride,block,layer,merged,load_roots=False,shuffle=False): + + self.layer = layer + self.merged = merged + self.block = block + + 
self.num_gs = merged * pow(2,merged-1) + if shuffle: + self.num_gs *= 2 + + self.base = base + self.stride = stride + self.load_roots = load_roots + + self.load_idx = 0 + self.store_idx = 0 + self.scalar_load = None + self.transpose = None + self.free_root_scalars = None + + def __getitem__(self,idx): + return self.base + idx * self.stride + + def __repr__(self): + return f"[{self.layer}:{self.block}]: {[self[i] for i in range(0,4)]}]" + +class NTT(): + + def __init__(self,size,modulus,root,schedules=[0,0], layers=[3,3], bitwidth=32): + + self.size = size + self.bitwidth = bitwidth + self.R = 2**bitwidth + self.vector_bitlen = 128 + self.vector_bytelen = self.vector_bitlen // 8 + self.elements_per_vector = self.vector_bitlen // self.bitwidth + + self.interleave_twiddles = True + + # Determine layer at which NTT requires intra-vector shuffling + self.shuffle_boundary = int(math.log(self.size,2) - math.log(self.elements_per_vector, 2)) + + if self.bitwidth == 64: + self.data_prefix = "dword" + self.vector_suffix = "d" + self.element_size = 8 + elif self.bitwidth == 32: + self.data_prefix = "word" + self.vector_suffix = "s" + self.element_size = 4 + elif self.bitwidth == 16: + self.data_prefix = "half" + self.vector_suffix = "h" + self.element_size = 2 + + self.root = root + self.modulus = modulus + + self.data = {} + self._src = 0 + + # Alignment for arrays of twiddle factors + self.root_align = 64 + self.root_offset = 0 # 32 + + # Layer configuration + last_layer = 0 + self.layers = [] + for l in layers: + self.layers.append((last_layer,l)) + last_layer += l + + # Schedule configuration + self.schedules = [] + for s in schedules: + if s[:3] == "z2_": + self.schedules.append((2,int(s[3:]))) + elif s[:3] == "z4_": + self.schedules.append((4,int(s[3:]))) + else: + self.schedules.append((1,int(s))) + + if len(self.schedules) != len(self.layers): + raise Exception("Bad configuration") + + # We only support + # - not crossing the shuffle boundary + # - crossing it by 
exactly 2 layers + self.check_layer_config() + + # Whether to use growing immediate offsets or + # post-increment loads for the twiddles. + self.increment_root_ptr = False # Immediate offsets + # self.increment_root_ptr = True # Post-increment + # self.multi_access_strategy = 0 # Only relevant if increment_root_ptr == True + + vregs = list(range(4,8)) + list(range(8,16)) + list(range(0,4)) + list(range(16,32)) + # vregs = list(range(0,32)) + self.vregs = RegList(vregs) + self.gprs = RegList(list(range(0,18))) + + if self.modulus % 2 == 0: + raise Exception("Modulus must be odd") + if pow(root, 2*size, modulus) != 1: + raise Exception(f"{root} is not a primitive {2*size}-th root of unity modulo {modulus}") + if pow(root, size, modulus) == 1: + raise Exception(f"{root} is not a primitive {2*size}-th root of unity modulo {modulus}") + + def is_power_of_2(n): + if n == 1: + return True + if n % 2 == 1: + raise False + return is_power_of_2(n//2) + if not is_power_of_2(size) or size <= 4: + raise Exception(f"NTT size must be a power of 2, but {size} isn't") + + self.inv_mod = pow(self.modulus, -1, self.R) + + self.log2size = int(math.log(size,2)) + + def check_layer_config(self): + for (base_layer, num_merge) in self.layers: + end_layer = base_layer + num_merge + if end_layer > self.shuffle_boundary and end_layer != self.shuffle_boundary + 2: + raise Exception("Unsupported layer configuration") + + def prepare_constants(self): + self.modulus_vector = self.vregs.alloc(constraint=self.zreg_lane_hi,lax=True) + + def free_constants(self): + self.vregs.free(self.modulus_vector) + + def root_of_unity_for_block(self,layer,block): + + def reverse_bit(num,width): + result = 0 + while width > 0: + result = (result << 1) + (num & 1) + num >>= 1 + width -= 1 + return result + + log = reverse_bit(pow(2,layer) + block, self.log2size) + root = pow(self.root, log, self.modulus) + + def res_even_frac(c,n): + res = c % n + if res >= n // 2: + res -= n // 2 + if res % 2 != 0: + if res < 0: 
+ res += n + else: + res -= n + return res + + def even_frac(c,n): + res = res_even_frac(c,n) + return (c - res)//n + + root_twisted = even_frac(root * self.R, self.modulus) % self.R + root_twisted = root_twisted // 2 + + return log, root, root_twisted + + def generate_constants(self): + + prefix = self.data_prefix + + yield "modulus:" + yield f".{prefix} {-self.modulus}" + yield f".{prefix} {-self.modulus}" + yield f".{prefix} {-self.modulus}" + yield f".{prefix} {-self.modulus}" + + root_asm = [] + root_twisted_asm = [] + + def append_root(layer,block): + nonlocal root_asm, root_twisted_asm + if layer == None: + root, root_twisted = 0,0 + else: + _, root, root_twisted = self.root_of_unity_for_block(layer,block) + new_asm_root = f".{prefix} {root} // Layer {layer}, block {block}" + new_asm_twist = f".{prefix} {root_twisted} // Layer {layer}, block {block}" + root_asm.append(new_asm_root) + root_twisted_asm.append(new_asm_twist) + + def roots_for_merged_layers(start_layer, num_layers): + + for block in range(0, pow(2,start_layer)): + + start_len = len(root_asm) + + for layer in range(0,num_layers): + cur_layer = start_layer + layer + + # TODO: Document + if cur_layer >= self.shuffle_boundary and self.bitwidth == 16: + multiply = 2 + else: + multiply = 1 + + roots_in_layer = pow(2,layer) + idx_seq = list(range(0,roots_in_layer)) + + # TODO: Document + if self.shuffle_boundary < cur_layer: + idx_seq = idx_seq[::2] + idx_seq[1::2] + + if roots_in_layer == self.elements_per_vector: + # Add padding + append_root(None,None) + for idx in idx_seq: + for _ in range(0,multiply): + append_root(start_layer + layer, + roots_in_layer * block + idx) + + end_len = len(root_asm) + mod = (end_len - start_len) % self.elements_per_vector + + if mod != 0: + for _ in range(self.elements_per_vector - mod): + append_root(None,None) + + end_len = len(root_asm) + mod = (end_len - start_len) % self.elements_per_vector + if mod != 0: + raise Exception("Something went wrong") + + 
vectors_emitted = (end_len - start_len)//self.elements_per_vector + self.vector_storage_per_block_at_layer[start_layer] = vectors_emitted + + self.root_offset_for_layer = {} + self.vector_storage_per_block_at_layer = {} + + # Build twiddle factors for given layer configuration + for base,merged in self.layers: + self.root_offset_for_layer[base] = len(root_asm) * self.element_size + roots_for_merged_layers(base, merged) + + align_log2 = int(math.log(self.root_align,2)) + align_offset = self.root_offset // (self.bitwidth//8) + + if not self.interleave_twiddles: + yield f".align {align_log2}" + yield "roots:" + yield from root_asm + yield f".align {align_log2}" + yield "roots_twisted:" + yield from root_twisted_asm + + else: + + def chunks(lst,size): + for i in range(0,len(lst),size): + yield lst[i:i+size] + + root_blocks = list(chunks(root_asm,self.elements_per_vector)) + root_twisted_blocks = list(chunks(root_twisted_asm,self.elements_per_vector)) + + roots = zip(root_blocks,root_twisted_blocks) + roots = [ e for p in roots for b in p for e in b] + + yield f".align {align_log2}" + for _ in range(0,align_offset): + yield f".{self.data_prefix} 0" + yield "roots_merged:" + yield from roots + + def init_constants(self): + + modulus_base = self.gprs.alloc() + yield f"ldr x{modulus_base}, modulus_addr" + yield f"ldr q{self.modulus_vector}, [x{modulus_base}]" + self.gprs.free(modulus_base) + + self.ptrue = "P0" + yield f"ptrue {self.ptrue}.{self.vector_suffix}" + + if not self.interleave_twiddles: + self.ptr_roots = self.gprs.alloc() + yield f"ldr x{self.ptr_roots}, roots_addr" + self.ptr_roots_twisted = self.gprs.alloc() + yield f"ldr x{self.ptr_roots_twisted}, roots_twisted_addr" + else: + self.ptr_roots_merged = self.gprs.alloc() + yield f"ldr x{self.ptr_roots_merged}, roots_merged_addr" + + self.roots = None + + def get_data(self,index): + if not index in self.data.keys(): + raise Exception(f"Data at index {index} hasn't been loaded") + return self.data[index] + + def 
load_data(self,index,reg=None): + if index in self.data.keys(): + # Data has already been loaded + return iter([]) + + self.data[index] = self.vregs.alloc(reg,constraint=self.zreg_lane_hi,lax=True) + yield f"ldr q{self.data[index]}, [x{self._src}, #{self.element_size*index}]" + + def release_data(self,index): + if index not in self.data.keys(): + raise Exception(f"Data at index {index} hasn't been loaded") + + self.vregs.free(self.data[index]) + del self.data[index] + + def store_data(self,index,release=True): + if index not in self.data.keys(): + raise Exception(f"Data at index {index} hasn't been loaded") + + yield f"str q{self.data[index]}, [x{self._src}, #{self.element_size*index}]" + + if release: + self.release_data(index) + + def ct_butterfly_single(self, butterfly, i, j, root_index): + + root = butterfly.root(root_index) + root_lane = butterfly.root_lane(root_index) + root_twisted = butterfly.root_twisted(root_index) + root_twisted_lane = butterfly.root_twisted_lane(root_index) + + if root == None: + raise Exception(f"Invalid root, index {root_index}") + if root_twisted == None: + raise Exception(f"Invalid twisted root, index {root_index}") + + modulus = self.modulus_vector + + suf = self.vector_suffix + + # A lane value of None means that we don't want lane-indexing + if root_lane != None: + root_name = f"{root}.{suf}[{root_lane}]" + else: + root_name = f"{root}.{suf}" + + # A lane value of None means that we don't want lane-indexing + if root_twisted_lane != None: + root_twisted_name = f"{root_twisted}.{suf}[{root_twisted_lane}]" + else: + root_twisted_name = f"{root_twisted}.{suf}" + + tmp = self.vregs.alloc(constraint=self.zreg_lane_hi,lax=True) + yield f"sqrdmulh z{tmp}.{suf}, " \ + f"z{self.get_data(butterfly[j])}.{suf}, " \ + f"z{root_twisted_name}" + + yield f"mul z{self.get_data(butterfly[j])}.{suf}, "\ + f"z{self.get_data(butterfly[j])}.{suf}," \ + f"z{root_name}" + + self.vregs.free(tmp) + yield f"mla z{self.get_data(butterfly[j])}.{suf}, 
{self.ptrue}/M, z{tmp}.{suf}, "\ + f"z{modulus}.{suf}" + + tmp = self.vregs.alloc(constraint=self.zreg_lane_hi,lax=True) + a = self.get_data(butterfly[i]) + b = self.get_data(butterfly[j]) + + self.data[butterfly[j]] = tmp + yield f"sub z{tmp}.{suf}, z{a}.{suf}, z{b}.{suf}" + + # Make sure i is still allocated + assert a == self.get_data(butterfly[i]) + + self.vregs.free(b) + yield f"add z{a}.{suf}, z{a}.{suf}, z{b}.{suf}" + + + def copy_root_scalars(self,dst,src): + dst.root_vecs = src.root_vecs + dst.root_twisted_vecs = src.root_twisted_vecs + dst.root = src.root + dst.root_lane = src.root_lane + dst.root_twisted = src.root_twisted + dst.root_twisted_lane = src.root_twisted_lane + + def load_input(self,butterfly, first=False): + if butterfly == None: + return iter([]) + + if butterfly.load_idx >= pow(2,butterfly.merged): + raise Exception("Too many loads") + + if not first: + load_order = butterfly.load_order + else: + load_order = butterfly.load_order_first + + yield from self.load_data(butterfly[load_order[butterfly.load_idx]]) + butterfly.load_idx += 1 + + def store_input(self,butterfly,last=False): + if butterfly == None: + return iter([]) + + if butterfly.store_idx >= pow(2,butterfly.merged): + raise Exception("Too many late stores") + + if not last: + store_order = butterfly.store_order + else: + store_order = butterfly.store_order_last + if butterfly.store_idx >= len(store_order): + return iter([]) + + yield from self.store_data(butterfly[store_order[butterfly.store_idx]]) + butterfly.store_idx += 1 + + def transpose4(self,idx): + + # Need four temporaries for the transposition + t = [ None for _ in range(0,4) ] + + t[0] = self.vregs.alloc(constraint=self.zreg_lane_hi,lax=True) + yield f"trn1 z{t[0]}.S, z{idx(0)}.S, z{idx(1)}.S" + t[1] = self.vregs.alloc(constraint=self.zreg_lane_hi,lax=True) + yield f"trn2 z{t[1]}.S, z{idx(0)}.S, z{idx(1)}.S" + t[2] = self.vregs.alloc(constraint=self.zreg_lane_hi,lax=True) + yield f"trn1 z{t[2]}.S, z{idx(2)}.S, 
z{idx(3)}.S" + t[3] = self.vregs.alloc(constraint=self.zreg_lane_hi,lax=True) + yield f"trn2 z{t[3]}.S, z{idx(2)}.S, z{idx(3)}.S" + + yield f"trn2 z{idx(2)}.d, z{t[0]}.d, z{t[2]}.d" + yield f"trn2 z{idx(3)}.d, z{t[1]}.d, z{t[3]}.d" + + # Do this here and not after the yield + self.vregs.free(t[0]) + self.vregs.free(t[2]) + yield f"trn1 z{idx(0)}.d, z{t[0]}.d, z{t[2]}.d" + + # Do this here and not after the yield + self.vregs.free(t[1]) + self.vregs.free(t[3]) + yield f"trn1 z{idx(1)}.d, z{t[1]}.d, z{t[3]}.d" + + def zreg_lane_lo(self,r): + if self.bitwidth == 32: + return (r in range(0,8)) + else: + return (r in range(0,16)) + + def zreg_lane_hi(self,r): + return not self.zreg_lane_lo(r) + + def load_root_scalars(self,butterfly): + + if butterfly == None or butterfly.load_roots == False: + return iter([]) + + def gen(): + + root_vec_storage = self.vector_storage_per_block_at_layer[butterfly.layer] + root_storage_byte = self.vector_bytelen * root_vec_storage + + r = [ None for _ in range(0, root_vec_storage) ] + rt = [ None for _ in range(0, root_vec_storage) ] + + butterfly.root_vecs = r + butterfly.root_twisted_vecs = rt + + order = butterfly.root_load_order + + if self.increment_root_ptr: + assert(self.interleave_twiddles == False) + if self.multi_access_strategy == 0: + for i in range(0,root_vec_storage): + r[order[i]] = self.vregs.alloc(constraint=self.zreg_lane_lo) + yield f"ldr q{r[order[i]]}, [x{self.ptr_roots}], #+{self.vector_bytelen}" + rt[order[i]] = self.vregs.alloc(constraint=self.zreg_lane_lo) + yield f"ldr q{rt[order[i]]}, [x{self.ptr_roots_twisted}], #+{self.vector_bytelen}" + elif self.multi_access_strategy == 1: + for i in range(0,root_vec_storage,2): + rt[order[i]] = self.vregs.alloc(constraint=self.zreg_lane_lo) + rt[i+1] = self.vregs.alloc(constraint=self.zreg_lane_lo) + yield f"ldp q{rt[order[i]]}, q{rt[i+1]}, [x{self.ptr_roots_twisted}], #+{2*self.vector_bytelen}" + r[order[i]] = self.vregs.alloc(constraint=self.zreg_lane_lo) + r[order[i+1]] 
= self.vregs.alloc(constraint=self.zreg_lane_lo) + yield f"ldp q{r[order[i]]}, q{r[i+1]}, [x{self.ptr_roots}], #+{2*self.vector_bytelen}" + else: + + offset_base = self.root_offset_for_layer[butterfly.layer] + offset_base += root_storage_byte * butterfly.block + + for i in range(0,root_vec_storage): + + offset = offset_base + order[i] * self.vector_bytelen + if not self.interleave_twiddles: + r[order[i]] = self.vregs.alloc(constraint=self.zreg_lane_lo) + yield f"ldr q{r[order[i]]}, [x{self.ptr_roots}, #+{offset}]" + rt[order[i]] = self.vregs.alloc(constraint=self.zreg_lane_lo) + yield f"ldr q{rt[order[i]]}, [x{self.ptr_roots_twisted}, #+{offset}]" + else: + r[order[i]] = self.vregs.alloc(constraint=self.zreg_lane_lo) + yield f"ldr q{r[order[i]]}, [x{self.ptr_roots_merged}, #+{2*offset+0}]" + rt[order[i]] = self.vregs.alloc(constraint=self.zreg_lane_lo) + yield f"ldr q{rt[order[i]]}, [x{self.ptr_roots_merged}, #+{2*offset+self.vector_bytelen}]" + + if butterfly.scalar_load == None: + butterfly.scalar_load = gen() + + return butterfly.scalar_load + + def get_transpose(self,butterfly): + + def butterfly_accessor(idx): + return self.get_data(butterfly[idx]) + + if butterfly == None: + return iter([]) + + if butterfly.transpose == None: + butterfly.transpose = self.transpose4(butterfly_accessor) + + while True: + n = next(butterfly.transpose,None) + if n == None: + break + else: + yield n + + def progress_arithmetic(self,butterfly,idx): + if butterfly == None: + return iter([]) + + yield next(butterfly.gs[idx]) + + def get_schedule_triple_no_transpose(self, idx): + + if self.bitwidth == 32: + default = { "load_order": [7,6,4,5,3,2,0,1], + "store_order": [0,1,2,3,6,7,4,5], + "numbering": list(zip([3,2,0,1, 1,0,5,4, 0,2,6,4], + [7,6,4,5, 3,2,7,6, 1,3,7,5], + [0,0,0,0, 1,1,2,2, 3,4,6,5])), + "twiddles": { 0: (0,0), + 1: (0,1), + 2: (0,2), + 3: (1,0), + 4: (1,1), + 5: (1,2), + 6: (1,3) }, + "root_load_order": list(range(0,10)), + "schedule": None, + } + elif self.bitwidth 
== 64: + default = { "load_order": [7,6,4,5,3,2,0,1], + "store_order": [0,1,2,3,6,7,4,5], + "numbering": list(zip([3,2,0,1, 1,0,5,4, 0,2,6,4], + [7,6,4,5, 3,2,7,6, 1,3,7,5], + [0,0,0,0, 1,1,2,2, 3,4,6,5])), + "twiddles": { 0: (0,0), + 1: (1,0), + 2: (1,1), + 3: (2,0), + 4: (2,1), + 5: (3,0), + 6: (3,1) }, + "root_load_order": list(range(0,10)), + "schedule": None, + } + + modifications = { + + # INDEX 0 + # Trivial implementation, no interleaving whatsoever + 0: { "schedule": + ["m", "l", "l", "l", "l", "l", "l", "l", "l", + 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, "lre", + 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, + 7, 7, 7, 7, 7, + 8, 8, 8, 8, 8, "s", "s", + 9, 9, 9, 9, 9, "s", "s", + 10, 10, 10, 10, 10, "s", "s", + 11, 11, 11, 11, 11, "s", "s", "fr" ] }, + + # INDEX 1 + # - No early loads, few late stores + # - suitable for two inputs + # - Lots of spacing between all GS components + 1: { "schedule": + ["m", "l", "l", 0, -2, -1, 0, -2, "sl", "sl", + "l", "l", 1, -1, 0, 1, -1, "sl", "sl", "frl", + "l", "l", 2, 0, 1, 2, 0, + "l", "l", 3, 1, 2, 3, 1, + 4, 2, 3, 4, 2, + 5, 3, 4, 5, 3, + 6, 4, 5, 6, 4, "lre", + 7, 5, 6, 7, 5, + 8, 6, 7, 8, 6, + 9, 7, 8, 9, 7, + 10, 8, 9, 10, 8, "s", "s", + 11, 9, 10, 11, 9, "s", "s" ] }, + + # INDEX 2 + # - Late-store, but only few early loads + # - GS blocks (mul,mul),(mul),(add,sub) + 2: { "schedule": + ["m", 0, 0, "l", -1, -2, -2, + 1, "sl", 1, "l", 0, -1, -1, "frl", + 2, "sl", 2, "l", 1, 0, 0, + 3, "sl", 3, "l", 2, 1, 1, + 4, "sl", 4, "l", 3, 2, 2, + 5, "sl", 5, "l", 4, 3, 3, + 6, "sl", 6, 5, 4, 4, + 7, "sl", 7, 6, 5, 5, "lres", + 8, "sl", 8, 7, 6, 6, "lres", + 9, 9, 8, 7, 7, "lre", + 10, 10, "le", 9, 8, 8, + 11, 11, "le", 10, 9, 9 ] }, + + # INDEX 3 + # - Extensive pre-loading and late-storing + # - GS blocks (mul,mul),(mul),(add,sub) + 3 : { "schedule": + ["m", 0, 0, -1, -2, -2, + 1, 1, 0, -1, -1, "frl", + 2, 2, "le", 1, 0, 0, + 3, 3, "le", 2, 1, 1, + 4, "sl", 4, "le", 3, 2, 2, + 5, "sl", 5, 
"le", 4, 3, 3, + 6, "sl", 6, "le", 5, 4, 4, "lre", + 7, "sl", 7, "le", 6, 5, 5, + 8, "sl", 8, "le", 7, 6, 6, + 9, "sl", 9, "le", 8, 7, 7, + 10, "sl", 10, 9, 8, 8, + 11, "sl", 11, 10, 9, 9 ] }, + + # INDEX 4 + # - Extensive pre-loading and late-storing + # - GS blocks (mul), (mul) (mul),(add,sub) + 4 : { "schedule": + ["m", 0, -1, 0, -2, -2, + 1, 0, 1, -1, -1, "frl", + 2, 1, "le", 2, 0, 0, + 3, 2, "le", 3, 1, 1, + 4, "sl", 3, "le", 4, 2, 2, + 5, "sl", 4, "le", 5, 3, 3, + 6, "sl", 5, "le", 6, 4, 4, "lre", + 7, "sl", 6, "le", 7, 5, 5, + 8, "sl", 7, "le", 8, 6, 6, + 9, "sl", 8, "le", 9, 7, 7, + 10, "sl", 9, 10, 8, 8, + 11, "sl", 10, 11, 9, 9 ] }, + + # INDEX 5 + # - Extensive pre-loading and late-storing + # - GS blocks (mul),(mul,mul),(add,sub) + 5 : { "schedule": + ["m", 0, -1, -1, -2, -2, + 1, 0, 0, -1, -1, "frl", + 2, "le", 1, 1, 0, 0, + 3, "le", 2, 2, 1, 1, + 4, "sl", "le", 3, 3, 2, 2, + 5, "sl", "le", 4, 4, 3, 3, + 6, "sl", "le", 5, 5, 4, 4, "lre", + 7, "sl", "le", 6, 6, 5, 5, + 8, "sl", "le", 7, 7, 6, 6, + 9, "sl", "le", 8, 8, 7, 7, + 10, "sl", 9, 9, 8, 8, + 11, "sl", 10, 10, 9, 9 ] }, + + # INDEX 6 + # - No early loads, few late stores + # - suitable for two inputs + # - GS blocks (mul,mul),(mul),(add), (sub) + 6 : { "schedule": + ["m", "l", "l", 0, 0, -2, -1, -2, "sl", "sl", + "l", "l", 1, 1, -1, 0, -1, "sl", "sl", "frl", + "l", "l", 2, 2, 0, 1, 0, + "l", "l", 3, 3, 1, 2, 1, + 4, 4, 2, 3, 2, + 5, 5, 3, 4, 3, + 6, 6, 4, 5, 4, "lre", + 7, 7, 5, 6, 5, + 8, 8, 6, 7, 6, + 9, 9, 7, 8, 7, + 10, 10, 8, 9, 8, "s", "s", + 11, 11, 9, 10, 9, "s", "s" ] }, + + # INDEX 7 + # - No early loads, few late stores + # - suitable for two inputs + # - GS blocks (mul,mul),(mul),(add), (sub) + # - scattered stores + 7 : { "schedule": + ["m", "l", "l", 0, 0, "sl", -2, -1, -2, "sl", + "l", "l", 1, "sl", 1, -1, 0, -1, "sl", "frl", + "l", "l", 2, "sl", 2, 0, 1, 0, + "l", "l", 3, 3, 1, 2, 1, + 4, 4, 2, 3, 2, + 5, 5, 3, 4, 3, + 6, 6, 4, 5, 4, "lre", + 7, 7, 5, 6, 5, + 8, 8, 6, 7, 6, + 9, 
9, 7, 8, 7, + 10, 10, 8, 9, 8, "s", + 11, 11, "s", 9, 10, 9, "s" ] }, + + # INDEX 8 + # - No early loads, few late stores + # - suitable for two inputs + # - GS blocks (mul,mul),(mul),(add), (sub) + # - stores after muls only, trying to avoid them going + # multiply-capable SIMD units + 8 : { "schedule": + ["m", "l", "l", 0, "sl", 0, "sl", -2, -1, -2, + "l", "l", 1, "sl", 1, "sl", -1, 0, -1, "frl", + "l", "l", 2, "sl", 2, "sl", 0, 1, 0, + "l", "l", 3, 3, 1, 2, 1, + 4, 4, 2, 3, 2, + 5, 5, 3, 4, 3, + 6, 6, 4, 5, 4, "lre", + 7, 7, 5, 6, 5, + 8, 8, 6, 7, 6, + 9, 9, 7, 8, 7, + 10, 10, 8, 9, 8, + 11, "s", 11, "s", 9, 10, 9 ] }, + + # INDEX 9 + # - No early loads, few late stores + # - suitable for two inputs + # - GS blocks (mul,mul),(mul),(add), (sub) + # - stores after muls only, trying to avoid them going + # multiply-capable SIMD units + 9 : { "schedule": + ["m", "l", "l", 0, "sl", 0, -2, -1, "sl", -2, + "l", "l", 1, "sl", 1, -1, 0, "sl", -1, "frl", + "l", "l", 2, "sl", 2, 0, 1, "sl", 0, + "l", "l", 3, 3, 1, 2, 1, + 4, 4, 2, 3, 2, + 5, 5, 3, 4, 3, + 6, 6, 4, 5, 4, "lre", + 7, 7, 5, 6, 5, + 8, 8, 6, 7, 6, + 9, 9, 7, 8, 7, + 10, 10, 8, 9, 8, + 11, "s", 11, 9, 10, "s", 9 ] }, + + # INDEX 10 + # - No early loads, few late stores + # - suitable for two inputs + # - GS blocks (mul,mul),(mul),(add), (sub) + # - stores after muls only, trying to avoid them going + # multiply-capable SIMD units + 10 : { "schedule": + ["m", "l", "l", 0, "sl", 0, "sl", -1, -2, -2, + "l", "l", 1, "sl", 1, "sl", 0, -1, -1, "frl", + "l", "l", 2, "sl", 2, "sl", 1, 0, 0, + "l", "l", 3, 3, 2, 1, 1, + 4, 4, 3, 2, 2, + 5, 5, 4, 3, 3, + 6, 6, 5, 4, 4, "lre", + 7, 7, 6, 5, 5, + 8, 8, 7, 6, 6, + 9, 9, 8, 7, 7, + 10, 10, 9, 8, 8, + 11, "s", 11, "s", 10, 9, 9 ] }, + + # INDEX 11 + # - No early loads, few late stores + # - suitable for two inputs + # - GS blocks (mul,mul),(mul),(add), (sub) + # - stores after muls only, trying to avoid them going + # multiply-capable SIMD units + 11 : { "schedule": + ["m", 
0, "sl", 0, "sl", -1, "l", -2, "l", -2, + 1, "sl", 1, "sl", 0, "l", -1, "l", -1, "frl", + 2, "sl", 2, "sl", 1, "l", 0, "l", 0, + 3, 3, 1, 2, 1, + 4, 4, 2, 3, 2, + 5, 5, 3, 4, 3, + 6, 6, 4, 5, 4, "lre", + 7, 7, 5, 6, 5, + 8, 8, 6, 7, 6, + 9, 9, 7, 8, 7, + 10, 10, 8, 9, 8, + 11, "s", 11, "s", 10, 9, "le", "le", 9 ] }, + + } + + modification = modifications[idx] + + for k,v in modification.items(): + if not k in default.keys(): + raise Exception(f"Invalid modification: {k}") + + dic = { **default, **modification } + + dic["load_order_first"] = dic.get("load_order_first", dic["load_order"]) + dic["store_order_last"] = dic.get("store_order_last", dic["store_order"]) + + return dic + + def get_schedule_double_no_transpose_zipped(self, idx): + + load_order_default = [2,3,0,1] + store_order_default = [0,1,2,3] + butterfly_numbering_default = list(zip([0,1,0,2], + [2,3,1,3], + [0,0,1,2])) + twiddle_numbering_default = { 0: (0,0), + 1: (0,1), + 2: (0,2) } + root_load_order_default = list(range(0,10)) # Identity + + schedules = [ + + # INDEX 0 + # Trivial implementation, no interleaving whatsoever + [ None, None, None, None, + (0, "m"), (0, "l"), (0, "l"), (0, "l"), (0, "l"), + (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), + (0, 1), (0, 1), (0, 1), (0, 1), (0, 1), (0,"lre"), + (0, 2), (0, 2), (0, 2), (0, 2), (0, 2), + (0, 3), (0, 3), (0, 3), (0, 3), (0, 3), + (0,"s"), (0,"s"), (0,"s"), (0,"s"), (0,"frl"), + (1, "m"), (1, "l"), (1, "l"), (1, "l"), (1, "l"), + (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), + (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1,"lre"), + (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), + (1, 3), (1, 3), (1, 3), (1, 3), (1, 3), + (1,"s"), (1,"s"), (1,"s"), (1,"s"), (1,"frl") ] + ] + + load_order, store_order, numbering, twiddles, root_order, schedule = schedules[idx] + + if load_order == None: + load_order = load_order_default + if store_order == None: + store_order = store_order_default + if numbering == None: + numbering = butterfly_numbering_default + if twiddles == None: + 
twiddles = twiddle_numbering_default + if root_order == None: + root_order = root_load_order_default + + return load_order, store_order, numbering, twiddles, root_order, schedule + + def get_schedule_double_no_transpose_quad_zipped(self, idx): + + default = { "load_order": [2,3,0,1], + "store_order": [0,1,2,3], + "numbering": list(zip([0,1,0,2], + [2,3,1,3], + [0,0,1,2])), + "twiddles": { 0: (0,0), + 1: (0,1), + 2: (0,2) }, + "root_load_order": list(range(0,10)), + "schedule": None } + + modifications = { + + # INDEX 0 + # Trivial implementation, no interleaving whatsoever + 0 : { "schedule": + [ (0, "m"), (0, "l"), (0, "l"), (0, "l"), (0, "l"), + (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), + (0, 1), (0, 1), (0, 1), (0, 1), (0, 1), (0,"lre"), + (0, 2), (0, 2), (0, 2), (0, 2), (0, 2), + (0, 3), (0, 3), (0, 3), (0, 3), (0, 3), + (0,"s"), (0,"s"), (0,"s"), (0,"s"), (0,"frl"), + + (1, "m"), (1, "l"), (1, "l"), (1, "l"), (1, "l"), + (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), + (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1,"lre"), + (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), + (1, 3), (1, 3), (1, 3), (1, 3), (1, 3), + (1,"s"), (1,"s"), (1,"s"), (1,"s"), (1,"frl"), + + (2, "m"), (2, "l"), (2, "l"), (2, "l"), (2, "l"), + (2, 0), (2, 0), (2, 0), (2, 0), (2, 0), + (2, 1), (2, 1), (2, 1), (2, 1), (2, 1), (2,"lre"), + (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), + (2, 3), (2, 3), (2, 3), (2, 3), (2, 3), + (2,"s"), (2,"s"), (2,"s"), (2,"s"), (2,"frl"), + + (3, "m"), (3, "l"), (3, "l"), (3, "l"), (3, "l"), + (3, 0), (3, 0), (3, 0), (3, 0), (3, 0), + (3, 1), (3, 1), (3, 1), (3, 1), (3, 1), (3,"lre"), + (3, 2), (3, 2), (3, 2), (3, 2), (3, 2), + (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), + (3,"s"), (3,"s"), (3,"s"), (3,"s"), (3,"frl") ] }, + + # INDEX 1 + # Interleaved arithmetic + 1 : { "schedule": [ + (0,"m"), (0,"l"), (0,"l"), (0,"l"), (0,"l"), + (1,"m"), (1,"l"), (1,"l"), (1,"l"), (1,"l"), + (2,"m"), (2,"l"), (2,"l"), (2,"l"), (2,"l"), + (3,"m"), (3,"l"), (3,"l"), (3,"l"), (3,"l"), + + (0,"lrs"), 
(0,"lrs"), + (1,"lrs"), (1,"lrs"), + (2,"lrs"), (2,"lrs"), + (3,"lrs"), (3,"lrs"), + + (0,0), (0,0), + (0,1), (0,1), (0,0), + (1,0), (1,0), (0,1), (0,0), (0,0), + (1,1), (1,1), (1,0), (0,1), (0,1), + (0,2), (0,2), (1,1), (1,0), (1,0), + (0,3), (0,3), (0,2), (1,1), (1,1), + (1,2), (1,2), (0,3), (0,2), (0,2), (0, "frs"), (0, "frs"), + (1,3), (1,3), (1,2), (0,3), (0,3), + (2,0), (2,0), (1,3), (1,2), (1,2), (1, "frs"), (1, "frs"), + (2,1), (2,1), (2,0), (1,3), (1,3), + (3,0), (3,0), (2,1), (2,0), (2,0), + (3,1), (3,1), (3,0), (2,1), (2,1), + (2,2), (2,2), (3,1), (3,0), (3,0), + (2,3), (2,3), (2,2), (3,1), (3,1), + (3,2), (3,2), (2,3), (2,2), (2,2), (2, "frs"), (2, "frs"), + (3,3), (3,3), (3,2), (2,3), (2,3), + (3,3), (3,2), (3,2), (3, "frs"), (3, "frs"), + (3,3), (3,3), + + (0, "s"), (0, "s"), (0, "s"), (0, "s"), + (1, "s"), (1, "s"), (1, "s"), (1, "s"), + (2, "s"), (2, "s"), (2, "s"), (2, "s"), + (3, "s"), (3, "s"), (3, "s"), (3, "s") ] }, + + # INDEX 2 + # Butterfly-wise interleaving + 2 : { "schedule": [ + (0, "l"), (0, "l"), (0, "l"), + (1, "l"), (1, "l"), (1, "l"), + (2, "l"), (2, "l"), (2, "l"), + (3, "l"), (3, "l"), (3, "l"), + (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, "l"), + (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, "l"), + (2, 0), (2, 0), (2, 0), (2, 0), (2, 0), (2, "l"), + (3, 0), (3, 0), (3, 0), (3, 0), (3, 0), (3, "l"), + (0, 1), (0, 1), (0, 1), (0, 1), (0, 1), + (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), + (2, 1), (2, 1), (2, 1), (2, 1), (2, 1), + (3, 1), (3, 1), (3, 1), (3, 1), (3, 1), + (0, 2), (0, 2), (0, 2), (0, 2), (0, 2), + (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), (0, "s"), (0, "s"), + (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (1, "s"), (1, "s"), + (3, 2), (3, 2), (3, 2), (3, 2), (3, 2), (2, "s"), (2, "s"), + (0, 3), (0, 3), (0, 3), (0, 3), (0, 3), (3, "s"), (3, "s"), + (0,"fr"), (0,"lre"), + (1, 3), (1, 3), (1, 3), (1, 3), (1, 3), (1,"fr"), (1,"lre"), + (2, 3), (2, 3), (2, 3), (2, 3), (2, 3), (2,"fr"), (2,"lre"), + (3, 3), (3, 3), (3, 3), (3, 3), (3, 
3), (3,"fr"), (3,"lre"), + (0,"s"), (0,"s"), + (1,"s"), (1,"s"), + (2,"s"), (2,"s"), + (3,"s"), (3,"s"), + ] }, + + # INDEX 3 + # Butterfly-wise interleaving + 3: { "schedule": [ + (0, "l"), (0, "l"), (0, "l"), + (1, "l"), (1, "l"), (1, "l"), + (2, "l"), (2, "l"), (2, "l"), + (3, "l"), (3, "l"), (3, "l"), + + (0, 0), (1, 0), (2, 0), (3, 0), (0, "sl"), + (0, 0), (1, 0), (2, 0), (3, 0), (1, "sl"), + (0, 0), (1, 0), (2, 0), (3, 0), (2, "sl"), + (0, 0), (1, 0), (2, 0), (3, 0), (3, "sl"), + (0, 0), (1, 0), (2, 0), (3, 0), + + (0, "l"), + (1, "l"), + (2, "l"), + (3, "l"), + + (0, 1), (1, 1), (2, 1), (3, 1), + (0, 1), (1, 1), (2, 1), (3, 1), + (0, 1), (1, 1), (2, 1), (3, 1), + (0, 1), (1, 1), (2, 1), (3, 1), + (0, 1), (1, 1), (2, 1), (3, 1), + + (0, 2), (1, 2), (2, 2), (3, 2), + (0, 2), (1, 2), (2, 2), (3, 2), + (0, 2), (1, 2), (2, 2), (3, 2), + (0, 2), (1, 2), (2, 2), (3, 2), + (0, 2), (1, 2), (2, 2), (3, 2), + + (0, 3), (1, 3), (2, 3), (3, 3), (0,"s"), (0,"s"), + (0, 3), (1, 3), (2, 3), (3, 3), (1,"s"), (1,"s"), + (0, "fr"), + (1, "fr"), + (2, "fr"), + (3, "fr"), + (0,"lre"), + (1,"lre"), + (0, 3), (1, 3), (2, 3), (3, 3), (2,"s"), (2,"s"), + (2,"lre"), + (0, 3), (1, 3), (2, 3), (3, 3), (3,"s"), (3,"s"), + (3,"lre"), + (0, 3), (1, 3), (2, 3), (3, 3), + + (0, "s"), + (1, "s"), + (2, "s"), + (3, "s"), + ] }, + + # INDEX 4 + # Careful scheduling of arithmetic instructions, + # tailored to microarchitectures like Cortex-X1: + # - 4 SIMD units + # - 2 of them multiply capable + # - Multiply latency 4, but 1-cycle fwd for MUL-MLA + # Note the asymmetry between 0-3 and 4-7, leveraging + # the fast fwd from MUL to MLA. 
+ 4 : { "store_order": [1,0,3,2], + "schedule": + [ + + (0,"lrs"), (0,"lrs"), + (0,"l"), + (0,0), (2,-2), (2,"sl"), + (0,0), (2,-2), (2,"sl"), + (0,"l"), + (0,1), (2,-1), + (0,1), (2,-1), (2,"sl"), (2,"sl"), + (1,"lrs"), (1,"lrs"), + (1,"l"), + (1,0), (3,-2), + (1,0), (3,-2), (3,"sl"), (3,"sl"), + (1,"l"), + (1,1), (3,-1), + (1,1), (3,-1), (3,"sl"), (3,"sl"), + (2,"l"), + (2,"lrs"), (2,"lrs"), + (0,0), + (2,0), + (2,"l"), + (0,1), + (2,1), + (3,"l"), + (3,"lrs"), (3,"lrs"), + (1,0), + (3,0), + (3,"l"), + (1,1), + (3,1), + + (2,"l"), (0,"l"), + (2,0), (0,0), + (2,1), (0,0), + (2,"l"), (0,"l"), + (2,0), (0,1), + (2,1), (0,1), + (3,"l"), (1,"l"), + (3,0), (1,0), + (3,1), (1,0), + (3,"l"), (1,"l"), + (3,0), (1,1), + (3,1), (1,1), + + (0,2), + (0,2), + (0,3), (2,0), + (0,3), (2,0), + (0,"frs"), + (0,"frs"), + (1,2), (2,1), + (1,2), (2,1), + (1,3), (3,0), + (1,3), (3,0), + (1,"frs"), + (1,"frs"), + (0,2), (3,1), + (2,2), (3,1), + (0,3), + (2,3), + (1,2), + (3,2), + (1,3), + (3,3), + + (2,2), (0,2), + (2,3), (0,2), (0,"s"), (0,"s"), + (2,2), (0,3), + (2,3), (0,3), (0,"s"), (0,"s"), + (2,"frs"), + (2,"frs"), + (3,2), (1,2), + (3,3), (1,2), (1,"s"), (1,"s"), + (3,2), (1,3), + (3,3), (1,3), (1,"s"), (1,"s"), + (3,"frs"), + (3,"frs"), + + ] }, + + # INDEX 5 + # Variation of 4 which tries to have blocks + # 2 multiply + 1 add/sub + 1 str + # This combination can in principle keep all SIMD units busy + 5 : { "store_order": [1,0,3,2], + "schedule": [ + + (0,0), (2,-2), + (0,0), (2,"sl"), + (0,"l"), + (0,1), (2,-2), + (0,1), (2,"sl"), + (1,"lrs"), + (1,"lrs"), + (1,"l"), + (1,0), (2,-1), + (1,0), (2,"sl"), + (1,"l"), + (1,1), (2,-1), + (1,1), (2,"sl"), + (2,"lrs"), + (2,"lrs"), + (0,0), (3,-2), + (2,0), (3,"sl"), + (2,"l"), + (0,1), (3,-2), + (2,1), (3,"sl"), + (3,"lrs"), + (3,"lrs"), + (1,0), (3,-1), + (3,0), (3,"sl"), + (3,"l"), + (1,1), (3,-1), + (3,1), (3,"sl"), + + (2,"l"), (0,"l"), + (2,0), (0,0), + (2,1), (0,0), + (2,"l"), (0,"l"), + (2,0), (0,1), + (2,1), (0,1), + 
(3,"l"), (1,"l"), + (3,0), (1,0), + (3,1), (1,0), + (3,"l"), (1,"l"), + (3,0), + (3,1), (1,1), + + (0,2), + (0,2), (1,1), + (0,3), (2,0), + (0,3), (2,0), + (0,"frs"), + (0,"frs"), + (1,2), (2,1), + (1,2), (2,1), + (1,3), (3,0), + (1,3), (3,0), + (1,"frs"), + (1,"frs"), + (0,2), (3,1), (3,"le"), + (2,2), (3,1), + (0,3), (2,"le"), + (2,3), + (1,2), (0,"le"), + (3,2), + (1,3), (0,"lres"), + (3,3), (0,"lres"), + + (2,2), (0,2), (0,"s"), + (2,3), (0,2), (0,"s"), + (2,2), (0,3), (0,"s"), + (2,3), (0,3), (0,"s"), + (2,"frs"), + (2,"frs"), + (3,2), (1,2), (1,"s"), + (3,3), (1,2), (1,"s"), + (3,2), (1,3), (1,"s"), + (3,3), (1,3), (1,"s"), + (3,"frs"), + (3,"frs"), + ] }, + + # INDEX 6 + # Variation of 5 + 6 : { "store_order": [1,0,3,2], + "schedule": [ + + (0,0), (2,-2), + (0,0), (2,"sl"), + (0,"l"), + (0,1), (2,-2), + (0,1), (2,"sl"), + (1,"lrs"), + (1,"lrs"), + (1,"l"), + (1,0), (2,-1), + (1,0), (2,"sl"), + (1,"l"), + (1,1), (2,-1), + (1,1), (2,"sl"), + (2,"lrs"), + (2,"lrs"), + (0,0), (3,-2), + (2,0), (3,"sl"), + (2,"l"), + (0,1), (3,-2), + (2,1), (3,"sl"), + (3,"lrs"), + (3,"lrs"), + (1,0), (3,-1), + (3,0), (3,"sl"), + (3,"l"), + (1,1), (3,-1), + (3,1), (3,"sl"), (0,"l"), + + (2,"l"), + (2,0), (0,0), (0,"l"), + (2,1), (0,0), + (2,"l"), + (2,0), (0,1), (1,"l"), + (2,1), (0,1), + (3,"l"), + (3,0), (1,0), (1,"l"), + (3,1), (1,0), + (3,"l"), + (3,0), (1,1), + (3,1), (1,1), + + (0,2), + (0,2), + (0,3), (2,0), + (0,3), (2,0), + (0,"frs"), + (0,"frs"), + (1,2), (2,1), + (1,2), (2,1), + (1,3), (3,0), + (1,3), (3,0), + (1,"frs"), + (1,"frs"), + (0,2), (3,1), (3,"le"), + (2,2), (3,1), + (0,3), (2,"le"), + (2,3), (0,2), # This isn't ready yet, but at least + # we keep the balance of mul/add/str + # instructions and don't have a + # bottleneck at the end + (1,2), (0,"le"), + (3,2), (0,2), (0,"s"), + (1,3), (0,"lres"), (0,"lres"), + (3,3), (0,3), (0,"s"), + + (2,2), (0,3), + (2,3), (0,"s"), + (2,2), (1,2), + (2,3), (0,"s"), + (2,"frs"), + (2,"frs"), + (3,2), (1,"s"), + (3,3), (1,2), 
(1,"s"), + (3,2), (1,3), (1,"s"), + (3,3), (1,3), (1,"s"), + (3,"frs"), + (3,"frs"), + + ] }, + + # INDEX 7 + # Variation of 6 + 7 : { "store_order": [1,0,3,2], + "schedule": [ + + (0,0), (2,-2), + (0,0), (2,"sl"), + (0,"l"), + (0,1), (2,-2), + (0,1), (2,"sl"), + (1,"lrs"), + (1,"lrs"), + (1,"l"), + (1,0), (2,-1), + (1,0), (2,"sl"), + (1,"l"), + (1,1), (2,-1), + (1,1), (2,"sl"), + (2,"lrs"), + (2,"lrs"), + (0,0), (3,-2), + (2,0), (3,"sl"), + (2,"l"), + (0,1), (3,-2), + (2,1), (3,"sl"), + (3,"lrs"), + (3,"lrs"), + (1,0), (3,-1), + (3,0), (3,"sl"), + (3,"l"), + (1,1), (3,-1), + (3,1), (3,"sl"), (0,"l"), + + (2,"l"), + (2,0), (0,0), (0,"l"), + (2,1), (0,0), + (2,"l"), + (2,0), (0,1), (1,"l"), + (2,1), (0,1), + (3,"l"), + (3,0), (1,0), (1,"l"), + (3,1), (1,0), + (3,"l"), + (3,0), + (3,1), (1,1), + + (0,2), (1,1), + (0,2), + (0,3), (2,0), + (0,3), (2,0), + (0,"frs"), + (0,"frs"), + (1,2), (2,1), + (1,2), (2,1), + (1,3), (3,0), + (1,3), (3,0), + (1,"frs"), + (1,"frs"), + (0,2), (3,1), (3,"le"), + (2,2), (3,1), + (0,3), (2,"le"), + (2,3), (0,2), # This isn't ready yet, but at least + # we keep the balance of mul/add/str + # instructions and don't have a + # bottleneck at the end + (1,2), (0,"le"), + (3,2), (0,2), (0,"s"), + (1,3), (0,"lres"), (0,"lres"), + (3,3), (0,3), (0,"s"), + + (2,2), (0,3), + (2,3), (0,"s"), + (2,2), (1,2), + (2,3), (0,"s"), + (2,"frs"), + (2,"frs"), + (3,2), (1,"s"), + (3,3), (1,2), (1,"s"), + (3,2), (1,3), (1,"s"), + (3,3), (1,3), (1,"s"), + (3,"frs"), + (3,"frs"), + + ] }, + + # INDEX 8 + # Variation of 7, experimentally removing consecutive loads + 8 : { "store_order": [1,0,3,2], + "schedule": + [ + + (0,0), (2,-2), + (0,0), (2,"sl"), + (0,"l"), + (0,1), (2,-2), (1,"lrs"), + (0,1), (2,"sl"), + (1,"lrs"), + (1,"l"), + (1,0), (2,-1), + (1,0), (2,"sl"), + (1,"l"), + (1,1), (2,-1), + (1,1), (2,"sl"), + (2,"lrs"), + (0,0), (3,-2), (2,"lrs"), + (2,0), (3,"sl"), + (2,"l"), + (0,1), (3,-2), + (2,1), (3,"sl"), + (3,"lrs"), + (1,0), (3,-1), (3,"lrs"), + 
(3,0), (3,"sl"), + (3,"l"), + (1,1), (3,-1), + (3,1), (3,"sl"), (0,"l"), + + (2,"l"), + (2,0), (0,0), (0,"l"), + (2,1), (0,0), + (2,"l"), + (2,0), (0,1), (1,"l"), + (2,1), (0,1), + (3,"l"), + (3,0), (1,0), (1,"l"), + (3,1), (1,0), + (3,"l"), + (3,0), + (3,1), (1,1), + + (0,2), (1,1), + (0,2), + (0,3), (2,0), + (0,3), (2,0), + (0,"frs"), + (0,"frs"), + (1,2), (2,1), + (1,2), (2,1), + (1,3), (3,0), + (1,3), (3,0), + (1,"frs"), + (1,"frs"), + (0,2), (3,1), (3,"le"), + (2,2), (3,1), + (0,3), (2,"le"), + (2,3), (0,2), # This isn't ready yet, but at least + # we keep the balance of mul/add/str + # instructions and don't have a + # bottleneck at the end + (1,2), (0,"le"), + (3,2), (0,2), (0,"s"), + (1,3), (0,"lres"), + (3,3), (0,3), (0,"s"), + + (2,2), (0,3), (0,"lres"), + (2,3), (0,"s"), + (2,2), (1,2), + (2,3), (0,"s"), + (2,"frs"), + (2,"frs"), + (3,2), (1,"s"), + (3,3), (1,2), (1,"s"), + (3,2), (1,3), (1,"s"), + (3,3), (1,3), (1,"s"), + (3,"frs"), + (3,"frs"), + + ] }, + + # INDEX 9 + 9 : { "store_order": [1,0,3,2], + "schedule": [ + + (0,0), (2,-2), + (0,0), (2,"sl"), + (0,"l"), + (0,1), (2,-2), + (0,1), (2,"sl"), + (1,"lrs"), + (1,"lrs"), + (1,"l"), + (1,0), (2,-1), + (1,0), (2,"sl"), + (1,"l"), + (1,1), (2,-1), + (1,1), (2,"sl"), + (2,"lrs"), + (2,"lrs"), + (0,0), (3,-2), + (2,0), (3,"sl"), + (2,"l"), + (0,1), (3,-2), + (2,1), (3,"sl"), + (3,"lrs"), + (3,"lrs"), + (1,0), (3,-1), + (3,0), (3,"sl"), + (3,"l"), + (1,1), (3,-1), + (3,1), (3,"sl"), (0,"l"), + + (2,"l"), + (2,0), (0,0), (0,"l"), + (2,1), (0,0), + (2,"l"), + (2,0), (0,1), (1,"l"), + (2,1), (0,1), + (3,"l"), + (3,0), (1,0), (1,"l"), + (3,1), (1,0), + (3,"l"), + (3,0), "nop", + (3,1), (1,1), + + (0,2), (1,1), + (0,2), "nop", + (0,3), (2,0), + (0,3), (2,0), + (0,"frs"), + (0,"frs"), + (1,2), (2,1), + (1,2), (2,1), + (1,3), (3,0), + (1,3), (3,0), + (1,"frs"), + (1,"frs"), + (0,2), (3,1), (3,"le"), + (2,2), (3,1), + (0,3), (2,"le"), + (2,3), (0,2), # This isn't ready yet, but at least + # we keep the balance 
of mul/add/str + # instructions and don't have a + # bottleneck at the end + (1,2), (0,"le"), + (3,2), (0,2), (0,"s"), + (1,3), (0,"lres"), (0,"lres"), + (3,3), (0,3), (0,"s"), + + (2,2), (0,3), + (2,3), (0,"s"), + (2,2), (1,2), + (2,3), (0,"s"), + (2,"frs"), + (2,"frs"), + (3,2), (1,"s"), + (3,3), (1,2), (1,"s"), + (3,2), (1,3), (1,"s"), + (3,3), (1,3), (1,"s"), + (3,"frs"), + (3,"frs"), + + ] }, + + # INDEX 10 + # Based on 7, pairing mul ops, making sure we never have two add/sub/str + # between blocks of two muls + 10 : { "store_order": [1,0,3,2], + "schedule": + [ + (0,0), (0,0), (1,-1), (0,"l"), + (1,"sl"), (1,"lrs"), + + (0,1), (0,1), (2,-2), (1,"lrs"), + (2,"sl"), (1,"l"), + (1,0), (1,0), (2,-2), + (2,"sl"), (1,"l"), + + (1,1), (1,1), (2,-1), (2,"lrs"), + (2,"sl"), (2,"lrs"), + (0,0), (2,0), (2,-1), (2,"l"), + (2,"sl"), + (0,1), (2,1), (3,-2), (3,"lrs"), + (3,"sl"), (3,"lrs"), + (1,0), (3,0), (3,-2), + (3,"sl"), (3,"l"), + (1,1), (3,1), (3,-1), (0,"l"), + (3,"sl"), + + (2,"l"), + (2,0), (2,1), (3,-1), (0,"l"), + (3,"sl"), (2,"l"), + (2,0), (2,1), (0,0), (1,"l"), + (0,0), (3,"l"), + (3,0), (3,1), (0,1), (1,"l"), + (0,1), (3,"l"), + (3,0), (3,1), (1,0), + (1,0), + + (0,2), (0,2), (1,1), + (1,1), + (0,3), (0,3), (2,0), (0,"frs"), + (2,0), (0,"frs"), + (1,2), (1,2), (2,1), + (2,1), + (1,3), (1,3), (3,0), (1,"frs"), + (3,0), (1,"frs"), + + (0,2), (2,2), (3,1), (3,"le"), + (3,1), + (0,3), (2,3), (0,2), (2,"le"), + (0,"s"), + (1,2), (3,2), (0,2), (0,"le"), + (0,"s"), + (1,3), (3,3), (0,3), (0,"lres"), + (0,"s"), + + (2,2), (2,3), (0,3), + (0,"s"), (0,"lres"), + (2,2), (2,3), (1,2), (2,"frs"), + (1,"s"), (2,"frs"), + (3,2), (3,3), (1,2), + (1,"s"), + (3,2), (3,3), (1,3), (3,"frs"), + (1,"s"), (3,"frs"), + + ] }, + + # INDEX 11 + # Based on 10, trying to find a better spacing for LDRs + # Note: - #LDRs per iteration is 4*(4+2)=24 + # - _Exactly_ matches the number of cycles spent on multiplications + # - So we can arrange code in a way that every mul-block has 
precisely + # one LDR in it. That's what we're experimenting with here... + 11 : { "store_order": [1,0,3,2], + "schedule": + [ + (0,0), (0,0), (1,-1), (3,"l"), + (1,"sl"), + (0,1), (0,1), (2,-2), (3,"l"), + (2,"sl"), + (1,0), (1,0), (2,-2), (2,"lrs"), + (2,"sl"), + (1,1), (1,1), (2,-1), (2,"lrs"), + (2,"sl"), + (0,0), (2,0), (2,-1), (3,"lrs"), + (2,"sl"), + (0,1), (2,1), (3,-2), (3,"lrs"), + (3,"sl"), + (1,0), (3,0), (3,-2), (0,"l"), + (3,"sl"), + (1,1), (3,1), (3,-1), (0,"l"), + (3,"sl"), + (2,0), (2,1), (3,-1), (1,"l"), + (3,"sl"), + (2,0), (2,1), (0,0), (1,"l"), + (0,0), + (3,0), (3,1), (0,1), (2,"l"), + (0,1), + (3,0), (3,1), (1,0), (2,"l"), + (1,0), + + (0,2), (0,2), (1,1), (3,"l"), + (1,1), + (0,3), (0,3), (2,0), (3,"l"), (0,"frs"), + (2,0), (0,"frs"), + (1,2), (1,2), (2,1), (0,"le"), + (2,1), + (1,3), (1,3), (3,0), (0,"le"), (1,"frs"), + (3,0), (1,"frs"), + + (0,2), (2,2), (3,1), (1,"le"), + (3,1), + (0,3), (2,3), (0,2), (1,"le"), + (0,"s"), + (1,2), (3,2), (0,2), (0,"lres"), + (0,"s"), + (1,3), (3,3), (0,3), (0,"lres"), + (0,"s"), + + (2,2), (2,3), (0,3), (1,"lres"), + (0,"s"), + (2,2), (2,3), (1,2), (1,"lres"), (2,"frs"), + (1,"s"), (2,"frs"), + (3,2), (3,3), (1,2), (2,"le"), + (1,"s"), + (3,2), (3,3), (1,3), (2,"le"), (3,"frs"), + (1,"s"), (3,"frs"), + + ] }, + + # INDEX 12 + # Based on 11, but using a different load/store order + 12 : { "load_order": [3,2,1,0], + "store_order": [3,2,1,0], + "numbering": list(zip( + [1,0,2,0], + [3,2,3,1], + [0,0,2,1])), + "schedule": + [ + (0,0), (0,0), (1,-1), (3,"l"), + (1,"sl"), + (0,1), (0,1), (2,-2), (3,"l"), + (2,"sl"), + (1,0), (1,0), (2,-2), (2,"lrs"), + (2,"sl"), + (1,1), (1,1), (2,-1), (2,"lrs"), + (2,"sl"), + (0,0), (2,0), (2,-1), (3,"lrs"), + (2,"sl"), + (0,1), (2,1), (3,-2), (3,"lrs"), + (3,"sl"), + (1,0), (3,0), (3,-2), (0,"l"), + (3,"sl"), + (1,1), (3,1), (3,-1), (0,"l"), + (3,"sl"), + (2,0), (2,1), (3,-1), (1,"l"), + (3,"sl"), + (2,0), (2,1), (0,0), (1,"l"), + (0,0), + (3,0), (3,1), (0,1), (2,"l"), + 
(0,1), + (3,0), (3,1), (1,0), (2,"l"), + (1,0), + + (0,2), (0,2), (1,1), (3,"l"), + (1,1), + (0,3), (0,3), (2,0), (3,"l"), (0,"frs"), + (2,0), (0,"frs"), + (1,2), (1,2), (2,1), (0,"le"), + (2,1), + (1,3), (1,3), (3,0), (0,"le"), (1,"frs"), + (3,0), (1,"frs"), + + (0,2), (2,2), (3,1), (1,"le"), + (3,1), + (0,3), (2,3), (0,2), (1,"le"), + (0,"s"), + (1,2), (3,2), (0,2), (0,"lres"), + (0,"s"), + (1,3), (3,3), (0,3), (0,"lres"), + (0,"s"), + + (2,2), (2,3), (0,3), (1,"lres"), + (0,"s"), + (2,2), (2,3), (1,2), (1,"lres"), (2,"frs"), + (1,"s"), (2,"frs"), + (3,2), (3,3), (1,2), (2,"le"), + (1,"s"), + (3,2), (3,3), (1,3), (2,"le"), (3,"frs"), + (1,"s"), (3,"frs"), + + ] }, + + # INDEX 13 + # Based on 11, shifting the whole add/sub/store block up by two places + 13 : { "store_order": [1,0,3,2], + "schedule": + [ + (0,0), (0,0), (2,-2), (3,"l"), + (2,"sl"), + (0,1), (0,1), (2,-2), (3,"l"), + (2,"sl"), + (1,0), (1,0), (2,-1), (2,"lrs"), + (2,"sl"), + (1,1), (1,1), (2,-1), (2,"lrs"), + (2,"sl"), + + (0,0), (2,0), (3,-2), (3,"lrs"), + (3,"sl"), + (0,1), (2,1), (3,-2), (3,"lrs"), + (3,"sl"), + (1,0), (3,0), (3,-1), (0,"l"), + (3,"sl"), + (1,1), (3,1), (3,-1), (0,"l"), + (3,"sl"), + + (2,0), (2,1), (0,0), (1,"l"), + (0,0), + (2,0), (2,1), (0,1), (1,"l"), + (0,1), + (3,0), (3,1), (1,0), (2,"l"), + (1,0), + (3,0), (3,1), (1,1), (2,"l"), + (1,1), + + (0,2), (0,2), (2,0), (3,"l"), + (2,0), + (0,3), (0,3), (2,1), (3,"l"), (0,"frs"), + (2,1), (0,"frs"), + (1,2), (1,2), (3,0), (0,"le"), + (3,0), + (1,3), (1,3), (3,1), (0,"le"), (1,"frs"), + (3,1), (1,"frs"), + + (0,2), (2,2), (0,2), (1,"le"), + (0,"s"), + (0,3), (2,3), (0,2), (1,"le"), + (0,"s"), + (1,2), (3,2), (0,3), (0,"lres"), + (0,"s"), + (1,3), (3,3), (0,3), (0,"lres"), + (0,"s"), + + (2,2), (2,3), (1,2), (1,"lres"), + (1,"s"), + (2,2), (2,3), (1,2), (1,"lres"), (2,"frs"), + (1,"s"), (2,"frs"), + (3,2), (3,3), (1,3), (2,"le"), + (1,"s"), + (3,2), (3,3), (1,3), (2,"le"), (3,"frs"), + (1,"s"), (3,"frs"), + + ] }, + + # INDEX 14 + # 
Merge of 12+13: Shifted add/sub/str's and modified load/store order + 14 : { "load_order": [3,2,1,0], + "store_order": [3,2,1,0], + "numbering": list(zip( + [1,0,2,0], + [3,2,3,1], + [0,0,2,1])), + "schedule": + [ + (0,0), (0,0), (2,-2), (3,"l"), + (2,"sl"), + (0,1), (0,1), (2,-2), (3,"l"), + (2,"sl"), + (1,0), (1,0), (2,-1), (2,"lrs"), + (2,"sl"), + (1,1), (1,1), (2,-1), (2,"lrs"), + (2,"sl"), + + (0,0), (2,0), (3,-2), (3,"lrs"), + (3,"sl"), + (0,1), (2,1), (3,-2), (3,"lrs"), + (3,"sl"), + (1,0), (3,0), (3,-1), (0,"l"), + (3,"sl"), + (1,1), (3,1), (3,-1), (0,"l"), + (3,"sl"), + + (2,0), (2,1), (0,0), (1,"l"), + (0,0), + (2,0), (2,1), (0,1), (1,"l"), + (0,1), + (3,0), (3,1), (1,0), (2,"l"), + (1,0), + (3,0), (3,1), (1,1), (2,"l"), + (1,1), + + (0,2), (0,2), (2,0), (3,"l"), + (2,0), + (0,3), (0,3), (2,1), (3,"l"), (0,"frs"), + (2,1), (0,"frs"), + (1,2), (1,2), (3,0), (0,"le"), + (3,0), + (1,3), (1,3), (3,1), (0,"le"), (1,"frs"), + (3,1), (1,"frs"), + + (0,2), (2,2), (0,2), (1,"le"), + (0,"s"), + (0,3), (2,3), (0,2), (1,"le"), + (0,"s"), + (1,2), (3,2), (0,3), (0,"lres"), + (0,"s"), + (1,3), (3,3), (0,3), (0,"lres"), + (0,"s"), + + (2,2), (2,3), (1,2), (1,"lres"), + (1,"s"), + (2,2), (2,3), (1,2), (1,"lres"), (2,"frs"), + (1,"s"), (2,"frs"), + (3,2), (3,3), (1,3), (2,"le"), + (1,"s"), + (3,2), (3,3), (1,3), (2,"le"), (3,"frs"), + (1,"s"), (3,"frs"), + + ] }, + + # INDEX 15 + # Based on 11, moving add/sub/str's down by two places, and changing load/store order + # Moving down the add/sub/str's reduces pressure on the corresponding SIMD unit because + # the last add/sub/str's are farther away from their producers. 
+ 15 : { "load_order": [3,2,1,0], + "store_order": [3,2,1,0], + "numbering": list(zip( + [1,0,2,0], + [3,2,3,1], + [0,0,2,1])), + "schedule": + [ + (0,0), (0,0), (1,-1), (3,"l"), + (1,"sl"), + (0,1), (0,1), (1,-1), (3,"l"), + (1,"sl"), + (1,0), (1,0), (2,-2), (2,"lrs"), + (2,"sl"), + (1,1), (1,1), (2,-2), (2,"lrs"), + (2,"sl"), + (0,0), (2,0), (2,-1), (3,"lrs"), + (2,"sl"), + (0,1), (2,1), (2,-1), (3,"lrs"), + (2,"sl"), + (1,0), (3,0), (3,-2), (0,"l"), + (3,"sl"), + (1,1), (3,1), (3,-2), (0,"l"), + (3,"sl"), + (2,0), (2,1), (3,-1), (1,"l"), + (3,"sl"), + (2,0), (2,1), (3,-1), (1,"l"), + (3,"sl"), + (3,0), (3,1), (0,0), (2,"l"), + (0,0), + (3,0), (3,1), (0,1), (2,"l"), + (0,1), + (0,2), (0,2), (1,0), (3,"l"), + (1,0), + (0,3), (0,3), (1,1), (3,"l"), (0,"frs"), + (1,1), (0,"frs"), + (1,2), (1,2), (2,0), (0,"le"), + (2,0), + (1,3), (1,3), (2,1), (0,"le"), (1,"frs"), + (2,1), (1,"frs"), + (0,2), (2,2), (3,0), (1,"le"), + (3,0), + (0,3), (2,3), (3,1), (1,"le"), + (3,1), + (1,2), (3,2), (0,2), (0,"lres"), + (0,"s"), + (1,3), (3,3), (0,2), (0,"lres"), + (0,"s"), + (2,2), (2,3), (0,3), (1,"lres"), + (0,"s"), + (2,2), (2,3), (0,3), (1,"lres"), (2,"frs"), + (0,"s"), (2,"frs"), + (3,2), (3,3), (1,2), (2,"le"), + (1,"s"), + (3,2), (3,3), (1,2), (2,"le"), (3,"frs"), + (1,"s"), (3,"frs"), + + ] }, + + # INDEX 16 + # Deliberately bad, bunch lots of MULs + 16 : { "schedule": + [ (0, "m"), (0, "l"), (0, "l"), (0, "l"), (0, "l"), (0,"lre"), + (0, 0), (0, 0), (0, 0), + (0, 1), (0, 1), (0, 1), + (0, 0), (0, 0), + (0, 1), (0, 1), + (0, 2), (0, 2), (0, 2), + (0, 3), (0, 3), (0, 3), + (0, 2), (0, 2), + (0, 3), (0, 3), + (0,"s"), (0,"s"), (0,"s"), (0,"s"), (0,"frl"), + + (1, "m"), (1, "l"), (1, "l"), (1, "l"), (1, "l"), (1,"lre"), + (1, 0), (1, 0), (1, 0), + (1, 1), (1, 1), (1, 1), + (1, 0), (1, 0), + (1, 1), (1, 1), + (1, 2), (1, 2), (1, 2), + (1, 3), (1, 3), (1, 3), + (1, 2), (1, 2), + (1, 3), (1, 3), + (1,"s"), (1,"s"), (1,"s"), (1,"s"), (1,"frl"), + + (2, "m"), (2, "l"), (2, "l"), (2, 
"l"), (2, "l"), (2,"lre"), + (2, 0), (2, 0), (2, 0), + (2, 1), (2, 1), (2, 1), + (2, 0), (2, 0), + (2, 1), (2, 1), + (2, 2), (2, 2), (2, 2), + (2, 3), (2, 3), (2, 3), + (2, 2), (2, 2), + (2, 3), (2, 3), + (2,"s"), (2,"s"), (2,"s"), (2,"s"), (2,"frl"), + + (3, "m"), (3, "l"), (3, "l"), (3, "l"), (3, "l"), (3,"lre"), + (3, 0), (3, 0), (3, 0), + (3, 1), (3, 1), (3, 1), + (3, 0), (3, 0), + (3, 1), (3, 1), + (3, 2), (3, 2), (3, 2), + (3, 3), (3, 3), (3, 3), + (3, 2), (3, 2), + (3, 3), (3, 3), + (3,"s"), (3,"s"), (3,"s"), (3,"s"), (3,"frl") ] }, + } + + modification = modifications[idx] + + for k,v in modification.items(): + if not k in default.keys(): + raise Exception(f"Invalid modification: {k}") + + dic = { **default, **modification } + + dic["load_order_first"] = dic.get("load_order_first", dic["load_order"]) + dic["store_order_last"] = dic.get("store_order_last", dic["store_order"]) + + return dic + + def get_schedule_double_no_transpose(self, idx): + + load_order_default = [2,3,0,1] + store_order_default = [0,1,2,3] + butterfly_numbering_default = list(zip([0,1,0,2], + [2,3,1,3], + [0,0,1,2])) + twiddle_numbering_default = { 0: (0,0), + 1: (0,1), + 2: (0,2) } + root_load_order_default = list(range(0,10)) # Identity + + schedules = [ + + # INDEX 0 + # Trivial implementation, no interleaving whatsoever + (None, None, None, None, None, + ["m", "frl", + "l", "l", "l", "l", + 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, "lre", + 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, "s", "s", "s", "s" ]) + ] + + load_order, store_order, numbering, twiddles, root_order, schedule = schedules[idx] + + if load_order == None: + load_order = load_order_default + if store_order == None: + store_order = store_order_default + if numbering == None: + numbering = butterfly_numbering_default + if twiddles == None: + twiddles = twiddle_numbering_default + if root_order == None: + root_order = root_load_order_default + + return load_order, store_order, numbering, twiddles, root_order, schedule + + def 
get_schedule_quad_transpose(self, idx): + + load_order_default = [2,3,0,1] + store_order_default = [0,1,2,3] + butterfly_numbering_default = \ + list(zip([0,1,0,2, 0,1,0,2], + [2,3,1,3, 2,3,1,3], + [0,0,1,2, 3,3,4,5])) + twiddle_numbering_default = { 0: (0,0), + 1: (0,1), + 2: (0,2), + 3: (1,None), + 4: (2,None), + 5: (3,None) } + root_load_order_default = list(range(0,10)) # Identity + + schedules = [ + + # INDEX 0 + # Trivial implementation, no interleaving whatsoever + ( None, None, None, None, None, # All defaults + ["m", "frl", "l", "l", "l", "l", + 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, + "t", + 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, + 7, 7, 7, 7, 7, + "s", "s", "s", "s", "lre" ] ) + ] + + load_order, store_order, numbering, twiddles, root_order, schedule = schedules[idx] + + if load_order == None: + load_order = load_order_default + if store_order == None: + store_order = store_order_default + if numbering == None: + numbering = butterfly_numbering_default + if twiddles == None: + twiddles = twiddle_numbering_default + if root_order == None: + root_order = root_load_order_default + + return load_order, store_order, numbering, twiddles, root_order, schedule + + def get_schedule_quad_transpose_zipped(self, idx): + + load_order_default = [2,3,0,1] + store_order_default = [0,1,2,3] + butterfly_numbering_default = \ + list(zip([0,1,0,2, 0,1,0,2], + [2,3,1,3, 2,3,1,3], + [0,0,1,2, 3,3,4,5])) + twiddle_numbering_default = { 0: (0,0), + 1: (0,1), + 2: (0,2), + 3: (1,None), + 4: (2,None), + 5: (3,None) } + root_load_order_default = list(range(0,10)) # Identity + + schedules = [ + + # INDEX 0 + # Trivial implementation, no interleaving whatsoever + ( None, None, None, None, None, # All defaults + [(0,"m"), (0,"lr"), (0,"l"), (0,"l"), (0,"l"), (0,"l"), + (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), + (0, 1), (0, 1), (0, 1), (0, 1), (0, 1), + (0, 2), (0, 2), (0, 2), (0, 2), (0, 2), + (0, 3), (0, 3), (0, 3), (0, 3), (0, 3), + (0, "t"), + 
(0, 4), (0, 4), (0, 4), (0, 4), (0, 4), + (0, 5), (0, 5), (0, 5), (0, 5), (0, 5), + (0, 6), (0, 6), (0, 6), (0, 6), (0, 6), + (0, 7), (0, 7), (0, 7), (0, 7), (0, 7), + (0, "s"), (0,"s"), (0,"s"), (0,"s"), (0,"fr"), + + (1,"m"), (1,"lr"), (1,"l"), (1,"l"), (1,"l"), (1,"l"), + (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), + (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), + (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), + (1, 3), (1, 3), (1, 3), (1, 3), (1, 3), + (1, "t"), + (1, 4), (1, 4), (1, 4), (1, 4), (1, 4), + (1, 5), (1, 5), (1, 5), (1, 5), (1, 5), + (1, 6), (1, 6), (1, 6), (1, 6), (1, 6), + (1, 7), (1, 7), (1, 7), (1, 7), (1, 7), + (1, "s"), (1,"s"), (1,"s"), (1,"s"), (1,"fr")] ), + + # INDEX 1 + # Zipped together two trivial implementations + ( None, None, None, None, None, # All defaults + [(0,"m"), (0,"lr"), (0,"l"), (0,"l"), (0,"l"), (0,"l"), + (1,"m"), (1,"lr"), (1,"l"), (1,"l"), (1,"l"), (1,"l"), + + (0,0),(0,0),(0,0),(0,0),(0,0), + (1,0),(1,0),(1,0),(1,0),(1,0), + + (0,1),(0,1),(0,1),(0,1),(0,1), + (1,1),(1,1),(1,1),(1,1),(1,1), + + (0,2),(0,2),(0,2),(0,2),(0,2), + (1,2),(1,2),(1,2),(1,2),(1,2), + + (0,3),(0,3),(0,3),(0,3),(0,3), + (1,3),(1,3),(1,3),(1,3),(1,3), + + (0,"t"), + (1,"t"), + + (0,4),(0,4),(0,4),(0,4),(0,4), + (1,4),(1,4),(1,4),(1,4),(1,4), + + (0,5),(0,5),(0,5),(0,5),(0,5), + (1,5),(1,5),(1,5),(1,5),(1,5), + + (0,6),(0,6),(0,6),(0,6),(0,6), + (1,6),(1,6),(1,6),(1,6),(1,6), + + (0,7),(0,7),(0,7),(0,7),(0,7), + (1,7),(1,7),(1,7),(1,7),(1,7), + + (0,"s"),(0,"s"),(0,"s"),(0,"s"),(0,"fr"), + (1,"s"),(1,"s"),(1,"s"),(1,"s"),(1,"fr")] ), + + # INDEX 2 + # Zipped together slightly different, but still at butterfly granularity + ( None, None, None, None, None, # All defaults + [(0,"m"), (0,"lr"), (0,"l"), (0,"l"), (0,"l"), (0,"l"), + (1,"m"), (1,"lr"), (1,"l"), (1,"l"), (1,"l"), (1,"l"), + + (0,0),(0,0),(0,0),(0,0),(0,0), + (0,1),(0,1),(0,1),(0,1),(0,1), + + (1,0),(1,0),(1,0),(1,0),(1,0), + (1,1),(1,1),(1,1),(1,1),(1,1), + + (0,2),(0,2),(0,2),(0,2),(0,2), + 
(0,3),(0,3),(0,3),(0,3),(0,3), + + (1,2),(1,2),(1,2),(1,2),(1,2), + (1,3),(1,3),(1,3),(1,3),(1,3), + + (0,"t"), + (1,"t"), + + (0,4),(0,4),(0,4),(0,4),(0,4), + (0,5),(0,5),(0,5),(0,5),(0,5), + + (1,4),(1,4),(1,4),(1,4),(1,4), + (1,5),(1,5),(1,5),(1,5),(1,5), + + (0,6),(0,6),(0,6),(0,6),(0,6), + (0,7),(0,7),(0,7),(0,7),(0,7), + + (1,6),(1,6),(1,6),(1,6),(1,6), + (1,7),(1,7),(1,7),(1,7),(1,7), + + (0,"s"),(0,"s"),(0,"s"),(0,"s"),(0,"fr"), + (1,"s"),(1,"s"),(1,"s"),(1,"s"),(1,"fr")] ), + + # INDEX 3 + # Interleave some loads + ( None, None, None, None, None, # All defaults + [(0,"m"), (0,"lrs"), (0,"lrs"), (0,"l"), (0,"l"), + (1,"m"), (1,"lrs"), (1,"lrs"), + + (0,0), (1,"l"), (0,0), (1,"l"), (0,0),(0,0),(0,0), + (0,1), (1,"l"), (0,1), (1,"l"), (0,1),(0,1),(0,1), + + (1,0),(1,0),(1,0),(1,0),(1,0), + (1,1),(1,1),(1,1),(1,1),(1,1), + + (0,2),(0,2),(0,2),(0,2),(0,2), + (0,3),(0,3),(0,3),(0,3),(0,3), + + (1,2),(1,2),(1,2),(1,2),(1,2), + (1,3),(1,3),(1,3),(1,3),(1,3), + + (0,"t"), (0,"lrs"), (0,"lrs"), + (1,"t"), (1,"lrs"), (1,"lrs"), + + (0,4),(0,4),(0,4),(0,4),(0,4), + (0,5),(0,5),(0,5),(0,5),(0,5), + + (1,4),(1,4),(1,4),(1,4),(1,4), (0,"lrs"), (0,"lrs"), + (1,5),(1,5),(1,5),(1,5),(1,5), (0,"lrs"), (0,"lrs"), + + (0,6),(0,6),(0,6),(0,6),(0,6), (1,"lrs"), (1,"lrs"), + (0,7),(0,7),(0,7),(0,7),(0,7), (1,"lrs"), (1,"lrs"), + + (1,6), (0, "le"), (1,6), (1,6),(1,6),(1,6), + (1,7), (0, "le"), (1,7), (1,7),(1,7),(1,7), + + (0,"s"),(0,"s"), (0,"s"),(0,"s"), (0,"fr"), + (1,"s"),(1,"s"),(1,"s"),(1,"s"),(1,"fr")] ), + + # INDEX 4 + # Interleave loads + transposition + ( None, None, None, None, None, # All defaults + [(0,"m"), (0,"lrs"), (0,"lrs"), (0,"l"), (0,"l"), + (1,"m"), (1,"lrs"), (1,"lrs"), + + (0,0), (1,"l"), (0,0), (1,"l"), (0,0),(0,0),(0,0), + (0,1), (1,"l"), (0,1), (1,"l"), (0,1),(0,1),(0,1), + + (1,0),(1,0),(1,0),(1,0),(1,0), + (1,1),(1,1),(1,1),(1,1),(1,1), + + (0,2),(0,2),(0,2),(0,2),(0,2), + (0,3),(0,3),(0,3),(0,3),(0,3), + + (1,2), (1,2), (1,2), + (1,2), (0,"ts"), 
(0,"ts"), + (1,2), (0,"ts"), (0,"ts"), + (1,3), (0,"lrs"), (1,3), (0,"lrs"), + (1,3), (0,"ts"), (0,"ts"), + (1,3), (0,"ts"), (0,"ts"), + (1,3), + + + (0,4), (0,4), + (0,4), (1, "ts"), (1, "ts"), + (0,4), (1, "ts"), (1, "ts"), + (0,4), (1, "ts"), (1, "ts"), + (0,5), (1, "ts"), (1, "ts"), + (1,"lrs"), (1,"lrs"), (0,5),(0,5),(0,5),(0,5), + + (1,4),(1,4),(1,4),(1,4),(1,4), (0,"lrs"), (0,"lrs"), + (1,5),(1,5),(1,5),(1,5),(1,5), (0,"lrs"), (0,"lrs"), + + (0,6),(0,6),(0,6),(0,6),(0,6), (1,"lrs"), (1,"lrs"), + (0,7),(0,7),(0,7),(0,7),(0,7), (1,"lrs"), (1,"lrs"), + + (1,6), (0, "le"), (1,6), (1,6),(1,6),(1,6), + (1,7), (0, "le"), (1,7), (1,7),(1,7),(1,7), + + (0,"s"),(0,"s"), (0,"s"),(0,"s"), (0,"fr"), + (1,"s"),(1,"s"),(1,"s"),(1,"s"),(1,"fr")] ), + + # INDEX 5 + # Interleave arithmetic only + ( None, None, None, None, None, # All defaults + [(0,"m"), (0,"lrs"), (0,"lrs"), (0,"l"), (0,"l"), + (1,"m"), (1,"lrs"), (1,"lrs"), + + (0,0), (1,"l"), (0,0), (1,"l"), + (0,1), (1,"l"), (0,1), (1,"l"), (0,0), + (1,0), (1,0), (0,1), (0,0), (0,0), + (1,1), (1,1), (1,0), (0,1), (0,1), + (0,2), (0,2), (1,1), (1,0), (1,0), + (0,3), (0,3), (0,2), (1,1), (1,1), + (1,2), (1,2), (0,3), (0,2), (0,2), + (1,3), (1,3), (1,2), (0,3), (0,3), + (0, "t"), (0, "lrs"), (0, "lrs"), + (0,4), (0,4), (1,3), (1,2), (1,2), + (0,5), (0,5), (0,4), (1,3), (1,3), + (1, "t"), (1, "lrs"), (1, "lrs"), + (1,4), + (0,"lrs"), (0,"lrs"), + (1,4), (0,5), (0,4), (0,4), + (1,5), (0,"lrs"), (0,"lrs"), (1,5), (1,4), (0,5), (0,5), + (0,6), (1,"lrs"), (1,"lrs"), (0,6), (1,5), (1,4), (1,4), + (0,7), (1,"lrs"), (1,"lrs"), (0,7), (0,6), (1,5), (1,5), + (1,6), (0, "le"), (1,6), (0,7), (0,6), (0,6), + (1,7), (0, "le"), (1,7), (1,6), (0,7), (0,7), + (1,7), (1,6), (1,6), + (1,7), (1,7), + + (0,"s"),(0,"s"), (0,"s"),(0,"s"), (0,"fr"), + (1,"s"),(1,"s"),(1,"s"),(1,"s"),(1,"fr")] ), + + ] + + load_order, store_order, numbering, twiddles, root_order, schedule = schedules[idx] + + if load_order == None: + load_order = load_order_default 
+ if store_order == None: + store_order = store_order_default + if numbering == None: + numbering = butterfly_numbering_default + if twiddles == None: + twiddles = twiddle_numbering_default + if root_order == None: + root_order = root_load_order_default + + return load_order, store_order, numbering, twiddles, root_order, schedule + + def get_schedule_quad_transpose_quad_zipped(self, idx): + + load_order_default = [2,3,0,1] + store_order_default = [0,1,2,3] + butterfly_numbering_default = \ + list(zip([0,1,0,2, 0,1,0,2], + [2,3,1,3, 2,3,1,3], + [0,0,1,2, 3,3,4,5])) + twiddle_numbering_default = { 0: (0,0), + 1: (0,1), + 2: (0,2), + 3: (1,None), + 4: (2,None), + 5: (3,None) } + root_load_order_default = list(range(0,10)) # Identity + + schedules = [ + + # INDEX 0 + # Trivial implementation, no interleaving whatsoever + ( None, None, None, None, None, # All defaults + [(0,"m"), (0,"lr"), (0,"l"), (0,"l"), (0,"l"), (0,"l"), + (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), + (0, 1), (0, 1), (0, 1), (0, 1), (0, 1), + (0, 2), (0, 2), (0, 2), (0, 2), (0, 2), + (0, 3), (0, 3), (0, 3), (0, 3), (0, 3), + (0, "t"), + (0, 4), (0, 4), (0, 4), (0, 4), (0, 4), + (0, 5), (0, 5), (0, 5), (0, 5), (0, 5), + (0, 6), (0, 6), (0, 6), (0, 6), (0, 6), + (0, 7), (0, 7), (0, 7), (0, 7), (0, 7), + (0, "s"), (0,"s"), (0,"s"), (0,"s"), (0,"fr"), + + (1,"m"), (1,"lr"), (1,"l"), (1,"l"), (1,"l"), (1,"l"), + (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), + (1, 1), (1, 1), (1, 1), (1, 1), (1, 1), + (1, 2), (1, 2), (1, 2), (1, 2), (1, 2), + (1, 3), (1, 3), (1, 3), (1, 3), (1, 3), + (1, "t"), + (1, 4), (1, 4), (1, 4), (1, 4), (1, 4), + (1, 5), (1, 5), (1, 5), (1, 5), (1, 5), + (1, 6), (1, 6), (1, 6), (1, 6), (1, 6), + (1, 7), (1, 7), (1, 7), (1, 7), (1, 7), + (1, "s"), (1,"s"), (1,"s"), (1,"s"), (1,"fr"), + + (2,"m"), (2,"lr"), (2,"l"), (2,"l"), (2,"l"), (2,"l"), + (2, 0), (2, 0), (2, 0), (2, 0), (2, 0), + (2, 1), (2, 1), (2, 1), (2, 1), (2, 1), + (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), + (2, 3), (2, 3), (2, 3), 
(2, 3), (2, 3), + (2, "t"), + (2, 4), (2, 4), (2, 4), (2, 4), (2, 4), + (2, 5), (2, 5), (2, 5), (2, 5), (2, 5), + (2, 6), (2, 6), (2, 6), (2, 6), (2, 6), + (2, 7), (2, 7), (2, 7), (2, 7), (2, 7), + (2, "s"), (2,"s"), (2,"s"), (2,"s"), (2,"fr"), + + (3,"m"), (3,"lr"), (3,"l"), (3,"l"), (3,"l"), (3,"l"), + (3, 0), (3, 0), (3, 0), (3, 0), (3, 0), + (3, 1), (3, 1), (3, 1), (3, 1), (3, 1), + (3, 2), (3, 2), (3, 2), (3, 2), (3, 2), + (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), + (3, "t"), + (3, 4), (3, 4), (3, 4), (3, 4), (3, 4), + (3, 5), (3, 5), (3, 5), (3, 5), (3, 5), + (3, 6), (3, 6), (3, 6), (3, 6), (3, 6), + (3, 7), (3, 7), (3, 7), (3, 7), (3, 7), + (3, "s"), (3,"s"), (3,"s"), (3,"s"), (3,"fr")] ), + + # INDEX 1 + # Interleave pre- and post-transpose arithmetic + ( None, None, None, None, None, # All defaults + [(0,"m"), (0,"l"), (0,"l"), (0,"l"), (0,"l"), + (1,"m"), (1,"l"), (1,"l"), (1,"l"), (1,"l"), + (2,"m"), (2,"l"), (2,"l"), (2,"l"), (2,"l"), + (3,"m"), (3,"l"), (3,"l"), (3,"l"), (3,"l"), + + (0,"lrs"), (0,"lrs"), + (1,"lrs"), (1,"lrs"), + (2,"lrs"), (2,"lrs"), + (3,"lrs"), (3,"lrs"), + + (0,0), (0,0), + (0,1), (0,1), (0,0), + (1,0), (1,0), (0,1), (0,0), (0,0), + (1,1), (1,1), (1,0), (0,1), (0,1), + (0,2), (0,2), (1,1), (1,0), (1,0), + (0,3), (0,3), (0,2), (1,1), (1,1), + (1,2), (1,2), (0,3), (0,2), (0,2), (0, "frs"), (0, "frs"), + (1,3), (1,3), (1,2), (0,3), (0,3), + (2,0), (2,0), (1,3), (1,2), (1,2), (1, "frs"), (1, "frs"), + (2,1), (2,1), (2,0), (1,3), (1,3), + (3,0), (3,0), (2,1), (2,0), (2,0), + (3,1), (3,1), (3,0), (2,1), (2,1), + (2,2), (2,2), (3,1), (3,0), (3,0), + (2,3), (2,3), (2,2), (3,1), (3,1), + (3,2), (3,2), (2,3), (2,2), (2,2), (2, "frs"), (2, "frs"), + (3,3), (3,3), (3,2), (2,3), (2,3), + (3,3), (3,2), (3,2), (3, "frs"), (3, "frs"), + (3,3), (3,3), + + (0, "t"), + (1, "t"), + (2, "t"), + (3, "t"), + + (0, "lrs"), (0, "lrs"), + (0, 4), (0, 4), + (0, 5), (0, 5), + (0, "frs"), (0, "frs"), + (0, 4), + (1, "lrs"), (1, "lrs"), + (1, 4), (1, 4), (0, 5), 
(0, 4), (0, 4), + (1, 5), (1, 5), + (1, "frs"), (1, "frs"), + (1, 4), (0, 5), (0, 5), + (0, "lrs"), (0, "lrs"), + (0, 6), (0, 6), + (0, "frs"), (0, "frs"), + (1, 5), (1, 4), (1, 4), + (0, "lrs"), (0, "lrs"), + (0, 7), (0, 7), + (0, "frs"), (0, "frs"), + (0, 6), (1, 5), (1, 5), + (1, "lrs"), (1, "lrs"), + (1, 6), (1, 6), + (1, "frs"), (1, "frs"), + (0, 7), (0, 6), (0, 6), + (1, "lrs"), (1, "lrs"), + (1, 7), (1, 7), + (1, "frs"), (1, "frs"), + (1, 6), (0, 7), (0, 7), + (1, 7), (1, 6), (1, 6), + (1, 7), (1, 7), + + (0, "s"), (0,"s"), (0,"s"), (0,"s"), + (1, "s"), (1,"s"), (1,"s"), (1,"s"), + + (2, "lrs"), (2, "lrs"), + (2, 4), (2, 4), + (2, 5), (2, 5), + (2, "frs"), (2, "frs"), + (2, 4), + (3, "lrs"), (3, "lrs"), + (3, 4), (3, 4), (2, 5), (2, 4), (2, 4), + (3, 5), (3, 5), + (3, "frs"), (3, "frs"), + (3, 4), (2, 5), (2, 5), + (2, "lrs"), (2, "lrs"), + (2, 6), (2, 6), + (2, "frs"), (2, "frs"), + (3, 5), (3, 4), (3, 4), + (2, "lrs"), (2, "lrs"), + (2, 7), (2, 7), + (2, "frs"), (2, "frs"), + (2, 6), (3, 5), (3, 5), + (3, "lrs"), (3, "lrs"), + (3, 6), (3, 6), + (3, "frs"), (3, "frs"), + (2, 7), (2, 6), (2, 6), + (3, "lrs"), (3, "lrs"), + (3, 7), (3, 7), + (3, "frs"), (3, "frs"), + (3, 6), (2, 7), (2, 7), + (3, 7), (3, 6), (3, 6), + (3, 7), (3, 7), + + (2, "s"), (2,"s"), (2,"s"), (2,"s"), + (3, "s"), (3,"s"), (3,"s"), (3,"s") ] ), + + # INDEX 2 + # Interleave pre- and post-transpose arithmetic, and transpose + ( None, None, None, None, None, # All defaults + [(0,"m"), (0,"l"), (0,"l"), (0,"l"), (0,"l"), + (1,"m"), (1,"l"), (1,"l"), (1,"l"), (1,"l"), + (2,"m"), (2,"l"), (2,"l"), (2,"l"), (2,"l"), + (3,"m"), (3,"l"), (3,"l"), (3,"l"), (3,"l"), + + (0,"lrs"), (0,"lrs"), + (1,"lrs"), (1,"lrs"), + (2,"lrs"), (2,"lrs"), + (3,"lrs"), (3,"lrs"), + + (0,0), (0,0), + (0,1), (0,1), (0,0), + (1,0), (1,0), (0,1), (0,0), (0,0), + (1,1), (1,1), (1,0), (0,1), (0,1), + (0,2), (0,2), (1,1), (1,0), (1,0), + (0,3), (0,3), (0,2), (1,1), (1,1), + (1,2), (1,2), (0,3), (0,2), (0,2), (0, "frs"), 
(0, "frs"), + (1,3), (1,3), (1,2), (0,3), (0,3), + (2,0), (2,0), + (0, "ts"), + (0, "ts"), + (1,3), (1,2), (1,2), (1, "frs"), (1, "frs"), + (2,1), (2,1), + (0, "ts"), + (0, "ts"), + (2,0), (1,3), (1,3), + (3,0), (3,0), + (0, "ts"), + (0, "ts"), + (2,1), (2,0), (2,0), + (3,1), (3,1), + (0, "ts"), + (0, "ts"), + (3,0), (2,1), (2,1), + (2,2), (2,2), + (1, "ts"), + (1, "ts"), + (3,1), (3,0), (3,0), + (2,3), (2,3), + (1, "ts"), + (1, "ts"), + (2,2), (3,1), (3,1), + (3,2), (3,2), + (1, "ts"), + (1, "ts"), + (2,3), (2,2), (2,2), (2, "frs"), (2, "frs"), + (3,3), (3,3), + (1, "ts"), + (1, "ts"), + (3,2), (2,3), (2,3), + (3,3), (3,2), (3,2), (3, "frs"), (3, "frs"), + (3,3), (3,3), + + (0, "lrs"), (0, "lrs"), + (0, 4), (0, 4), + (2, "ts"), + (2, "ts"), + (0, 5), (0, 5), + (2, "ts"), + (2, "ts"), + (0, "frs"), (0, "frs"), + (0, 4), + (1, "lrs"), (1, "lrs"), + (1, 4), (1, 4), + (2, "ts"), + (2, "ts"), + (0, 5), (0, 4), (0, 4), + (1, 5), (1, 5), + (1, "frs"), (1, "frs"), + (2, "ts"), + (2, "ts"), + (1, 4), (0, 5), (0, 5), + (0, "lrs"), (0, "lrs"), + (0, 6), (0, 6), + (0, "frs"), (0, "frs"), + (3, "ts"), + (3, "ts"), + (1, 5), (1, 4), (1, 4), + (0, "lrs"), (0, "lrs"), + (0, 7), (0, 7), + (0, "frs"), (0, "frs"), + (3, "ts"), + (3, "ts"), + (0, 6), (1, 5), (1, 5), + (1, "lrs"), (1, "lrs"), + (1, 6), (1, 6), + (1, "frs"), (1, "frs"), + (3, "ts"), + (3, "ts"), + (0, 7), (0, 6), (0, 6), + (1, "lrs"), (1, "lrs"), + (1, 7), (1, 7), + (1, "frs"), (1, "frs"), + (3, "ts"), + (3, "ts"), + (1, 6), (0, 7), (0, 7), + (1, 7), (1, 6), (1, 6), + (1, 7), (1, 7), + + (0, "s"), (0,"s"), (0,"s"), (0,"s"), + (1, "s"), (1,"s"), (1,"s"), (1,"s"), + + (2, "lrs"), (2, "lrs"), + (2, 4), (2, 4), + (2, 5), (2, 5), + (2, "frs"), (2, "frs"), + (2, 4), + (3, "lrs"), (3, "lrs"), + (3, 4), (3, 4), (2, 5), (2, 4), (2, 4), + (3, 5), (3, 5), + (3, "frs"), (3, "frs"), + (3, 4), (2, 5), (2, 5), + (2, "lrs"), (2, "lrs"), + (2, 6), (2, 6), + (2, "frs"), (2, "frs"), + (3, 5), (3, 4), (3, 4), + (2, "lrs"), (2, "lrs"), + 
(2, 7), (2, 7), + (2, "frs"), (2, "frs"), + (2, 6), (3, 5), (3, 5), + (3, "lrs"), (3, "lrs"), + (3, 6), (3, 6), + (3, "frs"), (3, "frs"), + (2, 7), (2, 6), (2, 6), + (3, "lrs"), (3, "lrs"), + (3, 7), (3, 7), + (3, "frs"), (3, "frs"), + (3, 6), (2, 7), (2, 7), + (3, 7), (3, 6), (3, 6), + (3, 7), (3, 7), + + (2, "s"), (2,"s"), (2,"s"), (2,"s"), + (3, "s"), (3,"s"), (3,"s"), (3,"s") ] ), + + # INDEX 3 + # Interleave pre- and post-transpose arithmetic, and transpose + # And loads + stores + ( None, None, None, None, None, # All defaults + [(0,"m"), + (1,"m"), + + (0,0), (0,0), + (0,1), (0,1), + (2, "sl"), + (2, "sl"), + (2, "sl"), + (2, "sl"), + (0,0), + (0, "l"), + (1,0), + (0, "l"), + (1,0), + (3, "sl"), + (3, "sl"), + (3, "sl"), + (3, "sl"), + (0,1), + (0,0), (0,0), + (1,1), + (2,"l"), + (1,1), + (2,"l"), + (1,0), + (1,"l"), + (0,1), (0,1), + (0,2), + (2,"l"), + (0,2), + (2,"l"), + (1,1), + (1,"l"), + (1,0), (1,0), + (0,3), + (2,"lrs"), + (0,3), + (2,"lrs"), + (0,2), + (1,1), + (1,1), + (1,2), + (3,"l"), + (1,2), + (3,"l"), + (0,3), (0,2), (0,2), + (0, "frs"), + (0, "frs"), + (1,3), + (3,"l"), + (1,3), + (3,"l"), + (1,2), (0,3), (0,3), + (2,0), + (3,"lrs"), + (2,0), + (3,"lrs"), + (0, "ts"), + (0, "ts"), + (1,3), (1,2), (1,2), + (1, "frs"), + (1, "frs"), + (2,1), (2,1), + (0, "ts"), + (0, "ts"), + (2,0), (1,3), (1,3), + (3,0), (3,0), + (0, "ts"), + (0, "ts"), + (2,1), (2,0), (2,0), + (3,1), (3,1), + (0, "ts"), + (0, "ts"), + (3,0), (2,1), (2,1), + (2,2), (2,2), + (1, "ts"), + (1, "ts"), + (3,1), (3,0), (3,0), + (2,3), (2,3), + (1, "ts"), + (1, "ts"), + (2,2), (3,1), (3,1), + (3,2), (3,2), + (1, "ts"), + (1, "ts"), + (2,3), (2,2), (2,2), + (2, "frs"), + (2, "frs"), + (3,3), (3,3), + (1, "ts"), + (1, "ts"), + (3,2), (2,3), (2,3), + (3,3), (3,2), (3,2), + (3, "frs"), + (3, "frs"), + (3,3), (3,3), + + (0, "lrs"), + (0, "lrs"), + (0, 4), (0, 4), + (2, "ts"), + (2, "ts"), + (0, 5), (0, 5), + (0, "frs"), + (0, "frs"), + (2, "ts"), + (2, "ts"), + (0, 4), + (1, "lrs"), + (1, 
"lrs"), + (1, 4), (1, 4), + (2, "ts"), + (2, "ts"), + (0, 5), (0, 4), (0, 4), + (1, 5), (1, 5), + (1, "frs"), + (1, "frs"), + (2, "ts"), + (2, "ts"), + (1, 4), (0, 5), (0, 5), + (0, "lrs"), + (0, "lrs"), + (0, 6), (0, 6), + (0, "frs"), + (0, "frs"), + (3, "ts"), + (3, "ts"), + (1, 5), (1, 4), (1, 4), + (0, "lrs"), + (0, "lrs"), + (0, 7), (0, 7), + (0, "frs"), + (0, "frs"), + (3, "ts"), + (3, "ts"), + (0, 6), (1, 5), (1, 5), + (1, "lrs"), + (1, "lrs"), + (1, 6), (1, 6), + (1, "frs"), + (1, "frs"), + (3, "ts"), + (3, "ts"), + (0, 7), (0, 6), (0, 6), + (1, "lrs"), + (1, "lrs"), + (1, 7), (1, 7), + (1, "frs"), + (1, "frs"), + (3, "ts"), + (3, "ts"), + (1, 6), (0, 7), (0, 7), + (1, 7), (1, 6), (1, 6), + (1, 7), (1, 7), + + ######################################################### + + (2, "lrs"), + (2, "lrs"), + (2, 4), (2, 4), + (0, "s"), + (2, 5), + (0, "s"), + (2, 5), + (2, "frs"), + (2, "frs"), + (0, "s"), + (2, 4), + (3, "lrs"), + (3, "lrs"), + (3, 4), + (1, "s"), + (3, 4), + (0, "s"), + (2, 5), + (1, "s"), + (2, 4), (2, 4), + (3, 5), (3, 5), + (3, "frs"), + (3, "frs"), + (1, "s"), + (3, 4), (2, 5), (2, 5), + (2, "lrs"), + (2, "lrs"), + (2, 6), (2, 6), + (2, "frs"), + (2, "frs"), + (1, "s"), + (3, 5), (3, 4), (3, 4), + (2, "lrs"), + (2, "lrs"), + (2, 7), (2, 7), + (2, "frs"), + (2, "frs"), + (2, 6), (3, 5), (3, 5), + (3, "lrs"), + (3, "lrs"), + (3, 6), (3, 6), + (3, "frs"), + (3, "frs"), + (2, 7), (2, 6), (2, 6), + (3, "lrs"), + (3, "lrs"), + (3, 7), (3, 7), + (3, "frs"), + (3, "frs"), + (0, "le"), + (3, 6), + (0, "le"), + (2, 7), + (0, "lres"), + (2, 7), + (0, "lres"), + (3, 7), + (1,"lres"), + (3, 6), + (1, "le"), + (3, 6), + (1,"lres"), + (3, 7), + (1, "le"), + (3, 7), + + ] ), + + # INDEX 4 + # Interleave pre- and post-transpose arithmetic, and transpose + # And loads + stores + ( None, None, None, None, None, # All defaults + [(0,0), (0,0), (3, -1), (3, -2), (3, -2), + (0,1), (0,1), + (2, "sl"), + (2, "sl"), + (2, "sl"), + (2, "sl"), + (0,0), (3,-1), (3,-1), + 
(0, "l"), + (1,0), + (0, "l"), + (1,0), + (3, "sl"), + (3, "sl"), + (3, "sl"), + (3, "sl"), + (0,1), + (0,0), (0,0), + (1,1), + (2,"l"), + (1,1), + (2,"l"), + (1,0), + (1,"l"), + (0,1), (0,1), + (0,2), + (2,"l"), + (0,2), + (2,"l"), + (1,1), + (1,"l"), + (1,0), (1,0), + (0,3), + (2,"lrs"), + (0,3), + (2,"lrs"), + (0,2), + (1,1), + (1,1), + (1,2), + (3,"l"), + (1,2), + (3,"l"), + (0,3), (0,2), (0,2), + (0, "frs"), + (0, "frs"), + (1,3), + (3,"l"), + (1,3), + (3,"l"), + (1,2), (0,3), (0,3), + (2,0), + (3,"lrs"), + (2,0), + (3,"lrs"), + (0, "ts"), + (0, "ts"), + (1,3), (1,2), (1,2), + (1, "frs"), + (1, "frs"), + (2,1), (2,1), + (0, "ts"), + (0, "ts"), + (2,0), (1,3), (1,3), + (3,0), (3,0), + (0, "ts"), + (0, "ts"), + (2,1), (2,0), (2,0), + (3,1), (3,1), + (0, "ts"), + (0, "ts"), + (3,0), (2,1), (2,1), + (2,2), (2,2), + (1, "ts"), + (1, "ts"), + (3,1), (3,0), (3,0), + (2,3), (2,3), + (1, "ts"), + (1, "ts"), + (0, "lrs"), + (0, "lrs"), + (2,2), (3,1), (3,1), + (3,2), (3,2), + (1, "ts"), + (1, "ts"), + (2,3), (2,2), (2,2), + (2, "frs"), + (2, "frs"), + (3,3), (3,3), + (1, "ts"), + (1, "ts"), + (3,2), (2,3), (2,3), + (0,4), (0,4), (3,3), (3,2), (3,2), + (3, "frs"), + (3, "frs"), + (1, "lrs"), + (1, "lrs"), + (0,5), (0,5), + (2, "ts"), + (2, "ts"), + + (0,4), (3,3), (3,3), + + (0, "frs"), + (0, "frs"), + (0, "lrs"), + (0, "lrs"), + (2, "ts"), + (2, "ts"), + + (1, 4), (1, 4), + (2, "ts"), + (2, "ts"), + (0, "lrs"), + (0, "lrs"), + (0, 5), (0, 4), (0, 4), + (1, 5), (1, 5), + (1, "frs"), + (1, "frs"), + (2, "ts"), + (2, "ts"), + (1, "lrs"), + (1, "lrs"), + (1, 4), (0, 5), (0, 5), + (0, 6), (0, 6), + (0, "frs"), + (0, "frs"), + (3, "ts"), + (3, "ts"), + (1, "lrs"), + (1, "lrs"), + (1, 5), (1, 4), (1, 4), + (0, 7), (0, 7), + (0, "frs"), + (0, "frs"), + (3, "ts"), + (3, "ts"), + (0, 6), (1, 5), (1, 5), + (1, 6), (1, 6), + (2, "lrs"), + (2, "lrs"), + (1, "frs"), + (1, "frs"), + (3, "ts"), + (3, "ts"), + (0, 7), (0, 6), (0, 6), + (1, 7), (1, 7), + (1, "frs"), + (1, "frs"), + (3, 
"ts"), + (3, "ts"), + (1, 6), (0, 7), (0, 7), + (2, 4), + (3, "lrs"), + (3, "lrs"), + (2, 4), + (0, "s"), + (0, "s"), + (1, 7), (1, 6), (1, 6), + (2, 5), (2, 5), + (2, "frs"), + (2, "frs"), + (0, "s"), + (2, 4), (1, 7), (1, 7), + (2, "lrs"), + (2, "lrs"), + (3, 4), + (1, "s"), + (3, 4), + (0, "s"), + (2, 5), + (1, "s"), + (2, 4), (2, 4), + (2, "lrs"), + (2, "lrs"), + (3, 5), (3, 5), + (3, "frs"), + (3, "frs"), + (1, "s"), + (3, 4), (2, 5), (2, 5), + (3, "lrs"), + (3, "lrs"), + (2, 6), (2, 6), + (2, "frs"), + (2, "frs"), + (3, "lrs"), + (3, "lrs"), + (1, "s"), + (3, 5), (3, 4), (3, 4), + (2, 7), (2, 7), + (2, "frs"), + (2, "frs"), + (2, 6), (3, 5), (3, 5), + (3, 6), (3, 6), + (3, "frs"), + (3, "frs"), + (0, "le"), + (2, 7), + (0, "le"), + (2, 6), (2, 6), + (3, 7), + (0, "lres"), + (3, 7), + (3, "frs"), + (3, "frs"), + (0, "lres"), + (3, 6), + (1,"lres"), + (2, 7), + (1, "le"), + (2, 7), + (1,"lres"), + (1, "le"), + + ]), + ] + + load_order, store_order, numbering, twiddles, root_order, schedule = schedules[idx] + + if load_order == None: + load_order = load_order_default + if store_order == None: + store_order = store_order_default + if numbering == None: + numbering = butterfly_numbering_default + if twiddles == None: + twiddles = twiddle_numbering_default + if root_order == None: + root_order = root_load_order_default + + return load_order, store_order, numbering, twiddles, root_order, schedule + + def get_schedule_quad_no_transpose(self, idx): + + def add(n): + def _add(x): + if isinstance(x,int): + return x + n + else: + return x + return _add + + butterfly_numbering_default = \ + list(zip( + [0, 1, 2, 3, 4, 5, 6, 7, 0,1,2,3, 8, 9,10,11, 0,1,4,5, 8, 9,12,13, 0,2,4, 6, 8,10,12,14], + [8, 9,10,11,12,13,14,15, 4,5,6,7,12,13,14,15, 2,3,6,7,10,11,14,15, 1,3,5, 7, 9,11,13,15], + [0, 0, 0, 0, 0, 0, 0, 0, 1,1,1,1, 2, 2, 2, 2, 3,3,4,4, 5, 5, 6, 6, 7,8,9,10,11,12,13,14])) + + + default = { + "load_order": [12,13,14,15,4,5,6,7,8,9,10,11,0,1,2,3], + "store_order": 
list(range(0,16)), + "numbering": butterfly_numbering_default[4:8] + butterfly_numbering_default[0:4] + \ + butterfly_numbering_default[10:12] + butterfly_numbering_default[8:10] + \ + butterfly_numbering_default[14:16] + butterfly_numbering_default[12:14] + \ + butterfly_numbering_default[16:32], + "twiddles": { 0: (0,0), + 1: (0,1), + 2: (0,2), + 3: (1,0), + 4: (1,1), + 5: (1,2), + 6: (1,3), + 7: (2,0), + 8: (2,1), + 9: (2,2), + 10: (2,3), + 11: (3,0), + 12: (3,1), + 13: (3,2), + 14: (3,3) }, + "root_load_order": list(range(0,10)), + "schedule": None } + + modifications = { + + # INDEX 0 + # Trivial implementation, no interleaving whatsoever + 0 : { "schedule": + ["m", "frl", "l", "l", "l", "l", + "l", "l", "l", "l", + "l", "l", "l", "l", + "l", "l", "l", "l", + + 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, + 7, 7, 7, 7, 7, + + 8, 8, 8, 8, 8, + 9, 9, 9, 9, 9, + 10, 10, 10, 10, 10, + 11, 11, 11, 11, 11, + 12, 12, 12, 12, 12, + 13, 13, 13, 13, 13, + 14, 14, 14, 14, 14, + 15, 15, 15, 15, 15, + + 16, 16, 16, 16, 16, + 17, 17, 17, 17, 17, + 18, 18, 18, 18, 18, + 19, 19, 19, 19, 19, + 20, 20, 20, 20, 20, + 21, 21, 21, 21, 21, + 22, 22, 22, 22, 22, + 23, 23, 23, 23, 23, + + 24, 24, 24, 24, 24, + 25, 25, 25, 25, 25, + 26, 26, 26, 26, 26, "lre", + 27, 27, 27, 27, 27, + 28, 28, 28, 28, 28, + 29, 29, 29, 29, 29, + 30, 30, 30, 30, 30, + 31, 31, 31, 31, 31, + + "s", "s", "s", "s", + "s", "s", "s", "s", + "s", "s", "s", "s", + "s", "s", "s", "s" ] }, + + # INDEX 1 + # First interleaving attempt: Arithmetic only + # Space out arithmetic operations to account + # for A72/N1 latencies of multiplications. 
+ 1 : { "schedule": + ["m", "l", "l", "l", "l", + "l", "l", "l", "l", + "l", "l", "l", "l", + "l", "l", "l", "l", + + 0, 0, + 1, 1, 0, + 2, 2, 1, 0, 0, + 3, 3, 2, 1, 1, + 4, 4, 3, 2, 2, + 5, 5, 4, 3, 3, + 6, 6, 5, 4, 4, + 7, 7, 6, 5, 5, + + 8, 8, 7, 6, 6, + 9, 9, 8, 7, 7, + 10, 10, 9, 8, 8, + 11, 11,10, 9, 9, + 12, 12,11,10, 10, + 13, 13,12,11, 11, + + 14, 14,13,12, 12, + 15, 15,14,13, 13, + + 16, 16,15,14, 14, + 17, 17,16,15, 15, + 18, 18,17,16, 16, + 19, 19,18,17, 17, + + 20, 20,19,18, 18, + 21, 21,20,19, 19, + 22, 22,21,20, 20, + 23, 23,22,21, 21, + + 24, 24,23,22, 22, + 25, 25,24,23, 23, "lre", + 26, 26,25,24, 24, + 27, 27,26,25, 25, + + 28, 28,27,26, 26, + 29, 29,28,27, 27, + 30, 30,29,28, 28, + 31, 31,30,29, 29, + 31,30, 30, + 31, 31, + + "s", "s", "s", "s", + "s", "s", "s", "s", + "s", "s", "s", "s", + "s", "s", "s", "s", "frl" ] }, + + # INDEX 2 + # TODO: Document + 2 : { "schedule": + ["m", 0, "sl", "l", 0, "l", -1, -2, -2, "sl", + 1, "sl", "l", 1, "l", 0, -1, -1, "sl", + 2, "sl", 2, "l", 1, 0, 0, "sl", + 3, "sl", 3, "l", 2, 1, 1, + 4, "l", 4, 3, 2, 2, + 5, "l", 5, 4, 3, 3, + 6, 6, 5, 4, 4, + 7, 7, 6, 5, 5, + + 8, 8, 7, 6, 6, + 9, 9, 8, 7, 7, + 10, 10, 9, 8, 8, + 11, 11,10, 9, 9, + 12, 12,11,10, 10, + 13, 13,12,11, 11, + + 14, 14,13,12, 12, + 15, 15,14,13, 13, + + 16, 16,15,14, 14, + 17, 17,16,15, 15, + 18, 18,17,16, 16, + 19, 19,18,17, 17, + + 20, 20,19,18, 18, + 21, 21,20,19, 19, + 22, 22,21,20, 20, + 23, 23,22,21, 21, + + 24, 24,23,22, 22, "lre", + 25, 25,24,23, 23, + 26, 26,25,24, 24, + 27, 27, 26, 25, 25, "s", + 28, "s", 28, "le", 27, "le", 26, 26, "s", + 29, "s", 29, "le", 28, "le", 27, 27, "s", + 30, "s", 30, "le", 29, "le", 28, 28, "s", + 31, "s", 31, "frl", "le", 30, "le", 29, 29, "s" ] }, + + # INDEX 3 + # TODO: Document + 3 : { "schedule": + ["m", 0, "sl", "l", 0, "l", -2, -1, -2, "sl", + 1, "sl", "l", 1, "l", -1, 0, -1, "sl", + 2, "sl", 2, "l", 0, 1, 0, "sl", + 3, "sl", 3, "l", 1, 2, 1, + 4, "l", 4, 2, 3, 2, + 5, "l", 5, 3, 4, 3, + 6, 6, 4, 5, 
4, + 7, 7, 5, 6, 5, + + 8, 8, 6, 7, 6, + 9, 9, 7, 8, 7, + 10, 10, 8, 9, 8, + 11, 11, 9,10, 9, + 12, 12,10,11, 10, + 13, 13,11,12, 11, + + 14, 14,12,13, 12, + 15, 15,13,14, 13, + + 16, 16,14,15, 14, + 17, 17,15,16, 15, + 18, 18,16,17, 16, + 19, 19,17,18, 17, + + 20, 20,18,19, 18, + 21, 21,19,20, 19, + 22, 22,20,21, 20, + 23, 23,21,22, 21, + + 24, 24,22,23, 22, "lre", + 25, 25,23,24, 23, + 26, 26,24,25, 24, + 27, 27, 25, 26, 25, "s", + 28, "s", 28, "le", 26, "le", 27, 26, "s", + 29, "s", 29, "le", 27, "le", 28, 27, "s", + 30, "s", 30, "le", 28, "le", 29, 28, "s", + 31, "s", 31, "frl", "le", 29, "le", 30, 29, "s" ] }, + + # INDEX 4 + 4 : { "schedule": + ["m", "frl", + "l", "l", "l", "l", + "l", "l", "l", "l", + "l", "l", "l", "l", + "l", "l", "l", "l", + + "lr", + + 0, 1, 2, 3, + 0, 1, 2, 3, + 0, 1, 2, 3, + 0, 1, 2, 3, + 0, 1, 2, 3, + + 4, 5, 6, 7, + 4, 5, 6, 7, + 4, 5, 6, 7, + 4, 5, 6, 7, + 4, 5, 6, 7, + + 8+0, 8+1, 8+2, 8+3, + 8+0, 8+1, 8+2, 8+3, + 8+0, 8+1, 8+2, 8+3, + 8+0, 8+1, 8+2, 8+3, + 8+0, 8+1, 8+2, 8+3, + + 8+4, 8+5, 8+6, 8+7, + 8+4, 8+5, 8+6, 8+7, + 8+4, 8+5, 8+6, 8+7, + 8+4, 8+5, 8+6, 8+7, + 8+4, 8+5, 8+6, 8+7, + + 16+0, 16+1, 16+2, 16+3, + 16+0, 16+1, 16+2, 16+3, + 16+0, 16+1, 16+2, 16+3, + 16+0, 16+1, 16+2, 16+3, + 16+0, 16+1, 16+2, 16+3, + + 16+4, 16+5, 16+6, 16+7, + 16+4, 16+5, 16+6, 16+7, + 16+4, 16+5, 16+6, 16+7, + 16+4, 16+5, 16+6, 16+7, + 16+4, 16+5, 16+6, 16+7, + + 24+0, 24+1, 24+2, 24+3, + 24+0, 24+1, 24+2, 24+3, + 24+0, 24+1, 24+2, 24+3, + 24+0, 24+1, 24+2, 24+3, + 24+0, 24+1, 24+2, 24+3, + + 24+4, 24+5, 24+6, 24+7, + 24+4, 24+5, 24+6, 24+7, + 24+4, 24+5, 24+6, 24+7, + 24+4, 24+5, 24+6, 24+7, + 24+4, 24+5, 24+6, 24+7, + + "s", "s", "s", "s", + "s", "s", "s", "s", + "s", "s", "s", "s", + "s", "s", "s", "s" ] }, + + # INDEX 5 + 5 : { "schedule": + ["m", "frl", + "l", "l", "l", "l", + "l", "l", "l", "l", + "l", "l", "l", "l", + "l", "l", "l", "l", + + "lr", + + 0, 0, 1, 1, 2, 2, 3, 3, + 0, 1, 2, 3, + 4, 4, 5, 5, 6, 6, 7, 7, + 4, 5, 6, 7, + 0, 0, 1, 
1, 2, 2, 3, 3, + 4, 4, 5, 5, 6, 6, 7, 7, + + 8+0, 8+0, 8+1, 8+1, 8+2, 8+2, 8+3, 8+3, + 8+0, 8+1, 8+2, 8+3, + 8+4, 8+4, 8+5, 8+5, 8+6, 8+6, 8+7, 8+7, + 8+4, 8+5, 8+6, 8+7, + 8+0, 8+0, 8+1, 8+1, 8+2, 8+2, 8+3, 8+3, + 8+4, 8+4, 8+5, 8+5, 8+6, 8+6, 8+7, 8+7, + + 16+0, 16+0, 16+1, 16+1, 16+2, 16+2, 16+3, 16+3, + 16+0, 16+1, 16+2, 16+3, + 16+4, 16+4, 16+5, 16+5, 16+6, 16+6, 16+7, 16+7, + 16+4, 16+5, 16+6, 16+7, + 16+0, 16+0, 16+1, 16+1, 16+2, 16+2, 16+3, 16+3, + 16+4, 16+4, 16+5, 16+5, 16+6, 16+6, 16+7, 16+7, + + 24+0, 24+0, 24+1, 24+1, 24+2, 24+2, 24+3, 24+3, + 24+0, 24+1, 24+2, 24+3, + 24+4, 24+4, 24+5, 24+5, 24+6, 24+6, 24+7, 24+7, + 24+4, 24+5, 24+6, 24+7, + 24+0, 24+0, 24+1, 24+1, 24+2, 24+2, 24+3, 24+3, + 24+4, 24+4, 24+5, 24+5, 24+6, 24+6, 24+7, 24+7, + + "s", "s", "s", "s", + "s", "s", "s", "s", + "s", "s", "s", "s", + "s", "s", "s", "s" ] }, + + # INDEX 6 + # A totally messy manual attempt to interleave + 6 : { "schedule": + ["m", "frl", + "lr"] + + + [0, "l", "l", 0, -1, -1, "sl", "sl", "l", "l", 1, "l", 1, "l", 2, "l", 2, "l", 3, 3, + 0, 1, 2, 3, + 4, 4, 0, 0, 5, 5, 1, 1, 6, 6, 2, 2, 7, 7, + 4, 3, 5, 3, 6, 7] + + + list(map(add(8), + [0, 0, -4, -4, 1, 1, -3, -3, 2, 2, -2, -2, 3, 3, + 0, -1, -1, 1, 2, 3, + 4, 4, 0, 0, 5, 5, 1, 1, 6, 6, 2, 2, 7, 7, + 4, 3, 5, 3, 6, 7])) + + list(map(add(16), + [0, 0, -4, -4, 1, 1, -3, -3, 2, 2, -2, -2, 3, 3, + 0, -1, -1, 1, 2, 3, + 4, 4, 0, 0, 5, 5, 1, 1, 6, 6, 2, 2, 7, 7, + 4, 3, 5, 3, 6, 7])) + + list(map(add(24), + [0, 0, -4, -4, 1, 1, -3, -3, 2, 2, -2, -2, 3, 3, + 0, -1, -1, 1, 2, 3, + 4, 4, 0, 0, "s", "s", 5, 5, "le", "le", 1, 1, "s", "s", 6, 6, "le", "le", 2, 2, "s", "s", 7, 7, + "le", "le", 4, 3, 5, 3, "s", "s", 6, "le", "le", 7])) + + + [28,28,29,29,30,30] + + + ["s", "s", "s", "s", + "s", "s" ] }, + + # INDEX 7 + # Careful manual interleaving, accounting for latencies + # and usage of vector pipes for vector stores + 7 : { "load_order": [14, 15, 12,13, 8,9,10, 6, 11,7,4,5,0,1,2,3], + "store_order": 
[4,5,6,7,2,3,0,1,8,9,10,11,12,13,14,15], + "numbering": list(zip( + [6, 7, 4, 5, 0,1, 2, 3, 2,3,0,1,10,11, 8, 9, 4,5,0,1, 8, 9,12,13, 4,6,2,0, 8,10,12,14], + [14, 15, 12,13,8,9,10,11, 6,7,4,5,14,15,12,13, 6,7,2,3,10,11,14,15, 5,7,3,1, 9,11,13,15], + [0, 0, 0, 0, 0, 0, 0, 0, 1,1,1,1, 2, 2, 2, 2, 4,4,3,3, 5, 5, 6, 6, 9,10,8,7,11,12,13,14])), + "schedule": + ["m", "frl", + "lr", + + "l", + 0, 0, -5 , "sl", + "l", + 1, 1, -5 , "sl", + "l", + 2, 2, -4 , -4, + "l", + 3, 3, -3 , -3, + 0, 1, -2 , "sl", + 2, 3, -2 , "sl", + "l", + 4, 4, -1 , "sl", + "l", + 5, 5, -1 , "sl", + "l", + "l", + 6, 6, 0 , 0, + "l", + "l", + 7, 7, 1 , 1, + "l", + 4, 5, 2 , "sl", + 6, 7, 2 , "sl", + "l", + 8, 8, 3 , "sl", + 9, 9, 3 , "sl", + "l", + 10, 10, 4 , 4, + "l", + 11, 11, 5 , 5, + "l", + 8, 9, 6 , "sl", + 10, 11, 6 , "sl", + "l", + 12,12, 7 , "sl", + 13,13, 7 , "sl", + 14,14, 8 , 8, + 15,15, 9 , 9, + 12,13, 10, + 14,15, 10, + 16, 16, 11, + 17,17, 11, + 18,18, 12, 12, + 19, 19, 13, 13, + 16, 17, 14, + 18, 19, 14, + 20, 20, 15, + 21,21, 15, + 22,22, 16, 16, + 23,23, 17, 17, + 20,21, 18, + 22,23, 18, + 24,24, 19, + 25,25, 19, + 26,26, 20, 20, + 27,27, 21, 21, + 24,25, 22, + 26,27, 22, + 28,28, 23, + 29,29, 23, + 30,30, 24,24, + 31,31, 25,25, + 28,29, 26, "s", # S(8) # 15 old + 30,31, 26, "s", # S(8) # 14 old + ] }, + + # INDEX 8 + # The same as 7, but exploring a slightly different interleaving within each line + # which avoids consecutive multiplication operations. 
+ 8 : { "load_order": [14, 15, 12,13, 8,9,10, 6, 11,7,4,5,0,1,2,3], # Load order + "store_order": [4,5,6,7,2,3,0,1,8,9,10,11,12,13,14,15], # Default store order + "numbering": list(zip( + [6, 7, 4, 5, 0,1, 2, 3, 2,3,0,1,10,11, 8, 9, 4,5,0,1, 8, 9,12,13, 4,6,2,0, 8,10,12,14], + [14, 15, 12,13,8,9,10,11, 6,7,4,5,14,15,12,13, 6,7,2,3,10,11,14,15, 5,7,3,1, 9,11,13,15], + [0, 0, 0, 0, 0, 0, 0, 0, 1,1,1,1, 2, 2, 2, 2, 4,4,3,3, 5, 5, 6, 6, 9,10,8,7,11,12,13,14])), + "schedule": ["m", "frl", + "lr", + + "l", + 0, -5 , 0, "sl", + "l", + 1, -5 , 1, "sl", + "l", + 2, -4 , 2, -4, + "l", + 3, -3 , 3, -3, + 0, -2 , 1, "sl", + 2, -2 , 3, "sl", + "l", + 4, -1 , 4, "sl", + "l", + 5, -1 , 5, "sl", + "l", + "l", + 6, 0 , 6, 0, + "l", + "l", + 7, 1 , 7, 1, + "l", + 4, 2 , 5, "sl", + 6, 2 , 7, "sl", + "l", + 8, 3 , 8, "sl", + 9, 3 , 9, "sl", + "l", + 10, 4 , 10, 4, + "l", + 11, 5 , 11, 5, + "l", + 8, 6 , 9, "sl", + 10, 6 , 11, "sl", + "l", + 12, 7 , 12, "sl", + 13, 7 , 13, "sl", + 14, 8 , 14, 8, + 15, 9 , 15, 9, + 12, 10, 13, + 14, 10, 15, + 16, 11, 16, + 17, 11, 17, + 18, 12, 18, 12, + 19, 13, 19, 13, + 16, 14, 17, + 18, 14, 19, + 20, 15, 20, + 21, 15, 21, + 22, 16, 22, 16, + 23, 17, 23, 17, + 20, 18, 21, + 22, 18, 23, + 24, 19, 24, + 25, 19, 25, + 26, 20, 26, 20, + 27, 21, 27, 21, + 24, 22, 25, + 26, 22, 27, + 28, 23, 28, + 29, 23, 29, + 30, 24, 30, 24, + 31, 25, 31, 25, + 28, 26, 29, "s", # S(8) # 15 old + 30, 26, 31, "s", # S(8) # 14 old + ] }, + + # INDEX 9 + # The same as 7, but experimenting whether avoiding ST-LD pairs makes + # any tangible difference + 9 : { "load_order": [14, 15, 12,13, 8,9,10, 6, 11,7,4,5,0,1,2,3], # Load order + "store_order": [4,5,6,7,2,3,0,1,8,9,10,11,12,13,14,15], # Default store order + "numbering": list(zip( + [6, 7, 4, 5, 0,1, 2, 3, 2,3,0,1,10,11, 8, 9, 4,5,0,1, 8, 9,12,13, 4,6,2,0, 8,10,12,14], + [14, 15, 12,13,8,9,10,11, 6,7,4,5,14,15,12,13, 6,7,2,3,10,11,14,15, 5,7,3,1, 9,11,13,15], + [0, 0, 0, 0, 0, 0, 0, 0, 1,1,1,1, 2, 2, 2, 2, 4,4,3,3, 5, 5, 6, 
6, 9,10,8,7,11,12,13,14])), + "schedule": + ["m", "frl", + "lr", + + "l", + 0, 0, "sl", -5 , + "l", + 1, 1, "sl", -5 , + "l", + 2, 2, -4 , -4, + "l", + 3, 3, -3 , -3, + 0, 1, "sl", -2 , + 2, 3, "sl", -2 , + "l", + 4, 4, "sl", -1 , + "l", + 5, 5, "sl", -1 , + "l", + "l", + 6, 6, 0 , 0, + "l", + "l", + 7, 7, 1 , 1, + "l", + 4, 5, "sl", 2 , + 6, 7, "sl", 2 , + "l", + 8, 8, "sl", 3 , + 9, 9, "sl", 3 , + "l", + 10, 10, 4 , 4, + "l", + 11, 11, 5 , 5, + "l", + 8, 9, "sl", 6 , + 10, 11, "sl", 6 , + "l", + 12,12, "sl", 7 , + 13,13, "sl", 7 , + 14,14, 8 , 8, + 15,15, 9 , 9, + 12,13, 10, + 14,15, 10, + 16, 16, 11, + 17,17, 11, + 18,18, 12, 12, + 19, 19, 13, 13, + 16, 17, 14, + 18, 19, 14, + 20, 20, 15, + 21,21, 15, + 22,22, 16, 16, + 23,23, 17, 17, + 20,21, 18, + 22,23, 18, + 24,24, 19, + 25,25, 19, + 26,26, 20, 20, + 27,27, 21, 21, + 24,25, 22, + 26,27, 22, + 28,28, 23, + 29,29, 23, + 30,30, 24,24, + 31,31, 25,25, + 28,29, 26, "s", # S(8) # 15 old + 30,31, 26, "s", # S(8) # 14 old + ] }, + + # INDEX 10 + # Same as 7, but inserting some nops to always have blocks of + # four instructions with two multiplies + 10 : { "load_order": [14, 15, 12,13, 8,9,10, 6, 11,7,4,5,0,1,2,3], # Load order + "store_order": [4,5,6,7,2,3,0,1,8,9,10,11,12,13,14,15], # Default store order + "numbering": list(zip( + [6, 7, 4, 5, 0,1, 2, 3, 2,3,0,1,10,11, 8, 9, 4,5,0,1, 8, 9,12,13, 4,6,2,0, 8,10,12,14], + [14, 15, 12,13,8,9,10,11, 6,7,4,5,14,15,12,13, 6,7,2,3,10,11,14,15, 5,7,3,1, 9,11,13,15], + [0, 0, 0, 0, 0, 0, 0, 0, 1,1,1,1, 2, 2, 2, 2, 4,4,3,3, 5, 5, 6, 6, 9,10,8,7,11,12,13,14])), + "schedule": ["m", "frl", + "lr", + + "l", + 0, 0, -5 , "sl", + "l", + 1, 1, -5 , "sl", + "l", + 2, 2, -4 , -4, + "l", + 3, 3, -3 , -3, + 0, 1, -2 , "sl", + 2, 3, -2 , "sl", + "l", + 4, 4, -1 , "sl", + "l", + 5, 5, -1 , "sl", + "l", + "l", + 6, 6, 0 , 0, + "l", + "l", + 7, 7, 1 , 1, + "l", + 4, 5, 2 , "sl", + 6, 7, 2 , "sl", + "l", + 8, 8, 3 , "sl", + 9, 9, 3 , "sl", + "l", + 10, 10, 4 , 4, + "l", + 11, 11, 5 , 5, + 
"l", + 8, 9, 6 , "sl", + 10, 11, 6 , "sl", + "l", + 12,12, 7 , "sl", + 13,13, 7 , "sl", + 14,14, 8 , 8, + 15,15, 9 , 9, + 12,13, 10, "nop", + 14,15, 10, "nop", + 16, 16, 11, "nop", + 17,17, 11, "nop", + 18,18, 12, 12, + 19, 19, 13, 13, + 16, 17, 14, "nop", + 18, 19, 14, "nop", + 20, 20, 15, "nop", + 21,21, 15, "nop", + 22,22, 16, 16, + 23,23, 17, 17, + 20,21, 18, "nop", + 22,23, 18, "nop", + 24,24, 19, "nop", + 25,25, 19, "nop", + 26,26, 20, 20, + 27,27, 21, 21, + 24,25, 22, "nop", + 26,27, 22, "nop", + 28,28, 23, "nop", + 29,29, 23, "nop", + 30,30, 24,24, + 31,31, 25,25, + 28,29, 26, "s", # S(8) # 15 old + 30,31, 26, "s", # S(8) # 14 old + ] }, + + # INDEX 11 + # Careful manual interleaving, accounting for latencies + # and usage of vector pipes for vector stores + 11 : { "load_order": [14, 15, 12,13, 8,9,10, 11, 6,7,4,5,0,1,2,3], # Load order + "store_order": [5,4,7,6,3,2,1,0,9,8,11,10,13,12,15,14], + "numbering": list(zip( + [6, 7, 4, 5, 0,1, 2, 3, 2,3,0,1,10,11, 8, 9, 4,5,0,1, 8, 9,12,13, 4,6,2,0, 8,10,12,14], + [14, 15, 12,13,8,9,10,11, 6,7,4,5,14,15,12,13, 6,7,2,3,10,11,14,15, 5,7,3,1, 9,11,13,15], + [0, 0, 0, 0, 0, 0, 0, 0, 1,1,1,1, 2, 2, 2, 2, 4,4,3,3, 5, 5, 6, 6, 9,10,8,7,11,12,13,14])), + "schedule": + ["m", "frl", + + "l", + "lrs", + "lrs", + 0, + 0, + "l", + 1, -4, "sl", + 1, -4, "sl", + "l", + 2, -3, "sl", + 2, -3, "sl", + "l", + 3, -2, "sl", + 3, -2, "sl", + + "l", + 0, -1, "sl", + 4, -1, "sl", + "l", + 1, + 5, + "l", + 2, + 6, + "l", + 3, + 7, + + "l", "l", + 4, 0, + 5, 0, + "l", "l", + 4, 1, + 5, 1, + "l", "l", + 6, 2, + 7, 2, + "l", "l", + 6, 3, + 7, 3, + 8+0, + 8+0, + 8+1, 4, + 8+1, 4, + 8+2, 5, + 8+2, 5, + 8+3, 6, + 8+3, 6, + + 8+0, 7, + 8+4, 7, + 8+1, + 8+5, + 8+2, + 8+6, + 8+3, + 8+7, + "lrs", + "lrs", + 8+4, 8, + 8+5, 8, + 8+4, 9, + 8+5, 9, + 8+6, 10, + 8+7, 10, + 8+6, 11, + 8+7, 11, + "frs", + "frs", + 16+0, + 16+0, + 16+1, 12, + 16+1, 12, + 16+2, 13, + 16+2, 13, + 16+3, 14, + 16+3, 14, + + "lrs", + "lrs", + 16+0, 15, + 16+4, 15, + 16+1, + 
16+5, + 16+2, + 16+6, + 16+3, + 16+7, + "lrs", + "lrs", + 16+4, 16, + 16+5, 16, + 16+4, 17, + 16+5, 17, + 16+6, 18, + 16+7, 18, + 16+6, 19, + 16+7, 19, + "frs", + "frs", + 24+0, + 24+0, + 24+1, 20, + 24+1, 20, + 24+2, 21, + 24+2, 21, + 24+3, 22, + 24+3, 22, + + 24+0, 23, + 24+4, 23, + 24+1, + 24+5, + 24+2, + 24+6, + 24+3, + 24+7, + 24+4, 24, "s", + 24+5, 24, "s", + 24+4, 25, "s", + 24+5, 25, "s", + 24+6, 26, "s", + 24+7, 26, "s", + 24+6, 27, "s", + 24+7, 27, "s", + ] }, + + # INDEX 12 + # Variant of 11, avoiding blocks with 2x mul, 2x add, 2x str + 12 : { "load_order": [14, 15, 12,13, 8,9,10, 11, 6,7,4,5,0,1,2,3], # Load order + "store_order": [5,4,7,6,3,2,1,0,9,8,11,10,13,12,15,14], + "numbering": list(zip( + [6, 7, 4, 5, 0,1, 2, 3, 2,3,0,1,10,11, 8, 9, 4,5,0,1, 8, 9,12,13, 4,6,2,0, 8,10,12,14], + [14, 15, 12,13,8,9,10,11, 6,7,4,5,14,15,12,13, 6,7,2,3,10,11,14,15, 5,7,3,1, 9,11,13,15], + [0, 0, 0, 0, 0, 0, 0, 0, 1,1,1,1, 2, 2, 2, 2, 4,4,3,3, 5, 5, 6, 6, 9,10,8,7,11,12,13,14])), + "schedule": ["m", "frl", + + "l", + "lrs", + "lrs", + 0, "sl", + 0, "sl", + "l", + 1, -4, "sl", + 1, -4, + "l", + 2, -3, "sl", + 2, -3, + "l", + 3, -2, "sl", + 3, -2, + + "l", + 0, -1, "sl", + 4, -1, + "l", + 1, "sl", + 5, + "l", + 2, "sl", + 6, + "l", + 3, "sl", + 7, + + "l", "l", + 4, 0, "sl", + 5, 0, + "l", "l", + 4, 1, + 5, 1, + "l", "l", + 6, 2, + 7, 2, + "l", "l", + 6, 3, + 7, 3, + 8+0, + 8+0, + 8+1, 4, + 8+1, 4, + 8+2, 5, + 8+2, 5, + 8+3, 6, + 8+3, 6, + + 8+0, 7, + 8+4, 7, + 8+1, + 8+5, + 8+2, + 8+6, + 8+3, + 8+7, + "lrs", + "lrs", + 8+4, 8, + 8+5, 8, + 8+4, 9, + 8+5, 9, + 8+6, 10, + 8+7, 10, + 8+6, 11, + 8+7, 11, + "frs", + "frs", + 16+0, + 16+0, + 16+1, 12, + 16+1, 12, + 16+2, 13, + 16+2, 13, + 16+3, 14, + 16+3, 14, + + "lrs", + "lrs", + 16+0, 15, + 16+4, 15, + 16+1, + 16+5, + 16+2, + 16+6, + 16+3, + 16+7, + "lrs", + "lrs", + 16+4, 16, + 16+5, 16, + 16+4, 17, + 16+5, 17, + 16+6, 18, + 16+7, 18, + 16+6, 19, + 16+7, 19, + "frs", + "frs", + 24+0, + 24+0, + 24+1, 20, + 24+1, 20, + 
24+2, 21, + 24+2, 21, + 24+3, 22, + 24+3, 22, + "frs", + "frs", + + 24+0, 23, + 24+4, 23, + 24+1, 24, + 24+5, 24, + 24+2, 25, + 24+6, 25, + 24+3, 26, + 24+7, 26, + + 24+4, 27, + 24+5, 27, + 24+4, "s", + 24+5, "s", + 24+6, "s", + 24+7, "s", + 24+6, "s", + 24+7, "s", + ] }, + + # INDEX 13 + 13 : { "load_order": [14, 15, 12,13, 8,9,10, 11, 6,7,4,5,0,1,2,3], # Load order + "store_order": [5,4,7,6,3,2,1,0,9,8,11,10,13,12,15,14], + "numbering": list(zip( + [6, 7, 4, 5, 0,1, 2, 3, 2,3,0,1,10,11, 8, 9, 4,5,0,1, 8, 9,12,13, 4,6,2,0, 8,10,12,14], + [14, 15, 12,13,8,9,10,11, 6,7,4,5,14,15,12,13, 6,7,2,3,10,11,14,15, 5,7,3,1, 9,11,13,15], + [0, 0, 0, 0, 0, 0, 0, 0, 1,1,1,1, 2, 2, 2, 2, 4,4,3,3, 5, 5, 6, 6, 9,10,8,7,11,12,13,14])), + "schedule": ["m", "frl", + + "l", + "lrs", + 0, "sl", + 0, "sl", + "l", + 1, -4, "sl", + 1, -4, + "l", + 2, -3, "sl", + 2, -3, + "l", + 3, -2, "sl", + 3, -2, + + "l", + 0, -1, "sl", + 4, -1, + "l", + 1, "sl", + 5, + "l", + 2, "sl", + 6, + "l", + 3, "sl", + 7, + + "l", "l", + 4, 0, "sl", + 5, 0, + "l", "l", + 4, 1, + 5, 1, + "l", "l", + 6, 2, + 7, 2, + "l", "l", + 6, 3, + 7, 3, + 8+0, + 8+0, + 8+1, 4, + 8+1, 4, + 8+2, 5, + 8+2, 5, + 8+3, 6, + 8+3, 6, + + "lrs", + 8+0, 7, + 8+4, 7, + "lrs", + 8+1, + 8+5, + 8+2, + 8+6, + 8+3, + 8+7, + 8+4, 8, + 8+5, 8, + 8+4, 9, + 8+5, 9, + 8+6, 10, + 8+7, 10, + 8+6, 11, + 8+7, 11, + "frs", + "frs", + 16+0, + 16+0, + 16+1, 12, + 16+1, 12, + 16+2, 13, + 16+2, 13, + 16+3, 14, + 16+3, 14, + + "lrs", + 16+0, 15, + 16+4, 15, + 16+1, + 16+5, + "lrs", + 16+2, + 16+6, + "lrs", + 16+3, + 16+7, + "lrs", + 16+4, 16, + 16+5, 16, + 16+4, 17, + 16+5, 17, + 16+6, 18, + 16+7, 18, + 16+6, 19, + 16+7, 19, + "frs", + "frs", + 24+0, + 24+0, + 24+1, 20, + 24+1, 20, + 24+2, 21, + 24+2, 21, + 24+3, 22, + 24+3, 22, + "frs", + "frs", + + 24+0, 23, + 24+4, 23, + 24+1, 24, + 24+5, 24, + 24+2, 25, + 24+6, 25, + 24+3, 26, + 24+7, 26, + + "lre", + 24+4, 27, + 24+5, 27, + 24+4, "s", + 24+5, "s", + 24+6, "s", + 24+7, "s", + 24+6, "s", + 24+7, "s", + 
] }, + + # INDEX 14 + # Variant of 12, insert some NOPs + 14 : { "load_order": [14, 15, 12,13, 8,9,10, 11, 6,7,4,5,0,1,2,3], # Load order + "store_order": [5,4,7,6,3,2,1,0,9,8,11,10,13,12,15,14], + "numbering": list(zip( + [6, 7, 4, 5, 0,1, 2, 3, 2,3,0,1,10,11, 8, 9, 4,5,0,1, 8, 9,12,13, 6,4,2,0, 8,10,12,14], + [14, 15, 12,13,8,9,10,11, 6,7,4,5,14,15,12,13, 6,7,2,3,10,11,14,15, 7,5,3,1, 9,11,13,15], + [0, 0, 0, 0, 0, 0, 0, 0, 1,1,1,1, 2, 2, 2, 2, 4,4,3,3, 5, 5, 6, 6, 10,9,8,7,11,12,13,14])), + "schedule": + ["m", "frl", + "l", + "lrs", + "lrs", + 0, "sl", + 0, "sl", + "l", + 1, -4, "sl", + 1, -4, + "l", + 2, -3, "sl", + 2, -3, + "l", + 3, -2, "sl", + 3, -2, + + "l", + 0, -1, "sl", + 4, -1, + "l", + 1, "sl", + 5, "nop", + "l", + 2, "sl", + 6, "nop", + "l", + 3, "sl", + 7, "nop", + + "l", "l", + 4, 0, "sl", + 5, 0, + "l", "l", + 4, 1, + 5, 1, + "l", "l", + 6, 2, + 7, 2, + "l", "l", + 6, 3, + 7, 3, + 8+0, "nop", + 8+0, "nop", + 8+1, 4, + 8+1, 4, + 8+2, 5, + 8+2, 5, + 8+3, 6, + 8+3, 6, + + 8+0, 7, + 8+4, 7, + 8+1, "nop", + 8+5, "nop", + 8+2, "nop", + 8+6, "nop", + 8+3, "nop", + 8+7, "nop", + "lrs", + "lrs", + 8+4, 8, + 8+5, 8, + 8+4, 9, + 8+5, 9, + 8+6, 10, + 8+7, 10, + 8+6, 11, + 8+7, 11, + "frs", + "frs", + 16+0, "nop", + 16+0, "nop", + 16+1, 12, + 16+1, 12, + 16+2, 13, + 16+2, 13, + 16+3, 14, + 16+3, 14, + + "lrs", + "lrs", + 16+0, 15, + 16+4, 15, + 16+1, "nop", + 16+5, "nop", + 16+2, "nop", + 16+6, "nop", + 16+3, "nop", + 16+7, "nop", + "lrs", + "lrs", + 16+4, 16, + 16+5, 16, + 16+4, 17, + 16+5, 17, + 16+6, 18, + 16+7, 18, + 16+6, 19, + 16+7, 19, + "frs", + "frs", + 24+0, "nop", + 24+0, "nop", + 24+1, 20, + 24+1, 20, + 24+2, 21, + 24+2, 21, + 24+3, 22, + 24+3, 22, + "frs", + "frs", + + 24+0, 23, + 24+4, 23, + 24+1, 24, + 24+5, 24, + 24+2, 25, + 24+6, 25, + 24+3, 26, + 24+7, 26, + + 24+4, 27, + 24+5, 27, + 24+4, "s", + 24+5, "s", + 24+6, "s", + 24+7, "s", + 24+6, "s", + 24+7, "s", + ] }, + + # INDEX 15 + # Different butterfly ordering + 15 : { "load_order": [15, 14, 
13,12, 11, 10, 9, 8, 7,6,5,4,3,2,1,0], + "store_order": [15, 14, 13,12, 11, 10, 9, 8, 7,6,5,4,3,2,1,0], + "numbering": list(zip( + [ 7, 6, 5, 4, 3, 2, 1, 0, 11,10, 9, 8,3,2,1,0, 13,12, 9, 8,5,4,1,0, 14, 12, 10, 8, 6, 4, 2, 0], + [15, 14, 13, 12, 11, 10, 9, 8, 15,14,13,12,7,6,5,4, 15,14,11,10,7,6,3,2, 15, 13, 11, 9, 7, 5, 3, 1], + [ 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2,1,1,1,1, 6, 6, 5, 5,4,4,3,3, 14, 13, 12,11,10, 9, 8, 7])), + "root_load_order": [0,1,3,2], # Root load order + "schedule": + ["m", "frl", + + "l", + "lrs", + "lrs", + 0, "sl", + 0, "sl", + "l", + 1, -4, "sl", + 1, -4, + "l", + 2, -3, "sl", + 2, -3, + "l", + 3, -2, "sl", + 3, -2, + + "l", + 0, -1, "sl", + 4, -1, + "l", + 1, "sl", + 5, "nop", + "l", + 2, "sl", + 6, "nop", + "l", + 3, "sl", + 7, "nop", + + "l", "l", + 4, 0, "sl", + 5, 0, + "l", "l", + 4, 1, + 5, 1, + "l", "l", + 6, 2, + 7, 2, + "l", "l", + 6, 3, + 7, 3, + 8+0, "nop", + 8+0, "nop", + 8+1, 4, + 8+1, 4, + 8+2, 5, + 8+2, 5, + 8+3, 6, + 8+3, 6, + + 8+0, 7, + 8+4, 7, + 8+1, "nop", + 8+5, "nop", + 8+2, "nop", + 8+6, "nop", + 8+3, "nop", + 8+7, "nop", + "lrs", + "lrs", + 8+4, 8, + 8+5, 8, + 8+4, 9, + 8+5, 9, + 8+6, 10, + 8+7, 10, + 8+6, 11, + 8+7, 11, + "frs", + "frs", + 16+0, "nop", + 16+0, "nop", + 16+1, 12, + 16+1, 12, + 16+2, 13, + 16+2, 13, + 16+3, 14, + 16+3, 14, + + "lrs", + "lrs", + 16+0, 15, + 16+4, 15, + 16+1, "nop", + 16+5, "nop", + 16+2, "nop", + 16+6, "nop", + 16+3, "nop", + 16+7, "nop", + "lrs", + "lrs", + 16+4, 16, + 16+5, 16, + 16+4, 17, + 16+5, 17, + 16+6, 18, + 16+7, 18, + 16+6, 19, + 16+7, 19, + "frs", + "frs", + 24+0, "nop", + 24+0, "nop", + 24+1, 20, + 24+1, 20, + 24+2, 21, + 24+2, 21, + 24+3, 22, + 24+3, 22, + "frs", + "frs", + + 24+0, 23, + 24+4, 23, + 24+1, 24, + 24+5, 24, + 24+2, 25, + 24+6, 25, + 24+3, 26, + 24+7, 26, + + 24+4, 27, + 24+5, 27, + 24+4, "s", + 24+5, "s", + 24+6, "s", + 24+7, "s", + 24+6, "s", + 24+7, "s", + ] }, + + # INDEX 16 + # Different butterfly ordering + 16 : { "load_order": [15, 14, 13,12, 11, 10, 
9, 8, 7,6,5,4,3,2,1,0], + "store_order": [15, 14, 13,12, 11, 10, 9, 8, 7,6,5,4,3,2,1,0], + "numbering": list(zip( + [ 7, 6, 5, 4, 3, 2, 1, 0, 11,10,3,2, 9, 8,1,0, 13, 9,5,1,12, 8,4,0, 14, 12, 10, 8, 6, 4, 2, 0], + [15, 14, 13, 12, 11, 10, 9, 8, 15,14,7,6,13,12,5,4, 15,11,7,3,14,10,6,2, 15, 13, 11, 9, 7, 5, 3, 1], + [ 0, 0, 0, 0, 0, 0, 0, 0, 2, 2,1,1, 2, 2,1,1, 6, 5,4,3, 6, 5,4,3, 14, 13, 12,11,10, 9, 8, 7])), + "root_load_order": [0,1,3,2], # Root load order + "schedule": + ["m", "frl", + + "l", + "lrs", + "lrs", + 0, "sl", + 0, "sl", + "l", + 1, -4, "sl", + 1, -4, + "l", + 2, -3, "sl", + 2, -3, + "l", + 3, -2, "sl", + 3, -2, + + "l", + 0, -1, "sl", + 4, -1, + "l", + 1, "sl", + 5, "nop", + "l", + 2, "sl", + 6, "nop", + "l", + 3, "nop", + 7, "sl", + + "l", "l", + 4, 0, "sl", + 5, 0, + "l", "l", + 4, 1, + 5, 1, + "l", "l", + 6, 2, + 7, 2, + "l", "l", + 6, 3, + 7, 3, + 8+0, "nop", + 8+0, "nop", + 8+1, 4, + 8+1, 4, + 8+2, 5, + 8+2, 5, + 8+3, 6, + 8+3, 6, + + 8+0, 7, + 8+4, 7, + 8+1, "nop", + 8+5, "nop", + 8+2, "nop", + 8+6, "nop", + 8+3, "nop", + 8+7, "nop", + "lrs", + "lrs", + 8+4, 8, + 8+5, 8, + 8+4, 9, + 8+5, 9, + 8+6, 10, + 8+7, 10, + 8+6, 11, + 8+7, 11, + "frs", + "frs", + 16+0, "nop", + 16+0, "nop", + 16+1, 12, + 16+1, 12, + 16+2, 13, + 16+2, 13, + 16+3, 14, + 16+3, 14, + + "lrs", + "lrs", + 16+0, 15, + 16+4, 15, + 16+1, "nop", + 16+5, "nop", + 16+2, "nop", + 16+6, "nop", + 16+3, "nop", + 16+7, "nop", + "lrs", + "lrs", + 16+4, 16, + 16+5, 16, + 16+4, 17, + 16+5, 17, + 16+6, 18, + 16+7, 18, + 16+6, 19, + 16+7, 19, + "frs", + "frs", + 24+0, "nop", + 24+0, "nop", + 24+1, 20, + 24+1, 20, + 24+2, 21, + 24+2, 21, + 24+3, 22, + 24+3, 22, + "frs", + "frs", + + 24+0, 23, + 24+4, 23, + 24+1, 24, + 24+5, 24, + 24+2, 25, + 24+6, 25, + 24+3, 26, + 24+7, 26, + + 24+4, 27, + 24+5, 27, + 24+4, "s", + 24+5, "s", + 24+6, "s", + 24+7, "s", + 24+6, "s", + 24+7, "s", + ] }, + + # INDEX 17 + # Different butterfly ordering, space out non-MUL ops + 17 : { "load_order": [15, 14, 13,12, 
11, 10, 9, 8, 7,6,5,4,3,2,1,0], + "store_order": [15, 14, 13,12, 11, 10, 9, 8, 7,6,5,4,3,2,1,0], + "numbering": list(zip( + [ 7, 6, 5, 4, 3, 2, 1, 0, 11,10,3,2, 9, 8,1,0, 13, 9,5,1,12, 8,4,0, 14, 12, 10, 8, 6, 4, 2, 0], + [15, 14, 13, 12, 11, 10, 9, 8, 15,14,7,6,13,12,5,4, 15,11,7,3,14,10,6,2, 15, 13, 11, 9, 7, 5, 3, 1], + [ 0, 0, 0, 0, 0, 0, 0, 0, 2, 2,1,1, 2, 2,1,1, 6, 5,4,3, 6, 5,4,3, 14, 13, 12,11,10, 9, 8, 7])), + "root_load_order": [0,1,3,2], # Root load order + "schedule": ["m", "frl", + + "l", + 0, "sl", + 0, -4, + "l", + 1, "sl", + 1, -4, + "l", + 2, "sl", + 2, -3, + "l", + 3, "sl", + 3, -3, + + "l", + 0, "sl", + 4, -2, + "l", + 1, "sl", + 5, -2, + "l", + 2, "sl", + 6, -1, + "l", + 3, "sl", + 7, -1, + + "l", "l", + 4, 0, + 5, 0, + "l", "l", + 4, 1, + 5, 1, + "l", "l", + 6, 2, + 7, 2, + "l", "l", + 6, 3, + 7, 3, + 8+0, "nop", + 8+0, "nop", + 8+1, 4, + 8+1, 4, + 8+2, 5, + 8+2, 5, + 8+3, 6, + 8+3, 6, + + 8+0, 7, + 8+4, 7, + 8+1, "sl", + 8+5, "nop", + 8+2, "sl", + 8+6, "nop", + 8+3, "nop", + 8+7, "nop", + "lrs", + "lrs", + 8+4, 8, + 8+5, 8, + 8+4, 9, + 8+5, 9, + 8+6, 10, + 8+7, 10, + 8+6, 11, + 8+7, 11, + "frs", + "frs", + 16+0, "nop", + 16+0, "nop", + 16+1, 12, + 16+1, 12, + 16+2, 13, + 16+2, 13, + 16+3, 14, + 16+3, 14, + + "lrs", + "lrs", + 16+0, 15, + 16+4, 15, + 16+1, "nop", + 16+5, "nop", + 16+2, "nop", + 16+6, "nop", + 16+3, "nop", + 16+7, "nop", + "lrs", + "lrs", + 16+4, 16, + 16+5, 16, + 16+4, 17, + 16+5, 17, + 16+6, 18, + 16+7, 18, + 16+6, 19, + 16+7, 19, + "frs", + "frs", + 24+0, "nop", + 24+0, "nop", + 24+1, 20, + 24+1, 20, + 24+2, 21, + 24+2, 21, + 24+3, 22, + 24+3, 22, + "frs", + "frs", + + 24+0, 23, + 24+4, 23, + 24+1, 24, + 24+5, 24, + 24+2, 25, + 24+6, 25, + 24+3, 26, + 24+7, 26, + + 24+4, 27, "lres", + 24+5, 27, "lres", + 24+4, "s", + 24+5, "s", + 24+6, "s", + 24+7, "s", + 24+6, "s", + 24+7, "s", + ] }, + + # INDEX 18 + # Based on 17, change interleaving to always have 2x MULs next to each other + 18 : { "load_order": [15, 14, 13,12, 11, 10, 
9, 8, 7,6,5,4,3,2,1,0], + "store_order": [15, 14, 13,12, 11, 10, 9, 8, 7,6,5,4,3,2,1,0], + "numbering": list(zip( + [ 7, 6, 5, 4, 3, 2, 1, 0, 11,10,3,2, 9, 8,1,0, 13, 9,5,1,12, 8,4,0, 14, 12, 10, 8, 6, 4, 2, 0], + [15, 14, 13, 12, 11, 10, 9, 8, 15,14,7,6,13,12,5,4, 15,11,7,3,14,10,6,2, 15, 13, 11, 9, 7, 5, 3, 1], + [ 0, 0, 0, 0, 0, 0, 0, 0, 2, 2,1,1, 2, 2,1,1, 6, 5,4,3, 6, 5,4,3, 14, 13, 12,11,10, 9, 8, 7])), + "root_load_order": [0,1,3,2], # Root load order + "schedule": ["m", "frl", + + "l", + 0,0, "sl", + -4, + "l", + 1,1, "sl", + -4, + "l", + 2,2, "sl", + -3, + "l", + 3,3, "sl", + -3, + + "l", + 4,0, "sl", + -2, + "l", + 5,1, "sl", + -2, + "l", + 6,2, "sl", + -1, + "l", + 7,3, "sl", + -1, + + "l", "l", + 5,4, 0, + 0, + "l", "l", + 5,4, 1, + 1, + "l", "l", + 7,6, 2, + 2, + "l", "l", + 7,6, 3, + 3, + 8+0,8+0, 4, + "nop", + 8+1,8+1, 4, + "nop", + 8+2,8+2, 5, + 5, + 8+3,8+3, 6, + 6, + + 8+4,8+0, 7, + "sl", + 8+5,8+1, 7, + "nop", + 8+6,8+2, "sl", + "nop", + 8+7,8+3, "nop", + "nop", + "lrs", + "lrs", + 8+5,8+4, 8, + 8, + 8+5,8+4, 9, + 9, + 8+7,8+6, 10, + 10, + 8+7,8+6, 11, + 11, + "frs", + "frs", + 16+0,16+0, "nop", + "nop", + 16+1,16+1, 12, + 12, + 16+2,16+2, 13, + 13, + 16+3,16+3, 14, + 14, + + "lrs", + "lrs", + 16+4,16+0, 15, + 15, + 16+5,16+1, "nop", + "nop", + 16+6,16+2, "nop", + "nop", + 16+7,16+3, "nop", + "nop", + "lrs", + "lrs", + 16+5,16+4, 16, + 16, + 16+5,16+4, 17, + 17, + 16+7,16+6, 18, + 18, + 16+7,16+6, 19, + 19, + "frs", + "frs", + 24+0,24+0, "nop", + "nop", + 24+1,24+1, 20, + 20, + 24+2,24+2, 21, + 21, + 24+3,24+3, 22, + 22, + "frs", + "frs", + + 24+4,24+0, 23, + 23, + 24+5,24+1, 24, + "s", + 24+6,24+2, 24, + "s", + 24+7,24+3, 25, + "s", + + 24+5,24+4, 25, "lres", + 26, "lres", + 24+5,24+4, 26, + "s", + 24+7,24+6, 27, + "s", + 24+7,24+6, 27, + "s", + + # 24+4,24+0, 23, + # 23, + # 24+5,24+1, 24, + # "s", + # 24+6,24+2, 24, + # "s", + # 24+7,24+3, 25, + # "s", + + # 24+5,24+4, 25, "lres", + # "s", "lres", + # 24+5,24+4, 26, + # "s", + # 24+7,24+6, 27, 
+ # "s", + # 24+7,24+6, 27, + # "nop", + + ] }, + + # INDEX 19 + # Based on 18, but minor changes wrt placement of nop's. + 19 : { "load_order": [15, 14, 13,12, 11, 10, 9, 8, 7,6,5,4,3,2,1,0], + "store_order": [15, 14, 13,12, 11, 10, 9, 8, 7,6,5,4,3,2,1,0], + "numbering": list(zip( + [ 7, 6, 5, 4, 3, 2, 1, 0, 11,10,3,2, 9, 8,1,0, 13, 9,5,1,12, 8,4,0, 14, 12, 10, 8, 6, 4, 2, 0], + [15, 14, 13, 12, 11, 10, 9, 8, 15,14,7,6,13,12,5,4, 15,11,7,3,14,10,6,2, 15, 13, 11, 9, 7, 5, 3, 1], + [ 0, 0, 0, 0, 0, 0, 0, 0, 2, 2,1,1, 2, 2,1,1, 6, 5,4,3, 6, 5,4,3, 14, 13, 12,11,10, 9, 8, 7])), + "root_load_order": [0,1,3,2], # Root load order + "schedule": ["m", "frl", + + "l", + 0,0, "sl", + -4, + "l", + 1,1, "sl", + -4, + "l", + 2,2, "sl", + -3, + "l", + 3,3, "sl", + -3, + + "l", + 4,0, "sl", + -2, + "l", + 5,1, "sl", + -2, + "l", + 6,2, "sl", + -1, + "l", + 7,3, "sl", + -1, + + "l", "l", + 5,4, 0, + 0, + "l", "l", + 5,4, 1, + 1, + "l", "l", + 7,6, 2, + 2, + "l", "l", + 7,6, 3, + 3, + 8+0,8+0, 4, + "nop", + 8+1,8+1, 4, + "nop", + 8+2,8+2, 5, + 5, + 8+3,8+3, 6, + 6, + + 8+4,8+0, 7, + "sl", + 8+5,8+1, 7, + "nop", + 8+6,8+2, "sl", + "nop", + 8+7,8+3, "nop", + "nop", + "lrs", + "lrs", + 8+5,8+4, 8, + 8, + 8+5,8+4, 9, + 9, + 8+7,8+6, 10, + 10, + 8+7,8+6, 11, + 11, + "frs", + "frs", + 16+0,16+0, 12, + 12, + 16+1,16+1, 13, + 13, + 16+2,16+2, 14, + 14, + 16+3,16+3, 15, + 15, + + "lrs", + "lrs", + 16+4,16+0, "nop", + "nop", + 16+5,16+1, "nop", + "nop", + 16+6,16+2, "nop", + "nop", + 16+7,16+3, "nop", + "nop", + "lrs", + "lrs", + 16+5,16+4, 16, + 16, + 16+5,16+4, 17, + 17, + 16+7,16+6, 18, + 18, + 16+7,16+6, 19, + 19, + "frs", + "frs", + 24+0,24+0, 20, + 20, + 24+1,24+1, 21, + 21, + 24+2,24+2, 22, + 22, + 24+3,24+3, 23, + 23, + "frs", + "frs", + + 24+4,24+0, "nop", + "nop", + 24+5,24+1, 24, + "s", + 24+6,24+2, 24, + "s", + 24+7,24+3, 25, + "s", + + 24+5,24+4, 25, "lres", + 26, "lres", + 24+5,24+4, 26, + "s", + 24+7,24+6, 27, + "s", + 24+7,24+6, 27, + "s", + + ] }, + + # INDEX 20 + # Based on 
19, trying to balance issue queues a bit better + # by extending the overlapping of iterations. This takes off + # pressure from the add/sub issue queue, but increases load + # load of the mul issue queues + 20 : { "load_order": [15, 14, 13,12, 11, 10, 9, 8, 7,6,5,4,3,2,1,0], + "store_order": [15, 14, 13,12, 11, 10, 9, 8, 7,6,5,4,3,2,1,0], + "numbering": list(zip( + [ 7, 6, 5, 4, 3, 2, 1, 0, 11,10,3,2, 9, 8,1,0, 13, 9,5,1,12, 8,4,0, 14, 12, 10, 8, 6, 4, 2, 0], + [15, 14, 13, 12, 11, 10, 9, 8, 15,14,7,6,13,12,5,4, 15,11,7,3,14,10,6,2, 15, 13, 11, 9, 7, 5, 3, 1], + [ 0, 0, 0, 0, 0, 0, 0, 0, 2, 2,1,1, 2, 2,1,1, 6, 5,4,3, 6, 5,4,3, 14, 13, 12,11,10, 9, 8, 7])), + "root_load_order": [0,1,3,2], # Root load order + "schedule": + ["m", "frl", + + "l", + 0,0, -6, + "sl", + "l", + 1,1, -5, + "sl", + "l", + 2,2, -5, + "sl", + "l", + 3,3, "sl", + -4, + + "l", + 4,0, "sl", + -4, + "l", + 5,1, "sl", + -3, + "l", + 6,2, "sl", + -3, + "l", + 7,3, "sl", + -2, + + "l", "l", + 5,4, "sl", + -2, + "l", "l", + 5,4, "sl", + -1, + "l", "l", + 7,6, "sl", + -1, + "l", "l", + 7,6, 0, + 0, + 8+0,8+0, 1, + 1, + 8+1,8+1, 2, + 2, + 8+2,8+2, 3, + 3, + 8+3,8+3, 4, + 4, + + 8+4,8+0, 5, + 5, + 8+5,8+1, 6, + 6, + 8+6,8+2, 7, + "sl", + 8+7,8+3, 7, + "sl", + "lrs", + "lrs", + 8+5,8+4, 8, + 8, + 8+5,8+4, 9, + 9, + 8+7,8+6, 10, + 10, + 8+7,8+6, 11, + 11, + "frs", + "frs", + 16+0,16+0, 12, + 12, + 16+1,16+1, 13, + 13, + 16+2,16+2, 14, + 14, + 16+3,16+3, 15, + 15, + + "lrs", + "lrs", + 16+4,16+0, "nop", + "nop", + 16+5,16+1, "nop", + "nop", + 16+6,16+2, "nop", + "nop", + 16+7,16+3, "nop", + "nop", + "lrs", + "lrs", + 16+5,16+4, 16, + 16, + 16+5,16+4, 17, + 17, + 16+7,16+6, 18, + 18, + 16+7,16+6, 19, + 19, + "frs", + "frs", + 24+0,24+0, 20, + 20, + 24+1,24+1, 21, + 21, + 24+2,24+2, 22, + 22, + 24+3,24+3, 23, + 23, + "frs", + "frs", + + 24+4,24+0, "nop", + "nop", + 24+5,24+1, "nop", + "nop", + 24+6,24+2, "nop", + "nop", + 24+7,24+3, "nop", + "nop", + 24+5,24+4, 24, + "s", + 24+5,24+4, 24, + "s", + 24+7,24+6, 
25, + "s", + 24+7,24+6, 25, "lres", + 26, "lres", + + ] }, + + # INDEX 21 + # Based on 20, swapping some ADD/SUB and late stores + 21 : { "load_order": [15, 14, 13,12, 11, 10, 9, 8, 7,6,5,4,3,2,1,0], + "store_order": [15, 14, 13,12, 11, 10, 9, 8, 7,6,5,4,3,2,1,0], + "numbering": list(zip( + [ 7, 6, 5, 4, 3, 2, 1, 0, 11,10,3,2, 9, 8,1,0, 13, 9,5,1,12, 8,4,0, 14, 12, 10, 8, 6, 4, 2, 0], + [15, 14, 13, 12, 11, 10, 9, 8, 15,14,7,6,13,12,5,4, 15,11,7,3,14,10,6,2, 15, 13, 11, 9, 7, 5, 3, 1], + [ 0, 0, 0, 0, 0, 0, 0, 0, 2, 2,1,1, 2, 2,1,1, 6, 5,4,3, 6, 5,4,3, 14, 13, 12,11,10, 9, 8, 7])), + "root_load_order": [0,1,3,2], # Root load order + "schedule": + ["m", "frl", + + "l", + 0,0, -6, + "sl", + "l", + 1,1, -5, + "sl", + "l", + 2,2, -5, + "sl", + "l", + 3,3, "sl", + -4, + + "l", + 4,0, -4, + "sl", + "l", + 5,1, -3, + "sl", + "l", + 6,2, -3, + "sl", + "l", + 7,3, -2, + "sl", + + "l", "l", + 5,4, -2, + "sl", + "l", "l", + 5,4, -1, + "sl", + "l", "l", + 7,6, -1, + "sl", + "l", "l", + 7,6, 0, + 0, + 8+0,8+0, 1, + 1, + 8+1,8+1, 2, + 2, + 8+2,8+2, 3, + 3, + 8+3,8+3, 4, + 4, + + 8+4,8+0, 5, + 5, + 8+5,8+1, 6, + 6, + 8+6,8+2, "sl", + 7, + 8+7,8+3, "sl", + 7, + "lrs", + "lrs", + 8+5,8+4, 8, + 8, + 8+5,8+4, 9, + 9, + 8+7,8+6, 10, + 10, + 8+7,8+6, 11, + 11, + "frs", + "frs", + 16+0,16+0, 12, + 12, + 16+1,16+1, 13, + 13, + 16+2,16+2, 14, + 14, + 16+3,16+3, 15, + 15, + + "lrs", + "lrs", + 16+4,16+0, "nop", + "nop", + 16+5,16+1, "nop", + "nop", + 16+6,16+2, "nop", + "nop", + 16+7,16+3, "nop", + "nop", + "lrs", + "lrs", + 16+5,16+4, 16, + 16, + 16+5,16+4, 17, + 17, + 16+7,16+6, 18, + 18, + 16+7,16+6, 19, + 19, + "frs", + "frs", + 24+0,24+0, 20, + 20, + 24+1,24+1, 21, + 21, + 24+2,24+2, 22, + 22, + 24+3,24+3, 23, + 23, + "frs", + "frs", + + 24+4,24+0, "nop", + "nop", + 24+5,24+1, "nop", + "nop", + 24+6,24+2, "nop", + "nop", + 24+7,24+3, "nop", + "nop", + 24+5,24+4, 24, + "s", + 24+5,24+4, 24, + "s", + 24+7,24+6, 25, + "s", + 24+7,24+6, 25, "lres", + 26, "lres", + + ] }, + + # INDEX 22 + 
# Omit some late stores to smoothen transition + # to next layers. + 22 : { + "load_order": [15, 14, 13,12, 11, 10, 9, 8, 7,6,5,4,3,2,1,0], + "store_order": [15, 14, 13,12, 11, 10, 9, 8, 7,6,5,4,3,2,1,0], + "store_order_last": [15, 14, 13,12, 11, 10, 9, 8, 7,6,5,4], + "numbering": list(zip( + [ 7, 6, 5, 4, 3, 2, 1, 0, 11,10,3,2, 9, 8,1,0, 13, 9,5,1,12, 8,4,0, 14, 12, 10, 8, 6, 4, 2, 0], + [15, 14, 13, 12, 11, 10, 9, 8, 15,14,7,6,13,12,5,4, 15,11,7,3,14,10,6,2, 15, 13, 11, 9, 7, 5, 3, 1], + [ 0, 0, 0, 0, 0, 0, 0, 0, 2, 2,1,1, 2, 2,1,1, 6, 5,4,3, 6, 5,4,3, 14, 13, 12,11,10, 9, 8, 7])), + + "root_load_order": [0,1,3,2], # Root load order + + "schedule": + ["m", "frl", + "l", + 0,0, -6, + "sl", + "l", + 1,1, -5, + "sl", + "l", + 2,2, -5, + "sl", + "l", + 3,3, -4, + "sl", + + "l", + 4,0, -4, + "sl", + "l", + 5,1, -3, + "sl", + "l", + 6,2, -3, + "sl", + "l", + 7,3, -2, + "sl", + + "l", -2, + 5,4, "l", + "sl", + "l", "l", + 5,4, "sl", + -1, + "l", "l", + 7,6, "sl", + -1, + "l", "l", + 7,6, 0, + 0, + 8+0,8+0, 1, + 1, + 8+1,8+1, 2, + 2, + 8+2,8+2, 3, + 3, + 8+3,8+3, 4, + 4, + + 8+4,8+0, 5, + 5, + 8+5,8+1, 6, + 6, + 8+6,8+2, 7, + "sl", + 8+7,8+3, 7, + "sl", + "lrs", + "lrs", + 8+5,8+4, 8, + 8, + 8+5,8+4, 9, + 9, + 8+7,8+6, 10, + 10, + 8+7,8+6, 11, + 11, + "frs", + "frs", + 16+0,16+0, 12, + 12, + 16+1,16+1, 13, + 13, + 16+2,16+2, 14, + 14, + 16+3,16+3, 15, + 15, + + "lrs", + "lrs", + 16+4,16+0, "nop", + "nop", + 16+5,16+1, "nop", + "nop", + 16+6,16+2, "nop", + "nop", + 16+7,16+3, "nop", + "nop", + "lrs", + "lrs", + 16+5,16+4, 16, + 16, + 16+5,16+4, 17, + 17, + 16+7,16+6, 18, + 18, + 16+7,16+6, 19, + 19, + "frs", + "frs", + 24+0,24+0, 20, + 20, + 24+1,24+1, 21, + 21, + 24+2,24+2, 22, + 22, + 24+3,24+3, 23, + 23, + "frs", + "frs", + + 24+4,24+0, "nop", + "nop", + 24+5,24+1, "nop", + "nop", + 24+6,24+2, "nop", + "nop", + 24+7,24+3, "nop", + "nop", + 24+5,24+4, 24, + "s", + 24+5,24+4, 24, + "s", + 24+7,24+6, 25, + "s", + 24+7,24+6, 25, "lres", + 26, "lres", + + ] }, + + # INDEX 
23 + # Omit some late stores to smoothen transition + # to next layers. + 23 : { + "load_order": [15, 14, 13,12, 11, 10, 9, 8, 7,6,5,4,3,2,1,0], + "store_order": [15, 14, 13,12, 11, 10, 9, 8, 7,6,5,4,3,2,1,0], + "store_order_last": [15, 14, 13,12, 11, 10, 9, 8, 7,6,5,4], + "numbering": list(zip( + [ 7, 6, 5, 4, 3, 2, 1, 0, 11,10,3,2, 9, 8,1,0, 13, 9,5,1,12, 8,4,0, 14, 12, 10, 8, 6, 4, 2, 0], + [15, 14, 13, 12, 11, 10, 9, 8, 15,14,7,6,13,12,5,4, 15,11,7,3,14,10,6,2, 15, 13, 11, 9, 7, 5, 3, 1], + [ 0, 0, 0, 0, 0, 0, 0, 0, 2, 2,1,1, 2, 2,1,1, 6, 5,4,3, 6, 5,4,3, 14, 13, 12,11,10, 9, 8, 7])), + + "root_load_order": [0,1,3,2], # Root load order + + "schedule": + ["m", "frl", + "l", + 0,0, -6, + "sl", + "l", + 1,1, -5, + "sl", + "l", + 2,2, -5, + "sl", + "l", + 3,3, -4, + "sl", + + "l", + 4,0, -4, + "sl", + "l", + 5,1, -3, + "sl", + "l", + 6,2, -3, + "sl", + "l", + 7,3, -2, + "sl", + + "l", "l", + 5,4, -2, + "sl", + "l", "l", + 5,4, "sl", + -1, + "l", "l", + 7,6, "sl", + -1, + "l", "l", + 7,6, 0, + 0, + 8+0,8+0, 1, + 1, + 8+1,8+1, 2, + 2, + 8+2,8+2, 3, + 3, + 8+3,8+3, 4, + 4, + + 8+4,8+0, 5, + 5, + 8+5,8+1, 6, + 6, + 8+6,8+2, 7, + "sl", + 8+7,8+3, 7, + "sl", + "lrs", + "lrs", + 8+5,8+4, 8, + 8, + 8+5,8+4, 9, + 9, + 8+7,8+6, 10, + 10, + 8+7,8+6, 11, + 11, + "frs", + "frs", + 16+0,16+0, 12, + 12, + 16+1,16+1, 13, + 13, + 16+2,16+2, 14, + 14, + 16+3,16+3, 15, + 15, + + "lrs", + "lrs", + 16+4,16+0, "nop", + "nop", + 16+5,16+1, "nop", + "nop", + 16+6,16+2, "nop", + "nop", + 16+7,16+3, "nop", + "nop", + "lrs", + "lrs", + 16+5,16+4, 16, + 16, + 16+5,16+4, 17, + 17, + 16+7,16+6, 18, + 18, + 16+7,16+6, 19, + 19, + "frs", + "frs", + 24+0,24+0, 20, + 20, + 24+1,24+1, 21, + 21, + 24+2,24+2, 22, + 22, + 24+3,24+3, 23, + 23, + "frs", + "frs", + + 24+4,24+0, "nop", + "nop", + 24+5,24+1, "nop", + "nop", + 24+6,24+2, "nop", + "nop", + 24+7,24+3, "nop", + "nop", + 24+5,24+4, 24, + "s", + 24+5,24+4, 24, + "s", + 24+7,24+6, 25, + "s", + 24+7,24+6, 25, "lres", + 26, "lres", + + ] }, + + # 
INDEX 24 + # Deliberately bad, bunch lots of MULs + 24 : { "schedule": + ["m", "frl", "l", "l", "l", "l", + "l", "l", "l", "l", + "l", "l", "l", "l", + "l", "l", "l", "l", + + 0, 0, 0, + 1, 1, 1, + 2, 2, 2, + 3, 3, 3, + 4, 4, 4, + 5, 5, 5, + 6, 6, 6, + 7, 7, 7, + 0, 0, + 1, 1, + 2, 2, + 3, 3, + 4, 4, + 5, 5, + 6, 6, + 7, 7, + + 8, 8, 8, + 9, 9, 9, + 10, 10, 10, + 11, 11, 11, + 12, 12, 12, + 13, 13, 13, + 14, 14, 14, + 15, 15, 15, + 8, 8, + 9, 9, + 10, 10, + 11, 11, + 12, 12, + 13, 13, + 14, 14, + 15, 15, + + + 16, 16, 16, + 17, 17, 17, + 18, 18, 18, + 19, 19, 19, + 20, 20, 20, + 21, 21, 21, + 22, 22, 22, + 23, 23, 23, + 16, 16, + 17, 17, + 18, 18, + 19, 19, + 20, 20, + 21, 21, + 22, 22, + 23, 23, + + 24, 24, 24, + 25, 25, 25, + 26, 26, 26, + 27, 27, 27, + 28, 28, 28, + 29, 29, 29, + 30, 30, 30, + 31, 31, 31, + 24, 24, + 25, 25, + 26, 26, + 27, 27, + 28, 28, + 29, 29, + 30, 30, + 31, 31, + + "lre", + + "s", "s", "s", "s", + "s", "s", "s", "s", + "s", "s", "s", "s", + "s", "s", "s", "s" ] }, + + } + + modification = modifications[idx] + + # for k,v in modification.items(): + # if not k in default.keys(): + # raise Exception(f"Invalid modification: {k}") + + dic = { **default, **modification } + + dic["load_order_first"] = dic.get("load_order_first", dic["load_order"]) + dic["store_order_last"] = dic.get("store_order_last", dic["store_order"]) + + return dic + + def run_schedule(self, ct_schedule, + last_butterfly_arr, butterfly_arr, next_butterfly_arr): + + # Process the operation array + for op in ct_schedule: + +# print(f"OP: {op}") + + if not isinstance(op,tuple): + op = (0,op) + + idx = op[0] + op = op[1] + + if last_butterfly_arr != None: + last_butterfly = last_butterfly_arr[idx] + else: + last_butterfly = None + + if butterfly_arr != None: + butterfly = butterfly_arr[idx] + else: + butterfly = None + + if next_butterfly_arr != None: + next_butterfly = next_butterfly_arr[idx] + else: + next_butterfly = None + + # Progress one of the GS butterflies + if 
isinstance(op,int): + idx = op + # Operation for current block of butterflies + if butterfly != None and idx >= 0 and idx < butterfly.num_gs: + yield from self.progress_arithmetic(butterfly,idx) + # Operation for last butterfly + if last_butterfly != None and idx < 0: + idx += last_butterfly.num_gs + yield from self.progress_arithmetic(last_butterfly,idx) + # Non GS operations (memory + transpose) + elif isinstance(op,str): + if op == "fnop": + if butterfly != None and last_butterfly == None: + yield "nop" + elif op == "nop": + if butterfly != None: + yield "nop" + elif op == "s": # Store + yield from self.store_input(butterfly, last=(next_butterfly==None)) + elif op == "sl": # Store late + yield from self.store_input(last_butterfly, last=(butterfly==None)) + elif op == "l": # Load + yield from self.load_input(butterfly, first=(last_butterfly==None)) + elif op == "le": # Load early + yield from self.load_input(next_butterfly) + elif op == "t": # Transpose + yield from self.get_transpose(butterfly) + elif op == "ts": # Transpose single + n = next(self.get_transpose(butterfly),None) + if n != None: + yield n + elif op == "lr": # Load root + yield from self.load_root_scalars(butterfly) + elif op == "lrs": # Load root single + n = next(self.load_root_scalars(butterfly),None) + if n != None: + yield n + elif op == "lre": # Load roots early + yield from self.load_root_scalars(next_butterfly) + elif op == "lres": # Load roots early single + n = next(self.load_root_scalars(next_butterfly),None) + if n != None: + yield n + elif op == "frl": # Free roots late + if butterfly == None or butterfly.load_roots: + list(self.free_root_scalars(last_butterfly)) + elif op == "fr": # Free roots + if next_butterfly == None or next_butterfly.load_roots: + list(self.free_root_scalars(butterfly)) + elif op == "frs": # Free roots single + if next_butterfly == None or next_butterfly.load_roots: + next(self.free_root_scalars(butterfly),None) + elif op == "m": # Move roots + if butterfly != 
None and not butterfly.load_roots: + self.copy_root_scalars(butterfly, last_butterfly) + else: + raise Exception("Unknown operation") + + + def free_root_scalars(self,butterfly): + + if butterfly == None: + return iter([]) + + if butterfly.root_vecs == None: + return iter([]) + + def free_roots(): + + l = len(butterfly.root_vecs) + + order = butterfly.root_load_order + + for i in range(0,l): + self.vregs.free(butterfly.root_vecs[order[i]]) + butterfly.root_vecs[order[i]] = None + yield + self.vregs.free(butterfly.root_twisted_vecs[order[i]]) + butterfly.root_twisted_vecs[order[i]] = None + yield + + butterfly.root_vecs = None + butterfly.root_twisted_vecs = None + + butterfly.root = None + butterfly.root_lane = None + butterfly.root_twisted = None + butterfly.root_twisted_lane = None + + if butterfly.free_root_scalars == None: + butterfly.free_root_scalars = free_roots() + + return butterfly.free_root_scalars + + def make_twiddle_accessors(self,butterfly,root_to_vec_idx_lane): + + def find_root(idx): + return butterfly.root_vecs[root_to_vec_idx_lane[idx][0]] + def find_root_twisted(idx): + return butterfly.root_twisted_vecs[root_to_vec_idx_lane[idx][0]] + def find_lane(idx): + return root_to_vec_idx_lane[idx][1] + + butterfly.root = find_root + butterfly.root_twisted = find_root_twisted + butterfly.root_lane = find_lane + butterfly.root_twisted_lane = find_lane + + def get_butterfly_list(self, layer_start, merged_layers): + + shuffle = False + + if layer_start + merged_layers > self.shuffle_boundary: + merged_layers = self.shuffle_boundary - layer_start + shuffle = True + + num_blocks = pow(2,layer_start) + block_size = self.size // num_blocks + vectors_per_butterfly = pow(2,merged_layers) + elements_per_butterfly = self.elements_per_vector * vectors_per_butterfly + butterflies_per_block = block_size // elements_per_butterfly + + block_stride = block_size // vectors_per_butterfly + + for block in range(0,num_blocks): + block_base = block * block_size + idxs = 
list(range(0,butterflies_per_block)) + # idxs = idxs[1::2] + idxs[0::2] + idxs = idxs[len(idxs)//2:] + idxs[:len(idxs)//2] + for i, idx in enumerate(idxs): + butterfly_base = block_base + idx * self.elements_per_vector + yield Butterfly(layer=layer_start, + merged=merged_layers, + block=block, + shuffle=shuffle, + base=butterfly_base, + stride=block_stride, + load_roots=(i==0)) + + def do_butterflies(self,schedule,butterflies,zip_type=1): + + if zip_type == 2: + half = len(butterflies) // 2 + butterflies = list(zip(butterflies[:half],butterflies[half:])) + elif zip_type == 4: + quarter = len(butterflies) // 4 + butterflies = list(zip(butterflies[0::4], + butterflies[1::4], + butterflies[2::4], + butterflies[3::4])) + # butterflies = list(zip(butterflies[0*quarter:1*quarter], + # butterflies[1*quarter:2*quarter], + # butterflies[2*quarter:3*quarter], + # butterflies[3*quarter:4*quarter])) + else: + butterflies = [[b] for b in butterflies] + + def get_butterfly(idx): + if idx < 0: + return None + if idx >= len(butterflies): + return None + return butterflies[idx] + + for i in range(-1,len(butterflies)+1): + cur_butterfly = get_butterfly(i) + last_butterfly = get_butterfly(i-1) + next_butterfly = get_butterfly(i+1) + yield from self.run_schedule(schedule, + last_butterfly, + cur_butterfly, + next_butterfly) + + def attach_butterfly_info_old(self, + butterflies, + load_order, load_order_first, + store_order, store_order_last, + numbering, twiddles, + root_load_order): + + dic = { "load_order": load_order, + "load_order_first": load_order_first, + "store_order": store_order, + "store_order_last": store_order_last, + "numbering": numbering, + "root_load_order": root_load_order, + "twiddles": twiddles } + + self.attach_butterfly_info(butterflies,dic) + + def attach_butterfly_info(self, + butterflies, + dic): + for b in butterflies: + b.load_order = dic["load_order"] + b.load_order_first = dic["load_order_first"] + b.store_order = dic["store_order"] + b.store_order_last = 
dic["store_order_last"] + b.root_load_order = dic["root_load_order"] + gs = [] + for i,j,r in dic["numbering"]: + gs.append(self.ct_butterfly_single(b,i,j,r)) + b.gs = gs + self.make_twiddle_accessors(b, dic["twiddles"]) + + def core(self): + + yield from self.init_constants() + + for (base,merge),(zipped,schedule_idx) in zip(self.layers,self.schedules): + + butterflies = list(self.get_butterfly_list(base,merge)) + + schedules = { + (0,3,1): self.get_schedule_triple_no_transpose, + (3,3,1): self.get_schedule_triple_no_transpose, + (0,4,1): self.get_schedule_quad_no_transpose, + (4,2,1): self.get_schedule_double_no_transpose, + (4,2,2): self.get_schedule_double_no_transpose_zipped, + (4,2,4): self.get_schedule_double_no_transpose_quad_zipped, + (4,4,1): self.get_schedule_quad_transpose, + (4,4,2): self.get_schedule_quad_transpose_zipped, + (4,4,4): self.get_schedule_quad_transpose_quad_zipped } + + sched_func = schedules[(base,merge,zipped)] + + s = sched_func(schedule_idx) + + if not type(s) is dict: + load_order, store_order, numbering, twiddles, root_load_order, schedule = s + load_order_first = load_order + store_order_last = store_order + sched_func(schedule_idx) + self.attach_butterfly_info_old(butterflies, load_order, load_order_first, + store_order, store_order_last, numbering, twiddles, root_load_order) + yield from self.do_butterflies(schedule,butterflies,zip_type=zipped) + else: + self.attach_butterfly_info(butterflies, s) + yield from self.do_butterflies(s["schedule"],butterflies,zip_type=zipped) + + self.vregs.revfree() + + def standalone(self,funcname): + + # Preamble + yield from Snippets.license() + yield from Snippets.autogen_warning() + yield from self.generate_constants() + yield from Snippets.function_decl(funcname) + + yield "modulus_addr: .quad modulus" + + if not self.interleave_twiddles: + yield "roots_addr: .quad roots" + yield "roots_twisted_addr: .quad roots_twisted" + else: + yield "roots_merged_addr: .quad roots_merged" + + yield from 
Snippets.function_header(funcname) + yield from Snippets.save_gprs() # Not necessary + yield from Snippets.save_vregs() + + self.gprs.alloc(self._src) + + self.prepare_constants() + + # Actual code + yield from self.core() + + # Wrapup + self.free_constants() + + self.gprs.free(self._src) + + yield from Snippets.restore_vregs() + yield from Snippets.restore_gprs() # Not necessary + yield from Snippets.function_footer() + + def get_code(self): + gen = self.standalone() + for line in gen: + print(line) + +def main(argv): + + outfile = None + degree = None + + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("--out", type=str, default=None) + parser.add_argument("--schedule", type=str, default="0,0") + parser.add_argument("--layers", type=str, default="3,3") + parser.add_argument("--bitwidth", type=int, default=32) + parser.add_argument("size", type=int) + parser.add_argument("modulus", type=int) + parser.add_argument("root", type=int) + parser.add_argument("symbol", type=str) + + args = parser.parse_args() + + code_all = [] + code_essential = [] + + line_count = 0; + + args.layers = list(map(int,args.layers.split(','))) + args.schedule = list(args.schedule.split(',')) + + ntt = NTT(args.size, + args.modulus, + args.root, + layers=args.layers, + schedules=args.schedule, + bitwidth=args.bitwidth) + code_gen = ntt.standalone(args.symbol) + + for line in code_gen: + code_all.append(line) + + def is_code_line(line): + if len(line) < 2: + return False + if line[0:2] == '//': + return False + return True + + code_essential = filter(is_code_line, code_all) + line_count_total = len(list(code_all)) + line_count_essential = len(list(code_essential)) + + code_all.append(f'') + code_all.append(f'// Line count: {line_count_total}') + code_all.append(f'// Instruction count: {line_count_essential}') + + code_all_str = "\n".join(code_all) + + if not args.out == None: + f = open(args.out,"w") + f.write(code_all_str) + 
f.close() + else: + print(code_all_str) + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/envs/cross/.gitignore b/envs/cross/.gitignore new file mode 100644 index 0000000..b882758 --- /dev/null +++ b/envs/cross/.gitignore @@ -0,0 +1,2 @@ +test_loaded_* +test \ No newline at end of file diff --git a/envs/cross/Makefile b/envs/cross/Makefile new file mode 100644 index 0000000..03e22dc --- /dev/null +++ b/envs/cross/Makefile @@ -0,0 +1,106 @@ +# Armv8-A test environment based on QEMU +# +# Copyright (c) 2021 Arm Limited (or its affiliates). All rights reserved. +# Use, modification and redistribution of this file is subject to your possession of a +# valid End User License Agreement for the Arm Product of which these examples are part of +# and your compliance with all applicable terms and conditions of such licence agreement. + +################################################################################ +### ### +### USER CONFIGURATION ### +### ADAPT THIS ### +### ### +################################################################################ + +# +# See README.md for setup instructions +# + +QEMU=qemu-aarch64 +CC=aarch64-none-linux-gnu-gcc +LD=$(CC) + +PLATFORM ?= v84a + +CFLAGS = -fpic -Wall -Wextra -Werror -Wshadow -Wno-unused-parameter + +################################################################################ +### ### +### END OF USER CONFIGURATION ### +### ### +################################################################################ + +# Final image +TARGET=test + +INC_DIR=./inc +INC_DIR_TEST=$(INC_DIR)/test_inc -I$(SRC_DIR)/test_src/manual -I$(SRC_DIR)/test_src/auto +BUILD_DIR=./build +SRC_DIR=./src + +# Scatter files before/after preprocessing +LDFLAGS = -static + +CFLAGS+= -Ofast -I$(INC_DIR) \ + -I$(INC_DIR_TEST) + +CYCLES?=NO # PMU / PERF + +ifeq ($(CYCLES),PMU) + CFLAGS += -DPMU_CYCLES +endif + +ifeq ($(CYCLES),PERF) + CFLAGS += -DPERF_CYCLES +endif + +ifeq ($(CYCLES),NO) + CFLAGS += -DNO_CYCLES +endif + 
+CFLAGS_V8A := $(CFLAGS) -march=armv8-a +ifeq ($(PLATFORM),v84a) + CFLAGS += -march=armv8.4-a+crypto+sha3+sve2 +else + CFLAGS += -march=armv8-a +endif + +C_SRC_FILES_PRE=$(wildcard $(SRC_DIR)/*.c) $(wildcard $(SRC_DIR)/*/*.c) $(wildcard $(SRC_DIR)/*/*/*.c) $(wildcard $(SRC_DIR)/*/*/*/*.c) +C_SRC_FILES=$(patsubst $(SRC_DIR)/%.c, %.c, $(C_SRC_FILES_PRE)) + +ASM_SRC_FILES_PRE=$(wildcard $(SRC_DIR)/*/*.s) $(wildcard $(SRC_DIR)/*.s) $(wildcard $(SRC_DIR)/*/*/*.s) $(wildcard $(SRC_DIR)/*/*/*/*.s) +ASM_SRC_FILES=$(patsubst $(SRC_DIR)/%.s, %.s, $(ASM_SRC_FILES_PRE)) + +HEADER_FILES_PRE=$(wildcard $(SRC_DIR)/*.h) $(wildcard $(SRC_DIR)/*/*.h) $(wildcard $(SRC_DIR)/*/*/*.h) + +ASM_OBJ_FILES=$(patsubst %.s, $(BUILD_DIR)/%.o, $(ASM_SRC_FILES)) +C_OBJ_FILES=$(patsubst %.c, $(BUILD_DIR)/%.o, $(C_SRC_FILES)) +OBJ_FILES=$(ASM_OBJ_FILES) $(C_OBJ_FILES) + +.phony: all clean debug run + +all: $(TARGET) + +# Compilation +$(C_OBJ_FILES): $(BUILD_DIR)/%.o: $(SRC_DIR)/%.c $(HEADER_FILES_PRE) + mkdir -p $(@D) + $(CC) $(CFLAGS) -c -o $@ $< +$(BUILD_DIR)/test_src/manual/third_party/keccakx2_cothan.o: $(SRC_DIR)/test_src/manual/third_party/keccakx2_cothan.c $(HEADER_FILES_PRE) + mkdir -p $(@D) + $(CC) $(CFLAGS_V8A) -c -o $@ $< +$(ASM_OBJ_FILES): $(BUILD_DIR)/%.o: $(SRC_DIR)/%.s $(HEADER_FILES_PRE) + mkdir -p $(@D) + $(CC) -x assembler-with-cpp $(CFLAGS) -c -o $@ $< + +# Linking +$(TARGET): $(OBJS_DIR) $(OBJ_FILES) + mkdir -p $(@D) + $(LD) $(LDFLAGS) $(OBJ_FILES) -o $(TARGET) + +# Running +run: $(TARGET) + $(QEMU) ./$(TARGET) + +clean: + rm -rf $(OBJ_FILES) + rm -rf $(TARGET) diff --git a/envs/cross/inc/hal_env.h b/envs/cross/inc/hal_env.h new file mode 100644 index 0000000..e8dd570 --- /dev/null +++ b/envs/cross/inc/hal_env.h @@ -0,0 +1,9 @@ +#ifndef QEMU_V8A_HAL_ENV_H +#define QEMU_V8A_HAL_ENV_H + +#define SEP ; + +#define ASM_LOAD(dst,symbol) \ + adrp dst, symbol ; add dst, dst, :lo12:symbol; + +#endif diff --git a/envs/cross/inc/test_inc b/envs/cross/inc/test_inc new file mode 120000 index 
0000000..31da609 --- /dev/null +++ b/envs/cross/inc/test_inc @@ -0,0 +1 @@ +../../../tests/inc \ No newline at end of file diff --git a/envs/cross/src/hal.c b/envs/cross/src/hal.c new file mode 100644 index 0000000..e42ab50 --- /dev/null +++ b/envs/cross/src/hal.c @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2022 Arm Limited + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include + +/* Dependency on standard library: + * - rand(), srand() + * - time() + * - printf() + * - fflush() + */ +#include +#include +#include +#include + +#define FILENO stderr + +void rand_init( unsigned long seed ) +{ + ((void) seed); + srand(time(NULL)); +} + +uint8_t get_random_byte() +{ + return( rand() ); +} + +/* Debugging stubs */ + +void debug_test_start( const char *testname ) +{ + fprintf( FILENO, "%s ... ", testname ); + fflush( FILENO ); +} + +void debug_printf(const char * format, ... 
) +{ + va_list argp; + va_start( argp, format ); + vfprintf( FILENO, format, argp ); + va_end( argp ); +} + +void debug_test_ok() { printf( "Ok\n" ); } +void debug_test_fail() { printf( "FAIL!\n" ); } + + +#if !defined(EXTERNAL_CYCLES) && !defined(PERF_CYCLES) && !defined(PMU_CYCLES) && !defined(NO_CYCLES) +#define NO_CYCLES +#endif + +#if defined(PMU_CYCLES) +void enable_cyclecounter() { + uint64_t tmp; + __asm __volatile ( + "mrs %[tmp], pmcr_el0\n" + "orr %[tmp], %[tmp], #1\n" + "msr pmcr_el0, %[tmp]\n" + "mrs %[tmp], pmcntenset_el0\n" + "orr %[tmp], %[tmp], #1<<31\n" + "msr pmcntenset_el0, %[tmp]\n" + : [tmp] "=r" (tmp) + ); +} + +void disable_cyclecounter() { + uint64_t tmp; + __asm __volatile ( + "mov %[tmp], #0x3f\n" + "orr %[tmp], %[tmp], #1<<31\n" + "msr pmcntenclr_el0, %[tmp]\n" + : [tmp] "=r" (tmp) + ); +} + +uint64_t get_cyclecounter() { + uint64_t retval; + __asm __volatile ( + "mrs %[retval], pmccntr_el0\n" + : [retval] "=r" (retval)); + return retval; +} + +#elif defined(PERF_CYCLES) + +#include +#include +#include +#include +#include +#include +#include +#include + +static int perf_fd = 0; +void enable_cyclecounter() { + struct perf_event_attr pe; + memset(&pe, 0, sizeof(struct perf_event_attr)); + pe.type = PERF_TYPE_HARDWARE; + pe.size = sizeof(struct perf_event_attr); + pe.config = PERF_COUNT_HW_CPU_CYCLES; + pe.disabled = 1; + pe.exclude_kernel = 1; + pe.exclude_hv = 1; + + perf_fd = syscall(__NR_perf_event_open, &pe, 0, -1, -1, 0); + + ioctl(perf_fd, PERF_EVENT_IOC_RESET, 0); + ioctl(perf_fd, PERF_EVENT_IOC_ENABLE, 0); +} + +void disable_cyclecounter() { + ioctl(perf_fd, PERF_EVENT_IOC_DISABLE, 0); + close(perf_fd); +} + +uint64_t get_cyclecounter() { + long long cpu_cycles; + ioctl(perf_fd, PERF_EVENT_IOC_DISABLE, 0); + ssize_t read_count = read(perf_fd, &cpu_cycles, sizeof(cpu_cycles)); + if (read_count < 0) { + perror("read"); + exit(EXIT_FAILURE); + } else if (read_count == 0) { + /* Should not happen */ + printf("perf counter empty\n"); + 
exit(EXIT_FAILURE); + } + ioctl(perf_fd, PERF_EVENT_IOC_ENABLE, 0); + return cpu_cycles; +} + +#elif defined(EXTERNAL_CYCLES) + +// nothing to do + +#else /* NO_CYCLES */ + +void enable_cyclecounter() { + return; +} +void disable_cyclecounter() { + return; +} +uint64_t get_cyclecounter() { + return(0); +} + +#endif /* NO_CYCLES */ diff --git a/envs/cross/src/test_common b/envs/cross/src/test_common new file mode 120000 index 0000000..7c5f7b1 --- /dev/null +++ b/envs/cross/src/test_common @@ -0,0 +1 @@ +../../../tests/common \ No newline at end of file diff --git a/envs/cross/src/test_src b/envs/cross/src/test_src new file mode 120000 index 0000000..b102f77 --- /dev/null +++ b/envs/cross/src/test_src @@ -0,0 +1 @@ +../../../tests/ntt_neon \ No newline at end of file diff --git a/envs/native_linux/.gitignore b/envs/native_linux/.gitignore new file mode 100644 index 0000000..b882758 --- /dev/null +++ b/envs/native_linux/.gitignore @@ -0,0 +1,2 @@ +test_loaded_* +test \ No newline at end of file diff --git a/envs/native_linux/Makefile b/envs/native_linux/Makefile new file mode 100644 index 0000000..0c76694 --- /dev/null +++ b/envs/native_linux/Makefile @@ -0,0 +1,102 @@ +# Native AArch64 Linux test environment +# +# Copyright (c) 2021 Arm Limited (or its affiliates). All rights reserved. +# Use, modification and redistribution of this file is subject to your possession of a +# valid End User License Agreement for the Arm Product of which these examples are part of +# and your compliance with all applicable terms and conditions of such licence agreement. 
+ +################################################################################ +### ### +### USER CONFIGURATION ### +### ADAPT THIS ### +### ### +################################################################################ + +# +# See README.md for setup instructions +# + +CC=gcc +LD=$(CC) + +PLATFORM ?= v84a + +CFLAGS = -fpic -Wall -Wextra -Werror -Wshadow -Wno-unused-parameter -Wno-incompatible-pointer-types + +################################################################################ +### ### +### END OF USER CONFIGURATION ### +### ### +################################################################################ + +# Final image +TARGET=test + +INC_DIR=./inc +INC_DIR_TEST=$(INC_DIR)/test_inc -I$(SRC_DIR)/test_src/manual -I$(SRC_DIR)/test_src/auto +BUILD_DIR=./build +SRC_DIR=./src + +CFLAGS+= -Ofast -I$(INC_DIR) \ + -I$(INC_DIR_TEST) + +CYCLES?=NO # PMU / PERF + +ifeq ($(CYCLES),PMU) + CFLAGS += -DPMU_CYCLES +endif + +ifeq ($(CYCLES),PERF) + CFLAGS += -DPERF_CYCLES +endif + +ifeq ($(CYCLES),NO) + CFLAGS += -DNO_CYCLES +endif + +CFLAGS_V8A := $(CFLAGS) -march=armv8-a +ifeq ($(PLATFORM),v84a) + CFLAGS += -march=armv8.4-a+crypto+sha3 +else + CFLAGS += -march=armv8-a +endif + +C_SRC_FILES_PRE=$(wildcard $(SRC_DIR)/*.c) $(wildcard $(SRC_DIR)/*/*.c) $(wildcard $(SRC_DIR)/*/*/*.c) $(wildcard $(SRC_DIR)/*/*/*/*.c) +C_SRC_FILES=$(patsubst $(SRC_DIR)/%.c, %.c, $(C_SRC_FILES_PRE)) + +ASM_SRC_FILES_PRE=$(wildcard $(SRC_DIR)/*/*.s) $(wildcard $(SRC_DIR)/*.s) $(wildcard $(SRC_DIR)/*/*/*.s) $(wildcard $(SRC_DIR)/*/*/*/*.s) +ASM_SRC_FILES=$(patsubst $(SRC_DIR)/%.s, %.s, $(ASM_SRC_FILES_PRE)) + +HEADER_FILES_PRE=$(wildcard $(SRC_DIR)/*.h) $(wildcard $(SRC_DIR)/*/*.h) $(wildcard $(SRC_DIR)/*/*/*.h) + +ASM_OBJ_FILES=$(patsubst %.s, $(BUILD_DIR)/%.o, $(ASM_SRC_FILES)) +C_OBJ_FILES=$(patsubst %.c, $(BUILD_DIR)/%.o, $(C_SRC_FILES)) +OBJ_FILES=$(ASM_OBJ_FILES) $(C_OBJ_FILES) + +.phony: all clean debug run + +all: $(TARGET) + +# Compilation +$(C_OBJ_FILES): 
$(BUILD_DIR)/%.o: $(SRC_DIR)/%.c $(HEADER_FILES_PRE) + mkdir -p $(@D) + $(CC) $(CFLAGS) -c -o $@ $< +$(BUILD_DIR)/test_src/manual/third_party/keccakx2_cothan.o: $(SRC_DIR)/test_src/manual/third_party/keccakx2_cothan.c $(HEADER_FILES_PRE) + mkdir -p $(@D) + $(CC) $(CFLAGS_V8A) -c -o $@ $< +$(ASM_OBJ_FILES): $(BUILD_DIR)/%.o: $(SRC_DIR)/%.s $(HEADER_FILES_PRE) + mkdir -p $(@D) + $(CC) -x assembler-with-cpp $(CFLAGS) -c -o $@ $< + +# Linking +$(TARGET): $(OBJS_DIR) $(OBJ_FILES) + mkdir -p $(@D) + $(LD) $(OBJ_FILES) -o $(TARGET) + +# Running +run: $(TARGET) + ./$(TARGET) + +clean: + rm -rf $(OBJ_FILES) + rm -rf $(TARGET) diff --git a/envs/native_linux/inc/hal_env.h b/envs/native_linux/inc/hal_env.h new file mode 100644 index 0000000..7440a3f --- /dev/null +++ b/envs/native_linux/inc/hal_env.h @@ -0,0 +1,9 @@ +#ifndef QEMU_V8A_HAL_ENV_H +#define QEMU_V8A_HAL_ENV_H + +#define SEP ; + +#define ASM_LOAD(dst,symbol) \ + adrp dst, symbol ; add dst, dst, :lo12:symbol; + +#endif /* QEMU_V8A_HAL_ENV_H */ diff --git a/envs/native_linux/inc/test_inc b/envs/native_linux/inc/test_inc new file mode 120000 index 0000000..31da609 --- /dev/null +++ b/envs/native_linux/inc/test_inc @@ -0,0 +1 @@ +../../../tests/inc \ No newline at end of file diff --git a/envs/native_linux/src/hal.c b/envs/native_linux/src/hal.c new file mode 100644 index 0000000..e42ab50 --- /dev/null +++ b/envs/native_linux/src/hal.c @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2022 Arm Limited + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this 
permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include + +/* Dependency on standard library: + * - rand(), srand() + * - time() + * - printf() + * - fflush() + */ +#include +#include +#include +#include + +#define FILENO stderr + +void rand_init( unsigned long seed ) +{ + ((void) seed); + srand(time(NULL)); +} + +uint8_t get_random_byte() +{ + return( rand() ); +} + +/* Debugging stubs */ + +void debug_test_start( const char *testname ) +{ + fprintf( FILENO, "%s ... ", testname ); + fflush( FILENO ); +} + +void debug_printf(const char * format, ... 
) +{ + va_list argp; + va_start( argp, format ); + vfprintf( FILENO, format, argp ); + va_end( argp ); +} + +void debug_test_ok() { printf( "Ok\n" ); } +void debug_test_fail() { printf( "FAIL!\n" ); } + + +#if !defined(EXTERNAL_CYCLES) && !defined(PERF_CYCLES) && !defined(PMU_CYCLES) && !defined(NO_CYCLES) +#define NO_CYCLES +#endif + +#if defined(PMU_CYCLES) +void enable_cyclecounter() { + uint64_t tmp; + __asm __volatile ( + "mrs %[tmp], pmcr_el0\n" + "orr %[tmp], %[tmp], #1\n" + "msr pmcr_el0, %[tmp]\n" + "mrs %[tmp], pmcntenset_el0\n" + "orr %[tmp], %[tmp], #1<<31\n" + "msr pmcntenset_el0, %[tmp]\n" + : [tmp] "=r" (tmp) + ); +} + +void disable_cyclecounter() { + uint64_t tmp; + __asm __volatile ( + "mov %[tmp], #0x3f\n" + "orr %[tmp], %[tmp], #1<<31\n" + "msr pmcntenclr_el0, %[tmp]\n" + : [tmp] "=r" (tmp) + ); +} + +uint64_t get_cyclecounter() { + uint64_t retval; + __asm __volatile ( + "mrs %[retval], pmccntr_el0\n" + : [retval] "=r" (retval)); + return retval; +} + +#elif defined(PERF_CYCLES) + +#include +#include +#include +#include +#include +#include +#include +#include + +static int perf_fd = 0; +void enable_cyclecounter() { + struct perf_event_attr pe; + memset(&pe, 0, sizeof(struct perf_event_attr)); + pe.type = PERF_TYPE_HARDWARE; + pe.size = sizeof(struct perf_event_attr); + pe.config = PERF_COUNT_HW_CPU_CYCLES; + pe.disabled = 1; + pe.exclude_kernel = 1; + pe.exclude_hv = 1; + + perf_fd = syscall(__NR_perf_event_open, &pe, 0, -1, -1, 0); + + ioctl(perf_fd, PERF_EVENT_IOC_RESET, 0); + ioctl(perf_fd, PERF_EVENT_IOC_ENABLE, 0); +} + +void disable_cyclecounter() { + ioctl(perf_fd, PERF_EVENT_IOC_DISABLE, 0); + close(perf_fd); +} + +uint64_t get_cyclecounter() { + long long cpu_cycles; + ioctl(perf_fd, PERF_EVENT_IOC_DISABLE, 0); + ssize_t read_count = read(perf_fd, &cpu_cycles, sizeof(cpu_cycles)); + if (read_count < 0) { + perror("read"); + exit(EXIT_FAILURE); + } else if (read_count == 0) { + /* Should not happen */ + printf("perf counter empty\n"); + 
exit(EXIT_FAILURE); + } + ioctl(perf_fd, PERF_EVENT_IOC_ENABLE, 0); + return cpu_cycles; +} + +#elif defined(EXTERNAL_CYCLES) + +// nothing to do + +#else /* NO_CYCLES */ + +void enable_cyclecounter() { + return; +} +void disable_cyclecounter() { + return; +} +uint64_t get_cyclecounter() { + return(0); +} + +#endif /* NO_CYCLES */ diff --git a/envs/native_linux/src/test_common b/envs/native_linux/src/test_common new file mode 120000 index 0000000..7c5f7b1 --- /dev/null +++ b/envs/native_linux/src/test_common @@ -0,0 +1 @@ +../../../tests/common \ No newline at end of file diff --git a/envs/native_mac/.gitignore b/envs/native_mac/.gitignore new file mode 100644 index 0000000..b882758 --- /dev/null +++ b/envs/native_mac/.gitignore @@ -0,0 +1,2 @@ +test_loaded_* +test \ No newline at end of file diff --git a/envs/native_mac/Makefile b/envs/native_mac/Makefile new file mode 100644 index 0000000..712c91c --- /dev/null +++ b/envs/native_mac/Makefile @@ -0,0 +1,78 @@ +# Armv8-A test environment based on QEMU +# +# Copyright (c) 2021 Arm Limited (or its affiliates). All rights reserved. +# Use, modification and redistribution of this file is subject to your possession of a +# valid End User License Agreement for the Arm Product of which these examples are part of +# and your compliance with all applicable terms and conditions of such licence agreement. 
+ +################################################################################ +### ### +### USER CONFIGURATION ### +### ADAPT THIS ### +### ### +################################################################################ + +# +# See README.md for setup instructions +# + +LD=clang +CC=clang + +CFLAGS = -march=armv8.4-a+crypto+sha3 \ + -Wall -Wextra -Werror -Wshadow -Wno-unused-parameter -Wno-incompatible-pointer-types + +################################################################################ +### ### +### END OF USER CONFIGURATION ### +### ### +################################################################################ + +# Final image +TARGET=test + +INC_DIR=./inc +INC_DIR_TEST=$(INC_DIR)/test_inc -I$(SRC_DIR)/test_src/manual -I$(SRC_DIR)/test_src/auto +BUILD_DIR=./build +SRC_DIR=./src + +# Scatter files before/after preprocessing + +CFLAGS+= -Ofast -I$(INC_DIR) \ + -I$(INC_DIR_TEST) + +C_SRC_FILES_PRE=$(wildcard $(SRC_DIR)/*.c) $(wildcard $(SRC_DIR)/*/*.c) $(wildcard $(SRC_DIR)/*/*/*.c $(SRC_DIR)/*/*/*/*.c) +C_SRC_FILES=$(patsubst $(SRC_DIR)/%.c, %.c, $(C_SRC_FILES_PRE)) +ASM_SRC_FILES_PRE=$(wildcard $(SRC_DIR)/*/*.s) $(wildcard $(SRC_DIR)/*.s) $(wildcard $(SRC_DIR)/*/*/*.s) $(wildcard $(SRC_DIR)/*/*/*/*.s) +ASM_SRC_FILES=$(patsubst $(SRC_DIR)/%.s, %.s, $(ASM_SRC_FILES_PRE)) + +HEADER_FILES_PRE=$(wildcard $(SRC_DIR)/*.h) $(wildcard $(SRC_DIR)/*/*.h) $(wildcard $(SRC_DIR)/*/*/*.h) + +ASM_OBJ_FILES=$(patsubst %.s, $(BUILD_DIR)/%.o, $(ASM_SRC_FILES)) +C_OBJ_FILES=$(patsubst %.c, $(BUILD_DIR)/%.o, $(C_SRC_FILES)) +OBJ_FILES=$(ASM_OBJ_FILES) $(C_OBJ_FILES) $(CMSIS_OBJ_FILES) + +.phony: all clean debug run + +all: $(TARGET) + +# Compilation +$(C_OBJ_FILES): $(BUILD_DIR)/%.o: $(SRC_DIR)/%.c $(HEADER_FILES_PRE) + mkdir -p $(@D) + $(CC) $(CFLAGS) -c -o $@ $< +$(ASM_OBJ_FILES): $(BUILD_DIR)/%.o: $(SRC_DIR)/%.s $(HEADER_FILES_PRE) + mkdir -p $(@D) + $(CC) -x assembler-with-cpp $(CFLAGS) -c -o $@ $< + +# Linking +$(TARGET): $(OBJS_DIR) $(OBJ_FILES) + 
mkdir -p $(@D) + $(LD) $(OBJ_FILES) -o $(TARGET) + +# Running +run: $(TARGET) + ./$(TARGET) + +clean: + rm -rf $(OBJ_FILES) + rm -rf $(TARGET) diff --git a/envs/native_mac/inc/hal_env.h b/envs/native_mac/inc/hal_env.h new file mode 100644 index 0000000..806fc22 --- /dev/null +++ b/envs/native_mac/inc/hal_env.h @@ -0,0 +1,9 @@ +#ifndef QEMU_V8A_HAL_ENV_H +#define QEMU_V8A_HAL_ENV_H + +#define SEP %% + +#define ASM_LOAD(dst,symbol) \ + adrp dst, symbol@PAGE %% add dst, dst, symbol@PAGEOFF + +#endif /* QEMU_V8A_HAL_ENV_H */ diff --git a/envs/native_mac/inc/test_inc b/envs/native_mac/inc/test_inc new file mode 120000 index 0000000..31da609 --- /dev/null +++ b/envs/native_mac/inc/test_inc @@ -0,0 +1 @@ +../../../tests/inc \ No newline at end of file diff --git a/envs/native_mac/src/hal.c b/envs/native_mac/src/hal.c new file mode 100644 index 0000000..e8b2fa6 --- /dev/null +++ b/envs/native_mac/src/hal.c @@ -0,0 +1,52 @@ +#include + +/* Dependency on standard library: + * - rand(), srand() + * - time() + * - printf() + * - fflush() + */ +#include +#include +#include +#include + +void rand_init( unsigned long seed ) +{ + ((void) seed); + srand(time(NULL)); +} + +uint8_t get_random_byte() +{ + return( rand() ); +} + +/* Debugging stubs */ + +void debug_test_start( const char *testname ) +{ + printf( "%s ... ", testname ); + fflush( stdout ); +} + +void debug_printf(const char * format, ... 
) +{ + va_list argp; + va_start( argp, format ); + vprintf( format, argp ); + va_end( argp ); +} + +void debug_test_ok() { printf( "Ok\n" ); } +void debug_test_fail() { printf( "FAIL!\n" ); } + +void enable_cyclecounter() { + return; +} +void disable_cyclecounter() { + return; +} +uint64_t get_cyclecounter() { + return(0); +} diff --git a/envs/native_mac/src/test_common b/envs/native_mac/src/test_common new file mode 120000 index 0000000..7c5f7b1 --- /dev/null +++ b/envs/native_mac/src/test_common @@ -0,0 +1 @@ +../../../tests/common \ No newline at end of file diff --git a/envs/native_mac/src/test_src b/envs/native_mac/src/test_src new file mode 120000 index 0000000..32dc84a --- /dev/null +++ b/envs/native_mac/src/test_src @@ -0,0 +1 @@ +../../../tests/keccak_neon \ No newline at end of file diff --git a/nelight b/nelight new file mode 160000 index 0000000..8c26905 --- /dev/null +++ b/nelight @@ -0,0 +1 @@ +Subproject commit 8c2690540c94243da391ab3a31b372cf35bf176d diff --git a/sphincsplus/README.md b/sphincsplus/README.md new file mode 100644 index 0000000..213ee56 --- /dev/null +++ b/sphincsplus/README.md @@ -0,0 +1,20 @@ +# SPHINCS+ on AArch64 + +## Overview + +This directory contains source code, scripts and benchmarks accompanying the paper "Hybrid scalar/vector +implementations for Keccak on AArch64" by Becker and Kannwischer. + +## Structure + +* [sphincsplus-keccakx2](sphincsplus-keccakx2) hosts the implementation of SPHINCS+ from the [official SPHINCS+ +repository](https://github.com/sphincs/sphincsplus) making use of $2$-way parallel Keccak-f1600 implementations. + +* [sphincsplus-keccakxN](sphincsplus-keccakxN) is a derived implementation of SPHINCS+ which can leverage general N-way +parallel Keccak-f1600 implementations, and is used with the AArch64 assembly implementations found in [this +repository](../asm/manual/keccak_1600). 
+ + +## License + +See [sphincsplus-keccakx2/LICENSE](sphincsplus-keccakx2/LICENSE) and [sphincsplus-keccakxN/LICENSE](sphincsplus-keccakxN/LICENSE) diff --git a/sphincsplus/convert-keccak-benchmarks.py b/sphincsplus/convert-keccak-benchmarks.py new file mode 100644 index 0000000..6a4d64d --- /dev/null +++ b/sphincsplus/convert-keccak-benchmarks.py @@ -0,0 +1,211 @@ +#! /usr/bin/env python3 + +## MIT License +## +## Copyright (c) 2022 Arm Limited +## Copyright (c) 2022 Matthias Kannwischer +## +## Permission is hereby granted, free of charge, to any person obtaining a copy +## of this software and associated documentation files (the "Software"), to deal +## in the Software without restriction, including without limitation the rights +## to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +## copies of the Software, and to permit persons to whom the Software is +## furnished to do so, subject to the following conditions: +## +## The above copyright notice and this permission notice shall be included in all +## copies or substantial portions of the Software. +## +## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +## AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +## LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +## OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +## SOFTWARE. 
+## + +import os +import argparse +import re + +parser = argparse.ArgumentParser() + +parser.add_argument("-f","--fmt", choices=["tex", "md"], required=True) +args = parser.parse_args() + +markdown = args.fmt == "md" + +with open("keccak-benchmarks.md") as f: + lines = f.readlines() +categories = { "refScalar" : { "text" : "Reference C", + "ref" : "\\cite{XKCP}", + "way" : 1}, + "ourScalar" : { "text" : "Scalar", + "ref" : "Ours", + "way" : 1}, + "refNeon" : { "text" : "Neon", + "ref" : "\\cite{CothanSHA3}", + "way" : 2}, + "ourNeon" : { "text" : "Neon", + "ref" : "Ours", + "way" : 2}, + "refSHA3" : { "text" : "\\neonsha", + "ref" : "\\cite{BasSHA3}", + "way" : 2}, + "ourSHA3" : { "text" : "\\neonsha", + "ref" : "Ours", + "way" : 2}, + "hybridNN" : { "text" : "Neon/\\neonsha", + "ref" : "Ours", + "way" : 2}, + "hybridSN3" : { "text" : "Scalar/Neon/\\neonsha", + "ref" : "Ours", + "way" : 3 }, + "hybridSN8" : { "text" : "Scalar/Neon", + "ref" : "Ours", + "way" : 4 }, + "hybridSN84" : { "text" : "Scalar/\\neonsha", + "ref" : "Ours", + "way" : 4 }, + "hybridSN5" : { "text" : "Scalar/Neon", + "ref" : "Ours", + "way" : 5 }, + "hybridSNN" : { "text": "Scalar/Neon/\\neonsha", + "ref": "Ours", + "way": 4 } } + +if markdown: + categories["refScalar"]["ref"] = "[C][C]" + categories["refNeon"]["ref"] = "[Ngu][Ngu]" + categories["refSHA3"]["ref"] = "[Wes][Wes]" + + for key in categories: + categories[key]["text"] = categories[key]["text"].replace("\\neonsha", "Neon+SHA-3") + + +default_functions = { "refScalar" : "keccak_f1600_x1_scalar_C_original", + "ourScalar" : "keccak_f1600_x1_scalar_asm_v5", + "refNeon" : "keccak_f1600_x2_neon_C_cothan", + "refSHA3" : "keccak_f1600_x2_bas", + "ourSHA3" : "keccak_f1600_x2_v84a_asm_v1", + "ourNeon" : "keccak_f1600_x2_v84a_asm_v2pp2", + "hybridSN8" : "keccak_f1600_x4_hybrid_asm_v3p", + "hybridSN84": "keccak_f1600_x4_hybrid_asm_v2", + "hybridNN" : "keccak_f1600_x2_hybrid_asm_v2pp2", + "hybridSNN" : "keccak_f1600_x4_hybrid_asm_v4", + 
"hybridSN3" : "keccak_f1600_x3_hybrid_asm_v6", + "hybridSN5" : "keccak_f1600_x5_hybrid_asm_v8p" } + + +exceptions = { "Cortex-A510" : { "ourSHA3" : "keccak_f1600_x2_v84a_asm_v1p0" }, + "Cortex-A55" : { "ourNeon" : "keccak_f1600_x2_v84a_asm_v2" }, + "Cortex-A710" : { "ourNeon" : "keccak_f1600_x2_v84a_asm_v2pp6", + "hybridNN" : "keccak_f1600_x2_hybrid_asm_v2pp2" } } + +def do(platforms, lines, categories=categories): + linesPerPlatform = {} + # filter out right lines and group by platform + start = None + curPlatform = None + for idx, line in enumerate(lines): + if not (line.startswith("#") and "taskset" not in line): + continue + # print(line) + if start != None: + linesPerPlatform[curPlatform] = lines[start:idx] + curPlatform = None + start = None + + for pltfrm in platforms: + if pltfrm in line: + curPlatform = pltfrm + start = idx + # print(start) + + # print( sorted(list(linesPerPlatform.keys())) ) + assert sorted(list(linesPerPlatform.keys())) == sorted(platforms) + + def parseMedianCC(line): + line = re.sub(r"^.*\*(.*)\*.*$", r"\1", line) + return int(line.strip()) + + cycles = {} + for platform, lines in linesPerPlatform.items(): + cycles[platform] = {} + def get_func_for_platform(plt,cat): + func = default_functions[cat] + if plt in exceptions.keys() and \ + cat in exceptions[plt].keys(): + #print(f"Exception on {platform}: use {exceptions[plt][cat]} instead of {func} for {cat}") + func = exceptions[plt][cat] + return func + for category in categories.keys(): + func = get_func_for_platform(platform, category) + #print(f"{platform}: Use {func} for {category}") + for line in lines: + if f"{func})" not in line or "AVGs" not in line: + continue + cc = parseMedianCC(line) + cycles[platform][category] = cc + # print(f"{platform}.{category} ({func}): {cc} cycles)") + + if markdown: + def fmtc(cycles): + value = f"{cycles}" + return value + else: + def fmtc(cycles): + value = f"{cycles:,}" + value = value.replace(",", "\\,") + return value + + if not markdown: + 
header = "&".join([f"\multicolumn{{2}}{{c|}}{{{p}}}" for p in platforms]) + header = "c".join(header.rsplit("c|", 1)) + print(f" Approach & & & {header} \\\\\\hline") + for category, params in categories.items(): + no_data = True + cc = [] + way = params["way"] + txt = params["text"] + ref = params["ref"] + for platform in platforms: + if category in cycles[platform]: + cyc = cycles[platform][category] + avg = cyc // way + if markdown: + cc.append(f"{fmtc(cyc)} ({avg})" ) + else: + cc.append(f"{fmtc(cyc)}&({avg})" ) + no_data = False + else: + if markdown: + cc.append("--") + else: + cc.append("-- & ") + if not no_data: + if markdown: + print(f"| {txt} | {ref} | {way}x | " + " | ".join(cc)) + else: + print(f"{txt} & {ref} & {way}x & " + " & ".join(cc) + "\\\\") + +if markdown: + print("| Approach | | |Cortex-X1 | Cortex-A78 | Cortex-A55 |") + print("| -------- | - | - |--------- | ---------- | -----------|") + do(["Cortex-X1", "Cortex-A78", "Cortex-A55"], lines) + print() + print() + print("| Approach | | | Cortex-X2 | Cortex-A710 | Cortex-A510 |") + print("| -------- | - | - | --------- | ----------- | ------------|") + do(["Cortex-X2", "Cortex-A710", "Cortex-A510"], lines) + print() + print() + print("[C]: https://github.com/XKCP/XKCP") + print("[Ngu]: https://github.com/cothan/NEON-SHA3_2x") + print("[Wes]: https://github.com/bwesterb/armed-keccak") +else: + print("\\begin{tabular}{c|c|c|rr|rr|rr}") + do(["Cortex-X1", "Cortex-A78", "Cortex-A55"], lines) + print("\\hline\\hline") + do(["Cortex-X2", "Cortex-A710", "Cortex-A510"], lines) + print("\\end{tabular}") diff --git a/sphincsplus/convert-sphincs-benchmarks.py b/sphincsplus/convert-sphincs-benchmarks.py new file mode 100644 index 0000000..8536f23 --- /dev/null +++ b/sphincsplus/convert-sphincs-benchmarks.py @@ -0,0 +1,306 @@ +#! 
/usr/bin/env python3 + +## MIT License +## +## Copyright (c) 2021 Arm Limited +## Copyright (c) 2022 Matthias Kannwischer +## +## Permission is hereby granted, free of charge, to any person obtaining a copy +## of this software and associated documentation files (the "Software"), to deal +## in the Software without restriction, including without limitation the rights +## to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +## copies of the Software, and to permit persons to whom the Software is +## furnished to do so, subject to the following conditions: +## +## The above copyright notice and this permission notice shall be included in all +## copies or substantial portions of the Software. +## +## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +## AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +## LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +## OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +## SOFTWARE. 
+## + +import os +from re import S +import argparse +import itertools + +parser = argparse.ArgumentParser() + +parser.add_argument("-f","--fmt", choices=["tex", "md"], required=True) +parser.add_argument("-a", "--all", default=False, action="store_true") +args = parser.parse_args() + +markdown = args.fmt == "md" +all = args.all + +def parse(line): + cycles = line.split(":")[-1] + cycles = cycles.replace("cycles", "").replace(",", "") + cycles = int(cycles) + return cycles + + + +def fmts(value): + return f"({value:.2f} $\\times$)" + +def getBench(bench_dir, variant=None): + d = {} + + # parse benchmark files + for paramset in os.listdir(bench_dir): + parts = paramset.split("_") + paramname = parts[0] + if len(parts) > 1 and parts[1] != variant and variant is not None: + continue + + with open(os.path.join(bench_dir, paramset)) as f: + lines = f.readlines() + + keypair = None + sign = None + verify = None + + for line in lines: + if "Generating keypair" in line: + keypair = parse(line) + + if "Signing.." in line: + sign = parse(line) + + if "Verifying.." 
in line: + verify = parse(line) + d[paramname] = { + "k" : keypair, + "s" : sign, + "v" : verify + } + return d + + +platforms = ["X1", "A78", "A55", "X2", "A710", "A510"] +baselineVariants = ["C", "COTHANV8", "BAS"] +optimizedVariants = ["x3", "x4", "x5"] + +if markdown: + implementationNames = { + "C" : "[C][C]", + "COTHANV8" : "[Ngu][Ngu]", + "BAS" : "[Wes][Wes]", + "x3": "Ours", + "x4": "Ours", + "x5": "Ours" + } + + if all: + implementationNames["x3"] = "Ours (x3)" + implementationNames["x4"] = "Ours (x4)" + implementationNames["x5"] = "Ours (x5)" +else : + implementationNames = { + "C" : "C\\cite{XKCP}", + "COTHANV8" : "\\cite{CothanSHA3}", + "BAS" : "\\cite{BasSHA3}", + "x3": "Ours", + "x4": "Ours", + "x5": "Ours" + } + + +if all: + + options = ["f", "s"] + sizes = [128, 192, 256] + thashes = ['simple', 'robust'] + parameterSets = [] + for size, opt, thash in itertools.product(sizes, options,thashes): + parameterSets.append(f"sphincs-shake-{size}{opt}-{thash}") + +else: + parameterSets = ["sphincs-shake-128f-robust", "sphincs-shake-128s-robust"] + +# set to only display one variant in the table +if not all: + filterVariants = { + "X1" : "x4", + "A78" : { + "sphincs-shake-128f-robust": "x4", + "sphincs-shake-128s-robust": "x5" + }, + "A55" : "x4", + "X2" : { + "sphincs-shake-128f-robust": "x4", + "sphincs-shake-128s-robust": "x3" + }, + "A710" : "x4", + "A510" : "x4" + } +else: + filterVariants = None + + +def getBenchmarksForPlatform(platform): + baseline = {} + + + for baselineVariant in baselineVariants: + b = getBench(f"sphincsplus-keccakx2/benchmarks_{platform}", baselineVariant) + if len(b.keys()) > 0: + baseline[baselineVariant] = b + + optimized = {} + for optimizedVariant in optimizedVariants: + d = f"sphincsplus-keccakxN/benchmarks_{platform}" + + if filterVariants is not None and platform in filterVariants and type(filterVariants[platform]) != dict and filterVariants[platform] != optimizedVariant: + continue + + + if os.path.exists(d): + results = 
getBench(d, optimizedVariant) + + + if filterVariants is not None and platform in filterVariants and type(filterVariants[platform]) == dict: + filteredResults = {} + for param in parameterSets: + # print(filterVariants[platform]) + if filterVariants[platform][param] == optimizedVariant: + filteredResults[param] = results[param] + else: + filteredResults = results + + + optimized[optimizedVariant] = filteredResults + + + + # print(baseline) + return baseline, optimized + + + +d = {} +for platform in platforms: + d[platform] = getBenchmarksForPlatform(platform) + + +first=True + +def getFastest(l): + k = [i['k'] for i in l] + s = [i['s'] for i in l] + v = [i['v'] for i in l] + + return { + "k" : min(k), + "s" : min(s), + "v" : min(v) + } + +def speedup(old, new): + v = old/new + + if v < 1: + v = "" + else: + v = fmts(v) + + return v + + +if markdown: + def fmtc(cycles): + value = f"{round(cycles):,}" + return value + def printHeader(): + pass + + def printPlatformStart(platform, first): + print(f"# Cortex-{platform}") + print(f"| Parameter set | Implementation | Key Generation | Signing | Verification |") + print(f"| ------------- | -------------- | -------------- | ------- | ------------ |") + + + def printParamStart(paramName, numImplementations): + pass + + def printParamEnd(): + pass + def printFooter(): + print() + print() + print("[C]: https://github.com/XKCP/XKCP") + print("[Ngu]: https://github.com/cothan/NEON-SHA3_2x") + print("[Wes]: https://github.com/bwesterb/armed-keccak") + + def printRow(variant, cycles, fastestBaseline, paramName): + name = implementationNames[variant] + print(f"| {paramName} | {name} | {fmtc(cycles['k'])} | {fmtc(cycles['s'])} | {fmtc(cycles['v'])} |") + +else: + def fmtc(cycles): + value = f"{round(cycles/1000):,}k" + value = value.replace(",", "\\,") + return value + def printHeader(): + print("\\begin{tabular}{c|c|rr|rr|rr}") + print("Parameter set & Impl. 
& \multicolumn{2}{c|}{Key Generation} & \multicolumn{2}{c|}{Signing} & \multicolumn{2}{c|}{Verification}\\\\") + print("\\hline") + + def printPlatformStart(platform, first): + if not first: + print("\\hline\\hline") + print(f"\multicolumn{{8}}{{c}}{{Cortex-{platform}}}\\\\\\hline") + + def printParamStart(paramName, numImplementations): + print(f"\multirow{{{numImplementations}}}{{*}}{{{paramName}}}") + + def printParamEnd(): + print("\\cline{2-8}") + + def printFooter(): + print("\\hline") + print("\\end{tabular}") + + def printRow(variant, cycles, fastestBaseline, paramName): + name = implementationNames[variant] + + speedupK = speedup(fastestBaseline['k'], cycles['k']) + speedupS = speedup(fastestBaseline['s'], cycles['s']) + speedupV = speedup(fastestBaseline['v'], cycles['v']) + print(f" & {name} & {fmtc(cycles['k'])} & {speedupK} & {fmtc(cycles['s'])} & {speedupS}& {fmtc(cycles['v'])} & {speedupV} \\\\") + + +printHeader() +for platform in platforms: + printPlatformStart(platform, first) + if first == True: + first=False + baseline, optimized = d[platform] + + numImplementations = len(baseline.keys()) + len(optimized.keys()) + for param in parameterSets: + + baselineFastest = getFastest([baseline[baselineVariant][param] for baselineVariant in baselineVariants if baselineVariant in baseline]) + # print(baselineFastest) + paramName = param.replace("sphincs-shake-", "") + printParamStart(paramName, numImplementations) + for baselineVariant in baselineVariants: + if baselineVariant not in baseline: + continue + cycles = baseline[baselineVariant][param] + printRow(baselineVariant, cycles, baselineFastest, paramName) + + + for optimizedVariant in optimizedVariants: + if optimizedVariant not in optimized or param not in optimized[optimizedVariant]: + continue + cycles = optimized[optimizedVariant][param] + printRow(optimizedVariant, cycles, baselineFastest, paramName) + printParamEnd() +printFooter() diff --git a/sphincsplus/keccak-benchmarks.md 
b/sphincsplus/keccak-benchmarks.md new file mode 100644 index 0000000..8f7fe85 --- /dev/null +++ b/sphincsplus/keccak-benchmarks.md @@ -0,0 +1,423 @@ +# Cortex-X1 +``` +validate_keccak_f1600_x1_scalar_C_v0 ... validate_keccak_f1600_x1_scalar_C_v1 ... validate_keccak_f1600_x1_scalar_asm_v1 ... validate_keccak_f1600_x1_scalar_asm_v2 ... validate_keccak_f1600_x1_scalar_asm_v3 ... validate_keccak_f1600_x1_scalar_asm_v4 ... validate_keccak_f1600_x1_scalar_asm_v5 ... validate_keccak_f1600_x2_v84a_asm_v1 ... skip +validate_keccak_f1600_x2_v84a_asm_v1p0 ... skip +validate_keccak_f1600_x4_v84a_asm_v1p0 ... skip +validate_keccak_f1600_x2_v84a_asm_v2 ... validate_keccak_f1600_x2_v84a_asm_v2p0 ... validate_keccak_f1600_x2_v84a_asm_v2p1 ... validate_keccak_f1600_x2_v84a_asm_v2p2 ... validate_keccak_f1600_x2_v84a_asm_v2p3 ... validate_keccak_f1600_x2_v84a_asm_v2p4 ... validate_keccak_f1600_x2_v84a_asm_v2p5 ... validate_keccak_f1600_x2_v84a_asm_v2p6 ... validate_keccak_f1600_x2_v84a_asm_v2pp0 ... validate_keccak_f1600_x2_v84a_asm_v2pp1 ... validate_keccak_f1600_x2_v84a_asm_v2pp2 ... validate_keccak_f1600_x2_v84a_asm_v2pp3 ... skip +validate_keccak_f1600_x2_v84a_asm_v2pp4 ... validate_keccak_f1600_x2_v84a_asm_v2pp5 ... validate_keccak_f1600_x2_v84a_asm_v2pp6 ... skip +validate_keccak_f1600_x2_v84a_asm_v2pp7 ... skip +validate_keccak_f1600_x2_neon_C_cothan ... validate_keccak_f1600_x2_bas ... skip +validate_keccak_f1600_x3_hybrid_asm_v3p ... validate_keccak_f1600_x3_hybrid_asm_v6 ... skip +validate_keccak_f1600_x3_hybrid_asm_v7 ... skip +validate_keccak_f1600_x4_hybrid_asm_v1 ... skip +validate_keccak_f1600_x4_hybrid_asm_v2 ... skip +validate_keccak_f1600_x4_hybrid_asm_v3 ... validate_keccak_f1600_x4_hybrid_asm_v3p ... validate_keccak_f1600_x4_hybrid_asm_v3pp ... validate_keccak_f1600_x4_hybrid_asm_v4 ... skip +validate_keccak_f1600_x4_hybrid_asm_v4p ... skip +validate_keccak_f1600_x4_hybrid_asm_v5 ... validate_keccak_f1600_x4_hybrid_asm_v5p ... 
validate_keccak_f1600_x4_hybrid_asm_v6 ... skip +validate_keccak_f1600_x4_hybrid_asm_v7 ... skip +validate_keccak_f1600_x4_hybrid_asm_v8 ... skip +validate_keccak_f1600_x5_hybrid_asm_v8 ... validate_keccak_f1600_x2_hybrid_asm_v1 ... skip +validate_keccak_f1600_x2_hybrid_asm_v2p0 ... skip +validate_keccak_f1600_x2_hybrid_asm_v2p1 ... skip +validate_keccak_f1600_x2_hybrid_asm_v2p2 ... skip +validate_keccak_f1600_x2_hybrid_asm_v2pp0 ... skip +validate_keccak_f1600_x2_hybrid_asm_v2pp1 ... skip +validate_keccak_f1600_x2_hybrid_asm_v2pp2 ... skip +[0|5|25|50|75|95|100] = [( 808) | 808 | 810 |* 811 *| 812 | 813 | ( 814)] (100-th AVGs of keccak_f1600_x1_scalar_C_original) +[0|5|25|50|75|95|100] = [( 754) | 755 | 756 |* 756 *| 757 | 758 | ( 761)] (100-th AVGs of keccak_f1600_x1_scalar_C_v0) +[0|5|25|50|75|95|100] = [( 748) | 748 | 750 |* 751 *| 752 | 754 | ( 754)] (100-th AVGs of keccak_f1600_x1_scalar_C_v1) +[0|5|25|50|75|95|100] = [( 844) | 844 | 844 |* 845 *| 849 | 850 | ( 850)] (100-th AVGs of keccak_f1600_x1_scalar_asm_v1) +[0|5|25|50|75|95|100] = [( 693) | 694 | 696 |* 696 *| 697 | 698 | ( 698)] (100-th AVGs of keccak_f1600_x1_scalar_asm_v2) +[0|5|25|50|75|95|100] = [( 693) | 694 | 694 |* 694 *| 695 | 695 | ( 696)] (100-th AVGs of keccak_f1600_x1_scalar_asm_v3) +[0|5|25|50|75|95|100] = [( 694) | 694 | 695 |* 695 *| 695 | 696 | ( 697)] (100-th AVGs of keccak_f1600_x1_scalar_asm_v4) +[0|5|25|50|75|95|100] = [( 689) | 689 | 690 |* 690 *| 691 | 691 | ( 692)] (100-th AVGs of keccak_f1600_x1_scalar_asm_v5) +[0|5|25|50|75|95|100] = [(1497) | 1497 | 1504 |* 1514 *| 1524 | 1525 | (1527)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2) +benchmark_keccak_f1600_x2_v84a_asm_v1 ... skip +benchmark_keccak_f1600_x2_v84a_asm_v1p0 ... skip +benchmark_keccak_f1600_x4_v84a_asm_v1p0 ... 
skip +[0|5|25|50|75|95|100] = [(1884) | 1885 | 1886 |* 1886 *| 1887 | 1889 | (1891)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2p0) +[0|5|25|50|75|95|100] = [(1835) | 1844 | 1844 |* 1845 *| 1846 | 1848 | (1851)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2p1) +[0|5|25|50|75|95|100] = [(1530) | 1531 | 1533 |* 1534 *| 1535 | 1536 | (1538)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2p2) +[0|5|25|50|75|95|100] = [(1499) | 1503 | 1505 |* 1506 *| 1506 | 1507 | (1508)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2p3) +[0|5|25|50|75|95|100] = [(1516) | 1525 | 1530 |* 1533 *| 1536 | 1538 | (1539)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2p4) +[0|5|25|50|75|95|100] = [(1544) | 1545 | 1549 |* 1553 *| 1554 | 1555 | (1557)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2p5) +[0|5|25|50|75|95|100] = [(1546) | 1551 | 1553 |* 1554 *| 1555 | 1557 | (1558)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2p6) +[0|5|25|50|75|95|100] = [(1391) | 1393 | 1394 |* 1395 *| 1398 | 1406 | (1406)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2pp0) +[0|5|25|50|75|95|100] = [(1334) | 1334 | 1335 |* 1336 *| 1338 | 1338 | (1339)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2pp1) +[0|5|25|50|75|95|100] = [(1313) | 1314 | 1315 |* 1317 *| 1318 | 1319 | (1323)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2pp2) +benchmark_keccak_f1600_x2_v84a_asm_v2pp3 ... skip +[0|5|25|50|75|95|100] = [(1303) | 1303 | 1305 |* 1305 *| 1307 | 1309 | (1312)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2pp4) +[0|5|25|50|75|95|100] = [(1302) | 1306 | 1308 |* 1309 *| 1310 | 1311 | (1315)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2pp5) +benchmark_keccak_f1600_x2_v84a_asm_v2pp6 ... skip +benchmark_keccak_f1600_x2_v84a_asm_v2pp7 ... skip +[0|5|25|50|75|95|100] = [(1358) | 1358 | 1364 |* 1370 *| 1371 | 1373 | (1373)] (100-th AVGs of keccak_f1600_x2_neon_C_cothan) +benchmark_keccak_f1600_x2_bas ... skip +benchmark_keccak_f1600_x2_hybrid_asm_v1 ... skip +benchmark_keccak_f1600_x2_hybrid_asm_v2p0 ... 
skip +benchmark_keccak_f1600_x2_hybrid_asm_v2p1 ... skip +benchmark_keccak_f1600_x2_hybrid_asm_v2p2 ... skip +benchmark_keccak_f1600_x2_hybrid_asm_v2pp0 ... skip +benchmark_keccak_f1600_x2_hybrid_asm_v2pp1 ... skip +benchmark_keccak_f1600_x2_hybrid_asm_v2pp2 ... skip +[0|5|25|50|75|95|100] = [(1501) | 1502 | 1503 |* 1507 *| 1508 | 1518 | (1521)] (100-th AVGs of keccak_f1600_x3_hybrid_asm_v3p) +benchmark_keccak_f1600_x3_hybrid_asm_v6 ... skip +benchmark_keccak_f1600_x3_hybrid_asm_v7 ... skip +benchmark_keccak_f1600_x4_hybrid_asm_v1 ... skip +benchmark_keccak_f1600_x4_hybrid_asm_v2 ... skip +benchmark_keccak_f1600_x4_hybrid_asm_v2p0 ... skip +[0|5|25|50|75|95|100] = [(1512) | 1514 | 1516 |* 1517 *| 1519 | 1520 | (1528)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v3) +[0|5|25|50|75|95|100] = [(1521) | 1522 | 1523 |* 1524 *| 1525 | 1526 | (1531)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v3p) +[0|5|25|50|75|95|100] = [(1510) | 1512 | 1513 |* 1514 *| 1514 | 1517 | (1518)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v3pp) +benchmark_keccak_f1600_x4_hybrid_asm_v4 ... skip +benchmark_keccak_f1600_x4_hybrid_asm_v4p ... skip +[0|5|25|50|75|95|100] = [(1517) | 1520 | 1525 |* 1528 *| 1530 | 1536 | (1539)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v5) +[0|5|25|50|75|95|100] = [(1466) | 1467 | 1468 |* 1469 *| 1469 | 1470 | (1475)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v5p) +benchmark_keccak_f1600_x4_hybrid_asm_v6 ... skip +benchmark_keccak_f1600_x4_hybrid_asm_v7 ... skip +benchmark_keccak_f1600_x4_hybrid_asm_v8 ... skip +[0|5|25|50|75|95|100] = [(2163) | 2163 | 2165 |* 2168 *| 2188 | 2191 | (2194)] (100-th AVGs of keccak_f1600_x5_hybrid_asm_v8) +[0|5|25|50|75|95|100] = [(2158) | 2159 | 2161 |* 2161 *| 2162 | 2164 | (2168)] (100-th AVGs of keccak_f1600_x5_hybrid_asm_v8p) +``` +# Cortex-A78 + +``` +validate_keccak_f1600_x1_scalar_C_v0 ... validate_keccak_f1600_x1_scalar_C_v1 ... validate_keccak_f1600_x1_scalar_asm_v1 ... validate_keccak_f1600_x1_scalar_asm_v2 ... 
validate_keccak_f1600_x1_scalar_asm_v3 ... validate_keccak_f1600_x1_scalar_asm_v4 ... validate_keccak_f1600_x1_scalar_asm_v5 ... validate_keccak_f1600_x2_v84a_asm_v1 ... skip +validate_keccak_f1600_x2_v84a_asm_v1p0 ... skip +validate_keccak_f1600_x4_v84a_asm_v1p0 ... skip +validate_keccak_f1600_x2_v84a_asm_v2 ... validate_keccak_f1600_x2_v84a_asm_v2p0 ... validate_keccak_f1600_x2_v84a_asm_v2p1 ... validate_keccak_f1600_x2_v84a_asm_v2p2 ... validate_keccak_f1600_x2_v84a_asm_v2p3 ... validate_keccak_f1600_x2_v84a_asm_v2p4 ... validate_keccak_f1600_x2_v84a_asm_v2p5 ... validate_keccak_f1600_x2_v84a_asm_v2p6 ... validate_keccak_f1600_x2_v84a_asm_v2pp0 ... validate_keccak_f1600_x2_v84a_asm_v2pp1 ... validate_keccak_f1600_x2_v84a_asm_v2pp2 ... validate_keccak_f1600_x2_v84a_asm_v2pp3 ... skip +validate_keccak_f1600_x2_v84a_asm_v2pp4 ... validate_keccak_f1600_x2_v84a_asm_v2pp5 ... validate_keccak_f1600_x2_v84a_asm_v2pp6 ... skip +validate_keccak_f1600_x2_v84a_asm_v2pp7 ... skip +validate_keccak_f1600_x2_neon_C_cothan ... validate_keccak_f1600_x2_bas ... skip +validate_keccak_f1600_x3_hybrid_asm_v3p ... validate_keccak_f1600_x3_hybrid_asm_v6 ... skip +validate_keccak_f1600_x3_hybrid_asm_v7 ... skip +validate_keccak_f1600_x4_hybrid_asm_v1 ... skip +validate_keccak_f1600_x4_hybrid_asm_v2 ... skip +validate_keccak_f1600_x4_hybrid_asm_v3 ... validate_keccak_f1600_x4_hybrid_asm_v3p ... validate_keccak_f1600_x4_hybrid_asm_v3pp ... validate_keccak_f1600_x4_hybrid_asm_v4 ... skip +validate_keccak_f1600_x4_hybrid_asm_v4p ... skip +validate_keccak_f1600_x4_hybrid_asm_v5 ... validate_keccak_f1600_x4_hybrid_asm_v5p ... validate_keccak_f1600_x4_hybrid_asm_v6 ... skip +validate_keccak_f1600_x4_hybrid_asm_v7 ... skip +validate_keccak_f1600_x4_hybrid_asm_v8 ... skip +validate_keccak_f1600_x5_hybrid_asm_v8 ... validate_keccak_f1600_x2_hybrid_asm_v1 ... skip +validate_keccak_f1600_x2_hybrid_asm_v2p0 ... skip +validate_keccak_f1600_x2_hybrid_asm_v2p1 ... 
skip +validate_keccak_f1600_x2_hybrid_asm_v2p2 ... skip +validate_keccak_f1600_x2_hybrid_asm_v2pp0 ... skip +validate_keccak_f1600_x2_hybrid_asm_v2pp1 ... skip +validate_keccak_f1600_x2_hybrid_asm_v2pp2 ... skip +[0|5|25|50|75|95|100] = [( 817) | 818 | 818 |* 819 *| 819 | 820 | ( 839)] (100-th AVGs of keccak_f1600_x1_scalar_C_original) +[0|5|25|50|75|95|100] = [( 757) | 758 | 759 |* 760 *| 760 | 761 | ( 765)] (100-th AVGs of keccak_f1600_x1_scalar_C_v0) +[0|5|25|50|75|95|100] = [( 755) | 756 | 757 |* 758 *| 758 | 759 | ( 762)] (100-th AVGs of keccak_f1600_x1_scalar_C_v1) +[0|5|25|50|75|95|100] = [( 871) | 873 | 874 |* 874 *| 875 | 875 | ( 878)] (100-th AVGs of keccak_f1600_x1_scalar_asm_v1) +[0|5|25|50|75|95|100] = [( 718) | 718 | 719 |* 719 *| 720 | 720 | ( 725)] (100-th AVGs of keccak_f1600_x1_scalar_asm_v2) +[0|5|25|50|75|95|100] = [( 717) | 717 | 717 |* 718 *| 718 | 719 | ( 720)] (100-th AVGs of keccak_f1600_x1_scalar_asm_v3) +[0|5|25|50|75|95|100] = [( 711) | 712 | 712 |* 712 *| 713 | 714 | ( 717)] (100-th AVGs of keccak_f1600_x1_scalar_asm_v4) +[0|5|25|50|75|95|100] = [( 707) | 708 | 708 |* 709 *| 709 | 710 | ( 715)] (100-th AVGs of keccak_f1600_x1_scalar_asm_v5) +[0|5|25|50|75|95|100] = [(2315) | 2316 | 2316 |* 2316 *| 2316 | 2318 | (2319)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2) +benchmark_keccak_f1600_x2_v84a_asm_v1 ... skip +benchmark_keccak_f1600_x2_v84a_asm_v1p0 ... skip +benchmark_keccak_f1600_x4_v84a_asm_v1p0 ... 
skip +[0|5|25|50|75|95|100] = [(2551) | 2552 | 2553 |* 2554 *| 2555 | 2556 | (2560)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2p0) +[0|5|25|50|75|95|100] = [(2621) | 2621 | 2621 |* 2622 *| 2622 | 2625 | (2630)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2p1) +[0|5|25|50|75|95|100] = [(2344) | 2345 | 2345 |* 2348 *| 2349 | 2349 | (2350)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2p2) +[0|5|25|50|75|95|100] = [(2274) | 2275 | 2276 |* 2280 *| 2281 | 2281 | (2286)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2p3) +[0|5|25|50|75|95|100] = [(2260) | 2260 | 2262 |* 2265 *| 2265 | 2267 | (2269)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2p4) +[0|5|25|50|75|95|100] = [(2189) | 2189 | 2190 |* 2191 *| 2192 | 2193 | (2199)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2p5) +[0|5|25|50|75|95|100] = [(2192) | 2193 | 2193 |* 2194 *| 2195 | 2196 | (2199)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2p6) +[0|5|25|50|75|95|100] = [(2222) | 2222 | 2222 |* 2222 *| 2223 | 2225 | (2226)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2pp0) +[0|5|25|50|75|95|100] = [(2175) | 2175 | 2175 |* 2175 *| 2175 | 2176 | (2180)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2pp1) +[0|5|25|50|75|95|100] = [(2196) | 2197 | 2197 |* 2197 *| 2197 | 2199 | (2203)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2pp2) +benchmark_keccak_f1600_x2_v84a_asm_v2pp3 ... skip +[0|5|25|50|75|95|100] = [(2173) | 2173 | 2173 |* 2174 *| 2174 | 2176 | (2178)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2pp4) +[0|5|25|50|75|95|100] = [(2171) | 2171 | 2172 |* 2173 *| 2173 | 2174 | (2179)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2pp5) +benchmark_keccak_f1600_x2_v84a_asm_v2pp6 ... skip +benchmark_keccak_f1600_x2_v84a_asm_v2pp7 ... skip +[0|5|25|50|75|95|100] = [(2406) | 2406 | 2407 |* 2409 *| 2410 | 2410 | (2411)] (100-th AVGs of keccak_f1600_x2_neon_C_cothan) +benchmark_keccak_f1600_x2_bas ... skip +benchmark_keccak_f1600_x2_hybrid_asm_v1 ... skip +benchmark_keccak_f1600_x2_hybrid_asm_v2p0 ... 
skip +benchmark_keccak_f1600_x2_hybrid_asm_v2p1 ... skip +benchmark_keccak_f1600_x2_hybrid_asm_v2p2 ... skip +benchmark_keccak_f1600_x2_hybrid_asm_v2pp0 ... skip +benchmark_keccak_f1600_x2_hybrid_asm_v2pp1 ... skip +benchmark_keccak_f1600_x2_hybrid_asm_v2pp2 ... skip +[0|5|25|50|75|95|100] = [(2267) | 2268 | 2269 |* 2270 *| 2271 | 2272 | (2274)] (100-th AVGs of keccak_f1600_x3_hybrid_asm_v3p) +benchmark_keccak_f1600_x3_hybrid_asm_v6 ... skip +benchmark_keccak_f1600_x3_hybrid_asm_v7 ... skip +benchmark_keccak_f1600_x4_hybrid_asm_v1 ... skip +benchmark_keccak_f1600_x4_hybrid_asm_v2 ... skip +benchmark_keccak_f1600_x4_hybrid_asm_v2p0 ... skip +[0|5|25|50|75|95|100] = [(2224) | 2225 | 2227 |* 2227 *| 2229 | 2232 | (2244)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v3) +[0|5|25|50|75|95|100] = [(2198) | 2199 | 2200 |* 2201 *| 2201 | 2208 | (2217)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v3p) +[0|5|25|50|75|95|100] = [(2200) | 2202 | 2203 |* 2205 *| 2206 | 2212 | (2220)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v3pp) +benchmark_keccak_f1600_x4_hybrid_asm_v4 ... skip +benchmark_keccak_f1600_x4_hybrid_asm_v4p ... skip +[0|5|25|50|75|95|100] = [(2206) | 2208 | 2211 |* 2213 *| 2215 | 2220 | (2227)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v5) +[0|5|25|50|75|95|100] = [(2148) | 2150 | 2151 |* 2151 *| 2152 | 2158 | (2161)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v5p) +benchmark_keccak_f1600_x4_hybrid_asm_v6 ... skip +benchmark_keccak_f1600_x4_hybrid_asm_v7 ... skip +benchmark_keccak_f1600_x4_hybrid_asm_v8 ... skip +[0|5|25|50|75|95|100] = [(2291) | 2293 | 2296 |* 2300 *| 2305 | 2325 | (2341)] (100-th AVGs of keccak_f1600_x5_hybrid_asm_v8) +[0|5|25|50|75|95|100] = [(2188) | 2188 | 2190 |* 2191 *| 2192 | 2202 | (2213)] (100-th AVGs of keccak_f1600_x5_hybrid_asm_v8p) +``` + +# Cortex-A55 +``` +validate_keccak_f1600_x1_scalar_C_v0 ... validate_keccak_f1600_x1_scalar_C_v1 ... validate_keccak_f1600_x1_scalar_asm_v1 ... validate_keccak_f1600_x1_scalar_asm_v2 ... 
validate_keccak_f1600_x1_scalar_asm_v3 ... validate_keccak_f1600_x1_scalar_asm_v4 ... validate_keccak_f1600_x1_scalar_asm_v5 ... validate_keccak_f1600_x2_v84a_asm_v1 ... skip +validate_keccak_f1600_x2_v84a_asm_v1p0 ... skip +validate_keccak_f1600_x4_v84a_asm_v1p0 ... skip +validate_keccak_f1600_x2_v84a_asm_v2 ... validate_keccak_f1600_x2_v84a_asm_v2p0 ... validate_keccak_f1600_x2_v84a_asm_v2p1 ... validate_keccak_f1600_x2_v84a_asm_v2p2 ... validate_keccak_f1600_x2_v84a_asm_v2p3 ... validate_keccak_f1600_x2_v84a_asm_v2p4 ... validate_keccak_f1600_x2_v84a_asm_v2p5 ... validate_keccak_f1600_x2_v84a_asm_v2p6 ... validate_keccak_f1600_x2_v84a_asm_v2pp0 ... validate_keccak_f1600_x2_v84a_asm_v2pp1 ... validate_keccak_f1600_x2_v84a_asm_v2pp2 ... validate_keccak_f1600_x2_v84a_asm_v2pp3 ... skip +validate_keccak_f1600_x2_v84a_asm_v2pp4 ... validate_keccak_f1600_x2_v84a_asm_v2pp5 ... validate_keccak_f1600_x2_v84a_asm_v2pp6 ... skip +validate_keccak_f1600_x2_v84a_asm_v2pp7 ... skip +validate_keccak_f1600_x2_neon_C_cothan ... validate_keccak_f1600_x2_bas ... skip +validate_keccak_f1600_x3_hybrid_asm_v3p ... validate_keccak_f1600_x3_hybrid_asm_v6 ... skip +validate_keccak_f1600_x3_hybrid_asm_v7 ... skip +validate_keccak_f1600_x4_hybrid_asm_v1 ... skip +validate_keccak_f1600_x4_hybrid_asm_v2 ... skip +validate_keccak_f1600_x4_hybrid_asm_v3 ... validate_keccak_f1600_x4_hybrid_asm_v3p ... validate_keccak_f1600_x4_hybrid_asm_v3pp ... validate_keccak_f1600_x4_hybrid_asm_v4 ... skip +validate_keccak_f1600_x4_hybrid_asm_v4p ... skip +validate_keccak_f1600_x4_hybrid_asm_v5 ... validate_keccak_f1600_x4_hybrid_asm_v5p ... validate_keccak_f1600_x4_hybrid_asm_v6 ... skip +validate_keccak_f1600_x4_hybrid_asm_v7 ... skip +validate_keccak_f1600_x4_hybrid_asm_v8 ... skip +validate_keccak_f1600_x5_hybrid_asm_v8 ... validate_keccak_f1600_x2_hybrid_asm_v1 ... skip +validate_keccak_f1600_x2_hybrid_asm_v2p0 ... skip +validate_keccak_f1600_x2_hybrid_asm_v2p1 ... 
skip +validate_keccak_f1600_x2_hybrid_asm_v2p2 ... skip +validate_keccak_f1600_x2_hybrid_asm_v2pp0 ... skip +validate_keccak_f1600_x2_hybrid_asm_v2pp1 ... skip +validate_keccak_f1600_x2_hybrid_asm_v2pp2 ... skip +[0|5|25|50|75|95|100] = [(1934) | 1935 | 1935 |* 1935 *| 1935 | 1941 | (1988)] (100-th AVGs of keccak_f1600_x1_scalar_C_original) +[0|5|25|50|75|95|100] = [(2527) | 2528 | 2528 |* 2528 *| 2528 | 2531 | (2552)] (100-th AVGs of keccak_f1600_x1_scalar_C_v0) +[0|5|25|50|75|95|100] = [(2471) | 2471 | 2471 |* 2471 *| 2471 | 2474 | (2495)] (100-th AVGs of keccak_f1600_x1_scalar_C_v1) +[0|5|25|50|75|95|100] = [(1833) | 1833 | 1833 |* 1833 *| 1833 | 1835 | (1852)] (100-th AVGs of keccak_f1600_x1_scalar_asm_v1) +[0|5|25|50|75|95|100] = [(1492) | 1492 | 1493 |* 1493 *| 1493 | 1494 | (1521)] (100-th AVGs of keccak_f1600_x1_scalar_asm_v2) +[0|5|25|50|75|95|100] = [(1487) | 1487 | 1487 |* 1487 *| 1487 | 1491 | (1526)] (100-th AVGs of keccak_f1600_x1_scalar_asm_v3) +[0|5|25|50|75|95|100] = [(1487) | 1487 | 1487 |* 1487 *| 1487 | 1489 | (1506)] (100-th AVGs of keccak_f1600_x1_scalar_asm_v4) +[0|5|25|50|75|95|100] = [(1418) | 1418 | 1418 |* 1418 *| 1418 | 1422 | (1441)] (100-th AVGs of keccak_f1600_x1_scalar_asm_v5) +[0|5|25|50|75|95|100] = [(4559) | 4559 | 4559 |* 4560 *| 4560 | 4577 | (4580)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2) +benchmark_keccak_f1600_x2_v84a_asm_v1 ... skip +benchmark_keccak_f1600_x2_v84a_asm_v1p0 ... skip +benchmark_keccak_f1600_x4_v84a_asm_v1p0 ... 
skip +[0|5|25|50|75|95|100] = [(8435) | 8435 | 8435 |* 8436 *| 8436 | 8460 | (8465)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2p0) +[0|5|25|50|75|95|100] = [(8390) | 8390 | 8390 |* 8390 *| 8392 | 8432 | (8439)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2p1) +[0|5|25|50|75|95|100] = [(6969) | 6969 | 6970 |* 6970 *| 6972 | 7010 | (7022)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2p2) +[0|5|25|50|75|95|100] = [(6659) | 6659 | 6659 |* 6660 *| 6663 | 6698 | (6717)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2p3) +[0|5|25|50|75|95|100] = [(6434) | 6435 | 6435 |* 6435 *| 6436 | 6474 | (6476)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2p4) +[0|5|25|50|75|95|100] = [(6422) | 6423 | 6423 |* 6423 *| 6424 | 6454 | (6457)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2p5) +[0|5|25|50|75|95|100] = [(6422) | 6422 | 6423 |* 6423 *| 6424 | 6458 | (6463)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2p6) +[0|5|25|50|75|95|100] = [(4736) | 4736 | 4736 |* 4737 *| 4737 | 4768 | (4793)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2pp0) +[0|5|25|50|75|95|100] = [(4808) | 4808 | 4808 |* 4808 *| 4809 | 4839 | (4853)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2pp1) +[0|5|25|50|75|95|100] = [(4904) | 4904 | 4905 |* 4905 *| 4905 | 4937 | (4940)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2pp2) +benchmark_keccak_f1600_x2_v84a_asm_v2pp3 ... skip +[0|5|25|50|75|95|100] = [(4832) | 4832 | 4833 |* 4833 *| 4833 | 4870 | (4872)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2pp4) +[0|5|25|50|75|95|100] = [(5049) | 5049 | 5050 |* 5050 *| 5051 | 5086 | (5090)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2pp5) +benchmark_keccak_f1600_x2_v84a_asm_v2pp6 ... skip +benchmark_keccak_f1600_x2_v84a_asm_v2pp7 ... skip +[0|5|25|50|75|95|100] = [(5221) | 5222 | 5222 |* 5222 *| 5223 | 5248 | (5252)] (100-th AVGs of keccak_f1600_x2_neon_C_cothan) +benchmark_keccak_f1600_x2_bas ... skip +benchmark_keccak_f1600_x2_hybrid_asm_v1 ... skip +benchmark_keccak_f1600_x2_hybrid_asm_v2p0 ... 
skip +benchmark_keccak_f1600_x2_hybrid_asm_v2p1 ... skip +benchmark_keccak_f1600_x2_hybrid_asm_v2p2 ... skip +benchmark_keccak_f1600_x2_hybrid_asm_v2pp0 ... skip +benchmark_keccak_f1600_x2_hybrid_asm_v2pp1 ... skip +benchmark_keccak_f1600_x2_hybrid_asm_v2pp2 ... skip +[0|5|25|50|75|95|100] = [(5762) | 5762 | 5762 |* 5763 *| 5763 | 5808 | (5817)] (100-th AVGs of keccak_f1600_x3_hybrid_asm_v3p) +benchmark_keccak_f1600_x3_hybrid_asm_v6 ... skip +benchmark_keccak_f1600_x3_hybrid_asm_v7 ... skip +benchmark_keccak_f1600_x4_hybrid_asm_v1 ... skip +benchmark_keccak_f1600_x4_hybrid_asm_v2 ... skip +benchmark_keccak_f1600_x4_hybrid_asm_v2p0 ... skip +[0|5|25|50|75|95|100] = [(7240) | 7240 | 7241 |* 7241 *| 7244 | 7300 | (7307)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v3) +[0|5|25|50|75|95|100] = [(7285) | 7286 | 7287 |* 7288 *| 7292 | 7365 | (7371)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v3p) +[0|5|25|50|75|95|100] = [(7295) | 7296 | 7296 |* 7296 *| 7298 | 7336 | (7345)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v3pp) +benchmark_keccak_f1600_x4_hybrid_asm_v4 ... skip +benchmark_keccak_f1600_x4_hybrid_asm_v4p ... skip +[0|5|25|50|75|95|100] = [(7649) | 7650 | 7650 |* 7651 *| 7657 | 7738 | (7746)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v5) +[0|5|25|50|75|95|100] = [(7652) | 7652 | 7653 |* 7653 *| 7655 | 7713 | (7737)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v5p) +benchmark_keccak_f1600_x4_hybrid_asm_v6 ... skip +benchmark_keccak_f1600_x4_hybrid_asm_v7 ... skip +benchmark_keccak_f1600_x4_hybrid_asm_v8 ... 
skip +[0|5|25|50|75|95|100] = [(8896) | 8897 | 8898 |* 8899 *| 8905 | 8960 | (8967)] (100-th AVGs of keccak_f1600_x5_hybrid_asm_v8) +[0|5|25|50|75|95|100] = [(8959) | 8959 | 8959 |* 8960 *| 8964 | 9006 | (9012)] (100-th AVGs of keccak_f1600_x5_hybrid_asm_v8p) +``` +# Cortex-X2 + +``` +[0|5|25|50|75|95|100] = [( 815) | 816 | 817 |* 817 *| 817 | 818 | ( 819)] (100-th AVGs of keccak_f1600_x1_scalar_C_original) +[0|5|25|50|75|95|100] = [( 748) | 749 | 749 |* 750 *| 750 | 751 | ( 752)] (100-th AVGs of keccak_f1600_x1_scalar_C_v0) +[0|5|25|50|75|95|100] = [( 759) | 760 | 760 |* 761 *| 764 | 767 | ( 793)] (100-th AVGs of keccak_f1600_x1_scalar_C_v1) +[0|5|25|50|75|95|100] = [( 834) | 836 | 837 |* 838 *| 839 | 839 | ( 854)] (100-th AVGs of keccak_f1600_x1_scalar_asm_v1) +[0|5|25|50|75|95|100] = [( 689) | 690 | 690 |* 690 *| 691 | 691 | ( 693)] (100-th AVGs of keccak_f1600_x1_scalar_asm_v2) +[0|5|25|50|75|95|100] = [( 692) | 692 | 692 |* 693 *| 693 | 693 | ( 694)] (100-th AVGs of keccak_f1600_x1_scalar_asm_v3) +[0|5|25|50|75|95|100] = [( 692) | 693 | 693 |* 694 *| 694 | 695 | ( 740)] (100-th AVGs of keccak_f1600_x1_scalar_asm_v4) +[0|5|25|50|75|95|100] = [( 686) | 687 | 687 |* 687 *| 688 | 688 | ( 690)] (100-th AVGs of keccak_f1600_x1_scalar_asm_v5) +[0|5|25|50|75|95|100] = [(1629) | 1630 | 1634 |* 1635 *| 1638 | 1640 | (1644)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2) +[0|5|25|50|75|95|100] = [(1546) | 1547 | 1547 |* 1547 *| 1547 | 1547 | (1551)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v1) +[0|5|25|50|75|95|100] = [(1572) | 1572 | 1572 |* 1572 *| 1572 | 1573 | (1574)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v1p0) +[0|5|25|50|75|95|100] = [(3146) | 3146 | 3146 |* 3146 *| 3146 | 3148 | (3172)] (100-th AVGs of keccak_f1600_x4_v84a_asm_v1p0) +[0|5|25|50|75|95|100] = [(1746) | 1748 | 1749 |* 1749 *| 1750 | 1752 | (1774)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2p0) +[0|5|25|50|75|95|100] = [(1713) | 1714 | 1715 |* 1716 *| 1717 | 1718 | (1719)] (100-th AVGs of 
keccak_f1600_x2_v84a_asm_v2p1) +[0|5|25|50|75|95|100] = [(1483) | 1484 | 1486 |* 1487 *| 1488 | 1489 | (1513)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2p2) +[0|5|25|50|75|95|100] = [(1431) | 1432 | 1433 |* 1434 *| 1435 | 1436 | (1437)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2p3) +[0|5|25|50|75|95|100] = [(1483) | 1484 | 1485 |* 1486 *| 1487 | 1489 | (1516)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2p4) +[0|5|25|50|75|95|100] = [(1503) | 1504 | 1506 |* 1507 *| 1508 | 1510 | (1510)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2p5) +[0|5|25|50|75|95|100] = [(1494) | 1495 | 1496 |* 1497 *| 1498 | 1500 | (1523)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2p6) +[0|5|25|50|75|95|100] = [(1404) | 1417 | 1422 |* 1431 *| 1440 | 1447 | (1450)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2pp0) +[0|5|25|50|75|95|100] = [(1324) | 1325 | 1326 |* 1327 *| 1328 | 1329 | (1329)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2pp1) +[0|5|25|50|75|95|100] = [(1272) | 1273 | 1273 |* 1274 *| 1275 | 1277 | (1303)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2pp2) +[0|5|25|50|75|95|100] = [(1253) | 1254 | 1254 |* 1255 *| 1256 | 1257 | (1257)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2pp3) +[0|5|25|50|75|95|100] = [(1263) | 1265 | 1267 |* 1268 *| 1268 | 1270 | (1293)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2pp4) +[0|5|25|50|75|95|100] = [(1245) | 1262 | 1263 |* 1264 *| 1265 | 1266 | (1267)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2pp5) +[0|5|25|50|75|95|100] = [(1251) | 1252 | 1253 |* 1253 *| 1254 | 1255 | (1256)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2pp6) +[0|5|25|50|75|95|100] = [(1254) | 1254 | 1255 |* 1256 *| 1256 | 1260 | (1279)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2pp7) +[0|5|25|50|75|95|100] = [(1323) | 1324 | 1324 |* 1325 *| 1326 | 1327 | (1327)] (100-th AVGs of keccak_f1600_x2_neon_C_cothan) +[0|5|25|50|75|95|100] = [(1547) | 1547 | 1547 |* 1547 *| 1547 | 1547 | (1565)] (100-th AVGs of keccak_f1600_x2_bas) +[0|5|25|50|75|95|100] = [(1009) | 1009 | 1010 |* 1010 *| 1011 | 
1016 | (1016)] (100-th AVGs of keccak_f1600_x2_hybrid_asm_v1) +[0|5|25|50|75|95|100] = [(1054) | 1054 | 1055 |* 1055 *| 1056 | 1056 | (1056)] (100-th AVGs of keccak_f1600_x2_hybrid_asm_v2p0) +[0|5|25|50|75|95|100] = [( 896) | 899 | 902 |* 903 *| 904 | 907 | ( 942)] (100-th AVGs of keccak_f1600_x2_hybrid_asm_v2p1) +[0|5|25|50|75|95|100] = [( 896) | 897 | 898 |* 899 *| 901 | 904 | ( 906)] (100-th AVGs of keccak_f1600_x2_hybrid_asm_v2p2) +[0|5|25|50|75|95|100] = [( 941) | 942 | 944 |* 944 *| 947 | 950 | ( 955)] (100-th AVGs of keccak_f1600_x2_hybrid_asm_v2pp0) +[0|5|25|50|75|95|100] = [(1227) | 1227 | 1227 |* 1228 *| 1228 | 1228 | (1229)] (100-th AVGs of keccak_f1600_x2_hybrid_asm_v2pp1) +[0|5|25|50|75|95|100] = [( 938) | 942 | 943 |* 944 *| 947 | 949 | ( 966)] (100-th AVGs of keccak_f1600_x2_hybrid_asm_v2pp2) +[0|5|25|50|75|95|100] = [(1437) | 1438 | 1440 |* 1442 *| 1444 | 1444 | (1444)] (100-th AVGs of keccak_f1600_x3_hybrid_asm_v3p) +[0|5|25|50|75|95|100] = [( 977) | 983 | 984 |* 985 *| 985 | 986 | ( 990)] (100-th AVGs of keccak_f1600_x3_hybrid_asm_v6) +[0|5|25|50|75|95|100] = [(1660) | 1660 | 1660 |* 1660 *| 1660 | 1660 | (1682)] (100-th AVGs of keccak_f1600_x3_hybrid_asm_v7) +[0|5|25|50|75|95|100] = [(2017) | 2018 | 2018 |* 2019 *| 2019 | 2020 | (2021)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v1) +[0|5|25|50|75|95|100] = [(1551) | 1551 | 1551 |* 1551 *| 1551 | 1551 | (1577)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v2) +[0|5|25|50|75|95|100] = [(1577) | 1577 | 1577 |* 1577 *| 1577 | 1577 | (1578)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v2p0) +[0|5|25|50|75|95|100] = [(1437) | 1440 | 1442 |* 1442 *| 1443 | 1444 | (1484)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v3) +[0|5|25|50|75|95|100] = [(1462) | 1466 | 1468 |* 1469 *| 1470 | 1472 | (1536)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v3p) +[0|5|25|50|75|95|100] = [(1465) | 1466 | 1467 |* 1468 *| 1469 | 1470 | (1474)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v3pp) +[0|5|25|50|75|95|100] = [(1437) | 
1437 | 1438 |* 1439 *| 1439 | 1441 | (1474)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v4) +[0|5|25|50|75|95|100] = [(1439) | 1440 | 1441 |* 1442 *| 1443 | 1444 | (1445)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v4p) +[0|5|25|50|75|95|100] = [(1479) | 1480 | 1481 |* 1483 *| 1484 | 1487 | (1488)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v5) +[0|5|25|50|75|95|100] = [(1453) | 1454 | 1455 |* 1456 *| 1456 | 1457 | (1497)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v5p) +[0|5|25|50|75|95|100] = [(1455) | 1455 | 1456 |* 1457 *| 1458 | 1459 | (1481)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v6) +[0|5|25|50|75|95|100] = [(1628) | 1629 | 1630 |* 1631 *| 1632 | 1633 | (1662)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v7) +[0|5|25|50|75|95|100] = [(1445) | 1446 | 1446 |* 1447 *| 1448 | 1449 | (1456)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v8) +[0|5|25|50|75|95|100] = [(2734) | 2736 | 2737 |* 2738 *| 2738 | 2740 | (2778)] (100-th AVGs of keccak_f1600_x4_scalar_asm_v5) +[0|5|25|50|75|95|100] = [(2166) | 2169 | 2171 |* 2172 *| 2174 | 2180 | (2197)] (100-th AVGs of keccak_f1600_x5_hybrid_asm_v8) +[0|5|25|50|75|95|100] = [(2149) | 2150 | 2151 |* 2152 *| 2160 | 2167 | (2169)] (100-th AVGs of keccak_f1600_x5_hybrid_asm_v8p) +``` + +# Cortex-A710 + +``` +[0|5|25|50|75|95|100] = [( 818) | 819 | 819 |* 820 *| 820 | 822 | ( 843)] (100-th AVGs of keccak_f1600_x1_scalar_C_original) +[0|5|25|50|75|95|100] = [( 755) | 755 | 756 |* 756 *| 757 | 758 | ( 762)] (100-th AVGs of keccak_f1600_x1_scalar_C_v0) +[0|5|25|50|75|95|100] = [( 753) | 753 | 754 |* 755 *| 755 | 756 | ( 782)] (100-th AVGs of keccak_f1600_x1_scalar_C_v1) +[0|5|25|50|75|95|100] = [( 853) | 856 | 858 |* 859 *| 859 | 860 | ( 864)] (100-th AVGs of keccak_f1600_x1_scalar_asm_v1) +[0|5|25|50|75|95|100] = [( 707) | 707 | 707 |* 708 *| 708 | 708 | ( 741)] (100-th AVGs of keccak_f1600_x1_scalar_asm_v2) +[0|5|25|50|75|95|100] = [( 708) | 708 | 708 |* 709 *| 709 | 709 | ( 733)] (100-th AVGs of keccak_f1600_x1_scalar_asm_v3) 
+[0|5|25|50|75|95|100] = [( 705) | 705 | 706 |* 706 *| 706 | 707 | ( 710)] (100-th AVGs of keccak_f1600_x1_scalar_asm_v4) +[0|5|25|50|75|95|100] = [( 700) | 700 | 701 |* 701 *| 701 | 702 | ( 727)] (100-th AVGs of keccak_f1600_x1_scalar_asm_v5) +[0|5|25|50|75|95|100] = [(2309) | 2309 | 2309 |* 2309 *| 2309 | 2313 | (2315)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2) +[0|5|25|50|75|95|100] = [(1549) | 1549 | 1549 |* 1549 *| 1549 | 1550 | (1555)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v1) +[0|5|25|50|75|95|100] = [(1604) | 1605 | 1605 |* 1605 *| 1606 | 1606 | (1614)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v1p0) +[0|5|25|50|75|95|100] = [(3209) | 3210 | 3230 |* 3231 *| 3231 | 3231 | (3233)] (100-th AVGs of keccak_f1600_x4_v84a_asm_v1p0) +[0|5|25|50|75|95|100] = [(2556) | 2559 | 2570 |* 2572 *| 2573 | 2574 | (2579)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2p0) +[0|5|25|50|75|95|100] = [(2622) | 2622 | 2623 |* 2624 *| 2625 | 2627 | (2641)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2p1) +[0|5|25|50|75|95|100] = [(2340) | 2341 | 2341 |* 2341 *| 2342 | 2343 | (2348)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2p2) +[0|5|25|50|75|95|100] = [(2275) | 2275 | 2275 |* 2276 *| 2277 | 2279 | (2281)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2p3) +[0|5|25|50|75|95|100] = [(2279) | 2281 | 2285 |* 2286 *| 2287 | 2288 | (2291)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2p4) +[0|5|25|50|75|95|100] = [(2209) | 2210 | 2213 |* 2214 *| 2215 | 2216 | (2218)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2p5) +[0|5|25|50|75|95|100] = [(2194) | 2197 | 2199 |* 2200 *| 2202 | 2203 | (2206)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2p6) +[0|5|25|50|75|95|100] = [(2221) | 2222 | 2223 |* 2223 *| 2225 | 2225 | (2242)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2pp0) +[0|5|25|50|75|95|100] = [(2176) | 2176 | 2176 |* 2176 *| 2177 | 2177 | (2182)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2pp1) +[0|5|25|50|75|95|100] = [(2196) | 2196 | 2196 |* 2196 *| 2196 | 2197 | (2201)] (100-th AVGs of 
keccak_f1600_x2_v84a_asm_v2pp2) +[0|5|25|50|75|95|100] = [(2055) | 2056 | 2056 |* 2057 *| 2058 | 2059 | (2061)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2pp3) +[0|5|25|50|75|95|100] = [(2172) | 2173 | 2173 |* 2173 *| 2174 | 2174 | (2179)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2pp4) +[0|5|25|50|75|95|100] = [(2172) | 2172 | 2172 |* 2172 *| 2173 | 2173 | (2178)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2pp5) +[0|5|25|50|75|95|100] = [(2041) | 2042 | 2043 |* 2044 *| 2045 | 2046 | (2052)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2pp6) +[0|5|25|50|75|95|100] = [(2075) | 2076 | 2076 |* 2077 *| 2078 | 2079 | (2081)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2pp7) +[0|5|25|50|75|95|100] = [(2388) | 2388 | 2389 |* 2391 *| 2391 | 2393 | (2395)] (100-th AVGs of keccak_f1600_x2_neon_C_cothan) +[0|5|25|50|75|95|100] = [(1549) | 1549 | 1550 |* 1550 *| 1550 | 1550 | (1553)] (100-th AVGs of keccak_f1600_x2_bas) +[0|5|25|50|75|95|100] = [(1539) | 1539 | 1539 |* 1539 *| 1539 | 1553 | (1553)] (100-th AVGs of keccak_f1600_x2_hybrid_asm_v1) +[0|5|25|50|75|95|100] = [(1594) | 1596 | 1598 |* 1599 *| 1605 | 1655 | (1689)] (100-th AVGs of keccak_f1600_x2_hybrid_asm_v2p0) +[0|5|25|50|75|95|100] = [(1379) | 1380 | 1382 |* 1382 *| 1383 | 1384 | (1385)] (100-th AVGs of keccak_f1600_x2_hybrid_asm_v2p1) +[0|5|25|50|75|95|100] = [(1377) | 1378 | 1379 |* 1381 *| 1382 | 1383 | (1384)] (100-th AVGs of keccak_f1600_x2_hybrid_asm_v2p2) +[0|5|25|50|75|95|100] = [(1499) | 1500 | 1500 |* 1502 *| 1502 | 1503 | (1507)] (100-th AVGs of keccak_f1600_x2_hybrid_asm_v2pp0) +[0|5|25|50|75|95|100] = [(1383) | 1383 | 1384 |* 1384 *| 1384 | 1385 | (1387)] (100-th AVGs of keccak_f1600_x2_hybrid_asm_v2pp1) +[0|5|25|50|75|95|100] = [(1499) | 1500 | 1500 |* 1502 *| 1502 | 1503 | (1511)] (100-th AVGs of keccak_f1600_x2_hybrid_asm_v2pp2) +[0|5|25|50|75|95|100] = [(2167) | 2168 | 2170 |* 2171 *| 2171 | 2173 | (2175)] (100-th AVGs of keccak_f1600_x3_hybrid_asm_v3p) +[0|5|25|50|75|95|100] = [(1531) | 1531 | 1532 |* 
1532 *| 1532 | 1533 | (1550)] (100-th AVGs of keccak_f1600_x3_hybrid_asm_v6) +[0|5|25|50|75|95|100] = [(1663) | 1664 | 1665 |* 1665 *| 1666 | 1667 | (1668)] (100-th AVGs of keccak_f1600_x3_hybrid_asm_v7) +[0|5|25|50|75|95|100] = [(2413) | 2413 | 2414 |* 2414 *| 2415 | 2416 | (2421)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v1) +[0|5|25|50|75|95|100] = [(1607) | 1607 | 1608 |* 1608 *| 1609 | 1610 | (1623)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v2) +[0|5|25|50|75|95|100] = [(1652) | 1653 | 1654 |* 1654 *| 1655 | 1657 | (1672)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v2p0) +[0|5|25|50|75|95|100] = [(2236) | 2238 | 2239 |* 2241 *| 2244 | 2247 | (2263)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v3) +[0|5|25|50|75|95|100] = [(2210) | 2223 | 2227 |* 2229 *| 2230 | 2234 | (2242)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v3p) +[0|5|25|50|75|95|100] = [(2186) | 2186 | 2186 |* 2187 *| 2188 | 2191 | (2205)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v3pp) +[0|5|25|50|75|95|100] = [(1751) | 1752 | 1753 |* 1755 *| 1756 | 1759 | (1773)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v4) +[0|5|25|50|75|95|100] = [(1718) | 1718 | 1718 |* 1718 *| 1718 | 1719 | (1738)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v4p) +[0|5|25|50|75|95|100] = [(2234) | 2234 | 2235 |* 2235 *| 2236 | 2239 | (2263)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v5) +[0|5|25|50|75|95|100] = [(2129) | 2133 | 2142 |* 2152 *| 2156 | 2159 | (2169)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v5p) +[0|5|25|50|75|95|100] = [(1856) | 1857 | 1857 |* 1858 *| 1858 | 1859 | (1876)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v6) +[0|5|25|50|75|95|100] = [(1738) | 1739 | 1740 |* 1740 *| 1741 | 1743 | (1751)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v7) +[0|5|25|50|75|95|100] = [(1737) | 1738 | 1738 |* 1739 *| 1739 | 1740 | (1753)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v8) +[0|5|25|50|75|95|100] = [(2803) | 2804 | 2805 |* 2806 *| 2807 | 2808 | (2847)] (100-th AVGs of keccak_f1600_x4_scalar_asm_v5) 
+[0|5|25|50|75|95|100] = [(2643) | 2649 | 2651 |* 2652 *| 2654 | 2656 | (2659)] (100-th AVGs of keccak_f1600_x5_hybrid_asm_v8) +[0|5|25|50|75|95|100] = [(2506) | 2511 | 2532 |* 2535 *| 2538 | 2542 | (2559)] (100-th AVGs of keccak_f1600_x5_hybrid_asm_v8p) +``` + +# Cortex-A510 + +``` +[0|5|25|50|75|95|100] = [(1375) | 1375 | 1375 |* 1375 *| 1376 | 1376 | (1392)] (100-th AVGs of keccak_f1600_x1_scalar_C_original) +[0|5|25|50|75|95|100] = [(2031) | 2031 | 2032 |* 2032 *| 2032 | 2034 | (2041)] (100-th AVGs of keccak_f1600_x1_scalar_C_v0) +[0|5|25|50|75|95|100] = [(1952) | 1952 | 1953 |* 1953 *| 1953 | 1956 | (1964)] (100-th AVGs of keccak_f1600_x1_scalar_C_v1) +[0|5|25|50|75|95|100] = [(1470) | 1470 | 1470 |* 1479 *| 1479 | 1484 | (1492)] (100-th AVGs of keccak_f1600_x1_scalar_asm_v1) +[0|5|25|50|75|95|100] = [(1231) | 1231 | 1231 |* 1232 *| 1232 | 1235 | (1238)] (100-th AVGs of keccak_f1600_x1_scalar_asm_v2) +[0|5|25|50|75|95|100] = [(1218) | 1218 | 1219 |* 1227 *| 1228 | 1228 | (1234)] (100-th AVGs of keccak_f1600_x1_scalar_asm_v3) +[0|5|25|50|75|95|100] = [(1085) | 1220 | 1221 |* 1223 *| 1225 | 1317 | (1323)] (100-th AVGs of keccak_f1600_x1_scalar_asm_v4) +[0|5|25|50|75|95|100] = [( 967) | 968 | 968 |* 968 *| 969 | 971 | ( 976)] (100-th AVGs of keccak_f1600_x1_scalar_asm_v5) +[0|5|25|50|75|95|100] = [(6558) | 6558 | 6558 |* 6558 *| 6576 | 7851 | (7857)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2) +[0|5|25|50|75|95|100] = [(1983) | 1983 | 1983 |* 1983 *| 1983 | 1984 | (1995)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v1) +[0|5|25|50|75|95|100] = [(1137) | 1144 | 1144 |* 1144 *| 1144 | 1145 | (1149)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v1p0) +[0|5|25|50|75|95|100] = [(2320) | 2320 | 2321 |* 2321 *| 2321 | 2326 | (2327)] (100-th AVGs of keccak_f1600_x4_v84a_asm_v1p0) +[0|5|25|50|75|95|100] = [(10231) | 10231 | 10231 |* 10231 *| 10236 | 10245 | (10256)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2p0) +[0|5|25|50|75|95|100] = [(10145) | 10145 | 10146 |* 10146 *| 
10146 | 10152 | (10164)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2p1) +[0|5|25|50|75|95|100] = [(8598) | 8598 | 8598 |* 8598 *| 8598 | 8605 | (8609)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2p2) +[0|5|25|50|75|95|100] = [(8742) | 8742 | 8743 |* 8743 *| 8743 | 8751 | (8754)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2p3) +[0|5|25|50|75|95|100] = [(7920) | 7920 | 7921 |* 7921 *| 7921 | 7928 | (7933)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2p4) +[0|5|25|50|75|95|100] = [(8281) | 8281 | 8282 |* 8282 *| 8282 | 8287 | (8297)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2p5) +[0|5|25|50|75|95|100] = [(8222) | 8222 | 8223 |* 8223 *| 8227 | 8230 | (8233)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2p6) +[0|5|25|50|75|95|100] = [(6648) | 6648 | 6657 |* 6657 *| 6658 | 6663 | (6667)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2pp0) +[0|5|25|50|75|95|100] = [(6841) | 6842 | 6842 |* 6847 *| 6847 | 6853 | (6860)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2pp1) +[0|5|25|50|75|95|100] = [(6970) | 6970 | 6970 |* 6970 *| 6970 | 6977 | (6994)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2pp2) +[0|5|25|50|75|95|100] = [(6995) | 6995 | 6995 |* 6995 *| 6995 | 7001 | (7009)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2pp3) +[0|5|25|50|75|95|100] = [(6884) | 6884 | 6885 |* 6885 *| 6885 | 6888 | (6893)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2pp4) +[0|5|25|50|75|95|100] = [(7177) | 7177 | 7177 |* 7177 *| 7177 | 7184 | (7199)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2pp5) +[0|5|25|50|75|95|100] = [(7389) | 7389 | 7389 |* 7389 *| 7390 | 7399 | (7417)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2pp6) +[0|5|25|50|75|95|100] = [(7102) | 7108 | 7109 |* 7109 *| 7109 | 7114 | (7133)] (100-th AVGs of keccak_f1600_x2_v84a_asm_v2pp7) +[0|5|25|50|75|95|100] = [(3386) | 3386 | 3387 |* 3397 *| 3397 | 3401 | (3405)] (100-th AVGs of keccak_f1600_x2_neon_C_cothan) +[0|5|25|50|75|95|100] = [(2267) | 2267 | 2267 |* 2268 *| 2268 | 2270 | (2295)] (100-th AVGs of keccak_f1600_x2_bas) +[0|5|25|50|75|95|100] = 
[(4579) | 4579 | 4579 |* 4579 *| 4579 | 4583 | (4597)] (100-th AVGs of keccak_f1600_x2_hybrid_asm_v1) +[0|5|25|50|75|95|100] = [(4978) | 4978 | 4978 |* 4978 *| 4979 | 4983 | (4986)] (100-th AVGs of keccak_f1600_x2_hybrid_asm_v2p0) +[0|5|25|50|75|95|100] = [(4222) | 4222 | 4222 |* 4222 *| 4222 | 4230 | (4257)] (100-th AVGs of keccak_f1600_x2_hybrid_asm_v2p1) +[0|5|25|50|75|95|100] = [(4222) | 4222 | 4222 |* 4222 *| 4222 | 4235 | (4249)] (100-th AVGs of keccak_f1600_x2_hybrid_asm_v2p2) +[0|5|25|50|75|95|100] = [(4448) | 4449 | 4449 |* 4449 *| 4449 | 4454 | (4459)] (100-th AVGs of keccak_f1600_x2_hybrid_asm_v2pp0) +[0|5|25|50|75|95|100] = [(3425) | 3425 | 3425 |* 3425 *| 3425 | 3430 | (3454)] (100-th AVGs of keccak_f1600_x2_hybrid_asm_v2pp1) +[0|5|25|50|75|95|100] = [(4439) | 4439 | 4449 |* 4449 *| 4449 | 4456 | (4470)] (100-th AVGs of keccak_f1600_x2_hybrid_asm_v2pp2) +[0|5|25|50|75|95|100] = [(7701) | 7701 | 7701 |* 7701 *| 7703 | 7725 | (7758)] (100-th AVGs of keccak_f1600_x3_hybrid_asm_v3p) +[0|5|25|50|75|95|100] = [(4533) | 4533 | 4533 |* 4534 *| 4535 | 4566 | (4570)] (100-th AVGs of keccak_f1600_x3_hybrid_asm_v6) +[0|5|25|50|75|95|100] = [(1822) | 1823 | 1847 |* 1911 *| 1976 | 2015 | (2104)] (100-th AVGs of keccak_f1600_x3_hybrid_asm_v7) +[0|5|25|50|75|95|100] = [(4542) | 4543 | 4543 |* 4543 *| 4544 | 4556 | (4581)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v1) +[0|5|25|50|75|95|100] = [(3539) | 3539 | 3540 |* 3545 *| 3551 | 3568 | (3598)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v2) +[0|5|25|50|75|95|100] = [(3275) | 3275 | 3275 |* 3275 *| 3276 | 3322 | (3452)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v2p0) +[0|5|25|50|75|95|100] = [(7582) | 7582 | 7582 |* 7583 *| 7591 | 7607 | (7690)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v3) +[0|5|25|50|75|95|100] = [(7383) | 7384 | 7384 |* 7384 *| 7385 | 7407 | (7501)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v3p) +[0|5|25|50|75|95|100] = [(7397) | 7397 | 7397 |* 7398 *| 7401 | 7412 | (7431)] (100-th AVGs of 
keccak_f1600_x4_hybrid_asm_v3pp) +[0|5|25|50|75|95|100] = [(4479) | 4480 | 4487 |* 4487 *| 4491 | 4510 | (4584)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v4) +[0|5|25|50|75|95|100] = [(4516) | 4516 | 4517 |* 4517 *| 4525 | 4537 | (4548)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v4p) +[0|5|25|50|75|95|100] = [(7602) | 7602 | 7620 |* 7620 *| 7624 | 7642 | (7651)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v5) +[0|5|25|50|75|95|100] = [(7646) | 7646 | 7647 |* 7662 *| 7663 | 7677 | (7726)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v5p) +[0|5|25|50|75|95|100] = [(4812) | 4813 | 4813 |* 4813 *| 4815 | 4835 | (4842)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v6) +[0|5|25|50|75|95|100] = [(2727) | 2728 | 2729 |* 2729 *| 2743 | 2762 | (2792)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v7) +[0|5|25|50|75|95|100] = [(4124) | 4124 | 4125 |* 4125 *| 4126 | 4152 | (4167)] (100-th AVGs of keccak_f1600_x4_hybrid_asm_v8) +[0|5|25|50|75|95|100] = [(4021) | 4022 | 4026 |* 4046 *| 4049 | 4059 | (4061)] (100-th AVGs of keccak_f1600_x4_scalar_asm_v5) +[0|5|25|50|75|95|100] = [(7197) | 7197 | 7198 |* 7198 *| 7212 | 7233 | (7242)] (100-th AVGs of keccak_f1600_x5_hybrid_asm_v8) +[0|5|25|50|75|95|100] = [(7168) | 7168 | 7168 |* 7169 *| 7172 | 7188 | (7207)] (100-th AVGs of keccak_f1600_x5_hybrid_asm_v8p) +``` + +# diff --git a/sphincsplus/keccak-results.md b/sphincsplus/keccak-results.md new file mode 100644 index 0000000..ffe3155 --- /dev/null +++ b/sphincsplus/keccak-results.md @@ -0,0 +1,29 @@ +| Approach | | |Cortex-X1 | Cortex-A78 | Cortex-A55 | +| -------- | - | - |--------- | ---------- | -----------| +| Reference C | [C][C] | 1x | 811 (811) | 819 (819) | 1935 (1935) +| Scalar | Ours | 1x | 690 (690) | 709 (709) | 1418 (1418) +| Neon | [Ngu][Ngu] | 2x | 1370 (685) | 2409 (1204) | 5222 (2611) +| Neon | Ours | 2x | 1317 (658) | 2197 (1098) | 4560 (2280) +| Scalar/Neon | Ours | 4x | 1524 (381) | 2201 (550) | 7288 (1822) +| Scalar/Neon | Ours | 5x | 2161 (432) | 2191 (438) | 8960 
(1792) + + +| Approach | | | Cortex-X2 | Cortex-A710 | Cortex-A510 | +| -------- | - | - | --------- | ----------- | ------------| +| Reference C | [C][C] | 1x | 817 (817) | 820 (820) | 1375 (1375) +| Scalar | Ours | 1x | 687 (687) | 701 (701) | 968 (968) +| Neon | [Ngu][Ngu] | 2x | 1325 (662) | 2391 (1195) | 3397 (1698) +| Neon | Ours | 2x | 1274 (637) | 2044 (1022) | 6970 (3485) +| Neon+SHA-3 | [Wes][Wes] | 2x | 1547 (773) | 1550 (775) | 2268 (1134) +| Neon+SHA-3 | Ours | 2x | 1547 (773) | 1549 (774) | 1144 (572) +| Neon/Neon+SHA-3 | Ours | 2x | 944 (472) | 1502 (751) | 4449 (2224) +| Scalar/Neon/Neon+SHA-3 | Ours | 3x | 985 (328) | 1532 (510) | 4534 (1511) +| Scalar/Neon | Ours | 4x | 1469 (367) | 2229 (557) | 7384 (1846) +| Scalar/Neon+SHA-3 | Ours | 4x | 1551 (387) | 1608 (402) | 3545 (886) +| Scalar/Neon | Ours | 5x | 2152 (430) | 2535 (507) | 7169 (1433) +| Scalar/Neon/Neon+SHA-3 | Ours | 4x | 1439 (359) | 1755 (438) | 4487 (1121) + + +[C]: https://github.com/XKCP/XKCP +[Ngu]: https://github.com/cothan/NEON-SHA3_2x +[Wes]: https://github.com/bwesterb/armed-keccak diff --git a/sphincsplus/sphincs-results.md b/sphincsplus/sphincs-results.md new file mode 100644 index 0000000..02e961e --- /dev/null +++ b/sphincsplus/sphincs-results.md @@ -0,0 +1,65 @@ +# Cortex-X1 +| Parameter set | Implementation | Key Generation | Signing | Verification | +| ------------- | -------------- | -------------- | ------- | ------------ | +| 128f-robust | [C][C] | 7,358,319 | 170,826,272 | 11,502,609 | +| 128f-robust | [Ngu][Ngu] | 6,112,043 | 141,857,043 | 9,834,959 | +| 128f-robust | Ours | 3,491,100 | 81,197,917 | 5,880,952 | +| 128s-robust | [C][C] | 470,975,755 | 3,546,272,464 | 4,168,382 | +| 128s-robust | [Ngu][Ngu] | 391,074,973 | 2,937,624,124 | 3,634,500 | +| 128s-robust | Ours | 223,778,149 | 1,681,495,897 | 2,138,932 | +# Cortex-A78 +| Parameter set | Implementation | Key Generation | Signing | Verification | +| ------------- | -------------- | -------------- | ------- | 
------------ | +| 128f-robust | [C][C] | 7,506,993 | 174,285,242 | 11,912,059 | +| 128f-robust | [Ngu][Ngu] | 10,731,381 | 249,061,402 | 16,938,513 | +| 128f-robust | Ours | 5,043,285 | 117,280,323 | 7,948,919 | +| 128s-robust | [C][C] | 479,607,537 | 3,603,102,398 | 4,276,916 | +| 128s-robust | [Ngu][Ngu] | 686,058,727 | 5,153,451,754 | 6,358,655 | +| 128s-robust | Ours | 262,263,906 | 2,029,132,627 | 2,534,354 | +# Cortex-A55 +| Parameter set | Implementation | Key Generation | Signing | Verification | +| ------------- | -------------- | -------------- | ------- | ------------ | +| 128f-robust | [C][C] | 18,035,472 | 418,555,286 | 27,322,293 | +| 128f-robust | [Ngu][Ngu] | 23,443,923 | 544,202,850 | 37,017,382 | +| 128f-robust | Ours | 13,077,870 | 304,188,086 | 21,855,113 | +| 128s-robust | [C][C] | 1,153,926,672 | 8,667,372,131 | 10,415,159 | +| 128s-robust | [Ngu][Ngu] | 1,500,185,859 | 11,269,260,233 | 13,300,844 | +| 128s-robust | Ours | 835,847,388 | 6,278,826,089 | 6,915,664 | +# Cortex-X2 +| Parameter set | Implementation | Key Generation | Signing | Verification | +| ------------- | -------------- | -------------- | ------- | ------------ | +| 128f-robust | [C][C] | 7,481,028 | 173,679,559 | 11,408,718 | +| 128f-robust | [Ngu][Ngu] | 5,946,107 | 138,094,446 | 9,399,772 | +| 128f-robust | [Wes][Wes] | 6,929,630 | 160,941,958 | 11,298,375 | +| 128f-robust | Ours | 3,315,159 | 77,038,220 | 5,543,875 | +| 128s-robust | [C][C] | 479,373,163 | 3,601,404,582 | 4,374,100 | +| 128s-robust | [Ngu][Ngu] | 381,169,535 | 2,863,365,476 | 3,311,755 | +| 128s-robust | [Wes][Wes] | 443,343,467 | 3,330,901,697 | 3,936,987 | +| 128s-robust | Ours | 194,294,564 | 1,517,988,183 | 1,848,971 | +# Cortex-A710 +| Parameter set | Implementation | Key Generation | Signing | Verification | +| ------------- | -------------- | -------------- | ------- | ------------ | +| 128f-robust | [C][C] | 7,570,502 | 175,705,544 | 11,795,667 | +| 128f-robust | [Ngu][Ngu] | 10,641,228 | 
247,081,815 | 17,209,781 | +| 128f-robust | [Wes][Wes] | 6,980,377 | 162,089,547 | 11,337,784 | +| 128f-robust | Ours | 3,743,485 | 87,051,944 | 6,071,225 | +| 128s-robust | [C][C] | 483,663,631 | 3,633,790,296 | 4,194,130 | +| 128s-robust | [Ngu][Ngu] | 681,005,510 | 5,118,301,851 | 6,188,066 | +| 128s-robust | [Wes][Wes] | 446,643,525 | 3,356,043,555 | 3,850,357 | +| 128s-robust | Ours | 239,633,901 | 1,800,720,086 | 2,147,065 | +# Cortex-A510 +| Parameter set | Implementation | Key Generation | Signing | Verification | +| ------------- | -------------- | -------------- | ------- | ------------ | +| 128f-robust | [C][C] | 13,786,831 | 315,780,098 | 21,639,779 | +| 128f-robust | [Ngu][Ngu] | 15,269,589 | 354,190,986 | 24,771,038 | +| 128f-robust | [Wes][Wes] | 10,600,478 | 245,622,806 | 16,865,725 | +| 128f-robust | Ours | 5,427,899 | 125,818,288 | 8,919,586 | +| 128s-robust | [C][C] | 871,395,537 | 6,548,092,917 | 7,969,239 | +| 128s-robust | [Ngu][Ngu] | 974,306,786 | 7,322,457,897 | 8,396,733 | +| 128s-robust | [Wes][Wes] | 661,698,650 | 4,991,715,387 | 5,790,915 | +| 128s-robust | Ours | 347,613,894 | 2,610,123,085 | 3,322,286 | + + +[C]: https://github.com/XKCP/XKCP +[Ngu]: https://github.com/cothan/NEON-SHA3_2x +[Wes]: https://github.com/bwesterb/armed-keccak diff --git a/sphincsplus/sphincsplus-keccakx2/LICENSE b/sphincsplus/sphincsplus-keccakx2/LICENSE new file mode 100644 index 0000000..e5a6ce4 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/LICENSE @@ -0,0 +1,121 @@ +The CC0 1.0 Universal license, the text of which is below, applies to this directory +in general, with a few exceptions for individual files as explained in the README and +the corresponding files. 
+ + +CC0 1.0 Universal + +Statement of Purpose + +The laws of most jurisdictions throughout the world automatically confer +exclusive Copyright and Related Rights (defined below) upon the creator and +subsequent owner(s) (each and all, an "owner") of an original work of +authorship and/or a database (each, a "Work"). + +Certain owners wish to permanently relinquish those rights to a Work for the +purpose of contributing to a commons of creative, cultural and scientific +works ("Commons") that the public can reliably and without fear of later +claims of infringement build upon, modify, incorporate in other works, reuse +and redistribute as freely as possible in any form whatsoever and for any +purposes, including without limitation commercial purposes. These owners may +contribute to the Commons to promote the ideal of a free culture and the +further production of creative, cultural and scientific works, or to gain +reputation or greater distribution for their Work in part through the use and +efforts of others. + +For these and/or other purposes and motivations, and without any expectation +of additional consideration or compensation, the person associating CC0 with a +Work (the "Affirmer"), to the extent that he or she is an owner of Copyright +and Related Rights in the Work, voluntarily elects to apply CC0 to the Work +and publicly distribute the Work under its terms, with knowledge of his or her +Copyright and Related Rights in the Work and the meaning and intended legal +effect of CC0 on those rights. + +1. Copyright and Related Rights. A Work made available under CC0 may be +protected by copyright and related or neighboring rights ("Copyright and +Related Rights"). Copyright and Related Rights include, but are not limited +to, the following: + + i. the right to reproduce, adapt, distribute, perform, display, communicate, + and translate a Work; + + ii. moral rights retained by the original author(s) and/or performer(s); + + iii. 
publicity and privacy rights pertaining to a person's image or likeness + depicted in a Work; + + iv. rights protecting against unfair competition in regards to a Work, + subject to the limitations in paragraph 4(a), below; + + v. rights protecting the extraction, dissemination, use and reuse of data in + a Work; + + vi. database rights (such as those arising under Directive 96/9/EC of the + European Parliament and of the Council of 11 March 1996 on the legal + protection of databases, and under any national implementation thereof, + including any amended or successor version of such directive); and + + vii. other similar, equivalent or corresponding rights throughout the world + based on applicable law or treaty, and any national implementations thereof. + +2. Waiver. To the greatest extent permitted by, but not in contravention of, +applicable law, Affirmer hereby overtly, fully, permanently, irrevocably and +unconditionally waives, abandons, and surrenders all of Affirmer's Copyright +and Related Rights and associated claims and causes of action, whether now +known or unknown (including existing as well as future claims and causes of +action), in the Work (i) in all territories worldwide, (ii) for the maximum +duration provided by applicable law or treaty (including future time +extensions), (iii) in any current or future medium and for any number of +copies, and (iv) for any purpose whatsoever, including without limitation +commercial, advertising or promotional purposes (the "Waiver"). Affirmer makes +the Waiver for the benefit of each member of the public at large and to the +detriment of Affirmer's heirs and successors, fully intending that such Waiver +shall not be subject to revocation, rescission, cancellation, termination, or +any other legal or equitable action to disrupt the quiet enjoyment of the Work +by the public as contemplated by Affirmer's express Statement of Purpose. + +3. Public License Fallback. 
Should any part of the Waiver for any reason be +judged legally invalid or ineffective under applicable law, then the Waiver +shall be preserved to the maximum extent permitted taking into account +Affirmer's express Statement of Purpose. In addition, to the extent the Waiver +is so judged Affirmer hereby grants to each affected person a royalty-free, +non transferable, non sublicensable, non exclusive, irrevocable and +unconditional license to exercise Affirmer's Copyright and Related Rights in +the Work (i) in all territories worldwide, (ii) for the maximum duration +provided by applicable law or treaty (including future time extensions), (iii) +in any current or future medium and for any number of copies, and (iv) for any +purpose whatsoever, including without limitation commercial, advertising or +promotional purposes (the "License"). The License shall be deemed effective as +of the date CC0 was applied by Affirmer to the Work. Should any part of the +License for any reason be judged legally invalid or ineffective under +applicable law, such partial invalidity or ineffectiveness shall not +invalidate the remainder of the License, and in such case Affirmer hereby +affirms that he or she will not (i) exercise any of his or her remaining +Copyright and Related Rights in the Work or (ii) assert any associated claims +and causes of action with respect to the Work, in either case contrary to +Affirmer's express Statement of Purpose. + +4. Limitations and Disclaimers. + + a. No trademark or patent rights held by Affirmer are waived, abandoned, + surrendered, licensed or otherwise affected by this document. + + b. 
Affirmer offers the Work as-is and makes no representations or warranties + of any kind concerning the Work, express, implied, statutory or otherwise, + including without limitation warranties of title, merchantability, fitness + for a particular purpose, non infringement, or the absence of latent or + other defects, accuracy, or the present or absence of errors, whether or not + discoverable, all to the greatest extent permissible under applicable law. + + c. Affirmer disclaims responsibility for clearing rights of other persons + that may apply to the Work or any use thereof, including without limitation + any person's Copyright and Related Rights in the Work. Further, Affirmer + disclaims responsibility for obtaining any necessary consents, permissions + or other rights required for any use of the Work. + + d. Affirmer understands and acknowledges that Creative Commons is not a + party to this document and has no duty or obligation with respect to this + CC0 or use of the Work. + +For more information, please see +<http://creativecommons.org/publicdomain/zero/1.0/> diff --git a/sphincsplus/sphincsplus-keccakx2/Makefile b/sphincsplus/sphincsplus-keccakx2/Makefile new file mode 100644 index 0000000..87f2467 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/Makefile @@ -0,0 +1,94 @@ +PARAMS = sphincs-shake-128f +THASH = robust + +CC=aarch64-none-linux-gnu-gcc +LD=$(CC) + +PLATFORM ?= v84 + +# PMU / PERF +CYCLES ?= PERF +ifeq ($(CYCLES),PMU) + CFLAGS += -DPMU_CYCLES +endif + +ifeq ($(CYCLES),PERF) + CFLAGS += -DPERF_CYCLES +endif + +ifeq ($(CYCLES),NO) + CFLAGS += -DNO_CYCLES +endif + + +CFLAGS += -I. -flto -fpic -Wall -Wextra -Wpedantic -Wmissing-prototypes -O3 -std=c99 -fomit-frame-pointer -DPARAMS=$(PARAMS) $(EXTRA_CFLAGS) +LDFLAGS = -static -flto + +SRC_DIR=.
+BUILD_DIR=build + +HEADERS=$(wildcard $(SRC_DIR)/*.h) $(wildcard $(SRC_DIR)/test/*.h) + +# C / BAS / COTHANV8 / COTHANV84 +KECCAK_X2_IMPL ?= BAS + +ifeq ($(KECCAK_X2_IMPL),C) + ASM_SRC_FILES= + C_SRC_FILES=keccak_f1600_x2/keccakx2_C.c + CFLAGS += -DKECCAK_X2_IMPL_C +else ifeq ($(KECCAK_X2_IMPL),BAS) + ASM_SRC_FILES=keccak_f1600_x2/keccakx2_bas.s + C_SRC_FILES= + CFLAGS += -DKECCAK_X2_IMPL_BAS +else ifeq ($(KECCAK_X2_IMPL),COTHANV8) + ASM_SRC_FILES= + C_SRC_FILES=keccak_f1600_x2/keccakx2_cothan.c + CFLAGS += -DKECCAK_X2_IMPL_COTHAN + CFLAGS_COTHAN= $(CFLAGS) -march=armv8-a +else ifeq ($(KECCAK_X2_IMPL),COTHANV84) + ASM_SRC_FILES= + C_SRC_FILES=keccak_f1600_x2/keccakx2_cothan.c + CFLAGS += -DKECCAK_X2_IMPL_COTHAN + CFLAGS_COTHAN= $(CFLAGS) -march=armv8.4-a+crypto+sha3 +endif +ifeq ($(PLATFORM),v84) + CFLAGS += -march=armv8.4-a+crypto+sha3 +else + CFLAGS += -march=armv8-a +endif +ASM_OBJ_FILES=$(patsubst %.s, $(BUILD_DIR)/%.s.o, $(ASM_SRC_FILES)) + +C_SRC_FILES+= address.c fips202.c fips202x2.c fors.c hash_shake.c hash_shakex2.c merkle.c sign.c utils.c utilsx2.c wots.c thash_shake_$(THASH)x2.c +C_SRC_FILES_BENCH=$(C_SRC_FILES) test/benchmark.c test/randombytes.c test/cycles.c +C_OBJ_FILES_BENCH=$(patsubst %.c, $(BUILD_DIR)/%.c.o, $(C_SRC_FILES_BENCH)) +OBJ_FILES_BENCH=$(ASM_OBJ_FILES) $(C_OBJ_FILES_BENCH) + +.PHONY: clean libclean + +all: benchmark + +# Compilation +$(BUILD_DIR)/keccak_f1600_x2/keccakx2_cothan.c.o: $(SRC_DIR)/keccak_f1600_x2/keccakx2_cothan.c $(HEADERS) + mkdir -p $(@D) + $(CC) $(CFLAGS_COTHAN) -c -o $@ $< + +$(BUILD_DIR)/%.c.o: $(SRC_DIR)/%.c $(HEADERS) + mkdir -p $(@D) + $(CC) $(CFLAGS) -c -o $@ $< + + +$(BUILD_DIR)/%.s.o: $(SRC_DIR)/%.s $(HEADERS) + mkdir -p $(@D) + $(CC) -x assembler-with-cpp $(CFLAGS) -c -o $@ $< + + +# Linking +benchmark: $(OBJ_FILES_BENCH) $(HEADERS) + mkdir -p $(@D) + $(LD) $(LDFLAGS) $(OBJ_FILES_BENCH) -o benchmark + +clean: + -$(RM) -r build + +libclean: + find . 
-type f -executable -exec rm '{}' \; diff --git a/sphincsplus/sphincsplus-keccakx2/README.md b/sphincsplus/sphincsplus-keccakx2/README.md new file mode 100644 index 0000000..6ec004b --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/README.md @@ -0,0 +1,38 @@ +SPHINCS+ using 2-way parallel Keccak-f1600 +========================================== + +Implementation of SPHINCS+ based on 2-way parallel Keccak-f1600 from [official SPHINCS+ +repository](https://github.com/sphincs/sphincsplus). + +## Usage + +To build, run + +``` +KECCAK_X2_IMPL={C,BAS,COTHANV8,COTHANV84} CYCLES={NO,PERF,PMU} CORE={A55,A510,A78,A710,X1,X2} THASH={robust,simple} PARAMS=sphincs-shake-{128,192,256}{f,s} make +``` + +which will generate the `./benchmark` binary. + +You may also use + +``` +python3 make_all.py +``` + +to generate benchmark binaries for all possible combinations of parameters, stored in [bin/](bin/), and `bench_x2.sh` to +run them. + +## KATs + +The NIST-provided [source +code](https://csrc.nist.gov/projects/post-quantum-cryptography/post-quantum-cryptography-standardization/example-files) +can be used to generate Known-Answer-Tests (KATs) as done for example in the [official SPHINCS+ +repository](https://github.com/sphincs/sphincsplus/tree/master/shake-a64).
+ +## License + +Licensed under CC0 1.0 Universal Public Domain Dedication, see [LICENSE](LICENSE), with +the following exceptions: +* [keccak_f1600_x2/keccakx2_bas.s](keccak_f1600_x2/keccakx2_bas.s): MIT +* [keccak_f1600_x2/keccakx2_cothan.c](keccak_f1600_x2/keccakx2_cothan.c): Apache 2.0 diff --git a/sphincsplus/sphincsplus-keccakx2/address.c b/sphincsplus/sphincsplus-keccakx2/address.c new file mode 100644 index 0000000..09e8e95 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/address.c @@ -0,0 +1,112 @@ +#include <stdint.h> +#include <string.h> + +#include "address.h" +#include "params.h" +#include "utils.h" + +/* + * Specify which level of Merkle tree (the "layer") we're working on + */ +void set_layer_addr(uint32_t addr[8], uint32_t layer) +{ + ((unsigned char *)addr)[SPX_OFFSET_LAYER] = layer; +} + +/* + * Specify which Merkle tree within the level (the "tree address") we're working on + */ +void set_tree_addr(uint32_t addr[8], uint64_t tree) +{ +#if (SPX_TREE_HEIGHT * (SPX_D - 1)) > 64 + #error Subtree addressing is currently limited to at most 2^64 trees +#endif + ull_to_bytes(&((unsigned char *)addr)[SPX_OFFSET_TREE], 8, tree ); +} + +/* + * Specify the reason we'll use this address structure for, that is, what + * hash will we compute with it. This is used so that unrelated types of + * hashes don't accidentally get the same address structure. The type will be + * one of the SPX_ADDR_TYPE constants + */ +void set_type(uint32_t addr[8], uint32_t type) +{ + ((unsigned char *)addr)[SPX_OFFSET_TYPE] = type; +} + +/* + * Copy the layer and tree fields of the address structure. This is used + * when we're doing multiple types of hashes within the same Merkle tree + */ +void copy_subtree_addr(uint32_t out[8], const uint32_t in[8]) +{ + memcpy( out, in, SPX_OFFSET_TREE+8 ); +} + +/* These functions are used for OTS addresses. */ + +/* + * Specify which Merkle leaf we're working on; that is, which OTS keypair + * we're talking about.
+ */ +void set_keypair_addr(uint32_t addr[8], uint32_t keypair) +{ +#if SPX_FULL_HEIGHT/SPX_D > 8 + /* More than 256 OTS keypairs sit at the bottom of the Merkle tree, so the */ + /* keypair index needs two bytes; store the high byte here */ + ((unsigned char *)addr)[SPX_OFFSET_KP_ADDR2] = keypair >> 8; +#endif + ((unsigned char *)addr)[SPX_OFFSET_KP_ADDR1] = keypair; +} + +/* + * Copy the layer, tree and keypair fields of the address structure. This is + * used when we're doing multiple things within the same OTS keypair + */ +void copy_keypair_addr(uint32_t out[8], const uint32_t in[8]) +{ + memcpy( out, in, SPX_OFFSET_TREE+8 ); +#if SPX_FULL_HEIGHT/SPX_D > 8 + ((unsigned char *)out)[SPX_OFFSET_KP_ADDR2] = ((unsigned char *)in)[SPX_OFFSET_KP_ADDR2]; +#endif + ((unsigned char *)out)[SPX_OFFSET_KP_ADDR1] = ((unsigned char *)in)[SPX_OFFSET_KP_ADDR1]; +} + +/* + * Specify which Merkle chain within the OTS we're working with + * (the chain address) + */ +void set_chain_addr(uint32_t addr[8], uint32_t chain) +{ + ((unsigned char *)addr)[SPX_OFFSET_CHAIN_ADDR] = chain; +} + +/* + * Specify where in the Merkle chain we are + * (the hash address) + */ +void set_hash_addr(uint32_t addr[8], uint32_t hash) +{ + ((unsigned char *)addr)[SPX_OFFSET_HASH_ADDR] = hash; +} + +/* These functions are used for all hash tree addresses (including FORS).
*/ + +/* + * Specify the height of the node in the Merkle/FORS tree we are in + * (the tree height) + */ +void set_tree_height(uint32_t addr[8], uint32_t tree_height) +{ + ((unsigned char *)addr)[SPX_OFFSET_TREE_HGT] = tree_height; +} + +/* + * Specify the distance from the left edge of the node in the Merkle/FORS tree + * (the tree index) + */ +void set_tree_index(uint32_t addr[8], uint32_t tree_index) +{ + u32_to_bytes(&((unsigned char *)addr)[SPX_OFFSET_TREE_INDEX], tree_index ); +} diff --git a/sphincsplus/sphincsplus-keccakx2/address.h b/sphincsplus/sphincsplus-keccakx2/address.h new file mode 100644 index 0000000..49f8d66 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/address.h @@ -0,0 +1,51 @@ +#ifndef SPX_ADDRESS_H +#define SPX_ADDRESS_H + +#include <stdint.h> +#include "params.h" + +/* The hash types that are passed to set_type */ +#define SPX_ADDR_TYPE_WOTS 0 +#define SPX_ADDR_TYPE_WOTSPK 1 +#define SPX_ADDR_TYPE_HASHTREE 2 +#define SPX_ADDR_TYPE_FORSTREE 3 +#define SPX_ADDR_TYPE_FORSPK 4 +#define SPX_ADDR_TYPE_WOTSPRF 5 +#define SPX_ADDR_TYPE_FORSPRF 6 + +#define set_layer_addr SPX_NAMESPACE(set_layer_addr) +void set_layer_addr(uint32_t addr[8], uint32_t layer); + +#define set_tree_addr SPX_NAMESPACE(set_tree_addr) +void set_tree_addr(uint32_t addr[8], uint64_t tree); + +#define set_type SPX_NAMESPACE(set_type) +void set_type(uint32_t addr[8], uint32_t type); + +/* Copies the layer and tree part of one address into the other */ +#define copy_subtree_addr SPX_NAMESPACE(copy_subtree_addr) +void copy_subtree_addr(uint32_t out[8], const uint32_t in[8]); + +/* These functions are used for WOTS and FORS addresses.
*/ + +#define set_keypair_addr SPX_NAMESPACE(set_keypair_addr) +void set_keypair_addr(uint32_t addr[8], uint32_t keypair); + +#define set_chain_addr SPX_NAMESPACE(set_chain_addr) +void set_chain_addr(uint32_t addr[8], uint32_t chain); + +#define set_hash_addr SPX_NAMESPACE(set_hash_addr) +void set_hash_addr(uint32_t addr[8], uint32_t hash); + +#define copy_keypair_addr SPX_NAMESPACE(copy_keypair_addr) +void copy_keypair_addr(uint32_t out[8], const uint32_t in[8]); + +/* These functions are used for all hash tree addresses (including FORS). */ + +#define set_tree_height SPX_NAMESPACE(set_tree_height) +void set_tree_height(uint32_t addr[8], uint32_t tree_height); + +#define set_tree_index SPX_NAMESPACE(set_tree_index) +void set_tree_index(uint32_t addr[8], uint32_t tree_index); + +#endif diff --git a/sphincsplus/sphincsplus-keccakx2/api.h b/sphincsplus/sphincsplus-keccakx2/api.h new file mode 100644 index 0000000..d57a148 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/api.h @@ -0,0 +1,77 @@ +#ifndef SPX_API_H +#define SPX_API_H + +#include <stddef.h> +#include <stdint.h> + +#include "params.h" + +#define CRYPTO_ALGNAME "SPHINCS+" + +#define CRYPTO_SECRETKEYBYTES SPX_SK_BYTES +#define CRYPTO_PUBLICKEYBYTES SPX_PK_BYTES +#define CRYPTO_BYTES SPX_BYTES +#define CRYPTO_SEEDBYTES (3*SPX_N) + +/* + * Returns the length of a secret key, in bytes + */ +unsigned long long crypto_sign_secretkeybytes(void); + +/* + * Returns the length of a public key, in bytes + */ +unsigned long long crypto_sign_publickeybytes(void); + +/* + * Returns the length of a signature, in bytes + */ +unsigned long long crypto_sign_bytes(void); + +/* + * Returns the length of the seed required to generate a key pair, in bytes + */ +unsigned long long crypto_sign_seedbytes(void); + +/* + * Generates a SPHINCS+ key pair given a seed.
+ * Format sk: [SK_SEED || SK_PRF || PUB_SEED || root] + * Format pk: [root || PUB_SEED] (NOTE(review): order differs from the tail of sk - confirm against the implementation) + */ +int crypto_sign_seed_keypair(unsigned char *pk, unsigned char *sk, + const unsigned char *seed); + +/* + * Generates a SPHINCS+ key pair. + * Format sk: [SK_SEED || SK_PRF || PUB_SEED || root] + * Format pk: [root || PUB_SEED] (NOTE(review): order differs from the tail of sk - confirm against the implementation) + */ +int crypto_sign_keypair(unsigned char *pk, unsigned char *sk); + +/** + * Produces a detached signature over m (mlen bytes) using sk; presumably written to sig with its length stored in *siglen - TODO confirm. + */ +int crypto_sign_signature(uint8_t *sig, size_t *siglen, + const uint8_t *m, size_t mlen, const uint8_t *sk); + +/** + * Verifies a detached signature and message under a given public key. + */ +int crypto_sign_verify(const uint8_t *sig, size_t siglen, + const uint8_t *m, size_t mlen, const uint8_t *pk); + +/** + * Produces the signature followed by the message; presumably written to sm with the total length stored in *smlen - TODO confirm. + */ +int crypto_sign(unsigned char *sm, unsigned long long *smlen, + const unsigned char *m, unsigned long long mlen, + const unsigned char *sk); + +/** + * Verifies a given signature-message pair under a given public key.
+ */ +int crypto_sign_open(unsigned char *m, unsigned long long *mlen, + const unsigned char *sm, unsigned long long smlen, + const unsigned char *pk); + +#endif diff --git a/sphincsplus/sphincsplus-keccakx2/bench_x2.sh b/sphincsplus/sphincsplus-keccakx2/bench_x2.sh new file mode 100644 index 0000000..592dc9a --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/bench_x2.sh @@ -0,0 +1,92 @@ +#!/bin/sh + +if grep -Fq "sha3" /proc/cpuinfo +then + sha3=1 +else + sha3=0 +fi + +warmup=$1 +if [ -z "$warmup" ]; then + warmup=1 +fi + +for cpu in 80 10 1; do + + if [ $sha3 -eq 0 ]; then + vars="C COTHANV8" + if [ $cpu -eq 80 ]; then + cpuname=X1 + elif [ $cpu -eq 10 ]; then + cpuname=A78 + else + cpuname=A55 + fi + else + vars="C BAS COTHANV8" + if [ $cpu -eq 80 ]; then + cpuname=X2 + elif [ $cpu -eq 10 ]; then + cpuname=A710 + else + cpuname=A510 + fi + fi + + echo "CPU $cpu $cpuname" + benchdir=benchmarks_$cpuname + mkdir -p "$benchdir" + # the high performance cores may be asleep; wake them up by pinning busy dd loops to the other cores (taskset masks are hexadecimal) + if [ "$warmup" -eq 1 ]; then + if [ $cpu -ge 10 ]; then + taskset 1 dd if=/dev/zero of=/dev/null & + taskPid0=$! + taskset 2 dd if=/dev/zero of=/dev/null & + taskPid1=$! + taskset 4 dd if=/dev/zero of=/dev/null & + taskPid2=$! + taskset 8 dd if=/dev/zero of=/dev/null & + taskPid3=$! + fi + sleep 1 + if [ $cpu -ge 80 ]; then + taskset 10 dd if=/dev/zero of=/dev/null & + taskPid4=$! + taskset 20 dd if=/dev/zero of=/dev/null & + taskPid5=$! + taskset 40 dd if=/dev/zero of=/dev/null & + taskPid6=$!
+ fi + sleep 1 + fi + + for level in 128 192 256; do + for t0 in f s; do + for t1 in simple robust; do + for var in $vars; do + param=sphincs-shake-${level}${t0}-${t1} + echo "$param" + exe=."/bin/bench_${cpuname}_${param}_${var}" + echo "$exe" + out="${benchdir}/${param}_${var}" + taskset $cpu "$exe" > "$out" + done + done + done + done + + if [ "$warmup" -eq 1 ]; then + if [ $cpu -ge 10 ]; then + kill $taskPid0 + kill $taskPid1 + kill $taskPid2 + kill $taskPid3 + fi + if [ $cpu -ge 80 ]; then + kill $taskPid4 + kill $taskPid5 + kill $taskPid6 + fi + fi +done diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks.md b/sphincsplus/sphincsplus-keccakx2/benchmarks.md new file mode 100644 index 0000000..be0ae7b --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks.md @@ -0,0 +1,17 @@ +# sphincs-shake-128f-robust + +$ ./benchmark +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16 +Running 10 iterations. +thash avg. 1.22 us (0.00 sec); median 3,150 cycles, 1x: 3,150 cycles +f1600x2 avg. 0.60 us (0.00 sec); median 1,548 cycles, 1x: 1,548 cycles +thashx2 avg. 1.21 us (0.00 sec); median 3,139 cycles, 1x: 3,139 cycles +Generating keypair.. avg. 2664.95 us (0.00 sec); median 6,918,351 cycles, 1x: 6,918,351 cycles + - WOTS pk gen 2x.. avg. 665.72 us (0.00 sec); median 369,609 cycles, 4x: 1,478,436 cycles +Signing.. avg. 61823.89 us (0.06 sec); median 41,652 cycles, 1x: 41,652 cycles + - FORS signing.. avg. 3234.24 us (0.00 sec); median 2,357 cycles, 1x: 2,357 cycles + - WOTS pk gen x2.. avg. 665.97 us (0.00 sec); median 0 cycles, 88x: 0 cycles +Verifying.. avg.
4058.85 us (0.00 sec); median 2,574 cycles, 1x: 2,574 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128f-robust_BAS b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128f-robust_BAS new file mode 100644 index 0000000..3b59264 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128f-robust_BAS @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16 +Running 10 iterations. +thash avg. 3.21 us (0.00 sec); median 4,885 cycles, 1x: 4,885 cycles +f1600x2 avg. 1.55 us (0.00 sec); median 2,339 cycles, 1x: 2,339 cycles +thashx2 avg. 3.15 us (0.00 sec); median 4,783 cycles, 1x: 4,783 cycles +Generating keypair.. avg. 6957.50 us (0.01 sec); median 10,600,478 cycles, 1x: 10,600,478 cycles + - WOTS pk gen 2x.. avg. 1752.30 us (0.00 sec); median 2,659,264 cycles, 4x: 10,637,056 cycles +Signing.. avg. 184239.16 us (0.18 sec); median 245,622,806 cycles, 1x: 245,622,806 cycles + - FORS signing.. avg. 11342.87 us (0.01 sec); median 12,966,473 cycles, 1x: 12,966,473 cycles + - WOTS pk gen x2.. avg. 2304.88 us (0.00 sec); median 2,618,530 cycles, 88x: 230,430,640 cycles +Verifying.. avg. 14769.41 us (0.01 sec); median 16,865,725 cycles, 1x: 16,865,725 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128f-robust_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128f-robust_C new file mode 100644 index 0000000..bd048a8 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128f-robust_C @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16 +Running 10 iterations. +thash avg. 4.12 us (0.00 sec); median 6,246 cycles, 1x: 6,246 cycles +f1600x2 avg. 
2.00 us (0.00 sec); median 3,021 cycles, 1x: 3,021 cycles +thashx2 avg. 4.14 us (0.00 sec); median 6,263 cycles, 1x: 6,263 cycles +Generating keypair.. avg. 9107.04 us (0.01 sec); median 13,786,831 cycles, 1x: 13,786,831 cycles + - WOTS pk gen 2x.. avg. 2261.43 us (0.00 sec); median 3,422,151 cycles, 4x: 13,688,604 cycles +Signing.. avg. 236248.68 us (0.24 sec); median 315,780,098 cycles, 1x: 315,780,098 cycles + - FORS signing.. avg. 10879.75 us (0.01 sec); median 16,588,135 cycles, 1x: 16,588,135 cycles + - WOTS pk gen x2.. avg. 2247.50 us (0.00 sec); median 3,403,032 cycles, 88x: 299,466,816 cycles +Verifying.. avg. 14214.96 us (0.01 sec); median 21,639,779 cycles, 1x: 21,639,779 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128f-robust_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128f-robust_COTHANV8 new file mode 100644 index 0000000..3664e27 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128f-robust_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16 +Running 10 iterations. +thash avg. 5.21 us (0.00 sec); median 6,935 cycles, 1x: 6,935 cycles +f1600x2 avg. 2.56 us (0.00 sec); median 3,397 cycles, 1x: 3,397 cycles +thashx2 avg. 5.22 us (0.00 sec); median 6,961 cycles, 1x: 6,961 cycles +Generating keypair.. avg. 11450.88 us (0.01 sec); median 15,269,589 cycles, 1x: 15,269,589 cycles + - WOTS pk gen 2x.. avg. 2868.95 us (0.00 sec); median 3,811,878 cycles, 4x: 15,247,512 cycles +Signing.. avg. 295508.87 us (0.30 sec); median 354,190,986 cycles, 1x: 354,190,986 cycles + - FORS signing.. avg. 16286.00 us (0.02 sec); median 18,585,613 cycles, 1x: 18,585,613 cycles + - WOTS pk gen x2.. avg. 3353.25 us (0.00 sec); median 3,814,636 cycles, 88x: 335,687,968 cycles +Verifying.. avg. 
21683.10 us (0.02 sec); median 24,771,038 cycles, 1x: 24,771,038 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128f-simple_BAS b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128f-simple_BAS new file mode 100644 index 0000000..5719ef6 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128f-simple_BAS @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16 +Running 10 iterations. +thash avg. 1.62 us (0.00 sec); median 2,468 cycles, 1x: 2,468 cycles +f1600x2 avg. 1.54 us (0.00 sec); median 2,339 cycles, 1x: 2,339 cycles +thashx2 avg. 1.63 us (0.00 sec); median 2,489 cycles, 1x: 2,489 cycles +Generating keypair.. avg. 3692.08 us (0.00 sec); median 5,644,993 cycles, 1x: 5,644,993 cycles + - WOTS pk gen 2x.. avg. 932.02 us (0.00 sec); median 1,408,587 cycles, 4x: 5,634,348 cycles +Signing.. avg. 86079.62 us (0.09 sec); median 131,359,108 cycles, 1x: 131,359,108 cycles + - FORS signing.. avg. 5209.20 us (0.01 sec); median 7,920,190 cycles, 1x: 7,920,190 cycles + - WOTS pk gen x2.. avg. 923.77 us (0.00 sec); median 1,392,949 cycles, 88x: 122,579,512 cycles +Verifying.. avg. 5643.38 us (0.01 sec); median 8,578,132 cycles, 1x: 8,578,132 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128f-simple_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128f-simple_C new file mode 100644 index 0000000..a33f79d --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128f-simple_C @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16 +Running 10 iterations. +thash avg. 2.09 us (0.00 sec); median 3,166 cycles, 1x: 3,166 cycles +f1600x2 avg. 
1.99 us (0.00 sec); median 3,014 cycles, 1x: 3,014 cycles +thashx2 avg. 2.10 us (0.00 sec); median 3,159 cycles, 1x: 3,159 cycles +Generating keypair.. avg. 4698.30 us (0.00 sec); median 7,135,566 cycles, 1x: 7,135,566 cycles + - WOTS pk gen 2x.. avg. 1184.53 us (0.00 sec); median 1,775,700 cycles, 4x: 7,102,800 cycles +Signing.. avg. 115603.20 us (0.12 sec); median 166,996,567 cycles, 1x: 166,996,567 cycles + - FORS signing.. avg. 8903.14 us (0.01 sec); median 10,142,199 cycles, 1x: 10,142,199 cycles + - WOTS pk gen x2.. avg. 1568.65 us (0.00 sec); median 1,775,673 cycles, 88x: 156,259,224 cycles +Verifying.. avg. 9573.65 us (0.01 sec); median 10,874,335 cycles, 1x: 10,874,335 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128f-simple_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128f-simple_COTHANV8 new file mode 100644 index 0000000..1df6d03 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128f-simple_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16 +Running 10 iterations. +thash avg. 2.70 us (0.00 sec); median 3,557 cycles, 1x: 3,557 cycles +f1600x2 avg. 2.56 us (0.00 sec); median 3,399 cycles, 1x: 3,399 cycles +thashx2 avg. 2.65 us (0.00 sec); median 3,523 cycles, 1x: 3,523 cycles +Generating keypair.. avg. 5953.33 us (0.01 sec); median 7,960,628 cycles, 1x: 7,960,628 cycles + - WOTS pk gen 2x.. avg. 1500.09 us (0.00 sec); median 1,986,079 cycles, 4x: 7,944,316 cycles +Signing.. avg. 147546.11 us (0.15 sec); median 186,284,851 cycles, 1x: 186,284,851 cycles + - FORS signing.. avg. 9898.93 us (0.01 sec); median 11,309,128 cycles, 1x: 11,309,128 cycles + - WOTS pk gen x2.. avg. 1748.07 us (0.00 sec); median 1,984,773 cycles, 88x: 174,660,024 cycles +Verifying.. avg. 
10504.58 us (0.01 sec); median 11,989,896 cycles, 1x: 11,989,896 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128s-robust_BAS b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128s-robust_BAS new file mode 100644 index 0000000..e3c0349 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128s-robust_BAS @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16 +Running 10 iterations. +thash avg. 2.75 us (0.00 sec); median 4,702 cycles, 1x: 4,702 cycles +f1600x2 avg. 1.35 us (0.00 sec); median 2,274 cycles, 1x: 2,274 cycles +thashx2 avg. 2.79 us (0.00 sec); median 4,772 cycles, 1x: 4,772 cycles +Generating keypair.. avg. 489429.69 us (0.49 sec); median 661,698,650 cycles, 1x: 661,698,650 cycles + - WOTS pk gen 2x.. avg. 1690.04 us (0.00 sec); median 2,581,034 cycles, 256x: 660,744,704 cycles +Signing.. avg. 4170117.54 us (4.17 sec); median 4,991,715,387 cycles, 1x: 4,991,715,387 cycles + - FORS signing.. avg. 288529.77 us (0.29 sec); median 341,234,965 cycles, 1x: 341,234,965 cycles + - WOTS pk gen x2.. avg. 1948.46 us (0.00 sec); median 2,583,429 cycles, 1792x: 4,629,504,768 cycles +Verifying.. avg. 4380.91 us (0.00 sec); median 5,790,915 cycles, 1x: 5,790,915 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128s-robust_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128s-robust_C new file mode 100644 index 0000000..267b53e --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128s-robust_C @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16 +Running 10 iterations. +thash avg. 4.68 us (0.00 sec); median 6,230 cycles, 1x: 6,230 cycles +f1600x2 avg. 
2.27 us (0.00 sec); median 3,014 cycles, 1x: 3,014 cycles +thashx2 avg. 4.66 us (0.00 sec); median 6,205 cycles, 1x: 6,205 cycles +Generating keypair.. avg. 745887.16 us (0.75 sec); median 871,395,537 cycles, 1x: 871,395,537 cycles + - WOTS pk gen 2x.. avg. 2991.28 us (0.00 sec); median 3,401,736 cycles, 256x: 870,844,416 cycles +Signing.. avg. 5567138.52 us (5.57 sec); median 6,548,092,917 cycles, 1x: 6,548,092,917 cycles + - FORS signing.. avg. 336348.13 us (0.34 sec); median 447,723,645 cycles, 1x: 447,723,645 cycles + - WOTS pk gen x2.. avg. 2994.14 us (0.00 sec); median 3,405,571 cycles, 1792x: 6,102,783,232 cycles +Verifying.. avg. 6991.89 us (0.01 sec); median 7,969,239 cycles, 1x: 7,969,239 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128s-robust_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128s-robust_COTHANV8 new file mode 100644 index 0000000..c44497f --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128s-robust_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16 +Running 10 iterations. +thash avg. 4.60 us (0.00 sec); median 6,985 cycles, 1x: 6,985 cycles +f1600x2 avg. 2.24 us (0.00 sec); median 3,397 cycles, 1x: 3,397 cycles +thashx2 avg. 4.57 us (0.00 sec); median 6,962 cycles, 1x: 6,962 cycles +Generating keypair.. avg. 786412.56 us (0.79 sec); median 974,306,786 cycles, 1x: 974,306,786 cycles + - WOTS pk gen 2x.. avg. 3341.89 us (0.00 sec); median 3,801,024 cycles, 256x: 973,062,144 cycles +Signing.. avg. 6228145.15 us (6.23 sec); median 7,322,457,897 cycles, 1x: 7,322,457,897 cycles + - FORS signing.. avg. 420162.58 us (0.42 sec); median 499,832,023 cycles, 1x: 499,832,023 cycles + - WOTS pk gen x2.. avg. 3357.16 us (0.00 sec); median 3,811,809 cycles, 1792x: 6,830,761,728 cycles +Verifying.. avg. 
7366.36 us (0.01 sec); median 8,396,733 cycles, 1x: 8,396,733 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128s-simple_BAS b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128s-simple_BAS new file mode 100644 index 0000000..658b90f --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128s-simple_BAS @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16 +Running 10 iterations. +thash avg. 1.88 us (0.00 sec); median 2,475 cycles, 1x: 2,475 cycles +f1600x2 avg. 1.74 us (0.00 sec); median 2,304 cycles, 1x: 2,304 cycles +thashx2 avg. 1.81 us (0.00 sec); median 2,393 cycles, 1x: 2,393 cycles +Generating keypair.. avg. 292934.64 us (0.29 sec); median 354,761,316 cycles, 1x: 354,761,316 cycles + - WOTS pk gen 2x.. avg. 1202.61 us (0.00 sec); median 1,353,910 cycles, 256x: 346,600,960 cycles +Signing.. avg. 2283795.33 us (2.28 sec); median 2,638,541,067 cycles, 1x: 2,638,541,067 cycles + - FORS signing.. avg. 181645.63 us (0.18 sec); median 207,805,379 cycles, 1x: 207,805,379 cycles + - WOTS pk gen x2.. avg. 1200.39 us (0.00 sec); median 1,354,127 cycles, 1792x: 2,426,595,584 cycles +Verifying.. avg. 2698.84 us (0.00 sec); median 3,063,408 cycles, 1x: 3,063,408 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128s-simple_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128s-simple_C new file mode 100644 index 0000000..9223ee0 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128s-simple_C @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16 +Running 10 iterations. +thash avg. 2.10 us (0.00 sec); median 3,177 cycles, 1x: 3,177 cycles +f1600x2 avg. 
1.99 us (0.00 sec); median 3,020 cycles, 1x: 3,020 cycles +thashx2 avg. 2.08 us (0.00 sec); median 3,158 cycles, 1x: 3,158 cycles +Generating keypair.. avg. 348476.94 us (0.35 sec); median 456,838,568 cycles, 1x: 456,838,568 cycles + - WOTS pk gen 2x.. avg. 1572.76 us (0.00 sec); median 1,781,249 cycles, 256x: 455,999,744 cycles +Signing.. avg. 2958880.65 us (2.96 sec); median 3,463,114,391 cycles, 1x: 3,463,114,391 cycles + - FORS signing.. avg. 237968.09 us (0.24 sec); median 272,209,189 cycles, 1x: 272,209,189 cycles + - WOTS pk gen x2.. avg. 1582.40 us (0.00 sec); median 1,785,856 cycles, 1792x: 3,200,253,952 cycles +Verifying.. avg. 3512.43 us (0.00 sec); median 3,992,468 cycles, 1x: 3,992,468 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128s-simple_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128s-simple_COTHANV8 new file mode 100644 index 0000000..0d24fe6 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-128s-simple_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16 +Running 10 iterations. +thash avg. 2.70 us (0.00 sec); median 3,548 cycles, 1x: 3,548 cycles +f1600x2 avg. 2.56 us (0.00 sec); median 3,397 cycles, 1x: 3,397 cycles +thashx2 avg. 2.66 us (0.00 sec); median 3,532 cycles, 1x: 3,532 cycles +Generating keypair.. avg. 411364.09 us (0.41 sec); median 509,328,950 cycles, 1x: 509,328,950 cycles + - WOTS pk gen 2x.. avg. 1749.43 us (0.00 sec); median 1,982,241 cycles, 256x: 507,453,696 cycles +Signing.. avg. 3277718.15 us (3.28 sec); median 3,861,853,019 cycles, 1x: 3,861,853,019 cycles + - FORS signing.. avg. 260882.19 us (0.26 sec); median 303,947,751 cycles, 1x: 303,947,751 cycles + - WOTS pk gen x2.. avg. 1314.80 us (0.00 sec); median 1,981,528 cycles, 1792x: 3,550,898,176 cycles +Verifying.. avg. 
2959.74 us (0.00 sec); median 4,493,937 cycles, 1x: 4,493,937 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192f-robust_BAS b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192f-robust_BAS new file mode 100644 index 0000000..66c0711 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192f-robust_BAS @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16 +Running 10 iterations. +thash avg. 4.34 us (0.00 sec); median 4,898 cycles, 1x: 4,898 cycles +f1600x2 avg. 2.17 us (0.00 sec); median 2,440 cycles, 1x: 2,440 cycles +thashx2 avg. 4.23 us (0.00 sec); median 4,916 cycles, 1x: 4,916 cycles +Generating keypair.. avg. 13764.48 us (0.01 sec); median 15,515,148 cycles, 1x: 15,515,148 cycles + - WOTS pk gen 2x.. avg. 3442.32 us (0.00 sec); median 3,864,036 cycles, 4x: 15,456,144 cycles +Signing.. avg. 338663.07 us (0.34 sec); median 384,851,237 cycles, 1x: 384,851,237 cycles + - FORS signing.. avg. 44417.51 us (0.04 sec); median 50,565,665 cycles, 1x: 50,565,665 cycles + - WOTS pk gen x2.. avg. 3312.86 us (0.00 sec); median 3,794,415 cycles, 88x: 333,908,520 cycles +Verifying.. avg. 20921.07 us (0.02 sec); median 23,788,082 cycles, 1x: 23,788,082 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192f-robust_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192f-robust_C new file mode 100644 index 0000000..bc01591 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192f-robust_C @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16 +Running 10 iterations. +thash avg. 5.59 us (0.00 sec); median 6,278 cycles, 1x: 6,278 cycles +f1600x2 avg. 
2.74 us (0.00 sec); median 3,053 cycles, 1x: 3,053 cycles +thashx2 avg. 5.57 us (0.00 sec); median 6,255 cycles, 1x: 6,255 cycles +Generating keypair.. avg. 17845.13 us (0.02 sec); median 20,147,227 cycles, 1x: 20,147,227 cycles + - WOTS pk gen 2x.. avg. 4465.83 us (0.00 sec); median 5,022,007 cycles, 4x: 20,088,028 cycles +Signing.. avg. 432677.05 us (0.43 sec); median 507,685,427 cycles, 1x: 507,685,427 cycles + - FORS signing.. avg. 58018.28 us (0.06 sec); median 66,515,666 cycles, 1x: 66,515,666 cycles + - WOTS pk gen x2.. avg. 4414.99 us (0.00 sec); median 4,987,212 cycles, 88x: 438,874,656 cycles +Verifying.. avg. 27767.41 us (0.03 sec); median 31,505,465 cycles, 1x: 31,505,465 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192f-robust_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192f-robust_COTHANV8 new file mode 100644 index 0000000..b9f3770 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192f-robust_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16 +Running 10 iterations. +thash avg. 5.25 us (0.00 sec); median 6,973 cycles, 1x: 6,973 cycles +f1600x2 avg. 2.63 us (0.00 sec); median 3,490 cycles, 1x: 3,490 cycles +thashx2 avg. 5.26 us (0.00 sec); median 6,995 cycles, 1x: 6,995 cycles +Generating keypair.. avg. 16800.73 us (0.02 sec); median 22,428,880 cycles, 1x: 22,428,880 cycles + - WOTS pk gen 2x.. avg. 4198.93 us (0.00 sec); median 5,588,143 cycles, 4x: 22,352,572 cycles +Signing.. avg. 464683.78 us (0.46 sec); median 565,806,972 cycles, 1x: 565,806,972 cycles + - FORS signing.. avg. 64785.35 us (0.06 sec); median 74,067,633 cycles, 1x: 74,067,633 cycles + - WOTS pk gen x2.. avg. 4905.81 us (0.00 sec); median 5,576,447 cycles, 88x: 490,727,336 cycles +Verifying.. avg. 
30281.10 us (0.03 sec); median 34,604,433 cycles, 1x: 34,604,433 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192f-simple_BAS b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192f-simple_BAS new file mode 100644 index 0000000..32d332d --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192f-simple_BAS @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16 +Running 10 iterations. +thash avg. 2.19 us (0.00 sec); median 2,435 cycles, 1x: 2,435 cycles +f1600x2 avg. 2.14 us (0.00 sec); median 2,365 cycles, 1x: 2,365 cycles +thashx2 avg. 2.15 us (0.00 sec); median 2,427 cycles, 1x: 2,427 cycles +Generating keypair.. avg. 7208.56 us (0.01 sec); median 8,003,269 cycles, 1x: 8,003,269 cycles + - WOTS pk gen 2x.. avg. 1802.68 us (0.00 sec); median 1,989,558 cycles, 4x: 7,958,232 cycles +Signing.. avg. 183763.15 us (0.18 sec); median 209,269,741 cycles, 1x: 209,269,741 cycles + - FORS signing.. avg. 27300.34 us (0.03 sec); median 30,991,685 cycles, 1x: 30,991,685 cycles + - WOTS pk gen x2.. avg. 1779.92 us (0.00 sec); median 1,994,873 cycles, 88x: 175,548,824 cycles +Verifying.. avg. 10475.81 us (0.01 sec); median 11,940,193 cycles, 1x: 11,940,193 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192f-simple_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192f-simple_C new file mode 100644 index 0000000..ecbb1d5 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192f-simple_C @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16 +Running 10 iterations. +thash avg. 2.83 us (0.00 sec); median 3,186 cycles, 1x: 3,186 cycles +f1600x2 avg. 
2.69 us (0.00 sec); median 3,026 cycles, 1x: 3,026 cycles +thashx2 avg. 2.79 us (0.00 sec); median 3,156 cycles, 1x: 3,156 cycles +Generating keypair.. avg. 9211.41 us (0.01 sec); median 10,445,019 cycles, 1x: 10,445,019 cycles + - WOTS pk gen 2x.. avg. 2309.28 us (0.00 sec); median 2,606,125 cycles, 4x: 10,424,500 cycles +Signing.. avg. 237912.74 us (0.24 sec); median 271,303,492 cycles, 1x: 271,303,492 cycles + - FORS signing.. avg. 35703.76 us (0.04 sec); median 40,474,380 cycles, 1x: 40,474,380 cycles + - WOTS pk gen x2.. avg. 2338.78 us (0.00 sec); median 2,625,955 cycles, 88x: 231,084,040 cycles +Verifying.. avg. 13956.52 us (0.01 sec); median 15,857,415 cycles, 1x: 15,857,415 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192f-simple_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192f-simple_COTHANV8 new file mode 100644 index 0000000..2f2da29 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192f-simple_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16 +Running 10 iterations. +thash avg. 2.67 us (0.00 sec); median 3,532 cycles, 1x: 3,532 cycles +f1600x2 avg. 2.63 us (0.00 sec); median 3,491 cycles, 1x: 3,491 cycles +thashx2 avg. 2.65 us (0.00 sec); median 3,522 cycles, 1x: 3,522 cycles +Generating keypair.. avg. 8736.30 us (0.01 sec); median 11,644,260 cycles, 1x: 11,644,260 cycles + - WOTS pk gen 2x.. avg. 2190.71 us (0.00 sec); median 2,906,731 cycles, 4x: 11,626,924 cycles +Signing.. avg. 248896.59 us (0.25 sec); median 301,595,363 cycles, 1x: 301,595,363 cycles + - FORS signing.. avg. 39424.16 us (0.04 sec); median 45,104,878 cycles, 1x: 45,104,878 cycles + - WOTS pk gen x2.. avg. 2578.05 us (0.00 sec); median 2,926,444 cycles, 88x: 257,527,072 cycles +Verifying.. avg. 
15663.20 us (0.02 sec); median 17,906,290 cycles, 1x: 17,906,290 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192s-robust_BAS b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192s-robust_BAS new file mode 100644 index 0000000..01b48e4 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192s-robust_BAS @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16 +Running 10 iterations. +thash avg. 3.64 us (0.00 sec); median 4,883 cycles, 1x: 4,883 cycles +f1600x2 avg. 1.82 us (0.00 sec); median 2,425 cycles, 1x: 2,425 cycles +thashx2 avg. 3.64 us (0.00 sec); median 4,826 cycles, 1x: 4,826 cycles +Generating keypair.. avg. 813970.88 us (0.81 sec); median 974,072,140 cycles, 1x: 974,072,140 cycles + - WOTS pk gen 2x.. avg. 2844.54 us (0.00 sec); median 3,785,598 cycles, 256x: 969,113,088 cycles +Signing.. avg. 7150757.20 us (7.15 sec); median 8,488,403,457 cycles, 1x: 8,488,403,457 cycles + - FORS signing.. avg. 1391356.47 us (1.39 sec); median 1,668,885,013 cycles, 1x: 1,668,885,013 cycles + - WOTS pk gen x2.. avg. 3380.43 us (0.00 sec); median 3,843,649 cycles, 1792x: 6,887,819,008 cycles +Verifying.. avg. 7503.65 us (0.01 sec); median 8,544,681 cycles, 1x: 8,544,681 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192s-robust_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192s-robust_C new file mode 100644 index 0000000..7667ba6 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192s-robust_C @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16 +Running 10 iterations. +thash avg. 4.15 us (0.00 sec); median 6,281 cycles, 1x: 6,281 cycles +f1600x2 avg. 
2.02 us (0.00 sec); median 3,055 cycles, 1x: 3,055 cycles +thashx2 avg. 4.13 us (0.00 sec); median 6,267 cycles, 1x: 6,267 cycles +Generating keypair.. avg. 1049897.72 us (1.05 sec); median 1,277,271,865 cycles, 1x: 1,277,271,865 cycles + - WOTS pk gen 2x.. avg. 4353.77 us (0.00 sec); median 4,983,259 cycles, 256x: 1,275,714,304 cycles +Signing.. avg. 9379440.52 us (9.38 sec); median 11,110,993,995 cycles, 1x: 11,110,993,995 cycles + - FORS signing.. avg. 1825910.98 us (1.83 sec); median 2,180,684,925 cycles, 1x: 2,180,684,925 cycles + - WOTS pk gen x2.. avg. 3736.32 us (0.00 sec); median 4,970,309 cycles, 1792x: 8,906,793,728 cycles +Verifying.. avg. 8436.82 us (0.01 sec); median 11,250,845 cycles, 1x: 11,250,845 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192s-robust_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192s-robust_COTHANV8 new file mode 100644 index 0000000..bb62929 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192s-robust_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16 +Running 10 iterations. +thash avg. 5.24 us (0.00 sec); median 6,973 cycles, 1x: 6,973 cycles +f1600x2 avg. 2.63 us (0.00 sec); median 3,491 cycles, 1x: 3,491 cycles +thashx2 avg. 5.22 us (0.00 sec); median 6,990 cycles, 1x: 6,990 cycles +Generating keypair.. avg. 1214607.32 us (1.21 sec); median 1,427,209,076 cycles, 1x: 1,427,209,076 cycles + - WOTS pk gen 2x.. avg. 4893.92 us (0.00 sec); median 5,567,299 cycles, 256x: 1,425,228,544 cycles +Signing.. avg. 10538131.80 us (10.54 sec); median 12,429,916,791 cycles, 1x: 12,429,916,791 cycles + - FORS signing.. avg. 2037482.16 us (2.04 sec); median 2,440,268,429 cycles, 1x: 2,440,268,429 cycles + - WOTS pk gen x2.. avg. 
4905.58 us (0.00 sec); median 5,574,035 cycles, 1792x: 9,988,670,720 cycles +Verifying.. avg. 11003.35 us (0.01 sec); median 12,562,317 cycles, 1x: 12,562,317 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192s-simple_BAS b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192s-simple_BAS new file mode 100644 index 0000000..b84f949 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192s-simple_BAS @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16 +Running 10 iterations. +thash avg. 1.46 us (0.00 sec); median 2,480 cycles, 1x: 2,480 cycles +f1600x2 avg. 1.39 us (0.00 sec); median 2,366 cycles, 1x: 2,366 cycles +thashx2 avg. 1.46 us (0.00 sec); median 2,498 cycles, 1x: 2,498 cycles +Generating keypair.. avg. 397166.24 us (0.40 sec); median 510,795,717 cycles, 1x: 510,795,717 cycles + - WOTS pk gen 2x.. avg. 1753.86 us (0.00 sec); median 1,986,536 cycles, 256x: 508,553,216 cycles +Signing.. avg. 3875931.92 us (3.88 sec); median 4,605,518,726 cycles, 1x: 4,605,518,726 cycles + - FORS signing.. avg. 863133.66 us (0.86 sec); median 1,023,830,233 cycles, 1x: 1,023,830,233 cycles + - WOTS pk gen x2.. avg. 1802.43 us (0.00 sec); median 2,042,866 cycles, 1792x: 3,660,815,872 cycles +Verifying.. avg. 3896.01 us (0.00 sec); median 4,423,228 cycles, 1x: 4,423,228 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192s-simple_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192s-simple_C new file mode 100644 index 0000000..69c2a68 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192s-simple_C @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16 +Running 10 iterations. 
+thash avg. 2.41 us (0.00 sec); median 3,188 cycles, 1x: 3,188 cycles +f1600x2 avg. 2.30 us (0.00 sec); median 3,048 cycles, 1x: 3,048 cycles +thashx2 avg. 2.38 us (0.00 sec); median 3,160 cycles, 1x: 3,160 cycles +Generating keypair.. avg. 580528.02 us (0.58 sec); median 668,692,104 cycles, 1x: 668,692,104 cycles + - WOTS pk gen 2x.. avg. 2356.23 us (0.00 sec); median 2,610,399 cycles, 256x: 668,262,144 cycles +Signing.. avg. 5179391.49 us (5.18 sec); median 6,011,879,851 cycles, 1x: 6,011,879,851 cycles + - FORS signing.. avg. 1126435.12 us (1.13 sec); median 1,327,367,716 cycles, 1x: 1,327,367,716 cycles + - WOTS pk gen x2.. avg. 2299.18 us (0.00 sec); median 2,606,838 cycles, 1792x: 4,671,453,696 cycles +Verifying.. avg. 5000.49 us (0.01 sec); median 5,691,913 cycles, 1x: 5,691,913 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192s-simple_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192s-simple_COTHANV8 new file mode 100644 index 0000000..6c3f206 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-192s-simple_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16 +Running 10 iterations. +thash avg. 2.68 us (0.00 sec); median 3,548 cycles, 1x: 3,548 cycles +f1600x2 avg. 2.63 us (0.00 sec); median 3,492 cycles, 1x: 3,492 cycles +thashx2 avg. 2.68 us (0.00 sec); median 3,557 cycles, 1x: 3,557 cycles +Generating keypair.. avg. 616556.15 us (0.62 sec); median 745,143,259 cycles, 1x: 745,143,259 cycles + - WOTS pk gen 2x.. avg. 1701.95 us (0.00 sec); median 2,907,491 cycles, 256x: 744,317,696 cycles +Signing.. avg. 5654741.20 us (5.65 sec); median 6,695,650,223 cycles, 1x: 6,695,650,223 cycles + - FORS signing.. avg. 1241094.14 us (1.24 sec); median 1,479,366,425 cycles, 1x: 1,479,366,425 cycles + - WOTS pk gen x2.. avg. 
2560.65 us (0.00 sec); median 2,909,141 cycles, 1792x: 5,213,180,672 cycles +Verifying.. avg. 5682.68 us (0.01 sec); median 6,470,081 cycles, 1x: 6,470,081 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256f-robust_BAS b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256f-robust_BAS new file mode 100644 index 0000000..7984d63 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256f-robust_BAS @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16 +Running 10 iterations. +thash avg. 2.83 us (0.00 sec); median 4,859 cycles, 1x: 4,859 cycles +f1600x2 avg. 1.37 us (0.00 sec); median 2,339 cycles, 1x: 2,339 cycles +thashx2 avg. 2.82 us (0.00 sec); median 4,838 cycles, 1x: 4,838 cycles +Generating keypair.. avg. 24106.57 us (0.02 sec); median 41,392,164 cycles, 1x: 41,392,164 cycles + - WOTS pk gen 2x.. avg. 3024.61 us (0.00 sec); median 5,169,728 cycles, 8x: 41,357,824 cycles +Signing.. avg. 635793.13 us (0.64 sec); median 804,023,086 cycles, 1x: 804,023,086 cycles + - FORS signing.. avg. 95399.71 us (0.10 sec); median 109,345,220 cycles, 1x: 109,345,220 cycles + - WOTS pk gen x2.. avg. 4463.54 us (0.00 sec); median 5,079,375 cycles, 136x: 690,795,000 cycles +Verifying.. avg. 21827.31 us (0.02 sec); median 24,824,402 cycles, 1x: 24,824,402 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256f-robust_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256f-robust_C new file mode 100644 index 0000000..83f980a --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256f-robust_C @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16 +Running 10 iterations. +thash avg. 
4.74 us (0.00 sec); median 6,308 cycles, 1x: 6,308 cycles +f1600x2 avg. 2.28 us (0.00 sec); median 3,030 cycles, 1x: 3,030 cycles +thashx2 avg. 4.71 us (0.00 sec); median 6,285 cycles, 1x: 6,285 cycles +Generating keypair.. avg. 39762.44 us (0.04 sec); median 53,078,081 cycles, 1x: 53,078,081 cycles + - WOTS pk gen 2x.. avg. 4970.74 us (0.00 sec); median 6,617,998 cycles, 8x: 52,943,984 cycles +Signing.. avg. 849411.70 us (0.85 sec); median 1,038,326,254 cycles, 1x: 1,038,326,254 cycles + - FORS signing.. avg. 123752.32 us (0.12 sec); median 141,226,706 cycles, 1x: 141,226,706 cycles + - WOTS pk gen x2.. avg. 5812.68 us (0.01 sec); median 6,614,931 cycles, 136x: 899,630,616 cycles +Verifying.. avg. 28193.93 us (0.03 sec); median 32,139,636 cycles, 1x: 32,139,636 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256f-robust_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256f-robust_COTHANV8 new file mode 100644 index 0000000..ac397e3 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256f-robust_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16 +Running 10 iterations. +thash avg. 4.62 us (0.00 sec); median 7,043 cycles, 1x: 7,043 cycles +f1600x2 avg. 2.22 us (0.00 sec); median 3,379 cycles, 1x: 3,379 cycles +thashx2 avg. 4.60 us (0.00 sec); median 7,011 cycles, 1x: 7,011 cycles +Generating keypair.. avg. 38611.80 us (0.04 sec); median 59,005,014 cycles, 1x: 59,005,014 cycles + - WOTS pk gen 2x.. avg. 4837.87 us (0.00 sec); median 7,369,360 cycles, 8x: 58,954,880 cycles +Signing.. avg. 987379.04 us (0.99 sec); median 1,162,317,679 cycles, 1x: 1,162,317,679 cycles + - FORS signing.. avg. 132869.28 us (0.13 sec); median 157,766,531 cycles, 1x: 157,766,531 cycles + - WOTS pk gen x2.. avg. 
6491.75 us (0.01 sec); median 7,397,088 cycles, 136x: 1,006,003,968 cycles +Verifying.. avg. 31279.92 us (0.03 sec); median 35,757,652 cycles, 1x: 35,757,652 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256f-simple_BAS b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256f-simple_BAS new file mode 100644 index 0000000..cba46fa --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256f-simple_BAS @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16 +Running 10 iterations. +thash avg. 1.86 us (0.00 sec); median 2,432 cycles, 1x: 2,432 cycles +f1600x2 avg. 1.76 us (0.00 sec); median 2,336 cycles, 1x: 2,336 cycles +thashx2 avg. 1.89 us (0.00 sec); median 2,504 cycles, 1x: 2,504 cycles +Generating keypair.. avg. 16372.14 us (0.02 sec); median 21,834,411 cycles, 1x: 21,834,411 cycles + - WOTS pk gen 2x.. avg. 2057.85 us (0.00 sec); median 2,730,154 cycles, 8x: 21,841,232 cycles +Signing.. avg. 366892.12 us (0.37 sec); median 436,117,096 cycles, 1x: 436,117,096 cycles + - FORS signing.. avg. 58836.26 us (0.06 sec); median 67,604,993 cycles, 1x: 67,604,993 cycles + - WOTS pk gen x2.. avg. 2342.52 us (0.00 sec); median 2,660,972 cycles, 136x: 361,892,192 cycles +Verifying.. avg. 11081.04 us (0.01 sec); median 12,768,863 cycles, 1x: 12,768,863 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256f-simple_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256f-simple_C new file mode 100644 index 0000000..24f7c63 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256f-simple_C @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16 +Running 10 iterations. +thash avg. 
2.47 us (0.00 sec); median 3,202 cycles, 1x: 3,202 cycles +f1600x2 avg. 2.28 us (0.00 sec); median 3,020 cycles, 1x: 3,020 cycles +thashx2 avg. 2.41 us (0.00 sec); median 3,199 cycles, 1x: 3,199 cycles +Generating keypair.. avg. 20774.63 us (0.02 sec); median 27,745,855 cycles, 1x: 27,745,855 cycles + - WOTS pk gen 2x.. avg. 2606.49 us (0.00 sec); median 3,459,251 cycles, 8x: 27,674,008 cycles +Signing.. avg. 458334.20 us (0.46 sec); median 558,301,104 cycles, 1x: 558,301,104 cycles + - FORS signing.. avg. 75450.09 us (0.08 sec); median 86,183,116 cycles, 1x: 86,183,116 cycles + - WOTS pk gen x2.. avg. 3046.30 us (0.00 sec); median 3,466,757 cycles, 136x: 471,478,952 cycles +Verifying.. avg. 14945.38 us (0.01 sec); median 17,042,434 cycles, 1x: 17,042,434 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256f-simple_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256f-simple_COTHANV8 new file mode 100644 index 0000000..6a713b7 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256f-simple_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16 +Running 10 iterations. +thash avg. 2.35 us (0.00 sec); median 3,570 cycles, 1x: 3,570 cycles +f1600x2 avg. 2.23 us (0.00 sec); median 3,381 cycles, 1x: 3,381 cycles +thashx2 avg. 2.36 us (0.00 sec); median 3,584 cycles, 1x: 3,584 cycles +Generating keypair.. avg. 20409.38 us (0.02 sec); median 31,137,366 cycles, 1x: 31,137,366 cycles + - WOTS pk gen 2x.. avg. 2559.71 us (0.00 sec); median 3,889,525 cycles, 8x: 31,116,200 cycles +Signing.. avg. 519286.67 us (0.52 sec); median 622,010,413 cycles, 1x: 622,010,413 cycles + - FORS signing.. avg. 62946.42 us (0.06 sec); median 96,232,980 cycles, 1x: 96,232,980 cycles + - WOTS pk gen x2.. avg. 
2542.52 us (0.00 sec); median 3,863,704 cycles, 136x: 525,463,744 cycles +Verifying.. avg. 12099.27 us (0.01 sec); median 18,409,983 cycles, 1x: 18,409,983 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256s-robust_BAS b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256s-robust_BAS new file mode 100644 index 0000000..c556453 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256s-robust_BAS @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16 +Running 10 iterations. +thash avg. 2.87 us (0.00 sec); median 4,899 cycles, 1x: 4,899 cycles +f1600x2 avg. 1.37 us (0.00 sec); median 2,339 cycles, 1x: 2,339 cycles +thashx2 avg. 2.81 us (0.00 sec); median 4,815 cycles, 1x: 4,815 cycles +Generating keypair.. avg. 492361.00 us (0.49 sec); median 651,373,952 cycles, 1x: 651,373,952 cycles + - WOTS pk gen 2x.. avg. 4482.11 us (0.00 sec); median 5,083,875 cycles, 128x: 650,736,000 cycles +Signing.. avg. 6217999.05 us (6.22 sec); median 7,341,474,565 cycles, 1x: 7,341,474,565 cycles + - FORS signing.. avg. 1874192.41 us (1.87 sec); median 2,174,870,319 cycles, 1x: 2,174,870,319 cycles + - WOTS pk gen x2.. avg. 4472.69 us (0.00 sec); median 5,068,250 cycles, 1024x: 5,189,888,000 cycles +Verifying.. avg. 11121.23 us (0.01 sec); median 12,677,353 cycles, 1x: 12,677,353 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256s-robust_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256s-robust_C new file mode 100644 index 0000000..f057f0c --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256s-robust_C @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16 +Running 10 iterations. 
+thash avg. 4.70 us (0.00 sec); median 6,249 cycles, 1x: 6,249 cycles +f1600x2 avg. 2.28 us (0.00 sec); median 3,021 cycles, 1x: 3,021 cycles +thashx2 avg. 4.69 us (0.00 sec); median 6,242 cycles, 1x: 6,242 cycles +Generating keypair.. avg. 722874.96 us (0.72 sec); median 844,531,395 cycles, 1x: 844,531,395 cycles + - WOTS pk gen 2x.. avg. 5775.89 us (0.01 sec); median 6,585,409 cycles, 128x: 842,932,352 cycles +Signing.. avg. 8098386.81 us (8.10 sec); median 9,595,055,219 cycles, 1x: 9,595,055,219 cycles + - FORS signing.. avg. 2397540.31 us (2.40 sec); median 2,838,308,845 cycles, 1x: 2,838,308,845 cycles + - WOTS pk gen x2.. avg. 5830.34 us (0.01 sec); median 6,613,039 cycles, 1024x: 6,771,751,936 cycles +Verifying.. avg. 13834.36 us (0.01 sec); median 15,760,040 cycles, 1x: 15,760,040 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256s-robust_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256s-robust_COTHANV8 new file mode 100644 index 0000000..90cfaa3 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256s-robust_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16 +Running 10 iterations. +thash avg. 4.10 us (0.00 sec); median 7,023 cycles, 1x: 7,023 cycles +f1600x2 avg. 1.98 us (0.00 sec); median 3,380 cycles, 1x: 3,380 cycles +thashx2 avg. 4.09 us (0.00 sec); median 7,018 cycles, 1x: 7,018 cycles +Generating keypair.. avg. 746820.71 us (0.75 sec); median 945,510,714 cycles, 1x: 945,510,714 cycles + - WOTS pk gen 2x.. avg. 6553.89 us (0.01 sec); median 7,392,392 cycles, 128x: 946,226,176 cycles +Signing.. avg. 9131211.55 us (9.13 sec); median 10,727,353,639 cycles, 1x: 10,727,353,639 cycles + - FORS signing.. avg. 2601384.47 us (2.60 sec); median 3,165,325,230 cycles, 1x: 3,165,325,230 cycles + - WOTS pk gen x2.. avg. 
6512.59 us (0.01 sec); median 7,407,507 cycles, 1024x: 7,585,287,168 cycles +Verifying.. avg. 16284.55 us (0.02 sec); median 18,564,368 cycles, 1x: 18,564,368 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256s-simple_BAS b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256s-simple_BAS new file mode 100644 index 0000000..9e9a475 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256s-simple_BAS @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16 +Running 10 iterations. +thash avg. 1.87 us (0.00 sec); median 2,479 cycles, 1x: 2,479 cycles +f1600x2 avg. 1.76 us (0.00 sec); median 2,339 cycles, 1x: 2,339 cycles +thashx2 avg. 1.89 us (0.00 sec); median 2,515 cycles, 1x: 2,515 cycles +Generating keypair.. avg. 283606.44 us (0.28 sec); median 345,185,284 cycles, 1x: 345,185,284 cycles + - WOTS pk gen 2x.. avg. 2335.40 us (0.00 sec); median 2,650,614 cycles, 128x: 339,278,592 cycles +Signing.. avg. 3338629.86 us (3.34 sec); median 4,074,711,137 cycles, 1x: 4,074,711,137 cycles + - FORS signing.. avg. 1109032.05 us (1.11 sec); median 1,344,961,045 cycles, 1x: 1,344,961,045 cycles + - WOTS pk gen x2.. avg. 2372.53 us (0.00 sec); median 2,692,133 cycles, 1024x: 2,756,744,192 cycles +Verifying.. avg. 5697.93 us (0.01 sec); median 6,518,775 cycles, 1x: 6,518,775 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256s-simple_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256s-simple_C new file mode 100644 index 0000000..953e71b --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256s-simple_C @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16 +Running 10 iterations. 
+thash avg. 2.10 us (0.00 sec); median 3,186 cycles, 1x: 3,186 cycles +f1600x2 avg. 1.98 us (0.00 sec); median 3,013 cycles, 1x: 3,013 cycles +thashx2 avg. 2.10 us (0.00 sec); median 3,184 cycles, 1x: 3,184 cycles +Generating keypair.. avg. 353918.31 us (0.35 sec); median 443,981,075 cycles, 1x: 443,981,075 cycles + - WOTS pk gen 2x.. avg. 3046.66 us (0.00 sec); median 3,465,580 cycles, 128x: 443,594,240 cycles +Signing.. avg. 4476406.99 us (4.48 sec); median 5,279,964,627 cycles, 1x: 5,279,964,627 cycles + - FORS signing.. avg. 1497181.58 us (1.50 sec); median 1,732,279,853 cycles, 1x: 1,732,279,853 cycles + - WOTS pk gen x2.. avg. 3047.23 us (0.00 sec); median 3,465,716 cycles, 1024x: 3,548,893,184 cycles +Verifying.. avg. 7593.68 us (0.01 sec); median 8,662,172 cycles, 1x: 8,662,172 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256s-simple_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256s-simple_COTHANV8 new file mode 100644 index 0000000..3b98c00 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A510/sphincs-shake-256s-simple_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16 +Running 10 iterations. +thash avg. 2.71 us (0.00 sec); median 3,574 cycles, 1x: 3,574 cycles +f1600x2 avg. 2.55 us (0.00 sec); median 3,395 cycles, 1x: 3,395 cycles +thashx2 avg. 2.66 us (0.00 sec); median 3,537 cycles, 1x: 3,537 cycles +Generating keypair.. avg. 419340.08 us (0.42 sec); median 496,540,922 cycles, 1x: 496,540,922 cycles + - WOTS pk gen 2x.. avg. 3435.18 us (0.00 sec); median 3,867,365 cycles, 128x: 495,022,720 cycles +Signing.. avg. 4972352.88 us (4.97 sec); median 5,887,889,602 cycles, 1x: 5,887,889,602 cycles + - FORS signing.. avg. 1621482.37 us (1.62 sec); median 1,931,117,786 cycles, 1x: 1,931,117,786 cycles + - WOTS pk gen x2.. avg. 
3409.20 us (0.00 sec); median 3,862,557 cycles, 1024x: 3,955,258,368 cycles +Verifying.. avg. 8413.96 us (0.01 sec); median 9,558,527 cycles, 1x: 9,558,527 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-128f-robust_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-128f-robust_C new file mode 100644 index 0000000..b92dc6f --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-128f-robust_C @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16 +Running 10 iterations. +thash avg. 4.60 us (0.00 sec); median 8,211 cycles, 1x: 8,211 cycles +f1600x2 avg. 2.27 us (0.00 sec); median 4,040 cycles, 1x: 4,040 cycles +thashx2 avg. 4.60 us (0.00 sec); median 8,201 cycles, 1x: 8,201 cycles +Generating keypair.. avg. 10099.71 us (0.01 sec); median 18,035,472 cycles, 1x: 18,035,472 cycles + - WOTS pk gen 2x.. avg. 2530.96 us (0.00 sec); median 4,502,918 cycles, 4x: 18,011,672 cycles +Signing.. avg. 233780.68 us (0.23 sec); median 418,555,286 cycles, 1x: 418,555,286 cycles + - FORS signing.. avg. 12214.86 us (0.01 sec); median 21,852,625 cycles, 1x: 21,852,625 cycles + - WOTS pk gen x2.. avg. 2532.93 us (0.00 sec); median 4,502,474 cycles, 88x: 396,217,712 cycles +Verifying.. avg. 15288.91 us (0.02 sec); median 27,322,293 cycles, 1x: 27,322,293 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-128f-robust_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-128f-robust_COTHANV8 new file mode 100644 index 0000000..338f7e7 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-128f-robust_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16 +Running 10 iterations. +thash avg. 
5.98 us (0.00 sec); median 10,675 cycles, 1x: 10,675 cycles +f1600x2 avg. 2.93 us (0.00 sec); median 5,225 cycles, 1x: 5,225 cycles +thashx2 avg. 5.97 us (0.00 sec); median 10,668 cycles, 1x: 10,668 cycles +Generating keypair.. avg. 13123.88 us (0.01 sec); median 23,443,923 cycles, 1x: 23,443,923 cycles + - WOTS pk gen 2x.. avg. 3287.25 us (0.00 sec); median 5,852,619 cycles, 4x: 23,410,476 cycles +Signing.. avg. 303884.72 us (0.30 sec); median 544,202,850 cycles, 1x: 544,202,850 cycles + - FORS signing.. avg. 15888.37 us (0.02 sec); median 28,416,149 cycles, 1x: 28,416,149 cycles + - WOTS pk gen x2.. avg. 3285.71 us (0.00 sec); median 5,852,521 cycles, 88x: 515,021,848 cycles +Verifying.. avg. 20699.83 us (0.02 sec); median 37,017,382 cycles, 1x: 37,017,382 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-128f-simple_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-128f-simple_C new file mode 100644 index 0000000..3aaf0d0 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-128f-simple_C @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16 +Running 10 iterations. +thash avg. 2.29 us (0.00 sec); median 4,083 cycles, 1x: 4,083 cycles +f1600x2 avg. 2.27 us (0.00 sec); median 4,040 cycles, 1x: 4,040 cycles +thashx2 avg. 2.29 us (0.00 sec); median 4,073 cycles, 1x: 4,073 cycles +Generating keypair.. avg. 5208.05 us (0.01 sec); median 9,242,216 cycles, 1x: 9,242,216 cycles + - WOTS pk gen 2x.. avg. 1306.08 us (0.00 sec); median 2,304,914 cycles, 4x: 9,219,656 cycles +Signing.. avg. 120879.93 us (0.12 sec); median 216,405,191 cycles, 1x: 216,405,191 cycles + - FORS signing.. avg. 7336.03 us (0.01 sec); median 13,100,258 cycles, 1x: 13,100,258 cycles + - WOTS pk gen x2.. avg. 1299.21 us (0.00 sec); median 2,305,173 cycles, 88x: 202,855,224 cycles +Verifying.. avg. 
7728.33 us (0.01 sec); median 13,784,058 cycles, 1x: 13,784,058 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-128f-simple_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-128f-simple_COTHANV8 new file mode 100644 index 0000000..8ade1c9 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-128f-simple_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16 +Running 10 iterations. +thash avg. 3.01 us (0.00 sec); median 5,356 cycles, 1x: 5,356 cycles +f1600x2 avg. 2.93 us (0.00 sec); median 5,225 cycles, 1x: 5,225 cycles +thashx2 avg. 3.00 us (0.00 sec); median 5,346 cycles, 1x: 5,346 cycles +Generating keypair.. avg. 6803.46 us (0.01 sec); median 12,126,559 cycles, 1x: 12,126,559 cycles + - WOTS pk gen 2x.. avg. 1704.43 us (0.00 sec); median 3,026,208 cycles, 4x: 12,104,832 cycles +Signing.. avg. 158511.28 us (0.16 sec); median 283,887,234 cycles, 1x: 283,887,234 cycles + - FORS signing.. avg. 9600.50 us (0.01 sec); median 17,146,789 cycles, 1x: 17,146,789 cycles + - WOTS pk gen x2.. avg. 1706.30 us (0.00 sec); median 3,026,696 cycles, 88x: 266,349,248 cycles +Verifying.. avg. 10373.90 us (0.01 sec); median 18,519,691 cycles, 1x: 18,519,691 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-128s-robust_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-128s-robust_C new file mode 100644 index 0000000..f171482 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-128s-robust_C @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16 +Running 10 iterations. +thash avg. 4.60 us (0.00 sec); median 8,212 cycles, 1x: 8,212 cycles +f1600x2 avg. 
2.27 us (0.00 sec); median 4,035 cycles, 1x: 4,035 cycles +thashx2 avg. 4.60 us (0.00 sec); median 8,200 cycles, 1x: 8,200 cycles +Generating keypair.. avg. 644202.77 us (0.64 sec); median 1,153,926,672 cycles, 1x: 1,153,926,672 cycles + - WOTS pk gen 2x.. avg. 2527.59 us (0.00 sec); median 4,502,362 cycles, 256x: 1,152,604,672 cycles +Signing.. avg. 4838973.00 us (4.84 sec); median 8,667,372,131 cycles, 1x: 8,667,372,131 cycles + - FORS signing.. avg. 329170.57 us (0.33 sec); median 589,753,790 cycles, 1x: 589,753,790 cycles + - WOTS pk gen x2.. avg. 2530.01 us (0.00 sec); median 4,502,365 cycles, 1792x: 8,068,238,080 cycles +Verifying.. avg. 5842.77 us (0.01 sec); median 10,415,159 cycles, 1x: 10,415,159 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-128s-robust_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-128s-robust_COTHANV8 new file mode 100644 index 0000000..18ca155 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-128s-robust_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16 +Running 10 iterations. +thash avg. 5.98 us (0.00 sec); median 10,681 cycles, 1x: 10,681 cycles +f1600x2 avg. 2.93 us (0.00 sec); median 5,228 cycles, 1x: 5,228 cycles +thashx2 avg. 5.97 us (0.00 sec); median 10,670 cycles, 1x: 10,670 cycles +Generating keypair.. avg. 837665.89 us (0.84 sec); median 1,500,185,859 cycles, 1x: 1,500,185,859 cycles + - WOTS pk gen 2x.. avg. 3287.81 us (0.00 sec); median 5,850,675 cycles, 256x: 1,497,772,800 cycles +Signing.. avg. 6292136.24 us (6.29 sec); median 11,269,260,233 cycles, 1x: 11,269,260,233 cycles + - FORS signing.. avg. 428706.40 us (0.43 sec); median 767,895,974 cycles, 1x: 767,895,974 cycles + - WOTS pk gen x2.. avg. 3287.02 us (0.00 sec); median 5,851,291 cycles, 1792x: 10,485,513,472 cycles +Verifying.. avg. 
7456.03 us (0.01 sec); median 13,300,844 cycles, 1x: 13,300,844 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-128s-simple_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-128s-simple_C new file mode 100644 index 0000000..344cb7c --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-128s-simple_C @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16 +Running 10 iterations. +thash avg. 2.30 us (0.00 sec); median 4,083 cycles, 1x: 4,083 cycles +f1600x2 avg. 2.26 us (0.00 sec); median 4,036 cycles, 1x: 4,036 cycles +thashx2 avg. 2.29 us (0.00 sec); median 4,073 cycles, 1x: 4,073 cycles +Generating keypair.. avg. 330255.48 us (0.33 sec); median 591,355,982 cycles, 1x: 591,355,982 cycles + - WOTS pk gen 2x.. avg. 1306.98 us (0.00 sec); median 2,304,683 cycles, 256x: 589,998,848 cycles +Signing.. avg. 2508722.31 us (2.51 sec); median 4,492,335,768 cycles, 1x: 4,492,335,768 cycles + - FORS signing.. avg. 196982.35 us (0.20 sec); median 352,797,800 cycles, 1x: 352,797,800 cycles + - WOTS pk gen x2.. avg. 1304.25 us (0.00 sec); median 2,304,666 cycles, 1792x: 4,129,961,472 cycles +Verifying.. avg. 3079.11 us (0.00 sec); median 5,479,870 cycles, 1x: 5,479,870 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-128s-simple_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-128s-simple_COTHANV8 new file mode 100644 index 0000000..5d1cfc6 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-128s-simple_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16 +Running 10 iterations. +thash avg. 3.01 us (0.00 sec); median 5,358 cycles, 1x: 5,358 cycles +f1600x2 avg. 
2.94 us (0.00 sec); median 5,224 cycles, 1x: 5,224 cycles +thashx2 avg. 3.00 us (0.00 sec); median 5,347 cycles, 1x: 5,347 cycles +Generating keypair.. avg. 433333.75 us (0.43 sec); median 775,978,405 cycles, 1x: 775,978,405 cycles + - WOTS pk gen 2x.. avg. 1708.80 us (0.00 sec); median 3,026,282 cycles, 256x: 774,728,192 cycles +Signing.. avg. 3291387.08 us (3.29 sec); median 5,894,193,655 cycles, 1x: 5,894,193,655 cycles + - FORS signing.. avg. 258080.68 us (0.26 sec); median 462,280,809 cycles, 1x: 462,280,809 cycles + - WOTS pk gen x2.. avg. 1707.11 us (0.00 sec); median 3,026,525 cycles, 1792x: 5,423,532,800 cycles +Verifying.. avg. 3717.17 us (0.00 sec); median 6,605,630 cycles, 1x: 6,605,630 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-192f-robust_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-192f-robust_C new file mode 100644 index 0000000..ab1729e --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-192f-robust_C @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16 +Running 10 iterations. +thash avg. 4.62 us (0.00 sec); median 8,234 cycles, 1x: 8,234 cycles +f1600x2 avg. 2.27 us (0.00 sec); median 4,037 cycles, 1x: 4,037 cycles +thashx2 avg. 4.61 us (0.00 sec); median 8,221 cycles, 1x: 8,221 cycles +Generating keypair.. avg. 14774.57 us (0.01 sec); median 26,394,040 cycles, 1x: 26,394,040 cycles + - WOTS pk gen 2x.. avg. 3704.17 us (0.00 sec); median 6,591,620 cycles, 4x: 26,366,480 cycles +Signing.. avg. 372905.51 us (0.37 sec); median 667,807,783 cycles, 1x: 667,807,783 cycles + - FORS signing.. avg. 48683.93 us (0.05 sec); median 87,190,967 cycles, 1x: 87,190,967 cycles + - WOTS pk gen x2.. avg. 3701.18 us (0.00 sec); median 6,590,975 cycles, 88x: 580,005,800 cycles +Verifying.. avg. 
23099.58 us (0.02 sec); median 41,330,950 cycles, 1x: 41,330,950 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-192f-robust_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-192f-robust_COTHANV8 new file mode 100644 index 0000000..70da6e5 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-192f-robust_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16 +Running 10 iterations. +thash avg. 5.98 us (0.00 sec); median 10,690 cycles, 1x: 10,690 cycles +f1600x2 avg. 2.96 us (0.00 sec); median 5,266 cycles, 1x: 5,266 cycles +thashx2 avg. 5.98 us (0.00 sec); median 10,678 cycles, 1x: 10,678 cycles +Generating keypair.. avg. 19173.84 us (0.02 sec); median 34,289,394 cycles, 1x: 34,289,394 cycles + - WOTS pk gen 2x.. avg. 4802.23 us (0.00 sec); median 8,560,589 cycles, 4x: 34,242,356 cycles +Signing.. avg. 484655.09 us (0.48 sec); median 867,847,883 cycles, 1x: 867,847,883 cycles + - FORS signing.. avg. 63373.11 us (0.06 sec); median 113,463,572 cycles, 1x: 113,463,572 cycles + - WOTS pk gen x2.. avg. 4802.17 us (0.00 sec); median 8,561,418 cycles, 88x: 753,404,784 cycles +Verifying.. avg. 30143.48 us (0.03 sec); median 53,918,767 cycles, 1x: 53,918,767 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-192f-simple_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-192f-simple_C new file mode 100644 index 0000000..904b146 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-192f-simple_C @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16 +Running 10 iterations. +thash avg. 2.31 us (0.00 sec); median 4,102 cycles, 1x: 4,102 cycles +f1600x2 avg. 
2.27 us (0.00 sec); median 4,034 cycles, 1x: 4,034 cycles +thashx2 avg. 2.30 us (0.00 sec); median 4,093 cycles, 1x: 4,093 cycles +Generating keypair.. avg. 7611.28 us (0.01 sec); median 13,571,977 cycles, 1x: 13,571,977 cycles + - WOTS pk gen 2x.. avg. 1908.39 us (0.00 sec); median 3,387,856 cycles, 4x: 13,551,424 cycles +Signing.. avg. 195870.05 us (0.20 sec); median 350,817,413 cycles, 1x: 350,817,413 cycles + - FORS signing.. avg. 29194.72 us (0.03 sec); median 52,265,641 cycles, 1x: 52,265,641 cycles + - WOTS pk gen x2.. avg. 1909.81 us (0.00 sec); median 3,391,678 cycles, 88x: 298,467,664 cycles +Verifying.. avg. 11075.95 us (0.01 sec); median 19,784,056 cycles, 1x: 19,784,056 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-192f-simple_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-192f-simple_COTHANV8 new file mode 100644 index 0000000..4957d20 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-192f-simple_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16 +Running 10 iterations. +thash avg. 3.01 us (0.00 sec); median 5,364 cycles, 1x: 5,364 cycles +f1600x2 avg. 2.96 us (0.00 sec); median 5,267 cycles, 1x: 5,267 cycles +thashx2 avg. 3.00 us (0.00 sec); median 5,353 cycles, 1x: 5,353 cycles +Generating keypair.. avg. 9936.92 us (0.01 sec); median 17,755,333 cycles, 1x: 17,755,333 cycles + - WOTS pk gen 2x.. avg. 2490.70 us (0.00 sec); median 4,434,825 cycles, 4x: 17,739,300 cycles +Signing.. avg. 256195.60 us (0.26 sec); median 458,903,092 cycles, 1x: 458,903,092 cycles + - FORS signing.. avg. 38164.91 us (0.04 sec); median 68,351,746 cycles, 1x: 68,351,746 cycles + - WOTS pk gen x2.. avg. 2494.99 us (0.00 sec); median 4,434,250 cycles, 88x: 390,214,000 cycles +Verifying.. avg. 
15397.95 us (0.02 sec); median 27,533,333 cycles, 1x: 27,533,333 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-192s-robust_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-192s-robust_C new file mode 100644 index 0000000..62c711e --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-192s-robust_C @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16 +Running 10 iterations. +thash avg. 4.62 us (0.00 sec); median 8,231 cycles, 1x: 8,231 cycles +f1600x2 avg. 2.27 us (0.00 sec); median 4,040 cycles, 1x: 4,040 cycles +thashx2 avg. 4.62 us (0.00 sec); median 8,221 cycles, 1x: 8,221 cycles +Generating keypair.. avg. 943649.18 us (0.94 sec); median 1,689,838,511 cycles, 1x: 1,689,838,511 cycles + - WOTS pk gen 2x.. avg. 3697.47 us (0.00 sec); median 6,591,562 cycles, 256x: 1,687,439,872 cycles +Signing.. avg. 8207288.52 us (8.21 sec); median 14,698,675,315 cycles, 1x: 14,698,675,315 cycles + - FORS signing.. avg. 1602185.23 us (1.60 sec); median 2,869,827,214 cycles, 1x: 2,869,827,214 cycles + - WOTS pk gen x2.. avg. 3702.84 us (0.00 sec); median 6,591,676 cycles, 1792x: 11,812,283,392 cycles +Verifying.. avg. 8137.84 us (0.01 sec); median 14,513,915 cycles, 1x: 14,513,915 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-192s-robust_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-192s-robust_COTHANV8 new file mode 100644 index 0000000..3454dd6 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-192s-robust_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16 +Running 10 iterations. +thash avg. 
5.99 us (0.00 sec); median 10,693 cycles, 1x: 10,693 cycles +f1600x2 avg. 2.95 us (0.00 sec); median 5,268 cycles, 1x: 5,268 cycles +thashx2 avg. 5.99 us (0.00 sec); median 10,683 cycles, 1x: 10,683 cycles +Generating keypair.. avg. 1224720.20 us (1.22 sec); median 2,193,398,142 cycles, 1x: 2,193,398,142 cycles + - WOTS pk gen 2x.. avg. 4802.45 us (0.00 sec); median 8,562,050 cycles, 256x: 2,191,884,800 cycles +Signing.. avg. 10656253.76 us (10.66 sec); median 19,087,592,024 cycles, 1x: 19,087,592,024 cycles + - FORS signing.. avg. 2084564.07 us (2.08 sec); median 3,733,614,717 cycles, 1x: 3,733,614,717 cycles + - WOTS pk gen x2.. avg. 4808.26 us (0.00 sec); median 8,562,993 cycles, 1792x: 15,344,883,456 cycles +Verifying.. avg. 10749.68 us (0.01 sec); median 19,209,642 cycles, 1x: 19,209,642 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-192s-simple_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-192s-simple_C new file mode 100644 index 0000000..69d5dbf --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-192s-simple_C @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16 +Running 10 iterations. +thash avg. 2.31 us (0.00 sec); median 4,104 cycles, 1x: 4,104 cycles +f1600x2 avg. 2.27 us (0.00 sec); median 4,038 cycles, 1x: 4,038 cycles +thashx2 avg. 2.30 us (0.00 sec); median 4,094 cycles, 1x: 4,094 cycles +Generating keypair.. avg. 484868.32 us (0.48 sec); median 868,301,795 cycles, 1x: 868,301,795 cycles + - WOTS pk gen 2x.. avg. 1912.36 us (0.00 sec); median 3,388,902 cycles, 256x: 867,558,912 cycles +Signing.. avg. 4352792.38 us (4.35 sec); median 7,797,108,440 cycles, 1x: 7,797,108,440 cycles + - FORS signing.. avg. 959530.19 us (0.96 sec); median 1,718,942,429 cycles, 1x: 1,718,942,429 cycles + - WOTS pk gen x2.. avg. 
1911.80 us (0.00 sec); median 3,390,077 cycles, 1792x: 6,075,017,984 cycles +Verifying.. avg. 4172.08 us (0.00 sec); median 7,424,081 cycles, 1x: 7,424,081 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-192s-simple_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-192s-simple_COTHANV8 new file mode 100644 index 0000000..60c05c7 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-192s-simple_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16 +Running 10 iterations. +thash avg. 3.01 us (0.00 sec); median 5,367 cycles, 1x: 5,367 cycles +f1600x2 avg. 2.97 us (0.00 sec); median 5,264 cycles, 1x: 5,264 cycles +thashx2 avg. 3.01 us (0.00 sec); median 5,355 cycles, 1x: 5,355 cycles +Generating keypair.. avg. 634463.20 us (0.63 sec); median 1,135,767,495 cycles, 1x: 1,135,767,495 cycles + - WOTS pk gen 2x.. avg. 2492.17 us (0.00 sec); median 4,431,748 cycles, 256x: 1,134,527,488 cycles +Signing.. avg. 5694003.52 us (5.69 sec); median 10,198,052,651 cycles, 1x: 10,198,052,651 cycles + - FORS signing.. avg. 1254903.38 us (1.25 sec); median 2,247,650,812 cycles, 1x: 2,247,650,812 cycles + - WOTS pk gen x2.. avg. 2499.31 us (0.00 sec); median 4,432,300 cycles, 1792x: 7,942,681,600 cycles +Verifying.. avg. 
5337.31 us (0.01 sec); median 9,515,314 cycles, 1x: 9,515,314 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-256f-robust_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-256f-robust_C new file mode 100644 index 0000000..5fb3b7d --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-256f-robust_C @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16 +Running 10 iterations. +thash avg. 4.62 us (0.00 sec); median 8,238 cycles, 1x: 8,238 cycles +f1600x2 avg. 2.27 us (0.00 sec); median 4,037 cycles, 1x: 4,037 cycles +thashx2 avg. 4.62 us (0.00 sec); median 8,228 cycles, 1x: 8,228 cycles +Generating keypair.. avg. 38957.46 us (0.04 sec); median 69,680,393 cycles, 1x: 69,680,393 cycles + - WOTS pk gen 2x.. avg. 4881.87 us (0.00 sec); median 8,702,024 cycles, 8x: 69,616,192 cycles +Signing.. avg. 765104.41 us (0.77 sec); median 1,370,043,557 cycles, 1x: 1,370,043,557 cycles + - FORS signing.. avg. 103556.47 us (0.10 sec); median 185,427,742 cycles, 1x: 185,427,742 cycles + - WOTS pk gen x2.. avg. 4881.17 us (0.00 sec); median 8,701,430 cycles, 136x: 1,183,394,480 cycles +Verifying.. avg. 23639.05 us (0.02 sec); median 42,259,846 cycles, 1x: 42,259,846 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-256f-robust_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-256f-robust_COTHANV8 new file mode 100644 index 0000000..2949d4c --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-256f-robust_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16 +Running 10 iterations. +thash avg. 5.99 us (0.00 sec); median 10,701 cycles, 1x: 10,701 cycles +f1600x2 avg. 
2.94 us (0.00 sec); median 5,226 cycles, 1x: 5,226 cycles +thashx2 avg. 5.99 us (0.00 sec); median 10,690 cycles, 1x: 10,690 cycles +Generating keypair.. avg. 50548.00 us (0.05 sec); median 90,504,499 cycles, 1x: 90,504,499 cycles + - WOTS pk gen 2x.. avg. 6331.48 us (0.01 sec); median 11,304,928 cycles, 8x: 90,439,424 cycles +Signing.. avg. 993380.47 us (0.99 sec); median 1,779,387,742 cycles, 1x: 1,779,387,742 cycles + - FORS signing.. avg. 134450.17 us (0.13 sec); median 240,813,645 cycles, 1x: 240,813,645 cycles + - WOTS pk gen x2.. avg. 6332.51 us (0.01 sec); median 11,304,670 cycles, 136x: 1,537,435,120 cycles +Verifying.. avg. 29889.22 us (0.03 sec); median 53,460,936 cycles, 1x: 53,460,936 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-256f-simple_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-256f-simple_C new file mode 100644 index 0000000..70c01e8 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-256f-simple_C @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16 +Running 10 iterations. +thash avg. 2.31 us (0.00 sec); median 4,109 cycles, 1x: 4,109 cycles +f1600x2 avg. 2.27 us (0.00 sec); median 4,035 cycles, 1x: 4,035 cycles +thashx2 avg. 2.30 us (0.00 sec); median 4,100 cycles, 1x: 4,100 cycles +Generating keypair.. avg. 20038.55 us (0.02 sec); median 35,808,797 cycles, 1x: 35,808,797 cycles + - WOTS pk gen 2x.. avg. 2515.25 us (0.00 sec); median 4,473,268 cycles, 8x: 35,786,144 cycles +Signing.. avg. 401969.57 us (0.40 sec); median 719,776,475 cycles, 1x: 719,776,475 cycles + - FORS signing.. avg. 62016.12 us (0.06 sec); median 111,035,600 cycles, 1x: 111,035,600 cycles + - WOTS pk gen x2.. avg. 2515.05 us (0.00 sec); median 4,475,240 cycles, 136x: 608,632,640 cycles +Verifying.. avg. 
12049.50 us (0.01 sec); median 21,529,740 cycles, 1x: 21,529,740 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-256f-simple_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-256f-simple_COTHANV8 new file mode 100644 index 0000000..2dbf8a8 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-256f-simple_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16 +Running 10 iterations. +thash avg. 3.02 us (0.00 sec); median 5,376 cycles, 1x: 5,376 cycles +f1600x2 avg. 2.94 us (0.00 sec); median 5,225 cycles, 1x: 5,225 cycles +thashx2 avg. 3.02 us (0.00 sec); median 5,366 cycles, 1x: 5,366 cycles +Generating keypair.. avg. 26186.90 us (0.03 sec); median 46,836,734 cycles, 1x: 46,836,734 cycles + - WOTS pk gen 2x.. avg. 3283.44 us (0.00 sec); median 5,852,401 cycles, 8x: 46,819,208 cycles +Signing.. avg. 525604.73 us (0.53 sec); median 941,333,658 cycles, 1x: 941,333,658 cycles + - FORS signing.. avg. 81041.81 us (0.08 sec); median 145,130,839 cycles, 1x: 145,130,839 cycles + - WOTS pk gen x2.. avg. 3284.34 us (0.00 sec); median 5,850,078 cycles, 136x: 795,610,608 cycles +Verifying.. avg. 15641.11 us (0.02 sec); median 27,951,607 cycles, 1x: 27,951,607 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-256s-robust_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-256s-robust_C new file mode 100644 index 0000000..a0efea4 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-256s-robust_C @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16 +Running 10 iterations. +thash avg. 4.62 us (0.00 sec); median 8,236 cycles, 1x: 8,236 cycles +f1600x2 avg. 
2.27 us (0.00 sec); median 4,035 cycles, 1x: 4,035 cycles +thashx2 avg. 4.62 us (0.00 sec); median 8,226 cycles, 1x: 8,226 cycles +Generating keypair.. avg. 622704.09 us (0.62 sec); median 1,115,134,449 cycles, 1x: 1,115,134,449 cycles + - WOTS pk gen 2x.. avg. 4884.10 us (0.00 sec); median 8,702,284 cycles, 128x: 1,113,892,352 cycles +Signing.. avg. 7061804.28 us (7.06 sec); median 12,647,523,175 cycles, 1x: 12,647,523,175 cycles + - FORS signing.. avg. 2080503.72 us (2.08 sec); median 3,726,478,234 cycles, 1x: 3,726,478,234 cycles + - WOTS pk gen x2.. avg. 4882.34 us (0.00 sec); median 8,702,593 cycles, 1024x: 8,911,455,232 cycles +Verifying.. avg. 12110.84 us (0.01 sec); median 21,631,583 cycles, 1x: 21,631,583 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-256s-robust_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-256s-robust_COTHANV8 new file mode 100644 index 0000000..df7f6f9 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-256s-robust_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16 +Running 10 iterations. +thash avg. 5.99 us (0.00 sec); median 10,704 cycles, 1x: 10,704 cycles +f1600x2 avg. 2.93 us (0.00 sec); median 5,226 cycles, 1x: 5,226 cycles +thashx2 avg. 5.99 us (0.00 sec); median 10,693 cycles, 1x: 10,693 cycles +Generating keypair.. avg. 808351.69 us (0.81 sec); median 1,447,762,784 cycles, 1x: 1,447,762,784 cycles + - WOTS pk gen 2x.. avg. 6334.84 us (0.01 sec); median 11,302,309 cycles, 128x: 1,446,695,552 cycles +Signing.. avg. 9169802.05 us (9.17 sec); median 16,422,259,910 cycles, 1x: 16,422,259,910 cycles + - FORS signing.. avg. 2702173.35 us (2.70 sec); median 4,840,032,990 cycles, 1x: 4,840,032,990 cycles + - WOTS pk gen x2.. avg. 
6332.54 us (0.01 sec); median 11,302,138 cycles, 1024x: 11,573,389,312 cycles +Verifying.. avg. 15892.65 us (0.02 sec); median 28,392,037 cycles, 1x: 28,392,037 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-256s-simple_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-256s-simple_C new file mode 100644 index 0000000..884327b --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-256s-simple_C @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16 +Running 10 iterations. +thash avg. 2.31 us (0.00 sec); median 4,106 cycles, 1x: 4,106 cycles +f1600x2 avg. 2.27 us (0.00 sec); median 4,035 cycles, 1x: 4,035 cycles +thashx2 avg. 2.30 us (0.00 sec); median 4,096 cycles, 1x: 4,096 cycles +Generating keypair.. avg. 320360.34 us (0.32 sec); median 573,213,968 cycles, 1x: 573,213,968 cycles + - WOTS pk gen 2x.. avg. 2517.64 us (0.00 sec); median 4,476,488 cycles, 128x: 572,990,464 cycles +Signing.. avg. 3805604.60 us (3.81 sec); median 6,816,439,485 cycles, 1x: 6,816,439,485 cycles + - FORS signing.. avg. 1245561.37 us (1.25 sec); median 2,230,770,027 cycles, 1x: 2,230,770,027 cycles + - WOTS pk gen x2.. avg. 2521.25 us (0.00 sec); median 4,478,539 cycles, 1024x: 4,586,023,936 cycles +Verifying.. avg. 
6108.95 us (0.01 sec); median 10,884,435 cycles, 1x: 10,884,435 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-256s-simple_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-256s-simple_COTHANV8 new file mode 100644 index 0000000..ac5bfa9 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A55/sphincs-shake-256s-simple_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16 +Running 10 iterations. +thash avg. 3.02 us (0.00 sec); median 5,375 cycles, 1x: 5,375 cycles +f1600x2 avg. 2.94 us (0.00 sec); median 5,227 cycles, 1x: 5,227 cycles +thashx2 avg. 3.01 us (0.00 sec); median 5,366 cycles, 1x: 5,366 cycles +Generating keypair.. avg. 418277.52 us (0.42 sec); median 749,037,170 cycles, 1x: 749,037,170 cycles + - WOTS pk gen 2x.. avg. 3286.54 us (0.00 sec); median 5,849,209 cycles, 128x: 748,698,752 cycles +Signing.. avg. 4973583.14 us (4.97 sec); median 8,907,516,912 cycles, 1x: 8,907,516,912 cycles + - FORS signing.. avg. 1627195.07 us (1.63 sec); median 2,915,023,243 cycles, 1x: 2,915,023,243 cycles + - WOTS pk gen x2.. avg. 3289.80 us (0.00 sec); median 5,849,373 cycles, 1024x: 5,989,757,952 cycles +Verifying.. avg. 7568.91 us (0.01 sec); median 13,513,452 cycles, 1x: 13,513,452 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128f-robust_BAS b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128f-robust_BAS new file mode 100644 index 0000000..c3836d0 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128f-robust_BAS @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16 +Running 10 iterations. +thash avg. 
1.68 us (0.00 sec); median 3,178 cycles, 1x: 3,178 cycles +f1600x2 avg. 0.82 us (0.00 sec); median 1,549 cycles, 1x: 1,549 cycles +thashx2 avg. 1.67 us (0.00 sec); median 3,174 cycles, 1x: 3,174 cycles +Generating keypair.. avg. 3656.10 us (0.00 sec); median 6,980,377 cycles, 1x: 6,980,377 cycles + - WOTS pk gen 2x.. avg. 915.68 us (0.00 sec); median 1,741,601 cycles, 4x: 6,966,404 cycles +Signing.. avg. 84690.34 us (0.08 sec); median 162,089,547 cycles, 1x: 162,089,547 cycles + - FORS signing.. avg. 4449.58 us (0.00 sec); median 8,502,653 cycles, 1x: 8,502,653 cycles + - WOTS pk gen x2.. avg. 916.25 us (0.00 sec); median 1,742,499 cycles, 88x: 153,339,912 cycles +Verifying.. avg. 5929.28 us (0.01 sec); median 11,337,784 cycles, 1x: 11,337,784 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128f-robust_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128f-robust_C new file mode 100644 index 0000000..97abc44 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128f-robust_C @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16 +Running 10 iterations. +thash avg. 1.80 us (0.00 sec); median 3,441 cycles, 1x: 3,441 cycles +f1600x2 avg. 0.89 us (0.00 sec); median 1,684 cycles, 1x: 1,684 cycles +thashx2 avg. 1.80 us (0.00 sec); median 3,436 cycles, 1x: 3,436 cycles +Generating keypair.. avg. 3964.83 us (0.00 sec); median 7,570,502 cycles, 1x: 7,570,502 cycles + - WOTS pk gen 2x.. avg. 992.71 us (0.00 sec); median 1,888,898 cycles, 4x: 7,555,592 cycles +Signing.. avg. 91828.88 us (0.09 sec); median 175,705,544 cycles, 1x: 175,705,544 cycles + - FORS signing.. avg. 4813.36 us (0.00 sec); median 9,184,871 cycles, 1x: 9,184,871 cycles + - WOTS pk gen x2.. avg. 993.71 us (0.00 sec); median 1,888,725 cycles, 88x: 166,207,800 cycles +Verifying.. avg. 
6170.11 us (0.01 sec); median 11,795,667 cycles, 1x: 11,795,667 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128f-robust_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128f-robust_COTHANV8 new file mode 100644 index 0000000..f317b8c --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128f-robust_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16 +Running 10 iterations. +thash avg. 2.56 us (0.00 sec); median 4,886 cycles, 1x: 4,886 cycles +f1600x2 avg. 1.25 us (0.00 sec); median 2,390 cycles, 1x: 2,390 cycles +thashx2 avg. 2.56 us (0.00 sec); median 4,884 cycles, 1x: 4,884 cycles +Generating keypair.. avg. 5567.20 us (0.01 sec); median 10,641,228 cycles, 1x: 10,641,228 cycles + - WOTS pk gen 2x.. avg. 1391.78 us (0.00 sec); median 2,655,039 cycles, 4x: 10,620,156 cycles +Signing.. avg. 129069.29 us (0.13 sec); median 247,081,815 cycles, 1x: 247,081,815 cycles + - FORS signing.. avg. 6775.02 us (0.01 sec); median 12,952,491 cycles, 1x: 12,952,491 cycles + - WOTS pk gen x2.. avg. 1398.41 us (0.00 sec); median 2,655,763 cycles, 88x: 233,707,144 cycles +Verifying.. avg. 8997.87 us (0.01 sec); median 17,209,781 cycles, 1x: 17,209,781 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128f-simple_BAS b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128f-simple_BAS new file mode 100644 index 0000000..7bc035d --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128f-simple_BAS @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16 +Running 10 iterations. +thash avg. 0.84 us (0.00 sec); median 1,588 cycles, 1x: 1,588 cycles +f1600x2 avg. 
0.81 us (0.00 sec); median 1,550 cycles, 1x: 1,550 cycles +thashx2 avg. 0.83 us (0.00 sec); median 1,584 cycles, 1x: 1,584 cycles +Generating keypair.. avg. 1884.06 us (0.00 sec); median 3,591,774 cycles, 1x: 3,591,774 cycles + - WOTS pk gen 2x.. avg. 472.65 us (0.00 sec); median 895,697 cycles, 4x: 3,582,788 cycles +Signing.. avg. 43969.89 us (0.04 sec); median 84,142,589 cycles, 1x: 84,142,589 cycles + - FORS signing.. avg. 2676.63 us (0.00 sec); median 5,111,249 cycles, 1x: 5,111,249 cycles + - WOTS pk gen x2.. avg. 473.11 us (0.00 sec); median 895,634 cycles, 88x: 78,815,792 cycles +Verifying.. avg. 2883.50 us (0.00 sec); median 5,507,962 cycles, 1x: 5,507,962 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128f-simple_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128f-simple_C new file mode 100644 index 0000000..e903dd2 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128f-simple_C @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16 +Running 10 iterations. +thash avg. 0.90 us (0.00 sec); median 1,719 cycles, 1x: 1,719 cycles +f1600x2 avg. 0.89 us (0.00 sec); median 1,685 cycles, 1x: 1,685 cycles +thashx2 avg. 0.90 us (0.00 sec); median 1,717 cycles, 1x: 1,717 cycles +Generating keypair.. avg. 2051.07 us (0.00 sec); median 3,907,584 cycles, 1x: 3,907,584 cycles + - WOTS pk gen 2x.. avg. 513.27 us (0.00 sec); median 974,712 cycles, 4x: 3,898,848 cycles +Signing.. avg. 47838.91 us (0.05 sec); median 91,505,188 cycles, 1x: 91,505,188 cycles + - FORS signing.. avg. 2896.75 us (0.00 sec); median 5,532,793 cycles, 1x: 5,532,793 cycles + - WOTS pk gen x2.. avg. 513.61 us (0.00 sec); median 974,953 cycles, 88x: 85,795,864 cycles +Verifying.. avg. 
3144.76 us (0.00 sec); median 6,004,346 cycles, 1x: 6,004,346 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128f-simple_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128f-simple_COTHANV8 new file mode 100644 index 0000000..c814d65 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128f-simple_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16 +Running 10 iterations. +thash avg. 1.28 us (0.00 sec); median 2,433 cycles, 1x: 2,433 cycles +f1600x2 avg. 1.25 us (0.00 sec); median 2,388 cycles, 1x: 2,388 cycles +thashx2 avg. 1.28 us (0.00 sec); median 2,429 cycles, 1x: 2,429 cycles +Generating keypair.. avg. 2892.31 us (0.00 sec); median 5,521,938 cycles, 1x: 5,521,938 cycles + - WOTS pk gen 2x.. avg. 725.86 us (0.00 sec); median 1,378,044 cycles, 4x: 5,512,176 cycles +Signing.. avg. 67529.29 us (0.07 sec); median 129,266,161 cycles, 1x: 129,266,161 cycles + - FORS signing.. avg. 4075.87 us (0.00 sec); median 7,788,116 cycles, 1x: 7,788,116 cycles + - WOTS pk gen x2.. avg. 725.13 us (0.00 sec); median 1,378,806 cycles, 88x: 121,334,928 cycles +Verifying.. avg. 4517.89 us (0.00 sec); median 8,633,662 cycles, 1x: 8,633,662 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128s-robust_BAS b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128s-robust_BAS new file mode 100644 index 0000000..a3f0c57 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128s-robust_BAS @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16 +Running 10 iterations. +thash avg. 1.67 us (0.00 sec); median 3,180 cycles, 1x: 3,180 cycles +f1600x2 avg. 
0.81 us (0.00 sec); median 1,550 cycles, 1x: 1,550 cycles +thashx2 avg. 1.66 us (0.00 sec); median 3,175 cycles, 1x: 3,175 cycles +Generating keypair.. avg. 233390.15 us (0.23 sec); median 446,643,525 cycles, 1x: 446,643,525 cycles + - WOTS pk gen 2x.. avg. 914.48 us (0.00 sec); median 1,741,102 cycles, 256x: 445,722,112 cycles +Signing.. avg. 1624245.54 us (1.62 sec); median 3,356,043,555 cycles, 1x: 3,356,043,555 cycles + - FORS signing.. avg. 95728.07 us (0.10 sec); median 229,400,194 cycles, 1x: 229,400,194 cycles + - WOTS pk gen x2.. avg. 731.80 us (0.00 sec); median 1,741,422 cycles, 1792x: 3,120,628,224 cycles +Verifying.. avg. 1613.18 us (0.00 sec); median 3,850,357 cycles, 1x: 3,850,357 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128s-robust_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128s-robust_C new file mode 100644 index 0000000..1039dd8 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128s-robust_C @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16 +Running 10 iterations. +thash avg. 1.81 us (0.00 sec); median 3,448 cycles, 1x: 3,448 cycles +f1600x2 avg. 0.89 us (0.00 sec); median 1,685 cycles, 1x: 1,685 cycles +thashx2 avg. 1.80 us (0.00 sec); median 3,442 cycles, 1x: 3,442 cycles +Generating keypair.. avg. 252709.10 us (0.25 sec); median 483,663,631 cycles, 1x: 483,663,631 cycles + - WOTS pk gen 2x.. avg. 991.93 us (0.00 sec); median 1,885,462 cycles, 256x: 482,678,272 cycles +Signing.. avg. 1898216.66 us (1.90 sec); median 3,633,790,296 cycles, 1x: 3,633,790,296 cycles + - FORS signing.. avg. 129655.83 us (0.13 sec); median 248,176,718 cycles, 1x: 248,176,718 cycles + - WOTS pk gen x2.. avg. 990.59 us (0.00 sec); median 1,885,572 cycles, 1792x: 3,378,945,024 cycles +Verifying.. avg. 
2203.40 us (0.00 sec); median 4,194,130 cycles, 1x: 4,194,130 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128s-robust_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128s-robust_COTHANV8 new file mode 100644 index 0000000..3aa0442 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128s-robust_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16 +Running 10 iterations. +thash avg. 2.56 us (0.00 sec); median 4,880 cycles, 1x: 4,880 cycles +f1600x2 avg. 1.25 us (0.00 sec); median 2,389 cycles, 1x: 2,389 cycles +thashx2 avg. 2.55 us (0.00 sec); median 4,874 cycles, 1x: 4,874 cycles +Generating keypair.. avg. 355837.58 us (0.36 sec); median 681,005,510 cycles, 1x: 681,005,510 cycles + - WOTS pk gen 2x.. avg. 1395.99 us (0.00 sec); median 2,656,439 cycles, 256x: 680,048,384 cycles +Signing.. avg. 2674010.03 us (2.67 sec); median 5,118,301,851 cycles, 1x: 5,118,301,851 cycles + - FORS signing.. avg. 182887.72 us (0.18 sec); median 349,998,523 cycles, 1x: 349,998,523 cycles + - WOTS pk gen x2.. avg. 1395.31 us (0.00 sec); median 2,655,775 cycles, 1792x: 4,759,148,800 cycles +Verifying.. avg. 3244.49 us (0.00 sec); median 6,188,066 cycles, 1x: 6,188,066 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128s-simple_BAS b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128s-simple_BAS new file mode 100644 index 0000000..3225e3c --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128s-simple_BAS @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16 +Running 10 iterations. +thash avg. 0.83 us (0.00 sec); median 1,587 cycles, 1x: 1,587 cycles +f1600x2 avg. 
0.82 us (0.00 sec); median 1,550 cycles, 1x: 1,550 cycles +thashx2 avg. 0.83 us (0.00 sec); median 1,582 cycles, 1x: 1,582 cycles +Generating keypair.. avg. 120320.54 us (0.12 sec); median 230,305,669 cycles, 1x: 230,305,669 cycles + - WOTS pk gen 2x.. avg. 473.54 us (0.00 sec); median 898,592 cycles, 256x: 230,039,552 cycles +Signing.. avg. 914268.38 us (0.91 sec); median 1,749,875,996 cycles, 1x: 1,749,875,996 cycles + - FORS signing.. avg. 71864.64 us (0.07 sec); median 137,569,707 cycles, 1x: 137,569,707 cycles + - WOTS pk gen x2.. avg. 473.54 us (0.00 sec); median 897,623 cycles, 1792x: 1,608,540,416 cycles +Verifying.. avg. 985.61 us (0.00 sec); median 1,874,134 cycles, 1x: 1,874,134 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128s-simple_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128s-simple_C new file mode 100644 index 0000000..d27e519 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128s-simple_C @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16 +Running 10 iterations. +thash avg. 0.90 us (0.00 sec); median 1,720 cycles, 1x: 1,720 cycles +f1600x2 avg. 0.89 us (0.00 sec); median 1,684 cycles, 1x: 1,684 cycles +thashx2 avg. 0.90 us (0.00 sec); median 1,717 cycles, 1x: 1,717 cycles +Generating keypair.. avg. 130626.70 us (0.13 sec); median 250,024,375 cycles, 1x: 250,024,375 cycles + - WOTS pk gen 2x.. avg. 513.31 us (0.00 sec); median 974,920 cycles, 256x: 249,579,520 cycles +Signing.. avg. 998734.82 us (1.00 sec); median 1,911,512,050 cycles, 1x: 1,911,512,050 cycles + - FORS signing.. avg. 84236.06 us (0.08 sec); median 161,200,835 cycles, 1x: 161,200,835 cycles + - WOTS pk gen x2.. avg. 514.82 us (0.00 sec); median 975,020 cycles, 1792x: 1,747,235,840 cycles +Verifying.. avg. 
1159.84 us (0.00 sec); median 2,207,071 cycles, 1x: 2,207,071 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128s-simple_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128s-simple_COTHANV8 new file mode 100644 index 0000000..bae111c --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-128s-simple_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16 +Running 10 iterations. +thash avg. 1.27 us (0.00 sec); median 2,430 cycles, 1x: 2,430 cycles +f1600x2 avg. 1.25 us (0.00 sec); median 2,390 cycles, 1x: 2,390 cycles +thashx2 avg. 1.27 us (0.00 sec); median 2,421 cycles, 1x: 2,421 cycles +Generating keypair.. avg. 183809.10 us (0.18 sec); median 351,883,525 cycles, 1x: 351,883,525 cycles + - WOTS pk gen 2x.. avg. 722.70 us (0.00 sec); median 1,371,474 cycles, 256x: 351,097,344 cycles +Signing.. avg. 1396793.21 us (1.40 sec); median 2,673,336,821 cycles, 1x: 2,673,336,821 cycles + - FORS signing.. avg. 109664.05 us (0.11 sec); median 209,913,659 cycles, 1x: 209,913,659 cycles + - WOTS pk gen x2.. avg. 720.63 us (0.00 sec); median 1,371,799 cycles, 1792x: 2,458,263,808 cycles +Verifying.. avg. 1647.38 us (0.00 sec); median 3,137,099 cycles, 1x: 3,137,099 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192f-robust_BAS b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192f-robust_BAS new file mode 100644 index 0000000..944aafb --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192f-robust_BAS @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16 +Running 10 iterations. +thash avg. 1.59 us (0.00 sec); median 3,184 cycles, 1x: 3,184 cycles +f1600x2 avg. 
0.78 us (0.00 sec); median 1,551 cycles, 1x: 1,551 cycles +thashx2 avg. 1.59 us (0.00 sec); median 3,178 cycles, 1x: 3,178 cycles +Generating keypair.. avg. 5081.61 us (0.01 sec); median 10,203,467 cycles, 1x: 10,203,467 cycles + - WOTS pk gen 2x.. avg. 1273.45 us (0.00 sec); median 2,547,714 cycles, 4x: 10,190,856 cycles +Signing.. avg. 130125.71 us (0.13 sec); median 258,383,158 cycles, 1x: 258,383,158 cycles + - FORS signing.. avg. 17718.58 us (0.02 sec); median 33,886,760 cycles, 1x: 33,886,760 cycles + - WOTS pk gen x2.. avg. 1337.03 us (0.00 sec); median 2,549,270 cycles, 88x: 224,335,760 cycles +Verifying.. avg. 8263.89 us (0.01 sec); median 15,786,913 cycles, 1x: 15,786,913 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192f-robust_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192f-robust_C new file mode 100644 index 0000000..7286077 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192f-robust_C @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16 +Running 10 iterations. +thash avg. 1.72 us (0.00 sec); median 3,445 cycles, 1x: 3,445 cycles +f1600x2 avg. 0.84 us (0.00 sec); median 1,685 cycles, 1x: 1,685 cycles +thashx2 avg. 1.72 us (0.00 sec); median 3,438 cycles, 1x: 3,438 cycles +Generating keypair.. avg. 5499.59 us (0.01 sec); median 11,041,578 cycles, 1x: 11,041,578 cycles + - WOTS pk gen 2x.. avg. 1377.89 us (0.00 sec); median 2,756,941 cycles, 4x: 11,027,764 cycles +Signing.. avg. 139058.85 us (0.14 sec); median 279,544,603 cycles, 1x: 279,544,603 cycles + - FORS signing.. avg. 18232.49 us (0.02 sec); median 36,641,941 cycles, 1x: 36,641,941 cycles + - WOTS pk gen x2.. avg. 1376.52 us (0.00 sec); median 2,757,073 cycles, 88x: 242,622,424 cycles +Verifying.. avg. 
8469.52 us (0.01 sec); median 17,011,498 cycles, 1x: 17,011,498 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192f-robust_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192f-robust_COTHANV8 new file mode 100644 index 0000000..ed43dd9 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192f-robust_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16 +Running 10 iterations. +thash avg. 2.55 us (0.00 sec); median 4,873 cycles, 1x: 4,873 cycles +f1600x2 avg. 1.26 us (0.00 sec); median 2,399 cycles, 1x: 2,399 cycles +thashx2 avg. 2.55 us (0.00 sec); median 4,864 cycles, 1x: 4,864 cycles +Generating keypair.. avg. 8165.38 us (0.01 sec); median 15,615,501 cycles, 1x: 15,615,501 cycles + - WOTS pk gen 2x.. avg. 2043.20 us (0.00 sec); median 3,897,691 cycles, 4x: 15,590,764 cycles +Signing.. avg. 206595.09 us (0.21 sec); median 395,348,162 cycles, 1x: 395,348,162 cycles + - FORS signing.. avg. 27067.56 us (0.03 sec); median 51,766,075 cycles, 1x: 51,766,075 cycles + - WOTS pk gen x2.. avg. 2042.69 us (0.00 sec); median 3,896,985 cycles, 88x: 342,934,680 cycles +Verifying.. avg. 12446.35 us (0.01 sec); median 23,797,197 cycles, 1x: 23,797,197 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192f-simple_BAS b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192f-simple_BAS new file mode 100644 index 0000000..d09a050 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192f-simple_BAS @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16 +Running 10 iterations. +thash avg. 0.80 us (0.00 sec); median 1,590 cycles, 1x: 1,590 cycles +f1600x2 avg. 
0.78 us (0.00 sec); median 1,551 cycles, 1x: 1,551 cycles +thashx2 avg. 0.79 us (0.00 sec); median 1,587 cycles, 1x: 1,587 cycles +Generating keypair.. avg. 2627.91 us (0.00 sec); median 5,268,394 cycles, 1x: 5,268,394 cycles + - WOTS pk gen 2x.. avg. 659.30 us (0.00 sec); median 1,314,918 cycles, 4x: 5,259,672 cycles +Signing.. avg. 67809.67 us (0.07 sec); median 136,307,446 cycles, 1x: 136,307,446 cycles + - FORS signing.. avg. 10147.81 us (0.01 sec); median 20,390,453 cycles, 1x: 20,390,453 cycles + - WOTS pk gen x2.. avg. 660.45 us (0.00 sec); median 1,315,448 cycles, 88x: 115,759,424 cycles +Verifying.. avg. 4013.02 us (0.00 sec); median 8,053,271 cycles, 1x: 8,053,271 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192f-simple_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192f-simple_C new file mode 100644 index 0000000..4f2a067 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192f-simple_C @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16 +Running 10 iterations. +thash avg. 0.86 us (0.00 sec); median 1,723 cycles, 1x: 1,723 cycles +f1600x2 avg. 0.84 us (0.00 sec); median 1,686 cycles, 1x: 1,686 cycles +thashx2 avg. 0.86 us (0.00 sec); median 1,718 cycles, 1x: 1,718 cycles +Generating keypair.. avg. 2845.23 us (0.00 sec); median 5,701,956 cycles, 1x: 5,701,956 cycles + - WOTS pk gen 2x.. avg. 713.41 us (0.00 sec); median 1,423,686 cycles, 4x: 5,694,744 cycles +Signing.. avg. 73422.37 us (0.07 sec); median 147,615,271 cycles, 1x: 147,615,271 cycles + - FORS signing.. avg. 11006.01 us (0.01 sec); median 22,116,069 cycles, 1x: 22,116,069 cycles + - WOTS pk gen x2.. avg. 713.71 us (0.00 sec); median 1,424,674 cycles, 88x: 125,371,312 cycles +Verifying.. avg. 
4358.49 us (0.00 sec); median 8,748,819 cycles, 1x: 8,748,819 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192f-simple_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192f-simple_COTHANV8 new file mode 100644 index 0000000..f975e59 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192f-simple_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16 +Running 10 iterations. +thash avg. 1.28 us (0.00 sec); median 2,436 cycles, 1x: 2,436 cycles +f1600x2 avg. 1.26 us (0.00 sec); median 2,398 cycles, 1x: 2,398 cycles +thashx2 avg. 1.28 us (0.00 sec); median 2,433 cycles, 1x: 2,433 cycles +Generating keypair.. avg. 4226.34 us (0.00 sec); median 8,071,372 cycles, 1x: 8,071,372 cycles + - WOTS pk gen 2x.. avg. 1060.02 us (0.00 sec); median 2,015,414 cycles, 4x: 8,061,656 cycles +Signing.. avg. 109011.49 us (0.11 sec); median 208,655,936 cycles, 1x: 208,655,936 cycles + - FORS signing.. avg. 16250.50 us (0.02 sec); median 31,072,994 cycles, 1x: 31,072,994 cycles + - WOTS pk gen x2.. avg. 1059.89 us (0.00 sec); median 2,016,262 cycles, 88x: 177,431,056 cycles +Verifying.. avg. 6457.49 us (0.01 sec); median 12,344,834 cycles, 1x: 12,344,834 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192s-robust_BAS b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192s-robust_BAS new file mode 100644 index 0000000..5c57fce --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192s-robust_BAS @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16 +Running 10 iterations. +thash avg. 1.67 us (0.00 sec); median 3,185 cycles, 1x: 3,185 cycles +f1600x2 avg. 
0.82 us (0.00 sec); median 1,552 cycles, 1x: 1,552 cycles +thashx2 avg. 1.67 us (0.00 sec); median 3,178 cycles, 1x: 3,178 cycles +Generating keypair.. avg. 340902.87 us (0.34 sec); median 652,603,960 cycles, 1x: 652,603,960 cycles + - WOTS pk gen 2x.. avg. 1341.27 us (0.00 sec); median 2,550,761 cycles, 256x: 652,994,816 cycles +Signing.. avg. 2969380.41 us (2.97 sec); median 5,684,106,313 cycles, 1x: 5,684,106,313 cycles + - FORS signing.. avg. 582860.30 us (0.58 sec); median 1,115,709,750 cycles, 1x: 1,115,709,750 cycles + - WOTS pk gen x2.. avg. 1337.33 us (0.00 sec); median 2,550,124 cycles, 1792x: 4,569,822,208 cycles +Verifying.. avg. 3027.41 us (0.00 sec); median 5,781,148 cycles, 1x: 5,781,148 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192s-robust_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192s-robust_C new file mode 100644 index 0000000..998668e --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192s-robust_C @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16 +Running 10 iterations. +thash avg. 1.81 us (0.00 sec); median 3,448 cycles, 1x: 3,448 cycles +f1600x2 avg. 0.89 us (0.00 sec); median 1,687 cycles, 1x: 1,687 cycles +thashx2 avg. 1.80 us (0.00 sec); median 3,440 cycles, 1x: 3,440 cycles +Generating keypair.. avg. 363139.48 us (0.36 sec); median 706,555,069 cycles, 1x: 706,555,069 cycles + - WOTS pk gen 2x.. avg. 1102.82 us (0.00 sec); median 2,757,870 cycles, 256x: 706,014,720 cycles +Signing.. avg. 2672224.61 us (2.67 sec); median 6,150,668,387 cycles, 1x: 6,150,668,387 cycles + - FORS signing.. avg. 614173.68 us (0.61 sec); median 1,205,210,706 cycles, 1x: 1,205,210,706 cycles + - WOTS pk gen x2.. avg. 1447.91 us (0.00 sec); median 2,757,207 cycles, 1792x: 4,940,914,944 cycles +Verifying.. avg. 
3188.18 us (0.00 sec); median 6,079,068 cycles, 1x: 6,079,068 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192s-robust_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192s-robust_COTHANV8 new file mode 100644 index 0000000..880a810 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192s-robust_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16 +Running 10 iterations. +thash avg. 2.21 us (0.00 sec); median 4,865 cycles, 1x: 4,865 cycles +f1600x2 avg. 1.10 us (0.00 sec); median 2,399 cycles, 1x: 2,399 cycles +thashx2 avg. 2.21 us (0.00 sec); median 4,862 cycles, 1x: 4,862 cycles +Generating keypair.. avg. 473503.83 us (0.47 sec); median 998,896,958 cycles, 1x: 998,896,958 cycles + - WOTS pk gen 2x.. avg. 1945.92 us (0.00 sec); median 3,898,635 cycles, 256x: 998,050,560 cycles +Signing.. avg. 4513185.48 us (4.51 sec); median 8,696,851,017 cycles, 1x: 8,696,851,017 cycles + - FORS signing.. avg. 890196.12 us (0.89 sec); median 1,703,983,209 cycles, 1x: 1,703,983,209 cycles + - WOTS pk gen x2.. avg. 2045.24 us (0.00 sec); median 3,899,264 cycles, 1792x: 6,987,481,088 cycles +Verifying.. avg. 4521.74 us (0.00 sec); median 8,628,872 cycles, 1x: 8,628,872 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192s-simple_BAS b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192s-simple_BAS new file mode 100644 index 0000000..c205369 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192s-simple_BAS @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16 +Running 10 iterations. +thash avg. 
0.84 us (0.00 sec); median 1,591 cycles, 1x: 1,591 cycles +f1600x2 avg. 0.82 us (0.00 sec); median 1,554 cycles, 1x: 1,554 cycles +thashx2 avg. 0.83 us (0.00 sec); median 1,588 cycles, 1x: 1,588 cycles +Generating keypair.. avg. 176145.51 us (0.18 sec); median 337,127,669 cycles, 1x: 337,127,669 cycles + - WOTS pk gen 2x.. avg. 691.69 us (0.00 sec); median 1,317,369 cycles, 256x: 337,246,464 cycles +Signing.. avg. 1582853.15 us (1.58 sec); median 3,030,067,374 cycles, 1x: 3,030,067,374 cycles + - FORS signing.. avg. 349967.68 us (0.35 sec); median 669,950,425 cycles, 1x: 669,950,425 cycles + - WOTS pk gen x2.. avg. 693.91 us (0.00 sec); median 1,317,191 cycles, 1792x: 2,360,406,272 cycles +Verifying.. avg. 1470.07 us (0.00 sec); median 2,805,039 cycles, 1x: 2,805,039 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192s-simple_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192s-simple_C new file mode 100644 index 0000000..21979b4 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192s-simple_C @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16 +Running 10 iterations. +thash avg. 0.90 us (0.00 sec); median 1,722 cycles, 1x: 1,722 cycles +f1600x2 avg. 0.89 us (0.00 sec); median 1,686 cycles, 1x: 1,686 cycles +thashx2 avg. 0.91 us (0.00 sec); median 1,719 cycles, 1x: 1,719 cycles +Generating keypair.. avg. 191434.02 us (0.19 sec); median 366,361,974 cycles, 1x: 366,361,974 cycles + - WOTS pk gen 2x.. avg. 747.72 us (0.00 sec); median 1,423,345 cycles, 256x: 364,376,320 cycles +Signing.. avg. 1717635.37 us (1.72 sec); median 3,288,067,296 cycles, 1x: 3,288,067,296 cycles + - FORS signing.. avg. 377943.75 us (0.38 sec); median 723,429,167 cycles, 1x: 723,429,167 cycles + - WOTS pk gen x2.. avg. 
750.67 us (0.00 sec); median 1,423,557 cycles, 1792x: 2,551,014,144 cycles +Verifying.. avg. 1596.96 us (0.00 sec); median 3,040,709 cycles, 1x: 3,040,709 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192s-simple_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192s-simple_COTHANV8 new file mode 100644 index 0000000..03dcb41 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-192s-simple_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16 +Running 10 iterations. +thash avg. 1.28 us (0.00 sec); median 2,437 cycles, 1x: 2,437 cycles +f1600x2 avg. 1.26 us (0.00 sec); median 2,399 cycles, 1x: 2,399 cycles +thashx2 avg. 1.27 us (0.00 sec); median 2,428 cycles, 1x: 2,428 cycles +Generating keypair.. avg. 269957.46 us (0.27 sec); median 516,614,433 cycles, 1x: 516,614,433 cycles + - WOTS pk gen 2x.. avg. 1061.62 us (0.00 sec); median 2,015,079 cycles, 256x: 515,860,224 cycles +Signing.. avg. 2006455.84 us (2.01 sec); median 4,638,596,694 cycles, 1x: 4,638,596,694 cycles + - FORS signing.. avg. 455186.74 us (0.46 sec); median 1,022,682,832 cycles, 1x: 1,022,682,832 cycles + - WOTS pk gen x2.. avg. 919.39 us (0.00 sec); median 2,016,187 cycles, 1792x: 3,613,007,104 cycles +Verifying.. avg. 
1984.71 us (0.00 sec); median 4,356,861 cycles, 1x: 4,356,861 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256f-robust_BAS b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256f-robust_BAS new file mode 100644 index 0000000..8aba77c --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256f-robust_BAS @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16 +Running 10 iterations. +thash avg. 1.66 us (0.00 sec); median 3,181 cycles, 1x: 3,181 cycles +f1600x2 avg. 0.82 us (0.00 sec); median 1,550 cycles, 1x: 1,550 cycles +thashx2 avg. 1.66 us (0.00 sec); median 3,174 cycles, 1x: 3,174 cycles +Generating keypair.. avg. 14071.28 us (0.01 sec); median 26,906,944 cycles, 1x: 26,906,944 cycles + - WOTS pk gen 2x.. avg. 1761.16 us (0.00 sec); median 3,362,751 cycles, 8x: 26,902,008 cycles +Signing.. avg. 276603.02 us (0.28 sec); median 529,376,594 cycles, 1x: 529,376,594 cycles + - FORS signing.. avg. 37521.57 us (0.04 sec); median 71,781,334 cycles, 1x: 71,781,334 cycles + - WOTS pk gen x2.. avg. 1763.97 us (0.00 sec); median 3,362,132 cycles, 136x: 457,249,952 cycles +Verifying.. avg. 8111.26 us (0.01 sec); median 15,485,421 cycles, 1x: 15,485,421 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256f-robust_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256f-robust_C new file mode 100644 index 0000000..fcd5800 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256f-robust_C @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16 +Running 10 iterations. +thash avg. 1.81 us (0.00 sec); median 3,449 cycles, 1x: 3,449 cycles +f1600x2 avg. 
0.89 us (0.00 sec); median 1,685 cycles, 1x: 1,685 cycles +thashx2 avg. 1.80 us (0.00 sec); median 3,444 cycles, 1x: 3,444 cycles +Generating keypair.. avg. 15266.08 us (0.02 sec); median 29,198,266 cycles, 1x: 29,198,266 cycles + - WOTS pk gen 2x.. avg. 1912.13 us (0.00 sec); median 3,646,527 cycles, 8x: 29,172,216 cycles +Signing.. avg. 300221.75 us (0.30 sec); median 574,641,179 cycles, 1x: 574,641,179 cycles + - FORS signing.. avg. 40663.22 us (0.04 sec); median 77,774,066 cycles, 1x: 77,774,066 cycles + - WOTS pk gen x2.. avg. 1913.02 us (0.00 sec); median 3,647,130 cycles, 136x: 496,009,680 cycles +Verifying.. avg. 9158.10 us (0.01 sec); median 17,495,528 cycles, 1x: 17,495,528 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256f-robust_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256f-robust_COTHANV8 new file mode 100644 index 0000000..4b579cd --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256f-robust_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16 +Running 10 iterations. +thash avg. 2.55 us (0.00 sec); median 4,866 cycles, 1x: 4,866 cycles +f1600x2 avg. 1.26 us (0.00 sec); median 2,396 cycles, 1x: 2,396 cycles +thashx2 avg. 2.54 us (0.00 sec); median 4,861 cycles, 1x: 4,861 cycles +Generating keypair.. avg. 21501.48 us (0.02 sec); median 41,146,169 cycles, 1x: 41,146,169 cycles + - WOTS pk gen 2x.. avg. 2691.04 us (0.00 sec); median 5,138,062 cycles, 8x: 41,104,496 cycles +Signing.. avg. 422953.07 us (0.42 sec); median 809,401,323 cycles, 1x: 809,401,323 cycles + - FORS signing.. avg. 57358.25 us (0.06 sec); median 109,752,520 cycles, 1x: 109,752,520 cycles + - WOTS pk gen x2.. avg. 2694.22 us (0.00 sec); median 5,140,926 cycles, 136x: 699,165,936 cycles +Verifying.. avg. 
13227.81 us (0.01 sec); median 25,290,154 cycles, 1x: 25,290,154 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256f-simple_BAS b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256f-simple_BAS new file mode 100644 index 0000000..ee6cd28 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256f-simple_BAS @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16 +Running 10 iterations. +thash avg. 0.84 us (0.00 sec); median 1,594 cycles, 1x: 1,594 cycles +f1600x2 avg. 0.81 us (0.00 sec); median 1,550 cycles, 1x: 1,550 cycles +thashx2 avg. 0.83 us (0.00 sec); median 1,589 cycles, 1x: 1,589 cycles +Generating keypair.. avg. 7270.84 us (0.01 sec); median 13,900,552 cycles, 1x: 13,900,552 cycles + - WOTS pk gen 2x.. avg. 910.60 us (0.00 sec); median 1,735,724 cycles, 8x: 13,885,792 cycles +Signing.. avg. 146041.34 us (0.15 sec); median 279,563,772 cycles, 1x: 279,563,772 cycles + - FORS signing.. avg. 22605.37 us (0.02 sec); median 43,222,102 cycles, 1x: 43,222,102 cycles + - WOTS pk gen x2.. avg. 912.19 us (0.00 sec); median 1,736,202 cycles, 136x: 236,123,472 cycles +Verifying.. avg. 4267.84 us (0.00 sec); median 8,146,997 cycles, 1x: 8,146,997 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256f-simple_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256f-simple_C new file mode 100644 index 0000000..35cb134 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256f-simple_C @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16 +Running 10 iterations. +thash avg. 0.87 us (0.00 sec); median 1,736 cycles, 1x: 1,736 cycles +f1600x2 avg. 
0.84 us (0.00 sec); median 1,684 cycles, 1x: 1,684 cycles +thashx2 avg. 0.87 us (0.00 sec); median 1,730 cycles, 1x: 1,730 cycles +Generating keypair.. avg. 7508.09 us (0.01 sec); median 15,078,156 cycles, 1x: 15,078,156 cycles + - WOTS pk gen 2x.. avg. 942.65 us (0.00 sec); median 1,883,591 cycles, 8x: 15,068,728 cycles +Signing.. avg. 150784.33 us (0.15 sec); median 303,140,572 cycles, 1x: 303,140,572 cycles + - FORS signing.. avg. 23556.41 us (0.02 sec); median 46,806,549 cycles, 1x: 46,806,549 cycles + - WOTS pk gen x2.. avg. 989.40 us (0.00 sec); median 1,884,427 cycles, 136x: 256,282,072 cycles +Verifying.. avg. 4536.09 us (0.00 sec); median 8,663,119 cycles, 1x: 8,663,119 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256f-simple_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256f-simple_COTHANV8 new file mode 100644 index 0000000..2468c87 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256f-simple_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16 +Running 10 iterations. +thash avg. 1.29 us (0.00 sec); median 2,440 cycles, 1x: 2,440 cycles +f1600x2 avg. 1.26 us (0.00 sec); median 2,398 cycles, 1x: 2,398 cycles +thashx2 avg. 1.28 us (0.00 sec); median 2,434 cycles, 1x: 2,434 cycles +Generating keypair.. avg. 11119.05 us (0.01 sec); median 21,266,648 cycles, 1x: 21,266,648 cycles + - WOTS pk gen 2x.. avg. 1391.47 us (0.00 sec); median 2,656,272 cycles, 8x: 21,250,176 cycles +Signing.. avg. 223506.43 us (0.22 sec); median 427,565,407 cycles, 1x: 427,565,407 cycles + - FORS signing.. avg. 34495.41 us (0.03 sec); median 65,987,951 cycles, 1x: 65,987,951 cycles + - WOTS pk gen x2.. avg. 1395.16 us (0.00 sec); median 2,656,390 cycles, 136x: 361,269,040 cycles +Verifying.. avg. 
6452.39 us (0.01 sec); median 12,329,958 cycles, 1x: 12,329,958 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256s-robust_BAS b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256s-robust_BAS new file mode 100644 index 0000000..55ec7c8 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256s-robust_BAS @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16 +Running 10 iterations. +thash avg. 1.67 us (0.00 sec); median 3,183 cycles, 1x: 3,183 cycles +f1600x2 avg. 0.81 us (0.00 sec); median 1,550 cycles, 1x: 1,550 cycles +thashx2 avg. 1.67 us (0.00 sec); median 3,179 cycles, 1x: 3,179 cycles +Generating keypair.. avg. 224870.83 us (0.22 sec); median 430,398,648 cycles, 1x: 430,398,648 cycles + - WOTS pk gen 2x.. avg. 1765.27 us (0.00 sec); median 3,361,602 cycles, 128x: 430,285,056 cycles +Signing.. avg. 2552222.28 us (2.55 sec); median 4,885,250,328 cycles, 1x: 4,885,250,328 cycles + - FORS signing.. avg. 753158.63 us (0.75 sec); median 1,441,583,715 cycles, 1x: 1,441,583,715 cycles + - WOTS pk gen x2.. avg. 1764.53 us (0.00 sec); median 3,359,149 cycles, 1024x: 3,439,768,576 cycles +Verifying.. avg. 4410.36 us (0.00 sec); median 8,415,380 cycles, 1x: 8,415,380 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256s-robust_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256s-robust_C new file mode 100644 index 0000000..6e622b9 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256s-robust_C @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16 +Running 10 iterations. +thash avg. 1.38 us (0.00 sec); median 3,446 cycles, 1x: 3,446 cycles +f1600x2 avg. 
0.68 us (0.00 sec); median 1,684 cycles, 1x: 1,684 cycles +thashx2 avg. 1.37 us (0.00 sec); median 3,441 cycles, 1x: 3,441 cycles +Generating keypair.. avg. 191304.25 us (0.19 sec); median 466,865,747 cycles, 1x: 466,865,747 cycles + - WOTS pk gen 2x.. avg. 1499.86 us (0.00 sec); median 3,646,903 cycles, 128x: 466,803,584 cycles +Signing.. avg. 2308569.20 us (2.31 sec); median 5,299,006,695 cycles, 1x: 5,299,006,695 cycles + - FORS signing.. avg. 775216.72 us (0.78 sec); median 1,564,108,668 cycles, 1x: 1,564,108,668 cycles + - WOTS pk gen x2.. avg. 1912.91 us (0.00 sec); median 3,645,829 cycles, 1024x: 3,733,328,896 cycles +Verifying.. avg. 4652.82 us (0.00 sec); median 8,891,986 cycles, 1x: 8,891,986 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256s-robust_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256s-robust_COTHANV8 new file mode 100644 index 0000000..d81be0f --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256s-robust_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16 +Running 10 iterations. +thash avg. 2.09 us (0.00 sec); median 4,868 cycles, 1x: 4,868 cycles +f1600x2 avg. 1.05 us (0.00 sec); median 2,396 cycles, 1x: 2,396 cycles +thashx2 avg. 2.10 us (0.00 sec); median 4,864 cycles, 1x: 4,864 cycles +Generating keypair.. avg. 281338.16 us (0.28 sec); median 658,303,005 cycles, 1x: 658,303,005 cycles + - WOTS pk gen 2x.. avg. 2190.22 us (0.00 sec); median 5,138,801 cycles, 128x: 657,766,528 cycles +Signing.. avg. 3652383.53 us (3.65 sec); median 7,474,936,125 cycles, 1x: 7,474,936,125 cycles + - FORS signing.. avg. 1153479.27 us (1.15 sec); median 2,207,940,370 cycles, 1x: 2,207,940,370 cycles + - WOTS pk gen x2.. avg. 2691.16 us (0.00 sec); median 5,136,581 cycles, 1024x: 5,259,858,944 cycles +Verifying.. avg. 
6655.34 us (0.01 sec); median 12,713,883 cycles, 1x: 12,713,883 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256s-simple_BAS b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256s-simple_BAS new file mode 100644 index 0000000..94b12b1 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256s-simple_BAS @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16 +Running 10 iterations. +thash avg. 0.84 us (0.00 sec); median 1,594 cycles, 1x: 1,594 cycles +f1600x2 avg. 0.81 us (0.00 sec); median 1,549 cycles, 1x: 1,549 cycles +thashx2 avg. 0.83 us (0.00 sec); median 1,589 cycles, 1x: 1,589 cycles +Generating keypair.. avg. 116421.78 us (0.12 sec); median 222,876,525 cycles, 1x: 222,876,525 cycles + - WOTS pk gen 2x.. avg. 915.20 us (0.00 sec); median 1,738,925 cycles, 128x: 222,582,400 cycles +Signing.. avg. 1385214.68 us (1.39 sec); median 2,651,240,293 cycles, 1x: 2,651,240,293 cycles + - FORS signing.. avg. 453268.54 us (0.45 sec); median 867,553,622 cycles, 1x: 867,553,622 cycles + - WOTS pk gen x2.. avg. 915.23 us (0.00 sec); median 1,740,466 cycles, 1024x: 1,782,237,184 cycles +Verifying.. avg. 2235.66 us (0.00 sec); median 4,249,954 cycles, 1x: 4,249,954 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256s-simple_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256s-simple_C new file mode 100644 index 0000000..9882d83 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256s-simple_C @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16 +Running 10 iterations. +thash avg. 0.91 us (0.00 sec); median 1,728 cycles, 1x: 1,728 cycles +f1600x2 avg. 
0.89 us (0.00 sec); median 1,685 cycles, 1x: 1,685 cycles +thashx2 avg. 0.91 us (0.00 sec); median 1,724 cycles, 1x: 1,724 cycles +Generating keypair.. avg. 126297.98 us (0.13 sec); median 241,746,354 cycles, 1x: 241,746,354 cycles + - WOTS pk gen 2x.. avg. 986.53 us (0.00 sec); median 1,886,760 cycles, 128x: 241,505,280 cycles +Signing.. avg. 1500947.76 us (1.50 sec); median 2,873,371,886 cycles, 1x: 2,873,371,886 cycles + - FORS signing.. avg. 490530.33 us (0.49 sec); median 938,945,123 cycles, 1x: 938,945,123 cycles + - WOTS pk gen x2.. avg. 991.73 us (0.00 sec); median 1,886,403 cycles, 1024x: 1,931,676,672 cycles +Verifying.. avg. 2382.58 us (0.00 sec); median 4,536,720 cycles, 1x: 4,536,720 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256s-simple_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256s-simple_COTHANV8 new file mode 100644 index 0000000..2a73bc2 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A710/sphincs-shake-256s-simple_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16 +Running 10 iterations. +thash avg. 1.28 us (0.00 sec); median 2,439 cycles, 1x: 2,439 cycles +f1600x2 avg. 1.26 us (0.00 sec); median 2,398 cycles, 1x: 2,398 cycles +thashx2 avg. 1.28 us (0.00 sec); median 2,435 cycles, 1x: 2,435 cycles +Generating keypair.. avg. 177811.48 us (0.18 sec); median 340,270,587 cycles, 1x: 340,270,587 cycles + - WOTS pk gen 2x.. avg. 1393.93 us (0.00 sec); median 2,656,405 cycles, 128x: 340,019,840 cycles +Signing.. avg. 1979381.28 us (1.98 sec); median 4,047,907,183 cycles, 1x: 4,047,907,183 cycles + - FORS signing.. avg. 560054.46 us (0.56 sec); median 1,324,794,230 cycles, 1x: 1,324,794,230 cycles + - WOTS pk gen x2.. avg. 1116.46 us (0.00 sec); median 2,655,222 cycles, 1024x: 2,718,947,328 cycles +Verifying.. avg. 
2684.95 us (0.00 sec); median 6,300,395 cycles, 1x: 6,300,395 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-128f-robust_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-128f-robust_C new file mode 100644 index 0000000..4c40183 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-128f-robust_C @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16 +Running 10 iterations. +thash avg. 2.06 us (0.00 sec); median 3,412 cycles, 1x: 3,412 cycles +f1600x2 avg. 1.01 us (0.00 sec); median 1,678 cycles, 1x: 1,678 cycles +thashx2 avg. 2.04 us (0.00 sec); median 3,408 cycles, 1x: 3,408 cycles +Generating keypair.. avg. 4487.07 us (0.00 sec); median 7,506,993 cycles, 1x: 7,506,993 cycles + - WOTS pk gen 2x.. avg. 1112.63 us (0.00 sec); median 1,874,353 cycles, 4x: 7,497,412 cycles +Signing.. avg. 103911.53 us (0.10 sec); median 174,285,242 cycles, 1x: 174,285,242 cycles + - FORS signing.. avg. 5393.28 us (0.01 sec); median 9,107,198 cycles, 1x: 9,107,198 cycles + - WOTS pk gen x2.. avg. 1138.93 us (0.00 sec); median 1,874,212 cycles, 88x: 164,930,656 cycles +Verifying.. avg. 7085.67 us (0.01 sec); median 11,912,059 cycles, 1x: 11,912,059 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-128f-robust_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-128f-robust_COTHANV8 new file mode 100644 index 0000000..a26039a --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-128f-robust_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16 +Running 10 iterations. +thash avg. 2.89 us (0.00 sec); median 4,890 cycles, 1x: 4,890 cycles +f1600x2 avg. 
1.44 us (0.00 sec); median 2,396 cycles, 1x: 2,396 cycles +thashx2 avg. 2.90 us (0.00 sec); median 4,882 cycles, 1x: 4,882 cycles +Generating keypair.. avg. 6388.16 us (0.01 sec); median 10,731,381 cycles, 1x: 10,731,381 cycles + - WOTS pk gen 2x.. avg. 1570.75 us (0.00 sec); median 2,669,194 cycles, 4x: 10,676,776 cycles +Signing.. avg. 147734.70 us (0.15 sec); median 249,061,402 cycles, 1x: 249,061,402 cycles + - FORS signing.. avg. 7681.27 us (0.01 sec); median 12,980,637 cycles, 1x: 12,980,637 cycles + - WOTS pk gen x2.. avg. 1577.93 us (0.00 sec); median 2,669,023 cycles, 88x: 234,874,024 cycles +Verifying.. avg. 9985.75 us (0.01 sec); median 16,938,513 cycles, 1x: 16,938,513 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-128f-simple_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-128f-simple_C new file mode 100644 index 0000000..496e43a --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-128f-simple_C @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16 +Running 10 iterations. +thash avg. 1.05 us (0.00 sec); median 1,699 cycles, 1x: 1,699 cycles +f1600x2 avg. 1.04 us (0.00 sec); median 1,680 cycles, 1x: 1,680 cycles +thashx2 avg. 1.05 us (0.00 sec); median 1,697 cycles, 1x: 1,697 cycles +Generating keypair.. avg. 2363.18 us (0.00 sec); median 3,850,777 cycles, 1x: 3,850,777 cycles + - WOTS pk gen 2x.. avg. 582.57 us (0.00 sec); median 960,473 cycles, 4x: 3,841,892 cycles +Signing.. avg. 54638.15 us (0.05 sec); median 90,185,627 cycles, 1x: 90,185,627 cycles + - FORS signing.. avg. 3326.88 us (0.00 sec); median 5,491,435 cycles, 1x: 5,491,435 cycles + - WOTS pk gen x2.. avg. 584.90 us (0.00 sec); median 960,772 cycles, 88x: 84,547,936 cycles +Verifying.. avg. 
3457.19 us (0.00 sec); median 5,702,717 cycles, 1x: 5,702,717 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-128f-simple_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-128f-simple_COTHANV8 new file mode 100644 index 0000000..d5907d4 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-128f-simple_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16 +Running 10 iterations. +thash avg. 1.47 us (0.00 sec); median 2,435 cycles, 1x: 2,435 cycles +f1600x2 avg. 1.44 us (0.00 sec); median 2,396 cycles, 1x: 2,396 cycles +thashx2 avg. 1.46 us (0.00 sec); median 2,431 cycles, 1x: 2,431 cycles +Generating keypair.. avg. 3316.87 us (0.00 sec); median 5,517,219 cycles, 1x: 5,517,219 cycles + - WOTS pk gen 2x.. avg. 831.40 us (0.00 sec); median 1,376,199 cycles, 4x: 5,504,796 cycles +Signing.. avg. 77477.64 us (0.08 sec); median 129,176,384 cycles, 1x: 129,176,384 cycles + - FORS signing.. avg. 4691.00 us (0.00 sec); median 7,793,400 cycles, 1x: 7,793,400 cycles + - WOTS pk gen x2.. avg. 818.72 us (0.00 sec); median 1,376,044 cycles, 88x: 121,091,872 cycles +Verifying.. avg. 5038.86 us (0.01 sec); median 8,443,068 cycles, 1x: 8,443,068 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-128s-robust_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-128s-robust_C new file mode 100644 index 0000000..9aeaf46 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-128s-robust_C @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16 +Running 10 iterations. +thash avg. 1.94 us (0.00 sec); median 3,413 cycles, 1x: 3,413 cycles +f1600x2 avg. 
0.95 us (0.00 sec); median 1,680 cycles, 1x: 1,680 cycles +thashx2 avg. 1.92 us (0.00 sec); median 3,407 cycles, 1x: 3,407 cycles +Generating keypair.. avg. 271385.03 us (0.27 sec); median 479,607,537 cycles, 1x: 479,607,537 cycles + - WOTS pk gen 2x.. avg. 1060.11 us (0.00 sec); median 1,871,066 cycles, 256x: 478,992,896 cycles +Signing.. avg. 2039891.18 us (2.04 sec); median 3,603,102,398 cycles, 1x: 3,603,102,398 cycles + - FORS signing.. avg. 139543.23 us (0.14 sec); median 245,840,341 cycles, 1x: 245,840,341 cycles + - WOTS pk gen x2.. avg. 1058.92 us (0.00 sec); median 1,871,248 cycles, 1792x: 3,353,276,416 cycles +Verifying.. avg. 2418.61 us (0.00 sec); median 4,276,916 cycles, 1x: 4,276,916 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-128s-robust_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-128s-robust_COTHANV8 new file mode 100644 index 0000000..a096bf9 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-128s-robust_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16 +Running 10 iterations. +thash avg. 2.78 us (0.00 sec); median 4,884 cycles, 1x: 4,884 cycles +f1600x2 avg. 1.36 us (0.00 sec); median 2,395 cycles, 1x: 2,395 cycles +thashx2 avg. 2.77 us (0.00 sec); median 4,880 cycles, 1x: 4,880 cycles +Generating keypair.. avg. 389035.45 us (0.39 sec); median 686,058,727 cycles, 1x: 686,058,727 cycles + - WOTS pk gen 2x.. avg. 1527.35 us (0.00 sec); median 2,675,824 cycles, 256x: 685,010,944 cycles +Signing.. avg. 2919339.70 us (2.92 sec); median 5,153,451,754 cycles, 1x: 5,153,451,754 cycles + - FORS signing.. avg. 199091.34 us (0.20 sec); median 351,071,063 cycles, 1x: 351,071,063 cycles + - WOTS pk gen x2.. avg. 1538.98 us (0.00 sec); median 2,673,530 cycles, 1792x: 4,790,965,760 cycles +Verifying.. avg. 
3626.07 us (0.00 sec); median 6,358,655 cycles, 1x: 6,358,655 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-128s-simple_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-128s-simple_C new file mode 100644 index 0000000..c586dc8 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-128s-simple_C @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16 +Running 10 iterations. +thash avg. 0.98 us (0.00 sec); median 1,698 cycles, 1x: 1,698 cycles +f1600x2 avg. 1.00 us (0.00 sec); median 1,681 cycles, 1x: 1,681 cycles +thashx2 avg. 1.00 us (0.00 sec); median 1,693 cycles, 1x: 1,693 cycles +Generating keypair.. avg. 145540.32 us (0.15 sec); median 246,769,401 cycles, 1x: 246,769,401 cycles + - WOTS pk gen 2x.. avg. 564.78 us (0.00 sec); median 961,291 cycles, 256x: 246,090,496 cycles +Signing.. avg. 1089123.28 us (1.09 sec); median 1,875,465,782 cycles, 1x: 1,875,465,782 cycles + - FORS signing.. avg. 85040.63 us (0.09 sec); median 148,142,210 cycles, 1x: 148,142,210 cycles + - WOTS pk gen x2.. avg. 558.32 us (0.00 sec); median 961,579 cycles, 1792x: 1,723,149,568 cycles +Verifying.. avg. 1148.61 us (0.00 sec); median 1,999,973 cycles, 1x: 1,999,973 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-128s-simple_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-128s-simple_COTHANV8 new file mode 100644 index 0000000..1200052 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-128s-simple_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16 +Running 10 iterations. +thash avg. 1.42 us (0.00 sec); median 2,435 cycles, 1x: 2,435 cycles +f1600x2 avg. 
1.37 us (0.00 sec); median 2,394 cycles, 1x: 2,394 cycles +thashx2 avg. 1.41 us (0.00 sec); median 2,430 cycles, 1x: 2,430 cycles +Generating keypair.. avg. 201680.60 us (0.20 sec); median 352,979,252 cycles, 1x: 352,979,252 cycles + - WOTS pk gen 2x.. avg. 785.95 us (0.00 sec); median 1,377,199 cycles, 256x: 352,562,944 cycles +Signing.. avg. 1526660.35 us (1.53 sec); median 2,681,123,552 cycles, 1x: 2,681,123,552 cycles + - FORS signing.. avg. 119001.72 us (0.12 sec); median 210,278,705 cycles, 1x: 210,278,705 cycles + - WOTS pk gen x2.. avg. 786.45 us (0.00 sec); median 1,375,844 cycles, 1792x: 2,465,512,448 cycles +Verifying.. avg. 1705.93 us (0.00 sec); median 3,013,516 cycles, 1x: 3,013,516 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-192f-robust_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-192f-robust_C new file mode 100644 index 0000000..fa493c6 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-192f-robust_C @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16 +Running 10 iterations. +thash avg. 1.94 us (0.00 sec); median 3,431 cycles, 1x: 3,431 cycles +f1600x2 avg. 0.95 us (0.00 sec); median 1,678 cycles, 1x: 1,678 cycles +thashx2 avg. 1.95 us (0.00 sec); median 3,424 cycles, 1x: 3,424 cycles +Generating keypair.. avg. 6217.88 us (0.01 sec); median 10,965,617 cycles, 1x: 10,965,617 cycles + - WOTS pk gen 2x.. avg. 1561.19 us (0.00 sec); median 2,737,103 cycles, 4x: 10,948,412 cycles +Signing.. avg. 157612.20 us (0.16 sec); median 277,590,412 cycles, 1x: 277,590,412 cycles + - FORS signing.. avg. 20576.21 us (0.02 sec); median 36,354,482 cycles, 1x: 36,354,482 cycles + - WOTS pk gen x2.. avg. 1570.05 us (0.00 sec); median 2,737,602 cycles, 88x: 240,908,976 cycles +Verifying.. avg. 
9648.35 us (0.01 sec); median 16,993,370 cycles, 1x: 16,993,370 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-192f-robust_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-192f-robust_COTHANV8 new file mode 100644 index 0000000..5eca262 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-192f-robust_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16 +Running 10 iterations. +thash avg. 2.80 us (0.00 sec); median 4,920 cycles, 1x: 4,920 cycles +f1600x2 avg. 1.36 us (0.00 sec); median 2,409 cycles, 1x: 2,409 cycles +thashx2 avg. 2.79 us (0.00 sec); median 4,911 cycles, 1x: 4,911 cycles +Generating keypair.. avg. 8910.69 us (0.01 sec); median 15,709,858 cycles, 1x: 15,709,858 cycles + - WOTS pk gen 2x.. avg. 2227.66 us (0.00 sec); median 3,923,105 cycles, 4x: 15,692,420 cycles +Signing.. avg. 225354.92 us (0.23 sec); median 397,618,096 cycles, 1x: 397,618,096 cycles + - FORS signing.. avg. 29584.05 us (0.03 sec); median 51,997,389 cycles, 1x: 51,997,389 cycles + - WOTS pk gen x2.. avg. 2232.72 us (0.00 sec); median 3,923,834 cycles, 88x: 345,297,392 cycles +Verifying.. avg. 14048.80 us (0.01 sec); median 24,656,543 cycles, 1x: 24,656,543 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-192f-simple_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-192f-simple_C new file mode 100644 index 0000000..f09c66f --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-192f-simple_C @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16 +Running 10 iterations. +thash avg. 0.97 us (0.00 sec); median 1,707 cycles, 1x: 1,707 cycles +f1600x2 avg. 
0.96 us (0.00 sec); median 1,680 cycles, 1x: 1,680 cycles +thashx2 avg. 0.97 us (0.00 sec); median 1,703 cycles, 1x: 1,703 cycles +Generating keypair.. avg. 3213.33 us (0.00 sec); median 5,652,429 cycles, 1x: 5,652,429 cycles + - WOTS pk gen 2x.. avg. 800.06 us (0.00 sec); median 1,411,898 cycles, 4x: 5,647,592 cycles +Signing.. avg. 82895.53 us (0.08 sec); median 146,359,836 cycles, 1x: 146,359,836 cycles + - FORS signing.. avg. 12486.66 us (0.01 sec); median 21,977,623 cycles, 1x: 21,977,623 cycles + - WOTS pk gen x2.. avg. 796.20 us (0.00 sec); median 1,411,657 cycles, 88x: 124,225,816 cycles +Verifying.. avg. 4905.61 us (0.00 sec); median 8,585,218 cycles, 1x: 8,585,218 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-192f-simple_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-192f-simple_COTHANV8 new file mode 100644 index 0000000..bf21970 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-192f-simple_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16 +Running 10 iterations. +thash avg. 1.40 us (0.00 sec); median 2,445 cycles, 1x: 2,445 cycles +f1600x2 avg. 1.37 us (0.00 sec); median 2,411 cycles, 1x: 2,411 cycles +thashx2 avg. 1.40 us (0.00 sec); median 2,441 cycles, 1x: 2,441 cycles +Generating keypair.. avg. 4605.40 us (0.00 sec); median 8,096,681 cycles, 1x: 8,096,681 cycles + - WOTS pk gen 2x.. avg. 1157.80 us (0.00 sec); median 2,021,682 cycles, 4x: 8,086,728 cycles +Signing.. avg. 118715.26 us (0.12 sec); median 209,341,577 cycles, 1x: 209,341,577 cycles + - FORS signing.. avg. 17706.90 us (0.02 sec); median 31,190,381 cycles, 1x: 31,190,381 cycles + - WOTS pk gen x2.. avg. 1156.19 us (0.00 sec); median 2,022,067 cycles, 88x: 177,941,896 cycles +Verifying.. avg. 
6968.86 us (0.01 sec); median 12,250,621 cycles, 1x: 12,250,621 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-192s-robust_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-192s-robust_C new file mode 100644 index 0000000..70b1b19 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-192s-robust_C @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16 +Running 10 iterations. +thash avg. 1.95 us (0.00 sec); median 3,419 cycles, 1x: 3,419 cycles +f1600x2 avg. 0.95 us (0.00 sec); median 1,679 cycles, 1x: 1,679 cycles +thashx2 avg. 1.96 us (0.00 sec); median 3,414 cycles, 1x: 3,414 cycles +Generating keypair.. avg. 401389.63 us (0.40 sec); median 703,600,404 cycles, 1x: 703,600,404 cycles + - WOTS pk gen 2x.. avg. 1570.50 us (0.00 sec); median 2,743,615 cycles, 256x: 702,365,440 cycles +Signing.. avg. 3495824.94 us (3.50 sec); median 6,123,692,880 cycles, 1x: 6,123,692,880 cycles + - FORS signing.. avg. 682153.17 us (0.68 sec); median 1,198,906,082 cycles, 1x: 1,198,906,082 cycles + - WOTS pk gen x2.. avg. 1571.95 us (0.00 sec); median 2,746,415 cycles, 1792x: 4,921,575,680 cycles +Verifying.. avg. 3477.37 us (0.00 sec); median 6,088,118 cycles, 1x: 6,088,118 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-192s-robust_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-192s-robust_COTHANV8 new file mode 100644 index 0000000..e0d21ae --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-192s-robust_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16 +Running 10 iterations. +thash avg. 2.82 us (0.00 sec); median 4,917 cycles, 1x: 4,917 cycles +f1600x2 avg. 
1.39 us (0.00 sec); median 2,417 cycles, 1x: 2,417 cycles +thashx2 avg. 2.79 us (0.00 sec); median 4,911 cycles, 1x: 4,911 cycles +Generating keypair.. avg. 571760.47 us (0.57 sec); median 1,006,686,291 cycles, 1x: 1,006,686,291 cycles + - WOTS pk gen 2x.. avg. 2242.08 us (0.00 sec); median 3,922,323 cycles, 256x: 1,004,114,688 cycles +Signing.. avg. 4954045.21 us (4.95 sec); median 8,759,149,070 cycles, 1x: 8,759,149,070 cycles + - FORS signing.. avg. 970376.25 us (0.97 sec); median 1,712,746,803 cycles, 1x: 1,712,746,803 cycles + - WOTS pk gen x2.. avg. 2228.87 us (0.00 sec); median 3,923,001 cycles, 1792x: 7,030,017,792 cycles +Verifying.. avg. 4998.47 us (0.00 sec); median 8,813,618 cycles, 1x: 8,813,618 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-192s-simple_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-192s-simple_C new file mode 100644 index 0000000..405725a --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-192s-simple_C @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16 +Running 10 iterations. +thash avg. 0.97 us (0.00 sec); median 1,706 cycles, 1x: 1,706 cycles +f1600x2 avg. 0.96 us (0.00 sec); median 1,681 cycles, 1x: 1,681 cycles +thashx2 avg. 0.97 us (0.00 sec); median 1,703 cycles, 1x: 1,703 cycles +Generating keypair.. avg. 206139.46 us (0.21 sec); median 362,191,243 cycles, 1x: 362,191,243 cycles + - WOTS pk gen 2x.. avg. 812.30 us (0.00 sec); median 1,420,290 cycles, 256x: 363,594,240 cycles +Signing.. avg. 1853075.21 us (1.85 sec); median 3,257,344,745 cycles, 1x: 3,257,344,745 cycles + - FORS signing.. avg. 410679.35 us (0.41 sec); median 722,097,182 cycles, 1x: 722,097,182 cycles + - WOTS pk gen x2.. avg. 812.42 us (0.00 sec); median 1,421,547 cycles, 1792x: 2,547,412,224 cycles +Verifying.. avg. 
1742.04 us (0.00 sec); median 3,048,482 cycles, 1x: 3,048,482 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-192s-simple_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-192s-simple_COTHANV8 new file mode 100644 index 0000000..c8a3361 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-192s-simple_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16 +Running 10 iterations. +thash avg. 1.40 us (0.00 sec); median 2,447 cycles, 1x: 2,447 cycles +f1600x2 avg. 1.39 us (0.00 sec); median 2,412 cycles, 1x: 2,412 cycles +thashx2 avg. 1.40 us (0.00 sec); median 2,437 cycles, 1x: 2,437 cycles +Generating keypair.. avg. 294785.12 us (0.29 sec); median 518,355,899 cycles, 1x: 518,355,899 cycles + - WOTS pk gen 2x.. avg. 1150.79 us (0.00 sec); median 2,023,517 cycles, 256x: 518,020,352 cycles +Signing.. avg. 2651688.51 us (2.65 sec); median 4,654,280,905 cycles, 1x: 4,654,280,905 cycles + - FORS signing.. avg. 584922.42 us (0.58 sec); median 1,025,675,229 cycles, 1x: 1,025,675,229 cycles + - WOTS pk gen x2.. avg. 1148.13 us (0.00 sec); median 2,022,836 cycles, 1792x: 3,624,922,112 cycles +Verifying.. avg. 2494.73 us (0.00 sec); median 4,380,613 cycles, 1x: 4,380,613 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-256f-robust_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-256f-robust_C new file mode 100644 index 0000000..55b1243 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-256f-robust_C @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16 +Running 10 iterations. +thash avg. 1.94 us (0.00 sec); median 3,418 cycles, 1x: 3,418 cycles +f1600x2 avg. 
0.97 us (0.00 sec); median 1,675 cycles, 1x: 1,675 cycles +thashx2 avg. 1.94 us (0.00 sec); median 3,413 cycles, 1x: 3,413 cycles +Generating keypair.. avg. 16383.24 us (0.02 sec); median 28,915,608 cycles, 1x: 28,915,608 cycles + - WOTS pk gen 2x.. avg. 2060.48 us (0.00 sec); median 3,609,811 cycles, 8x: 28,878,488 cycles +Signing.. avg. 322707.89 us (0.32 sec); median 568,680,828 cycles, 1x: 568,680,828 cycles + - FORS signing.. avg. 43762.25 us (0.04 sec); median 77,146,416 cycles, 1x: 77,146,416 cycles + - WOTS pk gen x2.. avg. 2052.63 us (0.00 sec); median 3,610,347 cycles, 136x: 491,007,192 cycles +Verifying.. avg. 9942.30 us (0.01 sec); median 17,544,888 cycles, 1x: 17,544,888 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-256f-robust_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-256f-robust_COTHANV8 new file mode 100644 index 0000000..8eddc2f --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-256f-robust_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16 +Running 10 iterations. +thash avg. 2.80 us (0.00 sec); median 4,897 cycles, 1x: 4,897 cycles +f1600x2 avg. 1.38 us (0.00 sec); median 2,409 cycles, 1x: 2,409 cycles +thashx2 avg. 2.77 us (0.00 sec); median 4,890 cycles, 1x: 4,890 cycles +Generating keypair.. avg. 23522.81 us (0.02 sec); median 41,414,021 cycles, 1x: 41,414,021 cycles + - WOTS pk gen 2x.. avg. 2962.99 us (0.00 sec); median 5,172,828 cycles, 8x: 41,382,624 cycles +Signing.. avg. 462587.63 us (0.46 sec); median 814,211,376 cycles, 1x: 814,211,376 cycles + - FORS signing.. avg. 62592.13 us (0.06 sec); median 110,169,311 cycles, 1x: 110,169,311 cycles + - WOTS pk gen x2.. avg. 2952.05 us (0.00 sec); median 5,173,996 cycles, 136x: 703,663,456 cycles +Verifying.. avg. 
14586.48 us (0.01 sec); median 25,602,793 cycles, 1x: 25,602,793 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-256f-simple_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-256f-simple_C new file mode 100644 index 0000000..95a606a --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-256f-simple_C @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16 +Running 10 iterations. +thash avg. 0.97 us (0.00 sec); median 1,710 cycles, 1x: 1,710 cycles +f1600x2 avg. 0.96 us (0.00 sec); median 1,679 cycles, 1x: 1,679 cycles +thashx2 avg. 0.97 us (0.00 sec); median 1,704 cycles, 1x: 1,704 cycles +Generating keypair.. avg. 8448.46 us (0.01 sec); median 14,919,649 cycles, 1x: 14,919,649 cycles + - WOTS pk gen 2x.. avg. 1061.03 us (0.00 sec); median 1,863,605 cycles, 8x: 14,908,840 cycles +Signing.. avg. 170180.72 us (0.17 sec); median 300,199,912 cycles, 1x: 300,199,912 cycles + - FORS signing.. avg. 26389.17 us (0.03 sec); median 46,582,053 cycles, 1x: 46,582,053 cycles + - WOTS pk gen x2.. avg. 1065.29 us (0.00 sec); median 1,862,774 cycles, 136x: 253,337,264 cycles +Verifying.. avg. 4975.33 us (0.00 sec); median 8,750,146 cycles, 1x: 8,750,146 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-256f-simple_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-256f-simple_COTHANV8 new file mode 100644 index 0000000..363da08 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-256f-simple_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16 +Running 10 iterations. +thash avg. 1.39 us (0.00 sec); median 2,450 cycles, 1x: 2,450 cycles +f1600x2 avg. 
1.37 us (0.00 sec); median 2,407 cycles, 1x: 2,407 cycles +thashx2 avg. 1.39 us (0.00 sec); median 2,444 cycles, 1x: 2,444 cycles +Generating keypair.. avg. 12165.87 us (0.01 sec); median 21,391,660 cycles, 1x: 21,391,660 cycles + - WOTS pk gen 2x.. avg. 1520.73 us (0.00 sec); median 2,671,388 cycles, 8x: 21,371,104 cycles +Signing.. avg. 243798.24 us (0.24 sec); median 429,790,459 cycles, 1x: 429,790,459 cycles + - FORS signing.. avg. 37541.04 us (0.04 sec); median 66,186,027 cycles, 1x: 66,186,027 cycles + - WOTS pk gen x2.. avg. 1513.76 us (0.00 sec); median 2,669,915 cycles, 136x: 363,108,440 cycles +Verifying.. avg. 7195.03 us (0.01 sec); median 12,672,518 cycles, 1x: 12,672,518 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-256s-robust_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-256s-robust_C new file mode 100644 index 0000000..5b39eed --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-256s-robust_C @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16 +Running 10 iterations. +thash avg. 1.96 us (0.00 sec); median 3,422 cycles, 1x: 3,422 cycles +f1600x2 avg. 0.96 us (0.00 sec); median 1,683 cycles, 1x: 1,683 cycles +thashx2 avg. 1.94 us (0.00 sec); median 3,414 cycles, 1x: 3,414 cycles +Generating keypair.. avg. 263367.57 us (0.26 sec); median 462,578,681 cycles, 1x: 462,578,681 cycles + - WOTS pk gen 2x.. avg. 2075.26 us (0.00 sec); median 3,610,785 cycles, 128x: 462,180,480 cycles +Signing.. avg. 2997720.71 us (3.00 sec); median 5,252,300,493 cycles, 1x: 5,252,300,493 cycles + - FORS signing.. avg. 889563.58 us (0.89 sec); median 1,551,655,300 cycles, 1x: 1,551,655,300 cycles + - WOTS pk gen x2.. avg. 2089.08 us (0.00 sec); median 3,613,074 cycles, 1024x: 3,699,787,776 cycles +Verifying.. avg. 
5120.21 us (0.01 sec); median 8,918,496 cycles, 1x: 8,918,496 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-256s-robust_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-256s-robust_COTHANV8 new file mode 100644 index 0000000..f45a4f6 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-256s-robust_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16 +Running 10 iterations. +thash avg. 2.85 us (0.00 sec); median 4,905 cycles, 1x: 4,905 cycles +f1600x2 avg. 1.38 us (0.00 sec); median 2,409 cycles, 1x: 2,409 cycles +thashx2 avg. 2.79 us (0.00 sec); median 4,901 cycles, 1x: 4,901 cycles +Generating keypair.. avg. 379810.73 us (0.38 sec); median 662,009,193 cycles, 1x: 662,009,193 cycles + - WOTS pk gen 2x.. avg. 2969.91 us (0.00 sec); median 5,167,120 cycles, 128x: 661,391,360 cycles +Signing.. avg. 4313724.88 us (4.31 sec); median 7,507,395,630 cycles, 1x: 7,507,395,630 cycles + - FORS signing.. avg. 1279549.69 us (1.28 sec); median 2,211,571,990 cycles, 1x: 2,211,571,990 cycles + - WOTS pk gen x2.. avg. 2981.16 us (0.00 sec); median 5,167,434 cycles, 1024x: 5,291,452,416 cycles +Verifying.. avg. 7666.08 us (0.01 sec); median 13,192,484 cycles, 1x: 13,192,484 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-256s-simple_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-256s-simple_C new file mode 100644 index 0000000..0054a92 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-256s-simple_C @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16 +Running 10 iterations. +thash avg. 0.97 us (0.00 sec); median 1,709 cycles, 1x: 1,709 cycles +f1600x2 avg. 
0.96 us (0.00 sec); median 1,675 cycles, 1x: 1,675 cycles +thashx2 avg. 0.98 us (0.00 sec); median 1,706 cycles, 1x: 1,706 cycles +Generating keypair.. avg. 135608.92 us (0.14 sec); median 238,860,354 cycles, 1x: 238,860,354 cycles + - WOTS pk gen 2x.. avg. 1071.52 us (0.00 sec); median 1,864,156 cycles, 128x: 238,611,968 cycles +Signing.. avg. 1617769.78 us (1.62 sec); median 2,846,776,816 cycles, 1x: 2,846,776,816 cycles + - FORS signing.. avg. 531785.79 us (0.53 sec); median 936,104,732 cycles, 1x: 936,104,732 cycles + - WOTS pk gen x2.. avg. 1067.00 us (0.00 sec); median 1,864,581 cycles, 1024x: 1,909,330,944 cycles +Verifying.. avg. 2587.03 us (0.00 sec); median 4,521,643 cycles, 1x: 4,521,643 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-256s-simple_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-256s-simple_COTHANV8 new file mode 100644 index 0000000..cc714e6 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_A78/sphincs-shake-256s-simple_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16 +Running 10 iterations. +thash avg. 1.41 us (0.00 sec); median 2,450 cycles, 1x: 2,450 cycles +f1600x2 avg. 1.38 us (0.00 sec); median 2,409 cycles, 1x: 2,409 cycles +thashx2 avg. 1.39 us (0.00 sec); median 2,445 cycles, 1x: 2,445 cycles +Generating keypair.. avg. 194320.36 us (0.19 sec); median 342,297,658 cycles, 1x: 342,297,658 cycles + - WOTS pk gen 2x.. avg. 1527.10 us (0.00 sec); median 2,671,827 cycles, 128x: 341,993,856 cycles +Signing.. avg. 2314099.44 us (2.31 sec); median 4,068,638,034 cycles, 1x: 4,068,638,034 cycles + - FORS signing.. avg. 758012.75 us (0.76 sec); median 1,330,309,394 cycles, 1x: 1,330,309,394 cycles + - WOTS pk gen x2.. avg. 1519.91 us (0.00 sec); median 2,671,153 cycles, 1024x: 2,735,260,672 cycles +Verifying.. avg. 
3564.92 us (0.00 sec); median 6,270,672 cycles, 1x: 6,270,672 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-128f-robust_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-128f-robust_C new file mode 100644 index 0000000..d5ea02e --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-128f-robust_C @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16 +Running 10 iterations. +thash avg. 1.65 us (0.00 sec); median 3,350 cycles, 1x: 3,350 cycles +f1600x2 avg. 0.84 us (0.00 sec); median 1,662 cycles, 1x: 1,662 cycles +thashx2 avg. 1.65 us (0.00 sec); median 3,345 cycles, 1x: 3,345 cycles +Generating keypair.. avg. 3617.15 us (0.00 sec); median 7,358,319 cycles, 1x: 7,358,319 cycles + - WOTS pk gen 2x.. avg. 921.86 us (0.00 sec); median 1,836,298 cycles, 4x: 7,345,192 cycles +Signing.. avg. 85660.52 us (0.09 sec); median 170,826,272 cycles, 1x: 170,826,272 cycles + - FORS signing.. avg. 4551.81 us (0.00 sec); median 8,967,518 cycles, 1x: 8,967,518 cycles + - WOTS pk gen x2.. avg. 944.93 us (0.00 sec); median 1,835,663 cycles, 88x: 161,538,344 cycles +Verifying.. avg. 5832.46 us (0.01 sec); median 11,502,609 cycles, 1x: 11,502,609 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-128f-robust_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-128f-robust_COTHANV8 new file mode 100644 index 0000000..af92136 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-128f-robust_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16 +Running 10 iterations. +thash avg. 1.42 us (0.00 sec); median 2,782 cycles, 1x: 2,782 cycles +f1600x2 avg. 
0.70 us (0.00 sec); median 1,361 cycles, 1x: 1,361 cycles +thashx2 avg. 1.43 us (0.00 sec); median 2,778 cycles, 1x: 2,778 cycles +Generating keypair.. avg. 3157.34 us (0.00 sec); median 6,112,043 cycles, 1x: 6,112,043 cycles + - WOTS pk gen 2x.. avg. 776.32 us (0.00 sec); median 1,524,557 cycles, 4x: 6,098,228 cycles +Signing.. avg. 73704.31 us (0.07 sec); median 141,857,043 cycles, 1x: 141,857,043 cycles + - FORS signing.. avg. 3860.26 us (0.00 sec); median 7,429,614 cycles, 1x: 7,429,614 cycles + - WOTS pk gen x2.. avg. 805.61 us (0.00 sec); median 1,524,836 cycles, 88x: 134,185,568 cycles +Verifying.. avg. 5111.76 us (0.01 sec); median 9,834,959 cycles, 1x: 9,834,959 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-128f-simple_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-128f-simple_C new file mode 100644 index 0000000..c19074d --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-128f-simple_C @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16 +Running 10 iterations. +thash avg. 0.80 us (0.00 sec); median 1,674 cycles, 1x: 1,674 cycles +f1600x2 avg. 0.79 us (0.00 sec); median 1,657 cycles, 1x: 1,657 cycles +thashx2 avg. 0.80 us (0.00 sec); median 1,669 cycles, 1x: 1,669 cycles +Generating keypair.. avg. 1818.19 us (0.00 sec); median 3,803,865 cycles, 1x: 3,803,865 cycles + - WOTS pk gen 2x.. avg. 476.42 us (0.00 sec); median 949,213 cycles, 4x: 3,796,852 cycles +Signing.. avg. 43448.22 us (0.04 sec); median 89,142,953 cycles, 1x: 89,142,953 cycles + - FORS signing.. avg. 2678.15 us (0.00 sec); median 5,436,710 cycles, 1x: 5,436,710 cycles + - WOTS pk gen x2.. avg. 467.59 us (0.00 sec); median 948,174 cycles, 88x: 83,439,312 cycles +Verifying.. avg. 
2731.90 us (0.00 sec); median 5,574,456 cycles, 1x: 5,574,456 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-128f-simple_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-128f-simple_COTHANV8 new file mode 100644 index 0000000..d1553fc --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-128f-simple_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16 +Running 10 iterations. +thash avg. 0.69 us (0.00 sec); median 1,395 cycles, 1x: 1,395 cycles +f1600x2 avg. 0.67 us (0.00 sec); median 1,359 cycles, 1x: 1,359 cycles +thashx2 avg. 0.69 us (0.00 sec); median 1,392 cycles, 1x: 1,392 cycles +Generating keypair.. avg. 1559.01 us (0.00 sec); median 3,164,842 cycles, 1x: 3,164,842 cycles + - WOTS pk gen 2x.. avg. 401.21 us (0.00 sec); median 789,990 cycles, 4x: 3,159,960 cycles +Signing.. avg. 37539.99 us (0.04 sec); median 74,151,231 cycles, 1x: 74,151,231 cycles + - FORS signing.. avg. 2298.46 us (0.00 sec); median 4,491,929 cycles, 1x: 4,491,929 cycles + - WOTS pk gen x2.. avg. 396.38 us (0.00 sec); median 790,490 cycles, 88x: 69,563,120 cycles +Verifying.. avg. 2462.98 us (0.00 sec); median 4,802,811 cycles, 1x: 4,802,811 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-128s-robust_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-128s-robust_C new file mode 100644 index 0000000..1e445c9 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-128s-robust_C @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16 +Running 10 iterations. +thash avg. 1.74 us (0.00 sec); median 3,351 cycles, 1x: 3,351 cycles +f1600x2 avg. 
0.86 us (0.00 sec); median 1,655 cycles, 1x: 1,655 cycles +thashx2 avg. 1.76 us (0.00 sec); median 3,345 cycles, 1x: 3,345 cycles +Generating keypair.. avg. 245436.87 us (0.25 sec); median 470,975,755 cycles, 1x: 470,975,755 cycles + - WOTS pk gen 2x.. avg. 951.81 us (0.00 sec); median 1,836,909 cycles, 256x: 470,248,704 cycles +Signing.. avg. 1846717.99 us (1.85 sec); median 3,546,272,464 cycles, 1x: 3,546,272,464 cycles + - FORS signing.. avg. 130390.88 us (0.13 sec); median 249,317,172 cycles, 1x: 249,317,172 cycles + - WOTS pk gen x2.. avg. 965.69 us (0.00 sec); median 1,836,907 cycles, 1792x: 3,291,737,344 cycles +Verifying.. avg. 2179.08 us (0.00 sec); median 4,168,382 cycles, 1x: 4,168,382 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-128s-robust_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-128s-robust_COTHANV8 new file mode 100644 index 0000000..c168e8b --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-128s-robust_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16 +Running 10 iterations. +thash avg. 1.48 us (0.00 sec); median 2,781 cycles, 1x: 2,781 cycles +f1600x2 avg. 0.72 us (0.00 sec); median 1,360 cycles, 1x: 1,360 cycles +thashx2 avg. 1.46 us (0.00 sec); median 2,777 cycles, 1x: 2,777 cycles +Generating keypair.. avg. 207853.76 us (0.21 sec); median 391,074,973 cycles, 1x: 391,074,973 cycles + - WOTS pk gen 2x.. avg. 805.76 us (0.00 sec); median 1,524,662 cycles, 256x: 390,313,472 cycles +Signing.. avg. 1570329.26 us (1.57 sec); median 2,937,624,124 cycles, 1x: 2,937,624,124 cycles + - FORS signing.. avg. 107364.49 us (0.11 sec); median 200,291,231 cycles, 1x: 200,291,231 cycles + - WOTS pk gen x2.. avg. 833.07 us (0.00 sec); median 1,525,336 cycles, 1792x: 2,733,402,112 cycles +Verifying.. avg. 
1954.82 us (0.00 sec); median 3,634,500 cycles, 1x: 3,634,500 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-128s-simple_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-128s-simple_C new file mode 100644 index 0000000..fe1c5af --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-128s-simple_C @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16 +Running 10 iterations. +thash avg. 0.85 us (0.00 sec); median 1,674 cycles, 1x: 1,674 cycles +f1600x2 avg. 0.84 us (0.00 sec); median 1,657 cycles, 1x: 1,657 cycles +thashx2 avg. 0.84 us (0.00 sec); median 1,673 cycles, 1x: 1,673 cycles +Generating keypair.. avg. 124371.08 us (0.12 sec); median 243,378,592 cycles, 1x: 243,378,592 cycles + - WOTS pk gen 2x.. avg. 488.00 us (0.00 sec); median 948,129 cycles, 256x: 242,721,024 cycles +Signing.. avg. 953240.74 us (0.95 sec); median 1,850,259,811 cycles, 1x: 1,850,259,811 cycles + - FORS signing.. avg. 75836.82 us (0.08 sec); median 146,581,432 cycles, 1x: 146,581,432 cycles + - WOTS pk gen x2.. avg. 489.01 us (0.00 sec); median 948,798 cycles, 1792x: 1,700,246,016 cycles +Verifying.. avg. 1202.51 us (0.00 sec); median 2,294,180 cycles, 1x: 2,294,180 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-128s-simple_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-128s-simple_COTHANV8 new file mode 100644 index 0000000..81c08f9 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-128s-simple_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16 +Running 10 iterations. +thash avg. 0.72 us (0.00 sec); median 1,393 cycles, 1x: 1,393 cycles +f1600x2 avg. 
0.70 us (0.00 sec); median 1,364 cycles, 1x: 1,364 cycles +thashx2 avg. 0.73 us (0.00 sec); median 1,387 cycles, 1x: 1,387 cycles +Generating keypair.. avg. 106759.41 us (0.11 sec); median 202,252,292 cycles, 1x: 202,252,292 cycles + - WOTS pk gen 2x.. avg. 413.69 us (0.00 sec); median 788,882 cycles, 256x: 201,953,792 cycles +Signing.. avg. 814170.82 us (0.81 sec); median 1,536,373,295 cycles, 1x: 1,536,373,295 cycles + - FORS signing.. avg. 64064.73 us (0.06 sec); median 120,591,160 cycles, 1x: 120,591,160 cycles + - WOTS pk gen x2.. avg. 410.13 us (0.00 sec); median 789,300 cycles, 1792x: 1,414,425,600 cycles +Verifying.. avg. 993.81 us (0.00 sec); median 1,837,364 cycles, 1x: 1,837,364 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-192f-robust_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-192f-robust_C new file mode 100644 index 0000000..7947107 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-192f-robust_C @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16 +Running 10 iterations. +thash avg. 1.77 us (0.00 sec); median 3,360 cycles, 1x: 3,360 cycles +f1600x2 avg. 0.88 us (0.00 sec); median 1,655 cycles, 1x: 1,655 cycles +thashx2 avg. 1.76 us (0.00 sec); median 3,358 cycles, 1x: 3,358 cycles +Generating keypair.. avg. 5699.77 us (0.01 sec); median 10,831,684 cycles, 1x: 10,831,684 cycles + - WOTS pk gen 2x.. avg. 1420.83 us (0.00 sec); median 2,701,306 cycles, 4x: 10,805,224 cycles +Signing.. avg. 143935.55 us (0.14 sec); median 274,142,174 cycles, 1x: 274,142,174 cycles + - FORS signing.. avg. 18784.70 us (0.02 sec); median 35,784,766 cycles, 1x: 35,784,766 cycles + - WOTS pk gen x2.. avg. 1422.90 us (0.00 sec); median 2,700,705 cycles, 88x: 237,662,040 cycles +Verifying.. avg. 
8942.88 us (0.01 sec); median 17,088,473 cycles, 1x: 17,088,473 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-192f-robust_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-192f-robust_COTHANV8 new file mode 100644 index 0000000..42f5939 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-192f-robust_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16 +Running 10 iterations. +thash avg. 1.47 us (0.00 sec); median 2,795 cycles, 1x: 2,795 cycles +f1600x2 avg. 0.72 us (0.00 sec); median 1,367 cycles, 1x: 1,367 cycles +thashx2 avg. 1.47 us (0.00 sec); median 2,791 cycles, 1x: 2,791 cycles +Generating keypair.. avg. 4711.51 us (0.00 sec); median 8,966,202 cycles, 1x: 8,966,202 cycles + - WOTS pk gen 2x.. avg. 1179.71 us (0.00 sec); median 2,238,658 cycles, 4x: 8,954,632 cycles +Signing.. avg. 120984.89 us (0.12 sec); median 227,007,070 cycles, 1x: 227,007,070 cycles + - FORS signing.. avg. 15906.75 us (0.02 sec); median 29,719,007 cycles, 1x: 29,719,007 cycles + - WOTS pk gen x2.. avg. 1193.77 us (0.00 sec); median 2,239,237 cycles, 88x: 197,052,856 cycles +Verifying.. avg. 7607.06 us (0.01 sec); median 14,111,715 cycles, 1x: 14,111,715 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-192f-simple_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-192f-simple_C new file mode 100644 index 0000000..0745ead --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-192f-simple_C @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16 +Running 10 iterations. +thash avg. 0.88 us (0.00 sec); median 1,682 cycles, 1x: 1,682 cycles +f1600x2 avg. 
0.88 us (0.00 sec); median 1,654 cycles, 1x: 1,654 cycles +thashx2 avg. 0.88 us (0.00 sec); median 1,678 cycles, 1x: 1,678 cycles +Generating keypair.. avg. 2927.15 us (0.00 sec); median 5,577,198 cycles, 1x: 5,577,198 cycles + - WOTS pk gen 2x.. avg. 735.83 us (0.00 sec); median 1,392,163 cycles, 4x: 5,568,652 cycles +Signing.. avg. 75822.45 us (0.08 sec); median 144,334,077 cycles, 1x: 144,334,077 cycles + - FORS signing.. avg. 11355.89 us (0.01 sec); median 21,705,202 cycles, 1x: 21,705,202 cycles + - WOTS pk gen x2.. avg. 735.19 us (0.00 sec); median 1,392,438 cycles, 88x: 122,534,544 cycles +Verifying.. avg. 4435.81 us (0.00 sec); median 8,413,539 cycles, 1x: 8,413,539 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-192f-simple_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-192f-simple_COTHANV8 new file mode 100644 index 0000000..44b3c69 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-192f-simple_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16 +Running 10 iterations. +thash avg. 0.75 us (0.00 sec); median 1,402 cycles, 1x: 1,402 cycles +f1600x2 avg. 0.72 us (0.00 sec); median 1,367 cycles, 1x: 1,367 cycles +thashx2 avg. 0.74 us (0.00 sec); median 1,397 cycles, 1x: 1,397 cycles +Generating keypair.. avg. 2442.97 us (0.00 sec); median 4,637,594 cycles, 1x: 4,637,594 cycles + - WOTS pk gen 2x.. avg. 609.97 us (0.00 sec); median 1,157,901 cycles, 4x: 4,631,604 cycles +Signing.. avg. 63893.84 us (0.06 sec); median 119,952,885 cycles, 1x: 119,952,885 cycles + - FORS signing.. avg. 9540.35 us (0.01 sec); median 17,908,941 cycles, 1x: 17,908,941 cycles + - WOTS pk gen x2.. avg. 617.57 us (0.00 sec); median 1,158,277 cycles, 88x: 101,928,376 cycles +Verifying.. avg. 
3785.89 us (0.00 sec); median 7,066,166 cycles, 1x: 7,066,166 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-192s-robust_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-192s-robust_C new file mode 100644 index 0000000..24228c8 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-192s-robust_C @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16 +Running 10 iterations. +thash avg. 1.77 us (0.00 sec); median 3,363 cycles, 1x: 3,363 cycles +f1600x2 avg. 0.89 us (0.00 sec); median 1,655 cycles, 1x: 1,655 cycles +thashx2 avg. 1.77 us (0.00 sec); median 3,359 cycles, 1x: 3,359 cycles +Generating keypair.. avg. 364822.34 us (0.36 sec); median 692,280,962 cycles, 1x: 692,280,962 cycles + - WOTS pk gen 2x.. avg. 1419.51 us (0.00 sec); median 2,700,302 cycles, 256x: 691,277,312 cycles +Signing.. avg. 3188670.84 us (3.19 sec); median 6,026,604,746 cycles, 1x: 6,026,604,746 cycles + - FORS signing.. avg. 623935.63 us (0.62 sec); median 1,180,221,036 cycles, 1x: 1,180,221,036 cycles + - WOTS pk gen x2.. avg. 1418.51 us (0.00 sec); median 2,699,762 cycles, 1792x: 4,837,973,504 cycles +Verifying.. avg. 3185.69 us (0.00 sec); median 6,019,165 cycles, 1x: 6,019,165 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-192s-robust_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-192s-robust_COTHANV8 new file mode 100644 index 0000000..f1477eb --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-192s-robust_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16 +Running 10 iterations. +thash avg. 1.49 us (0.00 sec); median 2,796 cycles, 1x: 2,796 cycles +f1600x2 avg. 
0.73 us (0.00 sec); median 1,368 cycles, 1x: 1,368 cycles +thashx2 avg. 1.47 us (0.00 sec); median 2,792 cycles, 1x: 2,792 cycles +Generating keypair.. avg. 312143.95 us (0.31 sec); median 574,073,215 cycles, 1x: 574,073,215 cycles + - WOTS pk gen 2x.. avg. 1226.77 us (0.00 sec); median 2,238,556 cycles, 256x: 573,070,336 cycles +Signing.. avg. 2728505.06 us (2.73 sec); median 4,995,988,493 cycles, 1x: 4,995,988,493 cycles + - FORS signing.. avg. 534672.02 us (0.53 sec); median 977,375,685 cycles, 1x: 977,375,685 cycles + - WOTS pk gen x2.. avg. 1232.50 us (0.00 sec); median 2,238,830 cycles, 1792x: 4,011,983,360 cycles +Verifying.. avg. 2811.16 us (0.00 sec); median 5,130,395 cycles, 1x: 5,130,395 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-192s-simple_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-192s-simple_C new file mode 100644 index 0000000..7dbc47c --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-192s-simple_C @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16 +Running 10 iterations. +thash avg. 0.89 us (0.00 sec); median 1,678 cycles, 1x: 1,678 cycles +f1600x2 avg. 0.87 us (0.00 sec); median 1,655 cycles, 1x: 1,655 cycles +thashx2 avg. 0.87 us (0.00 sec); median 1,673 cycles, 1x: 1,673 cycles +Generating keypair.. avg. 187017.03 us (0.19 sec); median 355,996,533 cycles, 1x: 355,996,533 cycles + - WOTS pk gen 2x.. avg. 733.09 us (0.00 sec); median 1,390,501 cycles, 256x: 355,968,256 cycles +Signing.. avg. 1679742.15 us (1.68 sec); median 3,202,969,576 cycles, 1x: 3,202,969,576 cycles + - FORS signing.. avg. 373209.14 us (0.37 sec); median 710,906,118 cycles, 1x: 710,906,118 cycles + - WOTS pk gen x2.. avg. 728.38 us (0.00 sec); median 1,390,448 cycles, 1792x: 2,491,682,816 cycles +Verifying.. avg. 
1616.15 us (0.00 sec); median 3,071,937 cycles, 1x: 3,071,937 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-192s-simple_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-192s-simple_COTHANV8 new file mode 100644 index 0000000..af5be21 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-192s-simple_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16 +Running 10 iterations. +thash avg. 0.74 us (0.00 sec); median 1,401 cycles, 1x: 1,401 cycles +f1600x2 avg. 0.72 us (0.00 sec); median 1,368 cycles, 1x: 1,368 cycles +thashx2 avg. 0.74 us (0.00 sec); median 1,395 cycles, 1x: 1,395 cycles +Generating keypair.. avg. 158879.24 us (0.16 sec); median 296,819,369 cycles, 1x: 296,819,369 cycles + - WOTS pk gen 2x.. avg. 633.24 us (0.00 sec); median 1,159,313 cycles, 256x: 296,784,128 cycles +Signing.. avg. 1441153.38 us (1.44 sec); median 2,666,243,716 cycles, 1x: 2,666,243,716 cycles + - FORS signing.. avg. 318221.42 us (0.32 sec); median 588,564,025 cycles, 1x: 588,564,025 cycles + - WOTS pk gen x2.. avg. 641.57 us (0.00 sec); median 1,159,623 cycles, 1792x: 2,078,044,416 cycles +Verifying.. avg. 1405.15 us (0.00 sec); median 2,584,399 cycles, 1x: 2,584,399 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-256f-robust_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-256f-robust_C new file mode 100644 index 0000000..b893717 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-256f-robust_C @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16 +Running 10 iterations. +thash avg. 1.77 us (0.00 sec); median 3,358 cycles, 1x: 3,358 cycles +f1600x2 avg. 
0.88 us (0.00 sec); median 1,658 cycles, 1x: 1,658 cycles +thashx2 avg. 1.80 us (0.00 sec); median 3,357 cycles, 1x: 3,357 cycles +Generating keypair.. avg. 15171.02 us (0.02 sec); median 28,463,662 cycles, 1x: 28,463,662 cycles + - WOTS pk gen 2x.. avg. 1915.87 us (0.00 sec); median 3,547,332 cycles, 8x: 28,378,656 cycles +Signing.. avg. 297778.80 us (0.30 sec); median 559,856,468 cycles, 1x: 559,856,468 cycles + - FORS signing.. avg. 40369.43 us (0.04 sec); median 75,968,607 cycles, 1x: 75,968,607 cycles + - WOTS pk gen x2.. avg. 1902.55 us (0.00 sec); median 3,547,895 cycles, 136x: 482,513,720 cycles +Verifying.. avg. 8964.05 us (0.01 sec); median 16,911,956 cycles, 1x: 16,911,956 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-256f-robust_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-256f-robust_COTHANV8 new file mode 100644 index 0000000..d906657 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-256f-robust_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16 +Running 10 iterations. +thash avg. 1.50 us (0.00 sec); median 2,793 cycles, 1x: 2,793 cycles +f1600x2 avg. 0.72 us (0.00 sec); median 1,357 cycles, 1x: 1,357 cycles +thashx2 avg. 1.50 us (0.00 sec); median 2,791 cycles, 1x: 2,791 cycles +Generating keypair.. avg. 12787.93 us (0.01 sec); median 23,664,503 cycles, 1x: 23,664,503 cycles + - WOTS pk gen 2x.. avg. 1602.90 us (0.00 sec); median 2,954,505 cycles, 8x: 23,636,040 cycles +Signing.. avg. 254274.63 us (0.25 sec); median 465,285,865 cycles, 1x: 465,285,865 cycles + - FORS signing.. avg. 34429.22 us (0.03 sec); median 63,033,830 cycles, 1x: 63,033,830 cycles + - WOTS pk gen x2.. avg. 1602.98 us (0.00 sec); median 2,954,561 cycles, 136x: 401,820,296 cycles +Verifying.. avg. 
7761.74 us (0.01 sec); median 14,183,746 cycles, 1x: 14,183,746 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-256f-simple_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-256f-simple_C new file mode 100644 index 0000000..93ec2c2 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-256f-simple_C @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16 +Running 10 iterations. +thash avg. 0.91 us (0.00 sec); median 1,682 cycles, 1x: 1,682 cycles +f1600x2 avg. 0.90 us (0.00 sec); median 1,657 cycles, 1x: 1,657 cycles +thashx2 avg. 0.90 us (0.00 sec); median 1,679 cycles, 1x: 1,679 cycles +Generating keypair.. avg. 7815.31 us (0.01 sec); median 14,683,259 cycles, 1x: 14,683,259 cycles + - WOTS pk gen 2x.. avg. 1001.67 us (0.00 sec); median 1,833,757 cycles, 8x: 14,670,056 cycles +Signing.. avg. 157614.50 us (0.16 sec); median 295,539,442 cycles, 1x: 295,539,442 cycles + - FORS signing.. avg. 24555.56 us (0.02 sec); median 45,968,081 cycles, 1x: 45,968,081 cycles + - WOTS pk gen x2.. avg. 967.14 us (0.00 sec); median 1,834,191 cycles, 136x: 249,449,976 cycles +Verifying.. avg. 4727.97 us (0.00 sec); median 8,888,540 cycles, 1x: 8,888,540 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-256f-simple_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-256f-simple_COTHANV8 new file mode 100644 index 0000000..14d7de6 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-256f-simple_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16 +Running 10 iterations. +thash avg. 0.76 us (0.00 sec); median 1,401 cycles, 1x: 1,401 cycles +f1600x2 avg. 
0.74 us (0.00 sec); median 1,364 cycles, 1x: 1,364 cycles +thashx2 avg. 0.76 us (0.00 sec); median 1,399 cycles, 1x: 1,399 cycles +Generating keypair.. avg. 6554.71 us (0.01 sec); median 12,241,484 cycles, 1x: 12,241,484 cycles + - WOTS pk gen 2x.. avg. 814.04 us (0.00 sec); median 1,529,475 cycles, 8x: 12,235,800 cycles +Signing.. avg. 134622.66 us (0.13 sec); median 246,059,846 cycles, 1x: 246,059,846 cycles + - FORS signing.. avg. 20776.96 us (0.02 sec); median 37,975,845 cycles, 1x: 37,975,845 cycles + - WOTS pk gen x2.. avg. 836.37 us (0.00 sec); median 1,530,010 cycles, 136x: 208,081,360 cycles +Verifying.. avg. 3824.72 us (0.00 sec); median 7,015,155 cycles, 1x: 7,015,155 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-256s-robust_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-256s-robust_C new file mode 100644 index 0000000..e5fd7d4 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-256s-robust_C @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16 +Running 10 iterations. +thash avg. 1.80 us (0.00 sec); median 3,360 cycles, 1x: 3,360 cycles +f1600x2 avg. 0.89 us (0.00 sec); median 1,660 cycles, 1x: 1,660 cycles +thashx2 avg. 1.80 us (0.00 sec); median 3,357 cycles, 1x: 3,357 cycles +Generating keypair.. avg. 243384.43 us (0.24 sec); median 454,686,951 cycles, 1x: 454,686,951 cycles + - WOTS pk gen 2x.. avg. 1888.00 us (0.00 sec); median 3,557,339 cycles, 128x: 455,339,392 cycles +Signing.. avg. 2745128.97 us (2.75 sec); median 5,163,008,040 cycles, 1x: 5,163,008,040 cycles + - FORS signing.. avg. 809321.21 us (0.81 sec); median 1,525,461,336 cycles, 1x: 1,525,461,336 cycles + - WOTS pk gen x2.. avg. 1895.61 us (0.00 sec); median 3,557,435 cycles, 1024x: 3,642,813,440 cycles +Verifying.. avg. 
4603.85 us (0.00 sec); median 8,703,937 cycles, 1x: 8,703,937 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-256s-robust_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-256s-robust_COTHANV8 new file mode 100644 index 0000000..0dee441 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-256s-robust_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16 +Running 10 iterations. +thash avg. 1.48 us (0.00 sec); median 2,795 cycles, 1x: 2,795 cycles +f1600x2 avg. 0.72 us (0.00 sec); median 1,364 cycles, 1x: 1,364 cycles +thashx2 avg. 1.49 us (0.00 sec); median 2,793 cycles, 1x: 2,793 cycles +Generating keypair.. avg. 205669.69 us (0.21 sec); median 378,210,011 cycles, 1x: 378,210,011 cycles + - WOTS pk gen 2x.. avg. 1627.19 us (0.00 sec); median 2,951,834 cycles, 128x: 377,834,752 cycles +Signing.. avg. 2347337.99 us (2.35 sec); median 4,293,375,106 cycles, 1x: 4,293,375,106 cycles + - FORS signing.. avg. 692409.16 us (0.69 sec); median 1,267,510,236 cycles, 1x: 1,267,510,236 cycles + - WOTS pk gen x2.. avg. 1643.40 us (0.00 sec); median 2,951,733 cycles, 1024x: 3,022,574,592 cycles +Verifying.. avg. 4025.60 us (0.00 sec); median 7,393,479 cycles, 1x: 7,393,479 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-256s-simple_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-256s-simple_C new file mode 100644 index 0000000..8b7c236 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-256s-simple_C @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16 +Running 10 iterations. +thash avg. 0.89 us (0.00 sec); median 1,681 cycles, 1x: 1,681 cycles +f1600x2 avg. 
0.87 us (0.00 sec); median 1,656 cycles, 1x: 1,656 cycles +thashx2 avg. 0.90 us (0.00 sec); median 1,678 cycles, 1x: 1,678 cycles +Generating keypair.. avg. 125471.08 us (0.13 sec); median 235,245,492 cycles, 1x: 235,245,492 cycles + - WOTS pk gen 2x.. avg. 979.95 us (0.00 sec); median 1,834,731 cycles, 128x: 234,845,568 cycles +Signing.. avg. 1489155.59 us (1.49 sec); median 2,802,093,309 cycles, 1x: 2,802,093,309 cycles + - FORS signing.. avg. 488396.88 us (0.49 sec); median 920,041,371 cycles, 1x: 920,041,371 cycles + - WOTS pk gen x2.. avg. 969.38 us (0.00 sec); median 1,834,422 cycles, 1024x: 1,878,448,128 cycles +Verifying.. avg. 2353.39 us (0.00 sec); median 4,390,805 cycles, 1x: 4,390,805 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-256s-simple_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-256s-simple_COTHANV8 new file mode 100644 index 0000000..ea7c3a9 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X1/sphincs-shake-256s-simple_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16 +Running 10 iterations. +thash avg. 0.75 us (0.00 sec); median 1,402 cycles, 1x: 1,402 cycles +f1600x2 avg. 0.72 us (0.00 sec); median 1,366 cycles, 1x: 1,366 cycles +thashx2 avg. 0.74 us (0.00 sec); median 1,400 cycles, 1x: 1,400 cycles +Generating keypair.. avg. 106636.36 us (0.11 sec); median 196,170,518 cycles, 1x: 196,170,518 cycles + - WOTS pk gen 2x.. avg. 840.32 us (0.00 sec); median 1,530,677 cycles, 128x: 195,926,656 cycles +Signing.. avg. 1277888.51 us (1.28 sec); median 2,332,022,566 cycles, 1x: 2,332,022,566 cycles + - FORS signing.. avg. 420093.32 us (0.42 sec); median 762,835,294 cycles, 1x: 762,835,294 cycles + - WOTS pk gen x2.. avg. 832.22 us (0.00 sec); median 1,532,074 cycles, 1024x: 1,568,843,776 cycles +Verifying.. avg. 
1977.16 us (0.00 sec); median 3,607,423 cycles, 1x: 3,607,423 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128f-robust_BAS b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128f-robust_BAS new file mode 100644 index 0000000..11ba081 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128f-robust_BAS @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16 +Running 10 iterations. +thash avg. 1.66 us (0.00 sec); median 3,156 cycles, 1x: 3,156 cycles +f1600x2 avg. 0.81 us (0.00 sec); median 1,547 cycles, 1x: 1,547 cycles +thashx2 avg. 1.65 us (0.00 sec); median 3,148 cycles, 1x: 3,148 cycles +Generating keypair.. avg. 3621.30 us (0.00 sec); median 6,929,630 cycles, 1x: 6,929,630 cycles + - WOTS pk gen 2x.. avg. 908.90 us (0.00 sec); median 1,730,660 cycles, 4x: 6,922,640 cycles +Signing.. avg. 83937.31 us (0.08 sec); median 160,941,958 cycles, 1x: 160,941,958 cycles + - FORS signing.. avg. 4390.33 us (0.00 sec); median 8,410,501 cycles, 1x: 8,410,501 cycles + - WOTS pk gen x2.. avg. 905.90 us (0.00 sec); median 1,728,582 cycles, 88x: 152,115,216 cycles +Verifying.. avg. 5895.57 us (0.01 sec); median 11,298,375 cycles, 1x: 11,298,375 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128f-robust_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128f-robust_C new file mode 100644 index 0000000..61751b7 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128f-robust_C @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16 +Running 10 iterations. +thash avg. 1.78 us (0.00 sec); median 3,408 cycles, 1x: 3,408 cycles +f1600x2 avg. 
0.88 us (0.00 sec); median 1,680 cycles, 1x: 1,680 cycles +thashx2 avg. 1.78 us (0.00 sec); median 3,405 cycles, 1x: 3,405 cycles +Generating keypair.. avg. 3908.16 us (0.00 sec); median 7,481,028 cycles, 1x: 7,481,028 cycles + - WOTS pk gen 2x.. avg. 980.87 us (0.00 sec); median 1,867,066 cycles, 4x: 7,468,264 cycles +Signing.. avg. 90588.34 us (0.09 sec); median 173,679,559 cycles, 1x: 173,679,559 cycles + - FORS signing.. avg. 4741.10 us (0.00 sec); median 9,088,627 cycles, 1x: 9,088,627 cycles + - WOTS pk gen x2.. avg. 978.07 us (0.00 sec); median 1,866,496 cycles, 88x: 164,251,648 cycles +Verifying.. avg. 5964.95 us (0.01 sec); median 11,408,718 cycles, 1x: 11,408,718 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128f-robust_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128f-robust_COTHANV8 new file mode 100644 index 0000000..7442971 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128f-robust_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16 +Running 10 iterations. +thash avg. 1.42 us (0.00 sec); median 2,709 cycles, 1x: 2,709 cycles +f1600x2 avg. 0.69 us (0.00 sec); median 1,323 cycles, 1x: 1,323 cycles +thashx2 avg. 1.42 us (0.00 sec); median 2,707 cycles, 1x: 2,707 cycles +Generating keypair.. avg. 3123.58 us (0.00 sec); median 5,946,107 cycles, 1x: 5,946,107 cycles + - WOTS pk gen 2x.. avg. 778.29 us (0.00 sec); median 1,484,127 cycles, 4x: 5,936,508 cycles +Signing.. avg. 72104.13 us (0.07 sec); median 138,094,446 cycles, 1x: 138,094,446 cycles + - FORS signing.. avg. 3769.23 us (0.00 sec); median 7,224,235 cycles, 1x: 7,224,235 cycles + - WOTS pk gen x2.. avg. 778.31 us (0.00 sec); median 1,484,144 cycles, 88x: 130,604,672 cycles +Verifying.. avg. 
4908.97 us (0.00 sec); median 9,399,772 cycles, 1x: 9,399,772 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128f-simple_BAS b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128f-simple_BAS new file mode 100644 index 0000000..8ef69ea --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128f-simple_BAS @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16 +Running 10 iterations. +thash avg. 0.83 us (0.00 sec); median 1,583 cycles, 1x: 1,583 cycles +f1600x2 avg. 0.81 us (0.00 sec); median 1,547 cycles, 1x: 1,547 cycles +thashx2 avg. 0.84 us (0.00 sec); median 1,585 cycles, 1x: 1,585 cycles +Generating keypair.. avg. 1890.59 us (0.00 sec); median 3,609,286 cycles, 1x: 3,609,286 cycles + - WOTS pk gen 2x.. avg. 473.63 us (0.00 sec); median 901,119 cycles, 4x: 3,604,476 cycles +Signing.. avg. 44125.54 us (0.04 sec); median 84,516,149 cycles, 1x: 84,516,149 cycles + - FORS signing.. avg. 2663.18 us (0.00 sec); median 5,092,976 cycles, 1x: 5,092,976 cycles + - WOTS pk gen x2.. avg. 476.43 us (0.00 sec); median 900,928 cycles, 88x: 79,281,664 cycles +Verifying.. avg. 2971.94 us (0.00 sec); median 5,676,668 cycles, 1x: 5,676,668 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128f-simple_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128f-simple_C new file mode 100644 index 0000000..74ead2d --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128f-simple_C @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16 +Running 10 iterations. +thash avg. 0.89 us (0.00 sec); median 1,703 cycles, 1x: 1,703 cycles +f1600x2 avg. 
0.89 us (0.00 sec); median 1,681 cycles, 1x: 1,681 cycles +thashx2 avg. 0.89 us (0.00 sec); median 1,700 cycles, 1x: 1,700 cycles +Generating keypair.. avg. 2024.45 us (0.00 sec); median 3,860,065 cycles, 1x: 3,860,065 cycles + - WOTS pk gen 2x.. avg. 507.23 us (0.00 sec); median 963,118 cycles, 4x: 3,852,472 cycles +Signing.. avg. 47178.99 us (0.05 sec); median 90,409,208 cycles, 1x: 90,409,208 cycles + - FORS signing.. avg. 2863.96 us (0.00 sec); median 5,484,181 cycles, 1x: 5,484,181 cycles + - WOTS pk gen x2.. avg. 506.84 us (0.00 sec); median 963,272 cycles, 88x: 84,767,936 cycles +Verifying.. avg. 3120.74 us (0.00 sec); median 5,980,288 cycles, 1x: 5,980,288 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128f-simple_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128f-simple_COTHANV8 new file mode 100644 index 0000000..ff17874 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128f-simple_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16 +Running 10 iterations. +thash avg. 0.71 us (0.00 sec); median 1,359 cycles, 1x: 1,359 cycles +f1600x2 avg. 0.70 us (0.00 sec); median 1,323 cycles, 1x: 1,323 cycles +thashx2 avg. 0.71 us (0.00 sec); median 1,352 cycles, 1x: 1,352 cycles +Generating keypair.. avg. 1610.57 us (0.00 sec); median 3,077,228 cycles, 1x: 3,077,228 cycles + - WOTS pk gen 2x.. avg. 404.96 us (0.00 sec); median 767,789 cycles, 4x: 3,071,156 cycles +Signing.. avg. 37637.93 us (0.04 sec); median 72,059,681 cycles, 1x: 72,059,681 cycles + - FORS signing.. avg. 2287.03 us (0.00 sec); median 4,372,524 cycles, 1x: 4,372,524 cycles + - WOTS pk gen x2.. avg. 405.46 us (0.00 sec); median 767,474 cycles, 88x: 67,537,712 cycles +Verifying.. avg. 
2450.54 us (0.00 sec); median 4,692,388 cycles, 1x: 4,692,388 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128s-robust_BAS b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128s-robust_BAS new file mode 100644 index 0000000..085c3be --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128s-robust_BAS @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16 +Running 10 iterations. +thash avg. 1.65 us (0.00 sec); median 3,156 cycles, 1x: 3,156 cycles +f1600x2 avg. 0.82 us (0.00 sec); median 1,547 cycles, 1x: 1,547 cycles +thashx2 avg. 1.65 us (0.00 sec); median 3,148 cycles, 1x: 3,148 cycles +Generating keypair.. avg. 231219.05 us (0.23 sec); median 443,343,467 cycles, 1x: 443,343,467 cycles + - WOTS pk gen 2x.. avg. 905.59 us (0.00 sec); median 1,728,663 cycles, 256x: 442,537,728 cycles +Signing.. avg. 1737152.36 us (1.74 sec); median 3,330,901,697 cycles, 1x: 3,330,901,697 cycles + - FORS signing.. avg. 118420.01 us (0.12 sec); median 227,025,402 cycles, 1x: 227,025,402 cycles + - WOTS pk gen x2.. avg. 909.16 us (0.00 sec); median 1,728,522 cycles, 1792x: 3,097,511,424 cycles +Verifying.. avg. 2068.60 us (0.00 sec); median 3,936,987 cycles, 1x: 3,936,987 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128s-robust_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128s-robust_C new file mode 100644 index 0000000..87dd53f --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128s-robust_C @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16 +Running 10 iterations. +thash avg. 1.78 us (0.00 sec); median 3,409 cycles, 1x: 3,409 cycles +f1600x2 avg. 
0.88 us (0.00 sec); median 1,680 cycles, 1x: 1,680 cycles +thashx2 avg. 1.78 us (0.00 sec); median 3,405 cycles, 1x: 3,405 cycles +Generating keypair.. avg. 249962.05 us (0.25 sec); median 479,373,163 cycles, 1x: 479,373,163 cycles + - WOTS pk gen 2x.. avg. 979.23 us (0.00 sec); median 1,868,255 cycles, 256x: 478,273,280 cycles +Signing.. avg. 1878307.42 us (1.88 sec); median 3,601,404,582 cycles, 1x: 3,601,404,582 cycles + - FORS signing.. avg. 128009.35 us (0.13 sec); median 245,500,798 cycles, 1x: 245,500,798 cycles + - WOTS pk gen x2.. avg. 988.09 us (0.00 sec); median 1,869,216 cycles, 1792x: 3,349,635,072 cycles +Verifying.. avg. 2292.32 us (0.00 sec); median 4,374,100 cycles, 1x: 4,374,100 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128s-robust_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128s-robust_COTHANV8 new file mode 100644 index 0000000..f0d153b --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128s-robust_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16 +Running 10 iterations. +thash avg. 1.42 us (0.00 sec); median 2,710 cycles, 1x: 2,710 cycles +f1600x2 avg. 0.69 us (0.00 sec); median 1,323 cycles, 1x: 1,323 cycles +thashx2 avg. 1.41 us (0.00 sec); median 2,702 cycles, 1x: 2,702 cycles +Generating keypair.. avg. 198768.44 us (0.20 sec); median 381,169,535 cycles, 1x: 381,169,535 cycles + - WOTS pk gen 2x.. avg. 781.21 us (0.00 sec); median 1,484,382 cycles, 256x: 380,001,792 cycles +Signing.. avg. 1493267.71 us (1.49 sec); median 2,863,365,476 cycles, 1x: 2,863,365,476 cycles + - FORS signing.. avg. 101692.64 us (0.10 sec); median 195,018,890 cycles, 1x: 195,018,890 cycles + - WOTS pk gen x2.. avg. 779.65 us (0.00 sec); median 1,484,911 cycles, 1792x: 2,660,960,512 cycles +Verifying.. avg. 
1739.51 us (0.00 sec); median 3,311,755 cycles, 1x: 3,311,755 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128s-simple_BAS b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128s-simple_BAS new file mode 100644 index 0000000..30219e3 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128s-simple_BAS @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16 +Running 10 iterations. +thash avg. 0.83 us (0.00 sec); median 1,583 cycles, 1x: 1,583 cycles +f1600x2 avg. 0.81 us (0.00 sec); median 1,547 cycles, 1x: 1,547 cycles +thashx2 avg. 0.83 us (0.00 sec); median 1,586 cycles, 1x: 1,586 cycles +Generating keypair.. avg. 120475.04 us (0.12 sec); median 230,916,631 cycles, 1x: 230,916,631 cycles + - WOTS pk gen 2x.. avg. 474.85 us (0.00 sec); median 901,124 cycles, 256x: 230,687,744 cycles +Signing.. avg. 914668.15 us (0.91 sec); median 1,753,575,715 cycles, 1x: 1,753,575,715 cycles + - FORS signing.. avg. 71580.77 us (0.07 sec); median 137,192,437 cycles, 1x: 137,192,437 cycles + - WOTS pk gen x2.. avg. 474.25 us (0.00 sec); median 900,568 cycles, 1792x: 1,613,817,856 cycles +Verifying.. avg. 1011.70 us (0.00 sec); median 1,932,444 cycles, 1x: 1,932,444 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128s-simple_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128s-simple_C new file mode 100644 index 0000000..204020d --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128s-simple_C @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16 +Running 10 iterations. +thash avg. 0.89 us (0.00 sec); median 1,703 cycles, 1x: 1,703 cycles +f1600x2 avg. 
0.88 us (0.00 sec); median 1,680 cycles, 1x: 1,680 cycles +thashx2 avg. 0.90 us (0.00 sec); median 1,701 cycles, 1x: 1,701 cycles +Generating keypair.. avg. 128931.34 us (0.13 sec); median 246,901,308 cycles, 1x: 246,901,308 cycles + - WOTS pk gen 2x.. avg. 507.26 us (0.00 sec); median 962,908 cycles, 256x: 246,504,448 cycles +Signing.. avg. 978288.65 us (0.98 sec); median 1,875,649,641 cycles, 1x: 1,875,649,641 cycles + - FORS signing.. avg. 76799.50 us (0.08 sec); median 147,315,545 cycles, 1x: 147,315,545 cycles + - WOTS pk gen x2.. avg. 507.99 us (0.00 sec); median 962,905 cycles, 1792x: 1,725,525,760 cycles +Verifying.. avg. 1171.37 us (0.00 sec); median 2,234,827 cycles, 1x: 2,234,827 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128s-simple_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128s-simple_COTHANV8 new file mode 100644 index 0000000..c24f865 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-128s-simple_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16 +Running 10 iterations. +thash avg. 0.72 us (0.00 sec); median 1,357 cycles, 1x: 1,357 cycles +f1600x2 avg. 0.70 us (0.00 sec); median 1,323 cycles, 1x: 1,323 cycles +thashx2 avg. 0.71 us (0.00 sec); median 1,352 cycles, 1x: 1,352 cycles +Generating keypair.. avg. 102694.96 us (0.10 sec); median 196,887,703 cycles, 1x: 196,887,703 cycles + - WOTS pk gen 2x.. avg. 404.30 us (0.00 sec); median 767,548 cycles, 256x: 196,492,288 cycles +Signing.. avg. 780253.05 us (0.78 sec); median 1,495,761,829 cycles, 1x: 1,495,761,829 cycles + - FORS signing.. avg. 61269.92 us (0.06 sec); median 117,525,623 cycles, 1x: 117,525,623 cycles + - WOTS pk gen x2.. avg. 406.12 us (0.00 sec); median 767,881 cycles, 1792x: 1,376,042,752 cycles +Verifying.. avg. 
917.72 us (0.00 sec); median 1,751,313 cycles, 1x: 1,751,313 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192f-robust_BAS b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192f-robust_BAS new file mode 100644 index 0000000..157223f --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192f-robust_BAS @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16 +Running 10 iterations. +thash avg. 1.65 us (0.00 sec); median 3,158 cycles, 1x: 3,158 cycles +f1600x2 avg. 0.81 us (0.00 sec); median 1,547 cycles, 1x: 1,547 cycles +thashx2 avg. 1.65 us (0.00 sec); median 3,160 cycles, 1x: 3,160 cycles +Generating keypair.. avg. 5299.59 us (0.01 sec); median 10,153,964 cycles, 1x: 10,153,964 cycles + - WOTS pk gen 2x.. avg. 1326.05 us (0.00 sec); median 2,535,027 cycles, 4x: 10,140,108 cycles +Signing.. avg. 134038.24 us (0.13 sec); median 256,992,160 cycles, 1x: 256,992,160 cycles + - FORS signing.. avg. 17524.96 us (0.02 sec); median 33,609,977 cycles, 1x: 33,609,977 cycles + - WOTS pk gen x2.. avg. 1329.40 us (0.00 sec); median 2,534,812 cycles, 88x: 223,063,456 cycles +Verifying.. avg. 8221.65 us (0.01 sec); median 15,766,612 cycles, 1x: 15,766,612 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192f-robust_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192f-robust_C new file mode 100644 index 0000000..2cec6fe --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192f-robust_C @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16 +Running 10 iterations. +thash avg. 1.79 us (0.00 sec); median 3,413 cycles, 1x: 3,413 cycles +f1600x2 avg. 
0.88 us (0.00 sec); median 1,681 cycles, 1x: 1,681 cycles +thashx2 avg. 1.78 us (0.00 sec); median 3,408 cycles, 1x: 3,408 cycles +Generating keypair.. avg. 5717.71 us (0.01 sec); median 10,934,197 cycles, 1x: 10,934,197 cycles + - WOTS pk gen 2x.. avg. 1428.75 us (0.00 sec); median 2,729,695 cycles, 4x: 10,918,780 cycles +Signing.. avg. 144381.19 us (0.14 sec); median 276,828,682 cycles, 1x: 276,828,682 cycles + - FORS signing.. avg. 18915.78 us (0.02 sec); median 36,263,484 cycles, 1x: 36,263,484 cycles + - WOTS pk gen x2.. avg. 1428.21 us (0.00 sec); median 2,729,746 cycles, 88x: 240,217,648 cycles +Verifying.. avg. 8740.85 us (0.01 sec); median 16,759,499 cycles, 1x: 16,759,499 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192f-robust_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192f-robust_COTHANV8 new file mode 100644 index 0000000..c8db2b7 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192f-robust_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16 +Running 10 iterations. +thash avg. 1.42 us (0.00 sec); median 2,712 cycles, 1x: 2,712 cycles +f1600x2 avg. 0.70 us (0.00 sec); median 1,330 cycles, 1x: 1,330 cycles +thashx2 avg. 1.42 us (0.00 sec); median 2,706 cycles, 1x: 2,706 cycles +Generating keypair.. avg. 4543.18 us (0.00 sec); median 8,692,838 cycles, 1x: 8,692,838 cycles + - WOTS pk gen 2x.. avg. 1136.09 us (0.00 sec); median 2,170,381 cycles, 4x: 8,681,524 cycles +Signing.. avg. 114785.33 us (0.11 sec); median 220,108,092 cycles, 1x: 220,108,092 cycles + - FORS signing.. avg. 15044.89 us (0.02 sec); median 28,800,646 cycles, 1x: 28,800,646 cycles + - WOTS pk gen x2.. avg. 1140.11 us (0.00 sec); median 2,171,942 cycles, 88x: 191,130,896 cycles +Verifying.. avg. 
7062.47 us (0.01 sec); median 13,520,696 cycles, 1x: 13,520,696 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192f-simple_BAS b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192f-simple_BAS new file mode 100644 index 0000000..c1b8003 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192f-simple_BAS @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16 +Running 10 iterations. +thash avg. 0.84 us (0.00 sec); median 1,581 cycles, 1x: 1,581 cycles +f1600x2 avg. 0.81 us (0.00 sec); median 1,547 cycles, 1x: 1,547 cycles +thashx2 avg. 0.83 us (0.00 sec); median 1,581 cycles, 1x: 1,581 cycles +Generating keypair.. avg. 2744.11 us (0.00 sec); median 5,248,434 cycles, 1x: 5,248,434 cycles + - WOTS pk gen 2x.. avg. 691.11 us (0.00 sec); median 1,310,466 cycles, 4x: 5,241,864 cycles +Signing.. avg. 70742.11 us (0.07 sec); median 135,653,442 cycles, 1x: 135,653,442 cycles + - FORS signing.. avg. 10618.87 us (0.01 sec); median 20,207,685 cycles, 1x: 20,207,685 cycles + - WOTS pk gen x2.. avg. 693.35 us (0.00 sec); median 1,310,638 cycles, 88x: 115,336,144 cycles +Verifying.. avg. 4141.50 us (0.00 sec); median 7,862,160 cycles, 1x: 7,862,160 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192f-simple_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192f-simple_C new file mode 100644 index 0000000..33f0f76 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192f-simple_C @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16 +Running 10 iterations. +thash avg. 0.90 us (0.00 sec); median 1,705 cycles, 1x: 1,705 cycles +f1600x2 avg. 
0.88 us (0.00 sec); median 1,681 cycles, 1x: 1,681 cycles +thashx2 avg. 0.89 us (0.00 sec); median 1,702 cycles, 1x: 1,702 cycles +Generating keypair.. avg. 2957.18 us (0.00 sec); median 5,651,927 cycles, 1x: 5,651,927 cycles + - WOTS pk gen 2x.. avg. 778.33 us (0.00 sec); median 1,483,363 cycles, 4x: 5,933,452 cycles +Signing.. avg. 76222.59 us (0.08 sec); median 146,147,450 cycles, 1x: 146,147,450 cycles + - FORS signing.. avg. 11365.34 us (0.01 sec); median 21,782,588 cycles, 1x: 21,782,588 cycles + - WOTS pk gen x2.. avg. 778.35 us (0.00 sec); median 1,483,799 cycles, 88x: 130,574,312 cycles +Verifying.. avg. 4453.22 us (0.00 sec); median 8,531,490 cycles, 1x: 8,531,490 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192f-simple_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192f-simple_COTHANV8 new file mode 100644 index 0000000..531010d --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192f-simple_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16 +Running 10 iterations. +thash avg. 0.72 us (0.00 sec); median 1,362 cycles, 1x: 1,362 cycles +f1600x2 avg. 0.70 us (0.00 sec); median 1,325 cycles, 1x: 1,325 cycles +thashx2 avg. 0.71 us (0.00 sec); median 1,357 cycles, 1x: 1,357 cycles +Generating keypair.. avg. 2356.23 us (0.00 sec); median 4,505,863 cycles, 1x: 4,505,863 cycles + - WOTS pk gen 2x.. avg. 595.12 us (0.00 sec); median 1,124,484 cycles, 4x: 4,497,936 cycles +Signing.. avg. 60766.64 us (0.06 sec); median 116,534,766 cycles, 1x: 116,534,766 cycles + - FORS signing.. avg. 9074.98 us (0.01 sec); median 17,398,401 cycles, 1x: 17,398,401 cycles + - WOTS pk gen x2.. avg. 592.65 us (0.00 sec); median 1,124,594 cycles, 88x: 98,964,272 cycles +Verifying.. avg. 
3544.91 us (0.00 sec); median 6,790,946 cycles, 1x: 6,790,946 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192s-robust_BAS b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192s-robust_BAS new file mode 100644 index 0000000..a70eab6 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192s-robust_BAS @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16 +Running 10 iterations. +thash avg. 1.66 us (0.00 sec); median 3,161 cycles, 1x: 3,161 cycles +f1600x2 avg. 0.81 us (0.00 sec); median 1,547 cycles, 1x: 1,547 cycles +thashx2 avg. 1.65 us (0.00 sec); median 3,163 cycles, 1x: 3,163 cycles +Generating keypair.. avg. 338917.78 us (0.34 sec); median 649,600,202 cycles, 1x: 649,600,202 cycles + - WOTS pk gen 2x.. avg. 1326.99 us (0.00 sec); median 2,535,970 cycles, 256x: 649,208,320 cycles +Signing.. avg. 2948354.45 us (2.95 sec); median 5,653,486,732 cycles, 1x: 5,653,486,732 cycles + - FORS signing.. avg. 576500.73 us (0.58 sec); median 1,105,431,087 cycles, 1x: 1,105,431,087 cycles + - WOTS pk gen x2.. avg. 1327.33 us (0.00 sec); median 2,536,455 cycles, 1792x: 4,545,327,360 cycles +Verifying.. avg. 2976.84 us (0.00 sec); median 5,693,332 cycles, 1x: 5,693,332 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192s-robust_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192s-robust_C new file mode 100644 index 0000000..be2a253 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192s-robust_C @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16 +Running 10 iterations. +thash avg. 1.79 us (0.00 sec); median 3,416 cycles, 1x: 3,416 cycles +f1600x2 avg. 
0.89 us (0.00 sec); median 1,682 cycles, 1x: 1,682 cycles +thashx2 avg. 1.79 us (0.00 sec); median 3,412 cycles, 1x: 3,412 cycles +Generating keypair.. avg. 364836.65 us (0.36 sec); median 699,687,053 cycles, 1x: 699,687,053 cycles + - WOTS pk gen 2x.. avg. 1426.95 us (0.00 sec); median 2,729,743 cycles, 256x: 698,814,208 cycles +Signing.. avg. 3178242.66 us (3.18 sec); median 6,091,695,900 cycles, 1x: 6,091,695,900 cycles + - FORS signing.. avg. 622388.55 us (0.62 sec); median 1,193,384,278 cycles, 1x: 1,193,384,278 cycles + - WOTS pk gen x2.. avg. 1428.61 us (0.00 sec); median 2,730,260 cycles, 1792x: 4,892,625,920 cycles +Verifying.. avg. 3131.56 us (0.00 sec); median 5,997,164 cycles, 1x: 5,997,164 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192s-robust_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192s-robust_COTHANV8 new file mode 100644 index 0000000..5ac2105 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192s-robust_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16 +Running 10 iterations. +thash avg. 1.42 us (0.00 sec); median 2,714 cycles, 1x: 2,714 cycles +f1600x2 avg. 0.70 us (0.00 sec); median 1,326 cycles, 1x: 1,326 cycles +thashx2 avg. 1.42 us (0.00 sec); median 2,708 cycles, 1x: 2,708 cycles +Generating keypair.. avg. 290036.21 us (0.29 sec); median 556,207,248 cycles, 1x: 556,207,248 cycles + - WOTS pk gen 2x.. avg. 1134.99 us (0.00 sec); median 2,169,427 cycles, 256x: 555,373,312 cycles +Signing.. avg. 2524900.56 us (2.52 sec); median 4,841,896,756 cycles, 1x: 4,841,896,756 cycles + - FORS signing.. avg. 494480.45 us (0.49 sec); median 948,244,257 cycles, 1x: 948,244,257 cycles + - WOTS pk gen x2.. avg. 1136.45 us (0.00 sec); median 2,170,228 cycles, 1792x: 3,889,048,576 cycles +Verifying.. avg. 
2581.94 us (0.00 sec); median 4,941,320 cycles, 1x: 4,941,320 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192s-simple_BAS b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192s-simple_BAS new file mode 100644 index 0000000..4b9d5e7 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192s-simple_BAS @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16 +Running 10 iterations. +thash avg. 0.83 us (0.00 sec); median 1,582 cycles, 1x: 1,582 cycles +f1600x2 avg. 0.81 us (0.00 sec); median 1,547 cycles, 1x: 1,547 cycles +thashx2 avg. 0.83 us (0.00 sec); median 1,580 cycles, 1x: 1,580 cycles +Generating keypair.. avg. 175423.91 us (0.18 sec); median 335,952,366 cycles, 1x: 335,952,366 cycles + - WOTS pk gen 2x.. avg. 689.97 us (0.00 sec); median 1,310,637 cycles, 256x: 335,523,072 cycles +Signing.. avg. 1573010.52 us (1.57 sec); median 3,016,398,484 cycles, 1x: 3,016,398,484 cycles + - FORS signing.. avg. 346405.05 us (0.35 sec); median 664,280,164 cycles, 1x: 664,280,164 cycles + - WOTS pk gen x2.. avg. 687.73 us (0.00 sec); median 1,310,506 cycles, 1792x: 2,348,426,752 cycles +Verifying.. avg. 1502.89 us (0.00 sec); median 2,870,908 cycles, 1x: 2,870,908 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192s-simple_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192s-simple_C new file mode 100644 index 0000000..56d2e99 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192s-simple_C @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16 +Running 10 iterations. +thash avg. 0.90 us (0.00 sec); median 1,707 cycles, 1x: 1,707 cycles +f1600x2 avg. 
0.88 us (0.00 sec); median 1,681 cycles, 1x: 1,681 cycles +thashx2 avg. 0.89 us (0.00 sec); median 1,704 cycles, 1x: 1,704 cycles +Generating keypair.. avg. 188386.08 us (0.19 sec); median 361,300,833 cycles, 1x: 361,300,833 cycles + - WOTS pk gen 2x.. avg. 743.68 us (0.00 sec); median 1,411,688 cycles, 256x: 361,392,128 cycles +Signing.. avg. 1693292.10 us (1.69 sec); median 3,247,323,555 cycles, 1x: 3,247,323,555 cycles + - FORS signing.. avg. 374448.36 us (0.37 sec); median 718,075,434 cycles, 1x: 718,075,434 cycles + - WOTS pk gen x2.. avg. 740.43 us (0.00 sec); median 1,412,300 cycles, 1792x: 2,530,841,600 cycles +Verifying.. avg. 1628.49 us (0.00 sec); median 3,111,380 cycles, 1x: 3,111,380 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192s-simple_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192s-simple_COTHANV8 new file mode 100644 index 0000000..9030009 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-192s-simple_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16 +Running 10 iterations. +thash avg. 0.72 us (0.00 sec); median 1,360 cycles, 1x: 1,360 cycles +f1600x2 avg. 0.70 us (0.00 sec); median 1,332 cycles, 1x: 1,332 cycles +thashx2 avg. 0.71 us (0.00 sec); median 1,355 cycles, 1x: 1,355 cycles +Generating keypair.. avg. 150469.82 us (0.15 sec); median 288,586,722 cycles, 1x: 288,586,722 cycles + - WOTS pk gen 2x.. avg. 592.49 us (0.00 sec); median 1,126,573 cycles, 256x: 288,402,688 cycles +Signing.. avg. 1351649.15 us (1.35 sec); median 2,591,595,657 cycles, 1x: 2,591,595,657 cycles + - FORS signing.. avg. 297933.28 us (0.30 sec); median 571,294,081 cycles, 1x: 571,294,081 cycles + - WOTS pk gen x2.. avg. 592.31 us (0.00 sec); median 1,127,459 cycles, 1792x: 2,020,406,528 cycles +Verifying.. avg. 
1261.62 us (0.00 sec); median 2,407,449 cycles, 1x: 2,407,449 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256f-robust_BAS b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256f-robust_BAS new file mode 100644 index 0000000..fd7b487 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256f-robust_BAS @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16 +Running 10 iterations. +thash avg. 1.65 us (0.00 sec); median 3,153 cycles, 1x: 3,153 cycles +f1600x2 avg. 0.81 us (0.00 sec); median 1,547 cycles, 1x: 1,547 cycles +thashx2 avg. 1.65 us (0.00 sec); median 3,150 cycles, 1x: 3,150 cycles +Generating keypair.. avg. 13958.91 us (0.01 sec); median 26,767,276 cycles, 1x: 26,767,276 cycles + - WOTS pk gen 2x.. avg. 1748.34 us (0.00 sec); median 3,344,932 cycles, 8x: 26,759,456 cycles +Signing.. avg. 274603.10 us (0.27 sec); median 526,358,565 cycles, 1x: 526,358,565 cycles + - FORS signing.. avg. 37250.35 us (0.04 sec); median 71,228,606 cycles, 1x: 71,228,606 cycles + - WOTS pk gen x2.. avg. 1752.03 us (0.00 sec); median 3,345,912 cycles, 136x: 455,044,032 cycles +Verifying.. avg. 8319.83 us (0.01 sec); median 15,960,213 cycles, 1x: 15,960,213 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256f-robust_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256f-robust_C new file mode 100644 index 0000000..d548740 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256f-robust_C @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16 +Running 10 iterations. +thash avg. 1.79 us (0.00 sec); median 3,419 cycles, 1x: 3,419 cycles +f1600x2 avg. 
0.88 us (0.00 sec); median 1,681 cycles, 1x: 1,681 cycles +thashx2 avg. 1.79 us (0.00 sec); median 3,417 cycles, 1x: 3,417 cycles +Generating keypair.. avg. 15042.65 us (0.02 sec); median 28,838,788 cycles, 1x: 28,838,788 cycles + - WOTS pk gen 2x.. avg. 1882.33 us (0.00 sec); median 3,601,916 cycles, 8x: 28,815,328 cycles +Signing.. avg. 295859.22 us (0.30 sec); median 567,215,144 cycles, 1x: 567,215,144 cycles + - FORS signing.. avg. 40171.81 us (0.04 sec); median 77,002,172 cycles, 1x: 77,002,172 cycles + - WOTS pk gen x2.. avg. 1882.06 us (0.00 sec); median 3,601,627 cycles, 136x: 489,821,272 cycles +Verifying.. avg. 8996.45 us (0.01 sec); median 17,251,028 cycles, 1x: 17,251,028 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256f-robust_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256f-robust_COTHANV8 new file mode 100644 index 0000000..d60df6e --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256f-robust_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16 +Running 10 iterations. +thash avg. 1.42 us (0.00 sec); median 2,711 cycles, 1x: 2,711 cycles +f1600x2 avg. 0.70 us (0.00 sec); median 1,325 cycles, 1x: 1,325 cycles +thashx2 avg. 1.42 us (0.00 sec); median 2,707 cycles, 1x: 2,707 cycles +Generating keypair.. avg. 12018.37 us (0.01 sec); median 22,961,428 cycles, 1x: 22,961,428 cycles + - WOTS pk gen 2x.. avg. 1499.53 us (0.00 sec); median 2,865,954 cycles, 8x: 22,927,632 cycles +Signing.. avg. 235673.38 us (0.24 sec); median 451,989,983 cycles, 1x: 451,989,983 cycles + - FORS signing.. avg. 32159.76 us (0.03 sec); median 61,571,322 cycles, 1x: 61,571,322 cycles + - WOTS pk gen x2.. avg. 1499.87 us (0.00 sec); median 2,867,367 cycles, 136x: 389,961,912 cycles +Verifying.. avg. 
7411.57 us (0.01 sec); median 14,199,395 cycles, 1x: 14,199,395 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256f-simple_BAS b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256f-simple_BAS new file mode 100644 index 0000000..3d9cae3 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256f-simple_BAS @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16 +Running 10 iterations. +thash avg. 0.84 us (0.00 sec); median 1,578 cycles, 1x: 1,578 cycles +f1600x2 avg. 0.81 us (0.00 sec); median 1,547 cycles, 1x: 1,547 cycles +thashx2 avg. 0.83 us (0.00 sec); median 1,577 cycles, 1x: 1,577 cycles +Generating keypair.. avg. 7221.56 us (0.01 sec); median 13,836,632 cycles, 1x: 13,836,632 cycles + - WOTS pk gen 2x.. avg. 906.03 us (0.00 sec); median 1,729,162 cycles, 8x: 13,833,296 cycles +Signing.. avg. 144984.95 us (0.14 sec); median 278,006,422 cycles, 1x: 278,006,422 cycles + - FORS signing.. avg. 22351.35 us (0.02 sec); median 42,873,876 cycles, 1x: 42,873,876 cycles + - WOTS pk gen x2.. avg. 905.81 us (0.00 sec); median 1,727,694 cycles, 136x: 234,966,384 cycles +Verifying.. avg. 4245.60 us (0.00 sec); median 8,139,298 cycles, 1x: 8,139,298 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256f-simple_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256f-simple_C new file mode 100644 index 0000000..d01012a --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256f-simple_C @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16 +Running 10 iterations. +thash avg. 0.90 us (0.00 sec); median 1,705 cycles, 1x: 1,705 cycles +f1600x2 avg. 
0.88 us (0.00 sec); median 1,681 cycles, 1x: 1,681 cycles +thashx2 avg. 0.89 us (0.00 sec); median 1,703 cycles, 1x: 1,703 cycles +Generating keypair.. avg. 7820.35 us (0.01 sec); median 14,974,722 cycles, 1x: 14,974,722 cycles + - WOTS pk gen 2x.. avg. 982.18 us (0.00 sec); median 1,869,605 cycles, 8x: 14,956,840 cycles +Signing.. avg. 156849.84 us (0.16 sec); median 300,786,178 cycles, 1x: 300,786,178 cycles + - FORS signing.. avg. 24119.20 us (0.02 sec); median 46,229,354 cycles, 1x: 46,229,354 cycles + - WOTS pk gen x2.. avg. 981.98 us (0.00 sec); median 1,870,415 cycles, 136x: 254,376,440 cycles +Verifying.. avg. 4552.00 us (0.00 sec); median 8,719,747 cycles, 1x: 8,719,747 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256f-simple_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256f-simple_COTHANV8 new file mode 100644 index 0000000..eb44998 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256f-simple_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16 +Running 10 iterations. +thash avg. 0.72 us (0.00 sec); median 1,361 cycles, 1x: 1,361 cycles +f1600x2 avg. 0.70 us (0.00 sec); median 1,324 cycles, 1x: 1,324 cycles +thashx2 avg. 0.71 us (0.00 sec); median 1,357 cycles, 1x: 1,357 cycles +Generating keypair.. avg. 6217.80 us (0.01 sec); median 11,906,370 cycles, 1x: 11,906,370 cycles + - WOTS pk gen 2x.. avg. 779.75 us (0.00 sec); median 1,486,309 cycles, 8x: 11,890,472 cycles +Signing.. avg. 124771.25 us (0.12 sec); median 239,258,150 cycles, 1x: 239,258,150 cycles + - FORS signing.. avg. 19213.89 us (0.02 sec); median 36,824,233 cycles, 1x: 36,824,233 cycles + - WOTS pk gen x2.. avg. 779.33 us (0.00 sec); median 1,487,074 cycles, 136x: 202,242,064 cycles +Verifying.. avg. 
3552.39 us (0.00 sec); median 6,805,560 cycles, 1x: 6,805,560 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256s-robust_BAS b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256s-robust_BAS new file mode 100644 index 0000000..9265d61 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256s-robust_BAS @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16 +Running 10 iterations. +thash avg. 1.65 us (0.00 sec); median 3,156 cycles, 1x: 3,156 cycles +f1600x2 avg. 0.81 us (0.00 sec); median 1,547 cycles, 1x: 1,547 cycles +thashx2 avg. 1.65 us (0.00 sec); median 3,152 cycles, 1x: 3,152 cycles +Generating keypair.. avg. 223025.54 us (0.22 sec); median 427,766,850 cycles, 1x: 427,766,850 cycles + - WOTS pk gen 2x.. avg. 1756.17 us (0.00 sec); median 3,343,356 cycles, 128x: 427,949,568 cycles +Signing.. avg. 2530320.51 us (2.53 sec); median 4,851,834,864 cycles, 1x: 4,851,834,864 cycles + - FORS signing.. avg. 745371.90 us (0.75 sec); median 1,429,423,450 cycles, 1x: 1,429,423,450 cycles + - WOTS pk gen x2.. avg. 1749.54 us (0.00 sec); median 3,340,268 cycles, 1024x: 3,420,434,432 cycles +Verifying.. avg. 4395.27 us (0.00 sec); median 8,420,785 cycles, 1x: 8,420,785 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256s-robust_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256s-robust_C new file mode 100644 index 0000000..7251ec4 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256s-robust_C @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16 +Running 10 iterations. +thash avg. 1.79 us (0.00 sec); median 3,413 cycles, 1x: 3,413 cycles +f1600x2 avg. 
0.88 us (0.00 sec); median 1,680 cycles, 1x: 1,680 cycles +thashx2 avg. 1.78 us (0.00 sec); median 3,408 cycles, 1x: 3,408 cycles +Generating keypair.. avg. 241326.05 us (0.24 sec); median 462,848,208 cycles, 1x: 462,848,208 cycles + - WOTS pk gen 2x.. avg. 1885.96 us (0.00 sec); median 3,610,290 cycles, 128x: 462,117,120 cycles +Signing.. avg. 2736895.39 us (2.74 sec); median 5,248,289,006 cycles, 1x: 5,248,289,006 cycles + - FORS signing.. avg. 805863.23 us (0.81 sec); median 1,545,350,346 cycles, 1x: 1,545,350,346 cycles + - WOTS pk gen x2.. avg. 1887.65 us (0.00 sec); median 3,610,355 cycles, 1024x: 3,697,003,520 cycles +Verifying.. avg. 4703.59 us (0.00 sec); median 9,009,284 cycles, 1x: 9,009,284 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256s-robust_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256s-robust_COTHANV8 new file mode 100644 index 0000000..c810ec4 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256s-robust_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16 +Running 10 iterations. +thash avg. 1.42 us (0.00 sec); median 2,712 cycles, 1x: 2,712 cycles +f1600x2 avg. 0.70 us (0.00 sec); median 1,324 cycles, 1x: 1,324 cycles +thashx2 avg. 1.42 us (0.00 sec); median 2,708 cycles, 1x: 2,708 cycles +Generating keypair.. avg. 191688.88 us (0.19 sec); median 367,313,491 cycles, 1x: 367,313,491 cycles + - WOTS pk gen 2x.. avg. 1498.20 us (0.00 sec); median 2,865,893 cycles, 128x: 366,834,304 cycles +Signing.. avg. 2173555.26 us (2.17 sec); median 4,167,705,403 cycles, 1x: 4,167,705,403 cycles + - FORS signing.. avg. 640976.72 us (0.64 sec); median 1,229,139,360 cycles, 1x: 1,229,139,360 cycles + - WOTS pk gen x2.. avg. 1507.94 us (0.00 sec); median 2,868,747 cycles, 1024x: 2,937,596,928 cycles +Verifying.. avg. 
3758.02 us (0.00 sec); median 7,154,876 cycles, 1x: 7,154,876 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256s-simple_BAS b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256s-simple_BAS new file mode 100644 index 0000000..9fbbf5b --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256s-simple_BAS @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16 +Running 10 iterations. +thash avg. 0.84 us (0.00 sec); median 1,579 cycles, 1x: 1,579 cycles +f1600x2 avg. 0.81 us (0.00 sec); median 1,547 cycles, 1x: 1,547 cycles +thashx2 avg. 0.83 us (0.00 sec); median 1,574 cycles, 1x: 1,574 cycles +Generating keypair.. avg. 115509.53 us (0.12 sec); median 221,519,591 cycles, 1x: 221,519,591 cycles + - WOTS pk gen 2x.. avg. 906.23 us (0.00 sec); median 1,727,913 cycles, 128x: 221,172,864 cycles +Signing.. avg. 1373694.36 us (1.37 sec); median 2,633,690,353 cycles, 1x: 2,633,690,353 cycles + - FORS signing.. avg. 449201.95 us (0.45 sec); median 861,419,633 cycles, 1x: 861,419,633 cycles + - WOTS pk gen x2.. avg. 905.40 us (0.00 sec); median 1,726,372 cycles, 1024x: 1,767,804,928 cycles +Verifying.. avg. 2164.61 us (0.00 sec); median 4,139,056 cycles, 1x: 4,139,056 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256s-simple_C b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256s-simple_C new file mode 100644 index 0000000..a21c140 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256s-simple_C @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16 +Running 10 iterations. +thash avg. 0.90 us (0.00 sec); median 1,710 cycles, 1x: 1,710 cycles +f1600x2 avg. 
0.88 us (0.00 sec); median 1,679 cycles, 1x: 1,679 cycles +thashx2 avg. 0.90 us (0.00 sec); median 1,705 cycles, 1x: 1,705 cycles +Generating keypair.. avg. 124379.65 us (0.12 sec); median 238,452,037 cycles, 1x: 238,452,037 cycles + - WOTS pk gen 2x.. avg. 974.79 us (0.00 sec); median 1,862,019 cycles, 128x: 238,338,432 cycles +Signing.. avg. 1479502.61 us (1.48 sec); median 2,836,833,463 cycles, 1x: 2,836,833,463 cycles + - FORS signing.. avg. 484527.17 us (0.48 sec); median 929,011,820 cycles, 1x: 929,011,820 cycles + - WOTS pk gen x2.. avg. 980.75 us (0.00 sec); median 1,861,268 cycles, 1024x: 1,905,938,432 cycles +Verifying.. avg. 2420.73 us (0.00 sec); median 4,607,278 cycles, 1x: 4,607,278 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256s-simple_COTHANV8 b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256s-simple_COTHANV8 new file mode 100644 index 0000000..9224f4c --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/benchmarks_X2/sphincs-shake-256s-simple_COTHANV8 @@ -0,0 +1,14 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16 +Running 10 iterations. +thash avg. 0.72 us (0.00 sec); median 1,363 cycles, 1x: 1,363 cycles +f1600x2 avg. 0.70 us (0.00 sec); median 1,324 cycles, 1x: 1,324 cycles +thashx2 avg. 0.72 us (0.00 sec); median 1,357 cycles, 1x: 1,357 cycles +Generating keypair.. avg. 99443.32 us (0.10 sec); median 190,663,503 cycles, 1x: 190,663,503 cycles + - WOTS pk gen 2x.. avg. 780.17 us (0.00 sec); median 1,487,523 cycles, 128x: 190,402,944 cycles +Signing.. avg. 1181845.79 us (1.18 sec); median 2,265,899,851 cycles, 1x: 2,265,899,851 cycles + - FORS signing.. avg. 386247.78 us (0.39 sec); median 740,581,822 cycles, 1x: 740,581,822 cycles + - WOTS pk gen x2.. avg. 780.30 us (0.00 sec); median 1,486,898 cycles, 1024x: 1,522,583,552 cycles +Verifying.. avg. 
1886.19 us (0.00 sec); median 3,600,889 cycles, 1x: 3,600,889 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakx2/context.h b/sphincsplus/sphincsplus-keccakx2/context.h new file mode 100644 index 0000000..993c9ce --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/context.h @@ -0,0 +1,13 @@ +#ifndef SPX_CONTEXT_H +#define SPX_CONTEXT_H + +#include <stdint.h> + +#include "params.h" + +typedef struct { + uint8_t pub_seed[SPX_N]; + uint8_t sk_seed[SPX_N]; +} spx_ctx; + +#endif diff --git a/sphincsplus/sphincsplus-keccakx2/f1600x2.h b/sphincsplus/sphincsplus-keccakx2/f1600x2.h new file mode 100644 index 0000000..62c2693 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/f1600x2.h @@ -0,0 +1,18 @@ +#ifndef SPX_F1600X2_H +#define SPX_F1600X2_H + +#include <stdint.h> + +#if defined(KECCAK_X2_IMPL_C) +void keccak_f1600_x2_scalar_C(uint64_t state[2*25]); /* prototype must name the symbol that f1600x2() expands to */ +#define f1600x2(s) keccak_f1600_x2_scalar_C(s) +#elif defined(KECCAK_X2_IMPL_COTHAN) +#include <arm_neon.h> +void keccak_f1600_x2_neon_C_cothan(uint64_t state[2*25]); +#define f1600x2(s) keccak_f1600_x2_neon_C_cothan(s) +#elif defined(KECCAK_X2_IMPL_BAS) +extern void keccak_f1600_x2_bas(uint64_t* a); +#define f1600x2(s) keccak_f1600_x2_bas(s) +#endif + +#endif diff --git a/sphincsplus/sphincsplus-keccakx2/fips202.c b/sphincsplus/sphincsplus-keccakx2/fips202.c new file mode 100644 index 0000000..ceeb6a5 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/fips202.c @@ -0,0 +1,762 @@ +/* Based on the public domain implementation in + * crypto_hash/keccakc512/simple/ from http://bench.cr.yp.to/supercop.html + * by Ronny Van Keer + * and the public domain "TweetFips202" implementation + * from https://twitter.com/tweetfips202 + * by Gilles Van Assche, Daniel J.
Bernstein, and Peter Schwabe */ + +#include <stddef.h> +#include <stdint.h> + +#include "fips202.h" + +#define NROUNDS 24 +#define ROL(a, offset) (((a) << (offset)) ^ ((a) >> (64 - (offset)))) + +/************************************************* + * Name: load64 + * + * Description: Load 8 bytes into uint64_t in little-endian order + * + * Arguments: - const uint8_t *x: pointer to input byte array + * + * Returns the loaded 64-bit unsigned integer + **************************************************/ +static uint64_t load64(const uint8_t *x) { + uint64_t r = 0; + for (size_t i = 0; i < 8; ++i) { + r |= (uint64_t)x[i] << 8 * i; + } + + return r; +} + +/************************************************* + * Name: store64 + * + * Description: Store a 64-bit integer to a byte array in little-endian order + * + * Arguments: - uint8_t *x: pointer to the output byte array + * - uint64_t u: input 64-bit unsigned integer + **************************************************/ +static void store64(uint8_t *x, uint64_t u) { + for (size_t i = 0; i < 8; ++i) { + x[i] = (uint8_t) (u >> 8 * i); + } +} + +/* Keccak round constants */ +static const uint64_t KeccakF_RoundConstants[NROUNDS] = { + 0x0000000000000001ULL, 0x0000000000008082ULL, + 0x800000000000808aULL, 0x8000000080008000ULL, + 0x000000000000808bULL, 0x0000000080000001ULL, + 0x8000000080008081ULL, 0x8000000000008009ULL, + 0x000000000000008aULL, 0x0000000000000088ULL, + 0x0000000080008009ULL, 0x000000008000000aULL, + 0x000000008000808bULL, 0x800000000000008bULL, + 0x8000000000008089ULL, 0x8000000000008003ULL, + 0x8000000000008002ULL, 0x8000000000000080ULL, + 0x000000000000800aULL, 0x800000008000000aULL, + 0x8000000080008081ULL, 0x8000000000008080ULL, + 0x0000000080000001ULL, 0x8000000080008008ULL +}; + +/************************************************* + * Name: KeccakF1600_StatePermute + * + * Description: The Keccak F1600 Permutation + * + * Arguments: - uint64_t *state: pointer to input/output Keccak state +
**************************************************/ +static void KeccakF1600_StatePermute(uint64_t *state) { + int round; + + uint64_t Aba, Abe, Abi, Abo, Abu; + uint64_t Aga, Age, Agi, Ago, Agu; + uint64_t Aka, Ake, Aki, Ako, Aku; + uint64_t Ama, Ame, Ami, Amo, Amu; + uint64_t Asa, Ase, Asi, Aso, Asu; + uint64_t BCa, BCe, BCi, BCo, BCu; + uint64_t Da, De, Di, Do, Du; + uint64_t Eba, Ebe, Ebi, Ebo, Ebu; + uint64_t Ega, Ege, Egi, Ego, Egu; + uint64_t Eka, Eke, Eki, Eko, Eku; + uint64_t Ema, Eme, Emi, Emo, Emu; + uint64_t Esa, Ese, Esi, Eso, Esu; + + // copyFromState(A, state) + Aba = state[0]; + Abe = state[1]; + Abi = state[2]; + Abo = state[3]; + Abu = state[4]; + Aga = state[5]; + Age = state[6]; + Agi = state[7]; + Ago = state[8]; + Agu = state[9]; + Aka = state[10]; + Ake = state[11]; + Aki = state[12]; + Ako = state[13]; + Aku = state[14]; + Ama = state[15]; + Ame = state[16]; + Ami = state[17]; + Amo = state[18]; + Amu = state[19]; + Asa = state[20]; + Ase = state[21]; + Asi = state[22]; + Aso = state[23]; + Asu = state[24]; + + for (round = 0; round < NROUNDS; round += 2) { + // prepareTheta + BCa = Aba ^ Aga ^ Aka ^ Ama ^ Asa; + BCe = Abe ^ Age ^ Ake ^ Ame ^ Ase; + BCi = Abi ^ Agi ^ Aki ^ Ami ^ Asi; + BCo = Abo ^ Ago ^ Ako ^ Amo ^ Aso; + BCu = Abu ^ Agu ^ Aku ^ Amu ^ Asu; + + // thetaRhoPiChiIotaPrepareTheta(round , A, E) + Da = BCu ^ ROL(BCe, 1); + De = BCa ^ ROL(BCi, 1); + Di = BCe ^ ROL(BCo, 1); + Do = BCi ^ ROL(BCu, 1); + Du = BCo ^ ROL(BCa, 1); + + Aba ^= Da; + BCa = Aba; + Age ^= De; + BCe = ROL(Age, 44); + Aki ^= Di; + BCi = ROL(Aki, 43); + Amo ^= Do; + BCo = ROL(Amo, 21); + Asu ^= Du; + BCu = ROL(Asu, 14); + Eba = BCa ^ ((~BCe) & BCi); + Eba ^= KeccakF_RoundConstants[round]; + Ebe = BCe ^ ((~BCi) & BCo); + Ebi = BCi ^ ((~BCo) & BCu); + Ebo = BCo ^ ((~BCu) & BCa); + Ebu = BCu ^ ((~BCa) & BCe); + + Abo ^= Do; + BCa = ROL(Abo, 28); + Agu ^= Du; + BCe = ROL(Agu, 20); + Aka ^= Da; + BCi = ROL(Aka, 3); + Ame ^= De; + BCo = ROL(Ame, 45); + Asi ^= Di; + BCu 
= ROL(Asi, 61); + Ega = BCa ^ ((~BCe) & BCi); + Ege = BCe ^ ((~BCi) & BCo); + Egi = BCi ^ ((~BCo) & BCu); + Ego = BCo ^ ((~BCu) & BCa); + Egu = BCu ^ ((~BCa) & BCe); + + Abe ^= De; + BCa = ROL(Abe, 1); + Agi ^= Di; + BCe = ROL(Agi, 6); + Ako ^= Do; + BCi = ROL(Ako, 25); + Amu ^= Du; + BCo = ROL(Amu, 8); + Asa ^= Da; + BCu = ROL(Asa, 18); + Eka = BCa ^ ((~BCe) & BCi); + Eke = BCe ^ ((~BCi) & BCo); + Eki = BCi ^ ((~BCo) & BCu); + Eko = BCo ^ ((~BCu) & BCa); + Eku = BCu ^ ((~BCa) & BCe); + + Abu ^= Du; + BCa = ROL(Abu, 27); + Aga ^= Da; + BCe = ROL(Aga, 36); + Ake ^= De; + BCi = ROL(Ake, 10); + Ami ^= Di; + BCo = ROL(Ami, 15); + Aso ^= Do; + BCu = ROL(Aso, 56); + Ema = BCa ^ ((~BCe) & BCi); + Eme = BCe ^ ((~BCi) & BCo); + Emi = BCi ^ ((~BCo) & BCu); + Emo = BCo ^ ((~BCu) & BCa); + Emu = BCu ^ ((~BCa) & BCe); + + Abi ^= Di; + BCa = ROL(Abi, 62); + Ago ^= Do; + BCe = ROL(Ago, 55); + Aku ^= Du; + BCi = ROL(Aku, 39); + Ama ^= Da; + BCo = ROL(Ama, 41); + Ase ^= De; + BCu = ROL(Ase, 2); + Esa = BCa ^ ((~BCe) & BCi); + Ese = BCe ^ ((~BCi) & BCo); + Esi = BCi ^ ((~BCo) & BCu); + Eso = BCo ^ ((~BCu) & BCa); + Esu = BCu ^ ((~BCa) & BCe); + + // prepareTheta + BCa = Eba ^ Ega ^ Eka ^ Ema ^ Esa; + BCe = Ebe ^ Ege ^ Eke ^ Eme ^ Ese; + BCi = Ebi ^ Egi ^ Eki ^ Emi ^ Esi; + BCo = Ebo ^ Ego ^ Eko ^ Emo ^ Eso; + BCu = Ebu ^ Egu ^ Eku ^ Emu ^ Esu; + + // thetaRhoPiChiIotaPrepareTheta(round+1, E, A) + Da = BCu ^ ROL(BCe, 1); + De = BCa ^ ROL(BCi, 1); + Di = BCe ^ ROL(BCo, 1); + Do = BCi ^ ROL(BCu, 1); + Du = BCo ^ ROL(BCa, 1); + + Eba ^= Da; + BCa = Eba; + Ege ^= De; + BCe = ROL(Ege, 44); + Eki ^= Di; + BCi = ROL(Eki, 43); + Emo ^= Do; + BCo = ROL(Emo, 21); + Esu ^= Du; + BCu = ROL(Esu, 14); + Aba = BCa ^ ((~BCe) & BCi); + Aba ^= KeccakF_RoundConstants[round + 1]; + Abe = BCe ^ ((~BCi) & BCo); + Abi = BCi ^ ((~BCo) & BCu); + Abo = BCo ^ ((~BCu) & BCa); + Abu = BCu ^ ((~BCa) & BCe); + + Ebo ^= Do; + BCa = ROL(Ebo, 28); + Egu ^= Du; + BCe = ROL(Egu, 20); + Eka ^= Da; + BCi = ROL(Eka, 3); + 
Eme ^= De; + BCo = ROL(Eme, 45); + Esi ^= Di; + BCu = ROL(Esi, 61); + Aga = BCa ^ ((~BCe) & BCi); + Age = BCe ^ ((~BCi) & BCo); + Agi = BCi ^ ((~BCo) & BCu); + Ago = BCo ^ ((~BCu) & BCa); + Agu = BCu ^ ((~BCa) & BCe); + + Ebe ^= De; + BCa = ROL(Ebe, 1); + Egi ^= Di; + BCe = ROL(Egi, 6); + Eko ^= Do; + BCi = ROL(Eko, 25); + Emu ^= Du; + BCo = ROL(Emu, 8); + Esa ^= Da; + BCu = ROL(Esa, 18); + Aka = BCa ^ ((~BCe) & BCi); + Ake = BCe ^ ((~BCi) & BCo); + Aki = BCi ^ ((~BCo) & BCu); + Ako = BCo ^ ((~BCu) & BCa); + Aku = BCu ^ ((~BCa) & BCe); + + Ebu ^= Du; + BCa = ROL(Ebu, 27); + Ega ^= Da; + BCe = ROL(Ega, 36); + Eke ^= De; + BCi = ROL(Eke, 10); + Emi ^= Di; + BCo = ROL(Emi, 15); + Eso ^= Do; + BCu = ROL(Eso, 56); + Ama = BCa ^ ((~BCe) & BCi); + Ame = BCe ^ ((~BCi) & BCo); + Ami = BCi ^ ((~BCo) & BCu); + Amo = BCo ^ ((~BCu) & BCa); + Amu = BCu ^ ((~BCa) & BCe); + + Ebi ^= Di; + BCa = ROL(Ebi, 62); + Ego ^= Do; + BCe = ROL(Ego, 55); + Eku ^= Du; + BCi = ROL(Eku, 39); + Ema ^= Da; + BCo = ROL(Ema, 41); + Ese ^= De; + BCu = ROL(Ese, 2); + Asa = BCa ^ ((~BCe) & BCi); + Ase = BCe ^ ((~BCi) & BCo); + Asi = BCi ^ ((~BCo) & BCu); + Aso = BCo ^ ((~BCu) & BCa); + Asu = BCu ^ ((~BCa) & BCe); + } + + // copyToState(state, A) + state[0] = Aba; + state[1] = Abe; + state[2] = Abi; + state[3] = Abo; + state[4] = Abu; + state[5] = Aga; + state[6] = Age; + state[7] = Agi; + state[8] = Ago; + state[9] = Agu; + state[10] = Aka; + state[11] = Ake; + state[12] = Aki; + state[13] = Ako; + state[14] = Aku; + state[15] = Ama; + state[16] = Ame; + state[17] = Ami; + state[18] = Amo; + state[19] = Amu; + state[20] = Asa; + state[21] = Ase; + state[22] = Asi; + state[23] = Aso; + state[24] = Asu; +} + +/************************************************* + * Name: keccak_absorb + * + * Description: Absorb step of Keccak; + * non-incremental, starts by zeroeing the state. 
+ * + * Arguments: - uint64_t *s: pointer to (uninitialized) output Keccak state + * - uint32_t r: rate in bytes (e.g., 168 for SHAKE128) + * - const uint8_t *m: pointer to input to be absorbed into s + * - size_t mlen: length of input in bytes + * - uint8_t p: domain-separation byte for different + * Keccak-derived functions + **************************************************/ +static void keccak_absorb(uint64_t *s, uint32_t r, const uint8_t *m, + size_t mlen, uint8_t p) { + size_t i; + uint8_t t[200]; + + /* Zero state */ + for (i = 0; i < 25; ++i) { + s[i] = 0; + } + + while (mlen >= r) { + for (i = 0; i < r / 8; ++i) { + s[i] ^= load64(m + 8 * i); + } + + KeccakF1600_StatePermute(s); + mlen -= r; + m += r; + } + + for (i = 0; i < r; ++i) { + t[i] = 0; + } + for (i = 0; i < mlen; ++i) { + t[i] = m[i]; + } + t[i] = p; + t[r - 1] |= 128; + for (i = 0; i < r / 8; ++i) { + s[i] ^= load64(t + 8 * i); + } +} + +/************************************************* + * Name: keccak_squeezeblocks + * + * Description: Squeeze step of Keccak. Squeezes full blocks of r bytes each. + * Modifies the state. Can be called multiple times to keep + * squeezing, i.e., is incremental. + * + * Arguments: - uint8_t *h: pointer to output blocks + * - size_t nblocks: number of blocks to be + * squeezed (written to h) + * - uint64_t *s: pointer to input/output Keccak state + * - uint32_t r: rate in bytes (e.g., 168 for SHAKE128) + **************************************************/ +static void keccak_squeezeblocks(uint8_t *h, size_t nblocks, + uint64_t *s, uint32_t r) { + while (nblocks > 0) { + KeccakF1600_StatePermute(s); + for (size_t i = 0; i < (r >> 3); i++) { + store64(h + 8 * i, s[i]); + } + h += r; + nblocks--; + } +} + +/************************************************* + * Name: keccak_inc_init + * + * Description: Initializes the incremental Keccak state to zero. 
+ * + * Arguments: - uint64_t *s_inc: pointer to input/output incremental state + * First 25 values represent Keccak state. + * 26th value represents either the number of absorbed bytes + * that have not been permuted, or not-yet-squeezed bytes. + **************************************************/ +static void keccak_inc_init(uint64_t *s_inc) { + size_t i; + + for (i = 0; i < 25; ++i) { + s_inc[i] = 0; + } + s_inc[25] = 0; +} + +/************************************************* + * Name: keccak_inc_absorb + * + * Description: Incremental keccak absorb + * Preceded by keccak_inc_init, succeeded by keccak_inc_finalize + * + * Arguments: - uint64_t *s_inc: pointer to input/output incremental state + * First 25 values represent Keccak state. + * 26th value represents either the number of absorbed bytes + * that have not been permuted, or not-yet-squeezed bytes. + * - uint32_t r: rate in bytes (e.g., 168 for SHAKE128) + * - const uint8_t *m: pointer to input to be absorbed into s + * - size_t mlen: length of input in bytes + **************************************************/ +static void keccak_inc_absorb(uint64_t *s_inc, uint32_t r, const uint8_t *m, + size_t mlen) { + size_t i; + + /* Recall that s_inc[25] is the non-absorbed bytes xored into the state */ + while (mlen + s_inc[25] >= r) { + for (i = 0; i < r - s_inc[25]; i++) { + /* Take the i'th byte from message + xor with the s_inc[25] + i'th byte of the state; little-endian */ + s_inc[(s_inc[25] + i) >> 3] ^= (uint64_t)m[i] << (8 * ((s_inc[25] + i) & 0x07)); + } + mlen -= (size_t)(r - s_inc[25]); + m += r - s_inc[25]; + s_inc[25] = 0; + + KeccakF1600_StatePermute(s_inc); + } + + for (i = 0; i < mlen; i++) { + s_inc[(s_inc[25] + i) >> 3] ^= (uint64_t)m[i] << (8 * ((s_inc[25] + i) & 0x07)); + } + s_inc[25] += mlen; +} + +/************************************************* + * Name: keccak_inc_finalize + * + * Description: Finalizes Keccak absorb phase, prepares for squeezing + * + * Arguments: - uint64_t *s_inc: 
pointer to input/output incremental state + * First 25 values represent Keccak state. + * 26th value represents either the number of absorbed bytes + * that have not been permuted, or not-yet-squeezed bytes. + * - uint32_t r: rate in bytes (e.g., 168 for SHAKE128) + * - uint8_t p: domain-separation byte for different + * Keccak-derived functions + **************************************************/ +static void keccak_inc_finalize(uint64_t *s_inc, uint32_t r, uint8_t p) { + /* After keccak_inc_absorb, we are guaranteed that s_inc[25] < r, + so we can always use one more byte for p in the current state. */ + s_inc[s_inc[25] >> 3] ^= (uint64_t)p << (8 * (s_inc[25] & 0x07)); + s_inc[(r - 1) >> 3] ^= (uint64_t)128 << (8 * ((r - 1) & 0x07)); + s_inc[25] = 0; +} + +/************************************************* + * Name: keccak_inc_squeeze + * + * Description: Incremental Keccak squeeze; can be called on byte-level + * + * Arguments: - uint8_t *h: pointer to output bytes + * - size_t outlen: number of bytes to be squeezed + * - uint64_t *s_inc: pointer to input/output incremental state + * First 25 values represent Keccak state. + * 26th value represents either the number of absorbed bytes + * that have not been permuted, or not-yet-squeezed bytes. + * - uint32_t r: rate in bytes (e.g., 168 for SHAKE128) + **************************************************/ +static void keccak_inc_squeeze(uint8_t *h, size_t outlen, + uint64_t *s_inc, uint32_t r) { + size_t i; + + /* First consume any bytes we still have sitting around */ + for (i = 0; i < outlen && i < s_inc[25]; i++) { + /* There are s_inc[25] bytes left, so r - s_inc[25] is the first + available byte. We consume from there, i.e., up to r. 
*/ + h[i] = (uint8_t)(s_inc[(r - s_inc[25] + i) >> 3] >> (8 * ((r - s_inc[25] + i) & 0x07))); + } + h += i; + outlen -= i; + s_inc[25] -= i; + + /* Then squeeze the remaining necessary blocks */ + while (outlen > 0) { + KeccakF1600_StatePermute(s_inc); + + for (i = 0; i < outlen && i < r; i++) { + h[i] = (uint8_t)(s_inc[i >> 3] >> (8 * (i & 0x07))); + } + h += i; + outlen -= i; + s_inc[25] = r - i; + } +} + +void shake128_inc_init(uint64_t *s_inc) { + keccak_inc_init(s_inc); +} + +void shake128_inc_absorb(uint64_t *s_inc, const uint8_t *input, size_t inlen) { + keccak_inc_absorb(s_inc, SHAKE128_RATE, input, inlen); +} + +void shake128_inc_finalize(uint64_t *s_inc) { + keccak_inc_finalize(s_inc, SHAKE128_RATE, 0x1F); +} + +void shake128_inc_squeeze(uint8_t *output, size_t outlen, uint64_t *s_inc) { + keccak_inc_squeeze(output, outlen, s_inc, SHAKE128_RATE); +} + +void shake256_inc_init(uint64_t *s_inc) { + keccak_inc_init(s_inc); +} + +void shake256_inc_absorb(uint64_t *s_inc, const uint8_t *input, size_t inlen) { + keccak_inc_absorb(s_inc, SHAKE256_RATE, input, inlen); +} + +void shake256_inc_finalize(uint64_t *s_inc) { + keccak_inc_finalize(s_inc, SHAKE256_RATE, 0x1F); +} + +void shake256_inc_squeeze(uint8_t *output, size_t outlen, uint64_t *s_inc) { + keccak_inc_squeeze(output, outlen, s_inc, SHAKE256_RATE); +} + + +/************************************************* + * Name: shake128_absorb + * + * Description: Absorb step of the SHAKE128 XOF. + * non-incremental, starts by zeroeing the state. 
+ * + * Arguments: - uint64_t *s: pointer to (uninitialized) output Keccak state + * - const uint8_t *input: pointer to input to be absorbed + * into s + * - size_t inlen: length of input in bytes + **************************************************/ +void shake128_absorb(uint64_t *s, const uint8_t *input, size_t inlen) { + keccak_absorb(s, SHAKE128_RATE, input, inlen, 0x1F); +} + +/************************************************* + * Name: shake128_squeezeblocks + * + * Description: Squeeze step of SHAKE128 XOF. Squeezes full blocks of + * SHAKE128_RATE bytes each. Modifies the state. Can be called + * multiple times to keep squeezing, i.e., is incremental. + * + * Arguments: - uint8_t *output: pointer to output blocks + * - size_t nblocks: number of blocks to be squeezed + * (written to output) + * - uint64_t *s: pointer to input/output Keccak state + **************************************************/ +void shake128_squeezeblocks(uint8_t *output, size_t nblocks, uint64_t *s) { + keccak_squeezeblocks(output, nblocks, s, SHAKE128_RATE); +} + +/************************************************* + * Name: shake256_absorb + * + * Description: Absorb step of the SHAKE256 XOF. + * non-incremental, starts by zeroeing the state. + * + * Arguments: - uint64_t *s: pointer to (uninitialized) output Keccak state + * - const uint8_t *input: pointer to input to be absorbed + * into s + * - size_t inlen: length of input in bytes + **************************************************/ +void shake256_absorb(uint64_t *s, const uint8_t *input, size_t inlen) { + keccak_absorb(s, SHAKE256_RATE, input, inlen, 0x1F); +} + +/************************************************* + * Name: shake256_squeezeblocks + * + * Description: Squeeze step of SHAKE256 XOF. Squeezes full blocks of + * SHAKE256_RATE bytes each. Modifies the state. Can be called + * multiple times to keep squeezing, i.e., is incremental. 
+ * + * Arguments: - uint8_t *output: pointer to output blocks + * - size_t nblocks: number of blocks to be squeezed + * (written to output) + * - uint64_t *s: pointer to input/output Keccak state + **************************************************/ +void shake256_squeezeblocks(uint8_t *output, size_t nblocks, uint64_t *s) { + keccak_squeezeblocks(output, nblocks, s, SHAKE256_RATE); +} + +/************************************************* + * Name: shake128 + * + * Description: SHAKE128 XOF with non-incremental API + * + * Arguments: - uint8_t *output: pointer to output + * - size_t outlen: requested output length in bytes + * - const uint8_t *input: pointer to input + * - size_t inlen: length of input in bytes + **************************************************/ +void shake128(uint8_t *output, size_t outlen, + const uint8_t *input, size_t inlen) { + size_t nblocks = outlen / SHAKE128_RATE; + uint8_t t[SHAKE128_RATE]; + uint64_t s[25]; + + shake128_absorb(s, input, inlen); + shake128_squeezeblocks(output, nblocks, s); + + output += nblocks * SHAKE128_RATE; + outlen -= nblocks * SHAKE128_RATE; + + if (outlen) { + shake128_squeezeblocks(t, 1, s); + for (size_t i = 0; i < outlen; ++i) { + output[i] = t[i]; + } + } +} + +/************************************************* + * Name: shake256 + * + * Description: SHAKE256 XOF with non-incremental API + * + * Arguments: - uint8_t *output: pointer to output + * - size_t outlen: requested output length in bytes + * - const uint8_t *input: pointer to input + * - size_t inlen: length of input in bytes + **************************************************/ +void shake256(uint8_t *output, size_t outlen, + const uint8_t *input, size_t inlen) { + size_t nblocks = outlen / SHAKE256_RATE; + uint8_t t[SHAKE256_RATE]; + uint64_t s[25]; + + shake256_absorb(s, input, inlen); + shake256_squeezeblocks(output, nblocks, s); + + output += nblocks * SHAKE256_RATE; + outlen -= nblocks * SHAKE256_RATE; + + if (outlen) { + 
shake256_squeezeblocks(t, 1, s); + for (size_t i = 0; i < outlen; ++i) { + output[i] = t[i]; + } + } +} + +void sha3_256_inc_init(uint64_t *s_inc) { + keccak_inc_init(s_inc); +} + +void sha3_256_inc_absorb(uint64_t *s_inc, const uint8_t *input, size_t inlen) { + keccak_inc_absorb(s_inc, SHA3_256_RATE, input, inlen); +} + +void sha3_256_inc_finalize(uint8_t *output, uint64_t *s_inc) { + uint8_t t[SHA3_256_RATE]; + keccak_inc_finalize(s_inc, SHA3_256_RATE, 0x06); + + keccak_squeezeblocks(t, 1, s_inc, SHA3_256_RATE); + + for (size_t i = 0; i < 32; i++) { + output[i] = t[i]; + } +} + +/************************************************* + * Name: sha3_256 + * + * Description: SHA3-256 with non-incremental API + * + * Arguments: - uint8_t *output: pointer to output + * - const uint8_t *input: pointer to input + * - size_t inlen: length of input in bytes + **************************************************/ +void sha3_256(uint8_t *output, const uint8_t *input, size_t inlen) { + uint64_t s[25]; + uint8_t t[SHA3_256_RATE]; + + /* Absorb input */ + keccak_absorb(s, SHA3_256_RATE, input, inlen, 0x06); + + /* Squeeze output */ + keccak_squeezeblocks(t, 1, s, SHA3_256_RATE); + + for (size_t i = 0; i < 32; i++) { + output[i] = t[i]; + } +} + +void sha3_512_inc_init(uint64_t *s_inc) { + keccak_inc_init(s_inc); +} + +void sha3_512_inc_absorb(uint64_t *s_inc, const uint8_t *input, size_t inlen) { + keccak_inc_absorb(s_inc, SHA3_512_RATE, input, inlen); +} + +void sha3_512_inc_finalize(uint8_t *output, uint64_t *s_inc) { + uint8_t t[SHA3_512_RATE]; + keccak_inc_finalize(s_inc, SHA3_512_RATE, 0x06); + + keccak_squeezeblocks(t, 1, s_inc, SHA3_512_RATE); + + for (size_t i = 0; i < 32; i++) { + output[i] = t[i]; + } +} + +/************************************************* + * Name: sha3_512 + * + * Description: SHA3-512 with non-incremental API + * + * Arguments: - uint8_t *output: pointer to output + * - const uint8_t *input: pointer to input + * - size_t inlen: length of input in bytes 
+ **************************************************/ +void sha3_512(uint8_t *output, const uint8_t *input, size_t inlen) { + uint64_t s[25]; + uint8_t t[SHA3_512_RATE]; + + /* Absorb input */ + keccak_absorb(s, SHA3_512_RATE, input, inlen, 0x06); + + /* Squeeze output */ + keccak_squeezeblocks(t, 1, s, SHA3_512_RATE); + + for (size_t i = 0; i < 64; i++) { + output[i] = t[i]; + } +} diff --git a/sphincsplus/sphincsplus-keccakx2/fips202.h b/sphincsplus/sphincsplus-keccakx2/fips202.h new file mode 100644 index 0000000..e11cb7f --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/fips202.h @@ -0,0 +1,47 @@ +#ifndef SPX_FIPS202_H +#define SPX_FIPS202_H + +#include +#include + +#define SHAKE128_RATE 168 +#define SHAKE256_RATE 136 +#define SHA3_256_RATE 136 +#define SHA3_512_RATE 72 + +void shake128_absorb(uint64_t *s, const uint8_t *input, size_t inlen); + +void shake128_squeezeblocks(uint8_t *output, size_t nblocks, uint64_t *s); + +void shake128_inc_init(uint64_t *s_inc); +void shake128_inc_absorb(uint64_t *s_inc, const uint8_t *input, size_t inlen); +void shake128_inc_finalize(uint64_t *s_inc); +void shake128_inc_squeeze(uint8_t *output, size_t outlen, uint64_t *s_inc); + +void shake256_absorb(uint64_t *s, const uint8_t *input, size_t inlen); +void shake256_squeezeblocks(uint8_t *output, size_t nblocks, uint64_t *s); + +void shake256_inc_init(uint64_t *s_inc); +void shake256_inc_absorb(uint64_t *s_inc, const uint8_t *input, size_t inlen); +void shake256_inc_finalize(uint64_t *s_inc); +void shake256_inc_squeeze(uint8_t *output, size_t outlen, uint64_t *s_inc); + +void shake128(uint8_t *output, size_t outlen, + const uint8_t *input, size_t inlen); + +void shake256(uint8_t *output, size_t outlen, + const uint8_t *input, size_t inlen); + +void sha3_256_inc_init(uint64_t *s_inc); +void sha3_256_inc_absorb(uint64_t *s_inc, const uint8_t *input, size_t inlen); +void sha3_256_inc_finalize(uint8_t *output, uint64_t *s_inc); + +void sha3_256(uint8_t *output, const uint8_t 
/*
 * Reads the 8 bytes starting at x as a little-endian 64-bit integer
 * (x[0] is the least significant byte).
 */
uint64_t load64(const unsigned char *x)
{
    uint64_t r = 0;

    for (unsigned int i = 0; i < 8; ++i) {
        /* Widen before shifting so the byte is not shifted as an int. */
        r |= (uint64_t)x[i] << (8 * i);
    }
    return r;
}

/*
 * Writes u to the 8 bytes starting at x in little-endian order
 * (x[0] receives the least significant byte).
 */
void store64(uint8_t *x, uint64_t u)
{
    for (unsigned int i = 0; i < 8; ++i) {
        /* Explicit cast: only the low byte of the shifted value is kept. */
        x[i] = (uint8_t)(u >> (8 * i));
    }
}
shake128x2(unsigned char *out0, + unsigned char *out1, + unsigned long long outlen, + unsigned char *in0, + unsigned char *in1, + unsigned long long inlen) +{ + uint64_t s[50] = {0}; + unsigned char t0[SHAKE128_RATE]; + unsigned char t1[SHAKE128_RATE]; + unsigned int i; + + /* absorb 4 message of identical length in parallel */ + keccak_absorb2x(s, SHAKE128_RATE, in0, in1, inlen, 0x1F); + + /* Squeeze output */ + keccak_squeezeblocks2x(out0, out1, outlen/SHAKE128_RATE, s, SHAKE128_RATE); + + out0 += (outlen/SHAKE128_RATE)*SHAKE128_RATE; + out1 += (outlen/SHAKE128_RATE)*SHAKE128_RATE; + + if(outlen%SHAKE128_RATE) + { + keccak_squeezeblocks2x(t0, t1, 1, s, SHAKE128_RATE); + for(i=0;i + +uint64_t load64(const unsigned char *x); +void store64(uint8_t *x, uint64_t u); + + +void shake128x2(unsigned char *out0, + unsigned char *out1, + unsigned long long outlen, + unsigned char *in0, + unsigned char *in1, + unsigned long long inlen); + +void shake256x2(unsigned char *out0, + unsigned char *out1, + unsigned long long outlen, + unsigned char *in0, + unsigned char *in1, + unsigned long long inlen); + +#endif diff --git a/sphincsplus/sphincsplus-keccakx2/fors.c b/sphincsplus/sphincsplus-keccakx2/fors.c new file mode 100644 index 0000000..a19fc7e --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/fors.c @@ -0,0 +1,198 @@ +#include +#include +#include + +#include "thash.h" +#include "fors.h" +#include "utils.h" +#include "utilsx2.h" +#include "hash.h" +#include "hashx2.h" +#include "thashx2.h" +#include "address.h" + +static void fors_gen_sk(unsigned char *sk, const spx_ctx *ctx, + uint32_t fors_leaf_addr[8]) +{ + prf_addr(sk, ctx, fors_leaf_addr); +} + +static void fors_gen_skx2(unsigned char *sk0, + unsigned char *sk1, + const spx_ctx *ctx, + uint32_t fors_leaf_addrx2[2*8]) +{ + prf_addrx2(sk0, sk1, + ctx, fors_leaf_addrx2); +} + +static void fors_sk_to_leaf(unsigned char *leaf, const unsigned char *sk, + const spx_ctx *ctx, + uint32_t fors_leaf_addr[8]) +{ + thash(leaf, 
sk, 1, ctx, fors_leaf_addr); +} + +static void fors_sk_to_leafx2(unsigned char *leaf0, + unsigned char *leaf1, + const unsigned char *sk0, + const unsigned char *sk1, + const spx_ctx *ctx, + uint32_t fors_leaf_addrx2[2*8]) +{ + thashx2(leaf0, leaf1, + sk0, sk1, + 1, ctx, fors_leaf_addrx2); +} + +struct fors_gen_leaf_info { + uint32_t leaf_addrx[2*8]; +}; + +static void fors_gen_leafx2(unsigned char *leaf, + const spx_ctx *ctx, + uint32_t addr_idx, void *info) +{ + struct fors_gen_leaf_info *fors_info = info; + uint32_t *fors_leaf_addrx2 = fors_info->leaf_addrx; + unsigned int j; + + /* Only set the parts that the caller doesn't set */ + for (j = 0; j < 2; j++) { + set_tree_index(fors_leaf_addrx2 + j*8, addr_idx + j); + set_type(fors_leaf_addrx2 + j*8, SPX_ADDR_TYPE_FORSPRF); + } + + fors_gen_skx2(leaf + 0*SPX_N, + leaf + 1*SPX_N, + ctx, fors_leaf_addrx2); + + for (j = 0; j < 2; j++) { + set_type(fors_leaf_addrx2 + j*8, SPX_ADDR_TYPE_FORSTREE); + } + + fors_sk_to_leafx2(leaf + 0*SPX_N, + leaf + 1*SPX_N, + leaf + 0*SPX_N, + leaf + 1*SPX_N, + ctx, fors_leaf_addrx2); +} + +/** + * Interprets m as SPX_FORS_HEIGHT-bit unsigned integers. + * Assumes m contains at least SPX_FORS_HEIGHT * SPX_FORS_TREES bits. + * Assumes indices has space for SPX_FORS_TREES integers. + */ +static void message_to_indices(uint32_t *indices, const unsigned char *m) +{ + unsigned int i, j; + unsigned int offset = 0; + + for (i = 0; i < SPX_FORS_TREES; i++) { + indices[i] = 0; + for (j = 0; j < SPX_FORS_HEIGHT; j++) { + indices[i] ^= ((m[offset >> 3] >> (offset & 0x7)) & 0x1) << j; + offset++; + } + } +} + +/** + * Signs a message m, deriving the secret key from sk_seed and the FTS address. + * Assumes m contains at least SPX_FORS_HEIGHT * SPX_FORS_TREES bits. 
+ */ +void fors_sign(unsigned char *sig, unsigned char *pk, + const unsigned char *m, + const spx_ctx *ctx, + const uint32_t fors_addr[8]) +{ + uint32_t indices[SPX_FORS_TREES]; + unsigned char roots[SPX_FORS_TREES * SPX_N]; + uint32_t fors_tree_addr[2*8] = {0}; + struct fors_gen_leaf_info fors_info = {0}; + uint32_t *fors_leaf_addr = fors_info.leaf_addrx; + uint32_t fors_pk_addr[8] = {0}; + uint32_t idx_offset; + unsigned int i; + + for (i=0; i<2; i++) { + copy_keypair_addr(fors_tree_addr + 8*i, fors_addr); + set_type(fors_tree_addr + 8*i, SPX_ADDR_TYPE_FORSTREE); + copy_keypair_addr(fors_leaf_addr + 8*i, fors_addr); + } + copy_keypair_addr(fors_pk_addr, fors_addr); + set_type(fors_pk_addr, SPX_ADDR_TYPE_FORSPK); + + message_to_indices(indices, m); + + for (i = 0; i < SPX_FORS_TREES; i++) { + idx_offset = i * (1 << SPX_FORS_HEIGHT); + + set_tree_height(fors_tree_addr, 0); + set_tree_index(fors_tree_addr, indices[i] + idx_offset); + + /* Include the secret key part that produces the selected leaf node. */ + set_type(fors_tree_addr, SPX_ADDR_TYPE_FORSPRF); + fors_gen_sk(sig, ctx, fors_tree_addr); + set_type(fors_tree_addr, SPX_ADDR_TYPE_FORSTREE); + sig += SPX_N; + + /* Compute the authentication path for this leaf node. */ + treehashx2(roots + i*SPX_N, sig, ctx, + indices[i], idx_offset, SPX_FORS_HEIGHT, fors_gen_leafx2, + fors_tree_addr, &fors_info); + + sig += SPX_N * SPX_FORS_HEIGHT; + } + + /* Hash horizontally across all tree roots to derive the public key. */ + thash(pk, roots, SPX_FORS_TREES, ctx, fors_pk_addr); +} + +/** + * Derives the FORS public key from a signature. + * This can be used for verification by comparing to a known public key, or to + * subsequently verify a signature on the derived public key. The latter is the + * typical use-case when used as an FTS below an OTS in a hypertree. + * Assumes m contains at least SPX_FORS_HEIGHT * SPX_FORS_TREES bits. 
+ */ +void fors_pk_from_sig(unsigned char *pk, + const unsigned char *sig, const unsigned char *m, + const spx_ctx *ctx, + const uint32_t fors_addr[8]) +{ + uint32_t indices[SPX_FORS_TREES]; + unsigned char roots[SPX_FORS_TREES * SPX_N]; + unsigned char leaf[SPX_N]; + uint32_t fors_tree_addr[8] = {0}; + uint32_t fors_pk_addr[8] = {0}; + uint32_t idx_offset; + unsigned int i; + + copy_keypair_addr(fors_tree_addr, fors_addr); + copy_keypair_addr(fors_pk_addr, fors_addr); + + set_type(fors_tree_addr, SPX_ADDR_TYPE_FORSTREE); + set_type(fors_pk_addr, SPX_ADDR_TYPE_FORSPK); + + message_to_indices(indices, m); + + for (i = 0; i < SPX_FORS_TREES; i++) { + idx_offset = i * (1 << SPX_FORS_HEIGHT); + + set_tree_height(fors_tree_addr, 0); + set_tree_index(fors_tree_addr, indices[i] + idx_offset); + + /* Derive the leaf from the included secret key part. */ + fors_sk_to_leaf(leaf, sig, ctx, fors_tree_addr); + sig += SPX_N; + + /* Derive the corresponding root node of this tree. */ + compute_root(roots + i*SPX_N, leaf, indices[i], idx_offset, + sig, SPX_FORS_HEIGHT, ctx, fors_tree_addr); + sig += SPX_N * SPX_FORS_HEIGHT; + } + + /* Hash horizontally across all tree roots to derive the public key. */ + thash(pk, roots, SPX_FORS_TREES, ctx, fors_pk_addr); +} diff --git a/sphincsplus/sphincsplus-keccakx2/fors.h b/sphincsplus/sphincsplus-keccakx2/fors.h new file mode 100644 index 0000000..8d98017 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/fors.h @@ -0,0 +1,32 @@ +#ifndef SPX_FORS_H +#define SPX_FORS_H + +#include + +#include "params.h" +#include "context.h" + +/** + * Signs a message m, deriving the secret key from sk_seed and the FTS address. + * Assumes m contains at least SPX_FORS_HEIGHT * SPX_FORS_TREES bits. + */ +#define fors_sign SPX_NAMESPACE(fors_sign) +void fors_sign(unsigned char *sig, unsigned char *pk, + const unsigned char *m, + const spx_ctx* ctx, + const uint32_t fors_addr[8]); + +/** + * Derives the FORS public key from a signature. 
+ * This can be used for verification by comparing to a known public key, or to + * subsequently verify a signature on the derived public key. The latter is the + * typical use-case when used as an FTS below an OTS in a hypertree. + * Assumes m contains at least SPX_FORS_HEIGHT * SPX_FORS_TREES bits. + */ +#define fors_pk_from_sig SPX_NAMESPACE(fors_pk_from_sig) +void fors_pk_from_sig(unsigned char *pk, + const unsigned char *sig, const unsigned char *m, + const spx_ctx* ctx, + const uint32_t fors_addr[8]); + +#endif diff --git a/sphincsplus/sphincsplus-keccakx2/hal_env.h b/sphincsplus/sphincsplus-keccakx2/hal_env.h new file mode 100644 index 0000000..e8dd570 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/hal_env.h @@ -0,0 +1,9 @@ +#ifndef QEMU_V8A_HAL_ENV_H +#define QEMU_V8A_HAL_ENV_H + +#define SEP ; + +#define ASM_LOAD(dst,symbol) \ + adrp dst, symbol ; add dst, dst, :lo12:symbol; + +#endif diff --git a/sphincsplus/sphincsplus-keccakx2/hash.h b/sphincsplus/sphincsplus-keccakx2/hash.h new file mode 100644 index 0000000..b141f09 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/hash.h @@ -0,0 +1,27 @@ +#ifndef SPX_HASH_H +#define SPX_HASH_H + +#include +#include "context.h" +#include "params.h" + +#define initialize_hash_function SPX_NAMESPACE(initialize_hash_function) +void initialize_hash_function(spx_ctx *ctx); + +#define prf_addr SPX_NAMESPACE(prf_addr) +void prf_addr(unsigned char *out, const spx_ctx *ctx, + const uint32_t addr[8]); + +#define gen_message_random SPX_NAMESPACE(gen_message_random) +void gen_message_random(unsigned char *R, const unsigned char *sk_prf, + const unsigned char *optrand, + const unsigned char *m, unsigned long long mlen, + const spx_ctx *ctx); + +#define hash_message SPX_NAMESPACE(hash_message) +void hash_message(unsigned char *digest, uint64_t *tree, uint32_t *leaf_idx, + const unsigned char *R, const unsigned char *pk, + const unsigned char *m, unsigned long long mlen, + const spx_ctx *ctx); + +#endif diff --git 
a/sphincsplus/sphincsplus-keccakx2/hash_shake.c b/sphincsplus/sphincsplus-keccakx2/hash_shake.c new file mode 100644 index 0000000..1da7b81 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/hash_shake.c @@ -0,0 +1,93 @@ +#include +#include + +#include "address.h" +#include "utils.h" +#include "params.h" +#include "hash.h" +#include "fips202.h" + +/* For SHAKE256, there is no immediate reason to initialize at the start, + so this function is an empty operation. */ +void initialize_hash_function(spx_ctx* ctx) +{ + (void)ctx; /* Suppress an 'unused parameter' warning. */ +} + +/* + * Computes PRF(pk_seed, sk_seed, addr) + */ +void prf_addr(unsigned char *out, const spx_ctx *ctx, + const uint32_t addr[8]) +{ + unsigned char buf[2*SPX_N + SPX_ADDR_BYTES]; + + memcpy(buf, ctx->pub_seed, SPX_N); + memcpy(buf + SPX_N, addr, SPX_ADDR_BYTES); + memcpy(buf + SPX_N + SPX_ADDR_BYTES, ctx->sk_seed, SPX_N); + + shake256(out, SPX_N, buf, 2*SPX_N + SPX_ADDR_BYTES); +} + +/** + * Computes the message-dependent randomness R, using a secret seed and an + * optional randomization value as well as the message. + */ +void gen_message_random(unsigned char *R, const unsigned char *sk_prf, + const unsigned char *optrand, + const unsigned char *m, unsigned long long mlen, + const spx_ctx *ctx) +{ + (void)ctx; + uint64_t s_inc[26]; + + shake256_inc_init(s_inc); + shake256_inc_absorb(s_inc, sk_prf, SPX_N); + shake256_inc_absorb(s_inc, optrand, SPX_N); + shake256_inc_absorb(s_inc, m, mlen); + shake256_inc_finalize(s_inc); + shake256_inc_squeeze(R, SPX_N, s_inc); +} + +/** + * Computes the message hash using R, the public key, and the message. + * Outputs the message digest and the index of the leaf. The index is split in + * the tree index and the leaf index, for convenient copying to an address. 
+ */ +void hash_message(unsigned char *digest, uint64_t *tree, uint32_t *leaf_idx, + const unsigned char *R, const unsigned char *pk, + const unsigned char *m, unsigned long long mlen, + const spx_ctx *ctx) +{ + (void)ctx; +#define SPX_TREE_BITS (SPX_TREE_HEIGHT * (SPX_D - 1)) +#define SPX_TREE_BYTES ((SPX_TREE_BITS + 7) / 8) +#define SPX_LEAF_BITS SPX_TREE_HEIGHT +#define SPX_LEAF_BYTES ((SPX_LEAF_BITS + 7) / 8) +#define SPX_DGST_BYTES (SPX_FORS_MSG_BYTES + SPX_TREE_BYTES + SPX_LEAF_BYTES) + + unsigned char buf[SPX_DGST_BYTES]; + unsigned char *bufp = buf; + uint64_t s_inc[26]; + + shake256_inc_init(s_inc); + shake256_inc_absorb(s_inc, R, SPX_N); + shake256_inc_absorb(s_inc, pk, SPX_PK_BYTES); + shake256_inc_absorb(s_inc, m, mlen); + shake256_inc_finalize(s_inc); + shake256_inc_squeeze(buf, SPX_DGST_BYTES, s_inc); + + memcpy(digest, bufp, SPX_FORS_MSG_BYTES); + bufp += SPX_FORS_MSG_BYTES; + +#if SPX_TREE_BITS > 64 + #error For given height and depth, 64 bits cannot represent all subtrees +#endif + + *tree = bytes_to_ull(bufp, SPX_TREE_BYTES); + *tree &= (~(uint64_t)0) >> (64 - SPX_TREE_BITS); + bufp += SPX_TREE_BYTES; + + *leaf_idx = bytes_to_ull(bufp, SPX_LEAF_BYTES); + *leaf_idx &= (~(uint32_t)0) >> (32 - SPX_LEAF_BITS); +} diff --git a/sphincsplus/sphincsplus-keccakx2/hash_shakex2.c b/sphincsplus/sphincsplus-keccakx2/hash_shakex2.c new file mode 100644 index 0000000..fe7869c --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/hash_shakex2.c @@ -0,0 +1,51 @@ +#include +#include + +#include "address.h" +#include "params.h" +#include "fips202x2.h" +#include "f1600x2.h" +#include "hashx2.h" + +/* + * 2-way parallel version of prf_addr; takes 2x as much input and output + */ +void prf_addrx2(unsigned char *out0, + unsigned char *out1, + const spx_ctx *ctx, + const uint32_t addrx2[2*8]) { + /* As we write and read only a few quadwords, it is more efficient to + * build and extract from the fourway SHAKE256 state by hand. 
*/ + uint64_t state[50] = {0}; + + for (int i = 0; i < SPX_N/8; i++) { + uint64_t x = load64(ctx->pub_seed + 8*i); + state[2*i] = x; + state[2*i+1] = x; + } + for (int i = 0; i < 4; i++) { + state[2*(SPX_N/8 + i)] = (((uint64_t)addrx2[1+2*i]) << 32) + | (uint64_t)addrx2[2*i]; + state[2*(SPX_N/8 + i) + 1] = (((uint64_t)addrx2[8+1+2*i]) << 32) + | (uint64_t)addrx2[8+2*i]; + } + for (int i = 0; i < SPX_N/8; i++) { + uint64_t x = load64(ctx->sk_seed + 8*i); + state[2*(SPX_N/8+i+4)] = x; + state[2*(SPX_N/8+i+4)+1] = x; + } + + /* SHAKE domain separator and padding. */ + state[2*(SPX_N/4+4)] = 0x1f; + state[2*(SPX_N/4+4)+1] = 0x1f; + + state[2*16] = 0x80ll << 56; + state[2*16+1] = 0x80ll << 56; + + f1600x2(state); + + for (int i = 0; i < SPX_N/8; i++) { + store64(out0 + 8*i, state[2*i]); + store64(out1 + 8*i, state[2*i+1]); + } +} diff --git a/sphincsplus/sphincsplus-keccakx2/hashx2.h b/sphincsplus/sphincsplus-keccakx2/hashx2.h new file mode 100644 index 0000000..25ce94d --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/hashx2.h @@ -0,0 +1,14 @@ +#ifndef SPX_HASHX2_H +#define SPX_HASHX2_H + +#include +#include "context.h" +#include "params.h" + +#define prf_addrx2 SPX_NAMESPACE(prf_addrx2) +void prf_addrx2(unsigned char *out0, + unsigned char *out1, + const spx_ctx *ctx, + const uint32_t addrx2[2*8]); + +#endif diff --git a/sphincsplus/sphincsplus-keccakx2/keccak_f1600_x2 b/sphincsplus/sphincsplus-keccakx2/keccak_f1600_x2 new file mode 120000 index 0000000..04f3a3a --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/keccak_f1600_x2 @@ -0,0 +1 @@ +../../asm/manual/keccak_f1600/third_party/ \ No newline at end of file diff --git a/sphincsplus/sphincsplus-keccakx2/make_all.py b/sphincsplus/sphincsplus-keccakx2/make_all.py new file mode 100644 index 0000000..e3b9150 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/make_all.py @@ -0,0 +1,78 @@ +# +# Copyright (c) 2022 Arm Limited +# Copyright (c) 2022 Matthias Kannwischer +# SPDX-License-Identifier: MIT +# +# 
Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +# + +#! 
/usr/bin/env python3 + +import multiprocessing +import subprocess +import itertools +import shutil +import os +import sys + +cores = ["X1", "A78", "A55", "X2", "A710", "A510"] +fns = ['shake'] +options = ["f", "s"] +sizes = [128, 192, 256] +thashes = ['robust', 'simple'] +keccak_var = ["COTHANV8", "C", "BAS"] + +def nameFor(fn, opt, size, thash, var): + return f"sphincs-{fn}-{size}{opt}-{thash}_{var}" + +def make(fn, opt, size, thash, core, bindir, keccak_x2): + + if not os.path.exists(bindir): + os.mkdir(bindir) + + if core in ["X1", "A78", "A55"]: + platform = "v8" + # Skip BAS' code on v8-A + if keccak_x2 == "BAS": + return + elif core in ["X2", "A710", "A510"]: + platform ="v84" + else: + raise Exception() + + name = nameFor(fn, opt, size, thash, keccak_x2) + overrides = [f'PARAMS=sphincs-{fn}-{size}{opt}', 'THASH='+thash, 'CORE='+core, 'PLATFORM='+platform, 'KECCAK_X2_IMPL='+keccak_x2] + + sys.stderr.write(f"Compiling {name} …\n") + sys.stderr.flush() + + subprocess.run(["make", "clean"] + overrides, + stdout=subprocess.DEVNULL, stderr=sys.stderr, check=True) + subprocess.run(["make"] + overrides, + stdout=subprocess.DEVNULL, stderr=sys.stderr, check=True) + + shutil.move("benchmark", f"{bindir}/bench_{core}_{name}") + + +bindir = "bin/" + +for fn in fns: + for opt, size, thash, core, var in itertools.product(options, sizes, thashes, cores, keccak_var): + make(fn, opt, size, thash, core, bindir, var) diff --git a/sphincsplus/sphincsplus-keccakx2/merkle.c b/sphincsplus/sphincsplus-keccakx2/merkle.c new file mode 100644 index 0000000..b2791d1 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/merkle.c @@ -0,0 +1,65 @@ +#include +#include + +#include "utils.h" +#include "utilsx2.h" +#include "wots.h" +#include "wotsx2.h" +#include "merkle.h" +#include "address.h" +#include "params.h" + +/* + * This generates a Merkle signature (WOTS signature followed by the Merkle + * authentication path). 
+ */ +void merkle_sign(uint8_t *sig, unsigned char *root, + const spx_ctx* ctx, + uint32_t wots_addr[8], uint32_t tree_addr[8], + uint32_t idx_leaf) +{ + unsigned char *auth_path = sig + SPX_WOTS_BYTES; + uint32_t tree_addrx2[2*8] = { 0 }; + int j; + struct leaf_info_x2 info = { 0 }; + unsigned steps[ SPX_WOTS_LEN ]; + + info.wots_sig = sig; + chain_lengths(steps, root); + info.wots_steps = steps; + + for (j=0; j<2; j++) { + set_type(&tree_addrx2[8*j], SPX_ADDR_TYPE_HASHTREE); + set_type(&info.leaf_addr[8*j], SPX_ADDR_TYPE_WOTS); + set_type(&info.pk_addr[8*j], SPX_ADDR_TYPE_WOTSPK); + copy_subtree_addr(&tree_addrx2[8*j], tree_addr); + copy_subtree_addr(&info.leaf_addr[8*j], wots_addr); + copy_subtree_addr(&info.pk_addr[8*j], wots_addr); + } + + info.wots_sign_leaf = idx_leaf; + + treehashx2(root, auth_path, ctx, + idx_leaf, 0, + SPX_TREE_HEIGHT, + wots_gen_leafx2, + tree_addrx2, &info); +} + +/* Compute root node of the top-most subtree. */ +void merkle_gen_root(unsigned char *root, const spx_ctx *ctx) +{ + /* We do not need the auth path in key generation, but it simplifies the + code to have just one treehash routine that computes both root and path + in one function. 
*/ + unsigned char auth_path[SPX_TREE_HEIGHT * SPX_N + SPX_WOTS_BYTES]; + uint32_t top_tree_addr[8] = {0}; + uint32_t wots_addr[8] = {0}; + + set_layer_addr(top_tree_addr, SPX_D - 1); + set_layer_addr(wots_addr, SPX_D - 1); + + merkle_sign(auth_path, root, ctx, + wots_addr, top_tree_addr, + ~0 /* ~0 means "don't bother generating an auth path */ ); +} diff --git a/sphincsplus/sphincsplus-keccakx2/merkle.h b/sphincsplus/sphincsplus-keccakx2/merkle.h new file mode 100644 index 0000000..9ac2759 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/merkle.h @@ -0,0 +1,18 @@ +#if !defined( MERKLE_H_ ) +#define MERKLE_H_ + +#include + +/* Generate a Merkle signature (WOTS signature followed by the Merkle */ +/* authentication path) */ +#define merkle_sign SPX_NAMESPACE(merkle_sign) +void merkle_sign(uint8_t *sig, unsigned char *root, + const spx_ctx* ctx, + uint32_t wots_addr[8], uint32_t tree_addr[8], + uint32_t idx_leaf); + +/* Compute the root node of the top-most subtree. */ +#define merkle_gen_root SPX_NAMESPACE(merkle_gen_root) +void merkle_gen_root(unsigned char *root, const spx_ctx* ctx); + +#endif /* MERKLE_H_ */ diff --git a/sphincsplus/sphincsplus-keccakx2/params.h b/sphincsplus/sphincsplus-keccakx2/params.h new file mode 100644 index 0000000..1d7f9c9 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/params.h @@ -0,0 +1,5 @@ +#define str(s) #s +#define xstr(s) str(s) + +#include xstr(params/params-PARAMS.h) + diff --git a/sphincsplus/sphincsplus-keccakx2/params/params-sphincs-shake-128f.h b/sphincsplus/sphincsplus-keccakx2/params/params-sphincs-shake-128f.h new file mode 100644 index 0000000..8f77692 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/params/params-sphincs-shake-128f.h @@ -0,0 +1,80 @@ +#ifndef SPX_PARAMS_H +#define SPX_PARAMS_H + +#define SPX_NAMESPACE(s) SPX_##s + +/* Hash output length in bytes. */ +#define SPX_N 16 +/* Height of the hypertree. */ +#define SPX_FULL_HEIGHT 66 +/* Number of subtree layer. 
*/ +#define SPX_D 22 +/* FORS tree dimensions. */ +#define SPX_FORS_HEIGHT 6 +#define SPX_FORS_TREES 33 +/* Winternitz parameter, */ +#define SPX_WOTS_W 16 + +/* The hash function is defined by linking a different hash.c file, as opposed + to setting a #define constant. */ + +/* For clarity */ +#define SPX_ADDR_BYTES 32 + +/* WOTS parameters. */ +#if SPX_WOTS_W == 256 + #define SPX_WOTS_LOGW 8 +#elif SPX_WOTS_W == 16 + #define SPX_WOTS_LOGW 4 +#else + #error SPX_WOTS_W assumed 16 or 256 +#endif + +#define SPX_WOTS_LEN1 (8 * SPX_N / SPX_WOTS_LOGW) + +/* SPX_WOTS_LEN2 is floor(log(len_1 * (w - 1)) / log(w)) + 1; we precompute */ +#if SPX_WOTS_W == 256 + #if SPX_N <= 1 + #define SPX_WOTS_LEN2 1 + #elif SPX_N <= 256 + #define SPX_WOTS_LEN2 2 + #else + #error Did not precompute SPX_WOTS_LEN2 for n outside {2, .., 256} + #endif +#elif SPX_WOTS_W == 16 + #if SPX_N <= 8 + #define SPX_WOTS_LEN2 2 + #elif SPX_N <= 136 + #define SPX_WOTS_LEN2 3 + #elif SPX_N <= 256 + #define SPX_WOTS_LEN2 4 + #else + #error Did not precompute SPX_WOTS_LEN2 for n outside {2, .., 256} + #endif +#endif + +#define SPX_WOTS_LEN (SPX_WOTS_LEN1 + SPX_WOTS_LEN2) +#define SPX_WOTS_BYTES (SPX_WOTS_LEN * SPX_N) +#define SPX_WOTS_PK_BYTES SPX_WOTS_BYTES + +/* Subtree size. */ +#define SPX_TREE_HEIGHT (SPX_FULL_HEIGHT / SPX_D) + +#if SPX_TREE_HEIGHT * SPX_D != SPX_FULL_HEIGHT + #error SPX_D should always divide SPX_FULL_HEIGHT +#endif + +/* FORS parameters. */ +#define SPX_FORS_MSG_BYTES ((SPX_FORS_HEIGHT * SPX_FORS_TREES + 7) / 8) +#define SPX_FORS_BYTES ((SPX_FORS_HEIGHT + 1) * SPX_FORS_TREES * SPX_N) +#define SPX_FORS_PK_BYTES SPX_N + +/* Resulting SPX sizes. 
*/ +#define SPX_BYTES (SPX_N + SPX_FORS_BYTES + SPX_D * SPX_WOTS_BYTES +\ + SPX_FULL_HEIGHT * SPX_N) +#define SPX_PK_BYTES (2 * SPX_N) +#define SPX_SK_BYTES (2 * SPX_N + SPX_PK_BYTES) + +#include "../shake_offsets.h" + +#endif diff --git a/sphincsplus/sphincsplus-keccakx2/params/params-sphincs-shake-128s.h b/sphincsplus/sphincsplus-keccakx2/params/params-sphincs-shake-128s.h new file mode 100644 index 0000000..a4d1e13 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/params/params-sphincs-shake-128s.h @@ -0,0 +1,80 @@ +#ifndef SPX_PARAMS_H +#define SPX_PARAMS_H + +#define SPX_NAMESPACE(s) SPX_##s + +/* Hash output length in bytes. */ +#define SPX_N 16 +/* Height of the hypertree. */ +#define SPX_FULL_HEIGHT 63 +/* Number of subtree layer. */ +#define SPX_D 7 +/* FORS tree dimensions. */ +#define SPX_FORS_HEIGHT 12 +#define SPX_FORS_TREES 14 +/* Winternitz parameter, */ +#define SPX_WOTS_W 16 + +/* The hash function is defined by linking a different hash.c file, as opposed + to setting a #define constant. */ + +/* For clarity */ +#define SPX_ADDR_BYTES 32 + +/* WOTS parameters. 
*/ +#if SPX_WOTS_W == 256 + #define SPX_WOTS_LOGW 8 +#elif SPX_WOTS_W == 16 + #define SPX_WOTS_LOGW 4 +#else + #error SPX_WOTS_W assumed 16 or 256 +#endif + +#define SPX_WOTS_LEN1 (8 * SPX_N / SPX_WOTS_LOGW) + +/* SPX_WOTS_LEN2 is floor(log(len_1 * (w - 1)) / log(w)) + 1; we precompute */ +#if SPX_WOTS_W == 256 + #if SPX_N <= 1 + #define SPX_WOTS_LEN2 1 + #elif SPX_N <= 256 + #define SPX_WOTS_LEN2 2 + #else + #error Did not precompute SPX_WOTS_LEN2 for n outside {2, .., 256} + #endif +#elif SPX_WOTS_W == 16 + #if SPX_N <= 8 + #define SPX_WOTS_LEN2 2 + #elif SPX_N <= 136 + #define SPX_WOTS_LEN2 3 + #elif SPX_N <= 256 + #define SPX_WOTS_LEN2 4 + #else + #error Did not precompute SPX_WOTS_LEN2 for n outside {2, .., 256} + #endif +#endif + +#define SPX_WOTS_LEN (SPX_WOTS_LEN1 + SPX_WOTS_LEN2) +#define SPX_WOTS_BYTES (SPX_WOTS_LEN * SPX_N) +#define SPX_WOTS_PK_BYTES SPX_WOTS_BYTES + +/* Subtree size. */ +#define SPX_TREE_HEIGHT (SPX_FULL_HEIGHT / SPX_D) + +#if SPX_TREE_HEIGHT * SPX_D != SPX_FULL_HEIGHT + #error SPX_D should always divide SPX_FULL_HEIGHT +#endif + +/* FORS parameters. */ +#define SPX_FORS_MSG_BYTES ((SPX_FORS_HEIGHT * SPX_FORS_TREES + 7) / 8) +#define SPX_FORS_BYTES ((SPX_FORS_HEIGHT + 1) * SPX_FORS_TREES * SPX_N) +#define SPX_FORS_PK_BYTES SPX_N + +/* Resulting SPX sizes. */ +#define SPX_BYTES (SPX_N + SPX_FORS_BYTES + SPX_D * SPX_WOTS_BYTES +\ + SPX_FULL_HEIGHT * SPX_N) +#define SPX_PK_BYTES (2 * SPX_N) +#define SPX_SK_BYTES (2 * SPX_N + SPX_PK_BYTES) + +#include "../shake_offsets.h" + +#endif diff --git a/sphincsplus/sphincsplus-keccakx2/params/params-sphincs-shake-192f.h b/sphincsplus/sphincsplus-keccakx2/params/params-sphincs-shake-192f.h new file mode 100644 index 0000000..b1e73d1 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/params/params-sphincs-shake-192f.h @@ -0,0 +1,80 @@ +#ifndef SPX_PARAMS_H +#define SPX_PARAMS_H + +#define SPX_NAMESPACE(s) SPX_##s + +/* Hash output length in bytes. */ +#define SPX_N 24 +/* Height of the hypertree. 
*/ +#define SPX_FULL_HEIGHT 66 +/* Number of subtree layer. */ +#define SPX_D 22 +/* FORS tree dimensions. */ +#define SPX_FORS_HEIGHT 8 +#define SPX_FORS_TREES 33 +/* Winternitz parameter, */ +#define SPX_WOTS_W 16 + +/* The hash function is defined by linking a different hash.c file, as opposed + to setting a #define constant. */ + +/* For clarity */ +#define SPX_ADDR_BYTES 32 + +/* WOTS parameters. */ +#if SPX_WOTS_W == 256 + #define SPX_WOTS_LOGW 8 +#elif SPX_WOTS_W == 16 + #define SPX_WOTS_LOGW 4 +#else + #error SPX_WOTS_W assumed 16 or 256 +#endif + +#define SPX_WOTS_LEN1 (8 * SPX_N / SPX_WOTS_LOGW) + +/* SPX_WOTS_LEN2 is floor(log(len_1 * (w - 1)) / log(w)) + 1; we precompute */ +#if SPX_WOTS_W == 256 + #if SPX_N <= 1 + #define SPX_WOTS_LEN2 1 + #elif SPX_N <= 256 + #define SPX_WOTS_LEN2 2 + #else + #error Did not precompute SPX_WOTS_LEN2 for n outside {2, .., 256} + #endif +#elif SPX_WOTS_W == 16 + #if SPX_N <= 8 + #define SPX_WOTS_LEN2 2 + #elif SPX_N <= 136 + #define SPX_WOTS_LEN2 3 + #elif SPX_N <= 256 + #define SPX_WOTS_LEN2 4 + #else + #error Did not precompute SPX_WOTS_LEN2 for n outside {2, .., 256} + #endif +#endif + +#define SPX_WOTS_LEN (SPX_WOTS_LEN1 + SPX_WOTS_LEN2) +#define SPX_WOTS_BYTES (SPX_WOTS_LEN * SPX_N) +#define SPX_WOTS_PK_BYTES SPX_WOTS_BYTES + +/* Subtree size. */ +#define SPX_TREE_HEIGHT (SPX_FULL_HEIGHT / SPX_D) + +#if SPX_TREE_HEIGHT * SPX_D != SPX_FULL_HEIGHT + #error SPX_D should always divide SPX_FULL_HEIGHT +#endif + +/* FORS parameters. */ +#define SPX_FORS_MSG_BYTES ((SPX_FORS_HEIGHT * SPX_FORS_TREES + 7) / 8) +#define SPX_FORS_BYTES ((SPX_FORS_HEIGHT + 1) * SPX_FORS_TREES * SPX_N) +#define SPX_FORS_PK_BYTES SPX_N + +/* Resulting SPX sizes. 
*/ +#define SPX_BYTES (SPX_N + SPX_FORS_BYTES + SPX_D * SPX_WOTS_BYTES +\ + SPX_FULL_HEIGHT * SPX_N) +#define SPX_PK_BYTES (2 * SPX_N) +#define SPX_SK_BYTES (2 * SPX_N + SPX_PK_BYTES) + +#include "../shake_offsets.h" + +#endif diff --git a/sphincsplus/sphincsplus-keccakx2/params/params-sphincs-shake-192s.h b/sphincsplus/sphincsplus-keccakx2/params/params-sphincs-shake-192s.h new file mode 100644 index 0000000..0882e1c --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/params/params-sphincs-shake-192s.h @@ -0,0 +1,80 @@ +#ifndef SPX_PARAMS_H +#define SPX_PARAMS_H + +#define SPX_NAMESPACE(s) SPX_##s + +/* Hash output length in bytes. */ +#define SPX_N 24 +/* Height of the hypertree. */ +#define SPX_FULL_HEIGHT 63 +/* Number of subtree layer. */ +#define SPX_D 7 +/* FORS tree dimensions. */ +#define SPX_FORS_HEIGHT 14 +#define SPX_FORS_TREES 17 +/* Winternitz parameter, */ +#define SPX_WOTS_W 16 + +/* The hash function is defined by linking a different hash.c file, as opposed + to setting a #define constant. */ + +/* For clarity */ +#define SPX_ADDR_BYTES 32 + +/* WOTS parameters. 
*/ +#if SPX_WOTS_W == 256 + #define SPX_WOTS_LOGW 8 +#elif SPX_WOTS_W == 16 + #define SPX_WOTS_LOGW 4 +#else + #error SPX_WOTS_W assumed 16 or 256 +#endif + +#define SPX_WOTS_LEN1 (8 * SPX_N / SPX_WOTS_LOGW) + +/* SPX_WOTS_LEN2 is floor(log(len_1 * (w - 1)) / log(w)) + 1; we precompute */ +#if SPX_WOTS_W == 256 + #if SPX_N <= 1 + #define SPX_WOTS_LEN2 1 + #elif SPX_N <= 256 + #define SPX_WOTS_LEN2 2 + #else + #error Did not precompute SPX_WOTS_LEN2 for n outside {2, .., 256} + #endif +#elif SPX_WOTS_W == 16 + #if SPX_N <= 8 + #define SPX_WOTS_LEN2 2 + #elif SPX_N <= 136 + #define SPX_WOTS_LEN2 3 + #elif SPX_N <= 256 + #define SPX_WOTS_LEN2 4 + #else + #error Did not precompute SPX_WOTS_LEN2 for n outside {2, .., 256} + #endif +#endif + +#define SPX_WOTS_LEN (SPX_WOTS_LEN1 + SPX_WOTS_LEN2) +#define SPX_WOTS_BYTES (SPX_WOTS_LEN * SPX_N) +#define SPX_WOTS_PK_BYTES SPX_WOTS_BYTES + +/* Subtree size. */ +#define SPX_TREE_HEIGHT (SPX_FULL_HEIGHT / SPX_D) + +#if SPX_TREE_HEIGHT * SPX_D != SPX_FULL_HEIGHT + #error SPX_D should always divide SPX_FULL_HEIGHT +#endif + +/* FORS parameters. */ +#define SPX_FORS_MSG_BYTES ((SPX_FORS_HEIGHT * SPX_FORS_TREES + 7) / 8) +#define SPX_FORS_BYTES ((SPX_FORS_HEIGHT + 1) * SPX_FORS_TREES * SPX_N) +#define SPX_FORS_PK_BYTES SPX_N + +/* Resulting SPX sizes. */ +#define SPX_BYTES (SPX_N + SPX_FORS_BYTES + SPX_D * SPX_WOTS_BYTES +\ + SPX_FULL_HEIGHT * SPX_N) +#define SPX_PK_BYTES (2 * SPX_N) +#define SPX_SK_BYTES (2 * SPX_N + SPX_PK_BYTES) + +#include "../shake_offsets.h" + +#endif diff --git a/sphincsplus/sphincsplus-keccakx2/params/params-sphincs-shake-256f.h b/sphincsplus/sphincsplus-keccakx2/params/params-sphincs-shake-256f.h new file mode 100644 index 0000000..e301c28 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/params/params-sphincs-shake-256f.h @@ -0,0 +1,80 @@ +#ifndef SPX_PARAMS_H +#define SPX_PARAMS_H + +#define SPX_NAMESPACE(s) SPX_##s + +/* Hash output length in bytes. */ +#define SPX_N 32 +/* Height of the hypertree. 
*/ +#define SPX_FULL_HEIGHT 68 +/* Number of subtree layer. */ +#define SPX_D 17 +/* FORS tree dimensions. */ +#define SPX_FORS_HEIGHT 9 +#define SPX_FORS_TREES 35 +/* Winternitz parameter, */ +#define SPX_WOTS_W 16 + +/* The hash function is defined by linking a different hash.c file, as opposed + to setting a #define constant. */ + +/* For clarity */ +#define SPX_ADDR_BYTES 32 + +/* WOTS parameters. */ +#if SPX_WOTS_W == 256 + #define SPX_WOTS_LOGW 8 +#elif SPX_WOTS_W == 16 + #define SPX_WOTS_LOGW 4 +#else + #error SPX_WOTS_W assumed 16 or 256 +#endif + +#define SPX_WOTS_LEN1 (8 * SPX_N / SPX_WOTS_LOGW) + +/* SPX_WOTS_LEN2 is floor(log(len_1 * (w - 1)) / log(w)) + 1; we precompute */ +#if SPX_WOTS_W == 256 + #if SPX_N <= 1 + #define SPX_WOTS_LEN2 1 + #elif SPX_N <= 256 + #define SPX_WOTS_LEN2 2 + #else + #error Did not precompute SPX_WOTS_LEN2 for n outside {2, .., 256} + #endif +#elif SPX_WOTS_W == 16 + #if SPX_N <= 8 + #define SPX_WOTS_LEN2 2 + #elif SPX_N <= 136 + #define SPX_WOTS_LEN2 3 + #elif SPX_N <= 256 + #define SPX_WOTS_LEN2 4 + #else + #error Did not precompute SPX_WOTS_LEN2 for n outside {2, .., 256} + #endif +#endif + +#define SPX_WOTS_LEN (SPX_WOTS_LEN1 + SPX_WOTS_LEN2) +#define SPX_WOTS_BYTES (SPX_WOTS_LEN * SPX_N) +#define SPX_WOTS_PK_BYTES SPX_WOTS_BYTES + +/* Subtree size. */ +#define SPX_TREE_HEIGHT (SPX_FULL_HEIGHT / SPX_D) + +#if SPX_TREE_HEIGHT * SPX_D != SPX_FULL_HEIGHT + #error SPX_D should always divide SPX_FULL_HEIGHT +#endif + +/* FORS parameters. */ +#define SPX_FORS_MSG_BYTES ((SPX_FORS_HEIGHT * SPX_FORS_TREES + 7) / 8) +#define SPX_FORS_BYTES ((SPX_FORS_HEIGHT + 1) * SPX_FORS_TREES * SPX_N) +#define SPX_FORS_PK_BYTES SPX_N + +/* Resulting SPX sizes. 
*/ +#define SPX_BYTES (SPX_N + SPX_FORS_BYTES + SPX_D * SPX_WOTS_BYTES +\ + SPX_FULL_HEIGHT * SPX_N) +#define SPX_PK_BYTES (2 * SPX_N) +#define SPX_SK_BYTES (2 * SPX_N + SPX_PK_BYTES) + +#include "../shake_offsets.h" + +#endif diff --git a/sphincsplus/sphincsplus-keccakx2/params/params-sphincs-shake-256s.h b/sphincsplus/sphincsplus-keccakx2/params/params-sphincs-shake-256s.h new file mode 100644 index 0000000..0a96894 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/params/params-sphincs-shake-256s.h @@ -0,0 +1,80 @@ +#ifndef SPX_PARAMS_H +#define SPX_PARAMS_H + +#define SPX_NAMESPACE(s) SPX_##s + +/* Hash output length in bytes. */ +#define SPX_N 32 +/* Height of the hypertree. */ +#define SPX_FULL_HEIGHT 64 +/* Number of subtree layer. */ +#define SPX_D 8 +/* FORS tree dimensions. */ +#define SPX_FORS_HEIGHT 14 +#define SPX_FORS_TREES 22 +/* Winternitz parameter, */ +#define SPX_WOTS_W 16 + +/* The hash function is defined by linking a different hash.c file, as opposed + to setting a #define constant. */ + +/* For clarity */ +#define SPX_ADDR_BYTES 32 + +/* WOTS parameters. 
*/ +#if SPX_WOTS_W == 256 + #define SPX_WOTS_LOGW 8 +#elif SPX_WOTS_W == 16 + #define SPX_WOTS_LOGW 4 +#else + #error SPX_WOTS_W assumed 16 or 256 +#endif + +#define SPX_WOTS_LEN1 (8 * SPX_N / SPX_WOTS_LOGW) + +/* SPX_WOTS_LEN2 is floor(log(len_1 * (w - 1)) / log(w)) + 1; we precompute */ +#if SPX_WOTS_W == 256 + #if SPX_N <= 1 + #define SPX_WOTS_LEN2 1 + #elif SPX_N <= 256 + #define SPX_WOTS_LEN2 2 + #else + #error Did not precompute SPX_WOTS_LEN2 for n outside {2, .., 256} + #endif +#elif SPX_WOTS_W == 16 + #if SPX_N <= 8 + #define SPX_WOTS_LEN2 2 + #elif SPX_N <= 136 + #define SPX_WOTS_LEN2 3 + #elif SPX_N <= 256 + #define SPX_WOTS_LEN2 4 + #else + #error Did not precompute SPX_WOTS_LEN2 for n outside {2, .., 256} + #endif +#endif + +#define SPX_WOTS_LEN (SPX_WOTS_LEN1 + SPX_WOTS_LEN2) +#define SPX_WOTS_BYTES (SPX_WOTS_LEN * SPX_N) +#define SPX_WOTS_PK_BYTES SPX_WOTS_BYTES + +/* Subtree size. */ +#define SPX_TREE_HEIGHT (SPX_FULL_HEIGHT / SPX_D) + +#if SPX_TREE_HEIGHT * SPX_D != SPX_FULL_HEIGHT + #error SPX_D should always divide SPX_FULL_HEIGHT +#endif + +/* FORS parameters. */ +#define SPX_FORS_MSG_BYTES ((SPX_FORS_HEIGHT * SPX_FORS_TREES + 7) / 8) +#define SPX_FORS_BYTES ((SPX_FORS_HEIGHT + 1) * SPX_FORS_TREES * SPX_N) +#define SPX_FORS_PK_BYTES SPX_N + +/* Resulting SPX sizes. 
*/ +#define SPX_BYTES (SPX_N + SPX_FORS_BYTES + SPX_D * SPX_WOTS_BYTES +\ + SPX_FULL_HEIGHT * SPX_N) +#define SPX_PK_BYTES (2 * SPX_N) +#define SPX_SK_BYTES (2 * SPX_N + SPX_PK_BYTES) + +#include "../shake_offsets.h" + +#endif diff --git a/sphincsplus/sphincsplus-keccakx2/randombytes.h b/sphincsplus/sphincsplus-keccakx2/randombytes.h new file mode 100644 index 0000000..6499aa3 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/randombytes.h @@ -0,0 +1,8 @@ +#ifndef SPX_RANDOMBYTES_H +#define SPX_RANDOMBYTES_H +#include +#include + +extern void randombytes(uint8_t * x, size_t xlen); + +#endif diff --git a/sphincsplus/sphincsplus-keccakx2/shake_offsets.h b/sphincsplus/sphincsplus-keccakx2/shake_offsets.h new file mode 100644 index 0000000..176360f --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/shake_offsets.h @@ -0,0 +1,21 @@ +#if !defined( SHAKE_OFFSETS_H_ ) +#define SHAKE_OFFSETS_H_ + +/* + * Offsets of various fields in the address structure when we use SHAKE as + * the Sphincs+ hash function + */ + +#define SPX_OFFSET_LAYER 3 /* The byte used to specify the Merkle tree layer */ +#define SPX_OFFSET_TREE 8 /* The start of the 8 byte field used to specify the tree */ +#define SPX_OFFSET_TYPE 19 /* The byte used to specify the hash type (reason) */ +#define SPX_OFFSET_KP_ADDR2 22 /* The high byte used to specify the key pair (which one-time signature) */ +#define SPX_OFFSET_KP_ADDR1 23 /* The low byte used to specify the key pair */ +#define SPX_OFFSET_CHAIN_ADDR 27 /* The byte used to specify the chain address (which Winternitz chain) */ +#define SPX_OFFSET_HASH_ADDR 31 /* The byte used to specify the hash address (where in the Winternitz chain) */ +#define SPX_OFFSET_TREE_HGT 27 /* The byte used to specify the height of this node in the FORS or Merkle tree */ +#define SPX_OFFSET_TREE_INDEX 28 /* The start of the 4 byte field used to specify the node in the FORS or Merkle tree */ + +#define SPX_SHAKE 1 + +#endif /* SHAKE_OFFSETS_H_ */ diff --git 
a/sphincsplus/sphincsplus-keccakx2/sign.c b/sphincsplus/sphincsplus-keccakx2/sign.c new file mode 100644 index 0000000..fbfc76e --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/sign.c @@ -0,0 +1,287 @@ +#include <stddef.h> +#include <string.h> +#include <stdint.h> + +#include "api.h" +#include "params.h" +#include "wots.h" +#include "fors.h" +#include "hash.h" +#include "thash.h" +#include "address.h" +#include "randombytes.h" +#include "utils.h" +#include "merkle.h" + +/* + * Returns the length of a secret key, in bytes + */ +unsigned long long crypto_sign_secretkeybytes(void) +{ + return CRYPTO_SECRETKEYBYTES; +} + +/* + * Returns the length of a public key, in bytes + */ +unsigned long long crypto_sign_publickeybytes(void) +{ + return CRYPTO_PUBLICKEYBYTES; +} + +/* + * Returns the length of a signature, in bytes + */ +unsigned long long crypto_sign_bytes(void) +{ + return CRYPTO_BYTES; +} + +/* + * Returns the length of the seed required to generate a key pair, in bytes + */ +unsigned long long crypto_sign_seedbytes(void) +{ + return CRYPTO_SEEDBYTES; +} + +/* + * Generates an SPX key pair given a seed of length CRYPTO_SEEDBYTES. + * Format sk: [SK_SEED || SK_PRF || PUB_SEED || root] + * Format pk: [PUB_SEED || root] + */ +int crypto_sign_seed_keypair(unsigned char *pk, unsigned char *sk, + const unsigned char *seed) +{ + spx_ctx ctx; + + /* Initialize SK_SEED, SK_PRF and PUB_SEED from seed. */ + memcpy(sk, seed, CRYPTO_SEEDBYTES); + + memcpy(pk, sk + 2*SPX_N, SPX_N); + + memcpy(ctx.pub_seed, pk, SPX_N); + memcpy(ctx.sk_seed, sk, SPX_N); + + /* This hook allows the hash function instantiation to do whatever + preparation or computation it needs, based on the public seed. */ + initialize_hash_function(&ctx); + + /* Compute root node of the top-most subtree. */ + merkle_gen_root(sk + 3*SPX_N, &ctx); + + memcpy(pk + SPX_N, sk + 3*SPX_N, SPX_N); + + return 0; +} + +/* + * Generates an SPX key pair. 
+ * Format sk: [SK_SEED || SK_PRF || PUB_SEED || root] + * Format pk: [PUB_SEED || root] + */ +int crypto_sign_keypair(unsigned char *pk, unsigned char *sk) +{ + unsigned char seed[CRYPTO_SEEDBYTES]; + randombytes(seed, CRYPTO_SEEDBYTES); + crypto_sign_seed_keypair(pk, sk, seed); + + return 0; +} + +/** + * Returns an array containing a detached signature. + */ +int crypto_sign_signature(uint8_t *sig, size_t *siglen, + const uint8_t *m, size_t mlen, const uint8_t *sk) +{ + spx_ctx ctx; + + const unsigned char *sk_prf = sk + SPX_N; + const unsigned char *pk = sk + 2*SPX_N; + + unsigned char optrand[SPX_N]; + unsigned char mhash[SPX_FORS_MSG_BYTES]; + unsigned char root[SPX_N]; + unsigned long long i; + uint64_t tree; + uint32_t idx_leaf; + uint32_t wots_addr[8] = {0}; + uint32_t tree_addr[8] = {0}; + + memcpy(ctx.sk_seed, sk, SPX_N); + memcpy(ctx.pub_seed, pk, SPX_N); + + /* This hook allows the hash function instantiation to do whatever + preparation or computation it needs, based on the public seed. */ + initialize_hash_function(&ctx); + + set_type(wots_addr, SPX_ADDR_TYPE_WOTS); + set_type(tree_addr, SPX_ADDR_TYPE_HASHTREE); + + /* Optionally, signing can be made non-deterministic using optrand. + This can help counter side-channel attacks that would benefit from + getting a large number of traces when the signer uses the same nodes. */ + randombytes(optrand, SPX_N); + /* Compute the digest randomization value. */ + gen_message_random(sig, sk_prf, optrand, m, mlen, &ctx); + + /* Derive the message digest and leaf index from R, PK and M. */ + hash_message(mhash, &tree, &idx_leaf, sig, pk, m, mlen, &ctx); + sig += SPX_N; + + set_tree_addr(wots_addr, tree); + set_keypair_addr(wots_addr, idx_leaf); + + /* Sign the message hash using FORS. 
*/ + fors_sign(sig, root, mhash, &ctx, wots_addr); + sig += SPX_FORS_BYTES; + + for (i = 0; i < SPX_D; i++) { + set_layer_addr(tree_addr, i); + set_tree_addr(tree_addr, tree); + + copy_subtree_addr(wots_addr, tree_addr); + set_keypair_addr(wots_addr, idx_leaf); + + merkle_sign(sig, root, &ctx, wots_addr, tree_addr, idx_leaf); + sig += SPX_WOTS_BYTES + SPX_TREE_HEIGHT * SPX_N; + + /* Update the indices for the next layer. */ + idx_leaf = (tree & ((1 << SPX_TREE_HEIGHT)-1)); + tree = tree >> SPX_TREE_HEIGHT; + } + + *siglen = SPX_BYTES; + + return 0; +} + +/** + * Verifies a detached signature and message under a given public key. + */ +int crypto_sign_verify(const uint8_t *sig, size_t siglen, + const uint8_t *m, size_t mlen, const uint8_t *pk) +{ + spx_ctx ctx; + const unsigned char *pub_root = pk + SPX_N; + unsigned char mhash[SPX_FORS_MSG_BYTES]; + unsigned char wots_pk[SPX_WOTS_BYTES]; + unsigned char root[SPX_N]; + unsigned char leaf[SPX_N]; + unsigned int i; + uint64_t tree; + uint32_t idx_leaf; + uint32_t wots_addr[8] = {0}; + uint32_t tree_addr[8] = {0}; + uint32_t wots_pk_addr[8] = {0}; + + if (siglen != SPX_BYTES) { + return -1; + } + + memcpy(ctx.pub_seed, pk, SPX_N); + + /* This hook allows the hash function instantiation to do whatever + preparation or computation it needs, based on the public seed. */ + initialize_hash_function(&ctx); + + set_type(wots_addr, SPX_ADDR_TYPE_WOTS); + set_type(tree_addr, SPX_ADDR_TYPE_HASHTREE); + set_type(wots_pk_addr, SPX_ADDR_TYPE_WOTSPK); + + /* Derive the message digest and leaf index from R || PK || M. */ + /* The additional SPX_N is a result of the hash domain separator. */ + hash_message(mhash, &tree, &idx_leaf, sig, pk, m, mlen, &ctx); + sig += SPX_N; + + /* Layer correctly defaults to 0, so no need to set_layer_addr */ + set_tree_addr(wots_addr, tree); + set_keypair_addr(wots_addr, idx_leaf); + + fors_pk_from_sig(root, sig, mhash, &ctx, wots_addr); + sig += SPX_FORS_BYTES; + + /* For each subtree.. 
*/ + for (i = 0; i < SPX_D; i++) { + set_layer_addr(tree_addr, i); + set_tree_addr(tree_addr, tree); + + copy_subtree_addr(wots_addr, tree_addr); + set_keypair_addr(wots_addr, idx_leaf); + + copy_keypair_addr(wots_pk_addr, wots_addr); + + /* The WOTS public key is only correct if the signature was correct. */ + /* Initially, root is the FORS pk, but on subsequent iterations it is + the root of the subtree below the currently processed subtree. */ + wots_pk_from_sig(wots_pk, sig, root, &ctx, wots_addr); + sig += SPX_WOTS_BYTES; + + /* Compute the leaf node using the WOTS public key. */ + thash(leaf, wots_pk, SPX_WOTS_LEN, &ctx, wots_pk_addr); + + /* Compute the root node of this subtree. */ + compute_root(root, leaf, idx_leaf, 0, sig, SPX_TREE_HEIGHT, + &ctx, tree_addr); + sig += SPX_TREE_HEIGHT * SPX_N; + + /* Update the indices for the next layer. */ + idx_leaf = (tree & ((1 << SPX_TREE_HEIGHT)-1)); + tree = tree >> SPX_TREE_HEIGHT; + } + + /* Check if the root node equals the root node in the public key. */ + if (memcmp(root, pub_root, SPX_N)) { + return -1; + } + + return 0; +} + + +/** + * Returns an array containing the signature followed by the message. + */ +int crypto_sign(unsigned char *sm, unsigned long long *smlen, + const unsigned char *m, unsigned long long mlen, + const unsigned char *sk) +{ + size_t siglen; + + crypto_sign_signature(sm, &siglen, m, (size_t)mlen, sk); + + memmove(sm + SPX_BYTES, m, mlen); + *smlen = siglen + mlen; + + return 0; +} + +/** + * Verifies a given signature-message pair under a given public key. + */ +int crypto_sign_open(unsigned char *m, unsigned long long *mlen, + const unsigned char *sm, unsigned long long smlen, + const unsigned char *pk) +{ + /* The API caller does not necessarily know what size a signature should be + but SPHINCS+ signatures are always exactly SPX_BYTES. 
*/ + if (smlen < SPX_BYTES) { + memset(m, 0, smlen); + *mlen = 0; + return -1; + } + + *mlen = smlen - SPX_BYTES; + + if (crypto_sign_verify(sm, SPX_BYTES, sm + SPX_BYTES, (size_t)*mlen, pk)) { + memset(m, 0, smlen); + *mlen = 0; + return -1; + } + + /* If verification was successful, move the message to the right place. */ + memmove(m, sm + SPX_BYTES, *mlen); + + return 0; +} diff --git a/sphincsplus/sphincsplus-keccakx2/test/benchmark.c b/sphincsplus/sphincsplus-keccakx2/test/benchmark.c new file mode 100644 index 0000000..aa23203 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/test/benchmark.c @@ -0,0 +1,199 @@ +#define _POSIX_C_SOURCE 199309L + +#include +#include +#include + +#include "../thash.h" +#include "../thashx2.h" +#include "../api.h" +#include "../f1600x2.h" +#include "../fors.h" +#include "../wots.h" +#include "../wotsx2.h" +#include "../params.h" +#include "../randombytes.h" + +#include "cycles.h" + +#define SPX_MLEN 32 +#define NTESTS 10 + +static void wots_gen_pkx2(unsigned char *pk, const spx_ctx *ctx, + uint32_t addr[8]); + +static int cmp_llu(const void *a, const void*b) +{ + if(*(unsigned long long *)a < *(unsigned long long *)b) return -1; + if(*(unsigned long long *)a > *(unsigned long long *)b) return 1; + return 0; +} + +static unsigned long long median(unsigned long long *l, size_t llen) +{ + qsort(l,llen,sizeof(unsigned long long),cmp_llu); + + if(llen%2) return l[llen/2]; + else return (l[llen/2-1]+l[llen/2])/2; +} + +static void delta(unsigned long long *l, size_t llen) +{ + unsigned int i; + for(i = 0; i < llen - 1; i++) { + l[i] = l[i+1] - l[i]; + } +} + +static void printfcomma (unsigned long long n) +{ + if (n < 1000) { + printf("%llu", n); + return; + } + printfcomma(n / 1000); + printf (",%03llu", n % 1000); +} + +static void printfalignedcomma (unsigned long long n, int len) +{ + unsigned long long ncopy = n; + int i = 0; + + while (ncopy > 9) { + len -= 1; + ncopy /= 10; + i += 1; // to account for commas + } + i = i/3 - 
1; // to account for commas + for (; i < len; i++) { + printf(" "); + } + printfcomma(n); +} + +static void display_result(double result, unsigned long long *l, size_t llen, unsigned long long mul) +{ + unsigned long long med; + + result /= NTESTS; + delta(l, NTESTS + 1); + med = median(l, llen); + printf("avg. %11.2lf us (%2.2lf sec); median ", result, result / 1e6); + printfalignedcomma(med, 12); + printf(" cycles, %5llux: ", mul); + printfalignedcomma(mul*med, 12); + printf(" cycles\n"); +} + +#define MEASURE_GENERIC(TEXT, MUL, FNCALL, CORR)\ + printf(TEXT);\ + clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);\ + for(i = 0; i < NTESTS; i++) {\ + t[i] = cpucycles() / CORR;\ + FNCALL;\ + }\ + t[NTESTS] = cpucycles() / CORR; /* scaled like t[i] so the final delta is in the same units */\ + clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &stop);\ + result = ((stop.tv_sec - start.tv_sec) * 1e6 + \ + (stop.tv_nsec - start.tv_nsec) / 1e3) / (double)CORR;\ + display_result(result, t, NTESTS, MUL); +#define MEASURT(TEXT, MUL, FNCALL)\ + MEASURE_GENERIC(\ + TEXT, MUL,\ + do {\ + for (int j = 0; j < 1000; j++) {\ + FNCALL;\ + }\ + } while (0);,\ + 1000); +#define MEASURE(TEXT, MUL, FNCALL) MEASURE_GENERIC(TEXT, MUL, FNCALL, 1) + + +static void check_overflow(){ + if(is_cpucycles_overflow()){ + printf("cycle counter overflowed; error\n"); + exit(-1); + } +} +int main() +{ + init_cpucycles(); + /* Make stdout buffer more responsive. 
*/ + setbuf(stdout, NULL); + + spx_ctx ctx; + unsigned char pk[SPX_PK_BYTES]; + unsigned char sk[SPX_SK_BYTES]; + unsigned char *m = malloc(SPX_MLEN); + unsigned char *sm = malloc(SPX_BYTES + SPX_MLEN); + unsigned char *mout = malloc(SPX_BYTES + SPX_MLEN); + + unsigned char fors_pk[SPX_FORS_PK_BYTES]; + unsigned char fors_m[SPX_FORS_MSG_BYTES]; + unsigned char fors_sig[SPX_FORS_BYTES]; + unsigned char addr[SPX_ADDR_BYTES*2]; + unsigned char wots_pk[4*SPX_WOTS_PK_BYTES]; + unsigned char block[SPX_N]; + + unsigned long long smlen; + unsigned long long mlen; + unsigned long long t[NTESTS+1]; + struct timespec start, stop; + double result; + int i; + uint64_t statex2[50]; + + randombytes(m, SPX_MLEN); + randombytes(addr, SPX_ADDR_BYTES*2); + + printf("Parameters: n = %d, h = %d, d = %d, b = %d, k = %d, w = %d\n", + SPX_N, SPX_FULL_HEIGHT, SPX_D, SPX_FORS_HEIGHT, SPX_FORS_TREES, + SPX_WOTS_W); + + printf("Running %d iterations.\n", NTESTS); + check_overflow(); + reset_cpucycles(); + MEASURT("thash ", 1, thash(block, block, 1, &ctx, (uint32_t*)addr)); + check_overflow(); + reset_cpucycles(); + MEASURT("f1600x2 ", 1, f1600x2(statex2)); + check_overflow(); + reset_cpucycles(); + MEASURT("thashx2 ", 1, thashx2(block, block, block, block, 1, &ctx, (uint32_t*)addr)); + check_overflow(); + reset_cpucycles(); + MEASURE("Generating keypair.. ", 1, crypto_sign_keypair(pk, sk)); + check_overflow(); + reset_cpucycles(); + MEASURE(" - WOTS pk gen 2x.. ", (1 << SPX_TREE_HEIGHT) / 2, wots_gen_pkx2(wots_pk, &ctx, (uint32_t *) addr)); + check_overflow(); + reset_cpucycles(); + MEASURE("Signing.. ", 1, crypto_sign(sm, &smlen, m, SPX_MLEN, sk)); + check_overflow(); + reset_cpucycles(); + MEASURE(" - FORS signing.. ", 1, fors_sign(fors_sig, fors_pk, fors_m, &ctx, (uint32_t *) addr)); + check_overflow(); + reset_cpucycles(); + MEASURE(" - WOTS pk gen x2.. 
", SPX_D * (1 << SPX_TREE_HEIGHT) / 2, wots_gen_pkx2(wots_pk, &ctx, (uint32_t *) addr)); + check_overflow(); + reset_cpucycles(); + MEASURE("Verifying.. ", 1, crypto_sign_open(mout, &mlen, sm, smlen, pk)); + + printf("Signature size: %d (%.2f KiB)\n", SPX_BYTES, SPX_BYTES / 1024.0); + printf("Public key size: %d (%.2f KiB)\n", SPX_PK_BYTES, SPX_PK_BYTES / 1024.0); + printf("Secret key size: %d (%.2f KiB)\n", SPX_SK_BYTES, SPX_SK_BYTES / 1024.0); + + free(m); + free(sm); + free(mout); + + return 0; +} + +static void wots_gen_pkx2(unsigned char *pk, const spx_ctx *ctx, uint32_t addr[8]) { + struct leaf_info_x2 leaf; + unsigned steps[ SPX_WOTS_LEN ] = { 0 }; + INITIALIZE_LEAF_INFO_X2(leaf, addr, steps); + wots_gen_leafx2(pk, ctx, 0, &leaf); +} diff --git a/sphincsplus/sphincsplus-keccakx2/test/cycles.c b/sphincsplus/sphincsplus-keccakx2/test/cycles.c new file mode 100644 index 0000000..78efb89 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/test/cycles.c @@ -0,0 +1,138 @@ +#include "cycles.h" + + + +#if defined(PMU_CYCLES) +void enable_cyclecounter() { + uint64_t tmp; + __asm __volatile ( + "mrs %[tmp], pmcr_el0\n" + "orr %[tmp], %[tmp], #1\n" + "msr pmcr_el0, %[tmp]\n" + "mrs %[tmp], PMOVSCLR_EL0\n" // reset overflow bit + "orr %[tmp], %[tmp], #(1<<31)\n" + "msr PMOVSCLR_EL0, %[tmp]\n" + "mrs %[tmp], pmcntenset_el0\n" + "orr %[tmp], %[tmp], #1<<31\n" + "msr pmcntenset_el0, %[tmp]\n" + : [tmp] "=r" (tmp) + ); +} + +void disable_cyclecounter() { + uint64_t tmp; + __asm __volatile ( + "mov %[tmp], #0x3f\n" + "orr %[tmp], %[tmp], #1<<31\n" + "msr pmcntenclr_el0, %[tmp]\n" + : [tmp] "=r" (tmp) + ); +} + +uint64_t get_cyclecounter() { + uint64_t retval; + __asm __volatile ( + "mrs %[retval], pmccntr_el0\n" + : [retval] "=r" (retval)); + return retval; +} +// Somehow weird things happen as soon as the cycle counter reaches 2^32. +// In theory, there is a long counter mode (bit 6 of pmcr_el0), but I did not +// get it to work yet. 
+// Instead, we reset the cycle counter after each experiment and make sure that +// it never overflows. +void reset_cpucycles() { + uint64_t tmp; + __asm __volatile ( + "mrs %[tmp], pmcr_el0\n" + "orr %[tmp], %[tmp], #(1<<2)\n" // reset cycle counter + "msr pmcr_el0, %[tmp]\n" + : [tmp] "=r" (tmp) + ); +} + +int is_cpucycles_overflow(){ + uint32_t val; + __asm __volatile("mrs %0, PMOVSSET_EL0" : "=r"(val)); + return (val & (1U<<31)); +} +#elif defined(PERF_CYCLES) + +#include +#include +#include +#include +#include +#include +#include +#include + +static int perf_fd = 0; +void enable_cyclecounter() { + struct perf_event_attr pe; + memset(&pe, 0, sizeof(struct perf_event_attr)); + pe.type = PERF_TYPE_HARDWARE; + pe.size = sizeof(struct perf_event_attr); + pe.config = PERF_COUNT_HW_CPU_CYCLES; + pe.disabled = 1; + pe.exclude_kernel = 1; + pe.exclude_hv = 1; + + perf_fd = syscall(__NR_perf_event_open, &pe, 0, -1, -1, 0); + + ioctl(perf_fd, PERF_EVENT_IOC_RESET, 0); + ioctl(perf_fd, PERF_EVENT_IOC_ENABLE, 0); +} + +void disable_cyclecounter() { + ioctl(perf_fd, PERF_EVENT_IOC_DISABLE, 0); + close(perf_fd); +} + +uint64_t get_cyclecounter() { + long long cpu_cycles; + ioctl(perf_fd, PERF_EVENT_IOC_DISABLE, 0); + ssize_t read_count = read(perf_fd, &cpu_cycles, sizeof(cpu_cycles)); + if (read_count < 0) { + perror("read"); + exit(EXIT_FAILURE); + } else if (read_count == 0) { + /* Should not happen */ + printf("perf counter empty\n"); + exit(EXIT_FAILURE); + } + ioctl(perf_fd, PERF_EVENT_IOC_ENABLE, 0); + return cpu_cycles; +} + +void reset_cpucycles(void) { + return; +} +int is_cpucycles_overflow(void){ + return 0; +} + +#elif defined(EXTERNAL_CYCLES) + +// nothing to do + +#else /* NO_CYCLES */ + +void enable_cyclecounter() { + return; +} +void disable_cyclecounter() { + return; +} +uint64_t get_cyclecounter() { + return(0); +} + +void reset_cpucycles(void) { + return; +} +int is_cpucycles_overflow(void){ + return 0; +} + +#endif /* NO_CYCLES */ \ No newline at end of 
file diff --git a/sphincsplus/sphincsplus-keccakx2/test/cycles.h b/sphincsplus/sphincsplus-keccakx2/test/cycles.h new file mode 100644 index 0000000..bc3dff4 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/test/cycles.h @@ -0,0 +1,21 @@ +#ifndef SPX_CYCLES_H +#define SPX_CYCLES_H + +#include <stdint.h> + +#if !defined(EXTERNAL_CYCLES) && !defined(PERF_CYCLES) && !defined(PMU_CYCLES) && !defined(NO_CYCLES) +#define NO_CYCLES +#endif + +void enable_cyclecounter(void); +void disable_cyclecounter(void); +uint64_t get_cyclecounter(void); +void reset_cpucycles(void); +int is_cpucycles_overflow(void); + + +#define init_cpucycles enable_cyclecounter +#define cpucycles get_cyclecounter + + +#endif diff --git a/sphincsplus/sphincsplus-keccakx2/test/fors.c b/sphincsplus/sphincsplus-keccakx2/test/fors.c new file mode 100644 index 0000000..970c745 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/test/fors.c @@ -0,0 +1,41 @@ +#include <stdio.h> +#include <string.h> + +#include "../context.h" +#include "../hash.h" +#include "../fors.h" +#include "../randombytes.h" +#include "../params.h" + +int main() +{ + /* Make stdout buffer more responsive. */ + setbuf(stdout, NULL); + + spx_ctx ctx; + + unsigned char pk1[SPX_FORS_PK_BYTES]; + unsigned char pk2[SPX_FORS_PK_BYTES]; + unsigned char sig[SPX_FORS_BYTES]; + unsigned char m[SPX_FORS_MSG_BYTES]; + uint32_t addr[8] = {0}; + + randombytes(ctx.sk_seed, SPX_N); + randombytes(ctx.pub_seed, SPX_N); + randombytes(m, SPX_FORS_MSG_BYTES); + randombytes((unsigned char *)addr, 8 * sizeof(uint32_t)); + + printf("Testing FORS signature and PK derivation.. 
"); + + initialize_hash_function(&ctx); + + fors_sign(sig, pk1, m, &ctx, addr); + fors_pk_from_sig(pk2, sig, m, &ctx, addr); + + if (memcmp(pk1, pk2, SPX_FORS_PK_BYTES)) { + printf("failed!\n"); + return -1; + } + printf("successful.\n"); + return 0; +} diff --git a/sphincsplus/sphincsplus-keccakx2/test/randombytes.c b/sphincsplus/sphincsplus-keccakx2/test/randombytes.c new file mode 100644 index 0000000..2761695 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/test/randombytes.c @@ -0,0 +1,43 @@ +/* +This code was taken from the SPHINCS reference implementation and is public domain. +*/ + +#include <fcntl.h> +#include <unistd.h> + +#include "randombytes.h" + +static int fd = -1; + +void randombytes(uint8_t *x, size_t xlen) +{ + int i; + + if (fd == -1) { + for (;;) { + fd = open("/dev/urandom", O_RDONLY); + if (fd != -1) { + break; + } + sleep(1); + } + } + + while (xlen > 0) { + if (xlen < 1048576) { + i = xlen; + } + else { + i = 1048576; + } + + i = read(fd, x, i); + if (i < 1) { + sleep(1); + continue; + } + + x += i; + xlen -= i; + } +} diff --git a/sphincsplus/sphincsplus-keccakx2/test/spx.c b/sphincsplus/sphincsplus-keccakx2/test/spx.c new file mode 100644 index 0000000..31f3337 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/test/spx.c @@ -0,0 +1,125 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "../api.h" +#include "../params.h" +#include "../randombytes.h" + +#define SPX_MLEN 32 +#define SPX_SIGNATURES 1 + +int main() +{ + int ret = 0; + int i; + + /* Make stdout buffer more responsive. */ + setbuf(stdout, NULL); + + unsigned char pk[SPX_PK_BYTES]; + unsigned char sk[SPX_SK_BYTES]; + unsigned char *m = malloc(SPX_MLEN); + unsigned char *sm = malloc(SPX_BYTES + SPX_MLEN); + unsigned char *mout = malloc(SPX_BYTES + SPX_MLEN); + unsigned long long smlen; + unsigned long long mlen; + + randombytes(m, SPX_MLEN); + + printf("Generating keypair.. 
"); + + if (crypto_sign_keypair(pk, sk)) { + printf("failed!\n"); + return -1; + } + printf("successful.\n"); + + printf("Testing %d signatures.. \n", SPX_SIGNATURES); + + for (i = 0; i < SPX_SIGNATURES; i++) { + printf(" - iteration #%d:\n", i); + + crypto_sign(sm, &smlen, m, SPX_MLEN, sk); + + if (smlen != SPX_BYTES + SPX_MLEN) { + printf(" X smlen incorrect [%llu != %u]!\n", + smlen, SPX_BYTES + SPX_MLEN); + ret = -1; + } + else { + printf(" smlen as expected [%llu].\n", smlen); + } + + /* Test if signature is valid. */ + if (crypto_sign_open(mout, &mlen, sm, smlen, pk)) { + printf(" X verification failed!\n"); + ret = -1; + } + else { + printf(" verification succeeded.\n"); + } + + /* Test if the correct message was recovered. */ + if (mlen != SPX_MLEN) { + printf(" X mlen incorrect [%llu != %u]!\n", mlen, SPX_MLEN); + ret = -1; + } + else { + printf(" mlen as expected [%llu].\n", mlen); + } + if (memcmp(m, mout, SPX_MLEN)) { + printf(" X output message incorrect!\n"); + ret = -1; + } + else { + printf(" output message as expected.\n"); + } + + /* Test if signature is valid when validating in-place. */ + if (crypto_sign_open(sm, &mlen, sm, smlen, pk)) { + printf(" X in-place verification failed!\n"); + ret = -1; + } + else { + printf(" in-place verification succeeded.\n"); + } + + /* Test if flipping bits invalidates the signature (it should). */ + + /* Flip the lowest bit of the last message byte. Should invalidate. NOTE(review): the successful in-place open above moved the message over the start of sm, so sm is no longer a valid signed message and this check is expected to reject regardless of the flipped bit; consider re-signing before the negative tests. */ + sm[smlen - 1] ^= 1; + if (!crypto_sign_open(mout, &mlen, sm, smlen, pk)) { + printf(" X flipping a bit of m DID NOT invalidate signature!\n"); + ret = -1; + } + else { + printf(" flipping a bit of m invalidates signature.\n"); + } + sm[smlen - 1] ^= 1; + +#ifdef SPX_TEST_INVALIDSIG + int j; + /* Flip one bit per hash; the signature is entirely hashes. 
*/ + for (j = 0; j < (int)(smlen - SPX_MLEN); j += SPX_N) { + sm[j] ^= 1; + if (!crypto_sign_open(mout, &mlen, sm, smlen, pk)) { + printf(" X flipping bit %d DID NOT invalidate sig + m!\n", j); + sm[j] ^= 1; + ret = -1; + break; + } + sm[j] ^= 1; + } + if (j >= (int)(smlen - SPX_MLEN)) { + printf(" changing any signature hash invalidates signature.\n"); + } +#endif + } + + free(m); + free(sm); + free(mout); + + return ret; +} diff --git a/sphincsplus/sphincsplus-keccakx2/thash.h b/sphincsplus/sphincsplus-keccakx2/thash.h new file mode 100644 index 0000000..ec9222c --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/thash.h @@ -0,0 +1,12 @@ +#ifndef SPX_THASHX2_AS_ONE +#define SPX_THASHX2_AS_ONE + +#include +#include "context.h" + +void thash(unsigned char *out, const unsigned char *in, unsigned int inblocks, + const spx_ctx *ctx, uint32_t addr[8]); + + +#endif + diff --git a/sphincsplus/sphincsplus-keccakx2/thash_shake_robustx2.c b/sphincsplus/sphincsplus-keccakx2/thash_shake_robustx2.c new file mode 100644 index 0000000..3f8c5b5 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/thash_shake_robustx2.c @@ -0,0 +1,173 @@ +#include +#include + +#include "thash.h" +#include "thashx2.h" +#include "address.h" +#include "params.h" + +#include "f1600x2.h" +#include "fips202x2.h" + + +void thash(unsigned char *out, + const unsigned char *in, + unsigned int inblocks, + const spx_ctx *ctx, uint32_t addr[8]) { + uint32_t addrx2 [2*8] = { + addr[0], addr[1], addr[2], addr[3], addr[4], addr[5], addr[6], addr[7], + addr[0], addr[1], addr[2], addr[3], addr[4], addr[5], addr[6], addr[7] + }; + thashx2(out, out, in, in, inblocks, ctx, addrx2); +} + +/** + * 2-way parallel version of thash; takes 2x as much input and output + */ +void thashx2(unsigned char *out0, + unsigned char *out1, + const unsigned char *in0, + const unsigned char *in1, + unsigned int inblocks, + const spx_ctx *ctx, uint32_t addrx2[2*8]) +{ + if (SPX_N <= 32 && (inblocks == 1 || inblocks == 2)) { + /* As we 
write and read only a few quadwords, it is more efficient to + * build and extract from the twoway SHAKE256 state by hand. */ + uint64_t state[50] = {0}; + uint64_t state2[50]; + + for (int i = 0; i < SPX_N/8; i++) { + uint64_t x = load64(ctx->pub_seed + 8*i); + state[2*i] = x; + state[2*i+1] = x; + } + for (int i = 0; i < 4; i++) { + state[2*(SPX_N/8 + i)] = (((uint64_t)addrx2[1+2*i]) << 32) + | (uint64_t)addrx2[2*i]; + state[2*(SPX_N/8 + i) + 1] = (((uint64_t)addrx2[8+1+2*i]) << 32) + | (uint64_t)addrx2[8+2*i]; + } + + /* Domain separator and padding. */ + state[2*16] = 0x80ll << 56; + state[2*16+1] = 0x80ll << 56; + + state[2*((SPX_N/8)+4)] ^= 0x1f; + state[2*((SPX_N/8)+4)+1] ^= 0x1f; + + /* We will permutate state2 with f1600x2 to compute the bitmask, + * but first we'll copy it to state2 which will be used to compute + * the final output, as its input is almost identical. */ + memcpy(state2, state, 400); + + f1600x2(state); + + /* By copying from state, state2 already contains the pub_seed + * and address. We just need to copy in the input blocks xorred with + * the bitmask we just computed. */ + for (unsigned int i = 0; i < (SPX_N/8) * inblocks; i++) { + state2[2*(SPX_N/8+4+i)] = state[2*i] ^ load64(in0 + 8*i); + state2[2*(SPX_N/8+4+i)+1] = state[2*i+1] ^ load64(in1 + 8*i); + } + + /* Domain separator and start of padding. Note that the quadwords + * around are already zeroed for state from which we copied. + * We do a XOR instead of a set as this might be the 16th quadword + * when N=32 and inblocks=2, which already contains the end + * of the padding. 
*/ + state2[2*((SPX_N/8)*(1+inblocks)+4)] ^= 0x1f; + state2[2*((SPX_N/8)*(1+inblocks)+4)+1] ^= 0x1f; + + f1600x2(state2); + + for (int i = 0; i < SPX_N/8; i++) { + store64(out0 + 8*i, state2[2*i]); + store64(out1 + 8*i, state2[2*i+1]); + } + } else if (SPX_N == 64 && (inblocks == 1 || inblocks == 2)) { + /* As we write and read only a few quadwords, it is more efficient to + * build and extract from the twoway SHAKE256 state by hand. */ + uint64_t state[50] = {0}; + uint64_t state2[50]; + + for (int i = 0; i < SPX_N/8; i++) { + uint64_t x = load64(ctx->pub_seed + 8*i); + state[2*i] = x; + state[2*i+1] = x; + } + for (int i = 0; i < 4; i++) { + state[2*(SPX_N/8 + i)] = (((uint64_t)addrx2[1+2*i]) << 32) + | (uint64_t)addrx2[2*i]; + state[2*(SPX_N/8 + i) + 1] = (((uint64_t)addrx2[8+1+2*i]) << 32) + | (uint64_t)addrx2[8+2*i]; + } + + /* Domain separator and padding. */ + state[2*16] = 0x80ll << 56; + state[2*16+1] = 0x80ll << 56; + + state[2*((SPX_N/8)+4)] ^= 0x1f; + state[2*((SPX_N/8)+4)+1] ^= 0x1f; + + /* We will permutate state with f1600x2 to compute the bitmask, + * but first we'll copy it to state2 which will be used to compute + * the final output, as its input is almost identical. */ + memcpy(state2, state, 400); + + f1600x2(state); + + /* We won't be able to fit all input in one go. + * By copying from state, state2 already contains the pub_seed + * and address. We just need to copy in the input blocks xorred with + * the bitmask we just computed. */ + for (int i = 0; i < 5; i++) { + state2[2*(8+4+i)] = state[2*i] ^ load64(in0 + 8*i); + state2[2*(8+4+i)+1] = state[2*i+1] ^ load64(in1 + 8*i); + } + + f1600x2(state2); + + /* Final input. */ + for (int i = 0; i < 3+8*(inblocks-1); i++) { + state2[2*i] = state2[2*i] ^ state[2*(i+5)] ^ load64(in0 + 8*(i+5)); + state2[2*i+1] = state2[2*i+1] ^ state[2*(i+5)+1] + ^ load64(in1 + 8*(i+5)); + } + + /* Domain separator and padding. 
*/ + state2[2*(3+8*(inblocks-1))] ^= 0x1f; + state2[2*(3+8*(inblocks-1))+1] ^= 0x1f; + state2[2*16] ^= 0x80ll << 56; + state2[2*16+1] ^= 0x80ll << 56; + + f1600x2(state2); + + for (int i = 0; i < 8; i++) { + store64(out0 + 8*i, state2[2*i]); + store64(out1 + 8*i, state2[2*i+1]); + } + } else { + unsigned char buf0[SPX_N + SPX_ADDR_BYTES + inblocks*SPX_N]; + unsigned char buf1[SPX_N + SPX_ADDR_BYTES + inblocks*SPX_N]; + unsigned char bitmask0[inblocks * SPX_N]; + unsigned char bitmask1[inblocks * SPX_N]; + unsigned int i; + + memcpy(buf0, ctx->pub_seed, SPX_N); + memcpy(buf1, ctx->pub_seed, SPX_N); + memcpy(buf0 + SPX_N, addrx2 + 0*8, SPX_ADDR_BYTES); + memcpy(buf1 + SPX_N, addrx2 + 1*8, SPX_ADDR_BYTES); + + shake256x2(bitmask0, bitmask1, inblocks * SPX_N, + buf0, buf1, SPX_N + SPX_ADDR_BYTES); + + for (i = 0; i < inblocks * SPX_N; i++) { + buf0[SPX_N + SPX_ADDR_BYTES + i] = in0[i] ^ bitmask0[i]; + buf1[SPX_N + SPX_ADDR_BYTES + i] = in1[i] ^ bitmask1[i]; + } + + shake256x2(out0, out1, SPX_N, + buf0, buf1, SPX_N + SPX_ADDR_BYTES + inblocks*SPX_N); + } +} diff --git a/sphincsplus/sphincsplus-keccakx2/thash_shake_simplex2.c b/sphincsplus/sphincsplus-keccakx2/thash_shake_simplex2.c new file mode 100644 index 0000000..309aaa8 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/thash_shake_simplex2.c @@ -0,0 +1,123 @@ +#include <stdint.h> +#include <string.h> + +#include "thash.h" +#include "thashx2.h" +#include "address.h" +#include "params.h" + +#include "f1600x2.h" +#include "fips202x2.h" + + +void thash(unsigned char *out, + const unsigned char *in, + unsigned int inblocks, + const spx_ctx *ctx, uint32_t addr[8]) { + uint32_t addrx2 [2*8] = { + addr[0], addr[1], addr[2], addr[3], addr[4], addr[5], addr[6], addr[7], + addr[0], addr[1], addr[2], addr[3], addr[4], addr[5], addr[6], addr[7] + }; + thashx2(out, out, in, in, inblocks, ctx, addrx2); +} + +/** + * 2-way parallel version of thash; takes 2x as much input and output + */ +void thashx2(unsigned char *out0, + unsigned char *out1, + const 
unsigned char *in0, + const unsigned char *in1, + unsigned int inblocks, + const spx_ctx *ctx, uint32_t addrx2[2*8]) +{ + if (SPX_N <= 32 && (inblocks == 1 || inblocks == 2)) { + /* As we write and read only a few quadwords, it is more efficient to + * build and extract from the twoway SHAKE256 state by hand. */ + uint64_t state[50] = {0}; + for (int i = 0; i < SPX_N/8; i++) { + uint64_t x = load64(ctx->pub_seed + 8*i); + state[2*i] = x; + state[2*i+1] = x; + } + for (int i = 0; i < 4; i++) { + state[2*(SPX_N/8 + i)] = (((uint64_t)addrx2[1+2*i]) << 32) + | (uint64_t)addrx2[2*i]; + state[2*(SPX_N/8 + i) + 1] = (((uint64_t)addrx2[8+1+2*i]) << 32) + | (uint64_t)addrx2[8+2*i]; + } + + for (unsigned int i = 0; i < (SPX_N/8) * inblocks; i++) { + state[2*(SPX_N/8+4+i)] = load64(in0+8*i); + state[2*(SPX_N/8+4+i)+1] = load64(in1+8*i); + } + + /* Domain separator and padding. */ + state[2*16] = 0x80ll << 56; + state[2*16+1] = 0x80ll << 56; + + state[2*((SPX_N/8)*(1+inblocks)+4)] ^= 0x1f; + state[2*((SPX_N/8)*(1+inblocks)+4)+1] ^= 0x1f; + + f1600x2(state); + + for (int i = 0; i < SPX_N/8; i++) { + store64(out0 + 8*i, state[2*i]); + store64(out1 + 8*i, state[2*i+1]); + } + } else if (SPX_N == 64 && (inblocks == 1 || inblocks == 2)) { + uint64_t state[50] = {0}; + for (int i = 0; i < SPX_N/8; i++) { + uint64_t x = load64(ctx->pub_seed + 8*i); + state[2*i] = x; + state[2*i+1] = x; + } + for (int i = 0; i < 4; i++) { + state[2*(SPX_N/8 + i)] = (((uint64_t)addrx2[1+2*i]) << 32) + | (uint64_t)addrx2[2*i]; + state[2*(SPX_N/8 + i) + 1] = (((uint64_t)addrx2[8+1+2*i]) << 32) + | (uint64_t)addrx2[8+2*i]; + } + + for (unsigned int i = 0; i < 5; i++) { /* only 5 input lanes fit after seed and address in the first 17-lane block; the rest is absorbed as the final input */ + state[2*(SPX_N/8+4+i)] = load64(in0+8*i); + state[2*(SPX_N/8+4+i)+1] = load64(in1+8*i); + } + + f1600x2(state); + + /* Final input. 
*/ + for (unsigned int i = 0; i < 3+8*(inblocks-1); i++) { + state[2*i] ^= load64(in0+8*(i+5)); + state[2*i+1] ^= load64(in1+8*(i+5)); + } + + /* Domain separator and padding. 
*/ + state[2*16] ^= 0x80ll << 56; + state[2*16+1] ^= 0x80ll << 56; + + state[2*(3+8*(inblocks-1))] ^= 0x1f; + state[2*(3+8*(inblocks-1))+1] ^= 0x1f; + + f1600x2(state); + + for (int i = 0; i < SPX_N/8; i++) { + store64(out0 + 8*i, state[2*i]); + store64(out1 + 8*i, state[2*i+1]); + } + + } else { + unsigned char buf0[SPX_N + SPX_ADDR_BYTES + inblocks*SPX_N]; + unsigned char buf1[SPX_N + SPX_ADDR_BYTES + inblocks*SPX_N]; + + memcpy(buf0, ctx->pub_seed, SPX_N); + memcpy(buf1, ctx->pub_seed, SPX_N); + memcpy(buf0 + SPX_N, addrx2 + 0*8, SPX_ADDR_BYTES); + memcpy(buf1 + SPX_N, addrx2 + 1*8, SPX_ADDR_BYTES); + memcpy(buf0 + SPX_N + SPX_ADDR_BYTES, in0, inblocks * SPX_N); + memcpy(buf1 + SPX_N + SPX_ADDR_BYTES, in1, inblocks * SPX_N); + + shake256x2(out0, out1, SPX_N, + buf0, buf1, SPX_N + SPX_ADDR_BYTES + inblocks*SPX_N); + } +} diff --git a/sphincsplus/sphincsplus-keccakx2/thashx2.h b/sphincsplus/sphincsplus-keccakx2/thashx2.h new file mode 100644 index 0000000..bde4f59 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/thashx2.h @@ -0,0 +1,16 @@ +#ifndef SPX_THASHX2_H +#define SPX_THASHX2_H + +#include +#include "context.h" +#include "params.h" + +#define thashx2 SPX_NAMESPACE(thashx2) +void thashx2(unsigned char *out0, + unsigned char *out1, + const unsigned char *in0, + const unsigned char *in1, + unsigned int inblocks, + const spx_ctx *ctx, uint32_t addrx2[2*8]); + +#endif diff --git a/sphincsplus/sphincsplus-keccakx2/utils.c b/sphincsplus/sphincsplus-keccakx2/utils.c new file mode 100644 index 0000000..a3a3b37 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/utils.c @@ -0,0 +1,154 @@ +#include + +#include "utils.h" +#include "params.h" +#include "hash.h" +#include "thash.h" +#include "address.h" + +/** + * Converts the value of 'in' to 'outlen' bytes in big-endian byte order. + */ +void ull_to_bytes(unsigned char *out, unsigned int outlen, + unsigned long long in) +{ + int i; + + /* Iterate over out in decreasing order, for big-endianness. 
*/ + for (i = outlen - 1; i >= 0; i--) { + out[i] = in & 0xff; + in = in >> 8; + } +} + +void u32_to_bytes(unsigned char *out, uint32_t in) +{ + out[0] = (unsigned char)(in >> 24); + out[1] = (unsigned char)(in >> 16); + out[2] = (unsigned char)(in >> 8); + out[3] = (unsigned char)in; +} + +/** + * Converts the inlen bytes in 'in' from big-endian byte order to an integer. + */ +unsigned long long bytes_to_ull(const unsigned char *in, unsigned int inlen) +{ + unsigned long long retval = 0; + unsigned int i; + + for (i = 0; i < inlen; i++) { + retval |= ((unsigned long long)in[i]) << (8*(inlen - 1 - i)); + } + return retval; +} + +/** + * Computes a root node given a leaf and an auth path. + * Expects address to be complete other than the tree_height and tree_index. + */ +void compute_root(unsigned char *root, const unsigned char *leaf, + uint32_t leaf_idx, uint32_t idx_offset, + const unsigned char *auth_path, uint32_t tree_height, + const spx_ctx *ctx, uint32_t addr[8]) +{ + uint32_t i; + unsigned char buffer[2 * SPX_N]; + + /* If leaf_idx is odd (last bit = 1), current path element is a right child + and auth_path has to go left. Otherwise it is the other way around. */ + if (leaf_idx & 1) { + memcpy(buffer + SPX_N, leaf, SPX_N); + memcpy(buffer, auth_path, SPX_N); + } + else { + memcpy(buffer, leaf, SPX_N); + memcpy(buffer + SPX_N, auth_path, SPX_N); + } + auth_path += SPX_N; + + for (i = 0; i < tree_height - 1; i++) { + leaf_idx >>= 1; + idx_offset >>= 1; + /* Set the address of the node we're creating. */ + set_tree_height(addr, i + 1); + set_tree_index(addr, leaf_idx + idx_offset); + + /* Pick the right or left neighbor, depending on parity of the node. */ + if (leaf_idx & 1) { + thash(buffer + SPX_N, buffer, 2, ctx, addr); + memcpy(buffer, auth_path, SPX_N); + } + else { + thash(buffer, buffer, 2, ctx, addr); + memcpy(buffer + SPX_N, auth_path, SPX_N); + } + auth_path += SPX_N; + } + + /* The last iteration is exceptional; we do not copy an auth_path node. 
*/ + leaf_idx >>= 1; + idx_offset >>= 1; + set_tree_height(addr, tree_height); + set_tree_index(addr, leaf_idx + idx_offset); + thash(root, buffer, 2, ctx, addr); +} + +/** + * For a given leaf index, computes the authentication path and the resulting + * root node using Merkle's TreeHash algorithm. + * Expects the layer and tree parts of the tree_addr to be set, as well as the + * tree type (i.e. SPX_ADDR_TYPE_HASHTREE or SPX_ADDR_TYPE_FORSTREE). + * Applies the offset idx_offset to indices before building addresses, so that + * it is possible to continue counting indices across trees. + */ +void treehash(unsigned char *root, unsigned char *auth_path, const spx_ctx* ctx, + uint32_t leaf_idx, uint32_t idx_offset, uint32_t tree_height, + void (*gen_leaf)( + unsigned char* /* leaf */, + const spx_ctx* /* ctx */, + uint32_t /* addr_idx */, const uint32_t[8] /* tree_addr */), + uint32_t tree_addr[8]) +{ + unsigned char stack[(tree_height + 1)*SPX_N]; + unsigned int heights[tree_height + 1]; + unsigned int offset = 0; + uint32_t idx; + uint32_t tree_idx; + + for (idx = 0; idx < (uint32_t)(1 << tree_height); idx++) { + /* Add the next leaf node to the stack. */ + gen_leaf(stack + offset*SPX_N, ctx, idx + idx_offset, tree_addr); + offset++; + heights[offset - 1] = 0; + + /* If this is a node we need for the auth path.. */ + if ((leaf_idx ^ 0x1) == idx) { + memcpy(auth_path, stack + (offset - 1)*SPX_N, SPX_N); + } + + /* While the top-most nodes are of equal height.. */ + while (offset >= 2 && heights[offset - 1] == heights[offset - 2]) { + /* Compute index of the new node, in the next layer. */ + tree_idx = (idx >> (heights[offset - 1] + 1)); + + /* Set the address of the node we're creating. */ + set_tree_height(tree_addr, heights[offset - 1] + 1); + set_tree_index(tree_addr, + tree_idx + (idx_offset >> (heights[offset-1] + 1))); + /* Hash the top-most nodes from the stack together. 
*/ + thash(stack + (offset - 2)*SPX_N, + stack + (offset - 2)*SPX_N, 2, ctx, tree_addr); + offset--; + /* Note that the top-most node is now one layer higher. */ + heights[offset - 1]++; + + /* If this is a node we need for the auth path.. */ + if (((leaf_idx >> heights[offset - 1]) ^ 0x1) == tree_idx) { + memcpy(auth_path + heights[offset - 1]*SPX_N, + stack + (offset - 1)*SPX_N, SPX_N); + } + } + } + memcpy(root, stack, SPX_N); +} diff --git a/sphincsplus/sphincsplus-keccakx2/utils.h b/sphincsplus/sphincsplus-keccakx2/utils.h new file mode 100644 index 0000000..defe9e7 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/utils.h @@ -0,0 +1,52 @@ +#ifndef SPX_UTILS_H +#define SPX_UTILS_H + +#include +#include "params.h" +#include "context.h" + + +/** + * Converts the value of 'in' to 'outlen' bytes in big-endian byte order. + */ +#define ull_to_bytes SPX_NAMESPACE(ull_to_bytes) +void ull_to_bytes(unsigned char *out, unsigned int outlen, + unsigned long long in); +#define u32_to_bytes SPX_NAMESPACE(u32_to_bytes) +void u32_to_bytes(unsigned char *out, uint32_t in); + +/** + * Converts the inlen bytes in 'in' from big-endian byte order to an integer. + */ +#define bytes_to_ull SPX_NAMESPACE(bytes_to_ull) +unsigned long long bytes_to_ull(const unsigned char *in, unsigned int inlen); + +/** + * Computes a root node given a leaf and an auth path. + * Expects address to be complete other than the tree_height and tree_index. + */ +#define compute_root SPX_NAMESPACE(compute_root) +void compute_root(unsigned char *root, const unsigned char *leaf, + uint32_t leaf_idx, uint32_t idx_offset, + const unsigned char *auth_path, uint32_t tree_height, + const spx_ctx *ctx, uint32_t addr[8]); + +/** + * For a given leaf index, computes the authentication path and the resulting + * root node using Merkle's TreeHash algorithm. + * Expects the layer and tree parts of the tree_addr to be set, as well as the + * tree type (i.e. SPX_ADDR_TYPE_HASHTREE or SPX_ADDR_TYPE_FORSTREE). 
+ * Applies the offset idx_offset to indices before building addresses, so that + * it is possible to continue counting indices across trees. + */ +#define treehash SPX_NAMESPACE(treehash) +void treehash(unsigned char *root, unsigned char *auth_path, + const spx_ctx* ctx, + uint32_t leaf_idx, uint32_t idx_offset, uint32_t tree_height, + void (*gen_leaf)( + unsigned char* /* leaf */, + const spx_ctx* ctx /* ctx */, + uint32_t /* addr_idx */, const uint32_t[8] /* tree_addr */), + uint32_t tree_addr[8]); + +#endif diff --git a/sphincsplus/sphincsplus-keccakx2/utilsx2.c b/sphincsplus/sphincsplus-keccakx2/utilsx2.c new file mode 100644 index 0000000..d427ce6 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/utilsx2.c @@ -0,0 +1,130 @@ +#include + +#include "utils.h" +#include "utilsx2.h" +#include "params.h" +#include "thashx2.h" +#include "address.h" + +/* + * Generate the entire Merkle tree, computing the authentication path for leaf_idx, + * and the resulting root node using Merkle's TreeHash algorithm. + * Expects the layer and tree parts of the tree_addr to be set, as well as the + * tree type (i.e. SPX_ADDR_TYPE_HASHTREE or SPX_ADDR_TYPE_FORSTREE) + * + * This expects tree_addrx2 to be initialized to 2 parallel addr structures for + * the Merkle tree nodes + * + * Applies the offset idx_offset to indices before building addresses, so that + * it is possible to continue counting indices across trees. + * + * This works by using the standard Merkle tree building algorithm, except + * that each 'node' tracked is actually 2 consecutive nodes in the real tree. + * When we combine two logical nodes AB and WX, we perform the H + * operation on adjacent real nodes, forming the parent logical node + * (AB)(WX) + * + * When we get to the top level of the real tree (where there is only + * one logical node), we continue this operation one more time; the right + * most real node will by the actual root (and the other node will be + * garbage). 
We follow the same thashx2 logic so that the 'extract + * authentication path components' part of the loop is still executed (and + * to simplify the code somewhat) + */ +void treehashx2(unsigned char *root, unsigned char *auth_path, + const spx_ctx *ctx, + uint32_t leaf_idx, uint32_t idx_offset, + uint32_t tree_height, + void (*gen_leafx2)( + unsigned char* /* Where to write the leaves */, + const spx_ctx*, + uint32_t idx, void *info), + uint32_t tree_addrx2[2*8], + void *info) +{ + /* This is where we keep the intermediate nodes */ + unsigned char stackx2[tree_height*2*SPX_N]; + uint32_t left_adj = 0, prev_left_adj = 0; /* When we're doing the top */ + /* level, the left-most part of the tree isn't at the beginning */ + /* of current[]. These give the offset of the actual start */ + + uint32_t idx; + uint32_t max_idx = (1 << (tree_height-1)) - 1; + for (idx = 0;; idx++) { + unsigned char current[2*SPX_N]; /* Current logical node */ + gen_leafx2( current, ctx, 2*idx + idx_offset, + info ); + + /* Now combine the freshly generated right node with previously */ + /* generated left ones */ + uint32_t internal_idx_offset = idx_offset; + uint32_t internal_idx = idx; + uint32_t internal_leaf = leaf_idx; + uint32_t h; /* The height we are in the Merkle tree */ + for (h=0;; h++, internal_idx >>= 1, internal_leaf >>= 1) { + + /* Special processing if we're at the top of the tree */ + if (h >= tree_height - 1) { + if (h == tree_height) { + /* We hit the root; return it */ + memcpy( root, &current[1*SPX_N], SPX_N ); + return; + } + /* The tree indexing logic is a bit off in this case */ + /* Adjust it so that the left-most node of the part of */ + /* the tree that we're processing has index 0 */ + prev_left_adj = left_adj; + left_adj = 2 - (1 << (tree_height - h - 1)); + } + + /* Check if we hit the top of the tree */ + if (h == tree_height) { + /* We hit the root; return it */ + memcpy( root, &current[1*SPX_N], SPX_N ); + return; + } + + /* + * Check if one of the nodes we have is a 
part of the + * authentication path; if it is, write it out + */ + if ((((internal_idx << 1) ^ internal_leaf) & ~0x1) == 0) { + memcpy( &auth_path[ h * SPX_N ], + &current[(((internal_leaf&1)^1) + prev_left_adj) * SPX_N], + SPX_N ); + } + + /* + * Check if we're at a left child; if so, stop going up the stack + * Exception: if we've reached the end of the tree, keep on going + * (so we combine the last 2 nodes into the one root node in two + * more iterations) + */ + if ((internal_idx & 1) == 0 && idx < max_idx) { + break; + } + + /* Ok, we're at a right node (or doing the top 3 levels) */ + /* Now combine the left and right logical nodes together */ + + /* Set the address of the node we're creating. */ + int j; + internal_idx_offset >>= 1; + for (j = 0; j < 2; j++) { + set_tree_height(tree_addrx2 + j*8, h + 1); + set_tree_index(tree_addrx2 + j*8, + (2/2) * (internal_idx&~1) + j - left_adj + internal_idx_offset ); + } + unsigned char *left = &stackx2[h * 2 * SPX_N]; + thashx2( &current[0 * SPX_N], + &current[1 * SPX_N], + &left [0 * SPX_N], + &current[0 * SPX_N], + 2, ctx, tree_addrx2); + } + + /* We've hit a left child; save the current for when we get the */ + /* corresponding right child */ + memcpy( &stackx2[h * 2 * SPX_N], current, 2 * SPX_N); + } +} diff --git a/sphincsplus/sphincsplus-keccakx2/utilsx2.h b/sphincsplus/sphincsplus-keccakx2/utilsx2.h new file mode 100644 index 0000000..3fcfb9d --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/utilsx2.h @@ -0,0 +1,28 @@ +#ifndef SPX_UTILSX2_H +#define SPX_UTILSX2_H + +#include <stdint.h> +#include "params.h" + +/** + * For a given leaf index, computes the authentication path and the resulting + * root node using Merkle's TreeHash algorithm. + * Expects the layer and tree parts of the tree_addr to be set, as well as the + * tree type (i.e. SPX_ADDR_TYPE_HASHTREE or SPX_ADDR_TYPE_FORSTREE). + * Applies the offset idx_offset to indices before building addresses, so that + * it is possible to continue counting indices across trees. 
+ * + * This implementation uses SIMD to compute internal nodes 2 at a time (in + * parallel) + */ +#define treehashx2 SPX_NAMESPACE(treehashx2) +void treehashx2(unsigned char *root, unsigned char *auth_path, + const spx_ctx *ctx, + uint32_t leaf_idx, uint32_t idx_offset, uint32_t tree_height, + void (*gen_leafx2)( + unsigned char* /* Where to write the leaves */, + const spx_ctx* /* ctx */, + uint32_t addr_idx, void *info), + uint32_t tree_addrx2[2*8], void *info); + +#endif diff --git a/sphincsplus/sphincsplus-keccakx2/wots.c b/sphincsplus/sphincsplus-keccakx2/wots.c new file mode 100644 index 0000000..ef0235b --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/wots.c @@ -0,0 +1,261 @@ +#include +#include + +#include "utils.h" +#include "utilsx2.h" +#include "hash.h" +#include "hashx2.h" +#include "thashx2.h" +#include "wots.h" +#include "wotsx2.h" +#include "address.h" +#include "params.h" + +// TODO clarify address expectations, and make them more uniform. +// TODO i.e. do we expect types to be set already? +// TODO and do we expect modifications or copies? + +/** + * Computes up the chains + */ +static void gen_chains( + unsigned char *out, + const unsigned char *in, + unsigned int start[SPX_WOTS_LEN], + unsigned int steps[SPX_WOTS_LEN], + const spx_ctx *ctx, + uint32_t addr[8]) +{ + uint32_t i, j, k, idx, watching; + int done; + unsigned char empty[SPX_N]; + unsigned char *bufs[4]; + uint32_t addrs[8*2]; + + int l; + uint16_t counts[SPX_WOTS_W] = { 0 }; + uint16_t idxs[SPX_WOTS_LEN]; + uint16_t total, newTotal; + + /* set addrs = {addr, addr} */ + for (j = 0; j < 2; j++) { + memcpy(addrs+j*8, addr, sizeof(uint32_t) * 8); + } + + /* Initialize out with the value at position 'start'. */ + memcpy(out, in, SPX_WOTS_LEN*SPX_N); + + /* Sort the chains in reverse order by steps using counting sort. 
*/ + for (i = 0; i < SPX_WOTS_LEN; i++) { + counts[steps[i]]++; + } + total = 0; + for (l = SPX_WOTS_W - 1; l >= 0; l--) { + newTotal = counts[l] + total; + counts[l] = total; + total = newTotal; + } + for (i = 0; i < SPX_WOTS_LEN; i++) { + idxs[counts[steps[i]]] = i; + counts[steps[i]]++; + } + + /* We got our work cut out for us: do it! */ + for (i = 0; i < SPX_WOTS_LEN; i += 2) { + for (j = 0; j < 2 && i+j < SPX_WOTS_LEN; j++) { + idx = idxs[i+j]; + set_chain_addr(addrs+j*8, idx); + bufs[j] = out + SPX_N * idx; + } + + /* As the chains are sorted in reverse order, we know that the first + * chain is the longest and the last one is the shortest. We keep + * an eye on whether the last chain is done and then on the one before, + * et cetera. */ + watching = 1; + done = 0; + while (i + watching >= SPX_WOTS_LEN) { + bufs[watching] = &empty[0]; + watching--; + } + + for (k = 0;; k++) { + while (k == steps[idxs[i+watching]]) { + bufs[watching] = &empty[0]; + if (watching == 0) { + done = 1; + break; + } + watching--; + } + if (done) { + break; + } + for (j = 0; j < watching + 1; j++) { + set_hash_addr(addrs+j*8, k + start[idxs[i+j]]); + } + + thashx2(bufs[0], bufs[1], + bufs[0], bufs[1], 1, ctx, addrs); + } + } +} + +/** + * base_w algorithm as described in draft. + * Interprets an array of bytes as integers in base w. + * This only works when log_w is a divisor of 8. + */ +static void base_w(unsigned int *output, const int out_len, + const unsigned char *input) +{ + int in = 0; + int out = 0; + unsigned char total; + int bits = 0; + int consumed; + + for (consumed = 0; consumed < out_len; consumed++) { + if (bits == 0) { + total = input[in]; + in++; + bits += 8; + } + bits -= SPX_WOTS_LOGW; + output[out] = (total >> bits) & (SPX_WOTS_W - 1); + out++; + } +} + +/* Computes the WOTS+ checksum over a message (in base_w). 
*/ +static void wots_checksum(unsigned int *csum_base_w, + const unsigned int *msg_base_w) +{ + unsigned int csum = 0; + unsigned char csum_bytes[(SPX_WOTS_LEN2 * SPX_WOTS_LOGW + 7) / 8]; + unsigned int i; + + /* Compute checksum. */ + for (i = 0; i < SPX_WOTS_LEN1; i++) { + csum += SPX_WOTS_W - 1 - msg_base_w[i]; + } + + /* Convert checksum to base_w. */ + /* Make sure expected empty zero bits are the least significant bits. */ + csum = csum << ((8 - ((SPX_WOTS_LEN2 * SPX_WOTS_LOGW) % 8)) % 8); + ull_to_bytes(csum_bytes, sizeof(csum_bytes), csum); + base_w(csum_base_w, SPX_WOTS_LEN2, csum_bytes); +} + +/* Takes a message and derives the matching chain lengths. */ +void chain_lengths(unsigned int *lengths, const unsigned char *msg) +{ + base_w(lengths, SPX_WOTS_LEN1, msg); + wots_checksum(lengths + SPX_WOTS_LEN1, lengths); +} + +/** + * Takes a WOTS signature and an n-byte message, computes a WOTS public key. + * + * Writes the computed public key to 'pk'. + */ +void wots_pk_from_sig(unsigned char *pk, + const unsigned char *sig, const unsigned char *msg, + const spx_ctx *ctx, uint32_t addr[8]) +{ + unsigned int steps[SPX_WOTS_LEN]; + unsigned int start[SPX_WOTS_LEN]; + uint32_t i; + + chain_lengths(start, msg); + + for (i = 0; i < SPX_WOTS_LEN; i++) { + steps[i] = SPX_WOTS_W - 1 - start[i]; + } + + gen_chains(pk, sig, start, steps, ctx, addr); +} + +/* + * This generates 2 sequential WOTS public keys + * It also generates the WOTS signature if leaf_info indicates + * that we're signing with one of these WOTS keys + */ +void wots_gen_leafx2(unsigned char *dest, + const spx_ctx *ctx, + uint32_t leaf_idx, void *v_info) { + struct leaf_info_x2 *info = v_info; + uint32_t *leaf_addr = info->leaf_addr; + uint32_t *pk_addr = info->pk_addr; + unsigned int i, j, k; + unsigned char pk_buffer[ 2 * SPX_WOTS_BYTES ]; + unsigned wots_offset = SPX_WOTS_BYTES; + unsigned char *buffer; + uint32_t wots_k_mask; + unsigned wots_sign_index; + + if (((leaf_idx ^ info->wots_sign_leaf) & 
~1) == 0) { + /* We're traversing the leaf that's signing; generate the WOTS */ + /* signature */ + wots_k_mask = 0; + wots_sign_index = info->wots_sign_leaf & 1; /* Which of the 2 */ + /* slots do the signatures come from */ + } else { + /* Nope, we're just generating pk's; turn off the signature logic */ + wots_k_mask = ~0; + wots_sign_index = 0; + } + + for (j = 0; j < 2; j++) { + set_keypair_addr( leaf_addr + j*8, leaf_idx + j ); + set_keypair_addr( pk_addr + j*8, leaf_idx + j ); + } + + for (i = 0, buffer = pk_buffer; i < SPX_WOTS_LEN; i++, buffer += SPX_N) { + uint32_t wots_k = info->wots_steps[i] | wots_k_mask; /* Set wots_k to */ + /* the step if we're generating a signature, ~0 if we're not */ + + /* Start with the secret seed */ + for (j = 0; j < 2; j++) { + set_chain_addr(leaf_addr + j*8, i); + set_hash_addr(leaf_addr + j*8, 0); + set_type(leaf_addr + j*8, SPX_ADDR_TYPE_WOTSPRF); + } + prf_addrx2(buffer + 0*wots_offset, + buffer + 1*wots_offset, + ctx, leaf_addr); + for (j = 0; j < 2; j++) { + set_type(leaf_addr + j*8, SPX_ADDR_TYPE_WOTS); + } + + /* Iterate down the WOTS chain */ + for (k=0;; k++) { + /* Check if one of the values we have needs to be saved as a */ + /* part of the WOTS signature */ + if (k == wots_k) { + memcpy( info->wots_sig + i * SPX_N, + buffer + wots_sign_index*wots_offset, SPX_N ); + } + + /* Check if we hit the top of the chain */ + if (k == SPX_WOTS_W - 1) break; + + /* Iterate one step on both chains */ + for (j = 0; j < 2; j++) { + set_hash_addr(leaf_addr + j*8, k); + } + thashx2(buffer + 0*wots_offset, + buffer + 1*wots_offset, + buffer + 0*wots_offset, + buffer + 1*wots_offset, + 1, ctx, leaf_addr); + } + } + + /* Do the final thash to generate the public keys */ + thashx2(dest + 0*SPX_N, + dest + 1*SPX_N, + pk_buffer + 0*wots_offset, + pk_buffer + 1*wots_offset, + SPX_WOTS_LEN, ctx, pk_addr); +} diff --git a/sphincsplus/sphincsplus-keccakx2/wots.h b/sphincsplus/sphincsplus-keccakx2/wots.h new file mode 100644 index 
0000000..7e77056 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/wots.h @@ -0,0 +1,25 @@ +#ifndef SPX_WOTS_H +#define SPX_WOTS_H + +#include <stdint.h> + +#include "params.h" +#include "context.h" + +/** + * Takes a WOTS signature and an n-byte message, computes a WOTS public key. + * + * Writes the computed public key to 'pk'. + */ +#define wots_pk_from_sig SPX_NAMESPACE(wots_pk_from_sig) +void wots_pk_from_sig(unsigned char *pk, + const unsigned char *sig, const unsigned char *msg, + const spx_ctx *ctx, uint32_t addr[8]); + +/* + * Compute the chain lengths needed for a given message hash + */ +#define chain_lengths SPX_NAMESPACE(chain_lengths) +void chain_lengths(unsigned int *lengths, const unsigned char *msg); + +#endif diff --git a/sphincsplus/sphincsplus-keccakx2/wotsx2.h b/sphincsplus/sphincsplus-keccakx2/wotsx2.h new file mode 100644 index 0000000..6237737 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakx2/wotsx2.h @@ -0,0 +1,40 @@ +#if !defined( WOTSX2_H_ ) +#define WOTSX2_H_ + +#include <string.h> +#include "params.h" + +/* + * This is here to provide an interface to the internal wots_gen_leafx2 + * routine.
While this routine is not referenced in the package outside of + * wots.c, it is called from the stand-alone benchmark code to characterize + * the performance + */ +struct leaf_info_x2 { + unsigned char *wots_sig; + uint32_t wots_sign_leaf; /* The index of the WOTS we're using to sign */ + uint32_t *wots_steps; + uint32_t leaf_addr[2*8]; + uint32_t pk_addr[2*8]; +}; + +/* Macro to set the leaf_info to something 'benign', that is, it would */ +/* run with the same time as it does during the real signing process */ +/* Used only by the benchmark code */ +#define INITIALIZE_LEAF_INFO_X2(info, addr, step_buffer) { \ + info.wots_sig = 0; \ + info.wots_sign_leaf = ~0; \ + info.wots_steps = step_buffer; \ + int i; \ + for (i=0; i<2; i++) { \ + memcpy( &info.leaf_addr[8*i], addr, 32 ); \ + memcpy( &info.pk_addr[8*i], addr, 32 ); \ + } \ +} + +#define wots_gen_leafx2 SPX_NAMESPACE(wots_gen_leafx2) +void wots_gen_leafx2(unsigned char *dest, + const spx_ctx *ctx, + uint32_t leaf_idx, void *v_info); + +#endif /* WOTSX2_H_ */ diff --git a/sphincsplus/sphincsplus-keccakxN/LICENSE b/sphincsplus/sphincsplus-keccakxN/LICENSE new file mode 100644 index 0000000..41b1050 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/LICENSE @@ -0,0 +1,22 @@ +Copyright (c) 2022 Arm Limited +Copyright (c) 2022 Matthias Kannwischer + +SPDX-License-Identifier: MIT + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/sphincsplus/sphincsplus-keccakxN/Makefile b/sphincsplus/sphincsplus-keccakxN/Makefile new file mode 100644 index 0000000..e1c4ed0 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/Makefile @@ -0,0 +1,242 @@ +# +# Copyright (c) 2022 Arm Limited +# Copyright (c) 2022 Matthias Kannwischer +# SPDX-License-Identifier: MIT +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +# + +PARAMS ?= sphincs-shake-128f +THASH ?= robust + +CC=aarch64-none-linux-gnu-gcc +LD=$(CC) + + +CORE ?= X2 + +# PMU / PERF +CYCLES?= PERF + +ifeq ($(CYCLES),PMU) + CFLAGS += -DPMU_CYCLES +endif + +ifeq ($(CYCLES),PERF) + CFLAGS += -DPERF_CYCLES +endif + +ifeq ($(CYCLES),NO) + CFLAGS += -DNO_CYCLES +endif + +DUMMY?=0 +WAY?=4 + +ifeq ($(DUMMY),0) + +ifeq ($(WAY),5) + +ifeq ($(CORE),X2) + KECCAK_X1_IMPL?= keccak_f1600_x1_scalar_asm_v5 + KECCAK_X_IMPL?= keccak_f1600_x5_hybrid_asm_v8p + PLATFORM = v84 +else ifeq ($(CORE),A710) + KECCAK_X1_IMPL?= keccak_f1600_x1_scalar_asm_v5 + KECCAK_X_IMPL?= keccak_f1600_x5_hybrid_asm_v8p + PLATFORM = v84 +else ifeq ($(CORE),A510) + KECCAK_X1_IMPL?= keccak_f1600_x1_scalar_asm_v5 + KECCAK_X_IMPL?= keccak_f1600_x5_hybrid_asm_v8p + PLATFORM = v84 +else ifeq ($(CORE),X1) + KECCAK_X1_IMPL?= keccak_f1600_x1_scalar_asm_v5 + KECCAK_X_IMPL?= keccak_f1600_x5_hybrid_asm_v8p + PLATFORM = v8 +else ifeq ($(CORE),A78) + KECCAK_X1_IMPL?= keccak_f1600_x1_scalar_asm_v5 + KECCAK_X_IMPL?= keccak_f1600_x5_hybrid_asm_v8p + PLATFORM = v8 +else ifeq ($(CORE),A55) + KECCAK_X1_IMPL?= keccak_f1600_x1_scalar_asm_v5 + KECCAK_X_IMPL?= keccak_f1600_x5_hybrid_asm_v8p + PLATFORM = v8 +endif + +else ifeq ($(WAY),4) + +ifeq ($(CORE),X2) + KECCAK_X1_IMPL?= keccak_f1600_x1_scalar_asm_v5 + KECCAK_X_IMPL?= keccak_f1600_x4_hybrid_asm_v4 + PLATFORM = v84 +else ifeq ($(CORE),A710) + KECCAK_X1_IMPL?= keccak_f1600_x1_scalar_asm_v5 + KECCAK_X_IMPL?= keccak_f1600_x4_hybrid_asm_v2 + PLATFORM = v84 +else ifeq ($(CORE),A510) + KECCAK_X1_IMPL?= keccak_f1600_x1_scalar_asm_v5 + KECCAK_X_IMPL?= keccak_f1600_x4_v84a_asm_v1p0 + PLATFORM = v84 +else ifeq ($(CORE),X1) + KECCAK_X1_IMPL?= keccak_f1600_x1_scalar_asm_v5 + KECCAK_X_IMPL?= keccak_f1600_x4_hybrid_asm_v3p + PLATFORM = v8 +else ifeq ($(CORE),A78) + KECCAK_X1_IMPL?= keccak_f1600_x1_scalar_asm_v5 + KECCAK_X_IMPL?= keccak_f1600_x4_hybrid_asm_v3p + PLATFORM = v8 +else ifeq ($(CORE),A55) + KECCAK_X1_IMPL?= 
keccak_f1600_x1_scalar_asm_v5 + KECCAK_X_IMPL?= keccak_f1600_x4_scalar_asm_v5 + PLATFORM = v8 +endif + +else ifeq ($(WAY),3) + +ifeq ($(CORE),X2) + KECCAK_X1_IMPL?= keccak_f1600_x1_scalar_asm_v5 + KECCAK_X_IMPL?= keccak_f1600_x3_hybrid_asm_v6 + PLATFORM = v84 +else ifeq ($(CORE),A710) + KECCAK_X1_IMPL?= keccak_f1600_x1_scalar_asm_v5 + KECCAK_X_IMPL?= keccak_f1600_x3_hybrid_asm_v6 + PLATFORM = v84 +else ifeq ($(CORE),A510) + KECCAK_X1_IMPL?= keccak_f1600_x1_scalar_asm_v5 + KECCAK_X_IMPL?= keccak_f1600_x3_hybrid_asm_v6 + PLATFORM = v84 +else ifeq ($(CORE),X1) + KECCAK_X1_IMPL?= keccak_f1600_x1_scalar_asm_v5 + KECCAK_X_IMPL?= keccak_f1600_x3_hybrid_asm_v3p + PLATFORM = v8 +else ifeq ($(CORE),A78) + KECCAK_X1_IMPL?= keccak_f1600_x1_scalar_asm_v5 + KECCAK_X_IMPL?= keccak_f1600_x3_hybrid_asm_v3p + PLATFORM = v8 +else ifeq ($(CORE),A55) + KECCAK_X1_IMPL?= keccak_f1600_x1_scalar_asm_v5 + KECCAK_X_IMPL?= keccak_f1600_x3_hybrid_asm_v3p + PLATFORM = v8 +endif + +endif + +else + +ifeq ($(CORE),X2) + KECCAK_X1_IMPL?= keccak_f1600_dummy + KECCAK_X_IMPL?= keccak_f1600_dummy + PLATFORM = v84 +else ifeq ($(CORE),A710) + KECCAK_X1_IMPL?= keccak_f1600_dummy + KECCAK_X_IMPL?= keccak_f1600_dummy + PLATFORM = v84 +else ifeq ($(CORE),A510) + KECCAK_X1_IMPL?= keccak_f1600_dummy + KECCAK_X_IMPL?= keccak_f1600_dummy + PLATFORM = v84 +else ifeq ($(CORE),X1) + KECCAK_X1_IMPL?= keccak_f1600_dummy + KECCAK_X_IMPL?= keccak_f1600_dummy + PLATFORM = v8 +else ifeq ($(CORE),A78) + KECCAK_X1_IMPL?= keccak_f1600_dummy + KECCAK_X_IMPL?= keccak_f1600_dummy + PLATFORM = v8 +else ifeq ($(CORE),A55) + KECCAK_X1_IMPL?= keccak_f1600_dummy + KECCAK_X_IMPL?= keccak_f1600_dummy + PLATFORM = v8 +endif + +endif + +ifeq ($(PLATFORM),v84) + CFLAGS += -march=armv8.4-a+crypto+sha3 +else + CFLAGS += -march=armv8-a +endif + +CFLAGS += -I. 
-flto -fpic -Wall -Wextra -Wpedantic -Wmissing-prototypes -O3 \ + -std=c99 -fomit-frame-pointer -DPARAMS=$(PARAMS) \ + -DKECCAK_WAY=$(WAY) \ + -DKECCAK_X1_IMPL=$(KECCAK_X1_IMPL) \ + -DKECCAK_X_IMPL=$(KECCAK_X_IMPL) \ + $(EXTRA_CFLAGS) +LDFLAGS = -static -flto + +SRC_DIR=. +BUILD_DIR=build + +HEADERS=$(wildcard $(SRC_DIR)/*.h) $(wildcard $(SRC_DIR)/test/*.h) + +ifeq ($(DUMMY),0) +ASM_SRC_FILES=keccak_f1600/$(KECCAK_X1_IMPL).s keccak_f1600/$(KECCAK_X_IMPL).s +else +ASM_SRC_FILES=keccak_f1600_dummy.s +endif + +ASM_OBJ_FILES=$(patsubst %.s, $(BUILD_DIR)/%.s.o, $(ASM_SRC_FILES)) + +C_SRC_FILES = address.c fips202.c fips202x.c fors.c hash_shake.c hash_shakex.c merkle.c sign.c utils.c utilsx.c wots.c thash_shake_$(THASH)x.c thash_shake_$(THASH).c +C_SRC_FILES_BENCH=$(C_SRC_FILES) test/benchmark.c test/randombytes.c test/cycles.c +C_OBJ_FILES_BENCH=$(patsubst %.c, $(BUILD_DIR)/%.c.o, $(C_SRC_FILES_BENCH)) +OBJ_FILES_BENCH=$(ASM_OBJ_FILES) $(C_OBJ_FILES_BENCH) + +C_SRC_FILES_FUNCTEST=$(C_SRC_FILES) test/functest.c test/randombytes.c +C_OBJ_FILES_FUNCTEST=$(patsubst %.c, $(BUILD_DIR)/%.c.o, $(C_SRC_FILES_FUNCTEST)) +OBJ_FILES_FUNCTEST=$(ASM_OBJ_FILES) $(C_OBJ_FILES_FUNCTEST) + +C_SRC_FILES_TEST=$(C_SRC_FILES) test/thashx.c test/randombytes.c test/cycles.c +C_OBJ_FILES_TEST=$(patsubst %.c, $(BUILD_DIR)/%.c.o, $(C_SRC_FILES_TEST)) +OBJ_FILES_TEST=$(ASM_OBJ_FILES) $(C_OBJ_FILES_TEST) + + + +.PHONY: clean libclean +all: benchmark functest + +# Compilation +$(BUILD_DIR)/%.c.o: $(SRC_DIR)/%.c $(HEADERS) + mkdir -p $(@D) + $(CC) $(CFLAGS) -c -o $@ $< + +$(BUILD_DIR)/%.s.o: $(SRC_DIR)/%.s $(HEADERS) + mkdir -p $(@D) + $(CC) -x assembler-with-cpp $(CFLAGS) -c -o $@ $< + + +# Linking +benchmark: $(OBJ_FILES_BENCH) $(HEADERS) + mkdir -p $(@D) + $(LD) $(LDFLAGS) $(OBJ_FILES_BENCH) -o benchmark + +# Linking +functest: $(OBJ_FILES_FUNCTEST) $(HEADERS) + mkdir -p $(@D) + $(LD) $(LDFLAGS) $(OBJ_FILES_FUNCTEST) -o functest + + +clean: + -$(RM) -r build + +libclean: + find . 
-type f -executable -exec rm '{}' \; + rm -rf bin diff --git a/sphincsplus/sphincsplus-keccakxN/README.md b/sphincsplus/sphincsplus-keccakxN/README.md new file mode 100644 index 0000000..38f4252 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/README.md @@ -0,0 +1,42 @@ +SPHINCS+ using N-way parallel Keccak-f1600 +========================================== + +Implementation of SPHINCS+ leveraging an N-way parallel Keccak-f1600. +Based on the implementations for N=2 from the [official SPHINCS+ repository](https://github.com/sphincs/sphincsplus). + +## Usage + +To build, run + +``` +CYCLES={NO,PERF,PMU} WAY={3,4,5} CORE={A55,A510,A710,A78,X1,X2} PARAMS=sphincs-shake-{128,192,256}{f,s} THASH={robust,simple} make +``` + +which will build the corresponding benchmark as `./benchmark` and a functional test as `./functest`. +The underlying N-way parallel Keccak-f1600 implementation +is automatically chosen based on the choice of `CORE` and `WAY`. To force a specific implementation, override the +environment variables `KECCAK_X_IMPL` and/or `KECCAK_X1_IMPL`. + +You may also use + +``` +python3 make_all.py +``` + +to generate benchmark binaries for all possible combinations of parameters, stored in [bin/](bin/), and `bench_xN.sh` to +run them. + +You may run the functional tests using `qemu` as +``` +qemu-aarch64 ./functest +``` +## KATs + +The NIST-provided [source +code](https://csrc.nist.gov/projects/post-quantum-cryptography/post-quantum-cryptography-standardization/example-files) +can be used to generate Known-Answer-Tests (KATs) as done for example in the [official SPHINCS+ +repository](https://github.com/sphincs/sphincsplus/tree/master/shake-a64).
+ +## License + +Licensed under MIT; see [LICENSE](LICENSE) diff --git a/sphincsplus/sphincsplus-keccakxN/address.c b/sphincsplus/sphincsplus-keccakxN/address.c new file mode 100644 index 0000000..9eb0637 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/address.c @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// This implementation is based on the public domain implementation of SPHINCS+ +// available on https://github.com/sphincs/sphincsplus +// + +#include +#include + +#include "address.h" +#include "params.h" +#include "utils.h" + +/* + * Specify which level of Merkle tree (the "layer") we're working on + */ +void set_layer_addr(uint32_t addr[8], uint32_t layer) +{ + ((unsigned char *)addr)[SPX_OFFSET_LAYER] = layer; +} + +/* + * Specify which Merkle tree within the level (the "tree address") we're working on + */ +void set_tree_addr(uint32_t addr[8], uint64_t tree) +{ +#if (SPX_TREE_HEIGHT * (SPX_D - 1)) > 64 + #error Subtree addressing is currently limited to at most 2^64 trees +#endif + ull_to_bytes(&((unsigned char *)addr)[SPX_OFFSET_TREE], 8, tree ); +} + +/* + * Specify the reason we'll use this address structure for, that is, what + * hash will we compute with it. This is used so that unrelated types of + * hashes don't accidentally get the same address structure. The type will be + * one of the SPX_ADDR_TYPE constants + */ +void set_type(uint32_t addr[8], uint32_t type) +{ + ((unsigned char *)addr)[SPX_OFFSET_TYPE] = type; +} + +/* + * Copy the layer and tree fields of the address structure. This is used + * when we're doing multiple types of hashes within the same Merkle tree + */ +void copy_subtree_addr(uint32_t out[8], const uint32_t in[8]) +{ + memcpy( out, in, SPX_OFFSET_TREE+8 ); +} + +/* These functions are used for OTS addresses. */ + +/* + * Specify which Merkle leaf we're working on; that is, which OTS keypair + * we're talking about. 
+ */ +void set_keypair_addr(uint32_t addr[8], uint32_t keypair) +{ +#if SPX_FULL_HEIGHT/SPX_D > 8 + /* We have > 256 OTS at the bottom of the Merkle tree; to specify */ + /* which one, we'd need to express it in two bytes */ + ((unsigned char *)addr)[SPX_OFFSET_KP_ADDR2] = keypair >> 8; +#endif + ((unsigned char *)addr)[SPX_OFFSET_KP_ADDR1] = keypair; +} + +/* + * Copy the layer, tree and keypair fields of the address structure. This is + * used when we're doing multiple things within the same OTS keypair + */ +void copy_keypair_addr(uint32_t out[8], const uint32_t in[8]) +{ + memcpy( out, in, SPX_OFFSET_TREE+8 ); +#if SPX_FULL_HEIGHT/SPX_D > 8 + ((unsigned char *)out)[SPX_OFFSET_KP_ADDR2] = ((unsigned char *)in)[SPX_OFFSET_KP_ADDR2]; +#endif + ((unsigned char *)out)[SPX_OFFSET_KP_ADDR1] = ((unsigned char *)in)[SPX_OFFSET_KP_ADDR1]; +} + +/* + * Specify which Merkle chain within the OTS we're working with + * (the chain address) + */ +void set_chain_addr(uint32_t addr[8], uint32_t chain) +{ + ((unsigned char *)addr)[SPX_OFFSET_CHAIN_ADDR] = chain; +} + +/* + * Specify where in the Merkle chain we are +* (the hash address) + */ +void set_hash_addr(uint32_t addr[8], uint32_t hash) +{ + ((unsigned char *)addr)[SPX_OFFSET_HASH_ADDR] = hash; +} + +/* These functions are used for all hash tree addresses (including FORS). 
*/ + +/* + * Specify the height of the node in the Merkle/FORS tree we are in + * (the tree height) + */ +void set_tree_height(uint32_t addr[8], uint32_t tree_height) +{ + ((unsigned char *)addr)[SPX_OFFSET_TREE_HGT] = tree_height; +} + +/* + * Specify the distance from the left edge of the node in the Merkle/FORS tree + * (the tree index) + */ +void set_tree_index(uint32_t addr[8], uint32_t tree_index) +{ + u32_to_bytes(&((unsigned char *)addr)[SPX_OFFSET_TREE_INDEX], tree_index ); +} diff --git a/sphincsplus/sphincsplus-keccakxN/address.h b/sphincsplus/sphincsplus-keccakxN/address.h new file mode 100644 index 0000000..2a3c8ec --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/address.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// This implementation is based on the public domain implementation of SPHINCS+ +// available on https://github.com/sphincs/sphincsplus +// + + +#ifndef SPX_ADDRESS_H +#define SPX_ADDRESS_H + +#include <stdint.h> +#include "params.h" + +/* The hash types that are passed to set_type */ +#define SPX_ADDR_TYPE_WOTS 0 +#define SPX_ADDR_TYPE_WOTSPK 1 +#define SPX_ADDR_TYPE_HASHTREE 2 +#define SPX_ADDR_TYPE_FORSTREE 3 +#define SPX_ADDR_TYPE_FORSPK 4 +#define SPX_ADDR_TYPE_WOTSPRF 5 +#define SPX_ADDR_TYPE_FORSPRF 6 + +#define set_layer_addr SPX_NAMESPACE(set_layer_addr) +void set_layer_addr(uint32_t addr[8], uint32_t layer); + +#define set_tree_addr SPX_NAMESPACE(set_tree_addr) +void set_tree_addr(uint32_t addr[8], uint64_t tree); + +#define set_type SPX_NAMESPACE(set_type) +void set_type(uint32_t addr[8], uint32_t type); + +/* Copies the layer and tree part of one address into the other */ +#define copy_subtree_addr SPX_NAMESPACE(copy_subtree_addr) +void copy_subtree_addr(uint32_t out[8], const uint32_t in[8]); + +/* These functions are used for WOTS and FORS addresses. */ + +#define set_keypair_addr SPX_NAMESPACE(set_keypair_addr) +void set_keypair_addr(uint32_t addr[8], uint32_t keypair); + +#define set_chain_addr SPX_NAMESPACE(set_chain_addr) +void set_chain_addr(uint32_t addr[8], uint32_t chain); + +#define set_hash_addr SPX_NAMESPACE(set_hash_addr) +void set_hash_addr(uint32_t addr[8], uint32_t hash); + +#define copy_keypair_addr SPX_NAMESPACE(copy_keypair_addr) +void copy_keypair_addr(uint32_t out[8], const uint32_t in[8]); + +/* These functions are used for all hash tree addresses (including FORS).
*/ + +#define set_tree_height SPX_NAMESPACE(set_tree_height) +void set_tree_height(uint32_t addr[8], uint32_t tree_height); + +#define set_tree_index SPX_NAMESPACE(set_tree_index) +void set_tree_index(uint32_t addr[8], uint32_t tree_index); + +#endif diff --git a/sphincsplus/sphincsplus-keccakxN/api.h b/sphincsplus/sphincsplus-keccakxN/api.h new file mode 100644 index 0000000..5f373f8 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/api.h @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// This implementation is based on the public domain implementation of SPHINCS+ +// available on https://github.com/sphincs/sphincsplus +// + +#ifndef SPX_API_H +#define SPX_API_H + +#include <stddef.h> +#include <stdint.h> + +#include "params.h" + +#define CRYPTO_ALGNAME "SPHINCS+" + +#define CRYPTO_SECRETKEYBYTES SPX_SK_BYTES +#define CRYPTO_PUBLICKEYBYTES SPX_PK_BYTES +#define CRYPTO_BYTES SPX_BYTES +#define CRYPTO_SEEDBYTES (3*SPX_N) + +/* + * Returns the length of a secret key, in bytes + */ +unsigned long long crypto_sign_secretkeybytes(void); + +/* + * Returns the length of a public key, in bytes + */ +unsigned long long crypto_sign_publickeybytes(void); + +/* + * Returns the length of a signature, in bytes + */ +unsigned long long crypto_sign_bytes(void); + +/* + * Returns the length of the seed required to generate a key pair, in bytes + */ +unsigned long long crypto_sign_seedbytes(void); + +/* + * Generates a SPHINCS+ key pair given a seed. + * Format sk: [SK_SEED || SK_PRF || PUB_SEED || root] + * Format pk: [root || PUB_SEED] + */ +int crypto_sign_seed_keypair(unsigned char *pk, unsigned char *sk, + const unsigned char *seed); + +/* + * Generates a SPHINCS+ key pair. + * Format sk: [SK_SEED || SK_PRF || PUB_SEED || root] + * Format pk: [root || PUB_SEED] + */ +int crypto_sign_keypair(unsigned char *pk, unsigned char *sk); + +/** + * Returns an array containing a detached signature. + */ +int crypto_sign_signature(uint8_t *sig, size_t *siglen, + const uint8_t *m, size_t mlen, const uint8_t *sk); + +/** + * Verifies a detached signature and message under a given public key. + */ +int crypto_sign_verify(const uint8_t *sig, size_t siglen, + const uint8_t *m, size_t mlen, const uint8_t *pk); + +/** + * Returns an array containing the signature followed by the message.
+ */ +int crypto_sign(unsigned char *sm, unsigned long long *smlen, + const unsigned char *m, unsigned long long mlen, + const unsigned char *sk); + +/** + * Verifies a given signature-message pair under a given public key. + */ +int crypto_sign_open(unsigned char *m, unsigned long long *mlen, + const unsigned char *sm, unsigned long long smlen, + const unsigned char *pk); + +#endif diff --git a/sphincsplus/sphincsplus-keccakxN/bench_xN.sh b/sphincsplus/sphincsplus-keccakxN/bench_xN.sh new file mode 100644 index 0000000..c420a6b --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/bench_xN.sh @@ -0,0 +1,114 @@ +# +# Copyright (c) 2022 Arm Limited +# Copyright (c) 2022 Matthias Kannwischer +# SPDX-License-Identifier: MIT +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +# + +#!/bin/sh + +if grep -Fq "sha3" /proc/cpuinfo +then + sha3=1 +else + sha3=0 +fi + +warmup=$1 +if [ -z $warmup ]; then + warmup=1 +fi + +for cpu in 80 10 1; do + + if [ $sha3 -eq 0 ]; then + if [ $cpu -eq 80 ]; then + cpuname=X1 + elif [ $cpu -eq 10 ]; then + cpuname=A78 + else + cpuname=A55 + fi + else + if [ $cpu -eq 80 ]; then + cpuname=X2 + elif [ $cpu -eq 10 ]; then + cpuname=A710 + else + cpuname=A510 + fi + fi + + echo "CPU $cpu $cpuname" + benchdir=benchmarks_$cpuname + mkdir -p $benchdir + # the high performance cores may be asleep; we need to wake them up + if [ $warmup -eq 1 ]; then + if [ $cpu -ge 10 ]; then + taskset 1 dd if=/dev/zero of=/dev/null & + taskPid0=$! + taskset 2 dd if=/dev/zero of=/dev/null & + taskPid1=$! + taskset 4 dd if=/dev/zero of=/dev/null & + taskPid2=$! + taskset 8 dd if=/dev/zero of=/dev/null & + taskPid3=$! + fi + sleep 1 + if [ $cpu -ge 80 ]; then + taskset 10 dd if=/dev/zero of=/dev/null & + taskPid4=$! + taskset 20 dd if=/dev/zero of=/dev/null & + taskPid5=$! + taskset 40 dd if=/dev/zero of=/dev/null & + taskPid6=$! 
+ fi + sleep 1 + fi + + for level in 128 192 256; do + for t0 in f s; do + for t1 in simple robust; do + for impl in x3 x4 x5; do + param=sphincs-shake-${level}${t0}-${t1}_${impl} + echo $param + exe=."/bin/bench_${cpuname}_${param}" + echo $exe + taskset $cpu $exe > $benchdir/$param + done + done + done + done + + if [ $warmup -eq 1 ]; then + if [ $cpu -ge 10 ]; then + kill $taskPid0 + kill $taskPid1 + kill $taskPid2 + kill $taskPid3 + fi + if [ $cpu -ge 80 ]; then + kill $taskPid4 + kill $taskPid5 + kill $taskPid6 + fi + fi +done diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128f-robust_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128f-robust_x3 new file mode 100644 index 0000000..d242574 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128f-robust_x3 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16, way=3, tree height=3, wots_len=35 +Running 10 iterations. +thash avg. 2.11 us (0.00 sec); median 2,768 cycles, 1x: 2,768 cycles +f1600x avg. 3.48 us (0.00 sec); median 4,629 cycles, 1x: 4,629 cycles +thashx avg. 7.06 us (0.00 sec); median 9,374 cycles, 1x: 9,374 cycles +Generating keypair.. avg. 11601.20 us (0.01 sec); median 15,474,378 cycles, 1x: 15,474,378 cycles + - WOTS pk gen x (ideal).. avg. 3865.16 us (0.00 sec); median 5,145,108 cycles, 2x: 10,290,216 cycles + - WOTS pk gen x (real).. avg. 3865.40 us (0.00 sec); median 5,139,286 cycles, 3x: 15,417,858 cycles +Signing.. avg. 298000.61 us (0.30 sec); median 355,855,086 cycles, 1x: 355,855,086 cycles + - FORS signing.. avg. 14082.81 us (0.01 sec); median 16,081,002 cycles, 1x: 16,081,002 cycles + - WOTS pk gen x (ideal).. avg. 4510.47 us (0.00 sec); median 5,128,340 cycles, 58x: 297,443,720 cycles + - WOTS pk gen x (real).. avg. 4495.24 us (0.00 sec); median 5,123,297 cycles, 66x: 338,137,602 cycles +Verifying.. avg. 
18359.45 us (0.02 sec); median 20,975,961 cycles, 1x: 20,975,961 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128f-robust_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128f-robust_x4 new file mode 100644 index 0000000..7409868 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128f-robust_x4 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16, way=4, tree height=3, wots_len=35 +Running 10 iterations. +thash avg. 2.09 us (0.00 sec); median 2,766 cycles, 1x: 2,766 cycles +f1600x avg. 1.75 us (0.00 sec); median 2,321 cycles, 1x: 2,321 cycles +thashx avg. 3.68 us (0.00 sec); median 4,940 cycles, 1x: 4,940 cycles +Generating keypair.. avg. 4090.00 us (0.00 sec); median 5,427,899 cycles, 1x: 5,427,899 cycles + - WOTS pk gen x (ideal).. avg. 2044.66 us (0.00 sec); median 2,716,425 cycles, 2x: 5,432,850 cycles + - WOTS pk gen x (real).. avg. 2057.19 us (0.00 sec); median 2,715,381 cycles, 2x: 5,430,762 cycles +Signing.. avg. 94584.29 us (0.09 sec); median 125,818,288 cycles, 1x: 125,818,288 cycles + - FORS signing.. avg. 5951.11 us (0.01 sec); median 6,785,424 cycles, 1x: 6,785,424 cycles + - WOTS pk gen x (ideal).. avg. 2375.67 us (0.00 sec); median 2,700,589 cycles, 44x: 118,825,916 cycles + - WOTS pk gen x (real).. avg. 2376.71 us (0.00 sec); median 2,700,152 cycles, 44x: 118,806,688 cycles +Verifying.. avg. 
7820.06 us (0.01 sec); median 8,919,586 cycles, 1x: 8,919,586 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128f-robust_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128f-robust_x5 new file mode 100644 index 0000000..1b13d9a --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128f-robust_x5 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16, way=5, tree height=3, wots_len=35 +Running 10 iterations. +thash avg. 2.12 us (0.00 sec); median 2,770 cycles, 1x: 2,770 cycles +f1600x avg. 5.48 us (0.00 sec); median 7,298 cycles, 1x: 7,298 cycles +thashx avg. 11.03 us (0.00 sec); median 14,715 cycles, 1x: 14,715 cycles +Generating keypair.. avg. 12113.56 us (0.01 sec); median 16,168,201 cycles, 1x: 16,168,201 cycles + - WOTS pk gen x (ideal).. avg. 6058.58 us (0.01 sec); median 8,069,451 cycles, 1x: 8,069,451 cycles + - WOTS pk gen x (real).. avg. 6047.50 us (0.01 sec); median 8,058,608 cycles, 2x: 16,117,216 cycles +Signing.. avg. 296021.06 us (0.30 sec); median 370,445,750 cycles, 1x: 370,445,750 cycles + - FORS signing.. avg. 13606.36 us (0.01 sec); median 15,459,054 cycles, 1x: 15,459,054 cycles + - WOTS pk gen x (ideal).. avg. 7106.45 us (0.01 sec); median 8,068,244 cycles, 35x: 282,388,540 cycles + - WOTS pk gen x (real).. avg. 7202.89 us (0.01 sec); median 8,123,059 cycles, 44x: 357,414,596 cycles +Verifying.. avg. 
18204.45 us (0.02 sec); median 20,701,681 cycles, 1x: 20,701,681 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128f-simple_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128f-simple_x3 new file mode 100644 index 0000000..8675a60 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128f-simple_x3 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16, way=3, tree height=3, wots_len=35 +Running 10 iterations. +thash avg. 1.06 us (0.00 sec); median 1,401 cycles, 1x: 1,401 cycles +f1600x avg. 3.48 us (0.00 sec); median 4,632 cycles, 1x: 4,632 cycles +thashx avg. 3.55 us (0.00 sec); median 4,719 cycles, 1x: 4,719 cycles +Generating keypair.. avg. 6059.87 us (0.01 sec); median 8,073,812 cycles, 1x: 8,073,812 cycles + - WOTS pk gen x (ideal).. avg. 2018.22 us (0.00 sec); median 2,675,459 cycles, 2x: 5,350,918 cycles + - WOTS pk gen x (real).. avg. 2016.16 us (0.00 sec); median 2,675,010 cycles, 3x: 8,025,030 cycles +Signing.. avg. 148370.76 us (0.15 sec); median 186,631,527 cycles, 1x: 186,631,527 cycles + - FORS signing.. avg. 8734.25 us (0.01 sec); median 9,933,493 cycles, 1x: 9,933,493 cycles + - WOTS pk gen x (ideal).. avg. 2370.73 us (0.00 sec); median 2,687,298 cycles, 58x: 155,863,284 cycles + - WOTS pk gen x (real).. avg. 2365.47 us (0.00 sec); median 2,683,864 cycles, 66x: 177,135,024 cycles +Verifying.. avg. 
9027.60 us (0.01 sec); median 10,293,071 cycles, 1x: 10,293,071 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128f-simple_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128f-simple_x4 new file mode 100644 index 0000000..f6f69ba --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128f-simple_x4 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16, way=4, tree height=3, wots_len=35 +Running 10 iterations. +thash avg. 1.07 us (0.00 sec); median 1,400 cycles, 1x: 1,400 cycles +f1600x avg. 1.74 us (0.00 sec); median 2,307 cycles, 1x: 2,307 cycles +thashx avg. 1.90 us (0.00 sec); median 2,517 cycles, 1x: 2,517 cycles +Generating keypair.. avg. 2178.70 us (0.00 sec); median 2,877,973 cycles, 1x: 2,877,973 cycles + - WOTS pk gen x (ideal).. avg. 1090.41 us (0.00 sec); median 1,435,959 cycles, 2x: 2,871,918 cycles + - WOTS pk gen x (real).. avg. 1087.25 us (0.00 sec); median 1,435,778 cycles, 2x: 2,871,556 cycles +Signing.. avg. 50647.97 us (0.05 sec); median 67,584,583 cycles, 1x: 67,584,583 cycles + - FORS signing.. avg. 3165.27 us (0.00 sec); median 4,204,020 cycles, 1x: 4,204,020 cycles + - WOTS pk gen x (ideal).. avg. 1089.72 us (0.00 sec); median 1,433,310 cycles, 44x: 63,065,640 cycles + - WOTS pk gen x (real).. avg. 1085.75 us (0.00 sec); median 1,432,866 cycles, 44x: 63,046,104 cycles +Verifying.. avg. 
3580.96 us (0.00 sec); median 4,750,523 cycles, 1x: 4,750,523 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128f-simple_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128f-simple_x5 new file mode 100644 index 0000000..78c26d8 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128f-simple_x5 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16, way=5, tree height=3, wots_len=35 +Running 10 iterations. +thash avg. 1.07 us (0.00 sec); median 1,392 cycles, 1x: 1,392 cycles +f1600x avg. 5.49 us (0.00 sec); median 7,318 cycles, 1x: 7,318 cycles +thashx avg. 5.60 us (0.00 sec); median 7,466 cycles, 1x: 7,466 cycles +Generating keypair.. avg. 6380.70 us (0.01 sec); median 8,466,338 cycles, 1x: 8,466,338 cycles + - WOTS pk gen x (ideal).. avg. 3204.24 us (0.00 sec); median 4,228,398 cycles, 1x: 4,228,398 cycles + - WOTS pk gen x (real).. avg. 3180.02 us (0.00 sec); median 4,229,002 cycles, 2x: 8,458,004 cycles +Signing.. avg. 156738.22 us (0.16 sec); median 194,987,342 cycles, 1x: 194,987,342 cycles + - FORS signing.. avg. 8382.65 us (0.01 sec); median 9,466,795 cycles, 1x: 9,466,795 cycles + - WOTS pk gen x (ideal).. avg. 3700.82 us (0.00 sec); median 4,207,321 cycles, 35x: 147,256,235 cycles + - WOTS pk gen x (real).. avg. 3699.97 us (0.00 sec); median 4,209,527 cycles, 44x: 185,219,188 cycles +Verifying.. avg. 
9501.12 us (0.01 sec); median 10,835,477 cycles, 1x: 10,835,477 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128s-robust_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128s-robust_x3 new file mode 100644 index 0000000..30dcb4a --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128s-robust_x3 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16, way=3, tree height=9, wots_len=35 +Running 10 iterations. +thash avg. 2.09 us (0.00 sec); median 2,766 cycles, 1x: 2,766 cycles +f1600x avg. 3.49 us (0.00 sec); median 4,630 cycles, 1x: 4,630 cycles +thashx avg. 7.08 us (0.00 sec); median 9,401 cycles, 1x: 9,401 cycles +Generating keypair.. avg. 750071.86 us (0.75 sec); median 876,287,746 cycles, 1x: 876,287,746 cycles + - WOTS pk gen x (ideal).. avg. 4493.86 us (0.00 sec); median 5,114,908 cycles, 170x: 869,534,360 cycles + - WOTS pk gen x (real).. avg. 4498.47 us (0.00 sec); median 5,115,273 cycles, 171x: 874,711,683 cycles +Signing.. avg. 5601932.41 us (5.60 sec); median 6,565,700,459 cycles, 1x: 6,565,700,459 cycles + - FORS signing.. avg. 375191.57 us (0.38 sec); median 429,275,802 cycles, 1x: 429,275,802 cycles + - WOTS pk gen x (ideal).. avg. 4503.05 us (0.00 sec); median 5,115,237 cycles, 1194x: 6,107,592,978 cycles + - WOTS pk gen x (real).. avg. 4356.18 us (0.00 sec); median 5,122,478 cycles, 1197x: 6,131,606,166 cycles +Verifying.. avg. 
5325.06 us (0.01 sec); median 7,084,895 cycles, 1x: 7,084,895 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128s-robust_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128s-robust_x4 new file mode 100644 index 0000000..e8351ff --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128s-robust_x4 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16, way=4, tree height=9, wots_len=35 +Running 10 iterations. +thash avg. 2.08 us (0.00 sec); median 2,766 cycles, 1x: 2,766 cycles +f1600x avg. 1.75 us (0.00 sec); median 2,319 cycles, 1x: 2,319 cycles +thashx avg. 3.74 us (0.00 sec); median 4,983 cycles, 1x: 4,983 cycles +Generating keypair.. avg. 286795.54 us (0.29 sec); median 347,613,894 cycles, 1x: 347,613,894 cycles + - WOTS pk gen x (ideal).. avg. 2402.68 us (0.00 sec); median 2,721,358 cycles, 128x: 348,333,824 cycles + - WOTS pk gen x (real).. avg. 2396.58 us (0.00 sec); median 2,720,287 cycles, 128x: 348,196,736 cycles +Signing.. avg. 2175432.11 us (2.18 sec); median 2,610,123,085 cycles, 1x: 2,610,123,085 cycles + - FORS signing.. avg. 155949.29 us (0.16 sec); median 178,319,330 cycles, 1x: 178,319,330 cycles + - WOTS pk gen x (ideal).. avg. 2395.52 us (0.00 sec); median 2,718,577 cycles, 896x: 2,435,844,992 cycles + - WOTS pk gen x (real).. avg. 2390.63 us (0.00 sec); median 2,715,163 cycles, 896x: 2,432,786,048 cycles +Verifying.. avg. 
2928.93 us (0.00 sec); median 3,322,286 cycles, 1x: 3,322,286 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128s-robust_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128s-robust_x5 new file mode 100644 index 0000000..7d9b45f --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128s-robust_x5 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16, way=5, tree height=9, wots_len=35 +Running 10 iterations. +thash avg. 1.84 us (0.00 sec); median 2,768 cycles, 1x: 2,768 cycles +f1600x avg. 4.82 us (0.00 sec); median 7,303 cycles, 1x: 7,303 cycles +thashx avg. 9.69 us (0.00 sec); median 14,767 cycles, 1x: 14,767 cycles +Generating keypair.. avg. 664812.48 us (0.66 sec); median 831,433,488 cycles, 1x: 831,433,488 cycles + - WOTS pk gen x (ideal).. avg. 7080.57 us (0.01 sec); median 8,049,550 cycles, 102x: 821,054,100 cycles + - WOTS pk gen x (real).. avg. 7106.36 us (0.01 sec); median 8,049,841 cycles, 103x: 829,133,623 cycles +Signing.. avg. 5241648.89 us (5.24 sec); median 6,226,453,742 cycles, 1x: 6,226,453,742 cycles + - FORS signing.. avg. 345315.15 us (0.35 sec); median 414,617,769 cycles, 1x: 414,617,769 cycles + - WOTS pk gen x (ideal).. avg. 7035.59 us (0.01 sec); median 8,057,208 cycles, 716x: 5,768,960,928 cycles + - WOTS pk gen x (real).. avg. 7063.99 us (0.01 sec); median 8,049,648 cycles, 721x: 5,803,796,208 cycles +Verifying.. avg. 
5983.68 us (0.01 sec); median 6,816,526 cycles, 1x: 6,816,526 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128s-simple_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128s-simple_x3 new file mode 100644 index 0000000..5e404d3 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128s-simple_x3 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16, way=3, tree height=9, wots_len=35 +Running 10 iterations. +thash avg. 1.09 us (0.00 sec); median 1,400 cycles, 1x: 1,400 cycles +f1600x avg. 3.49 us (0.00 sec); median 4,633 cycles, 1x: 4,633 cycles +thashx avg. 3.54 us (0.00 sec); median 4,702 cycles, 1x: 4,702 cycles +Generating keypair.. avg. 373017.37 us (0.37 sec); median 456,412,528 cycles, 1x: 456,412,528 cycles + - WOTS pk gen x (ideal).. avg. 2013.33 us (0.00 sec); median 2,669,445 cycles, 170x: 453,805,650 cycles + - WOTS pk gen x (real).. avg. 2011.90 us (0.00 sec); median 2,668,954 cycles, 171x: 456,391,134 cycles +Signing.. avg. 2915608.82 us (2.92 sec); median 3,459,402,546 cycles, 1x: 3,459,402,546 cycles + - FORS signing.. avg. 213063.20 us (0.21 sec); median 263,110,101 cycles, 1x: 263,110,101 cycles + - WOTS pk gen x (ideal).. avg. 2352.99 us (0.00 sec); median 2,675,788 cycles, 1194x: 3,194,890,872 cycles + - WOTS pk gen x (real).. avg. 2355.71 us (0.00 sec); median 2,676,196 cycles, 1197x: 3,203,406,612 cycles +Verifying.. avg. 
3044.41 us (0.00 sec); median 3,462,312 cycles, 1x: 3,462,312 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128s-simple_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128s-simple_x4 new file mode 100644 index 0000000..dd93b3e --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128s-simple_x4 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16, way=4, tree height=9, wots_len=35 +Running 10 iterations. +thash avg. 0.82 us (0.00 sec); median 1,390 cycles, 1x: 1,390 cycles +f1600x avg. 1.36 us (0.00 sec); median 2,320 cycles, 1x: 2,320 cycles +thashx avg. 1.47 us (0.00 sec); median 2,509 cycles, 1x: 2,509 cycles +Generating keypair.. avg. 109586.75 us (0.11 sec); median 182,388,464 cycles, 1x: 182,388,464 cycles + - WOTS pk gen x (ideal).. avg. 1272.17 us (0.00 sec); median 1,421,785 cycles, 128x: 181,988,480 cycles + - WOTS pk gen x (real).. avg. 1262.56 us (0.00 sec); median 1,418,255 cycles, 128x: 181,536,640 cycles +Signing.. avg. 1160256.96 us (1.16 sec); median 1,382,170,032 cycles, 1x: 1,382,170,032 cycles + - FORS signing.. avg. 95622.26 us (0.10 sec); median 108,563,835 cycles, 1x: 108,563,835 cycles + - WOTS pk gen x (ideal).. avg. 1259.52 us (0.00 sec); median 1,419,546 cycles, 896x: 1,271,913,216 cycles + - WOTS pk gen x (real).. avg. 1266.55 us (0.00 sec); median 1,419,905 cycles, 896x: 1,272,234,880 cycles +Verifying.. avg. 
1593.75 us (0.00 sec); median 1,788,653 cycles, 1x: 1,788,653 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128s-simple_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128s-simple_x5 new file mode 100644 index 0000000..9fe7bc0 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-128s-simple_x5 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16, way=5, tree height=9, wots_len=35 +Running 10 iterations. +thash avg. 0.93 us (0.00 sec); median 1,401 cycles, 1x: 1,401 cycles +f1600x avg. 4.81 us (0.00 sec); median 7,290 cycles, 1x: 7,290 cycles +thashx avg. 4.89 us (0.00 sec); median 7,436 cycles, 1x: 7,436 cycles +Generating keypair.. avg. 346923.68 us (0.35 sec); median 432,728,530 cycles, 1x: 432,728,530 cycles + - WOTS pk gen x (ideal).. avg. 3676.15 us (0.00 sec); median 4,189,433 cycles, 102x: 427,322,166 cycles + - WOTS pk gen x (real).. avg. 3685.75 us (0.00 sec); median 4,190,955 cycles, 103x: 431,668,365 cycles +Signing.. avg. 2783112.01 us (2.78 sec); median 3,281,448,976 cycles, 1x: 3,281,448,976 cycles + - FORS signing.. avg. 216809.79 us (0.22 sec); median 252,446,577 cycles, 1x: 252,446,577 cycles + - WOTS pk gen x (ideal).. avg. 3170.23 us (0.00 sec); median 4,216,159 cycles, 716x: 3,018,769,844 cycles + - WOTS pk gen x (real).. avg. 3170.30 us (0.00 sec); median 4,216,433 cycles, 721x: 3,040,048,193 cycles +Verifying.. avg. 
2662.21 us (0.00 sec); median 3,535,241 cycles, 1x: 3,535,241 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192f-robust_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192f-robust_x3 new file mode 100644 index 0000000..815abb6 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192f-robust_x3 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16, way=3, tree height=3, wots_len=51 +Running 10 iterations. +thash avg. 2.12 us (0.00 sec); median 2,813 cycles, 1x: 2,813 cycles +f1600x avg. 3.48 us (0.00 sec); median 4,633 cycles, 1x: 4,633 cycles +thashx avg. 7.06 us (0.00 sec); median 9,422 cycles, 1x: 9,422 cycles +Generating keypair.. avg. 17041.00 us (0.02 sec); median 22,700,751 cycles, 1x: 22,700,751 cycles + - WOTS pk gen x (ideal).. avg. 5684.04 us (0.01 sec); median 7,574,152 cycles, 2x: 15,148,304 cycles + - WOTS pk gen x (real).. avg. 5681.80 us (0.01 sec); median 7,575,406 cycles, 3x: 22,726,218 cycles +Signing.. avg. 481536.60 us (0.48 sec); median 563,442,818 cycles, 1x: 563,442,818 cycles + - FORS signing.. avg. 56509.94 us (0.06 sec); median 64,653,618 cycles, 1x: 64,653,618 cycles + - WOTS pk gen x (ideal).. avg. 6633.20 us (0.01 sec); median 7,558,541 cycles, 58x: 438,395,378 cycles + - WOTS pk gen x (real).. avg. 6619.68 us (0.01 sec); median 7,549,423 cycles, 66x: 498,261,918 cycles +Verifying.. avg. 
26454.03 us (0.03 sec); median 30,189,755 cycles, 1x: 30,189,755 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192f-robust_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192f-robust_x4 new file mode 100644 index 0000000..e85ecaf --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192f-robust_x4 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16, way=4, tree height=3, wots_len=51 +Running 10 iterations. +thash avg. 2.48 us (0.00 sec); median 2,807 cycles, 1x: 2,807 cycles +f1600x avg. 2.05 us (0.00 sec); median 2,314 cycles, 1x: 2,314 cycles +thashx avg. 4.40 us (0.00 sec); median 5,009 cycles, 1x: 5,009 cycles +Generating keypair.. avg. 7174.62 us (0.01 sec); median 8,114,407 cycles, 1x: 8,114,407 cycles + - WOTS pk gen x (ideal).. avg. 3546.63 us (0.00 sec); median 4,038,972 cycles, 2x: 8,077,944 cycles + - WOTS pk gen x (real).. avg. 3556.63 us (0.00 sec); median 4,038,383 cycles, 2x: 8,076,766 cycles +Signing.. avg. 178249.88 us (0.18 sec); median 204,160,341 cycles, 1x: 204,160,341 cycles + - FORS signing.. avg. 23651.13 us (0.02 sec); median 26,927,959 cycles, 1x: 26,927,959 cycles + - WOTS pk gen x (ideal).. avg. 3538.13 us (0.00 sec); median 4,002,462 cycles, 44x: 176,108,328 cycles + - WOTS pk gen x (real).. avg. 3515.09 us (0.00 sec); median 4,001,993 cycles, 44x: 176,087,692 cycles +Verifying.. avg. 
11258.03 us (0.01 sec); median 12,838,112 cycles, 1x: 12,838,112 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192f-robust_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192f-robust_x5 new file mode 100644 index 0000000..dbfc3bc --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192f-robust_x5 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16, way=5, tree height=3, wots_len=51 +Running 10 iterations. +thash avg. 2.16 us (0.00 sec); median 2,846 cycles, 1x: 2,846 cycles +f1600x avg. 5.48 us (0.00 sec); median 7,314 cycles, 1x: 7,314 cycles +thashx avg. 11.14 us (0.00 sec); median 14,874 cycles, 1x: 14,874 cycles +Generating keypair.. avg. 17864.13 us (0.02 sec); median 23,854,654 cycles, 1x: 23,854,654 cycles + - WOTS pk gen x (ideal).. avg. 8936.58 us (0.01 sec); median 11,919,744 cycles, 1x: 11,919,744 cycles + - WOTS pk gen x (real).. avg. 8918.81 us (0.01 sec); median 11,904,845 cycles, 2x: 23,809,690 cycles +Signing.. avg. 497404.01 us (0.50 sec); median 585,457,076 cycles, 1x: 585,457,076 cycles + - FORS signing.. avg. 46937.53 us (0.05 sec); median 62,717,302 cycles, 1x: 62,717,302 cycles + - WOTS pk gen x (ideal).. avg. 8886.46 us (0.01 sec); median 11,858,206 cycles, 35x: 415,037,210 cycles + - WOTS pk gen x (real).. avg. 8883.18 us (0.01 sec); median 11,848,057 cycles, 44x: 521,314,508 cycles +Verifying.. avg. 
25382.50 us (0.03 sec); median 29,635,761 cycles, 1x: 29,635,761 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192f-simple_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192f-simple_x3 new file mode 100644 index 0000000..4d3da20 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192f-simple_x3 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16, way=3, tree height=3, wots_len=51 +Running 10 iterations. +thash avg. 1.08 us (0.00 sec); median 1,411 cycles, 1x: 1,411 cycles +f1600x avg. 3.41 us (0.00 sec); median 4,534 cycles, 1x: 4,534 cycles +thashx avg. 3.59 us (0.00 sec); median 4,772 cycles, 1x: 4,772 cycles +Generating keypair.. avg. 8987.20 us (0.01 sec); median 11,939,502 cycles, 1x: 11,939,502 cycles + - WOTS pk gen x (ideal).. avg. 2997.84 us (0.00 sec); median 3,977,864 cycles, 2x: 7,955,728 cycles + - WOTS pk gen x (real).. avg. 2987.39 us (0.00 sec); median 3,975,005 cycles, 3x: 11,925,015 cycles +Signing.. avg. 248486.27 us (0.25 sec); median 300,426,475 cycles, 1x: 300,426,475 cycles + - FORS signing.. avg. 34655.93 us (0.03 sec); median 39,625,893 cycles, 1x: 39,625,893 cycles + - WOTS pk gen x (ideal).. avg. 3486.71 us (0.00 sec); median 3,947,576 cycles, 58x: 228,959,408 cycles + - WOTS pk gen x (real).. avg. 3471.95 us (0.00 sec); median 3,947,259 cycles, 66x: 260,519,094 cycles +Verifying.. avg. 
13376.65 us (0.01 sec); median 15,299,251 cycles, 1x: 15,299,251 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192f-simple_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192f-simple_x4 new file mode 100644 index 0000000..e7f12e5 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192f-simple_x4 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16, way=4, tree height=3, wots_len=51 +Running 10 iterations. +thash avg. 1.08 us (0.00 sec); median 1,410 cycles, 1x: 1,410 cycles +f1600x avg. 1.88 us (0.00 sec); median 2,494 cycles, 1x: 2,494 cycles +thashx avg. 1.96 us (0.00 sec); median 2,604 cycles, 1x: 2,604 cycles +Generating keypair.. avg. 3259.66 us (0.00 sec); median 4,324,660 cycles, 1x: 4,324,660 cycles + - WOTS pk gen x (ideal).. avg. 1628.70 us (0.00 sec); median 2,157,322 cycles, 2x: 4,314,644 cycles + - WOTS pk gen x (real).. avg. 1630.88 us (0.00 sec); median 2,158,104 cycles, 2x: 4,316,208 cycles +Signing.. avg. 83495.27 us (0.08 sec); median 111,526,500 cycles, 1x: 111,526,500 cycles + - FORS signing.. avg. 13210.71 us (0.01 sec); median 16,735,359 cycles, 1x: 16,735,359 cycles + - WOTS pk gen x (ideal).. avg. 1910.70 us (0.00 sec); median 2,160,005 cycles, 44x: 95,040,220 cycles + - WOTS pk gen x (real).. avg. 1903.08 us (0.00 sec); median 2,158,507 cycles, 44x: 94,974,308 cycles +Verifying.. avg. 
5846.52 us (0.01 sec); median 6,678,388 cycles, 1x: 6,678,388 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192f-simple_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192f-simple_x5 new file mode 100644 index 0000000..73888b4 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192f-simple_x5 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16, way=5, tree height=3, wots_len=51 +Running 10 iterations. +thash avg. 1.05 us (0.00 sec); median 1,422 cycles, 1x: 1,422 cycles +f1600x avg. 5.39 us (0.00 sec); median 7,177 cycles, 1x: 7,177 cycles +thashx avg. 5.67 us (0.00 sec); median 7,565 cycles, 1x: 7,565 cycles +Generating keypair.. avg. 9412.03 us (0.01 sec); median 12,548,540 cycles, 1x: 12,548,540 cycles + - WOTS pk gen x (ideal).. avg. 4698.96 us (0.00 sec); median 6,264,901 cycles, 1x: 6,264,901 cycles + - WOTS pk gen x (real).. avg. 4689.05 us (0.00 sec); median 6,243,978 cycles, 2x: 12,487,956 cycles +Signing.. avg. 260009.75 us (0.26 sec); median 311,871,362 cycles, 1x: 311,871,362 cycles + - FORS signing.. avg. 33130.56 us (0.03 sec); median 38,225,112 cycles, 1x: 38,225,112 cycles + - WOTS pk gen x (ideal).. avg. 4655.59 us (0.00 sec); median 6,201,179 cycles, 35x: 217,041,265 cycles + - WOTS pk gen x (real).. avg. 4659.98 us (0.00 sec); median 6,202,445 cycles, 44x: 272,907,580 cycles +Verifying.. avg. 
11404.07 us (0.01 sec); median 15,220,283 cycles, 1x: 15,220,283 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192s-robust_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192s-robust_x3 new file mode 100644 index 0000000..2abb910 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192s-robust_x3 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16, way=3, tree height=9, wots_len=51 +Running 10 iterations. +thash avg. 2.14 us (0.00 sec); median 2,825 cycles, 1x: 2,825 cycles +f1600x avg. 3.47 us (0.00 sec); median 4,629 cycles, 1x: 4,629 cycles +thashx avg. 7.11 us (0.00 sec); median 9,472 cycles, 1x: 9,472 cycles +Generating keypair.. avg. 1093568.50 us (1.09 sec); median 1,289,456,220 cycles, 1x: 1,289,456,220 cycles + - WOTS pk gen x (ideal).. avg. 6603.66 us (0.01 sec); median 7,530,802 cycles, 170x: 1,280,236,340 cycles + - WOTS pk gen x (real).. avg. 6600.76 us (0.01 sec); median 7,530,582 cycles, 171x: 1,287,729,522 cycles +Signing.. avg. 9399428.42 us (9.40 sec); median 11,142,516,171 cycles, 1x: 11,142,516,171 cycles + - FORS signing.. avg. 1812053.64 us (1.81 sec); median 2,112,946,487 cycles, 1x: 2,112,946,487 cycles + - WOTS pk gen x (ideal).. avg. 6619.87 us (0.01 sec); median 7,533,728 cycles, 1194x: 8,995,271,232 cycles + - WOTS pk gen x (real).. avg. 6608.27 us (0.01 sec); median 7,534,570 cycles, 1197x: 9,018,880,290 cycles +Verifying.. avg. 
8932.37 us (0.01 sec); median 10,154,527 cycles, 1x: 10,154,527 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192s-robust_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192s-robust_x4 new file mode 100644 index 0000000..ec8842a --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192s-robust_x4 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16, way=4, tree height=9, wots_len=51 +Running 10 iterations. +thash avg. 2.13 us (0.00 sec); median 2,823 cycles, 1x: 2,823 cycles +f1600x avg. 1.74 us (0.00 sec); median 2,305 cycles, 1x: 2,305 cycles +thashx avg. 3.76 us (0.00 sec); median 5,004 cycles, 1x: 5,004 cycles +Generating keypair.. avg. 416704.13 us (0.42 sec); median 515,891,354 cycles, 1x: 515,891,354 cycles + - WOTS pk gen x (ideal).. avg. 3545.76 us (0.00 sec); median 4,027,906 cycles, 128x: 515,571,968 cycles + - WOTS pk gen x (real).. avg. 3574.79 us (0.00 sec); median 4,026,395 cycles, 128x: 515,378,560 cycles +Signing.. avg. 3769124.36 us (3.77 sec); median 4,493,636,148 cycles, 1x: 4,493,636,148 cycles + - FORS signing.. avg. 764409.19 us (0.76 sec); median 882,881,102 cycles, 1x: 882,881,102 cycles + - WOTS pk gen x (ideal).. avg. 3533.15 us (0.00 sec); median 4,024,121 cycles, 896x: 3,605,612,416 cycles + - WOTS pk gen x (real).. avg. 3529.89 us (0.00 sec); median 4,023,021 cycles, 896x: 3,604,626,816 cycles +Verifying.. avg. 
4191.65 us (0.00 sec); median 4,770,243 cycles, 1x: 4,770,243 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192s-robust_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192s-robust_x5 new file mode 100644 index 0000000..4b5740a --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192s-robust_x5 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16, way=5, tree height=9, wots_len=51 +Running 10 iterations. +thash avg. 2.47 us (0.00 sec); median 2,798 cycles, 1x: 2,798 cycles +f1600x avg. 6.41 us (0.00 sec); median 7,298 cycles, 1x: 7,298 cycles +thashx avg. 13.08 us (0.00 sec); median 14,860 cycles, 1x: 14,860 cycles +Generating keypair.. avg. 1035569.09 us (1.04 sec); median 1,221,710,111 cycles, 1x: 1,221,710,111 cycles + - WOTS pk gen x (ideal).. avg. 10393.70 us (0.01 sec); median 11,876,937 cycles, 102x: 1,211,447,574 cycles + - WOTS pk gen x (real).. avg. 10373.04 us (0.01 sec); median 11,859,267 cycles, 103x: 1,221,504,501 cycles +Signing.. avg. 8985093.50 us (8.99 sec); median 10,590,197,026 cycles, 1x: 10,590,197,026 cycles + - FORS signing.. avg. 1729610.21 us (1.73 sec); median 2,038,655,825 cycles, 1x: 2,038,655,825 cycles + - WOTS pk gen x (ideal).. avg. 10016.09 us (0.01 sec); median 11,955,492 cycles, 716x: 8,560,132,272 cycles + - WOTS pk gen x (real).. avg. 8896.17 us (0.01 sec); median 11,871,019 cycles, 721x: 8,559,004,699 cycles +Verifying.. avg. 
7111.41 us (0.01 sec); median 9,432,710 cycles, 1x: 9,432,710 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192s-simple_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192s-simple_x3 new file mode 100644 index 0000000..2afe882 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192s-simple_x3 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16, way=3, tree height=9, wots_len=51 +Running 10 iterations. +thash avg. 1.08 us (0.00 sec); median 1,422 cycles, 1x: 1,422 cycles +f1600x avg. 3.41 us (0.00 sec); median 4,533 cycles, 1x: 4,533 cycles +thashx avg. 3.59 us (0.00 sec); median 4,786 cycles, 1x: 4,786 cycles +Generating keypair.. avg. 556679.92 us (0.56 sec); median 675,412,433 cycles, 1x: 675,412,433 cycles + - WOTS pk gen x (ideal).. avg. 3498.07 us (0.00 sec); median 3,955,899 cycles, 170x: 672,502,830 cycles + - WOTS pk gen x (real).. avg. 3487.57 us (0.00 sec); median 3,944,559 cycles, 171x: 674,519,589 cycles +Signing.. avg. 5083339.69 us (5.08 sec); median 6,025,520,015 cycles, 1x: 6,025,520,015 cycles + - FORS signing.. avg. 1099177.04 us (1.10 sec); median 1,296,435,885 cycles, 1x: 1,296,435,885 cycles + - WOTS pk gen x (ideal).. avg. 3477.62 us (0.00 sec); median 3,941,891 cycles, 1194x: 4,706,617,854 cycles + - WOTS pk gen x (real).. avg. 3459.95 us (0.00 sec); median 3,941,422 cycles, 1197x: 4,717,882,134 cycles +Verifying.. avg. 
4647.78 us (0.00 sec); median 5,291,238 cycles, 1x: 5,291,238 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192s-simple_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192s-simple_x4 new file mode 100644 index 0000000..fc45dca --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192s-simple_x4 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16, way=4, tree height=9, wots_len=51 +Running 10 iterations. +thash avg. 1.07 us (0.00 sec); median 1,413 cycles, 1x: 1,413 cycles +f1600x avg. 1.88 us (0.00 sec); median 2,495 cycles, 1x: 2,495 cycles +thashx avg. 1.94 us (0.00 sec); median 2,577 cycles, 1x: 2,577 cycles +Generating keypair.. avg. 222396.34 us (0.22 sec); median 274,023,520 cycles, 1x: 274,023,520 cycles + - WOTS pk gen x (ideal).. avg. 1899.61 us (0.00 sec); median 2,141,709 cycles, 128x: 274,138,752 cycles + - WOTS pk gen x (real).. avg. 1890.39 us (0.00 sec); median 2,140,841 cycles, 128x: 274,027,648 cycles +Signing.. avg. 2113914.69 us (2.11 sec); median 2,458,736,965 cycles, 1x: 2,458,736,965 cycles + - FORS signing.. avg. 456524.75 us (0.46 sec); median 542,001,697 cycles, 1x: 542,001,697 cycles + - WOTS pk gen x (ideal).. avg. 1895.93 us (0.00 sec); median 2,131,561 cycles, 896x: 1,909,878,656 cycles + - WOTS pk gen x (real).. avg. 1882.01 us (0.00 sec); median 2,130,861 cycles, 896x: 1,909,251,456 cycles +Verifying.. avg. 
2188.28 us (0.00 sec); median 2,477,192 cycles, 1x: 2,477,192 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192s-simple_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192s-simple_x5 new file mode 100644 index 0000000..b7b64e6 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-192s-simple_x5 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16, way=5, tree height=9, wots_len=51 +Running 10 iterations. +thash avg. 1.08 us (0.00 sec); median 1,402 cycles, 1x: 1,402 cycles +f1600x avg. 5.37 us (0.00 sec); median 7,146 cycles, 1x: 7,146 cycles +thashx avg. 5.67 us (0.00 sec); median 7,547 cycles, 1x: 7,547 cycles +Generating keypair.. avg. 543584.27 us (0.54 sec); median 641,377,289 cycles, 1x: 641,377,289 cycles + - WOTS pk gen x (ideal).. avg. 5477.30 us (0.01 sec); median 6,225,928 cycles, 102x: 635,044,656 cycles + - WOTS pk gen x (real).. avg. 4666.02 us (0.00 sec); median 6,207,119 cycles, 103x: 639,333,257 cycles +Signing.. avg. 4810945.27 us (4.81 sec); median 5,729,139,825 cycles, 1x: 5,729,139,825 cycles + - FORS signing.. avg. 1049252.10 us (1.05 sec); median 1,245,791,216 cycles, 1x: 1,245,791,216 cycles + - WOTS pk gen x (ideal).. avg. 4685.27 us (0.00 sec); median 6,238,205 cycles, 716x: 4,466,554,780 cycles + - WOTS pk gen x (real).. avg. 4674.97 us (0.00 sec); median 6,235,113 cycles, 721x: 4,495,516,473 cycles +Verifying.. avg. 
3828.15 us (0.00 sec); median 5,095,100 cycles, 1x: 5,095,100 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256f-robust_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256f-robust_x3 new file mode 100644 index 0000000..1f58072 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256f-robust_x3 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16, way=3, tree height=4, wots_len=67 +Running 10 iterations. +thash avg. 2.13 us (0.00 sec); median 2,802 cycles, 1x: 2,802 cycles +f1600x avg. 3.48 us (0.00 sec); median 4,627 cycles, 1x: 4,627 cycles +thashx avg. 7.13 us (0.00 sec); median 9,499 cycles, 1x: 9,499 cycles +Generating keypair.. avg. 44866.23 us (0.04 sec); median 59,903,134 cycles, 1x: 59,903,134 cycles + - WOTS pk gen x (ideal).. avg. 7494.06 us (0.01 sec); median 9,985,217 cycles, 5x: 49,926,085 cycles + - WOTS pk gen x (real).. avg. 7485.61 us (0.01 sec); median 9,984,995 cycles, 6x: 59,909,970 cycles +Signing.. avg. 986020.46 us (0.99 sec); median 1,153,974,441 cycles, 1x: 1,153,974,441 cycles + - FORS signing.. avg. 119067.93 us (0.12 sec); median 136,209,244 cycles, 1x: 136,209,244 cycles + - WOTS pk gen x (ideal).. avg. 8736.72 us (0.01 sec); median 9,982,490 cycles, 90x: 898,424,100 cycles + - WOTS pk gen x (real).. avg. 8773.98 us (0.01 sec); median 9,976,039 cycles, 102x: 1,017,555,978 cycles +Verifying.. avg. 
26169.62 us (0.03 sec); median 29,955,963 cycles, 1x: 29,955,963 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256f-robust_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256f-robust_x4 new file mode 100644 index 0000000..58362ca --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256f-robust_x4 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16, way=4, tree height=4, wots_len=67 +Running 10 iterations. +thash avg. 2.12 us (0.00 sec); median 2,799 cycles, 1x: 2,799 cycles +f1600x avg. 1.75 us (0.00 sec); median 2,320 cycles, 1x: 2,320 cycles +thashx avg. 3.87 us (0.00 sec); median 5,132 cycles, 1x: 5,132 cycles +Generating keypair.. avg. 16276.15 us (0.02 sec); median 21,683,970 cycles, 1x: 21,683,970 cycles + - WOTS pk gen x (ideal).. avg. 4099.33 us (0.00 sec); median 5,449,624 cycles, 4x: 21,798,496 cycles + - WOTS pk gen x (real).. avg. 4069.92 us (0.00 sec); median 5,410,919 cycles, 4x: 21,643,676 cycles +Signing.. avg. 348168.84 us (0.35 sec); median 423,322,148 cycles, 1x: 423,322,148 cycles + - FORS signing.. avg. 50404.18 us (0.05 sec); median 57,660,060 cycles, 1x: 57,660,060 cycles + - WOTS pk gen x (ideal).. avg. 4748.89 us (0.00 sec); median 5,390,843 cycles, 68x: 366,577,324 cycles + - WOTS pk gen x (real).. avg. 4731.18 us (0.00 sec); median 5,387,139 cycles, 68x: 366,325,452 cycles +Verifying.. avg. 
11919.52 us (0.01 sec); median 13,605,986 cycles, 1x: 13,605,986 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256f-robust_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256f-robust_x5 new file mode 100644 index 0000000..3154803 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256f-robust_x5 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16, way=5, tree height=4, wots_len=67 +Running 10 iterations. +thash avg. 2.11 us (0.00 sec); median 2,799 cycles, 1x: 2,799 cycles +f1600x avg. 5.49 us (0.00 sec); median 7,310 cycles, 1x: 7,310 cycles +thashx avg. 11.18 us (0.00 sec); median 14,942 cycles, 1x: 14,942 cycles +Generating keypair.. avg. 47237.32 us (0.05 sec); median 63,093,704 cycles, 1x: 63,093,704 cycles + - WOTS pk gen x (ideal).. avg. 11824.43 us (0.01 sec); median 15,773,118 cycles, 3x: 47,319,354 cycles + - WOTS pk gen x (real).. avg. 11793.38 us (0.01 sec); median 15,750,024 cycles, 4x: 63,000,096 cycles +Signing.. avg. 1011302.95 us (1.01 sec); median 1,199,711,660 cycles, 1x: 1,199,711,660 cycles + - FORS signing.. avg. 115053.40 us (0.12 sec); median 131,516,919 cycles, 1x: 131,516,919 cycles + - WOTS pk gen x (ideal).. avg. 13787.71 us (0.01 sec); median 15,706,744 cycles, 54x: 848,164,176 cycles + - WOTS pk gen x (real).. avg. 13753.20 us (0.01 sec); median 15,709,845 cycles, 68x: 1,068,269,460 cycles +Verifying.. avg. 
25011.99 us (0.03 sec); median 28,529,915 cycles, 1x: 28,529,915 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256f-simple_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256f-simple_x3 new file mode 100644 index 0000000..b66184d --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256f-simple_x3 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16, way=3, tree height=4, wots_len=67 +Running 10 iterations. +thash avg. 1.08 us (0.00 sec); median 1,416 cycles, 1x: 1,416 cycles +f1600x avg. 3.48 us (0.00 sec); median 4,629 cycles, 1x: 4,629 cycles +thashx avg. 3.62 us (0.00 sec); median 4,807 cycles, 1x: 4,807 cycles +Generating keypair.. avg. 23668.75 us (0.02 sec); median 31,610,470 cycles, 1x: 31,610,470 cycles + - WOTS pk gen x (ideal).. avg. 3961.61 us (0.00 sec); median 5,272,572 cycles, 5x: 26,362,860 cycles + - WOTS pk gen x (real).. avg. 3965.09 us (0.00 sec); median 5,274,173 cycles, 6x: 31,645,038 cycles +Signing.. avg. 534277.67 us (0.53 sec); median 619,041,115 cycles, 1x: 619,041,115 cycles + - FORS signing.. avg. 73196.32 us (0.07 sec); median 83,721,808 cycles, 1x: 83,721,808 cycles + - WOTS pk gen x (ideal).. avg. 4631.47 us (0.00 sec); median 5,267,477 cycles, 90x: 474,072,930 cycles + - WOTS pk gen x (real).. avg. 4594.11 us (0.00 sec); median 5,237,291 cycles, 102x: 534,203,682 cycles +Verifying.. avg. 
13031.38 us (0.01 sec); median 14,903,705 cycles, 1x: 14,903,705 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256f-simple_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256f-simple_x4 new file mode 100644 index 0000000..2a0bd08 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256f-simple_x4 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16, way=4, tree height=4, wots_len=67 +Running 10 iterations. +thash avg. 1.08 us (0.00 sec); median 1,414 cycles, 1x: 1,414 cycles +f1600x avg. 1.74 us (0.00 sec); median 2,305 cycles, 1x: 2,305 cycles +thashx avg. 2.02 us (0.00 sec); median 2,676 cycles, 1x: 2,676 cycles +Generating keypair.. avg. 8746.67 us (0.01 sec); median 11,672,711 cycles, 1x: 11,672,711 cycles + - WOTS pk gen x (ideal).. avg. 2182.00 us (0.00 sec); median 2,896,869 cycles, 4x: 11,587,476 cycles + - WOTS pk gen x (real).. avg. 2186.45 us (0.00 sec); median 2,897,221 cycles, 4x: 11,588,884 cycles +Signing.. avg. 188339.11 us (0.19 sec); median 232,926,147 cycles, 1x: 232,926,147 cycles + - FORS signing.. avg. 31652.07 us (0.03 sec); median 36,120,705 cycles, 1x: 36,120,705 cycles + - WOTS pk gen x (ideal).. avg. 2551.54 us (0.00 sec); median 2,881,122 cycles, 68x: 195,916,296 cycles + - WOTS pk gen x (real).. avg. 2535.21 us (0.00 sec); median 2,881,112 cycles, 68x: 195,915,616 cycles +Verifying.. avg. 
6070.60 us (0.01 sec); median 6,908,906 cycles, 1x: 6,908,906 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256f-simple_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256f-simple_x5 new file mode 100644 index 0000000..b473fd8 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256f-simple_x5 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16, way=5, tree height=4, wots_len=67 +Running 10 iterations. +thash avg. 1.08 us (0.00 sec); median 1,421 cycles, 1x: 1,421 cycles +f1600x avg. 5.49 us (0.00 sec); median 7,320 cycles, 1x: 7,320 cycles +thashx avg. 5.68 us (0.00 sec); median 7,566 cycles, 1x: 7,566 cycles +Generating keypair.. avg. 24954.52 us (0.02 sec); median 33,325,833 cycles, 1x: 33,325,833 cycles + - WOTS pk gen x (ideal).. avg. 6238.47 us (0.01 sec); median 8,311,569 cycles, 3x: 24,934,707 cycles + - WOTS pk gen x (real).. avg. 6231.52 us (0.01 sec); median 8,307,134 cycles, 4x: 33,228,536 cycles +Signing.. avg. 548584.70 us (0.55 sec); median 644,023,858 cycles, 1x: 644,023,858 cycles + - FORS signing.. avg. 70474.10 us (0.07 sec); median 80,601,512 cycles, 1x: 80,601,512 cycles + - WOTS pk gen x (ideal).. avg. 7261.23 us (0.01 sec); median 8,292,598 cycles, 54x: 447,800,292 cycles + - WOTS pk gen x (real).. avg. 7258.16 us (0.01 sec); median 8,260,508 cycles, 68x: 561,714,544 cycles +Verifying.. avg. 
13018.30 us (0.01 sec); median 14,848,640 cycles, 1x: 14,848,640 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256s-robust_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256s-robust_x3 new file mode 100644 index 0000000..6a1d918 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256s-robust_x3 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16, way=3, tree height=8, wots_len=67 +Running 10 iterations. +thash avg. 2.13 us (0.00 sec); median 2,817 cycles, 1x: 2,817 cycles +f1600x avg. 3.48 us (0.00 sec); median 4,622 cycles, 1x: 4,622 cycles +thashx avg. 7.14 us (0.00 sec); median 9,517 cycles, 1x: 9,517 cycles +Generating keypair.. avg. 735890.85 us (0.74 sec); median 860,447,868 cycles, 1x: 860,447,868 cycles + - WOTS pk gen x (ideal).. avg. 8739.24 us (0.01 sec); median 9,976,383 cycles, 85x: 847,992,555 cycles + - WOTS pk gen x (real).. avg. 8764.78 us (0.01 sec); median 9,976,141 cycles, 86x: 857,948,126 cycles +Signing.. avg. 8154417.01 us (8.15 sec); median 9,599,740,659 cycles, 1x: 9,599,740,659 cycles + - FORS signing.. avg. 2351110.00 us (2.35 sec); median 2,730,265,827 cycles, 1x: 2,730,265,827 cycles + - WOTS pk gen x (ideal).. avg. 8743.02 us (0.01 sec); median 9,977,691 cycles, 682x: 6,804,785,262 cycles + - WOTS pk gen x (real).. avg. 8764.14 us (0.01 sec); median 9,976,856 cycles, 688x: 6,864,076,928 cycles +Verifying.. avg. 
13196.21 us (0.01 sec); median 14,940,822 cycles, 1x: 14,940,822 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256s-robust_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256s-robust_x4 new file mode 100644 index 0000000..3db1bd4 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256s-robust_x4 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16, way=4, tree height=8, wots_len=67 +Running 10 iterations. +thash avg. 2.12 us (0.00 sec); median 2,804 cycles, 1x: 2,804 cycles +f1600x avg. 1.75 us (0.00 sec); median 2,313 cycles, 1x: 2,313 cycles +thashx avg. 3.84 us (0.00 sec); median 5,124 cycles, 1x: 5,124 cycles +Generating keypair.. avg. 283833.76 us (0.28 sec); median 343,429,673 cycles, 1x: 343,429,673 cycles + - WOTS pk gen x (ideal).. avg. 4712.02 us (0.00 sec); median 5,365,026 cycles, 64x: 343,361,664 cycles + - WOTS pk gen x (real).. avg. 4683.21 us (0.00 sec); median 5,363,839 cycles, 64x: 343,285,696 cycles +Signing.. avg. 3308273.07 us (3.31 sec); median 3,901,695,356 cycles, 1x: 3,901,695,356 cycles + - FORS signing.. avg. 974179.35 us (0.97 sec); median 1,155,567,665 cycles, 1x: 1,155,567,665 cycles + - WOTS pk gen x (ideal).. avg. 4041.04 us (0.00 sec); median 5,374,012 cycles, 512x: 2,751,494,144 cycles + - WOTS pk gen x (real).. avg. 4049.22 us (0.00 sec); median 5,387,133 cycles, 512x: 2,758,212,096 cycles +Verifying.. avg. 
5173.74 us (0.01 sec); median 6,897,244 cycles, 1x: 6,897,244 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256s-robust_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256s-robust_x5 new file mode 100644 index 0000000..b5dabcc --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256s-robust_x5 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16, way=5, tree height=8, wots_len=67 +Running 10 iterations. +thash avg. 2.11 us (0.00 sec); median 2,797 cycles, 1x: 2,797 cycles +f1600x avg. 5.50 us (0.00 sec); median 7,326 cycles, 1x: 7,326 cycles +thashx avg. 11.16 us (0.00 sec); median 14,883 cycles, 1x: 14,883 cycles +Generating keypair.. avg. 655904.41 us (0.66 sec); median 818,789,755 cycles, 1x: 818,789,755 cycles + - WOTS pk gen x (ideal).. avg. 9147.82 us (0.01 sec); median 15,703,857 cycles, 51x: 800,896,707 cycles + - WOTS pk gen x (real).. avg. 12979.95 us (0.01 sec); median 15,706,360 cycles, 52x: 816,730,720 cycles +Signing.. avg. 7865602.90 us (7.87 sec); median 9,187,665,386 cycles, 1x: 9,187,665,386 cycles + - FORS signing.. avg. 2236659.04 us (2.24 sec); median 2,636,645,385 cycles, 1x: 2,636,645,385 cycles + - WOTS pk gen x (ideal).. avg. 13775.74 us (0.01 sec); median 15,734,920 cycles, 409x: 6,435,582,280 cycles + - WOTS pk gen x (real).. avg. 13783.95 us (0.01 sec); median 15,747,453 cycles, 416x: 6,550,940,448 cycles +Verifying.. avg. 
12986.30 us (0.01 sec); median 14,732,813 cycles, 1x: 14,732,813 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256s-simple_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256s-simple_x3 new file mode 100644 index 0000000..3acff66 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256s-simple_x3 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16, way=3, tree height=8, wots_len=67 +Running 10 iterations. +thash avg. 1.07 us (0.00 sec); median 1,406 cycles, 1x: 1,406 cycles +f1600x avg. 3.47 us (0.00 sec); median 4,614 cycles, 1x: 4,614 cycles +thashx avg. 3.63 us (0.00 sec); median 4,827 cycles, 1x: 4,827 cycles +Generating keypair.. avg. 343596.23 us (0.34 sec); median 451,392,290 cycles, 1x: 451,392,290 cycles + - WOTS pk gen x (ideal).. avg. 3065.64 us (0.00 sec); median 5,243,315 cycles, 85x: 445,681,775 cycles + - WOTS pk gen x (real).. avg. 3063.82 us (0.00 sec); median 5,244,527 cycles, 86x: 451,029,322 cycles +Signing.. avg. 4457190.28 us (4.46 sec); median 5,285,489,783 cycles, 1x: 5,285,489,783 cycles + - FORS signing.. avg. 1413658.87 us (1.41 sec); median 1,676,483,848 cycles, 1x: 1,676,483,848 cycles + - WOTS pk gen x (ideal).. avg. 3937.00 us (0.00 sec); median 5,237,883 cycles, 682x: 3,572,236,206 cycles + - WOTS pk gen x (real).. avg. 3944.81 us (0.00 sec); median 5,236,474 cycles, 688x: 3,602,694,112 cycles +Verifying.. avg. 
5727.00 us (0.01 sec); median 7,626,041 cycles, 1x: 7,626,041 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256s-simple_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256s-simple_x4 new file mode 100644 index 0000000..97cbbd2 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256s-simple_x4 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16, way=4, tree height=8, wots_len=67 +Running 10 iterations. +thash avg. 1.08 us (0.00 sec); median 1,419 cycles, 1x: 1,419 cycles +f1600x avg. 1.75 us (0.00 sec); median 2,307 cycles, 1x: 2,307 cycles +thashx avg. 2.03 us (0.00 sec); median 2,703 cycles, 1x: 2,703 cycles +Generating keypair.. avg. 145208.46 us (0.15 sec); median 185,243,467 cycles, 1x: 185,243,467 cycles + - WOTS pk gen x (ideal).. avg. 2555.43 us (0.00 sec); median 2,901,193 cycles, 64x: 185,676,352 cycles + - WOTS pk gen x (real).. avg. 2555.18 us (0.00 sec); median 2,902,405 cycles, 64x: 185,753,920 cycles +Signing.. avg. 1888270.59 us (1.89 sec); median 2,199,470,884 cycles, 1x: 2,199,470,884 cycles + - FORS signing.. avg. 611328.16 us (0.61 sec); median 719,234,988 cycles, 1x: 719,234,988 cycles + - WOTS pk gen x (ideal).. avg. 2579.62 us (0.00 sec); median 2,909,725 cycles, 512x: 1,489,779,200 cycles + - WOTS pk gen x (real).. avg. 2571.37 us (0.00 sec); median 2,909,591 cycles, 512x: 1,489,710,592 cycles +Verifying.. avg. 
3194.07 us (0.00 sec); median 3,624,944 cycles, 1x: 3,624,944 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256s-simple_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256s-simple_x5 new file mode 100644 index 0000000..ddc53b3 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A510/sphincs-shake-256s-simple_x5 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16, way=5, tree height=8, wots_len=67 +Running 10 iterations. +thash avg. 1.08 us (0.00 sec); median 1,416 cycles, 1x: 1,416 cycles +f1600x avg. 5.50 us (0.00 sec); median 7,315 cycles, 1x: 7,315 cycles +thashx avg. 5.70 us (0.00 sec); median 7,603 cycles, 1x: 7,603 cycles +Generating keypair.. avg. 341160.39 us (0.34 sec); median 428,395,159 cycles, 1x: 428,395,159 cycles + - WOTS pk gen x (ideal).. avg. 7225.55 us (0.01 sec); median 8,238,683 cycles, 51x: 420,172,833 cycles + - WOTS pk gen x (real).. avg. 7225.25 us (0.01 sec); median 8,238,451 cycles, 52x: 428,399,452 cycles +Signing.. avg. 4260022.25 us (4.26 sec); median 5,043,936,275 cycles, 1x: 5,043,936,275 cycles + - FORS signing.. avg. 1368002.80 us (1.37 sec); median 1,609,364,273 cycles, 1x: 1,609,364,273 cycles + - WOTS pk gen x (ideal).. avg. 6208.85 us (0.01 sec); median 8,272,536 cycles, 409x: 3,383,467,224 cycles + - WOTS pk gen x (real).. avg. 6207.04 us (0.01 sec); median 8,271,118 cycles, 416x: 3,440,785,088 cycles +Verifying.. avg. 
5372.62 us (0.01 sec); median 7,151,262 cycles, 1x: 7,151,262 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128f-robust_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128f-robust_x3 new file mode 100644 index 0000000..fa3f808 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128f-robust_x3 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16, way=3, tree height=3, wots_len=35 +Running 10 iterations. +thash avg. 2.11 us (0.00 sec); median 3,756 cycles, 1x: 3,756 cycles +f1600x avg. 3.24 us (0.00 sec); median 5,804 cycles, 1x: 5,804 cycles +thashx avg. 6.64 us (0.00 sec); median 11,912 cycles, 1x: 11,912 cycles +Generating keypair.. avg. 10949.70 us (0.01 sec); median 19,648,888 cycles, 1x: 19,648,888 cycles + - WOTS pk gen x (ideal).. avg. 3651.18 us (0.00 sec); median 6,541,900 cycles, 2x: 13,083,800 cycles + - WOTS pk gen x (real).. avg. 3644.81 us (0.00 sec); median 6,537,391 cycles, 3x: 19,612,173 cycles +Signing.. avg. 252068.08 us (0.25 sec); median 453,129,427 cycles, 1x: 453,129,427 cycles + - FORS signing.. avg. 11709.41 us (0.01 sec); median 21,042,918 cycles, 1x: 21,042,918 cycles + - WOTS pk gen x (ideal).. avg. 3656.19 us (0.00 sec); median 6,540,456 cycles, 58x: 379,346,448 cycles + - WOTS pk gen x (real).. avg. 3691.80 us (0.00 sec); median 6,542,472 cycles, 66x: 431,803,152 cycles +Verifying.. avg. 
15409.61 us (0.02 sec); median 27,072,116 cycles, 1x: 27,072,116 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128f-robust_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128f-robust_x4 new file mode 100644 index 0000000..6069392 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128f-robust_x4 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16, way=4, tree height=3, wots_len=35 +Running 10 iterations. +thash avg. 2.30 us (0.00 sec); median 3,747 cycles, 1x: 3,747 cycles +f1600x avg. 3.17 us (0.00 sec); median 5,693 cycles, 1x: 5,693 cycles +thashx avg. 6.62 us (0.00 sec); median 11,855 cycles, 1x: 11,855 cycles +Generating keypair.. avg. 7298.58 us (0.01 sec); median 13,077,870 cycles, 1x: 13,077,870 cycles + - WOTS pk gen x (ideal).. avg. 3644.19 us (0.00 sec); median 6,523,384 cycles, 2x: 13,046,768 cycles + - WOTS pk gen x (real).. avg. 3640.50 us (0.00 sec); median 6,522,208 cycles, 2x: 13,044,416 cycles +Signing.. avg. 169047.86 us (0.17 sec); median 304,188,086 cycles, 1x: 304,188,086 cycles + - FORS signing.. avg. 9041.01 us (0.01 sec); median 16,233,686 cycles, 1x: 16,233,686 cycles + - WOTS pk gen x (ideal).. avg. 3642.18 us (0.00 sec); median 6,522,389 cycles, 44x: 286,985,116 cycles + - WOTS pk gen x (real).. avg. 3656.89 us (0.00 sec); median 6,523,570 cycles, 44x: 287,037,080 cycles +Verifying.. avg. 
12165.58 us (0.01 sec); median 21,855,113 cycles, 1x: 21,855,113 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128f-robust_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128f-robust_x5 new file mode 100644 index 0000000..b18ea9e --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128f-robust_x5 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16, way=5, tree height=3, wots_len=35 +Running 10 iterations. +thash avg. 2.34 us (0.00 sec); median 3,750 cycles, 1x: 3,750 cycles +f1600x avg. 5.03 us (0.00 sec); median 9,014 cycles, 1x: 9,014 cycles +thashx avg. 10.32 us (0.00 sec); median 18,531 cycles, 1x: 18,531 cycles +Generating keypair.. avg. 11362.70 us (0.01 sec); median 20,382,274 cycles, 1x: 20,382,274 cycles + - WOTS pk gen x (ideal).. avg. 5672.41 us (0.01 sec); median 10,176,751 cycles, 1x: 10,176,751 cycles + - WOTS pk gen x (real).. avg. 5674.84 us (0.01 sec); median 10,175,713 cycles, 2x: 20,351,426 cycles +Signing.. avg. 260889.12 us (0.26 sec); median 468,277,335 cycles, 1x: 468,277,335 cycles + - FORS signing.. avg. 11132.97 us (0.01 sec); median 19,988,424 cycles, 1x: 19,988,424 cycles + - WOTS pk gen x (ideal).. avg. 5674.45 us (0.01 sec); median 10,176,775 cycles, 35x: 356,187,125 cycles + - WOTS pk gen x (real).. avg. 5675.85 us (0.01 sec); median 10,176,198 cycles, 44x: 447,752,712 cycles +Verifying.. avg. 
14756.79 us (0.01 sec); median 26,524,864 cycles, 1x: 26,524,864 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128f-simple_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128f-simple_x3 new file mode 100644 index 0000000..8b61ea4 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128f-simple_x3 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16, way=3, tree height=3, wots_len=35 +Running 10 iterations. +thash avg. 1.06 us (0.00 sec); median 1,895 cycles, 1x: 1,895 cycles +f1600x avg. 3.24 us (0.00 sec); median 5,803 cycles, 1x: 5,803 cycles +thashx avg. 3.35 us (0.00 sec); median 6,008 cycles, 1x: 6,008 cycles +Generating keypair.. avg. 5694.45 us (0.01 sec); median 10,211,396 cycles, 1x: 10,211,396 cycles + - WOTS pk gen x (ideal).. avg. 1906.09 us (0.00 sec); median 3,396,786 cycles, 2x: 6,793,572 cycles + - WOTS pk gen x (real).. avg. 1905.15 us (0.00 sec); median 3,396,811 cycles, 3x: 10,190,433 cycles +Signing.. avg. 132511.96 us (0.13 sec); median 237,720,306 cycles, 1x: 237,720,306 cycles + - FORS signing.. avg. 7189.00 us (0.01 sec); median 12,857,354 cycles, 1x: 12,857,354 cycles + - WOTS pk gen x (ideal).. avg. 1913.81 us (0.00 sec); median 3,398,836 cycles, 58x: 197,132,488 cycles + - WOTS pk gen x (real).. avg. 1905.74 us (0.00 sec); median 3,397,402 cycles, 66x: 224,228,532 cycles +Verifying.. avg. 
7414.17 us (0.01 sec); median 13,268,701 cycles, 1x: 13,268,701 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128f-simple_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128f-simple_x4 new file mode 100644 index 0000000..ef44e64 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128f-simple_x4 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16, way=4, tree height=3, wots_len=35 +Running 10 iterations. +thash avg. 1.06 us (0.00 sec); median 1,891 cycles, 1x: 1,891 cycles +f1600x avg. 3.19 us (0.00 sec); median 5,696 cycles, 1x: 5,696 cycles +thashx avg. 3.38 us (0.00 sec); median 5,995 cycles, 1x: 5,995 cycles +Generating keypair.. avg. 3835.93 us (0.00 sec); median 6,828,781 cycles, 1x: 6,828,781 cycles + - WOTS pk gen x (ideal).. avg. 1905.20 us (0.00 sec); median 3,395,494 cycles, 2x: 6,790,988 cycles + - WOTS pk gen x (real).. avg. 1901.41 us (0.00 sec); median 3,396,060 cycles, 2x: 6,792,120 cycles +Signing.. avg. 89016.31 us (0.09 sec); median 159,835,839 cycles, 1x: 159,835,839 cycles + - FORS signing.. avg. 5485.00 us (0.01 sec); median 9,835,577 cycles, 1x: 9,835,577 cycles + - WOTS pk gen x (ideal).. avg. 1904.81 us (0.00 sec); median 3,403,283 cycles, 44x: 149,744,452 cycles + - WOTS pk gen x (real).. avg. 1902.40 us (0.00 sec); median 3,401,261 cycles, 44x: 149,655,484 cycles +Verifying.. avg. 
5835.21 us (0.01 sec); median 10,466,594 cycles, 1x: 10,466,594 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128f-simple_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128f-simple_x5 new file mode 100644 index 0000000..cea67ac --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128f-simple_x5 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16, way=5, tree height=3, wots_len=35 +Running 10 iterations. +thash avg. 1.06 us (0.00 sec); median 1,896 cycles, 1x: 1,896 cycles +f1600x avg. 5.03 us (0.00 sec); median 9,015 cycles, 1x: 9,015 cycles +thashx avg. 5.21 us (0.00 sec); median 9,341 cycles, 1x: 9,341 cycles +Generating keypair.. avg. 5922.41 us (0.01 sec); median 10,601,710 cycles, 1x: 10,601,710 cycles + - WOTS pk gen x (ideal).. avg. 2963.85 us (0.00 sec); median 5,298,933 cycles, 1x: 5,298,933 cycles + - WOTS pk gen x (real).. avg. 2959.04 us (0.00 sec); median 5,299,278 cycles, 2x: 10,598,556 cycles +Signing.. avg. 136407.96 us (0.14 sec); median 245,317,772 cycles, 1x: 245,317,772 cycles + - FORS signing.. avg. 6781.60 us (0.01 sec); median 12,169,471 cycles, 1x: 12,169,471 cycles + - WOTS pk gen x (ideal).. avg. 2958.04 us (0.00 sec); median 5,297,525 cycles, 35x: 185,413,375 cycles + - WOTS pk gen x (real).. avg. 2959.96 us (0.00 sec); median 5,296,441 cycles, 44x: 233,043,404 cycles +Verifying.. avg. 
7518.26 us (0.01 sec); median 13,492,329 cycles, 1x: 13,492,329 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128s-robust_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128s-robust_x3 new file mode 100644 index 0000000..fc2cab9 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128s-robust_x3 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16, way=3, tree height=9, wots_len=35 +Running 10 iterations. +thash avg. 2.50 us (0.00 sec); median 3,757 cycles, 1x: 3,757 cycles +f1600x avg. 3.24 us (0.00 sec); median 5,803 cycles, 1x: 5,803 cycles +thashx avg. 6.63 us (0.00 sec); median 11,909 cycles, 1x: 11,909 cycles +Generating keypair.. avg. 622002.35 us (0.62 sec); median 1,119,527,374 cycles, 1x: 1,119,527,374 cycles + - WOTS pk gen x (ideal).. avg. 3650.62 us (0.00 sec); median 6,539,052 cycles, 170x: 1,111,638,840 cycles + - WOTS pk gen x (real).. avg. 3651.68 us (0.00 sec); median 6,538,569 cycles, 171x: 1,118,095,299 cycles +Signing.. avg. 4666163.74 us (4.67 sec); median 8,398,649,663 cycles, 1x: 8,398,649,663 cycles + - FORS signing.. avg. 312129.52 us (0.31 sec); median 561,847,354 cycles, 1x: 561,847,354 cycles + - WOTS pk gen x (ideal).. avg. 3651.84 us (0.00 sec); median 6,539,438 cycles, 1194x: 7,808,088,972 cycles + - WOTS pk gen x (real).. avg. 3647.39 us (0.00 sec); median 6,538,887 cycles, 1197x: 7,827,047,739 cycles +Verifying.. avg. 
4980.46 us (0.00 sec); median 8,923,238 cycles, 1x: 8,923,238 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128s-robust_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128s-robust_x4 new file mode 100644 index 0000000..4fae8a1 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128s-robust_x4 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16, way=4, tree height=9, wots_len=35 +Running 10 iterations. +thash avg. 2.10 us (0.00 sec); median 3,749 cycles, 1x: 3,749 cycles +f1600x avg. 3.18 us (0.00 sec); median 5,693 cycles, 1x: 5,693 cycles +thashx avg. 6.60 us (0.00 sec); median 11,846 cycles, 1x: 11,846 cycles +Generating keypair.. avg. 464425.41 us (0.46 sec); median 835,847,388 cycles, 1x: 835,847,388 cycles + - WOTS pk gen x (ideal).. avg. 3645.69 us (0.00 sec); median 6,524,308 cycles, 128x: 835,111,424 cycles + - WOTS pk gen x (real).. avg. 3640.20 us (0.00 sec); median 6,523,525 cycles, 128x: 835,011,200 cycles +Signing.. avg. 3488284.87 us (3.49 sec); median 6,278,826,089 cycles, 1x: 6,278,826,089 cycles + - FORS signing.. avg. 238092.83 us (0.24 sec); median 428,359,360 cycles, 1x: 428,359,360 cycles + - WOTS pk gen x (ideal).. avg. 3654.27 us (0.00 sec); median 6,521,686 cycles, 896x: 5,843,430,656 cycles + - WOTS pk gen x (real).. avg. 3638.83 us (0.00 sec); median 6,520,230 cycles, 896x: 5,842,126,080 cycles +Verifying.. avg. 
3864.23 us (0.00 sec); median 6,915,664 cycles, 1x: 6,915,664 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128s-robust_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128s-robust_x5 new file mode 100644 index 0000000..ab6e4c1 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128s-robust_x5 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16, way=5, tree height=9, wots_len=35 +Running 10 iterations. +thash avg. 2.10 us (0.00 sec); median 3,756 cycles, 1x: 3,756 cycles +f1600x avg. 5.02 us (0.00 sec); median 9,016 cycles, 1x: 9,016 cycles +thashx avg. 10.32 us (0.00 sec); median 18,537 cycles, 1x: 18,537 cycles +Generating keypair.. avg. 583603.83 us (0.58 sec); median 1,050,457,714 cycles, 1x: 1,050,457,714 cycles + - WOTS pk gen x (ideal).. avg. 5673.63 us (0.01 sec); median 10,182,295 cycles, 102x: 1,038,594,090 cycles + - WOTS pk gen x (real).. avg. 5676.49 us (0.01 sec); median 10,181,550 cycles, 103x: 1,048,699,650 cycles +Signing.. avg. 4385377.29 us (4.39 sec); median 7,892,229,007 cycles, 1x: 7,892,229,007 cycles + - FORS signing.. avg. 299474.44 us (0.30 sec); median 539,170,689 cycles, 1x: 539,170,689 cycles + - WOTS pk gen x (ideal).. avg. 5675.24 us (0.01 sec); median 10,182,480 cycles, 716x: 7,290,655,680 cycles + - WOTS pk gen x (real).. avg. 5676.58 us (0.01 sec); median 10,180,823 cycles, 721x: 7,340,373,383 cycles +Verifying.. avg. 
4997.77 us (0.00 sec); median 8,958,859 cycles, 1x: 8,958,859 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128s-simple_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128s-simple_x3 new file mode 100644 index 0000000..79cfa92 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128s-simple_x3 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16, way=3, tree height=9, wots_len=35 +Running 10 iterations. +thash avg. 1.06 us (0.00 sec); median 1,892 cycles, 1x: 1,892 cycles +f1600x avg. 3.25 us (0.00 sec); median 5,805 cycles, 1x: 5,805 cycles +thashx avg. 3.35 us (0.00 sec); median 6,002 cycles, 1x: 6,002 cycles +Generating keypair.. avg. 323464.59 us (0.32 sec); median 581,990,056 cycles, 1x: 581,990,056 cycles + - WOTS pk gen x (ideal).. avg. 1929.80 us (0.00 sec); median 3,402,134 cycles, 170x: 578,362,780 cycles + - WOTS pk gen x (real).. avg. 2050.64 us (0.00 sec); median 3,404,206 cycles, 171x: 582,119,226 cycles +Signing.. avg. 2455410.80 us (2.46 sec); median 4,422,416,514 cycles, 1x: 4,422,416,514 cycles + - FORS signing.. avg. 189801.86 us (0.19 sec); median 341,701,269 cycles, 1x: 341,701,269 cycles + - WOTS pk gen x (ideal).. avg. 1905.82 us (0.00 sec); median 3,405,407 cycles, 1194x: 4,066,055,958 cycles + - WOTS pk gen x (real).. avg. 1901.78 us (0.00 sec); median 3,402,416 cycles, 1197x: 4,072,691,952 cycles +Verifying.. avg. 
2497.55 us (0.00 sec); median 4,465,601 cycles, 1x: 4,465,601 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128s-simple_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128s-simple_x4 new file mode 100644 index 0000000..62da4fb --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128s-simple_x4 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16, way=4, tree height=9, wots_len=35 +Running 10 iterations. +thash avg. 1.12 us (0.00 sec); median 1,894 cycles, 1x: 1,894 cycles +f1600x avg. 3.17 us (0.00 sec); median 5,686 cycles, 1x: 5,686 cycles +thashx avg. 3.34 us (0.00 sec); median 5,982 cycles, 1x: 5,982 cycles +Generating keypair.. avg. 242263.11 us (0.24 sec); median 435,909,330 cycles, 1x: 435,909,330 cycles + - WOTS pk gen x (ideal).. avg. 1903.68 us (0.00 sec); median 3,398,373 cycles, 128x: 434,991,744 cycles + - WOTS pk gen x (real).. avg. 1905.27 us (0.00 sec); median 3,406,762 cycles, 128x: 436,065,536 cycles +Signing.. avg. 1839875.47 us (1.84 sec); median 3,311,809,970 cycles, 1x: 3,311,809,970 cycles + - FORS signing.. avg. 144352.77 us (0.14 sec); median 259,830,572 cycles, 1x: 259,830,572 cycles + - WOTS pk gen x (ideal).. avg. 1902.79 us (0.00 sec); median 3,398,784 cycles, 896x: 3,045,310,464 cycles + - WOTS pk gen x (real).. avg. 1902.72 us (0.00 sec); median 3,398,347 cycles, 896x: 3,044,918,912 cycles +Verifying.. avg. 
2018.57 us (0.00 sec); median 3,608,149 cycles, 1x: 3,608,149 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128s-simple_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128s-simple_x5 new file mode 100644 index 0000000..d4cbd70 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-128s-simple_x5 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16, way=5, tree height=9, wots_len=35 +Running 10 iterations. +thash avg. 1.06 us (0.00 sec); median 1,891 cycles, 1x: 1,891 cycles +f1600x avg. 5.02 us (0.00 sec); median 9,013 cycles, 1x: 9,013 cycles +thashx avg. 5.21 us (0.00 sec); median 9,349 cycles, 1x: 9,349 cycles +Generating keypair.. avg. 303396.97 us (0.30 sec); median 546,143,175 cycles, 1x: 546,143,175 cycles + - WOTS pk gen x (ideal).. avg. 2963.20 us (0.00 sec); median 5,303,839 cycles, 102x: 540,991,578 cycles + - WOTS pk gen x (real).. avg. 2963.23 us (0.00 sec); median 5,303,010 cycles, 103x: 546,210,030 cycles +Signing.. avg. 2305213.89 us (2.31 sec); median 4,149,832,962 cycles, 1x: 4,149,832,962 cycles + - FORS signing.. avg. 181583.78 us (0.18 sec); median 326,606,622 cycles, 1x: 326,606,622 cycles + - WOTS pk gen x (ideal).. avg. 2970.36 us (0.00 sec); median 5,303,864 cycles, 716x: 3,797,566,624 cycles + - WOTS pk gen x (real).. avg. 2962.19 us (0.00 sec); median 5,303,933 cycles, 721x: 3,824,135,693 cycles +Verifying.. avg. 
2408.33 us (0.00 sec); median 4,308,013 cycles, 1x: 4,308,013 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192f-robust_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192f-robust_x3 new file mode 100644 index 0000000..9006ae2 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192f-robust_x3 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16, way=3, tree height=3, wots_len=51 +Running 10 iterations. +thash avg. 2.29 us (0.00 sec); median 3,782 cycles, 1x: 3,782 cycles +f1600x avg. 3.24 us (0.00 sec); median 5,806 cycles, 1x: 5,806 cycles +thashx avg. 6.71 us (0.00 sec); median 12,009 cycles, 1x: 12,009 cycles +Generating keypair.. avg. 16094.17 us (0.02 sec); median 28,916,146 cycles, 1x: 28,916,146 cycles + - WOTS pk gen x (ideal).. avg. 5373.89 us (0.01 sec); median 9,634,189 cycles, 2x: 19,268,378 cycles + - WOTS pk gen x (real).. avg. 5369.72 us (0.01 sec); median 9,634,978 cycles, 3x: 28,904,934 cycles +Signing.. avg. 400013.32 us (0.40 sec); median 720,030,020 cycles, 1x: 720,030,020 cycles + - FORS signing.. avg. 46654.95 us (0.05 sec); median 83,969,737 cycles, 1x: 83,969,737 cycles + - WOTS pk gen x (ideal).. avg. 5373.40 us (0.01 sec); median 9,633,412 cycles, 58x: 558,737,896 cycles + - WOTS pk gen x (real).. avg. 5369.90 us (0.01 sec); median 9,635,422 cycles, 66x: 635,937,852 cycles +Verifying.. avg. 
21087.26 us (0.02 sec); median 37,892,507 cycles, 1x: 37,892,507 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192f-robust_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192f-robust_x4 new file mode 100644 index 0000000..88d68bb --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192f-robust_x4 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16, way=4, tree height=3, wots_len=51 +Running 10 iterations. +thash avg. 2.22 us (0.00 sec); median 3,788 cycles, 1x: 3,788 cycles +f1600x avg. 3.18 us (0.00 sec); median 5,693 cycles, 1x: 5,693 cycles +thashx avg. 6.66 us (0.00 sec); median 11,955 cycles, 1x: 11,955 cycles +Generating keypair.. avg. 10756.05 us (0.01 sec); median 19,277,403 cycles, 1x: 19,277,403 cycles + - WOTS pk gen x (ideal).. avg. 5381.09 us (0.01 sec); median 9,654,396 cycles, 2x: 19,308,792 cycles + - WOTS pk gen x (real).. avg. 5384.89 us (0.01 sec); median 9,657,289 cycles, 2x: 19,314,578 cycles +Signing.. avg. 271346.87 us (0.27 sec); median 488,193,765 cycles, 1x: 488,193,765 cycles + - FORS signing.. avg. 35736.89 us (0.04 sec); median 64,303,158 cycles, 1x: 64,303,158 cycles + - WOTS pk gen x (ideal).. avg. 5362.94 us (0.01 sec); median 9,616,564 cycles, 44x: 423,128,816 cycles + - WOTS pk gen x (real).. avg. 5363.51 us (0.01 sec); median 9,617,467 cycles, 44x: 423,168,548 cycles +Verifying.. avg. 
16644.37 us (0.02 sec); median 29,913,235 cycles, 1x: 29,913,235 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192f-robust_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192f-robust_x5 new file mode 100644 index 0000000..50e1086 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192f-robust_x5 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16, way=5, tree height=3, wots_len=51 +Running 10 iterations. +thash avg. 2.11 us (0.00 sec); median 3,781 cycles, 1x: 3,781 cycles +f1600x avg. 5.03 us (0.00 sec); median 9,016 cycles, 1x: 9,016 cycles +thashx avg. 10.39 us (0.00 sec); median 18,661 cycles, 1x: 18,661 cycles +Generating keypair.. avg. 16712.63 us (0.02 sec); median 29,998,180 cycles, 1x: 29,998,180 cycles + - WOTS pk gen x (ideal).. avg. 8347.97 us (0.01 sec); median 14,983,020 cycles, 1x: 14,983,020 cycles + - WOTS pk gen x (real).. avg. 8344.49 us (0.01 sec); median 14,983,213 cycles, 2x: 29,966,426 cycles +Signing.. avg. 411559.27 us (0.41 sec); median 740,723,528 cycles, 1x: 740,723,528 cycles + - FORS signing.. avg. 44961.54 us (0.04 sec); median 80,907,698 cycles, 1x: 80,907,698 cycles + - WOTS pk gen x (ideal).. avg. 8347.04 us (0.01 sec); median 14,984,520 cycles, 35x: 524,458,200 cycles + - WOTS pk gen x (real).. avg. 8345.03 us (0.01 sec); median 14,981,443 cycles, 44x: 659,183,492 cycles +Verifying.. avg. 
20582.37 us (0.02 sec); median 36,998,686 cycles, 1x: 36,998,686 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192f-simple_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192f-simple_x3 new file mode 100644 index 0000000..8c3e0e3 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192f-simple_x3 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16, way=3, tree height=3, wots_len=51 +Running 10 iterations. +thash avg. 1.07 us (0.00 sec); median 1,909 cycles, 1x: 1,909 cycles +f1600x avg. 3.23 us (0.00 sec); median 5,768 cycles, 1x: 5,768 cycles +thashx avg. 3.39 us (0.00 sec); median 6,087 cycles, 1x: 6,087 cycles +Generating keypair.. avg. 8425.21 us (0.01 sec); median 15,112,951 cycles, 1x: 15,112,951 cycles + - WOTS pk gen x (ideal).. avg. 2816.88 us (0.00 sec); median 5,037,647 cycles, 2x: 10,075,294 cycles + - WOTS pk gen x (real).. avg. 2810.65 us (0.00 sec); median 5,036,087 cycles, 3x: 15,108,261 cycles +Signing.. avg. 213162.91 us (0.21 sec); median 383,659,234 cycles, 1x: 383,659,234 cycles + - FORS signing.. avg. 28506.41 us (0.03 sec); median 51,284,917 cycles, 1x: 51,284,917 cycles + - WOTS pk gen x (ideal).. avg. 2813.59 us (0.00 sec); median 5,034,777 cycles, 58x: 292,017,066 cycles + - WOTS pk gen x (real).. avg. 2811.65 us (0.00 sec); median 5,034,566 cycles, 66x: 332,281,356 cycles +Verifying.. avg. 
10724.55 us (0.01 sec); median 19,267,578 cycles, 1x: 19,267,578 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192f-simple_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192f-simple_x4 new file mode 100644 index 0000000..100f6cd --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192f-simple_x4 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16, way=4, tree height=3, wots_len=51 +Running 10 iterations. +thash avg. 1.08 us (0.00 sec); median 1,910 cycles, 1x: 1,910 cycles +f1600x avg. 3.18 us (0.00 sec); median 5,690 cycles, 1x: 5,690 cycles +thashx avg. 3.39 us (0.00 sec); median 6,070 cycles, 1x: 6,070 cycles +Generating keypair.. avg. 5654.72 us (0.01 sec); median 10,117,805 cycles, 1x: 10,117,805 cycles + - WOTS pk gen x (ideal).. avg. 2818.31 us (0.00 sec); median 5,045,755 cycles, 2x: 10,091,510 cycles + - WOTS pk gen x (real).. avg. 2819.38 us (0.00 sec); median 5,046,034 cycles, 2x: 10,092,068 cycles +Signing.. avg. 145434.74 us (0.15 sec); median 261,680,353 cycles, 1x: 261,680,353 cycles + - FORS signing.. avg. 21774.07 us (0.02 sec); median 39,145,703 cycles, 1x: 39,145,703 cycles + - WOTS pk gen x (ideal).. avg. 2819.41 us (0.00 sec); median 5,045,688 cycles, 44x: 222,010,272 cycles + - WOTS pk gen x (real).. avg. 2818.82 us (0.00 sec); median 5,045,862 cycles, 44x: 222,017,928 cycles +Verifying.. avg. 
8333.13 us (0.01 sec); median 14,967,391 cycles, 1x: 14,967,391 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192f-simple_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192f-simple_x5 new file mode 100644 index 0000000..3d46e91 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192f-simple_x5 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16, way=5, tree height=3, wots_len=51 +Running 10 iterations. +thash avg. 1.45 us (0.00 sec); median 1,912 cycles, 1x: 1,912 cycles +f1600x avg. 5.07 us (0.00 sec); median 8,959 cycles, 1x: 8,959 cycles +thashx avg. 5.27 us (0.00 sec); median 9,459 cycles, 1x: 9,459 cycles +Generating keypair.. avg. 8748.58 us (0.01 sec); median 15,690,431 cycles, 1x: 15,690,431 cycles + - WOTS pk gen x (ideal).. avg. 4376.60 us (0.00 sec); median 7,836,846 cycles, 1x: 7,836,846 cycles + - WOTS pk gen x (real).. avg. 4392.55 us (0.00 sec); median 7,838,217 cycles, 2x: 15,676,434 cycles +Signing.. avg. 219291.15 us (0.22 sec); median 394,332,711 cycles, 1x: 394,332,711 cycles + - FORS signing.. avg. 27356.32 us (0.03 sec); median 49,209,905 cycles, 1x: 49,209,905 cycles + - WOTS pk gen x (ideal).. avg. 4373.34 us (0.00 sec); median 7,838,999 cycles, 35x: 274,364,965 cycles + - WOTS pk gen x (real).. avg. 4373.74 us (0.00 sec); median 7,839,032 cycles, 44x: 344,917,408 cycles +Verifying.. avg. 
10340.21 us (0.01 sec); median 18,546,624 cycles, 1x: 18,546,624 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192s-robust_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192s-robust_x3 new file mode 100644 index 0000000..66b9200 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192s-robust_x3 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16, way=3, tree height=9, wots_len=51 +Running 10 iterations. +thash avg. 2.21 us (0.00 sec); median 3,783 cycles, 1x: 3,783 cycles +f1600x avg. 3.24 us (0.00 sec); median 5,804 cycles, 1x: 5,804 cycles +thashx avg. 6.69 us (0.00 sec); median 12,003 cycles, 1x: 12,003 cycles +Generating keypair.. avg. 915767.64 us (0.92 sec); median 1,648,522,765 cycles, 1x: 1,648,522,765 cycles + - WOTS pk gen x (ideal).. avg. 5371.67 us (0.01 sec); median 9,629,517 cycles, 170x: 1,637,017,890 cycles + - WOTS pk gen x (real).. avg. 5370.36 us (0.01 sec); median 9,630,383 cycles, 171x: 1,646,795,493 cycles +Signing.. avg. 7943882.56 us (7.94 sec); median 14,299,201,577 cycles, 1x: 14,299,201,577 cycles + - FORS signing.. avg. 1532260.07 us (1.53 sec); median 2,758,466,216 cycles, 1x: 2,758,466,216 cycles + - WOTS pk gen x (ideal).. avg. 5372.77 us (0.01 sec); median 9,630,552 cycles, 1194x: 11,498,879,088 cycles + - WOTS pk gen x (real).. avg. 5374.25 us (0.01 sec); median 9,631,565 cycles, 1197x: 11,528,983,305 cycles +Verifying.. avg. 
6951.15 us (0.01 sec); median 12,469,581 cycles, 1x: 12,469,581 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192s-robust_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192s-robust_x4 new file mode 100644 index 0000000..c83c984 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192s-robust_x4 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16, way=4, tree height=9, wots_len=51 +Running 10 iterations. +thash avg. 2.11 us (0.00 sec); median 3,782 cycles, 1x: 3,782 cycles +f1600x avg. 3.17 us (0.00 sec); median 5,690 cycles, 1x: 5,690 cycles +thashx avg. 6.66 us (0.00 sec); median 11,957 cycles, 1x: 11,957 cycles +Generating keypair.. avg. 685393.64 us (0.69 sec); median 1,233,327,133 cycles, 1x: 1,233,327,133 cycles + - WOTS pk gen x (ideal).. avg. 5372.24 us (0.01 sec); median 9,625,238 cycles, 128x: 1,232,030,464 cycles + - WOTS pk gen x (real).. avg. 5377.80 us (0.01 sec); median 9,657,019 cycles, 128x: 1,236,098,432 cycles +Signing.. avg. 5964720.58 us (5.96 sec); median 10,737,166,587 cycles, 1x: 10,737,166,587 cycles + - FORS signing.. avg. 1168114.77 us (1.17 sec); median 2,103,026,809 cycles, 1x: 2,103,026,809 cycles + - WOTS pk gen x (ideal).. avg. 5371.34 us (0.01 sec); median 9,622,340 cycles, 896x: 8,621,616,640 cycles + - WOTS pk gen x (real).. avg. 5379.98 us (0.01 sec); median 9,659,184 cycles, 896x: 8,654,628,864 cycles +Verifying.. avg. 
5639.60 us (0.01 sec); median 10,106,128 cycles, 1x: 10,106,128 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192s-robust_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192s-robust_x5 new file mode 100644 index 0000000..80b181f --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192s-robust_x5 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16, way=5, tree height=9, wots_len=51 +Running 10 iterations. +thash avg. 2.12 us (0.00 sec); median 3,784 cycles, 1x: 3,784 cycles +f1600x avg. 5.03 us (0.00 sec); median 9,012 cycles, 1x: 9,012 cycles +thashx avg. 10.39 us (0.00 sec); median 18,660 cycles, 1x: 18,660 cycles +Generating keypair.. avg. 859422.57 us (0.86 sec); median 1,545,166,999 cycles, 1x: 1,545,166,999 cycles + - WOTS pk gen x (ideal).. avg. 8347.13 us (0.01 sec); median 14,981,538 cycles, 102x: 1,528,116,876 cycles + - WOTS pk gen x (real).. avg. 8357.60 us (0.01 sec); median 14,980,994 cycles, 103x: 1,543,042,382 cycles +Signing.. avg. 7479310.40 us (7.48 sec); median 13,461,559,199 cycles, 1x: 13,461,559,199 cycles + - FORS signing.. avg. 1469785.39 us (1.47 sec); median 2,645,278,255 cycles, 1x: 2,645,278,255 cycles + - WOTS pk gen x (ideal).. avg. 8354.04 us (0.01 sec); median 14,980,442 cycles, 716x: 10,725,996,472 cycles + - WOTS pk gen x (real).. avg. 8341.50 us (0.01 sec); median 14,977,845 cycles, 721x: 10,799,026,245 cycles +Verifying.. avg. 
6843.81 us (0.01 sec); median 12,274,876 cycles, 1x: 12,274,876 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192s-simple_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192s-simple_x3 new file mode 100644 index 0000000..1980763 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192s-simple_x3 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16, way=3, tree height=9, wots_len=51 +Running 10 iterations. +thash avg. 1.13 us (0.00 sec); median 1,911 cycles, 1x: 1,911 cycles +f1600x avg. 3.22 us (0.00 sec); median 5,766 cycles, 1x: 5,766 cycles +thashx avg. 3.39 us (0.00 sec); median 6,077 cycles, 1x: 6,077 cycles +Generating keypair.. avg. 479364.84 us (0.48 sec); median 862,627,079 cycles, 1x: 862,627,079 cycles + - WOTS pk gen x (ideal).. avg. 2816.92 us (0.00 sec); median 5,039,353 cycles, 170x: 856,690,010 cycles + - WOTS pk gen x (real).. avg. 2814.55 us (0.00 sec); median 5,039,979 cycles, 171x: 861,836,409 cycles +Signing.. avg. 4289309.81 us (4.29 sec); median 7,721,109,247 cycles, 1x: 7,721,109,247 cycles + - FORS signing.. avg. 934412.88 us (0.93 sec); median 1,682,313,977 cycles, 1x: 1,682,313,977 cycles + - WOTS pk gen x (ideal).. avg. 2815.07 us (0.00 sec); median 5,038,326 cycles, 1194x: 6,015,761,244 cycles + - WOTS pk gen x (real).. avg. 2813.76 us (0.00 sec); median 5,037,594 cycles, 1197x: 6,030,000,018 cycles +Verifying.. avg. 
3716.60 us (0.00 sec); median 6,657,124 cycles, 1x: 6,657,124 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192s-simple_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192s-simple_x4 new file mode 100644 index 0000000..f9203ac --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192s-simple_x4 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16, way=4, tree height=9, wots_len=51 +Running 10 iterations. +thash avg. 1.07 us (0.00 sec); median 1,910 cycles, 1x: 1,910 cycles +f1600x avg. 3.18 us (0.00 sec); median 5,695 cycles, 1x: 5,695 cycles +thashx avg. 3.39 us (0.00 sec); median 6,069 cycles, 1x: 6,069 cycles +Generating keypair.. avg. 359600.54 us (0.36 sec); median 647,125,764 cycles, 1x: 647,125,764 cycles + - WOTS pk gen x (ideal).. avg. 2814.98 us (0.00 sec); median 5,040,321 cycles, 128x: 645,161,088 cycles + - WOTS pk gen x (real).. avg. 2817.88 us (0.00 sec); median 5,041,628 cycles, 128x: 645,328,384 cycles +Signing.. avg. 3229339.31 us (3.23 sec); median 5,813,184,781 cycles, 1x: 5,813,184,781 cycles + - FORS signing.. avg. 711571.06 us (0.71 sec); median 1,281,056,874 cycles, 1x: 1,281,056,874 cycles + - WOTS pk gen x (ideal).. avg. 2825.02 us (0.00 sec); median 5,044,330 cycles, 896x: 4,519,719,680 cycles + - WOTS pk gen x (real).. avg. 2816.58 us (0.00 sec); median 5,044,415 cycles, 896x: 4,519,795,840 cycles +Verifying.. avg. 
2821.18 us (0.00 sec); median 5,040,001 cycles, 1x: 5,040,001 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192s-simple_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192s-simple_x5 new file mode 100644 index 0000000..d9da3b1 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-192s-simple_x5 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16, way=5, tree height=9, wots_len=51 +Running 10 iterations. +thash avg. 1.07 us (0.00 sec); median 1,913 cycles, 1x: 1,913 cycles +f1600x avg. 5.00 us (0.00 sec); median 8,957 cycles, 1x: 8,957 cycles +thashx avg. 5.27 us (0.00 sec); median 9,464 cycles, 1x: 9,464 cycles +Generating keypair.. avg. 449542.34 us (0.45 sec); median 808,173,879 cycles, 1x: 808,173,879 cycles + - WOTS pk gen x (ideal).. avg. 4388.35 us (0.00 sec); median 7,831,331 cycles, 102x: 798,795,762 cycles + - WOTS pk gen x (real).. avg. 4363.58 us (0.00 sec); median 7,829,300 cycles, 103x: 806,417,900 cycles +Signing.. avg. 4036062.86 us (4.04 sec); median 7,263,556,068 cycles, 1x: 7,263,556,068 cycles + - FORS signing.. avg. 892054.91 us (0.89 sec); median 1,606,042,039 cycles, 1x: 1,606,042,039 cycles + - WOTS pk gen x (ideal).. avg. 4382.64 us (0.00 sec); median 7,838,247 cycles, 716x: 5,612,184,852 cycles + - WOTS pk gen x (real).. avg. 4370.06 us (0.00 sec); median 7,836,350 cycles, 721x: 5,650,008,350 cycles +Verifying.. avg. 
3602.71 us (0.00 sec); median 6,448,200 cycles, 1x: 6,448,200 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256f-robust_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256f-robust_x3 new file mode 100644 index 0000000..56cf8ae --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256f-robust_x3 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16, way=3, tree height=4, wots_len=67 +Running 10 iterations. +thash avg. 2.39 us (0.00 sec); median 3,789 cycles, 1x: 3,789 cycles +f1600x avg. 3.24 us (0.00 sec); median 5,806 cycles, 1x: 5,806 cycles +thashx avg. 6.72 us (0.00 sec); median 12,058 cycles, 1x: 12,058 cycles +Generating keypair.. avg. 42610.76 us (0.04 sec); median 76,620,046 cycles, 1x: 76,620,046 cycles + - WOTS pk gen x (ideal).. avg. 7108.84 us (0.01 sec); median 12,759,531 cycles, 5x: 63,797,655 cycles + - WOTS pk gen x (real).. avg. 7105.59 us (0.01 sec); median 12,756,800 cycles, 6x: 76,540,800 cycles +Signing.. avg. 822651.03 us (0.82 sec); median 1,480,585,174 cycles, 1x: 1,480,585,174 cycles + - FORS signing.. avg. 99003.17 us (0.10 sec); median 178,164,006 cycles, 1x: 178,164,006 cycles + - WOTS pk gen x (ideal).. avg. 7109.19 us (0.01 sec); median 12,758,464 cycles, 90x: 1,148,261,760 cycles + - WOTS pk gen x (real).. avg. 7108.95 us (0.01 sec); median 12,759,368 cycles, 102x: 1,301,455,536 cycles +Verifying.. avg. 
21716.22 us (0.02 sec); median 39,036,579 cycles, 1x: 39,036,579 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256f-robust_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256f-robust_x4 new file mode 100644 index 0000000..2e60e3f --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256f-robust_x4 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16, way=4, tree height=4, wots_len=67 +Running 10 iterations. +thash avg. 2.13 us (0.00 sec); median 3,790 cycles, 1x: 3,790 cycles +f1600x avg. 3.18 us (0.00 sec); median 5,694 cycles, 1x: 5,694 cycles +thashx avg. 6.71 us (0.00 sec); median 12,076 cycles, 1x: 12,076 cycles +Generating keypair.. avg. 28515.04 us (0.03 sec); median 51,222,934 cycles, 1x: 51,222,934 cycles + - WOTS pk gen x (ideal).. avg. 7123.91 us (0.01 sec); median 12,775,511 cycles, 4x: 51,102,044 cycles + - WOTS pk gen x (real).. avg. 7125.37 us (0.01 sec); median 12,776,980 cycles, 4x: 51,107,920 cycles +Signing.. avg. 560637.62 us (0.56 sec); median 1,008,762,031 cycles, 1x: 1,008,762,031 cycles + - FORS signing.. avg. 76168.92 us (0.08 sec); median 137,129,343 cycles, 1x: 137,129,343 cycles + - WOTS pk gen x (ideal).. avg. 7127.91 us (0.01 sec); median 12,778,248 cycles, 68x: 868,920,864 cycles + - WOTS pk gen x (real).. avg. 7141.29 us (0.01 sec); median 12,827,771 cycles, 68x: 872,288,428 cycles +Verifying.. avg. 
16402.59 us (0.02 sec); median 29,471,928 cycles, 1x: 29,471,928 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256f-robust_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256f-robust_x5 new file mode 100644 index 0000000..1c8776e --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256f-robust_x5 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16, way=5, tree height=4, wots_len=67 +Running 10 iterations. +thash avg. 2.13 us (0.00 sec); median 3,790 cycles, 1x: 3,790 cycles +f1600x avg. 5.02 us (0.00 sec); median 9,016 cycles, 1x: 9,016 cycles +thashx avg. 10.44 us (0.00 sec); median 18,768 cycles, 1x: 18,768 cycles +Generating keypair.. avg. 44238.55 us (0.04 sec); median 79,545,936 cycles, 1x: 79,545,936 cycles + - WOTS pk gen x (ideal).. avg. 11056.13 us (0.01 sec); median 19,861,646 cycles, 3x: 59,584,938 cycles + - WOTS pk gen x (real).. avg. 11058.10 us (0.01 sec); median 19,862,386 cycles, 4x: 79,449,544 cycles +Signing.. avg. 846394.93 us (0.85 sec); median 1,523,256,812 cycles, 1x: 1,523,256,812 cycles + - FORS signing.. avg. 95075.44 us (0.10 sec); median 171,129,769 cycles, 1x: 171,129,769 cycles + - WOTS pk gen x (ideal).. avg. 11062.72 us (0.01 sec); median 19,863,765 cycles, 54x: 1,072,643,310 cycles + - WOTS pk gen x (real).. avg. 11057.50 us (0.01 sec); median 19,860,006 cycles, 68x: 1,350,480,408 cycles +Verifying.. avg. 
20394.50 us (0.02 sec); median 36,666,887 cycles, 1x: 36,666,887 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256f-simple_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256f-simple_x3 new file mode 100644 index 0000000..de56c6c --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256f-simple_x3 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16, way=3, tree height=4, wots_len=67 +Running 10 iterations. +thash avg. 1.21 us (0.00 sec); median 1,917 cycles, 1x: 1,917 cycles +f1600x avg. 3.24 us (0.00 sec); median 5,804 cycles, 1x: 5,804 cycles +thashx avg. 3.42 us (0.00 sec); median 6,131 cycles, 1x: 6,131 cycles +Generating keypair.. avg. 22385.66 us (0.02 sec); median 40,234,869 cycles, 1x: 40,234,869 cycles + - WOTS pk gen x (ideal).. avg. 3735.00 us (0.00 sec); median 6,691,763 cycles, 5x: 33,458,815 cycles + - WOTS pk gen x (real).. avg. 3744.92 us (0.00 sec); median 6,693,439 cycles, 6x: 40,160,634 cycles +Signing.. avg. 440361.29 us (0.44 sec); median 792,675,755 cycles, 1x: 792,675,755 cycles + - FORS signing.. avg. 60416.07 us (0.06 sec); median 108,736,567 cycles, 1x: 108,736,567 cycles + - WOTS pk gen x (ideal).. avg. 3744.40 us (0.00 sec); median 6,691,692 cycles, 90x: 602,252,280 cycles + - WOTS pk gen x (real).. avg. 3735.11 us (0.00 sec); median 6,692,827 cycles, 102x: 682,668,354 cycles +Verifying.. avg. 
10870.69 us (0.01 sec); median 19,524,716 cycles, 1x: 19,524,716 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256f-simple_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256f-simple_x4 new file mode 100644 index 0000000..74618f1 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256f-simple_x4 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16, way=4, tree height=4, wots_len=67 +Running 10 iterations. +thash avg. 1.27 us (0.00 sec); median 1,916 cycles, 1x: 1,916 cycles +f1600x avg. 3.24 us (0.00 sec); median 5,693 cycles, 1x: 5,693 cycles +thashx avg. 3.43 us (0.00 sec); median 6,142 cycles, 1x: 6,142 cycles +Generating keypair.. avg. 15038.00 us (0.02 sec); median 27,032,209 cycles, 1x: 27,032,209 cycles + - WOTS pk gen x (ideal).. avg. 3763.94 us (0.00 sec); median 6,744,279 cycles, 4x: 26,977,116 cycles + - WOTS pk gen x (real).. avg. 3762.36 us (0.00 sec); median 6,737,199 cycles, 4x: 26,948,796 cycles +Signing.. avg. 301402.45 us (0.30 sec); median 542,274,610 cycles, 1x: 542,274,610 cycles + - FORS signing.. avg. 46538.04 us (0.05 sec); median 83,735,615 cycles, 1x: 83,735,615 cycles + - WOTS pk gen x (ideal).. avg. 3761.35 us (0.00 sec); median 6,736,242 cycles, 68x: 458,064,456 cycles + - WOTS pk gen x (real).. avg. 3759.39 us (0.00 sec); median 6,737,328 cycles, 68x: 458,138,304 cycles +Verifying.. avg. 
8606.41 us (0.01 sec); median 15,430,770 cycles, 1x: 15,430,770 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256f-simple_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256f-simple_x5 new file mode 100644 index 0000000..70c6241 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256f-simple_x5 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16, way=5, tree height=4, wots_len=67 +Running 10 iterations. +thash avg. 1.20 us (0.00 sec); median 1,914 cycles, 1x: 1,914 cycles +f1600x avg. 5.03 us (0.00 sec); median 9,013 cycles, 1x: 9,013 cycles +thashx avg. 5.31 us (0.00 sec); median 9,530 cycles, 1x: 9,530 cycles +Generating keypair.. avg. 23201.24 us (0.02 sec); median 41,692,122 cycles, 1x: 41,692,122 cycles + - WOTS pk gen x (ideal).. avg. 5816.55 us (0.01 sec); median 10,431,684 cycles, 3x: 31,295,052 cycles + - WOTS pk gen x (real).. avg. 5814.89 us (0.01 sec); median 10,429,098 cycles, 4x: 41,716,392 cycles +Signing.. avg. 451590.25 us (0.45 sec); median 812,700,044 cycles, 1x: 812,700,044 cycles + - FORS signing.. avg. 57795.46 us (0.06 sec); median 104,011,280 cycles, 1x: 104,011,280 cycles + - WOTS pk gen x (ideal).. avg. 5814.57 us (0.01 sec); median 10,428,329 cycles, 54x: 563,129,766 cycles + - WOTS pk gen x (real).. avg. 5820.22 us (0.01 sec); median 10,431,302 cycles, 68x: 709,328,536 cycles +Verifying.. avg. 
10916.89 us (0.01 sec); median 19,611,515 cycles, 1x: 19,611,515 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256s-robust_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256s-robust_x3 new file mode 100644 index 0000000..53b505e --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256s-robust_x3 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16, way=3, tree height=8, wots_len=67 +Running 10 iterations. +thash avg. 2.22 us (0.00 sec); median 3,790 cycles, 1x: 3,790 cycles +f1600x avg. 3.24 us (0.00 sec); median 5,805 cycles, 1x: 5,805 cycles +thashx avg. 6.71 us (0.00 sec); median 12,055 cycles, 1x: 12,055 cycles +Generating keypair.. avg. 610147.93 us (0.61 sec); median 1,098,254,852 cycles, 1x: 1,098,254,852 cycles + - WOTS pk gen x (ideal).. avg. 7107.08 us (0.01 sec); median 12,758,762 cycles, 85x: 1,084,494,770 cycles + - WOTS pk gen x (real).. avg. 7111.01 us (0.01 sec); median 12,760,337 cycles, 86x: 1,097,388,982 cycles +Signing.. avg. 6868322.81 us (6.87 sec); median 12,363,393,445 cycles, 1x: 12,363,393,445 cycles + - FORS signing.. avg. 1987088.62 us (1.99 sec); median 3,577,250,507 cycles, 1x: 3,577,250,507 cycles + - WOTS pk gen x (ideal).. avg. 7132.59 us (0.01 sec); median 12,762,137 cycles, 682x: 8,703,777,434 cycles + - WOTS pk gen x (real).. avg. 7117.75 us (0.01 sec); median 12,758,487 cycles, 688x: 8,777,839,056 cycles +Verifying.. avg. 
10744.15 us (0.01 sec); median 19,267,105 cycles, 1x: 19,267,105 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256s-robust_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256s-robust_x4 new file mode 100644 index 0000000..93b90ac --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256s-robust_x4 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16, way=4, tree height=8, wots_len=67 +Running 10 iterations. +thash avg. 2.34 us (0.00 sec); median 3,786 cycles, 1x: 3,786 cycles +f1600x avg. 3.18 us (0.00 sec); median 5,687 cycles, 1x: 5,687 cycles +thashx avg. 6.71 us (0.00 sec); median 12,051 cycles, 1x: 12,051 cycles +Generating keypair.. avg. 454923.21 us (0.45 sec); median 818,424,311 cycles, 1x: 818,424,311 cycles + - WOTS pk gen x (ideal).. avg. 7128.53 us (0.01 sec); median 12,792,890 cycles, 64x: 818,744,960 cycles + - WOTS pk gen x (real).. avg. 7126.64 us (0.01 sec); median 12,789,297 cycles, 64x: 818,515,008 cycles +Signing.. avg. 5163065.51 us (5.16 sec); median 9,292,099,877 cycles, 1x: 9,292,099,877 cycles + - FORS signing.. avg. 1524141.34 us (1.52 sec); median 2,743,957,181 cycles, 1x: 2,743,957,181 cycles + - WOTS pk gen x (ideal).. avg. 7138.80 us (0.01 sec); median 12,790,476 cycles, 512x: 6,548,723,712 cycles + - WOTS pk gen x (real).. avg. 7140.30 us (0.01 sec); median 12,789,763 cycles, 512x: 6,548,358,656 cycles +Verifying.. avg. 
8142.27 us (0.01 sec); median 14,609,700 cycles, 1x: 14,609,700 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256s-robust_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256s-robust_x5 new file mode 100644 index 0000000..5eb46ca --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256s-robust_x5 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16, way=5, tree height=8, wots_len=67 +Running 10 iterations. +thash avg. 2.12 us (0.00 sec); median 3,787 cycles, 1x: 3,787 cycles +f1600x avg. 5.03 us (0.00 sec); median 9,016 cycles, 1x: 9,016 cycles +thashx avg. 10.43 us (0.00 sec); median 18,752 cycles, 1x: 18,752 cycles +Generating keypair.. avg. 574798.13 us (0.57 sec); median 1,034,393,311 cycles, 1x: 1,034,393,311 cycles + - WOTS pk gen x (ideal).. avg. 11059.34 us (0.01 sec); median 19,865,697 cycles, 51x: 1,013,150,547 cycles + - WOTS pk gen x (real).. avg. 11060.31 us (0.01 sec); median 19,870,633 cycles, 52x: 1,033,272,916 cycles +Signing.. avg. 6503223.78 us (6.50 sec); median 11,705,267,580 cycles, 1x: 11,705,267,580 cycles + - FORS signing.. avg. 1905736.66 us (1.91 sec); median 3,430,603,975 cycles, 1x: 3,430,603,975 cycles + - WOTS pk gen x (ideal).. avg. 11067.28 us (0.01 sec); median 19,870,500 cycles, 409x: 8,127,034,500 cycles + - WOTS pk gen x (real).. avg. 11063.51 us (0.01 sec); median 19,869,656 cycles, 416x: 8,265,776,896 cycles +Verifying.. avg. 
9836.07 us (0.01 sec); median 17,651,520 cycles, 1x: 17,651,520 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256s-simple_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256s-simple_x3 new file mode 100644 index 0000000..2b6b29d --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256s-simple_x3 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16, way=3, tree height=8, wots_len=67 +Running 10 iterations. +thash avg. 1.13 us (0.00 sec); median 1,911 cycles, 1x: 1,911 cycles +f1600x avg. 3.24 us (0.00 sec); median 5,804 cycles, 1x: 5,804 cycles +thashx avg. 3.42 us (0.00 sec); median 6,131 cycles, 1x: 6,131 cycles +Generating keypair.. avg. 319930.32 us (0.32 sec); median 575,868,390 cycles, 1x: 575,868,390 cycles + - WOTS pk gen x (ideal).. avg. 3734.31 us (0.00 sec); median 6,694,360 cycles, 85x: 569,020,600 cycles + - WOTS pk gen x (real).. avg. 3748.46 us (0.00 sec); median 6,695,355 cycles, 86x: 575,800,530 cycles +Signing.. avg. 3774759.69 us (3.77 sec); median 6,794,383,294 cycles, 1x: 6,794,383,294 cycles + - FORS signing.. avg. 1215088.38 us (1.22 sec); median 2,187,388,790 cycles, 1x: 2,187,388,790 cycles + - WOTS pk gen x (ideal).. avg. 3736.57 us (0.00 sec); median 6,691,617 cycles, 682x: 4,563,682,794 cycles + - WOTS pk gen x (real).. avg. 3735.58 us (0.00 sec); median 6,691,843 cycles, 688x: 4,603,987,984 cycles +Verifying.. avg. 
5401.01 us (0.01 sec); median 9,680,380 cycles, 1x: 9,680,380 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256s-simple_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256s-simple_x4 new file mode 100644 index 0000000..033b903 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256s-simple_x4 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16, way=4, tree height=8, wots_len=67 +Running 10 iterations. +thash avg. 1.07 us (0.00 sec); median 1,916 cycles, 1x: 1,916 cycles +f1600x avg. 3.18 us (0.00 sec); median 5,694 cycles, 1x: 5,694 cycles +thashx avg. 3.43 us (0.00 sec); median 6,148 cycles, 1x: 6,148 cycles +Generating keypair.. avg. 240054.91 us (0.24 sec); median 432,119,694 cycles, 1x: 432,119,694 cycles + - WOTS pk gen x (ideal).. avg. 3767.27 us (0.00 sec); median 6,740,330 cycles, 64x: 431,381,120 cycles + - WOTS pk gen x (real).. avg. 3759.23 us (0.00 sec); median 6,736,382 cycles, 64x: 431,128,448 cycles +Signing.. avg. 2853437.84 us (2.85 sec); median 5,136,105,131 cycles, 1x: 5,136,105,131 cycles + - FORS signing.. avg. 932053.75 us (0.93 sec); median 1,678,029,926 cycles, 1x: 1,678,029,926 cycles + - WOTS pk gen x (ideal).. avg. 3771.21 us (0.00 sec); median 6,736,475 cycles, 512x: 3,449,075,200 cycles + - WOTS pk gen x (real).. avg. 3761.95 us (0.00 sec); median 6,736,865 cycles, 512x: 3,449,274,880 cycles +Verifying.. avg. 
4118.86 us (0.00 sec); median 7,375,926 cycles, 1x: 7,375,926 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256s-simple_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256s-simple_x5 new file mode 100644 index 0000000..2c2cacb --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A55/sphincs-shake-256s-simple_x5 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16, way=5, tree height=8, wots_len=67 +Running 10 iterations. +thash avg. 1.08 us (0.00 sec); median 1,916 cycles, 1x: 1,916 cycles +f1600x avg. 5.03 us (0.00 sec); median 9,018 cycles, 1x: 9,018 cycles +thashx avg. 5.32 us (0.00 sec); median 9,543 cycles, 1x: 9,543 cycles +Generating keypair.. avg. 301351.99 us (0.30 sec); median 542,346,525 cycles, 1x: 542,346,525 cycles + - WOTS pk gen x (ideal).. avg. 5809.62 us (0.01 sec); median 10,421,212 cycles, 51x: 531,481,812 cycles + - WOTS pk gen x (real).. avg. 5830.68 us (0.01 sec); median 10,425,465 cycles, 52x: 542,124,180 cycles +Signing.. avg. 3569774.68 us (3.57 sec); median 6,424,934,516 cycles, 1x: 6,424,934,516 cycles + - FORS signing.. avg. 1158724.12 us (1.16 sec); median 2,085,928,061 cycles, 1x: 2,085,928,061 cycles + - WOTS pk gen x (ideal).. avg. 5820.70 us (0.01 sec); median 10,423,478 cycles, 409x: 4,263,202,502 cycles + - WOTS pk gen x (real).. avg. 5808.67 us (0.01 sec); median 10,424,147 cycles, 416x: 4,336,445,152 cycles +Verifying.. avg. 
5102.54 us (0.01 sec); median 9,134,900 cycles, 1x: 9,134,900 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128f-robust_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128f-robust_x3 new file mode 100644 index 0000000..ba2c44c --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128f-robust_x3 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16, way=3, tree height=3, wots_len=35 +Running 10 iterations. +thash avg. 1.23 us (0.00 sec); median 1,866 cycles, 1x: 1,866 cycles +f1600x avg. 1.01 us (0.00 sec); median 1,534 cycles, 1x: 1,534 cycles +thashx avg. 2.10 us (0.00 sec); median 3,195 cycles, 1x: 3,195 cycles +Generating keypair.. avg. 3464.68 us (0.00 sec); median 5,282,973 cycles, 1x: 5,282,973 cycles + - WOTS pk gen x (ideal).. avg. 1152.97 us (0.00 sec); median 1,753,925 cycles, 2x: 3,507,850 cycles + - WOTS pk gen x (real).. avg. 1154.34 us (0.00 sec); median 1,753,711 cycles, 3x: 5,261,133 cycles +Signing.. avg. 71960.69 us (0.07 sec); median 123,722,884 cycles, 1x: 123,722,884 cycles + - FORS signing.. avg. 3921.63 us (0.00 sec); median 7,489,600 cycles, 1x: 7,489,600 cycles + - WOTS pk gen x (ideal).. avg. 922.74 us (0.00 sec); median 1,753,776 cycles, 58x: 101,719,008 cycles + - WOTS pk gen x (real).. avg. 920.87 us (0.00 sec); median 1,753,665 cycles, 66x: 115,741,890 cycles +Verifying.. avg. 
3847.87 us (0.00 sec); median 7,347,743 cycles, 1x: 7,347,743 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128f-robust_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128f-robust_x4 new file mode 100644 index 0000000..5a75969 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128f-robust_x4 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16, way=4, tree height=3, wots_len=35 +Running 10 iterations. +thash avg. 0.98 us (0.00 sec); median 1,868 cycles, 1x: 1,868 cycles +f1600x avg. 0.85 us (0.00 sec); median 1,606 cycles, 1x: 1,606 cycles +thashx avg. 1.78 us (0.00 sec); median 3,385 cycles, 1x: 3,385 cycles +Generating keypair.. avg. 1965.87 us (0.00 sec); median 3,743,485 cycles, 1x: 3,743,485 cycles + - WOTS pk gen x (ideal).. avg. 983.26 us (0.00 sec); median 1,873,056 cycles, 2x: 3,746,112 cycles + - WOTS pk gen x (real).. avg. 985.63 us (0.00 sec); median 1,873,275 cycles, 2x: 3,746,550 cycles +Signing.. avg. 45505.91 us (0.05 sec); median 87,051,944 cycles, 1x: 87,051,944 cycles + - FORS signing.. avg. 2462.04 us (0.00 sec); median 4,693,564 cycles, 1x: 4,693,564 cycles + - WOTS pk gen x (ideal).. avg. 985.99 us (0.00 sec); median 1,873,360 cycles, 44x: 82,427,840 cycles + - WOTS pk gen x (real).. avg. 986.37 us (0.00 sec); median 1,872,955 cycles, 44x: 82,410,020 cycles +Verifying.. avg. 
3178.46 us (0.00 sec); median 6,071,225 cycles, 1x: 6,071,225 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128f-robust_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128f-robust_x5 new file mode 100644 index 0000000..702c9e6 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128f-robust_x5 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16, way=5, tree height=3, wots_len=35 +Running 10 iterations. +thash avg. 0.98 us (0.00 sec); median 1,869 cycles, 1x: 1,869 cycles +f1600x avg. 1.33 us (0.00 sec); median 2,534 cycles, 1x: 2,534 cycles +thashx avg. 2.79 us (0.00 sec); median 5,326 cycles, 1x: 5,326 cycles +Generating keypair.. avg. 3079.21 us (0.00 sec); median 5,879,342 cycles, 1x: 5,879,342 cycles + - WOTS pk gen x (ideal).. avg. 1539.77 us (0.00 sec); median 2,931,613 cycles, 1x: 2,931,613 cycles + - WOTS pk gen x (real).. avg. 1537.87 us (0.00 sec); median 2,931,729 cycles, 2x: 5,863,458 cycles +Signing.. avg. 71452.17 us (0.07 sec); median 136,735,685 cycles, 1x: 136,735,685 cycles + - FORS signing.. avg. 3908.34 us (0.00 sec); median 7,470,259 cycles, 1x: 7,470,259 cycles + - WOTS pk gen x (ideal).. avg. 1539.18 us (0.00 sec); median 2,931,816 cycles, 35x: 102,613,560 cycles + - WOTS pk gen x (real).. avg. 1540.62 us (0.00 sec); median 2,932,517 cycles, 44x: 129,030,748 cycles +Verifying.. avg. 
4087.32 us (0.00 sec); median 7,813,105 cycles, 1x: 7,813,105 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128f-simple_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128f-simple_x3 new file mode 100644 index 0000000..4cce64d --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128f-simple_x3 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16, way=3, tree height=3, wots_len=35 +Running 10 iterations. +thash avg. 0.64 us (0.00 sec); median 959 cycles, 1x: 959 cycles +f1600x avg. 1.01 us (0.00 sec); median 1,533 cycles, 1x: 1,533 cycles +thashx avg. 1.06 us (0.00 sec); median 1,610 cycles, 1x: 1,610 cycles +Generating keypair.. avg. 1804.13 us (0.00 sec); median 2,742,504 cycles, 1x: 2,742,504 cycles + - WOTS pk gen x (ideal).. avg. 600.59 us (0.00 sec); median 911,509 cycles, 2x: 1,823,018 cycles + - WOTS pk gen x (real).. avg. 603.54 us (0.00 sec); median 911,506 cycles, 3x: 2,734,518 cycles +Signing.. avg. 42352.82 us (0.04 sec); median 64,763,928 cycles, 1x: 64,763,928 cycles + - FORS signing.. avg. 2892.57 us (0.00 sec); median 4,413,513 cycles, 1x: 4,413,513 cycles + - WOTS pk gen x (ideal).. avg. 602.39 us (0.00 sec); median 911,429 cycles, 58x: 52,862,882 cycles + - WOTS pk gen x (real).. avg. 604.05 us (0.00 sec); median 911,347 cycles, 66x: 60,148,902 cycles +Verifying.. avg. 
2399.85 us (0.00 sec); median 3,660,239 cycles, 1x: 3,660,239 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128f-simple_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128f-simple_x4 new file mode 100644 index 0000000..95df979 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128f-simple_x4 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16, way=4, tree height=3, wots_len=35 +Running 10 iterations. +thash avg. 0.63 us (0.00 sec); median 959 cycles, 1x: 959 cycles +f1600x avg. 1.06 us (0.00 sec); median 1,606 cycles, 1x: 1,606 cycles +thashx avg. 1.13 us (0.00 sec); median 1,719 cycles, 1x: 1,719 cycles +Generating keypair.. avg. 1288.95 us (0.00 sec); median 1,958,576 cycles, 1x: 1,958,576 cycles + - WOTS pk gen x (ideal).. avg. 647.79 us (0.00 sec); median 975,697 cycles, 2x: 1,951,394 cycles + - WOTS pk gen x (real).. avg. 642.56 us (0.00 sec); median 976,097 cycles, 2x: 1,952,194 cycles +Signing.. avg. 30064.12 us (0.03 sec); median 45,976,376 cycles, 1x: 45,976,376 cycles + - FORS signing.. avg. 1873.66 us (0.00 sec); median 2,856,258 cycles, 1x: 2,856,258 cycles + - WOTS pk gen x (ideal).. avg. 644.91 us (0.00 sec); median 975,751 cycles, 44x: 42,933,044 cycles + - WOTS pk gen x (real).. avg. 642.07 us (0.00 sec); median 975,935 cycles, 44x: 42,941,140 cycles +Verifying.. avg. 
2050.39 us (0.00 sec); median 3,123,707 cycles, 1x: 3,123,707 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128f-simple_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128f-simple_x5 new file mode 100644 index 0000000..47f91b4 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128f-simple_x5 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16, way=5, tree height=3, wots_len=35 +Running 10 iterations. +thash avg. 0.63 us (0.00 sec); median 958 cycles, 1x: 958 cycles +f1600x avg. 1.66 us (0.00 sec); median 2,535 cycles, 1x: 2,535 cycles +thashx avg. 1.76 us (0.00 sec); median 2,687 cycles, 1x: 2,687 cycles +Generating keypair.. avg. 2006.44 us (0.00 sec); median 3,054,145 cycles, 1x: 3,054,145 cycles + - WOTS pk gen x (ideal).. avg. 1003.21 us (0.00 sec); median 1,524,215 cycles, 1x: 1,524,215 cycles + - WOTS pk gen x (real).. avg. 1001.43 us (0.00 sec); median 1,523,704 cycles, 2x: 3,047,408 cycles +Signing.. avg. 46823.34 us (0.05 sec); median 71,611,073 cycles, 1x: 71,611,073 cycles + - FORS signing.. avg. 2888.81 us (0.00 sec); median 4,405,237 cycles, 1x: 4,405,237 cycles + - WOTS pk gen x (ideal).. avg. 1002.79 us (0.00 sec); median 1,523,426 cycles, 35x: 53,319,910 cycles + - WOTS pk gen x (real).. avg. 1004.15 us (0.00 sec); median 1,523,709 cycles, 44x: 67,043,196 cycles +Verifying.. avg. 
2597.62 us (0.00 sec); median 3,963,202 cycles, 1x: 3,963,202 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128s-robust_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128s-robust_x3 new file mode 100644 index 0000000..25428e7 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128s-robust_x3 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16, way=3, tree height=9, wots_len=35 +Running 10 iterations. +thash avg. 0.98 us (0.00 sec); median 1,868 cycles, 1x: 1,868 cycles +f1600x avg. 0.81 us (0.00 sec); median 1,532 cycles, 1x: 1,532 cycles +thashx avg. 1.67 us (0.00 sec); median 3,193 cycles, 1x: 3,193 cycles +Generating keypair.. avg. 157331.19 us (0.16 sec); median 301,142,284 cycles, 1x: 301,142,284 cycles + - WOTS pk gen x (ideal).. avg. 923.99 us (0.00 sec); median 1,754,022 cycles, 170x: 298,183,740 cycles + - WOTS pk gen x (real).. avg. 922.17 us (0.00 sec); median 1,752,595 cycles, 171x: 299,693,745 cycles +Signing.. avg. 1207432.35 us (1.21 sec); median 2,310,203,569 cycles, 1x: 2,310,203,569 cycles + - FORS signing.. avg. 105190.56 us (0.11 sec); median 201,206,039 cycles, 1x: 201,206,039 cycles + - WOTS pk gen x (ideal).. avg. 925.89 us (0.00 sec); median 1,753,767 cycles, 1194x: 2,093,997,798 cycles + - WOTS pk gen x (real).. avg. 921.35 us (0.00 sec); median 1,753,328 cycles, 1197x: 2,098,733,616 cycles +Verifying.. avg. 
1436.41 us (0.00 sec); median 2,728,421 cycles, 1x: 2,728,421 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128s-robust_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128s-robust_x4 new file mode 100644 index 0000000..c480b05 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128s-robust_x4 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16, way=4, tree height=9, wots_len=35 +Running 10 iterations. +thash avg. 0.98 us (0.00 sec); median 1,866 cycles, 1x: 1,866 cycles +f1600x avg. 0.84 us (0.00 sec); median 1,604 cycles, 1x: 1,604 cycles +thashx avg. 1.77 us (0.00 sec); median 3,385 cycles, 1x: 3,385 cycles +Generating keypair.. avg. 125237.82 us (0.13 sec); median 239,633,901 cycles, 1x: 239,633,901 cycles + - WOTS pk gen x (ideal).. avg. 985.05 us (0.00 sec); median 1,873,067 cycles, 128x: 239,752,576 cycles + - WOTS pk gen x (real).. avg. 983.00 us (0.00 sec); median 1,872,724 cycles, 128x: 239,708,672 cycles +Signing.. avg. 940932.64 us (0.94 sec); median 1,800,720,086 cycles, 1x: 1,800,720,086 cycles + - FORS signing.. avg. 64348.08 us (0.06 sec); median 123,130,224 cycles, 1x: 123,130,224 cycles + - WOTS pk gen x (ideal).. avg. 988.93 us (0.00 sec); median 1,873,645 cycles, 896x: 1,678,785,920 cycles + - WOTS pk gen x (real).. avg. 984.54 us (0.00 sec); median 1,872,926 cycles, 896x: 1,678,141,696 cycles +Verifying.. avg. 
1130.98 us (0.00 sec); median 2,147,065 cycles, 1x: 2,147,065 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128s-robust_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128s-robust_x5 new file mode 100644 index 0000000..735ed67 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128s-robust_x5 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16, way=5, tree height=9, wots_len=35 +Running 10 iterations. +thash avg. 0.98 us (0.00 sec); median 1,867 cycles, 1x: 1,867 cycles +f1600x avg. 1.33 us (0.00 sec); median 2,542 cycles, 1x: 2,542 cycles +thashx avg. 2.79 us (0.00 sec); median 5,334 cycles, 1x: 5,334 cycles +Generating keypair.. avg. 158522.60 us (0.16 sec); median 303,418,188 cycles, 1x: 303,418,188 cycles + - WOTS pk gen x (ideal).. avg. 1539.53 us (0.00 sec); median 2,936,179 cycles, 102x: 299,490,258 cycles + - WOTS pk gen x (real).. avg. 1541.20 us (0.00 sec); median 2,936,174 cycles, 103x: 302,425,922 cycles +Signing.. avg. 1215502.64 us (1.22 sec); median 2,326,090,360 cycles, 1x: 2,326,090,360 cycles + - FORS signing.. avg. 105650.41 us (0.11 sec); median 202,202,560 cycles, 1x: 202,202,560 cycles + - WOTS pk gen x (ideal).. avg. 1543.88 us (0.00 sec); median 2,936,131 cycles, 716x: 2,102,269,796 cycles + - WOTS pk gen x (real).. avg. 1540.86 us (0.00 sec); median 2,936,236 cycles, 721x: 2,117,026,156 cycles +Verifying.. avg. 
1464.87 us (0.00 sec); median 2,784,970 cycles, 1x: 2,784,970 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128s-simple_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128s-simple_x3 new file mode 100644 index 0000000..efbe527 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128s-simple_x3 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16, way=3, tree height=9, wots_len=35 +Running 10 iterations. +thash avg. 0.51 us (0.00 sec); median 958 cycles, 1x: 958 cycles +f1600x avg. 0.81 us (0.00 sec); median 1,534 cycles, 1x: 1,534 cycles +thashx avg. 0.84 us (0.00 sec); median 1,608 cycles, 1x: 1,608 cycles +Generating keypair.. avg. 81732.62 us (0.08 sec); median 156,394,464 cycles, 1x: 156,394,464 cycles + - WOTS pk gen x (ideal).. avg. 485.21 us (0.00 sec); median 911,981 cycles, 170x: 155,036,770 cycles + - WOTS pk gen x (real).. avg. 481.65 us (0.00 sec); median 911,885 cycles, 171x: 155,932,335 cycles +Signing.. avg. 633776.99 us (0.63 sec); median 1,212,723,428 cycles, 1x: 1,212,723,428 cycles + - FORS signing.. avg. 61605.25 us (0.06 sec); median 117,853,437 cycles, 1x: 117,853,437 cycles + - WOTS pk gen x (ideal).. avg. 482.07 us (0.00 sec); median 911,432 cycles, 1194x: 1,088,249,808 cycles + - WOTS pk gen x (real).. avg. 482.12 us (0.00 sec); median 911,459 cycles, 1197x: 1,091,016,423 cycles +Verifying.. avg. 
678.29 us (0.00 sec); median 1,287,091 cycles, 1x: 1,287,091 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128s-simple_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128s-simple_x4 new file mode 100644 index 0000000..c54520a --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128s-simple_x4 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16, way=4, tree height=9, wots_len=35 +Running 10 iterations. +thash avg. 0.51 us (0.00 sec); median 959 cycles, 1x: 959 cycles +f1600x avg. 0.84 us (0.00 sec); median 1,605 cycles, 1x: 1,605 cycles +thashx avg. 0.91 us (0.00 sec); median 1,714 cycles, 1x: 1,714 cycles +Generating keypair.. avg. 65244.64 us (0.07 sec); median 124,863,071 cycles, 1x: 124,863,071 cycles + - WOTS pk gen x (ideal).. avg. 513.75 us (0.00 sec); median 975,336 cycles, 128x: 124,843,008 cycles + - WOTS pk gen x (real).. avg. 515.23 us (0.00 sec); median 975,172 cycles, 128x: 124,822,016 cycles +Signing.. avg. 496008.34 us (0.50 sec); median 949,221,503 cycles, 1x: 949,221,503 cycles + - FORS signing.. avg. 39201.64 us (0.04 sec); median 75,026,022 cycles, 1x: 75,026,022 cycles + - WOTS pk gen x (ideal).. avg. 515.47 us (0.00 sec); median 975,456 cycles, 896x: 874,008,576 cycles + - WOTS pk gen x (real).. avg. 513.65 us (0.00 sec); median 975,250 cycles, 896x: 873,824,000 cycles +Verifying.. avg. 
642.95 us (0.00 sec); median 1,220,724 cycles, 1x: 1,220,724 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128s-simple_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128s-simple_x5 new file mode 100644 index 0000000..5941b5f --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-128s-simple_x5 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16, way=5, tree height=9, wots_len=35 +Running 10 iterations. +thash avg. 0.50 us (0.00 sec); median 957 cycles, 1x: 957 cycles +f1600x avg. 1.33 us (0.00 sec); median 2,529 cycles, 1x: 2,529 cycles +thashx avg. 1.41 us (0.00 sec); median 2,693 cycles, 1x: 2,693 cycles +Generating keypair.. avg. 82376.53 us (0.08 sec); median 157,680,446 cycles, 1x: 157,680,446 cycles + - WOTS pk gen x (ideal).. avg. 802.45 us (0.00 sec); median 1,526,103 cycles, 102x: 155,662,506 cycles + - WOTS pk gen x (real).. avg. 802.20 us (0.00 sec); median 1,526,266 cycles, 103x: 157,205,398 cycles +Signing.. avg. 638802.01 us (0.64 sec); median 1,222,387,510 cycles, 1x: 1,222,387,510 cycles + - FORS signing.. avg. 61938.21 us (0.06 sec); median 118,542,645 cycles, 1x: 118,542,645 cycles + - WOTS pk gen x (ideal).. avg. 805.33 us (0.00 sec); median 1,526,012 cycles, 716x: 1,092,624,592 cycles + - WOTS pk gen x (real).. avg. 802.32 us (0.00 sec); median 1,525,894 cycles, 721x: 1,100,169,574 cycles +Verifying.. avg. 
775.55 us (0.00 sec); median 1,463,397 cycles, 1x: 1,463,397 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192f-robust_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192f-robust_x3 new file mode 100644 index 0000000..672e067 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192f-robust_x3 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16, way=3, tree height=3, wots_len=51 +Running 10 iterations. +thash avg. 0.99 us (0.00 sec); median 1,884 cycles, 1x: 1,884 cycles +f1600x avg. 0.81 us (0.00 sec); median 1,537 cycles, 1x: 1,537 cycles +thashx avg. 1.69 us (0.00 sec); median 3,225 cycles, 1x: 3,225 cycles +Generating keypair.. avg. 4075.81 us (0.00 sec); median 7,780,874 cycles, 1x: 7,780,874 cycles + - WOTS pk gen x (ideal).. avg. 1359.48 us (0.00 sec); median 2,589,124 cycles, 2x: 5,178,248 cycles + - WOTS pk gen x (real).. avg. 1358.05 us (0.00 sec); median 2,589,156 cycles, 3x: 7,767,468 cycles +Signing.. avg. 105123.15 us (0.11 sec); median 201,231,097 cycles, 1x: 201,231,097 cycles + - FORS signing.. avg. 15682.46 us (0.02 sec); median 29,998,177 cycles, 1x: 29,998,177 cycles + - WOTS pk gen x (ideal).. avg. 1358.44 us (0.00 sec); median 2,588,274 cycles, 58x: 150,119,892 cycles + - WOTS pk gen x (real).. avg. 1359.14 us (0.00 sec); median 2,588,427 cycles, 66x: 170,836,182 cycles +Verifying.. avg. 
5533.24 us (0.01 sec); median 10,551,762 cycles, 1x: 10,551,762 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192f-robust_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192f-robust_x4 new file mode 100644 index 0000000..b30e0bc --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192f-robust_x4 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16, way=4, tree height=3, wots_len=51 +Running 10 iterations. +thash avg. 0.99 us (0.00 sec); median 1,884 cycles, 1x: 1,884 cycles +f1600x avg. 0.85 us (0.00 sec); median 1,605 cycles, 1x: 1,605 cycles +thashx avg. 1.80 us (0.00 sec); median 3,438 cycles, 1x: 3,438 cycles +Generating keypair.. avg. 2904.15 us (0.00 sec); median 5,542,154 cycles, 1x: 5,542,154 cycles + - WOTS pk gen x (ideal).. avg. 1452.51 us (0.00 sec); median 2,766,807 cycles, 2x: 5,533,614 cycles + - WOTS pk gen x (real).. avg. 1451.82 us (0.00 sec); median 2,767,040 cycles, 2x: 5,534,080 cycles +Signing.. avg. 73445.99 us (0.07 sec); median 140,564,698 cycles, 1x: 140,564,698 cycles + - FORS signing.. avg. 9722.88 us (0.01 sec); median 18,597,351 cycles, 1x: 18,597,351 cycles + - WOTS pk gen x (ideal).. avg. 1452.06 us (0.00 sec); median 2,766,254 cycles, 44x: 121,715,176 cycles + - WOTS pk gen x (real).. avg. 1450.09 us (0.00 sec); median 2,766,347 cycles, 44x: 121,719,268 cycles +Verifying.. avg. 
4727.50 us (0.00 sec); median 9,033,983 cycles, 1x: 9,033,983 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192f-robust_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192f-robust_x5 new file mode 100644 index 0000000..6c16dda --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192f-robust_x5 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16, way=5, tree height=3, wots_len=51 +Running 10 iterations. +thash avg. 0.99 us (0.00 sec); median 1,887 cycles, 1x: 1,887 cycles +f1600x avg. 1.33 us (0.00 sec); median 2,533 cycles, 1x: 2,533 cycles +thashx avg. 2.81 us (0.00 sec); median 5,379 cycles, 1x: 5,379 cycles +Generating keypair.. avg. 4543.27 us (0.00 sec); median 8,672,886 cycles, 1x: 8,672,886 cycles + - WOTS pk gen x (ideal).. avg. 2269.51 us (0.00 sec); median 4,333,004 cycles, 1x: 4,333,004 cycles + - WOTS pk gen x (real).. avg. 2264.32 us (0.00 sec); median 4,333,070 cycles, 2x: 8,666,140 cycles +Signing.. avg. 115479.71 us (0.12 sec); median 221,038,377 cycles, 1x: 221,038,377 cycles + - FORS signing.. avg. 15846.60 us (0.02 sec); median 30,296,197 cycles, 1x: 30,296,197 cycles + - WOTS pk gen x (ideal).. avg. 2273.48 us (0.00 sec); median 4,335,641 cycles, 35x: 151,747,435 cycles + - WOTS pk gen x (real).. avg. 2270.67 us (0.00 sec); median 4,334,774 cycles, 44x: 190,730,056 cycles +Verifying.. avg. 
5770.22 us (0.01 sec); median 11,024,572 cycles, 1x: 11,024,572 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192f-simple_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192f-simple_x3 new file mode 100644 index 0000000..aa2344c --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192f-simple_x3 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16, way=3, tree height=3, wots_len=51 +Running 10 iterations. +thash avg. 0.51 us (0.00 sec); median 964 cycles, 1x: 964 cycles +f1600x avg. 0.81 us (0.00 sec); median 1,532 cycles, 1x: 1,532 cycles +thashx avg. 0.86 us (0.00 sec); median 1,642 cycles, 1x: 1,642 cycles +Generating keypair.. avg. 2145.99 us (0.00 sec); median 4,090,808 cycles, 1x: 4,090,808 cycles + - WOTS pk gen x (ideal).. avg. 715.25 us (0.00 sec); median 1,360,306 cycles, 2x: 2,720,612 cycles + - WOTS pk gen x (real).. avg. 717.66 us (0.00 sec); median 1,360,580 cycles, 3x: 4,081,740 cycles +Signing.. avg. 56238.94 us (0.06 sec); median 107,637,916 cycles, 1x: 107,637,916 cycles + - FORS signing.. avg. 9213.10 us (0.01 sec); median 17,624,664 cycles, 1x: 17,624,664 cycles + - WOTS pk gen x (ideal).. avg. 716.45 us (0.00 sec); median 1,360,393 cycles, 58x: 78,902,794 cycles + - WOTS pk gen x (real).. avg. 716.05 us (0.00 sec); median 1,360,382 cycles, 66x: 89,785,212 cycles +Verifying.. avg. 
2928.97 us (0.00 sec); median 5,593,586 cycles, 1x: 5,593,586 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192f-simple_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192f-simple_x4 new file mode 100644 index 0000000..db39a02 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192f-simple_x4 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16, way=4, tree height=3, wots_len=51 +Running 10 iterations. +thash avg. 0.51 us (0.00 sec); median 962 cycles, 1x: 962 cycles +f1600x avg. 0.85 us (0.00 sec); median 1,616 cycles, 1x: 1,616 cycles +thashx avg. 0.92 us (0.00 sec); median 1,751 cycles, 1x: 1,751 cycles +Generating keypair.. avg. 1533.16 us (0.00 sec); median 2,920,117 cycles, 1x: 2,920,117 cycles + - WOTS pk gen x (ideal).. avg. 765.95 us (0.00 sec); median 1,457,527 cycles, 2x: 2,915,054 cycles + - WOTS pk gen x (real).. avg. 766.37 us (0.00 sec); median 1,457,273 cycles, 2x: 2,914,546 cycles +Signing.. avg. 39571.65 us (0.04 sec); median 75,690,099 cycles, 1x: 75,690,099 cycles + - FORS signing.. avg. 5963.57 us (0.01 sec); median 11,402,680 cycles, 1x: 11,402,680 cycles + - WOTS pk gen x (ideal).. avg. 766.98 us (0.00 sec); median 1,457,462 cycles, 44x: 64,128,328 cycles + - WOTS pk gen x (real).. avg. 766.37 us (0.00 sec); median 1,457,378 cycles, 44x: 64,124,632 cycles +Verifying.. avg. 
2403.08 us (0.00 sec); median 4,586,410 cycles, 1x: 4,586,410 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192f-simple_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192f-simple_x5 new file mode 100644 index 0000000..1c81fc5 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192f-simple_x5 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16, way=5, tree height=3, wots_len=51 +Running 10 iterations. +thash avg. 0.51 us (0.00 sec); median 961 cycles, 1x: 961 cycles +f1600x avg. 1.31 us (0.00 sec); median 2,499 cycles, 1x: 2,499 cycles +thashx avg. 1.43 us (0.00 sec); median 2,736 cycles, 1x: 2,736 cycles +Generating keypair.. avg. 2383.98 us (0.00 sec); median 4,549,350 cycles, 1x: 4,549,350 cycles + - WOTS pk gen x (ideal).. avg. 1193.62 us (0.00 sec); median 2,270,445 cycles, 1x: 2,270,445 cycles + - WOTS pk gen x (real).. avg. 1192.30 us (0.00 sec); median 2,270,763 cycles, 2x: 4,541,526 cycles +Signing.. avg. 61560.30 us (0.06 sec); median 117,818,280 cycles, 1x: 117,818,280 cycles + - FORS signing.. avg. 9291.40 us (0.01 sec); median 17,767,933 cycles, 1x: 17,767,933 cycles + - WOTS pk gen x (ideal).. avg. 1189.88 us (0.00 sec); median 2,270,121 cycles, 35x: 79,454,235 cycles + - WOTS pk gen x (real).. avg. 1192.47 us (0.00 sec); median 2,270,147 cycles, 44x: 99,886,468 cycles +Verifying.. avg. 
2973.34 us (0.00 sec); median 5,672,967 cycles, 1x: 5,672,967 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192s-robust_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192s-robust_x3 new file mode 100644 index 0000000..7119bc7 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192s-robust_x3 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16, way=3, tree height=9, wots_len=51 +Running 10 iterations. +thash avg. 0.82 us (0.00 sec); median 1,886 cycles, 1x: 1,886 cycles +f1600x avg. 0.66 us (0.00 sec); median 1,533 cycles, 1x: 1,533 cycles +thashx avg. 1.41 us (0.00 sec); median 3,226 cycles, 1x: 3,226 cycles +Generating keypair.. avg. 191042.84 us (0.19 sec); median 443,712,667 cycles, 1x: 443,712,667 cycles + - WOTS pk gen x (ideal).. avg. 1113.45 us (0.00 sec); median 2,589,015 cycles, 170x: 440,132,550 cycles + - WOTS pk gen x (real).. avg. 1107.26 us (0.00 sec); median 2,589,078 cycles, 171x: 442,732,338 cycles +Signing.. avg. 1818541.33 us (1.82 sec); median 4,092,216,202 cycles, 1x: 4,092,216,202 cycles + - FORS signing.. avg. 500499.07 us (0.50 sec); median 985,452,726 cycles, 1x: 985,452,726 cycles + - WOTS pk gen x (ideal).. avg. 1362.53 us (0.00 sec); median 2,589,970 cycles, 1194x: 3,092,424,180 cycles + - WOTS pk gen x (real).. avg. 1360.38 us (0.00 sec); median 2,589,997 cycles, 1197x: 3,100,226,409 cycles +Verifying.. avg. 
2083.04 us (0.00 sec); median 3,956,235 cycles, 1x: 3,956,235 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192s-robust_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192s-robust_x4 new file mode 100644 index 0000000..1049dc0 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192s-robust_x4 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16, way=4, tree height=9, wots_len=51 +Running 10 iterations. +thash avg. 0.99 us (0.00 sec); median 1,887 cycles, 1x: 1,887 cycles +f1600x avg. 0.84 us (0.00 sec); median 1,605 cycles, 1x: 1,605 cycles +thashx avg. 1.80 us (0.00 sec); median 3,433 cycles, 1x: 3,433 cycles +Generating keypair.. avg. 185263.80 us (0.19 sec); median 354,569,000 cycles, 1x: 354,569,000 cycles + - WOTS pk gen x (ideal).. avg. 1451.68 us (0.00 sec); median 2,764,963 cycles, 128x: 353,915,264 cycles + - WOTS pk gen x (real).. avg. 1451.86 us (0.00 sec); median 2,764,464 cycles, 128x: 353,851,392 cycles +Signing.. avg. 1614381.82 us (1.61 sec); median 3,089,687,530 cycles, 1x: 3,089,687,530 cycles + - FORS signing.. avg. 317257.12 us (0.32 sec); median 607,183,700 cycles, 1x: 607,183,700 cycles + - WOTS pk gen x (ideal).. avg. 1458.31 us (0.00 sec); median 2,766,119 cycles, 896x: 2,478,442,624 cycles + - WOTS pk gen x (real).. avg. 1454.21 us (0.00 sec); median 2,766,159 cycles, 896x: 2,478,478,464 cycles +Verifying.. avg. 
1673.76 us (0.00 sec); median 3,187,198 cycles, 1x: 3,187,198 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192s-robust_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192s-robust_x5 new file mode 100644 index 0000000..04d6adb --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192s-robust_x5 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16, way=5, tree height=9, wots_len=51 +Running 10 iterations. +thash avg. 0.99 us (0.00 sec); median 1,887 cycles, 1x: 1,887 cycles +f1600x avg. 1.33 us (0.00 sec); median 2,531 cycles, 1x: 2,531 cycles +thashx avg. 2.81 us (0.00 sec); median 5,374 cycles, 1x: 5,374 cycles +Generating keypair.. avg. 232853.43 us (0.23 sec); median 445,605,494 cycles, 1x: 445,605,494 cycles + - WOTS pk gen x (ideal).. avg. 2263.04 us (0.00 sec); median 4,320,790 cycles, 102x: 440,720,580 cycles + - WOTS pk gen x (real).. avg. 2266.45 us (0.00 sec); median 4,319,862 cycles, 103x: 444,945,786 cycles +Signing.. avg. 2148336.13 us (2.15 sec); median 4,111,635,523 cycles, 1x: 4,111,635,523 cycles + - FORS signing.. avg. 518501.86 us (0.52 sec); median 992,269,430 cycles, 1x: 992,269,430 cycles + - WOTS pk gen x (ideal).. avg. 2270.94 us (0.00 sec); median 4,321,853 cycles, 716x: 3,094,446,748 cycles + - WOTS pk gen x (real).. avg. 2265.02 us (0.00 sec); median 4,321,044 cycles, 721x: 3,115,472,724 cycles +Verifying.. avg. 
2025.35 us (0.00 sec); median 3,852,679 cycles, 1x: 3,852,679 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192s-simple_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192s-simple_x3 new file mode 100644 index 0000000..e970685 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192s-simple_x3 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16, way=3, tree height=9, wots_len=51 +Running 10 iterations. +thash avg. 0.51 us (0.00 sec); median 962 cycles, 1x: 962 cycles +f1600x avg. 0.81 us (0.00 sec); median 1,531 cycles, 1x: 1,531 cycles +thashx avg. 0.87 us (0.00 sec); median 1,643 cycles, 1x: 1,643 cycles +Generating keypair.. avg. 121963.91 us (0.12 sec); median 233,420,532 cycles, 1x: 233,420,532 cycles + - WOTS pk gen x (ideal).. avg. 717.34 us (0.00 sec); median 1,362,437 cycles, 170x: 231,614,290 cycles + - WOTS pk gen x (real).. avg. 716.57 us (0.00 sec); median 1,362,467 cycles, 171x: 232,981,857 cycles +Signing.. avg. 1156478.90 us (1.16 sec); median 2,213,247,755 cycles, 1x: 2,213,247,755 cycles + - FORS signing.. avg. 302437.99 us (0.30 sec); median 578,663,448 cycles, 1x: 578,663,448 cycles + - WOTS pk gen x (ideal).. avg. 717.46 us (0.00 sec); median 1,362,405 cycles, 1194x: 1,626,711,570 cycles + - WOTS pk gen x (real).. avg. 715.80 us (0.00 sec); median 1,362,365 cycles, 1197x: 1,630,750,905 cycles +Verifying.. avg. 
1039.35 us (0.00 sec); median 1,978,925 cycles, 1x: 1,978,925 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192s-simple_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192s-simple_x4 new file mode 100644 index 0000000..6235a0a --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192s-simple_x4 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16, way=4, tree height=9, wots_len=51 +Running 10 iterations. +thash avg. 0.51 us (0.00 sec); median 963 cycles, 1x: 963 cycles +f1600x avg. 0.85 us (0.00 sec); median 1,616 cycles, 1x: 1,616 cycles +thashx avg. 0.92 us (0.00 sec); median 1,746 cycles, 1x: 1,746 cycles +Generating keypair.. avg. 97299.02 us (0.10 sec); median 186,241,904 cycles, 1x: 186,241,904 cycles + - WOTS pk gen x (ideal).. avg. 765.20 us (0.00 sec); median 1,454,236 cycles, 128x: 186,142,208 cycles + - WOTS pk gen x (real).. avg. 764.62 us (0.00 sec); median 1,454,314 cycles, 128x: 186,152,192 cycles +Signing.. avg. 876347.89 us (0.88 sec); median 1,677,238,885 cycles, 1x: 1,677,238,885 cycles + - FORS signing.. avg. 194992.73 us (0.19 sec); median 373,149,769 cycles, 1x: 373,149,769 cycles + - WOTS pk gen x (ideal).. avg. 768.56 us (0.00 sec); median 1,454,487 cycles, 896x: 1,303,220,352 cycles + - WOTS pk gen x (real).. avg. 765.83 us (0.00 sec); median 1,454,278 cycles, 896x: 1,303,033,088 cycles +Verifying.. avg. 
860.38 us (0.00 sec); median 1,629,268 cycles, 1x: 1,629,268 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192s-simple_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192s-simple_x5 new file mode 100644 index 0000000..4cc19e5 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-192s-simple_x5 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16, way=5, tree height=9, wots_len=51 +Running 10 iterations. +thash avg. 0.51 us (0.00 sec); median 962 cycles, 1x: 962 cycles +f1600x avg. 1.32 us (0.00 sec); median 2,531 cycles, 1x: 2,531 cycles +thashx avg. 1.43 us (0.00 sec); median 2,734 cycles, 1x: 2,734 cycles +Generating keypair.. avg. 122355.84 us (0.12 sec); median 234,207,855 cycles, 1x: 234,207,855 cycles + - WOTS pk gen x (ideal).. avg. 1191.06 us (0.00 sec); median 2,269,429 cycles, 102x: 231,481,758 cycles + - WOTS pk gen x (real).. avg. 1191.36 us (0.00 sec); median 2,269,604 cycles, 103x: 233,769,212 cycles +Signing.. avg. 1042182.57 us (1.04 sec); median 2,220,859,498 cycles, 1x: 2,220,859,498 cycles + - FORS signing.. avg. 246766.49 us (0.25 sec); median 581,422,832 cycles, 1x: 581,422,832 cycles + - WOTS pk gen x (ideal).. avg. 991.77 us (0.00 sec); median 2,269,350 cycles, 716x: 1,624,854,600 cycles + - WOTS pk gen x (real).. avg. 992.86 us (0.00 sec); median 2,269,356 cycles, 721x: 1,636,205,676 cycles +Verifying.. avg. 
841.18 us (0.00 sec); median 1,966,302 cycles, 1x: 1,966,302 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256f-robust_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256f-robust_x3 new file mode 100644 index 0000000..9f1dadd --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256f-robust_x3 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16, way=3, tree height=4, wots_len=67 +Running 10 iterations. +thash avg. 0.99 us (0.00 sec); median 1,875 cycles, 1x: 1,875 cycles +f1600x avg. 0.81 us (0.00 sec); median 1,533 cycles, 1x: 1,533 cycles +thashx avg. 1.70 us (0.00 sec); median 3,251 cycles, 1x: 3,251 cycles +Generating keypair.. avg. 10823.41 us (0.01 sec); median 20,699,906 cycles, 1x: 20,699,906 cycles + - WOTS pk gen x (ideal).. avg. 1806.24 us (0.00 sec); median 3,443,617 cycles, 5x: 17,218,085 cycles + - WOTS pk gen x (real).. avg. 1805.04 us (0.00 sec); median 3,443,122 cycles, 6x: 20,658,732 cycles +Signing.. avg. 217207.54 us (0.22 sec); median 415,512,830 cycles, 1x: 415,512,830 cycles + - FORS signing.. avg. 33232.03 us (0.03 sec); median 63,564,125 cycles, 1x: 63,564,125 cycles + - WOTS pk gen x (ideal).. avg. 1813.04 us (0.00 sec); median 3,444,341 cycles, 90x: 309,990,690 cycles + - WOTS pk gen x (real).. avg. 1807.68 us (0.00 sec); median 3,443,557 cycles, 102x: 351,242,814 cycles +Verifying.. avg. 
5689.88 us (0.01 sec); median 10,864,907 cycles, 1x: 10,864,907 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256f-robust_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256f-robust_x4 new file mode 100644 index 0000000..cb5618c --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256f-robust_x4 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16, way=4, tree height=4, wots_len=67 +Running 10 iterations. +thash avg. 0.99 us (0.00 sec); median 1,873 cycles, 1x: 1,873 cycles +f1600x avg. 0.84 us (0.00 sec); median 1,606 cycles, 1x: 1,606 cycles +thashx avg. 1.81 us (0.00 sec); median 3,461 cycles, 1x: 3,461 cycles +Generating keypair.. avg. 7709.26 us (0.01 sec); median 14,734,865 cycles, 1x: 14,734,865 cycles + - WOTS pk gen x (ideal).. avg. 1928.82 us (0.00 sec); median 3,679,156 cycles, 4x: 14,716,624 cycles + - WOTS pk gen x (real).. avg. 1929.20 us (0.00 sec); median 3,679,202 cycles, 4x: 14,716,808 cycles +Signing.. avg. 151614.60 us (0.15 sec); median 290,186,334 cycles, 1x: 290,186,334 cycles + - FORS signing.. avg. 20740.11 us (0.02 sec); median 39,646,576 cycles, 1x: 39,646,576 cycles + - WOTS pk gen x (ideal).. avg. 1936.50 us (0.00 sec); median 3,680,104 cycles, 68x: 250,247,072 cycles + - WOTS pk gen x (real).. avg. 1929.69 us (0.00 sec); median 3,678,307 cycles, 68x: 250,124,876 cycles +Verifying.. avg. 
4695.02 us (0.00 sec); median 8,959,283 cycles, 1x: 8,959,283 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256f-robust_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256f-robust_x5 new file mode 100644 index 0000000..c80850d --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256f-robust_x5 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16, way=5, tree height=4, wots_len=67 +Running 10 iterations. +thash avg. 0.99 us (0.00 sec); median 1,877 cycles, 1x: 1,877 cycles +f1600x avg. 1.33 us (0.00 sec); median 2,538 cycles, 1x: 2,538 cycles +thashx avg. 2.83 us (0.00 sec); median 5,415 cycles, 1x: 5,415 cycles +Generating keypair.. avg. 12032.49 us (0.01 sec); median 23,019,199 cycles, 1x: 23,019,199 cycles + - WOTS pk gen x (ideal).. avg. 3009.35 us (0.00 sec); median 5,746,036 cycles, 3x: 17,238,108 cycles + - WOTS pk gen x (real).. avg. 3008.31 us (0.00 sec); median 5,745,806 cycles, 4x: 22,983,224 cycles +Signing.. avg. 237926.18 us (0.24 sec); median 455,320,340 cycles, 1x: 455,320,340 cycles + - FORS signing.. avg. 33435.07 us (0.03 sec); median 63,936,537 cycles, 1x: 63,936,537 cycles + - WOTS pk gen x (ideal).. avg. 3021.41 us (0.00 sec); median 5,745,888 cycles, 54x: 310,277,952 cycles + - WOTS pk gen x (real).. avg. 3006.50 us (0.00 sec); median 5,745,190 cycles, 68x: 390,672,920 cycles +Verifying.. avg. 
6033.17 us (0.01 sec); median 11,503,876 cycles, 1x: 11,503,876 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256f-simple_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256f-simple_x3 new file mode 100644 index 0000000..ffdf8f3 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256f-simple_x3 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16, way=3, tree height=4, wots_len=67 +Running 10 iterations. +thash avg. 0.51 us (0.00 sec); median 970 cycles, 1x: 970 cycles +f1600x avg. 0.81 us (0.00 sec); median 1,533 cycles, 1x: 1,533 cycles +thashx avg. 0.88 us (0.00 sec); median 1,666 cycles, 1x: 1,666 cycles +Generating keypair.. avg. 5728.41 us (0.01 sec); median 10,938,467 cycles, 1x: 10,938,467 cycles + - WOTS pk gen x (ideal).. avg. 956.02 us (0.00 sec); median 1,820,756 cycles, 5x: 9,103,780 cycles + - WOTS pk gen x (real).. avg. 957.13 us (0.00 sec); median 1,820,934 cycles, 6x: 10,925,604 cycles +Signing.. avg. 116706.46 us (0.12 sec); median 223,335,039 cycles, 1x: 223,335,039 cycles + - FORS signing.. avg. 19513.30 us (0.02 sec); median 37,323,087 cycles, 1x: 37,323,087 cycles + - WOTS pk gen x (ideal).. avg. 961.79 us (0.00 sec); median 1,820,888 cycles, 90x: 163,879,920 cycles + - WOTS pk gen x (real).. avg. 953.65 us (0.00 sec); median 1,820,880 cycles, 102x: 185,729,760 cycles +Verifying.. avg. 
2980.38 us (0.00 sec); median 5,682,900 cycles, 1x: 5,682,900 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256f-simple_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256f-simple_x4 new file mode 100644 index 0000000..4b046b5 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256f-simple_x4 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16, way=4, tree height=4, wots_len=67 +Running 10 iterations. +thash avg. 0.51 us (0.00 sec); median 970 cycles, 1x: 970 cycles +f1600x avg. 0.84 us (0.00 sec); median 1,605 cycles, 1x: 1,605 cycles +thashx avg. 0.93 us (0.00 sec); median 1,771 cycles, 1x: 1,771 cycles +Generating keypair.. avg. 4084.25 us (0.00 sec); median 7,797,728 cycles, 1x: 7,797,728 cycles + - WOTS pk gen x (ideal).. avg. 1022.33 us (0.00 sec); median 1,942,860 cycles, 4x: 7,771,440 cycles + - WOTS pk gen x (real).. avg. 1019.36 us (0.00 sec); median 1,942,849 cycles, 4x: 7,771,396 cycles +Signing.. avg. 82063.74 us (0.08 sec); median 157,058,567 cycles, 1x: 157,058,567 cycles + - FORS signing.. avg. 12787.61 us (0.01 sec); median 24,465,437 cycles, 1x: 24,465,437 cycles + - WOTS pk gen x (ideal).. avg. 1021.82 us (0.00 sec); median 1,942,864 cycles, 68x: 132,114,752 cycles + - WOTS pk gen x (real).. avg. 1019.81 us (0.00 sec); median 1,942,764 cycles, 68x: 132,107,952 cycles +Verifying.. avg. 
2460.76 us (0.00 sec); median 4,694,459 cycles, 1x: 4,694,459 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256f-simple_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256f-simple_x5 new file mode 100644 index 0000000..7015c31 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256f-simple_x5 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16, way=5, tree height=4, wots_len=67 +Running 10 iterations. +thash avg. 0.51 us (0.00 sec); median 970 cycles, 1x: 970 cycles +f1600x avg. 1.33 us (0.00 sec); median 2,529 cycles, 1x: 2,529 cycles +thashx avg. 1.45 us (0.00 sec); median 2,768 cycles, 1x: 2,768 cycles +Generating keypair.. avg. 6344.93 us (0.01 sec); median 12,127,418 cycles, 1x: 12,127,418 cycles + - WOTS pk gen x (ideal).. avg. 1589.12 us (0.00 sec); median 3,031,476 cycles, 3x: 9,094,428 cycles + - WOTS pk gen x (real).. avg. 1588.79 us (0.00 sec); median 3,029,793 cycles, 4x: 12,119,172 cycles +Signing.. avg. 127375.46 us (0.13 sec); median 243,737,370 cycles, 1x: 243,737,370 cycles + - FORS signing.. avg. 19640.88 us (0.02 sec); median 37,577,965 cycles, 1x: 37,577,965 cycles + - WOTS pk gen x (ideal).. avg. 1592.14 us (0.00 sec); median 3,030,586 cycles, 54x: 163,651,644 cycles + - WOTS pk gen x (real).. avg. 1589.44 us (0.00 sec); median 3,029,190 cycles, 68x: 205,984,920 cycles +Verifying.. avg. 
2965.56 us (0.00 sec); median 5,653,567 cycles, 1x: 5,653,567 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256s-robust_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256s-robust_x3 new file mode 100644 index 0000000..b24cd84 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256s-robust_x3 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16, way=3, tree height=8, wots_len=67 +Running 10 iterations. +thash avg. 0.76 us (0.00 sec); median 1,874 cycles, 1x: 1,874 cycles +f1600x avg. 0.66 us (0.00 sec); median 1,533 cycles, 1x: 1,533 cycles +thashx avg. 1.42 us (0.00 sec); median 3,251 cycles, 1x: 3,251 cycles +Generating keypair.. avg. 127228.01 us (0.13 sec); median 296,600,944 cycles, 1x: 296,600,944 cycles + - WOTS pk gen x (ideal).. avg. 1498.70 us (0.00 sec); median 3,443,904 cycles, 85x: 292,731,840 cycles + - WOTS pk gen x (real).. avg. 1474.71 us (0.00 sec); median 3,443,996 cycles, 86x: 296,183,656 cycles +Signing.. avg. 1589176.28 us (1.59 sec); median 3,650,378,107 cycles, 1x: 3,650,378,107 cycles + - FORS signing.. avg. 617017.90 us (0.62 sec); median 1,277,835,991 cycles, 1x: 1,277,835,991 cycles + - WOTS pk gen x (ideal).. avg. 1720.86 us (0.00 sec); median 3,444,522 cycles, 682x: 2,349,164,004 cycles + - WOTS pk gen x (real).. avg. 1721.11 us (0.00 sec); median 3,445,131 cycles, 688x: 2,370,250,128 cycles +Verifying.. avg. 
2824.32 us (0.00 sec); median 5,660,747 cycles, 1x: 5,660,747 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256s-robust_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256s-robust_x4 new file mode 100644 index 0000000..31f7ac4 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256s-robust_x4 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16, way=4, tree height=8, wots_len=67 +Running 10 iterations. +thash avg. 0.94 us (0.00 sec); median 1,872 cycles, 1x: 1,872 cycles +f1600x avg. 0.80 us (0.00 sec); median 1,604 cycles, 1x: 1,604 cycles +thashx avg. 1.73 us (0.00 sec); median 3,458 cycles, 1x: 3,458 cycles +Generating keypair.. avg. 120025.36 us (0.12 sec); median 236,006,114 cycles, 1x: 236,006,114 cycles + - WOTS pk gen x (ideal).. avg. 1927.76 us (0.00 sec); median 3,676,325 cycles, 64x: 235,284,800 cycles + - WOTS pk gen x (real).. avg. 1926.59 us (0.00 sec); median 3,676,681 cycles, 64x: 235,307,584 cycles +Signing.. avg. 1401494.91 us (1.40 sec); median 2,682,777,035 cycles, 1x: 2,682,777,035 cycles + - FORS signing.. avg. 415043.83 us (0.42 sec); median 794,468,194 cycles, 1x: 794,468,194 cycles + - WOTS pk gen x (ideal).. avg. 1954.81 us (0.00 sec); median 3,685,361 cycles, 512x: 1,886,904,832 cycles + - WOTS pk gen x (real).. avg. 1950.00 us (0.00 sec); median 3,686,588 cycles, 512x: 1,887,533,056 cycles +Verifying.. avg. 
2349.98 us (0.00 sec); median 4,461,731 cycles, 1x: 4,461,731 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256s-robust_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256s-robust_x5 new file mode 100644 index 0000000..b1a992b --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256s-robust_x5 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16, way=5, tree height=8, wots_len=67 +Running 10 iterations. +thash avg. 0.99 us (0.00 sec); median 1,877 cycles, 1x: 1,877 cycles +f1600x avg. 1.31 us (0.00 sec); median 2,492 cycles, 1x: 2,492 cycles +thashx avg. 2.83 us (0.00 sec); median 5,415 cycles, 1x: 5,415 cycles +Generating keypair.. avg. 156275.20 us (0.16 sec); median 299,122,783 cycles, 1x: 299,122,783 cycles + - WOTS pk gen x (ideal).. avg. 3008.43 us (0.00 sec); median 5,745,308 cycles, 51x: 293,010,708 cycles + - WOTS pk gen x (real).. avg. 3007.58 us (0.00 sec); median 5,745,480 cycles, 52x: 298,764,960 cycles +Signing.. avg. 1922214.48 us (1.92 sec); median 3,678,525,836 cycles, 1x: 3,678,525,836 cycles + - FORS signing.. avg. 671750.45 us (0.67 sec); median 1,285,589,353 cycles, 1x: 1,285,589,353 cycles + - WOTS pk gen x (ideal).. avg. 3019.68 us (0.00 sec); median 5,746,814 cycles, 409x: 2,350,446,926 cycles + - WOTS pk gen x (real).. avg. 3003.27 us (0.00 sec); median 5,746,838 cycles, 416x: 2,390,684,608 cycles +Verifying.. avg. 
3035.45 us (0.00 sec); median 5,786,979 cycles, 1x: 5,786,979 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256s-simple_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256s-simple_x3 new file mode 100644 index 0000000..937f9b9 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256s-simple_x3 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16, way=3, tree height=8, wots_len=67 +Running 10 iterations. +thash avg. 0.51 us (0.00 sec); median 971 cycles, 1x: 971 cycles +f1600x avg. 0.81 us (0.00 sec); median 1,533 cycles, 1x: 1,533 cycles +thashx avg. 0.87 us (0.00 sec); median 1,664 cycles, 1x: 1,664 cycles +Generating keypair.. avg. 81928.32 us (0.08 sec); median 156,812,673 cycles, 1x: 156,812,673 cycles + - WOTS pk gen x (ideal).. avg. 955.34 us (0.00 sec); median 1,820,190 cycles, 85x: 154,716,150 cycles + - WOTS pk gen x (real).. avg. 956.54 us (0.00 sec); median 1,820,202 cycles, 86x: 156,537,372 cycles +Signing.. avg. 1048247.07 us (1.05 sec); median 2,006,089,931 cycles, 1x: 2,006,089,931 cycles + - FORS signing.. avg. 392321.73 us (0.39 sec); median 750,773,721 cycles, 1x: 750,773,721 cycles + - WOTS pk gen x (ideal).. avg. 959.18 us (0.00 sec); median 1,820,080 cycles, 682x: 1,241,294,560 cycles + - WOTS pk gen x (real).. avg. 956.06 us (0.00 sec); median 1,820,179 cycles, 688x: 1,252,283,152 cycles +Verifying.. avg. 
1521.22 us (0.00 sec); median 2,890,879 cycles, 1x: 2,890,879 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256s-simple_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256s-simple_x4 new file mode 100644 index 0000000..68e8adb --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256s-simple_x4 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16, way=4, tree height=8, wots_len=67 +Running 10 iterations. +thash avg. 0.51 us (0.00 sec); median 969 cycles, 1x: 969 cycles +f1600x avg. 0.84 us (0.00 sec); median 1,606 cycles, 1x: 1,606 cycles +thashx avg. 0.94 us (0.00 sec); median 1,779 cycles, 1x: 1,779 cycles +Generating keypair.. avg. 65416.77 us (0.07 sec); median 125,208,144 cycles, 1x: 125,208,144 cycles + - WOTS pk gen x (ideal).. avg. 1027.11 us (0.00 sec); median 1,956,262 cycles, 64x: 125,200,768 cycles + - WOTS pk gen x (real).. avg. 1025.84 us (0.00 sec); median 1,955,597 cycles, 64x: 125,158,208 cycles +Signing.. avg. 780307.03 us (0.78 sec); median 1,493,587,914 cycles, 1x: 1,493,587,914 cycles + - FORS signing.. avg. 256741.19 us (0.26 sec); median 491,342,104 cycles, 1x: 491,342,104 cycles + - WOTS pk gen x (ideal).. avg. 1031.26 us (0.00 sec); median 1,954,738 cycles, 512x: 1,000,825,856 cycles + - WOTS pk gen x (real).. avg. 1029.23 us (0.00 sec); median 1,954,790 cycles, 512x: 1,000,852,480 cycles +Verifying.. avg. 
1223.28 us (0.00 sec); median 2,316,740 cycles, 1x: 2,316,740 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256s-simple_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256s-simple_x5 new file mode 100644 index 0000000..fa86e42 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A710/sphincs-shake-256s-simple_x5 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16, way=5, tree height=8, wots_len=67 +Running 10 iterations. +thash avg. 0.51 us (0.00 sec); median 970 cycles, 1x: 970 cycles +f1600x avg. 1.31 us (0.00 sec); median 2,494 cycles, 1x: 2,494 cycles +thashx avg. 1.45 us (0.00 sec); median 2,765 cycles, 1x: 2,765 cycles +Generating keypair.. avg. 82260.70 us (0.08 sec); median 157,463,616 cycles, 1x: 157,463,616 cycles + - WOTS pk gen x (ideal).. avg. 1585.96 us (0.00 sec); median 3,023,180 cycles, 51x: 154,182,180 cycles + - WOTS pk gen x (real).. avg. 1587.98 us (0.00 sec); median 3,023,239 cycles, 52x: 157,208,428 cycles +Signing.. avg. 980261.42 us (0.98 sec); median 2,013,008,519 cycles, 1x: 2,013,008,519 cycles + - FORS signing.. avg. 316218.20 us (0.32 sec); median 752,732,714 cycles, 1x: 752,732,714 cycles + - WOTS pk gen x (ideal).. avg. 1291.11 us (0.00 sec); median 3,024,005 cycles, 409x: 1,236,818,045 cycles + - WOTS pk gen x (real).. avg. 1314.45 us (0.00 sec); median 3,024,092 cycles, 416x: 1,258,022,272 cycles +Verifying.. avg. 
1231.20 us (0.00 sec); median 2,820,829 cycles, 1x: 2,820,829 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128f-robust_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128f-robust_x3 new file mode 100644 index 0000000..e0fa6a3 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128f-robust_x3 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16, way=3, tree height=3, wots_len=35 +Running 10 iterations. +thash avg. 0.93 us (0.00 sec); median 1,888 cycles, 1x: 1,888 cycles +f1600x avg. 1.19 us (0.00 sec); median 2,250 cycles, 1x: 2,250 cycles +thashx avg. 2.47 us (0.00 sec); median 4,636 cycles, 1x: 4,636 cycles +Generating keypair.. avg. 3951.86 us (0.00 sec); median 7,636,350 cycles, 1x: 7,636,350 cycles + - WOTS pk gen x (ideal).. avg. 1338.31 us (0.00 sec); median 2,541,421 cycles, 2x: 5,082,842 cycles + - WOTS pk gen x (real).. avg. 1329.35 us (0.00 sec); median 2,541,665 cycles, 3x: 7,624,995 cycles +Signing.. avg. 92029.61 us (0.09 sec); median 177,098,304 cycles, 1x: 177,098,304 cycles + - FORS signing.. avg. 4550.58 us (0.00 sec); median 9,061,507 cycles, 1x: 9,061,507 cycles + - WOTS pk gen x (ideal).. avg. 1283.08 us (0.00 sec); median 2,541,129 cycles, 58x: 147,385,482 cycles + - WOTS pk gen x (real).. avg. 1278.97 us (0.00 sec); median 2,541,198 cycles, 66x: 167,719,068 cycles +Verifying.. avg. 
5245.47 us (0.01 sec); median 10,492,115 cycles, 1x: 10,492,115 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128f-robust_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128f-robust_x4 new file mode 100644 index 0000000..2737b7a --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128f-robust_x4 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16, way=4, tree height=3, wots_len=35 +Running 10 iterations. +thash avg. 0.93 us (0.00 sec); median 1,887 cycles, 1x: 1,887 cycles +f1600x avg. 1.21 us (0.00 sec); median 2,203 cycles, 1x: 2,203 cycles +thashx avg. 2.44 us (0.00 sec); median 4,571 cycles, 1x: 4,571 cycles +Generating keypair.. avg. 2696.88 us (0.00 sec); median 5,043,285 cycles, 1x: 5,043,285 cycles + - WOTS pk gen x (ideal).. avg. 1330.50 us (0.00 sec); median 2,513,413 cycles, 2x: 5,026,826 cycles + - WOTS pk gen x (real).. avg. 1336.81 us (0.00 sec); median 2,515,846 cycles, 2x: 5,031,692 cycles +Signing.. avg. 63404.76 us (0.06 sec); median 117,280,323 cycles, 1x: 117,280,323 cycles + - FORS signing.. avg. 3493.68 us (0.00 sec); median 6,280,912 cycles, 1x: 6,280,912 cycles + - WOTS pk gen x (ideal).. avg. 1390.40 us (0.00 sec); median 2,513,285 cycles, 44x: 110,584,540 cycles + - WOTS pk gen x (real).. avg. 1411.84 us (0.00 sec); median 2,513,744 cycles, 44x: 110,604,736 cycles +Verifying.. avg. 
4407.73 us (0.00 sec); median 7,948,919 cycles, 1x: 7,948,919 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128f-robust_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128f-robust_x5 new file mode 100644 index 0000000..3736e8d --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128f-robust_x5 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16, way=5, tree height=3, wots_len=35 +Running 10 iterations. +thash avg. 0.92 us (0.00 sec); median 1,880 cycles, 1x: 1,880 cycles +f1600x avg. 1.28 us (0.00 sec); median 2,196 cycles, 1x: 2,196 cycles +thashx avg. 2.62 us (0.00 sec); median 4,605 cycles, 1x: 4,605 cycles +Generating keypair.. avg. 2907.28 us (0.00 sec); median 5,086,794 cycles, 1x: 5,086,794 cycles + - WOTS pk gen x (ideal).. avg. 1452.72 us (0.00 sec); median 2,536,498 cycles, 1x: 2,536,498 cycles + - WOTS pk gen x (real).. avg. 1460.41 us (0.00 sec); median 2,535,315 cycles, 2x: 5,070,630 cycles +Signing.. avg. 69453.69 us (0.07 sec); median 119,000,890 cycles, 1x: 119,000,890 cycles + - FORS signing.. avg. 3886.51 us (0.00 sec); median 7,080,299 cycles, 1x: 7,080,299 cycles + - WOTS pk gen x (ideal).. avg. 1544.49 us (0.00 sec); median 2,534,379 cycles, 35x: 88,703,265 cycles + - WOTS pk gen x (real).. avg. 1535.83 us (0.00 sec); median 2,533,778 cycles, 44x: 111,486,232 cycles +Verifying.. avg. 
3830.27 us (0.00 sec); median 6,691,393 cycles, 1x: 6,691,393 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128f-simple_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128f-simple_x3 new file mode 100644 index 0000000..7b27a8f --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128f-simple_x3 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16, way=3, tree height=3, wots_len=35 +Running 10 iterations. +thash avg. 0.42 us (0.00 sec); median 963 cycles, 1x: 963 cycles +f1600x avg. 1.09 us (0.00 sec); median 2,253 cycles, 1x: 2,253 cycles +thashx avg. 1.15 us (0.00 sec); median 2,317 cycles, 1x: 2,317 cycles +Generating keypair.. avg. 1960.30 us (0.00 sec); median 3,944,044 cycles, 1x: 3,944,044 cycles + - WOTS pk gen x (ideal).. avg. 662.90 us (0.00 sec); median 1,311,325 cycles, 2x: 2,622,650 cycles + - WOTS pk gen x (real).. avg. 656.04 us (0.00 sec); median 1,311,257 cycles, 3x: 3,933,771 cycles +Signing.. avg. 46671.74 us (0.05 sec); median 92,209,518 cycles, 1x: 92,209,518 cycles + - FORS signing.. avg. 2742.64 us (0.00 sec); median 5,425,682 cycles, 1x: 5,425,682 cycles + - WOTS pk gen x (ideal).. avg. 671.10 us (0.00 sec); median 1,311,368 cycles, 58x: 76,059,344 cycles + - WOTS pk gen x (real).. avg. 690.83 us (0.00 sec); median 1,311,317 cycles, 66x: 86,546,922 cycles +Verifying.. avg. 
2760.27 us (0.00 sec); median 5,284,703 cycles, 1x: 5,284,703 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128f-simple_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128f-simple_x4 new file mode 100644 index 0000000..bb587c0 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128f-simple_x4 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16, way=4, tree height=3, wots_len=35 +Running 10 iterations. +thash avg. 0.49 us (0.00 sec); median 961 cycles, 1x: 961 cycles +f1600x avg. 1.25 us (0.00 sec); median 2,195 cycles, 1x: 2,195 cycles +thashx avg. 1.31 us (0.00 sec); median 2,305 cycles, 1x: 2,305 cycles +Generating keypair.. avg. 1466.82 us (0.00 sec); median 2,615,746 cycles, 1x: 2,615,746 cycles + - WOTS pk gen x (ideal).. avg. 731.34 us (0.00 sec); median 1,307,560 cycles, 2x: 2,615,120 cycles + - WOTS pk gen x (real).. avg. 736.13 us (0.00 sec); median 1,307,617 cycles, 2x: 2,615,234 cycles +Signing.. avg. 33922.21 us (0.03 sec); median 61,367,020 cycles, 1x: 61,367,020 cycles + - FORS signing.. avg. 2111.43 us (0.00 sec); median 3,789,643 cycles, 1x: 3,789,643 cycles + - WOTS pk gen x (ideal).. avg. 729.57 us (0.00 sec); median 1,307,019 cycles, 44x: 57,508,836 cycles + - WOTS pk gen x (real).. avg. 721.09 us (0.00 sec); median 1,307,071 cycles, 44x: 57,511,124 cycles +Verifying.. avg. 
2266.20 us (0.00 sec); median 4,130,357 cycles, 1x: 4,130,357 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128f-simple_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128f-simple_x5 new file mode 100644 index 0000000..d33fa35 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128f-simple_x5 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16, way=5, tree height=3, wots_len=35 +Running 10 iterations. +thash avg. 0.49 us (0.00 sec); median 961 cycles, 1x: 961 cycles +f1600x avg. 1.40 us (0.00 sec); median 2,191 cycles, 1x: 2,191 cycles +thashx avg. 1.43 us (0.00 sec); median 2,305 cycles, 1x: 2,305 cycles +Generating keypair.. avg. 1624.05 us (0.00 sec); median 2,648,046 cycles, 1x: 2,648,046 cycles + - WOTS pk gen x (ideal).. avg. 811.58 us (0.00 sec); median 1,343,048 cycles, 1x: 1,343,048 cycles + - WOTS pk gen x (real).. avg. 791.48 us (0.00 sec); median 1,313,381 cycles, 2x: 2,626,762 cycles +Signing.. avg. 36186.61 us (0.04 sec); median 62,279,546 cycles, 1x: 62,279,546 cycles + - FORS signing.. avg. 2210.76 us (0.00 sec); median 4,128,088 cycles, 1x: 4,128,088 cycles + - WOTS pk gen x (ideal).. avg. 756.76 us (0.00 sec); median 1,315,161 cycles, 35x: 46,030,635 cycles + - WOTS pk gen x (real).. avg. 751.30 us (0.00 sec); median 1,322,580 cycles, 44x: 58,193,520 cycles +Verifying.. avg. 
1938.16 us (0.00 sec); median 3,464,867 cycles, 1x: 3,464,867 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128s-robust_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128s-robust_x3 new file mode 100644 index 0000000..7198cb3 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128s-robust_x3 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16, way=3, tree height=9, wots_len=35 +Running 10 iterations. +thash avg. 0.90 us (0.00 sec); median 1,885 cycles, 1x: 1,885 cycles +f1600x avg. 1.16 us (0.00 sec); median 2,253 cycles, 1x: 2,253 cycles +thashx avg. 2.39 us (0.00 sec); median 4,622 cycles, 1x: 4,622 cycles +Generating keypair.. avg. 221295.40 us (0.22 sec); median 434,355,823 cycles, 1x: 434,355,823 cycles + - WOTS pk gen x (ideal).. avg. 1318.16 us (0.00 sec); median 2,536,351 cycles, 170x: 431,179,670 cycles + - WOTS pk gen x (real).. avg. 1342.64 us (0.00 sec); median 2,534,774 cycles, 171x: 433,446,354 cycles +Signing.. avg. 1653502.11 us (1.65 sec); median 3,283,145,256 cycles, 1x: 3,283,145,256 cycles + - FORS signing.. avg. 122252.19 us (0.12 sec); median 242,213,588 cycles, 1x: 242,213,588 cycles + - WOTS pk gen x (ideal).. avg. 1313.79 us (0.00 sec); median 2,536,385 cycles, 1194x: 3,028,443,690 cycles + - WOTS pk gen x (real).. avg. 1280.45 us (0.00 sec); median 2,536,049 cycles, 1197x: 3,035,650,653 cycles +Verifying.. avg. 
1807.69 us (0.00 sec); median 3,535,233 cycles, 1x: 3,535,233 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128s-robust_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128s-robust_x4 new file mode 100644 index 0000000..1ce5b83 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128s-robust_x4 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16, way=4, tree height=9, wots_len=35 +Running 10 iterations. +thash avg. 0.97 us (0.00 sec); median 1,886 cycles, 1x: 1,886 cycles +f1600x avg. 1.24 us (0.00 sec); median 2,200 cycles, 1x: 2,200 cycles +thashx avg. 2.57 us (0.00 sec); median 4,578 cycles, 1x: 4,578 cycles +Generating keypair.. avg. 171005.82 us (0.17 sec); median 322,794,425 cycles, 1x: 322,794,425 cycles + - WOTS pk gen x (ideal).. avg. 1310.86 us (0.00 sec); median 2,515,507 cycles, 128x: 321,984,896 cycles + - WOTS pk gen x (real).. avg. 1318.18 us (0.00 sec); median 2,514,182 cycles, 128x: 321,815,296 cycles +Signing.. avg. 1298499.40 us (1.30 sec); median 2,425,297,722 cycles, 1x: 2,425,297,722 cycles + - FORS signing.. avg. 89221.50 us (0.09 sec); median 165,333,679 cycles, 1x: 165,333,679 cycles + - WOTS pk gen x (ideal).. avg. 1379.95 us (0.00 sec); median 2,515,371 cycles, 896x: 2,253,772,416 cycles + - WOTS pk gen x (real).. avg. 1376.14 us (0.00 sec); median 2,515,747 cycles, 896x: 2,254,109,312 cycles +Verifying.. avg. 
1596.34 us (0.00 sec); median 3,015,446 cycles, 1x: 3,015,446 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128s-robust_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128s-robust_x5 new file mode 100644 index 0000000..130413b --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128s-robust_x5 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16, way=5, tree height=9, wots_len=35 +Running 10 iterations. +thash avg. 0.95 us (0.00 sec); median 1,883 cycles, 1x: 1,883 cycles +f1600x avg. 1.29 us (0.00 sec); median 2,205 cycles, 1x: 2,205 cycles +thashx avg. 2.61 us (0.00 sec); median 4,618 cycles, 1x: 4,618 cycles +Generating keypair.. avg. 148443.63 us (0.15 sec); median 262,263,906 cycles, 1x: 262,263,906 cycles + - WOTS pk gen x (ideal).. avg. 1368.73 us (0.00 sec); median 2,533,496 cycles, 102x: 258,416,592 cycles + - WOTS pk gen x (real).. avg. 1357.51 us (0.00 sec); median 2,533,452 cycles, 103x: 260,945,556 cycles +Signing.. avg. 1116931.68 us (1.12 sec); median 2,029,132,627 cycles, 1x: 2,029,132,627 cycles + - FORS signing.. avg. 98716.49 us (0.10 sec); median 191,798,685 cycles, 1x: 191,798,685 cycles + - WOTS pk gen x (ideal).. avg. 1411.95 us (0.00 sec); median 2,533,726 cycles, 716x: 1,814,147,816 cycles + - WOTS pk gen x (real).. avg. 1416.91 us (0.00 sec); median 2,534,966 cycles, 721x: 1,827,710,486 cycles +Verifying.. avg. 
1374.01 us (0.00 sec); median 2,534,354 cycles, 1x: 2,534,354 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128s-simple_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128s-simple_x3 new file mode 100644 index 0000000..297b28f --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128s-simple_x3 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16, way=3, tree height=9, wots_len=35 +Running 10 iterations. +thash avg. 0.48 us (0.00 sec); median 961 cycles, 1x: 961 cycles +f1600x avg. 1.15 us (0.00 sec); median 2,255 cycles, 1x: 2,255 cycles +thashx avg. 1.20 us (0.00 sec); median 2,321 cycles, 1x: 2,321 cycles +Generating keypair.. avg. 116772.28 us (0.12 sec); median 225,048,111 cycles, 1x: 225,048,111 cycles + - WOTS pk gen x (ideal).. avg. 673.29 us (0.00 sec); median 1,314,402 cycles, 170x: 223,448,340 cycles + - WOTS pk gen x (real).. avg. 662.00 us (0.00 sec); median 1,314,541 cycles, 171x: 224,786,511 cycles +Signing.. avg. 881260.24 us (0.88 sec); median 1,720,108,298 cycles, 1x: 1,720,108,298 cycles + - FORS signing.. avg. 73446.57 us (0.07 sec); median 144,549,317 cycles, 1x: 144,549,317 cycles + - WOTS pk gen x (ideal).. avg. 660.09 us (0.00 sec); median 1,314,955 cycles, 1194x: 1,570,056,270 cycles + - WOTS pk gen x (real).. avg. 664.93 us (0.00 sec); median 1,314,890 cycles, 1197x: 1,573,923,330 cycles +Verifying.. avg. 
872.30 us (0.00 sec); median 1,738,753 cycles, 1x: 1,738,753 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128s-simple_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128s-simple_x4 new file mode 100644 index 0000000..670909d --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128s-simple_x4 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16, way=4, tree height=9, wots_len=35 +Running 10 iterations. +thash avg. 0.49 us (0.00 sec); median 961 cycles, 1x: 961 cycles +f1600x avg. 1.25 us (0.00 sec); median 2,197 cycles, 1x: 2,197 cycles +thashx avg. 1.26 us (0.00 sec); median 2,303 cycles, 1x: 2,303 cycles +Generating keypair.. avg. 88157.95 us (0.09 sec); median 167,524,380 cycles, 1x: 167,524,380 cycles + - WOTS pk gen x (ideal).. avg. 698.31 us (0.00 sec); median 1,306,630 cycles, 128x: 167,248,640 cycles + - WOTS pk gen x (real).. avg. 704.47 us (0.00 sec); median 1,306,553 cycles, 128x: 167,238,784 cycles +Signing.. avg. 681565.71 us (0.68 sec); median 1,272,963,693 cycles, 1x: 1,272,963,693 cycles + - FORS signing.. avg. 53310.13 us (0.05 sec); median 99,821,463 cycles, 1x: 99,821,463 cycles + - WOTS pk gen x (ideal).. avg. 685.37 us (0.00 sec); median 1,307,160 cycles, 896x: 1,171,215,360 cycles + - WOTS pk gen x (real).. avg. 670.41 us (0.00 sec); median 1,306,246 cycles, 896x: 1,170,396,416 cycles +Verifying.. avg. 
725.56 us (0.00 sec); median 1,413,653 cycles, 1x: 1,413,653 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128s-simple_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128s-simple_x5 new file mode 100644 index 0000000..46a4117 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-128s-simple_x5 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16, way=5, tree height=9, wots_len=35 +Running 10 iterations. +thash avg. 0.45 us (0.00 sec); median 965 cycles, 1x: 965 cycles +f1600x avg. 1.22 us (0.00 sec); median 2,194 cycles, 1x: 2,194 cycles +thashx avg. 1.27 us (0.00 sec); median 2,324 cycles, 1x: 2,324 cycles +Generating keypair.. avg. 75865.74 us (0.08 sec); median 136,590,464 cycles, 1x: 136,590,464 cycles + - WOTS pk gen x (ideal).. avg. 769.75 us (0.00 sec); median 1,357,342 cycles, 102x: 138,448,884 cycles + - WOTS pk gen x (real).. avg. 744.09 us (0.00 sec); median 1,318,291 cycles, 103x: 135,783,973 cycles +Signing.. avg. 593317.65 us (0.59 sec); median 1,068,233,662 cycles, 1x: 1,068,233,662 cycles + - FORS signing.. avg. 58870.50 us (0.06 sec); median 111,393,327 cycles, 1x: 111,393,327 cycles + - WOTS pk gen x (ideal).. avg. 736.61 us (0.00 sec); median 1,324,408 cycles, 716x: 948,276,128 cycles + - WOTS pk gen x (real).. avg. 745.65 us (0.00 sec); median 1,318,085 cycles, 721x: 950,339,285 cycles +Verifying.. avg. 
626.64 us (0.00 sec); median 1,159,319 cycles, 1x: 1,159,319 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192f-robust_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192f-robust_x3 new file mode 100644 index 0000000..a4d01ed --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192f-robust_x3 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16, way=3, tree height=3, wots_len=51 +Running 10 iterations. +thash avg. 0.93 us (0.00 sec); median 1,904 cycles, 1x: 1,904 cycles +f1600x avg. 1.14 us (0.00 sec); median 2,253 cycles, 1x: 2,253 cycles +thashx avg. 2.34 us (0.00 sec); median 4,662 cycles, 1x: 4,662 cycles +Generating keypair.. avg. 5628.80 us (0.01 sec); median 11,215,881 cycles, 1x: 11,215,881 cycles + - WOTS pk gen x (ideal).. avg. 1878.90 us (0.00 sec); median 3,734,017 cycles, 2x: 7,468,034 cycles + - WOTS pk gen x (real).. avg. 1859.64 us (0.00 sec); median 3,733,834 cycles, 3x: 11,201,502 cycles +Signing.. avg. 142990.97 us (0.14 sec); median 282,923,676 cycles, 1x: 282,923,676 cycles + - FORS signing.. avg. 18086.98 us (0.02 sec); median 36,165,818 cycles, 1x: 36,165,818 cycles + - WOTS pk gen x (ideal).. avg. 1891.33 us (0.00 sec); median 3,734,148 cycles, 58x: 216,580,584 cycles + - WOTS pk gen x (real).. avg. 1896.78 us (0.00 sec); median 3,734,803 cycles, 66x: 246,496,998 cycles +Verifying.. avg. 
7548.41 us (0.01 sec); median 14,688,491 cycles, 1x: 14,688,491 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192f-robust_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192f-robust_x4 new file mode 100644 index 0000000..e253737 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192f-robust_x4 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16, way=4, tree height=3, wots_len=51 +Running 10 iterations. +thash avg. 0.96 us (0.00 sec); median 1,905 cycles, 1x: 1,905 cycles +f1600x avg. 1.24 us (0.00 sec); median 2,200 cycles, 1x: 2,200 cycles +thashx avg. 2.48 us (0.00 sec); median 4,613 cycles, 1x: 4,613 cycles +Generating keypair.. avg. 3940.84 us (0.00 sec); median 7,421,249 cycles, 1x: 7,421,249 cycles + - WOTS pk gen x (ideal).. avg. 1967.33 us (0.00 sec); median 3,705,614 cycles, 2x: 7,411,228 cycles + - WOTS pk gen x (real).. avg. 2008.32 us (0.00 sec); median 3,701,200 cycles, 2x: 7,402,400 cycles +Signing.. avg. 101188.39 us (0.10 sec); median 188,148,553 cycles, 1x: 188,148,553 cycles + - FORS signing.. avg. 13328.94 us (0.01 sec); median 24,804,472 cycles, 1x: 24,804,472 cycles + - WOTS pk gen x (ideal).. avg. 1985.84 us (0.00 sec); median 3,702,173 cycles, 44x: 162,895,612 cycles + - WOTS pk gen x (real).. avg. 2048.31 us (0.00 sec); median 3,701,748 cycles, 44x: 162,876,912 cycles +Verifying.. avg. 
6369.53 us (0.01 sec); median 11,630,103 cycles, 1x: 11,630,103 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192f-robust_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192f-robust_x5 new file mode 100644 index 0000000..27b4a32 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192f-robust_x5 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16, way=5, tree height=3, wots_len=51 +Running 10 iterations. +thash avg. 1.06 us (0.00 sec); median 1,904 cycles, 1x: 1,904 cycles +f1600x avg. 1.28 us (0.00 sec); median 2,194 cycles, 1x: 2,194 cycles +thashx avg. 2.66 us (0.00 sec); median 4,673 cycles, 1x: 4,673 cycles +Generating keypair.. avg. 4196.76 us (0.00 sec); median 7,530,656 cycles, 1x: 7,530,656 cycles + - WOTS pk gen x (ideal).. avg. 2074.62 us (0.00 sec); median 3,760,437 cycles, 1x: 3,760,437 cycles + - WOTS pk gen x (real).. avg. 2072.66 us (0.00 sec); median 3,761,499 cycles, 2x: 7,522,998 cycles +Signing.. avg. 105195.62 us (0.11 sec); median 194,523,214 cycles, 1x: 194,523,214 cycles + - FORS signing.. avg. 14898.87 us (0.01 sec); median 28,734,737 cycles, 1x: 28,734,737 cycles + - WOTS pk gen x (ideal).. avg. 2090.24 us (0.00 sec); median 3,773,625 cycles, 35x: 132,076,875 cycles + - WOTS pk gen x (real).. avg. 2111.95 us (0.00 sec); median 3,764,704 cycles, 44x: 165,646,976 cycles +Verifying.. avg. 
5410.87 us (0.01 sec); median 9,783,896 cycles, 1x: 9,783,896 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192f-simple_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192f-simple_x3 new file mode 100644 index 0000000..cf81eef --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192f-simple_x3 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16, way=3, tree height=3, wots_len=51 +Running 10 iterations. +thash avg. 0.44 us (0.00 sec); median 971 cycles, 1x: 971 cycles +f1600x avg. 1.13 us (0.00 sec); median 2,249 cycles, 1x: 2,249 cycles +thashx avg. 1.19 us (0.00 sec); median 2,348 cycles, 1x: 2,348 cycles +Generating keypair.. avg. 2954.06 us (0.00 sec); median 5,845,773 cycles, 1x: 5,845,773 cycles + - WOTS pk gen x (ideal).. avg. 989.70 us (0.00 sec); median 1,945,049 cycles, 2x: 3,890,098 cycles + - WOTS pk gen x (real).. avg. 970.36 us (0.00 sec); median 1,944,270 cycles, 3x: 5,832,810 cycles +Signing.. avg. 74019.61 us (0.07 sec); median 150,218,470 cycles, 1x: 150,218,470 cycles + - FORS signing.. avg. 10761.67 us (0.01 sec); median 21,630,167 cycles, 1x: 21,630,167 cycles + - WOTS pk gen x (ideal).. avg. 969.58 us (0.00 sec); median 1,945,167 cycles, 58x: 112,819,686 cycles + - WOTS pk gen x (real).. avg. 979.35 us (0.00 sec); median 1,944,844 cycles, 66x: 128,359,704 cycles +Verifying.. avg. 
3819.09 us (0.00 sec); median 7,528,211 cycles, 1x: 7,528,211 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192f-simple_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192f-simple_x4 new file mode 100644 index 0000000..ff6f0d3 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192f-simple_x4 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16, way=4, tree height=3, wots_len=51 +Running 10 iterations. +thash avg. 0.48 us (0.00 sec); median 969 cycles, 1x: 969 cycles +f1600x avg. 1.23 us (0.00 sec); median 2,207 cycles, 1x: 2,207 cycles +thashx avg. 1.28 us (0.00 sec); median 2,338 cycles, 1x: 2,338 cycles +Generating keypair.. avg. 2117.81 us (0.00 sec); median 3,902,046 cycles, 1x: 3,902,046 cycles + - WOTS pk gen x (ideal).. avg. 1042.62 us (0.00 sec); median 1,939,460 cycles, 2x: 3,878,920 cycles + - WOTS pk gen x (real).. avg. 1026.68 us (0.00 sec); median 1,938,667 cycles, 2x: 3,877,334 cycles +Signing.. avg. 53924.28 us (0.05 sec); median 100,732,913 cycles, 1x: 100,732,913 cycles + - FORS signing.. avg. 8115.23 us (0.01 sec); median 15,076,625 cycles, 1x: 15,076,625 cycles + - WOTS pk gen x (ideal).. avg. 1050.96 us (0.00 sec); median 1,941,221 cycles, 44x: 85,413,724 cycles + - WOTS pk gen x (real).. avg. 1069.91 us (0.00 sec); median 1,940,283 cycles, 44x: 85,372,452 cycles +Verifying.. avg. 
3143.90 us (0.00 sec); median 5,933,917 cycles, 1x: 5,933,917 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192f-simple_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192f-simple_x5 new file mode 100644 index 0000000..2a56147 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192f-simple_x5 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16, way=5, tree height=3, wots_len=51 +Running 10 iterations. +thash avg. 0.47 us (0.00 sec); median 969 cycles, 1x: 969 cycles +f1600x avg. 1.28 us (0.00 sec); median 2,191 cycles, 1x: 2,191 cycles +thashx avg. 1.32 us (0.00 sec); median 2,357 cycles, 1x: 2,357 cycles +Generating keypair.. avg. 2183.29 us (0.00 sec); median 3,918,880 cycles, 1x: 3,918,880 cycles + - WOTS pk gen x (ideal).. avg. 1112.30 us (0.00 sec); median 1,962,416 cycles, 1x: 1,962,416 cycles + - WOTS pk gen x (real).. avg. 1120.06 us (0.00 sec); median 1,954,469 cycles, 2x: 3,908,938 cycles +Signing.. avg. 57661.59 us (0.06 sec); median 102,968,495 cycles, 1x: 102,968,495 cycles + - FORS signing.. avg. 8757.84 us (0.01 sec); median 16,747,934 cycles, 1x: 16,747,934 cycles + - WOTS pk gen x (ideal).. avg. 1079.98 us (0.00 sec); median 1,951,800 cycles, 35x: 68,313,000 cycles + - WOTS pk gen x (real).. avg. 1081.64 us (0.00 sec); median 1,952,689 cycles, 44x: 85,918,316 cycles +Verifying.. avg. 
2709.40 us (0.00 sec); median 4,999,203 cycles, 1x: 4,999,203 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192s-robust_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192s-robust_x3 new file mode 100644 index 0000000..36a4f41 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192s-robust_x3 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16, way=3, tree height=9, wots_len=51 +Running 10 iterations. +thash avg. 0.93 us (0.00 sec); median 1,905 cycles, 1x: 1,905 cycles +f1600x avg. 1.17 us (0.00 sec); median 2,254 cycles, 1x: 2,254 cycles +thashx avg. 2.40 us (0.00 sec); median 4,654 cycles, 1x: 4,654 cycles +Generating keypair.. avg. 319968.50 us (0.32 sec); median 638,779,779 cycles, 1x: 638,779,779 cycles + - WOTS pk gen x (ideal).. avg. 1857.93 us (0.00 sec); median 3,730,165 cycles, 170x: 634,128,050 cycles + - WOTS pk gen x (real).. avg. 1847.43 us (0.00 sec); median 3,730,637 cycles, 171x: 637,938,927 cycles +Signing.. avg. 2848040.66 us (2.85 sec); median 5,659,693,885 cycles, 1x: 5,659,693,885 cycles + - FORS signing.. avg. 596565.34 us (0.60 sec); median 1,187,669,402 cycles, 1x: 1,187,669,402 cycles + - WOTS pk gen x (ideal).. avg. 1883.07 us (0.00 sec); median 3,731,458 cycles, 1194x: 4,455,360,852 cycles + - WOTS pk gen x (real).. avg. 1910.39 us (0.00 sec); median 3,731,331 cycles, 1197x: 4,466,403,207 cycles +Verifying.. avg. 
2551.62 us (0.00 sec); median 5,043,652 cycles, 1x: 5,043,652 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192s-robust_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192s-robust_x4 new file mode 100644 index 0000000..c7c62d7 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192s-robust_x4 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16, way=4, tree height=9, wots_len=51 +Running 10 iterations. +thash avg. 0.95 us (0.00 sec); median 1,905 cycles, 1x: 1,905 cycles +f1600x avg. 1.22 us (0.00 sec); median 2,197 cycles, 1x: 2,197 cycles +thashx avg. 2.54 us (0.00 sec); median 4,607 cycles, 1x: 4,607 cycles +Generating keypair.. avg. 257145.66 us (0.26 sec); median 474,539,997 cycles, 1x: 474,539,997 cycles + - WOTS pk gen x (ideal).. avg. 1902.49 us (0.00 sec); median 3,702,987 cycles, 128x: 473,982,336 cycles + - WOTS pk gen x (real).. avg. 1888.77 us (0.00 sec); median 3,701,279 cycles, 128x: 473,763,712 cycles +Signing.. avg. 2193445.68 us (2.19 sec); median 4,132,642,101 cycles, 1x: 4,132,642,101 cycles + - FORS signing.. avg. 430377.45 us (0.43 sec); median 811,126,752 cycles, 1x: 811,126,752 cycles + - WOTS pk gen x (ideal).. avg. 1960.28 us (0.00 sec); median 3,701,003 cycles, 896x: 3,316,098,688 cycles + - WOTS pk gen x (real).. avg. 1952.78 us (0.00 sec); median 3,701,500 cycles, 896x: 3,316,544,000 cycles +Verifying.. avg. 
2199.70 us (0.00 sec); median 4,220,753 cycles, 1x: 4,220,753 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192s-robust_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192s-robust_x5 new file mode 100644 index 0000000..7d45b00 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192s-robust_x5 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16, way=5, tree height=9, wots_len=51 +Running 10 iterations. +thash avg. 0.94 us (0.00 sec); median 1,905 cycles, 1x: 1,905 cycles +f1600x avg. 1.28 us (0.00 sec); median 2,192 cycles, 1x: 2,192 cycles +thashx avg. 2.64 us (0.00 sec); median 4,672 cycles, 1x: 4,672 cycles +Generating keypair.. avg. 217089.45 us (0.22 sec); median 386,957,317 cycles, 1x: 386,957,317 cycles + - WOTS pk gen x (ideal).. avg. 2118.30 us (0.00 sec); median 3,746,516 cycles, 102x: 382,144,632 cycles + - WOTS pk gen x (real).. avg. 2102.29 us (0.00 sec); median 3,746,658 cycles, 103x: 385,905,774 cycles +Signing.. avg. 1992180.36 us (1.99 sec); median 3,653,012,294 cycles, 1x: 3,653,012,294 cycles + - FORS signing.. avg. 487258.27 us (0.49 sec); median 944,749,419 cycles, 1x: 944,749,419 cycles + - WOTS pk gen x (ideal).. avg. 2111.04 us (0.00 sec); median 3,750,189 cycles, 716x: 2,685,135,324 cycles + - WOTS pk gen x (real).. avg. 2115.19 us (0.00 sec); median 3,749,037 cycles, 721x: 2,703,055,677 cycles +Verifying.. avg. 
1894.68 us (0.00 sec); median 3,470,543 cycles, 1x: 3,470,543 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192s-simple_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192s-simple_x3 new file mode 100644 index 0000000..b0f3845 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192s-simple_x3 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16, way=3, tree height=9, wots_len=51 +Running 10 iterations. +thash avg. 0.44 us (0.00 sec); median 967 cycles, 1x: 967 cycles +f1600x avg. 1.16 us (0.00 sec); median 2,253 cycles, 1x: 2,253 cycles +thashx avg. 1.19 us (0.00 sec); median 2,355 cycles, 1x: 2,355 cycles +Generating keypair.. avg. 169489.10 us (0.17 sec); median 333,807,526 cycles, 1x: 333,807,526 cycles + - WOTS pk gen x (ideal).. avg. 988.83 us (0.00 sec); median 1,950,069 cycles, 170x: 331,511,730 cycles + - WOTS pk gen x (real).. avg. 1001.52 us (0.00 sec); median 1,950,565 cycles, 171x: 333,546,615 cycles +Signing.. avg. 1530686.82 us (1.53 sec); median 3,046,773,815 cycles, 1x: 3,046,773,815 cycles + - FORS signing.. avg. 358313.93 us (0.36 sec); median 709,601,874 cycles, 1x: 709,601,874 cycles + - WOTS pk gen x (ideal).. avg. 1012.01 us (0.00 sec); median 1,950,388 cycles, 1194x: 2,328,763,272 cycles + - WOTS pk gen x (real).. avg. 1015.80 us (0.00 sec); median 1,950,531 cycles, 1197x: 2,334,785,607 cycles +Verifying.. avg. 
1290.22 us (0.00 sec); median 2,522,451 cycles, 1x: 2,522,451 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192s-simple_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192s-simple_x4 new file mode 100644 index 0000000..a1bc5f4 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192s-simple_x4 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16, way=4, tree height=9, wots_len=51 +Running 10 iterations. +thash avg. 0.49 us (0.00 sec); median 969 cycles, 1x: 969 cycles +f1600x avg. 1.23 us (0.00 sec); median 2,202 cycles, 1x: 2,202 cycles +thashx avg. 1.33 us (0.00 sec); median 2,339 cycles, 1x: 2,339 cycles +Generating keypair.. avg. 134644.93 us (0.13 sec); median 248,947,468 cycles, 1x: 248,947,468 cycles + - WOTS pk gen x (ideal).. avg. 1075.95 us (0.00 sec); median 1,938,003 cycles, 128x: 248,064,384 cycles + - WOTS pk gen x (real).. avg. 1058.37 us (0.00 sec); median 1,937,913 cycles, 128x: 248,052,864 cycles +Signing.. avg. 1189287.19 us (1.19 sec); median 2,236,813,122 cycles, 1x: 2,236,813,122 cycles + - FORS signing.. avg. 258842.97 us (0.26 sec); median 494,115,280 cycles, 1x: 494,115,280 cycles + - WOTS pk gen x (ideal).. avg. 1048.16 us (0.00 sec); median 1,938,224 cycles, 896x: 1,736,648,704 cycles + - WOTS pk gen x (real).. avg. 1021.08 us (0.00 sec); median 1,938,316 cycles, 896x: 1,736,731,136 cycles +Verifying.. avg. 
1139.32 us (0.00 sec); median 2,188,690 cycles, 1x: 2,188,690 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192s-simple_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192s-simple_x5 new file mode 100644 index 0000000..a20cd11 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-192s-simple_x5 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16, way=5, tree height=9, wots_len=51 +Running 10 iterations. +thash avg. 0.45 us (0.00 sec); median 968 cycles, 1x: 968 cycles +f1600x avg. 1.28 us (0.00 sec); median 2,191 cycles, 1x: 2,191 cycles +thashx avg. 1.35 us (0.00 sec); median 2,350 cycles, 1x: 2,350 cycles +Generating keypair.. avg. 113705.87 us (0.11 sec); median 202,368,254 cycles, 1x: 202,368,254 cycles + - WOTS pk gen x (ideal).. avg. 1105.14 us (0.00 sec); median 1,954,561 cycles, 102x: 199,365,222 cycles + - WOTS pk gen x (real).. avg. 1100.95 us (0.00 sec); median 1,954,162 cycles, 103x: 201,278,686 cycles +Signing.. avg. 1079708.93 us (1.08 sec); median 1,965,081,421 cycles, 1x: 1,965,081,421 cycles + - FORS signing.. avg. 289427.83 us (0.29 sec); median 548,708,951 cycles, 1x: 548,708,951 cycles + - WOTS pk gen x (ideal).. avg. 1102.14 us (0.00 sec); median 1,957,617 cycles, 716x: 1,401,653,772 cycles + - WOTS pk gen x (real).. avg. 1112.71 us (0.00 sec); median 1,955,586 cycles, 721x: 1,409,977,506 cycles +Verifying.. avg. 
1023.26 us (0.00 sec); median 1,832,348 cycles, 1x: 1,832,348 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256f-robust_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256f-robust_x3 new file mode 100644 index 0000000..3d5040c --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256f-robust_x3 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16, way=3, tree height=4, wots_len=67 +Running 10 iterations. +thash avg. 0.95 us (0.00 sec); median 1,892 cycles, 1x: 1,892 cycles +f1600x avg. 1.17 us (0.00 sec); median 2,255 cycles, 1x: 2,255 cycles +thashx avg. 2.42 us (0.00 sec); median 4,678 cycles, 1x: 4,678 cycles +Generating keypair.. avg. 15101.74 us (0.02 sec); median 29,705,006 cycles, 1x: 29,705,006 cycles + - WOTS pk gen x (ideal).. avg. 2451.51 us (0.00 sec); median 4,947,781 cycles, 5x: 24,738,905 cycles + - WOTS pk gen x (real).. avg. 2447.60 us (0.00 sec); median 4,945,599 cycles, 6x: 29,673,594 cycles +Signing.. avg. 295312.27 us (0.30 sec); median 581,682,926 cycles, 1x: 581,682,926 cycles + - FORS signing.. avg. 36836.39 us (0.04 sec); median 76,502,039 cycles, 1x: 76,502,039 cycles + - WOTS pk gen x (ideal).. avg. 2460.47 us (0.00 sec); median 4,947,034 cycles, 90x: 445,233,060 cycles + - WOTS pk gen x (real).. avg. 2488.29 us (0.00 sec); median 4,946,454 cycles, 102x: 504,538,308 cycles +Verifying.. avg. 
7847.23 us (0.01 sec); median 15,651,137 cycles, 1x: 15,651,137 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256f-robust_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256f-robust_x4 new file mode 100644 index 0000000..db6f775 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256f-robust_x4 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16, way=4, tree height=4, wots_len=67 +Running 10 iterations. +thash avg. 0.97 us (0.00 sec); median 1,895 cycles, 1x: 1,895 cycles +f1600x avg. 1.21 us (0.00 sec); median 2,204 cycles, 1x: 2,204 cycles +thashx avg. 2.48 us (0.00 sec); median 4,638 cycles, 1x: 4,638 cycles +Generating keypair.. avg. 10434.20 us (0.01 sec); median 19,683,800 cycles, 1x: 19,683,800 cycles + - WOTS pk gen x (ideal).. avg. 2648.97 us (0.00 sec); median 4,919,419 cycles, 4x: 19,677,676 cycles + - WOTS pk gen x (real).. avg. 2653.88 us (0.00 sec); median 4,919,992 cycles, 4x: 19,679,968 cycles +Signing.. avg. 207252.62 us (0.21 sec); median 387,574,928 cycles, 1x: 387,574,928 cycles + - FORS signing.. avg. 28029.11 us (0.03 sec); median 52,773,569 cycles, 1x: 52,773,569 cycles + - WOTS pk gen x (ideal).. avg. 2669.13 us (0.00 sec); median 4,921,764 cycles, 68x: 334,679,952 cycles + - WOTS pk gen x (real).. avg. 2609.14 us (0.00 sec); median 4,919,753 cycles, 68x: 334,543,204 cycles +Verifying.. avg. 
6236.55 us (0.01 sec); median 11,783,355 cycles, 1x: 11,783,355 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256f-robust_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256f-robust_x5 new file mode 100644 index 0000000..67e1870 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256f-robust_x5 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16, way=5, tree height=4, wots_len=67 +Running 10 iterations. +thash avg. 0.91 us (0.00 sec); median 1,888 cycles, 1x: 1,888 cycles +f1600x avg. 1.24 us (0.00 sec); median 2,194 cycles, 1x: 2,194 cycles +thashx avg. 2.66 us (0.00 sec); median 4,700 cycles, 1x: 4,700 cycles +Generating keypair.. avg. 11460.62 us (0.01 sec); median 19,992,349 cycles, 1x: 19,992,349 cycles + - WOTS pk gen x (ideal).. avg. 2777.97 us (0.00 sec); median 4,988,277 cycles, 3x: 14,964,831 cycles + - WOTS pk gen x (real).. avg. 2764.70 us (0.00 sec); median 4,988,430 cycles, 4x: 19,953,720 cycles +Signing.. avg. 223428.71 us (0.22 sec); median 400,789,356 cycles, 1x: 400,789,356 cycles + - FORS signing.. avg. 30141.61 us (0.03 sec); median 60,549,299 cycles, 1x: 60,549,299 cycles + - WOTS pk gen x (ideal).. avg. 2650.76 us (0.00 sec); median 4,980,353 cycles, 54x: 268,939,062 cycles + - WOTS pk gen x (real).. avg. 2645.22 us (0.00 sec); median 4,982,744 cycles, 68x: 338,826,592 cycles +Verifying.. avg. 
5213.78 us (0.01 sec); median 9,958,517 cycles, 1x: 9,958,517 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256f-simple_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256f-simple_x3 new file mode 100644 index 0000000..13011fb --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256f-simple_x3 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16, way=3, tree height=4, wots_len=67 +Running 10 iterations. +thash avg. 0.48 us (0.00 sec); median 981 cycles, 1x: 981 cycles +f1600x avg. 1.15 us (0.00 sec); median 2,252 cycles, 1x: 2,252 cycles +thashx avg. 1.21 us (0.00 sec); median 2,373 cycles, 1x: 2,373 cycles +Generating keypair.. avg. 7859.78 us (0.01 sec); median 15,548,942 cycles, 1x: 15,548,942 cycles + - WOTS pk gen x (ideal).. avg. 1290.95 us (0.00 sec); median 2,589,496 cycles, 5x: 12,947,480 cycles + - WOTS pk gen x (real).. avg. 1279.44 us (0.00 sec); median 2,588,393 cycles, 6x: 15,530,358 cycles +Signing.. avg. 156297.88 us (0.16 sec); median 310,249,907 cycles, 1x: 310,249,907 cycles + - FORS signing.. avg. 22801.18 us (0.02 sec); median 45,704,589 cycles, 1x: 45,704,589 cycles + - WOTS pk gen x (ideal).. avg. 1273.35 us (0.00 sec); median 2,590,221 cycles, 90x: 233,119,890 cycles + - WOTS pk gen x (real).. avg. 1280.64 us (0.00 sec); median 2,589,873 cycles, 102x: 264,167,046 cycles +Verifying.. avg. 
3759.13 us (0.00 sec); median 7,661,734 cycles, 1x: 7,661,734 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256f-simple_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256f-simple_x4 new file mode 100644 index 0000000..4b8c19b --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256f-simple_x4 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16, way=4, tree height=4, wots_len=67 +Running 10 iterations. +thash avg. 0.46 us (0.00 sec); median 979 cycles, 1x: 979 cycles +f1600x avg. 1.16 us (0.00 sec); median 2,197 cycles, 1x: 2,197 cycles +thashx avg. 1.24 us (0.00 sec); median 2,368 cycles, 1x: 2,368 cycles +Generating keypair.. avg. 5376.56 us (0.01 sec); median 10,353,536 cycles, 1x: 10,353,536 cycles + - WOTS pk gen x (ideal).. avg. 1332.67 us (0.00 sec); median 2,592,520 cycles, 4x: 10,370,080 cycles + - WOTS pk gen x (real).. avg. 1335.31 us (0.00 sec); median 2,592,175 cycles, 4x: 10,368,700 cycles +Signing.. avg. 108908.20 us (0.11 sec); median 208,520,384 cycles, 1x: 208,520,384 cycles + - FORS signing.. avg. 17222.40 us (0.02 sec); median 32,298,949 cycles, 1x: 32,298,949 cycles + - WOTS pk gen x (ideal).. avg. 1446.35 us (0.00 sec); median 2,594,956 cycles, 68x: 176,457,008 cycles + - WOTS pk gen x (real).. avg. 1401.02 us (0.00 sec); median 2,594,309 cycles, 68x: 176,413,012 cycles +Verifying.. avg. 
3108.29 us (0.00 sec); median 5,907,737 cycles, 1x: 5,907,737 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256f-simple_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256f-simple_x5 new file mode 100644 index 0000000..47a64e4 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256f-simple_x5 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16, way=5, tree height=4, wots_len=67 +Running 10 iterations. +thash avg. 0.49 us (0.00 sec); median 979 cycles, 1x: 979 cycles +f1600x avg. 1.21 us (0.00 sec); median 2,191 cycles, 1x: 2,191 cycles +thashx avg. 1.35 us (0.00 sec); median 2,406 cycles, 1x: 2,406 cycles +Generating keypair.. avg. 5880.69 us (0.01 sec); median 10,546,397 cycles, 1x: 10,546,397 cycles + - WOTS pk gen x (ideal).. avg. 1484.82 us (0.00 sec); median 2,629,569 cycles, 3x: 7,888,707 cycles + - WOTS pk gen x (real).. avg. 1480.67 us (0.00 sec); median 2,632,131 cycles, 4x: 10,528,524 cycles +Signing.. avg. 120310.58 us (0.12 sec); median 214,826,357 cycles, 1x: 214,826,357 cycles + - FORS signing.. avg. 18699.71 us (0.02 sec); median 35,290,463 cycles, 1x: 35,290,463 cycles + - WOTS pk gen x (ideal).. avg. 1493.97 us (0.00 sec); median 2,630,548 cycles, 54x: 142,049,592 cycles + - WOTS pk gen x (real).. avg. 1521.93 us (0.00 sec); median 2,630,102 cycles, 68x: 178,846,936 cycles +Verifying.. avg. 
2803.45 us (0.00 sec); median 4,984,567 cycles, 1x: 4,984,567 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256s-robust_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256s-robust_x3 new file mode 100644 index 0000000..16bb267 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256s-robust_x3 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16, way=3, tree height=8, wots_len=67 +Running 10 iterations. +thash avg. 0.88 us (0.00 sec); median 1,892 cycles, 1x: 1,892 cycles +f1600x avg. 1.08 us (0.00 sec); median 2,251 cycles, 1x: 2,251 cycles +thashx avg. 2.26 us (0.00 sec); median 4,670 cycles, 1x: 4,670 cycles +Generating keypair.. avg. 214223.10 us (0.21 sec); median 425,169,736 cycles, 1x: 425,169,736 cycles + - WOTS pk gen x (ideal).. avg. 2581.21 us (0.00 sec); median 4,938,528 cycles, 85x: 419,774,880 cycles + - WOTS pk gen x (real).. avg. 2555.26 us (0.00 sec); median 4,935,740 cycles, 86x: 424,473,640 cycles +Signing.. avg. 2498570.22 us (2.50 sec); median 4,938,544,715 cycles, 1x: 4,938,544,715 cycles + - FORS signing.. avg. 774240.60 us (0.77 sec); median 1,536,612,141 cycles, 1x: 1,536,612,141 cycles + - WOTS pk gen x (ideal).. avg. 2536.56 us (0.00 sec); median 4,939,980 cycles, 682x: 3,369,066,360 cycles + - WOTS pk gen x (real).. avg. 2543.00 us (0.00 sec); median 4,939,889 cycles, 688x: 3,398,643,632 cycles +Verifying.. avg. 
3753.09 us (0.00 sec); median 7,309,097 cycles, 1x: 7,309,097 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256s-robust_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256s-robust_x4 new file mode 100644 index 0000000..0016a1d --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256s-robust_x4 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16, way=4, tree height=8, wots_len=67 +Running 10 iterations. +thash avg. 0.95 us (0.00 sec); median 1,892 cycles, 1x: 1,892 cycles +f1600x avg. 1.23 us (0.00 sec); median 2,202 cycles, 1x: 2,202 cycles +thashx avg. 2.56 us (0.00 sec); median 4,647 cycles, 1x: 4,647 cycles +Generating keypair.. avg. 165571.30 us (0.17 sec); median 314,700,197 cycles, 1x: 314,700,197 cycles + - WOTS pk gen x (ideal).. avg. 2750.16 us (0.00 sec); median 4,918,751 cycles, 64x: 314,800,064 cycles + - WOTS pk gen x (real).. avg. 2681.45 us (0.00 sec); median 4,921,729 cycles, 64x: 314,990,656 cycles +Signing.. avg. 1907446.99 us (1.91 sec); median 3,576,259,423 cycles, 1x: 3,576,259,423 cycles + - FORS signing.. avg. 565476.81 us (0.57 sec); median 1,057,483,787 cycles, 1x: 1,057,483,787 cycles + - WOTS pk gen x (ideal).. avg. 2619.80 us (0.00 sec); median 4,914,577 cycles, 512x: 2,516,263,424 cycles + - WOTS pk gen x (real).. avg. 2623.45 us (0.00 sec); median 4,914,512 cycles, 512x: 2,516,230,144 cycles +Verifying.. avg. 
3146.55 us (0.00 sec); median 5,899,953 cycles, 1x: 5,899,953 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256s-robust_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256s-robust_x5 new file mode 100644 index 0000000..6d2a711 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256s-robust_x5 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16, way=5, tree height=8, wots_len=67 +Running 10 iterations. +thash avg. 0.94 us (0.00 sec); median 1,892 cycles, 1x: 1,892 cycles +f1600x avg. 1.33 us (0.00 sec); median 2,235 cycles, 1x: 2,235 cycles +thashx avg. 2.66 us (0.00 sec); median 4,705 cycles, 1x: 4,705 cycles +Generating keypair.. avg. 146207.14 us (0.15 sec); median 259,913,143 cycles, 1x: 259,913,143 cycles + - WOTS pk gen x (ideal).. avg. 2664.36 us (0.00 sec); median 4,989,490 cycles, 51x: 254,463,990 cycles + - WOTS pk gen x (real).. avg. 2656.62 us (0.00 sec); median 4,982,373 cycles, 52x: 259,083,396 cycles +Signing.. avg. 1787926.60 us (1.79 sec); median 3,296,102,723 cycles, 1x: 3,296,102,723 cycles + - FORS signing.. avg. 632197.71 us (0.63 sec); median 1,216,879,242 cycles, 1x: 1,216,879,242 cycles + - WOTS pk gen x (ideal).. avg. 2836.67 us (0.00 sec); median 4,979,772 cycles, 409x: 2,036,726,748 cycles + - WOTS pk gen x (real).. avg. 2862.43 us (0.00 sec); median 4,977,077 cycles, 416x: 2,070,464,032 cycles +Verifying.. avg. 
2794.29 us (0.00 sec); median 4,987,382 cycles, 1x: 4,987,382 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256s-simple_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256s-simple_x3 new file mode 100644 index 0000000..d6e7066 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256s-simple_x3 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16, way=3, tree height=8, wots_len=67 +Running 10 iterations. +thash avg. 0.44 us (0.00 sec); median 980 cycles, 1x: 980 cycles +f1600x avg. 1.07 us (0.00 sec); median 2,255 cycles, 1x: 2,255 cycles +thashx avg. 1.14 us (0.00 sec); median 2,376 cycles, 1x: 2,376 cycles +Generating keypair.. avg. 110990.41 us (0.11 sec); median 222,840,461 cycles, 1x: 222,840,461 cycles + - WOTS pk gen x (ideal).. avg. 1296.33 us (0.00 sec); median 2,589,995 cycles, 85x: 220,149,575 cycles + - WOTS pk gen x (real).. avg. 1292.46 us (0.00 sec); median 2,588,662 cycles, 86x: 222,624,932 cycles +Signing.. avg. 1365487.55 us (1.37 sec); median 2,703,101,883 cycles, 1x: 2,703,101,883 cycles + - FORS signing.. avg. 464433.31 us (0.46 sec); median 919,915,569 cycles, 1x: 919,915,569 cycles + - WOTS pk gen x (ideal).. avg. 1310.43 us (0.00 sec); median 2,589,644 cycles, 682x: 1,766,137,208 cycles + - WOTS pk gen x (real).. avg. 1298.10 us (0.00 sec); median 2,589,354 cycles, 688x: 1,781,475,552 cycles +Verifying.. avg. 
1894.75 us (0.00 sec); median 3,774,945 cycles, 1x: 3,774,945 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256s-simple_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256s-simple_x4 new file mode 100644 index 0000000..23c5aca --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256s-simple_x4 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16, way=4, tree height=8, wots_len=67 +Running 10 iterations. +thash avg. 0.47 us (0.00 sec); median 980 cycles, 1x: 980 cycles +f1600x avg. 1.23 us (0.00 sec); median 2,200 cycles, 1x: 2,200 cycles +thashx avg. 1.28 us (0.00 sec); median 2,363 cycles, 1x: 2,363 cycles +Generating keypair.. avg. 90357.02 us (0.09 sec); median 165,960,425 cycles, 1x: 165,960,425 cycles + - WOTS pk gen x (ideal).. avg. 1466.24 us (0.00 sec); median 2,585,625 cycles, 64x: 165,480,000 cycles + - WOTS pk gen x (real).. avg. 1488.71 us (0.00 sec); median 2,585,956 cycles, 64x: 165,501,184 cycles +Signing.. avg. 1055018.60 us (1.06 sec); median 1,973,799,252 cycles, 1x: 1,973,799,252 cycles + - FORS signing.. avg. 344158.53 us (0.34 sec); median 645,842,254 cycles, 1x: 645,842,254 cycles + - WOTS pk gen x (ideal).. avg. 1400.21 us (0.00 sec); median 2,585,248 cycles, 512x: 1,323,646,976 cycles + - WOTS pk gen x (real).. avg. 1433.25 us (0.00 sec); median 2,585,220 cycles, 512x: 1,323,632,640 cycles +Verifying.. avg. 
1643.26 us (0.00 sec); median 2,965,882 cycles, 1x: 2,965,882 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256s-simple_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256s-simple_x5 new file mode 100644 index 0000000..24a4584 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_A78/sphincs-shake-256s-simple_x5 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16, way=5, tree height=8, wots_len=67 +Running 10 iterations. +thash avg. 0.47 us (0.00 sec); median 981 cycles, 1x: 981 cycles +f1600x avg. 1.32 us (0.00 sec); median 2,198 cycles, 1x: 2,198 cycles +thashx avg. 1.43 us (0.00 sec); median 2,454 cycles, 1x: 2,454 cycles +Generating keypair.. avg. 77539.34 us (0.08 sec); median 136,981,277 cycles, 1x: 136,981,277 cycles + - WOTS pk gen x (ideal).. avg. 1459.17 us (0.00 sec); median 2,639,408 cycles, 51x: 134,609,808 cycles + - WOTS pk gen x (real).. avg. 1478.98 us (0.00 sec); median 2,638,031 cycles, 52x: 137,177,612 cycles +Signing.. avg. 976538.59 us (0.98 sec); median 1,804,459,287 cycles, 1x: 1,804,459,287 cycles + - FORS signing.. avg. 370482.78 us (0.37 sec); median 708,248,191 cycles, 1x: 708,248,191 cycles + - WOTS pk gen x (ideal).. avg. 1420.29 us (0.00 sec); median 2,625,707 cycles, 409x: 1,073,914,163 cycles + - WOTS pk gen x (real).. avg. 1401.64 us (0.00 sec); median 2,628,539 cycles, 416x: 1,093,472,224 cycles +Verifying.. avg. 
1345.34 us (0.00 sec); median 2,550,352 cycles, 1x: 2,550,352 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128f-robust_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128f-robust_x3 new file mode 100644 index 0000000..b663a20 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128f-robust_x3 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16, way=3, tree height=3, wots_len=35 +Running 10 iterations. +thash avg. 0.63 us (0.00 sec); median 1,686 cycles, 1x: 1,686 cycles +f1600x avg. 0.57 us (0.00 sec); median 1,504 cycles, 1x: 1,504 cycles +thashx avg. 1.20 us (0.00 sec); median 3,129 cycles, 1x: 3,129 cycles +Generating keypair.. avg. 1994.82 us (0.00 sec); median 5,159,864 cycles, 1x: 5,159,864 cycles + - WOTS pk gen x (ideal).. avg. 662.62 us (0.00 sec); median 1,719,728 cycles, 2x: 3,439,456 cycles + - WOTS pk gen x (real).. avg. 666.46 us (0.00 sec); median 1,718,835 cycles, 3x: 5,156,505 cycles +Signing.. avg. 46880.46 us (0.05 sec); median 120,640,203 cycles, 1x: 120,640,203 cycles + - FORS signing.. avg. 2709.60 us (0.00 sec); median 7,058,509 cycles, 1x: 7,058,509 cycles + - WOTS pk gen x (ideal).. avg. 668.91 us (0.00 sec); median 1,718,329 cycles, 58x: 99,663,082 cycles + - WOTS pk gen x (real).. avg. 672.50 us (0.00 sec); median 1,718,018 cycles, 66x: 113,389,188 cycles +Verifying.. avg. 
2680.63 us (0.00 sec); median 6,906,277 cycles, 1x: 6,906,277 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128f-robust_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128f-robust_x4 new file mode 100644 index 0000000..331a261 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128f-robust_x4 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16, way=4, tree height=3, wots_len=35 +Running 10 iterations. +thash avg. 0.65 us (0.00 sec); median 1,688 cycles, 1x: 1,688 cycles +f1600x avg. 0.59 us (0.00 sec); median 1,523 cycles, 1x: 1,523 cycles +thashx avg. 1.29 us (0.00 sec); median 3,169 cycles, 1x: 3,169 cycles +Generating keypair.. avg. 1416.18 us (0.00 sec); median 3,491,100 cycles, 1x: 3,491,100 cycles + - WOTS pk gen x (ideal).. avg. 716.97 us (0.00 sec); median 1,741,063 cycles, 2x: 3,482,126 cycles + - WOTS pk gen x (real).. avg. 701.83 us (0.00 sec); median 1,741,273 cycles, 2x: 3,482,546 cycles +Signing.. avg. 32916.85 us (0.03 sec); median 81,197,917 cycles, 1x: 81,197,917 cycles + - FORS signing.. avg. 1778.61 us (0.00 sec); median 4,382,377 cycles, 1x: 4,382,377 cycles + - WOTS pk gen x (ideal).. avg. 705.96 us (0.00 sec); median 1,739,402 cycles, 44x: 76,533,688 cycles + - WOTS pk gen x (real).. avg. 707.42 us (0.00 sec); median 1,740,622 cycles, 44x: 76,587,368 cycles +Verifying.. avg. 
2356.29 us (0.00 sec); median 5,880,952 cycles, 1x: 5,880,952 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128f-robust_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128f-robust_x5 new file mode 100644 index 0000000..08b45c6 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128f-robust_x5 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16, way=5, tree height=3, wots_len=35 +Running 10 iterations. +thash avg. 0.64 us (0.00 sec); median 1,684 cycles, 1x: 1,684 cycles +f1600x avg. 0.84 us (0.00 sec); median 2,165 cycles, 1x: 2,165 cycles +thashx avg. 1.78 us (0.00 sec); median 4,482 cycles, 1x: 4,482 cycles +Generating keypair.. avg. 1972.49 us (0.00 sec); median 4,933,961 cycles, 1x: 4,933,961 cycles + - WOTS pk gen x (ideal).. avg. 977.69 us (0.00 sec); median 2,460,990 cycles, 1x: 2,460,990 cycles + - WOTS pk gen x (real).. avg. 986.27 us (0.00 sec); median 2,461,033 cycles, 2x: 4,922,066 cycles +Signing.. avg. 45978.53 us (0.05 sec); median 115,107,401 cycles, 1x: 115,107,401 cycles + - FORS signing.. avg. 2526.69 us (0.00 sec); median 6,553,713 cycles, 1x: 6,553,713 cycles + - WOTS pk gen x (ideal).. avg. 995.70 us (0.00 sec); median 2,461,892 cycles, 35x: 86,166,220 cycles + - WOTS pk gen x (real).. avg. 989.77 us (0.00 sec); median 2,461,418 cycles, 44x: 108,302,392 cycles +Verifying.. avg. 
2645.78 us (0.00 sec); median 6,620,376 cycles, 1x: 6,620,376 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128f-simple_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128f-simple_x3 new file mode 100644 index 0000000..877321a --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128f-simple_x3 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16, way=3, tree height=3, wots_len=35 +Running 10 iterations. +thash avg. 0.32 us (0.00 sec); median 858 cycles, 1x: 858 cycles +f1600x avg. 0.56 us (0.00 sec); median 1,510 cycles, 1x: 1,510 cycles +thashx avg. 0.59 us (0.00 sec); median 1,572 cycles, 1x: 1,572 cycles +Generating keypair.. avg. 998.49 us (0.00 sec); median 2,673,442 cycles, 1x: 2,673,442 cycles + - WOTS pk gen x (ideal).. avg. 332.05 us (0.00 sec); median 887,957 cycles, 2x: 1,775,914 cycles + - WOTS pk gen x (real).. avg. 331.80 us (0.00 sec); median 888,137 cycles, 3x: 2,664,411 cycles +Signing.. avg. 23464.09 us (0.02 sec); median 62,973,301 cycles, 1x: 62,973,301 cycles + - FORS signing.. avg. 1547.58 us (0.00 sec); median 4,149,560 cycles, 1x: 4,149,560 cycles + - WOTS pk gen x (ideal).. avg. 332.05 us (0.00 sec); median 888,386 cycles, 58x: 51,526,388 cycles + - WOTS pk gen x (real).. avg. 332.11 us (0.00 sec); median 887,876 cycles, 66x: 58,599,816 cycles +Verifying.. avg. 
1343.44 us (0.00 sec); median 3,596,715 cycles, 1x: 3,596,715 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128f-simple_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128f-simple_x4 new file mode 100644 index 0000000..af339df --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128f-simple_x4 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16, way=4, tree height=3, wots_len=35 +Running 10 iterations. +thash avg. 0.32 us (0.00 sec); median 861 cycles, 1x: 861 cycles +f1600x avg. 0.57 us (0.00 sec); median 1,521 cycles, 1x: 1,521 cycles +thashx avg. 0.59 us (0.00 sec); median 1,586 cycles, 1x: 1,586 cycles +Generating keypair.. avg. 677.86 us (0.00 sec); median 1,807,615 cycles, 1x: 1,807,615 cycles + - WOTS pk gen x (ideal).. avg. 337.86 us (0.00 sec); median 900,235 cycles, 2x: 1,800,470 cycles + - WOTS pk gen x (real).. avg. 338.21 us (0.00 sec); median 899,844 cycles, 2x: 1,799,688 cycles +Signing.. avg. 15990.06 us (0.02 sec); median 42,460,536 cycles, 1x: 42,460,536 cycles + - FORS signing.. avg. 1022.13 us (0.00 sec); median 2,645,662 cycles, 1x: 2,645,662 cycles + - WOTS pk gen x (ideal).. avg. 353.59 us (0.00 sec); median 900,471 cycles, 44x: 39,620,724 cycles + - WOTS pk gen x (real).. avg. 352.68 us (0.00 sec); median 900,127 cycles, 44x: 39,605,588 cycles +Verifying.. avg. 
1117.47 us (0.00 sec); median 2,951,245 cycles, 1x: 2,951,245 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128f-simple_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128f-simple_x5 new file mode 100644 index 0000000..e6cca46 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128f-simple_x5 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16, way=5, tree height=3, wots_len=35 +Running 10 iterations. +thash avg. 0.32 us (0.00 sec); median 859 cycles, 1x: 859 cycles +f1600x avg. 0.81 us (0.00 sec); median 2,166 cycles, 1x: 2,166 cycles +thashx avg. 0.84 us (0.00 sec); median 2,245 cycles, 1x: 2,245 cycles +Generating keypair.. avg. 955.85 us (0.00 sec); median 2,552,448 cycles, 1x: 2,552,448 cycles + - WOTS pk gen x (ideal).. avg. 476.60 us (0.00 sec); median 1,272,880 cycles, 1x: 1,272,880 cycles + - WOTS pk gen x (real).. avg. 477.80 us (0.00 sec); median 1,272,730 cycles, 2x: 2,545,460 cycles +Signing.. avg. 22911.74 us (0.02 sec); median 59,986,118 cycles, 1x: 59,986,118 cycles + - FORS signing.. avg. 1424.80 us (0.00 sec); median 3,785,672 cycles, 1x: 3,785,672 cycles + - WOTS pk gen x (ideal).. avg. 495.92 us (0.00 sec); median 1,272,295 cycles, 35x: 44,530,325 cycles + - WOTS pk gen x (real).. avg. 502.21 us (0.00 sec); median 1,272,313 cycles, 44x: 55,981,772 cycles +Verifying.. avg. 
1319.77 us (0.00 sec); median 3,403,802 cycles, 1x: 3,403,802 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128s-robust_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128s-robust_x3 new file mode 100644 index 0000000..115e1a8 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128s-robust_x3 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16, way=3, tree height=9, wots_len=35 +Running 10 iterations. +thash avg. 0.71 us (0.00 sec); median 1,686 cycles, 1x: 1,686 cycles +f1600x avg. 0.64 us (0.00 sec); median 1,515 cycles, 1x: 1,515 cycles +thashx avg. 1.35 us (0.00 sec); median 3,142 cycles, 1x: 3,142 cycles +Generating keypair.. avg. 127996.08 us (0.13 sec); median 294,998,154 cycles, 1x: 294,998,154 cycles + - WOTS pk gen x (ideal).. avg. 754.66 us (0.00 sec); median 1,721,028 cycles, 170x: 292,574,760 cycles + - WOTS pk gen x (real).. avg. 747.26 us (0.00 sec); median 1,721,259 cycles, 171x: 294,335,289 cycles +Signing.. avg. 986189.78 us (0.99 sec); median 2,253,054,758 cycles, 1x: 2,253,054,758 cycles + - FORS signing.. avg. 82065.33 us (0.08 sec); median 189,333,623 cycles, 1x: 189,333,623 cycles + - WOTS pk gen x (ideal).. avg. 757.81 us (0.00 sec); median 1,721,084 cycles, 1194x: 2,054,974,296 cycles + - WOTS pk gen x (real).. avg. 762.98 us (0.00 sec); median 1,720,867 cycles, 1197x: 2,059,877,799 cycles +Verifying.. avg. 
1153.08 us (0.00 sec); median 2,636,579 cycles, 1x: 2,636,579 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128s-robust_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128s-robust_x4 new file mode 100644 index 0000000..a262cfb --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128s-robust_x4 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16, way=4, tree height=9, wots_len=35 +Running 10 iterations. +thash avg. 0.74 us (0.00 sec); median 1,693 cycles, 1x: 1,693 cycles +f1600x avg. 0.69 us (0.00 sec); median 1,524 cycles, 1x: 1,524 cycles +thashx avg. 1.48 us (0.00 sec); median 3,174 cycles, 1x: 3,174 cycles +Generating keypair.. avg. 103562.29 us (0.10 sec); median 223,778,149 cycles, 1x: 223,778,149 cycles + - WOTS pk gen x (ideal).. avg. 816.20 us (0.00 sec); median 1,744,109 cycles, 128x: 223,245,952 cycles + - WOTS pk gen x (real).. avg. 799.85 us (0.00 sec); median 1,744,222 cycles, 128x: 223,260,416 cycles +Signing.. avg. 779226.76 us (0.78 sec); median 1,681,495,897 cycles, 1x: 1,681,495,897 cycles + - FORS signing.. avg. 53644.16 us (0.05 sec); median 115,078,009 cycles, 1x: 115,078,009 cycles + - WOTS pk gen x (ideal).. avg. 810.97 us (0.00 sec); median 1,744,660 cycles, 896x: 1,563,215,360 cycles + - WOTS pk gen x (real).. avg. 810.91 us (0.00 sec); median 1,744,029 cycles, 896x: 1,562,649,984 cycles +Verifying.. avg. 
952.09 us (0.00 sec); median 2,138,932 cycles, 1x: 2,138,932 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128s-robust_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128s-robust_x5 new file mode 100644 index 0000000..7c46b33 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128s-robust_x5 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16, way=5, tree height=9, wots_len=35 +Running 10 iterations. +thash avg. 0.71 us (0.00 sec); median 1,692 cycles, 1x: 1,692 cycles +f1600x avg. 0.95 us (0.00 sec); median 2,165 cycles, 1x: 2,165 cycles +thashx avg. 2.02 us (0.00 sec); median 4,480 cycles, 1x: 4,480 cycles +Generating keypair.. avg. 115902.04 us (0.12 sec); median 254,266,351 cycles, 1x: 254,266,351 cycles + - WOTS pk gen x (ideal).. avg. 1126.61 us (0.00 sec); median 2,460,046 cycles, 102x: 250,924,692 cycles + - WOTS pk gen x (real).. avg. 1125.48 us (0.00 sec); median 2,459,842 cycles, 103x: 253,363,726 cycles +Signing.. avg. 898256.76 us (0.90 sec); median 1,956,789,909 cycles, 1x: 1,956,789,909 cycles + - FORS signing.. avg. 78339.66 us (0.08 sec); median 176,888,391 cycles, 1x: 176,888,391 cycles + - WOTS pk gen x (ideal).. avg. 1135.38 us (0.00 sec); median 2,461,041 cycles, 716x: 1,762,105,356 cycles + - WOTS pk gen x (real).. avg. 1144.10 us (0.00 sec); median 2,460,836 cycles, 721x: 1,774,262,756 cycles +Verifying.. avg. 
1134.31 us (0.00 sec); median 2,490,623 cycles, 1x: 2,490,623 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128s-simple_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128s-simple_x3 new file mode 100644 index 0000000..04b58bf --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128s-simple_x3 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16, way=3, tree height=9, wots_len=35 +Running 10 iterations. +thash avg. 0.33 us (0.00 sec); median 861 cycles, 1x: 861 cycles +f1600x avg. 0.58 us (0.00 sec); median 1,506 cycles, 1x: 1,506 cycles +thashx avg. 0.62 us (0.00 sec); median 1,569 cycles, 1x: 1,569 cycles +Generating keypair.. avg. 60780.79 us (0.06 sec); median 151,828,252 cycles, 1x: 151,828,252 cycles + - WOTS pk gen x (ideal).. avg. 361.86 us (0.00 sec); median 886,379 cycles, 170x: 150,684,430 cycles + - WOTS pk gen x (real).. avg. 352.45 us (0.00 sec); median 885,741 cycles, 171x: 151,461,711 cycles +Signing.. avg. 479287.70 us (0.48 sec); median 1,173,345,980 cycles, 1x: 1,173,345,980 cycles + - FORS signing.. avg. 44738.15 us (0.04 sec); median 110,308,073 cycles, 1x: 110,308,073 cycles + - WOTS pk gen x (ideal).. avg. 367.15 us (0.00 sec); median 886,402 cycles, 1194x: 1,058,363,988 cycles + - WOTS pk gen x (real).. avg. 367.30 us (0.00 sec); median 885,623 cycles, 1197x: 1,060,090,731 cycles +Verifying.. avg. 
524.84 us (0.00 sec); median 1,267,672 cycles, 1x: 1,267,672 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128s-simple_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128s-simple_x4 new file mode 100644 index 0000000..6dd90a3 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128s-simple_x4 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16, way=4, tree height=9, wots_len=35 +Running 10 iterations. +thash avg. 0.35 us (0.00 sec); median 860 cycles, 1x: 860 cycles +f1600x avg. 0.63 us (0.00 sec); median 1,524 cycles, 1x: 1,524 cycles +thashx avg. 0.68 us (0.00 sec); median 1,591 cycles, 1x: 1,591 cycles +Generating keypair.. avg. 50142.20 us (0.05 sec); median 115,615,079 cycles, 1x: 115,615,079 cycles + - WOTS pk gen x (ideal).. avg. 393.12 us (0.00 sec); median 903,298 cycles, 128x: 115,622,144 cycles + - WOTS pk gen x (real).. avg. 388.00 us (0.00 sec); median 902,497 cycles, 128x: 115,519,616 cycles +Signing.. avg. 382604.36 us (0.38 sec); median 878,556,315 cycles, 1x: 878,556,315 cycles + - FORS signing.. avg. 30383.08 us (0.03 sec); median 69,216,478 cycles, 1x: 69,216,478 cycles + - WOTS pk gen x (ideal).. avg. 397.17 us (0.00 sec); median 902,494 cycles, 896x: 808,634,624 cycles + - WOTS pk gen x (real).. avg. 400.38 us (0.00 sec); median 903,161 cycles, 896x: 809,232,256 cycles +Verifying.. avg. 
450.60 us (0.00 sec); median 1,049,578 cycles, 1x: 1,049,578 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128s-simple_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128s-simple_x5 new file mode 100644 index 0000000..f490af0 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-128s-simple_x5 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16, way=5, tree height=9, wots_len=35 +Running 10 iterations. +thash avg. 0.35 us (0.00 sec); median 858 cycles, 1x: 858 cycles +f1600x avg. 0.91 us (0.00 sec); median 2,164 cycles, 1x: 2,164 cycles +thashx avg. 0.96 us (0.00 sec); median 2,243 cycles, 1x: 2,243 cycles +Generating keypair.. avg. 56012.17 us (0.06 sec); median 131,382,964 cycles, 1x: 131,382,964 cycles + - WOTS pk gen x (ideal).. avg. 549.83 us (0.00 sec); median 1,271,084 cycles, 102x: 129,650,568 cycles + - WOTS pk gen x (real).. avg. 539.78 us (0.00 sec); median 1,270,915 cycles, 103x: 130,904,245 cycles +Signing.. avg. 442398.48 us (0.44 sec); median 1,021,779,424 cycles, 1x: 1,021,779,424 cycles + - FORS signing.. avg. 42827.96 us (0.04 sec); median 102,096,129 cycles, 1x: 102,096,129 cycles + - WOTS pk gen x (ideal).. avg. 542.78 us (0.00 sec); median 1,271,215 cycles, 716x: 910,189,940 cycles + - WOTS pk gen x (real).. avg. 564.42 us (0.00 sec); median 1,270,987 cycles, 721x: 916,381,627 cycles +Verifying.. avg. 
522.53 us (0.00 sec); median 1,212,917 cycles, 1x: 1,212,917 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192f-robust_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192f-robust_x3 new file mode 100644 index 0000000..e895ebd --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192f-robust_x3 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16, way=3, tree height=3, wots_len=51 +Running 10 iterations. +thash avg. 0.75 us (0.00 sec); median 1,704 cycles, 1x: 1,704 cycles +f1600x avg. 0.68 us (0.00 sec); median 1,517 cycles, 1x: 1,517 cycles +thashx avg. 1.44 us (0.00 sec); median 3,155 cycles, 1x: 3,155 cycles +Generating keypair.. avg. 3513.98 us (0.00 sec); median 7,606,740 cycles, 1x: 7,606,740 cycles + - WOTS pk gen x (ideal).. avg. 1170.80 us (0.00 sec); median 2,531,333 cycles, 2x: 5,062,666 cycles + - WOTS pk gen x (real).. avg. 1168.19 us (0.00 sec); median 2,530,130 cycles, 3x: 7,590,390 cycles +Signing.. avg. 89541.51 us (0.09 sec); median 195,493,421 cycles, 1x: 195,493,421 cycles + - FORS signing.. avg. 12495.46 us (0.01 sec); median 28,237,777 cycles, 1x: 28,237,777 cycles + - WOTS pk gen x (ideal).. avg. 1149.71 us (0.00 sec); median 2,529,642 cycles, 58x: 146,719,236 cycles + - WOTS pk gen x (real).. avg. 1173.85 us (0.00 sec); median 2,529,156 cycles, 66x: 166,924,296 cycles +Verifying.. avg. 
4603.80 us (0.00 sec); median 10,068,439 cycles, 1x: 10,068,439 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192f-robust_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192f-robust_x4 new file mode 100644 index 0000000..cf6a1b3 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192f-robust_x4 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16, way=4, tree height=3, wots_len=51 +Running 10 iterations. +thash avg. 0.76 us (0.00 sec); median 1,704 cycles, 1x: 1,704 cycles +f1600x avg. 0.73 us (0.00 sec); median 1,520 cycles, 1x: 1,520 cycles +thashx avg. 1.57 us (0.00 sec); median 3,217 cycles, 1x: 3,217 cycles +Generating keypair.. avg. 2488.74 us (0.00 sec); median 5,176,480 cycles, 1x: 5,176,480 cycles + - WOTS pk gen x (ideal).. avg. 1256.29 us (0.00 sec); median 2,581,561 cycles, 2x: 5,163,122 cycles + - WOTS pk gen x (real).. avg. 1242.85 us (0.00 sec); median 2,583,050 cycles, 2x: 5,166,100 cycles +Signing.. avg. 63068.88 us (0.06 sec); median 131,204,401 cycles, 1x: 131,204,401 cycles + - FORS signing.. avg. 8228.50 us (0.01 sec); median 17,296,741 cycles, 1x: 17,296,741 cycles + - WOTS pk gen x (ideal).. avg. 1252.17 us (0.00 sec); median 2,582,246 cycles, 44x: 113,618,824 cycles + - WOTS pk gen x (real).. avg. 1235.04 us (0.00 sec); median 2,581,944 cycles, 44x: 113,605,536 cycles +Verifying.. avg. 
3970.03 us (0.00 sec); median 8,462,964 cycles, 1x: 8,462,964 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192f-robust_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192f-robust_x5 new file mode 100644 index 0000000..7bdf18e --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192f-robust_x5 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16, way=5, tree height=3, wots_len=51 +Running 10 iterations. +thash avg. 0.74 us (0.00 sec); median 1,704 cycles, 1x: 1,704 cycles +f1600x avg. 0.98 us (0.00 sec); median 2,164 cycles, 1x: 2,164 cycles +thashx avg. 2.13 us (0.00 sec); median 4,512 cycles, 1x: 4,512 cycles +Generating keypair.. avg. 3392.52 us (0.00 sec); median 7,253,989 cycles, 1x: 7,253,989 cycles + - WOTS pk gen x (ideal).. avg. 1675.28 us (0.00 sec); median 3,621,170 cycles, 1x: 3,621,170 cycles + - WOTS pk gen x (real).. avg. 1702.40 us (0.00 sec); median 3,620,880 cycles, 2x: 7,241,760 cycles +Signing.. avg. 86181.84 us (0.09 sec); median 186,076,434 cycles, 1x: 186,076,434 cycles + - FORS signing.. avg. 11735.07 us (0.01 sec); median 26,462,683 cycles, 1x: 26,462,683 cycles + - WOTS pk gen x (ideal).. avg. 1687.28 us (0.00 sec); median 3,620,546 cycles, 35x: 126,719,110 cycles + - WOTS pk gen x (real).. avg. 1694.59 us (0.00 sec); median 3,620,346 cycles, 44x: 159,295,224 cycles +Verifying.. avg. 
4400.23 us (0.00 sec); median 9,464,352 cycles, 1x: 9,464,352 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192f-simple_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192f-simple_x3 new file mode 100644 index 0000000..b83feb6 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192f-simple_x3 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16, way=3, tree height=3, wots_len=51 +Running 10 iterations. +thash avg. 0.38 us (0.00 sec); median 862 cycles, 1x: 862 cycles +f1600x avg. 0.67 us (0.00 sec); median 1,506 cycles, 1x: 1,506 cycles +thashx avg. 0.74 us (0.00 sec); median 1,589 cycles, 1x: 1,589 cycles +Generating keypair.. avg. 1835.28 us (0.00 sec); median 3,951,354 cycles, 1x: 3,951,354 cycles + - WOTS pk gen x (ideal).. avg. 605.67 us (0.00 sec); median 1,314,171 cycles, 2x: 2,628,342 cycles + - WOTS pk gen x (real).. avg. 602.88 us (0.00 sec); median 1,314,638 cycles, 3x: 3,943,914 cycles +Signing.. avg. 47319.30 us (0.05 sec); median 103,369,370 cycles, 1x: 103,369,370 cycles + - FORS signing.. avg. 7305.20 us (0.01 sec); median 16,493,101 cycles, 1x: 16,493,101 cycles + - WOTS pk gen x (ideal).. avg. 602.58 us (0.00 sec); median 1,315,566 cycles, 58x: 76,302,828 cycles + - WOTS pk gen x (real).. avg. 610.89 us (0.00 sec); median 1,315,072 cycles, 66x: 86,794,752 cycles +Verifying.. avg. 
2391.14 us (0.00 sec); median 5,237,044 cycles, 1x: 5,237,044 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192f-simple_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192f-simple_x4 new file mode 100644 index 0000000..a6104e4 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192f-simple_x4 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16, way=4, tree height=3, wots_len=51 +Running 10 iterations. +thash avg. 0.38 us (0.00 sec); median 862 cycles, 1x: 862 cycles +f1600x avg. 0.72 us (0.00 sec); median 1,522 cycles, 1x: 1,522 cycles +thashx avg. 0.77 us (0.00 sec); median 1,614 cycles, 1x: 1,614 cycles +Generating keypair.. avg. 1298.71 us (0.00 sec); median 2,685,861 cycles, 1x: 2,685,861 cycles + - WOTS pk gen x (ideal).. avg. 650.60 us (0.00 sec); median 1,340,251 cycles, 2x: 2,680,502 cycles + - WOTS pk gen x (real).. avg. 651.10 us (0.00 sec); median 1,341,268 cycles, 2x: 2,682,536 cycles +Signing.. avg. 33540.78 us (0.03 sec); median 69,618,200 cycles, 1x: 69,618,200 cycles + - FORS signing.. avg. 5018.79 us (0.01 sec); median 10,550,263 cycles, 1x: 10,550,263 cycles + - WOTS pk gen x (ideal).. avg. 641.05 us (0.00 sec); median 1,339,518 cycles, 44x: 58,938,792 cycles + - WOTS pk gen x (real).. avg. 648.41 us (0.00 sec); median 1,338,622 cycles, 44x: 58,899,368 cycles +Verifying.. avg. 
1936.35 us (0.00 sec); median 4,121,831 cycles, 1x: 4,121,831 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192f-simple_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192f-simple_x5 new file mode 100644 index 0000000..fdd1118 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192f-simple_x5 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16, way=5, tree height=3, wots_len=51 +Running 10 iterations. +thash avg. 0.38 us (0.00 sec); median 863 cycles, 1x: 863 cycles +f1600x avg. 0.98 us (0.00 sec); median 2,161 cycles, 1x: 2,161 cycles +thashx avg. 1.06 us (0.00 sec); median 2,272 cycles, 1x: 2,272 cycles +Generating keypair.. avg. 1743.74 us (0.00 sec); median 3,771,980 cycles, 1x: 3,771,980 cycles + - WOTS pk gen x (ideal).. avg. 871.42 us (0.00 sec); median 1,882,479 cycles, 1x: 1,882,479 cycles + - WOTS pk gen x (real).. avg. 873.57 us (0.00 sec); median 1,882,799 cycles, 2x: 3,765,598 cycles +Signing.. avg. 45415.92 us (0.05 sec); median 98,343,618 cycles, 1x: 98,343,618 cycles + - FORS signing.. avg. 6801.73 us (0.01 sec); median 15,331,391 cycles, 1x: 15,331,391 cycles + - WOTS pk gen x (ideal).. avg. 881.15 us (0.00 sec); median 1,882,356 cycles, 35x: 65,882,460 cycles + - WOTS pk gen x (real).. avg. 878.58 us (0.00 sec); median 1,881,806 cycles, 44x: 82,799,464 cycles +Verifying.. avg. 
2167.85 us (0.00 sec); median 4,712,590 cycles, 1x: 4,712,590 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192s-robust_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192s-robust_x3 new file mode 100644 index 0000000..7a319b8 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192s-robust_x3 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16, way=3, tree height=9, wots_len=51 +Running 10 iterations. +thash avg. 0.80 us (0.00 sec); median 1,707 cycles, 1x: 1,707 cycles +f1600x avg. 0.70 us (0.00 sec); median 1,503 cycles, 1x: 1,503 cycles +thashx avg. 1.49 us (0.00 sec); median 3,159 cycles, 1x: 3,159 cycles +Generating keypair.. avg. 210683.17 us (0.21 sec); median 434,158,632 cycles, 1x: 434,158,632 cycles + - WOTS pk gen x (ideal).. avg. 1230.13 us (0.00 sec); median 2,536,143 cycles, 170x: 431,144,310 cycles + - WOTS pk gen x (real).. avg. 1220.40 us (0.00 sec); median 2,534,751 cycles, 171x: 433,442,421 cycles +Signing.. avg. 1915695.04 us (1.92 sec); median 3,968,698,530 cycles, 1x: 3,968,698,530 cycles + - FORS signing.. avg. 440091.72 us (0.44 sec); median 929,670,850 cycles, 1x: 929,670,850 cycles + - WOTS pk gen x (ideal).. avg. 1237.85 us (0.00 sec); median 2,532,746 cycles, 1194x: 3,024,098,724 cycles + - WOTS pk gen x (real).. avg. 1257.06 us (0.00 sec); median 2,535,561 cycles, 1197x: 3,035,066,517 cycles +Verifying.. avg. 
1720.76 us (0.00 sec); median 3,565,065 cycles, 1x: 3,565,065 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192s-robust_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192s-robust_x4 new file mode 100644 index 0000000..a5627c0 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192s-robust_x4 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16, way=4, tree height=9, wots_len=51 +Running 10 iterations. +thash avg. 0.80 us (0.00 sec); median 1,705 cycles, 1x: 1,705 cycles +f1600x avg. 0.76 us (0.00 sec); median 1,521 cycles, 1x: 1,521 cycles +thashx avg. 1.70 us (0.00 sec); median 3,231 cycles, 1x: 3,231 cycles +Generating keypair.. avg. 172483.18 us (0.17 sec); median 332,343,381 cycles, 1x: 332,343,381 cycles + - WOTS pk gen x (ideal).. avg. 1350.46 us (0.00 sec); median 2,593,661 cycles, 128x: 331,988,608 cycles + - WOTS pk gen x (real).. avg. 1341.57 us (0.00 sec); median 2,591,836 cycles, 128x: 331,755,008 cycles +Signing.. avg. 1500609.97 us (1.50 sec); median 2,894,654,585 cycles, 1x: 2,894,654,585 cycles + - FORS signing.. avg. 295673.83 us (0.30 sec); median 568,102,178 cycles, 1x: 568,102,178 cycles + - WOTS pk gen x (ideal).. avg. 1351.90 us (0.00 sec); median 2,593,259 cycles, 896x: 2,323,560,064 cycles + - WOTS pk gen x (real).. avg. 1341.35 us (0.00 sec); median 2,592,718 cycles, 896x: 2,323,075,328 cycles +Verifying.. avg. 
1498.29 us (0.00 sec); median 3,011,426 cycles, 1x: 3,011,426 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192s-robust_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192s-robust_x5 new file mode 100644 index 0000000..19bc98c --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192s-robust_x5 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16, way=5, tree height=9, wots_len=51 +Running 10 iterations. +thash avg. 0.79 us (0.00 sec); median 1,706 cycles, 1x: 1,706 cycles +f1600x avg. 1.05 us (0.00 sec); median 2,165 cycles, 1x: 2,165 cycles +thashx avg. 2.24 us (0.00 sec); median 4,510 cycles, 1x: 4,510 cycles +Generating keypair.. avg. 188580.57 us (0.19 sec); median 374,028,784 cycles, 1x: 374,028,784 cycles + - WOTS pk gen x (ideal).. avg. 1820.07 us (0.00 sec); median 3,620,548 cycles, 102x: 369,295,896 cycles + - WOTS pk gen x (real).. avg. 1857.30 us (0.00 sec); median 3,620,400 cycles, 103x: 372,901,200 cycles +Signing.. avg. 1765793.00 us (1.77 sec); median 3,484,984,625 cycles, 1x: 3,484,984,625 cycles + - FORS signing.. avg. 424802.99 us (0.42 sec); median 866,717,718 cycles, 1x: 866,717,718 cycles + - WOTS pk gen x (ideal).. avg. 1860.19 us (0.00 sec); median 3,620,296 cycles, 716x: 2,592,131,936 cycles + - WOTS pk gen x (real).. avg. 1885.86 us (0.00 sec); median 3,619,872 cycles, 721x: 2,609,927,712 cycles +Verifying.. avg. 
1683.58 us (0.00 sec); median 3,352,593 cycles, 1x: 3,352,593 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192s-simple_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192s-simple_x3 new file mode 100644 index 0000000..cbc7168 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192s-simple_x3 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16, way=3, tree height=9, wots_len=51 +Running 10 iterations. +thash avg. 0.38 us (0.00 sec); median 860 cycles, 1x: 860 cycles +f1600x avg. 0.68 us (0.00 sec); median 1,503 cycles, 1x: 1,503 cycles +thashx avg. 0.73 us (0.00 sec); median 1,584 cycles, 1x: 1,584 cycles +Generating keypair.. avg. 104189.11 us (0.10 sec); median 225,098,090 cycles, 1x: 225,098,090 cycles + - WOTS pk gen x (ideal).. avg. 611.34 us (0.00 sec); median 1,311,817 cycles, 170x: 223,008,890 cycles + - WOTS pk gen x (real).. avg. 612.37 us (0.00 sec); median 1,313,441 cycles, 171x: 224,598,411 cycles +Signing.. avg. 978437.82 us (0.98 sec); median 2,119,770,173 cycles, 1x: 2,119,770,173 cycles + - FORS signing.. avg. 248825.79 us (0.25 sec); median 544,109,983 cycles, 1x: 544,109,983 cycles + - WOTS pk gen x (ideal).. avg. 615.38 us (0.00 sec); median 1,312,229 cycles, 1194x: 1,566,801,426 cycles + - WOTS pk gen x (real).. avg. 632.66 us (0.00 sec); median 1,311,736 cycles, 1197x: 1,570,147,992 cycles +Verifying.. avg. 
833.02 us (0.00 sec); median 1,805,138 cycles, 1x: 1,805,138 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192s-simple_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192s-simple_x4 new file mode 100644 index 0000000..02eda97 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192s-simple_x4 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16, way=4, tree height=9, wots_len=51 +Running 10 iterations. +thash avg. 0.39 us (0.00 sec); median 860 cycles, 1x: 860 cycles +f1600x avg. 0.74 us (0.00 sec); median 1,521 cycles, 1x: 1,521 cycles +thashx avg. 0.81 us (0.00 sec); median 1,619 cycles, 1x: 1,619 cycles +Generating keypair.. avg. 84295.10 us (0.08 sec); median 171,762,388 cycles, 1x: 171,762,388 cycles + - WOTS pk gen x (ideal).. avg. 642.32 us (0.00 sec); median 1,341,172 cycles, 128x: 171,670,016 cycles + - WOTS pk gen x (real).. avg. 672.52 us (0.00 sec); median 1,340,193 cycles, 128x: 171,544,704 cycles +Signing.. avg. 757871.36 us (0.76 sec); median 1,548,172,881 cycles, 1x: 1,548,172,881 cycles + - FORS signing.. avg. 169688.00 us (0.17 sec); median 346,020,223 cycles, 1x: 346,020,223 cycles + - WOTS pk gen x (ideal).. avg. 660.35 us (0.00 sec); median 1,341,153 cycles, 896x: 1,201,673,088 cycles + - WOTS pk gen x (real).. avg. 670.59 us (0.00 sec); median 1,341,084 cycles, 896x: 1,201,611,264 cycles +Verifying.. avg. 
738.26 us (0.00 sec); median 1,553,205 cycles, 1x: 1,553,205 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192s-simple_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192s-simple_x5 new file mode 100644 index 0000000..95daadd --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-192s-simple_x5 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16, way=5, tree height=9, wots_len=51 +Running 10 iterations. +thash avg. 0.38 us (0.00 sec); median 861 cycles, 1x: 861 cycles +f1600x avg. 0.99 us (0.00 sec); median 2,161 cycles, 1x: 2,161 cycles +thashx avg. 1.05 us (0.00 sec); median 2,270 cycles, 1x: 2,270 cycles +Generating keypair.. avg. 93521.73 us (0.09 sec); median 194,197,329 cycles, 1x: 194,197,329 cycles + - WOTS pk gen x (ideal).. avg. 912.53 us (0.00 sec); median 1,882,653 cycles, 102x: 192,030,606 cycles + - WOTS pk gen x (real).. avg. 913.16 us (0.00 sec); median 1,882,814 cycles, 103x: 193,929,842 cycles +Signing.. avg. 893347.99 us (0.89 sec); median 1,860,741,462 cycles, 1x: 1,860,741,462 cycles + - FORS signing.. avg. 232949.73 us (0.23 sec); median 501,330,026 cycles, 1x: 501,330,026 cycles + - WOTS pk gen x (ideal).. avg. 907.64 us (0.00 sec); median 1,883,565 cycles, 716x: 1,348,632,540 cycles + - WOTS pk gen x (real).. avg. 928.98 us (0.00 sec); median 1,882,897 cycles, 721x: 1,357,568,737 cycles +Verifying.. avg. 
817.22 us (0.00 sec); median 1,749,406 cycles, 1x: 1,749,406 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256f-robust_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256f-robust_x3 new file mode 100644 index 0000000..54c854a --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256f-robust_x3 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16, way=3, tree height=4, wots_len=67 +Running 10 iterations. +thash avg. 0.82 us (0.00 sec); median 1,704 cycles, 1x: 1,704 cycles +f1600x avg. 0.74 us (0.00 sec); median 1,507 cycles, 1x: 1,507 cycles +thashx avg. 1.62 us (0.00 sec); median 3,176 cycles, 1x: 3,176 cycles +Generating keypair.. avg. 10357.24 us (0.01 sec); median 20,159,914 cycles, 1x: 20,159,914 cycles + - WOTS pk gen x (ideal).. avg. 1728.21 us (0.00 sec); median 3,355,149 cycles, 5x: 16,775,745 cycles + - WOTS pk gen x (real).. avg. 1732.29 us (0.00 sec); median 3,354,990 cycles, 6x: 20,129,940 cycles +Signing.. avg. 203446.72 us (0.20 sec); median 402,500,678 cycles, 1x: 402,500,678 cycles + - FORS signing.. avg. 29186.11 us (0.03 sec); median 59,687,266 cycles, 1x: 59,687,266 cycles + - WOTS pk gen x (ideal).. avg. 1694.87 us (0.00 sec); median 3,352,507 cycles, 90x: 301,725,630 cycles + - WOTS pk gen x (real).. avg. 1751.19 us (0.00 sec); median 3,353,020 cycles, 102x: 342,008,040 cycles +Verifying.. avg. 
5246.36 us (0.01 sec); median 10,506,621 cycles, 1x: 10,506,621 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256f-robust_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256f-robust_x4 new file mode 100644 index 0000000..ba619e2 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256f-robust_x4 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16, way=4, tree height=4, wots_len=67 +Running 10 iterations. +thash avg. 0.83 us (0.00 sec); median 1,702 cycles, 1x: 1,702 cycles +f1600x avg. 0.79 us (0.00 sec); median 1,523 cycles, 1x: 1,523 cycles +thashx avg. 1.74 us (0.00 sec); median 3,239 cycles, 1x: 3,239 cycles +Generating keypair.. avg. 7445.41 us (0.01 sec); median 13,745,898 cycles, 1x: 13,745,898 cycles + - WOTS pk gen x (ideal).. avg. 1849.02 us (0.00 sec); median 3,432,567 cycles, 4x: 13,730,268 cycles + - WOTS pk gen x (real).. avg. 1842.38 us (0.00 sec); median 3,433,069 cycles, 4x: 13,732,276 cycles +Signing.. avg. 144266.24 us (0.14 sec); median 271,043,001 cycles, 1x: 271,043,001 cycles + - FORS signing.. avg. 19648.88 us (0.02 sec); median 37,255,693 cycles, 1x: 37,255,693 cycles + - WOTS pk gen x (ideal).. avg. 1838.37 us (0.00 sec); median 3,434,001 cycles, 68x: 233,512,068 cycles + - WOTS pk gen x (real).. avg. 1829.95 us (0.00 sec); median 3,431,548 cycles, 68x: 233,345,264 cycles +Verifying.. avg. 
4419.86 us (0.00 sec); median 8,433,826 cycles, 1x: 8,433,826 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256f-robust_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256f-robust_x5 new file mode 100644 index 0000000..1d7e7b6 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256f-robust_x5 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16, way=5, tree height=4, wots_len=67 +Running 10 iterations. +thash avg. 0.79 us (0.00 sec); median 1,701 cycles, 1x: 1,701 cycles +f1600x avg. 1.10 us (0.00 sec); median 2,166 cycles, 1x: 2,166 cycles +thashx avg. 2.37 us (0.00 sec); median 4,537 cycles, 1x: 4,537 cycles +Generating keypair.. avg. 10012.13 us (0.01 sec); median 19,268,444 cycles, 1x: 19,268,444 cycles + - WOTS pk gen x (ideal).. avg. 2513.54 us (0.00 sec); median 4,803,886 cycles, 3x: 14,411,658 cycles + - WOTS pk gen x (real).. avg. 2497.03 us (0.00 sec); median 4,804,659 cycles, 4x: 19,218,636 cycles +Signing.. avg. 197572.25 us (0.20 sec); median 383,362,470 cycles, 1x: 383,362,470 cycles + - FORS signing.. avg. 27306.43 us (0.03 sec); median 55,843,829 cycles, 1x: 55,843,829 cycles + - WOTS pk gen x (ideal).. avg. 2506.85 us (0.00 sec); median 4,803,034 cycles, 54x: 259,363,836 cycles + - WOTS pk gen x (real).. avg. 2511.58 us (0.00 sec); median 4,802,788 cycles, 68x: 326,589,584 cycles +Verifying.. avg. 
4943.09 us (0.00 sec); median 9,617,272 cycles, 1x: 9,617,272 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256f-simple_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256f-simple_x3 new file mode 100644 index 0000000..d092fb1 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256f-simple_x3 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16, way=3, tree height=4, wots_len=67 +Running 10 iterations. +thash avg. 0.42 us (0.00 sec); median 867 cycles, 1x: 867 cycles +f1600x avg. 0.74 us (0.00 sec); median 1,506 cycles, 1x: 1,506 cycles +thashx avg. 0.78 us (0.00 sec); median 1,607 cycles, 1x: 1,607 cycles +Generating keypair.. avg. 5310.05 us (0.01 sec); median 10,520,690 cycles, 1x: 10,520,690 cycles + - WOTS pk gen x (ideal).. avg. 897.09 us (0.00 sec); median 1,755,038 cycles, 5x: 8,775,190 cycles + - WOTS pk gen x (real).. avg. 887.21 us (0.00 sec); median 1,755,081 cycles, 6x: 10,530,486 cycles +Signing.. avg. 108457.40 us (0.11 sec); median 213,846,431 cycles, 1x: 213,846,431 cycles + - FORS signing.. avg. 17045.57 us (0.02 sec); median 34,903,528 cycles, 1x: 34,903,528 cycles + - WOTS pk gen x (ideal).. avg. 899.93 us (0.00 sec); median 1,755,904 cycles, 90x: 158,031,360 cycles + - WOTS pk gen x (real).. avg. 887.35 us (0.00 sec); median 1,756,032 cycles, 102x: 179,115,264 cycles +Verifying.. avg. 
2699.30 us (0.00 sec); median 5,377,175 cycles, 1x: 5,377,175 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256f-simple_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256f-simple_x4 new file mode 100644 index 0000000..6467035 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256f-simple_x4 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16, way=4, tree height=4, wots_len=67 +Running 10 iterations. +thash avg. 0.43 us (0.00 sec); median 868 cycles, 1x: 868 cycles +f1600x avg. 0.80 us (0.00 sec); median 1,522 cycles, 1x: 1,522 cycles +thashx avg. 0.91 us (0.00 sec); median 1,656 cycles, 1x: 1,656 cycles +Generating keypair.. avg. 3945.02 us (0.00 sec); median 7,250,536 cycles, 1x: 7,250,536 cycles + - WOTS pk gen x (ideal).. avg. 958.65 us (0.00 sec); median 1,810,141 cycles, 4x: 7,240,564 cycles + - WOTS pk gen x (real).. avg. 979.66 us (0.00 sec); median 1,810,010 cycles, 4x: 7,240,040 cycles +Signing.. avg. 77276.78 us (0.08 sec); median 145,904,546 cycles, 1x: 145,904,546 cycles + - FORS signing.. avg. 11948.57 us (0.01 sec); median 22,650,485 cycles, 1x: 22,650,485 cycles + - WOTS pk gen x (ideal).. avg. 958.23 us (0.00 sec); median 1,810,061 cycles, 68x: 123,084,148 cycles + - WOTS pk gen x (real).. avg. 956.81 us (0.00 sec); median 1,810,666 cycles, 68x: 123,125,288 cycles +Verifying.. avg. 
2302.47 us (0.00 sec); median 4,392,535 cycles, 1x: 4,392,535 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256f-simple_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256f-simple_x5 new file mode 100644 index 0000000..6d1d201 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256f-simple_x5 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16, way=5, tree height=4, wots_len=67 +Running 10 iterations. +thash avg. 0.42 us (0.00 sec); median 875 cycles, 1x: 875 cycles +f1600x avg. 1.08 us (0.00 sec); median 2,164 cycles, 1x: 2,164 cycles +thashx avg. 1.18 us (0.00 sec); median 2,300 cycles, 1x: 2,300 cycles +Generating keypair.. avg. 5200.72 us (0.01 sec); median 10,063,932 cycles, 1x: 10,063,932 cycles + - WOTS pk gen x (ideal).. avg. 1312.35 us (0.00 sec); median 2,513,132 cycles, 3x: 7,539,396 cycles + - WOTS pk gen x (real).. avg. 1308.09 us (0.00 sec); median 2,512,719 cycles, 4x: 10,050,876 cycles +Signing.. avg. 105332.59 us (0.11 sec); median 203,539,493 cycles, 1x: 203,539,493 cycles + - FORS signing.. avg. 15850.91 us (0.02 sec); median 32,389,226 cycles, 1x: 32,389,226 cycles + - WOTS pk gen x (ideal).. avg. 1312.95 us (0.00 sec); median 2,511,740 cycles, 54x: 135,633,960 cycles + - WOTS pk gen x (real).. avg. 1329.98 us (0.00 sec); median 2,511,789 cycles, 68x: 170,801,652 cycles +Verifying.. avg. 
2516.81 us (0.00 sec); median 4,925,411 cycles, 1x: 4,925,411 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256s-robust_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256s-robust_x3 new file mode 100644 index 0000000..0932f22 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256s-robust_x3 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16, way=3, tree height=8, wots_len=67 +Running 10 iterations. +thash avg. 0.83 us (0.00 sec); median 1,696 cycles, 1x: 1,696 cycles +f1600x avg. 0.76 us (0.00 sec); median 1,507 cycles, 1x: 1,507 cycles +thashx avg. 1.62 us (0.00 sec); median 3,172 cycles, 1x: 3,172 cycles +Generating keypair.. avg. 152028.09 us (0.15 sec); median 289,200,267 cycles, 1x: 289,200,267 cycles + - WOTS pk gen x (ideal).. avg. 1769.94 us (0.00 sec); median 3,361,269 cycles, 85x: 285,707,865 cycles + - WOTS pk gen x (real).. avg. 1770.14 us (0.00 sec); median 3,360,492 cycles, 86x: 289,002,312 cycles +Signing.. avg. 1821100.94 us (1.82 sec); median 3,513,038,959 cycles, 1x: 3,513,038,959 cycles + - FORS signing.. avg. 623212.13 us (0.62 sec); median 1,200,173,916 cycles, 1x: 1,200,173,916 cycles + - WOTS pk gen x (ideal).. avg. 1788.85 us (0.00 sec); median 3,361,961 cycles, 682x: 2,292,857,402 cycles + - WOTS pk gen x (real).. avg. 1817.03 us (0.00 sec); median 3,361,456 cycles, 688x: 2,312,681,728 cycles +Verifying.. avg. 
2941.78 us (0.00 sec); median 5,503,072 cycles, 1x: 5,503,072 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256s-robust_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256s-robust_x4 new file mode 100644 index 0000000..4583ac6 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256s-robust_x4 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16, way=4, tree height=8, wots_len=67 +Running 10 iterations. +thash avg. 0.89 us (0.00 sec); median 1,701 cycles, 1x: 1,701 cycles +f1600x avg. 0.83 us (0.00 sec); median 1,523 cycles, 1x: 1,523 cycles +thashx avg. 1.84 us (0.00 sec); median 3,237 cycles, 1x: 3,237 cycles +Generating keypair.. avg. 123869.18 us (0.12 sec); median 220,380,054 cycles, 1x: 220,380,054 cycles + - WOTS pk gen x (ideal).. avg. 1925.52 us (0.00 sec); median 3,433,668 cycles, 64x: 219,754,752 cycles + - WOTS pk gen x (real).. avg. 1964.49 us (0.00 sec); median 3,430,826 cycles, 64x: 219,572,864 cycles +Signing.. avg. 1407647.66 us (1.41 sec); median 2,508,852,801 cycles, 1x: 2,508,852,801 cycles + - FORS signing.. avg. 419985.36 us (0.42 sec); median 746,206,866 cycles, 1x: 746,206,866 cycles + - WOTS pk gen x (ideal).. avg. 1979.44 us (0.00 sec); median 3,434,940 cycles, 512x: 1,758,689,280 cycles + - WOTS pk gen x (real).. avg. 1959.96 us (0.00 sec); median 3,432,331 cycles, 512x: 1,757,353,472 cycles +Verifying.. avg. 
2452.55 us (0.00 sec); median 4,475,337 cycles, 1x: 4,475,337 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256s-robust_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256s-robust_x5 new file mode 100644 index 0000000..62e3eef --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256s-robust_x5 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16, way=5, tree height=8, wots_len=67 +Running 10 iterations. +thash avg. 0.84 us (0.00 sec); median 1,702 cycles, 1x: 1,702 cycles +f1600x avg. 1.15 us (0.00 sec); median 2,167 cycles, 1x: 2,167 cycles +thashx avg. 2.48 us (0.00 sec); median 4,543 cycles, 1x: 4,543 cycles +Generating keypair.. avg. 137201.85 us (0.14 sec); median 251,124,409 cycles, 1x: 251,124,409 cycles + - WOTS pk gen x (ideal).. avg. 2701.56 us (0.00 sec); median 4,814,011 cycles, 51x: 245,514,561 cycles + - WOTS pk gen x (real).. avg. 2697.71 us (0.00 sec); median 4,810,432 cycles, 52x: 250,142,464 cycles +Signing.. avg. 1712457.97 us (1.71 sec); median 3,128,532,450 cycles, 1x: 3,128,532,450 cycles + - FORS signing.. avg. 598683.83 us (0.60 sec); median 1,120,482,334 cycles, 1x: 1,120,482,334 cycles + - WOTS pk gen x (ideal).. avg. 2714.67 us (0.00 sec); median 4,809,599 cycles, 409x: 1,967,125,991 cycles + - WOTS pk gen x (real).. avg. 2741.15 us (0.00 sec); median 4,809,955 cycles, 416x: 2,000,941,280 cycles +Verifying.. avg. 
2744.05 us (0.00 sec); median 4,964,777 cycles, 1x: 4,964,777 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256s-simple_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256s-simple_x3 new file mode 100644 index 0000000..6d84027 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256s-simple_x3 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16, way=3, tree height=8, wots_len=67 +Running 10 iterations. +thash avg. 0.42 us (0.00 sec); median 866 cycles, 1x: 866 cycles +f1600x avg. 0.75 us (0.00 sec); median 1,509 cycles, 1x: 1,509 cycles +thashx avg. 0.83 us (0.00 sec); median 1,604 cycles, 1x: 1,604 cycles +Generating keypair.. avg. 77394.53 us (0.08 sec); median 151,140,858 cycles, 1x: 151,140,858 cycles + - WOTS pk gen x (ideal).. avg. 908.81 us (0.00 sec); median 1,752,748 cycles, 85x: 148,983,580 cycles + - WOTS pk gen x (real).. avg. 892.69 us (0.00 sec); median 1,752,709 cycles, 86x: 150,732,974 cycles +Signing.. avg. 967779.71 us (0.97 sec); median 1,911,311,253 cycles, 1x: 1,911,311,253 cycles + - FORS signing.. avg. 346789.10 us (0.35 sec); median 701,937,501 cycles, 1x: 701,937,501 cycles + - WOTS pk gen x (ideal).. avg. 910.81 us (0.00 sec); median 1,753,405 cycles, 682x: 1,195,822,210 cycles + - WOTS pk gen x (real).. avg. 915.77 us (0.00 sec); median 1,753,211 cycles, 688x: 1,206,209,168 cycles +Verifying.. avg. 
1400.80 us (0.00 sec); median 2,787,881 cycles, 1x: 2,787,881 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256s-simple_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256s-simple_x4 new file mode 100644 index 0000000..8a6b150 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256s-simple_x4 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16, way=4, tree height=8, wots_len=67 +Running 10 iterations. +thash avg. 0.43 us (0.00 sec); median 870 cycles, 1x: 870 cycles +f1600x avg. 0.80 us (0.00 sec); median 1,523 cycles, 1x: 1,523 cycles +thashx avg. 0.90 us (0.00 sec); median 1,640 cycles, 1x: 1,640 cycles +Generating keypair.. avg. 62803.38 us (0.06 sec); median 114,737,934 cycles, 1x: 114,737,934 cycles + - WOTS pk gen x (ideal).. avg. 989.64 us (0.00 sec); median 1,791,070 cycles, 64x: 114,628,480 cycles + - WOTS pk gen x (real).. avg. 966.02 us (0.00 sec); median 1,789,985 cycles, 64x: 114,559,040 cycles +Signing.. avg. 742069.30 us (0.74 sec); median 1,370,515,925 cycles, 1x: 1,370,515,925 cycles + - FORS signing.. avg. 244179.06 us (0.24 sec); median 452,591,203 cycles, 1x: 452,591,203 cycles + - WOTS pk gen x (ideal).. avg. 974.10 us (0.00 sec); median 1,788,687 cycles, 512x: 915,807,744 cycles + - WOTS pk gen x (real).. avg. 987.71 us (0.00 sec); median 1,789,616 cycles, 512x: 916,283,392 cycles +Verifying.. avg. 
1182.53 us (0.00 sec); median 2,275,569 cycles, 1x: 2,275,569 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256s-simple_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256s-simple_x5 new file mode 100644 index 0000000..89242b4 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X1/sphincs-shake-256s-simple_x5 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16, way=5, tree height=8, wots_len=67 +Running 10 iterations. +thash avg. 0.41 us (0.00 sec); median 869 cycles, 1x: 869 cycles +f1600x avg. 1.10 us (0.00 sec); median 2,165 cycles, 1x: 2,165 cycles +thashx avg. 1.19 us (0.00 sec); median 2,294 cycles, 1x: 2,294 cycles +Generating keypair.. avg. 68588.54 us (0.07 sec); median 130,713,658 cycles, 1x: 130,713,658 cycles + - WOTS pk gen x (ideal).. avg. 1319.05 us (0.00 sec); median 2,508,329 cycles, 51x: 127,924,779 cycles + - WOTS pk gen x (real).. avg. 1325.05 us (0.00 sec); median 2,509,008 cycles, 52x: 130,468,416 cycles +Signing.. avg. 874003.40 us (0.87 sec); median 1,694,069,097 cycles, 1x: 1,694,069,097 cycles + - FORS signing.. avg. 325981.98 us (0.33 sec); median 648,137,520 cycles, 1x: 648,137,520 cycles + - WOTS pk gen x (ideal).. avg. 1349.49 us (0.00 sec); median 2,509,978 cycles, 409x: 1,026,581,002 cycles + - WOTS pk gen x (real).. avg. 1349.60 us (0.00 sec); median 2,509,877 cycles, 416x: 1,044,108,832 cycles +Verifying.. avg. 
1265.22 us (0.00 sec); median 2,391,345 cycles, 1x: 2,391,345 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128f-robust_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128f-robust_x3 new file mode 100644 index 0000000..36524ae --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128f-robust_x3 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16, way=3, tree height=3, wots_len=35 +Running 10 iterations. +thash avg. 0.88 us (0.00 sec); median 1,675 cycles, 1x: 1,675 cycles +f1600x avg. 0.52 us (0.00 sec); median 984 cycles, 1x: 984 cycles +thashx avg. 1.08 us (0.00 sec); median 2,055 cycles, 1x: 2,055 cycles +Generating keypair.. avg. 1779.30 us (0.00 sec); median 3,398,983 cycles, 1x: 3,398,983 cycles + - WOTS pk gen x (ideal).. avg. 592.03 us (0.00 sec); median 1,128,465 cycles, 2x: 2,256,930 cycles + - WOTS pk gen x (real).. avg. 592.23 us (0.00 sec); median 1,128,352 cycles, 3x: 3,385,056 cycles +Signing.. avg. 42074.06 us (0.04 sec); median 80,635,390 cycles, 1x: 80,635,390 cycles + - FORS signing.. avg. 3052.81 us (0.00 sec); median 5,839,916 cycles, 1x: 5,839,916 cycles + - WOTS pk gen x (ideal).. avg. 593.11 us (0.00 sec); median 1,128,780 cycles, 58x: 65,469,240 cycles + - WOTS pk gen x (real).. avg. 592.97 us (0.00 sec); median 1,128,478 cycles, 66x: 74,479,548 cycles +Verifying.. avg. 
2656.75 us (0.00 sec); median 5,088,740 cycles, 1x: 5,088,740 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128f-robust_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128f-robust_x4 new file mode 100644 index 0000000..3417569 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128f-robust_x4 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16, way=4, tree height=3, wots_len=35 +Running 10 iterations. +thash avg. 0.88 us (0.00 sec); median 1,672 cycles, 1x: 1,672 cycles +f1600x avg. 0.75 us (0.00 sec); median 1,437 cycles, 1x: 1,437 cycles +thashx avg. 1.57 us (0.00 sec); median 3,003 cycles, 1x: 3,003 cycles +Generating keypair.. avg. 1754.54 us (0.00 sec); median 3,315,159 cycles, 1x: 3,315,159 cycles + - WOTS pk gen x (ideal).. avg. 872.03 us (0.00 sec); median 1,654,816 cycles, 2x: 3,309,632 cycles + - WOTS pk gen x (real).. avg. 869.51 us (0.00 sec); median 1,653,950 cycles, 2x: 3,307,900 cycles +Signing.. avg. 40183.44 us (0.04 sec); median 77,038,220 cycles, 1x: 77,038,220 cycles + - FORS signing.. avg. 2166.57 us (0.00 sec); median 4,148,653 cycles, 1x: 4,148,653 cycles + - WOTS pk gen x (ideal).. avg. 865.67 us (0.00 sec); median 1,652,764 cycles, 44x: 72,721,616 cycles + - WOTS pk gen x (real).. avg. 865.86 us (0.00 sec); median 1,652,720 cycles, 44x: 72,719,680 cycles +Verifying.. avg. 
2893.79 us (0.00 sec); median 5,543,875 cycles, 1x: 5,543,875 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128f-robust_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128f-robust_x5 new file mode 100644 index 0000000..249b848 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128f-robust_x5 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16, way=5, tree height=3, wots_len=35 +Running 10 iterations. +thash avg. 0.88 us (0.00 sec); median 1,675 cycles, 1x: 1,675 cycles +f1600x avg. 1.13 us (0.00 sec); median 2,155 cycles, 1x: 2,155 cycles +thashx avg. 2.35 us (0.00 sec); median 4,492 cycles, 1x: 4,492 cycles +Generating keypair.. avg. 2588.15 us (0.00 sec); median 4,941,365 cycles, 1x: 4,941,365 cycles + - WOTS pk gen x (ideal).. avg. 1289.89 us (0.00 sec); median 2,464,836 cycles, 1x: 2,464,836 cycles + - WOTS pk gen x (real).. avg. 1292.29 us (0.00 sec); median 2,464,727 cycles, 2x: 4,929,454 cycles +Signing.. avg. 60064.27 us (0.06 sec); median 115,160,758 cycles, 1x: 115,160,758 cycles + - FORS signing.. avg. 3392.93 us (0.00 sec); median 6,489,405 cycles, 1x: 6,489,405 cycles + - WOTS pk gen x (ideal).. avg. 1288.90 us (0.00 sec); median 2,463,162 cycles, 35x: 86,210,670 cycles + - WOTS pk gen x (real).. avg. 1288.46 us (0.00 sec); median 2,463,877 cycles, 44x: 108,410,588 cycles +Verifying.. avg. 
3493.56 us (0.00 sec); median 6,694,077 cycles, 1x: 6,694,077 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128f-simple_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128f-simple_x3 new file mode 100644 index 0000000..bbf718f --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128f-simple_x3 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16, way=3, tree height=3, wots_len=35 +Running 10 iterations. +thash avg. 0.45 us (0.00 sec); median 847 cycles, 1x: 847 cycles +f1600x avg. 0.52 us (0.00 sec); median 985 cycles, 1x: 985 cycles +thashx avg. 0.54 us (0.00 sec); median 1,028 cycles, 1x: 1,028 cycles +Generating keypair.. avg. 924.08 us (0.00 sec); median 1,759,070 cycles, 1x: 1,759,070 cycles + - WOTS pk gen x (ideal).. avg. 307.80 us (0.00 sec); median 582,403 cycles, 2x: 1,164,806 cycles + - WOTS pk gen x (real).. avg. 307.66 us (0.00 sec); median 582,507 cycles, 3x: 1,747,521 cycles +Signing.. avg. 21909.20 us (0.02 sec); median 42,009,689 cycles, 1x: 42,009,689 cycles + - FORS signing.. avg. 1747.01 us (0.00 sec); median 3,343,758 cycles, 1x: 3,343,758 cycles + - WOTS pk gen x (ideal).. avg. 308.44 us (0.00 sec); median 583,295 cycles, 58x: 33,831,110 cycles + - WOTS pk gen x (real).. avg. 308.76 us (0.00 sec); median 583,479 cycles, 66x: 38,509,614 cycles +Verifying.. avg. 
1318.98 us (0.00 sec); median 2,520,553 cycles, 1x: 2,520,553 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128f-simple_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128f-simple_x4 new file mode 100644 index 0000000..92e1246 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128f-simple_x4 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16, way=4, tree height=3, wots_len=35 +Running 10 iterations. +thash avg. 0.45 us (0.00 sec); median 846 cycles, 1x: 846 cycles +f1600x avg. 0.76 us (0.00 sec); median 1,438 cycles, 1x: 1,438 cycles +thashx avg. 0.80 us (0.00 sec); median 1,508 cycles, 1x: 1,508 cycles +Generating keypair.. avg. 903.01 us (0.00 sec); median 1,714,176 cycles, 1x: 1,714,176 cycles + - WOTS pk gen x (ideal).. avg. 452.39 us (0.00 sec); median 855,167 cycles, 2x: 1,710,334 cycles + - WOTS pk gen x (real).. avg. 450.70 us (0.00 sec); median 855,180 cycles, 2x: 1,710,360 cycles +Signing.. avg. 21052.14 us (0.02 sec); median 40,247,558 cycles, 1x: 40,247,558 cycles + - FORS signing.. avg. 1307.13 us (0.00 sec); median 2,497,266 cycles, 1x: 2,497,266 cycles + - WOTS pk gen x (ideal).. avg. 453.23 us (0.00 sec); median 855,205 cycles, 44x: 37,629,020 cycles + - WOTS pk gen x (real).. avg. 450.38 us (0.00 sec); median 855,151 cycles, 44x: 37,626,644 cycles +Verifying.. avg. 
1461.02 us (0.00 sec); median 2,792,282 cycles, 1x: 2,792,282 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128f-simple_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128f-simple_x5 new file mode 100644 index 0000000..5d500d0 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128f-simple_x5 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 66, d = 22, b = 6, k = 33, w = 16, way=5, tree height=3, wots_len=35 +Running 10 iterations. +thash avg. 0.45 us (0.00 sec); median 848 cycles, 1x: 848 cycles +f1600x avg. 1.13 us (0.00 sec); median 2,154 cycles, 1x: 2,154 cycles +thashx avg. 1.18 us (0.00 sec); median 2,253 cycles, 1x: 2,253 cycles +Generating keypair.. avg. 1344.77 us (0.00 sec); median 2,560,644 cycles, 1x: 2,560,644 cycles + - WOTS pk gen x (ideal).. avg. 670.44 us (0.00 sec); median 1,276,756 cycles, 1x: 1,276,756 cycles + - WOTS pk gen x (real).. avg. 669.64 us (0.00 sec); median 1,276,894 cycles, 2x: 2,553,788 cycles +Signing.. avg. 31390.29 us (0.03 sec); median 60,102,393 cycles, 1x: 60,102,393 cycles + - FORS signing.. avg. 1976.29 us (0.00 sec); median 3,779,408 cycles, 1x: 3,779,408 cycles + - WOTS pk gen x (ideal).. avg. 669.91 us (0.00 sec); median 1,277,281 cycles, 35x: 44,704,835 cycles + - WOTS pk gen x (real).. avg. 671.39 us (0.00 sec); median 1,276,529 cycles, 44x: 56,167,276 cycles +Verifying.. avg. 
1775.91 us (0.00 sec); median 3,396,253 cycles, 1x: 3,396,253 cycles +Signature size: 17088 (16.69 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128s-robust_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128s-robust_x3 new file mode 100644 index 0000000..37cb5ba --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128s-robust_x3 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16, way=3, tree height=9, wots_len=35 +Running 10 iterations. +thash avg. 0.88 us (0.00 sec); median 1,673 cycles, 1x: 1,673 cycles +f1600x avg. 0.52 us (0.00 sec); median 989 cycles, 1x: 989 cycles +thashx avg. 1.08 us (0.00 sec); median 2,064 cycles, 1x: 2,064 cycles +Generating keypair.. avg. 101321.41 us (0.10 sec); median 194,294,564 cycles, 1x: 194,294,564 cycles + - WOTS pk gen x (ideal).. avg. 595.96 us (0.00 sec); median 1,131,540 cycles, 170x: 192,361,800 cycles + - WOTS pk gen x (real).. avg. 594.86 us (0.00 sec); median 1,133,251 cycles, 171x: 193,785,921 cycles +Signing.. avg. 792232.34 us (0.79 sec); median 1,517,988,183 cycles, 1x: 1,517,988,183 cycles + - FORS signing.. avg. 82232.82 us (0.08 sec); median 157,601,070 cycles, 1x: 157,601,070 cycles + - WOTS pk gen x (ideal).. avg. 596.95 us (0.00 sec); median 1,133,303 cycles, 1194x: 1,353,163,782 cycles + - WOTS pk gen x (real).. avg. 594.23 us (0.00 sec); median 1,133,581 cycles, 1197x: 1,356,896,457 cycles +Verifying.. avg. 
969.25 us (0.00 sec); median 1,848,971 cycles, 1x: 1,848,971 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128s-robust_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128s-robust_x4 new file mode 100644 index 0000000..d1c2bb2 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128s-robust_x4 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16, way=4, tree height=9, wots_len=35 +Running 10 iterations. +thash avg. 0.88 us (0.00 sec); median 1,671 cycles, 1x: 1,671 cycles +f1600x avg. 0.76 us (0.00 sec); median 1,438 cycles, 1x: 1,438 cycles +thashx avg. 1.57 us (0.00 sec); median 3,000 cycles, 1x: 3,000 cycles +Generating keypair.. avg. 110548.49 us (0.11 sec); median 211,921,759 cycles, 1x: 211,921,759 cycles + - WOTS pk gen x (ideal).. avg. 866.61 us (0.00 sec); median 1,653,656 cycles, 128x: 211,667,968 cycles + - WOTS pk gen x (real).. avg. 867.11 us (0.00 sec); median 1,653,338 cycles, 128x: 211,627,264 cycles +Signing.. avg. 830440.87 us (0.83 sec); median 1,592,340,100 cycles, 1x: 1,592,340,100 cycles + - FORS signing.. avg. 56760.55 us (0.06 sec); median 108,870,204 cycles, 1x: 108,870,204 cycles + - WOTS pk gen x (ideal).. avg. 869.64 us (0.00 sec); median 1,652,739 cycles, 896x: 1,480,854,144 cycles + - WOTS pk gen x (real).. avg. 866.29 us (0.00 sec); median 1,653,029 cycles, 896x: 1,481,113,984 cycles +Verifying.. avg. 
1020.43 us (0.00 sec); median 1,943,731 cycles, 1x: 1,943,731 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128s-robust_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128s-robust_x5 new file mode 100644 index 0000000..ed9ed92 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128s-robust_x5 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16, way=5, tree height=9, wots_len=35 +Running 10 iterations. +thash avg. 0.88 us (0.00 sec); median 1,673 cycles, 1x: 1,673 cycles +f1600x avg. 1.13 us (0.00 sec); median 2,156 cycles, 1x: 2,156 cycles +thashx avg. 2.35 us (0.00 sec); median 4,466 cycles, 1x: 4,466 cycles +Generating keypair.. avg. 132176.50 us (0.13 sec); median 253,456,557 cycles, 1x: 253,456,557 cycles + - WOTS pk gen x (ideal).. avg. 1282.99 us (0.00 sec); median 2,450,732 cycles, 102x: 249,974,664 cycles + - WOTS pk gen x (real).. avg. 1282.41 us (0.00 sec); median 2,450,637 cycles, 103x: 252,415,611 cycles +Signing.. avg. 1017129.23 us (1.02 sec); median 1,950,193,841 cycles, 1x: 1,950,193,841 cycles + - FORS signing.. avg. 91661.17 us (0.09 sec); median 175,774,395 cycles, 1x: 175,774,395 cycles + - WOTS pk gen x (ideal).. avg. 1282.46 us (0.00 sec); median 2,451,754 cycles, 716x: 1,755,455,864 cycles + - WOTS pk gen x (real).. avg. 1284.38 us (0.00 sec); median 2,452,606 cycles, 721x: 1,768,328,926 cycles +Verifying.. avg. 
1243.79 us (0.00 sec); median 2,377,465 cycles, 1x: 2,377,465 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128s-simple_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128s-simple_x3 new file mode 100644 index 0000000..82b0ad3 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128s-simple_x3 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16, way=3, tree height=9, wots_len=35 +Running 10 iterations. +thash avg. 0.45 us (0.00 sec); median 846 cycles, 1x: 846 cycles +f1600x avg. 0.52 us (0.00 sec); median 983 cycles, 1x: 983 cycles +thashx avg. 0.54 us (0.00 sec); median 1,030 cycles, 1x: 1,030 cycles +Generating keypair.. avg. 52331.08 us (0.05 sec); median 100,312,937 cycles, 1x: 100,312,937 cycles + - WOTS pk gen x (ideal).. avg. 309.65 us (0.00 sec); median 583,939 cycles, 170x: 99,269,630 cycles + - WOTS pk gen x (real).. avg. 310.25 us (0.00 sec); median 583,984 cycles, 171x: 99,861,264 cycles +Signing.. avg. 412648.37 us (0.41 sec); median 791,312,314 cycles, 1x: 791,312,314 cycles + - FORS signing.. avg. 46428.74 us (0.05 sec); median 89,060,495 cycles, 1x: 89,060,495 cycles + - WOTS pk gen x (ideal).. avg. 309.93 us (0.00 sec); median 584,387 cycles, 1194x: 697,758,078 cycles + - WOTS pk gen x (real).. avg. 308.95 us (0.00 sec); median 584,539 cycles, 1197x: 699,693,183 cycles +Verifying.. avg. 
499.12 us (0.00 sec); median 948,304 cycles, 1x: 948,304 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128s-simple_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128s-simple_x4 new file mode 100644 index 0000000..e2914a8 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128s-simple_x4 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16, way=4, tree height=9, wots_len=35 +Running 10 iterations. +thash avg. 0.45 us (0.00 sec); median 845 cycles, 1x: 845 cycles +f1600x avg. 0.75 us (0.00 sec); median 1,437 cycles, 1x: 1,437 cycles +thashx avg. 0.79 us (0.00 sec); median 1,503 cycles, 1x: 1,503 cycles +Generating keypair.. avg. 57193.07 us (0.06 sec); median 109,627,879 cycles, 1x: 109,627,879 cycles + - WOTS pk gen x (ideal).. avg. 450.04 us (0.00 sec); median 854,706 cycles, 128x: 109,402,368 cycles + - WOTS pk gen x (real).. avg. 450.25 us (0.00 sec); median 854,937 cycles, 128x: 109,431,936 cycles +Signing.. avg. 434513.59 us (0.43 sec); median 833,147,526 cycles, 1x: 833,147,526 cycles + - FORS signing.. avg. 34333.17 us (0.03 sec); median 65,682,691 cycles, 1x: 65,682,691 cycles + - WOTS pk gen x (ideal).. avg. 451.97 us (0.00 sec); median 855,190 cycles, 896x: 766,250,240 cycles + - WOTS pk gen x (real).. avg. 449.60 us (0.00 sec); median 854,547 cycles, 896x: 765,674,112 cycles +Verifying.. avg. 
533.92 us (0.00 sec); median 984,707 cycles, 1x: 984,707 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128s-simple_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128s-simple_x5 new file mode 100644 index 0000000..768dbd7 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-128s-simple_x5 @@ -0,0 +1,16 @@ +Parameters: n = 16, h = 63, d = 7, b = 12, k = 14, w = 16, way=5, tree height=9, wots_len=35 +Running 10 iterations. +thash avg. 0.46 us (0.00 sec); median 847 cycles, 1x: 847 cycles +f1600x avg. 1.14 us (0.00 sec); median 2,154 cycles, 1x: 2,154 cycles +thashx avg. 1.19 us (0.00 sec); median 2,255 cycles, 1x: 2,255 cycles +Generating keypair.. avg. 68978.75 us (0.07 sec); median 132,197,536 cycles, 1x: 132,197,536 cycles + - WOTS pk gen x (ideal).. avg. 672.24 us (0.00 sec); median 1,280,185 cycles, 102x: 130,578,870 cycles + - WOTS pk gen x (real).. avg. 672.15 us (0.00 sec); median 1,280,306 cycles, 103x: 131,871,518 cycles +Signing.. avg. 535663.95 us (0.54 sec); median 1,026,962,333 cycles, 1x: 1,026,962,333 cycles + - FORS signing.. avg. 53075.56 us (0.05 sec); median 101,767,086 cycles, 1x: 101,767,086 cycles + - WOTS pk gen x (ideal).. avg. 672.85 us (0.00 sec); median 1,280,249 cycles, 716x: 916,658,284 cycles + - WOTS pk gen x (real).. avg. 679.87 us (0.00 sec); median 1,280,261 cycles, 721x: 923,068,181 cycles +Verifying.. avg. 
618.05 us (0.00 sec); median 1,175,468 cycles, 1x: 1,175,468 cycles +Signature size: 7856 (7.67 KiB) +Public key size: 32 (0.03 KiB) +Secret key size: 64 (0.06 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192f-robust_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192f-robust_x3 new file mode 100644 index 0000000..6a17c8d --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192f-robust_x3 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16, way=3, tree height=3, wots_len=51 +Running 10 iterations. +thash avg. 0.89 us (0.00 sec); median 1,689 cycles, 1x: 1,689 cycles +f1600x avg. 0.52 us (0.00 sec); median 985 cycles, 1x: 985 cycles +thashx avg. 1.09 us (0.00 sec); median 2,080 cycles, 1x: 2,080 cycles +Generating keypair.. avg. 2623.78 us (0.00 sec); median 5,013,717 cycles, 1x: 5,013,717 cycles + - WOTS pk gen x (ideal).. avg. 874.73 us (0.00 sec); median 1,668,699 cycles, 2x: 3,337,398 cycles + - WOTS pk gen x (real).. avg. 874.83 us (0.00 sec); median 1,669,044 cycles, 3x: 5,007,132 cycles +Signing.. avg. 69845.40 us (0.07 sec); median 133,939,624 cycles, 1x: 133,939,624 cycles + - FORS signing.. avg. 12311.85 us (0.01 sec); median 23,605,606 cycles, 1x: 23,605,606 cycles + - WOTS pk gen x (ideal).. avg. 874.98 us (0.00 sec); median 1,669,176 cycles, 58x: 96,812,208 cycles + - WOTS pk gen x (real).. avg. 877.71 us (0.00 sec); median 1,669,432 cycles, 66x: 110,182,512 cycles +Verifying.. avg. 
3793.03 us (0.00 sec); median 7,234,543 cycles, 1x: 7,234,543 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192f-robust_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192f-robust_x4 new file mode 100644 index 0000000..9f3d369 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192f-robust_x4 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16, way=4, tree height=3, wots_len=51 +Running 10 iterations. +thash avg. 0.90 us (0.00 sec); median 1,691 cycles, 1x: 1,691 cycles +f1600x avg. 0.76 us (0.00 sec); median 1,438 cycles, 1x: 1,438 cycles +thashx avg. 1.58 us (0.00 sec); median 3,029 cycles, 1x: 3,029 cycles +Generating keypair.. avg. 2552.77 us (0.00 sec); median 4,882,889 cycles, 1x: 4,882,889 cycles + - WOTS pk gen x (ideal).. avg. 1271.78 us (0.00 sec); median 2,432,340 cycles, 2x: 4,864,680 cycles + - WOTS pk gen x (real).. avg. 1271.93 us (0.00 sec); median 2,432,305 cycles, 2x: 4,864,610 cycles +Signing.. avg. 64551.70 us (0.06 sec); median 123,821,768 cycles, 1x: 123,821,768 cycles + - FORS signing.. avg. 8547.95 us (0.01 sec); median 16,368,714 cycles, 1x: 16,368,714 cycles + - WOTS pk gen x (ideal).. avg. 1273.48 us (0.00 sec); median 2,432,478 cycles, 44x: 107,029,032 cycles + - WOTS pk gen x (real).. avg. 1272.88 us (0.00 sec); median 2,432,435 cycles, 44x: 107,027,140 cycles +Verifying.. avg. 
4142.30 us (0.00 sec); median 7,925,593 cycles, 1x: 7,925,593 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192f-robust_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192f-robust_x5 new file mode 100644 index 0000000..8dc0a87 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192f-robust_x5 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16, way=5, tree height=3, wots_len=51 +Running 10 iterations. +thash avg. 0.89 us (0.00 sec); median 1,696 cycles, 1x: 1,696 cycles +f1600x avg. 1.13 us (0.00 sec); median 2,156 cycles, 1x: 2,156 cycles +thashx avg. 2.35 us (0.00 sec); median 4,488 cycles, 1x: 4,488 cycles +Generating keypair.. avg. 3788.63 us (0.00 sec); median 7,225,445 cycles, 1x: 7,225,445 cycles + - WOTS pk gen x (ideal).. avg. 1893.60 us (0.00 sec); median 3,604,471 cycles, 1x: 3,604,471 cycles + - WOTS pk gen x (real).. avg. 1895.34 us (0.00 sec); median 3,607,189 cycles, 2x: 7,214,378 cycles +Signing.. avg. 96653.11 us (0.10 sec); median 185,203,170 cycles, 1x: 185,203,170 cycles + - FORS signing.. avg. 13714.89 us (0.01 sec); median 26,287,210 cycles, 1x: 26,287,210 cycles + - WOTS pk gen x (ideal).. avg. 1884.59 us (0.00 sec); median 3,603,994 cycles, 35x: 126,139,790 cycles + - WOTS pk gen x (real).. avg. 1882.87 us (0.00 sec); median 3,603,763 cycles, 44x: 158,565,572 cycles +Verifying.. avg. 
4919.22 us (0.00 sec); median 9,413,210 cycles, 1x: 9,413,210 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192f-simple_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192f-simple_x3 new file mode 100644 index 0000000..8eaaa91 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192f-simple_x3 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16, way=3, tree height=3, wots_len=51 +Running 10 iterations. +thash avg. 0.45 us (0.00 sec); median 850 cycles, 1x: 850 cycles +f1600x avg. 0.53 us (0.00 sec); median 983 cycles, 1x: 983 cycles +thashx avg. 0.55 us (0.00 sec); median 1,053 cycles, 1x: 1,053 cycles +Generating keypair.. avg. 1377.05 us (0.00 sec); median 2,624,166 cycles, 1x: 2,624,166 cycles + - WOTS pk gen x (ideal).. avg. 461.27 us (0.00 sec); median 871,813 cycles, 2x: 1,743,626 cycles + - WOTS pk gen x (real).. avg. 460.32 us (0.00 sec); median 871,615 cycles, 3x: 2,614,845 cycles +Signing.. avg. 37094.32 us (0.04 sec); median 71,141,736 cycles, 1x: 71,141,736 cycles + - FORS signing.. avg. 6991.84 us (0.01 sec); median 13,411,355 cycles, 1x: 13,411,355 cycles + - WOTS pk gen x (ideal).. avg. 459.43 us (0.00 sec); median 871,552 cycles, 58x: 50,550,016 cycles + - WOTS pk gen x (real).. avg. 459.92 us (0.00 sec); median 871,792 cycles, 66x: 57,538,272 cycles +Verifying.. avg. 
1943.55 us (0.00 sec); median 3,716,850 cycles, 1x: 3,716,850 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192f-simple_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192f-simple_x4 new file mode 100644 index 0000000..2785ab4 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192f-simple_x4 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16, way=4, tree height=3, wots_len=51 +Running 10 iterations. +thash avg. 0.45 us (0.00 sec); median 853 cycles, 1x: 853 cycles +f1600x avg. 0.77 us (0.00 sec); median 1,445 cycles, 1x: 1,445 cycles +thashx avg. 0.80 us (0.00 sec); median 1,522 cycles, 1x: 1,522 cycles +Generating keypair.. avg. 1330.80 us (0.00 sec); median 2,530,668 cycles, 1x: 2,530,668 cycles + - WOTS pk gen x (ideal).. avg. 664.52 us (0.00 sec); median 1,263,335 cycles, 2x: 2,526,670 cycles + - WOTS pk gen x (real).. avg. 662.78 us (0.00 sec); median 1,263,383 cycles, 2x: 2,526,766 cycles +Signing.. avg. 34318.22 us (0.03 sec); median 65,772,684 cycles, 1x: 65,772,684 cycles + - FORS signing.. avg. 5188.19 us (0.01 sec); median 9,933,224 cycles, 1x: 9,933,224 cycles + - WOTS pk gen x (ideal).. avg. 663.70 us (0.00 sec); median 1,263,602 cycles, 44x: 55,598,488 cycles + - WOTS pk gen x (real).. avg. 662.85 us (0.00 sec); median 1,263,190 cycles, 44x: 55,580,360 cycles +Verifying.. avg. 
2108.68 us (0.00 sec); median 4,028,172 cycles, 1x: 4,028,172 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192f-simple_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192f-simple_x5 new file mode 100644 index 0000000..b6829d0 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192f-simple_x5 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 66, d = 22, b = 8, k = 33, w = 16, way=5, tree height=3, wots_len=51 +Running 10 iterations. +thash avg. 0.45 us (0.00 sec); median 852 cycles, 1x: 852 cycles +f1600x avg. 1.13 us (0.00 sec); median 2,158 cycles, 1x: 2,158 cycles +thashx avg. 1.21 us (0.00 sec); median 2,282 cycles, 1x: 2,282 cycles +Generating keypair.. avg. 1986.73 us (0.00 sec); median 3,791,237 cycles, 1x: 3,791,237 cycles + - WOTS pk gen x (ideal).. avg. 994.36 us (0.00 sec); median 1,892,516 cycles, 1x: 1,892,516 cycles + - WOTS pk gen x (real).. avg. 990.96 us (0.00 sec); median 1,892,132 cycles, 2x: 3,784,264 cycles +Signing.. avg. 51452.52 us (0.05 sec); median 98,641,433 cycles, 1x: 98,641,433 cycles + - FORS signing.. avg. 7965.49 us (0.01 sec); median 15,254,532 cycles, 1x: 15,254,532 cycles + - WOTS pk gen x (ideal).. avg. 995.16 us (0.00 sec); median 1,892,209 cycles, 35x: 66,227,315 cycles + - WOTS pk gen x (real).. avg. 995.02 us (0.00 sec); median 1,892,779 cycles, 44x: 83,282,276 cycles +Verifying.. avg. 
2568.21 us (0.00 sec); median 4,894,034 cycles, 1x: 4,894,034 cycles +Signature size: 35664 (34.83 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192s-robust_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192s-robust_x3 new file mode 100644 index 0000000..8c75264 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192s-robust_x3 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16, way=3, tree height=9, wots_len=51 +Running 10 iterations. +thash avg. 0.89 us (0.00 sec); median 1,695 cycles, 1x: 1,695 cycles +f1600x avg. 0.52 us (0.00 sec); median 986 cycles, 1x: 986 cycles +thashx avg. 1.09 us (0.00 sec); median 2,079 cycles, 1x: 2,079 cycles +Generating keypair.. avg. 149301.08 us (0.15 sec); median 286,283,899 cycles, 1x: 286,283,899 cycles + - WOTS pk gen x (ideal).. avg. 878.25 us (0.00 sec); median 1,669,315 cycles, 170x: 283,783,550 cycles + - WOTS pk gen x (real).. avg. 876.04 us (0.00 sec); median 1,669,983 cycles, 171x: 285,567,093 cycles +Signing.. avg. 1450573.78 us (1.45 sec); median 2,781,233,951 cycles, 1x: 2,781,233,951 cycles + - FORS signing.. avg. 404977.70 us (0.40 sec); median 776,707,602 cycles, 1x: 776,707,602 cycles + - WOTS pk gen x (ideal).. avg. 880.32 us (0.00 sec); median 1,670,337 cycles, 1194x: 1,994,382,378 cycles + - WOTS pk gen x (real).. avg. 875.03 us (0.00 sec); median 1,670,536 cycles, 1197x: 1,999,631,592 cycles +Verifying.. avg. 
1350.21 us (0.00 sec); median 2,578,776 cycles, 1x: 2,578,776 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192s-robust_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192s-robust_x4 new file mode 100644 index 0000000..7f09f48 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192s-robust_x4 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16, way=4, tree height=9, wots_len=51 +Running 10 iterations. +thash avg. 0.89 us (0.00 sec); median 1,697 cycles, 1x: 1,697 cycles +f1600x avg. 0.76 us (0.00 sec); median 1,438 cycles, 1x: 1,438 cycles +thashx avg. 1.58 us (0.00 sec); median 3,029 cycles, 1x: 3,029 cycles +Generating keypair.. avg. 162472.02 us (0.16 sec); median 311,561,011 cycles, 1x: 311,561,011 cycles + - WOTS pk gen x (ideal).. avg. 1274.22 us (0.00 sec); median 2,434,901 cycles, 128x: 311,667,328 cycles + - WOTS pk gen x (real).. avg. 1273.44 us (0.00 sec); median 2,434,876 cycles, 128x: 311,664,128 cycles +Signing.. avg. 1416455.92 us (1.42 sec); median 2,715,816,436 cycles, 1x: 2,715,816,436 cycles + - FORS signing.. avg. 278994.84 us (0.28 sec); median 534,395,508 cycles, 1x: 534,395,508 cycles + - WOTS pk gen x (ideal).. avg. 1280.08 us (0.00 sec); median 2,435,312 cycles, 896x: 2,182,039,552 cycles + - WOTS pk gen x (real).. avg. 1273.37 us (0.00 sec); median 2,435,290 cycles, 896x: 2,182,019,840 cycles +Verifying.. avg. 
1496.40 us (0.00 sec); median 2,862,617 cycles, 1x: 2,862,617 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192s-robust_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192s-robust_x5 new file mode 100644 index 0000000..a9e8236 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192s-robust_x5 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16, way=5, tree height=9, wots_len=51 +Running 10 iterations. +thash avg. 0.88 us (0.00 sec); median 1,684 cycles, 1x: 1,684 cycles +f1600x avg. 1.13 us (0.00 sec); median 2,155 cycles, 1x: 2,155 cycles +thashx avg. 2.36 us (0.00 sec); median 4,516 cycles, 1x: 4,516 cycles +Generating keypair.. avg. 194873.34 us (0.19 sec); median 373,711,021 cycles, 1x: 373,711,021 cycles + - WOTS pk gen x (ideal).. avg. 1892.96 us (0.00 sec); median 3,622,433 cycles, 102x: 369,488,166 cycles + - WOTS pk gen x (real).. avg. 1892.26 us (0.00 sec); median 3,622,985 cycles, 103x: 373,167,455 cycles +Signing.. avg. 1814145.45 us (1.81 sec); median 3,478,515,617 cycles, 1x: 3,478,515,617 cycles + - FORS signing.. avg. 449674.02 us (0.45 sec); median 862,204,578 cycles, 1x: 862,204,578 cycles + - WOTS pk gen x (ideal).. avg. 1894.62 us (0.00 sec); median 3,621,814 cycles, 716x: 2,593,218,824 cycles + - WOTS pk gen x (real).. avg. 1891.89 us (0.00 sec); median 3,622,055 cycles, 721x: 2,611,501,655 cycles +Verifying.. avg. 
1761.26 us (0.00 sec); median 3,363,935 cycles, 1x: 3,363,935 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192s-simple_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192s-simple_x3 new file mode 100644 index 0000000..7312537 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192s-simple_x3 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16, way=3, tree height=9, wots_len=51 +Running 10 iterations. +thash avg. 0.45 us (0.00 sec); median 854 cycles, 1x: 854 cycles +f1600x avg. 0.52 us (0.00 sec); median 984 cycles, 1x: 984 cycles +thashx avg. 0.55 us (0.00 sec); median 1,053 cycles, 1x: 1,053 cycles +Generating keypair.. avg. 78089.15 us (0.08 sec); median 149,596,394 cycles, 1x: 149,596,394 cycles + - WOTS pk gen x (ideal).. avg. 459.56 us (0.00 sec); median 872,520 cycles, 170x: 148,328,400 cycles + - WOTS pk gen x (real).. avg. 458.12 us (0.00 sec); median 872,568 cycles, 171x: 149,209,128 cycles +Signing.. avg. 775233.12 us (0.78 sec); median 1,486,750,450 cycles, 1x: 1,486,750,450 cycles + - FORS signing.. avg. 229213.25 us (0.23 sec); median 439,338,379 cycles, 1x: 439,338,379 cycles + - WOTS pk gen x (ideal).. avg. 462.25 us (0.00 sec); median 872,934 cycles, 1194x: 1,042,283,196 cycles + - WOTS pk gen x (real).. avg. 461.35 us (0.00 sec); median 872,611 cycles, 1197x: 1,044,515,367 cycles +Verifying.. avg. 
692.84 us (0.00 sec); median 1,318,539 cycles, 1x: 1,318,539 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192s-simple_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192s-simple_x4 new file mode 100644 index 0000000..c71bf57 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192s-simple_x4 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16, way=4, tree height=9, wots_len=51 +Running 10 iterations. +thash avg. 0.45 us (0.00 sec); median 851 cycles, 1x: 851 cycles +f1600x avg. 0.76 us (0.00 sec); median 1,438 cycles, 1x: 1,438 cycles +thashx avg. 0.80 us (0.00 sec); median 1,521 cycles, 1x: 1,521 cycles +Generating keypair.. avg. 84674.11 us (0.08 sec); median 162,368,198 cycles, 1x: 162,368,198 cycles + - WOTS pk gen x (ideal).. avg. 668.86 us (0.00 sec); median 1,264,566 cycles, 128x: 161,864,448 cycles + - WOTS pk gen x (real).. avg. 666.37 us (0.00 sec); median 1,264,194 cycles, 128x: 161,816,832 cycles +Signing.. avg. 762877.07 us (0.76 sec); median 1,462,553,451 cycles, 1x: 1,462,553,451 cycles + - FORS signing.. avg. 169955.72 us (0.17 sec); median 325,814,837 cycles, 1x: 325,814,837 cycles + - WOTS pk gen x (ideal).. avg. 667.99 us (0.00 sec); median 1,264,965 cycles, 896x: 1,133,408,640 cycles + - WOTS pk gen x (real).. avg. 663.73 us (0.00 sec); median 1,264,517 cycles, 896x: 1,133,007,232 cycles +Verifying.. avg. 
765.99 us (0.00 sec); median 1,455,613 cycles, 1x: 1,455,613 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192s-simple_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192s-simple_x5 new file mode 100644 index 0000000..da8eb18 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-192s-simple_x5 @@ -0,0 +1,16 @@ +Parameters: n = 24, h = 63, d = 7, b = 14, k = 17, w = 16, way=5, tree height=9, wots_len=51 +Running 10 iterations. +thash avg. 0.45 us (0.00 sec); median 850 cycles, 1x: 850 cycles +f1600x avg. 1.13 us (0.00 sec); median 2,156 cycles, 1x: 2,156 cycles +thashx avg. 1.19 us (0.00 sec); median 2,273 cycles, 1x: 2,273 cycles +Generating keypair.. avg. 101558.79 us (0.10 sec); median 194,773,750 cycles, 1x: 194,773,750 cycles + - WOTS pk gen x (ideal).. avg. 988.62 us (0.00 sec); median 1,886,479 cycles, 102x: 192,420,858 cycles + - WOTS pk gen x (real).. avg. 988.57 us (0.00 sec); median 1,885,966 cycles, 103x: 194,254,498 cycles +Signing.. avg. 971708.88 us (0.97 sec); median 1,863,254,010 cycles, 1x: 1,863,254,010 cycles + - FORS signing.. avg. 260693.24 us (0.26 sec); median 499,885,723 cycles, 1x: 499,885,723 cycles + - WOTS pk gen x (ideal).. avg. 997.75 us (0.00 sec); median 1,887,960 cycles, 716x: 1,351,779,360 cycles + - WOTS pk gen x (real).. avg. 990.05 us (0.00 sec); median 1,887,419 cycles, 721x: 1,360,829,099 cycles +Verifying.. avg. 
894.14 us (0.00 sec); median 1,699,761 cycles, 1x: 1,699,761 cycles +Signature size: 16224 (15.84 KiB) +Public key size: 48 (0.05 KiB) +Secret key size: 96 (0.09 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256f-robust_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256f-robust_x3 new file mode 100644 index 0000000..2065698 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256f-robust_x3 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16, way=3, tree height=4, wots_len=67 +Running 10 iterations. +thash avg. 0.88 us (0.00 sec); median 1,680 cycles, 1x: 1,680 cycles +f1600x avg. 0.52 us (0.00 sec); median 985 cycles, 1x: 985 cycles +thashx avg. 1.09 us (0.00 sec); median 2,088 cycles, 1x: 2,088 cycles +Generating keypair.. avg. 6928.54 us (0.01 sec); median 13,276,361 cycles, 1x: 13,276,361 cycles + - WOTS pk gen x (ideal).. avg. 1155.62 us (0.00 sec); median 2,209,491 cycles, 5x: 11,047,455 cycles + - WOTS pk gen x (real).. avg. 1155.13 us (0.00 sec); median 2,208,487 cycles, 6x: 13,250,922 cycles +Signing.. avg. 143700.82 us (0.14 sec); median 275,520,489 cycles, 1x: 275,520,489 cycles + - FORS signing.. avg. 25943.87 us (0.03 sec); median 49,728,034 cycles, 1x: 49,728,034 cycles + - WOTS pk gen x (ideal).. avg. 1156.58 us (0.00 sec); median 2,208,387 cycles, 90x: 198,754,830 cycles + - WOTS pk gen x (real).. avg. 1157.11 us (0.00 sec); median 2,210,209 cycles, 102x: 225,441,318 cycles +Verifying.. avg. 
3784.77 us (0.00 sec); median 7,231,977 cycles, 1x: 7,231,977 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256f-robust_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256f-robust_x4 new file mode 100644 index 0000000..5479eaf --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256f-robust_x4 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16, way=4, tree height=4, wots_len=67 +Running 10 iterations. +thash avg. 0.88 us (0.00 sec); median 1,679 cycles, 1x: 1,679 cycles +f1600x avg. 0.76 us (0.00 sec); median 1,438 cycles, 1x: 1,438 cycles +thashx avg. 1.60 us (0.00 sec); median 3,062 cycles, 1x: 3,062 cycles +Generating keypair.. avg. 6787.53 us (0.01 sec); median 12,990,544 cycles, 1x: 12,990,544 cycles + - WOTS pk gen x (ideal).. avg. 1694.73 us (0.00 sec); median 3,242,721 cycles, 4x: 12,970,884 cycles + - WOTS pk gen x (real).. avg. 1695.02 us (0.00 sec); median 3,242,423 cycles, 4x: 12,969,692 cycles +Signing.. avg. 133473.90 us (0.13 sec); median 255,892,877 cycles, 1x: 255,892,877 cycles + - FORS signing.. avg. 18239.13 us (0.02 sec); median 34,972,285 cycles, 1x: 34,972,285 cycles + - WOTS pk gen x (ideal).. avg. 1695.89 us (0.00 sec); median 3,242,989 cycles, 68x: 220,523,252 cycles + - WOTS pk gen x (real).. avg. 1695.19 us (0.00 sec); median 3,242,482 cycles, 68x: 220,488,776 cycles +Verifying.. avg. 
4193.66 us (0.00 sec); median 8,035,921 cycles, 1x: 8,035,921 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256f-robust_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256f-robust_x5 new file mode 100644 index 0000000..c4410a7 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256f-robust_x5 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16, way=5, tree height=4, wots_len=67 +Running 10 iterations. +thash avg. 0.88 us (0.00 sec); median 1,679 cycles, 1x: 1,679 cycles +f1600x avg. 1.13 us (0.00 sec); median 2,155 cycles, 1x: 2,155 cycles +thashx avg. 2.38 us (0.00 sec); median 4,542 cycles, 1x: 4,542 cycles +Generating keypair.. avg. 10048.11 us (0.01 sec); median 19,215,348 cycles, 1x: 19,215,348 cycles + - WOTS pk gen x (ideal).. avg. 2506.98 us (0.00 sec); median 4,802,033 cycles, 3x: 14,406,099 cycles + - WOTS pk gen x (real).. avg. 2506.69 us (0.00 sec); median 4,801,047 cycles, 4x: 19,204,188 cycles +Signing.. avg. 199224.98 us (0.20 sec); median 381,974,755 cycles, 1x: 381,974,755 cycles + - FORS signing.. avg. 28867.17 us (0.03 sec); median 55,387,431 cycles, 1x: 55,387,431 cycles + - WOTS pk gen x (ideal).. avg. 2506.64 us (0.00 sec); median 4,799,690 cycles, 54x: 259,183,260 cycles + - WOTS pk gen x (real).. avg. 2510.18 us (0.00 sec); median 4,799,322 cycles, 68x: 326,353,896 cycles +Verifying.. avg. 
4802.27 us (0.00 sec); median 9,203,497 cycles, 1x: 9,203,497 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256f-simple_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256f-simple_x3 new file mode 100644 index 0000000..e4df6c0 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256f-simple_x3 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16, way=3, tree height=4, wots_len=67 +Running 10 iterations. +thash avg. 0.46 us (0.00 sec); median 859 cycles, 1x: 859 cycles +f1600x avg. 0.52 us (0.00 sec); median 985 cycles, 1x: 985 cycles +thashx avg. 0.56 us (0.00 sec); median 1,058 cycles, 1x: 1,058 cycles +Generating keypair.. avg. 3647.89 us (0.00 sec); median 6,971,365 cycles, 1x: 6,971,365 cycles + - WOTS pk gen x (ideal).. avg. 609.20 us (0.00 sec); median 1,160,107 cycles, 5x: 5,800,535 cycles + - WOTS pk gen x (real).. avg. 609.59 us (0.00 sec); median 1,160,469 cycles, 6x: 6,962,814 cycles +Signing.. avg. 76571.45 us (0.08 sec); median 146,851,802 cycles, 1x: 146,851,802 cycles + - FORS signing.. avg. 14769.48 us (0.01 sec); median 28,304,111 cycles, 1x: 28,304,111 cycles + - WOTS pk gen x (ideal).. avg. 610.26 us (0.00 sec); median 1,160,177 cycles, 90x: 104,415,930 cycles + - WOTS pk gen x (real).. avg. 610.57 us (0.00 sec); median 1,160,383 cycles, 102x: 118,359,066 cycles +Verifying.. avg. 
1969.36 us (0.00 sec); median 3,767,167 cycles, 1x: 3,767,167 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256f-simple_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256f-simple_x4 new file mode 100644 index 0000000..ff22a94 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256f-simple_x4 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16, way=4, tree height=4, wots_len=67 +Running 10 iterations. +thash avg. 0.45 us (0.00 sec); median 859 cycles, 1x: 859 cycles +f1600x avg. 0.75 us (0.00 sec); median 1,438 cycles, 1x: 1,438 cycles +thashx avg. 0.81 us (0.00 sec); median 1,545 cycles, 1x: 1,545 cycles +Generating keypair.. avg. 3545.56 us (0.00 sec); median 6,783,320 cycles, 1x: 6,783,320 cycles + - WOTS pk gen x (ideal).. avg. 894.64 us (0.00 sec); median 1,695,333 cycles, 4x: 6,781,332 cycles + - WOTS pk gen x (real).. avg. 891.79 us (0.00 sec); median 1,695,165 cycles, 4x: 6,780,660 cycles +Signing.. avg. 71257.47 us (0.07 sec); median 136,637,253 cycles, 1x: 136,637,253 cycles + - FORS signing.. avg. 11259.51 us (0.01 sec); median 21,414,688 cycles, 1x: 21,414,688 cycles + - WOTS pk gen x (ideal).. avg. 889.80 us (0.00 sec); median 1,694,489 cycles, 68x: 115,225,252 cycles + - WOTS pk gen x (real).. avg. 903.03 us (0.00 sec); median 1,696,538 cycles, 68x: 115,364,584 cycles +Verifying.. avg. 
2168.84 us (0.00 sec); median 4,105,144 cycles, 1x: 4,105,144 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256f-simple_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256f-simple_x5 new file mode 100644 index 0000000..e52fe10 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256f-simple_x5 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 68, d = 17, b = 9, k = 35, w = 16, way=5, tree height=4, wots_len=67 +Running 10 iterations. +thash avg. 0.47 us (0.00 sec); median 863 cycles, 1x: 863 cycles +f1600x avg. 1.17 us (0.00 sec); median 2,160 cycles, 1x: 2,160 cycles +thashx avg. 1.20 us (0.00 sec); median 2,288 cycles, 1x: 2,288 cycles +Generating keypair.. avg. 5251.52 us (0.01 sec); median 10,067,029 cycles, 1x: 10,067,029 cycles + - WOTS pk gen x (ideal).. avg. 1320.95 us (0.00 sec); median 2,514,280 cycles, 3x: 7,542,840 cycles + - WOTS pk gen x (real).. avg. 1319.96 us (0.00 sec); median 2,512,742 cycles, 4x: 10,050,968 cycles +Signing.. avg. 106018.17 us (0.11 sec); median 203,317,254 cycles, 1x: 203,317,254 cycles + - FORS signing.. avg. 16833.62 us (0.02 sec); median 32,255,955 cycles, 1x: 32,255,955 cycles + - WOTS pk gen x (ideal).. avg. 1314.24 us (0.00 sec); median 2,511,698 cycles, 54x: 135,631,692 cycles + - WOTS pk gen x (real).. avg. 1312.48 us (0.00 sec); median 2,511,380 cycles, 68x: 170,773,840 cycles +Verifying.. avg. 
2523.08 us (0.00 sec); median 4,829,943 cycles, 1x: 4,829,943 cycles +Signature size: 49856 (48.69 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256s-robust_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256s-robust_x3 new file mode 100644 index 0000000..011eb0f --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256s-robust_x3 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16, way=3, tree height=8, wots_len=67 +Running 10 iterations. +thash avg. 1.04 us (0.00 sec); median 1,677 cycles, 1x: 1,677 cycles +f1600x avg. 0.62 us (0.00 sec); median 989 cycles, 1x: 989 cycles +thashx avg. 1.30 us (0.00 sec); median 2,091 cycles, 1x: 2,091 cycles +Generating keypair.. avg. 116925.16 us (0.12 sec); median 190,517,074 cycles, 1x: 190,517,074 cycles + - WOTS pk gen x (ideal).. avg. 1361.08 us (0.00 sec); median 2,212,268 cycles, 85x: 188,042,780 cycles + - WOTS pk gen x (real).. avg. 1366.66 us (0.00 sec); median 2,212,366 cycles, 86x: 190,263,476 cycles +Signing.. avg. 1549056.30 us (1.55 sec); median 2,524,257,766 cycles, 1x: 2,524,257,766 cycles + - FORS signing.. avg. 618203.21 us (0.62 sec); median 999,880,116 cycles, 1x: 999,880,116 cycles + - WOTS pk gen x (ideal).. avg. 1447.82 us (0.00 sec); median 2,211,991 cycles, 682x: 1,508,577,862 cycles + - WOTS pk gen x (real).. avg. 1446.89 us (0.00 sec); median 2,211,164 cycles, 688x: 1,521,280,832 cycles +Verifying.. avg. 
2565.47 us (0.00 sec); median 3,928,263 cycles, 1x: 3,928,263 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256s-robust_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256s-robust_x4 new file mode 100644 index 0000000..798e457 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256s-robust_x4 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16, way=4, tree height=8, wots_len=67 +Running 10 iterations. +thash avg. 1.10 us (0.00 sec); median 1,681 cycles, 1x: 1,681 cycles +f1600x avg. 0.95 us (0.00 sec); median 1,437 cycles, 1x: 1,437 cycles +thashx avg. 2.00 us (0.00 sec); median 3,065 cycles, 1x: 3,065 cycles +Generating keypair.. avg. 135520.68 us (0.14 sec); median 207,840,341 cycles, 1x: 207,840,341 cycles + - WOTS pk gen x (ideal).. avg. 2121.17 us (0.00 sec); median 3,245,791 cycles, 64x: 207,730,624 cycles + - WOTS pk gen x (real).. avg. 2118.28 us (0.00 sec); median 3,242,551 cycles, 64x: 207,523,264 cycles +Signing.. avg. 1540854.10 us (1.54 sec); median 2,363,340,163 cycles, 1x: 2,363,340,163 cycles + - FORS signing.. avg. 457186.09 us (0.46 sec); median 700,790,924 cycles, 1x: 700,790,924 cycles + - WOTS pk gen x (ideal).. avg. 2127.32 us (0.00 sec); median 3,244,985 cycles, 512x: 1,661,432,320 cycles + - WOTS pk gen x (real).. avg. 2118.51 us (0.00 sec); median 3,244,724 cycles, 512x: 1,661,298,688 cycles +Verifying.. avg. 
2707.91 us (0.00 sec); median 4,141,895 cycles, 1x: 4,141,895 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256s-robust_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256s-robust_x5 new file mode 100644 index 0000000..78330c8 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256s-robust_x5 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16, way=5, tree height=8, wots_len=67 +Running 10 iterations. +thash avg. 1.11 us (0.00 sec); median 1,681 cycles, 1x: 1,681 cycles +f1600x avg. 1.42 us (0.00 sec); median 2,158 cycles, 1x: 2,158 cycles +thashx avg. 2.97 us (0.00 sec); median 4,543 cycles, 1x: 4,543 cycles +Generating keypair.. avg. 163117.44 us (0.16 sec); median 250,124,369 cycles, 1x: 250,124,369 cycles + - WOTS pk gen x (ideal).. avg. 3139.79 us (0.00 sec); median 4,804,212 cycles, 51x: 245,014,812 cycles + - WOTS pk gen x (real).. avg. 3135.06 us (0.00 sec); median 4,800,072 cycles, 52x: 249,603,744 cycles +Signing.. avg. 2029308.08 us (2.03 sec); median 3,112,626,644 cycles, 1x: 3,112,626,644 cycles + - FORS signing.. avg. 724620.34 us (0.72 sec); median 1,111,126,033 cycles, 1x: 1,111,126,033 cycles + - WOTS pk gen x (ideal).. avg. 3136.05 us (0.00 sec); median 4,804,991 cycles, 409x: 1,965,241,319 cycles + - WOTS pk gen x (real).. avg. 3135.54 us (0.00 sec); median 4,802,607 cycles, 416x: 1,997,884,512 cycles +Verifying.. avg. 
3195.22 us (0.00 sec); median 4,895,243 cycles, 1x: 4,895,243 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256s-simple_x3 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256s-simple_x3 new file mode 100644 index 0000000..1898ffb --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256s-simple_x3 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16, way=3, tree height=8, wots_len=67 +Running 10 iterations. +thash avg. 0.46 us (0.00 sec); median 864 cycles, 1x: 864 cycles +f1600x avg. 0.52 us (0.00 sec); median 982 cycles, 1x: 982 cycles +thashx avg. 0.56 us (0.00 sec); median 1,058 cycles, 1x: 1,058 cycles +Generating keypair.. avg. 52001.49 us (0.05 sec); median 99,683,977 cycles, 1x: 99,683,977 cycles + - WOTS pk gen x (ideal).. avg. 606.79 us (0.00 sec); median 1,156,105 cycles, 85x: 98,268,925 cycles + - WOTS pk gen x (real).. avg. 607.37 us (0.00 sec); median 1,155,802 cycles, 86x: 99,398,972 cycles +Signing.. avg. 712175.08 us (0.71 sec); median 1,365,566,296 cycles, 1x: 1,365,566,296 cycles + - FORS signing.. avg. 306929.73 us (0.31 sec); median 567,838,338 cycles, 1x: 567,838,338 cycles + - WOTS pk gen x (ideal).. avg. 642.33 us (0.00 sec); median 1,157,251 cycles, 682x: 789,245,182 cycles + - WOTS pk gen x (real).. avg. 639.37 us (0.00 sec); median 1,157,334 cycles, 688x: 796,245,792 cycles +Verifying.. avg. 
1055.50 us (0.00 sec); median 1,908,027 cycles, 1x: 1,908,027 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256s-simple_x4 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256s-simple_x4 new file mode 100644 index 0000000..1179b3b --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256s-simple_x4 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16, way=4, tree height=8, wots_len=67 +Running 10 iterations. +thash avg. 0.48 us (0.00 sec); median 861 cycles, 1x: 861 cycles +f1600x avg. 0.80 us (0.00 sec); median 1,438 cycles, 1x: 1,438 cycles +thashx avg. 0.86 us (0.00 sec); median 1,550 cycles, 1x: 1,550 cycles +Generating keypair.. avg. 59563.96 us (0.06 sec); median 108,532,397 cycles, 1x: 108,532,397 cycles + - WOTS pk gen x (ideal).. avg. 938.11 us (0.00 sec); median 1,695,619 cycles, 64x: 108,519,616 cycles + - WOTS pk gen x (real).. avg. 935.79 us (0.00 sec); median 1,695,787 cycles, 64x: 108,530,368 cycles +Signing.. avg. 715293.28 us (0.72 sec); median 1,296,291,979 cycles, 1x: 1,296,291,979 cycles + - FORS signing.. avg. 252161.21 us (0.25 sec); median 428,276,913 cycles, 1x: 428,276,913 cycles + - WOTS pk gen x (ideal).. avg. 987.79 us (0.00 sec); median 1,695,556 cycles, 512x: 868,124,672 cycles + - WOTS pk gen x (real).. avg. 988.28 us (0.00 sec); median 1,695,535 cycles, 512x: 868,113,920 cycles +Verifying.. avg. 
1235.84 us (0.00 sec); median 2,118,204 cycles, 1x: 2,118,204 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256s-simple_x5 b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256s-simple_x5 new file mode 100644 index 0000000..f59663b --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/benchmarks_X2/sphincs-shake-256s-simple_x5 @@ -0,0 +1,16 @@ +Parameters: n = 32, h = 64, d = 8, b = 14, k = 22, w = 16, way=5, tree height=8, wots_len=67 +Running 10 iterations. +thash avg. 0.51 us (0.00 sec); median 863 cycles, 1x: 863 cycles +f1600x avg. 1.26 us (0.00 sec); median 2,161 cycles, 1x: 2,161 cycles +thashx avg. 1.35 us (0.00 sec); median 2,313 cycles, 1x: 2,313 cycles +Generating keypair.. avg. 76316.20 us (0.08 sec); median 131,664,911 cycles, 1x: 131,664,911 cycles + - WOTS pk gen x (ideal).. avg. 1471.46 us (0.00 sec); median 2,528,462 cycles, 51x: 128,951,562 cycles + - WOTS pk gen x (real).. avg. 1470.42 us (0.00 sec); median 2,528,706 cycles, 52x: 131,492,712 cycles +Signing.. avg. 1000685.63 us (1.00 sec); median 1,699,979,325 cycles, 1x: 1,699,979,325 cycles + - FORS signing.. avg. 396932.27 us (0.40 sec); median 646,923,760 cycles, 1x: 646,923,760 cycles + - WOTS pk gen x (ideal).. avg. 1556.93 us (0.00 sec); median 2,529,408 cycles, 409x: 1,034,527,872 cycles + - WOTS pk gen x (real).. avg. 1563.75 us (0.00 sec); median 2,528,337 cycles, 416x: 1,051,788,192 cycles +Verifying.. avg. 
1609.91 us (0.00 sec); median 2,613,174 cycles, 1x: 2,613,174 cycles +Signature size: 29792 (29.09 KiB) +Public key size: 64 (0.06 KiB) +Secret key size: 128 (0.12 KiB) diff --git a/sphincsplus/sphincsplus-keccakxN/context.h b/sphincsplus/sphincsplus-keccakxN/context.h new file mode 100644 index 0000000..94d1543 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/context.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// This implementation is based on the public domain implementation of SPHINCS+ +// available on https://github.com/sphincs/sphincsplus +// + +#ifndef SPX_CONTEXT_H +#define SPX_CONTEXT_H + +#include + +#include "params.h" + +typedef struct { + uint8_t pub_seed[SPX_N]; + uint8_t sk_seed[SPX_N]; +} spx_ctx; + +#endif diff --git a/sphincsplus/sphincsplus-keccakxN/f1600x.h b/sphincsplus/sphincsplus-keccakxN/f1600x.h new file mode 100644 index 0000000..0358990 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/f1600x.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// This implementation is based on the public domain implementation of SPHINCS+ +// available on https://github.com/sphincs/sphincsplus +// + +#ifndef SPX_F1600X_H +#define SPX_F1600X_H + +#if KECCAK_WAY == 4 + +#define STATE_IDX(r,i) \ + (KECCAK_WAY * (i) + (r)) + +#else + +#define STATE_IDX(r,i) \ + ( ( (r) == 0 ) ? ( 2 * (i) + 0 ) : \ + ( ( (r) == 1 ) ? ( 2 * (i) + 1 ) : \ + ( (r) * 25 + (i) ) ) ) + +#endif + +extern void KECCAK_X_IMPL(uint64_t s[KECCAK_WAY*25]); +#define keccakx_asm(s) KECCAK_X_IMPL(s) + +#endif diff --git a/sphincsplus/sphincsplus-keccakxN/fips202.c b/sphincsplus/sphincsplus-keccakxN/fips202.c new file mode 100644 index 0000000..0793df5 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/fips202.c @@ -0,0 +1,513 @@ +/* + * Copyright (c) 2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +/* Based on the public domain implementation in + * crypto_hash/keccakc512/simple/ from http://bench.cr.yp.to/supercop.html + * by Ronny Van Keer + * and the public domain "TweetFips202" implementation + * from https://twitter.com/tweetfips202 + * by Gilles Van Assche, Daniel J. Bernstein, and Peter Schwabe */ + +#include +#include + +#include "fips202.h" + +#define NROUNDS 24 +#define ROL(a, offset) (((a) << (offset)) ^ ((a) >> (64 - (offset)))) + +/************************************************* + * Name: load64 + * + * Description: Load 8 bytes into uint64_t in little-endian order + * + * Arguments: - const uint8_t *x: pointer to input byte array + * + * Returns the loaded 64-bit unsigned integer + **************************************************/ +static uint64_t load64(const uint8_t *x) { + uint64_t r = 0; + for (size_t i = 0; i < 8; ++i) { + r |= (uint64_t)x[i] << 8 * i; + } + + return r; +} + +/************************************************* + * Name: store64 + * + * Description: Store a 64-bit integer to a byte array in little-endian order + * + * Arguments: - uint8_t *x: pointer to the output byte array + * - uint64_t u: input 64-bit unsigned integer + **************************************************/ +static void store64(uint8_t *x, uint64_t u) { + for (size_t i = 0; i < 8; ++i) { + x[i] = (uint8_t) (u >> 8 * i); + } +} + +extern void KECCAK_X1_IMPL(uint64_t *state); +/************************************************* + * Name: KeccakF1600_StatePermute + * + * Description: The Keccak F1600 Permutation + * + * Arguments: - uint64_t *state: pointer to input/output Keccak state + **************************************************/ +static void KeccakF1600_StatePermute(uint64_t *state) { + KECCAK_X1_IMPL(state); +} + +/************************************************* + * Name: keccak_absorb + * + * Description: Absorb step of Keccak; + * non-incremental, starts by zeroeing the state. 
+ * + * Arguments: - uint64_t *s: pointer to (uninitialized) output Keccak state + * - uint32_t r: rate in bytes (e.g., 168 for SHAKE128) + * - const uint8_t *m: pointer to input to be absorbed into s + * - size_t mlen: length of input in bytes + * - uint8_t p: domain-separation byte for different + * Keccak-derived functions + **************************************************/ +static void keccak_absorb(uint64_t *s, uint32_t r, const uint8_t *m, + size_t mlen, uint8_t p) { + size_t i; + uint8_t t[200]; + + /* Zero state */ + for (i = 0; i < 25; ++i) { + s[i] = 0; + } + + while (mlen >= r) { + for (i = 0; i < r / 8; ++i) { + s[i] ^= load64(m + 8 * i); + } + + KeccakF1600_StatePermute(s); + mlen -= r; + m += r; + } + + for (i = 0; i < r; ++i) { + t[i] = 0; + } + for (i = 0; i < mlen; ++i) { + t[i] = m[i]; + } + t[i] = p; + t[r - 1] |= 128; + for (i = 0; i < r / 8; ++i) { + s[i] ^= load64(t + 8 * i); + } +} + +/************************************************* + * Name: keccak_squeezeblocks + * + * Description: Squeeze step of Keccak. Squeezes full blocks of r bytes each. + * Modifies the state. Can be called multiple times to keep + * squeezing, i.e., is incremental. + * + * Arguments: - uint8_t *h: pointer to output blocks + * - size_t nblocks: number of blocks to be + * squeezed (written to h) + * - uint64_t *s: pointer to input/output Keccak state + * - uint32_t r: rate in bytes (e.g., 168 for SHAKE128) + **************************************************/ +static void keccak_squeezeblocks(uint8_t *h, size_t nblocks, + uint64_t *s, uint32_t r) { + while (nblocks > 0) { + KeccakF1600_StatePermute(s); + for (size_t i = 0; i < (r >> 3); i++) { + store64(h + 8 * i, s[i]); + } + h += r; + nblocks--; + } +} + +/************************************************* + * Name: keccak_inc_init + * + * Description: Initializes the incremental Keccak state to zero. 
+ * + * Arguments: - uint64_t *s_inc: pointer to input/output incremental state + * First 25 values represent Keccak state. + * 26th value represents either the number of absorbed bytes + * that have not been permuted, or not-yet-squeezed bytes. + **************************************************/ +static void keccak_inc_init(uint64_t *s_inc) { + size_t i; + + for (i = 0; i < 25; ++i) { + s_inc[i] = 0; + } + s_inc[25] = 0; +} + +/************************************************* + * Name: keccak_inc_absorb + * + * Description: Incremental keccak absorb + * Preceded by keccak_inc_init, succeeded by keccak_inc_finalize + * + * Arguments: - uint64_t *s_inc: pointer to input/output incremental state + * First 25 values represent Keccak state. + * 26th value represents either the number of absorbed bytes + * that have not been permuted, or not-yet-squeezed bytes. + * - uint32_t r: rate in bytes (e.g., 168 for SHAKE128) + * - const uint8_t *m: pointer to input to be absorbed into s + * - size_t mlen: length of input in bytes + **************************************************/ +static void keccak_inc_absorb(uint64_t *s_inc, uint32_t r, const uint8_t *m, + size_t mlen) { + size_t i; + + /* Recall that s_inc[25] is the non-absorbed bytes xored into the state */ + while (mlen + s_inc[25] >= r) { + for (i = 0; i < r - s_inc[25]; i++) { + /* Take the i'th byte from message + xor with the s_inc[25] + i'th byte of the state; little-endian */ + s_inc[(s_inc[25] + i) >> 3] ^= (uint64_t)m[i] << (8 * ((s_inc[25] + i) & 0x07)); + } + mlen -= (size_t)(r - s_inc[25]); + m += r - s_inc[25]; + s_inc[25] = 0; + + KeccakF1600_StatePermute(s_inc); + } + + for (i = 0; i < mlen; i++) { + s_inc[(s_inc[25] + i) >> 3] ^= (uint64_t)m[i] << (8 * ((s_inc[25] + i) & 0x07)); + } + s_inc[25] += mlen; +} + +/************************************************* + * Name: keccak_inc_finalize + * + * Description: Finalizes Keccak absorb phase, prepares for squeezing + * + * Arguments: - uint64_t *s_inc: 
pointer to input/output incremental state + * First 25 values represent Keccak state. + * 26th value represents either the number of absorbed bytes + * that have not been permuted, or not-yet-squeezed bytes. + * - uint32_t r: rate in bytes (e.g., 168 for SHAKE128) + * - uint8_t p: domain-separation byte for different + * Keccak-derived functions + **************************************************/ +static void keccak_inc_finalize(uint64_t *s_inc, uint32_t r, uint8_t p) { + /* After keccak_inc_absorb, we are guaranteed that s_inc[25] < r, + so we can always use one more byte for p in the current state. */ + s_inc[s_inc[25] >> 3] ^= (uint64_t)p << (8 * (s_inc[25] & 0x07)); + s_inc[(r - 1) >> 3] ^= (uint64_t)128 << (8 * ((r - 1) & 0x07)); + s_inc[25] = 0; +} + +/************************************************* + * Name: keccak_inc_squeeze + * + * Description: Incremental Keccak squeeze; can be called on byte-level + * + * Arguments: - uint8_t *h: pointer to output bytes + * - size_t outlen: number of bytes to be squeezed + * - uint64_t *s_inc: pointer to input/output incremental state + * First 25 values represent Keccak state. + * 26th value represents either the number of absorbed bytes + * that have not been permuted, or not-yet-squeezed bytes. + * - uint32_t r: rate in bytes (e.g., 168 for SHAKE128) + **************************************************/ +static void keccak_inc_squeeze(uint8_t *h, size_t outlen, + uint64_t *s_inc, uint32_t r) { + size_t i; + + /* First consume any bytes we still have sitting around */ + for (i = 0; i < outlen && i < s_inc[25]; i++) { + /* There are s_inc[25] bytes left, so r - s_inc[25] is the first + available byte. We consume from there, i.e., up to r. 
*/ + h[i] = (uint8_t)(s_inc[(r - s_inc[25] + i) >> 3] >> (8 * ((r - s_inc[25] + i) & 0x07))); + } + h += i; + outlen -= i; + s_inc[25] -= i; + + /* Then squeeze the remaining necessary blocks */ + while (outlen > 0) { + KeccakF1600_StatePermute(s_inc); + + for (i = 0; i < outlen && i < r; i++) { + h[i] = (uint8_t)(s_inc[i >> 3] >> (8 * (i & 0x07))); + } + h += i; + outlen -= i; + s_inc[25] = r - i; + } +} + +void shake128_inc_init(uint64_t *s_inc) { + keccak_inc_init(s_inc); +} + +void shake128_inc_absorb(uint64_t *s_inc, const uint8_t *input, size_t inlen) { + keccak_inc_absorb(s_inc, SHAKE128_RATE, input, inlen); +} + +void shake128_inc_finalize(uint64_t *s_inc) { + keccak_inc_finalize(s_inc, SHAKE128_RATE, 0x1F); +} + +void shake128_inc_squeeze(uint8_t *output, size_t outlen, uint64_t *s_inc) { + keccak_inc_squeeze(output, outlen, s_inc, SHAKE128_RATE); +} + +void shake256_inc_init(uint64_t *s_inc) { + keccak_inc_init(s_inc); +} + +void shake256_inc_absorb(uint64_t *s_inc, const uint8_t *input, size_t inlen) { + keccak_inc_absorb(s_inc, SHAKE256_RATE, input, inlen); +} + +void shake256_inc_finalize(uint64_t *s_inc) { + keccak_inc_finalize(s_inc, SHAKE256_RATE, 0x1F); +} + +void shake256_inc_squeeze(uint8_t *output, size_t outlen, uint64_t *s_inc) { + keccak_inc_squeeze(output, outlen, s_inc, SHAKE256_RATE); +} + + +/************************************************* + * Name: shake128_absorb + * + * Description: Absorb step of the SHAKE128 XOF. + * non-incremental, starts by zeroeing the state. 
+ * + * Arguments: - uint64_t *s: pointer to (uninitialized) output Keccak state + * - const uint8_t *input: pointer to input to be absorbed + * into s + * - size_t inlen: length of input in bytes + **************************************************/ +void shake128_absorb(uint64_t *s, const uint8_t *input, size_t inlen) { + keccak_absorb(s, SHAKE128_RATE, input, inlen, 0x1F); +} + +/************************************************* + * Name: shake128_squeezeblocks + * + * Description: Squeeze step of SHAKE128 XOF. Squeezes full blocks of + * SHAKE128_RATE bytes each. Modifies the state. Can be called + * multiple times to keep squeezing, i.e., is incremental. + * + * Arguments: - uint8_t *output: pointer to output blocks + * - size_t nblocks: number of blocks to be squeezed + * (written to output) + * - uint64_t *s: pointer to input/output Keccak state + **************************************************/ +void shake128_squeezeblocks(uint8_t *output, size_t nblocks, uint64_t *s) { + keccak_squeezeblocks(output, nblocks, s, SHAKE128_RATE); +} + +/************************************************* + * Name: shake256_absorb + * + * Description: Absorb step of the SHAKE256 XOF. + * non-incremental, starts by zeroing the state. + * + * Arguments: - uint64_t *s: pointer to (uninitialized) output Keccak state + * - const uint8_t *input: pointer to input to be absorbed + * into s + * - size_t inlen: length of input in bytes + **************************************************/ +void shake256_absorb(uint64_t *s, const uint8_t *input, size_t inlen) { + keccak_absorb(s, SHAKE256_RATE, input, inlen, 0x1F); +} + +/************************************************* + * Name: shake256_squeezeblocks + * + * Description: Squeeze step of SHAKE256 XOF. Squeezes full blocks of + * SHAKE256_RATE bytes each. Modifies the state. Can be called + * multiple times to keep squeezing, i.e., is incremental. 
+ * + * Arguments: - uint8_t *output: pointer to output blocks + * - size_t nblocks: number of blocks to be squeezed + * (written to output) + * - uint64_t *s: pointer to input/output Keccak state + **************************************************/ +void shake256_squeezeblocks(uint8_t *output, size_t nblocks, uint64_t *s) { + keccak_squeezeblocks(output, nblocks, s, SHAKE256_RATE); +} + +/************************************************* + * Name: shake128 + * + * Description: SHAKE128 XOF with non-incremental API + * + * Arguments: - uint8_t *output: pointer to output + * - size_t outlen: requested output length in bytes + * - const uint8_t *input: pointer to input + * - size_t inlen: length of input in bytes + **************************************************/ +void shake128(uint8_t *output, size_t outlen, + const uint8_t *input, size_t inlen) { + size_t nblocks = outlen / SHAKE128_RATE; + uint8_t t[SHAKE128_RATE]; + uint64_t s[25]; + + shake128_absorb(s, input, inlen); + shake128_squeezeblocks(output, nblocks, s); + + output += nblocks * SHAKE128_RATE; + outlen -= nblocks * SHAKE128_RATE; + + if (outlen) { + shake128_squeezeblocks(t, 1, s); + for (size_t i = 0; i < outlen; ++i) { + output[i] = t[i]; + } + } +} + +/************************************************* + * Name: shake256 + * + * Description: SHAKE256 XOF with non-incremental API + * + * Arguments: - uint8_t *output: pointer to output + * - size_t outlen: requested output length in bytes + * - const uint8_t *input: pointer to input + * - size_t inlen: length of input in bytes + **************************************************/ +void shake256(uint8_t *output, size_t outlen, + const uint8_t *input, size_t inlen) { + size_t nblocks = outlen / SHAKE256_RATE; + uint8_t t[SHAKE256_RATE]; + uint64_t s[25]; + + shake256_absorb(s, input, inlen); + shake256_squeezeblocks(output, nblocks, s); + + output += nblocks * SHAKE256_RATE; + outlen -= nblocks * SHAKE256_RATE; + + if (outlen) { + 
shake256_squeezeblocks(t, 1, s); + for (size_t i = 0; i < outlen; ++i) { + output[i] = t[i]; + } + } +} + +void sha3_256_inc_init(uint64_t *s_inc) { + keccak_inc_init(s_inc); +} + +void sha3_256_inc_absorb(uint64_t *s_inc, const uint8_t *input, size_t inlen) { + keccak_inc_absorb(s_inc, SHA3_256_RATE, input, inlen); +} + +void sha3_256_inc_finalize(uint8_t *output, uint64_t *s_inc) { + uint8_t t[SHA3_256_RATE]; + keccak_inc_finalize(s_inc, SHA3_256_RATE, 0x06); + + keccak_squeezeblocks(t, 1, s_inc, SHA3_256_RATE); + + for (size_t i = 0; i < 32; i++) { + output[i] = t[i]; + } +} + +/************************************************* + * Name: sha3_256 + * + * Description: SHA3-256 with non-incremental API + * + * Arguments: - uint8_t *output: pointer to output + * - const uint8_t *input: pointer to input + * - size_t inlen: length of input in bytes + **************************************************/ +void sha3_256(uint8_t *output, const uint8_t *input, size_t inlen) { + uint64_t s[25]; + uint8_t t[SHA3_256_RATE]; + + /* Absorb input */ + keccak_absorb(s, SHA3_256_RATE, input, inlen, 0x06); + + /* Squeeze output */ + keccak_squeezeblocks(t, 1, s, SHA3_256_RATE); + + for (size_t i = 0; i < 32; i++) { + output[i] = t[i]; + } +} + +void sha3_512_inc_init(uint64_t *s_inc) { + keccak_inc_init(s_inc); +} + +void sha3_512_inc_absorb(uint64_t *s_inc, const uint8_t *input, size_t inlen) { + keccak_inc_absorb(s_inc, SHA3_512_RATE, input, inlen); +} + +void sha3_512_inc_finalize(uint8_t *output, uint64_t *s_inc) { + uint8_t t[SHA3_512_RATE]; + keccak_inc_finalize(s_inc, SHA3_512_RATE, 0x06); + + keccak_squeezeblocks(t, 1, s_inc, SHA3_512_RATE); + + for (size_t i = 0; i < 64; i++) { /* SHA3-512 digest is 64 bytes, matching sha3_512() */ + output[i] = t[i]; + } +} + +/************************************************* + * Name: sha3_512 + * + * Description: SHA3-512 with non-incremental API + * + * Arguments: - uint8_t *output: pointer to output + * - const uint8_t *input: pointer to input + * - size_t inlen: length of input in bytes 
+ **************************************************/ +void sha3_512(uint8_t *output, const uint8_t *input, size_t inlen) { + uint64_t s[25]; + uint8_t t[SHA3_512_RATE]; + + /* Absorb input */ + keccak_absorb(s, SHA3_512_RATE, input, inlen, 0x06); + + /* Squeeze output */ + keccak_squeezeblocks(t, 1, s, SHA3_512_RATE); + + for (size_t i = 0; i < 64; i++) { + output[i] = t[i]; + } +} diff --git a/sphincsplus/sphincsplus-keccakxN/fips202.h b/sphincsplus/sphincsplus-keccakxN/fips202.h new file mode 100644 index 0000000..39604cc --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/fips202.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// This implementation is based on the public domain implementation of SPHINCS+ +// available on https://github.com/sphincs/sphincsplus +// + +#ifndef SPX_FIPS202_H +#define SPX_FIPS202_H + +#include +#include + +#define SHAKE128_RATE 168 +#define SHAKE256_RATE 136 +#define SHA3_256_RATE 136 +#define SHA3_512_RATE 72 + +void shake128_absorb(uint64_t *s, const uint8_t *input, size_t inlen); + +void shake128_squeezeblocks(uint8_t *output, size_t nblocks, uint64_t *s); + +void shake128_inc_init(uint64_t *s_inc); +void shake128_inc_absorb(uint64_t *s_inc, const uint8_t *input, size_t inlen); +void shake128_inc_finalize(uint64_t *s_inc); +void shake128_inc_squeeze(uint8_t *output, size_t outlen, uint64_t *s_inc); + +void shake256_absorb(uint64_t *s, const uint8_t *input, size_t inlen); +void shake256_squeezeblocks(uint8_t *output, size_t nblocks, uint64_t *s); + +void shake256_inc_init(uint64_t *s_inc); +void shake256_inc_absorb(uint64_t *s_inc, const uint8_t *input, size_t inlen); +void shake256_inc_finalize(uint64_t *s_inc); +void shake256_inc_squeeze(uint8_t *output, size_t outlen, uint64_t *s_inc); + +void shake128(uint8_t *output, size_t outlen, + const uint8_t *input, size_t inlen); + +void shake256(uint8_t *output, size_t outlen, + const uint8_t *input, size_t inlen); + +void sha3_256_inc_init(uint64_t *s_inc); +void sha3_256_inc_absorb(uint64_t *s_inc, const uint8_t *input, size_t inlen); +void sha3_256_inc_finalize(uint8_t *output, uint64_t *s_inc); + +void sha3_256(uint8_t *output, const uint8_t *input, size_t inlen); + +void sha3_512_inc_init(uint64_t *s_inc); +void sha3_512_inc_absorb(uint64_t *s_inc, const uint8_t *input, size_t inlen); +void sha3_512_inc_finalize(uint8_t *output, uint64_t *s_inc); + +void sha3_512(uint8_t *output, const uint8_t *input, size_t inlen); + +#endif diff --git a/sphincsplus/sphincsplus-keccakxN/fips202x.c b/sphincsplus/sphincsplus-keccakxN/fips202x.c new file mode 100644 index 0000000..c5c1622 --- /dev/null +++ 
b/sphincsplus/sphincsplus-keccakxN/fips202x.c @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// This implementation is based on the public domain implementation of SPHINCS+ +// available on https://github.com/sphincs/sphincsplus +// + +#include +#include + +#include "fips202.h" +#include "fips202x.h" +#include "f1600x.h" + +#define NROUNDS 24 +#define ROL(a, offset) ((a << offset) ^ (a >> (64-offset))) + +static uint64_t load64(const unsigned char *x) +{ + unsigned long long r = 0, i; + + for (i = 0; i < 8; ++i) { + r |= (unsigned long long)x[i] << 8 * i; + } + return r; +} + +static void store64(uint8_t *x, uint64_t u) +{ + unsigned int i; + + for(i=0; i<8; ++i) { + x[i] = u; + u >>= 8; + } +} + + + + +#define KeccakF1600_StatePermutex keccakx_asm + +static void keccak_absorbx(uint64_t s[KECCAK_WAY*25], + unsigned int r, + unsigned char const * m[KECCAK_WAY], + unsigned long long int mlen, + unsigned char p) +{ + unsigned long long i; + unsigned char t[KECCAK_WAY][200]; + + while (mlen >= r) + { + for (i = 0; i < r / 8; ++i) + { + for( int j=0; j < KECCAK_WAY; j++ ) + s[STATE_IDX(j,i)] ^= load64(m[j] + 8 * i); + } + + KeccakF1600_StatePermutex(s); + mlen -= r; + for( int j=0; j < KECCAK_WAY; j++ ) + m[j] += r; + } + + for( int j=0; j < KECCAK_WAY; j++ ) + { + for (i = 0; i < r; ++i) + t[j][i] = 0; + for (i = 0; i < mlen; ++i) + t[j][i] = m[j][i]; + t[j][i] = p; + t[j][r - 1] |= 128; + } + + for (i = 0; i < r / 8; ++i) + { + for( int j=0; j < KECCAK_WAY; j++ ) + s[STATE_IDX(j,i)] ^= load64(t[j] + 8 * i); + } +} + + +static void keccak_squeezeblocksx(unsigned char * h[KECCAK_WAY], + unsigned long long int nblocks, + uint64_t s[KECCAK_WAY*25], + unsigned int r) +{ + unsigned int i; + + while(nblocks > 0) + { + KeccakF1600_StatePermutex(s); + for(i=0;i<(r>>3);i++) + { + for( int j=0; j < KECCAK_WAY; j++ ) + store64(h[j]+8*i, s[STATE_IDX(j,i)]); + } + for( int j=0; j < KECCAK_WAY; j++ ) + h[j] += r; + nblocks--; + } +} + + + +void shake128x(unsigned char * const out[KECCAK_WAY], unsigned long long outlen, + unsigned char const * const in 
[KECCAK_WAY], unsigned long long inlen) +{ + uint64_t s[KECCAK_WAY*25]; + unsigned char t[KECCAK_WAY][SHAKE128_RATE]; + unsigned char *t_ptr[KECCAK_WAY]; + unsigned char const * inc [KECCAK_WAY]; + unsigned char * outc[KECCAK_WAY]; + for( int j=0; j +#include +#include + +#include "fors.h" +#include "utils.h" +#include "utilsx.h" +#include "hash.h" +#include "hashx.h" +#include "thash.h" +#include "thashx.h" +#include "address.h" + +static void fors_gen_sk(unsigned char *sk, const spx_ctx *ctx, + uint32_t fors_leaf_addr[8]) +{ + prf_addr(sk, ctx, fors_leaf_addr); +} + +static void fors_gen_skx(unsigned char *sk[KECCAK_WAY], + const spx_ctx *ctx, + uint32_t fors_leaf_addrx[KECCAK_WAY*8]) +{ + prf_addrx(sk, ctx, fors_leaf_addrx); +} + +static void fors_sk_to_leaf(unsigned char *leaf, + const unsigned char *sk, + const spx_ctx *ctx, + uint32_t fors_leaf_addr[8]) +{ + thash(leaf, sk, 1, ctx, fors_leaf_addr); +} + +static void fors_sk_to_leafx(unsigned char *leaf[KECCAK_WAY], + const unsigned char * const sk[KECCAK_WAY], + const spx_ctx *ctx, + uint32_t fors_leaf_addrx[KECCAK_WAY*8]) +{ + thashx(leaf, sk, 1, ctx, fors_leaf_addrx); +} + +struct fors_gen_leaf_info { + uint32_t leaf_addrx[KECCAK_WAY*8]; +}; + +static void fors_gen_leafx(unsigned char *leaf, + const spx_ctx *ctx, + uint32_t addr_idx, void *info) +{ + struct fors_gen_leaf_info *fors_info = (struct fors_gen_leaf_info *) info; + uint32_t *fors_leaf_addrx = fors_info->leaf_addrx; + unsigned int j; + + unsigned char * leaves[KECCAK_WAY]; + + /* Only set the parts that the caller doesn't set */ + for (j = 0; j < KECCAK_WAY; j++) { + set_tree_index(fors_leaf_addrx + j*8, addr_idx + j); + set_type(fors_leaf_addrx + j*8, SPX_ADDR_TYPE_FORSPRF); + + leaves[j] = leaf + j*SPX_N; + } + + fors_gen_skx(leaves, ctx, fors_leaf_addrx); + + for (j = 0; j < KECCAK_WAY; j++) { + set_type(fors_leaf_addrx + j*8, SPX_ADDR_TYPE_FORSTREE); + } + + /* unsigned char ** -> const unsigned char * const * is OK */ + 
fors_sk_to_leafx(leaves, + (unsigned char const * const *) leaves, + ctx, fors_leaf_addrx); +} + +/** + * Interprets m as SPX_FORS_HEIGHT-bit unsigned integers. + * Assumes m contains at least SPX_FORS_HEIGHT * SPX_FORS_TREES bits. + * Assumes indices has space for SPX_FORS_TREES integers. + */ +static void message_to_indices(uint32_t *indices, const unsigned char *m) +{ + unsigned int i, j; + unsigned int offset = 0; + + for (i = 0; i < SPX_FORS_TREES; i++) { + indices[i] = 0; + for (j = 0; j < SPX_FORS_HEIGHT; j++) { + indices[i] ^= ((m[offset >> 3] >> (offset & 0x7)) & 0x1) << j; + offset++; + } + } +} + +/** + * Signs a message m, deriving the secret key from sk_seed and the FTS address. + * Assumes m contains at least SPX_FORS_HEIGHT * SPX_FORS_TREES bits. + */ +void fors_sign(unsigned char *sig, unsigned char *pk, + const unsigned char *m, + const spx_ctx *ctx, + const uint32_t fors_addr[8]) +{ + uint32_t indices[SPX_FORS_TREES]; + unsigned char roots[SPX_FORS_TREES * SPX_N]; + uint32_t fors_tree_addr[8] = {0}; + struct fors_gen_leaf_info fors_info = {0}; + uint32_t *fors_leaf_addr = fors_info.leaf_addrx; + uint32_t fors_pk_addr[8] = {0}; + uint32_t idx_offset; + unsigned int i; + + for (i=0; i + +#include "params.h" +#include "context.h" + +/** + * Signs a message m, deriving the secret key from sk_seed and the FTS address. + * Assumes m contains at least SPX_FORS_HEIGHT * SPX_FORS_TREES bits. + */ +#define fors_sign SPX_NAMESPACE(fors_sign) +void fors_sign(unsigned char *sig, unsigned char *pk, + const unsigned char *m, + const spx_ctx* ctx, + const uint32_t fors_addr[8]); + +/** + * Derives the FORS public key from a signature. + * This can be used for verification by comparing to a known public key, or to + * subsequently verify a signature on the derived public key. The latter is the + * typical use-case when used as an FTS below an OTS in a hypertree. + * Assumes m contains at least SPX_FORS_HEIGHT * SPX_FORS_TREES bits. 
+ */ +#define fors_pk_from_sig SPX_NAMESPACE(fors_pk_from_sig) +void fors_pk_from_sig(unsigned char *pk, + const unsigned char *sig, const unsigned char *m, + const spx_ctx* ctx, + const uint32_t fors_addr[8]); + +#endif diff --git a/sphincsplus/sphincsplus-keccakxN/hal_env.h b/sphincsplus/sphincsplus-keccakxN/hal_env.h new file mode 100644 index 0000000..79a66e9 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/hal_env.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#ifndef HAL_ENV_H +#define HAL_ENV_H + +#define SEP ; + +#define ASM_LOAD(dst,symbol) \ + adrp dst, symbol ; add dst, dst, :lo12:symbol; + +#endif /* HAL_ENV_H */ diff --git a/sphincsplus/sphincsplus-keccakxN/hash.h b/sphincsplus/sphincsplus-keccakxN/hash.h new file mode 100644 index 0000000..b162750 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/hash.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// This implementation is based on the public domain implementation of SPHINCS+ +// available on https://github.com/sphincs/sphincsplus +// + +#ifndef SPX_HASH_H +#define SPX_HASH_H + +#include +#include "context.h" +#include "params.h" + +#define initialize_hash_function SPX_NAMESPACE(initialize_hash_function) +void initialize_hash_function(spx_ctx *ctx); + +#define prf_addr SPX_NAMESPACE(prf_addr) +void prf_addr(unsigned char *out, const spx_ctx *ctx, + const uint32_t addr[8]); + +#define gen_message_random SPX_NAMESPACE(gen_message_random) +void gen_message_random(unsigned char *R, const unsigned char *sk_prf, + const unsigned char *optrand, + const unsigned char *m, unsigned long long mlen, + const spx_ctx *ctx); + +#define hash_message SPX_NAMESPACE(hash_message) +void hash_message(unsigned char *digest, uint64_t *tree, uint32_t *leaf_idx, + const unsigned char *R, const unsigned char *pk, + const unsigned char *m, unsigned long long mlen, + const spx_ctx *ctx); + +#endif diff --git a/sphincsplus/sphincsplus-keccakxN/hash_shake.c b/sphincsplus/sphincsplus-keccakxN/hash_shake.c new file mode 100644 index 0000000..306d5bb --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/hash_shake.c @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// This implementation is based on the public domain implementation of SPHINCS+ +// available on https://github.com/sphincs/sphincsplus +// + +#include +#include + +#include "address.h" +#include "utils.h" +#include "params.h" +#include "hash.h" +#include "fips202.h" + +/* For SHAKE256, there is no immediate reason to initialize at the start, + so this function is an empty operation. */ +void initialize_hash_function(spx_ctx* ctx) +{ + (void)ctx; /* Suppress an 'unused parameter' warning. */ +} + +/* + * Computes PRF(pk_seed, sk_seed, addr) + */ +void prf_addr(unsigned char *out, const spx_ctx *ctx, + const uint32_t addr[8]) +{ + unsigned char buf[2*SPX_N + SPX_ADDR_BYTES]; + + memcpy(buf, ctx->pub_seed, SPX_N); + memcpy(buf + SPX_N, addr, SPX_ADDR_BYTES); + memcpy(buf + SPX_N + SPX_ADDR_BYTES, ctx->sk_seed, SPX_N); + + shake256(out, SPX_N, buf, 2*SPX_N + SPX_ADDR_BYTES); +} + +/** + * Computes the message-dependent randomness R, using a secret seed and an + * optional randomization value as well as the message. 
+ */ +void gen_message_random(unsigned char *R, const unsigned char *sk_prf, + const unsigned char *optrand, + const unsigned char *m, unsigned long long mlen, + const spx_ctx *ctx) +{ + (void)ctx; + uint64_t s_inc[26]; + + shake256_inc_init(s_inc); + shake256_inc_absorb(s_inc, sk_prf, SPX_N); + shake256_inc_absorb(s_inc, optrand, SPX_N); + shake256_inc_absorb(s_inc, m, mlen); + shake256_inc_finalize(s_inc); + shake256_inc_squeeze(R, SPX_N, s_inc); +} + +/** + * Computes the message hash using R, the public key, and the message. + * Outputs the message digest and the index of the leaf. The index is split in + * the tree index and the leaf index, for convenient copying to an address. + */ +void hash_message(unsigned char *digest, uint64_t *tree, uint32_t *leaf_idx, + const unsigned char *R, const unsigned char *pk, + const unsigned char *m, unsigned long long mlen, + const spx_ctx *ctx) +{ + (void)ctx; +#define SPX_TREE_BITS (SPX_TREE_HEIGHT * (SPX_D - 1)) +#define SPX_TREE_BYTES ((SPX_TREE_BITS + 7) / 8) +#define SPX_LEAF_BITS SPX_TREE_HEIGHT +#define SPX_LEAF_BYTES ((SPX_LEAF_BITS + 7) / 8) +#define SPX_DGST_BYTES (SPX_FORS_MSG_BYTES + SPX_TREE_BYTES + SPX_LEAF_BYTES) + + unsigned char buf[SPX_DGST_BYTES]; + unsigned char *bufp = buf; + uint64_t s_inc[26]; + + shake256_inc_init(s_inc); + shake256_inc_absorb(s_inc, R, SPX_N); + shake256_inc_absorb(s_inc, pk, SPX_PK_BYTES); + shake256_inc_absorb(s_inc, m, mlen); + shake256_inc_finalize(s_inc); + shake256_inc_squeeze(buf, SPX_DGST_BYTES, s_inc); + + memcpy(digest, bufp, SPX_FORS_MSG_BYTES); + bufp += SPX_FORS_MSG_BYTES; + +#if SPX_TREE_BITS > 64 + #error For given height and depth, 64 bits cannot represent all subtrees +#endif + + *tree = bytes_to_ull(bufp, SPX_TREE_BYTES); + *tree &= (~(uint64_t)0) >> (64 - SPX_TREE_BITS); + bufp += SPX_TREE_BYTES; + + *leaf_idx = bytes_to_ull(bufp, SPX_LEAF_BYTES); + *leaf_idx &= (~(uint32_t)0) >> (32 - SPX_LEAF_BITS); +} diff --git a/sphincsplus/sphincsplus-keccakxN/hash_shakex.c 
b/sphincsplus/sphincsplus-keccakxN/hash_shakex.c new file mode 100644 index 0000000..49f794f --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/hash_shakex.c @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ */
+
+//
+// This implementation is based on the public domain implementation of SPHINCS+
+// available on https://github.com/sphincs/sphincsplus
+//
+
+#include <stddef.h> /* NOTE(review): header name was stripped from this copy; size_t needs one - confirm */
+#include <stdint.h> /* NOTE(review): header name was stripped from this copy; uint64_t needs it - confirm */
+
+#include "address.h"
+#include "params.h"
+#include "fips202x.h"
+#include "hashx.h"
+#include "f1600x.h"
+
+#define KeccakF1600_StatePermutex keccakx_asm
+
+/*************************************************
+ * Name:        load64
+ *
+ * Description: Load 8 bytes into uint64_t in little-endian order
+ *
+ * Arguments:   - const uint8_t *x: pointer to input byte array
+ *
+ * Returns the loaded 64-bit unsigned integer
+ **************************************************/
+static uint64_t load64(const uint8_t *x) {
+    uint64_t r = 0;
+    for (size_t i = 0; i < 8; ++i) {
+        r |= (uint64_t)x[i] << 8 * i;
+    }
+
+    return r;
+}
+
+/*************************************************
+ * Name:        store64
+ *
+ * Description: Store a 64-bit integer to a byte array in little-endian order
+ *
+ * Arguments:   - uint8_t *x: pointer to the output byte array
+ *              - uint64_t u: input 64-bit unsigned integer
+ **************************************************/
+static void store64(uint8_t *x, uint64_t u) {
+    for (size_t i = 0; i < 8; ++i) {
+        x[i] = (uint8_t) (u >> 8 * i);
+    }
+}
+
+
+/*
+ * KECCAK_WAY-way parallel version of prf_addr: lane j hashes
+ * pub_seed || addrx[8*j .. 8*j+7] || sk_seed and writes SPX_N bytes to out[j].
+ * The seeds come from ctx and are shared by all lanes.
+ */
+void prf_addrx(unsigned char *out[KECCAK_WAY],
+               const spx_ctx *ctx,
+               const uint32_t addrx[KECCAK_WAY*8]) {
+    /* As we write and read only a few quadwords, it is more efficient to
+     * build and extract from the multi-way SHAKE256 state by hand. */
+    // every state word is addressed as STATE_IDX(lane, word)
+    uint64_t state[KECCAK_WAY*25] = {0};
+
+
+    /* Words [0, SPX_N/8): the public seed, identical in every lane. */
+    for (int i = 0; i < SPX_N/8; i++) {
+        uint64_t x = load64(ctx->pub_seed + 8*i);
+        for( int j=0; j < KECCAK_WAY; j++ )
+            state[STATE_IDX(j,i)] = x;
+    }
+    /* Next 4 words: the lane's 8 address words, packed two per 64-bit word
+     * with the lower-indexed one in the low half. */
+    for (int i = 0; i < 4; i++) {
+        for( int j=0; j < KECCAK_WAY; j++ )
+            state[STATE_IDX(j,SPX_N/8 + i)] = (((uint64_t)addrx[j*8+1+2*i]) << 32)
+                                              | (uint64_t)addrx[j*8 +2*i];
+    }
+    /* Following SPX_N/8 words: the secret seed, identical in every lane. */
+    for (int i = 0; i < SPX_N/8; i++) {
+        uint64_t x = load64(ctx->sk_seed + 8*i);
+        for( int j=0; j < KECCAK_WAY; j++ )
+            state[STATE_IDX(j, SPX_N/8+i+4)] = x;
+    }
+
+
+    /* SHAKE domain separator and padding. */
+    for( int j=0; j < KECCAK_WAY; j++ )
+        state[STATE_IDX(j,SPX_N/4+4)] = 0x1f;
+
+    /* Top bit of word 16, i.e. of byte 135, the last byte of a 136-byte block.
+     * The shift is done on an unsigned value: 0x80ll << 56 would move a bit
+     * into the sign bit of a signed long long, which is undefined behaviour. */
+    for( int j=0; j < KECCAK_WAY; j++ )
+        state[STATE_IDX(j, 16)] = (uint64_t)0x80 << 56;
+
+    KeccakF1600_StatePermutex(state);
+
+    /* The first SPX_N bytes of each lane's state are that lane's output. */
+    for (int i = 0; i < SPX_N/8; i++) {
+        for( int j=0; j < KECCAK_WAY; j++ )
+            store64(out[j] + 8*i, state[STATE_IDX(j,i)]);
+    }
+}
diff --git a/sphincsplus/sphincsplus-keccakxN/hashx.h b/sphincsplus/sphincsplus-keccakxN/hashx.h
new file mode 100644
index 0000000..73cddad
--- /dev/null
+++ b/sphincsplus/sphincsplus-keccakxN/hashx.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2022 Arm Limited
+ * Copyright (c) 2022 Matthias Kannwischer
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// This implementation is based on the public domain implementation of SPHINCS+ +// available on https://github.com/sphincs/sphincsplus +// + +#ifndef SPX_HASHX_H +#define SPX_HASHX_H + +#include +#include "context.h" +#include "params.h" + +#define prf_addrx SPX_NAMESPACE(prf_addrx) +void prf_addrx(unsigned char *out[KECCAK_WAY], + const spx_ctx *ctx, + const uint32_t addrx[KECCAK_WAY*8]); + +#endif diff --git a/sphincsplus/sphincsplus-keccakxN/keccak_f1600 b/sphincsplus/sphincsplus-keccakxN/keccak_f1600 new file mode 120000 index 0000000..a48f341 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/keccak_f1600 @@ -0,0 +1 @@ +../../asm/manual/keccak_f1600 \ No newline at end of file diff --git a/sphincsplus/sphincsplus-keccakxN/keccak_f1600_dummy.s b/sphincsplus/sphincsplus-keccakxN/keccak_f1600_dummy.s new file mode 100644 index 0000000..2a01f96 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/keccak_f1600_dummy.s @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do 
so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// This implementation is based on the public domain implementation of SPHINCS+ +// available on https://github.com/sphincs/sphincsplus +// + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round.
*/ + Aba .req x1 + Abe .req x6 + Abi .req x11 + Abo .req x16 + Abu .req x21 + Aga .req x2 + Age .req x7 + Agi .req x12 + Ago .req x17 + Agu .req x22 + Aka .req x3 + Ake .req x8 + Aki .req x13 + Ako .req x18 + Aku .req x23 + Ama .req x4 + Ame .req x9 + Ami .req x14 + Amo .req x19 + Amu .req x24 + Asa .req x5 + Ase .req x10 + Asi .req x15 + Aso .req x20 + Asu .req x25 + +.macro load_input + ldr Aba, [input_addr, #(1*8*0)] + ldr Abe, [input_addr, #(1*8*1)] + ldr Abi, [input_addr, #(1*8*2)] + ldr Abo, [input_addr, #(1*8*3)] + ldr Abu, [input_addr, #(1*8*4)] + ldr Aga, [input_addr, #(1*8*5)] + ldr Age, [input_addr, #(1*8*6)] + ldr Agi, [input_addr, #(1*8*7)] + ldr Ago, [input_addr, #(1*8*8)] + ldr Agu, [input_addr, #(1*8*9)] + ldr Aka, [input_addr, #(1*8*10)] + ldr Ake, [input_addr, #(1*8*11)] + ldr Aki, [input_addr, #(1*8*12)] + ldr Ako, [input_addr, #(1*8*13)] + ldr Aku, [input_addr, #(1*8*14)] + ldr Ama, [input_addr, #(1*8*15)] + ldr Ame, [input_addr, #(1*8*16)] + ldr Ami, [input_addr, #(1*8*17)] + ldr Amo, [input_addr, #(1*8*18)] + ldr Amu, [input_addr, #(1*8*19)] + ldr Asa, [input_addr, #(1*8*20)] + ldr Ase, [input_addr, #(1*8*21)] + ldr Asi, [input_addr, #(1*8*22)] + ldr Aso, [input_addr, #(1*8*23)] + ldr Asu, [input_addr, #(1*8*24)] +.endm + +.macro store_input + str Aba, [input_addr, #(1*8*0)] + str Abe, [input_addr, #(1*8*1)] + str Abi, [input_addr, #(1*8*2)] + str Abo, [input_addr, #(1*8*3)] + str Abu, [input_addr, #(1*8*4)] + str Aga, [input_addr, #(1*8*5)] + str Age, [input_addr, #(1*8*6)] + str Agi, [input_addr, #(1*8*7)] + str Ago, [input_addr, #(1*8*8)] + str Agu, [input_addr, #(1*8*9)] + str Aka, [input_addr, #(1*8*10)] + str Ake, [input_addr, #(1*8*11)] + str Aki, [input_addr, #(1*8*12)] + str Ako, [input_addr, #(1*8*13)] + str Aku, [input_addr, #(1*8*14)] + str Ama, [input_addr, #(1*8*15)] + str Ame, [input_addr, #(1*8*16)] + str Ami, [input_addr, #(1*8*17)] + str Amo, [input_addr, #(1*8*18)] + str Amu, [input_addr, #(1*8*19)] + str Asa, [input_addr, 
#(1*8*20)] + str Ase, [input_addr, #(1*8*21)] + str Asi, [input_addr, #(1*8*22)] + str Aso, [input_addr, #(1*8*23)] + str Asu, [input_addr, #(1*8*24)] +.endm + +.macro save_gprs + stp x19, x20, [sp, #(16*0)] + stp x21, x22, [sp, #(16*1)] + stp x23, x24, [sp, #(16*2)] + stp x25, x26, [sp, #(16*3)] + stp x27, x28, [sp, #(16*4)] + stp x29, x30, [sp, #(16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(16*0)] + ldp x21, x22, [sp, #(16*1)] + ldp x23, x24, [sp, #(16*2)] + ldp x25, x26, [sp, #(16*3)] + ldp x27, x28, [sp, #(16*4)] + ldp x29, x30, [sp, #(16*5)] +.endm + +#define STACK_SIZE (16*6) + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.text +.align 4 +.global keccak_f1600_dummy +.global _keccak_f1600_dummy + +keccak_f1600_dummy: +_keccak_f1600_dummy: + alloc_stack + save_gprs + + load_input + store_input + + restore_gprs + free_stack + ret diff --git a/sphincsplus/sphincsplus-keccakxN/macros.s b/sphincsplus/sphincsplus-keccakxN/macros.s new file mode 100644 index 0000000..4fb551b --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/macros.s @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include "hal_env.h" + +.macro load_constant_ptr + ASM_LOAD(const_addr, round_constants) +.endm diff --git a/sphincsplus/sphincsplus-keccakxN/make_all.py b/sphincsplus/sphincsplus-keccakxN/make_all.py new file mode 100644 index 0000000..07b3ac4 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/make_all.py @@ -0,0 +1,76 @@ +# +# Copyright (c) 2022 Arm Limited +# Copyright (c) 2022 Matthias Kannwischer +# SPDX-License-Identifier: MIT +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+#
+
+#! /usr/bin/env python3
+
+import multiprocessing
+import subprocess
+import itertools
+import shutil
+import os
+import sys
+
+# Build matrix: one benchmark binary is produced per combination of the lists below.
+cores = ["X1", "A78", "A55", "X2", "A710", "A510"]
+fns = ['shake']
+options = ["f", "s"]
+sizes = [128, 192, 256]
+thashes = ['robust', 'simple']
+implementations = ['x3', 'x4', 'x5']
+bindir = "bin/"
+
+def nameFor(fn, opt, size, thash, impl):
+    """Return the build name, e.g. nameFor('shake', 'f', 128, 'simple', 'x4')
+    gives 'sphincs-shake-128f-simple_x4'."""
+    return f"sphincs-{fn}-{size}{opt}-{thash}_{impl}"
+
+def make(fn, opt, size, thash, impl, core, bindir):
+    """Build one benchmark binary and move it into bindir.
+
+    Runs `make clean` and then `make` with PARAMS, THASH, CORE, PLATFORM and
+    WAY overrides derived from the arguments, and moves the resulting
+    `benchmark` file to <bindir>/bench_<core>_<name>.
+
+    Raises ValueError if core has no platform mapping, and
+    subprocess.CalledProcessError if a make invocation fails (check=True).
+    """
+    # exist_ok avoids the check-then-create race of exists() followed by mkdir()
+    os.makedirs(bindir, exist_ok=True)
+
+    if core in ["X1", "A78", "A55"]:
+        platform = "v8"
+    elif core in ["X2", "A710", "A510"]:
+        platform = "v84"
+    else:
+        raise ValueError(f"no platform known for core {core!r}")
+
+    # 'x4' -> '4': the WAY override takes the bare number
+    way = impl.replace("x", "")
+
+
+    name = nameFor(fn, opt, size, thash, impl)
+    overrides = [f'PARAMS=sphincs-{fn}-{size}{opt}', 'THASH='+thash, 'CORE='+core, 'PLATFORM='+platform, 'WAY='+way]
+
+    sys.stderr.write(f"Compiling {name} for {core} …\n")
+    sys.stderr.flush()
+
+    # Clean first so objects built for the previous configuration are not reused.
+    subprocess.run(["make", "clean"] + overrides,
+                   stdout=subprocess.DEVNULL, stderr=sys.stderr, check=True)
+    subprocess.run(["make"] + overrides,
+                   stdout=subprocess.DEVNULL, stderr=sys.stderr, check=True)
+
+    # os.path.join does not double the separator when bindir ends in '/'
+    shutil.move("benchmark", os.path.join(bindir, f"bench_{core}_{name}"))
+
+for fn in fns:
+    combinations = itertools.product(options, sizes, thashes, implementations, cores)
+    for opt, size, thash, impl, core in combinations:
+        make(fn, opt, size, thash, impl, core, bindir)
diff --git a/sphincsplus/sphincsplus-keccakxN/merkle.c b/sphincsplus/sphincsplus-keccakxN/merkle.c
new file mode 100644
index 0000000..0b79abd
--- /dev/null
+++ b/sphincsplus/sphincsplus-keccakxN/merkle.c
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2022 Arm Limited
+ * Copyright (c)
2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// This implementation is based on the public domain implementation of SPHINCS+ +// available on https://github.com/sphincs/sphincsplus +// + +#include +#include + +#include "utils.h" +#include "utilsx.h" +#include "wots.h" +#include "wotsx.h" +#include "merkle.h" +#include "address.h" +#include "params.h" + +/* + * This generates a Merkle signature (WOTS signature followed by the Merkle + * authentication path). 
+ */ +void merkle_sign(uint8_t *sig, unsigned char *root, + const spx_ctx* ctx, + uint32_t wots_addr[8], uint32_t tree_addr[8], + uint32_t idx_leaf) +{ + unsigned char *auth_path = sig + SPX_WOTS_BYTES; + int j; + struct leaf_info_x info = { 0 }; + unsigned steps[ SPX_WOTS_LEN ]; + + info.wots_sig = sig; + chain_lengths(steps, root); + info.wots_steps = steps; + + set_type(&tree_addr[0], SPX_ADDR_TYPE_HASHTREE); + for (j=0; j + +/* Generate a Merkle signature (WOTS signature followed by the Merkle */ +/* authentication path) */ +#define merkle_sign SPX_NAMESPACE(merkle_sign) +void merkle_sign(uint8_t *sig, unsigned char *root, + const spx_ctx* ctx, + uint32_t wots_addr[8], uint32_t tree_addr[8], + uint32_t idx_leaf); + +/* Compute the root node of the top-most subtree. */ +#define merkle_gen_root SPX_NAMESPACE(merkle_gen_root) +void merkle_gen_root(unsigned char *root, const spx_ctx* ctx); + +#endif /* MERKLE_H_ */ diff --git a/sphincsplus/sphincsplus-keccakxN/params.h b/sphincsplus/sphincsplus-keccakxN/params.h new file mode 100644 index 0000000..3c4ead4 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/params.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// This implementation is based on the public domain implementation of SPHINCS+ +// available on https://github.com/sphincs/sphincsplus +// + +#define str(s) #s +#define xstr(s) str(s) + +#include xstr(params/params-PARAMS.h) diff --git a/sphincsplus/sphincsplus-keccakxN/params/params-sphincs-shake-128f.h b/sphincsplus/sphincsplus-keccakxN/params/params-sphincs-shake-128f.h new file mode 100644 index 0000000..8f77692 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/params/params-sphincs-shake-128f.h @@ -0,0 +1,80 @@ +#ifndef SPX_PARAMS_H +#define SPX_PARAMS_H + +#define SPX_NAMESPACE(s) SPX_##s + +/* Hash output length in bytes. */ +#define SPX_N 16 +/* Height of the hypertree. */ +#define SPX_FULL_HEIGHT 66 +/* Number of subtree layer. */ +#define SPX_D 22 +/* FORS tree dimensions. */ +#define SPX_FORS_HEIGHT 6 +#define SPX_FORS_TREES 33 +/* Winternitz parameter, */ +#define SPX_WOTS_W 16 + +/* The hash function is defined by linking a different hash.c file, as opposed + to setting a #define constant. */ + +/* For clarity */ +#define SPX_ADDR_BYTES 32 + +/* WOTS parameters. 
*/ +#if SPX_WOTS_W == 256 + #define SPX_WOTS_LOGW 8 +#elif SPX_WOTS_W == 16 + #define SPX_WOTS_LOGW 4 +#else + #error SPX_WOTS_W assumed 16 or 256 +#endif + +#define SPX_WOTS_LEN1 (8 * SPX_N / SPX_WOTS_LOGW) + +/* SPX_WOTS_LEN2 is floor(log(len_1 * (w - 1)) / log(w)) + 1; we precompute */ +#if SPX_WOTS_W == 256 + #if SPX_N <= 1 + #define SPX_WOTS_LEN2 1 + #elif SPX_N <= 256 + #define SPX_WOTS_LEN2 2 + #else + #error Did not precompute SPX_WOTS_LEN2 for n outside {2, .., 256} + #endif +#elif SPX_WOTS_W == 16 + #if SPX_N <= 8 + #define SPX_WOTS_LEN2 2 + #elif SPX_N <= 136 + #define SPX_WOTS_LEN2 3 + #elif SPX_N <= 256 + #define SPX_WOTS_LEN2 4 + #else + #error Did not precompute SPX_WOTS_LEN2 for n outside {2, .., 256} + #endif +#endif + +#define SPX_WOTS_LEN (SPX_WOTS_LEN1 + SPX_WOTS_LEN2) +#define SPX_WOTS_BYTES (SPX_WOTS_LEN * SPX_N) +#define SPX_WOTS_PK_BYTES SPX_WOTS_BYTES + +/* Subtree size. */ +#define SPX_TREE_HEIGHT (SPX_FULL_HEIGHT / SPX_D) + +#if SPX_TREE_HEIGHT * SPX_D != SPX_FULL_HEIGHT + #error SPX_D should always divide SPX_FULL_HEIGHT +#endif + +/* FORS parameters. */ +#define SPX_FORS_MSG_BYTES ((SPX_FORS_HEIGHT * SPX_FORS_TREES + 7) / 8) +#define SPX_FORS_BYTES ((SPX_FORS_HEIGHT + 1) * SPX_FORS_TREES * SPX_N) +#define SPX_FORS_PK_BYTES SPX_N + +/* Resulting SPX sizes. */ +#define SPX_BYTES (SPX_N + SPX_FORS_BYTES + SPX_D * SPX_WOTS_BYTES +\ + SPX_FULL_HEIGHT * SPX_N) +#define SPX_PK_BYTES (2 * SPX_N) +#define SPX_SK_BYTES (2 * SPX_N + SPX_PK_BYTES) + +#include "../shake_offsets.h" + +#endif diff --git a/sphincsplus/sphincsplus-keccakxN/params/params-sphincs-shake-128s.h b/sphincsplus/sphincsplus-keccakxN/params/params-sphincs-shake-128s.h new file mode 100644 index 0000000..a4d1e13 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/params/params-sphincs-shake-128s.h @@ -0,0 +1,80 @@ +#ifndef SPX_PARAMS_H +#define SPX_PARAMS_H + +#define SPX_NAMESPACE(s) SPX_##s + +/* Hash output length in bytes. */ +#define SPX_N 16 +/* Height of the hypertree. 
*/ +#define SPX_FULL_HEIGHT 63 +/* Number of subtree layer. */ +#define SPX_D 7 +/* FORS tree dimensions. */ +#define SPX_FORS_HEIGHT 12 +#define SPX_FORS_TREES 14 +/* Winternitz parameter, */ +#define SPX_WOTS_W 16 + +/* The hash function is defined by linking a different hash.c file, as opposed + to setting a #define constant. */ + +/* For clarity */ +#define SPX_ADDR_BYTES 32 + +/* WOTS parameters. */ +#if SPX_WOTS_W == 256 + #define SPX_WOTS_LOGW 8 +#elif SPX_WOTS_W == 16 + #define SPX_WOTS_LOGW 4 +#else + #error SPX_WOTS_W assumed 16 or 256 +#endif + +#define SPX_WOTS_LEN1 (8 * SPX_N / SPX_WOTS_LOGW) + +/* SPX_WOTS_LEN2 is floor(log(len_1 * (w - 1)) / log(w)) + 1; we precompute */ +#if SPX_WOTS_W == 256 + #if SPX_N <= 1 + #define SPX_WOTS_LEN2 1 + #elif SPX_N <= 256 + #define SPX_WOTS_LEN2 2 + #else + #error Did not precompute SPX_WOTS_LEN2 for n outside {2, .., 256} + #endif +#elif SPX_WOTS_W == 16 + #if SPX_N <= 8 + #define SPX_WOTS_LEN2 2 + #elif SPX_N <= 136 + #define SPX_WOTS_LEN2 3 + #elif SPX_N <= 256 + #define SPX_WOTS_LEN2 4 + #else + #error Did not precompute SPX_WOTS_LEN2 for n outside {2, .., 256} + #endif +#endif + +#define SPX_WOTS_LEN (SPX_WOTS_LEN1 + SPX_WOTS_LEN2) +#define SPX_WOTS_BYTES (SPX_WOTS_LEN * SPX_N) +#define SPX_WOTS_PK_BYTES SPX_WOTS_BYTES + +/* Subtree size. */ +#define SPX_TREE_HEIGHT (SPX_FULL_HEIGHT / SPX_D) + +#if SPX_TREE_HEIGHT * SPX_D != SPX_FULL_HEIGHT + #error SPX_D should always divide SPX_FULL_HEIGHT +#endif + +/* FORS parameters. */ +#define SPX_FORS_MSG_BYTES ((SPX_FORS_HEIGHT * SPX_FORS_TREES + 7) / 8) +#define SPX_FORS_BYTES ((SPX_FORS_HEIGHT + 1) * SPX_FORS_TREES * SPX_N) +#define SPX_FORS_PK_BYTES SPX_N + +/* Resulting SPX sizes. 
*/ +#define SPX_BYTES (SPX_N + SPX_FORS_BYTES + SPX_D * SPX_WOTS_BYTES +\ + SPX_FULL_HEIGHT * SPX_N) +#define SPX_PK_BYTES (2 * SPX_N) +#define SPX_SK_BYTES (2 * SPX_N + SPX_PK_BYTES) + +#include "../shake_offsets.h" + +#endif diff --git a/sphincsplus/sphincsplus-keccakxN/params/params-sphincs-shake-192f.h b/sphincsplus/sphincsplus-keccakxN/params/params-sphincs-shake-192f.h new file mode 100644 index 0000000..b1e73d1 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/params/params-sphincs-shake-192f.h @@ -0,0 +1,80 @@ +#ifndef SPX_PARAMS_H +#define SPX_PARAMS_H + +#define SPX_NAMESPACE(s) SPX_##s + +/* Hash output length in bytes. */ +#define SPX_N 24 +/* Height of the hypertree. */ +#define SPX_FULL_HEIGHT 66 +/* Number of subtree layer. */ +#define SPX_D 22 +/* FORS tree dimensions. */ +#define SPX_FORS_HEIGHT 8 +#define SPX_FORS_TREES 33 +/* Winternitz parameter, */ +#define SPX_WOTS_W 16 + +/* The hash function is defined by linking a different hash.c file, as opposed + to setting a #define constant. */ + +/* For clarity */ +#define SPX_ADDR_BYTES 32 + +/* WOTS parameters. 
*/ +#if SPX_WOTS_W == 256 + #define SPX_WOTS_LOGW 8 +#elif SPX_WOTS_W == 16 + #define SPX_WOTS_LOGW 4 +#else + #error SPX_WOTS_W assumed 16 or 256 +#endif + +#define SPX_WOTS_LEN1 (8 * SPX_N / SPX_WOTS_LOGW) + +/* SPX_WOTS_LEN2 is floor(log(len_1 * (w - 1)) / log(w)) + 1; we precompute */ +#if SPX_WOTS_W == 256 + #if SPX_N <= 1 + #define SPX_WOTS_LEN2 1 + #elif SPX_N <= 256 + #define SPX_WOTS_LEN2 2 + #else + #error Did not precompute SPX_WOTS_LEN2 for n outside {2, .., 256} + #endif +#elif SPX_WOTS_W == 16 + #if SPX_N <= 8 + #define SPX_WOTS_LEN2 2 + #elif SPX_N <= 136 + #define SPX_WOTS_LEN2 3 + #elif SPX_N <= 256 + #define SPX_WOTS_LEN2 4 + #else + #error Did not precompute SPX_WOTS_LEN2 for n outside {2, .., 256} + #endif +#endif + +#define SPX_WOTS_LEN (SPX_WOTS_LEN1 + SPX_WOTS_LEN2) +#define SPX_WOTS_BYTES (SPX_WOTS_LEN * SPX_N) +#define SPX_WOTS_PK_BYTES SPX_WOTS_BYTES + +/* Subtree size. */ +#define SPX_TREE_HEIGHT (SPX_FULL_HEIGHT / SPX_D) + +#if SPX_TREE_HEIGHT * SPX_D != SPX_FULL_HEIGHT + #error SPX_D should always divide SPX_FULL_HEIGHT +#endif + +/* FORS parameters. */ +#define SPX_FORS_MSG_BYTES ((SPX_FORS_HEIGHT * SPX_FORS_TREES + 7) / 8) +#define SPX_FORS_BYTES ((SPX_FORS_HEIGHT + 1) * SPX_FORS_TREES * SPX_N) +#define SPX_FORS_PK_BYTES SPX_N + +/* Resulting SPX sizes. */ +#define SPX_BYTES (SPX_N + SPX_FORS_BYTES + SPX_D * SPX_WOTS_BYTES +\ + SPX_FULL_HEIGHT * SPX_N) +#define SPX_PK_BYTES (2 * SPX_N) +#define SPX_SK_BYTES (2 * SPX_N + SPX_PK_BYTES) + +#include "../shake_offsets.h" + +#endif diff --git a/sphincsplus/sphincsplus-keccakxN/params/params-sphincs-shake-192s.h b/sphincsplus/sphincsplus-keccakxN/params/params-sphincs-shake-192s.h new file mode 100644 index 0000000..0882e1c --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/params/params-sphincs-shake-192s.h @@ -0,0 +1,80 @@ +#ifndef SPX_PARAMS_H +#define SPX_PARAMS_H + +#define SPX_NAMESPACE(s) SPX_##s + +/* Hash output length in bytes. */ +#define SPX_N 24 +/* Height of the hypertree. 
*/ +#define SPX_FULL_HEIGHT 63 +/* Number of subtree layer. */ +#define SPX_D 7 +/* FORS tree dimensions. */ +#define SPX_FORS_HEIGHT 14 +#define SPX_FORS_TREES 17 +/* Winternitz parameter, */ +#define SPX_WOTS_W 16 + +/* The hash function is defined by linking a different hash.c file, as opposed + to setting a #define constant. */ + +/* For clarity */ +#define SPX_ADDR_BYTES 32 + +/* WOTS parameters. */ +#if SPX_WOTS_W == 256 + #define SPX_WOTS_LOGW 8 +#elif SPX_WOTS_W == 16 + #define SPX_WOTS_LOGW 4 +#else + #error SPX_WOTS_W assumed 16 or 256 +#endif + +#define SPX_WOTS_LEN1 (8 * SPX_N / SPX_WOTS_LOGW) + +/* SPX_WOTS_LEN2 is floor(log(len_1 * (w - 1)) / log(w)) + 1; we precompute */ +#if SPX_WOTS_W == 256 + #if SPX_N <= 1 + #define SPX_WOTS_LEN2 1 + #elif SPX_N <= 256 + #define SPX_WOTS_LEN2 2 + #else + #error Did not precompute SPX_WOTS_LEN2 for n outside {2, .., 256} + #endif +#elif SPX_WOTS_W == 16 + #if SPX_N <= 8 + #define SPX_WOTS_LEN2 2 + #elif SPX_N <= 136 + #define SPX_WOTS_LEN2 3 + #elif SPX_N <= 256 + #define SPX_WOTS_LEN2 4 + #else + #error Did not precompute SPX_WOTS_LEN2 for n outside {2, .., 256} + #endif +#endif + +#define SPX_WOTS_LEN (SPX_WOTS_LEN1 + SPX_WOTS_LEN2) +#define SPX_WOTS_BYTES (SPX_WOTS_LEN * SPX_N) +#define SPX_WOTS_PK_BYTES SPX_WOTS_BYTES + +/* Subtree size. */ +#define SPX_TREE_HEIGHT (SPX_FULL_HEIGHT / SPX_D) + +#if SPX_TREE_HEIGHT * SPX_D != SPX_FULL_HEIGHT + #error SPX_D should always divide SPX_FULL_HEIGHT +#endif + +/* FORS parameters. */ +#define SPX_FORS_MSG_BYTES ((SPX_FORS_HEIGHT * SPX_FORS_TREES + 7) / 8) +#define SPX_FORS_BYTES ((SPX_FORS_HEIGHT + 1) * SPX_FORS_TREES * SPX_N) +#define SPX_FORS_PK_BYTES SPX_N + +/* Resulting SPX sizes. 
*/ +#define SPX_BYTES (SPX_N + SPX_FORS_BYTES + SPX_D * SPX_WOTS_BYTES +\ + SPX_FULL_HEIGHT * SPX_N) +#define SPX_PK_BYTES (2 * SPX_N) +#define SPX_SK_BYTES (2 * SPX_N + SPX_PK_BYTES) + +#include "../shake_offsets.h" + +#endif diff --git a/sphincsplus/sphincsplus-keccakxN/params/params-sphincs-shake-256f.h b/sphincsplus/sphincsplus-keccakxN/params/params-sphincs-shake-256f.h new file mode 100644 index 0000000..e301c28 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/params/params-sphincs-shake-256f.h @@ -0,0 +1,80 @@ +#ifndef SPX_PARAMS_H +#define SPX_PARAMS_H + +#define SPX_NAMESPACE(s) SPX_##s + +/* Hash output length in bytes. */ +#define SPX_N 32 +/* Height of the hypertree. */ +#define SPX_FULL_HEIGHT 68 +/* Number of subtree layer. */ +#define SPX_D 17 +/* FORS tree dimensions. */ +#define SPX_FORS_HEIGHT 9 +#define SPX_FORS_TREES 35 +/* Winternitz parameter, */ +#define SPX_WOTS_W 16 + +/* The hash function is defined by linking a different hash.c file, as opposed + to setting a #define constant. */ + +/* For clarity */ +#define SPX_ADDR_BYTES 32 + +/* WOTS parameters. 
*/ +#if SPX_WOTS_W == 256 + #define SPX_WOTS_LOGW 8 +#elif SPX_WOTS_W == 16 + #define SPX_WOTS_LOGW 4 +#else + #error SPX_WOTS_W assumed 16 or 256 +#endif + +#define SPX_WOTS_LEN1 (8 * SPX_N / SPX_WOTS_LOGW) + +/* SPX_WOTS_LEN2 is floor(log(len_1 * (w - 1)) / log(w)) + 1; we precompute */ +#if SPX_WOTS_W == 256 + #if SPX_N <= 1 + #define SPX_WOTS_LEN2 1 + #elif SPX_N <= 256 + #define SPX_WOTS_LEN2 2 + #else + #error Did not precompute SPX_WOTS_LEN2 for n outside {2, .., 256} + #endif +#elif SPX_WOTS_W == 16 + #if SPX_N <= 8 + #define SPX_WOTS_LEN2 2 + #elif SPX_N <= 136 + #define SPX_WOTS_LEN2 3 + #elif SPX_N <= 256 + #define SPX_WOTS_LEN2 4 + #else + #error Did not precompute SPX_WOTS_LEN2 for n outside {2, .., 256} + #endif +#endif + +#define SPX_WOTS_LEN (SPX_WOTS_LEN1 + SPX_WOTS_LEN2) +#define SPX_WOTS_BYTES (SPX_WOTS_LEN * SPX_N) +#define SPX_WOTS_PK_BYTES SPX_WOTS_BYTES + +/* Subtree size. */ +#define SPX_TREE_HEIGHT (SPX_FULL_HEIGHT / SPX_D) + +#if SPX_TREE_HEIGHT * SPX_D != SPX_FULL_HEIGHT + #error SPX_D should always divide SPX_FULL_HEIGHT +#endif + +/* FORS parameters. */ +#define SPX_FORS_MSG_BYTES ((SPX_FORS_HEIGHT * SPX_FORS_TREES + 7) / 8) +#define SPX_FORS_BYTES ((SPX_FORS_HEIGHT + 1) * SPX_FORS_TREES * SPX_N) +#define SPX_FORS_PK_BYTES SPX_N + +/* Resulting SPX sizes. */ +#define SPX_BYTES (SPX_N + SPX_FORS_BYTES + SPX_D * SPX_WOTS_BYTES +\ + SPX_FULL_HEIGHT * SPX_N) +#define SPX_PK_BYTES (2 * SPX_N) +#define SPX_SK_BYTES (2 * SPX_N + SPX_PK_BYTES) + +#include "../shake_offsets.h" + +#endif diff --git a/sphincsplus/sphincsplus-keccakxN/params/params-sphincs-shake-256s.h b/sphincsplus/sphincsplus-keccakxN/params/params-sphincs-shake-256s.h new file mode 100644 index 0000000..0a96894 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/params/params-sphincs-shake-256s.h @@ -0,0 +1,80 @@ +#ifndef SPX_PARAMS_H +#define SPX_PARAMS_H + +#define SPX_NAMESPACE(s) SPX_##s + +/* Hash output length in bytes. */ +#define SPX_N 32 +/* Height of the hypertree. 
*/ +#define SPX_FULL_HEIGHT 64 +/* Number of subtree layer. */ +#define SPX_D 8 +/* FORS tree dimensions. */ +#define SPX_FORS_HEIGHT 14 +#define SPX_FORS_TREES 22 +/* Winternitz parameter, */ +#define SPX_WOTS_W 16 + +/* The hash function is defined by linking a different hash.c file, as opposed + to setting a #define constant. */ + +/* For clarity */ +#define SPX_ADDR_BYTES 32 + +/* WOTS parameters. */ +#if SPX_WOTS_W == 256 + #define SPX_WOTS_LOGW 8 +#elif SPX_WOTS_W == 16 + #define SPX_WOTS_LOGW 4 +#else + #error SPX_WOTS_W assumed 16 or 256 +#endif + +#define SPX_WOTS_LEN1 (8 * SPX_N / SPX_WOTS_LOGW) + +/* SPX_WOTS_LEN2 is floor(log(len_1 * (w - 1)) / log(w)) + 1; we precompute */ +#if SPX_WOTS_W == 256 + #if SPX_N <= 1 + #define SPX_WOTS_LEN2 1 + #elif SPX_N <= 256 + #define SPX_WOTS_LEN2 2 + #else + #error Did not precompute SPX_WOTS_LEN2 for n outside {2, .., 256} + #endif +#elif SPX_WOTS_W == 16 + #if SPX_N <= 8 + #define SPX_WOTS_LEN2 2 + #elif SPX_N <= 136 + #define SPX_WOTS_LEN2 3 + #elif SPX_N <= 256 + #define SPX_WOTS_LEN2 4 + #else + #error Did not precompute SPX_WOTS_LEN2 for n outside {2, .., 256} + #endif +#endif + +#define SPX_WOTS_LEN (SPX_WOTS_LEN1 + SPX_WOTS_LEN2) +#define SPX_WOTS_BYTES (SPX_WOTS_LEN * SPX_N) +#define SPX_WOTS_PK_BYTES SPX_WOTS_BYTES + +/* Subtree size. */ +#define SPX_TREE_HEIGHT (SPX_FULL_HEIGHT / SPX_D) + +#if SPX_TREE_HEIGHT * SPX_D != SPX_FULL_HEIGHT + #error SPX_D should always divide SPX_FULL_HEIGHT +#endif + +/* FORS parameters. */ +#define SPX_FORS_MSG_BYTES ((SPX_FORS_HEIGHT * SPX_FORS_TREES + 7) / 8) +#define SPX_FORS_BYTES ((SPX_FORS_HEIGHT + 1) * SPX_FORS_TREES * SPX_N) +#define SPX_FORS_PK_BYTES SPX_N + +/* Resulting SPX sizes. 
*/ +#define SPX_BYTES (SPX_N + SPX_FORS_BYTES + SPX_D * SPX_WOTS_BYTES +\ + SPX_FULL_HEIGHT * SPX_N) +#define SPX_PK_BYTES (2 * SPX_N) +#define SPX_SK_BYTES (2 * SPX_N + SPX_PK_BYTES) + +#include "../shake_offsets.h" + +#endif diff --git a/sphincsplus/sphincsplus-keccakxN/randombytes.h b/sphincsplus/sphincsplus-keccakxN/randombytes.h new file mode 100644 index 0000000..4e9fa03 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/randombytes.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// This implementation is based on the public domain implementation of SPHINCS+ +// available on https://github.com/sphincs/sphincsplus +// + +#ifndef SPX_RANDOMBYTES_H +#define SPX_RANDOMBYTES_H +#include +#include + +extern void randombytes(uint8_t * x, size_t xlen); + +#endif diff --git a/sphincsplus/sphincsplus-keccakxN/shake_offsets.h b/sphincsplus/sphincsplus-keccakxN/shake_offsets.h new file mode 100644 index 0000000..2036c30 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/shake_offsets.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// This implementation is based on the public domain implementation of SPHINCS+ +// available on https://github.com/sphincs/sphincsplus +// + +#if !defined( SHAKE_OFFSETS_H_ ) +#define SHAKE_OFFSETS_H_ + +/* + * Offsets of various fields in the address structure when we use SHAKE as + * the Sphincs+ hash function + */ + +#define SPX_OFFSET_LAYER 3 /* The byte used to specify the Merkle tree layer */ +#define SPX_OFFSET_TREE 8 /* The start of the 8 byte field used to specify the tree */ +#define SPX_OFFSET_TYPE 19 /* The byte used to specify the hash type (reason) */ +#define SPX_OFFSET_KP_ADDR2 22 /* The high byte used to specify the key pair (which one-time signature) */ +#define SPX_OFFSET_KP_ADDR1 23 /* The low byte used to specify the key pair */ +#define SPX_OFFSET_CHAIN_ADDR 27 /* The byte used to specify the chain address (which Winternitz chain) */ +#define SPX_OFFSET_HASH_ADDR 31 /* The byte used to specify the hash address (where in the Winternitz chain) */ +#define SPX_OFFSET_TREE_HGT 27 /* The byte used to specify the height of this node in the FORS or Merkle tree */ +#define SPX_OFFSET_TREE_INDEX 28 /* The start of the 4 byte field used to specify the node in the FORS or Merkle tree */ + +#define SPX_SHAKE 1 + +#endif /* SHAKE_OFFSETS_H_ */ diff --git a/sphincsplus/sphincsplus-keccakxN/sign.c b/sphincsplus/sphincsplus-keccakxN/sign.c new file mode 100644 index 0000000..da4e454 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/sign.c @@ -0,0 +1,317 @@ +/* + * Copyright (c) 2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons 
to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// This implementation is based on the public domain implementation of SPHINCS+ +// available on https://github.com/sphincs/sphincsplus +// + +#include +#include +#include + +#include "api.h" +#include "params.h" +#include "wots.h" +#include "fors.h" +#include "hash.h" +#include "thash.h" +#include "address.h" +#include "randombytes.h" +#include "utils.h" +#include "merkle.h" + +/* + * Returns the length of a secret key, in bytes + */ +unsigned long long crypto_sign_secretkeybytes(void) +{ + return CRYPTO_SECRETKEYBYTES; +} + +/* + * Returns the length of a public key, in bytes + */ +unsigned long long crypto_sign_publickeybytes(void) +{ + return CRYPTO_PUBLICKEYBYTES; +} + +/* + * Returns the length of a signature, in bytes + */ +unsigned long long crypto_sign_bytes(void) +{ + return CRYPTO_BYTES; +} + +/* + * Returns the length of the seed required to generate a key pair, in bytes + */ +unsigned long long crypto_sign_seedbytes(void) +{ + return CRYPTO_SEEDBYTES; +} + +/* + * Generates an SPX key pair given a seed of length + * Format sk: [SK_SEED || SK_PRF || PUB_SEED || root] + * Format pk: [PUB_SEED || root] + */ +int crypto_sign_seed_keypair(unsigned char *pk, unsigned char *sk, + const unsigned char *seed) +{ + spx_ctx ctx; + + /* 
Initialize SK_SEED, SK_PRF and PUB_SEED from seed. */ + memcpy(sk, seed, CRYPTO_SEEDBYTES); + + memcpy(pk, sk + 2*SPX_N, SPX_N); + + memcpy(ctx.pub_seed, pk, SPX_N); + memcpy(ctx.sk_seed, sk, SPX_N); + + /* This hook allows the hash function instantiation to do whatever + preparation or computation it needs, based on the public seed. */ + initialize_hash_function(&ctx); + + /* Compute root node of the top-most subtree. */ + merkle_gen_root(sk + 3*SPX_N, &ctx); + + memcpy(pk + SPX_N, sk + 3*SPX_N, SPX_N); + + return 0; +} + +/* + * Generates an SPX key pair. + * Format sk: [SK_SEED || SK_PRF || PUB_SEED || root] + * Format pk: [PUB_SEED || root] + */ +int crypto_sign_keypair(unsigned char *pk, unsigned char *sk) +{ + unsigned char seed[CRYPTO_SEEDBYTES]; + randombytes(seed, CRYPTO_SEEDBYTES); + crypto_sign_seed_keypair(pk, sk, seed); + + return 0; +} + +/** + * Returns an array containing a detached signature. + */ +int crypto_sign_signature(uint8_t *sig, size_t *siglen, + const uint8_t *m, size_t mlen, const uint8_t *sk) +{ + spx_ctx ctx; + + const unsigned char *sk_prf = sk + SPX_N; + const unsigned char *pk = sk + 2*SPX_N; + + unsigned char optrand[SPX_N]; + unsigned char mhash[SPX_FORS_MSG_BYTES]; + unsigned char root[SPX_N]; + unsigned long long i; + uint64_t tree; + uint32_t idx_leaf; + uint32_t wots_addr[8] = {0}; + uint32_t tree_addr[8] = {0}; + + memcpy(ctx.sk_seed, sk, SPX_N); + memcpy(ctx.pub_seed, pk, SPX_N); + + /* This hook allows the hash function instantiation to do whatever + preparation or computation it needs, based on the public seed. */ + initialize_hash_function(&ctx); + + set_type(wots_addr, SPX_ADDR_TYPE_WOTS); + set_type(tree_addr, SPX_ADDR_TYPE_HASHTREE); + + /* Optionally, signing can be made non-deterministic using optrand. + This can help counter side-channel attacks that would benefit from + getting a large number of traces when the signer uses the same nodes. 
*/ + randombytes(optrand, SPX_N); + /* Compute the digest randomization value. */ + gen_message_random(sig, sk_prf, optrand, m, mlen, &ctx); + + /* Derive the message digest and leaf index from R, PK and M. */ + hash_message(mhash, &tree, &idx_leaf, sig, pk, m, mlen, &ctx); + sig += SPX_N; + + set_tree_addr(wots_addr, tree); + set_keypair_addr(wots_addr, idx_leaf); + + /* Sign the message hash using FORS. */ + fors_sign(sig, root, mhash, &ctx, wots_addr); + sig += SPX_FORS_BYTES; + + for (i = 0; i < SPX_D; i++) { + set_layer_addr(tree_addr, i); + set_tree_addr(tree_addr, tree); + + copy_subtree_addr(wots_addr, tree_addr); + set_keypair_addr(wots_addr, idx_leaf); + + merkle_sign(sig, root, &ctx, wots_addr, tree_addr, idx_leaf); + sig += SPX_WOTS_BYTES + SPX_TREE_HEIGHT * SPX_N; + + /* Update the indices for the next layer. */ + idx_leaf = (tree & ((1 << SPX_TREE_HEIGHT)-1)); + tree = tree >> SPX_TREE_HEIGHT; + } + + *siglen = SPX_BYTES; + + return 0; +} + +/** + * Verifies a detached signature and message under a given public key. + */ +int crypto_sign_verify(const uint8_t *sig, size_t siglen, + const uint8_t *m, size_t mlen, const uint8_t *pk) +{ + spx_ctx ctx; + const unsigned char *pub_root = pk + SPX_N; + unsigned char mhash[SPX_FORS_MSG_BYTES]; + unsigned char wots_pk[SPX_WOTS_BYTES]; + unsigned char root[SPX_N]; + unsigned char leaf[SPX_N]; + unsigned int i; + uint64_t tree; + uint32_t idx_leaf; + uint32_t wots_addr[8] = {0}; + uint32_t tree_addr[8] = {0}; + uint32_t wots_pk_addr[8] = {0}; + + if (siglen != SPX_BYTES) { + return -1; + } + + memcpy(ctx.pub_seed, pk, SPX_N); + + /* This hook allows the hash function instantiation to do whatever + preparation or computation it needs, based on the public seed. */ + initialize_hash_function(&ctx); + + set_type(wots_addr, SPX_ADDR_TYPE_WOTS); + set_type(tree_addr, SPX_ADDR_TYPE_HASHTREE); + set_type(wots_pk_addr, SPX_ADDR_TYPE_WOTSPK); + + /* Derive the message digest and leaf index from R || PK || M. 
*/ + /* The additional SPX_N is a result of the hash domain separator. */ + hash_message(mhash, &tree, &idx_leaf, sig, pk, m, mlen, &ctx); + sig += SPX_N; + + /* Layer correctly defaults to 0, so no need to set_layer_addr */ + set_tree_addr(wots_addr, tree); + set_keypair_addr(wots_addr, idx_leaf); + + fors_pk_from_sig(root, sig, mhash, &ctx, wots_addr); + sig += SPX_FORS_BYTES; + + /* For each subtree.. */ + for (i = 0; i < SPX_D; i++) { + set_layer_addr(tree_addr, i); + set_tree_addr(tree_addr, tree); + + copy_subtree_addr(wots_addr, tree_addr); + set_keypair_addr(wots_addr, idx_leaf); + + copy_keypair_addr(wots_pk_addr, wots_addr); + + /* The WOTS public key is only correct if the signature was correct. */ + /* Initially, root is the FORS pk, but on subsequent iterations it is + the root of the subtree below the currently processed subtree. */ + wots_pk_from_sig(wots_pk, sig, root, &ctx, wots_addr); + sig += SPX_WOTS_BYTES; + + /* Compute the leaf node using the WOTS public key. */ + thash(leaf, wots_pk, SPX_WOTS_LEN, &ctx, wots_pk_addr); + + /* Compute the root node of this subtree. */ + compute_root(root, leaf, idx_leaf, 0, sig, SPX_TREE_HEIGHT, + &ctx, tree_addr); + sig += SPX_TREE_HEIGHT * SPX_N; + + /* Update the indices for the next layer. */ + idx_leaf = (tree & ((1 << SPX_TREE_HEIGHT)-1)); + tree = tree >> SPX_TREE_HEIGHT; + } + + /* Check if the root node equals the root node in the public key. */ + if (memcmp(root, pub_root, SPX_N)) { + return -1; + } + + return 0; +} + + +/** + * Returns an array containing the signature followed by the message. + */ +int crypto_sign(unsigned char *sm, unsigned long long *smlen, + const unsigned char *m, unsigned long long mlen, + const unsigned char *sk) +{ + size_t siglen; + + crypto_sign_signature(sm, &siglen, m, (size_t)mlen, sk); + + memmove(sm + SPX_BYTES, m, mlen); + *smlen = siglen + mlen; + + return 0; +} + +/** + * Verifies a given signature-message pair under a given public key. 
+ */ +int crypto_sign_open(unsigned char *m, unsigned long long *mlen, + const unsigned char *sm, unsigned long long smlen, + const unsigned char *pk) +{ + /* The API caller does not necessarily know what size a signature should be + but SPHINCS+ signatures are always exactly SPX_BYTES. */ + if (smlen < SPX_BYTES) { + memset(m, 0, smlen); + *mlen = 0; + return -1; + } + + *mlen = smlen - SPX_BYTES; + + if (crypto_sign_verify(sm, SPX_BYTES, sm + SPX_BYTES, (size_t)*mlen, pk)) { + memset(m, 0, smlen); + *mlen = 0; + return -1; + } + + /* If verification was successful, move the message to the right place. */ + memmove(m, sm + SPX_BYTES, *mlen); + + return 0; +} diff --git a/sphincsplus/sphincsplus-keccakxN/test/benchmark.c b/sphincsplus/sphincsplus-keccakxN/test/benchmark.c new file mode 100644 index 0000000..2ded7e8 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/test/benchmark.c @@ -0,0 +1,210 @@ +#define _POSIX_C_SOURCE 199309L + +#include +#include +#include + +#include "../thash.h" +#include "../thashx.h" +#include "../api.h" +#include "../f1600x.h" +#include "../fors.h" +#include "../wots.h" +#include "../wotsx.h" +#include "../params.h" +#include "../randombytes.h" +#include "../f1600x.h" +#include "cycles.h" + +#define SPX_MLEN 32 +#define NTESTS 10 + +static void wots_gen_pkx(unsigned char *pk, const spx_ctx *ctx, + uint32_t addr[8]); + +static int cmp_llu(const void *a, const void*b) +{ + if(*(unsigned long long *)a < *(unsigned long long *)b) return -1; + if(*(unsigned long long *)a > *(unsigned long long *)b) return 1; + return 0; +} + +static unsigned long long median(unsigned long long *l, size_t llen) +{ + qsort(l,llen,sizeof(unsigned long long),cmp_llu); + + if(llen%2) return l[llen/2]; + else return (l[llen/2-1]+l[llen/2])/2; +} + +static void delta(unsigned long long *l, size_t llen) +{ + unsigned int i; + for(i = 0; i < llen - 1; i++) { + l[i] = l[i+1] - l[i]; + } +} + +static void printfcomma (unsigned long long n) +{ + if (n < 1000) { + 
printf("%llu", n); + return; + } + printfcomma(n / 1000); + printf (",%03llu", n % 1000); +} + +static void printfalignedcomma (unsigned long long n, int len) +{ + unsigned long long ncopy = n; + int i = 0; + + while (ncopy > 9) { + len -= 1; + ncopy /= 10; + i += 1; // to account for commas + } + i = i/3 - 1; // to account for commas + for (; i < len; i++) { + printf(" "); + } + printfcomma(n); +} + +static void display_result(double result, unsigned long long *l, size_t llen, unsigned long long mul) +{ + unsigned long long med; + + result /= NTESTS; + delta(l, NTESTS + 1); + med = median(l, llen); + printf("avg. %11.2lf us (%2.2lf sec); median ", result, result / 1e6); + printfalignedcomma(med, 12); + printf(" cycles, %5llux: ", mul); + printfalignedcomma(mul*med, 12); + printf(" cycles\n"); +} + +#define MEASURE_GENERIC(TEXT, MUL, FNCALL, CORR)\ + printf(TEXT);\ + clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);\ + for(i = 0; i < NTESTS; i++) {\ + t[i] = cpucycles() / CORR;\ + FNCALL;\ + }\ + t[NTESTS] = cpucycles();\ + clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &stop);\ + result = ((stop.tv_sec - start.tv_sec) * 1e6 + \ + (stop.tv_nsec - start.tv_nsec) / 1e3) / (double)CORR;\ + display_result(result, t, NTESTS, MUL); +#define MEASURT(TEXT, MUL, FNCALL)\ + MEASURE_GENERIC(\ + TEXT, MUL,\ + do {\ + for (int j = 0; j < 1000; j++) {\ + FNCALL;\ + }\ + } while (0);,\ + 1000); +#define MEASURE(TEXT, MUL, FNCALL) MEASURE_GENERIC(TEXT, MUL, FNCALL, 1) + + +static void check_overflow(){ + if(is_cpucycles_overflow()){ + printf("cycle counter overflowed; error\n"); + exit(-1); + } +} +int main() +{ + init_cpucycles(); + /* Make stdout buffer more responsive. 
*/ + setbuf(stdout, NULL); + + spx_ctx ctx; + unsigned char pk[SPX_PK_BYTES]; + unsigned char sk[SPX_SK_BYTES]; + unsigned char *m = malloc(SPX_MLEN); + unsigned char *sm = malloc(SPX_BYTES + SPX_MLEN); + unsigned char *mout = malloc(SPX_BYTES + SPX_MLEN); + + unsigned char fors_pk[SPX_FORS_PK_BYTES]; + unsigned char fors_m[SPX_FORS_MSG_BYTES]; + unsigned char fors_sig[SPX_FORS_BYTES]; + unsigned char addr[SPX_ADDR_BYTES*KECCAK_WAY]; + unsigned char wots_pk[5*SPX_WOTS_PK_BYTES]; + unsigned char block[SPX_N]; + unsigned char *blocks[KECCAK_WAY]; + + for( int j=0; j < KECCAK_WAY; j++) + blocks[j] = block; + + unsigned long long smlen; + unsigned long long mlen; + unsigned long long t[NTESTS+1]; + struct timespec start, stop; + double result; + int i; + uint64_t statex[KECCAK_WAY*25]; + + randombytes(m, SPX_MLEN); + randombytes(addr, SPX_ADDR_BYTES*4); + + printf("Parameters: n = %d, h = %d, d = %d, b = %d, k = %d, w = %d, way=%d, tree height=%d, wots_len=%d\n", + SPX_N, SPX_FULL_HEIGHT, SPX_D, SPX_FORS_HEIGHT, SPX_FORS_TREES, + SPX_WOTS_W, KECCAK_WAY,SPX_TREE_HEIGHT, SPX_WOTS_LEN ); + + printf("Running %d iterations.\n", NTESTS); + check_overflow(); + reset_cpucycles(); + MEASURT("thash ", 1, thash(block, block, 1, &ctx, (uint32_t*)addr)); + check_overflow(); + reset_cpucycles(); + MEASURT("f1600x ", 1, keccakx_asm(statex)); + check_overflow(); + reset_cpucycles(); + MEASURT("thashx ", 1, thashx(blocks, (unsigned char const * const*)blocks,\ + 1, &ctx, (uint32_t*)addr)); + check_overflow(); + reset_cpucycles(); + MEASURE("Generating keypair.. ", 1, crypto_sign_keypair(pk, sk)); + check_overflow(); + reset_cpucycles(); + MEASURE(" - WOTS pk gen x (ideal).. ", (1 << SPX_TREE_HEIGHT) / KECCAK_WAY, wots_gen_pkx(wots_pk, &ctx, (uint32_t *) addr)); + check_overflow(); + reset_cpucycles(); + MEASURE(" - WOTS pk gen x (real).. 
", ((1 << SPX_TREE_HEIGHT) + (KECCAK_WAY-1)) / KECCAK_WAY, wots_gen_pkx(wots_pk, &ctx, (uint32_t *) addr)); + check_overflow(); + reset_cpucycles(); + MEASURE("Signing.. ", 1, crypto_sign(sm, &smlen, m, SPX_MLEN, sk)); + check_overflow(); + reset_cpucycles(); + MEASURE(" - FORS signing.. ", 1, fors_sign(fors_sig, fors_pk, fors_m, &ctx, (uint32_t *) addr)); + check_overflow(); + reset_cpucycles(); + MEASURE(" - WOTS pk gen x (ideal).. ", SPX_D * (1 << SPX_TREE_HEIGHT) / KECCAK_WAY, wots_gen_pkx(wots_pk, &ctx, (uint32_t *) addr)); + check_overflow(); + reset_cpucycles(); + MEASURE(" - WOTS pk gen x (real).. ", SPX_D * (((1 << SPX_TREE_HEIGHT) + (KECCAK_WAY-1)) / KECCAK_WAY), wots_gen_pkx(wots_pk, &ctx, (uint32_t *) addr)); + check_overflow(); + reset_cpucycles(); + MEASURE("Verifying.. ", 1, crypto_sign_open(mout, &mlen, sm, smlen, pk)); + + printf("Signature size: %d (%.2f KiB)\n", SPX_BYTES, SPX_BYTES / 1024.0); + printf("Public key size: %d (%.2f KiB)\n", SPX_PK_BYTES, SPX_PK_BYTES / 1024.0); + printf("Secret key size: %d (%.2f KiB)\n", SPX_SK_BYTES, SPX_SK_BYTES / 1024.0); + + free(m); + free(sm); + free(mout); + + return 0; +} + +static void wots_gen_pkx(unsigned char *pk, const spx_ctx *ctx, uint32_t addr[8]) { + struct leaf_info_x leaf; + unsigned steps[ SPX_WOTS_LEN ] = { 0 }; + INITIALIZE_LEAF_INFO_X(leaf, addr, steps); + wots_gen_leafx(pk, ctx, 0, &leaf); +} diff --git a/sphincsplus/sphincsplus-keccakxN/test/cycles.c b/sphincsplus/sphincsplus-keccakxN/test/cycles.c new file mode 100644 index 0000000..78efb89 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/test/cycles.c @@ -0,0 +1,138 @@ +#include "cycles.h" + + + +#if defined(PMU_CYCLES) +void enable_cyclecounter() { + uint64_t tmp; + __asm __volatile ( + "mrs %[tmp], pmcr_el0\n" + "orr %[tmp], %[tmp], #1\n" + "msr pmcr_el0, %[tmp]\n" + "mrs %[tmp], PMOVSCLR_EL0\n" // reset overflow bit + "orr %[tmp], %[tmp], #(1<<31)\n" + "msr PMOVSCLR_EL0, %[tmp]\n" + "mrs %[tmp], pmcntenset_el0\n" + "orr %[tmp], 
%[tmp], #1<<31\n" + "msr pmcntenset_el0, %[tmp]\n" + : [tmp] "=r" (tmp) + ); +} + +void disable_cyclecounter() { + uint64_t tmp; + __asm __volatile ( + "mov %[tmp], #0x3f\n" + "orr %[tmp], %[tmp], #1<<31\n" + "msr pmcntenclr_el0, %[tmp]\n" + : [tmp] "=r" (tmp) + ); +} + +uint64_t get_cyclecounter() { + uint64_t retval; + __asm __volatile ( + "mrs %[retval], pmccntr_el0\n" + : [retval] "=r" (retval)); + return retval; +} +// Somehow weird things happen as soon as the cycle counter reaches 2^32. +// In theory, there is a long counter mode (bit 6 of pmcr_el0), but I did not +// get it to work yet. +// Instead, we reset the cycle counter after each experiment and make sure that +// it never overflows. +void reset_cpucycles() { + uint64_t tmp; + __asm __volatile ( + "mrs %[tmp], pmcr_el0\n" + "orr %[tmp], %[tmp], #(1<<2)\n" // reset cycle counter + "msr pmcr_el0, %[tmp]\n" + : [tmp] "=r" (tmp) + ); +} + +int is_cpucycles_overflow(){ + uint32_t val; + __asm __volatile("mrs %0, PMOVSSET_EL0" : "=r"(val)); + return (val & (1U<<31)); +} +#elif defined(PERF_CYCLES) + +#include +#include +#include +#include +#include +#include +#include +#include + +static int perf_fd = 0; +void enable_cyclecounter() { + struct perf_event_attr pe; + memset(&pe, 0, sizeof(struct perf_event_attr)); + pe.type = PERF_TYPE_HARDWARE; + pe.size = sizeof(struct perf_event_attr); + pe.config = PERF_COUNT_HW_CPU_CYCLES; + pe.disabled = 1; + pe.exclude_kernel = 1; + pe.exclude_hv = 1; + + perf_fd = syscall(__NR_perf_event_open, &pe, 0, -1, -1, 0); + + ioctl(perf_fd, PERF_EVENT_IOC_RESET, 0); + ioctl(perf_fd, PERF_EVENT_IOC_ENABLE, 0); +} + +void disable_cyclecounter() { + ioctl(perf_fd, PERF_EVENT_IOC_DISABLE, 0); + close(perf_fd); +} + +uint64_t get_cyclecounter() { + long long cpu_cycles; + ioctl(perf_fd, PERF_EVENT_IOC_DISABLE, 0); + ssize_t read_count = read(perf_fd, &cpu_cycles, sizeof(cpu_cycles)); + if (read_count < 0) { + perror("read"); + exit(EXIT_FAILURE); + } else if (read_count == 0) { + /* 
Should not happen */ + printf("perf counter empty\n"); + exit(EXIT_FAILURE); + } + ioctl(perf_fd, PERF_EVENT_IOC_ENABLE, 0); + return cpu_cycles; +} + +void reset_cpucycles(void) { + return; +} +int is_cpucycles_overflow(void){ + return 0; +} + +#elif defined(EXTERNAL_CYCLES) + +// nothing to do + +#else /* NO_CYCLES */ + +void enable_cyclecounter() { + return; +} +void disable_cyclecounter() { + return; +} +uint64_t get_cyclecounter() { + return(0); +} + +void reset_cpucycles(void) { + return; +} +int is_cpucycles_overflow(void){ + return 0; +} + +#endif /* NO_CYCLES */ \ No newline at end of file diff --git a/sphincsplus/sphincsplus-keccakxN/test/cycles.h b/sphincsplus/sphincsplus-keccakxN/test/cycles.h new file mode 100644 index 0000000..bc3dff4 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/test/cycles.h @@ -0,0 +1,21 @@ +#ifndef SPX_CYCLES_H +#define SPX_CYCLES_H + +#include + +#if !defined(EXTERNAL_CYCLES) && !defined(PERF_CYCLES) && !defined(PMU_CYCLES) && !defined(NO_CYCLES) +#define NO_CYCLES +#endif + +void enable_cyclecounter(void); +void disable_cyclecounter(void); +uint64_t get_cyclecounter(void); +void reset_cpucycles(void); +int is_cpucycles_overflow(void); + + +#define init_cpucycles enable_cyclecounter +#define cpucycles get_cyclecounter + + +#endif diff --git a/sphincsplus/sphincsplus-keccakxN/test/fors.c b/sphincsplus/sphincsplus-keccakxN/test/fors.c new file mode 100644 index 0000000..970c745 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/test/fors.c @@ -0,0 +1,41 @@ +#include +#include + +#include "../context.h" +#include "../hash.h" +#include "../fors.h" +#include "../randombytes.h" +#include "../params.h" + +int main() +{ + /* Make stdout buffer more responsive. 
*/ + setbuf(stdout, NULL); + + spx_ctx ctx; + + unsigned char pk1[SPX_FORS_PK_BYTES]; + unsigned char pk2[SPX_FORS_PK_BYTES]; + unsigned char sig[SPX_FORS_BYTES]; + unsigned char m[SPX_FORS_MSG_BYTES]; + uint32_t addr[8] = {0}; + + randombytes(ctx.sk_seed, SPX_N); + randombytes(ctx.pub_seed, SPX_N); + randombytes(m, SPX_FORS_MSG_BYTES); + randombytes((unsigned char *)addr, 8 * sizeof(uint32_t)); + + printf("Testing FORS signature and PK derivation.. "); + + initialize_hash_function(&ctx); + + fors_sign(sig, pk1, m, &ctx, addr); + fors_pk_from_sig(pk2, sig, m, &ctx, addr); + + if (memcmp(pk1, pk2, SPX_FORS_PK_BYTES)) { + printf("failed!\n"); + return -1; + } + printf("successful.\n"); + return 0; +} diff --git a/sphincsplus/sphincsplus-keccakxN/test/functest.c b/sphincsplus/sphincsplus-keccakxN/test/functest.c new file mode 100644 index 0000000..7413dc5 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/test/functest.c @@ -0,0 +1,40 @@ +#include +#include +#include + +#include "../api.h" +#include "../randombytes.h" +#include "../params.h" + +#define SPX_MLEN 32 +#define NTESTS 10 +int main() +{ + + unsigned char pk[SPX_PK_BYTES]; + unsigned char sk[SPX_SK_BYTES]; + unsigned char *m = malloc(SPX_MLEN); + unsigned char *sm = malloc(SPX_BYTES + SPX_MLEN); + unsigned char *mout = malloc(SPX_BYTES + SPX_MLEN); + + unsigned long long smlen; + unsigned long long moutlen; + int rc; + printf("Parameters: n = %d, h = %d, d = %d, b = %d, k = %d, w = %d, way=%d, tree height=%d, wots_len=%d\n", + SPX_N, SPX_FULL_HEIGHT, SPX_D, SPX_FORS_HEIGHT, SPX_FORS_TREES, + SPX_WOTS_W, KECCAK_WAY,SPX_TREE_HEIGHT, SPX_WOTS_LEN ); + + + for(int i=0;i +#include + +#include "randombytes.h" + +static int fd = -1; + +void randombytes(uint8_t *x, size_t xlen) +{ + int i; + + if (fd == -1) { + for (;;) { + fd = open("/dev/urandom", O_RDONLY); + if (fd != -1) { + break; + } + sleep(1); + } + } + + while (xlen > 0) { + if (xlen < 1048576) { + i = xlen; + } + else { + i = 1048576; + } + + i = 
read(fd, x, i); + if (i < 1) { + sleep(1); + continue; + } + + x += i; + xlen -= i; + } +} diff --git a/sphincsplus/sphincsplus-keccakxN/test/spx.c b/sphincsplus/sphincsplus-keccakxN/test/spx.c new file mode 100644 index 0000000..31f3337 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/test/spx.c @@ -0,0 +1,125 @@ +#include +#include +#include + +#include "../api.h" +#include "../params.h" +#include "../randombytes.h" + +#define SPX_MLEN 32 +#define SPX_SIGNATURES 1 + +int main() +{ + int ret = 0; + int i; + + /* Make stdout buffer more responsive. */ + setbuf(stdout, NULL); + + unsigned char pk[SPX_PK_BYTES]; + unsigned char sk[SPX_SK_BYTES]; + unsigned char *m = malloc(SPX_MLEN); + unsigned char *sm = malloc(SPX_BYTES + SPX_MLEN); + unsigned char *mout = malloc(SPX_BYTES + SPX_MLEN); + unsigned long long smlen; + unsigned long long mlen; + + randombytes(m, SPX_MLEN); + + printf("Generating keypair.. "); + + if (crypto_sign_keypair(pk, sk)) { + printf("failed!\n"); + return -1; + } + printf("successful.\n"); + + printf("Testing %d signatures.. \n", SPX_SIGNATURES); + + for (i = 0; i < SPX_SIGNATURES; i++) { + printf(" - iteration #%d:\n", i); + + crypto_sign(sm, &smlen, m, SPX_MLEN, sk); + + if (smlen != SPX_BYTES + SPX_MLEN) { + printf(" X smlen incorrect [%llu != %u]!\n", + smlen, SPX_BYTES); + ret = -1; + } + else { + printf(" smlen as expected [%llu].\n", smlen); + } + + /* Test if signature is valid. */ + if (crypto_sign_open(mout, &mlen, sm, smlen, pk)) { + printf(" X verification failed!\n"); + ret = -1; + } + else { + printf(" verification succeeded.\n"); + } + + /* Test if the correct message was recovered. 
*/ + if (mlen != SPX_MLEN) { + printf(" X mlen incorrect [%llu != %u]!\n", mlen, SPX_MLEN); + ret = -1; + } + else { + printf(" mlen as expected [%llu].\n", mlen); + } + if (memcmp(m, mout, SPX_MLEN)) { + printf(" X output message incorrect!\n"); + ret = -1; + } + else { + printf(" output message as expected.\n"); + } + + /* Test if signature is valid when validating in-place. */ + if (crypto_sign_open(sm, &mlen, sm, smlen, pk)) { + printf(" X in-place verification failed!\n"); + ret = -1; + } + else { + printf(" in-place verification succeeded.\n"); + } + + /* Test if flipping bits invalidates the signature (it should). */ + + /* Flip the first bit of the message. Should invalidate. */ + sm[smlen - 1] ^= 1; + if (!crypto_sign_open(mout, &mlen, sm, smlen, pk)) { + printf(" X flipping a bit of m DID NOT invalidate signature!\n"); + ret = -1; + } + else { + printf(" flipping a bit of m invalidates signature.\n"); + } + sm[smlen - 1] ^= 1; + +#ifdef SPX_TEST_INVALIDSIG + int j; + /* Flip one bit per hash; the signature is entirely hashes. 
*/ + for (j = 0; j < (int)(smlen - SPX_MLEN); j += SPX_N) { + sm[j] ^= 1; + if (!crypto_sign_open(mout, &mlen, sm, smlen, pk)) { + printf(" X flipping bit %d DID NOT invalidate sig + m!\n", j); + sm[j] ^= 1; + ret = -1; + break; + } + sm[j] ^= 1; + } + if (j >= (int)(smlen - SPX_MLEN)) { + printf(" changing any signature hash invalidates signature.\n"); + } +#endif + } + + free(m); + free(sm); + free(mout); + + return ret; +} diff --git a/sphincsplus/sphincsplus-keccakxN/thash.h b/sphincsplus/sphincsplus-keccakxN/thash.h new file mode 100644 index 0000000..fd0dbb3 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/thash.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// This implementation is based on the public domain implementation of SPHINCS+ +// available on https://github.com/sphincs/sphincsplus +// + +#ifndef SPX_THASH_H +#define SPX_THASH_H + +#include "context.h" +#include "params.h" + +#include + +#define thash SPX_NAMESPACE(thash) +void thash(unsigned char *out, const unsigned char *in, unsigned int inblocks, + const spx_ctx *ctx, uint32_t addr[8]); + +#endif diff --git a/sphincsplus/sphincsplus-keccakxN/thash_shake_robust.c b/sphincsplus/sphincsplus-keccakxN/thash_shake_robust.c new file mode 100644 index 0000000..f64c7be --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/thash_shake_robust.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// This implementation is based on the public domain implementation of SPHINCS+ +// available on https://github.com/sphincs/sphincsplus +// + +#include +#include + +#include "thash.h" +#include "address.h" +#include "params.h" + +#include "fips202.h" + +/** + * Takes an array of inblocks concatenated arrays of SPX_N bytes. + */ +void thash(unsigned char *out, const unsigned char *in, unsigned int inblocks, + const spx_ctx *ctx, uint32_t addr[8]) +{ + unsigned char buf[SPX_N + SPX_ADDR_BYTES + inblocks*SPX_N]; + unsigned char bitmask[inblocks * SPX_N]; + unsigned int i; + + memcpy(buf, ctx->pub_seed, SPX_N); + memcpy(buf + SPX_N, addr, SPX_ADDR_BYTES); + + shake256(bitmask, inblocks * SPX_N, buf, SPX_N + SPX_ADDR_BYTES); + + for (i = 0; i < inblocks * SPX_N; i++) { + buf[SPX_N + SPX_ADDR_BYTES + i] = in[i] ^ bitmask[i]; + } + + shake256(out, SPX_N, buf, SPX_N + SPX_ADDR_BYTES + inblocks*SPX_N); +} diff --git a/sphincsplus/sphincsplus-keccakxN/thash_shake_robustx.c b/sphincsplus/sphincsplus-keccakxN/thash_shake_robustx.c new file mode 100644 index 0000000..d856ae6 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/thash_shake_robustx.c @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// This implementation is based on the public domain implementation of SPHINCS+ +// available on https://github.com/sphincs/sphincsplus +// + +#include +#include + +#include "thashx.h" +#include "address.h" +#include "params.h" + +#include "fips202x.h" +#include "f1600x.h" + +#define KeccakF1600_StatePermutex keccakx_asm + +/************************************************* + * Name: load64 + * + * Description: Load 8 bytes into uint64_t in little-endian order + * + * Arguments: - const uint8_t *x: pointer to input byte array + * + * Returns the loaded 64-bit unsigned integer + **************************************************/ +static uint64_t load64(const uint8_t *x) { + uint64_t r = 0; + for (size_t i = 0; i < 8; ++i) { + r |= (uint64_t)x[i] << 8 * i; + } + + return r; +} + +/************************************************* + * Name: store64 + * + * Description: Store a 64-bit integer to a byte array in little-endian order + * + * Arguments: - uint8_t *x: pointer to the output byte array + * - uint64_t u: input 64-bit unsigned integer + **************************************************/ +static void store64(uint8_t *x, uint64_t u) { + for (size_t i = 0; i < 8; ++i) { + x[i] = (uint8_t) (u >> 8 * i); + } +} + +/** + * 5-way parallel version of thash; takes 4x as much input and output + */ +void thashx(unsigned char * const out[KECCAK_WAY], + unsigned char const * const in [KECCAK_WAY], + unsigned int inblocks, + const spx_ctx *ctx, uint32_t addrx[KECCAK_WAY*8]) 
+{ + if (inblocks == 1 || inblocks == 2) { + + /* As we write and read only a few quadwords, it is more efficient to */ + /* build and extract from the five-way SHAKE256 state by hand. */ + /* first 2 states interleaved; last three not interleaved */ + uint64_t state[KECCAK_WAY*25] = {0}; + + for (int i = 0; i < SPX_N/8; i++) { + uint64_t x = load64(ctx->pub_seed + 8*i); + for( int j=0; j < KECCAK_WAY; j++ ) + state[STATE_IDX(j,i)] = x; + } + for (int i = 0; i < 4; i++) { + for( int j=0; j < KECCAK_WAY; j++ ) + state[STATE_IDX(j,SPX_N/8 + i)] = (((uint64_t)addrx[j*8+1+2*i]) << 32) + | (uint64_t)addrx[j*8+2*i]; + } + + /* SHAKE domain separator and padding */ + for( int j=0; j < KECCAK_WAY; j++ ) + state[STATE_IDX(j,16)] = 0x80ll << 56; + + for( int j=0; j < KECCAK_WAY; j++ ) + state[STATE_IDX(j, (SPX_N/8)+4)] ^= 0x1f; + + /* We will permutate state2 with f1600x to compute the bitmask, */ + /* but first we'll copy it to state2 which will be used to compute */ + /* the final output, as its input is alsmost identical. */ + uint64_t state2[KECCAK_WAY*25]; + memcpy(state2, state, sizeof(state) ); + + KeccakF1600_StatePermutex(state); + + /* By copying from state, state2 already contains the pub_seed + * and addres. We just need to copy in the input blocks xorred with + * the bitmask we just computed. */ + for (unsigned int i = 0; i < (SPX_N/8) * inblocks; i++) { + for( int j=0; j < KECCAK_WAY; j++ ) + state2[STATE_IDX(j, SPX_N/8+4+i)] = state[STATE_IDX(j,i)] ^ load64(in[j] + 8*i); + } + + /* Domain separator and start of padding. Note that the quadwords */ + /* around are already zeroed for state from which we copied. */ + /* We do a XOR instead of a set as this might be the 16th quadword */ + /* when N=32 and inblocks=2, which already contains the end */ + /* of the padding. 
*/ + for( int j=0; j < KECCAK_WAY; j++ ) + state2[STATE_IDX(j,(SPX_N/8)*(1+inblocks)+4)] ^= 0x1f; + + KeccakF1600_StatePermutex(state2); + + for (int i = 0; i < SPX_N/8; i++) { + for( int j=0; j < KECCAK_WAY; j++ ) + store64(out[j] + 8*i, state2[STATE_IDX(j,i)]); + } + } else { + unsigned char buf[KECCAK_WAY][SPX_N + SPX_ADDR_BYTES + inblocks*SPX_N]; + unsigned char *buf_ptr[KECCAK_WAY]; + unsigned char bitmask[KECCAK_WAY][inblocks*SPX_N]; + unsigned char *bitmask_ptr[KECCAK_WAY]; + unsigned int i; + + for( int j=0; j < KECCAK_WAY; j++ ) + { + memcpy(&buf[j][0], ctx->pub_seed, SPX_N); + memcpy(&buf[j][0] + SPX_N, addrx + j*8, SPX_ADDR_BYTES); + } + + for( int j=0; j < KECCAK_WAY; j++ ) + { + bitmask_ptr[j] = &bitmask[j][0]; + buf_ptr[j] = &buf[j][0]; + } + + /* unsigned char ** -> const unsigned char * const * is OK */ + shake256x(bitmask_ptr, inblocks * SPX_N, + (unsigned char const * const *) buf_ptr, + SPX_N + SPX_ADDR_BYTES); + + for( int j=0; j < KECCAK_WAY; j++ ) + for (i = 0; i < inblocks * SPX_N; i++) + buf[j][SPX_N + SPX_ADDR_BYTES + i] = in[j][i] ^ bitmask[j][i]; + + /* unsigned char ** -> const unsigned char * const * is OK */ + shake256x(out, SPX_N, + (unsigned char const * const *)buf_ptr, + SPX_N + SPX_ADDR_BYTES + inblocks*SPX_N); + } +} diff --git a/sphincsplus/sphincsplus-keccakxN/thash_shake_simple.c b/sphincsplus/sphincsplus-keccakxN/thash_shake_simple.c new file mode 100644 index 0000000..a0f5a3b --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/thash_shake_simple.c @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the 
Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// This implementation is based on the public domain implementation of SPHINCS+ +// available on https://github.com/sphincs/sphincsplus +// + +#include +#include + +#include "thash.h" +#include "address.h" +#include "params.h" + +#include "fips202.h" + +/** + * Takes an array of inblocks concatenated arrays of SPX_N bytes. 
+ */ +void thash(unsigned char *out, const unsigned char *in, unsigned int inblocks, + const spx_ctx *ctx, uint32_t addr[8]) +{ + unsigned char buf[SPX_N + SPX_ADDR_BYTES + inblocks*SPX_N]; + + memcpy(buf, ctx->pub_seed, SPX_N); + memcpy(buf + SPX_N, addr, SPX_ADDR_BYTES); + memcpy(buf + SPX_N + SPX_ADDR_BYTES, in, inblocks * SPX_N); + + shake256(out, SPX_N, buf, SPX_N + SPX_ADDR_BYTES + inblocks*SPX_N); +} diff --git a/sphincsplus/sphincsplus-keccakxN/thash_shake_simplex.c b/sphincsplus/sphincsplus-keccakxN/thash_shake_simplex.c new file mode 100644 index 0000000..1687a54 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/thash_shake_simplex.c @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// This implementation is based on the public domain implementation of SPHINCS+ +// available on https://github.com/sphincs/sphincsplus +// + +#include +#include + +#include "thashx.h" +#include "address.h" +#include "params.h" + +#include "fips202x.h" +#include "f1600x.h" + +#define KeccakF1600_StatePermutex keccakx_asm + +/************************************************* + * Name: load64 + * + * Description: Load 8 bytes into uint64_t in little-endian order + * + * Arguments: - const uint8_t *x: pointer to input byte array + * + * Returns the loaded 64-bit unsigned integer + **************************************************/ +static uint64_t load64(const uint8_t *x) { + uint64_t r = 0; + for (size_t i = 0; i < 8; ++i) { + r |= (uint64_t)x[i] << 8 * i; + } + + return r; +} + +/************************************************* + * Name: store64 + * + * Description: Store a 64-bit integer to a byte array in little-endian order + * + * Arguments: - uint8_t *x: pointer to the output byte array + * - uint64_t u: input 64-bit unsigned integer + **************************************************/ +static void store64(uint8_t *x, uint64_t u) { + for (size_t i = 0; i < 8; ++i) { + x[i] = (uint8_t) (u >> 8 * i); + } +} + +/** + * N-way parallel version of thash; takes Nx as much input and output + */ +void thashx(unsigned char * const out[KECCAK_WAY], + unsigned char const * const in [KECCAK_WAY], + unsigned int inblocks, + const spx_ctx *ctx, uint32_t addrx[KECCAK_WAY*8]) +{ + if (inblocks == 1 || inblocks == 2) { + /* As we write and read only a few quadwords, it is more efficient to + * build and extract from the five-way SHAKE256 state by hand. 
*/ + + // first 2 states interleaved; last three not interleaved + uint64_t state[KECCAK_WAY*25] = {0}; + + for (int i = 0; i < SPX_N/8; i++) { + uint64_t x = load64(ctx->pub_seed + 8*i); + for( int j=0; j < KECCAK_WAY; j++ ) + state[STATE_IDX(j,i)] = x; + } + + for (int i = 0; i < 4; i++) { + for( int j=0; j < KECCAK_WAY; j++ ) + state[STATE_IDX(j, SPX_N/8 + i)] = (((uint64_t)addrx[j*8+1+2*i]) << 32) + | (uint64_t)addrx[j*8+2*i]; + } + + for (unsigned int i = 0; i < (SPX_N/8) * inblocks; i++) { + for( int j=0; j < KECCAK_WAY; j++ ) + state[STATE_IDX(j, SPX_N/8+4+i)] = load64(in[j]+8*i); + } + + /* Domain separator and padding. */ + for( int j=0; j < KECCAK_WAY; j++ ) + state[STATE_IDX(j,16)] = 0x80ll << 56; + + for( int j=0; j < KECCAK_WAY; j++ ) + state[STATE_IDX(j,(SPX_N/8)*(1+inblocks)+4)] ^= 0x1f; + + KeccakF1600_StatePermutex(state); + + for (int i = 0; i < SPX_N/8; i++) { + for( int j=0; j < KECCAK_WAY; j++ ) + store64(out[j] + 8*i, state[STATE_IDX(j,i)]); + } + } else { + unsigned char buf[KECCAK_WAY][SPX_N + SPX_ADDR_BYTES + inblocks*SPX_N]; + unsigned char *buf_ptr[KECCAK_WAY]; + + for( int j=0; j < KECCAK_WAY; j++ ) + { + memcpy(&buf[j][0], ctx->pub_seed, SPX_N); + memcpy(&buf[j][0] + SPX_N, addrx + j*8, SPX_ADDR_BYTES); + memcpy(&buf[j][0] + SPX_N + SPX_ADDR_BYTES, in[j], inblocks * SPX_N); + } + + for( int j=0; j < KECCAK_WAY; j++ ) + buf_ptr[j] = &buf[j][0]; + + /* unsigned char ** -> const unsigned char * const * is OK */ + shake256x(out, SPX_N, + (unsigned char const* const*)buf_ptr, + SPX_N + SPX_ADDR_BYTES + inblocks*SPX_N); + } +} diff --git a/sphincsplus/sphincsplus-keccakxN/thashx.h b/sphincsplus/sphincsplus-keccakxN/thashx.h new file mode 100644 index 0000000..41c883a --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/thashx.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy 
+ * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// This implementation is based on the public domain implementation of SPHINCS+ +// available on https://github.com/sphincs/sphincsplus +// + +#ifndef SPX_THASHX_H +#define SPX_THASHX_H + +#include +#include "context.h" +#include "params.h" + +#define thashx SPX_NAMESPACE(thashx) +void thashx(unsigned char * const outs[KECCAK_WAY], + unsigned char const * const ins[KECCAK_WAY], + unsigned int inblocks, + const spx_ctx *ctx, uint32_t addrx[KECCAK_WAY*8]); + +#endif diff --git a/sphincsplus/sphincsplus-keccakxN/utils.c b/sphincsplus/sphincsplus-keccakxN/utils.c new file mode 100644 index 0000000..f709219 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/utils.c @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// This implementation is based on the public domain implementation of SPHINCS+ +// available on https://github.com/sphincs/sphincsplus +// + +#include + +#include "utils.h" +#include "params.h" +#include "hash.h" +#include "thash.h" +#include "address.h" + +/** + * Converts the value of 'in' to 'outlen' bytes in big-endian byte order. + */ +void ull_to_bytes(unsigned char *out, unsigned int outlen, + unsigned long long in) +{ + int i; + + /* Iterate over out in decreasing order, for big-endianness. */ + for (i = outlen - 1; i >= 0; i--) { + out[i] = in & 0xff; + in = in >> 8; + } +} + +void u32_to_bytes(unsigned char *out, uint32_t in) +{ + out[0] = (unsigned char)(in >> 24); + out[1] = (unsigned char)(in >> 16); + out[2] = (unsigned char)(in >> 8); + out[3] = (unsigned char)in; +} + +/** + * Converts the inlen bytes in 'in' from big-endian byte order to an integer. + */ +unsigned long long bytes_to_ull(const unsigned char *in, unsigned int inlen) +{ + unsigned long long retval = 0; + unsigned int i; + + for (i = 0; i < inlen; i++) { + retval |= ((unsigned long long)in[i]) << (8*(inlen - 1 - i)); + } + return retval; +} + +/** + * Computes a root node given a leaf and an auth path. + * Expects address to be complete other than the tree_height and tree_index. + */ +void compute_root(unsigned char *root, const unsigned char *leaf, + uint32_t leaf_idx, uint32_t idx_offset, + const unsigned char *auth_path, uint32_t tree_height, + const spx_ctx *ctx, uint32_t addr[8]) +{ + uint32_t i; + unsigned char buffer[2 * SPX_N]; + + /* If leaf_idx is odd (last bit = 1), current path element is a right child + and auth_path has to go left. Otherwise it is the other way around. 
*/ + if (leaf_idx & 1) { + memcpy(buffer + SPX_N, leaf, SPX_N); + memcpy(buffer, auth_path, SPX_N); + } + else { + memcpy(buffer, leaf, SPX_N); + memcpy(buffer + SPX_N, auth_path, SPX_N); + } + auth_path += SPX_N; + + for (i = 0; i < tree_height - 1; i++) { + leaf_idx >>= 1; + idx_offset >>= 1; + /* Set the address of the node we're creating. */ + set_tree_height(addr, i + 1); + set_tree_index(addr, leaf_idx + idx_offset); + + /* Pick the right or left neighbor, depending on parity of the node. */ + if (leaf_idx & 1) { + thash(buffer + SPX_N, buffer, 2, ctx, addr); + memcpy(buffer, auth_path, SPX_N); + } + else { + thash(buffer, buffer, 2, ctx, addr); + memcpy(buffer + SPX_N, auth_path, SPX_N); + } + auth_path += SPX_N; + } + + /* The last iteration is exceptional; we do not copy an auth_path node. */ + leaf_idx >>= 1; + idx_offset >>= 1; + set_tree_height(addr, tree_height); + set_tree_index(addr, leaf_idx + idx_offset); + thash(root, buffer, 2, ctx, addr); +} + +/** + * For a given leaf index, computes the authentication path and the resulting + * root node using Merkle's TreeHash algorithm. + * Expects the layer and tree parts of the tree_addr to be set, as well as the + * tree type (i.e. SPX_ADDR_TYPE_HASHTREE or SPX_ADDR_TYPE_FORSTREE). + * Applies the offset idx_offset to indices before building addresses, so that + * it is possible to continue counting indices across trees. + */ +void treehash(unsigned char *root, unsigned char *auth_path, const spx_ctx* ctx, + uint32_t leaf_idx, uint32_t idx_offset, uint32_t tree_height, + void (*gen_leaf)( + unsigned char* /* leaf */, + const spx_ctx* /* ctx */, + uint32_t /* addr_idx */, const uint32_t[8] /* tree_addr */), + uint32_t tree_addr[8]) +{ + unsigned char stack[(tree_height + 1)*SPX_N]; + unsigned int heights[tree_height + 1]; + unsigned int offset = 0; + uint32_t idx; + uint32_t tree_idx; + + for (idx = 0; idx < (uint32_t)(1 << tree_height); idx++) { + /* Add the next leaf node to the stack. 
*/ + gen_leaf(stack + offset*SPX_N, ctx, idx + idx_offset, tree_addr); + offset++; + heights[offset - 1] = 0; + + /* If this is a node we need for the auth path.. */ + if ((leaf_idx ^ 0x1) == idx) { + memcpy(auth_path, stack + (offset - 1)*SPX_N, SPX_N); + } + + /* While the top-most nodes are of equal height.. */ + while (offset >= 2 && heights[offset - 1] == heights[offset - 2]) { + /* Compute index of the new node, in the next layer. */ + tree_idx = (idx >> (heights[offset - 1] + 1)); + + /* Set the address of the node we're creating. */ + set_tree_height(tree_addr, heights[offset - 1] + 1); + set_tree_index(tree_addr, + tree_idx + (idx_offset >> (heights[offset-1] + 1))); + /* Hash the top-most nodes from the stack together. */ + thash(stack + (offset - 2)*SPX_N, + stack + (offset - 2)*SPX_N, 2, ctx, tree_addr); + offset--; + /* Note that the top-most node is now one layer higher. */ + heights[offset - 1]++; + + /* If this is a node we need for the auth path.. */ + if (((leaf_idx >> heights[offset - 1]) ^ 0x1) == tree_idx) { + memcpy(auth_path + heights[offset - 1]*SPX_N, + stack + (offset - 1)*SPX_N, SPX_N); + } + } + } + memcpy(root, stack, SPX_N); +} diff --git a/sphincsplus/sphincsplus-keccakxN/utils.h b/sphincsplus/sphincsplus-keccakxN/utils.h new file mode 100644 index 0000000..147a83e --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/utils.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright 
notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// This implementation is based on the public domain implementation of SPHINCS+ +// available on https://github.com/sphincs/sphincsplus +// + +#ifndef SPX_UTILS_H +#define SPX_UTILS_H + +#include +#include "params.h" +#include "context.h" + + +/** + * Converts the value of 'in' to 'outlen' bytes in big-endian byte order. + */ +#define ull_to_bytes SPX_NAMESPACE(ull_to_bytes) +void ull_to_bytes(unsigned char *out, unsigned int outlen, + unsigned long long in); +#define u32_to_bytes SPX_NAMESPACE(u32_to_bytes) +void u32_to_bytes(unsigned char *out, uint32_t in); + +/** + * Converts the inlen bytes in 'in' from big-endian byte order to an integer. + */ +#define bytes_to_ull SPX_NAMESPACE(bytes_to_ull) +unsigned long long bytes_to_ull(const unsigned char *in, unsigned int inlen); + +/** + * Computes a root node given a leaf and an auth path. + * Expects address to be complete other than the tree_height and tree_index. + */ +#define compute_root SPX_NAMESPACE(compute_root) +void compute_root(unsigned char *root, const unsigned char *leaf, + uint32_t leaf_idx, uint32_t idx_offset, + const unsigned char *auth_path, uint32_t tree_height, + const spx_ctx *ctx, uint32_t addr[8]); + +/** + * For a given leaf index, computes the authentication path and the resulting + * root node using Merkle's TreeHash algorithm. 
+ * Expects the layer and tree parts of the tree_addr to be set, as well as the + * tree type (i.e. SPX_ADDR_TYPE_HASHTREE or SPX_ADDR_TYPE_FORSTREE). + * Applies the offset idx_offset to indices before building addresses, so that + * it is possible to continue counting indices across trees. + */ +#define treehash SPX_NAMESPACE(treehash) +void treehash(unsigned char *root, unsigned char *auth_path, + const spx_ctx* ctx, + uint32_t leaf_idx, uint32_t idx_offset, uint32_t tree_height, + void (*gen_leaf)( + unsigned char* /* leaf */, + const spx_ctx* ctx /* ctx */, + uint32_t /* addr_idx */, const uint32_t[8] /* tree_addr */), + uint32_t tree_addr[8]); + +#endif diff --git a/sphincsplus/sphincsplus-keccakxN/utilsx.c b/sphincsplus/sphincsplus-keccakxN/utilsx.c new file mode 100644 index 0000000..134def1 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/utilsx.c @@ -0,0 +1,252 @@ +/* + * Copyright (c) 2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// This implementation is based on the public domain implementation of SPHINCS+ +// available on https://github.com/sphincs/sphincsplus +// + +#include + +#include "utils.h" +#include "utilsx.h" +#include "params.h" +#include "thashx.h" +#include "thash.h" +#include "address.h" + +// TODO: update docu +/* + * Generate the entire Merkle tree, computing the authentication path for leaf_idx, + * and the resulting root node using Merkle's TreeHash algorithm. + * Expects the layer and tree parts of the tree_addr to be set, as well as the + * tree type (i.e. SPX_ADDR_TYPE_HASHTREE or SPX_ADDR_TYPE_FORSTREE) + * + * This expects tree_addrx4 to be initialized to 4 parallel addr structures for + * the Merkle tree nodes + * + * Applies the offset idx_offset to indices before building addresses, so that + * it is possible to continue counting indices across trees. + * + * This works by using the standard Merkle tree building algorithm, except + * that each 'node' tracked is actually 4 consecutive nodes in the real tree. + * When we combine two logical nodes ABCD and WXYZ, we perform the H + * operation on adjacent real nodes, forming the parent logical node + * (AB)(CD)(WX)(YZ) + * + * When we get to the top two levels of the real tree (where there is only + * one logical node), we continue this operation two more times; the right + * most real node will by the actual root (and the other 3 nodes will be + * garbage). 
We follow the same thashx4 logic so that the 'extract + * authentication path components' part of the loop is still executed (and + * to simplify the code somewhat) + * + * This currently assumes tree_height >= 2; I suspect that doing an adjusting + * idx, addr_idx on the gen_leafx4 call if tree_height < 2 would fix it; since + * we don't actually use such short trees, I haven't bothered + */ +void treehashx(unsigned char *root, unsigned char *auth_path, + const spx_ctx *ctx, + uint32_t leaf_idx, uint32_t idx_offset, + uint32_t tree_height, + void (*gen_leafx)( + unsigned char* /* Where to write the leaves */, + const spx_ctx*, + uint32_t idx, void *info), + uint32_t tree_addr[8], + void *info) +{ + + #if KECCAK_WAY == 4 + unsigned int i,j; + /* This is where we keep the intermediate nodes */ + unsigned char stackx4[tree_height*4*SPX_N]; + + + unsigned char *in[4]; + unsigned char *out[4]; + uint32_t left_adj = 0, prev_left_adj = 0; /* When we're doing the top 3 */ + /* levels, the left-most part of the tree isn't at the beginning */ + /* of current[]. 
These give the offset of the actual start */ + + uint32_t idx; + uint32_t max_idx = (1 << (tree_height-2)) - 1; + + uint32_t tree_addrx4[4*8]; + for(i=0;i<8;i++){ + for(j=0;j<4;j++){ + tree_addrx4[j*8+i] = tree_addr[i]; + } + } + + + for (idx = 0;; idx++) { + unsigned char current[4*SPX_N]; /* Current logical node */ + gen_leafx( current, ctx, 4*idx + idx_offset, + info ); + + /* Now combine the freshly generated right node with previously */ + /* generated left ones */ + uint32_t internal_idx_offset = idx_offset; + uint32_t internal_idx = idx; + uint32_t internal_leaf = leaf_idx; + uint32_t h; /* The height we are in the Merkle tree */ + for (h=0;; h++, internal_idx >>= 1, internal_leaf >>= 1) { + + /* Special processing if we're at the top of the tree */ + if (h >= tree_height - 2) { + if (h == tree_height) { + /* We hit the root; return it */ + memcpy( root, ¤t[3*SPX_N], SPX_N ); + return; + } + /* The tree indexing logic is a bit off in this case */ + /* Adjust it so that the left-most node of the part of */ + /* the tree that we're processing has index 0 */ + prev_left_adj = left_adj; + left_adj = 4 - (1 << (tree_height - h - 1)); + } + + /* Check if we hit the top of the tree */ + if (h == tree_height) { + /* We hit the root; return it */ + memcpy( root, ¤t[3*SPX_N], SPX_N ); + return; + } + + /* + * Check if one of the nodes we have is a part of the + * authentication path; if it is, write it out + */ + if ((((internal_idx << 2) ^ internal_leaf) & ~0x3) == 0) { + memcpy( &auth_path[ h * SPX_N ], + ¤t[(((internal_leaf&3)^1) + prev_left_adj) * SPX_N], + SPX_N ); + } + + /* + * Check if we're at a left child; if so, stop going up the stack + * Exception: if we've reached the end of the tree, keep on going + * (so we combine the last 4 nodes into the one root node in two + * more iterations) + */ + if ((internal_idx & 1) == 0 && idx < max_idx) { + break; + } + + /* Ok, we're at a right node (or doing the top 3 levels) */ + /* Now combine the left and right 
logical nodes together */ + + /* Set the address of the node we're creating. */ + int j; + internal_idx_offset >>= 1; + for (j = 0; j < 4; j++) { + set_tree_height(tree_addrx4 + j*8, h + 1); + set_tree_index(tree_addrx4 + j*8, + (4/2) * (internal_idx&~1) + j - left_adj + internal_idx_offset ); + } + // unsigned char *left = &stackx4[h * 4 * SPX_N]; + // thashx4( ¤t[0 * SPX_N], + // ¤t[1 * SPX_N], + // ¤t[2 * SPX_N], + // ¤t[3 * SPX_N], + // &left [0 * SPX_N], + // &left [2 * SPX_N], + // ¤t[0 * SPX_N], + // ¤t[2 * SPX_N], + // 2, ctx, tree_addrx4); + + in[0] = &stackx4[h * 4 * SPX_N]; + in[1] = &stackx4[h * 4 * SPX_N + 2*SPX_N]; + in[2] = ¤t[0 * SPX_N]; + in[3] = ¤t[2 * SPX_N]; + + for(i=0;i<4;i++){ + out[i] = ¤t[i * SPX_N]; + } + + thashx(out, (const unsigned char**)in, 2, ctx, tree_addrx4); + } + + /* We've hit a left child; save the current for when we get the */ + /* corresponding right right */ + memcpy( &stackx4[h * 4 * SPX_N], current, 4 * SPX_N); + } + #else + + unsigned char current[KECCAK_WAY*SPX_N]; + unsigned int current_idx = KECCAK_WAY; + + // TODO: implement this later properly + unsigned char stack[(tree_height + 1)*SPX_N]; + unsigned int heights[tree_height + 1]; + unsigned int offset = 0; + uint32_t idx; + uint32_t tree_idx; + + for (idx = 0; idx < (uint32_t)(1 << tree_height); idx++) { + /* Add the next leaf node to the stack. */ + if(current_idx >= KECCAK_WAY){ + gen_leafx(current, ctx, idx + idx_offset, info); + current_idx = 0; + } + memcpy(stack + offset*SPX_N, current + current_idx*SPX_N, SPX_N); + + + offset++; + heights[offset - 1] = 0; + + /* If this is a node we need for the auth path.. */ + if ((leaf_idx ^ 0x1) == idx) { + memcpy(auth_path, stack + (offset - 1)*SPX_N, SPX_N); + } + + /* While the top-most nodes are of equal height.. */ + while (offset >= 2 && heights[offset - 1] == heights[offset - 2]) { + /* Compute index of the new node, in the next layer. 
*/ + tree_idx = (idx >> (heights[offset - 1] + 1)); + + /* Set the address of the node we're creating. */ + set_tree_height(tree_addr, heights[offset - 1] + 1); + set_tree_index(tree_addr, tree_idx + (idx_offset >> (heights[offset-1] + 1))); + /* Hash the top-most nodes from the stack together. */ + thash(stack + (offset - 2)*SPX_N, + stack + (offset - 2)*SPX_N, 2, ctx, tree_addr); + offset--; + /* Note that the top-most node is now one layer higher. */ + heights[offset - 1]++; + + /* If this is a node we need for the auth path.. */ + if (((leaf_idx >> heights[offset - 1]) ^ 0x1) == tree_idx) { + memcpy(auth_path + heights[offset - 1]*SPX_N, + stack + (offset - 1)*SPX_N, SPX_N); + } + } + current_idx++; + } + memcpy(root, stack, SPX_N); + + #endif +} diff --git a/sphincsplus/sphincsplus-keccakxN/utilsx.h b/sphincsplus/sphincsplus-keccakxN/utilsx.h new file mode 100644 index 0000000..c3eb379 --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/utilsx.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// This implementation is based on the public domain implementation of SPHINCS+ +// available on https://github.com/sphincs/sphincsplus +// + +#ifndef SPX_UTILSX_H +#define SPX_UTILSX_H + +#include +#include "params.h" + +/** + * For a given leaf index, computes the authentication path and the resulting + * root node using Merkle's TreeHash algorithm. + * Expects the layer and tree parts of the tree_addr to be set, as well as the + * tree type (i.e. SPX_ADDR_TYPE_HASHTREE or SPX_ADDR_TYPE_FORSTREE). + * Applies the offset idx_offset to indices before building addresses, so that + * it is possible to continue counting indices across trees. + * + * This implementation computes on N internal nodes at a time (in + * parallel) + */ +#define treehashx SPX_NAMESPACE(treehashx) +void treehashx(unsigned char *root, unsigned char *auth_path, + const spx_ctx *ctx, + uint32_t leaf_idx, uint32_t idx_offset, uint32_t tree_height, + void (*gen_leafx)( + unsigned char* /* Where to write the leaves */, + const spx_ctx* /* ctx */, + uint32_t addr_idx, void *info), + uint32_t tree_addrx[8], void *info); + +#endif diff --git a/sphincsplus/sphincsplus-keccakxN/wots.c b/sphincsplus/sphincsplus-keccakxN/wots.c new file mode 100644 index 0000000..a42e00c --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/wots.c @@ -0,0 +1,300 @@ +/* + * Copyright (c) 2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, 
copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// This implementation is based on the public domain implementation of SPHINCS+ +// available on https://github.com/sphincs/sphincsplus +// + +#include +#include + +#include "utils.h" +#include "utilsx.h" +#include "hash.h" +#include "hashx.h" +#include "thash.h" +#include "thashx.h" +#include "wots.h" +#include "wotsx.h" +#include "address.h" +#include "params.h" + +// TODO clarify address expectations, and make them more uniform. +// TODO i.e. do we expect types to be set already? +// TODO and do we expect modifications or copies? 
+ +/** + * Computes up the chains + */ +static void gen_chains( + unsigned char *out, + const unsigned char *in, + unsigned int start[SPX_WOTS_LEN], + unsigned int steps[SPX_WOTS_LEN], + const spx_ctx *ctx, + uint32_t addr[8]) +{ + uint32_t i, j, k, idx, watching; + int done; + unsigned char empty[SPX_N]; + unsigned char *bufs[KECCAK_WAY]; + uint32_t addrs[8*KECCAK_WAY]; + + int l; + uint16_t counts[SPX_WOTS_W] = { 0 }; + uint16_t idxs[SPX_WOTS_LEN]; + uint16_t total, newTotal; + + /* set addrs = {addr, addr, addr, addr} */ + for (j = 0; j < KECCAK_WAY; j++) { + memcpy(addrs+j*8, addr, sizeof(uint32_t) * 8); + } + + /* Initialize out with the value at position 'start'. */ + memcpy(out, in, SPX_WOTS_LEN*SPX_N); + + /* Sort the chains in reverse order by steps using counting sort. */ + for (i = 0; i < SPX_WOTS_LEN; i++) { + counts[steps[i]]++; + } + total = 0; + for (l = SPX_WOTS_W - 1; l >= 0; l--) { + newTotal = counts[l] + total; + counts[l] = total; + total = newTotal; + } + for (i = 0; i < SPX_WOTS_LEN; i++) { + idxs[counts[steps[i]]] = i; + counts[steps[i]]++; + } + + /* We got our work cut out for us: do it! */ + for (i = 0; i < SPX_WOTS_LEN; i += KECCAK_WAY) { + for (j = 0; j < KECCAK_WAY && i+j < SPX_WOTS_LEN; j++) { + idx = idxs[i+j]; + set_chain_addr(addrs+j*8, idx); + bufs[j] = out + SPX_N * idx; + } + + /* As the chains are sorted in reverse order, we know that the first + * chain is the longest and the last one is the shortest. We keep + * an eye on whether the last chain is done and then on the one before, + * et cetera. 
*/ + watching = KECCAK_WAY - 1; + done = 0; + while (i + watching >= SPX_WOTS_LEN) { + bufs[watching] = &empty[0]; + watching--; + } + + for (k = 0;; k++) { + while (k == steps[idxs[i+watching]]) { + bufs[watching] = &empty[0]; + if (watching == 0) { + done = 1; + break; + } + watching--; + } + if (done) { + break; + } + for (j = 0; j < watching + 1; j++) { + set_hash_addr(addrs+j*8, k + start[idxs[i+j]]); + } + + /* unsigned char ** -> const unsigned char * const * is OK */ + thashx(bufs, + (unsigned char const* const*) bufs, + 1, ctx, addrs); + } + } +} + +/** + * base_w algorithm as described in draft. + * Interprets an array of bytes as integers in base w. + * This only works when log_w is a divisor of 8. + */ +static void base_w(unsigned int *output, const int out_len, + const unsigned char *input) +{ + int in = 0; + int out = 0; + unsigned char total; + int bits = 0; + int consumed; + + for (consumed = 0; consumed < out_len; consumed++) { + if (bits == 0) { + total = input[in]; + in++; + bits += 8; + } + bits -= SPX_WOTS_LOGW; + output[out] = (total >> bits) & (SPX_WOTS_W - 1); + out++; + } +} + +/* Computes the WOTS+ checksum over a message (in base_w). */ +static void wots_checksum(unsigned int *csum_base_w, + const unsigned int *msg_base_w) +{ + unsigned int csum = 0; + unsigned char csum_bytes[(SPX_WOTS_LEN2 * SPX_WOTS_LOGW + 7) / 8]; + unsigned int i; + + /* Compute checksum. */ + for (i = 0; i < SPX_WOTS_LEN1; i++) { + csum += SPX_WOTS_W - 1 - msg_base_w[i]; + } + + /* Convert checksum to base_w. */ + /* Make sure expected empty zero bits are the least significant bits. */ + csum = csum << ((8 - ((SPX_WOTS_LEN2 * SPX_WOTS_LOGW) % 8)) % 8); + ull_to_bytes(csum_bytes, sizeof(csum_bytes), csum); + base_w(csum_base_w, SPX_WOTS_LEN2, csum_bytes); +} + +/* Takes a message and derives the matching chain lengths. 
*/ +void chain_lengths(unsigned int *lengths, const unsigned char *msg) +{ + base_w(lengths, SPX_WOTS_LEN1, msg); + wots_checksum(lengths + SPX_WOTS_LEN1, lengths); +} + +/** + * Takes a WOTS signature and an n-byte message, computes a WOTS public key. + * + * Writes the computed public key to 'pk'. + */ +void wots_pk_from_sig(unsigned char *pk, + const unsigned char *sig, const unsigned char *msg, + const spx_ctx *ctx, uint32_t addr[8]) +{ + unsigned int steps[SPX_WOTS_LEN]; + unsigned int start[SPX_WOTS_LEN]; + uint32_t i; + + chain_lengths(start, msg); + + for (i = 0; i < SPX_WOTS_LEN; i++) { + steps[i] = SPX_WOTS_W - 1 - start[i]; + } + + gen_chains(pk, sig, start, steps, ctx, addr); +} + +/* + * This generates N sequential WOTS public keys + * It also generates the WOTS signature if leaf_info indicates + * that we're signing with one of these WOTS keys + */ +void wots_gen_leafx(unsigned char *dest, + const spx_ctx *ctx, + uint32_t leaf_idx, void *v_info) { + struct leaf_info_x *info = (struct leaf_info_x*) v_info; + uint32_t *leaf_addr = info->leaf_addr; + uint32_t *pk_addr = info->pk_addr; + unsigned int i, j, k; + unsigned char pk_buffer[ KECCAK_WAY * SPX_WOTS_BYTES ]; + unsigned wots_offset = SPX_WOTS_BYTES; + unsigned char *buffer; + uint32_t wots_k_mask; + unsigned wots_sign_index; + + if (info->wots_sign_leaf >= leaf_idx && info->wots_sign_leaf < leaf_idx + KECCAK_WAY) { + /* We're traversing the leaf that's signing; generate the WOTS */ + /* signature */ + wots_k_mask = 0; + wots_sign_index = info->wots_sign_leaf - leaf_idx; /* Which of of the N slots */ + /* do the signatures come from */ + } else { + /* Nope, we're just generating pk's; turn off the signature logic */ + wots_k_mask = ~0; + wots_sign_index = 0; + } + + for (j = 0; j < KECCAK_WAY; j++) { + set_keypair_addr( leaf_addr + j*8, leaf_idx + j ); + set_keypair_addr( pk_addr + j*8, leaf_idx + j ); + } + + for (i = 0, buffer = pk_buffer; i < SPX_WOTS_LEN; i++, buffer += SPX_N) { + + unsigned 
char *buffers[KECCAK_WAY]; + + uint32_t wots_k = info->wots_steps[i] | wots_k_mask; /* Set wots_k to */ + /* the step if we're generating a signature, ~0 if we're not */ + + for( j=0; j < KECCAK_WAY; j++ ) + buffers[j] = buffer + j*wots_offset; + + /* Start with the secret seed */ + for (j = 0; j < KECCAK_WAY; j++) { + set_chain_addr(leaf_addr + j*8, i); + set_hash_addr(leaf_addr + j*8, 0); + set_type(leaf_addr + j*8, SPX_ADDR_TYPE_WOTSPRF); + } + + prf_addrx(buffers, ctx, leaf_addr); + + for (j = 0; j < KECCAK_WAY; j++) { + set_type(leaf_addr + j*8, SPX_ADDR_TYPE_WOTS); + } + + /* Iterate down the WOTS chain */ + for (k=0;; k++) { + /* Check if one of the values we have needs to be saved as a */ + /* part of the WOTS signature */ + if (k == wots_k) { + memcpy( info->wots_sig + i * SPX_N, + buffer + wots_sign_index*wots_offset, SPX_N ); + } + + /* Check if we hit the top of the chain */ + if (k == SPX_WOTS_W - 1) break; + + /* Iterate one step on all 4 chains */ + for (j = 0; j < KECCAK_WAY; j++) { + set_hash_addr(leaf_addr + j*8, k); + } + thashx(buffers, (const unsigned char**)buffers, 1, ctx, leaf_addr); + } + } + + /* Do the final thash to generate the public keys */ + unsigned char *dests[KECCAK_WAY]; + unsigned char *pk_buffers[KECCAK_WAY]; + for( j=0; j < KECCAK_WAY; j++ ) + { + dests[j] = dest + j*SPX_N; + pk_buffers[j] = pk_buffer + j*wots_offset; + } + + thashx(dests, (const unsigned char**)pk_buffers, SPX_WOTS_LEN, ctx, pk_addr); +} diff --git a/sphincsplus/sphincsplus-keccakxN/wots.h b/sphincsplus/sphincsplus-keccakxN/wots.h new file mode 100644 index 0000000..a20fc9e --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/wots.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without 
restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// This implementation is based on the public domain implementation of SPHINCS+ +// available on https://github.com/sphincs/sphincsplus +// + +#ifndef SPX_WOTS_H +#define SPX_WOTS_H + +#include + +#include "params.h" +#include "context.h" + +/** + * Takes a WOTS signature and an n-byte message, computes a WOTS public key. + * + * Writes the computed public key to 'pk'. 
+ */ +#define wots_pk_from_sig SPX_NAMESPACE(wots_pk_from_sig) +void wots_pk_from_sig(unsigned char *pk, + const unsigned char *sig, const unsigned char *msg, + const spx_ctx *ctx, uint32_t addr[8]); + +/* + * Compute the chain lengths needed for a given message hash + */ +#define chain_lengths SPX_NAMESPACE(chain_lengths) +void chain_lengths(unsigned int *lengths, const unsigned char *msg); + +#endif diff --git a/sphincsplus/sphincsplus-keccakxN/wotsx.h b/sphincsplus/sphincsplus-keccakxN/wotsx.h new file mode 100644 index 0000000..082d25b --- /dev/null +++ b/sphincsplus/sphincsplus-keccakxN/wotsx.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// This implementation is based on the public domain implementation of SPHINCS+ +// available on https://github.com/sphincs/sphincsplus +// + +#if !defined( WOTSX_H_ ) +#define WOTSX_H_ + +#include +#include "params.h" + +/* + * This is here to provide an interface to the internal wots_gen_leafx + * routine. While this routine is not referenced in the package outside of + * wots.c, it is called from the stand-alone benchmark code to characterize + * the performance + */ +struct leaf_info_x { + unsigned char *wots_sig; + uint32_t wots_sign_leaf; /* The index of the WOTS we're using to sign */ + uint32_t *wots_steps; + uint32_t leaf_addr[KECCAK_WAY*8]; + uint32_t pk_addr[KECCAK_WAY*8]; +}; + +/* Macro to set the leaf_info to something 'benign', that is, it would */ +/* run with the same time as it does during the real signing process */ +/* Used only by the benchmark code */ +#define INITIALIZE_LEAF_INFO_X(info, addr, step_buffer) { \ + info.wots_sig = 0; \ + info.wots_sign_leaf = ~0; \ + info.wots_steps = step_buffer; \ + int i; \ + for (i=0; i + #include + + #define GEN_FILL_RANDOM( bits ) \ + void fill_random_u ## bits ( uint(bits) *buf, unsigned int len ) \ + { \ + unsigned byte_len = len * sizeof(*buf); \ + uint8_t *byte_buf = (uint8_t*) buf; \ + for( ; byte_len; byte_buf++, byte_len-- ) \ + { \ + uint8_t cur_byte; \ + cur_byte = get_random_byte(); \ + *byte_buf = cur_byte; \ + } \ + } + GEN_FILL_RANDOM(8) + GEN_FILL_RANDOM(16) + GEN_FILL_RANDOM(32) + #undef GEN_FILL_RANDOM + + #define GEN_COPY( bits ) \ + void copy_buf_u ## bits ( uint(bits) *dst, \ + uint(bits) const *src, unsigned int len ) \ + { \ + for( ; len; dst++, src++, len-- ) \ + *dst = *src; \ + } + GEN_COPY(8) + GEN_COPY(16) + GEN_COPY(32) + #undef GEN_COPY + + #define GEN_COMPARE_BUF( bits ) \ + int compare_buf_u ## bits ( uint(bits) const *src_a, \ + uint(bits) const *src_b, \ + unsigned len ) \ + { \ + uint(bits) res = 0; \ + for( ; len; src_a++, src_b++, len-- ) \ + res |= ( 
(*src_a) ^ (*src_b) ); \ + return( res ); \ + } + GEN_COMPARE_BUF(8) + GEN_COMPARE_BUF(16) + GEN_COMPARE_BUF(32) + #undef GEN_COMPARE_BUF + + #define GEN_PRINT_BUF( bits ) \ + void debug_print_buf_u ## bits ( uint(bits) const *buf, \ + unsigned entries, \ + const char *prefix ) \ + { \ + unsigned idx; \ + for( idx = 0; idx < entries; idx += 8 ) \ + { \ + debug_printf( "%s [%#04x-%#04x]: %#04x %#04x %#04x %#04x %#04x %#04x %#04x %#04x\n", \ + prefix, idx, idx+8, \ + buf[idx+0], buf[idx+1], buf[idx+2], buf[idx+3], \ + buf[idx+4], buf[idx+5], buf[idx+6], buf[idx+7] ); \ + } \ + } + GEN_PRINT_BUF(8) + GEN_PRINT_BUF(16) + GEN_PRINT_BUF(32) + #undef GEN_PRINT_BUF + + #define GEN_PRINT_BUF_S( bits ) \ + void debug_print_buf_s ## bits ( sint(bits) const *buf, \ + unsigned entries, \ + const char *prefix ) \ + { \ + unsigned idx; \ + for( idx = 0; idx < entries; idx += 8 ) \ + { \ + debug_printf( "%s [%u-%u]: %d %d %d %d %d %d %d %d\n", \ + prefix, idx, idx+8, \ + buf[idx+0], buf[idx+1], buf[idx+2], buf[idx+3], \ + buf[idx+4], buf[idx+5], buf[idx+6], buf[idx+7] ); \ + } \ + } + GEN_PRINT_BUF_S(8) +GEN_PRINT_BUF_S(16) +GEN_PRINT_BUF_S(32) +#undef GEN_PRINT_BUF_S + +/* Helper to transpose buffers in case this is needed for input preparation. 
/* Fills 'poly' with 'len' random coefficients via fill_random_u16(). */
void random_poly( uint16_t *poly, unsigned int len )
{
    fill_random_u16( poly, len );
}

/* Sets all 'len' coefficients of 'poly' to zero. */
void zero_poly( uint16_t *poly, unsigned int len )
{
    for( ; len; len--, poly++ )
        *poly = 0;
}

/* Compares 'len' coefficients via compare_buf_u16() and returns its result. */
int compare_poly( uint16_t const *a, uint16_t const *b, unsigned int len )
{
    return( compare_buf_u16( a, b, len ) );
}

/* Keeps only the low 'bitwidth' bits of each of the 'len' coefficients. */
/* A bitwidth of 16 or more leaves the coefficients unchanged. */
void mask_poly( uint16_t *poly, unsigned int len, unsigned bitwidth )
{
    /* Guard the shift: for bitwidth >= 16 the mask is all ones anyway, and
     * shifting 1u by the width of unsigned or more is undefined. */
    uint16_t mask = ( bitwidth >= 16 ) ? (uint16_t) 0xFFFFu
                                       : (uint16_t)( (1u << bitwidth) - 1 );
    for( ; len; len--, poly++ )
        *poly &= mask;
}

/* Copies 'len' coefficients from 'src' to 'dst'. */
void copy_poly( uint16_t *dst, uint16_t const *src, unsigned int len )
{
    for( ; len; len--, dst++, src++ )
        *dst = *src;
}

/* Prints 'len' coefficients in hex, 16 per row, each row prefixed with */
/* 'prefix' and its index range.  The last row is cut short when 'len' is */
/* not a multiple of 16, so nothing beyond poly[len - 1] is read. */
void debug_print_poly(uint16_t *poly, unsigned int len, const char *prefix )
{
    unsigned idx;
    for( idx=0; idx < len; idx += 16 )
    {
        unsigned sub_idx;
        debug_printf( "%s[%03u-%03u]: ", prefix, idx, idx+15 );
        for( sub_idx=0; sub_idx<16 && idx + sub_idx < len; sub_idx++ )
            debug_printf( "%02x ", (unsigned) poly[idx + sub_idx] );
        debug_printf( "\n" );
    }
}

/*
 * Things related to modular arithmetic
 */

/* Scalar operations */

/* Reduces 'a' modulo 'mod' and adds 'mod' once if the remainder is */
/* negative, giving a value in [0, mod) for positive 'mod'. */
int32_t mod_red_s32( int64_t a, int32_t mod )
{
    int32_t tmp = a % mod;
    if( tmp < 0 )
        tmp += mod;
    return( tmp );
}

/* Returns (a * b) % mod, computed in 64 bits.  The remainder is not */
/* normalised: it carries the sign of the product. */
int32_t mod_mul_s32( int32_t a, int32_t b, int32_t mod )
{
    int64_t tmp = (int64_t) a * (int64_t) b;
    int32_t res = (int32_t)( tmp % mod );
    return( res );
}

/* Returns (a + b) % mod, computed in 64 bits; sign follows the sum. */
int32_t mod_add_s32( int32_t a, int32_t b, int32_t mod )
{
    int64_t tmp = (int64_t) a + (int64_t) b;
    int32_t res = tmp % mod;
    return( res);
}

/* Returns (a - b) % mod, computed in 64 bits; sign follows the difference. */
int32_t mod_sub_s32( int32_t a, int32_t b, int32_t mod )
{
    int64_t tmp = (int64_t) a - (int64_t) b;
    int32_t res = tmp % mod;
    return( res);
}
) +{ + int32_t base_pow = base; + int32_t tmp = 1; + while( exp != 0 ) + { + if( exp & 1 ) + tmp = mod_mul_s32( tmp, base_pow, mod ); + + base_pow = mod_mul_s32( base_pow, base_pow, mod ); + exp >>= 1; + } + + return( tmp ); +} + +/* Scalar operations */ + +int16_t mod_red_s16( int64_t a, int16_t mod ) +{ + int16_t tmp = a % mod; + if( tmp < 0 ) + tmp += mod; + return( tmp ); +} + +int16_t mod_mul_s16( int16_t a, int16_t b, int16_t mod ) +{ + int64_t tmp = (int64_t) a * (int64_t) b; + int16_t res = (int16_t)( tmp % mod ); + return( res ); +} + +int16_t mod_add_s16( int16_t a, int16_t b, int16_t mod ) +{ + int64_t tmp = (int64_t) a + (int64_t) b; + int16_t res = tmp % mod; + return( res); +} + +int16_t mod_sub_s16( int16_t a, int16_t b, int16_t mod ) +{ + int64_t tmp = (int64_t) a - (int64_t) b; + int16_t res = tmp % mod; + return( res); +} + +int16_t mod_pow_s16( int16_t base, unsigned exp, int16_t mod ) +{ + int16_t base_pow = base; + int16_t tmp = 1; + while( exp != 0 ) + { + if( exp & 1 ) + tmp = mod_mul_s16( tmp, base_pow, mod ); + + base_pow = mod_mul_s16( base_pow, base_pow, mod ); + exp >>= 1; + } + + return( tmp ); +} + +/* Buffer operations */ + +void mod_add_buf_u16( uint16_t *src_a, uint16_t *src_b, uint16_t *dst, + unsigned size ) +{ + for( unsigned i=0; i < size; i++ ) + dst[i] = src_a[i] + src_b[i]; +} + +void mod_add_buf_s32( int32_t *src_a, int32_t *src_b, int32_t *dst, + unsigned size, int32_t modulus ) +{ + for( unsigned i=0; i < size; i++ ) + dst[i] = mod_add_s32( src_a[i], src_b[i], modulus ); +} + +void mod_reduce_buf_s32( int32_t *src, unsigned size, int32_t mod ) +{ + for( unsigned i=0; i < size; i++ ) + { + src[i] = src[i] % mod; + if( src[i] < 0 ) + src[i] += mod; + } +} + +void mod_reduce_buf_s32_signed( int32_t *src, unsigned size, int32_t mod ) +{ + mod_reduce_buf_s32( src, size, mod ); + for( unsigned i=0; i < size; i++ ) + { + if( src[i] >= ( mod / 2 ) ) + src[i] -= mod; + } +} + +void mod_mul_buf_const_s32( int32_t *src, int32_t 
factor, int32_t *dst, + unsigned size, int32_t mod ) +{ + unsigned idx; + for( idx = 0; idx < size; idx++ ) + dst[idx] = mod_mul_s32( src[idx], factor, mod ); +} + +void mod_mul_buf_s32( int32_t *src_a, int32_t *src_b, int32_t *dst, + unsigned size, int32_t mod ) +{ + unsigned idx; + for( idx = 0; idx < size; idx++ ) + dst[idx] = mod_mul_s32( src_a[idx], src_b[idx], mod ); +} + +/* Buffer operations */ + +void mod_add_buf_s16( int16_t *src_a, int16_t *src_b, int16_t *dst, + unsigned size, int16_t modulus ) +{ + for( unsigned i=0; i < size; i++ ) + dst[i] = mod_add_s16( src_a[i], src_b[i], modulus ); +} + +void mod_reduce_buf_s16( int16_t *src, unsigned size, int16_t mod ) +{ + for( unsigned i=0; i < size; i++ ) + { + src[i] = src[i] % mod; + if( src[i] < 0 ) + src[i] += mod; + } +} + +void mod_reduce_buf_s16_signed( int16_t *src, unsigned size, int16_t mod ) +{ + mod_reduce_buf_s16( src, size, mod ); + for( unsigned i=0; i < size; i++ ) + { + if( src[i] >= ( mod / 2 ) ) + src[i] -= mod; + } +} + +void mod_mul_buf_const_s16( int16_t *src, int16_t factor, int16_t *dst, + unsigned size, int16_t mod ) +{ + unsigned idx; + for( idx = 0; idx < size; idx++ ) + dst[idx] = mod_mul_s16( src[idx], factor, mod ); +} + +void mod_mul_buf_s16( int16_t *src_a, int16_t *src_b, int16_t *dst, + unsigned size, int16_t mod ) +{ + unsigned idx; + for( idx = 0; idx < size; idx++ ) + dst[idx] = mod_mul_s16( src_a[idx], src_b[idx], mod ); +} diff --git a/tests/helloworld/main.c b/tests/helloworld/main.c new file mode 100644 index 0000000..afa5bba --- /dev/null +++ b/tests/helloworld/main.c @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2021 Arm Limited + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, 
and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include + +void neon_test( void *src, uint16_t *dst ); + +int main (void) +{ + uint16_t test_vector[] = { 1, 2, 3, 4, 5, 6, 7, 8 }; + uint16_t sum; + + /* Test preamble */ + debug_test_start( "Hello World!" 
); + + neon_test( test_vector, &sum ); + + if( sum == 36 ) + debug_test_ok(); + else + debug_test_fail(); + + return( 0 ); +} diff --git a/tests/helloworld/neon_test.s b/tests/helloworld/neon_test.s new file mode 100644 index 0000000..56e5a49 --- /dev/null +++ b/tests/helloworld/neon_test.s @@ -0,0 +1,10 @@ + .text + .global neon_test + .global _neon_test +neon_test: +_neon_test: + ldr q0, [x0] + addv h0, v0.8h + smov w0, v0.h[0] + strh w0, [x1] + ret diff --git a/tests/inc/hal.h b/tests/inc/hal.h new file mode 100644 index 0000000..d36c977 --- /dev/null +++ b/tests/inc/hal.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2021 Arm Limited + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#if !defined(TESTS_HAL_H) +#define TESTS_HAL_H + +#include +#include + +/* Initialize random number generation */ +extern void rand_init( unsigned long seed ); + +/* Request random data. 
*/ +extern uint8_t get_random_byte(); + +/* Debugging stubs + * + * Those stubs can either be defined as macros (which is especially + * useful when debugging shall be disabled and we don't want to waste + * code space) or as externally defined functions. + * In case no debugging is desired, just put + * ``` + * #define debug_test_start(str) do {} while(0) + * #define debug_printf( ... ) do {} while(0) + * #define debug_test_ok() do {} while(0) + * #define debug_test_fail() do {} while(0) + * ``` + * in hal_env.h. + */ +#if !defined(TESTS_HAL_DEBUG_MACRO) +extern void debug_test_start( const char *testname ); +extern void debug_printf(const char * restrict format, ... ); +extern void debug_test_ok(); +extern void debug_test_fail(); +#endif /* TESTS_HAL_DEBUG_MACRO */ + +void enable_cyclecounter(); +void disable_cyclecounter(); +uint64_t get_cyclecounter(); + +#endif /* TESTS_HAL_H */ diff --git a/tests/inc/misc.h b/tests/inc/misc.h new file mode 100644 index 0000000..df6cf50 --- /dev/null +++ b/tests/inc/misc.h @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2021 Arm Limited + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#if !defined(MVE_POLY_ARITHMETIC_TESTS_MISC) +#define MVE_POLY_ARITHMETIC_TESTS_MISC + +#include + +/* Some helper macros related to macro expansions. */ + +#define CONCAT2_(A,B) A ## B +#define CONCAT2(A,B) CONCAT2_(A,B) + +#define CONCAT3_(A,B,C) A ## B ## C +#define CONCAT3(A,B,C) CONCAT3_(A,B,C) + +#define CONCAT4_(A,B,C,D) A ## B ## C ## D +#define CONCAT4(A,B,C,D) CONCAT4_(A,B,C, + +#include "poly.h" + +int compare_poly( uint16_t const *src_a, uint16_t const *src_b, + unsigned int dim ); +void random_poly ( uint16_t *a, unsigned int dim ); +void zero_poly ( uint16_t *a, unsigned int dim ); +void mask_poly ( uint16_t *a, unsigned int dim, unsigned bitwidth ); +void copy_poly ( uint16_t *dst, uint16_t const *src, unsigned int dim ); +void debug_print_poly ( uint16_t *a, unsigned int dim, const char *prefix ); + +/* + * Helpers for modular multiplication and reduction. 
+ */ + +/* Scalar operations */ +int32_t mod_red_s32( int64_t a, int32_t mod ); +int32_t mod_mul_s32( int32_t a, int32_t b, int32_t mod ); +int32_t mod_add_s32( int32_t a, int32_t b, int32_t mod ); +int32_t mod_sub_s32( int32_t a, int32_t b, int32_t mod ); +int32_t mod_pow_s32( int32_t base, unsigned exp, int32_t mod ); + +/* Scalar operations */ +int16_t mod_red_s16( int64_t a, int16_t mod ); +int16_t mod_mul_s16( int16_t a, int16_t b, int16_t mod ); +int16_t mod_add_s16( int16_t a, int16_t b, int16_t mod ); +int16_t mod_sub_s16( int16_t a, int16_t b, int16_t mod ); +int16_t mod_pow_s16( int16_t base, unsigned exp, int16_t mod ); + +/* Buffer operations */ +void mod_reduce_buf_s32 ( int32_t *src, unsigned size, int32_t modulus ); +void mod_reduce_buf_s32_signed( int32_t *src, unsigned size, int32_t modulus ); +void mod_mul_buf_const_s32( int32_t *src, int32_t factor, int32_t *dst, + unsigned size, int32_t mod ); +void mod_add_buf_u16( uint16_t *src_a, uint16_t *src_b, uint16_t *dst, + unsigned size ); +void mod_add_buf_s32( int32_t *src_a, int32_t *src_b, int32_t *dst, + unsigned size, int32_t modulus ); +void mod_mul_buf_s32 ( int32_t *src_a, int32_t *src_b, int32_t *dst, + unsigned size, int32_t modulus ); + +/* Buffer operations */ +void mod_reduce_buf_s16 ( int16_t *src, unsigned size, int16_t modulus ); +void mod_reduce_buf_s16_signed( int16_t *src, unsigned size, int16_t modulus ); +void mod_mul_buf_const_s16( int16_t *src, int16_t factor, int16_t *dst, + unsigned size, int16_t mod ); +void mod_add_buf_u16( uint16_t *src_a, uint16_t *src_b, uint16_t *dst, + unsigned size ); +void mod_add_buf_s16( int16_t *src_a, int16_t *src_b, int16_t *dst, + unsigned size, int16_t modulus ); +void mod_mul_buf_s16 ( int16_t *src_a, int16_t *src_b, int16_t *dst, + unsigned size, int16_t modulus ); + +#endif /* PQAX_TEST_POLY_INC */ diff --git a/tests/keccak_neon/keccak_f1600_tests.c b/tests/keccak_neon/keccak_f1600_tests.c new file mode 100755 index 0000000..fbcbea1 --- 
/dev/null +++ b/tests/keccak_neon/keccak_f1600_tests.c @@ -0,0 +1,458 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include +#include +#include +#include + +#include + +#include "keccak_f1600_tests.h" +#include "keccak_f1600_variants.h" + +#include "misc.h" + +void bench_scalar(void*); +void bench_vector(void*); +void bench_hybrid(void*); +/* Three-way comparison of the uint64_t values behind a and b (qsort-style signature): returns -1, 0 or 1. Comparing instead of subtracting keeps the sign right when the difference does not fit in an int. */ +static int cmp_uint64_t(const void *a, const void *b) +{ + return ( *(const uint64_t *)a > *(const uint64_t *)b ) - ( *(const uint64_t *)a < *(const uint64_t *)b ); +} + + +void zip_f1600_states_real( int num, uint64_t *dst, uint64_t const *src ) +{ + for( int i=0; i < KECCAK_F1600_X1_STATE_SIZE_UINT64; i++ ) + for( int j=0; j 2 ) + { + dst += 2 * KECCAK_F1600_X1_STATE_SIZE_UINT64; + src += 2 * KECCAK_F1600_X1_STATE_SIZE_UINT64; + memcpy( dst, src, ( num - 2 ) * KECCAK_F1600_X1_STATE_SIZE_BYTES ); + } + } +} + +#define stringify(x) stringify_(x) +#define stringify_(x) #x + +#define MAKE_VALIDATE_F1600_X_GENERIC_DO(testname,funcname,NUM) \ +int testname (void) \ +{ \ + debug_test_start( stringify(testname) ); \ + \ + ALIGN(64) \ + uint64_t state[NUM*KECCAK_F1600_X1_STATE_SIZE_UINT64] = { 0 }; \ + ALIGN(64) \ + uint64_t ref_state[NUM*KECCAK_F1600_X1_STATE_SIZE_UINT64] = { 0 }; \ + ALIGN(64) \ + uint64_t ref_state_[NUM*KECCAK_F1600_X1_STATE_SIZE_UINT64] = { 0 }; \ + \ + fill_random_u8( (uint8_t*) ref_state, KECCAK_F1600_X1_STATE_SIZE_BYTES ); \ + for( int i=1; i < NUM; i++ ) \ + memcpy( (uint8_t*) &ref_state[i*KECCAK_F1600_X1_STATE_SIZE_UINT64], \ + (uint8_t*) &ref_state[0*KECCAK_F1600_X1_STATE_SIZE_UINT64], \ + KECCAK_F1600_X1_STATE_SIZE_BYTES ); \ + \ + zip_f1600_states( NUM, state, ref_state ); \ + \ + funcname( state ); \ + \ + for( int i=0; i +// Author: Matthias Kannwischer +// + +#ifndef KECCAK_F1600_X2_TEST_H +#define KECCAK_F1600_X2_TEST_H + +#define TEST_WARMUP 1000 +#define TEST_ITERATIONS 100 +#define TEST_AVG_CNT 100 + +#define KECCAK_F1600_TEST_HAVE_SHA3_EXTENSION + +#define KECCAK_F1600_TEST_BENCHMARK +#define KECCAK_F1600_TEST_VALIDATE + +int validate_keccak_f1600_x1_scalar_C_v0(void);
+int validate_keccak_f1600_x1_scalar_C_v1(void); +int validate_keccak_f1600_x1_scalar_asm_v1(); +int validate_keccak_f1600_x1_scalar_asm_v2(); +int validate_keccak_f1600_x1_scalar_asm_v3(); +int validate_keccak_f1600_x1_scalar_asm_v4(); +int validate_keccak_f1600_x1_scalar_asm_v5(); + +int validate_keccak_f1600_x2_scalar_C(void); +int validate_keccak_f1600_x2_neon_C_cothan(void); +int validate_keccak_f1600_x2_bas(void); + +int validate_keccak_f1600_x3_hybrid_asm_v3p(void); +int validate_keccak_f1600_x3_hybrid_asm_v6(void); +int validate_keccak_f1600_x3_hybrid_asm_v7(void); + +int validate_keccak_f1600_x4_hybrid_asm_v1(void); +int validate_keccak_f1600_x4_hybrid_asm_v2(void); +int validate_keccak_f1600_x4_hybrid_asm_v2p0(void); +int validate_keccak_f1600_x4_hybrid_asm_v3(void); +int validate_keccak_f1600_x4_hybrid_asm_v3p(void); +int validate_keccak_f1600_x4_hybrid_asm_v3pp(void); +int validate_keccak_f1600_x4_hybrid_asm_v4(void); +int validate_keccak_f1600_x4_hybrid_asm_v4p(void); +int validate_keccak_f1600_x4_hybrid_asm_v5(void); +int validate_keccak_f1600_x4_hybrid_asm_v5p(void); +int validate_keccak_f1600_x4_hybrid_asm_v6(void); +int validate_keccak_f1600_x4_hybrid_asm_v7(void); +int validate_keccak_f1600_x4_hybrid_asm_v8(void); + +int validate_keccak_f1600_x4_scalar_asm_v5(void); + +int validate_keccak_f1600_x5_hybrid_asm_v8(void); +int validate_keccak_f1600_x5_hybrid_asm_v8p(void); + +int validate_keccak_f1600_x2_hybrid_asm_v1(void); +int validate_keccak_f1600_x2_hybrid_asm_v2p0(void); +int validate_keccak_f1600_x2_hybrid_asm_v2p1(void); +int validate_keccak_f1600_x2_hybrid_asm_v2p2(void); +int validate_keccak_f1600_x2_hybrid_asm_v2pp0(void); +int validate_keccak_f1600_x2_hybrid_asm_v2pp1(void); +int validate_keccak_f1600_x2_hybrid_asm_v2pp2(void); + +int validate_keccak_f1600_x2_v84a_asm_v1(void); +int validate_keccak_f1600_x2_v84a_asm_v1p0(void); +int validate_keccak_f1600_x4_v84a_asm_v1p0(void); +int validate_keccak_f1600_x2_v84a_asm_v2(void); +int 
validate_keccak_f1600_x2_v84a_asm_v2p0(void); +int validate_keccak_f1600_x2_v84a_asm_v2p1(void); +int validate_keccak_f1600_x2_v84a_asm_v2p2(void); +int validate_keccak_f1600_x2_v84a_asm_v2p3(void); +int validate_keccak_f1600_x2_v84a_asm_v2p4(void); +int validate_keccak_f1600_x2_v84a_asm_v2p5(void); +int validate_keccak_f1600_x2_v84a_asm_v2p6(void); +int validate_keccak_f1600_x2_v84a_asm_v2pp0(void); +int validate_keccak_f1600_x2_v84a_asm_v2pp1(void); +int validate_keccak_f1600_x2_v84a_asm_v2pp2(void); +int validate_keccak_f1600_x2_v84a_asm_v2pp3(void); +int validate_keccak_f1600_x2_v84a_asm_v2pp4(void); +int validate_keccak_f1600_x2_v84a_asm_v2pp5(void); +int validate_keccak_f1600_x2_v84a_asm_v2pp6(void); +int validate_keccak_f1600_x2_v84a_asm_v2pp7(void); +int benchmark_keccak_f1600_x2_v84a_asm_v1(void); +int benchmark_keccak_f1600_x2_v84a_asm_v1p0(void); +int benchmark_keccak_f1600_x4_v84a_asm_v1p0(void); +int benchmark_keccak_f1600_x2_v84a_asm_v2(void); +int benchmark_keccak_f1600_x2_v84a_asm_v2p0(void); +int benchmark_keccak_f1600_x2_v84a_asm_v2p1(void); +int benchmark_keccak_f1600_x2_v84a_asm_v2p2(void); +int benchmark_keccak_f1600_x2_v84a_asm_v2p3(void); +int benchmark_keccak_f1600_x2_v84a_asm_v2p4(void); +int benchmark_keccak_f1600_x2_v84a_asm_v2p5(void); +int benchmark_keccak_f1600_x2_v84a_asm_v2p6(void); +int benchmark_keccak_f1600_x2_v84a_asm_v2pp0(void); +int benchmark_keccak_f1600_x2_v84a_asm_v2pp1(void); +int benchmark_keccak_f1600_x2_v84a_asm_v2pp2(void); +int benchmark_keccak_f1600_x2_v84a_asm_v2pp3(void); +int benchmark_keccak_f1600_x2_v84a_asm_v2pp4(void); +int benchmark_keccak_f1600_x2_v84a_asm_v2pp5(void); +int benchmark_keccak_f1600_x2_v84a_asm_v2pp6(void); +int benchmark_keccak_f1600_x2_v84a_asm_v2pp7(void); + +int benchmark_keccak_f1600_x1_scalar_C(void); +int benchmark_keccak_f1600_x1_scalar_C_v0(void); +int benchmark_keccak_f1600_x1_scalar_C_v1(void); + +int benchmark_keccak_f1600_x1_scalar_asm_v1(void); +int 
benchmark_keccak_f1600_x1_scalar_asm_v2(void); +int benchmark_keccak_f1600_x1_scalar_asm_v3(void); +int benchmark_keccak_f1600_x1_scalar_asm_v4(void); +int benchmark_keccak_f1600_x1_scalar_asm_v5(void); + +int benchmark_keccak_f1600_x2_scalar_C(void); +int benchmark_keccak_f1600_x2_neon_C_cothan(void); +int benchmark_keccak_f1600_x2_bas(void); + +int benchmark_keccak_f1600_x3_hybrid_asm_v3p(void); +int benchmark_keccak_f1600_x3_hybrid_asm_v6(void); +int benchmark_keccak_f1600_x3_hybrid_asm_v7(void); + +int benchmark_keccak_f1600_x4_hybrid_asm_v1(void); +int benchmark_keccak_f1600_x4_hybrid_asm_v2(void); +int benchmark_keccak_f1600_x4_hybrid_asm_v2p0(void); +int benchmark_keccak_f1600_x4_hybrid_asm_v3(void); +int benchmark_keccak_f1600_x4_hybrid_asm_v3p(void); +int benchmark_keccak_f1600_x4_hybrid_asm_v3pp(void); +int benchmark_keccak_f1600_x4_hybrid_asm_v4(void); +int benchmark_keccak_f1600_x4_hybrid_asm_v4p(void); +int benchmark_keccak_f1600_x4_hybrid_asm_v5(void); +int benchmark_keccak_f1600_x4_hybrid_asm_v5p(void); +int benchmark_keccak_f1600_x4_hybrid_asm_v6(void); +int benchmark_keccak_f1600_x4_hybrid_asm_v7(void); +int benchmark_keccak_f1600_x4_hybrid_asm_v8(void); + +int benchmark_keccak_f1600_x4_scalar_asm_v5(void); + +int benchmark_keccak_f1600_x5_hybrid_asm_v8(void); +int benchmark_keccak_f1600_x5_hybrid_asm_v8p(void); + +int benchmark_keccak_f1600_x2_hybrid_asm_v1(void); +int benchmark_keccak_f1600_x2_hybrid_asm_v2p0(void); +int benchmark_keccak_f1600_x2_hybrid_asm_v2p1(void); +int benchmark_keccak_f1600_x2_hybrid_asm_v2p2(void); +int benchmark_keccak_f1600_x2_hybrid_asm_v2pp0(void); +int benchmark_keccak_f1600_x2_hybrid_asm_v2pp1(void); +int benchmark_keccak_f1600_x2_hybrid_asm_v2pp2(void); + +int benchmark_scalar(); +int benchmark_vector(); +int benchmark_hybrid(); + +#endif /* KECCAK_F1600_X2_TEST_H */ diff --git a/tests/keccak_neon/main.c b/tests/keccak_neon/main.c new file mode 100755 index 0000000..c10a346 --- /dev/null +++ 
b/tests/keccak_neon/main.c @@ -0,0 +1,220 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include +#include +#include +#include + +#include "keccak_f1600_tests.h" +#include "hal.h" + +int main(void) +{ + enable_cyclecounter(); + +#if defined(KECCAK_F1600_TEST_VALIDATE) + if( validate_keccak_f1600_x1_scalar_C_v0() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x1_scalar_C_v1() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x1_scalar_asm_v1() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x1_scalar_asm_v2() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x1_scalar_asm_v3() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x1_scalar_asm_v4() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x1_scalar_asm_v5() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x2_v84a_asm_v1() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x2_v84a_asm_v1p0() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x4_v84a_asm_v1p0() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x2_v84a_asm_v2() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x2_v84a_asm_v2p0() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x2_v84a_asm_v2p1() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x2_v84a_asm_v2p2() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x2_v84a_asm_v2p3() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x2_v84a_asm_v2p4() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x2_v84a_asm_v2p5() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x2_v84a_asm_v2p6() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x2_v84a_asm_v2pp0() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x2_v84a_asm_v2pp1() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x2_v84a_asm_v2pp2() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x2_v84a_asm_v2pp3() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x2_v84a_asm_v2pp4() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x2_v84a_asm_v2pp5() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x2_v84a_asm_v2pp6() != 0 ) + return( 1 ); + if( 
validate_keccak_f1600_x2_v84a_asm_v2pp7() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x2_scalar_C() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x2_neon_C_cothan() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x2_bas() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x3_hybrid_asm_v3p() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x3_hybrid_asm_v6() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x3_hybrid_asm_v7() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x4_hybrid_asm_v1() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x4_hybrid_asm_v2() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x4_hybrid_asm_v3() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x4_hybrid_asm_v3p() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x4_hybrid_asm_v3pp() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x4_hybrid_asm_v4() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x4_hybrid_asm_v4p() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x4_hybrid_asm_v5() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x4_hybrid_asm_v5p() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x4_hybrid_asm_v6() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x4_hybrid_asm_v7() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x4_hybrid_asm_v8() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x4_scalar_asm_v5() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x5_hybrid_asm_v8() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x2_hybrid_asm_v1() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x2_hybrid_asm_v2p0() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x2_hybrid_asm_v2p1() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x2_hybrid_asm_v2p2() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x2_hybrid_asm_v2pp0() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x2_hybrid_asm_v2pp1() != 0 ) + return( 1 ); + if( validate_keccak_f1600_x2_hybrid_asm_v2pp2() != 0 ) + return( 1 ); +#endif /* KECCAK_F1600_TEST_VALIDATE */ + +#if 
defined(KECCAK_F1600_TEST_BENCHMARK) + benchmark_keccak_f1600_x1_scalar_C(); + benchmark_keccak_f1600_x1_scalar_C_v0(); + benchmark_keccak_f1600_x1_scalar_C_v1(); + + benchmark_keccak_f1600_x1_scalar_asm_v1(); + benchmark_keccak_f1600_x1_scalar_asm_v2(); + benchmark_keccak_f1600_x1_scalar_asm_v3(); + benchmark_keccak_f1600_x1_scalar_asm_v4(); + benchmark_keccak_f1600_x1_scalar_asm_v5(); + + benchmark_keccak_f1600_x2_scalar_C(); + benchmark_keccak_f1600_x2_v84a_asm_v2(); + benchmark_keccak_f1600_x2_v84a_asm_v1(); + benchmark_keccak_f1600_x2_v84a_asm_v1p0(); + benchmark_keccak_f1600_x4_v84a_asm_v1p0(); + benchmark_keccak_f1600_x2_v84a_asm_v2p0(); + benchmark_keccak_f1600_x2_v84a_asm_v2p1(); + benchmark_keccak_f1600_x2_v84a_asm_v2p2(); + benchmark_keccak_f1600_x2_v84a_asm_v2p3(); + benchmark_keccak_f1600_x2_v84a_asm_v2p4(); + benchmark_keccak_f1600_x2_v84a_asm_v2p5(); + benchmark_keccak_f1600_x2_v84a_asm_v2p6(); + benchmark_keccak_f1600_x2_v84a_asm_v2pp0(); + benchmark_keccak_f1600_x2_v84a_asm_v2pp1(); + benchmark_keccak_f1600_x2_v84a_asm_v2pp2(); + benchmark_keccak_f1600_x2_v84a_asm_v2pp3(); + benchmark_keccak_f1600_x2_v84a_asm_v2pp4(); + benchmark_keccak_f1600_x2_v84a_asm_v2pp5(); + benchmark_keccak_f1600_x2_v84a_asm_v2pp6(); + benchmark_keccak_f1600_x2_v84a_asm_v2pp7(); + benchmark_keccak_f1600_x2_neon_C_cothan(); + benchmark_keccak_f1600_x2_bas(); + + benchmark_keccak_f1600_x2_hybrid_asm_v1(); + benchmark_keccak_f1600_x2_hybrid_asm_v2p0(); + benchmark_keccak_f1600_x2_hybrid_asm_v2p1(); + benchmark_keccak_f1600_x2_hybrid_asm_v2p2(); + benchmark_keccak_f1600_x2_hybrid_asm_v2pp0(); + benchmark_keccak_f1600_x2_hybrid_asm_v2pp1(); + benchmark_keccak_f1600_x2_hybrid_asm_v2pp2(); + + benchmark_keccak_f1600_x3_hybrid_asm_v3p(); + benchmark_keccak_f1600_x3_hybrid_asm_v6(); + benchmark_keccak_f1600_x3_hybrid_asm_v7(); + + benchmark_keccak_f1600_x4_hybrid_asm_v1(); + benchmark_keccak_f1600_x4_hybrid_asm_v2(); + benchmark_keccak_f1600_x4_hybrid_asm_v2p0(); + 
benchmark_keccak_f1600_x4_hybrid_asm_v3(); + benchmark_keccak_f1600_x4_hybrid_asm_v3p(); + benchmark_keccak_f1600_x4_hybrid_asm_v3pp(); + benchmark_keccak_f1600_x4_hybrid_asm_v4(); + benchmark_keccak_f1600_x4_hybrid_asm_v4p(); + benchmark_keccak_f1600_x4_hybrid_asm_v5(); + benchmark_keccak_f1600_x4_hybrid_asm_v5p(); + benchmark_keccak_f1600_x4_hybrid_asm_v6(); + benchmark_keccak_f1600_x4_hybrid_asm_v7(); + benchmark_keccak_f1600_x4_hybrid_asm_v8(); + + benchmark_keccak_f1600_x4_scalar_asm_v5(); + + benchmark_keccak_f1600_x5_hybrid_asm_v8(); + benchmark_keccak_f1600_x5_hybrid_asm_v8p(); +#endif /* KECCAK_F1600_TEST_BENCHMARK */ + + disable_cyclecounter(); + return( 0 ); +} diff --git a/tests/keccak_neon/manual/keccak_f1600_variants.h b/tests/keccak_neon/manual/keccak_f1600_variants.h new file mode 100644 index 0000000..400e90f --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_variants.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#ifndef KECCAK_F1600_MANUAL_H +#define KECCAK_F1600_MANUAL_H + +#include + +#define KECCAK_F1600_X1_STATE_SIZE_BITS 1600 +#define KECCAK_F1600_X1_STATE_SIZE_BYTES (KECCAK_F1600_X1_STATE_SIZE_BITS/8) +#define KECCAK_F1600_X1_STATE_SIZE_UINT64 (KECCAK_F1600_X1_STATE_SIZE_BYTES/8) + +#define KECCAK_F1600_X2_STATE_SIZE_BITS (2*1600) +#define KECCAK_F1600_X2_STATE_SIZE_BYTES (KECCAK_F1600_X2_STATE_SIZE_BITS/8) +#define KECCAK_F1600_X2_STATE_SIZE_UINT64 (KECCAK_F1600_X2_STATE_SIZE_BYTES/8) + +/* Third party implementations */ +void keccak_f1600_x1_scalar_C ( uint64_t state[KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_scalar_C ( uint64_t state[KECCAK_F1600_X2_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_bas ( uint64_t state[KECCAK_F1600_X2_STATE_SIZE_UINT64] ); +#include +typedef uint64x2_t v128; +void keccak_f1600_x2_neon_C_cothan( v128 state[25] ); + +/* PQAX implementations */ +void keccak_f1600_x2_v84a_asm_v1( uint64_t state[KECCAK_F1600_X2_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_v84a_asm_v1p0( uint64_t state[KECCAK_F1600_X2_STATE_SIZE_UINT64] ); +void keccak_f1600_x4_v84a_asm_v1p0( uint64_t state[KECCAK_F1600_X2_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_v84a_asm_v2( uint64_t state[KECCAK_F1600_X2_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_v84a_asm_v2p0( uint64_t state[KECCAK_F1600_X2_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_v84a_asm_v2p1( uint64_t state[KECCAK_F1600_X2_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_v84a_asm_v2p2( uint64_t state[KECCAK_F1600_X2_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_v84a_asm_v2p3( uint64_t state[KECCAK_F1600_X2_STATE_SIZE_UINT64] ); +void 
keccak_f1600_x2_v84a_asm_v2p4( uint64_t state[KECCAK_F1600_X2_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_v84a_asm_v2p5( uint64_t state[KECCAK_F1600_X2_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_v84a_asm_v2p6( uint64_t state[KECCAK_F1600_X2_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_v84a_asm_v2pp0( uint64_t state[KECCAK_F1600_X2_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_v84a_asm_v2pp1( uint64_t state[KECCAK_F1600_X2_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_v84a_asm_v2pp2( uint64_t state[KECCAK_F1600_X2_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_v84a_asm_v2pp3( uint64_t state[KECCAK_F1600_X2_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_v84a_asm_v2pp4( uint64_t state[KECCAK_F1600_X2_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_v84a_asm_v2pp5( uint64_t state[KECCAK_F1600_X2_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_v84a_asm_v2pp6( uint64_t state[KECCAK_F1600_X2_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_v84a_asm_v2pp7( uint64_t state[KECCAK_F1600_X2_STATE_SIZE_UINT64] ); + +void keccak_f1600_x1_scalar_C_original( uint64_t state[KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x1_scalar_C_v0( uint64_t state[KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x1_scalar_C_v1( uint64_t state[KECCAK_F1600_X1_STATE_SIZE_UINT64] ); + +void keccak_f1600_x1_scalar_asm_v1( uint64_t state[KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x1_scalar_asm_v2( uint64_t state[KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x1_scalar_asm_v3( uint64_t state[KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x1_scalar_asm_v4( uint64_t state[KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x1_scalar_asm_v5( uint64_t state[KECCAK_F1600_X1_STATE_SIZE_UINT64] ); + +void keccak_f1600_x4_scalar_asm_v1( uint64_t state[4*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x4_scalar_asm_v5( uint64_t state[4*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); + +void keccak_f1600_x3_hybrid_asm_v3p( uint64_t state[3*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); 
+void keccak_f1600_x3_hybrid_asm_v6( uint64_t state[3*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x3_hybrid_asm_v7( uint64_t state[3*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); + + +void keccak_f1600_x4_hybrid_asm_v1 ( uint64_t state[4*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x4_hybrid_asm_v2 ( uint64_t state[4*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x4_hybrid_asm_v2p0( uint64_t state[4*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x4_hybrid_asm_v3 ( uint64_t state[4*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x4_hybrid_asm_v3p( uint64_t state[4*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x4_hybrid_asm_v3pp( uint64_t state[4*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x4_hybrid_asm_v4 ( uint64_t state[4*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x4_hybrid_asm_v4p ( uint64_t state[4*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x4_hybrid_asm_v5 ( uint64_t state[4*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x4_hybrid_asm_v5p ( uint64_t state[4*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x4_hybrid_asm_v6 ( uint64_t state[4*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x4_hybrid_asm_v7 ( uint64_t state[4*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x4_hybrid_asm_v8 ( uint64_t state[4*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); + +void keccak_f1600_x5_hybrid_asm_v8 ( uint64_t state[5*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x5_hybrid_asm_v8p ( uint64_t state[5*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); + +void keccak_f1600_x2_hybrid_asm_v1 ( uint64_t state[2*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_hybrid_asm_v2p0 ( uint64_t state[2*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_hybrid_asm_v2p1 ( uint64_t state[2*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_hybrid_asm_v2p2 ( uint64_t state[2*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void
keccak_f1600_x2_hybrid_asm_v2pp0 ( uint64_t state[2*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_hybrid_asm_v2pp1 ( uint64_t state[2*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); +void keccak_f1600_x2_hybrid_asm_v2pp2 ( uint64_t state[2*KECCAK_F1600_X1_STATE_SIZE_UINT64] ); + +#endif diff --git a/tests/keccak_neon/manual/keccak_f1600_x1_scalar_C.c b/tests/keccak_neon/manual/keccak_f1600_x1_scalar_C.c new file mode 100644 index 0000000..2feca64 --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x1_scalar_C.c @@ -0,0 +1,591 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +// Derived from public domain implementation +// in crypto_hash/keccakc512/simple/ from http://bench.cr.yp.to/supercop.html +// by Ronny Van Keer. 
+ +#include "keccak_f1600_variants.h" + +#define KECCAK_F1600_ROUNDS 24 + +static const uint64_t round_constants[KECCAK_F1600_ROUNDS] = +{ + (uint64_t)0x0000000000000001ULL, + (uint64_t)0x0000000000008082ULL, + (uint64_t)0x800000000000808aULL, + (uint64_t)0x8000000080008000ULL, + (uint64_t)0x000000000000808bULL, + (uint64_t)0x0000000080000001ULL, + (uint64_t)0x8000000080008081ULL, + (uint64_t)0x8000000000008009ULL, + (uint64_t)0x000000000000008aULL, + (uint64_t)0x0000000000000088ULL, + (uint64_t)0x0000000080008009ULL, + (uint64_t)0x000000008000000aULL, + (uint64_t)0x000000008000808bULL, + (uint64_t)0x800000000000008bULL, + (uint64_t)0x8000000000008089ULL, + (uint64_t)0x8000000000008003ULL, + (uint64_t)0x8000000000008002ULL, + (uint64_t)0x8000000000000080ULL, + (uint64_t)0x000000000000800aULL, + (uint64_t)0x800000008000000aULL, + (uint64_t)0x8000000080008081ULL, + (uint64_t)0x8000000000008080ULL, + (uint64_t)0x0000000080000001ULL, + (uint64_t)0x8000000080008008ULL +}; + +/* Note: It should not be necessary to use inline assembly here, but + * compilers don't seem to reliably detect potential uses of + * EOR-with-ROR and BIC-with-ROR at the time of writing. 
*/ + +#if defined(inline) +#undef inline +#endif + +#define inline __attribute__((unused)) inline + +#define GEN_BIC_ROL(imm) \ +static inline uint64_t bic_rol_ ## imm ( uint64_t b, uint64_t a ) \ +{ \ + uint64_t res = 0; \ + __asm ("bic %[result], %[input_a], %[input_b], ROR #(64-" #imm ")" \ + : [result] "=r" (res) \ + : [input_a] "r" (a), [input_b] "r" (b) \ + ); \ + return( res ); \ +} + +#define GEN_XOR_ROL(imm) \ +static inline uint64_t xor_rol_ ## imm ( uint64_t b, uint64_t a ) \ +{ \ + uint64_t res = 0; \ + __asm ("eor %[result], %[input_a], %[input_b], ROR #(64-" #imm ")" \ + : [result] "=r" (res) \ + : [input_a] "r" (a), [input_b] "r" (b) \ + ); \ + return( res ); \ +} + +#define GEN_ROL(imm) \ +static inline uint64_t rol_ ## imm ( uint64_t a ) \ +{ \ + uint64_t res = 0; \ + __asm ("ROR %[result], %[input_a], #(64-" #imm ")" \ + : [result] "=r" (res) \ + : [input_a] "r" (a) \ + ); \ + return( res ); \ +} + +#define GEN_ALL(F) \ + F(0) F(1) F(2) F(3) F(4) F(5) F(6) F(7) \ + F(8) F(9) F(10) F(11) F(12) F(13) F(14) F(15) \ + F(16) F(17) F(18) F(19) F(20) F(21) F(22) F(23) \ + F(24) F(25) F(26) F(27) F(28) F(29) F(30) F(31) \ + F(32) F(33) F(34) F(35) F(36) F(37) F(38) F(39) \ + F(40) F(41) F(42) F(43) F(44) F(45) F(46) F(47) \ + F(48) F(49) F(50) F(51) F(52) F(53) F(54) F(55) \ + F(56) F(57) F(58) F(59) F(60) F(61) F(62) F(63) + +GEN_ALL(GEN_BIC_ROL) +GEN_ALL(GEN_ROL) +GEN_ALL(GEN_XOR_ROL) + +void keccak_f1600_x1_scalar_C_v0( uint64_t state[KECCAK_F1600_X1_STATE_SIZE_UINT64] ) +{ + int round; + + uint64_t Aba, Abe, Abi, Abo, Abu; + uint64_t Aga, Age, Agi, Ago, Agu; + uint64_t Aka, Ake, Aki, Ako, Aku; + uint64_t Ama, Ame, Ami, Amo, Amu; + uint64_t Asa, Ase, Asi, Aso, Asu; + uint64_t BCa, BCe, BCi, BCo, BCu; + uint64_t Da, De, Di, Do, Du; + + uint64_t tmp0, tmp1; + + Aba = state[ 0]; Abe = state[ 1]; Abi = state[ 2]; Abo = state[ 3]; + Abu = state[ 4]; Aga = state[ 5]; Age = state[ 6]; Agi = state[ 7]; + Ago = state[ 8]; Agu = state[ 9]; Aka = state[10]; Ake = 
state[11]; + Aki = state[12]; Ako = state[13]; Aku = state[14]; Ama = state[15]; + Ame = state[16]; Ami = state[17]; Amo = state[18]; Amu = state[19]; + Asa = state[20]; Ase = state[21]; Asi = state[22]; Aso = state[23]; + Asu = state[24]; + + BCa = Aba^Aga^Aka^Ama^Asa; + BCe = Abe^Age^Ake^Ame^Ase; + BCi = Abi^Agi^Aki^Ami^Asi; + BCo = Abo^Ago^Ako^Amo^Aso; + BCu = Abu^Agu^Aku^Amu^Asu; + + Da =xor_rol_1(BCe,BCu); + De =xor_rol_1(BCi,BCa); + Di =xor_rol_1(BCo,BCe); + Do =xor_rol_1(BCu,BCi); + Du =xor_rol_1(BCa,BCo); + + tmp0 = Abe; + Aba = Aba ^ Da; Abe = Age ^ De; Age = Agu ^ Du; Agu = Asi ^ Di; + Asi = Aku ^ Du; Aku = Asa ^ Da; Asa = Abi ^ Di; Abi = Aki ^ Di; + Aki = Ako ^ Do; Ako = Amu ^ Du; Amu = Aso ^ Do; Aso = Ama ^ Da; + Ama = Abu ^ Du; Abu = Asu ^ Du; Asu = Ase ^ De; Ase = Ago ^ Do; + Ago = Ame ^ De; Ame = Aga ^ Da; Aga = Abo ^ Do; Abo = Amo ^ Do; + Amo = Ami ^ Di; Ami = Ake ^ De; Ake = Agi ^ Di; Agi = Aka ^ Da; + Aka = tmp0 ^ De; + + tmp0 = Aba ^ rol_43(bic_rol_1(Abe, Abi)); + tmp1 = xor_rol_23(Abe,bic_rol_22(Abi, Abo)); + Abi = xor_rol_29(Abi,bic_rol_7 (Abo, Abu)); + Abo = xor_rol_21(Abo,bic_rol_14(Abu, Aba)); + Abu = xor_rol_34(Abu,bic_rol_20(Aba, Abe)); + Aba = tmp0; + Abe = tmp1; + + tmp0 = xor_rol_25(Aga,bic_rol_17(Age, Agi)); + tmp1 = xor_rol_39(Age,bic_rol_22(Agi, Ago)); + Agi = xor_rol_6(Agi,bic_rol_48(Ago, Agu)); + Ago = xor_rol_17(Ago,bic_rol_33(Agu, Aga)); + Agu = xor_rol_41(Agu,bic_rol_8 (Aga, Age)); + Aga = tmp0; + Age = tmp1; + + tmp0 = xor_rol_40(Aka,bic_rol_45(Ake, Aki)); + tmp1 = xor_rol_62(Ake,bic_rol_17(Aki, Ako)); + Aki = xor_rol_7(Aki,bic_rol_54(Ako, Aku)); + Ako = xor_rol_7(Ako,bic_rol_17(Aku, Aka)); + Aku = xor_rol_12(Aku,bic_rol_59(Aka, Ake)); + Aka = tmp0; + Ake = tmp1; + + tmp0 = xor_rol_17(Ama,bic_rol_26(Ame, Ami)); + tmp1 = xor_rol_21(Ame,bic_rol_59(Ami, Amo)); + Ami = xor_rol_18(Ami,bic_rol_23(Amo, Amu)); + Amo = xor_rol_52(Amo,bic_rol_29(Amu, Ama)); + Amu = xor_rol_20(Amu,bic_rol_55(Ama, Ame)); + Ama = tmp0; + Ame = tmp1; + + 
tmp0 = xor_rol_23(Asa,bic_rol_16(Ase, Asi)); + tmp1 = xor_rol_14(Ase,bic_rol_62(Asi, Aso)); + Asi = xor_rol_37(Asi,bic_rol_39(Aso, Asu)); + Aso = xor_rol_43(Aso,bic_rol_4 (Asu, Asa)); + Asu = xor_rol_11(Asu,bic_rol_7 (Asa, Ase)); + Asa = tmp0; + Ase = tmp1; + + Aba ^= (uint64_t)round_constants[0]; + + for(round = 1; round < KECCAK_F1600_ROUNDS; round++ ) + { + + BCa = xor_rol_14( Asa, Aka); + BCa = xor_rol_15( BCa, Ama ); + BCa = xor_rol_7 ( BCa, Aga ); + BCa = xor_rol_3 ( BCa, Aba ); + + BCe = xor_rol_4 ( Age, Ase ); + BCe = xor_rol_20( BCe, Abe ); + BCe = xor_rol_6 ( BCe, Ame ); + BCe = xor_rol_7 ( BCe, Ake ); + BCe = rol_8( BCe ); + + BCi = xor_rol_5 ( Agi, Ami ); + BCi = xor_rol_38( BCi, Aki ); + BCi = xor_rol_4 ( BCi, Abi ); + BCi = xor_rol_12( BCi, Asi ); + BCi = rol_2( BCi ); + + BCo = xor_rol_34( Aso, Ago ); + BCo = xor_rol_1 ( BCo, Amo ); + BCo = xor_rol_26( BCo, Ako ); + BCo = xor_rol_1 ( BCo, Abo ); + + BCu = xor_rol_11( Asu, Abu ); + BCu = xor_rol_8 ( BCu, Amu ); + BCu = xor_rol_16( BCu, Agu ); + BCu = xor_rol_14( BCu, Aku ); + BCu = rol_6( BCu ); + + Da =xor_rol_1(BCe,BCu); + De =xor_rol_1(BCi,BCa); + Di =xor_rol_1(BCo,BCe); + Do =xor_rol_1(BCu,BCi); + Du =xor_rol_1(BCa,BCo); + + tmp0 = Abe; + Aba = Aba ^ Da; + + Abe = xor_rol_45(Age,De); + Age = xor_rol_20(Agu,Du); + Agu = xor_rol_2 (Asi,Di); + Asi = xor_rol_6 (Aku,Du); + Aku = xor_rol_39(Asa,Da); + Asa = xor_rol_14(Abi,Di); + Abi = xor_rol_18(Aki,Di); + Aki = xor_rol_1 (Ako,Do); + Ako = xor_rol_36(Amu,Du); + Amu = xor_rol_62(Aso,Do); + Aso = xor_rol_10(Ama,Da); + Ama = xor_rol_44(Abu,Du); + Abu = xor_rol_55(Asu,Du); + Asu = xor_rol_41(Ase,De); + Ase = xor_rol_28(Ago,Do); + Ago = xor_rol_15(Ame,De); + Ame = xor_rol_3(Aga,Da); + Aga = Abo ^ Do; + Abo = xor_rol_27(Amo,Do); + Amo = xor_rol_56(Ami,Di); + Ami = xor_rol_8 (Ake,De); + Ake = xor_rol_61(Agi,Di); + Agi = xor_rol_25(Aka,Da); + Aka = xor_rol_21(tmp0, De); + + tmp0 = xor_rol_43(bic_rol_1(Abe, Abi), Aba ); + tmp1 = xor_rol_23(Abe, bic_rol_22(Abi, 
Abo) ); + Abi = xor_rol_29(Abi, bic_rol_7 (Abo, Abu) ); + Abo = xor_rol_21(Abo, bic_rol_14(Abu, Aba) ); + Abu = xor_rol_34(Abu, bic_rol_20(Aba, Abe) ); + Aba = tmp0; + Abe = tmp1; + + tmp0 = xor_rol_25(Aga, bic_rol_17(Age, Agi) ); + tmp1 = xor_rol_39(Age, bic_rol_22(Agi, Ago) ); + Agi = xor_rol_6 (Agi, bic_rol_48(Ago, Agu) ); + Ago = xor_rol_17(Ago, bic_rol_33(Agu, Aga) ); + Agu = xor_rol_41(Agu, bic_rol_8 (Aga, Age) ); + Aga = tmp0; + Age = tmp1; + + tmp0 = xor_rol_40(Aka, bic_rol_45(Ake, Aki) ); + tmp1 = xor_rol_62(Ake, bic_rol_17(Aki, Ako) ); + Aki = xor_rol_7 (Aki, bic_rol_54(Ako, Aku) ); + Ako = xor_rol_7 (Ako, bic_rol_17(Aku, Aka) ); + Aku = xor_rol_12(Aku, bic_rol_59(Aka, Ake) ); + Aka = tmp0; + Ake = tmp1; + + tmp0 = xor_rol_17(Ama, bic_rol_26(Ame, Ami) ); + tmp1 = xor_rol_21(Ame, bic_rol_59(Ami, Amo) ); + Ami = xor_rol_18(Ami, bic_rol_23(Amo, Amu) ); + Amo = xor_rol_52(Amo, bic_rol_29(Amu, Ama) ); + Amu = xor_rol_20(Amu, bic_rol_55(Ama, Ame) ); + Ama = tmp0; + Ame = tmp1; + + tmp0 = xor_rol_23(Asa, bic_rol_16(Ase, Asi) ); + tmp1 = xor_rol_14(Ase, bic_rol_62(Asi, Aso) ); + Asi = xor_rol_37(Asi, bic_rol_39(Aso, Asu) ); + Aso = xor_rol_43(Aso, bic_rol_4 (Asu, Asa) ); + Asu = xor_rol_11(Asu, bic_rol_7 (Asa, Ase) ); + Asa = tmp0; + Ase = tmp1; + + Aba ^= (uint64_t)round_constants[round]; + + } + + Aga = rol_3 (Aga); Aka = rol_25(Aka); Ama = rol_10(Ama); Asa = rol_39(Asa); + Abe = rol_21(Abe); Age = rol_45(Age); Ake = rol_8 (Ake); Ame = rol_15(Ame); + Ase = rol_41(Ase); Abi = rol_14(Abi); Agi = rol_61(Agi); Aki = rol_18(Aki); + Ami = rol_56(Ami); Asi = rol_2 (Asi); Ago = rol_28(Ago); Ako = rol_1 (Ako); + Amo = rol_27(Amo); Aso = rol_62(Aso); Abu = rol_44(Abu); Agu = rol_20(Agu); + Aku = rol_6 (Aku); Amu = rol_36(Amu); Asu = rol_55(Asu); + + state[ 0] = Aba; state[ 1] = Abe; state[ 2] = Abi; state[ 3] = Abo; + state[ 4] = Abu; state[ 5] = Aga; state[ 6] = Age; state[ 7] = Agi; + state[ 8] = Ago; state[ 9] = Agu; state[10] = Aka; state[11] = Ake; + state[12] = 
Aki; state[13] = Ako; state[14] = Aku; state[15] = Ama; + state[16] = Ame; state[17] = Ami; state[18] = Amo; state[19] = Amu; + state[20] = Asa; state[21] = Ase; state[22] = Asi; state[23] = Aso; + state[24] = Asu; +} + +void keccak_f1600_x1_scalar_C_v1( uint64_t state[KECCAK_F1600_X1_STATE_SIZE_UINT64] ) +{ + int round; + + uint64_t Aba, Abe, Abi, Abo, Abu; + uint64_t Aga, Age, Agi, Ago, Agu; + uint64_t Aka, Ake, Aki, Ako, Aku; + uint64_t Ama, Ame, Ami, Amo, Amu; + uint64_t Asa, Ase, Asi, Aso, Asu; + uint64_t BCa, BCe, BCi, BCo, BCu; + uint64_t Da, De, Di, Do, Du; + + uint64_t tmp0, tmp1; + + Aba = state[ 0]; Abe = state[ 1]; Abi = state[ 2]; Abo = state[ 3]; + Abu = state[ 4]; Aga = state[ 5]; Age = state[ 6]; Agi = state[ 7]; + Ago = state[ 8]; Agu = state[ 9]; Aka = state[10]; Ake = state[11]; + Aki = state[12]; Ako = state[13]; Aku = state[14]; Ama = state[15]; + Ame = state[16]; Ami = state[17]; Amo = state[18]; Amu = state[19]; + Asa = state[20]; Ase = state[21]; Asi = state[22]; Aso = state[23]; + Asu = state[24]; + + BCa = Aba^Aga^Aka^Ama^Asa; + BCe = Abe^Age^Ake^Ame^Ase; + BCi = Abi^Agi^Aki^Ami^Asi; + BCo = Abo^Ago^Ako^Amo^Aso; + BCu = Abu^Agu^Aku^Amu^Asu; + + Da =xor_rol_1(BCe,BCu); + De =xor_rol_1(BCi,BCa); + Di =xor_rol_1(BCo,BCe); + Do =xor_rol_1(BCu,BCi); + Du =xor_rol_1(BCa,BCo); + + tmp0 = Abu; + Agu = Agu ^ Du; Abu = Age ^ De; Age = Ame ^ De; Ame = Ami ^ Di; + Ami = Aso ^ Do; Aso = Abi ^ Di; Abi = Asu ^ Du; Asu = Ago ^ Do; + Ago = Abo ^ Do; Abo = Aba ^ Da; Aba = Aki ^ Di; Aki = Asa ^ Da; + Asa = Aku ^ Du; Aku = Agi ^ Di; Agi = Asi ^ Di; Asi = Ase ^ De; + Ase = Ama ^ Da; Ama = Ake ^ De; Ake = Amu ^ Du; Amu = Aga ^ Da; + Aga = Aka ^ Da; Aka = Ako ^ Do; Ako = Abe ^ De; Abe = Amo ^ Do; + Amo = tmp0 ^ Du; + + tmp0 = bic_rol_1 (Abu, Aba ); + tmp0 = xor_rol_43(tmp0, Abo ); + tmp1 = bic_rol_22(Aba, Abe ); + tmp1 = xor_rol_23(Abu, tmp1); + Abu = bic_rol_20(Abo, Abu ); + Abu = xor_rol_34(Abi, Abu ); + Abo = bic_rol_14(Abi, Abo ); + Abo = xor_rol_21(Abe, Abo 
); + Abi = bic_rol_7 (Abe, Abi ); + Abi = xor_rol_29(Aba, Abi ); + Aba = tmp0; + Abe = tmp1; + + tmp0 = bic_rol_17(Agu, Aga ); + tmp0 = xor_rol_25(Ago, tmp0); + tmp1 = bic_rol_22(Aga, Age ); + tmp1 = xor_rol_39(Agu, tmp1); + Agu = bic_rol_8 (Ago, Agu ); + Agu = xor_rol_41(Agi, Agu ); + Ago = bic_rol_33(Agi, Ago ); + Ago = xor_rol_17(Age, Ago ); + Agi = bic_rol_48(Age, Agi ); + Agi = xor_rol_6 (Aga, Agi ); + Aga = tmp0; + Age = tmp1; + + tmp0 = bic_rol_45(Aku, Aka ); + tmp0 = xor_rol_40(Ako, tmp0); + tmp1 = bic_rol_17(Aka, Ake ); + tmp1 = xor_rol_62(Aku, tmp1); + Aku = bic_rol_59(Ako, Aku ); + Aku = xor_rol_12(Aki, Aku ); + Ako = bic_rol_17(Aki, Ako ); + Ako = xor_rol_7 (Ake, Ako ); + Aki = bic_rol_54(Ake, Aki ); + Aki = xor_rol_7 (Aka, Aki ); + Aka = tmp0; + Ake = tmp1; + + tmp0 = bic_rol_26(Amu, Ama ); + tmp0 = xor_rol_17(Amo, tmp0); + tmp1 = bic_rol_59(Ama, Ame ); + tmp1 = xor_rol_21(Amu, tmp1); + Amu = bic_rol_55(Amo, Amu ); + Amu = xor_rol_20(Ami, Amu ); + Amo = bic_rol_29(Ami, Amo ); + Amo = xor_rol_52(Ame, Amo ); + Ami = bic_rol_23(Ame, Ami ); + Ami = xor_rol_18(Ama, Ami ); + Ama = tmp0; + Ame = tmp1; + + tmp0 = bic_rol_16(Asu, Asa ); + tmp0 = xor_rol_23(Aso, tmp0); + tmp1 = bic_rol_62(Asa, Ase ); + tmp1 = xor_rol_14(Asu, tmp1); + Asu = bic_rol_7 (Aso, Asu ); + Asu = xor_rol_11(Asi, Asu ); + Aso = bic_rol_4 (Asi, Aso ); + Aso = xor_rol_43(Ase, Aso ); + Asi = bic_rol_39(Ase, Asi ); + Asi = xor_rol_37(Asa, Asi ); + Asa = tmp0; + Ase = tmp1; + + Aba ^= (uint64_t)round_constants[0]; + + for(round = 1; round < KECCAK_F1600_ROUNDS; round++ ) { + + BCa = xor_rol_14( Asa, Aka); + BCe = xor_rol_4 ( Age, Ase ); + BCi = xor_rol_5 ( Agi, Ami ); + BCo = xor_rol_34( Aso, Ago ); + BCu = xor_rol_11( Asu, Abu ); + + BCa = xor_rol_15( BCa, Ama ); + BCe = xor_rol_20( BCe, Abe ); + BCi = xor_rol_38( BCi, Aki ); + BCo = xor_rol_1 ( BCo, Amo ); + BCa = xor_rol_7 ( BCa, Aga ); + BCu = xor_rol_8 ( BCu, Amu ); + + BCa = xor_rol_3 ( BCa, Aba ); + BCe = xor_rol_6 ( BCe, Ame ); + BCi = 
xor_rol_4 ( BCi, Abi ); + BCo = xor_rol_26( BCo, Ako ); + BCu = xor_rol_16( BCu, Agu ); + + BCe = xor_rol_7 ( BCe, Ake ); + BCi = xor_rol_12( BCi, Asi ); + BCo = xor_rol_1 ( BCo, Abo ); + BCu = xor_rol_14( BCu, Aku ); + + BCe = rol_8( BCe ); + BCi = rol_2( BCi ); + BCu = rol_6( BCu ); + + Da = xor_rol_1(BCe,BCu); + De = xor_rol_1(BCi,BCa); + Di = xor_rol_1(BCo,BCe); + Do = xor_rol_1(BCu,BCi); + Du = xor_rol_1(BCa,BCo); + + Agu = xor_rol_20(Agu,Du); + tmp0 = Abu; + Abu = xor_rol_45(Age,De); + Age = xor_rol_15(Ame,De); + Ame = xor_rol_56(Ami,Di); + Ami = xor_rol_62(Aso,Do); + Aso = xor_rol_14(Abi,Di); + Abi = xor_rol_55(Asu,Du); + Asu = xor_rol_28(Ago,Do); + Ago = Abo ^ Do; + Abo = Aba ^ Da; + Aba = xor_rol_18(Aki,Di); + Aki = xor_rol_39(Asa,Da); + Asa = xor_rol_6 (Aku,Du); + Aku = xor_rol_61(Agi,Di); + Agi = xor_rol_2 (Asi,Di); + Asi = xor_rol_41(Ase,De); + Ase = xor_rol_10(Ama,Da); + Ama = xor_rol_8 (Ake,De); + Ake = xor_rol_36(Amu,Du); + Amu = xor_rol_3(Aga,Da); + Aga = xor_rol_25(Aka,Da); + Aka = xor_rol_1 (Ako,Do); + Ako = xor_rol_21(Abe,De); + Abe = xor_rol_27(Amo,Do); + Amo = xor_rol_44(tmp0,Du); + + + tmp0 = bic_rol_1 (Abu, Aba ); + tmp0 = xor_rol_43(tmp0, Abo ); + tmp1 = bic_rol_22(Aba, Abe ); + tmp1 = xor_rol_23(Abu, tmp1); + Abu = bic_rol_20(Abo, Abu ); + Abu = xor_rol_34(Abi, Abu ); + Abo = bic_rol_14(Abi, Abo ); + Abo = xor_rol_21(Abe, Abo ); + Abi = bic_rol_7 (Abe, Abi ); + Abi = xor_rol_29(Aba, Abi ); + Aba = tmp0; + Abe = tmp1; + + tmp0 = bic_rol_17(Agu, Aga ); + tmp0 = xor_rol_25(Ago, tmp0); + tmp1 = bic_rol_22(Aga, Age ); + tmp1 = xor_rol_39(Agu, tmp1); + Agu = bic_rol_8 (Ago, Agu ); + Agu = xor_rol_41(Agi, Agu ); + Ago = bic_rol_33(Agi, Ago ); + Ago = xor_rol_17(Age, Ago ); + Agi = bic_rol_48(Age, Agi ); + Agi = xor_rol_6 (Aga, Agi ); + Aga = tmp0; + Age = tmp1; + + tmp0 = bic_rol_45(Aku, Aka ); + tmp0 = xor_rol_40(Ako, tmp0); + tmp1 = bic_rol_17(Aka, Ake ); + tmp1 = xor_rol_62(Aku, tmp1); + Aku = bic_rol_59(Ako, Aku ); + Aku = xor_rol_12(Aki, Aku 
); + Ako = bic_rol_17(Aki, Ako ); + Ako = xor_rol_7 (Ake, Ako ); + Aki = bic_rol_54(Ake, Aki ); + Aki = xor_rol_7 (Aka, Aki ); + Aka = tmp0; + Ake = tmp1; + + tmp0 = bic_rol_26(Amu, Ama ); + tmp0 = xor_rol_17(Amo, tmp0); + tmp1 = bic_rol_59(Ama, Ame ); + tmp1 = xor_rol_21(Amu, tmp1); + Amu = bic_rol_55(Amo, Amu ); + Amu = xor_rol_20(Ami, Amu ); + Amo = bic_rol_29(Ami, Amo ); + Amo = xor_rol_52(Ame, Amo ); + Ami = bic_rol_23(Ame, Ami ); + Ami = xor_rol_18(Ama, Ami ); + Ama = tmp0; + Ame = tmp1; + + tmp0 = bic_rol_16(Asu, Asa ); + tmp0 = xor_rol_23(Aso, tmp0); + tmp1 = bic_rol_62(Asa, Ase ); + tmp1 = xor_rol_14(Asu, tmp1); + Asu = bic_rol_7 (Aso, Asu ); + Asu = xor_rol_11(Asi, Asu ); + Aso = bic_rol_4 (Asi, Aso ); + Aso = xor_rol_43(Ase, Aso ); + Asi = bic_rol_39(Ase, Asi ); + Asi = xor_rol_37(Asa, Asi ); + Asa = tmp0; + Ase = tmp1; + + Aba ^= (uint64_t)round_constants[round]; + + } + + Aga = rol_3 (Aga); Aka = rol_25(Aka); Ama = rol_10(Ama); Asa = rol_39(Asa); + Abe = rol_21(Abe); Age = rol_45(Age); Ake = rol_8 (Ake); Ame = rol_15(Ame); + Ase = rol_41(Ase); Abi = rol_14(Abi); Agi = rol_61(Agi); Aki = rol_18(Aki); + Ami = rol_56(Ami); Asi = rol_2 (Asi); Ago = rol_28(Ago); Ako = rol_1 (Ako); + Amo = rol_27(Amo); Aso = rol_62(Aso); Abu = rol_44(Abu); Agu = rol_20(Agu); + Aku = rol_6 (Aku); Amu = rol_36(Amu); Asu = rol_55(Asu); + + state[ 0] = Aba; state[ 1] = Abe; state[ 2] = Abi; state[ 3] = Abo; + state[ 4] = Abu; state[ 5] = Aga; state[ 6] = Age; state[ 7] = Agi; + state[ 8] = Ago; state[ 9] = Agu; state[10] = Aka; state[11] = Ake; + state[12] = Aki; state[13] = Ako; state[14] = Aku; state[15] = Ama; + state[16] = Ame; state[17] = Ami; state[18] = Amo; state[19] = Amu; + state[20] = Asa; state[21] = Ase; state[22] = Asi; state[23] = Aso; + state[24] = Asu; +} diff --git a/tests/keccak_neon/manual/keccak_f1600_x1_scalar_asm_v1.s b/tests/keccak_neon/manual/keccak_f1600_x1_scalar_asm_v1.s new file mode 100644 index 0000000..477272c --- /dev/null +++ 
b/tests/keccak_neon/manual/keccak_f1600_x1_scalar_asm_v1.s @@ -0,0 +1,413 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .balign 64 +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x28 + count .req x29 + cur_const .req x30 + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round.
*/ + Aba .req x1 + Abe .req x6 + Abi .req x11 + Abo .req x16 + Abu .req x21 + Aga .req x2 + Age .req x7 + Agi .req x12 + Ago .req x17 + Agu .req x22 + Aka .req x3 + Ake .req x8 + Aki .req x13 + Ako .req x18 + Aku .req x23 + Ama .req x4 + Ame .req x9 + Ami .req x14 + Amo .req x19 + Amu .req x24 + Asa .req x5 + Ase .req x10 + Asi .req x15 + Aso .req x20 + Asu .req x25 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + Aba_ .req x0 + Abe_ .req x28 + Abi_ .req x11 + Abo_ .req x16 + Abu_ .req x21 + Aga_ .req x3 + Age_ .req x8 + Agi_ .req x12 + Ago_ .req x17 + Agu_ .req x22 + Aka_ .req x4 + Ake_ .req x9 + Aki_ .req x13 + Ako_ .req x18 + Aku_ .req x23 + Ama_ .req x5 + Ame_ .req x10 + Ami_ .req x14 + Amo_ .req x19 + Amu_ .req x24 + Asa_ .req x1 + Ase_ .req x6 + Asi_ .req x15 + Aso_ .req x20 + Asu_ .req x25 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + C0 .req x0 + E0 .req x29 + C1 .req x26 + E1 .req x30 + C2 .req x27 + E2 .req x26 + C3 .req x28 + E3 .req x27 + C4 .req x29 + E4 .req x28 + + tmp .req x30 + +/************************ MACROS ****************************/ + +.macro load_input + ldr Aba, [input_addr, #(1*8*0)] + ldr Abe, [input_addr, #(1*8*1)] + ldr Abi, [input_addr, #(1*8*2)] + ldr Abo, [input_addr, #(1*8*3)] + ldr Abu, [input_addr, #(1*8*4)] + ldr Aga, [input_addr, #(1*8*5)] + ldr Age, [input_addr, #(1*8*6)] + ldr Agi, [input_addr, #(1*8*7)] + ldr Ago, [input_addr, #(1*8*8)] + ldr Agu, [input_addr, #(1*8*9)] + ldr Aka, [input_addr, #(1*8*10)] + ldr Ake, [input_addr, #(1*8*11)] + ldr Aki, [input_addr, #(1*8*12)] + ldr Ako, [input_addr, #(1*8*13)] + ldr Aku, [input_addr, #(1*8*14)] + ldr Ama, [input_addr, #(1*8*15)] + ldr Ame, [input_addr, #(1*8*16)] + ldr Ami, [input_addr, #(1*8*17)] + ldr Amo, [input_addr, #(1*8*18)] + ldr Amu, [input_addr, #(1*8*19)] + ldr Asa, [input_addr, #(1*8*20)] + ldr Ase, [input_addr, #(1*8*21)] + ldr Asi, [input_addr, #(1*8*22)] + ldr Aso, [input_addr, 
#(1*8*23)] + ldr Asu, [input_addr, #(1*8*24)] +.endm + +.macro store_input + str Aba, [input_addr, #(1*8*0)] + str Abe, [input_addr, #(1*8*1)] + str Abi, [input_addr, #(1*8*2)] + str Abo, [input_addr, #(1*8*3)] + str Abu, [input_addr, #(1*8*4)] + str Aga, [input_addr, #(1*8*5)] + str Age, [input_addr, #(1*8*6)] + str Agi, [input_addr, #(1*8*7)] + str Ago, [input_addr, #(1*8*8)] + str Agu, [input_addr, #(1*8*9)] + str Aka, [input_addr, #(1*8*10)] + str Ake, [input_addr, #(1*8*11)] + str Aki, [input_addr, #(1*8*12)] + str Ako, [input_addr, #(1*8*13)] + str Aku, [input_addr, #(1*8*14)] + str Ama, [input_addr, #(1*8*15)] + str Ame, [input_addr, #(1*8*16)] + str Ami, [input_addr, #(1*8*17)] + str Amo, [input_addr, #(1*8*18)] + str Amu, [input_addr, #(1*8*19)] + str Asa, [input_addr, #(1*8*20)] + str Ase, [input_addr, #(1*8*21)] + str Asi, [input_addr, #(1*8*22)] + str Aso, [input_addr, #(1*8*23)] + str Asu, [input_addr, #(1*8*24)] +.endm + +#define STACK_SIZE (16*6 + 3*8 + 8) // GPRs (16*6), count (8), const (8), input (8), padding (8) +#define STACK_BASE_GPRS (3*8+8) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro save reg, offset + str \reg, [sp, #\offset] +.endm + +.macro restore reg, offset + ldr \reg, [sp, #\offset] +.endm + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + 
ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +/* Keccak-f1600 round */ + +.macro keccak_f1600_round + save count, STACK_OFFSET_COUNT + +eor C0, Aba, Aga +eor C0, C0, Aka +eor C0, C0, Ama +eor C0, C0, Asa +eor C1, Abe, Age +eor C1, C1, Ake +eor C1, C1, Ame +eor C1, C1, Ase +eor C2, Abi, Agi +eor C2, C2, Aki +eor C2, C2, Ami +eor C2, C2, Asi +eor C3, Abo, Ago +eor C3, C3, Ako +eor C3, C3, Amo +eor C3, C3, Aso +eor C4, Abu, Agu +eor C4, C4, Aku +eor C4, C4, Amu +eor C4, C4, Asu + + +eor E1, C0, C2, ROR #63 +eor E3, C2, C4, ROR #63 +eor E0, C4, C1, ROR #63 +eor E2, C1, C3, ROR #63 +eor E4, C3, C0, ROR #63 + +eor Aba_, Aba, E0 +eor Asa_, Abi, E2 +ror Asa_, Asa_, #2 +eor Abi_, Aki, E2 +ror Abi_, Abi_, #21 +eor Aki_, Ako, E3 +ror Aki_, Aki_, #39 +eor Ako_, Amu, E4 +ror Ako_, Ako_, #56 +eor Amu_, Aso, E3 +ror Amu_, Amu_, #8 +eor Aso_, Ama, E0 +ror Aso_, Aso_, #23 +eor Aka_, Abe, E1 +ror Aka_, Aka_, #63 +eor Ase_, Ago, E3 +ror Ase_, Ase_, #9 +eor Ago_, Ame, E1 +ror Ago_, Ago_, #19 +eor Ake_, Agi, E2 +ror Ake_, Ake_, #58 +eor Agi_, Aka, E0 +ror Agi_, Agi_, #61 +eor Aga_, Abo, E3 +ror Aga_, Aga_, #36 +eor Abo_, Amo, E3 +ror Abo_, Abo_, #43 +eor Amo_, Ami, E2 +ror Amo_, Amo_, #49 +eor Ami_, Ake, E1 +ror Ami_, Ami_, #54 +eor Age_, Agu, E4 +ror Age_, Age_, #44 +eor Agu_, Asi, E2 +ror Agu_, Agu_, #3 +eor Asi_, Aku, E4 +ror Asi_, Asi_, #25 +eor Aku_, Asa, E0 +ror Aku_, Aku_, #46 +eor Ama_, Abu, E4 +ror Ama_, Ama_, #37 +eor Abu_, Asu, E4 +ror Abu_, Abu_, #50 +eor Asu_, Ase, E1 +ror Asu_, Asu_, #62 +eor Ame_, Aga, E0 +ror Ame_, Ame_, #28 + +eor Abe_, Age, E1 +ror Abe_, Abe_, #20 + +// chi step +// Row 1 +bic tmp, Agi_, Age_ +eor Aga, tmp, Aga_ +bic tmp, Ago_, Agi_ +eor Age, tmp, Age_ +bic tmp, Agu_, Ago_ +eor Agi, tmp, Agi_ +bic tmp, Aga_, Agu_ +eor Ago, tmp, Ago_ +bic tmp, Age_, Aga_ +eor Agu, tmp, Agu_ +// Row 2 +bic tmp, Aki_, Ake_ +eor Aka, tmp, Aka_ +bic tmp, Ako_, Aki_ +eor Ake, tmp, Ake_ +bic tmp, Aku_, Ako_ +eor Aki, tmp, Aki_ +bic tmp, Aka_, Aku_ +eor Ako, tmp,
Ako_ +bic tmp, Ake_, Aka_ +eor Aku, tmp, Aku_ +// Row 3 +bic tmp, Ami_, Ame_ +eor Ama, tmp, Ama_ +bic tmp, Amo_, Ami_ +eor Ame, tmp, Ame_ +bic tmp, Amu_, Amo_ +eor Ami, tmp, Ami_ +bic tmp, Ama_, Amu_ +eor Amo, tmp, Amo_ +bic tmp, Ame_, Ama_ +eor Amu, tmp, Amu_ +// Row 4 +bic tmp, Asi_, Ase_ +eor Asa, tmp, Asa_ +bic tmp, Aso_, Asi_ +eor Ase, tmp, Ase_ +bic tmp, Asu_, Aso_ +eor Asi, tmp, Asi_ +bic tmp, Asa_, Asu_ +eor Aso, tmp, Aso_ +bic tmp, Ase_, Asa_ +eor Asu, tmp, Asu_ +// Row 0 +bic tmp, Abi_, Abe_ +eor Aba, tmp, Aba_ +bic tmp, Abo_, Abi_ +eor Abe, tmp, Abe_ +bic tmp, Abu_, Abo_ +eor Abi, tmp, Abi_ +bic tmp, Aba_, Abu_ +eor Abo, tmp, Abo_ +bic tmp, Abe_, Aba_ +eor Abu, tmp, Abu_ + + restore const_addr, STACK_OFFSET_CONST + ldr cur_const, [const_addr], #8 + eor Aba, Aba, cur_const + save const_addr, STACK_OFFSET_CONST + + restore count, STACK_OFFSET_COUNT +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.text +.balign 16 +.global keccak_f1600_x1_scalar_asm_v1 +.global _keccak_f1600_x1_scalar_asm_v1 + +keccak_f1600_x1_scalar_asm_v1: +_keccak_f1600_x1_scalar_asm_v1: + alloc_stack + save_gprs + load_constant_ptr + save const_addr, STACK_OFFSET_CONST + load_input + save input_addr, STACK_OFFSET_INPUT + + mov count, #0 +loop: + keccak_f1600_round + add count, count, #1 + cmp count, #(KECCAK_F1600_ROUNDS-1) + ble loop + + restore input_addr, STACK_OFFSET_INPUT + store_input + restore_gprs + free_stack + ret diff --git a/tests/keccak_neon/manual/keccak_f1600_x1_scalar_asm_v2.s b/tests/keccak_neon/manual/keccak_f1600_x1_scalar_asm_v2.s new file mode 100644 index 0000000..68f2c71 --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x1_scalar_asm_v2.s @@ -0,0 +1,505 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software 
without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .balign 64 +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +/* 24 eight-byte round constants; presumably the table that load_constant_ptr (macros.s, not shown here) makes const_addr point at - confirm */ +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x29 + count .req w27 + cur_const .req x26 + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each 
round. */ + Aba .req x1 + Abe .req x6 + Abi .req x11 + Abo .req x16 + Abu .req x21 + Aga .req x2 + Age .req x7 + Agi .req x12 + Ago .req x17 + Agu .req x22 + Aka .req x3 + Ake .req x8 + Aki .req x13 + Ako .req x18 + Aku .req x23 + Ama .req x4 + Ame .req x9 + Ami .req x14 + Amo .req x19 + Amu .req x24 + Asa .req x5 + Ase .req x10 + Asi .req x15 + Aso .req x20 + Asu .req x25 +/* NOTE(review): Ako and Ako_ live in x18, which AAPCS64 designates the platform register and some OS ABIs reserve - confirm it is usable on every target */ + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + Aba_ .req x0 + Abe_ .req x28 + Abi_ .req x11 + Abo_ .req x16 + Abu_ .req x21 + Aga_ .req x3 + Age_ .req x8 + Agi_ .req x12 + Ago_ .req x17 + Agu_ .req x22 + Aka_ .req x4 + Ake_ .req x9 + Aki_ .req x13 + Ako_ .req x18 + Aku_ .req x23 + Ama_ .req x5 + Ame_ .req x10 + Ami_ .req x14 + Amo_ .req x19 + Amu_ .req x24 + Asa_ .req x1 + Ase_ .req x6 + Asi_ .req x15 + Aso_ .req x20 + Asu_ .req x25 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + C0 .req x0 + E0 .req x29 + C1 .req x26 + E1 .req x30 + C2 .req x27 + E2 .req x26 + C3 .req x28 + E3 .req x27 + C4 .req x29 + E4 .req x28 +/* Scratch aliasing: C2/E3 share x27 with count (w27), C1/E2 share x26 with cur_const, C4/E0 share x29 with const_addr, C0/Aba_ share x0 with input_addr */ + tmp .req x30 + +/************************ MACROS ****************************/ +/* load_input and store_input move the 25 lanes between their registers and consecutive 8-byte slots at input_addr, in the order Aba, Abe, Abi, Abo, Abu, Aga, ..., Asu */ +.macro load_input + ldp Aba, Abe, [input_addr, #(1*8*0)] + ldp Abi, Abo, [input_addr, #(1*8*2)] + ldp Abu, Aga, [input_addr, #(1*8*4)] + ldp Age, Agi, [input_addr, #(1*8*6)] + ldp Ago, Agu, [input_addr, #(1*8*8)] + ldp Aka, Ake, [input_addr, #(1*8*10)] + ldp Aki, Ako, [input_addr, #(1*8*12)] + ldp Aku, Ama, [input_addr, #(1*8*14)] + ldp Ame, Ami, [input_addr, #(1*8*16)] + ldp Amo, Amu, [input_addr, #(1*8*18)] + ldp Asa, Ase, [input_addr, #(1*8*20)] + ldp Asi, Aso, [input_addr, #(1*8*22)] + ldr Asu, [input_addr, #(1*8*24)] +.endm + +.macro store_input + stp Aba, Abe, [input_addr, #(1*8*0)] + stp Abi, Abo, [input_addr, #(1*8*2)] + stp Abu, Aga, [input_addr, #(1*8*4)] + stp Age, Agi, [input_addr, #(1*8*6)] + stp Ago, Agu, [input_addr, #(1*8*8)] + stp Aka, Ake, [input_addr, #(1*8*10)] + stp Aki, Ako, [input_addr, 
#(1*8*12)] + stp Aku, Ama, [input_addr, #(1*8*14)] + stp Ame, Ami, [input_addr, #(1*8*16)] + stp Amo, Amu, [input_addr, #(1*8*18)] + stp Asa, Ase, [input_addr, #(1*8*20)] + stp Asi, Aso, [input_addr, #(1*8*22)] + str Asu, [input_addr, #(1*8*24)] +.endm +/* Stack frame, as offsets from sp: 0 input pointer, 8 constant pointer, 16 round counter, 24 padding, 32 and up x19-x30 */ +#define STACK_SIZE (16*6 + 3*8 + 8) // GPRs (16*6), count (8), const (8), input (8), padding (8) +#define STACK_BASE_GPRS (3*8+8) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro save reg, offset + str \reg, [sp, #\offset] +.endm + +.macro restore reg, offset + ldr \reg, [sp, #\offset] +.endm +/* save_gprs and restore_gprs spill and reload x19-x30 as six register pairs starting at STACK_BASE_GPRS */ +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro keccak_f1600_round_initial +/* Round 0: column parities C0..C4 over the lanes as loaded, with no ROR on any operand */ + eor C0, Ama, Asa + eor C1, Ame, Ase + eor C2, Ami, Asi + eor C3, Amo, Aso + eor C4, Amu, Asu + eor C0, Aka, C0 + eor C1, Ake, C1 + eor C2, Aki, C2 + eor C3, Ako, C3 + eor C4, Aku, C4 + eor C0, Aga, C0 + eor C1, Age, C1 + eor C2, Agi, C2 + eor C3, Ago, C3 + eor C4, Agu, C4 + eor C0, Aba, C0 + eor C1, Abe, C1 + eor C2, Abi, C2 + eor C3, Abo, C3 + eor C4, Abu, C4 + + eor E1, C0, C2, ROR #63 + eor E3, C2, C4, ROR #63 + eor E0, C4, C1, ROR #63 + eor E2, C1, C3, ROR #63 + eor E4, C3, C0, ROR #63 + + eor Aba_, Aba, E0 + eor Asa_, Abi, E2 + eor Abi_, Aki, E2 + eor Aki_, Ako, E3 + eor Ako_, Amu, E4 + 
eor Amu_, Aso, E3 + eor Aso_, Ama, E0 + eor Aka_, Abe, E1 + eor Ase_, Ago, E3 + eor Ago_, Ame, E1 + eor Ake_, Agi, E2 + eor Agi_, Aka, E0 + eor Aga_, Abo, E3 + eor Abo_, Amo, E3 + eor Amo_, Ami, E2 + eor Ami_, Ake, E1 + eor Age_, Agu, E4 + eor Agu_, Asi, E2 + eor Asi_, Aku, E4 + eor Aku_, Asa, E0 + eor Ama_, Abu, E4 + eor Abu_, Asu, E4 + eor Asu_, Ase, E1 + eor Ame_, Aga, E0 + eor Abe_, Age, E1 + + load_constant_ptr +/* bic/eor row step: the rotation amounts are applied through the ROR-shifted operands, this macro contains no standalone ror */ + bic tmp, Agi_, Age_, ROR #47 + eor Aga, tmp, Aga_, ROR #39 + bic tmp, Ago_, Agi_, ROR #42 + eor Age, tmp, Age_, ROR #25 + bic tmp, Agu_, Ago_, ROR #16 + eor Agi, tmp, Agi_, ROR #58 + bic tmp, Aga_, Agu_, ROR #31 + eor Ago, tmp, Ago_, ROR #47 + bic tmp, Age_, Aga_, ROR #56 + eor Agu, tmp, Agu_, ROR #23 + bic tmp, Aki_, Ake_, ROR #19 + eor Aka, tmp, Aka_, ROR #24 + bic tmp, Ako_, Aki_, ROR #47 + eor Ake, tmp, Ake_, ROR #2 + bic tmp, Aku_, Ako_, ROR #10 + eor Aki, tmp, Aki_, ROR #57 + bic tmp, Aka_, Aku_, ROR #47 + eor Ako, tmp, Ako_, ROR #57 + bic tmp, Ake_, Aka_, ROR #5 + eor Aku, tmp, Aku_, ROR #52 + bic tmp, Ami_, Ame_, ROR #38 + eor Ama, tmp, Ama_, ROR #47 + bic tmp, Amo_, Ami_, ROR #5 + eor Ame, tmp, Ame_, ROR #43 + bic tmp, Amu_, Amo_, ROR #41 + eor Ami, tmp, Ami_, ROR #46 +/* Load the first round constant and set count to 1, the index used by the next round */ + ldr cur_const, [const_addr] + mov count, #1 + + bic tmp, Ama_, Amu_, ROR #35 + eor Amo, tmp, Amo_, ROR #12 + bic tmp, Ame_, Ama_, ROR #9 + eor Amu, tmp, Amu_, ROR #44 + bic tmp, Asi_, Ase_, ROR #48 + eor Asa, tmp, Asa_, ROR #41 + bic tmp, Aso_, Asi_, ROR #2 + eor Ase, tmp, Ase_, ROR #50 + bic tmp, Asu_, Aso_, ROR #25 + eor Asi, tmp, Asi_, ROR #27 + bic tmp, Asa_, Asu_, ROR #60 + eor Aso, tmp, Aso_, ROR #21 + bic tmp, Ase_, Asa_, ROR #57 + eor Asu, tmp, Asu_, ROR #53 + bic tmp, Abi_, Abe_, ROR #63 + eor Aba, Aba_, tmp, ROR #21 + bic tmp, Abo_, Abi_, ROR #42 + eor Abe, tmp, Abe_, ROR #41 + bic tmp, Abu_, Abo_, ROR #57 + eor Abi, tmp, Abi_, ROR #35 + bic tmp, Aba_, Abu_, ROR #50 + eor Abo, tmp, Abo_, ROR #43 + bic tmp, Abe_, Aba_, ROR #44 + eor Abu, tmp, Abu_, ROR #30 + + eor 
Aba, Aba, cur_const + +.endm + +/* Rounds after the first: same sequence of steps, but the column parities and lane XORs also take ROR-shifted operands */ +.macro keccak_f1600_round_noninitial +/* count (w27) is overwritten below by C2 and E3 (x27), so it is kept on the stack meanwhile */ + save count, STACK_OFFSET_COUNT + + eor C0, Aka, Asa, ROR #50 + eor C1, Ase, Age, ROR #60 + eor C2, Ami, Agi, ROR #59 + eor C3, Ago, Aso, ROR #30 + eor C4, Abu, Asu, ROR #53 + eor C0, Ama, C0, ROR #49 + eor C1, Abe, C1, ROR #44 + eor C2, Aki, C2, ROR #26 + eor C3, Amo, C3, ROR #63 + eor C4, Amu, C4, ROR #56 + eor C0, Aga, C0, ROR #57 + eor C1, Ame, C1, ROR #58 + eor C2, Abi, C2, ROR #60 + eor C3, Ako, C3, ROR #38 + eor C4, Agu, C4, ROR #48 + eor C0, Aba, C0, ROR #61 + eor C1, Ake, C1, ROR #57 + eor C2, Asi, C2, ROR #52 + eor C3, Abo, C3, ROR #63 + eor C4, Aku, C4, ROR #50 + ror C1, C1, 56 + ror C4, C4, 58 + ror C2, C2, 62 + + eor E1, C0, C2, ROR #63 + eor E3, C2, C4, ROR #63 + eor E0, C4, C1, ROR #63 + eor E2, C1, C3, ROR #63 + eor E4, C3, C0, ROR #63 + + eor Aba_, E0, Aba + eor Asa_, E2, Abi, ROR #50 + eor Abi_, E2, Aki, ROR #46 + eor Aki_, E3, Ako, ROR #63 + eor Ako_, E4, Amu, ROR #28 + eor Amu_, E3, Aso, ROR #2 + eor Aso_, E0, Ama, ROR #54 + eor Aka_, E1, Abe, ROR #43 + eor Ase_, E3, Ago, ROR #36 + eor Ago_, E1, Ame, ROR #49 + eor Ake_, E2, Agi, ROR #3 + eor Agi_, E0, Aka, ROR #39 + eor Aga_, E3, Abo + eor Abo_, E3, Amo, ROR #37 + eor Amo_, E2, Ami, ROR #8 + eor Ami_, E1, Ake, ROR #56 + eor Age_, E4, Agu, ROR #44 + eor Agu_, E2, Asi, ROR #62 + eor Asi_, E4, Aku, ROR #58 + eor Aku_, E0, Asa, ROR #25 + eor Ama_, E4, Abu, ROR #20 + eor Abu_, E4, Asu, ROR #9 + eor Asu_, E1, Ase, ROR #23 + eor Ame_, E0, Aga, ROR #61 + eor Abe_, E1, Age, ROR #19 +/* C and E words are not read past this point: re-run load_constant_ptr for const_addr (x29 held C4 and E0) and reload count */ + load_constant_ptr + restore count, STACK_OFFSET_COUNT + + bic tmp, Agi_, Age_, ROR #47 + eor Aga, tmp, Aga_, ROR #39 + bic tmp, Ago_, Agi_, ROR #42 + eor Age, tmp, Age_, ROR #25 + bic tmp, Agu_, Ago_, ROR #16 + eor Agi, tmp, Agi_, ROR #58 + bic tmp, Aga_, Agu_, ROR #31 + eor Ago, tmp, Ago_, ROR #47 + bic tmp, Age_, Aga_, ROR #56 + eor Agu, tmp, Agu_, ROR #23 + bic tmp, Aki_, Ake_, ROR #19 + eor Aka, tmp, Aka_, ROR #24 + bic tmp, Ako_, Aki_, ROR #47 + 
eor Ake, tmp, Ake_, ROR #2 + bic tmp, Aku_, Ako_, ROR #10 + eor Aki, tmp, Aki_, ROR #57 + bic tmp, Aka_, Aku_, ROR #47 + eor Ako, tmp, Ako_, ROR #57 + bic tmp, Ake_, Aka_, ROR #5 + eor Aku, tmp, Aku_, ROR #52 + bic tmp, Ami_, Ame_, ROR #38 + eor Ama, tmp, Ama_, ROR #47 + bic tmp, Amo_, Ami_, ROR #5 + eor Ame, tmp, Ame_, ROR #43 + bic tmp, Amu_, Amo_, ROR #41 + eor Ami, tmp, Ami_, ROR #46 + bic tmp, Ama_, Amu_, ROR #35 +/* Load the 8-byte constant at index count from const_addr (UXTW #3 scales count by 8), then advance count */ + ldr cur_const, [const_addr, count, UXTW #3] + add count, count, #1 + + eor Amo, tmp, Amo_, ROR #12 + bic tmp, Ame_, Ama_, ROR #9 + eor Amu, tmp, Amu_, ROR #44 + bic tmp, Asi_, Ase_, ROR #48 + eor Asa, tmp, Asa_, ROR #41 + bic tmp, Aso_, Asi_, ROR #2 + eor Ase, tmp, Ase_, ROR #50 + bic tmp, Asu_, Aso_, ROR #25 + eor Asi, tmp, Asi_, ROR #27 + bic tmp, Asa_, Asu_, ROR #60 + eor Aso, tmp, Aso_, ROR #21 + bic tmp, Ase_, Asa_, ROR #57 + eor Asu, tmp, Asu_, ROR #53 + bic tmp, Abi_, Abe_, ROR #63 + eor Aba, Aba_, tmp, ROR #21 + bic tmp, Abo_, Abi_, ROR #42 + eor Abe, tmp, Abe_, ROR #41 + bic tmp, Abu_, Abo_, ROR #57 + eor Abi, tmp, Abi_, ROR #35 + bic tmp, Aba_, Abu_, ROR #50 + eor Abo, tmp, Abo_, ROR #43 + bic tmp, Abe_, Aba_, ROR #44 + eor Abu, tmp, Abu_, ROR #30 + + eor Aba, Aba, cur_const + +.endm +/* final_rotate: rotate 23 lanes (all except Aba and Abo) left by the listed amounts, encoded as ror by 64 minus n; presumably this settles rotations the round macros leave pending - confirm */ +.macro final_rotate + ror Aga, Aga,#(64-3) + ror Aka, Aka,#(64-25) + ror Ama, Ama,#(64-10) + ror Asa, Asa,#(64-39) + ror Abe, Abe,#(64-21) + ror Age, Age,#(64-45) + ror Ake, Ake,#(64-8) + ror Ame, Ame,#(64-15) + ror Ase, Ase,#(64-41) + ror Abi, Abi,#(64-14) + ror Agi, Agi,#(64-61) + ror Aki, Aki,#(64-18) + ror Ami, Ami,#(64-56) + ror Asi, Asi,#(64-2) + ror Ago, Ago,#(64-28) + ror Ako, Ako,#(64-1) + ror Amo, Amo,#(64-27) + ror Aso, Aso,#(64-62) + ror Abu, Abu,#(64-44) + ror Agu, Agu,#(64-20) + ror Aku, Aku,#(64-6) + ror Amu, Amu,#(64-36) + ror Asu, Asu,#(64-55) +.endm + + +#define KECCAK_F1600_ROUNDS 24 + +.text +.balign 16 +.global keccak_f1600_x1_scalar_asm_v2 +.global _keccak_f1600_x1_scalar_asm_v2 + +keccak_f1600_x1_scalar_asm_v2: 
+_keccak_f1600_x1_scalar_asm_v2: + alloc_stack + save_gprs + load_input + save input_addr, STACK_OFFSET_INPUT +/* The initial round runs once and leaves count at 1; the loop repeats the non-initial round while count is at most KECCAK_F1600_ROUNDS-1, 24 rounds in total */ + keccak_f1600_round_initial +loop: + keccak_f1600_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-1) + ble loop + + final_rotate + + restore input_addr, STACK_OFFSET_INPUT + store_input + restore_gprs + free_stack + ret diff --git a/tests/keccak_neon/manual/keccak_f1600_x1_scalar_asm_v3.s b/tests/keccak_neon/manual/keccak_f1600_x1_scalar_asm_v3.s new file mode 100644 index 0000000..9a5d04b --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x1_scalar_asm_v3.s @@ -0,0 +1,494 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .balign 64 +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +/* 24 eight-byte round constants; presumably the table that load_constant_ptr (macros.s, not shown here) makes const_addr point at - confirm */ +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x29 + count .req w27 + cur_const .req x26 + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round. 
*/ + Aba .req x1 + Abe .req x6 + Abi .req x11 + Abo .req x16 + Abu .req x21 + Aga .req x2 + Age .req x7 + Agi .req x12 + Ago .req x17 + Agu .req x22 + Aka .req x3 + Ake .req x8 + Aki .req x13 + Ako .req x18 + Aku .req x23 + Ama .req x4 + Ame .req x9 + Ami .req x14 + Amo .req x19 + Amu .req x24 + Asa .req x5 + Ase .req x10 + Asi .req x15 + Aso .req x20 + Asu .req x25 +/* NOTE(review): Ako and Ako_ live in x18, which AAPCS64 designates the platform register and some OS ABIs reserve - confirm it is usable on every target */ + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + Aba_ .req x30 + Abe_ .req x28 + Abi_ .req x11 + Abo_ .req x16 + Abu_ .req x21 + Aga_ .req x3 + Age_ .req x8 + Agi_ .req x12 + Ago_ .req x17 + Agu_ .req x22 + Aka_ .req x4 + Ake_ .req x9 + Aki_ .req x13 + Ako_ .req x18 + Aku_ .req x23 + Ama_ .req x5 + Ame_ .req x10 + Ami_ .req x14 + Amo_ .req x19 + Amu_ .req x24 + Asa_ .req x1 + Ase_ .req x6 + Asi_ .req x15 + Aso_ .req x20 + Asu_ .req x25 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + C0 .req x30 + E0 .req x29 + C1 .req x26 + E1 .req x0 + C2 .req x27 + E2 .req x26 + C3 .req x28 + E3 .req x27 + C4 .req x29 + E4 .req x28 +/* Scratch aliasing: C2/E3 share x27 with count (w27), C1/E2 share x26 with cur_const, C4/E0 share x29 with const_addr, E1/tmp share x0 with input_addr, C0/Aba_ share x30 */ + tmp .req x0 + +/************************ MACROS ****************************/ +/* Stack frame, as offsets from sp: 0 input pointer, 8 constant pointer, 16 round counter, 24 padding, 32 and up x19-x30 */ +#define STACK_SIZE (16*6 + 3*8 + 8) // GPRs (16*6), count (8), const (8), input (8), padding (8) +#define STACK_BASE_GPRS (3*8+8) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro save reg, offset + str \reg, [sp, #\offset] +.endm + +.macro restore reg, offset + ldr \reg, [sp, #\offset] +.endm +/* save_gprs and restore_gprs spill and reload x19-x30 as six register pairs starting at STACK_BASE_GPRS */ +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, 
#(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm +/* Round 0, with the loads of the 25 lanes interleaved into the parity computation; input_addr (x0) is saved before E1, which also lives in x0, is first written */ +.macro keccak_f1600_round_initial + ldp Aku, Ama, [input_addr, #(1*8*14)] + ldp Asa, Ase, [input_addr, #(1*8*20)] + eor C0, Ama, Asa + ldp Ame, Ami, [input_addr, #(1*8*16)] + eor C1, Ame, Ase + ldp Asi, Aso, [input_addr, #(1*8*22)] + eor C2, Ami, Asi + ldp Amo, Amu, [input_addr, #(1*8*18)] + eor C3, Amo, Aso + ldr Asu, [input_addr, #(1*8*24)] + eor C4, Amu, Asu + ldp Aka, Ake, [input_addr, #(1*8*10)] + eor C0, Aka, C0 + eor C1, Ake, C1 + ldp Aki, Ako, [input_addr, #(1*8*12)] + eor C2, Aki, C2 + ldp Abu, Aga, [input_addr, #(1*8*4)] + eor C3, Ako, C3 + eor C4, Aku, C4 + ldp Age, Agi, [input_addr, #(1*8*6)] + eor C0, Aga, C0 + ldp Ago, Agu, [input_addr, #(1*8*8)] + eor C1, Age, C1 + ldp Aba, Abe, [input_addr, #(1*8*0)] + eor C2, Agi, C2 + ldp Abi, Abo, [input_addr, #(1*8*2)] + eor C3, Ago, C3 + save input_addr, STACK_OFFSET_INPUT + eor C4, Agu, C4 + eor C0, Aba, C0 + eor C1, Abe, C1 + eor C2, Abi, C2 + eor C3, Abo, C3 + eor C4, Abu, C4 + + eor E1, C0, C2, ROR #63 + eor E3, C2, C4, ROR #63 + eor E0, C4, C1, ROR #63 + eor E2, C1, C3, ROR #63 + eor E4, C3, C0, ROR #63 + + eor Aba_, Aba, E0 + eor Asa_, Abi, E2 + eor Abi_, Aki, E2 + eor Aki_, Ako, E3 + eor Ako_, Amu, E4 + eor Amu_, Aso, E3 + eor Aso_, Ama, E0 + eor Aka_, Abe, E1 + eor Ase_, Ago, E3 + eor Ago_, Ame, E1 + eor Ake_, Agi, E2 + eor Agi_, Aka, E0 + eor Aga_, Abo, E3 + eor Abo_, Amo, E3 + eor Amo_, Ami, E2 + eor Ami_, Ake, E1 + eor Age_, Agu, E4 + eor Agu_, Asi, E2 + eor Asi_, Aku, E4 + eor Aku_, Asa, E0 + eor Ama_, Abu, E4 + eor Abu_, Asu, E4 + eor Asu_, Ase, E1 + eor Ame_, Aga, E0 + eor Abe_, Age, E1 + + load_constant_ptr +/* bic/eor row step: the rotation amounts are applied through the ROR-shifted operands, this macro contains no standalone ror */ + bic tmp, Agi_, Age_, ROR #47 + eor Aga, tmp, Aga_, ROR #39 + bic tmp, Ago_, Agi_, ROR #42 + eor Age, tmp, Age_, ROR 
#25 + bic tmp, Agu_, Ago_, ROR #16 + eor Agi, tmp, Agi_, ROR #58 + bic tmp, Aga_, Agu_, ROR #31 + eor Ago, tmp, Ago_, ROR #47 + bic tmp, Age_, Aga_, ROR #56 + eor Agu, tmp, Agu_, ROR #23 + bic tmp, Aki_, Ake_, ROR #19 + eor Aka, tmp, Aka_, ROR #24 + bic tmp, Ako_, Aki_, ROR #47 + eor Ake, tmp, Ake_, ROR #2 + bic tmp, Aku_, Ako_, ROR #10 + eor Aki, tmp, Aki_, ROR #57 + bic tmp, Aka_, Aku_, ROR #47 + eor Ako, tmp, Ako_, ROR #57 + bic tmp, Ake_, Aka_, ROR #5 + eor Aku, tmp, Aku_, ROR #52 + bic tmp, Ami_, Ame_, ROR #38 + eor Ama, tmp, Ama_, ROR #47 + bic tmp, Amo_, Ami_, ROR #5 + eor Ame, tmp, Ame_, ROR #43 + bic tmp, Amu_, Amo_, ROR #41 + eor Ami, tmp, Ami_, ROR #46 +/* Load the first round constant and set count to 1, the index used by the next round */ + ldr cur_const, [const_addr] + mov count, #1 + + bic tmp, Ama_, Amu_, ROR #35 + eor Amo, tmp, Amo_, ROR #12 + bic tmp, Ame_, Ama_, ROR #9 + eor Amu, tmp, Amu_, ROR #44 + bic tmp, Asi_, Ase_, ROR #48 + eor Asa, tmp, Asa_, ROR #41 + bic tmp, Aso_, Asi_, ROR #2 + eor Ase, tmp, Ase_, ROR #50 + bic tmp, Asu_, Aso_, ROR #25 + eor Asi, tmp, Asi_, ROR #27 + bic tmp, Asa_, Asu_, ROR #60 + eor Aso, tmp, Aso_, ROR #21 + bic tmp, Ase_, Asa_, ROR #57 + eor Asu, tmp, Asu_, ROR #53 + bic tmp, Abi_, Abe_, ROR #63 + eor Aba, Aba_, tmp, ROR #21 + bic tmp, Abo_, Abi_, ROR #42 + eor Abe, tmp, Abe_, ROR #41 + bic tmp, Abu_, Abo_, ROR #57 + eor Abi, tmp, Abi_, ROR #35 + bic tmp, Aba_, Abu_, ROR #50 + eor Abo, tmp, Abo_, ROR #43 + bic tmp, Abe_, Aba_, ROR #44 + eor Abu, tmp, Abu_, ROR #30 + + eor Aba, Aba, cur_const + +.endm + +/* Rounds after the first: same sequence of steps, but the column parities and lane XORs also take ROR-shifted operands */ +.macro keccak_f1600_round_noninitial +/* count (w27) is overwritten below by C2 and E3 (x27), so it is kept on the stack meanwhile */ + save count, STACK_OFFSET_COUNT + + eor C0, Aka, Asa, ROR #50 + eor C1, Ase, Age, ROR #60 + eor C2, Ami, Agi, ROR #59 + eor C3, Ago, Aso, ROR #30 + eor C4, Abu, Asu, ROR #53 + eor C0, Ama, C0, ROR #49 + eor C1, Abe, C1, ROR #44 + eor C2, Aki, C2, ROR #26 + eor C3, Amo, C3, ROR #63 + eor C4, Amu, C4, ROR #56 + eor C0, Aga, C0, ROR #57 + eor C1, Ame, C1, ROR #58 + eor C2, Abi, C2, ROR #60 + eor C3, Ako, C3, ROR #38 + eor C4, Agu, C4, ROR #48 + eor C0, Aba, C0, 
ROR #61 + eor C1, Ake, C1, ROR #57 + eor C2, Asi, C2, ROR #52 + eor C3, Abo, C3, ROR #63 + eor C4, Aku, C4, ROR #50 + ror C1, C1, 56 + ror C4, C4, 58 + ror C2, C2, 62 + + eor E1, C0, C2, ROR #63 + eor E3, C2, C4, ROR #63 + eor E0, C4, C1, ROR #63 + eor E2, C1, C3, ROR #63 + eor E4, C3, C0, ROR #63 + + eor Aba_, E0, Aba + eor Asa_, E2, Abi, ROR #50 + eor Abi_, E2, Aki, ROR #46 + eor Aki_, E3, Ako, ROR #63 + eor Ako_, E4, Amu, ROR #28 + eor Amu_, E3, Aso, ROR #2 + eor Aso_, E0, Ama, ROR #54 + eor Aka_, E1, Abe, ROR #43 + eor Ase_, E3, Ago, ROR #36 + eor Ago_, E1, Ame, ROR #49 + eor Ake_, E2, Agi, ROR #3 + eor Agi_, E0, Aka, ROR #39 + eor Aga_, E3, Abo + eor Abo_, E3, Amo, ROR #37 + eor Amo_, E2, Ami, ROR #8 + eor Ami_, E1, Ake, ROR #56 + eor Age_, E4, Agu, ROR #44 + eor Agu_, E2, Asi, ROR #62 + eor Asi_, E4, Aku, ROR #58 + eor Aku_, E0, Asa, ROR #25 + eor Ama_, E4, Abu, ROR #20 + eor Abu_, E4, Asu, ROR #9 + eor Asu_, E1, Ase, ROR #23 + eor Ame_, E0, Aga, ROR #61 + eor Abe_, E1, Age, ROR #19 +/* C and E words are not read past this point: re-run load_constant_ptr for const_addr (x29 held C4 and E0) and reload count */ + load_constant_ptr + restore count, STACK_OFFSET_COUNT + + bic tmp, Agi_, Age_, ROR #47 + eor Aga, tmp, Aga_, ROR #39 + bic tmp, Ago_, Agi_, ROR #42 + eor Age, tmp, Age_, ROR #25 + bic tmp, Agu_, Ago_, ROR #16 + eor Agi, tmp, Agi_, ROR #58 + bic tmp, Aga_, Agu_, ROR #31 + eor Ago, tmp, Ago_, ROR #47 + bic tmp, Age_, Aga_, ROR #56 + eor Agu, tmp, Agu_, ROR #23 + bic tmp, Aki_, Ake_, ROR #19 + eor Aka, tmp, Aka_, ROR #24 + bic tmp, Ako_, Aki_, ROR #47 + eor Ake, tmp, Ake_, ROR #2 + bic tmp, Aku_, Ako_, ROR #10 + eor Aki, tmp, Aki_, ROR #57 + bic tmp, Aka_, Aku_, ROR #47 + eor Ako, tmp, Ako_, ROR #57 + bic tmp, Ake_, Aka_, ROR #5 + eor Aku, tmp, Aku_, ROR #52 + bic tmp, Ami_, Ame_, ROR #38 + eor Ama, tmp, Ama_, ROR #47 + bic tmp, Amo_, Ami_, ROR #5 + eor Ame, tmp, Ame_, ROR #43 + bic tmp, Amu_, Amo_, ROR #41 + eor Ami, tmp, Ami_, ROR #46 + bic tmp, Ama_, Amu_, ROR #35 +/* Load the 8-byte constant at index count from const_addr (UXTW #3 scales count by 8), then advance count */ + ldr cur_const, [const_addr, count, UXTW #3] + add count, count, #1 + + eor Amo, tmp, Amo_, ROR #12 + bic tmp, 
Ame_, Ama_, ROR #9 + eor Amu, tmp, Amu_, ROR #44 + bic tmp, Asi_, Ase_, ROR #48 + eor Asa, tmp, Asa_, ROR #41 + bic tmp, Aso_, Asi_, ROR #2 + eor Ase, tmp, Ase_, ROR #50 + bic tmp, Asu_, Aso_, ROR #25 + eor Asi, tmp, Asi_, ROR #27 + bic tmp, Asa_, Asu_, ROR #60 + eor Aso, tmp, Aso_, ROR #21 + bic tmp, Ase_, Asa_, ROR #57 + eor Asu, tmp, Asu_, ROR #53 + bic tmp, Abi_, Abe_, ROR #63 + eor Aba, Aba_, tmp, ROR #21 + bic tmp, Abo_, Abi_, ROR #42 + eor Abe, tmp, Abe_, ROR #41 + bic tmp, Abu_, Abo_, ROR #57 + eor Abi, tmp, Abi_, ROR #35 + bic tmp, Aba_, Abu_, ROR #50 + eor Abo, tmp, Abo_, ROR #43 + bic tmp, Abe_, Aba_, ROR #44 + eor Abu, tmp, Abu_, ROR #30 + + eor Aba, Aba, cur_const + +.endm +/* final_rotate_store: reload input_addr from the stack, rotate 23 lanes (all except Aba and Abo) via ror by 64 minus n, and store all 25 lanes back, stores interleaved with the rotations */ +.macro final_rotate_store + ror Aga, Aga,#(64-3) + restore input_addr, STACK_OFFSET_INPUT + ror Abu, Abu,#(64-44) + ror Aka, Aka,#(64-25) + ror Ake, Ake,#(64-8) + stp Abu, Aga, [input_addr, #(1*8*4)] + ror Ama, Ama,#(64-10) + ror Aku, Aku,#(64-6) + stp Aka, Ake, [input_addr, #(1*8*10)] + ror Asa, Asa,#(64-39) + ror Ase, Ase,#(64-41) + stp Aku, Ama, [input_addr, #(1*8*14)] + ror Abe, Abe,#(64-21) + ror Age, Age,#(64-45) + stp Asa, Ase, [input_addr, #(1*8*20)] + ror Agi, Agi,#(64-61) + stp Aba, Abe, [input_addr, #(1*8*0)] + ror Ame, Ame,#(64-15) + ror Ami, Ami,#(64-56) + stp Age, Agi, [input_addr, #(1*8*6)] + ror Abi, Abi,#(64-14) + ror Aki, Aki,#(64-18) + stp Ame, Ami, [input_addr, #(1*8*16)] + ror Ako, Ako,#(64-1) + stp Abi, Abo, [input_addr, #(1*8*2)] + ror Asi, Asi,#(64-2) + ror Aso, Aso,#(64-62) + stp Aki, Ako, [input_addr, #(1*8*12)] + ror Ago, Ago,#(64-28) + ror Agu, Agu,#(64-20) + stp Asi, Aso, [input_addr, #(1*8*22)] + ror Amo, Amo,#(64-27) + ror Amu, Amu,#(64-36) + stp Ago, Agu, [input_addr, #(1*8*8)] + ror Asu, Asu,#(64-55) + stp Amo, Amu, [input_addr, #(1*8*18)] + str Asu, [input_addr, #(1*8*24)] +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.text +.balign 16 +.global keccak_f1600_x1_scalar_asm_v3 +.global _keccak_f1600_x1_scalar_asm_v3 + +keccak_f1600_x1_scalar_asm_v3: 
+_keccak_f1600_x1_scalar_asm_v3: + alloc_stack + save_gprs +/* input_addr (x0) is consumed directly by keccak_f1600_round_initial, which loads the lanes and saves the pointer; 24 rounds run in total (1 initial, then the loop while count is at most 23) */ + keccak_f1600_round_initial +loop: + keccak_f1600_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-1) + ble loop + + final_rotate_store + restore_gprs + free_stack + ret diff --git a/tests/keccak_neon/manual/keccak_f1600_x1_scalar_asm_v4.s b/tests/keccak_neon/manual/keccak_f1600_x1_scalar_asm_v4.s new file mode 100644 index 0000000..95f2275 --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x1_scalar_asm_v4.s @@ -0,0 +1,495 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .balign 64 +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +/* 24 eight-byte round constants; presumably the table that load_constant_ptr (macros.s, not shown here) makes const_addr point at - confirm */ +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x29 + count .req w27 + cur_const .req x26 + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round. 
*/ + Aba .req x1 + Abe .req x6 + Abi .req x11 + Abo .req x16 + Abu .req x21 + Aga .req x2 + Age .req x7 + Agi .req x12 + Ago .req x17 + Agu .req x22 + Aka .req x3 + Ake .req x8 + Aki .req x13 + Ako .req x18 + Aku .req x23 + Ama .req x4 + Ame .req x9 + Ami .req x14 + Amo .req x19 + Amu .req x24 + Asa .req x5 + Ase .req x10 + Asi .req x15 + Aso .req x20 + Asu .req x25 +/* NOTE(review): Ako and Ako_ live in x18, which AAPCS64 designates the platform register and some OS ABIs reserve - confirm it is usable on every target */ + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + Aba_ .req x30 + Abe_ .req x28 + Abi_ .req x11 + Abo_ .req x16 + Abu_ .req x21 + Aga_ .req x3 + Age_ .req x8 + Agi_ .req x12 + Ago_ .req x17 + Agu_ .req x22 + Aka_ .req x4 + Ake_ .req x9 + Aki_ .req x13 + Ako_ .req x18 + Aku_ .req x23 + Ama_ .req x5 + Ame_ .req x10 + Ami_ .req x14 + Amo_ .req x19 + Amu_ .req x24 + Asa_ .req x1 + Ase_ .req x6 + Asi_ .req x15 + Aso_ .req x20 + Asu_ .req x25 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + C0 .req x30 + E0 .req x29 + C1 .req x26 + E1 .req x0 + C2 .req x27 + E2 .req x26 + C3 .req x28 + E3 .req x27 + C4 .req x29 + E4 .req x28 +/* Scratch aliasing: C2/E3 share x27 with count (w27), C1/E2/tmp1 share x26 with cur_const, C4/E0 share x29 with const_addr, E1/tmp/tmp0 share x0 with input_addr, C0/Aba_ share x30 */ + tmp .req x0 + tmp0 .req x0 + tmp1 .req x26 + +/************************ MACROS ****************************/ +/* Stack frame, as offsets from sp: 0 input pointer, 8 constant pointer, 16 round counter, 24 padding, 32 and up x19-x30 */ +#define STACK_SIZE (16*6 + 3*8 + 8) // GPRs (16*6), count (8), const (8), input (8), padding (8) +#define STACK_BASE_GPRS (3*8+8) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro save reg, offset + str \reg, [sp, #\offset] +.endm + +.macro restore reg, offset + ldr \reg, [sp, #\offset] +.endm +/* save_gprs and restore_gprs spill and reload x19-x30 as six register pairs starting at STACK_BASE_GPRS */ +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro 
restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm +/* Round 0, with the loads of the 25 lanes interleaved into the parity computation; input_addr (x0) is saved before E1, which also lives in x0, is first written */ +.macro keccak_f1600_round_initial + ldp Aku, Ama, [input_addr, #(1*8*14)] + ldp Asa, Ase, [input_addr, #(1*8*20)] + eor C0, Ama, Asa + ldp Ame, Ami, [input_addr, #(1*8*16)] + eor C1, Ame, Ase + ldp Asi, Aso, [input_addr, #(1*8*22)] + eor C2, Ami, Asi + ldp Amo, Amu, [input_addr, #(1*8*18)] + eor C3, Amo, Aso + ldr Asu, [input_addr, #(1*8*24)] + eor C4, Amu, Asu + ldp Aka, Ake, [input_addr, #(1*8*10)] + eor C0, Aka, C0 + eor C1, Ake, C1 + ldp Aki, Ako, [input_addr, #(1*8*12)] + eor C2, Aki, C2 + ldp Abu, Aga, [input_addr, #(1*8*4)] + eor C3, Ako, C3 + eor C4, Aku, C4 + ldp Age, Agi, [input_addr, #(1*8*6)] + eor C0, Aga, C0 + ldp Ago, Agu, [input_addr, #(1*8*8)] + eor C1, Age, C1 + ldp Aba, Abe, [input_addr, #(1*8*0)] + eor C2, Agi, C2 + ldp Abi, Abo, [input_addr, #(1*8*2)] + eor C3, Ago, C3 + save input_addr, STACK_OFFSET_INPUT + eor C4, Agu, C4 + eor C0, Aba, C0 + eor C1, Abe, C1 + eor C2, Abi, C2 + eor C3, Abo, C3 + eor C4, Abu, C4 + + eor E1, C0, C2, ROR #63 + eor E3, C2, C4, ROR #63 + eor E0, C4, C1, ROR #63 + eor E2, C1, C3, ROR #63 + eor E4, C3, C0, ROR #63 + + eor Aba_, Aba, E0 + eor Asa_, Abi, E2 + eor Abi_, Aki, E2 + eor Aki_, Ako, E3 + eor Ako_, Amu, E4 + eor Amu_, Aso, E3 + eor Aso_, Ama, E0 + eor Aka_, Abe, E1 + eor Ase_, Ago, E3 + eor Ago_, Ame, E1 + eor Ake_, Agi, E2 + eor Agi_, Aka, E0 + eor Aga_, Abo, E3 + eor Abo_, Amo, E3 + eor Amo_, Ami, E2 + eor Ami_, Ake, E1 + eor Age_, Agu, E4 + eor Agu_, Asi, E2 + eor Asi_, Aku, E4 + eor Aku_, Asa, E0 + eor Ama_, Abu, E4 + eor Abu_, Asu, E4 + eor Asu_, Ase, E1 + eor Ame_, Aga, E0 + eor Abe_, Age, E1 + + load_constant_ptr +/* bic/eor row step: the rotation amounts are applied through the ROR-shifted operands, this macro contains no standalone ror */ + bic tmp, Agi_, Age_, ROR #47 + eor Aga, tmp, Aga_, ROR #39 + bic tmp, Ago_, Agi_, 
ROR #42 + eor Age, tmp, Age_, ROR #25 + bic tmp, Agu_, Ago_, ROR #16 + eor Agi, tmp, Agi_, ROR #58 + bic tmp, Aga_, Agu_, ROR #31 + eor Ago, tmp, Ago_, ROR #47 + bic tmp, Age_, Aga_, ROR #56 + eor Agu, tmp, Agu_, ROR #23 + bic tmp, Aki_, Ake_, ROR #19 + eor Aka, tmp, Aka_, ROR #24 + bic tmp, Ako_, Aki_, ROR #47 + eor Ake, tmp, Ake_, ROR #2 + bic tmp, Aku_, Ako_, ROR #10 + eor Aki, tmp, Aki_, ROR #57 + bic tmp, Aka_, Aku_, ROR #47 + eor Ako, tmp, Ako_, ROR #57 + bic tmp, Ake_, Aka_, ROR #5 + eor Aku, tmp, Aku_, ROR #52 + bic tmp, Ami_, Ame_, ROR #38 + eor Ama, tmp, Ama_, ROR #47 + bic tmp, Amo_, Ami_, ROR #5 + eor Ame, tmp, Ame_, ROR #43 + bic tmp, Amu_, Amo_, ROR #41 + eor Ami, tmp, Ami_, ROR #46 +/* Load the first round constant and set count to 1, the index used by the next round */ + ldr cur_const, [const_addr] + mov count, #1 + + bic tmp, Ama_, Amu_, ROR #35 + eor Amo, tmp, Amo_, ROR #12 + bic tmp, Ame_, Ama_, ROR #9 + eor Amu, tmp, Amu_, ROR #44 + bic tmp, Asi_, Ase_, ROR #48 + eor Asa, tmp, Asa_, ROR #41 + bic tmp, Aso_, Asi_, ROR #2 + eor Ase, tmp, Ase_, ROR #50 + bic tmp, Asu_, Aso_, ROR #25 + eor Asi, tmp, Asi_, ROR #27 + bic tmp, Asa_, Asu_, ROR #60 + eor Aso, tmp, Aso_, ROR #21 + bic tmp, Ase_, Asa_, ROR #57 + eor Asu, tmp, Asu_, ROR #53 + bic tmp, Abi_, Abe_, ROR #63 + eor Aba, Aba_, tmp, ROR #21 + bic tmp, Abo_, Abi_, ROR #42 + eor Abe, tmp, Abe_, ROR #41 + bic tmp, Abu_, Abo_, ROR #57 + eor Abi, tmp, Abi_, ROR #35 + bic tmp, Aba_, Abu_, ROR #50 + eor Abo, tmp, Abo_, ROR #43 + bic tmp, Abe_, Aba_, ROR #44 + eor Abu, tmp, Abu_, ROR #30 + + eor Aba, Aba, cur_const + +.endm + +/* Rounds after the first: same sequence of steps, but the column parities and lane XORs also take ROR-shifted operands */ +.macro keccak_f1600_round_noninitial +/* count (w27) is overwritten below by C2 and E3 (x27), so it is kept on the stack meanwhile */ + save count, STACK_OFFSET_COUNT + + eor C0, Aka, Asa, ROR #50 + eor C1, Ase, Age, ROR #60 + eor C2, Ami, Agi, ROR #59 + eor C3, Ago, Aso, ROR #30 + eor C4, Abu, Asu, ROR #53 + eor C0, Ama, C0, ROR #49 + eor C1, Abe, C1, ROR #44 + eor C2, Aki, C2, ROR #26 + eor C3, Amo, C3, ROR #63 + eor C4, Amu, C4, ROR #56 + eor C0, Aga, C0, ROR #57 + eor C1, Ame, C1, ROR #58 + eor C2, Abi, C2, ROR #60 + eor C3, Ako, C3, ROR #38 + eor C4, Agu, 
C4, ROR #48 + eor C0, Aba, C0, ROR #61 + eor C1, Ake, C1, ROR #57 + eor C2, Asi, C2, ROR #52 + eor C3, Abo, C3, ROR #63 + eor C4, Aku, C4, ROR #50 + ror C1, C1, 56 + ror C4, C4, 58 + ror C2, C2, 62 + + eor E1, C0, C2, ROR #63 + eor E3, C2, C4, ROR #63 + eor E0, C4, C1, ROR #63 + eor E2, C1, C3, ROR #63 + eor E4, C3, C0, ROR #63 + + eor Aba_, E0, Aba + eor Asa_, E2, Abi, ROR #50 + eor Abi_, E2, Aki, ROR #46 + eor Aki_, E3, Ako, ROR #63 + eor Ako_, E4, Amu, ROR #28 + eor Amu_, E3, Aso, ROR #2 + eor Aso_, E0, Ama, ROR #54 + eor Aka_, E1, Abe, ROR #43 + eor Ase_, E3, Ago, ROR #36 + eor Ago_, E1, Ame, ROR #49 + eor Ake_, E2, Agi, ROR #3 + eor Agi_, E0, Aka, ROR #39 + eor Aga_, E3, Abo + eor Abo_, E3, Amo, ROR #37 + eor Amo_, E2, Ami, ROR #8 + eor Ami_, E1, Ake, ROR #56 + eor Age_, E4, Agu, ROR #44 + eor Agu_, E2, Asi, ROR #62 + eor Asi_, E4, Aku, ROR #58 + eor Aku_, E0, Asa, ROR #25 + eor Ama_, E4, Abu, ROR #20 + eor Abu_, E4, Asu, ROR #9 + eor Asu_, E1, Ase, ROR #23 + eor Ame_, E0, Aga, ROR #61 + eor Abe_, E1, Age, ROR #19 + + load_constant_ptr + restore count, STACK_OFFSET_COUNT + + bic tmp0, Agi_, Age_, ROR #47 + bic tmp1, Ago_, Agi_, ROR #42 + eor Aga, tmp0, Aga_, ROR #39 + bic tmp0, Agu_, Ago_, ROR #16 + eor Age, tmp1, Age_, ROR #25 + bic tmp1, Aga_, Agu_, ROR #31 + eor Agi, tmp0, Agi_, ROR #58 + bic tmp0, Age_, Aga_, ROR #56 + eor Ago, tmp1, Ago_, ROR #47 + bic tmp1, Aki_, Ake_, ROR #19 + eor Agu, tmp0, Agu_, ROR #23 + bic tmp0, Ako_, Aki_, ROR #47 + eor Aka, tmp1, Aka_, ROR #24 + bic tmp1, Aku_, Ako_, ROR #10 + eor Ake, tmp0, Ake_, ROR #2 + bic tmp0, Aka_, Aku_, ROR #47 + eor Aki, tmp1, Aki_, ROR #57 + bic tmp1, Ake_, Aka_, ROR #5 + eor Ako, tmp0, Ako_, ROR #57 + bic tmp0, Ami_, Ame_, ROR #38 + eor Aku, tmp1, Aku_, ROR #52 + bic tmp1, Amo_, Ami_, ROR #5 + eor Ama, tmp0, Ama_, ROR #47 + bic tmp0, Amu_, Amo_, ROR #41 + eor Ame, tmp1, Ame_, ROR #43 + bic tmp1, Ama_, Amu_, ROR #35 + eor Ami, tmp0, Ami_, ROR #46 + bic tmp0, Ame_, Ama_, ROR #9 + eor Amo, tmp1, Amo_, 
ROR #12 + bic tmp1, Asi_, Ase_, ROR #48 + eor Amu, tmp0, Amu_, ROR #44 + bic tmp0, Aso_, Asi_, ROR #2 + eor Asa, tmp1, Asa_, ROR #41 + bic tmp1, Asu_, Aso_, ROR #25 + eor Ase, tmp0, Ase_, ROR #50 + bic tmp0, Asa_, Asu_, ROR #60 + eor Asi, tmp1, Asi_, ROR #27 + bic tmp1, Ase_, Asa_, ROR #57 + eor Aso, tmp0, Aso_, ROR #21 + bic tmp0, Abi_, Abe_, ROR #63 + eor Asu, tmp1, Asu_, ROR #53 + bic tmp1, Abo_, Abi_, ROR #42 + eor Aba, Aba_, tmp0, ROR #21 + bic tmp0, Abu_, Abo_, ROR #57 + eor Abe, tmp1, Abe_, ROR #41 + bic tmp1, Aba_, Abu_, ROR #50 + eor Abi, tmp0, Abi_, ROR #35 + bic tmp0, Abe_, Aba_, ROR #44 + eor Abo, tmp1, Abo_, ROR #43 + eor Abu, tmp0, Abu_, ROR #30 + + ldr cur_const, [const_addr, count, UXTW #3] + add count, count, #1 + + eor Aba, Aba, cur_const + +.endm + +.macro final_rotate_store + ror Aga, Aga,#(64-3) + restore input_addr, STACK_OFFSET_INPUT + ror Abu, Abu,#(64-44) + ror Aka, Aka,#(64-25) + ror Ake, Ake,#(64-8) + stp Abu, Aga, [input_addr, #(1*8*4)] + ror Ama, Ama,#(64-10) + ror Aku, Aku,#(64-6) + stp Aka, Ake, [input_addr, #(1*8*10)] + ror Asa, Asa,#(64-39) + ror Ase, Ase,#(64-41) + stp Aku, Ama, [input_addr, #(1*8*14)] + ror Abe, Abe,#(64-21) + ror Age, Age,#(64-45) + stp Asa, Ase, [input_addr, #(1*8*20)] + ror Agi, Agi,#(64-61) + stp Aba, Abe, [input_addr, #(1*8*0)] + ror Ame, Ame,#(64-15) + ror Ami, Ami,#(64-56) + stp Age, Agi, [input_addr, #(1*8*6)] + ror Abi, Abi,#(64-14) + ror Aki, Aki,#(64-18) + stp Ame, Ami, [input_addr, #(1*8*16)] + ror Ako, Ako,#(64-1) + stp Abi, Abo, [input_addr, #(1*8*2)] + ror Asi, Asi,#(64-2) + ror Aso, Aso,#(64-62) + stp Aki, Ako, [input_addr, #(1*8*12)] + ror Ago, Ago,#(64-28) + ror Agu, Agu,#(64-20) + stp Asi, Aso, [input_addr, #(1*8*22)] + ror Amo, Amo,#(64-27) + ror Amu, Amu,#(64-36) + stp Ago, Agu, [input_addr, #(1*8*8)] + ror Asu, Asu,#(64-55) + stp Amo, Amu, [input_addr, #(1*8*18)] + str Asu, [input_addr, #(1*8*24)] +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.text +.balign 16 +.global 
keccak_f1600_x1_scalar_asm_v4 +.global _keccak_f1600_x1_scalar_asm_v4 + +keccak_f1600_x1_scalar_asm_v4: +_keccak_f1600_x1_scalar_asm_v4: + alloc_stack + save_gprs + + keccak_f1600_round_initial +loop: + keccak_f1600_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-1) + ble loop + + final_rotate_store + restore_gprs + free_stack + ret diff --git a/tests/keccak_neon/manual/keccak_f1600_x1_scalar_asm_v5.s b/tests/keccak_neon/manual/keccak_f1600_x1_scalar_asm_v5.s new file mode 100644 index 0000000..19f1cc2 --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x1_scalar_asm_v5.s @@ -0,0 +1,506 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .balign 64 +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x26 + cur_const .req x26 + count .req w27 + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round. 
*/ + Aba .req x1 + Abe .req x6 + Abi .req x11 + Abo .req x16 + Abu .req x21 + Aga .req x2 + Age .req x7 + Agi .req x12 + Ago .req x17 + Agu .req x22 + Aka .req x3 + Ake .req x8 + Aki .req x13 + Ako .req x18 + Aku .req x23 + Ama .req x4 + Ame .req x9 + Ami .req x14 + Amo .req x19 + Amu .req x24 + Asa .req x5 + Ase .req x10 + Asi .req x15 + Aso .req x20 + Asu .req x25 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + Aba_ .req x30 + Abe_ .req x28 + Abi_ .req x11 + Abo_ .req x16 + Abu_ .req x21 + Aga_ .req x3 + Age_ .req x8 + Agi_ .req x12 + Ago_ .req x17 + Agu_ .req x22 + Aka_ .req x4 + Ake_ .req x9 + Aki_ .req x13 + Ako_ .req x18 + Aku_ .req x23 + Ama_ .req x5 + Ame_ .req x10 + Ami_ .req x14 + Amo_ .req x19 + Amu_ .req x24 + Asa_ .req x1 + Ase_ .req x6 + Asi_ .req x15 + Aso_ .req x20 + Asu_ .req x25 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + C0 .req x30 + E0 .req x29 + C1 .req x26 + E1 .req x0 + C2 .req x27 + E2 .req x26 + C3 .req x28 + E3 .req x27 + C4 .req x29 + E4 .req x28 + + tmp .req x0 + +/************************ MACROS ****************************/ + +#define STACK_SIZE (16*6 + 3*8 + 8) // GPRs (16*6), count (8), const (8), input (8), padding (8) +#define STACK_BASE_GPRS (3*8+8) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro save reg, offset + str \reg, [sp, #\offset] +.endm + +.macro restore reg, offset + ldr \reg, [sp, #\offset] +.endm + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, 
#(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro keccak_f1600_round_initial + ldp Aku, Ama, [input_addr, #(1*8*14)] + ldp Asa, Ase, [input_addr, #(1*8*20)] + eor C0, Ama, Asa + ldp Ame, Ami, [input_addr, #(1*8*16)] + eor C1, Ame, Ase + ldp Asi, Aso, [input_addr, #(1*8*22)] + eor C2, Ami, Asi + ldp Amo, Amu, [input_addr, #(1*8*18)] + eor C3, Amo, Aso + ldr Asu, [input_addr, #(1*8*24)] + eor C4, Amu, Asu + ldp Aka, Ake, [input_addr, #(1*8*10)] + eor C0, Aka, C0 + eor C1, Ake, C1 + ldp Aki, Ako, [input_addr, #(1*8*12)] + eor C2, Aki, C2 + ldp Abu, Aga, [input_addr, #(1*8*4)] + eor C3, Ako, C3 + eor C4, Aku, C4 + ldp Age, Agi, [input_addr, #(1*8*6)] + eor C0, Aga, C0 + ldp Ago, Agu, [input_addr, #(1*8*8)] + eor C1, Age, C1 + ldp Aba, Abe, [input_addr, #(1*8*0)] + eor C2, Agi, C2 + ldp Abi, Abo, [input_addr, #(1*8*2)] + eor C3, Ago, C3 + save input_addr, STACK_OFFSET_INPUT + eor C4, Agu, C4 + eor C0, Aba, C0 + eor C1, Abe, C1 + eor C2, Abi, C2 + eor C3, Abo, C3 + eor C4, Abu, C4 + + eor E1, C0, C2, ROR #63 + eor E3, C2, C4, ROR #63 + eor E0, C4, C1, ROR #63 + eor E2, C1, C3, ROR #63 + eor E4, C3, C0, ROR #63 + + eor Aba_, Aba, E0 + eor Asa_, Abi, E2 + eor Abi_, Aki, E2 + eor Aki_, Ako, E3 + eor Ako_, Amu, E4 + eor Amu_, Aso, E3 + eor Aso_, Ama, E0 + eor Aka_, Abe, E1 + eor Ase_, Ago, E3 + eor Ago_, Ame, E1 + eor Ake_, Agi, E2 + eor Agi_, Aka, E0 + eor Aga_, Abo, E3 + eor Abo_, Amo, E3 + eor Amo_, Ami, E2 + eor Ami_, Ake, E1 + eor Age_, Agu, E4 + eor Agu_, Asi, E2 + eor Asi_, Aku, E4 + eor Aku_, Asa, E0 + eor Ama_, Abu, E4 + eor Abu_, Asu, E4 + eor Asu_, Ase, E1 + eor Ame_, Aga, E0 + eor Abe_, Age, E1 + + load_constant_ptr + + tmp0 .req x0 + tmp1 .req x29 + + bic tmp0, Agi_, Age_, ROR #47 + bic tmp1, Ago_, Agi_, ROR #42 + eor Aga, tmp0, 
Aga_, ROR #39 + bic tmp0, Agu_, Ago_, ROR #16 + eor Age, tmp1, Age_, ROR #25 + bic tmp1, Aga_, Agu_, ROR #31 + eor Agi, tmp0, Agi_, ROR #58 + bic tmp0, Age_, Aga_, ROR #56 + eor Ago, tmp1, Ago_, ROR #47 + bic tmp1, Aki_, Ake_, ROR #19 + eor Agu, tmp0, Agu_, ROR #23 + bic tmp0, Ako_, Aki_, ROR #47 + eor Aka, tmp1, Aka_, ROR #24 + bic tmp1, Aku_, Ako_, ROR #10 + eor Ake, tmp0, Ake_, ROR #2 + bic tmp0, Aka_, Aku_, ROR #47 + eor Aki, tmp1, Aki_, ROR #57 + bic tmp1, Ake_, Aka_, ROR #5 + eor Ako, tmp0, Ako_, ROR #57 + bic tmp0, Ami_, Ame_, ROR #38 + eor Aku, tmp1, Aku_, ROR #52 + bic tmp1, Amo_, Ami_, ROR #5 + eor Ama, tmp0, Ama_, ROR #47 + bic tmp0, Amu_, Amo_, ROR #41 + eor Ame, tmp1, Ame_, ROR #43 + bic tmp1, Ama_, Amu_, ROR #35 + eor Ami, tmp0, Ami_, ROR #46 + bic tmp0, Ame_, Ama_, ROR #9 + + str const_addr, [sp, #(STACK_OFFSET_CONST)] + ldr cur_const, [const_addr] + + eor Amo, tmp1, Amo_, ROR #12 + bic tmp1, Asi_, Ase_, ROR #48 + eor Amu, tmp0, Amu_, ROR #44 + bic tmp0, Aso_, Asi_, ROR #2 + eor Asa, tmp1, Asa_, ROR #41 + bic tmp1, Asu_, Aso_, ROR #25 + eor Ase, tmp0, Ase_, ROR #50 + bic tmp0, Asa_, Asu_, ROR #60 + eor Asi, tmp1, Asi_, ROR #27 + bic tmp1, Ase_, Asa_, ROR #57 + eor Aso, tmp0, Aso_, ROR #21 + + mov count, #1 + + bic tmp0, Abi_, Abe_, ROR #63 + eor Asu, tmp1, Asu_, ROR #53 + bic tmp1, Abo_, Abi_, ROR #42 + eor Aba, Aba_, tmp0, ROR #21 + bic tmp0, Abu_, Abo_, ROR #57 + eor Abe, tmp1, Abe_, ROR #41 + bic tmp1, Aba_, Abu_, ROR #50 + eor Abi, tmp0, Abi_, ROR #35 + bic tmp0, Abe_, Aba_, ROR #44 + eor Abo, tmp1, Abo_, ROR #43 + eor Abu, tmp0, Abu_, ROR #30 + + eor Aba, Aba, cur_const + save count, STACK_OFFSET_COUNT + +.endm + + +.macro keccak_f1600_round_noninitial + + eor C2, Asi, Abi, ROR #52 + eor C0, Aba, Aga, ROR #61 + eor C4, Aku, Agu, ROR #50 + eor C1, Ake, Ame, ROR #57 + eor C3, Abo, Ako, ROR #63 + eor C2, C2, Aki, ROR #48 + eor C0, C0, Ama, ROR #54 + eor C4, C4, Amu, ROR #34 + eor C1, C1, Abe, ROR #51 + eor C3, C3, Amo, ROR #37 + eor C2, C2, Ami, 
ROR #10 + eor C0, C0, Aka, ROR #39 + eor C4, C4, Abu, ROR #26 + eor C1, C1, Ase, ROR #31 + eor C3, C3, Ago, ROR #36 + eor C2, C2, Agi, ROR #5 + eor C0, C0, Asa, ROR #25 + eor C4, C4, Asu, ROR #15 + eor C1, C1, Age, ROR #27 + eor C3, C3, Aso, ROR #2 + + eor E1, C0, C2, ROR #61 + ror C2, C2, 62 + eor E3, C2, C4, ROR #57 + ror C4, C4, 58 + eor E0, C4, C1, ROR #55 + ror C1, C1, 56 + eor E2, C1, C3, ROR #63 + eor E4, C3, C0, ROR #63 + + eor Aba_, E0, Aba + eor Asa_, E2, Abi, ROR #50 + eor Abi_, E2, Aki, ROR #46 + eor Aki_, E3, Ako, ROR #63 + eor Ako_, E4, Amu, ROR #28 + eor Amu_, E3, Aso, ROR #2 + eor Aso_, E0, Ama, ROR #54 + eor Aka_, E1, Abe, ROR #43 + eor Ase_, E3, Ago, ROR #36 + eor Ago_, E1, Ame, ROR #49 + eor Ake_, E2, Agi, ROR #3 + eor Agi_, E0, Aka, ROR #39 + eor Aga_, E3, Abo + eor Abo_, E3, Amo, ROR #37 + eor Amo_, E2, Ami, ROR #8 + eor Ami_, E1, Ake, ROR #56 + eor Age_, E4, Agu, ROR #44 + eor Agu_, E2, Asi, ROR #62 + eor Asi_, E4, Aku, ROR #58 + eor Aku_, E0, Asa, ROR #25 + eor Ama_, E4, Abu, ROR #20 + eor Abu_, E4, Asu, ROR #9 + eor Asu_, E1, Ase, ROR #23 + eor Ame_, E0, Aga, ROR #61 + eor Abe_, E1, Age, ROR #19 + + load_constant_ptr_stack + restore count, STACK_OFFSET_COUNT + + tmp0 .req x0 + tmp1 .req x29 + + bic tmp0, Agi_, Age_, ROR #47 + bic tmp1, Ago_, Agi_, ROR #42 + eor Aga, tmp0, Aga_, ROR #39 + bic tmp0, Agu_, Ago_, ROR #16 + eor Age, tmp1, Age_, ROR #25 + bic tmp1, Aga_, Agu_, ROR #31 + eor Agi, tmp0, Agi_, ROR #58 + bic tmp0, Age_, Aga_, ROR #56 + eor Ago, tmp1, Ago_, ROR #47 + bic tmp1, Aki_, Ake_, ROR #19 + eor Agu, tmp0, Agu_, ROR #23 + bic tmp0, Ako_, Aki_, ROR #47 + eor Aka, tmp1, Aka_, ROR #24 + bic tmp1, Aku_, Ako_, ROR #10 + eor Ake, tmp0, Ake_, ROR #2 + bic tmp0, Aka_, Aku_, ROR #47 + eor Aki, tmp1, Aki_, ROR #57 + bic tmp1, Ake_, Aka_, ROR #5 + eor Ako, tmp0, Ako_, ROR #57 + bic tmp0, Ami_, Ame_, ROR #38 + eor Aku, tmp1, Aku_, ROR #52 + bic tmp1, Amo_, Ami_, ROR #5 + eor Ama, tmp0, Ama_, ROR #47 + bic tmp0, Amu_, Amo_, ROR #41 + eor 
Ame, tmp1, Ame_, ROR #43 + bic tmp1, Ama_, Amu_, ROR #35 + eor Ami, tmp0, Ami_, ROR #46 + bic tmp0, Ame_, Ama_, ROR #9 + + ldr cur_const, [const_addr, count, UXTW #3] + + eor Amo, tmp1, Amo_, ROR #12 + bic tmp1, Asi_, Ase_, ROR #48 + eor Amu, tmp0, Amu_, ROR #44 + bic tmp0, Aso_, Asi_, ROR #2 + eor Asa, tmp1, Asa_, ROR #41 + bic tmp1, Asu_, Aso_, ROR #25 + eor Ase, tmp0, Ase_, ROR #50 + bic tmp0, Asa_, Asu_, ROR #60 + eor Asi, tmp1, Asi_, ROR #27 + bic tmp1, Ase_, Asa_, ROR #57 + eor Aso, tmp0, Aso_, ROR #21 + bic tmp0, Abi_, Abe_, ROR #63 + add count, count, #1 + save count, STACK_OFFSET_COUNT + eor Asu, tmp1, Asu_, ROR #53 + bic tmp1, Abo_, Abi_, ROR #42 + eor Aba, Aba_, tmp0, ROR #21 + bic tmp0, Abu_, Abo_, ROR #57 + eor Abe, tmp1, Abe_, ROR #41 + bic tmp1, Aba_, Abu_, ROR #50 + eor Abi, tmp0, Abi_, ROR #35 + bic tmp0, Abe_, Aba_, ROR #44 + eor Abo, tmp1, Abo_, ROR #43 + eor Abu, tmp0, Abu_, ROR #30 + + eor Aba, Aba, cur_const + +.endm + +.macro final_rotate_store + ror Aga, Aga,#(64-3) + restore input_addr, STACK_OFFSET_INPUT + ror Abu, Abu,#(64-44) + ror Aka, Aka,#(64-25) + ror Ake, Ake,#(64-8) + stp Abu, Aga, [input_addr, #(1*8*4)] + ror Ama, Ama,#(64-10) + ror Aku, Aku,#(64-6) + stp Aka, Ake, [input_addr, #(1*8*10)] + ror Asa, Asa,#(64-39) + ror Ase, Ase,#(64-41) + stp Aku, Ama, [input_addr, #(1*8*14)] + ror Abe, Abe,#(64-21) + ror Age, Age,#(64-45) + stp Asa, Ase, [input_addr, #(1*8*20)] + ror Agi, Agi,#(64-61) + stp Aba, Abe, [input_addr, #(1*8*0)] + ror Ame, Ame,#(64-15) + ror Ami, Ami,#(64-56) + stp Age, Agi, [input_addr, #(1*8*6)] + ror Abi, Abi,#(64-14) + ror Aki, Aki,#(64-18) + stp Ame, Ami, [input_addr, #(1*8*16)] + ror Ako, Ako,#(64-1) + stp Abi, Abo, [input_addr, #(1*8*2)] + ror Asi, Asi,#(64-2) + ror Aso, Aso,#(64-62) + stp Aki, Ako, [input_addr, #(1*8*12)] + ror Ago, Ago,#(64-28) + ror Agu, Agu,#(64-20) + stp Asi, Aso, [input_addr, #(1*8*22)] + ror Amo, Amo,#(64-27) + ror Amu, Amu,#(64-36) + stp Ago, Agu, [input_addr, #(1*8*8)] + ror Asu, 
Asu,#(64-55) + stp Amo, Amu, [input_addr, #(1*8*18)] + str Asu, [input_addr, #(1*8*24)] +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.text +.balign 16 +.global keccak_f1600_x1_scalar_asm_v5 +.global _keccak_f1600_x1_scalar_asm_v5 + +.macro load_constant_ptr_stack + ldr const_addr, [sp, #(STACK_OFFSET_CONST)] +.endm +keccak_f1600_x1_scalar_asm_v5: +_keccak_f1600_x1_scalar_asm_v5: + alloc_stack + save_gprs + + keccak_f1600_round_initial +loop: + keccak_f1600_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-1) + ble loop + + final_rotate_store + restore_gprs + free_stack + ret diff --git a/tests/keccak_neon/manual/keccak_f1600_x2_hybrid_asm_v1.s b/tests/keccak_neon/manual/keccak_f1600_x2_hybrid_asm_v1.s new file mode 100644 index 0000000..4073530 --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x2_hybrid_asm_v1.s @@ -0,0 +1,417 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +#if defined(__ARM_FEATURE_SHA3) + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. 
*/ + Aba .req v0 + Abe .req v1 + Abi .req v2 + Abo .req v3 + Abu .req v4 + Aga .req v5 + Age .req v6 + Agi .req v7 + Ago .req v8 + Agu .req v9 + Aka .req v10 + Ake .req v11 + Aki .req v12 + Ako .req v13 + Aku .req v14 + Ama .req v15 + Ame .req v16 + Ami .req v17 + Amo .req v18 + Amu .req v19 + Asa .req v20 + Ase .req v21 + Asi .req v22 + Aso .req v23 + Asu .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Abiq .req q2 + Aboq .req q3 + Abuq .req q4 + Agaq .req q5 + Ageq .req q6 + Agiq .req q7 + Agoq .req q8 + Aguq .req q9 + Akaq .req q10 + Akeq .req q11 + Akiq .req q12 + Akoq .req q13 + Akuq .req q14 + Amaq .req q15 + Ameq .req q16 + Amiq .req q17 + Amoq .req q18 + Amuq .req q19 + Asaq .req q20 + Aseq .req q21 + Asiq .req q22 + Asoq .req q23 + Asuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v30 + C1 .req v29 + C2 .req v28 + C3 .req v27 + C4 .req v26 + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req v26 + E1 .req v25 + E2 .req v29 + E3 .req v28 + E4 .req v27 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + Abi_ .req v2 + Abo_ .req v3 + Abu_ .req v4 + Aga_ .req v10 + Age_ .req v11 + Agi_ .req v7 + Ago_ .req v8 + Agu_ .req v9 + Aka_ .req v15 + Ake_ .req v16 + Aki_ .req v12 + Ako_ .req v13 + Aku_ .req v14 + Ama_ .req v20 + Ame_ .req v21 + Ami_ .req v17 + Amo_ .req v18 + Amu_ .req v19 + Asa_ .req v0 + Ase_ .req v1 + Asi_ .req v22 + Aso_ .req v23 + Asu_ .req v24 + Aba_ .req v30 + Abe_ .req v27 + + + vtmp .req v31 +/************************ MACROS ****************************/ + +.macro eor3_m1 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b + eor \d\().16b, \d\().16b, \s2\().16b +.endm +.macro rax1_m1 d s0 s1 + add vtmp.2d, \s1\().2d, \s1\().2d + sri vtmp.2d, \s1\().2d, #63 + eor \d\().16b, vtmp.16b, \s0\().16b +.endm +.macro xar_m1 d s0 s1 imm + eor vtmp.16b, \s0\().16b, \s1\().16b + shl \d\().2d, vtmp.2d, #(64-\imm) + sri \d\().2d, vtmp.2d, #(\imm) +.endm + +.macro bcax_m1 
d s0 s1 s2 + bic vtmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, vtmp.16b, \s0\().16b +.endm + + +.macro load_input + ldr Abaq, [input_addr, #(2*8*0)] + ldr Abeq, [input_addr, #(2*8*1)] + ldr Abiq, [input_addr, #(2*8*2)] + ldr Aboq, [input_addr, #(2*8*3)] + ldr Abuq, [input_addr, #(2*8*4)] + ldr Agaq, [input_addr, #(2*8*5)] + ldr Ageq, [input_addr, #(2*8*6)] + ldr Agiq, [input_addr, #(2*8*7)] + ldr Agoq, [input_addr, #(2*8*8)] + ldr Aguq, [input_addr, #(2*8*9)] + ldr Akaq, [input_addr, #(2*8*10)] + ldr Akeq, [input_addr, #(2*8*11)] + ldr Akiq, [input_addr, #(2*8*12)] + ldr Akoq, [input_addr, #(2*8*13)] + ldr Akuq, [input_addr, #(2*8*14)] + ldr Amaq, [input_addr, #(2*8*15)] + ldr Ameq, [input_addr, #(2*8*16)] + ldr Amiq, [input_addr, #(2*8*17)] + ldr Amoq, [input_addr, #(2*8*18)] + ldr Amuq, [input_addr, #(2*8*19)] + ldr Asaq, [input_addr, #(2*8*20)] + ldr Aseq, [input_addr, #(2*8*21)] + ldr Asiq, [input_addr, #(2*8*22)] + ldr Asoq, [input_addr, #(2*8*23)] + ldr Asuq, [input_addr, #(2*8*24)] +.endm + +.macro store_input + str Abaq, [input_addr, #(2*8*0)] + str Abeq, [input_addr, #(2*8*1)] + str Abiq, [input_addr, #(2*8*2)] + str Aboq, [input_addr, #(2*8*3)] + str Abuq, [input_addr, #(2*8*4)] + str Agaq, [input_addr, #(2*8*5)] + str Ageq, [input_addr, #(2*8*6)] + str Agiq, [input_addr, #(2*8*7)] + str Agoq, [input_addr, #(2*8*8)] + str Aguq, [input_addr, #(2*8*9)] + str Akaq, [input_addr, #(2*8*10)] + str Akeq, [input_addr, #(2*8*11)] + str Akiq, [input_addr, #(2*8*12)] + str Akoq, [input_addr, #(2*8*13)] + str Akuq, [input_addr, #(2*8*14)] + str Amaq, [input_addr, #(2*8*15)] + str Ameq, [input_addr, #(2*8*16)] + str Amiq, [input_addr, #(2*8*17)] + str Amoq, [input_addr, #(2*8*18)] + str Amuq, [input_addr, #(2*8*19)] + str Asaq, [input_addr, #(2*8*20)] + str Aseq, [input_addr, #(2*8*21)] + str Asiq, [input_addr, #(2*8*22)] + str Asoq, [input_addr, #(2*8*23)] + str Asuq, [input_addr, #(2*8*24)] +.endm + +#define STACK_SIZE (16*4 + 16*6) // VREGS (16*4) + GPRS (TODO: 
Remove) + +#define STACK_BASE_GPRS (16*4) +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) + .endm + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro save_vregs + stp d8, d9, [sp, #(16*0)] + stp d10, d11, [sp, #(16*1)] + stp d12, d13, [sp, #(16*2)] + stp d14, d15, [sp, #(16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(16*0)] + ldp d10, d11, [sp, #(16*1)] + ldp d12, d13, [sp, #(16*2)] + ldp d14, d15, [sp, #(16*3)] +.endm + +/* Macros using v8.4-A SHA-3 instructions */ + +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm + xar \d\().2d, \s0\().2d, \s1\().2d, #\imm +.endm + +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +/* Keccak-f1600 round */ + +.macro hybrid_round + + eor3_m1 C0, Aba, Aga, Aka + eor3_m0 C0, C0, Ama, Asa + eor3_m1 C1, Abe, Age, Ake + eor3_m0 C1, C1, Ame, Ase + eor3_m1 C2, Abi, Agi, Aki + eor3_m0 C2, C2, Ami, Asi + eor3_m1 C3, Abo, Ago, Ako + eor3_m0 C3, C3, Amo, Aso + eor3_m1 C4, Abu, Agu, Aku + eor3_m0 C4, C4, Amu, Asu + + rax1_m1 E1, C0, C2 + rax1_m0 E3, C2, C4 + rax1_m1 E0, C4, C1 + rax1_m0 E2, C1, C3 + rax1_m1 E4, C3, C0 + + eor Aba_.16b, Aba.16b, E0.16b + xar_m0 Asa_, Abi, E2, 2 + xar_m1 Abi_, Aki, E2, 21 + xar_m0 Aki_, 
Ako, E3, 39 + xar_m1 Ako_, Amu, E4, 56 + xar_m0 Amu_, Aso, E3, 8 + xar_m1 Aso_, Ama, E0, 23 + xar_m0 Aka_, Abe, E1, 63 + xar_m1 Ase_, Ago, E3, 9 + xar_m0 Ago_, Ame, E1, 19 + xar_m1 Ake_, Agi, E2, 58 + xar_m0 Agi_, Aka, E0, 61 + xar_m1 Aga_, Abo, E3, 36 + xar_m0 Abo_, Amo, E3, 43 + xar_m1 Amo_, Ami, E2, 49 + xar_m0 Ami_, Ake, E1, 54 + xar_m1 Age_, Agu, E4, 44 + xar_m0 Agu_, Asi, E2, 3 + xar_m1 Asi_, Aku, E4, 25 + xar_m0 Aku_, Asa, E0, 46 + xar_m1 Ama_, Abu, E4, 37 + xar_m0 Abu_, Asu, E4, 50 + xar_m1 Asu_, Ase, E1, 62 + xar_m0 Ame_, Aga, E0, 28 + xar_m1 Abe_, Age, E1, 20 + + ld1r {v28.2d}, [const_addr], #8 + + bcax_m0 Aga, Aga_, Agi_, Age_ + bcax_m1 Age, Age_, Ago_, Agi_ + bcax_m0 Agi, Agi_, Agu_, Ago_ + bcax_m1 Ago, Ago_, Aga_, Agu_ + bcax_m0 Agu, Agu_, Age_, Aga_ + bcax_m1 Aka, Aka_, Aki_, Ake_ + bcax_m0 Ake, Ake_, Ako_, Aki_ + bcax_m1 Aki, Aki_, Aku_, Ako_ + bcax_m0 Ako, Ako_, Aka_, Aku_ + bcax_m1 Aku, Aku_, Ake_, Aka_ + bcax_m0 Ama, Ama_, Ami_, Ame_ + bcax_m1 Ame, Ame_, Amo_, Ami_ + bcax_m0 Ami, Ami_, Amu_, Amo_ + bcax_m1 Amo, Amo_, Ama_, Amu_ + bcax_m0 Amu, Amu_, Ame_, Ama_ + bcax_m1 Asa, Asa_, Asi_, Ase_ + bcax_m0 Ase, Ase_, Aso_, Asi_ + bcax_m1 Asi, Asi_, Asu_, Aso_ + bcax_m0 Aso, Aso_, Asa_, Asu_ + bcax_m1 Asu, Asu_, Ase_, Asa_ + bcax_m0 Aba, Aba_, Abi_, Abe_ + bcax_m1 Abe, Abe_, Abo_, Abi_ + bcax_m0 Abi, Abi_, Abu_, Abo_ + bcax_m1 Abo, Abo_, Aba_, Abu_ + bcax_m0 Abu, Abu_, Abe_, Aba_ + + // iota step + eor Aba.16b, Aba.16b, v28.16b + +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.text +.align 4 +.global keccak_f1600_x2_hybrid_asm_v1 +.global _keccak_f1600_x2_hybrid_asm_v1 + +keccak_f1600_x2_hybrid_asm_v1: +_keccak_f1600_x2_hybrid_asm_v1: + alloc_stack + save_gprs + save_vregs + load_constant_ptr + load_input + + mov count, #(KECCAK_F1600_ROUNDS) + +loop: + hybrid_round + sub count, count, #1 + cbnz count, loop + + store_input + restore_vregs + restore_gprs + free_stack + ret + +#endif diff --git a/tests/keccak_neon/manual/keccak_f1600_x2_hybrid_asm_v2p0.s 
b/tests/keccak_neon/manual/keccak_f1600_x2_hybrid_asm_v2p0.s new file mode 100644 index 0000000..f6985c1 --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x2_hybrid_asm_v2p0.s @@ -0,0 +1,830 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + + +#if defined(__ARM_FEATURE_SHA3) +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. 
*/ + ASba .req v0 + ASbe .req v1 + ASbi .req v2 + ASbo .req v3 + ASbu .req v4 + ASga .req v5 + ASge .req v6 + ASgi .req v7 + ASgo .req v8 + ASgu .req v9 + ASka .req v10 + ASke .req v11 + ASki .req v12 + ASko .req v13 + ASku .req v14 + ASma .req v15 + ASme .req v16 + ASmi .req v17 + ASmo .req v18 + ASmu .req v19 + ASsa .req v20 + ASse .req v21 + ASsi .req v22 + ASso .req v23 + ASsu .req v24 + + /* q-form of the above mapping */ + ASbaq .req q0 + ASbeq .req q1 + ASbiq .req q2 + ASboq .req q3 + ASbuq .req q4 + ASgaq .req q5 + ASgeq .req q6 + ASgiq .req q7 + ASgoq .req q8 + ASguq .req q9 + ASkaq .req q10 + ASkeq .req q11 + ASkiq .req q12 + ASkoq .req q13 + ASkuq .req q14 + ASmaq .req q15 + ASmeq .req q16 + ASmiq .req q17 + ASmoq .req q18 + ASmuq .req q19 + ASsaq .req q20 + ASseq .req q21 + ASsiq .req q22 + ASsoq .req q23 + ASsuq .req q24 + + Ascratch0 .req v25 + Ascratch1 .req v26 + Ascratch2 .req v27 + Ascratch3 .req v28 + Ascratch4 .req v29 + Ascratch5 .req v30 + Ascratch6 .req v31 + + Ascratch0q .req q25 + Ascratch1q .req q26 + Ascratch2q .req q27 + Ascratch3q .req q28 + Ascratch4q .req q29 + Ascratch5q .req q30 + Ascratch6q .req q31 + +/************************ MACROS ****************************/ + +.macro load_input + ldp ASbaq, ASbeq, [input_addr, #(2*8*0)] + ldp ASbiq, ASboq, [input_addr, #(2*8*2)] + ldp ASbuq, ASgaq, [input_addr, #(2*8*4)] + ldp ASgeq, ASgiq, [input_addr, #(2*8*6)] + ldp ASgoq, ASguq, [input_addr, #(2*8*8)] + ldp ASkaq, ASkeq, [input_addr, #(2*8*10)] + ldp ASkiq, ASkoq, [input_addr, #(2*8*12)] + ldp ASkuq, ASmaq, [input_addr, #(2*8*14)] + ldp ASmeq, ASmiq, [input_addr, #(2*8*16)] + ldp ASmoq, ASmuq, [input_addr, #(2*8*18)] + ldp ASsaq, ASseq, [input_addr, #(2*8*20)] + ldp ASsiq, ASsoq, [input_addr, #(2*8*22)] + ldr ASsuq, [input_addr, #(2*8*24)] +.endm + +.macro store_input + str ASbaq, [input_addr, #(2*8*0)] + str ASbeq, [input_addr, #(2*8*1)] + str ASbiq, [input_addr, #(2*8*2)] + str ASboq, [input_addr, #(2*8*3)] + str ASbuq, [input_addr, 
#(2*8*4)] + str ASgaq, [input_addr, #(2*8*5)] + str ASgeq, [input_addr, #(2*8*6)] + str ASgiq, [input_addr, #(2*8*7)] + str ASgoq, [input_addr, #(2*8*8)] + str ASguq, [input_addr, #(2*8*9)] + str ASkaq, [input_addr, #(2*8*10)] + str ASkeq, [input_addr, #(2*8*11)] + str ASkiq, [input_addr, #(2*8*12)] + str ASkoq, [input_addr, #(2*8*13)] + str ASkuq, [input_addr, #(2*8*14)] + str ASmaq, [input_addr, #(2*8*15)] + str ASmeq, [input_addr, #(2*8*16)] + str ASmiq, [input_addr, #(2*8*17)] + str ASmoq, [input_addr, #(2*8*18)] + str ASmuq, [input_addr, #(2*8*19)] + str ASsaq, [input_addr, #(2*8*20)] + str ASseq, [input_addr, #(2*8*21)] + str ASsiq, [input_addr, #(2*8*22)] + str ASsoq, [input_addr, #(2*8*23)] + str ASsuq, [input_addr, #(2*8*24)] +.endm + +#define STACK_SIZE (16*4 + 16*30) +#define STACK_BASE_VREGS 0 +#define STACK_BASE_TMP 16*4 + +#define E0_offset 0 +#define E1_offset 1 +#define E2_offset 2 +#define E3_offset 3 +#define E4_offset 4 + +#define Aba_offset (5 + 0 ) +#define Abe_offset (5 + 1 ) +#define Abi_offset (5 + 2 ) +#define Abo_offset (5 + 3 ) +#define Abu_offset (5 + 4 ) +#define Aga_offset (5 + 5 ) +#define Age_offset (5 + 6 ) +#define Agi_offset (5 + 7 ) +#define Ago_offset (5 + 8 ) +#define Agu_offset (5 + 9 ) +#define Aka_offset (5 + 10 ) +#define Ake_offset (5 + 11 ) +#define Aki_offset (5 + 12 ) +#define Ako_offset (5 + 13 ) +#define Aku_offset (5 + 14 ) +#define Ama_offset (5 + 15 ) +#define Ame_offset (5 + 16 ) +#define Ami_offset (5 + 17 ) +#define Amo_offset (5 + 18 ) +#define Amu_offset (5 + 19 ) +#define Asa_offset (5 + 20 ) +#define Ase_offset (5 + 21 ) +#define Asi_offset (5 + 22 ) +#define Aso_offset (5 + 23 ) +#define Asu_offset (5 + 24 ) + +#define ba_offset (5 + 0 ) +#define be_offset (5 + 1 ) +#define bi_offset (5 + 2 ) +#define bo_offset (5 + 3 ) +#define bu_offset (5 + 4 ) +#define ga_offset (5 + 5 ) +#define ge_offset (5 + 6 ) +#define gi_offset (5 + 7 ) +#define go_offset (5 + 8 ) +#define gu_offset (5 + 9 ) +#define ka_offset (5 
+ 10 ) +#define ke_offset (5 + 11 ) +#define ki_offset (5 + 12 ) +#define ko_offset (5 + 13 ) +#define ku_offset (5 + 14 ) +#define ma_offset (5 + 15 ) +#define me_offset (5 + 16 ) +#define mi_offset (5 + 17 ) +#define mo_offset (5 + 18 ) +#define mu_offset (5 + 19 ) +#define sa_offset (5 + 20 ) +#define se_offset (5 + 21 ) +#define si_offset (5 + 22 ) +#define so_offset (5 + 23 ) +#define su_offset (5 + 24 ) + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +#define savep(reg, offset_prefix) \ + str reg, [sp, #(STACK_BASE_TMP + 16 * offset_prefix ## _offset)] +#define restorep(reg, offset_prefix) \ + ldr reg, [sp, #(STACK_BASE_TMP + 16 * offset_prefix ## _offset)] +#define save(name) savep(name ## q,name) +#define restore(name) restorep(name ## q,name) + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +/* Macros using v8.4-A SHA-3 instructions */ + +.macro eor3_m1_0 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m1_1 d s0 s1 s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro eor3_m1 d s0 s1 s2 + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +.macro rax1_m1 d s0 s1 + add tmp.2d, \s1\().2d, \s1\().2d + sri tmp.2d, \s1\().2d, #63 + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +.macro xar_m1 d s0 s1 imm + eor tmp.16b, \s0\().16b, \s1\().16b + shl \d\().2d, tmp.2d, #(64-\imm) + sri \d\().2d, tmp.2d, #(\imm) +.endm + +.macro bcax_m1 d s0 s1 s2 + bic tmp.16b, \s1\().16b, \s2\().16b + eor 
\d\().16b, tmp.16b, \s0\().16b +.endm + +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm + xar \d\().2d, \s0\().2d, \s1\().2d, #\imm +.endm + +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +#define CONCAT5(a,b,c,d,e) a ## b ## c ## d ## e +#define CONCAT4(a,b,c,d) a ## b ## c ## d + +#define OUT(x) \out\()S##x +#define IN(x) \in\()S##x +#define B(x) \in\()B##x +#define E(x) \in\()E##x +#define C(x) \in\()C##x +#define TMP_IN(x) \in\()scratch ## x +#define TMP_OUT(x) \out\()scratch ## x + +#define OUTq(x) \out\()S##x##q +#define INq(x) \in\()S##x##q +#define Bq(x) \in\()B##x##q +#define Eq(x) \in\()E##x##q +#define Cq(x) \in\()C##x##q +#define TMP_INq(x) \in\()scratch ## x ## q +#define TMP_OUTq(x) \out\()scratch ## x ## q + +.macro declare_mappings out, in + + C(0) .req TMP_IN(0) + C(1) .req TMP_IN(1) + C(2) .req TMP_IN(2) + C(3) .req TMP_IN(3) + C(4) .req TMP_IN(4) + + Cq(0) .req TMP_INq(0) + Cq(1) .req TMP_INq(1) + Cq(2) .req TMP_INq(2) + Cq(3) .req TMP_INq(3) + Cq(4) .req TMP_INq(4) + + E(2) .req TMP_IN(5) + E(4) .req C(3) + E(1) .req C(0) + E(3) .req C(2) + E(0) .req C(4) + + Eq(2) .req TMP_INq(5) + Eq(4) .req Cq(3) + Eq(1) .req Cq(0) + Eq(3) .req Cq(2) + Eq(0) .req Cq(4) + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + B(go) .req IN(me) + B(gi) .req IN(ka) + B(ga) .req IN(bo) + B(ge) .req IN(gu) + B(gu) .req IN(si) + B(ki) .req IN(ko) + B(ko) .req IN(mu) + B(ka) .req IN(be) + B(ke) .req IN(gi) + B(ku) .req IN(sa) + B(mu) .req IN(so) + B(mo) .req IN(mi) + B(mi) .req IN(ke) + B(ma) .req IN(bu) + B(me) .req IN(ga) + B(ba) .req IN(ba) + B(bi) .req IN(ki) + B(bo) .req IN(mo) + B(bu) .req IN(su) + B(be) .req IN(ge) + B(sa) .req IN(bi) + B(so) .req IN(ma) + B(se) .req IN(go) + B(si) .req IN(ku) + B(su) .req IN(se) + + Bq(go) .req INq(me) + Bq(gi) .req INq(ka) + Bq(ga) .req INq(bo) + Bq(ge) .req 
INq(gu) + Bq(gu) .req INq(si) + Bq(ki) .req INq(ko) + Bq(ko) .req INq(mu) + Bq(ka) .req INq(be) + Bq(ke) .req INq(gi) + Bq(ku) .req INq(sa) + Bq(mu) .req INq(so) + Bq(mo) .req INq(mi) + Bq(mi) .req INq(ke) + Bq(ma) .req INq(bu) + Bq(me) .req INq(ga) + Bq(ba) .req INq(ba) + Bq(bi) .req INq(ki) + Bq(bo) .req INq(mo) + Bq(bu) .req INq(su) + Bq(be) .req INq(ge) + Bq(sa) .req INq(bi) + Bq(so) .req INq(ma) + Bq(se) .req INq(go) + Bq(si) .req INq(ku) + Bq(su) .req INq(se) + + OUT(ga) .req TMP_IN(0) + OUT(ge) .req TMP_IN(1) + OUT(gi) .req B(gi) + OUT(go) .req B(go) + OUT(gu) .req B(gu) + OUT(ka) .req B(ga) + OUT(ke) .req B(ge) + OUT(ki) .req B(ki) + OUT(ko) .req B(ko) + OUT(ku) .req B(ku) + OUT(ma) .req B(ka) + OUT(me) .req B(ke) + OUT(mi) .req B(mi) + OUT(mo) .req B(mo) + OUT(mu) .req B(mu) + OUT(ba) .req B(ma) + OUT(be) .req B(me) + OUT(bi) .req B(bi) + OUT(bo) .req B(bo) + OUT(bu) .req B(bu) + OUT(sa) .req B(ba) + OUT(se) .req B(be) + OUT(si) .req B(si) + OUT(so) .req B(so) + OUT(su) .req B(su) + + OUTq(ga) .req TMP_INq(0) + OUTq(ge) .req TMP_INq(1) + OUTq(gi) .req Bq(gi) + OUTq(go) .req Bq(go) + OUTq(gu) .req Bq(gu) + OUTq(ka) .req Bq(ga) + OUTq(ke) .req Bq(ge) + OUTq(ki) .req Bq(ki) + OUTq(ko) .req Bq(ko) + OUTq(ku) .req Bq(ku) + OUTq(ma) .req Bq(ka) + OUTq(me) .req Bq(ke) + OUTq(mi) .req Bq(mi) + OUTq(mo) .req Bq(mo) + OUTq(mu) .req Bq(mu) + OUTq(ba) .req Bq(ma) + OUTq(be) .req Bq(me) + OUTq(bi) .req Bq(bi) + OUTq(bo) .req Bq(bo) + OUTq(bu) .req Bq(bu) + OUTq(sa) .req Bq(ba) + OUTq(se) .req Bq(be) + OUTq(si) .req Bq(si) + OUTq(so) .req Bq(so) + OUTq(su) .req Bq(su) + + TMP_OUT(0) .req B(sa) + TMP_OUT(1) .req B(se) + TMP_OUT(2) .req TMP_IN(2) + TMP_OUT(3) .req TMP_IN(3) + TMP_OUT(4) .req TMP_IN(4) + TMP_OUT(5) .req TMP_IN(5) + TMP_OUT(6) .req TMP_IN(6) + + TMP_OUTq(0) .req Bq(sa) + TMP_OUTq(1) .req Bq(se) + TMP_OUTq(2) .req TMP_INq(2) + TMP_OUTq(3) .req TMP_INq(3) + TMP_OUTq(4) .req TMP_INq(4) + TMP_OUTq(5) .req TMP_INq(5) + TMP_OUTq(6) .req TMP_INq(6) + + tmp .req v0 
+ .unreq tmp + tmp .req TMP_IN(6) +.endm + +.macro undeclare_mappings out, in + + .unreq C(0) + .unreq C(1) + .unreq C(2) + .unreq C(3) + .unreq C(4) + + .unreq Cq(0) + .unreq Cq(1) + .unreq Cq(2) + .unreq Cq(3) + .unreq Cq(4) + + .unreq E(2) + .unreq E(4) + .unreq E(1) + .unreq E(3) + .unreq E(0) + + .unreq Eq(2) + .unreq Eq(4) + .unreq Eq(1) + .unreq Eq(3) + .unreq Eq(0) + + .unreq B(go) + .unreq B(gi) + .unreq B(ga) + .unreq B(ge) + .unreq B(gu) + .unreq B(ki) + .unreq B(ko) + .unreq B(ka) + .unreq B(ke) + .unreq B(ku) + .unreq B(mu) + .unreq B(mo) + .unreq B(mi) + .unreq B(ma) + .unreq B(me) + .unreq B(ba) + .unreq B(bi) + .unreq B(bo) + .unreq B(bu) + .unreq B(be) + .unreq B(sa) + .unreq B(so) + .unreq B(se) + .unreq B(si) + .unreq B(su) + + .unreq Bq(go) + .unreq Bq(gi) + .unreq Bq(ga) + .unreq Bq(ge) + .unreq Bq(gu) + .unreq Bq(ki) + .unreq Bq(ko) + .unreq Bq(ka) + .unreq Bq(ke) + .unreq Bq(ku) + .unreq Bq(mu) + .unreq Bq(mo) + .unreq Bq(mi) + .unreq Bq(ma) + .unreq Bq(me) + .unreq Bq(ba) + .unreq Bq(bi) + .unreq Bq(bo) + .unreq Bq(bu) + .unreq Bq(be) + .unreq Bq(sa) + .unreq Bq(so) + .unreq Bq(se) + .unreq Bq(si) + .unreq Bq(su) + + .unreq OUT(ga) + .unreq OUT(ge) + .unreq OUT(gi) + .unreq OUT(go) + .unreq OUT(gu) + .unreq OUT(ka) + .unreq OUT(ke) + .unreq OUT(ki) + .unreq OUT(ko) + .unreq OUT(ku) + .unreq OUT(ma) + .unreq OUT(me) + .unreq OUT(mi) + .unreq OUT(mo) + .unreq OUT(mu) + .unreq OUT(ba) + .unreq OUT(be) + .unreq OUT(bi) + .unreq OUT(bo) + .unreq OUT(bu) + .unreq OUT(sa) + .unreq OUT(se) + .unreq OUT(si) + .unreq OUT(so) + .unreq OUT(su) + + .unreq OUTq(ga) + .unreq OUTq(ge) + .unreq OUTq(gi) + .unreq OUTq(go) + .unreq OUTq(gu) + .unreq OUTq(ka) + .unreq OUTq(ke) + .unreq OUTq(ki) + .unreq OUTq(ko) + .unreq OUTq(ku) + .unreq OUTq(ma) + .unreq OUTq(me) + .unreq OUTq(mi) + .unreq OUTq(mo) + .unreq OUTq(mu) + .unreq OUTq(ba) + .unreq OUTq(be) + .unreq OUTq(bi) + .unreq OUTq(bo) + .unreq OUTq(bu) + .unreq OUTq(sa) + .unreq OUTq(se) + .unreq OUTq(si) + 
.unreq OUTq(so) + .unreq OUTq(su) + + .unreq TMP_OUT(0) + .unreq TMP_OUT(1) + .unreq TMP_OUT(2) + .unreq TMP_OUT(3) + .unreq TMP_OUT(4) + .unreq TMP_OUT(5) + .unreq TMP_OUT(6) + + .unreq TMP_OUTq(0) + .unreq TMP_OUTq(1) + .unreq TMP_OUTq(2) + .unreq TMP_OUTq(3) + .unreq TMP_OUTq(4) + .unreq TMP_OUTq(5) + .unreq TMP_OUTq(6) + + .unreq tmp +.endm + +.macro keccak_f1600_round out, in + + eor3_m1 C(0), IN(ba), IN(ga), IN(ka) + eor3_m0 C(0), C(0), IN(ma), IN(sa) + eor3_m1 C(1), IN(be), IN(ge), IN(ke) + eor3_m0 C(1), C(1), IN(me), IN(se) + eor3_m1 C(2), IN(bi), IN(gi), IN(ki) + eor3_m0 C(2), C(2), IN(mi), IN(si) + eor3_m1 C(3), IN(bo), IN(go), IN(ko) + eor3_m0 C(3), C(3), IN(mo), IN(so) + eor3_m1 C(4), IN(bu), IN(gu), IN(ku) + eor3_m0 C(4), C(4), IN(mu), IN(su) + + rax1_m0 E(2), C(1), C(3) + rax1_m1 E(4), C(3), C(0) + rax1_m0 E(1), C(0), C(2) + rax1_m1 E(3), C(2), C(4) + rax1_m0 E(0), C(4), C(1) + + xar_m0 B(go), IN(me), E(1), 19 + xar_m1 B(gi), IN(ka), E(0), 61 + xar_m0 B(ga), IN(bo), E(3), 36 + xar_m1 B(ge), IN(gu), E(4), 44 + xar_m0 B(gu), IN(si), E(2), 3 + + xar_m1 B(ki), IN(ko), E(3), 39 + xar_m0 B(ko), IN(mu), E(4), 56 + xar_m1 B(ka), IN(be), E(1), 63 + xar_m0 B(ke), IN(gi), E(2), 58 + xar_m1 B(ku), IN(sa), E(0), 46 + + xar_m0 B(mu), IN(so), E(3), 8 + xar_m1 B(mo), IN(mi), E(2), 49 + xar_m0 B(mi), IN(ke), E(1), 54 + xar_m1 B(ma), IN(bu), E(4), 37 + xar_m0 B(me), IN(ga), E(0), 28 + + eor2 B(ba), IN(ba), E(0) + xar_m1 B(bi), IN(ki), E(2), 21 + xar_m0 B(bo), IN(mo), E(3), 43 + xar_m1 B(bu), IN(su), E(4), 50 + xar_m0 B(be), IN(ge), E(1), 20 + + xar_m1 B(sa), IN(bi), E(2), 2 + xar_m0 B(so), IN(ma), E(0), 23 + xar_m1 B(se), IN(go), E(3), 9 + xar_m0 B(si), IN(ku), E(4), 25 + xar_m1 B(su), IN(se), E(1), 62 + + bcax_m0 OUT(ga), B(ga), B(gi), B(ge) + bcax_m1 OUT(ge), B(ge), B(go), B(gi) + bcax_m0 OUT(gi), B(gi), B(gu), B(go) + bcax_m1 OUT(go), B(go), B(ga), B(gu) + bcax_m0 OUT(gu), B(gu), B(ge), B(ga) + + bcax_m1 OUT(ka), B(ka), B(ki), B(ke) + bcax_m0 OUT(ke), B(ke), B(ko), 
B(ki) + bcax_m1 OUT(ki), B(ki), B(ku), B(ko) + bcax_m0 OUT(ko), B(ko), B(ka), B(ku) + bcax_m1 OUT(ku), B(ku), B(ke), B(ka) + + bcax_m1 OUT(ma), B(ma), B(mi), B(me) + bcax_m0 OUT(me), B(me), B(mo), B(mi) + bcax_m1 OUT(mi), B(mi), B(mu), B(mo) + bcax_m0 OUT(mo), B(mo), B(ma), B(mu) + bcax_m1 OUT(mu), B(mu), B(me), B(ma) + + bcax_m0 OUT(ba), B(ba), B(bi), B(be) + bcax_m1 OUT(be), B(be), B(bo), B(bi) + bcax_m0 OUT(bi), B(bi), B(bu), B(bo) + bcax_m1 OUT(bo), B(bo), B(ba), B(bu) + bcax_m0 OUT(bu), B(bu), B(be), B(ba) + + bcax_m1 OUT(sa), B(sa), B(si), B(se) + bcax_m0 OUT(se), B(se), B(so), B(si) + bcax_m1 OUT(si), B(si), B(su), B(so) + bcax_m0 OUT(so), B(so), B(sa), B(su) + bcax_m1 OUT(su), B(su), B(se), B(sa) + + ld1r {tmp.2d}, [const_addr], #8 + eor OUT(ba).16b, OUT(ba).16b, tmp.16b +.endm + +.macro transfer_state out, in + + savep(INq(ga),ga) + savep(INq(ge),ge) + savep(INq(gi),gi) + savep(INq(go),go) + savep(INq(gu),gu) + savep(INq(ka),ka) + savep(INq(ke),ke) + savep(INq(ki),ki) + savep(INq(ko),ko) + savep(INq(ku),ku) + savep(INq(ma),ma) + savep(INq(me),me) + savep(INq(mi),mi) + savep(INq(mo),mo) + savep(INq(mu),mu) + savep(INq(ba),ba) + savep(INq(be),be) + savep(INq(bi),bi) + savep(INq(bo),bo) + savep(INq(bu),bu) + savep(INq(sa),sa) + savep(INq(se),se) + savep(INq(si),si) + savep(INq(so),so) + savep(INq(su),su) + + restorep(OUTq(ga),ga) + restorep(OUTq(ge),ge) + restorep(OUTq(gi),gi) + restorep(OUTq(go),go) + restorep(OUTq(gu),gu) + restorep(OUTq(ka),ka) + restorep(OUTq(ke),ke) + restorep(OUTq(ki),ki) + restorep(OUTq(ko),ko) + restorep(OUTq(ku),ku) + restorep(OUTq(ma),ma) + restorep(OUTq(me),me) + restorep(OUTq(mi),mi) + restorep(OUTq(mo),mo) + restorep(OUTq(mu),mu) + restorep(OUTq(ba),ba) + restorep(OUTq(be),be) + restorep(OUTq(bi),bi) + restorep(OUTq(bo),bo) + restorep(OUTq(bu),bu) + restorep(OUTq(sa),sa) + restorep(OUTq(se),se) + restorep(OUTq(si),si) + restorep(OUTq(so),so) + restorep(OUTq(su),su) + +.endm + +.text +.align 4 +.global 
keccak_f1600_x2_hybrid_asm_v2p0 +.global _keccak_f1600_x2_hybrid_asm_v2p0 + +#define KECCAK_F1600_ROUNDS 24 + +keccak_f1600_x2_hybrid_asm_v2p0: +_keccak_f1600_x2_hybrid_asm_v2p0: + alloc_stack /* reserve STACK_SIZE bytes below sp */ + save_vregs /* store d8-d15 at the bottom of the new frame */ + load_constant_ptr /* defined in macros.s; presumably points const_addr at round_constants (TODO confirm) */ + load_input /* load the 25 q-sized state lanes from [input_addr] */ + + mov count, #24 /* rounds still to run; every pass of the loop below performs four */ + +loop: + declare_mappings A1, A + keccak_f1600_round A1, A + + declare_mappings A2, A1 + keccak_f1600_round A2, A1 + + declare_mappings A3, A2 + keccak_f1600_round A3, A2 + + declare_mappings A4, A3 + keccak_f1600_round A4, A3 + + transfer_state A, A4 /* spill the state under its A4 register names and reload it under the A names, so the next pass starts from the initial mapping */ + undeclare_mappings A4, A + + sub count, count, #4 + cbnz count, loop + + store_input /* write the 25 state lanes back to [input_addr] */ + restore_vregs /* reload d8-d15 */ + free_stack + ret + + #endif diff --git a/tests/keccak_neon/manual/keccak_f1600_x2_hybrid_asm_v2p1.s b/tests/keccak_neon/manual/keccak_f1600_x2_hybrid_asm_v2p1.s new file mode 100644 index 0000000..8fbb78c --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x2_hybrid_asm_v2p1.s @@ -0,0 +1,880 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + + +#if defined(__ARM_FEATURE_SHA3) +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: /* table of 24 quadwords plus one trailing zero quadword; presumably the Keccak-f1600 round constants in round order (TODO confirm how load_constant_ptr in macros.s uses this label) */ + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + .quad 0x0 /* NOTE(review): 25th entry; the 24 unrolled rounds visible in this file each read one entry, so the purpose of this one is not evident here */ + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 /* base address of the state buffer that load_input reads and store_input writes */ + const_addr .req x1 /* advanced by 8 bytes per round by the ld1r in keccak_f1600_round; presumably initialised by load_constant_ptr (macros.s) */ + count .req x2 + cur_const .req x3 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. 
*/ + ASba .req v0 + ASbe .req v1 + ASbi .req v2 + ASbo .req v3 + ASbu .req v4 + ASga .req v5 + ASge .req v6 + ASgi .req v7 + ASgo .req v8 + ASgu .req v9 + ASka .req v10 + ASke .req v11 + ASki .req v12 + ASko .req v13 + ASku .req v14 + ASma .req v15 + ASme .req v16 + ASmi .req v17 + ASmo .req v18 + ASmu .req v19 + ASsa .req v20 + ASse .req v21 + ASsi .req v22 + ASso .req v23 + ASsu .req v24 + + /* q-form of the above mapping */ + ASbaq .req q0 + ASbeq .req q1 + ASbiq .req q2 + ASboq .req q3 + ASbuq .req q4 + ASgaq .req q5 + ASgeq .req q6 + ASgiq .req q7 + ASgoq .req q8 + ASguq .req q9 + ASkaq .req q10 + ASkeq .req q11 + ASkiq .req q12 + ASkoq .req q13 + ASkuq .req q14 + ASmaq .req q15 + ASmeq .req q16 + ASmiq .req q17 + ASmoq .req q18 + ASmuq .req q19 + ASsaq .req q20 + ASseq .req q21 + ASsiq .req q22 + ASsoq .req q23 + ASsuq .req q24 + + Ascratch0 .req v25 + Ascratch1 .req v26 + Ascratch2 .req v27 + Ascratch3 .req v28 + Ascratch4 .req v29 + Ascratch5 .req v30 + Ascratch6 .req v31 + + Ascratch0q .req q25 + Ascratch1q .req q26 + Ascratch2q .req q27 + Ascratch3q .req q28 + Ascratch4q .req q29 + Ascratch5q .req q30 + Ascratch6q .req q31 + +/************************ MACROS ****************************/ + +.macro load_input + ldp ASbaq, ASbeq, [input_addr, #(2*8*0)] + ldp ASbiq, ASboq, [input_addr, #(2*8*2)] + ldp ASbuq, ASgaq, [input_addr, #(2*8*4)] + ldp ASgeq, ASgiq, [input_addr, #(2*8*6)] + ldp ASgoq, ASguq, [input_addr, #(2*8*8)] + ldp ASkaq, ASkeq, [input_addr, #(2*8*10)] + ldp ASkiq, ASkoq, [input_addr, #(2*8*12)] + ldp ASkuq, ASmaq, [input_addr, #(2*8*14)] + ldp ASmeq, ASmiq, [input_addr, #(2*8*16)] + ldp ASmoq, ASmuq, [input_addr, #(2*8*18)] + ldp ASsaq, ASseq, [input_addr, #(2*8*20)] + ldp ASsiq, ASsoq, [input_addr, #(2*8*22)] + ldr ASsuq, [input_addr, #(2*8*24)] +.endm + +.macro store_input in + str \in\()Sbaq, [input_addr, #(2*8*0)] + str \in\()Sbeq, [input_addr, #(2*8*1)] + str \in\()Sbiq, [input_addr, #(2*8*2)] + str \in\()Sboq, [input_addr, #(2*8*3)] + str 
\in\()Sbuq, [input_addr, #(2*8*4)] + str \in\()Sgaq, [input_addr, #(2*8*5)] + str \in\()Sgeq, [input_addr, #(2*8*6)] + str \in\()Sgiq, [input_addr, #(2*8*7)] + str \in\()Sgoq, [input_addr, #(2*8*8)] + str \in\()Sguq, [input_addr, #(2*8*9)] + str \in\()Skaq, [input_addr, #(2*8*10)] + str \in\()Skeq, [input_addr, #(2*8*11)] + str \in\()Skiq, [input_addr, #(2*8*12)] + str \in\()Skoq, [input_addr, #(2*8*13)] + str \in\()Skuq, [input_addr, #(2*8*14)] + str \in\()Smaq, [input_addr, #(2*8*15)] + str \in\()Smeq, [input_addr, #(2*8*16)] + str \in\()Smiq, [input_addr, #(2*8*17)] + str \in\()Smoq, [input_addr, #(2*8*18)] + str \in\()Smuq, [input_addr, #(2*8*19)] + str \in\()Ssaq, [input_addr, #(2*8*20)] + str \in\()Sseq, [input_addr, #(2*8*21)] + str \in\()Ssiq, [input_addr, #(2*8*22)] + str \in\()Ssoq, [input_addr, #(2*8*23)] + str \in\()Ssuq, [input_addr, #(2*8*24)] +.endm + +#define STACK_SIZE (16*4 + 16*30) +#define STACK_BASE_VREGS 0 +#define STACK_BASE_TMP 16*4 + +#define E0_offset 0 +#define E1_offset 1 +#define E2_offset 2 +#define E3_offset 3 +#define E4_offset 4 + +#define Aba_offset (5 + 0 ) +#define Abe_offset (5 + 1 ) +#define Abi_offset (5 + 2 ) +#define Abo_offset (5 + 3 ) +#define Abu_offset (5 + 4 ) +#define Aga_offset (5 + 5 ) +#define Age_offset (5 + 6 ) +#define Agi_offset (5 + 7 ) +#define Ago_offset (5 + 8 ) +#define Agu_offset (5 + 9 ) +#define Aka_offset (5 + 10 ) +#define Ake_offset (5 + 11 ) +#define Aki_offset (5 + 12 ) +#define Ako_offset (5 + 13 ) +#define Aku_offset (5 + 14 ) +#define Ama_offset (5 + 15 ) +#define Ame_offset (5 + 16 ) +#define Ami_offset (5 + 17 ) +#define Amo_offset (5 + 18 ) +#define Amu_offset (5 + 19 ) +#define Asa_offset (5 + 20 ) +#define Ase_offset (5 + 21 ) +#define Asi_offset (5 + 22 ) +#define Aso_offset (5 + 23 ) +#define Asu_offset (5 + 24 ) + +#define ba_offset (5 + 0 ) +#define be_offset (5 + 1 ) +#define bi_offset (5 + 2 ) +#define bo_offset (5 + 3 ) +#define bu_offset (5 + 4 ) +#define ga_offset (5 + 5 ) +#define 
ge_offset (5 + 6 ) +#define gi_offset (5 + 7 ) +#define go_offset (5 + 8 ) +#define gu_offset (5 + 9 ) +#define ka_offset (5 + 10 ) +#define ke_offset (5 + 11 ) +#define ki_offset (5 + 12 ) +#define ko_offset (5 + 13 ) +#define ku_offset (5 + 14 ) +#define ma_offset (5 + 15 ) +#define me_offset (5 + 16 ) +#define mi_offset (5 + 17 ) +#define mo_offset (5 + 18 ) +#define mu_offset (5 + 19 ) +#define sa_offset (5 + 20 ) +#define se_offset (5 + 21 ) +#define si_offset (5 + 22 ) +#define so_offset (5 + 23 ) +#define su_offset (5 + 24 ) + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +#define savep(reg, offset_prefix) \ + str reg, [sp, #(STACK_BASE_TMP + 16 * offset_prefix ## _offset)] +#define restorep(reg, offset_prefix) \ + ldr reg, [sp, #(STACK_BASE_TMP + 16 * offset_prefix ## _offset)] +#define save(name) savep(name ## q,name) +#define restore(name) restorep(name ## q,name) + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +/* Macros using v8.4-A SHA-3 instructions */ + +.macro eor3_m1_0 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m1_1 d s0 s1 s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro eor3_m1 d s0 s1 s2 + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +.macro rax1_m1 d s0 s1 + add tmp.2d, \s1\().2d, \s1\().2d + sri tmp.2d, \s1\().2d, #63 + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +.macro xar_m1 d s0 s1 imm + eor tmp.16b, \s0\().16b, \s1\().16b + shl \d\().2d, tmp.2d, 
#(64-\imm) + sri \d\().2d, tmp.2d, #(\imm) +.endm + +.macro bcax_m1 d s0 s1 s2 + bic tmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm + xar \d\().2d, \s0\().2d, \s1\().2d, #\imm +.endm + +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +#define CONCAT5(a,b,c,d,e) a ## b ## c ## d ## e +#define CONCAT4(a,b,c,d) a ## b ## c ## d + +#define OUT(x) \out\()S##x +#define IN(x) \in\()S##x +#define B(x) \in\()B##x +#define E(x) \in\()E##x +#define C(x) \in\()C##x +#define Cnext(x) \out\()C##x +#define TMP_IN(x) \in\()scratch ## x +#define TMP_OUT(x) \out\()scratch ## x + +#define OUTq(x) \out\()S##x##q +#define INq(x) \in\()S##x##q +#define Bq(x) \in\()B##x##q +#define Eq(x) \in\()E##x##q +#define Cq(x) \in\()C##x##q +#define Cnextq(x) \out\()C##x##q +#define TMP_INq(x) \in\()scratch ## x ## q +#define TMP_OUTq(x) \out\()scratch ## x ## q + +.macro declare_mappings out, in + + C(0) .req TMP_IN(0) + C(1) .req TMP_IN(1) + C(2) .req TMP_IN(2) + C(3) .req TMP_IN(3) + C(4) .req TMP_IN(4) + + Cq(0) .req TMP_INq(0) + Cq(1) .req TMP_INq(1) + Cq(2) .req TMP_INq(2) + Cq(3) .req TMP_INq(3) + Cq(4) .req TMP_INq(4) + + E(1) .req TMP_IN(5) + E(3) .req C(2) + E(0) .req C(4) + E(2) .req C(1) + E(4) .req C(3) + + Eq(1) .req TMP_INq(5) + Eq(3) .req Cq(2) + Eq(0) .req Cq(4) + Eq(2) .req Cq(1) + Eq(4) .req Cq(3) + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + B(go) .req IN(me) + B(gi) .req IN(ka) + B(ga) .req IN(bo) + B(ge) .req IN(gu) + B(gu) .req IN(si) + B(ki) .req IN(ko) + B(ko) .req IN(mu) + B(ka) .req IN(be) + B(ke) .req IN(gi) + B(ku) .req IN(sa) + B(mu) .req IN(so) + B(mo) .req IN(mi) + B(mi) .req IN(ke) + B(ma) .req IN(bu) + B(me) .req IN(ga) + B(ba) .req IN(ba) + B(bi) .req IN(ki) + B(bo) .req IN(mo) + B(bu) .req IN(su) + B(be) .req IN(ge) 
+ B(sa) .req IN(bi) + B(so) .req IN(ma) + B(se) .req IN(go) + B(si) .req IN(ku) + B(su) .req IN(se) + + Bq(go) .req INq(me) + Bq(gi) .req INq(ka) + Bq(ga) .req INq(bo) + Bq(ge) .req INq(gu) + Bq(gu) .req INq(si) + Bq(ki) .req INq(ko) + Bq(ko) .req INq(mu) + Bq(ka) .req INq(be) + Bq(ke) .req INq(gi) + Bq(ku) .req INq(sa) + Bq(mu) .req INq(so) + Bq(mo) .req INq(mi) + Bq(mi) .req INq(ke) + Bq(ma) .req INq(bu) + Bq(me) .req INq(ga) + Bq(ba) .req INq(ba) + Bq(bi) .req INq(ki) + Bq(bo) .req INq(mo) + Bq(bu) .req INq(su) + Bq(be) .req INq(ge) + Bq(sa) .req INq(bi) + Bq(so) .req INq(ma) + Bq(se) .req INq(go) + Bq(si) .req INq(ku) + Bq(su) .req INq(se) + + OUT(ba) .req TMP_IN(0) + OUT(be) .req TMP_IN(5) + OUT(bi) .req B(bi) + OUT(bo) .req B(bo) + OUT(bu) .req B(bu) + OUT(ga) .req B(ba) + OUT(ge) .req B(be) + OUT(gi) .req B(gi) + OUT(go) .req B(go) + OUT(gu) .req B(gu) + OUT(ka) .req B(ga) + OUT(ke) .req B(ge) + OUT(ki) .req B(ki) + OUT(ko) .req B(ko) + OUT(ku) .req B(ku) + OUT(ma) .req B(ka) + OUT(me) .req B(ke) + OUT(mi) .req B(mi) + OUT(mo) .req B(mo) + OUT(mu) .req B(mu) + OUT(sa) .req B(ma) + OUT(se) .req B(me) + OUT(si) .req B(si) + OUT(so) .req B(so) + OUT(su) .req B(su) + + OUTq(ba) .req TMP_INq(0) + OUTq(be) .req TMP_INq(5) + OUTq(bi) .req Bq(bi) + OUTq(bo) .req Bq(bo) + OUTq(bu) .req Bq(bu) + OUTq(ga) .req Bq(ba) + OUTq(ge) .req Bq(be) + OUTq(gi) .req Bq(gi) + OUTq(go) .req Bq(go) + OUTq(gu) .req Bq(gu) + OUTq(ka) .req Bq(ga) + OUTq(ke) .req Bq(ge) + OUTq(ki) .req Bq(ki) + OUTq(ko) .req Bq(ko) + OUTq(ku) .req Bq(ku) + OUTq(ma) .req Bq(ka) + OUTq(me) .req Bq(ke) + OUTq(mi) .req Bq(mi) + OUTq(mo) .req Bq(mo) + OUTq(mu) .req Bq(mu) + OUTq(sa) .req Bq(ma) + OUTq(se) .req Bq(me) + OUTq(si) .req Bq(si) + OUTq(so) .req Bq(so) + OUTq(su) .req Bq(su) + + TMP_OUT(0) .req B(sa) + TMP_OUT(1) .req B(se) + TMP_OUT(2) .req TMP_IN(1) + TMP_OUT(3) .req TMP_IN(2) + TMP_OUT(4) .req TMP_IN(3) + TMP_OUT(5) .req TMP_IN(4) + TMP_OUT(6) .req TMP_IN(6) + + TMP_OUTq(0) .req Bq(sa) + 
TMP_OUTq(1) .req Bq(se) + TMP_OUTq(2) .req TMP_INq(1) + TMP_OUTq(3) .req TMP_INq(2) + TMP_OUTq(4) .req TMP_INq(3) + TMP_OUTq(5) .req TMP_INq(4) + TMP_OUTq(6) .req TMP_INq(6) + + Cnext(0) .req TMP_OUT(0) + Cnext(1) .req TMP_OUT(1) + Cnext(2) .req TMP_OUT(2) + Cnext(3) .req TMP_OUT(3) + Cnext(4) .req TMP_OUT(4) + + Cnextq(0) .req TMP_OUTq(0) + Cnextq(1) .req TMP_OUTq(1) + Cnextq(2) .req TMP_OUTq(2) + Cnextq(3) .req TMP_OUTq(3) + Cnextq(4) .req TMP_OUTq(4) + + tmp .req v0 + .unreq tmp + tmp .req TMP_IN(6) +.endm + +.macro undeclare_mappings out, in + + .unreq C(0) + .unreq C(1) + .unreq C(2) + .unreq C(3) + .unreq C(4) + + .unreq Cq(0) + .unreq Cq(1) + .unreq Cq(2) + .unreq Cq(3) + .unreq Cq(4) + + .unreq E(2) + .unreq E(4) + .unreq E(1) + .unreq E(3) + .unreq E(0) + + .unreq Eq(2) + .unreq Eq(4) + .unreq Eq(1) + .unreq Eq(3) + .unreq Eq(0) + + .unreq B(go) + .unreq B(gi) + .unreq B(ga) + .unreq B(ge) + .unreq B(gu) + .unreq B(ki) + .unreq B(ko) + .unreq B(ka) + .unreq B(ke) + .unreq B(ku) + .unreq B(mu) + .unreq B(mo) + .unreq B(mi) + .unreq B(ma) + .unreq B(me) + .unreq B(ba) + .unreq B(bi) + .unreq B(bo) + .unreq B(bu) + .unreq B(be) + .unreq B(sa) + .unreq B(so) + .unreq B(se) + .unreq B(si) + .unreq B(su) + + .unreq Bq(go) + .unreq Bq(gi) + .unreq Bq(ga) + .unreq Bq(ge) + .unreq Bq(gu) + .unreq Bq(ki) + .unreq Bq(ko) + .unreq Bq(ka) + .unreq Bq(ke) + .unreq Bq(ku) + .unreq Bq(mu) + .unreq Bq(mo) + .unreq Bq(mi) + .unreq Bq(ma) + .unreq Bq(me) + .unreq Bq(ba) + .unreq Bq(bi) + .unreq Bq(bo) + .unreq Bq(bu) + .unreq Bq(be) + .unreq Bq(sa) + .unreq Bq(so) + .unreq Bq(se) + .unreq Bq(si) + .unreq Bq(su) + + .unreq OUT(ga) + .unreq OUT(ge) + .unreq OUT(gi) + .unreq OUT(go) + .unreq OUT(gu) + .unreq OUT(ka) + .unreq OUT(ke) + .unreq OUT(ki) + .unreq OUT(ko) + .unreq OUT(ku) + .unreq OUT(ma) + .unreq OUT(me) + .unreq OUT(mi) + .unreq OUT(mo) + .unreq OUT(mu) + .unreq OUT(ba) + .unreq OUT(be) + .unreq OUT(bi) + .unreq OUT(bo) + .unreq OUT(bu) + .unreq OUT(sa) + .unreq 
OUT(se) + .unreq OUT(si) + .unreq OUT(so) + .unreq OUT(su) + + .unreq OUTq(ga) + .unreq OUTq(ge) + .unreq OUTq(gi) + .unreq OUTq(go) + .unreq OUTq(gu) + .unreq OUTq(ka) + .unreq OUTq(ke) + .unreq OUTq(ki) + .unreq OUTq(ko) + .unreq OUTq(ku) + .unreq OUTq(ma) + .unreq OUTq(me) + .unreq OUTq(mi) + .unreq OUTq(mo) + .unreq OUTq(mu) + .unreq OUTq(ba) + .unreq OUTq(be) + .unreq OUTq(bi) + .unreq OUTq(bo) + .unreq OUTq(bu) + .unreq OUTq(sa) + .unreq OUTq(se) + .unreq OUTq(si) + .unreq OUTq(so) + .unreq OUTq(su) + + .unreq TMP_OUT(0) + .unreq TMP_OUT(1) + .unreq TMP_OUT(2) + .unreq TMP_OUT(3) + .unreq TMP_OUT(4) + .unreq TMP_OUT(5) + .unreq TMP_OUT(6) + + .unreq TMP_OUTq(0) + .unreq TMP_OUTq(1) + .unreq TMP_OUTq(2) + .unreq TMP_OUTq(3) + .unreq TMP_OUTq(4) + .unreq TMP_OUTq(5) + .unreq TMP_OUTq(6) + + .unreq tmp +.endm + +.macro keccak_f1600_round out, in + + eor3_m0 C(0), IN(ba), IN(ga), IN(ka) + eor3_m1 C(3), IN(bo), IN(go), IN(ko) + eor3_m0 C(2), IN(bi), IN(gi), IN(ki) + eor3_m1 C(1), IN(be), IN(ge), IN(ke) + eor3_m0 C(0), C(0), IN(ma), IN(sa) + eor3_m1 C(3), C(3), IN(mo), IN(so) + eor3_m0 C(2), C(2), IN(mi), IN(si) + eor3_m1 C(1), C(1), IN(me), IN(se) + eor3_m0 C(4), IN(bu), IN(gu), IN(ku) + rax1_m0 E(1), C(0), C(2) + xar_m1 B(mi), IN(ke), E(1), 54 + eor3_m0 C(4), C(4), IN(mu), IN(su) + xar_m1 B(go), IN(me), E(1), 19 + rax1_m0 E(3), C(2), C(4) + xar_m1 B(ka), IN(be), E(1), 63 + rax1_m0 E(0), C(4), C(1) + xar_m1 B(be), IN(ge), E(1), 20 + rax1_m0 E(2), C(1), C(3) + xar_m1 B(su), IN(se), E(1), 62 + rax1_m0 E(4), C(3), C(0) + + // TODO: * Interleave (fast) v8.4-A based 5-block with (slow) v8-A based 5-block, + // and then pull forward BCAX for the v8.4-A block + // * Handle XAR's for a fixed E(?) first, so that the remaining E(?)'s + // can be computed in parallel? 
+ + eor2 B(ba), IN(ba), E(0) + xar_m1 B(ga), IN(bo), E(3), 36 + xar_m0 B(bi), IN(ki), E(2), 21 + xar_m1 B(ge), IN(gu), E(4), 44 + xar_m0 B(bo), IN(mo), E(3), 43 + xar_m1 B(gi), IN(ka), E(0), 61 + xar_m0 B(bu), IN(su), E(4), 50 + xar_m1 B(gu), IN(si), E(2), 3 + + xar_m0 B(ke), IN(gi), E(2), 58 + xar_m0 B(ki), IN(ko), E(3), 39 + bcax_m1 OUT(ba), B(ba), B(bi), B(be) + bcax_m1 OUT(be), B(be), B(bo), B(bi) + xar_m0 B(ko), IN(mu), E(4), 56 + xar_m0 B(ku), IN(sa), E(0), 46 + bcax_m1 OUT(bi), B(bi), B(bu), B(bo) + bcax_m1 OUT(bo), B(bo), B(ba), B(bu) + + xar_m0 B(ma), IN(bu), E(4), 37 + xar_m0 B(me), IN(ga), E(0), 28 + bcax_m1 OUT(bu), B(bu), B(be), B(ba) + bcax_m1 OUT(ga), B(ga), B(gi), B(ge) + xar_m0 B(mo), IN(mi), E(2), 49 + xar_m0 B(mu), IN(so), E(3), 8 + bcax_m1 OUT(ge), B(ge), B(go), B(gi) + bcax_m1 OUT(gi), B(gi), B(gu), B(go) + + ld1r {tmp.2d}, [const_addr], #8 + eor OUT(ba).16b, OUT(ba).16b, tmp.16b + + xar_m0 B(sa), IN(bi), E(2), 2 + bcax_m1 OUT(go), B(go), B(ga), B(gu) + xar_m0 B(se), IN(go), E(3), 9 + bcax_m1 OUT(gu), B(gu), B(ge), B(ga) + bcax_m1 OUT(ka), B(ka), B(ki), B(ke) + xar_m0 B(si), IN(ku), E(4), 25 + bcax_m1 OUT(ke), B(ke), B(ko), B(ki) + bcax_m1 OUT(ki), B(ki), B(ku), B(ko) + xar_m0 B(so), IN(ma), E(0), 23 + bcax_m1 OUT(ko), B(ko), B(ka), B(ku) + bcax_m1 OUT(ku), B(ku), B(ke), B(ka) + + bcax_m0 OUT(ma), B(ma), B(mi), B(me) + bcax_m1 OUT(me), B(me), B(mo), B(mi) + bcax_m1 OUT(mi), B(mi), B(mu), B(mo) + bcax_m0 OUT(mo), B(mo), B(ma), B(mu) + bcax_m1 OUT(mu), B(mu), B(me), B(ma) + + bcax_m0 OUT(sa), B(sa), B(si), B(se) + bcax_m1 OUT(se), B(se), B(so), B(si) + bcax_m1 OUT(si), B(si), B(su), B(so) + bcax_m0 OUT(so), B(so), B(sa), B(su) + bcax_m1 OUT(su), B(su), B(se), B(sa) + +.endm + +.macro transfer_state out, in + + savep(INq(ga),ga) + savep(INq(ge),ge) + savep(INq(gi),gi) + savep(INq(go),go) + savep(INq(gu),gu) + savep(INq(ka),ka) + savep(INq(ke),ke) + savep(INq(ki),ki) + savep(INq(ko),ko) + savep(INq(ku),ku) + savep(INq(ma),ma) + savep(INq(me),me) + 
savep(INq(mi),mi) + savep(INq(mo),mo) + savep(INq(mu),mu) + savep(INq(ba),ba) + savep(INq(be),be) + savep(INq(bi),bi) + savep(INq(bo),bo) + savep(INq(bu),bu) + savep(INq(sa),sa) + savep(INq(se),se) + savep(INq(si),si) + savep(INq(so),so) + savep(INq(su),su) + + restorep(OUTq(ga),ga) + restorep(OUTq(ge),ge) + restorep(OUTq(gi),gi) + restorep(OUTq(go),go) + restorep(OUTq(gu),gu) + restorep(OUTq(ka),ka) + restorep(OUTq(ke),ke) + restorep(OUTq(ki),ki) + restorep(OUTq(ko),ko) + restorep(OUTq(ku),ku) + restorep(OUTq(ma),ma) + restorep(OUTq(me),me) + restorep(OUTq(mi),mi) + restorep(OUTq(mo),mo) + restorep(OUTq(mu),mu) + restorep(OUTq(ba),ba) + restorep(OUTq(be),be) + restorep(OUTq(bi),bi) + restorep(OUTq(bo),bo) + restorep(OUTq(bu),bu) + restorep(OUTq(sa),sa) + restorep(OUTq(se),se) + restorep(OUTq(si),si) + restorep(OUTq(so),so) + restorep(OUTq(su),su) + +.endm + +.text +.align 4 +.global keccak_f1600_x2_hybrid_asm_v2p1 +.global _keccak_f1600_x2_hybrid_asm_v2p1 + +#define KECCAK_F1600_ROUNDS 24 + +keccak_f1600_x2_hybrid_asm_v2p1: +_keccak_f1600_x2_hybrid_asm_v2p1: + alloc_stack + save_vregs + load_constant_ptr + load_input + + /* NOTE: Unrolling the whole loop isn't really practical, but for now + * this is just for the sake of understanding the theoretical performance + * uplift of the present approach. 
*/ + + declare_mappings A1, A + keccak_f1600_round A1, A + declare_mappings A2, A1 + keccak_f1600_round A2, A1 + declare_mappings A3, A2 + keccak_f1600_round A3, A2 + declare_mappings A4, A3 + keccak_f1600_round A4, A3 + declare_mappings A5, A4 + keccak_f1600_round A5, A4 + declare_mappings A6, A5 + keccak_f1600_round A6, A5 + declare_mappings A7, A6 + keccak_f1600_round A7, A6 + declare_mappings A8, A7 + keccak_f1600_round A8, A7 + + declare_mappings A9, A8 + keccak_f1600_round A9, A8 + declare_mappings A10, A9 + keccak_f1600_round A10, A9 + declare_mappings A11, A10 + keccak_f1600_round A11, A10 + declare_mappings A12, A11 + keccak_f1600_round A12, A11 + declare_mappings A13, A12 + keccak_f1600_round A13, A12 + declare_mappings A14, A13 + keccak_f1600_round A14, A13 + declare_mappings A15, A14 + keccak_f1600_round A15, A14 + declare_mappings A16, A15 + keccak_f1600_round A16, A15 + + declare_mappings A17, A16 + keccak_f1600_round A17, A16 + declare_mappings A18, A17 + keccak_f1600_round A18, A17 + declare_mappings A19, A18 + keccak_f1600_round A19, A18 + declare_mappings A20, A19 + keccak_f1600_round A20, A19 + declare_mappings A21, A20 + keccak_f1600_round A21, A20 + declare_mappings A22, A21 + keccak_f1600_round A22, A21 + declare_mappings A23, A22 + keccak_f1600_round A23, A22 + declare_mappings A24, A23 + keccak_f1600_round A24, A23 + + store_input A24 + restore_vregs + free_stack + ret + +#endif \ No newline at end of file diff --git a/tests/keccak_neon/manual/keccak_f1600_x2_hybrid_asm_v2p2.s b/tests/keccak_neon/manual/keccak_f1600_x2_hybrid_asm_v2p2.s new file mode 100644 index 0000000..fa64540 --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x2_hybrid_asm_v2p2.s @@ -0,0 +1,971 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the 
"Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +#if defined(__ARM_FEATURE_SHA3) + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + .quad 0x0 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* 
Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. */ + ASba .req v0 + ASbe .req v1 + ASbi .req v2 + ASbo .req v3 + ASbu .req v4 + ASga .req v5 + ASge .req v6 + ASgi .req v7 + ASgo .req v8 + ASgu .req v9 + ASka .req v10 + ASke .req v11 + ASki .req v12 + ASko .req v13 + ASku .req v14 + ASma .req v15 + ASme .req v16 + ASmi .req v17 + ASmo .req v18 + ASmu .req v19 + ASsa .req v20 + ASse .req v21 + ASsi .req v22 + ASso .req v23 + ASsu .req v24 + + /* q-form of the above mapping */ + ASbaq .req q0 + ASbeq .req q1 + ASbiq .req q2 + ASboq .req q3 + ASbuq .req q4 + ASgaq .req q5 + ASgeq .req q6 + ASgiq .req q7 + ASgoq .req q8 + ASguq .req q9 + ASkaq .req q10 + ASkeq .req q11 + ASkiq .req q12 + ASkoq .req q13 + ASkuq .req q14 + ASmaq .req q15 + ASmeq .req q16 + ASmiq .req q17 + ASmoq .req q18 + ASmuq .req q19 + ASsaq .req q20 + ASseq .req q21 + ASsiq .req q22 + ASsoq .req q23 + ASsuq .req q24 + + Ascratch0 .req v25 + Ascratch1 .req v26 + Ascratch2 .req v27 + Ascratch3 .req v28 + Ascratch4 .req v29 + Ascratch5 .req v30 + Ascratch6 .req v31 + + Ascratch0q .req q25 + Ascratch1q .req q26 + Ascratch2q .req q27 + Ascratch3q .req q28 + Ascratch4q .req q29 + Ascratch5q .req q30 + Ascratch6q .req q31 + +/************************ MACROS ****************************/ + +.macro load_input + ldp ASbaq, ASbeq, [input_addr, #(2*8*0)] + ldp ASbiq, ASboq, [input_addr, #(2*8*2)] + ldp ASbuq, ASgaq, [input_addr, #(2*8*4)] + ldp ASgeq, ASgiq, [input_addr, #(2*8*6)] + ldp ASgoq, ASguq, [input_addr, #(2*8*8)] + ldp ASkaq, ASkeq, [input_addr, #(2*8*10)] + ldp ASkiq, ASkoq, [input_addr, #(2*8*12)] + ldp ASkuq, ASmaq, [input_addr, #(2*8*14)] + ldp ASmeq, ASmiq, [input_addr, #(2*8*16)] + ldp ASmoq, ASmuq, [input_addr, #(2*8*18)] + ldp ASsaq, ASseq, [input_addr, #(2*8*20)] + ldp ASsiq, ASsoq, [input_addr, #(2*8*22)] + ldr ASsuq, [input_addr, #(2*8*24)] +.endm + +.macro store_input in + str \in\()Sbaq, [input_addr, #(2*8*0)] + str \in\()Sbeq, [input_addr,
#(2*8*1)] + str \in\()Sbiq, [input_addr, #(2*8*2)] + str \in\()Sboq, [input_addr, #(2*8*3)] + str \in\()Sbuq, [input_addr, #(2*8*4)] + str \in\()Sgaq, [input_addr, #(2*8*5)] + str \in\()Sgeq, [input_addr, #(2*8*6)] + str \in\()Sgiq, [input_addr, #(2*8*7)] + str \in\()Sgoq, [input_addr, #(2*8*8)] + str \in\()Sguq, [input_addr, #(2*8*9)] + str \in\()Skaq, [input_addr, #(2*8*10)] + str \in\()Skeq, [input_addr, #(2*8*11)] + str \in\()Skiq, [input_addr, #(2*8*12)] + str \in\()Skoq, [input_addr, #(2*8*13)] + str \in\()Skuq, [input_addr, #(2*8*14)] + str \in\()Smaq, [input_addr, #(2*8*15)] + str \in\()Smeq, [input_addr, #(2*8*16)] + str \in\()Smiq, [input_addr, #(2*8*17)] + str \in\()Smoq, [input_addr, #(2*8*18)] + str \in\()Smuq, [input_addr, #(2*8*19)] + str \in\()Ssaq, [input_addr, #(2*8*20)] + str \in\()Sseq, [input_addr, #(2*8*21)] + str \in\()Ssiq, [input_addr, #(2*8*22)] + str \in\()Ssoq, [input_addr, #(2*8*23)] + str \in\()Ssuq, [input_addr, #(2*8*24)] +.endm + +#define STACK_SIZE (16*4 + 16*30) +#define STACK_BASE_VREGS 0 +#define STACK_BASE_TMP 16*4 + +#define E0_offset 0 +#define E1_offset 1 +#define E2_offset 2 +#define E3_offset 3 +#define E4_offset 4 + +#define Aba_offset (5 + 0 ) +#define Abe_offset (5 + 1 ) +#define Abi_offset (5 + 2 ) +#define Abo_offset (5 + 3 ) +#define Abu_offset (5 + 4 ) +#define Aga_offset (5 + 5 ) +#define Age_offset (5 + 6 ) +#define Agi_offset (5 + 7 ) +#define Ago_offset (5 + 8 ) +#define Agu_offset (5 + 9 ) +#define Aka_offset (5 + 10 ) +#define Ake_offset (5 + 11 ) +#define Aki_offset (5 + 12 ) +#define Ako_offset (5 + 13 ) +#define Aku_offset (5 + 14 ) +#define Ama_offset (5 + 15 ) +#define Ame_offset (5 + 16 ) +#define Ami_offset (5 + 17 ) +#define Amo_offset (5 + 18 ) +#define Amu_offset (5 + 19 ) +#define Asa_offset (5 + 20 ) +#define Ase_offset (5 + 21 ) +#define Asi_offset (5 + 22 ) +#define Aso_offset (5 + 23 ) +#define Asu_offset (5 + 24 ) + +#define ba_offset (5 + 0 ) +#define be_offset (5 + 1 ) +#define bi_offset (5 + 
2 ) +#define bo_offset (5 + 3 ) +#define bu_offset (5 + 4 ) +#define ga_offset (5 + 5 ) +#define ge_offset (5 + 6 ) +#define gi_offset (5 + 7 ) +#define go_offset (5 + 8 ) +#define gu_offset (5 + 9 ) +#define ka_offset (5 + 10 ) +#define ke_offset (5 + 11 ) +#define ki_offset (5 + 12 ) +#define ko_offset (5 + 13 ) +#define ku_offset (5 + 14 ) +#define ma_offset (5 + 15 ) +#define me_offset (5 + 16 ) +#define mi_offset (5 + 17 ) +#define mo_offset (5 + 18 ) +#define mu_offset (5 + 19 ) +#define sa_offset (5 + 20 ) +#define se_offset (5 + 21 ) +#define si_offset (5 + 22 ) +#define so_offset (5 + 23 ) +#define su_offset (5 + 24 ) +/* alloc_stack: lower sp by STACK_SIZE bytes. */ +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm +/* free_stack: raise sp by STACK_SIZE bytes, undoing alloc_stack. */ +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm +/* savep/restorep: str/ldr reg to/from the 16-byte slot numbered by the macro named offset_prefix followed by _offset, counted from sp offset STACK_BASE_TMP. save/restore take a bare name and use the register alias formed by appending q to it. */ +#define savep(reg, offset_prefix) \ + str reg, [sp, #(STACK_BASE_TMP + 16 * offset_prefix ## _offset)] +#define restorep(reg, offset_prefix) \ + ldr reg, [sp, #(STACK_BASE_TMP + 16 * offset_prefix ## _offset)] +#define save(name) savep(name ## q,name) +#define restore(name) restorep(name ## q,name) +/* save_vregs: store d8-d15 as four pairs starting at sp offset STACK_BASE_VREGS. NOTE(review): presumably these are the callee-saved low halves of v8-v15 under AAPCS64 - confirm. */ +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm +/* restore_vregs: reload d8-d15 from the slots written by save_vregs. */ +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +/* Keccak primitives in two flavours. The _m1 macros that follow use only eor/add/shl/sri/bic (rax1_m1, xar_m1 and bcax_m1 clobber the tmp alias); the _m0 macros defined after them each emit one v8.4-A SHA-3 instruction. */ +/* eor3_m1_0: first half of a three-way XOR, d = s0 ^ s1; s2 is accepted but unused. */ +.macro eor3_m1_0 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm +/* eor2: d = s0 ^ s1. */ +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm +/* eor3_m1_1: second half of a three-way XOR, d = d ^ s2; s0 and s1 are accepted but unused. */ +.macro eor3_m1_1 d s0 s1 s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm +/* eor3_m1: d = s0 ^ s1 ^ s2 built from the two halves above. d may alias s0 or s1, but not s2, because the first half overwrites d before s2 is read. */ +.macro eor3_m1 d s0 s1 s2 + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm +/* rax1_m1: d = s0 ^ rol64(s1, 1) in each 64-bit lane. The add doubles s1 (a left shift by one) and sri inserts the old top bit at bit 0; clobbers tmp. */ +.macro rax1_m1 d s0 s1 + add tmp.2d, \s1\().2d, \s1\().2d + sri tmp.2d, \s1\().2d, #63 + eor \d\().16b, tmp.16b, \s0\().16b
+.endm +/* xar_m1: d = ror64(s0 ^ s1, imm) in each 64-bit lane. shl places the low imm bits at the top of d and sri fills the rest with the XOR shifted right by imm; clobbers tmp. */ +.macro xar_m1 d s0 s1 imm + eor tmp.16b, \s0\().16b, \s1\().16b + shl \d\().2d, tmp.2d, #(64-\imm) + sri \d\().2d, tmp.2d, #(\imm) +.endm +/* bcax_m1: d = s0 ^ (s1 AND NOT s2) using bic then eor; clobbers tmp. */ +.macro bcax_m1 d s0 s1 s2 + bic tmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, tmp.16b, \s0\().16b +.endm +/* eor3_m0: three-way XOR with a single eor3 instruction; same operands as eor3_m1. */ +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm +/* rax1_m0: single rax1 instruction; same operands as rax1_m1. */ +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm +/* xar_m0: single xar instruction; same operands as xar_m1. */ +.macro xar_m0 d s0 s1 imm + xar \d\().2d, \s0\().2d, \s1\().2d, #\imm +.endm +/* bcax_m0: single bcax instruction; same operands as bcax_m1. */ +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm +/* Token-pasting helpers for five and four tokens. */ +#define CONCAT5(a,b,c,d,e) a ## b ## c ## d ## e +#define CONCAT4(a,b,c,d) a ## b ## c ## d +/* Alias-name builders, valid only inside macros that have parameters named out and in. Each pastes the macro argument, a tag (S, B, E, C or scratch) and x into one register-alias name; for example IN(ba) names ASba when in is A. */ +#define OUT(x) \out\()S##x +#define IN(x) \in\()S##x +#define B(x) \in\()B##x +#define E(x) \in\()E##x +#define C(x) \in\()C##x +#define Cnext(x) \out\()C##x +#define TMP_IN(x) \in\()scratch ## x +#define TMP_OUT(x) \out\()scratch ## x +/* Same builders with a trailing q, naming the q-register form of each alias. */ +#define OUTq(x) \out\()S##x##q +#define INq(x) \in\()S##x##q +#define Bq(x) \in\()B##x##q +#define Eq(x) \in\()E##x##q +#define Cq(x) \in\()C##x##q +#define Cnextq(x) \out\()C##x##q +#define TMP_INq(x) \in\()scratch ## x ## q +#define TMP_OUTq(x) \out\()scratch ## x ## q +/* declare_mappings out, in: create the .req aliases for one round, deriving every name from the two prefixes; the aliases for prefix in must already exist. */ +.macro declare_mappings out, in + /* C(0..4) alias scratch registers 0..4 of the in prefix. */ + C(0) .req TMP_IN(0) + C(1) .req TMP_IN(1) + C(2) .req TMP_IN(2) + C(3) .req TMP_IN(3) + C(4) .req TMP_IN(4) + + Cq(0) .req TMP_INq(0) + Cq(1) .req TMP_INq(1) + Cq(2) .req TMP_INq(2) + Cq(3) .req TMP_INq(3) + Cq(4) .req TMP_INq(4) + /* E(1) gets scratch register 5; the other four E aliases share registers with C aliases. */ + E(1) .req TMP_IN(5) + E(3) .req C(2) + E(0) .req C(4) + E(2) .req C(1) + E(4) .req C(3) + + Eq(1) .req TMP_INq(5) + Eq(3) .req Cq(2) + Eq(0) .req Cq(4) + Eq(2) .req Cq(1) + Eq(4) .req Cq(3) + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + B(go) .req IN(me) + B(gi) .req IN(ka) + B(ga) .req IN(bo) + B(ge) .req IN(gu) + B(gu) .req IN(si) + B(ki) .req IN(ko) + B(ko) .req IN(mu) + B(ka) .req IN(be) + B(ke) .req IN(gi) + B(ku) .req IN(sa) + B(mu) .req IN(so) + B(mo) .req IN(mi) + B(mi) .req IN(ke) + B(ma) .req IN(bu) + B(me) .req IN(ga) +
B(ba) .req IN(ba) + B(bi) .req IN(ki) + B(bo) .req IN(mo) + B(bu) .req IN(su) + B(be) .req IN(ge) + B(sa) .req IN(bi) + B(so) .req IN(ma) + B(se) .req IN(go) + B(si) .req IN(ku) + B(su) .req IN(se) + + Bq(go) .req INq(me) + Bq(gi) .req INq(ka) + Bq(ga) .req INq(bo) + Bq(ge) .req INq(gu) + Bq(gu) .req INq(si) + Bq(ki) .req INq(ko) + Bq(ko) .req INq(mu) + Bq(ka) .req INq(be) + Bq(ke) .req INq(gi) + Bq(ku) .req INq(sa) + Bq(mu) .req INq(so) + Bq(mo) .req INq(mi) + Bq(mi) .req INq(ke) + Bq(ma) .req INq(bu) + Bq(me) .req INq(ga) + Bq(ba) .req INq(ba) + Bq(bi) .req INq(ki) + Bq(bo) .req INq(mo) + Bq(bu) .req INq(su) + Bq(be) .req INq(ge) + Bq(sa) .req INq(bi) + Bq(so) .req INq(ma) + Bq(se) .req INq(go) + Bq(si) .req INq(ku) + Bq(su) .req INq(se) + + OUT(ba) .req TMP_IN(0) + OUT(be) .req TMP_IN(5) + OUT(bi) .req B(bi) + OUT(bo) .req B(bo) + OUT(bu) .req B(bu) + OUT(ga) .req B(ba) + OUT(ge) .req B(be) + OUT(gi) .req B(gi) + OUT(go) .req B(go) + OUT(gu) .req B(gu) + OUT(ka) .req B(ga) + OUT(ke) .req B(ge) + OUT(ki) .req B(ki) + OUT(ko) .req B(ko) + OUT(ku) .req B(ku) + OUT(ma) .req B(ka) + OUT(me) .req B(ke) + OUT(mi) .req B(mi) + OUT(mo) .req B(mo) + OUT(mu) .req B(mu) + OUT(sa) .req B(ma) + OUT(se) .req B(me) + OUT(si) .req B(si) + OUT(so) .req B(so) + OUT(su) .req B(su) + + OUTq(ba) .req TMP_INq(0) + OUTq(be) .req TMP_INq(5) + OUTq(bi) .req Bq(bi) + OUTq(bo) .req Bq(bo) + OUTq(bu) .req Bq(bu) + OUTq(ga) .req Bq(ba) + OUTq(ge) .req Bq(be) + OUTq(gi) .req Bq(gi) + OUTq(go) .req Bq(go) + OUTq(gu) .req Bq(gu) + OUTq(ka) .req Bq(ga) + OUTq(ke) .req Bq(ge) + OUTq(ki) .req Bq(ki) + OUTq(ko) .req Bq(ko) + OUTq(ku) .req Bq(ku) + OUTq(ma) .req Bq(ka) + OUTq(me) .req Bq(ke) + OUTq(mi) .req Bq(mi) + OUTq(mo) .req Bq(mo) + OUTq(mu) .req Bq(mu) + OUTq(sa) .req Bq(ma) + OUTq(se) .req Bq(me) + OUTq(si) .req Bq(si) + OUTq(so) .req Bq(so) + OUTq(su) .req Bq(su) + + TMP_OUT(0) .req TMP_IN(1) + TMP_OUT(1) .req TMP_IN(2) + TMP_OUT(2) .req TMP_IN(3) + TMP_OUT(3) .req TMP_IN(4) + TMP_OUT(4) 
.req B(sa) + TMP_OUT(5) .req B(se) + TMP_OUT(6) .req TMP_IN(6) + + TMP_OUTq(0) .req TMP_INq(1) + TMP_OUTq(1) .req TMP_INq(2) + TMP_OUTq(2) .req TMP_INq(3) + TMP_OUTq(3) .req TMP_INq(4) + TMP_OUTq(4) .req Bq(sa) + TMP_OUTq(5) .req Bq(se) + TMP_OUTq(6) .req TMP_INq(6) + + Cnext(0) .req TMP_OUT(0) + Cnext(1) .req TMP_OUT(1) + Cnext(2) .req TMP_OUT(2) + Cnext(3) .req TMP_OUT(3) + Cnext(4) .req TMP_OUT(4) + + Cnextq(0) .req TMP_OUTq(0) + Cnextq(1) .req TMP_OUTq(1) + Cnextq(2) .req TMP_OUTq(2) + Cnextq(3) .req TMP_OUTq(3) + Cnextq(4) .req TMP_OUTq(4) + + tmp .req v0 + .unreq tmp + tmp .req TMP_IN(6) +.endm + +.macro undeclare_mappings out, in + + .unreq C(0) + .unreq C(1) + .unreq C(2) + .unreq C(3) + .unreq C(4) + + .unreq Cq(0) + .unreq Cq(1) + .unreq Cq(2) + .unreq Cq(3) + .unreq Cq(4) + + .unreq E(2) + .unreq E(4) + .unreq E(1) + .unreq E(3) + .unreq E(0) + + .unreq Eq(2) + .unreq Eq(4) + .unreq Eq(1) + .unreq Eq(3) + .unreq Eq(0) + + .unreq B(go) + .unreq B(gi) + .unreq B(ga) + .unreq B(ge) + .unreq B(gu) + .unreq B(ki) + .unreq B(ko) + .unreq B(ka) + .unreq B(ke) + .unreq B(ku) + .unreq B(mu) + .unreq B(mo) + .unreq B(mi) + .unreq B(ma) + .unreq B(me) + .unreq B(ba) + .unreq B(bi) + .unreq B(bo) + .unreq B(bu) + .unreq B(be) + .unreq B(sa) + .unreq B(so) + .unreq B(se) + .unreq B(si) + .unreq B(su) + + .unreq Bq(go) + .unreq Bq(gi) + .unreq Bq(ga) + .unreq Bq(ge) + .unreq Bq(gu) + .unreq Bq(ki) + .unreq Bq(ko) + .unreq Bq(ka) + .unreq Bq(ke) + .unreq Bq(ku) + .unreq Bq(mu) + .unreq Bq(mo) + .unreq Bq(mi) + .unreq Bq(ma) + .unreq Bq(me) + .unreq Bq(ba) + .unreq Bq(bi) + .unreq Bq(bo) + .unreq Bq(bu) + .unreq Bq(be) + .unreq Bq(sa) + .unreq Bq(so) + .unreq Bq(se) + .unreq Bq(si) + .unreq Bq(su) + + .unreq OUT(ga) + .unreq OUT(ge) + .unreq OUT(gi) + .unreq OUT(go) + .unreq OUT(gu) + .unreq OUT(ka) + .unreq OUT(ke) + .unreq OUT(ki) + .unreq OUT(ko) + .unreq OUT(ku) + .unreq OUT(ma) + .unreq OUT(me) + .unreq OUT(mi) + .unreq OUT(mo) + .unreq OUT(mu) + .unreq OUT(ba) + 
.unreq OUT(be) + .unreq OUT(bi) + .unreq OUT(bo) + .unreq OUT(bu) + .unreq OUT(sa) + .unreq OUT(se) + .unreq OUT(si) + .unreq OUT(so) + .unreq OUT(su) + + .unreq OUTq(ga) + .unreq OUTq(ge) + .unreq OUTq(gi) + .unreq OUTq(go) + .unreq OUTq(gu) + .unreq OUTq(ka) + .unreq OUTq(ke) + .unreq OUTq(ki) + .unreq OUTq(ko) + .unreq OUTq(ku) + .unreq OUTq(ma) + .unreq OUTq(me) + .unreq OUTq(mi) + .unreq OUTq(mo) + .unreq OUTq(mu) + .unreq OUTq(ba) + .unreq OUTq(be) + .unreq OUTq(bi) + .unreq OUTq(bo) + .unreq OUTq(bu) + .unreq OUTq(sa) + .unreq OUTq(se) + .unreq OUTq(si) + .unreq OUTq(so) + .unreq OUTq(su) + + .unreq TMP_OUT(0) + .unreq TMP_OUT(1) + .unreq TMP_OUT(2) + .unreq TMP_OUT(3) + .unreq TMP_OUT(4) + .unreq TMP_OUT(5) + .unreq TMP_OUT(6) + + .unreq TMP_OUTq(0) + .unreq TMP_OUTq(1) + .unreq TMP_OUTq(2) + .unreq TMP_OUTq(3) + .unreq TMP_OUTq(4) + .unreq TMP_OUTq(5) + .unreq TMP_OUTq(6) + + .unreq tmp +.endm + +.macro keccak_f1600_round_pre out, in + + eor3_m0 C(0), IN(ba), IN(ga), IN(ka) + eor3_m1 C(3), IN(bo), IN(go), IN(ko) + eor3_m0 C(2), IN(bi), IN(gi), IN(ki) + eor3_m1 C(1), IN(be), IN(ge), IN(ke) + eor3_m0 C(0), C(0), IN(ma), IN(sa) + eor3_m1 C(3), C(3), IN(mo), IN(so) + eor3_m0 C(2), C(2), IN(mi), IN(si) + eor3_m1 C(1), C(1), IN(me), IN(se) + eor3_m0 C(4), IN(bu), IN(gu), IN(ku) + +.endm + +.macro keccak_f1600_round_core out, in + + rax1_m0 E(1), C(0), C(2) + xar_m1 B(mi), IN(ke), E(1), 54 + eor3_m0 C(4), C(4), IN(mu), IN(su) + xar_m1 B(go), IN(me), E(1), 19 + rax1_m0 E(3), C(2), C(4) + xar_m1 B(ka), IN(be), E(1), 63 + rax1_m0 E(0), C(4), C(1) + xar_m1 B(be), IN(ge), E(1), 20 + rax1_m0 E(2), C(1), C(3) + xar_m1 B(su), IN(se), E(1), 62 + rax1_m0 E(4), C(3), C(0) + + // TODO: * Interleave (fast) v8.4-A based 5-block with (slow) v8-A based 5-block, + // and then pull forward BCAX for the v8.4-A block + // * Handle XAR's for a fixed E(?) first, so that the remaining E(?)'s + // can be computed in parallel? 
+ + eor2 B(ba), IN(ba), E(0) + xar_m1 B(ga), IN(bo), E(3), 36 + xar_m0 B(bi), IN(ki), E(2), 21 + xar_m1 B(ge), IN(gu), E(4), 44 + xar_m0 B(bo), IN(mo), E(3), 43 + xar_m1 B(gi), IN(ka), E(0), 61 + xar_m0 B(bu), IN(su), E(4), 50 + xar_m1 B(gu), IN(si), E(2), 3 + + xar_m0 B(ke), IN(gi), E(2), 58 + xar_m0 B(ki), IN(ko), E(3), 39 + bcax_m1 OUT(ba), B(ba), B(bi), B(be) + bcax_m1 OUT(be), B(be), B(bo), B(bi) + xar_m0 B(ko), IN(mu), E(4), 56 + xar_m0 B(ku), IN(sa), E(0), 46 + bcax_m1 OUT(bi), B(bi), B(bu), B(bo) + bcax_m1 OUT(bo), B(bo), B(ba), B(bu) + + xar_m0 B(ma), IN(bu), E(4), 37 + xar_m0 B(me), IN(ga), E(0), 28 + bcax_m1 OUT(bu), B(bu), B(be), B(ba) + bcax_m1 OUT(ga), B(ga), B(gi), B(ge) + xar_m0 B(mo), IN(mi), E(2), 49 + xar_m0 B(mu), IN(so), E(3), 8 + bcax_m1 OUT(ge), B(ge), B(go), B(gi) + bcax_m1 OUT(gi), B(gi), B(gu), B(go) + + ld1r {tmp.2d}, [const_addr], #8 + eor OUT(ba).16b, OUT(ba).16b, tmp.16b + + xar_m0 B(sa), IN(bi), E(2), 2 + bcax_m1 OUT(go), B(go), B(ga), B(gu) + xar_m0 B(se), IN(go), E(3), 9 + bcax_m1 OUT(gu), B(gu), B(ge), B(ga) + bcax_m1 OUT(ka), B(ka), B(ki), B(ke) + xar_m0 B(si), IN(ku), E(4), 25 + bcax_m1 OUT(ke), B(ke), B(ko), B(ki) + bcax_m1 OUT(ki), B(ki), B(ku), B(ko) + xar_m0 B(so), IN(ma), E(0), 23 + bcax_m1 OUT(ko), B(ko), B(ka), B(ku) + bcax_m1 OUT(ku), B(ku), B(ke), B(ka) + + bcax_m0 OUT(ma), B(ma), B(mi), B(me) + bcax_m1 OUT(me), B(me), B(mo), B(mi) + bcax_m1 OUT(mi), B(mi), B(mu), B(mo) + bcax_m0 OUT(mo), B(mo), B(ma), B(mu) + bcax_m1 OUT(mu), B(mu), B(me), B(ma) + + bcax_m0 OUT(sa), B(sa), B(si), B(se) + bcax_m1 OUT(se), B(se), B(so), B(si) + bcax_m1 OUT(si), B(si), B(su), B(so) + bcax_m0 OUT(so), B(so), B(sa), B(su) + bcax_m1 OUT(su), B(su), B(se), B(sa) + + eor3_m0 Cnext(0), OUT(ba), OUT(ga), OUT(ka) + eor3_m1 Cnext(3), OUT(bo), OUT(go), OUT(ko) + eor3_m0 Cnext(2), OUT(bi), OUT(gi), OUT(ki) + eor3_m1 Cnext(1), OUT(be), OUT(ge), OUT(ke) + + eor3_m0 Cnext(0), Cnext(0), OUT(ma), OUT(sa) + eor3_m1 Cnext(3), Cnext(3), OUT(mo), OUT(so) + 
eor3_m0 Cnext(2), Cnext(2), OUT(mi), OUT(si) + eor3_m1 Cnext(1), Cnext(1), OUT(me), OUT(se) + eor3_m0 Cnext(4), OUT(bu), OUT(gu), OUT(ku) + +.endm + +.macro keccak_f1600_round_last out, in + + rax1_m0 E(1), C(0), C(2) + xar_m1 B(mi), IN(ke), E(1), 54 + eor3_m0 C(4), C(4), IN(mu), IN(su) + xar_m1 B(go), IN(me), E(1), 19 + rax1_m0 E(3), C(2), C(4) + xar_m1 B(ka), IN(be), E(1), 63 + rax1_m0 E(0), C(4), C(1) + xar_m1 B(be), IN(ge), E(1), 20 + rax1_m0 E(2), C(1), C(3) + xar_m1 B(su), IN(se), E(1), 62 + rax1_m0 E(4), C(3), C(0) + + // TODO: * Interleave (fast) v8.4-A based 5-block with (slow) v8-A based 5-block, + // and then pull forward BCAX for the v8.4-A block + // * Handle XAR's for a fixed E(?) first, so that the remaining E(?)'s + // can be computed in parallel? + + eor2 B(ba), IN(ba), E(0) + xar_m1 B(ga), IN(bo), E(3), 36 + xar_m0 B(bi), IN(ki), E(2), 21 + xar_m1 B(ge), IN(gu), E(4), 44 + xar_m0 B(bo), IN(mo), E(3), 43 + xar_m1 B(gi), IN(ka), E(0), 61 + xar_m0 B(bu), IN(su), E(4), 50 + xar_m1 B(gu), IN(si), E(2), 3 + + xar_m0 B(ke), IN(gi), E(2), 58 + xar_m0 B(ki), IN(ko), E(3), 39 + bcax_m1 OUT(ba), B(ba), B(bi), B(be) + bcax_m1 OUT(be), B(be), B(bo), B(bi) + xar_m0 B(ko), IN(mu), E(4), 56 + xar_m0 B(ku), IN(sa), E(0), 46 + bcax_m1 OUT(bi), B(bi), B(bu), B(bo) + bcax_m1 OUT(bo), B(bo), B(ba), B(bu) + + xar_m0 B(ma), IN(bu), E(4), 37 + xar_m0 B(me), IN(ga), E(0), 28 + bcax_m1 OUT(bu), B(bu), B(be), B(ba) + bcax_m1 OUT(ga), B(ga), B(gi), B(ge) + xar_m0 B(mo), IN(mi), E(2), 49 + xar_m0 B(mu), IN(so), E(3), 8 + bcax_m1 OUT(ge), B(ge), B(go), B(gi) + bcax_m1 OUT(gi), B(gi), B(gu), B(go) + + ld1r {tmp.2d}, [const_addr], #8 + eor OUT(ba).16b, OUT(ba).16b, tmp.16b + + xar_m0 B(sa), IN(bi), E(2), 2 + bcax_m1 OUT(go), B(go), B(ga), B(gu) + xar_m0 B(se), IN(go), E(3), 9 + bcax_m1 OUT(gu), B(gu), B(ge), B(ga) + bcax_m1 OUT(ka), B(ka), B(ki), B(ke) + xar_m0 B(si), IN(ku), E(4), 25 + bcax_m1 OUT(ke), B(ke), B(ko), B(ki) + bcax_m1 OUT(ki), B(ki), B(ku), B(ko) + xar_m0 B(so), 
IN(ma), E(0), 23 + bcax_m1 OUT(ko), B(ko), B(ka), B(ku) + bcax_m1 OUT(ku), B(ku), B(ke), B(ka) + + bcax_m0 OUT(ma), B(ma), B(mi), B(me) + bcax_m1 OUT(me), B(me), B(mo), B(mi) + bcax_m1 OUT(mi), B(mi), B(mu), B(mo) + bcax_m0 OUT(mo), B(mo), B(ma), B(mu) + bcax_m1 OUT(mu), B(mu), B(me), B(ma) + + bcax_m0 OUT(sa), B(sa), B(si), B(se) + bcax_m1 OUT(se), B(se), B(so), B(si) + bcax_m1 OUT(si), B(si), B(su), B(so) + bcax_m0 OUT(so), B(so), B(sa), B(su) + bcax_m1 OUT(su), B(su), B(se), B(sa) +.endm + +.macro transfer_state out, in + + savep(INq(ga),ga) + savep(INq(ge),ge) + savep(INq(gi),gi) + savep(INq(go),go) + savep(INq(gu),gu) + savep(INq(ka),ka) + savep(INq(ke),ke) + savep(INq(ki),ki) + savep(INq(ko),ko) + savep(INq(ku),ku) + savep(INq(ma),ma) + savep(INq(me),me) + savep(INq(mi),mi) + savep(INq(mo),mo) + savep(INq(mu),mu) + savep(INq(ba),ba) + savep(INq(be),be) + savep(INq(bi),bi) + savep(INq(bo),bo) + savep(INq(bu),bu) + savep(INq(sa),sa) + savep(INq(se),se) + savep(INq(si),si) + savep(INq(so),so) + savep(INq(su),su) + + restorep(OUTq(ga),ga) + restorep(OUTq(ge),ge) + restorep(OUTq(gi),gi) + restorep(OUTq(go),go) + restorep(OUTq(gu),gu) + restorep(OUTq(ka),ka) + restorep(OUTq(ke),ke) + restorep(OUTq(ki),ki) + restorep(OUTq(ko),ko) + restorep(OUTq(ku),ku) + restorep(OUTq(ma),ma) + restorep(OUTq(me),me) + restorep(OUTq(mi),mi) + restorep(OUTq(mo),mo) + restorep(OUTq(mu),mu) + restorep(OUTq(ba),ba) + restorep(OUTq(be),be) + restorep(OUTq(bi),bi) + restorep(OUTq(bo),bo) + restorep(OUTq(bu),bu) + restorep(OUTq(sa),sa) + restorep(OUTq(se),se) + restorep(OUTq(si),si) + restorep(OUTq(so),so) + restorep(OUTq(su),su) + +.endm + +.text +.align 4 +.global keccak_f1600_x2_hybrid_asm_v2p2 +.global _keccak_f1600_x2_hybrid_asm_v2p2 + +#define KECCAK_F1600_ROUNDS 24 + +keccak_f1600_x2_hybrid_asm_v2p2: +_keccak_f1600_x2_hybrid_asm_v2p2: + alloc_stack + save_vregs + load_constant_ptr + load_input + + /* NOTE: Unrolling the whole loop isn't really practical, but for now + * this is just 
for the sake of understanding the theoretical performance + * uplift of the present approach. */ + + declare_mappings A1, A + keccak_f1600_round_pre A1, A + keccak_f1600_round_core A1, A + declare_mappings A2, A1 + keccak_f1600_round_core A2, A1 + declare_mappings A3, A2 + keccak_f1600_round_core A3, A2 + declare_mappings A4, A3 + keccak_f1600_round_core A4, A3 + declare_mappings A5, A4 + keccak_f1600_round_core A5, A4 + declare_mappings A6, A5 + keccak_f1600_round_core A6, A5 + declare_mappings A7, A6 + keccak_f1600_round_core A7, A6 + declare_mappings A8, A7 + keccak_f1600_round_core A8, A7 + + declare_mappings A9, A8 + keccak_f1600_round_core A9, A8 + declare_mappings A10, A9 + keccak_f1600_round_core A10, A9 + declare_mappings A11, A10 + keccak_f1600_round_core A11, A10 + declare_mappings A12, A11 + keccak_f1600_round_core A12, A11 + declare_mappings A13, A12 + keccak_f1600_round_core A13, A12 + declare_mappings A14, A13 + keccak_f1600_round_core A14, A13 + declare_mappings A15, A14 + keccak_f1600_round_core A15, A14 + declare_mappings A16, A15 + keccak_f1600_round_core A16, A15 + + declare_mappings A17, A16 + keccak_f1600_round_core A17, A16 + declare_mappings A18, A17 + keccak_f1600_round_core A18, A17 + declare_mappings A19, A18 + keccak_f1600_round_core A19, A18 + declare_mappings A20, A19 + keccak_f1600_round_core A20, A19 + declare_mappings A21, A20 + keccak_f1600_round_core A21, A20 + declare_mappings A22, A21 + keccak_f1600_round_core A22, A21 + declare_mappings A23, A22 + keccak_f1600_round_core A23, A22 + declare_mappings A24, A23 + keccak_f1600_round_last A24, A23 + + store_input A24 + restore_vregs + free_stack + ret + +#endif \ No newline at end of file diff --git a/tests/keccak_neon/manual/keccak_f1600_x2_hybrid_asm_v2pp0.s b/tests/keccak_neon/manual/keccak_f1600_x2_hybrid_asm_v2pp0.s new file mode 100644 index 0000000..517338e --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x2_hybrid_asm_v2pp0.s @@ -0,0 +1,804 @@ +/* + * Copyright (c) 
2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +#if defined(__ARM_FEATURE_SHA3) +/********************** CONSTANTS *************************/ + .data + .align(8) +_round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x800000008000000a + .quad 
0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. */ + Aba .req v0 + Abe .req v1 + Abi .req v2 + Abo .req v3 + Abu .req v4 + Aga .req v5 + Age .req v6 + Agi .req v7 + Ago .req v8 + Agu .req v9 + Aka .req v10 + Ake .req v11 + Aki .req v12 + Ako .req v13 + Aku .req v14 + Ama .req v15 + Ame .req v16 + Ami .req v17 + Amo .req v18 + Amu .req v19 + Asa .req v20 + Ase .req v21 + Asi .req v22 + Aso .req v23 + Asu .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Abiq .req q2 + Aboq .req q3 + Abuq .req q4 + Agaq .req q5 + Ageq .req q6 + Agiq .req q7 + Agoq .req q8 + Aguq .req q9 + Akaq .req q10 + Akeq .req q11 + Akiq .req q12 + Akoq .req q13 + Akuq .req q14 + Amaq .req q15 + Ameq .req q16 + Amiq .req q17 + Amoq .req q18 + Amuq .req q19 + Asaq .req q20 + Aseq .req q21 + Asiq .req q22 + Asoq .req q23 + Asuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v27 + C1 .req v28 + C2 .req v29 + C3 .req v30 + C4 .req v31 + + C0q .req q27 + C1q .req q28 + C2q .req q29 + C3q .req q30 + C4q .req q31 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vBba .req v25 // fresh + vBbe .req v26 // fresh + vBbi .req Abi + vBbo .req Abo + vBbu .req Abu + vBga .req Aka + vBge .req Ake + vBgi .req Agi + vBgo .req Ago + vBgu .req Agu + vBka .req Ama + vBke .req Ame + vBki .req Aki + vBko .req Ako + vBku .req Aku + vBma .req Asa + vBme .req Ase + vBmi .req Ami + vBmo .req Amo + vBmu .req Amu + vBsa .req Aba + vBse .req Abe + vBsi .req Asi + vBso .req Aso + vBsu .req Asu + + vBbaq .req q25 // fresh + vBbeq .req q26 // fresh + vBbiq
.req Abiq + vBboq .req Aboq + vBbuq .req Abuq + vBgaq .req Akaq + vBgeq .req Akeq + vBgiq .req Agiq + vBgoq .req Agoq + vBguq .req Aguq + vBkaq .req Amaq + vBkeq .req Ameq + vBkiq .req Akiq + vBkoq .req Akoq + vBkuq .req Akuq + vBmaq .req Asaq + vBmeq .req Aseq + vBmiq .req Amiq + vBmoq .req Amoq + vBmuq .req Amuq + vBsaq .req Abaq + vBseq .req Abeq + vBsiq .req Asiq + vBsoq .req Asoq + vBsuq .req Asuq + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req C4 + E1 .req C0 + E2 .req vBbe // fresh + E3 .req C2 + E4 .req C3 + + E0q .req C4q + E1q .req C0q + E2q .req vBbeq // fresh + E3q .req C2q + E4q .req C3q + + +/************************ MACROS ****************************/ + +.macro load_input + ldp Abaq, Abeq, [input_addr, #(2*8*0)] + ldp Abiq, Aboq, [input_addr, #(2*8*2)] + ldp Abuq, Agaq, [input_addr, #(2*8*4)] + ldp Ageq, Agiq, [input_addr, #(2*8*6)] + ldp Agoq, Aguq, [input_addr, #(2*8*8)] + ldp Akaq, Akeq, [input_addr, #(2*8*10)] + ldp Akiq, Akoq, [input_addr, #(2*8*12)] + ldp Akuq, Amaq, [input_addr, #(2*8*14)] + ldp Ameq, Amiq, [input_addr, #(2*8*16)] + ldp Amoq, Amuq, [input_addr, #(2*8*18)] + ldp Asaq, Aseq, [input_addr, #(2*8*20)] + ldp Asiq, Asoq, [input_addr, #(2*8*22)] + ldr Asuq, [input_addr, #(2*8*24)] +.endm + +.macro store_input + str Abaq, [input_addr, #(2*8*0)] + str Abeq, [input_addr, #(2*8*1)] + str Abiq, [input_addr, #(2*8*2)] + str Aboq, [input_addr, #(2*8*3)] + str Abuq, [input_addr, #(2*8*4)] + str Agaq, [input_addr, #(2*8*5)] + str Ageq, [input_addr, #(2*8*6)] + str Agiq, [input_addr, #(2*8*7)] + str Agoq, [input_addr, #(2*8*8)] + str Aguq, [input_addr, #(2*8*9)] + str Akaq, [input_addr, #(2*8*10)] + str Akeq, [input_addr, #(2*8*11)] + str Akiq, [input_addr, #(2*8*12)] + str Akoq, [input_addr, #(2*8*13)] + str Akuq, [input_addr, #(2*8*14)] + str Amaq, [input_addr, #(2*8*15)] + str Ameq, [input_addr, #(2*8*16)] + str Amiq, [input_addr, #(2*8*17)] + str Amoq, [input_addr, #(2*8*18)] + str Amuq, [input_addr, #(2*8*19)] + str 
Asaq, [input_addr, #(2*8*20)] + str Aseq, [input_addr, #(2*8*21)] + str Asiq, [input_addr, #(2*8*22)] + str Asoq, [input_addr, #(2*8*23)] + str Asuq, [input_addr, #(2*8*24)] +.endm + +#define STACK_SIZE (16*4 + 16*34) +#define STACK_BASE_VREGS 0 +#define STACK_BASE_TMP 16*4 + +#define Aga_offset 0 +#define E0_offset 1 +#define E1_offset 2 +#define E2_offset 3 +#define E3_offset 4 +#define E4_offset 5 +#define Ame_offset 7 +#define Agi_offset 8 +#define Aka_offset 9 +#define Abo_offset 10 +#define Amo_offset 11 +#define Ami_offset 12 +#define Ake_offset 13 +#define Agu_offset 14 +#define Asi_offset 15 +#define Aku_offset 16 +#define Asa_offset 17 +#define Abu_offset 18 +#define Asu_offset 19 +#define Ase_offset 20 +//#define Aga_offset 21 +#define Age_offset 22 +#define vBgo_offset 23 +#define vBke_offset 24 +#define vBgi_offset 25 +#define vBga_offset 26 +#define vBbo_offset 27 +#define vBmo_offset 28 +#define vBmi_offset 29 +#define vBge_offset 30 + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm + xar \d\().2d, \s0\().2d, \s1\().2d, #\imm +.endm + +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, 
\s1\().16b, \s2\().16b +.endm + +.macro eor3_m1_0 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m1_1 d s0 s1 s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro eor3_m1 d s0 s1 s2 + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +.macro rax1_m1 d s0 s1 + // Use add instead of SHL #1 + add tmp.2d, \s1\().2d, \s1\().2d + sri tmp.2d, \s1\().2d, #63 + eor \d\().16b, tmp.16b, \s0\().16b +.endm + + .macro xar_m1 d s0 s1 imm + // Special cases where we can replace SHLs by ADDs + .if \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + // .elseif \imm == 62 + // eor \s0\().16b, \s0\().16b, \s1\().16b + // add \d\().2d, \s0\().2d, \s0\().2d + // add \d\().2d, \d\().2d, \d\().2d + // sri \d\().2d, \s0\().2d, #(62) + // .elseif \imm == 61 + // eor \s0\().16b, \s0\().16b, \s1\().16b + // add \d\().2d, \s0\().2d, \s0\().2d + // add \d\().2d, \d\().2d, \d\().2d + // add \d\().2d, \d\().2d, \d\().2d + // sri \d\().2d, \s0\().2d, #(61) + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + + .macro xar_m1_0 d s0 s1 imm + // Special cases where we can replace SHLs by ADDs + .if \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + .elseif \imm == 62 + eor \s0\().16b, \s0\().16b, \s1\().16b + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + .endif +.endm + + .macro xar_m1_1 d s0 s1 imm + // Special cases where we can replace SHLs by ADDs + .if \imm == 63 + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + .elseif \imm == 62 + add \d\().2d, \s0\().2d, \s0\().2d + add \d\().2d, \d\().2d, \d\().2d + sri \d\().2d, \s0\().2d, #(62) + .else + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + +.macro bcax_m1 d s0 s1 s2 + bic tmp.16b, \s1\().16b, \s2\().16b 
+ eor \d\().16b, tmp.16b, \s0\().16b +.endm + +/* Keccak-f1600 round */ + +.macro keccak_f1600_round_pre + + /* 10 EOR3, so 20 individual EOR */ + + eor3_m0 C1, Abe, Age, Ake + eor3_m1 C3, Abo, Ago, Ako + eor3_m0 C0, Aba, Aga, Aka + eor3_m1 C2, Abi, Agi, Aki + eor3_m0 C4, Abu, Agu, Aku + eor3_m1 C1, C1, Ame, Ase + eor3_m0 C3, C3, Amo, Aso + eor3_m1 C0, C0, Ama, Asa + eor3_m0 C2, C2, Ami, Asi + eor3_m1 C4, C4, Amu, Asu + +.endm + +.macro keccak_f1600_round + + /* 10 EOR3, so 20 individual EOR */ + + eor3_m1_0 C0, Aba, Aga, Aka + eor3_m1_0 C1, Abe, Age, Ake + eor3_m1_0 C2, Abi, Agi, Aki + eor3_m1_0 C3, Abo, Ago, Ako + eor3_m1_0 C4, Abu, Agu, Aku + eor3_m1_1 C0, Aba, Aga, Aka + eor3_m1_1 C1, Abe, Age, Ake + eor3_m1_1 C2, Abi, Agi, Aki + eor3_m1_1 C3, Abo, Ago, Ako + eor3_m1_1 C4, Abu, Agu, Aku + eor3_m1_0 C0, C0, Ama, Asa + eor3_m1_0 C1, C1, Ame, Ase + eor3_m1_0 C2, C2, Ami, Asi + eor3_m1_0 C3, C3, Amo, Aso + eor3_m1_0 C4, C4, Amu, Asu + eor3_m1_1 C0, C0, Ama, Asa + eor3_m1_1 C1, C1, Ame, Ase + eor3_m1_1 C2, C2, Ami, Asi + eor3_m1_1 C3, C3, Amo, Aso + eor3_m1_1 C4, C4, Amu, Asu + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + .unreq tmp + + /* 25x XAR, 75 in total */ + + tmp .req C1 + tmpq .req C1q + + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m1 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + xar_m1 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m1 vBme, Aga, E0, 28 + xar_m1 
vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Aga, vBga, vBgi, vBge + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + bcax_m1 Aku, vBku, vBke, vBka + bcax_m1 Ama, vBma, vBmi, vBme + bcax_m1 Ame, vBme, vBmo, vBmi + bcax_m1 Ami, vBmi, vBmu, vBmo + bcax_m1 Amo, vBmo, vBma, vBmu + bcax_m1 Amu, vBmu, vBme, vBma + bcax_m1 Asa, vBsa, vBsi, vBse + bcax_m1 Ase, vBse, vBso, vBsi + bcax_m1 Asi, vBsi, vBsu, vBso + bcax_m1 Aso, vBso, vBsa, vBsu + bcax_m1 Asu, vBsu, vBse, vBsa + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + bcax_m1 Abu, vBbu, vBbe, vBba + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + + .unreq tmp + .unreq tmpq + +.endm + +.macro keccak_f1600_round_core + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m0 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m0 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m0 E0, C4, C1 + + /* 25x XAR, 75 in total */ + + .unreq tmp + tmp .req C1 + tmpq .req C1q + + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m0 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m0 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m0 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m0 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m0 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m0 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m0 vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m0 vBge, Agu, E4, 44 + mov E3.16b, Aga.16b + bcax_m1 Aga, vBga, vBgi, vBge + xar_m0 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m0 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m0 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m0 vBme, E3, E0, 28 + xar_m1 
vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m0 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m0 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m0 Ake, vBke, vBko, vBki + + .unreq tmp + .unreq tmpq + + eor2 C0, Aka, Aga + save(Aga) + + tmp .req Aga + tmpq .req Agaq + bcax_m0 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + eor2 C1, Ake, Age + bcax_m0 Aku, vBku, vBke, vBka + eor2 C2, Aki, Agi + bcax_m1 Ama, vBma, vBmi, vBme + eor2 C3, Ako, Ago + bcax_m0 Ame, vBme, vBmo, vBmi + eor2 C4, Aku, Agu + bcax_m1 Ami, vBmi, vBmu, vBmo + eor2 C0, C0, Ama + bcax_m0 Amo, vBmo, vBma, vBmu + eor2 C1, C1, Ame + bcax_m1 Amu, vBmu, vBme, vBma + eor2 C2, C2, Ami + bcax_m0 Asa, vBsa, vBsi, vBse + eor2 C3, C3, Amo + bcax_m1 Ase, vBse, vBso, vBsi + eor2 C4, C4, Amu + bcax_m0 Asi, vBsi, vBsu, vBso + eor2 C0, C0, Asa + bcax_m1 Aso, vBso, vBsa, vBsu + eor2 C1, C1, Ase + bcax_m0 Asu, vBsu, vBse, vBsa + eor2 C2, C2, Asi + eor2 C3, C3, Aso + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m0 Abe, vBbe, vBbo, vBbi + eor2 C1, C1, Abe + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + eor2 C4, C4, Asu + bcax_m0 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + eor2 C3, C3, Abo + eor2 C2, C2, Abi + eor2 C0, C0, Aba + bcax_m0 Abu, vBbu, vBbe, vBba + eor2 C4, C4, Abu + + restore(Aga) + .unreq tmp + .unreq tmpq + +.endm + +.macro keccak_f1600_round_post + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m0 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m0 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m0 E0, C4, C1 + + /* 25x XAR, 75 in total */ + + .unreq tmp + tmp .req C1 + tmpq .req C1q + + eor vBba.16b, Aba.16b, E0.16b + xar_m0 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m0 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m0 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m0 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m0 vBgo, Ame, E1, 
19 + xar_m1 vBke, Agi, E2, 58 + xar_m0 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m0 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m0 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + mov E3.16b, Aga.16b + bcax_m1 Aga, vBga, vBgi, vBge + xar_m0 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m0 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m0 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m0 vBme, E3, E0, 28 + xar_m1 vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m0 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m0 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m0 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + bcax_m0 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + bcax_m0 Aku, vBku, vBke, vBka + bcax_m1 Ama, vBma, vBmi, vBme + bcax_m0 Ame, vBme, vBmo, vBmi + bcax_m1 Ami, vBmi, vBmu, vBmo + bcax_m0 Amo, vBmo, vBma, vBmu + bcax_m1 Amu, vBmu, vBme, vBma + bcax_m0 Asa, vBsa, vBsi, vBse + bcax_m1 Ase, vBse, vBso, vBsi + bcax_m0 Asi, vBsi, vBsu, vBso + bcax_m1 Aso, vBso, vBsa, vBsu + bcax_m0 Asu, vBsu, vBse, vBsa + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m0 Abe, vBbe, vBbo, vBbi + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m0 Abo, vBbo, vBba, vBbu + bcax_m1 Abu, vBbu, vBbe, vBba + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + + .unreq tmp + .unreq tmpq + +.endm + + +.text +.align 4 +.global keccak_f1600_x2_hybrid_asm_v2pp0 +.global _keccak_f1600_x2_hybrid_asm_v2pp0 + +#define KECCAK_F1600_ROUNDS 24 + +keccak_f1600_x2_hybrid_asm_v2pp0: +_keccak_f1600_x2_hybrid_asm_v2pp0: + alloc_stack + save_vregs + load_constant_ptr + load_input + + //mov count, #(KECCAK_F1600_ROUNDS-2) + mov count, #11 + keccak_f1600_round_pre +loop: + keccak_f1600_round_core + keccak_f1600_round_core + sub count, count, #1 + cbnz count, loop + + keccak_f1600_round_core + keccak_f1600_round_post + store_input + restore_vregs + free_stack + ret + +#endif \ No newline 
at end of file diff --git a/tests/keccak_neon/manual/keccak_f1600_x2_hybrid_asm_v2pp1.s b/tests/keccak_neon/manual/keccak_f1600_x2_hybrid_asm_v2pp1.s new file mode 100644 index 0000000..cac0bcd --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x2_hybrid_asm_v2pp1.s @@ -0,0 +1,805 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// +/* Keccak-f1600 permutation computed on 128-bit Neon registers, each holding two 64-bit values of one state word; the v2pp1 variant mixes SHA-3 extension instructions with base-Neon emulations of them. */ +#include "macros.s" +#if defined(__ARM_FEATURE_SHA3) + +/********************** CONSTANTS *************************/ + .data + .align(8) +_round_constants: /* the 24 round constants, one 64-bit entry each */ + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +round_constants: /* the same 24 constants with every entry stored twice; the round macros read 16 bytes per round through const_addr, presumably pointed here by load_constant_ptr from macros.s (confirm) */ + .quad 0x0000000000000001 + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x800000008000000a + .quad
0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. */ + Aba .req v0 + Abe .req v1 + Abi .req v2 + Abo .req v3 + Abu .req v4 + Aga .req v5 + Age .req v6 + Agi .req v7 + Ago .req v8 + Agu .req v9 + Aka .req v10 + Ake .req v11 + Aki .req v12 + Ako .req v13 + Aku .req v14 + Ama .req v15 + Ame .req v16 + Ami .req v17 + Amo .req v18 + Amu .req v19 + Asa .req v20 + Ase .req v21 + Asi .req v22 + Aso .req v23 + Asu .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Abiq .req q2 + Aboq .req q3 + Abuq .req q4 + Agaq .req q5 + Ageq .req q6 + Agiq .req q7 + Agoq .req q8 + Aguq .req q9 + Akaq .req q10 + Akeq .req q11 + Akiq .req q12 + Akoq .req q13 + Akuq .req q14 + Amaq .req q15 + Ameq .req q16 + Amiq .req q17 + Amoq .req q18 + Amuq .req q19 + Asaq .req q20 + Aseq .req q21 + Asiq .req q22 + Asoq .req q23 + Asuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v27 + C1 .req v28 + C2 .req v29 + C3 .req v30 + C4 .req v31 + + C0q .req q27 + C1q .req q28 + C2q .req q29 + C3q .req q30 + C4q .req q31 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vBba .req v25 // fresh + vBbe .req v26 // fresh + vBbi .req Abi + vBbo .req Abo + vBbu .req Abu + vBga .req Aka + vBge .req Ake + vBgi .req Agi + vBgo .req Ago + vBgu .req Agu + vBka .req Ama + vBke .req Ame + vBki .req Aki + vBko .req Ako + vBku .req Aku + vBma .req Asa + vBme .req Ase + vBmi .req Ami + vBmo .req Amo + vBmu .req Amu + vBsa .req Aba + vBse .req Abe + vBsi .req Asi + vBso .req Aso + vBsu .req Asu + + vBbaq .req q25 // fresh + vBbeq .req q26 // fresh + vBbiq
.req Abiq + vBboq .req Aboq + vBbuq .req Abuq + vBgaq .req Akaq + vBgeq .req Akeq + vBgiq .req Agiq + vBgoq .req Agoq + vBguq .req Aguq + vBkaq .req Amaq + vBkeq .req Ameq + vBkiq .req Akiq + vBkoq .req Akoq + vBkuq .req Akuq + vBmaq .req Asaq + vBmeq .req Aseq + vBmiq .req Amiq + vBmoq .req Amoq + vBmuq .req Amuq + vBsaq .req Abaq + vBseq .req Abeq + vBsiq .req Asiq + vBsoq .req Asoq + vBsuq .req Asuq + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4; E0, E1, E3 and E4 alias C registers, E2 aliases vBbe */ + E0 .req C4 + E1 .req C0 + E2 .req vBbe // fresh + E3 .req C2 + E4 .req C3 + + E0q .req C4q + E1q .req C0q + E2q .req vBbeq // fresh + E3q .req C2q + E4q .req C3q + + +/************************ MACROS ****************************/ +/* load_input: read the 25 state words, 16 bytes each, from input_addr into Aba..Asu */ +.macro load_input + ldp Abaq, Abeq, [input_addr, #(2*8*0)] + ldp Abiq, Aboq, [input_addr, #(2*8*2)] + ldp Abuq, Agaq, [input_addr, #(2*8*4)] + ldp Ageq, Agiq, [input_addr, #(2*8*6)] + ldp Agoq, Aguq, [input_addr, #(2*8*8)] + ldp Akaq, Akeq, [input_addr, #(2*8*10)] + ldp Akiq, Akoq, [input_addr, #(2*8*12)] + ldp Akuq, Amaq, [input_addr, #(2*8*14)] + ldp Ameq, Amiq, [input_addr, #(2*8*16)] + ldp Amoq, Amuq, [input_addr, #(2*8*18)] + ldp Asaq, Aseq, [input_addr, #(2*8*20)] + ldp Asiq, Asoq, [input_addr, #(2*8*22)] + ldr Asuq, [input_addr, #(2*8*24)] +.endm +/* store_input: write Aba..Asu back to input_addr in the layout load_input reads */ +.macro store_input + str Abaq, [input_addr, #(2*8*0)] + str Abeq, [input_addr, #(2*8*1)] + str Abiq, [input_addr, #(2*8*2)] + str Aboq, [input_addr, #(2*8*3)] + str Abuq, [input_addr, #(2*8*4)] + str Agaq, [input_addr, #(2*8*5)] + str Ageq, [input_addr, #(2*8*6)] + str Agiq, [input_addr, #(2*8*7)] + str Agoq, [input_addr, #(2*8*8)] + str Aguq, [input_addr, #(2*8*9)] + str Akaq, [input_addr, #(2*8*10)] + str Akeq, [input_addr, #(2*8*11)] + str Akiq, [input_addr, #(2*8*12)] + str Akoq, [input_addr, #(2*8*13)] + str Akuq, [input_addr, #(2*8*14)] + str Amaq, [input_addr, #(2*8*15)] + str Ameq, [input_addr, #(2*8*16)] + str Amiq, [input_addr, #(2*8*17)] + str Amoq, [input_addr, #(2*8*18)] + str Amuq, [input_addr, #(2*8*19)] + str
Asaq, [input_addr, #(2*8*20)] + str Aseq, [input_addr, #(2*8*21)] + str Asiq, [input_addr, #(2*8*22)] + str Asoq, [input_addr, #(2*8*23)] + str Asuq, [input_addr, #(2*8*24)] +.endm +/* Stack frame: the first 16*4 bytes (STACK_BASE_VREGS) keep d8-d15; from STACK_BASE_TMP on, every name with an _offset define owns one 16-byte slot. NOTE(review): STACK_BASE_TMP expands without parentheses, which is harmless for the additive uses in this file. */ +#define STACK_SIZE (16*4 + 16*34) +#define STACK_BASE_VREGS 0 +#define STACK_BASE_TMP 16*4 + +#define Aga_offset 0 +#define E0_offset 1 +#define E1_offset 2 +#define E2_offset 3 +#define E3_offset 4 +#define E4_offset 5 +#define Ame_offset 7 +#define Agi_offset 8 +#define Aka_offset 9 +#define Abo_offset 10 +#define Amo_offset 11 +#define Ami_offset 12 +#define Ake_offset 13 +#define Agu_offset 14 +#define Asi_offset 15 +#define Aku_offset 16 +#define Asa_offset 17 +#define Abu_offset 18 +#define Asu_offset 19 +#define Ase_offset 20 +//#define Aga_offset 21 +#define Age_offset 22 +#define vBgo_offset 23 +#define vBke_offset 24 +#define vBgi_offset 25 +#define vBga_offset 26 +#define vBbo_offset 27 +#define vBmo_offset 28 +#define vBmi_offset 29 +#define vBge_offset 30 + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm +/* save(name) and restore(name): store or reload the q-form of register alias name in its stack slot; this file only uses them for Aga */ +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] +/* save_vregs and restore_vregs: preserve d8-d15 for the caller, since v8-v15 hold state words here (AAPCS64 makes the low 64 bits of v8-v15 callee-saved) */ +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm +/* Naming: an _m0 macro emits the SHA-3 extension instruction itself (eor3, rax1, xar, bcax); an _m1 macro builds the same result from base Neon instructions */ +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm + xar \d\().2d, \s0\().2d, \s1\().2d, #\imm +.endm + +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b,
\s1\().16b, \s2\().16b +.endm + +.macro eor3_m1_0 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m1_1 d s0 s1 s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro eor3_m1 d s0 s1 s2 + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm +/* rax1_m1: d = s0 xor rol(s1, 1); it scratches the register alias tmp, which the caller must bind with .req beforehand */ +.macro rax1_m1 d s0 s1 + // Use add instead of SHL #1 + add tmp.2d, \s1\().2d, \s1\().2d + sri tmp.2d, \s1\().2d, #63 + eor \d\().16b, tmp.16b, \s0\().16b +.endm +/* xar_m1: d = ror(s0 xor s1, imm); as a side effect s0 is overwritten with s0 xor s1 */ + .macro xar_m1 d s0 s1 imm + // Special cases where we can replace SHLs by ADDs + .if \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + // .elseif \imm == 62 + // eor \s0\().16b, \s0\().16b, \s1\().16b + // add \d\().2d, \s0\().2d, \s0\().2d + // add \d\().2d, \d\().2d, \d\().2d + // sri \d\().2d, \s0\().2d, #(62) + // .elseif \imm == 61 + // eor \s0\().16b, \s0\().16b, \s1\().16b + // add \d\().2d, \s0\().2d, \s0\().2d + // add \d\().2d, \d\().2d, \d\().2d + // add \d\().2d, \d\().2d, \d\().2d + // sri \d\().2d, \s0\().2d, #(61) + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm +/* xar_m1_0 and xar_m1_1: the xor step and the rotate step of xar_m1 as separate macros; neither is invoked in this file */ + .macro xar_m1_0 d s0 s1 imm + // Special cases where we can replace SHLs by ADDs + .if \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + .elseif \imm == 62 + eor \s0\().16b, \s0\().16b, \s1\().16b + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + .endif +.endm + + .macro xar_m1_1 d s0 s1 imm + // Special cases where we can replace SHLs by ADDs + .if \imm == 63 + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + .elseif \imm == 62 + add \d\().2d, \s0\().2d, \s0\().2d + add \d\().2d, \d\().2d, \d\().2d + sri \d\().2d, \s0\().2d, #(62) + .else + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm +/* bcax_m1: d = s0 xor (s1 and not s2); it scratches the register alias tmp like rax1_m1 */ +.macro bcax_m1 d s0 s1 s2 + bic tmp.16b, \s1\().16b, \s2\().16b
+ eor \d\().16b, tmp.16b, \s0\().16b +.endm + +/* Keccak-f1600 round, split three ways for the entry point below: keccak_f1600_round_pre computes the column parities C0..C4 once, keccak_f1600_round_core runs a round and prepares C0..C4 for the next one, keccak_f1600_round_post runs the last round */ + +.macro keccak_f1600_round_pre + + /* 10 EOR3, so 20 individual EOR */ + + eor3_m0 C1, Abe, Age, Ake + eor3_m1 C3, Abo, Ago, Ako + eor3_m0 C0, Aba, Aga, Aka + eor3_m1 C2, Abi, Agi, Aki + eor3_m0 C4, Abu, Agu, Aku + eor3_m1 C1, C1, Ame, Ase + eor3_m0 C3, C3, Amo, Aso + eor3_m1 C0, C0, Ama, Asa + eor3_m0 C2, C2, Ami, Asi + eor3_m1 C4, C4, Amu, Asu + +.endm +/* keccak_f1600_round: self-contained round (parities included) that uses only the emulated forms; the entry point of this file does not invoke it */ +.macro keccak_f1600_round + + /* 10 EOR3, so 20 individual EOR */ + + eor3_m1_0 C0, Aba, Aga, Aka + eor3_m1_0 C1, Abe, Age, Ake + eor3_m1_0 C2, Abi, Agi, Aki + eor3_m1_0 C3, Abo, Ago, Ako + eor3_m1_0 C4, Abu, Agu, Aku + eor3_m1_1 C0, Aba, Aga, Aka + eor3_m1_1 C1, Abe, Age, Ake + eor3_m1_1 C2, Abi, Agi, Aki + eor3_m1_1 C3, Abo, Ago, Ako + eor3_m1_1 C4, Abu, Agu, Aku + eor3_m1_0 C0, C0, Ama, Asa + eor3_m1_0 C1, C1, Ame, Ase + eor3_m1_0 C2, C2, Ami, Asi + eor3_m1_0 C3, C3, Amo, Aso + eor3_m1_0 C4, C4, Amu, Asu + eor3_m1_1 C0, C0, Ama, Asa + eor3_m1_1 C1, C1, Ame, Ase + eor3_m1_1 C2, C2, Ami, Asi + eor3_m1_1 C3, C3, Amo, Aso + eor3_m1_1 C4, C4, Amu, Asu + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + .unreq tmp + + /* 25x XAR, 75 in total */ + + tmp .req C1 + tmpq .req C1q + + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m1 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + xar_m1 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m1 vBme, Aga, E0, 28 + xar_m1
vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Aga, vBga, vBgi, vBge + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + bcax_m1 Aku, vBku, vBke, vBka + bcax_m1 Ama, vBma, vBmi, vBme + bcax_m1 Ame, vBme, vBmo, vBmi + bcax_m1 Ami, vBmi, vBmu, vBmo + bcax_m1 Amo, vBmo, vBma, vBmu + bcax_m1 Amu, vBmu, vBme, vBma + bcax_m1 Asa, vBsa, vBsi, vBse + bcax_m1 Ase, vBse, vBso, vBsi + bcax_m1 Asi, vBsi, vBsu, vBso + bcax_m1 Aso, vBso, vBsa, vBsu + bcax_m1 Asu, vBsu, vBse, vBsa + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + bcax_m1 Abu, vBbu, vBbe, vBba + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + + .unreq tmp + .unreq tmpq + +.endm +/* keccak_f1600_round_core: expects C0..C4 on entry and leaves the next round's C0..C4 behind; Aga is parked on the stack with save(Aga) while its register serves as tmp for the round constant. NOTE(review): the instruction totals quoted in the section comments appear to assume the emulated forms throughout. */ +.macro keccak_f1600_round_core + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m0 E2, C1, C3 + rax1_m0 E4, C3, C0 + rax1_m0 E1, C0, C2 + rax1_m0 E3, C2, C4 + rax1_m0 E0, C4, C1 + + /* 25x XAR, 75 in total */ + + .unreq tmp + tmp .req C1 + tmpq .req C1q + + eor vBba.16b, Aba.16b, E0.16b + xar_m0 vBsa, Abi, E2, 2 + xar_m0 vBbi, Aki, E2, 21 + xar_m0 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m0 vBmu, Aso, E3, 8 + xar_m0 vBso, Ama, E0, 23 + xar_m0 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m0 vBgo, Ame, E1, 19 + xar_m0 vBke, Agi, E2, 58 + xar_m0 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m0 vBbo, Amo, E3, 43 + xar_m0 vBmo, Ami, E2, 49 + xar_m0 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + mov E3.16b, Aga.16b + bcax_m0 Aga, vBga, vBgi, vBge + xar_m0 vBgu, Asi, E2, 3 + xar_m0 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m0 vBma, Abu, E4, 37 + xar_m0 vBbu, Asu, E4, 50 + xar_m0 vBsu, Ase, E1, 62 + xar_m1 vBme, E3, E0, 28 + xar_m0
vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m0 Age, vBge, vBgo, vBgi + bcax_m0 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m0 Agu, vBgu, vBge, vBga + bcax_m0 Aka, vBka, vBki, vBke + bcax_m0 Ake, vBke, vBko, vBki + + .unreq tmp + .unreq tmpq + + eor2 C0, Aka, Aga + save(Aga) + + tmp .req Aga + tmpq .req Agaq + bcax_m0 Aki, vBki, vBku, vBko + bcax_m0 Ako, vBko, vBka, vBku + eor2 C1, Ake, Age + bcax_m0 Aku, vBku, vBke, vBka + eor2 C2, Aki, Agi + bcax_m0 Ama, vBma, vBmi, vBme + eor2 C3, Ako, Ago + bcax_m0 Ame, vBme, vBmo, vBmi + eor2 C4, Aku, Agu + bcax_m0 Ami, vBmi, vBmu, vBmo + eor2 C0, C0, Ama + bcax_m0 Amo, vBmo, vBma, vBmu + eor2 C1, C1, Ame + bcax_m0 Amu, vBmu, vBme, vBma + eor2 C2, C2, Ami + bcax_m0 Asa, vBsa, vBsi, vBse + eor2 C3, C3, Amo + bcax_m0 Ase, vBse, vBso, vBsi + eor2 C4, C4, Amu + bcax_m0 Asi, vBsi, vBsu, vBso + eor2 C0, C0, Asa + bcax_m0 Aso, vBso, vBsa, vBsu + eor2 C1, C1, Ase + bcax_m0 Asu, vBsu, vBse, vBsa + eor2 C2, C2, Asi + eor2 C3, C3, Aso + bcax_m0 Aba, vBba, vBbi, vBbe + bcax_m0 Abe, vBbe, vBbo, vBbi + eor2 C1, C1, Abe + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + eor2 C4, C4, Asu + bcax_m0 Abi, vBbi, vBbu, vBbo + bcax_m0 Abo, vBbo, vBba, vBbu + eor2 C3, C3, Abo + eor2 C2, C2, Abi + eor2 C0, C0, Aba + bcax_m0 Abu, vBbu, vBbe, vBba + eor2 C4, C4, Abu + + restore(Aga) + .unreq tmp + .unreq tmpq + +.endm +/* keccak_f1600_round_post: expects C0..C4 on entry like the core round, but does not compute parities for a following round */ +.macro keccak_f1600_round_post + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m0 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m0 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m0 E0, C4, C1 + + /* 25x XAR, 75 in total */ + + .unreq tmp + tmp .req C1 + tmpq .req C1q + + eor vBba.16b, Aba.16b, E0.16b + xar_m0 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m0 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m0 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m0 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m0 vBgo, Ame, E1,
19 + xar_m1 vBke, Agi, E2, 58 + xar_m0 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m0 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m0 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + mov E3.16b, Aga.16b + bcax_m1 Aga, vBga, vBgi, vBge + xar_m0 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m0 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m0 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m0 vBme, E3, E0, 28 + xar_m1 vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m0 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m0 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m0 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + bcax_m0 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + bcax_m0 Aku, vBku, vBke, vBka + bcax_m1 Ama, vBma, vBmi, vBme + bcax_m0 Ame, vBme, vBmo, vBmi + bcax_m1 Ami, vBmi, vBmu, vBmo + bcax_m0 Amo, vBmo, vBma, vBmu + bcax_m1 Amu, vBmu, vBme, vBma + bcax_m0 Asa, vBsa, vBsi, vBse + bcax_m1 Ase, vBse, vBso, vBsi + bcax_m0 Asi, vBsi, vBsu, vBso + bcax_m1 Aso, vBso, vBsa, vBsu + bcax_m0 Asu, vBsu, vBse, vBsa + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m0 Abe, vBbe, vBbo, vBbi + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m0 Abo, vBbo, vBba, vBbu + bcax_m1 Abu, vBbu, vBbe, vBba + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + + .unreq tmp + .unreq tmpq + +.endm +/* Entry point: x0 (input_addr) addresses 25 consecutive 16-byte state words that are permuted in place. Schedule: round_pre, 11 loop passes of two core rounds, one further core round, then round_post, which makes 24 rounds. */ + +.text +.align 4 +.global keccak_f1600_x2_hybrid_asm_v2pp1 +.global _keccak_f1600_x2_hybrid_asm_v2pp1 + +#define KECCAK_F1600_ROUNDS 24 + +keccak_f1600_x2_hybrid_asm_v2pp1: +_keccak_f1600_x2_hybrid_asm_v2pp1: + alloc_stack + save_vregs + load_constant_ptr + load_input + + //mov count, #(KECCAK_F1600_ROUNDS-2) + mov count, #11 + keccak_f1600_round_pre +loop: + keccak_f1600_round_core + keccak_f1600_round_core + sub count, count, #1 + cbnz count, loop + + keccak_f1600_round_core + keccak_f1600_round_post + store_input + restore_vregs + free_stack + ret + + +#endif /* SHA3 */
diff --git a/tests/keccak_neon/manual/keccak_f1600_x2_hybrid_asm_v2pp2.s b/tests/keccak_neon/manual/keccak_f1600_x2_hybrid_asm_v2pp2.s new file mode 100644 index 0000000..1c22182 --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x2_hybrid_asm_v2pp2.s @@ -0,0 +1,804 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +#if defined(__ARM_FEATURE_SHA3) +/********************** CONSTANTS *************************/ + .data + .align(8) +_round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x800000008000000a + .quad 
0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. */ + Aba .req v0 + Abe .req v1 + Abi .req v2 + Abo .req v3 + Abu .req v4 + Aga .req v5 + Age .req v6 + Agi .req v7 + Ago .req v8 + Agu .req v9 + Aka .req v10 + Ake .req v11 + Aki .req v12 + Ako .req v13 + Aku .req v14 + Ama .req v15 + Ame .req v16 + Ami .req v17 + Amo .req v18 + Amu .req v19 + Asa .req v20 + Ase .req v21 + Asi .req v22 + Aso .req v23 + Asu .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Abiq .req q2 + Aboq .req q3 + Abuq .req q4 + Agaq .req q5 + Ageq .req q6 + Agiq .req q7 + Agoq .req q8 + Aguq .req q9 + Akaq .req q10 + Akeq .req q11 + Akiq .req q12 + Akoq .req q13 + Akuq .req q14 + Amaq .req q15 + Ameq .req q16 + Amiq .req q17 + Amoq .req q18 + Amuq .req q19 + Asaq .req q20 + Aseq .req q21 + Asiq .req q22 + Asoq .req q23 + Asuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v27 + C1 .req v28 + C2 .req v29 + C3 .req v30 + C4 .req v31 + + C0q .req q27 + C1q .req q28 + C2q .req q29 + C3q .req q30 + C4q .req q31 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vBba .req v25 // fresh + vBbe .req v26 // fresh + vBbi .req Abi + vBbo .req Abo + vBbu .req Abu + vBga .req Aka + vBge .req Ake + vBgi .req Agi + vBgo .req Ago + vBgu .req Agu + vBka .req Ama + vBke .req Ame + vBki .req Aki + vBko .req Ako + vBku .req Aku + vBma .req Asa + vBme .req Ase + vBmi .req Ami + vBmo .req Amo + vBmu .req Amu + vBsa .req Aba + vBse .req Abe + vBsi .req Asi + vBso .req Aso + vBsu .req Asu + + vBbaq .req q25 // fresh + vBbeq .req q26 // fresh + vBbiq
.req Abiq + vBboq .req Aboq + vBbuq .req Abuq + vBgaq .req Akaq + vBgeq .req Akeq + vBgiq .req Agiq + vBgoq .req Agoq + vBguq .req Aguq + vBkaq .req Amaq + vBkeq .req Ameq + vBkiq .req Akiq + vBkoq .req Akoq + vBkuq .req Akuq + vBmaq .req Asaq + vBmeq .req Aseq + vBmiq .req Amiq + vBmoq .req Amoq + vBmuq .req Amuq + vBsaq .req Abaq + vBseq .req Abeq + vBsiq .req Asiq + vBsoq .req Asoq + vBsuq .req Asuq + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req C4 + E1 .req C0 + E2 .req vBbe // fresh + E3 .req C2 + E4 .req C3 + + E0q .req C4q + E1q .req C0q + E2q .req vBbeq // fresh + E3q .req C2q + E4q .req C3q + + +/************************ MACROS ****************************/ + +.macro load_input + ldp Abaq, Abeq, [input_addr, #(2*8*0)] + ldp Abiq, Aboq, [input_addr, #(2*8*2)] + ldp Abuq, Agaq, [input_addr, #(2*8*4)] + ldp Ageq, Agiq, [input_addr, #(2*8*6)] + ldp Agoq, Aguq, [input_addr, #(2*8*8)] + ldp Akaq, Akeq, [input_addr, #(2*8*10)] + ldp Akiq, Akoq, [input_addr, #(2*8*12)] + ldp Akuq, Amaq, [input_addr, #(2*8*14)] + ldp Ameq, Amiq, [input_addr, #(2*8*16)] + ldp Amoq, Amuq, [input_addr, #(2*8*18)] + ldp Asaq, Aseq, [input_addr, #(2*8*20)] + ldp Asiq, Asoq, [input_addr, #(2*8*22)] + ldr Asuq, [input_addr, #(2*8*24)] +.endm + +.macro store_input + str Abaq, [input_addr, #(2*8*0)] + str Abeq, [input_addr, #(2*8*1)] + str Abiq, [input_addr, #(2*8*2)] + str Aboq, [input_addr, #(2*8*3)] + str Abuq, [input_addr, #(2*8*4)] + str Agaq, [input_addr, #(2*8*5)] + str Ageq, [input_addr, #(2*8*6)] + str Agiq, [input_addr, #(2*8*7)] + str Agoq, [input_addr, #(2*8*8)] + str Aguq, [input_addr, #(2*8*9)] + str Akaq, [input_addr, #(2*8*10)] + str Akeq, [input_addr, #(2*8*11)] + str Akiq, [input_addr, #(2*8*12)] + str Akoq, [input_addr, #(2*8*13)] + str Akuq, [input_addr, #(2*8*14)] + str Amaq, [input_addr, #(2*8*15)] + str Ameq, [input_addr, #(2*8*16)] + str Amiq, [input_addr, #(2*8*17)] + str Amoq, [input_addr, #(2*8*18)] + str Amuq, [input_addr, #(2*8*19)] + str 
Asaq, [input_addr, #(2*8*20)] + str Aseq, [input_addr, #(2*8*21)] + str Asiq, [input_addr, #(2*8*22)] + str Asoq, [input_addr, #(2*8*23)] + str Asuq, [input_addr, #(2*8*24)] +.endm + +#define STACK_SIZE (16*4 + 16*34) +#define STACK_BASE_VREGS 0 +#define STACK_BASE_TMP 16*4 + +#define Aga_offset 0 +#define E0_offset 1 +#define E1_offset 2 +#define E2_offset 3 +#define E3_offset 4 +#define E4_offset 5 +#define Ame_offset 7 +#define Agi_offset 8 +#define Aka_offset 9 +#define Abo_offset 10 +#define Amo_offset 11 +#define Ami_offset 12 +#define Ake_offset 13 +#define Agu_offset 14 +#define Asi_offset 15 +#define Aku_offset 16 +#define Asa_offset 17 +#define Abu_offset 18 +#define Asu_offset 19 +#define Ase_offset 20 +//#define Aga_offset 21 +#define Age_offset 22 +#define vBgo_offset 23 +#define vBke_offset 24 +#define vBgi_offset 25 +#define vBga_offset 26 +#define vBbo_offset 27 +#define vBmo_offset 28 +#define vBmi_offset 29 +#define vBge_offset 30 + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm + xar \d\().2d, \s0\().2d, \s1\().2d, #\imm +.endm + +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, 
\s1\().16b, \s2\().16b +.endm + +.macro eor3_m1_0 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m1_1 d s0 s1 s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro eor3_m1 d s0 s1 s2 + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +.macro rax1_m1 d s0 s1 + // Use add instead of SHL #1 + add tmp.2d, \s1\().2d, \s1\().2d + sri tmp.2d, \s1\().2d, #63 + eor \d\().16b, tmp.16b, \s0\().16b +.endm + + .macro xar_m1 d s0 s1 imm + // Special cases where we can replace SHLs by ADDs + .if \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + // .elseif \imm == 62 + // eor \s0\().16b, \s0\().16b, \s1\().16b + // add \d\().2d, \s0\().2d, \s0\().2d + // add \d\().2d, \d\().2d, \d\().2d + // sri \d\().2d, \s0\().2d, #(62) + // .elseif \imm == 61 + // eor \s0\().16b, \s0\().16b, \s1\().16b + // add \d\().2d, \s0\().2d, \s0\().2d + // add \d\().2d, \d\().2d, \d\().2d + // add \d\().2d, \d\().2d, \d\().2d + // sri \d\().2d, \s0\().2d, #(61) + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + + .macro xar_m1_0 d s0 s1 imm + // Special cases where we can replace SHLs by ADDs + .if \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + .elseif \imm == 62 + eor \s0\().16b, \s0\().16b, \s1\().16b + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + .endif +.endm + + .macro xar_m1_1 d s0 s1 imm + // Special cases where we can replace SHLs by ADDs + .if \imm == 63 + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + .elseif \imm == 62 + add \d\().2d, \s0\().2d, \s0\().2d + add \d\().2d, \d\().2d, \d\().2d + sri \d\().2d, \s0\().2d, #(62) + .else + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + +.macro bcax_m1 d s0 s1 s2 + bic tmp.16b, \s1\().16b, \s2\().16b 
+ eor \d\().16b, tmp.16b, \s0\().16b +.endm + +/* Keccak-f1600 round */ + +.macro keccak_f1600_round_pre + + /* 10 EOR3, so 20 individual EOR */ + + eor3_m0 C1, Abe, Age, Ake + eor3_m1 C3, Abo, Ago, Ako + eor3_m0 C0, Aba, Aga, Aka + eor3_m1 C2, Abi, Agi, Aki + eor3_m0 C4, Abu, Agu, Aku + eor3_m1 C1, C1, Ame, Ase + eor3_m0 C3, C3, Amo, Aso + eor3_m1 C0, C0, Ama, Asa + eor3_m0 C2, C2, Ami, Asi + eor3_m1 C4, C4, Amu, Asu + +.endm + +.macro keccak_f1600_round + + /* 10 EOR3, so 20 individual EOR */ + + eor3_m1_0 C0, Aba, Aga, Aka + eor3_m1_0 C1, Abe, Age, Ake + eor3_m1_0 C2, Abi, Agi, Aki + eor3_m1_0 C3, Abo, Ago, Ako + eor3_m1_0 C4, Abu, Agu, Aku + eor3_m1_1 C0, Aba, Aga, Aka + eor3_m1_1 C1, Abe, Age, Ake + eor3_m1_1 C2, Abi, Agi, Aki + eor3_m1_1 C3, Abo, Ago, Ako + eor3_m1_1 C4, Abu, Agu, Aku + eor3_m1_0 C0, C0, Ama, Asa + eor3_m1_0 C1, C1, Ame, Ase + eor3_m1_0 C2, C2, Ami, Asi + eor3_m1_0 C3, C3, Amo, Aso + eor3_m1_0 C4, C4, Amu, Asu + eor3_m1_1 C0, C0, Ama, Asa + eor3_m1_1 C1, C1, Ame, Ase + eor3_m1_1 C2, C2, Ami, Asi + eor3_m1_1 C3, C3, Amo, Aso + eor3_m1_1 C4, C4, Amu, Asu + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + .unreq tmp + + /* 25x XAR, 75 in total */ + + tmp .req C1 + tmpq .req C1q + + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m1 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + xar_m1 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m1 vBme, Aga, E0, 28 + xar_m1 
vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Aga, vBga, vBgi, vBge + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + bcax_m1 Aku, vBku, vBke, vBka + bcax_m1 Ama, vBma, vBmi, vBme + bcax_m1 Ame, vBme, vBmo, vBmi + bcax_m1 Ami, vBmi, vBmu, vBmo + bcax_m1 Amo, vBmo, vBma, vBmu + bcax_m1 Amu, vBmu, vBme, vBma + bcax_m1 Asa, vBsa, vBsi, vBse + bcax_m1 Ase, vBse, vBso, vBsi + bcax_m1 Asi, vBsi, vBsu, vBso + bcax_m1 Aso, vBso, vBsa, vBsu + bcax_m1 Asu, vBsu, vBse, vBsa + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + bcax_m1 Abu, vBbu, vBbe, vBba + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + + .unreq tmp + .unreq tmpq + +.endm + +.macro keccak_f1600_round_core + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m0 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m0 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m0 E0, C4, C1 + + /* 25x XAR, 75 in total */ + + .unreq tmp + tmp .req C1 + tmpq .req C1q + + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m0 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m0 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m0 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m0 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m0 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m0 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m0 vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m0 vBge, Agu, E4, 44 + mov E3.16b, Aga.16b + bcax_m1 Aga, vBga, vBgi, vBge + xar_m0 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m0 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m0 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m0 vBme, E3, E0, 28 + xar_m1 
vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m0 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m0 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m0 Ake, vBke, vBko, vBki + + .unreq tmp + .unreq tmpq + + eor2 C0, Aka, Aga + save(Aga) + + tmp .req Aga + tmpq .req Agaq + bcax_m0 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + eor2 C1, Ake, Age + bcax_m0 Aku, vBku, vBke, vBka + eor2 C2, Aki, Agi + bcax_m1 Ama, vBma, vBmi, vBme + eor2 C3, Ako, Ago + bcax_m0 Ame, vBme, vBmo, vBmi + eor2 C4, Aku, Agu + bcax_m1 Ami, vBmi, vBmu, vBmo + eor2 C0, C0, Ama + bcax_m0 Amo, vBmo, vBma, vBmu + eor2 C1, C1, Ame + bcax_m1 Amu, vBmu, vBme, vBma + eor2 C2, C2, Ami + bcax_m0 Asa, vBsa, vBsi, vBse + eor2 C3, C3, Amo + bcax_m1 Ase, vBse, vBso, vBsi + eor2 C4, C4, Amu + bcax_m0 Asi, vBsi, vBsu, vBso + eor2 C0, C0, Asa + bcax_m1 Aso, vBso, vBsa, vBsu + eor2 C1, C1, Ase + bcax_m0 Asu, vBsu, vBse, vBsa + eor2 C2, C2, Asi + eor2 C3, C3, Aso + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m0 Abe, vBbe, vBbo, vBbi + eor2 C1, C1, Abe + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + eor2 C4, C4, Asu + bcax_m0 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + eor2 C3, C3, Abo + eor2 C2, C2, Abi + eor2 C0, C0, Aba + bcax_m0 Abu, vBbu, vBbe, vBba + eor2 C4, C4, Abu + + restore(Aga) + .unreq tmp + .unreq tmpq + +.endm + +.macro keccak_f1600_round_post + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m0 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m0 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m0 E0, C4, C1 + + /* 25x XAR, 75 in total */ + + .unreq tmp + tmp .req C1 + tmpq .req C1q + + eor vBba.16b, Aba.16b, E0.16b + xar_m0 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m0 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m0 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m0 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m0 vBgo, Ame, E1, 
19 + xar_m1 vBke, Agi, E2, 58 + xar_m0 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m0 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m0 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + mov E3.16b, Aga.16b + bcax_m1 Aga, vBga, vBgi, vBge + xar_m0 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m0 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m0 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m0 vBme, E3, E0, 28 + xar_m1 vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m0 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m0 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m0 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + bcax_m0 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + bcax_m0 Aku, vBku, vBke, vBka + bcax_m1 Ama, vBma, vBmi, vBme + bcax_m0 Ame, vBme, vBmo, vBmi + bcax_m1 Ami, vBmi, vBmu, vBmo + bcax_m0 Amo, vBmo, vBma, vBmu + bcax_m1 Amu, vBmu, vBme, vBma + bcax_m0 Asa, vBsa, vBsi, vBse + bcax_m1 Ase, vBse, vBso, vBsi + bcax_m0 Asi, vBsi, vBsu, vBso + bcax_m1 Aso, vBso, vBsa, vBsu + bcax_m0 Asu, vBsu, vBse, vBsa + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m0 Abe, vBbe, vBbo, vBbi + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m0 Abo, vBbo, vBba, vBbu + bcax_m1 Abu, vBbu, vBbe, vBba + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + + .unreq tmp + .unreq tmpq + +.endm + + +.text +.align 4 +.global keccak_f1600_x2_hybrid_asm_v2pp2 +.global _keccak_f1600_x2_hybrid_asm_v2pp2 + +#define KECCAK_F1600_ROUNDS 24 + +keccak_f1600_x2_hybrid_asm_v2pp2: +_keccak_f1600_x2_hybrid_asm_v2pp2: + alloc_stack + save_vregs + load_constant_ptr + load_input + + //mov count, #(KECCAK_F1600_ROUNDS-2) + mov count, #11 + keccak_f1600_round_pre +loop: + keccak_f1600_round_core + keccak_f1600_round_core + sub count, count, #1 + cbnz count, loop + + keccak_f1600_round_core + keccak_f1600_round_post + store_input + restore_vregs + free_stack + ret + +#endif diff --git 
a/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v1.s b/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v1.s new file mode 100644 index 0000000..3f2635e --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v1.s @@ -0,0 +1,338 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +#if defined(__ARM_FEATURE_SHA3) + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round.
*/ + Aba .req v0 + Abe .req v1 + Abi .req v2 + Abo .req v3 + Abu .req v4 + Aga .req v5 + Age .req v6 + Agi .req v7 + Ago .req v8 + Agu .req v9 + Aka .req v10 + Ake .req v11 + Aki .req v12 + Ako .req v13 + Aku .req v14 + Ama .req v15 + Ame .req v16 + Ami .req v17 + Amo .req v18 + Amu .req v19 + Asa .req v20 + Ase .req v21 + Asi .req v22 + Aso .req v23 + Asu .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Abiq .req q2 + Aboq .req q3 + Abuq .req q4 + Agaq .req q5 + Ageq .req q6 + Agiq .req q7 + Agoq .req q8 + Aguq .req q9 + Akaq .req q10 + Akeq .req q11 + Akiq .req q12 + Akoq .req q13 + Akuq .req q14 + Amaq .req q15 + Ameq .req q16 + Amiq .req q17 + Amoq .req q18 + Amuq .req q19 + Asaq .req q20 + Aseq .req q21 + Asiq .req q22 + Asoq .req q23 + Asuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v30 + C1 .req v29 + C2 .req v28 + C3 .req v27 + C4 .req v26 + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req v26 + E1 .req v25 + E2 .req v29 + E3 .req v28 + E4 .req v27 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + Abi_ .req v2 + Abo_ .req v3 + Abu_ .req v4 + Aga_ .req v10 + Age_ .req v11 + Agi_ .req v7 + Ago_ .req v8 + Agu_ .req v9 + Aka_ .req v15 + Ake_ .req v16 + Aki_ .req v12 + Ako_ .req v13 + Aku_ .req v14 + Ama_ .req v20 + Ame_ .req v21 + Ami_ .req v17 + Amo_ .req v18 + Amu_ .req v19 + Asa_ .req v0 + Ase_ .req v1 + Asi_ .req v22 + Aso_ .req v23 + Asu_ .req v24 + Aba_ .req v30 + Abe_ .req v27 + +/************************ MACROS ****************************/ + +.macro load_input + ld1 {Aba.2d, Abe.2d, Abi.2d, Abo.2d}, [input_addr], #64 + ld1 {Abu.2d, Aga.2d, Age.2d, Agi.2d}, [input_addr], #64 + ld1 {Ago.2d, Agu.2d, Aka.2d, Ake.2d}, [input_addr], #64 + ld1 {Aki.2d, Ako.2d, Aku.2d, Ama.2d}, [input_addr], #64 + ld1 {Ame.2d, Ami.2d, Amo.2d, Amu.2d}, [input_addr], #64 + ld1 {Asa.2d, Ase.2d, Asi.2d, Aso.2d}, [input_addr], #64 + ld1 {Asu.2d}, [input_addr] + sub input_addr, input_addr, 
#(6*64) +.endm + +.macro store_input + st1 {Aba.2d, Abe.2d, Abi.2d, Abo.2d}, [input_addr], #64 + st1 {Abu.2d, Aga.2d, Age.2d, Agi.2d}, [input_addr], #64 + st1 {Ago.2d, Agu.2d, Aka.2d, Ake.2d}, [input_addr], #64 + st1 {Aki.2d, Ako.2d, Aku.2d, Ama.2d}, [input_addr], #64 + st1 {Ame.2d, Ami.2d, Amo.2d, Amu.2d}, [input_addr], #64 + st1 {Asa.2d, Ase.2d, Asi.2d, Aso.2d}, [input_addr], #64 + st1 {Asu.2d}, [input_addr] +.endm + +#define STACK_SIZE (16*4 + 16*6) // VREGS (16*4) + GPRS (TODO: Remove) + +#define STACK_BASE_GPRS (16*4) +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) + .endm + +.macro save_vregs + stp d8, d9, [sp, #(16*0)] + stp d10, d11, [sp, #(16*1)] + stp d12, d13, [sp, #(16*2)] + stp d14, d15, [sp, #(16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(16*0)] + ldp d10, d11, [sp, #(16*1)] + ldp d12, d13, [sp, #(16*2)] + ldp d14, d15, [sp, #(16*3)] +.endm + +/* Macros using v8.4-A SHA-3 instructions */ + +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm + xar \d\().2d, \s0\().2d, \s1\().2d, #\imm +.endm + +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +/* Keccak-f1600 round */ + +.macro keccak_f1600_round + + eor3_m0 C0, Aba, Aga, Aka + eor3_m0 C1, Abe, Age, Ake + eor3_m0 C2, Abi, Agi, Aki + eor3_m0 C3, Abo, Ago, Ako + eor3_m0 C4, Abu, Agu, Aku + eor3_m0 C0, C0, Ama, Asa + eor3_m0 C1, C1, Ame, Ase + eor3_m0 C2, C2, Ami, Asi + eor3_m0 C3, C3, Amo, Aso + eor3_m0 C4, C4, Amu, Asu + + rax1_m0 E1, C0, C2 + rax1_m0 E3, C2, C4 + rax1_m0 E0, C4, C1 + rax1_m0 E2, C1, C3 + rax1_m0 E4, C3, C0 + + eor Aba_.16b, Aba.16b, E0.16b + xar_m0 Asa_, Abi, E2, 2 + xar_m0 Abi_, Aki, E2, 21 + xar_m0 Aki_, Ako, E3, 39 + xar_m0 Ako_, Amu, E4, 56 + xar_m0 Amu_, Aso, E3, 8 + xar_m0 Aso_, Ama, E0, 23 + xar_m0 Aka_, Abe, E1, 63 + xar_m0 Ase_, Ago, E3, 
9 + xar_m0 Ago_, Ame, E1, 19 + xar_m0 Ake_, Agi, E2, 58 + xar_m0 Agi_, Aka, E0, 61 + xar_m0 Aga_, Abo, E3, 36 + xar_m0 Abo_, Amo, E3, 43 + xar_m0 Amo_, Ami, E2, 49 + xar_m0 Ami_, Ake, E1, 54 + xar_m0 Age_, Agu, E4, 44 + xar_m0 Agu_, Asi, E2, 3 + xar_m0 Asi_, Aku, E4, 25 + xar_m0 Aku_, Asa, E0, 46 + xar_m0 Ama_, Abu, E4, 37 + xar_m0 Abu_, Asu, E4, 50 + xar_m0 Asu_, Ase, E1, 62 + xar_m0 Ame_, Aga, E0, 28 + xar_m0 Abe_, Age, E1, 20 + + ld1r {v31.2d}, [const_addr], #8 + + bcax_m0 Aga, Aga_, Agi_, Age_ + bcax_m0 Age, Age_, Ago_, Agi_ + bcax_m0 Agi, Agi_, Agu_, Ago_ + bcax_m0 Ago, Ago_, Aga_, Agu_ + bcax_m0 Agu, Agu_, Age_, Aga_ + bcax_m0 Aka, Aka_, Aki_, Ake_ + bcax_m0 Ake, Ake_, Ako_, Aki_ + bcax_m0 Aki, Aki_, Aku_, Ako_ + bcax_m0 Ako, Ako_, Aka_, Aku_ + bcax_m0 Aku, Aku_, Ake_, Aka_ + bcax_m0 Ama, Ama_, Ami_, Ame_ + bcax_m0 Ame, Ame_, Amo_, Ami_ + bcax_m0 Ami, Ami_, Amu_, Amo_ + bcax_m0 Amo, Amo_, Ama_, Amu_ + bcax_m0 Amu, Amu_, Ame_, Ama_ + bcax_m0 Asa, Asa_, Asi_, Ase_ + bcax_m0 Ase, Ase_, Aso_, Asi_ + bcax_m0 Asi, Asi_, Asu_, Aso_ + bcax_m0 Aso, Aso_, Asa_, Asu_ + bcax_m0 Asu, Asu_, Ase_, Asa_ + bcax_m0 Aba, Aba_, Abi_, Abe_ + bcax_m0 Abe, Abe_, Abo_, Abi_ + bcax_m0 Abi, Abi_, Abu_, Abo_ + bcax_m0 Abo, Abo_, Aba_, Abu_ + bcax_m0 Abu, Abu_, Abe_, Aba_ + + // iota step + eor Aba.16b, Aba.16b, v31.16b + +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.text +.align 4 +.global keccak_f1600_x2_v84a_asm_v1 +.global _keccak_f1600_x2_v84a_asm_v1 + +keccak_f1600_x2_v84a_asm_v1: +_keccak_f1600_x2_v84a_asm_v1: + alloc_stack + save_vregs + load_constant_ptr + load_input + + mov count, #(KECCAK_F1600_ROUNDS) +loop: + keccak_f1600_round + sub count, count, #1 + cbnz count, loop + + store_input + restore_vregs + free_stack + ret + +#endif diff --git a/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v1p0.s b/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v1p0.s new file mode 100644 index 0000000..0287c70 --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v1p0.s 
@@ -0,0 +1,465 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +#if defined(__ARM_FEATURE_SHA3) + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. 
*/ + Aba .req v0 + Abe .req v1 + Abi .req v2 + Abo .req v3 + Abu .req v4 + Aga .req v5 + Age .req v6 + Agi .req v7 + Ago .req v8 + Agu .req v9 + Aka .req v10 + Ake .req v11 + Aki .req v12 + Ako .req v13 + Aku .req v14 + Ama .req v15 + Ame .req v16 + Ami .req v17 + Amo .req v18 + Amu .req v19 + Asa .req v20 + Ase .req v21 + Asi .req v22 + Aso .req v23 + Asu .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Abiq .req q2 + Aboq .req q3 + Abuq .req q4 + Agaq .req q5 + Ageq .req q6 + Agiq .req q7 + Agoq .req q8 + Aguq .req q9 + Akaq .req q10 + Akeq .req q11 + Akiq .req q12 + Akoq .req q13 + Akuq .req q14 + Amaq .req q15 + Ameq .req q16 + Amiq .req q17 + Amoq .req q18 + Amuq .req q19 + Asaq .req q20 + Aseq .req q21 + Asiq .req q22 + Asoq .req q23 + Asuq .req q24 + + Abaz .req z0 + Abez .req z1 + Abiz .req z2 + Aboz .req z3 + Abuz .req z4 + Agaz .req z5 + Agez .req z6 + Agiz .req z7 + Agoz .req z8 + Aguz .req z9 + Akaz .req z10 + Akez .req z11 + Akiz .req z12 + Akoz .req z13 + Akuz .req z14 + Amaz .req z15 + Amez .req z16 + Amiz .req z17 + Amoz .req z18 + Amuz .req z19 + Asaz .req z20 + Asez .req z21 + Asiz .req z22 + Asoz .req z23 + Asuz .req z24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v25 + C1 .req v26 + C2 .req v27 + C3 .req v28 + C4 .req v29 + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req C4 + E1 .req C0 + E2 .req C1 + E3 .req C2 + E4 .req C3 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + Abi_ .req v2 + Abo_ .req v3 + Abu_ .req v4 + Aga_ .req v10 + Age_ .req v11 + Agi_ .req v7 + Ago_ .req v8 + Agu_ .req v9 + Aka_ .req v15 + Ake_ .req v16 + Aki_ .req v12 + Ako_ .req v13 + Aku_ .req v14 + Ama_ .req v20 + Ame_ .req v21 + Ami_ .req v17 + Amo_ .req v18 + Amu_ .req v19 + Asa_ .req v0 + Ase_ .req v1 + Asi_ .req v22 + Aso_ .req v23 + Asu_ .req v24 + Aba_ .req v30 + Abe_ .req E0 + +/************************ MACROS ****************************/ + +.macro load_input + ld1 {Aba.2d, 
Abe.2d, Abi.2d, Abo.2d}, [input_addr], #64 + ld1 {Abu.2d, Aga.2d, Age.2d, Agi.2d}, [input_addr], #64 + ld1 {Ago.2d, Agu.2d, Aka.2d, Ake.2d}, [input_addr], #64 + ld1 {Aki.2d, Ako.2d, Aku.2d, Ama.2d}, [input_addr], #64 + ld1 {Ame.2d, Ami.2d, Amo.2d, Amu.2d}, [input_addr], #64 + ld1 {Asa.2d, Ase.2d, Asi.2d, Aso.2d}, [input_addr], #64 + ld1 {Asu.2d}, [input_addr] + sub input_addr, input_addr, #(6*64) +.endm + +.macro store_input + st1 {Aba.2d, Abe.2d, Abi.2d, Abo.2d}, [input_addr], #64 + st1 {Abu.2d, Aga.2d, Age.2d, Agi.2d}, [input_addr], #64 + st1 {Ago.2d, Agu.2d, Aka.2d, Ake.2d}, [input_addr], #64 + st1 {Aki.2d, Ako.2d, Aku.2d, Ama.2d}, [input_addr], #64 + st1 {Ame.2d, Ami.2d, Amo.2d, Amu.2d}, [input_addr], #64 + st1 {Asa.2d, Ase.2d, Asi.2d, Aso.2d}, [input_addr], #64 + st1 {Asu.2d}, [input_addr] +.endm + +// .macro load_input +// ldr Abaq, [input_addr, #(2*8*0)] +// ldr Abeq, [input_addr, #(2*8*1)] +// ldr Abiq, [input_addr, #(2*8*2)] +// ldr Aboq, [input_addr, #(2*8*3)] +// ldr Abuq, [input_addr, #(2*8*4)] +// ldr Agaq, [input_addr, #(2*8*5)] +// ldr Ageq, [input_addr, #(2*8*6)] +// ldr Agiq, [input_addr, #(2*8*7)] +// ldr Agoq, [input_addr, #(2*8*8)] +// ldr Aguq, [input_addr, #(2*8*9)] +// ldr Akaq, [input_addr, #(2*8*10)] +// ldr Akeq, [input_addr, #(2*8*11)] +// ldr Akiq, [input_addr, #(2*8*12)] +// ldr Akoq, [input_addr, #(2*8*13)] +// ldr Akuq, [input_addr, #(2*8*14)] +// ldr Amaq, [input_addr, #(2*8*15)] +// ldr Ameq, [input_addr, #(2*8*16)] +// ldr Amiq, [input_addr, #(2*8*17)] +// ldr Amoq, [input_addr, #(2*8*18)] +// ldr Amuq, [input_addr, #(2*8*19)] +// ldr Asaq, [input_addr, #(2*8*20)] +// ldr Aseq, [input_addr, #(2*8*21)] +// ldr Asiq, [input_addr, #(2*8*22)] +// ldr Asoq, [input_addr, #(2*8*23)] +// ldr Asuq, [input_addr, #(2*8*24)] +// .endm + +// .macro store_input +// str Abaq, [input_addr, #(2*8*0)] +// str Abeq, [input_addr, #(2*8*1)] +// str Abiq, [input_addr, #(2*8*2)] +// str Aboq, [input_addr, #(2*8*3)] +// str Abuq, [input_addr, #(2*8*4)] 
+// str Agaq, [input_addr, #(2*8*5)] +// str Ageq, [input_addr, #(2*8*6)] +// str Agiq, [input_addr, #(2*8*7)] +// str Agoq, [input_addr, #(2*8*8)] +// str Aguq, [input_addr, #(2*8*9)] +// str Akaq, [input_addr, #(2*8*10)] +// str Akeq, [input_addr, #(2*8*11)] +// str Akiq, [input_addr, #(2*8*12)] +// str Akoq, [input_addr, #(2*8*13)] +// str Akuq, [input_addr, #(2*8*14)] +// str Amaq, [input_addr, #(2*8*15)] +// str Ameq, [input_addr, #(2*8*16)] +// str Amiq, [input_addr, #(2*8*17)] +// str Amoq, [input_addr, #(2*8*18)] +// str Amuq, [input_addr, #(2*8*19)] +// str Asaq, [input_addr, #(2*8*20)] +// str Aseq, [input_addr, #(2*8*21)] +// str Asiq, [input_addr, #(2*8*22)] +// str Asoq, [input_addr, #(2*8*23)] +// str Asuq, [input_addr, #(2*8*24)] +// .endm + +#define STACK_SIZE (16*4 + 16*6 + 16*5) // VREGS (16*4) + GPRS (TODO: Remove) + +#define STACK_BASE_GPRS (16*4) +#define STACK_BASE_VTMP (16*4 + 16*6) + +#define save(name)\ + str name ## q, [sp, #(STACK_BASE_VTMP + 16*(name ## _offset))] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_VTMP + 16*(name ## _offset))] + +#define Aga_offset 0 +#define Age_offset 1 +#define Agi_offset 2 +#define Ago_offset 3 +#define Agu_offset 4 + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro save_vregs + stp d8, d9, [sp, #(16*0)] + stp d10, d11, [sp, #(16*1)] + stp d12, d13, [sp, #(16*2)] + stp d14, d15, [sp, #(16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(16*0)] + ldp d10, d11, [sp, #(16*1)] + ldp d12, d13, [sp, #(16*2)] + ldp d14, d15, [sp, #(16*3)] +.endm + +/* Macros using v8.4-A SHA-3 instructions */ + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm + xar \d\().2d, \s0\().2d, \s1\().2d, #\imm +.endm + +.macro rax1_m1 d s0 
s1 + xar_m0 tmp, vzr, \s1, 63 + eor \d\().16b, \s0\().16b, tmp.16b +.endm + +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro bcax_m2 d s0 s1 s2 + bcax \d\()z.d, \s0\()z.d, \s1\()z.d, \s2\()z.d +.endm + +/* Keccak-f1600 round */ + +.macro keccak_f1600_round + + eor3_m0 C2, Ami, Agi, Aki + eor3_m0 C0, Ama, Aga, Aka + eor3_m0 C1, Ame, Age, Ake + eor3_m0 C3, Amo, Ago, Ako + eor3_m0 C4, Asu, Agu, Aku + + vzr .req v31 + movi vzr.2d, #0 + + eor3_m0 C2, C2, Abi, Asi + save(Agi) SEP C1r .req Agi + eor3_m0 C0, C0, Aba, Asa + eor3_m0 C1, C1, Abe, Ase + save(Agu) SEP C3r .req Agu + eor3_m0 C3, C3, Abo, Aso + eor3_m0 C4, C4, Amu, Abu + + save(Ago) SEP C2r .req Ago + xar_m0 C1r, vzr, C1, 63 + xar_m0 C3r, vzr, C3, 63 + save(Aga) SEP C4r .req Aga + xar_m0 C2r, vzr, C2, 63 + xar_m0 C4r, vzr, C4, 63 + save(Age) SEP C0r .req Age + eor2 E0, C4, C1r + xar_m0 C0r, vzr, C0, 63 + eor2 E2, C1, C3r + eor2 E1, C0, C2r + restore(Agu) // C3r + eor2 E3, C2, C4r + eor2 E4, C3, C0r + restore(Ago) // C2r + restore(Agi) // C1r/Cor + + eor Aba_.16b, Aba.16b, E0.16b + xar_m0 Asa_, Abi, E2, 2 + restore(Aga) // C4r + xar_m0 Abi_, Aki, E2, 21 + xar_m0 Aki_, Ako, E3, 39 + restore(Age) // C0r + xar_m0 Ako_, Amu, E4, 56 + xar_m0 Amu_, Aso, E3, 8 + xar_m0 Aso_, Ama, E0, 23 + xar_m0 Aka_, Abe, E1, 63 + xar_m0 Ase_, Ago, E3, 9 + xar_m0 Ago_, Ame, E1, 19 + xar_m0 Ake_, Agi, E2, 58 + xar_m0 Agi_, Aka, E0, 61 + xar_m0 Aga_, Abo, E3, 36 + xar_m0 Abo_, Amo, E3, 43 + xar_m0 Amo_, Ami, E2, 49 + xar_m0 Ami_, Ake, E1, 54 + xar_m0 Age_, Agu, E4, 44 + xar_m0 Agu_, Asi, E2, 3 + xar_m0 Asi_, Aku, E4, 25 + xar_m0 Aku_, Asa, E0, 46 + xar_m0 Ama_, Abu, E4, 37 + xar_m0 Abu_, Asu, E4, 50 + xar_m0 Asu_, Ase, E1, 62 + xar_m0 Ame_, Aga, E0, 28 + xar_m0 Abe_, Age, E1, 20 + + ld1r {v31.2d}, [const_addr], #8 + + bcax_m0 Aga, Aga_, Agi_, Age_ + bcax_m0 Age, Age_, Ago_, Agi_ + bcax_m0 Agi, Agi_, Agu_, Ago_ + bcax_m0 Ago, Ago_, Aga_, Agu_ + bcax_m0 Agu, Agu_, Age_, Aga_ + bcax_m0 Aka, 
Aka_, Aki_, Ake_ + bcax_m0 Ake, Ake_, Ako_, Aki_ + bcax_m0 Aki, Aki_, Aku_, Ako_ + bcax_m0 Ako, Ako_, Aka_, Aku_ + bcax_m0 Aku, Aku_, Ake_, Aka_ + bcax_m0 Ama, Ama_, Ami_, Ame_ + bcax_m0 Ame, Ame_, Amo_, Ami_ + bcax_m0 Ami, Ami_, Amu_, Amo_ + bcax_m0 Amo, Amo_, Ama_, Amu_ + bcax_m0 Amu, Amu_, Ame_, Ama_ + bcax_m0 Asa, Asa_, Asi_, Ase_ + bcax_m0 Ase, Ase_, Aso_, Asi_ + bcax_m0 Asi, Asi_, Asu_, Aso_ + bcax_m0 Aso, Aso_, Asa_, Asu_ + bcax_m0 Asu, Asu_, Ase_, Asa_ + bcax_m0 Aba, Aba_, Abi_, Abe_ + bcax_m0 Abe, Abe_, Abo_, Abi_ + bcax_m0 Abi, Abi_, Abu_, Abo_ + bcax_m0 Abo, Abo_, Aba_, Abu_ + bcax_m0 Abu, Abu_, Abe_, Aba_ + + // iota step + eor Aba.16b, Aba.16b, v31.16b + +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.text +.align 4 +.global keccak_f1600_x2_v84a_asm_v1p0 +.global _keccak_f1600_x2_v84a_asm_v1p0 + +keccak_f1600_x2_v84a_asm_v1p0: +_keccak_f1600_x2_v84a_asm_v1p0: + alloc_stack + save_vregs + load_constant_ptr + load_input + + mov count, #(KECCAK_F1600_ROUNDS) +loop: + keccak_f1600_round + sub count, count, #1 + cbnz count, loop + + store_input + restore_vregs + free_stack + ret + +#endif diff --git a/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2.s b/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2.s new file mode 100644 index 0000000..698c257 --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2.s @@ -0,0 +1,375 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this 
permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. 
*/ + Aba .req v0 + Abe .req v1 + Abi .req v2 + Abo .req v3 + Abu .req v4 + Aga .req v5 + Age .req v6 + Agi .req v7 + Ago .req v8 + Agu .req v9 + Aka .req v10 + Ake .req v11 + Aki .req v12 + Ako .req v13 + Aku .req v14 + Ama .req v15 + Ame .req v16 + Ami .req v17 + Amo .req v18 + Amu .req v19 + Asa .req v20 + Ase .req v21 + Asi .req v22 + Aso .req v23 + Asu .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Abiq .req q2 + Aboq .req q3 + Abuq .req q4 + Agaq .req q5 + Ageq .req q6 + Agiq .req q7 + Agoq .req q8 + Aguq .req q9 + Akaq .req q10 + Akeq .req q11 + Akiq .req q12 + Akoq .req q13 + Akuq .req q14 + Amaq .req q15 + Ameq .req q16 + Amiq .req q17 + Amoq .req q18 + Amuq .req q19 + Asaq .req q20 + Aseq .req q21 + Asiq .req q22 + Asoq .req q23 + Asuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v30 + C1 .req v29 + C2 .req v28 + C3 .req v27 + C4 .req v26 + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req v26 + E1 .req v25 + E2 .req v29 + E3 .req v28 + E4 .req v27 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + Abi_ .req v2 + Abo_ .req v3 + Abu_ .req v4 + Aga_ .req v10 + Age_ .req v11 + Agi_ .req v7 + Ago_ .req v8 + Agu_ .req v9 + Aka_ .req v15 + Ake_ .req v16 + Aki_ .req v12 + Ako_ .req v13 + Aku_ .req v14 + Ama_ .req v20 + Ame_ .req v21 + Ami_ .req v17 + Amo_ .req v18 + Amu_ .req v19 + Asa_ .req v0 + Ase_ .req v1 + Asi_ .req v22 + Aso_ .req v23 + Asu_ .req v24 + Aba_ .req v30 + Abe_ .req v27 + + /* Unused temporary */ + tmp .req v31 + +/************************ MACROS ****************************/ + +.macro load_input + ldr Abaq, [input_addr, #(2*8*0)] + ldr Abeq, [input_addr, #(2*8*1)] + ldr Abiq, [input_addr, #(2*8*2)] + ldr Aboq, [input_addr, #(2*8*3)] + ldr Abuq, [input_addr, #(2*8*4)] + ldr Agaq, [input_addr, #(2*8*5)] + ldr Ageq, [input_addr, #(2*8*6)] + ldr Agiq, [input_addr, #(2*8*7)] + ldr Agoq, [input_addr, #(2*8*8)] + ldr Aguq, [input_addr, #(2*8*9)] + ldr Akaq, 
[input_addr, #(2*8*10)] + ldr Akeq, [input_addr, #(2*8*11)] + ldr Akiq, [input_addr, #(2*8*12)] + ldr Akoq, [input_addr, #(2*8*13)] + ldr Akuq, [input_addr, #(2*8*14)] + ldr Amaq, [input_addr, #(2*8*15)] + ldr Ameq, [input_addr, #(2*8*16)] + ldr Amiq, [input_addr, #(2*8*17)] + ldr Amoq, [input_addr, #(2*8*18)] + ldr Amuq, [input_addr, #(2*8*19)] + ldr Asaq, [input_addr, #(2*8*20)] + ldr Aseq, [input_addr, #(2*8*21)] + ldr Asiq, [input_addr, #(2*8*22)] + ldr Asoq, [input_addr, #(2*8*23)] + ldr Asuq, [input_addr, #(2*8*24)] +.endm + +.macro store_input + str Abaq, [input_addr, #(2*8*0)] + str Abeq, [input_addr, #(2*8*1)] + str Abiq, [input_addr, #(2*8*2)] + str Aboq, [input_addr, #(2*8*3)] + str Abuq, [input_addr, #(2*8*4)] + str Agaq, [input_addr, #(2*8*5)] + str Ageq, [input_addr, #(2*8*6)] + str Agiq, [input_addr, #(2*8*7)] + str Agoq, [input_addr, #(2*8*8)] + str Aguq, [input_addr, #(2*8*9)] + str Akaq, [input_addr, #(2*8*10)] + str Akeq, [input_addr, #(2*8*11)] + str Akiq, [input_addr, #(2*8*12)] + str Akoq, [input_addr, #(2*8*13)] + str Akuq, [input_addr, #(2*8*14)] + str Amaq, [input_addr, #(2*8*15)] + str Ameq, [input_addr, #(2*8*16)] + str Amiq, [input_addr, #(2*8*17)] + str Amoq, [input_addr, #(2*8*18)] + str Amuq, [input_addr, #(2*8*19)] + str Asaq, [input_addr, #(2*8*20)] + str Aseq, [input_addr, #(2*8*21)] + str Asiq, [input_addr, #(2*8*22)] + str Asoq, [input_addr, #(2*8*23)] + str Asuq, [input_addr, #(2*8*24)] +.endm + +#define STACK_SIZE (16*4) // VREGS (16*4) +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro save_vregs + stp d8, d9, [sp, #(16*0)] + stp d10, d11, [sp, #(16*1)] + stp d12, d13, [sp, #(16*2)] + stp d14, d15, [sp, #(16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(16*0)] + ldp d10, d11, [sp, #(16*1)] + ldp d12, d13, [sp, #(16*2)] + ldp d14, d15, [sp, #(16*3)] +.endm + +/* Macros emulating the v8.4-A SHA-3 instructions (EOR3/RAX1/XAR/BCAX) with plain Neon */ + +.macro eor3_m1 d s0 s1 s2 + eor \d\().16b, 
\s0\().16b, \s1\().16b + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro rax1_m1 d s0 s1 + shl tmp.2d, \s1\().2d, #1 + sri tmp.2d, \s1\().2d, #63 + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +.macro xar_m1 d s0 s1 imm + eor tmp.16b, \s0\().16b, \s1\().16b + shl \d\().2d, tmp.2d, #(64-\imm) + sri \d\().2d, tmp.2d, #(\imm) +.endm + +.macro bcax_m1 d s0 s1 s2 + bic tmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +/* Keccak-f1600 round */ + +.macro keccak_f1600_round + + eor3_m1 C0, Aba, Aga, Aka + eor3_m1 C0, C0, Ama, Asa + eor3_m1 C1, Abe, Age, Ake + eor3_m1 C1, C1, Ame, Ase + eor3_m1 C2, Abi, Agi, Aki + eor3_m1 C2, C2, Ami, Asi + eor3_m1 C3, Abo, Ago, Ako + eor3_m1 C3, C3, Amo, Aso + eor3_m1 C4, Abu, Agu, Aku + eor3_m1 C4, C4, Amu, Asu + + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + + eor Aba_.16b, Aba.16b, E0.16b + xar_m1 Asa_, Abi, E2, 2 + xar_m1 Abi_, Aki, E2, 21 + xar_m1 Aki_, Ako, E3, 39 + xar_m1 Ako_, Amu, E4, 56 + xar_m1 Amu_, Aso, E3, 8 + xar_m1 Aso_, Ama, E0, 23 + xar_m1 Aka_, Abe, E1, 63 + xar_m1 Ase_, Ago, E3, 9 + xar_m1 Ago_, Ame, E1, 19 + xar_m1 Ake_, Agi, E2, 58 + xar_m1 Agi_, Aka, E0, 61 + xar_m1 Aga_, Abo, E3, 36 + xar_m1 Abo_, Amo, E3, 43 + xar_m1 Amo_, Ami, E2, 49 + xar_m1 Ami_, Ake, E1, 54 + xar_m1 Age_, Agu, E4, 44 + xar_m1 Agu_, Asi, E2, 3 + xar_m1 Asi_, Aku, E4, 25 + xar_m1 Aku_, Asa, E0, 46 + xar_m1 Ama_, Abu, E4, 37 + xar_m1 Abu_, Asu, E4, 50 + xar_m1 Asu_, Ase, E1, 62 + xar_m1 Ame_, Aga, E0, 28 + xar_m1 Abe_, Age, E1, 20 + + bcax_m1 Aga, Aga_, Agi_, Age_ + bcax_m1 Age, Age_, Ago_, Agi_ + bcax_m1 Agi, Agi_, Agu_, Ago_ + bcax_m1 Ago, Ago_, Aga_, Agu_ + bcax_m1 Agu, Agu_, Age_, Aga_ + bcax_m1 Aka, Aka_, Aki_, Ake_ + bcax_m1 Ake, Ake_, Ako_, Aki_ + bcax_m1 Aki, Aki_, Aku_, Ako_ + bcax_m1 Ako, Ako_, Aka_, Aku_ + bcax_m1 Aku, Aku_, Ake_, Aka_ + bcax_m1 Ama, Ama_, Ami_, Ame_ + bcax_m1 Ame, Ame_, Amo_, Ami_ + bcax_m1 Ami, Ami_, Amu_, Amo_ + bcax_m1 Amo, 
Amo_, Ama_, Amu_ + bcax_m1 Amu, Amu_, Ame_, Ama_ + bcax_m1 Asa, Asa_, Asi_, Ase_ + bcax_m1 Ase, Ase_, Aso_, Asi_ + bcax_m1 Asi, Asi_, Asu_, Aso_ + bcax_m1 Aso, Aso_, Asa_, Asu_ + bcax_m1 Asu, Asu_, Ase_, Asa_ + bcax_m1 Aba, Aba_, Abi_, Abe_ + bcax_m1 Abe, Abe_, Abo_, Abi_ + bcax_m1 Abi, Abi_, Abu_, Abo_ + bcax_m1 Abo, Abo_, Aba_, Abu_ + bcax_m1 Abu, Abu_, Abe_, Aba_ + + // iota step + ld1r {tmp.2d}, [const_addr], #8 + eor Aba.16b, Aba.16b, tmp.16b + +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.text +.align 4 +.global keccak_f1600_x2_v84a_asm_v2 +.global _keccak_f1600_x2_v84a_asm_v2 + +keccak_f1600_x2_v84a_asm_v2: +_keccak_f1600_x2_v84a_asm_v2: + alloc_stack + save_vregs + load_constant_ptr + load_input + + mov count, #(KECCAK_F1600_ROUNDS) +loop: + keccak_f1600_round + sub count, count, #1 + cbnz count, loop + + store_input + restore_vregs + free_stack + ret diff --git a/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2p0.s b/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2p0.s new file mode 100644 index 0000000..c9547da --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2p0.s @@ -0,0 +1,596 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. 
*/ + Aba .req v0 + Abe .req v1 + Abi .req v2 + Abo .req v3 + Abu .req v4 + Aga .req v5 + Age .req v6 + Agi .req v7 + Ago .req v8 + Agu .req v9 + Aka .req v10 + Ake .req v11 + Aki .req v12 + Ako .req v13 + Aku .req v14 + Ama .req v15 + Ame .req v16 + Ami .req v17 + Amo .req v18 + Amu .req v19 + Asa .req v20 + Ase .req v21 + Asi .req v22 + Aso .req v23 + Asu .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Abiq .req q2 + Aboq .req q3 + Abuq .req q4 + Agaq .req q5 + Ageq .req q6 + Agiq .req q7 + Agoq .req q8 + Aguq .req q9 + Akaq .req q10 + Akeq .req q11 + Akiq .req q12 + Akoq .req q13 + Akuq .req q14 + Amaq .req q15 + Ameq .req q16 + Amiq .req q17 + Amoq .req q18 + Amuq .req q19 + Asaq .req q20 + Aseq .req q21 + Asiq .req q22 + Asoq .req q23 + Asuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v27 + C1 .req v28 + C2 .req v29 + C3 .req v30 + C4 .req v31 + + C0q .req q27 + C1q .req q28 + C2q .req q29 + C3q .req q30 + C4q .req q31 + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req v26 + E1 .req v26 + E2 .req v26 + E3 .req v26 + E4 .req v26 + + E0q .req q26 + E1q .req q26 + E2q .req q26 + E3q .req q26 + E4q .req q26 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vBgo .req v27 + vBgi .req v28 + vBga .req v29 + vBge .req v30 + vBgu .req v31 + vBki .req v27 + vBko .req v28 + vBka .req v29 + vBke .req v30 + vBku .req v31 + vBmu .req v27 + vBmo .req v28 + vBmi .req v29 + vBma .req v30 + vBme .req v31 + vBba .req v27 + vBbi .req v28 + vBbo .req v29 + vBbu .req v30 + vBbe .req v31 + vBsa .req v27 + vBso .req v28 + vBse .req v29 + vBsi .req v30 + vBsu .req v31 + + vBgoq .req q27 + vBgiq .req q28 + vBgaq .req q29 + vBgeq .req q30 + vBguq .req q31 + vBkiq .req q27 + vBkoq .req q28 + vBkaq .req q29 + vBkeq .req q30 + vBkuq .req q31 + vBmuq .req q27 + vBmoq .req q28 + vBmiq .req q29 + vBmaq .req q30 + vBmeq .req q31 + vBbaq .req q27 + vBbiq .req q28 + vBboq .req q29 + vBbuq .req q30 + 
vBbeq .req q31 + vBsaq .req q27 + vBsoq .req q28 + vBseq .req q29 + vBsiq .req q30 + vBsuq .req q31 + + vEgu .req Agu + vEga .req v26 + vEge .req v26 + vEgi .req v26 + vEgo .req v26 + vEka .req Aka + vEko .req Ako + vEke .req v26 + vEki .req v26 + vEku .req v26 + vEma .req v26 + vEme .req Ame + vEmi .req Ami + vEmo .req v26 + vEmu .req Amu + vEba .req Aba + vEbe .req Abe + vEbi .req v26 + vEbo .req Abo + vEbu .req Abu + vEsa .req Asa + vEse .req Ase + vEsi .req Asi + vEso .req Aso + vEsu .req Asu + + vEguq .req Aguq + vEgaq .req q26 + vEgeq .req q26 + vEgiq .req q26 + vEgoq .req q26 + vEkaq .req Akaq + vEkoq .req Akoq + vEkeq .req q26 + vEkiq .req q26 + vEkuq .req q26 + vEmaq .req q26 + vEmeq .req Ameq + vEmiq .req Amiq + vEmoq .req q26 + vEmuq .req Amuq + vEbaq .req Abaq + vEbeq .req Abeq + vEbiq .req q26 + vEboq .req Aboq + vEbuq .req Abuq + vEsaq .req Asaq + vEseq .req Aseq + vEsiq .req Asiq + vEsoq .req Asoq + vEsuq .req Asuq + +/************************ MACROS ****************************/ + +.macro load_input + ldp Abaq, Abeq, [input_addr, #(2*8*0)] + ldp Abiq, Aboq, [input_addr, #(2*8*2)] + ldp Abuq, Agaq, [input_addr, #(2*8*4)] + ldp Ageq, Agiq, [input_addr, #(2*8*6)] + ldp Agoq, Aguq, [input_addr, #(2*8*8)] + ldp Akaq, Akeq, [input_addr, #(2*8*10)] + ldp Akiq, Akoq, [input_addr, #(2*8*12)] + ldp Akuq, Amaq, [input_addr, #(2*8*14)] + ldp Ameq, Amiq, [input_addr, #(2*8*16)] + ldp Amoq, Amuq, [input_addr, #(2*8*18)] + ldp Asaq, Aseq, [input_addr, #(2*8*20)] + ldp Asiq, Asoq, [input_addr, #(2*8*22)] + ldr Asuq, [input_addr, #(2*8*24)] +.endm + +.macro store_input + str Abaq, [input_addr, #(2*8*0)] + str Abeq, [input_addr, #(2*8*1)] + str Abiq, [input_addr, #(2*8*2)] + str Aboq, [input_addr, #(2*8*3)] + str Abuq, [input_addr, #(2*8*4)] + str Agaq, [input_addr, #(2*8*5)] + str Ageq, [input_addr, #(2*8*6)] + str Agiq, [input_addr, #(2*8*7)] + str Agoq, [input_addr, #(2*8*8)] + str Aguq, [input_addr, #(2*8*9)] + str Akaq, [input_addr, #(2*8*10)] + str Akeq, 
[input_addr, #(2*8*11)] + str Akiq, [input_addr, #(2*8*12)] + str Akoq, [input_addr, #(2*8*13)] + str Akuq, [input_addr, #(2*8*14)] + str Amaq, [input_addr, #(2*8*15)] + str Ameq, [input_addr, #(2*8*16)] + str Amiq, [input_addr, #(2*8*17)] + str Amoq, [input_addr, #(2*8*18)] + str Amuq, [input_addr, #(2*8*19)] + str Asaq, [input_addr, #(2*8*20)] + str Aseq, [input_addr, #(2*8*21)] + str Asiq, [input_addr, #(2*8*22)] + str Asoq, [input_addr, #(2*8*23)] + str Asuq, [input_addr, #(2*8*24)] +.endm + +#define STACK_SIZE (16*4 + 16*30) +#define STACK_BASE_VREGS 0 +#define STACK_BASE_TMP (16*4) /* parenthesised so the macro stays one term in any expression */ + +#define E0_offset 0 +#define E1_offset 1 +#define E2_offset 2 +#define E3_offset 3 +#define E4_offset 4 + +#define Aba_offset (5 + 0 ) +#define Abe_offset (5 + 1 ) +#define Abi_offset (5 + 2 ) +#define Abo_offset (5 + 3 ) +#define Abu_offset (5 + 4 ) +#define Aga_offset (5 + 5 ) +#define Age_offset (5 + 6 ) +#define Agi_offset (5 + 7 ) +#define Ago_offset (5 + 8 ) +#define Agu_offset (5 + 9 ) +#define Aka_offset (5 + 10 ) +#define Ake_offset (5 + 11 ) +#define Aki_offset (5 + 12 ) +#define Ako_offset (5 + 13 ) +#define Aku_offset (5 + 14 ) +#define Ama_offset (5 + 15 ) +#define Ame_offset (5 + 16 ) +#define Ami_offset (5 + 17 ) +#define Amo_offset (5 + 18 ) +#define Amu_offset (5 + 19 ) +#define Asa_offset (5 + 20 ) +#define Ase_offset (5 + 21 ) +#define Asi_offset (5 + 22 ) +#define Aso_offset (5 + 23 ) +#define Asu_offset (5 + 24 ) + +#define vEba_offset (5 + 0 ) +#define vEbe_offset (5 + 1 ) +#define vEbi_offset (5 + 2 ) +#define vEbo_offset (5 + 3 ) +#define vEbu_offset (5 + 4 ) +#define vEga_offset (5 + 5 ) +#define vEge_offset (5 + 6 ) +#define vEgi_offset (5 + 7 ) +#define vEgo_offset (5 + 8 ) +#define vEgu_offset (5 + 9 ) +#define vEka_offset (5 + 10 ) +#define vEke_offset (5 + 11 ) +#define vEki_offset (5 + 12 ) +#define vEko_offset (5 + 13 ) +#define vEku_offset (5 + 14 ) +#define vEma_offset (5 + 15 ) +#define vEme_offset (5 + 16 ) +#define vEmi_offset (5 + 17 ) 
+#define vEmo_offset (5 + 18 ) +#define vEmu_offset (5 + 19 ) +#define vEsa_offset (5 + 20 ) +#define vEse_offset (5 + 21 ) +#define vEsi_offset (5 + 22 ) +#define vEso_offset (5 + 23 ) +#define vEsu_offset (5 + 24 ) + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +/* Macros emulating the v8.4-A SHA-3 instructions (EOR3/RAX1/XAR/BCAX) with plain Neon, plus small helpers */ + +.macro eor3_m1_0 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m1_1 d s0 s1 s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro eor3_m1 d s0 s1 s2 + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +.macro rax1_m1 d s0 s1 + add tmp.2d, \s1\().2d, \s1\().2d + sri tmp.2d, \s1\().2d, #63 + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +.macro xar_m1 d s0 s1 imm + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) +.endm + +.macro xar_m1_0 d s0 s1 imm tmp + eor \tmp\().16b, \s0\().16b, \s1\().16b +.endm + +.macro xar_m1_1 d s0 s1 imm tmp + shl \d\().2d, \tmp\().2d, #(64-\imm) +.endm + +.macro xar_m1_2 d s0 s1 imm tmp + sri \d\().2d, \tmp\().2d, #(\imm) +.endm + +.macro bcax_m1 d s0 s1 s2 + bic tmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +.macro refresh d + mov \d\().16b, 
\d\().16b +.endm +/* Keccak-f1600 round */ + +.macro keccak_f1600_round + + eor2 C0, Aka, Aga + eor2 C1, Ake, Age + eor2 C2, Aki, Agi + eor2 C3, Ako, Ago + eor2 C4, Aku, Agu + eor2 C0, C0, Ama + eor2 C1, C1, Ame + eor2 C2, C2, Ami + eor2 C3, C3, Amo + eor2 C4, C4, Amu + eor2 C0, C0, Asa + eor2 C1, C1, Ase + eor2 C2, C2, Asi + eor2 C3, C3, Aso + eor2 C4, C4, Asu + eor2 C0, C0, Aba + eor2 C1, C1, Abe + eor2 C2, C2, Abi + eor2 C3, C3, Abo + eor2 C4, C4, Abu + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req v25 + rax1_m1 E2, C1, C3 SEP save(E2) + rax1_m1 E4, C3, C0 SEP save(E4) + rax1_m1 E1, C0, C2 SEP save(E1) + rax1_m1 E3, C2, C4 SEP save(E3) + rax1_m1 E0, C4, C1 SEP save(E0) + + restore(E1) + xar_m1 vBgo, Ame, E1, 19 SEP restore(E0) + xar_m1 vBgi, Aka, E0, 61 SEP restore(E3) + xar_m1 vBga, Abo, E3, 36 SEP restore(E4) + xar_m1 vBge, Agu, E4, 44 SEP restore(E2) + xar_m1 vBgu, Asi, E2, 3 SEP + + bcax_m1 vEga, vBga, vBgi, vBge SEP save(vEga) + bcax_m1 vEge, vBge, vBgo, vBgi SEP save(vEge) + bcax_m1 vEgi, vBgi, vBgu, vBgo SEP save(vEgi) + bcax_m1 vEgo, vBgo, vBga, vBgu SEP save(vEgo) + bcax_m1 vEgu, vBgu, vBge, vBga SEP save(vEgu) + + restore(E3) + xar_m1 vBki, Ako, E3, 39 SEP restore(E4) + xar_m1 vBko, Amu, E4, 56 SEP restore(E1) + xar_m1 vBka, Abe, E1, 63 SEP restore(E2) + xar_m1 vBke, Agi, E2, 58 SEP restore(E0) + xar_m1 vBku, Asa, E0, 46 + + bcax_m1 vEka, vBka, vBki, vBke SEP save(vEka) + bcax_m1 vEke, vBke, vBko, vBki SEP save(vEke) + bcax_m1 vEki, vBki, vBku, vBko SEP save(vEki) + bcax_m1 vEko, vBko, vBka, vBku SEP save(vEko) + bcax_m1 vEku, vBku, vBke, vBka SEP save(vEku) + + restore(E3) + xar_m1 vBmu, Aso, E3, 8 SEP restore(E2) + xar_m1 vBmo, Ami, E2, 49 SEP restore(E1) + xar_m1 vBmi, Ake, E1, 54 SEP restore(E4) + xar_m1 vBma, Abu, E4, 37 SEP restore(E0) + xar_m1 vBme, Aga, E0, 28 + + bcax_m1 vEma, vBma, vBmi, vBme SEP save(vEma) + bcax_m1 vEme, vBme, vBmo, vBmi SEP save(vEme) + bcax_m1 vEmi, vBmi, vBmu, vBmo SEP save(vEmi) + bcax_m1 vEmo, vBmo, vBma, 
vBmu SEP save(vEmo) + bcax_m1 vEmu, vBmu, vBme, vBma SEP save(vEmu) + + restore(E0) + eor2 vBba, Aba, E0 SEP restore(E2) + xar_m1 vBbi, Aki, E2, 21 SEP restore(E3) + xar_m1 vBbo, Amo, E3, 43 SEP restore(E4) + xar_m1 vBbu, Asu, E4, 50 SEP restore(E1) + xar_m1 vBbe, Age, E1, 20 + + bcax_m1 vEba, vBba, vBbi, vBbe SEP save(vEba) + bcax_m1 vEbe, vBbe, vBbo, vBbi SEP save(vEbe) + bcax_m1 vEbi, vBbi, vBbu, vBbo SEP save(vEbi) + bcax_m1 vEbo, vBbo, vBba, vBbu SEP save(vEbo) + bcax_m1 vEbu, vBbu, vBbe, vBba SEP save(vEbu) + + restore(E2) + xar_m1 vBsa, Abi, E2, 2 SEP restore(E0) + xar_m1 vBso, Ama, E0, 23 SEP restore(E3) + xar_m1 vBse, Ago, E3, 9 SEP restore(E4) + xar_m1 vBsi, Aku, E4, 25 SEP restore(E1) + xar_m1 vBsu, Ase, E1, 62 + + bcax_m1 vEsa, vBsa, vBsi, vBse SEP save(vEsa) + bcax_m1 vEse, vBse, vBso, vBsi SEP save(vEse) + bcax_m1 vEsi, vBsi, vBsu, vBso SEP save(vEsi) + bcax_m1 vEso, vBso, vBsa, vBsu SEP save(vEso) + bcax_m1 vEsu, vBsu, vBse, vBsa SEP save(vEsu) + + restore(Aba) + restore(Abe) + restore(Abi) + restore(Abo) + restore(Abu) + restore(Aga) + restore(Age) + restore(Agi) + restore(Ago) + restore(Agu) + restore(Aka) + restore(Ake) + restore(Aki) + restore(Ako) + restore(Aku) + restore(Ama) + restore(Ame) + restore(Ami) + restore(Amo) + restore(Amu) + restore(Asa) + restore(Ase) + restore(Asi) + restore(Aso) + restore(Asu) + + ld1r {tmp.2d}, [const_addr], #8 + eor Aba.16b, Aba.16b, tmp.16b + + .unreq tmp +.endm + +.text +.align 4 +.global keccak_f1600_x2_v84a_asm_v2p0 +.global _keccak_f1600_x2_v84a_asm_v2p0 + +#define KECCAK_F1600_ROUNDS 24 + +keccak_f1600_x2_v84a_asm_v2p0: +_keccak_f1600_x2_v84a_asm_v2p0: + alloc_stack + save_vregs + load_constant_ptr + load_input + + mov count, #12 +loop: + keccak_f1600_round + keccak_f1600_round + sub count, count, #1 + cbnz count, loop + + store_input + restore_vregs + free_stack + ret diff --git a/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2p1.s b/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2p1.s new file 
mode 100644 index 0000000..2b24b1a --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2p1.s @@ -0,0 +1,732 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. 
*/ + Aba .req v0 + Abe .req v1 + Abi .req v2 + Abo .req v3 + Abu .req v4 + Aga .req v5 + Age .req v6 + Agi .req v7 + Ago .req v8 + Agu .req v9 + Aka .req v10 + Ake .req v11 + Aki .req v12 + Ako .req v13 + Aku .req v14 + Ama .req v15 + Ame .req v16 + Ami .req v17 + Amo .req v18 + Amu .req v19 + Asa .req v20 + Ase .req v21 + Asi .req v22 + Aso .req v23 + Asu .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Abiq .req q2 + Aboq .req q3 + Abuq .req q4 + Agaq .req q5 + Ageq .req q6 + Agiq .req q7 + Agoq .req q8 + Aguq .req q9 + Akaq .req q10 + Akeq .req q11 + Akiq .req q12 + Akoq .req q13 + Akuq .req q14 + Amaq .req q15 + Ameq .req q16 + Amiq .req q17 + Amoq .req q18 + Amuq .req q19 + Asaq .req q20 + Aseq .req q21 + Asiq .req q22 + Asoq .req q23 + Asuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v27 + C1 .req v28 + C2 .req v29 + C3 .req v30 + C4 .req v31 + + C0q .req q27 + C1q .req q28 + C2q .req q29 + C3q .req q30 + C4q .req q31 + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req v26 + E1 .req v26 + E2 .req v26 + E3 .req v26 + E4 .req v26 + + E0q .req q26 + E1q .req q26 + E2q .req q26 + E3q .req q26 + E4q .req q26 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vBgo .req v27 + vBgi .req Ame + vBga .req Aka + vBge .req Abo + vBgu .req Agu + vBki .req Asi + vBko .req Ako + vBka .req Amu + vBke .req Abe + vBku .req Agi + vBmu .req Asa + vBmo .req Aso + vBmi .req Ami + vBma .req Ake + vBme .req Abu + vBba .req Aga + vBbi .req Aba + vBbo .req Aki + vBbu .req Amo + vBbe .req Asu + vBsa .req Age + vBso .req Abi + vBse .req Ama + vBsi .req Ago + vBsu .req Aku + + vBgoq .req q27 + vBgiq .req Ameq + vBgaq .req Akaq + vBgeq .req Aboq + vBguq .req Aguq + vBkiq .req Asiq + vBkoq .req Akoq + vBkaq .req Amuq + vBkeq .req Abeq + vBkuq .req Agiq + vBmuq .req Asaq + vBmoq .req Asoq + vBmiq .req Amiq + vBmaq .req Akeq + vBmeq .req Abuq + vBbaq .req Agaq + vBbiq .req Abaq + vBboq .req Akiq + 
vBbuq .req Amoq + vBbeq .req Asuq + vBsaq .req Ageq + vBsoq .req Abiq + vBseq .req Amaq + vBsiq .req Agoq + vBsuq .req Akuq + + vEga .req v28 + vEge .req v29 + vEgi .req vBgi + vEgo .req vBgo + vEgu .req vBgu + vEka .req vBga + vEke .req vBge + vEki .req vBki + vEko .req vBko + vEku .req vBku + vEma .req vBka + vEme .req vBke + vEmi .req vBmi + vEmo .req vBmo + vEmu .req vBmu + vEba .req vBma + vEbe .req vBme + vEbi .req vBbi + vEbo .req vBbo + vEbu .req vBbu + vEsa .req vBba + vEse .req vBbe + vEsi .req vBsi + vEso .req vBso + vEsu .req vBsu + + vEgaq .req q28 + vEgeq .req q29 + vEgiq .req vBgiq + vEgoq .req vBgoq + vEguq .req vBguq + vEkaq .req vBgaq + vEkeq .req vBgeq + vEkiq .req vBkiq + vEkoq .req vBkoq + vEkuq .req vBkuq + vEmaq .req vBkaq + vEmeq .req vBkeq + vEmiq .req vBmiq + vEmoq .req vBmoq + vEmuq .req vBmuq + vEbaq .req vBmaq + vEbeq .req vBmeq + vEbiq .req vBbiq + vEboq .req vBboq + vEbuq .req vBbuq + vEsaq .req vBbaq + vEseq .req vBbeq + vEsiq .req vBsiq + vEsoq .req vBsoq + vEsuq .req vBsuq + +/************************ MACROS ****************************/ + +.macro load_input + ldp Abaq, Abeq, [input_addr, #(2*8*0)] + ldp Abiq, Aboq, [input_addr, #(2*8*2)] + ldp Abuq, Agaq, [input_addr, #(2*8*4)] + ldp Ageq, Agiq, [input_addr, #(2*8*6)] + ldp Agoq, Aguq, [input_addr, #(2*8*8)] + ldp Akaq, Akeq, [input_addr, #(2*8*10)] + ldp Akiq, Akoq, [input_addr, #(2*8*12)] + ldp Akuq, Amaq, [input_addr, #(2*8*14)] + ldp Ameq, Amiq, [input_addr, #(2*8*16)] + ldp Amoq, Amuq, [input_addr, #(2*8*18)] + ldp Asaq, Aseq, [input_addr, #(2*8*20)] + ldp Asiq, Asoq, [input_addr, #(2*8*22)] + ldr Asuq, [input_addr, #(2*8*24)] +.endm + +.macro store_input + str Abaq, [input_addr, #(2*8*0)] + str Abeq, [input_addr, #(2*8*1)] + str Abiq, [input_addr, #(2*8*2)] + str Aboq, [input_addr, #(2*8*3)] + str Abuq, [input_addr, #(2*8*4)] + str Agaq, [input_addr, #(2*8*5)] + str Ageq, [input_addr, #(2*8*6)] + str Agiq, [input_addr, #(2*8*7)] + str Agoq, [input_addr, #(2*8*8)] + str Aguq, 
[input_addr, #(2*8*9)] + str Akaq, [input_addr, #(2*8*10)] + str Akeq, [input_addr, #(2*8*11)] + str Akiq, [input_addr, #(2*8*12)] + str Akoq, [input_addr, #(2*8*13)] + str Akuq, [input_addr, #(2*8*14)] + str Amaq, [input_addr, #(2*8*15)] + str Ameq, [input_addr, #(2*8*16)] + str Amiq, [input_addr, #(2*8*17)] + str Amoq, [input_addr, #(2*8*18)] + str Amuq, [input_addr, #(2*8*19)] + str Asaq, [input_addr, #(2*8*20)] + str Aseq, [input_addr, #(2*8*21)] + str Asiq, [input_addr, #(2*8*22)] + str Asoq, [input_addr, #(2*8*23)] + str Asuq, [input_addr, #(2*8*24)] +.endm + +#define STACK_SIZE (16*4 + 16*30) +#define STACK_BASE_VREGS 0 +#define STACK_BASE_TMP 16*4 + +#define E0_offset 0 +#define E1_offset 1 +#define E2_offset 2 +#define E3_offset 3 +#define E4_offset 4 + +#define Aba_offset (5 + 0 ) +#define Abe_offset (5 + 1 ) +#define Abi_offset (5 + 2 ) +#define Abo_offset (5 + 3 ) +#define Abu_offset (5 + 4 ) +#define Aga_offset (5 + 5 ) +#define Age_offset (5 + 6 ) +#define Agi_offset (5 + 7 ) +#define Ago_offset (5 + 8 ) +#define Agu_offset (5 + 9 ) +#define Aka_offset (5 + 10 ) +#define Ake_offset (5 + 11 ) +#define Aki_offset (5 + 12 ) +#define Ako_offset (5 + 13 ) +#define Aku_offset (5 + 14 ) +#define Ama_offset (5 + 15 ) +#define Ame_offset (5 + 16 ) +#define Ami_offset (5 + 17 ) +#define Amo_offset (5 + 18 ) +#define Amu_offset (5 + 19 ) +#define Asa_offset (5 + 20 ) +#define Ase_offset (5 + 21 ) +#define Asi_offset (5 + 22 ) +#define Aso_offset (5 + 23 ) +#define Asu_offset (5 + 24 ) + +#define vEba_offset (5 + 0 ) +#define vEbe_offset (5 + 1 ) +#define vEbi_offset (5 + 2 ) +#define vEbo_offset (5 + 3 ) +#define vEbu_offset (5 + 4 ) +#define vEga_offset (5 + 5 ) +#define vEge_offset (5 + 6 ) +#define vEgi_offset (5 + 7 ) +#define vEgo_offset (5 + 8 ) +#define vEgu_offset (5 + 9 ) +#define vEka_offset (5 + 10 ) +#define vEke_offset (5 + 11 ) +#define vEki_offset (5 + 12 ) +#define vEko_offset (5 + 13 ) +#define vEku_offset (5 + 14 ) +#define vEma_offset (5 + 15 
) +#define vEme_offset (5 + 16 ) +#define vEmi_offset (5 + 17 ) +#define vEmo_offset (5 + 18 ) +#define vEmu_offset (5 + 19 ) +#define vEsa_offset (5 + 20 ) +#define vEse_offset (5 + 21 ) +#define vEsi_offset (5 + 22 ) +#define vEso_offset (5 + 23 ) +#define vEsu_offset (5 + 24 ) + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +/* Plain-Neon stand-ins for the v8.4-A SHA-3 instructions (EOR3, RAX1, XAR, BCAX); rax1_m1 and bcax_m1 clobber the register aliased as tmp, and xar_m1 overwrites its s0 operand */ + +.macro eor3_m1_0 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m1_1 d s0 s1 s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro eor3_m1 d s0 s1 s2 + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +.macro rax1_m1 d s0 s1 + add tmp.2d, \s1\().2d, \s1\().2d + sri tmp.2d, \s1\().2d, #63 + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +.macro xar_m1 d s0 s1 imm + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) +.endm + +.macro xar_m1_0 d s0 s1 imm tmp + eor \tmp\().16b, \s0\().16b, \s1\().16b +.endm + +.macro xar_m1_1 d s0 s1 imm tmp + shl \d\().2d, \tmp\().2d, #(64-\imm) +.endm + +.macro xar_m1_2 d s0 s1 imm tmp + sri \d\().2d, \tmp\().2d, #(\imm) +.endm + +.macro bcax_m1 d s0 s1 s2 + bic tmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, 
tmp.16b, \s0\().16b +.endm + +.macro refresh d + mov \d\().16b, \d\().16b +.endm +/* Keccak-f1600 round */ + +.macro keccak_f1600_round_pre + eor2 C0, Aka, Aga + eor2 C1, Ake, Age + eor2 C2, Aki, Agi + eor2 C3, Ako, Ago + eor2 C4, Aku, Agu + eor2 C0, C0, Ama + eor2 C1, C1, Ame + eor2 C2, C2, Ami + eor2 C3, C3, Amo + eor2 C4, C4, Amu + eor2 C0, C0, Asa + eor2 C1, C1, Ase + eor2 C2, C2, Asi + eor2 C3, C3, Aso + eor2 C4, C4, Asu + eor2 C0, C0, Aba + eor2 C1, C1, Abe + eor2 C2, C2, Abi + eor2 C3, C3, Abo + eor2 C4, C4, Abu +.endm + +.macro keccak_f1600_round_post + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req v25 + rax1_m1 E2, C1, C3 SEP save(E2) + rax1_m1 E4, C3, C0 SEP save(E4) + rax1_m1 E1, C0, C2 SEP save(E1) + rax1_m1 E3, C2, C4 SEP save(E3) + rax1_m1 E0, C4, C1 SEP save(E0) + + restore(E1) + xar_m1 vBgo, Ame, E1, 19 SEP restore(E0) + xar_m1 vBgi, Aka, E0, 61 SEP restore(E3) + xar_m1 vBga, Abo, E3, 36 SEP restore(E4) + xar_m1 vBge, Agu, E4, 44 SEP restore(E2) + xar_m1 vBgu, Asi, E2, 3 SEP + + bcax_m1 vEga, vBga, vBgi, vBge SEP save(vEga) + bcax_m1 vEge, vBge, vBgo, vBgi SEP save(vEge) + bcax_m1 vEgi, vBgi, vBgu, vBgo SEP save(vEgi) + bcax_m1 vEgo, vBgo, vBga, vBgu SEP save(vEgo) + bcax_m1 vEgu, vBgu, vBge, vBga SEP save(vEgu) + + restore(E3) + xar_m1 vBki, Ako, E3, 39 SEP restore(E4) + xar_m1 vBko, Amu, E4, 56 SEP restore(E1) + xar_m1 vBka, Abe, E1, 63 SEP restore(E2) + xar_m1 vBke, Agi, E2, 58 SEP restore(E0) + xar_m1 vBku, Asa, E0, 46 + + bcax_m1 vEka, vBka, vBki, vBke SEP save(vEka) + bcax_m1 vEke, vBke, vBko, vBki SEP save(vEke) + bcax_m1 vEki, vBki, vBku, vBko SEP save(vEki) + bcax_m1 vEko, vBko, vBka, vBku SEP save(vEko) + bcax_m1 vEku, vBku, vBke, vBka SEP save(vEku) + + restore(E3) + xar_m1 vBmu, Aso, E3, 8 SEP restore(E2) + xar_m1 vBmo, Ami, E2, 49 SEP restore(E1) + xar_m1 vBmi, Ake, E1, 54 SEP restore(E4) + xar_m1 vBma, Abu, E4, 37 SEP restore(E0) + xar_m1 vBme, Aga, E0, 28 + + bcax_m1 vEma, vBma, vBmi, vBme SEP save(vEma) + bcax_m1 vEme, vBme, 
vBmo, vBmi SEP save(vEme) + bcax_m1 vEmi, vBmi, vBmu, vBmo SEP save(vEmi) + bcax_m1 vEmo, vBmo, vBma, vBmu SEP save(vEmo) + bcax_m1 vEmu, vBmu, vBme, vBma SEP save(vEmu) + + restore(E0) + eor2 vBba, Aba, E0 SEP restore(E2) + xar_m1 vBbi, Aki, E2, 21 SEP restore(E3) + xar_m1 vBbo, Amo, E3, 43 SEP restore(E4) + xar_m1 vBbu, Asu, E4, 50 SEP restore(E1) + xar_m1 vBbe, Age, E1, 20 + + bcax_m1 vEba, vBba, vBbi, vBbe SEP save(vEba) + bcax_m1 vEbe, vBbe, vBbo, vBbi SEP save(vEbe) + bcax_m1 vEbi, vBbi, vBbu, vBbo SEP save(vEbi) + bcax_m1 vEbo, vBbo, vBba, vBbu SEP save(vEbo) + bcax_m1 vEbu, vBbu, vBbe, vBba SEP save(vEbu) + + restore(E2) + xar_m1 vBsa, Abi, E2, 2 SEP restore(E0) + xar_m1 vBso, Ama, E0, 23 SEP restore(E3) + xar_m1 vBse, Ago, E3, 9 SEP restore(E4) + xar_m1 vBsi, Aku, E4, 25 SEP restore(E1) + xar_m1 vBsu, Ase, E1, 62 + + bcax_m1 vEsa, vBsa, vBsi, vBse SEP save(vEsa) + bcax_m1 vEse, vBse, vBso, vBsi SEP save(vEse) + bcax_m1 vEsi, vBsi, vBsu, vBso SEP save(vEsi) + bcax_m1 vEso, vBso, vBsa, vBsu SEP save(vEso) + bcax_m1 vEsu, vBsu, vBse, vBsa SEP save(vEsu) + + restore(Aba) + restore(Abe) + restore(Abi) + restore(Abo) + restore(Abu) + restore(Aga) + restore(Age) + restore(Agi) + restore(Ago) + restore(Agu) + restore(Aka) + restore(Ake) + restore(Aki) + restore(Ako) + restore(Aku) + restore(Ama) + restore(Ame) + restore(Ami) + restore(Amo) + restore(Amu) + restore(Asa) + restore(Ase) + restore(Asi) + restore(Aso) + restore(Asu) + + ld1r {tmp.2d}, [const_addr], #8 + eor Aba.16b, Aba.16b, tmp.16b + + .unreq tmp + +.endm + +.macro keccak_f1600_round_core + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req v25 + + rax1_m1 E1, C0, C2 SEP save(E1) + rax1_m1 E0, C4, C1 SEP save(E0) + rax1_m1 E3, C2, C4 SEP save(E3) + rax1_m1 E4, C3, C0 SEP save(E4) + rax1_m1 E2, C1, C3 SEP save(E2) + + restore(E1) + xar_m1 vBgo, Ame, E1, 19 SEP restore(E0) + xar_m1 vBgi, Aka, E0, 61 SEP restore(E3) + xar_m1 vBga, Abo, E3, 36 SEP restore(E4) + xar_m1 vBge, Agu, E4, 44 SEP 
restore(E2) + xar_m1 vBgu, Asi, E2, 3 SEP + + bcax_m1 vEga, vBga, vBgi, vBge SEP save(vEga) + bcax_m1 vEge, vBge, vBgo, vBgi SEP save(vEge) + bcax_m1 vEgi, vBgi, vBgu, vBgo SEP save(vEgi) + bcax_m1 vEgo, vBgo, vBga, vBgu SEP save(vEgo) + bcax_m1 vEgu, vBgu, vBge, vBga SEP save(vEgu) + + restore(E3) + xar_m1 vBki, Ako, E3, 39 SEP restore(E4) + xar_m1 vBko, Amu, E4, 56 SEP restore(E1) + xar_m1 vBka, Abe, E1, 63 SEP restore(E2) + xar_m1 vBke, Agi, E2, 58 SEP restore(E0) + xar_m1 vBku, Asa, E0, 46 + + bcax_m1 vEka, vBka, vBki, vBke SEP save(vEka) + bcax_m1 vEke, vBke, vBko, vBki SEP save(vEke) + bcax_m1 vEki, vBki, vBku, vBko SEP save(vEki) + bcax_m1 vEko, vBko, vBka, vBku SEP save(vEko) + bcax_m1 vEku, vBku, vBke, vBka SEP save(vEku) + + eor2 C3 /* 30 */, vEko, vEgo /* 27 */ + eor2 C0 /* 27 */, vEka, vEga /* 28 */ + eor2 C1 /* 28 */, vEke, vEge /* 29 */ + eor2 C2 /* 29 */, vEki, vEgi + eor2 C4 /* 31 */, vEku, vEgu + restore(E3) + xar_m1 vBmu, Aso, E3, 8 SEP restore(E2) + xar_m1 vBmo, Ami, E2, 49 SEP restore(E1) + xar_m1 vBmi, Ake, E1, 54 SEP restore(E4) + xar_m1 vBma, Abu, E4, 37 SEP restore(E0) + xar_m1 vBme, Aga, E0, 28 + + bcax_m1 vEma, vBma, vBmi, vBme SEP save(vEma) + bcax_m1 vEme, vBme, vBmo, vBmi SEP save(vEme) + bcax_m1 vEmi, vBmi, vBmu, vBmo SEP save(vEmi) + bcax_m1 vEmo, vBmo, vBma, vBmu SEP save(vEmo) + bcax_m1 vEmu, vBmu, vBme, vBma SEP save(vEmu) + + eor2 C0, C0, vEma + eor2 C1, C1, vEme + eor2 C2, C2, vEmi + eor2 C3, C3, vEmo + eor2 C4, C4, vEmu + restore(E0) + eor2 vBba, Aba, E0 SEP restore(E2) + xar_m1 vBbi, Aki, E2, 21 SEP restore(E3) + xar_m1 vBbo, Amo, E3, 43 SEP restore(E4) + xar_m1 vBbu, Asu, E4, 50 SEP restore(E1) + xar_m1 vBbe, Age, E1, 20 + + bcax_m1 vEba, vBba, vBbi, vBbe + ld1r {tmp.2d}, [const_addr], #8 + eor2 vEba, vEba, tmp SEP save(vEba) + bcax_m1 vEbe, vBbe, vBbo, vBbi SEP save(vEbe) + bcax_m1 vEbi, vBbi, vBbu, vBbo SEP save(vEbi) + bcax_m1 vEbo, vBbo, vBba, vBbu SEP save(vEbo) + bcax_m1 vEbu, vBbu, vBbe, vBba SEP save(vEbu) + + eor2 C0, 
C0, vEba + eor2 C1, C1, vEbe + eor2 C2, C2, vEbi + eor2 C3, C3, vEbo + eor2 C4, C4, vEbu + restore(E2) + xar_m1 vBsa, Abi, E2, 2 SEP restore(E0) + xar_m1 vBso, Ama, E0, 23 SEP restore(E3) + xar_m1 vBse, Ago, E3, 9 SEP restore(E4) + xar_m1 vBsi, Aku, E4, 25 SEP restore(E1) + xar_m1 vBsu, Ase, E1, 62 + + bcax_m1 vEsa, vBsa, vBsi, vBse SEP save(vEsa) + bcax_m1 vEse, vBse, vBso, vBsi SEP save(vEse) + bcax_m1 vEsi, vBsi, vBsu, vBso SEP save(vEsi) + bcax_m1 vEso, vBso, vBsa, vBsu SEP save(vEso) + bcax_m1 vEsu, vBsu, vBse, vBsa SEP save(vEsu) + + eor2 C0, C0, vEsa + eor2 C1, C1, vEse + eor2 C2, C2, vEsi + eor2 C3, C3, vEso + eor2 C4, C4, vEsu + + restore(Aba) + restore(Abe) + restore(Abi) + restore(Abo) + restore(Abu) + restore(Aga) + restore(Age) + restore(Agi) + restore(Ago) + restore(Agu) + restore(Aka) + restore(Ake) + restore(Aki) + restore(Ako) + restore(Aku) + restore(Ama) + restore(Ame) + restore(Ami) + restore(Amo) + restore(Amu) + restore(Asa) + restore(Ase) + restore(Asi) + restore(Aso) + restore(Asu) + + .unreq tmp + +.endm + +.text +.align 4 +.global keccak_f1600_x2_v84a_asm_v2p1 +.global _keccak_f1600_x2_v84a_asm_v2p1 + +#define KECCAK_F1600_ROUNDS 24 + +keccak_f1600_x2_v84a_asm_v2p1: +_keccak_f1600_x2_v84a_asm_v2p1: + alloc_stack + save_vregs + load_constant_ptr + load_input + + // 11 loop iterations x 2 rounds each + 2 peeled rounds after the loop = KECCAK_F1600_ROUNDS + mov count, #11 + keccak_f1600_round_pre +loop: + keccak_f1600_round_core + keccak_f1600_round_core + sub count, count, #1 + cbnz count, loop + + keccak_f1600_round_core + keccak_f1600_round_post + store_input + restore_vregs + free_stack + ret diff --git a/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2p2.s b/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2p2.s new file mode 100644 index 0000000..c224667 --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2p2.s @@ -0,0 +1,802 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission 
is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. 
*/ + Aba .req v0 + Abe .req v1 + Abi .req v2 + Abo .req v3 + Abu .req v4 + Aga .req v5 + Age .req v6 + Agi .req v7 + Ago .req v8 + Agu .req v9 + Aka .req v10 + Ake .req v11 + Aki .req v12 + Ako .req v13 + Aku .req v14 + Ama .req v15 + Ame .req v16 + Ami .req v17 + Amo .req v18 + Amu .req v19 + Asa .req v20 + Ase .req v21 + Asi .req v22 + Aso .req v23 + Asu .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Abiq .req q2 + Aboq .req q3 + Abuq .req q4 + Agaq .req q5 + Ageq .req q6 + Agiq .req q7 + Agoq .req q8 + Aguq .req q9 + Akaq .req q10 + Akeq .req q11 + Akiq .req q12 + Akoq .req q13 + Akuq .req q14 + Amaq .req q15 + Ameq .req q16 + Amiq .req q17 + Amoq .req q18 + Amuq .req q19 + Asaq .req q20 + Aseq .req q21 + Asiq .req q22 + Asoq .req q23 + Asuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v27 + C1 .req v28 + C2 .req v29 + C3 .req v30 + C4 .req v31 + + C0q .req q27 + C1q .req q28 + C2q .req q29 + C3q .req q30 + C4q .req q31 + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req v26 + E1 .req v26 + E2 .req v26 + E3 .req v26 + E4 .req v26 + + E0q .req q26 + E1q .req q26 + E2q .req q26 + E3q .req q26 + E4q .req q26 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + // vBgi .req v27 + // vBgo .req v28 + // vBga .req v29 + // vBge .req v30 + // vBgu .req v31 + vBki .req v27 + vBko .req v28 + vBka .req v29 + vBke .req v30 + vBku .req v31 + vBmu .req v27 + vBmo .req v28 + vBmi .req v29 + vBma .req v30 + vBme .req v31 + vBba .req v27 + vBbi .req v28 + vBbo .req v29 + vBbu .req v30 + vBbe .req v31 + vBsa .req v27 + vBso .req v28 + vBse .req v29 + vBsi .req v30 + vBsu .req v31 + + // vBgiq .req q27 + // vBgoq .req q28 + // vBgaq .req q29 + // vBgeq .req q30 + // vBguq .req q31 + vBkiq .req q27 + vBkoq .req q28 + vBkaq .req q29 + vBkeq .req q30 + vBkuq .req q31 + vBmuq .req q27 + vBmoq .req q28 + vBmiq .req q29 + vBmaq .req q30 + vBmeq .req q31 + vBbaq .req q27 + vBbiq .req q28 + vBboq 
.req q29 + vBbuq .req q30 + vBbeq .req q31 + vBsaq .req q27 + vBsoq .req q28 + vBseq .req q29 + vBsiq .req q30 + vBsuq .req q31 + + vEgu .req Agu + vEga .req v26 + vEge .req v26 + vEgi .req v26 + vEgo .req v26 + vEka .req Aka + vEko .req Ako + vEke .req v26 + vEki .req v26 + vEku .req v26 + vEma .req v26 + vEme .req Ame + vEmi .req Ami + vEmo .req v26 + vEmu .req Amu + vEba .req Aba + vEbe .req Abe + vEbi .req v26 + vEbo .req Abo + vEbu .req Abu + vEsa .req Asa + vEse .req Ase + vEsi .req Asi + vEso .req Aso + vEsu .req Asu + + vEguq .req Aguq + vEgaq .req q26 + vEgeq .req q26 + vEgiq .req q26 + vEgoq .req q26 + vEkaq .req Akaq + vEkoq .req Akoq + vEkeq .req q26 + vEkiq .req q26 + vEkuq .req q26 + vEmaq .req q26 + vEmeq .req Ameq + vEmiq .req Amiq + vEmoq .req q26 + vEmuq .req Amuq + vEbaq .req Abaq + vEbeq .req Abeq + vEbiq .req q26 + vEboq .req Aboq + vEbuq .req Abuq + vEsaq .req Asaq + vEseq .req Aseq + vEsiq .req Asiq + vEsoq .req Asoq + vEsuq .req Asuq + +/************************ MACROS ****************************/ + +.macro load_input + ldp Abaq, Abeq, [input_addr, #(2*8*0)] + ldp Abiq, Aboq, [input_addr, #(2*8*2)] + ldp Abuq, Agaq, [input_addr, #(2*8*4)] + ldp Ageq, Agiq, [input_addr, #(2*8*6)] + ldp Agoq, Aguq, [input_addr, #(2*8*8)] + ldp Akaq, Akeq, [input_addr, #(2*8*10)] + ldp Akiq, Akoq, [input_addr, #(2*8*12)] + ldp Akuq, Amaq, [input_addr, #(2*8*14)] + ldp Ameq, Amiq, [input_addr, #(2*8*16)] + ldp Amoq, Amuq, [input_addr, #(2*8*18)] + ldp Asaq, Aseq, [input_addr, #(2*8*20)] + ldp Asiq, Asoq, [input_addr, #(2*8*22)] + ldr Asuq, [input_addr, #(2*8*24)] +.endm + +.macro store_input + str Abaq, [input_addr, #(2*8*0)] + str Abeq, [input_addr, #(2*8*1)] + str Abiq, [input_addr, #(2*8*2)] + str Aboq, [input_addr, #(2*8*3)] + str Abuq, [input_addr, #(2*8*4)] + str Agaq, [input_addr, #(2*8*5)] + str Ageq, [input_addr, #(2*8*6)] + str Agiq, [input_addr, #(2*8*7)] + str Agoq, [input_addr, #(2*8*8)] + str Aguq, [input_addr, #(2*8*9)] + str Akaq, [input_addr, 
#(2*8*10)] + str Akeq, [input_addr, #(2*8*11)] + str Akiq, [input_addr, #(2*8*12)] + str Akoq, [input_addr, #(2*8*13)] + str Akuq, [input_addr, #(2*8*14)] + str Amaq, [input_addr, #(2*8*15)] + str Ameq, [input_addr, #(2*8*16)] + str Amiq, [input_addr, #(2*8*17)] + str Amoq, [input_addr, #(2*8*18)] + str Amuq, [input_addr, #(2*8*19)] + str Asaq, [input_addr, #(2*8*20)] + str Aseq, [input_addr, #(2*8*21)] + str Asiq, [input_addr, #(2*8*22)] + str Asoq, [input_addr, #(2*8*23)] + str Asuq, [input_addr, #(2*8*24)] +.endm + +#define STACK_SIZE (16*4 + 16*30) +#define STACK_BASE_VREGS 0 +#define STACK_BASE_TMP 16*4 + +#define E0_offset 0 +#define E1_offset 1 +#define E2_offset 2 +#define E3_offset 3 +#define E4_offset 4 + +#define Aba_offset (5 + 0 ) +#define Abe_offset (5 + 1 ) +#define Abi_offset (5 + 2 ) +#define Abo_offset (5 + 3 ) +#define Abu_offset (5 + 4 ) +#define Aga_offset (5 + 5 ) +#define Age_offset (5 + 6 ) +#define Agi_offset (5 + 7 ) +#define Ago_offset (5 + 8 ) +#define Agu_offset (5 + 9 ) +#define Aka_offset (5 + 10 ) +#define Ake_offset (5 + 11 ) +#define Aki_offset (5 + 12 ) +#define Ako_offset (5 + 13 ) +#define Aku_offset (5 + 14 ) +#define Ama_offset (5 + 15 ) +#define Ame_offset (5 + 16 ) +#define Ami_offset (5 + 17 ) +#define Amo_offset (5 + 18 ) +#define Amu_offset (5 + 19 ) +#define Asa_offset (5 + 20 ) +#define Ase_offset (5 + 21 ) +#define Asi_offset (5 + 22 ) +#define Aso_offset (5 + 23 ) +#define Asu_offset (5 + 24 ) + +#define vEba_offset (5 + 0 ) +#define vEbe_offset (5 + 1 ) +#define vEbi_offset (5 + 2 ) +#define vEbo_offset (5 + 3 ) +#define vEbu_offset (5 + 4 ) +#define vEga_offset (5 + 5 ) +#define vEge_offset (5 + 6 ) +#define vEgi_offset (5 + 7 ) +#define vEgo_offset (5 + 8 ) +#define vEgu_offset (5 + 9 ) +#define vEka_offset (5 + 10 ) +#define vEke_offset (5 + 11 ) +#define vEki_offset (5 + 12 ) +#define vEko_offset (5 + 13 ) +#define vEku_offset (5 + 14 ) +#define vEma_offset (5 + 15 ) +#define vEme_offset (5 + 16 ) +#define 
vEmi_offset (5 + 17 ) +#define vEmo_offset (5 + 18 ) +#define vEmu_offset (5 + 19 ) +#define vEsa_offset (5 + 20 ) +#define vEse_offset (5 + 21 ) +#define vEsi_offset (5 + 22 ) +#define vEso_offset (5 + 23 ) +#define vEsu_offset (5 + 24 ) + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +/* Plain-Neon stand-ins for the v8.4-A SHA-3 instructions (EOR3, RAX1, XAR, BCAX); rax1_m1 and bcax_m1 clobber the register aliased as tmp, and xar_m1 overwrites its s0 operand */ + +.macro eor3_m1_0 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m1_1 d s0 s1 s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro eor3_m1 d s0 s1 s2 + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +.macro rax1_m1 d s0 s1 + add tmp.2d, \s1\().2d, \s1\().2d + sri tmp.2d, \s1\().2d, #63 + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +.macro xar_m1 d s0 s1 imm + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) +.endm + +.macro xar_m1_0 d s0 s1 imm tmp + eor \tmp\().16b, \s0\().16b, \s1\().16b +.endm + +.macro xar_m1_1 d s0 s1 imm tmp + shl \d\().2d, \tmp\().2d, #(64-\imm) +.endm + +.macro xar_m1_2 d s0 s1 imm tmp + sri \d\().2d, \tmp\().2d, #(\imm) +.endm + +.macro bcax_m1 d s0 s1 s2 + bic tmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +.macro 
refresh d + mov \d\().16b, \d\().16b +.endm +/* Keccak-f1600 round */ + +.macro keccak_f1600_round_pre + eor2 C0, Aka, Aga + eor2 C1, Ake, Age + eor2 C2, Aki, Agi + eor2 C3, Ako, Ago + eor2 C4, Aku, Agu + eor2 C0, C0, Ama + eor2 C1, C1, Ame + eor2 C2, C2, Ami + eor2 C3, C3, Amo + eor2 C4, C4, Amu + eor2 C0, C0, Asa + eor2 C1, C1, Ase + eor2 C2, C2, Asi + eor2 C3, C3, Aso + eor2 C4, C4, Asu + eor2 C0, C0, Aba + eor2 C1, C1, Abe + eor2 C2, C2, Abi + eor2 C3, C3, Abo + eor2 C4, C4, Abu +.endm + +.macro keccak_f1600_round_core + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req v25 + + .unreq E0 + .unreq E1 + .unreq E2 + .unreq E3 + .unreq E4 + .unreq E0q + .unreq E1q + .unreq E2q + .unreq E3q + .unreq E4q + + E1 .req v26 + E3 .req C2 + E0 .req C4 + E2 .req C1 + E4 .req C3 + + E1q .req q26 + E3q .req C2q + E0q .req C4q + E2q .req C1q + E4q .req C3q + + rax1_m1 E1, C0, C2 SEP save(E1) + rax1_m1 E3, C2, C4 SEP save(E3) + rax1_m1 E0, C4, C1 SEP save(E0) + rax1_m1 E2, C1, C3 SEP save(E2) + rax1_m1 E4, C3, C0 SEP save(E4) + + vBgi .req E0 + vBgo .req v27 + vBga .req E3 + vBge .req E4 + vBgu .req E2 + + xar_m1 vBgi, Aka, E0, 61 SEP + xar_m1 vBgo, Ame, E1, 19 SEP + xar_m1 vBga, Abo, E3, 36 SEP + xar_m1 vBge, Agu, E4, 44 SEP + xar_m1 vBgu, Asi, E2, 3 SEP + + bcax_m1 vEga, vBga, vBgi, vBge SEP save(vEga) + bcax_m1 vEge, vBge, vBgo, vBgi SEP save(vEge) + bcax_m1 vEgi, vBgi, vBgu, vBgo SEP save(vEgi) + bcax_m1 vEgo, vBgo, vBga, vBgu SEP save(vEgo) + bcax_m1 vEgu, vBgu, vBge, vBga + + .unreq E0 + .unreq E1 + .unreq E2 + .unreq E3 + .unreq E4 + .unreq E0q + .unreq E1q + .unreq E2q + .unreq E3q + .unreq E4q + + E0 .req v26 + E1 .req v26 + E2 .req v26 + E3 .req v26 + E4 .req v26 + E0q .req q26 + E1q .req q26 + E2q .req q26 + E3q .req q26 + E4q .req q26 + + restore(E3) + xar_m1 vBki, Ako, E3, 39 SEP restore(E4) + xar_m1 vBko, Amu, E4, 56 SEP restore(E1) + xar_m1 vBka, Abe, E1, 63 SEP restore(E2) + xar_m1 vBke, Agi, E2, 58 SEP restore(E0) + xar_m1 vBku, Asa, E0, 46 + + bcax_m1 
vEke, vBke, vBko, vBki SEP save(vEke) + bcax_m1 vEki, vBki, vBku, vBko SEP save(vEki) + bcax_m1 vEku, vBku, vBke, vBka SEP save(vEku) + bcax_m1 vEko, vBko, vBka, vBku + bcax_m1 vEka, vBka, vBki, vBke + + restore(E3) + xar_m1 vBmu, Aso, E3, 8 SEP restore(E2) + xar_m1 vBmo, Ami, E2, 49 SEP restore(E1) + xar_m1 vBmi, Ake, E1, 54 SEP restore(E4) + xar_m1 vBma, Abu, E4, 37 SEP restore(E0) + xar_m1 vBme, Aga, E0, 28 + + bcax_m1 vEma, vBma, vBmi, vBme SEP save(vEma) + bcax_m1 vEmo, vBmo, vBma, vBmu SEP save(vEmo) + bcax_m1 vEme, vBme, vBmo, vBmi + bcax_m1 vEmi, vBmi, vBmu, vBmo + bcax_m1 vEmu, vBmu, vBme, vBma + + restore(E0) + eor2 vBba, Aba, E0 SEP restore(E2) + xar_m1 vBbi, Aki, E2, 21 SEP restore(E3) + xar_m1 vBbo, Amo, E3, 43 SEP restore(E4) + xar_m1 vBbu, Asu, E4, 50 SEP restore(E1) + xar_m1 vBbe, Age, E1, 20 + + bcax_m1 vEbi, vBbi, vBbu, vBbo SEP save(vEbi) + bcax_m1 vEba, vBba, vBbi, vBbe + ld1r {tmp.2d}, [const_addr], #8 + eor2 vEba, vEba, tmp + bcax_m1 vEbe, vBbe, vBbo, vBbi + bcax_m1 vEbo, vBbo, vBba, vBbu + bcax_m1 vEbu, vBbu, vBbe, vBba + + restore(E2) + xar_m1 vBsa, Abi, E2, 2 SEP restore(E0) + xar_m1 vBso, Ama, E0, 23 SEP restore(E3) + xar_m1 vBse, Ago, E3, 9 SEP restore(E4) + xar_m1 vBsi, Aku, E4, 25 SEP restore(E1) + xar_m1 vBsu, Ase, E1, 62 + + bcax_m1 vEsa, vBsa, vBsi, vBse SEP restore(Amo) + bcax_m1 vEse, vBse, vBso, vBsi SEP restore(Agi) + bcax_m1 vEsi, vBsi, vBsu, vBso SEP restore(Abi) + bcax_m1 vEso, vBso, vBsa, vBsu SEP restore(Ake) + bcax_m1 vEsu, vBsu, vBse, vBsa SEP restore(Aki) + + restore(Age) + restore(Aku) + restore(Ama) + restore(Aga) + restore(Ago) + + eor2 C3, Ako, Ago + eor2 C0, Aka, Aga + eor2 C1, Ake, Age + eor2 C2, Aki, Agi + eor2 C4, Aku, Agu + + eor2 C0, C0, Ama + eor2 C1, C1, Ame + eor2 C2, C2, Ami + eor2 C3, C3, Amo + eor2 C4, C4, Amu + + eor2 C0, C0, Aba + eor2 C1, C1, Abe + eor2 C2, C2, Abi + eor2 C3, C3, Abo + eor2 C4, C4, Abu + + eor2 C0, C0, Asa + eor2 C1, C1, Ase + eor2 C2, C2, Asi + eor2 C3, C3, Aso + eor2 C4, C4, Asu + + 
.unreq tmp + +.endm + +.macro keccak_f1600_round_post + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req v25 + + .unreq E0 + .unreq E1 + .unreq E2 + .unreq E3 + .unreq E4 + .unreq E0q + .unreq E1q + .unreq E2q + .unreq E3q + .unreq E4q + + E1 .req v26 + E3 .req C2 + E0 .req C4 + E2 .req C1 + E4 .req C3 + + E1q .req q26 + E3q .req C2q + E0q .req C4q + E2q .req C1q + E4q .req C3q + + rax1_m1 E1, C0, C2 SEP save(E1) + rax1_m1 E3, C2, C4 SEP save(E3) + rax1_m1 E0, C4, C1 SEP save(E0) + rax1_m1 E2, C1, C3 SEP save(E2) + rax1_m1 E4, C3, C0 SEP save(E4) + + .unreq vBgi + .unreq vBgo + .unreq vBga + .unreq vBge + .unreq vBgu + vBgi .req E0 + vBgo .req v27 + vBga .req E3 + vBge .req E4 + vBgu .req E2 + + xar_m1 vBgi, Aka, E0, 61 SEP + xar_m1 vBgo, Ame, E1, 19 SEP + xar_m1 vBga, Abo, E3, 36 SEP + xar_m1 vBge, Agu, E4, 44 SEP + xar_m1 vBgu, Asi, E2, 3 SEP + + bcax_m1 vEga, vBga, vBgi, vBge SEP save(vEga) + bcax_m1 vEge, vBge, vBgo, vBgi SEP save(vEge) + bcax_m1 vEgi, vBgi, vBgu, vBgo SEP save(vEgi) + bcax_m1 vEgo, vBgo, vBga, vBgu SEP save(vEgo) + bcax_m1 vEgu, vBgu, vBge, vBga + + .unreq E0 + .unreq E1 + .unreq E2 + .unreq E3 + .unreq E4 + .unreq E0q + .unreq E1q + .unreq E2q + .unreq E3q + .unreq E4q + + E0 .req v26 + E1 .req v26 + E2 .req v26 + E3 .req v26 + E4 .req v26 + E0q .req q26 + E1q .req q26 + E2q .req q26 + E3q .req q26 + E4q .req q26 + + restore(E3) + xar_m1 vBki, Ako, E3, 39 SEP restore(E4) + xar_m1 vBko, Amu, E4, 56 SEP restore(E1) + xar_m1 vBka, Abe, E1, 63 SEP restore(E2) + xar_m1 vBke, Agi, E2, 58 SEP restore(E0) + xar_m1 vBku, Asa, E0, 46 + + bcax_m1 vEke, vBke, vBko, vBki SEP save(vEke) + bcax_m1 vEki, vBki, vBku, vBko SEP save(vEki) + bcax_m1 vEku, vBku, vBke, vBka SEP save(vEku) + bcax_m1 vEko, vBko, vBka, vBku + bcax_m1 vEka, vBka, vBki, vBke + + restore(E3) + xar_m1 vBmu, Aso, E3, 8 SEP restore(E2) + xar_m1 vBmo, Ami, E2, 49 SEP restore(E1) + xar_m1 vBmi, Ake, E1, 54 SEP restore(E4) + xar_m1 vBma, Abu, E4, 37 SEP restore(E0) + xar_m1 vBme, Aga, 
E0, 28 + + bcax_m1 vEma, vBma, vBmi, vBme SEP save(vEma) + bcax_m1 vEmo, vBmo, vBma, vBmu SEP save(vEmo) + bcax_m1 vEme, vBme, vBmo, vBmi + bcax_m1 vEmi, vBmi, vBmu, vBmo + bcax_m1 vEmu, vBmu, vBme, vBma + + restore(E0) + eor2 vBba, Aba, E0 SEP restore(E2) + xar_m1 vBbi, Aki, E2, 21 SEP restore(E3) + xar_m1 vBbo, Amo, E3, 43 SEP restore(E4) + xar_m1 vBbu, Asu, E4, 50 SEP restore(E1) + xar_m1 vBbe, Age, E1, 20 + + bcax_m1 vEbi, vBbi, vBbu, vBbo SEP save(vEbi) + bcax_m1 vEba, vBba, vBbi, vBbe + ld1r {tmp.2d}, [const_addr], #8 + eor2 vEba, vEba, tmp + bcax_m1 vEbe, vBbe, vBbo, vBbi + bcax_m1 vEbo, vBbo, vBba, vBbu + bcax_m1 vEbu, vBbu, vBbe, vBba + + restore(E2) + xar_m1 vBsa, Abi, E2, 2 SEP restore(E0) + xar_m1 vBso, Ama, E0, 23 SEP restore(E3) + xar_m1 vBse, Ago, E3, 9 SEP restore(E4) + xar_m1 vBsi, Aku, E4, 25 SEP restore(E1) + xar_m1 vBsu, Ase, E1, 62 + + bcax_m1 vEsa, vBsa, vBsi, vBse SEP restore(Amo) + bcax_m1 vEse, vBse, vBso, vBsi SEP restore(Agi) + bcax_m1 vEsi, vBsi, vBsu, vBso SEP restore(Abi) + bcax_m1 vEso, vBso, vBsa, vBsu SEP restore(Ake) + bcax_m1 vEsu, vBsu, vBse, vBsa SEP restore(Aki) + + restore(Age) + restore(Aku) + restore(Ama) + restore(Aga) + restore(Ago) + + .unreq tmp + +.endm + + +.text +.align 4 +.global keccak_f1600_x2_v84a_asm_v2p2 +.global _keccak_f1600_x2_v84a_asm_v2p2 + +#define KECCAK_F1600_ROUNDS 24 + +keccak_f1600_x2_v84a_asm_v2p2: +_keccak_f1600_x2_v84a_asm_v2p2: + alloc_stack + save_vregs + load_constant_ptr + load_input + + // count = 11 loop passes of two rounds each; together with the two rounds after the loop that is KECCAK_F1600_ROUNDS (24) + mov count, #11 + keccak_f1600_round_pre +loop: + keccak_f1600_round_core + keccak_f1600_round_core + sub count, count, #1 + cbnz count, loop + + keccak_f1600_round_core + keccak_f1600_round_post + store_input + restore_vregs + free_stack + ret diff --git a/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2p3.s b/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2p3.s new file mode 100644 index 0000000..e83d5ce --- /dev/null +++ 
b/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2p3.s @@ -0,0 +1,773 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: /* the 24 Keccak-f[1600] round constants in round order, one .quad per round (presumably the table const_addr is pointed at; each round reads 8 bytes through it) */ + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. 
*/ + Aba .req v0 + Abe .req v1 + Abo .req v2 + Abu .req v3 + Agu .req v4 + Aka .req v5 + Ako .req v6 + Ame .req v7 + Ami .req v8 + Amu .req v9 + Asa .req v10 + Ase .req v11 + Asi .req v12 + Aso .req v13 + Asu .req v14 + + Agi .req v15 + Ake .req v16 + Aga .req v17 + Aki .req v18 + + Abi .req v19 + Ama .req v20 + Ago .req v21 + Aku .req v22 + Age .req v23 + Amo .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Aboq .req q2 + Abuq .req q3 + Aguq .req q4 + Akaq .req q5 + Akoq .req q6 + Ameq .req q7 + Amiq .req q8 + Amuq .req q9 + Asaq .req q10 + Aseq .req q11 + Asiq .req q12 + Asoq .req q13 + Asuq .req q14 + + Agiq .req q15 + Akeq .req q16 + Agaq .req q17 + Akiq .req q18 + + Abiq .req q19 + Amaq .req q20 + Agoq .req q21 + Akuq .req q22 + Ageq .req q23 + Amoq .req q24 + + spare0 .req v25 + spare1 .req v26 + spare2 .req v27 + spare3 .req v28 + spare4 .req v29 + spare5 .req v30 + spare0q .req q25 + spare1q .req q26 + spare2q .req q27 + spare3q .req q28 + spare4q .req q29 + spare5q .req q30 + + vEgu .req Agu /* keep */ + vEga .req spare0 /* out */ + vEge .req spare1 /* out */ + vEgi .req spare2 /* out */ + vEgo .req spare3 /* out */ + + vEka .req Aka /* keep */ + vEko .req Ako /* keep */ + vEke .req spare4 /* out */ + vEki .req spare5 /* out */ + vEku .req Agi /* in */ + + vEma .req Ake /* in */ + vEme .req Ame /* keep */ + vEmi .req Ami /* keep */ + vEmo .req Aga /* in */ + vEmu .req Amu /* keep */ + + vEba .req Aba /* keep */ + vEbe .req Abe /* keep */ + vEbi .req Aki /* in */ + vEbo .req Abo /* keep */ + vEbu .req Abu /* keep */ + + vEsa .req Asa /* keep */ + vEse .req Ase /* keep */ + vEsi .req Asi /* keep */ + vEso .req Aso /* keep */ + vEsu .req Asu /* keep */ + + vEguq .req Aguq + vEgaq .req spare0q + vEgeq .req spare1q + vEgiq .req spare2q + vEgoq .req spare3q + + vEkaq .req Akaq + vEkoq .req Akoq + vEkeq .req spare4q + vEkiq .req spare5q + vEkuq .req Agiq + + vEmaq .req Akeq + vEmeq .req Ameq + vEmiq .req Amiq + vEmoq .req Agaq + 
vEmuq .req Amuq + + vEbaq .req Abaq + vEbeq .req Abeq + vEbiq .req Akiq + vEboq .req Aboq + vEbuq .req Abuq + + vEsaq .req Asaq + vEseq .req Aseq + vEsiq .req Asiq + vEsoq .req Asoq + vEsuq .req Asuq + + tmp .req v31 + tmpq .req q31 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req spare0 + C1 .req spare1 + C2 .req spare2 + C3 .req spare3 + C4 .req spare4 + C0q .req spare0q + C1q .req spare1q + C2q .req spare2q + C3q .req spare3q + C4q .req spare4q + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + + // Registers used during computation time + E1c .req spare5 + E3c .req C2 + E0c .req C4 + E2c .req C1 + E4c .req C3 + + E1cq .req spare5q + E3cq .req C2q + E0cq .req C4q + E2cq .req C1q + E4cq .req C3q + + // Registers during use time + E0u .req tmp + E1u .req tmp + E2u .req tmp + E3u .req tmp + E4u .req tmp + + E0uq .req tmpq + E1uq .req tmpq + E2uq .req tmpq + E3uq .req tmpq + E4uq .req tmpq + + vBgo .req E1c + vBgi .req Ame + vBga .req Aka + vBge .req Abo + vBgu .req Agu + + vBko .req Ame + vBka .req Amu + vBke .req Abe + vBku .req Agi + vBki .req Asa + + vBmu .req Abo + vBmo .req Aso + vBmi .req Abu + vBma .req Asi + vBme .req Abe + + vBba .req Asi + vBbi .req Asa + vBbo .req Aso + vBbu .req Amo + vBbe .req Asu + + vBsa .req Amo + vBso .req Abi + vBse .req Ama + vBsi .req Ago + vBsu .req Aku + +/************************ MACROS ****************************/ + +.macro load_input + ldp Abaq, Abeq, [input_addr, #(2*8*0)] + ldp Abiq, Aboq, [input_addr, #(2*8*2)] + ldp Abuq, Agaq, [input_addr, #(2*8*4)] + ldp Ageq, Agiq, [input_addr, #(2*8*6)] + ldp Agoq, Aguq, [input_addr, #(2*8*8)] + ldp Akaq, Akeq, [input_addr, #(2*8*10)] + ldp Akiq, Akoq, [input_addr, #(2*8*12)] + ldp Akuq, Amaq, [input_addr, #(2*8*14)] + ldp Ameq, Amiq, [input_addr, #(2*8*16)] + ldp Amoq, Amuq, [input_addr, #(2*8*18)] + ldp Asaq, Aseq, [input_addr, #(2*8*20)] + ldp Asiq, Asoq, [input_addr, #(2*8*22)] + ldr Asuq, [input_addr, #(2*8*24)] +.endm + 
+.macro store_input + str Abaq, [input_addr, #(2*8*0)] /* writes the 25 state registers back as consecutive 16-byte slots, Aba first and Asu last */ + str Abeq, [input_addr, #(2*8*1)] + str Abiq, [input_addr, #(2*8*2)] + str Aboq, [input_addr, #(2*8*3)] + str Abuq, [input_addr, #(2*8*4)] + str Agaq, [input_addr, #(2*8*5)] + str Ageq, [input_addr, #(2*8*6)] + str Agiq, [input_addr, #(2*8*7)] + str Agoq, [input_addr, #(2*8*8)] + str Aguq, [input_addr, #(2*8*9)] + str Akaq, [input_addr, #(2*8*10)] + str Akeq, [input_addr, #(2*8*11)] + str Akiq, [input_addr, #(2*8*12)] + str Akoq, [input_addr, #(2*8*13)] + str Akuq, [input_addr, #(2*8*14)] + str Amaq, [input_addr, #(2*8*15)] + str Ameq, [input_addr, #(2*8*16)] + str Amiq, [input_addr, #(2*8*17)] + str Amoq, [input_addr, #(2*8*18)] + str Amuq, [input_addr, #(2*8*19)] + str Asaq, [input_addr, #(2*8*20)] + str Aseq, [input_addr, #(2*8*21)] + str Asiq, [input_addr, #(2*8*22)] + str Asoq, [input_addr, #(2*8*23)] + str Asuq, [input_addr, #(2*8*24)] +.endm + +#define STACK_SIZE (16*4 + 16*30) /* 4 slots for the d8-d15 pairs, then 30 16-byte spill slots */ +#define STACK_BASE_VREGS 0 +#define STACK_BASE_TMP (16*4) /* spill area starts right after the vreg slots; parenthesised so the value survives any surrounding operator */ + +#define E0c_offset 0 +#define E1c_offset 1 +#define E2c_offset 2 +#define E3c_offset 3 +#define E4c_offset 4 +#define E0u_offset 0 /* the Exu names reuse the same slots 0..4 as the Exc names */ +#define E1u_offset 1 +#define E2u_offset 2 +#define E3u_offset 3 +#define E4u_offset 4 + +#define Aba_offset (5 + 0 ) +#define Abe_offset (5 + 1 ) +#define Abi_offset (5 + 2 ) +#define Abo_offset (5 + 3 ) +#define Abu_offset (5 + 4 ) +#define Aga_offset (5 + 5 ) +#define Age_offset (5 + 6 ) +#define Agi_offset (5 + 7 ) +#define Ago_offset (5 + 8 ) +#define Agu_offset (5 + 9 ) +#define Aka_offset (5 + 10 ) +#define Ake_offset (5 + 11 ) +#define Aki_offset (5 + 12 ) +#define Ako_offset (5 + 13 ) +#define Aku_offset (5 + 14 ) +#define Ama_offset (5 + 15 ) +#define Ame_offset (5 + 16 ) +#define Ami_offset (5 + 17 ) +#define Amo_offset (5 + 18 ) +#define Amu_offset (5 + 19 ) +#define Asa_offset (5 + 20 ) +#define Ase_offset (5 + 21 ) +#define Asi_offset (5 + 22 ) +#define Aso_offset (5 + 23 ) +#define Asu_offset (5 + 24 ) + +#define vEba_offset 
(5 + 0 ) +#define vEbe_offset (5 + 1 ) +#define vEbi_offset (5 + 2 ) +#define vEbo_offset (5 + 3 ) +#define vEbu_offset (5 + 4 ) +#define vEga_offset (5 + 5 ) +#define vEge_offset (5 + 6 ) +#define vEgi_offset (5 + 7 ) +#define vEgo_offset (5 + 8 ) +#define vEgu_offset (5 + 9 ) +#define vEka_offset (5 + 10 ) +#define vEke_offset (5 + 11 ) +#define vEki_offset (5 + 12 ) +#define vEko_offset (5 + 13 ) +#define vEku_offset (5 + 14 ) +#define vEma_offset (5 + 15 ) +#define vEme_offset (5 + 16 ) +#define vEmi_offset (5 + 17 ) +#define vEmo_offset (5 + 18 ) +#define vEmu_offset (5 + 19 ) +#define vEsa_offset (5 + 20 ) +#define vEse_offset (5 + 21 ) +#define vEsi_offset (5 + 22 ) +#define vEso_offset (5 + 23 ) +#define vEsu_offset (5 + 24 ) + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] /* d8-d15 are parked in four 16-byte slots starting at STACK_BASE_VREGS */ + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] /* reloads exactly what save_vregs stored */ + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +/* Plain-Neon stand-ins named after the v8.4-A SHA-3 instructions (EOR3, RAX1, XAR, BCAX); only eor/add/shl/sri/bic/mov are emitted here */ + +.macro eor3_m1_0 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor5 out i0 i1 i2 i3 i4 tmp + eor2 \out, \i0, \i1 + eor2 \tmp, \i3, \i4 + eor2 \out, \out, \i2 + eor2 \out, \out, \tmp /* out = i0 ^ i1 ^ i2 ^ i3 ^ i4, with the tmp argument used as scratch */ +.endm + +.macro move d s + mov \d\().16b, \s\().16b /* full 128-bit register copy, d = s */ +.endm + + +.macro eor3_m1_1 d s0 s1 s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro eor3_m1 d s0 s1 s2 + eor3_m1_0 \d, \s0, \s1, 
\s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +.macro rax1_m1 d s0 s1 + add tmp.2d, \s1\().2d, \s1\().2d + sri tmp.2d, \s1\().2d, #63 + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +.macro xar_m1 d s0 s1 imm + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) +.endm + +.macro xar_m1_0 d s0 s1 imm tmp + eor \tmp\().16b, \s0\().16b, \s1\().16b +.endm + +.macro xar_m1_1 d s0 s1 imm tmp + shl \d\().2d, \tmp\().2d, #(64-\imm) +.endm + +.macro xar_m1_2 d s0 s1 imm tmp + sri \d\().2d, \tmp\().2d, #(\imm) +.endm + +.macro bcax_m1 d s0 s1 s2 + bic tmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +.macro refresh d + mov \d\().16b, \d\().16b +.endm +/* Keccak-f1600 round */ + +.macro keccak_f1600_round_pre + eor2 C0, Aka, Aga + eor2 C1, Ake, Age + eor2 C2, Aki, Agi + eor2 C3, Ako, Ago + eor2 C4, Aku, Agu + eor2 C0, C0, Ama + eor2 C1, C1, Ame + eor2 C2, C2, Ami + eor2 C3, C3, Amo + eor2 C4, C4, Amu + eor2 C0, C0, Asa + eor2 C1, C1, Ase + eor2 C2, C2, Asi + eor2 C3, C3, Aso + eor2 C4, C4, Asu + eor2 C0, C0, Aba + eor2 C1, C1, Abe + eor2 C2, C2, Abi + eor2 C3, C3, Abo + eor2 C4, C4, Abu +.endm + +.macro keccak_f1600_round_core + + /* 5x RAX1, 15 Neon Instructions total */ + + rax1_m1 E1c, C0, C2 SEP save(E1c) + rax1_m1 E3c, C2, C4 SEP save(E3c) + rax1_m1 E0c, C4, C1 SEP save(E0c) + rax1_m1 E2c, C1, C3 SEP save(E2c) + rax1_m1 E4c, C3, C0 SEP save(E4c) + + xar_m1 vBgo, Ame /* used at block 3 */, E1c, 19 + xar_m1 vBgi, Aka /* used at block 2 */, E0c, 61 + xar_m1 vBga, Abo /* used at block 4 */, E3c, 36 + xar_m1 vBge, Agu /* used at block 1 */, E4c, 44 + xar_m1 vBgu, Asi /* used at block 5 */, E2c, 3 + + bcax_m1 vEga, vBga, vBgi, vBge SEP save(vEga) /* TEMP */ + bcax_m1 vEge, vBge, vBgo, vBgi + bcax_m1 vEgi, vBgi, vBgu, vBgo SEP save(vEgi) /* TEMP */ + bcax_m1 vEgo, vBgo, vBga, vBgu + bcax_m1 vEgu, vBgu, vBge, vBga + + restore(E4u) + xar_m1 vBko, Amu /* used at block 3 */, E4u, 56 SEP restore(E1u) 
+ xar_m1 vBka, Abe /* used at block 4 */, E1u, 63 SEP restore(E2u) + xar_m1 vBke, Agi /* not used */, E2u, 58 SEP restore(E0u) + xar_m1 vBku, Asa /* used at block 5 */, E0u, 46 SEP restore(E3u) + xar_m1 vBki, Ako /* used at block 2 */, E3u, 39 + + bcax_m1 vEke, vBke, vBko, vBki SEP save(vEke) /* TEMP */ + bcax_m1 vEki, vBki, vBku, vBko SEP save(vEki) /* TEMP */ + bcax_m1 vEku, vBku, vBke, vBka + bcax_m1 vEko, vBko, vBka, vBku + bcax_m1 vEka, vBka, vBki, vBke + + // Can use: Abo, Asi, Abe, Asa; Abu, Aso + SEP restore(E3u) + xar_m1 vBmu, Aso /* used at block 5 */, E3u, 8 SEP restore(E4u) + xar_m1 vBma, Abu /* used at block 4 */, E4u, 37 SEP restore(E2u) + xar_m1 vBmo, Ami /* used at block 3 */, E2u, 49 SEP restore(E1u) + xar_m1 vBmi, Ake /* not used */, E1u, 54 SEP restore(E0u) + xar_m1 vBme, Aga /* not used */, E0u, 28 + + bcax_m1 vEma, vBma, vBmi, vBme + bcax_m1 vEmo, vBmo, vBma, vBmu + bcax_m1 vEme, vBme, vBmo, vBmi + bcax_m1 vEmi, vBmi, vBmu, vBmo + bcax_m1 vEmu, vBmu, vBme, vBma + + // Can use: Asi, Asa, Aso, Asu, Amo + restore(E0u) + eor2 vBba, Aba /* used at block 4 */, E0u SEP restore(E2u) + xar_m1 vBbi, Aki /* not used */, E2u, 21 SEP restore(E3u) + xar_m1 vBbo, Amo /* not used */, E3u, 43 SEP restore(E4u) + xar_m1 vBbu, Asu /* used at block 5 */, E4u, 50 SEP restore(E1u) + xar_m1 vBbe, Age /* not used */, E1u, 20 + + bcax_m1 vEba, vBba, vBbi, vBbe + ld1r {tmp.2d}, [const_addr], #8 + eor2 vEba, vEba, tmp + bcax_m1 vEbe, vBbe, vBbo, vBbi + bcax_m1 vEbo, vBbo, vBba, vBbu + bcax_m1 vEbu, vBbu, vBbe, vBba + bcax_m1 vEbi, vBbi, vBbu, vBbo + + // Can use: Amo, Age, Abi, Ama, Ago, Aku + restore(E2u) + xar_m1 vBsa, Abi /* not used */, E2u, 2 SEP restore(E0u) + xar_m1 vBso, Ama /* not used */, E0u, 23 SEP restore(E3u) + xar_m1 vBse, Ago /* not used */, E3u, 9 SEP restore(E4u) + xar_m1 vBsi, Aku /* not used */, E4u, 25 SEP restore(E1u) + xar_m1 vBsu, Ase /* used at block 5 */, E1u, 62 + + bcax_m1 vEsa, vBsa, vBsi, vBse + bcax_m1 vEse, vBse, vBso, vBsi + bcax_m1 vEsi, 
vBsi, vBsu, vBso + bcax_m1 vEso, vBso, vBsa, vBsu + bcax_m1 vEsu, vBsu, vBse, vBsa + + /* TODO: Unroll twice and arrange things so that after two iterations + we end up at the same allocation of state registers? */ + + /* New spare registers: + * - Abi, Ama, Ago, Aku, Age, Amo */ + + move Abi, vEbi + move Ama, vEma + move Ago, vEgo + move Aku, vEku + move Age, vEge + move Amo, vEmo + + /* Overlapping registers + * - Agi, Ake, Aga, Aki */ +// save(vEgi) +// save(vEke) +// save(vEga) +// save(vEki) + + restore(Agi) + restore(Ake) + restore(Aga) + restore(Aki) + + eor5 C0, Aka, Aga, Ama, Aba, Asa, tmp + eor5 C2, Aki, Agi, Ami, Abi, Asi, tmp + eor5 C4, Aku, Agu, Amu, Abu, Asu, tmp + eor5 C1, Ake, Age, Ame, Abe, Ase, tmp + eor5 C3, Ako, Ago, Amo, Abo, Aso, tmp + + // eor2 C3, Ako, Ago + // eor2 C0, Aka, Aga + // eor2 C1, Ake, Age + // eor2 C2, Aki, Agi + // eor2 C4, Aku, Agu + + // eor2 C0, C0, Ama + // eor2 C1, C1, Ame + // eor2 C2, C2, Ami + // eor2 C3, C3, Amo + // eor2 C4, C4, Amu + + // eor2 C0, C0, Aba + // eor2 C1, C1, Abe + // eor2 C2, C2, Abi + // eor2 C3, C3, Abo + // eor2 C4, C4, Abu + + // eor2 C0, C0, Asa + // eor2 C1, C1, Ase + // eor2 C2, C2, Asi + // eor2 C3, C3, Aso + // eor2 C4, C4, Asu + +.endm + +.macro keccak_f1600_round_post + + /* 5x RAX1, 15 Neon Instructions total */ + + rax1_m1 E1c, C0, C2 SEP save(E1c) + rax1_m1 E3c, C2, C4 SEP save(E3c) + rax1_m1 E0c, C4, C1 SEP save(E0c) + rax1_m1 E2c, C1, C3 SEP save(E2c) + rax1_m1 E4c, C3, C0 SEP save(E4c) + + xar_m1 vBgo, Ame /* used at block 3 */, E1c, 19 + xar_m1 vBgi, Aka /* used at block 2 */, E0c, 61 + xar_m1 vBga, Abo /* used at block 4 */, E3c, 36 + xar_m1 vBge, Agu /* used at block 1 */, E4c, 44 + xar_m1 vBgu, Asi /* used at block 5 */, E2c, 3 + + bcax_m1 vEga, vBga, vBgi, vBge SEP save(vEga) /* TEMP */ + bcax_m1 vEge, vBge, vBgo, vBgi + bcax_m1 vEgi, vBgi, vBgu, vBgo SEP save(vEgi) /* TEMP */ + bcax_m1 vEgo, vBgo, vBga, vBgu + bcax_m1 vEgu, vBgu, vBge, vBga + + restore(E4u) + xar_m1 vBko, Amu /* 
used at block 3 */, E4u, 56 SEP restore(E1u) + xar_m1 vBka, Abe /* used at block 4 */, E1u, 63 SEP restore(E2u) + xar_m1 vBke, Agi /* not used */, E2u, 58 SEP restore(E0u) + xar_m1 vBku, Asa /* used at block 5 */, E0u, 46 SEP restore(E3u) + xar_m1 vBki, Ako /* used at block 2 */, E3u, 39 + + bcax_m1 vEke, vBke, vBko, vBki SEP save(vEke) /* TEMP */ + bcax_m1 vEki, vBki, vBku, vBko SEP save(vEki) /* TEMP */ + bcax_m1 vEku, vBku, vBke, vBka + bcax_m1 vEko, vBko, vBka, vBku + bcax_m1 vEka, vBka, vBki, vBke + + // Can use: Abo, Asi, Abe, Asa; Abu, Aso + SEP restore(E3u) + xar_m1 vBmu, Aso /* used at block 5 */, E3u, 8 SEP restore(E4u) + xar_m1 vBma, Abu /* used at block 4 */, E4u, 37 SEP restore(E2u) + xar_m1 vBmo, Ami /* used at block 3 */, E2u, 49 SEP restore(E1u) + xar_m1 vBmi, Ake /* not used */, E1u, 54 SEP restore(E0u) + xar_m1 vBme, Aga /* not used */, E0u, 28 + + bcax_m1 vEma, vBma, vBmi, vBme + bcax_m1 vEmo, vBmo, vBma, vBmu + bcax_m1 vEme, vBme, vBmo, vBmi + bcax_m1 vEmi, vBmi, vBmu, vBmo + bcax_m1 vEmu, vBmu, vBme, vBma + + // Can use: Asi, Asa, Aso, Asu, Amo + restore(E0u) + eor2 vBba, Aba /* used at block 4 */, E0u SEP restore(E2u) + xar_m1 vBbi, Aki /* not used */, E2u, 21 SEP restore(E3u) + xar_m1 vBbo, Amo /* not used */, E3u, 43 SEP restore(E4u) + xar_m1 vBbu, Asu /* used at block 5 */, E4u, 50 SEP restore(E1u) + xar_m1 vBbe, Age /* not used */, E1u, 20 + + bcax_m1 vEba, vBba, vBbi, vBbe + ld1r {tmp.2d}, [const_addr], #8 + eor2 vEba, vEba, tmp + bcax_m1 vEbe, vBbe, vBbo, vBbi + bcax_m1 vEbo, vBbo, vBba, vBbu + bcax_m1 vEbu, vBbu, vBbe, vBba + bcax_m1 vEbi, vBbi, vBbu, vBbo + + // Can use: Amo, Age, Abi, Ama, Ago, Aku + restore(E2u) + xar_m1 vBsa, Abi /* not used */, E2u, 2 SEP restore(E0u) + xar_m1 vBso, Ama /* not used */, E0u, 23 SEP restore(E3u) + xar_m1 vBse, Ago /* not used */, E3u, 9 SEP restore(E4u) + xar_m1 vBsi, Aku /* not used */, E4u, 25 SEP restore(E1u) + xar_m1 vBsu, Ase /* used at block 5 */, E1u, 62 + + bcax_m1 vEsa, vBsa, vBsi, vBse + 
bcax_m1 vEse, vBse, vBso, vBsi + bcax_m1 vEsi, vBsi, vBsu, vBso + bcax_m1 vEso, vBso, vBsa, vBsu + bcax_m1 vEsu, vBsu, vBse, vBsa + + /* TODO: Unroll twice and arrange things so that after two iterations + we end up at the same allocation of state registers? */ + + /* New spare registers: + * - Abi, Ama, Ago, Aku, Age, Amo */ + + move Abi, vEbi + move Ama, vEma + move Ago, vEgo + move Aku, vEku + move Age, vEge + move Amo, vEmo + + /* Overlapping registers + * - Agi, Ake, Aga, Aki */ +// save(vEgi) +// save(vEke) +// save(vEga) +// save(vEki) + + restore(Agi) + restore(Ake) + restore(Aga) + restore(Aki) + +.endm + + +.text +.align 4 +.global keccak_f1600_x2_v84a_asm_v2p3 +.global _keccak_f1600_x2_v84a_asm_v2p3 + +#define KECCAK_F1600_ROUNDS 24 + +keccak_f1600_x2_v84a_asm_v2p3: +_keccak_f1600_x2_v84a_asm_v2p3: + alloc_stack + save_vregs + load_constant_ptr + load_input + + // count = 11 loop passes of two rounds each; together with the two rounds after the loop that is KECCAK_F1600_ROUNDS (24) + mov count, #11 + keccak_f1600_round_pre +loop: + keccak_f1600_round_core + keccak_f1600_round_core + sub count, count, #1 + cbnz count, loop + + keccak_f1600_round_core + keccak_f1600_round_post + store_input + restore_vregs + free_stack + ret diff --git a/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2p4.s b/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2p4.s new file mode 100644 index 0000000..75fe603 --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2p4.s @@ -0,0 +1,689 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the 
following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +#define STACK_SIZE (16*4 + 16*30) /* 4 slots for the d8-d15 pairs, then 30 16-byte spill slots */ +#define STACK_BASE_VREGS 0 +#define STACK_BASE_TMP (16*4) /* spill area starts right after the vreg slots; parenthesised so the value survives any surrounding operator */ + +#define E0c_offset 0 +#define E1c_offset 1 +#define E2c_offset 2 +#define E3c_offset 3 +#define E4c_offset 4 +#define E0u_offset 0 /* the Exu names reuse the same slots 0..4 as the Exc names */ +#define E1u_offset 1 +#define E2u_offset 2 +#define E3u_offset 3 +#define E4u_offset 4 + +#define ba_offset (5 + 0 ) +#define be_offset (5 + 1 ) +#define bi_offset (5 + 2 ) +#define bo_offset (5 + 3 ) +#define bu_offset (5 + 4 ) +#define ga_offset (5 + 5 ) +#define ge_offset (5 + 6 ) +#define gi_offset (5 + 7 ) +#define go_offset (5 + 8 ) +#define gu_offset (5 + 9 ) +#define ka_offset (5 + 10 ) +#define ke_offset (5 + 11 ) +#define ki_offset (5 + 12 ) +#define ko_offset (5 + 13 ) +#define ku_offset (5 + 14 ) +#define ma_offset (5 + 15 ) +#define me_offset (5 + 16 ) +#define mi_offset (5 + 17 ) +#define mo_offset (5 + 18 ) +#define mu_offset (5 + 19 ) +#define sa_offset (5 + 20 ) +#define se_offset (5 + 21 ) +#define si_offset (5 + 22 ) +#define so_offset (5 + 23 ) +#define su_offset (5 + 24 ) + +#define savep(reg, offset_prefix) \ + str reg ## q, [sp, #(STACK_BASE_TMP + 16 * offset_prefix ## _offset)] +#define restorep(reg, offset_prefix) \ + ldr reg ## q, [sp, #(STACK_BASE_TMP + 16 * offset_prefix ## 
_offset)] +#define save(name) savep(name,name) +#define restore(name) restorep(name,name) + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: /* the 24 Keccak-f[1600] round constants in round order, one .quad per round (presumably the table const_addr is pointed at; each round reads 8 bytes through it) */ + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. 
*/ + Aba .req v0 + Abe .req v1 + Abo .req v2 + Abu .req v3 + Agu .req v4 + Aka .req v5 + Ako .req v6 + Ame .req v7 + Ami .req v8 + Amu .req v9 + Asa .req v10 + Ase .req v11 + Asi .req v12 + Aso .req v13 + Asu .req v14 + + Agi .req v15 + Ake .req v16 + Aga .req v17 + Aki .req v18 + + Abi .req v19 + Ama .req v20 + Ago .req v21 + Aku .req v22 + Age .req v23 + Amo .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Aboq .req q2 + Abuq .req q3 + Aguq .req q4 + Akaq .req q5 + Akoq .req q6 + Ameq .req q7 + Amiq .req q8 + Amuq .req q9 + Asaq .req q10 + Aseq .req q11 + Asiq .req q12 + Asoq .req q13 + Asuq .req q14 + + Agiq .req q15 + Akeq .req q16 + Agaq .req q17 + Akiq .req q18 + + Abiq .req q19 + Amaq .req q20 + Agoq .req q21 + Akuq .req q22 + Ageq .req q23 + Amoq .req q24 + + Aspare0 .req v25 + Aspare1 .req v26 + Aspare2 .req v27 + Aspare3 .req v28 + Aspare4 .req v29 + Aspare5 .req v30 + Aspare6 .req v31 + Aspare0q .req q25 + Aspare1q .req q26 + Aspare2q .req q27 + Aspare3q .req q28 + Aspare4q .req q29 + Aspare5q .req q30 + Aspare6q .req q31 + +.macro declare_remappings out,in + tmp .req \in\()spare6 + tmpq .req \in\()spare6q + + \out\()gu .req \in\()gu /* keep */ + \out\()ga .req \in\()spare0 /* out */ + \out\()ge .req \in\()spare1 /* out */ + \out\()gi .req \in\()spare2 /* out */ + \out\()go .req \in\()spare3 /* out */ + + \out\()ka .req \in\()ka /* keep */ + \out\()ko .req \in\()ko /* keep */ + \out\()ke .req \in\()spare4 /* out */ + \out\()ki .req \in\()spare5 /* out */ + \out\()ku .req \in\()gi /* in */ + + \out\()ma .req \in\()ke /* in */ + \out\()me .req \in\()me /* keep */ + \out\()mi .req \in\()mi /* keep */ + \out\()mo .req \in\()ga /* in */ + \out\()mu .req \in\()mu /* keep */ + + \out\()ba .req \in\()ba /* keep */ + \out\()be .req \in\()be /* keep */ + \out\()bi .req \in\()ki /* in */ + \out\()bo .req \in\()bo /* keep */ + \out\()bu .req \in\()bu /* keep */ + + \out\()sa .req \in\()sa /* keep */ + \out\()se .req \in\()se /* keep */ 
+ \out\()si .req \in\()si /* keep */ + \out\()so .req \in\()so /* keep */ + \out\()su .req \in\()su /* keep */ + + \out\()guq .req \in\()guq + \out\()gaq .req \in\()spare0q + \out\()geq .req \in\()spare1q + \out\()giq .req \in\()spare2q + \out\()goq .req \in\()spare3q + + \out\()kaq .req \in\()kaq + \out\()koq .req \in\()koq + \out\()keq .req \in\()spare4q + \out\()kiq .req \in\()spare5q + \out\()kuq .req \in\()giq + + \out\()maq .req \in\()keq + \out\()meq .req \in\()meq + \out\()miq .req \in\()miq + \out\()moq .req \in\()gaq + \out\()muq .req \in\()muq + + \out\()baq .req \in\()baq + \out\()beq .req \in\()beq + \out\()biq .req \in\()kiq + \out\()boq .req \in\()boq + \out\()buq .req \in\()buq + + \out\()saq .req \in\()saq + \out\()seq .req \in\()seq + \out\()siq .req \in\()siq + \out\()soq .req \in\()soq + \out\()suq .req \in\()suq + + \out\()spare0 .req \in\()bi + \out\()spare1 .req \in\()ma + \out\()spare2 .req \in\()go + \out\()spare3 .req \in\()ku + \out\()spare4 .req \in\()ge + \out\()spare5 .req \in\()mo + \out\()spare6 .req \in\()spare6 + \out\()spare0q .req \in\()biq + \out\()spare1q .req \in\()maq + \out\()spare2q .req \in\()goq + \out\()spare3q .req \in\()kuq + \out\()spare4q .req \in\()geq + \out\()spare5q .req \in\()moq + \out\()spare6q .req \in\()spare6q + + C0 .req \in\()spare0 + C1 .req \in\()spare1 + C2 .req \in\()spare2 + C3 .req \in\()spare3 + C4 .req \in\()spare4 + C0q .req \in\()spare0q + C1q .req \in\()spare1q + C2q .req \in\()spare2q + C3q .req \in\()spare3q + C4q .req \in\()spare4q + + E1c .req \in\()spare5 + E3c .req C2 + E0c .req C4 + E2c .req C1 + E4c .req C3 + + E1cq .req \in\()spare5q + E3cq .req C2q + E0cq .req C4q + E2cq .req C1q + E4cq .req C3q + + E0u .req tmp + E1u .req tmp + E2u .req tmp + E3u .req tmp + E4u .req tmp + + E0uq .req tmpq + E1uq .req tmpq + E2uq .req tmpq + E3uq .req tmpq + E4uq .req tmpq + + vBgo .req E1c + vBgi .req \in\()me + vBga .req \in\()ka + vBge .req \in\()bo + vBgu .req \in\()gu + + vBko .req \in\()me + 
vBka .req \in\()mu + vBke .req \in\()be + vBku .req \in\()gi + vBki .req \in\()sa + + vBmu .req \in\()bo + vBmo .req \in\()so + vBmi .req \in\()bu + vBma .req \in\()si + vBme .req \in\()be + + vBba .req \in\()si + vBbi .req \in\()sa + vBbo .req \in\()so + vBbu .req \in\()mo + vBbe .req \in\()su + + vBsa .req \in\()mo + vBso .req \in\()bi + vBse .req \in\()ma + vBsi .req \in\()go + vBsu .req \in\()ku +.endm + +.macro transfer_uncommon out, in + savep(\in\()ga, ga) + savep(\in\()gi, gi) + savep(\in\()ki, ki) + savep(\in\()ke, ke) + savep(\in\()bi, bi) + savep(\in\()ma, ma) + savep(\in\()go, go) + savep(\in\()ku, ku) + savep(\in\()ge, ge) + savep(\in\()mo, mo) + + restorep(\out\()gi, gi) + restorep(\out\()ke, ke) + restorep(\out\()ga, ga) + restorep(\out\()ki, ki) + restorep(\out\()bi, bi) + restorep(\out\()ma, ma) + restorep(\out\()go, go) + restorep(\out\()ku, ku) + restorep(\out\()ge, ge) + restorep(\out\()mo, mo) +.endm + +.macro undeclare_remappings out, in + .unreq vBgo + .unreq vBgi + .unreq vBga + .unreq vBge + .unreq vBgu + .unreq vBko + .unreq vBka + .unreq vBke + .unreq vBku + .unreq vBki + .unreq vBmu + .unreq vBmo + .unreq vBmi + .unreq vBma + .unreq vBme + .unreq vBba + .unreq vBbi + .unreq vBbo + .unreq vBbu + .unreq vBbe + .unreq vBsa + .unreq vBso + .unreq vBse + .unreq vBsi + .unreq vBsu + .unreq C0 + .unreq C1 + .unreq C2 + .unreq C3 + .unreq C4 + .unreq C0q + .unreq C1q + .unreq C2q + .unreq C3q + .unreq C4q + .unreq E1u + .unreq E3u + .unreq E0u + .unreq E2u + .unreq E4u + .unreq E1c + .unreq E3c + .unreq E0c + .unreq E2c + .unreq E4c + .unreq E1uq + .unreq E3uq + .unreq E0uq + .unreq E2uq + .unreq E4uq + .unreq E1cq + .unreq E3cq + .unreq E0cq + .unreq E2cq + .unreq E4cq +.endm + +/************************ MACROS ****************************/ + +.macro load_input + ldp Abaq, Abeq, [input_addr, #(2*8*0)] + ldp Abiq, Aboq, [input_addr, #(2*8*2)] + ldp Abuq, Agaq, [input_addr, #(2*8*4)] + ldp Ageq, Agiq, [input_addr, #(2*8*6)] + ldp Agoq, Aguq, 
[input_addr, #(2*8*8)] + ldp Akaq, Akeq, [input_addr, #(2*8*10)] + ldp Akiq, Akoq, [input_addr, #(2*8*12)] + ldp Akuq, Amaq, [input_addr, #(2*8*14)] + ldp Ameq, Amiq, [input_addr, #(2*8*16)] + ldp Amoq, Amuq, [input_addr, #(2*8*18)] + ldp Asaq, Aseq, [input_addr, #(2*8*20)] + ldp Asiq, Asoq, [input_addr, #(2*8*22)] + ldr Asuq, [input_addr, #(2*8*24)] +.endm + +.macro store_input + str Abaq, [input_addr, #(2*8*0)] + str Abeq, [input_addr, #(2*8*1)] + str Abiq, [input_addr, #(2*8*2)] + str Aboq, [input_addr, #(2*8*3)] + str Abuq, [input_addr, #(2*8*4)] + str Agaq, [input_addr, #(2*8*5)] + str Ageq, [input_addr, #(2*8*6)] + str Agiq, [input_addr, #(2*8*7)] + str Agoq, [input_addr, #(2*8*8)] + str Aguq, [input_addr, #(2*8*9)] + str Akaq, [input_addr, #(2*8*10)] + str Akeq, [input_addr, #(2*8*11)] + str Akiq, [input_addr, #(2*8*12)] + str Akoq, [input_addr, #(2*8*13)] + str Akuq, [input_addr, #(2*8*14)] + str Amaq, [input_addr, #(2*8*15)] + str Ameq, [input_addr, #(2*8*16)] + str Amiq, [input_addr, #(2*8*17)] + str Amoq, [input_addr, #(2*8*18)] + str Amuq, [input_addr, #(2*8*19)] + str Asaq, [input_addr, #(2*8*20)] + str Aseq, [input_addr, #(2*8*21)] + str Asiq, [input_addr, #(2*8*22)] + str Asoq, [input_addr, #(2*8*23)] + str Asuq, [input_addr, #(2*8*24)] +.endm + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +/* Macros emulating the v8.4-A SHA-3 instructions (EOR3, RAX1, XAR, BCAX) with baseline Neon instructions */ + +.macro eor3_m1_0 d, s0, s1, s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d, s0, s1 + eor 
\d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro move d, s + mov \d\().16b, \s\().16b +.endm + + +.macro eor3_m1_1 d, s0, s1, s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro eor3_m1 d, s0, s1, s2 + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +.macro rax1_m1 d, s0, s1 + add tmp.2d, \s1\().2d, \s1\().2d + sri tmp.2d, \s1\().2d, #63 + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +.macro xar_m1 d, s0, s1, imm + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) +.endm + +.macro xar_m1_0 d, s0, s1, imm, tmp + eor \tmp\().16b, \s0\().16b, \s1\().16b +.endm + +.macro xar_m1_1 d, s0, s1, imm, tmp + shl \d\().2d, \tmp\().2d, #(64-\imm) +.endm + +.macro xar_m1_2 d s0 s1 imm tmp + sri \d\().2d, \tmp\().2d, #(\imm) +.endm + +.macro bcax_m1 d s0 s1 s2 + bic tmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +.macro refresh d + mov \d\().16b, \d\().16b +.endm +/* Keccak-f1600 round */ + +.macro keccak_f1600_round_core out in + + eor2 C3, \in\()ko, \in\()go + eor2 C0, \in\()ka, \in\()ga + eor2 C1, \in\()ke, \in\()ge + eor2 C2, \in\()ki, \in\()gi + eor2 C4, \in\()ku, \in\()gu + + eor2 C0, C0, \in\()ma + eor2 C1, C1, \in\()me + eor2 C2, C2, \in\()mi + eor2 C3, C3, \in\()mo + eor2 C4, C4, \in\()mu + + eor2 C0, C0, \in\()ba + eor2 C1, C1, \in\()be + eor2 C2, C2, \in\()bi + eor2 C3, C3, \in\()bo + eor2 C4, C4, \in\()bu + + eor2 C0, C0, \in\()sa + eor2 C1, C1, \in\()se + eor2 C2, C2, \in\()si + eor2 C3, C3, \in\()so + eor2 C4, C4, \in\()su + + rax1_m1 E1c, C0, C2 SEP save(E1c) + rax1_m1 E3c, C2, C4 SEP save(E3c) + rax1_m1 E0c, C4, C1 SEP save(E0c) + rax1_m1 E2c, C1, C3 SEP save(E2c) + rax1_m1 E4c, C3, C0 SEP save(E4c) + + xar_m1 vBgo, \in\()me /* used at block 3 */, E1c, 19 + xar_m1 vBgi, \in\()ka /* used at block 2 */, E0c, 61 + xar_m1 vBga, \in\()bo /* used at block 4 */, E3c, 36 + xar_m1 vBge, \in\()gu /* used at block 1 */, E4c, 44 + xar_m1 vBgu, \in\()si /* 
used at block 5 */, E2c, 3 + + bcax_m1 \out\()ga, vBga, vBgi, vBge + bcax_m1 \out\()ge, vBge, vBgo, vBgi + bcax_m1 \out\()gi, vBgi, vBgu, vBgo + bcax_m1 \out\()go, vBgo, vBga, vBgu + bcax_m1 \out\()gu, vBgu, vBge, vBga + restore(E4u) + xar_m1 vBko, \in\()mu /* used at block 3 */, E4u, 56 SEP restore(E1u) + xar_m1 vBka, \in\()be /* used at block 4 */, E1u, 63 SEP restore(E2u) + xar_m1 vBke, \in\()gi /* not used */, E2u, 58 SEP restore(E0u) + xar_m1 vBku, \in\()sa /* used at block 5 */, E0u, 46 SEP restore(E3u) + xar_m1 vBki, \in\()ko /* used at block 2 */, E3u, 39 + + bcax_m1 \out\()ke, vBke, vBko, vBki + bcax_m1 \out\()ki, vBki, vBku, vBko + bcax_m1 \out\()ku, vBku, vBke, vBka + bcax_m1 \out\()ko, vBko, vBka, vBku + bcax_m1 \out\()ka, vBka, vBki, vBke + + // Can use: Abo, Asi, Abe, Asa; Abu, Aso + restore(E3u) + xar_m1 vBmu, \in\()so /* used at block 5 */, E3u, 8 SEP restore(E4u) + xar_m1 vBma, \in\()bu /* used at block 4 */, E4u, 37 SEP restore(E2u) + xar_m1 vBmo, \in\()mi /* used at block 3 */, E2u, 49 SEP restore(E1u) + xar_m1 vBmi, \in\()ke /* not used */, E1u, 54 SEP restore(E0u) + xar_m1 vBme, \in\()ga /* not used */, E0u, 28 + + bcax_m1 \out\()ma, vBma, vBmi, vBme + bcax_m1 \out\()mo, vBmo, vBma, vBmu + bcax_m1 \out\()me, vBme, vBmo, vBmi + bcax_m1 \out\()mi, vBmi, vBmu, vBmo + bcax_m1 \out\()mu, vBmu, vBme, vBma + + // Can use: Asi, Asa, Aso, Asu, Amo + restore(E0u) + eor2 vBba, \in\()ba /* used at block 4 */, E0u SEP restore(E2u) + xar_m1 vBbi, \in\()ki /* not used */, E2u, 21 SEP restore(E3u) + xar_m1 vBbo, \in\()mo /* not used */, E3u, 43 SEP restore(E4u) + xar_m1 vBbu, \in\()su /* used at block 5 */, E4u, 50 SEP restore(E1u) + xar_m1 vBbe, \in\()ge /* not used */, E1u, 20 + + bcax_m1 \out\()ba, vBba, vBbi, vBbe + ld1r {tmp.2d}, [const_addr], #8 + eor2 \out\()ba, \out\()ba, tmp + bcax_m1 \out\()be, vBbe, vBbo, vBbi + bcax_m1 \out\()bo, vBbo, vBba, vBbu + bcax_m1 \out\()bu, vBbu, vBbe, vBba + bcax_m1 \out\()bi, vBbi, vBbu, vBbo + + // Can use: Amo, Age, 
Abi, Ama, Ago, Aku + restore(E2u) + xar_m1 vBsa, \in\()bi /* not used */, E2u, 2 SEP restore(E0u) + xar_m1 vBso, \in\()ma /* not used */, E0u, 23 SEP restore(E3u) + xar_m1 vBse, \in\()go /* not used */, E3u, 9 SEP restore(E4u) + xar_m1 vBsi, \in\()ku /* not used */, E4u, 25 SEP restore(E1u) + xar_m1 vBsu, \in\()se /* used at block 5 */, E1u, 62 + + bcax_m1 \out\()sa, vBsa, vBsi, vBse + bcax_m1 \out\()se, vBse, vBso, vBsi + bcax_m1 \out\()si, vBsi, vBsu, vBso + bcax_m1 \out\()so, vBso, vBsa, vBsu + bcax_m1 \out\()su, vBsu, vBse, vBsa + +.endm + +.text +.align 4 +.global keccak_f1600_x2_v84a_asm_v2p4 +.global _keccak_f1600_x2_v84a_asm_v2p4 + +#define KECCAK_F1600_ROUNDS 24 + +keccak_f1600_x2_v84a_asm_v2p4: +_keccak_f1600_x2_v84a_asm_v2p4: + alloc_stack + save_vregs + load_constant_ptr + load_input + + + //mov count, #(KECCAK_F1600_ROUNDS-2) + mov count, #24 +loop: + declare_remappings A1, A + keccak_f1600_round_core A1, A + undeclare_remappings A1, A + + declare_remappings A2, A1 + keccak_f1600_round_core A2, A1 + undeclare_remappings A2, A1 + + declare_remappings A3, A2 + keccak_f1600_round_core A3, A2 + undeclare_remappings A3, A2 + + declare_remappings A4, A3 + keccak_f1600_round_core A4, A3 + undeclare_remappings A4, A3 + + transfer_uncommon A, A4 + + sub count, count, #4 + cbnz count, loop + + + store_input + restore_vregs + free_stack + ret diff --git a/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2p5.s b/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2p5.s new file mode 100644 index 0000000..22e0373 --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2p5.s @@ -0,0 +1,949 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the 
rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +#define STACK_SIZE (16*4 + 16*30) +#define STACK_BASE_VREGS 0 +#define STACK_BASE_TMP 16*4 + +#define E0c_offset 0 +#define E1c_offset 1 +#define E2c_offset 2 +#define E3c_offset 3 +#define E4c_offset 4 +#define E0u_offset 0 +#define E1u_offset 1 +#define E2u_offset 2 +#define E3u_offset 3 +#define E4u_offset 4 + +#define ba_offset (5 + 0 ) +#define be_offset (5 + 1 ) +#define bi_offset (5 + 2 ) +#define bo_offset (5 + 3 ) +#define bu_offset (5 + 4 ) +#define ga_offset (5 + 5 ) +#define ge_offset (5 + 6 ) +#define gi_offset (5 + 7 ) +#define go_offset (5 + 8 ) +#define gu_offset (5 + 9 ) +#define ka_offset (5 + 10 ) +#define ke_offset (5 + 11 ) +#define ki_offset (5 + 12 ) +#define ko_offset (5 + 13 ) +#define ku_offset (5 + 14 ) +#define ma_offset (5 + 15 ) +#define me_offset (5 + 16 ) +#define mi_offset (5 + 17 ) +#define mo_offset (5 + 18 ) +#define mu_offset (5 + 19 ) +#define sa_offset (5 + 20 ) +#define se_offset (5 + 21 ) +#define si_offset (5 + 22 ) +#define so_offset (5 + 23 ) +#define su_offset (5 + 24 ) + +#define 
savep(reg, offset_prefix) \ + str reg ## q, [sp, #(STACK_BASE_TMP + 16 * offset_prefix ## _offset)] +#define restorep(reg, offset_prefix) \ + ldr reg ## q, [sp, #(STACK_BASE_TMP + 16 * offset_prefix ## _offset)] +#define save(name) savep(name,name) +#define restore(name) restorep(name,name) + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. 
*/ + Aba .req v0 + Abe .req v1 + Abo .req v2 + Abu .req v3 + Agu .req v4 + Aka .req v5 + Ako .req v6 + Ame .req v7 + Ami .req v8 + Amu .req v9 + Asa .req v10 + Ase .req v11 + Asi .req v12 + Aso .req v13 + Asu .req v14 + + Agi .req v15 + Ake .req v16 + Aga .req v17 + Aki .req v18 + + Abi .req v19 + Ama .req v20 + Ago .req v21 + Aku .req v22 + Age .req v23 + Amo .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Aboq .req q2 + Abuq .req q3 + Aguq .req q4 + Akaq .req q5 + Akoq .req q6 + Ameq .req q7 + Amiq .req q8 + Amuq .req q9 + Asaq .req q10 + Aseq .req q11 + Asiq .req q12 + Asoq .req q13 + Asuq .req q14 + + Agiq .req q15 + Akeq .req q16 + Agaq .req q17 + Akiq .req q18 + + Abiq .req q19 + Amaq .req q20 + Agoq .req q21 + Akuq .req q22 + Ageq .req q23 + Amoq .req q24 + + Aspare0 .req v25 + Aspare1 .req v26 + Aspare2 .req v27 + Aspare3 .req v28 + Aspare4 .req v29 + Aspare5 .req v30 + Aspare6 .req v31 + Aspare0q .req q25 + Aspare1q .req q26 + Aspare2q .req q27 + Aspare3q .req q28 + Aspare4q .req q29 + Aspare5q .req q30 + Aspare6q .req q31 + +.macro declare_remappings out,in + tmp .req \in\()spare6 + tmpq .req \in\()spare6q + + \out\()gu .req \in\()gu /* keep */ + \out\()ga .req \in\()spare0 /* out */ + \out\()ge .req \in\()spare1 /* out */ + \out\()gi .req \in\()spare2 /* out */ + \out\()go .req \in\()spare3 /* out */ + + \out\()ka .req \in\()ka /* keep */ + \out\()ko .req \in\()ko /* keep */ + \out\()ke .req \in\()spare4 /* out */ + \out\()ki .req \in\()spare5 /* out */ + \out\()ku .req \in\()gi /* in */ + + \out\()ma .req \in\()ke /* in */ + \out\()me .req \in\()me /* keep */ + \out\()mi .req \in\()mi /* keep */ + \out\()mo .req \in\()ga /* in */ + \out\()mu .req \in\()mu /* keep */ + + \out\()ba .req \in\()ba /* keep */ + \out\()be .req \in\()be /* keep */ + \out\()bi .req \in\()ki /* in */ + \out\()bo .req \in\()bo /* keep */ + \out\()bu .req \in\()bu /* keep */ + + \out\()sa .req \in\()sa /* keep */ + \out\()se .req \in\()se /* keep */ 
+ \out\()si .req \in\()si /* keep */ + \out\()so .req \in\()so /* keep */ + \out\()su .req \in\()su /* keep */ + + \out\()guq .req \in\()guq + \out\()gaq .req \in\()spare0q + \out\()geq .req \in\()spare1q + \out\()giq .req \in\()spare2q + \out\()goq .req \in\()spare3q + + \out\()kaq .req \in\()kaq + \out\()koq .req \in\()koq + \out\()keq .req \in\()spare4q + \out\()kiq .req \in\()spare5q + \out\()kuq .req \in\()giq + + \out\()maq .req \in\()keq + \out\()meq .req \in\()meq + \out\()miq .req \in\()miq + \out\()moq .req \in\()gaq + \out\()muq .req \in\()muq + + \out\()baq .req \in\()baq + \out\()beq .req \in\()beq + \out\()biq .req \in\()kiq + \out\()boq .req \in\()boq + \out\()buq .req \in\()buq + + \out\()saq .req \in\()saq + \out\()seq .req \in\()seq + \out\()siq .req \in\()siq + \out\()soq .req \in\()soq + \out\()suq .req \in\()suq + + \out\()spare0 .req \in\()bi + \out\()spare1 .req \in\()ma + \out\()spare4 .req \in\()go + \out\()spare2 .req \in\()ku + \out\()spare3 .req \in\()ge + \out\()spare5 .req \in\()mo + \out\()spare6 .req \in\()spare6 + \out\()spare0q .req \in\()biq + \out\()spare1q .req \in\()maq + \out\()spare4q .req \in\()goq + \out\()spare2q .req \in\()kuq + \out\()spare3q .req \in\()geq + \out\()spare5q .req \in\()moq + \out\()spare6q .req \in\()spare6q + + C0 .req \in\()spare3 + C1 .req \in\()spare1 + C2 .req \in\()spare2 + C3 .req \in\()spare0 + C4 .req \in\()spare4 + C0q .req \in\()spare3q + C1q .req \in\()spare1q + C2q .req \in\()spare2q + C3q .req \in\()spare0q + C4q .req \in\()spare4q + + E1c .req \in\()spare5 + E3c .req C2 + E0c .req C4 + E2c .req C1 + E4c .req C3 + + E1cq .req \in\()spare5q + E3cq .req C2q + E0cq .req C4q + E2cq .req C1q + E4cq .req C3q + + E0u .req tmp + E1u .req tmp + E2u .req tmp + E3u .req tmp + E4u .req tmp + + E0uq .req tmpq + E1uq .req tmpq + E2uq .req tmpq + E3uq .req tmpq + E4uq .req tmpq + + vBgo .req E1c + vBgi .req \in\()me + vBga .req \in\()ka + vBge .req \in\()bo + vBgu .req \in\()gu + + vBko .req \in\()me + 
vBka .req \in\()mu + vBke .req \in\()be + vBku .req \in\()gi + vBki .req \in\()sa + + vBmu .req \in\()bo + vBmo .req \in\()so + vBmi .req \in\()bu + vBma .req \in\()si + vBme .req \in\()be + + vBba .req \in\()si + vBbi .req \in\()sa + vBbo .req \in\()so + vBbu .req \in\()mo + vBbe .req \in\()su + + vBsa .req \in\()mo + vBso .req \in\()bi + vBse .req \in\()ma + vBsi .req \in\()go + vBsu .req E1u //\in\()ku +.endm + +.macro transfer_uncommon out, in + savep(\in\()ga, ga) + savep(\in\()gi, gi) + savep(\in\()ki, ki) + savep(\in\()ke, ke) + savep(\in\()bi, bi) + savep(\in\()ma, ma) + savep(\in\()go, go) + savep(\in\()ku, ku) + savep(\in\()ge, ge) + savep(\in\()mo, mo) + + restorep(\out\()gi, gi) + restorep(\out\()ke, ke) + restorep(\out\()ga, ga) + restorep(\out\()ki, ki) + restorep(\out\()bi, bi) + restorep(\out\()ma, ma) + restorep(\out\()go, go) + restorep(\out\()ku, ku) + restorep(\out\()ge, ge) + restorep(\out\()mo, mo) +.endm + +.macro undeclare_remappings out, in + .unreq vBgo + .unreq vBgi + .unreq vBga + .unreq vBge + .unreq vBgu + .unreq vBko + .unreq vBka + .unreq vBke + .unreq vBku + .unreq vBki + .unreq vBmu + .unreq vBmo + .unreq vBmi + .unreq vBma + .unreq vBme + .unreq vBba + .unreq vBbi + .unreq vBbo + .unreq vBbu + .unreq vBbe + .unreq vBsa + .unreq vBso + .unreq vBse + .unreq vBsi + .unreq vBsu + .unreq C0 + .unreq C1 + .unreq C2 + .unreq C3 + .unreq C4 + .unreq C0q + .unreq C1q + .unreq C2q + .unreq C3q + .unreq C4q + .unreq E1u + .unreq E3u + .unreq E0u + .unreq E2u + .unreq E4u + .unreq E1c + .unreq E3c + .unreq E0c + .unreq E2c + .unreq E4c + .unreq E1uq + .unreq E3uq + .unreq E0uq + .unreq E2uq + .unreq E4uq + .unreq E1cq + .unreq E3cq + .unreq E0cq + .unreq E2cq + .unreq E4cq +.endm + +/************************ MACROS ****************************/ + +.macro load_input + ldp Abaq, Abeq, [input_addr, #(2*8*0)] + ldp Abiq, Aboq, [input_addr, #(2*8*2)] + ldp Abuq, Agaq, [input_addr, #(2*8*4)] + ldp Ageq, Agiq, [input_addr, #(2*8*6)] + ldp Agoq, 
Aguq, [input_addr, #(2*8*8)] + ldp Akaq, Akeq, [input_addr, #(2*8*10)] + ldp Akiq, Akoq, [input_addr, #(2*8*12)] + ldp Akuq, Amaq, [input_addr, #(2*8*14)] + ldp Ameq, Amiq, [input_addr, #(2*8*16)] + ldp Amoq, Amuq, [input_addr, #(2*8*18)] + ldp Asaq, Aseq, [input_addr, #(2*8*20)] + ldp Asiq, Asoq, [input_addr, #(2*8*22)] + ldr Asuq, [input_addr, #(2*8*24)] +.endm + +.macro store_input + str Abaq, [input_addr, #(2*8*0)] + str Abeq, [input_addr, #(2*8*1)] + str Abiq, [input_addr, #(2*8*2)] + str Aboq, [input_addr, #(2*8*3)] + str Abuq, [input_addr, #(2*8*4)] + str Agaq, [input_addr, #(2*8*5)] + str Ageq, [input_addr, #(2*8*6)] + str Agiq, [input_addr, #(2*8*7)] + str Agoq, [input_addr, #(2*8*8)] + str Aguq, [input_addr, #(2*8*9)] + str Akaq, [input_addr, #(2*8*10)] + str Akeq, [input_addr, #(2*8*11)] + str Akiq, [input_addr, #(2*8*12)] + str Akoq, [input_addr, #(2*8*13)] + str Akuq, [input_addr, #(2*8*14)] + str Amaq, [input_addr, #(2*8*15)] + str Ameq, [input_addr, #(2*8*16)] + str Amiq, [input_addr, #(2*8*17)] + str Amoq, [input_addr, #(2*8*18)] + str Amuq, [input_addr, #(2*8*19)] + str Asaq, [input_addr, #(2*8*20)] + str Aseq, [input_addr, #(2*8*21)] + str Asiq, [input_addr, #(2*8*22)] + str Asoq, [input_addr, #(2*8*23)] + str Asuq, [input_addr, #(2*8*24)] +.endm + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +/* Macros emulating the v8.4-A SHA-3 instructions (EOR3, RAX1, XAR, BCAX) with baseline Neon instructions */ + +.macro eor3_m1_0 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 + eor 
\d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor5 out i0 i1 i2 i3 i4 tmp + eor2 \out, \i0, \i1 + eor2 \tmp, \i3, \i4 + eor2 \out, \out, \i2 + eor2 \out, \out, \tmp +.endm + +.macro move d s + mov \d\().16b, \s\().16b +.endm + + +.macro eor3_m1_1 d s0 s1 s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro eor3_m1 d s0 s1 s2 + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +.macro rax1_m1 d s0 s1 + add tmp.2d, \s1\().2d, \s1\().2d + sri tmp.2d, \s1\().2d, #63 + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +.macro xar_m1 d s0 s1 imm + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) +.endm + +.macro xar_m1_0 d s0 s1 imm tmp + eor \tmp\().16b, \s0\().16b, \s1\().16b +.endm + +.macro xar_m1_1 d s0 s1 imm tmp + shl \d\().2d, \tmp\().2d, #(64-\imm) +.endm + +.macro xar_m1_2 d s0 s1 imm tmp + sri \d\().2d, \tmp\().2d, #(\imm) +.endm + +.macro bcax_m1 d s0 s1 s2 + bic tmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +.macro bcax_m1_d d s0 s1 s2 + bic \d\().16b, \s1\().16b, \s2\().16b + eor \d\().16b, \d\().16b, \s0\().16b +.endm + +.macro refresh d + mov \d\().16b, \d\().16b +.endm +/* Keccak-f1600 round */ + +.macro keccak_f1600_round_full out in + + eor5 C0, \in\()ka, \in\()ga, \in\()ma, \in\()ba, \in\()sa, tmp + eor5 C2, \in\()ki, \in\()gi, \in\()mi, \in\()bi, \in\()si, tmp + eor5 C4, \in\()ku, \in\()gu, \in\()mu, \in\()bu, \in\()su, tmp + eor5 C1, \in\()ke, \in\()ge, \in\()me, \in\()be, \in\()se, tmp + eor5 C3, \in\()ko, \in\()go, \in\()mo, \in\()bo, \in\()so, tmp + + rax1_m1 E1c, C0, C2 SEP save(E1c) + rax1_m1 E3c, C2, C4 SEP save(E3c) + rax1_m1 E0c, C4, C1 SEP save(E0c) + rax1_m1 E2c, C1, C3 SEP save(E2c) + rax1_m1 E4c, C3, C0 SEP save(E4c) + + xar_m1 vBgo, \in\()me /* used at block 3 */, E1c, 19 + xar_m1 vBgi, \in\()ka /* used at block 2 */, E0c, 61 + xar_m1 vBga, \in\()bo /* used at block 4 */, E3c, 36 + xar_m1 vBge, \in\()gu /* used at block 1 
*/, E4c, 44 + xar_m1 vBgu, \in\()si /* used at block 5 */, E2c, 3 + + bcax_m1 \out\()ga, vBga, vBgi, vBge + bcax_m1 \out\()ge, vBge, vBgo, vBgi + bcax_m1 \out\()gi, vBgi, vBgu, vBgo + bcax_m1 \out\()go, vBgo, vBga, vBgu + bcax_m1 \out\()gu, vBgu, vBge, vBga + restore(E4u) + xar_m1 vBko, \in\()mu /* used at block 3 */, E4u, 56 SEP restore(E1u) + xar_m1 vBka, \in\()be /* used at block 4 */, E1u, 63 SEP restore(E2u) + xar_m1 vBke, \in\()gi /* not used */, E2u, 58 SEP restore(E0u) + xar_m1 vBku, \in\()sa /* used at block 5 */, E0u, 46 SEP restore(E3u) + xar_m1 vBki, \in\()ko /* used at block 2 */, E3u, 39 + + bcax_m1 \out\()ke, vBke, vBko, vBki + bcax_m1 \out\()ki, vBki, vBku, vBko + bcax_m1 \out\()ku, vBku, vBke, vBka + bcax_m1 \out\()ko, vBko, vBka, vBku + bcax_m1 \out\()ka, vBka, vBki, vBke + + // Can use: Abo, Asi, Abe, Asa; Abu, Aso + restore(E3u) + xar_m1 vBmu, \in\()so /* used at block 5 */, E3u, 8 SEP restore(E4u) + xar_m1 vBma, \in\()bu /* used at block 4 */, E4u, 37 SEP restore(E2u) + xar_m1 vBmo, \in\()mi /* used at block 3 */, E2u, 49 SEP restore(E1u) + xar_m1 vBmi, \in\()ke /* not used */, E1u, 54 SEP restore(E0u) + xar_m1 vBme, \in\()ga /* not used */, E0u, 28 + + bcax_m1 \out\()ma, vBma, vBmi, vBme + bcax_m1 \out\()mo, vBmo, vBma, vBmu + bcax_m1 \out\()me, vBme, vBmo, vBmi + bcax_m1 \out\()mi, vBmi, vBmu, vBmo + bcax_m1 \out\()mu, vBmu, vBme, vBma + + // Can use: Asi, Asa, Aso, Asu, Amo + restore(E0u) + eor2 vBba, \in\()ba /* used at block 4 */, E0u SEP restore(E2u) + xar_m1 vBbi, \in\()ki /* not used* */, E2u, 21 SEP restore(E3u) + xar_m1 vBbo, \in\()mo /* not used+ */, E3u, 43 SEP restore(E4u) + xar_m1 vBbu, \in\()su /* used at block 5 */, E4u, 50 SEP restore(E1u) + xar_m1 vBbe, \in\()ge /* not used */, E1u, 20 + + bcax_m1 \out\()ba, vBba, vBbi, vBbe + ld1r {tmp.2d}, [const_addr], #8 + eor2 \out\()ba, \out\()ba, tmp + bcax_m1 \out\()be, vBbe, vBbo, vBbi + bcax_m1 \out\()bo, vBbo, vBba, vBbu + bcax_m1 \out\()bu, vBbu, vBbe, vBba + bcax_m1 \out\()bi, 
vBbi, vBbu, vBbo + + // Can use: Amo, Age, Abi, Ama, Ago, Aku + restore(E2u) + xar_m1 vBsa, \in\()bi /* not used+ */, E2u, 2 SEP restore(E0u) + xar_m1 vBso, \in\()ma /* not used+ */, E0u, 23 SEP restore(E3u) + xar_m1 vBse, \in\()go /* not used+ */, E3u, 9 SEP restore(E4u) + xar_m1 vBsi, \in\()ku /* not used */, E4u, 25 SEP restore(E1u) + xar_m1 vBsu, \in\()se /* used at block 5 */, E1u, 62 + + bcax_m1_d \out\()sa, vBsa, vBsi, vBse + bcax_m1_d \out\()se, vBse, vBso, vBsi + bcax_m1_d \out\()si, vBsi, vBsu, vBso + bcax_m1_d \out\()so, vBso, vBsa, vBsu + bcax_m1_d \out\()su, vBsu, vBse, vBsa + +.endm + +.macro keccak_f1600_round_pre out in + + eor5 C0, \in\()ka, \in\()ga, \in\()ma, \in\()ba, \in\()sa, tmp + eor5 C2, \in\()ki, \in\()gi, \in\()mi, \in\()bi, \in\()si, tmp + eor5 C4, \in\()ku, \in\()gu, \in\()mu, \in\()bu, \in\()su, tmp + eor5 C1, \in\()ke, \in\()ge, \in\()me, \in\()be, \in\()se, tmp + eor5 C3, \in\()ko, \in\()go, \in\()mo, \in\()bo, \in\()so, tmp + +.endm + +.macro keccak_f1600_round_post out in + + .unreq C0 + .unreq C1 + .unreq C2 + .unreq C3 + .unreq C4 + .unreq C0q + .unreq C1q + .unreq C2q + .unreq C3q + .unreq C4q + + C0 .req \out\()spare3 + C1 .req \out\()spare1 + C2 .req \out\()spare2 + C3 .req \out\()spare0 + C4 .req \out\()spare4 + C0q .req \out\()spare3q + C1q .req \out\()spare1q + C2q .req \out\()spare2q + C3q .req \out\()spare0q + C4q .req \out\()spare4q + + eor5 C0, \out\()ka, \out\()ga, \out\()ma, \out\()ba, \out\()sa, tmp + eor5 C2, \out\()ki, \out\()gi, \out\()mi, \out\()bi, \out\()si, tmp + eor5 C4, \out\()ku, \out\()gu, \out\()mu, \out\()bu, \out\()su, tmp + eor5 C1, \out\()ke, \out\()ge, \out\()me, \out\()be, \out\()se, tmp + eor5 C3, \out\()ko, \out\()go, \out\()mo, \out\()bo, \out\()so, tmp + +.endm +.macro keccak_f1600_round_core out in + + rax1_m1 E1c, C0, C2 SEP save(E1c) + rax1_m1 E3c, C2, C4 SEP save(E3c) + rax1_m1 E0c, C4, C1 SEP save(E0c) + rax1_m1 E2c, C1, C3 SEP save(E2c) + rax1_m1 E4c, C3, C0 SEP save(E4c) + + xar_m1 vBgo, 
\in\()me /* used at block 3 */, E1c, 19 + xar_m1 vBgi, \in\()ka /* used at block 2 */, E0c, 61 + xar_m1 vBga, \in\()bo /* used at block 4 */, E3c, 36 + xar_m1 vBge, \in\()gu /* used at block 1 */, E4c, 44 + xar_m1 vBgu, \in\()si /* used at block 5 */, E2c, 3 + + bcax_m1 \out\()ga, vBga, vBgi, vBge + bcax_m1 \out\()ge, vBge, vBgo, vBgi + bcax_m1 \out\()gi, vBgi, vBgu, vBgo + bcax_m1 \out\()go, vBgo, vBga, vBgu + bcax_m1 \out\()gu, vBgu, vBge, vBga + restore(E4u) + xar_m1 vBko, \in\()mu /* used at block 3 */, E4u, 56 SEP restore(E1u) + xar_m1 vBka, \in\()be /* used at block 4 */, E1u, 63 SEP restore(E2u) + xar_m1 vBke, \in\()gi /* not used */, E2u, 58 SEP restore(E0u) + xar_m1 vBku, \in\()sa /* used at block 5 */, E0u, 46 SEP restore(E3u) + xar_m1 vBki, \in\()ko /* used at block 2 */, E3u, 39 + + bcax_m1 \out\()ke, vBke, vBko, vBki + bcax_m1 \out\()ki, vBki, vBku, vBko + bcax_m1 \out\()ku, vBku, vBke, vBka + bcax_m1 \out\()ko, vBko, vBka, vBku + bcax_m1 \out\()ka, vBka, vBki, vBke + + // Can use: Abo, Asi, Abe, Asa; Abu, Aso + restore(E3u) + xar_m1 vBmu, \in\()so /* used at block 5 */, E3u, 8 SEP restore(E4u) + xar_m1 vBma, \in\()bu /* used at block 4 */, E4u, 37 SEP restore(E2u) + xar_m1 vBmo, \in\()mi /* used at block 3 */, E2u, 49 SEP restore(E1u) + xar_m1 vBmi, \in\()ke /* not used */, E1u, 54 SEP restore(E0u) + xar_m1 vBme, \in\()ga /* not used */, E0u, 28 + + bcax_m1 \out\()ma, vBma, vBmi, vBme + bcax_m1 \out\()mo, vBmo, vBma, vBmu + bcax_m1 \out\()me, vBme, vBmo, vBmi + bcax_m1 \out\()mi, vBmi, vBmu, vBmo + bcax_m1 \out\()mu, vBmu, vBme, vBma + + // Can use: Asi, Asa, Aso, Asu, Amo + restore(E0u) + eor2 vBba, \in\()ba /* used at block 4 */, E0u SEP restore(E2u) + xar_m1 vBbi, \in\()ki /* not used* */, E2u, 21 SEP restore(E3u) + xar_m1 vBbo, \in\()mo /* not used+ */, E3u, 43 SEP restore(E4u) + xar_m1 vBbu, \in\()su /* used at block 5 */, E4u, 50 SEP restore(E1u) + xar_m1 vBbe, \in\()ge /* not used */, E1u, 20 + + bcax_m1 \out\()ba, vBba, vBbi, vBbe + ld1r 
{tmp.2d}, [const_addr], #8 + eor2 \out\()ba, \out\()ba, tmp + bcax_m1 \out\()be, vBbe, vBbo, vBbi + bcax_m1 \out\()bo, vBbo, vBba, vBbu + bcax_m1 \out\()bu, vBbu, vBbe, vBba + bcax_m1 \out\()bi, vBbi, vBbu, vBbo + + // Can use: Amo, Age, Abi, Ama, Ago, Aku + restore(E2u) + xar_m1 vBsa, \in\()bi /* not used+ */, E2u, 2 SEP restore(E0u) + xar_m1 vBso, \in\()ma /* not used+ */, E0u, 23 SEP restore(E3u) + xar_m1 vBse, \in\()go /* not used+ */, E3u, 9 SEP restore(E4u) + xar_m1 vBsi, \in\()ku /* not used */, E4u, 25 SEP restore(E1u) + xar_m1 vBsu, \in\()se /* used at block 5 */, E1u, 62 + + bcax_m1_d \out\()sa, vBsa, vBsi, vBse + bcax_m1_d \out\()se, vBse, vBso, vBsi + bcax_m1_d \out\()si, vBsi, vBsu, vBso + bcax_m1_d \out\()so, vBso, vBsa, vBsu + bcax_m1_d \out\()su, vBsu, vBse, vBsa + +.endm + +.macro keccak_f1600_round_first out in + keccak_f1600_round_pre \out, \in + keccak_f1600_round_core \out, \in + keccak_f1600_round_post \out, \in +.endm + +.macro keccak_f1600_round_inner out in + keccak_f1600_round_core \out, \in + keccak_f1600_round_post \out, \in +.endm + +.macro keccak_f1600_round_last out in + keccak_f1600_round_core \out, \in +.endm + +.macro keccak_f1600_round_inner_optim out in + + rax1_m1 E1c, C0, C2 SEP save(E1c) + rax1_m1 E3c, C2, C4 SEP save(E3c) + rax1_m1 E0c, C4, C1 SEP save(E0c) + rax1_m1 E2c, C1, C3 SEP save(E2c) + rax1_m1 E4c, C3, C0 SEP save(E4c) + + xar_m1 vBgo, \in\()me /* used at block 3 */, E1c, 19 + xar_m1 vBgi, \in\()ka /* used at block 2 */, E0c, 61 + xar_m1 vBga, \in\()bo /* used at block 4 */, E3c, 36 + xar_m1 vBge, \in\()gu /* used at block 1 */, E4c, 44 + xar_m1 vBgu, \in\()si /* used at block 5 */, E2c, 3 + + bcax_m1 \out\()ga, vBga, vBgi, vBge + bcax_m1 \out\()ge, vBge, vBgo, vBgi + bcax_m1 \out\()gi, vBgi, vBgu, vBgo + bcax_m1 \out\()go, vBgo, vBga, vBgu + bcax_m1 \out\()gu, vBgu, vBge, vBga + restore(E4u) + xar_m1 vBko, \in\()mu /* used at block 3 */, E4u, 56 SEP restore(E1u) + xar_m1 vBka, \in\()be /* used at block 4 */, E1u, 63 
SEP restore(E2u) + xar_m1 vBke, \in\()gi /* not used */, E2u, 58 SEP restore(E0u) + xar_m1 vBku, \in\()sa /* used at block 5 */, E0u, 46 SEP restore(E3u) + xar_m1 vBki, \in\()ko /* used at block 2 */, E3u, 39 + + bcax_m1 \out\()ke, vBke, vBko, vBki + bcax_m1 \out\()ki, vBki, vBku, vBko + bcax_m1 \out\()ku, vBku, vBke, vBka + bcax_m1 \out\()ko, vBko, vBka, vBku + bcax_m1 \out\()ka, vBka, vBki, vBke + + // Can use: Abo, Asi, Abe, Asa; Abu, Aso + restore(E3u) + xar_m1 vBmu, \in\()so /* used at block 5 */, E3u, 8 SEP restore(E4u) + xar_m1 vBma, \in\()bu /* used at block 4 */, E4u, 37 SEP restore(E2u) + xar_m1 vBmo, \in\()mi /* used at block 3 */, E2u, 49 SEP restore(E1u) + xar_m1 vBmi, \in\()ke /* not used */, E1u, 54 SEP restore(E0u) + xar_m1 vBme, \in\()ga /* not used */, E0u, 28 + + bcax_m1 \out\()ma, vBma, vBmi, vBme + bcax_m1 \out\()mo, vBmo, vBma, vBmu + bcax_m1 \out\()me, vBme, vBmo, vBmi + bcax_m1 \out\()mi, vBmi, vBmu, vBmo + bcax_m1 \out\()mu, vBmu, vBme, vBma + + // Can use: Asi, Asa, Aso, Asu, Amo + restore(E0u) + eor2 vBba, \in\()ba /* used at block 4 */, E0u SEP restore(E2u) + xar_m1 vBbi, \in\()ki /* not used* */, E2u, 21 SEP restore(E3u) + xar_m1 vBbo, \in\()mo /* not used+ */, E3u, 43 SEP restore(E4u) + xar_m1 vBbu, \in\()su /* used at block 5 */, E4u, 50 SEP restore(E1u) + xar_m1 vBbe, \in\()ge /* not used */, E1u, 20 + + .unreq C0 + .unreq C0q + C0 .req \out\()spare3 + C0q .req \out\()spare3q + + eor2 C0, \out\()ka, \out\()ga + eor2 C0, C0, \out\()ma + + bcax_m1 \out\()ba, vBba, vBbi, vBbe + ld1r {tmp.2d}, [const_addr], #8 + eor2 \out\()ba, \out\()ba, tmp + bcax_m1 \out\()be, vBbe, vBbo, vBbi + bcax_m1 \out\()bo, vBbo, vBba, vBbu + bcax_m1 \out\()bu, vBbu, vBbe, vBba + bcax_m1 \out\()bi, vBbi, vBbu, vBbo + + eor2 C0, C0, \out\()ba + + // Can use: Amo, Age, Abi, Ama, Ago, Aku + restore(E2u) + xar_m1 vBsa, \in\()bi /* not used+ */, E2u, 2 SEP restore(E0u) + xar_m1 vBso, \in\()ma /* not used+ */, E0u, 23 SEP restore(E3u) + xar_m1 vBse, \in\()go /* not 
used+ */, E3u, 9 SEP restore(E4u) + xar_m1 vBsi, \in\()ku /* not used */, E4u, 25 SEP restore(E1u) + + .unreq C2 + .unreq C2q + C2 .req \out\()spare2 + C2q .req \out\()spare2q + + eor2 C2, \out\()ki, \out\()gi + eor2 C2, C2, \out\()mi + eor2 C2, C2, \out\()bi + + xar_m1 vBsu, \in\()se /* used at block 5 */, E1u, 62 + + bcax_m1_d \out\()sa, vBsa, vBsi, vBse + eor2 C0, C0, \out\()sa + bcax_m1_d \out\()se, vBse, vBso, vBsi + bcax_m1_d \out\()si, vBsi, vBsu, vBso + eor2 C2, C2, \out\()si + bcax_m1_d \out\()so, vBso, vBsa, vBsu + bcax_m1_d \out\()su, vBsu, vBse, vBsa + + .unreq C1 + .unreq C1q + C1 .req \out\()spare1 + C1q .req \out\()spare1q + + + .unreq C3 + .unreq C4 + .unreq C3q + .unreq C4q + + C3 .req \out\()spare0 + C4 .req \out\()spare4 + C3q .req \out\()spare0q + C4q .req \out\()spare4q + +// eor5 C0, \out\()ka, \out\()ga, \out\()ma, \out\()ba, \out\()sa, tmp +// eor5 C2, \out\()ki, \out\()gi, \out\()mi, \out\()bi, \out\()si, tmp + eor5 C4, \out\()ku, \out\()gu, \out\()mu, \out\()bu, \out\()su, tmp + eor5 C1, \out\()ke, \out\()ge, \out\()me, \out\()be, \out\()se, tmp + eor5 C3, \out\()ko, \out\()go, \out\()mo, \out\()bo, \out\()so, tmp + +.endm + +.text +.align 4 +.global keccak_f1600_x2_v84a_asm_v2p5 +.global _keccak_f1600_x2_v84a_asm_v2p5 + +#define KECCAK_F1600_ROUNDS 24 + +keccak_f1600_x2_v84a_asm_v2p5: +_keccak_f1600_x2_v84a_asm_v2p5: + alloc_stack + save_vregs + load_constant_ptr + load_input + + + //mov count, #(KECCAK_F1600_ROUNDS-2) + mov count, #24 +loop: + declare_remappings A1, A + keccak_f1600_round_first A1, A +// keccak_f1600_round_pre A1 A +// keccak_f1600_round_core A1 A +// keccak_f1600_round_post A1 A + undeclare_remappings A1, A + + declare_remappings A2, A1 +// keccak_f1600_round_pre A2 A1 +// keccak_f1600_round_core A2 A1 +// keccak_f1600_round_post A2 A1 + keccak_f1600_round_inner_optim A2, A1 + undeclare_remappings A2, A1 + + declare_remappings A3, A2 +// keccak_f1600_round_pre A3 A2 +// keccak_f1600_round_core A3 A2 +// 
keccak_f1600_round_post A3 A2 + keccak_f1600_round_inner_optim A3, A2 + undeclare_remappings A3, A2 + + declare_remappings A4, A3 +// keccak_f1600_round_pre A4 A3 + keccak_f1600_round_last A4, A3 +// keccak_f1600_round_post A4 A3 + undeclare_remappings A4, A3 + + transfer_uncommon A, A4 + + sub count, count, #4 + cbnz count, loop + + store_input + restore_vregs + free_stack + ret diff --git a/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2p6.s b/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2p6.s new file mode 100644 index 0000000..856a374 --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2p6.s @@ -0,0 +1,948 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +#define STACK_SIZE (16*4 + 16*30) +#define STACK_BASE_VREGS 0 +#define STACK_BASE_TMP 16*4 + +#define E0c_offset 0 +#define E1c_offset 1 +#define E2c_offset 2 +#define E3c_offset 3 +#define E4c_offset 4 +#define E0u_offset 0 +#define E1u_offset 1 +#define E2u_offset 2 +#define E3u_offset 3 +#define E4u_offset 4 + +#define ba_offset (5 + 0 ) +#define be_offset (5 + 1 ) +#define bi_offset (5 + 2 ) +#define bo_offset (5 + 3 ) +#define bu_offset (5 + 4 ) +#define ga_offset (5 + 5 ) +#define ge_offset (5 + 6 ) +#define gi_offset (5 + 7 ) +#define go_offset (5 + 8 ) +#define gu_offset (5 + 9 ) +#define ka_offset (5 + 10 ) +#define ke_offset (5 + 11 ) +#define ki_offset (5 + 12 ) +#define ko_offset (5 + 13 ) +#define ku_offset (5 + 14 ) +#define ma_offset (5 + 15 ) +#define me_offset (5 + 16 ) +#define mi_offset (5 + 17 ) +#define mo_offset (5 + 18 ) +#define mu_offset (5 + 19 ) +#define sa_offset (5 + 20 ) +#define se_offset (5 + 21 ) +#define si_offset (5 + 22 ) +#define so_offset (5 + 23 ) +#define su_offset (5 + 24 ) + +#define savep(reg, offset_prefix) \ + str reg ## q, [sp, #(STACK_BASE_TMP + 16 * offset_prefix ## _offset)] +#define restorep(reg, offset_prefix) \ + ldr reg ## q, [sp, #(STACK_BASE_TMP + 16 * offset_prefix ## _offset)] +#define save(name) savep(name,name) +#define restore(name) restorep(name,name) + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + 
.quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. */ + Aba .req v0 + Abe .req v1 + Abo .req v2 + Abu .req v3 + Agu .req v4 + Aka .req v5 + Ako .req v6 + Ame .req v7 + Ami .req v8 + Amu .req v9 + Asa .req v10 + Ase .req v11 + Asi .req v12 + Aso .req v13 + Asu .req v14 + + Agi .req v15 + Ake .req v16 + Aga .req v17 + Aki .req v18 + + Abi .req v19 + Ama .req v20 + Ago .req v21 + Aku .req v22 + Age .req v23 + Amo .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Aboq .req q2 + Abuq .req q3 + Aguq .req q4 + Akaq .req q5 + Akoq .req q6 + Ameq .req q7 + Amiq .req q8 + Amuq .req q9 + Asaq .req q10 + Aseq .req q11 + Asiq .req q12 + Asoq .req q13 + Asuq .req q14 + + Agiq .req q15 + Akeq .req q16 + Agaq .req q17 + Akiq .req q18 + + Abiq .req q19 + Amaq .req q20 + Agoq .req q21 + Akuq .req q22 + Ageq .req q23 + Amoq .req q24 + + Aspare0 .req v25 + Aspare1 .req v26 + Aspare2 .req v27 + Aspare3 .req v28 + Aspare4 .req v29 + Aspare5 .req v30 + Aspare6 .req v31 + Aspare0q .req q25 + Aspare1q .req q26 + Aspare2q .req q27 + Aspare3q .req q28 + Aspare4q .req q29 + Aspare5q .req q30 + Aspare6q .req q31 +/* declare_remappings out,in: alias the \out\() state and spare registers, the C and E temporaries and the vB scratch names for one round in terms of the \in\() names. */ +.macro declare_remappings out,in + tmp .req \in\()spare6 + tmpq .req \in\()spare6q + + \out\()gu .req \in\()gu /* keep */ + \out\()ga .req \in\()spare0 /* out */ + \out\()ge .req \in\()spare1 /* out */ + \out\()gi .req \in\()spare2 /* out */ + \out\()go .req \in\()spare3 /* out */ + + \out\()ka .req \in\()ka /* keep */ + \out\()ko .req \in\()ko /* keep */ + \out\()ke .req \in\()spare4 /* out */ + \out\()ki .req \in\()spare5 /* out */ + \out\()ku .req \in\()gi /* in */ + + 
\out\()ma .req \in\()ke /* in */ + \out\()me .req \in\()me /* keep */ + \out\()mi .req \in\()mi /* keep */ + \out\()mo .req \in\()ga /* in */ + \out\()mu .req \in\()mu /* keep */ + + \out\()ba .req \in\()ba /* keep */ + \out\()be .req \in\()be /* keep */ + \out\()bi .req \in\()ki /* in */ + \out\()bo .req \in\()bo /* keep */ + \out\()bu .req \in\()bu /* keep */ + + \out\()sa .req \in\()sa /* keep */ + \out\()se .req \in\()se /* keep */ + \out\()si .req \in\()si /* keep */ + \out\()so .req \in\()so /* keep */ + \out\()su .req \in\()su /* keep */ + + \out\()guq .req \in\()guq + \out\()gaq .req \in\()spare0q + \out\()geq .req \in\()spare1q + \out\()giq .req \in\()spare2q + \out\()goq .req \in\()spare3q + + \out\()kaq .req \in\()kaq + \out\()koq .req \in\()koq + \out\()keq .req \in\()spare4q + \out\()kiq .req \in\()spare5q + \out\()kuq .req \in\()giq + + \out\()maq .req \in\()keq + \out\()meq .req \in\()meq + \out\()miq .req \in\()miq + \out\()moq .req \in\()gaq + \out\()muq .req \in\()muq + + \out\()baq .req \in\()baq + \out\()beq .req \in\()beq + \out\()biq .req \in\()kiq + \out\()boq .req \in\()boq + \out\()buq .req \in\()buq + + \out\()saq .req \in\()saq + \out\()seq .req \in\()seq + \out\()siq .req \in\()siq + \out\()soq .req \in\()soq + \out\()suq .req \in\()suq + + \out\()spare0 .req \in\()bi + \out\()spare1 .req \in\()ma + \out\()spare4 .req \in\()go + \out\()spare2 .req \in\()ku + \out\()spare3 .req \in\()ge + \out\()spare5 .req \in\()mo + \out\()spare6 .req \in\()spare6 + \out\()spare0q .req \in\()biq + \out\()spare1q .req \in\()maq + \out\()spare4q .req \in\()goq + \out\()spare2q .req \in\()kuq + \out\()spare3q .req \in\()geq + \out\()spare5q .req \in\()moq + \out\()spare6q .req \in\()spare6q + + C0 .req \in\()spare3 + C1 .req \in\()spare1 + C2 .req \in\()spare2 + C3 .req \in\()spare0 + C4 .req \in\()spare4 + C0q .req \in\()spare3q + C1q .req \in\()spare1q + C2q .req \in\()spare2q + C3q .req \in\()spare0q + C4q .req \in\()spare4q + + E1c .req \in\()spare5 + 
E3c .req C2 + E0c .req C4 + E2c .req C1 + E4c .req C3 + + E1cq .req \in\()spare5q + E3cq .req C2q + E0cq .req C4q + E2cq .req C1q + E4cq .req C3q + + E0u .req tmp + E1u .req tmp + E2u .req tmp + E3u .req tmp + E4u .req tmp + + E0uq .req tmpq + E1uq .req tmpq + E2uq .req tmpq + E3uq .req tmpq + E4uq .req tmpq + + vBgo .req E1c + vBgi .req \in\()me + vBga .req \in\()ka + vBge .req \in\()bo + vBgu .req \in\()gu + + vBko .req \in\()me + vBka .req \in\()mu + vBke .req \in\()be + vBku .req \in\()gi + vBki .req \in\()sa + + vBmu .req \in\()bo + vBmo .req \in\()so + vBmi .req \in\()bu + vBma .req \in\()si + vBme .req \in\()be + + vBba .req \in\()si + vBbi .req \in\()sa + vBbo .req \in\()so + vBbu .req \in\()mo + vBbe .req \in\()su + + vBsa .req \in\()mo + vBso .req \in\()bi + vBse .req \in\()ma + vBsi .req \in\()go + vBsu .req E1u //\in\()ku +.endm + +.macro transfer_uncommon out in + savep(\in\()ga, ga) + savep(\in\()gi, gi) + savep(\in\()ki, ki) + savep(\in\()ke, ke) + savep(\in\()bi, bi) + savep(\in\()ma, ma) + savep(\in\()go, go) + savep(\in\()ku, ku) + savep(\in\()ge, ge) + savep(\in\()mo, mo) + + restorep(\out\()gi, gi) + restorep(\out\()ke, ke) + restorep(\out\()ga, ga) + restorep(\out\()ki, ki) + restorep(\out\()bi, bi) + restorep(\out\()ma, ma) + restorep(\out\()go, go) + restorep(\out\()ku, ku) + restorep(\out\()ge, ge) + restorep(\out\()mo, mo) +.endm + +.macro undeclare_remappings out in + .unreq vBgo + .unreq vBgi + .unreq vBga + .unreq vBge + .unreq vBgu + .unreq vBko + .unreq vBka + .unreq vBke + .unreq vBku + .unreq vBki + .unreq vBmu + .unreq vBmo + .unreq vBmi + .unreq vBma + .unreq vBme + .unreq vBba + .unreq vBbi + .unreq vBbo + .unreq vBbu + .unreq vBbe + .unreq vBsa + .unreq vBso + .unreq vBse + .unreq vBsi + .unreq vBsu + .unreq C0 + .unreq C1 + .unreq C2 + .unreq C3 + .unreq C4 + .unreq C0q + .unreq C1q + .unreq C2q + .unreq C3q + .unreq C4q + .unreq E1u + .unreq E3u + .unreq E0u + .unreq E2u + .unreq E4u + .unreq E1c + .unreq E3c + .unreq E0c + 
.unreq E2c + .unreq E4c + .unreq E1uq + .unreq E3uq + .unreq E0uq + .unreq E2uq + .unreq E4uq + .unreq E1cq + .unreq E3cq + .unreq E0cq + .unreq E2cq + .unreq E4cq +.endm + +/************************ MACROS ****************************/ + +.macro load_input + ldp Abaq, Abeq, [input_addr, #(2*8*0)] + ldp Abiq, Aboq, [input_addr, #(2*8*2)] + ldp Abuq, Agaq, [input_addr, #(2*8*4)] + ldp Ageq, Agiq, [input_addr, #(2*8*6)] + ldp Agoq, Aguq, [input_addr, #(2*8*8)] + ldp Akaq, Akeq, [input_addr, #(2*8*10)] + ldp Akiq, Akoq, [input_addr, #(2*8*12)] + ldp Akuq, Amaq, [input_addr, #(2*8*14)] + ldp Ameq, Amiq, [input_addr, #(2*8*16)] + ldp Amoq, Amuq, [input_addr, #(2*8*18)] + ldp Asaq, Aseq, [input_addr, #(2*8*20)] + ldp Asiq, Asoq, [input_addr, #(2*8*22)] + ldr Asuq, [input_addr, #(2*8*24)] +.endm + +.macro store_input + str Abaq, [input_addr, #(2*8*0)] + str Abeq, [input_addr, #(2*8*1)] + str Abiq, [input_addr, #(2*8*2)] + str Aboq, [input_addr, #(2*8*3)] + str Abuq, [input_addr, #(2*8*4)] + str Agaq, [input_addr, #(2*8*5)] + str Ageq, [input_addr, #(2*8*6)] + str Agiq, [input_addr, #(2*8*7)] + str Agoq, [input_addr, #(2*8*8)] + str Aguq, [input_addr, #(2*8*9)] + str Akaq, [input_addr, #(2*8*10)] + str Akeq, [input_addr, #(2*8*11)] + str Akiq, [input_addr, #(2*8*12)] + str Akoq, [input_addr, #(2*8*13)] + str Akuq, [input_addr, #(2*8*14)] + str Amaq, [input_addr, #(2*8*15)] + str Ameq, [input_addr, #(2*8*16)] + str Amiq, [input_addr, #(2*8*17)] + str Amoq, [input_addr, #(2*8*18)] + str Amuq, [input_addr, #(2*8*19)] + str Asaq, [input_addr, #(2*8*20)] + str Aseq, [input_addr, #(2*8*21)] + str Asiq, [input_addr, #(2*8*22)] + str Asoq, [input_addr, #(2*8*23)] + str Asuq, [input_addr, #(2*8*24)] +.endm + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + 
stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +/* Macros using v8.4-A SHA-3 instructions */ + +.macro eor3_m1_0 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor5 out i0 i1 i2 i3 i4 tmp + eor2 \out, \i0, \i1 + eor2 \tmp, \i3, \i4 + eor2 \out, \out, \i2 + eor2 \out, \out, \tmp +.endm + +.macro move d s + mov \d\().16b, \s\().16b +.endm + + +.macro eor3_m1_1 d s0 s1 s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro eor3_m1 d s0 s1 s2 + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +.macro rax1_m1 d s0 s1 + add tmp.2d, \s1\().2d, \s1\().2d + sri tmp.2d, \s1\().2d, #63 + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +.macro xar_m1 d s0 s1 imm + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) +.endm + +.macro xar_m1_0 d s0 s1 imm tmp + eor \tmp\().16b, \s0\().16b, \s1\().16b +.endm + +.macro xar_m1_1 d s0 s1 imm tmp + shl \d\().2d, \tmp\().2d, #(64-\imm) +.endm + +.macro xar_m1_2 d s0 s1 imm tmp + sri \d\().2d, \tmp\().2d, #(\imm) +.endm + +.macro bcax_m1 d s0 s1 s2 + bic tmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +.macro bcax_m1_d d s0 s1 s2 + bic \d\().16b, \s1\().16b, \s2\().16b + eor \d\().16b, \d\().16b, \s0\().16b +.endm + +.macro refresh d + mov \d\().16b, \d\().16b +.endm +/* Keccak-f1600 round */ + +.macro keccak_f1600_round_full out in + + eor5 C0, \in\()ka, \in\()ga, \in\()ma, \in\()ba, \in\()sa, tmp + eor5 C2, \in\()ki, \in\()gi, \in\()mi, \in\()bi, \in\()si, tmp + eor5 C4, \in\()ku, \in\()gu, \in\()mu, \in\()bu, \in\()su, tmp + eor5 C1, \in\()ke, \in\()ge, \in\()me, \in\()be, \in\()se, tmp + eor5 C3, \in\()ko, 
\in\()go, \in\()mo, \in\()bo, \in\()so, tmp + + rax1_m1 E1c, C0, C2 SEP save(E1c) + rax1_m1 E3c, C2, C4 SEP save(E3c) + rax1_m1 E0c, C4, C1 SEP save(E0c) + rax1_m1 E2c, C1, C3 SEP save(E2c) + rax1_m1 E4c, C3, C0 SEP save(E4c) + xar_m1 vBgo, \in\()me /* used at block 3 */, E1c, 19 + xar_m1 vBgi, \in\()ka /* used at block 2 */, E0c, 61 + xar_m1 vBga, \in\()bo /* used at block 4 */, E3c, 36 + xar_m1 vBge, \in\()gu /* used at block 1 */, E4c, 44 + xar_m1 vBgu, \in\()si /* used at block 5 */, E2c, 3 + + bcax_m1 \out\()ga, vBga, vBgi, vBge + bcax_m1 \out\()ge, vBge, vBgo, vBgi + bcax_m1 \out\()gi, vBgi, vBgu, vBgo + bcax_m1 \out\()go, vBgo, vBga, vBgu + bcax_m1 \out\()gu, vBgu, vBge, vBga + restore(E4u) + xar_m1 vBko, \in\()mu /* used at block 3 */, E4u, 56 SEP restore(E1u) + xar_m1 vBka, \in\()be /* used at block 4 */, E1u, 63 SEP restore(E2u) + xar_m1 vBke, \in\()gi /* not used */, E2u, 58 SEP restore(E0u) + xar_m1 vBku, \in\()sa /* used at block 5 */, E0u, 46 SEP restore(E3u) + xar_m1 vBki, \in\()ko /* used at block 2 */, E3u, 39 + + bcax_m1 \out\()ke, vBke, vBko, vBki + bcax_m1 \out\()ki, vBki, vBku, vBko + bcax_m1 \out\()ku, vBku, vBke, vBka + bcax_m1 \out\()ko, vBko, vBka, vBku + bcax_m1 \out\()ka, vBka, vBki, vBke + + // Can use: Abo, Asi, Abe, Asa; Abu, Aso + restore(E3u) + xar_m1 vBmu, \in\()so /* used at block 5 */, E3u, 8 SEP restore(E4u) + xar_m1 vBma, \in\()bu /* used at block 4 */, E4u, 37 SEP restore(E2u) + xar_m1 vBmo, \in\()mi /* used at block 3 */, E2u, 49 SEP restore(E1u) + xar_m1 vBmi, \in\()ke /* not used */, E1u, 54 SEP restore(E0u) + xar_m1 vBme, \in\()ga /* not used */, E0u, 28 + + bcax_m1 \out\()ma, vBma, vBmi, vBme + bcax_m1 \out\()mo, vBmo, vBma, vBmu + bcax_m1 \out\()me, vBme, vBmo, vBmi + bcax_m1 \out\()mi, vBmi, vBmu, vBmo + bcax_m1 \out\()mu, vBmu, vBme, vBma + + // Can use: Asi, Asa, Aso, Asu, Amo + restore(E0u) + eor2 vBba, \in\()ba /* used at block 4 */, E0u SEP restore(E2u) + xar_m1 vBbi, \in\()ki /* not used* */, E2u, 21 SEP restore(E3u) + xar_m1 vBbo, \in\()mo /* not used+ */, E3u, 43 
SEP restore(E4u) + xar_m1 vBbu, \in\()su /* used at block 5 */, E4u, 50 SEP restore(E1u) + xar_m1 vBbe, \in\()ge /* not used */, E1u, 20 + + bcax_m1 \out\()ba, vBba, vBbi, vBbe + ld1r {tmp.2d}, [const_addr], #8 + eor2 \out\()ba, \out\()ba, tmp + bcax_m1 \out\()be, vBbe, vBbo, vBbi + bcax_m1 \out\()bo, vBbo, vBba, vBbu + bcax_m1 \out\()bu, vBbu, vBbe, vBba + bcax_m1 \out\()bi, vBbi, vBbu, vBbo + + // Can use: Amo, Age, Abi, Ama, Ago, Aku + restore(E2u) + xar_m1 vBsa, \in\()bi /* not used+ */, E2u, 2 SEP restore(E0u) + xar_m1 vBso, \in\()ma /* not used+ */, E0u, 23 SEP restore(E3u) + xar_m1 vBse, \in\()go /* not used+ */, E3u, 9 SEP restore(E4u) + xar_m1 vBsi, \in\()ku /* not used */, E4u, 25 SEP restore(E1u) + xar_m1 vBsu, \in\()se /* used at block 5 */, E1u, 62 + + bcax_m1_d \out\()sa, vBsa, vBsi, vBse + bcax_m1_d \out\()se, vBse, vBso, vBsi + bcax_m1_d \out\()si, vBsi, vBsu, vBso + bcax_m1_d \out\()so, vBso, vBsa, vBsu + bcax_m1_d \out\()su, vBsu, vBse, vBsa + +.endm + +.macro keccak_f1600_round_pre out in + + eor5 C0, \in\()ka, \in\()ga, \in\()ma, \in\()ba, \in\()sa, tmp + eor5 C2, \in\()ki, \in\()gi, \in\()mi, \in\()bi, \in\()si, tmp + eor5 C4, \in\()ku, \in\()gu, \in\()mu, \in\()bu, \in\()su, tmp + eor5 C1, \in\()ke, \in\()ge, \in\()me, \in\()be, \in\()se, tmp + eor5 C3, \in\()ko, \in\()go, \in\()mo, \in\()bo, \in\()so, tmp + +.endm + +.macro keccak_f1600_round_post out in + + .unreq C0 + .unreq C1 + .unreq C2 + .unreq C3 + .unreq C4 + .unreq C0q + .unreq C1q + .unreq C2q + .unreq C3q + .unreq C4q + + C0 .req \out\()spare3 + C1 .req \out\()spare1 + C2 .req \out\()spare2 + C3 .req \out\()spare0 + C4 .req \out\()spare4 + C0q .req \out\()spare3q + C1q .req \out\()spare1q + C2q .req \out\()spare2q + C3q .req \out\()spare0q + C4q .req \out\()spare4q + + eor5 C0, \out\()ka, \out\()ga, \out\()ma, \out\()ba, \out\()sa, tmp + eor5 C2, \out\()ki, \out\()gi, \out\()mi, \out\()bi, \out\()si, tmp + eor5 C4, \out\()ku, \out\()gu, \out\()mu, \out\()bu, \out\()su, tmp + eor5 
C1, \out\()ke, \out\()ge, \out\()me, \out\()be, \out\()se, tmp + eor5 C3, \out\()ko, \out\()go, \out\()mo, \out\()bo, \out\()so, tmp + +.endm +.macro keccak_f1600_round_core out in + + rax1_m1 E1c, C0, C2 SEP save(E1c) + xar_m1 vBgo, \in\()me /* used at block 3 */, E1c, 19 + rax1_m1 E3c, C2, C4 SEP save(E3c) + rax1_m1 E0c, C4, C1 SEP save(E0c) + xar_m1 vBgi, \in\()ka /* used at block 2 */, E0c, 61 + rax1_m1 E2c, C1, C3 SEP save(E2c) + xar_m1 vBga, \in\()bo /* used at block 4 */, E3c, 36 + rax1_m1 E4c, C3, C0 SEP save(E4c) + + xar_m1 vBge, \in\()gu /* used at block 1 */, E4c, 44 + xar_m1 vBgu, \in\()si /* used at block 5 */, E2c, 3 + + bcax_m1 \out\()ga, vBga, vBgi, vBge + bcax_m1 \out\()ge, vBge, vBgo, vBgi + bcax_m1 \out\()gi, vBgi, vBgu, vBgo + bcax_m1 \out\()go, vBgo, vBga, vBgu + bcax_m1 \out\()gu, vBgu, vBge, vBga + restore(E4u) + xar_m1 vBko, \in\()mu /* used at block 3 */, E4u, 56 SEP restore(E1u) + xar_m1 vBka, \in\()be /* used at block 4 */, E1u, 63 SEP restore(E2u) + xar_m1 vBke, \in\()gi /* not used */, E2u, 58 SEP restore(E0u) + xar_m1 vBku, \in\()sa /* used at block 5 */, E0u, 46 SEP restore(E3u) + xar_m1 vBki, \in\()ko /* used at block 2 */, E3u, 39 + + bcax_m1 \out\()ke, vBke, vBko, vBki + bcax_m1 \out\()ki, vBki, vBku, vBko + bcax_m1 \out\()ku, vBku, vBke, vBka + bcax_m1 \out\()ko, vBko, vBka, vBku + bcax_m1 \out\()ka, vBka, vBki, vBke + + // Can use: Abo, Asi, Abe, Asa; Abu, Aso + restore(E3u) + xar_m1 vBmu, \in\()so /* used at block 5 */, E3u, 8 SEP restore(E4u) + xar_m1 vBma, \in\()bu /* used at block 4 */, E4u, 37 SEP restore(E2u) + xar_m1 vBmo, \in\()mi /* used at block 3 */, E2u, 49 SEP restore(E1u) + xar_m1 vBmi, \in\()ke /* not used */, E1u, 54 SEP restore(E0u) + xar_m1 vBme, \in\()ga /* not used */, E0u, 28 + + bcax_m1 \out\()ma, vBma, vBmi, vBme + bcax_m1 \out\()mo, vBmo, vBma, vBmu + bcax_m1 \out\()me, vBme, vBmo, vBmi + bcax_m1 \out\()mi, vBmi, vBmu, vBmo + bcax_m1 \out\()mu, vBmu, vBme, vBma + + // Can use: Asi, Asa, Aso, Asu, Amo + 
restore(E0u) + eor2 vBba, \in\()ba /* used at block 4 */, E0u SEP restore(E2u) + xar_m1 vBbi, \in\()ki /* not used* */, E2u, 21 SEP restore(E3u) + xar_m1 vBbo, \in\()mo /* not used+ */, E3u, 43 SEP restore(E4u) + xar_m1 vBbu, \in\()su /* used at block 5 */, E4u, 50 SEP restore(E1u) + xar_m1 vBbe, \in\()ge /* not used */, E1u, 20 + + bcax_m1 \out\()ba, vBba, vBbi, vBbe + ld1r {tmp.2d}, [const_addr], #8 + eor2 \out\()ba, \out\()ba, tmp + bcax_m1 \out\()be, vBbe, vBbo, vBbi + bcax_m1 \out\()bo, vBbo, vBba, vBbu + bcax_m1 \out\()bu, vBbu, vBbe, vBba + bcax_m1 \out\()bi, vBbi, vBbu, vBbo + + // Can use: Amo, Age, Abi, Ama, Ago, Aku + restore(E2u) + xar_m1 vBsa, \in\()bi /* not used+ */, E2u, 2 SEP restore(E0u) + xar_m1 vBso, \in\()ma /* not used+ */, E0u, 23 SEP restore(E3u) + xar_m1 vBse, \in\()go /* not used+ */, E3u, 9 SEP restore(E4u) + xar_m1 vBsi, \in\()ku /* not used */, E4u, 25 SEP restore(E1u) + xar_m1 vBsu, \in\()se /* used at block 5 */, E1u, 62 + + bcax_m1_d \out\()sa, vBsa, vBsi, vBse + bcax_m1_d \out\()se, vBse, vBso, vBsi + bcax_m1_d \out\()si, vBsi, vBsu, vBso + bcax_m1_d \out\()so, vBso, vBsa, vBsu + bcax_m1_d \out\()su, vBsu, vBse, vBsa + +.endm + +.macro keccak_f1600_round_first out in + keccak_f1600_round_pre \out, \in + keccak_f1600_round_core \out, \in + keccak_f1600_round_post \out, \in +.endm + +.macro keccak_f1600_round_inner out in + keccak_f1600_round_core \out, \in + keccak_f1600_round_post \out, \in +.endm + +.macro keccak_f1600_round_last out in + keccak_f1600_round_core \out, \in +.endm + +.macro keccak_f1600_round_inner_optim out in + + rax1_m1 E1c, C0, C2 SEP save(E1c) + xar_m1 vBgo, \in\()me /* used at block 3 */, E1c, 19 + rax1_m1 E3c, C2, C4 SEP save(E3c) + rax1_m1 E0c, C4, C1 SEP save(E0c) + xar_m1 vBgi, \in\()ka /* used at block 2 */, E0c, 61 + rax1_m1 E2c, C1, C3 SEP save(E2c) + xar_m1 vBga, \in\()bo /* used at block 4 */, E3c, 36 + rax1_m1 E4c, C3, C0 SEP save(E4c) + + xar_m1 vBge, \in\()gu /* used at block 1 */, E4c, 44 + xar_m1 
vBgu, \in\()si /* used at block 5 */, E2c, 3 + + bcax_m1 \out\()ga, vBga, vBgi, vBge + bcax_m1 \out\()ge, vBge, vBgo, vBgi + bcax_m1 \out\()gi, vBgi, vBgu, vBgo + bcax_m1 \out\()go, vBgo, vBga, vBgu + bcax_m1 \out\()gu, vBgu, vBge, vBga + restore(E4u) + xar_m1 vBko, \in\()mu /* used at block 3 */, E4u, 56 SEP restore(E1u) + xar_m1 vBka, \in\()be /* used at block 4 */, E1u, 63 SEP restore(E2u) + xar_m1 vBke, \in\()gi /* not used */, E2u, 58 SEP restore(E0u) + xar_m1 vBku, \in\()sa /* used at block 5 */, E0u, 46 SEP restore(E3u) + xar_m1 vBki, \in\()ko /* used at block 2 */, E3u, 39 + + bcax_m1 \out\()ke, vBke, vBko, vBki + bcax_m1 \out\()ki, vBki, vBku, vBko + bcax_m1 \out\()ku, vBku, vBke, vBka + bcax_m1 \out\()ko, vBko, vBka, vBku + bcax_m1 \out\()ka, vBka, vBki, vBke + + // Can use: Abo, Asi, Abe, Asa; Abu, Aso + restore(E3u) + xar_m1 vBmu, \in\()so /* used at block 5 */, E3u, 8 SEP restore(E4u) + xar_m1 vBma, \in\()bu /* used at block 4 */, E4u, 37 SEP restore(E2u) + xar_m1 vBmo, \in\()mi /* used at block 3 */, E2u, 49 SEP restore(E1u) + xar_m1 vBmi, \in\()ke /* not used */, E1u, 54 SEP restore(E0u) + xar_m1 vBme, \in\()ga /* not used */, E0u, 28 + + bcax_m1 \out\()ma, vBma, vBmi, vBme + bcax_m1 \out\()mo, vBmo, vBma, vBmu + bcax_m1 \out\()me, vBme, vBmo, vBmi + bcax_m1 \out\()mi, vBmi, vBmu, vBmo + bcax_m1 \out\()mu, vBmu, vBme, vBma + + // Can use: Asi, Asa, Aso, Asu, Amo + restore(E0u) + eor2 vBba, \in\()ba /* used at block 4 */, E0u SEP restore(E2u) + xar_m1 vBbi, \in\()ki /* not used* */, E2u, 21 SEP restore(E3u) + xar_m1 vBbo, \in\()mo /* not used+ */, E3u, 43 SEP restore(E4u) + xar_m1 vBbu, \in\()su /* used at block 5 */, E4u, 50 SEP restore(E1u) + xar_m1 vBbe, \in\()ge /* not used */, E1u, 20 + + .unreq C0 + .unreq C0q + C0 .req \out\()spare3 + C0q .req \out\()spare3q + + eor2 C0, \out\()ka, \out\()ga + eor2 C0, C0, \out\()ma + + bcax_m1 \out\()ba, vBba, vBbi, vBbe + ld1r {tmp.2d}, [const_addr], #8 + eor2 \out\()ba, \out\()ba, tmp + bcax_m1 \out\()be, 
vBbe, vBbo, vBbi + bcax_m1 \out\()bo, vBbo, vBba, vBbu + bcax_m1 \out\()bu, vBbu, vBbe, vBba + bcax_m1 \out\()bi, vBbi, vBbu, vBbo + + eor2 C0, C0, \out\()ba + + // Can use: Amo, Age, Abi, Ama, Ago, Aku + restore(E2u) + xar_m1 vBsa, \in\()bi /* not used+ */, E2u, 2 SEP restore(E0u) + xar_m1 vBso, \in\()ma /* not used+ */, E0u, 23 SEP restore(E3u) + xar_m1 vBse, \in\()go /* not used+ */, E3u, 9 SEP restore(E4u) + xar_m1 vBsi, \in\()ku /* not used */, E4u, 25 SEP restore(E1u) + + .unreq C2 + .unreq C2q + C2 .req \out\()spare2 + C2q .req \out\()spare2q + + eor2 C2, \out\()ki, \out\()gi + eor2 C2, C2, \out\()mi + eor2 C2, C2, \out\()bi + + xar_m1 vBsu, \in\()se /* used at block 5 */, E1u, 62 + + bcax_m1_d \out\()sa, vBsa, vBsi, vBse + eor2 C0, C0, \out\()sa + bcax_m1_d \out\()se, vBse, vBso, vBsi + bcax_m1_d \out\()si, vBsi, vBsu, vBso + eor2 C2, C2, \out\()si + bcax_m1_d \out\()so, vBso, vBsa, vBsu + bcax_m1_d \out\()su, vBsu, vBse, vBsa + + .unreq C1 + .unreq C1q + C1 .req \out\()spare1 + C1q .req \out\()spare1q + + + .unreq C3 + .unreq C4 + .unreq C3q + .unreq C4q + + C3 .req \out\()spare0 + C4 .req \out\()spare4 + C3q .req \out\()spare0q + C4q .req \out\()spare4q + +// eor5 C0, \out\()ka, \out\()ga, \out\()ma, \out\()ba, \out\()sa, tmp +// eor5 C2, \out\()ki, \out\()gi, \out\()mi, \out\()bi, \out\()si, tmp + eor5 C4, \out\()ku, \out\()gu, \out\()mu, \out\()bu, \out\()su, tmp + eor5 C1, \out\()ke, \out\()ge, \out\()me, \out\()be, \out\()se, tmp + eor5 C3, \out\()ko, \out\()go, \out\()mo, \out\()bo, \out\()so, tmp + +.endm + +.text +.align 4 +.global keccak_f1600_x2_v84a_asm_v2p6 +.global _keccak_f1600_x2_v84a_asm_v2p6 + +#define KECCAK_F1600_ROUNDS 24 + +keccak_f1600_x2_v84a_asm_v2p6: +_keccak_f1600_x2_v84a_asm_v2p6: + alloc_stack + save_vregs + load_constant_ptr + load_input + + + //mov count, #(KECCAK_F1600_ROUNDS-2) + mov count, #24 +loop: + declare_remappings A1, A + keccak_f1600_round_first A1, A +// keccak_f1600_round_pre A1 A +// keccak_f1600_round_core A1 A 
+// keccak_f1600_round_post A1 A + undeclare_remappings A1, A + + declare_remappings A2, A1 +// keccak_f1600_round_pre A2 A1 +// keccak_f1600_round_core A2 A1 +// keccak_f1600_round_post A2 A1 + keccak_f1600_round_inner_optim A2, A1 + undeclare_remappings A2, A1 + + declare_remappings A3, A2 +// keccak_f1600_round_pre A3 A2 +// keccak_f1600_round_core A3 A2 +// keccak_f1600_round_post A3 A2 + keccak_f1600_round_inner_optim A3, A2 + undeclare_remappings A3, A2 + + declare_remappings A4, A3 +// keccak_f1600_round_pre A4 A3 + keccak_f1600_round_last A4, A3 +// keccak_f1600_round_post A4 A3 + undeclare_remappings A4, A3 + + transfer_uncommon A, A4 + + sub count, count, #4 + cbnz count, loop + + store_input + restore_vregs + free_stack + ret diff --git a/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2pp0.s b/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2pp0.s new file mode 100644 index 0000000..2a994b5 --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2pp0.s @@ -0,0 +1,729 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. 
*/ + Aba .req v0 + Abe .req v1 + Abi .req v2 + Abo .req v3 + Abu .req v4 + Aga .req v5 + Age .req v6 + Agi .req v7 + Ago .req v8 + Agu .req v9 + Aka .req v10 + Ake .req v11 + Aki .req v12 + Ako .req v13 + Aku .req v14 + Ama .req v15 + Ame .req v16 + Ami .req v17 + Amo .req v18 + Amu .req v19 + Asa .req v20 + Ase .req v21 + Asi .req v22 + Aso .req v23 + Asu .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Abiq .req q2 + Aboq .req q3 + Abuq .req q4 + Agaq .req q5 + Ageq .req q6 + Agiq .req q7 + Agoq .req q8 + Aguq .req q9 + Akaq .req q10 + Akeq .req q11 + Akiq .req q12 + Akoq .req q13 + Akuq .req q14 + Amaq .req q15 + Ameq .req q16 + Amiq .req q17 + Amoq .req q18 + Amuq .req q19 + Asaq .req q20 + Aseq .req q21 + Asiq .req q22 + Asoq .req q23 + Asuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v27 + C1 .req v28 + C2 .req v29 + C3 .req v30 + C4 .req v31 + + C0q .req q27 + C1q .req q28 + C2q .req q29 + C3q .req q30 + C4q .req q31 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vBba .req v25 // fresh + vBbe .req v26 // fresh + vBbi .req Abi + vBbo .req Abo + vBbu .req Abu + vBga .req Aka + vBge .req Ake + vBgi .req Agi + vBgo .req Ago + vBgu .req Agu + vBka .req Ama + vBke .req Ame + vBki .req Aki + vBko .req Ako + vBku .req Aku + vBma .req Asa + vBme .req Ase + vBmi .req Ami + vBmo .req Amo + vBmu .req Amu + vBsa .req Aba + vBse .req Abe + vBsi .req Asi + vBso .req Aso + vBsu .req Asu + + vBbaq .req q25 // fresh + vBbeq .req q26 // fresh + vBbiq .req Abiq + vBboq .req Aboq + vBbuq .req Abuq + vBgaq .req Akaq + vBgeq .req Akeq + vBgiq .req Agiq + vBgoq .req Agoq + vBguq .req Aguq + vBkaq .req Amaq + vBkeq .req Ameq + vBkiq .req Akiq + vBkoq .req Akoq + vBkuq .req Akuq + vBmaq .req Asaq + vBmeq .req Aseq + vBmiq .req Amiq + vBmoq .req Amoq + vBmuq .req Amuq + vBsaq .req Abaq + vBseq .req Abeq + vBsiq .req Asiq + vBsoq .req Asoq + vBsuq .req Asuq + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 
0..4 */ + E0 .req C4 + E1 .req C0 + E2 .req vBbe // fresh + E3 .req C2 + E4 .req C3 + + E0q .req C4q + E1q .req C0q + E2q .req vBbeq // fresh + E3q .req C2q + E4q .req C3q + + +/************************ MACROS ****************************/ + +.macro load_input + ldp Abaq, Abeq, [input_addr, #(2*8*0)] + ldp Abiq, Aboq, [input_addr, #(2*8*2)] + ldp Abuq, Agaq, [input_addr, #(2*8*4)] + ldp Ageq, Agiq, [input_addr, #(2*8*6)] + ldp Agoq, Aguq, [input_addr, #(2*8*8)] + ldp Akaq, Akeq, [input_addr, #(2*8*10)] + ldp Akiq, Akoq, [input_addr, #(2*8*12)] + ldp Akuq, Amaq, [input_addr, #(2*8*14)] + ldp Ameq, Amiq, [input_addr, #(2*8*16)] + ldp Amoq, Amuq, [input_addr, #(2*8*18)] + ldp Asaq, Aseq, [input_addr, #(2*8*20)] + ldp Asiq, Asoq, [input_addr, #(2*8*22)] + ldr Asuq, [input_addr, #(2*8*24)] + + // ldr Abaq, [input_addr, #(2*8*0)] + // ldr Abeq, [input_addr, #(2*8*1)] + // ldr Abiq, [input_addr, #(2*8*2)] + // ldr Aboq, [input_addr, #(2*8*3)] + // ldr Abuq, [input_addr, #(2*8*4)] + // ldr Agaq, [input_addr, #(2*8*5)] + // ldr Ageq, [input_addr, #(2*8*6)] + // ldr Agiq, [input_addr, #(2*8*7)] + // ldr Agoq, [input_addr, #(2*8*8)] + // ldr Aguq, [input_addr, #(2*8*9)] + // ldr Akaq, [input_addr, #(2*8*10)] + // ldr Akeq, [input_addr, #(2*8*11)] + // ldr Akiq, [input_addr, #(2*8*12)] + // ldr Akoq, [input_addr, #(2*8*13)] + // ldr Akuq, [input_addr, #(2*8*14)] + // ldr Amaq, [input_addr, #(2*8*15)] + // ldr Ameq, [input_addr, #(2*8*16)] + // ldr Amiq, [input_addr, #(2*8*17)] + // ldr Amoq, [input_addr, #(2*8*18)] + // ldr Amuq, [input_addr, #(2*8*19)] + // ldr Asaq, [input_addr, #(2*8*20)] + // ldr Aseq, [input_addr, #(2*8*21)] + // ldr Asiq, [input_addr, #(2*8*22)] + // ldr Asoq, [input_addr, #(2*8*23)] + // ldr Asuq, [input_addr, #(2*8*24)] +.endm + +.macro store_input + str Abaq, [input_addr, #(2*8*0)] + str Abeq, [input_addr, #(2*8*1)] + str Abiq, [input_addr, #(2*8*2)] + str Aboq, [input_addr, #(2*8*3)] + str Abuq, [input_addr, #(2*8*4)] + str Agaq, [input_addr, 
#(2*8*5)] + str Ageq, [input_addr, #(2*8*6)] + str Agiq, [input_addr, #(2*8*7)] + str Agoq, [input_addr, #(2*8*8)] + str Aguq, [input_addr, #(2*8*9)] + str Akaq, [input_addr, #(2*8*10)] + str Akeq, [input_addr, #(2*8*11)] + str Akiq, [input_addr, #(2*8*12)] + str Akoq, [input_addr, #(2*8*13)] + str Akuq, [input_addr, #(2*8*14)] + str Amaq, [input_addr, #(2*8*15)] + str Ameq, [input_addr, #(2*8*16)] + str Amiq, [input_addr, #(2*8*17)] + str Amoq, [input_addr, #(2*8*18)] + str Amuq, [input_addr, #(2*8*19)] + str Asaq, [input_addr, #(2*8*20)] + str Aseq, [input_addr, #(2*8*21)] + str Asiq, [input_addr, #(2*8*22)] + str Asoq, [input_addr, #(2*8*23)] + str Asuq, [input_addr, #(2*8*24)] +.endm + +#define STACK_SIZE (16*4 + 16*31) +#define STACK_BASE_VREGS 0 +#define STACK_BASE_TMP 16*4 + +#define Aga_offset 0 +#define E0_offset 1 +#define E1_offset 2 +#define E2_offset 3 +#define E3_offset 4 +#define E4_offset 5 +#define Ame_offset 7 +#define Agi_offset 8 +#define Aka_offset 9 +#define Abo_offset 10 +#define Amo_offset 11 +#define Ami_offset 12 +#define Ake_offset 13 +#define Agu_offset 14 +#define Asi_offset 15 +#define Aku_offset 16 +#define Asa_offset 17 +#define Abu_offset 18 +#define Asu_offset 19 +#define Ase_offset 20 +//#define Aga_offset 21 +#define Age_offset 22 +#define vBgo_offset 23 +#define vBke_offset 24 +#define vBgi_offset 25 +#define vBga_offset 26 +#define vBbo_offset 27 +#define vBmo_offset 28 +#define vBmi_offset 29 +#define vBge_offset 30 + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + 
+.macro restore_vregs
+    ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)]   // reload d8-d15 from the slots written by save_vregs
+    ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)]
+    ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)]
+    ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)]
+.endm
+
+/* Plain-Neon stand-ins (eor/add/shl/sri/bic) for the v8.4-A SHA-3 operations EOR3, RAX1, XAR and BCAX */
+/* eor3_m1_0: first half of a three-way XOR, d = s0 ^ s1 (s2 is not read here). */
+.macro eor3_m1_0 d s0 s1 s2
+    eor \d\().16b, \s0\().16b, \s1\().16b
+.endm
+/* eor2: two-way XOR, d = s0 ^ s1. */
+.macro eor2 d s0 s1
+    eor \d\().16b, \s0\().16b, \s1\().16b
+.endm
+/* eor3_m1_1: second half of a three-way XOR, d ^= s2 (s0 and s1 are not read here). */
+.macro eor3_m1_1 d s0 s1 s2
+    eor \d\().16b, \d\().16b, \s2\().16b
+.endm
+/* eor3_m1: d = s0 ^ s1 ^ s2 built from the two halves; d must not be the same register as s2. */
+.macro eor3_m1 d s0 s1 s2
+    eor3_m1_0 \d, \s0, \s1, \s2
+    eor3_m1_1 \d, \s0, \s1, \s2
+.endm
+/* rax1_m1: d = s0 ^ rol64(s1, 1) per 64-bit lane; scratch is the register the caller bound to tmp via .req. */
+.macro rax1_m1 d s0 s1
+    add tmp.2d, \s1\().2d, \s1\().2d        // s1 + s1 == s1 << 1
+    sri tmp.2d, \s1\().2d, #63              // insert s1 >> 63 into bit 0, keeping the other 63 bits
+    eor \d\().16b, tmp.16b, \s0\().16b
+.endm
+/* xar_m1: d = ror64(s0 ^ s1, imm) per lane. Overwrites s0 with s0 ^ s1, so d must differ from s0. */
+.macro xar_m1 d s0 s1 imm
+    eor \s0\().16b, \s0\().16b, \s1\().16b  // s0 is used as the temporary
+    shl \d\().2d, \s0\().2d, #(64-\imm)
+    sri \d\().2d, \s0\().2d, #(\imm)
+.endm
+/* xar_m1_0/_1/_2: the same xor-and-rotate split into three steps through an explicit tmp argument; s0 stays intact. */
+.macro xar_m1_0 d s0 s1 imm tmp
+    eor \tmp\().16b, \s0\().16b, \s1\().16b
+.endm
+
+.macro xar_m1_1 d s0 s1 imm tmp
+    shl \d\().2d, \tmp\().2d, #(64-\imm)
+.endm
+
+.macro xar_m1_2 d s0 s1 imm tmp
+    sri \d\().2d, \tmp\().2d, #(\imm)
+.endm
+/* bcax_m1: d = s0 ^ (s1 & ~s2); scratch is the register currently bound to tmp. */
+.macro bcax_m1 d s0 s1 s2
+    bic tmp.16b, \s1\().16b, \s2\().16b
+    eor \d\().16b, tmp.16b, \s0\().16b
+.endm
+/* refresh: moves a register onto itself; not invoked by the round macros or the entry point that follow. */
+.macro refresh d
+    mov \d\().16b, \d\().16b
+.endm
+/* Keccak-f1600 round */
+/* round_pre: column parities C0..C4, each the XOR of the five state lanes whose name ends in a, e, i, o, u. */
+.macro keccak_f1600_round_pre
+
+    /* 10 EOR3, so 20 individual EOR */
+
+    eor3_m1_0 C0, Aba, Aga, Aka             // the _0 and _1 halves are interleaved across the five columns
+    eor3_m1_0 C1, Abe, Age, Ake
+    eor3_m1_0 C2, Abi, Agi, Aki
+    eor3_m1_0 C3, Abo, Ago, Ako
+    eor3_m1_0 C4, Abu, Agu, Aku
+    eor3_m1_1 C0, Aba, Aga, Aka
+    eor3_m1_1 C1, Abe, Age, Ake
+    eor3_m1_1 C2, Abi, Agi, Aki
+    eor3_m1_1 C3, Abo, Ago, Ako
+    eor3_m1_1 C4, Abu, Agu, Aku
+    eor3_m1_0 C0, C0, Ama, Asa              // fold rows m and s into the running parity
+    eor3_m1_0 C1, C1, Ame, Ase
+    eor3_m1_0 C2, C2, Ami, Asi
+    eor3_m1_0 C3, C3, Amo, Aso
+    eor3_m1_0 C4, C4, Amu, Asu
+    eor3_m1_1 C0, C0, Ama, Asa
+    eor3_m1_1 C1, C1, Ame, Ase
+    eor3_m1_1 C2, C2, Ami, Asi
+    eor3_m1_1 C3, C3, Amo, Aso
+    eor3_m1_1 C4, C4, Amu, Asu
+
+.endm
+
+.macro 
keccak_f1600_round + + /* 10 EOR3, so 20 individual EOR */ + + eor3_m1_0 C0, Aba, Aga, Aka + eor3_m1_0 C1, Abe, Age, Ake + eor3_m1_0 C2, Abi, Agi, Aki + eor3_m1_0 C3, Abo, Ago, Ako + eor3_m1_0 C4, Abu, Agu, Aku + eor3_m1_1 C0, Aba, Aga, Aka + eor3_m1_1 C1, Abe, Age, Ake + eor3_m1_1 C2, Abi, Agi, Aki + eor3_m1_1 C3, Abo, Ago, Ako + eor3_m1_1 C4, Abu, Agu, Aku + eor3_m1_0 C0, C0, Ama, Asa + eor3_m1_0 C1, C1, Ame, Ase + eor3_m1_0 C2, C2, Ami, Asi + eor3_m1_0 C3, C3, Amo, Aso + eor3_m1_0 C4, C4, Amu, Asu + eor3_m1_1 C0, C0, Ama, Asa + eor3_m1_1 C1, C1, Ame, Ase + eor3_m1_1 C2, C2, Ami, Asi + eor3_m1_1 C3, C3, Amo, Aso + eor3_m1_1 C4, C4, Amu, Asu + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + .unreq tmp + + /* 25x XAR, 75 in total */ + + tmp .req C1 + + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m1 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + xar_m1 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m1 vBme, Aga, E0, 28 + xar_m1 vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Aga, vBga, vBgi, vBge + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + bcax_m1 Aku, vBku, vBke, vBka + bcax_m1 Ama, vBma, vBmi, vBme + bcax_m1 Ame, vBme, vBmo, vBmi + bcax_m1 Ami, vBmi, 
vBmu, vBmo + bcax_m1 Amo, vBmo, vBma, vBmu + bcax_m1 Amu, vBmu, vBme, vBma + bcax_m1 Asa, vBsa, vBsi, vBse + bcax_m1 Ase, vBse, vBso, vBsi + bcax_m1 Asi, vBsi, vBsu, vBso + bcax_m1 Aso, vBso, vBsa, vBsu + bcax_m1 Asu, vBsu, vBse, vBsa + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + bcax_m1 Abu, vBbu, vBbe, vBba + + // iota step + ld1r {tmp.2d}, [const_addr], #8 + eor Aba.16b, Aba.16b, tmp.16b + + .unreq tmp + +.endm + +.macro keccak_f1600_round_core + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + + /* 25x XAR, 75 in total */ + + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m1 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + xar_m1 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m1 vBme, Aga, E0, 28 + xar_m1 vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + .unreq tmp + tmp .req C1 + bcax_m1 Aga, vBga, vBgi, vBge + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + .unreq tmp + + eor2 C0, Aka, Aga + save(Aga) + + tmp .req Aga + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + eor2 C1, Ake, Age + bcax_m1 Aku, vBku, vBke, vBka + eor2 C2, Aki, Agi + bcax_m1 Ama, vBma, vBmi, vBme + eor2 C3, Ako, Ago + bcax_m1 Ame, vBme, vBmo, vBmi + eor2 C4, Aku, 
Agu + bcax_m1 Ami, vBmi, vBmu, vBmo + eor2 C0, C0, Ama + bcax_m1 Amo, vBmo, vBma, vBmu + eor2 C1, C1, Ame + bcax_m1 Amu, vBmu, vBme, vBma + eor2 C2, C2, Ami + bcax_m1 Asa, vBsa, vBsi, vBse + eor2 C3, C3, Amo + bcax_m1 Ase, vBse, vBso, vBsi + eor2 C4, C4, Amu + bcax_m1 Asi, vBsi, vBsu, vBso + eor2 C0, C0, Asa + bcax_m1 Aso, vBso, vBsa, vBsu + eor2 C1, C1, Ase + bcax_m1 Asu, vBsu, vBse, vBsa + eor2 C2, C2, Asi + eor2 C3, C3, Aso + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + + // iota step + ld1r {tmp.2d}, [const_addr], #8 + eor Aba.16b, Aba.16b, tmp.16b + eor2 C4, C4, Asu + + eor2 C0, C0, Aba + bcax_m1 Abi, vBbi, vBbu, vBbo + eor2 C1, C1, Abe + bcax_m1 Abo, vBbo, vBba, vBbu + eor2 C2, C2, Abi + bcax_m1 Abu, vBbu, vBbe, vBba + eor2 C3, C3, Abo + eor2 C4, C4, Abu + + restore(Aga) + .unreq tmp + +.endm + +.macro keccak_f1600_round_post + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + .unreq tmp + + /* 25x XAR, 75 in total */ + + tmp .req C1 + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m1 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + xar_m1 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m1 vBme, Aga, E0, 28 + xar_m1 vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Aga, vBga, vBgi, vBge + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + 
bcax_m1 Ake, vBke, vBko, vBki
+    bcax_m1 Aki, vBki, vBku, vBko
+    bcax_m1 Ako, vBko, vBka, vBku
+    bcax_m1 Aku, vBku, vBke, vBka
+    bcax_m1 Ama, vBma, vBmi, vBme
+    bcax_m1 Ame, vBme, vBmo, vBmi
+    bcax_m1 Ami, vBmi, vBmu, vBmo
+    bcax_m1 Amo, vBmo, vBma, vBmu
+    bcax_m1 Amu, vBmu, vBme, vBma
+    bcax_m1 Asa, vBsa, vBsi, vBse
+    bcax_m1 Ase, vBse, vBso, vBsi
+    bcax_m1 Asi, vBsi, vBsu, vBso
+    bcax_m1 Aso, vBso, vBsa, vBsu
+    bcax_m1 Asu, vBsu, vBse, vBsa
+    bcax_m1 Aba, vBba, vBbi, vBbe
+    bcax_m1 Abe, vBbe, vBbo, vBbi
+    bcax_m1 Abi, vBbi, vBbu, vBbo
+    bcax_m1 Abo, vBbo, vBba, vBbu
+    bcax_m1 Abu, vBbu, vBbe, vBba
+
+    // iota step
+    ld1r {tmp.2d}, [const_addr], #8
+    eor Aba.16b, Aba.16b, tmp.16b
+
+    .unreq tmp
+
+.endm
+
+
+.text
+.align 4
+.global keccak_f1600_x2_v84a_asm_v2pp0
+.global _keccak_f1600_x2_v84a_asm_v2pp0
+/* Entry point: runs KECCAK_F1600_ROUNDS rounds of Keccak-f1600 in place on the 25 x 16-byte state block at input_addr. */
+#define KECCAK_F1600_ROUNDS 24
+/* round_pre only seeds the column parities; rounds run = 2*count (loop) + 1 (peeled round_core) + 1 (round_post). */
+keccak_f1600_x2_v84a_asm_v2pp0:
+_keccak_f1600_x2_v84a_asm_v2pp0:
+    alloc_stack
+    save_vregs
+    load_constant_ptr                       // defined outside this chunk; presumably points const_addr at the round constants (TODO confirm)
+    load_input
+
+    // Loop passes are derived from the round count instead of a hard-coded 11; KECCAK_F1600_ROUNDS must be even.
+    mov count, #((KECCAK_F1600_ROUNDS-2)/2)
+    keccak_f1600_round_pre                  // column parities for the first round
+loop:
+    keccak_f1600_round_core                 // each core finishes one round and computes the parities for the next
+    keccak_f1600_round_core
+    sub count, count, #1
+    cbnz count, loop
+
+    keccak_f1600_round_core                 // peeled second-to-last round
+    keccak_f1600_round_post                 // last round, computes no parities for a successor
+    store_input
+    restore_vregs
+    free_stack
+    ret
diff --git a/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2pp1.s b/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2pp1.s
new file mode 100644
index 0000000..f8650ed
--- /dev/null
+++ b/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2pp1.s
@@ -0,0 +1,755 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited
+ * Copyright (c) 2022 Matthias Kannwischer
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of 
the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .align(8) +_round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000080008081 
+ .quad 0x8000000000008009 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Kecck-f1600 state to vector registers + * at the beginning and end of each round. 
*/ + Aba .req v0 + Abe .req v1 + Abi .req v2 + Abo .req v3 + Abu .req v4 + Aga .req v5 + Age .req v6 + Agi .req v7 + Ago .req v8 + Agu .req v9 + Aka .req v10 + Ake .req v11 + Aki .req v12 + Ako .req v13 + Aku .req v14 + Ama .req v15 + Ame .req v16 + Ami .req v17 + Amo .req v18 + Amu .req v19 + Asa .req v20 + Ase .req v21 + Asi .req v22 + Aso .req v23 + Asu .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Abiq .req q2 + Aboq .req q3 + Abuq .req q4 + Agaq .req q5 + Ageq .req q6 + Agiq .req q7 + Agoq .req q8 + Aguq .req q9 + Akaq .req q10 + Akeq .req q11 + Akiq .req q12 + Akoq .req q13 + Akuq .req q14 + Amaq .req q15 + Ameq .req q16 + Amiq .req q17 + Amoq .req q18 + Amuq .req q19 + Asaq .req q20 + Aseq .req q21 + Asiq .req q22 + Asoq .req q23 + Asuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v27 + C1 .req v28 + C2 .req v29 + C3 .req v30 + C4 .req v31 + + C0q .req q27 + C1q .req q28 + C2q .req q29 + C3q .req q30 + C4q .req q31 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vBba .req v25 // fresh + vBbe .req v26 // fresh + vBbi .req Abi + vBbo .req Abo + vBbu .req Abu + vBga .req Aka + vBge .req Ake + vBgi .req Agi + vBgo .req Ago + vBgu .req Agu + vBka .req Ama + vBke .req Ame + vBki .req Aki + vBko .req Ako + vBku .req Aku + vBma .req Asa + vBme .req Ase + vBmi .req Ami + vBmo .req Amo + vBmu .req Amu + vBsa .req Aba + vBse .req Abe + vBsi .req Asi + vBso .req Aso + vBsu .req Asu + + vBbaq .req q25 // fresh + vBbeq .req q26 // fresh + vBbiq .req Abiq + vBboq .req Aboq + vBbuq .req Abuq + vBgaq .req Akaq + vBgeq .req Akeq + vBgiq .req Agiq + vBgoq .req Agoq + vBguq .req Aguq + vBkaq .req Amaq + vBkeq .req Ameq + vBkiq .req Akiq + vBkoq .req Akoq + vBkuq .req Akuq + vBmaq .req Asaq + vBmeq .req Aseq + vBmiq .req Amiq + vBmoq .req Amoq + vBmuq .req Amuq + vBsaq .req Abaq + vBseq .req Abeq + vBsiq .req Asiq + vBsoq .req Asoq + vBsuq .req Asuq + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 
0..4 */ + E0 .req C4 + E1 .req C0 + E2 .req vBbe // fresh + E3 .req C2 + E4 .req C3 + + E0q .req C4q + E1q .req C0q + E2q .req vBbeq // fresh + E3q .req C2q + E4q .req C3q + + +/************************ MACROS ****************************/ + +.macro load_input + ldp Abaq, Abeq, [input_addr, #(2*8*0)] + ldp Abiq, Aboq, [input_addr, #(2*8*2)] + ldp Abuq, Agaq, [input_addr, #(2*8*4)] + ldp Ageq, Agiq, [input_addr, #(2*8*6)] + ldp Agoq, Aguq, [input_addr, #(2*8*8)] + ldp Akaq, Akeq, [input_addr, #(2*8*10)] + ldp Akiq, Akoq, [input_addr, #(2*8*12)] + ldp Akuq, Amaq, [input_addr, #(2*8*14)] + ldp Ameq, Amiq, [input_addr, #(2*8*16)] + ldp Amoq, Amuq, [input_addr, #(2*8*18)] + ldp Asaq, Aseq, [input_addr, #(2*8*20)] + ldp Asiq, Asoq, [input_addr, #(2*8*22)] + ldr Asuq, [input_addr, #(2*8*24)] +.endm + +.macro store_input + str Abaq, [input_addr, #(2*8*0)] + str Abeq, [input_addr, #(2*8*1)] + str Abiq, [input_addr, #(2*8*2)] + str Aboq, [input_addr, #(2*8*3)] + str Abuq, [input_addr, #(2*8*4)] + str Agaq, [input_addr, #(2*8*5)] + str Ageq, [input_addr, #(2*8*6)] + str Agiq, [input_addr, #(2*8*7)] + str Agoq, [input_addr, #(2*8*8)] + str Aguq, [input_addr, #(2*8*9)] + str Akaq, [input_addr, #(2*8*10)] + str Akeq, [input_addr, #(2*8*11)] + str Akiq, [input_addr, #(2*8*12)] + str Akoq, [input_addr, #(2*8*13)] + str Akuq, [input_addr, #(2*8*14)] + str Amaq, [input_addr, #(2*8*15)] + str Ameq, [input_addr, #(2*8*16)] + str Amiq, [input_addr, #(2*8*17)] + str Amoq, [input_addr, #(2*8*18)] + str Amuq, [input_addr, #(2*8*19)] + str Asaq, [input_addr, #(2*8*20)] + str Aseq, [input_addr, #(2*8*21)] + str Asiq, [input_addr, #(2*8*22)] + str Asoq, [input_addr, #(2*8*23)] + str Asuq, [input_addr, #(2*8*24)] +.endm + +#define STACK_SIZE (16*4 + 16*31) +#define STACK_BASE_VREGS 0 +#define STACK_BASE_TMP 16*4 + +#define Aga_offset 0 +#define E0_offset 1 +#define E1_offset 2 +#define E2_offset 3 +#define E3_offset 4 +#define E4_offset 5 +#define Ame_offset 7 +#define Agi_offset 8 +#define 
Aka_offset 9
+#define Abo_offset 10
+#define Amo_offset 11
+#define Ami_offset 12
+#define Ake_offset 13
+#define Agu_offset 14
+#define Asi_offset 15
+#define Aku_offset 16
+#define Asa_offset 17
+#define Abu_offset 18
+#define Asu_offset 19
+#define Ase_offset 20
+/* slots 6 and 21 have no name in this list; Aga_offset is defined as 0 at the top of it */
+#define Age_offset 22
+#define vBgo_offset 23
+#define vBke_offset 24
+#define vBgi_offset 25
+#define vBga_offset 26
+#define vBbo_offset 27
+#define vBmo_offset 28
+#define vBmi_offset 29
+#define vBge_offset 30
+/* alloc_stack / free_stack: reserve and release STACK_SIZE bytes below the incoming sp. */
+.macro alloc_stack
+    sub sp, sp, #(STACK_SIZE)
+.endm
+
+.macro free_stack
+    add sp, sp, #(STACK_SIZE)
+.endm
+/* save(X) / restore(X): spill or reload register Xq at 16-byte slot X_offset of the area starting at sp + STACK_BASE_TMP. */
+#define save(name) \
+    str name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)]
+#define restore(name) \
+    ldr name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)]
+/* save_vregs / restore_vregs: store and reload d8-d15 (low halves of v8-v15, callee-saved under AAPCS64) at sp + STACK_BASE_VREGS. */
+.macro save_vregs
+    stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)]
+    stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)]
+    stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)]
+    stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)]
+.endm
+
+.macro restore_vregs
+    ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)]
+    ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)]
+    ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)]
+    ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)]
+.endm
+
+/* Plain-Neon stand-ins (eor/add/shl/sri/bic) for the v8.4-A SHA-3 operations EOR3, RAX1, XAR and BCAX */
+/* eor3_m1_0: first half of a three-way XOR, d = s0 ^ s1 (s2 is not read here). */
+.macro eor3_m1_0 d s0 s1 s2
+    eor \d\().16b, \s0\().16b, \s1\().16b
+.endm
+/* eor2: two-way XOR, d = s0 ^ s1. */
+.macro eor2 d s0 s1
+    eor \d\().16b, \s0\().16b, \s1\().16b
+.endm
+/* eor3_m1_1: second half of a three-way XOR, d ^= s2 (s0 and s1 are not read here). */
+.macro eor3_m1_1 d s0 s1 s2
+    eor \d\().16b, \d\().16b, \s2\().16b
+.endm
+/* eor3_m1: d = s0 ^ s1 ^ s2 built from the two halves; d must not be the same register as s2. */
+.macro eor3_m1 d s0 s1 s2
+    eor3_m1_0 \d, \s0, \s1, \s2
+    eor3_m1_1 \d, \s0, \s1, \s2
+.endm
+/* rax1_m1: d = s0 ^ rol64(s1, 1) per 64-bit lane; scratch is the register the caller bound to tmp via .req. */
+.macro rax1_m1 d s0 s1
+    // Use add instead of SHL #1
+    add tmp.2d, \s1\().2d, \s1\().2d
+    sri tmp.2d, \s1\().2d, #63              // insert s1 >> 63 into bit 0, keeping the other 63 bits
+    eor \d\().16b, tmp.16b, \s0\().16b
+.endm
+/* xar_m1: d = ror64(s0 ^ s1, imm) per lane. Overwrites s0 with s0 ^ s1, so d must differ from s0. */
+ .macro xar_m1 d s0 s1 imm
+    // Special cases where we can replace SHLs by ADDs
+    .if \imm == 63
+        eor \s0\().16b, \s0\().16b, \s1\().16b
+        add \d\().2d, \s0\().2d, \s0\().2d  // ror 63 == rol 1: one doubling replaces shl #1
+        sri \d\().2d, \s0\().2d, #(63)
+    .elseif 
\imm == 62 + eor \s0\().16b, \s0\().16b, \s1\().16b + add \d\().2d, \s0\().2d, \s0\().2d + add \d\().2d, \d\().2d, \d\().2d + sri \d\().2d, \s0\().2d, #(62) + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + +.macro bcax_m1 d s0 s1 s2 + bic tmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +/* Keccak-f1600 round */ + +.macro keccak_f1600_round_pre + + /* 10 EOR3, so 20 individual EOR */ + + eor3_m1_0 C0, Aba, Aga, Aka + eor3_m1_0 C1, Abe, Age, Ake + eor3_m1_0 C2, Abi, Agi, Aki + eor3_m1_0 C3, Abo, Ago, Ako + eor3_m1_0 C4, Abu, Agu, Aku + eor3_m1_1 C0, Aba, Aga, Aka + eor3_m1_1 C1, Abe, Age, Ake + eor3_m1_1 C2, Abi, Agi, Aki + eor3_m1_1 C3, Abo, Ago, Ako + eor3_m1_1 C4, Abu, Agu, Aku + eor3_m1_0 C0, C0, Ama, Asa + eor3_m1_0 C1, C1, Ame, Ase + eor3_m1_0 C2, C2, Ami, Asi + eor3_m1_0 C3, C3, Amo, Aso + eor3_m1_0 C4, C4, Amu, Asu + eor3_m1_1 C0, C0, Ama, Asa + eor3_m1_1 C1, C1, Ame, Ase + eor3_m1_1 C2, C2, Ami, Asi + eor3_m1_1 C3, C3, Amo, Aso + eor3_m1_1 C4, C4, Amu, Asu + +.endm + +.macro keccak_f1600_round + + /* 10 EOR3, so 20 individual EOR */ + + eor3_m1_0 C0, Aba, Aga, Aka + eor3_m1_0 C1, Abe, Age, Ake + eor3_m1_0 C2, Abi, Agi, Aki + eor3_m1_0 C3, Abo, Ago, Ako + eor3_m1_0 C4, Abu, Agu, Aku + eor3_m1_1 C0, Aba, Aga, Aka + eor3_m1_1 C1, Abe, Age, Ake + eor3_m1_1 C2, Abi, Agi, Aki + eor3_m1_1 C3, Abo, Ago, Ako + eor3_m1_1 C4, Abu, Agu, Aku + eor3_m1_0 C0, C0, Ama, Asa + eor3_m1_0 C1, C1, Ame, Ase + eor3_m1_0 C2, C2, Ami, Asi + eor3_m1_0 C3, C3, Amo, Aso + eor3_m1_0 C4, C4, Amu, Asu + eor3_m1_1 C0, C0, Ama, Asa + eor3_m1_1 C1, C1, Ame, Ase + eor3_m1_1 C2, C2, Ami, Asi + eor3_m1_1 C3, C3, Amo, Aso + eor3_m1_1 C4, C4, Amu, Asu + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + .unreq tmp + + /* 25x XAR, 75 in total */ + + tmp .req C1 + 
tmpq .req C1q + + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m1 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + xar_m1 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m1 vBme, Aga, E0, 28 + xar_m1 vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Aga, vBga, vBgi, vBge + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + bcax_m1 Aku, vBku, vBke, vBka + bcax_m1 Ama, vBma, vBmi, vBme + bcax_m1 Ame, vBme, vBmo, vBmi + bcax_m1 Ami, vBmi, vBmu, vBmo + bcax_m1 Amo, vBmo, vBma, vBmu + bcax_m1 Amu, vBmu, vBme, vBma + bcax_m1 Asa, vBsa, vBsi, vBse + bcax_m1 Ase, vBse, vBso, vBsi + bcax_m1 Asi, vBsi, vBsu, vBso + bcax_m1 Aso, vBso, vBsa, vBsu + bcax_m1 Asu, vBsu, vBse, vBsa + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + bcax_m1 Abu, vBbu, vBbe, vBba + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + + .unreq tmp + .unreq tmpq + +.endm + +.macro keccak_f1600_round_core + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + + /* 25x XAR, 75 in total */ + + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 
+ xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m1 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + xar_m1 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m1 vBme, Aga, E0, 28 + xar_m1 vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + .unreq tmp + tmp .req C1 + bcax_m1 Aga, vBga, vBgi, vBge + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + .unreq tmp + + eor2 C0, Aka, Aga + save(Aga) + + tmp .req Aga + tmpq .req Agaq + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + eor2 C1, Ake, Age + bcax_m1 Aku, vBku, vBke, vBka + eor2 C2, Aki, Agi + bcax_m1 Ama, vBma, vBmi, vBme + eor2 C3, Ako, Ago + bcax_m1 Ame, vBme, vBmo, vBmi + eor2 C4, Aku, Agu + bcax_m1 Ami, vBmi, vBmu, vBmo + eor2 C0, C0, Ama + bcax_m1 Amo, vBmo, vBma, vBmu + eor2 C1, C1, Ame + bcax_m1 Amu, vBmu, vBme, vBma + eor2 C2, C2, Ami + bcax_m1 Asa, vBsa, vBsi, vBse + eor2 C3, C3, Amo + bcax_m1 Ase, vBse, vBso, vBsi + eor2 C4, C4, Amu + bcax_m1 Asi, vBsi, vBsu, vBso + eor2 C0, C0, Asa + bcax_m1 Aso, vBso, vBsa, vBsu + eor2 C1, C1, Ase + bcax_m1 Asu, vBsu, vBse, vBsa + eor2 C2, C2, Asi + eor2 C3, C3, Aso + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + eor2 C1, C1, Abe + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + eor2 C4, C4, Asu + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + eor2 C3, C3, Abo + eor2 C2, C2, Abi + eor2 C0, C0, Aba + bcax_m1 Abu, vBbu, vBbe, vBba + 
eor2 C4, C4, Abu
+
+    restore(Aga)                            // Aga doubled as bcax_m1 scratch since save(Aga); reload its real value
+    .unreq tmp
+    .unreq tmpq
+
+.endm
+/* round_post: final round. Expects C0..C4 (column parities) valid on entry and computes none for a following round. */
+.macro keccak_f1600_round_post
+
+    /* 5x RAX1, 15 Neon Instructions total */
+
+    tmp .req vBba                           // scratch for rax1_m1
+    rax1_m1 E2, C1, C3                      // theta: E[x] = C[x-1] ^ rol(C[x+1], 1); E2 lives in vBbe
+    rax1_m1 E4, C3, C0                      // E4, E1, E3, E0 overwrite C3, C0, C2, C4 (register aliases)
+    rax1_m1 E1, C0, C2
+    rax1_m1 E3, C2, C4
+    rax1_m1 E0, C4, C1
+    .unreq tmp
+
+    /* 25x XAR, 75 in total */
+
+    tmp .req C1                             // C1 is dead after the last rax1_m1; reuse it as bcax_m1 scratch
+    eor vBba.16b, Aba.16b, E0.16b           // lane ba: theta only, no rotation
+    xar_m1 vBsa, Abi, E2, 2                 // rho+pi: B = ror64(A ^ E, imm); the A operand is overwritten
+    xar_m1 vBbi, Aki, E2, 21
+    xar_m1 vBki, Ako, E3, 39
+    xar_m1 vBko, Amu, E4, 56
+    xar_m1 vBmu, Aso, E3, 8
+    xar_m1 vBso, Ama, E0, 23
+    xar_m1 vBka, Abe, E1, 63
+    xar_m1 vBse, Ago, E3, 9
+    xar_m1 vBgo, Ame, E1, 19
+    xar_m1 vBke, Agi, E2, 58
+    xar_m1 vBgi, Aka, E0, 61
+    xar_m1 vBga, Abo, E3, 36
+    xar_m1 vBbo, Amo, E3, 43
+    xar_m1 vBmo, Ami, E2, 49
+    xar_m1 vBmi, Ake, E1, 54
+    xar_m1 vBge, Agu, E4, 44
+    xar_m1 vBgu, Asi, E2, 3
+    xar_m1 vBsi, Aku, E4, 25
+    xar_m1 vBku, Asa, E0, 46
+    xar_m1 vBma, Abu, E4, 37
+    xar_m1 vBbu, Asu, E4, 50
+    xar_m1 vBsu, Ase, E1, 62
+    xar_m1 vBme, Aga, E0, 28
+    xar_m1 vBbe, Age, E1, 20
+
+    /* 25x BCAX, 50 in total */
+
+    bcax_m1 Aga, vBga, vBgi, vBge           // chi: Aga = Bga ^ (Bgi & ~Bge), and likewise along each row
+    bcax_m1 Age, vBge, vBgo, vBgi
+    bcax_m1 Agi, vBgi, vBgu, vBgo
+    bcax_m1 Ago, vBgo, vBga, vBgu
+    bcax_m1 Agu, vBgu, vBge, vBga
+    bcax_m1 Aka, vBka, vBki, vBke
+    bcax_m1 Ake, vBke, vBko, vBki
+    bcax_m1 Aki, vBki, vBku, vBko
+    bcax_m1 Ako, vBko, vBka, vBku
+    bcax_m1 Aku, vBku, vBke, vBka
+    bcax_m1 Ama, vBma, vBmi, vBme
+    bcax_m1 Ame, vBme, vBmo, vBmi
+    bcax_m1 Ami, vBmi, vBmu, vBmo
+    bcax_m1 Amo, vBmo, vBma, vBmu
+    bcax_m1 Amu, vBmu, vBme, vBma
+    bcax_m1 Asa, vBsa, vBsi, vBse
+    bcax_m1 Ase, vBse, vBso, vBsi
+    bcax_m1 Asi, vBsi, vBsu, vBso
+    bcax_m1 Aso, vBso, vBsa, vBsu
+    bcax_m1 Asu, vBsu, vBse, vBsa
+    bcax_m1 Aba, vBba, vBbi, vBbe
+    bcax_m1 Abe, vBbe, vBbo, vBbi
+    bcax_m1 Abi, vBbi, vBbu, vBbo
+    bcax_m1 Abo, vBbo, vBba, vBbu
+    bcax_m1 Abu, vBbu, vBbe, vBba
+
+    // iota step
+    ld1r {tmp.2d}, [const_addr], #8         // broadcasts one 64-bit constant to both lanes; steps by 8 although round_core steps by 16
+    eor Aba.16b, Aba.16b, tmp.16b           // the entry point does not read const_addr again after this round
+
+    .unreq tmp
+
+.endm
+
+
+.text
+.align 4
+.global 
keccak_f1600_x2_v84a_asm_v2pp1
+.global _keccak_f1600_x2_v84a_asm_v2pp1
+/* Entry point: runs KECCAK_F1600_ROUNDS rounds of Keccak-f1600 in place on the 25 x 16-byte state block at input_addr. */
+#define KECCAK_F1600_ROUNDS 24
+/* round_pre only seeds the column parities; rounds run = 2*count (loop) + 1 (peeled round_core) + 1 (round_post). */
+keccak_f1600_x2_v84a_asm_v2pp1:
+_keccak_f1600_x2_v84a_asm_v2pp1:
+    alloc_stack
+    save_vregs
+    load_constant_ptr                       // defined outside this chunk; presumably points const_addr at round_constants (TODO confirm)
+    load_input
+
+    // Loop passes are derived from the round count instead of a hard-coded 11; KECCAK_F1600_ROUNDS must be even.
+    mov count, #((KECCAK_F1600_ROUNDS-2)/2)
+    keccak_f1600_round_pre                  // column parities for the first round
+loop:
+    keccak_f1600_round_core                 // each core finishes one round and computes the parities for the next
+    keccak_f1600_round_core
+    sub count, count, #1
+    cbnz count, loop
+
+    keccak_f1600_round_core                 // peeled second-to-last round
+    keccak_f1600_round_post                 // last round, computes no parities for a successor
+    store_input
+    restore_vregs
+    free_stack
+    ret
diff --git a/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2pp2.s b/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2pp2.s
new file mode 100644
index 0000000..8b76c2b
--- /dev/null
+++ b/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2pp2.s
@@ -0,0 +1,798 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited
+ * Copyright (c) 2022 Matthias Kannwischer
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .align(8) +_round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 
0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Kecck-f1600 state to vector registers + * at the beginning and end of each round. */ + Aba .req v0 + Abe .req v1 + Abi .req v2 + Abo .req v3 + Abu .req v4 + Aga .req v5 + Age .req v6 + Agi .req v7 + Ago .req v8 + Agu .req v9 + Aka .req v10 + Ake .req v11 + Aki .req v12 + Ako .req v13 + Aku .req v14 + Ama .req v15 + Ame .req v16 + Ami .req v17 + Amo .req v18 + Amu .req v19 + Asa .req v20 + Ase .req v21 + Asi .req v22 + Aso .req v23 + Asu .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Abiq .req q2 + Aboq .req q3 + Abuq .req q4 + Agaq .req q5 + Ageq .req q6 + Agiq .req q7 + Agoq .req q8 + Aguq .req q9 + Akaq .req q10 + Akeq .req q11 + Akiq .req q12 + Akoq .req q13 + Akuq .req q14 + Amaq .req q15 + Ameq .req q16 + Amiq .req q17 + Amoq .req q18 + Amuq .req q19 + Asaq .req q20 + Aseq .req q21 + Asiq .req q22 + Asoq .req q23 + Asuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v27 + C1 .req v28 + C2 .req v29 + C3 .req v30 + C4 .req v31 + + C0q .req q27 + C1q .req q28 + C2q .req q29 + C3q .req q30 + C4q .req q31 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vBba .req v25 // fresh + vBbe .req v26 // fresh + vBbi .req Abi + vBbo .req Abo + vBbu .req Abu + vBga .req Aka + vBge .req Ake + vBgi .req Agi + vBgo .req Ago + vBgu .req Agu + vBka .req Ama + vBke .req Ame + vBki .req Aki + vBko .req 
Ako + vBku .req Aku + vBma .req Asa + vBme .req Ase + vBmi .req Ami + vBmo .req Amo + vBmu .req Amu + vBsa .req Aba + vBse .req Abe + vBsi .req Asi + vBso .req Aso + vBsu .req Asu + + vBbaq .req q25 // fresh + vBbeq .req q26 // fresh + vBbiq .req Abiq + vBboq .req Aboq + vBbuq .req Abuq + vBgaq .req Akaq + vBgeq .req Akeq + vBgiq .req Agiq + vBgoq .req Agoq + vBguq .req Aguq + vBkaq .req Amaq + vBkeq .req Ameq + vBkiq .req Akiq + vBkoq .req Akoq + vBkuq .req Akuq + vBmaq .req Asaq + vBmeq .req Aseq + vBmiq .req Amiq + vBmoq .req Amoq + vBmuq .req Amuq + vBsaq .req Abaq + vBseq .req Abeq + vBsiq .req Asiq + vBsoq .req Asoq + vBsuq .req Asuq + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req C4 + E1 .req C0 + E2 .req vBbe // fresh + E3 .req C2 + E4 .req C3 + + E0q .req C4q + E1q .req C0q + E2q .req vBbeq // fresh + E3q .req C2q + E4q .req C3q + + +/************************ MACROS ****************************/ + +.macro load_input + ldp Abaq, Abeq, [input_addr, #(2*8*0)] + ldp Abiq, Aboq, [input_addr, #(2*8*2)] + ldp Abuq, Agaq, [input_addr, #(2*8*4)] + ldp Ageq, Agiq, [input_addr, #(2*8*6)] + ldp Agoq, Aguq, [input_addr, #(2*8*8)] + ldp Akaq, Akeq, [input_addr, #(2*8*10)] + ldp Akiq, Akoq, [input_addr, #(2*8*12)] + ldp Akuq, Amaq, [input_addr, #(2*8*14)] + ldp Ameq, Amiq, [input_addr, #(2*8*16)] + ldp Amoq, Amuq, [input_addr, #(2*8*18)] + ldp Asaq, Aseq, [input_addr, #(2*8*20)] + ldp Asiq, Asoq, [input_addr, #(2*8*22)] + ldr Asuq, [input_addr, #(2*8*24)] +.endm + +.macro store_input + str Abaq, [input_addr, #(2*8*0)] + str Abeq, [input_addr, #(2*8*1)] + str Abiq, [input_addr, #(2*8*2)] + str Aboq, [input_addr, #(2*8*3)] + str Abuq, [input_addr, #(2*8*4)] + str Agaq, [input_addr, #(2*8*5)] + str Ageq, [input_addr, #(2*8*6)] + str Agiq, [input_addr, #(2*8*7)] + str Agoq, [input_addr, #(2*8*8)] + str Aguq, [input_addr, #(2*8*9)] + str Akaq, [input_addr, #(2*8*10)] + str Akeq, [input_addr, #(2*8*11)] + str Akiq, [input_addr, #(2*8*12)] + str Akoq, 
[input_addr, #(2*8*13)] + str Akuq, [input_addr, #(2*8*14)] + str Amaq, [input_addr, #(2*8*15)] + str Ameq, [input_addr, #(2*8*16)] + str Amiq, [input_addr, #(2*8*17)] + str Amoq, [input_addr, #(2*8*18)] + str Amuq, [input_addr, #(2*8*19)] + str Asaq, [input_addr, #(2*8*20)] + str Aseq, [input_addr, #(2*8*21)] + str Asiq, [input_addr, #(2*8*22)] + str Asoq, [input_addr, #(2*8*23)] + str Asuq, [input_addr, #(2*8*24)] +.endm + +#define STACK_SIZE (16*4 + 16*34) +#define STACK_BASE_VREGS 0 +#define STACK_BASE_TMP 16*4 + +#define Aga_offset 0 +#define E0_offset 1 +#define E1_offset 2 +#define E2_offset 3 +#define E3_offset 4 +#define E4_offset 5 +#define Ame_offset 7 +#define Agi_offset 8 +#define Aka_offset 9 +#define Abo_offset 10 +#define Amo_offset 11 +#define Ami_offset 12 +#define Ake_offset 13 +#define Agu_offset 14 +#define Asi_offset 15 +#define Aku_offset 16 +#define Asa_offset 17 +#define Abu_offset 18 +#define Asu_offset 19 +#define Ase_offset 20 +//#define Aga_offset 21 +#define Age_offset 22 +#define vBgo_offset 23 +#define vBke_offset 24 +#define vBgi_offset 25 +#define vBga_offset 26 +#define vBbo_offset 27 +#define vBmo_offset 28 +#define vBmi_offset 29 +#define vBge_offset 30 + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +/* Macros using v8.4-A SHA-3 instructions */ + 
+/* NOTE: these helpers emulate EOR3, RAX1, XAR and BCAX with base Neon
+ * instructions; `tmp` must be bound via .req before rax1_m1 / bcax_m1. */
+
+/* EOR3 emulation, first half: \d = \s0 ^ \s1 (\s2 is folded in by eor3_m1_1). */
+.macro eor3_m1_0 d s0 s1 s2
+    eor \d\().16b, \s0\().16b, \s1\().16b
+.endm
+
+/* Plain two-input XOR: \d = \s0 ^ \s1. */
+.macro eor2 d s0 s1
+    eor \d\().16b, \s0\().16b, \s1\().16b
+.endm
+
+/* EOR3 emulation, second half: \d ^= \s2 (\s0 and \s1 are ignored). */
+.macro eor3_m1_1 d s0 s1 s2
+    eor \d\().16b, \d\().16b, \s2\().16b
+.endm
+
+/* EOR3 emulation: \d = \s0 ^ \s1 ^ \s2. */
+.macro eor3_m1 d s0 s1 s2
+    eor3_m1_0 \d, \s0, \s1, \s2
+    eor3_m1_1 \d, \s0, \s1, \s2
+.endm
+
+/* RAX1 emulation: \d = \s0 ^ rol64(\s1, 1). Clobbers the `tmp` alias. */
+.macro rax1_m1 d s0 s1
+    // Use add instead of SHL #1
+    add tmp.2d, \s1\().2d, \s1\().2d
+    sri tmp.2d, \s1\().2d, #63
+    eor \d\().16b, tmp.16b, \s0\().16b
+.endm
+
+/* XAR emulation: \d = ror64(\s0 ^ \s1, \imm).
+ * \s0 is overwritten with \s0 ^ \s1, and \d must not alias \s0. */
+.macro xar_m1 d s0 s1 imm
+    eor \s0\().16b, \s0\().16b, \s1\().16b
+    // Special cases where we can replace SHLs by ADDs
+    .if \imm == 63
+    add \d\().2d, \s0\().2d, \s0\().2d
+    sri \d\().2d, \s0\().2d, #(63)
+    .elseif \imm == 62
+    add \d\().2d, \s0\().2d, \s0\().2d
+    add \d\().2d, \d\().2d, \d\().2d
+    sri \d\().2d, \s0\().2d, #(62)
+    .else
+    shl \d\().2d, \s0\().2d, #(64-\imm)
+    sri \d\().2d, \s0\().2d, #(\imm)
+    .endif
+.endm
+
+/* First half of xar_m1: \s0 ^= \s1. The rotation amount does not affect
+ * this step; \d and \imm are accepted only so call sites mirror xar_m1. */
+.macro xar_m1_0 d s0 s1 imm
+    eor \s0\().16b, \s0\().16b, \s1\().16b
+.endm
+
+/* Second half of xar_m1: \d = ror64(\s0, \imm), with \s0 already updated
+ * by xar_m1_0. \d must not alias \s0; \s1 is ignored. */
+.macro xar_m1_1 d s0 s1 imm
+    // Special cases where we can replace SHLs by ADDs
+    .if \imm == 63
+    add \d\().2d, \s0\().2d, \s0\().2d
+    sri \d\().2d, \s0\().2d, #(63)
+    .elseif \imm == 62
+    add \d\().2d, \s0\().2d, \s0\().2d
+    add \d\().2d, \d\().2d, \d\().2d
+    sri \d\().2d, \s0\().2d, #(62)
+    .else
+    shl \d\().2d, \s0\().2d, #(64-\imm)
+    sri \d\().2d, \s0\().2d, #(\imm)
+    .endif
+.endm
+
+/* BCAX emulation: \d = \s0 ^ (\s1 AND NOT \s2). Clobbers the `tmp` alias. */
+.macro bcax_m1 d s0 s1 s2
+    bic tmp.16b, \s1\().16b, \s2\().16b
+    eor \d\().16b, tmp.16b, \s0\().16b
+.endm
+
+/* Keccak-f1600 round macros (A*, C*, E* and vB* are .req register aliases defined earlier in this file). */
+/* keccak_f1600_round_pre: C0..C4 = XOR of the five state registers of each column (parities for the first round). */
+.macro keccak_f1600_round_pre
+
+    /* 10 EOR3, so 20 individual EOR */
+
+    eor3_m1_0 C1, Abe, Age, Ake
+    eor3_m1_0 C3, Abo, Ago, Ako
+    eor3_m1_0 C0, Aba, Aga, Aka
+    eor3_m1_0 C2, Abi, Agi, Aki
+    eor3_m1_0 C4, Abu, Agu, Aku
+    eor3_m1_1 C1, Abe, Age, Ake
+    eor3_m1_1 C3, Abo, Ago, Ako
+    eor3_m1_1 C0, Aba, Aga, Aka
+    eor3_m1_1 C2, Abi, Agi, Aki
+    eor3_m1_1 C4, Abu, Agu, Aku
+    eor3_m1_0 C1, C1, Ame, Ase
+    eor3_m1_0 C3, C3, Amo, Aso
+    eor3_m1_0 C0, C0, Ama, Asa
+    eor3_m1_0 C2, C2, Ami, Asi
+    eor3_m1_0 C4, C4, Amu, Asu
+    eor3_m1_1 C1, C1, Ame, Ase
+    eor3_m1_1 C3, C3, Amo, Aso
+    eor3_m1_1 C0, C0, Ama, Asa
+    eor3_m1_1 C2, C2, Ami, Asi
+    eor3_m1_1 C4, C4, Amu, Asu
+
+.endm
+/* keccak_f1600_round: one stand-alone round (parities, RAX1, XAR, BCAX, iota); the entry point below does not expand it. */
+.macro keccak_f1600_round
+
+    /* 10 EOR3, so 20 individual EOR */
+
+    eor3_m1_0 C0, Aba, Aga, Aka
+    eor3_m1_0 C1, Abe, Age, Ake
+    eor3_m1_0 C2, Abi, Agi, Aki
+    eor3_m1_0 C3, Abo, Ago, Ako
+    eor3_m1_0 C4, Abu, Agu, Aku
+    eor3_m1_1 C0, Aba, Aga, Aka
+    eor3_m1_1 C1, Abe, Age, Ake
+    eor3_m1_1 C2, Abi, Agi, Aki
+    eor3_m1_1 C3, Abo, Ago, Ako
+    eor3_m1_1 C4, Abu, Agu, Aku
+    eor3_m1_0 C0, C0, Ama, Asa
+    eor3_m1_0 C1, C1, Ame, Ase
+    eor3_m1_0 C2, C2, Ami, Asi
+    eor3_m1_0 C3, C3, Amo, Aso
+    eor3_m1_0 C4, C4, Amu, Asu
+    eor3_m1_1 C0, C0, Ama, Asa
+    eor3_m1_1 C1, C1, Ame, Ase
+    eor3_m1_1 C2, C2, Ami, Asi
+    eor3_m1_1 C3, C3, Amo, Aso
+    eor3_m1_1 C4, C4, Amu, Asu
+
+    /* 5x RAX1, 15 Neon Instructions total */
+
+    tmp .req vBba
+    rax1_m1 E2, C1, C3
+    rax1_m1 E4, C3, C0
+    rax1_m1 E1, C0, C2
+    rax1_m1 E3, C2, C4
+    rax1_m1 E0, C4, C1
+    .unreq tmp
+
+    /* 25x XAR, 75 in total */
+
+    tmp .req C1
+    tmpq .req C1q
+
+    eor vBba.16b, Aba.16b, E0.16b
+    xar_m1 vBsa, Abi, E2, 2
+    xar_m1 vBbi, Aki, E2, 21
+    xar_m1 vBki, Ako, E3, 39
+    xar_m1 vBko, Amu, E4, 56
+    xar_m1 vBmu, Aso, E3, 8
+    xar_m1 vBso, Ama, E0, 23
+    xar_m1 vBka, Abe, E1, 63
+    xar_m1 vBse, Ago, E3, 9
+    xar_m1 vBgo, Ame, E1, 19
+    xar_m1 vBke, Agi, E2, 58
+    xar_m1 vBgi, Aka, E0, 61
+    xar_m1 vBga, Abo, E3, 36
+    xar_m1 vBbo, Amo, E3, 43
+    xar_m1 vBmo, Ami, E2, 49
+    xar_m1 vBmi, Ake, E1, 54
+    xar_m1 vBge, Agu, E4, 44
+    xar_m1 vBgu, Asi, E2, 3
+    xar_m1 vBsi, Aku, E4, 25
+    xar_m1 vBku, Asa, E0, 46
+    xar_m1 vBma, Abu, E4, 37
+    xar_m1 vBbu, Asu, E4, 50
+    xar_m1 vBsu, Ase, E1, 62
+    xar_m1 vBme, Aga, E0, 28
+    xar_m1 vBbe, Age, E1, 20
+
+    /* 25x BCAX, 50 in total */
+
+    bcax_m1 Aga, vBga, vBgi, vBge
+    bcax_m1 Age, vBge, vBgo, vBgi
+    bcax_m1 Agi, vBgi, vBgu, vBgo
+    bcax_m1 Ago, vBgo, vBga, vBgu
+    bcax_m1 Agu, vBgu, vBge, vBga
+    bcax_m1 Aka, vBka, vBki, vBke
+    bcax_m1 Ake, vBke, vBko, vBki
+    bcax_m1 Aki, vBki, vBku, vBko
+    bcax_m1 Ako, vBko, vBka, vBku
+    bcax_m1 Aku, vBku, vBke, vBka
+    bcax_m1 Ama, vBma, vBmi, vBme
+    bcax_m1 Ame, vBme, vBmo, vBmi
+    bcax_m1 Ami, vBmi, vBmu, vBmo
+    bcax_m1 Amo, vBmo, vBma, vBmu
+    bcax_m1 Amu, vBmu, vBme, vBma
+    bcax_m1 Asa, vBsa, vBsi, vBse
+    bcax_m1 Ase, vBse, vBso, vBsi
+    bcax_m1 Asi, vBsi, vBsu, vBso
+    bcax_m1 Aso, vBso, vBsa, vBsu
+    bcax_m1 Asu, vBsu, vBse, vBsa
+    bcax_m1 Aba, vBba, vBbi, vBbe
+    bcax_m1 Abe, vBbe, vBbo, vBbi
+    bcax_m1 Abi, vBbi, vBbu, vBbo
+    bcax_m1 Abo, vBbo, vBba, vBbu
+    bcax_m1 Abu, vBbu, vBbe, vBba
+
+    // iota step
+    // 128-bit load + 16-byte post-increment; presumably const_addr walks the duplicated round_constants table (set by load_constant_ptr, defined elsewhere)
+    ldr tmpq, [const_addr], #16
+    eor Aba.16b, Aba.16b, tmp.16b
+
+    .unreq tmp
+    .unreq tmpq
+
+.endm
+/* keccak_f1600_round_core: one round using the C0..C4 already computed; its BCAX/iota phase also accumulates C0..C4 for the next round. */
+.macro keccak_f1600_round_core
+
+    /* 5x RAX1, 15 Neon Instructions total */
+
+    tmp .req vBba
+    rax1_m1 E2, C1, C3
+    str Agaq, [sp, #(STACK_BASE_TMP + 16 * 30)] // spill Aga: the early bcax below overwrites it before its last use
+    rax1_m1 E4, C3, C0
+    rax1_m1 E1, C0, C2
+    rax1_m1 E3, C2, C4
+    rax1_m1 E0, C4, C1
+
+    /* 25x XAR, 75 in total */
+
+    .unreq tmp
+    tmp .req C1
+    tmpq .req C1q
+
+    eor vBba.16b, Aba.16b, E0.16b
+    xar_m1 vBsa, Abi, E2, 2
+    xar_m1 vBbi, Aki, E2, 21
+    xar_m1 vBki, Ako, E3, 39
+    xar_m1 vBko, Amu, E4, 56
+    xar_m1 vBmu, Aso, E3, 8
+    xar_m1 vBso, Ama, E0, 23
+    xar_m1 vBka, Abe, E1, 63
+    xar_m1 vBse, Ago, E3, 9
+    xar_m1 vBgo, Ame, E1, 19
+    xar_m1 vBke, Agi, E2, 58
+    xar_m1 vBgi, Aka, E0, 61
+    xar_m1 vBga, Abo, E3, 36
+    xar_m1 vBbo, Amo, E3, 43
+    xar_m1 vBmo, Ami, E2, 49
+    xar_m1 vBmi, Ake, E1, 54
+    xar_m1 vBge, Agu, E4, 44
+    bcax_m1 Aga, vBga, vBgi, vBge
+    xar_m1 vBgu, Asi, E2, 3
+    xar_m1 vBsi, Aku, E4, 25
+    xar_m1 vBku, Asa, E0, 46
+    xar_m1 vBma, Abu, E4, 37
+    xar_m1 vBbu, Asu, E4, 50
+    xar_m1 vBsu, Ase, E1, 62
+    ldr tmpq, [sp, #(STACK_BASE_TMP + 16*30)] // reload the spilled (pre-bcax) Aga value into the scratch register
+    xar_m1 vBme, tmp, E0, 28
+    xar_m1 vBbe, Age, E1, 20
+
+    /* 25x BCAX, 50 in total */
+
+    bcax_m1 Age, vBge, vBgo, vBgi
+    bcax_m1 Agi, vBgi, vBgu, vBgo
+    bcax_m1 Ago, vBgo, vBga, vBgu
+    bcax_m1 Agu, vBgu, vBge, vBga
+    bcax_m1 Aka, vBka, vBki, vBke
+    bcax_m1 Ake, vBke, vBko, vBki
+    /* C1 starts accumulating the next parity below, so it can no longer serve as scratch. */
+    .unreq tmp
+    .unreq tmpq
+
+    eor2 C0, Aka, Aga
+    save(Aga)
+    /* Aga was saved to the stack just above; its register is the scratch until restore(Aga). */
+    tmp .req Aga
+    tmpq .req Agaq
+    bcax_m1 Aki, vBki, vBku, vBko
+    bcax_m1 Ako, vBko, vBka, vBku
+    eor2 C1, Ake, Age
+    bcax_m1 Aku, vBku, vBke, vBka
+    eor2 C2, Aki, Agi
+    bcax_m1 Ama, vBma, vBmi, vBme
+    eor2 C3, Ako, Ago
+    bcax_m1 Ame, vBme, vBmo, vBmi
+    eor2 C4, Aku, Agu
+    bcax_m1 Ami, vBmi, vBmu, vBmo
+    eor2 C0, C0, Ama
+    bcax_m1 Amo, vBmo, vBma, vBmu
+    eor2 C1, C1, Ame
+    bcax_m1 Amu, vBmu, vBme, vBma
+    eor2 C2, C2, Ami
+    bcax_m1 Asa, vBsa, vBsi, vBse
+    eor2 C3, C3, Amo
+    bcax_m1 Ase, vBse, vBso, vBsi
+    eor2 C4, C4, Amu
+    bcax_m1 Asi, vBsi, vBsu, vBso
+    eor2 C0, C0, Asa
+    bcax_m1 Aso, vBso, vBsa, vBsu
+    eor2 C1, C1, Ase
+    bcax_m1 Asu, vBsu, vBse, vBsa
+    eor2 C2, C2, Asi
+    eor2 C3, C3, Aso
+    bcax_m1 Aba, vBba, vBbi, vBbe
+    bcax_m1 Abe, vBbe, vBbo, vBbi
+    eor2 C1, C1, Abe
+
+    // iota step
+    // 128-bit load + 16-byte post-increment; presumably const_addr walks the duplicated round_constants table (set by load_constant_ptr, defined elsewhere)
+    ldr tmpq, [const_addr], #16
+    eor Aba.16b, Aba.16b, tmp.16b
+    eor2 C4, C4, Asu
+    bcax_m1 Abi, vBbi, vBbu, vBbo
+    bcax_m1 Abo, vBbo, vBba, vBbu
+    eor2 C3, C3, Abo
+    eor2 C2, C2, Abi
+    eor2 C0, C0, Aba
+    bcax_m1 Abu, vBbu, vBbe, vBba
+    eor2 C4, C4, Abu
+
+    restore(Aga)
+    .unreq tmp
+    .unreq tmpq
+
+.endm
+/* keccak_f1600_round_post: last round; like keccak_f1600_round_core but without accumulating parities for a further round. */
+.macro keccak_f1600_round_post
+
+    /* 5x RAX1, 15 Neon Instructions total */
+
+    tmp .req vBba
+    rax1_m1 E2, C1, C3
+    str Agaq, [sp, #(STACK_BASE_TMP + 16 * 30)] // spill Aga: the early bcax below overwrites it before its last use
+    rax1_m1 E4, C3, C0
+    rax1_m1 E1, C0, C2
+    rax1_m1 E3, C2, C4
+    rax1_m1 E0, C4, C1
+
+    /* 25x XAR, 75 in total */
+
+    .unreq tmp
+    tmp .req C1
+    tmpq .req C1q
+
+    eor vBba.16b, Aba.16b, E0.16b
+    xar_m1 vBsa, Abi, E2, 2
+    xar_m1 vBbi, Aki, E2, 21
+    xar_m1 vBki, Ako, E3, 39
+    xar_m1 vBko, Amu, E4, 56
+    xar_m1 vBmu, Aso, E3, 8
+    xar_m1 vBso, Ama, E0, 23
+    xar_m1 vBka, Abe, E1, 63
+    xar_m1 vBse, Ago, E3, 9
+    xar_m1 vBgo, Ame, E1, 19
+    xar_m1 vBke, Agi, E2, 58
+    xar_m1 vBgi, Aka, E0, 61
+    xar_m1 vBga, Abo, E3, 36
+    xar_m1 vBbo, Amo, E3, 43
+    xar_m1 vBmo, Ami, E2, 49
+    xar_m1 vBmi, Ake, E1, 54
+    xar_m1 vBge, Agu, E4, 44
+    bcax_m1 Aga, vBga, vBgi, vBge
+    xar_m1 vBgu, Asi, E2, 3
+    xar_m1 vBsi, Aku, E4, 25
+    xar_m1 vBku, Asa, E0, 46
+    xar_m1 vBma, Abu, E4, 37
+    xar_m1 vBbu, Asu, E4, 50
+    xar_m1 vBsu, Ase, E1, 62
+    ldr tmpq, [sp, #(STACK_BASE_TMP + 16*30)] // reload the spilled (pre-bcax) Aga value into the scratch register
+    xar_m1 vBme, tmp, E0, 28
+    xar_m1 vBbe, Age, E1, 20
+
+    /* 25x BCAX, 50 in total */
+
+    bcax_m1 Age, vBge, vBgo, vBgi
+    bcax_m1 Agi, vBgi, vBgu, vBgo
+    bcax_m1 Ago, vBgo, vBga, vBgu
+    bcax_m1 Agu, vBgu, vBge, vBga
+    bcax_m1 Aka, vBka, vBki, vBke
+    bcax_m1 Ake, vBke, vBko, vBki
+    bcax_m1 Aki, vBki, vBku, vBko
+    bcax_m1 Ako, vBko, vBka, vBku
+    bcax_m1 Aku, vBku, vBke, vBka
+    bcax_m1 Ama, vBma, vBmi, vBme
+    bcax_m1 Ame, vBme, vBmo, vBmi
+    bcax_m1 Ami, vBmi, vBmu, vBmo
+    bcax_m1 Amo, vBmo, vBma, vBmu
+    bcax_m1 Amu, vBmu, vBme, vBma
+    bcax_m1 Asa, vBsa, vBsi, vBse
+    bcax_m1 Ase, vBse, vBso, vBsi
+    bcax_m1 Asi, vBsi, vBsu, vBso
+    bcax_m1 Aso, vBso, vBsa, vBsu
+    bcax_m1 Asu, vBsu, vBse, vBsa
+    bcax_m1 Aba, vBba, vBbi, vBbe
+    bcax_m1 Abe, vBbe, vBbo, vBbi
+    bcax_m1 Abi, vBbi, vBbu, vBbo
+    bcax_m1 Abo, vBbo, vBba, vBbu
+    bcax_m1 Abu, vBbu, vBbe, vBba
+
+    // iota step
+    // 128-bit load + 16-byte post-increment; presumably const_addr walks the duplicated round_constants table (set by load_constant_ptr, defined elsewhere)
+    ldr tmpq, [const_addr], #16
+    eor Aba.16b, Aba.16b, tmp.16b
+
+    .unreq tmp
+    .unreq tmpq
+.endm
+
+/* Entry point: loads the 25 16-byte state entries at input_addr (x0), applies KECCAK_F1600_ROUNDS rounds, and stores them back. */
+.text
+.align 4
+.global keccak_f1600_x2_v84a_asm_v2pp2
+.global _keccak_f1600_x2_v84a_asm_v2pp2
+
+#define KECCAK_F1600_ROUNDS 24
+
+keccak_f1600_x2_v84a_asm_v2pp2:
+_keccak_f1600_x2_v84a_asm_v2pp2:
+    alloc_stack
+    save_vregs
+    load_constant_ptr
+    load_input
+
+    // Round schedule: 11 loop iterations x 2 core rounds + 1 core round + 1 post round = 24 (KECCAK_F1600_ROUNDS)
+    mov count, #11
+    keccak_f1600_round_pre
+loop:
+    keccak_f1600_round_core
+    keccak_f1600_round_core
+    sub count, count, #1
+    cbnz count, loop
+
+    keccak_f1600_round_core
+    keccak_f1600_round_post
+    store_input
+    restore_vregs
+    free_stack
+    ret
diff --git a/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2pp3.s b/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2pp3.s
new file mode 100644
index 0000000..ff8359e
--- /dev/null
+++ b/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2pp3.s
@@ -0,0 +1,905 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited
+ * Copyright (c) 2022 Matthias Kannwischer
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#if defined(__ARM_FEATURE_SVE2) +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .align(8) +_round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x800000008000000a + .quad 
0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Kecck-f1600 state to vector registers + * at the beginning and end of each round. */ + Aba .req v0 + Abe .req v1 + Abi .req v2 + Abo .req v3 + Abu .req v4 + Aga .req v5 + Age .req v6 + Agi .req v7 + Ago .req v8 + Agu .req v9 + Aka .req v10 + Ake .req v11 + Aki .req v12 + Ako .req v13 + Aku .req v14 + Ama .req v15 + Ame .req v16 + Ami .req v17 + Amo .req v18 + Amu .req v19 + Asa .req v20 + Ase .req v21 + Asi .req v22 + Aso .req v23 + Asu .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Abiq .req q2 + Aboq .req q3 + Abuq .req q4 + Agaq .req q5 + Ageq .req q6 + Agiq .req q7 + Agoq .req q8 + Aguq .req q9 + Akaq .req q10 + Akeq .req q11 + Akiq .req q12 + Akoq .req q13 + Akuq .req q14 + Amaq .req q15 + Ameq .req q16 + Amiq .req q17 + Amoq .req q18 + Amuq .req q19 + Asaq .req q20 + Aseq .req q21 + Asiq .req q22 + Asoq .req q23 + Asuq .req q24 + + /* z-form of the above mapping */ + Abaz .req z0 + Abez .req z1 + Abiz .req z2 + Aboz .req z3 + Abuz .req z4 + Agaz .req z5 + Agez .req z6 + Agiz .req z7 + Agoz .req z8 + Aguz .req z9 + Akaz .req z10 + Akez .req z11 + Akiz .req z12 + Akoz .req z13 + Akuz .req z14 + Amaz .req z15 + Amez .req z16 + Amiz .req z17 + Amoz .req z18 + Amuz .req z19 + Asaz .req z20 + Asez .req z21 + Asiz .req z22 + Asoz .req z23 + Asuz .req z24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v27 + C1 .req v28 + C2 .req v29 + C3 .req v30 + C4 .req v31 + + C0q .req q27 + C1q .req q28 + C2q .req q29 + C3q .req q30 + C4q .req q31 + + C0z .req z27 + C1z .req z28 + C2z .req z29 + C3z .req z30 + C4z .req z31 + + /* 
A_[y,2*x+3*y] = rot(A[x,y]) */ + vBba .req v25 // fresh + vBbe .req v26 // fresh + vBbi .req Abi + vBbo .req Abo + vBbu .req Abu + vBga .req Aka + vBge .req Ake + vBgi .req Agi + vBgo .req Ago + vBgu .req Agu + vBka .req Ama + vBke .req Ame + vBki .req Aki + vBko .req Ako + vBku .req Aku + vBma .req Asa + vBme .req Ase + vBmi .req Ami + vBmo .req Amo + vBmu .req Amu + vBsa .req Aba + vBse .req Abe + vBsi .req Asi + vBso .req Aso + vBsu .req Asu + + vBbaq .req q25 // fresh + vBbeq .req q26 // fresh + vBbiq .req Abiq + vBboq .req Aboq + vBbuq .req Abuq + vBgaq .req Akaq + vBgeq .req Akeq + vBgiq .req Agiq + vBgoq .req Agoq + vBguq .req Aguq + vBkaq .req Amaq + vBkeq .req Ameq + vBkiq .req Akiq + vBkoq .req Akoq + vBkuq .req Akuq + vBmaq .req Asaq + vBmeq .req Aseq + vBmiq .req Amiq + vBmoq .req Amoq + vBmuq .req Amuq + vBsaq .req Abaq + vBseq .req Abeq + vBsiq .req Asiq + vBsoq .req Asoq + vBsuq .req Asuq + + vBbaz .req z25 // fresh + vBbez .req z26 // fresh + vBbiz .req Abiz + vBboz .req Aboz + vBbuz .req Abuz + vBgaz .req Akaz + vBgez .req Akez + vBgiz .req Agiz + vBgoz .req Agoz + vBguz .req Aguz + vBkaz .req Amaz + vBkez .req Amez + vBkiz .req Akiz + vBkoz .req Akoz + vBkuz .req Akuz + vBmaz .req Asaz + vBmez .req Asez + vBmiz .req Amiz + vBmoz .req Amoz + vBmuz .req Amuz + vBsaz .req Abaz + vBsez .req Abez + vBsiz .req Asiz + vBsoz .req Asoz + vBsuz .req Asuz + + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req C4 + E1 .req C0 + E2 .req vBbe // fresh + E3 .req C2 + E4 .req C3 + + E0q .req C4q + E1q .req C0q + E2q .req vBbeq // fresh + E3q .req C2q + E4q .req C3q + + E0z .req C4z + E1z .req C0z + E2z .req vBbez // fresh + E3z .req C2z + E4z .req C3z + + + +/************************ MACROS ****************************/ + +.macro load_input + ldp Abaq, Abeq, [input_addr, #(2*8*0)] + ldp Abiq, Aboq, [input_addr, #(2*8*2)] + ldp Abuq, Agaq, [input_addr, #(2*8*4)] + ldp Ageq, Agiq, [input_addr, #(2*8*6)] + ldp Agoq, Aguq, [input_addr, #(2*8*8)] + ldp 
Akaq, Akeq, [input_addr, #(2*8*10)] + ldp Akiq, Akoq, [input_addr, #(2*8*12)] + ldp Akuq, Amaq, [input_addr, #(2*8*14)] + ldp Ameq, Amiq, [input_addr, #(2*8*16)] + ldp Amoq, Amuq, [input_addr, #(2*8*18)] + ldp Asaq, Aseq, [input_addr, #(2*8*20)] + ldp Asiq, Asoq, [input_addr, #(2*8*22)] + ldr Asuq, [input_addr, #(2*8*24)] +.endm + +.macro store_input + str Abaq, [input_addr, #(2*8*0)] + str Abeq, [input_addr, #(2*8*1)] + str Abiq, [input_addr, #(2*8*2)] + str Aboq, [input_addr, #(2*8*3)] + str Abuq, [input_addr, #(2*8*4)] + str Agaq, [input_addr, #(2*8*5)] + str Ageq, [input_addr, #(2*8*6)] + str Agiq, [input_addr, #(2*8*7)] + str Agoq, [input_addr, #(2*8*8)] + str Aguq, [input_addr, #(2*8*9)] + str Akaq, [input_addr, #(2*8*10)] + str Akeq, [input_addr, #(2*8*11)] + str Akiq, [input_addr, #(2*8*12)] + str Akoq, [input_addr, #(2*8*13)] + str Akuq, [input_addr, #(2*8*14)] + str Amaq, [input_addr, #(2*8*15)] + str Ameq, [input_addr, #(2*8*16)] + str Amiq, [input_addr, #(2*8*17)] + str Amoq, [input_addr, #(2*8*18)] + str Amuq, [input_addr, #(2*8*19)] + str Asaq, [input_addr, #(2*8*20)] + str Aseq, [input_addr, #(2*8*21)] + str Asiq, [input_addr, #(2*8*22)] + str Asoq, [input_addr, #(2*8*23)] + str Asuq, [input_addr, #(2*8*24)] +.endm + +#define STACK_SIZE (16*4 + 16*34) +#define STACK_BASE_VREGS 0 +#define STACK_BASE_TMP 16*4 + +#define Aga_offset 0 +#define E0_offset 1 +#define E1_offset 2 +#define E2_offset 3 +#define E3_offset 4 +#define E4_offset 5 +#define Ame_offset 7 +#define Agi_offset 8 +#define Aka_offset 9 +#define Abo_offset 10 +#define Amo_offset 11 +#define Ami_offset 12 +#define Ake_offset 13 +#define Agu_offset 14 +#define Asi_offset 15 +#define Aku_offset 16 +#define Asa_offset 17 +#define Abu_offset 18 +#define Asu_offset 19 +#define Ase_offset 20 +//#define Aga_offset 21 +#define Age_offset 22 +#define vBgo_offset 23 +#define vBke_offset 24 +#define vBgi_offset 25 +#define vBga_offset 26 +#define vBbo_offset 27 +#define vBmo_offset 28 +#define 
vBmi_offset 29 +#define vBge_offset 30 + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +/* Macros using v8.4-A SHA-3 instructions */ + +.macro eor3_m1_0 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m1_1 d s0 s1 s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro eor3_m1 d s0 s1 s2 + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +.macro rax1_m1 d s0 s1 + // Use add instead of SHL #1 + add tmp.2d, \s1\().2d, \s1\().2d + sri tmp.2d, \s1\().2d, #63 + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +xar_m1_const: + .quad (1ULL<<(64-61)) + .quad (1ULL<<(64-56)) + .quad (1ULL<<(64-50)) + .quad (1ULL<<(64-46)) + .quad (1ULL<<(64-44)) + .quad (1ULL<<(64-43)) + .quad (1ULL<<(64-39)) + .quad (1ULL<<(64-36)) + .quad (1ULL<<(64-21)) + .quad (1ULL<<(64-19)) + .quad (1ULL<<(64-9)) + .quad (1ULL<<(64-3)) + + +xar_m1_const_addr: .quad xar_m1_const + + .macro xar_m1 d s0 s1 imm + // Special cases where we can replace SHLs by ADDs + .if \imm == 21 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #64] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[0] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 39 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #48] + mul 
\d\()z\().d, \s0\()z\().d, \d\()z\().d[0] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 56 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[1] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + .elseif \imm == 9 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #80] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[0] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 19 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #64] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[1] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 61 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[0] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 36 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #48] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[1] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 43 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #32] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[1] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 44 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #32] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[0] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 3 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #80] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[1] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 46 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #16] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[1] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 50 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #16] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[0] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 62 + eor \s0\().16b, \s0\().16b, \s1\().16b + add \d\().2d, \s0\().2d, \s0\().2d + 
add \d\().2d, \d\().2d, \d\().2d + sri \d\().2d, \s0\().2d, #(62) + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + +.macro bcax_m1 d s0 s1 s2 + bic tmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +/* Keccak-f1600 round */ + +.macro keccak_f1600_round_pre + + /* 10 EOR3, so 20 individual EOR */ + + eor3_m1_0 C1, Abe, Age, Ake + eor3_m1_0 C3, Abo, Ago, Ako + eor3_m1_0 C0, Aba, Aga, Aka + eor3_m1_0 C2, Abi, Agi, Aki + eor3_m1_0 C4, Abu, Agu, Aku + eor3_m1_1 C1, Abe, Age, Ake + eor3_m1_1 C3, Abo, Ago, Ako + eor3_m1_1 C0, Aba, Aga, Aka + eor3_m1_1 C2, Abi, Agi, Aki + eor3_m1_1 C4, Abu, Agu, Aku + eor3_m1_0 C1, C1, Ame, Ase + eor3_m1_0 C3, C3, Amo, Aso + eor3_m1_0 C0, C0, Ama, Asa + eor3_m1_0 C2, C2, Ami, Asi + eor3_m1_0 C4, C4, Amu, Asu + eor3_m1_1 C1, C1, Ame, Ase + eor3_m1_1 C3, C3, Amo, Aso + eor3_m1_1 C0, C0, Ama, Asa + eor3_m1_1 C2, C2, Ami, Asi + eor3_m1_1 C4, C4, Amu, Asu + +.endm + +.macro keccak_f1600_round + + /* 10 EOR3, so 20 individual EOR */ + + eor3_m1_0 C0, Aba, Aga, Aka + eor3_m1_0 C1, Abe, Age, Ake + eor3_m1_0 C2, Abi, Agi, Aki + eor3_m1_0 C3, Abo, Ago, Ako + eor3_m1_0 C4, Abu, Agu, Aku + eor3_m1_1 C0, Aba, Aga, Aka + eor3_m1_1 C1, Abe, Age, Ake + eor3_m1_1 C2, Abi, Agi, Aki + eor3_m1_1 C3, Abo, Ago, Ako + eor3_m1_1 C4, Abu, Agu, Aku + eor3_m1_0 C0, C0, Ama, Asa + eor3_m1_0 C1, C1, Ame, Ase + eor3_m1_0 C2, C2, Ami, Asi + eor3_m1_0 C3, C3, Amo, Aso + eor3_m1_0 C4, C4, Amu, Asu + eor3_m1_1 C0, C0, Ama, Asa + eor3_m1_1 C1, C1, Ame, Ase + eor3_m1_1 C2, C2, Ami, Asi + eor3_m1_1 C3, C3, Amo, Aso + eor3_m1_1 C4, C4, Amu, Asu + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + .unreq tmp + + /* 25x XAR, 75 in total */ + + tmp .req C1 + tmpq .req C1q + + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, 
Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m1 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + xar_m1 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m1 vBme, Aga, E0, 28 + xar_m1 vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Aga, vBga, vBgi, vBge + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + bcax_m1 Aku, vBku, vBke, vBka + bcax_m1 Ama, vBma, vBmi, vBme + bcax_m1 Ame, vBme, vBmo, vBmi + bcax_m1 Ami, vBmi, vBmu, vBmo + bcax_m1 Amo, vBmo, vBma, vBmu + bcax_m1 Amu, vBmu, vBme, vBma + bcax_m1 Asa, vBsa, vBsi, vBse + bcax_m1 Ase, vBse, vBso, vBsi + bcax_m1 Asi, vBsi, vBsu, vBso + bcax_m1 Aso, vBso, vBsa, vBsu + bcax_m1 Asu, vBsu, vBse, vBsa + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + bcax_m1 Abu, vBbu, vBbe, vBba + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + + .unreq tmp + .unreq tmpq + +.endm + +.macro keccak_f1600_round_core + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + + /* 25x XAR, 75 in total */ + + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 
vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m1 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + xar_m1 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m1 vBme, Aga, E0, 28 + xar_m1 vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + .unreq tmp + tmp .req C1 + bcax_m1 Aga, vBga, vBgi, vBge + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + .unreq tmp + + eor2 C0, Aka, Aga + save(Aga) + + tmp .req Aga + tmpq .req Agaq + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + eor2 C1, Ake, Age + bcax_m1 Aku, vBku, vBke, vBka + eor2 C2, Aki, Agi + bcax_m1 Ama, vBma, vBmi, vBme + eor2 C3, Ako, Ago + bcax_m1 Ame, vBme, vBmo, vBmi + eor2 C4, Aku, Agu + bcax_m1 Ami, vBmi, vBmu, vBmo + eor2 C0, C0, Ama + bcax_m1 Amo, vBmo, vBma, vBmu + eor2 C1, C1, Ame + bcax_m1 Amu, vBmu, vBme, vBma + eor2 C2, C2, Ami + bcax_m1 Asa, vBsa, vBsi, vBse + eor2 C3, C3, Amo + bcax_m1 Ase, vBse, vBso, vBsi + eor2 C4, C4, Amu + bcax_m1 Asi, vBsi, vBsu, vBso + eor2 C0, C0, Asa + bcax_m1 Aso, vBso, vBsa, vBsu + eor2 C1, C1, Ase + bcax_m1 Asu, vBsu, vBse, vBsa + eor2 C2, C2, Asi + eor2 C3, C3, Aso + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + eor2 C1, C1, Abe + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + eor2 C4, C4, Asu + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + eor2 C3, C3, Abo + eor2 C2, C2, Abi + eor2 C0, C0, Aba + bcax_m1 Abu, vBbu, vBbe, vBba + eor2 C4, C4, Abu + + restore(Aga) + .unreq tmp + .unreq tmpq + +.endm + +.macro 
keccak_f1600_round_post + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + .unreq tmp + + /* 25x XAR, 75 in total */ + + tmp .req C1 + tmpq .req C1q + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m1 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + xar_m1 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m1 vBme, Aga, E0, 28 + xar_m1 vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Aga, vBga, vBgi, vBge + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + bcax_m1 Aku, vBku, vBke, vBka + bcax_m1 Ama, vBma, vBmi, vBme + bcax_m1 Ame, vBme, vBmo, vBmi + bcax_m1 Ami, vBmi, vBmu, vBmo + bcax_m1 Amo, vBmo, vBma, vBmu + bcax_m1 Amu, vBmu, vBme, vBma + bcax_m1 Asa, vBsa, vBsi, vBse + bcax_m1 Ase, vBse, vBso, vBsi + bcax_m1 Asi, vBsi, vBsu, vBso + bcax_m1 Aso, vBso, vBsa, vBsu + bcax_m1 Asu, vBsu, vBse, vBsa + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + bcax_m1 Abu, vBbu, vBbe, vBba + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + + .unreq tmp + +.endm + + +.text +.align 4 +.global keccak_f1600_x2_v84a_asm_v2pp3 +.global 
_keccak_f1600_x2_v84a_asm_v2pp3 + +#define KECCAK_F1600_ROUNDS 24 + +keccak_f1600_x2_v84a_asm_v2pp3: +_keccak_f1600_x2_v84a_asm_v2pp3: + alloc_stack + save_vregs + load_constant_ptr + load_input + + ldr x17, xar_m1_const_addr + + //mov count, #(KECCAK_F1600_ROUNDS-2) + mov count, #11 + keccak_f1600_round_pre +loop: + keccak_f1600_round_core + keccak_f1600_round_core + sub count, count, #1 + cbnz count, loop + + keccak_f1600_round_core + keccak_f1600_round_post + store_input + restore_vregs + free_stack + ret +#endif diff --git a/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2pp4.s b/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2pp4.s new file mode 100644 index 0000000..60a859e --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2pp4.s @@ -0,0 +1,797 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .align(8) +_round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 
0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Kecck-f1600 state to vector registers + * at the beginning and end of each round. */ + Aba .req v0 + Abe .req v1 + Abi .req v2 + Abo .req v3 + Abu .req v4 + Aga .req v5 + Age .req v6 + Agi .req v7 + Ago .req v8 + Agu .req v9 + Aka .req v10 + Ake .req v11 + Aki .req v12 + Ako .req v13 + Aku .req v14 + Ama .req v15 + Ame .req v16 + Ami .req v17 + Amo .req v18 + Amu .req v19 + Asa .req v20 + Ase .req v21 + Asi .req v22 + Aso .req v23 + Asu .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Abiq .req q2 + Aboq .req q3 + Abuq .req q4 + Agaq .req q5 + Ageq .req q6 + Agiq .req q7 + Agoq .req q8 + Aguq .req q9 + Akaq .req q10 + Akeq .req q11 + Akiq .req q12 + Akoq .req q13 + Akuq .req q14 + Amaq .req q15 + Ameq .req q16 + Amiq .req q17 + Amoq .req q18 + Amuq .req q19 + Asaq .req q20 + Aseq .req q21 + Asiq .req q22 + Asoq .req q23 + Asuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v27 + C1 .req v28 + C2 .req v29 + C3 .req v30 + C4 .req v31 + + C0q .req q27 + C1q .req q28 + C2q .req q29 + C3q .req q30 + C4q .req q31 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vBba .req v25 // fresh + vBbe .req v26 // fresh + vBbi .req Abi + vBbo .req Abo + vBbu .req Abu + vBga .req Aka + vBge .req Ake + vBgi .req Agi + vBgo .req Ago + vBgu .req Agu + vBka .req Ama + vBke .req Ame + vBki .req Aki + vBko .req 
Ako + vBku .req Aku + vBma .req Asa + vBme .req Ase + vBmi .req Ami + vBmo .req Amo + vBmu .req Amu + vBsa .req Aba + vBse .req Abe + vBsi .req Asi + vBso .req Aso + vBsu .req Asu + + vBbaq .req q25 // fresh + vBbeq .req q26 // fresh + vBbiq .req Abiq + vBboq .req Aboq + vBbuq .req Abuq + vBgaq .req Akaq + vBgeq .req Akeq + vBgiq .req Agiq + vBgoq .req Agoq + vBguq .req Aguq + vBkaq .req Amaq + vBkeq .req Ameq + vBkiq .req Akiq + vBkoq .req Akoq + vBkuq .req Akuq + vBmaq .req Asaq + vBmeq .req Aseq + vBmiq .req Amiq + vBmoq .req Amoq + vBmuq .req Amuq + vBsaq .req Abaq + vBseq .req Abeq + vBsiq .req Asiq + vBsoq .req Asoq + vBsuq .req Asuq + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req C4 + E1 .req C0 + E2 .req vBbe // fresh + E3 .req C2 + E4 .req C3 + + E0q .req C4q + E1q .req C0q + E2q .req vBbeq // fresh + E3q .req C2q + E4q .req C3q + + +/************************ MACROS ****************************/ + +.macro load_input + ldp Abaq, Abeq, [input_addr, #(2*8*0)] + ldp Abiq, Aboq, [input_addr, #(2*8*2)] + ldp Abuq, Agaq, [input_addr, #(2*8*4)] + ldp Ageq, Agiq, [input_addr, #(2*8*6)] + ldp Agoq, Aguq, [input_addr, #(2*8*8)] + ldp Akaq, Akeq, [input_addr, #(2*8*10)] + ldp Akiq, Akoq, [input_addr, #(2*8*12)] + ldp Akuq, Amaq, [input_addr, #(2*8*14)] + ldp Ameq, Amiq, [input_addr, #(2*8*16)] + ldp Amoq, Amuq, [input_addr, #(2*8*18)] + ldp Asaq, Aseq, [input_addr, #(2*8*20)] + ldp Asiq, Asoq, [input_addr, #(2*8*22)] + ldr Asuq, [input_addr, #(2*8*24)] +.endm + +.macro store_input + str Abaq, [input_addr, #(2*8*0)] + str Abeq, [input_addr, #(2*8*1)] + str Abiq, [input_addr, #(2*8*2)] + str Aboq, [input_addr, #(2*8*3)] + str Abuq, [input_addr, #(2*8*4)] + str Agaq, [input_addr, #(2*8*5)] + str Ageq, [input_addr, #(2*8*6)] + str Agiq, [input_addr, #(2*8*7)] + str Agoq, [input_addr, #(2*8*8)] + str Aguq, [input_addr, #(2*8*9)] + str Akaq, [input_addr, #(2*8*10)] + str Akeq, [input_addr, #(2*8*11)] + str Akiq, [input_addr, #(2*8*12)] + str Akoq, 
[input_addr, #(2*8*13)] + str Akuq, [input_addr, #(2*8*14)] + str Amaq, [input_addr, #(2*8*15)] + str Ameq, [input_addr, #(2*8*16)] + str Amiq, [input_addr, #(2*8*17)] + str Amoq, [input_addr, #(2*8*18)] + str Amuq, [input_addr, #(2*8*19)] + str Asaq, [input_addr, #(2*8*20)] + str Aseq, [input_addr, #(2*8*21)] + str Asiq, [input_addr, #(2*8*22)] + str Asoq, [input_addr, #(2*8*23)] + str Asuq, [input_addr, #(2*8*24)] +.endm + +#define STACK_SIZE (16*4 + 16*34) +#define STACK_BASE_VREGS 0 +#define STACK_BASE_TMP 16*4 + +#define Aga_offset 0 +#define E0_offset 1 +#define E1_offset 2 +#define E2_offset 3 +#define E3_offset 4 +#define E4_offset 5 +#define Ame_offset 7 +#define Agi_offset 8 +#define Aka_offset 9 +#define Abo_offset 10 +#define Amo_offset 11 +#define Ami_offset 12 +#define Ake_offset 13 +#define Agu_offset 14 +#define Asi_offset 15 +#define Aku_offset 16 +#define Asa_offset 17 +#define Abu_offset 18 +#define Asu_offset 19 +#define Ase_offset 20 +//#define Aga_offset 21 +#define Age_offset 22 +#define vBgo_offset 23 +#define vBke_offset 24 +#define vBgi_offset 25 +#define vBga_offset 26 +#define vBbo_offset 27 +#define vBmo_offset 28 +#define vBmi_offset 29 +#define vBge_offset 30 + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +/* Macros using v8.4-A SHA-3 instructions */ + 
+/*
+ * Plain-Neon building blocks named after the Armv8.4-A SHA-3 instructions
+ * (EOR3, RAX1, XAR, BCAX). Only EOR / ADD / SHL / SRI / BIC are emitted.
+ * Every macro works on a full 128-bit vector, i.e. two 64-bit lanes at once.
+ *
+ * rax1_m1 and bcax_m1 write to a register alias called tmp that is not
+ * defined by these macros; the invoking code has to bind it with .req first.
+ */
+
+/* d = s0 ^ s1 */
+.macro eor2 d s0 s1
+    eor \d\().16b, \s0\().16b, \s1\().16b
+.endm
+
+/* First half of a three-way XOR: d = s0 ^ s1.
+ * s2 is accepted but not used, so both halves take the same argument list. */
+.macro eor3_m1_0 d s0 s1 s2
+    eor2 \d, \s0, \s1
+.endm
+
+/* Second half of a three-way XOR: d = d ^ s2.
+ * s0 and s1 are accepted but not used. d must still hold the result of the
+ * first half, and s2 must not have been overwritten by it. */
+.macro eor3_m1_1 d s0 s1 s2
+    eor \d\().16b, \d\().16b, \s2\().16b
+.endm
+
+/* Complete three-way XOR, d = s0 ^ s1 ^ s2, issued as the two halves
+ * back to back. */
+.macro eor3_m1 d s0 s1 s2
+    eor3_m1_0 \d, \s0, \s1, \s2
+    eor3_m1_1 \d, \s0, \s1, \s2
+.endm
+
+/* d = s0 ^ rotl64(s1, 1) in each 64-bit lane. Overwrites tmp. */
+.macro rax1_m1 d s0 s1
+    // s1 + s1 is s1 << 1; an ADD is used in place of SHL #1
+    add tmp.2d, \s1\().2d, \s1\().2d
+    // insert bit 63 of s1 as bit 0, which completes the rotation
+    sri tmp.2d, \s1\().2d, #63
+    eor \d\().16b, tmp.16b, \s0\().16b
+.endm
+
+/* XOR half of xar_m1: s0 = s0 ^ s1. The source register s0 is overwritten;
+ * d and imm are accepted but not used. */
+.macro xar_m1_0 d s0 s1 imm
+    eor \s0\().16b, \s0\().16b, \s1\().16b
+.endm
+
+/* Rotate half of xar_m1: d = ror64(s0, imm) in each 64-bit lane.
+ * s1 is accepted but not used. d is written before the last read of s0,
+ * so d and s0 must be different registers. */
+.macro xar_m1_1 d s0 s1 imm
+    // Left-shift part: for shift counts 1 and 2 use ADDs instead of SHL
+    .if \imm == 63
+        add \d\().2d, \s0\().2d, \s0\().2d
+    .elseif \imm == 62
+        add \d\().2d, \s0\().2d, \s0\().2d
+        add \d\().2d, \d\().2d, \d\().2d
+    .else
+        shl \d\().2d, \s0\().2d, #(64-\imm)
+    .endif
+    // Right-shift part, inserted below the bits produced above
+    sri \d\().2d, \s0\().2d, #(\imm)
+.endm
+
+/* d = ror64(s0 ^ s1, imm) in each 64-bit lane.
+ * Side effect: s0 is left holding s0 ^ s1. d must not be the same register
+ * as s0 (see xar_m1_1). */
+.macro xar_m1 d s0 s1 imm
+    xar_m1_0 \d, \s0, \s1, \imm
+    xar_m1_1 \d, \s0, \s1, \imm
+.endm
+
+/* d = s0 ^ (s1 & ~s2). Overwrites tmp. */
+.macro bcax_m1 d s0 s1 s2
+    bic tmp.16b, \s1\().16b, \s2\().16b
+    eor \d\().16b, tmp.16b, \s0\().16b
+.endm
+
+/* Keccak-f1600 round */ + +.macro keccak_f1600_round_pre + + /* 10 EOR3, so 20 individual EOR */ + + eor3_m1_0 C1, Abe, Age, Ake + eor3_m1_0 C3, Abo, Ago, Ako + eor3_m1_0 C0, Aba, Aga, Aka + eor3_m1_0 C2, Abi, Agi, Aki + eor3_m1_0 C4, Abu, Agu, Aku + eor3_m1_1 C1, Abe, Age, Ake + eor3_m1_1 C3, Abo, Ago, Ako + eor3_m1_1 C0, Aba, Aga, Aka + eor3_m1_1 C2, Abi, Agi, Aki + eor3_m1_1 C4, Abu, Agu, Aku + eor3_m1_0 C1, C1, Ame, Ase + eor3_m1_0 C3, C3, Amo, Aso + eor3_m1_0 C0, C0, Ama, Asa + eor3_m1_0 C2, C2, Ami, Asi + eor3_m1_0 C4, C4, Amu, Asu + eor3_m1_1 C1, C1, Ame, Ase + eor3_m1_1 C3, C3, Amo, Aso + eor3_m1_1 C0, C0, Ama, Asa + eor3_m1_1 C2, C2, Ami, Asi + eor3_m1_1 C4, C4, Amu, Asu + +.endm + +.macro keccak_f1600_round + + /* 10 EOR3, so 20 individual EOR */ + + eor3_m1_0 C0, Aba, Aga, Aka + eor3_m1_0 C1, Abe, Age, Ake + eor3_m1_0 C2, Abi, Agi, Aki + eor3_m1_0 C3, Abo, Ago, Ako + eor3_m1_0 C4, Abu, Agu, Aku + eor3_m1_1 C0, Aba, Aga, Aka + eor3_m1_1 C1, Abe, Age, Ake + eor3_m1_1 C2, Abi, Agi, Aki + eor3_m1_1 C3, Abo, Ago, Ako + eor3_m1_1 C4, Abu, Agu, Aku + eor3_m1_0 C0, C0, Ama, Asa + eor3_m1_0 C1, C1, Ame, Ase + eor3_m1_0 C2, C2, Ami, Asi + eor3_m1_0 C3, C3, Amo, Aso + eor3_m1_0 C4, C4, Amu, Asu + eor3_m1_1 C0, C0, Ama, Asa + eor3_m1_1 C1, C1, Ame, Ase + eor3_m1_1 C2, C2, Ami, Asi + eor3_m1_1 C3, C3, Amo, Aso + eor3_m1_1 C4, C4, Amu, Asu + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + .unreq tmp + + /* 25x XAR, 75 in total */ + + tmp .req C1 + tmpq .req C1q + + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m1 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m1 
vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + xar_m1 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m1 vBme, Aga, E0, 28 + xar_m1 vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Aga, vBga, vBgi, vBge + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + bcax_m1 Aku, vBku, vBke, vBka + bcax_m1 Ama, vBma, vBmi, vBme + bcax_m1 Ame, vBme, vBmo, vBmi + bcax_m1 Ami, vBmi, vBmu, vBmo + bcax_m1 Amo, vBmo, vBma, vBmu + bcax_m1 Amu, vBmu, vBme, vBma + bcax_m1 Asa, vBsa, vBsi, vBse + bcax_m1 Ase, vBse, vBso, vBsi + bcax_m1 Asi, vBsi, vBsu, vBso + bcax_m1 Aso, vBso, vBsa, vBsu + bcax_m1 Asu, vBsu, vBse, vBsa + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + bcax_m1 Abu, vBbu, vBbe, vBba + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + + .unreq tmp + .unreq tmpq + +.endm + +.macro keccak_f1600_round_core + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + + /* 25x XAR, 75 in total */ + + .unreq tmp + tmp .req C1 + tmpq .req C1q + + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m1 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m1 
vBge, Agu, E4, 44 + mov E3.16b, Aga.16b + bcax_m1 Aga, vBga, vBgi, vBge + xar_m1 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m1 vBme, E3, E0, 28 + xar_m1 vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + + .unreq tmp + .unreq tmpq + + eor2 C0, Aka, Aga + save(Aga) + + tmp .req Aga + tmpq .req Agaq + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + eor2 C1, Ake, Age + bcax_m1 Aku, vBku, vBke, vBka + eor2 C2, Aki, Agi + bcax_m1 Ama, vBma, vBmi, vBme + eor2 C3, Ako, Ago + bcax_m1 Ame, vBme, vBmo, vBmi + eor2 C4, Aku, Agu + bcax_m1 Ami, vBmi, vBmu, vBmo + eor2 C0, C0, Ama + bcax_m1 Amo, vBmo, vBma, vBmu + eor2 C1, C1, Ame + bcax_m1 Amu, vBmu, vBme, vBma + eor2 C2, C2, Ami + bcax_m1 Asa, vBsa, vBsi, vBse + eor2 C3, C3, Amo + bcax_m1 Ase, vBse, vBso, vBsi + eor2 C4, C4, Amu + bcax_m1 Asi, vBsi, vBsu, vBso + eor2 C0, C0, Asa + bcax_m1 Aso, vBso, vBsa, vBsu + eor2 C1, C1, Ase + bcax_m1 Asu, vBsu, vBse, vBsa + eor2 C2, C2, Asi + eor2 C3, C3, Aso + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + eor2 C1, C1, Abe + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + eor2 C4, C4, Asu + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + eor2 C3, C3, Abo + eor2 C2, C2, Abi + eor2 C0, C0, Aba + bcax_m1 Abu, vBbu, vBbe, vBba + eor2 C4, C4, Abu + + restore(Aga) + .unreq tmp + .unreq tmpq + +.endm + +.macro keccak_f1600_round_post + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + + /* 25x XAR, 75 in total */ + + .unreq tmp + tmp .req C1 + tmpq .req C1q + 
+ eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m1 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + mov E3.16b, Aga.16b + bcax_m1 Aga, vBga, vBgi, vBge + xar_m1 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m1 vBme, E3, E0, 28 + xar_m1 vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + bcax_m1 Aku, vBku, vBke, vBka + bcax_m1 Ama, vBma, vBmi, vBme + bcax_m1 Ame, vBme, vBmo, vBmi + bcax_m1 Ami, vBmi, vBmu, vBmo + bcax_m1 Amo, vBmo, vBma, vBmu + bcax_m1 Amu, vBmu, vBme, vBma + bcax_m1 Asa, vBsa, vBsi, vBse + bcax_m1 Ase, vBse, vBso, vBsi + bcax_m1 Asi, vBsi, vBsu, vBso + bcax_m1 Aso, vBso, vBsa, vBsu + bcax_m1 Asu, vBsu, vBse, vBsa + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + bcax_m1 Abu, vBbu, vBbe, vBba + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + + .unreq tmp + .unreq tmpq + +.endm + + +.text +.align 4 +.global keccak_f1600_x2_v84a_asm_v2pp4 +.global _keccak_f1600_x2_v84a_asm_v2pp4 + +#define KECCAK_F1600_ROUNDS 24 + +keccak_f1600_x2_v84a_asm_v2pp4: +_keccak_f1600_x2_v84a_asm_v2pp4: + alloc_stack + save_vregs + load_constant_ptr + load_input + + //mov count, #(KECCAK_F1600_ROUNDS-2) + mov 
count, #11 + keccak_f1600_round_pre +loop: + keccak_f1600_round_core + keccak_f1600_round_core + sub count, count, #1 + cbnz count, loop + + keccak_f1600_round_core + keccak_f1600_round_post + store_input + restore_vregs + free_stack + ret diff --git a/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2pp5.s b/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2pp5.s new file mode 100644 index 0000000..89571de --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2pp5.s @@ -0,0 +1,806 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .align(8) +_round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 
0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Kecck-f1600 state to vector registers + * at the beginning and end of each round. */ + Aba .req v0 + Abe .req v1 + Abi .req v2 + Abo .req v3 + Abu .req v4 + Aga .req v5 + Age .req v6 + Agi .req v7 + Ago .req v8 + Agu .req v9 + Aka .req v10 + Ake .req v11 + Aki .req v12 + Ako .req v13 + Aku .req v14 + Ama .req v15 + Ame .req v16 + Ami .req v17 + Amo .req v18 + Amu .req v19 + Asa .req v20 + Ase .req v21 + Asi .req v22 + Aso .req v23 + Asu .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Abiq .req q2 + Aboq .req q3 + Abuq .req q4 + Agaq .req q5 + Ageq .req q6 + Agiq .req q7 + Agoq .req q8 + Aguq .req q9 + Akaq .req q10 + Akeq .req q11 + Akiq .req q12 + Akoq .req q13 + Akuq .req q14 + Amaq .req q15 + Ameq .req q16 + Amiq .req q17 + Amoq .req q18 + Amuq .req q19 + Asaq .req q20 + Aseq .req q21 + Asiq .req q22 + Asoq .req q23 + Asuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v27 + C1 .req v28 + C2 .req v29 + C3 .req v30 + C4 .req v31 + + C0q .req q27 + C1q .req q28 + C2q .req q29 + C3q .req q30 + C4q .req q31 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vBba .req v25 // fresh + vBbe .req v26 // fresh + vBbi .req Abi + vBbo .req Abo + vBbu .req Abu + vBga .req Aka + vBge .req Ake + vBgi .req Agi + vBgo .req Ago + vBgu .req Agu + vBka .req Ama + vBke .req Ame + vBki .req Aki + vBko .req Ako + vBku .req Aku + vBma .req Asa + vBme .req Ase + vBmi .req Ami + vBmo .req Amo + vBmu .req Amu + vBsa .req Aba + vBse .req Abe + vBsi .req Asi + vBso .req Aso + vBsu .req Asu + + vBbaq .req q25 // fresh + vBbeq .req q26 // fresh + vBbiq .req Abiq + vBboq .req Aboq + 
vBbuq .req Abuq + vBgaq .req Akaq + vBgeq .req Akeq + vBgiq .req Agiq + vBgoq .req Agoq + vBguq .req Aguq + vBkaq .req Amaq + vBkeq .req Ameq + vBkiq .req Akiq + vBkoq .req Akoq + vBkuq .req Akuq + vBmaq .req Asaq + vBmeq .req Aseq + vBmiq .req Amiq + vBmoq .req Amoq + vBmuq .req Amuq + vBsaq .req Abaq + vBseq .req Abeq + vBsiq .req Asiq + vBsoq .req Asoq + vBsuq .req Asuq + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req C4 + E1 .req C0 + E2 .req vBbe // fresh + E3 .req C2 + E4 .req C3 + + E0q .req C4q + E1q .req C0q + E2q .req vBbeq // fresh + E3q .req C2q + E4q .req C3q + + +/************************ MACROS ****************************/ + +.macro load_input + ldp Abaq, Abeq, [input_addr, #(2*8*0)] + ldp Abiq, Aboq, [input_addr, #(2*8*2)] + ldp Abuq, Agaq, [input_addr, #(2*8*4)] + ldp Ageq, Agiq, [input_addr, #(2*8*6)] + ldp Agoq, Aguq, [input_addr, #(2*8*8)] + ldp Akaq, Akeq, [input_addr, #(2*8*10)] + ldp Akiq, Akoq, [input_addr, #(2*8*12)] + ldp Akuq, Amaq, [input_addr, #(2*8*14)] + ldp Ameq, Amiq, [input_addr, #(2*8*16)] + ldp Amoq, Amuq, [input_addr, #(2*8*18)] + ldp Asaq, Aseq, [input_addr, #(2*8*20)] + ldp Asiq, Asoq, [input_addr, #(2*8*22)] + ldr Asuq, [input_addr, #(2*8*24)] +.endm + +.macro store_input + str Abaq, [input_addr, #(2*8*0)] + str Abeq, [input_addr, #(2*8*1)] + str Abiq, [input_addr, #(2*8*2)] + str Aboq, [input_addr, #(2*8*3)] + str Abuq, [input_addr, #(2*8*4)] + str Agaq, [input_addr, #(2*8*5)] + str Ageq, [input_addr, #(2*8*6)] + str Agiq, [input_addr, #(2*8*7)] + str Agoq, [input_addr, #(2*8*8)] + str Aguq, [input_addr, #(2*8*9)] + str Akaq, [input_addr, #(2*8*10)] + str Akeq, [input_addr, #(2*8*11)] + str Akiq, [input_addr, #(2*8*12)] + str Akoq, [input_addr, #(2*8*13)] + str Akuq, [input_addr, #(2*8*14)] + str Amaq, [input_addr, #(2*8*15)] + str Ameq, [input_addr, #(2*8*16)] + str Amiq, [input_addr, #(2*8*17)] + str Amoq, [input_addr, #(2*8*18)] + str Amuq, [input_addr, #(2*8*19)] + str Asaq, [input_addr, #(2*8*20)] + 
str Aseq, [input_addr, #(2*8*21)] + str Asiq, [input_addr, #(2*8*22)] + str Asoq, [input_addr, #(2*8*23)] + str Asuq, [input_addr, #(2*8*24)] +.endm + +#define STACK_SIZE (16*4 + 16*34) +#define STACK_BASE_VREGS 0 +#define STACK_BASE_TMP 16*4 + +#define Aga_offset 0 +#define E0_offset 1 +#define E1_offset 2 +#define E2_offset 3 +#define E3_offset 4 +#define E4_offset 5 +#define Ame_offset 7 +#define Agi_offset 8 +#define Aka_offset 9 +#define Abo_offset 10 +#define Amo_offset 11 +#define Ami_offset 12 +#define Ake_offset 13 +#define Agu_offset 14 +#define Asi_offset 15 +#define Aku_offset 16 +#define Asa_offset 17 +#define Abu_offset 18 +#define Asu_offset 19 +#define Ase_offset 20 +//#define Aga_offset 21 +#define Age_offset 22 +#define vBgo_offset 23 +#define vBke_offset 24 +#define vBgi_offset 25 +#define vBga_offset 26 +#define vBbo_offset 27 +#define vBmo_offset 28 +#define vBmi_offset 29 +#define vBge_offset 30 + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +/* Macros using v8.4-A SHA-3 instructions */ + +.macro eor3_m1_0 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m1_1 d s0 s1 s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm + + +.macro eor3_m1 d s0 s1 s2 + eor3_m1_0 \d, \s0, \s1, 
\s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +.macro rax1_m1 d s0 s1 + // Use add instead of SHL #1 + add tmp.2d, \s1\().2d, \s1\().2d + sri tmp.2d, \s1\().2d, #63 + eor \d\().16b, tmp.16b, \s0\().16b +.endm + + .macro xar_m1 d s0 s1 imm + // Special cases where we can replace SHLs by ADDs + .if \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + .elseif \imm == 62 + eor \s0\().16b, \s0\().16b, \s1\().16b + add \d\().2d, \s0\().2d, \s0\().2d + add \d\().2d, \d\().2d, \d\().2d + sri \d\().2d, \s0\().2d, #(62) + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + + .macro xar_m1_0 d s0 s1 imm + // Special cases where we can replace SHLs by ADDs + .if \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + .elseif \imm == 62 + eor \s0\().16b, \s0\().16b, \s1\().16b + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + .endif +.endm + + .macro xar_m1_1 d s0 s1 imm + // Special cases where we can replace SHLs by ADDs + .if \imm == 63 + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + .elseif \imm == 62 + add \d\().2d, \s0\().2d, \s0\().2d + add \d\().2d, \d\().2d, \d\().2d + sri \d\().2d, \s0\().2d, #(62) + .else + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + +.macro bcax_m1 d s0 s1 s2 + bic tmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +/* Keccak-f1600 round */ + +.macro keccak_f1600_round_pre + + /* 10 EOR3, so 20 individual EOR */ + + eor3_m1_0 C1, Abe, Age, Ake + eor3_m1_0 C3, Abo, Ago, Ako + eor3_m1_0 C0, Aba, Aga, Aka + eor3_m1_0 C2, Abi, Agi, Aki + eor3_m1_0 C4, Abu, Agu, Aku + eor3_m1_1 C1, Abe, Age, Ake + eor3_m1_1 C3, Abo, Ago, Ako + eor3_m1_1 C0, Aba, Aga, Aka + eor3_m1_1 C2, Abi, Agi, Aki + eor3_m1_1 C4, Abu, Agu, Aku + eor3_m1_0 C1, C1, Ame, Ase + eor3_m1_0 C3, C3, Amo, Aso + eor3_m1_0 C0, C0, Ama, Asa + eor3_m1_0 
C2, C2, Ami, Asi + eor3_m1_0 C4, C4, Amu, Asu + eor3_m1_1 C1, C1, Ame, Ase + eor3_m1_1 C3, C3, Amo, Aso + eor3_m1_1 C0, C0, Ama, Asa + eor3_m1_1 C2, C2, Ami, Asi + eor3_m1_1 C4, C4, Amu, Asu + +.endm + +.macro keccak_f1600_round + + /* 10 EOR3, so 20 individual EOR */ + + eor3_m1_0 C0, Aba, Aga, Aka + eor3_m1_0 C1, Abe, Age, Ake + eor3_m1_0 C2, Abi, Agi, Aki + eor3_m1_0 C3, Abo, Ago, Ako + eor3_m1_0 C4, Abu, Agu, Aku + eor3_m1_1 C0, Aba, Aga, Aka + eor3_m1_1 C1, Abe, Age, Ake + eor3_m1_1 C2, Abi, Agi, Aki + eor3_m1_1 C3, Abo, Ago, Ako + eor3_m1_1 C4, Abu, Agu, Aku + eor3_m1_0 C0, C0, Ama, Asa + eor3_m1_0 C1, C1, Ame, Ase + eor3_m1_0 C2, C2, Ami, Asi + eor3_m1_0 C3, C3, Amo, Aso + eor3_m1_0 C4, C4, Amu, Asu + eor3_m1_1 C0, C0, Ama, Asa + eor3_m1_1 C1, C1, Ame, Ase + eor3_m1_1 C2, C2, Ami, Asi + eor3_m1_1 C3, C3, Amo, Aso + eor3_m1_1 C4, C4, Amu, Asu + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + .unreq tmp + + /* 25x XAR, 75 in total */ + + tmp .req C1 + tmpq .req C1q + + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m1 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + xar_m1 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m1 vBme, Aga, E0, 28 + xar_m1 vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Aga, vBga, vBgi, vBge + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, 
vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + bcax_m1 Aku, vBku, vBke, vBka + bcax_m1 Ama, vBma, vBmi, vBme + bcax_m1 Ame, vBme, vBmo, vBmi + bcax_m1 Ami, vBmi, vBmu, vBmo + bcax_m1 Amo, vBmo, vBma, vBmu + bcax_m1 Amu, vBmu, vBme, vBma + bcax_m1 Asa, vBsa, vBsi, vBse + bcax_m1 Ase, vBse, vBso, vBsi + bcax_m1 Asi, vBsi, vBsu, vBso + bcax_m1 Aso, vBso, vBsa, vBsu + bcax_m1 Asu, vBsu, vBse, vBsa + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + bcax_m1 Abu, vBbu, vBbe, vBba + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + + .unreq tmp + .unreq tmpq + +.endm + +.macro keccak_f1600_round_core + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + str Agiq, [sp, #(STACK_BASE_TMP + 16*32)] + rax1_m1 E0, C4, C1 + + /* 25x XAR, 75 in total */ + + .unreq tmp + tmp .req C1 + tmpq .req C1q + + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + str Agaq, [sp, #(STACK_BASE_TMP + 16 * 30)] + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + str Ageq, [sp, #(STACK_BASE_TMP + 16 * 31)] + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + bcax_m1 Aga, vBga, vBgi, vBge + + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + bcax_m1 Age, vBge, vBgo, vBgi + + ldr tmpq, [sp, #(STACK_BASE_TMP + 16*32)] + xar_m1 vBke, tmp, E2, 58 + + xar_m1 vBgu, Asi, E2, 3 + bcax_m1 Agi, vBgi, vBgu, vBgo + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + ldp tmpq, E3q, [sp, #(STACK_BASE_TMP + 
16*30)] + xar_m1 vBme, tmp, E0, 28 + xar_m1 vBbe, E3, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + + .unreq tmp + .unreq tmpq + + eor2 C0, Aka, Aga + save(Aga) + + tmp .req Aga + tmpq .req Agaq + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + eor2 C1, Ake, Age + bcax_m1 Aku, vBku, vBke, vBka + eor2 C2, Aki, Agi + bcax_m1 Ama, vBma, vBmi, vBme + eor2 C3, Ako, Ago + bcax_m1 Ame, vBme, vBmo, vBmi + eor2 C4, Aku, Agu + bcax_m1 Ami, vBmi, vBmu, vBmo + eor2 C0, C0, Ama + bcax_m1 Amo, vBmo, vBma, vBmu + eor2 C1, C1, Ame + bcax_m1 Amu, vBmu, vBme, vBma + eor2 C2, C2, Ami + bcax_m1 Asa, vBsa, vBsi, vBse + eor2 C3, C3, Amo + bcax_m1 Ase, vBse, vBso, vBsi + eor2 C4, C4, Amu + bcax_m1 Asi, vBsi, vBsu, vBso + eor2 C0, C0, Asa + bcax_m1 Aso, vBso, vBsa, vBsu + eor2 C1, C1, Ase + bcax_m1 Asu, vBsu, vBse, vBsa + eor2 C2, C2, Asi + eor2 C3, C3, Aso + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + eor2 C1, C1, Abe + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + eor2 C4, C4, Asu + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + eor2 C3, C3, Abo + eor2 C2, C2, Abi + eor2 C0, C0, Aba + bcax_m1 Abu, vBbu, vBbe, vBba + eor2 C4, C4, Abu + + restore(Aga) + .unreq tmp + .unreq tmpq + +.endm + +.macro keccak_f1600_round_post + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + str Agiq, [sp, #(STACK_BASE_TMP + 16*32)] + rax1_m1 E0, C4, C1 + + /* 25x XAR, 75 in total */ + + .unreq tmp + tmp .req C1 + tmpq .req C1q + + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + str Agaq, [sp, #(STACK_BASE_TMP + 16 * 30)] + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + str Ageq, [sp, #(STACK_BASE_TMP + 16 * 31)] + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, 
Agu, E4, 44 + bcax_m1 Aga, vBga, vBgi, vBge + + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + bcax_m1 Age, vBge, vBgo, vBgi + + ldr tmpq, [sp, #(STACK_BASE_TMP + 16*32)] + xar_m1 vBke, tmp, E2, 58 + + xar_m1 vBgu, Asi, E2, 3 + bcax_m1 Agi, vBgi, vBgu, vBgo + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + ldp tmpq, E3q, [sp, #(STACK_BASE_TMP + 16*30)] + xar_m1 vBme, tmp, E0, 28 + xar_m1 vBbe, E3, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + bcax_m1 Aku, vBku, vBke, vBka + bcax_m1 Ama, vBma, vBmi, vBme + bcax_m1 Ame, vBme, vBmo, vBmi + bcax_m1 Ami, vBmi, vBmu, vBmo + bcax_m1 Amo, vBmo, vBma, vBmu + bcax_m1 Amu, vBmu, vBme, vBma + bcax_m1 Asa, vBsa, vBsi, vBse + bcax_m1 Ase, vBse, vBso, vBsi + bcax_m1 Asi, vBsi, vBsu, vBso + bcax_m1 Aso, vBso, vBsa, vBsu + bcax_m1 Asu, vBsu, vBse, vBsa + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + bcax_m1 Abu, vBbu, vBbe, vBba + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + + .unreq tmp + .unreq tmpq + +.endm + + +.text +.align 4 +.global keccak_f1600_x2_v84a_asm_v2pp5 +.global _keccak_f1600_x2_v84a_asm_v2pp5 + +#define KECCAK_F1600_ROUNDS 24 + +keccak_f1600_x2_v84a_asm_v2pp5: +_keccak_f1600_x2_v84a_asm_v2pp5: + alloc_stack + save_vregs + load_constant_ptr + load_input + + //mov count, #(KECCAK_F1600_ROUNDS-2) + mov count, #11 + keccak_f1600_round_pre +loop: + keccak_f1600_round_core + 
keccak_f1600_round_core + sub count, count, #1 + cbnz count, loop + + keccak_f1600_round_core + keccak_f1600_round_post + store_input + restore_vregs + free_stack + ret diff --git a/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2pp6.s b/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2pp6.s new file mode 100644 index 0000000..213f214 --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2pp6.s @@ -0,0 +1,917 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#if defined(__ARM_FEATURE_SVE2) +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .align(8) +_round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x800000008000000a + .quad 
0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. */ + Aba .req v0 + Abe .req v1 + Abi .req v2 + Abo .req v3 + Abu .req v4 + Aga .req v5 + Age .req v6 + Agi .req v7 + Ago .req v8 + Agu .req v9 + Aka .req v10 + Ake .req v11 + Aki .req v12 + Ako .req v13 + Aku .req v14 + Ama .req v15 + Ame .req v16 + Ami .req v17 + Amo .req v18 + Amu .req v19 + Asa .req v20 + Ase .req v21 + Asi .req v22 + Aso .req v23 + Asu .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Abiq .req q2 + Aboq .req q3 + Abuq .req q4 + Agaq .req q5 + Ageq .req q6 + Agiq .req q7 + Agoq .req q8 + Aguq .req q9 + Akaq .req q10 + Akeq .req q11 + Akiq .req q12 + Akoq .req q13 + Akuq .req q14 + Amaq .req q15 + Ameq .req q16 + Amiq .req q17 + Amoq .req q18 + Amuq .req q19 + Asaq .req q20 + Aseq .req q21 + Asiq .req q22 + Asoq .req q23 + Asuq .req q24 + + /* z-form of the above mapping */ + Abaz .req z0 + Abez .req z1 + Abiz .req z2 + Aboz .req z3 + Abuz .req z4 + Agaz .req z5 + Agez .req z6 + Agiz .req z7 + Agoz .req z8 + Aguz .req z9 + Akaz .req z10 + Akez .req z11 + Akiz .req z12 + Akoz .req z13 + Akuz .req z14 + Amaz .req z15 + Amez .req z16 + Amiz .req z17 + Amoz .req z18 + Amuz .req z19 + Asaz .req z20 + Asez .req z21 + Asiz .req z22 + Asoz .req z23 + Asuz .req z24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v27 + C1 .req v28 + C2 .req v29 + C3 .req v30 + C4 .req v31 + + C0q .req q27 + C1q .req q28 + C2q .req q29 + C3q .req q30 + C4q .req q31 + + C0z .req z27 + C1z .req z28 + C2z .req z29 + C3z .req z30 + C4z .req z31 + + /* 
A_[y,2*x+3*y] = rot(A[x,y]) */ + vBba .req v25 // fresh + vBbe .req v26 // fresh + vBbi .req Abi + vBbo .req Abo + vBbu .req Abu + vBga .req Aka + vBge .req Ake + vBgi .req Agi + vBgo .req Ago + vBgu .req Agu + vBka .req Ama + vBke .req Ame + vBki .req Aki + vBko .req Ako + vBku .req Aku + vBma .req Asa + vBme .req Ase + vBmi .req Ami + vBmo .req Amo + vBmu .req Amu + vBsa .req Aba + vBse .req Abe + vBsi .req Asi + vBso .req Aso + vBsu .req Asu + + vBbaq .req q25 // fresh + vBbeq .req q26 // fresh + vBbiq .req Abiq + vBboq .req Aboq + vBbuq .req Abuq + vBgaq .req Akaq + vBgeq .req Akeq + vBgiq .req Agiq + vBgoq .req Agoq + vBguq .req Aguq + vBkaq .req Amaq + vBkeq .req Ameq + vBkiq .req Akiq + vBkoq .req Akoq + vBkuq .req Akuq + vBmaq .req Asaq + vBmeq .req Aseq + vBmiq .req Amiq + vBmoq .req Amoq + vBmuq .req Amuq + vBsaq .req Abaq + vBseq .req Abeq + vBsiq .req Asiq + vBsoq .req Asoq + vBsuq .req Asuq + + vBbaz .req z25 // fresh + vBbez .req z26 // fresh + vBbiz .req Abiz + vBboz .req Aboz + vBbuz .req Abuz + vBgaz .req Akaz + vBgez .req Akez + vBgiz .req Agiz + vBgoz .req Agoz + vBguz .req Aguz + vBkaz .req Amaz + vBkez .req Amez + vBkiz .req Akiz + vBkoz .req Akoz + vBkuz .req Akuz + vBmaz .req Asaz + vBmez .req Asez + vBmiz .req Amiz + vBmoz .req Amoz + vBmuz .req Amuz + vBsaz .req Abaz + vBsez .req Abez + vBsiz .req Asiz + vBsoz .req Asoz + vBsuz .req Asuz + + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req C4 + E1 .req C0 + E2 .req vBbe // fresh + E3 .req C2 + E4 .req C3 + + E0q .req C4q + E1q .req C0q + E2q .req vBbeq // fresh + E3q .req C2q + E4q .req C3q + + E0z .req C4z + E1z .req C0z + E2z .req vBbez // fresh + E3z .req C2z + E4z .req C3z + + + +/************************ MACROS ****************************/ + +.macro load_input + ldp Abaq, Abeq, [input_addr, #(2*8*0)] + ldp Abiq, Aboq, [input_addr, #(2*8*2)] + ldp Abuq, Agaq, [input_addr, #(2*8*4)] + ldp Ageq, Agiq, [input_addr, #(2*8*6)] + ldp Agoq, Aguq, [input_addr, #(2*8*8)] + ldp 
Akaq, Akeq, [input_addr, #(2*8*10)] + ldp Akiq, Akoq, [input_addr, #(2*8*12)] + ldp Akuq, Amaq, [input_addr, #(2*8*14)] + ldp Ameq, Amiq, [input_addr, #(2*8*16)] + ldp Amoq, Amuq, [input_addr, #(2*8*18)] + ldp Asaq, Aseq, [input_addr, #(2*8*20)] + ldp Asiq, Asoq, [input_addr, #(2*8*22)] + ldr Asuq, [input_addr, #(2*8*24)] +.endm + +.macro store_input + str Abaq, [input_addr, #(2*8*0)] + str Abeq, [input_addr, #(2*8*1)] + str Abiq, [input_addr, #(2*8*2)] + str Aboq, [input_addr, #(2*8*3)] + str Abuq, [input_addr, #(2*8*4)] + str Agaq, [input_addr, #(2*8*5)] + str Ageq, [input_addr, #(2*8*6)] + str Agiq, [input_addr, #(2*8*7)] + str Agoq, [input_addr, #(2*8*8)] + str Aguq, [input_addr, #(2*8*9)] + str Akaq, [input_addr, #(2*8*10)] + str Akeq, [input_addr, #(2*8*11)] + str Akiq, [input_addr, #(2*8*12)] + str Akoq, [input_addr, #(2*8*13)] + str Akuq, [input_addr, #(2*8*14)] + str Amaq, [input_addr, #(2*8*15)] + str Ameq, [input_addr, #(2*8*16)] + str Amiq, [input_addr, #(2*8*17)] + str Amoq, [input_addr, #(2*8*18)] + str Amuq, [input_addr, #(2*8*19)] + str Asaq, [input_addr, #(2*8*20)] + str Aseq, [input_addr, #(2*8*21)] + str Asiq, [input_addr, #(2*8*22)] + str Asoq, [input_addr, #(2*8*23)] + str Asuq, [input_addr, #(2*8*24)] +.endm + +#define STACK_SIZE (16*4 + 16*34) +#define STACK_BASE_VREGS 0 +#define STACK_BASE_TMP 16*4 + +#define Aga_offset 0 +#define E0_offset 1 +#define E1_offset 2 +#define E2_offset 3 +#define E3_offset 4 +#define E4_offset 5 +#define Ame_offset 7 +#define Agi_offset 8 +#define Aka_offset 9 +#define Abo_offset 10 +#define Amo_offset 11 +#define Ami_offset 12 +#define Ake_offset 13 +#define Agu_offset 14 +#define Asi_offset 15 +#define Aku_offset 16 +#define Asa_offset 17 +#define Abu_offset 18 +#define Asu_offset 19 +#define Ase_offset 20 +//#define Aga_offset 21 +#define Age_offset 22 +#define vBgo_offset 23 +#define vBke_offset 24 +#define vBgi_offset 25 +#define vBga_offset 26 +#define vBbo_offset 27 +#define vBmo_offset 28 +#define 
vBmi_offset 29 +#define vBge_offset 30 + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +/* Macros using v8.4-A SHA-3 instructions */ + +.macro eor3_m1_0 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m1_1 d s0 s1 s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro eor3_m1 d s0 s1 s2 + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +.macro rax1_m1 d s0 s1 + // Use add instead of SHL #1 + add tmp.2d, \s1\().2d, \s1\().2d + sri tmp.2d, \s1\().2d, #63 + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +xar_m1_const: + .quad (1ULL<<(64-61)) + .quad (1ULL<<(64-56)) + .quad (1ULL<<(64-50)) + .quad (1ULL<<(64-46)) + .quad (1ULL<<(64-44)) + .quad (1ULL<<(64-43)) + .quad (1ULL<<(64-39)) + .quad (1ULL<<(64-36)) + .quad (1ULL<<(64-21)) + .quad (1ULL<<(64-19)) + .quad (1ULL<<(64-9)) + .quad (1ULL<<(64-3)) + + +xar_m1_const_addr: .quad xar_m1_const + + .macro xar_m1 d s0 s1 imm + // Special cases where we can replace SHLs by ADDs + .if \imm == 21 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #64] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[0] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 39 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #48] + mul 
\d\()z\().d, \s0\()z\().d, \d\()z\().d[0] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 56 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[1] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + .elseif \imm == 9 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #80] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[0] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 19 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #64] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[1] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 61 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[0] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 36 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #48] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[1] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 43 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #32] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[1] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 44 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #32] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[0] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 3 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #80] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[1] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 46 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #16] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[1] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 50 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #16] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[0] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 62 + eor \s0\().16b, \s0\().16b, \s1\().16b + add \d\().2d, \s0\().2d, \s0\().2d + 
add \d\().2d, \d\().2d, \d\().2d + sri \d\().2d, \s0\().2d, #(62) + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + +.macro bcax_m1 d s0 s1 s2 + bic tmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +/* Keccak-f1600 round */ + +.macro keccak_f1600_round_pre + + /* 10 EOR3, so 20 individual EOR */ + + eor3_m1_0 C1, Abe, Age, Ake + eor3_m1_0 C3, Abo, Ago, Ako + eor3_m1_0 C0, Aba, Aga, Aka + eor3_m1_0 C2, Abi, Agi, Aki + eor3_m1_0 C4, Abu, Agu, Aku + eor3_m1_1 C1, Abe, Age, Ake + eor3_m1_1 C3, Abo, Ago, Ako + eor3_m1_1 C0, Aba, Aga, Aka + eor3_m1_1 C2, Abi, Agi, Aki + eor3_m1_1 C4, Abu, Agu, Aku + eor3_m1_0 C1, C1, Ame, Ase + eor3_m1_0 C3, C3, Amo, Aso + eor3_m1_0 C0, C0, Ama, Asa + eor3_m1_0 C2, C2, Ami, Asi + eor3_m1_0 C4, C4, Amu, Asu + eor3_m1_1 C1, C1, Ame, Ase + eor3_m1_1 C3, C3, Amo, Aso + eor3_m1_1 C0, C0, Ama, Asa + eor3_m1_1 C2, C2, Ami, Asi + eor3_m1_1 C4, C4, Amu, Asu + +.endm + +.macro keccak_f1600_round + + /* 10 EOR3, so 20 individual EOR */ + + eor3_m1_0 C0, Aba, Aga, Aka + eor3_m1_0 C1, Abe, Age, Ake + eor3_m1_0 C2, Abi, Agi, Aki + eor3_m1_0 C3, Abo, Ago, Ako + eor3_m1_0 C4, Abu, Agu, Aku + eor3_m1_1 C0, Aba, Aga, Aka + eor3_m1_1 C1, Abe, Age, Ake + eor3_m1_1 C2, Abi, Agi, Aki + eor3_m1_1 C3, Abo, Ago, Ako + eor3_m1_1 C4, Abu, Agu, Aku + eor3_m1_0 C0, C0, Ama, Asa + eor3_m1_0 C1, C1, Ame, Ase + eor3_m1_0 C2, C2, Ami, Asi + eor3_m1_0 C3, C3, Amo, Aso + eor3_m1_0 C4, C4, Amu, Asu + eor3_m1_1 C0, C0, Ama, Asa + eor3_m1_1 C1, C1, Ame, Ase + eor3_m1_1 C2, C2, Ami, Asi + eor3_m1_1 C3, C3, Amo, Aso + eor3_m1_1 C4, C4, Amu, Asu + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + .unreq tmp + + /* 25x XAR, 75 in total */ + + tmp .req C1 + tmpq .req C1q + + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, 
Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m1 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + xar_m1 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m1 vBme, Aga, E0, 28 + xar_m1 vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Aga, vBga, vBgi, vBge + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + bcax_m1 Aku, vBku, vBke, vBka + bcax_m1 Ama, vBma, vBmi, vBme + bcax_m1 Ame, vBme, vBmo, vBmi + bcax_m1 Ami, vBmi, vBmu, vBmo + bcax_m1 Amo, vBmo, vBma, vBmu + bcax_m1 Amu, vBmu, vBme, vBma + bcax_m1 Asa, vBsa, vBsi, vBse + bcax_m1 Ase, vBse, vBso, vBsi + bcax_m1 Asi, vBsi, vBsu, vBso + bcax_m1 Aso, vBso, vBsa, vBsu + bcax_m1 Asu, vBsu, vBse, vBsa + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + bcax_m1 Abu, vBbu, vBbe, vBba + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + + .unreq tmp + .unreq tmpq + +.endm + +.macro keccak_f1600_round_core + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + str Agiq, [sp, #(STACK_BASE_TMP + 16*32)] + rax1_m1 E0, C4, C1 + + /* 25x XAR, 75 in total */ + + .unreq tmp + tmp .req C1 + tmpq .req C1q + + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + str Agaq, [sp, 
#(STACK_BASE_TMP + 16 * 30)] + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + str Ageq, [sp, #(STACK_BASE_TMP + 16 * 31)] + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + bcax_m1 Aga, vBga, vBgi, vBge + + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + bcax_m1 Age, vBge, vBgo, vBgi + + ldr tmpq, [sp, #(STACK_BASE_TMP + 16*32)] + xar_m1 vBke, tmp, E2, 58 + + xar_m1 vBgu, Asi, E2, 3 + bcax_m1 Agi, vBgi, vBgu, vBgo + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + ldp tmpq, E3q, [sp, #(STACK_BASE_TMP + 16*30)] + xar_m1 vBme, tmp, E0, 28 + xar_m1 vBbe, E3, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + + .unreq tmp + .unreq tmpq + + eor2 C0, Aka, Aga + save(Aga) + + tmp .req Aga + tmpq .req Agaq + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + eor2 C1, Ake, Age + bcax_m1 Aku, vBku, vBke, vBka + eor2 C2, Aki, Agi + bcax_m1 Ama, vBma, vBmi, vBme + eor2 C3, Ako, Ago + bcax_m1 Ame, vBme, vBmo, vBmi + eor2 C4, Aku, Agu + bcax_m1 Ami, vBmi, vBmu, vBmo + eor2 C0, C0, Ama + bcax_m1 Amo, vBmo, vBma, vBmu + eor2 C1, C1, Ame + bcax_m1 Amu, vBmu, vBme, vBma + eor2 C2, C2, Ami + bcax_m1 Asa, vBsa, vBsi, vBse + eor2 C3, C3, Amo + bcax_m1 Ase, vBse, vBso, vBsi + eor2 C4, C4, Amu + bcax_m1 Asi, vBsi, vBsu, vBso + eor2 C0, C0, Asa + bcax_m1 Aso, vBso, vBsa, vBsu + eor2 C1, C1, Ase + bcax_m1 Asu, vBsu, vBse, vBsa + eor2 C2, C2, Asi + eor2 C3, C3, Aso + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + eor2 C1, C1, Abe + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, 
Aba.16b, tmp.16b + eor2 C4, C4, Asu + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + eor2 C3, C3, Abo + eor2 C2, C2, Abi + eor2 C0, C0, Aba + bcax_m1 Abu, vBbu, vBbe, vBba + eor2 C4, C4, Abu + + restore(Aga) + .unreq tmp + .unreq tmpq + +.endm + +.macro keccak_f1600_round_post + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + .unreq tmp + + /* 25x XAR, 75 in total */ + + tmp .req C1 + tmpq .req C1q + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m1 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + xar_m1 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m1 vBme, Aga, E0, 28 + xar_m1 vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Aga, vBga, vBgi, vBge + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + bcax_m1 Aku, vBku, vBke, vBka + bcax_m1 Ama, vBma, vBmi, vBme + bcax_m1 Ame, vBme, vBmo, vBmi + bcax_m1 Ami, vBmi, vBmu, vBmo + bcax_m1 Amo, vBmo, vBma, vBmu + bcax_m1 Amu, vBmu, vBme, vBma + bcax_m1 Asa, vBsa, vBsi, vBse + bcax_m1 Ase, vBse, vBso, vBsi + bcax_m1 Asi, vBsi, vBsu, vBso + bcax_m1 Aso, vBso, vBsa, vBsu + bcax_m1 Asu, vBsu, vBse, vBsa + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 
Abo, vBbo, vBba, vBbu + bcax_m1 Abu, vBbu, vBbe, vBba + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + + .unreq tmp + +.endm + + +.text +.align 4 +.global keccak_f1600_x2_v84a_asm_v2pp6 +.global _keccak_f1600_x2_v84a_asm_v2pp6 + +#define KECCAK_F1600_ROUNDS 24 + +keccak_f1600_x2_v84a_asm_v2pp6: +_keccak_f1600_x2_v84a_asm_v2pp6: + alloc_stack + save_vregs + load_constant_ptr + load_input + + ldr x17, xar_m1_const_addr + + // count = (KECCAK_F1600_ROUNDS-2)/2 = 11: each loop iteration runs two rounds, the last two rounds follow the loop + mov count, #11 + keccak_f1600_round_pre +loop: + keccak_f1600_round_core + keccak_f1600_round_core + sub count, count, #1 + cbnz count, loop + + keccak_f1600_round_core + keccak_f1600_round_post + store_input + restore_vregs + free_stack + ret +#endif diff --git a/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2pp7.s b/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2pp7.s new file mode 100644 index 0000000..ae72584 --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x2_v84a_asm_v2pp7.s @@ -0,0 +1,901 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + + +#if defined(__ARM_FEATURE_SVE2) +/********************** CONSTANTS *************************/ + .data + .align(8) +_round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008089 + .quad 
0x8000000000008003 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. */ + Aba .req v0 + Abe .req v1 + Abi .req v2 + Abo .req v3 + Abu .req v4 + Aga .req v5 + Age .req v6 + Agi .req v7 + Ago .req v8 + Agu .req v9 + Aka .req v10 + Ake .req v11 + Aki .req v12 + Ako .req v13 + Aku .req v14 + Ama .req v15 + Ame .req v16 + Ami .req v17 + Amo .req v18 + Amu .req v19 + Asa .req v20 + Ase .req v21 + Asi .req v22 + Aso .req v23 + Asu .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Abiq .req q2 + Aboq .req q3 + Abuq .req q4 + Agaq .req q5 + Ageq .req q6 + Agiq .req q7 + Agoq .req q8 + Aguq .req q9 + Akaq .req q10 + Akeq .req q11 + Akiq .req q12 + Akoq .req q13 + Akuq .req q14 + Amaq .req q15 + Ameq .req q16 + Amiq .req q17 + Amoq .req q18 + Amuq .req q19 + Asaq .req q20 + Aseq .req q21 + Asiq .req q22 + Asoq .req q23 + Asuq .req q24 + + /* z-form of the above mapping */ + Abaz .req z0 + Abez .req z1 + Abiz .req z2 + Aboz .req z3 + Abuz .req z4 + Agaz .req z5 + Agez .req z6 + Agiz .req z7 + Agoz .req z8 + Aguz .req z9 + Akaz .req z10 + Akez .req z11 + Akiz .req z12 + Akoz .req z13 + Akuz .req z14 + Amaz .req z15 + Amez .req z16 + Amiz .req z17 + Amoz .req z18 + Amuz .req z19 + Asaz .req z20 + Asez .req z21 + Asiz .req z22 + Asoz .req z23 + Asuz .req z24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor
A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v27 + C1 .req v28 + C2 .req v29 + C3 .req v30 + C4 .req v31 + + C0q .req q27 + C1q .req q28 + C2q .req q29 + C3q .req q30 + C4q .req q31 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vBba .req v25 // fresh + vBbe .req v26 // fresh + vBbi .req Abi + vBbo .req Abo + vBbu .req Abu + vBga .req Aka + vBge .req Ake + vBgi .req Agi + vBgo .req Ago + vBgu .req Agu + vBka .req Ama + vBke .req Ame + vBki .req Aki + vBko .req Ako + vBku .req Aku + vBma .req Asa + vBme .req Ase + vBmi .req Ami + vBmo .req Amo + vBmu .req Amu + vBsa .req Aba + vBse .req Abe + vBsi .req Asi + vBso .req Aso + vBsu .req Asu + + vBbaq .req q25 // fresh + vBbeq .req q26 // fresh + vBbiq .req Abiq + vBboq .req Aboq + vBbuq .req Abuq + vBgaq .req Akaq + vBgeq .req Akeq + vBgiq .req Agiq + vBgoq .req Agoq + vBguq .req Aguq + vBkaq .req Amaq + vBkeq .req Ameq + vBkiq .req Akiq + vBkoq .req Akoq + vBkuq .req Akuq + vBmaq .req Asaq + vBmeq .req Aseq + vBmiq .req Amiq + vBmoq .req Amoq + vBmuq .req Amuq + vBsaq .req Abaq + vBseq .req Abeq + vBsiq .req Asiq + vBsoq .req Asoq + vBsuq .req Asuq + + vBbaz .req z25 // fresh + vBbez .req z26 // fresh + vBbiz .req Abiz + vBboz .req Aboz + vBbuz .req Abuz + vBgaz .req Akaz + vBgez .req Akez + vBgiz .req Agiz + vBgoz .req Agoz + vBguz .req Aguz + vBkaz .req Amaz + vBkez .req Amez + vBkiz .req Akiz + vBkoz .req Akoz + vBkuz .req Akuz + vBmaz .req Asaz + vBmez .req Asez + vBmiz .req Amiz + vBmoz .req Amoz + vBmuz .req Amuz + vBsaz .req Abaz + vBsez .req Abez + vBsiz .req Asiz + vBsoz .req Asoz + vBsuz .req Asuz + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req C4 + E1 .req C0 + E2 .req vBbe // fresh + E3 .req C2 + E4 .req C3 + + E0q .req C4q + E1q .req C0q + E2q .req vBbeq // fresh + E3q .req C2q + E4q .req C3q + + +/************************ MACROS ****************************/ + +.macro load_input + ldp Abaq, Abeq, [input_addr, #(2*8*0)] + ldp Abiq, Aboq, [input_addr, #(2*8*2)] + ldp Abuq, Agaq, [input_addr, 
#(2*8*4)] + ldp Ageq, Agiq, [input_addr, #(2*8*6)] + ldp Agoq, Aguq, [input_addr, #(2*8*8)] + ldp Akaq, Akeq, [input_addr, #(2*8*10)] + ldp Akiq, Akoq, [input_addr, #(2*8*12)] + ldp Akuq, Amaq, [input_addr, #(2*8*14)] + ldp Ameq, Amiq, [input_addr, #(2*8*16)] + ldp Amoq, Amuq, [input_addr, #(2*8*18)] + ldp Asaq, Aseq, [input_addr, #(2*8*20)] + ldp Asiq, Asoq, [input_addr, #(2*8*22)] + ldr Asuq, [input_addr, #(2*8*24)] +.endm + +.macro store_input + str Abaq, [input_addr, #(2*8*0)] + str Abeq, [input_addr, #(2*8*1)] + str Abiq, [input_addr, #(2*8*2)] + str Aboq, [input_addr, #(2*8*3)] + str Abuq, [input_addr, #(2*8*4)] + str Agaq, [input_addr, #(2*8*5)] + str Ageq, [input_addr, #(2*8*6)] + str Agiq, [input_addr, #(2*8*7)] + str Agoq, [input_addr, #(2*8*8)] + str Aguq, [input_addr, #(2*8*9)] + str Akaq, [input_addr, #(2*8*10)] + str Akeq, [input_addr, #(2*8*11)] + str Akiq, [input_addr, #(2*8*12)] + str Akoq, [input_addr, #(2*8*13)] + str Akuq, [input_addr, #(2*8*14)] + str Amaq, [input_addr, #(2*8*15)] + str Ameq, [input_addr, #(2*8*16)] + str Amiq, [input_addr, #(2*8*17)] + str Amoq, [input_addr, #(2*8*18)] + str Amuq, [input_addr, #(2*8*19)] + str Asaq, [input_addr, #(2*8*20)] + str Aseq, [input_addr, #(2*8*21)] + str Asiq, [input_addr, #(2*8*22)] + str Asoq, [input_addr, #(2*8*23)] + str Asuq, [input_addr, #(2*8*24)] +.endm + +#define STACK_SIZE (16*4 + 16*34) +#define STACK_BASE_VREGS 0 +#define STACK_BASE_TMP 16*4 + +#define Aga_offset 0 +#define E0_offset 1 +#define E1_offset 2 +#define E2_offset 3 +#define E3_offset 4 +#define E4_offset 5 +#define Ame_offset 7 +#define Agi_offset 8 +#define Aka_offset 9 +#define Abo_offset 10 +#define Amo_offset 11 +#define Ami_offset 12 +#define Ake_offset 13 +#define Agu_offset 14 +#define Asi_offset 15 +#define Aku_offset 16 +#define Asa_offset 17 +#define Abu_offset 18 +#define Asu_offset 19 +#define Ase_offset 20 +//#define Aga_offset 21 +#define Age_offset 22 +#define vBgo_offset 23 +#define vBke_offset 24 +#define 
vBgi_offset 25 +#define vBga_offset 26 +#define vBbo_offset 27 +#define vBmo_offset 28 +#define vBmi_offset 29 +#define vBge_offset 30 + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] + +.macro save_vregs + stp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + stp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + stp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + stp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(STACK_BASE_VREGS + 16*0)] + ldp d10, d11, [sp, #(STACK_BASE_VREGS + 16*1)] + ldp d12, d13, [sp, #(STACK_BASE_VREGS + 16*2)] + ldp d14, d15, [sp, #(STACK_BASE_VREGS + 16*3)] +.endm + +/* Macros emulating v8.4-A SHA-3 instructions with plain Neon/SVE2 instructions */ + +.macro eor3_m1_0 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m1_1 d s0 s1 s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro eor3_m1 d s0 s1 s2 + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +.macro rax1_m1 d s0 s1 + // Use add instead of SHL #1 + add tmp.2d, \s1\().2d, \s1\().2d + sri tmp.2d, \s1\().2d, #63 + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +xar_m1_const: + .quad (1ULL<<(64-61)) + .quad (1ULL<<(64-56)) + .quad (1ULL<<(64-50)) + .quad (1ULL<<(64-46)) + .quad (1ULL<<(64-44)) + .quad (1ULL<<(64-43)) + .quad (1ULL<<(64-39)) + .quad (1ULL<<(64-36)) + .quad (1ULL<<(64-21)) + .quad (1ULL<<(64-19)) + .quad (1ULL<<(64-9)) + .quad (1ULL<<(64-3)) + + +xar_m1_const_addr: .quad xar_m1_const + + .macro xar_m1 d s0 s1 imm + // Special cases where SHL is replaced by ADDs or by an SVE2 MUL with a power of two from xar_m1_const (x17) + .if \imm == 21 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #64] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[0] + sri \d\().2d, \s0\().2d,
#(\imm) + .elseif \imm == 39 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #48] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[0] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 56 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[1] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + .elseif \imm == 9 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #80] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[0] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 19 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #64] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[1] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 61 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[0] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 36 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #48] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[1] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 43 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #32] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[1] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 44 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #32] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[0] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 3 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #80] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[1] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 46 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #16] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[1] + sri \d\().2d, \s0\().2d, #(\imm) + .elseif \imm == 50 + eor \s0\().16b, \s0\().16b, \s1\().16b + ldr \d\()q, [x17, #16] + mul \d\()z\().d, \s0\()z\().d, \d\()z\().d[0] + sri \d\().2d, \s0\().2d, #(\imm) + 
.elseif \imm == 62 + eor \s0\().16b, \s0\().16b, \s1\().16b + add \d\().2d, \s0\().2d, \s0\().2d + add \d\().2d, \d\().2d, \d\().2d + sri \d\().2d, \s0\().2d, #(62) + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + +.macro bcax_m1 d s0 s1 s2 + bic tmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, tmp.16b, \s0\().16b +.endm + +/* Keccak-f1600 round */ + +.macro keccak_f1600_round_pre + + /* 10 EOR3, so 20 individual EOR */ + + eor3_m1_0 C1, Abe, Age, Ake + eor3_m1_0 C3, Abo, Ago, Ako + eor3_m1_0 C0, Aba, Aga, Aka + eor3_m1_0 C2, Abi, Agi, Aki + eor3_m1_0 C4, Abu, Agu, Aku + eor3_m1_1 C1, Abe, Age, Ake + eor3_m1_1 C3, Abo, Ago, Ako + eor3_m1_1 C0, Aba, Aga, Aka + eor3_m1_1 C2, Abi, Agi, Aki + eor3_m1_1 C4, Abu, Agu, Aku + eor3_m1_0 C1, C1, Ame, Ase + eor3_m1_0 C3, C3, Amo, Aso + eor3_m1_0 C0, C0, Ama, Asa + eor3_m1_0 C2, C2, Ami, Asi + eor3_m1_0 C4, C4, Amu, Asu + eor3_m1_1 C1, C1, Ame, Ase + eor3_m1_1 C3, C3, Amo, Aso + eor3_m1_1 C0, C0, Ama, Asa + eor3_m1_1 C2, C2, Ami, Asi + eor3_m1_1 C4, C4, Amu, Asu + +.endm + +.macro keccak_f1600_round + + /* 10 EOR3, so 20 individual EOR */ + + eor3_m1_0 C0, Aba, Aga, Aka + eor3_m1_0 C1, Abe, Age, Ake + eor3_m1_0 C2, Abi, Agi, Aki + eor3_m1_0 C3, Abo, Ago, Ako + eor3_m1_0 C4, Abu, Agu, Aku + eor3_m1_1 C0, Aba, Aga, Aka + eor3_m1_1 C1, Abe, Age, Ake + eor3_m1_1 C2, Abi, Agi, Aki + eor3_m1_1 C3, Abo, Ago, Ako + eor3_m1_1 C4, Abu, Agu, Aku + eor3_m1_0 C0, C0, Ama, Asa + eor3_m1_0 C1, C1, Ame, Ase + eor3_m1_0 C2, C2, Ami, Asi + eor3_m1_0 C3, C3, Amo, Aso + eor3_m1_0 C4, C4, Amu, Asu + eor3_m1_1 C0, C0, Ama, Asa + eor3_m1_1 C1, C1, Ame, Ase + eor3_m1_1 C2, C2, Ami, Asi + eor3_m1_1 C3, C3, Amo, Aso + eor3_m1_1 C4, C4, Amu, Asu + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + .unreq tmp + + /* 25x XAR, 75 in total */ + + tmp 
.req C1 + tmpq .req C1q + + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m1 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + xar_m1 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m1 vBme, Aga, E0, 28 + xar_m1 vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Aga, vBga, vBgi, vBge + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + bcax_m1 Aku, vBku, vBke, vBka + bcax_m1 Ama, vBma, vBmi, vBme + bcax_m1 Ame, vBme, vBmo, vBmi + bcax_m1 Ami, vBmi, vBmu, vBmo + bcax_m1 Amo, vBmo, vBma, vBmu + bcax_m1 Amu, vBmu, vBme, vBma + bcax_m1 Asa, vBsa, vBsi, vBse + bcax_m1 Ase, vBse, vBso, vBsi + bcax_m1 Asi, vBsi, vBsu, vBso + bcax_m1 Aso, vBso, vBsa, vBsu + bcax_m1 Asu, vBsu, vBse, vBsa + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + bcax_m1 Abu, vBbu, vBbe, vBba + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + + .unreq tmp + .unreq tmpq + +.endm + +.macro keccak_f1600_round_core + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + + /* 25x XAR, 75 in total */ + + .unreq tmp + tmp .req C1 + tmpq .req C1q + + eor vBba.16b, Aba.16b, 
E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m1 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + mov E3.16b, Aga.16b + bcax_m1 Aga, vBga, vBgi, vBge + xar_m1 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m1 vBme, E3, E0, 28 + xar_m1 vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + + .unreq tmp + .unreq tmpq + + eor2 C0, Aka, Aga + save(Aga) + + tmp .req Aga + tmpq .req Agaq + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + eor2 C1, Ake, Age + bcax_m1 Aku, vBku, vBke, vBka + eor2 C2, Aki, Agi + bcax_m1 Ama, vBma, vBmi, vBme + eor2 C3, Ako, Ago + bcax_m1 Ame, vBme, vBmo, vBmi + eor2 C4, Aku, Agu + bcax_m1 Ami, vBmi, vBmu, vBmo + eor2 C0, C0, Ama + bcax_m1 Amo, vBmo, vBma, vBmu + eor2 C1, C1, Ame + bcax_m1 Amu, vBmu, vBme, vBma + eor2 C2, C2, Ami + bcax_m1 Asa, vBsa, vBsi, vBse + eor2 C3, C3, Amo + bcax_m1 Ase, vBse, vBso, vBsi + eor2 C4, C4, Amu + bcax_m1 Asi, vBsi, vBsu, vBso + eor2 C0, C0, Asa + bcax_m1 Aso, vBso, vBsa, vBsu + eor2 C1, C1, Ase + bcax_m1 Asu, vBsu, vBse, vBsa + eor2 C2, C2, Asi + eor2 C3, C3, Aso + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + eor2 C1, C1, Abe + + // iota step + //ld1r {tmp.2d}, [const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + eor2 C4, C4, Asu + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + eor2 C3, C3, Abo + 
eor2 C2, C2, Abi + eor2 C0, C0, Aba + bcax_m1 Abu, vBbu, vBbe, vBba + eor2 C4, C4, Abu + + restore(Aga) + .unreq tmp + .unreq tmpq + +.endm + +.macro keccak_f1600_round_post + + /* 5x RAX1, 15 Neon Instructions total */ + + tmp .req vBba + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + + /* 25x XAR, 75 in total */ + + .unreq tmp + tmp .req C1 + tmpq .req C1q + + eor vBba.16b, Aba.16b, E0.16b + xar_m1 vBsa, Abi, E2, 2 + xar_m1 vBbi, Aki, E2, 21 + xar_m1 vBki, Ako, E3, 39 + xar_m1 vBko, Amu, E4, 56 + xar_m1 vBmu, Aso, E3, 8 + xar_m1 vBso, Ama, E0, 23 + xar_m1 vBka, Abe, E1, 63 + xar_m1 vBse, Ago, E3, 9 + xar_m1 vBgo, Ame, E1, 19 + xar_m1 vBke, Agi, E2, 58 + xar_m1 vBgi, Aka, E0, 61 + xar_m1 vBga, Abo, E3, 36 + xar_m1 vBbo, Amo, E3, 43 + xar_m1 vBmo, Ami, E2, 49 + xar_m1 vBmi, Ake, E1, 54 + xar_m1 vBge, Agu, E4, 44 + mov E3.16b, Aga.16b + bcax_m1 Aga, vBga, vBgi, vBge + xar_m1 vBgu, Asi, E2, 3 + xar_m1 vBsi, Aku, E4, 25 + xar_m1 vBku, Asa, E0, 46 + xar_m1 vBma, Abu, E4, 37 + xar_m1 vBbu, Asu, E4, 50 + xar_m1 vBsu, Ase, E1, 62 + xar_m1 vBme, E3, E0, 28 + xar_m1 vBbe, Age, E1, 20 + + /* 25x BCAX, 50 in total */ + + bcax_m1 Age, vBge, vBgo, vBgi + bcax_m1 Agi, vBgi, vBgu, vBgo + bcax_m1 Ago, vBgo, vBga, vBgu + bcax_m1 Agu, vBgu, vBge, vBga + bcax_m1 Aka, vBka, vBki, vBke + bcax_m1 Ake, vBke, vBko, vBki + bcax_m1 Aki, vBki, vBku, vBko + bcax_m1 Ako, vBko, vBka, vBku + bcax_m1 Aku, vBku, vBke, vBka + bcax_m1 Ama, vBma, vBmi, vBme + bcax_m1 Ame, vBme, vBmo, vBmi + bcax_m1 Ami, vBmi, vBmu, vBmo + bcax_m1 Amo, vBmo, vBma, vBmu + bcax_m1 Amu, vBmu, vBme, vBma + bcax_m1 Asa, vBsa, vBsi, vBse + bcax_m1 Ase, vBse, vBso, vBsi + bcax_m1 Asi, vBsi, vBsu, vBso + bcax_m1 Aso, vBso, vBsa, vBsu + bcax_m1 Asu, vBsu, vBse, vBsa + bcax_m1 Aba, vBba, vBbi, vBbe + bcax_m1 Abe, vBbe, vBbo, vBbi + bcax_m1 Abi, vBbi, vBbu, vBbo + bcax_m1 Abo, vBbo, vBba, vBbu + bcax_m1 Abu, vBbu, vBbe, vBba + + // iota step + //ld1r {tmp.2d}, 
[const_addr], #8 + ldr tmpq, [const_addr], #16 + eor Aba.16b, Aba.16b, tmp.16b + + .unreq tmp + .unreq tmpq + +.endm + + +.text +.align 4 +.global keccak_f1600_x2_v84a_asm_v2pp7 +.global _keccak_f1600_x2_v84a_asm_v2pp7 + +#define KECCAK_F1600_ROUNDS 24 + +keccak_f1600_x2_v84a_asm_v2pp7: +_keccak_f1600_x2_v84a_asm_v2pp7: + alloc_stack + save_vregs + load_constant_ptr + load_input + + ldr x17, xar_m1_const_addr + + // count = (KECCAK_F1600_ROUNDS-2)/2 = 11: each loop iteration runs two rounds, the last two rounds follow the loop + mov count, #11 + keccak_f1600_round_pre +loop: + keccak_f1600_round_core + keccak_f1600_round_core + sub count, count, #1 + cbnz count, loop + + keccak_f1600_round_core + keccak_f1600_round_post + store_input + restore_vregs + free_stack + ret + +#endif \ No newline at end of file diff --git a/tests/keccak_neon/manual/keccak_f1600_x3_hybrid_asm_v3p.s b/tests/keccak_neon/manual/keccak_f1600_x3_hybrid_asm_v3p.s new file mode 100644 index 0000000..cbc282b --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x3_hybrid_asm_v3p.s @@ -0,0 +1,971 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x29 + count .req w27 + cur_const .req x26 + + /* Mapping of Keccak-f1600 SIMD state to vector registers + * at the beginning and end of each round.
*/ + + vAba .req v0 + vAbe .req v1 + vAbi .req v2 + vAbo .req v3 + vAbu .req v4 + vAga .req v5 + vAge .req v6 + vAgi .req v7 + vAgo .req v8 + vAgu .req v9 + vAka .req v10 + vAke .req v11 + vAki .req v12 + vAko .req v13 + vAku .req v14 + vAma .req v15 + vAme .req v16 + vAmi .req v17 + vAmo .req v18 + vAmu .req v19 + vAsa .req v20 + vAse .req v21 + vAsi .req v22 + vAso .req v23 + vAsu .req v24 + + /* q-form of the above mapping */ + vAbaq .req q0 + vAbeq .req q1 + vAbiq .req q2 + vAboq .req q3 + vAbuq .req q4 + vAgaq .req q5 + vAgeq .req q6 + vAgiq .req q7 + vAgoq .req q8 + vAguq .req q9 + vAkaq .req q10 + vAkeq .req q11 + vAkiq .req q12 + vAkoq .req q13 + vAkuq .req q14 + vAmaq .req q15 + vAmeq .req q16 + vAmiq .req q17 + vAmoq .req q18 + vAmuq .req q19 + vAsaq .req q20 + vAseq .req q21 + vAsiq .req q22 + vAsoq .req q23 + vAsuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v30 + C1 .req v29 + C2 .req v28 + C3 .req v27 + C4 .req v26 + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req v26 + E1 .req v25 + E2 .req v29 + E3 .req v28 + E4 .req v27 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vAbi_ .req v2 + vAbo_ .req v3 + vAbu_ .req v4 + vAga_ .req v10 + vAge_ .req v11 + vAgi_ .req v7 + vAgo_ .req v8 + vAgu_ .req v9 + vAka_ .req v15 + vAke_ .req v16 + vAki_ .req v12 + vAko_ .req v13 + vAku_ .req v14 + vAma_ .req v20 + vAme_ .req v21 + vAmi_ .req v17 + vAmo_ .req v18 + vAmu_ .req v19 + vAsa_ .req v0 + vAse_ .req v1 + vAsi_ .req v22 + vAso_ .req v23 + vAsu_ .req v24 + vAba_ .req v30 + vAbe_ .req v27 + + /* Scratch register clobbered by the rax1_m1, xar_m1 and bcax_m1 macros */ + vtmp .req v31 + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round.
*/ + s_Aba .req x1 + sAbe .req x6 + sAbi .req x11 + sAbo .req x16 + sAbu .req x21 + sAga .req x2 + sAge .req x7 + sAgi .req x12 + sAgo .req x17 + sAgu .req x22 + sAka .req x3 + sAke .req x8 + sAki .req x13 + sAko .req x18 + sAku .req x23 + sAma .req x4 + sAme .req x9 + sAmi .req x14 + sAmo .req x19 + sAmu .req x24 + sAsa .req x5 + sAse .req x10 + sAsi .req x15 + sAso .req x20 + sAsu .req x25 + + /* sA_[y,2*x+3*y] = rot(A[x,y]) */ + s_Aba_ .req x0 + sAbe_ .req x28 + sAbi_ .req x11 + sAbo_ .req x16 + sAbu_ .req x21 + sAga_ .req x3 + sAge_ .req x8 + sAgi_ .req x12 + sAgo_ .req x17 + sAgu_ .req x22 + sAka_ .req x4 + sAke_ .req x9 + sAki_ .req x13 + sAko_ .req x18 + sAku_ .req x23 + sAma_ .req x5 + sAme_ .req x10 + sAmi_ .req x14 + sAmo_ .req x19 + sAmu_ .req x24 + sAsa_ .req x1 + sAse_ .req x6 + sAsi_ .req x15 + sAso_ .req x20 + sAsu_ .req x25 + + /* sC[x] = sA[x,0] xor sA[x,1] xor sA[x,2] xor sA[x,3] xor sA[x,4], for x in 0..4 */ + /* sE[x] = sC[x-1] xor rot(sC[x+1],1), for x in 0..4 */ + sC0 .req x0 + sE0 .req x29 + sC1 .req x26 + sE1 .req x30 + sC2 .req x27 + sE2 .req x26 + sC3 .req x28 + sE3 .req x27 + sC4 .req x29 + sE4 .req x28 + + tmp .req x30 + +/************************ MACROS ****************************/ + +/* Vector macros: the _m1 variants emulate the v8.4-A SHA-3 instructions with plain Neon, the _m0 variants use them directly */ + + +.macro eor3_m1 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro rax1_m1 d s0 s1 + add vtmp.2d, \s1\().2d, \s1\().2d + sri vtmp.2d, \s1\().2d, #63 + eor \d\().16b, vtmp.16b, \s0\().16b +.endm + +.macro xar_m1 d s0 s1 imm + eor vtmp.16b, \s0\().16b, \s1\().16b + shl \d\().2d, vtmp.2d, #(64-\imm) + sri \d\().2d, vtmp.2d, #(\imm) +.endm + +.macro bcax_m1 d s0 s1 s2 + bic vtmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, vtmp.16b, \s0\().16b + .endm + + +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm + xar \d\().2d,
\s0\().2d, \s1\().2d, #\imm +.endm + +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + + +.macro load_input_vector num idx + ldr vAbaq, [input_addr, #(16*(\num*0+\idx))] + ldr vAbeq, [input_addr, #(16*(\num*1+\idx))] + ldr vAbiq, [input_addr, #(16*(\num*2+\idx))] + ldr vAboq, [input_addr, #(16*(\num*3+\idx))] + ldr vAbuq, [input_addr, #(16*(\num*4+\idx))] + ldr vAgaq, [input_addr, #(16*(\num*5+\idx))] + ldr vAgeq, [input_addr, #(16*(\num*6+\idx))] + ldr vAgiq, [input_addr, #(16*(\num*7+\idx))] + ldr vAgoq, [input_addr, #(16*(\num*8+\idx))] + ldr vAguq, [input_addr, #(16*(\num*9+\idx))] + ldr vAkaq, [input_addr, #(16*(\num*10+\idx))] + ldr vAkeq, [input_addr, #(16*(\num*11+\idx))] + ldr vAkiq, [input_addr, #(16*(\num*12+\idx))] + ldr vAkoq, [input_addr, #(16*(\num*13+\idx))] + ldr vAkuq, [input_addr, #(16*(\num*14+\idx))] + ldr vAmaq, [input_addr, #(16*(\num*15+\idx))] + ldr vAmeq, [input_addr, #(16*(\num*16+\idx))] + ldr vAmiq, [input_addr, #(16*(\num*17+\idx))] + ldr vAmoq, [input_addr, #(16*(\num*18+\idx))] + ldr vAmuq, [input_addr, #(16*(\num*19+\idx))] + ldr vAsaq, [input_addr, #(16*(\num*20+\idx))] + ldr vAseq, [input_addr, #(16*(\num*21+\idx))] + ldr vAsiq, [input_addr, #(16*(\num*22+\idx))] + ldr vAsoq, [input_addr, #(16*(\num*23+\idx))] + ldr vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_vector num idx + str vAbaq, [input_addr, #(16*(\num*0+\idx))] + str vAbeq, [input_addr, #(16*(\num*1+\idx))] + str vAbiq, [input_addr, #(16*(\num*2+\idx))] + str vAboq, [input_addr, #(16*(\num*3+\idx))] + str vAbuq, [input_addr, #(16*(\num*4+\idx))] + str vAgaq, [input_addr, #(16*(\num*5+\idx))] + str vAgeq, [input_addr, #(16*(\num*6+\idx))] + str vAgiq, [input_addr, #(16*(\num*7+\idx))] + str vAgoq, [input_addr, #(16*(\num*8+\idx))] + str vAguq, [input_addr, #(16*(\num*9+\idx))] + str vAkaq, [input_addr, #(16*(\num*10+\idx))] + str vAkeq, [input_addr, #(16*(\num*11+\idx))] + str vAkiq, [input_addr, 
#(16*(\num*12+\idx))] + str vAkoq, [input_addr, #(16*(\num*13+\idx))] + str vAkuq, [input_addr, #(16*(\num*14+\idx))] + str vAmaq, [input_addr, #(16*(\num*15+\idx))] + str vAmeq, [input_addr, #(16*(\num*16+\idx))] + str vAmiq, [input_addr, #(16*(\num*17+\idx))] + str vAmoq, [input_addr, #(16*(\num*18+\idx))] + str vAmuq, [input_addr, #(16*(\num*19+\idx))] + str vAsaq, [input_addr, #(16*(\num*20+\idx))] + str vAseq, [input_addr, #(16*(\num*21+\idx))] + str vAsiq, [input_addr, #(16*(\num*22+\idx))] + str vAsoq, [input_addr, #(16*(\num*23+\idx))] + str vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_scalar num idx + str s_Aba, [input_addr, 8*(\num*(0) +\idx)] + str sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + str sAbi, [input_addr, 8*(\num*(2)+ \idx)] + str sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + str sAbu, [input_addr, 8*(\num*(4)+ \idx)] + str sAga, [input_addr, 8*(\num*(4+1) +\idx)] + str sAge, [input_addr, 8*(\num*(6)+ \idx)] + str sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + str sAgo, [input_addr, 8*(\num*(8)+ \idx)] + str sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + str sAka, [input_addr, 8*(\num*(10) +\idx)] + str sAke, [input_addr, 8*(\num*(10+1)+\idx)] + str sAki, [input_addr, 8*(\num*(12) +\idx)] + str sAko, [input_addr, 8*(\num*(12+1)+\idx)] + str sAku, [input_addr, 8*(\num*(14) +\idx)] + str sAma, [input_addr, 8*(\num*(14+1)+\idx)] + str sAme, [input_addr, 8*(\num*(16) +\idx)] + str sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + str sAmo, [input_addr, 8*(\num*(18) +\idx)] + str sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + str sAsa, [input_addr, 8*(\num*(20) +\idx)] + str sAse, [input_addr, 8*(\num*(20+1)+\idx)] + str sAsi, [input_addr, 8*(\num*(22) +\idx)] + str sAso, [input_addr, 8*(\num*(22+1)+\idx)] + str sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +.macro load_input_scalar num idx + ldr s_Aba, [input_addr, 8*(\num*(0) +\idx)] + ldr sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + ldr sAbi, [input_addr, 8*(\num*(2)+ \idx)] + ldr sAbo, 
[input_addr, 8*(\num*(2+1) +\idx)] + ldr sAbu, [input_addr, 8*(\num*(4)+ \idx)] + ldr sAga, [input_addr, 8*(\num*(4+1) +\idx)] + ldr sAge, [input_addr, 8*(\num*(6)+ \idx)] + ldr sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + ldr sAgo, [input_addr, 8*(\num*(8)+ \idx)] + ldr sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + ldr sAka, [input_addr, 8*(\num*(10) +\idx)] + ldr sAke, [input_addr, 8*(\num*(10+1)+\idx)] + ldr sAki, [input_addr, 8*(\num*(12) +\idx)] + ldr sAko, [input_addr, 8*(\num*(12+1)+\idx)] + ldr sAku, [input_addr, 8*(\num*(14) +\idx)] + ldr sAma, [input_addr, 8*(\num*(14+1)+\idx)] + ldr sAme, [input_addr, 8*(\num*(16) +\idx)] + ldr sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + ldr sAmo, [input_addr, 8*(\num*(18) +\idx)] + ldr sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + ldr sAsa, [input_addr, 8*(\num*(20) +\idx)] + ldr sAse, [input_addr, 8*(\num*(20+1)+\idx)] + ldr sAsi, [input_addr, 8*(\num*(22) +\idx)] + ldr sAso, [input_addr, 8*(\num*(22+1)+\idx)] + ldr sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +#define STACK_SIZE (4*16 + 8*12 + 4*8) +#define STACK_BASE_GPRS (0) +#define STACK_BASE_VREGS (12*8) +#define STACK_BASE_TMP_GPRS (12*8 + 4*16) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) + + +.macro save reg, offset + str \reg, [sp, #(STACK_BASE_TMP_GPRS + \offset)] +.endm + +.macro restore reg, offset + ldr \reg, [sp, #(STACK_BASE_TMP_GPRS + \offset)] +.endm + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, 
#(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro save_vregs + stp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] + stp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + stp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + stp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] +.endm + +.macro restore_vregs + ldp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] + ldp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + ldp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + ldp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] +.endm + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro eor5 dst, src0, src1, src2, src3, src4 + eor \dst, \src0, \src1 + eor \dst, \dst, \src2 + eor \dst, \dst, \src3 + eor \dst, \dst, \src4 +.endm + +.macro xor_rol dst, src1, src0, imm + eor \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro bic_rol dst, src1, src0, imm + bic \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro rotate dst, src, imm + ror \dst, \src, #(64-\imm) +.endm + +.macro hybrid_round_initial +eor sC0, sAma, sAsa SEP +eor sC1, sAme, sAse SEP eor3_m1 C0, vAba, vAga, vAka +eor sC2, sAmi, sAsi SEP eor3_m1 C0, C0, vAma, vAsa +eor sC3, sAmo, sAso SEP +eor sC4, sAmu, sAsu SEP eor3_m1 C1, vAbe, vAge, vAke +eor sC0, sAka, sC0 SEP eor3_m1 C1, C1, vAme, vAse +eor sC1, sAke, sC1 SEP +eor sC2, sAki, sC2 SEP eor3_m1 C2, vAbi, vAgi, vAki +eor sC3, sAko, sC3 SEP eor3_m1 C2, C2, vAmi, vAsi +eor sC4, sAku, sC4 SEP +eor sC0, sAga, sC0 SEP eor3_m1 C3, vAbo, vAgo, vAko +eor sC1, sAge, sC1 SEP eor3_m1 C3, C3, vAmo, vAso +eor sC2, sAgi, sC2 SEP +eor sC3, sAgo, sC3 SEP eor3_m1 C4, vAbu, vAgu, vAku +eor sC4, sAgu, sC4 SEP eor3_m1 C4, C4, vAmu, vAsu +eor sC0, s_Aba, sC0 SEP +eor sC1, sAbe, sC1 SEP rax1_m1 E1, C0, C2 +eor sC2, sAbi, sC2 SEP rax1_m1 E3, C2, C4 +eor sC3, sAbo, sC3 SEP +eor sC4, sAbu, sC4 SEP rax1_m1 E0, C4, C1 +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP rax1_m1 E2, C1, C3 +eor sE0, sC4, sC1, ROR #63 SEP rax1_m1 E4, C3, C0 
+eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP eor vAba_.16b, vAba.16b, E0.16b +eor s_Aba_, s_Aba, sE0 SEP xar_m1 vAsa_, vAbi, E2, 2 +eor sAsa_, sAbi, sE2 SEP +eor sAbi_, sAki, sE2 SEP xar_m1 vAbi_, vAki, E2, 21 +eor sAki_, sAko, sE3 SEP xar_m1 vAki_, vAko, E3, 39 +eor sAko_, sAmu, sE4 SEP +eor sAmu_, sAso, sE3 SEP xar_m1 vAko_, vAmu, E4, 56 +eor sAso_, sAma, sE0 SEP xar_m1 vAmu_, vAso, E3, 8 +eor sAka_, sAbe, sE1 SEP +eor sAse_, sAgo, sE3 SEP xar_m1 vAso_, vAma, E0, 23 +eor sAgo_, sAme, sE1 SEP xar_m1 vAka_, vAbe, E1, 63 +eor sAke_, sAgi, sE2 SEP +eor sAgi_, sAka, sE0 SEP xar_m1 vAse_, vAgo, E3, 9 +eor sAga_, sAbo, sE3 SEP +eor sAbo_, sAmo, sE3 SEP xar_m1 vAgo_, vAme, E1, 19 +eor sAmo_, sAmi, sE2 SEP xar_m1 vAke_, vAgi, E2, 58 +eor sAmi_, sAke, sE1 SEP +eor sAge_, sAgu, sE4 SEP xar_m1 vAgi_, vAka, E0, 61 +eor sAgu_, sAsi, sE2 SEP xar_m1 vAga_, vAbo, E3, 36 +eor sAsi_, sAku, sE4 SEP +eor sAku_, sAsa, sE0 SEP xar_m1 vAbo_, vAmo, E3, 43 +eor sAma_, sAbu, sE4 SEP xar_m1 vAmo_, vAmi, E2, 49 +eor sAbu_, sAsu, sE4 SEP +eor sAsu_, sAse, sE1 SEP xar_m1 vAmi_, vAke, E1, 54 +eor sAme_, sAga, sE0 SEP xar_m1 vAge_, vAgu, E4, 44 +eor sAbe_, sAge, sE1 SEP +load_constant_ptr SEP xar_m1 vAgu_, vAsi, E2, 3 +bic tmp, sAgi_, sAge_, ROR #47 SEP xar_m1 vAsi_, vAku, E4, 25 +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP xar_m1 vAku_, vAsa, E0, 46 +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP xar_m1 vAma_, vAbu, E4, 37 +eor sAgi, tmp, sAgi_, ROR #58 SEP xar_m1 vAbu_, vAsu, E4, 50 +bic tmp, sAga_, sAgu_, ROR #31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP xar_m1 vAsu_, vAse, E1, 62 +bic tmp, sAge_, sAga_, ROR #56 SEP xar_m1 vAme_, vAga, E0, 28 +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP xar_m1 vAbe_, vAge, E1, 20 +eor sAka, tmp, sAka_, ROR #24 SEP bcax_m1 vAga, vAga_, vAgi_, vAge_ +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP bcax_m1 vAge, vAge_, vAgo_, vAgi_ +bic tmp, sAku_, 
sAko_, ROR #10 SEP bcax_m1 vAgi, vAgi_, vAgu_, vAgo_ +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP bcax_m1 vAgo, vAgo_, vAga_, vAgu_ +eor sAko, tmp, sAko_, ROR #57 SEP bcax_m1 vAgu, vAgu_, vAge_, vAga_ +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP bcax_m1 vAka, vAka_, vAki_, vAke_ +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP restore x26, STACK_OFFSET_CONST +bic tmp, sAmo_, sAmi_, ROR #5 SEP ld1r {v28.2d}, [x26], #8 +eor sAme, tmp, sAme_, ROR #43 SEP save x26, STACK_OFFSET_CONST +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP bcax_m1 vAke, vAke_, vAko_, vAki_ +ldr cur_const, [const_addr] SEP bcax_m1 vAki, vAki_, vAku_, vAko_ +mov count, #1 SEP +bic tmp, sAma_, sAmu_, ROR #35 SEP bcax_m1 vAko, vAko_, vAka_, vAku_ +eor sAmo, tmp, sAmo_, ROR #12 SEP bcax_m1 vAku, vAku_, vAke_, vAka_ +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP bcax_m1 vAma, vAma_, vAmi_, vAme_ +eor sAsa, tmp, sAsa_, ROR #41 SEP bcax_m1 vAme, vAme_, vAmo_, vAmi_ +bic tmp, sAso_, sAsi_, ROR #2 SEP bcax_m1 vAmi, vAmi_, vAmu_, vAmo_ +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP bcax_m1 vAmo, vAmo_, vAma_, vAmu_ +eor sAsi, tmp, sAsi_, ROR #27 SEP bcax_m1 vAmu, vAmu_, vAme_, vAma_ +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP bcax_m1 vAsa, vAsa_, vAsi_, vAse_ +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP bcax_m1 vAse, vAse_, vAso_, vAsi_ +bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m1 vAsi, vAsi_, vAsu_, vAso_ +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP bcax_m1 vAso, vAso_, vAsa_, vAsu_ +eor sAbe, tmp, sAbe_, ROR #41 SEP bcax_m1 vAsu, vAsu_, vAse_, vAsa_ +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP bcax_m1 vAba, vAba_, vAbi_, vAbe_ +bic tmp, s_Aba_, sAbu_, ROR #50 SEP bcax_m1 vAbe, vAbe_, vAbo_, vAbi_ +eor sAbo, tmp, sAbo_, ROR 
#43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP bcax_m1 vAbi, vAbi_, vAbu_, vAbo_ +eor sAbu, tmp, sAbu_, ROR #30 SEP bcax_m1 vAbo, vAbo_, vAba_, vAbu_ +eor s_Aba, s_Aba, cur_const SEP +save count, STACK_OFFSET_COUNT SEP bcax_m1 vAbu, vAbu_, vAbe_, vAba_ +eor sC0, sAka, sAsa, ROR #50 SEP eor vAba.16b, vAba.16b, v28.16b +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP eor3_m1 C0, vAba, vAga, vAka +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP eor3_m1 C0, C0, vAma, vAsa +eor sC0, sAma, sC0, ROR #49 SEP eor3_m1 C1, vAbe, vAge, vAke +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP eor3_m1 C1, C1, vAme, vAse +eor sC3, sAmo, sC3, ROR #63 SEP eor3_m1 C2, vAbi, vAgi, vAki +eor sC4, sAmu, sC4, ROR #56 SEP +eor sC0, sAga, sC0, ROR #57 SEP eor3_m1 C2, C2, vAmi, vAsi +eor sC1, sAme, sC1, ROR #58 SEP eor3_m1 C3, vAbo, vAgo, vAko +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP eor3_m1 C3, C3, vAmo, vAso +eor sC4, sAgu, sC4, ROR #48 SEP eor3_m1 C4, vAbu, vAgu, vAku +eor sC0, s_Aba, sC0, ROR #61 SEP +eor sC1, sAke, sC1, ROR #57 SEP eor3_m1 C4, C4, vAmu, vAsu +eor sC2, sAsi, sC2, ROR #52 SEP rax1_m1 E1, C0, C2 +eor sC3, sAbo, sC3, ROR #63 SEP +eor sC4, sAku, sC4, ROR #50 SEP rax1_m1 E3, C2, C4 +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP rax1_m1 E0, C4, C1 +ror sC2, sC2, 62 SEP rax1_m1 E2, C1, C3 +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP rax1_m1 E4, C3, C0 +eor sE0, sC4, sC1, ROR #63 SEP eor vAba_.16b, vAba.16b, E0.16b +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP xar_m1 vAsa_, vAbi, E2, 2 +eor s_Aba_, sE0, s_Aba SEP xar_m1 vAbi_, vAki, E2, 21 +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP xar_m1 vAki_, vAko, E3, 39 +eor sAki_, sE3, sAko, ROR #63 SEP xar_m1 vAko_, vAmu, E4, 56 +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP xar_m1 vAmu_, vAso, E3, 8 +eor sAso_, sE0, sAma, ROR #54 SEP xar_m1 vAso_, vAma, E0, 23 +eor sAka_, sE1, 
sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP xar_m1 vAka_, vAbe, E1, 63 +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP xar_m1 vAse_, vAgo, E3, 9 +eor sAgi_, sE0, sAka, ROR #39 SEP xar_m1 vAgo_, vAme, E1, 19 +eor sAga_, sE3, sAbo SEP +eor sAbo_, sE3, sAmo, ROR #37 SEP xar_m1 vAke_, vAgi, E2, 58 +eor sAmo_, sE2, sAmi, ROR #8 SEP xar_m1 vAgi_, vAka, E0, 61 +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP xar_m1 vAga_, vAbo, E3, 36 +eor sAgu_, sE2, sAsi, ROR #62 SEP xar_m1 vAbo_, vAmo, E3, 43 +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP xar_m1 vAmo_, vAmi, E2, 49 +eor sAma_, sE4, sAbu, ROR #20 SEP xar_m1 vAmi_, vAke, E1, 54 +eor sAbu_, sE4, sAsu, ROR #9 SEP +eor sAsu_, sE1, sAse, ROR #23 SEP xar_m1 vAge_, vAgu, E4, 44 +eor sAme_, sE0, sAga, ROR #61 SEP xar_m1 vAgu_, vAsi, E2, 3 +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP xar_m1 vAsi_, vAku, E4, 25 +restore count, STACK_OFFSET_COUNT SEP xar_m1 vAku_, vAsa, E0, 46 +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP xar_m1 vAma_, vAbu, E4, 37 +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP xar_m1 vAbu_, vAsu, E4, 50 +bic tmp, sAgu_, sAgo_, ROR #16 SEP xar_m1 vAsu_, vAse, E1, 62 +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP xar_m1 vAme_, vAga, E0, 28 +eor sAgo, tmp, sAgo_, ROR #47 SEP xar_m1 vAbe_, vAge, E1, 20 +bic tmp, sAge_, sAga_, ROR #56 SEP +eor sAgu, tmp, sAgu_, ROR #23 SEP bcax_m1 vAga, vAga_, vAgi_, vAge_ +bic tmp, sAki_, sAke_, ROR #19 SEP bcax_m1 vAge, vAge_, vAgo_, vAgi_ +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP bcax_m1 vAgi, vAgi_, vAgu_, vAgo_ +eor sAke, tmp, sAke_, ROR #2 SEP bcax_m1 vAgo, vAgo_, vAga_, vAgu_ +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP bcax_m1 vAgu, vAgu_, vAge_, vAga_ +bic tmp, sAka_, sAku_, ROR #47 SEP bcax_m1 vAka, vAka_, vAki_, vAke_ +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, 
sAke_, sAka_, ROR #5 SEP bcax_m1 vAke, vAke_, vAko_, vAki_ +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP bcax_m1 vAki, vAki_, vAku_, vAko_ +eor sAma, tmp, sAma_, ROR #47 SEP bcax_m1 vAko, vAko_, vAka_, vAku_ +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP bcax_m1 vAku, vAku_, vAke_, vAka_ +bic tmp, sAmu_, sAmo_, ROR #41 SEP restore x26, STACK_OFFSET_CONST +eor sAmi, tmp, sAmi_, ROR #46 SEP ld1r {v28.2d}, [x26], #8 +bic tmp, sAma_, sAmu_, ROR #35 SEP save x26, STACK_OFFSET_CONST +ldr cur_const, [const_addr, count, UXTW #3] SEP bcax_m1 vAme, vAme_, vAmo_, vAmi_ +eor sAmo, tmp, sAmo_, ROR #12 SEP bcax_m1 vAma, vAma_, vAmi_, vAme_ +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP bcax_m1 vAmi, vAmi_, vAmu_, vAmo_ +bic tmp, sAsi_, sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP bcax_m1 vAmo, vAmo_, vAma_, vAmu_ +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP bcax_m1 vAmu, vAmu_, vAme_, vAma_ +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP bcax_m1 vAsa, vAsa_, vAsi_, vAse_ +eor sAso, tmp, sAso_, ROR #21 SEP bcax_m1 vAse, vAse_, vAso_, vAsi_ +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP bcax_m1 vAsi, vAsi_, vAsu_, vAso_ +bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m1 vAso, vAso_, vAsa_, vAsu_ +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP bcax_m1 vAsu, vAsu_, vAse_, vAsa_ +eor sAbe, tmp, sAbe_, ROR #41 SEP bcax_m1 vAba, vAba_, vAbi_, vAbe_ +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP bcax_m1 vAbe, vAbe_, vAbo_, vAbi_ +bic tmp, s_Aba_, sAbu_, ROR #50 SEP bcax_m1 vAbi, vAbi_, vAbu_, vAbo_ +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP bcax_m1 vAbo, vAbo_, vAba_, vAbu_ +eor sAbu, tmp, sAbu_, ROR #30 SEP bcax_m1 vAbu, vAbu_, vAbe_, vAba_ +add count, count, #1 SEP +eor s_Aba, s_Aba, cur_const SEP eor vAba.16b, vAba.16b, v28.16b 
+.endm + +.macro hybrid_round_noninitial +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP eor3_m1 C0, vAba, vAga, vAka +eor sC1, sAse, sAge, ROR #60 SEP eor3_m1 C0, C0, vAma, vAsa +eor sC2, sAmi, sAgi, ROR #59 SEP +eor sC3, sAgo, sAso, ROR #30 SEP eor3_m1 C1, vAbe, vAge, vAke +eor sC4, sAbu, sAsu, ROR #53 SEP eor3_m1 C1, C1, vAme, vAse +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP eor3_m1 C2, vAbi, vAgi, vAki +eor sC2, sAki, sC2, ROR #26 SEP eor3_m1 C2, C2, vAmi, vAsi +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP eor3_m1 C3, vAbo, vAgo, vAko +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP eor3_m1 C3, C3, vAmo, vAso +eor sC2, sAbi, sC2, ROR #60 SEP eor3_m1 C4, vAbu, vAgu, vAku +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP eor3_m1 C4, C4, vAmu, vAsu +eor sC0, s_Aba, sC0, ROR #61 SEP rax1_m1 E1, C0, C2 +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP rax1_m1 E3, C2, C4 +eor sC3, sAbo, sC3, ROR #63 SEP rax1_m1 E0, C4, C1 +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP rax1_m1 E2, C1, C3 +ror sC4, sC4, 58 SEP +ror sC2, sC2, 62 SEP rax1_m1 E4, C3, C0 +eor sE1, sC0, sC2, ROR #63 SEP eor vAba_.16b, vAba.16b, E0.16b +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP xar_m1 vAsa_, vAbi, E2, 2 +eor sE2, sC1, sC3, ROR #63 SEP xar_m1 vAbi_, vAki, E2, 21 +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP xar_m1 vAki_, vAko, E3, 39 +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP xar_m1 vAko_, vAmu, E4, 56 +eor sAki_, sE3, sAko, ROR #63 SEP xar_m1 vAmu_, vAso, E3, 8 +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP xar_m1 vAso_, vAma, E0, 23 +eor sAso_, sE0, sAma, ROR #54 SEP xar_m1 vAka_, vAbe, E1, 63 +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP xar_m1 vAse_, vAgo, E3, 9 +eor sAgo_, sE1, sAme, ROR #49 SEP xar_m1 vAgo_, vAme, E1, 19 +eor sAke_, sE2, sAgi, ROR #3 SEP +eor 
sAgi_, sE0, sAka, ROR #39 SEP xar_m1 vAke_, vAgi, E2, 58 +eor sAga_, sE3, sAbo SEP +eor sAbo_, sE3, sAmo, ROR #37 SEP xar_m1 vAgi_, vAka, E0, 61 +eor sAmo_, sE2, sAmi, ROR #8 SEP xar_m1 vAga_, vAbo, E3, 36 +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP xar_m1 vAbo_, vAmo, E3, 43 +eor sAgu_, sE2, sAsi, ROR #62 SEP xar_m1 vAmo_, vAmi, E2, 49 +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP xar_m1 vAmi_, vAke, E1, 54 +eor sAma_, sE4, sAbu, ROR #20 SEP xar_m1 vAge_, vAgu, E4, 44 +eor sAbu_, sE4, sAsu, ROR #9 SEP +eor sAsu_, sE1, sAse, ROR #23 SEP xar_m1 vAgu_, vAsi, E2, 3 +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP xar_m1 vAsi_, vAku, E4, 25 +load_constant_ptr SEP xar_m1 vAku_, vAsa, E0, 46 +restore count, STACK_OFFSET_COUNT SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP xar_m1 vAma_, vAbu, E4, 37 +eor sAga, tmp, sAga_, ROR #39 SEP xar_m1 vAbu_, vAsu, E4, 50 +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP xar_m1 vAsu_, vAse, E1, 62 +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP xar_m1 vAme_, vAga, E0, 28 +bic tmp, sAga_, sAgu_, ROR #31 SEP xar_m1 vAbe_, vAge, E1, 20 +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP bcax_m1 vAga, vAga_, vAgi_, vAge_ +eor sAgu, tmp, sAgu_, ROR #23 SEP bcax_m1 vAge, vAge_, vAgo_, vAgi_ +bic tmp, sAki_, sAke_, ROR #19 SEP +eor sAka, tmp, sAka_, ROR #24 SEP bcax_m1 vAgi, vAgi_, vAgu_, vAgo_ +bic tmp, sAko_, sAki_, ROR #47 SEP bcax_m1 vAgo, vAgo_, vAga_, vAgu_ +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP bcax_m1 vAgu, vAgu_, vAge_, vAga_ +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP bcax_m1 vAka, vAka_, vAki_, vAke_ +eor sAko, tmp, sAko_, ROR #57 SEP bcax_m1 vAke, vAke_, vAko_, vAki_ +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP bcax_m1 vAki, vAki_, vAku_, vAko_ +bic tmp, sAmi_, sAme_, ROR #38 SEP bcax_m1 vAko, vAko_, vAka_, vAku_ +eor 
sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP bcax_m1 vAku, vAku_, vAke_, vAka_ +eor sAme, tmp, sAme_, ROR #43 SEP bcax_m1 vAma, vAma_, vAmi_, vAme_ +bic tmp, sAmu_, sAmo_, ROR #41 SEP restore x26, STACK_OFFSET_CONST +eor sAmi, tmp, sAmi_, ROR #46 SEP ld1r {v28.2d}, [x26], #8 +bic tmp, sAma_, sAmu_, ROR #35 SEP save x26, STACK_OFFSET_CONST +ldr cur_const, [const_addr, count, UXTW #3] SEP +add count, count, #1 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP bcax_m1 vAme, vAme_, vAmo_, vAmi_ +bic tmp, sAme_, sAma_, ROR #9 SEP bcax_m1 vAmi, vAmi_, vAmu_, vAmo_ +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP bcax_m1 vAmo, vAmo_, vAma_, vAmu_ +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP bcax_m1 vAmu, vAmu_, vAme_, vAma_ +bic tmp, sAsu_, sAso_, ROR #25 SEP bcax_m1 vAsa, vAsa_, vAsi_, vAse_ +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP bcax_m1 vAse, vAse_, vAso_, vAsi_ +eor sAso, tmp, sAso_, ROR #21 SEP bcax_m1 vAsi, vAsi_, vAsu_, vAso_ +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP bcax_m1 vAso, vAso_, vAsa_, vAsu_ +bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m1 vAsu, vAsu_, vAse_, vAsa_ +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP bcax_m1 vAba, vAba_, vAbi_, vAbe_ +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP bcax_m1 vAbe, vAbe_, vAbo_, vAbi_ +eor sAbi, tmp, sAbi_, ROR #35 SEP bcax_m1 vAbi, vAbi_, vAbu_, vAbo_ +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP bcax_m1 vAbo, vAbo_, vAba_, vAbu_ +bic tmp, sAbe_, s_Aba_, ROR #44 SEP bcax_m1 vAbu, vAbu_, vAbe_, vAba_ +eor sAbu, tmp, sAbu_, ROR #30 SEP +eor s_Aba, s_Aba, cur_const SEP eor vAba.16b, vAba.16b, v28.16b +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP eor3_m1 C0, vAba, vAga, vAka +eor sC1, sAse, sAge, ROR #60 SEP eor3_m1 C0, C0, vAma, vAsa +eor sC2, sAmi, sAgi, ROR #59 SEP +eor sC3, 
sAgo, sAso, ROR #30 SEP eor3_m1 C1, vAbe, vAge, vAke +eor sC4, sAbu, sAsu, ROR #53 SEP eor3_m1 C1, C1, vAme, vAse +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP eor3_m1 C2, vAbi, vAgi, vAki +eor sC2, sAki, sC2, ROR #26 SEP eor3_m1 C2, C2, vAmi, vAsi +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP eor3_m1 C3, vAbo, vAgo, vAko +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP eor3_m1 C3, C3, vAmo, vAso +eor sC2, sAbi, sC2, ROR #60 SEP eor3_m1 C4, vAbu, vAgu, vAku +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP eor3_m1 C4, C4, vAmu, vAsu +eor sC0, s_Aba, sC0, ROR #61 SEP rax1_m1 E1, C0, C2 +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP rax1_m1 E3, C2, C4 +eor sC3, sAbo, sC3, ROR #63 SEP rax1_m1 E0, C4, C1 +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP rax1_m1 E2, C1, C3 +ror sC4, sC4, 58 SEP +ror sC2, sC2, 62 SEP rax1_m1 E4, C3, C0 +eor sE1, sC0, sC2, ROR #63 SEP eor vAba_.16b, vAba.16b, E0.16b +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP xar_m1 vAsa_, vAbi, E2, 2 +eor sE2, sC1, sC3, ROR #63 SEP xar_m1 vAbi_, vAki, E2, 21 +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP xar_m1 vAki_, vAko, E3, 39 +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP xar_m1 vAko_, vAmu, E4, 56 +eor sAki_, sE3, sAko, ROR #63 SEP xar_m1 vAmu_, vAso, E3, 8 +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP xar_m1 vAso_, vAma, E0, 23 +eor sAso_, sE0, sAma, ROR #54 SEP xar_m1 vAka_, vAbe, E1, 63 +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP xar_m1 vAse_, vAgo, E3, 9 +eor sAgo_, sE1, sAme, ROR #49 SEP xar_m1 vAgo_, vAme, E1, 19 +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP xar_m1 vAke_, vAgi, E2, 58 +eor sAga_, sE3, sAbo SEP +eor sAbo_, sE3, sAmo, ROR #37 SEP xar_m1 vAgi_, vAka, E0, 61 +eor sAmo_, sE2, sAmi, ROR #8 SEP xar_m1 vAga_, vAbo, E3, 36 +eor sAmi_, sE1, sAke, ROR #56 SEP +eor 
sAge_, sE4, sAgu, ROR #44 SEP xar_m1 vAbo_, vAmo, E3, 43 +eor sAgu_, sE2, sAsi, ROR #62 SEP xar_m1 vAmo_, vAmi, E2, 49 +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP xar_m1 vAmi_, vAke, E1, 54 +eor sAma_, sE4, sAbu, ROR #20 SEP xar_m1 vAge_, vAgu, E4, 44 +eor sAbu_, sE4, sAsu, ROR #9 SEP +eor sAsu_, sE1, sAse, ROR #23 SEP xar_m1 vAgu_, vAsi, E2, 3 +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP xar_m1 vAsi_, vAku, E4, 25 +load_constant_ptr SEP xar_m1 vAku_, vAsa, E0, 46 +restore count, STACK_OFFSET_COUNT SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP xar_m1 vAma_, vAbu, E4, 37 +eor sAga, tmp, sAga_, ROR #39 SEP xar_m1 vAbu_, vAsu, E4, 50 +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP xar_m1 vAsu_, vAse, E1, 62 +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP xar_m1 vAme_, vAga, E0, 28 +bic tmp, sAga_, sAgu_, ROR #31 SEP xar_m1 vAbe_, vAge, E1, 20 +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP bcax_m1 vAga, vAga_, vAgi_, vAge_ +eor sAgu, tmp, sAgu_, ROR #23 SEP bcax_m1 vAge, vAge_, vAgo_, vAgi_ +bic tmp, sAki_, sAke_, ROR #19 SEP +eor sAka, tmp, sAka_, ROR #24 SEP bcax_m1 vAgi, vAgi_, vAgu_, vAgo_ +bic tmp, sAko_, sAki_, ROR #47 SEP bcax_m1 vAgo, vAgo_, vAga_, vAgu_ +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP bcax_m1 vAgu, vAgu_, vAge_, vAga_ +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP bcax_m1 vAka, vAka_, vAki_, vAke_ +eor sAko, tmp, sAko_, ROR #57 SEP bcax_m1 vAke, vAke_, vAko_, vAki_ +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP bcax_m1 vAki, vAki_, vAku_, vAko_ +bic tmp, sAmi_, sAme_, ROR #38 SEP bcax_m1 vAko, vAko_, vAka_, vAku_ +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP bcax_m1 vAku, vAku_, vAke_, vAka_ +eor sAme, tmp, sAme_, ROR #43 SEP bcax_m1 vAma, vAma_, vAmi_, vAme_ +bic tmp, sAmu_, sAmo_, ROR #41 SEP restore x26, STACK_OFFSET_CONST +eor sAmi, 
tmp, sAmi_, ROR #46 SEP ld1r {v28.2d}, [x26], #8 +bic tmp, sAma_, sAmu_, ROR #35 SEP save x26, STACK_OFFSET_CONST +ldr cur_const, [const_addr, count, UXTW #3] SEP +add count, count, #1 SEP bcax_m1 vAme, vAme_, vAmo_, vAmi_ +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP bcax_m1 vAmi, vAmi_, vAmu_, vAmo_ +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP bcax_m1 vAmo, vAmo_, vAma_, vAmu_ +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP bcax_m1 vAmu, vAmu_, vAme_, vAma_ +bic tmp, sAsu_, sAso_, ROR #25 SEP bcax_m1 vAsa, vAsa_, vAsi_, vAse_ +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP bcax_m1 vAse, vAse_, vAso_, vAsi_ +eor sAso, tmp, sAso_, ROR #21 SEP bcax_m1 vAsi, vAsi_, vAsu_, vAso_ +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP bcax_m1 vAso, vAso_, vAsa_, vAsu_ +bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m1 vAsu, vAsu_, vAse_, vAsa_ +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP bcax_m1 vAba, vAba_, vAbi_, vAbe_ +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP bcax_m1 vAbe, vAbe_, vAbo_, vAbi_ +eor sAbi, tmp, sAbi_, ROR #35 SEP bcax_m1 vAbi, vAbi_, vAbu_, vAbo_ +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP bcax_m1 vAbo, vAbo_, vAba_, vAbu_ +bic tmp, sAbe_, s_Aba_, ROR #44 SEP bcax_m1 vAbu, vAbu_, vAbe_, vAba_ +eor sAbu, tmp, sAbu_, ROR #30 SEP +eor s_Aba, s_Aba, cur_const SEP eor vAba.16b, vAba.16b, v28.16b +.endm + +.macro final_rotate + ror sAga, sAga,#(64-3) + ror sAka, sAka,#(64-25) + ror sAma, sAma,#(64-10) + ror sAsa, sAsa,#(64-39) + ror sAbe, sAbe,#(64-21) + ror sAge, sAge,#(64-45) + ror sAke, sAke,#(64-8) + ror sAme, sAme,#(64-15) + ror sAse, sAse,#(64-41) + ror sAbi, sAbi,#(64-14) + ror sAgi, sAgi,#(64-61) + ror sAki, sAki,#(64-18) + ror sAmi, sAmi,#(64-56) + ror sAsi, sAsi,#(64-2) + ror sAgo, sAgo,#(64-28) + ror sAko, sAko,#(64-1) + ror 
sAmo, sAmo,#(64-27) + ror sAso, sAso,#(64-62) + ror sAbu, sAbu,#(64-44) + ror sAgu, sAgu,#(64-20) + ror sAku, sAku,#(64-6) + ror sAmu, sAmu,#(64-36) + ror sAsu, sAsu,#(64-55) +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.global keccak_f1600_x3_hybrid_asm_v3p +.global _keccak_f1600_x3_hybrid_asm_v3p +.text +.align 4 + +keccak_f1600_x3_hybrid_asm_v3p: +_keccak_f1600_x3_hybrid_asm_v3p: + alloc_stack + save_gprs + save_vregs + save input_addr, STACK_OFFSET_INPUT + + load_input_vector 1,0 + + load_constant_ptr + + save const_addr, STACK_OFFSET_CONST + + add input_addr, input_addr, #400 + load_input_scalar 1,0 + hybrid_round_initial + loop_0: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS) + blt loop_0 + final_rotate + restore input_addr, STACK_OFFSET_INPUT + store_input_vector 1,0 + add input_addr, input_addr, #400 + store_input_scalar 1,0 + + restore_vregs + restore_gprs + free_stack + ret diff --git a/tests/keccak_neon/manual/keccak_f1600_x3_hybrid_asm_v6.s b/tests/keccak_neon/manual/keccak_f1600_x3_hybrid_asm_v6.s new file mode 100644 index 0000000..5352d31 --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x3_hybrid_asm_v6.s @@ -0,0 +1,1377 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" +#if defined(__ARM_FEATURE_SHA3) + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +round_constants_vec: + .quad 0x0000000000000001 + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x0000000080008009 + .quad 
0x000000008000000a + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + .quad 0x8000000080008008 +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x29 + count .req w27 + cur_const .req x26 + + /* Mapping of Keccak-f1600 SIMD state to vector registers + * at the beginning and end of each round. */ + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round.
*/ + vAba .req v0 + vAbe .req v1 + vAbi .req v2 + vAbo .req v3 + vAbu .req v4 + vAga .req v5 + vAge .req v6 + vAgi .req v7 + vAgo .req v8 + vAgu .req v9 + vAka .req v10 + vAke .req v11 + vAki .req v12 + vAko .req v13 + vAku .req v14 + vAma .req v15 + vAme .req v16 + vAmi .req v17 + vAmo .req v18 + vAmu .req v19 + vAsa .req v20 + vAse .req v21 + vAsi .req v22 + vAso .req v23 + vAsu .req v24 + + /* q-form of the above mapping */ + vAbaq .req q0 + vAbeq .req q1 + vAbiq .req q2 + vAboq .req q3 + vAbuq .req q4 + vAgaq .req q5 + vAgeq .req q6 + vAgiq .req q7 + vAgoq .req q8 + vAguq .req q9 + vAkaq .req q10 + vAkeq .req q11 + vAkiq .req q12 + vAkoq .req q13 + vAkuq .req q14 + vAmaq .req q15 + vAmeq .req q16 + vAmiq .req q17 + vAmoq .req q18 + vAmuq .req q19 + vAsaq .req q20 + vAseq .req q21 + vAsiq .req q22 + vAsoq .req q23 + vAsuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v27 + C1 .req v28 + C2 .req v29 + C3 .req v30 + C4 .req v31 + + C0q .req q27 + C1q .req q28 + C2q .req q29 + C3q .req q30 + C4q .req q31 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vBba .req v25 // fresh + vBbe .req v26 // fresh + vBbi .req vAbi + vBbo .req vAbo + vBbu .req vAbu + vBga .req vAka + vBge .req vAke + vBgi .req vAgi + vBgo .req vAgo + vBgu .req vAgu + vBka .req vAma + vBke .req vAme + vBki .req vAki + vBko .req vAko + vBku .req vAku + vBma .req vAsa + vBme .req vAse + vBmi .req vAmi + vBmo .req vAmo + vBmu .req vAmu + vBsa .req vAba + vBse .req vAbe + vBsi .req vAsi + vBso .req vAso + vBsu .req vAsu + + vBbaq .req q25 // fresh + vBbeq .req q26 // fresh + vBbiq .req vAbiq + vBboq .req vAboq + vBbuq .req vAbuq + vBgaq .req vAkaq + vBgeq .req vAkeq + vBgiq .req vAgiq + vBgoq .req vAgoq + vBguq .req vAguq + vBkaq .req vAmaq + vBkeq .req vAmeq + vBkiq .req vAkiq + vBkoq .req vAkoq + vBkuq .req vAkuq + vBmaq .req vAsaq + vBmeq .req vAseq + vBmiq .req vAmiq + vBmoq .req vAmoq + vBmuq .req vAmuq + vBsaq .req vAbaq + vBseq .req vAbeq + vBsiq 
.req vAsiq + vBsoq .req vAsoq + vBsuq .req vAsuq + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req C4 + E1 .req C0 + E2 .req vBbe // fresh + E3 .req C2 + E4 .req C3 + + E0q .req C4q + E1q .req C0q + E2q .req vBbeq // fresh + E3q .req C2q + E4q .req C3q + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round. */ + s_Aba .req x1 + sAbe .req x6 + sAbi .req x11 + sAbo .req x16 + sAbu .req x21 + sAga .req x2 + sAge .req x7 + sAgi .req x12 + sAgo .req x17 + sAgu .req x22 + sAka .req x3 + sAke .req x8 + sAki .req x13 + sAko .req x18 + sAku .req x23 + sAma .req x4 + sAme .req x9 + sAmi .req x14 + sAmo .req x19 + sAmu .req x24 + sAsa .req x5 + sAse .req x10 + sAsi .req x15 + sAso .req x20 + sAsu .req x25 + + /* sA_[y,2*x+3*y] = rot(A[x,y]) */ + s_Aba_ .req x0 + sAbe_ .req x28 + sAbi_ .req x11 + sAbo_ .req x16 + sAbu_ .req x21 + sAga_ .req x3 + sAge_ .req x8 + sAgi_ .req x12 + sAgo_ .req x17 + sAgu_ .req x22 + sAka_ .req x4 + sAke_ .req x9 + sAki_ .req x13 + sAko_ .req x18 + sAku_ .req x23 + sAma_ .req x5 + sAme_ .req x10 + sAmi_ .req x14 + sAmo_ .req x19 + sAmu_ .req x24 + sAsa_ .req x1 + sAse_ .req x6 + sAsi_ .req x15 + sAso_ .req x20 + sAsu_ .req x25 + + /* sC[x] = sA[x,0] xor sA[x,1] xor sA[x,2] xor sA[x,3] xor sA[x,4], for x in 0..4 */ + /* sE[x] = sC[x-1] xor rot(sC[x+1],1), for x in 0..4 */ + /* NOTE: the sC/sE temporaries below share registers with other aliases: + * x0 (input_addr, s_Aba_), x26 (cur_const), x27 (count is w27), + * x28 (sAbe_), x29 (const_addr) and x30 (tmp). Writing one of them + * destroys the value held under the other name. */ + sC0 .req x0 + sE0 .req x29 + sC1 .req x26 + sE1 .req x30 + sC2 .req x27 + sE2 .req x26 + sC3 .req x28 + sE3 .req x27 + sC4 .req x29 + sE4 .req x28 + + tmp .req x30 + +/************************ MACROS ****************************/ + +/* Macros using v8.4-A SHA-3 instructions */ + +/* eor3_m0: d = s0 ^ s1 ^ s2 (single EOR3) */ +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +/* rax1_m0: per 64-bit lane, d = s0 ^ rol(s1, 1) (single RAX1) */ +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +/* xar_m0: per 64-bit lane, d = ror(s0 ^ s1, imm) (single XAR) */ +.macro xar_m0 d s0 s1 imm + xar \d\().2d, \s0\().2d, \s1\().2d, #\imm +.endm + +/* bcax_m0: d = s0 ^ (s1 & ~s2) (single BCAX) */ +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +/* The _m1 macros that follow compute the same results as the _m0 macros + * above, but only use EOR/ADD/SHL/SRI/BIC instead of the SHA-3 instructions. */ +.macro eor3_m1_0 d
s0 s1 s2 + // First half of eor3_m1: d = s0 ^ s1 (s2 is accepted but not used here) + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +/* eor2: d = s0 ^ s1 */ +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +/* Second half of eor3_m1: d = d ^ s2 (s0 and s1 are accepted but not used here) */ +.macro eor3_m1_1 d s0 s1 s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +/* eor3_m1: d = s0 ^ s1 ^ s2 using two EOR instructions. + * d is written by the first step before s2 is read by the second one, + * so d must not be the same register as s2. */ +.macro eor3_m1 d s0 s1 s2 + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +/* rax1_m1: per 64-bit lane, d = s0 ^ rol(s1, 1). + * Uses vvtmp as scratch; vvtmp is not a fixed alias and has to be bound + * with .req by the code that expands this macro. */ +.macro rax1_m1 d s0 s1 + // Use add instead of SHL #1 + add vvtmp.2d, \s1\().2d, \s1\().2d + sri vvtmp.2d, \s1\().2d, #63 + eor \d\().16b, vvtmp.16b, \s0\().16b +.endm + +/* xar_m1: per 64-bit lane, d = ror(s0 ^ s1, imm). + * Side effect: s0 is overwritten with s0 ^ s1. + * d is written by SHL/ADD and s0 is read again by SRI, so d must not be + * the same register as s0. */ + .macro xar_m1 d s0 s1 imm + // Special cases where we can replace SHLs by ADDs + .if \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + // .elseif \imm == 62 + // eor \s0\().16b, \s0\().16b, \s1\().16b + // add \d\().2d, \s0\().2d, \s0\().2d + // add \d\().2d, \d\().2d, \d\().2d + // sri \d\().2d, \s0\().2d, #(62) + // .elseif \imm == 61 + // eor \s0\().16b, \s0\().16b, \s1\().16b + // add \d\().2d, \s0\().2d, \s0\().2d + // add \d\().2d, \d\().2d, \d\().2d + // add \d\().2d, \d\().2d, \d\().2d + // sri \d\().2d, \s0\().2d, #(61) + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + +/* xar_m1_0: first step of a split xar_m1, s0 = s0 ^ s1 (d and imm do not + * influence the emitted instruction). */ + .macro xar_m1_0 d s0 s1 imm + // All three branches emit the same EOR; the .if ladder has the same shape as in xar_m1_1 + .if \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + .elseif \imm == 62 + eor \s0\().16b, \s0\().16b, \s1\().16b + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + .endif +.endm + +/* xar_m1_1: second step of a split xar_m1, d = ror(s0, imm) per 64-bit lane, + * where s0 is expected to already hold the XOR produced by xar_m1_0 + * (s1 is not used). Unlike xar_m1, the imm == 62 special case is enabled here. */ + .macro xar_m1_1 d s0 s1 imm + // Special cases where we can replace SHLs by ADDs + .if \imm == 63 + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + .elseif \imm == 62 + add \d\().2d, \s0\().2d, \s0\().2d + add \d\().2d, \d\().2d, \d\().2d + sri \d\().2d, \s0\().2d, #(62) + .else + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + +/* bcax_m1: d = s0 ^ (s1 & ~s2), using vvtmp as scratch (vvtmp has to be + * bound with .req by the code that expands this macro). */ +.macro bcax_m1 d s0 s1 s2 + bic vvtmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, vvtmp.16b, \s0\().16b
+.endm + +/* load_input_vector: load the 25 vector state registers vAba..vAsu as + * 16-byte q registers. Lane i (0..24, in the order ba, be, bi, bo, bu, ga, ..., su) + * is read from input_addr + 16*(num*i + idx). */ +.macro load_input_vector num idx + ldr vAbaq, [input_addr, #(16*(\num*0+\idx))] + ldr vAbeq, [input_addr, #(16*(\num*1+\idx))] + ldr vAbiq, [input_addr, #(16*(\num*2+\idx))] + ldr vAboq, [input_addr, #(16*(\num*3+\idx))] + ldr vAbuq, [input_addr, #(16*(\num*4+\idx))] + ldr vAgaq, [input_addr, #(16*(\num*5+\idx))] + ldr vAgeq, [input_addr, #(16*(\num*6+\idx))] + ldr vAgiq, [input_addr, #(16*(\num*7+\idx))] + ldr vAgoq, [input_addr, #(16*(\num*8+\idx))] + ldr vAguq, [input_addr, #(16*(\num*9+\idx))] + ldr vAkaq, [input_addr, #(16*(\num*10+\idx))] + ldr vAkeq, [input_addr, #(16*(\num*11+\idx))] + ldr vAkiq, [input_addr, #(16*(\num*12+\idx))] + ldr vAkoq, [input_addr, #(16*(\num*13+\idx))] + ldr vAkuq, [input_addr, #(16*(\num*14+\idx))] + ldr vAmaq, [input_addr, #(16*(\num*15+\idx))] + ldr vAmeq, [input_addr, #(16*(\num*16+\idx))] + ldr vAmiq, [input_addr, #(16*(\num*17+\idx))] + ldr vAmoq, [input_addr, #(16*(\num*18+\idx))] + ldr vAmuq, [input_addr, #(16*(\num*19+\idx))] + ldr vAsaq, [input_addr, #(16*(\num*20+\idx))] + ldr vAseq, [input_addr, #(16*(\num*21+\idx))] + ldr vAsiq, [input_addr, #(16*(\num*22+\idx))] + ldr vAsoq, [input_addr, #(16*(\num*23+\idx))] + ldr vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +/* store_input_vector: counterpart of load_input_vector; lane i (0..24) of + * vAba..vAsu is written as a 16-byte q register to input_addr + 16*(num*i + idx). */ +.macro store_input_vector num idx + str vAbaq, [input_addr, #(16*(\num*0+\idx))] + str vAbeq, [input_addr, #(16*(\num*1+\idx))] + str vAbiq, [input_addr, #(16*(\num*2+\idx))] + str vAboq, [input_addr, #(16*(\num*3+\idx))] + str vAbuq, [input_addr, #(16*(\num*4+\idx))] + str vAgaq, [input_addr, #(16*(\num*5+\idx))] + str vAgeq, [input_addr, #(16*(\num*6+\idx))] + str vAgiq, [input_addr, #(16*(\num*7+\idx))] + str vAgoq, [input_addr, #(16*(\num*8+\idx))] + str vAguq, [input_addr, #(16*(\num*9+\idx))] + str vAkaq, [input_addr, #(16*(\num*10+\idx))] + str vAkeq, [input_addr, #(16*(\num*11+\idx))] + str vAkiq, [input_addr, #(16*(\num*12+\idx))] + str vAkoq, [input_addr, #(16*(\num*13+\idx))] + str vAkuq, [input_addr, #(16*(\num*14+\idx))] + str vAmaq,
[input_addr, #(16*(\num*15+\idx))] + str vAmeq, [input_addr, #(16*(\num*16+\idx))] + str vAmiq, [input_addr, #(16*(\num*17+\idx))] + str vAmoq, [input_addr, #(16*(\num*18+\idx))] + str vAmuq, [input_addr, #(16*(\num*19+\idx))] + str vAsaq, [input_addr, #(16*(\num*20+\idx))] + str vAseq, [input_addr, #(16*(\num*21+\idx))] + str vAsiq, [input_addr, #(16*(\num*22+\idx))] + str vAsoq, [input_addr, #(16*(\num*23+\idx))] + str vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +/* store_input_scalar: write the 25 scalar state registers s_Aba..sAsu as + * 8-byte words. Lane i (0..24) goes to input_addr + 8*(num*i + idx); odd lane + * indices are spelled as (even+1) in the offsets below. */ +.macro store_input_scalar num idx + str s_Aba, [input_addr, 8*(\num*(0) +\idx)] + str sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + str sAbi, [input_addr, 8*(\num*(2)+ \idx)] + str sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + str sAbu, [input_addr, 8*(\num*(4)+ \idx)] + str sAga, [input_addr, 8*(\num*(4+1) +\idx)] + str sAge, [input_addr, 8*(\num*(6)+ \idx)] + str sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + str sAgo, [input_addr, 8*(\num*(8)+ \idx)] + str sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + str sAka, [input_addr, 8*(\num*(10) +\idx)] + str sAke, [input_addr, 8*(\num*(10+1)+\idx)] + str sAki, [input_addr, 8*(\num*(12) +\idx)] + str sAko, [input_addr, 8*(\num*(12+1)+\idx)] + str sAku, [input_addr, 8*(\num*(14) +\idx)] + str sAma, [input_addr, 8*(\num*(14+1)+\idx)] + str sAme, [input_addr, 8*(\num*(16) +\idx)] + str sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + str sAmo, [input_addr, 8*(\num*(18) +\idx)] + str sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + str sAsa, [input_addr, 8*(\num*(20) +\idx)] + str sAse, [input_addr, 8*(\num*(20+1)+\idx)] + str sAsi, [input_addr, 8*(\num*(22) +\idx)] + str sAso, [input_addr, 8*(\num*(22+1)+\idx)] + str sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +/* load_input_scalar: counterpart of store_input_scalar; lane i (0..24) of + * s_Aba..sAsu is read as an 8-byte word from input_addr + 8*(num*i + idx). */ +.macro load_input_scalar num idx + ldr s_Aba, [input_addr, 8*(\num*(0) +\idx)] + ldr sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + ldr sAbi, [input_addr, 8*(\num*(2)+ \idx)] + ldr sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + ldr sAbu, [input_addr, 8*(\num*(4)+ \idx)] + ldr sAga, [input_addr, 8*(\num*(4+1) +\idx)] + ldr
sAge, [input_addr, 8*(\num*(6)+ \idx)] + ldr sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + ldr sAgo, [input_addr, 8*(\num*(8)+ \idx)] + ldr sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + ldr sAka, [input_addr, 8*(\num*(10) +\idx)] + ldr sAke, [input_addr, 8*(\num*(10+1)+\idx)] + ldr sAki, [input_addr, 8*(\num*(12) +\idx)] + ldr sAko, [input_addr, 8*(\num*(12+1)+\idx)] + ldr sAku, [input_addr, 8*(\num*(14) +\idx)] + ldr sAma, [input_addr, 8*(\num*(14+1)+\idx)] + ldr sAme, [input_addr, 8*(\num*(16) +\idx)] + ldr sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + ldr sAmo, [input_addr, 8*(\num*(18) +\idx)] + ldr sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + ldr sAsa, [input_addr, 8*(\num*(20) +\idx)] + ldr sAse, [input_addr, 8*(\num*(20+1)+\idx)] + ldr sAsi, [input_addr, 8*(\num*(22) +\idx)] + ldr sAso, [input_addr, 8*(\num*(22+1)+\idx)] + ldr sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +/* Stack frame layout, as byte offsets from sp (736 bytes in total): + * 0 input slot, 8 const slot, 16 count slot, 24 padding, + * 32 GPR save area (16*6 bytes), 128 vector save area (8*8 bytes), + * 192 temporary area of 34 slots of 16 bytes each. */ +#define STACK_SIZE (8*8 + 16*6 + 3*8 + 8 + 16*34) // VREGS (8*8), GPRs (16*6), count (8), const (8), input (8), padding (8), temporaries (16*34) +#define STACK_BASE_GPRS (3*8+8) +#define STACK_BASE_VREGS (3*8+8+16*6) +#define STACK_BASE_TMP (8*8 + 16*6 + 3*8 + 8) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) + +/* Slot indices within the temporary area, in units of 16 bytes from + * STACK_BASE_TMP. The function-like save(name)/restore(name) macros below + * look up the slot as name ## _offset. */ +#define vAga_offset 0 +#define E0_offset 1 +#define E1_offset 2 +#define E2_offset 3 +#define E3_offset 4 +#define E4_offset 5 +#define Ame_offset 7 +#define Agi_offset 8 +#define Aka_offset 9 +#define Abo_offset 10 +#define Amo_offset 11 +#define Ami_offset 12 +#define Ake_offset 13 +#define Agu_offset 14 +#define Asi_offset 15 +#define Aku_offset 16 +#define Asa_offset 17 +#define Abu_offset 18 +#define Asu_offset 19 +#define Ase_offset 20 +//#define Aga_offset 21 +#define Age_offset 22 +#define vBgo_offset 23 +#define vBke_offset 24 +#define vBgi_offset 25 +#define vBga_offset 26 +#define vBbo_offset 27 +#define vBmo_offset 28 +#define vBmi_offset 29 +#define vBge_offset 30 + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP + 16 * name ##
_offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] + + +/* save_gprs / restore_gprs: spill and reload x19-x30 in the GPR save area + * at sp + STACK_BASE_GPRS. */ +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +/* save_vregs / restore_vregs: spill and reload d8-d15 (the low 64 bits of + * v8-v15) in the vector save area at sp + STACK_BASE_VREGS. */ +.macro save_vregs + stp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] + stp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + stp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + stp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] +.endm + +.macro restore_vregs + ldp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] + ldp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + ldp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + ldp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] +.endm + +/* alloc_stack / free_stack: reserve and release the STACK_SIZE byte frame. */ +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +/* eor5: dst = src0 ^ src1 ^ src2 ^ src3 ^ src4. dst is written by the first + * EOR, so it must not be the same register as src2, src3 or src4. */ +.macro eor5 dst, src0, src1, src2, src3, src4 + eor \dst, \src0, \src1 + eor \dst, \dst, \src2 + eor \dst, \dst, \src3 + eor \dst, \dst, \src4 +.endm + +/* xor_rol: dst = src0 ^ rol(src1, imm), expressed as ROR by 64-imm. + * imm = 0 would produce a shift amount of 64, outside the 0..63 range + * accepted for a shifted register operand. */ +.macro xor_rol dst, src1, src0, imm + eor \dst, \src0, \src1, ROR #(64-\imm) +.endm + +/* bic_rol: dst = src0 & ~rol(src1, imm), expressed as ROR by 64-imm. */ +.macro bic_rol dst, src1, src0, imm + bic \dst, \src0, \src1, ROR #(64-\imm) +.endm + +/* rotate: dst = rol(src, imm), expressed as ROR by 64-imm. */ +.macro rotate dst, src, imm + ror \dst, \src, #(64-\imm) +.endm + +/* save / restore (assembler macros): store or load a register at the byte + * offset given relative to sp. + * NOTE: the same names are also defined above as function-like preprocessor + * macros taking one argument. The preprocessor only expands a function-like + * macro when its name is followed by an opening parenthesis, so the + * two-operand form written with a space reaches these assembler macros. */ +.macro save reg, offset + str \reg, [sp, #\offset] +.endm + +.macro restore reg, offset + ldr \reg, [sp, #\offset] +.endm + +/* hybrid_round_initial: each line pairs one instruction on the scalar state + * (left of SEP) with one vector-side instruction, macro or directive (right of SEP). + * SEP and load_constant_ptr are not defined in this part of the file + * (presumably SEP expands to a statement separator - TODO confirm). + * The body loads cur_const twice, sets count to 1 and later increments it, + * and loads a 16-byte vector constant twice through the pointer kept at + * STACK_OFFSET_CONST, so it appears to advance both states by two rounds + * (NOTE(review): confirm against the caller). The instruction order is + * schedule- and alias-sensitive; do not reorder lines. */ +.macro hybrid_round_initial +eor sC0, sAma, sAsa SEP +eor sC1, sAme, sAse SEP eor3_m0 C1,vAbe,vAge,vAke +eor sC2, sAmi, sAsi SEP eor3_m1 C3,vAbo,vAgo,vAko +eor sC3, sAmo, sAso SEP eor3_m0 C0,vAba,vAga,vAka
+eor sC4, sAmu, sAsu SEP eor3_m1 C2,vAbi,vAgi,vAki +eor sC0, sAka, sC0 SEP eor3_m0 C4,vAbu,vAgu,vAku +eor sC1, sAke, sC1 SEP eor3_m1 C1, C1,vAme, vAse +eor sC2, sAki, sC2 SEP eor3_m0 C3, C3,vAmo, vAso +eor sC3, sAko, sC3 SEP eor3_m1 C0, C0,vAma, vAsa +eor sC4, sAku, sC4 SEP eor3_m0 C2, C2,vAmi, vAsi +eor sC0, sAga, sC0 SEP eor3_m1 C4, C4,vAmu, vAsu +eor sC1, sAge, sC1 SEP vvtmp .req vBba +eor sC2, sAgi, sC2 SEP +eor sC3, sAgo, sC3 SEP rax1_m0 E2, C1, C3 +eor sC4, sAgu, sC4 SEP rax1_m1 E4, C3, C0 +eor sC0, s_Aba, sC0 SEP rax1_m0 E1, C0, C2 +eor sC1, sAbe, sC1 SEP rax1_m1 E3, C2, C4 +eor sC2, sAbi, sC2 SEP rax1_m0 E0, C4, C1 +eor sC3, sAbo, sC3 SEP .unreq vvtmp +eor sC4, sAbu, sC4 SEP vvtmp .req C1 +eor sE1, sC0, sC2, ROR #63 SEP vvtmpq .req C1q +eor sE3, sC2, sC4, ROR #63 SEP eor vBba.16b, vAba.16b, E0.16b +eor sE0, sC4, sC1, ROR #63 SEP xar_m1 vBsa, vAbi, E2, 2 +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP xar_m0 vBbi, vAki, E2, 21 +eor s_Aba_, s_Aba, sE0 SEP xar_m1 vBki, vAko, E3, 39 +eor sAsa_, sAbi, sE2 SEP xar_m0 vBko, vAmu, E4, 56 +eor sAbi_, sAki, sE2 SEP xar_m1 vBmu, vAso, E3, 8 +eor sAki_, sAko, sE3 SEP xar_m0 vBso, vAma, E0, 23 +eor sAko_, sAmu, sE4 SEP xar_m1 vBka, vAbe, E1, 63 +eor sAmu_, sAso, sE3 SEP xar_m0 vBse, vAgo, E3, 9 +eor sAso_, sAma, sE0 SEP xar_m1 vBgo, vAme, E1, 19 +eor sAka_, sAbe, sE1 SEP xar_m0 vBke, vAgi, E2, 58 +eor sAse_, sAgo, sE3 SEP xar_m1 vBgi, vAka, E0, 61 +eor sAgo_, sAme, sE1 SEP +eor sAke_, sAgi, sE2 SEP xar_m0 vBga, vAbo, E3, 36 +eor sAgi_, sAka, sE0 SEP xar_m1 vBbo, vAmo, E3, 43 +eor sAga_, sAbo, sE3 SEP xar_m0 vBmo, vAmi, E2, 49 +eor sAbo_, sAmo, sE3 SEP xar_m1 vBmi, vAke, E1, 54 +eor sAmo_, sAmi, sE2 SEP xar_m0 vBge, vAgu, E4, 44 +eor sAmi_, sAke, sE1 SEP mov E3.16b, vAga.16b +eor sAge_, sAgu, sE4 SEP bcax_m1 vAga, vBga, vBgi, vBge +eor sAgu_, sAsi, sE2 SEP xar_m0 vBgu, vAsi, E2, 3 +eor sAsi_, sAku, sE4 SEP xar_m1 vBsi, vAku, E4, 25 +eor sAku_, sAsa, sE0 SEP xar_m0 vBku, vAsa, E0, 46 +eor sAma_, sAbu, sE4 
SEP +eor sAbu_, sAsu, sE4 SEP xar_m1 vBma, vAbu, E4, 37 +eor sAsu_, sAse, sE1 SEP xar_m0 vBbu, vAsu, E4, 50 +eor sAme_, sAga, sE0 SEP xar_m1 vBsu, vAse, E1, 62 +eor sAbe_, sAge, sE1 SEP xar_m0 vBme, E3, E0, 28 +load_constant_ptr SEP xar_m1 vBbe, vAge, E1, 20 +bic tmp, sAgi_, sAge_, ROR #47 SEP bcax_m1 vAge, vBge, vBgo, vBgi +eor sAga, tmp, sAga_, ROR #39 SEP bcax_m0 vAgi, vBgi, vBgu, vBgo +bic tmp, sAgo_, sAgi_, ROR #42 SEP bcax_m1 vAgo, vBgo, vBga, vBgu +eor sAge, tmp, sAge_, ROR #25 SEP bcax_m0 vAgu, vBgu, vBge, vBga +bic tmp, sAgu_, sAgo_, ROR #16 SEP bcax_m1 vAka, vBka, vBki, vBke +eor sAgi, tmp, sAgi_, ROR #58 SEP bcax_m0 vAke, vBke, vBko, vBki +bic tmp, sAga_, sAgu_, ROR #31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP .unreq vvtmp +bic tmp, sAge_, sAga_, ROR #56 SEP .unreq vvtmpq +eor sAgu, tmp, sAgu_, ROR #23 SEP eor2 C0, vAka, vAga +bic tmp, sAki_, sAke_, ROR #19 SEP save(vAga) +eor sAka, tmp, sAka_, ROR #24 SEP vvtmp .req vAga +bic tmp, sAko_, sAki_, ROR #47 SEP vvtmpq .req vAgaq +eor sAke, tmp, sAke_, ROR #2 SEP bcax_m0 vAki, vBki, vBku, vBko +bic tmp, sAku_, sAko_, ROR #10 SEP bcax_m1 vAko, vBko, vBka, vBku +eor sAki, tmp, sAki_, ROR #57 SEP eor2 C1, vAke, vAge +bic tmp, sAka_, sAku_, ROR #47 SEP bcax_m0 vAku, vBku, vBke, vBka +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP eor2 C2, vAki, vAgi +eor sAku, tmp, sAku_, ROR #52 SEP bcax_m1 vAma, vBma, vBmi, vBme +bic tmp, sAmi_, sAme_, ROR #38 SEP eor2 C3, vAko, vAgo +eor sAma, tmp, sAma_, ROR #47 SEP bcax_m0 vAme, vBme, vBmo, vBmi +bic tmp, sAmo_, sAmi_, ROR #5 SEP eor2 C4, vAku, vAgu +eor sAme, tmp, sAme_, ROR #43 SEP bcax_m1 vAmi, vBmi, vBmu, vBmo +bic tmp, sAmu_, sAmo_, ROR #41 SEP eor2 C0, C0, vAma +eor sAmi, tmp, sAmi_, ROR #46 SEP bcax_m0 vAmo, vBmo, vBma, vBmu +ldr cur_const, [const_addr] SEP eor2 C1, C1, vAme +mov count, #1 SEP bcax_m1 vAmu, vBmu, vBme, vBma +bic tmp, sAma_, sAmu_, ROR #35 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP eor2 C2, C2, vAmi +bic tmp, sAme_, sAma_, ROR #9 SEP 
bcax_m0 vAsa, vBsa, vBsi, vBse +eor sAmu, tmp, sAmu_, ROR #44 SEP eor2 C3, C3, vAmo +bic tmp, sAsi_, sAse_, ROR #48 SEP bcax_m1 vAse, vBse, vBso, vBsi +eor sAsa, tmp, sAsa_, ROR #41 SEP eor2 C4, C4, vAmu +bic tmp, sAso_, sAsi_, ROR #2 SEP bcax_m0 vAsi, vBsi, vBsu, vBso +eor sAse, tmp, sAse_, ROR #50 SEP eor2 C0, C0, vAsa +bic tmp, sAsu_, sAso_, ROR #25 SEP bcax_m1 vAso, vBso, vBsa, vBsu +eor sAsi, tmp, sAsi_, ROR #27 SEP eor2 C1, C1, vAse +bic tmp, sAsa_, sAsu_, ROR #60 SEP bcax_m0 vAsu, vBsu, vBse, vBsa +eor sAso, tmp, sAso_, ROR #21 SEP +save count, STACK_OFFSET_COUNT SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP eor2 C2, C2, vAsi +eor sAsu, tmp, sAsu_, ROR #53 SEP eor2 C3, C3, vAso +bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m1 vAba, vBba, vBbi, vBbe +eor s_Aba, s_Aba_, tmp, ROR #21 SEP bcax_m0 vAbe, vBbe, vBbo, vBbi +bic tmp, sAbo_, sAbi_, ROR #42 SEP eor2 C1, C1, vAbe +eor sAbe, tmp, sAbe_, ROR #41 SEP restore x27, STACK_OFFSET_CONST +bic tmp, sAbu_, sAbo_, ROR #57 SEP ldr vvtmpq, [x27], #16 +eor sAbi, tmp, sAbi_, ROR #35 SEP save x27, STACK_OFFSET_CONST +bic tmp, s_Aba_, sAbu_, ROR #50 SEP eor vAba.16b, vAba.16b, vvtmp.16b +eor sAbo, tmp, sAbo_, ROR #43 SEP eor2 C4, C4, vAsu +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP bcax_m0 vAbi, vBbi, vBbu, vBbo +eor s_Aba, s_Aba, cur_const SEP bcax_m1 vAbo, vBbo, vBba, vBbu + SEP eor2 C3, C3, vAbo +eor sC0, sAka, sAsa, ROR #50 SEP eor2 C2, C2, vAbi +eor sC1, sAse, sAge, ROR #60 SEP eor2 C0, C0, vAba +eor sC2, sAmi, sAgi, ROR #59 SEP bcax_m0 vAbu, vBbu, vBbe, vBba +eor sC3, sAgo, sAso, ROR #30 SEP eor2 C4, C4, vAbu +eor sC4, sAbu, sAsu, ROR #53 SEP restore(vAga) +eor sC0, sAma, sC0, ROR #49 SEP .unreq vvtmp +eor sC1, sAbe, sC1, ROR #44 SEP .unreq vvtmpq +eor sC2, sAki, sC2, ROR #26 SEP vvtmp .req vBba +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP rax1_m0 E2, C1, C3 +eor sC0, sAga, sC0, ROR #57 SEP rax1_m1 E4, C3, C0 +eor sC1, sAme, sC1, ROR #58 SEP rax1_m0 E1, C0, C2 +eor sC2, sAbi, 
sC2, ROR #60 SEP rax1_m1 E3, C2, C4 +eor sC3, sAko, sC3, ROR #38 SEP rax1_m0 E0, C4, C1 +eor sC4, sAgu, sC4, ROR #48 SEP .unreq vvtmp +eor sC0, s_Aba, sC0, ROR #61 SEP vvtmp .req C1 +eor sC1, sAke, sC1, ROR #57 SEP vvtmpq .req C1q +eor sC2, sAsi, sC2, ROR #52 SEP eor vBba.16b, vAba.16b, E0.16b +eor sC3, sAbo, sC3, ROR #63 SEP xar_m1 vBsa, vAbi, E2, 2 +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP xar_m0 vBbi, vAki, E2, 21 +ror sC4, sC4, 58 SEP xar_m1 vBki, vAko, E3, 39 +ror sC2, sC2, 62 SEP xar_m0 vBko, vAmu, E4, 56 +eor sE1, sC0, sC2, ROR #63 SEP xar_m1 vBmu, vAso, E3, 8 +eor sE3, sC2, sC4, ROR #63 SEP xar_m0 vBso, vAma, E0, 23 +eor sE0, sC4, sC1, ROR #63 SEP xar_m1 vBka, vAbe, E1, 63 +eor sE2, sC1, sC3, ROR #63 SEP xar_m0 vBse, vAgo, E3, 9 +eor sE4, sC3, sC0, ROR #63 SEP xar_m1 vBgo, vAme, E1, 19 +eor s_Aba_, sE0, s_Aba SEP xar_m0 vBke, vAgi, E2, 58 +eor sAsa_, sE2, sAbi, ROR #50 SEP xar_m1 vBgi, vAka, E0, 61 +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP xar_m0 vBga, vAbo, E3, 36 +eor sAko_, sE4, sAmu, ROR #28 SEP xar_m1 vBbo, vAmo, E3, 43 +eor sAmu_, sE3, sAso, ROR #2 SEP xar_m0 vBmo, vAmi, E2, 49 +eor sAso_, sE0, sAma, ROR #54 SEP xar_m1 vBmi, vAke, E1, 54 +eor sAka_, sE1, sAbe, ROR #43 SEP xar_m0 vBge, vAgu, E4, 44 +eor sAse_, sE3, sAgo, ROR #36 SEP mov E3.16b, vAga.16b +eor sAgo_, sE1, sAme, ROR #49 SEP bcax_m1 vAga, vBga, vBgi, vBge +eor sAke_, sE2, sAgi, ROR #3 SEP xar_m0 vBgu, vAsi, E2, 3 +eor sAgi_, sE0, sAka, ROR #39 SEP xar_m1 vBsi, vAku, E4, 25 +eor sAga_, sE3, sAbo SEP xar_m0 vBku, vAsa, E0, 46 +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP xar_m1 vBma, vAbu, E4, 37 +eor sAmi_, sE1, sAke, ROR #56 SEP xar_m0 vBbu, vAsu, E4, 50 +eor sAge_, sE4, sAgu, ROR #44 SEP xar_m1 vBsu, vAse, E1, 62 +eor sAgu_, sE2, sAsi, ROR #62 SEP xar_m0 vBme, E3, E0, 28 +eor sAsi_, sE4, sAku, ROR #58 SEP xar_m1 vBbe, vAge, E1, 20 +eor sAku_, sE0, sAsa, ROR #25 SEP bcax_m1 vAge, vBge, vBgo, vBgi +eor sAma_, sE4, sAbu, ROR #20 
SEP bcax_m0 vAgi, vBgi, vBgu, vBgo +eor sAbu_, sE4, sAsu, ROR #9 SEP bcax_m1 vAgo, vBgo, vBga, vBgu +eor sAsu_, sE1, sAse, ROR #23 SEP bcax_m0 vAgu, vBgu, vBge, vBga +eor sAme_, sE0, sAga, ROR #61 SEP bcax_m1 vAka, vBka, vBki, vBke +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP bcax_m0 vAke, vBke, vBko, vBki +restore count, STACK_OFFSET_COUNT SEP .unreq vvtmp +bic tmp, sAgi_, sAge_, ROR #47 SEP .unreq vvtmpq +eor sAga, tmp, sAga_, ROR #39 SEP eor2 C0, vAka, vAga +bic tmp, sAgo_, sAgi_, ROR #42 SEP save(vAga) +eor sAge, tmp, sAge_, ROR #25 SEP vvtmp .req vAga +bic tmp, sAgu_, sAgo_, ROR #16 SEP vvtmpq .req vAgaq +eor sAgi, tmp, sAgi_, ROR #58 SEP bcax_m0 vAki, vBki, vBku, vBko +bic tmp, sAga_, sAgu_, ROR #31 SEP bcax_m1 vAko, vBko, vBka, vBku +eor sAgo, tmp, sAgo_, ROR #47 SEP eor2 C1, vAke, vAge +bic tmp, sAge_, sAga_, ROR #56 SEP bcax_m0 vAku, vBku, vBke, vBka +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP eor2 C2, vAki, vAgi +eor sAka, tmp, sAka_, ROR #24 SEP bcax_m1 vAma, vBma, vBmi, vBme +bic tmp, sAko_, sAki_, ROR #47 SEP eor2 C3, vAko, vAgo +eor sAke, tmp, sAke_, ROR #2 SEP bcax_m0 vAme, vBme, vBmo, vBmi +bic tmp, sAku_, sAko_, ROR #10 SEP eor2 C4, vAku, vAgu +eor sAki, tmp, sAki_, ROR #57 SEP bcax_m1 vAmi, vBmi, vBmu, vBmo +bic tmp, sAka_, sAku_, ROR #47 SEP eor2 C0, C0, vAma +eor sAko, tmp, sAko_, ROR #57 SEP bcax_m0 vAmo, vBmo, vBma, vBmu +bic tmp, sAke_, sAka_, ROR #5 SEP eor2 C1, C1, vAme +eor sAku, tmp, sAku_, ROR #52 SEP bcax_m1 vAmu, vBmu, vBme, vBma +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP eor2 C2, C2, vAmi +bic tmp, sAmo_, sAmi_, ROR #5 SEP bcax_m0 vAsa, vBsa, vBsi, vBse +eor sAme, tmp, sAme_, ROR #43 SEP eor2 C3, C3, vAmo +bic tmp, sAmu_, sAmo_, ROR #41 SEP bcax_m1 vAse, vBse, vBso, vBsi +eor sAmi, tmp, sAmi_, ROR #46 SEP eor2 C4, C4, vAmu +bic tmp, sAma_, sAmu_, ROR #35 SEP bcax_m0 vAsi, vBsi, vBsu, vBso +eor sAmo, tmp, sAmo_, ROR #12 SEP eor2 C0, C0, vAsa +bic tmp, sAme_, sAma_, ROR 
#9 SEP bcax_m1 vAso, vBso, vBsa, vBsu +eor sAmu, tmp, sAmu_, ROR #44 SEP eor2 C1, C1, vAse +bic tmp, sAsi_, sAse_, ROR #48 SEP bcax_m0 vAsu, vBsu, vBse, vBsa + +eor sAsa, tmp, sAsa_, ROR #41 SEP eor2 C2, C2, vAsi +bic tmp, sAso_, sAsi_, ROR #2 SEP eor2 C3, C3, vAso +eor sAse, tmp, sAse_, ROR #50 SEP bcax_m1 vAba, vBba, vBbi, vBbe +bic tmp, sAsu_, sAso_, ROR #25 SEP bcax_m0 vAbe, vBbe, vBbo, vBbi +eor sAsi, tmp, sAsi_, ROR #27 SEP eor2 C1, C1, vAbe +bic tmp, sAsa_, sAsu_, ROR #60 SEP restore x26, STACK_OFFSET_CONST +eor sAso, tmp, sAso_, ROR #21 SEP ldr vvtmpq, [x26], #16 +bic tmp, sAse_, sAsa_, ROR #57 SEP save x26, STACK_OFFSET_CONST +eor sAsu, tmp, sAsu_, ROR #53 SEP eor vAba.16b, vAba.16b, vvtmp.16b +bic tmp, sAbi_, sAbe_, ROR #63 SEP eor2 C4, C4, vAsu +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP bcax_m0 vAbi, vBbi, vBbu, vBbo +eor sAbe, tmp, sAbe_, ROR #41 SEP bcax_m1 vAbo, vBbo, vBba, vBbu +bic tmp, sAbu_, sAbo_, ROR #57 SEP eor2 C3, C3, vAbo +eor sAbi, tmp, sAbi_, ROR #35 SEP eor2 C2, C2, vAbi +bic tmp, s_Aba_, sAbu_, ROR #50 SEP eor2 C0, C0, vAba +eor sAbo, tmp, sAbo_, ROR #43 SEP bcax_m0 vAbu, vBbu, vBbe, vBba +bic tmp, sAbe_, s_Aba_, ROR #44 SEP eor2 C4, C4, vAbu +eor sAbu, tmp, sAbu_, ROR #30 SEP restore(vAga) +add count, count, #1 SEP .unreq vvtmp +eor s_Aba, s_Aba, cur_const SEP .unreq vvtmpq +.endm + + +/* hybrid_round_noninitial: same pairing of one scalar-state instruction + * (left of SEP) with one vector-side instruction, macro or directive (right + * of SEP) as in hybrid_round_initial. It starts by spilling count to + * STACK_OFFSET_COUNT, and its scalar column parities are formed from + * operands rotated with ROR. The body loads cur_const twice (each followed + * by an increment of count) and loads a 16-byte vector constant twice, so it + * appears to advance both states by two rounds (NOTE(review): confirm against + * the caller). The instruction order is schedule- and alias-sensitive; do not + * reorder lines. */ +.macro hybrid_round_noninitial +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP vvtmp .req vBba +eor sC1, sAse, sAge, ROR #60 SEP rax1_m0 E2, C1, C3 +eor sC2, sAmi, sAgi, ROR #59 SEP rax1_m1 E4, C3, C0 +eor sC3, sAgo, sAso, ROR #30 SEP rax1_m0 E1, C0, C2 +eor sC4, sAbu, sAsu, ROR #53 SEP rax1_m1 E3, C2, C4 +eor sC0, sAma, sC0, ROR #49 SEP rax1_m0 E0, C4, C1 +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP .unreq vvtmp +eor sC3, sAmo, sC3, ROR #63 SEP vvtmp .req C1 +eor sC4, sAmu, sC4, ROR #56 SEP vvtmpq .req C1q +eor sC0, sAga, sC0, ROR #57
SEP eor vBba.16b, vAba.16b, E0.16b +eor sC1, sAme, sC1, ROR #58 SEP xar_m1 vBsa, vAbi, E2, 2 +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP xar_m0 vBbi, vAki, E2, 21 +eor sC4, sAgu, sC4, ROR #48 SEP xar_m1 vBki, vAko, E3, 39 +eor sC0, s_Aba, sC0, ROR #61 SEP xar_m0 vBko, vAmu, E4, 56 +eor sC1, sAke, sC1, ROR #57 SEP xar_m1 vBmu, vAso, E3, 8 +eor sC2, sAsi, sC2, ROR #52 SEP xar_m0 vBso, vAma, E0, 23 +eor sC3, sAbo, sC3, ROR #63 SEP xar_m1 vBka, vAbe, E1, 63 +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP xar_m0 vBse, vAgo, E3, 9 +ror sC4, sC4, 58 SEP xar_m1 vBgo, vAme, E1, 19 +ror sC2, sC2, 62 SEP xar_m0 vBke, vAgi, E2, 58 +eor sE1, sC0, sC2, ROR #63 SEP xar_m1 vBgi, vAka, E0, 61 +eor sE3, sC2, sC4, ROR #63 SEP xar_m0 vBga, vAbo, E3, 36 +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP xar_m1 vBbo, vAmo, E3, 43 +eor sE4, sC3, sC0, ROR #63 SEP xar_m0 vBmo, vAmi, E2, 49 +eor s_Aba_, sE0, s_Aba SEP xar_m1 vBmi, vAke, E1, 54 +eor sAsa_, sE2, sAbi, ROR #50 SEP xar_m0 vBge, vAgu, E4, 44 +eor sAbi_, sE2, sAki, ROR #46 SEP mov E3.16b, vAga.16b +eor sAki_, sE3, sAko, ROR #63 SEP bcax_m1 vAga, vBga, vBgi, vBge +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP xar_m0 vBgu, vAsi, E2, 3 +eor sAso_, sE0, sAma, ROR #54 SEP xar_m1 vBsi, vAku, E4, 25 +eor sAka_, sE1, sAbe, ROR #43 SEP xar_m0 vBku, vAsa, E0, 46 +eor sAse_, sE3, sAgo, ROR #36 SEP xar_m1 vBma, vAbu, E4, 37 +eor sAgo_, sE1, sAme, ROR #49 SEP xar_m0 vBbu, vAsu, E4, 50 +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP xar_m1 vBsu, vAse, E1, 62 +eor sAga_, sE3, sAbo SEP xar_m0 vBme, E3, E0, 28 +eor sAbo_, sE3, sAmo, ROR #37 SEP xar_m1 vBbe, vAge, E1, 20 +eor sAmo_, sE2, sAmi, ROR #8 SEP bcax_m1 vAge, vBge, vBgo, vBgi +eor sAmi_, sE1, sAke, ROR #56 SEP bcax_m0 vAgi, vBgi, vBgu, vBgo +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP bcax_m1 vAgo, vBgo, vBga, vBgu +eor sAsi_, sE4, sAku, ROR #58 SEP bcax_m0 vAgu, vBgu, vBge, vBga 
+eor sAku_, sE0, sAsa, ROR #25 SEP bcax_m1 vAka, vBka, vBki, vBke +eor sAma_, sE4, sAbu, ROR #20 SEP bcax_m0 vAke, vBke, vBko, vBki +eor sAbu_, sE4, sAsu, ROR #9 SEP .unreq vvtmp +eor sAsu_, sE1, sAse, ROR #23 SEP .unreq vvtmpq +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP eor2 C0, vAka, vAga +load_constant_ptr SEP save(vAga) +restore count, STACK_OFFSET_COUNT SEP vvtmp .req vAga +bic tmp, sAgi_, sAge_, ROR #47 SEP vvtmpq .req vAgaq +eor sAga, tmp, sAga_, ROR #39 SEP bcax_m0 vAki, vBki, vBku, vBko +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP bcax_m1 vAko, vBko, vBka, vBku +bic tmp, sAgu_, sAgo_, ROR #16 SEP eor2 C1, vAke, vAge +eor sAgi, tmp, sAgi_, ROR #58 SEP bcax_m0 vAku, vBku, vBke, vBka +bic tmp, sAga_, sAgu_, ROR #31 SEP eor2 C2, vAki, vAgi +eor sAgo, tmp, sAgo_, ROR #47 SEP bcax_m1 vAma, vBma, vBmi, vBme +bic tmp, sAge_, sAga_, ROR #56 SEP eor2 C3, vAko, vAgo +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP bcax_m0 vAme, vBme, vBmo, vBmi +eor sAka, tmp, sAka_, ROR #24 SEP eor2 C4, vAku, vAgu +bic tmp, sAko_, sAki_, ROR #47 SEP bcax_m1 vAmi, vBmi, vBmu, vBmo +eor sAke, tmp, sAke_, ROR #2 SEP eor2 C0, C0, vAma +bic tmp, sAku_, sAko_, ROR #10 SEP bcax_m0 vAmo, vBmo, vBma, vBmu +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP eor2 C1, C1, vAme +eor sAko, tmp, sAko_, ROR #57 SEP bcax_m1 vAmu, vBmu, vBme, vBma +bic tmp, sAke_, sAka_, ROR #5 SEP eor2 C2, C2, vAmi +eor sAku, tmp, sAku_, ROR #52 SEP bcax_m0 vAsa, vBsa, vBsi, vBse +bic tmp, sAmi_, sAme_, ROR #38 SEP eor2 C3, C3, vAmo +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP bcax_m1 vAse, vBse, vBso, vBsi +eor sAme, tmp, sAme_, ROR #43 SEP eor2 C4, C4, vAmu +bic tmp, sAmu_, sAmo_, ROR #41 SEP bcax_m0 vAsi, vBsi, vBsu, vBso +eor sAmi, tmp, sAmi_, ROR #46 SEP eor2 C0, C0, vAsa +bic tmp, sAma_, sAmu_, ROR #35 SEP bcax_m1 vAso, vBso, vBsa, vBsu +ldr cur_const, [const_addr, count, UXTW #3] SEP eor2 C1, 
C1, vAse +add count, count, #1 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP bcax_m0 vAsu, vBsu, vBse, vBsa +bic tmp, sAme_, sAma_, ROR #9 SEP eor2 C2, C2, vAsi +eor sAmu, tmp, sAmu_, ROR #44 SEP eor2 C3, C3, vAso +bic tmp, sAsi_, sAse_, ROR #48 SEP bcax_m1 vAba, vBba, vBbi, vBbe +eor sAsa, tmp, sAsa_, ROR #41 SEP bcax_m0 vAbe, vBbe, vBbo, vBbi +bic tmp, sAso_, sAsi_, ROR #2 SEP +save count, STACK_OFFSET_COUNT SEP +eor sAse, tmp, sAse_, ROR #50 SEP eor2 C1, C1, vAbe +bic tmp, sAsu_, sAso_, ROR #25 SEP restore x27, STACK_OFFSET_CONST +eor sAsi, tmp, sAsi_, ROR #27 SEP ldr vvtmpq, [x27], #16 +bic tmp, sAsa_, sAsu_, ROR #60 SEP save x27, STACK_OFFSET_CONST +eor sAso, tmp, sAso_, ROR #21 SEP eor vAba.16b, vAba.16b, vvtmp.16b +bic tmp, sAse_, sAsa_, ROR #57 SEP eor2 C4, C4, vAsu +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m0 vAbi, vBbi, vBbu, vBbo +eor s_Aba, s_Aba_, tmp, ROR #21 SEP bcax_m1 vAbo, vBbo, vBba, vBbu +bic tmp, sAbo_, sAbi_, ROR #42 SEP eor2 C3, C3, vAbo +eor sAbe, tmp, sAbe_, ROR #41 SEP eor2 C2, C2, vAbi +bic tmp, sAbu_, sAbo_, ROR #57 SEP eor2 C0, C0, vAba +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP bcax_m0 vAbu, vBbu, vBbe, vBba +eor sAbo, tmp, sAbo_, ROR #43 SEP eor2 C4, C4, vAbu +bic tmp, sAbe_, s_Aba_, ROR #44 SEP restore(vAga) +eor sAbu, tmp, sAbu_, ROR #30 SEP .unreq vvtmp +eor s_Aba, s_Aba, cur_const SEP .unreq vvtmpq +eor sC0, sAka, sAsa, ROR #50 SEP vvtmp .req vBba +eor sC1, sAse, sAge, ROR #60 SEP rax1_m0 E2, C1, C3 +eor sC2, sAmi, sAgi, ROR #59 SEP rax1_m1 E4, C3, C0 +eor sC3, sAgo, sAso, ROR #30 SEP rax1_m0 E1, C0, C2 +eor sC4, sAbu, sAsu, ROR #53 SEP rax1_m1 E3, C2, C4 +eor sC0, sAma, sC0, ROR #49 SEP rax1_m0 E0, C4, C1 +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP .unreq vvtmp +eor sC3, sAmo, sC3, ROR #63 SEP vvtmp .req C1 +eor sC4, sAmu, sC4, ROR #56 SEP vvtmpq .req C1q +eor sC0, sAga, sC0, ROR #57 SEP eor vBba.16b, vAba.16b, E0.16b +eor sC1, sAme, sC1, ROR #58 SEP 
xar_m1 vBsa, vAbi, E2, 2 +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP xar_m0 vBbi, vAki, E2, 21 +eor sC4, sAgu, sC4, ROR #48 SEP xar_m1 vBki, vAko, E3, 39 +eor sC0, s_Aba, sC0, ROR #61 SEP xar_m0 vBko, vAmu, E4, 56 +eor sC1, sAke, sC1, ROR #57 SEP xar_m1 vBmu, vAso, E3, 8 +eor sC2, sAsi, sC2, ROR #52 SEP xar_m0 vBso, vAma, E0, 23 +eor sC3, sAbo, sC3, ROR #63 SEP xar_m1 vBka, vAbe, E1, 63 +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP xar_m0 vBse, vAgo, E3, 9 +ror sC4, sC4, 58 SEP xar_m1 vBgo, vAme, E1, 19 +ror sC2, sC2, 62 SEP xar_m0 vBke, vAgi, E2, 58 +eor sE1, sC0, sC2, ROR #63 SEP xar_m1 vBgi, vAka, E0, 61 +eor sE3, sC2, sC4, ROR #63 SEP xar_m0 vBga, vAbo, E3, 36 +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP xar_m1 vBbo, vAmo, E3, 43 +eor sE4, sC3, sC0, ROR #63 SEP xar_m0 vBmo, vAmi, E2, 49 +eor s_Aba_, sE0, s_Aba SEP xar_m1 vBmi, vAke, E1, 54 +eor sAsa_, sE2, sAbi, ROR #50 SEP xar_m0 vBge, vAgu, E4, 44 +eor sAbi_, sE2, sAki, ROR #46 SEP mov E3.16b, vAga.16b +eor sAki_, sE3, sAko, ROR #63 SEP bcax_m1 vAga, vBga, vBgi, vBge +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP xar_m0 vBgu, vAsi, E2, 3 +eor sAso_, sE0, sAma, ROR #54 SEP xar_m1 vBsi, vAku, E4, 25 +eor sAka_, sE1, sAbe, ROR #43 SEP xar_m0 vBku, vAsa, E0, 46 +eor sAse_, sE3, sAgo, ROR #36 SEP xar_m1 vBma, vAbu, E4, 37 +eor sAgo_, sE1, sAme, ROR #49 SEP xar_m0 vBbu, vAsu, E4, 50 +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP xar_m1 vBsu, vAse, E1, 62 +eor sAga_, sE3, sAbo SEP xar_m0 vBme, E3, E0, 28 +eor sAbo_, sE3, sAmo, ROR #37 SEP xar_m1 vBbe, vAge, E1, 20 +eor sAmo_, sE2, sAmi, ROR #8 SEP bcax_m1 vAge, vBge, vBgo, vBgi +eor sAmi_, sE1, sAke, ROR #56 SEP bcax_m0 vAgi, vBgi, vBgu, vBgo +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP bcax_m1 vAgo, vBgo, vBga, vBgu +eor sAsi_, sE4, sAku, ROR #58 SEP bcax_m0 vAgu, vBgu, vBge, vBga +eor sAku_, sE0, sAsa, ROR #25 SEP bcax_m1 vAka, vBka, vBki, vBke 
+eor sAma_, sE4, sAbu, ROR #20 SEP bcax_m0 vAke, vBke, vBko, vBki +eor sAbu_, sE4, sAsu, ROR #9 SEP .unreq vvtmp +eor sAsu_, sE1, sAse, ROR #23 SEP .unreq vvtmpq +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP eor2 C0, vAka, vAga +load_constant_ptr SEP save(vAga) +restore count, STACK_OFFSET_COUNT SEP vvtmp .req vAga +bic tmp, sAgi_, sAge_, ROR #47 SEP vvtmpq .req vAgaq +eor sAga, tmp, sAga_, ROR #39 SEP bcax_m0 vAki, vBki, vBku, vBko +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP bcax_m1 vAko, vBko, vBka, vBku +bic tmp, sAgu_, sAgo_, ROR #16 SEP eor2 C1, vAke, vAge +eor sAgi, tmp, sAgi_, ROR #58 SEP bcax_m0 vAku, vBku, vBke, vBka +bic tmp, sAga_, sAgu_, ROR #31 SEP eor2 C2, vAki, vAgi +eor sAgo, tmp, sAgo_, ROR #47 SEP bcax_m1 vAma, vBma, vBmi, vBme +bic tmp, sAge_, sAga_, ROR #56 SEP eor2 C3, vAko, vAgo +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP bcax_m0 vAme, vBme, vBmo, vBmi +eor sAka, tmp, sAka_, ROR #24 SEP eor2 C4, vAku, vAgu +bic tmp, sAko_, sAki_, ROR #47 SEP bcax_m1 vAmi, vBmi, vBmu, vBmo +eor sAke, tmp, sAke_, ROR #2 SEP eor2 C0, C0, vAma +bic tmp, sAku_, sAko_, ROR #10 SEP bcax_m0 vAmo, vBmo, vBma, vBmu +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP eor2 C1, C1, vAme +eor sAko, tmp, sAko_, ROR #57 SEP bcax_m1 vAmu, vBmu, vBme, vBma +bic tmp, sAke_, sAka_, ROR #5 SEP eor2 C2, C2, vAmi +eor sAku, tmp, sAku_, ROR #52 SEP bcax_m0 vAsa, vBsa, vBsi, vBse +bic tmp, sAmi_, sAme_, ROR #38 SEP eor2 C3, C3, vAmo +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP bcax_m1 vAse, vBse, vBso, vBsi +eor sAme, tmp, sAme_, ROR #43 SEP eor2 C4, C4, vAmu +bic tmp, sAmu_, sAmo_, ROR #41 SEP bcax_m0 vAsi, vBsi, vBsu, vBso +eor sAmi, tmp, sAmi_, ROR #46 SEP eor2 C0, C0, vAsa +bic tmp, sAma_, sAmu_, ROR #35 SEP bcax_m1 vAso, vBso, vBsa, vBsu + SEP eor2 C1, C1, vAse +eor sAmo, tmp, sAmo_, ROR #12 SEP bcax_m0 vAsu, vBsu, vBse, vBsa +bic tmp, sAme_, sAma_, ROR #9 SEP 
eor2 C2, C2, vAsi +eor sAmu, tmp, sAmu_, ROR #44 SEP eor2 C3, C3, vAso +bic tmp, sAsi_, sAse_, ROR #48 SEP bcax_m1 vAba, vBba, vBbi, vBbe +eor sAsa, tmp, sAsa_, ROR #41 SEP bcax_m0 vAbe, vBbe, vBbo, vBbi +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP eor2 C1, C1, vAbe +bic tmp, sAsu_, sAso_, ROR #25 SEP restore x26, STACK_OFFSET_CONST +eor sAsi, tmp, sAsi_, ROR #27 SEP ldr vvtmpq, [x26], #16 +bic tmp, sAsa_, sAsu_, ROR #60 SEP save x26, STACK_OFFSET_CONST +eor sAso, tmp, sAso_, ROR #21 SEP eor vAba.16b, vAba.16b, vvtmp.16b +bic tmp, sAse_, sAsa_, ROR #57 SEP eor2 C4, C4, vAsu +eor sAsu, tmp, sAsu_, ROR #53 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP +add count, count, #1 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m0 vAbi, vBbi, vBbu, vBbo +eor s_Aba, s_Aba_, tmp, ROR #21 SEP bcax_m1 vAbo, vBbo, vBba, vBbu +bic tmp, sAbo_, sAbi_, ROR #42 SEP eor2 C3, C3, vAbo +eor sAbe, tmp, sAbe_, ROR #41 SEP eor2 C2, C2, vAbi +bic tmp, sAbu_, sAbo_, ROR #57 SEP eor2 C0, C0, vAba +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP bcax_m0 vAbu, vBbu, vBbe, vBba +eor sAbo, tmp, sAbo_, ROR #43 SEP eor2 C4, C4, vAbu +bic tmp, sAbe_, s_Aba_, ROR #44 SEP restore(vAga) +eor sAbu, tmp, sAbu_, ROR #30 SEP .unreq vvtmp +eor s_Aba, s_Aba, cur_const SEP .unreq vvtmpq +.endm + + +.macro hybrid_round_final +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP vvtmp .req vBba +eor sC1, sAse, sAge, ROR #60 SEP rax1_m0 E2, C1, C3 +eor sC2, sAmi, sAgi, ROR #59 SEP +eor sC3, sAgo, sAso, ROR #30 SEP rax1_m1 E4, C3, C0 +eor sC4, sAbu, sAsu, ROR #53 SEP rax1_m0 E1, C0, C2 +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP rax1_m1 E3, C2, C4 +eor sC2, sAki, sC2, ROR #26 SEP rax1_m0 E0, C4, C1 +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP .unreq vvtmp +eor sC0, sAga, sC0, ROR #57 SEP vvtmp .req C1 +eor sC1, sAme, sC1, ROR #58 SEP +eor sC2, sAbi, sC2, ROR #60 SEP vvtmpq .req C1q +eor sC3, sAko, sC3, 
ROR #38 SEP eor vBba.16b, vAba.16b, E0.16b +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP xar_m1 vBsa, vAbi, E2, 2 +eor sC1, sAke, sC1, ROR #57 SEP xar_m0 vBbi, vAki, E2, 21 +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP xar_m1 vBki, vAko, E3, 39 +eor sC4, sAku, sC4, ROR #50 SEP xar_m0 vBko, vAmu, E4, 56 +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP xar_m1 vBmu, vAso, E3, 8 +ror sC2, sC2, 62 SEP xar_m0 vBso, vAma, E0, 23 +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP xar_m1 vBka, vAbe, E1, 63 +eor sE0, sC4, sC1, ROR #63 SEP xar_m0 vBse, vAgo, E3, 9 +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP xar_m1 vBgo, vAme, E1, 19 +eor s_Aba_, sE0, s_Aba SEP xar_m0 vBke, vAgi, E2, 58 +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP xar_m1 vBgi, vAka, E0, 61 +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP xar_m0 vBga, vAbo, E3, 36 +eor sAmu_, sE3, sAso, ROR #2 SEP xar_m1 vBbo, vAmo, E3, 43 +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP xar_m0 vBmo, vAmi, E2, 49 +eor sAse_, sE3, sAgo, ROR #36 SEP xar_m1 vBmi, vAke, E1, 54 +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP xar_m0 vBge, vAgu, E4, 44 +eor sAgi_, sE0, sAka, ROR #39 SEP mov E3.16b, vAga.16b +eor sAga_, sE3, sAbo SEP +eor sAbo_, sE3, sAmo, ROR #37 SEP bcax_m1 vAga, vBga, vBgi, vBge +eor sAmo_, sE2, sAmi, ROR #8 SEP xar_m0 vBgu, vAsi, E2, 3 +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP xar_m1 vBsi, vAku, E4, 25 +eor sAgu_, sE2, sAsi, ROR #62 SEP xar_m0 vBku, vAsa, E0, 46 +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP xar_m1 vBma, vAbu, E4, 37 +eor sAma_, sE4, sAbu, ROR #20 SEP xar_m0 vBbu, vAsu, E4, 50 +eor sAbu_, sE4, sAsu, ROR #9 SEP +eor sAsu_, sE1, sAse, ROR #23 SEP xar_m1 vBsu, vAse, E1, 62 +eor sAme_, sE0, sAga, ROR #61 SEP xar_m0 vBme, E3, E0, 28 +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP xar_m1 
vBbe, vAge, E1, 20 +restore count, STACK_OFFSET_COUNT SEP bcax_m1 vAge, vBge, vBgo, vBgi +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP bcax_m0 vAgi, vBgi, vBgu, vBgo +bic tmp, sAgo_, sAgi_, ROR #42 SEP bcax_m1 vAgo, vBgo, vBga, vBgu +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP bcax_m0 vAgu, vBgu, vBge, vBga +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP bcax_m1 vAka, vBka, vBki, vBke +eor sAgo, tmp, sAgo_, ROR #47 SEP bcax_m0 vAke, vBke, vBko, vBki +bic tmp, sAge_, sAga_, ROR #56 SEP +eor sAgu, tmp, sAgu_, ROR #23 SEP .unreq vvtmp +bic tmp, sAki_, sAke_, ROR #19 SEP .unreq vvtmpq +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP eor2 C0, vAka, vAga +eor sAke, tmp, sAke_, ROR #2 SEP save(vAga) +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP vvtmp .req vAga +bic tmp, sAka_, sAku_, ROR #47 SEP vvtmpq .req vAgaq +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP bcax_m0 vAki, vBki, vBku, vBko +eor sAku, tmp, sAku_, ROR #52 SEP bcax_m1 vAko, vBko, vBka, vBku +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP eor2 C1, vAke, vAge +bic tmp, sAmo_, sAmi_, ROR #5 SEP bcax_m0 vAku, vBku, vBke, vBka +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP eor2 C2, vAki, vAgi +eor sAmi, tmp, sAmi_, ROR #46 SEP bcax_m1 vAma, vBma, vBmi, vBme +bic tmp, sAma_, sAmu_, ROR #35 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP eor2 C3, vAko, vAgo +add count, count, #1 SEP bcax_m0 vAme, vBme, vBmo, vBmi +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP eor2 C4, vAku, vAgu +eor sAmu, tmp, sAmu_, ROR #44 SEP bcax_m1 vAmi, vBmi, vBmu, vBmo +bic tmp, sAsi_, sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP eor2 C0, C0, vAma +bic tmp, sAso_, sAsi_, ROR #2 SEP bcax_m0 vAmo, vBmo, vBma, vBmu +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP eor2 C1, C1, vAme +eor sAsi, tmp, 
sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP bcax_m1 vAmu, vBmu, vBme, vBma +eor sAso, tmp, sAso_, ROR #21 SEP eor2 C2, C2, vAmi +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP bcax_m0 vAsa, vBsa, vBsi, vBse +bic tmp, sAbi_, sAbe_, ROR #63 SEP eor2 C3, C3, vAmo +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP bcax_m1 vAse, vBse, vBso, vBsi +eor sAbe, tmp, sAbe_, ROR #41 SEP eor2 C4, C4, vAmu +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP bcax_m0 vAsi, vBsi, vBsu, vBso +bic tmp, s_Aba_, sAbu_, ROR #50 SEP eor2 C0, C0, vAsa +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP bcax_m1 vAso, vBso, vBsa, vBsu +eor sAbu, tmp, sAbu_, ROR #30 SEP eor2 C1, C1, vAse +eor s_Aba, s_Aba, cur_const SEP +save count, STACK_OFFSET_COUNT SEP bcax_m0 vAsu, vBsu, vBse, vBsa +eor sC0, sAka, sAsa, ROR #50 SEP eor2 C2, C2, vAsi +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP eor2 C3, C3, vAso +eor sC3, sAgo, sAso, ROR #30 SEP bcax_m1 vAba, vBba, vBbi, vBbe +eor sC4, sAbu, sAsu, ROR #53 SEP +eor sC0, sAma, sC0, ROR #49 SEP bcax_m0 vAbe, vBbe, vBbo, vBbi +eor sC1, sAbe, sC1, ROR #44 SEP eor2 C1, C1, vAbe +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 SEP restore x30, STACK_OFFSET_CONST +eor sC4, sAmu, sC4, ROR #56 SEP ldr vvtmpq, [x30], #16 +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP save x30, STACK_OFFSET_CONST +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP eor vAba.16b, vAba.16b, vvtmp.16b +eor sC4, sAgu, sC4, ROR #48 SEP eor2 C4, C4, vAsu +eor sC0, s_Aba, sC0, ROR #61 SEP +eor sC1, sAke, sC1, ROR #57 SEP bcax_m0 vAbi, vBbi, vBbu, vBbo +eor sC2, sAsi, sC2, ROR #52 SEP bcax_m1 vAbo, vBbo, vBba, vBbu +eor sC3, sAbo, sC3, ROR #63 SEP +eor sC4, sAku, sC4, ROR #50 SEP eor2 C3, C3, vAbo +ror sC1, sC1, 56 SEP eor2 C2, C2, vAbi +ror sC4, sC4, 58 SEP +ror sC2, sC2, 62 SEP eor2 C0, C0, vAba +eor sE1, sC0, sC2, ROR #63 SEP 
bcax_m0 vAbu, vBbu, vBbe, vBba +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP eor2 C4, C4, vAbu +eor sE2, sC1, sC3, ROR #63 SEP restore(vAga) +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP .unreq vvtmp +eor sAsa_, sE2, sAbi, ROR #50 SEP .unreq vvtmpq +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP vvtmp .req vBba +eor sAko_, sE4, sAmu, ROR #28 SEP rax1_m0 E2, C1, C3 +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP rax1_m1 E4, C3, C0 +eor sAka_, sE1, sAbe, ROR #43 SEP rax1_m0 E1, C0, C2 +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP rax1_m1 E3, C2, C4 +eor sAke_, sE2, sAgi, ROR #3 SEP rax1_m0 E0, C4, C1 +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP .unreq vvtmp +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP vvtmp .req C1 +eor sAmi_, sE1, sAke, ROR #56 SEP vvtmpq .req C1q +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP eor vBba.16b, vAba.16b, E0.16b +eor sAsi_, sE4, sAku, ROR #58 SEP xar_m0 vBsa, vAbi, E2, 2 +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP xar_m1 vBbi, vAki, E2, 21 +eor sAbu_, sE4, sAsu, ROR #9 SEP xar_m0 vBki, vAko, E3, 39 +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP xar_m1 vBko, vAmu, E4, 56 +eor sAbe_, sE1, sAge, ROR #19 SEP xar_m0 vBmu, vAso, E3, 8 +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP xar_m1 vBso, vAma, E0, 23 +bic tmp, sAgi_, sAge_, ROR #47 SEP xar_m0 vBka, vAbe, E1, 63 +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP xar_m1 vBse, vAgo, E3, 9 +eor sAge, tmp, sAge_, ROR #25 SEP xar_m0 vBgo, vAme, E1, 19 +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP xar_m1 vBke, vAgi, E2, 58 +bic tmp, sAga_, sAgu_, ROR #31 SEP xar_m0 vBgi, vAka, E0, 61 +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP xar_m1 vBga, vAbo, E3, 36 +eor sAgu, tmp, sAgu_, ROR #23 SEP xar_m0 vBbo, 
vAmo, E3, 43 +bic tmp, sAki_, sAke_, ROR #19 SEP +eor sAka, tmp, sAka_, ROR #24 SEP xar_m1 vBmo, vAmi, E2, 49 +bic tmp, sAko_, sAki_, ROR #47 SEP xar_m0 vBmi, vAke, E1, 54 +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP xar_m1 vBge, vAgu, E4, 44 +eor sAki, tmp, sAki_, ROR #57 SEP mov E3.16b, vAga.16b +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP bcax_m1 vAga, vBga, vBgi, vBge +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP xar_m0 vBgu, vAsi, E2, 3 +bic tmp, sAmi_, sAme_, ROR #38 SEP xar_m1 vBsi, vAku, E4, 25 +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP xar_m0 vBku, vAsa, E0, 46 +eor sAme, tmp, sAme_, ROR #43 SEP xar_m1 vBma, vAbu, E4, 37 +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP xar_m0 vBbu, vAsu, E4, 50 +bic tmp, sAma_, sAmu_, ROR #35 SEP xar_m1 vBsu, vAse, E1, 62 +ldr cur_const, [const_addr, count, UXTW #3] SEP +add count, count, #1 SEP xar_m0 vBme, E3, E0, 28 +eor sAmo, tmp, sAmo_, ROR #12 SEP xar_m1 vBbe, vAge, E1, 20 +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP bcax_m0 vAge, vBge, vBgo, vBgi +bic tmp, sAsi_, sAse_, ROR #48 SEP bcax_m1 vAgi, vBgi, vBgu, vBgo +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP bcax_m0 vAgo, vBgo, vBga, vBgu +eor sAse, tmp, sAse_, ROR #50 SEP bcax_m1 vAgu, vBgu, vBge, vBga +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP bcax_m0 vAka, vBka, vBki, vBke +bic tmp, sAsa_, sAsu_, ROR #60 SEP bcax_m1 vAke, vBke, vBko, vBki +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP bcax_m0 vAki, vBki, vBku, vBko +eor sAsu, tmp, sAsu_, ROR #53 SEP bcax_m1 vAko, vBko, vBka, vBku +bic tmp, sAbi_, sAbe_, ROR #63 SEP +eor s_Aba, s_Aba_, tmp, ROR #21 SEP bcax_m0 vAku, vBku, vBke, vBka +bic tmp, sAbo_, sAbi_, ROR #42 SEP bcax_m1 vAma, vBma, vBmi, vBme +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP bcax_m0 vAme, vBme, vBmo, 
vBmi +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP bcax_m1 vAmi, vBmi, vBmu, vBmo +eor sAbo, tmp, sAbo_, ROR #43 SEP bcax_m0 vAmo, vBmo, vBma, vBmu +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP bcax_m1 vAmu, vBmu, vBme, vBma +eor s_Aba, s_Aba, cur_const SEP bcax_m0 vAsa, vBsa, vBsi, vBse +ror sAga, sAga,(64-3) SEP +ror sAka, sAka,(64-25) SEP bcax_m1 vAse, vBse, vBso, vBsi +ror sAma, sAma,(64-10) SEP bcax_m0 vAsi, vBsi, vBsu, vBso +ror sAsa, sAsa,(64-39) SEP +ror sAbe, sAbe,(64-21) SEP bcax_m1 vAso, vBso, vBsa, vBsu +ror sAge, sAge,(64-45) SEP bcax_m0 vAsu, vBsu, vBse, vBsa +ror sAke, sAke,(64-8) SEP +ror sAme, sAme,(64-15) SEP bcax_m1 vAba, vBba, vBbi, vBbe +ror sAse, sAse,(64-41) SEP bcax_m0 vAbe, vBbe, vBbo, vBbi +ror sAbi, sAbi,(64-14) SEP +ror sAgi, sAgi,(64-61) SEP bcax_m1 vAbi, vBbi, vBbu, vBbo +ror sAki, sAki,(64-18) SEP bcax_m0 vAbo, vBbo, vBba, vBbu +ror sAmi, sAmi,(64-56) SEP +ror sAsi, sAsi,(64-2) SEP bcax_m1 vAbu, vBbu, vBbe, vBba +ror sAgo, sAgo,(64-28) SEP +ror sAko, sAko,(64-1) SEP +ror sAmo, sAmo,(64-27) SEP restore x26, STACK_OFFSET_CONST +ror sAso, sAso,(64-62) SEP ldr vvtmpq, [x26], #16 +ror sAbu, sAbu,(64-44) SEP +ror sAgu, sAgu,(64-20) SEP save x26, STACK_OFFSET_CONST +ror sAku, sAku,(64-6) SEP eor vAba.16b, vAba.16b, vvtmp.16b +ror sAmu, sAmu,(64-36) SEP .unreq vvtmp +ror sAsu, sAsu,(64-55) SEP .unreq vvtmpq +.endm + + + +#define KECCAK_F1600_ROUNDS 24 + +.global keccak_f1600_x3_hybrid_asm_v6 +.global _keccak_f1600_x3_hybrid_asm_v6 +.text +.align 4 + +keccak_f1600_x3_hybrid_asm_v6: +_keccak_f1600_x3_hybrid_asm_v6: + alloc_stack + save_gprs + save_vregs + save input_addr, STACK_OFFSET_INPUT + + + ASM_LOAD(const_addr,round_constants_vec) + + save const_addr, STACK_OFFSET_CONST + load_input_vector 1,0 + + add input_addr, input_addr, #400 + load_input_scalar 1,0 + hybrid_round_initial + loop_0: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-3) + ble loop_0 + + hybrid_round_final + + 
restore input_addr, STACK_OFFSET_INPUT + store_input_vector 1,0 + add input_addr, input_addr, #400 + store_input_scalar 1,0 + + restore_vregs + restore_gprs + free_stack + + + ret +#endif \ No newline at end of file diff --git a/tests/keccak_neon/manual/keccak_f1600_x3_hybrid_asm_v7.s b/tests/keccak_neon/manual/keccak_f1600_x3_hybrid_asm_v7.s new file mode 100644 index 0000000..559b9f2 --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x3_hybrid_asm_v7.s @@ -0,0 +1,924 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" +#if defined(__ARM_FEATURE_SHA3) + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +round_constants_vec: + .quad 0x0000000000000001 + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x800000008000000a + .quad 
0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + .quad 0x8000000080008008 +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x26 + cur_const .req x26 + count .req w27 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. */ + vAba .req v0 + vAbe .req v1 + vAbi .req v2 + vAbo .req v3 + vAbu .req v4 + vAga .req v5 + vAge .req v6 + vAgi .req v7 + vAgo .req v8 + vAgu .req v9 + vAka .req v10 + vAke .req v11 + vAki .req v12 + vAko .req v13 + vAku .req v14 + vAma .req v15 + vAme .req v16 + vAmi .req v17 + vAmo .req v18 + vAmu .req v19 + vAsa .req v20 + vAse .req v21 + vAsi .req v22 + vAso .req v23 + vAsu .req v24 + + /* q-form of the above mapping */ + vAbaq .req q0 + vAbeq .req q1 + vAbiq .req q2 + vAboq .req q3 + vAbuq .req q4 + vAgaq .req q5 + vAgeq .req q6 + vAgiq .req q7 + vAgoq .req q8 + vAguq .req q9 + vAkaq .req q10 + vAkeq .req q11 + vAkiq .req q12 + vAkoq .req q13 + vAkuq .req q14 + vAmaq .req q15 + vAmeq .req q16 + vAmiq .req q17 + vAmoq .req q18 + vAmuq .req q19 + vAsaq .req q20 + vAseq .req q21 + vAsiq .req q22 + vAsoq .req q23 + vAsuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v30 + C1 .req v29 + C2 .req v28 + C3 .req v27 + C4 .req v26 + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req v26 + E1 .req v25 + E2 .req v29 + E3 .req v28 + E4 .req v27 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vAbi_ .req v2 + vAbo_ .req v3 + vAbu_ .req v4 + vAga_ .req v10 + vAge_ .req v11 + vAgi_ .req v7 + vAgo_ .req v8 + vAgu_ .req v9 + vAka_ .req v15 + vAke_ .req v16 + vAki_ .req v12 + vAko_ .req v13 + vAku_ .req v14 + vAma_ .req v20 + vAme_ .req v21 + vAmi_ .req v17 + vAmo_ .req v18 + vAmu_ .req v19 + vAsa_ .req v0 + vAse_ .req v1 + vAsi_ .req v22 + vAso_ .req v23 + vAsu_ .req
v24 + vAba_ .req v30 + vAbe_ .req v27 + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round. */ + s_Aba .req x1 + sAbe .req x6 + sAbi .req x11 + sAbo .req x16 + sAbu .req x21 + sAga .req x2 + sAge .req x7 + sAgi .req x12 + sAgo .req x17 + sAgu .req x22 + sAka .req x3 + sAke .req x8 + sAki .req x13 + sAko .req x18 + sAku .req x23 + sAma .req x4 + sAme .req x9 + sAmi .req x14 + sAmo .req x19 + sAmu .req x24 + sAsa .req x5 + sAse .req x10 + sAsi .req x15 + sAso .req x20 + sAsu .req x25 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + s_Aba_ .req x30 + sAbe_ .req x28 + sAbi_ .req x11 + sAbo_ .req x16 + sAbu_ .req x21 + sAga_ .req x3 + sAge_ .req x8 + sAgi_ .req x12 + sAgo_ .req x17 + sAgu_ .req x22 + sAka_ .req x4 + sAke_ .req x9 + sAki_ .req x13 + sAko_ .req x18 + sAku_ .req x23 + sAma_ .req x5 + sAme_ .req x10 + sAmi_ .req x14 + sAmo_ .req x19 + sAmu_ .req x24 + sAsa_ .req x1 + sAse_ .req x6 + sAsi_ .req x15 + sAso_ .req x20 + sAsu_ .req x25 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + sC0 .req x30 + sE0 .req x29 + sC1 .req x26 + sE1 .req x0 + sC2 .req x27 + sE2 .req x26 + sC3 .req x28 + sE3 .req x27 + sC4 .req x29 + sE4 .req x28 + + tmp .req x0 + +/************************ MACROS ****************************/ + +/* Macros using v8.4-A SHA-3 instructions. NOTE(review): rax1_m1 below names tmp, which this file aliases to x0 (a scalar register), yet uses it as tmp.2d / tmp.16b; it looks unusable if the macro is ever expanded - confirm it is unused or switch it to a vector temporary. */ + + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm + xar \d\().2d, \s0\().2d, \s1\().2d, #\imm +.endm + +.macro rax1_m1 d s0 s1 + xar_m0 tmp, vzr, \s1, 63 + eor \d\().16b, \s0\().16b, tmp.16b +.endm + +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro load_input_vector num idx + ldr vAbaq, [input_addr, #(16*(\num*0+\idx))] + ldr
vAbeq, [input_addr, #(16*(\num*1+\idx))] + ldr vAbiq, [input_addr, #(16*(\num*2+\idx))] + ldr vAboq, [input_addr, #(16*(\num*3+\idx))] + ldr vAbuq, [input_addr, #(16*(\num*4+\idx))] + ldr vAgaq, [input_addr, #(16*(\num*5+\idx))] + ldr vAgeq, [input_addr, #(16*(\num*6+\idx))] + ldr vAgiq, [input_addr, #(16*(\num*7+\idx))] + ldr vAgoq, [input_addr, #(16*(\num*8+\idx))] + ldr vAguq, [input_addr, #(16*(\num*9+\idx))] + ldr vAkaq, [input_addr, #(16*(\num*10+\idx))] + ldr vAkeq, [input_addr, #(16*(\num*11+\idx))] + ldr vAkiq, [input_addr, #(16*(\num*12+\idx))] + ldr vAkoq, [input_addr, #(16*(\num*13+\idx))] + ldr vAkuq, [input_addr, #(16*(\num*14+\idx))] + ldr vAmaq, [input_addr, #(16*(\num*15+\idx))] + ldr vAmeq, [input_addr, #(16*(\num*16+\idx))] + ldr vAmiq, [input_addr, #(16*(\num*17+\idx))] + ldr vAmoq, [input_addr, #(16*(\num*18+\idx))] + ldr vAmuq, [input_addr, #(16*(\num*19+\idx))] + ldr vAsaq, [input_addr, #(16*(\num*20+\idx))] + ldr vAseq, [input_addr, #(16*(\num*21+\idx))] + ldr vAsiq, [input_addr, #(16*(\num*22+\idx))] + ldr vAsoq, [input_addr, #(16*(\num*23+\idx))] + ldr vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_vector num idx + str vAbaq, [input_addr, #(16*(\num*0+\idx))] + str vAbeq, [input_addr, #(16*(\num*1+\idx))] + str vAbiq, [input_addr, #(16*(\num*2+\idx))] + str vAboq, [input_addr, #(16*(\num*3+\idx))] + str vAbuq, [input_addr, #(16*(\num*4+\idx))] + str vAgaq, [input_addr, #(16*(\num*5+\idx))] + str vAgeq, [input_addr, #(16*(\num*6+\idx))] + str vAgiq, [input_addr, #(16*(\num*7+\idx))] + str vAgoq, [input_addr, #(16*(\num*8+\idx))] + str vAguq, [input_addr, #(16*(\num*9+\idx))] + str vAkaq, [input_addr, #(16*(\num*10+\idx))] + str vAkeq, [input_addr, #(16*(\num*11+\idx))] + str vAkiq, [input_addr, #(16*(\num*12+\idx))] + str vAkoq, [input_addr, #(16*(\num*13+\idx))] + str vAkuq, [input_addr, #(16*(\num*14+\idx))] + str vAmaq, [input_addr, #(16*(\num*15+\idx))] + str vAmeq, [input_addr, #(16*(\num*16+\idx))] + str vAmiq, 
[input_addr, #(16*(\num*17+\idx))] + str vAmoq, [input_addr, #(16*(\num*18+\idx))] + str vAmuq, [input_addr, #(16*(\num*19+\idx))] + str vAsaq, [input_addr, #(16*(\num*20+\idx))] + str vAseq, [input_addr, #(16*(\num*21+\idx))] + str vAsiq, [input_addr, #(16*(\num*22+\idx))] + str vAsoq, [input_addr, #(16*(\num*23+\idx))] + str vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_scalar num idx + str s_Aba, [input_addr, 8*(\num*(0) +\idx)] + str sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + str sAbi, [input_addr, 8*(\num*(2)+ \idx)] + str sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + str sAbu, [input_addr, 8*(\num*(4)+ \idx)] + str sAga, [input_addr, 8*(\num*(4+1) +\idx)] + str sAge, [input_addr, 8*(\num*(6)+ \idx)] + str sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + str sAgo, [input_addr, 8*(\num*(8)+ \idx)] + str sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + str sAka, [input_addr, 8*(\num*(10) +\idx)] + str sAke, [input_addr, 8*(\num*(10+1)+\idx)] + str sAki, [input_addr, 8*(\num*(12) +\idx)] + str sAko, [input_addr, 8*(\num*(12+1)+\idx)] + str sAku, [input_addr, 8*(\num*(14) +\idx)] + str sAma, [input_addr, 8*(\num*(14+1)+\idx)] + str sAme, [input_addr, 8*(\num*(16) +\idx)] + str sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + str sAmo, [input_addr, 8*(\num*(18) +\idx)] + str sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + str sAsa, [input_addr, 8*(\num*(20) +\idx)] + str sAse, [input_addr, 8*(\num*(20+1)+\idx)] + str sAsi, [input_addr, 8*(\num*(22) +\idx)] + str sAso, [input_addr, 8*(\num*(22+1)+\idx)] + str sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +.macro load_input_scalar num idx + ldr s_Aba, [input_addr, 8*(\num*(0) +\idx)] + ldr sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + ldr sAbi, [input_addr, 8*(\num*(2)+ \idx)] + ldr sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + ldr sAbu, [input_addr, 8*(\num*(4)+ \idx)] + ldr sAga, [input_addr, 8*(\num*(4+1) +\idx)] + ldr sAge, [input_addr, 8*(\num*(6)+ \idx)] + ldr sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + ldr sAgo, 
[input_addr, 8*(\num*(8)+ \idx)] + ldr sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + ldr sAka, [input_addr, 8*(\num*(10) +\idx)] + ldr sAke, [input_addr, 8*(\num*(10+1)+\idx)] + ldr sAki, [input_addr, 8*(\num*(12) +\idx)] + ldr sAko, [input_addr, 8*(\num*(12+1)+\idx)] + ldr sAku, [input_addr, 8*(\num*(14) +\idx)] + ldr sAma, [input_addr, 8*(\num*(14+1)+\idx)] + ldr sAme, [input_addr, 8*(\num*(16) +\idx)] + ldr sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + ldr sAmo, [input_addr, 8*(\num*(18) +\idx)] + ldr sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + ldr sAsa, [input_addr, 8*(\num*(20) +\idx)] + ldr sAse, [input_addr, 8*(\num*(20+1)+\idx)] + ldr sAsi, [input_addr, 8*(\num*(22) +\idx)] + ldr sAso, [input_addr, 8*(\num*(22+1)+\idx)] + ldr sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +#define STACK_SIZE (8*8 + 16*6 + 4*8 + 16*5) // VREGS (8*8), GPRs (16*6), count (8), const (8), input (8), padding (8), vector spill slots addressed via STACK_BASE_TMP (16*5) +#define STACK_BASE_GPRS (4*8) +#define STACK_BASE_VREGS (4*8+16*6) +#define STACK_BASE_TMP (8*8 + 16*6 + 4*8) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) +#define STACK_OFFSET_INPUT_SCALAR (3*8) + +#define vAga_offset 0 +#define vAge_offset 1 +#define vAgi_offset 2 +#define vAgo_offset 3 +#define vAgu_offset 4 + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] + + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27,
x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro save_vregs + stp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] + stp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + stp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + stp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] +.endm + +.macro restore_vregs + ldp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] + ldp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + ldp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + ldp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] +.endm + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro eor5 dst, src0, src1, src2, src3, src4 + eor \dst, \src0, \src1 + eor \dst, \dst, \src2 + eor \dst, \dst, \src3 + eor \dst, \dst, \src4 +.endm + +.macro xor_rol dst, src1, src0, imm + eor \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro bic_rol dst, src1, src0, imm + bic \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro rotate dst, src, imm + ror \dst, \src, #(64-\imm) +.endm + +.macro save reg, offset + str \reg, [sp, #\offset] +.endm + +.macro restore reg, offset + ldr \reg, [sp, #\offset] +.endm + +.macro hybrid_round_initial +eor sC0, sAma, sAsa SEP +eor sC1, sAme, sAse SEP eor3_m0 C0, vAba, vAga, vAka +eor sC2, sAmi, sAsi SEP eor3_m0 C1, vAbe, vAge, vAke +eor sC3, sAmo, sAso SEP eor3_m0 C2, vAbi, vAgi, vAki +eor sC4, sAmu, sAsu SEP eor3_m0 C3, vAbo, vAgo, vAko +eor sC0, sAka, sC0 SEP eor3_m0 C4, vAbu, vAgu, vAku +eor sC1, sAke, sC1 SEP save(vAga) +eor sC2, sAki, sC2 SEP +eor sC3, sAko, sC3 SEP vzr .req vAga +eor sC4, sAku, sC4 SEP eor vzr.16b, vzr.16b, vzr.16b +eor sC0, sAga, sC0 SEP save(vAge) +eor sC1, sAge, sC1 SEP save(vAgi) +eor sC2, sAgi, sC2 SEP save(vAgo) +eor sC3, sAgo, sC3 SEP save(vAgu) +eor sC4, sAgu, sC4 SEP +eor sC0, s_Aba, sC0 SEP C0r .req vAge +eor sC1, sAbe, sC1 SEP C1r .req vAgi +eor sC2, sAbi, sC2 SEP C2r .req vAgo +eor sC3, sAbo, sC3 SEP C3r .req vAgu +eor sC4, sAbu, sC4 SEP C4r .req v31 +eor sE1, sC0, sC2, 
ROR #63 SEP eor3_m0 C0, C0, vAma, vAsa +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP eor3_m0 C1, C1, vAme, vAse +eor sE2, sC1, sC3, ROR #63 SEP eor3_m0 C2, C2, vAmi, vAsi +eor sE4, sC3, sC0, ROR #63 SEP eor3_m0 C3, C3, vAmo, vAso +eor s_Aba_, s_Aba, sE0 SEP eor3_m0 C4, C4, vAmu, vAsu +eor sAsa_, sAbi, sE2 SEP xar_m0 C2r, vzr, C2, 63 +eor sAbi_, sAki, sE2 SEP +eor sAki_, sAko, sE3 SEP xar_m0 C4r, vzr, C4, 63 +eor sAko_, sAmu, sE4 SEP xar_m0 C1r, vzr, C1, 63 +eor sAmu_, sAso, sE3 SEP xar_m0 C3r, vzr, C3, 63 +eor sAso_, sAma, sE0 SEP xar_m0 C0r, vzr, C0, 63 +eor sAka_, sAbe, sE1 SEP eor2 E1, C0, C2r +eor sAse_, sAgo, sE3 SEP restore(vAgo) +eor sAgo_, sAme, sE1 SEP +eor sAke_, sAgi, sE2 SEP eor2 E3, C2, C4r +eor sAgi_, sAka, sE0 SEP restore(vAga) +eor sAga_, sAbo, sE3 SEP eor2 E0, C4, C1r +eor sAbo_, sAmo, sE3 SEP restore(vAgi) +eor sAmo_, sAmi, sE2 SEP eor2 E2, C1, C3r +eor sAmi_, sAke, sE1 SEP restore(vAgu) +eor sAge_, sAgu, sE4 SEP +eor sAgu_, sAsi, sE2 SEP eor2 E4, C3, C0r +eor sAsi_, sAku, sE4 SEP restore(vAge) +eor sAku_, sAsa, sE0 SEP eor vAba_.16b, vAba.16b, E0.16b +eor sAma_, sAbu, sE4 SEP xar_m0 vAsa_, vAbi, E2, 2 +eor sAbu_, sAsu, sE4 SEP xar_m0 vAbi_, vAki, E2, 21 +eor sAsu_, sAse, sE1 SEP +eor sAme_, sAga, sE0 SEP xar_m0 vAki_, vAko, E3, 39 +eor sAbe_, sAge, sE1 SEP xar_m0 vAko_, vAmu, E4, 56 +load_constant_ptr SEP xar_m0 vAmu_, vAso, E3, 8 +tmp0 .req x0 SEP xar_m0 vAso_, vAma, E0, 23 +tmp1 .req x29 SEP xar_m0 vAka_, vAbe, E1, 63 +bic tmp0, sAgi_, sAge_, ROR #47 SEP xar_m0 vAse_, vAgo, E3, 9 +bic tmp1, sAgo_, sAgi_, ROR #42 SEP +eor sAga, tmp0, sAga_, ROR #39 SEP xar_m0 vAgo_, vAme, E1, 19 +bic tmp0, sAgu_, sAgo_, ROR #16 SEP xar_m0 vAke_, vAgi, E2, 58 +eor sAge, tmp1, sAge_, ROR #25 SEP xar_m0 vAgi_, vAka, E0, 61 +bic tmp1, sAga_, sAgu_, ROR #31 SEP xar_m0 vAga_, vAbo, E3, 36 +eor sAgi, tmp0, sAgi_, ROR #58 SEP xar_m0 vAbo_, vAmo, E3, 43 +bic tmp0, sAge_, sAga_, ROR #56 SEP xar_m0 vAmo_, vAmi, E2, 49 +eor sAgo, tmp1, sAgo_, ROR #47 SEP 
+bic tmp1, sAki_, sAke_, ROR #19 SEP xar_m0 vAmi_, vAke, E1, 54 +eor sAgu, tmp0, sAgu_, ROR #23 SEP xar_m0 vAge_, vAgu, E4, 44 +bic tmp0, sAko_, sAki_, ROR #47 SEP xar_m0 vAgu_, vAsi, E2, 3 +eor sAka, tmp1, sAka_, ROR #24 SEP xar_m0 vAsi_, vAku, E4, 25 +bic tmp1, sAku_, sAko_, ROR #10 SEP xar_m0 vAku_, vAsa, E0, 46 +eor sAke, tmp0, sAke_, ROR #2 SEP +bic tmp0, sAka_, sAku_, ROR #47 SEP xar_m0 vAma_, vAbu, E4, 37 +eor sAki, tmp1, sAki_, ROR #57 SEP xar_m0 vAbu_, vAsu, E4, 50 +bic tmp1, sAke_, sAka_, ROR #5 SEP xar_m0 vAsu_, vAse, E1, 62 +eor sAko, tmp0, sAko_, ROR #57 SEP xar_m0 vAme_, vAga, E0, 28 +bic tmp0, sAmi_, sAme_, ROR #38 SEP xar_m0 vAbe_, vAge, E1, 20 +eor sAku, tmp1, sAku_, ROR #52 SEP restore x27, STACK_OFFSET_CONST +bic tmp1, sAmo_, sAmi_, ROR #5 SEP +eor sAma, tmp0, sAma_, ROR #47 SEP ldr q31, [x27], #16 +bic tmp0, sAmu_, sAmo_, ROR #41 SEP save x27, STACK_OFFSET_CONST +eor sAme, tmp1, sAme_, ROR #43 SEP bcax_m0 vAga, vAga_, vAgi_, vAge_ +bic tmp1, sAma_, sAmu_, ROR #35 SEP bcax_m0 vAge, vAge_, vAgo_, vAgi_ +eor sAmi, tmp0, sAmi_, ROR #46 SEP bcax_m0 vAgi, vAgi_, vAgu_, vAgo_ +bic tmp0, sAme_, sAma_, ROR #9 SEP bcax_m0 vAgo, vAgo_, vAga_, vAgu_ +ldr cur_const, [const_addr] SEP +eor sAmo, tmp1, sAmo_, ROR #12 SEP bcax_m0 vAgu, vAgu_, vAge_, vAga_ +bic tmp1, sAsi_, sAse_, ROR #48 SEP bcax_m0 vAka, vAka_, vAki_, vAke_ +eor sAmu, tmp0, sAmu_, ROR #44 SEP bcax_m0 vAke, vAke_, vAko_, vAki_ +bic tmp0, sAso_, sAsi_, ROR #2 SEP bcax_m0 vAki, vAki_, vAku_, vAko_ +eor sAsa, tmp1, sAsa_, ROR #41 SEP bcax_m0 vAko, vAko_, vAka_, vAku_ +bic tmp1, sAsu_, sAso_, ROR #25 SEP +eor sAse, tmp0, sAse_, ROR #50 SEP bcax_m0 vAku, vAku_, vAke_, vAka_ +bic tmp0, sAsa_, sAsu_, ROR #60 SEP bcax_m0 vAma, vAma_, vAmi_, vAme_ +eor sAsi, tmp1, sAsi_, ROR #27 SEP bcax_m0 vAme, vAme_, vAmo_, vAmi_ +bic tmp1, sAse_, sAsa_, ROR #57 SEP bcax_m0 vAmi, vAmi_, vAmu_, vAmo_ +eor sAso, tmp0, sAso_, ROR #21 SEP bcax_m0 vAmo, vAmo_, vAma_, vAmu_ +mov count, #1 SEP bcax_m0 vAmu, vAmu_, vAme_, 
vAma_ +bic tmp0, sAbi_, sAbe_, ROR #63 SEP +eor sAsu, tmp1, sAsu_, ROR #53 SEP bcax_m0 vAsa, vAsa_, vAsi_, vAse_ +bic tmp1, sAbo_, sAbi_, ROR #42 SEP bcax_m0 vAse, vAse_, vAso_, vAsi_ +eor s_Aba, s_Aba_, tmp0, ROR #21 SEP bcax_m0 vAsi, vAsi_, vAsu_, vAso_ +bic tmp0, sAbu_, sAbo_, ROR #57 SEP bcax_m0 vAso, vAso_, vAsa_, vAsu_ +eor sAbe, tmp1, sAbe_, ROR #41 SEP bcax_m0 vAsu, vAsu_, vAse_, vAsa_ +bic tmp1, s_Aba_, sAbu_, ROR #50 SEP bcax_m0 vAba, vAba_, vAbi_, vAbe_ +eor sAbi, tmp0, sAbi_, ROR #35 SEP +bic tmp0, sAbe_, s_Aba_, ROR #44 SEP bcax_m0 vAbe, vAbe_, vAbo_, vAbi_ +eor sAbo, tmp1, sAbo_, ROR #43 SEP bcax_m0 vAbi, vAbi_, vAbu_, vAbo_ +eor sAbu, tmp0, sAbu_, ROR #30 SEP bcax_m0 vAbo, vAbo_, vAba_, vAbu_ +eor s_Aba, s_Aba, cur_const SEP bcax_m0 vAbu, vAbu_, vAbe_, vAba_ +save count, STACK_OFFSET_COUNT SEP eor vAba.16b, vAba.16b, v31.16b +.endm + +.macro hybrid_round_noninitial +eor sC2, sAsi, sAbi, ROR #52 SEP +eor sC0, s_Aba, sAga, ROR #61 SEP eor3_m0 C0, vAba, vAga, vAka +eor sC4, sAku, sAgu, ROR #50 SEP eor3_m0 C1, vAbe, vAge, vAke +eor sC1, sAke, sAme, ROR #57 SEP eor3_m0 C2, vAbi, vAgi, vAki +eor sC3, sAbo, sAko, ROR #63 SEP eor3_m0 C3, vAbo, vAgo, vAko +eor sC2, sC2, sAki, ROR #48 SEP eor3_m0 C4, vAbu, vAgu, vAku +eor sC0, sC0, sAma, ROR #54 SEP +eor sC4, sC4, sAmu, ROR #34 SEP save(vAga) +eor sC1, sC1, sAbe, ROR #51 SEP vzr .req vAga +eor sC3, sC3, sAmo, ROR #37 SEP eor vzr.16b, vzr.16b, vzr.16b +eor sC2, sC2, sAmi, ROR #10 SEP save(vAge) +eor sC0, sC0, sAka, ROR #39 SEP save(vAgi) +eor sC4, sC4, sAbu, ROR #26 SEP +eor sC1, sC1, sAse, ROR #31 SEP save(vAgo) +eor sC3, sC3, sAgo, ROR #36 SEP save(vAgu) +eor sC2, sC2, sAgi, ROR #5 SEP C0r .req vAge +eor sC0, sC0, sAsa, ROR #25 SEP C1r .req vAgi +eor sC4, sC4, sAsu, ROR #15 SEP +eor sC1, sC1, sAge, ROR #27 SEP C2r .req vAgo +eor sC3, sC3, sAso, ROR #2 SEP C3r .req vAgu +eor sE1, sC0, sC2, ROR #61 SEP C4r .req v31 +ror sC2, sC2, 62 SEP eor3_m0 C0, C0, vAma, vAsa +eor sE3, sC2, sC4, ROR #57 SEP eor3_m0 C1, C1, 
vAme, vAse +ror sC4, sC4, 58 SEP +eor sE0, sC4, sC1, ROR #55 SEP eor3_m0 C2, C2, vAmi, vAsi +ror sC1, sC1, 56 SEP eor3_m0 C3, C3, vAmo, vAso +eor sE2, sC1, sC3, ROR #63 SEP eor3_m0 C4, C4, vAmu, vAsu +eor sE4, sC3, sC0, ROR #63 SEP xar_m0 C2r, vzr, C2, 63 +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP xar_m0 C4r, vzr, C4, 63 +eor sAbi_, sE2, sAki, ROR #46 SEP xar_m0 C1r, vzr, C1, 63 +eor sAki_, sE3, sAko, ROR #63 SEP xar_m0 C3r, vzr, C3, 63 +eor sAko_, sE4, sAmu, ROR #28 SEP xar_m0 C0r, vzr, C0, 63 +eor sAmu_, sE3, sAso, ROR #2 SEP eor2 E1, C0, C2r +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP restore(vAgo) +eor sAse_, sE3, sAgo, ROR #36 SEP eor2 E3, C2, C4r +eor sAgo_, sE1, sAme, ROR #49 SEP restore(vAga) +eor sAke_, sE2, sAgi, ROR #3 SEP eor2 E0, C4, C1r +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP restore(vAgi) +eor sAbo_, sE3, sAmo, ROR #37 SEP eor2 E2, C1, C3r +eor sAmo_, sE2, sAmi, ROR #8 SEP restore(vAgu) +eor sAmi_, sE1, sAke, ROR #56 SEP eor2 E4, C3, C0r +eor sAge_, sE4, sAgu, ROR #44 SEP restore(vAge) +eor sAgu_, sE2, sAsi, ROR #62 SEP +eor sAsi_, sE4, sAku, ROR #58 SEP eor vAba_.16b, vAba.16b, E0.16b +eor sAku_, sE0, sAsa, ROR #25 SEP xar_m0 vAsa_, vAbi, E2, 2 +eor sAma_, sE4, sAbu, ROR #20 SEP xar_m0 vAbi_, vAki, E2, 21 +eor sAbu_, sE4, sAsu, ROR #9 SEP xar_m0 vAki_, vAko, E3, 39 +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP xar_m0 vAko_, vAmu, E4, 56 +eor sAbe_, sE1, sAge, ROR #19 SEP xar_m0 vAmu_, vAso, E3, 8 +load_constant_ptr SEP xar_m0 vAso_, vAma, E0, 23 +restore count, STACK_OFFSET_COUNT SEP xar_m0 vAka_, vAbe, E1, 63 +tmp0 .req x0 SEP xar_m0 vAse_, vAgo, E3, 9 +tmp1 .req x29 SEP +bic tmp0, sAgi_, sAge_, ROR #47 SEP xar_m0 vAgo_, vAme, E1, 19 +bic tmp1, sAgo_, sAgi_, ROR #42 SEP xar_m0 vAke_, vAgi, E2, 58 +eor sAga, tmp0, sAga_, ROR #39 SEP xar_m0 vAgi_, vAka, E0, 61 +bic tmp0, sAgu_, sAgo_, ROR #16 SEP xar_m0 vAga_, vAbo, E3, 36 +eor sAge, tmp1, sAge_, ROR #25 SEP 
xar_m0 vAbo_, vAmo, E3, 43 +bic tmp1, sAga_, sAgu_, ROR #31 SEP +eor sAgi, tmp0, sAgi_, ROR #58 SEP xar_m0 vAmo_, vAmi, E2, 49 +bic tmp0, sAge_, sAga_, ROR #56 SEP xar_m0 vAmi_, vAke, E1, 54 +eor sAgo, tmp1, sAgo_, ROR #47 SEP xar_m0 vAge_, vAgu, E4, 44 +bic tmp1, sAki_, sAke_, ROR #19 SEP xar_m0 vAgu_, vAsi, E2, 3 +eor sAgu, tmp0, sAgu_, ROR #23 SEP +bic tmp0, sAko_, sAki_, ROR #47 SEP xar_m0 vAsi_, vAku, E4, 25 +eor sAka, tmp1, sAka_, ROR #24 SEP xar_m0 vAku_, vAsa, E0, 46 +bic tmp1, sAku_, sAko_, ROR #10 SEP xar_m0 vAma_, vAbu, E4, 37 +eor sAke, tmp0, sAke_, ROR #2 SEP xar_m0 vAbu_, vAsu, E4, 50 +bic tmp0, sAka_, sAku_, ROR #47 SEP xar_m0 vAsu_, vAse, E1, 62 +eor sAki, tmp1, sAki_, ROR #57 SEP +bic tmp1, sAke_, sAka_, ROR #5 SEP xar_m0 vAme_, vAga, E0, 28 +eor sAko, tmp0, sAko_, ROR #57 SEP xar_m0 vAbe_, vAge, E1, 20 +bic tmp0, sAmi_, sAme_, ROR #38 SEP +eor sAku, tmp1, sAku_, ROR #52 SEP +bic tmp1, sAmo_, sAmi_, ROR #5 SEP +eor sAma, tmp0, sAma_, ROR #47 SEP +bic tmp0, sAmu_, sAmo_, ROR #41 SEP bcax_m0 vAga, vAga_, vAgi_, vAge_ +eor sAme, tmp1, sAme_, ROR #43 SEP bcax_m0 vAge, vAge_, vAgo_, vAgi_ +bic tmp1, sAma_, sAmu_, ROR #35 SEP bcax_m0 vAgi, vAgi_, vAgu_, vAgo_ +eor sAmi, tmp0, sAmi_, ROR #46 SEP bcax_m0 vAgo, vAgo_, vAga_, vAgu_ +bic tmp0, sAme_, sAma_, ROR #9 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP bcax_m0 vAgu, vAgu_, vAge_, vAga_ +eor sAmo, tmp1, sAmo_, ROR #12 SEP bcax_m0 vAka, vAka_, vAki_, vAke_ +bic tmp1, sAsi_, sAse_, ROR #48 SEP bcax_m0 vAke, vAke_, vAko_, vAki_ +eor sAmu, tmp0, sAmu_, ROR #44 SEP bcax_m0 vAki, vAki_, vAku_, vAko_ +bic tmp0, sAso_, sAsi_, ROR #2 SEP +eor sAsa, tmp1, sAsa_, ROR #41 SEP bcax_m0 vAko, vAko_, vAka_, vAku_ +bic tmp1, sAsu_, sAso_, ROR #25 SEP bcax_m0 vAku, vAku_, vAke_, vAka_ +eor sAse, tmp0, sAse_, ROR #50 SEP bcax_m0 vAma, vAma_, vAmi_, vAme_ +bic tmp0, sAsa_, sAsu_, ROR #60 SEP bcax_m0 vAme, vAme_, vAmo_, vAmi_ +eor sAsi, tmp1, sAsi_, ROR #27 SEP bcax_m0 vAmi, vAmi_, vAmu_, vAmo_ +bic tmp1, sAse_, 
sAsa_, ROR #57 SEP +eor sAso, tmp0, sAso_, ROR #21 SEP bcax_m0 vAmo, vAmo_, vAma_, vAmu_ +bic tmp0, sAbi_, sAbe_, ROR #63 SEP bcax_m0 vAmu, vAmu_, vAme_, vAma_ +add count, count, #1 SEP bcax_m0 vAsa, vAsa_, vAsi_, vAse_ +save count, STACK_OFFSET_COUNT SEP bcax_m0 vAse, vAse_, vAso_, vAsi_ +//TODO: schedule this better SEP +restore x27, STACK_OFFSET_CONST SEP +ldr q31, [x27], #16 SEP +save x27, STACK_OFFSET_CONST SEP +eor sAsu, tmp1, sAsu_, ROR #53 SEP +bic tmp1, sAbo_, sAbi_, ROR #42 SEP bcax_m0 vAsi, vAsi_, vAsu_, vAso_ +eor s_Aba, s_Aba_, tmp0, ROR #21 SEP bcax_m0 vAso, vAso_, vAsa_, vAsu_ +bic tmp0, sAbu_, sAbo_, ROR #57 SEP bcax_m0 vAsu, vAsu_, vAse_, vAsa_ +eor sAbe, tmp1, sAbe_, ROR #41 SEP bcax_m0 vAba, vAba_, vAbi_, vAbe_ +bic tmp1, s_Aba_, sAbu_, ROR #50 SEP bcax_m0 vAbe, vAbe_, vAbo_, vAbi_ +eor sAbi, tmp0, sAbi_, ROR #35 SEP +bic tmp0, sAbe_, s_Aba_, ROR #44 SEP bcax_m0 vAbi, vAbi_, vAbu_, vAbo_ +eor sAbo, tmp1, sAbo_, ROR #43 SEP bcax_m0 vAbo, vAbo_, vAba_, vAbu_ +eor sAbu, tmp0, sAbu_, ROR #30 SEP bcax_m0 vAbu, vAbu_, vAbe_, vAba_ +eor s_Aba, s_Aba, cur_const SEP eor vAba.16b, vAba.16b, v31.16b +.endm + + +.macro hybrid_round_final +eor sC2, sAsi, sAbi, ROR #52 SEP +eor sC0, s_Aba, sAga, ROR #61 SEP eor3_m0 C0, vAba, vAga, vAka +eor sC4, sAku, sAgu, ROR #50 SEP eor3_m0 C1, vAbe, vAge, vAke +eor sC1, sAke, sAme, ROR #57 SEP eor3_m0 C2, vAbi, vAgi, vAki +eor sC3, sAbo, sAko, ROR #63 SEP +eor sC2, sC2, sAki, ROR #48 SEP eor3_m0 C3, vAbo, vAgo, vAko +eor sC0, sC0, sAma, ROR #54 SEP eor3_m0 C4, vAbu, vAgu, vAku +eor sC4, sC4, sAmu, ROR #34 SEP +eor sC1, sC1, sAbe, ROR #51 SEP save(vAga) +eor sC3, sC3, sAmo, ROR #37 SEP vzr .req vAga +eor sC2, sC2, sAmi, ROR #10 SEP +eor sC0, sC0, sAka, ROR #39 SEP eor vzr.16b, vzr.16b, vzr.16b +eor sC4, sC4, sAbu, ROR #26 SEP save(vAge) +eor sC1, sC1, sAse, ROR #31 SEP +eor sC3, sC3, sAgo, ROR #36 SEP save(vAgi) +eor sC2, sC2, sAgi, ROR #5 SEP save(vAgo) +eor sC0, sC0, sAsa, ROR #25 SEP +eor sC4, sC4, sAsu, ROR #15 SEP 
save(vAgu) +eor sC1, sC1, sAge, ROR #27 SEP C0r .req vAge +eor sC3, sC3, sAso, ROR #2 SEP +eor sE1, sC0, sC2, ROR #61 SEP C1r .req vAgi +ror sC2, sC2, 62 SEP C2r .req vAgo +eor sE3, sC2, sC4, ROR #57 SEP +ror sC4, sC4, 58 SEP C3r .req vAgu +eor sE0, sC4, sC1, ROR #55 SEP C4r .req v31 +ror sC1, sC1, 56 SEP +eor sE2, sC1, sC3, ROR #63 SEP eor3_m0 C0, C0, vAma, vAsa +eor sE4, sC3, sC0, ROR #63 SEP eor3_m0 C1, C1, vAme, vAse +eor s_Aba_, sE0, s_Aba SEP eor3_m0 C2, C2, vAmi, vAsi +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP eor3_m0 C3, C3, vAmo, vAso +eor sAki_, sE3, sAko, ROR #63 SEP eor3_m0 C4, C4, vAmu, vAsu +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP xar_m0 C2r, vzr, C2, 63 +eor sAso_, sE0, sAma, ROR #54 SEP xar_m0 C4r, vzr, C4, 63 +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP xar_m0 C1r, vzr, C1, 63 +eor sAgo_, sE1, sAme, ROR #49 SEP xar_m0 C3r, vzr, C3, 63 +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP xar_m0 C0r, vzr, C0, 63 +eor sAga_, sE3, sAbo SEP eor2 E1, C0, C2r +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP restore(vAgo) +eor sAmi_, sE1, sAke, ROR #56 SEP eor2 E3, C2, C4r +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP restore(vAga) +eor sAsi_, sE4, sAku, ROR #58 SEP eor2 E0, C4, C1r +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP restore(vAgi) +eor sAbu_, sE4, sAsu, ROR #9 SEP eor2 E2, C1, C3r +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP restore(vAgu) +eor sAbe_, sE1, sAge, ROR #19 SEP eor2 E4, C3, C0r +load_constant_ptr SEP +tmp0 .req x0 SEP restore(vAge) +tmp1 .req x29 SEP eor vAba_.16b, vAba.16b, E0.16b +bic tmp0, sAgi_, sAge_, ROR #47 SEP xar_m0 vAsa_, vAbi, E2, 2 +bic tmp1, sAgo_, sAgi_, ROR #42 SEP +eor sAga, tmp0, sAga_, ROR #39 SEP xar_m0 vAbi_, vAki, E2, 21 +bic tmp0, sAgu_, sAgo_, ROR #16 SEP xar_m0 vAki_, vAko, E3, 39 +eor sAge, tmp1, sAge_, ROR #25 SEP +bic 
tmp1, sAga_, sAgu_, ROR #31 SEP xar_m0 vAko_, vAmu, E4, 56 +restore count, STACK_OFFSET_COUNT SEP xar_m0 vAmu_, vAso, E3, 8 +eor sAgi, tmp0, sAgi_, ROR #58 SEP +bic tmp0, sAge_, sAga_, ROR #56 SEP xar_m0 vAso_, vAma, E0, 23 +eor sAgo, tmp1, sAgo_, ROR #47 SEP xar_m0 vAka_, vAbe, E1, 63 +bic tmp1, sAki_, sAke_, ROR #19 SEP +eor sAgu, tmp0, sAgu_, ROR #23 SEP xar_m0 vAse_, vAgo, E3, 9 +bic tmp0, sAko_, sAki_, ROR #47 SEP xar_m0 vAgo_, vAme, E1, 19 +eor sAka, tmp1, sAka_, ROR #24 SEP +bic tmp1, sAku_, sAko_, ROR #10 SEP xar_m0 vAke_, vAgi, E2, 58 +eor sAke, tmp0, sAke_, ROR #2 SEP xar_m0 vAgi_, vAka, E0, 61 +bic tmp0, sAka_, sAku_, ROR #47 SEP +eor sAki, tmp1, sAki_, ROR #57 SEP xar_m0 vAga_, vAbo, E3, 36 +bic tmp1, sAke_, sAka_, ROR #5 SEP xar_m0 vAbo_, vAmo, E3, 43 +eor sAko, tmp0, sAko_, ROR #57 SEP +bic tmp0, sAmi_, sAme_, ROR #38 SEP xar_m0 vAmo_, vAmi, E2, 49 +eor sAku, tmp1, sAku_, ROR #52 SEP xar_m0 vAmi_, vAke, E1, 54 +bic tmp1, sAmo_, sAmi_, ROR #5 SEP +eor sAma, tmp0, sAma_, ROR #47 SEP xar_m0 vAge_, vAgu, E4, 44 +bic tmp0, sAmu_, sAmo_, ROR #41 SEP xar_m0 vAgu_, vAsi, E2, 3 +eor sAme, tmp1, sAme_, ROR #43 SEP xar_m0 vAsi_, vAku, E4, 25 +bic tmp1, sAma_, sAmu_, ROR #35 SEP +eor sAmi, tmp0, sAmi_, ROR #46 SEP xar_m0 vAku_, vAsa, E0, 46 +bic tmp0, sAme_, sAma_, ROR #9 SEP xar_m0 vAma_, vAbu, E4, 37 +ldr cur_const, [const_addr, count, UXTW #3] SEP +eor sAmo, tmp1, sAmo_, ROR #12 SEP xar_m0 vAbu_, vAsu, E4, 50 +bic tmp1, sAsi_, sAse_, ROR #48 SEP xar_m0 vAsu_, vAse, E1, 62 +eor sAmu, tmp0, sAmu_, ROR #44 SEP +bic tmp0, sAso_, sAsi_, ROR #2 SEP xar_m0 vAme_, vAga, E0, 28 +eor sAsa, tmp1, sAsa_, ROR #41 SEP xar_m0 vAbe_, vAge, E1, 20 +bic tmp1, sAsu_, sAso_, ROR #25 SEP +eor sAse, tmp0, sAse_, ROR #50 SEP restore x27, STACK_OFFSET_CONST +bic tmp0, sAsa_, sAsu_, ROR #60 SEP ldr q31, [x27], #16 +eor sAsi, tmp1, sAsi_, ROR #27 SEP +bic tmp1, sAse_, sAsa_, ROR #57 SEP save x27, STACK_OFFSET_CONST +eor sAso, tmp0, sAso_, ROR #21 SEP bcax_m0 vAga, vAga_, vAgi_, vAge_ 
+bic tmp0, sAbi_, sAbe_, ROR #63 SEP +add count, count, #1 SEP bcax_m0 vAge, vAge_, vAgo_, vAgi_ +save count, STACK_OFFSET_COUNT SEP bcax_m0 vAgi, vAgi_, vAgu_, vAgo_ +eor sAsu, tmp1, sAsu_, ROR #53 SEP +bic tmp1, sAbo_, sAbi_, ROR #42 SEP bcax_m0 vAgo, vAgo_, vAga_, vAgu_ +eor s_Aba, s_Aba_, tmp0, ROR #21 SEP bcax_m0 vAgu, vAgu_, vAge_, vAga_ +bic tmp0, sAbu_, sAbo_, ROR #57 SEP +eor sAbe, tmp1, sAbe_, ROR #41 SEP bcax_m0 vAka, vAka_, vAki_, vAke_ +bic tmp1, s_Aba_, sAbu_, ROR #50 SEP bcax_m0 vAke, vAke_, vAko_, vAki_ +eor sAbi, tmp0, sAbi_, ROR #35 SEP +bic tmp0, sAbe_, s_Aba_, ROR #44 SEP bcax_m0 vAki, vAki_, vAku_, vAko_ +eor sAbo, tmp1, sAbo_, ROR #43 SEP bcax_m0 vAko, vAko_, vAka_, vAku_ +eor sAbu, tmp0, sAbu_, ROR #30 SEP bcax_m0 vAku, vAku_, vAke_, vAka_ +eor s_Aba, s_Aba, cur_const SEP +ror sAga, sAga,(64-3) SEP bcax_m0 vAma, vAma_, vAmi_, vAme_ +ror sAbu, sAbu,(64-44) SEP bcax_m0 vAme, vAme_, vAmo_, vAmi_ +ror sAka, sAka,(64-25) SEP +ror sAke, sAke,(64-8) SEP bcax_m0 vAmi, vAmi_, vAmu_, vAmo_ +ror sAma, sAma,(64-10) SEP bcax_m0 vAmo, vAmo_, vAma_, vAmu_ +ror sAku, sAku,(64-6) SEP +ror sAsa, sAsa,(64-39) SEP bcax_m0 vAmu, vAmu_, vAme_, vAma_ +ror sAse, sAse,(64-41) SEP bcax_m0 vAsa, vAsa_, vAsi_, vAse_ +ror sAbe, sAbe,(64-21) SEP +ror sAge, sAge,(64-45) SEP bcax_m0 vAse, vAse_, vAso_, vAsi_ +ror sAgi, sAgi,(64-61) SEP bcax_m0 vAsi, vAsi_, vAsu_, vAso_ +ror sAme, sAme,(64-15) SEP +ror sAmi, sAmi,(64-56) SEP bcax_m0 vAso, vAso_, vAsa_, vAsu_ +ror sAbi, sAbi,(64-14) SEP bcax_m0 vAsu, vAsu_, vAse_, vAsa_ +ror sAki, sAki,(64-18) SEP +ror sAko, sAko,(64-1) SEP bcax_m0 vAba, vAba_, vAbi_, vAbe_ +ror sAsi, sAsi,(64-2) SEP bcax_m0 vAbe, vAbe_, vAbo_, vAbi_ +ror sAso, sAso,(64-62) SEP +ror sAgo, sAgo,(64-28) SEP bcax_m0 vAbi, vAbi_, vAbu_, vAbo_ +ror sAgu, sAgu,(64-20) SEP bcax_m0 vAbo, vAbo_, vAba_, vAbu_ +ror sAmo, sAmo,(64-27) SEP +ror sAmu, sAmu,(64-36) SEP bcax_m0 vAbu, vAbu_, vAbe_, vAba_ +ror sAsu, sAsu,(64-55) SEP eor vAba.16b, vAba.16b, v31.16b +.endm + + + 
+#define KECCAK_F1600_ROUNDS 24 + +.global keccak_f1600_x3_hybrid_asm_v7 +.global _keccak_f1600_x3_hybrid_asm_v7 +.text +.align 4 +/* Entry point, exported with and without a leading underscore. input_addr is the base of the state buffer: the vector state is loaded from offset 0 and the scalar state from offset 400, one initial round, a loop of non-initial rounds and one final round are run, and both states are written back to the same offsets. */ +keccak_f1600_x3_hybrid_asm_v7: +_keccak_f1600_x3_hybrid_asm_v7: + alloc_stack + save_gprs + save_vregs + save input_addr, STACK_OFFSET_INPUT +/* input_addr is spilled here and reloaded before the stores at the end. */ + + ASM_LOAD(const_addr,round_constants_vec) +/* const_addr (set up by ASM_LOAD, presumably the address of round_constants_vec) is spilled as well; the round macros reload it from STACK_OFFSET_CONST. */ + save const_addr, STACK_OFFSET_CONST + load_input_vector 1,0 +/* The scalar state lives 400 bytes after the vector state. */ + add input_addr, input_addr, #400 + load_input_scalar 1,0 + hybrid_round_initial + loop_0: + hybrid_round_noninitial + restore count, STACK_OFFSET_COUNT + cmp count, #(KECCAK_F1600_ROUNDS-2) + ble loop_0 +/* count is 1 after the initial round and each non-initial round adds 1, so the loop body runs with count = 1..22 and the last round is left to hybrid_round_final. */ + hybrid_round_final +/* Reload the original base address before writing the results back. */ + restore input_addr, STACK_OFFSET_INPUT + store_input_vector 1,0 + add input_addr, input_addr, #400 + store_input_scalar 1,0 + + restore_vregs + restore_gprs + free_stack + + + ret +#endif \ No newline at end of file diff --git a/tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v1.s b/tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v1.s new file mode 100644 index 0000000..ae453d6 --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v1.s @@ -0,0 +1,1142 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +#if defined(__ARM_FEATURE_SHA3) + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: /* 24 entries of 64 bits, one per round; the scalar code indexes this table with count scaled by 8 */ + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ +/* These four names share registers with round temporaries declared further down: x0 = s_Aba_/sC0, x29 = sE0/sC4, x27 = sC2/sE3, x26 = sC1/sE2. */ + input_addr .req x0 + const_addr .req x29 + count .req w27 + cur_const .req x26 + + /* Mapping of Keccak-f1600 SIMD state to vector registers + * at the beginning and end of each round. 
*/ + + vAba .req v0 + vAbe .req v1 + vAbi .req v2 + vAbo .req v3 + vAbu .req v4 + vAga .req v5 + vAge .req v6 + vAgi .req v7 + vAgo .req v8 + vAgu .req v9 + vAka .req v10 + vAke .req v11 + vAki .req v12 + vAko .req v13 + vAku .req v14 + vAma .req v15 + vAme .req v16 + vAmi .req v17 + vAmo .req v18 + vAmu .req v19 + vAsa .req v20 + vAse .req v21 + vAsi .req v22 + vAso .req v23 + vAsu .req v24 + + /* q-form of the above mapping */ + vAbaq .req q0 + vAbeq .req q1 + vAbiq .req q2 + vAboq .req q3 + vAbuq .req q4 + vAgaq .req q5 + vAgeq .req q6 + vAgiq .req q7 + vAgoq .req q8 + vAguq .req q9 + vAkaq .req q10 + vAkeq .req q11 + vAkiq .req q12 + vAkoq .req q13 + vAkuq .req q14 + vAmaq .req q15 + vAmeq .req q16 + vAmiq .req q17 + vAmoq .req q18 + vAmuq .req q19 + vAsaq .req q20 + vAseq .req q21 + vAsiq .req q22 + vAsoq .req q23 + vAsuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v30 + C1 .req v29 + C2 .req v28 + C3 .req v27 + C4 .req v26 + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req v26 + E1 .req v25 + E2 .req v29 + E3 .req v28 + E4 .req v27 + /* E0, E2, E3 and E4 reuse the registers of C4, C1, C2 and C3; only E1 (v25) has a register of its own. */ + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vAbi_ .req v2 + vAbo_ .req v3 + vAbu_ .req v4 + vAga_ .req v10 + vAge_ .req v11 + vAgi_ .req v7 + vAgo_ .req v8 + vAgu_ .req v9 + vAka_ .req v15 + vAke_ .req v16 + vAki_ .req v12 + vAko_ .req v13 + vAku_ .req v14 + vAma_ .req v20 + vAme_ .req v21 + vAmi_ .req v17 + vAmo_ .req v18 + vAmu_ .req v19 + vAsa_ .req v0 + vAse_ .req v1 + vAsi_ .req v22 + vAso_ .req v23 + vAsu_ .req v24 + vAba_ .req v30 + vAbe_ .req v27 + /* vAba_ and vAbe_ live in v30 and v27, the registers also named C0 and C3/E4. */ + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round. 
*/ + s_Aba .req x1 + sAbe .req x6 + sAbi .req x11 + sAbo .req x16 + sAbu .req x21 + sAga .req x2 + sAge .req x7 + sAgi .req x12 + sAgo .req x17 + sAgu .req x22 + sAka .req x3 + sAke .req x8 + sAki .req x13 + sAko .req x18 + sAku .req x23 + sAma .req x4 + sAme .req x9 + sAmi .req x14 + sAmo .req x19 + sAmu .req x24 + sAsa .req x5 + sAse .req x10 + sAsi .req x15 + sAso .req x20 + sAsu .req x25 + + /* sA_[y,2*x+3*y] = rot(A[x,y]) */ + s_Aba_ .req x0 + sAbe_ .req x28 + sAbi_ .req x11 + sAbo_ .req x16 + sAbu_ .req x21 + sAga_ .req x3 + sAge_ .req x8 + sAgi_ .req x12 + sAgo_ .req x17 + sAgu_ .req x22 + sAka_ .req x4 + sAke_ .req x9 + sAki_ .req x13 + sAko_ .req x18 + sAku_ .req x23 + sAma_ .req x5 + sAme_ .req x10 + sAmi_ .req x14 + sAmo_ .req x19 + sAmu_ .req x24 + sAsa_ .req x1 + sAse_ .req x6 + sAsi_ .req x15 + sAso_ .req x20 + sAsu_ .req x25 + + /* sC[x] = sA[x,0] xor sA[x,1] xor sA[x,2] xor sA[x,3] xor sA[x,4], for x in 0..4 */ + /* sE[x] = sC[x-1] xor rot(C[x+1],1), for x in 0..4 */ + sC0 .req x0 + sE0 .req x29 + sC1 .req x26 + sE1 .req x30 + sC2 .req x27 + sE2 .req x26 + sC3 .req x28 + sE3 .req x27 + sC4 .req x29 + sE4 .req x28 + /* tmp is the scalar scratch register; it shares x30 with sE1. */ + tmp .req x30 + +/************************ MACROS ****************************/ + +/* Macros using v8.4-A SHA-3 instructions */ +/* Each wrapper takes bare register names and appends the .16b or .2d arrangement suffix itself. */ +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm + xar \d\().2d, \s0\().2d, \s1\().2d, #\imm +.endm + +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm +/* Load the 25 vector state registers from input_addr; the i-th register in declaration order is read from byte offset 16*(num*i+idx). 
*/ +.macro load_input_vector num idx + ldr vAbaq, [input_addr, #(16*(\num*0+\idx))] + ldr vAbeq, [input_addr, #(16*(\num*1+\idx))] + ldr vAbiq, [input_addr, #(16*(\num*2+\idx))] + ldr vAboq, [input_addr, #(16*(\num*3+\idx))] + ldr vAbuq, [input_addr, #(16*(\num*4+\idx))] + ldr vAgaq, [input_addr, #(16*(\num*5+\idx))] + ldr vAgeq, [input_addr, #(16*(\num*6+\idx))] + ldr vAgiq, [input_addr, 
#(16*(\num*7+\idx))] + ldr vAgoq, [input_addr, #(16*(\num*8+\idx))] + ldr vAguq, [input_addr, #(16*(\num*9+\idx))] + ldr vAkaq, [input_addr, #(16*(\num*10+\idx))] + ldr vAkeq, [input_addr, #(16*(\num*11+\idx))] + ldr vAkiq, [input_addr, #(16*(\num*12+\idx))] + ldr vAkoq, [input_addr, #(16*(\num*13+\idx))] + ldr vAkuq, [input_addr, #(16*(\num*14+\idx))] + ldr vAmaq, [input_addr, #(16*(\num*15+\idx))] + ldr vAmeq, [input_addr, #(16*(\num*16+\idx))] + ldr vAmiq, [input_addr, #(16*(\num*17+\idx))] + ldr vAmoq, [input_addr, #(16*(\num*18+\idx))] + ldr vAmuq, [input_addr, #(16*(\num*19+\idx))] + ldr vAsaq, [input_addr, #(16*(\num*20+\idx))] + ldr vAseq, [input_addr, #(16*(\num*21+\idx))] + ldr vAsiq, [input_addr, #(16*(\num*22+\idx))] + ldr vAsoq, [input_addr, #(16*(\num*23+\idx))] + ldr vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm +/* Store the 25 vector state registers to input_addr, using the same registers and byte offsets as load_input_vector. */ +.macro store_input_vector num idx + str vAbaq, [input_addr, #(16*(\num*0+\idx))] + str vAbeq, [input_addr, #(16*(\num*1+\idx))] + str vAbiq, [input_addr, #(16*(\num*2+\idx))] + str vAboq, [input_addr, #(16*(\num*3+\idx))] + str vAbuq, [input_addr, #(16*(\num*4+\idx))] + str vAgaq, [input_addr, #(16*(\num*5+\idx))] + str vAgeq, [input_addr, #(16*(\num*6+\idx))] + str vAgiq, [input_addr, #(16*(\num*7+\idx))] + str vAgoq, [input_addr, #(16*(\num*8+\idx))] + str vAguq, [input_addr, #(16*(\num*9+\idx))] + str vAkaq, [input_addr, #(16*(\num*10+\idx))] + str vAkeq, [input_addr, #(16*(\num*11+\idx))] + str vAkiq, [input_addr, #(16*(\num*12+\idx))] + str vAkoq, [input_addr, #(16*(\num*13+\idx))] + str vAkuq, [input_addr, #(16*(\num*14+\idx))] + str vAmaq, [input_addr, #(16*(\num*15+\idx))] + str vAmeq, [input_addr, #(16*(\num*16+\idx))] + str vAmiq, [input_addr, #(16*(\num*17+\idx))] + str vAmoq, [input_addr, #(16*(\num*18+\idx))] + str vAmuq, [input_addr, #(16*(\num*19+\idx))] + str vAsaq, [input_addr, #(16*(\num*20+\idx))] + str vAseq, [input_addr, #(16*(\num*21+\idx))] + str vAsiq, [input_addr, #(16*(\num*22+\idx))] + str vAsoq, [input_addr, 
#(16*(\num*23+\idx))] + str vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm +/* Store the 25 scalar state registers to input_addr; the i-th register in declaration order goes to byte offset 8*(num*i+idx). */ +.macro store_input_scalar num idx + str s_Aba, [input_addr, 8*(\num*(0) +\idx)] + str sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + str sAbi, [input_addr, 8*(\num*(2)+ \idx)] + str sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + str sAbu, [input_addr, 8*(\num*(4)+ \idx)] + str sAga, [input_addr, 8*(\num*(4+1) +\idx)] + str sAge, [input_addr, 8*(\num*(6)+ \idx)] + str sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + str sAgo, [input_addr, 8*(\num*(8)+ \idx)] + str sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + str sAka, [input_addr, 8*(\num*(10) +\idx)] + str sAke, [input_addr, 8*(\num*(10+1)+\idx)] + str sAki, [input_addr, 8*(\num*(12) +\idx)] + str sAko, [input_addr, 8*(\num*(12+1)+\idx)] + str sAku, [input_addr, 8*(\num*(14) +\idx)] + str sAma, [input_addr, 8*(\num*(14+1)+\idx)] + str sAme, [input_addr, 8*(\num*(16) +\idx)] + str sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + str sAmo, [input_addr, 8*(\num*(18) +\idx)] + str sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + str sAsa, [input_addr, 8*(\num*(20) +\idx)] + str sAse, [input_addr, 8*(\num*(20+1)+\idx)] + str sAsi, [input_addr, 8*(\num*(22) +\idx)] + str sAso, [input_addr, 8*(\num*(22+1)+\idx)] + str sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm +/* Load the 25 scalar state registers from input_addr, using the same byte offsets as store_input_scalar. */ +.macro load_input_scalar num idx + ldr s_Aba, [input_addr, 8*(\num*(0) +\idx)] + ldr sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + ldr sAbi, [input_addr, 8*(\num*(2)+ \idx)] + ldr sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + ldr sAbu, [input_addr, 8*(\num*(4)+ \idx)] + ldr sAga, [input_addr, 8*(\num*(4+1) +\idx)] + ldr sAge, [input_addr, 8*(\num*(6)+ \idx)] + ldr sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + ldr sAgo, [input_addr, 8*(\num*(8)+ \idx)] + ldr sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + ldr sAka, [input_addr, 8*(\num*(10) +\idx)] + ldr sAke, [input_addr, 8*(\num*(10+1)+\idx)] + ldr sAki, [input_addr, 8*(\num*(12) +\idx)] + ldr sAko, [input_addr, 8*(\num*(12+1)+\idx)] + ldr sAku, [input_addr, 8*(\num*(14) 
+\idx)] + ldr sAma, [input_addr, 8*(\num*(14+1)+\idx)] + ldr sAme, [input_addr, 8*(\num*(16) +\idx)] + ldr sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + ldr sAmo, [input_addr, 8*(\num*(18) +\idx)] + ldr sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + ldr sAsa, [input_addr, 8*(\num*(20) +\idx)] + ldr sAse, [input_addr, 8*(\num*(20+1)+\idx)] + ldr sAsi, [input_addr, 8*(\num*(22) +\idx)] + ldr sAso, [input_addr, 8*(\num*(22+1)+\idx)] + ldr sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm +/* Stack frame in bytes from sp: input at 0, const at 8, count at 16, padding at 24, GPRs at 32, vector registers at 128; 192 bytes in total. */ +#define STACK_SIZE (8*8 + 16*6 + 3*8 + 8) // VREGS (8*8), GPRs (16*6), count (8), const (8), input (8), padding (8) +#define STACK_BASE_GPRS (3*8+8) +#define STACK_BASE_VREGS (3*8+8+16*6) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) +/* Spill x19-x30 as six register pairs starting at STACK_BASE_GPRS. */ +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm +/* Reload x19-x30 from the slots written by save_gprs. */ +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm +/* Spill d8-d15 (the 64-bit views of v8-v15) starting at STACK_BASE_VREGS. */ +.macro save_vregs + stp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] + stp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + stp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + stp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] +.endm +/* Reload d8-d15 from the slots written by save_vregs. */ +.macro restore_vregs + ldp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] + ldp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + ldp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + ldp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] +.endm +/* Reserve the STACK_SIZE-byte frame. */ +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm +/* Release the frame reserved by alloc_stack. */ +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm +/* dst = src0 xor src1 xor src2 xor src3 xor src4, accumulated in dst. */ +.macro eor5 dst, src0, src1, src2, src3, src4 + eor 
\dst, \src0, \src1 + eor \dst, \dst, \src2 + eor \dst, \dst, \src3 + eor \dst, \dst, \src4 +.endm +/* dst = src0 xor (src1 rotated left by imm), written as ROR by 64-imm; assumes 64-bit registers. */ +.macro xor_rol dst, src1, src0, imm + eor \dst, \src0, \src1, ROR #(64-\imm) +.endm +/* dst = src0 and-not (src1 rotated left by imm), written as ROR by 64-imm; assumes 64-bit registers. */ +.macro bic_rol dst, src1, src0, imm + bic \dst, \src0, \src1, ROR #(64-\imm) +.endm +/* dst = src rotated left by imm, written as ROR by 64-imm; assumes 64-bit registers. */ +.macro rotate dst, src, imm + ror \dst, \src, #(64-\imm) +.endm +/* Store reg to the stack slot at byte offset `offset` from sp. */ +.macro save reg, offset + str \reg, [sp, #\offset] +.endm +/* Load reg from the stack slot at byte offset `offset` from sp. */ +.macro restore reg, offset + ldr \reg, [sp, #\offset] +.endm + +.macro hybrid_round_initial + + eor sC0, sAma, sAsa SEP + eor sC1, sAme, sAse SEP + eor sC2, sAmi, sAsi SEP + eor sC3, sAmo, sAso SEP + eor sC4, sAmu, sAsu SEP + eor sC0, sAka, sC0 SEP + eor sC1, sAke, sC1 SEP + eor sC2, sAki, sC2 SEP + eor sC3, sAko, sC3 SEP + eor sC4, sAku, sC4 SEP + eor sC0, sAga, sC0 SEP + eor sC1, sAge, sC1 SEP + eor sC2, sAgi, sC2 SEP + eor sC3, sAgo, sC3 SEP + eor sC4, sAgu, sC4 SEP + eor sC0, s_Aba, sC0 SEP + eor sC1, sAbe, sC1 SEP + eor sC2, sAbi, sC2 SEP + eor sC3, sAbo, sC3 SEP + eor sC4, sAbu, sC4 SEP + SEP + eor sE1, sC0, sC2, ROR #63 SEP + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP + eor sE4, sC3, sC0, ROR #63 SEP + SEP + eor s_Aba_, s_Aba, sE0 SEP + eor sAsa_, sAbi, sE2 SEP + eor sAbi_, sAki, sE2 SEP + eor sAki_, sAko, sE3 SEP + eor sAko_, sAmu, sE4 SEP + eor sAmu_, sAso, sE3 SEP + eor sAso_, sAma, sE0 SEP + eor sAka_, sAbe, sE1 SEP + eor sAse_, sAgo, sE3 SEP + eor sAgo_, sAme, sE1 SEP + eor sAke_, sAgi, sE2 SEP + eor sAgi_, sAka, sE0 SEP + eor sAga_, sAbo, sE3 SEP + eor sAbo_, sAmo, sE3 SEP + eor sAmo_, sAmi, sE2 SEP + eor sAmi_, sAke, sE1 SEP + eor sAge_, sAgu, sE4 SEP + eor sAgu_, sAsi, sE2 SEP + eor sAsi_, sAku, sE4 SEP + eor sAku_, sAsa, sE0 SEP + eor sAma_, sAbu, sE4 SEP + eor sAbu_, sAsu, sE4 SEP + eor sAsu_, sAse, sE1 SEP + eor sAme_, sAga, sE0 SEP + eor sAbe_, sAge, sE1 SEP + SEP + load_constant_ptr SEP + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP + bic tmp, sAgo_, sAgi_, ROR #42 
SEP + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP + SEP + ldr cur_const, [const_addr] SEP + mov count, #1 SEP + SEP + bic tmp, sAma_, sAmu_, ROR #35 SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP + SEP + save count, STACK_OFFSET_COUNT SEP + SEP + eor sC0, sAka, sAsa, ROR #50 SEP + eor sC1, sAse, sAge, ROR #60 SEP + eor sC2, sAmi, sAgi, ROR #59 SEP + eor sC3, sAgo, 
sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP + eor sC0, sAma, sC0, ROR #49 SEP + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP + eor sC0, sAga, sC0, ROR #57 SEP + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor sC3, sAko, sC3, ROR #38 SEP + eor sC4, sAgu, sC4, ROR #48 SEP + eor sC0, s_Aba, sC0, ROR #61 SEP + eor sC1, sAke, sC1, ROR #57 SEP + eor sC2, sAsi, sC2, ROR #52 SEP + eor sC3, sAbo, sC3, ROR #63 SEP + eor sC4, sAku, sC4, ROR #50 SEP + ror sC1, sC1, 56 SEP + ror sC4, sC4, 58 SEP + ror sC2, sC2, 62 SEP + SEP + eor sE1, sC0, sC2, ROR #63 SEP + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP + eor sE4, sC3, sC0, ROR #63 SEP + SEP + eor s_Aba_, sE0, s_Aba SEP + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP + eor sAki_, sE3, sAko, ROR #63 SEP + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP + eor sAso_, sE0, sAma, ROR #54 SEP + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP + eor sAgo_, sE1, sAme, ROR #49 SEP + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, ROR #39 SEP + eor sAga_, sE3, sAbo SEP + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP + eor sAmi_, sE1, sAke, ROR #56 SEP + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP + eor sAsi_, sE4, sAku, ROR #58 SEP + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP + eor sAbu_, sE4, sAsu, ROR #9 SEP + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP + eor sAbe_, sE1, sAge, ROR #19 SEP + SEP + load_constant_ptr SEP + restore count, STACK_OFFSET_COUNT SEP + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, 
sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP + bic tmp, sAma_, sAmu_, ROR #35 SEP + SEP + ldr cur_const, [const_addr, count, UXTW #3] SEP + SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + add count, count, #1 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP + SEP + SEP + SEP eor3_m0 C0, vAba, vAga, vAka + SEP eor3_m0 C0, C0, vAma, vAsa + SEP eor3_m0 C1, vAbe, vAge, vAke + SEP eor3_m0 C1, C1, vAme, vAse + SEP eor3_m0 C2, vAbi, vAgi, vAki + SEP eor3_m0 C2, C2, vAmi, vAsi + SEP eor3_m0 C3, vAbo, vAgo, vAko + SEP eor3_m0 C3, 
C3, vAmo, vAso + SEP eor3_m0 C4, vAbu, vAgu, vAku + SEP eor3_m0 C4, C4, vAmu, vAsu + SEP + SEP rax1_m0 E1, C0, C2 + SEP rax1_m0 E3, C2, C4 + SEP rax1_m0 E0, C4, C1 + SEP rax1_m0 E2, C1, C3 + SEP rax1_m0 E4, C3, C0 + SEP + SEP eor vAba_.16b, vAba.16b, E0.16b + SEP xar_m0 vAsa_, vAbi, E2, 2 + SEP xar_m0 vAbi_, vAki, E2, 21 + SEP xar_m0 vAki_, vAko, E3, 39 + SEP xar_m0 vAko_, vAmu, E4, 56 + SEP xar_m0 vAmu_, vAso, E3, 8 + SEP xar_m0 vAso_, vAma, E0, 23 + SEP xar_m0 vAka_, vAbe, E1, 63 + SEP xar_m0 vAse_, vAgo, E3, 9 + SEP xar_m0 vAgo_, vAme, E1, 19 + SEP xar_m0 vAke_, vAgi, E2, 58 + SEP xar_m0 vAgi_, vAka, E0, 61 + SEP xar_m0 vAga_, vAbo, E3, 36 + SEP xar_m0 vAbo_, vAmo, E3, 43 + SEP xar_m0 vAmo_, vAmi, E2, 49 + SEP xar_m0 vAmi_, vAke, E1, 54 + SEP xar_m0 vAge_, vAgu, E4, 44 + SEP xar_m0 vAgu_, vAsi, E2, 3 + SEP xar_m0 vAsi_, vAku, E4, 25 + SEP xar_m0 vAku_, vAsa, E0, 46 + SEP xar_m0 vAma_, vAbu, E4, 37 + SEP xar_m0 vAbu_, vAsu, E4, 50 + SEP xar_m0 vAsu_, vAse, E1, 62 + SEP xar_m0 vAme_, vAga, E0, 28 + SEP xar_m0 vAbe_, vAge, E1, 20 + SEP + SEP restore const_addr, STACK_OFFSET_CONST + SEP ld1r {v31.2d}, [const_addr], #8 + SEP save const_addr, STACK_OFFSET_CONST + SEP + SEP bcax_m0 vAga, vAga_, vAgi_, vAge_ + SEP bcax_m0 vAge, vAge_, vAgo_, vAgi_ + SEP bcax_m0 vAgi, vAgi_, vAgu_, vAgo_ + SEP bcax_m0 vAgo, vAgo_, vAga_, vAgu_ + SEP bcax_m0 vAgu, vAgu_, vAge_, vAga_ + SEP bcax_m0 vAka, vAka_, vAki_, vAke_ + SEP bcax_m0 vAke, vAke_, vAko_, vAki_ + SEP bcax_m0 vAki, vAki_, vAku_, vAko_ + SEP bcax_m0 vAko, vAko_, vAka_, vAku_ + SEP bcax_m0 vAku, vAku_, vAke_, vAka_ + SEP bcax_m0 vAma, vAma_, vAmi_, vAme_ + SEP bcax_m0 vAme, vAme_, vAmo_, vAmi_ + SEP bcax_m0 vAmi, vAmi_, vAmu_, vAmo_ + SEP bcax_m0 vAmo, vAmo_, vAma_, vAmu_ + SEP bcax_m0 vAmu, vAmu_, vAme_, vAma_ + SEP bcax_m0 vAsa, vAsa_, vAsi_, vAse_ + SEP bcax_m0 vAse, vAse_, vAso_, vAsi_ + SEP bcax_m0 vAsi, vAsi_, vAsu_, vAso_ + SEP bcax_m0 vAso, vAso_, vAsa_, vAsu_ + SEP bcax_m0 vAsu, vAsu_, vAse_, vAsa_ + SEP bcax_m0 
vAba, vAba_, vAbi_, vAbe_ + SEP bcax_m0 vAbe, vAbe_, vAbo_, vAbi_ + SEP bcax_m0 vAbi, vAbi_, vAbu_, vAbo_ + SEP bcax_m0 vAbo, vAbo_, vAba_, vAbu_ + SEP bcax_m0 vAbu, vAbu_, vAbe_, vAba_ + SEP + SEP eor vAba.16b, vAba.16b, v31.16b +.endm + +.macro hybrid_round_noninitial + save count, STACK_OFFSET_COUNT SEP + SEP + eor sC0, sAka, sAsa, ROR #50 SEP + eor sC1, sAse, sAge, ROR #60 SEP + eor sC2, sAmi, sAgi, ROR #59 SEP + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP + eor sC0, sAma, sC0, ROR #49 SEP + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP + eor sC0, sAga, sC0, ROR #57 SEP + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor sC3, sAko, sC3, ROR #38 SEP + eor sC4, sAgu, sC4, ROR #48 SEP + eor sC0, s_Aba, sC0, ROR #61 SEP + eor sC1, sAke, sC1, ROR #57 SEP + eor sC2, sAsi, sC2, ROR #52 SEP + eor sC3, sAbo, sC3, ROR #63 SEP + eor sC4, sAku, sC4, ROR #50 SEP + ror sC1, sC1, 56 SEP + ror sC4, sC4, 58 SEP + ror sC2, sC2, 62 SEP + SEP + eor sE1, sC0, sC2, ROR #63 SEP + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP + eor sE4, sC3, sC0, ROR #63 SEP + SEP + eor s_Aba_, sE0, s_Aba SEP + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP + eor sAki_, sE3, sAko, ROR #63 SEP + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP + eor sAso_, sE0, sAma, ROR #54 SEP + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP + eor sAgo_, sE1, sAme, ROR #49 SEP + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, ROR #39 SEP + eor sAga_, sE3, sAbo SEP + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP + eor sAmi_, sE1, sAke, ROR #56 SEP + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP + eor sAsi_, sE4, sAku, ROR #58 SEP + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP + eor sAbu_, 
sE4, sAsu, ROR #9 SEP + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP + eor sAbe_, sE1, sAge, ROR #19 SEP + SEP + load_constant_ptr SEP + restore count, STACK_OFFSET_COUNT SEP + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP + bic tmp, sAma_, sAmu_, ROR #35 SEP + SEP + ldr cur_const, [const_addr, count, UXTW #3] SEP + add count, count, #1 SEP + SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP + bic tmp, s_Aba_, sAbu_, 
ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP + save count, STACK_OFFSET_COUNT SEP + SEP + eor sC0, sAka, sAsa, ROR #50 SEP + eor sC1, sAse, sAge, ROR #60 SEP + eor sC2, sAmi, sAgi, ROR #59 SEP + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP + eor sC0, sAma, sC0, ROR #49 SEP + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP + eor sC0, sAga, sC0, ROR #57 SEP + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor sC3, sAko, sC3, ROR #38 SEP + eor sC4, sAgu, sC4, ROR #48 SEP + eor sC0, s_Aba, sC0, ROR #61 SEP + eor sC1, sAke, sC1, ROR #57 SEP + eor sC2, sAsi, sC2, ROR #52 SEP + eor sC3, sAbo, sC3, ROR #63 SEP + eor sC4, sAku, sC4, ROR #50 SEP + ror sC1, sC1, 56 SEP + ror sC4, sC4, 58 SEP + ror sC2, sC2, 62 SEP + SEP + eor sE1, sC0, sC2, ROR #63 SEP + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP + eor sE4, sC3, sC0, ROR #63 SEP + SEP + eor s_Aba_, sE0, s_Aba SEP + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP + eor sAki_, sE3, sAko, ROR #63 SEP + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP + eor sAso_, sE0, sAma, ROR #54 SEP + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP + eor sAgo_, sE1, sAme, ROR #49 SEP + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, ROR #39 SEP + eor sAga_, sE3, sAbo SEP + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP + eor sAmi_, sE1, sAke, ROR #56 SEP + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP + eor sAsi_, sE4, sAku, ROR #58 SEP + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP + eor sAbu_, sE4, sAsu, ROR #9 SEP + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP + eor sAbe_, sE1, 
sAge, ROR #19 SEP + SEP + load_constant_ptr SEP + restore count, STACK_OFFSET_COUNT SEP + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP + bic tmp, sAma_, sAmu_, ROR #35 SEP + SEP + ldr cur_const, [const_addr, count, UXTW #3] SEP + add count, count, #1 SEP + SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP + eor sAbu, tmp, sAbu_, ROR 
#30 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP + SEP + SEP + SEP eor3_m0 C0, vAba, vAga, vAka + SEP eor3_m0 C0, C0, vAma, vAsa + SEP eor3_m0 C1, vAbe, vAge, vAke + SEP eor3_m0 C1, C1, vAme, vAse + SEP eor3_m0 C2, vAbi, vAgi, vAki + SEP eor3_m0 C2, C2, vAmi, vAsi + SEP eor3_m0 C3, vAbo, vAgo, vAko + SEP eor3_m0 C3, C3, vAmo, vAso + SEP eor3_m0 C4, vAbu, vAgu, vAku + SEP eor3_m0 C4, C4, vAmu, vAsu + SEP + SEP rax1_m0 E1, C0, C2 + SEP rax1_m0 E3, C2, C4 + SEP rax1_m0 E0, C4, C1 + SEP rax1_m0 E2, C1, C3 + SEP rax1_m0 E4, C3, C0 + SEP + SEP eor vAba_.16b, vAba.16b, E0.16b + SEP xar_m0 vAsa_, vAbi, E2, 2 + SEP xar_m0 vAbi_, vAki, E2, 21 + SEP xar_m0 vAki_, vAko, E3, 39 + SEP xar_m0 vAko_, vAmu, E4, 56 + SEP xar_m0 vAmu_, vAso, E3, 8 + SEP xar_m0 vAso_, vAma, E0, 23 + SEP xar_m0 vAka_, vAbe, E1, 63 + SEP xar_m0 vAse_, vAgo, E3, 9 + SEP xar_m0 vAgo_, vAme, E1, 19 + SEP xar_m0 vAke_, vAgi, E2, 58 + SEP xar_m0 vAgi_, vAka, E0, 61 + SEP xar_m0 vAga_, vAbo, E3, 36 + SEP xar_m0 vAbo_, vAmo, E3, 43 + SEP xar_m0 vAmo_, vAmi, E2, 49 + SEP xar_m0 vAmi_, vAke, E1, 54 + SEP xar_m0 vAge_, vAgu, E4, 44 + SEP xar_m0 vAgu_, vAsi, E2, 3 + SEP xar_m0 vAsi_, vAku, E4, 25 + SEP xar_m0 vAku_, vAsa, E0, 46 + SEP xar_m0 vAma_, vAbu, E4, 37 + SEP xar_m0 vAbu_, vAsu, E4, 50 + SEP xar_m0 vAsu_, vAse, E1, 62 + SEP xar_m0 vAme_, vAga, E0, 28 + SEP xar_m0 vAbe_, vAge, E1, 20 + SEP + SEP restore const_addr, STACK_OFFSET_CONST + SEP ld1r {v31.2d}, [const_addr], #8 + SEP save const_addr, STACK_OFFSET_CONST + SEP + SEP bcax_m0 vAga, vAga_, vAgi_, vAge_ + SEP bcax_m0 vAge, vAge_, vAgo_, vAgi_ + SEP bcax_m0 vAgi, vAgi_, vAgu_, vAgo_ + SEP bcax_m0 vAgo, vAgo_, vAga_, vAgu_ + SEP bcax_m0 vAgu, vAgu_, vAge_, vAga_ + SEP bcax_m0 vAka, vAka_, vAki_, vAke_ + SEP bcax_m0 vAke, vAke_, vAko_, vAki_ + SEP bcax_m0 vAki, vAki_, vAku_, vAko_ + SEP bcax_m0 vAko, vAko_, vAka_, vAku_ + SEP bcax_m0 vAku, vAku_, vAke_, vAka_ + SEP bcax_m0 vAma, vAma_, vAmi_, vAme_ + SEP bcax_m0 vAme, vAme_, vAmo_, vAmi_ + SEP bcax_m0 vAmi, 
vAmi_, vAmu_, vAmo_ + SEP bcax_m0 vAmo, vAmo_, vAma_, vAmu_ + SEP bcax_m0 vAmu, vAmu_, vAme_, vAma_ + SEP bcax_m0 vAsa, vAsa_, vAsi_, vAse_ + SEP bcax_m0 vAse, vAse_, vAso_, vAsi_ + SEP bcax_m0 vAsi, vAsi_, vAsu_, vAso_ + SEP bcax_m0 vAso, vAso_, vAsa_, vAsu_ + SEP bcax_m0 vAsu, vAsu_, vAse_, vAsa_ + SEP bcax_m0 vAba, vAba_, vAbi_, vAbe_ + SEP bcax_m0 vAbe, vAbe_, vAbo_, vAbi_ + SEP bcax_m0 vAbi, vAbi_, vAbu_, vAbo_ + SEP bcax_m0 vAbo, vAbo_, vAba_, vAbu_ + SEP bcax_m0 vAbu, vAbu_, vAbe_, vAba_ + SEP + SEP eor vAba.16b, vAba.16b, v31.16b + +.endm + +.macro final_rotate + ror sAga, sAga,#(64-3) /* each line is a rotate-left by k written as ROR #(64-k); every scalar lane except s_Aba and sAbo is rotated (presumably to undo rotations deferred by the round macros - confirm) */ + ror sAka, sAka,#(64-25) + ror sAma, sAma,#(64-10) + ror sAsa, sAsa,#(64-39) + ror sAbe, sAbe,#(64-21) + ror sAge, sAge,#(64-45) + ror sAke, sAke,#(64-8) + ror sAme, sAme,#(64-15) + ror sAse, sAse,#(64-41) + ror sAbi, sAbi,#(64-14) + ror sAgi, sAgi,#(64-61) + ror sAki, sAki,#(64-18) + ror sAmi, sAmi,#(64-56) + ror sAsi, sAsi,#(64-2) + ror sAgo, sAgo,#(64-28) + ror sAko, sAko,#(64-1) + ror sAmo, sAmo,#(64-27) + ror sAso, sAso,#(64-62) + ror sAbu, sAbu,#(64-44) + ror sAgu, sAgu,#(64-20) + ror sAku, sAku,#(64-6) + ror sAmu, sAmu,#(64-36) + ror sAsu, sAsu,#(64-55) +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.global keccak_f1600_x4_hybrid_asm_v1 +.global _keccak_f1600_x4_hybrid_asm_v1 +.text +.align 4 + +keccak_f1600_x4_hybrid_asm_v1: +_keccak_f1600_x4_hybrid_asm_v1: + alloc_stack + save_gprs + save_vregs + save input_addr, STACK_OFFSET_INPUT /* spilled because it is reloaded from the stack before each store_input_scalar below */ + + load_input_vector 2,1 + + load_constant_ptr + save const_addr, STACK_OFFSET_CONST /* stack copy of the constant pointer; the vector half of the round macros reloads it, post-increments it by 8 and stores it back */ + + // First scalar Keccak computation alongside first half of SIMD computation + load_input_scalar 4,0 + hybrid_round_initial + loop_0: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-1) /* count is set and advanced inside the hybrid_round_* macros */ + ble loop_0 /* keep iterating while count <= KECCAK_F1600_ROUNDS-1 */ + final_rotate + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4,0 + + // Second scalar Keccak computation alongside second half of SIMD computation + load_input_scalar 4,1 + hybrid_round_initial + loop_1: + hybrid_round_noninitial + 
cmp count, #(KECCAK_F1600_ROUNDS-1) + ble loop_1 + final_rotate + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4, 1 + + store_input_vector 2,1 + + restore_vregs + restore_gprs + free_stack + ret + +#endif diff --git a/tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v2.s b/tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v2.s new file mode 100644 index 0000000..778e1c6 --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v2.s @@ -0,0 +1,991 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +#if defined(__ARM_FEATURE_SHA3) + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: /* 24 entries of 8 bytes each; the round macros read them with ldr [const_addr, count, UXTW #3] (scalar) and ld1r ..., #8 (vector) */ + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 /* x0 is also used for s_Aba_ and sC0, so the pointer does not survive a round */ + const_addr .req x29 /* x29 is also used for sE0 and sC4 */ + count .req w27 /* low half of x27, which is also used for sC2 and sE3; hence the save/restore of count around the theta step */ + cur_const .req x26 /* x26 is also used for sC1 and sE2 */ + + /* Mapping of Keccak-f1600 SIMD state to vector registers + * at the beginning and end of each round. 
+ */ + + vAba .req v0 + vAbe .req v1 + vAbi .req v2 + vAbo .req v3 + vAbu .req v4 + vAga .req v5 + vAge .req v6 + vAgi .req v7 + vAgo .req v8 + vAgu .req v9 + vAka .req v10 + vAke .req v11 + vAki .req v12 + vAko .req v13 + vAku .req v14 + vAma .req v15 + vAme .req v16 + vAmi .req v17 + vAmo .req v18 + vAmu .req v19 + vAsa .req v20 + vAse .req v21 + vAsi .req v22 + vAso .req v23 + vAsu .req v24 + + /* q-form of the above mapping (same register numbers; used by the ldr/str in load_input_vector and store_input_vector) */ + vAbaq .req q0 + vAbeq .req q1 + vAbiq .req q2 + vAboq .req q3 + vAbuq .req q4 + vAgaq .req q5 + vAgeq .req q6 + vAgiq .req q7 + vAgoq .req q8 + vAguq .req q9 + vAkaq .req q10 + vAkeq .req q11 + vAkiq .req q12 + vAkoq .req q13 + vAkuq .req q14 + vAmaq .req q15 + vAmeq .req q16 + vAmiq .req q17 + vAmoq .req q18 + vAmuq .req q19 + vAsaq .req q20 + vAseq .req q21 + vAsiq .req q22 + vAsoq .req q23 + vAsuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v30 + C1 .req v29 + C2 .req v28 + C3 .req v27 + C4 .req v26 + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req v26 /* same register as C4 */ + E1 .req v25 /* the only E register not shared with a C register */ + E2 .req v29 /* same register as C1 */ + E3 .req v28 /* same register as C2 */ + E4 .req v27 /* same register as C3 */ + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vAbi_ .req v2 + vAbo_ .req v3 + vAbu_ .req v4 + vAga_ .req v10 + vAge_ .req v11 + vAgi_ .req v7 + vAgo_ .req v8 + vAgu_ .req v9 + vAka_ .req v15 + vAke_ .req v16 + vAki_ .req v12 + vAko_ .req v13 + vAku_ .req v14 + vAma_ .req v20 + vAme_ .req v21 + vAmi_ .req v17 + vAmo_ .req v18 + vAmu_ .req v19 + vAsa_ .req v0 + vAse_ .req v1 + vAsi_ .req v22 + vAso_ .req v23 + vAsu_ .req v24 + vAba_ .req v30 /* same register as C0 */ + vAbe_ .req v27 /* same register as C3 and E4 */ + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round. 
+ */ + s_Aba .req x1 + sAbe .req x6 + sAbi .req x11 + sAbo .req x16 + sAbu .req x21 + sAga .req x2 + sAge .req x7 + sAgi .req x12 + sAgo .req x17 + sAgu .req x22 + sAka .req x3 + sAke .req x8 + sAki .req x13 + sAko .req x18 /* NOTE(review): x18 is reserved as the platform register on some AArch64 ABIs - confirm this is acceptable for every target */ + sAku .req x23 + sAma .req x4 + sAme .req x9 + sAmi .req x14 + sAmo .req x19 + sAmu .req x24 + sAsa .req x5 + sAse .req x10 + sAsi .req x15 + sAso .req x20 + sAsu .req x25 + + /* sA_[y,2*x+3*y] = rot(A[x,y]) */ + s_Aba_ .req x0 /* same register as input_addr and sC0 */ + sAbe_ .req x28 /* same register as sC3 and sE4 */ + sAbi_ .req x11 + sAbo_ .req x16 + sAbu_ .req x21 + sAga_ .req x3 + sAge_ .req x8 + sAgi_ .req x12 + sAgo_ .req x17 + sAgu_ .req x22 + sAka_ .req x4 + sAke_ .req x9 + sAki_ .req x13 + sAko_ .req x18 + sAku_ .req x23 + sAma_ .req x5 + sAme_ .req x10 + sAmi_ .req x14 + sAmo_ .req x19 + sAmu_ .req x24 + sAsa_ .req x1 + sAse_ .req x6 + sAsi_ .req x15 + sAso_ .req x20 + sAsu_ .req x25 + + /* sC[x] = sA[x,0] xor sA[x,1] xor sA[x,2] xor sA[x,3] xor sA[x,4], for x in 0..4 */ + /* sE[x] = sC[x-1] xor rot(sC[x+1],1), for x in 0..4 */ + sC0 .req x0 /* same register as input_addr and s_Aba_ */ + sE0 .req x29 /* same register as const_addr and sC4 */ + sC1 .req x26 /* same register as cur_const and sE2 */ + sE1 .req x30 /* same register as tmp */ + sC2 .req x27 /* x27 also holds count (w27) and sE3 */ + sE2 .req x26 + sC3 .req x28 + sE3 .req x27 + sC4 .req x29 + sE4 .req x28 + + tmp .req x30 /* shares x30 with sE1; x30 is spilled by save_gprs and reloaded by restore_gprs */ + +/************************ MACROS ****************************/ + +/* Macros using v8.4-A SHA-3 instructions */ + +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b /* wrapper: appends the .16b arrangement to each operand */ +.endm + +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d /* wrapper: appends the .2d arrangement to each operand */ +.endm + +.macro xar_m0 d s0 s1 imm + xar \d\().2d, \s0\().2d, \s1\().2d, #\imm /* wrapper: .2d operands, imm passed through as the immediate */ +.endm + +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b /* wrapper: appends the .16b arrangement to each operand */ +.endm + +.macro load_input_vector num idx + ldr vAbaq, [input_addr, #(16*(\num*0+\idx))] /* state word i is loaded from byte offset 16*(num*i+idx) */ + ldr vAbeq, [input_addr, #(16*(\num*1+\idx))] + ldr vAbiq, [input_addr, #(16*(\num*2+\idx))] + ldr vAboq, [input_addr, #(16*(\num*3+\idx))] + ldr vAbuq, [input_addr, #(16*(\num*4+\idx))] + ldr vAgaq, [input_addr, #(16*(\num*5+\idx))] + ldr vAgeq, [input_addr, #(16*(\num*6+\idx))] + ldr vAgiq, [input_addr, 
#(16*(\num*7+\idx))] + ldr vAgoq, [input_addr, #(16*(\num*8+\idx))] + ldr vAguq, [input_addr, #(16*(\num*9+\idx))] + ldr vAkaq, [input_addr, #(16*(\num*10+\idx))] + ldr vAkeq, [input_addr, #(16*(\num*11+\idx))] + ldr vAkiq, [input_addr, #(16*(\num*12+\idx))] + ldr vAkoq, [input_addr, #(16*(\num*13+\idx))] + ldr vAkuq, [input_addr, #(16*(\num*14+\idx))] + ldr vAmaq, [input_addr, #(16*(\num*15+\idx))] + ldr vAmeq, [input_addr, #(16*(\num*16+\idx))] + ldr vAmiq, [input_addr, #(16*(\num*17+\idx))] + ldr vAmoq, [input_addr, #(16*(\num*18+\idx))] + ldr vAmuq, [input_addr, #(16*(\num*19+\idx))] + ldr vAsaq, [input_addr, #(16*(\num*20+\idx))] + ldr vAseq, [input_addr, #(16*(\num*21+\idx))] + ldr vAsiq, [input_addr, #(16*(\num*22+\idx))] + ldr vAsoq, [input_addr, #(16*(\num*23+\idx))] + ldr vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_vector num idx + str vAbaq, [input_addr, #(16*(\num*0+\idx))] + str vAbeq, [input_addr, #(16*(\num*1+\idx))] + str vAbiq, [input_addr, #(16*(\num*2+\idx))] + str vAboq, [input_addr, #(16*(\num*3+\idx))] + str vAbuq, [input_addr, #(16*(\num*4+\idx))] + str vAgaq, [input_addr, #(16*(\num*5+\idx))] + str vAgeq, [input_addr, #(16*(\num*6+\idx))] + str vAgiq, [input_addr, #(16*(\num*7+\idx))] + str vAgoq, [input_addr, #(16*(\num*8+\idx))] + str vAguq, [input_addr, #(16*(\num*9+\idx))] + str vAkaq, [input_addr, #(16*(\num*10+\idx))] + str vAkeq, [input_addr, #(16*(\num*11+\idx))] + str vAkiq, [input_addr, #(16*(\num*12+\idx))] + str vAkoq, [input_addr, #(16*(\num*13+\idx))] + str vAkuq, [input_addr, #(16*(\num*14+\idx))] + str vAmaq, [input_addr, #(16*(\num*15+\idx))] + str vAmeq, [input_addr, #(16*(\num*16+\idx))] + str vAmiq, [input_addr, #(16*(\num*17+\idx))] + str vAmoq, [input_addr, #(16*(\num*18+\idx))] + str vAmuq, [input_addr, #(16*(\num*19+\idx))] + str vAsaq, [input_addr, #(16*(\num*20+\idx))] + str vAseq, [input_addr, #(16*(\num*21+\idx))] + str vAsiq, [input_addr, #(16*(\num*22+\idx))] + str vAsoq, [input_addr, 
#(16*(\num*23+\idx))] + str vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_scalar num idx + str s_Aba, [input_addr, 8*(\num*(0) +\idx)] + str sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + str sAbi, [input_addr, 8*(\num*(2)+ \idx)] + str sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + str sAbu, [input_addr, 8*(\num*(4)+ \idx)] + str sAga, [input_addr, 8*(\num*(4+1) +\idx)] + str sAge, [input_addr, 8*(\num*(6)+ \idx)] + str sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + str sAgo, [input_addr, 8*(\num*(8)+ \idx)] + str sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + str sAka, [input_addr, 8*(\num*(10) +\idx)] + str sAke, [input_addr, 8*(\num*(10+1)+\idx)] + str sAki, [input_addr, 8*(\num*(12) +\idx)] + str sAko, [input_addr, 8*(\num*(12+1)+\idx)] + str sAku, [input_addr, 8*(\num*(14) +\idx)] + str sAma, [input_addr, 8*(\num*(14+1)+\idx)] + str sAme, [input_addr, 8*(\num*(16) +\idx)] + str sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + str sAmo, [input_addr, 8*(\num*(18) +\idx)] + str sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + str sAsa, [input_addr, 8*(\num*(20) +\idx)] + str sAse, [input_addr, 8*(\num*(20+1)+\idx)] + str sAsi, [input_addr, 8*(\num*(22) +\idx)] + str sAso, [input_addr, 8*(\num*(22+1)+\idx)] + str sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +.macro load_input_scalar num idx + ldr s_Aba, [input_addr, 8*(\num*(0) +\idx)] + ldr sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + ldr sAbi, [input_addr, 8*(\num*(2)+ \idx)] + ldr sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + ldr sAbu, [input_addr, 8*(\num*(4)+ \idx)] + ldr sAga, [input_addr, 8*(\num*(4+1) +\idx)] + ldr sAge, [input_addr, 8*(\num*(6)+ \idx)] + ldr sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + ldr sAgo, [input_addr, 8*(\num*(8)+ \idx)] + ldr sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + ldr sAka, [input_addr, 8*(\num*(10) +\idx)] + ldr sAke, [input_addr, 8*(\num*(10+1)+\idx)] + ldr sAki, [input_addr, 8*(\num*(12) +\idx)] + ldr sAko, [input_addr, 8*(\num*(12+1)+\idx)] + ldr sAku, [input_addr, 8*(\num*(14) 
+\idx)] + ldr sAma, [input_addr, 8*(\num*(14+1)+\idx)] + ldr sAme, [input_addr, 8*(\num*(16) +\idx)] + ldr sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + ldr sAmo, [input_addr, 8*(\num*(18) +\idx)] + ldr sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + ldr sAsa, [input_addr, 8*(\num*(20) +\idx)] + ldr sAse, [input_addr, 8*(\num*(20+1)+\idx)] + ldr sAsi, [input_addr, 8*(\num*(22) +\idx)] + ldr sAso, [input_addr, 8*(\num*(22+1)+\idx)] + ldr sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +#define STACK_SIZE (8*8 + 16*6 + 3*8 + 8) // VREGS (8*8), GPRs (16*6), count (8), const (8), input (8), padding (8); 192 bytes in total +#define STACK_BASE_GPRS (3*8+8) /* GPR save area starts after the three 8-byte slots and the 8-byte padding */ +#define STACK_BASE_VREGS (3*8+8+16*6) /* vector save area follows the six 16-byte GPR pairs */ +#define STACK_OFFSET_INPUT (0*8) /* slot for input_addr */ +#define STACK_OFFSET_CONST (1*8) /* slot for the round-constant pointer */ +#define STACK_OFFSET_COUNT (2*8) /* slot for count */ + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] /* spills x19-x30 as six pairs; must be used after alloc_stack has moved sp */ + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] /* reloads x19-x30 from the slots written by save_gprs */ + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro save_vregs + stp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] /* spills d8-d15 only, i.e. the 64-bit d views of registers 8-15 */ + stp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + stp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + stp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] +.endm + +.macro restore_vregs + ldp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] /* reloads d8-d15 from the same slots, highest pair first */ + ldp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + ldp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + ldp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] +.endm + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) /* reserve the whole frame in one step */ +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) /* release the frame reserved by alloc_stack */ +.endm + +.macro eor5 dst, src0, src1, src2, src3, src4 + eor 
\dst, \src0, \src1 + eor \dst, \dst, \src2 + eor \dst, \dst, \src3 + eor \dst, \dst, \src4 +.endm + +.macro xor_rol dst, src1, src0, imm + eor \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro bic_rol dst, src1, src0, imm + bic \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro rotate dst, src, imm + ror \dst, \src, #(64-\imm) +.endm + +.macro save reg, offset + str \reg, [sp, #\offset] +.endm + +.macro restore reg, offset + ldr \reg, [sp, #\offset] +.endm + +.macro hybrid_round_initial + + eor sC0, sAma, sAsa SEP eor3_m0 C0, vAba, vAga, vAka + eor sC1, sAme, sAse SEP + eor sC2, sAmi, sAsi SEP + eor sC3, sAmo, sAso SEP eor3_m0 C0, C0, vAma, vAsa + eor sC4, sAmu, sAsu SEP + eor sC0, sAka, sC0 SEP + eor sC1, sAke, sC1 SEP eor3_m0 C1, vAbe, vAge, vAke + eor sC2, sAki, sC2 SEP + eor sC3, sAko, sC3 SEP + eor sC4, sAku, sC4 SEP eor3_m0 C1, C1, vAme, vAse + eor sC0, sAga, sC0 SEP + eor sC1, sAge, sC1 SEP + eor sC2, sAgi, sC2 SEP eor3_m0 C2, vAbi, vAgi, vAki + eor sC3, sAgo, sC3 SEP + eor sC4, sAgu, sC4 SEP + eor sC0, s_Aba, sC0 SEP eor3_m0 C2, C2, vAmi, vAsi + eor sC1, sAbe, sC1 SEP + eor sC2, sAbi, sC2 SEP + eor sC3, sAbo, sC3 SEP eor3_m0 C3, vAbo, vAgo, vAko + eor sC4, sAbu, sC4 SEP + SEP + eor sE1, sC0, sC2, ROR #63 SEP eor3_m0 C3, C3, vAmo, vAso + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP eor3_m0 C4, vAbu, vAgu, vAku + eor sE4, sC3, sC0, ROR #63 SEP + SEP + eor s_Aba_, s_Aba, sE0 SEP eor3_m0 C4, C4, vAmu, vAsu + eor sAsa_, sAbi, sE2 SEP + eor sAbi_, sAki, sE2 SEP + eor sAki_, sAko, sE3 SEP + eor sAko_, sAmu, sE4 SEP rax1_m0 E1, C0, C2 + eor sAmu_, sAso, sE3 SEP + eor sAso_, sAma, sE0 SEP + eor sAka_, sAbe, sE1 SEP rax1_m0 E3, C2, C4 + eor sAse_, sAgo, sE3 SEP + eor sAgo_, sAme, sE1 SEP + eor sAke_, sAgi, sE2 SEP rax1_m0 E0, C4, C1 + eor sAgi_, sAka, sE0 SEP + eor sAga_, sAbo, sE3 SEP + eor sAbo_, sAmo, sE3 SEP rax1_m0 E2, C1, C3 + eor sAmo_, sAmi, sE2 SEP + eor sAmi_, sAke, sE1 SEP + eor sAge_, sAgu, sE4 SEP 
rax1_m0 E4, C3, C0 + eor sAgu_, sAsi, sE2 SEP + eor sAsi_, sAku, sE4 SEP + eor sAku_, sAsa, sE0 SEP + eor sAma_, sAbu, sE4 SEP eor vAba_.16b, vAba.16b, E0.16b + eor sAbu_, sAsu, sE4 SEP + eor sAsu_, sAse, sE1 SEP + eor sAme_, sAga, sE0 SEP xar_m0 vAsa_, vAbi, E2, 2 + eor sAbe_, sAge, sE1 SEP + SEP + load_constant_ptr SEP xar_m0 vAbi_, vAki, E2, 21 + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP xar_m0 vAki_, vAko, E3, 39 + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP xar_m0 vAko_, vAmu, E4, 56 + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP xar_m0 vAmu_, vAso, E3, 8 + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP xar_m0 vAso_, vAma, E0, 23 + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP xar_m0 vAka_, vAbe, E1, 63 + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP xar_m0 vAse_, vAgo, E3, 9 + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP xar_m0 vAgo_, vAme, E1, 19 + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP xar_m0 vAke_, vAgi, E2, 58 + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP xar_m0 vAgi_, vAka, E0, 61 + SEP + ldr cur_const, [const_addr] SEP + mov count, #1 SEP xar_m0 vAga_, vAbo, E3, 36 + SEP + bic tmp, sAma_, sAmu_, ROR #35 SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP xar_m0 vAbo_, vAmo, E3, 43 + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP xar_m0 vAmo_, vAmi, E2, 49 + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP xar_m0 vAmi_, vAke, 
E1, 54 + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP xar_m0 vAge_, vAgu, E4, 44 + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP xar_m0 vAgu_, vAsi, E2, 3 + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP xar_m0 vAsi_, vAku, E4, 25 + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP xar_m0 vAku_, vAsa, E0, 46 + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP xar_m0 vAma_, vAbu, E4, 37 + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP xar_m0 vAbu_, vAsu, E4, 50 + SEP + save count, STACK_OFFSET_COUNT SEP + SEP xar_m0 vAsu_, vAse, E1, 62 + eor sC0, sAka, sAsa, ROR #50 SEP + eor sC1, sAse, sAge, ROR #60 SEP + eor sC2, sAmi, sAgi, ROR #59 SEP xar_m0 vAme_, vAga, E0, 28 + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP + eor sC0, sAma, sC0, ROR #49 SEP xar_m0 vAbe_, vAge, E1, 20 + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP restore sE1, STACK_OFFSET_CONST + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP + eor sC0, sAga, sC0, ROR #57 SEP ld1r {v31.2d}, [sE1], #8 + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor sC3, sAko, sC3, ROR #38 SEP save sE1, STACK_OFFSET_CONST + eor sC4, sAgu, sC4, ROR #48 SEP + eor sC0, s_Aba, sC0, ROR #61 SEP bcax_m0 vAga, vAga_, vAgi_, vAge_ + eor sC1, sAke, sC1, ROR #57 SEP + eor sC2, sAsi, sC2, ROR #52 SEP + eor sC3, sAbo, sC3, ROR #63 SEP bcax_m0 vAge, vAge_, vAgo_, vAgi_ + eor sC4, sAku, sC4, ROR #50 SEP + ror sC1, sC1, 56 SEP + ror sC4, sC4, 58 SEP bcax_m0 vAgi, vAgi_, vAgu_, vAgo_ + ror sC2, sC2, 62 SEP + SEP + eor sE1, sC0, sC2, ROR #63 SEP bcax_m0 vAgo, vAgo_, vAga_, vAgu_ + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR 
#63 SEP + eor sE2, sC1, sC3, ROR #63 SEP bcax_m0 vAgu, vAgu_, vAge_, vAga_ + eor sE4, sC3, sC0, ROR #63 SEP + SEP + eor s_Aba_, sE0, s_Aba SEP bcax_m0 vAka, vAka_, vAki_, vAke_ + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP + eor sAki_, sE3, sAko, ROR #63 SEP bcax_m0 vAke, vAke_, vAko_, vAki_ + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP + eor sAso_, sE0, sAma, ROR #54 SEP bcax_m0 vAki, vAki_, vAku_, vAko_ + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP + eor sAgo_, sE1, sAme, ROR #49 SEP bcax_m0 vAko, vAko_, vAka_, vAku_ + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, ROR #39 SEP + eor sAga_, sE3, sAbo SEP bcax_m0 vAku, vAku_, vAke_, vAka_ + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP + eor sAmi_, sE1, sAke, ROR #56 SEP bcax_m0 vAma, vAma_, vAmi_, vAme_ + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP + eor sAsi_, sE4, sAku, ROR #58 SEP bcax_m0 vAme, vAme_, vAmo_, vAmi_ + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP + eor sAbu_, sE4, sAsu, ROR #9 SEP bcax_m0 vAmi, vAmi_, vAmu_, vAmo_ + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP + eor sAbe_, sE1, sAge, ROR #19 SEP bcax_m0 vAmo, vAmo_, vAma_, vAmu_ + SEP + load_constant_ptr SEP + restore count, STACK_OFFSET_COUNT SEP bcax_m0 vAmu, vAmu_, vAme_, vAma_ + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP bcax_m0 vAsa, vAsa_, vAsi_, vAse_ + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP bcax_m0 vAse, vAse_, vAso_, vAsi_ + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP bcax_m0 vAsi, vAsi_, vAsu_, vAso_ + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP bcax_m0 vAso, vAso_, vAsa_, vAsu_ + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, 
sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP bcax_m0 vAsu, vAsu_, vAse_, vAsa_ + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP bcax_m0 vAba, vAba_, vAbi_, vAbe_ + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP bcax_m0 vAbe, vAbe_, vAbo_, vAbi_ + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP bcax_m0 vAbi, vAbi_, vAbu_, vAbo_ + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP bcax_m0 vAbo, vAbo_, vAba_, vAbu_ + bic tmp, sAma_, sAmu_, ROR #35 SEP + SEP + ldr cur_const, [const_addr, count, UXTW #3] SEP bcax_m0 vAbu, vAbu_, vAbe_, vAba_ + SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP eor vAba.16b, vAba.16b, v31.16b + bic tmp, sAsi_, sAse_, ROR #48 SEP + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + add count, count, #1 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP + SEP +.endm + +.macro hybrid_round_noninitial + save count, STACK_OFFSET_COUNT SEP eor3_m0 C0, vAba, vAga, vAka + SEP + eor sC0, sAka, sAsa, ROR #50 SEP + eor sC1, sAse, sAge, ROR #60 SEP eor3_m0 C0, C0, vAma, vAsa + eor sC2, sAmi, sAgi, ROR #59 SEP + 
eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP eor3_m0 C1, vAbe, vAge, vAke + eor sC0, sAma, sC0, ROR #49 SEP + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP eor3_m0 C1, C1, vAme, vAse + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP + eor sC0, sAga, sC0, ROR #57 SEP eor3_m0 C2, vAbi, vAgi, vAki + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor sC3, sAko, sC3, ROR #38 SEP eor3_m0 C2, C2, vAmi, vAsi + eor sC4, sAgu, sC4, ROR #48 SEP + eor sC0, s_Aba, sC0, ROR #61 SEP + eor sC1, sAke, sC1, ROR #57 SEP eor3_m0 C3, vAbo, vAgo, vAko + eor sC2, sAsi, sC2, ROR #52 SEP + eor sC3, sAbo, sC3, ROR #63 SEP + eor sC4, sAku, sC4, ROR #50 SEP eor3_m0 C3, C3, vAmo, vAso + ror sC1, sC1, 56 SEP + ror sC4, sC4, 58 SEP + ror sC2, sC2, 62 SEP eor3_m0 C4, vAbu, vAgu, vAku + SEP + eor sE1, sC0, sC2, ROR #63 SEP + eor sE3, sC2, sC4, ROR #63 SEP eor3_m0 C4, C4, vAmu, vAsu + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP + eor sE4, sC3, sC0, ROR #63 SEP + SEP rax1_m0 E1, C0, C2 + eor s_Aba_, sE0, s_Aba SEP + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP rax1_m0 E3, C2, C4 + eor sAki_, sE3, sAko, ROR #63 SEP + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP rax1_m0 E0, C4, C1 + eor sAso_, sE0, sAma, ROR #54 SEP + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP rax1_m0 E2, C1, C3 + eor sAgo_, sE1, sAme, ROR #49 SEP + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, ROR #39 SEP rax1_m0 E4, C3, C0 + eor sAga_, sE3, sAbo SEP + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP + eor sAmi_, sE1, sAke, ROR #56 SEP eor vAba_.16b, vAba.16b, E0.16b + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP + eor sAsi_, sE4, sAku, ROR #58 SEP xar_m0 vAsa_, vAbi, E2, 2 + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP + eor sAbu_, sE4, sAsu, ROR #9 SEP xar_m0 vAbi_, vAki, E2, 
21 + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP + eor sAbe_, sE1, sAge, ROR #19 SEP xar_m0 vAki_, vAko, E3, 39 + SEP + load_constant_ptr SEP + restore count, STACK_OFFSET_COUNT SEP xar_m0 vAko_, vAmu, E4, 56 + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP xar_m0 vAmu_, vAso, E3, 8 + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP xar_m0 vAso_, vAma, E0, 23 + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP xar_m0 vAka_, vAbe, E1, 63 + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP xar_m0 vAse_, vAgo, E3, 9 + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP xar_m0 vAgo_, vAme, E1, 19 + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP xar_m0 vAke_, vAgi, E2, 58 + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP xar_m0 vAgi_, vAka, E0, 61 + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP xar_m0 vAga_, vAbo, E3, 36 + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP xar_m0 vAbo_, vAmo, E3, 43 + bic tmp, sAma_, sAmu_, ROR #35 SEP + SEP + ldr cur_const, [const_addr, count, UXTW #3] SEP xar_m0 vAmo_, vAmi, E2, 49 + add count, count, #1 SEP + SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP xar_m0 vAmi_, vAke, E1, 54 + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP xar_m0 vAge_, vAgu, E4, 44 + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP xar_m0 vAgu_, vAsi, E2, 3 + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, 
sAsa_, sAsu_, ROR #60 SEP xar_m0 vAsi_, vAku, E4, 25 + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP xar_m0 vAku_, vAsa, E0, 46 + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP xar_m0 vAma_, vAbu, E4, 37 + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP xar_m0 vAbu_, vAsu, E4, 50 + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP xar_m0 vAsu_, vAse, E1, 62 + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP xar_m0 vAme_, vAga, E0, 28 + save count, STACK_OFFSET_COUNT SEP + SEP + eor sC0, sAka, sAsa, ROR #50 SEP xar_m0 vAbe_, vAge, E1, 20 + eor sC1, sAse, sAge, ROR #60 SEP + eor sC2, sAmi, sAgi, ROR #59 SEP + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP restore sE1, STACK_OFFSET_CONST + eor sC0, sAma, sC0, ROR #49 SEP + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP ld1r {v31.2d}, [sE1], #8 + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP + eor sC0, sAga, sC0, ROR #57 SEP save sE1, STACK_OFFSET_CONST + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor sC3, sAko, sC3, ROR #38 SEP + eor sC4, sAgu, sC4, ROR #48 SEP bcax_m0 vAga, vAga_, vAgi_, vAge_ + eor sC0, s_Aba, sC0, ROR #61 SEP + eor sC1, sAke, sC1, ROR #57 SEP + eor sC2, sAsi, sC2, ROR #52 SEP bcax_m0 vAge, vAge_, vAgo_, vAgi_ + eor sC3, sAbo, sC3, ROR #63 SEP + eor sC4, sAku, sC4, ROR #50 SEP + ror sC1, sC1, 56 SEP bcax_m0 vAgi, vAgi_, vAgu_, vAgo_ + ror sC4, sC4, 58 SEP + ror sC2, sC2, 62 SEP + SEP bcax_m0 vAgo, vAgo_, vAga_, vAgu_ + eor sE1, sC0, sC2, ROR #63 SEP + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP bcax_m0 vAgu, vAgu_, vAge_, vAga_ + eor sE2, sC1, sC3, ROR #63 SEP + eor sE4, sC3, sC0, ROR #63 SEP + SEP bcax_m0 vAka, vAka_, vAki_, vAke_ + 
eor s_Aba_, sE0, s_Aba SEP + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP bcax_m0 vAke, vAke_, vAko_, vAki_ + eor sAki_, sE3, sAko, ROR #63 SEP + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP bcax_m0 vAki, vAki_, vAku_, vAko_ + eor sAso_, sE0, sAma, ROR #54 SEP + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP bcax_m0 vAko, vAko_, vAka_, vAku_ + eor sAgo_, sE1, sAme, ROR #49 SEP + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, ROR #39 SEP bcax_m0 vAku, vAku_, vAke_, vAka_ + eor sAga_, sE3, sAbo SEP + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP bcax_m0 vAma, vAma_, vAmi_, vAme_ + eor sAmi_, sE1, sAke, ROR #56 SEP + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP bcax_m0 vAme, vAme_, vAmo_, vAmi_ + eor sAsi_, sE4, sAku, ROR #58 SEP + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP bcax_m0 vAmi, vAmi_, vAmu_, vAmo_ + eor sAbu_, sE4, sAsu, ROR #9 SEP + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP bcax_m0 vAmo, vAmo_, vAma_, vAmu_ + eor sAbe_, sE1, sAge, ROR #19 SEP + SEP + load_constant_ptr SEP bcax_m0 vAmu, vAmu_, vAme_, vAma_ + restore count, STACK_OFFSET_COUNT SEP + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP bcax_m0 vAsa, vAsa_, vAsi_, vAse_ + eor sAga, tmp, sAga_, ROR #39 SEP + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP bcax_m0 vAse, vAse_, vAso_, vAsi_ + bic tmp, sAgu_, sAgo_, ROR #16 SEP + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP bcax_m0 vAsi, vAsi_, vAsu_, vAso_ + eor sAgo, tmp, sAgo_, ROR #47 SEP + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP bcax_m0 vAso, vAso_, vAsa_, vAsu_ + bic tmp, sAki_, sAke_, ROR #19 SEP + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP bcax_m0 vAsu, vAsu_, vAse_, vAsa_ + eor sAke, tmp, sAke_, ROR #2 SEP + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, 
sAki_, ROR #57 SEP bcax_m0 vAba, vAba_, vAbi_, vAbe_ + bic tmp, sAka_, sAku_, ROR #47 SEP + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP bcax_m0 vAbe, vAbe_, vAbo_, vAbi_ + eor sAku, tmp, sAku_, ROR #52 SEP + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP bcax_m0 vAbi, vAbi_, vAbu_, vAbo_ + bic tmp, sAmo_, sAmi_, ROR #5 SEP + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP bcax_m0 vAbo, vAbo_, vAba_, vAbu_ + eor sAmi, tmp, sAmi_, ROR #46 SEP + bic tmp, sAma_, sAmu_, ROR #35 SEP + SEP bcax_m0 vAbu, vAbu_, vAbe_, vAba_ + ldr cur_const, [const_addr, count, UXTW #3] SEP + add count, count, #1 SEP + SEP eor vAba.16b, vAba.16b, v31.16b + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP + +.endm + +.macro final_rotate + ror sAga, sAga,#(64-3) + ror sAka, sAka,#(64-25) + ror sAma, sAma,#(64-10) + ror sAsa, sAsa,#(64-39) + ror sAbe, sAbe,#(64-21) + ror sAge, sAge,#(64-45) + ror sAke, sAke,#(64-8) + ror sAme, sAme,#(64-15) + ror sAse, sAse,#(64-41) + ror sAbi, sAbi,#(64-14) + ror sAgi, sAgi,#(64-61) + ror sAki, sAki,#(64-18) + ror sAmi, sAmi,#(64-56) + ror sAsi, sAsi,#(64-2) + ror sAgo, 
sAgo,#(64-28) + ror sAko, sAko,#(64-1) + ror sAmo, sAmo,#(64-27) + ror sAso, sAso,#(64-62) + ror sAbu, sAbu,#(64-44) + ror sAgu, sAgu,#(64-20) + ror sAku, sAku,#(64-6) + ror sAmu, sAmu,#(64-36) + ror sAsu, sAsu,#(64-55) +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.global keccak_f1600_x4_hybrid_asm_v2 +.global _keccak_f1600_x4_hybrid_asm_v2 +.text +.align 4 + +keccak_f1600_x4_hybrid_asm_v2: +_keccak_f1600_x4_hybrid_asm_v2: + alloc_stack + save_gprs + save_vregs + save input_addr, STACK_OFFSET_INPUT + + load_input_vector 2,1 + + load_constant_ptr + save const_addr, STACK_OFFSET_CONST + + // First scalar Keccak computation alongside first half of SIMD computation + load_input_scalar 4,0 + hybrid_round_initial + loop_0: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-1) + ble loop_0 + final_rotate + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4,0 + + // Second scalar Keccak computation alongside second half of SIMD computation + load_input_scalar 4,1 + hybrid_round_initial + loop_1: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-1) + ble loop_1 + final_rotate + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4, 1 + + store_input_vector 2,1 + + restore_vregs + restore_gprs + free_stack + ret + +#endif diff --git a/tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v2p0.s b/tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v2p0.s new file mode 100644 index 0000000..7b5a203 --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v2p0.s @@ -0,0 +1,993 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * 
copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +#if defined(__ARM_FEATURE_SHA3) + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x29 + count .req w27 + cur_const .req x26 + + /* Mapping of Keccak-f1600 SIMD state to vector registers + * at the beginning and end of each round. 
*/ + + vAba .req v0 + vAbe .req v1 + vAbi .req v2 + vAbo .req v3 + vAbu .req v4 + vAga .req v5 + vAge .req v6 + vAgi .req v7 + vAgo .req v8 + vAgu .req v9 + vAka .req v10 + vAke .req v11 + vAki .req v12 + vAko .req v13 + vAku .req v14 + vAma .req v15 + vAme .req v16 + vAmi .req v17 + vAmo .req v18 + vAmu .req v19 + vAsa .req v20 + vAse .req v21 + vAsi .req v22 + vAso .req v23 + vAsu .req v24 + + /* q-form of the above mapping */ + vAbaq .req q0 + vAbeq .req q1 + vAbiq .req q2 + vAboq .req q3 + vAbuq .req q4 + vAgaq .req q5 + vAgeq .req q6 + vAgiq .req q7 + vAgoq .req q8 + vAguq .req q9 + vAkaq .req q10 + vAkeq .req q11 + vAkiq .req q12 + vAkoq .req q13 + vAkuq .req q14 + vAmaq .req q15 + vAmeq .req q16 + vAmiq .req q17 + vAmoq .req q18 + vAmuq .req q19 + vAsaq .req q20 + vAseq .req q21 + vAsiq .req q22 + vAsoq .req q23 + vAsuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v30 + C1 .req v29 + C2 .req v28 + C3 .req v27 + C4 .req v26 + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req v26 + E1 .req v25 + E2 .req v29 + E3 .req v28 + E4 .req v27 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vAbi_ .req v2 + vAbo_ .req v3 + vAbu_ .req v4 + vAga_ .req v10 + vAge_ .req v11 + vAgi_ .req v7 + vAgo_ .req v8 + vAgu_ .req v9 + vAka_ .req v15 + vAke_ .req v16 + vAki_ .req v12 + vAko_ .req v13 + vAku_ .req v14 + vAma_ .req v20 + vAme_ .req v21 + vAmi_ .req v17 + vAmo_ .req v18 + vAmu_ .req v19 + vAsa_ .req v0 + vAse_ .req v1 + vAsi_ .req v22 + vAso_ .req v23 + vAsu_ .req v24 + vAba_ .req v30 + vAbe_ .req v27 + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round. 
NOTE(review): sAko and sAko_ alias x18, the AAPCS64 platform register; confirm it is usable on the target OS. */ + s_Aba .req x1 + sAbe .req x6 + sAbi .req x11 + sAbo .req x16 + sAbu .req x21 + sAga .req x2 + sAge .req x7 + sAgi .req x12 + sAgo .req x17 + sAgu .req x22 + sAka .req x3 + sAke .req x8 + sAki .req x13 + sAko .req x18 + sAku .req x23 + sAma .req x4 + sAme .req x9 + sAmi .req x14 + sAmo .req x19 + sAmu .req x24 + sAsa .req x5 + sAse .req x10 + sAsi .req x15 + sAso .req x20 + sAsu .req x25 + + /* sA_[y,2*x+3*y] = rot(sA[x,y]) */ + s_Aba_ .req x0 + sAbe_ .req x28 + sAbi_ .req x11 + sAbo_ .req x16 + sAbu_ .req x21 + sAga_ .req x3 + sAge_ .req x8 + sAgi_ .req x12 + sAgo_ .req x17 + sAgu_ .req x22 + sAka_ .req x4 + sAke_ .req x9 + sAki_ .req x13 + sAko_ .req x18 + sAku_ .req x23 + sAma_ .req x5 + sAme_ .req x10 + sAmi_ .req x14 + sAmo_ .req x19 + sAmu_ .req x24 + sAsa_ .req x1 + sAse_ .req x6 + sAsi_ .req x15 + sAso_ .req x20 + sAsu_ .req x25 + + /* sC[x] = sA[x,0] xor sA[x,1] xor sA[x,2] xor sA[x,3] xor sA[x,4], for x in 0..4 */ + /* sE[x] = sC[x-1] xor rot(sC[x+1],1), for x in 0..4 */ + sC0 .req x0 + sE0 .req x29 + sC1 .req x26 + sE1 .req x30 + sC2 .req x27 + sE2 .req x26 + sC3 .req x28 + sE3 .req x27 + sC4 .req x29 + sE4 .req x28 + + tmp .req x30 + +/************************ MACROS ****************************/ + +/* Macros using v8.4-A SHA-3 instructions. rax1_m1 computes s0 xor ror(vzr xor s1, 63) via XAR, i.e. RAX1 when vzr holds zero; it relies on the aliases tmpp (scratch) and vzr being defined by the caller. */ + +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m1 d s0 s1 + xar_m0 tmpp, vzr, \s1, 63 + eor \d\().16b, \s0\().16b, tmpp.16b +.endm + +.macro xar_m0 d s0 s1 imm + xar \d\().2d, \s0\().2d, \s1\().2d, #\imm +.endm + +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro load_input_vector num idx + ldr vAbaq, [input_addr, #(16*(\num*0+\idx))] + ldr vAbeq, [input_addr, #(16*(\num*1+\idx))] + ldr vAbiq, [input_addr, #(16*(\num*2+\idx))] + ldr vAboq, [input_addr, #(16*(\num*3+\idx))] + ldr vAbuq, [input_addr, #(16*(\num*4+\idx))] + ldr vAgaq, [input_addr, #(16*(\num*5+\idx))] + ldr vAgeq, [input_addr, #(16*(\num*6+\idx))] 
+ ldr vAgiq, [input_addr, #(16*(\num*7+\idx))] + ldr vAgoq, [input_addr, #(16*(\num*8+\idx))] + ldr vAguq, [input_addr, #(16*(\num*9+\idx))] + ldr vAkaq, [input_addr, #(16*(\num*10+\idx))] + ldr vAkeq, [input_addr, #(16*(\num*11+\idx))] + ldr vAkiq, [input_addr, #(16*(\num*12+\idx))] + ldr vAkoq, [input_addr, #(16*(\num*13+\idx))] + ldr vAkuq, [input_addr, #(16*(\num*14+\idx))] + ldr vAmaq, [input_addr, #(16*(\num*15+\idx))] + ldr vAmeq, [input_addr, #(16*(\num*16+\idx))] + ldr vAmiq, [input_addr, #(16*(\num*17+\idx))] + ldr vAmoq, [input_addr, #(16*(\num*18+\idx))] + ldr vAmuq, [input_addr, #(16*(\num*19+\idx))] + ldr vAsaq, [input_addr, #(16*(\num*20+\idx))] + ldr vAseq, [input_addr, #(16*(\num*21+\idx))] + ldr vAsiq, [input_addr, #(16*(\num*22+\idx))] + ldr vAsoq, [input_addr, #(16*(\num*23+\idx))] + ldr vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_vector num idx + str vAbaq, [input_addr, #(16*(\num*0+\idx))] + str vAbeq, [input_addr, #(16*(\num*1+\idx))] + str vAbiq, [input_addr, #(16*(\num*2+\idx))] + str vAboq, [input_addr, #(16*(\num*3+\idx))] + str vAbuq, [input_addr, #(16*(\num*4+\idx))] + str vAgaq, [input_addr, #(16*(\num*5+\idx))] + str vAgeq, [input_addr, #(16*(\num*6+\idx))] + str vAgiq, [input_addr, #(16*(\num*7+\idx))] + str vAgoq, [input_addr, #(16*(\num*8+\idx))] + str vAguq, [input_addr, #(16*(\num*9+\idx))] + str vAkaq, [input_addr, #(16*(\num*10+\idx))] + str vAkeq, [input_addr, #(16*(\num*11+\idx))] + str vAkiq, [input_addr, #(16*(\num*12+\idx))] + str vAkoq, [input_addr, #(16*(\num*13+\idx))] + str vAkuq, [input_addr, #(16*(\num*14+\idx))] + str vAmaq, [input_addr, #(16*(\num*15+\idx))] + str vAmeq, [input_addr, #(16*(\num*16+\idx))] + str vAmiq, [input_addr, #(16*(\num*17+\idx))] + str vAmoq, [input_addr, #(16*(\num*18+\idx))] + str vAmuq, [input_addr, #(16*(\num*19+\idx))] + str vAsaq, [input_addr, #(16*(\num*20+\idx))] + str vAseq, [input_addr, #(16*(\num*21+\idx))] + str vAsiq, [input_addr, #(16*(\num*22+\idx))] 
+ str vAsoq, [input_addr, #(16*(\num*23+\idx))] + str vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_scalar num idx + str s_Aba, [input_addr, 8*(\num*(0) +\idx)] + str sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + str sAbi, [input_addr, 8*(\num*(2)+ \idx)] + str sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + str sAbu, [input_addr, 8*(\num*(4)+ \idx)] + str sAga, [input_addr, 8*(\num*(4+1) +\idx)] + str sAge, [input_addr, 8*(\num*(6)+ \idx)] + str sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + str sAgo, [input_addr, 8*(\num*(8)+ \idx)] + str sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + str sAka, [input_addr, 8*(\num*(10) +\idx)] + str sAke, [input_addr, 8*(\num*(10+1)+\idx)] + str sAki, [input_addr, 8*(\num*(12) +\idx)] + str sAko, [input_addr, 8*(\num*(12+1)+\idx)] + str sAku, [input_addr, 8*(\num*(14) +\idx)] + str sAma, [input_addr, 8*(\num*(14+1)+\idx)] + str sAme, [input_addr, 8*(\num*(16) +\idx)] + str sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + str sAmo, [input_addr, 8*(\num*(18) +\idx)] + str sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + str sAsa, [input_addr, 8*(\num*(20) +\idx)] + str sAse, [input_addr, 8*(\num*(20+1)+\idx)] + str sAsi, [input_addr, 8*(\num*(22) +\idx)] + str sAso, [input_addr, 8*(\num*(22+1)+\idx)] + str sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +.macro load_input_scalar num idx + ldr s_Aba, [input_addr, 8*(\num*(0) +\idx)] + ldr sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + ldr sAbi, [input_addr, 8*(\num*(2)+ \idx)] + ldr sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + ldr sAbu, [input_addr, 8*(\num*(4)+ \idx)] + ldr sAga, [input_addr, 8*(\num*(4+1) +\idx)] + ldr sAge, [input_addr, 8*(\num*(6)+ \idx)] + ldr sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + ldr sAgo, [input_addr, 8*(\num*(8)+ \idx)] + ldr sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + ldr sAka, [input_addr, 8*(\num*(10) +\idx)] + ldr sAke, [input_addr, 8*(\num*(10+1)+\idx)] + ldr sAki, [input_addr, 8*(\num*(12) +\idx)] + ldr sAko, [input_addr, 8*(\num*(12+1)+\idx)] + ldr sAku, 
[input_addr, 8*(\num*(14) +\idx)] + ldr sAma, [input_addr, 8*(\num*(14+1)+\idx)] + ldr sAme, [input_addr, 8*(\num*(16) +\idx)] + ldr sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + ldr sAmo, [input_addr, 8*(\num*(18) +\idx)] + ldr sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + ldr sAsa, [input_addr, 8*(\num*(20) +\idx)] + ldr sAse, [input_addr, 8*(\num*(20+1)+\idx)] + ldr sAsi, [input_addr, 8*(\num*(22) +\idx)] + ldr sAso, [input_addr, 8*(\num*(22+1)+\idx)] + ldr sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +#define STACK_SIZE (8*8 + 16*6 + 3*8 + 8) // VREGS (8*8), GPRs (16*6), count (8), const (8), input (8), padding (8) +#define STACK_BASE_GPRS (3*8+8) +#define STACK_BASE_VREGS (3*8+8+16*6) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro save_vregs + stp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] + stp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + stp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + stp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] +.endm + +.macro restore_vregs + ldp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] + ldp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + ldp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + ldp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] +.endm + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro eor5 dst, src0, src1, 
src2, src3, src4 + eor \dst, \src0, \src1 + eor \dst, \dst, \src2 + eor \dst, \dst, \src3 + eor \dst, \dst, \src4 +.endm + +.macro xor_rol dst, src1, src0, imm + eor \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro bic_rol dst, src1, src0, imm + bic \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro rotate dst, src, imm + ror \dst, \src, #(64-\imm) +.endm + +.macro save reg, offset + str \reg, [sp, #\offset] +.endm + +.macro restore reg, offset + ldr \reg, [sp, #\offset] +.endm + +.macro hybrid_round_initial + + eor sC0, sAma, sAsa SEP eor3_m0 C0, vAba, vAga, vAka + eor sC1, sAme, sAse SEP + eor sC2, sAmi, sAsi SEP + eor sC3, sAmo, sAso SEP eor3_m0 C0, C0, vAma, vAsa + eor sC4, sAmu, sAsu SEP + eor sC0, sAka, sC0 SEP + eor sC1, sAke, sC1 SEP eor3_m0 C1, vAbe, vAge, vAke + eor sC2, sAki, sC2 SEP + eor sC3, sAko, sC3 SEP + eor sC4, sAku, sC4 SEP eor3_m0 C1, C1, vAme, vAse + eor sC0, sAga, sC0 SEP + eor sC1, sAge, sC1 SEP + eor sC2, sAgi, sC2 SEP eor3_m0 C2, vAbi, vAgi, vAki + eor sC3, sAgo, sC3 SEP + eor sC4, sAgu, sC4 SEP + eor sC0, s_Aba, sC0 SEP eor3_m0 C2, C2, vAmi, vAsi + eor sC1, sAbe, sC1 SEP + eor sC2, sAbi, sC2 SEP + eor sC3, sAbo, sC3 SEP eor3_m0 C3, vAbo, vAgo, vAko + eor sC4, sAbu, sC4 SEP + SEP + eor sE1, sC0, sC2, ROR #63 SEP eor3_m0 C3, C3, vAmo, vAso + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP eor3_m0 C4, vAbu, vAgu, vAku + eor sE4, sC3, sC0, ROR #63 SEP + SEP + eor s_Aba_, s_Aba, sE0 SEP eor3_m0 C4, C4, vAmu, vAsu + eor sAsa_, sAbi, sE2 SEP vzr .req v31 + eor sAbi_, sAki, sE2 SEP eor vzr.16b, vzr.16b, vzr.16b // zero + eor sAki_, sAko, sE3 SEP tmpp .req E1 + eor sAko_, sAmu, sE4 SEP rax1_m1 E1, C0, C2 + eor sAmu_, sAso, sE3 SEP .unreq tmpp + eor sAso_, sAma, sE0 SEP tmpp .req C0 + eor sAka_, sAbe, sE1 SEP rax1_m1 E3, C2, C4 + eor sAse_, sAgo, sE3 SEP + eor sAgo_, sAme, sE1 SEP + eor sAke_, sAgi, sE2 SEP rax1_m1 E0, C4, C1 + eor sAgi_, sAka, sE0 SEP + eor sAga_, sAbo, sE3 SEP + eor sAbo_, 
sAmo, sE3 SEP rax1_m1 E2, C1, C3 + eor sAmo_, sAmi, sE2 SEP + eor sAmi_, sAke, sE1 SEP + eor sAge_, sAgu, sE4 SEP rax1_m1 E4, C3, C0 + eor sAgu_, sAsi, sE2 SEP .unreq vzr + eor sAsi_, sAku, sE4 SEP .unreq tmpp + eor sAku_, sAsa, sE0 SEP + eor sAma_, sAbu, sE4 SEP eor vAba_.16b, vAba.16b, E0.16b + eor sAbu_, sAsu, sE4 SEP + eor sAsu_, sAse, sE1 SEP + eor sAme_, sAga, sE0 SEP xar_m0 vAsa_, vAbi, E2, 2 + eor sAbe_, sAge, sE1 SEP + SEP + load_constant_ptr SEP xar_m0 vAbi_, vAki, E2, 21 + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP xar_m0 vAki_, vAko, E3, 39 + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP xar_m0 vAko_, vAmu, E4, 56 + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP xar_m0 vAmu_, vAso, E3, 8 + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP xar_m0 vAso_, vAma, E0, 23 + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP xar_m0 vAka_, vAbe, E1, 63 + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP xar_m0 vAse_, vAgo, E3, 9 + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP xar_m0 vAgo_, vAme, E1, 19 + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP xar_m0 vAke_, vAgi, E2, 58 + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP xar_m0 vAgi_, vAka, E0, 61 + SEP + ldr cur_const, [const_addr] SEP + mov count, #1 SEP xar_m0 vAga_, vAbo, E3, 36 + SEP + bic tmp, sAma_, sAmu_, ROR #35 SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP xar_m0 vAbo_, vAmo, E3, 43 + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP xar_m0 vAmo_, 
vAmi, E2, 49 + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP xar_m0 vAmi_, vAke, E1, 54 + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP xar_m0 vAge_, vAgu, E4, 44 + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP xar_m0 vAgu_, vAsi, E2, 3 + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP xar_m0 vAsi_, vAku, E4, 25 + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP xar_m0 vAku_, vAsa, E0, 46 + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP xar_m0 vAma_, vAbu, E4, 37 + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP xar_m0 vAbu_, vAsu, E4, 50 + SEP + save count, STACK_OFFSET_COUNT SEP + SEP xar_m0 vAsu_, vAse, E1, 62 + eor sC0, sAka, sAsa, ROR #50 SEP + eor sC1, sAse, sAge, ROR #60 SEP + eor sC2, sAmi, sAgi, ROR #59 SEP xar_m0 vAme_, vAga, E0, 28 + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP + eor sC0, sAma, sC0, ROR #49 SEP xar_m0 vAbe_, vAge, E1, 20 + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP restore sE1, STACK_OFFSET_CONST + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP + eor sC0, sAga, sC0, ROR #57 SEP ld1r {v31.2d}, [sE1], #8 + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor sC3, sAko, sC3, ROR #38 SEP save sE1, STACK_OFFSET_CONST + eor sC4, sAgu, sC4, ROR #48 SEP + eor sC0, s_Aba, sC0, ROR #61 SEP bcax_m0 vAga, vAga_, vAgi_, vAge_ + eor sC1, sAke, sC1, ROR #57 SEP + eor sC2, sAsi, sC2, ROR #52 SEP + eor sC3, sAbo, sC3, ROR #63 SEP bcax_m0 vAge, vAge_, vAgo_, vAgi_ + eor sC4, sAku, sC4, ROR #50 SEP + ror sC1, sC1, 56 SEP + ror sC4, sC4, 58 SEP bcax_m0 vAgi, vAgi_, vAgu_, vAgo_ + ror sC2, 
sC2, 62 SEP + SEP + eor sE1, sC0, sC2, ROR #63 SEP bcax_m0 vAgo, vAgo_, vAga_, vAgu_ + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP bcax_m0 vAgu, vAgu_, vAge_, vAga_ + eor sE4, sC3, sC0, ROR #63 SEP + SEP + eor s_Aba_, sE0, s_Aba SEP bcax_m0 vAka, vAka_, vAki_, vAke_ + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP + eor sAki_, sE3, sAko, ROR #63 SEP bcax_m0 vAke, vAke_, vAko_, vAki_ + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP + eor sAso_, sE0, sAma, ROR #54 SEP bcax_m0 vAki, vAki_, vAku_, vAko_ + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP + eor sAgo_, sE1, sAme, ROR #49 SEP bcax_m0 vAko, vAko_, vAka_, vAku_ + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, ROR #39 SEP + eor sAga_, sE3, sAbo SEP bcax_m0 vAku, vAku_, vAke_, vAka_ + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP + eor sAmi_, sE1, sAke, ROR #56 SEP bcax_m0 vAma, vAma_, vAmi_, vAme_ + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP + eor sAsi_, sE4, sAku, ROR #58 SEP bcax_m0 vAme, vAme_, vAmo_, vAmi_ + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP + eor sAbu_, sE4, sAsu, ROR #9 SEP bcax_m0 vAmi, vAmi_, vAmu_, vAmo_ + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP + eor sAbe_, sE1, sAge, ROR #19 SEP bcax_m0 vAmo, vAmo_, vAma_, vAmu_ + SEP + load_constant_ptr SEP + restore count, STACK_OFFSET_COUNT SEP bcax_m0 vAmu, vAmu_, vAme_, vAma_ + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP bcax_m0 vAsa, vAsa_, vAsi_, vAse_ + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP bcax_m0 vAse, vAse_, vAso_, vAsi_ + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP bcax_m0 vAsi, vAsi_, vAsu_, vAso_ + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, 
tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP bcax_m0 vAso, vAso_, vAsa_, vAsu_ + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP bcax_m0 vAsu, vAsu_, vAse_, vAsa_ + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP bcax_m0 vAba, vAba_, vAbi_, vAbe_ + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP bcax_m0 vAbe, vAbe_, vAbo_, vAbi_ + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP bcax_m0 vAbi, vAbi_, vAbu_, vAbo_ + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP bcax_m0 vAbo, vAbo_, vAba_, vAbu_ + bic tmp, sAma_, sAmu_, ROR #35 SEP + SEP + ldr cur_const, [const_addr, count, UXTW #3] SEP bcax_m0 vAbu, vAbu_, vAbe_, vAba_ + SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP eor vAba.16b, vAba.16b, v31.16b + bic tmp, sAsi_, sAse_, ROR #48 SEP + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + add count, count, #1 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP + SEP +.endm + +.macro hybrid_round_noninitial + save count, STACK_OFFSET_COUNT SEP eor3_m0 C0, vAba, vAga, 
vAka + SEP + eor sC0, sAka, sAsa, ROR #50 SEP + eor sC1, sAse, sAge, ROR #60 SEP eor3_m0 C0, C0, vAma, vAsa + eor sC2, sAmi, sAgi, ROR #59 SEP + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP eor3_m0 C1, vAbe, vAge, vAke + eor sC0, sAma, sC0, ROR #49 SEP + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP eor3_m0 C1, C1, vAme, vAse + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP + eor sC0, sAga, sC0, ROR #57 SEP eor3_m0 C2, vAbi, vAgi, vAki + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor sC3, sAko, sC3, ROR #38 SEP eor3_m0 C2, C2, vAmi, vAsi + eor sC4, sAgu, sC4, ROR #48 SEP + eor sC0, s_Aba, sC0, ROR #61 SEP + eor sC1, sAke, sC1, ROR #57 SEP eor3_m0 C3, vAbo, vAgo, vAko + eor sC2, sAsi, sC2, ROR #52 SEP + eor sC3, sAbo, sC3, ROR #63 SEP + eor sC4, sAku, sC4, ROR #50 SEP eor3_m0 C3, C3, vAmo, vAso + ror sC1, sC1, 56 SEP + ror sC4, sC4, 58 SEP + ror sC2, sC2, 62 SEP eor3_m0 C4, vAbu, vAgu, vAku + SEP + eor sE1, sC0, sC2, ROR #63 SEP + eor sE3, sC2, sC4, ROR #63 SEP eor3_m0 C4, C4, vAmu, vAsu + eor sE0, sC4, sC1, ROR #63 SEP // C4 parity is complete; repeating the eor3 would cancel vAmu/vAsu + vzr .req v31 + eor sE2, sC1, sC3, ROR #63 SEP eor vzr.16b, vzr.16b, vzr.16b // zero + eor sE4, sC3, sC0, ROR #63 SEP tmpp .req E1 + SEP rax1_m1 E1, C0, C2 + eor s_Aba_, sE0, s_Aba SEP .unreq tmpp + eor sAsa_, sE2, sAbi, ROR #50 SEP tmpp .req C0 + eor sAbi_, sE2, sAki, ROR #46 SEP rax1_m1 E3, C2, C4 + eor sAki_, sE3, sAko, ROR #63 SEP + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP rax1_m1 E0, C4, C1 + eor sAso_, sE0, sAma, ROR #54 SEP + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP rax1_m1 E2, C1, C3 + eor sAgo_, sE1, sAme, ROR #49 SEP + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, ROR #39 SEP rax1_m1 E4, C3, C0 + eor sAga_, sE3, sAbo SEP .unreq vzr + eor sAbo_, sE3, sAmo, ROR #37 SEP .unreq tmpp + eor sAmo_, sE2, sAmi, ROR #8 SEP + eor sAmi_, sE1, sAke, ROR #56 SEP eor 
vAba_.16b, vAba.16b, E0.16b + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP + eor sAsi_, sE4, sAku, ROR #58 SEP xar_m0 vAsa_, vAbi, E2, 2 + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP + eor sAbu_, sE4, sAsu, ROR #9 SEP xar_m0 vAbi_, vAki, E2, 21 + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP + eor sAbe_, sE1, sAge, ROR #19 SEP xar_m0 vAki_, vAko, E3, 39 + SEP + load_constant_ptr SEP + restore count, STACK_OFFSET_COUNT SEP xar_m0 vAko_, vAmu, E4, 56 + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP xar_m0 vAmu_, vAso, E3, 8 + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP xar_m0 vAso_, vAma, E0, 23 + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP xar_m0 vAka_, vAbe, E1, 63 + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP xar_m0 vAse_, vAgo, E3, 9 + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP xar_m0 vAgo_, vAme, E1, 19 + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP xar_m0 vAke_, vAgi, E2, 58 + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP xar_m0 vAgi_, vAka, E0, 61 + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP xar_m0 vAga_, vAbo, E3, 36 + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP xar_m0 vAbo_, vAmo, E3, 43 + bic tmp, sAma_, sAmu_, ROR #35 SEP + SEP + ldr cur_const, [const_addr, count, UXTW #3] SEP xar_m0 vAmo_, vAmi, E2, 49 + add count, count, #1 SEP + SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP xar_m0 vAmi_, vAke, E1, 54 + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, 
ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP xar_m0 vAge_, vAgu, E4, 44 + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP xar_m0 vAgu_, vAsi, E2, 3 + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP xar_m0 vAsi_, vAku, E4, 25 + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP xar_m0 vAku_, vAsa, E0, 46 + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP xar_m0 vAma_, vAbu, E4, 37 + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP xar_m0 vAbu_, vAsu, E4, 50 + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP xar_m0 vAsu_, vAse, E1, 62 + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP xar_m0 vAme_, vAga, E0, 28 + save count, STACK_OFFSET_COUNT SEP + SEP + eor sC0, sAka, sAsa, ROR #50 SEP xar_m0 vAbe_, vAge, E1, 20 + eor sC1, sAse, sAge, ROR #60 SEP + eor sC2, sAmi, sAgi, ROR #59 SEP + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP restore sE1, STACK_OFFSET_CONST + eor sC0, sAma, sC0, ROR #49 SEP + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP ld1r {v31.2d}, [sE1], #8 + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP + eor sC0, sAga, sC0, ROR #57 SEP save sE1, STACK_OFFSET_CONST + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor sC3, sAko, sC3, ROR #38 SEP + eor sC4, sAgu, sC4, ROR #48 SEP bcax_m0 vAga, vAga_, vAgi_, vAge_ + eor sC0, s_Aba, sC0, ROR #61 SEP + eor sC1, sAke, sC1, ROR #57 SEP + eor sC2, sAsi, sC2, ROR #52 SEP bcax_m0 vAge, vAge_, vAgo_, vAgi_ + eor sC3, sAbo, sC3, ROR #63 SEP + eor sC4, sAku, sC4, ROR #50 SEP + ror sC1, sC1, 56 SEP bcax_m0 vAgi, vAgi_, vAgu_, vAgo_ + ror sC4, sC4, 58 SEP + ror 
sC2, sC2, 62 SEP + SEP bcax_m0 vAgo, vAgo_, vAga_, vAgu_ + eor sE1, sC0, sC2, ROR #63 SEP + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP bcax_m0 vAgu, vAgu_, vAge_, vAga_ + eor sE2, sC1, sC3, ROR #63 SEP + eor sE4, sC3, sC0, ROR #63 SEP + SEP bcax_m0 vAka, vAka_, vAki_, vAke_ + eor s_Aba_, sE0, s_Aba SEP + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP bcax_m0 vAke, vAke_, vAko_, vAki_ + eor sAki_, sE3, sAko, ROR #63 SEP + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP bcax_m0 vAki, vAki_, vAku_, vAko_ + eor sAso_, sE0, sAma, ROR #54 SEP + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP bcax_m0 vAko, vAko_, vAka_, vAku_ + eor sAgo_, sE1, sAme, ROR #49 SEP + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, ROR #39 SEP bcax_m0 vAku, vAku_, vAke_, vAka_ + eor sAga_, sE3, sAbo SEP + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP bcax_m0 vAma, vAma_, vAmi_, vAme_ + eor sAmi_, sE1, sAke, ROR #56 SEP + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP bcax_m0 vAme, vAme_, vAmo_, vAmi_ + eor sAsi_, sE4, sAku, ROR #58 SEP + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP bcax_m0 vAmi, vAmi_, vAmu_, vAmo_ + eor sAbu_, sE4, sAsu, ROR #9 SEP + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP bcax_m0 vAmo, vAmo_, vAma_, vAmu_ + eor sAbe_, sE1, sAge, ROR #19 SEP + SEP + load_constant_ptr SEP bcax_m0 vAmu, vAmu_, vAme_, vAma_ + restore count, STACK_OFFSET_COUNT SEP + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP bcax_m0 vAsa, vAsa_, vAsi_, vAse_ + eor sAga, tmp, sAga_, ROR #39 SEP + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP bcax_m0 vAse, vAse_, vAso_, vAsi_ + bic tmp, sAgu_, sAgo_, ROR #16 SEP + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP bcax_m0 vAsi, vAsi_, vAsu_, vAso_ + eor sAgo, tmp, sAgo_, ROR #47 SEP + bic tmp, sAge_, sAga_, ROR #56 SEP + eor 
sAgu, tmp, sAgu_, ROR #23 SEP bcax_m0 vAso, vAso_, vAsa_, vAsu_ + bic tmp, sAki_, sAke_, ROR #19 SEP + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP bcax_m0 vAsu, vAsu_, vAse_, vAsa_ + eor sAke, tmp, sAke_, ROR #2 SEP + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP bcax_m0 vAba, vAba_, vAbi_, vAbe_ + bic tmp, sAka_, sAku_, ROR #47 SEP + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP bcax_m0 vAbe, vAbe_, vAbo_, vAbi_ + eor sAku, tmp, sAku_, ROR #52 SEP + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP bcax_m0 vAbi, vAbi_, vAbu_, vAbo_ + bic tmp, sAmo_, sAmi_, ROR #5 SEP + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP bcax_m0 vAbo, vAbo_, vAba_, vAbu_ + eor sAmi, tmp, sAmi_, ROR #46 SEP + bic tmp, sAma_, sAmu_, ROR #35 SEP + SEP bcax_m0 vAbu, vAbu_, vAbe_, vAba_ + ldr cur_const, [const_addr, count, UXTW #3] SEP + add count, count, #1 SEP + SEP eor vAba.16b, vAba.16b, v31.16b + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP + +.endm + +.macro final_rotate + ror sAga, sAga,#(64-3) + ror sAka, sAka,#(64-25) + ror sAma, sAma,#(64-10) + 
ror sAsa, sAsa,#(64-39) + ror sAbe, sAbe,#(64-21) + ror sAge, sAge,#(64-45) + ror sAke, sAke,#(64-8) + ror sAme, sAme,#(64-15) + ror sAse, sAse,#(64-41) + ror sAbi, sAbi,#(64-14) + ror sAgi, sAgi,#(64-61) + ror sAki, sAki,#(64-18) + ror sAmi, sAmi,#(64-56) + ror sAsi, sAsi,#(64-2) + ror sAgo, sAgo,#(64-28) + ror sAko, sAko,#(64-1) + ror sAmo, sAmo,#(64-27) + ror sAso, sAso,#(64-62) + ror sAbu, sAbu,#(64-44) + ror sAgu, sAgu,#(64-20) + ror sAku, sAku,#(64-6) + ror sAmu, sAmu,#(64-36) + ror sAsu, sAsu,#(64-55) +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.global keccak_f1600_x4_hybrid_asm_v2p0 +.global _keccak_f1600_x4_hybrid_asm_v2p0 +.text +.align 4 + +keccak_f1600_x4_hybrid_asm_v2p0: +_keccak_f1600_x4_hybrid_asm_v2p0: + alloc_stack + save_gprs + save_vregs + save input_addr, STACK_OFFSET_INPUT + + load_input_vector 2,1 + + load_constant_ptr + save const_addr, STACK_OFFSET_CONST + + // First scalar Keccak computation alongside first half of SIMD computation + load_input_scalar 4,0 + hybrid_round_initial + loop_0: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-1) + ble loop_0 + final_rotate + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4,0 + + // Second scalar Keccak computation alongside second half of SIMD computation + load_input_scalar 4,1 + hybrid_round_initial + loop_1: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-1) + ble loop_1 + final_rotate + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4, 1 + + store_input_vector 2,1 + + restore_vregs + restore_gprs + free_stack + ret + +#endif diff --git a/tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v3.s b/tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v3.s new file mode 100644 index 0000000..44795aa --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v3.s @@ -0,0 +1,1015 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, 
free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x29 + count .req w27 + cur_const .req x26 + + /* Mapping of Keccak-f1600 SIMD state to vector registers + * at the beginning and end of each round. 
*/ + + vAba .req v0 + vAbe .req v1 + vAbi .req v2 + vAbo .req v3 + vAbu .req v4 + vAga .req v5 + vAge .req v6 + vAgi .req v7 + vAgo .req v8 + vAgu .req v9 + vAka .req v10 + vAke .req v11 + vAki .req v12 + vAko .req v13 + vAku .req v14 + vAma .req v15 + vAme .req v16 + vAmi .req v17 + vAmo .req v18 + vAmu .req v19 + vAsa .req v20 + vAse .req v21 + vAsi .req v22 + vAso .req v23 + vAsu .req v24 + + /* q-form of the above mapping */ + vAbaq .req q0 + vAbeq .req q1 + vAbiq .req q2 + vAboq .req q3 + vAbuq .req q4 + vAgaq .req q5 + vAgeq .req q6 + vAgiq .req q7 + vAgoq .req q8 + vAguq .req q9 + vAkaq .req q10 + vAkeq .req q11 + vAkiq .req q12 + vAkoq .req q13 + vAkuq .req q14 + vAmaq .req q15 + vAmeq .req q16 + vAmiq .req q17 + vAmoq .req q18 + vAmuq .req q19 + vAsaq .req q20 + vAseq .req q21 + vAsiq .req q22 + vAsoq .req q23 + vAsuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v30 + C1 .req v29 + C2 .req v28 + C3 .req v27 + C4 .req v26 + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req v26 + E1 .req v25 + E2 .req v29 + E3 .req v28 + E4 .req v27 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vAbi_ .req v2 + vAbo_ .req v3 + vAbu_ .req v4 + vAga_ .req v10 + vAge_ .req v11 + vAgi_ .req v7 + vAgo_ .req v8 + vAgu_ .req v9 + vAka_ .req v15 + vAke_ .req v16 + vAki_ .req v12 + vAko_ .req v13 + vAku_ .req v14 + vAma_ .req v20 + vAme_ .req v21 + vAmi_ .req v17 + vAmo_ .req v18 + vAmu_ .req v19 + vAsa_ .req v0 + vAse_ .req v1 + vAsi_ .req v22 + vAso_ .req v23 + vAsu_ .req v24 + vAba_ .req v30 + vAbe_ .req v27 + + /* Scratch register clobbered by the rax1_m1, xar_m1 and bcax_m1 macros */ + vtmp .req v31 + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round. 
*/ + s_Aba .req x1 + sAbe .req x6 + sAbi .req x11 + sAbo .req x16 + sAbu .req x21 + sAga .req x2 + sAge .req x7 + sAgi .req x12 + sAgo .req x17 + sAgu .req x22 + sAka .req x3 + sAke .req x8 + sAki .req x13 + sAko .req x18 + sAku .req x23 + sAma .req x4 + sAme .req x9 + sAmi .req x14 + sAmo .req x19 + sAmu .req x24 + sAsa .req x5 + sAse .req x10 + sAsi .req x15 + sAso .req x20 + sAsu .req x25 + + /* sA_[y,2*x+3*y] = rot(A[x,y]) */ + s_Aba_ .req x0 + sAbe_ .req x28 + sAbi_ .req x11 + sAbo_ .req x16 + sAbu_ .req x21 + sAga_ .req x3 + sAge_ .req x8 + sAgi_ .req x12 + sAgo_ .req x17 + sAgu_ .req x22 + sAka_ .req x4 + sAke_ .req x9 + sAki_ .req x13 + sAko_ .req x18 + sAku_ .req x23 + sAma_ .req x5 + sAme_ .req x10 + sAmi_ .req x14 + sAmo_ .req x19 + sAmu_ .req x24 + sAsa_ .req x1 + sAse_ .req x6 + sAsi_ .req x15 + sAso_ .req x20 + sAsu_ .req x25 + + /* sC[x] = sA[x,0] xor sA[x,1] xor sA[x,2] xor sA[x,3] xor sA[x,4], for x in 0..4 */ + /* sE[x] = sC[x-1] xor rot(sC[x+1],1), for x in 0..4 */ + sC0 .req x0 + sE0 .req x29 + sC1 .req x26 + sE1 .req x30 + sC2 .req x27 + sE2 .req x26 + sC3 .req x28 + sE3 .req x27 + sC4 .req x29 + sE4 .req x28 + + tmp .req x30 + +/************************ MACROS ****************************/ + +/* SHA-3 vector primitives: the _m1 variants emulate them with eor/add/shl/sri/bic via vtmp, the _m0 variants use the v8.4-A SHA-3 instructions */ + + +.macro eor3_m1 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro rax1_m1 d s0 s1 + add vtmp.2d, \s1\().2d, \s1\().2d + sri vtmp.2d, \s1\().2d, #63 + eor \d\().16b, vtmp.16b, \s0\().16b +.endm + +.macro xar_m1 d s0 s1 imm + eor vtmp.16b, \s0\().16b, \s1\().16b + shl \d\().2d, vtmp.2d, #(64-\imm) + sri \d\().2d, vtmp.2d, #(\imm) +.endm + +.macro bcax_m1 d s0 s1 s2 + bic vtmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, vtmp.16b, \s0\().16b + .endm + + +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm + xar \d\().2d, 
\s0\().2d, \s1\().2d, #\imm +.endm + +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + + +.macro load_input_vector num idx + ldr vAbaq, [input_addr, #(16*(\num*0+\idx))] + ldr vAbeq, [input_addr, #(16*(\num*1+\idx))] + ldr vAbiq, [input_addr, #(16*(\num*2+\idx))] + ldr vAboq, [input_addr, #(16*(\num*3+\idx))] + ldr vAbuq, [input_addr, #(16*(\num*4+\idx))] + ldr vAgaq, [input_addr, #(16*(\num*5+\idx))] + ldr vAgeq, [input_addr, #(16*(\num*6+\idx))] + ldr vAgiq, [input_addr, #(16*(\num*7+\idx))] + ldr vAgoq, [input_addr, #(16*(\num*8+\idx))] + ldr vAguq, [input_addr, #(16*(\num*9+\idx))] + ldr vAkaq, [input_addr, #(16*(\num*10+\idx))] + ldr vAkeq, [input_addr, #(16*(\num*11+\idx))] + ldr vAkiq, [input_addr, #(16*(\num*12+\idx))] + ldr vAkoq, [input_addr, #(16*(\num*13+\idx))] + ldr vAkuq, [input_addr, #(16*(\num*14+\idx))] + ldr vAmaq, [input_addr, #(16*(\num*15+\idx))] + ldr vAmeq, [input_addr, #(16*(\num*16+\idx))] + ldr vAmiq, [input_addr, #(16*(\num*17+\idx))] + ldr vAmoq, [input_addr, #(16*(\num*18+\idx))] + ldr vAmuq, [input_addr, #(16*(\num*19+\idx))] + ldr vAsaq, [input_addr, #(16*(\num*20+\idx))] + ldr vAseq, [input_addr, #(16*(\num*21+\idx))] + ldr vAsiq, [input_addr, #(16*(\num*22+\idx))] + ldr vAsoq, [input_addr, #(16*(\num*23+\idx))] + ldr vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_vector num idx + str vAbaq, [input_addr, #(16*(\num*0+\idx))] + str vAbeq, [input_addr, #(16*(\num*1+\idx))] + str vAbiq, [input_addr, #(16*(\num*2+\idx))] + str vAboq, [input_addr, #(16*(\num*3+\idx))] + str vAbuq, [input_addr, #(16*(\num*4+\idx))] + str vAgaq, [input_addr, #(16*(\num*5+\idx))] + str vAgeq, [input_addr, #(16*(\num*6+\idx))] + str vAgiq, [input_addr, #(16*(\num*7+\idx))] + str vAgoq, [input_addr, #(16*(\num*8+\idx))] + str vAguq, [input_addr, #(16*(\num*9+\idx))] + str vAkaq, [input_addr, #(16*(\num*10+\idx))] + str vAkeq, [input_addr, #(16*(\num*11+\idx))] + str vAkiq, [input_addr, 
#(16*(\num*12+\idx))] + str vAkoq, [input_addr, #(16*(\num*13+\idx))] + str vAkuq, [input_addr, #(16*(\num*14+\idx))] + str vAmaq, [input_addr, #(16*(\num*15+\idx))] + str vAmeq, [input_addr, #(16*(\num*16+\idx))] + str vAmiq, [input_addr, #(16*(\num*17+\idx))] + str vAmoq, [input_addr, #(16*(\num*18+\idx))] + str vAmuq, [input_addr, #(16*(\num*19+\idx))] + str vAsaq, [input_addr, #(16*(\num*20+\idx))] + str vAseq, [input_addr, #(16*(\num*21+\idx))] + str vAsiq, [input_addr, #(16*(\num*22+\idx))] + str vAsoq, [input_addr, #(16*(\num*23+\idx))] + str vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_scalar num idx + str s_Aba, [input_addr, 8*(\num*(0) +\idx)] + str sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + str sAbi, [input_addr, 8*(\num*(2)+ \idx)] + str sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + str sAbu, [input_addr, 8*(\num*(4)+ \idx)] + str sAga, [input_addr, 8*(\num*(4+1) +\idx)] + str sAge, [input_addr, 8*(\num*(6)+ \idx)] + str sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + str sAgo, [input_addr, 8*(\num*(8)+ \idx)] + str sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + str sAka, [input_addr, 8*(\num*(10) +\idx)] + str sAke, [input_addr, 8*(\num*(10+1)+\idx)] + str sAki, [input_addr, 8*(\num*(12) +\idx)] + str sAko, [input_addr, 8*(\num*(12+1)+\idx)] + str sAku, [input_addr, 8*(\num*(14) +\idx)] + str sAma, [input_addr, 8*(\num*(14+1)+\idx)] + str sAme, [input_addr, 8*(\num*(16) +\idx)] + str sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + str sAmo, [input_addr, 8*(\num*(18) +\idx)] + str sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + str sAsa, [input_addr, 8*(\num*(20) +\idx)] + str sAse, [input_addr, 8*(\num*(20+1)+\idx)] + str sAsi, [input_addr, 8*(\num*(22) +\idx)] + str sAso, [input_addr, 8*(\num*(22+1)+\idx)] + str sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +.macro load_input_scalar num idx + ldr s_Aba, [input_addr, 8*(\num*(0) +\idx)] + ldr sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + ldr sAbi, [input_addr, 8*(\num*(2)+ \idx)] + ldr sAbo, 
[input_addr, 8*(\num*(2+1) +\idx)] + ldr sAbu, [input_addr, 8*(\num*(4)+ \idx)] + ldr sAga, [input_addr, 8*(\num*(4+1) +\idx)] + ldr sAge, [input_addr, 8*(\num*(6)+ \idx)] + ldr sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + ldr sAgo, [input_addr, 8*(\num*(8)+ \idx)] + ldr sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + ldr sAka, [input_addr, 8*(\num*(10) +\idx)] + ldr sAke, [input_addr, 8*(\num*(10+1)+\idx)] + ldr sAki, [input_addr, 8*(\num*(12) +\idx)] + ldr sAko, [input_addr, 8*(\num*(12+1)+\idx)] + ldr sAku, [input_addr, 8*(\num*(14) +\idx)] + ldr sAma, [input_addr, 8*(\num*(14+1)+\idx)] + ldr sAme, [input_addr, 8*(\num*(16) +\idx)] + ldr sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + ldr sAmo, [input_addr, 8*(\num*(18) +\idx)] + ldr sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + ldr sAsa, [input_addr, 8*(\num*(20) +\idx)] + ldr sAse, [input_addr, 8*(\num*(20+1)+\idx)] + ldr sAsi, [input_addr, 8*(\num*(22) +\idx)] + ldr sAso, [input_addr, 8*(\num*(22+1)+\idx)] + ldr sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +#define STACK_SIZE (8*8 + 16*6 + 3*8 + 8) // VREGS (8*8), GPRs (16*6), count (8), const (8), input (8), padding (8) +#define STACK_BASE_GPRS (3*8+8) +#define STACK_BASE_VREGS (3*8+8+16*6) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro save_vregs + stp d8, d9, 
[sp,#(STACK_BASE_VREGS+0*16)] + stp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + stp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + stp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] +.endm + +.macro restore_vregs + ldp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] + ldp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + ldp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + ldp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] +.endm + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro eor5 dst, src0, src1, src2, src3, src4 + eor \dst, \src0, \src1 + eor \dst, \dst, \src2 + eor \dst, \dst, \src3 + eor \dst, \dst, \src4 +.endm + +.macro xor_rol dst, src1, src0, imm + eor \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro bic_rol dst, src1, src0, imm + bic \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro rotate dst, src, imm + ror \dst, \src, #(64-\imm) +.endm + +.macro save reg, offset + str \reg, [sp, #\offset] +.endm + +.macro restore reg, offset + ldr \reg, [sp, #\offset] +.endm + +.macro hybrid_round_initial + + eor sC0, sAma, sAsa SEP eor3_m1 C0, vAba, vAga, vAka + eor sC1, sAme, sAse SEP + eor sC2, sAmi, sAsi SEP + eor sC3, sAmo, sAso SEP eor3_m1 C0, C0, vAma, vAsa + eor sC4, sAmu, sAsu SEP + eor sC0, sAka, sC0 SEP + eor sC1, sAke, sC1 SEP eor3_m1 C1, vAbe, vAge, vAke + eor sC2, sAki, sC2 SEP + eor sC3, sAko, sC3 SEP + eor sC4, sAku, sC4 SEP eor3_m1 C1, C1, vAme, vAse + eor sC0, sAga, sC0 SEP + eor sC1, sAge, sC1 SEP + eor sC2, sAgi, sC2 SEP eor3_m1 C2, vAbi, vAgi, vAki + eor sC3, sAgo, sC3 SEP + eor sC4, sAgu, sC4 SEP + eor sC0, s_Aba, sC0 SEP eor3_m1 C2, C2, vAmi, vAsi + eor sC1, sAbe, sC1 SEP + eor sC2, sAbi, sC2 SEP + eor sC3, sAbo, sC3 SEP eor3_m1 C3, vAbo, vAgo, vAko + eor sC4, sAbu, sC4 SEP + SEP + eor sE1, sC0, sC2, ROR #63 SEP eor3_m1 C3, C3, vAmo, vAso + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP eor3_m1 C4, vAbu, vAgu, vAku + eor sE4, sC3, sC0, ROR #63 SEP + SEP + eor 
s_Aba_, s_Aba, sE0 SEP eor3_m1 C4, C4, vAmu, vAsu + eor sAsa_, sAbi, sE2 SEP + eor sAbi_, sAki, sE2 SEP + eor sAki_, sAko, sE3 SEP + eor sAko_, sAmu, sE4 SEP rax1_m1 E1, C0, C2 + eor sAmu_, sAso, sE3 SEP + eor sAso_, sAma, sE0 SEP + eor sAka_, sAbe, sE1 SEP rax1_m1 E3, C2, C4 + eor sAse_, sAgo, sE3 SEP + eor sAgo_, sAme, sE1 SEP + eor sAke_, sAgi, sE2 SEP rax1_m1 E0, C4, C1 + eor sAgi_, sAka, sE0 SEP + eor sAga_, sAbo, sE3 SEP + eor sAbo_, sAmo, sE3 SEP rax1_m1 E2, C1, C3 + eor sAmo_, sAmi, sE2 SEP + eor sAmi_, sAke, sE1 SEP + eor sAge_, sAgu, sE4 SEP rax1_m1 E4, C3, C0 + eor sAgu_, sAsi, sE2 SEP + eor sAsi_, sAku, sE4 SEP + eor sAku_, sAsa, sE0 SEP + eor sAma_, sAbu, sE4 SEP eor vAba_.16b, vAba.16b, E0.16b + eor sAbu_, sAsu, sE4 SEP + eor sAsu_, sAse, sE1 SEP + eor sAme_, sAga, sE0 SEP xar_m1 vAsa_, vAbi, E2, 2 + eor sAbe_, sAge, sE1 SEP + SEP + load_constant_ptr SEP xar_m1 vAbi_, vAki, E2, 21 + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP xar_m1 vAki_, vAko, E3, 39 + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP xar_m1 vAko_, vAmu, E4, 56 + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP xar_m1 vAmu_, vAso, E3, 8 + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP xar_m1 vAso_, vAma, E0, 23 + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP xar_m1 vAka_, vAbe, E1, 63 + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP xar_m1 vAse_, vAgo, E3, 9 + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP xar_m1 vAgo_, vAme, E1, 19 + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP xar_m1 vAke_, vAgi, E2, 58 + eor sAme, tmp, sAme_, ROR #43 
SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP xar_m1 vAgi_, vAka, E0, 61 + SEP + ldr cur_const, [const_addr] SEP + mov count, #1 SEP xar_m1 vAga_, vAbo, E3, 36 + SEP + bic tmp, sAma_, sAmu_, ROR #35 SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP xar_m1 vAbo_, vAmo, E3, 43 + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP xar_m1 vAmo_, vAmi, E2, 49 + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP xar_m1 vAmi_, vAke, E1, 54 + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP xar_m1 vAge_, vAgu, E4, 44 + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP xar_m1 vAgu_, vAsi, E2, 3 + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP xar_m1 vAsi_, vAku, E4, 25 + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP xar_m1 vAku_, vAsa, E0, 46 + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP xar_m1 vAma_, vAbu, E4, 37 + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP xar_m1 vAbu_, vAsu, E4, 50 + SEP + save count, STACK_OFFSET_COUNT SEP + SEP xar_m1 vAsu_, vAse, E1, 62 + eor sC0, sAka, sAsa, ROR #50 SEP + eor sC1, sAse, sAge, ROR #60 SEP + eor sC2, sAmi, sAgi, ROR #59 SEP xar_m1 vAme_, vAga, E0, 28 + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP + eor sC0, sAma, sC0, ROR #49 SEP xar_m1 vAbe_, vAge, E1, 20 + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP restore sE1, STACK_OFFSET_CONST + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP + eor sC0, sAga, sC0, ROR #57 SEP ld1r {v28.2d}, [sE1], #8 + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor 
sC3, sAko, sC3, ROR #38 SEP save sE1, STACK_OFFSET_CONST + eor sC4, sAgu, sC4, ROR #48 SEP + eor sC0, s_Aba, sC0, ROR #61 SEP bcax_m1 vAga, vAga_, vAgi_, vAge_ + eor sC1, sAke, sC1, ROR #57 SEP + eor sC2, sAsi, sC2, ROR #52 SEP + eor sC3, sAbo, sC3, ROR #63 SEP bcax_m1 vAge, vAge_, vAgo_, vAgi_ + eor sC4, sAku, sC4, ROR #50 SEP + ror sC1, sC1, 56 SEP + ror sC4, sC4, 58 SEP bcax_m1 vAgi, vAgi_, vAgu_, vAgo_ + ror sC2, sC2, 62 SEP + SEP + eor sE1, sC0, sC2, ROR #63 SEP bcax_m1 vAgo, vAgo_, vAga_, vAgu_ + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP bcax_m1 vAgu, vAgu_, vAge_, vAga_ + eor sE4, sC3, sC0, ROR #63 SEP + SEP + eor s_Aba_, sE0, s_Aba SEP bcax_m1 vAka, vAka_, vAki_, vAke_ + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP + eor sAki_, sE3, sAko, ROR #63 SEP bcax_m1 vAke, vAke_, vAko_, vAki_ + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP + eor sAso_, sE0, sAma, ROR #54 SEP bcax_m1 vAki, vAki_, vAku_, vAko_ + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP + eor sAgo_, sE1, sAme, ROR #49 SEP bcax_m1 vAko, vAko_, vAka_, vAku_ + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, ROR #39 SEP + eor sAga_, sE3, sAbo SEP bcax_m1 vAku, vAku_, vAke_, vAka_ + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP + eor sAmi_, sE1, sAke, ROR #56 SEP bcax_m1 vAma, vAma_, vAmi_, vAme_ + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP + eor sAsi_, sE4, sAku, ROR #58 SEP bcax_m1 vAme, vAme_, vAmo_, vAmi_ + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP + eor sAbu_, sE4, sAsu, ROR #9 SEP bcax_m1 vAmi, vAmi_, vAmu_, vAmo_ + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP + eor sAbe_, sE1, sAge, ROR #19 SEP bcax_m1 vAmo, vAmo_, vAma_, vAmu_ + SEP + load_constant_ptr SEP + restore count, STACK_OFFSET_COUNT SEP bcax_m1 vAmu, vAmu_, vAme_, vAma_ + SEP + bic tmp, sAgi_, sAge_, 
ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP bcax_m1 vAsa, vAsa_, vAsi_, vAse_ + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP bcax_m1 vAse, vAse_, vAso_, vAsi_ + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP bcax_m1 vAsi, vAsi_, vAsu_, vAso_ + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP bcax_m1 vAso, vAso_, vAsa_, vAsu_ + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP bcax_m1 vAsu, vAsu_, vAse_, vAsa_ + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP bcax_m1 vAba, vAba_, vAbi_, vAbe_ + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP bcax_m1 vAbe, vAbe_, vAbo_, vAbi_ + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP bcax_m1 vAbi, vAbi_, vAbu_, vAbo_ + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP bcax_m1 vAbo, vAbo_, vAba_, vAbu_ + bic tmp, sAma_, sAmu_, ROR #35 SEP + SEP + ldr cur_const, [const_addr, count, UXTW #3] SEP bcax_m1 vAbu, vAbu_, vAbe_, vAba_ + SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP eor vAba.16b, vAba.16b, v28.16b + bic tmp, sAsi_, sAse_, ROR #48 SEP + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP + eor sAbe, tmp, sAbe_, 
ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + add count, count, #1 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP + SEP +.endm + +.macro hybrid_round_noninitial + save count, STACK_OFFSET_COUNT SEP eor3_m1 C0, vAba, vAga, vAka + SEP + eor sC0, sAka, sAsa, ROR #50 SEP + eor sC1, sAse, sAge, ROR #60 SEP eor3_m1 C0, C0, vAma, vAsa + eor sC2, sAmi, sAgi, ROR #59 SEP + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP eor3_m1 C1, vAbe, vAge, vAke + eor sC0, sAma, sC0, ROR #49 SEP + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP eor3_m1 C1, C1, vAme, vAse + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP + eor sC0, sAga, sC0, ROR #57 SEP eor3_m1 C2, vAbi, vAgi, vAki + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor sC3, sAko, sC3, ROR #38 SEP eor3_m1 C2, C2, vAmi, vAsi + eor sC4, sAgu, sC4, ROR #48 SEP + eor sC0, s_Aba, sC0, ROR #61 SEP + eor sC1, sAke, sC1, ROR #57 SEP eor3_m1 C3, vAbo, vAgo, vAko + eor sC2, sAsi, sC2, ROR #52 SEP + eor sC3, sAbo, sC3, ROR #63 SEP + eor sC4, sAku, sC4, ROR #50 SEP eor3_m1 C3, C3, vAmo, vAso + ror sC1, sC1, 56 SEP + ror sC4, sC4, 58 SEP + ror sC2, sC2, 62 SEP eor3_m1 C4, vAbu, vAgu, vAku + SEP + eor sE1, sC0, sC2, ROR #63 SEP + eor sE3, sC2, sC4, ROR #63 SEP eor3_m1 C4, C4, vAmu, vAsu + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP + eor sE4, sC3, sC0, ROR #63 SEP + SEP rax1_m1 E1, C0, C2 + eor s_Aba_, sE0, s_Aba SEP + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP rax1_m1 E3, C2, C4 + eor sAki_, sE3, sAko, ROR #63 SEP + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP rax1_m1 E0, C4, C1 + eor sAso_, sE0, sAma, ROR #54 SEP + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP rax1_m1 E2, C1, C3 
+ eor sAgo_, sE1, sAme, ROR #49 SEP + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, ROR #39 SEP rax1_m1 E4, C3, C0 + eor sAga_, sE3, sAbo SEP + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP + eor sAmi_, sE1, sAke, ROR #56 SEP eor vAba_.16b, vAba.16b, E0.16b + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP + eor sAsi_, sE4, sAku, ROR #58 SEP xar_m1 vAsa_, vAbi, E2, 2 + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP + eor sAbu_, sE4, sAsu, ROR #9 SEP xar_m1 vAbi_, vAki, E2, 21 + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP + eor sAbe_, sE1, sAge, ROR #19 SEP xar_m1 vAki_, vAko, E3, 39 + SEP + load_constant_ptr SEP + restore count, STACK_OFFSET_COUNT SEP xar_m1 vAko_, vAmu, E4, 56 + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP xar_m1 vAmu_, vAso, E3, 8 + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP xar_m1 vAso_, vAma, E0, 23 + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP xar_m1 vAka_, vAbe, E1, 63 + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP xar_m1 vAse_, vAgo, E3, 9 + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP xar_m1 vAgo_, vAme, E1, 19 + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP xar_m1 vAke_, vAgi, E2, 58 + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP xar_m1 vAgi_, vAka, E0, 61 + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP xar_m1 vAga_, vAbo, E3, 36 + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP xar_m1 vAbo_, vAmo, E3, 43 + bic tmp, 
sAma_, sAmu_, ROR #35 SEP + SEP + ldr cur_const, [const_addr, count, UXTW #3] SEP xar_m1 vAmo_, vAmi, E2, 49 + add count, count, #1 SEP + SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP xar_m1 vAmi_, vAke, E1, 54 + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP xar_m1 vAge_, vAgu, E4, 44 + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP xar_m1 vAgu_, vAsi, E2, 3 + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP xar_m1 vAsi_, vAku, E4, 25 + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP xar_m1 vAku_, vAsa, E0, 46 + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP xar_m1 vAma_, vAbu, E4, 37 + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP xar_m1 vAbu_, vAsu, E4, 50 + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP xar_m1 vAsu_, vAse, E1, 62 + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP xar_m1 vAme_, vAga, E0, 28 + save count, STACK_OFFSET_COUNT SEP + SEP + eor sC0, sAka, sAsa, ROR #50 SEP xar_m1 vAbe_, vAge, E1, 20 + eor sC1, sAse, sAge, ROR #60 SEP + eor sC2, sAmi, sAgi, ROR #59 SEP + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP restore sE1, STACK_OFFSET_CONST + eor sC0, sAma, sC0, ROR #49 SEP + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP ld1r {v28.2d}, [sE1], #8 + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP + eor sC0, sAga, sC0, ROR #57 SEP save sE1, STACK_OFFSET_CONST + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor sC3, sAko, sC3, ROR #38 SEP + eor sC4, sAgu, sC4, ROR #48 SEP bcax_m1 vAga, vAga_, vAgi_, vAge_ + eor sC0, s_Aba, sC0, ROR 
#61 SEP + eor sC1, sAke, sC1, ROR #57 SEP + eor sC2, sAsi, sC2, ROR #52 SEP bcax_m1 vAge, vAge_, vAgo_, vAgi_ + eor sC3, sAbo, sC3, ROR #63 SEP + eor sC4, sAku, sC4, ROR #50 SEP + ror sC1, sC1, 56 SEP bcax_m1 vAgi, vAgi_, vAgu_, vAgo_ + ror sC4, sC4, 58 SEP + ror sC2, sC2, 62 SEP + SEP bcax_m1 vAgo, vAgo_, vAga_, vAgu_ + eor sE1, sC0, sC2, ROR #63 SEP + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP bcax_m1 vAgu, vAgu_, vAge_, vAga_ + eor sE2, sC1, sC3, ROR #63 SEP + eor sE4, sC3, sC0, ROR #63 SEP + SEP bcax_m1 vAka, vAka_, vAki_, vAke_ + eor s_Aba_, sE0, s_Aba SEP + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP bcax_m1 vAke, vAke_, vAko_, vAki_ + eor sAki_, sE3, sAko, ROR #63 SEP + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP bcax_m1 vAki, vAki_, vAku_, vAko_ + eor sAso_, sE0, sAma, ROR #54 SEP + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP bcax_m1 vAko, vAko_, vAka_, vAku_ + eor sAgo_, sE1, sAme, ROR #49 SEP + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, ROR #39 SEP bcax_m1 vAku, vAku_, vAke_, vAka_ + eor sAga_, sE3, sAbo SEP + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP bcax_m1 vAma, vAma_, vAmi_, vAme_ + eor sAmi_, sE1, sAke, ROR #56 SEP + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP bcax_m1 vAme, vAme_, vAmo_, vAmi_ + eor sAsi_, sE4, sAku, ROR #58 SEP + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP bcax_m1 vAmi, vAmi_, vAmu_, vAmo_ + eor sAbu_, sE4, sAsu, ROR #9 SEP + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP bcax_m1 vAmo, vAmo_, vAma_, vAmu_ + eor sAbe_, sE1, sAge, ROR #19 SEP + SEP + load_constant_ptr SEP bcax_m1 vAmu, vAmu_, vAme_, vAma_ + restore count, STACK_OFFSET_COUNT SEP + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP bcax_m1 vAsa, vAsa_, vAsi_, vAse_ + eor sAga, tmp, sAga_, ROR #39 SEP + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 
SEP bcax_m1 vAse, vAse_, vAso_, vAsi_ + bic tmp, sAgu_, sAgo_, ROR #16 SEP + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP bcax_m1 vAsi, vAsi_, vAsu_, vAso_ + eor sAgo, tmp, sAgo_, ROR #47 SEP + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP bcax_m1 vAso, vAso_, vAsa_, vAsu_ + bic tmp, sAki_, sAke_, ROR #19 SEP + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP bcax_m1 vAsu, vAsu_, vAse_, vAsa_ + eor sAke, tmp, sAke_, ROR #2 SEP + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP bcax_m1 vAba, vAba_, vAbi_, vAbe_ + bic tmp, sAka_, sAku_, ROR #47 SEP + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP bcax_m1 vAbe, vAbe_, vAbo_, vAbi_ + eor sAku, tmp, sAku_, ROR #52 SEP + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP bcax_m1 vAbi, vAbi_, vAbu_, vAbo_ + bic tmp, sAmo_, sAmi_, ROR #5 SEP + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP bcax_m1 vAbo, vAbo_, vAba_, vAbu_ + eor sAmi, tmp, sAmi_, ROR #46 SEP + bic tmp, sAma_, sAmu_, ROR #35 SEP + SEP bcax_m1 vAbu, vAbu_, vAbe_, vAba_ + ldr cur_const, [const_addr, count, UXTW #3] SEP + add count, count, #1 SEP + SEP eor vAba.16b, vAba.16b, v28.16b + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP + bic tmp, s_Aba_, sAbu_, ROR #50 SEP 
+ eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP + +.endm +/* final_rotate: rotates 23 of the 25 scalar lanes in place (every lane except s_Aba and sAbo) by a fixed per-lane amount. NOTE(review): presumably this applies the rotations that the hybrid rounds leave pending in their ROR-shifted operands -- confirm. */ +.macro final_rotate + ror sAga, sAga,#(64-3) + ror sAka, sAka,#(64-25) + ror sAma, sAma,#(64-10) + ror sAsa, sAsa,#(64-39) + ror sAbe, sAbe,#(64-21) + ror sAge, sAge,#(64-45) + ror sAke, sAke,#(64-8) + ror sAme, sAme,#(64-15) + ror sAse, sAse,#(64-41) + ror sAbi, sAbi,#(64-14) + ror sAgi, sAgi,#(64-61) + ror sAki, sAki,#(64-18) + ror sAmi, sAmi,#(64-56) + ror sAsi, sAsi,#(64-2) + ror sAgo, sAgo,#(64-28) + ror sAko, sAko,#(64-1) + ror sAmo, sAmo,#(64-27) + ror sAso, sAso,#(64-62) + ror sAbu, sAbu,#(64-44) + ror sAgu, sAgu,#(64-20) + ror sAku, sAku,#(64-6) + ror sAmu, sAmu,#(64-36) + ror sAsu, sAsu,#(64-55) +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.global keccak_f1600_x4_hybrid_asm_v3 +.global _keccak_f1600_x4_hybrid_asm_v3 +.text +.align 4 +/* Entry point. Between one load_input_vector 2,1 and one store_input_vector 2,1 it runs the sequence hybrid_round_initial, a loop of hybrid_round_noninitial, final_rotate twice: once on scalar input 4,0 and once on scalar input 4,1. Each loop repeats while count <= KECCAK_F1600_ROUNDS-1. input_addr is spilled to the stack on entry and reloaded before each scalar store. */ +keccak_f1600_x4_hybrid_asm_v3: +_keccak_f1600_x4_hybrid_asm_v3: + alloc_stack + save_gprs + save_vregs + save input_addr, STACK_OFFSET_INPUT + + load_input_vector 2,1 + + load_constant_ptr + save const_addr, STACK_OFFSET_CONST + + // First scalar Keccak computation alongside first half of SIMD computation + load_input_scalar 4,0 + hybrid_round_initial + loop_0: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-1) + ble loop_0 + final_rotate + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4,0 + + // Second scalar Keccak computation alongside second half of SIMD computation + load_input_scalar 4,1 + hybrid_round_initial + loop_1: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-1) + ble loop_1 + final_rotate + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4, 1 + + store_input_vector 2,1 + + restore_vregs + restore_gprs + free_stack + ret diff --git a/tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v3p.s b/tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v3p.s new file mode 100644 index 0000000..86f3074 ---
/dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v3p.s @@ -0,0 +1,1016 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: /* 24 64-bit entries, one .quad each */ + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x29 + count .req w27 + cur_const .req x26 + + /* Mapping of Keccak-f1600 SIMD state to vector registers + * at the beginning and end of each round. 
*/ + + vAba .req v0 + vAbe .req v1 + vAbi .req v2 + vAbo .req v3 + vAbu .req v4 + vAga .req v5 + vAge .req v6 + vAgi .req v7 + vAgo .req v8 + vAgu .req v9 + vAka .req v10 + vAke .req v11 + vAki .req v12 + vAko .req v13 + vAku .req v14 + vAma .req v15 + vAme .req v16 + vAmi .req v17 + vAmo .req v18 + vAmu .req v19 + vAsa .req v20 + vAse .req v21 + vAsi .req v22 + vAso .req v23 + vAsu .req v24 + + /* q-form of the above mapping */ + vAbaq .req q0 + vAbeq .req q1 + vAbiq .req q2 + vAboq .req q3 + vAbuq .req q4 + vAgaq .req q5 + vAgeq .req q6 + vAgiq .req q7 + vAgoq .req q8 + vAguq .req q9 + vAkaq .req q10 + vAkeq .req q11 + vAkiq .req q12 + vAkoq .req q13 + vAkuq .req q14 + vAmaq .req q15 + vAmeq .req q16 + vAmiq .req q17 + vAmoq .req q18 + vAmuq .req q19 + vAsaq .req q20 + vAseq .req q21 + vAsiq .req q22 + vAsoq .req q23 + vAsuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v30 + C1 .req v29 + C2 .req v28 + C3 .req v27 + C4 .req v26 + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req v26 + E1 .req v25 + E2 .req v29 + E3 .req v28 + E4 .req v27 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vAbi_ .req v2 + vAbo_ .req v3 + vAbu_ .req v4 + vAga_ .req v10 + vAge_ .req v11 + vAgi_ .req v7 + vAgo_ .req v8 + vAgu_ .req v9 + vAka_ .req v15 + vAke_ .req v16 + vAki_ .req v12 + vAko_ .req v13 + vAku_ .req v14 + vAma_ .req v20 + vAme_ .req v21 + vAmi_ .req v17 + vAmo_ .req v18 + vAmu_ .req v19 + vAsa_ .req v0 + vAse_ .req v1 + vAsi_ .req v22 + vAso_ .req v23 + vAsu_ .req v24 + vAba_ .req v30 + vAbe_ .req v27 + + /* Scratch register written by the rax1_m1, xar_m1 and bcax_m1 macros */ + vtmp .req v31 + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round. 
*/ + s_Aba .req x1 + sAbe .req x6 + sAbi .req x11 + sAbo .req x16 + sAbu .req x21 + sAga .req x2 + sAge .req x7 + sAgi .req x12 + sAgo .req x17 + sAgu .req x22 + sAka .req x3 + sAke .req x8 + sAki .req x13 + sAko .req x18 + sAku .req x23 + sAma .req x4 + sAme .req x9 + sAmi .req x14 + sAmo .req x19 + sAmu .req x24 + sAsa .req x5 + sAse .req x10 + sAsi .req x15 + sAso .req x20 + sAsu .req x25 + /* NOTE(review): sAko and sAko_ are mapped to x18, which AAPCS64 designates as the platform register; some platforms reserve it -- confirm this is acceptable on every target. */ + /* sA_[y,2*x+3*y] = rot(A[x,y]) */ + s_Aba_ .req x0 + sAbe_ .req x28 + sAbi_ .req x11 + sAbo_ .req x16 + sAbu_ .req x21 + sAga_ .req x3 + sAge_ .req x8 + sAgi_ .req x12 + sAgo_ .req x17 + sAgu_ .req x22 + sAka_ .req x4 + sAke_ .req x9 + sAki_ .req x13 + sAko_ .req x18 + sAku_ .req x23 + sAma_ .req x5 + sAme_ .req x10 + sAmi_ .req x14 + sAmo_ .req x19 + sAmu_ .req x24 + sAsa_ .req x1 + sAse_ .req x6 + sAsi_ .req x15 + sAso_ .req x20 + sAsu_ .req x25 + + /* sC[x] = sA[x,0] xor sA[x,1] xor sA[x,2] xor sA[x,3] xor sA[x,4], for x in 0..4 */ + /* sE[x] = sC[x-1] xor rot(sC[x+1],1), for x in 0..4. These aliases reuse x0 (input_addr, s_Aba_), x26 (cur_const), x27 (count is w27), x29 (const_addr) and x30 (tmp). */ + sC0 .req x0 + sE0 .req x29 + sC1 .req x26 + sE1 .req x30 + sC2 .req x27 + sE2 .req x26 + sC3 .req x28 + sE3 .req x27 + sC4 .req x29 + sE4 .req x28 + + tmp .req x30 + +/************************ MACROS ****************************/ + +/* The _m1 macros compute the v8.4-A SHA-3 operations (EOR3, RAX1, XAR, BCAX) with eor/add/shl/sri/bic, using vtmp as scratch; the _m0 macros issue the SHA-3 instructions directly. */ + + +.macro eor3_m1 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro rax1_m1 d s0 s1 + add vtmp.2d, \s1\().2d, \s1\().2d + sri vtmp.2d, \s1\().2d, #63 + eor \d\().16b, vtmp.16b, \s0\().16b +.endm + +.macro xar_m1 d s0 s1 imm + eor vtmp.16b, \s0\().16b, \s1\().16b + shl \d\().2d, vtmp.2d, #(64-\imm) + sri \d\().2d, vtmp.2d, #(\imm) +.endm + +.macro bcax_m1 d s0 s1 s2 + bic vtmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, vtmp.16b, \s0\().16b + .endm + + +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm + xar \d\().2d, 
\s0\().2d, \s1\().2d, #\imm +.endm + +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + + +.macro load_input_vector num idx + ldr vAbaq, [input_addr, #(16*(\num*0+\idx))] + ldr vAbeq, [input_addr, #(16*(\num*1+\idx))] + ldr vAbiq, [input_addr, #(16*(\num*2+\idx))] + ldr vAboq, [input_addr, #(16*(\num*3+\idx))] + ldr vAbuq, [input_addr, #(16*(\num*4+\idx))] + ldr vAgaq, [input_addr, #(16*(\num*5+\idx))] + ldr vAgeq, [input_addr, #(16*(\num*6+\idx))] + ldr vAgiq, [input_addr, #(16*(\num*7+\idx))] + ldr vAgoq, [input_addr, #(16*(\num*8+\idx))] + ldr vAguq, [input_addr, #(16*(\num*9+\idx))] + ldr vAkaq, [input_addr, #(16*(\num*10+\idx))] + ldr vAkeq, [input_addr, #(16*(\num*11+\idx))] + ldr vAkiq, [input_addr, #(16*(\num*12+\idx))] + ldr vAkoq, [input_addr, #(16*(\num*13+\idx))] + ldr vAkuq, [input_addr, #(16*(\num*14+\idx))] + ldr vAmaq, [input_addr, #(16*(\num*15+\idx))] + ldr vAmeq, [input_addr, #(16*(\num*16+\idx))] + ldr vAmiq, [input_addr, #(16*(\num*17+\idx))] + ldr vAmoq, [input_addr, #(16*(\num*18+\idx))] + ldr vAmuq, [input_addr, #(16*(\num*19+\idx))] + ldr vAsaq, [input_addr, #(16*(\num*20+\idx))] + ldr vAseq, [input_addr, #(16*(\num*21+\idx))] + ldr vAsiq, [input_addr, #(16*(\num*22+\idx))] + ldr vAsoq, [input_addr, #(16*(\num*23+\idx))] + ldr vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_vector num idx + str vAbaq, [input_addr, #(16*(\num*0+\idx))] + str vAbeq, [input_addr, #(16*(\num*1+\idx))] + str vAbiq, [input_addr, #(16*(\num*2+\idx))] + str vAboq, [input_addr, #(16*(\num*3+\idx))] + str vAbuq, [input_addr, #(16*(\num*4+\idx))] + str vAgaq, [input_addr, #(16*(\num*5+\idx))] + str vAgeq, [input_addr, #(16*(\num*6+\idx))] + str vAgiq, [input_addr, #(16*(\num*7+\idx))] + str vAgoq, [input_addr, #(16*(\num*8+\idx))] + str vAguq, [input_addr, #(16*(\num*9+\idx))] + str vAkaq, [input_addr, #(16*(\num*10+\idx))] + str vAkeq, [input_addr, #(16*(\num*11+\idx))] + str vAkiq, [input_addr, 
#(16*(\num*12+\idx))] + str vAkoq, [input_addr, #(16*(\num*13+\idx))] + str vAkuq, [input_addr, #(16*(\num*14+\idx))] + str vAmaq, [input_addr, #(16*(\num*15+\idx))] + str vAmeq, [input_addr, #(16*(\num*16+\idx))] + str vAmiq, [input_addr, #(16*(\num*17+\idx))] + str vAmoq, [input_addr, #(16*(\num*18+\idx))] + str vAmuq, [input_addr, #(16*(\num*19+\idx))] + str vAsaq, [input_addr, #(16*(\num*20+\idx))] + str vAseq, [input_addr, #(16*(\num*21+\idx))] + str vAsiq, [input_addr, #(16*(\num*22+\idx))] + str vAsoq, [input_addr, #(16*(\num*23+\idx))] + str vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_scalar num idx + str s_Aba, [input_addr, 8*(\num*(0) +\idx)] + str sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + str sAbi, [input_addr, 8*(\num*(2)+ \idx)] + str sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + str sAbu, [input_addr, 8*(\num*(4)+ \idx)] + str sAga, [input_addr, 8*(\num*(4+1) +\idx)] + str sAge, [input_addr, 8*(\num*(6)+ \idx)] + str sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + str sAgo, [input_addr, 8*(\num*(8)+ \idx)] + str sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + str sAka, [input_addr, 8*(\num*(10) +\idx)] + str sAke, [input_addr, 8*(\num*(10+1)+\idx)] + str sAki, [input_addr, 8*(\num*(12) +\idx)] + str sAko, [input_addr, 8*(\num*(12+1)+\idx)] + str sAku, [input_addr, 8*(\num*(14) +\idx)] + str sAma, [input_addr, 8*(\num*(14+1)+\idx)] + str sAme, [input_addr, 8*(\num*(16) +\idx)] + str sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + str sAmo, [input_addr, 8*(\num*(18) +\idx)] + str sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + str sAsa, [input_addr, 8*(\num*(20) +\idx)] + str sAse, [input_addr, 8*(\num*(20+1)+\idx)] + str sAsi, [input_addr, 8*(\num*(22) +\idx)] + str sAso, [input_addr, 8*(\num*(22+1)+\idx)] + str sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +.macro load_input_scalar num idx + ldr s_Aba, [input_addr, 8*(\num*(0) +\idx)] + ldr sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + ldr sAbi, [input_addr, 8*(\num*(2)+ \idx)] + ldr sAbo, 
[input_addr, 8*(\num*(2+1) +\idx)] + ldr sAbu, [input_addr, 8*(\num*(4)+ \idx)] + ldr sAga, [input_addr, 8*(\num*(4+1) +\idx)] + ldr sAge, [input_addr, 8*(\num*(6)+ \idx)] + ldr sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + ldr sAgo, [input_addr, 8*(\num*(8)+ \idx)] + ldr sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + ldr sAka, [input_addr, 8*(\num*(10) +\idx)] + ldr sAke, [input_addr, 8*(\num*(10+1)+\idx)] + ldr sAki, [input_addr, 8*(\num*(12) +\idx)] + ldr sAko, [input_addr, 8*(\num*(12+1)+\idx)] + ldr sAku, [input_addr, 8*(\num*(14) +\idx)] + ldr sAma, [input_addr, 8*(\num*(14+1)+\idx)] + ldr sAme, [input_addr, 8*(\num*(16) +\idx)] + ldr sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + ldr sAmo, [input_addr, 8*(\num*(18) +\idx)] + ldr sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + ldr sAsa, [input_addr, 8*(\num*(20) +\idx)] + ldr sAse, [input_addr, 8*(\num*(20+1)+\idx)] + ldr sAsi, [input_addr, 8*(\num*(22) +\idx)] + ldr sAso, [input_addr, 8*(\num*(22+1)+\idx)] + ldr sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +#define STACK_SIZE (8*8 + 16*6 + 3*8 + 8) // VREGS (8*8), GPRs (16*6), count (8), const (8), input (8), padding (8) +#define STACK_BASE_GPRS (3*8+8) +#define STACK_BASE_VREGS (3*8+8+16*6) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro save_vregs + stp d8, d9, 
[sp,#(STACK_BASE_VREGS+0*16)] + stp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + stp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + stp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] +.endm + +.macro restore_vregs + ldp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] + ldp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + ldp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + ldp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] +.endm + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro eor5 dst, src0, src1, src2, src3, src4 + eor \dst, \src0, \src1 + eor \dst, \dst, \src2 + eor \dst, \dst, \src3 + eor \dst, \dst, \src4 +.endm + +.macro xor_rol dst, src1, src0, imm + eor \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro bic_rol dst, src1, src0, imm + bic \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro rotate dst, src, imm + ror \dst, \src, #(64-\imm) +.endm + +.macro save reg, offset + str \reg, [sp, #\offset] +.endm + +.macro restore reg, offset + ldr \reg, [sp, #\offset] +.endm + +.macro hybrid_round_initial + + eor sC0, sAma, sAsa SEP eor3_m1 C0, vAba, vAga, vAka + eor sC1, sAme, sAse SEP + eor sC2, sAmi, sAsi SEP + eor sC3, sAmo, sAso SEP eor3_m1 C0, C0, vAma, vAsa + eor sC4, sAmu, sAsu SEP + eor sC0, sAka, sC0 SEP + eor sC1, sAke, sC1 SEP eor3_m1 C1, vAbe, vAge, vAke + eor sC2, sAki, sC2 SEP + eor sC3, sAko, sC3 SEP + eor sC4, sAku, sC4 SEP eor3_m1 C1, C1, vAme, vAse + eor sC0, sAga, sC0 SEP + eor sC1, sAge, sC1 SEP + eor sC2, sAgi, sC2 SEP eor3_m1 C2, vAbi, vAgi, vAki + eor sC3, sAgo, sC3 SEP + eor sC4, sAgu, sC4 SEP + eor sC0, s_Aba, sC0 SEP eor3_m1 C2, C2, vAmi, vAsi + eor sC1, sAbe, sC1 SEP + eor sC2, sAbi, sC2 SEP + eor sC3, sAbo, sC3 SEP eor3_m1 C3, vAbo, vAgo, vAko + eor sC4, sAbu, sC4 SEP + SEP + eor sE1, sC0, sC2, ROR #63 SEP eor3_m1 C3, C3, vAmo, vAso + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP eor3_m1 C4, vAbu, vAgu, vAku + eor sE4, sC3, sC0, ROR #63 SEP + SEP + eor 
s_Aba_, s_Aba, sE0 SEP eor3_m1 C4, C4, vAmu, vAsu + eor sAsa_, sAbi, sE2 SEP + eor sAbi_, sAki, sE2 SEP + eor sAki_, sAko, sE3 SEP rax1_m1 E1, C0, C2 + eor sAko_, sAmu, sE4 SEP + eor sAmu_, sAso, sE3 SEP + eor sAso_, sAma, sE0 SEP + eor sAka_, sAbe, sE1 SEP rax1_m1 E3, C2, C4 + eor sAse_, sAgo, sE3 SEP + eor sAgo_, sAme, sE1 SEP + eor sAke_, sAgi, sE2 SEP + eor sAgi_, sAka, sE0 SEP rax1_m1 E0, C4, C1 + eor sAga_, sAbo, sE3 SEP + eor sAbo_, sAmo, sE3 SEP + eor sAmo_, sAmi, sE2 SEP + eor sAmi_, sAke, sE1 SEP rax1_m1 E2, C1, C3 + eor sAge_, sAgu, sE4 SEP + eor sAgu_, sAsi, sE2 SEP + eor sAsi_, sAku, sE4 SEP + eor sAku_, sAsa, sE0 SEP rax1_m1 E4, C3, C0 + eor sAma_, sAbu, sE4 SEP + eor sAbu_, sAsu, sE4 SEP + eor sAsu_, sAse, sE1 SEP + eor sAme_, sAga, sE0 SEP eor vAba_.16b, vAba.16b, E0.16b + eor sAbe_, sAge, sE1 SEP + SEP + load_constant_ptr SEP xar_m1 vAsa_, vAbi, E2, 2 + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP + bic tmp, sAgo_, sAgi_, ROR #42 SEP xar_m1 vAbi_, vAki, E2, 21 + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP xar_m1 vAki_, vAko, E3, 39 + eor sAgo, tmp, sAgo_, ROR #47 SEP + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP xar_m1 vAko_, vAmu, E4, 56 + bic tmp, sAki_, sAke_, ROR #19 SEP + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP xar_m1 vAmu_, vAso, E3, 8 + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP xar_m1 vAso_, vAma, E0, 23 + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP + bic tmp, sAmi_, sAme_, ROR #38 SEP xar_m1 vAka_, vAbe, E1, 63 + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP + eor sAme, tmp, sAme_, ROR #43 SEP xar_m1 vAse_, vAgo, E3, 9 + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, 
tmp, sAmi_, ROR #46 SEP + SEP + ldr cur_const, [const_addr] SEP + mov count, #1 SEP xar_m1 vAgo_, vAme, E1, 19 + SEP + bic tmp, sAma_, sAmu_, ROR #35 SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, sAma_, ROR #9 SEP xar_m1 vAke_, vAgi, E2, 58 + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP + eor sAsa, tmp, sAsa_, ROR #41 SEP xar_m1 vAgi_, vAka, E0, 61 + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP xar_m1 vAga_, vAbo, E3, 36 + bic tmp, sAsa_, sAsu_, ROR #60 SEP + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP xar_m1 vAbo_, vAmo, E3, 43 + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP xar_m1 vAmo_, vAmi, E2, 49 + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP xar_m1 vAmi_, vAke, E1, 54 + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP + eor sAbu, tmp, sAbu_, ROR #30 SEP xar_m1 vAge_, vAgu, E4, 44 + SEP + eor s_Aba, s_Aba, cur_const SEP + SEP xar_m1 vAgu_, vAsi, E2, 3 + save count, STACK_OFFSET_COUNT SEP + SEP + eor sC0, sAka, sAsa, ROR #50 SEP + eor sC1, sAse, sAge, ROR #60 SEP + eor sC2, sAmi, sAgi, ROR #59 SEP xar_m1 vAsi_, vAku, E4, 25 + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP + eor sC0, sAma, sC0, ROR #49 SEP xar_m1 vAku_, vAsa, E0, 46 + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP xar_m1 vAma_, vAbu, E4, 37 + eor sC0, sAga, sC0, ROR #57 SEP + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP xar_m1 vAbu_, vAsu, E4, 50 + eor sC3, sAko, sC3, ROR #38 SEP + eor sC4, sAgu, sC4, ROR #48 SEP + eor sC0, s_Aba, sC0, ROR #61 SEP + eor sC1, sAke, sC1, ROR #57 SEP xar_m1 
vAsu_, vAse, E1, 62 + eor sC2, sAsi, sC2, ROR #52 SEP + eor sC3, sAbo, sC3, ROR #63 SEP + eor sC4, sAku, sC4, ROR #50 SEP xar_m1 vAme_, vAga, E0, 28 + ror sC1, sC1, 56 SEP + ror sC4, sC4, 58 SEP + ror sC2, sC2, 62 SEP xar_m1 vAbe_, vAge, E1, 20 + SEP + eor sE1, sC0, sC2, ROR #63 SEP + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP bcax_m1 vAga, vAga_, vAgi_, vAge_ + eor sE2, sC1, sC3, ROR #63 SEP + eor sE4, sC3, sC0, ROR #63 SEP + SEP bcax_m1 vAge, vAge_, vAgo_, vAgi_ + eor s_Aba_, sE0, s_Aba SEP + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP bcax_m1 vAgi, vAgi_, vAgu_, vAgo_ + eor sAki_, sE3, sAko, ROR #63 SEP + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP bcax_m1 vAgo, vAgo_, vAga_, vAgu_ + eor sAso_, sE0, sAma, ROR #54 SEP + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP bcax_m1 vAgu, vAgu_, vAge_, vAga_ + eor sAgo_, sE1, sAme, ROR #49 SEP + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, ROR #39 SEP bcax_m1 vAka, vAka_, vAki_, vAke_ + eor sAga_, sE3, sAbo SEP + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP bcax_m1 vAke, vAke_, vAko_, vAki_ + eor sAmi_, sE1, sAke, ROR #56 SEP + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP bcax_m1 vAki, vAki_, vAku_, vAko_ + eor sAsi_, sE4, sAku, ROR #58 SEP + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP bcax_m1 vAko, vAko_, vAka_, vAku_ + eor sAbu_, sE4, sAsu, ROR #9 SEP + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP bcax_m1 vAku, vAku_, vAke_, vAka_ + eor sAbe_, sE1, sAge, ROR #19 SEP + SEP + load_constant_ptr SEP bcax_m1 vAma, vAma_, vAmi_, vAme_ + restore count, STACK_OFFSET_COUNT SEP + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP restore x26, STACK_OFFSET_CONST + eor sAga, tmp, sAga_, ROR #39 SEP + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP bcax_m1 vAme, vAme_, vAmo_, vAmi_ + bic tmp, sAgu_, sAgo_, ROR #16 
SEP + eor sAgi, tmp, sAgi_, ROR #58 SEP ld1r {v28.2d}, [x26], #8 + bic tmp, sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP + bic tmp, sAge_, sAga_, ROR #56 SEP bcax_m1 vAmi, vAmi_, vAmu_, vAmo_ + eor sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP save x26, STACK_OFFSET_CONST + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP bcax_m1 vAmo, vAmo_, vAma_, vAmu_ + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP bcax_m1 vAmu, vAmu_, vAme_, vAma_ + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP bcax_m1 vAsa, vAsa_, vAsi_, vAse_ + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP bcax_m1 vAse, vAse_, vAso_, vAsi_ + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP bcax_m1 vAsi, vAsi_, vAsu_, vAso_ + bic tmp, sAma_, sAmu_, ROR #35 SEP + SEP + ldr cur_const, [const_addr, count, UXTW #3] SEP bcax_m1 vAso, vAso_, vAsa_, vAsu_ + SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, sAma_, ROR #9 SEP bcax_m1 vAsu, vAsu_, vAse_, vAsa_ + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP + eor sAsa, tmp, sAsa_, ROR #41 SEP bcax_m1 vAba, vAba_, vAbi_, vAbe_ + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP bcax_m1 vAbe, vAbe_, vAbo_, vAbi_ + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP + eor sAso, tmp, sAso_, ROR #21 SEP bcax_m1 vAbi, vAbi_, vAbu_, vAbo_ + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m1 vAbo, vAbo_, vAba_, vAbu_ + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP + eor sAbe, tmp, sAbe_, ROR #41 SEP bcax_m1 vAbu, vAbu_, vAbe_, vAba_ + bic tmp, sAbu_, 
sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP eor vAba.16b, vAba.16b, v28.16b + bic tmp, sAbe_, s_Aba_, ROR #44 SEP + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + add count, count, #1 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP + SEP +.endm + +.macro hybrid_round_noninitial + save count, STACK_OFFSET_COUNT SEP eor3_m1 C0, vAba, vAga, vAka + SEP + eor sC0, sAka, sAsa, ROR #50 SEP + eor sC1, sAse, sAge, ROR #60 SEP eor3_m1 C0, C0, vAma, vAsa + eor sC2, sAmi, sAgi, ROR #59 SEP + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP eor3_m1 C1, vAbe, vAge, vAke + eor sC0, sAma, sC0, ROR #49 SEP + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP eor3_m1 C1, C1, vAme, vAse + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP + eor sC0, sAga, sC0, ROR #57 SEP eor3_m1 C2, vAbi, vAgi, vAki + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor sC3, sAko, sC3, ROR #38 SEP eor3_m1 C2, C2, vAmi, vAsi + eor sC4, sAgu, sC4, ROR #48 SEP + eor sC0, s_Aba, sC0, ROR #61 SEP + eor sC1, sAke, sC1, ROR #57 SEP eor3_m1 C3, vAbo, vAgo, vAko + eor sC2, sAsi, sC2, ROR #52 SEP + eor sC3, sAbo, sC3, ROR #63 SEP + eor sC4, sAku, sC4, ROR #50 SEP eor3_m1 C3, C3, vAmo, vAso + ror sC1, sC1, 56 SEP + ror sC4, sC4, 58 SEP + ror sC2, sC2, 62 SEP eor3_m1 C4, vAbu, vAgu, vAku + SEP + eor sE1, sC0, sC2, ROR #63 SEP + eor sE3, sC2, sC4, ROR #63 SEP eor3_m1 C4, C4, vAmu, vAsu + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP + eor sE4, sC3, sC0, ROR #63 SEP rax1_m1 E1, C0, C2 + SEP + eor s_Aba_, sE0, s_Aba SEP + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP rax1_m1 E3, C2, C4 + eor sAki_, sE3, sAko, ROR #63 SEP + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP + eor sAso_, sE0, sAma, ROR #54 SEP rax1_m1 E0, C4, C1 + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP + eor sAgo_, 
sE1, sAme, ROR #49 SEP + eor sAke_, sE2, sAgi, ROR #3 SEP rax1_m1 E2, C1, C3 + eor sAgi_, sE0, sAka, ROR #39 SEP + eor sAga_, sE3, sAbo SEP + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP rax1_m1 E4, C3, C0 + eor sAmi_, sE1, sAke, ROR #56 SEP + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP + eor sAsi_, sE4, sAku, ROR #58 SEP eor vAba_.16b, vAba.16b, E0.16b + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP + eor sAbu_, sE4, sAsu, ROR #9 SEP xar_m1 vAsa_, vAbi, E2, 2 + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP + eor sAbe_, sE1, sAge, ROR #19 SEP + SEP xar_m1 vAbi_, vAki, E2, 21 + load_constant_ptr SEP + restore count, STACK_OFFSET_COUNT SEP + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP xar_m1 vAki_, vAko, E3, 39 + eor sAga, tmp, sAga_, ROR #39 SEP + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP xar_m1 vAko_, vAmu, E4, 56 + bic tmp, sAgu_, sAgo_, ROR #16 SEP + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP xar_m1 vAmu_, vAso, E3, 8 + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP + eor sAka, tmp, sAka_, ROR #24 SEP xar_m1 vAso_, vAma, E0, 23 + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP xar_m1 vAka_, vAbe, E1, 63 + bic tmp, sAka_, sAku_, ROR #47 SEP + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP xar_m1 vAse_, vAgo, E3, 9 + eor sAku, tmp, sAku_, ROR #52 SEP + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP + eor sAme, tmp, sAme_, ROR #43 SEP xar_m1 vAgo_, vAme, E1, 19 + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP + bic tmp, sAma_, sAmu_, ROR #35 SEP + SEP xar_m1 vAke_, vAgi, E2, 58 + ldr cur_const, [const_addr, count, UXTW #3] 
SEP + add count, count, #1 SEP + SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, sAma_, ROR #9 SEP xar_m1 vAgi_, vAka, E0, 61 + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP xar_m1 vAga_, vAbo, E3, 36 + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP xar_m1 vAbo_, vAmo, E3, 43 + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP xar_m1 vAmo_, vAmi, E2, 49 + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP xar_m1 vAmi_, vAke, E1, 54 + eor sAbi, tmp, sAbi_, ROR #35 SEP + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP xar_m1 vAge_, vAgu, E4, 44 + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP xar_m1 vAgu_, vAsi, E2, 3 + save count, STACK_OFFSET_COUNT SEP + SEP + eor sC0, sAka, sAsa, ROR #50 SEP + eor sC1, sAse, sAge, ROR #60 SEP + eor sC2, sAmi, sAgi, ROR #59 SEP xar_m1 vAsi_, vAku, E4, 25 + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP + eor sC0, sAma, sC0, ROR #49 SEP xar_m1 vAku_, vAsa, E0, 46 + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP xar_m1 vAma_, vAbu, E4, 37 + eor sC0, sAga, sC0, ROR #57 SEP + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor sC3, sAko, sC3, ROR #38 SEP xar_m1 vAbu_, vAsu, E4, 50 + eor sC4, sAgu, sC4, ROR #48 SEP + eor sC0, s_Aba, sC0, ROR #61 SEP + eor sC1, sAke, sC1, ROR #57 SEP + eor sC2, sAsi, sC2, ROR #52 SEP xar_m1 vAsu_, vAse, E1, 62 + eor sC3, sAbo, sC3, ROR #63 SEP + eor sC4, sAku, sC4, ROR #50 SEP + ror sC1, sC1, 56 SEP xar_m1 
vAme_, vAga, E0, 28 + ror sC4, sC4, 58 SEP + ror sC2, sC2, 62 SEP + SEP xar_m1 vAbe_, vAge, E1, 20 + eor sE1, sC0, sC2, ROR #63 SEP + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP bcax_m1 vAga, vAga_, vAgi_, vAge_ + eor sE4, sC3, sC0, ROR #63 SEP + SEP + eor s_Aba_, sE0, s_Aba SEP bcax_m1 vAge, vAge_, vAgo_, vAgi_ + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP + eor sAki_, sE3, sAko, ROR #63 SEP bcax_m1 vAgi, vAgi_, vAgu_, vAgo_ + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP + eor sAso_, sE0, sAma, ROR #54 SEP bcax_m1 vAgo, vAgo_, vAga_, vAgu_ + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP + eor sAgo_, sE1, sAme, ROR #49 SEP bcax_m1 vAgu, vAgu_, vAge_, vAga_ + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, ROR #39 SEP + eor sAga_, sE3, sAbo SEP bcax_m1 vAka, vAka_, vAki_, vAke_ + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP + eor sAmi_, sE1, sAke, ROR #56 SEP bcax_m1 vAke, vAke_, vAko_, vAki_ + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP + eor sAsi_, sE4, sAku, ROR #58 SEP bcax_m1 vAki, vAki_, vAku_, vAko_ + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP + eor sAbu_, sE4, sAsu, ROR #9 SEP bcax_m1 vAko, vAko_, vAka_, vAku_ + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP + eor sAbe_, sE1, sAge, ROR #19 SEP bcax_m1 vAku, vAku_, vAke_, vAka_ + SEP + load_constant_ptr SEP + restore count, STACK_OFFSET_COUNT SEP bcax_m1 vAma, vAma_, vAmi_, vAme_ + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP restore x26, STACK_OFFSET_CONST + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP bcax_m1 vAme, vAme_, vAmo_, vAmi_ + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP ld1r {v28.2d}, [x26], #8 + eor sAgo, tmp, sAgo_, ROR #47 SEP + bic tmp, 
sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP bcax_m1 vAmi, vAmi_, vAmu_, vAmo_ + bic tmp, sAki_, sAke_, ROR #19 SEP + eor sAka, tmp, sAka_, ROR #24 SEP save x26, STACK_OFFSET_CONST + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP + bic tmp, sAku_, sAko_, ROR #10 SEP bcax_m1 vAmo, vAmo_, vAma_, vAmu_ + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP + eor sAko, tmp, sAko_, ROR #57 SEP bcax_m1 vAmu, vAmu_, vAme_, vAma_ + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP + bic tmp, sAmi_, sAme_, ROR #38 SEP bcax_m1 vAsa, vAsa_, vAsi_, vAse_ + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP + eor sAme, tmp, sAme_, ROR #43 SEP bcax_m1 vAse, vAse_, vAso_, vAsi_ + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP + bic tmp, sAma_, sAmu_, ROR #35 SEP bcax_m1 vAsi, vAsi_, vAsu_, vAso_ + SEP + ldr cur_const, [const_addr, count, UXTW #3] SEP + add count, count, #1 SEP bcax_m1 vAso, vAso_, vAsa_, vAsu_ + SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, sAma_, ROR #9 SEP bcax_m1 vAsu, vAsu_, vAse_, vAsa_ + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP + eor sAsa, tmp, sAsa_, ROR #41 SEP bcax_m1 vAba, vAba_, vAbi_, vAbe_ + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP bcax_m1 vAbe, vAbe_, vAbo_, vAbi_ + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP + eor sAso, tmp, sAso_, ROR #21 SEP bcax_m1 vAbi, vAbi_, vAbu_, vAbo_ + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m1 vAbo, vAbo_, vAba_, vAbu_ + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP + eor sAbe, tmp, sAbe_, ROR #41 SEP bcax_m1 vAbu, vAbu_, vAbe_, vAba_ + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR 
#43 SEP eor vAba.16b, vAba.16b, v28.16b + bic tmp, sAbe_, s_Aba_, ROR #44 SEP + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP + +.endm + +.macro final_rotate /* rotates 23 of the 25 scalar lanes (all but s_Aba and sAbo) left by a fixed per-lane amount (ror by 64-k); presumably compensates for rotations the round macros leave deferred -- TODO confirm */ + ror sAga, sAga,#(64-3) + ror sAka, sAka,#(64-25) + ror sAma, sAma,#(64-10) + ror sAsa, sAsa,#(64-39) + ror sAbe, sAbe,#(64-21) + ror sAge, sAge,#(64-45) + ror sAke, sAke,#(64-8) + ror sAme, sAme,#(64-15) + ror sAse, sAse,#(64-41) + ror sAbi, sAbi,#(64-14) + ror sAgi, sAgi,#(64-61) + ror sAki, sAki,#(64-18) + ror sAmi, sAmi,#(64-56) + ror sAsi, sAsi,#(64-2) + ror sAgo, sAgo,#(64-28) + ror sAko, sAko,#(64-1) + ror sAmo, sAmo,#(64-27) + ror sAso, sAso,#(64-62) + ror sAbu, sAbu,#(64-44) + ror sAgu, sAgu,#(64-20) + ror sAku, sAku,#(64-6) + ror sAmu, sAmu,#(64-36) + ror sAsu, sAsu,#(64-55) +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.global keccak_f1600_x4_hybrid_asm_v3p +.global _keccak_f1600_x4_hybrid_asm_v3p +.text +.align 4 + +keccak_f1600_x4_hybrid_asm_v3p: +_keccak_f1600_x4_hybrid_asm_v3p: + alloc_stack + save_gprs + save_vregs + save input_addr, STACK_OFFSET_INPUT + + load_input_vector 2,1 + + load_constant_ptr + + save const_addr, STACK_OFFSET_CONST + + // First scalar Keccak computation alongside first half of SIMD computation + load_input_scalar 4,0 + hybrid_round_initial /* sets count to 1 and leaves it at 2 */ + loop_0: + hybrid_round_noninitial /* increments count twice, i.e. by 2 per pass */ + cmp count, #(KECCAK_F1600_ROUNDS-1) + ble loop_0 + final_rotate + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4,0 + + // Second scalar Keccak computation alongside second half of SIMD computation + load_input_scalar 4,1 + hybrid_round_initial + loop_1: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-1) + ble loop_1 + final_rotate + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4, 1 + + store_input_vector 2,1 + + restore_vregs + restore_gprs + free_stack + ret diff --git a/tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v3pp.s b/tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v3pp.s new file mode 100644 index
0000000..3b7e3bc --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v3pp.s @@ -0,0 +1,1022 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: /* 24 64-bit words, one per Keccak-f1600 round, in round order */ + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x29 + count .req w27 + out_count .req w27 /* same register as count */ + cur_const .req x26 + + /* Mapping of Keccak-f1600 SIMD state to vector registers + * at the beginning and end of each round.
*/ + + vAba .req v0 + vAbe .req v1 + vAbi .req v2 + vAbo .req v3 + vAbu .req v4 + vAga .req v5 + vAge .req v6 + vAgi .req v7 + vAgo .req v8 + vAgu .req v9 + vAka .req v10 + vAke .req v11 + vAki .req v12 + vAko .req v13 + vAku .req v14 + vAma .req v15 + vAme .req v16 + vAmi .req v17 + vAmo .req v18 + vAmu .req v19 + vAsa .req v20 + vAse .req v21 + vAsi .req v22 + vAso .req v23 + vAsu .req v24 + + /* q-form of the above mapping */ + vAbaq .req q0 + vAbeq .req q1 + vAbiq .req q2 + vAboq .req q3 + vAbuq .req q4 + vAgaq .req q5 + vAgeq .req q6 + vAgiq .req q7 + vAgoq .req q8 + vAguq .req q9 + vAkaq .req q10 + vAkeq .req q11 + vAkiq .req q12 + vAkoq .req q13 + vAkuq .req q14 + vAmaq .req q15 + vAmeq .req q16 + vAmiq .req q17 + vAmoq .req q18 + vAmuq .req q19 + vAsaq .req q20 + vAseq .req q21 + vAsiq .req q22 + vAsoq .req q23 + vAsuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v30 + C1 .req v29 + C2 .req v28 + C3 .req v27 + C4 .req v26 + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4; E0, E2, E3, E4 reuse the registers of C4, C1, C2, C3 */ + E0 .req v26 + E1 .req v25 + E2 .req v29 + E3 .req v28 + E4 .req v27 + + /* A_[y,2*x+3*y] = rot(A[x,y]); vAba_ and vAbe_ live in v30 and v27, which are also C0 and C3/E4 */ + vAbi_ .req v2 + vAbo_ .req v3 + vAbu_ .req v4 + vAga_ .req v10 + vAge_ .req v11 + vAgi_ .req v7 + vAgo_ .req v8 + vAgu_ .req v9 + vAka_ .req v15 + vAke_ .req v16 + vAki_ .req v12 + vAko_ .req v13 + vAku_ .req v14 + vAma_ .req v20 + vAme_ .req v21 + vAmi_ .req v17 + vAmo_ .req v18 + vAmu_ .req v19 + vAsa_ .req v0 + vAse_ .req v1 + vAsi_ .req v22 + vAso_ .req v23 + vAsu_ .req v24 + vAba_ .req v30 + vAbe_ .req v27 + + /* Scratch register outside the state mapping; written by the rax1_m1, xar_m1 and bcax_m1 macros */ + vtmp .req v31 + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round.
*/ + s_Aba .req x1 + sAbe .req x6 + sAbi .req x11 + sAbo .req x16 + sAbu .req x21 + sAga .req x2 + sAge .req x7 + sAgi .req x12 + sAgo .req x17 + sAgu .req x22 + sAka .req x3 + sAke .req x8 + sAki .req x13 + sAko .req x18 + sAku .req x23 + sAma .req x4 + sAme .req x9 + sAmi .req x14 + sAmo .req x19 + sAmu .req x24 + sAsa .req x5 + sAse .req x10 + sAsi .req x15 + sAso .req x20 + sAsu .req x25 + + /* sA_[y,2*x+3*y] = rot(sA[x,y]); s_Aba_ occupies x0, the same register as input_addr and sC0 */ + s_Aba_ .req x0 + sAbe_ .req x28 + sAbi_ .req x11 + sAbo_ .req x16 + sAbu_ .req x21 + sAga_ .req x3 + sAge_ .req x8 + sAgi_ .req x12 + sAgo_ .req x17 + sAgu_ .req x22 + sAka_ .req x4 + sAke_ .req x9 + sAki_ .req x13 + sAko_ .req x18 + sAku_ .req x23 + sAma_ .req x5 + sAme_ .req x10 + sAmi_ .req x14 + sAmo_ .req x19 + sAmu_ .req x24 + sAsa_ .req x1 + sAse_ .req x6 + sAsi_ .req x15 + sAso_ .req x20 + sAsu_ .req x25 + + /* sC[x] = sA[x,0] xor sA[x,1] xor sA[x,2] xor sA[x,3] xor sA[x,4], for x in 0..4 */ + /* sE[x] = sC[x-1] xor rot(sC[x+1],1), for x in 0..4 */ + sC0 .req x0 + sE0 .req x29 + sC1 .req x26 + sE1 .req x30 + sC2 .req x27 + sE2 .req x26 + sC3 .req x28 + sE3 .req x27 + sC4 .req x29 + sE4 .req x28 + + tmp .req x30 /* same register as sE1 */ + +/************************ MACROS ****************************/ + +/* Macros for the v8.4-A SHA-3 operations: the _m1 variants emulate them with eor/add/shl/sri/bic (using vtmp as scratch), the _m0 variants use the SHA-3 instructions directly */ + + +.macro eor3_m1 d s0 s1 s2 /* d = s0 xor s1 xor s2 */ + eor \d\().16b, \s0\().16b, \s1\().16b + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro rax1_m1 d s0 s1 /* d = s0 xor rotl64(s1, 1), per 64-bit lane */ + add vtmp.2d, \s1\().2d, \s1\().2d + sri vtmp.2d, \s1\().2d, #63 + eor \d\().16b, vtmp.16b, \s0\().16b +.endm + +.macro xar_m1 d s0 s1 imm /* d = rotr64(s0 xor s1, imm), per 64-bit lane */ + eor vtmp.16b, \s0\().16b, \s1\().16b + shl \d\().2d, vtmp.2d, #(64-\imm) + sri \d\().2d, vtmp.2d, #(\imm) +.endm + +.macro bcax_m1 d s0 s1 s2 /* d = s0 xor (s1 and not s2) */ + bic vtmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, vtmp.16b, \s0\().16b +.endm + +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm + xar \d\().2d,
\s0\().2d, \s1\().2d, #\imm +.endm + +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + + +.macro load_input_vector /* loads the 25 vector state registers vAba..vAsu: one 16-byte q-register load per state word, at byte offsets 0, 32, ..., 24*32 from input_addr */ + ldr vAbaq, [input_addr, #(32*0)] + ldr vAbeq, [input_addr, #(32*0+32)] + ldr vAbiq, [input_addr, #(32*2)] + ldr vAboq, [input_addr, #(32*2+32)] + ldr vAbuq, [input_addr, #(32*4)] + ldr vAgaq, [input_addr, #(32*4+32)] + ldr vAgeq, [input_addr, #(32*6)] + ldr vAgiq, [input_addr, #(32*6+32)] + ldr vAgoq, [input_addr, #(32*8)] + ldr vAguq, [input_addr, #(32*8+32)] + ldr vAkaq, [input_addr, #(32*10)] + ldr vAkeq, [input_addr, #(32*10+32)] + ldr vAkiq, [input_addr, #(32*12)] + ldr vAkoq, [input_addr, #(32*12+32)] + ldr vAkuq, [input_addr, #(32*14)] + ldr vAmaq, [input_addr, #(32*14+32)] + ldr vAmeq, [input_addr, #(32*16)] + ldr vAmiq, [input_addr, #(32*16+32)] + ldr vAmoq, [input_addr, #(32*18)] + ldr vAmuq, [input_addr, #(32*18+32)] + ldr vAsaq, [input_addr, #(32*20)] + ldr vAseq, [input_addr, #(32*20+32)] + ldr vAsiq, [input_addr, #(32*22)] + ldr vAsoq, [input_addr, #(32*22+32)] + ldr vAsuq, [input_addr, #(32*24)] +.endm + +.macro store_input_vector /* mirror of load_input_vector: writes vAba..vAsu back to the same 25 offsets */ + str vAbaq, [input_addr, #(32*0)] + str vAbeq, [input_addr, #(32*0+32)] + str vAbiq, [input_addr, #(32*2)] + str vAboq, [input_addr, #(32*2+32)] + str vAbuq, [input_addr, #(32*4)] + str vAgaq, [input_addr, #(32*4+32)] + str vAgeq, [input_addr, #(32*6)] + str vAgiq, [input_addr, #(32*6+32)] + str vAgoq, [input_addr, #(32*8)] + str vAguq, [input_addr, #(32*8+32)] + str vAkaq, [input_addr, #(32*10)] + str vAkeq, [input_addr, #(32*10+32)] + str vAkiq, [input_addr, #(32*12)] + str vAkoq, [input_addr, #(32*12+32)] + str vAkuq, [input_addr, #(32*14)] + str vAmaq, [input_addr, #(32*14+32)] + str vAmeq, [input_addr, #(32*16)] + str vAmiq, [input_addr, #(32*16+32)] + str vAmoq, [input_addr, #(32*18)] + str vAmuq, [input_addr, #(32*18+32)] + str vAsaq, [input_addr, #(32*20)] + str vAseq, [input_addr, #(32*20+32)] + str vAsiq, [input_addr, #(32*22)] + str vAsoq, [input_addr,
#(32*22+32)] + str vAsuq, [input_addr, #(32*24)] +.endm + +.macro store_input_scalar /* writes the 25 scalar state registers s_Aba..sAsu as 64-bit words at byte offsets 0, 32, ..., 24*32 from input_addr */ + str s_Aba,[input_addr, 32*0 ] + str sAbe, [input_addr, 32*1 ] + str sAbi, [input_addr, 32*2 ] + str sAbo, [input_addr, 32*3 ] + str sAbu, [input_addr, 32*4 ] + str sAga, [input_addr, 32*5 ] + str sAge, [input_addr, 32*6 ] + str sAgi, [input_addr, 32*7 ] + str sAgo, [input_addr, 32*8 ] + str sAgu, [input_addr, 32*9 ] + str sAka, [input_addr, 32*10] + str sAke, [input_addr, 32*11] + str sAki, [input_addr, 32*12] + str sAko, [input_addr, 32*13] + str sAku, [input_addr, 32*14] + str sAma, [input_addr, 32*15] + str sAme, [input_addr, 32*16] + str sAmi, [input_addr, 32*17] + str sAmo, [input_addr, 32*18] + str sAmu, [input_addr, 32*19] + str sAsa, [input_addr, 32*20] + str sAse, [input_addr, 32*21] + str sAsi, [input_addr, 32*22] + str sAso, [input_addr, 32*23] + str sAsu, [input_addr, 32*24] +.endm + +.macro load_input_scalar /* mirror of store_input_scalar: reads s_Aba..sAsu from the same 25 offsets */ + ldr s_Aba,[input_addr, 32*0 ] + ldr sAbe, [input_addr, 32*1 ] + ldr sAbi, [input_addr, 32*2 ] + ldr sAbo, [input_addr, 32*3 ] + ldr sAbu, [input_addr, 32*4 ] + ldr sAga, [input_addr, 32*5 ] + ldr sAge, [input_addr, 32*6 ] + ldr sAgi, [input_addr, 32*7 ] + ldr sAgo, [input_addr, 32*8 ] + ldr sAgu, [input_addr, 32*9 ] + ldr sAka, [input_addr, 32*10] + ldr sAke, [input_addr, 32*11] + ldr sAki, [input_addr, 32*12] + ldr sAko, [input_addr, 32*13] + ldr sAku, [input_addr, 32*14] + ldr sAma, [input_addr, 32*15] + ldr sAme, [input_addr, 32*16] + ldr sAmi, [input_addr, 32*17] + ldr sAmo, [input_addr, 32*18] + ldr sAmu, [input_addr, 32*19] + ldr sAsa, [input_addr, 32*20] + ldr sAse, [input_addr, 32*21] + ldr sAsi, [input_addr, 32*22] + ldr sAso, [input_addr, 32*23] + ldr sAsu, [input_addr, 32*24] +.endm + +#define STACK_SIZE (4*16 + 12*8 + 6*8) /* 4*16 bytes for d8-d15, 12*8 for x19-x30, 6*8 for temporaries */ +#define STACK_BASE_VREGS (0) /* area written by save_vregs */ +#define STACK_BASE_GPRS (4*16) /* area written by save_gprs */ +#define STACK_BASE_TMP_GPRS (4*16 + 12*8) /* base of the slots addressed by the save/restore macros */ +#define STACK_OFFSET_INPUT (0*8) /* STACK_OFFSET_* values are relative to STACK_BASE_TMP_GPRS */ +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) +#define
STACK_OFFSET_COUNT_OUT (3*8) +#define STACK_OFFSET_CUR_INPUT (4*8) + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro save_vregs + stp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] + stp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + stp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + stp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] +.endm + +.macro restore_vregs + ldp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] + ldp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + ldp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + ldp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] +.endm + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro eor5 dst, src0, src1, src2, src3, src4 + eor \dst, \src0, \src1 + eor \dst, \dst, \src2 + eor \dst, \dst, \src3 + eor \dst, \dst, \src4 +.endm + +.macro xor_rol dst, src1, src0, imm + eor \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro bic_rol dst, src1, src0, imm + bic \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro rotate dst, src, imm + ror \dst, \src, #(64-\imm) +.endm + +.macro save reg, offset + str \reg, [sp, #(STACK_BASE_TMP_GPRS + \offset)] +.endm + +.macro restore reg, offset + ldr \reg, [sp, #(STACK_BASE_TMP_GPRS + \offset)] +.endm + +.macro hybrid_round_initial + + eor sC0, sAma, sAsa SEP eor3_m1 C0, vAba, vAga, vAka + eor sC1, sAme, sAse SEP + eor sC2, sAmi, sAsi SEP + eor sC3, sAmo, sAso SEP eor3_m1 C0, C0, vAma, vAsa + 
eor sC4, sAmu, sAsu SEP + eor sC0, sAka, sC0 SEP + eor sC1, sAke, sC1 SEP eor3_m1 C1, vAbe, vAge, vAke + eor sC2, sAki, sC2 SEP + eor sC3, sAko, sC3 SEP + eor sC4, sAku, sC4 SEP eor3_m1 C1, C1, vAme, vAse + eor sC0, sAga, sC0 SEP + eor sC1, sAge, sC1 SEP + eor sC2, sAgi, sC2 SEP eor3_m1 C2, vAbi, vAgi, vAki + eor sC3, sAgo, sC3 SEP + eor sC4, sAgu, sC4 SEP + eor sC0, s_Aba, sC0 SEP eor3_m1 C2, C2, vAmi, vAsi + eor sC1, sAbe, sC1 SEP + eor sC2, sAbi, sC2 SEP + eor sC3, sAbo, sC3 SEP eor3_m1 C3, vAbo, vAgo, vAko + eor sC4, sAbu, sC4 SEP + SEP + eor sE1, sC0, sC2, ROR #63 SEP eor3_m1 C3, C3, vAmo, vAso + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP eor3_m1 C4, vAbu, vAgu, vAku + eor sE4, sC3, sC0, ROR #63 SEP + SEP + eor s_Aba_, s_Aba, sE0 SEP eor3_m1 C4, C4, vAmu, vAsu + eor sAsa_, sAbi, sE2 SEP + eor sAbi_, sAki, sE2 SEP + eor sAki_, sAko, sE3 SEP rax1_m1 E1, C0, C2 + eor sAko_, sAmu, sE4 SEP + eor sAmu_, sAso, sE3 SEP + eor sAso_, sAma, sE0 SEP + eor sAka_, sAbe, sE1 SEP rax1_m1 E3, C2, C4 + eor sAse_, sAgo, sE3 SEP + eor sAgo_, sAme, sE1 SEP + eor sAke_, sAgi, sE2 SEP + eor sAgi_, sAka, sE0 SEP rax1_m1 E0, C4, C1 + eor sAga_, sAbo, sE3 SEP + eor sAbo_, sAmo, sE3 SEP + eor sAmo_, sAmi, sE2 SEP + eor sAmi_, sAke, sE1 SEP rax1_m1 E2, C1, C3 + eor sAge_, sAgu, sE4 SEP + eor sAgu_, sAsi, sE2 SEP + eor sAsi_, sAku, sE4 SEP + eor sAku_, sAsa, sE0 SEP rax1_m1 E4, C3, C0 + eor sAma_, sAbu, sE4 SEP + eor sAbu_, sAsu, sE4 SEP + eor sAsu_, sAse, sE1 SEP + eor sAme_, sAga, sE0 SEP eor vAba_.16b, vAba.16b, E0.16b + eor sAbe_, sAge, sE1 SEP + SEP + load_constant_ptr SEP xar_m1 vAsa_, vAbi, E2, 2 + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP + bic tmp, sAgo_, sAgi_, ROR #42 SEP xar_m1 vAbi_, vAki, E2, 21 + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP xar_m1 vAki_, vAko, E3, 39 + eor sAgo, tmp, 
sAgo_, ROR #47 SEP + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP xar_m1 vAko_, vAmu, E4, 56 + bic tmp, sAki_, sAke_, ROR #19 SEP + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP xar_m1 vAmu_, vAso, E3, 8 + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP xar_m1 vAso_, vAma, E0, 23 + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP + bic tmp, sAmi_, sAme_, ROR #38 SEP xar_m1 vAka_, vAbe, E1, 63 + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP + eor sAme, tmp, sAme_, ROR #43 SEP xar_m1 vAse_, vAgo, E3, 9 + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP + SEP + ldr cur_const, [const_addr] SEP + mov count, #1 SEP xar_m1 vAgo_, vAme, E1, 19 + SEP + bic tmp, sAma_, sAmu_, ROR #35 SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, sAma_, ROR #9 SEP xar_m1 vAke_, vAgi, E2, 58 + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP + eor sAsa, tmp, sAsa_, ROR #41 SEP xar_m1 vAgi_, vAka, E0, 61 + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP xar_m1 vAga_, vAbo, E3, 36 + bic tmp, sAsa_, sAsu_, ROR #60 SEP + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP xar_m1 vAbo_, vAmo, E3, 43 + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP xar_m1 vAmo_, vAmi, E2, 49 + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP xar_m1 vAmi_, vAke, E1, 54 + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP + eor sAbu, tmp, sAbu_, ROR #30 SEP xar_m1 vAge_, vAgu, E4, 44 + SEP + eor s_Aba, s_Aba, 
cur_const SEP + SEP xar_m1 vAgu_, vAsi, E2, 3 + save count, STACK_OFFSET_COUNT SEP + SEP + eor sC0, sAka, sAsa, ROR #50 SEP + eor sC1, sAse, sAge, ROR #60 SEP + eor sC2, sAmi, sAgi, ROR #59 SEP xar_m1 vAsi_, vAku, E4, 25 + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP + eor sC0, sAma, sC0, ROR #49 SEP xar_m1 vAku_, vAsa, E0, 46 + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP xar_m1 vAma_, vAbu, E4, 37 + eor sC0, sAga, sC0, ROR #57 SEP + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP xar_m1 vAbu_, vAsu, E4, 50 + eor sC3, sAko, sC3, ROR #38 SEP + eor sC4, sAgu, sC4, ROR #48 SEP + eor sC0, s_Aba, sC0, ROR #61 SEP + eor sC1, sAke, sC1, ROR #57 SEP xar_m1 vAsu_, vAse, E1, 62 + eor sC2, sAsi, sC2, ROR #52 SEP + eor sC3, sAbo, sC3, ROR #63 SEP + eor sC4, sAku, sC4, ROR #50 SEP xar_m1 vAme_, vAga, E0, 28 + ror sC1, sC1, 56 SEP + ror sC4, sC4, 58 SEP + ror sC2, sC2, 62 SEP xar_m1 vAbe_, vAge, E1, 20 + SEP + eor sE1, sC0, sC2, ROR #63 SEP + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP bcax_m1 vAga, vAga_, vAgi_, vAge_ + eor sE2, sC1, sC3, ROR #63 SEP + eor sE4, sC3, sC0, ROR #63 SEP + SEP bcax_m1 vAge, vAge_, vAgo_, vAgi_ + eor s_Aba_, sE0, s_Aba SEP + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP bcax_m1 vAgi, vAgi_, vAgu_, vAgo_ + eor sAki_, sE3, sAko, ROR #63 SEP + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP bcax_m1 vAgo, vAgo_, vAga_, vAgu_ + eor sAso_, sE0, sAma, ROR #54 SEP + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP bcax_m1 vAgu, vAgu_, vAge_, vAga_ + eor sAgo_, sE1, sAme, ROR #49 SEP + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, ROR #39 SEP bcax_m1 vAka, vAka_, vAki_, vAke_ + eor sAga_, sE3, sAbo SEP + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP bcax_m1 vAke, vAke_, vAko_, vAki_ + eor sAmi_, sE1, sAke, ROR 
#56 SEP + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP bcax_m1 vAki, vAki_, vAku_, vAko_ + eor sAsi_, sE4, sAku, ROR #58 SEP + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP bcax_m1 vAko, vAko_, vAka_, vAku_ + eor sAbu_, sE4, sAsu, ROR #9 SEP + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP bcax_m1 vAku, vAku_, vAke_, vAka_ + eor sAbe_, sE1, sAge, ROR #19 SEP + SEP + load_constant_ptr SEP bcax_m1 vAma, vAma_, vAmi_, vAme_ + restore count, STACK_OFFSET_COUNT SEP + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP restore x26, STACK_OFFSET_CONST + eor sAga, tmp, sAga_, ROR #39 SEP + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP bcax_m1 vAme, vAme_, vAmo_, vAmi_ + bic tmp, sAgu_, sAgo_, ROR #16 SEP + eor sAgi, tmp, sAgi_, ROR #58 SEP ld1r {v28.2d}, [x26], #8 + bic tmp, sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP + bic tmp, sAge_, sAga_, ROR #56 SEP bcax_m1 vAmi, vAmi_, vAmu_, vAmo_ + eor sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP save x26, STACK_OFFSET_CONST + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP bcax_m1 vAmo, vAmo_, vAma_, vAmu_ + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP bcax_m1 vAmu, vAmu_, vAme_, vAma_ + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP bcax_m1 vAsa, vAsa_, vAsi_, vAse_ + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP bcax_m1 vAse, vAse_, vAso_, vAsi_ + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP bcax_m1 vAsi, vAsi_, vAsu_, vAso_ + bic tmp, sAma_, sAmu_, ROR #35 SEP + SEP + ldr cur_const, [const_addr, count, UXTW #3] SEP bcax_m1 vAso, vAso_, vAsa_, vAsu_ + SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, 
sAma_, ROR #9 SEP bcax_m1 vAsu, vAsu_, vAse_, vAsa_ + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP + eor sAsa, tmp, sAsa_, ROR #41 SEP bcax_m1 vAba, vAba_, vAbi_, vAbe_ + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP bcax_m1 vAbe, vAbe_, vAbo_, vAbi_ + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP + eor sAso, tmp, sAso_, ROR #21 SEP bcax_m1 vAbi, vAbi_, vAbu_, vAbo_ + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m1 vAbo, vAbo_, vAba_, vAbu_ + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP + eor sAbe, tmp, sAbe_, ROR #41 SEP bcax_m1 vAbu, vAbu_, vAbe_, vAba_ + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP eor vAba.16b, vAba.16b, v28.16b + bic tmp, sAbe_, s_Aba_, ROR #44 SEP + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + add count, count, #1 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP + SEP +.endm + +.macro hybrid_round_noninitial + save count, STACK_OFFSET_COUNT SEP eor3_m1 C0, vAba, vAga, vAka + SEP + eor sC0, sAka, sAsa, ROR #50 SEP + eor sC1, sAse, sAge, ROR #60 SEP eor3_m1 C0, C0, vAma, vAsa + eor sC2, sAmi, sAgi, ROR #59 SEP + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP eor3_m1 C1, vAbe, vAge, vAke + eor sC0, sAma, sC0, ROR #49 SEP + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP eor3_m1 C1, C1, vAme, vAse + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP + eor sC0, sAga, sC0, ROR #57 SEP eor3_m1 C2, vAbi, vAgi, vAki + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor sC3, sAko, sC3, ROR #38 SEP eor3_m1 C2, C2, vAmi, vAsi + eor sC4, sAgu, sC4, ROR #48 SEP + eor sC0, s_Aba, sC0, ROR #61 SEP + eor sC1, sAke, sC1, ROR #57 SEP eor3_m1 C3, vAbo, vAgo, vAko + eor sC2, sAsi, sC2, ROR #52 
SEP + eor sC3, sAbo, sC3, ROR #63 SEP + eor sC4, sAku, sC4, ROR #50 SEP eor3_m1 C3, C3, vAmo, vAso + ror sC1, sC1, 56 SEP + ror sC4, sC4, 58 SEP + ror sC2, sC2, 62 SEP eor3_m1 C4, vAbu, vAgu, vAku + SEP + eor sE1, sC0, sC2, ROR #63 SEP + eor sE3, sC2, sC4, ROR #63 SEP eor3_m1 C4, C4, vAmu, vAsu + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP + eor sE4, sC3, sC0, ROR #63 SEP rax1_m1 E1, C0, C2 + SEP + eor s_Aba_, sE0, s_Aba SEP + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP rax1_m1 E3, C2, C4 + eor sAki_, sE3, sAko, ROR #63 SEP + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP + eor sAso_, sE0, sAma, ROR #54 SEP rax1_m1 E0, C4, C1 + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP + eor sAgo_, sE1, sAme, ROR #49 SEP + eor sAke_, sE2, sAgi, ROR #3 SEP rax1_m1 E2, C1, C3 + eor sAgi_, sE0, sAka, ROR #39 SEP + eor sAga_, sE3, sAbo SEP + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP rax1_m1 E4, C3, C0 + eor sAmi_, sE1, sAke, ROR #56 SEP + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP + eor sAsi_, sE4, sAku, ROR #58 SEP eor vAba_.16b, vAba.16b, E0.16b + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP + eor sAbu_, sE4, sAsu, ROR #9 SEP xar_m1 vAsa_, vAbi, E2, 2 + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP + eor sAbe_, sE1, sAge, ROR #19 SEP + SEP xar_m1 vAbi_, vAki, E2, 21 + load_constant_ptr SEP + restore count, STACK_OFFSET_COUNT SEP + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP xar_m1 vAki_, vAko, E3, 39 + eor sAga, tmp, sAga_, ROR #39 SEP + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP xar_m1 vAko_, vAmu, E4, 56 + bic tmp, sAgu_, sAgo_, ROR #16 SEP + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP xar_m1 vAmu_, vAso, E3, 8 + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, 
sAke_, ROR #19 SEP + eor sAka, tmp, sAka_, ROR #24 SEP xar_m1 vAso_, vAma, E0, 23 + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP xar_m1 vAka_, vAbe, E1, 63 + bic tmp, sAka_, sAku_, ROR #47 SEP + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP xar_m1 vAse_, vAgo, E3, 9 + eor sAku, tmp, sAku_, ROR #52 SEP + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP + eor sAme, tmp, sAme_, ROR #43 SEP xar_m1 vAgo_, vAme, E1, 19 + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP + bic tmp, sAma_, sAmu_, ROR #35 SEP + SEP xar_m1 vAke_, vAgi, E2, 58 + ldr cur_const, [const_addr, count, UXTW #3] SEP + add count, count, #1 SEP + SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, sAma_, ROR #9 SEP xar_m1 vAgi_, vAka, E0, 61 + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP xar_m1 vAga_, vAbo, E3, 36 + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP xar_m1 vAbo_, vAmo, E3, 43 + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP xar_m1 vAmo_, vAmi, E2, 49 + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP xar_m1 vAmi_, vAke, E1, 54 + eor sAbi, tmp, sAbi_, ROR #35 SEP + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP xar_m1 vAge_, vAgu, E4, 44 + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP xar_m1 vAgu_, vAsi, E2, 3 + save count, STACK_OFFSET_COUNT SEP + SEP + eor sC0, sAka, sAsa, ROR #50 SEP + eor sC1, sAse, sAge, ROR 
#60 SEP + eor sC2, sAmi, sAgi, ROR #59 SEP xar_m1 vAsi_, vAku, E4, 25 + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP + eor sC0, sAma, sC0, ROR #49 SEP xar_m1 vAku_, vAsa, E0, 46 + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP xar_m1 vAma_, vAbu, E4, 37 + eor sC0, sAga, sC0, ROR #57 SEP + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor sC3, sAko, sC3, ROR #38 SEP xar_m1 vAbu_, vAsu, E4, 50 + eor sC4, sAgu, sC4, ROR #48 SEP + eor sC0, s_Aba, sC0, ROR #61 SEP + eor sC1, sAke, sC1, ROR #57 SEP + eor sC2, sAsi, sC2, ROR #52 SEP xar_m1 vAsu_, vAse, E1, 62 + eor sC3, sAbo, sC3, ROR #63 SEP + eor sC4, sAku, sC4, ROR #50 SEP + ror sC1, sC1, 56 SEP xar_m1 vAme_, vAga, E0, 28 + ror sC4, sC4, 58 SEP + ror sC2, sC2, 62 SEP + SEP xar_m1 vAbe_, vAge, E1, 20 + eor sE1, sC0, sC2, ROR #63 SEP + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP bcax_m1 vAga, vAga_, vAgi_, vAge_ + eor sE4, sC3, sC0, ROR #63 SEP + SEP + eor s_Aba_, sE0, s_Aba SEP bcax_m1 vAge, vAge_, vAgo_, vAgi_ + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP + eor sAki_, sE3, sAko, ROR #63 SEP bcax_m1 vAgi, vAgi_, vAgu_, vAgo_ + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP + eor sAso_, sE0, sAma, ROR #54 SEP bcax_m1 vAgo, vAgo_, vAga_, vAgu_ + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP + eor sAgo_, sE1, sAme, ROR #49 SEP bcax_m1 vAgu, vAgu_, vAge_, vAga_ + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, ROR #39 SEP + eor sAga_, sE3, sAbo SEP bcax_m1 vAka, vAka_, vAki_, vAke_ + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP + eor sAmi_, sE1, sAke, ROR #56 SEP bcax_m1 vAke, vAke_, vAko_, vAki_ + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP + eor sAsi_, sE4, sAku, ROR #58 SEP bcax_m1 vAki, vAki_, vAku_, vAko_ + 
eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP + eor sAbu_, sE4, sAsu, ROR #9 SEP bcax_m1 vAko, vAko_, vAka_, vAku_ + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP + eor sAbe_, sE1, sAge, ROR #19 SEP bcax_m1 vAku, vAku_, vAke_, vAka_ + SEP + load_constant_ptr SEP + restore count, STACK_OFFSET_COUNT SEP bcax_m1 vAma, vAma_, vAmi_, vAme_ + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP restore x26, STACK_OFFSET_CONST + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP bcax_m1 vAme, vAme_, vAmo_, vAmi_ + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP ld1r {v28.2d}, [x26], #8 + eor sAgo, tmp, sAgo_, ROR #47 SEP + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP bcax_m1 vAmi, vAmi_, vAmu_, vAmo_ + bic tmp, sAki_, sAke_, ROR #19 SEP + eor sAka, tmp, sAka_, ROR #24 SEP save x26, STACK_OFFSET_CONST + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP + bic tmp, sAku_, sAko_, ROR #10 SEP bcax_m1 vAmo, vAmo_, vAma_, vAmu_ + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP + eor sAko, tmp, sAko_, ROR #57 SEP bcax_m1 vAmu, vAmu_, vAme_, vAma_ + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP + bic tmp, sAmi_, sAme_, ROR #38 SEP bcax_m1 vAsa, vAsa_, vAsi_, vAse_ + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP + eor sAme, tmp, sAme_, ROR #43 SEP bcax_m1 vAse, vAse_, vAso_, vAsi_ + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP + bic tmp, sAma_, sAmu_, ROR #35 SEP bcax_m1 vAsi, vAsi_, vAsu_, vAso_ + SEP + ldr cur_const, [const_addr, count, UXTW #3] SEP + add count, count, #1 SEP bcax_m1 vAso, vAso_, vAsa_, vAsu_ + SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, sAma_, ROR #9 SEP bcax_m1 vAsu, vAsu_, vAse_, vAsa_ + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP 
+ eor sAsa, tmp, sAsa_, ROR #41 SEP bcax_m1 vAba, vAba_, vAbi_, vAbe_ + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP bcax_m1 vAbe, vAbe_, vAbo_, vAbi_ + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP + eor sAso, tmp, sAso_, ROR #21 SEP bcax_m1 vAbi, vAbi_, vAbu_, vAbo_ + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m1 vAbo, vAbo_, vAba_, vAbu_ + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP + eor sAbe, tmp, sAbe_, ROR #41 SEP bcax_m1 vAbu, vAbu_, vAbe_, vAba_ + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP eor vAba.16b, vAba.16b, v28.16b + bic tmp, sAbe_, s_Aba_, ROR #44 SEP + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP + +.endm + +.macro final_rotate + ror sAga, sAga,#(64-3) + ror sAka, sAka,#(64-25) + ror sAma, sAma,#(64-10) + ror sAsa, sAsa,#(64-39) + ror sAbe, sAbe,#(64-21) + ror sAge, sAge,#(64-45) + ror sAke, sAke,#(64-8) + ror sAme, sAme,#(64-15) + ror sAse, sAse,#(64-41) + ror sAbi, sAbi,#(64-14) + ror sAgi, sAgi,#(64-61) + ror sAki, sAki,#(64-18) + ror sAmi, sAmi,#(64-56) + ror sAsi, sAsi,#(64-2) + ror sAgo, sAgo,#(64-28) + ror sAko, sAko,#(64-1) + ror sAmo, sAmo,#(64-27) + ror sAso, sAso,#(64-62) + ror sAbu, sAbu,#(64-44) + ror sAgu, sAgu,#(64-20) + ror sAku, sAku,#(64-6) + ror sAmu, sAmu,#(64-36) + ror sAsu, sAsu,#(64-55) +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.global keccak_f1600_x4_hybrid_asm_v3pp +.global _keccak_f1600_x4_hybrid_asm_v3pp +.text +.align 4 + +keccak_f1600_x4_hybrid_asm_v3pp: +_keccak_f1600_x4_hybrid_asm_v3pp: + alloc_stack + save_gprs + save_vregs + save input_addr, STACK_OFFSET_INPUT + + ASM_LOAD(const_addr,round_constants) + save const_addr, STACK_OFFSET_CONST + + load_input_vector + + add input_addr, input_addr, #16 + + mov 
out_count, #0 +outer_loop: + save out_count, STACK_OFFSET_COUNT_OUT + + load_input_scalar + save input_addr, STACK_OFFSET_CUR_INPUT + + hybrid_round_initial +1: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS) + blt 1b + + final_rotate + restore input_addr, STACK_OFFSET_CUR_INPUT + store_input_scalar + add input_addr, input_addr, #8 + + restore out_count, STACK_OFFSET_COUNT_OUT + add out_count, out_count, #1 + cmp out_count, #2 + blt outer_loop + + restore input_addr, STACK_OFFSET_INPUT + store_input_vector + + restore_vregs + restore_gprs + free_stack + ret diff --git a/tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v4.s b/tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v4.s new file mode 100644 index 0000000..a5aa8cd --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v4.s @@ -0,0 +1,1018 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +#if defined(__ARM_FEATURE_SHA3) + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: /* table of 24 64-bit entries, one .quad each */ + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 /* x0 is also named s_Aba_ and sC0 below */ + const_addr .req x29 /* x29 is also named sC4 and sE0 below */ + count .req w27 /* w-form of x27, which is also named sC2 and sE3 below */ + cur_const .req x26 /* x26 is also named sC1 and sE2 below */ + + /* Mapping of Keccak-f1600 SIMD state to vector registers + * at the beginning and end of each round.
*/ + + vAba .req v0 + vAbe .req v1 + vAbi .req v2 + vAbo .req v3 + vAbu .req v4 + vAga .req v5 + vAge .req v6 + vAgi .req v7 + vAgo .req v8 + vAgu .req v9 + vAka .req v10 + vAke .req v11 + vAki .req v12 + vAko .req v13 + vAku .req v14 + vAma .req v15 + vAme .req v16 + vAmi .req v17 + vAmo .req v18 + vAmu .req v19 + vAsa .req v20 + vAse .req v21 + vAsi .req v22 + vAso .req v23 + vAsu .req v24 + + /* q-form names of the same registers, as used by the ldr/str in load_input_vector and store_input_vector */ + vAbaq .req q0 + vAbeq .req q1 + vAbiq .req q2 + vAboq .req q3 + vAbuq .req q4 + vAgaq .req q5 + vAgeq .req q6 + vAgiq .req q7 + vAgoq .req q8 + vAguq .req q9 + vAkaq .req q10 + vAkeq .req q11 + vAkiq .req q12 + vAkoq .req q13 + vAkuq .req q14 + vAmaq .req q15 + vAmeq .req q16 + vAmiq .req q17 + vAmoq .req q18 + vAmuq .req q19 + vAsaq .req q20 + vAseq .req q21 + vAsiq .req q22 + vAsoq .req q23 + vAsuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v30 + C1 .req v29 + C2 .req v28 + C3 .req v27 + C4 .req v26 + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4; E0, E2, E3 and E4 reuse the registers of C4, C1, C2 and C3, only E1 (v25) is separate */ + E0 .req v26 + E1 .req v25 + E2 .req v29 + E3 .req v28 + E4 .req v27 + + /* A_[y,2*x+3*y] = rot(A[x,y]); vAba_ and vAbe_ are placed in v30 and v27, the registers also named C0 and C3/E4 */ + vAbi_ .req v2 + vAbo_ .req v3 + vAbu_ .req v4 + vAga_ .req v10 + vAge_ .req v11 + vAgi_ .req v7 + vAgo_ .req v8 + vAgu_ .req v9 + vAka_ .req v15 + vAke_ .req v16 + vAki_ .req v12 + vAko_ .req v13 + vAku_ .req v14 + vAma_ .req v20 + vAme_ .req v21 + vAmi_ .req v17 + vAmo_ .req v18 + vAmu_ .req v19 + vAsa_ .req v0 + vAse_ .req v1 + vAsi_ .req v22 + vAso_ .req v23 + vAsu_ .req v24 + vAba_ .req v30 + vAbe_ .req v27 + + /* Scratch register written by the rax1_m1, xar_m1 and bcax_m1 macros */ + vtmp .req v31 + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round.
*/ + s_Aba .req x1 + sAbe .req x6 + sAbi .req x11 + sAbo .req x16 + sAbu .req x21 + sAga .req x2 + sAge .req x7 + sAgi .req x12 + sAgo .req x17 + sAgu .req x22 + sAka .req x3 + sAke .req x8 + sAki .req x13 + sAko .req x18 + sAku .req x23 + sAma .req x4 + sAme .req x9 + sAmi .req x14 + sAmo .req x19 + sAmu .req x24 + sAsa .req x5 + sAse .req x10 + sAsi .req x15 + sAso .req x20 + sAsu .req x25 + + /* sA_[y,2*x+3*y] = rot(A[x,y]) */ + s_Aba_ .req x0 + sAbe_ .req x28 + sAbi_ .req x11 + sAbo_ .req x16 + sAbu_ .req x21 + sAga_ .req x3 + sAge_ .req x8 + sAgi_ .req x12 + sAgo_ .req x17 + sAgu_ .req x22 + sAka_ .req x4 + sAke_ .req x9 + sAki_ .req x13 + sAko_ .req x18 + sAku_ .req x23 + sAma_ .req x5 + sAme_ .req x10 + sAmi_ .req x14 + sAmo_ .req x19 + sAmu_ .req x24 + sAsa_ .req x1 + sAse_ .req x6 + sAsi_ .req x15 + sAso_ .req x20 + sAsu_ .req x25 + + /* sC[x] = sA[x,0] xor sA[x,1] xor sA[x,2] xor sA[x,3] xor sA[x,4], for x in 0..4 */ + /* sE[x] = sC[x-1] xor rot(C[x+1],1), for x in 0..4 */ + sC0 .req x0 + sE0 .req x29 + sC1 .req x26 + sE1 .req x30 + sC2 .req x27 + sE2 .req x26 + sC3 .req x28 + sE3 .req x27 + sC4 .req x29 + sE4 .req x28 + + tmp .req x30 + +/************************ MACROS ****************************/ + +/* Macros using v8.4-A SHA-3 instructions */ + + +.macro eor3_m1 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro rax1_m1 d s0 s1 + add vtmp.2d, \s1\().2d, \s1\().2d + sri vtmp.2d, \s1\().2d, #63 + eor \d\().16b, vtmp.16b, \s0\().16b +.endm + +.macro xar_m1 d s0 s1 imm + eor vtmp.16b, \s0\().16b, \s1\().16b + shl \d\().2d, vtmp.2d, #(64-\imm) + sri \d\().2d, vtmp.2d, #(\imm) +.endm + +.macro bcax_m1 d s0 s1 s2 + bic vtmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, vtmp.16b, \s0\().16b +.endm + +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm + xar \d\().2d, 
\s0\().2d, \s1\().2d, #\imm +.endm + +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + + +.macro load_input_vector num idx + ldr vAbaq, [input_addr, #(16*(\num*0+\idx))] + ldr vAbeq, [input_addr, #(16*(\num*1+\idx))] + ldr vAbiq, [input_addr, #(16*(\num*2+\idx))] + ldr vAboq, [input_addr, #(16*(\num*3+\idx))] + ldr vAbuq, [input_addr, #(16*(\num*4+\idx))] + ldr vAgaq, [input_addr, #(16*(\num*5+\idx))] + ldr vAgeq, [input_addr, #(16*(\num*6+\idx))] + ldr vAgiq, [input_addr, #(16*(\num*7+\idx))] + ldr vAgoq, [input_addr, #(16*(\num*8+\idx))] + ldr vAguq, [input_addr, #(16*(\num*9+\idx))] + ldr vAkaq, [input_addr, #(16*(\num*10+\idx))] + ldr vAkeq, [input_addr, #(16*(\num*11+\idx))] + ldr vAkiq, [input_addr, #(16*(\num*12+\idx))] + ldr vAkoq, [input_addr, #(16*(\num*13+\idx))] + ldr vAkuq, [input_addr, #(16*(\num*14+\idx))] + ldr vAmaq, [input_addr, #(16*(\num*15+\idx))] + ldr vAmeq, [input_addr, #(16*(\num*16+\idx))] + ldr vAmiq, [input_addr, #(16*(\num*17+\idx))] + ldr vAmoq, [input_addr, #(16*(\num*18+\idx))] + ldr vAmuq, [input_addr, #(16*(\num*19+\idx))] + ldr vAsaq, [input_addr, #(16*(\num*20+\idx))] + ldr vAseq, [input_addr, #(16*(\num*21+\idx))] + ldr vAsiq, [input_addr, #(16*(\num*22+\idx))] + ldr vAsoq, [input_addr, #(16*(\num*23+\idx))] + ldr vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_vector num idx + str vAbaq, [input_addr, #(16*(\num*0+\idx))] + str vAbeq, [input_addr, #(16*(\num*1+\idx))] + str vAbiq, [input_addr, #(16*(\num*2+\idx))] + str vAboq, [input_addr, #(16*(\num*3+\idx))] + str vAbuq, [input_addr, #(16*(\num*4+\idx))] + str vAgaq, [input_addr, #(16*(\num*5+\idx))] + str vAgeq, [input_addr, #(16*(\num*6+\idx))] + str vAgiq, [input_addr, #(16*(\num*7+\idx))] + str vAgoq, [input_addr, #(16*(\num*8+\idx))] + str vAguq, [input_addr, #(16*(\num*9+\idx))] + str vAkaq, [input_addr, #(16*(\num*10+\idx))] + str vAkeq, [input_addr, #(16*(\num*11+\idx))] + str vAkiq, [input_addr, 
#(16*(\num*12+\idx))] + str vAkoq, [input_addr, #(16*(\num*13+\idx))] + str vAkuq, [input_addr, #(16*(\num*14+\idx))] + str vAmaq, [input_addr, #(16*(\num*15+\idx))] + str vAmeq, [input_addr, #(16*(\num*16+\idx))] + str vAmiq, [input_addr, #(16*(\num*17+\idx))] + str vAmoq, [input_addr, #(16*(\num*18+\idx))] + str vAmuq, [input_addr, #(16*(\num*19+\idx))] + str vAsaq, [input_addr, #(16*(\num*20+\idx))] + str vAseq, [input_addr, #(16*(\num*21+\idx))] + str vAsiq, [input_addr, #(16*(\num*22+\idx))] + str vAsoq, [input_addr, #(16*(\num*23+\idx))] + str vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_scalar num idx + str s_Aba, [input_addr, 8*(\num*(0) +\idx)] + str sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + str sAbi, [input_addr, 8*(\num*(2)+ \idx)] + str sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + str sAbu, [input_addr, 8*(\num*(4)+ \idx)] + str sAga, [input_addr, 8*(\num*(4+1) +\idx)] + str sAge, [input_addr, 8*(\num*(6)+ \idx)] + str sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + str sAgo, [input_addr, 8*(\num*(8)+ \idx)] + str sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + str sAka, [input_addr, 8*(\num*(10) +\idx)] + str sAke, [input_addr, 8*(\num*(10+1)+\idx)] + str sAki, [input_addr, 8*(\num*(12) +\idx)] + str sAko, [input_addr, 8*(\num*(12+1)+\idx)] + str sAku, [input_addr, 8*(\num*(14) +\idx)] + str sAma, [input_addr, 8*(\num*(14+1)+\idx)] + str sAme, [input_addr, 8*(\num*(16) +\idx)] + str sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + str sAmo, [input_addr, 8*(\num*(18) +\idx)] + str sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + str sAsa, [input_addr, 8*(\num*(20) +\idx)] + str sAse, [input_addr, 8*(\num*(20+1)+\idx)] + str sAsi, [input_addr, 8*(\num*(22) +\idx)] + str sAso, [input_addr, 8*(\num*(22+1)+\idx)] + str sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +.macro load_input_scalar num idx + ldr s_Aba, [input_addr, 8*(\num*(0) +\idx)] + ldr sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + ldr sAbi, [input_addr, 8*(\num*(2)+ \idx)] + ldr sAbo, 
[input_addr, 8*(\num*(2+1) +\idx)] + ldr sAbu, [input_addr, 8*(\num*(4)+ \idx)] + ldr sAga, [input_addr, 8*(\num*(4+1) +\idx)] + ldr sAge, [input_addr, 8*(\num*(6)+ \idx)] + ldr sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + ldr sAgo, [input_addr, 8*(\num*(8)+ \idx)] + ldr sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + ldr sAka, [input_addr, 8*(\num*(10) +\idx)] + ldr sAke, [input_addr, 8*(\num*(10+1)+\idx)] + ldr sAki, [input_addr, 8*(\num*(12) +\idx)] + ldr sAko, [input_addr, 8*(\num*(12+1)+\idx)] + ldr sAku, [input_addr, 8*(\num*(14) +\idx)] + ldr sAma, [input_addr, 8*(\num*(14+1)+\idx)] + ldr sAme, [input_addr, 8*(\num*(16) +\idx)] + ldr sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + ldr sAmo, [input_addr, 8*(\num*(18) +\idx)] + ldr sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + ldr sAsa, [input_addr, 8*(\num*(20) +\idx)] + ldr sAse, [input_addr, 8*(\num*(20+1)+\idx)] + ldr sAsi, [input_addr, 8*(\num*(22) +\idx)] + ldr sAso, [input_addr, 8*(\num*(22+1)+\idx)] + ldr sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +#define STACK_SIZE (8*8 + 16*6 + 3*8 + 8) // VREGS (8*8), GPRs (16*6), count (8), const (8), input (8), padding (8) +#define STACK_BASE_GPRS (3*8+8) +#define STACK_BASE_VREGS (3*8+8+16*6) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro save_vregs + stp d8, d9, 
[sp,#(STACK_BASE_VREGS+0*16)] + stp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + stp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + stp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] +.endm + +.macro restore_vregs + ldp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] + ldp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + ldp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + ldp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] +.endm + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro eor5 dst, src0, src1, src2, src3, src4 + eor \dst, \src0, \src1 + eor \dst, \dst, \src2 + eor \dst, \dst, \src3 + eor \dst, \dst, \src4 +.endm + +.macro xor_rol dst, src1, src0, imm + eor \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro bic_rol dst, src1, src0, imm + bic \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro rotate dst, src, imm + ror \dst, \src, #(64-\imm) +.endm + +.macro save reg, offset + str \reg, [sp, #\offset] +.endm + +.macro restore reg, offset + ldr \reg, [sp, #\offset] +.endm + +.macro hybrid_round_initial + + eor sC0, sAma, sAsa SEP eor3_m1 C0, vAba, vAga, vAka + eor sC1, sAme, sAse SEP + eor sC2, sAmi, sAsi SEP + eor sC3, sAmo, sAso SEP eor3_m0 C0, C0, vAma, vAsa + eor sC4, sAmu, sAsu SEP + eor sC0, sAka, sC0 SEP + eor sC1, sAke, sC1 SEP eor3_m1 C1, vAbe, vAge, vAke + eor sC2, sAki, sC2 SEP + eor sC3, sAko, sC3 SEP + eor sC4, sAku, sC4 SEP eor3_m0 C1, C1, vAme, vAse + eor sC0, sAga, sC0 SEP + eor sC1, sAge, sC1 SEP + eor sC2, sAgi, sC2 SEP eor3_m1 C2, vAbi, vAgi, vAki + eor sC3, sAgo, sC3 SEP + eor sC4, sAgu, sC4 SEP + eor sC0, s_Aba, sC0 SEP eor3_m0 C2, C2, vAmi, vAsi + eor sC1, sAbe, sC1 SEP + eor sC2, sAbi, sC2 SEP + eor sC3, sAbo, sC3 SEP eor3_m1 C3, vAbo, vAgo, vAko + eor sC4, sAbu, sC4 SEP + SEP + eor sE1, sC0, sC2, ROR #63 SEP eor3_m0 C3, C3, vAmo, vAso + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP eor3_m1 C4, vAbu, vAgu, vAku + eor sE4, sC3, sC0, ROR #63 SEP + SEP + eor 
s_Aba_, s_Aba, sE0 SEP eor3_m0 C4, C4, vAmu, vAsu + eor sAsa_, sAbi, sE2 SEP + eor sAbi_, sAki, sE2 SEP + eor sAki_, sAko, sE3 SEP + eor sAko_, sAmu, sE4 SEP rax1_m1 E1, C0, C2 + eor sAmu_, sAso, sE3 SEP + eor sAso_, sAma, sE0 SEP + eor sAka_, sAbe, sE1 SEP rax1_m0 E3, C2, C4 + eor sAse_, sAgo, sE3 SEP + eor sAgo_, sAme, sE1 SEP + eor sAke_, sAgi, sE2 SEP rax1_m1 E0, C4, C1 + eor sAgi_, sAka, sE0 SEP + eor sAga_, sAbo, sE3 SEP + eor sAbo_, sAmo, sE3 SEP rax1_m0 E2, C1, C3 + eor sAmo_, sAmi, sE2 SEP + eor sAmi_, sAke, sE1 SEP + eor sAge_, sAgu, sE4 SEP rax1_m1 E4, C3, C0 + eor sAgu_, sAsi, sE2 SEP + eor sAsi_, sAku, sE4 SEP + eor sAku_, sAsa, sE0 SEP + eor sAma_, sAbu, sE4 SEP eor vAba_.16b, vAba.16b, E0.16b + eor sAbu_, sAsu, sE4 SEP + eor sAsu_, sAse, sE1 SEP + eor sAme_, sAga, sE0 SEP xar_m0 vAsa_, vAbi, E2, 2 + eor sAbe_, sAge, sE1 SEP + SEP + load_constant_ptr SEP xar_m1 vAbi_, vAki, E2, 21 + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP xar_m0 vAki_, vAko, E3, 39 + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP xar_m1 vAko_, vAmu, E4, 56 + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP xar_m0 vAmu_, vAso, E3, 8 + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP xar_m1 vAso_, vAma, E0, 23 + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP xar_m0 vAka_, vAbe, E1, 63 + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP xar_m1 vAse_, vAgo, E3, 9 + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP xar_m0 vAgo_, vAme, E1, 19 + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP xar_m1 vAke_, vAgi, E2, 58 + eor sAme, tmp, sAme_, ROR #43 
SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP xar_m0 vAgi_, vAka, E0, 61 + SEP + ldr cur_const, [const_addr] SEP + mov count, #1 SEP xar_m1 vAga_, vAbo, E3, 36 + SEP + bic tmp, sAma_, sAmu_, ROR #35 SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP xar_m0 vAbo_, vAmo, E3, 43 + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP xar_m1 vAmo_, vAmi, E2, 49 + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP xar_m0 vAmi_, vAke, E1, 54 + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP xar_m1 vAge_, vAgu, E4, 44 + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP xar_m0 vAgu_, vAsi, E2, 3 + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP xar_m1 vAsi_, vAku, E4, 25 + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP xar_m0 vAku_, vAsa, E0, 46 + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP xar_m1 vAma_, vAbu, E4, 37 + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP xar_m0 vAbu_, vAsu, E4, 50 + SEP + save count, STACK_OFFSET_COUNT SEP + SEP xar_m1 vAsu_, vAse, E1, 62 + eor sC0, sAka, sAsa, ROR #50 SEP + eor sC1, sAse, sAge, ROR #60 SEP + eor sC2, sAmi, sAgi, ROR #59 SEP xar_m0 vAme_, vAga, E0, 28 + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP + eor sC0, sAma, sC0, ROR #49 SEP xar_m1 vAbe_, vAge, E1, 20 + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP restore sE1, STACK_OFFSET_CONST + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP + eor sC0, sAga, sC0, ROR #57 SEP ld1r {v28.2d}, [sE1], #8 + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor 
sC3, sAko, sC3, ROR #38 SEP save sE1, STACK_OFFSET_CONST + eor sC4, sAgu, sC4, ROR #48 SEP + eor sC0, s_Aba, sC0, ROR #61 SEP bcax_m0 vAga, vAga_, vAgi_, vAge_ + eor sC1, sAke, sC1, ROR #57 SEP + eor sC2, sAsi, sC2, ROR #52 SEP + eor sC3, sAbo, sC3, ROR #63 SEP bcax_m1 vAge, vAge_, vAgo_, vAgi_ + eor sC4, sAku, sC4, ROR #50 SEP + ror sC1, sC1, 56 SEP + ror sC4, sC4, 58 SEP bcax_m0 vAgi, vAgi_, vAgu_, vAgo_ + ror sC2, sC2, 62 SEP + SEP + eor sE1, sC0, sC2, ROR #63 SEP bcax_m1 vAgo, vAgo_, vAga_, vAgu_ + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP bcax_m0 vAgu, vAgu_, vAge_, vAga_ + eor sE4, sC3, sC0, ROR #63 SEP + SEP + eor s_Aba_, sE0, s_Aba SEP bcax_m1 vAka, vAka_, vAki_, vAke_ + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP + eor sAki_, sE3, sAko, ROR #63 SEP bcax_m0 vAke, vAke_, vAko_, vAki_ + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP + eor sAso_, sE0, sAma, ROR #54 SEP bcax_m1 vAki, vAki_, vAku_, vAko_ + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP + eor sAgo_, sE1, sAme, ROR #49 SEP bcax_m0 vAko, vAko_, vAka_, vAku_ + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, ROR #39 SEP + eor sAga_, sE3, sAbo SEP bcax_m1 vAku, vAku_, vAke_, vAka_ + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP + eor sAmi_, sE1, sAke, ROR #56 SEP bcax_m0 vAma, vAma_, vAmi_, vAme_ + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP + eor sAsi_, sE4, sAku, ROR #58 SEP bcax_m1 vAme, vAme_, vAmo_, vAmi_ + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP + eor sAbu_, sE4, sAsu, ROR #9 SEP bcax_m0 vAmi, vAmi_, vAmu_, vAmo_ + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP + eor sAbe_, sE1, sAge, ROR #19 SEP bcax_m1 vAmo, vAmo_, vAma_, vAmu_ + SEP + load_constant_ptr SEP + restore count, STACK_OFFSET_COUNT SEP bcax_m0 vAmu, vAmu_, vAme_, vAma_ + SEP + bic tmp, sAgi_, sAge_, 
ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP bcax_m1 vAsa, vAsa_, vAsi_, vAse_ + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP bcax_m0 vAse, vAse_, vAso_, vAsi_ + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP bcax_m1 vAsi, vAsi_, vAsu_, vAso_ + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP bcax_m0 vAso, vAso_, vAsa_, vAsu_ + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP bcax_m1 vAsu, vAsu_, vAse_, vAsa_ + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP bcax_m0 vAba, vAba_, vAbi_, vAbe_ + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP bcax_m1 vAbe, vAbe_, vAbo_, vAbi_ + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP bcax_m0 vAbi, vAbi_, vAbu_, vAbo_ + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP bcax_m1 vAbo, vAbo_, vAba_, vAbu_ + bic tmp, sAma_, sAmu_, ROR #35 SEP + SEP + ldr cur_const, [const_addr, count, UXTW #3] SEP bcax_m0 vAbu, vAbu_, vAbe_, vAba_ + SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP eor vAba.16b, vAba.16b, v28.16b + bic tmp, sAsi_, sAse_, ROR #48 SEP + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP + eor sAbe, tmp, sAbe_, 
ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + add count, count, #1 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP + SEP +.endm + +.macro hybrid_round_noninitial + save count, STACK_OFFSET_COUNT SEP eor3_m1 C0, vAba, vAga, vAka + SEP + eor sC0, sAka, sAsa, ROR #50 SEP + eor sC1, sAse, sAge, ROR #60 SEP eor3_m0 C0, C0, vAma, vAsa + eor sC2, sAmi, sAgi, ROR #59 SEP + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP eor3_m1 C1, vAbe, vAge, vAke + eor sC0, sAma, sC0, ROR #49 SEP + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP eor3_m0 C1, C1, vAme, vAse + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP + eor sC0, sAga, sC0, ROR #57 SEP eor3_m1 C2, vAbi, vAgi, vAki + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor sC3, sAko, sC3, ROR #38 SEP eor3_m0 C2, C2, vAmi, vAsi + eor sC4, sAgu, sC4, ROR #48 SEP + eor sC0, s_Aba, sC0, ROR #61 SEP + eor sC1, sAke, sC1, ROR #57 SEP eor3_m1 C3, vAbo, vAgo, vAko + eor sC2, sAsi, sC2, ROR #52 SEP + eor sC3, sAbo, sC3, ROR #63 SEP + eor sC4, sAku, sC4, ROR #50 SEP eor3_m0 C3, C3, vAmo, vAso + ror sC1, sC1, 56 SEP + ror sC4, sC4, 58 SEP + ror sC2, sC2, 62 SEP eor3_m1 C4, vAbu, vAgu, vAku + SEP + eor sE1, sC0, sC2, ROR #63 SEP + eor sE3, sC2, sC4, ROR #63 SEP eor3_m0 C4, C4, vAmu, vAsu + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP + eor sE4, sC3, sC0, ROR #63 SEP + SEP rax1_m1 E1, C0, C2 + eor s_Aba_, sE0, s_Aba SEP + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP rax1_m0 E3, C2, C4 + eor sAki_, sE3, sAko, ROR #63 SEP + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP rax1_m1 E0, C4, C1 + eor sAso_, sE0, sAma, ROR #54 SEP + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP rax1_m0 E2, C1, C3 
+ eor sAgo_, sE1, sAme, ROR #49 SEP + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, ROR #39 SEP rax1_m1 E4, C3, C0 + eor sAga_, sE3, sAbo SEP + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP + eor sAmi_, sE1, sAke, ROR #56 SEP eor vAba_.16b, vAba.16b, E0.16b + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP + eor sAsi_, sE4, sAku, ROR #58 SEP xar_m0 vAsa_, vAbi, E2, 2 + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP + eor sAbu_, sE4, sAsu, ROR #9 SEP xar_m1 vAbi_, vAki, E2, 21 + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP + eor sAbe_, sE1, sAge, ROR #19 SEP xar_m0 vAki_, vAko, E3, 39 + SEP + load_constant_ptr SEP + restore count, STACK_OFFSET_COUNT SEP xar_m1 vAko_, vAmu, E4, 56 + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP xar_m0 vAmu_, vAso, E3, 8 + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP xar_m1 vAso_, vAma, E0, 23 + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP xar_m0 vAka_, vAbe, E1, 63 + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP xar_m1 vAse_, vAgo, E3, 9 + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP xar_m0 vAgo_, vAme, E1, 19 + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP xar_m1 vAke_, vAgi, E2, 58 + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP xar_m0 vAgi_, vAka, E0, 61 + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP xar_m1 vAga_, vAbo, E3, 36 + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP xar_m0 vAbo_, vAmo, E3, 43 + bic tmp, 
sAma_, sAmu_, ROR #35 SEP + SEP + ldr cur_const, [const_addr, count, UXTW #3] SEP xar_m1 vAmo_, vAmi, E2, 49 + add count, count, #1 SEP + SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP xar_m0 vAmi_, vAke, E1, 54 + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP xar_m1 vAge_, vAgu, E4, 44 + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP xar_m0 vAgu_, vAsi, E2, 3 + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP xar_m1 vAsi_, vAku, E4, 25 + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP xar_m0 vAku_, vAsa, E0, 46 + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP xar_m1 vAma_, vAbu, E4, 37 + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP xar_m0 vAbu_, vAsu, E4, 50 + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP xar_m1 vAsu_, vAse, E1, 62 + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP xar_m0 vAme_, vAga, E0, 28 + save count, STACK_OFFSET_COUNT SEP + SEP + eor sC0, sAka, sAsa, ROR #50 SEP xar_m1 vAbe_, vAge, E1, 20 + eor sC1, sAse, sAge, ROR #60 SEP + eor sC2, sAmi, sAgi, ROR #59 SEP + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP restore sE1, STACK_OFFSET_CONST + eor sC0, sAma, sC0, ROR #49 SEP + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP ld1r {v28.2d}, [sE1], #8 + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP + eor sC0, sAga, sC0, ROR #57 SEP save sE1, STACK_OFFSET_CONST + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor sC3, sAko, sC3, ROR #38 SEP + eor sC4, sAgu, sC4, ROR #48 SEP bcax_m0 vAga, vAga_, vAgi_, vAge_ + eor sC0, s_Aba, sC0, ROR 
#61 SEP + eor sC1, sAke, sC1, ROR #57 SEP + eor sC2, sAsi, sC2, ROR #52 SEP bcax_m1 vAge, vAge_, vAgo_, vAgi_ + eor sC3, sAbo, sC3, ROR #63 SEP + eor sC4, sAku, sC4, ROR #50 SEP + ror sC1, sC1, 56 SEP bcax_m0 vAgi, vAgi_, vAgu_, vAgo_ + ror sC4, sC4, 58 SEP + ror sC2, sC2, 62 SEP + SEP bcax_m1 vAgo, vAgo_, vAga_, vAgu_ + eor sE1, sC0, sC2, ROR #63 SEP + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP bcax_m0 vAgu, vAgu_, vAge_, vAga_ + eor sE2, sC1, sC3, ROR #63 SEP + eor sE4, sC3, sC0, ROR #63 SEP + SEP bcax_m1 vAka, vAka_, vAki_, vAke_ + eor s_Aba_, sE0, s_Aba SEP + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP bcax_m0 vAke, vAke_, vAko_, vAki_ + eor sAki_, sE3, sAko, ROR #63 SEP + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP bcax_m1 vAki, vAki_, vAku_, vAko_ + eor sAso_, sE0, sAma, ROR #54 SEP + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP bcax_m0 vAko, vAko_, vAka_, vAku_ + eor sAgo_, sE1, sAme, ROR #49 SEP + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, ROR #39 SEP bcax_m1 vAku, vAku_, vAke_, vAka_ + eor sAga_, sE3, sAbo SEP + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP bcax_m0 vAma, vAma_, vAmi_, vAme_ + eor sAmi_, sE1, sAke, ROR #56 SEP + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP bcax_m1 vAme, vAme_, vAmo_, vAmi_ + eor sAsi_, sE4, sAku, ROR #58 SEP + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP bcax_m0 vAmi, vAmi_, vAmu_, vAmo_ + eor sAbu_, sE4, sAsu, ROR #9 SEP + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP bcax_m1 vAmo, vAmo_, vAma_, vAmu_ + eor sAbe_, sE1, sAge, ROR #19 SEP + SEP + load_constant_ptr SEP bcax_m0 vAmu, vAmu_, vAme_, vAma_ + restore count, STACK_OFFSET_COUNT SEP + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP bcax_m1 vAsa, vAsa_, vAsi_, vAse_ + eor sAga, tmp, sAga_, ROR #39 SEP + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 
SEP bcax_m0 vAse, vAse_, vAso_, vAsi_ + bic tmp, sAgu_, sAgo_, ROR #16 SEP + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP bcax_m1 vAsi, vAsi_, vAsu_, vAso_ + eor sAgo, tmp, sAgo_, ROR #47 SEP + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP bcax_m0 vAso, vAso_, vAsa_, vAsu_ + bic tmp, sAki_, sAke_, ROR #19 SEP + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP bcax_m1 vAsu, vAsu_, vAse_, vAsa_ + eor sAke, tmp, sAke_, ROR #2 SEP + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP bcax_m0 vAba, vAba_, vAbi_, vAbe_ + bic tmp, sAka_, sAku_, ROR #47 SEP + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP bcax_m1 vAbe, vAbe_, vAbo_, vAbi_ + eor sAku, tmp, sAku_, ROR #52 SEP + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP bcax_m0 vAbi, vAbi_, vAbu_, vAbo_ + bic tmp, sAmo_, sAmi_, ROR #5 SEP + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP bcax_m1 vAbo, vAbo_, vAba_, vAbu_ + eor sAmi, tmp, sAmi_, ROR #46 SEP + bic tmp, sAma_, sAmu_, ROR #35 SEP + SEP bcax_m0 vAbu, vAbu_, vAbe_, vAba_ + ldr cur_const, [const_addr, count, UXTW #3] SEP + add count, count, #1 SEP + SEP eor vAba.16b, vAba.16b, v28.16b + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP + bic tmp, s_Aba_, sAbu_, ROR #50 SEP 
+ eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP + +.endm + +.macro final_rotate + ror sAga, sAga,#(64-3) + ror sAka, sAka,#(64-25) + ror sAma, sAma,#(64-10) + ror sAsa, sAsa,#(64-39) + ror sAbe, sAbe,#(64-21) + ror sAge, sAge,#(64-45) + ror sAke, sAke,#(64-8) + ror sAme, sAme,#(64-15) + ror sAse, sAse,#(64-41) + ror sAbi, sAbi,#(64-14) + ror sAgi, sAgi,#(64-61) + ror sAki, sAki,#(64-18) + ror sAmi, sAmi,#(64-56) + ror sAsi, sAsi,#(64-2) + ror sAgo, sAgo,#(64-28) + ror sAko, sAko,#(64-1) + ror sAmo, sAmo,#(64-27) + ror sAso, sAso,#(64-62) + ror sAbu, sAbu,#(64-44) + ror sAgu, sAgu,#(64-20) + ror sAku, sAku,#(64-6) + ror sAmu, sAmu,#(64-36) + ror sAsu, sAsu,#(64-55) +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.global keccak_f1600_x4_hybrid_asm_v4 +.global _keccak_f1600_x4_hybrid_asm_v4 +.text +.align 4 + +keccak_f1600_x4_hybrid_asm_v4: +_keccak_f1600_x4_hybrid_asm_v4: + alloc_stack + save_gprs + save_vregs + save input_addr, STACK_OFFSET_INPUT + + load_input_vector 2,1 + + load_constant_ptr + save const_addr, STACK_OFFSET_CONST + + // First scalar Keccak computation alongside first half of SIMD computation + load_input_scalar 4,0 + hybrid_round_initial + loop_0: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-1) + ble loop_0 + final_rotate + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4,0 + + // Second scalar Keccak computation alongside second half of SIMD computation + load_input_scalar 4,1 + hybrid_round_initial + loop_1: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-1) + ble loop_1 + final_rotate + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4, 1 + + store_input_vector 2,1 + + restore_vregs + restore_gprs + free_stack + ret + +#endif diff --git a/tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v4p.s b/tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v4p.s new file mode 100644 index 0000000..69a8718 
--- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v4p.s @@ -0,0 +1,1026 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +#if defined(__ARM_FEATURE_SHA3) + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x29 + count .req w27 + out_count .req w27 + cur_const .req x26 + + /* Mapping of Keccak-f1600 SIMD state to vector registers + * at the beginning and end of each round. 
*/ + + vAba .req v0 + vAbe .req v1 + vAbi .req v2 + vAbo .req v3 + vAbu .req v4 + vAga .req v5 + vAge .req v6 + vAgi .req v7 + vAgo .req v8 + vAgu .req v9 + vAka .req v10 + vAke .req v11 + vAki .req v12 + vAko .req v13 + vAku .req v14 + vAma .req v15 + vAme .req v16 + vAmi .req v17 + vAmo .req v18 + vAmu .req v19 + vAsa .req v20 + vAse .req v21 + vAsi .req v22 + vAso .req v23 + vAsu .req v24 + + /* q-form of the above mapping */ + vAbaq .req q0 + vAbeq .req q1 + vAbiq .req q2 + vAboq .req q3 + vAbuq .req q4 + vAgaq .req q5 + vAgeq .req q6 + vAgiq .req q7 + vAgoq .req q8 + vAguq .req q9 + vAkaq .req q10 + vAkeq .req q11 + vAkiq .req q12 + vAkoq .req q13 + vAkuq .req q14 + vAmaq .req q15 + vAmeq .req q16 + vAmiq .req q17 + vAmoq .req q18 + vAmuq .req q19 + vAsaq .req q20 + vAseq .req q21 + vAsiq .req q22 + vAsoq .req q23 + vAsuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v30 + C1 .req v29 + C2 .req v28 + C3 .req v27 + C4 .req v26 + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req v26 + E1 .req v25 + E2 .req v29 + E3 .req v28 + E4 .req v27 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vAbi_ .req v2 + vAbo_ .req v3 + vAbu_ .req v4 + vAga_ .req v10 + vAge_ .req v11 + vAgi_ .req v7 + vAgo_ .req v8 + vAgu_ .req v9 + vAka_ .req v15 + vAke_ .req v16 + vAki_ .req v12 + vAko_ .req v13 + vAku_ .req v14 + vAma_ .req v20 + vAme_ .req v21 + vAmi_ .req v17 + vAmo_ .req v18 + vAmu_ .req v19 + vAsa_ .req v0 + vAse_ .req v1 + vAsi_ .req v22 + vAso_ .req v23 + vAsu_ .req v24 + vAba_ .req v30 + vAbe_ .req v27 + + /* Scratch register clobbered by the rax1_m1, xar_m1 and bcax_m1 macros */ + vtmp .req v31 + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round. 
*/ + s_Aba .req x1 + sAbe .req x6 + sAbi .req x11 + sAbo .req x16 + sAbu .req x21 + sAga .req x2 + sAge .req x7 + sAgi .req x12 + sAgo .req x17 + sAgu .req x22 + sAka .req x3 + sAke .req x8 + sAki .req x13 + sAko .req x18 + sAku .req x23 + sAma .req x4 + sAme .req x9 + sAmi .req x14 + sAmo .req x19 + sAmu .req x24 + sAsa .req x5 + sAse .req x10 + sAsi .req x15 + sAso .req x20 + sAsu .req x25 + + /* sA_[y,2*x+3*y] = rot(A[x,y]) */ + s_Aba_ .req x0 + sAbe_ .req x28 + sAbi_ .req x11 + sAbo_ .req x16 + sAbu_ .req x21 + sAga_ .req x3 + sAge_ .req x8 + sAgi_ .req x12 + sAgo_ .req x17 + sAgu_ .req x22 + sAka_ .req x4 + sAke_ .req x9 + sAki_ .req x13 + sAko_ .req x18 + sAku_ .req x23 + sAma_ .req x5 + sAme_ .req x10 + sAmi_ .req x14 + sAmo_ .req x19 + sAmu_ .req x24 + sAsa_ .req x1 + sAse_ .req x6 + sAsi_ .req x15 + sAso_ .req x20 + sAsu_ .req x25 + + /* sC[x] = sA[x,0] xor sA[x,1] xor sA[x,2] xor sA[x,3] xor sA[x,4], for x in 0..4 */ + /* sE[x] = sC[x-1] xor rot(C[x+1],1), for x in 0..4 */ + sC0 .req x0 + sE0 .req x29 + sC1 .req x26 + sE1 .req x30 + sC2 .req x27 + sE2 .req x26 + sC3 .req x28 + sE3 .req x27 + sC4 .req x29 + sE4 .req x28 + + tmp .req x30 + +/************************ MACROS ****************************/ + +/* Macros using v8.4-A SHA-3 instructions */ + + +.macro eor3_m1 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro rax1_m1 d s0 s1 + add vtmp.2d, \s1\().2d, \s1\().2d + sri vtmp.2d, \s1\().2d, #63 + eor \d\().16b, vtmp.16b, \s0\().16b +.endm + +.macro xar_m1 d s0 s1 imm + eor vtmp.16b, \s0\().16b, \s1\().16b + shl \d\().2d, vtmp.2d, #(64-\imm) + sri \d\().2d, vtmp.2d, #(\imm) +.endm + +.macro bcax_m1 d s0 s1 s2 + bic vtmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, vtmp.16b, \s0\().16b +.endm + +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm + xar \d\().2d, 
\s0\().2d, \s1\().2d, #\imm +.endm + +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + + +.macro load_input_vector + ldr vAbaq, [input_addr, #(32*0)] + ldr vAbeq, [input_addr, #(32*0+32)] + ldr vAbiq, [input_addr, #(32*2)] + ldr vAboq, [input_addr, #(32*2+32)] + ldr vAbuq, [input_addr, #(32*4)] + ldr vAgaq, [input_addr, #(32*4+32)] + ldr vAgeq, [input_addr, #(32*6)] + ldr vAgiq, [input_addr, #(32*6+32)] + ldr vAgoq, [input_addr, #(32*8)] + ldr vAguq, [input_addr, #(32*8+32)] + ldr vAkaq, [input_addr, #(32*10)] + ldr vAkeq, [input_addr, #(32*10+32)] + ldr vAkiq, [input_addr, #(32*12)] + ldr vAkoq, [input_addr, #(32*12+32)] + ldr vAkuq, [input_addr, #(32*14)] + ldr vAmaq, [input_addr, #(32*14+32)] + ldr vAmeq, [input_addr, #(32*16)] + ldr vAmiq, [input_addr, #(32*16+32)] + ldr vAmoq, [input_addr, #(32*18)] + ldr vAmuq, [input_addr, #(32*18+32)] + ldr vAsaq, [input_addr, #(32*20)] + ldr vAseq, [input_addr, #(32*20+32)] + ldr vAsiq, [input_addr, #(32*22)] + ldr vAsoq, [input_addr, #(32*22+32)] + ldr vAsuq, [input_addr, #(32*24)] +.endm + +.macro store_input_vector + str vAbaq, [input_addr, #(32*0)] + str vAbeq, [input_addr, #(32*0+32)] + str vAbiq, [input_addr, #(32*2)] + str vAboq, [input_addr, #(32*2+32)] + str vAbuq, [input_addr, #(32*4)] + str vAgaq, [input_addr, #(32*4+32)] + str vAgeq, [input_addr, #(32*6)] + str vAgiq, [input_addr, #(32*6+32)] + str vAgoq, [input_addr, #(32*8)] + str vAguq, [input_addr, #(32*8+32)] + str vAkaq, [input_addr, #(32*10)] + str vAkeq, [input_addr, #(32*10+32)] + str vAkiq, [input_addr, #(32*12)] + str vAkoq, [input_addr, #(32*12+32)] + str vAkuq, [input_addr, #(32*14)] + str vAmaq, [input_addr, #(32*14+32)] + str vAmeq, [input_addr, #(32*16)] + str vAmiq, [input_addr, #(32*16+32)] + str vAmoq, [input_addr, #(32*18)] + str vAmuq, [input_addr, #(32*18+32)] + str vAsaq, [input_addr, #(32*20)] + str vAseq, [input_addr, #(32*20+32)] + str vAsiq, [input_addr, #(32*22)] + str vAsoq, [input_addr, 
#(32*22+32)] + str vAsuq, [input_addr, #(32*24)] +.endm + +.macro store_input_scalar + str s_Aba,[input_addr, 32*0 ] + str sAbe, [input_addr, 32*1 ] + str sAbi, [input_addr, 32*2 ] + str sAbo, [input_addr, 32*3 ] + str sAbu, [input_addr, 32*4 ] + str sAga, [input_addr, 32*5 ] + str sAge, [input_addr, 32*6 ] + str sAgi, [input_addr, 32*7 ] + str sAgo, [input_addr, 32*8 ] + str sAgu, [input_addr, 32*9 ] + str sAka, [input_addr, 32*10] + str sAke, [input_addr, 32*11] + str sAki, [input_addr, 32*12] + str sAko, [input_addr, 32*13] + str sAku, [input_addr, 32*14] + str sAma, [input_addr, 32*15] + str sAme, [input_addr, 32*16] + str sAmi, [input_addr, 32*17] + str sAmo, [input_addr, 32*18] + str sAmu, [input_addr, 32*19] + str sAsa, [input_addr, 32*20] + str sAse, [input_addr, 32*21] + str sAsi, [input_addr, 32*22] + str sAso, [input_addr, 32*23] + str sAsu, [input_addr, 32*24] +.endm + +.macro load_input_scalar + ldr s_Aba,[input_addr, 32*0 ] + ldr sAbe, [input_addr, 32*1 ] + ldr sAbi, [input_addr, 32*2 ] + ldr sAbo, [input_addr, 32*3 ] + ldr sAbu, [input_addr, 32*4 ] + ldr sAga, [input_addr, 32*5 ] + ldr sAge, [input_addr, 32*6 ] + ldr sAgi, [input_addr, 32*7 ] + ldr sAgo, [input_addr, 32*8 ] + ldr sAgu, [input_addr, 32*9 ] + ldr sAka, [input_addr, 32*10] + ldr sAke, [input_addr, 32*11] + ldr sAki, [input_addr, 32*12] + ldr sAko, [input_addr, 32*13] + ldr sAku, [input_addr, 32*14] + ldr sAma, [input_addr, 32*15] + ldr sAme, [input_addr, 32*16] + ldr sAmi, [input_addr, 32*17] + ldr sAmo, [input_addr, 32*18] + ldr sAmu, [input_addr, 32*19] + ldr sAsa, [input_addr, 32*20] + ldr sAse, [input_addr, 32*21] + ldr sAsi, [input_addr, 32*22] + ldr sAso, [input_addr, 32*23] + ldr sAsu, [input_addr, 32*24] +.endm + +#define STACK_SIZE (4*16 + 12*8 + 6*8) +#define STACK_BASE_VREGS (0) +#define STACK_BASE_GPRS (4*16) +#define STACK_BASE_TMP_GPRS (4*16 + 12*8) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) +#define 
STACK_OFFSET_COUNT_OUT (3*8) +#define STACK_OFFSET_CUR_INPUT (4*8) + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro save_vregs + stp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] + stp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + stp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + stp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] +.endm + +.macro restore_vregs + ldp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] + ldp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + ldp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + ldp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] +.endm + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro eor5 dst, src0, src1, src2, src3, src4 + eor \dst, \src0, \src1 + eor \dst, \dst, \src2 + eor \dst, \dst, \src3 + eor \dst, \dst, \src4 +.endm + +.macro xor_rol dst, src1, src0, imm + eor \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro bic_rol dst, src1, src0, imm + bic \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro rotate dst, src, imm + ror \dst, \src, #(64-\imm) +.endm + +.macro save reg, offset + str \reg, [sp, #(STACK_BASE_TMP_GPRS + \offset)] +.endm + +.macro restore reg, offset + ldr \reg, [sp, #(STACK_BASE_TMP_GPRS + \offset)] +.endm + +.macro hybrid_round_initial + + eor sC0, sAma, sAsa SEP eor3_m1 C0, vAba, vAga, vAka + eor sC1, sAme, sAse SEP + eor sC2, sAmi, sAsi SEP + eor sC3, sAmo, sAso SEP eor3_m0 C0, C0, vAma, vAsa + 
eor sC4, sAmu, sAsu SEP + eor sC0, sAka, sC0 SEP + eor sC1, sAke, sC1 SEP eor3_m1 C1, vAbe, vAge, vAke + eor sC2, sAki, sC2 SEP + eor sC3, sAko, sC3 SEP + eor sC4, sAku, sC4 SEP eor3_m0 C1, C1, vAme, vAse + eor sC0, sAga, sC0 SEP + eor sC1, sAge, sC1 SEP + eor sC2, sAgi, sC2 SEP eor3_m1 C2, vAbi, vAgi, vAki + eor sC3, sAgo, sC3 SEP + eor sC4, sAgu, sC4 SEP + eor sC0, s_Aba, sC0 SEP eor3_m0 C2, C2, vAmi, vAsi + eor sC1, sAbe, sC1 SEP + eor sC2, sAbi, sC2 SEP + eor sC3, sAbo, sC3 SEP eor3_m1 C3, vAbo, vAgo, vAko + eor sC4, sAbu, sC4 SEP + SEP + eor sE1, sC0, sC2, ROR #63 SEP eor3_m0 C3, C3, vAmo, vAso + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP eor3_m1 C4, vAbu, vAgu, vAku + eor sE4, sC3, sC0, ROR #63 SEP + SEP + eor s_Aba_, s_Aba, sE0 SEP eor3_m0 C4, C4, vAmu, vAsu + eor sAsa_, sAbi, sE2 SEP + eor sAbi_, sAki, sE2 SEP + eor sAki_, sAko, sE3 SEP + eor sAko_, sAmu, sE4 SEP rax1_m1 E1, C0, C2 + eor sAmu_, sAso, sE3 SEP + eor sAso_, sAma, sE0 SEP + eor sAka_, sAbe, sE1 SEP rax1_m0 E3, C2, C4 + eor sAse_, sAgo, sE3 SEP + eor sAgo_, sAme, sE1 SEP + eor sAke_, sAgi, sE2 SEP rax1_m1 E0, C4, C1 + eor sAgi_, sAka, sE0 SEP + eor sAga_, sAbo, sE3 SEP + eor sAbo_, sAmo, sE3 SEP rax1_m0 E2, C1, C3 + eor sAmo_, sAmi, sE2 SEP + eor sAmi_, sAke, sE1 SEP + eor sAge_, sAgu, sE4 SEP rax1_m1 E4, C3, C0 + eor sAgu_, sAsi, sE2 SEP + eor sAsi_, sAku, sE4 SEP + eor sAku_, sAsa, sE0 SEP + eor sAma_, sAbu, sE4 SEP eor vAba_.16b, vAba.16b, E0.16b + eor sAbu_, sAsu, sE4 SEP + eor sAsu_, sAse, sE1 SEP + eor sAme_, sAga, sE0 SEP xar_m0 vAsa_, vAbi, E2, 2 + eor sAbe_, sAge, sE1 SEP + SEP + load_constant_ptr SEP xar_m1 vAbi_, vAki, E2, 21 + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP xar_m0 vAki_, vAko, E3, 39 + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP xar_m1 vAko_, vAmu, E4, 56 + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 
SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP xar_m0 vAmu_, vAso, E3, 8 + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP xar_m1 vAso_, vAma, E0, 23 + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP xar_m0 vAka_, vAbe, E1, 63 + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP xar_m1 vAse_, vAgo, E3, 9 + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP xar_m0 vAgo_, vAme, E1, 19 + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP xar_m1 vAke_, vAgi, E2, 58 + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP xar_m0 vAgi_, vAka, E0, 61 + SEP + ldr cur_const, [const_addr] SEP + mov count, #1 SEP xar_m1 vAga_, vAbo, E3, 36 + SEP + bic tmp, sAma_, sAmu_, ROR #35 SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP xar_m0 vAbo_, vAmo, E3, 43 + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP xar_m1 vAmo_, vAmi, E2, 49 + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP xar_m0 vAmi_, vAke, E1, 54 + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP xar_m1 vAge_, vAgu, E4, 44 + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP xar_m0 vAgu_, vAsi, E2, 3 + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP xar_m1 vAsi_, vAku, E4, 25 + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP xar_m0 vAku_, vAsa, E0, 46 + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 
SEP xar_m1 vAma_, vAbu, E4, 37 + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP xar_m0 vAbu_, vAsu, E4, 50 + SEP + save count, STACK_OFFSET_COUNT SEP + SEP xar_m1 vAsu_, vAse, E1, 62 + eor sC0, sAka, sAsa, ROR #50 SEP + eor sC1, sAse, sAge, ROR #60 SEP + eor sC2, sAmi, sAgi, ROR #59 SEP xar_m0 vAme_, vAga, E0, 28 + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP + eor sC0, sAma, sC0, ROR #49 SEP xar_m1 vAbe_, vAge, E1, 20 + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP restore sE1, STACK_OFFSET_CONST + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP + eor sC0, sAga, sC0, ROR #57 SEP ld1r {v28.2d}, [sE1], #8 + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor sC3, sAko, sC3, ROR #38 SEP save sE1, STACK_OFFSET_CONST + eor sC4, sAgu, sC4, ROR #48 SEP + eor sC0, s_Aba, sC0, ROR #61 SEP bcax_m0 vAga, vAga_, vAgi_, vAge_ + eor sC1, sAke, sC1, ROR #57 SEP + eor sC2, sAsi, sC2, ROR #52 SEP + eor sC3, sAbo, sC3, ROR #63 SEP bcax_m1 vAge, vAge_, vAgo_, vAgi_ + eor sC4, sAku, sC4, ROR #50 SEP + ror sC1, sC1, 56 SEP + ror sC4, sC4, 58 SEP bcax_m0 vAgi, vAgi_, vAgu_, vAgo_ + ror sC2, sC2, 62 SEP + SEP + eor sE1, sC0, sC2, ROR #63 SEP bcax_m1 vAgo, vAgo_, vAga_, vAgu_ + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP bcax_m0 vAgu, vAgu_, vAge_, vAga_ + eor sE4, sC3, sC0, ROR #63 SEP + SEP + eor s_Aba_, sE0, s_Aba SEP bcax_m1 vAka, vAka_, vAki_, vAke_ + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP + eor sAki_, sE3, sAko, ROR #63 SEP bcax_m0 vAke, vAke_, vAko_, vAki_ + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP + eor sAso_, sE0, sAma, ROR #54 SEP bcax_m1 vAki, vAki_, vAku_, vAko_ + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP + eor sAgo_, sE1, sAme, ROR #49 SEP bcax_m0 vAko, vAko_, vAka_, vAku_ + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, 
ROR #39 SEP + eor sAga_, sE3, sAbo SEP bcax_m1 vAku, vAku_, vAke_, vAka_ + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP + eor sAmi_, sE1, sAke, ROR #56 SEP bcax_m0 vAma, vAma_, vAmi_, vAme_ + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP + eor sAsi_, sE4, sAku, ROR #58 SEP bcax_m1 vAme, vAme_, vAmo_, vAmi_ + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP + eor sAbu_, sE4, sAsu, ROR #9 SEP bcax_m0 vAmi, vAmi_, vAmu_, vAmo_ + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP + eor sAbe_, sE1, sAge, ROR #19 SEP bcax_m1 vAmo, vAmo_, vAma_, vAmu_ + SEP + load_constant_ptr SEP + restore count, STACK_OFFSET_COUNT SEP bcax_m0 vAmu, vAmu_, vAme_, vAma_ + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP bcax_m1 vAsa, vAsa_, vAsi_, vAse_ + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP bcax_m0 vAse, vAse_, vAso_, vAsi_ + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP bcax_m1 vAsi, vAsi_, vAsu_, vAso_ + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP bcax_m0 vAso, vAso_, vAsa_, vAsu_ + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP bcax_m1 vAsu, vAsu_, vAse_, vAsa_ + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP bcax_m0 vAba, vAba_, vAbi_, vAbe_ + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP bcax_m1 vAbe, vAbe_, vAbo_, vAbi_ + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP bcax_m0 vAbi, vAbi_, vAbu_, vAbo_ + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP bcax_m1 vAbo, vAbo_, vAba_, 
vAbu_ + bic tmp, sAma_, sAmu_, ROR #35 SEP + SEP + ldr cur_const, [const_addr, count, UXTW #3] SEP bcax_m0 vAbu, vAbu_, vAbe_, vAba_ + SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP eor vAba.16b, vAba.16b, v28.16b + bic tmp, sAsi_, sAse_, ROR #48 SEP + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + add count, count, #1 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP + SEP +.endm + +.macro hybrid_round_noninitial + save count, STACK_OFFSET_COUNT SEP eor3_m1 C0, vAba, vAga, vAka + SEP + eor sC0, sAka, sAsa, ROR #50 SEP + eor sC1, sAse, sAge, ROR #60 SEP eor3_m0 C0, C0, vAma, vAsa + eor sC2, sAmi, sAgi, ROR #59 SEP + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP eor3_m1 C1, vAbe, vAge, vAke + eor sC0, sAma, sC0, ROR #49 SEP + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP eor3_m0 C1, C1, vAme, vAse + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP + eor sC0, sAga, sC0, ROR #57 SEP eor3_m1 C2, vAbi, vAgi, vAki + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor sC3, sAko, sC3, ROR #38 SEP eor3_m0 C2, C2, vAmi, vAsi + eor sC4, sAgu, sC4, ROR #48 SEP + eor sC0, s_Aba, sC0, ROR #61 SEP + eor sC1, sAke, sC1, ROR #57 SEP eor3_m1 C3, vAbo, vAgo, vAko + eor sC2, sAsi, sC2, ROR #52 SEP + eor 
sC3, sAbo, sC3, ROR #63 SEP + eor sC4, sAku, sC4, ROR #50 SEP eor3_m0 C3, C3, vAmo, vAso + ror sC1, sC1, 56 SEP + ror sC4, sC4, 58 SEP + ror sC2, sC2, 62 SEP eor3_m1 C4, vAbu, vAgu, vAku + SEP + eor sE1, sC0, sC2, ROR #63 SEP + eor sE3, sC2, sC4, ROR #63 SEP eor3_m0 C4, C4, vAmu, vAsu + eor sE0, sC4, sC1, ROR #63 SEP + eor sE2, sC1, sC3, ROR #63 SEP + eor sE4, sC3, sC0, ROR #63 SEP + SEP rax1_m1 E1, C0, C2 + eor s_Aba_, sE0, s_Aba SEP + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP rax1_m0 E3, C2, C4 + eor sAki_, sE3, sAko, ROR #63 SEP + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP rax1_m1 E0, C4, C1 + eor sAso_, sE0, sAma, ROR #54 SEP + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP rax1_m0 E2, C1, C3 + eor sAgo_, sE1, sAme, ROR #49 SEP + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, ROR #39 SEP rax1_m1 E4, C3, C0 + eor sAga_, sE3, sAbo SEP + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP + eor sAmi_, sE1, sAke, ROR #56 SEP eor vAba_.16b, vAba.16b, E0.16b + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP + eor sAsi_, sE4, sAku, ROR #58 SEP xar_m0 vAsa_, vAbi, E2, 2 + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP + eor sAbu_, sE4, sAsu, ROR #9 SEP xar_m1 vAbi_, vAki, E2, 21 + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP + eor sAbe_, sE1, sAge, ROR #19 SEP xar_m0 vAki_, vAko, E3, 39 + SEP + load_constant_ptr SEP + restore count, STACK_OFFSET_COUNT SEP xar_m1 vAko_, vAmu, E4, 56 + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP + eor sAga, tmp, sAga_, ROR #39 SEP xar_m0 vAmu_, vAso, E3, 8 + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP + bic tmp, sAgu_, sAgo_, ROR #16 SEP xar_m1 vAso_, vAma, E0, 23 + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP + eor sAgo, tmp, sAgo_, ROR #47 SEP xar_m0 vAka_, vAbe, E1, 63 + bic tmp, sAge_, sAga_, ROR #56 SEP + eor 
sAgu, tmp, sAgu_, ROR #23 SEP + bic tmp, sAki_, sAke_, ROR #19 SEP xar_m1 vAse_, vAgo, E3, 9 + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP + eor sAke, tmp, sAke_, ROR #2 SEP xar_m0 vAgo_, vAme, E1, 19 + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP + bic tmp, sAka_, sAku_, ROR #47 SEP xar_m1 vAke_, vAgi, E2, 58 + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP + eor sAku, tmp, sAku_, ROR #52 SEP xar_m0 vAgi_, vAka, E0, 61 + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP + bic tmp, sAmo_, sAmi_, ROR #5 SEP xar_m1 vAga_, vAbo, E3, 36 + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP + eor sAmi, tmp, sAmi_, ROR #46 SEP xar_m0 vAbo_, vAmo, E3, 43 + bic tmp, sAma_, sAmu_, ROR #35 SEP + SEP + ldr cur_const, [const_addr, count, UXTW #3] SEP xar_m1 vAmo_, vAmi, E2, 49 + add count, count, #1 SEP + SEP + eor sAmo, tmp, sAmo_, ROR #12 SEP xar_m0 vAmi_, vAke, E1, 54 + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP xar_m1 vAge_, vAgu, E4, 44 + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP xar_m0 vAgu_, vAsi, E2, 3 + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP xar_m1 vAsi_, vAku, E4, 25 + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP xar_m0 vAku_, vAsa, E0, 46 + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP xar_m1 vAma_, vAbu, E4, 37 + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP xar_m0 vAbu_, vAsu, E4, 50 + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP xar_m1 vAsu_, vAse, E1, 62 + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + eor s_Aba, 
s_Aba, cur_const SEP xar_m0 vAme_, vAga, E0, 28 + save count, STACK_OFFSET_COUNT SEP + SEP + eor sC0, sAka, sAsa, ROR #50 SEP xar_m1 vAbe_, vAge, E1, 20 + eor sC1, sAse, sAge, ROR #60 SEP + eor sC2, sAmi, sAgi, ROR #59 SEP + eor sC3, sAgo, sAso, ROR #30 SEP + eor sC4, sAbu, sAsu, ROR #53 SEP restore sE1, STACK_OFFSET_CONST + eor sC0, sAma, sC0, ROR #49 SEP + eor sC1, sAbe, sC1, ROR #44 SEP + eor sC2, sAki, sC2, ROR #26 SEP ld1r {v28.2d}, [sE1], #8 + eor sC3, sAmo, sC3, ROR #63 SEP + eor sC4, sAmu, sC4, ROR #56 SEP + eor sC0, sAga, sC0, ROR #57 SEP save sE1, STACK_OFFSET_CONST + eor sC1, sAme, sC1, ROR #58 SEP + eor sC2, sAbi, sC2, ROR #60 SEP + eor sC3, sAko, sC3, ROR #38 SEP + eor sC4, sAgu, sC4, ROR #48 SEP bcax_m0 vAga, vAga_, vAgi_, vAge_ + eor sC0, s_Aba, sC0, ROR #61 SEP + eor sC1, sAke, sC1, ROR #57 SEP + eor sC2, sAsi, sC2, ROR #52 SEP bcax_m1 vAge, vAge_, vAgo_, vAgi_ + eor sC3, sAbo, sC3, ROR #63 SEP + eor sC4, sAku, sC4, ROR #50 SEP + ror sC1, sC1, 56 SEP bcax_m0 vAgi, vAgi_, vAgu_, vAgo_ + ror sC4, sC4, 58 SEP + ror sC2, sC2, 62 SEP + SEP bcax_m1 vAgo, vAgo_, vAga_, vAgu_ + eor sE1, sC0, sC2, ROR #63 SEP + eor sE3, sC2, sC4, ROR #63 SEP + eor sE0, sC4, sC1, ROR #63 SEP bcax_m0 vAgu, vAgu_, vAge_, vAga_ + eor sE2, sC1, sC3, ROR #63 SEP + eor sE4, sC3, sC0, ROR #63 SEP + SEP bcax_m1 vAka, vAka_, vAki_, vAke_ + eor s_Aba_, sE0, s_Aba SEP + eor sAsa_, sE2, sAbi, ROR #50 SEP + eor sAbi_, sE2, sAki, ROR #46 SEP bcax_m0 vAke, vAke_, vAko_, vAki_ + eor sAki_, sE3, sAko, ROR #63 SEP + eor sAko_, sE4, sAmu, ROR #28 SEP + eor sAmu_, sE3, sAso, ROR #2 SEP bcax_m1 vAki, vAki_, vAku_, vAko_ + eor sAso_, sE0, sAma, ROR #54 SEP + eor sAka_, sE1, sAbe, ROR #43 SEP + eor sAse_, sE3, sAgo, ROR #36 SEP bcax_m0 vAko, vAko_, vAka_, vAku_ + eor sAgo_, sE1, sAme, ROR #49 SEP + eor sAke_, sE2, sAgi, ROR #3 SEP + eor sAgi_, sE0, sAka, ROR #39 SEP bcax_m1 vAku, vAku_, vAke_, vAka_ + eor sAga_, sE3, sAbo SEP + eor sAbo_, sE3, sAmo, ROR #37 SEP + eor sAmo_, sE2, sAmi, ROR #8 SEP 
bcax_m0 vAma, vAma_, vAmi_, vAme_ + eor sAmi_, sE1, sAke, ROR #56 SEP + eor sAge_, sE4, sAgu, ROR #44 SEP + eor sAgu_, sE2, sAsi, ROR #62 SEP bcax_m1 vAme, vAme_, vAmo_, vAmi_ + eor sAsi_, sE4, sAku, ROR #58 SEP + eor sAku_, sE0, sAsa, ROR #25 SEP + eor sAma_, sE4, sAbu, ROR #20 SEP bcax_m0 vAmi, vAmi_, vAmu_, vAmo_ + eor sAbu_, sE4, sAsu, ROR #9 SEP + eor sAsu_, sE1, sAse, ROR #23 SEP + eor sAme_, sE0, sAga, ROR #61 SEP bcax_m1 vAmo, vAmo_, vAma_, vAmu_ + eor sAbe_, sE1, sAge, ROR #19 SEP + SEP + load_constant_ptr SEP bcax_m0 vAmu, vAmu_, vAme_, vAma_ + restore count, STACK_OFFSET_COUNT SEP + SEP + bic tmp, sAgi_, sAge_, ROR #47 SEP bcax_m1 vAsa, vAsa_, vAsi_, vAse_ + eor sAga, tmp, sAga_, ROR #39 SEP + bic tmp, sAgo_, sAgi_, ROR #42 SEP + eor sAge, tmp, sAge_, ROR #25 SEP bcax_m0 vAse, vAse_, vAso_, vAsi_ + bic tmp, sAgu_, sAgo_, ROR #16 SEP + eor sAgi, tmp, sAgi_, ROR #58 SEP + bic tmp, sAga_, sAgu_, ROR #31 SEP bcax_m1 vAsi, vAsi_, vAsu_, vAso_ + eor sAgo, tmp, sAgo_, ROR #47 SEP + bic tmp, sAge_, sAga_, ROR #56 SEP + eor sAgu, tmp, sAgu_, ROR #23 SEP bcax_m0 vAso, vAso_, vAsa_, vAsu_ + bic tmp, sAki_, sAke_, ROR #19 SEP + eor sAka, tmp, sAka_, ROR #24 SEP + bic tmp, sAko_, sAki_, ROR #47 SEP bcax_m1 vAsu, vAsu_, vAse_, vAsa_ + eor sAke, tmp, sAke_, ROR #2 SEP + bic tmp, sAku_, sAko_, ROR #10 SEP + eor sAki, tmp, sAki_, ROR #57 SEP bcax_m0 vAba, vAba_, vAbi_, vAbe_ + bic tmp, sAka_, sAku_, ROR #47 SEP + eor sAko, tmp, sAko_, ROR #57 SEP + bic tmp, sAke_, sAka_, ROR #5 SEP bcax_m1 vAbe, vAbe_, vAbo_, vAbi_ + eor sAku, tmp, sAku_, ROR #52 SEP + bic tmp, sAmi_, sAme_, ROR #38 SEP + eor sAma, tmp, sAma_, ROR #47 SEP bcax_m0 vAbi, vAbi_, vAbu_, vAbo_ + bic tmp, sAmo_, sAmi_, ROR #5 SEP + eor sAme, tmp, sAme_, ROR #43 SEP + bic tmp, sAmu_, sAmo_, ROR #41 SEP bcax_m1 vAbo, vAbo_, vAba_, vAbu_ + eor sAmi, tmp, sAmi_, ROR #46 SEP + bic tmp, sAma_, sAmu_, ROR #35 SEP + SEP bcax_m0 vAbu, vAbu_, vAbe_, vAba_ + ldr cur_const, [const_addr, count, UXTW #3] SEP + add count, 
count, #1 SEP + SEP eor vAba.16b, vAba.16b, v28.16b + eor sAmo, tmp, sAmo_, ROR #12 SEP + bic tmp, sAme_, sAma_, ROR #9 SEP + eor sAmu, tmp, sAmu_, ROR #44 SEP + bic tmp, sAsi_, sAse_, ROR #48 SEP + eor sAsa, tmp, sAsa_, ROR #41 SEP + bic tmp, sAso_, sAsi_, ROR #2 SEP + eor sAse, tmp, sAse_, ROR #50 SEP + bic tmp, sAsu_, sAso_, ROR #25 SEP + eor sAsi, tmp, sAsi_, ROR #27 SEP + bic tmp, sAsa_, sAsu_, ROR #60 SEP + eor sAso, tmp, sAso_, ROR #21 SEP + bic tmp, sAse_, sAsa_, ROR #57 SEP + eor sAsu, tmp, sAsu_, ROR #53 SEP + bic tmp, sAbi_, sAbe_, ROR #63 SEP + eor s_Aba, s_Aba_, tmp, ROR #21 SEP + bic tmp, sAbo_, sAbi_, ROR #42 SEP + eor sAbe, tmp, sAbe_, ROR #41 SEP + bic tmp, sAbu_, sAbo_, ROR #57 SEP + eor sAbi, tmp, sAbi_, ROR #35 SEP + bic tmp, s_Aba_, sAbu_, ROR #50 SEP + eor sAbo, tmp, sAbo_, ROR #43 SEP + bic tmp, sAbe_, s_Aba_, ROR #44 SEP + eor sAbu, tmp, sAbu_, ROR #30 SEP + SEP + eor s_Aba, s_Aba, cur_const SEP + +.endm + +.macro final_rotate + ror sAga, sAga,#(64-3) + ror sAka, sAka,#(64-25) + ror sAma, sAma,#(64-10) + ror sAsa, sAsa,#(64-39) + ror sAbe, sAbe,#(64-21) + ror sAge, sAge,#(64-45) + ror sAke, sAke,#(64-8) + ror sAme, sAme,#(64-15) + ror sAse, sAse,#(64-41) + ror sAbi, sAbi,#(64-14) + ror sAgi, sAgi,#(64-61) + ror sAki, sAki,#(64-18) + ror sAmi, sAmi,#(64-56) + ror sAsi, sAsi,#(64-2) + ror sAgo, sAgo,#(64-28) + ror sAko, sAko,#(64-1) + ror sAmo, sAmo,#(64-27) + ror sAso, sAso,#(64-62) + ror sAbu, sAbu,#(64-44) + ror sAgu, sAgu,#(64-20) + ror sAku, sAku,#(64-6) + ror sAmu, sAmu,#(64-36) + ror sAsu, sAsu,#(64-55) +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.global keccak_f1600_x4_hybrid_asm_v4p +.global _keccak_f1600_x4_hybrid_asm_v4p +.text +.align 4 + +keccak_f1600_x4_hybrid_asm_v4p: +_keccak_f1600_x4_hybrid_asm_v4p: + alloc_stack + save_gprs + save_vregs + save input_addr, STACK_OFFSET_INPUT + + ASM_LOAD(const_addr,round_constants) + save const_addr, STACK_OFFSET_CONST + + load_input_vector + + add input_addr, input_addr, #16 + + mov 
out_count, #0 +outer_loop: + save out_count, STACK_OFFSET_COUNT_OUT + + load_input_scalar + save input_addr, STACK_OFFSET_CUR_INPUT + + hybrid_round_initial +1: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS) + blt 1b + + final_rotate + restore input_addr, STACK_OFFSET_CUR_INPUT + store_input_scalar + add input_addr, input_addr, #8 + + restore out_count, STACK_OFFSET_COUNT_OUT + add out_count, out_count, #1 + cmp out_count, #2 + blt outer_loop + + restore input_addr, STACK_OFFSET_INPUT + store_input_vector + + restore_vregs + restore_gprs + free_stack + ret + +#endif diff --git a/tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v5.s b/tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v5.s new file mode 100644 index 0000000..8a16f20 --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v5.s @@ -0,0 +1,1360 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +round_constants_vec: + .quad 0x0000000000000001 + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 
0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + .quad 0x8000000080008008 +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x29 + count .req w27 + cur_const .req x26 + + /* Mapping of Keccak-f1600 SIMD state to vector registers + * at the beginning and end of each round. */ + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. */ + vAba .req v0 + vAbe .req v1 + vAbi .req v2 + vAbo .req v3 + vAbu .req v4 + vAga .req v5 + vAge .req v6 + vAgi .req v7 + vAgo .req v8 + vAgu .req v9 + vAka .req v10 + vAke .req v11 + vAki .req v12 + vAko .req v13 + vAku .req v14 + vAma .req v15 + vAme .req v16 + vAmi .req v17 + vAmo .req v18 + vAmu .req v19 + vAsa .req v20 + vAse .req v21 + vAsi .req v22 + vAso .req v23 + vAsu .req v24 + + /* q-form of the above mapping */ + vAbaq .req q0 + vAbeq .req q1 + vAbiq .req q2 + vAboq .req q3 + vAbuq .req q4 + vAgaq .req q5 + vAgeq .req q6 + vAgiq .req q7 + vAgoq .req q8 + vAguq .req q9 + vAkaq .req q10 + vAkeq .req q11 + vAkiq .req q12 + vAkoq .req q13 + vAkuq .req q14 + vAmaq .req q15 + vAmeq .req q16 + vAmiq .req q17 + vAmoq .req q18 + vAmuq .req q19 + vAsaq .req q20 + vAseq .req q21 + vAsiq .req q22 + vAsoq .req q23 + vAsuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v27 + C1 .req v28 + C2 .req v29 + C3 .req v30 + C4 .req v31 + + C0q .req q27 + C1q .req q28 + C2q .req q29 + C3q .req q30 + C4q .req q31 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vBba .req v25 // fresh + vBbe .req v26 // fresh + vBbi .req vAbi + vBbo .req
vAbo + vBbu .req vAbu + vBga .req vAka + vBge .req vAke + vBgi .req vAgi + vBgo .req vAgo + vBgu .req vAgu + vBka .req vAma + vBke .req vAme + vBki .req vAki + vBko .req vAko + vBku .req vAku + vBma .req vAsa + vBme .req vAse + vBmi .req vAmi + vBmo .req vAmo + vBmu .req vAmu + vBsa .req vAba + vBse .req vAbe + vBsi .req vAsi + vBso .req vAso + vBsu .req vAsu + + vBbaq .req q25 // fresh + vBbeq .req q26 // fresh + vBbiq .req vAbiq + vBboq .req vAboq + vBbuq .req vAbuq + vBgaq .req vAkaq + vBgeq .req vAkeq + vBgiq .req vAgiq + vBgoq .req vAgoq + vBguq .req vAguq + vBkaq .req vAmaq + vBkeq .req vAmeq + vBkiq .req vAkiq + vBkoq .req vAkoq + vBkuq .req vAkuq + vBmaq .req vAsaq + vBmeq .req vAseq + vBmiq .req vAmiq + vBmoq .req vAmoq + vBmuq .req vAmuq + vBsaq .req vAbaq + vBseq .req vAbeq + vBsiq .req vAsiq + vBsoq .req vAsoq + vBsuq .req vAsuq + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req C4 + E1 .req C0 + E2 .req vBbe // fresh + E3 .req C2 + E4 .req C3 + + E0q .req C4q + E1q .req C0q + E2q .req vBbeq // fresh + E3q .req C2q + E4q .req C3q + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round.
*/ + s_Aba .req x1 + sAbe .req x6 + sAbi .req x11 + sAbo .req x16 + sAbu .req x21 + sAga .req x2 + sAge .req x7 + sAgi .req x12 + sAgo .req x17 + sAgu .req x22 + sAka .req x3 + sAke .req x8 + sAki .req x13 + sAko .req x18 + sAku .req x23 + sAma .req x4 + sAme .req x9 + sAmi .req x14 + sAmo .req x19 + sAmu .req x24 + sAsa .req x5 + sAse .req x10 + sAsi .req x15 + sAso .req x20 + sAsu .req x25 + + /* sA_[y,2*x+3*y] = rot(A[x,y]) */ + s_Aba_ .req x0 + sAbe_ .req x28 + sAbi_ .req x11 + sAbo_ .req x16 + sAbu_ .req x21 + sAga_ .req x3 + sAge_ .req x8 + sAgi_ .req x12 + sAgo_ .req x17 + sAgu_ .req x22 + sAka_ .req x4 + sAke_ .req x9 + sAki_ .req x13 + sAko_ .req x18 + sAku_ .req x23 + sAma_ .req x5 + sAme_ .req x10 + sAmi_ .req x14 + sAmo_ .req x19 + sAmu_ .req x24 + sAsa_ .req x1 + sAse_ .req x6 + sAsi_ .req x15 + sAso_ .req x20 + sAsu_ .req x25 + + /* sC[x] = sA[x,0] xor sA[x,1] xor sA[x,2] xor sA[x,3] xor sA[x,4], for x in 0..4 */ + /* sE[x] = sC[x-1] xor rot(sC[x+1],1), for x in 0..4 */ + sC0 .req x0 + sE0 .req x29 + sC1 .req x26 + sE1 .req x30 + sC2 .req x27 + sE2 .req x26 + sC3 .req x28 + sE3 .req x27 + sC4 .req x29 + sE4 .req x28 + + tmp .req x30 + +/************************ MACROS ****************************/ + +/* Neon-only replacements for the v8.4-A SHA-3 instructions (EOR3, RAX1, XAR, BCAX) */ + +.macro eor3_m1_0 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m1_1 d s0 s1 s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm + + +.macro eor3_m1 d s0 s1 s2 + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +.macro rax1_m1 d s0 s1 + // Use add instead of SHL #1 + add vvtmp.2d, \s1\().2d, \s1\().2d + sri vvtmp.2d, \s1\().2d, #63 + eor \d\().16b, vvtmp.16b, \s0\().16b +.endm + + .macro xar_m1 d s0 s1 imm + // Special cases where we can replace SHLs by ADDs + .if \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + .elseif \imm
== 62 + eor \s0\().16b, \s0\().16b, \s1\().16b + add \d\().2d, \s0\().2d, \s0\().2d + add \d\().2d, \d\().2d, \d\().2d + sri \d\().2d, \s0\().2d, #(62) + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + + .macro xar_m1_0 d s0 s1 imm + // XOR half of xar_m1: every branch emits the same EOR into s0 + .if \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + .elseif \imm == 62 + eor \s0\().16b, \s0\().16b, \s1\().16b + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + .endif +.endm + + .macro xar_m1_1 d s0 s1 imm + // Rotate half of xar_m1; special cases where we can replace SHLs by ADDs + .if \imm == 63 + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + .elseif \imm == 62 + add \d\().2d, \s0\().2d, \s0\().2d + add \d\().2d, \d\().2d, \d\().2d + sri \d\().2d, \s0\().2d, #(62) + .else + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + +.macro bcax_m1 d s0 s1 s2 + bic vvtmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, vvtmp.16b, \s0\().16b +.endm + +.macro load_input_vector num idx + ldr vAbaq, [input_addr, #(16*(\num*0+\idx))] + ldr vAbeq, [input_addr, #(16*(\num*1+\idx))] + ldr vAbiq, [input_addr, #(16*(\num*2+\idx))] + ldr vAboq, [input_addr, #(16*(\num*3+\idx))] + ldr vAbuq, [input_addr, #(16*(\num*4+\idx))] + ldr vAgaq, [input_addr, #(16*(\num*5+\idx))] + ldr vAgeq, [input_addr, #(16*(\num*6+\idx))] + ldr vAgiq, [input_addr, #(16*(\num*7+\idx))] + ldr vAgoq, [input_addr, #(16*(\num*8+\idx))] + ldr vAguq, [input_addr, #(16*(\num*9+\idx))] + ldr vAkaq, [input_addr, #(16*(\num*10+\idx))] + ldr vAkeq, [input_addr, #(16*(\num*11+\idx))] + ldr vAkiq, [input_addr, #(16*(\num*12+\idx))] + ldr vAkoq, [input_addr, #(16*(\num*13+\idx))] + ldr vAkuq, [input_addr, #(16*(\num*14+\idx))] + ldr vAmaq, [input_addr, #(16*(\num*15+\idx))] + ldr vAmeq, [input_addr, #(16*(\num*16+\idx))] + ldr vAmiq, [input_addr, #(16*(\num*17+\idx))] + ldr vAmoq, [input_addr,
#(16*(\num*18+\idx))] + ldr vAmuq, [input_addr, #(16*(\num*19+\idx))] + ldr vAsaq, [input_addr, #(16*(\num*20+\idx))] + ldr vAseq, [input_addr, #(16*(\num*21+\idx))] + ldr vAsiq, [input_addr, #(16*(\num*22+\idx))] + ldr vAsoq, [input_addr, #(16*(\num*23+\idx))] + ldr vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_vector num idx + str vAbaq, [input_addr, #(16*(\num*0+\idx))] + str vAbeq, [input_addr, #(16*(\num*1+\idx))] + str vAbiq, [input_addr, #(16*(\num*2+\idx))] + str vAboq, [input_addr, #(16*(\num*3+\idx))] + str vAbuq, [input_addr, #(16*(\num*4+\idx))] + str vAgaq, [input_addr, #(16*(\num*5+\idx))] + str vAgeq, [input_addr, #(16*(\num*6+\idx))] + str vAgiq, [input_addr, #(16*(\num*7+\idx))] + str vAgoq, [input_addr, #(16*(\num*8+\idx))] + str vAguq, [input_addr, #(16*(\num*9+\idx))] + str vAkaq, [input_addr, #(16*(\num*10+\idx))] + str vAkeq, [input_addr, #(16*(\num*11+\idx))] + str vAkiq, [input_addr, #(16*(\num*12+\idx))] + str vAkoq, [input_addr, #(16*(\num*13+\idx))] + str vAkuq, [input_addr, #(16*(\num*14+\idx))] + str vAmaq, [input_addr, #(16*(\num*15+\idx))] + str vAmeq, [input_addr, #(16*(\num*16+\idx))] + str vAmiq, [input_addr, #(16*(\num*17+\idx))] + str vAmoq, [input_addr, #(16*(\num*18+\idx))] + str vAmuq, [input_addr, #(16*(\num*19+\idx))] + str vAsaq, [input_addr, #(16*(\num*20+\idx))] + str vAseq, [input_addr, #(16*(\num*21+\idx))] + str vAsiq, [input_addr, #(16*(\num*22+\idx))] + str vAsoq, [input_addr, #(16*(\num*23+\idx))] + str vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_scalar num idx + str s_Aba, [input_addr, 8*(\num*(0) +\idx)] + str sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + str sAbi, [input_addr, 8*(\num*(2)+ \idx)] + str sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + str sAbu, [input_addr, 8*(\num*(4)+ \idx)] + str sAga, [input_addr, 8*(\num*(4+1) +\idx)] + str sAge, [input_addr, 8*(\num*(6)+ \idx)] + str sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + str sAgo, [input_addr, 8*(\num*(8)+ 
\idx)] + str sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + str sAka, [input_addr, 8*(\num*(10) +\idx)] + str sAke, [input_addr, 8*(\num*(10+1)+\idx)] + str sAki, [input_addr, 8*(\num*(12) +\idx)] + str sAko, [input_addr, 8*(\num*(12+1)+\idx)] + str sAku, [input_addr, 8*(\num*(14) +\idx)] + str sAma, [input_addr, 8*(\num*(14+1)+\idx)] + str sAme, [input_addr, 8*(\num*(16) +\idx)] + str sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + str sAmo, [input_addr, 8*(\num*(18) +\idx)] + str sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + str sAsa, [input_addr, 8*(\num*(20) +\idx)] + str sAse, [input_addr, 8*(\num*(20+1)+\idx)] + str sAsi, [input_addr, 8*(\num*(22) +\idx)] + str sAso, [input_addr, 8*(\num*(22+1)+\idx)] + str sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + /* load_input_scalar: reads the 25 scalar registers s_Aba..sAsu from input_addr; register k (0..24, in the order listed) comes from byte offset 8*(num*k+idx). */ +.macro load_input_scalar num idx + ldr s_Aba, [input_addr, 8*(\num*(0) +\idx)] + ldr sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + ldr sAbi, [input_addr, 8*(\num*(2)+ \idx)] + ldr sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + ldr sAbu, [input_addr, 8*(\num*(4)+ \idx)] + ldr sAga, [input_addr, 8*(\num*(4+1) +\idx)] + ldr sAge, [input_addr, 8*(\num*(6)+ \idx)] + ldr sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + ldr sAgo, [input_addr, 8*(\num*(8)+ \idx)] + ldr sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + ldr sAka, [input_addr, 8*(\num*(10) +\idx)] + ldr sAke, [input_addr, 8*(\num*(10+1)+\idx)] + ldr sAki, [input_addr, 8*(\num*(12) +\idx)] + ldr sAko, [input_addr, 8*(\num*(12+1)+\idx)] + ldr sAku, [input_addr, 8*(\num*(14) +\idx)] + ldr sAma, [input_addr, 8*(\num*(14+1)+\idx)] + ldr sAme, [input_addr, 8*(\num*(16) +\idx)] + ldr sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + ldr sAmo, [input_addr, 8*(\num*(18) +\idx)] + ldr sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + ldr sAsa, [input_addr, 8*(\num*(20) +\idx)] + ldr sAse, [input_addr, 8*(\num*(20+1)+\idx)] + ldr sAsi, [input_addr, 8*(\num*(22) +\idx)] + ldr sAso, [input_addr, 8*(\num*(22+1)+\idx)] + ldr sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + /* Stack frame size in bytes. The same sum without its final 16*34 term is STACK_BASE_TMP, i.e. the 16*34 bytes are the area addressed from STACK_BASE_TMP. */ +#define STACK_SIZE (8*8 + 16*6 + 3*8 
+ 8 + 16*34) // VREGS (8*8), GPRs (16*6), count (8), const (8), input (8), padding (8), TMP area (16*34) +#define STACK_BASE_GPRS (3*8+8) +#define STACK_BASE_VREGS (3*8+8+16*6) +#define STACK_BASE_TMP (8*8 + 16*6 + 3*8 + 8) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) + /* Slot numbers, in 16-byte units from STACK_BASE_TMP, consumed by the save(name)/restore(name) preprocessor macros below. Slot 6 is unassigned and slot 21 is commented out. NOTE(review): the round macros also address slots 30, 31 and 32 with literal indices, and slot 30 is vBge_offset here - confirm the two uses never overlap in time. */ +#define vAga_offset 0 +#define E0_offset 1 +#define E1_offset 2 +#define E2_offset 3 +#define E3_offset 4 +#define E4_offset 5 +#define Ame_offset 7 +#define Agi_offset 8 +#define Aka_offset 9 +#define Abo_offset 10 +#define Amo_offset 11 +#define Ami_offset 12 +#define Ake_offset 13 +#define Agu_offset 14 +#define Asi_offset 15 +#define Aku_offset 16 +#define Asa_offset 17 +#define Abu_offset 18 +#define Asu_offset 19 +#define Ase_offset 20 +//#define Aga_offset 21 +#define Age_offset 22 +#define vBgo_offset 23 +#define vBke_offset 24 +#define vBgi_offset 25 +#define vBga_offset 26 +#define vBbo_offset 27 +#define vBmo_offset 28 +#define vBmi_offset 29 +#define vBge_offset 30 + /* Preprocessor forms: spill or reload the q-register view of name (name with a q suffix) at TMP slot name_offset. They only expand when written with parentheses; the two-operand forms without parentheses reach the assembler macros of the same name. */ +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] + + /* save_gprs / restore_gprs: store and reload x19..x30 as six pairs starting at sp + STACK_BASE_GPRS. */ +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + /* save_vregs: stores d8..d15 (the low 64 bits of v8..v15) as four pairs starting at sp + STACK_BASE_VREGS. */ +.macro save_vregs + stp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] + stp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + stp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + stp d14, d15, 
[sp,#(STACK_BASE_VREGS+3*16)] +.endm + /* restore_vregs: reloads d8..d15 from sp + STACK_BASE_VREGS, in the reverse pair order of save_vregs. */ +.macro restore_vregs + ldp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] + ldp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + ldp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + ldp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] +.endm + /* alloc_stack / free_stack: move sp down or up by STACK_SIZE bytes. */ +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + /* eor5: dst = src0 ^ src1 ^ src2 ^ src3 ^ src4. dst is overwritten by the first eor, so it must not alias src2, src3 or src4. */ +.macro eor5 dst, src0, src1, src2, src3, src4 + eor \dst, \src0, \src1 + eor \dst, \dst, \src2 + eor \dst, \dst, \src3 + eor \dst, \dst, \src4 +.endm + /* xor_rol: dst = src0 ^ (src1 rotated left by imm), encoded as ROR by 64-imm. Note the parameter order: src1 comes before src0. imm = 0 would produce ROR #64, which is not an encodable amount. */ +.macro xor_rol dst, src1, src0, imm + eor \dst, \src0, \src1, ROR #(64-\imm) +.endm + /* bic_rol: dst = src0 & ~(src1 rotated left by imm); same operand order and imm restriction as xor_rol. */ +.macro bic_rol dst, src1, src0, imm + bic \dst, \src0, \src1, ROR #(64-\imm) +.endm + /* rotate: dst = src rotated left by imm (ror by 64-imm). */ +.macro rotate dst, src, imm + ror \dst, \src, #(64-\imm) +.endm + /* Assembler forms of save/restore: store or load reg at sp + offset (a byte offset). Written without parentheses they are not touched by the preprocessor macros of the same name. */ +.macro save reg, offset + str \reg, [sp, #\offset] +.endm + +.macro restore reg, offset + ldr \reg, [sp, #\offset] +.endm + /* hybrid_round_initial: every body line is a scalar instruction, then SEP, then optionally a vector macro, instruction or .req/.unreq directive. SEP is defined outside this chunk - presumably a statement separator; confirm. */ +.macro hybrid_round_initial +eor sC0, sAma, sAsa SEP eor3_m1_0 C1,vAbe,vAge,vAke +eor sC1, sAme, sAse SEP +eor sC2, sAmi, sAsi SEP eor3_m1_0 C3,vAbo,vAgo,vAko +eor sC3, sAmo, sAso SEP +eor sC4, sAmu, sAsu SEP eor3_m1_0 C0,vAba,vAga,vAka +eor sC0, sAka, sC0 SEP +eor sC1, sAke, sC1 SEP eor3_m1_0 C2,vAbi,vAgi,vAki +eor sC2, sAki, sC2 SEP +eor sC3, sAko, sC3 SEP eor3_m1_0 C4,vAbu,vAgu,vAku +eor sC4, sAku, sC4 SEP +eor sC0, sAga, sC0 SEP eor3_m1_1 C1,vAbe,vAge,vAke +eor sC1, sAge, sC1 SEP eor3_m1_1 C3,vAbo,vAgo,vAko +eor sC2, sAgi, sC2 SEP +eor sC3, sAgo, sC3 SEP eor3_m1_1 C0,vAba,vAga,vAka +eor sC4, sAgu, sC4 SEP +eor sC0, s_Aba, sC0 SEP eor3_m1_1 C2,vAbi,vAgi,vAki +eor sC1, sAbe, sC1 SEP +eor sC2, sAbi, sC2 SEP eor3_m1_1 C4,vAbu,vAgu,vAku +eor sC3, sAbo, sC3 SEP +eor sC4, sAbu, sC4 SEP eor3_m1_0 C1, C1,vAme, vAse +eor sE1, sC0, sC2, ROR #63 SEP eor3_m1_0 C3, C3,vAmo, vAso +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP eor3_m1_0 C0, C0,vAma, vAsa +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP eor3_m1_0 C2, C2,vAmi, vAsi +eor s_Aba_, s_Aba, sE0 SEP +eor sAsa_, sAbi, 
sE2 SEP eor3_m1_0 C4, C4,vAmu, vAsu +eor sAbi_, sAki, sE2 SEP +eor sAki_, sAko, sE3 SEP eor3_m1_1 C1, C1,vAme, vAse +eor sAko_, sAmu, sE4 SEP eor3_m1_1 C3, C3,vAmo, vAso +eor sAmu_, sAso, sE3 SEP +eor sAso_, sAma, sE0 SEP eor3_m1_1 C0, C0,vAma, vAsa +eor sAka_, sAbe, sE1 SEP +eor sAse_, sAgo, sE3 SEP eor3_m1_1 C2, C2,vAmi, vAsi +eor sAgo_, sAme, sE1 SEP +eor sAke_, sAgi, sE2 SEP eor3_m1_1 C4, C4,vAmu, vAsu +eor sAgi_, sAka, sE0 SEP +eor sAga_, sAbo, sE3 SEP vvtmp .req vBba /* bind the scratch alias vvtmp to vBba across the rax1_m1 expansions; rax1_m1 is defined outside this chunk and presumably uses vvtmp - confirm */ +eor sAbo_, sAmo, sE3 SEP rax1_m1 E2, C1, C3 +eor sAmo_, sAmi, sE2 SEP +eor sAmi_, sAke, sE1 SEP rax1_m1 E4, C3, C0 +eor sAge_, sAgu, sE4 SEP +eor sAgu_, sAsi, sE2 SEP rax1_m1 E1, C0, C2 +eor sAsi_, sAku, sE4 SEP +eor sAku_, sAsa, sE0 SEP rax1_m1 E3, C2, C4 +eor sAma_, sAbu, sE4 SEP +eor sAbu_, sAsu, sE4 SEP str vAgiq, [sp, #(STACK_BASE_TMP + 16*32)] /* park vAgi in TMP slot 32; it is reloaded through vvtmpq for the xar_m1 that writes vBke */ +eor sAsu_, sAse, sE1 SEP rax1_m1 E0, C4, C1 +eor sAme_, sAga, sE0 SEP +eor sAbe_, sAge, sE1 SEP /* 25x XAR, 75 in total */ +load_constant_ptr SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP .unreq vvtmp +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP vvtmp .req C1 /* rebind the scratch alias to C1 for the xar_m1 and bcax_m1 expansions that follow */ +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP vvtmpq .req C1q +eor sAgi, tmp, sAgi_, ROR #58 SEP xar_m1 vBgi, vAka, E0, 61 +bic tmp, sAga_, sAgu_, ROR #31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP xar_m1 vBga, vAbo, E3, 36 +bic tmp, sAge_, sAga_, ROR #56 SEP +eor sAgu, tmp, sAgu_, ROR #23 SEP str vAgaq, [sp, #(STACK_BASE_TMP + 16 * 30)] /* TMP slot 30: read back by the ldp below as the xar_m1 input for vBme; vAga itself is overwritten by a bcax_m1 before then */ +bic tmp, sAki_, sAke_, ROR #19 SEP +eor sAka, tmp, sAka_, ROR #24 SEP xar_m1 vBbo, vAmo, E3, 43 +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP xar_m1 vBmo, vAmi, E2, 49 +bic tmp, sAku_, sAko_, ROR #10 SEP str vAgeq, [sp, #(STACK_BASE_TMP + 16 * 31)] /* TMP slot 31: read back into E3 by the same ldp as the xar_m1 input for vBbe */ +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP xar_m1 vBmi, vAke, E1, 54 +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP xar_m1 vBge, vAgu, E4, 44 +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, 
ROR #38 SEP bcax_m1 vAga, vBga, vBgi, vBge +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP eor vBba.16b, vAba.16b, E0.16b +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP xar_m1 vBsa, vAbi, E2, 2 +eor sAmi, tmp, sAmi_, ROR #46 SEP xar_m1 vBbi, vAki, E2, 21 +ldr cur_const, [const_addr] SEP +mov count, #1 SEP xar_m1 vBki, vAko, E3, 39 +bic tmp, sAma_, sAmu_, ROR #35 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP xar_m1 vBko, vAmu, E4, 56 +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP xar_m1 vBmu, vAso, E3, 8 +bic tmp, sAsi_, sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP xar_m1 vBso, vAma, E0, 23 +bic tmp, sAso_, sAsi_, ROR #2 SEP xar_m1 vBka, vAbe, E1, 63 +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP xar_m1 vBse, vAgo, E3, 9 +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP xar_m1 vBgo, vAme, E1, 19 +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP bcax_m1 vAge, vBge, vBgo, vBgi +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP ldr vvtmpq, [sp, #(STACK_BASE_TMP + 16*32)] /* reload the vAgi value parked in TMP slot 32 */ +eor s_Aba, s_Aba_, tmp, ROR #21 SEP xar_m1 vBke, vvtmp, E2, 58 +bic tmp, sAbo_, sAbi_, ROR #42 SEP +eor sAbe, tmp, sAbe_, ROR #41 SEP xar_m1 vBgu, vAsi, E2, 3 +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP bcax_m1 vAgi, vBgi, vBgu, vBgo +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP xar_m1 vBsi, vAku, E4, 25 +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP xar_m1 vBku, vAsa, E0, 46 +eor s_Aba, s_Aba, cur_const SEP xar_m1 vBma, vAbu, E4, 37 +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP xar_m1 vBbu, vAsu, E4, 50 +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP xar_m1 vBsu, vAse, E1, 62 +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP ldp vvtmpq, E3q, [sp, #(STACK_BASE_TMP + 16*30)] /* slot 30 (the parked vAga) into vvtmp and slot 31 (the parked vAge) into E3; the previous E3 value is overwritten here */ +eor sC0, sAma, sC0, ROR #49 SEP 
+eor sC1, sAbe, sC1, ROR #44 SEP xar_m1 vBme, vvtmp, E0, 28 +eor sC2, sAki, sC2, ROR #26 SEP xar_m1 vBbe, E3, E1, 20 +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP /* 25x BCAX, 50 in total */ +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP bcax_m1 vAgo, vBgo, vBga, vBgu +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP bcax_m1 vAgu, vBgu, vBge, vBga +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP bcax_m1 vAka, vBka, vBki, vBke +eor sC1, sAke, sC1, ROR #57 SEP bcax_m1 vAke, vBke, vBko, vBki +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP .unreq vvtmp +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP .unreq vvtmpq +ror sC4, sC4, 58 SEP +ror sC2, sC2, 62 SEP eor2 C0, vAka, vAga +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP save(vAga) /* preprocessor form: spill the new vAga to TMP slot vAga_offset so the register can serve as vvtmp until restore(vAga) */ +eor sE0, sC4, sC1, ROR #63 SEP vvtmp .req vAga +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP vvtmpq .req vAgaq +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP bcax_m1 vAki, vBki, vBku, vBko +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP bcax_m1 vAko, vBko, vBka, vBku +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP eor2 C1, vAke, vAge +eor sAso_, sE0, sAma, ROR #54 SEP bcax_m1 vAku, vBku, vBke, vBka +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP eor2 C2, vAki, vAgi +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP bcax_m1 vAma, vBma, vBmi, vBme +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP eor2 C3, vAko, vAgo +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP bcax_m1 vAme, vBme, vBmo, vBmi +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP eor2 C4, vAku, vAgu +eor sAgu_, sE2, sAsi, ROR #62 SEP bcax_m1 vAmi, vBmi, vBmu, vBmo +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP eor2 C0, C0, vAma +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, 
sE4, sAsu, ROR #9 SEP bcax_m1 vAmo, vBmo, vBma, vBmu +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP eor2 C1, C1, vAme +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP bcax_m1 vAmu, vBmu, vBme, vBma +restore count, STACK_OFFSET_COUNT SEP eor2 C2, C2, vAmi +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP bcax_m1 vAsa, vBsa, vBsi, vBse +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP eor2 C3, C3, vAmo +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP bcax_m1 vAse, vBse, vBso, vBsi +bic tmp, sAga_, sAgu_, ROR #31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP eor2 C4, C4, vAmu +bic tmp, sAge_, sAga_, ROR #56 SEP bcax_m1 vAsi, vBsi, vBsu, vBso +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP eor2 C0, C0, vAsa +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP bcax_m1 vAso, vBso, vBsa, vBsu +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP eor2 C1, C1, vAse +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP bcax_m1 vAsu, vBsu, vBse, vBsa +eor sAko, tmp, sAko_, ROR #57 SEP eor2 C2, C2, vAsi +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP eor2 C3, C3, vAso +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP bcax_m1 vAba, vBba, vBbi, vBbe +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP bcax_m1 vAbe, vBbe, vBbo, vBbi +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP eor2 C1, C1, vAbe +bic tmp, sAma_, sAmu_, ROR #35 SEP restore x26, STACK_OFFSET_CONST +eor sAmo, tmp, sAmo_, ROR #12 SEP ldr vvtmpq, [x26], #16 +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP save x26, STACK_OFFSET_CONST /* the three x26 steps fetch 16 bytes through the pointer kept in the STACK_OFFSET_CONST slot and write the pointer back advanced by 16; the fetched value is XORed into vAba below. NOTE(review): x26 is clobbered as scratch here - presumably it is dead at this point (its alias is not visible in this chunk); confirm */ +bic tmp, sAsi_, sAse_, ROR #48 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP eor vAba.16b, vAba.16b, vvtmp.16b +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP 
eor2 C4, C4, vAsu +bic tmp, sAsu_, sAso_, ROR #25 SEP bcax_m1 vAbi, vBbi, vBbu, vBbo +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP bcax_m1 vAbo, vBbo, vBba, vBbu +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP eor2 C3, C3, vAbo +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP eor2 C2, C2, vAbi +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP eor2 C0, C0, vAba +eor sAbe, tmp, sAbe_, ROR #41 SEP bcax_m1 vAbu, vBbu, vBbe, vBba +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP eor2 C4, C4, vAbu +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP restore(vAga) /* preprocessor form: reload vAga from TMP slot vAga_offset now that its use as vvtmp is over */ +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP .unreq vvtmp +add count, count, #1 SEP +eor s_Aba, s_Aba, cur_const SEP .unreq vvtmpq +.endm + /* hybrid_round_noninitial: same line layout (scalar instruction, SEP, optional vector item). Its scalar stream opens with the ROR-shifted column XORs and its vector stream opens directly with rax1_m1 on C0..C4. */ + +.macro hybrid_round_noninitial +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP vvtmp .req vBba +eor sC1, sAse, sAge, ROR #60 SEP rax1_m1 E2, C1, C3 +eor sC2, sAmi, sAgi, ROR #59 SEP rax1_m1 E4, C3, C0 +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP rax1_m1 E1, C0, C2 +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP rax1_m1 E3, C2, C4 +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP str vAgiq, [sp, #(STACK_BASE_TMP + 16*32)] +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP rax1_m1 E0, C4, C1 +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP .unreq vvtmp +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP vvtmp .req C1 +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP vvtmpq .req C1q +ror sC4, sC4, 58 SEP +ror sC2, sC2, 62 SEP xar_m1 vBgi, vAka, E0, 61 +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP xar_m1 vBga, vAbo, E3, 36 +eor sE0, sC4, sC1, 
ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP str vAgaq, [sp, #(STACK_BASE_TMP + 16 * 30)] /* TMP slot 30: read back by the ldp below as the xar_m1 input for vBme; vAga itself is overwritten by a bcax_m1 before then */ +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP xar_m1 vBbo, vAmo, E3, 43 +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP xar_m1 vBmo, vAmi, E2, 49 +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP str vAgeq, [sp, #(STACK_BASE_TMP + 16 * 31)] /* TMP slot 31: read back into E3 by the same ldp as the xar_m1 input for vBbe */ +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP xar_m1 vBmi, vAke, E1, 54 +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP xar_m1 vBge, vAgu, E4, 44 +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP bcax_m1 vAga, vBga, vBgi, vBge +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP eor vBba.16b, vAba.16b, E0.16b +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP xar_m1 vBsa, vAbi, E2, 2 +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP xar_m1 vBbi, vAki, E2, 21 +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP xar_m1 vBki, vAko, E3, 39 +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP xar_m1 vBko, vAmu, E4, 56 +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP xar_m1 vBmu, vAso, E3, 8 +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP xar_m1 vBso, vAma, E0, 23 +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP xar_m1 vBka, vAbe, E1, 63 +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP xar_m1 vBse, vAgo, E3, 9 +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP xar_m1 vBgo, vAme, E1, 19 +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP bcax_m1 vAge, vBge, vBgo, vBgi +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR 
#2 SEP ldr vvtmpq, [sp, #(STACK_BASE_TMP + 16*32)] /* reload the value that an earlier str parked in TMP slot 32 */ +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP xar_m1 vBke, vvtmp, E2, 58 +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP xar_m1 vBgu, vAsi, E2, 3 +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP bcax_m1 vAgi, vBgi, vBgu, vBgo +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP xar_m1 vBsi, vAku, E4, 25 +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP xar_m1 vBku, vAsa, E0, 46 +eor sAmi, tmp, sAmi_, ROR #46 SEP +bic tmp, sAma_, sAmu_, ROR #35 SEP xar_m1 vBma, vAbu, E4, 37 +ldr cur_const, [const_addr, count, UXTW #3] SEP +add count, count, #1 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP xar_m1 vBbu, vAsu, E4, 50 +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP xar_m1 vBsu, vAse, E1, 62 +bic tmp, sAsi_, sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP ldp vvtmpq, E3q, [sp, #(STACK_BASE_TMP + 16*30)] /* slot 30 (the parked vAga) into vvtmp and slot 31 (the parked vAge) into E3; the previous E3 value is overwritten here */ +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP xar_m1 vBme, vvtmp, E0, 28 +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP xar_m1 vBbe, E3, E1, 20 +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP bcax_m1 vAgo, vBgo, vBga, vBgu +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m1 vAgu, vBgu, vBge, vBga +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP bcax_m1 vAka, vBka, vBki, vBke +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP bcax_m1 vAke, vBke, vBko, vBki +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP .unreq vvtmp +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP .unreq vvtmpq +eor s_Aba, s_Aba, cur_const SEP +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP eor2 C0, vAka, vAga +eor 
sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP save(vAga) /* preprocessor form: spill the new vAga to TMP slot vAga_offset so the register can serve as vvtmp until restore(vAga) */ +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP vvtmp .req vAga +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP vvtmpq .req vAgaq +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP bcax_m1 vAki, vBki, vBku, vBko +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP bcax_m1 vAko, vBko, vBka, vBku +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP eor2 C1, vAke, vAge +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP bcax_m1 vAku, vBku, vBke, vBka +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP eor2 C2, vAki, vAgi +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP bcax_m1 vAma, vBma, vBmi, vBme +ror sC4, sC4, 58 SEP +ror sC2, sC2, 62 SEP eor2 C3, vAko, vAgo +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP bcax_m1 vAme, vBme, vBmo, vBmi +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP eor2 C4, vAku, vAgu +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP bcax_m1 vAmi, vBmi, vBmu, vBmo +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP eor2 C0, C0, vAma +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP bcax_m1 vAmo, vBmo, vBma, vBmu +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP eor2 C1, C1, vAme +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP bcax_m1 vAmu, vBmu, vBme, vBma +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP eor2 C2, C2, vAmi +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP bcax_m1 vAsa, vBsa, vBsi, vBse +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP eor2 C3, C3, vAmo +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, 
ROR #25 SEP bcax_m1 vAse, vBse, vBso, vBsi +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP eor2 C4, C4, vAmu +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP bcax_m1 vAsi, vBsi, vBsu, vBso +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP eor2 C0, C0, vAsa +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP bcax_m1 vAso, vBso, vBsa, vBsu +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP eor2 C1, C1, vAse +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP bcax_m1 vAsu, vBsu, vBse, vBsa +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP eor2 C2, C2, vAsi +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP eor2 C3, C3, vAso +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP bcax_m1 vAba, vBba, vBbi, vBbe +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP bcax_m1 vAbe, vBbe, vBbo, vBbi +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP eor2 C1, C1, vAbe +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP restore x26, STACK_OFFSET_CONST +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP ldr vvtmpq, [x26], #16 +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP save x26, STACK_OFFSET_CONST /* the three x26 steps fetch 16 bytes through the pointer kept in the STACK_OFFSET_CONST slot and write the pointer back advanced by 16; the fetched value is XORed into vAba below. NOTE(review): x26 is clobbered as scratch here - presumably it is dead at this point (its alias is not visible in this chunk); confirm */ +eor sAmi, tmp, sAmi_, ROR #46 SEP +bic tmp, sAma_, sAmu_, ROR #35 SEP eor vAba.16b, vAba.16b, vvtmp.16b +ldr cur_const, [const_addr, count, UXTW #3] SEP +add count, count, #1 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP eor2 C4, C4, vAsu +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP bcax_m1 vAbi, vBbi, vBbu, vBbo +bic tmp, sAsi_, sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP bcax_m1 vAbo, vBbo, vBba, vBbu +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, 
sAse_, ROR #50 SEP eor2 C3, C3, vAbo +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP eor2 C2, C2, vAbi +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP eor2 C0, C0, vAba +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m1 vAbu, vBbu, vBbe, vBba +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP eor2 C4, C4, vAbu +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP restore(vAga) /* preprocessor form: reload vAga from TMP slot vAga_offset now that its use as vvtmp is over */ +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP .unreq vvtmp +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP .unreq vvtmpq +eor s_Aba, s_Aba, cur_const SEP +.endm +.macro hybrid_round_final + SEP vvtmp .req vBba +save count, STACK_OFFSET_COUNT SEP rax1_m1 E2, C1, C3 +eor sC0, sAka, sAsa, ROR #50 SEP +eor sC1, sAse, sAge, ROR #60 SEP rax1_m1 E4, C3, C0 +eor sC2, sAmi, sAgi, ROR #59 SEP +eor sC3, sAgo, sAso, ROR #30 SEP rax1_m1 E1, C0, C2 +eor sC4, sAbu, sAsu, ROR #53 SEP +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP rax1_m1 E3, C2, C4 +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP str vAgiq, [sp, #(STACK_BASE_TMP + 16*32)] +eor sC3, sAbo, sC3, ROR #63 SEP +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP rax1_m1 E0, C4, C1 +ror sC4, sC4, 58 SEP +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP .unreq vvtmp +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP vvtmp .req C1 +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, 
sAki, ROR #46 SEP vvtmpq .req C1q +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP xar_m1 vBgi, vAka, E0, 61 +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP xar_m1 vBga, vAbo, E3, 36 +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP +eor sAbo_, sE3, sAmo, ROR #37 SEP str vAgaq, [sp, #(STACK_BASE_TMP + 16 * 30)] +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP xar_m1 vBbo, vAmo, E3, 43 +eor sAgu_, sE2, sAsi, ROR #62 SEP +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP xar_m1 vBmo, vAmi, E2, 49 +eor sAbu_, sE4, sAsu, ROR #9 SEP +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP str vAgeq, [sp, #(STACK_BASE_TMP + 16 * 31)] +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP xar_m1 vBmi, vAke, E1, 54 +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP xar_m1 vBge, vAgu, E4, 44 +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP bcax_m1 vAga, vBga, vBgi, vBge +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP eor vBba.16b, vAba.16b, E0.16b +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP xar_m1 vBsa, vAbi, E2, 2 +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP xar_m1 vBbi, vAki, E2, 21 +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP xar_m1 vBki, vAko, E3, 39 +eor sAma, tmp, 
sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP xar_m1 vBko, vAmu, E4, 56 +eor sAmi, tmp, sAmi_, ROR #46 SEP +bic tmp, sAma_, sAmu_, ROR #35 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP xar_m1 vBmu, vAso, E3, 8 +add count, count, #1 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP xar_m1 vBso, vAma, E0, 23 +bic tmp, sAsi_, sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP xar_m1 vBka, vAbe, E1, 63 +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP xar_m1 vBse, vAgo, E3, 9 +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP xar_m1 vBgo, vAme, E1, 19 +bic tmp, sAbi_, sAbe_, ROR #63 SEP +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP bcax_m1 vAge, vBge, vBgo, vBgi +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP ldr vvtmpq, [sp, #(STACK_BASE_TMP + 16*32)] +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP xar_m1 vBke, vvtmp, E2, 58 +eor s_Aba, s_Aba, cur_const SEP +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP +eor sC1, sAse, sAge, ROR #60 SEP xar_m1 vBgu, vAsi, E2, 3 +eor sC2, sAmi, sAgi, ROR #59 SEP +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP bcax_m1 vAgi, vBgi, vBgu, vBgo +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 SEP xar_m1 vBsi, vAku, E4, 25 +eor sC4, sAmu, sC4, ROR #56 SEP +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP xar_m1 vBku, vAsa, E0, 46 +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, 
sAgu, sC4, ROR #48 SEP xar_m1 vBma, vAbu, E4, 37 +eor sC0, s_Aba, sC0, ROR #61 SEP +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP xar_m1 vBbu, vAsu, E4, 50 +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP xar_m1 vBsu, vAse, E1, 62 +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP ldp vvtmpq, E3q, [sp, #(STACK_BASE_TMP + 16*30)] +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP xar_m1 vBme, vvtmp, E0, 28 +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP xar_m1 vBbe, E3, E1, 20 +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP bcax_m1 vAgo, vBgo, vBga, vBgu +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP bcax_m1 vAgu, vBgu, vBge, vBga +eor sAga_, sE3, sAbo SEP +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP bcax_m1 vAka, vBka, vBki, vBke +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP bcax_m1 vAke, vBke, vBko, vBki +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP bcax_m1 vAki, vBki, vBku, vBko +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP bcax_m1 vAko, vBko, vBka, vBku +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP bcax_m1 vAku, vBku, vBke, vBka +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP bcax_m1 vAma, vBma, vBmi, vBme +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR 
#31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP bcax_m1 vAme, vBme, vBmo, vBmi +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP +eor sAka, tmp, sAka_, ROR #24 SEP bcax_m1 vAmi, vBmi, vBmu, vBmo +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP bcax_m1 vAmo, vBmo, vBma, vBmu +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP bcax_m1 vAmu, vBmu, vBme, vBma +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP bcax_m1 vAsa, vBsa, vBsi, vBse +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP bcax_m1 vAse, vBse, vBso, vBsi +bic tmp, sAma_, sAmu_, ROR #35 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP +add count, count, #1 SEP bcax_m1 vAsi, vBsi, vBsu, vBso +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP bcax_m1 vAso, vBso, vBsa, vBsu +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP bcax_m1 vAsu, vBsu, vBse, vBsa +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP bcax_m1 vAba, vBba, vBbi, vBbe +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m1 vAbe, vBbe, vBbo, vBbi +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP bcax_m1 vAbi, vBbi, vBbu, vBbo +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP bcax_m1 vAbo, vBbo, vBba, vBbu +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, 
ROR #30 SEP +eor s_Aba, s_Aba, cur_const SEP bcax_m1 vAbu, vBbu, vBbe, vBba +ror sAga, sAga,(64-3) SEP +ror sAka, sAka,(64-25) SEP +ror sAma, sAma,(64-10) SEP +ror sAsa, sAsa,(64-39) SEP restore x26, STACK_OFFSET_CONST +ror sAbe, sAbe,(64-21) SEP +ror sAge, sAge,(64-45) SEP +ror sAke, sAke,(64-8) SEP ldr vvtmpq, [x26], #16 +ror sAme, sAme,(64-15) SEP +ror sAse, sAse,(64-41) SEP +ror sAbi, sAbi,(64-14) SEP +ror sAgi, sAgi,(64-61) SEP save x26, STACK_OFFSET_CONST +ror sAki, sAki,(64-18) SEP +ror sAmi, sAmi,(64-56) SEP +ror sAsi, sAsi,(64-2) SEP eor vAba.16b, vAba.16b, vvtmp.16b +ror sAgo, sAgo,(64-28) SEP +ror sAko, sAko,(64-1) SEP +ror sAmo, sAmo,(64-27) SEP +ror sAso, sAso,(64-62) SEP .unreq vvtmp +ror sAbu, sAbu,(64-44) SEP +ror sAgu, sAgu,(64-20) SEP +ror sAku, sAku,(64-6) SEP .unreq vvtmpq +ror sAmu, sAmu,(64-36) SEP +ror sAsu, sAsu,(64-55) SEP +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.global keccak_f1600_x4_hybrid_asm_v5 +.global _keccak_f1600_x4_hybrid_asm_v5 +.text +.align 4 + +keccak_f1600_x4_hybrid_asm_v5: +_keccak_f1600_x4_hybrid_asm_v5: + alloc_stack + save_gprs + save_vregs + save input_addr, STACK_OFFSET_INPUT + + + ASM_LOAD(const_addr,round_constants_vec) + + save const_addr, STACK_OFFSET_CONST + load_input_vector 2,1 + + // First scalar Keccak computation alongside first half of SIMD computation + load_input_scalar 4,0 + hybrid_round_initial + loop_0: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-3) + ble loop_0 + + hybrid_round_final + + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4,0 + + // Second scalar Keccak computation alongside second half of SIMD computation + load_input_scalar 4,1 + hybrid_round_initial + loop_1: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-3) + ble loop_1 + + hybrid_round_final + + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4,1 + store_input_vector 2,1 + + restore_vregs + restore_gprs + free_stack + + + ret diff --git 
a/tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v5p.s b/tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v5p.s new file mode 100644 index 0000000..960f781 --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v5p.s @@ -0,0 +1,1337 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +round_constants_vec: + .quad 0x0000000000000001 + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 
0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + .quad 0x8000000080008008 +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x29 + count .req w27 + out_count .req w27 + cur_const .req x26 + + /* Mapping of Keccak-f1600 SIMD state to vector registers + * at the beginning and end of each round. */ + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. */ + vAba .req v0 + vAbe .req v1 + vAbi .req v2 + vAbo .req v3 + vAbu .req v4 + vAga .req v5 + vAge .req v6 + vAgi .req v7 + vAgo .req v8 + vAgu .req v9 + vAka .req v10 + vAke .req v11 + vAki .req v12 + vAko .req v13 + vAku .req v14 + vAma .req v15 + vAme .req v16 + vAmi .req v17 + vAmo .req v18 + vAmu .req v19 + vAsa .req v20 + vAse .req v21 + vAsi .req v22 + vAso .req v23 + vAsu .req v24 + + /* q-form of the above mapping */ + vAbaq .req q0 + vAbeq .req q1 + vAbiq .req q2 + vAboq .req q3 + vAbuq .req q4 + vAgaq .req q5 + vAgeq .req q6 + vAgiq .req q7 + vAgoq .req q8 + vAguq .req q9 + vAkaq .req q10 + vAkeq .req q11 + vAkiq .req q12 + vAkoq .req q13 + vAkuq .req q14 + vAmaq .req q15 + vAmeq .req q16 + vAmiq .req q17 + vAmoq .req q18 + vAmuq .req q19 + vAsaq .req q20 + vAseq .req q21 + vAsiq .req q22 + vAsoq .req q23 + vAsuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v27 + C1 .req v28 + C2 .req v29 + C3 .req v30 + C4 .req v31 + + C0q .req q27 + C1q .req q28 + C2q .req q29 + C3q .req q30 + C4q .req q31 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vBba .req v25 // fresh + vBbe .req v26 // fresh + vBbi .req vAbi + vBbo .req vAbo + vBbu .req vAbu + vBga .req vAka + vBge .req vAke + vBgi .req vAgi + vBgo .req vAgo + vBgu .req vAgu + vBka .req vAma + vBke .req vAme + vBki .req vAki + vBko .req vAko + vBku .req vAku + vBma .req vAsa + vBme .req vAse + vBmi .req vAmi + vBmo 
.req vAmo + vBmu .req vAmu + vBsa .req vAba + vBse .req vAbe + vBsi .req vAsi + vBso .req vAso + vBsu .req vAsu + + vBbaq .req q25 // fresh + vBbeq .req q26 // fresh + vBbiq .req vAbiq + vBboq .req vAboq + vBbuq .req vAbuq + vBgaq .req vAkaq + vBgeq .req vAkeq + vBgiq .req vAgiq + vBgoq .req vAgoq + vBguq .req vAguq + vBkaq .req vAmaq + vBkeq .req vAmeq + vBkiq .req vAkiq + vBkoq .req vAkoq + vBkuq .req vAkuq + vBmaq .req vAsaq + vBmeq .req vAseq + vBmiq .req vAmiq + vBmoq .req vAmoq + vBmuq .req vAmuq + vBsaq .req vAbaq + vBseq .req vAbeq + vBsiq .req vAsiq + vBsoq .req vAsoq + vBsuq .req vAsuq + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req C4 + E1 .req C0 + E2 .req vBbe // fresh + E3 .req C2 + E4 .req C3 + + E0q .req C4q + E1q .req C0q + E2q .req vBbeq // fresh + E3q .req C2q + E4q .req C3q + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round. */ + s_Aba .req x1 + sAbe .req x6 + sAbi .req x11 + sAbo .req x16 + sAbu .req x21 + sAga .req x2 + sAge .req x7 + sAgi .req x12 + sAgo .req x17 + sAgu .req x22 + sAka .req x3 + sAke .req x8 + sAki .req x13 + sAko .req x18 + sAku .req x23 + sAma .req x4 + sAme .req x9 + sAmi .req x14 + sAmo .req x19 + sAmu .req x24 + sAsa .req x5 + sAse .req x10 + sAsi .req x15 + sAso .req x20 + sAsu .req x25 + + /* sA_[y,2*x+3*y] = rot(A[x,y]) */ + s_Aba_ .req x0 + sAbe_ .req x28 + sAbi_ .req x11 + sAbo_ .req x16 + sAbu_ .req x21 + sAga_ .req x3 + sAge_ .req x8 + sAgi_ .req x12 + sAgo_ .req x17 + sAgu_ .req x22 + sAka_ .req x4 + sAke_ .req x9 + sAki_ .req x13 + sAko_ .req x18 + sAku_ .req x23 + sAma_ .req x5 + sAme_ .req x10 + sAmi_ .req x14 + sAmo_ .req x19 + sAmu_ .req x24 + sAsa_ .req x1 + sAse_ .req x6 + sAsi_ .req x15 + sAso_ .req x20 + sAsu_ .req x25 + + /* sC[x] = sA[x,0] xor sA[x,1] xor sA[x,2] xor sA[x,3] xor sA[x,4], for x in 0..4 */ + /* sE[x] = sC[x-1] xor rot(sC[x+1],1), for x in 0..4 */ + sC0 .req x0 + sE0 .req x29 + sC1 .req x26 + sE1 .req x30 + sC2 .req x27 + 
sE2 .req x26 + sC3 .req x28 + sE3 .req x27 + sC4 .req x29 + sE4 .req x28 + + tmp .req x30 + +/************************ MACROS ****************************/ + +/* Macros emulating the v8.4-A SHA-3 instructions (EOR3, RAX1, XAR, BCAX) with plain Neon instructions. rax1_m1 and bcax_m1 clobber vvtmp; xar_m1 and xar_m1_0 overwrite their s0 operand with s0 ^ s1. */ + +.macro eor3_m1_0 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m1_1 d s0 s1 s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm + + +.macro eor3_m1 d s0 s1 s2 + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +.macro rax1_m1 d s0 s1 + // Use add instead of SHL #1 + add vvtmp.2d, \s1\().2d, \s1\().2d + sri vvtmp.2d, \s1\().2d, #63 + eor \d\().16b, vvtmp.16b, \s0\().16b +.endm + + .macro xar_m1 d s0 s1 imm + // Special cases where we can replace SHLs by ADDs + .if \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + .elseif \imm == 62 + eor \s0\().16b, \s0\().16b, \s1\().16b + add \d\().2d, \s0\().2d, \s0\().2d + add \d\().2d, \d\().2d, \d\().2d + sri \d\().2d, \s0\().2d, #(62) + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + + .macro xar_m1_0 d s0 s1 imm + // XOR half of xar_m1; all three cases emit the same EOR + .if \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + .elseif \imm == 62 + eor \s0\().16b, \s0\().16b, \s1\().16b + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + .endif +.endm + + .macro xar_m1_1 d s0 s1 imm + // Rotate half of xar_m1; special cases where we can replace SHLs by ADDs + .if \imm == 63 + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + .elseif \imm == 62 + add \d\().2d, \s0\().2d, \s0\().2d + add \d\().2d, \d\().2d, \d\().2d + sri \d\().2d, \s0\().2d, #(62) + .else + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + +.macro bcax_m1 d s0 s1 s2 + bic vvtmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, vvtmp.16b, \s0\().16b +.endm + 
+.macro load_input_vector + ldr vAbaq, [input_addr, #(32*0)] + ldr vAbeq, [input_addr, #(32*0+32)] + ldr vAbiq, [input_addr, #(32*2)] + ldr vAboq, [input_addr, #(32*2+32)] + ldr vAbuq, [input_addr, #(32*4)] + ldr vAgaq, [input_addr, #(32*4+32)] + ldr vAgeq, [input_addr, #(32*6)] + ldr vAgiq, [input_addr, #(32*6+32)] + ldr vAgoq, [input_addr, #(32*8)] + ldr vAguq, [input_addr, #(32*8+32)] + ldr vAkaq, [input_addr, #(32*10)] + ldr vAkeq, [input_addr, #(32*10+32)] + ldr vAkiq, [input_addr, #(32*12)] + ldr vAkoq, [input_addr, #(32*12+32)] + ldr vAkuq, [input_addr, #(32*14)] + ldr vAmaq, [input_addr, #(32*14+32)] + ldr vAmeq, [input_addr, #(32*16)] + ldr vAmiq, [input_addr, #(32*16+32)] + ldr vAmoq, [input_addr, #(32*18)] + ldr vAmuq, [input_addr, #(32*18+32)] + ldr vAsaq, [input_addr, #(32*20)] + ldr vAseq, [input_addr, #(32*20+32)] + ldr vAsiq, [input_addr, #(32*22)] + ldr vAsoq, [input_addr, #(32*22+32)] + ldr vAsuq, [input_addr, #(32*24)] +.endm + +.macro store_input_vector + str vAbaq, [input_addr, #(32*0)] + str vAbeq, [input_addr, #(32*0+32)] + str vAbiq, [input_addr, #(32*2)] + str vAboq, [input_addr, #(32*2+32)] + str vAbuq, [input_addr, #(32*4)] + str vAgaq, [input_addr, #(32*4+32)] + str vAgeq, [input_addr, #(32*6)] + str vAgiq, [input_addr, #(32*6+32)] + str vAgoq, [input_addr, #(32*8)] + str vAguq, [input_addr, #(32*8+32)] + str vAkaq, [input_addr, #(32*10)] + str vAkeq, [input_addr, #(32*10+32)] + str vAkiq, [input_addr, #(32*12)] + str vAkoq, [input_addr, #(32*12+32)] + str vAkuq, [input_addr, #(32*14)] + str vAmaq, [input_addr, #(32*14+32)] + str vAmeq, [input_addr, #(32*16)] + str vAmiq, [input_addr, #(32*16+32)] + str vAmoq, [input_addr, #(32*18)] + str vAmuq, [input_addr, #(32*18+32)] + str vAsaq, [input_addr, #(32*20)] + str vAseq, [input_addr, #(32*20+32)] + str vAsiq, [input_addr, #(32*22)] + str vAsoq, [input_addr, #(32*22+32)] + str vAsuq, [input_addr, #(32*24)] +.endm + +.macro store_input_scalar + str s_Aba,[input_addr, 32*0 ] + str sAbe, 
[input_addr, 32*1 ] + str sAbi, [input_addr, 32*2 ] + str sAbo, [input_addr, 32*3 ] + str sAbu, [input_addr, 32*4 ] + str sAga, [input_addr, 32*5 ] + str sAge, [input_addr, 32*6 ] + str sAgi, [input_addr, 32*7 ] + str sAgo, [input_addr, 32*8 ] + str sAgu, [input_addr, 32*9 ] + str sAka, [input_addr, 32*10] + str sAke, [input_addr, 32*11] + str sAki, [input_addr, 32*12] + str sAko, [input_addr, 32*13] + str sAku, [input_addr, 32*14] + str sAma, [input_addr, 32*15] + str sAme, [input_addr, 32*16] + str sAmi, [input_addr, 32*17] + str sAmo, [input_addr, 32*18] + str sAmu, [input_addr, 32*19] + str sAsa, [input_addr, 32*20] + str sAse, [input_addr, 32*21] + str sAsi, [input_addr, 32*22] + str sAso, [input_addr, 32*23] + str sAsu, [input_addr, 32*24] +.endm + +.macro load_input_scalar + ldr s_Aba,[input_addr, 32*0 ] + ldr sAbe, [input_addr, 32*1 ] + ldr sAbi, [input_addr, 32*2 ] + ldr sAbo, [input_addr, 32*3 ] + ldr sAbu, [input_addr, 32*4 ] + ldr sAga, [input_addr, 32*5 ] + ldr sAge, [input_addr, 32*6 ] + ldr sAgi, [input_addr, 32*7 ] + ldr sAgo, [input_addr, 32*8 ] + ldr sAgu, [input_addr, 32*9 ] + ldr sAka, [input_addr, 32*10] + ldr sAke, [input_addr, 32*11] + ldr sAki, [input_addr, 32*12] + ldr sAko, [input_addr, 32*13] + ldr sAku, [input_addr, 32*14] + ldr sAma, [input_addr, 32*15] + ldr sAme, [input_addr, 32*16] + ldr sAmi, [input_addr, 32*17] + ldr sAmo, [input_addr, 32*18] + ldr sAmu, [input_addr, 32*19] + ldr sAsa, [input_addr, 32*20] + ldr sAse, [input_addr, 32*21] + ldr sAsi, [input_addr, 32*22] + ldr sAso, [input_addr, 32*23] + ldr sAsu, [input_addr, 32*24] +.endm + +#define STACK_SIZE (4*16 + 12*8 + 6*8 + 3*16) +#define STACK_BASE_VREGS (0) +#define STACK_BASE_GPRS (4*16) +#define STACK_BASE_TMP_GPRS (4*16 + 12*8) +#define STACK_BASE_TMP_VREGS (4*16 + 12*8 + 6*8) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) +#define STACK_OFFSET_COUNT_OUT (3*8) +#define STACK_OFFSET_CUR_INPUT (4*8) + +#define 
vAgi_offset 0 +#define vAga_offset 1 +#define vAge_offset 2 + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP_VREGS + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP_VREGS + 16 * name ## _offset)] + +#define restore_as(reg,name) \ + ldr reg, [sp, #(STACK_BASE_TMP_VREGS + 16 * name ## _offset)] + +.macro save reg, offset + str \reg, [sp, #(STACK_BASE_TMP_GPRS + \offset)] +.endm + +.macro restore reg, offset + ldr \reg, [sp, #(STACK_BASE_TMP_GPRS + \offset)] +.endm + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro save_vregs + stp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] + stp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + stp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + stp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] +.endm + +.macro restore_vregs + ldp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] + ldp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + ldp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + ldp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] +.endm + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro eor5 dst, src0, src1, src2, src3, src4 + eor \dst, \src0, \src1 + eor \dst, \dst, \src2 + eor \dst, \dst, \src3 + eor \dst, \dst, \src4 +.endm + +.macro xor_rol dst, src1, src0, imm + eor \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro bic_rol dst, src1, src0, imm + bic \dst, \src0, \src1, ROR 
#(64-\imm) +.endm + +.macro rotate dst, src, imm + ror \dst, \src, #(64-\imm) +.endm + +.macro hybrid_round_initial +eor sC0, sAma, sAsa SEP eor3_m1_0 C1,vAbe,vAge,vAke +eor sC1, sAme, sAse SEP +eor sC2, sAmi, sAsi SEP eor3_m1_0 C3,vAbo,vAgo,vAko +eor sC3, sAmo, sAso SEP +eor sC4, sAmu, sAsu SEP eor3_m1_0 C0,vAba,vAga,vAka +eor sC0, sAka, sC0 SEP +eor sC1, sAke, sC1 SEP eor3_m1_0 C2,vAbi,vAgi,vAki +eor sC2, sAki, sC2 SEP +eor sC3, sAko, sC3 SEP eor3_m1_0 C4,vAbu,vAgu,vAku +eor sC4, sAku, sC4 SEP +eor sC0, sAga, sC0 SEP eor3_m1_1 C1,vAbe,vAge,vAke +eor sC1, sAge, sC1 SEP eor3_m1_1 C3,vAbo,vAgo,vAko +eor sC2, sAgi, sC2 SEP +eor sC3, sAgo, sC3 SEP eor3_m1_1 C0,vAba,vAga,vAka +eor sC4, sAgu, sC4 SEP +eor sC0, s_Aba, sC0 SEP eor3_m1_1 C2,vAbi,vAgi,vAki +eor sC1, sAbe, sC1 SEP +eor sC2, sAbi, sC2 SEP eor3_m1_1 C4,vAbu,vAgu,vAku +eor sC3, sAbo, sC3 SEP +eor sC4, sAbu, sC4 SEP eor3_m1_0 C1, C1,vAme, vAse +eor sE1, sC0, sC2, ROR #63 SEP eor3_m1_0 C3, C3,vAmo, vAso +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP eor3_m1_0 C0, C0,vAma, vAsa +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP eor3_m1_0 C2, C2,vAmi, vAsi +eor s_Aba_, s_Aba, sE0 SEP +eor sAsa_, sAbi, sE2 SEP eor3_m1_0 C4, C4,vAmu, vAsu +eor sAbi_, sAki, sE2 SEP +eor sAki_, sAko, sE3 SEP eor3_m1_1 C1, C1,vAme, vAse +eor sAko_, sAmu, sE4 SEP eor3_m1_1 C3, C3,vAmo, vAso +eor sAmu_, sAso, sE3 SEP +eor sAso_, sAma, sE0 SEP eor3_m1_1 C0, C0,vAma, vAsa +eor sAka_, sAbe, sE1 SEP +eor sAse_, sAgo, sE3 SEP eor3_m1_1 C2, C2,vAmi, vAsi +eor sAgo_, sAme, sE1 SEP +eor sAke_, sAgi, sE2 SEP eor3_m1_1 C4, C4,vAmu, vAsu +eor sAgi_, sAka, sE0 SEP +eor sAga_, sAbo, sE3 SEP vvtmp .req vBba +eor sAbo_, sAmo, sE3 SEP rax1_m1 E2, C1, C3 +eor sAmo_, sAmi, sE2 SEP +eor sAmi_, sAke, sE1 SEP rax1_m1 E4, C3, C0 +eor sAge_, sAgu, sE4 SEP +eor sAgu_, sAsi, sE2 SEP rax1_m1 E1, C0, C2 +eor sAsi_, sAku, sE4 SEP +eor sAku_, sAsa, sE0 SEP rax1_m1 E3, C2, C4 +eor sAma_, sAbu, sE4 SEP +eor sAbu_, sAsu, sE4 SEP save(vAgi) 
+eor sAsu_, sAse, sE1 SEP rax1_m1 E0, C4, C1 +eor sAme_, sAga, sE0 SEP +eor sAbe_, sAge, sE1 SEP /* 25x XAR, 75 in total */ +load_constant_ptr SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP .unreq vvtmp +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP vvtmp .req C1 +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP vvtmpq .req C1q +eor sAgi, tmp, sAgi_, ROR #58 SEP xar_m1 vBgi, vAka, E0, 61 +bic tmp, sAga_, sAgu_, ROR #31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP xar_m1 vBga, vAbo, E3, 36 +bic tmp, sAge_, sAga_, ROR #56 SEP +eor sAgu, tmp, sAgu_, ROR #23 SEP save(vAga) +bic tmp, sAki_, sAke_, ROR #19 SEP +eor sAka, tmp, sAka_, ROR #24 SEP xar_m1 vBbo, vAmo, E3, 43 +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP xar_m1 vBmo, vAmi, E2, 49 +bic tmp, sAku_, sAko_, ROR #10 SEP save(vAge) +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP xar_m1 vBmi, vAke, E1, 54 +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP xar_m1 vBge, vAgu, E4, 44 +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP bcax_m1 vAga, vBga, vBgi, vBge +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP eor vBba.16b, vAba.16b, E0.16b +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP xar_m1 vBsa, vAbi, E2, 2 +eor sAmi, tmp, sAmi_, ROR #46 SEP xar_m1 vBbi, vAki, E2, 21 +ldr cur_const, [const_addr] SEP +mov count, #1 SEP xar_m1 vBki, vAko, E3, 39 +bic tmp, sAma_, sAmu_, ROR #35 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP xar_m1 vBko, vAmu, E4, 56 +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP xar_m1 vBmu, vAso, E3, 8 +bic tmp, sAsi_, sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP xar_m1 vBso, vAma, E0, 23 +bic tmp, sAso_, sAsi_, ROR #2 SEP xar_m1 vBka, vAbe, E1, 63 +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP xar_m1 vBse, vAgo, E3, 9 +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP 
xar_m1 vBgo, vAme, E1, 19 +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP bcax_m1 vAge, vBge, vBgo, vBgi +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP restore_as(vvtmpq, vAgi) +eor s_Aba, s_Aba_, tmp, ROR #21 SEP xar_m1 vBke, vvtmp, E2, 58 +bic tmp, sAbo_, sAbi_, ROR #42 SEP +eor sAbe, tmp, sAbe_, ROR #41 SEP xar_m1 vBgu, vAsi, E2, 3 +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP bcax_m1 vAgi, vBgi, vBgu, vBgo +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP xar_m1 vBsi, vAku, E4, 25 +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP xar_m1 vBku, vAsa, E0, 46 +eor s_Aba, s_Aba, cur_const SEP xar_m1 vBma, vAbu, E4, 37 +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP xar_m1 vBbu, vAsu, E4, 50 +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP xar_m1 vBsu, vAse, E1, 62 +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP ldp vvtmpq, E3q, [sp, #(STACK_BASE_TMP_VREGS + 16*vAga_offset)] +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP xar_m1 vBme, vvtmp, E0, 28 +eor sC2, sAki, sC2, ROR #26 SEP xar_m1 vBbe, E3, E1, 20 +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP /* 25x BCAX, 50 in total */ +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP bcax_m1 vAgo, vBgo, vBga, vBgu +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP bcax_m1 vAgu, vBgu, vBge, vBga +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP bcax_m1 vAka, vBka, vBki, vBke +eor sC1, sAke, sC1, ROR #57 SEP bcax_m1 vAke, vBke, vBko, vBki +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP .unreq vvtmp +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP .unreq vvtmpq +ror sC4, sC4, 58 SEP +ror sC2, sC2, 62 SEP eor2 C0, vAka, vAga +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP save(vAga) +eor sE0, sC4, sC1, ROR #63 SEP vvtmp .req vAga 
+eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP vvtmpq .req vAgaq +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP bcax_m1 vAki, vBki, vBku, vBko +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP bcax_m1 vAko, vBko, vBka, vBku +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP eor2 C1, vAke, vAge +eor sAso_, sE0, sAma, ROR #54 SEP bcax_m1 vAku, vBku, vBke, vBka +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP eor2 C2, vAki, vAgi +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP bcax_m1 vAma, vBma, vBmi, vBme +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP eor2 C3, vAko, vAgo +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP bcax_m1 vAme, vBme, vBmo, vBmi +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP eor2 C4, vAku, vAgu +eor sAgu_, sE2, sAsi, ROR #62 SEP bcax_m1 vAmi, vBmi, vBmu, vBmo +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP eor2 C0, C0, vAma +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP bcax_m1 vAmo, vBmo, vBma, vBmu +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP eor2 C1, C1, vAme +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP bcax_m1 vAmu, vBmu, vBme, vBma +restore count, STACK_OFFSET_COUNT SEP eor2 C2, C2, vAmi +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP bcax_m1 vAsa, vBsa, vBsi, vBse +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP eor2 C3, C3, vAmo +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP bcax_m1 vAse, vBse, vBso, vBsi +bic tmp, sAga_, sAgu_, ROR #31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP eor2 C4, C4, vAmu +bic tmp, sAge_, sAga_, ROR #56 SEP bcax_m1 vAsi, vBsi, vBsu, vBso +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP eor2 C0, C0, vAsa +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP bcax_m1 
vAso, vBso, vBsa, vBsu +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP eor2 C1, C1, vAse +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP bcax_m1 vAsu, vBsu, vBse, vBsa +eor sAko, tmp, sAko_, ROR #57 SEP eor2 C2, C2, vAsi +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP eor2 C3, C3, vAso +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP bcax_m1 vAba, vBba, vBbi, vBbe +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP bcax_m1 vAbe, vBbe, vBbo, vBbi +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP eor2 C1, C1, vAbe +bic tmp, sAma_, sAmu_, ROR #35 SEP restore x26, STACK_OFFSET_CONST +eor sAmo, tmp, sAmo_, ROR #12 SEP ldr vvtmpq, [x26], #16 +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP save x26, STACK_OFFSET_CONST +bic tmp, sAsi_, sAse_, ROR #48 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP eor vAba.16b, vAba.16b, vvtmp.16b +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP eor2 C4, C4, vAsu +bic tmp, sAsu_, sAso_, ROR #25 SEP bcax_m1 vAbi, vBbi, vBbu, vBbo +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP bcax_m1 vAbo, vBbo, vBba, vBbu +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP eor2 C3, C3, vAbo +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP eor2 C2, C2, vAbi +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP eor2 C0, C0, vAba +eor sAbe, tmp, sAbe_, ROR #41 SEP bcax_m1 vAbu, vBbu, vBbe, vBba +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP eor2 C4, C4, vAbu +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP restore(vAga) +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP .unreq vvtmp +add count, count, #1 SEP +eor s_Aba, s_Aba, cur_const SEP .unreq vvtmpq +.endm + + +.macro hybrid_round_noninitial +save 
count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP vvtmp .req vBba +eor sC1, sAse, sAge, ROR #60 SEP rax1_m1 E2, C1, C3 +eor sC2, sAmi, sAgi, ROR #59 SEP rax1_m1 E4, C3, C0 +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP rax1_m1 E1, C0, C2 +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP rax1_m1 E3, C2, C4 +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP save(vAgi) +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP rax1_m1 E0, C4, C1 +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP .unreq vvtmp +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP vvtmp .req C1 +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP vvtmpq .req C1q +ror sC4, sC4, 58 SEP +ror sC2, sC2, 62 SEP xar_m1 vBgi, vAka, E0, 61 +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP xar_m1 vBga, vAbo, E3, 36 +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP save(vAga) +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP xar_m1 vBbo, vAmo, E3, 43 +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP xar_m1 vBmo, vAmi, E2, 49 +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP save(vAge) +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP xar_m1 vBmi, vAke, E1, 54 +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP xar_m1 vBge, vAgu, E4, 44 +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP bcax_m1 vAga, vBga, vBgi, vBge +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP eor vBba.16b, vAba.16b, E0.16b +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP xar_m1 vBsa, vAbi, E2, 2 +eor sAsi_, sE4, sAku, ROR #58 SEP 
+eor sAku_, sE0, sAsa, ROR #25 SEP xar_m1 vBbi, vAki, E2, 21 +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP xar_m1 vBki, vAko, E3, 39 +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP xar_m1 vBko, vAmu, E4, 56 +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP xar_m1 vBmu, vAso, E3, 8 +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP xar_m1 vBso, vAma, E0, 23 +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP xar_m1 vBka, vAbe, E1, 63 +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP xar_m1 vBse, vAgo, E3, 9 +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP xar_m1 vBgo, vAme, E1, 19 +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP bcax_m1 vAge, vBge, vBgo, vBgi +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP restore_as(vvtmpq, vAgi) +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP xar_m1 vBke, vvtmp, E2, 58 +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP xar_m1 vBgu, vAsi, E2, 3 +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP bcax_m1 vAgi, vBgi, vBgu, vBgo +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP xar_m1 vBsi, vAku, E4, 25 +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP xar_m1 vBku, vAsa, E0, 46 +eor sAmi, tmp, sAmi_, ROR #46 SEP +bic tmp, sAma_, sAmu_, ROR #35 SEP xar_m1 vBma, vAbu, E4, 37 +ldr cur_const, [const_addr, count, UXTW #3] SEP +add count, count, #1 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP xar_m1 vBbu, vAsu, E4, 50 +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP xar_m1 vBsu, vAse, E1, 62 +bic tmp, sAsi_, sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP ldp vvtmpq, E3q, [sp, #(STACK_BASE_TMP_VREGS + 
16*vAga_offset)] +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP xar_m1 vBme, vvtmp, E0, 28 +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP xar_m1 vBbe, E3, E1, 20 +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP bcax_m1 vAgo, vBgo, vBga, vBgu +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m1 vAgu, vBgu, vBge, vBga +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP bcax_m1 vAka, vBka, vBki, vBke +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP bcax_m1 vAke, vBke, vBko, vBki +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP .unreq vvtmp +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP .unreq vvtmpq +eor s_Aba, s_Aba, cur_const SEP +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP eor2 C0, vAka, vAga +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP save(vAga) +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP vvtmp .req vAga +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP vvtmpq .req vAgaq +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP bcax_m1 vAki, vBki, vBku, vBko +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP bcax_m1 vAko, vBko, vBka, vBku +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP eor2 C1, vAke, vAge +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP bcax_m1 vAku, vBku, vBke, vBka +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP eor2 C2, vAki, vAgi +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP bcax_m1 vAma, vBma, vBmi, vBme +ror sC4, sC4, 58 SEP +ror sC2, sC2, 62 SEP eor2 C3, vAko, vAgo +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP bcax_m1 vAme, vBme, 
vBmo, vBmi +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP eor2 C4, vAku, vAgu +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP bcax_m1 vAmi, vBmi, vBmu, vBmo +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP eor2 C0, C0, vAma +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP bcax_m1 vAmo, vBmo, vBma, vBmu +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP eor2 C1, C1, vAme +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP bcax_m1 vAmu, vBmu, vBme, vBma +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP eor2 C2, C2, vAmi +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP bcax_m1 vAsa, vBsa, vBsi, vBse +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP eor2 C3, C3, vAmo +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP bcax_m1 vAse, vBse, vBso, vBsi +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP eor2 C4, C4, vAmu +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP bcax_m1 vAsi, vBsi, vBsu, vBso +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP eor2 C0, C0, vAsa +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP bcax_m1 vAso, vBso, vBsa, vBsu +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP eor2 C1, C1, vAse +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP bcax_m1 vAsu, vBsu, vBse, vBsa +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP eor2 C2, C2, vAsi +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP eor2 C3, C3, vAso +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP bcax_m1 vAba, vBba, vBbi, vBbe +bic tmp, sAku_, 
sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP bcax_m1 vAbe, vBbe, vBbo, vBbi +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP eor2 C1, C1, vAbe +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP restore x26, STACK_OFFSET_CONST +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP ldr vvtmpq, [x26], #16 +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP save x26, STACK_OFFSET_CONST +eor sAmi, tmp, sAmi_, ROR #46 SEP +bic tmp, sAma_, sAmu_, ROR #35 SEP eor vAba.16b, vAba.16b, vvtmp.16b +ldr cur_const, [const_addr, count, UXTW #3] SEP +add count, count, #1 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP eor2 C4, C4, vAsu +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP bcax_m1 vAbi, vBbi, vBbu, vBbo +bic tmp, sAsi_, sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP bcax_m1 vAbo, vBbo, vBba, vBbu +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP eor2 C3, C3, vAbo +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP eor2 C2, C2, vAbi +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP eor2 C0, C0, vAba +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m1 vAbu, vBbu, vBbe, vBba +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP eor2 C4, C4, vAbu +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP restore(vAga) +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP .unreq vvtmp +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP .unreq vvtmpq +eor s_Aba, s_Aba, cur_const SEP +.endm +.macro hybrid_round_final + SEP vvtmp .req vBba +save count, STACK_OFFSET_COUNT SEP rax1_m1 E2, C1, C3 +eor sC0, sAka, sAsa, ROR #50 SEP +eor sC1, sAse, sAge, ROR #60 SEP rax1_m1 E4, C3, C0 +eor sC2, sAmi, sAgi, ROR #59 SEP +eor 
sC3, sAgo, sAso, ROR #30 SEP rax1_m1 E1, C0, C2 +eor sC4, sAbu, sAsu, ROR #53 SEP +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP rax1_m1 E3, C2, C4 +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP save(vAgi) +eor sC3, sAbo, sC3, ROR #63 SEP +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP rax1_m1 E0, C4, C1 +ror sC4, sC4, 58 SEP +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP .unreq vvtmp +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP vvtmp .req C1 +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP vvtmpq .req C1q +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP xar_m1 vBgi, vAka, E0, 61 +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP xar_m1 vBga, vAbo, E3, 36 +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP +eor sAbo_, sE3, sAmo, ROR #37 SEP save(vAga) +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP xar_m1 vBbo, vAmo, E3, 43 +eor sAgu_, sE2, sAsi, ROR #62 SEP +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP xar_m1 vBmo, vAmi, E2, 49 +eor sAbu_, sE4, sAsu, ROR #9 SEP +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP save(vAge) +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP xar_m1 vBmi, vAke, E1, 
54 +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP xar_m1 vBge, vAgu, E4, 44 +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP bcax_m1 vAga, vBga, vBgi, vBge +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP eor vBba.16b, vAba.16b, E0.16b +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP xar_m1 vBsa, vAbi, E2, 2 +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP xar_m1 vBbi, vAki, E2, 21 +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP xar_m1 vBki, vAko, E3, 39 +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP xar_m1 vBko, vAmu, E4, 56 +eor sAmi, tmp, sAmi_, ROR #46 SEP +bic tmp, sAma_, sAmu_, ROR #35 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP xar_m1 vBmu, vAso, E3, 8 +add count, count, #1 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP xar_m1 vBso, vAma, E0, 23 +bic tmp, sAsi_, sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP xar_m1 vBka, vAbe, E1, 63 +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP xar_m1 vBse, vAgo, E3, 9 +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP xar_m1 vBgo, vAme, E1, 19 +bic tmp, sAbi_, sAbe_, ROR #63 SEP +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP bcax_m1 vAge, vBge, vBgo, vBgi +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, 
tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP restore_as(vvtmpq, vAgi) +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP xar_m1 vBke, vvtmp, E2, 58 +eor s_Aba, s_Aba, cur_const SEP +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP +eor sC1, sAse, sAge, ROR #60 SEP xar_m1 vBgu, vAsi, E2, 3 +eor sC2, sAmi, sAgi, ROR #59 SEP +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP bcax_m1 vAgi, vBgi, vBgu, vBgo +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 SEP xar_m1 vBsi, vAku, E4, 25 +eor sC4, sAmu, sC4, ROR #56 SEP +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP xar_m1 vBku, vAsa, E0, 46 +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP xar_m1 vBma, vAbu, E4, 37 +eor sC0, s_Aba, sC0, ROR #61 SEP +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP xar_m1 vBbu, vAsu, E4, 50 +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP xar_m1 vBsu, vAse, E1, 62 +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP ldp vvtmpq, E3q, [sp, #(STACK_BASE_TMP_VREGS + 16*vAga_offset)] +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP xar_m1 vBme, vvtmp, E0, 28 +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP xar_m1 vBbe, E3, E1, 20 +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP bcax_m1 vAgo, vBgo, vBga, vBgu +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP bcax_m1 vAgu, vBgu, vBge, vBga +eor sAga_, sE3, sAbo SEP +eor sAbo_, sE3, 
sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP bcax_m1 vAka, vBka, vBki, vBke +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP bcax_m1 vAke, vBke, vBko, vBki +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP bcax_m1 vAki, vBki, vBku, vBko +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP bcax_m1 vAko, vBko, vBka, vBku +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP bcax_m1 vAku, vBku, vBke, vBka +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP bcax_m1 vAma, vBma, vBmi, vBme +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP bcax_m1 vAme, vBme, vBmo, vBmi +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP +eor sAka, tmp, sAka_, ROR #24 SEP bcax_m1 vAmi, vBmi, vBmu, vBmo +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP bcax_m1 vAmo, vBmo, vBma, vBmu +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP bcax_m1 vAmu, vBmu, vBme, vBma +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP bcax_m1 vAsa, vBsa, vBsi, vBse +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP bcax_m1 vAse, vBse, vBso, vBsi +bic tmp, sAma_, sAmu_, ROR #35 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP +add count, count, #1 SEP bcax_m1 vAsi, vBsi, vBsu, vBso +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 
SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP bcax_m1 vAso, vBso, vBsa, vBsu +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP bcax_m1 vAsu, vBsu, vBse, vBsa +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP bcax_m1 vAba, vBba, vBbi, vBbe +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m1 vAbe, vBbe, vBbo, vBbi +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP bcax_m1 vAbi, vBbi, vBbu, vBbo +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP bcax_m1 vAbo, vBbo, vBba, vBbu +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP +eor s_Aba, s_Aba, cur_const SEP bcax_m1 vAbu, vBbu, vBbe, vBba +ror sAga, sAga,(64-3) SEP +ror sAka, sAka,(64-25) SEP +ror sAma, sAma,(64-10) SEP +ror sAsa, sAsa,(64-39) SEP restore x26, STACK_OFFSET_CONST +ror sAbe, sAbe,(64-21) SEP +ror sAge, sAge,(64-45) SEP +ror sAke, sAke,(64-8) SEP ldr vvtmpq, [x26], #16 +ror sAme, sAme,(64-15) SEP +ror sAse, sAse,(64-41) SEP +ror sAbi, sAbi,(64-14) SEP +ror sAgi, sAgi,(64-61) SEP save x26, STACK_OFFSET_CONST +ror sAki, sAki,(64-18) SEP +ror sAmi, sAmi,(64-56) SEP +ror sAsi, sAsi,(64-2) SEP eor vAba.16b, vAba.16b, vvtmp.16b +ror sAgo, sAgo,(64-28) SEP +ror sAko, sAko,(64-1) SEP +ror sAmo, sAmo,(64-27) SEP +ror sAso, sAso,(64-62) SEP .unreq vvtmp +ror sAbu, sAbu,(64-44) SEP +ror sAgu, sAgu,(64-20) SEP +ror sAku, sAku,(64-6) SEP .unreq vvtmpq +ror sAmu, sAmu,(64-36) SEP +ror sAsu, sAsu,(64-55) SEP +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.global keccak_f1600_x4_hybrid_asm_v5p +.global _keccak_f1600_x4_hybrid_asm_v5p +.text +.align 4 + +keccak_f1600_x4_hybrid_asm_v5p: +_keccak_f1600_x4_hybrid_asm_v5p: + alloc_stack + save_gprs + save_vregs 
+ save input_addr, STACK_OFFSET_INPUT /* kept so store_input_vector can reload the base pointer at the end */ + + ASM_LOAD(const_addr,round_constants_vec) + save const_addr, STACK_OFFSET_CONST /* vector round-constant cursor; the round macros reload it, post-increment by 16 and save it back */ + + load_input_vector + + add input_addr, input_addr, #16 + + mov out_count, #0 /* outer_loop pass counter */ +outer_loop: + save out_count, STACK_OFFSET_COUNT_OUT + + load_input_scalar + save input_addr, STACK_OFFSET_CUR_INPUT + + hybrid_round_initial +1: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-3) /* loop back while count < KECCAK_F1600_ROUNDS-3 */ + blt 1b + hybrid_round_final + + restore input_addr, STACK_OFFSET_CUR_INPUT + store_input_scalar + add input_addr, input_addr, #8 /* next pass works on the scalar state 8 bytes further on */ + + restore out_count, STACK_OFFSET_COUNT_OUT + add out_count, out_count, #1 + cmp out_count, #2 /* outer_loop body runs twice in total */ + blt outer_loop + + restore input_addr, STACK_OFFSET_INPUT + store_input_vector + + restore_vregs + restore_gprs + free_stack + ret diff --git a/tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v6.s b/tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v6.s new file mode 100644 index 0000000..183fa2c --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v6.s @@ -0,0 +1,1385 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" +#if defined(__ARM_FEATURE_SHA3) + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +round_constants_vec: + .quad 0x0000000000000001 + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008089 + .quad 
0x8000000000008003 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + .quad 0x8000000080008008 +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 /* x0 is also aliased by s_Aba_ and sC0 below */ + const_addr .req x29 /* x29 is also aliased by sE0 and sC4 below */ + count .req w27 /* low half of x27, which sC2 and sE3 also alias */ + cur_const .req x26 /* x26 is also aliased by sC1 and sE2 below */ + + /* Mapping of Keccak-f1600 SIMD state to vector registers + * at the beginning and end of each round. */ + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. */ + vAba .req v0 + vAbe .req v1 + vAbi .req v2 + vAbo .req v3 + vAbu .req v4 + vAga .req v5 + vAge .req v6 + vAgi .req v7 + vAgo .req v8 + vAgu .req v9 + vAka .req v10 + vAke .req v11 + vAki .req v12 + vAko .req v13 + vAku .req v14 + vAma .req v15 + vAme .req v16 + vAmi .req v17 + vAmo .req v18 + vAmu .req v19 + vAsa .req v20 + vAse .req v21 + vAsi .req v22 + vAso .req v23 + vAsu .req v24 + + /* q-form of the above mapping */ + vAbaq .req q0 + vAbeq .req q1 + vAbiq .req q2 + vAboq .req q3 + vAbuq .req q4 + vAgaq .req q5 + vAgeq .req q6 + vAgiq .req q7 + vAgoq .req q8 + vAguq .req q9 + vAkaq .req q10 + vAkeq .req q11 + vAkiq .req q12 + vAkoq .req q13 + vAkuq .req q14 + vAmaq .req q15 + vAmeq .req q16 + vAmiq .req q17 + vAmoq .req q18 + vAmuq .req q19 + vAsaq .req q20 + vAseq .req q21 + vAsiq .req q22 + vAsoq .req q23 + vAsuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v27 + C1 .req v28 + C2 .req v29 + C3 .req v30 + C4 .req v31 + + C0q .req q27 + C1q .req q28 + C2q .req q29 + C3q .req q30 + C4q .req q31 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vBba .req v25 // fresh + vBbe .req v26 // fresh +
vBbi .req vAbi + vBbo .req vAbo + vBbu .req vAbu + vBga .req vAka + vBge .req vAke + vBgi .req vAgi + vBgo .req vAgo + vBgu .req vAgu + vBka .req vAma + vBke .req vAme + vBki .req vAki + vBko .req vAko + vBku .req vAku + vBma .req vAsa + vBme .req vAse + vBmi .req vAmi + vBmo .req vAmo + vBmu .req vAmu + vBsa .req vAba + vBse .req vAbe + vBsi .req vAsi + vBso .req vAso + vBsu .req vAsu + + vBbaq .req q25 // fresh + vBbeq .req q26 // fresh + vBbiq .req vAbiq + vBboq .req vAboq + vBbuq .req vAbuq + vBgaq .req vAkaq + vBgeq .req vAkeq + vBgiq .req vAgiq + vBgoq .req vAgoq + vBguq .req vAguq + vBkaq .req vAmaq + vBkeq .req vAmeq + vBkiq .req vAkiq + vBkoq .req vAkoq + vBkuq .req vAkuq + vBmaq .req vAsaq + vBmeq .req vAseq + vBmiq .req vAmiq + vBmoq .req vAmoq + vBmuq .req vAmuq + vBsaq .req vAbaq + vBseq .req vAbeq + vBsiq .req vAsiq + vBsoq .req vAsoq + vBsuq .req vAsuq + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req C4 + E1 .req C0 + E2 .req vBbe // fresh (same register as vBbe, v26) + E3 .req C2 + E4 .req C3 + + E0q .req C4q + E1q .req C0q + E2q .req vBbeq // fresh (same register as vBbeq, q26) + E3q .req C2q + E4q .req C3q + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round.
*/ + s_Aba .req x1 + sAbe .req x6 + sAbi .req x11 + sAbo .req x16 + sAbu .req x21 + sAga .req x2 + sAge .req x7 + sAgi .req x12 + sAgo .req x17 + sAgu .req x22 + sAka .req x3 + sAke .req x8 + sAki .req x13 + sAko .req x18 + sAku .req x23 + sAma .req x4 + sAme .req x9 + sAmi .req x14 + sAmo .req x19 + sAmu .req x24 + sAsa .req x5 + sAse .req x10 + sAsi .req x15 + sAso .req x20 + sAsu .req x25 + + /* sA_[y,2*x+3*y] = rot(A[x,y]) */ + s_Aba_ .req x0 + sAbe_ .req x28 + sAbi_ .req x11 + sAbo_ .req x16 + sAbu_ .req x21 + sAga_ .req x3 + sAge_ .req x8 + sAgi_ .req x12 + sAgo_ .req x17 + sAgu_ .req x22 + sAka_ .req x4 + sAke_ .req x9 + sAki_ .req x13 + sAko_ .req x18 + sAku_ .req x23 + sAma_ .req x5 + sAme_ .req x10 + sAmi_ .req x14 + sAmo_ .req x19 + sAmu_ .req x24 + sAsa_ .req x1 + sAse_ .req x6 + sAsi_ .req x15 + sAso_ .req x20 + sAsu_ .req x25 + + /* sC[x] = sA[x,0] xor sA[x,1] xor sA[x,2] xor sA[x,3] xor sA[x,4], for x in 0..4 */ + /* sE[x] = sC[x-1] xor rot(sC[x+1],1), for x in 0..4 */ + sC0 .req x0 + sE0 .req x29 + sC1 .req x26 + sE1 .req x30 + sC2 .req x27 + sE2 .req x26 + sC3 .req x28 + sE3 .req x27 + sC4 .req x29 + sE4 .req x28 + + tmp .req x30 /* same register as sE1 */ + +/************************ MACROS ****************************/ + +/* Vector macros: the _m0 variants use v8.4-A SHA-3 instructions, the _m1 variants build the same operations from baseline NEON instructions */ + +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm + xar \d\().2d, \s0\().2d, \s1\().2d, #\imm +.endm + +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro eor3_m1_0 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m1_1 d s0 s1 s2 + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro eor3_m1 d s0 s1 s2 + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +.macro rax1_m1 d s0 s1 + // d = s0 ^ rol64(s1, 1) per lane; ADD stands in for SHL #1; vvtmp is overwritten and must be bound via .req by the caller + add
vvtmp.2d, \s1\().2d, \s1\().2d + sri vvtmp.2d, \s1\().2d, #63 + eor \d\().16b, vvtmp.16b, \s0\().16b +.endm + + .macro xar_m1 d s0 s1 imm + // d = ror64(s0 ^ s1, imm) per lane; s0 is overwritten with s0 ^ s1. For imm == 63 an ADD replaces SHL #1 + .if \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + // .elseif \imm == 62 + // eor \s0\().16b, \s0\().16b, \s1\().16b + // add \d\().2d, \s0\().2d, \s0\().2d + // add \d\().2d, \d\().2d, \d\().2d + // sri \d\().2d, \s0\().2d, #(62) + // .elseif \imm == 61 + // eor \s0\().16b, \s0\().16b, \s1\().16b + // add \d\().2d, \s0\().2d, \s0\().2d + // add \d\().2d, \d\().2d, \d\().2d + // add \d\().2d, \d\().2d, \d\().2d + // sri \d\().2d, \s0\().2d, #(61) + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + + .macro xar_m1_0 d s0 s1 imm + // First half of xar_m1: s0 ^= s1. All three branches emit the same instruction; d and imm do not affect the output + .if \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + .elseif \imm == 62 + eor \s0\().16b, \s0\().16b, \s1\().16b + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + .endif +.endm + + .macro xar_m1_1 d s0 s1 imm + // Second half of xar_m1: d = ror64(s0, imm) per lane; for imm == 63 / 62 one / two ADDs replace the SHL + .if \imm == 63 + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + .elseif \imm == 62 + add \d\().2d, \s0\().2d, \s0\().2d + add \d\().2d, \d\().2d, \d\().2d + sri \d\().2d, \s0\().2d, #(62) + .else + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + +.macro bcax_m1 d s0 s1 s2 /* d = s0 ^ (s1 & ~s2); vvtmp is used as scratch */ + bic vvtmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, vvtmp.16b, \s0\().16b +.endm + +.macro load_input_vector num idx + ldr vAbaq, [input_addr, #(16*(\num*0+\idx))] + ldr vAbeq, [input_addr, #(16*(\num*1+\idx))] + ldr vAbiq, [input_addr, #(16*(\num*2+\idx))] + ldr vAboq, [input_addr, #(16*(\num*3+\idx))] + ldr vAbuq, [input_addr, #(16*(\num*4+\idx))] + ldr vAgaq, [input_addr, #(16*(\num*5+\idx))] + ldr vAgeq, [input_addr,
#(16*(\num*6+\idx))] + ldr vAgiq, [input_addr, #(16*(\num*7+\idx))] + ldr vAgoq, [input_addr, #(16*(\num*8+\idx))] + ldr vAguq, [input_addr, #(16*(\num*9+\idx))] + ldr vAkaq, [input_addr, #(16*(\num*10+\idx))] + ldr vAkeq, [input_addr, #(16*(\num*11+\idx))] + ldr vAkiq, [input_addr, #(16*(\num*12+\idx))] + ldr vAkoq, [input_addr, #(16*(\num*13+\idx))] + ldr vAkuq, [input_addr, #(16*(\num*14+\idx))] + ldr vAmaq, [input_addr, #(16*(\num*15+\idx))] + ldr vAmeq, [input_addr, #(16*(\num*16+\idx))] + ldr vAmiq, [input_addr, #(16*(\num*17+\idx))] + ldr vAmoq, [input_addr, #(16*(\num*18+\idx))] + ldr vAmuq, [input_addr, #(16*(\num*19+\idx))] + ldr vAsaq, [input_addr, #(16*(\num*20+\idx))] + ldr vAseq, [input_addr, #(16*(\num*21+\idx))] + ldr vAsiq, [input_addr, #(16*(\num*22+\idx))] + ldr vAsoq, [input_addr, #(16*(\num*23+\idx))] + ldr vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_vector num idx + str vAbaq, [input_addr, #(16*(\num*0+\idx))] + str vAbeq, [input_addr, #(16*(\num*1+\idx))] + str vAbiq, [input_addr, #(16*(\num*2+\idx))] + str vAboq, [input_addr, #(16*(\num*3+\idx))] + str vAbuq, [input_addr, #(16*(\num*4+\idx))] + str vAgaq, [input_addr, #(16*(\num*5+\idx))] + str vAgeq, [input_addr, #(16*(\num*6+\idx))] + str vAgiq, [input_addr, #(16*(\num*7+\idx))] + str vAgoq, [input_addr, #(16*(\num*8+\idx))] + str vAguq, [input_addr, #(16*(\num*9+\idx))] + str vAkaq, [input_addr, #(16*(\num*10+\idx))] + str vAkeq, [input_addr, #(16*(\num*11+\idx))] + str vAkiq, [input_addr, #(16*(\num*12+\idx))] + str vAkoq, [input_addr, #(16*(\num*13+\idx))] + str vAkuq, [input_addr, #(16*(\num*14+\idx))] + str vAmaq, [input_addr, #(16*(\num*15+\idx))] + str vAmeq, [input_addr, #(16*(\num*16+\idx))] + str vAmiq, [input_addr, #(16*(\num*17+\idx))] + str vAmoq, [input_addr, #(16*(\num*18+\idx))] + str vAmuq, [input_addr, #(16*(\num*19+\idx))] + str vAsaq, [input_addr, #(16*(\num*20+\idx))] + str vAseq, [input_addr, #(16*(\num*21+\idx))] + str vAsiq, [input_addr, 
#(16*(\num*22+\idx))] + str vAsoq, [input_addr, #(16*(\num*23+\idx))] + str vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_scalar num idx + str s_Aba, [input_addr, 8*(\num*(0) +\idx)] + str sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + str sAbi, [input_addr, 8*(\num*(2)+ \idx)] + str sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + str sAbu, [input_addr, 8*(\num*(4)+ \idx)] + str sAga, [input_addr, 8*(\num*(4+1) +\idx)] + str sAge, [input_addr, 8*(\num*(6)+ \idx)] + str sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + str sAgo, [input_addr, 8*(\num*(8)+ \idx)] + str sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + str sAka, [input_addr, 8*(\num*(10) +\idx)] + str sAke, [input_addr, 8*(\num*(10+1)+\idx)] + str sAki, [input_addr, 8*(\num*(12) +\idx)] + str sAko, [input_addr, 8*(\num*(12+1)+\idx)] + str sAku, [input_addr, 8*(\num*(14) +\idx)] + str sAma, [input_addr, 8*(\num*(14+1)+\idx)] + str sAme, [input_addr, 8*(\num*(16) +\idx)] + str sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + str sAmo, [input_addr, 8*(\num*(18) +\idx)] + str sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + str sAsa, [input_addr, 8*(\num*(20) +\idx)] + str sAse, [input_addr, 8*(\num*(20+1)+\idx)] + str sAsi, [input_addr, 8*(\num*(22) +\idx)] + str sAso, [input_addr, 8*(\num*(22+1)+\idx)] + str sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +.macro load_input_scalar num idx + ldr s_Aba, [input_addr, 8*(\num*(0) +\idx)] + ldr sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + ldr sAbi, [input_addr, 8*(\num*(2)+ \idx)] + ldr sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + ldr sAbu, [input_addr, 8*(\num*(4)+ \idx)] + ldr sAga, [input_addr, 8*(\num*(4+1) +\idx)] + ldr sAge, [input_addr, 8*(\num*(6)+ \idx)] + ldr sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + ldr sAgo, [input_addr, 8*(\num*(8)+ \idx)] + ldr sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + ldr sAka, [input_addr, 8*(\num*(10) +\idx)] + ldr sAke, [input_addr, 8*(\num*(10+1)+\idx)] + ldr sAki, [input_addr, 8*(\num*(12) +\idx)] + ldr sAko, [input_addr, 
8*(\num*(12+1)+\idx)] + ldr sAku, [input_addr, 8*(\num*(14) +\idx)] + ldr sAma, [input_addr, 8*(\num*(14+1)+\idx)] + ldr sAme, [input_addr, 8*(\num*(16) +\idx)] + ldr sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + ldr sAmo, [input_addr, 8*(\num*(18) +\idx)] + ldr sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + ldr sAsa, [input_addr, 8*(\num*(20) +\idx)] + ldr sAse, [input_addr, 8*(\num*(20+1)+\idx)] + ldr sAsi, [input_addr, 8*(\num*(22) +\idx)] + ldr sAso, [input_addr, 8*(\num*(22+1)+\idx)] + ldr sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +#define STACK_SIZE (8*8 + 16*6 + 3*8 + 8 + 16*34) // In offset order: input (8), const (8), count (8), padding (8), GPRs (16*6), VREGS (8*8), then 34 16-byte slots for the save(name)/restore(name) vector temporaries +#define STACK_BASE_GPRS (3*8+8) +#define STACK_BASE_VREGS (3*8+8+16*6) +#define STACK_BASE_TMP (8*8 + 16*6 + 3*8 + 8) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) + +#define vAga_offset 0 +#define E0_offset 1 +#define E1_offset 2 +#define E2_offset 3 +#define E3_offset 4 +#define E4_offset 5 +#define Ame_offset 7 +#define Agi_offset 8 +#define Aka_offset 9 +#define Abo_offset 10 +#define Amo_offset 11 +#define Ami_offset 12 +#define Ake_offset 13 +#define Agu_offset 14 +#define Asi_offset 15 +#define Aku_offset 16 +#define Asa_offset 17 +#define Abu_offset 18 +#define Asu_offset 19 +#define Ase_offset 20 +//#define Aga_offset 21 +#define Age_offset 22 +#define vBgo_offset 23 +#define vBke_offset 24 +#define vBgi_offset 25 +#define vBga_offset 26 +#define vBbo_offset 27 +#define vBmo_offset 28 +#define vBmi_offset 29 +#define vBge_offset 30 + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] + + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp,
#(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro save_vregs + stp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] + stp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + stp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + stp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] +.endm + +.macro restore_vregs + ldp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] + ldp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + ldp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + ldp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] +.endm + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro eor5 dst, src0, src1, src2, src3, src4 + eor \dst, \src0, \src1 + eor \dst, \dst, \src2 + eor \dst, \dst, \src3 + eor \dst, \dst, \src4 +.endm + +.macro xor_rol dst, src1, src0, imm + eor \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro bic_rol dst, src1, src0, imm + bic \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro rotate dst, src, imm + ror \dst, \src, #(64-\imm) +.endm + +.macro save reg, offset + str \reg, [sp, #\offset] +.endm + +.macro restore reg, offset + ldr \reg, [sp, #\offset] +.endm + +.macro hybrid_round_initial +eor sC0, sAma, sAsa SEP +eor sC1, sAme, sAse SEP eor3_m0 C1,vAbe,vAge,vAke +eor sC2, sAmi, sAsi SEP +eor sC3, sAmo, sAso SEP eor3_m1 C3,vAbo,vAgo,vAko +eor sC4, sAmu, sAsu SEP +eor sC0, sAka, sC0 SEP eor3_m0 C0,vAba,vAga,vAka +eor sC1, sAke, sC1 SEP +eor sC2, sAki, sC2 SEP eor3_m1 C2,vAbi,vAgi,vAki +eor sC3, sAko, sC3 SEP +eor sC4, sAku, sC4 SEP eor3_m0 C4,vAbu,vAgu,vAku +eor sC0, sAga, sC0 SEP +eor sC1, sAge, sC1 SEP eor3_m1 C1, C1,vAme, vAse +eor sC2, sAgi, sC2 SEP +eor sC3, sAgo, sC3 SEP eor3_m0 C3, 
C3,vAmo, vAso +eor sC4, sAgu, sC4 SEP +eor sC0, s_Aba, sC0 SEP eor3_m1 C0, C0,vAma, vAsa +eor sC1, sAbe, sC1 SEP +eor sC2, sAbi, sC2 SEP eor3_m0 C2, C2,vAmi, vAsi +eor sC3, sAbo, sC3 SEP +eor sC4, sAbu, sC4 SEP eor3_m1 C4, C4,vAmu, vAsu +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP vvtmp .req vBba +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP rax1_m0 E2, C1, C3 +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, s_Aba, sE0 SEP +eor sAsa_, sAbi, sE2 SEP rax1_m1 E4, C3, C0 +eor sAbi_, sAki, sE2 SEP +eor sAki_, sAko, sE3 SEP rax1_m0 E1, C0, C2 +eor sAko_, sAmu, sE4 SEP +eor sAmu_, sAso, sE3 SEP rax1_m1 E3, C2, C4 +eor sAso_, sAma, sE0 SEP +eor sAka_, sAbe, sE1 SEP rax1_m0 E0, C4, C1 +eor sAse_, sAgo, sE3 SEP +eor sAgo_, sAme, sE1 SEP .unreq vvtmp +eor sAke_, sAgi, sE2 SEP +eor sAgi_, sAka, sE0 SEP vvtmp .req C1 +eor sAga_, sAbo, sE3 SEP +eor sAbo_, sAmo, sE3 SEP vvtmpq .req C1q +eor sAmo_, sAmi, sE2 SEP +eor sAmi_, sAke, sE1 SEP eor vBba.16b, vAba.16b, E0.16b +eor sAge_, sAgu, sE4 SEP +eor sAgu_, sAsi, sE2 SEP xar_m1 vBsa, vAbi, E2, 2 +eor sAsi_, sAku, sE4 SEP +eor sAku_, sAsa, sE0 SEP xar_m0 vBbi, vAki, E2, 21 +eor sAma_, sAbu, sE4 SEP +eor sAbu_, sAsu, sE4 SEP xar_m1 vBki, vAko, E3, 39 +eor sAsu_, sAse, sE1 SEP +eor sAme_, sAga, sE0 SEP +eor sAbe_, sAge, sE1 SEP xar_m0 vBko, vAmu, E4, 56 +load_constant_ptr SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP xar_m1 vBmu, vAso, E3, 8 +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP xar_m0 vBso, vAma, E0, 23 +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP xar_m1 vBka, vAbe, E1, 63 +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP xar_m0 vBse, vAgo, E3, 9 +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP xar_m1 vBgo, vAme, E1, 19 +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP xar_m0 vBke, vAgi, E2, 58 +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP xar_m1 vBgi, vAka, E0, 
61 +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP xar_m0 vBga, vAbo, E3, 36 +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP xar_m1 vBbo, vAmo, E3, 43 +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP xar_m0 vBmo, vAmi, E2, 49 +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP xar_m1 vBmi, vAke, E1, 54 +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP xar_m0 vBge, vAgu, E4, 44 +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP mov E3.16b, vAga.16b +ldr cur_const, [const_addr] SEP +mov count, #1 SEP bcax_m1 vAga, vBga, vBgi, vBge +bic tmp, sAma_, sAmu_, ROR #35 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP xar_m0 vBgu, vAsi, E2, 3 +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP xar_m1 vBsi, vAku, E4, 25 +bic tmp, sAsi_, sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP xar_m0 vBku, vAsa, E0, 46 +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP xar_m1 vBma, vAbu, E4, 37 +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP xar_m0 vBbu, vAsu, E4, 50 +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP xar_m1 vBsu, vAse, E1, 62 +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP xar_m0 vBme, E3, E0, 28 +bic tmp, sAbi_, sAbe_, ROR #63 SEP +eor s_Aba, s_Aba_, tmp, ROR #21 SEP xar_m1 vBbe, vAge, E1, 20 +bic tmp, sAbo_, sAbi_, ROR #42 SEP +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP bcax_m1 vAge, vBge, vBgo, vBgi +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP bcax_m0 vAgi, vBgi, vBgu, vBgo +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP bcax_m1 vAgo, vBgo, vBga, vBgu +eor sAbu, tmp, sAbu_, ROR #30 SEP +eor s_Aba, s_Aba, cur_const SEP bcax_m0 vAgu, vBgu, vBge, vBga +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP bcax_m1 vAka, 
vBka, vBki, vBke +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP bcax_m0 vAke, vBke, vBko, vBki +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP .unreq vvtmp +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP .unreq vvtmpq +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 SEP eor2 C0, vAka, vAga +eor sC4, sAmu, sC4, ROR #56 SEP +eor sC0, sAga, sC0, ROR #57 SEP save(vAga) +eor sC1, sAme, sC1, ROR #58 SEP +eor sC2, sAbi, sC2, ROR #60 SEP vvtmp .req vAga +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP vvtmpq .req vAgaq +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP bcax_m0 vAki, vBki, vBku, vBko +eor sC3, sAbo, sC3, ROR #63 SEP +eor sC4, sAku, sC4, ROR #50 SEP bcax_m1 vAko, vBko, vBka, vBku +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP eor2 C1, vAke, vAge +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP bcax_m0 vAku, vBku, vBke, vBka +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP eor2 C2, vAki, vAgi +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP bcax_m1 vAma, vBma, vBmi, vBme +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP eor2 C3, vAko, vAgo +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP bcax_m0 vAme, vBme, vBmo, vBmi +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP eor2 C4, vAku, vAgu +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP bcax_m1 vAmi, vBmi, vBmu, vBmo +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP eor2 C0, C0, vAma +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP bcax_m0 vAmo, vBmo, vBma, vBmu +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP eor2 C1, C1, vAme +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP bcax_m1 vAmu, vBmu, vBme, vBma +eor sAgu_, sE2, sAsi, ROR #62 SEP +eor sAsi_, sE4, sAku, ROR 
#58 SEP eor2 C2, C2, vAmi +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP bcax_m0 vAsa, vBsa, vBsi, vBse +eor sAbu_, sE4, sAsu, ROR #9 SEP +eor sAsu_, sE1, sAse, ROR #23 SEP eor2 C3, C3, vAmo +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP bcax_m1 vAse, vBse, vBso, vBsi +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP eor2 C4, C4, vAmu +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP bcax_m0 vAsi, vBsi, vBsu, vBso +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP eor2 C0, C0, vAsa +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP bcax_m1 vAso, vBso, vBsa, vBsu +bic tmp, sAga_, sAgu_, ROR #31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP eor2 C1, C1, vAse +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP bcax_m0 vAsu, vBsu, vBse, vBsa +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP eor2 C2, C2, vAsi +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP eor2 C3, C3, vAso +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP bcax_m1 vAba, vBba, vBbi, vBbe +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP bcax_m0 vAbe, vBbe, vBbo, vBbi +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP eor2 C1, C1, vAbe +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP restore x26, STACK_OFFSET_CONST +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP ldr vvtmpq, [x26], #16 +eor sAmi, tmp, sAmi_, ROR #46 SEP +bic tmp, sAma_, sAmu_, ROR #35 SEP save x26, STACK_OFFSET_CONST +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP eor vAba.16b, vAba.16b, vvtmp.16b +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP eor2 C4, C4, vAsu +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP bcax_m0 
vAbi, vBbi, vBbu, vBbo +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP bcax_m1 vAbo, vBbo, vBba, vBbu +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP eor2 C3, C3, vAbo +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP eor2 C2, C2, vAbi +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP eor2 C0, C0, vAba +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP bcax_m0 vAbu, vBbu, vBbe, vBba +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP eor2 C4, C4, vAbu +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP restore(vAga) +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP .unreq vvtmp +eor sAbu, tmp, sAbu_, ROR #30 SEP +add count, count, #1 SEP .unreq vvtmpq +eor s_Aba, s_Aba, cur_const SEP +.endm + + + +.macro hybrid_round_noninitial +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP vvtmp .req vBba +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP rax1_m0 E2, C1, C3 +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP rax1_m1 E4, C3, C0 +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP rax1_m0 E1, C0, C2 +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP rax1_m1 E3, C2, C4 +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP rax1_m0 E0, C4, C1 +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP .unreq vvtmp +eor sC0, s_Aba, sC0, ROR #61 SEP +eor sC1, sAke, sC1, ROR #57 SEP vvtmp .req C1 +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP vvtmpq .req C1q +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP eor vBba.16b, vAba.16b, E0.16b +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP xar_m1 vBsa, vAbi, E2, 2 +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, 
sC3, ROR #63 SEP xar_m0 vBbi, vAki, E2, 21 +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP xar_m1 vBki, vAko, E3, 39 +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP xar_m0 vBko, vAmu, E4, 56 +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP xar_m1 vBmu, vAso, E3, 8 +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP xar_m0 vBso, vAma, E0, 23 +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP xar_m1 vBka, vAbe, E1, 63 +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP xar_m0 vBse, vAgo, E3, 9 +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP xar_m1 vBgo, vAme, E1, 19 +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP xar_m0 vBke, vAgi, E2, 58 +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP xar_m1 vBgi, vAka, E0, 61 +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP xar_m0 vBga, vAbo, E3, 36 +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP xar_m1 vBbo, vAmo, E3, 43 +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP xar_m0 vBmo, vAmi, E2, 49 +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP xar_m1 vBmi, vAke, E1, 54 +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP xar_m0 vBge, vAgu, E4, 44 +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP mov E3.16b, vAga.16b +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP bcax_m1 vAga, vBga, vBgi, vBge +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP +eor sAka, tmp, sAka_, ROR #24 SEP xar_m0 vBgu, vAsi, E2, 3 +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP xar_m1 vBsi, vAku, E4, 25 +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, 
tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP xar_m0 vBku, vAsa, E0, 46 +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP xar_m1 vBma, vAbu, E4, 37 +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP xar_m0 vBbu, vAsu, E4, 50 +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP xar_m1 vBsu, vAse, E1, 62 +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP xar_m0 vBme, E3, E0, 28 +bic tmp, sAma_, sAmu_, ROR #35 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP xar_m1 vBbe, vAge, E1, 20 +add count, count, #1 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP bcax_m1 vAge, vBge, vBgo, vBgi +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP bcax_m0 vAgi, vBgi, vBgu, vBgo +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP bcax_m1 vAgo, vBgo, vBga, vBgu +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP bcax_m0 vAgu, vBgu, vBge, vBga +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP bcax_m1 vAka, vBka, vBki, vBke +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m0 vAke, vBke, vBko, vBki +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP .unreq vvtmp +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP .unreq vvtmpq +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP eor2 C0, vAka, vAga +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP save(vAga) +eor s_Aba, s_Aba, cur_const SEP +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP vvtmp .req vAga +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP vvtmpq .req vAgaq +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP bcax_m0 vAki, vBki, 
vBku, vBko +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP bcax_m1 vAko, vBko, vBka, vBku +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP eor2 C1, vAke, vAge +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP bcax_m0 vAku, vBku, vBke, vBka +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP eor2 C2, vAki, vAgi +eor sC0, s_Aba, sC0, ROR #61 SEP +eor sC1, sAke, sC1, ROR #57 SEP bcax_m1 vAma, vBma, vBmi, vBme +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP eor2 C3, vAko, vAgo +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP bcax_m0 vAme, vBme, vBmo, vBmi +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP eor2 C4, vAku, vAgu +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP bcax_m1 vAmi, vBmi, vBmu, vBmo +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP eor2 C0, C0, vAma +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP bcax_m0 vAmo, vBmo, vBma, vBmu +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP eor2 C1, C1, vAme +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP bcax_m1 vAmu, vBmu, vBme, vBma +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP eor2 C2, C2, vAmi +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP bcax_m0 vAsa, vBsa, vBsi, vBse +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP eor2 C3, C3, vAmo +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP bcax_m1 vAse, vBse, vBso, vBsi +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP eor2 C4, C4, vAmu +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP bcax_m0 vAsi, vBsi, vBsu, vBso +eor sAsu_, sE1, sAse, ROR #23 SEP 
+eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP eor2 C0, C0, vAsa +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP bcax_m1 vAso, vBso, vBsa, vBsu +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP eor2 C1, C1, vAse +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP bcax_m0 vAsu, vBsu, vBse, vBsa +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP eor2 C2, C2, vAsi +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP eor2 C3, C3, vAso +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP +eor sAka, tmp, sAka_, ROR #24 SEP bcax_m1 vAba, vBba, vBbi, vBbe +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP bcax_m0 vAbe, vBbe, vBbo, vBbi +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP eor2 C1, C1, vAbe +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP restore x26, STACK_OFFSET_CONST +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP ldr vvtmpq, [x26], #16 +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP save x26, STACK_OFFSET_CONST +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP eor vAba.16b, vAba.16b, vvtmp.16b +bic tmp, sAma_, sAmu_, ROR #35 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP eor2 C4, C4, vAsu +add count, count, #1 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP bcax_m0 vAbi, vBbi, vBbu, vBbo +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP bcax_m1 vAbo, vBbo, vBba, vBbu +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP eor2 C3, C3, vAbo +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP eor2 C2, C2, vAbi +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP 
eor2 C0, C0, vAba +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m0 vAbu, vBbu, vBbe, vBba +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP eor2 C4, C4, vAbu +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP restore(vAga) +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP .unreq vvtmp +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP .unreq vvtmpq +eor s_Aba, s_Aba, cur_const SEP +.endm + +.macro hybrid_round_final +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP vvtmp .req vBba +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP +eor sC3, sAgo, sAso, ROR #30 SEP rax1_m0 E2, C1, C3 +eor sC4, sAbu, sAsu, ROR #53 SEP +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP rax1_m1 E4, C3, C0 +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP +eor sC0, sAga, sC0, ROR #57 SEP rax1_m0 E1, C0, C2 +eor sC1, sAme, sC1, ROR #58 SEP +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP rax1_m1 E3, C2, C4 +eor sC0, s_Aba, sC0, ROR #61 SEP +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP rax1_m0 E0, C4, C1 +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP .unreq vvtmp +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP vvtmp .req C1 +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP vvtmpq .req C1q +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP eor vBba.16b, vAba.16b, E0.16b +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP +eor 
sAse_, sE3, sAgo, ROR #36 SEP xar_m0 vBsa, vAbi, E2, 2 +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP xar_m1 vBbi, vAki, E2, 21 +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP xar_m0 vBki, vAko, E3, 39 +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP xar_m1 vBko, vAmu, E4, 56 +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP xar_m0 vBmu, vAso, E3, 8 +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP xar_m1 vBso, vAma, E0, 23 +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP xar_m0 vBka, vAbe, E1, 63 +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP xar_m1 vBse, vAgo, E3, 9 +bic tmp, sAge_, sAga_, ROR #56 SEP +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP xar_m0 vBgo, vAme, E1, 19 +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP xar_m1 vBke, vAgi, E2, 58 +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP xar_m0 vBgi, vAka, E0, 61 +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP xar_m1 vBga, vAbo, E3, 36 +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP xar_m0 vBbo, vAmo, E3, 43 +bic tmp, sAma_, sAmu_, ROR #35 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP +add count, count, 
#1 SEP xar_m1 vBmo, vAmi, E2, 49 +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP xar_m0 vBmi, vAke, E1, 54 +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP xar_m1 vBge, vAgu, E4, 44 +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP mov E3.16b, vAga.16b +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP +eor s_Aba, s_Aba_, tmp, ROR #21 SEP bcax_m1 vAga, vBga, vBgi, vBge +bic tmp, sAbo_, sAbi_, ROR #42 SEP +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP xar_m0 vBgu, vAsi, E2, 3 +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP xar_m1 vBsi, vAku, E4, 25 +eor sAbu, tmp, sAbu_, ROR #30 SEP +eor s_Aba, s_Aba, cur_const SEP +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP xar_m0 vBku, vAsa, E0, 46 +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP xar_m1 vBma, vAbu, E4, 37 +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP xar_m0 vBbu, vAsu, E4, 50 +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP xar_m1 vBsu, vAse, E1, 62 +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP xar_m0 vBme, E3, E0, 28 +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP xar_m1 vBbe, vAge, E1, 20 +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP +ror sC2, sC2, 62 SEP bcax_m0 vAge, vBge, vBgo, vBgi +eor 
sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP bcax_m1 vAgi, vBgi, vBgu, vBgo +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP bcax_m0 vAgo, vBgo, vBga, vBgu +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP bcax_m1 vAgu, vBgu, vBge, vBga +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP bcax_m0 vAka, vBka, vBki, vBke +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP bcax_m1 vAke, vBke, vBko, vBki +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP bcax_m0 vAki, vBki, vBku, vBko +eor sAgu_, sE2, sAsi, ROR #62 SEP +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP bcax_m1 vAko, vBko, vBka, vBku +eor sAbu_, sE4, sAsu, ROR #9 SEP +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP bcax_m0 vAku, vBku, vBke, vBka +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP bcax_m1 vAma, vBma, vBmi, vBme +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP bcax_m0 vAme, vBme, vBmo, vBmi +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP bcax_m1 vAmi, vBmi, vBmu, vBmo +bic tmp, sAge_, sAga_, ROR #56 SEP +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP +eor sAka, tmp, sAka_, ROR #24 SEP bcax_m0 vAmo, vBmo, vBma, vBmu +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP bcax_m1 vAmu, vBmu, 
vBme, vBma +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP bcax_m0 vAsa, vBsa, vBsi, vBse +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP bcax_m1 vAse, vBse, vBso, vBsi +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP bcax_m0 vAsi, vBsi, vBsu, vBso +bic tmp, sAma_, sAmu_, ROR #35 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP +add count, count, #1 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP bcax_m1 vAso, vBso, vBsa, vBsu +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP bcax_m0 vAsu, vBsu, vBse, vBsa +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP bcax_m1 vAba, vBba, vBbi, vBbe +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP bcax_m0 vAbe, vBbe, vBbo, vBbi +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP bcax_m1 vAbi, vBbi, vBbu, vBbo +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP bcax_m0 vAbo, vBbo, vBba, vBbu +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP bcax_m1 vAbu, vBbu, vBbe, vBba +eor s_Aba, s_Aba, cur_const SEP +ror sAga, sAga,(64-3) SEP +ror sAka, sAka,(64-25) SEP +ror sAma, sAma,(64-10) SEP restore x26, STACK_OFFSET_CONST +ror sAsa, sAsa,(64-39) SEP +ror sAbe, sAbe,(64-21) SEP +ror sAge, sAge,(64-45) SEP ldr vvtmpq, [x26], #16 +ror sAke, sAke,(64-8) SEP +ror sAme, sAme,(64-15) SEP +ror sAse, sAse,(64-41) SEP +ror sAbi, sAbi,(64-14) 
SEP save x26, STACK_OFFSET_CONST +ror sAgi, sAgi,(64-61) SEP +ror sAki, sAki,(64-18) SEP +ror sAmi, sAmi,(64-56) SEP +ror sAsi, sAsi,(64-2) SEP eor vAba.16b, vAba.16b, vvtmp.16b +ror sAgo, sAgo,(64-28) SEP +ror sAko, sAko,(64-1) SEP +ror sAmo, sAmo,(64-27) SEP .unreq vvtmp +ror sAso, sAso,(64-62) SEP +ror sAbu, sAbu,(64-44) SEP +ror sAgu, sAgu,(64-20) SEP +ror sAku, sAku,(64-6) SEP .unreq vvtmpq +ror sAmu, sAmu,(64-36) SEP +ror sAsu, sAsu,(64-55) SEP +.endm + + +#define KECCAK_F1600_ROUNDS 24 + +.global keccak_f1600_x4_hybrid_asm_v6 +.global _keccak_f1600_x4_hybrid_asm_v6 +.text +.align 4 + +keccak_f1600_x4_hybrid_asm_v6: +_keccak_f1600_x4_hybrid_asm_v6: + alloc_stack + save_gprs + save_vregs + save input_addr, STACK_OFFSET_INPUT + + + ASM_LOAD(const_addr,round_constants_vec) + + save const_addr, STACK_OFFSET_CONST + load_input_vector 2,1 + + // First scalar Keccak computation alongside first half of SIMD computation + load_input_scalar 4,0 + hybrid_round_initial + loop_0: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-3) + ble loop_0 + + hybrid_round_final + + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4,0 + + // Second scalar Keccak computation alongside second half of SIMD computation + load_input_scalar 4,1 + hybrid_round_initial + loop_1: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-3) + ble loop_1 + + hybrid_round_final + + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4,1 + store_input_vector 2,1 + + restore_vregs + restore_gprs + free_stack + + + ret +#endif \ No newline at end of file diff --git a/tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v7.s b/tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v7.s new file mode 100644 index 0000000..661bda5 --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v7.s @@ -0,0 +1,1266 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby
granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" +#if defined(__ARM_FEATURE_SHA3) + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +round_constants_vec: + .quad 0x0000000000000001 + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x800000008000000a + .quad 
0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + .quad 0x8000000080008008 +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x26 + cur_const .req x26 + count .req w27 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. */ + vAba .req v0 + vAbe .req v1 + vAbi .req v2 + vAbo .req v3 + vAbu .req v4 + vAga .req v5 + vAge .req v6 + vAgi .req v7 + vAgo .req v8 + vAgu .req v9 + vAka .req v10 + vAke .req v11 + vAki .req v12 + vAko .req v13 + vAku .req v14 + vAma .req v15 + vAme .req v16 + vAmi .req v17 + vAmo .req v18 + vAmu .req v19 + vAsa .req v20 + vAse .req v21 + vAsi .req v22 + vAso .req v23 + vAsu .req v24 + + /* q-form of the above mapping */ + vAbaq .req q0 + vAbeq .req q1 + vAbiq .req q2 + vAboq .req q3 + vAbuq .req q4 + vAgaq .req q5 + vAgeq .req q6 + vAgiq .req q7 + vAgoq .req q8 + vAguq .req q9 + vAkaq .req q10 + vAkeq .req q11 + vAkiq .req q12 + vAkoq .req q13 + vAkuq .req q14 + vAmaq .req q15 + vAmeq .req q16 + vAmiq .req q17 + vAmoq .req q18 + vAmuq .req q19 + vAsaq .req q20 + vAseq .req q21 + vAsiq .req q22 + vAsoq .req q23 + vAsuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v30 + C1 .req v29 + C2 .req v28 + C3 .req v27 + C4 .req v26 + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req v26 + E1 .req v25 + E2 .req v29 + E3 .req v28 + E4 .req v27 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vAbi_ .req v2 + vAbo_ .req v3 + vAbu_ .req v4 + vAga_ .req v10 + vAge_ .req v11 + vAgi_ .req v7 + vAgo_ .req v8 + vAgu_ .req v9 + vAka_ .req v15 + vAke_ .req v16 + vAki_ .req v12 + vAko_ .req v13 + vAku_ .req v14 + vAma_ .req v20 + vAme_ .req v21 + vAmi_ .req v17 + vAmo_ .req v18 + vAmu_ .req v19 + vAsa_ .req v0 + vAse_ .req v1 + vAsi_ .req v22 + vAso_ .req v23 + vAsu_ .req
v24 + vAba_ .req v30 + vAbe_ .req v27 + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round. */ + s_Aba .req x1 + sAbe .req x6 + sAbi .req x11 + sAbo .req x16 + sAbu .req x21 + sAga .req x2 + sAge .req x7 + sAgi .req x12 + sAgo .req x17 + sAgu .req x22 + sAka .req x3 + sAke .req x8 + sAki .req x13 + sAko .req x18 /* NOTE(review): x18 is the AAPCS64 platform register and is not preserved by save_gprs (x19-x30 only) - confirm this is acceptable on every target ABI */ + sAku .req x23 + sAma .req x4 + sAme .req x9 + sAmi .req x14 + sAmo .req x19 + sAmu .req x24 + sAsa .req x5 + sAse .req x10 + sAsi .req x15 + sAso .req x20 + sAsu .req x25 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + s_Aba_ .req x30 + sAbe_ .req x28 + sAbi_ .req x11 + sAbo_ .req x16 + sAbu_ .req x21 + sAga_ .req x3 + sAge_ .req x8 + sAgi_ .req x12 + sAgo_ .req x17 + sAgu_ .req x22 + sAka_ .req x4 + sAke_ .req x9 + sAki_ .req x13 + sAko_ .req x18 + sAku_ .req x23 + sAma_ .req x5 + sAme_ .req x10 + sAmi_ .req x14 + sAmo_ .req x19 + sAmu_ .req x24 + sAsa_ .req x1 + sAse_ .req x6 + sAsi_ .req x15 + sAso_ .req x20 + sAsu_ .req x25 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + sC0 .req x30 + sE0 .req x29 + sC1 .req x26 + sE1 .req x0 + sC2 .req x27 + sE2 .req x26 + sC3 .req x28 + sE3 .req x27 + sC4 .req x29 + sE4 .req x28 + + tmp .req x0 + +/************************ MACROS ****************************/ + +/* Macros using v8.4-A SHA-3 instructions */ + + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm + xar \d\().2d, \s0\().2d, \s1\().2d, #\imm +.endm + +.macro rax1_m1 d s0 s1 + xar_m0 tmp, vzr, \s1, 63 + eor \d\().16b, \s0\().16b, tmp.16b /* NOTE(review): `tmp` is aliased to x0 above and `vzr` is only aliased inside the round macros, so this fallback looks unusable as written; no call site is visible in this part of the file - confirm before use */ +.endm + +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + + +.macro load_input_vector num idx + ldr vAbaq, [input_addr, #(16*(\num*0+\idx))] + ldr
vAbeq, [input_addr, #(16*(\num*1+\idx))] + ldr vAbiq, [input_addr, #(16*(\num*2+\idx))] + ldr vAboq, [input_addr, #(16*(\num*3+\idx))] + ldr vAbuq, [input_addr, #(16*(\num*4+\idx))] + ldr vAgaq, [input_addr, #(16*(\num*5+\idx))] + ldr vAgeq, [input_addr, #(16*(\num*6+\idx))] + ldr vAgiq, [input_addr, #(16*(\num*7+\idx))] + ldr vAgoq, [input_addr, #(16*(\num*8+\idx))] + ldr vAguq, [input_addr, #(16*(\num*9+\idx))] + ldr vAkaq, [input_addr, #(16*(\num*10+\idx))] + ldr vAkeq, [input_addr, #(16*(\num*11+\idx))] + ldr vAkiq, [input_addr, #(16*(\num*12+\idx))] + ldr vAkoq, [input_addr, #(16*(\num*13+\idx))] + ldr vAkuq, [input_addr, #(16*(\num*14+\idx))] + ldr vAmaq, [input_addr, #(16*(\num*15+\idx))] + ldr vAmeq, [input_addr, #(16*(\num*16+\idx))] + ldr vAmiq, [input_addr, #(16*(\num*17+\idx))] + ldr vAmoq, [input_addr, #(16*(\num*18+\idx))] + ldr vAmuq, [input_addr, #(16*(\num*19+\idx))] + ldr vAsaq, [input_addr, #(16*(\num*20+\idx))] + ldr vAseq, [input_addr, #(16*(\num*21+\idx))] + ldr vAsiq, [input_addr, #(16*(\num*22+\idx))] + ldr vAsoq, [input_addr, #(16*(\num*23+\idx))] + ldr vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_vector num idx + str vAbaq, [input_addr, #(16*(\num*0+\idx))] + str vAbeq, [input_addr, #(16*(\num*1+\idx))] + str vAbiq, [input_addr, #(16*(\num*2+\idx))] + str vAboq, [input_addr, #(16*(\num*3+\idx))] + str vAbuq, [input_addr, #(16*(\num*4+\idx))] + str vAgaq, [input_addr, #(16*(\num*5+\idx))] + str vAgeq, [input_addr, #(16*(\num*6+\idx))] + str vAgiq, [input_addr, #(16*(\num*7+\idx))] + str vAgoq, [input_addr, #(16*(\num*8+\idx))] + str vAguq, [input_addr, #(16*(\num*9+\idx))] + str vAkaq, [input_addr, #(16*(\num*10+\idx))] + str vAkeq, [input_addr, #(16*(\num*11+\idx))] + str vAkiq, [input_addr, #(16*(\num*12+\idx))] + str vAkoq, [input_addr, #(16*(\num*13+\idx))] + str vAkuq, [input_addr, #(16*(\num*14+\idx))] + str vAmaq, [input_addr, #(16*(\num*15+\idx))] + str vAmeq, [input_addr, #(16*(\num*16+\idx))] + str vAmiq, 
[input_addr, #(16*(\num*17+\idx))] + str vAmoq, [input_addr, #(16*(\num*18+\idx))] + str vAmuq, [input_addr, #(16*(\num*19+\idx))] + str vAsaq, [input_addr, #(16*(\num*20+\idx))] + str vAseq, [input_addr, #(16*(\num*21+\idx))] + str vAsiq, [input_addr, #(16*(\num*22+\idx))] + str vAsoq, [input_addr, #(16*(\num*23+\idx))] + str vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_scalar num idx + str s_Aba, [input_addr, 8*(\num*(0) +\idx)] + str sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + str sAbi, [input_addr, 8*(\num*(2)+ \idx)] + str sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + str sAbu, [input_addr, 8*(\num*(4)+ \idx)] + str sAga, [input_addr, 8*(\num*(4+1) +\idx)] + str sAge, [input_addr, 8*(\num*(6)+ \idx)] + str sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + str sAgo, [input_addr, 8*(\num*(8)+ \idx)] + str sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + str sAka, [input_addr, 8*(\num*(10) +\idx)] + str sAke, [input_addr, 8*(\num*(10+1)+\idx)] + str sAki, [input_addr, 8*(\num*(12) +\idx)] + str sAko, [input_addr, 8*(\num*(12+1)+\idx)] + str sAku, [input_addr, 8*(\num*(14) +\idx)] + str sAma, [input_addr, 8*(\num*(14+1)+\idx)] + str sAme, [input_addr, 8*(\num*(16) +\idx)] + str sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + str sAmo, [input_addr, 8*(\num*(18) +\idx)] + str sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + str sAsa, [input_addr, 8*(\num*(20) +\idx)] + str sAse, [input_addr, 8*(\num*(20+1)+\idx)] + str sAsi, [input_addr, 8*(\num*(22) +\idx)] + str sAso, [input_addr, 8*(\num*(22+1)+\idx)] + str sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +.macro load_input_scalar num idx + ldr s_Aba, [input_addr, 8*(\num*(0) +\idx)] + ldr sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + ldr sAbi, [input_addr, 8*(\num*(2)+ \idx)] + ldr sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + ldr sAbu, [input_addr, 8*(\num*(4)+ \idx)] + ldr sAga, [input_addr, 8*(\num*(4+1) +\idx)] + ldr sAge, [input_addr, 8*(\num*(6)+ \idx)] + ldr sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + ldr sAgo, 
[input_addr, 8*(\num*(8)+ \idx)] + ldr sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + ldr sAka, [input_addr, 8*(\num*(10) +\idx)] + ldr sAke, [input_addr, 8*(\num*(10+1)+\idx)] + ldr sAki, [input_addr, 8*(\num*(12) +\idx)] + ldr sAko, [input_addr, 8*(\num*(12+1)+\idx)] + ldr sAku, [input_addr, 8*(\num*(14) +\idx)] + ldr sAma, [input_addr, 8*(\num*(14+1)+\idx)] + ldr sAme, [input_addr, 8*(\num*(16) +\idx)] + ldr sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + ldr sAmo, [input_addr, 8*(\num*(18) +\idx)] + ldr sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + ldr sAsa, [input_addr, 8*(\num*(20) +\idx)] + ldr sAse, [input_addr, 8*(\num*(20+1)+\idx)] + ldr sAsi, [input_addr, 8*(\num*(22) +\idx)] + ldr sAso, [input_addr, 8*(\num*(22+1)+\idx)] + ldr sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +#define STACK_SIZE (8*8 + 16*6 + 4*8 + 16*5) // VREGS (8*8), GPRs (16*6), input (8), const (8), count (8), input_scalar (8), vector spill slots for vAga..vAgu (16*5) +#define STACK_BASE_GPRS (4*8) +#define STACK_BASE_VREGS (4*8+16*6) +#define STACK_BASE_TMP (8*8 + 16*6 + 4*8) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) +#define STACK_OFFSET_INPUT_SCALAR (3*8) + +#define vAga_offset 0 +#define vAge_offset 1 +#define vAgi_offset 2 +#define vAgo_offset 3 +#define vAgu_offset 4 + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] + + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27,
x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro save_vregs + stp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] + stp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + stp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + stp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] +.endm + +.macro restore_vregs + ldp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] + ldp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + ldp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + ldp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] +.endm + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro eor5 dst, src0, src1, src2, src3, src4 + eor \dst, \src0, \src1 + eor \dst, \dst, \src2 + eor \dst, \dst, \src3 + eor \dst, \dst, \src4 +.endm + +.macro xor_rol dst, src1, src0, imm + eor \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro bic_rol dst, src1, src0, imm + bic \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro rotate dst, src, imm + ror \dst, \src, #(64-\imm) +.endm + +.macro save reg, offset + str \reg, [sp, #\offset] +.endm + +.macro restore reg, offset + ldr \reg, [sp, #\offset] +.endm + +.macro hybrid_round_initial +eor sC0, sAma, sAsa SEP +eor sC1, sAme, sAse SEP eor3_m0 C0, vAba, vAga, vAka +eor sC2, sAmi, sAsi SEP +eor sC3, sAmo, sAso SEP eor3_m0 C1, vAbe, vAge, vAke +eor sC4, sAmu, sAsu SEP +eor sC0, sAka, sC0 SEP eor3_m0 C2, vAbi, vAgi, vAki +eor sC1, sAke, sC1 SEP +eor sC2, sAki, sC2 SEP +eor sC3, sAko, sC3 SEP eor3_m0 C3, vAbo, vAgo, vAko +eor sC4, sAku, sC4 SEP +eor sC0, sAga, sC0 SEP eor3_m0 C4, vAbu, vAgu, vAku +eor sC1, sAge, sC1 SEP +eor sC2, sAgi, sC2 SEP save(vAga) +eor sC3, sAgo, sC3 SEP +eor sC4, sAgu, sC4 SEP +eor sC0, s_Aba, sC0 SEP vzr .req vAga +eor sC1, sAbe, sC1 SEP +eor sC2, sAbi, sC2 SEP eor vzr.16b, vzr.16b, vzr.16b +eor sC3, sAbo, sC3 SEP +eor sC4, sAbu, sC4 SEP +eor sE1, sC0, sC2, ROR #63 SEP save(vAge) +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP save(vAgi) +eor sE2, sC1, 
sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP save(vAgo) +eor s_Aba_, s_Aba, sE0 SEP +eor sAsa_, sAbi, sE2 SEP +eor sAbi_, sAki, sE2 SEP save(vAgu) +eor sAki_, sAko, sE3 SEP +eor sAko_, sAmu, sE4 SEP C0r .req vAge +eor sAmu_, sAso, sE3 SEP +eor sAso_, sAma, sE0 SEP +eor sAka_, sAbe, sE1 SEP C1r .req vAgi +eor sAse_, sAgo, sE3 SEP +eor sAgo_, sAme, sE1 SEP C2r .req vAgo +eor sAke_, sAgi, sE2 SEP +eor sAgi_, sAka, sE0 SEP C3r .req vAgu +eor sAga_, sAbo, sE3 SEP +eor sAbo_, sAmo, sE3 SEP +eor sAmo_, sAmi, sE2 SEP C4r .req v31 +eor sAmi_, sAke, sE1 SEP +eor sAge_, sAgu, sE4 SEP eor3_m0 C0, C0, vAma, vAsa +eor sAgu_, sAsi, sE2 SEP +eor sAsi_, sAku, sE4 SEP +eor sAku_, sAsa, sE0 SEP eor3_m0 C1, C1, vAme, vAse +eor sAma_, sAbu, sE4 SEP +eor sAbu_, sAsu, sE4 SEP eor3_m0 C2, C2, vAmi, vAsi +eor sAsu_, sAse, sE1 SEP +eor sAme_, sAga, sE0 SEP eor3_m0 C3, C3, vAmo, vAso +eor sAbe_, sAge, sE1 SEP +load_constant_ptr SEP +tmp0 .req x0 SEP eor3_m0 C4, C4, vAmu, vAsu +tmp1 .req x29 SEP +bic tmp0, sAgi_, sAge_, ROR #47 SEP xar_m0 C2r, vzr, C2, 63 +bic tmp1, sAgo_, sAgi_, ROR #42 SEP +eor sAga, tmp0, sAga_, ROR #39 SEP +bic tmp0, sAgu_, sAgo_, ROR #16 SEP xar_m0 C4r, vzr, C4, 63 +eor sAge, tmp1, sAge_, ROR #25 SEP +bic tmp1, sAga_, sAgu_, ROR #31 SEP xar_m0 C1r, vzr, C1, 63 +eor sAgi, tmp0, sAgi_, ROR #58 SEP +bic tmp0, sAge_, sAga_, ROR #56 SEP xar_m0 C3r, vzr, C3, 63 +eor sAgo, tmp1, sAgo_, ROR #47 SEP +bic tmp1, sAki_, sAke_, ROR #19 SEP +eor sAgu, tmp0, sAgu_, ROR #23 SEP xar_m0 C0r, vzr, C0, 63 +bic tmp0, sAko_, sAki_, ROR #47 SEP +eor sAka, tmp1, sAka_, ROR #24 SEP eor2 E1, C0, C2r +bic tmp1, sAku_, sAko_, ROR #10 SEP +eor sAke, tmp0, sAke_, ROR #2 SEP +bic tmp0, sAka_, sAku_, ROR #47 SEP restore(vAgo) +eor sAki, tmp1, sAki_, ROR #57 SEP +bic tmp1, sAke_, sAka_, ROR #5 SEP eor2 E3, C2, C4r +eor sAko, tmp0, sAko_, ROR #57 SEP +bic tmp0, sAmi_, sAme_, ROR #38 SEP restore(vAga) +eor sAku, tmp1, sAku_, ROR #52 SEP +bic tmp1, sAmo_, sAmi_, ROR #5 SEP +eor sAma, tmp0, sAma_, ROR 
#47 SEP eor2 E0, C4, C1r +bic tmp0, sAmu_, sAmo_, ROR #41 SEP +eor sAme, tmp1, sAme_, ROR #43 SEP restore(vAgi) +bic tmp1, sAma_, sAmu_, ROR #35 SEP +eor sAmi, tmp0, sAmi_, ROR #46 SEP +bic tmp0, sAme_, sAma_, ROR #9 SEP eor2 E2, C1, C3r +ldr cur_const, [const_addr] SEP +eor sAmo, tmp1, sAmo_, ROR #12 SEP restore(vAgu) +bic tmp1, sAsi_, sAse_, ROR #48 SEP +eor sAmu, tmp0, sAmu_, ROR #44 SEP eor2 E4, C3, C0r +bic tmp0, sAso_, sAsi_, ROR #2 SEP +eor sAsa, tmp1, sAsa_, ROR #41 SEP +bic tmp1, sAsu_, sAso_, ROR #25 SEP restore(vAge) +eor sAse, tmp0, sAse_, ROR #50 SEP +bic tmp0, sAsa_, sAsu_, ROR #60 SEP eor vAba_.16b, vAba.16b, E0.16b +eor sAsi, tmp1, sAsi_, ROR #27 SEP +bic tmp1, sAse_, sAsa_, ROR #57 SEP +eor sAso, tmp0, sAso_, ROR #21 SEP xar_m0 vAsa_, vAbi, E2, 2 +mov count, #1 SEP +bic tmp0, sAbi_, sAbe_, ROR #63 SEP xar_m0 vAbi_, vAki, E2, 21 +eor sAsu, tmp1, sAsu_, ROR #53 SEP +bic tmp1, sAbo_, sAbi_, ROR #42 SEP xar_m0 vAki_, vAko, E3, 39 +eor s_Aba, s_Aba_, tmp0, ROR #21 SEP +bic tmp0, sAbu_, sAbo_, ROR #57 SEP +eor sAbe, tmp1, sAbe_, ROR #41 SEP xar_m0 vAko_, vAmu, E4, 56 +bic tmp1, s_Aba_, sAbu_, ROR #50 SEP +eor sAbi, tmp0, sAbi_, ROR #35 SEP xar_m0 vAmu_, vAso, E3, 8 +bic tmp0, sAbe_, s_Aba_, ROR #44 SEP +eor sAbo, tmp1, sAbo_, ROR #43 SEP +eor sAbu, tmp0, sAbu_, ROR #30 SEP xar_m0 vAso_, vAma, E0, 23 +eor s_Aba, s_Aba, cur_const SEP +save count, STACK_OFFSET_COUNT SEP xar_m0 vAka_, vAbe, E1, 63 +eor sC2, sAsi, sAbi, ROR #52 SEP +eor sC0, s_Aba, sAga, ROR #61 SEP xar_m0 vAse_, vAgo, E3, 9 +eor sC4, sAku, sAgu, ROR #50 SEP +eor sC1, sAke, sAme, ROR #57 SEP +eor sC3, sAbo, sAko, ROR #63 SEP xar_m0 vAgo_, vAme, E1, 19 +eor sC2, sC2, sAki, ROR #48 SEP +eor sC0, sC0, sAma, ROR #54 SEP xar_m0 vAke_, vAgi, E2, 58 +eor sC4, sC4, sAmu, ROR #34 SEP +eor sC1, sC1, sAbe, ROR #51 SEP xar_m0 vAgi_, vAka, E0, 61 +eor sC3, sC3, sAmo, ROR #37 SEP +eor sC2, sC2, sAmi, ROR #10 SEP +eor sC0, sC0, sAka, ROR #39 SEP xar_m0 vAga_, vAbo, E3, 36 +eor sC4, sC4, sAbu, ROR #26 SEP 
+eor sC1, sC1, sAse, ROR #31 SEP xar_m0 vAbo_, vAmo, E3, 43 +eor sC3, sC3, sAgo, ROR #36 SEP +eor sC2, sC2, sAgi, ROR #5 SEP +eor sC0, sC0, sAsa, ROR #25 SEP xar_m0 vAmo_, vAmi, E2, 49 +eor sC4, sC4, sAsu, ROR #15 SEP +eor sC1, sC1, sAge, ROR #27 SEP xar_m0 vAmi_, vAke, E1, 54 +eor sC3, sC3, sAso, ROR #2 SEP +eor sE1, sC0, sC2, ROR #61 SEP xar_m0 vAge_, vAgu, E4, 44 +ror sC2, sC2, 62 SEP +eor sE3, sC2, sC4, ROR #57 SEP +ror sC4, sC4, 58 SEP xar_m0 vAgu_, vAsi, E2, 3 +eor sE0, sC4, sC1, ROR #55 SEP +ror sC1, sC1, 56 SEP xar_m0 vAsi_, vAku, E4, 25 +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP xar_m0 vAku_, vAsa, E0, 46 +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP xar_m0 vAma_, vAbu, E4, 37 +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP xar_m0 vAbu_, vAsu, E4, 50 +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP xar_m0 vAsu_, vAse, E1, 62 +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP xar_m0 vAme_, vAga, E0, 28 +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP xar_m0 vAbe_, vAge, E1, 20 +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP restore x27, STACK_OFFSET_CONST +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP ldr q31, [x27], #16 +eor sAgu_, sE2, sAsi, ROR #62 SEP +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP save x27, STACK_OFFSET_CONST +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP bcax_m0 vAga, vAga_, vAgi_, vAge_ +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP bcax_m0 vAge, vAge_, vAgo_, vAgi_ +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP bcax_m0 vAgi, vAgi_, vAgu_, vAgo_ +tmp0 .req x0 SEP +tmp1 .req x29 SEP bcax_m0 vAgo, vAgo_, vAga_, vAgu_ +bic tmp0, sAgi_, sAge_, ROR #47 SEP +bic tmp1, 
sAgo_, sAgi_, ROR #42 SEP +eor sAga, tmp0, sAga_, ROR #39 SEP bcax_m0 vAgu, vAgu_, vAge_, vAga_ +bic tmp0, sAgu_, sAgo_, ROR #16 SEP +eor sAge, tmp1, sAge_, ROR #25 SEP bcax_m0 vAka, vAka_, vAki_, vAke_ +bic tmp1, sAga_, sAgu_, ROR #31 SEP +eor sAgi, tmp0, sAgi_, ROR #58 SEP +bic tmp0, sAge_, sAga_, ROR #56 SEP bcax_m0 vAke, vAke_, vAko_, vAki_ +eor sAgo, tmp1, sAgo_, ROR #47 SEP +bic tmp1, sAki_, sAke_, ROR #19 SEP bcax_m0 vAki, vAki_, vAku_, vAko_ +eor sAgu, tmp0, sAgu_, ROR #23 SEP +bic tmp0, sAko_, sAki_, ROR #47 SEP bcax_m0 vAko, vAko_, vAka_, vAku_ +eor sAka, tmp1, sAka_, ROR #24 SEP +bic tmp1, sAku_, sAko_, ROR #10 SEP +eor sAke, tmp0, sAke_, ROR #2 SEP bcax_m0 vAku, vAku_, vAke_, vAka_ +bic tmp0, sAka_, sAku_, ROR #47 SEP +eor sAki, tmp1, sAki_, ROR #57 SEP bcax_m0 vAma, vAma_, vAmi_, vAme_ +bic tmp1, sAke_, sAka_, ROR #5 SEP +eor sAko, tmp0, sAko_, ROR #57 SEP +bic tmp0, sAmi_, sAme_, ROR #38 SEP bcax_m0 vAme, vAme_, vAmo_, vAmi_ +eor sAku, tmp1, sAku_, ROR #52 SEP +bic tmp1, sAmo_, sAmi_, ROR #5 SEP bcax_m0 vAmi, vAmi_, vAmu_, vAmo_ +eor sAma, tmp0, sAma_, ROR #47 SEP +bic tmp0, sAmu_, sAmo_, ROR #41 SEP bcax_m0 vAmo, vAmo_, vAma_, vAmu_ +eor sAme, tmp1, sAme_, ROR #43 SEP +bic tmp1, sAma_, sAmu_, ROR #35 SEP +eor sAmi, tmp0, sAmi_, ROR #46 SEP bcax_m0 vAmu, vAmu_, vAme_, vAma_ +bic tmp0, sAme_, sAma_, ROR #9 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP bcax_m0 vAsa, vAsa_, vAsi_, vAse_ +eor sAmo, tmp1, sAmo_, ROR #12 SEP +bic tmp1, sAsi_, sAse_, ROR #48 SEP +eor sAmu, tmp0, sAmu_, ROR #44 SEP bcax_m0 vAse, vAse_, vAso_, vAsi_ +bic tmp0, sAso_, sAsi_, ROR #2 SEP +eor sAsa, tmp1, sAsa_, ROR #41 SEP bcax_m0 vAsi, vAsi_, vAsu_, vAso_ +bic tmp1, sAsu_, sAso_, ROR #25 SEP +eor sAse, tmp0, sAse_, ROR #50 SEP bcax_m0 vAso, vAso_, vAsa_, vAsu_ +bic tmp0, sAsa_, sAsu_, ROR #60 SEP +eor sAsi, tmp1, sAsi_, ROR #27 SEP +bic tmp1, sAse_, sAsa_, ROR #57 SEP bcax_m0 vAsu, vAsu_, vAse_, vAsa_ +eor sAso, tmp0, sAso_, ROR #21 SEP +bic tmp0, sAbi_, sAbe_, ROR #63 
SEP bcax_m0 vAba, vAba_, vAbi_, vAbe_ +add count, count, #1 SEP +save count, STACK_OFFSET_COUNT SEP +eor sAsu, tmp1, sAsu_, ROR #53 SEP bcax_m0 vAbe, vAbe_, vAbo_, vAbi_ +bic tmp1, sAbo_, sAbi_, ROR #42 SEP +eor s_Aba, s_Aba_, tmp0, ROR #21 SEP bcax_m0 vAbi, vAbi_, vAbu_, vAbo_ +bic tmp0, sAbu_, sAbo_, ROR #57 SEP +eor sAbe, tmp1, sAbe_, ROR #41 SEP bcax_m0 vAbo, vAbo_, vAba_, vAbu_ +bic tmp1, s_Aba_, sAbu_, ROR #50 SEP +eor sAbi, tmp0, sAbi_, ROR #35 SEP +bic tmp0, sAbe_, s_Aba_, ROR #44 SEP bcax_m0 vAbu, vAbu_, vAbe_, vAba_ +eor sAbo, tmp1, sAbo_, ROR #43 SEP +eor sAbu, tmp0, sAbu_, ROR #30 SEP eor vAba.16b, vAba.16b, v31.16b +eor s_Aba, s_Aba, cur_const SEP +.endm + + +.macro hybrid_round_noninitial +eor sC2, sAsi, sAbi, ROR #52 SEP +eor sC0, s_Aba, sAga, ROR #61 SEP eor3_m0 C0, vAba, vAga, vAka +eor sC4, sAku, sAgu, ROR #50 SEP +eor sC1, sAke, sAme, ROR #57 SEP eor3_m0 C1, vAbe, vAge, vAke +eor sC3, sAbo, sAko, ROR #63 SEP +eor sC2, sC2, sAki, ROR #48 SEP eor3_m0 C2, vAbi, vAgi, vAki +eor sC0, sC0, sAma, ROR #54 SEP +eor sC4, sC4, sAmu, ROR #34 SEP +eor sC1, sC1, sAbe, ROR #51 SEP eor3_m0 C3, vAbo, vAgo, vAko +eor sC3, sC3, sAmo, ROR #37 SEP +eor sC2, sC2, sAmi, ROR #10 SEP eor3_m0 C4, vAbu, vAgu, vAku +eor sC0, sC0, sAka, ROR #39 SEP +eor sC4, sC4, sAbu, ROR #26 SEP +eor sC1, sC1, sAse, ROR #31 SEP save(vAga) +eor sC3, sC3, sAgo, ROR #36 SEP +eor sC2, sC2, sAgi, ROR #5 SEP vzr .req vAga +eor sC0, sC0, sAsa, ROR #25 SEP +eor sC4, sC4, sAsu, ROR #15 SEP +eor sC1, sC1, sAge, ROR #27 SEP eor vzr.16b, vzr.16b, vzr.16b +eor sC3, sC3, sAso, ROR #2 SEP +eor sE1, sC0, sC2, ROR #61 SEP save(vAge) +ror sC2, sC2, 62 SEP +eor sE3, sC2, sC4, ROR #57 SEP save(vAgi) +ror sC4, sC4, 58 SEP +eor sE0, sC4, sC1, ROR #55 SEP +ror sC1, sC1, 56 SEP save(vAgo) +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP save(vAgu) +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP C0r .req vAge +eor sAki_, sE3, sAko, ROR #63 SEP +eor 
sAko_, sE4, sAmu, ROR #28 SEP C1r .req vAgi +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP C2r .req vAgo +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP C3r .req vAgu +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP C4r .req v31 +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP eor3_m0 C0, C0, vAma, vAsa +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP eor3_m0 C1, C1, vAme, vAse +eor sAgu_, sE2, sAsi, ROR #62 SEP +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP eor3_m0 C2, C2, vAmi, vAsi +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP eor3_m0 C3, C3, vAmo, vAso +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP eor3_m0 C4, C4, vAmu, vAsu +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP xar_m0 C2r, vzr, C2, 63 +tmp0 .req x0 SEP +tmp1 .req x29 SEP +bic tmp0, sAgi_, sAge_, ROR #47 SEP xar_m0 C4r, vzr, C4, 63 +bic tmp1, sAgo_, sAgi_, ROR #42 SEP +eor sAga, tmp0, sAga_, ROR #39 SEP xar_m0 C1r, vzr, C1, 63 +bic tmp0, sAgu_, sAgo_, ROR #16 SEP +eor sAge, tmp1, sAge_, ROR #25 SEP xar_m0 C3r, vzr, C3, 63 +bic tmp1, sAga_, sAgu_, ROR #31 SEP +eor sAgi, tmp0, sAgi_, ROR #58 SEP +bic tmp0, sAge_, sAga_, ROR #56 SEP xar_m0 C0r, vzr, C0, 63 +eor sAgo, tmp1, sAgo_, ROR #47 SEP +bic tmp1, sAki_, sAke_, ROR #19 SEP eor2 E1, C0, C2r +eor sAgu, tmp0, sAgu_, ROR #23 SEP +bic tmp0, sAko_, sAki_, ROR #47 SEP +eor sAka, tmp1, sAka_, ROR #24 SEP restore(vAgo) +bic tmp1, sAku_, sAko_, ROR #10 SEP +eor sAke, tmp0, sAke_, ROR #2 SEP eor2 E3, C2, C4r +bic tmp0, sAka_, sAku_, ROR #47 SEP +eor sAki, tmp1, sAki_, ROR #57 SEP +bic tmp1, sAke_, sAka_, ROR #5 SEP restore(vAga) +eor sAko, tmp0, sAko_, ROR #57 SEP +bic tmp0, sAmi_, sAme_, ROR #38 SEP eor2 E0, C4, C1r +eor sAku, tmp1, sAku_, ROR #52 SEP +bic tmp1, sAmo_, sAmi_, ROR 
#5 SEP +eor sAma, tmp0, sAma_, ROR #47 SEP restore(vAgi) +bic tmp0, sAmu_, sAmo_, ROR #41 SEP +eor sAme, tmp1, sAme_, ROR #43 SEP eor2 E2, C1, C3r +bic tmp1, sAma_, sAmu_, ROR #35 SEP +eor sAmi, tmp0, sAmi_, ROR #46 SEP restore(vAgu) +bic tmp0, sAme_, sAma_, ROR #9 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP +eor sAmo, tmp1, sAmo_, ROR #12 SEP eor2 E4, C3, C0r +bic tmp1, sAsi_, sAse_, ROR #48 SEP +eor sAmu, tmp0, sAmu_, ROR #44 SEP restore(vAge) +bic tmp0, sAso_, sAsi_, ROR #2 SEP +eor sAsa, tmp1, sAsa_, ROR #41 SEP +bic tmp1, sAsu_, sAso_, ROR #25 SEP eor vAba_.16b, vAba.16b, E0.16b +eor sAse, tmp0, sAse_, ROR #50 SEP +bic tmp0, sAsa_, sAsu_, ROR #60 SEP xar_m0 vAsa_, vAbi, E2, 2 +eor sAsi, tmp1, sAsi_, ROR #27 SEP +bic tmp1, sAse_, sAsa_, ROR #57 SEP +eor sAso, tmp0, sAso_, ROR #21 SEP xar_m0 vAbi_, vAki, E2, 21 +bic tmp0, sAbi_, sAbe_, ROR #63 SEP +add count, count, #1 SEP xar_m0 vAki_, vAko, E3, 39 +save count, STACK_OFFSET_COUNT SEP +eor sAsu, tmp1, sAsu_, ROR #53 SEP +bic tmp1, sAbo_, sAbi_, ROR #42 SEP xar_m0 vAko_, vAmu, E4, 56 +eor s_Aba, s_Aba_, tmp0, ROR #21 SEP +bic tmp0, sAbu_, sAbo_, ROR #57 SEP xar_m0 vAmu_, vAso, E3, 8 +eor sAbe, tmp1, sAbe_, ROR #41 SEP +bic tmp1, s_Aba_, sAbu_, ROR #50 SEP xar_m0 vAso_, vAma, E0, 23 +eor sAbi, tmp0, sAbi_, ROR #35 SEP +bic tmp0, sAbe_, s_Aba_, ROR #44 SEP +eor sAbo, tmp1, sAbo_, ROR #43 SEP xar_m0 vAka_, vAbe, E1, 63 +eor sAbu, tmp0, sAbu_, ROR #30 SEP +eor s_Aba, s_Aba, cur_const SEP xar_m0 vAse_, vAgo, E3, 9 +eor sC2, sAsi, sAbi, ROR #52 SEP +eor sC0, s_Aba, sAga, ROR #61 SEP +eor sC4, sAku, sAgu, ROR #50 SEP xar_m0 vAgo_, vAme, E1, 19 +eor sC1, sAke, sAme, ROR #57 SEP +eor sC3, sAbo, sAko, ROR #63 SEP xar_m0 vAke_, vAgi, E2, 58 +eor sC2, sC2, sAki, ROR #48 SEP +eor sC0, sC0, sAma, ROR #54 SEP +eor sC4, sC4, sAmu, ROR #34 SEP xar_m0 vAgi_, vAka, E0, 61 +eor sC1, sC1, sAbe, ROR #51 SEP +eor sC3, sC3, sAmo, ROR #37 SEP xar_m0 vAga_, vAbo, E3, 36 +eor sC2, sC2, sAmi, ROR #10 SEP +eor sC0, sC0, sAka, ROR 
#39 SEP xar_m0 vAbo_, vAmo, E3, 43 +eor sC4, sC4, sAbu, ROR #26 SEP +eor sC1, sC1, sAse, ROR #31 SEP +eor sC3, sC3, sAgo, ROR #36 SEP xar_m0 vAmo_, vAmi, E2, 49 +eor sC2, sC2, sAgi, ROR #5 SEP +eor sC0, sC0, sAsa, ROR #25 SEP xar_m0 vAmi_, vAke, E1, 54 +eor sC4, sC4, sAsu, ROR #15 SEP +eor sC1, sC1, sAge, ROR #27 SEP +eor sC3, sC3, sAso, ROR #2 SEP xar_m0 vAge_, vAgu, E4, 44 +eor sE1, sC0, sC2, ROR #61 SEP +ror sC2, sC2, 62 SEP xar_m0 vAgu_, vAsi, E2, 3 +eor sE3, sC2, sC4, ROR #57 SEP +ror sC4, sC4, 58 SEP +eor sE0, sC4, sC1, ROR #55 SEP xar_m0 vAsi_, vAku, E4, 25 +ror sC1, sC1, 56 SEP +eor sE2, sC1, sC3, ROR #63 SEP xar_m0 vAku_, vAsa, E0, 46 +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP xar_m0 vAma_, vAbu, E4, 37 +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP xar_m0 vAbu_, vAsu, E4, 50 +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP xar_m0 vAsu_, vAse, E1, 62 +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP xar_m0 vAme_, vAga, E0, 28 +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP xar_m0 vAbe_, vAge, E1, 20 +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP +eor sAbo_, sE3, sAmo, ROR #37 SEP restore x27, STACK_OFFSET_CONST +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP ldr q31, [x27], #16 +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP +eor sAsi_, sE4, sAku, ROR #58 SEP save x27, STACK_OFFSET_CONST +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP bcax_m0 vAga, vAga_, vAgi_, vAge_ +eor sAbu_, sE4, sAsu, ROR #9 SEP +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP bcax_m0 vAge, vAge_, vAgo_, vAgi_ +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP bcax_m0 vAgi, vAgi_, vAgu_, vAgo_ +restore count, STACK_OFFSET_COUNT SEP +tmp0 .req x0 SEP bcax_m0 vAgo, vAgo_, vAga_, vAgu_ +tmp1 .req x29 SEP +bic 
tmp0, sAgi_, sAge_, ROR #47 SEP +bic tmp1, sAgo_, sAgi_, ROR #42 SEP bcax_m0 vAgu, vAgu_, vAge_, vAga_ +eor sAga, tmp0, sAga_, ROR #39 SEP +bic tmp0, sAgu_, sAgo_, ROR #16 SEP bcax_m0 vAka, vAka_, vAki_, vAke_ +eor sAge, tmp1, sAge_, ROR #25 SEP +bic tmp1, sAga_, sAgu_, ROR #31 SEP +eor sAgi, tmp0, sAgi_, ROR #58 SEP bcax_m0 vAke, vAke_, vAko_, vAki_ +bic tmp0, sAge_, sAga_, ROR #56 SEP +eor sAgo, tmp1, sAgo_, ROR #47 SEP bcax_m0 vAki, vAki_, vAku_, vAko_ +bic tmp1, sAki_, sAke_, ROR #19 SEP +eor sAgu, tmp0, sAgu_, ROR #23 SEP +bic tmp0, sAko_, sAki_, ROR #47 SEP bcax_m0 vAko, vAko_, vAka_, vAku_ +eor sAka, tmp1, sAka_, ROR #24 SEP +bic tmp1, sAku_, sAko_, ROR #10 SEP bcax_m0 vAku, vAku_, vAke_, vAka_ +eor sAke, tmp0, sAke_, ROR #2 SEP +bic tmp0, sAka_, sAku_, ROR #47 SEP bcax_m0 vAma, vAma_, vAmi_, vAme_ +eor sAki, tmp1, sAki_, ROR #57 SEP +bic tmp1, sAke_, sAka_, ROR #5 SEP +eor sAko, tmp0, sAko_, ROR #57 SEP bcax_m0 vAme, vAme_, vAmo_, vAmi_ +bic tmp0, sAmi_, sAme_, ROR #38 SEP +eor sAku, tmp1, sAku_, ROR #52 SEP bcax_m0 vAmi, vAmi_, vAmu_, vAmo_ +bic tmp1, sAmo_, sAmi_, ROR #5 SEP +eor sAma, tmp0, sAma_, ROR #47 SEP +bic tmp0, sAmu_, sAmo_, ROR #41 SEP bcax_m0 vAmo, vAmo_, vAma_, vAmu_ +eor sAme, tmp1, sAme_, ROR #43 SEP +bic tmp1, sAma_, sAmu_, ROR #35 SEP bcax_m0 vAmu, vAmu_, vAme_, vAma_ +eor sAmi, tmp0, sAmi_, ROR #46 SEP +bic tmp0, sAme_, sAma_, ROR #9 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP bcax_m0 vAsa, vAsa_, vAsi_, vAse_ +eor sAmo, tmp1, sAmo_, ROR #12 SEP +bic tmp1, sAsi_, sAse_, ROR #48 SEP bcax_m0 vAse, vAse_, vAso_, vAsi_ +eor sAmu, tmp0, sAmu_, ROR #44 SEP +bic tmp0, sAso_, sAsi_, ROR #2 SEP +eor sAsa, tmp1, sAsa_, ROR #41 SEP bcax_m0 vAsi, vAsi_, vAsu_, vAso_ +bic tmp1, sAsu_, sAso_, ROR #25 SEP +eor sAse, tmp0, sAse_, ROR #50 SEP bcax_m0 vAso, vAso_, vAsa_, vAsu_ +bic tmp0, sAsa_, sAsu_, ROR #60 SEP +eor sAsi, tmp1, sAsi_, ROR #27 SEP bcax_m0 vAsu, vAsu_, vAse_, vAsa_ +bic tmp1, sAse_, sAsa_, ROR #57 SEP +eor sAso, tmp0, sAso_, ROR 
#21 SEP +bic tmp0, sAbi_, sAbe_, ROR #63 SEP bcax_m0 vAba, vAba_, vAbi_, vAbe_ +add count, count, #1 SEP +save count, STACK_OFFSET_COUNT SEP bcax_m0 vAbe, vAbe_, vAbo_, vAbi_ +eor sAsu, tmp1, sAsu_, ROR #53 SEP +bic tmp1, sAbo_, sAbi_, ROR #42 SEP +eor s_Aba, s_Aba_, tmp0, ROR #21 SEP bcax_m0 vAbi, vAbi_, vAbu_, vAbo_ +bic tmp0, sAbu_, sAbo_, ROR #57 SEP +eor sAbe, tmp1, sAbe_, ROR #41 SEP bcax_m0 vAbo, vAbo_, vAba_, vAbu_ +bic tmp1, s_Aba_, sAbu_, ROR #50 SEP +eor sAbi, tmp0, sAbi_, ROR #35 SEP +bic tmp0, sAbe_, s_Aba_, ROR #44 SEP bcax_m0 vAbu, vAbu_, vAbe_, vAba_ +eor sAbo, tmp1, sAbo_, ROR #43 SEP +eor sAbu, tmp0, sAbu_, ROR #30 SEP eor vAba.16b, vAba.16b, v31.16b +eor s_Aba, s_Aba, cur_const SEP +.endm + + +.macro hybrid_round_final +eor sC2, sAsi, sAbi, ROR #52 SEP +eor sC0, s_Aba, sAga, ROR #61 SEP eor3_m0 C0, vAba, vAga, vAka +eor sC4, sAku, sAgu, ROR #50 SEP +eor sC1, sAke, sAme, ROR #57 SEP eor3_m0 C1, vAbe, vAge, vAke +eor sC3, sAbo, sAko, ROR #63 SEP +eor sC2, sC2, sAki, ROR #48 SEP +eor sC0, sC0, sAma, ROR #54 SEP eor3_m0 C2, vAbi, vAgi, vAki +eor sC4, sC4, sAmu, ROR #34 SEP +eor sC1, sC1, sAbe, ROR #51 SEP +eor sC3, sC3, sAmo, ROR #37 SEP eor3_m0 C3, vAbo, vAgo, vAko +eor sC2, sC2, sAmi, ROR #10 SEP +eor sC0, sC0, sAka, ROR #39 SEP eor3_m0 C4, vAbu, vAgu, vAku +eor sC4, sC4, sAbu, ROR #26 SEP +eor sC1, sC1, sAse, ROR #31 SEP +eor sC3, sC3, sAgo, ROR #36 SEP save(vAga) +eor sC2, sC2, sAgi, ROR #5 SEP +eor sC0, sC0, sAsa, ROR #25 SEP +eor sC4, sC4, sAsu, ROR #15 SEP vzr .req vAga +eor sC1, sC1, sAge, ROR #27 SEP +eor sC3, sC3, sAso, ROR #2 SEP eor vzr.16b, vzr.16b, vzr.16b +eor sE1, sC0, sC2, ROR #61 SEP +ror sC2, sC2, 62 SEP +eor sE3, sC2, sC4, ROR #57 SEP save(vAge) +ror sC4, sC4, 58 SEP +eor sE0, sC4, sC1, ROR #55 SEP +ror sC1, sC1, 56 SEP save(vAgi) +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP save(vAgo) +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP save(vAgu) +eor sAki_, sE3, 
sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP C0r .req vAge +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP C1r .req vAgi +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP C2r .req vAgo +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP +eor sAbo_, sE3, sAmo, ROR #37 SEP C3r .req vAgu +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP C4r .req v31 +eor sAgu_, sE2, sAsi, ROR #62 SEP +eor sAsi_, sE4, sAku, ROR #58 SEP eor3_m0 C0, C0, vAma, vAsa +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP eor3_m0 C1, C1, vAme, vAse +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP eor3_m0 C2, C2, vAmi, vAsi +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP eor3_m0 C3, C3, vAmo, vAso +tmp0 .req x0 SEP +tmp1 .req x29 SEP +bic tmp0, sAgi_, sAge_, ROR #47 SEP eor3_m0 C4, C4, vAmu, vAsu +bic tmp1, sAgo_, sAgi_, ROR #42 SEP +eor sAga, tmp0, sAga_, ROR #39 SEP +bic tmp0, sAgu_, sAgo_, ROR #16 SEP xar_m0 C2r, vzr, C2, 63 +eor sAge, tmp1, sAge_, ROR #25 SEP +bic tmp1, sAga_, sAgu_, ROR #31 SEP xar_m0 C4r, vzr, C4, 63 +eor sAgi, tmp0, sAgi_, ROR #58 SEP +bic tmp0, sAge_, sAga_, ROR #56 SEP +eor sAgo, tmp1, sAgo_, ROR #47 SEP xar_m0 C1r, vzr, C1, 63 +bic tmp1, sAki_, sAke_, ROR #19 SEP +eor sAgu, tmp0, sAgu_, ROR #23 SEP +bic tmp0, sAko_, sAki_, ROR #47 SEP xar_m0 C3r, vzr, C3, 63 +eor sAka, tmp1, sAka_, ROR #24 SEP +bic tmp1, sAku_, sAko_, ROR #10 SEP +eor sAke, tmp0, sAke_, ROR #2 SEP xar_m0 C0r, vzr, C0, 63 +bic tmp0, sAka_, sAku_, ROR #47 SEP +eor sAki, tmp1, sAki_, ROR #57 SEP eor2 E1, C0, C2r +bic tmp1, sAke_, sAka_, ROR #5 SEP +eor sAko, tmp0, sAko_, ROR #57 SEP +bic tmp0, sAmi_, sAme_, ROR #38 SEP restore(vAgo) +eor sAku, tmp1, sAku_, ROR #52 SEP +bic tmp1, sAmo_, sAmi_, ROR #5 SEP +eor 
sAma, tmp0, sAma_, ROR #47 SEP eor2 E3, C2, C4r +bic tmp0, sAmu_, sAmo_, ROR #41 SEP +eor sAme, tmp1, sAme_, ROR #43 SEP restore(vAga) +bic tmp1, sAma_, sAmu_, ROR #35 SEP +eor sAmi, tmp0, sAmi_, ROR #46 SEP +bic tmp0, sAme_, sAma_, ROR #9 SEP eor2 E0, C4, C1r +ldr cur_const, [const_addr, count, UXTW #3] SEP +eor sAmo, tmp1, sAmo_, ROR #12 SEP +bic tmp1, sAsi_, sAse_, ROR #48 SEP restore(vAgi) +eor sAmu, tmp0, sAmu_, ROR #44 SEP +bic tmp0, sAso_, sAsi_, ROR #2 SEP eor2 E2, C1, C3r +eor sAsa, tmp1, sAsa_, ROR #41 SEP +bic tmp1, sAsu_, sAso_, ROR #25 SEP +eor sAse, tmp0, sAse_, ROR #50 SEP restore(vAgu) +bic tmp0, sAsa_, sAsu_, ROR #60 SEP +eor sAsi, tmp1, sAsi_, ROR #27 SEP +bic tmp1, sAse_, sAsa_, ROR #57 SEP eor2 E4, C3, C0r +eor sAso, tmp0, sAso_, ROR #21 SEP +bic tmp0, sAbi_, sAbe_, ROR #63 SEP restore(vAge) +add count, count, #1 SEP +save count, STACK_OFFSET_COUNT SEP +eor sAsu, tmp1, sAsu_, ROR #53 SEP eor vAba_.16b, vAba.16b, E0.16b +bic tmp1, sAbo_, sAbi_, ROR #42 SEP +eor s_Aba, s_Aba_, tmp0, ROR #21 SEP +bic tmp0, sAbu_, sAbo_, ROR #57 SEP xar_m0 vAsa_, vAbi, E2, 2 +eor sAbe, tmp1, sAbe_, ROR #41 SEP +bic tmp1, s_Aba_, sAbu_, ROR #50 SEP +eor sAbi, tmp0, sAbi_, ROR #35 SEP xar_m0 vAbi_, vAki, E2, 21 +bic tmp0, sAbe_, s_Aba_, ROR #44 SEP +eor sAbo, tmp1, sAbo_, ROR #43 SEP xar_m0 vAki_, vAko, E3, 39 +eor sAbu, tmp0, sAbu_, ROR #30 SEP +eor s_Aba, s_Aba, cur_const SEP +eor sC2, sAsi, sAbi, ROR #52 SEP xar_m0 vAko_, vAmu, E4, 56 +eor sC0, s_Aba, sAga, ROR #61 SEP +eor sC4, sAku, sAgu, ROR #50 SEP +eor sC1, sAke, sAme, ROR #57 SEP xar_m0 vAmu_, vAso, E3, 8 +eor sC3, sAbo, sAko, ROR #63 SEP +eor sC2, sC2, sAki, ROR #48 SEP xar_m0 vAso_, vAma, E0, 23 +eor sC0, sC0, sAma, ROR #54 SEP +eor sC4, sC4, sAmu, ROR #34 SEP +eor sC1, sC1, sAbe, ROR #51 SEP xar_m0 vAka_, vAbe, E1, 63 +eor sC3, sC3, sAmo, ROR #37 SEP +eor sC2, sC2, sAmi, ROR #10 SEP +eor sC0, sC0, sAka, ROR #39 SEP xar_m0 vAse_, vAgo, E3, 9 +eor sC4, sC4, sAbu, ROR #26 SEP +eor sC1, sC1, sAse, ROR #31 SEP 
xar_m0 vAgo_, vAme, E1, 19 +eor sC3, sC3, sAgo, ROR #36 SEP +eor sC2, sC2, sAgi, ROR #5 SEP +eor sC0, sC0, sAsa, ROR #25 SEP xar_m0 vAke_, vAgi, E2, 58 +eor sC4, sC4, sAsu, ROR #15 SEP +eor sC1, sC1, sAge, ROR #27 SEP +eor sC3, sC3, sAso, ROR #2 SEP xar_m0 vAgi_, vAka, E0, 61 +eor sE1, sC0, sC2, ROR #61 SEP +ror sC2, sC2, 62 SEP xar_m0 vAga_, vAbo, E3, 36 +eor sE3, sC2, sC4, ROR #57 SEP +ror sC4, sC4, 58 SEP +eor sE0, sC4, sC1, ROR #55 SEP xar_m0 vAbo_, vAmo, E3, 43 +ror sC1, sC1, 56 SEP +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP xar_m0 vAmo_, vAmi, E2, 49 +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP xar_m0 vAmi_, vAke, E1, 54 +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP xar_m0 vAge_, vAgu, E4, 44 +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP xar_m0 vAgu_, vAsi, E2, 3 +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP xar_m0 vAsi_, vAku, E4, 25 +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP xar_m0 vAku_, vAsa, E0, 46 +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP xar_m0 vAma_, vAbu, E4, 37 +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP +eor sAsi_, sE4, sAku, ROR #58 SEP xar_m0 vAbu_, vAsu, E4, 50 +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP xar_m0 vAsu_, vAse, E1, 62 +eor sAbu_, sE4, sAsu, ROR #9 SEP +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP xar_m0 vAme_, vAga, E0, 28 +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP xar_m0 vAbe_, vAge, E1, 20 +tmp0 .req x0 SEP +tmp1 .req x29 SEP restore x27, STACK_OFFSET_CONST +bic tmp0, sAgi_, sAge_, ROR #47 SEP +bic tmp1, sAgo_, sAgi_, ROR #42 SEP +eor sAga, tmp0, sAga_, ROR #39 SEP ldr q31, [x27], #16 +bic tmp0, sAgu_, sAgo_, ROR #16 SEP +eor sAge, tmp1, sAge_, ROR #25 
SEP +bic tmp1, sAga_, sAgu_, ROR #31 SEP save x27, STACK_OFFSET_CONST +restore count, STACK_OFFSET_COUNT SEP +eor sAgi, tmp0, sAgi_, ROR #58 SEP +bic tmp0, sAge_, sAga_, ROR #56 SEP +eor sAgo, tmp1, sAgo_, ROR #47 SEP bcax_m0 vAga, vAga_, vAgi_, vAge_ +bic tmp1, sAki_, sAke_, ROR #19 SEP +eor sAgu, tmp0, sAgu_, ROR #23 SEP bcax_m0 vAge, vAge_, vAgo_, vAgi_ +bic tmp0, sAko_, sAki_, ROR #47 SEP +eor sAka, tmp1, sAka_, ROR #24 SEP +bic tmp1, sAku_, sAko_, ROR #10 SEP bcax_m0 vAgi, vAgi_, vAgu_, vAgo_ +eor sAke, tmp0, sAke_, ROR #2 SEP +bic tmp0, sAka_, sAku_, ROR #47 SEP +eor sAki, tmp1, sAki_, ROR #57 SEP bcax_m0 vAgo, vAgo_, vAga_, vAgu_ +bic tmp1, sAke_, sAka_, ROR #5 SEP +eor sAko, tmp0, sAko_, ROR #57 SEP bcax_m0 vAgu, vAgu_, vAge_, vAga_ +bic tmp0, sAmi_, sAme_, ROR #38 SEP +eor sAku, tmp1, sAku_, ROR #52 SEP +bic tmp1, sAmo_, sAmi_, ROR #5 SEP bcax_m0 vAka, vAka_, vAki_, vAke_ +eor sAma, tmp0, sAma_, ROR #47 SEP +bic tmp0, sAmu_, sAmo_, ROR #41 SEP +eor sAme, tmp1, sAme_, ROR #43 SEP bcax_m0 vAke, vAke_, vAko_, vAki_ +bic tmp1, sAma_, sAmu_, ROR #35 SEP +eor sAmi, tmp0, sAmi_, ROR #46 SEP bcax_m0 vAki, vAki_, vAku_, vAko_ +bic tmp0, sAme_, sAma_, ROR #9 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP +eor sAmo, tmp1, sAmo_, ROR #12 SEP bcax_m0 vAko, vAko_, vAka_, vAku_ +bic tmp1, sAsi_, sAse_, ROR #48 SEP +eor sAmu, tmp0, sAmu_, ROR #44 SEP +bic tmp0, sAso_, sAsi_, ROR #2 SEP bcax_m0 vAku, vAku_, vAke_, vAka_ +eor sAsa, tmp1, sAsa_, ROR #41 SEP +bic tmp1, sAsu_, sAso_, ROR #25 SEP bcax_m0 vAma, vAma_, vAmi_, vAme_ +eor sAse, tmp0, sAse_, ROR #50 SEP +bic tmp0, sAsa_, sAsu_, ROR #60 SEP +eor sAsi, tmp1, sAsi_, ROR #27 SEP bcax_m0 vAme, vAme_, vAmo_, vAmi_ +bic tmp1, sAse_, sAsa_, ROR #57 SEP +eor sAso, tmp0, sAso_, ROR #21 SEP +bic tmp0, sAbi_, sAbe_, ROR #63 SEP bcax_m0 vAmi, vAmi_, vAmu_, vAmo_ +add count, count, #1 SEP +save count, STACK_OFFSET_COUNT SEP +eor sAsu, tmp1, sAsu_, ROR #53 SEP bcax_m0 vAmo, vAmo_, vAma_, vAmu_ +bic tmp1, sAbo_, sAbi_, ROR 
#42 SEP +eor s_Aba, s_Aba_, tmp0, ROR #21 SEP bcax_m0 vAmu, vAmu_, vAme_, vAma_ +bic tmp0, sAbu_, sAbo_, ROR #57 SEP +eor sAbe, tmp1, sAbe_, ROR #41 SEP +bic tmp1, s_Aba_, sAbu_, ROR #50 SEP bcax_m0 vAsa, vAsa_, vAsi_, vAse_ +eor sAbi, tmp0, sAbi_, ROR #35 SEP +bic tmp0, sAbe_, s_Aba_, ROR #44 SEP +eor sAbo, tmp1, sAbo_, ROR #43 SEP bcax_m0 vAse, vAse_, vAso_, vAsi_ +eor sAbu, tmp0, sAbu_, ROR #30 SEP +eor s_Aba, s_Aba, cur_const SEP bcax_m0 vAsi, vAsi_, vAsu_, vAso_ +ror sAga, sAga,(64-3) SEP +ror sAbu, sAbu,(64-44) SEP +ror sAka, sAka,(64-25) SEP bcax_m0 vAso, vAso_, vAsa_, vAsu_ +ror sAke, sAke,(64-8) SEP +ror sAma, sAma,(64-10) SEP +ror sAku, sAku,(64-6) SEP bcax_m0 vAsu, vAsu_, vAse_, vAsa_ +ror sAsa, sAsa,(64-39) SEP +ror sAse, sAse,(64-41) SEP bcax_m0 vAba, vAba_, vAbi_, vAbe_ +ror sAbe, sAbe,(64-21) SEP +ror sAge, sAge,(64-45) SEP +ror sAgi, sAgi,(64-61) SEP bcax_m0 vAbe, vAbe_, vAbo_, vAbi_ +ror sAme, sAme,(64-15) SEP +ror sAmi, sAmi,(64-56) SEP +ror sAbi, sAbi,(64-14) SEP bcax_m0 vAbi, vAbi_, vAbu_, vAbo_ +ror sAki, sAki,(64-18) SEP +ror sAko, sAko,(64-1) SEP bcax_m0 vAbo, vAbo_, vAba_, vAbu_ +ror sAsi, sAsi,(64-2) SEP +ror sAso, sAso,(64-62) SEP +ror sAgo, sAgo,(64-28) SEP bcax_m0 vAbu, vAbu_, vAbe_, vAba_ +ror sAgu, sAgu,(64-20) SEP +ror sAmo, sAmo,(64-27) SEP +ror sAmu, sAmu,(64-36) SEP eor vAba.16b, vAba.16b, v31.16b +ror sAsu, sAsu,(64-55) SEP +.endm + + + +#define KECCAK_F1600_ROUNDS 24 + +.global keccak_f1600_x4_hybrid_asm_v7 +.global _keccak_f1600_x4_hybrid_asm_v7 +.text +.align 4 + +keccak_f1600_x4_hybrid_asm_v7: +_keccak_f1600_x4_hybrid_asm_v7: + alloc_stack + save_gprs + save_vregs + save input_addr, STACK_OFFSET_INPUT + + + ASM_LOAD(const_addr,round_constants_vec) + + save const_addr, STACK_OFFSET_CONST + load_input_vector 2,1 + + // First scalar Keccak computation alongside first half of SIMD computation + load_input_scalar 4,0 + hybrid_round_initial + loop_0: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-3) + ble loop_0 + + 
hybrid_round_final + + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4,0 + + // Second scalar Keccak computation alongside second half of SIMD computation + load_input_scalar 4,1 + hybrid_round_initial + loop_1: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-3) + ble loop_1 + + hybrid_round_final + + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4,1 + store_input_vector 2,1 + + restore_vregs + restore_gprs + free_stack + + + ret +#endif \ No newline at end of file diff --git a/tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v8.s b/tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v8.s new file mode 100644 index 0000000..10e3410 --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x4_hybrid_asm_v8.s @@ -0,0 +1,1367 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" +#if defined(__ARM_FEATURE_SHA3) + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +round_constants_vec: + .quad 0x0000000000000001 + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x800000008000000a + .quad 
0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + .quad 0x8000000080008008 +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x29 + count .req w27 + cur_const .req x26 + + /* Mapping of Keccak-f1600 SIMD state to vector registers + * at the beginning and end of each round. */ + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. */ + vAba .req v0 + vAbe .req v1 + vAbi .req v2 + vAbo .req v3 + vAbu .req v4 + vAga .req v5 + vAge .req v6 + vAgi .req v7 + vAgo .req v8 + vAgu .req v9 + vAka .req v10 + vAke .req v11 + vAki .req v12 + vAko .req v13 + vAku .req v14 + vAma .req v15 + vAme .req v16 + vAmi .req v17 + vAmo .req v18 + vAmu .req v19 + vAsa .req v20 + vAse .req v21 + vAsi .req v22 + vAso .req v23 + vAsu .req v24 + + /* q-form of the above mapping */ + vAbaq .req q0 + vAbeq .req q1 + vAbiq .req q2 + vAboq .req q3 + vAbuq .req q4 + vAgaq .req q5 + vAgeq .req q6 + vAgiq .req q7 + vAgoq .req q8 + vAguq .req q9 + vAkaq .req q10 + vAkeq .req q11 + vAkiq .req q12 + vAkoq .req q13 + vAkuq .req q14 + vAmaq .req q15 + vAmeq .req q16 + vAmiq .req q17 + vAmoq .req q18 + vAmuq .req q19 + vAsaq .req q20 + vAseq .req q21 + vAsiq .req q22 + vAsoq .req q23 + vAsuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v27 + C1 .req v28 + C2 .req v29 + C3 .req v30 + C4 .req v31 + + C0q .req q27 + C1q .req q28 + C2q .req q29 + C3q .req q30 + C4q .req q31 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vBba .req v25 // fresh + vBbe .req v26 // fresh + vBbi .req vAbi + vBbo .req vAbo + vBbu .req vAbu + vBga .req vAka + vBge .req vAke + vBgi .req vAgi + vBgo .req vAgo + vBgu .req vAgu + vBka .req vAma + vBke .req vAme + vBki .req vAki + vBko .req vAko + vBku .req vAku + vBma .req vAsa + vBme .req vAse + vBmi .req vAmi +
vBmo .req vAmo + vBmu .req vAmu + vBsa .req vAba + vBse .req vAbe + vBsi .req vAsi + vBso .req vAso + vBsu .req vAsu + + vBbaq .req q25 // fresh + vBbeq .req q26 // fresh + vBbiq .req vAbiq + vBboq .req vAboq + vBbuq .req vAbuq + vBgaq .req vAkaq + vBgeq .req vAkeq + vBgiq .req vAgiq + vBgoq .req vAgoq + vBguq .req vAguq + vBkaq .req vAmaq + vBkeq .req vAmeq + vBkiq .req vAkiq + vBkoq .req vAkoq + vBkuq .req vAkuq + vBmaq .req vAsaq + vBmeq .req vAseq + vBmiq .req vAmiq + vBmoq .req vAmoq + vBmuq .req vAmuq + vBsaq .req vAbaq + vBseq .req vAbeq + vBsiq .req vAsiq + vBsoq .req vAsoq + vBsuq .req vAsuq + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req C4 + E1 .req C0 + E2 .req vBbe // fresh + E3 .req C2 + E4 .req C3 + + E0q .req C4q + E1q .req C0q + E2q .req vBbeq // fresh + E3q .req C2q + E4q .req C3q + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round. */ + s_Aba .req x1 + sAbe .req x6 + sAbi .req x11 + sAbo .req x16 + sAbu .req x21 + sAga .req x2 + sAge .req x7 + sAgi .req x12 + sAgo .req x17 + sAgu .req x22 + sAka .req x3 + sAke .req x8 + sAki .req x13 + sAko .req x18 /* NOTE(review): x18 is the AAPCS64 platform register and is reserved on some OSes; save_gprs only stores x19-x30 - confirm the target ABI leaves x18 free */ + sAku .req x23 + sAma .req x4 + sAme .req x9 + sAmi .req x14 + sAmo .req x19 + sAmu .req x24 + sAsa .req x5 + sAse .req x10 + sAsi .req x15 + sAso .req x20 + sAsu .req x25 + + /* sA_[y,2*x+3*y] = rot(sA[x,y]) */ + s_Aba_ .req x0 + sAbe_ .req x28 + sAbi_ .req x11 + sAbo_ .req x16 + sAbu_ .req x21 + sAga_ .req x3 + sAge_ .req x8 + sAgi_ .req x12 + sAgo_ .req x17 + sAgu_ .req x22 + sAka_ .req x4 + sAke_ .req x9 + sAki_ .req x13 + sAko_ .req x18 + sAku_ .req x23 + sAma_ .req x5 + sAme_ .req x10 + sAmi_ .req x14 + sAmo_ .req x19 + sAmu_ .req x24 + sAsa_ .req x1 + sAse_ .req x6 + sAsi_ .req x15 + sAso_ .req x20 + sAsu_ .req x25 + + /* sC[x] = sA[x,0] xor sA[x,1] xor sA[x,2] xor sA[x,3] xor sA[x,4], for x in 0..4 */ + /* sE[x] = sC[x-1] xor rot(sC[x+1],1), for x in 0..4 */ + sC0 .req x0 + sE0 .req x29 + sC1 .req x26 + sE1 .req x30 + sC2 .req
x27 + sE2 .req x26 + sC3 .req x28 + sE3 .req x27 + sC4 .req x29 + sE4 .req x28 + + tmp .req x30 + +/************************ MACROS ****************************/ + +/* Macros using v8.4-A SHA-3 instructions */ + +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm + xar \d\().2d, \s0\().2d, \s1\().2d, #\imm +.endm + +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro eor3_m1_0 d s0 s1 s2 /* first half of eor3_m1, built from plain EOR: d = s0 ^ s1 (s2 is not used) */ + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor2 d s0 s1 /* d = s0 ^ s1 */ + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m1_1 d s0 s1 s2 /* second half of eor3_m1: d ^= s2 (s0 and s1 are not used) */ + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro eor3_m1 d s0 s1 s2 /* d = s0 ^ s1 ^ s2 as two EORs instead of one EOR3 */ + eor3_m1_0 \d, \s0, \s1, \s2 + eor3_m1_1 \d, \s0, \s1, \s2 +.endm + +.macro rax1_m1 d s0 s1 + // RAX1 without SHA-3 instructions: d = s0 ^ rol(s1, 1), rotating via SHL/SRI into the vvtmp scratch alias + shl vvtmp.2d, \s1\().2d, #1 + sri vvtmp.2d, \s1\().2d, #63 + eor \d\().16b, vvtmp.16b, \s0\().16b +.endm + + .macro xar_m1 d s0 s1 imm /* XAR without SHA-3 instructions: d = ror(s0 ^ s1, imm); overwrites s0 with s0 ^ s1 */ + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) +.endm + + .macro xar_m1_0 d s0 s1 imm + // XOR half of the split XAR (s0 ^= s1); all three branches emit the same EOR + .if \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + .elseif \imm == 62 + eor \s0\().16b, \s0\().16b, \s1\().16b + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + .endif +.endm + + .macro xar_m1_1 d s0 s1 imm + // Rotate half of the split XAR (d = ror(s0, imm)); for imm 63 and 62 the left shift is done with ADDs instead of SHL + .if \imm == 63 + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + .elseif \imm == 62 + add \d\().2d, \s0\().2d, \s0\().2d + add \d\().2d, \d\().2d, \d\().2d + sri \d\().2d, \s0\().2d, #(62) + .else + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + +.macro bcax_m1 d s0 s1 s2 /* BCAX without SHA-3 instructions: d = s0 ^ (s1 & ~s2), using the vvtmp scratch alias */ + bic vvtmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, vvtmp.16b, \s0\().16b +.endm + +.macro load_input_vector num idx /* load the 25 q-form state registers; lane i is read from byte offset 16*(num*i+idx) of input_addr */ + ldr
vAbaq, [input_addr, #(16*(\num*0+\idx))] + ldr vAbeq, [input_addr, #(16*(\num*1+\idx))] + ldr vAbiq, [input_addr, #(16*(\num*2+\idx))] + ldr vAboq, [input_addr, #(16*(\num*3+\idx))] + ldr vAbuq, [input_addr, #(16*(\num*4+\idx))] + ldr vAgaq, [input_addr, #(16*(\num*5+\idx))] + ldr vAgeq, [input_addr, #(16*(\num*6+\idx))] + ldr vAgiq, [input_addr, #(16*(\num*7+\idx))] + ldr vAgoq, [input_addr, #(16*(\num*8+\idx))] + ldr vAguq, [input_addr, #(16*(\num*9+\idx))] + ldr vAkaq, [input_addr, #(16*(\num*10+\idx))] + ldr vAkeq, [input_addr, #(16*(\num*11+\idx))] + ldr vAkiq, [input_addr, #(16*(\num*12+\idx))] + ldr vAkoq, [input_addr, #(16*(\num*13+\idx))] + ldr vAkuq, [input_addr, #(16*(\num*14+\idx))] + ldr vAmaq, [input_addr, #(16*(\num*15+\idx))] + ldr vAmeq, [input_addr, #(16*(\num*16+\idx))] + ldr vAmiq, [input_addr, #(16*(\num*17+\idx))] + ldr vAmoq, [input_addr, #(16*(\num*18+\idx))] + ldr vAmuq, [input_addr, #(16*(\num*19+\idx))] + ldr vAsaq, [input_addr, #(16*(\num*20+\idx))] + ldr vAseq, [input_addr, #(16*(\num*21+\idx))] + ldr vAsiq, [input_addr, #(16*(\num*22+\idx))] + ldr vAsoq, [input_addr, #(16*(\num*23+\idx))] + ldr vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_vector num idx /* store the 25 q-form state registers; lane i is written to byte offset 16*(num*i+idx) of input_addr */ + str vAbaq, [input_addr, #(16*(\num*0+\idx))] + str vAbeq, [input_addr, #(16*(\num*1+\idx))] + str vAbiq, [input_addr, #(16*(\num*2+\idx))] + str vAboq, [input_addr, #(16*(\num*3+\idx))] + str vAbuq, [input_addr, #(16*(\num*4+\idx))] + str vAgaq, [input_addr, #(16*(\num*5+\idx))] + str vAgeq, [input_addr, #(16*(\num*6+\idx))] + str vAgiq, [input_addr, #(16*(\num*7+\idx))] + str vAgoq, [input_addr, #(16*(\num*8+\idx))] + str vAguq, [input_addr, #(16*(\num*9+\idx))] + str vAkaq, [input_addr, #(16*(\num*10+\idx))] + str vAkeq, [input_addr, #(16*(\num*11+\idx))] + str vAkiq, [input_addr, #(16*(\num*12+\idx))] + str vAkoq, [input_addr, #(16*(\num*13+\idx))] + str vAkuq, [input_addr, #(16*(\num*14+\idx))] + str vAmaq, [input_addr, #(16*(\num*15+\idx))] + str vAmeq,
[input_addr, #(16*(\num*16+\idx))] + str vAmiq, [input_addr, #(16*(\num*17+\idx))] + str vAmoq, [input_addr, #(16*(\num*18+\idx))] + str vAmuq, [input_addr, #(16*(\num*19+\idx))] + str vAsaq, [input_addr, #(16*(\num*20+\idx))] + str vAseq, [input_addr, #(16*(\num*21+\idx))] + str vAsiq, [input_addr, #(16*(\num*22+\idx))] + str vAsoq, [input_addr, #(16*(\num*23+\idx))] + str vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_scalar num idx /* store the 25 scalar state registers; lane i is written to byte offset 8*(num*i+idx) of input_addr */ + str s_Aba, [input_addr, 8*(\num*(0) +\idx)] + str sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + str sAbi, [input_addr, 8*(\num*(2)+ \idx)] + str sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + str sAbu, [input_addr, 8*(\num*(4)+ \idx)] + str sAga, [input_addr, 8*(\num*(4+1) +\idx)] + str sAge, [input_addr, 8*(\num*(6)+ \idx)] + str sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + str sAgo, [input_addr, 8*(\num*(8)+ \idx)] + str sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + str sAka, [input_addr, 8*(\num*(10) +\idx)] + str sAke, [input_addr, 8*(\num*(10+1)+\idx)] + str sAki, [input_addr, 8*(\num*(12) +\idx)] + str sAko, [input_addr, 8*(\num*(12+1)+\idx)] + str sAku, [input_addr, 8*(\num*(14) +\idx)] + str sAma, [input_addr, 8*(\num*(14+1)+\idx)] + str sAme, [input_addr, 8*(\num*(16) +\idx)] + str sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + str sAmo, [input_addr, 8*(\num*(18) +\idx)] + str sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + str sAsa, [input_addr, 8*(\num*(20) +\idx)] + str sAse, [input_addr, 8*(\num*(20+1)+\idx)] + str sAsi, [input_addr, 8*(\num*(22) +\idx)] + str sAso, [input_addr, 8*(\num*(22+1)+\idx)] + str sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +.macro load_input_scalar num idx /* load the 25 scalar state registers; lane i is read from byte offset 8*(num*i+idx) of input_addr */ + ldr s_Aba, [input_addr, 8*(\num*(0) +\idx)] + ldr sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + ldr sAbi, [input_addr, 8*(\num*(2)+ \idx)] + ldr sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + ldr sAbu, [input_addr, 8*(\num*(4)+ \idx)] + ldr sAga, [input_addr, 8*(\num*(4+1) +\idx)] + ldr sAge, [input_addr, 8*(\num*(6)+ \idx)] + ldr sAgi,
[input_addr, 8*(\num*(6+1) +\idx)] + ldr sAgo, [input_addr, 8*(\num*(8)+ \idx)] + ldr sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + ldr sAka, [input_addr, 8*(\num*(10) +\idx)] + ldr sAke, [input_addr, 8*(\num*(10+1)+\idx)] + ldr sAki, [input_addr, 8*(\num*(12) +\idx)] + ldr sAko, [input_addr, 8*(\num*(12+1)+\idx)] + ldr sAku, [input_addr, 8*(\num*(14) +\idx)] + ldr sAma, [input_addr, 8*(\num*(14+1)+\idx)] + ldr sAme, [input_addr, 8*(\num*(16) +\idx)] + ldr sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + ldr sAmo, [input_addr, 8*(\num*(18) +\idx)] + ldr sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + ldr sAsa, [input_addr, 8*(\num*(20) +\idx)] + ldr sAse, [input_addr, 8*(\num*(20+1)+\idx)] + ldr sAsi, [input_addr, 8*(\num*(22) +\idx)] + ldr sAso, [input_addr, 8*(\num*(22+1)+\idx)] + ldr sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +#define STACK_SIZE (8*8 + 16*6 + 3*8 + 8 + 16*34) // VREGS (8*8), GPRs (16*6), input/const/count (3*8), padding (8), vector spill slots (16*34) +#define STACK_BASE_GPRS (3*8+8) +#define STACK_BASE_VREGS (3*8+8+16*6) +#define STACK_BASE_TMP (8*8 + 16*6 + 3*8 + 8) /* start of the 16-byte spill slots addressed by save(name)/restore(name) through the *_offset indices below */ +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) + +#define vAga_offset 0 +#define E0_offset 1 +#define E1_offset 2 +#define E2_offset 3 +#define E3_offset 4 +#define E4_offset 5 +#define Ame_offset 7 +#define Agi_offset 8 +#define Aka_offset 9 +#define Abo_offset 10 +#define Amo_offset 11 +#define Ami_offset 12 +#define Ake_offset 13 +#define Agu_offset 14 +#define Asi_offset 15 +#define Aku_offset 16 +#define Asa_offset 17 +#define Abu_offset 18 +#define Asu_offset 19 +#define Ase_offset 20 +//#define Aga_offset 21 +#define Age_offset 22 +#define vBgo_offset 23 +#define vBke_offset 24 +#define vBgi_offset 25 +#define vBga_offset 26 +#define vBbo_offset 27 +#define vBmo_offset 28 +#define vBmi_offset 29 +#define vBge_offset 30 + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q,
[sp, #(STACK_BASE_TMP + 16 * name ## _offset)] + + +.macro save_gprs /* store x19-x30 pairwise into the stack area starting at STACK_BASE_GPRS */ + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs /* reload x19-x30 from the slots written by save_gprs */ + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro save_vregs /* store d8-d15, i.e. only the low 64 bits of v8-v15 (the part AAPCS64 makes callee-saved) */ + stp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] + stp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + stp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + stp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] +.endm + +.macro restore_vregs /* reload d8-d15 from the slots written by save_vregs */ + ldp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] + ldp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + ldp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + ldp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] +.endm + +.macro alloc_stack /* reserve STACK_SIZE bytes below sp */ + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack /* release the STACK_SIZE bytes reserved by alloc_stack */ + add sp, sp, #(STACK_SIZE) +.endm + +.macro eor5 dst, src0, src1, src2, src3, src4 /* dst = src0 ^ src1 ^ src2 ^ src3 ^ src4 */ + eor \dst, \src0, \src1 + eor \dst, \dst, \src2 + eor \dst, \dst, \src3 + eor \dst, \dst, \src4 +.endm + +.macro xor_rol dst, src1, src0, imm /* dst = src0 ^ rol(src1, imm), encoded as ROR by 64-imm */ + eor \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro bic_rol dst, src1, src0, imm /* dst = src0 & ~rol(src1, imm), encoded as ROR by 64-imm */ + bic \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro rotate dst, src, imm /* dst = rol(src, imm), encoded as ROR by 64-imm */ + ror \dst, \src, #(64-\imm) +.endm + +.macro save reg, offset /* store reg at [sp, #offset]; the cpp macro save(name) is a separate definition that only expands when written with parentheses */ + str \reg, [sp, #\offset] +.endm + +.macro restore reg, offset /* load reg from [sp, #offset]; the cpp macro restore(name) is a separate definition that only expands when written with parentheses */ + ldr \reg, [sp, #\offset] +.endm + +.macro hybrid_round_initial +eor sC0, sAma, sAsa SEP +eor sC1, sAme, sAse SEP eor3_m0 C1,vAbe,vAge,vAke +eor sC2, sAmi, sAsi SEP +eor sC3, sAmo, sAso SEP eor3_m1 C3,vAbo,vAgo,vAko +eor sC4, sAmu, sAsu SEP +eor sC0, sAka, sC0 SEP eor3_m0 C0,vAba,vAga,vAka +eor
sC1, sAke, sC1 SEP +eor sC2, sAki, sC2 SEP eor3_m1 C2,vAbi,vAgi,vAki +eor sC3, sAko, sC3 SEP +eor sC4, sAku, sC4 SEP eor3_m0 C4,vAbu,vAgu,vAku +eor sC0, sAga, sC0 SEP +eor sC1, sAge, sC1 SEP eor3_m1 C1, C1,vAme, vAse +eor sC2, sAgi, sC2 SEP +eor sC3, sAgo, sC3 SEP eor3_m0 C3, C3,vAmo, vAso +eor sC4, sAgu, sC4 SEP +eor sC0, s_Aba, sC0 SEP eor3_m1 C0, C0,vAma, vAsa +eor sC1, sAbe, sC1 SEP +eor sC2, sAbi, sC2 SEP eor3_m0 C2, C2,vAmi, vAsi +eor sC3, sAbo, sC3 SEP +eor sC4, sAbu, sC4 SEP eor3_m1 C4, C4,vAmu, vAsu +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP vvtmp .req vBba +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP rax1_m0 E2, C1, C3 +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, s_Aba, sE0 SEP +eor sAsa_, sAbi, sE2 SEP rax1_m1 E4, C3, C0 +eor sAbi_, sAki, sE2 SEP +eor sAki_, sAko, sE3 SEP rax1_m0 E1, C0, C2 +eor sAko_, sAmu, sE4 SEP +eor sAmu_, sAso, sE3 SEP rax1_m1 E3, C2, C4 +eor sAso_, sAma, sE0 SEP +eor sAka_, sAbe, sE1 SEP rax1_m0 E0, C4, C1 +eor sAse_, sAgo, sE3 SEP +eor sAgo_, sAme, sE1 SEP .unreq vvtmp +eor sAke_, sAgi, sE2 SEP +eor sAgi_, sAka, sE0 SEP vvtmp .req C1 +eor sAga_, sAbo, sE3 SEP +eor sAbo_, sAmo, sE3 SEP vvtmpq .req C1q +eor sAmo_, sAmi, sE2 SEP +eor sAmi_, sAke, sE1 SEP eor vBba.16b, vAba.16b, E0.16b +eor sAge_, sAgu, sE4 SEP +eor sAgu_, sAsi, sE2 SEP xar_m1 vBsa, vAbi, E2, 2 +eor sAsi_, sAku, sE4 SEP +eor sAku_, sAsa, sE0 SEP xar_m0 vBbi, vAki, E2, 21 +eor sAma_, sAbu, sE4 SEP +eor sAbu_, sAsu, sE4 SEP xar_m1 vBki, vAko, E3, 39 +eor sAsu_, sAse, sE1 SEP +eor sAme_, sAga, sE0 SEP +eor sAbe_, sAge, sE1 SEP xar_m0 vBko, vAmu, E4, 56 +load_constant_ptr SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP xar_m1 vBmu, vAso, E3, 8 +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP xar_m0 vBso, vAma, E0, 23 +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP xar_m1 vBka, vAbe, E1, 63 +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP xar_m0 vBse, vAgo, E3, 9 +eor sAgo, 
tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP xar_m1 vBgo, vAme, E1, 19 +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP xar_m0 vBke, vAgi, E2, 58 +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP xar_m1 vBgi, vAka, E0, 61 +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP xar_m0 vBga, vAbo, E3, 36 +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP xar_m1 vBbo, vAmo, E3, 43 +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP xar_m0 vBmo, vAmi, E2, 49 +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP xar_m1 vBmi, vAke, E1, 54 +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP xar_m0 vBge, vAgu, E4, 44 +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP mov E3.16b, vAga.16b +ldr cur_const, [const_addr] SEP +mov count, #1 SEP bcax_m1 vAga, vBga, vBgi, vBge +bic tmp, sAma_, sAmu_, ROR #35 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP xar_m0 vBgu, vAsi, E2, 3 +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP xar_m1 vBsi, vAku, E4, 25 +bic tmp, sAsi_, sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP xar_m0 vBku, vAsa, E0, 46 +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP xar_m1 vBma, vAbu, E4, 37 +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP xar_m0 vBbu, vAsu, E4, 50 +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP xar_m1 vBsu, vAse, E1, 62 +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP xar_m0 vBme, E3, E0, 28 +bic tmp, sAbi_, sAbe_, ROR #63 SEP +eor s_Aba, s_Aba_, tmp, ROR #21 SEP xar_m1 vBbe, vAge, E1, 20 +bic tmp, sAbo_, sAbi_, ROR #42 SEP +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP bcax_m1 vAge, vBge, vBgo, vBgi +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP bcax_m0 vAgi, vBgi, vBgu, vBgo +eor sAbo, 
tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP bcax_m1 vAgo, vBgo, vBga, vBgu +eor sAbu, tmp, sAbu_, ROR #30 SEP +eor s_Aba, s_Aba, cur_const SEP bcax_m0 vAgu, vBgu, vBge, vBga +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP bcax_m1 vAka, vBka, vBki, vBke +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP bcax_m0 vAke, vBke, vBko, vBki +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP .unreq vvtmp +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP .unreq vvtmpq +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 SEP eor2 C0, vAka, vAga +eor sC4, sAmu, sC4, ROR #56 SEP +eor sC0, sAga, sC0, ROR #57 SEP save(vAga) +eor sC1, sAme, sC1, ROR #58 SEP +eor sC2, sAbi, sC2, ROR #60 SEP vvtmp .req vAga +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP vvtmpq .req vAgaq +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP bcax_m0 vAki, vBki, vBku, vBko +eor sC3, sAbo, sC3, ROR #63 SEP +eor sC4, sAku, sC4, ROR #50 SEP bcax_m1 vAko, vBko, vBka, vBku +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP eor2 C1, vAke, vAge +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP bcax_m0 vAku, vBku, vBke, vBka +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP eor2 C2, vAki, vAgi +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP bcax_m1 vAma, vBma, vBmi, vBme +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP eor2 C3, vAko, vAgo +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP bcax_m0 vAme, vBme, vBmo, vBmi +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP eor2 C4, vAku, vAgu +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP bcax_m1 vAmi, vBmi, vBmu, vBmo +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP eor2 C0, C0, vAma +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP 
bcax_m0 vAmo, vBmo, vBma, vBmu +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP eor2 C1, C1, vAme +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP bcax_m1 vAmu, vBmu, vBme, vBma +eor sAgu_, sE2, sAsi, ROR #62 SEP +eor sAsi_, sE4, sAku, ROR #58 SEP eor2 C2, C2, vAmi +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP bcax_m0 vAsa, vBsa, vBsi, vBse +eor sAbu_, sE4, sAsu, ROR #9 SEP +eor sAsu_, sE1, sAse, ROR #23 SEP eor2 C3, C3, vAmo +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP bcax_m1 vAse, vBse, vBso, vBsi +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP eor2 C4, C4, vAmu +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP bcax_m0 vAsi, vBsi, vBsu, vBso +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP eor2 C0, C0, vAsa +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP bcax_m1 vAso, vBso, vBsa, vBsu +bic tmp, sAga_, sAgu_, ROR #31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP eor2 C1, C1, vAse +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP bcax_m0 vAsu, vBsu, vBse, vBsa +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP eor2 C2, C2, vAsi +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP eor2 C3, C3, vAso +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP bcax_m1 vAba, vBba, vBbi, vBbe +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP bcax_m0 vAbe, vBbe, vBbo, vBbi +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP eor2 C1, C1, vAbe +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP restore x26, STACK_OFFSET_CONST +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP ldr vvtmpq, [x26], #16 +eor sAmi, tmp, sAmi_, ROR #46 SEP +bic tmp, sAma_, sAmu_, ROR #35 SEP save x26, STACK_OFFSET_CONST +eor sAmo, tmp, sAmo_, ROR #12 SEP 
+bic tmp, sAme_, sAma_, ROR #9 SEP eor vAba.16b, vAba.16b, vvtmp.16b +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP eor2 C4, C4, vAsu +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP bcax_m0 vAbi, vBbi, vBbu, vBbo +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP bcax_m1 vAbo, vBbo, vBba, vBbu +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP eor2 C3, C3, vAbo +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP eor2 C2, C2, vAbi +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP eor2 C0, C0, vAba +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP bcax_m0 vAbu, vBbu, vBbe, vBba +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP eor2 C4, C4, vAbu +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP restore(vAga) +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP .unreq vvtmp +eor sAbu, tmp, sAbu_, ROR #30 SEP +add count, count, #1 SEP .unreq vvtmpq +eor s_Aba, s_Aba, cur_const SEP +.endm + + + +.macro hybrid_round_noninitial +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP vvtmp .req vBba +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP rax1_m0 E2, C1, C3 +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP rax1_m0 E4, C3, C0 +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP rax1_m0 E1, C0, C2 +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP rax1_m0 E3, C2, C4 +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP rax1_m0 E0, C4, C1 +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP .unreq vvtmp +eor sC0, s_Aba, sC0, ROR #61 SEP +eor sC1, sAke, sC1, ROR #57 SEP vvtmp .req C1 +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 
SEP vvtmpq .req C1q +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP eor vBba.16b, vAba.16b, E0.16b +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP xar_m0 vBsa, vAbi, E2, 2 +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP xar_m0 vBbi, vAki, E2, 21 +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP xar_m0 vBki, vAko, E3, 39 +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP xar_m1 vBko, vAmu, E4, 56 +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP xar_m0 vBmu, vAso, E3, 8 +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP xar_m0 vBso, vAma, E0, 23 +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP xar_m0 vBka, vAbe, E1, 63 +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP xar_m1 vBse, vAgo, E3, 9 +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP xar_m0 vBgo, vAme, E1, 19 +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP xar_m0 vBke, vAgi, E2, 58 +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP xar_m0 vBgi, vAka, E0, 61 +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP xar_m1 vBga, vAbo, E3, 36 +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP xar_m0 vBbo, vAmo, E3, 43 +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP xar_m0 vBmo, vAmi, E2, 49 +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP xar_m0 vBmi, vAke, E1, 54 +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP xar_m1 vBge, vAgu, E4, 44 +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP mov E3.16b, vAga.16b +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP bcax_m0 vAga, 
vBga, vBgi, vBge +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP +eor sAka, tmp, sAka_, ROR #24 SEP xar_m0 vBgu, vAsi, E2, 3 +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP xar_m0 vBsi, vAku, E4, 25 +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP xar_m1 vBku, vAsa, E0, 46 +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP xar_m0 vBma, vAbu, E4, 37 +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP xar_m0 vBbu, vAsu, E4, 50 +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP xar_m0 vBsu, vAse, E1, 62 +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP xar_m1 vBme, E3, E0, 28 +bic tmp, sAma_, sAmu_, ROR #35 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP xar_m0 vBbe, vAge, E1, 20 +add count, count, #1 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP bcax_m0 vAge, vBge, vBgo, vBgi +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP bcax_m0 vAgi, vBgi, vBgu, vBgo +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP bcax_m1 vAgo, vBgo, vBga, vBgu +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP bcax_m0 vAgu, vBgu, vBge, vBga +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP bcax_m0 vAka, vBka, vBki, vBke +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m0 vAke, vBke, vBko, vBki +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP .unreq vvtmp +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP .unreq vvtmpq +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP eor2 C0, vAka, vAga +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP save(vAga) +eor 
s_Aba, s_Aba, cur_const SEP +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP vvtmp .req vAga +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP vvtmpq .req vAgaq +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP bcax_m0 vAki, vBki, vBku, vBko +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP bcax_m0 vAko, vBko, vBka, vBku +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP eor2 C1, vAke, vAge +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP bcax_m0 vAku, vBku, vBke, vBka +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP eor2 C2, vAki, vAgi +eor sC0, s_Aba, sC0, ROR #61 SEP +eor sC1, sAke, sC1, ROR #57 SEP bcax_m0 vAma, vBma, vBmi, vBme +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP eor2 C3, vAko, vAgo +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP bcax_m0 vAme, vBme, vBmo, vBmi +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP eor2 C4, vAku, vAgu +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP bcax_m0 vAmi, vBmi, vBmu, vBmo +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP eor2 C0, C0, vAma +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP bcax_m0 vAmo, vBmo, vBma, vBmu +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP eor2 C1, C1, vAme +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP bcax_m1 vAmu, vBmu, vBme, vBma +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP eor2 C2, C2, vAmi +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP bcax_m0 vAsa, vBsa, vBsi, vBse +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP eor2 C3, C3, vAmo +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP 
+eor sAgu_, sE2, sAsi, ROR #62 SEP bcax_m0 vAse, vBse, vBso, vBsi +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP eor2 C4, C4, vAmu +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP bcax_m0 vAsi, vBsi, vBsu, vBso +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP eor2 C0, C0, vAsa +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP bcax_m0 vAso, vBso, vBsa, vBsu +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP eor2 C1, C1, vAse +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP bcax_m0 vAsu, vBsu, vBse, vBsa +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP eor2 C2, C2, vAsi +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP eor2 C3, C3, vAso +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP +eor sAka, tmp, sAka_, ROR #24 SEP bcax_m0 vAba, vBba, vBbi, vBbe +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP bcax_m0 vAbe, vBbe, vBbo, vBbi +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP eor2 C1, C1, vAbe +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP restore x26, STACK_OFFSET_CONST +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP ldr vvtmpq, [x26], #16 +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP save x26, STACK_OFFSET_CONST +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP eor vAba.16b, vAba.16b, vvtmp.16b +bic tmp, sAma_, sAmu_, ROR #35 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP eor2 C4, C4, vAsu +add count, count, #1 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP bcax_m0 vAbi, vBbi, vBbu, vBbo +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP bcax_m0 vAbo, vBbo, vBba, 
vBbu +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP eor2 C3, C3, vAbo +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP eor2 C2, C2, vAbi +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP eor2 C0, C0, vAba +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m0 vAbu, vBbu, vBbe, vBba +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP eor2 C4, C4, vAbu +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP restore(vAga) +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP .unreq vvtmp +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP .unreq vvtmpq +eor s_Aba, s_Aba, cur_const SEP +.endm + +.macro hybrid_round_final +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP vvtmp .req vBba +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP +eor sC3, sAgo, sAso, ROR #30 SEP rax1_m0 E2, C1, C3 +eor sC4, sAbu, sAsu, ROR #53 SEP +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP rax1_m0 E4, C3, C0 +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP +eor sC0, sAga, sC0, ROR #57 SEP rax1_m0 E1, C0, C2 +eor sC1, sAme, sC1, ROR #58 SEP +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP rax1_m0 E3, C2, C4 +eor sC0, s_Aba, sC0, ROR #61 SEP +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP rax1_m0 E0, C4, C1 +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP .unreq vvtmp +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP vvtmp .req C1 +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP +eor 
sAsa_, sE2, sAbi, ROR #50 SEP vvtmpq .req C1q +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP eor vBba.16b, vAba.16b, E0.16b +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP xar_m0 vBsa, vAbi, E2, 2 +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP xar_m0 vBbi, vAki, E2, 21 +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP xar_m0 vBki, vAko, E3, 39 +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP xar_m1 vBko, vAmu, E4, 56 +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP xar_m0 vBmu, vAso, E3, 8 +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP xar_m0 vBso, vAma, E0, 23 +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP xar_m0 vBka, vAbe, E1, 63 +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP xar_m1 vBse, vAgo, E3, 9 +bic tmp, sAge_, sAga_, ROR #56 SEP +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP xar_m0 vBgo, vAme, E1, 19 +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP xar_m0 vBke, vAgi, E2, 58 +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP xar_m0 vBgi, vAka, E0, 61 +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP xar_m1 
vBga, vAbo, E3, 36 +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP xar_m0 vBbo, vAmo, E3, 43 +bic tmp, sAma_, sAmu_, ROR #35 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP +add count, count, #1 SEP xar_m0 vBmo, vAmi, E2, 49 +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP xar_m0 vBmi, vAke, E1, 54 +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP xar_m1 vBge, vAgu, E4, 44 +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP mov E3.16b, vAga.16b +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP +eor s_Aba, s_Aba_, tmp, ROR #21 SEP bcax_m0 vAga, vBga, vBgi, vBge +bic tmp, sAbo_, sAbi_, ROR #42 SEP +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP xar_m0 vBgu, vAsi, E2, 3 +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP xar_m0 vBsi, vAku, E4, 25 +eor sAbu, tmp, sAbu_, ROR #30 SEP +eor s_Aba, s_Aba, cur_const SEP +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP xar_m1 vBku, vAsa, E0, 46 +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP xar_m0 vBma, vAbu, E4, 37 +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP xar_m0 vBbu, vAsu, E4, 50 +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP xar_m0 vBsu, vAse, E1, 62 +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 
SEP xar_m1 vBme, E3, E0, 28 +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP xar_m0 vBbe, vAge, E1, 20 +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP +ror sC2, sC2, 62 SEP bcax_m0 vAge, vBge, vBgo, vBgi +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP bcax_m0 vAgi, vBgi, vBgu, vBgo +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP bcax_m1 vAgo, vBgo, vBga, vBgu +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP bcax_m0 vAgu, vBgu, vBge, vBga +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP bcax_m0 vAka, vBka, vBki, vBke +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP bcax_m0 vAke, vBke, vBko, vBki +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP bcax_m1 vAki, vBki, vBku, vBko +eor sAgu_, sE2, sAsi, ROR #62 SEP +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP bcax_m0 vAko, vBko, vBka, vBku +eor sAbu_, sE4, sAsu, ROR #9 SEP +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP bcax_m0 vAku, vBku, vBke, vBka +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP bcax_m0 vAma, vBma, vBmi, vBme +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP bcax_m1 vAme, vBme, vBmo, vBmi +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP bcax_m0 vAmi, vBmi, vBmu, vBmo +bic tmp, 
sAge_, sAga_, ROR #56 SEP +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP +eor sAka, tmp, sAka_, ROR #24 SEP bcax_m0 vAmo, vBmo, vBma, vBmu +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP bcax_m0 vAmu, vBmu, vBme, vBma +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP bcax_m1 vAsa, vBsa, vBsi, vBse +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP bcax_m0 vAse, vBse, vBso, vBsi +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP bcax_m0 vAsi, vBsi, vBsu, vBso +bic tmp, sAma_, sAmu_, ROR #35 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP +add count, count, #1 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP bcax_m0 vAso, vBso, vBsa, vBsu +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP bcax_m1 vAsu, vBsu, vBse, vBsa +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP bcax_m0 vAba, vBba, vBbi, vBbe +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP bcax_m0 vAbe, vBbe, vBbo, vBbi +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP bcax_m0 vAbi, vBbi, vBbu, vBbo +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP bcax_m0 vAbo, vBbo, vBba, vBbu +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP bcax_m0 vAbu, vBbu, vBbe, vBba +eor s_Aba, s_Aba, cur_const SEP +ror sAga, sAga,(64-3) SEP +ror sAka, 
sAka,(64-25) SEP +ror sAma, sAma,(64-10) SEP restore x26, STACK_OFFSET_CONST +ror sAsa, sAsa,(64-39) SEP +ror sAbe, sAbe,(64-21) SEP +ror sAge, sAge,(64-45) SEP ldr vvtmpq, [x26], #16 +ror sAke, sAke,(64-8) SEP +ror sAme, sAme,(64-15) SEP +ror sAse, sAse,(64-41) SEP +ror sAbi, sAbi,(64-14) SEP save x26, STACK_OFFSET_CONST +ror sAgi, sAgi,(64-61) SEP +ror sAki, sAki,(64-18) SEP +ror sAmi, sAmi,(64-56) SEP +ror sAsi, sAsi,(64-2) SEP eor vAba.16b, vAba.16b, vvtmp.16b +ror sAgo, sAgo,(64-28) SEP +ror sAko, sAko,(64-1) SEP +ror sAmo, sAmo,(64-27) SEP .unreq vvtmp +ror sAso, sAso,(64-62) SEP +ror sAbu, sAbu,(64-44) SEP +ror sAgu, sAgu,(64-20) SEP +ror sAku, sAku,(64-6) SEP .unreq vvtmpq +ror sAmu, sAmu,(64-36) SEP +ror sAsu, sAsu,(64-55) SEP +.endm + + +#define KECCAK_F1600_ROUNDS 24 +/* keccak_f1600_x4_hybrid_asm_v8 (also exported with a leading underscore): loads the vector state once (load_input_vector 2,1), then runs two scalar passes (load_input_scalar 4,0 and 4,1), each as hybrid_round_initial, a loop of hybrid_round_noninitial repeated while count <= KECCAK_F1600_ROUNDS-3, and hybrid_round_final; results are written back with store_input_scalar / store_input_vector. */ +.global keccak_f1600_x4_hybrid_asm_v8 +.global _keccak_f1600_x4_hybrid_asm_v8 +.text +.align 4 + +keccak_f1600_x4_hybrid_asm_v8: +_keccak_f1600_x4_hybrid_asm_v8: + alloc_stack + save_gprs + save_vregs + save input_addr, STACK_OFFSET_INPUT + + + ASM_LOAD(const_addr,round_constants_vec) + + save const_addr, STACK_OFFSET_CONST + load_input_vector 2,1 + + // First scalar Keccak computation alongside first half of SIMD computation + load_input_scalar 4,0 + hybrid_round_initial + loop_0: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-3) + ble loop_0 + + hybrid_round_final + + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4,0 + + // Second scalar Keccak computation alongside second half of SIMD computation + load_input_scalar 4,1 + hybrid_round_initial + loop_1: + hybrid_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-3) + ble loop_1 + + hybrid_round_final + + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4,1 + store_input_vector 2,1 + + restore_vregs + restore_gprs + free_stack + + + ret +#endif diff --git a/tests/keccak_neon/manual/keccak_f1600_x4_scalar_asm_v1.s b/tests/keccak_neon/manual/keccak_f1600_x4_scalar_asm_v1.s new file mode 100644 index
0000000..7ce0c0d --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x4_scalar_asm_v1.s @@ -0,0 +1,561 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" +/* round_constants: table of 24 64-bit constants, one .quad per entry. */ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +/* Note: input_addr and count share register 0 (x0 / w0), and const_addr and cur_const share x1. */ + input_addr .req x0 + const_addr .req x1 + count .req w0 + cur_const .req x1 + + /* Allocation of GPRs for Keccak-f1600 state */ +#define ABA x2 +#define ABE x3 +#define ABI x4 +#define ABO x5 +#define ABU x6 +#define AGA x7 +#define AGE x8 +#define AGI x9 +#define AGO x10 +#define AGU x11 +#define AKA x12 +#define AKE x13 +#define AKI x14 +#define AKO x15 +#define AKU x16 +#define AMA x17 +#define AME x18 +#define AMI x19 +#define AMO x20 +#define AMU x21 +#define ASA x22 +#define ASE x23 +#define ASI x24 +#define ASO x25 +#define ASU x26 + + Aba .req ABA + Abe .req ABE + Abi .req ABI + Abo .req ABO + Abu .req ABU + Aga .req AGA + Age .req AGE + Agi .req AGI + Ago .req AGO + Agu .req AGU + Aka .req AKA + Ake .req AKE + Aki .req AKI + Ako .req AKO + Aku .req AKU + Ama .req AMA + Ame .req AME + Ami .req AMI + Amo .req AMO + Amu .req AMU + Asa .req ASA + Ase .req ASE + Asi .req ASI + Aso .req ASO + Asu .req ASU +/* Second set of names (the _tmp aliases) for the state lanes: they map onto the state registers in a different assignment, plus x27 and x28; keccak_f1600_round writes them in its first eor / xor_rol pass and reads them in the following bic_rol / xor_rol pass. */ + Aba_tmp .req AGA + Abe_tmp .req AGE + Abi_tmp .req ABI + Abo_tmp .req ABO + Abu_tmp .req ABU + Aga_tmp .req AKA + Age_tmp .req AKE + Agi_tmp .req AGI + Ago_tmp .req AGO + Agu_tmp .req AGU + Aka_tmp .req AMA + Ake_tmp .req AME + Aki_tmp .req AKI + Ako_tmp .req AKO + Aku_tmp .req
AKU + Ama_tmp .req ASA + Ame_tmp .req ASE + Ami_tmp .req AMI + Amo_tmp .req AMO + Amu_tmp .req AMU + Asa_tmp .req x28 + Ase_tmp .req x27 + Asi_tmp .req ASI + Aso_tmp .req ASO + Asu_tmp .req ASU + +#define STACK_SIZE (16*6 + 3*8 + 8) // GPRs (16*6), count (8), const (8), input (8), padding (8) +#define STACK_BASE_GPRS (3*8+8) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) +/* store_input_scalar num idx: stores the 25 state registers Aba..Asu to [input_addr + 8*(num*k + idx)] for k = 0..24. */ +.macro store_input_scalar num idx + str Aba, [input_addr, 8*(\num*(0) +\idx)] + str Abe, [input_addr, 8*(\num*(0+1) +\idx)] + str Abi, [input_addr, 8*(\num*(2)+ \idx)] + str Abo, [input_addr, 8*(\num*(2+1) +\idx)] + str Abu, [input_addr, 8*(\num*(4)+ \idx)] + str Aga, [input_addr, 8*(\num*(4+1) +\idx)] + str Age, [input_addr, 8*(\num*(6)+ \idx)] + str Agi, [input_addr, 8*(\num*(6+1) +\idx)] + str Ago, [input_addr, 8*(\num*(8)+ \idx)] + str Agu, [input_addr, 8*(\num*(8+1) +\idx)] + str Aka, [input_addr, 8*(\num*(10) +\idx)] + str Ake, [input_addr, 8*(\num*(10+1)+\idx)] + str Aki, [input_addr, 8*(\num*(12) +\idx)] + str Ako, [input_addr, 8*(\num*(12+1)+\idx)] + str Aku, [input_addr, 8*(\num*(14) +\idx)] + str Ama, [input_addr, 8*(\num*(14+1)+\idx)] + str Ame, [input_addr, 8*(\num*(16) +\idx)] + str Ami, [input_addr, 8*(\num*(16+1)+\idx)] + str Amo, [input_addr, 8*(\num*(18) +\idx)] + str Amu, [input_addr, 8*(\num*(18+1)+\idx)] + str Asa, [input_addr, 8*(\num*(20) +\idx)] + str Ase, [input_addr, 8*(\num*(20+1)+\idx)] + str Asi, [input_addr, 8*(\num*(22) +\idx)] + str Aso, [input_addr, 8*(\num*(22+1)+\idx)] + str Asu, [input_addr, 8*(\num*(24) +\idx)] +.endm +/* load_input_scalar num idx: loads Aba..Asu from [input_addr + 8*(num*k + idx)] for k = 0..24 (same layout as store_input_scalar). */ +.macro load_input_scalar num idx + ldr Aba, [input_addr, 8*(\num*(0) +\idx)] + ldr Abe, [input_addr, 8*(\num*(0+1) +\idx)] + ldr Abi, [input_addr, 8*(\num*(2)+ \idx)] + ldr Abo, [input_addr, 8*(\num*(2+1) +\idx)] + ldr Abu, [input_addr, 8*(\num*(4)+ \idx)] + ldr Aga, [input_addr, 8*(\num*(4+1) +\idx)] + ldr Age, [input_addr, 8*(\num*(6)+ \idx)] + ldr Agi, [input_addr, 8*(\num*(6+1)
+\idx)] + ldr Ago, [input_addr, 8*(\num*(8)+ \idx)] + ldr Agu, [input_addr, 8*(\num*(8+1) +\idx)] + ldr Aka, [input_addr, 8*(\num*(10) +\idx)] + ldr Ake, [input_addr, 8*(\num*(10+1)+\idx)] + ldr Aki, [input_addr, 8*(\num*(12) +\idx)] + ldr Ako, [input_addr, 8*(\num*(12+1)+\idx)] + ldr Aku, [input_addr, 8*(\num*(14) +\idx)] + ldr Ama, [input_addr, 8*(\num*(14+1)+\idx)] + ldr Ame, [input_addr, 8*(\num*(16) +\idx)] + ldr Ami, [input_addr, 8*(\num*(16+1)+\idx)] + ldr Amo, [input_addr, 8*(\num*(18) +\idx)] + ldr Amu, [input_addr, 8*(\num*(18+1)+\idx)] + ldr Asa, [input_addr, 8*(\num*(20) +\idx)] + ldr Ase, [input_addr, 8*(\num*(20+1)+\idx)] + ldr Asi, [input_addr, 8*(\num*(22) +\idx)] + ldr Aso, [input_addr, 8*(\num*(22+1)+\idx)] + ldr Asu, [input_addr, 8*(\num*(24) +\idx)] +.endm +/* save_gprs: stores x19-x30 as six register pairs at [sp + STACK_BASE_GPRS + 16*i], i = 0..5. */ +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm +/* restore_gprs: reloads x19-x30 from the slots written by save_gprs. */ +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm +/* alloc_stack: lowers sp by STACK_SIZE bytes. */ +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm +/* free_stack: raises sp by STACK_SIZE bytes. */ +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm +/* eor5: dst = src0 ^ src1 ^ src2 ^ src3 ^ src4; dst is written by the first eor, so it must not alias src2..src4. */ +.macro eor5 dst, src0, src1, src2, src3, src4 + eor \dst, \src0, \src1 + eor \dst, \dst, \src2 + eor \dst, \dst, \src3 + eor \dst, \dst, \src4 +.endm +/* xor_rol dst, src1, src0, imm: dst = src0 ^ (src1 rotated right by 64-imm); note that src1, not src0, is the rotated operand. */ +.macro xor_rol dst, src1, src0, imm + eor \dst, \src0, \src1, ROR #(64-\imm) +.endm +/* bic_rol dst, src1, src0, imm: dst = src0 & ~(src1 rotated right by 64-imm). */ +.macro bic_rol dst, src1, src0, imm + bic \dst, \src0, \src1, ROR #(64-\imm) +.endm +/* rotate dst, src, imm: dst = src rotated right by 64-imm. */ +.macro rotate dst, src, imm + ror \dst, \src, #(64-\imm) +.endm +/* save reg, offset: stores reg at [sp + offset]. */ +.macro save reg, offset + str \reg, [sp, #\offset] +.endm +/* restore reg, offset: loads reg from [sp + offset]. */ +.macro
restore reg, offset + ldr \reg, [sp, #\offset] +.endm + +.macro keccak_f1600_round is_first + + .if \is_first == 0 + save count, STACK_OFFSET_COUNT + .endif + +#define BCE x30 +#define BCA x0 +#define BCI x27 +#define BCO x28 +#define BCU x29 + + BCe .req BCE + BCa .req BCA + BCi .req BCI + BCo .req BCO + BCu .req BCU + + .if \is_first == 1 + eor5 BCa, Aba, Aga, Aka, Ama, Asa + eor5 BCe, Abe, Age, Ake, Ame, Ase + eor5 BCi, Abi, Agi, Aki, Ami, Asi + eor5 BCo, Abo, Ago, Ako, Amo, Aso + eor5 BCu, Abu, Agu, Aku, Amu, Asu + .else + xor_rol BCu, Asu, Abu , 11 + xor_rol BCa, Asa, Aka, 14 + xor_rol BCe, Age, Ase , 4 + xor_rol BCi, Agi, Ami , 5 + xor_rol BCu, BCu, Amu , 8 + xor_rol BCo, Aso, Ago , 34 + xor_rol BCe, BCe, Abe , 20 + xor_rol BCa, BCa, Ama , 15 + xor_rol BCi, BCi, Aki , 38 + xor_rol BCu, BCu, Agu , 16 + xor_rol BCe, BCe, Ame , 6 + xor_rol BCo, BCo, Amo , 1 + xor_rol BCi, BCi, Abi , 4 + xor_rol BCu, BCu, Aku , 14 + xor_rol BCe, BCe, Ake , 7 + xor_rol BCo, BCo, Ako , 26 + xor_rol BCa, BCa, Aga , 7 + xor_rol BCi, BCi, Asi , 12 + rotate BCe, BCe, 8 + xor_rol BCo, BCo, Abo , 1 + rotate BCu, BCu, 6 + rotate BCi, BCi, 2 + xor_rol BCa, BCa, Aba , 3 + .endif + + Da .req BCE + Du .req BCA + De .req BCI + Di .req x1 + Do .req BCU + + xor_rol Di,BCo,BCe,1 + xor_rol Da,BCe,BCu,1 + xor_rol Do,BCu,BCi,1 + .unreq BCu + xor_rol De,BCi,BCa,1 + .unreq BCi + xor_rol Du,BCa,BCo,1 + .unreq BCa + .unreq BCo + .unreq BCe + + .if \is_first == 1 + + eor Asa_tmp,Abi,Di + eor Abi_tmp,Aki,Di + eor Aki_tmp,Ako,Do + eor Ako_tmp,Amu,Du + eor Amu_tmp,Aso,Do + eor Aso_tmp,Ama,Da + eor Aka_tmp,Abe,De + + eor Abe_tmp,Age,De + + temp .req ABE + eor temp,Ago,Do + eor Ago_tmp,Ame,De + eor Ake_tmp,Agi,Di + eor Agi_tmp,Aka,Da + eor Aga_tmp,Abo,Do + eor Abo_tmp,Amo,Do + eor Amo_tmp,Ami,Di + eor Ami_tmp,Ake,De + eor Age_tmp,Agu,Du + eor Agu_tmp,Asi,Di + eor Asi_tmp,Aku,Du + eor Aku_tmp,Asa,Da + eor Ama_tmp,Abu,Du + eor Abu_tmp,Asu,Du + eor Asu_tmp,Ase,De + eor Ame_tmp,Aga,Da + eor Aba_tmp,Aba,Da + mov 
Ase_tmp,temp + .unreq temp + + .else + + xor_rol Asa_tmp,Abi,Di,14 + xor_rol Abi_tmp,Aki,Di,18 + xor_rol Aki_tmp,Ako,Do,1 + xor_rol Ako_tmp,Amu,Du,36 + xor_rol Amu_tmp,Aso,Do,62 + xor_rol Aso_tmp,Ama,Da,10 + xor_rol Aka_tmp,Abe,De,21 + + xor_rol Abe_tmp,Age,De,45 + + temp .req ABE + xor_rol temp,Ago,Do,28 + xor_rol Ago_tmp,Ame,De,15 + xor_rol Ake_tmp,Agi,Di,61 + xor_rol Agi_tmp,Aka,Da,25 + eor Aga_tmp,Abo,Do + xor_rol Abo_tmp,Amo,Do,27 + xor_rol Amo_tmp,Ami,Di,56 + xor_rol Ami_tmp,Ake,De,8 + xor_rol Age_tmp,Agu,Du,20 + xor_rol Agu_tmp,Asi,Di,2 + xor_rol Asi_tmp,Aku,Du,6 + xor_rol Aku_tmp,Asa,Da,39 + xor_rol Ama_tmp,Abu,Du,44 + xor_rol Abu_tmp,Asu,Du,55 + xor_rol Asu_tmp,Ase,De,41 + xor_rol Ame_tmp,Aga,Da,3 + eor Aba_tmp,Aba,Da + mov Ase_tmp,temp + .unreq temp + + .endif + + .unreq Da + .unreq De + .unreq Di + .unreq Do + .unreq Du + + tmp .req x30 + + bic_rol tmp, Abe_tmp, Abi_tmp,1 + xor_rol Aba, tmp, Aba_tmp,43 + bic_rol tmp, Abi_tmp, Abo_tmp,22 + xor_rol Abe, Abe_tmp, tmp,23 + bic_rol tmp ,Abo_tmp, Abu_tmp,7 + xor_rol Abi ,Abi_tmp, tmp,29 + bic_rol tmp ,Abu_tmp, Aba_tmp,14 + xor_rol Abo ,Abo_tmp, tmp,21 + bic_rol tmp ,Aba_tmp, Abe_tmp,20 + xor_rol Abu ,Abu_tmp, tmp,34 + + bic_rol tmp, Age_tmp, Agi_tmp,17 + xor_rol Aga, Aga_tmp, tmp,25 + bic_rol tmp, Agi_tmp, Ago_tmp,22 + xor_rol Age, Age_tmp, tmp,39 + bic_rol tmp ,Ago_tmp, Agu_tmp,48 + xor_rol Agi ,Agi_tmp, tmp,6 + bic_rol tmp ,Agu_tmp, Aga_tmp,33 + xor_rol Ago ,Ago_tmp, tmp,17 + bic_rol tmp ,Aga_tmp, Age_tmp,8 + xor_rol Agu ,Agu_tmp, tmp,41 + + .if \is_first == 0 + restore count, STACK_OFFSET_COUNT + .endif + + load_constant_ptr + + bic_rol tmp, Ake_tmp, Aki_tmp,45 + xor_rol Aka, Aka_tmp, tmp,40 + bic_rol tmp, Aki_tmp, Ako_tmp,17 + xor_rol Ake, Ake_tmp, tmp,62 + bic_rol tmp ,Ako_tmp, Aku_tmp,54 + xor_rol Aki ,Aki_tmp, tmp,7 + bic_rol tmp ,Aku_tmp, Aka_tmp,17 + xor_rol Ako ,Ako_tmp, tmp,7 + bic_rol tmp ,Aka_tmp, Ake_tmp,59 + xor_rol Aku ,Aku_tmp, tmp,12 + + bic_rol tmp, Ame_tmp, Ami_tmp,26 + xor_rol Ama, 
Ama_tmp, tmp,17 + bic_rol tmp, Ami_tmp, Amo_tmp,59 + xor_rol Ame, Ame_tmp, tmp,21 + bic_rol tmp ,Amo_tmp, Amu_tmp,23 + xor_rol Ami ,Ami_tmp, tmp,18 + bic_rol tmp ,Amu_tmp, Ama_tmp,29 + xor_rol Amo ,Amo_tmp, tmp,52 + bic_rol tmp ,Ama_tmp, Ame_tmp,55 + xor_rol Amu ,Amu_tmp, tmp,20 + + .if \is_first == 0 + ldr cur_const, [const_addr, count, UXTW #3] + add count, count, #1 + .else + ldr cur_const, [const_addr] + mov count, #1 + .endif + + bic_rol tmp, Ase_tmp, Asi_tmp,16 + xor_rol Asa, Asa_tmp, tmp,23 + bic_rol tmp, Asi_tmp, Aso_tmp,62 + xor_rol Ase, Ase_tmp, tmp,14 + bic_rol tmp ,Aso_tmp, Asu_tmp,39 + xor_rol Asi ,Asi_tmp, tmp,37 + bic_rol tmp ,Asu_tmp, Asa_tmp,4 + xor_rol Aso ,Aso_tmp, tmp,43 + bic_rol tmp ,Asa_tmp, Ase_tmp,7 + xor_rol Asu ,Asu_tmp, tmp,11 + + eor Aba, Aba, cur_const + +.endm + +.macro final_rotate + rotate Aga, Aga,3 + rotate Aka, Aka,25 + rotate Ama, Ama,10 + rotate Asa, Asa,39 + rotate Abe, Abe,21 + rotate Age, Age,45 + rotate Ake, Ake,8 + rotate Ame, Ame,15 + rotate Ase, Ase,41 + rotate Abi, Abi,14 + rotate Agi, Agi,61 + rotate Aki, Aki,18 + rotate Ami, Ami,56 + rotate Asi, Asi,2 + rotate Ago, Ago,28 + rotate Ako, Ako,1 + rotate Amo, Amo,27 + rotate Aso, Aso,62 + rotate Abu, Abu,44 + rotate Agu, Agu,20 + rotate Aku, Aku,6 + rotate Amu, Amu,36 + rotate Asu, Asu,55 +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.global keccak_f1600_x4_scalar_asm_v1 +.global _keccak_f1600_x4_scalar_asm_v1 +.text +.align 4 + +keccak_f1600_x4_scalar_asm_v1: +_keccak_f1600_x4_scalar_asm_v1: + alloc_stack + save_gprs + save input_addr, STACK_OFFSET_INPUT + + // First scalar Keccak computation + load_input_scalar 4,0 + keccak_f1600_round 1 +loop_0: + keccak_f1600_round 0 + cmp count, #(KECCAK_F1600_ROUNDS-1) + ble loop_0 + final_rotate + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4,0 + + // Second scalar Keccak computation + load_input_scalar 4, 1 + keccak_f1600_round 1 +loop_1: + keccak_f1600_round 0 + cmp count, #(KECCAK_F1600_ROUNDS-1) + ble loop_1 + 
final_rotate + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4, 1 + + // Third scalar Keccak computation + load_input_scalar 4, 2 + keccak_f1600_round 1 +loop_2: + keccak_f1600_round 0 + cmp count, #(KECCAK_F1600_ROUNDS-1) + ble loop_2 + final_rotate + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4, 2 + + // Fourth scalar Keccak computation + load_input_scalar 4, 3 + keccak_f1600_round 1 +loop_3: + keccak_f1600_round 0 + cmp count, #(KECCAK_F1600_ROUNDS-1) + ble loop_3 + final_rotate + restore input_addr, STACK_OFFSET_INPUT + store_input_scalar 4, 3 + + restore_gprs + free_stack + ret diff --git a/tests/keccak_neon/manual/keccak_f1600_x4_scalar_asm_v5.s b/tests/keccak_neon/manual/keccak_f1600_x4_scalar_asm_v5.s new file mode 100644 index 0000000..90fc545 --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x4_scalar_asm_v5.s @@ -0,0 +1,543 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .balign 64 +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x26 + cur_const .req x26 /* aliases const_addr: loading a constant overwrites the table pointer */ + count .req w27 + out_count .req w27 /* aliases count: each counter is spilled to its own stack slot */ + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round. 
*/ + Aba .req x1 + Abe .req x6 + Abi .req x11 + Abo .req x16 + Abu .req x21 + Aga .req x2 + Age .req x7 + Agi .req x12 + Ago .req x17 + Agu .req x22 + Aka .req x3 + Ake .req x8 + Aki .req x13 + Ako .req x18 /* NOTE(review): x18 is the platform register under some AArch64 ABIs - confirm the target permits its use */ + Aku .req x23 + Ama .req x4 + Ame .req x9 + Ami .req x14 + Amo .req x19 + Amu .req x24 + Asa .req x5 + Ase .req x10 + Asi .req x15 + Aso .req x20 + Asu .req x25 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + Aba_ .req x30 + Abe_ .req x28 + Abi_ .req x11 + Abo_ .req x16 + Abu_ .req x21 + Aga_ .req x3 + Age_ .req x8 + Agi_ .req x12 + Ago_ .req x17 + Agu_ .req x22 + Aka_ .req x4 + Ake_ .req x9 + Aki_ .req x13 + Ako_ .req x18 + Aku_ .req x23 + Ama_ .req x5 + Ame_ .req x10 + Ami_ .req x14 + Amo_ .req x19 + Amu_ .req x24 + Asa_ .req x1 + Ase_ .req x6 + Asi_ .req x15 + Aso_ .req x20 + Asu_ .req x25 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + C0 .req x30 + E0 .req x29 + C1 .req x26 + E1 .req x0 + C2 .req x27 + E2 .req x26 + C3 .req x28 + E3 .req x27 + C4 .req x29 + E4 .req x28 + + tmp .req x0 + +/************************ MACROS ****************************/ + +#define STACK_SIZE (16*6 + 3*8 + 8) // GPRs (16*6), count (8), const (8), input (8), out_count (8) +#define STACK_BASE_GPRS (3*8+8) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) +#define STACK_OFFSET_OUTCOUNT (3*8) + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro save reg, offset + str \reg, [sp, #\offset] +.endm + +.macro restore reg, offset + ldr \reg, [sp, #\offset] +.endm + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro 
restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro keccak_f1600_round_initial + ldr Aku, [input_addr, 8*(4*(14) )] + ldr Ama, [input_addr, 8*(4*(14+1))] + ldr Asa, [input_addr, 8*(4*(20) )] + ldr Ase, [input_addr, 8*(4*(20+1))] + eor C0, Ama, Asa + ldr Ame, [input_addr, 8*(4*(16) )] + ldr Ami, [input_addr, 8*(4*(16+1))] + eor C1, Ame, Ase + ldr Asi, [input_addr, 8*(4*(22) )] + ldr Aso, [input_addr, 8*(4*(22+1))] + eor C2, Ami, Asi + ldr Amo, [input_addr, 8*(4*(18) )] + ldr Amu, [input_addr, 8*(4*(18+1))] + eor C3, Amo, Aso + ldr Asu, [input_addr, #(4*8*24)] + eor C4, Amu, Asu + ldr Aka, [input_addr, 8*(4*(10) )] + ldr Ake, [input_addr, 8*(4*(10+1))] + eor C0, Aka, C0 + eor C1, Ake, C1 + ldr Aki, [input_addr, 8*(4*(12) )] + ldr Ako, [input_addr, 8*(4*(12+1))] + eor C2, Aki, C2 + ldr Abu, [input_addr, 8*(4*(4))] + ldr Aga, [input_addr, 8*(4*(4+1) )] + eor C3, Ako, C3 + eor C4, Aku, C4 + ldr Age, [input_addr, 8*(4*(6))] + ldr Agi, [input_addr, 8*(4*(6+1) )] + eor C0, Aga, C0 + ldr Ago, [input_addr, 8*(4*(8))] + ldr Agu, [input_addr, 8*(4*(8+1) )] + eor C1, Age, C1 + ldr Aba, [input_addr, 8*(4*(0) )] + ldr Abe, [input_addr, 8*(4*(0+1) )] + eor C2, Agi, C2 + ldr Abi, [input_addr, 8*(4*(2))] + ldr Abo, [input_addr, 8*(4*(2+1) )] + eor C3, Ago, C3 + save input_addr, STACK_OFFSET_INPUT + eor C4, Agu, C4 + eor C0, Aba, C0 + eor C1, Abe, C1 + eor C2, Abi, C2 + eor C3, Abo, C3 + eor C4, Abu, C4 + + eor E1, C0, C2, ROR #63 + eor E3, C2, C4, ROR #63 + eor E0, C4, C1, ROR #63 + eor E2, C1, C3, ROR #63 + eor E4, C3, C0, ROR #63 + + eor Aba_, Aba, E0 + eor Asa_, Abi, E2 + eor Abi_, Aki, E2 + eor Aki_, Ako, E3 + eor Ako_, Amu, E4 + eor Amu_, Aso, E3 + eor Aso_, Ama, E0 + eor Aka_, Abe, E1 + eor Ase_, Ago, E3 + eor Ago_, 
Ame, E1 + eor Ake_, Agi, E2 + eor Agi_, Aka, E0 + eor Aga_, Abo, E3 + eor Abo_, Amo, E3 + eor Amo_, Ami, E2 + eor Ami_, Ake, E1 + eor Age_, Agu, E4 + eor Agu_, Asi, E2 + eor Asi_, Aku, E4 + eor Aku_, Asa, E0 + eor Ama_, Abu, E4 + eor Abu_, Asu, E4 + eor Asu_, Ase, E1 + eor Ame_, Aga, E0 + eor Abe_, Age, E1 + + load_constant_ptr + + tmp0 .req x0 + tmp1 .req x29 + + bic tmp0, Agi_, Age_, ROR #47 + bic tmp1, Ago_, Agi_, ROR #42 + eor Aga, tmp0, Aga_, ROR #39 + bic tmp0, Agu_, Ago_, ROR #16 + eor Age, tmp1, Age_, ROR #25 + bic tmp1, Aga_, Agu_, ROR #31 + eor Agi, tmp0, Agi_, ROR #58 + bic tmp0, Age_, Aga_, ROR #56 + eor Ago, tmp1, Ago_, ROR #47 + bic tmp1, Aki_, Ake_, ROR #19 + eor Agu, tmp0, Agu_, ROR #23 + bic tmp0, Ako_, Aki_, ROR #47 + eor Aka, tmp1, Aka_, ROR #24 + bic tmp1, Aku_, Ako_, ROR #10 + eor Ake, tmp0, Ake_, ROR #2 + bic tmp0, Aka_, Aku_, ROR #47 + eor Aki, tmp1, Aki_, ROR #57 + bic tmp1, Ake_, Aka_, ROR #5 + eor Ako, tmp0, Ako_, ROR #57 + bic tmp0, Ami_, Ame_, ROR #38 + eor Aku, tmp1, Aku_, ROR #52 + bic tmp1, Amo_, Ami_, ROR #5 + eor Ama, tmp0, Ama_, ROR #47 + bic tmp0, Amu_, Amo_, ROR #41 + eor Ame, tmp1, Ame_, ROR #43 + bic tmp1, Ama_, Amu_, ROR #35 + eor Ami, tmp0, Ami_, ROR #46 + bic tmp0, Ame_, Ama_, ROR #9 + + str const_addr, [sp, #(STACK_OFFSET_CONST)] + ldr cur_const, [const_addr] + + eor Amo, tmp1, Amo_, ROR #12 + bic tmp1, Asi_, Ase_, ROR #48 + eor Amu, tmp0, Amu_, ROR #44 + bic tmp0, Aso_, Asi_, ROR #2 + eor Asa, tmp1, Asa_, ROR #41 + bic tmp1, Asu_, Aso_, ROR #25 + eor Ase, tmp0, Ase_, ROR #50 + bic tmp0, Asa_, Asu_, ROR #60 + eor Asi, tmp1, Asi_, ROR #27 + bic tmp1, Ase_, Asa_, ROR #57 + eor Aso, tmp0, Aso_, ROR #21 + + mov count, #1 + + bic tmp0, Abi_, Abe_, ROR #63 + eor Asu, tmp1, Asu_, ROR #53 + bic tmp1, Abo_, Abi_, ROR #42 + eor Aba, Aba_, tmp0, ROR #21 + bic tmp0, Abu_, Abo_, ROR #57 + eor Abe, tmp1, Abe_, ROR #41 + bic tmp1, Aba_, Abu_, ROR #50 + eor Abi, tmp0, Abi_, ROR #35 + bic tmp0, Abe_, Aba_, ROR #44 + eor Abo, tmp1, Abo_, ROR 
#43 + eor Abu, tmp0, Abu_, ROR #30 + + eor Aba, Aba, cur_const + save count, STACK_OFFSET_COUNT + +.endm + + +.macro keccak_f1600_round_noninitial + + eor C2, Asi, Abi, ROR #52 + eor C0, Aba, Aga, ROR #61 + eor C4, Aku, Agu, ROR #50 + eor C1, Ake, Ame, ROR #57 + eor C3, Abo, Ako, ROR #63 + eor C2, C2, Aki, ROR #48 + eor C0, C0, Ama, ROR #54 + eor C4, C4, Amu, ROR #34 + eor C1, C1, Abe, ROR #51 + eor C3, C3, Amo, ROR #37 + eor C2, C2, Ami, ROR #10 + eor C0, C0, Aka, ROR #39 + eor C4, C4, Abu, ROR #26 + eor C1, C1, Ase, ROR #31 + eor C3, C3, Ago, ROR #36 + eor C2, C2, Agi, ROR #5 + eor C0, C0, Asa, ROR #25 + eor C4, C4, Asu, ROR #15 + eor C1, C1, Age, ROR #27 + eor C3, C3, Aso, ROR #2 + + eor E1, C0, C2, ROR #61 + ror C2, C2, 62 + eor E3, C2, C4, ROR #57 + ror C4, C4, 58 + eor E0, C4, C1, ROR #55 + ror C1, C1, 56 + eor E2, C1, C3, ROR #63 + eor E4, C3, C0, ROR #63 + + eor Aba_, E0, Aba + eor Asa_, E2, Abi, ROR #50 + eor Abi_, E2, Aki, ROR #46 + eor Aki_, E3, Ako, ROR #63 + eor Ako_, E4, Amu, ROR #28 + eor Amu_, E3, Aso, ROR #2 + eor Aso_, E0, Ama, ROR #54 + eor Aka_, E1, Abe, ROR #43 + eor Ase_, E3, Ago, ROR #36 + eor Ago_, E1, Ame, ROR #49 + eor Ake_, E2, Agi, ROR #3 + eor Agi_, E0, Aka, ROR #39 + eor Aga_, E3, Abo + eor Abo_, E3, Amo, ROR #37 + eor Amo_, E2, Ami, ROR #8 + eor Ami_, E1, Ake, ROR #56 + eor Age_, E4, Agu, ROR #44 + eor Agu_, E2, Asi, ROR #62 + eor Asi_, E4, Aku, ROR #58 + eor Aku_, E0, Asa, ROR #25 + eor Ama_, E4, Abu, ROR #20 + eor Abu_, E4, Asu, ROR #9 + eor Asu_, E1, Ase, ROR #23 + eor Ame_, E0, Aga, ROR #61 + eor Abe_, E1, Age, ROR #19 + + load_constant_ptr_stack + restore count, STACK_OFFSET_COUNT + + tmp0 .req x0 + tmp1 .req x29 + + bic tmp0, Agi_, Age_, ROR #47 + bic tmp1, Ago_, Agi_, ROR #42 + eor Aga, tmp0, Aga_, ROR #39 + bic tmp0, Agu_, Ago_, ROR #16 + eor Age, tmp1, Age_, ROR #25 + bic tmp1, Aga_, Agu_, ROR #31 + eor Agi, tmp0, Agi_, ROR #58 + bic tmp0, Age_, Aga_, ROR #56 + eor Ago, tmp1, Ago_, ROR #47 + bic tmp1, Aki_, Ake_, ROR #19 + 
eor Agu, tmp0, Agu_, ROR #23 + bic tmp0, Ako_, Aki_, ROR #47 + eor Aka, tmp1, Aka_, ROR #24 + bic tmp1, Aku_, Ako_, ROR #10 + eor Ake, tmp0, Ake_, ROR #2 + bic tmp0, Aka_, Aku_, ROR #47 + eor Aki, tmp1, Aki_, ROR #57 + bic tmp1, Ake_, Aka_, ROR #5 + eor Ako, tmp0, Ako_, ROR #57 + bic tmp0, Ami_, Ame_, ROR #38 + eor Aku, tmp1, Aku_, ROR #52 + bic tmp1, Amo_, Ami_, ROR #5 + eor Ama, tmp0, Ama_, ROR #47 + bic tmp0, Amu_, Amo_, ROR #41 + eor Ame, tmp1, Ame_, ROR #43 + bic tmp1, Ama_, Amu_, ROR #35 + eor Ami, tmp0, Ami_, ROR #46 + bic tmp0, Ame_, Ama_, ROR #9 + + ldr cur_const, [const_addr, count, UXTW #3] + + eor Amo, tmp1, Amo_, ROR #12 + bic tmp1, Asi_, Ase_, ROR #48 + eor Amu, tmp0, Amu_, ROR #44 + bic tmp0, Aso_, Asi_, ROR #2 + eor Asa, tmp1, Asa_, ROR #41 + bic tmp1, Asu_, Aso_, ROR #25 + eor Ase, tmp0, Ase_, ROR #50 + bic tmp0, Asa_, Asu_, ROR #60 + eor Asi, tmp1, Asi_, ROR #27 + bic tmp1, Ase_, Asa_, ROR #57 + eor Aso, tmp0, Aso_, ROR #21 + bic tmp0, Abi_, Abe_, ROR #63 + add count, count, #1 + save count, STACK_OFFSET_COUNT + eor Asu, tmp1, Asu_, ROR #53 + bic tmp1, Abo_, Abi_, ROR #42 + eor Aba, Aba_, tmp0, ROR #21 + bic tmp0, Abu_, Abo_, ROR #57 + eor Abe, tmp1, Abe_, ROR #41 + bic tmp1, Aba_, Abu_, ROR #50 + eor Abi, tmp0, Abi_, ROR #35 + bic tmp0, Abe_, Aba_, ROR #44 + eor Abo, tmp1, Abo_, ROR #43 + eor Abu, tmp0, Abu_, ROR #30 + + eor Aba, Aba, cur_const + +.endm + +.macro final_rotate_store + ror Aga, Aga,#(64-3) + restore input_addr, STACK_OFFSET_INPUT + ror Abu, Abu,#(64-44) + ror Aka, Aka,#(64-25) + ror Ake, Ake,#(64-8) + str Abu, [input_addr, 8*(4*(4))] + str Aga, [input_addr, 8*(4*(4+1) )] + ror Ama, Ama,#(64-10) + ror Aku, Aku,#(64-6) + str Aka, [input_addr, 8*(4*(10) )] + str Ake, [input_addr, 8*(4*(10+1))] + ror Asa, Asa,#(64-39) + ror Ase, Ase,#(64-41) + str Aku, [input_addr, 8*(4*(14) )] + str Ama, [input_addr, 8*(4*(14+1))] + ror Abe, Abe,#(64-21) + ror Age, Age,#(64-45) + str Asa, [input_addr, 8*(4*(20) )] + str Ase, [input_addr, 8*(4*(20+1))] 
+ ror Agi, Agi,#(64-61) + str Aba, [input_addr, 8*(4*(0) )] + str Abe, [input_addr, 8*(4*(0+1) )] + ror Ame, Ame,#(64-15) + ror Ami, Ami,#(64-56) + str Age, [input_addr, 8*(4*(6))] + str Agi, [input_addr, 8*(4*(6+1) )] + ror Abi, Abi,#(64-14) + ror Aki, Aki,#(64-18) + str Ame, [input_addr, 8*(4*(16) )] + str Ami, [input_addr, 8*(4*(16+1))] + ror Ako, Ako,#(64-1) + str Abi, [input_addr, 8*(4*(2))] + str Abo, [input_addr, 8*(4*(2+1) )] + ror Asi, Asi,#(64-2) + ror Aso, Aso,#(64-62) + str Aki, [input_addr, 8*(4*(12) )] + str Ako, [input_addr, 8*(4*(12+1))] + ror Ago, Ago,#(64-28) + ror Agu, Agu,#(64-20) + str Asi, [input_addr, 8*(4*(22) )] + str Aso, [input_addr, 8*(4*(22+1))] + ror Amo, Amo,#(64-27) + ror Amu, Amu,#(64-36) + str Ago, [input_addr, 8*(4*(8))] + str Agu, [input_addr, 8*(4*(8+1) )] + ror Asu, Asu,#(64-55) + str Amo, [input_addr, 8*(4*(18) )] + str Amu, [input_addr, 8*(4*(18+1))] + str Asu, [input_addr, #(4*8*24)] +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.text +.balign 16 +.global keccak_f1600_x4_scalar_asm_v5 +.global _keccak_f1600_x4_scalar_asm_v5 + +.macro load_constant_ptr_stack + ldr const_addr, [sp, #(STACK_OFFSET_CONST)] +.endm +keccak_f1600_x4_scalar_asm_v5: +_keccak_f1600_x4_scalar_asm_v5: + alloc_stack + save_gprs + + mov out_count, #4 // four passes, one per 64-bit lane of the interleaved input +1: + save out_count, STACK_OFFSET_OUTCOUNT + + keccak_f1600_round_initial +loop: + keccak_f1600_round_noninitial + cmp count, #(KECCAK_F1600_ROUNDS-1) // count = rounds completed so far + ble loop + + final_rotate_store + add input_addr, input_addr, #8 // next lane: state words are 4*8 bytes apart + + restore out_count, STACK_OFFSET_OUTCOUNT + sub out_count, out_count, #1 + cbnz out_count, 1b + + + restore_gprs + free_stack + ret diff --git a/tests/keccak_neon/manual/keccak_f1600_x4_v84a_asm_v1p0.s b/tests/keccak_neon/manual/keccak_f1600_x4_v84a_asm_v1p0.s new file mode 100644 index 0000000..acce4c7 --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x4_v84a_asm_v1p0.s @@ -0,0 +1,452 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * 
SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +#if defined(__ARM_FEATURE_SHA3) + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x1 + count .req x2 + cur_const .req x3 + out_count .req x4 + + /* Mapping of Keccak-f1600 state to vector registers + * at the beginning and end of each round. 
*/ + Aba .req v0 + Abe .req v1 + Abi .req v2 + Abo .req v3 + Abu .req v4 + Aga .req v5 + Age .req v6 + Agi .req v7 + Ago .req v8 + Agu .req v9 + Aka .req v10 + Ake .req v11 + Aki .req v12 + Ako .req v13 + Aku .req v14 + Ama .req v15 + Ame .req v16 + Ami .req v17 + Amo .req v18 + Amu .req v19 + Asa .req v20 + Ase .req v21 + Asi .req v22 + Aso .req v23 + Asu .req v24 + + /* q-form of the above mapping */ + Abaq .req q0 + Abeq .req q1 + Abiq .req q2 + Aboq .req q3 + Abuq .req q4 + Agaq .req q5 + Ageq .req q6 + Agiq .req q7 + Agoq .req q8 + Aguq .req q9 + Akaq .req q10 + Akeq .req q11 + Akiq .req q12 + Akoq .req q13 + Akuq .req q14 + Amaq .req q15 + Ameq .req q16 + Amiq .req q17 + Amoq .req q18 + Amuq .req q19 + Asaq .req q20 + Aseq .req q21 + Asiq .req q22 + Asoq .req q23 + Asuq .req q24 + + Abaz .req z0 + Abez .req z1 + Abiz .req z2 + Aboz .req z3 + Abuz .req z4 + Agaz .req z5 + Agez .req z6 + Agiz .req z7 + Agoz .req z8 + Aguz .req z9 + Akaz .req z10 + Akez .req z11 + Akiz .req z12 + Akoz .req z13 + Akuz .req z14 + Amaz .req z15 + Amez .req z16 + Amiz .req z17 + Amoz .req z18 + Amuz .req z19 + Asaz .req z20 + Asez .req z21 + Asiz .req z22 + Asoz .req z23 + Asuz .req z24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v25 + C1 .req v26 + C2 .req v27 + C3 .req v28 + C4 .req v29 + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req C4 + E1 .req C0 + E2 .req C1 + E3 .req C2 + E4 .req C3 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + Abi_ .req v2 + Abo_ .req v3 + Abu_ .req v4 + Aga_ .req v10 + Age_ .req v11 + Agi_ .req v7 + Ago_ .req v8 + Agu_ .req v9 + Aka_ .req v15 + Ake_ .req v16 + Aki_ .req v12 + Ako_ .req v13 + Aku_ .req v14 + Ama_ .req v20 + Ame_ .req v21 + Ami_ .req v17 + Amo_ .req v18 + Amu_ .req v19 + Asa_ .req v0 + Ase_ .req v1 + Asi_ .req v22 + Aso_ .req v23 + Asu_ .req v24 + Aba_ .req v30 + Abe_ .req E0 + +/************************ MACROS ****************************/ + +.macro load_input + ldr Abaq, 
[input_addr, #(4*8*0)] + ldr Abeq, [input_addr, #(4*8*1)] + ldr Abiq, [input_addr, #(4*8*2)] + ldr Aboq, [input_addr, #(4*8*3)] + ldr Abuq, [input_addr, #(4*8*4)] + ldr Agaq, [input_addr, #(4*8*5)] + ldr Ageq, [input_addr, #(4*8*6)] + ldr Agiq, [input_addr, #(4*8*7)] + ldr Agoq, [input_addr, #(4*8*8)] + ldr Aguq, [input_addr, #(4*8*9)] + ldr Akaq, [input_addr, #(4*8*10)] + ldr Akeq, [input_addr, #(4*8*11)] + ldr Akiq, [input_addr, #(4*8*12)] + ldr Akoq, [input_addr, #(4*8*13)] + ldr Akuq, [input_addr, #(4*8*14)] + ldr Amaq, [input_addr, #(4*8*15)] + ldr Ameq, [input_addr, #(4*8*16)] + ldr Amiq, [input_addr, #(4*8*17)] + ldr Amoq, [input_addr, #(4*8*18)] + ldr Amuq, [input_addr, #(4*8*19)] + ldr Asaq, [input_addr, #(4*8*20)] + ldr Aseq, [input_addr, #(4*8*21)] + ldr Asiq, [input_addr, #(4*8*22)] + ldr Asoq, [input_addr, #(4*8*23)] + ldr Asuq, [input_addr, #(4*8*24)] +.endm + +.macro store_input + str Abaq, [input_addr, #(4*8*0)] + str Abeq, [input_addr, #(4*8*1)] + str Abiq, [input_addr, #(4*8*2)] + str Aboq, [input_addr, #(4*8*3)] + str Abuq, [input_addr, #(4*8*4)] + str Agaq, [input_addr, #(4*8*5)] + str Ageq, [input_addr, #(4*8*6)] + str Agiq, [input_addr, #(4*8*7)] + str Agoq, [input_addr, #(4*8*8)] + str Aguq, [input_addr, #(4*8*9)] + str Akaq, [input_addr, #(4*8*10)] + str Akeq, [input_addr, #(4*8*11)] + str Akiq, [input_addr, #(4*8*12)] + str Akoq, [input_addr, #(4*8*13)] + str Akuq, [input_addr, #(4*8*14)] + str Amaq, [input_addr, #(4*8*15)] + str Ameq, [input_addr, #(4*8*16)] + str Amiq, [input_addr, #(4*8*17)] + str Amoq, [input_addr, #(4*8*18)] + str Amuq, [input_addr, #(4*8*19)] + str Asaq, [input_addr, #(4*8*20)] + str Aseq, [input_addr, #(4*8*21)] + str Asiq, [input_addr, #(4*8*22)] + str Asoq, [input_addr, #(4*8*23)] + str Asuq, [input_addr, #(4*8*24)] +.endm + +#define STACK_SIZE (16*4 + 16*6 + 16*5) // VREGS (16*4) + GPRS (16*6, TODO: Remove) + vector temporaries (16*5) + +#define STACK_BASE_GPRS (16*4) +#define STACK_BASE_VTMP (16*4 + 16*6) + +#define save(name)\ + str name ## q, 
[sp, #(STACK_BASE_VTMP + 16*(name ## _offset))] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_VTMP + 16*(name ## _offset))] + +#define Aga_offset 0 +#define Age_offset 1 +#define Agi_offset 2 +#define Ago_offset 3 +#define Agu_offset 4 + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro save_vregs + stp d8, d9, [sp, #(16*0)] + stp d10, d11, [sp, #(16*1)] + stp d12, d13, [sp, #(16*2)] + stp d14, d15, [sp, #(16*3)] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #(16*0)] + ldp d10, d11, [sp, #(16*1)] + ldp d12, d13, [sp, #(16*2)] + ldp d14, d15, [sp, #(16*3)] +.endm + +/* Macros using v8.4-A SHA-3 instructions */ + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b +.endm + +.macro eor3_m0 d s0 s1 s2 + eor3 \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro rax1_m0 d s0 s1 + rax1 \d\().2d, \s0\().2d, \s1\().2d +.endm + +.macro xar_m0 d s0 s1 imm + xar \d\().2d, \s0\().2d, \s1\().2d, #\imm +.endm + +.macro rax1_m1 d s0 s1 + xar_m0 tmp, vzr, \s1, 63 + eor \d\().16b, \s0\().16b, tmp.16b +.endm + +.macro bcax_m0 d s0 s1 s2 + bcax \d\().16b, \s0\().16b, \s1\().16b, \s2\().16b +.endm + +.macro bcax_m2 d s0 s1 s2 + bcax \d\()z.d, \s0\()z.d, \s1\()z.d, \s2\()z.d +.endm + +/* Keccak-f1600 round */ + +.macro keccak_f1600_round + + eor3_m0 C2, Ami, Agi, Aki + eor3_m0 C0, Ama, Aga, Aka + eor3_m0 C1, Ame, Age, Ake + eor3_m0 C3, Amo, Ago, Ako + eor3_m0 C4, Asu, Agu, Aku + + vzr .req v31 + movi vzr.2d, #0 + + eor3_m0 C2, C2, Abi, Asi + save(Agi) SEP C1r .req Agi + eor3_m0 C0, C0, Aba, Asa + eor3_m0 C1, C1, Abe, Ase + save(Agu) SEP C3r .req Agu + eor3_m0 C3, C3, Abo, Aso + eor3_m0 C4, C4, Amu, Abu + + save(Ago) SEP C2r .req Ago + xar_m0 C1r, vzr, C1, 63 + xar_m0 C3r, vzr, C3, 63 + save(Aga) SEP C4r .req Aga + xar_m0 C2r, vzr, C2, 63 + xar_m0 C4r, vzr, C4, 63 + save(Age) SEP C0r .req Age + eor2 E0, C4, C1r + xar_m0 C0r, vzr, C0, 63 + eor2 E2, C1, C3r + eor2 E1, C0, C2r + 
restore(Agu) // C3r + eor2 E3, C2, C4r + eor2 E4, C3, C0r + restore(Ago) // C2r + restore(Agi) // C1r/Cor + + eor Aba_.16b, Aba.16b, E0.16b + xar_m0 Asa_, Abi, E2, 2 + restore(Aga) // C4r + xar_m0 Abi_, Aki, E2, 21 + xar_m0 Aki_, Ako, E3, 39 + restore(Age) // C0r + xar_m0 Ako_, Amu, E4, 56 + xar_m0 Amu_, Aso, E3, 8 + xar_m0 Aso_, Ama, E0, 23 + xar_m0 Aka_, Abe, E1, 63 + xar_m0 Ase_, Ago, E3, 9 + xar_m0 Ago_, Ame, E1, 19 + xar_m0 Ake_, Agi, E2, 58 + xar_m0 Agi_, Aka, E0, 61 + xar_m0 Aga_, Abo, E3, 36 + xar_m0 Abo_, Amo, E3, 43 + xar_m0 Amo_, Ami, E2, 49 + xar_m0 Ami_, Ake, E1, 54 + xar_m0 Age_, Agu, E4, 44 + xar_m0 Agu_, Asi, E2, 3 + xar_m0 Asi_, Aku, E4, 25 + xar_m0 Aku_, Asa, E0, 46 + xar_m0 Ama_, Abu, E4, 37 + xar_m0 Abu_, Asu, E4, 50 + xar_m0 Asu_, Ase, E1, 62 + xar_m0 Ame_, Aga, E0, 28 + xar_m0 Abe_, Age, E1, 20 + + ld1r {v31.2d}, [const_addr], #8 + + bcax_m0 Aga, Aga_, Agi_, Age_ + bcax_m0 Age, Age_, Ago_, Agi_ + bcax_m0 Agi, Agi_, Agu_, Ago_ + bcax_m0 Ago, Ago_, Aga_, Agu_ + bcax_m0 Agu, Agu_, Age_, Aga_ + bcax_m0 Aka, Aka_, Aki_, Ake_ + bcax_m0 Ake, Ake_, Ako_, Aki_ + bcax_m0 Aki, Aki_, Aku_, Ako_ + bcax_m0 Ako, Ako_, Aka_, Aku_ + bcax_m0 Aku, Aku_, Ake_, Aka_ + bcax_m0 Ama, Ama_, Ami_, Ame_ + bcax_m0 Ame, Ame_, Amo_, Ami_ + bcax_m0 Ami, Ami_, Amu_, Amo_ + bcax_m0 Amo, Amo_, Ama_, Amu_ + bcax_m0 Amu, Amu_, Ame_, Ama_ + bcax_m0 Asa, Asa_, Asi_, Ase_ + bcax_m0 Ase, Ase_, Aso_, Asi_ + bcax_m0 Asi, Asi_, Asu_, Aso_ + bcax_m0 Aso, Aso_, Asa_, Asu_ + bcax_m0 Asu, Asu_, Ase_, Asa_ + bcax_m0 Aba, Aba_, Abi_, Abe_ + bcax_m0 Abe, Abe_, Abo_, Abi_ + bcax_m0 Abi, Abi_, Abu_, Abo_ + bcax_m0 Abo, Abo_, Aba_, Abu_ + bcax_m0 Abu, Abu_, Abe_, Aba_ + + // iota step + eor Aba.16b, Aba.16b, v31.16b + +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.text +.align 4 +.global keccak_f1600_x4_v84a_asm_v1p0 +.global _keccak_f1600_x4_v84a_asm_v1p0 + +keccak_f1600_x4_v84a_asm_v1p0: +_keccak_f1600_x4_v84a_asm_v1p0: + alloc_stack + save_vregs + + mov out_count, #2 +1: + load_constant_ptr + 
load_input + mov count, #(KECCAK_F1600_ROUNDS) +2: + keccak_f1600_round + sub count, count, #1 + cbnz count, 2b + + store_input + add input_addr, input_addr, #16 // q-register loads/stores cover two 64-bit lanes per pass + + sub out_count, out_count, #1 + cbnz out_count, 1b + + restore_vregs + free_stack + ret + +#endif diff --git a/tests/keccak_neon/manual/keccak_f1600_x5_hybrid_asm_v8.s b/tests/keccak_neon/manual/keccak_f1600_x5_hybrid_asm_v8.s new file mode 100644 index 0000000..b26e3fa --- /dev/null +++ b/tests/keccak_neon/manual/keccak_f1600_x5_hybrid_asm_v8.s @@ -0,0 +1,1635 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +#include "macros.s" + +/********************** CONSTANTS *************************/ + .data + .align(8) // NOTE(review): .align takes a power-of-two exponent on this target, so this requests 256-byte alignment - confirm that is intended +round_constants: // 24 Keccak-f1600 round constants, one 64-bit word per round + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 +round_constants_vec: // the same 24 constants, each stored twice so one 16-byte load fills both 64-bit halves of a vector register + .quad 0x0000000000000001 + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad
0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + .quad 0x8000000080008008 +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 // x0 is reused as s_Aba_ and sC0 by the scalar round code + const_addr .req x29 // x29 is also sE0 and sC4 + count .req w27 // w27 is also sC2 and sE3 (x27), the round macros spill count to the stack while those are live + out_count .req w27 // same register as count + cur_const .req x26 // x26 is also sC1 and sE2 + + /* Mapping of Keccak-f1600 SIMD state to vector registers + * at the beginning and end of each round. */ + + /* v-form (vector view) of the mapping, the q-form aliases + * of the same registers follow below. */ + vAba .req v0 + vAbe .req v1 + vAbi .req v2 + vAbo .req v3 + vAbu .req v4 + vAga .req v5 + vAge .req v6 + vAgi .req v7 + vAgo .req v8 + vAgu .req v9 + vAka .req v10 + vAke .req v11 + vAki .req v12 + vAko .req v13 + vAku .req v14 + vAma .req v15 + vAme .req v16 + vAmi .req v17 + vAmo .req v18 + vAmu .req v19 + vAsa .req v20 + vAse .req v21 + vAsi .req v22 + vAso .req v23 + vAsu .req v24 + + /* q-form of the above mapping */ + vAbaq .req q0 + vAbeq .req q1 + vAbiq .req q2 + vAboq .req q3 + vAbuq .req q4 + vAgaq .req q5 + vAgeq .req q6 + vAgiq .req q7 + vAgoq .req q8 + vAguq .req q9 + vAkaq .req q10 + vAkeq .req q11 + vAkiq .req q12 + vAkoq .req q13 + vAkuq .req q14 + vAmaq .req q15 + vAmeq .req q16 + vAmiq .req q17 + vAmoq .req q18 + vAmuq .req q19 + vAsaq .req q20 + vAseq .req q21 + vAsiq .req q22 + vAsoq .req q23 + vAsuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v27 + C1 .req v28 + C2 .req v29 + C3 .req v30 + C4 .req v31 + + C0q .req q27 + C1q .req q28 + C2q .req q29 + C3q .req q30 + C4q .req q31 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vBba .req v25 // fresh + vBbe .req v26 // fresh + vBbi .req vAbi + vBbo .req vAbo + vBbu .req vAbu + vBga .req vAka + vBge .req vAke + vBgi .req vAgi + vBgo .req vAgo + vBgu .req vAgu + vBka .req vAma + vBke .req vAme + vBki .req vAki + vBko .req vAko + vBku .req vAku + vBma .req vAsa + vBme .req vAse + vBmi .req vAmi + vBmo
.req vAmo + vBmu .req vAmu + vBsa .req vAba + vBse .req vAbe + vBsi .req vAsi + vBso .req vAso + vBsu .req vAsu + + vBbaq .req q25 // fresh + vBbeq .req q26 // fresh + vBbiq .req vAbiq + vBboq .req vAboq + vBbuq .req vAbuq + vBgaq .req vAkaq + vBgeq .req vAkeq + vBgiq .req vAgiq + vBgoq .req vAgoq + vBguq .req vAguq + vBkaq .req vAmaq + vBkeq .req vAmeq + vBkiq .req vAkiq + vBkoq .req vAkoq + vBkuq .req vAkuq + vBmaq .req vAsaq + vBmeq .req vAseq + vBmiq .req vAmiq + vBmoq .req vAmoq + vBmuq .req vAmuq + vBsaq .req vAbaq + vBseq .req vAbeq + vBsiq .req vAsiq + vBsoq .req vAsoq + vBsuq .req vAsuq + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req C4 + E1 .req C0 + E2 .req vBbe // fresh + E3 .req C2 + E4 .req C3 + + E0q .req C4q + E1q .req C0q + E2q .req vBbeq // fresh + E3q .req C2q + E4q .req C3q + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round. */ + s_Aba .req x1 + sAbe .req x6 + sAbi .req x11 + sAbo .req x16 + sAbu .req x21 + sAga .req x2 + sAge .req x7 + sAgi .req x12 + sAgo .req x17 + sAgu .req x22 + sAka .req x3 + sAke .req x8 + sAki .req x13 + sAko .req x18 + sAku .req x23 + sAma .req x4 + sAme .req x9 + sAmi .req x14 + sAmo .req x19 + sAmu .req x24 + sAsa .req x5 + sAse .req x10 + sAsi .req x15 + sAso .req x20 + sAsu .req x25 + + /* sA_[y,2*x+3*y] = rot(sA[x,y]) */ + s_Aba_ .req x0 + sAbe_ .req x28 + sAbi_ .req x11 + sAbo_ .req x16 + sAbu_ .req x21 + sAga_ .req x3 + sAge_ .req x8 + sAgi_ .req x12 + sAgo_ .req x17 + sAgu_ .req x22 + sAka_ .req x4 + sAke_ .req x9 + sAki_ .req x13 + sAko_ .req x18 + sAku_ .req x23 + sAma_ .req x5 + sAme_ .req x10 + sAmi_ .req x14 + sAmo_ .req x19 + sAmu_ .req x24 + sAsa_ .req x1 + sAse_ .req x6 + sAsi_ .req x15 + sAso_ .req x20 + sAsu_ .req x25 + + /* sC[x] = sA[x,0] xor sA[x,1] xor sA[x,2] xor sA[x,3] xor sA[x,4], for x in 0..4 */ + /* sE[x] = sC[x-1] xor rot(sC[x+1],1), for x in 0..4 */ + sC0 .req x0 + sE0 .req x29 + sC1 .req x26 + sE1 .req x30 + sC2 .req x27 +
sE2 .req x26 + sC3 .req x28 + sE3 .req x27 + sC4 .req x29 + sE4 .req x28 + + tmp .req x30 // x30 is also sE1 + +/************************ MACROS ****************************/ + +.macro eor2 d s0 s1 + eor \d\().16b, \s0\().16b, \s1\().16b // d = s0 ^ s1 +.endm + +.macro eor3_m1 d s0 s1 s2 + eor2 \d, \s0, \s1 + eor2 \d, \d, \s2 // d = s0 ^ s1 ^ s2, built from two plain EORs +.endm + +.macro rax1_m1 d s0 s1 + shl vvtmp.2d, \s1\().2d, #1 // vvtmp = s1 << 1 (vvtmp must be bound with .req by the caller and is clobbered) + sri vvtmp.2d, \s1\().2d, #63 // insert s1 >> 63 into bit 0: vvtmp = rol64(s1, 1) + eor \d\().16b, vvtmp.16b, \s0\().16b // d = s0 ^ rol64(s1, 1) +.endm + + .macro xar_m1 d s0 s1 imm + // d = ror64(s0 ^ s1, imm). Overwrites s0 with s0 ^ s1, so d must not alias s0. For imm == 63 the shift left by 1 is done with an ADD + .if \imm == 63 + eor \s0\().16b, \s0\().16b, \s1\().16b + add \d\().2d, \s0\().2d, \s0\().2d + sri \d\().2d, \s0\().2d, #(63) + .else + eor \s0\().16b, \s0\().16b, \s1\().16b + shl \d\().2d, \s0\().2d, #(64-\imm) + sri \d\().2d, \s0\().2d, #(\imm) + .endif +.endm + +.macro bcax_m1 d s0 s1 s2 + bic vvtmp.16b, \s1\().16b, \s2\().16b // vvtmp = s1 & ~s2 (clobbers vvtmp) + eor \d\().16b, vvtmp.16b, \s0\().16b // d = s0 ^ (s1 & ~s2) +.endm + +.macro load_input_vector + ldp vAbaq, vAbeq, [input_addr, #(16*0)] // 25 lanes, one q register each, read at a 16-byte stride + ldp vAbiq, vAboq, [input_addr, #(16*2)] + ldp vAbuq, vAgaq, [input_addr, #(16*4)] + ldp vAgeq, vAgiq, [input_addr, #(16*6)] + ldp vAgoq, vAguq, [input_addr, #(16*8)] + ldp vAkaq, vAkeq, [input_addr, #(16*10)] + ldp vAkiq, vAkoq, [input_addr, #(16*12)] + ldp vAkuq, vAmaq, [input_addr, #(16*14)] + ldp vAmeq, vAmiq, [input_addr, #(16*16)] + ldp vAmoq, vAmuq, [input_addr, #(16*18)] + ldp vAsaq, vAseq, [input_addr, #(16*20)] + ldp vAsiq, vAsoq, [input_addr, #(16*22)] + ldr vAsuq, [input_addr, #(16*24)] + // Unpaired form of the loads above, kept for reference: + // ldr vAbaq, [input_addr, #(16*0)] + // ldr vAbeq, [input_addr, #(16*1)] + // ldr vAbiq, [input_addr, #(16*2)] + // ldr vAboq, [input_addr, #(16*3)] + // ldr vAbuq, [input_addr, #(16*4)] + // ldr vAgaq, [input_addr, #(16*5)] + // ldr vAgeq, [input_addr, #(16*6)] + // ldr vAgiq, [input_addr, #(16*7)] + // ldr vAgoq, [input_addr, #(16*8)] + // ldr vAguq, [input_addr, #(16*9)] + // ldr vAkaq, [input_addr, #(16*10)] + // ldr vAkeq, [input_addr, #(16*11)] + // ldr vAkiq, [input_addr, #(16*12)] + // ldr vAkoq,
[input_addr, #(16*13)] + // ldr vAkuq, [input_addr, #(16*14)] + // ldr vAmaq, [input_addr, #(16*15)] + // ldr vAmeq, [input_addr, #(16*16)] + // ldr vAmiq, [input_addr, #(16*17)] + // ldr vAmoq, [input_addr, #(16*18)] + // ldr vAmuq, [input_addr, #(16*19)] + // ldr vAsaq, [input_addr, #(16*20)] + // ldr vAseq, [input_addr, #(16*21)] + // ldr vAsiq, [input_addr, #(16*22)] + // ldr vAsoq, [input_addr, #(16*23)] + // ldr vAsuq, [input_addr, #(16*24)] +.endm + +.macro store_input_vector + stp vAbaq, vAbeq, [input_addr, #(16*0)] // writes the 25 q registers back at the same 16-byte stride used by load_input_vector + stp vAbiq, vAboq, [input_addr, #(16*2)] + stp vAbuq, vAgaq, [input_addr, #(16*4)] + stp vAgeq, vAgiq, [input_addr, #(16*6)] + stp vAgoq, vAguq, [input_addr, #(16*8)] + stp vAkaq, vAkeq, [input_addr, #(16*10)] + stp vAkiq, vAkoq, [input_addr, #(16*12)] + stp vAkuq, vAmaq, [input_addr, #(16*14)] + stp vAmeq, vAmiq, [input_addr, #(16*16)] + stp vAmoq, vAmuq, [input_addr, #(16*18)] + stp vAsaq, vAseq, [input_addr, #(16*20)] + stp vAsiq, vAsoq, [input_addr, #(16*22)] + str vAsuq, [input_addr, #(16*24)] + // Unpaired form of the stores above, kept for reference: + // str vAbaq, [input_addr, #(16*0)] + // str vAbeq, [input_addr, #(16*1)] + // str vAbiq, [input_addr, #(16*2)] + // str vAboq, [input_addr, #(16*3)] + // str vAbuq, [input_addr, #(16*4)] + // str vAgaq, [input_addr, #(16*5)] + // str vAgeq, [input_addr, #(16*6)] + // str vAgiq, [input_addr, #(16*7)] + // str vAgoq, [input_addr, #(16*8)] + // str vAguq, [input_addr, #(16*9)] + // str vAkaq, [input_addr, #(16*10)] + // str vAkeq, [input_addr, #(16*11)] + // str vAkiq, [input_addr, #(16*12)] + // str vAkoq, [input_addr, #(16*13)] + // str vAkuq, [input_addr, #(16*14)] + // str vAmaq, [input_addr, #(16*15)] + // str vAmeq, [input_addr, #(16*16)] + // str vAmiq, [input_addr, #(16*17)] + // str vAmoq, [input_addr, #(16*18)] + // str vAmuq, [input_addr, #(16*19)] + // str vAsaq, [input_addr, #(16*20)] + // str vAseq, [input_addr, #(16*21)] + // str vAsiq, [input_addr, #(16*22)] + // str vAsoq, [input_addr, #(16*23)] + // str vAsuq, [input_addr,
#(16*24)] +.endm + +.macro load_input_scalar + ldp s_Aba, sAbe, [input_addr,8*0 ] // 25 lanes, one 64-bit GPR each, read at an 8-byte stride + ldp sAbi, sAbo, [input_addr,8*2 ] + ldp sAbu, sAga, [input_addr,8*4 ] + ldp sAge, sAgi, [input_addr,8*6 ] + ldp sAgo, sAgu, [input_addr,8*8 ] + ldp sAka, sAke, [input_addr,8*10] + ldp sAki, sAko, [input_addr,8*12] + ldp sAku, sAma, [input_addr,8*14] + ldp sAme, sAmi, [input_addr,8*16] + ldp sAmo, sAmu, [input_addr,8*18] + ldp sAsa, sAse, [input_addr,8*20] + ldp sAsi, sAso, [input_addr,8*22] + ldr sAsu, [input_addr,8*24] +.endm + +.macro store_input_scalar + stp s_Aba, sAbe, [input_addr,8*0 ] // writes the 25 GPR lanes back in the order used by load_input_scalar + stp sAbi, sAbo, [input_addr,8*2 ] + stp sAbu, sAga, [input_addr,8*4 ] + stp sAge, sAgi, [input_addr,8*6 ] + stp sAgo, sAgu, [input_addr,8*8 ] + stp sAka, sAke, [input_addr,8*10] + stp sAki, sAko, [input_addr,8*12] + stp sAku, sAma, [input_addr,8*14] + stp sAme, sAmi, [input_addr,8*16] + stp sAmo, sAmu, [input_addr,8*18] + stp sAsa, sAse, [input_addr,8*20] + stp sAsi, sAso, [input_addr,8*22] + str sAsu, [input_addr,8*24] +.endm + +// Stack frame, 224 bytes from sp: [0,64) d8-d15, [64,160) x19-x30, [160,208) six 8-byte scratch GPR slots, [208,224) one 16-byte scratch vector slot +#define STACK_SIZE (4*16 + 12*8 + 6*8 + 16*1) +#define STACK_BASE_VREGS (0) +#define STACK_BASE_GPRS (4*16) +#define STACK_BASE_TMP_GPRS (4*16 + 12*8) +#define STACK_BASE_TMP_VREGS (4*16 + 12*8 + 6*8) +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST (1*8) +#define STACK_OFFSET_COUNT (2*8) +#define STACK_OFFSET_COUNT_OUT (3*8) +#define STACK_OFFSET_CUR_INPUT (4*8) + +#define vAga_offset 0 + +#define save(name) \ + str name ## q, [sp, #(STACK_BASE_TMP_VREGS + 16 * name ## _offset)] +#define restore(name) \ + ldr name ## q, [sp, #(STACK_BASE_TMP_VREGS + 16 * name ## _offset)] + +// The two cpp macros above take one parenthesised vector name and spill its q register. The assembler macros of the same name defined further down take a GPR and a byte offset without parentheses +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)]
+ ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro save_vregs + stp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] // only the low 64 bits (d view) of v8-v15 are spilled + stp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + stp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + stp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] +.endm + +.macro restore_vregs + ldp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] // reloads d8-d15 from the slots written by save_vregs + ldp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + ldp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + ldp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] +.endm + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) // reserve the whole frame at once +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) // release the frame reserved by alloc_stack +.endm + +.macro eor5 dst, src0, src1, src2, src3, src4 + eor \dst, \src0, \src1 + eor \dst, \dst, \src2 + eor \dst, \dst, \src3 + eor \dst, \dst, \src4 // dst = src0 ^ src1 ^ src2 ^ src3 ^ src4 +.endm + +.macro xor_rol dst, src1, src0, imm + eor \dst, \src0, \src1, ROR #(64-\imm) // dst = src0 ^ rol64(src1, imm), note the src1, src0 parameter order +.endm + +.macro bic_rol dst, src1, src0, imm + bic \dst, \src0, \src1, ROR #(64-\imm) // dst = src0 & ~rol64(src1, imm) +.endm + +.macro rotate dst, src, imm + ror \dst, \src, #(64-\imm) // dst = rol64(src, imm) +.endm + +.macro save reg, offset + str \reg, [sp, #(STACK_BASE_TMP_GPRS + \offset)] // spill a GPR to the scratch GPR area at the given byte offset +.endm + +.macro restore reg, offset + ldr \reg, [sp, #(STACK_BASE_TMP_GPRS + \offset)] // reload a GPR spilled by the save macro +.endm + +.macro hybrid_round_initial // two interleaved instruction streams per line: scalar state on the left, NEON state on the right. One expansion carries three scalar rounds (three iota XORs with cur_const) and one NEON round +eor sC0, sAma, sAsa SEP +eor sC1, sAme, sAse SEP eor3_m1 C1,vAbe,vAge,vAke +eor sC2, sAmi, sAsi SEP +eor sC3, sAmo, sAso SEP +eor sC4, sAmu, sAsu SEP +eor sC0, sAka, sC0 SEP eor3_m1 C3,vAbo,vAgo,vAko +eor sC1, sAke, sC1 SEP +eor sC2, sAki, sC2 SEP +eor sC3, sAko, sC3 SEP +eor sC4, sAku, sC4 SEP eor3_m1 C0,vAba,vAga,vAka +eor sC0, sAga, sC0 SEP +eor sC1, sAge, sC1 SEP +eor sC2, sAgi, sC2 SEP +eor sC3, sAgo, sC3 SEP eor3_m1 C2,vAbi,vAgi,vAki +eor sC4, sAgu, sC4 SEP +eor sC0, s_Aba, sC0 SEP +eor sC1, sAbe, sC1 SEP +eor sC2, sAbi, sC2 SEP eor3_m1 C4,vAbu,vAgu,vAku +eor sC3, sAbo, sC3 SEP +eor sC4, sAbu, sC4 SEP
+eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP eor3_m1 C1, C1,vAme, vAse +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, s_Aba, sE0 SEP eor3_m1 C3, C3,vAmo, vAso +eor sAsa_, sAbi, sE2 SEP +eor sAbi_, sAki, sE2 SEP +eor sAki_, sAko, sE3 SEP +eor sAko_, sAmu, sE4 SEP eor3_m1 C0, C0,vAma, vAsa +eor sAmu_, sAso, sE3 SEP +eor sAso_, sAma, sE0 SEP +eor sAka_, sAbe, sE1 SEP +eor sAse_, sAgo, sE3 SEP eor3_m1 C2, C2,vAmi, vAsi +eor sAgo_, sAme, sE1 SEP +eor sAke_, sAgi, sE2 SEP +eor sAgi_, sAka, sE0 SEP +eor sAga_, sAbo, sE3 SEP eor3_m1 C4, C4,vAmu, vAsu +eor sAbo_, sAmo, sE3 SEP +eor sAmo_, sAmi, sE2 SEP vvtmp .req vBba +eor sAmi_, sAke, sE1 SEP +eor sAge_, sAgu, sE4 SEP rax1_m1 E2, C1, C3 +eor sAgu_, sAsi, sE2 SEP +eor sAsi_, sAku, sE4 SEP +eor sAku_, sAsa, sE0 SEP +eor sAma_, sAbu, sE4 SEP rax1_m1 E4, C3, C0 +eor sAbu_, sAsu, sE4 SEP +eor sAsu_, sAse, sE1 SEP +eor sAme_, sAga, sE0 SEP +eor sAbe_, sAge, sE1 SEP rax1_m1 E1, C0, C2 +load_constant_ptr SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP rax1_m1 E3, C2, C4 +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP rax1_m1 E0, C4, C1 +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP .unreq vvtmp +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP vvtmp .req C1 +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP vvtmpq .req C1q +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP eor vBba.16b, vAba.16b, E0.16b +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP xar_m1 vBsa, vAbi, E2, 2 +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP xar_m1 vBbi, vAki, E2, 21 +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, 
sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP xar_m1 vBki, vAko, E3, 39 +eor sAmi, tmp, sAmi_, ROR #46 SEP +ldr cur_const, [const_addr] SEP +mov count, #1 SEP +bic tmp, sAma_, sAmu_, ROR #35 SEP xar_m1 vBko, vAmu, E4, 56 +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP xar_m1 vBmu, vAso, E3, 8 +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP xar_m1 vBso, vAma, E0, 23 +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP xar_m1 vBka, vAbe, E1, 63 +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP xar_m1 vBse, vAgo, E3, 9 +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP xar_m1 vBgo, vAme, E1, 19 +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP +eor s_Aba, s_Aba, cur_const SEP xar_m1 vBke, vAgi, E2, 58 +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP xar_m1 vBgi, vAka, E0, 61 +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP xar_m1 vBga, vAbo, E3, 36 +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP +eor sC0, sAga, sC0, ROR #57 SEP xar_m1 vBbo, vAmo, E3, 43 +eor sC1, sAme, sC1, ROR #58 SEP +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP xar_m1 vBmo, vAmi, E2, 49 +eor sC0, s_Aba, sC0, ROR #61 SEP +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, 
ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP +eor sC4, sAku, sC4, ROR #50 SEP xar_m1 vBmi, vAke, E1, 54 +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP xar_m1 vBge, vAgu, E4, 44 +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP mov E3.16b, vAga.16b +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP bcax_m1 vAga, vBga, vBgi, vBge +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP xar_m1 vBgu, vAsi, E2, 3 +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP xar_m1 vBsi, vAku, E4, 25 +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP xar_m1 vBku, vAsa, E0, 46 +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP xar_m1 vBma, vAbu, E4, 37 +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP xar_m1 vBbu, vAsu, E4, 50 +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP xar_m1 vBsu, vAse, E1, 62 +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP +eor sAgu, tmp, sAgu_, ROR #23 SEP xar_m1 vBme, E3, E0, 28 +bic tmp, sAki_, sAke_, ROR #19 SEP +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP 
+bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP xar_m1 vBbe, vAge, E1, 20 +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP bcax_m1 vAge, vBge, vBgo, vBgi +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP bcax_m1 vAgi, vBgi, vBgu, vBgo +bic tmp, sAma_, sAmu_, ROR #35 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP bcax_m1 vAgo, vBgo, vBga, vBgu +bic tmp, sAsi_, sAse_, ROR #48 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP bcax_m1 vAgu, vBgu, vBge, vBga +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP bcax_m1 vAka, vBka, vBki, vBke +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m1 vAke, vBke, vBko, vBki +eor s_Aba, s_Aba_, tmp, ROR #21 SEP .unreq vvtmp +bic tmp, sAbo_, sAbi_, ROR #42 SEP +eor sAbe, tmp, sAbe_, ROR #41 SEP .unreq vvtmpq +bic tmp, sAbu_, sAbo_, ROR #57 SEP eor2 C0, vAka, vAga +eor sAbi, tmp, sAbi_, ROR #35 SEP vvtmp .req vAga +bic tmp, s_Aba_, sAbu_, ROR #50 SEP save(vAga) +eor sAbo, tmp, sAbo_, ROR #43 SEP vvtmpq .req vAgaq +bic tmp, sAbe_, s_Aba_, ROR #44 SEP bcax_m1 vAki, vBki, vBku, vBko +eor sAbu, tmp, sAbu_, ROR #30 SEP +add count, count, #1 SEP +eor s_Aba, s_Aba, cur_const SEP + SEP +save count, STACK_OFFSET_COUNT SEP bcax_m1 vAko, vBko, vBka, vBku +eor sC0, sAka, sAsa, ROR #50 SEP +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP +eor sC3, sAgo, sAso, ROR #30 SEP eor2 C1, vAke, vAge +eor sC4, sAbu, sAsu, ROR #53 SEP +eor sC0, sAma, 
sC0, ROR #49 SEP bcax_m1 vAku, vBku, vBke, vBka +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP eor2 C2, vAki, vAgi +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP bcax_m1 vAma, vBma, vBmi, vBme +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP eor2 C3, vAko, vAgo +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP bcax_m1 vAme, vBme, vBmo, vBmi +eor sC3, sAbo, sC3, ROR #63 SEP +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP eor2 C4, vAku, vAgu +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP bcax_m1 vAmi, vBmi, vBmu, vBmo +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP eor2 C0, C0, vAma +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP bcax_m1 vAmo, vBmo, vBma, vBmu +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP eor2 C1, C1, vAme +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP bcax_m1 vAmu, vBmu, vBme, vBma +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP eor2 C2, C2, vAmi +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP bcax_m1 vAsa, vBsa, vBsi, vBse +eor sAga_, sE3, sAbo SEP +eor sAbo_, sE3, sAmo, ROR #37 SEP eor2 C3, C3, vAmo +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP bcax_m1 vAse, vBse, vBso, vBsi +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP eor2 C4, C4, vAmu +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP bcax_m1 vAsi, vBsi, vBsu, vBso +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP 
+load_constant_ptr SEP eor2 C0, C0, vAsa +restore count, STACK_OFFSET_COUNT SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP bcax_m1 vAso, vBso, vBsa, vBsu +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP eor2 C1, C1, vAse +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP bcax_m1 vAsu, vBsu, vBse, vBsa +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP eor2 C2, C2, vAsi +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP eor2 C3, C3, vAso +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP bcax_m1 vAba, vBba, vBbi, vBbe +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP bcax_m1 vAbe, vBbe, vBbo, vBbi +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP eor2 C1, C1, vAbe +eor sAme, tmp, sAme_, ROR #43 SEP restore x26, STACK_OFFSET_CONST +bic tmp, sAmu_, sAmo_, ROR #41 SEP ldr vvtmpq, [x26], #16 +eor sAmi, tmp, sAmi_, ROR #46 SEP save x26, STACK_OFFSET_CONST +bic tmp, sAma_, sAmu_, ROR #35 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP eor vAba.16b, vAba.16b, vvtmp.16b +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP eor2 C4, C4, vAsu +bic tmp, sAsi_, sAse_, ROR #48 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP bcax_m1 vAbi, vBbi, vBbu, vBbo +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP bcax_m1 vAbo, vBbo, vBba, vBbu +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP eor2 C3, C3, vAbo +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP eor2 
C2, C2, vAbi +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP eor2 C0, C0, vAba +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP bcax_m1 vAbu, vBbu, vBbe, vBba +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP eor2 C4, C4, vAbu +eor sAbu, tmp, sAbu_, ROR #30 SEP +add count, count, #1 SEP restore(vAga) +eor s_Aba, s_Aba, cur_const SEP + .unreq vvtmp + + .unreq vvtmpq +.endm + +.macro hybrid_round_noninitial + SEP vvtmp .req vBba +save count, STACK_OFFSET_COUNT SEP rax1_m1 E2, C1, C3 +eor sC0, sAka, sAsa, ROR #50 SEP +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP +eor sC0, sAma, sC0, ROR #49 SEP rax1_m1 E4, C3, C0 +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP rax1_m1 E1, C0, C2 +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP rax1_m1 E3, C2, C4 +eor sC3, sAbo, sC3, ROR #63 SEP +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP rax1_m1 E0, C4, C1 +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP .unreq vvtmp +eor sE2, sC1, sC3, ROR #63 SEP vvtmp .req C1 +eor sE4, sC3, sC0, ROR #63 SEP vvtmpq .req C1q +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP eor vBba.16b, vAba.16b, E0.16b +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP xar_m1 vBsa, vAbi, E2, 2 +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 
SEP +eor sAgo_, sE1, sAme, ROR #49 SEP xar_m1 vBbi, vAki, E2, 21 +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP xar_m1 vBki, vAko, E3, 39 +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP xar_m1 vBko, vAmu, E4, 56 +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP xar_m1 vBmu, vAso, E3, 8 +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP xar_m1 vBso, vAma, E0, 23 +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP xar_m1 vBka, vAbe, E1, 63 +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP xar_m1 vBse, vAgo, E3, 9 +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP xar_m1 vBgo, vAme, E1, 19 +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP +bic tmp, sAma_, sAmu_, ROR #35 SEP +ldr cur_const, [const_addr, count, UXTW #3] +add count, count, #1 SEP xar_m1 vBke, vAgi, E2, 58 +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, 
sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP xar_m1 vBgi, vAka, E0, 61 +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP xar_m1 vBga, vAbo, E3, 36 +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP +eor sAbe, tmp, sAbe_, ROR #41 SEP xar_m1 vBbo, vAmo, E3, 43 +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP xar_m1 vBmo, vAmi, E2, 49 +eor s_Aba, s_Aba, cur_const SEP +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP +eor sC3, sAgo, sAso, ROR #30 SEP xar_m1 vBmi, vAke, E1, 54 +eor sC4, sAbu, sAsu, ROR #53 SEP +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP +eor sC0, sAga, sC0, ROR #57 SEP xar_m1 vBge, vAgu, E4, 44 +eor sC1, sAme, sC1, ROR #58 SEP +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP +eor sC1, sAke, sC1, ROR #57 SEP mov E3.16b, vAga.16b +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP bcax_m1 vAga, vBga, vBgi, vBge +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP +ror sC2, sC2, 62 SEP xar_m1 vBgu, vAsi, E2, 3 +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP xar_m1 vBsi, vAku, E4, 25 +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, 
sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP xar_m1 vBku, vAsa, E0, 46 +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP xar_m1 vBma, vAbu, E4, 37 +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP xar_m1 vBbu, vAsu, E4, 50 +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP +eor sAsu_, sE1, sAse, ROR #23 SEP xar_m1 vBsu, vAse, E1, 62 +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP xar_m1 vBme, E3, E0, 28 +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP xar_m1 vBbe, vAge, E1, 20 +bic tmp, sAge_, sAga_, ROR #56 SEP +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP bcax_m1 vAge, vBge, vBgo, vBgi +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP bcax_m1 vAgi, vBgi, vBgu, vBgo +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP bcax_m1 vAgo, vBgo, vBga, vBgu +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP bcax_m1 vAgu, vBgu, vBge, vBga 
+eor sAmi, tmp, sAmi_, ROR #46 SEP +bic tmp, sAma_, sAmu_, ROR #35 SEP +ldr cur_const, [const_addr, count, UXTW #3] +add count, count, #1 SEP bcax_m1 vAka, vBka, vBki, vBke +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP bcax_m1 vAke, vBke, vBko, vBki +eor sAsa, tmp, sAsa_, ROR #41 SEP .unreq vvtmp +bic tmp, sAso_, sAsi_, ROR #2 SEP .unreq vvtmpq +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP eor2 C0, vAka, vAga +eor sAsi, tmp, sAsi_, ROR #27 SEP save(vAga) +bic tmp, sAsa_, sAsu_, ROR #60 SEP vvtmp .req vAga +eor sAso, tmp, sAso_, ROR #21 SEP vvtmpq .req vAgaq +bic tmp, sAse_, sAsa_, ROR #57 SEP bcax_m1 vAki, vBki, vBku, vBko +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP bcax_m1 vAko, vBko, vBka, vBku +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP eor2 C1, vAke, vAge +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP bcax_m1 vAku, vBku, vBke, vBka +eor sAbu, tmp, sAbu_, ROR #30 SEP +eor s_Aba, s_Aba, cur_const SEP + SEP +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP +eor sC1, sAse, sAge, ROR #60 SEP eor2 C2, vAki, vAgi +eor sC2, sAmi, sAgi, ROR #59 SEP +eor sC3, sAgo, sAso, ROR #30 SEP bcax_m1 vAma, vBma, vBmi, vBme +eor sC4, sAbu, sAsu, ROR #53 SEP +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP eor2 C3, vAko, vAgo +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP bcax_m1 vAme, vBme, vBmo, vBmi +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP eor2 C4, vAku, vAgu +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP bcax_m1 vAmi, vBmi, vBmu, vBmo +eor sC1, 
sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP +eor sC4, sAku, sC4, ROR #50 SEP eor2 C0, C0, vAma +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP bcax_m1 vAmo, vBmo, vBma, vBmu +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP eor2 C1, C1, vAme +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP bcax_m1 vAmu, vBmu, vBme, vBma +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP eor2 C2, C2, vAmi +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP bcax_m1 vAsa, vBsa, vBsi, vBse +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP eor2 C3, C3, vAmo +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP bcax_m1 vAse, vBse, vBso, vBsi +eor sAga_, sE3, sAbo SEP +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP eor2 C4, C4, vAmu +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP bcax_m1 vAsi, vBsi, vBsu, vBso +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP eor2 C0, C0, vAsa +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP bcax_m1 vAso, vBso, vBsa, vBsu +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP eor2 C1, C1, vAse +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP bcax_m1 vAsu, vBsu, vBse, vBsa +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP eor2 C2, C2, vAsi +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, 
sAke_, ROR #19 SEP eor2 C3, C3, vAso +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP bcax_m1 vAba, vBba, vBbi, vBbe +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP bcax_m1 vAbe, vBbe, vBbo, vBbi +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP eor2 C1, C1, vAbe +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP restore x26, STACK_OFFSET_CONST +eor sAme, tmp, sAme_, ROR #43 SEP ldr vvtmpq, [x26], #16 +bic tmp, sAmu_, sAmo_, ROR #41 SEP save x26, STACK_OFFSET_CONST +eor sAmi, tmp, sAmi_, ROR #46 SEP +bic tmp, sAma_, sAmu_, ROR #35 SEP eor vAba.16b, vAba.16b, vvtmp.16b +ldr cur_const, [const_addr, count, UXTW #3] +add count, count, #1 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP eor2 C4, C4, vAsu +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP bcax_m1 vAbi, vBbi, vBbu, vBbo +bic tmp, sAsi_, sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP bcax_m1 vAbo, vBbo, vBba, vBbu +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP eor2 C3, C3, vAbo +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP eor2 C2, C2, vAbi +bic tmp, sAbi_, sAbe_, ROR #63 SEP +eor s_Aba, s_Aba_, tmp, ROR #21 SEP eor2 C0, C0, vAba +bic tmp, sAbo_, sAbi_, ROR #42 SEP +eor sAbe, tmp, sAbe_, ROR #41 SEP bcax_m1 vAbu, vBbu, vBbe, vBba +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP eor2 C4, C4, vAbu +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP restore(vAga) +eor s_Aba, s_Aba, cur_const SEP .unreq vvtmp + .unreq vvtmpq + +.endm + +.macro hybrid_round_final + +save 
count, STACK_OFFSET_COUNT SEP vvtmp .req vBba +eor sC0, sAka, sAsa, ROR #50 SEP +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP rax1_m1 E2, C1, C3 +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 SEP rax1_m1 E4, C3, C0 +eor sC4, sAmu, sC4, ROR #56 SEP +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP rax1_m1 E1, C0, C2 +eor sC0, s_Aba, sC0, ROR #61 SEP +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP rax1_m1 E3, C2, C4 +ror sC4, sC4, 58 SEP +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP rax1_m1 E0, C4, C1 +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP .unreq vvtmp +eor sAbi_, sE2, sAki, ROR #46 SEP vvtmp .req C1 +eor sAki_, sE3, sAko, ROR #63 SEP vvtmpq .req C1q +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP eor vBba.16b, vAba.16b, E0.16b +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP xar_m1 vBsa, vAbi, E2, 2 +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP +eor sAbo_, sE3, sAmo, ROR #37 SEP xar_m1 vBbi, vAki, E2, 21 +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP xar_m1 vBki, vAko, E3, 39 +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 
SEP +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP xar_m1 vBko, vAmu, E4, 56 +restore count, STACK_OFFSET_COUNT SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP xar_m1 vBmu, vAso, E3, 8 +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP xar_m1 vBso, vAma, E0, 23 +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP xar_m1 vBka, vAbe, E1, 63 +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP xar_m1 vBse, vAgo, E3, 9 +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP +bic tmp, sAma_, sAmu_, ROR #35 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP +add count, count, #1 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP xar_m1 vBgo, vAme, E1, 19 +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP xar_m1 vBke, vAgi, E2, 58 +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP xar_m1 vBgi, vAka, E0, 61 +bic tmp, sAbi_, sAbe_, ROR #63 SEP +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, 
ROR #35 SEP xar_m1 vBga, vAbo, E3, 36 +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP +eor s_Aba, s_Aba, cur_const SEP + SEP xar_m1 vBbo, vAmo, E3, 43 +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP +eor sC0, sAma, sC0, ROR #49 SEP xar_m1 vBmo, vAmi, E2, 49 +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP +eor sC2, sAbi, sC2, ROR #60 SEP xar_m1 vBmi, vAke, E1, 54 +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP xar_m1 vBge, vAgu, E4, 44 +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP mov E3.16b, vAga.16b +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP bcax_m1 vAga, vBga, vBgi, vBge +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP xar_m1 vBgu, vAsi, E2, 3 +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP xar_m1 vBsi, vAku, E4, 25 +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP xar_m1 vBku, vAsa, E0, 46 +eor 
sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP xar_m1 vBma, vAbu, E4, 37 +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP xar_m1 vBbu, vAsu, E4, 50 +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP +eor sAgu, tmp, sAgu_, ROR #23 SEP xar_m1 vBsu, vAse, E1, 62 +bic tmp, sAki_, sAke_, ROR #19 SEP +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP xar_m1 vBme, E3, E0, 28 +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP xar_m1 vBbe, vAge, E1, 20 +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP +bic tmp, sAma_, sAmu_, ROR #35 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP bcax_m1 vAge, vBge, vBgo, vBgi +add count, count, #1 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP bcax_m1 vAgi, vBgi, vBgu, vBgo +bic tmp, sAsi_, sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP bcax_m1 vAgo, vBgo, vBga, vBgu +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP bcax_m1 vAgu, vBgu, vBge, vBga +bic tmp, sAse_, sAsa_, ROR 
#57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP +eor s_Aba, s_Aba_, tmp, ROR #21 SEP bcax_m1 vAka, vBka, vBki, vBke +bic tmp, sAbo_, sAbi_, ROR #42 SEP +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP bcax_m1 vAke, vBke, vBko, vBki +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP bcax_m1 vAki, vBki, vBku, vBko +eor s_Aba, s_Aba, cur_const SEP +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP +eor sC1, sAse, sAge, ROR #60 SEP bcax_m1 vAko, vBko, vBka, vBku +eor sC2, sAmi, sAgi, ROR #59 SEP +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP +eor sC0, sAma, sC0, ROR #49 SEP bcax_m1 vAku, vBku, vBke, vBka +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP bcax_m1 vAma, vBma, vBmi, vBme +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP bcax_m1 vAme, vBme, vBmo, vBmi +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP bcax_m1 vAmi, vBmi, vBmu, vBmo +eor sC3, sAbo, sC3, ROR #63 SEP +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP bcax_m1 vAmo, vBmo, vBma, vBmu +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP bcax_m1 vAmu, vBmu, vBme, vBma +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP bcax_m1 vAsa, vBsa, vBsi, vBse +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP bcax_m1 vAse, vBse, vBso, vBsi +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR 
#43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP bcax_m1 vAsi, vBsi, vBsu, vBso +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP +eor sAbo_, sE3, sAmo, ROR #37 SEP bcax_m1 vAso, vBso, vBsa, vBsu +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP bcax_m1 vAsu, vBsu, vBse, vBsa +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP bcax_m1 vAba, vBba, vBbi, vBbe +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP bcax_m1 vAbe, vBbe, vBbo, vBbi +restore count, STACK_OFFSET_COUNT SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP bcax_m1 vAbi, vBbi, vBbu, vBbo +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP bcax_m1 vAbo, vBbo, vBba, vBbu +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP bcax_m1 vAbu, vBbu, vBbe, vBba +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP restore x26, STACK_OFFSET_CONST +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP ldr vvtmpq, [x26], #16 +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP save x26, STACK_OFFSET_CONST +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP eor vAba.16b, vAba.16b, vvtmp.16b +bic tmp, sAma_, sAmu_, ROR #35 SEP +ldr cur_const, 
[const_addr, count, UXTW #3] SEP
+add count, count, #1 SEP                                                 .unreq vvtmp
+eor sAmo, tmp, sAmo_, ROR #12 SEP
+bic tmp, sAme_, sAma_, ROR #9 SEP
+eor sAmu, tmp, sAmu_, ROR #44 SEP
+bic tmp, sAsi_, sAse_, ROR #48 SEP                                       .unreq vvtmpq
+eor sAsa, tmp, sAsa_, ROR #41 SEP
+bic tmp, sAso_, sAsi_, ROR #2 SEP
+eor sAse, tmp, sAse_, ROR #50 SEP
+bic tmp, sAsu_, sAso_, ROR #25 SEP
+eor sAsi, tmp, sAsi_, ROR #27 SEP
+bic tmp, sAsa_, sAsu_, ROR #60 SEP
+eor sAso, tmp, sAso_, ROR #21 SEP
+bic tmp, sAse_, sAsa_, ROR #57 SEP
+eor sAsu, tmp, sAsu_, ROR #53 SEP
+bic tmp, sAbi_, sAbe_, ROR #63 SEP
+eor s_Aba, s_Aba_, tmp, ROR #21 SEP
+bic tmp, sAbo_, sAbi_, ROR #42 SEP
+eor sAbe, tmp, sAbe_, ROR #41 SEP
+bic tmp, sAbu_, sAbo_, ROR #57 SEP
+eor sAbi, tmp, sAbi_, ROR #35 SEP
+bic tmp, s_Aba_, sAbu_, ROR #50 SEP
+eor sAbo, tmp, sAbo_, ROR #43 SEP
+bic tmp, sAbe_, s_Aba_, ROR #44 SEP
+eor sAbu, tmp, sAbu_, ROR #30 SEP
+eor s_Aba, s_Aba, cur_const SEP // xor the scalar round constant loaded above into lane Aba
+ror sAga, sAga,(64-3) SEP // ror by (64-k) == rotate left by k. NOTE(review): presumably these 23 rotations remove deferred per-lane rotations of the scalar state - confirm
+ror sAka, sAka,(64-25) SEP
+ror sAma, sAma,(64-10) SEP
+ror sAsa, sAsa,(64-39) SEP
+ror sAbe, sAbe,(64-21) SEP
+ror sAge, sAge,(64-45) SEP
+ror sAke, sAke,(64-8) SEP
+ror sAme, sAme,(64-15) SEP
+ror sAse, sAse,(64-41) SEP
+ror sAbi, sAbi,(64-14) SEP
+ror sAgi, sAgi,(64-61) SEP
+ror sAki, sAki,(64-18) SEP
+ror sAmi, sAmi,(64-56) SEP
+ror sAsi, sAsi,(64-2) SEP
+ror sAgo, sAgo,(64-28) SEP
+ror sAko, sAko,(64-1) SEP
+ror sAmo, sAmo,(64-27) SEP
+ror sAso, sAso,(64-62) SEP
+ror sAbu, sAbu,(64-44) SEP
+ror sAgu, sAgu,(64-20) SEP
+ror sAku, sAku,(64-6) SEP
+ror sAmu, sAmu,(64-36) SEP
+ror sAsu, sAsu,(64-55) SEP
+.endm // end of hybrid_round_final
+
+
+#define KECCAK_F1600_ROUNDS 24 /* only used below to form the inner-loop bound (KECCAK_F1600_ROUNDS-6) */
+
+.global keccak_f1600_x5_hybrid_asm_v8
+.global _keccak_f1600_x5_hybrid_asm_v8 // same entry point exported under an underscore-prefixed name
+.text
+.align 4
+
+keccak_f1600_x5_hybrid_asm_v8: // Entry point. NOTE(review): input_addr, count and out_count are .req aliases defined outside this view - confirm their registers
+_keccak_f1600_x5_hybrid_asm_v8:
+    alloc_stack
+    save_gprs
+    save_vregs
+
+    save input_addr, STACK_OFFSET_INPUT // kept so the base pointer can be restored before store_input_vector
+
+    ASM_LOAD(const_addr,round_constants_vec) // ASM_LOAD is defined outside this view; leaves the table address in const_addr
+    save const_addr, STACK_OFFSET_CONST // this slot is re-read and advanced by 16 bytes per vector round inside the round macros
+
+    load_input_vector
+
+    add input_addr, input_addr, #(2*8*25) // skip 2*25 8-byte lanes (400 bytes). NOTE(review): presumably the two states handled by the vector path - confirm
+    save input_addr, STACK_OFFSET_CUR_INPUT
+
+    mov out_count, #0
+outer_loop: // body runs 3 times (out_count = 0, 1, 2), once per scalar-path state
+    save out_count, STACK_OFFSET_COUNT_OUT
+
+    load_input_scalar
+    save input_addr, STACK_OFFSET_CUR_INPUT // spilled before the round macros run; reloaded after hybrid_round_final
+
+    hybrid_round_initial
+inner_loop:
+    hybrid_round_noninitial
+    cmp count, #(KECCAK_F1600_ROUNDS-6) // 24-6 = 18
+    ble inner_loop // repeat while count <= 18 (signed compare)
+    hybrid_round_final
+
+    restore input_addr, STACK_OFFSET_CUR_INPUT
+    store_input_scalar
+    add input_addr, input_addr, #(8*25) // advance by one 25-lane (200-byte) state
+
+    restore out_count, STACK_OFFSET_COUNT_OUT
+    add out_count, out_count, #1
+    cmp out_count, #3
+    blt outer_loop
+
+    restore input_addr, STACK_OFFSET_INPUT // back to the original base pointer for the vector write-back
+    store_input_vector
+
+    restore_vregs
+    restore_gprs
+    free_stack
+
+    ret
diff --git a/tests/keccak_neon/manual/keccak_f1600_x5_hybrid_asm_v8p.s b/tests/keccak_neon/manual/keccak_f1600_x5_hybrid_asm_v8p.s
new file mode 100644
index 0000000..c904df4
--- /dev/null
+++ b/tests/keccak_neon/manual/keccak_f1600_x5_hybrid_asm_v8p.s
@@ -0,0 +1,1306 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited
+ * Copyright (c) 2022 Matthias Kannwischer
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+//
+// Author: Hanno Becker
+// Author: Matthias Kannwischer
+//
+
+#include "macros.s"
+
+/********************** CONSTANTS *************************/
+    .data
+    .align(8)
+round_constants: // 24 64-bit constants, one per round
+    .quad 0x0000000000000001
+    .quad 0x0000000000008082
+    .quad 0x800000000000808a
+    .quad 0x8000000080008000
+    .quad 0x000000000000808b
+    .quad 0x0000000080000001
+    .quad 0x8000000080008081
+    .quad 0x8000000000008009
+    .quad 0x000000000000008a
+    .quad 0x0000000000000088
+    .quad 0x0000000080008009
+    .quad 0x000000008000000a
+    .quad 0x000000008000808b
+    .quad 0x800000000000008b
+    .quad 0x8000000000008089
+    .quad 0x8000000000008003
+    .quad 0x8000000000008002
+    .quad 0x8000000000000080
+    .quad 0x000000000000800a
+    .quad 0x800000008000000a
+    .quad 0x8000000080008081
+    .quad 0x8000000000008080
+    .quad 0x0000000080000001
+    .quad 0x8000000080008008
+round_constants_vec: // same 24 constants, each stored twice back to back. NOTE(review): presumably so one 16-byte load fills both 64-bit lanes - confirm
+    .quad 0x0000000000000001
+    .quad 0x0000000000000001
+    .quad 0x0000000000008082
+    .quad 0x0000000000008082
+    .quad 0x800000000000808a
+    .quad 0x800000000000808a
+    .quad 0x8000000080008000
+    .quad 0x8000000080008000
+    .quad 0x000000000000808b
+    .quad 0x000000000000808b
+    .quad 0x0000000080000001
+    .quad 0x0000000080000001
+    .quad 0x8000000080008081
+    .quad 0x8000000080008081
+    .quad 0x8000000000008009
+    .quad 0x8000000000008009
+    .quad 0x000000000000008a
+    .quad 0x000000000000008a
+    .quad 0x0000000000000088
+    .quad 0x0000000000000088
+    .quad 0x0000000080008009
+    .quad 0x0000000080008009
+    .quad 0x000000008000000a
+    .quad 0x000000008000000a
+    .quad 0x000000008000808b
+    .quad 0x000000008000808b
+    .quad 0x800000000000008b
+    .quad 0x800000000000008b
+    .quad 0x8000000000008089
+    .quad 0x8000000000008089
+    .quad 0x8000000000008003
+    .quad 0x8000000000008003
+    .quad 0x8000000000008002
+    .quad 0x8000000000008002
+    .quad 0x8000000000000080
+    .quad 0x8000000000000080
+    .quad 0x000000000000800a
+    .quad 0x000000000000800a
+    .quad 0x800000008000000a
+    .quad 0x800000008000000a
+    .quad 0x8000000080008081
+    .quad 0x8000000080008081
+    .quad 0x8000000000008080
+    .quad 0x8000000000008080
+    .quad 0x0000000080000001
+    .quad 0x0000000080000001
+    .quad 0x8000000080008008
+    .quad 0x8000000080008008
+/****************** REGISTER ALLOCATIONS *******************/
+
+    input_addr .req x0 // x0 is also s_Aba_ and sC0 below, so the pointer does not survive a round
+    const_addr .req x29 // x29 is also sE0 and sC4 below
+    count .req w27 // w27 is the low half of x27, which is also sC2 and sE3 below
+    out_count .req w27 // same register as count
+    cur_const .req x26 // x26 is also sC1 and sE2 below
+
+    /* Mapping of Keccak-f1600 SIMD state to vector registers
+     * at the beginning and end of each round. */
+
+    /* Mapping of Keccak-f1600 state to vector registers
+     * at the beginning and end of each round. */
+    vAba .req v0
+    vAbe .req v1
+    vAbi .req v2
+    vAbo .req v3
+    vAbu .req v4
+    vAga .req v5
+    vAge .req v6
+    vAgi .req v7
+    vAgo .req v8
+    vAgu .req v9
+    vAka .req v10
+    vAke .req v11
+    vAki .req v12
+    vAko .req v13
+    vAku .req v14
+    vAma .req v15
+    vAme .req v16
+    vAmi .req v17
+    vAmo .req v18
+    vAmu .req v19
+    vAsa .req v20
+    vAse .req v21
+    vAsi .req v22
+    vAso .req v23
+    vAsu .req v24
+
+    /* q-form of the above mapping */
+    vAbaq .req q0
+    vAbeq .req q1
+    vAbiq .req q2
+    vAboq .req q3
+    vAbuq .req q4
+    vAgaq .req q5
+    vAgeq .req q6
+    vAgiq .req q7
+    vAgoq .req q8
+    vAguq .req q9
+    vAkaq .req q10
+    vAkeq .req q11
+    vAkiq .req q12
+    vAkoq .req q13
+    vAkuq .req q14
+    vAmaq .req q15
+    vAmeq .req q16
+    vAmiq .req q17
+    vAmoq .req q18
+    vAmuq .req q19
+    vAsaq .req q20
+    vAseq .req q21
+    vAsiq .req q22
+    vAsoq .req q23
+    vAsuq .req q24
+
+    /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */
+    C0 .req v27
+    C1 .req v28
+    C2 .req v29
+    C3 .req v30
+    C4 .req v31
+
+    C0q .req q27
+    C1q .req q28
+    C2q .req q29
+    C3q .req q30
+    C4q .req q31
+
+    /* A_[y,2*x+3*y] = rot(A[x,y]) */
+    vBba .req v25 // fresh
+    vBbe .req v26 // fresh
+    vBbi .req vAbi
+    vBbo .req vAbo
+    vBbu .req vAbu
+    vBga .req vAka
+    vBge .req vAke
+    vBgi .req vAgi
+    vBgo .req vAgo
+    vBgu .req vAgu
+    vBka .req vAma
+    vBke .req vAme
+    vBki .req vAki
+    vBko .req vAko
+    vBku .req vAku
+    vBma .req vAsa
+    vBme .req vAse
+    vBmi .req vAmi
+    vBmo .req vAmo
+    vBmu .req vAmu
+    vBsa .req vAba
+    vBse .req vAbe
+    vBsi .req vAsi
+    vBso .req vAso
+    vBsu .req vAsu
+
+    vBbaq .req q25 // fresh
+    vBbeq .req q26 // fresh
+    vBbiq .req vAbiq
+    vBboq .req vAboq
+    vBbuq .req vAbuq
+    vBgaq .req vAkaq
+    vBgeq .req vAkeq
+    vBgiq .req vAgiq
+    vBgoq .req vAgoq
+    vBguq .req vAguq
+    vBkaq .req vAmaq
+    vBkeq .req vAmeq
+    vBkiq .req vAkiq
+    vBkoq .req vAkoq
+    vBkuq .req vAkuq
+    vBmaq .req vAsaq
+    vBmeq .req vAseq
+    vBmiq .req vAmiq
+    vBmoq .req vAmoq
+    vBmuq .req vAmuq
+    vBsaq .req vAbaq
+    vBseq .req vAbeq
+    vBsiq .req vAsiq
+    vBsoq .req vAsoq
+    vBsuq .req vAsuq
+
+    /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */
+    E0 .req C4
+    E1 .req C0
+    E2 .req vBbe // fresh
+    E3 .req C2
+    E4 .req C3
+
+    E0q .req C4q
+    E1q .req C0q
+    E2q .req vBbeq // fresh
+    E3q .req C2q
+    E4q .req C3q
+
+    /* Mapping of Keccak-f1600 state to scalar registers
+     * at the beginning and end of each round. */
+    s_Aba .req x1
+    sAbe .req x6
+    sAbi .req x11
+    sAbo .req x16
+    sAbu .req x21
+    sAga .req x2
+    sAge .req x7
+    sAgi .req x12
+    sAgo .req x17
+    sAgu .req x22
+    sAka .req x3
+    sAke .req x8
+    sAki .req x13
+    sAko .req x18 // NOTE(review): x18 is the AAPCS64 platform register and is reserved on some OSes - confirm this is acceptable on every target
+    sAku .req x23
+    sAma .req x4
+    sAme .req x9
+    sAmi .req x14
+    sAmo .req x19
+    sAmu .req x24
+    sAsa .req x5
+    sAse .req x10
+    sAsi .req x15
+    sAso .req x20
+    sAsu .req x25
+
+    /* sA_[y,2*x+3*y] = rot(A[x,y]) */
+    s_Aba_ .req x0
+    sAbe_ .req x28
+    sAbi_ .req x11
+    sAbo_ .req x16
+    sAbu_ .req x21
+    sAga_ .req x3
+    sAge_ .req x8
+    sAgi_ .req x12
+    sAgo_ .req x17
+    sAgu_ .req x22
+    sAka_ .req x4
+    sAke_ .req x9
+    sAki_ .req x13
+    sAko_ .req x18
+    sAku_ .req x23
+    sAma_ .req x5
+    sAme_ .req x10
+    sAmi_ .req x14
+    sAmo_ .req x19
+    sAmu_ .req x24
+    sAsa_ .req x1
+    sAse_ .req x6
+    sAsi_ .req x15
+    sAso_ .req x20
+    sAsu_ .req x25
+
+    /* sC[x] = sA[x,0] xor sA[x,1] xor sA[x,2] xor sA[x,3] xor sA[x,4], for x in 0..4 */
+    /* sE[x] = sC[x-1] xor rot(sC[x+1],1), for x in 0..4 */
+    sC0 .req x0
+    sE0 .req x29
+    sC1 .req x26
+    sE1 .req x30
+    sC2 .req x27
+    sE2 .req x26
+    sC3 .req x28
+    sE3 .req x27
+    sC4 .req x29
+    sE4 .req x28
+
+    tmp .req x30 // x30 is also sE1 above
+
+/************************ MACROS ****************************/
+
+.macro eor2 d s0 s1 // d = s0 xor s1 over the full 128-bit register
+    eor \d\().16b, \s0\().16b, \s1\().16b
+.endm
+
+.macro eor3_m1 d s0 s1 s2 // d = s0 xor s1 xor s2, built from two eor2 steps
+    eor2 \d, \s0, \s1
+    eor2 \d, \d, \s2
+.endm
+
+.macro rax1_m1 d s0 s1 // d = s0 xor rotl64(s1, 1) in each 64-bit lane; clobbers vvtmp, which must be bound with .req by the caller
+    shl vvtmp.2d, \s1\().2d, #1
+    sri vvtmp.2d, \s1\().2d, #63 // shl + sri together form the rotate-left-by-1
+    eor \d\().16b, vvtmp.16b, \s0\().16b
+.endm
+
+ .macro xar_m1 d s0 s1 imm // d = rotr64(s0 xor s1, imm) in each 64-bit lane; note that s0 is overwritten with s0 xor s1
+    // Special cases where we can replace SHLs by ADDs
+    .if \imm == 63
+        eor \s0\().16b, \s0\().16b, \s1\().16b
+        add \d\().2d, \s0\().2d, \s0\().2d // x + x == x << 1, i.e. the shl #(64-63)
+        sri \d\().2d, \s0\().2d, #(63)
+    .else
+        eor \s0\().16b, \s0\().16b, \s1\().16b
+        shl \d\().2d, \s0\().2d, #(64-\imm)
+        sri \d\().2d, \s0\().2d, #(\imm)
+    .endif
+.endm
+
+.macro bcax_m1 d s0 s1 s2 // d = s0 xor (s1 and not s2); clobbers vvtmp
+    bic vvtmp.16b, \s1\().16b, \s2\().16b
+    eor \d\().16b, vvtmp.16b, \s0\().16b
+.endm
+
+.macro load_input_vector // loads 25 q registers (vAba..vAsu) from 25 consecutive 16-byte slots at input_addr
+    ldp vAbaq, vAbeq, [input_addr, #(16*0)] // lane k of the state lives at byte offset 16*k
+    ldp vAbiq, vAboq, [input_addr, #(16*2)]
+    ldp vAbuq, vAgaq, [input_addr, #(16*4)]
+    ldp vAgeq, vAgiq, [input_addr, #(16*6)]
+    ldp vAgoq, vAguq, [input_addr, #(16*8)]
+    ldp vAkaq, vAkeq, [input_addr, #(16*10)]
+    ldp vAkiq, vAkoq, [input_addr, #(16*12)]
+    ldp vAkuq, vAmaq, [input_addr, #(16*14)]
+    ldp vAmeq, vAmiq, [input_addr, #(16*16)]
+    ldp vAmoq, vAmuq, [input_addr, #(16*18)]
+    ldp vAsaq, vAseq, [input_addr, #(16*20)]
+    ldp vAsiq, vAsoq, [input_addr, #(16*22)]
+    ldr vAsuq, [input_addr, #(16*24)] // 25th register has no partner, so a single ldr
+
+    // ldr vAbaq, [input_addr, #(16*0)]
+    // ldr vAbeq, [input_addr, #(16*1)]
+    // ldr vAbiq, [input_addr, #(16*2)]
+    // ldr vAboq, [input_addr, #(16*3)]
+    // ldr vAbuq, [input_addr, #(16*4)]
+    // ldr vAgaq, [input_addr, #(16*5)]
+    // ldr vAgeq, [input_addr, #(16*6)]
+    // ldr vAgiq, [input_addr, #(16*7)]
+    // ldr vAgoq, [input_addr, #(16*8)]
+    // ldr vAguq, [input_addr, #(16*9)]
+    // ldr vAkaq, [input_addr, #(16*10)]
+    // ldr vAkeq, [input_addr, #(16*11)]
+    // ldr vAkiq, [input_addr, #(16*12)]
+    // ldr vAkoq, [input_addr, #(16*13)]
+    // ldr vAkuq, [input_addr, #(16*14)]
+    // ldr vAmaq, [input_addr, #(16*15)]
+    // ldr vAmeq, [input_addr, #(16*16)]
+    // ldr vAmiq, [input_addr, #(16*17)]
+    // ldr vAmoq, [input_addr, #(16*18)]
+    // ldr vAmuq, [input_addr, #(16*19)]
+    // ldr vAsaq, [input_addr, #(16*20)]
+    // ldr vAseq, [input_addr, #(16*21)]
+    // ldr vAsiq, [input_addr, #(16*22)]
+    // ldr vAsoq, [input_addr, #(16*23)]
+    // ldr vAsuq, [input_addr, #(16*24)]
+.endm
+
+.macro store_input_vector // inverse of load_input_vector: writes vAba..vAsu back to the same 25 16-byte slots
+    stp vAbaq, vAbeq, [input_addr, #(16*0)]
+    stp vAbiq, vAboq, [input_addr, #(16*2)]
+    stp vAbuq, vAgaq, [input_addr, #(16*4)]
+    stp vAgeq, vAgiq, [input_addr, #(16*6)]
+    stp vAgoq, vAguq, [input_addr, #(16*8)]
+    stp vAkaq, vAkeq, [input_addr, #(16*10)]
+    stp vAkiq, vAkoq, [input_addr, #(16*12)]
+    stp vAkuq, vAmaq, [input_addr, #(16*14)]
+    stp vAmeq, vAmiq, [input_addr, #(16*16)]
+    stp vAmoq, vAmuq, [input_addr, #(16*18)]
+    stp vAsaq, vAseq, [input_addr, #(16*20)]
+    stp vAsiq, vAsoq, [input_addr, #(16*22)]
+    str vAsuq, [input_addr, #(16*24)]
+
+    // str vAbaq, [input_addr, #(16*0)]
+    // str vAbeq, [input_addr, #(16*1)]
+    // str vAbiq, [input_addr, #(16*2)]
+    // str vAboq, [input_addr, #(16*3)]
+    // str vAbuq, [input_addr, #(16*4)]
+    // str vAgaq, [input_addr, #(16*5)]
+    // str vAgeq, [input_addr, #(16*6)]
+    // str vAgiq, [input_addr, #(16*7)]
+    // str vAgoq, [input_addr, #(16*8)]
+    // str vAguq, [input_addr, #(16*9)]
+    // str vAkaq, [input_addr, #(16*10)]
+    // str vAkeq, [input_addr, #(16*11)]
+    // str vAkiq, [input_addr, #(16*12)]
+    // str vAkoq, [input_addr, #(16*13)]
+    // str vAkuq, [input_addr, #(16*14)]
+    // str vAmaq, [input_addr, #(16*15)]
+    // str vAmeq, [input_addr, #(16*16)]
+    // str vAmiq, [input_addr, #(16*17)]
+    // str vAmoq, [input_addr, #(16*18)]
+    // str vAmuq, [input_addr, #(16*19)]
+    // str vAsaq, [input_addr, #(16*20)]
+    // str vAseq, [input_addr, #(16*21)]
+    // str vAsiq, [input_addr, #(16*22)]
+    // str vAsoq, [input_addr, #(16*23)]
+    // str vAsuq, [input_addr, #(16*24)]
+.endm
+
+.macro load_input_scalar // loads the 25 scalar lanes (s_Aba..sAsu) from 25 consecutive 8-byte slots at input_addr
+    ldp s_Aba, sAbe, [input_addr,8*0 ]
+    ldp sAbi, sAbo, [input_addr,8*2 ]
+    ldp sAbu, sAga, [input_addr,8*4 ]
+    ldp sAge, sAgi, [input_addr,8*6 ]
+    ldp sAgo, sAgu, [input_addr,8*8 ]
+    ldp sAka, sAke, [input_addr,8*10]
+    ldp sAki, sAko, [input_addr,8*12]
+    ldp sAku, sAma, [input_addr,8*14]
+    ldp sAme, sAmi, [input_addr,8*16]
+    ldp sAmo, sAmu, [input_addr,8*18]
+    ldp sAsa, sAse, [input_addr,8*20]
+    ldp sAsi, sAso, [input_addr,8*22]
+    ldr sAsu, [input_addr,8*24]
+.endm
+
+.macro store_input_scalar // inverse of load_input_scalar: writes s_Aba..sAsu back to the same 25 8-byte slots
+    stp s_Aba, sAbe, [input_addr,8*0 ]
+    stp sAbi, sAbo, [input_addr,8*2 ]
+    stp sAbu, sAga, [input_addr,8*4 ]
+    stp sAge, sAgi, [input_addr,8*6 ]
+    stp sAgo, sAgu, [input_addr,8*8 ]
+    stp sAka, sAke, [input_addr,8*10]
+    stp sAki, sAko, [input_addr,8*12]
+    stp sAku, sAma, [input_addr,8*14]
+    stp sAme, sAmi, [input_addr,8*16]
+    stp sAmo, sAmu, [input_addr,8*18]
+    stp sAsa, sAse, [input_addr,8*20]
+    stp sAsi, sAso, [input_addr,8*22]
+    str sAsu, [input_addr,8*24]
+.endm
+
+
+#define STACK_SIZE (4*16 + 12*8 + 6*8 + 16*1) /* 224 bytes: d8-d15 area, x19-x30 area, 6 temporary 8-byte slots, one 16-byte q spill slot */
+#define STACK_BASE_VREGS (0) /* d8-d15, written by save_vregs */
+#define STACK_BASE_GPRS (4*16) /* x19-x30, written by save_gprs */
+#define STACK_BASE_TMP_GPRS (4*16 + 12*8) /* base for the save/restore reg, offset macros */
+#define STACK_BASE_TMP_VREGS (4*16 + 12*8 + 6*8) /* base for the save(name)/restore(name) q-register spills */
+#define STACK_OFFSET_INPUT (0*8) /* the STACK_OFFSET_* values are byte offsets relative to STACK_BASE_TMP_GPRS */
+#define STACK_OFFSET_CONST (1*8)
+#define STACK_OFFSET_COUNT (2*8)
+#define STACK_OFFSET_COUNT_OUT (3*8)
+#define STACK_OFFSET_CUR_INPUT (4*8)
+
+#define vAga_offset 0 /* slot index consumed by save(vAga)/restore(vAga) via token pasting */
+
+#define save(name) \
+    str name ## q, [sp, #(STACK_BASE_TMP_VREGS + 16 * name ## _offset)]
+#define restore(name) \
+    ldr name ## q, [sp, #(STACK_BASE_TMP_VREGS + 16 * name ## _offset)]
+
+
+.macro save_gprs // stores x19-x30 as six pairs starting at sp + STACK_BASE_GPRS
+    stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)]
+    stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)]
+    stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)]
+    stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)]
+    stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)]
+    stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)]
+.endm
+
+.macro restore_gprs // reloads x19-x30 from the slots written by save_gprs
+    ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)]
+    ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)]
+    ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)]
+    ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)]
+    ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)]
+    ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)]
+.endm
+
+.macro save_vregs // stores d8-d15, i.e. only the low 64 bits of v8-v15
+    stp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)]
+    stp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)]
+    stp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)]
+    stp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)]
+.endm
+
+.macro restore_vregs // reloads d8-d15 from the slots written by save_vregs (highest slot first)
+    ldp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)]
+    ldp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)]
+    ldp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)]
+    ldp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)]
+.endm
+
+.macro alloc_stack // reserves STACK_SIZE bytes below sp
+    sub sp, sp, #(STACK_SIZE)
+.endm
+
+.macro free_stack // releases the STACK_SIZE bytes reserved by alloc_stack
+    add sp, sp, #(STACK_SIZE)
+.endm
+
+.macro eor5 dst, src0, src1, src2, src3, src4 // dst = src0 xor src1 xor src2 xor src3 xor src4; dst is written by the first eor
+    eor \dst, \src0, \src1
+    eor \dst, \dst, \src2
+    eor \dst, \dst, \src3
+    eor \dst, \dst, \src4
+.endm
+
+.macro xor_rol dst, src1, src0, imm + eor \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro bic_rol dst, src1, src0, imm + bic \dst, \src0, \src1, ROR #(64-\imm) +.endm + +.macro rotate dst, src, imm + ror \dst, \src, #(64-\imm) +.endm + +.macro save reg, offset + str \reg, [sp, #(STACK_BASE_TMP_GPRS + \offset)] +.endm + +.macro restore reg, offset + ldr \reg, [sp, #(STACK_BASE_TMP_GPRS + \offset)] +.endm + +.macro hybrid_round_initial +eor sC0, sAma, sAsa SEP +eor sC1, sAme, sAse SEP eor3_m1 C1,vAbe,vAge,vAke +eor sC2, sAmi, sAsi SEP +eor sC3, sAmo, sAso SEP +eor sC4, sAmu, sAsu SEP +eor sC0, sAka, sC0 SEP eor3_m1 C3,vAbo,vAgo,vAko +eor sC1, sAke, sC1 SEP +eor sC2, sAki, sC2 SEP +eor sC3, sAko, sC3 SEP +eor sC4, sAku, sC4 SEP eor3_m1 C0,vAba,vAga,vAka +eor sC0, sAga, sC0 SEP +eor sC1, sAge, sC1 SEP +eor sC2, sAgi, sC2 SEP +eor sC3, sAgo, sC3 SEP eor3_m1 C2,vAbi,vAgi,vAki +eor sC4, sAgu, sC4 SEP +eor sC0, s_Aba, sC0 SEP +eor sC1, sAbe, sC1 SEP +eor sC2, sAbi, sC2 SEP eor3_m1 C4,vAbu,vAgu,vAku +eor sC3, sAbo, sC3 SEP +eor sC4, sAbu, sC4 SEP +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP eor3_m1 C1, C1,vAme, vAse +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, s_Aba, sE0 SEP eor3_m1 C3, C3,vAmo, vAso +eor sAsa_, sAbi, sE2 SEP +eor sAbi_, sAki, sE2 SEP +eor sAki_, sAko, sE3 SEP +eor sAko_, sAmu, sE4 SEP eor3_m1 C0, C0,vAma, vAsa +eor sAmu_, sAso, sE3 SEP +eor sAso_, sAma, sE0 SEP +eor sAka_, sAbe, sE1 SEP +eor sAse_, sAgo, sE3 SEP eor3_m1 C2, C2,vAmi, vAsi +eor sAgo_, sAme, sE1 SEP +eor sAke_, sAgi, sE2 SEP +eor sAgi_, sAka, sE0 SEP +eor sAga_, sAbo, sE3 SEP eor3_m1 C4, C4,vAmu, vAsu +eor sAbo_, sAmo, sE3 SEP +eor sAmo_, sAmi, sE2 SEP vvtmp .req vBba +eor sAmi_, sAke, sE1 SEP +eor sAge_, sAgu, sE4 SEP rax1_m1 E2, C1, C3 +eor sAgu_, sAsi, sE2 SEP +eor sAsi_, sAku, sE4 SEP +eor sAku_, sAsa, sE0 SEP +eor sAma_, sAbu, sE4 SEP rax1_m1 E4, C3, C0 +eor sAbu_, sAsu, sE4 SEP +eor sAsu_, sAse, sE1 
SEP +eor sAme_, sAga, sE0 SEP +eor sAbe_, sAge, sE1 SEP rax1_m1 E1, C0, C2 +load_constant_ptr SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP rax1_m1 E3, C2, C4 +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP rax1_m1 E0, C4, C1 +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP .unreq vvtmp +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP vvtmp .req C1 +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP vvtmpq .req C1q +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP eor vBba.16b, vAba.16b, E0.16b +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP xar_m1 vBsa, vAbi, E2, 2 +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP xar_m1 vBbi, vAki, E2, 21 +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP xar_m1 vBki, vAko, E3, 39 +eor sAmi, tmp, sAmi_, ROR #46 SEP +ldr cur_const, [const_addr] SEP +mov count, #1 SEP +bic tmp, sAma_, sAmu_, ROR #35 SEP xar_m1 vBko, vAmu, E4, 56 +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP xar_m1 vBmu, vAso, E3, 8 +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP xar_m1 vBso, vAma, E0, 23 +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP xar_m1 vBka, vAbe, E1, 63 +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP xar_m1 vBse, vAgo, E3, 9 +eor 
sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP xar_m1 vBgo, vAme, E1, 19 +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP +eor s_Aba, s_Aba, cur_const SEP xar_m1 vBke, vAgi, E2, 58 +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP xar_m1 vBgi, vAka, E0, 61 +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP xar_m1 vBga, vAbo, E3, 36 +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP +eor sC0, sAga, sC0, ROR #57 SEP xar_m1 vBbo, vAmo, E3, 43 +eor sC1, sAme, sC1, ROR #58 SEP +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP xar_m1 vBmo, vAmi, E2, 49 +eor sC0, s_Aba, sC0, ROR #61 SEP +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP +eor sC4, sAku, sC4, ROR #50 SEP xar_m1 vBmi, vAke, E1, 54 +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP xar_m1 vBge, vAgu, E4, 44 +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP mov E3.16b, vAga.16b +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP bcax_m1 vAga, vBga, vBgi, vBge +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP xar_m1 vBgu, vAsi, E2, 3 +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP xar_m1 vBsi, vAku, E4, 25 +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP xar_m1 vBku, vAsa, E0, 46 +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP +eor sAbo_, sE3, 
sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP xar_m1 vBma, vAbu, E4, 37 +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP xar_m1 vBbu, vAsu, E4, 50 +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP xar_m1 vBsu, vAse, E1, 62 +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP +eor sAgu, tmp, sAgu_, ROR #23 SEP xar_m1 vBme, E3, E0, 28 +bic tmp, sAki_, sAke_, ROR #19 SEP +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP xar_m1 vBbe, vAge, E1, 20 +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP bcax_m1 vAge, vBge, vBgo, vBgi +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP bcax_m1 vAgi, vBgi, vBgu, vBgo +bic tmp, sAma_, sAmu_, ROR #35 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP bcax_m1 vAgo, vBgo, vBga, vBgu +bic tmp, sAsi_, sAse_, ROR #48 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP bcax_m1 vAgu, vBgu, vBge, vBga +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 
SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP bcax_m1 vAka, vBka, vBki, vBke +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP bcax_m1 vAke, vBke, vBko, vBki +eor s_Aba, s_Aba_, tmp, ROR #21 SEP .unreq vvtmp +bic tmp, sAbo_, sAbi_, ROR #42 SEP +eor sAbe, tmp, sAbe_, ROR #41 SEP .unreq vvtmpq +bic tmp, sAbu_, sAbo_, ROR #57 SEP eor2 C0, vAka, vAga +eor sAbi, tmp, sAbi_, ROR #35 SEP vvtmp .req vAga +bic tmp, s_Aba_, sAbu_, ROR #50 SEP save(vAga) +eor sAbo, tmp, sAbo_, ROR #43 SEP vvtmpq .req vAgaq +bic tmp, sAbe_, s_Aba_, ROR #44 SEP bcax_m1 vAki, vBki, vBku, vBko +eor sAbu, tmp, sAbu_, ROR #30 SEP +add count, count, #1 SEP +eor s_Aba, s_Aba, cur_const SEP + SEP +save count, STACK_OFFSET_COUNT SEP bcax_m1 vAko, vBko, vBka, vBku +eor sC0, sAka, sAsa, ROR #50 SEP +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP +eor sC3, sAgo, sAso, ROR #30 SEP eor2 C1, vAke, vAge +eor sC4, sAbu, sAsu, ROR #53 SEP +eor sC0, sAma, sC0, ROR #49 SEP bcax_m1 vAku, vBku, vBke, vBka +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP eor2 C2, vAki, vAgi +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP bcax_m1 vAma, vBma, vBmi, vBme +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP eor2 C3, vAko, vAgo +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP bcax_m1 vAme, vBme, vBmo, vBmi +eor sC3, sAbo, sC3, ROR #63 SEP +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP eor2 C4, vAku, vAgu +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP bcax_m1 vAmi, vBmi, vBmu, vBmo +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP eor2 C0, C0, vAma +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP 
bcax_m1 vAmo, vBmo, vBma, vBmu +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP eor2 C1, C1, vAme +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP bcax_m1 vAmu, vBmu, vBme, vBma +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP eor2 C2, C2, vAmi +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP bcax_m1 vAsa, vBsa, vBsi, vBse +eor sAga_, sE3, sAbo SEP +eor sAbo_, sE3, sAmo, ROR #37 SEP eor2 C3, C3, vAmo +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP bcax_m1 vAse, vBse, vBso, vBsi +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP eor2 C4, C4, vAmu +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP bcax_m1 vAsi, vBsi, vBsu, vBso +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP eor2 C0, C0, vAsa +restore count, STACK_OFFSET_COUNT SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP bcax_m1 vAso, vBso, vBsa, vBsu +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP eor2 C1, C1, vAse +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP bcax_m1 vAsu, vBsu, vBse, vBsa +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP eor2 C2, C2, vAsi +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP eor2 C3, C3, vAso +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP bcax_m1 vAba, vBba, vBbi, vBbe +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP bcax_m1 vAbe, vBbe, vBbo, vBbi +eor sAku, 
tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP eor2 C1, C1, vAbe +eor sAme, tmp, sAme_, ROR #43 SEP restore x26, STACK_OFFSET_CONST +bic tmp, sAmu_, sAmo_, ROR #41 SEP ldr vvtmpq, [x26], #16 +eor sAmi, tmp, sAmi_, ROR #46 SEP save x26, STACK_OFFSET_CONST +bic tmp, sAma_, sAmu_, ROR #35 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP eor vAba.16b, vAba.16b, vvtmp.16b +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP eor2 C4, C4, vAsu +bic tmp, sAsi_, sAse_, ROR #48 SEP +ldr cur_const, [const_addr, count, UXTW #3] SEP bcax_m1 vAbi, vBbi, vBbu, vBbo +eor sAsa, tmp, sAsa_, ROR #41 SEP +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP bcax_m1 vAbo, vBbo, vBba, vBbu +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP +bic tmp, sAse_, sAsa_, ROR #57 SEP eor2 C3, C3, vAbo +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP eor2 C2, C2, vAbi +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP eor2 C0, C0, vAba +eor sAbe, tmp, sAbe_, ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP bcax_m1 vAbu, vBbu, vBbe, vBba +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP eor2 C4, C4, vAbu +eor sAbu, tmp, sAbu_, ROR #30 SEP +add count, count, #1 SEP restore(vAga) +eor s_Aba, s_Aba, cur_const SEP + .unreq vvtmp + + .unreq vvtmpq +.endm + +.macro hybrid_round_noninitial + SEP vvtmp .req vBba +save count, STACK_OFFSET_COUNT SEP rax1_m1 E2, C1, C3 +eor sC0, sAka, sAsa, ROR #50 SEP +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, sAmi, sAgi, ROR #59 SEP +eor sC3, sAgo, sAso, ROR #30 SEP +eor sC4, sAbu, sAsu, ROR #53 SEP +eor sC0, sAma, sC0, ROR #49 SEP rax1_m1 E4, C3, C0 +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 
SEP +eor sC4, sAmu, sC4, ROR #56 SEP +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP rax1_m1 E1, C0, C2 +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP rax1_m1 E3, C2, C4 +eor sC3, sAbo, sC3, ROR #63 SEP +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP rax1_m1 E0, C4, C1 +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP .unreq vvtmp +eor sE2, sC1, sC3, ROR #63 SEP vvtmp .req C1 +eor sE4, sC3, sC0, ROR #63 SEP vvtmpq .req C1q +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP eor vBba.16b, vAba.16b, E0.16b +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP xar_m1 vBsa, vAbi, E2, 2 +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP xar_m1 vBbi, vAki, E2, 21 +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP xar_m1 vBki, vAko, E3, 39 +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP xar_m1 vBko, vAmu, E4, 56 +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP xar_m1 vBmu, vAso, E3, 8 +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, 
sAgu_, ROR #31 SEP xar_m1 vBso, vAma, E0, 23 +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP xar_m1 vBka, vAbe, E1, 63 +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP xar_m1 vBse, vAgo, E3, 9 +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP xar_m1 vBgo, vAme, E1, 19 +bic tmp, sAmu_, sAmo_, ROR #41 SEP +eor sAmi, tmp, sAmi_, ROR #46 SEP +bic tmp, sAma_, sAmu_, ROR #35 SEP +ldr cur_const, [const_addr, count, UXTW #3] +add count, count, #1 SEP xar_m1 vBke, vAgi, E2, 58 +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP +eor sAsa, tmp, sAsa_, ROR #41 SEP xar_m1 vBgi, vAka, E0, 61 +bic tmp, sAso_, sAsi_, ROR #2 SEP +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP +eor sAsi, tmp, sAsi_, ROR #27 SEP +bic tmp, sAsa_, sAsu_, ROR #60 SEP +eor sAso, tmp, sAso_, ROR #21 SEP xar_m1 vBga, vAbo, E3, 36 +bic tmp, sAse_, sAsa_, ROR #57 SEP +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP +eor sAbe, tmp, sAbe_, ROR #41 SEP xar_m1 vBbo, vAmo, E3, 43 +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP +eor sAbu, tmp, sAbu_, ROR #30 SEP xar_m1 vBmo, vAmi, E2, 49 +eor s_Aba, s_Aba, cur_const SEP +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP +eor sC1, sAse, sAge, ROR #60 SEP +eor sC2, 
sAmi, sAgi, ROR #59 SEP +eor sC3, sAgo, sAso, ROR #30 SEP xar_m1 vBmi, vAke, E1, 54 +eor sC4, sAbu, sAsu, ROR #53 SEP +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP +eor sC0, sAga, sC0, ROR #57 SEP xar_m1 vBge, vAgu, E4, 44 +eor sC1, sAme, sC1, ROR #58 SEP +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP +eor sC1, sAke, sC1, ROR #57 SEP mov E3.16b, vAga.16b +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP bcax_m1 vAga, vBga, vBgi, vBge +eor sC4, sAku, sC4, ROR #50 SEP +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP +ror sC2, sC2, 62 SEP xar_m1 vBgu, vAsi, E2, 3 +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP +eor s_Aba_, sE0, s_Aba SEP xar_m1 vBsi, vAku, E4, 25 +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP +eor sAso_, sE0, sAma, ROR #54 SEP xar_m1 vBku, vAsa, E0, 46 +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP +eor sAga_, sE3, sAbo SEP xar_m1 vBma, vAbu, E4, 37 +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP xar_m1 vBbu, vAsu, E4, 50 +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP +eor sAsu_, sE1, sAse, ROR #23 SEP xar_m1 vBsu, vAse, E1, 62 +eor sAme_, sE0, sAga, ROR #61 SEP +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP +bic tmp, 
sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP xar_m1 vBme, E3, E0, 28 +bic tmp, sAgo_, sAgi_, ROR #42 SEP +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP xar_m1 vBbe, vAge, E1, 20 +bic tmp, sAge_, sAga_, ROR #56 SEP +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP bcax_m1 vAge, vBge, vBgo, vBgi +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP bcax_m1 vAgi, vBgi, vBgu, vBgo +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP bcax_m1 vAgo, vBgo, vBga, vBgu +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP +eor sAme, tmp, sAme_, ROR #43 SEP +bic tmp, sAmu_, sAmo_, ROR #41 SEP bcax_m1 vAgu, vBgu, vBge, vBga +eor sAmi, tmp, sAmi_, ROR #46 SEP +bic tmp, sAma_, sAmu_, ROR #35 SEP +ldr cur_const, [const_addr, count, UXTW #3] +add count, count, #1 SEP bcax_m1 vAka, vBka, vBki, vBke +eor sAmo, tmp, sAmo_, ROR #12 SEP +bic tmp, sAme_, sAma_, ROR #9 SEP +eor sAmu, tmp, sAmu_, ROR #44 SEP +bic tmp, sAsi_, sAse_, ROR #48 SEP bcax_m1 vAke, vBke, vBko, vBki +eor sAsa, tmp, sAsa_, ROR #41 SEP .unreq vvtmp +bic tmp, sAso_, sAsi_, ROR #2 SEP .unreq vvtmpq +eor sAse, tmp, sAse_, ROR #50 SEP +bic tmp, sAsu_, sAso_, ROR #25 SEP eor2 C0, vAka, vAga +eor sAsi, tmp, sAsi_, ROR #27 SEP save(vAga) +bic tmp, sAsa_, sAsu_, ROR #60 SEP vvtmp .req vAga +eor sAso, tmp, sAso_, ROR #21 SEP vvtmpq .req vAgaq +bic tmp, sAse_, sAsa_, ROR #57 SEP bcax_m1 vAki, vBki, vBku, vBko +eor sAsu, tmp, sAsu_, ROR #53 SEP +bic tmp, sAbi_, sAbe_, ROR #63 SEP +eor s_Aba, s_Aba_, tmp, ROR #21 SEP +bic tmp, sAbo_, sAbi_, ROR #42 SEP bcax_m1 vAko, vBko, vBka, vBku +eor sAbe, tmp, sAbe_, 
ROR #41 SEP +bic tmp, sAbu_, sAbo_, ROR #57 SEP +eor sAbi, tmp, sAbi_, ROR #35 SEP +bic tmp, s_Aba_, sAbu_, ROR #50 SEP eor2 C1, vAke, vAge +eor sAbo, tmp, sAbo_, ROR #43 SEP +bic tmp, sAbe_, s_Aba_, ROR #44 SEP bcax_m1 vAku, vBku, vBke, vBka +eor sAbu, tmp, sAbu_, ROR #30 SEP +eor s_Aba, s_Aba, cur_const SEP + SEP +save count, STACK_OFFSET_COUNT SEP +eor sC0, sAka, sAsa, ROR #50 SEP +eor sC1, sAse, sAge, ROR #60 SEP eor2 C2, vAki, vAgi +eor sC2, sAmi, sAgi, ROR #59 SEP +eor sC3, sAgo, sAso, ROR #30 SEP bcax_m1 vAma, vBma, vBmi, vBme +eor sC4, sAbu, sAsu, ROR #53 SEP +eor sC0, sAma, sC0, ROR #49 SEP +eor sC1, sAbe, sC1, ROR #44 SEP +eor sC2, sAki, sC2, ROR #26 SEP eor2 C3, vAko, vAgo +eor sC3, sAmo, sC3, ROR #63 SEP +eor sC4, sAmu, sC4, ROR #56 SEP bcax_m1 vAme, vBme, vBmo, vBmi +eor sC0, sAga, sC0, ROR #57 SEP +eor sC1, sAme, sC1, ROR #58 SEP +eor sC2, sAbi, sC2, ROR #60 SEP +eor sC3, sAko, sC3, ROR #38 SEP eor2 C4, vAku, vAgu +eor sC4, sAgu, sC4, ROR #48 SEP +eor sC0, s_Aba, sC0, ROR #61 SEP bcax_m1 vAmi, vBmi, vBmu, vBmo +eor sC1, sAke, sC1, ROR #57 SEP +eor sC2, sAsi, sC2, ROR #52 SEP +eor sC3, sAbo, sC3, ROR #63 SEP +eor sC4, sAku, sC4, ROR #50 SEP eor2 C0, C0, vAma +ror sC1, sC1, 56 SEP +ror sC4, sC4, 58 SEP bcax_m1 vAmo, vBmo, vBma, vBmu +ror sC2, sC2, 62 SEP +eor sE1, sC0, sC2, ROR #63 SEP +eor sE3, sC2, sC4, ROR #63 SEP +eor sE0, sC4, sC1, ROR #63 SEP eor2 C1, C1, vAme +eor sE2, sC1, sC3, ROR #63 SEP +eor sE4, sC3, sC0, ROR #63 SEP bcax_m1 vAmu, vBmu, vBme, vBma +eor s_Aba_, sE0, s_Aba SEP +eor sAsa_, sE2, sAbi, ROR #50 SEP +eor sAbi_, sE2, sAki, ROR #46 SEP +eor sAki_, sE3, sAko, ROR #63 SEP eor2 C2, C2, vAmi +eor sAko_, sE4, sAmu, ROR #28 SEP +eor sAmu_, sE3, sAso, ROR #2 SEP bcax_m1 vAsa, vBsa, vBsi, vBse +eor sAso_, sE0, sAma, ROR #54 SEP +eor sAka_, sE1, sAbe, ROR #43 SEP +eor sAse_, sE3, sAgo, ROR #36 SEP +eor sAgo_, sE1, sAme, ROR #49 SEP eor2 C3, C3, vAmo +eor sAke_, sE2, sAgi, ROR #3 SEP +eor sAgi_, sE0, sAka, ROR #39 SEP bcax_m1 vAse, vBse, vBso, 
vBsi +eor sAga_, sE3, sAbo SEP +eor sAbo_, sE3, sAmo, ROR #37 SEP +eor sAmo_, sE2, sAmi, ROR #8 SEP +eor sAmi_, sE1, sAke, ROR #56 SEP eor2 C4, C4, vAmu +eor sAge_, sE4, sAgu, ROR #44 SEP +eor sAgu_, sE2, sAsi, ROR #62 SEP bcax_m1 vAsi, vBsi, vBsu, vBso +eor sAsi_, sE4, sAku, ROR #58 SEP +eor sAku_, sE0, sAsa, ROR #25 SEP +eor sAma_, sE4, sAbu, ROR #20 SEP +eor sAbu_, sE4, sAsu, ROR #9 SEP eor2 C0, C0, vAsa +eor sAsu_, sE1, sAse, ROR #23 SEP +eor sAme_, sE0, sAga, ROR #61 SEP bcax_m1 vAso, vBso, vBsa, vBsu +eor sAbe_, sE1, sAge, ROR #19 SEP +load_constant_ptr SEP +restore count, STACK_OFFSET_COUNT SEP +bic tmp, sAgi_, sAge_, ROR #47 SEP +eor sAga, tmp, sAga_, ROR #39 SEP +bic tmp, sAgo_, sAgi_, ROR #42 SEP eor2 C1, C1, vAse +eor sAge, tmp, sAge_, ROR #25 SEP +bic tmp, sAgu_, sAgo_, ROR #16 SEP bcax_m1 vAsu, vBsu, vBse, vBsa +eor sAgi, tmp, sAgi_, ROR #58 SEP +bic tmp, sAga_, sAgu_, ROR #31 SEP +eor sAgo, tmp, sAgo_, ROR #47 SEP +bic tmp, sAge_, sAga_, ROR #56 SEP eor2 C2, C2, vAsi +eor sAgu, tmp, sAgu_, ROR #23 SEP +bic tmp, sAki_, sAke_, ROR #19 SEP eor2 C3, C3, vAso +eor sAka, tmp, sAka_, ROR #24 SEP +bic tmp, sAko_, sAki_, ROR #47 SEP bcax_m1 vAba, vBba, vBbi, vBbe +eor sAke, tmp, sAke_, ROR #2 SEP +bic tmp, sAku_, sAko_, ROR #10 SEP +eor sAki, tmp, sAki_, ROR #57 SEP +bic tmp, sAka_, sAku_, ROR #47 SEP bcax_m1 vAbe, vBbe, vBbo, vBbi +eor sAko, tmp, sAko_, ROR #57 SEP +bic tmp, sAke_, sAka_, ROR #5 SEP +eor sAku, tmp, sAku_, ROR #52 SEP +bic tmp, sAmi_, sAme_, ROR #38 SEP eor2 C1, C1, vAbe +eor sAma, tmp, sAma_, ROR #47 SEP +bic tmp, sAmo_, sAmi_, ROR #5 SEP restore x26, STACK_OFFSET_CONST +eor sAme, tmp, sAme_, ROR #43 SEP ldr vvtmpq, [x26], #16 +bic tmp, sAmu_, sAmo_, ROR #41 SEP save x26, STACK_OFFSET_CONST +eor sAmi, tmp, sAmi_, ROR #46 SEP +bic tmp, sAma_, sAmu_, ROR #35 SEP eor vAba.16b, vAba.16b, vvtmp.16b +ldr cur_const, [const_addr, count, UXTW #3] +add count, count, #1 SEP +eor sAmo, tmp, sAmo_, ROR #12 SEP eor2 C4, C4, vAsu +bic tmp, sAme_, sAma_, ROR 
#9 SEP
+eor sAmu, tmp, sAmu_, ROR #44 SEP bcax_m1 vAbi, vBbi, vBbu, vBbo
+bic tmp, sAsi_, sAse_, ROR #48 SEP
+eor sAsa, tmp, sAsa_, ROR #41 SEP
+bic tmp, sAso_, sAsi_, ROR #2 SEP
+eor sAse, tmp, sAse_, ROR #50 SEP bcax_m1 vAbo, vBbo, vBba, vBbu
+bic tmp, sAsu_, sAso_, ROR #25 SEP
+eor sAsi, tmp, sAsi_, ROR #27 SEP
+bic tmp, sAsa_, sAsu_, ROR #60 SEP
+eor sAso, tmp, sAso_, ROR #21 SEP eor2 C3, C3, vAbo
+bic tmp, sAse_, sAsa_, ROR #57 SEP
+eor sAsu, tmp, sAsu_, ROR #53 SEP eor2 C2, C2, vAbi
+bic tmp, sAbi_, sAbe_, ROR #63 SEP
+eor s_Aba, s_Aba_, tmp, ROR #21 SEP eor2 C0, C0, vAba
+bic tmp, sAbo_, sAbi_, ROR #42 SEP
+eor sAbe, tmp, sAbe_, ROR #41 SEP bcax_m1 vAbu, vBbu, vBbe, vBba
+bic tmp, sAbu_, sAbo_, ROR #57 SEP
+eor sAbi, tmp, sAbi_, ROR #35 SEP
+bic tmp, s_Aba_, sAbu_, ROR #50 SEP
+eor sAbo, tmp, sAbo_, ROR #43 SEP eor2 C4, C4, vAbu
+bic tmp, sAbe_, s_Aba_, ROR #44 SEP
+eor sAbu, tmp, sAbu_, ROR #30 SEP restore(vAga)
+eor s_Aba, s_Aba, cur_const SEP .unreq vvtmp
+ .unreq vvtmpq
+
+.endm
+
+
+// final_rotate: rotate each listed register right by (64 - k), with k given
+// per line below. s_Aba and sAbo do not appear in the list.
+// NOTE(review): presumably this applies rotations left pending by the
+// hybrid_round_* macros - confirm against those macros.
+.macro final_rotate
+ror sAga, sAga,(64-3) SEP
+ror sAka, sAka,(64-25) SEP
+ror sAma, sAma,(64-10) SEP
+ror sAsa, sAsa,(64-39) SEP
+ror sAbe, sAbe,(64-21) SEP
+ror sAge, sAge,(64-45) SEP
+ror sAke, sAke,(64-8) SEP
+ror sAme, sAme,(64-15) SEP
+ror sAse, sAse,(64-41) SEP
+ror sAbi, sAbi,(64-14) SEP
+ror sAgi, sAgi,(64-61) SEP
+ror sAki, sAki,(64-18) SEP
+ror sAmi, sAmi,(64-56) SEP
+ror sAsi, sAsi,(64-2) SEP
+ror sAgo, sAgo,(64-28) SEP
+ror sAko, sAko,(64-1) SEP
+ror sAmo, sAmo,(64-27) SEP
+ror sAso, sAso,(64-62) SEP
+ror sAbu, sAbu,(64-44) SEP
+ror sAgu, sAgu,(64-20) SEP
+ror sAku, sAku,(64-6) SEP
+ror sAmu, sAmu,(64-36) SEP
+ror sAsu, sAsu,(64-55) SEP
+.endm
+
+#define KECCAK_F1600_ROUNDS 24
+
+// keccak_f1600_x5_hybrid_asm_v8p
+//
+// Control flow visible in this routine:
+//   1. Save the incoming input_addr and the address of round_constants_vec
+//      to the stack, then invoke load_input_vector once.
+//   2. Advance input_addr by 2*8*25 and run outer_loop three times
+//      (out_count = 0, 1, 2). Each pass invokes load_input_scalar,
+//      hybrid_round_initial, then hybrid_round_noninitial while
+//      count <= KECCAK_F1600_ROUNDS-3, then final_rotate and
+//      store_input_scalar, and finally advances input_addr by 8*25.
+//   3. Restore the original input_addr and invoke store_input_vector.
+// NOTE(review): the registers behind input_addr/count/out_count and the
+// behaviour of the load_*/store_*/save/restore macros are defined outside
+// this excerpt - confirm there before relying on the offsets above.
+.global keccak_f1600_x5_hybrid_asm_v8p
+.global _keccak_f1600_x5_hybrid_asm_v8p
+.text
+.align 4
+
+keccak_f1600_x5_hybrid_asm_v8p:
+_keccak_f1600_x5_hybrid_asm_v8p:
+    alloc_stack
+    save_gprs
+    save_vregs
+
+    save input_addr, STACK_OFFSET_INPUT
+
+    ASM_LOAD(const_addr,round_constants_vec)
+    save const_addr, STACK_OFFSET_CONST
+
+    load_input_vector
+
+    // Step past the first 2*8*25 bytes of the input buffer.
+    add input_addr, input_addr, #(2*8*25)
+    save input_addr, STACK_OFFSET_CUR_INPUT
+
+    mov out_count, #0
+outer_loop:
+    save out_count, STACK_OFFSET_COUNT_OUT
+
+    load_input_scalar
+    save input_addr, STACK_OFFSET_CUR_INPUT
+
+    hybrid_round_initial
+inner_loop:
+    hybrid_round_noninitial
+    // hybrid_round_noninitial increments count; loop while count <= ROUNDS-3.
+    cmp count, #(KECCAK_F1600_ROUNDS-3)
+    ble inner_loop
+    final_rotate
+
+    restore input_addr, STACK_OFFSET_CUR_INPUT
+    store_input_scalar
+    add input_addr, input_addr, #(8*25)
+
+    restore out_count, STACK_OFFSET_COUNT_OUT
+    add out_count, out_count, #1
+    cmp out_count, #3
+    blt outer_loop
+
+    restore input_addr, STACK_OFFSET_INPUT
+    store_input_vector
+
+    restore_vregs
+    restore_gprs
+    free_stack
+
+    ret
diff --git a/tests/keccak_neon/manual/macros.s b/tests/keccak_neon/manual/macros.s
new file mode 100644
index 0000000..77e0bd4
--- /dev/null
+++ b/tests/keccak_neon/manual/macros.s
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited
+ * Copyright (c) 2022 Matthias Kannwischer
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+//
+// Author: Hanno Becker
+// Author: Matthias Kannwischer
+//
+
+#include
+
+// load_constant_ptr: expands to ASM_LOAD(const_addr, round_constants).
+// NOTE(review): ASM_LOAD comes from the header included above and presumably
+// places the address of round_constants in const_addr - confirm there.
+.macro load_constant_ptr
+    ASM_LOAD(const_addr, round_constants)
+.endm
diff --git a/tests/keccak_neon/manual/third_party/keccakx2_C.c b/tests/keccak_neon/manual/third_party/keccakx2_C.c
new file mode 100644
index 0000000..1ed19d9
--- /dev/null
+++ b/tests/keccak_neon/manual/third_party/keccakx2_C.c
@@ -0,0 +1,330 @@
+
+// Derived, with minor modifications, from public domain implementation
+// in crypto_hash/keccakc512/simple/ from http://bench.cr.yp.to/supercop.html
+// by Ronny Van Keer.
+//
+// To the extent possible under law, the implementer has waived all copyright
+// and related or neighboring rights to the source code in this file. 
+// http://creativecommons.org/publicdomain/zero/1.0/ + + + +#include "../keccak_f1600_variants.h" + +#include +#include + +#define KECCAK_F1600_ROUNDS 24 + +static const uint64_t round_constants[KECCAK_F1600_ROUNDS] = +{ + (uint64_t)0x0000000000000001ULL, + (uint64_t)0x0000000000008082ULL, + (uint64_t)0x800000000000808aULL, + (uint64_t)0x8000000080008000ULL, + (uint64_t)0x000000000000808bULL, + (uint64_t)0x0000000080000001ULL, + (uint64_t)0x8000000080008081ULL, + (uint64_t)0x8000000000008009ULL, + (uint64_t)0x000000000000008aULL, + (uint64_t)0x0000000000000088ULL, + (uint64_t)0x0000000080008009ULL, + (uint64_t)0x000000008000000aULL, + (uint64_t)0x000000008000808bULL, + (uint64_t)0x800000000000008bULL, + (uint64_t)0x8000000000008089ULL, + (uint64_t)0x8000000000008003ULL, + (uint64_t)0x8000000000008002ULL, + (uint64_t)0x8000000000000080ULL, + (uint64_t)0x000000000000800aULL, + (uint64_t)0x800000008000000aULL, + (uint64_t)0x8000000080008081ULL, + (uint64_t)0x8000000000008080ULL, + (uint64_t)0x0000000080000001ULL, + (uint64_t)0x8000000080008008ULL +}; + +#define ROL(a, offset) (((a) << (offset)) ^ ((a) >> (64-(offset)))) +void keccak_f1600_x1_scalar_C( uint64_t state[KECCAK_F1600_X1_STATE_SIZE_UINT64] ) +{ + uint64_t Aba, Abe, Abi, Abo, Abu; + uint64_t Aga, Age, Agi, Ago, Agu; + uint64_t Aka, Ake, Aki, Ako, Aku; + uint64_t Ama, Ame, Ami, Amo, Amu; + uint64_t Asa, Ase, Asi, Aso, Asu; + uint64_t BCa, BCe, BCi, BCo, BCu; + uint64_t Da, De, Di, Do, Du; + uint64_t Eba, Ebe, Ebi, Ebo, Ebu; + uint64_t Ega, Ege, Egi, Ego, Egu; + uint64_t Eka, Eke, Eki, Eko, Eku; + uint64_t Ema, Eme, Emi, Emo, Emu; + uint64_t Esa, Ese, Esi, Eso, Esu; + + //copyFromState(A, state) + Aba = state[ 0]; + Abe = state[ 1]; + Abi = state[ 2]; + Abo = state[ 3]; + Abu = state[ 4]; + Aga = state[ 5]; + Age = state[ 6]; + Agi = state[ 7]; + Ago = state[ 8]; + Agu = state[ 9]; + Aka = state[10]; + Ake = state[11]; + Aki = state[12]; + Ako = state[13]; + Aku = state[14]; + Ama = state[15]; + Ame = 
state[16]; + Ami = state[17]; + Amo = state[18]; + Amu = state[19]; + Asa = state[20]; + Ase = state[21]; + Asi = state[22]; + Aso = state[23]; + Asu = state[24]; + + for( int round = 0; round < KECCAK_F1600_ROUNDS; round += 2 ) + { + // prepareTheta + BCa = Aba^Aga^Aka^Ama^Asa; + BCe = Abe^Age^Ake^Ame^Ase; + BCi = Abi^Agi^Aki^Ami^Asi; + BCo = Abo^Ago^Ako^Amo^Aso; + BCu = Abu^Agu^Aku^Amu^Asu; + + //thetaRhoPiChiIotaPrepareTheta(round , A, E) + Da = BCu^ROL(BCe, 1); + De = BCa^ROL(BCi, 1); + Di = BCe^ROL(BCo, 1); + Do = BCi^ROL(BCu, 1); + Du = BCo^ROL(BCa, 1); + + Aba ^= Da; + BCa = Aba; + Age ^= De; + BCe = ROL(Age, 44); + Aki ^= Di; + BCi = ROL(Aki, 43); + Amo ^= Do; + BCo = ROL(Amo, 21); + Asu ^= Du; + BCu = ROL(Asu, 14); + Eba = BCa ^((~BCe)& BCi ); + Eba ^= (uint64_t)round_constants[round]; + Ebe = BCe ^((~BCi)& BCo ); + Ebi = BCi ^((~BCo)& BCu ); + Ebo = BCo ^((~BCu)& BCa ); + Ebu = BCu ^((~BCa)& BCe ); + + Abo ^= Do; + BCa = ROL(Abo, 28); + Agu ^= Du; + BCe = ROL(Agu, 20); + Aka ^= Da; + BCi = ROL(Aka, 3); + Ame ^= De; + BCo = ROL(Ame, 45); + Asi ^= Di; + BCu = ROL(Asi, 61); + Ega = BCa ^((~BCe)& BCi ); + Ege = BCe ^((~BCi)& BCo ); + Egi = BCi ^((~BCo)& BCu ); + Ego = BCo ^((~BCu)& BCa ); + Egu = BCu ^((~BCa)& BCe ); + + Abe ^= De; + BCa = ROL(Abe, 1); + Agi ^= Di; + BCe = ROL(Agi, 6); + Ako ^= Do; + BCi = ROL(Ako, 25); + Amu ^= Du; + BCo = ROL(Amu, 8); + Asa ^= Da; + BCu = ROL(Asa, 18); + Eka = BCa ^((~BCe)& BCi ); + Eke = BCe ^((~BCi)& BCo ); + Eki = BCi ^((~BCo)& BCu ); + Eko = BCo ^((~BCu)& BCa ); + Eku = BCu ^((~BCa)& BCe ); + + Abu ^= Du; + BCa = ROL(Abu, 27); + Aga ^= Da; + BCe = ROL(Aga, 36); + Ake ^= De; + BCi = ROL(Ake, 10); + Ami ^= Di; + BCo = ROL(Ami, 15); + Aso ^= Do; + BCu = ROL(Aso, 56); + Ema = BCa ^((~BCe)& BCi ); + Eme = BCe ^((~BCi)& BCo ); + Emi = BCi ^((~BCo)& BCu ); + Emo = BCo ^((~BCu)& BCa ); + Emu = BCu ^((~BCa)& BCe ); + + Abi ^= Di; + BCa = ROL(Abi, 62); + Ago ^= Do; + BCe = ROL(Ago, 55); + Aku ^= Du; + BCi = ROL(Aku, 39); + Ama ^= 
Da; + BCo = ROL(Ama, 41); + Ase ^= De; + BCu = ROL(Ase, 2); + Esa = BCa ^((~BCe)& BCi ); + Ese = BCe ^((~BCi)& BCo ); + Esi = BCi ^((~BCo)& BCu ); + Eso = BCo ^((~BCu)& BCa ); + Esu = BCu ^((~BCa)& BCe ); + + // prepareTheta + BCa = Eba^Ega^Eka^Ema^Esa; + BCe = Ebe^Ege^Eke^Eme^Ese; + BCi = Ebi^Egi^Eki^Emi^Esi; + BCo = Ebo^Ego^Eko^Emo^Eso; + BCu = Ebu^Egu^Eku^Emu^Esu; + + //thetaRhoPiChiIotaPrepareTheta(round+1, E, A) + Da = BCu^ROL(BCe, 1); + De = BCa^ROL(BCi, 1); + Di = BCe^ROL(BCo, 1); + Do = BCi^ROL(BCu, 1); + Du = BCo^ROL(BCa, 1); + + Eba ^= Da; + BCa = Eba; + Ege ^= De; + BCe = ROL(Ege, 44); + Eki ^= Di; + BCi = ROL(Eki, 43); + Emo ^= Do; + BCo = ROL(Emo, 21); + Esu ^= Du; + BCu = ROL(Esu, 14); + Aba = BCa ^((~BCe)& BCi ); + Aba ^= (uint64_t)round_constants[round+1]; + Abe = BCe ^((~BCi)& BCo ); + Abi = BCi ^((~BCo)& BCu ); + Abo = BCo ^((~BCu)& BCa ); + Abu = BCu ^((~BCa)& BCe ); + + Ebo ^= Do; + BCa = ROL(Ebo, 28); + Egu ^= Du; + BCe = ROL(Egu, 20); + Eka ^= Da; + BCi = ROL(Eka, 3); + Eme ^= De; + BCo = ROL(Eme, 45); + Esi ^= Di; + BCu = ROL(Esi, 61); + Aga = BCa ^((~BCe)& BCi ); + Age = BCe ^((~BCi)& BCo ); + Agi = BCi ^((~BCo)& BCu ); + Ago = BCo ^((~BCu)& BCa ); + Agu = BCu ^((~BCa)& BCe ); + + Ebe ^= De; + BCa = ROL(Ebe, 1); + Egi ^= Di; + BCe = ROL(Egi, 6); + Eko ^= Do; + BCi = ROL(Eko, 25); + Emu ^= Du; + BCo = ROL(Emu, 8); + Esa ^= Da; + BCu = ROL(Esa, 18); + Aka = BCa ^((~BCe)& BCi ); + Ake = BCe ^((~BCi)& BCo ); + Aki = BCi ^((~BCo)& BCu ); + Ako = BCo ^((~BCu)& BCa ); + Aku = BCu ^((~BCa)& BCe ); + + Ebu ^= Du; + BCa = ROL(Ebu, 27); + Ega ^= Da; + BCe = ROL(Ega, 36); + Eke ^= De; + BCi = ROL(Eke, 10); + Emi ^= Di; + BCo = ROL(Emi, 15); + Eso ^= Do; + BCu = ROL(Eso, 56); + Ama = BCa ^((~BCe)& BCi ); + Ame = BCe ^((~BCi)& BCo ); + Ami = BCi ^((~BCo)& BCu ); + Amo = BCo ^((~BCu)& BCa ); + Amu = BCu ^((~BCa)& BCe ); + + Ebi ^= Di; + BCa = ROL(Ebi, 62); + Ego ^= Do; + BCe = ROL(Ego, 55); + Eku ^= Du; + BCi = ROL(Eku, 39); + Ema ^= Da; + BCo = ROL(Ema, 
41); + Ese ^= De; + BCu = ROL(Ese, 2); + Asa = BCa ^((~BCe)& BCi ); + Ase = BCe ^((~BCi)& BCo ); + Asi = BCi ^((~BCo)& BCu ); + Aso = BCo ^((~BCu)& BCa ); + Asu = BCu ^((~BCa)& BCe ); + } + + //copyToState(state, A) + state[ 0] = Aba; + state[ 1] = Abe; + state[ 2] = Abi; + state[ 3] = Abo; + state[ 4] = Abu; + state[ 5] = Aga; + state[ 6] = Age; + state[ 7] = Agi; + state[ 8] = Ago; + state[ 9] = Agu; + state[10] = Aka; + state[11] = Ake; + state[12] = Aki; + state[13] = Ako; + state[14] = Aku; + state[15] = Ama; + state[16] = Ame; + state[17] = Ami; + state[18] = Amo; + state[19] = Amu; + state[20] = Asa; + state[21] = Ase; + state[22] = Asi; + state[23] = Aso; + state[24] = Asu; +} + +void keccak_f1600_x2_scalar_C(uint64_t state[2*25]) +{ + uint64_t state1[25]; + uint64_t state2[25]; + + // de-interleave + for(size_t i=0;i<25;i++){ + state1[i] = state[2*i+0]; + state2[i] = state[2*i+1]; + } + + keccak_f1600_x1_scalar_C(state1); + keccak_f1600_x1_scalar_C(state2); + + // interleave + for(size_t i=0;i<25;i++){ + state[2*i+0] = state1[i]; + state[2*i+1] = state2[i]; + } +} diff --git a/tests/keccak_neon/manual/third_party/keccakx2_bas.s b/tests/keccak_neon/manual/third_party/keccakx2_bas.s new file mode 100644 index 0000000..ef29c69 --- /dev/null +++ b/tests/keccak_neon/manual/third_party/keccakx2_bas.s @@ -0,0 +1,203 @@ +// MIT License +// +// Copyright (c) 2020 Bas Westerbaan +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the 
Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +// +// With trivial modifications for PQAX +// + +#if defined(__ARM_FEATURE_SHA3) + +#include // For ASM_LOAD only + +.macro load_constant_ptr + ASM_LOAD(const_addr, round_constants) +.endm + +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +const_addr .req x1 + +.macro round + // Execute theta, but without xoring into the state yet. + // Compute parities p[i] = a[i] ^ a[5+i] ^ ... ^ a[20+i]. 
+ eor3 v25.16b, v0.16b, v5.16b, v10.16b + eor3 v26.16b, v1.16b, v6.16b, v11.16b + eor3 v27.16b, v2.16b, v7.16b, v12.16b + eor3 v28.16b, v3.16b, v8.16b, v13.16b + eor3 v29.16b, v4.16b, v9.16b, v14.16b + + eor3 v25.16b, v25.16b, v15.16b, v20.16b + eor3 v26.16b, v26.16b, v16.16b, v21.16b + eor3 v27.16b, v27.16b, v17.16b, v22.16b + eor3 v28.16b, v28.16b, v18.16b, v23.16b + eor3 v29.16b, v29.16b, v19.16b, v24.16b + + rax1 v30.2d, v29.2d, v26.2d // d[0] = rotl(p[1], 1) ^ p[4] + rax1 v29.2d, v27.2d, v29.2d // d[3] = rotl(p[4], 1) ^ p[2] + rax1 v27.2d, v25.2d, v27.2d // d[1] = rotl(p[2], 1) ^ p[0] + rax1 v25.2d, v28.2d, v25.2d // d[4] = rotl(p[0], 1) ^ p[3] + rax1 v28.2d, v26.2d, v28.2d // d[2] = rotl(p[3], 1) ^ p[1] + + // Xor parities from step theta into the state at the same time + // as executing rho and pi. + eor v0.16b, v0.16b, v30.16b + mov v31.16b, v1.16b + xar v1.2d, v6.2d, v27.2d, 20 + xar v6.2d, v9.2d, v25.2d, 44 + xar v9.2d, v22.2d, v28.2d, 3 + xar v22.2d, v14.2d, v25.2d, 25 + xar v14.2d, v20.2d, v30.2d, 46 + xar v20.2d, v2.2d, v28.2d, 2 + xar v2.2d, v12.2d, v28.2d, 21 + xar v12.2d, v13.2d, v29.2d, 39 + xar v13.2d, v19.2d, v25.2d, 56 + xar v19.2d, v23.2d, v29.2d, 8 + xar v23.2d, v15.2d, v30.2d, 23 + xar v15.2d, v4.2d, v25.2d, 37 + xar v4.2d, v24.2d, v25.2d, 50 + xar v24.2d, v21.2d, v27.2d, 62 + xar v21.2d, v8.2d, v29.2d, 9 + xar v8.2d, v16.2d, v27.2d, 19 + xar v16.2d, v5.2d, v30.2d, 28 + xar v5.2d, v3.2d, v29.2d, 36 + xar v3.2d, v18.2d, v29.2d, 43 + xar v18.2d, v17.2d, v28.2d, 49 + xar v17.2d, v11.2d, v27.2d, 54 + xar v11.2d, v7.2d, v28.2d, 58 + xar v7.2d, v10.2d, v30.2d, 61 + xar v10.2d, v31.2d, v27.2d, 63 + + // Chi + bcax v25.16b, v0.16b, v2.16b, v1.16b + bcax v26.16b, v1.16b, v3.16b, v2.16b + bcax v2.16b, v2.16b, v4.16b, v3.16b + bcax v3.16b, v3.16b, v0.16b, v4.16b + bcax v4.16b, v4.16b, v1.16b, v0.16b + mov v0.16b, v25.16b + mov v1.16b, v26.16b + + bcax v25.16b, v5.16b, v7.16b, v6.16b + bcax v26.16b, v6.16b, v8.16b, v7.16b + bcax v7.16b, v7.16b, v9.16b, 
v8.16b + bcax v8.16b, v8.16b, v5.16b, v9.16b + bcax v9.16b, v9.16b, v6.16b, v5.16b + mov v5.16b, v25.16b + mov v6.16b, v26.16b + + bcax v25.16b, v10.16b, v12.16b, v11.16b + bcax v26.16b, v11.16b, v13.16b, v12.16b + bcax v12.16b, v12.16b, v14.16b, v13.16b + bcax v13.16b, v13.16b, v10.16b, v14.16b + bcax v14.16b, v14.16b, v11.16b, v10.16b + mov v10.16b, v25.16b + mov v11.16b, v26.16b + + bcax v25.16b, v15.16b, v17.16b, v16.16b + bcax v26.16b, v16.16b, v18.16b, v17.16b + bcax v17.16b, v17.16b, v19.16b, v18.16b + bcax v18.16b, v18.16b, v15.16b, v19.16b + bcax v19.16b, v19.16b, v16.16b, v15.16b + mov v15.16b, v25.16b + mov v16.16b, v26.16b + + bcax v25.16b, v20.16b, v22.16b, v21.16b + bcax v26.16b, v21.16b, v23.16b, v22.16b + bcax v22.16b, v22.16b, v24.16b, v23.16b + bcax v23.16b, v23.16b, v20.16b, v24.16b + bcax v24.16b, v24.16b, v21.16b, v20.16b + mov v20.16b, v25.16b + mov v21.16b, v26.16b + + // iota + ld1r {v25.2d}, [const_addr], #8 + eor v0.16b, v0.16b, v25.16b +.endm + +.align 4 +.global keccak_f1600_x2_bas +.global _keccak_f1600_x2_bas +keccak_f1600_x2_bas: +_keccak_f1600_x2_bas: + stp d8, d9, [sp,#-16]! + stp d10, d11, [sp,#-16]! + stp d12, d13, [sp,#-16]! + stp d14, d15, [sp,#-16]! 
+ + load_constant_ptr + mov x2, x0 + mov x3, #24 + + ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0], #64 + ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x0], #64 + ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64 + ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64 + ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0], #64 + ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [x0], #64 + ld1 {v24.2d}, [x0] + +loop: + round + + subs x3, x3, #1 + cbnz x3, loop + + mov x0, x2 + st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0], #64 + st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x0], #64 + st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64 + st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64 + st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0], #64 + st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [x0], #64 + st1 {v24.2d}, [x0] + + ldp d14, d15, [sp], #16 + ldp d12, d13, [sp], #16 + ldp d10, d11, [sp], #16 + ldp d8, d9, [sp], #16 + + ret lr + +#endif diff --git a/tests/keccak_neon/manual/third_party/keccakx2_cothan.c b/tests/keccak_neon/manual/third_party/keccakx2_cothan.c new file mode 100644 index 0000000..42a1433 --- /dev/null +++ b/tests/keccak_neon/manual/third_party/keccakx2_cothan.c @@ -0,0 +1,404 @@ +/*============================================================================= + * Copyright (c) 2020 by Cryptographic Engineering Research Group (CERG) + * ECE Department, George Mason University + * Fairfax, VA, U.S.A. + * Author: Duc Tri Nguyen +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+=============================================================================*/ +#include +#include + +#include "../keccak_f1600_variants.h" + +#define NROUNDS 24 +#define SHA3 0 + +#define SHAKE128_RATE 168 +#define SHAKE256_RATE 136 +#define SHA3_256_RATE 136 +#define SHA3_512_RATE 72 + +/* + * Using vld1q_u64_x4 is consider harmful + */ +#ifndef MEM +#define MEM 0 +#endif + +// Define NEON operation + +// Bitwise-XOR: c = a ^ b +#define vxor(c, a, b) c = veorq_u64(a, b); + +#define pack(out, a, b, c, d) \ + out.val[0] = a; \ + out.val[1] = b; \ + out.val[2] = c; \ + out.val[3] = d; + +#define unpack(a, b, c, d, out) \ + a = out.val[0]; \ + b = out.val[1]; \ + c = out.val[2]; \ + d = out.val[3]; + +#if SHA3 == 1 + +/* + * At least ARMv8.2-sha3 supported + */ + +// Xor chain: out = a ^ b ^ c ^ d ^ e +#define vXOR5(out, a, b, c, d, e) \ + out = veor3q_u64(a, b, c); \ + out = veor3q_u64(out, d, e); + +// Rotate left by 1 bit, then XOR: a ^ ROL(b) +#define vRXOR(c, a, b) c = vrax1q_u64(a, b); + +// XOR then Rotate by n bit: c = ROL(a^b, n) +#define vXORR(c, a, b, n) c = vxarq_u64(a, b, n); + +// Xor Not And: out = a ^ ( (~b) & c) +#define vXNA(out, a, b, c) out = vbcaxq_u64(a, c, b); + +#else + +// Rotate left by n bit +#define vROL(out, a, offset) \ + out = vshlq_n_u64(a, (offset)); \ + out = vsriq_n_u64(out, a, 64 - (offset)); + +// Xor chain: out = a ^ b ^ c ^ d ^ e +#define vXOR5(out, a, b, c, d, e) \ + out = veorq_u64(a, b); \ + out = veorq_u64(out, c); \ + out = veorq_u64(out, d); \ + out = veorq_u64(out, e); + +// Xor Not And: out = a ^ ( (~b) & c) +#define vXNA(out, a, b, c) \ + out = vbicq_u64(c, b); \ + out = veorq_u64(out, a); + +#define vRXOR(c, a, b) \ + vROL(c, b, 1); \ + vxor(c, c, a); + +#define vXORR(c, a, b, n) \ + a = veorq_u64(a, b); \ + vROL(c, a, 64 - n); + +#endif + +// End + +/* Keccak round constants */ +static const uint64_t neon_KeccakF_RoundConstants[NROUNDS] = { + (uint64_t)0x0000000000000001ULL, + (uint64_t)0x0000000000008082ULL, + 
(uint64_t)0x800000000000808aULL, + (uint64_t)0x8000000080008000ULL, + (uint64_t)0x000000000000808bULL, + (uint64_t)0x0000000080000001ULL, + (uint64_t)0x8000000080008081ULL, + (uint64_t)0x8000000000008009ULL, + (uint64_t)0x000000000000008aULL, + (uint64_t)0x0000000000000088ULL, + (uint64_t)0x0000000080008009ULL, + (uint64_t)0x000000008000000aULL, + (uint64_t)0x000000008000808bULL, + (uint64_t)0x800000000000008bULL, + (uint64_t)0x8000000000008089ULL, + (uint64_t)0x8000000000008003ULL, + (uint64_t)0x8000000000008002ULL, + (uint64_t)0x8000000000000080ULL, + (uint64_t)0x000000000000800aULL, + (uint64_t)0x800000008000000aULL, + (uint64_t)0x8000000080008081ULL, + (uint64_t)0x8000000000008080ULL, + (uint64_t)0x0000000080000001ULL, + (uint64_t)0x8000000080008008ULL}; + +/************************************************* + * Name: KeccakF1600_StatePermutex2 + * + * Description: The Keccak F1600 Permutation + * + * Arguments: - v128 *state: pointer to input/output Keccak state + **************************************************/ +void keccak_f1600_x2_neon_C_cothan(v128 state[25]) +{ + v128 Aba, Abe, Abi, Abo, Abu; + v128 Aga, Age, Agi, Ago, Agu; + v128 Aka, Ake, Aki, Ako, Aku; + v128 Ama, Ame, Ami, Amo, Amu; + v128 Asa, Ase, Asi, Aso, Asu; + v128 BCa, BCe, BCi, BCo, BCu; // tmp + v128 Da, De, Di, Do, Du; // D + v128 Eba, Ebe, Ebi, Ebo, Ebu; + v128 Ega, Ege, Egi, Ego, Egu; + v128 Eka, Eke, Eki, Eko, Eku; + v128 Ema, Eme, Emi, Emo, Emu; + v128 Esa, Ese, Esi, Eso, Esu; + +#if MEM == 1 + uint64x2x4_t holder; + + holder = vld1q_u64_x4((uint64_t *)&state[0]); + unpack(Aba, Abe, Abi, Abo, holder); + + holder = vld1q_u64_x4((uint64_t *)&state[4]); + unpack(Abu, Aga, Age, Agi, holder); + + holder = vld1q_u64_x4((uint64_t *)&state[8]); + unpack(Ago, Agu, Aka, Ake, holder); + + holder = vld1q_u64_x4((uint64_t *)&state[12]); + unpack(Aki, Ako, Aku, Ama, holder); + + holder = vld1q_u64_x4((uint64_t *)&state[16]); + unpack(Ame, Ami, Amo, Amu, holder); + + holder = vld1q_u64_x4((uint64_t 
*)&state[20]); + unpack(Asa, Ase, Asi, Aso, holder); + + Asu = vld1q_u64((uint64_t *)&state[24]); +#else + Aba = state[0]; + Abe = state[1]; + Abi = state[2]; + Abo = state[3]; + Abu = state[4]; + Aga = state[5]; + Age = state[6]; + Agi = state[7]; + Ago = state[8]; + Agu = state[9]; + Aka = state[10]; + Ake = state[11]; + Aki = state[12]; + Ako = state[13]; + Aku = state[14]; + Ama = state[15]; + Ame = state[16]; + Ami = state[17]; + Amo = state[18]; + Amu = state[19]; + Asa = state[20]; + Ase = state[21]; + Asi = state[22]; + Aso = state[23]; + Asu = state[24]; +#endif + + for (int round = 0; round < NROUNDS; round += 2) + { + // prepareTheta + vXOR5(BCa, Aba, Aga, Aka, Ama, Asa); + vXOR5(BCe, Abe, Age, Ake, Ame, Ase); + vXOR5(BCi, Abi, Agi, Aki, Ami, Asi); + vXOR5(BCo, Abo, Ago, Ako, Amo, Aso); + vXOR5(BCu, Abu, Agu, Aku, Amu, Asu); + + vRXOR(Da, BCu, BCe); + vRXOR(De, BCa, BCi); + vRXOR(Di, BCe, BCo); + vRXOR(Do, BCi, BCu); + vRXOR(Du, BCo, BCa); + + vxor(Aba, Aba, Da); + vXORR(BCe, Age, De, 20); + vXORR(BCi, Aki, Di, 21); + vXORR(BCo, Amo, Do, 43); + vXORR(BCu, Asu, Du, 50); + + vXNA(Eba, Aba, BCe, BCi); + vxor(Eba, Eba, vld1q_dup_u64(&neon_KeccakF_RoundConstants[round])); + vXNA(Ebe, BCe, BCi, BCo); + vXNA(Ebi, BCi, BCo, BCu); + vXNA(Ebo, BCo, BCu, Aba); + vXNA(Ebu, BCu, Aba, BCe); + + vXORR(BCa, Abo, Do, 36); + vXORR(BCe, Agu, Du, 44); + vXORR(BCi, Aka, Da, 61); + vXORR(BCo, Ame, De, 19); + vXORR(BCu, Asi, Di, 3); + + vXNA(Ega, BCa, BCe, BCi); + vXNA(Ege, BCe, BCi, BCo); + vXNA(Egi, BCi, BCo, BCu); + vXNA(Ego, BCo, BCu, BCa); + vXNA(Egu, BCu, BCa, BCe); + + vXORR(BCa, Abe, De, 63); + vXORR(BCe, Agi, Di, 58); + vXORR(BCi, Ako, Do, 39); + vXORR(BCo, Amu, Du, 56); + vXORR(BCu, Asa, Da, 46); + + vXNA(Eka, BCa, BCe, BCi); + vXNA(Eke, BCe, BCi, BCo); + vXNA(Eki, BCi, BCo, BCu); + vXNA(Eko, BCo, BCu, BCa); + vXNA(Eku, BCu, BCa, BCe); + + vXORR(BCa, Abu, Du, 37); + vXORR(BCe, Aga, Da, 28); + vXORR(BCi, Ake, De, 54); + vXORR(BCo, Ami, Di, 49); + vXORR(BCu, Aso, Do, 
8); + + vXNA(Ema, BCa, BCe, BCi); + vXNA(Eme, BCe, BCi, BCo); + vXNA(Emi, BCi, BCo, BCu); + vXNA(Emo, BCo, BCu, BCa); + vXNA(Emu, BCu, BCa, BCe); + + vXORR(BCa, Abi, Di, 2); + vXORR(BCe, Ago, Do, 9); + vXORR(BCi, Aku, Du, 25); + vXORR(BCo, Ama, Da, 23); + vXORR(BCu, Ase, De, 62); + + vXNA(Esa, BCa, BCe, BCi); + vXNA(Ese, BCe, BCi, BCo); + vXNA(Esi, BCi, BCo, BCu); + vXNA(Eso, BCo, BCu, BCa); + vXNA(Esu, BCu, BCa, BCe); + + // Next Round + + // prepareTheta + vXOR5(BCa, Eba, Ega, Eka, Ema, Esa); + vXOR5(BCe, Ebe, Ege, Eke, Eme, Ese); + vXOR5(BCi, Ebi, Egi, Eki, Emi, Esi); + vXOR5(BCo, Ebo, Ego, Eko, Emo, Eso); + vXOR5(BCu, Ebu, Egu, Eku, Emu, Esu); + + // thetaRhoPiChiIotaPrepareTheta(round+1, E, A) + vRXOR(Da, BCu, BCe); + vRXOR(De, BCa, BCi); + vRXOR(Di, BCe, BCo); + vRXOR(Do, BCi, BCu); + vRXOR(Du, BCo, BCa); + + vxor(Eba, Eba, Da); + vXORR(BCe, Ege, De, 20); + vXORR(BCi, Eki, Di, 21); + vXORR(BCo, Emo, Do, 43); + vXORR(BCu, Esu, Du, 50); + + vXNA(Aba, Eba, BCe, BCi); + vxor(Aba, Aba, vld1q_dup_u64(&neon_KeccakF_RoundConstants[round + 1])); + vXNA(Abe, BCe, BCi, BCo); + vXNA(Abi, BCi, BCo, BCu); + vXNA(Abo, BCo, BCu, Eba); + vXNA(Abu, BCu, Eba, BCe); + + vXORR(BCa, Ebo, Do, 36); + vXORR(BCe, Egu, Du, 44); + vXORR(BCi, Eka, Da, 61); + vXORR(BCo, Eme, De, 19); + vXORR(BCu, Esi, Di, 3); + + vXNA(Aga, BCa, BCe, BCi); + vXNA(Age, BCe, BCi, BCo); + vXNA(Agi, BCi, BCo, BCu); + vXNA(Ago, BCo, BCu, BCa); + vXNA(Agu, BCu, BCa, BCe); + + vXORR(BCa, Ebe, De, 63); + vXORR(BCe, Egi, Di, 58); + vXORR(BCi, Eko, Do, 39); + vXORR(BCo, Emu, Du, 56); + vXORR(BCu, Esa, Da, 46); + + vXNA(Aka, BCa, BCe, BCi); + vXNA(Ake, BCe, BCi, BCo); + vXNA(Aki, BCi, BCo, BCu); + vXNA(Ako, BCo, BCu, BCa); + vXNA(Aku, BCu, BCa, BCe); + + vXORR(BCa, Ebu, Du, 37); + vXORR(BCe, Ega, Da, 28); + vXORR(BCi, Eke, De, 54); + vXORR(BCo, Emi, Di, 49); + vXORR(BCu, Eso, Do, 8); + + vXNA(Ama, BCa, BCe, BCi); + vXNA(Ame, BCe, BCi, BCo); + vXNA(Ami, BCi, BCo, BCu); + vXNA(Amo, BCo, BCu, BCa); + vXNA(Amu, BCu, BCa, 
BCe); + + vXORR(BCa, Ebi, Di, 2); + vXORR(BCe, Ego, Do, 9); + vXORR(BCi, Eku, Du, 25); + vXORR(BCo, Ema, Da, 23); + vXORR(BCu, Ese, De, 62); + + vXNA(Asa, BCa, BCe, BCi); + vXNA(Ase, BCe, BCi, BCo); + vXNA(Asi, BCi, BCo, BCu); + vXNA(Aso, BCo, BCu, BCa); + vXNA(Asu, BCu, BCa, BCe); + } + +#if MEM == 1 + pack(holder, Aba, Abe, Abi, Abo); + vst1q_u64_x4((uint64_t *)&state[0], holder); + + pack(holder, Abu, Aga, Age, Agi); + vst1q_u64_x4((uint64_t *)&state[4], holder); + + pack(holder, Ago, Agu, Aka, Ake); + vst1q_u64_x4((uint64_t *)&state[8], holder); + + pack(holder, Aki, Ako, Aku, Ama); + vst1q_u64_x4((uint64_t *)&state[12], holder); + + pack(holder, Ame, Ami, Amo, Amu); + vst1q_u64_x4((uint64_t *)&state[16], holder); + + pack(holder, Asa, Ase, Asi, Aso); + vst1q_u64_x4((uint64_t *)&state[20], holder); + + vst1q_u64((uint64_t *)&state[24], Asu); +#else + state[0] = Aba; + state[1] = Abe; + state[2] = Abi; + state[3] = Abo; + state[4] = Abu; + state[5] = Aga; + state[6] = Age; + state[7] = Agi; + state[8] = Ago; + state[9] = Agu; + state[10] = Aka; + state[11] = Ake; + state[12] = Aki; + state[13] = Ako; + state[14] = Aku; + state[15] = Ama; + state[16] = Ame; + state[17] = Ami; + state[18] = Amo; + state[19] = Amu; + state[20] = Asa; + state[21] = Ase; + state[22] = Asi; + state[23] = Aso; + state[24] = Asu; +#endif +} diff --git a/tests/ntt_kyber/main.c b/tests/ntt_kyber/main.c new file mode 100644 index 0000000..1ec48c3 --- /dev/null +++ b/tests/ntt_kyber/main.c @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2022 Arm Limited + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do 
so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *
+ * Author: Hanno Becker
+ */
+
+#define TEST_FOO
+#define BENCH_FOO
+
+/*
+ * Some external references to auto-generated assembly.
+ */
+
+#include
+#include
+#include
+
+#define WARMUP_ITERATIONS 1000
+#define ITER_PER_TEST 100
+#define TEST_COUNT 100
+
+/* Add declarations for ASM NTTs here */
+void ntt_kyber_123_4567(int16_t *);
+
+#define NTT_LAYERS 8
+#define NTT_SIZE (1u << NTT_LAYERS)
+#define NTT_ROOT_ORDER (2 * NTT_SIZE)
+#define NTT_INCOMPLETE_LAYERS 7
+#define NTT_INCOMPLETE_SIZE (1u << NTT_INCOMPLETE_LAYERS)
+#define NTT_LAYER_GAP ( NTT_LAYERS - NTT_INCOMPLETE_LAYERS )
+#define NTT_LAYER_STRIDE (1u << NTT_LAYER_GAP )
+
+#include
+#include
+#include
+
+/*
+ * Test cases
+ */
+
+int16_t base_root = 17;
+int16_t modulus = 3329;
+uint16_t modulus_inv_u16 = 62209;
+
+int16_t roots [NTT_ROOT_ORDER / 2] __attribute__((aligned(16))) = { 0 };
+uint16_t roots_twisted[NTT_ROOT_ORDER / 2] __attribute__((aligned(16))) = { 0 };
+
+/*
+ * Fill roots[i] with mod_pow_s16( base_root, i, modulus ) and
+ * roots_twisted[i] with roots[i] * modulus_inv_u16 truncated to 16 bits,
+ * for i in [0, NTT_ROOT_ORDER / 2).
+ */
+void build_roots()
+{
+    for( unsigned i=0; i < NTT_ROOT_ORDER / 2; i++ )
+    {
+        roots[i] = mod_pow_s16( base_root, i, modulus );
+        roots_twisted[i] = roots[i] * modulus_inv_u16;
+    }
+}
+
+/* Return the low `width` bits of `in` in reversed bit order. */
+unsigned bit_reverse( unsigned in, unsigned width )
+{
+    unsigned out = 0;
+    while( width-- )
+    {
+        out <<= 1;
+        out |= ( in % 2 );
+        in >>= 1;
+    }
+    return( out );
+}
+
+/*
+ * qsort() comparator ordering uint64_t values ascending.
+ *
+ * The operands are compared explicitly: narrowing the 64-bit difference to
+ * int drops the upper bits, so values differing by 2^31 or more could be
+ * reported with the wrong sign (or as equal).
+ */
+static int cmp_uint64_t(const void *a, const void *b)
+{
+    const uint64_t lhs = *(const uint64_t *)a;
+    const uint64_t rhs = *(const uint64_t *)b;
+    return( ( lhs > rhs ) - ( lhs < rhs ) );
+}
+
+void ntt_s16_C( int16_t *src )
+{
+    int16_t res[NTT_SIZE];
+    build_roots();
+
+    for( unsigned t=0; t= ( NTT_ROOT_ORDER / 2 ) );
+            exp = exp % ( NTT_ROOT_ORDER / 2 );
+
+            cur = mod_mul_s16( src[NTT_LAYER_STRIDE*j+t],
+                               roots[exp],
+                               modulus );
+
+            if( !sub )
+                tmp = mod_add_s16( tmp, cur, modulus );
+            else
+                tmp = mod_sub_s16( tmp, cur, modulus );
+            }
+            res[NTT_LAYER_STRIDE*i+t] = tmp;
+        }
+    }
+
+    memcpy( src, res, sizeof( res ) );
+}
+
+/*
+ * Transpose, in place, every 4x4 block of 32-bit words in src, where one
+ * word is a pair of adjacent int16_t coefficients.
+ *
+ * The pairs are moved with memcpy() on the int16_t buffer itself; accessing
+ * the buffer through an int32_t pointer would break the strict-aliasing rule.
+ */
+void buf_bitrev_4( int16_t *src )
+{
+    /* One block is 16 words, i.e. 32 int16_t coefficients. */
+    for( unsigned i=0; i < NTT_SIZE; i += 32 )
+    {
+        int16_t tmp[32];
+        memcpy( tmp, src + i, sizeof( tmp ) );
+
+        for( unsigned t0=0; t0 < 4; t0++ )
+            for( unsigned t1=0; t1 < 4; t1++ )
+                memcpy( src + i + 2 * ( t0*4 + t1 ),
+                        tmp + 2 * ( t1*4 + t0 ),
+                        2 * sizeof( int16_t ) );
+    }
+}
+
+/*
+ * MAKE_TEST_FWD(var,func,rev4) defines test_ntt_<var>(): it runs func and the
+ * C reference ntt_s16_C on the same random input (the reference output is
+ * passed through buf_bitrev_4 when rev4 is non-zero), reduces both results
+ * and returns 1 on mismatch, 0 otherwise.
+ */
+#define MAKE_TEST_FWD(var,func,rev4) \
+int test_ntt_ ## var () \
+{ \
+    debug_test_start( "NTT s16 for " #func ); \
+    int16_t src[NTT_SIZE] __attribute__((aligned(16))); \
+    int16_t src_copy[NTT_SIZE] __attribute__((aligned(16))); \
+ \
+    /* Setup input */ \
+    fill_random_u16( (uint16_t*) src, NTT_SIZE ); \
+    mod_reduce_buf_s16( src, NTT_SIZE, modulus ); \
+ \
+    /* Step 1: Reference NTT */ \
+    memcpy( src_copy, src, sizeof( src ) ); \
+    ntt_s16_C( src_copy ); \
+    mod_reduce_buf_s16( src_copy, NTT_SIZE, modulus ); \
+ \
+    if( rev4 ) \
+        buf_bitrev_4( src_copy ); \
+ \
+    /* Step 2: Neon-based NTT */ \
+    (func)( src ); \
+ \
+    mod_reduce_buf_s16( src, NTT_SIZE, modulus ); \
+    if( compare_buf_u16( (uint16_t const*) src, (uint16_t const*) src_copy, \
+                         NTT_SIZE ) != 0 ) \
+    { \
+        debug_print_buf_s16( src_copy, NTT_SIZE, "Reference" ); \
+        debug_print_buf_s16( src, NTT_SIZE, "Neon" ); \
+        debug_test_fail(); \
+        return( 1 ); \
+    } \
+    debug_test_ok(); \
+ \
+    return( 0 ); \
+}
+
+MAKE_TEST_FWD(asm,ntt_kyber_123_4567,1)
+
+uint64_t t0, t1;
+uint64_t cycles[TEST_COUNT];
+
+/*
+ * MAKE_BENCH(var,func) defines bench_ntt_<var>(): after WARMUP_ITERATIONS
+ * calls it records TEST_COUNT samples, each the cycle count of ITER_PER_TEST
+ * calls divided by ITER_PER_TEST, sorts them and prints the middle sample.
+ * The input buffer is zero-initialised so func never reads indeterminate
+ * values, and the printf arguments are cast to match their conversions.
+ */
+#define MAKE_BENCH(var,func) \
+int bench_ntt_ ## var () \
+{ \
+    int16_t src[NTT_SIZE] __attribute__((aligned(16))) = { 0 }; \
+ \
+    for( unsigned cnt=0; cnt < WARMUP_ITERATIONS; cnt++ ) \
+        (func)( src ); \
+ \
+    for( unsigned cnt=0; cnt < TEST_COUNT; cnt++ ) \
+    { \
+        t0 = get_cyclecounter(); \
+        for( unsigned cntp=0; cntp < ITER_PER_TEST; cntp++ ) \
+            (func)( src ); \
+        t1 = get_cyclecounter(); \
+        cycles[cnt] = (t1 - t0) / ITER_PER_TEST; \
+    } \
+ \
+    /* Report median */ \
+    qsort( cycles, TEST_COUNT, sizeof(uint64_t), cmp_uint64_t ); \
+    debug_printf( "Median after %u NTTs: %llu cycles\n", \
+                  (unsigned) TEST_COUNT, \
+                  (unsigned long long) cycles[TEST_COUNT >> 1] ); \
+ \
+    return( 0 ); \
+}
+
+MAKE_BENCH(asm,ntt_kyber_123_4567)
+
+/* Run the benchmark, then the correctness test; exit status 1 on mismatch. */
+int main( void )
+{
+    debug_test_start("Kyber NTT test");
+
+    /* Benchs */
+    bench_ntt_asm();
+
+    /* Tests */
+    if( test_ntt_asm()!= 0 )
+        return(1);
+
+    debug_test_ok();
+    return(0);
+}
diff --git a/tests/ntt_kyber/manual/dummy b/tests/ntt_kyber/manual/dummy
new file mode 100644
index 0000000..e69de29
diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_0_0.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_0_0.s
new file mode 100644
index 0000000..85f29ad
--- /dev/null
+++ b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_0_0.s
@@ -0,0 +1,2422 @@
+
+///
+/// Copyright (c) 2021 Arm Limited
+/// SPDX-License-Identifier: MIT
+///
+/// Permission is hereby granted, free of charge, to any person obtaining a copy
+/// of this software and associated documentation files (the "Software"), to deal
+/// in the Software without restriction, including without limitation the rights
+/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+/// copies of the Software, and to permit persons to whom the Software is
+/// furnished to do so, subject to the following conditions:
+///
+/// The above copyright notice and this permission notice shall be included in all
+/// copies or substantial portions of the Software.
+/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None 
+.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 
7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 
// Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 
+.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // 
Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, 
block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 
839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 +.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, 
block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 
104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 
// Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_0_0 +.global _ntt_u32_full_neon_asm_var_4_4_0_0 +ntt_u32_full_neon_asm_var_4_4_0_0: +_ntt_u32_full_neon_asm_var_4_4_0_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +ldr q2, [x0, #544] +ldr 
q1, [x0, #608] +ldr q0, [x0, #672] +ldr q15, [x0, #736] +ldr q14, [x0, #32] +ldr q13, [x0, #96] +ldr q12, [x0, #160] +ldr q11, [x0, #224] +sqrdmulh v10.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +mla v22.4S, v10.4S, v31.s[0] +sub v10.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v20.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +mla v19.4S, v20.4S, v31.s[0] +sub v20.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +mla v2.4S, v19.4S, v31.s[0] +sub v19.4s, v14.4s, v2.4s +add v14.4s, v14.4s, v2.4s +sqrdmulh v2.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +mla v1.4S, v2.4S, v31.s[0] +sub v2.4s, v13.4s, v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[0] +mul v0.4S, v0.4S,v30.s[0] +mla v0.4S, v1.4S, v31.s[0] +sub v1.4s, v12.4s, v0.4s +add v12.4s, v12.4s, v0.4s +sqrdmulh v0.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +mla v15.4S, v0.4S, v31.s[0] +sub v0.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v12.4s, v16.4s +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v3.4S, v16.4S, v31.s[0] +sub v16.4s, v11.4s, v3.4s +add v11.4s, v11.4s, v3.4s +sqrdmulh v3.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +mla v18.4S, v3.4S, v31.s[0] +sub v3.4s, v14.4s, v18.4s +add v14.4s, v14.4s, v18.4s +sqrdmulh v18.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +mla v17.4S, v18.4S, v31.s[0] +sub v18.4s, v13.4s, v17.4s +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +mla v21.4S, v17.4S, v31.s[0] +sub 
v17.4s, v1.4s, v21.4s +add v1.4s, v1.4s, v21.4s +sqrdmulh v21.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +mla v20.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v20.4s +add v0.4s, v0.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +mla v10.4S, v20.4S, v31.s[0] +sub v20.4s, v19.4s, v10.4s +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +mla v22.4S, v10.4S, v31.s[0] +sub v10.4s, v2.4s, v22.4s +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v11.4S, v27.s[0] +mul v11.4S, v11.4S,v28.s[0] +mla v11.4S, v12.4S, v31.s[0] +sub v12.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +mla v15.4S, v11.4S, v31.s[0] +sub v11.4s, v3.4s, v15.4s +add v3.4s, v3.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v16.4s +add v18.4s, v18.4s, v16.4s +sqrdmulh v16.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +mla v1.4S, v16.4S, v31.s[0] +sub v16.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v27.s[2] +mul v0.4S, v0.4S,v28.s[2] +mla v0.4S, v1.4S, v31.s[0] +sub v1.4s, v2.4s, v0.4s +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +mla v17.4S, v0.4S, v31.s[0] +sub v0.4s, v20.4s, v17.4s +add v20.4s, v20.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v25.s[0] +mul v13.4S, v13.4S,v26.s[0] +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +sqrdmulh v13.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v18.4S, 
v25.s[2] +mul v18.4S, v18.4S,v26.s[2] +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v3.4s, v18.4s +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v15.4S, v25.s[3] +mul v15.4S, v15.4S,v26.s[3] +mla v15.4S, v18.4S, v31.s[0] +sub v18.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v23.s[0] +mul v2.4S, v2.4S,v24.s[0] +mla v2.4S, v15.4S, v31.s[0] +sub v15.4s, v19.4s, v2.4s +add v19.4s, v19.4s, v2.4s +sqrdmulh v2.4S, v1.4S, v23.s[1] +mul v1.4S, v1.4S,v24.s[1] +mla v1.4S, v2.4S, v31.s[0] +sub v2.4s, v16.4s, v1.4s +add v16.4s, v16.4s, v1.4s +sqrdmulh v1.4S, v10.4S, v23.s[2] +mul v10.4S, v10.4S,v24.s[2] +mla v10.4S, v1.4S, v31.s[0] +sub v1.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +mla v17.4S, v10.4S, v31.s[0] +sub v10.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +str q14, [x0, #32] +str q21, [x0, #96] +str q22, [x0, #160] +str q13, [x0, #224] +str q3, [x0, #288] +str q12, [x0, #352] +str q11, [x0, #416] +str q18, [x0, #480] +str q19, [x0, #544] +str q15, [x0, #608] +str q16, [x0, #672] +str q2, [x0, #736] +str q20, [x0, #800] +str q1, [x0, #864] +str q0, [x0, #928] +str q10, [x0, #992] +ldr q10, [x0, #816] +ldr q0, [x0, #880] +ldr q1, [x0, #944] +ldr q20, [x0, #1008] +ldr q2, [x0, #304] +ldr q16, [x0, #368] +ldr q15, [x0, #432] +ldr q19, [x0, #496] +ldr q18, [x0, #560] +ldr q11, [x0, #624] +ldr q12, [x0, #688] +ldr q3, [x0, #752] +ldr q13, [x0, #48] +ldr q22, [x0, #112] +ldr q21, [x0, #176] +ldr q14, [x0, #240] +sqrdmulh v17.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +mla v10.4S, v17.4S, v31.s[0] +sub v17.4s, v2.4s, v10.4s +add v2.4s, v2.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v29.s[0] +mul v0.4S, v0.4S,v30.s[0] +mla v0.4S, v10.4S, v31.s[0] +sub v10.4s, v16.4s, v0.4s +add v16.4s, v16.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +mla v1.4S, v0.4S, v31.s[0] +sub v0.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s +sqrdmulh v1.4S, v20.4S, v29.s[0] +mul v20.4S, 
v20.4S,v30.s[0] +mla v20.4S, v1.4S, v31.s[0] +sub v1.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +sqrdmulh v20.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +mla v18.4S, v20.4S, v31.s[0] +sub v20.4s, v13.4s, v18.4s +add v13.4s, v13.4s, v18.4s +sqrdmulh v18.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +mla v12.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +mla v3.4S, v12.4S, v31.s[0] +sub v12.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sqrdmulh v3.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +mla v15.4S, v3.4S, v31.s[0] +sub v3.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +sqrdmulh v15.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +mla v19.4S, v15.4S, v31.s[0] +sub v15.4s, v14.4s, v19.4s +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v2.4S, v29.s[1] +mul v2.4S, v2.4S,v30.s[1] +mla v2.4S, v19.4S, v31.s[0] +sub v19.4s, v13.4s, v2.4s +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +mla v16.4S, v2.4S, v31.s[0] +sub v2.4s, v22.4s, v16.4s +add v22.4s, v22.4s, v16.4s +sqrdmulh v16.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +mla v0.4S, v16.4S, v31.s[0] +sub v16.4s, v11.4s, v0.4s +add v11.4s, v11.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +mla v1.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v1.4s +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +mla v17.4S, v1.4S, v31.s[0] +sub v1.4s, v20.4s, v17.4s +add v20.4s, v20.4s, v17.4s +sqrdmulh v17.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +mla v10.4S, v17.4S, v31.s[0] +sub v17.4s, v18.4s, v10.4s +add v18.4s, v18.4s, v10.4s +sqrdmulh v10.4S, v21.4S, v27.s[0] +mul v21.4S, v21.4S,v28.s[0] +mla v21.4S, v10.4S, v31.s[0] +sub v10.4s, v13.4s, v21.4s +add v13.4s, 
v13.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +mla v14.4S, v21.4S, v31.s[0] +sub v21.4s, v22.4s, v14.4s +add v22.4s, v22.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +mla v3.4S, v14.4S, v31.s[0] +sub v14.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sqrdmulh v3.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +mla v15.4S, v3.4S, v31.s[0] +sub v3.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +mla v11.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +sqrdmulh v11.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +mla v12.4S, v11.4S, v31.s[0] +sub v11.4s, v18.4s, v12.4s +add v18.4s, v18.4s, v12.4s +sqrdmulh v12.4S, v16.4S, v27.s[3] +mul v16.4S, v16.4S,v28.s[3] +mla v16.4S, v12.4S, v31.s[0] +sub v12.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +sqrdmulh v16.4S, v0.4S, v27.s[3] +mul v0.4S, v0.4S,v28.s[3] +mla v0.4S, v16.4S, v31.s[0] +sub v16.4s, v17.4s, v0.4s +add v17.4s, v17.4s, v0.4s +sqrdmulh v0.4S, v22.4S, v25.s[0] +mul v22.4S, v22.4S,v26.s[0] +mla v22.4S, v0.4S, v31.s[0] +sub v0.4s, v13.4s, v22.4s +add v13.4s, v13.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v25.s[1] +mul v21.4S, v21.4S,v26.s[1] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v2.4S, v25.s[2] +mul v2.4S, v2.4S,v26.s[2] +mla v2.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v2.4s +add v19.4s, v19.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v25.s[3] +mul v3.4S, v3.4S,v26.s[3] +mla v3.4S, v2.4S, v31.s[0] +sub v2.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sqrdmulh v3.4S, v18.4S, v23.s[0] +mul v18.4S, v18.4S,v24.s[0] +mla v18.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v18.4s +add v20.4s, v20.4s, v18.4s +sqrdmulh v18.4S, v11.4S, v23.s[1] +mul v11.4S, v11.4S,v24.s[1] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v15.4s, v11.4s +add v15.4s, v15.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v23.s[2] +mul v17.4S, v17.4S,v24.s[2] +mla v17.4S, 
v11.4S, v31.s[0] +sub v11.4s, v1.4s, v17.4s +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v16.4S, v23.s[3] +mul v16.4S, v16.4S,v24.s[3] +mla v16.4S, v17.4S, v31.s[0] +sub v17.4s, v12.4s, v16.4s +add v12.4s, v12.4s, v16.4s +str q13, [x0, #48] +str q0, [x0, #112] +str q10, [x0, #176] +str q22, [x0, #240] +str q19, [x0, #304] +str q21, [x0, #368] +str q14, [x0, #432] +str q2, [x0, #496] +str q20, [x0, #560] +str q3, [x0, #624] +str q15, [x0, #688] +str q18, [x0, #752] +str q1, [x0, #816] +str q11, [x0, #880] +str q12, [x0, #944] +str q17, [x0, #1008] +ldr q17, [x0, #768] +ldr q12, [x0, #832] +ldr q11, [x0, #896] +ldr q1, [x0, #960] +ldr q18, [x0, #256] +ldr q15, [x0, #320] +ldr q3, [x0, #384] +ldr q20, [x0, #448] +ldr q2, [x0, #512] +ldr q14, [x0, #576] +ldr q21, [x0, #640] +ldr q19, [x0, #704] +ldr q22, [x0, #0] +ldr q10, [x0, #64] +ldr q0, [x0, #128] +ldr q13, [x0, #192] +sqrdmulh v16.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v18.4s, v17.4s +add v18.4s, v18.4s, v17.4s +sqrdmulh v17.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v15.4s, v12.4s +add v15.4s, v15.4s, v12.4s +sqrdmulh v12.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +mla v11.4S, v12.4S, v31.s[0] +sub v12.4s, v3.4s, v11.4s +add v3.4s, v3.4s, v11.4s +sqrdmulh v11.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +mla v1.4S, v11.4S, v31.s[0] +sub v11.4s, v20.4s, v1.4s +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +mla v2.4S, v1.4S, v31.s[0] +sub v1.4s, v22.4s, v2.4s +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +mla v14.4S, v2.4S, v31.s[0] +sub v2.4s, v10.4s, v14.4s +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +mla v19.4S, v21.4S, 
v31.s[0] +sub v21.4s, v13.4s, v19.4s +add v13.4s, v13.4s, v19.4s +sqrdmulh v19.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v3.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v3.4s +add v0.4s, v0.4s, v3.4s +sqrdmulh v3.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +mla v20.4S, v3.4S, v31.s[0] +sub v3.4s, v13.4s, v20.4s +add v13.4s, v13.4s, v20.4s +sqrdmulh v20.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +mla v18.4S, v20.4S, v31.s[0] +sub v20.4s, v22.4s, v18.4s +add v22.4s, v22.4s, v18.4s +sqrdmulh v18.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +mla v15.4S, v18.4S, v31.s[0] +sub v18.4s, v10.4s, v15.4s +add v10.4s, v10.4s, v15.4s +sqrdmulh v15.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +mla v11.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +mla v16.4S, v11.4S, v31.s[0] +sub v11.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v2.4s, v17.4s +add v2.4s, v2.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[0] +mul v0.4S, v0.4S,v28.s[0] +mla v0.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v0.4s +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +mla v13.4S, v0.4S, v31.s[0] +sub v0.4s, v10.4s, v13.4s +add v10.4s, v10.4s, v13.4s +sqrdmulh v13.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +mla v19.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +sqrdmulh v19.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +mla v3.4S, v19.4S, v31.s[0] +sub v19.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v27.s[2] +mul v14.4S, v14.4S,v28.s[2] +mla v14.4S, v3.4S, v31.s[0] +sub v3.4s, v1.4s, v14.4s +add v1.4s, v1.4s, v14.4s +sqrdmulh v14.4S, 
v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v2.4s, v21.4s +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +mla v15.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v12.4S, v27.s[3] +mul v12.4S, v12.4S,v28.s[3] +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v16.4s, v12.4s +add v16.4s, v16.4s, v12.4s +sqrdmulh v12.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +mla v10.4S, v12.4S, v31.s[0] +sub v12.4s, v22.4s, v10.4s +add v22.4s, v22.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v25.s[1] +mul v0.4S, v0.4S,v26.s[1] +mla v0.4S, v10.4S, v31.s[0] +sub v10.4s, v17.4s, v0.4s +add v17.4s, v17.4s, v0.4s +sqrdmulh v0.4S, v18.4S, v25.s[2] +mul v18.4S, v18.4S,v26.s[2] +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v20.4s, v18.4s +add v20.4s, v20.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v25.s[3] +mul v19.4S, v19.4S,v26.s[3] +mla v19.4S, v18.4S, v31.s[0] +sub v18.4s, v13.4s, v19.4s +add v13.4s, v13.4s, v19.4s +sqrdmulh v19.4S, v2.4S, v23.s[0] +mul v2.4S, v2.4S,v24.s[0] +mla v2.4S, v19.4S, v31.s[0] +sub v19.4s, v1.4s, v2.4s +add v1.4s, v1.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v23.s[1] +mul v14.4S, v14.4S,v24.s[1] +mla v14.4S, v2.4S, v31.s[0] +sub v2.4s, v3.4s, v14.4s +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +mla v16.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v16.4s +add v11.4s, v11.4s, v16.4s +sqrdmulh v16.4S, v15.4S, v23.s[3] +mul v15.4S, v15.4S,v24.s[3] +mla v15.4S, v16.4S, v31.s[0] +sub v16.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +str q22, [x0, #0] +str q12, [x0, #64] +str q17, [x0, #128] +str q10, [x0, #192] +str q20, [x0, #256] +str q0, [x0, #320] +str q13, [x0, #384] +str q18, [x0, #448] +str q1, [x0, #512] +str q19, [x0, #576] +str q3, [x0, #640] +str q2, [x0, #704] +str q11, [x0, #768] +str q14, [x0, #832] +str q21, [x0, #896] +str q16, [x0, #960] +ldr q16, [x0, #784] +ldr q21, [x0, #848] +ldr q14, [x0, #912] 
+ldr q11, [x0, #976] +ldr q2, [x0, #272] +ldr q3, [x0, #336] +ldr q19, [x0, #400] +ldr q1, [x0, #464] +ldr q18, [x0, #528] +ldr q13, [x0, #592] +ldr q0, [x0, #656] +ldr q20, [x0, #720] +ldr q10, [x0, #16] +ldr q17, [x0, #80] +ldr q12, [x0, #144] +ldr q22, [x0, #208] +sqrdmulh v15.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +mla v21.4S, v16.4S, v31.s[0] +sub v16.4s, v3.4s, v21.4s +add v3.4s, v3.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +mla v14.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v14.4s +add v19.4s, v19.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v1.4s, v11.4s +add v1.4s, v1.4s, v11.4s +sqrdmulh v11.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +mla v18.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v18.4s +add v10.4s, v10.4s, v18.4s +sqrdmulh v18.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +mla v13.4S, v18.4S, v31.s[0] +sub v18.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +sqrdmulh v13.4S, v0.4S, v29.s[0] +mul v0.4S, v0.4S,v30.s[0] +mla v0.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v0.4s +add v12.4s, v12.4s, v0.4s +sqrdmulh v0.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v20.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v20.4s +add v22.4s, v22.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +mla v19.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v19.4s +add v12.4s, v12.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +mla v1.4S, v19.4S, v31.s[0] +sub v19.4s, v22.4s, v1.4s +add v22.4s, v22.4s, v1.4s +sqrdmulh v1.4S, v2.4S, v29.s[1] +mul v2.4S, v2.4S,v30.s[1] +mla v2.4S, v1.4S, v31.s[0] +sub v1.4s, v10.4s, v2.4s +add v10.4s, v10.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v3.4S, v2.4S, v31.s[0] +sub v2.4s, v17.4s, v3.4s 
+add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +mla v21.4S, v3.4S, v31.s[0] +sub v3.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +mla v14.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v14.4s +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +mla v15.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v16.4s +add v18.4s, v18.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +mla v12.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +mla v22.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v22.4s +add v17.4s, v17.4s, v22.4s +sqrdmulh v22.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +mla v20.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v20.4s +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +mla v19.4S, v20.4S, v31.s[0] +sub v20.4s, v2.4s, v19.4s +add v2.4s, v2.4s, v19.4s +sqrdmulh v19.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +mla v13.4S, v19.4S, v31.s[0] +sub v19.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v0.4S, v27.s[2] +mul v0.4S, v0.4S,v28.s[2] +mla v0.4S, v13.4S, v31.s[0] +sub v13.4s, v18.4s, v0.4s +add v18.4s, v18.4s, v0.4s +sqrdmulh v0.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +mla v3.4S, v0.4S, v31.s[0] +sub v0.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +mla v21.4S, v3.4S, v31.s[0] +sub v3.4s, v15.4s, v21.4s +add v15.4s, v15.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v25.s[0] +mul v17.4S, v17.4S,v26.s[0] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v12.4S, v25.s[1] +mul v12.4S, 
v12.4S,v26.s[1] +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v16.4s, v12.4s +add v16.4s, v16.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[2] +mul v2.4S, v2.4S,v26.s[2] +mla v2.4S, v12.4S, v31.s[0] +sub v12.4s, v1.4s, v2.4s +add v1.4s, v1.4s, v2.4s +sqrdmulh v2.4S, v20.4S, v25.s[3] +mul v20.4S, v20.4S,v26.s[3] +mla v20.4S, v2.4S, v31.s[0] +sub v2.4s, v22.4s, v20.4s +add v22.4s, v22.4s, v20.4s +sqrdmulh v20.4S, v18.4S, v23.s[0] +mul v18.4S, v18.4S,v24.s[0] +mla v18.4S, v20.4S, v31.s[0] +sub v20.4s, v11.4s, v18.4s +add v11.4s, v11.4s, v18.4s +sqrdmulh v18.4S, v13.4S, v23.s[1] +mul v13.4S, v13.4S,v24.s[1] +mla v13.4S, v18.4S, v31.s[0] +sub v18.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v23.s[2] +mul v15.4S, v15.4S,v24.s[2] +mla v15.4S, v13.4S, v31.s[0] +sub v13.4s, v14.4s, v15.4s +add v14.4s, v14.4s, v15.4s +sqrdmulh v15.4S, v3.4S, v23.s[3] +mul v3.4S, v3.4S,v24.s[3] +mla v3.4S, v15.4S, v31.s[0] +sub v15.4s, v0.4s, v3.4s +add v0.4s, v0.4s, v3.4s +str q10, [x0, #16] +str q21, [x0, #80] +str q16, [x0, #144] +str q17, [x0, #208] +str q1, [x0, #272] +str q12, [x0, #336] +str q22, [x0, #400] +str q2, [x0, #464] +str q11, [x0, #528] +str q20, [x0, #592] +str q19, [x0, #656] +str q18, [x0, #720] +str q14, [x0, #784] +str q13, [x0, #848] +str q0, [x0, #912] +str q15, [x0, #976] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q6, [x17, #+160] +ldr q7, [x17, #+176] +ldr q8, [x17, #+192] +ldr q9, [x17, #+208] +ldr q3, [x17, #+224] +ldr q10, [x17, #+240] +ldr q21, [x0, #32] +ldr q16, [x0, #48] +ldr q17, [x0, #0] +ldr q1, [x0, #16] +sqrdmulh v12.4S, v21.4S, v5.s[0] +mul v21.4S, v21.4S,v4.s[0] +mla v21.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v16.4S, v5.s[0] +mul v16.4S, v16.4S,v4.s[0] +mla v16.4S, v21.4S, v31.s[0] +sub v21.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +sqrdmulh v16.4S, v1.4S, v5.s[1] +mul v1.4S, v1.4S,v4.s[1] +mla v1.4S, v16.4S, v31.s[0] +sub v16.4s, v17.4s, v1.4s +add v17.4s, v17.4s, 
v1.4s +sqrdmulh v1.4S, v21.4S, v5.s[2] +mul v21.4S, v21.4S,v4.s[2] +mla v21.4S, v1.4S, v31.s[0] +sub v1.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +trn1 v21.4S, v17.4S, v16.4S +trn2 v22.4S, v17.4S, v16.4S +trn1 v2.4S, v12.4S, v1.4S +trn2 v11.4S, v12.4S, v1.4S +trn2 v12.2D, v21.2D, v2.2D +trn2 v1.2D, v22.2D, v11.2D +trn1 v17.2D, v21.2D, v2.2D +trn1 v16.2D, v22.2D, v11.2D +sqrdmulh v11.4S, v12.4S, v7.4S +mul v12.4S, v12.4S,v6.4S +mla v12.4S, v11.4S, v31.s[0] +sub v11.4s, v17.4s, v12.4s +add v17.4s, v17.4s, v12.4s +sqrdmulh v12.4S, v1.4S, v7.4S +mul v1.4S, v1.4S,v6.4S +mla v1.4S, v12.4S, v31.s[0] +sub v12.4s, v16.4s, v1.4s +add v16.4s, v16.4s, v1.4s +sqrdmulh v1.4S, v16.4S, v9.4S +mul v16.4S, v16.4S,v8.4S +mla v16.4S, v1.4S, v31.s[0] +sub v1.4s, v17.4s, v16.4s +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v10.4S +mul v12.4S, v12.4S,v3.4S +mla v12.4S, v16.4S, v31.s[0] +sub v16.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +str q17, [x0, #0] +str q1, [x0, #16] +str q11, [x0, #32] +str q16, [x0, #48] +ldr q16, [x17, #+256] +ldr q11, [x17, #+272] +ldr q1, [x17, #+288] +ldr q17, [x17, #+304] +ldr q12, [x17, #+320] +ldr q22, [x17, #+336] +ldr q2, [x17, #+352] +ldr q21, [x17, #+368] +ldr q10, [x0, #96] +ldr q3, [x0, #112] +ldr q9, [x0, #64] +ldr q8, [x0, #80] +sqrdmulh v7.4S, v10.4S, v11.s[0] +mul v10.4S, v10.4S,v16.s[0] +mla v10.4S, v7.4S, v31.s[0] +sub v7.4s, v9.4s, v10.4s +add v9.4s, v9.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v11.s[0] +mul v3.4S, v3.4S,v16.s[0] +mla v3.4S, v10.4S, v31.s[0] +sub v10.4s, v8.4s, v3.4s +add v8.4s, v8.4s, v3.4s +sqrdmulh v3.4S, v8.4S, v11.s[1] +mul v8.4S, v8.4S,v16.s[1] +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v9.4s, v8.4s +add v9.4s, v9.4s, v8.4s +sqrdmulh v8.4S, v10.4S, v11.s[2] +mul v10.4S, v10.4S,v16.s[2] +mla v10.4S, v8.4S, v31.s[0] +sub v8.4s, v7.4s, v10.4s +add v7.4s, v7.4s, v10.4s +trn1 v10.4S, v9.4S, v3.4S +trn2 v6.4S, v9.4S, v3.4S +trn1 v5.4S, v7.4S, v8.4S +trn2 v4.4S, v7.4S, v8.4S +trn2 v7.2D, v10.2D, v5.2D +trn2 v8.2D, 
v6.2D, v4.2D +trn1 v9.2D, v10.2D, v5.2D +trn1 v3.2D, v6.2D, v4.2D +sqrdmulh v4.4S, v7.4S, v17.4S +mul v7.4S, v7.4S,v1.4S +mla v7.4S, v4.4S, v31.s[0] +sub v4.4s, v9.4s, v7.4s +add v9.4s, v9.4s, v7.4s +sqrdmulh v7.4S, v8.4S, v17.4S +mul v8.4S, v8.4S,v1.4S +mla v8.4S, v7.4S, v31.s[0] +sub v7.4s, v3.4s, v8.4s +add v3.4s, v3.4s, v8.4s +sqrdmulh v8.4S, v3.4S, v22.4S +mul v3.4S, v3.4S,v12.4S +mla v3.4S, v8.4S, v31.s[0] +sub v8.4s, v9.4s, v3.4s +add v9.4s, v9.4s, v3.4s +sqrdmulh v3.4S, v7.4S, v21.4S +mul v7.4S, v7.4S,v2.4S +mla v7.4S, v3.4S, v31.s[0] +sub v3.4s, v4.4s, v7.4s +add v4.4s, v4.4s, v7.4s +str q9, [x0, #64] +str q8, [x0, #80] +str q4, [x0, #96] +str q3, [x0, #112] +ldr q3, [x17, #+384] +ldr q4, [x17, #+400] +ldr q8, [x17, #+416] +ldr q9, [x17, #+432] +ldr q7, [x17, #+448] +ldr q6, [x17, #+464] +ldr q5, [x17, #+480] +ldr q10, [x17, #+496] +ldr q21, [x0, #160] +ldr q2, [x0, #176] +ldr q22, [x0, #128] +ldr q12, [x0, #144] +sqrdmulh v17.4S, v21.4S, v4.s[0] +mul v21.4S, v21.4S,v3.s[0] +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v21.4s +add v22.4s, v22.4s, v21.4s +sqrdmulh v21.4S, v2.4S, v4.s[0] +mul v2.4S, v2.4S,v3.s[0] +mla v2.4S, v21.4S, v31.s[0] +sub v21.4s, v12.4s, v2.4s +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v12.4S, v4.s[1] +mul v12.4S, v12.4S,v3.s[1] +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v21.4S, v4.s[2] +mul v21.4S, v21.4S,v3.s[2] +mla v21.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +trn1 v21.4S, v22.4S, v2.4S +trn2 v1.4S, v22.4S, v2.4S +trn1 v11.4S, v17.4S, v12.4S +trn2 v16.4S, v17.4S, v12.4S +trn2 v17.2D, v21.2D, v11.2D +trn2 v12.2D, v1.2D, v16.2D +trn1 v22.2D, v21.2D, v11.2D +trn1 v2.2D, v1.2D, v16.2D +sqrdmulh v16.4S, v17.4S, v9.4S +mul v17.4S, v17.4S,v8.4S +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v22.4s, v17.4s +add v22.4s, v22.4s, v17.4s +sqrdmulh v17.4S, v12.4S, v9.4S +mul v12.4S, v12.4S,v8.4S +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v2.4s, 
v12.4s +add v2.4s, v2.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v6.4S +mul v2.4S, v2.4S,v7.4S +mla v2.4S, v12.4S, v31.s[0] +sub v12.4s, v22.4s, v2.4s +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v17.4S, v10.4S +mul v17.4S, v17.4S,v5.4S +mla v17.4S, v2.4S, v31.s[0] +sub v2.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +str q22, [x0, #128] +str q12, [x0, #144] +str q16, [x0, #160] +str q2, [x0, #176] +ldr q2, [x17, #+512] +ldr q16, [x17, #+528] +ldr q12, [x17, #+544] +ldr q22, [x17, #+560] +ldr q17, [x17, #+576] +ldr q1, [x17, #+592] +ldr q11, [x17, #+608] +ldr q21, [x17, #+624] +ldr q10, [x0, #224] +ldr q5, [x0, #240] +ldr q6, [x0, #192] +ldr q7, [x0, #208] +sqrdmulh v9.4S, v10.4S, v16.s[0] +mul v10.4S, v10.4S,v2.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v6.4s, v10.4s +add v6.4s, v6.4s, v10.4s +sqrdmulh v10.4S, v5.4S, v16.s[0] +mul v5.4S, v5.4S,v2.s[0] +mla v5.4S, v10.4S, v31.s[0] +sub v10.4s, v7.4s, v5.4s +add v7.4s, v7.4s, v5.4s +sqrdmulh v5.4S, v7.4S, v16.s[1] +mul v7.4S, v7.4S,v2.s[1] +mla v7.4S, v5.4S, v31.s[0] +sub v5.4s, v6.4s, v7.4s +add v6.4s, v6.4s, v7.4s +sqrdmulh v7.4S, v10.4S, v16.s[2] +mul v10.4S, v10.4S,v2.s[2] +mla v10.4S, v7.4S, v31.s[0] +sub v7.4s, v9.4s, v10.4s +add v9.4s, v9.4s, v10.4s +trn1 v10.4S, v6.4S, v5.4S +trn2 v8.4S, v6.4S, v5.4S +trn1 v4.4S, v9.4S, v7.4S +trn2 v3.4S, v9.4S, v7.4S +trn2 v9.2D, v10.2D, v4.2D +trn2 v7.2D, v8.2D, v3.2D +trn1 v6.2D, v10.2D, v4.2D +trn1 v5.2D, v8.2D, v3.2D +sqrdmulh v3.4S, v9.4S, v22.4S +mul v9.4S, v9.4S,v12.4S +mla v9.4S, v3.4S, v31.s[0] +sub v3.4s, v6.4s, v9.4s +add v6.4s, v6.4s, v9.4s +sqrdmulh v9.4S, v7.4S, v22.4S +mul v7.4S, v7.4S,v12.4S +mla v7.4S, v9.4S, v31.s[0] +sub v9.4s, v5.4s, v7.4s +add v5.4s, v5.4s, v7.4s +sqrdmulh v7.4S, v5.4S, v1.4S +mul v5.4S, v5.4S,v17.4S +mla v5.4S, v7.4S, v31.s[0] +sub v7.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v9.4S, v21.4S +mul v9.4S, v9.4S,v11.4S +mla v9.4S, v5.4S, v31.s[0] +sub v5.4s, v3.4s, v9.4s +add v3.4s, v3.4s, v9.4s +str q6, [x0, #192] +str q7, 
[x0, #208] +str q3, [x0, #224] +str q5, [x0, #240] +ldr q5, [x17, #+640] +ldr q3, [x17, #+656] +ldr q7, [x17, #+672] +ldr q6, [x17, #+688] +ldr q9, [x17, #+704] +ldr q8, [x17, #+720] +ldr q4, [x17, #+736] +ldr q10, [x17, #+752] +ldr q21, [x0, #288] +ldr q11, [x0, #304] +ldr q1, [x0, #256] +ldr q17, [x0, #272] +sqrdmulh v22.4S, v21.4S, v3.s[0] +mul v21.4S, v21.4S,v5.s[0] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v21.4s +add v1.4s, v1.4s, v21.4s +sqrdmulh v21.4S, v11.4S, v3.s[0] +mul v11.4S, v11.4S,v5.s[0] +mla v11.4S, v21.4S, v31.s[0] +sub v21.4s, v17.4s, v11.4s +add v17.4s, v17.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v3.s[1] +mul v17.4S, v17.4S,v5.s[1] +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v1.4s, v17.4s +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v3.s[2] +mul v21.4S, v21.4S,v5.s[2] +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v21.4s +add v22.4s, v22.4s, v21.4s +trn1 v21.4S, v1.4S, v11.4S +trn2 v12.4S, v1.4S, v11.4S +trn1 v16.4S, v22.4S, v17.4S +trn2 v2.4S, v22.4S, v17.4S +trn2 v22.2D, v21.2D, v16.2D +trn2 v17.2D, v12.2D, v2.2D +trn1 v1.2D, v21.2D, v16.2D +trn1 v11.2D, v12.2D, v2.2D +sqrdmulh v2.4S, v22.4S, v6.4S +mul v22.4S, v22.4S,v7.4S +mla v22.4S, v2.4S, v31.s[0] +sub v2.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v6.4S +mul v17.4S, v17.4S,v7.4S +mla v17.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v8.4S +mul v11.4S, v11.4S,v9.4S +mla v11.4S, v17.4S, v31.s[0] +sub v17.4s, v1.4s, v11.4s +add v1.4s, v1.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v10.4S +mul v22.4S, v22.4S,v4.4S +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v2.4s, v22.4s +add v2.4s, v2.4s, v22.4s +str q1, [x0, #256] +str q17, [x0, #272] +str q2, [x0, #288] +str q11, [x0, #304] +ldr q11, [x17, #+768] +ldr q2, [x17, #+784] +ldr q17, [x17, #+800] +ldr q1, [x17, #+816] +ldr q22, [x17, #+832] +ldr q12, [x17, #+848] +ldr q16, [x17, #+864] +ldr q21, [x17, #+880] +ldr q10, [x0, #352] +ldr q4, [x0, #368] +ldr 
q8, [x0, #320] +ldr q9, [x0, #336] +sqrdmulh v6.4S, v10.4S, v2.s[0] +mul v10.4S, v10.4S,v11.s[0] +mla v10.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v10.4s +add v8.4s, v8.4s, v10.4s +sqrdmulh v10.4S, v4.4S, v2.s[0] +mul v4.4S, v4.4S,v11.s[0] +mla v4.4S, v10.4S, v31.s[0] +sub v10.4s, v9.4s, v4.4s +add v9.4s, v9.4s, v4.4s +sqrdmulh v4.4S, v9.4S, v2.s[1] +mul v9.4S, v9.4S,v11.s[1] +mla v9.4S, v4.4S, v31.s[0] +sub v4.4s, v8.4s, v9.4s +add v8.4s, v8.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v2.s[2] +mul v10.4S, v10.4S,v11.s[2] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v6.4s, v10.4s +add v6.4s, v6.4s, v10.4s +trn1 v10.4S, v8.4S, v4.4S +trn2 v7.4S, v8.4S, v4.4S +trn1 v3.4S, v6.4S, v9.4S +trn2 v5.4S, v6.4S, v9.4S +trn2 v6.2D, v10.2D, v3.2D +trn2 v9.2D, v7.2D, v5.2D +trn1 v8.2D, v10.2D, v3.2D +trn1 v4.2D, v7.2D, v5.2D +sqrdmulh v5.4S, v6.4S, v1.4S +mul v6.4S, v6.4S,v17.4S +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v8.4s, v6.4s +add v8.4s, v8.4s, v6.4s +sqrdmulh v6.4S, v9.4S, v1.4S +mul v9.4S, v9.4S,v17.4S +mla v9.4S, v6.4S, v31.s[0] +sub v6.4s, v4.4s, v9.4s +add v4.4s, v4.4s, v9.4s +sqrdmulh v9.4S, v4.4S, v12.4S +mul v4.4S, v4.4S,v22.4S +mla v4.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v4.4s +add v8.4s, v8.4s, v4.4s +sqrdmulh v4.4S, v6.4S, v21.4S +mul v6.4S, v6.4S,v16.4S +mla v6.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v6.4s +add v5.4s, v5.4s, v6.4s +str q8, [x0, #320] +str q9, [x0, #336] +str q5, [x0, #352] +str q4, [x0, #368] +ldr q4, [x17, #+896] +ldr q5, [x17, #+912] +ldr q9, [x17, #+928] +ldr q8, [x17, #+944] +ldr q6, [x17, #+960] +ldr q7, [x17, #+976] +ldr q3, [x17, #+992] +ldr q10, [x17, #+1008] +ldr q21, [x0, #416] +ldr q16, [x0, #432] +ldr q12, [x0, #384] +ldr q22, [x0, #400] +sqrdmulh v1.4S, v21.4S, v5.s[0] +mul v21.4S, v21.4S,v4.s[0] +mla v21.4S, v1.4S, v31.s[0] +sub v1.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +sqrdmulh v21.4S, v16.4S, v5.s[0] +mul v16.4S, v16.4S,v4.s[0] +mla v16.4S, v21.4S, v31.s[0] +sub v21.4s, v22.4s, v16.4s +add v22.4s, v22.4s, v16.4s +sqrdmulh v16.4S, 
v22.4S, v5.s[1] +mul v22.4S, v22.4S,v4.s[1] +mla v22.4S, v16.4S, v31.s[0] +sub v16.4s, v12.4s, v22.4s +add v12.4s, v12.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v5.s[2] +mul v21.4S, v21.4S,v4.s[2] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v21.4s +add v1.4s, v1.4s, v21.4s +trn1 v21.4S, v12.4S, v16.4S +trn2 v17.4S, v12.4S, v16.4S +trn1 v2.4S, v1.4S, v22.4S +trn2 v11.4S, v1.4S, v22.4S +trn2 v1.2D, v21.2D, v2.2D +trn2 v22.2D, v17.2D, v11.2D +trn1 v12.2D, v21.2D, v2.2D +trn1 v16.2D, v17.2D, v11.2D +sqrdmulh v11.4S, v1.4S, v8.4S +mul v1.4S, v1.4S,v9.4S +mla v1.4S, v11.4S, v31.s[0] +sub v11.4s, v12.4s, v1.4s +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v22.4S, v8.4S +mul v22.4S, v22.4S,v9.4S +mla v22.4S, v1.4S, v31.s[0] +sub v1.4s, v16.4s, v22.4s +add v16.4s, v16.4s, v22.4s +sqrdmulh v22.4S, v16.4S, v7.4S +mul v16.4S, v16.4S,v6.4S +mla v16.4S, v22.4S, v31.s[0] +sub v22.4s, v12.4s, v16.4s +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v1.4S, v10.4S +mul v1.4S, v1.4S,v3.4S +mla v1.4S, v16.4S, v31.s[0] +sub v16.4s, v11.4s, v1.4s +add v11.4s, v11.4s, v1.4s +str q12, [x0, #384] +str q22, [x0, #400] +str q11, [x0, #416] +str q16, [x0, #432] +ldr q16, [x17, #+1024] +ldr q11, [x17, #+1040] +ldr q22, [x17, #+1056] +ldr q12, [x17, #+1072] +ldr q1, [x17, #+1088] +ldr q17, [x17, #+1104] +ldr q2, [x17, #+1120] +ldr q21, [x17, #+1136] +ldr q10, [x0, #480] +ldr q3, [x0, #496] +ldr q7, [x0, #448] +ldr q6, [x0, #464] +sqrdmulh v8.4S, v10.4S, v11.s[0] +mul v10.4S, v10.4S,v16.s[0] +mla v10.4S, v8.4S, v31.s[0] +sub v8.4s, v7.4s, v10.4s +add v7.4s, v7.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v11.s[0] +mul v3.4S, v3.4S,v16.s[0] +mla v3.4S, v10.4S, v31.s[0] +sub v10.4s, v6.4s, v3.4s +add v6.4s, v6.4s, v3.4s +sqrdmulh v3.4S, v6.4S, v11.s[1] +mul v6.4S, v6.4S,v16.s[1] +mla v6.4S, v3.4S, v31.s[0] +sub v3.4s, v7.4s, v6.4s +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v10.4S, v11.s[2] +mul v10.4S, v10.4S,v16.s[2] +mla v10.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v10.4s +add v8.4s, v8.4s, v10.4s +trn1 v10.4S, 
v7.4S, v3.4S +trn2 v9.4S, v7.4S, v3.4S +trn1 v5.4S, v8.4S, v6.4S +trn2 v4.4S, v8.4S, v6.4S +trn2 v8.2D, v10.2D, v5.2D +trn2 v6.2D, v9.2D, v4.2D +trn1 v7.2D, v10.2D, v5.2D +trn1 v3.2D, v9.2D, v4.2D +sqrdmulh v4.4S, v8.4S, v12.4S +mul v8.4S, v8.4S,v22.4S +mla v8.4S, v4.4S, v31.s[0] +sub v4.4s, v7.4s, v8.4s +add v7.4s, v7.4s, v8.4s +sqrdmulh v8.4S, v6.4S, v12.4S +mul v6.4S, v6.4S,v22.4S +mla v6.4S, v8.4S, v31.s[0] +sub v8.4s, v3.4s, v6.4s +add v3.4s, v3.4s, v6.4s +sqrdmulh v6.4S, v3.4S, v17.4S +mul v3.4S, v3.4S,v1.4S +mla v3.4S, v6.4S, v31.s[0] +sub v6.4s, v7.4s, v3.4s +add v7.4s, v7.4s, v3.4s +sqrdmulh v3.4S, v8.4S, v21.4S +mul v8.4S, v8.4S,v2.4S +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v4.4s, v8.4s +add v4.4s, v4.4s, v8.4s +str q7, [x0, #448] +str q6, [x0, #464] +str q4, [x0, #480] +str q3, [x0, #496] +ldr q3, [x17, #+1152] +ldr q4, [x17, #+1168] +ldr q6, [x17, #+1184] +ldr q7, [x17, #+1200] +ldr q8, [x17, #+1216] +ldr q9, [x17, #+1232] +ldr q5, [x17, #+1248] +ldr q10, [x17, #+1264] +ldr q21, [x0, #544] +ldr q2, [x0, #560] +ldr q17, [x0, #512] +ldr q1, [x0, #528] +sqrdmulh v12.4S, v21.4S, v4.s[0] +mul v21.4S, v21.4S,v3.s[0] +mla v21.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v2.4S, v4.s[0] +mul v2.4S, v2.4S,v3.s[0] +mla v2.4S, v21.4S, v31.s[0] +sub v21.4s, v1.4s, v2.4s +add v1.4s, v1.4s, v2.4s +sqrdmulh v2.4S, v1.4S, v4.s[1] +mul v1.4S, v1.4S,v3.s[1] +mla v1.4S, v2.4S, v31.s[0] +sub v2.4s, v17.4s, v1.4s +add v17.4s, v17.4s, v1.4s +sqrdmulh v1.4S, v21.4S, v4.s[2] +mul v21.4S, v21.4S,v3.s[2] +mla v21.4S, v1.4S, v31.s[0] +sub v1.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +trn1 v21.4S, v17.4S, v2.4S +trn2 v22.4S, v17.4S, v2.4S +trn1 v11.4S, v12.4S, v1.4S +trn2 v16.4S, v12.4S, v1.4S +trn2 v12.2D, v21.2D, v11.2D +trn2 v1.2D, v22.2D, v16.2D +trn1 v17.2D, v21.2D, v11.2D +trn1 v2.2D, v22.2D, v16.2D +sqrdmulh v16.4S, v12.4S, v7.4S +mul v12.4S, v12.4S,v6.4S +mla v12.4S, v16.4S, v31.s[0] +sub v16.4s, v17.4s, v12.4s +add 
v17.4s, v17.4s, v12.4s +sqrdmulh v12.4S, v1.4S, v7.4S +mul v1.4S, v1.4S,v6.4S +mla v1.4S, v12.4S, v31.s[0] +sub v12.4s, v2.4s, v1.4s +add v2.4s, v2.4s, v1.4s +sqrdmulh v1.4S, v2.4S, v9.4S +mul v2.4S, v2.4S,v8.4S +mla v2.4S, v1.4S, v31.s[0] +sub v1.4s, v17.4s, v2.4s +add v17.4s, v17.4s, v2.4s +sqrdmulh v2.4S, v12.4S, v10.4S +mul v12.4S, v12.4S,v5.4S +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v16.4s, v12.4s +add v16.4s, v16.4s, v12.4s +str q17, [x0, #512] +str q1, [x0, #528] +str q16, [x0, #544] +str q2, [x0, #560] +ldr q2, [x17, #+1280] +ldr q16, [x17, #+1296] +ldr q1, [x17, #+1312] +ldr q17, [x17, #+1328] +ldr q12, [x17, #+1344] +ldr q22, [x17, #+1360] +ldr q11, [x17, #+1376] +ldr q21, [x17, #+1392] +ldr q10, [x0, #608] +ldr q5, [x0, #624] +ldr q9, [x0, #576] +ldr q8, [x0, #592] +sqrdmulh v7.4S, v10.4S, v16.s[0] +mul v10.4S, v10.4S,v2.s[0] +mla v10.4S, v7.4S, v31.s[0] +sub v7.4s, v9.4s, v10.4s +add v9.4s, v9.4s, v10.4s +sqrdmulh v10.4S, v5.4S, v16.s[0] +mul v5.4S, v5.4S,v2.s[0] +mla v5.4S, v10.4S, v31.s[0] +sub v10.4s, v8.4s, v5.4s +add v8.4s, v8.4s, v5.4s +sqrdmulh v5.4S, v8.4S, v16.s[1] +mul v8.4S, v8.4S,v2.s[1] +mla v8.4S, v5.4S, v31.s[0] +sub v5.4s, v9.4s, v8.4s +add v9.4s, v9.4s, v8.4s +sqrdmulh v8.4S, v10.4S, v16.s[2] +mul v10.4S, v10.4S,v2.s[2] +mla v10.4S, v8.4S, v31.s[0] +sub v8.4s, v7.4s, v10.4s +add v7.4s, v7.4s, v10.4s +trn1 v10.4S, v9.4S, v5.4S +trn2 v6.4S, v9.4S, v5.4S +trn1 v4.4S, v7.4S, v8.4S +trn2 v3.4S, v7.4S, v8.4S +trn2 v7.2D, v10.2D, v4.2D +trn2 v8.2D, v6.2D, v3.2D +trn1 v9.2D, v10.2D, v4.2D +trn1 v5.2D, v6.2D, v3.2D +sqrdmulh v3.4S, v7.4S, v17.4S +mul v7.4S, v7.4S,v1.4S +mla v7.4S, v3.4S, v31.s[0] +sub v3.4s, v9.4s, v7.4s +add v9.4s, v9.4s, v7.4s +sqrdmulh v7.4S, v8.4S, v17.4S +mul v8.4S, v8.4S,v1.4S +mla v8.4S, v7.4S, v31.s[0] +sub v7.4s, v5.4s, v8.4s +add v5.4s, v5.4s, v8.4s +sqrdmulh v8.4S, v5.4S, v22.4S +mul v5.4S, v5.4S,v12.4S +mla v5.4S, v8.4S, v31.s[0] +sub v8.4s, v9.4s, v5.4s +add v9.4s, v9.4s, v5.4s +sqrdmulh v5.4S, v7.4S, v21.4S +mul 
v7.4S, v7.4S,v11.4S +mla v7.4S, v5.4S, v31.s[0] +sub v5.4s, v3.4s, v7.4s +add v3.4s, v3.4s, v7.4s +str q9, [x0, #576] +str q8, [x0, #592] +str q3, [x0, #608] +str q5, [x0, #624] +ldr q5, [x17, #+1408] +ldr q3, [x17, #+1424] +ldr q8, [x17, #+1440] +ldr q9, [x17, #+1456] +ldr q7, [x17, #+1472] +ldr q6, [x17, #+1488] +ldr q4, [x17, #+1504] +ldr q10, [x17, #+1520] +ldr q21, [x0, #672] +ldr q11, [x0, #688] +ldr q22, [x0, #640] +ldr q12, [x0, #656] +sqrdmulh v17.4S, v21.4S, v3.s[0] +mul v21.4S, v21.4S,v5.s[0] +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v21.4s +add v22.4s, v22.4s, v21.4s +sqrdmulh v21.4S, v11.4S, v3.s[0] +mul v11.4S, v11.4S,v5.s[0] +mla v11.4S, v21.4S, v31.s[0] +sub v21.4s, v12.4s, v11.4s +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v12.4S, v3.s[1] +mul v12.4S, v12.4S,v5.s[1] +mla v12.4S, v11.4S, v31.s[0] +sub v11.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v21.4S, v3.s[2] +mul v21.4S, v21.4S,v5.s[2] +mla v21.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +trn1 v21.4S, v22.4S, v11.4S +trn2 v1.4S, v22.4S, v11.4S +trn1 v16.4S, v17.4S, v12.4S +trn2 v2.4S, v17.4S, v12.4S +trn2 v17.2D, v21.2D, v16.2D +trn2 v12.2D, v1.2D, v2.2D +trn1 v22.2D, v21.2D, v16.2D +trn1 v11.2D, v1.2D, v2.2D +sqrdmulh v2.4S, v17.4S, v9.4S +mul v17.4S, v17.4S,v8.4S +mla v17.4S, v2.4S, v31.s[0] +sub v2.4s, v22.4s, v17.4s +add v22.4s, v22.4s, v17.4s +sqrdmulh v17.4S, v12.4S, v9.4S +mul v12.4S, v12.4S,v8.4S +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +sqrdmulh v12.4S, v11.4S, v6.4S +mul v11.4S, v11.4S,v7.4S +mla v11.4S, v12.4S, v31.s[0] +sub v12.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v10.4S +mul v17.4S, v17.4S,v4.4S +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v2.4s, v17.4s +add v2.4s, v2.4s, v17.4s +str q22, [x0, #640] +str q12, [x0, #656] +str q2, [x0, #672] +str q11, [x0, #688] +ldr q11, [x17, #+1536] +ldr q2, [x17, #+1552] +ldr q12, [x17, #+1568] +ldr q22, 
[x17, #+1584] +ldr q17, [x17, #+1600] +ldr q1, [x17, #+1616] +ldr q16, [x17, #+1632] +ldr q21, [x17, #+1648] +ldr q10, [x0, #736] +ldr q4, [x0, #752] +ldr q6, [x0, #704] +ldr q7, [x0, #720] +sqrdmulh v9.4S, v10.4S, v2.s[0] +mul v10.4S, v10.4S,v11.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v6.4s, v10.4s +add v6.4s, v6.4s, v10.4s +sqrdmulh v10.4S, v4.4S, v2.s[0] +mul v4.4S, v4.4S,v11.s[0] +mla v4.4S, v10.4S, v31.s[0] +sub v10.4s, v7.4s, v4.4s +add v7.4s, v7.4s, v4.4s +sqrdmulh v4.4S, v7.4S, v2.s[1] +mul v7.4S, v7.4S,v11.s[1] +mla v7.4S, v4.4S, v31.s[0] +sub v4.4s, v6.4s, v7.4s +add v6.4s, v6.4s, v7.4s +sqrdmulh v7.4S, v10.4S, v2.s[2] +mul v10.4S, v10.4S,v11.s[2] +mla v10.4S, v7.4S, v31.s[0] +sub v7.4s, v9.4s, v10.4s +add v9.4s, v9.4s, v10.4s +trn1 v10.4S, v6.4S, v4.4S +trn2 v8.4S, v6.4S, v4.4S +trn1 v3.4S, v9.4S, v7.4S +trn2 v5.4S, v9.4S, v7.4S +trn2 v9.2D, v10.2D, v3.2D +trn2 v7.2D, v8.2D, v5.2D +trn1 v6.2D, v10.2D, v3.2D +trn1 v4.2D, v8.2D, v5.2D +sqrdmulh v5.4S, v9.4S, v22.4S +mul v9.4S, v9.4S,v12.4S +mla v9.4S, v5.4S, v31.s[0] +sub v5.4s, v6.4s, v9.4s +add v6.4s, v6.4s, v9.4s +sqrdmulh v9.4S, v7.4S, v22.4S +mul v7.4S, v7.4S,v12.4S +mla v7.4S, v9.4S, v31.s[0] +sub v9.4s, v4.4s, v7.4s +add v4.4s, v4.4s, v7.4s +sqrdmulh v7.4S, v4.4S, v1.4S +mul v4.4S, v4.4S,v17.4S +mla v4.4S, v7.4S, v31.s[0] +sub v7.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +sqrdmulh v4.4S, v9.4S, v21.4S +mul v9.4S, v9.4S,v16.4S +mla v9.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v9.4s +add v5.4s, v5.4s, v9.4s +str q6, [x0, #704] +str q7, [x0, #720] +str q5, [x0, #736] +str q4, [x0, #752] +ldr q4, [x17, #+1664] +ldr q5, [x17, #+1680] +ldr q7, [x17, #+1696] +ldr q6, [x17, #+1712] +ldr q9, [x17, #+1728] +ldr q8, [x17, #+1744] +ldr q3, [x17, #+1760] +ldr q10, [x17, #+1776] +ldr q21, [x0, #800] +ldr q16, [x0, #816] +ldr q1, [x0, #768] +ldr q17, [x0, #784] +sqrdmulh v22.4S, v21.4S, v5.s[0] +mul v21.4S, v21.4S,v4.s[0] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v21.4s +add v1.4s, v1.4s, v21.4s 
+sqrdmulh v21.4S, v16.4S, v5.s[0] +mul v16.4S, v16.4S,v4.s[0] +mla v16.4S, v21.4S, v31.s[0] +sub v21.4s, v17.4s, v16.4s +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v5.s[1] +mul v17.4S, v17.4S,v4.s[1] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v1.4s, v17.4s +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v5.s[2] +mul v21.4S, v21.4S,v4.s[2] +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v21.4s +add v22.4s, v22.4s, v21.4s +trn1 v21.4S, v1.4S, v16.4S +trn2 v12.4S, v1.4S, v16.4S +trn1 v2.4S, v22.4S, v17.4S +trn2 v11.4S, v22.4S, v17.4S +trn2 v22.2D, v21.2D, v2.2D +trn2 v17.2D, v12.2D, v11.2D +trn1 v1.2D, v21.2D, v2.2D +trn1 v16.2D, v12.2D, v11.2D +sqrdmulh v11.4S, v22.4S, v6.4S +mul v22.4S, v22.4S,v7.4S +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v6.4S +mul v17.4S, v17.4S,v7.4S +mla v17.4S, v22.4S, v31.s[0] +sub v22.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v16.4S, v8.4S +mul v16.4S, v16.4S,v9.4S +mla v16.4S, v17.4S, v31.s[0] +sub v17.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +sqrdmulh v16.4S, v22.4S, v10.4S +mul v22.4S, v22.4S,v3.4S +mla v22.4S, v16.4S, v31.s[0] +sub v16.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +str q1, [x0, #768] +str q17, [x0, #784] +str q11, [x0, #800] +str q16, [x0, #816] +ldr q16, [x17, #+1792] +ldr q11, [x17, #+1808] +ldr q17, [x17, #+1824] +ldr q1, [x17, #+1840] +ldr q22, [x17, #+1856] +ldr q12, [x17, #+1872] +ldr q2, [x17, #+1888] +ldr q21, [x17, #+1904] +ldr q10, [x0, #864] +ldr q3, [x0, #880] +ldr q8, [x0, #832] +ldr q9, [x0, #848] +sqrdmulh v6.4S, v10.4S, v11.s[0] +mul v10.4S, v10.4S,v16.s[0] +mla v10.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v10.4s +add v8.4s, v8.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v11.s[0] +mul v3.4S, v3.4S,v16.s[0] +mla v3.4S, v10.4S, v31.s[0] +sub v10.4s, v9.4s, v3.4s +add v9.4s, v9.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v11.s[1] +mul v9.4S, v9.4S,v16.s[1] +mla v9.4S, v3.4S, v31.s[0] +sub v3.4s, v8.4s, v9.4s +add 
v8.4s, v8.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v11.s[2] +mul v10.4S, v10.4S,v16.s[2] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v6.4s, v10.4s +add v6.4s, v6.4s, v10.4s +trn1 v10.4S, v8.4S, v3.4S +trn2 v7.4S, v8.4S, v3.4S +trn1 v5.4S, v6.4S, v9.4S +trn2 v4.4S, v6.4S, v9.4S +trn2 v6.2D, v10.2D, v5.2D +trn2 v9.2D, v7.2D, v4.2D +trn1 v8.2D, v10.2D, v5.2D +trn1 v3.2D, v7.2D, v4.2D +sqrdmulh v4.4S, v6.4S, v1.4S +mul v6.4S, v6.4S,v17.4S +mla v6.4S, v4.4S, v31.s[0] +sub v4.4s, v8.4s, v6.4s +add v8.4s, v8.4s, v6.4s +sqrdmulh v6.4S, v9.4S, v1.4S +mul v9.4S, v9.4S,v17.4S +mla v9.4S, v6.4S, v31.s[0] +sub v6.4s, v3.4s, v9.4s +add v3.4s, v3.4s, v9.4s +sqrdmulh v9.4S, v3.4S, v12.4S +mul v3.4S, v3.4S,v22.4S +mla v3.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v3.4s +add v8.4s, v8.4s, v3.4s +sqrdmulh v3.4S, v6.4S, v21.4S +mul v6.4S, v6.4S,v2.4S +mla v6.4S, v3.4S, v31.s[0] +sub v3.4s, v4.4s, v6.4s +add v4.4s, v4.4s, v6.4s +str q8, [x0, #832] +str q9, [x0, #848] +str q4, [x0, #864] +str q3, [x0, #880] +ldr q3, [x17, #+1920] +ldr q4, [x17, #+1936] +ldr q9, [x17, #+1952] +ldr q8, [x17, #+1968] +ldr q6, [x17, #+1984] +ldr q7, [x17, #+2000] +ldr q5, [x17, #+2016] +ldr q10, [x17, #+2032] +ldr q21, [x0, #928] +ldr q2, [x0, #944] +ldr q12, [x0, #896] +ldr q22, [x0, #912] +sqrdmulh v1.4S, v21.4S, v4.s[0] +mul v21.4S, v21.4S,v3.s[0] +mla v21.4S, v1.4S, v31.s[0] +sub v1.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +sqrdmulh v21.4S, v2.4S, v4.s[0] +mul v2.4S, v2.4S,v3.s[0] +mla v2.4S, v21.4S, v31.s[0] +sub v21.4s, v22.4s, v2.4s +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v22.4S, v4.s[1] +mul v22.4S, v22.4S,v3.s[1] +mla v22.4S, v2.4S, v31.s[0] +sub v2.4s, v12.4s, v22.4s +add v12.4s, v12.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v4.s[2] +mul v21.4S, v21.4S,v3.s[2] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v21.4s +add v1.4s, v1.4s, v21.4s +trn1 v21.4S, v12.4S, v2.4S +trn2 v17.4S, v12.4S, v2.4S +trn1 v11.4S, v1.4S, v22.4S +trn2 v16.4S, v1.4S, v22.4S +trn2 v1.2D, v21.2D, v11.2D +trn2 v22.2D, v17.2D, 
v16.2D +trn1 v12.2D, v21.2D, v11.2D +trn1 v2.2D, v17.2D, v16.2D +sqrdmulh v16.4S, v1.4S, v8.4S +mul v1.4S, v1.4S,v9.4S +mla v1.4S, v16.4S, v31.s[0] +sub v16.4s, v12.4s, v1.4s +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v22.4S, v8.4S +mul v22.4S, v22.4S,v9.4S +mla v22.4S, v1.4S, v31.s[0] +sub v1.4s, v2.4s, v22.4s +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v2.4S, v7.4S +mul v2.4S, v2.4S,v6.4S +mla v2.4S, v22.4S, v31.s[0] +sub v22.4s, v12.4s, v2.4s +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v1.4S, v10.4S +mul v1.4S, v1.4S,v5.4S +mla v1.4S, v2.4S, v31.s[0] +sub v2.4s, v16.4s, v1.4s +add v16.4s, v16.4s, v1.4s +str q12, [x0, #896] +str q22, [x0, #912] +str q16, [x0, #928] +str q2, [x0, #944] +ldr q2, [x17, #+2048] +ldr q16, [x17, #+2064] +ldr q22, [x17, #+2080] +ldr q12, [x17, #+2096] +ldr q1, [x17, #+2112] +ldr q17, [x17, #+2128] +ldr q11, [x17, #+2144] +ldr q21, [x17, #+2160] +ldr q10, [x0, #992] +ldr q5, [x0, #1008] +ldr q7, [x0, #960] +ldr q6, [x0, #976] +sqrdmulh v8.4S, v10.4S, v16.s[0] +mul v10.4S, v10.4S,v2.s[0] +mla v10.4S, v8.4S, v31.s[0] +sub v8.4s, v7.4s, v10.4s +add v7.4s, v7.4s, v10.4s +sqrdmulh v10.4S, v5.4S, v16.s[0] +mul v5.4S, v5.4S,v2.s[0] +mla v5.4S, v10.4S, v31.s[0] +sub v10.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v6.4S, v16.s[1] +mul v6.4S, v6.4S,v2.s[1] +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v7.4s, v6.4s +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v10.4S, v16.s[2] +mul v10.4S, v10.4S,v2.s[2] +mla v10.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v10.4s +add v8.4s, v8.4s, v10.4s +trn1 v10.4S, v7.4S, v5.4S +trn2 v9.4S, v7.4S, v5.4S +trn1 v4.4S, v8.4S, v6.4S +trn2 v3.4S, v8.4S, v6.4S +trn2 v8.2D, v10.2D, v4.2D +trn2 v6.2D, v9.2D, v3.2D +trn1 v7.2D, v10.2D, v4.2D +trn1 v5.2D, v9.2D, v3.2D +sqrdmulh v3.4S, v8.4S, v12.4S +mul v8.4S, v8.4S,v22.4S +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v7.4s, v8.4s +add v7.4s, v7.4s, v8.4s +sqrdmulh v8.4S, v6.4S, v12.4S +mul v6.4S, v6.4S,v22.4S +mla v6.4S, v8.4S, v31.s[0] +sub v8.4s, v5.4s, v6.4s +add v5.4s, 
v5.4s, v6.4s +sqrdmulh v6.4S, v5.4S, v17.4S +mul v5.4S, v5.4S,v1.4S +mla v5.4S, v6.4S, v31.s[0] +sub v6.4s, v7.4s, v5.4s +add v7.4s, v7.4s, v5.4s +sqrdmulh v5.4S, v8.4S, v21.4S +mul v8.4S, v8.4S,v11.4S +mla v8.4S, v5.4S, v31.s[0] +sub v5.4s, v3.4s, v8.4s +add v3.4s, v3.4s, v8.4s +str q7, [x0, #960] +str q6, [x0, #976] +str q3, [x0, #992] +str q5, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_10_0.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_10_0.s new file mode 100644 index 0000000..c97d115 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_10_0.s @@ -0,0 +1,2486 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. 
+/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None 
+.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 
7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 
// Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 
+.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // 
Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, 
block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 
839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 +.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, 
block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 
104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 
// Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_10_0 +.global _ntt_u32_full_neon_asm_var_4_4_10_0 +ntt_u32_full_neon_asm_var_4_4_10_0: +_ntt_u32_full_neon_asm_var_4_4_10_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #928] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #992] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q18, [x0, #800] 
+sqrdmulh v17.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +ldr q16, [x0, #864] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v22.4S, v21.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +mla v18.4S, v17.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +ldr q3, [x0, #544] +sqrdmulh v17.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q19, [x0, #608] +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q2, [x0, #672] +ldr q1, [x0, #416] +sqrdmulh v0.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v15.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #736] +ldr q14, [x0, #480] +sqrdmulh v13.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v12.4s, v14.4s, v20.4s +add v14.4s, v14.4s, v20.4s +ldr q20, [x0, #288] +mla v3.4S, v17.4S, v31.s[0] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v18.4s +mla v2.4S, v0.4S, v31.s[0] +mla v22.4S, v13.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +ldr q18, [x0, #352] +sqrdmulh v13.4S, v1.4S, v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v0.4s, v18.4s, v16.4s +sqrdmulh v17.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +add v18.4s, v18.4s, v16.4s +ldr q16, [x0, #32] +sqrdmulh v11.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v10.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #96] +sqrdmulh v9.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v8.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #160] +mla v1.4S, v13.4S, v31.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v2.4s +mla v20.4S, v11.4S, v31.s[0] +mla v18.4S, v9.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +ldr q2, [x0, #224] +sqrdmulh v9.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v11.4s, v2.4s, v22.4s +sqrdmulh v13.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v7.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +sub v6.4s, 
v2.4s, v14.4s +add v2.4s, v2.4s, v14.4s +mla v15.4S, v9.4S, v31.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v20.4s +nop +mla v21.4S, v22.4S, v31.s[0] +mla v0.4S, v1.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +nop +sqrdmulh v20.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +sub v1.4s, v3.4s, v18.4s +nop +sqrdmulh v22.4S, v6.4S, v27.s[1] +mul v6.4S, v6.4S,v28.s[1] +add v3.4s, v3.4s, v18.4s +nop +sqrdmulh v18.4S, v19.4S, v27.s[0] +mul v19.4S, v19.4S,v28.s[0] +sub v9.4s, v17.4s, v15.4s +add v17.4s, v17.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v14.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +mla v7.4S, v20.4S, v31.s[0] +mla v6.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v21.4s +nop +mla v19.4S, v18.4S, v31.s[0] +mla v2.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v21.4s +nop +sqrdmulh v21.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v15.4s, v8.4s, v0.4s +nop +sqrdmulh v18.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +add v8.4s, v8.4s, v0.4s +nop +sqrdmulh v0.4S, v9.4S, v27.s[3] +mul v9.4S, v9.4S,v28.s[3] +sub v20.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[3] +mul v14.4S, v14.4S,v28.s[3] +sub v12.4s, v1.4s, v6.4s +add v1.4s, v1.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v16.4s, v19.4s +nop +mla v9.4S, v0.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +nop +sqrdmulh v19.4S, v1.4S, v25.s[2] +mul v1.4S, v1.4S,v26.s[2] +sub v7.4s, v3.4s, v2.4s +nop +sqrdmulh v0.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +add v3.4s, v3.4s, v2.4s +nop +sqrdmulh v2.4S, v7.4S, v25.s[1] +mul v7.4S, v7.4S,v26.s[1] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v25.s[0] +mul v3.4S, v3.4S,v26.s[0] +sub v6.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v1.4S, v19.4S, v31.s[0] +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v9.4s +nop +mla v7.4S, v2.4S, v31.s[0] +mla v3.4S, v17.4S, v31.s[0] +add v22.4s, 
v22.4s, v9.4s +nop +sqrdmulh v9.4S, v8.4S, v23.s[0] +mul v8.4S, v8.4S,v24.s[0] +sub v17.4s, v15.4s, v14.4s +nop +sqrdmulh v2.4S, v6.4S, v23.s[1] +mul v6.4S, v6.4S,v24.s[1] +add v15.4s, v15.4s, v14.4s +nop +sqrdmulh v14.4S, v15.4S, v23.s[2] +mul v15.4S, v15.4S,v24.s[2] +sub v19.4s, v13.4s, v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +sub v11.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +mla v8.4S, v9.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v18.4s, v7.4s +str q13, [x0, #288] +mla v15.4S, v14.4S, v31.s[0] +mla v17.4S, v1.4S, v31.s[0] +add v18.4s, v18.4s, v7.4s +str q19, [x0, #352] +ldr q19, [x0, #944] +sqrdmulh v7.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v1.4s, v16.4s, v3.4s +str q20, [x0, #416] +ldr q20, [x0, #1008] +sqrdmulh v14.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v3.4s +str q11, [x0, #480] +ldr q11, [x0, #816] +sqrdmulh v3.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +sub v13.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +ldr q8, [x0, #880] +sqrdmulh v9.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v12.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +mla v19.4S, v7.4S, v31.s[0] +mla v20.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v15.4s +str q18, [x0, #160] +mla v11.4S, v3.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +str q2, [x0, #224] +ldr q2, [x0, #560] +sqrdmulh v15.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v9.4s, v0.4s, v17.4s +str q16, [x0, #32] +ldr q16, [x0, #624] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +add v0.4s, v0.4s, v17.4s +str q1, [x0, #96] +ldr q1, [x0, #688] +ldr q17, [x0, #432] +sqrdmulh v18.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +sub v7.4s, v17.4s, v19.4s +add v17.4s, v17.4s, v19.4s +ldr q19, [x0, #752] +ldr q6, [x0, #496] +sqrdmulh v5.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v4.4s, v6.4s, v20.4s +add v6.4s, v6.4s, v20.4s +ldr q20, [x0, #304] +mla 
v2.4S, v15.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v11.4s +str q10, [x0, #544] +mla v1.4S, v18.4S, v31.s[0] +mla v19.4S, v5.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q13, [x0, #608] +ldr q13, [x0, #368] +sqrdmulh v11.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v5.4s, v13.4s, v8.4s +str q21, [x0, #672] +sqrdmulh v21.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +add v13.4s, v13.4s, v8.4s +str q12, [x0, #736] +ldr q12, [x0, #48] +sqrdmulh v8.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v18.4s, v12.4s, v2.4s +add v12.4s, v12.4s, v2.4s +ldr q2, [x0, #112] +sqrdmulh v10.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v15.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +ldr q16, [x0, #176] +mla v17.4S, v11.4S, v31.s[0] +mla v6.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v1.4s +str q22, [x0, #800] +mla v20.4S, v8.4S, v31.s[0] +mla v13.4S, v10.4S, v31.s[0] +add v16.4s, v16.4s, v1.4s +str q14, [x0, #864] +ldr q14, [x0, #240] +sqrdmulh v1.4S, v7.4S, v29.s[2] +mul v7.4S, v7.4S,v30.s[2] +sub v10.4s, v14.4s, v19.4s +str q0, [x0, #928] +sqrdmulh v0.4S, v4.4S, v29.s[2] +mul v4.4S, v4.4S,v30.s[2] +add v14.4s, v14.4s, v19.4s +str q9, [x0, #992] +sqrdmulh v9.4S, v3.4S, v29.s[2] +mul v3.4S, v3.4S,v30.s[2] +sub v19.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v29.s[2] +mul v5.4S, v5.4S,v30.s[2] +sub v8.4s, v14.4s, v6.4s +add v14.4s, v14.4s, v6.4s +mla v7.4S, v1.4S, v31.s[0] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v20.4s +nop +mla v3.4S, v9.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v20.4s +nop +sqrdmulh v20.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +sub v17.4s, v2.4s, v13.4s +nop +sqrdmulh v9.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +add v2.4s, v2.4s, v13.4s +nop +sqrdmulh v13.4S, v16.4S, v27.s[0] +mul v16.4S, v16.4S,v28.s[0] +sub v1.4s, v21.4s, v7.4s +add v21.4s, v21.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v6.4s, v10.4s, v4.4s 
+add v10.4s, v10.4s, v4.4s +mla v19.4S, v20.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v3.4s +nop +mla v16.4S, v13.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +nop +sqrdmulh v3.4S, v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +sub v7.4s, v15.4s, v5.4s +nop +sqrdmulh v13.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +add v15.4s, v15.4s, v5.4s +nop +sqrdmulh v5.4S, v1.4S, v27.s[3] +mul v1.4S, v1.4S,v28.s[3] +sub v20.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[3] +mul v6.4S, v6.4S,v28.s[3] +sub v4.4s, v17.4s, v8.4s +add v17.4s, v17.4s, v8.4s +mla v21.4S, v3.4S, v31.s[0] +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v16.4s +nop +mla v1.4S, v5.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +nop +sqrdmulh v16.4S, v17.4S, v25.s[2] +mul v17.4S, v17.4S,v26.s[2] +sub v19.4s, v2.4s, v14.4s +nop +sqrdmulh v5.4S, v4.4S, v25.s[3] +mul v4.4S, v4.4S,v26.s[3] +add v2.4s, v2.4s, v14.4s +nop +sqrdmulh v14.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v3.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +mla v17.4S, v16.4S, v31.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v9.4s, v1.4s +nop +mla v19.4S, v14.4S, v31.s[0] +mla v2.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +nop +sqrdmulh v1.4S, v15.4S, v23.s[0] +mul v15.4S, v15.4S,v24.s[0] +sub v21.4s, v7.4s, v6.4s +nop +sqrdmulh v14.4S, v8.4S, v23.s[1] +mul v8.4S, v8.4S,v24.s[1] +add v7.4s, v7.4s, v6.4s +nop +sqrdmulh v6.4S, v7.4S, v23.s[2] +mul v7.4S, v7.4S,v24.s[2] +sub v16.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v23.s[3] +mul v21.4S, v21.4S,v24.s[3] +sub v10.4s, v20.4s, v4.4s +add v20.4s, v20.4s, v4.4s +mla v15.4S, v1.4S, v31.s[0] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v19.4s +str q0, [x0, #304] +mla v7.4S, v6.4S, v31.s[0] +mla v21.4S, v17.4S, v31.s[0] +add v13.4s, 
v13.4s, v19.4s +str q16, [x0, #368] +ldr q16, [x0, #896] +sqrdmulh v19.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v17.4s, v12.4s, v2.4s +str q20, [x0, #432] +ldr q20, [x0, #960] +sqrdmulh v6.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v12.4s, v12.4s, v2.4s +str q10, [x0, #496] +ldr q10, [x0, #768] +sqrdmulh v2.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +sub v0.4s, v18.4s, v15.4s +add v18.4s, v18.4s, v15.4s +ldr q15, [x0, #832] +sqrdmulh v1.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +sub v4.4s, v3.4s, v8.4s +add v3.4s, v3.4s, v8.4s +mla v16.4S, v19.4S, v31.s[0] +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v7.4s +str q13, [x0, #176] +mla v10.4S, v2.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v7.4s +str q14, [x0, #240] +ldr q14, [x0, #512] +sqrdmulh v7.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v1.4s, v5.4s, v21.4s +str q12, [x0, #48] +ldr q12, [x0, #576] +sqrdmulh v2.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +add v5.4s, v5.4s, v21.4s +str q17, [x0, #112] +ldr q17, [x0, #640] +ldr q21, [x0, #384] +sqrdmulh v13.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] +sub v19.4s, v21.4s, v16.4s +add v21.4s, v21.4s, v16.4s +ldr q16, [x0, #704] +ldr q8, [x0, #448] +sqrdmulh v22.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v11.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +ldr q20, [x0, #256] +mla v14.4S, v7.4S, v31.s[0] +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v10.4s +str q18, [x0, #560] +mla v17.4S, v13.4S, v31.s[0] +mla v16.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +str q0, [x0, #624] +ldr q0, [x0, #320] +sqrdmulh v10.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v22.4s, v0.4s, v15.4s +str q3, [x0, #688] +sqrdmulh v3.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +add v0.4s, v0.4s, v15.4s +str q4, [x0, #752] +ldr q4, [x0, #0] +sqrdmulh v15.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v13.4s, v4.4s, v14.4s +add v4.4s, v4.4s, v14.4s +ldr q14, [x0, #64] 
+sqrdmulh v18.4S, v0.4S, v29.s[1] +mul v0.4S, v0.4S,v30.s[1] +sub v7.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +ldr q12, [x0, #128] +mla v21.4S, v10.4S, v31.s[0] +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v12.4s, v17.4s +str q9, [x0, #816] +mla v20.4S, v15.4S, v31.s[0] +mla v0.4S, v18.4S, v31.s[0] +add v12.4s, v12.4s, v17.4s +str q6, [x0, #880] +ldr q6, [x0, #192] +sqrdmulh v17.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v18.4s, v6.4s, v16.4s +str q5, [x0, #944] +sqrdmulh v5.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +add v6.4s, v6.4s, v16.4s +str q1, [x0, #1008] +sqrdmulh v1.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v16.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v15.4s, v6.4s, v8.4s +add v6.4s, v6.4s, v8.4s +mla v19.4S, v17.4S, v31.s[0] +mla v11.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v20.4s +nop +mla v2.4S, v1.4S, v31.s[0] +mla v22.4S, v21.4S, v31.s[0] +add v4.4s, v4.4s, v20.4s +nop +sqrdmulh v20.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +sub v21.4s, v14.4s, v0.4s +nop +sqrdmulh v1.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +add v14.4s, v14.4s, v0.4s +nop +sqrdmulh v0.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v17.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[0] +mul v6.4S, v6.4S,v28.s[0] +sub v8.4s, v18.4s, v11.4s +add v18.4s, v18.4s, v11.4s +mla v16.4S, v20.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v2.4s +nop +mla v12.4S, v0.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +nop +sqrdmulh v2.4S, v3.4S, v27.s[2] +mul v3.4S, v3.4S,v28.s[2] +sub v19.4s, v7.4s, v22.4s +nop +sqrdmulh v0.4S, v18.4S, v27.s[2] +mul v18.4S, v18.4S,v28.s[2] +add v7.4s, v7.4s, v22.4s +nop +sqrdmulh v22.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +sub v20.4s, v5.4s, v16.4s +add v5.4s, v5.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v11.4s, v21.4s, v15.4s +add 
v21.4s, v21.4s, v15.4s +mla v3.4S, v2.4S, v31.s[0] +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v4.4s, v12.4s +nop +mla v17.4S, v22.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v4.4s, v4.4s, v12.4s +nop +sqrdmulh v12.4S, v21.4S, v25.s[2] +mul v21.4S, v21.4S,v26.s[2] +sub v16.4s, v14.4s, v6.4s +nop +sqrdmulh v22.4S, v11.4S, v25.s[3] +mul v11.4S, v11.4S,v26.s[3] +add v14.4s, v14.4s, v6.4s +nop +sqrdmulh v6.4S, v16.4S, v25.s[1] +mul v16.4S, v16.4S,v26.s[1] +sub v2.4s, v13.4s, v3.4s +add v13.4s, v13.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v25.s[0] +mul v14.4S, v14.4S,v26.s[0] +sub v15.4s, v7.4s, v18.4s +add v7.4s, v7.4s, v18.4s +mla v21.4S, v12.4S, v31.s[0] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v17.4s +nop +mla v16.4S, v6.4S, v31.s[0] +mla v14.4S, v3.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +nop +sqrdmulh v17.4S, v7.4S, v23.s[0] +mul v7.4S, v7.4S,v24.s[0] +sub v3.4s, v19.4s, v8.4s +nop +sqrdmulh v6.4S, v15.4S, v23.s[1] +mul v15.4S, v15.4S,v24.s[1] +add v19.4s, v19.4s, v8.4s +nop +sqrdmulh v8.4S, v19.4S, v23.s[2] +mul v19.4S, v19.4S,v24.s[2] +sub v12.4s, v5.4s, v21.4s +add v5.4s, v5.4s, v21.4s +sqrdmulh v21.4S, v3.4S, v23.s[3] +mul v3.4S, v3.4S,v24.s[3] +sub v18.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +mla v7.4S, v17.4S, v31.s[0] +mla v15.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v16.4s +str q5, [x0, #256] +mla v19.4S, v8.4S, v31.s[0] +mla v3.4S, v21.4S, v31.s[0] +add v0.4s, v0.4s, v16.4s +str q12, [x0, #320] +ldr q12, [x0, #912] +sqrdmulh v16.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v21.4s, v4.4s, v14.4s +str q20, [x0, #384] +ldr q20, [x0, #976] +sqrdmulh v8.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v4.4s, v4.4s, v14.4s +str q18, [x0, #448] +ldr q18, [x0, #784] +sqrdmulh v14.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +sub v5.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +ldr q7, [x0, #848] +sqrdmulh v17.4S, v7.4S, v29.s[0] +mul v7.4S, v7.4S,v30.s[0] +sub v11.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +mla v12.4S, v16.4S, 
v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v1.4s, v19.4s +str q0, [x0, #128] +mla v18.4S, v14.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v19.4s +str q6, [x0, #192] +ldr q6, [x0, #528] +sqrdmulh v19.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v17.4s, v22.4s, v3.4s +str q4, [x0, #0] +ldr q4, [x0, #592] +sqrdmulh v14.4S, v4.4S, v29.s[0] +mul v4.4S, v4.4S,v30.s[0] +add v22.4s, v22.4s, v3.4s +str q21, [x0, #64] +ldr q21, [x0, #656] +ldr q3, [x0, #400] +sqrdmulh v0.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v16.4s, v3.4s, v12.4s +add v3.4s, v3.4s, v12.4s +ldr q12, [x0, #720] +ldr q15, [x0, #464] +sqrdmulh v9.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v10.4s, v15.4s, v20.4s +add v15.4s, v15.4s, v20.4s +ldr q20, [x0, #272] +mla v6.4S, v19.4S, v31.s[0] +mla v4.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v18.4s +str q13, [x0, #512] +mla v21.4S, v0.4S, v31.s[0] +mla v12.4S, v9.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +str q5, [x0, #576] +ldr q5, [x0, #336] +sqrdmulh v18.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v9.4s, v5.4s, v7.4s +str q2, [x0, #640] +sqrdmulh v2.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +add v5.4s, v5.4s, v7.4s +str q11, [x0, #704] +ldr q11, [x0, #16] +sqrdmulh v7.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v0.4s, v11.4s, v6.4s +add v11.4s, v11.4s, v6.4s +ldr q6, [x0, #80] +sqrdmulh v13.4S, v5.4S, v29.s[1] +mul v5.4S, v5.4S,v30.s[1] +sub v19.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +ldr q4, [x0, #144] +mla v3.4S, v18.4S, v31.s[0] +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v4.4s, v21.4s +str q1, [x0, #768] +mla v20.4S, v7.4S, v31.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v4.4s, v4.4s, v21.4s +str q8, [x0, #832] +ldr q8, [x0, #208] +sqrdmulh v21.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +sub v13.4s, v8.4s, v12.4s +str q22, [x0, #896] +sqrdmulh v22.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +add v8.4s, v8.4s, v12.4s +str q17, [x0, #960] +sqrdmulh v17.4S, v14.4S, 
v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v12.4s, v4.4s, v3.4s +add v4.4s, v4.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v7.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +mla v16.4S, v21.4S, v31.s[0] +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v20.4s +nop +mla v14.4S, v17.4S, v31.s[0] +mla v9.4S, v3.4S, v31.s[0] +add v11.4s, v11.4s, v20.4s +nop +sqrdmulh v20.4S, v12.4S, v27.s[1] +mul v12.4S, v12.4S,v28.s[1] +sub v3.4s, v6.4s, v5.4s +nop +sqrdmulh v17.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +add v6.4s, v6.4s, v5.4s +nop +sqrdmulh v5.4S, v4.4S, v27.s[0] +mul v4.4S, v4.4S,v28.s[0] +sub v21.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[0] +mul v8.4S, v8.4S,v28.s[0] +sub v15.4s, v13.4s, v10.4s +add v13.4s, v13.4s, v10.4s +mla v12.4S, v20.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v14.4s +nop +mla v4.4S, v5.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v0.4s, v0.4s, v14.4s +nop +sqrdmulh v14.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v16.4s, v19.4s, v9.4s +nop +sqrdmulh v5.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +add v19.4s, v19.4s, v9.4s +nop +sqrdmulh v9.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v10.4s, v3.4s, v7.4s +add v3.4s, v3.4s, v7.4s +mla v2.4S, v14.4S, v31.s[0] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v4.4s +nop +mla v21.4S, v9.4S, v31.s[0] +mla v15.4S, v12.4S, v31.s[0] +add v11.4s, v11.4s, v4.4s +nop +sqrdmulh v4.4S, v3.4S, v25.s[2] +mul v3.4S, v3.4S,v26.s[2] +sub v12.4s, v6.4s, v8.4s +nop +sqrdmulh v9.4S, v10.4S, v25.s[3] +mul v10.4S, v10.4S,v26.s[3] +add v6.4s, v6.4s, v8.4s +nop +sqrdmulh v8.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +sub v14.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v6.4S, v25.s[0] +mul v6.4S, v6.4S,v26.s[0] +sub v7.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +mla 
v3.4S, v4.4S, v31.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v21.4s +nop +mla v12.4S, v8.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +nop +sqrdmulh v21.4S, v19.4S, v23.s[0] +mul v19.4S, v19.4S,v24.s[0] +sub v2.4s, v16.4s, v15.4s +nop +sqrdmulh v8.4S, v7.4S, v23.s[1] +mul v7.4S, v7.4S,v24.s[1] +add v16.4s, v16.4s, v15.4s +nop +sqrdmulh v15.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +sub v4.4s, v22.4s, v3.4s +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v2.4S, v23.s[3] +mul v2.4S, v2.4S,v24.s[3] +sub v13.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +mla v19.4S, v21.4S, v31.s[0] +mla v7.4S, v8.4S, v31.s[0] +sub v8.4s, v5.4s, v12.4s +str q22, [x0, #272] +mla v16.4S, v15.4S, v31.s[0] +mla v2.4S, v3.4S, v31.s[0] +add v5.4s, v5.4s, v12.4s +str q4, [x0, #336] +sub v23.4s, v11.4s, v6.4s +str q20, [x0, #400] +add v11.4s, v11.4s, v6.4s +str q13, [x0, #464] +sub v13.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sub v19.4s, v14.4s, v7.4s +add v14.4s, v14.4s, v7.4s +sub v7.4s, v17.4s, v16.4s +str q5, [x0, #144] +add v17.4s, v17.4s, v16.4s +str q8, [x0, #208] +sub v8.4s, v9.4s, v2.4s +str q11, [x0, #16] +add v9.4s, v9.4s, v2.4s +str q23, [x0, #80] +str q0, [x0, #528] +str q13, [x0, #592] +str q14, [x0, #656] +str q19, [x0, #720] +str q17, [x0, #784] +str q7, [x0, #848] +str q9, [x0, #912] +str q8, [x0, #976] +ldr q18, [x17, #+128] +ldr q1, [x17, #+144] +ldr q10, [x17, #+160] +ldr q21, [x17, #+176] +ldr q22, [x17, #+192] +ldr q15, [x17, #+208] +ldr q3, [x17, #+224] +ldr q12, [x17, #+240] +ldr q4, [x0, #32] +ldr q30, [x0, #48] +ldr q29, [x0, #0] +ldr q28, [x0, #16] +sqrdmulh v27.4S, v4.4S, v1.s[0] +mul v4.4S, v4.4S,v18.s[0] +mla v4.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v4.4s +add v29.4s, v29.4s, v4.4s +sqrdmulh v4.4S, v30.4S, v1.s[0] +mul v30.4S, v30.4S,v18.s[0] +mla v30.4S, v4.4S, v31.s[0] +sub v4.4s, v28.4s, v30.4s +add v28.4s, v28.4s, v30.4s +sqrdmulh v30.4S, v28.4S, v1.s[1] +mul v28.4S, v28.4S,v18.s[1] +mla v28.4S, v30.4S, v31.s[0] 
+sub v30.4s, v29.4s, v28.4s +add v29.4s, v29.4s, v28.4s +sqrdmulh v28.4S, v4.4S, v1.s[2] +mul v4.4S, v4.4S,v18.s[2] +mla v4.4S, v28.4S, v31.s[0] +sub v28.4s, v27.4s, v4.4s +add v27.4s, v27.4s, v4.4s +trn1 v4.4S, v29.4S, v30.4S +trn2 v26.4S, v29.4S, v30.4S +trn1 v25.4S, v27.4S, v28.4S +trn2 v24.4S, v27.4S, v28.4S +trn2 v27.2D, v4.2D, v25.2D +trn2 v28.2D, v26.2D, v24.2D +trn1 v29.2D, v4.2D, v25.2D +trn1 v30.2D, v26.2D, v24.2D +sqrdmulh v24.4S, v27.4S, v21.4S +mul v27.4S, v27.4S,v10.4S +mla v27.4S, v24.4S, v31.s[0] +sub v24.4s, v29.4s, v27.4s +add v29.4s, v29.4s, v27.4s +sqrdmulh v27.4S, v28.4S, v21.4S +mul v28.4S, v28.4S,v10.4S +mla v28.4S, v27.4S, v31.s[0] +sub v27.4s, v30.4s, v28.4s +add v30.4s, v30.4s, v28.4s +sqrdmulh v28.4S, v30.4S, v15.4S +mul v30.4S, v30.4S,v22.4S +mla v30.4S, v28.4S, v31.s[0] +sub v28.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +sqrdmulh v30.4S, v27.4S, v12.4S +mul v27.4S, v27.4S,v3.4S +mla v27.4S, v30.4S, v31.s[0] +sub v30.4s, v24.4s, v27.4s +add v24.4s, v24.4s, v27.4s +str q29, [x0, #0] +str q28, [x0, #16] +str q24, [x0, #32] +str q30, [x0, #48] +ldr q30, [x17, #+256] +ldr q24, [x17, #+272] +ldr q28, [x17, #+288] +ldr q29, [x17, #+304] +ldr q27, [x17, #+320] +ldr q26, [x17, #+336] +ldr q25, [x17, #+352] +ldr q4, [x17, #+368] +ldr q12, [x0, #96] +ldr q3, [x0, #112] +ldr q15, [x0, #64] +ldr q22, [x0, #80] +sqrdmulh v21.4S, v12.4S, v24.s[0] +mul v12.4S, v12.4S,v30.s[0] +mla v12.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v12.4s +add v15.4s, v15.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v24.s[0] +mul v3.4S, v3.4S,v30.s[0] +mla v3.4S, v12.4S, v31.s[0] +sub v12.4s, v22.4s, v3.4s +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v22.4S, v24.s[1] +mul v22.4S, v22.4S,v30.s[1] +mla v22.4S, v3.4S, v31.s[0] +sub v3.4s, v15.4s, v22.4s +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v12.4S, v24.s[2] +mul v12.4S, v12.4S,v30.s[2] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +trn1 v12.4S, v15.4S, v3.4S +trn2 v10.4S, v15.4S, v3.4S 
+trn1 v1.4S, v21.4S, v22.4S +trn2 v18.4S, v21.4S, v22.4S +trn2 v21.2D, v12.2D, v1.2D +trn2 v22.2D, v10.2D, v18.2D +trn1 v15.2D, v12.2D, v1.2D +trn1 v3.2D, v10.2D, v18.2D +sqrdmulh v18.4S, v21.4S, v29.4S +mul v21.4S, v21.4S,v28.4S +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v15.4s, v21.4s +add v15.4s, v15.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.4S +mul v22.4S, v22.4S,v28.4S +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v3.4s, v22.4s +add v3.4s, v3.4s, v22.4s +sqrdmulh v22.4S, v3.4S, v26.4S +mul v3.4S, v3.4S,v27.4S +mla v3.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v3.4s +add v15.4s, v15.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v4.4S +mul v21.4S, v21.4S,v25.4S +mla v21.4S, v3.4S, v31.s[0] +sub v3.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +str q15, [x0, #64] +str q22, [x0, #80] +str q18, [x0, #96] +str q3, [x0, #112] +ldr q3, [x17, #+384] +ldr q18, [x17, #+400] +ldr q22, [x17, #+416] +ldr q15, [x17, #+432] +ldr q21, [x17, #+448] +ldr q10, [x17, #+464] +ldr q1, [x17, #+480] +ldr q12, [x17, #+496] +ldr q4, [x0, #160] +ldr q25, [x0, #176] +ldr q26, [x0, #128] +ldr q27, [x0, #144] +sqrdmulh v29.4S, v4.4S, v18.s[0] +mul v4.4S, v4.4S,v3.s[0] +mla v4.4S, v29.4S, v31.s[0] +sub v29.4s, v26.4s, v4.4s +add v26.4s, v26.4s, v4.4s +sqrdmulh v4.4S, v25.4S, v18.s[0] +mul v25.4S, v25.4S,v3.s[0] +mla v25.4S, v4.4S, v31.s[0] +sub v4.4s, v27.4s, v25.4s +add v27.4s, v27.4s, v25.4s +sqrdmulh v25.4S, v27.4S, v18.s[1] +mul v27.4S, v27.4S,v3.s[1] +mla v27.4S, v25.4S, v31.s[0] +sub v25.4s, v26.4s, v27.4s +add v26.4s, v26.4s, v27.4s +sqrdmulh v27.4S, v4.4S, v18.s[2] +mul v4.4S, v4.4S,v3.s[2] +mla v4.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v4.4s +add v29.4s, v29.4s, v4.4s +trn1 v4.4S, v26.4S, v25.4S +trn2 v28.4S, v26.4S, v25.4S +trn1 v24.4S, v29.4S, v27.4S +trn2 v30.4S, v29.4S, v27.4S +trn2 v29.2D, v4.2D, v24.2D +trn2 v27.2D, v28.2D, v30.2D +trn1 v26.2D, v4.2D, v24.2D +trn1 v25.2D, v28.2D, v30.2D +sqrdmulh v30.4S, v29.4S, v15.4S +mul v29.4S, v29.4S,v22.4S +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, 
v26.4s, v29.4s +add v26.4s, v26.4s, v29.4s +sqrdmulh v29.4S, v27.4S, v15.4S +mul v27.4S, v27.4S,v22.4S +mla v27.4S, v29.4S, v31.s[0] +sub v29.4s, v25.4s, v27.4s +add v25.4s, v25.4s, v27.4s +sqrdmulh v27.4S, v25.4S, v10.4S +mul v25.4S, v25.4S,v21.4S +mla v25.4S, v27.4S, v31.s[0] +sub v27.4s, v26.4s, v25.4s +add v26.4s, v26.4s, v25.4s +sqrdmulh v25.4S, v29.4S, v12.4S +mul v29.4S, v29.4S,v1.4S +mla v29.4S, v25.4S, v31.s[0] +sub v25.4s, v30.4s, v29.4s +add v30.4s, v30.4s, v29.4s +str q26, [x0, #128] +str q27, [x0, #144] +str q30, [x0, #160] +str q25, [x0, #176] +ldr q25, [x17, #+512] +ldr q30, [x17, #+528] +ldr q27, [x17, #+544] +ldr q26, [x17, #+560] +ldr q29, [x17, #+576] +ldr q28, [x17, #+592] +ldr q24, [x17, #+608] +ldr q4, [x17, #+624] +ldr q12, [x0, #224] +ldr q1, [x0, #240] +ldr q10, [x0, #192] +ldr q21, [x0, #208] +sqrdmulh v15.4S, v12.4S, v30.s[0] +mul v12.4S, v12.4S,v25.s[0] +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v1.4S, v30.s[0] +mul v1.4S, v1.4S,v25.s[0] +mla v1.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v1.4s +add v21.4s, v21.4s, v1.4s +sqrdmulh v1.4S, v21.4S, v30.s[1] +mul v21.4S, v21.4S,v25.s[1] +mla v21.4S, v1.4S, v31.s[0] +sub v1.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v12.4S, v30.s[2] +mul v12.4S, v12.4S,v25.s[2] +mla v12.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v12.4s +add v15.4s, v15.4s, v12.4s +trn1 v12.4S, v10.4S, v1.4S +trn2 v22.4S, v10.4S, v1.4S +trn1 v18.4S, v15.4S, v21.4S +trn2 v3.4S, v15.4S, v21.4S +trn2 v15.2D, v12.2D, v18.2D +trn2 v21.2D, v22.2D, v3.2D +trn1 v10.2D, v12.2D, v18.2D +trn1 v1.2D, v22.2D, v3.2D +sqrdmulh v3.4S, v15.4S, v26.4S +mul v15.4S, v15.4S,v27.4S +mla v15.4S, v3.4S, v31.s[0] +sub v3.4s, v10.4s, v15.4s +add v10.4s, v10.4s, v15.4s +sqrdmulh v15.4S, v21.4S, v26.4S +mul v21.4S, v21.4S,v27.4S +mla v21.4S, v15.4S, v31.s[0] +sub v15.4s, v1.4s, v21.4s +add v1.4s, v1.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v28.4S +mul v1.4S, v1.4S,v29.4S +mla 
v1.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v1.4s +add v10.4s, v10.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v4.4S +mul v15.4S, v15.4S,v24.4S +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v3.4s, v15.4s +add v3.4s, v3.4s, v15.4s +str q10, [x0, #192] +str q21, [x0, #208] +str q3, [x0, #224] +str q1, [x0, #240] +ldr q1, [x17, #+640] +ldr q3, [x17, #+656] +ldr q21, [x17, #+672] +ldr q10, [x17, #+688] +ldr q15, [x17, #+704] +ldr q22, [x17, #+720] +ldr q18, [x17, #+736] +ldr q12, [x17, #+752] +ldr q4, [x0, #288] +ldr q24, [x0, #304] +ldr q28, [x0, #256] +ldr q29, [x0, #272] +sqrdmulh v26.4S, v4.4S, v3.s[0] +mul v4.4S, v4.4S,v1.s[0] +mla v4.4S, v26.4S, v31.s[0] +sub v26.4s, v28.4s, v4.4s +add v28.4s, v28.4s, v4.4s +sqrdmulh v4.4S, v24.4S, v3.s[0] +mul v24.4S, v24.4S,v1.s[0] +mla v24.4S, v4.4S, v31.s[0] +sub v4.4s, v29.4s, v24.4s +add v29.4s, v29.4s, v24.4s +sqrdmulh v24.4S, v29.4S, v3.s[1] +mul v29.4S, v29.4S,v1.s[1] +mla v29.4S, v24.4S, v31.s[0] +sub v24.4s, v28.4s, v29.4s +add v28.4s, v28.4s, v29.4s +sqrdmulh v29.4S, v4.4S, v3.s[2] +mul v4.4S, v4.4S,v1.s[2] +mla v4.4S, v29.4S, v31.s[0] +sub v29.4s, v26.4s, v4.4s +add v26.4s, v26.4s, v4.4s +trn1 v4.4S, v28.4S, v24.4S +trn2 v27.4S, v28.4S, v24.4S +trn1 v30.4S, v26.4S, v29.4S +trn2 v25.4S, v26.4S, v29.4S +trn2 v26.2D, v4.2D, v30.2D +trn2 v29.2D, v27.2D, v25.2D +trn1 v28.2D, v4.2D, v30.2D +trn1 v24.2D, v27.2D, v25.2D +sqrdmulh v25.4S, v26.4S, v10.4S +mul v26.4S, v26.4S,v21.4S +mla v26.4S, v25.4S, v31.s[0] +sub v25.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sqrdmulh v26.4S, v29.4S, v10.4S +mul v29.4S, v29.4S,v21.4S +mla v29.4S, v26.4S, v31.s[0] +sub v26.4s, v24.4s, v29.4s +add v24.4s, v24.4s, v29.4s +sqrdmulh v29.4S, v24.4S, v22.4S +mul v24.4S, v24.4S,v15.4S +mla v24.4S, v29.4S, v31.s[0] +sub v29.4s, v28.4s, v24.4s +add v28.4s, v28.4s, v24.4s +sqrdmulh v24.4S, v26.4S, v12.4S +mul v26.4S, v26.4S,v18.4S +mla v26.4S, v24.4S, v31.s[0] +sub v24.4s, v25.4s, v26.4s +add v25.4s, v25.4s, v26.4s +str q28, [x0, #256] +str q29, [x0, #272] 
+str q25, [x0, #288] +str q24, [x0, #304] +ldr q24, [x17, #+768] +ldr q25, [x17, #+784] +ldr q29, [x17, #+800] +ldr q28, [x17, #+816] +ldr q26, [x17, #+832] +ldr q27, [x17, #+848] +ldr q30, [x17, #+864] +ldr q4, [x17, #+880] +ldr q12, [x0, #352] +ldr q18, [x0, #368] +ldr q22, [x0, #320] +ldr q15, [x0, #336] +sqrdmulh v10.4S, v12.4S, v25.s[0] +mul v12.4S, v12.4S,v24.s[0] +mla v12.4S, v10.4S, v31.s[0] +sub v10.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v18.4S, v25.s[0] +mul v18.4S, v18.4S,v24.s[0] +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sqrdmulh v18.4S, v15.4S, v25.s[1] +mul v15.4S, v15.4S,v24.s[1] +mla v15.4S, v18.4S, v31.s[0] +sub v18.4s, v22.4s, v15.4s +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v12.4S, v25.s[2] +mul v12.4S, v12.4S,v24.s[2] +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +trn1 v12.4S, v22.4S, v18.4S +trn2 v21.4S, v22.4S, v18.4S +trn1 v3.4S, v10.4S, v15.4S +trn2 v1.4S, v10.4S, v15.4S +trn2 v10.2D, v12.2D, v3.2D +trn2 v15.2D, v21.2D, v1.2D +trn1 v22.2D, v12.2D, v3.2D +trn1 v18.2D, v21.2D, v1.2D +sqrdmulh v1.4S, v10.4S, v28.4S +mul v10.4S, v10.4S,v29.4S +mla v10.4S, v1.4S, v31.s[0] +sub v1.4s, v22.4s, v10.4s +add v22.4s, v22.4s, v10.4s +sqrdmulh v10.4S, v15.4S, v28.4S +mul v15.4S, v15.4S,v29.4S +mla v15.4S, v10.4S, v31.s[0] +sub v10.4s, v18.4s, v15.4s +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v18.4S, v27.4S +mul v18.4S, v18.4S,v26.4S +mla v18.4S, v15.4S, v31.s[0] +sub v15.4s, v22.4s, v18.4s +add v22.4s, v22.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v4.4S +mul v10.4S, v10.4S,v30.4S +mla v10.4S, v18.4S, v31.s[0] +sub v18.4s, v1.4s, v10.4s +add v1.4s, v1.4s, v10.4s +str q22, [x0, #320] +str q15, [x0, #336] +str q1, [x0, #352] +str q18, [x0, #368] +ldr q18, [x17, #+896] +ldr q1, [x17, #+912] +ldr q15, [x17, #+928] +ldr q22, [x17, #+944] +ldr q10, [x17, #+960] +ldr q21, [x17, #+976] +ldr q3, [x17, #+992] +ldr q12, [x17, #+1008] +ldr q4, [x0, #416] 
+ldr q30, [x0, #432] +ldr q27, [x0, #384] +ldr q26, [x0, #400] +sqrdmulh v28.4S, v4.4S, v1.s[0] +mul v4.4S, v4.4S,v18.s[0] +mla v4.4S, v28.4S, v31.s[0] +sub v28.4s, v27.4s, v4.4s +add v27.4s, v27.4s, v4.4s +sqrdmulh v4.4S, v30.4S, v1.s[0] +mul v30.4S, v30.4S,v18.s[0] +mla v30.4S, v4.4S, v31.s[0] +sub v4.4s, v26.4s, v30.4s +add v26.4s, v26.4s, v30.4s +sqrdmulh v30.4S, v26.4S, v1.s[1] +mul v26.4S, v26.4S,v18.s[1] +mla v26.4S, v30.4S, v31.s[0] +sub v30.4s, v27.4s, v26.4s +add v27.4s, v27.4s, v26.4s +sqrdmulh v26.4S, v4.4S, v1.s[2] +mul v4.4S, v4.4S,v18.s[2] +mla v4.4S, v26.4S, v31.s[0] +sub v26.4s, v28.4s, v4.4s +add v28.4s, v28.4s, v4.4s +trn1 v4.4S, v27.4S, v30.4S +trn2 v29.4S, v27.4S, v30.4S +trn1 v25.4S, v28.4S, v26.4S +trn2 v24.4S, v28.4S, v26.4S +trn2 v28.2D, v4.2D, v25.2D +trn2 v26.2D, v29.2D, v24.2D +trn1 v27.2D, v4.2D, v25.2D +trn1 v30.2D, v29.2D, v24.2D +sqrdmulh v24.4S, v28.4S, v22.4S +mul v28.4S, v28.4S,v15.4S +mla v28.4S, v24.4S, v31.s[0] +sub v24.4s, v27.4s, v28.4s +add v27.4s, v27.4s, v28.4s +sqrdmulh v28.4S, v26.4S, v22.4S +mul v26.4S, v26.4S,v15.4S +mla v26.4S, v28.4S, v31.s[0] +sub v28.4s, v30.4s, v26.4s +add v30.4s, v30.4s, v26.4s +sqrdmulh v26.4S, v30.4S, v21.4S +mul v30.4S, v30.4S,v10.4S +mla v30.4S, v26.4S, v31.s[0] +sub v26.4s, v27.4s, v30.4s +add v27.4s, v27.4s, v30.4s +sqrdmulh v30.4S, v28.4S, v12.4S +mul v28.4S, v28.4S,v3.4S +mla v28.4S, v30.4S, v31.s[0] +sub v30.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +str q27, [x0, #384] +str q26, [x0, #400] +str q24, [x0, #416] +str q30, [x0, #432] +ldr q30, [x17, #+1024] +ldr q24, [x17, #+1040] +ldr q26, [x17, #+1056] +ldr q27, [x17, #+1072] +ldr q28, [x17, #+1088] +ldr q29, [x17, #+1104] +ldr q25, [x17, #+1120] +ldr q4, [x17, #+1136] +ldr q12, [x0, #480] +ldr q3, [x0, #496] +ldr q21, [x0, #448] +ldr q10, [x0, #464] +sqrdmulh v22.4S, v12.4S, v24.s[0] +mul v12.4S, v12.4S,v30.s[0] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v24.s[0] 
+mul v3.4S, v3.4S,v30.s[0] +mla v3.4S, v12.4S, v31.s[0] +sub v12.4s, v10.4s, v3.4s +add v10.4s, v10.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v24.s[1] +mul v10.4S, v10.4S,v30.s[1] +mla v10.4S, v3.4S, v31.s[0] +sub v3.4s, v21.4s, v10.4s +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v12.4S, v24.s[2] +mul v12.4S, v12.4S,v30.s[2] +mla v12.4S, v10.4S, v31.s[0] +sub v10.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +trn1 v12.4S, v21.4S, v3.4S +trn2 v15.4S, v21.4S, v3.4S +trn1 v1.4S, v22.4S, v10.4S +trn2 v18.4S, v22.4S, v10.4S +trn2 v22.2D, v12.2D, v1.2D +trn2 v10.2D, v15.2D, v18.2D +trn1 v21.2D, v12.2D, v1.2D +trn1 v3.2D, v15.2D, v18.2D +sqrdmulh v18.4S, v22.4S, v27.4S +mul v22.4S, v22.4S,v26.4S +mla v22.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v10.4S, v27.4S +mul v10.4S, v10.4S,v26.4S +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v3.4s, v10.4s +add v3.4s, v3.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v29.4S +mul v3.4S, v3.4S,v28.4S +mla v3.4S, v10.4S, v31.s[0] +sub v10.4s, v21.4s, v3.4s +add v21.4s, v21.4s, v3.4s +sqrdmulh v3.4S, v22.4S, v4.4S +mul v22.4S, v22.4S,v25.4S +mla v22.4S, v3.4S, v31.s[0] +sub v3.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +str q21, [x0, #448] +str q10, [x0, #464] +str q18, [x0, #480] +str q3, [x0, #496] +ldr q3, [x17, #+1152] +ldr q18, [x17, #+1168] +ldr q10, [x17, #+1184] +ldr q21, [x17, #+1200] +ldr q22, [x17, #+1216] +ldr q15, [x17, #+1232] +ldr q1, [x17, #+1248] +ldr q12, [x17, #+1264] +ldr q4, [x0, #544] +ldr q25, [x0, #560] +ldr q29, [x0, #512] +ldr q28, [x0, #528] +sqrdmulh v27.4S, v4.4S, v18.s[0] +mul v4.4S, v4.4S,v3.s[0] +mla v4.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v4.4s +add v29.4s, v29.4s, v4.4s +sqrdmulh v4.4S, v25.4S, v18.s[0] +mul v25.4S, v25.4S,v3.s[0] +mla v25.4S, v4.4S, v31.s[0] +sub v4.4s, v28.4s, v25.4s +add v28.4s, v28.4s, v25.4s +sqrdmulh v25.4S, v28.4S, v18.s[1] +mul v28.4S, v28.4S,v3.s[1] +mla v28.4S, v25.4S, v31.s[0] +sub v25.4s, v29.4s, v28.4s +add v29.4s, v29.4s, v28.4s 
+sqrdmulh v28.4S, v4.4S, v18.s[2] +mul v4.4S, v4.4S,v3.s[2] +mla v4.4S, v28.4S, v31.s[0] +sub v28.4s, v27.4s, v4.4s +add v27.4s, v27.4s, v4.4s +trn1 v4.4S, v29.4S, v25.4S +trn2 v26.4S, v29.4S, v25.4S +trn1 v24.4S, v27.4S, v28.4S +trn2 v30.4S, v27.4S, v28.4S +trn2 v27.2D, v4.2D, v24.2D +trn2 v28.2D, v26.2D, v30.2D +trn1 v29.2D, v4.2D, v24.2D +trn1 v25.2D, v26.2D, v30.2D +sqrdmulh v30.4S, v27.4S, v21.4S +mul v27.4S, v27.4S,v10.4S +mla v27.4S, v30.4S, v31.s[0] +sub v30.4s, v29.4s, v27.4s +add v29.4s, v29.4s, v27.4s +sqrdmulh v27.4S, v28.4S, v21.4S +mul v28.4S, v28.4S,v10.4S +mla v28.4S, v27.4S, v31.s[0] +sub v27.4s, v25.4s, v28.4s +add v25.4s, v25.4s, v28.4s +sqrdmulh v28.4S, v25.4S, v15.4S +mul v25.4S, v25.4S,v22.4S +mla v25.4S, v28.4S, v31.s[0] +sub v28.4s, v29.4s, v25.4s +add v29.4s, v29.4s, v25.4s +sqrdmulh v25.4S, v27.4S, v12.4S +mul v27.4S, v27.4S,v1.4S +mla v27.4S, v25.4S, v31.s[0] +sub v25.4s, v30.4s, v27.4s +add v30.4s, v30.4s, v27.4s +str q29, [x0, #512] +str q28, [x0, #528] +str q30, [x0, #544] +str q25, [x0, #560] +ldr q25, [x17, #+1280] +ldr q30, [x17, #+1296] +ldr q28, [x17, #+1312] +ldr q29, [x17, #+1328] +ldr q27, [x17, #+1344] +ldr q26, [x17, #+1360] +ldr q24, [x17, #+1376] +ldr q4, [x17, #+1392] +ldr q12, [x0, #608] +ldr q1, [x0, #624] +ldr q15, [x0, #576] +ldr q22, [x0, #592] +sqrdmulh v21.4S, v12.4S, v30.s[0] +mul v12.4S, v12.4S,v25.s[0] +mla v12.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v12.4s +add v15.4s, v15.4s, v12.4s +sqrdmulh v12.4S, v1.4S, v30.s[0] +mul v1.4S, v1.4S,v25.s[0] +mla v1.4S, v12.4S, v31.s[0] +sub v12.4s, v22.4s, v1.4s +add v22.4s, v22.4s, v1.4s +sqrdmulh v1.4S, v22.4S, v30.s[1] +mul v22.4S, v22.4S,v25.s[1] +mla v22.4S, v1.4S, v31.s[0] +sub v1.4s, v15.4s, v22.4s +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v12.4S, v30.s[2] +mul v12.4S, v12.4S,v25.s[2] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +trn1 v12.4S, v15.4S, v1.4S +trn2 v10.4S, v15.4S, v1.4S +trn1 v18.4S, v21.4S, v22.4S +trn2 
v3.4S, v21.4S, v22.4S +trn2 v21.2D, v12.2D, v18.2D +trn2 v22.2D, v10.2D, v3.2D +trn1 v15.2D, v12.2D, v18.2D +trn1 v1.2D, v10.2D, v3.2D +sqrdmulh v3.4S, v21.4S, v29.4S +mul v21.4S, v21.4S,v28.4S +mla v21.4S, v3.4S, v31.s[0] +sub v3.4s, v15.4s, v21.4s +add v15.4s, v15.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.4S +mul v22.4S, v22.4S,v28.4S +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v26.4S +mul v1.4S, v1.4S,v27.4S +mla v1.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s +sqrdmulh v1.4S, v21.4S, v4.4S +mul v21.4S, v21.4S,v24.4S +mla v21.4S, v1.4S, v31.s[0] +sub v1.4s, v3.4s, v21.4s +add v3.4s, v3.4s, v21.4s +str q15, [x0, #576] +str q22, [x0, #592] +str q3, [x0, #608] +str q1, [x0, #624] +ldr q1, [x17, #+1408] +ldr q3, [x17, #+1424] +ldr q22, [x17, #+1440] +ldr q15, [x17, #+1456] +ldr q21, [x17, #+1472] +ldr q10, [x17, #+1488] +ldr q18, [x17, #+1504] +ldr q12, [x17, #+1520] +ldr q4, [x0, #672] +ldr q24, [x0, #688] +ldr q26, [x0, #640] +ldr q27, [x0, #656] +sqrdmulh v29.4S, v4.4S, v3.s[0] +mul v4.4S, v4.4S,v1.s[0] +mla v4.4S, v29.4S, v31.s[0] +sub v29.4s, v26.4s, v4.4s +add v26.4s, v26.4s, v4.4s +sqrdmulh v4.4S, v24.4S, v3.s[0] +mul v24.4S, v24.4S,v1.s[0] +mla v24.4S, v4.4S, v31.s[0] +sub v4.4s, v27.4s, v24.4s +add v27.4s, v27.4s, v24.4s +sqrdmulh v24.4S, v27.4S, v3.s[1] +mul v27.4S, v27.4S,v1.s[1] +mla v27.4S, v24.4S, v31.s[0] +sub v24.4s, v26.4s, v27.4s +add v26.4s, v26.4s, v27.4s +sqrdmulh v27.4S, v4.4S, v3.s[2] +mul v4.4S, v4.4S,v1.s[2] +mla v4.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v4.4s +add v29.4s, v29.4s, v4.4s +trn1 v4.4S, v26.4S, v24.4S +trn2 v28.4S, v26.4S, v24.4S +trn1 v30.4S, v29.4S, v27.4S +trn2 v25.4S, v29.4S, v27.4S +trn2 v29.2D, v4.2D, v30.2D +trn2 v27.2D, v28.2D, v25.2D +trn1 v26.2D, v4.2D, v30.2D +trn1 v24.2D, v28.2D, v25.2D +sqrdmulh v25.4S, v29.4S, v15.4S +mul v29.4S, v29.4S,v22.4S +mla v29.4S, v25.4S, v31.s[0] +sub v25.4s, v26.4s, v29.4s +add v26.4s, v26.4s, 
v29.4s +sqrdmulh v29.4S, v27.4S, v15.4S +mul v27.4S, v27.4S,v22.4S +mla v27.4S, v29.4S, v31.s[0] +sub v29.4s, v24.4s, v27.4s +add v24.4s, v24.4s, v27.4s +sqrdmulh v27.4S, v24.4S, v10.4S +mul v24.4S, v24.4S,v21.4S +mla v24.4S, v27.4S, v31.s[0] +sub v27.4s, v26.4s, v24.4s +add v26.4s, v26.4s, v24.4s +sqrdmulh v24.4S, v29.4S, v12.4S +mul v29.4S, v29.4S,v18.4S +mla v29.4S, v24.4S, v31.s[0] +sub v24.4s, v25.4s, v29.4s +add v25.4s, v25.4s, v29.4s +str q26, [x0, #640] +str q27, [x0, #656] +str q25, [x0, #672] +str q24, [x0, #688] +ldr q24, [x17, #+1536] +ldr q25, [x17, #+1552] +ldr q27, [x17, #+1568] +ldr q26, [x17, #+1584] +ldr q29, [x17, #+1600] +ldr q28, [x17, #+1616] +ldr q30, [x17, #+1632] +ldr q4, [x17, #+1648] +ldr q12, [x0, #736] +ldr q18, [x0, #752] +ldr q10, [x0, #704] +ldr q21, [x0, #720] +sqrdmulh v15.4S, v12.4S, v25.s[0] +mul v12.4S, v12.4S,v24.s[0] +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v18.4S, v25.s[0] +mul v18.4S, v18.4S,v24.s[0] +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v18.4s +add v21.4s, v21.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v25.s[1] +mul v21.4S, v21.4S,v24.s[1] +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v12.4S, v25.s[2] +mul v12.4S, v12.4S,v24.s[2] +mla v12.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v12.4s +add v15.4s, v15.4s, v12.4s +trn1 v12.4S, v10.4S, v18.4S +trn2 v22.4S, v10.4S, v18.4S +trn1 v3.4S, v15.4S, v21.4S +trn2 v1.4S, v15.4S, v21.4S +trn2 v15.2D, v12.2D, v3.2D +trn2 v21.2D, v22.2D, v1.2D +trn1 v10.2D, v12.2D, v3.2D +trn1 v18.2D, v22.2D, v1.2D +sqrdmulh v1.4S, v15.4S, v26.4S +mul v15.4S, v15.4S,v27.4S +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v10.4s, v15.4s +add v10.4s, v10.4s, v15.4s +sqrdmulh v15.4S, v21.4S, v26.4S +mul v21.4S, v21.4S,v27.4S +mla v21.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v18.4S, v28.4S +mul v18.4S, v18.4S,v29.4S +mla v18.4S, v21.4S, 
v31.s[0] +sub v21.4s, v10.4s, v18.4s +add v10.4s, v10.4s, v18.4s +sqrdmulh v18.4S, v15.4S, v4.4S +mul v15.4S, v15.4S,v30.4S +mla v15.4S, v18.4S, v31.s[0] +sub v18.4s, v1.4s, v15.4s +add v1.4s, v1.4s, v15.4s +str q10, [x0, #704] +str q21, [x0, #720] +str q1, [x0, #736] +str q18, [x0, #752] +ldr q18, [x17, #+1664] +ldr q1, [x17, #+1680] +ldr q21, [x17, #+1696] +ldr q10, [x17, #+1712] +ldr q15, [x17, #+1728] +ldr q22, [x17, #+1744] +ldr q3, [x17, #+1760] +ldr q12, [x17, #+1776] +ldr q4, [x0, #800] +ldr q30, [x0, #816] +ldr q28, [x0, #768] +ldr q29, [x0, #784] +sqrdmulh v26.4S, v4.4S, v1.s[0] +mul v4.4S, v4.4S,v18.s[0] +mla v4.4S, v26.4S, v31.s[0] +sub v26.4s, v28.4s, v4.4s +add v28.4s, v28.4s, v4.4s +sqrdmulh v4.4S, v30.4S, v1.s[0] +mul v30.4S, v30.4S,v18.s[0] +mla v30.4S, v4.4S, v31.s[0] +sub v4.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +sqrdmulh v30.4S, v29.4S, v1.s[1] +mul v29.4S, v29.4S,v18.s[1] +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v28.4s, v29.4s +add v28.4s, v28.4s, v29.4s +sqrdmulh v29.4S, v4.4S, v1.s[2] +mul v4.4S, v4.4S,v18.s[2] +mla v4.4S, v29.4S, v31.s[0] +sub v29.4s, v26.4s, v4.4s +add v26.4s, v26.4s, v4.4s +trn1 v4.4S, v28.4S, v30.4S +trn2 v27.4S, v28.4S, v30.4S +trn1 v25.4S, v26.4S, v29.4S +trn2 v24.4S, v26.4S, v29.4S +trn2 v26.2D, v4.2D, v25.2D +trn2 v29.2D, v27.2D, v24.2D +trn1 v28.2D, v4.2D, v25.2D +trn1 v30.2D, v27.2D, v24.2D +sqrdmulh v24.4S, v26.4S, v10.4S +mul v26.4S, v26.4S,v21.4S +mla v26.4S, v24.4S, v31.s[0] +sub v24.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sqrdmulh v26.4S, v29.4S, v10.4S +mul v29.4S, v29.4S,v21.4S +mla v29.4S, v26.4S, v31.s[0] +sub v26.4s, v30.4s, v29.4s +add v30.4s, v30.4s, v29.4s +sqrdmulh v29.4S, v30.4S, v22.4S +mul v30.4S, v30.4S,v15.4S +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v28.4s, v30.4s +add v28.4s, v28.4s, v30.4s +sqrdmulh v30.4S, v26.4S, v12.4S +mul v26.4S, v26.4S,v3.4S +mla v26.4S, v30.4S, v31.s[0] +sub v30.4s, v24.4s, v26.4s +add v24.4s, v24.4s, v26.4s +str q28, [x0, #768] +str q29, [x0, #784] 
+str q24, [x0, #800] +str q30, [x0, #816] +ldr q30, [x17, #+1792] +ldr q24, [x17, #+1808] +ldr q29, [x17, #+1824] +ldr q28, [x17, #+1840] +ldr q26, [x17, #+1856] +ldr q27, [x17, #+1872] +ldr q25, [x17, #+1888] +ldr q4, [x17, #+1904] +ldr q12, [x0, #864] +ldr q3, [x0, #880] +ldr q22, [x0, #832] +ldr q15, [x0, #848] +sqrdmulh v10.4S, v12.4S, v24.s[0] +mul v12.4S, v12.4S,v30.s[0] +mla v12.4S, v10.4S, v31.s[0] +sub v10.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v24.s[0] +mul v3.4S, v3.4S,v30.s[0] +mla v3.4S, v12.4S, v31.s[0] +sub v12.4s, v15.4s, v3.4s +add v15.4s, v15.4s, v3.4s +sqrdmulh v3.4S, v15.4S, v24.s[1] +mul v15.4S, v15.4S,v30.s[1] +mla v15.4S, v3.4S, v31.s[0] +sub v3.4s, v22.4s, v15.4s +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v12.4S, v24.s[2] +mul v12.4S, v12.4S,v30.s[2] +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +trn1 v12.4S, v22.4S, v3.4S +trn2 v21.4S, v22.4S, v3.4S +trn1 v1.4S, v10.4S, v15.4S +trn2 v18.4S, v10.4S, v15.4S +trn2 v10.2D, v12.2D, v1.2D +trn2 v15.2D, v21.2D, v18.2D +trn1 v22.2D, v12.2D, v1.2D +trn1 v3.2D, v21.2D, v18.2D +sqrdmulh v18.4S, v10.4S, v28.4S +mul v10.4S, v10.4S,v29.4S +mla v10.4S, v18.4S, v31.s[0] +sub v18.4s, v22.4s, v10.4s +add v22.4s, v22.4s, v10.4s +sqrdmulh v10.4S, v15.4S, v28.4S +mul v15.4S, v15.4S,v29.4S +mla v15.4S, v10.4S, v31.s[0] +sub v10.4s, v3.4s, v15.4s +add v3.4s, v3.4s, v15.4s +sqrdmulh v15.4S, v3.4S, v27.4S +mul v3.4S, v3.4S,v26.4S +mla v3.4S, v15.4S, v31.s[0] +sub v15.4s, v22.4s, v3.4s +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v4.4S +mul v10.4S, v10.4S,v25.4S +mla v10.4S, v3.4S, v31.s[0] +sub v3.4s, v18.4s, v10.4s +add v18.4s, v18.4s, v10.4s +str q22, [x0, #832] +str q15, [x0, #848] +str q18, [x0, #864] +str q3, [x0, #880] +ldr q3, [x17, #+1920] +ldr q18, [x17, #+1936] +ldr q15, [x17, #+1952] +ldr q22, [x17, #+1968] +ldr q10, [x17, #+1984] +ldr q21, [x17, #+2000] +ldr q1, [x17, #+2016] +ldr q12, [x17, #+2032] +ldr q4, [x0, #928] 
+ldr q25, [x0, #944] +ldr q27, [x0, #896] +ldr q26, [x0, #912] +sqrdmulh v28.4S, v4.4S, v18.s[0] +mul v4.4S, v4.4S,v3.s[0] +mla v4.4S, v28.4S, v31.s[0] +sub v28.4s, v27.4s, v4.4s +add v27.4s, v27.4s, v4.4s +sqrdmulh v4.4S, v25.4S, v18.s[0] +mul v25.4S, v25.4S,v3.s[0] +mla v25.4S, v4.4S, v31.s[0] +sub v4.4s, v26.4s, v25.4s +add v26.4s, v26.4s, v25.4s +sqrdmulh v25.4S, v26.4S, v18.s[1] +mul v26.4S, v26.4S,v3.s[1] +mla v26.4S, v25.4S, v31.s[0] +sub v25.4s, v27.4s, v26.4s +add v27.4s, v27.4s, v26.4s +sqrdmulh v26.4S, v4.4S, v18.s[2] +mul v4.4S, v4.4S,v3.s[2] +mla v4.4S, v26.4S, v31.s[0] +sub v26.4s, v28.4s, v4.4s +add v28.4s, v28.4s, v4.4s +trn1 v4.4S, v27.4S, v25.4S +trn2 v29.4S, v27.4S, v25.4S +trn1 v24.4S, v28.4S, v26.4S +trn2 v30.4S, v28.4S, v26.4S +trn2 v28.2D, v4.2D, v24.2D +trn2 v26.2D, v29.2D, v30.2D +trn1 v27.2D, v4.2D, v24.2D +trn1 v25.2D, v29.2D, v30.2D +sqrdmulh v30.4S, v28.4S, v22.4S +mul v28.4S, v28.4S,v15.4S +mla v28.4S, v30.4S, v31.s[0] +sub v30.4s, v27.4s, v28.4s +add v27.4s, v27.4s, v28.4s +sqrdmulh v28.4S, v26.4S, v22.4S +mul v26.4S, v26.4S,v15.4S +mla v26.4S, v28.4S, v31.s[0] +sub v28.4s, v25.4s, v26.4s +add v25.4s, v25.4s, v26.4s +sqrdmulh v26.4S, v25.4S, v21.4S +mul v25.4S, v25.4S,v10.4S +mla v25.4S, v26.4S, v31.s[0] +sub v26.4s, v27.4s, v25.4s +add v27.4s, v27.4s, v25.4s +sqrdmulh v25.4S, v28.4S, v12.4S +mul v28.4S, v28.4S,v1.4S +mla v28.4S, v25.4S, v31.s[0] +sub v25.4s, v30.4s, v28.4s +add v30.4s, v30.4s, v28.4s +str q27, [x0, #896] +str q26, [x0, #912] +str q30, [x0, #928] +str q25, [x0, #944] +ldr q25, [x17, #+2048] +ldr q30, [x17, #+2064] +ldr q26, [x17, #+2080] +ldr q27, [x17, #+2096] +ldr q28, [x17, #+2112] +ldr q29, [x17, #+2128] +ldr q24, [x17, #+2144] +ldr q4, [x17, #+2160] +ldr q12, [x0, #992] +ldr q1, [x0, #1008] +ldr q21, [x0, #960] +ldr q10, [x0, #976] +sqrdmulh v22.4S, v12.4S, v30.s[0] +mul v12.4S, v12.4S,v25.s[0] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v1.4S, v30.s[0] 
+mul v1.4S, v1.4S,v25.s[0] +mla v1.4S, v12.4S, v31.s[0] +sub v12.4s, v10.4s, v1.4s +add v10.4s, v10.4s, v1.4s +sqrdmulh v1.4S, v10.4S, v30.s[1] +mul v10.4S, v10.4S,v25.s[1] +mla v10.4S, v1.4S, v31.s[0] +sub v1.4s, v21.4s, v10.4s +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v12.4S, v30.s[2] +mul v12.4S, v12.4S,v25.s[2] +mla v12.4S, v10.4S, v31.s[0] +sub v10.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +trn1 v12.4S, v21.4S, v1.4S +trn2 v15.4S, v21.4S, v1.4S +trn1 v18.4S, v22.4S, v10.4S +trn2 v3.4S, v22.4S, v10.4S +trn2 v22.2D, v12.2D, v18.2D +trn2 v10.2D, v15.2D, v3.2D +trn1 v21.2D, v12.2D, v18.2D +trn1 v1.2D, v15.2D, v3.2D +sqrdmulh v3.4S, v22.4S, v27.4S +mul v22.4S, v22.4S,v26.4S +mla v22.4S, v3.4S, v31.s[0] +sub v3.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v10.4S, v27.4S +mul v10.4S, v10.4S,v26.4S +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v10.4s +add v1.4s, v1.4s, v10.4s +sqrdmulh v10.4S, v1.4S, v29.4S +mul v1.4S, v1.4S,v28.4S +mla v1.4S, v10.4S, v31.s[0] +sub v10.4s, v21.4s, v1.4s +add v21.4s, v21.4s, v1.4s +sqrdmulh v1.4S, v22.4S, v4.4S +mul v22.4S, v22.4S,v24.4S +mla v22.4S, v1.4S, v31.s[0] +sub v1.4s, v3.4s, v22.4s +add v3.4s, v3.4s, v22.4s +str q21, [x0, #960] +str q10, [x0, #976] +str q3, [x0, #992] +str q1, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2456 +// Instruction count: 2452 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_11_0.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_11_0.s new file mode 100644 index 0000000..4ee80f8 --- /dev/null +++ 
b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_11_0.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 
1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 
6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 
+.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, 
block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 
31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 
26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 
+.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // 
Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 
28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 
608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_11_0 +.global _ntt_u32_full_neon_asm_var_4_4_11_0 +ntt_u32_full_neon_asm_var_4_4_11_0: +_ntt_u32_full_neon_asm_var_4_4_11_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x0, #928] +ldr q29, [x17, #+0] +ldr q28, [x17, #+16] +sqrdmulh v27.4S, v30.4S, v28.s[0] +mul v30.4S, v30.4S,v29.s[0] +ldr q26, [x0, #992] +sqrdmulh v25.4S, v26.4S, v28.s[0] +mul v26.4S, v26.4S,v29.s[0] +ldr q24, [x0, #800] +sqrdmulh v23.4S, v24.4S, v28.s[0] +mul v24.4S, v24.4S,v29.s[0] +ldr q22, [x0, #864] +sqrdmulh v21.4S, v22.4S, v28.s[0] +mul v22.4S, v22.4S,v29.s[0] +ldr q20, [x0, #544] +mla v30.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v20.4S, v28.s[0] +ldr q19, [x0, #608] +mla v26.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v19.4S, v28.s[0] +ldr q18, [x0, #672] +mla v24.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v18.4S, v28.s[0] +ldr q17, [x0, #736] +mla v22.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v17.4S, v28.s[0] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +mul v20.4S, v20.4S,v29.s[0] +sub v2.4s, v16.4s, v30.4s +mul v19.4S, v19.4S,v29.s[0] +add v16.4s, v16.4s, v30.4s +ldr q30, [x0, #288] +ldr q1, [x0, #352] +mla v20.4S, v27.4S, v31.s[0] +sub 
v27.4s, v3.4s, v26.4s +mla v19.4S, v25.4S, v31.s[0] +add v3.4s, v3.4s, v26.4s +ldr q26, [x0, #32] +ldr q25, [x0, #96] +mul v18.4S, v18.4S,v29.s[0] +sub v0.4s, v30.4s, v24.4s +mul v17.4S, v17.4S,v29.s[0] +add v30.4s, v30.4s, v24.4s +ldr q24, [x0, #160] +ldr q15, [x0, #224] +mla v18.4S, v23.4S, v31.s[0] +sub v23.4s, v1.4s, v22.4s +mla v17.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v16.4S, v28.s[1] +mul v16.4S, v16.4S,v29.s[1] +sqrdmulh v21.4S, v3.4S, v28.s[1] +sub v14.4s, v26.4s, v20.4s +mul v3.4S, v3.4S,v29.s[1] +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v30.4S, v28.s[1] +sub v13.4s, v25.4s, v19.4s +mul v30.4S, v30.4S,v29.s[1] +add v25.4s, v25.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v28.s[1] +sub v12.4s, v24.4s, v18.4s +mul v1.4S, v1.4S,v29.s[1] +add v24.4s, v24.4s, v18.4s +mla v16.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v17.4s +sqrdmulh v18.4S, v2.4S, v28.s[2] +add v15.4s, v15.4s, v17.4s +mla v3.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v27.4S, v28.s[2] +mla v30.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v0.4S, v28.s[2] +mla v1.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v23.4S, v28.s[2] +ldr q17, [x17, #+32] +ldr q11, [x17, #+48] +mul v2.4S, v2.4S,v29.s[2] +sub v10.4s, v24.4s, v16.4s +mul v27.4S, v27.4S,v29.s[2] +add v24.4s, v24.4s, v16.4s +mla v2.4S, v18.4S, v31.s[0] +sub v18.4s, v15.4s, v3.4s +mla v27.4S, v21.4S, v31.s[0] +add v15.4s, v15.4s, v3.4s +mul v0.4S, v0.4S,v29.s[2] +sub v3.4s, v26.4s, v30.4s +mul v23.4S, v23.4S,v29.s[2] +add v26.4s, v26.4s, v30.4s +mla v0.4S, v20.4S, v31.s[0] +sub v20.4s, v25.4s, v1.4s +mla v23.4S, v19.4S, v31.s[0] +add v25.4s, v25.4s, v1.4s +sqrdmulh v1.4S, v10.4S, v11.s[1] +mul v10.4S, v10.4S,v17.s[1] +sqrdmulh v19.4S, v18.4S, v11.s[1] +sub v30.4s, v12.4s, v2.4s +mul v18.4S, v18.4S,v17.s[1] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v24.4S, v11.s[0] +sub v21.4s, v22.4s, v27.4s +mul v24.4S, v24.4S,v17.s[0] +add v22.4s, v22.4s, v27.4s +sqrdmulh v27.4S, v15.4S, v11.s[0] +sub v16.4s, v14.4s, v0.4s +mul v15.4S, v15.4S,v17.s[0] 
+add v14.4s, v14.4s, v0.4s +ldr q0, [x17, #+64] +ldr q9, [x17, #+80] +mla v10.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v23.4s +sqrdmulh v8.4S, v12.4S, v11.s[2] +add v13.4s, v13.4s, v23.4s +mla v18.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v22.4S, v11.s[2] +mla v24.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v30.4S, v11.s[3] +mla v15.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v21.4S, v11.s[3] +ldr q23, [x17, #+96] +ldr q7, [x17, #+112] +mul v12.4S, v12.4S,v17.s[2] +sub v6.4s, v3.4s, v10.4s +mul v22.4S, v22.4S,v17.s[2] +add v3.4s, v3.4s, v10.4s +mla v12.4S, v8.4S, v31.s[0] +sub v8.4s, v20.4s, v18.4s +mla v22.4S, v19.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +mul v30.4S, v30.4S,v17.s[3] +sub v18.4s, v26.4s, v24.4s +mul v21.4S, v21.4S,v17.s[3] +add v26.4s, v26.4s, v24.4s +mla v30.4S, v2.4S, v31.s[0] +sub v2.4s, v25.4s, v15.4s +mla v21.4S, v27.4S, v31.s[0] +add v25.4s, v25.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v9.s[2] +mul v20.4S, v20.4S,v0.s[2] +sqrdmulh v27.4S, v8.4S, v9.s[3] +sub v24.4s, v14.4s, v12.4s +mul v8.4S, v8.4S,v0.s[3] +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v9.s[1] +sub v19.4s, v13.4s, v22.4s +mul v2.4S, v2.4S,v0.s[1] +add v13.4s, v13.4s, v22.4s +sqrdmulh v22.4S, v25.4S, v9.s[0] +sub v10.4s, v16.4s, v30.4s +mul v25.4S, v25.4S,v0.s[0] +add v16.4s, v16.4s, v30.4s +mla v20.4S, v15.4S, v31.s[0] +sub v15.4s, v1.4s, v21.4s +sqrdmulh v30.4S, v13.4S, v7.s[0] +add v1.4s, v1.4s, v21.4s +mla v8.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v19.4S, v7.s[1] +mla v2.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v1.4S, v7.s[2] +mla v25.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v15.4S, v7.s[3] +mul v13.4S, v13.4S,v23.s[0] +sub v21.4s, v3.4s, v20.4s +str q21, [x0, #352] +mul v19.4S, v19.4S,v23.s[1] +add v3.4s, v3.4s, v20.4s +str q3, [x0, #288] +mla v13.4S, v30.4S, v31.s[0] +sub v30.4s, v6.4s, v8.4s +str q30, [x0, #480] +mla v19.4S, v27.4S, v31.s[0] +add v6.4s, v6.4s, v8.4s +str q6, [x0, #416] +mul v1.4S, v1.4S,v23.s[2] +sub v6.4s, v18.4s, v2.4s +str q6, [x0, #224] +mul v15.4S, v15.4S,v23.s[3] 
+add v18.4s, v18.4s, v2.4s +str q18, [x0, #160] +mla v1.4S, v12.4S, v31.s[0] +sub v12.4s, v26.4s, v25.4s +str q12, [x0, #96] +mla v15.4S, v22.4S, v31.s[0] +add v26.4s, v26.4s, v25.4s +str q26, [x0, #32] +ldr q26, [x0, #944] +sqrdmulh v25.4S, v26.4S, v28.s[0] +mul v26.4S, v26.4S,v29.s[0] +ldr q22, [x0, #1008] +sqrdmulh v12.4S, v22.4S, v28.s[0] +sub v18.4s, v14.4s, v13.4s +str q18, [x0, #608] +mul v22.4S, v22.4S,v29.s[0] +add v14.4s, v14.4s, v13.4s +str q14, [x0, #544] +ldr q14, [x0, #816] +sqrdmulh v13.4S, v14.4S, v28.s[0] +sub v18.4s, v24.4s, v19.4s +str q18, [x0, #736] +mul v14.4S, v14.4S,v29.s[0] +add v24.4s, v24.4s, v19.4s +str q24, [x0, #672] +ldr q24, [x0, #880] +sqrdmulh v19.4S, v24.4S, v28.s[0] +sub v18.4s, v16.4s, v1.4s +str q18, [x0, #864] +mul v24.4S, v24.4S,v29.s[0] +add v16.4s, v16.4s, v1.4s +str q16, [x0, #800] +ldr q16, [x0, #560] +mla v26.4S, v25.4S, v31.s[0] +sub v25.4s, v10.4s, v15.4s +str q25, [x0, #992] +sqrdmulh v25.4S, v16.4S, v28.s[0] +add v10.4s, v10.4s, v15.4s +str q10, [x0, #928] +ldr q10, [x0, #624] +mla v22.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v10.4S, v28.s[0] +ldr q15, [x0, #688] +mla v14.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v15.4S, v28.s[0] +ldr q1, [x0, #752] +mla v24.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v1.4S, v28.s[0] +ldr q18, [x0, #432] +ldr q2, [x0, #496] +mul v16.4S, v16.4S,v29.s[0] +sub v6.4s, v18.4s, v26.4s +mul v10.4S, v10.4S,v29.s[0] +add v18.4s, v18.4s, v26.4s +ldr q26, [x0, #304] +ldr q8, [x0, #368] +mla v16.4S, v25.4S, v31.s[0] +sub v25.4s, v2.4s, v22.4s +mla v10.4S, v12.4S, v31.s[0] +add v2.4s, v2.4s, v22.4s +ldr q22, [x0, #48] +ldr q12, [x0, #112] +mul v15.4S, v15.4S,v29.s[0] +sub v27.4s, v26.4s, v14.4s +mul v1.4S, v1.4S,v29.s[0] +add v26.4s, v26.4s, v14.4s +ldr q14, [x0, #176] +ldr q30, [x0, #240] +mla v15.4S, v13.4S, v31.s[0] +sub v13.4s, v8.4s, v24.4s +mla v1.4S, v19.4S, v31.s[0] +add v8.4s, v8.4s, v24.4s +sqrdmulh v24.4S, v18.4S, v28.s[1] +mul v18.4S, v18.4S,v29.s[1] +sqrdmulh v19.4S, v2.4S, v28.s[1] +sub 
v3.4s, v22.4s, v16.4s +mul v2.4S, v2.4S,v29.s[1] +add v22.4s, v22.4s, v16.4s +sqrdmulh v16.4S, v26.4S, v28.s[1] +sub v20.4s, v12.4s, v10.4s +mul v26.4S, v26.4S,v29.s[1] +add v12.4s, v12.4s, v10.4s +sqrdmulh v10.4S, v8.4S, v28.s[1] +sub v21.4s, v14.4s, v15.4s +mul v8.4S, v8.4S,v29.s[1] +add v14.4s, v14.4s, v15.4s +mla v18.4S, v24.4S, v31.s[0] +sub v24.4s, v30.4s, v1.4s +sqrdmulh v15.4S, v6.4S, v28.s[2] +add v30.4s, v30.4s, v1.4s +mla v2.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v25.4S, v28.s[2] +mla v26.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v27.4S, v28.s[2] +mla v8.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v13.4S, v28.s[2] +mul v6.4S, v6.4S,v29.s[2] +sub v1.4s, v14.4s, v18.4s +mul v25.4S, v25.4S,v29.s[2] +add v14.4s, v14.4s, v18.4s +mla v6.4S, v15.4S, v31.s[0] +sub v15.4s, v30.4s, v2.4s +mla v25.4S, v19.4S, v31.s[0] +add v30.4s, v30.4s, v2.4s +mul v27.4S, v27.4S,v29.s[2] +sub v2.4s, v22.4s, v26.4s +mul v13.4S, v13.4S,v29.s[2] +add v22.4s, v22.4s, v26.4s +mla v27.4S, v16.4S, v31.s[0] +sub v16.4s, v12.4s, v8.4s +mla v13.4S, v10.4S, v31.s[0] +add v12.4s, v12.4s, v8.4s +sqrdmulh v8.4S, v1.4S, v11.s[1] +mul v1.4S, v1.4S,v17.s[1] +sqrdmulh v10.4S, v15.4S, v11.s[1] +sub v26.4s, v21.4s, v6.4s +mul v15.4S, v15.4S,v17.s[1] +add v21.4s, v21.4s, v6.4s +sqrdmulh v6.4S, v14.4S, v11.s[0] +sub v19.4s, v24.4s, v25.4s +mul v14.4S, v14.4S,v17.s[0] +add v24.4s, v24.4s, v25.4s +sqrdmulh v25.4S, v30.4S, v11.s[0] +sub v18.4s, v3.4s, v27.4s +mul v30.4S, v30.4S,v17.s[0] +add v3.4s, v3.4s, v27.4s +mla v1.4S, v8.4S, v31.s[0] +sub v8.4s, v20.4s, v13.4s +sqrdmulh v27.4S, v21.4S, v11.s[2] +add v20.4s, v20.4s, v13.4s +mla v15.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v24.4S, v11.s[2] +mla v14.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v26.4S, v11.s[3] +mla v30.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v19.4S, v11.s[3] +mul v21.4S, v21.4S,v17.s[2] +sub v13.4s, v2.4s, v1.4s +mul v24.4S, v24.4S,v17.s[2] +add v2.4s, v2.4s, v1.4s +mla v21.4S, v27.4S, v31.s[0] +sub v27.4s, v16.4s, v15.4s +mla v24.4S, v10.4S, v31.s[0] 
+add v16.4s, v16.4s, v15.4s +mul v26.4S, v26.4S,v17.s[3] +sub v15.4s, v22.4s, v14.4s +mul v19.4S, v19.4S,v17.s[3] +add v22.4s, v22.4s, v14.4s +mla v26.4S, v6.4S, v31.s[0] +sub v6.4s, v12.4s, v30.4s +mla v19.4S, v25.4S, v31.s[0] +add v12.4s, v12.4s, v30.4s +sqrdmulh v30.4S, v16.4S, v9.s[2] +mul v16.4S, v16.4S,v0.s[2] +sqrdmulh v25.4S, v27.4S, v9.s[3] +sub v14.4s, v3.4s, v21.4s +mul v27.4S, v27.4S,v0.s[3] +add v3.4s, v3.4s, v21.4s +sqrdmulh v21.4S, v6.4S, v9.s[1] +sub v10.4s, v20.4s, v24.4s +mul v6.4S, v6.4S,v0.s[1] +add v20.4s, v20.4s, v24.4s +sqrdmulh v24.4S, v12.4S, v9.s[0] +sub v1.4s, v18.4s, v26.4s +mul v12.4S, v12.4S,v0.s[0] +add v18.4s, v18.4s, v26.4s +mla v16.4S, v30.4S, v31.s[0] +sub v30.4s, v8.4s, v19.4s +sqrdmulh v26.4S, v20.4S, v7.s[0] +add v8.4s, v8.4s, v19.4s +mla v27.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v10.4S, v7.s[1] +mla v6.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v8.4S, v7.s[2] +mla v12.4S, v24.4S, v31.s[0] +sqrdmulh v24.4S, v30.4S, v7.s[3] +mul v20.4S, v20.4S,v23.s[0] +sub v19.4s, v2.4s, v16.4s +str q19, [x0, #368] +mul v10.4S, v10.4S,v23.s[1] +add v2.4s, v2.4s, v16.4s +str q2, [x0, #304] +mla v20.4S, v26.4S, v31.s[0] +sub v26.4s, v13.4s, v27.4s +str q26, [x0, #496] +mla v10.4S, v25.4S, v31.s[0] +add v13.4s, v13.4s, v27.4s +str q13, [x0, #432] +mul v8.4S, v8.4S,v23.s[2] +sub v13.4s, v15.4s, v6.4s +str q13, [x0, #240] +mul v30.4S, v30.4S,v23.s[3] +add v15.4s, v15.4s, v6.4s +str q15, [x0, #176] +mla v8.4S, v21.4S, v31.s[0] +sub v21.4s, v22.4s, v12.4s +str q21, [x0, #112] +mla v30.4S, v24.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +str q22, [x0, #48] +ldr q22, [x0, #896] +sqrdmulh v12.4S, v22.4S, v28.s[0] +mul v22.4S, v22.4S,v29.s[0] +ldr q24, [x0, #960] +sqrdmulh v21.4S, v24.4S, v28.s[0] +sub v15.4s, v3.4s, v20.4s +str q15, [x0, #624] +mul v24.4S, v24.4S,v29.s[0] +add v3.4s, v3.4s, v20.4s +str q3, [x0, #560] +ldr q3, [x0, #768] +sqrdmulh v20.4S, v3.4S, v28.s[0] +sub v15.4s, v14.4s, v10.4s +str q15, [x0, #752] +mul v3.4S, v3.4S,v29.s[0] +add v14.4s, 
v14.4s, v10.4s +str q14, [x0, #688] +ldr q14, [x0, #832] +sqrdmulh v10.4S, v14.4S, v28.s[0] +sub v15.4s, v18.4s, v8.4s +str q15, [x0, #880] +mul v14.4S, v14.4S,v29.s[0] +add v18.4s, v18.4s, v8.4s +str q18, [x0, #816] +ldr q18, [x0, #512] +mla v22.4S, v12.4S, v31.s[0] +sub v12.4s, v1.4s, v30.4s +str q12, [x0, #1008] +sqrdmulh v12.4S, v18.4S, v28.s[0] +add v1.4s, v1.4s, v30.4s +str q1, [x0, #944] +ldr q1, [x0, #576] +mla v24.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v1.4S, v28.s[0] +ldr q30, [x0, #640] +mla v3.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v30.4S, v28.s[0] +ldr q8, [x0, #704] +mla v14.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v8.4S, v28.s[0] +ldr q15, [x0, #384] +ldr q6, [x0, #448] +mul v18.4S, v18.4S,v29.s[0] +sub v13.4s, v15.4s, v22.4s +mul v1.4S, v1.4S,v29.s[0] +add v15.4s, v15.4s, v22.4s +ldr q22, [x0, #256] +ldr q27, [x0, #320] +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v6.4s, v24.4s +mla v1.4S, v21.4S, v31.s[0] +add v6.4s, v6.4s, v24.4s +ldr q24, [x0, #0] +ldr q21, [x0, #64] +mul v30.4S, v30.4S,v29.s[0] +sub v25.4s, v22.4s, v3.4s +mul v8.4S, v8.4S,v29.s[0] +add v22.4s, v22.4s, v3.4s +ldr q3, [x0, #128] +ldr q26, [x0, #192] +mla v30.4S, v20.4S, v31.s[0] +sub v20.4s, v27.4s, v14.4s +mla v8.4S, v10.4S, v31.s[0] +add v27.4s, v27.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v28.s[1] +mul v15.4S, v15.4S,v29.s[1] +sqrdmulh v10.4S, v6.4S, v28.s[1] +sub v2.4s, v24.4s, v18.4s +mul v6.4S, v6.4S,v29.s[1] +add v24.4s, v24.4s, v18.4s +sqrdmulh v18.4S, v22.4S, v28.s[1] +sub v16.4s, v21.4s, v1.4s +mul v22.4S, v22.4S,v29.s[1] +add v21.4s, v21.4s, v1.4s +sqrdmulh v1.4S, v27.4S, v28.s[1] +sub v19.4s, v3.4s, v30.4s +mul v27.4S, v27.4S,v29.s[1] +add v3.4s, v3.4s, v30.4s +mla v15.4S, v14.4S, v31.s[0] +sub v14.4s, v26.4s, v8.4s +sqrdmulh v30.4S, v13.4S, v28.s[2] +add v26.4s, v26.4s, v8.4s +mla v6.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v12.4S, v28.s[2] +mla v22.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v25.4S, v28.s[2] +mla v27.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v20.4S, v28.s[2] +mul 
v13.4S, v13.4S,v29.s[2] +sub v8.4s, v3.4s, v15.4s +mul v12.4S, v12.4S,v29.s[2] +add v3.4s, v3.4s, v15.4s +mla v13.4S, v30.4S, v31.s[0] +sub v30.4s, v26.4s, v6.4s +mla v12.4S, v10.4S, v31.s[0] +add v26.4s, v26.4s, v6.4s +mul v25.4S, v25.4S,v29.s[2] +sub v6.4s, v24.4s, v22.4s +mul v20.4S, v20.4S,v29.s[2] +add v24.4s, v24.4s, v22.4s +mla v25.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v27.4s +mla v20.4S, v1.4S, v31.s[0] +add v21.4s, v21.4s, v27.4s +sqrdmulh v27.4S, v8.4S, v11.s[1] +mul v8.4S, v8.4S,v17.s[1] +sqrdmulh v1.4S, v30.4S, v11.s[1] +sub v22.4s, v19.4s, v13.4s +mul v30.4S, v30.4S,v17.s[1] +add v19.4s, v19.4s, v13.4s +sqrdmulh v13.4S, v3.4S, v11.s[0] +sub v10.4s, v14.4s, v12.4s +mul v3.4S, v3.4S,v17.s[0] +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v26.4S, v11.s[0] +sub v15.4s, v2.4s, v25.4s +mul v26.4S, v26.4S,v17.s[0] +add v2.4s, v2.4s, v25.4s +mla v8.4S, v27.4S, v31.s[0] +sub v27.4s, v16.4s, v20.4s +sqrdmulh v25.4S, v19.4S, v11.s[2] +add v16.4s, v16.4s, v20.4s +mla v30.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v14.4S, v11.s[2] +mla v3.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v22.4S, v11.s[3] +mla v26.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v10.4S, v11.s[3] +mul v19.4S, v19.4S,v17.s[2] +sub v20.4s, v6.4s, v8.4s +mul v14.4S, v14.4S,v17.s[2] +add v6.4s, v6.4s, v8.4s +mla v19.4S, v25.4S, v31.s[0] +sub v25.4s, v18.4s, v30.4s +mla v14.4S, v1.4S, v31.s[0] +add v18.4s, v18.4s, v30.4s +mul v22.4S, v22.4S,v17.s[3] +sub v30.4s, v24.4s, v3.4s +mul v10.4S, v10.4S,v17.s[3] +add v24.4s, v24.4s, v3.4s +mla v22.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v26.4s +mla v10.4S, v12.4S, v31.s[0] +add v21.4s, v21.4s, v26.4s +sqrdmulh v26.4S, v18.4S, v9.s[2] +mul v18.4S, v18.4S,v0.s[2] +sqrdmulh v12.4S, v25.4S, v9.s[3] +sub v3.4s, v2.4s, v19.4s +mul v25.4S, v25.4S,v0.s[3] +add v2.4s, v2.4s, v19.4s +sqrdmulh v19.4S, v13.4S, v9.s[1] +sub v1.4s, v16.4s, v14.4s +mul v13.4S, v13.4S,v0.s[1] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v9.s[0] +sub v8.4s, v15.4s, v22.4s +mul v21.4S, 
v21.4S,v0.s[0] +add v15.4s, v15.4s, v22.4s +mla v18.4S, v26.4S, v31.s[0] +sub v26.4s, v27.4s, v10.4s +sqrdmulh v22.4S, v16.4S, v7.s[0] +add v27.4s, v27.4s, v10.4s +mla v25.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v1.4S, v7.s[1] +mla v13.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v27.4S, v7.s[2] +mla v21.4S, v14.4S, v31.s[0] +sqrdmulh v14.4S, v26.4S, v7.s[3] +mul v16.4S, v16.4S,v23.s[0] +sub v10.4s, v6.4s, v18.4s +str q10, [x0, #320] +mul v1.4S, v1.4S,v23.s[1] +add v6.4s, v6.4s, v18.4s +str q6, [x0, #256] +mla v16.4S, v22.4S, v31.s[0] +sub v22.4s, v20.4s, v25.4s +str q22, [x0, #448] +mla v1.4S, v12.4S, v31.s[0] +add v20.4s, v20.4s, v25.4s +str q20, [x0, #384] +mul v27.4S, v27.4S,v23.s[2] +sub v20.4s, v30.4s, v13.4s +str q20, [x0, #192] +mul v26.4S, v26.4S,v23.s[3] +add v30.4s, v30.4s, v13.4s +str q30, [x0, #128] +mla v27.4S, v19.4S, v31.s[0] +sub v19.4s, v24.4s, v21.4s +str q19, [x0, #64] +mla v26.4S, v14.4S, v31.s[0] +add v24.4s, v24.4s, v21.4s +str q24, [x0, #0] +ldr q24, [x0, #912] +sqrdmulh v21.4S, v24.4S, v28.s[0] +mul v24.4S, v24.4S,v29.s[0] +ldr q14, [x0, #976] +sqrdmulh v19.4S, v14.4S, v28.s[0] +sub v30.4s, v2.4s, v16.4s +str q30, [x0, #576] +mul v14.4S, v14.4S,v29.s[0] +add v2.4s, v2.4s, v16.4s +str q2, [x0, #512] +ldr q2, [x0, #784] +sqrdmulh v16.4S, v2.4S, v28.s[0] +sub v30.4s, v3.4s, v1.4s +str q30, [x0, #704] +mul v2.4S, v2.4S,v29.s[0] +add v3.4s, v3.4s, v1.4s +str q3, [x0, #640] +ldr q3, [x0, #848] +sqrdmulh v1.4S, v3.4S, v28.s[0] +sub v30.4s, v15.4s, v27.4s +str q30, [x0, #832] +mul v3.4S, v3.4S,v29.s[0] +add v15.4s, v15.4s, v27.4s +str q15, [x0, #768] +ldr q15, [x0, #528] +mla v24.4S, v21.4S, v31.s[0] +sub v21.4s, v8.4s, v26.4s +str q21, [x0, #960] +sqrdmulh v21.4S, v15.4S, v28.s[0] +add v8.4s, v8.4s, v26.4s +str q8, [x0, #896] +ldr q8, [x0, #592] +mla v14.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v8.4S, v28.s[0] +ldr q26, [x0, #656] +mla v2.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v26.4S, v28.s[0] +ldr q27, [x0, #720] +mla v3.4S, v1.4S, v31.s[0] +sqrdmulh 
v1.4S, v27.4S, v28.s[0] +ldr q30, [x0, #400] +ldr q13, [x0, #464] +mul v15.4S, v15.4S,v29.s[0] +sub v20.4s, v30.4s, v24.4s +mul v8.4S, v8.4S,v29.s[0] +add v30.4s, v30.4s, v24.4s +ldr q24, [x0, #272] +ldr q25, [x0, #336] +mla v15.4S, v21.4S, v31.s[0] +sub v21.4s, v13.4s, v14.4s +mla v8.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v14.4s +ldr q14, [x0, #16] +ldr q19, [x0, #80] +mul v26.4S, v26.4S,v29.s[0] +sub v12.4s, v24.4s, v2.4s +mul v27.4S, v27.4S,v29.s[0] +add v24.4s, v24.4s, v2.4s +ldr q2, [x0, #144] +ldr q22, [x0, #208] +mla v26.4S, v16.4S, v31.s[0] +sub v16.4s, v25.4s, v3.4s +mla v27.4S, v1.4S, v31.s[0] +add v25.4s, v25.4s, v3.4s +sqrdmulh v3.4S, v30.4S, v28.s[1] +mul v30.4S, v30.4S,v29.s[1] +sqrdmulh v1.4S, v13.4S, v28.s[1] +sub v6.4s, v14.4s, v15.4s +mul v13.4S, v13.4S,v29.s[1] +add v14.4s, v14.4s, v15.4s +sqrdmulh v15.4S, v24.4S, v28.s[1] +sub v18.4s, v19.4s, v8.4s +mul v24.4S, v24.4S,v29.s[1] +add v19.4s, v19.4s, v8.4s +sqrdmulh v8.4S, v25.4S, v28.s[1] +sub v10.4s, v2.4s, v26.4s +mul v25.4S, v25.4S,v29.s[1] +add v2.4s, v2.4s, v26.4s +mla v30.4S, v3.4S, v31.s[0] +sub v3.4s, v22.4s, v27.4s +sqrdmulh v26.4S, v20.4S, v28.s[2] +add v22.4s, v22.4s, v27.4s +mla v13.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v21.4S, v28.s[2] +mla v24.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v12.4S, v28.s[2] +mla v25.4S, v8.4S, v31.s[0] +sqrdmulh v8.4S, v16.4S, v28.s[2] +mul v20.4S, v20.4S,v29.s[2] +sub v27.4s, v2.4s, v30.4s +mul v21.4S, v21.4S,v29.s[2] +add v2.4s, v2.4s, v30.4s +mla v20.4S, v26.4S, v31.s[0] +sub v26.4s, v22.4s, v13.4s +mla v21.4S, v1.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +mul v12.4S, v12.4S,v29.s[2] +sub v13.4s, v14.4s, v24.4s +mul v16.4S, v16.4S,v29.s[2] +add v14.4s, v14.4s, v24.4s +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v19.4s, v25.4s +mla v16.4S, v8.4S, v31.s[0] +add v19.4s, v19.4s, v25.4s +sqrdmulh v28.4S, v27.4S, v11.s[1] +mul v27.4S, v27.4S,v17.s[1] +sqrdmulh v29.4S, v26.4S, v11.s[1] +sub v25.4s, v10.4s, v20.4s +mul v26.4S, v26.4S,v17.s[1] +add v10.4s, v10.4s, 
v20.4s +sqrdmulh v20.4S, v2.4S, v11.s[0] +sub v8.4s, v3.4s, v21.4s +mul v2.4S, v2.4S,v17.s[0] +add v3.4s, v3.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v11.s[0] +sub v24.4s, v6.4s, v12.4s +mul v22.4S, v22.4S,v17.s[0] +add v6.4s, v6.4s, v12.4s +mla v27.4S, v28.4S, v31.s[0] +sub v28.4s, v18.4s, v16.4s +sqrdmulh v12.4S, v10.4S, v11.s[2] +add v18.4s, v18.4s, v16.4s +mla v26.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v3.4S, v11.s[2] +mla v2.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v25.4S, v11.s[3] +mla v22.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v8.4S, v11.s[3] +mul v10.4S, v10.4S,v17.s[2] +sub v16.4s, v13.4s, v27.4s +mul v3.4S, v3.4S,v17.s[2] +add v13.4s, v13.4s, v27.4s +mla v10.4S, v12.4S, v31.s[0] +sub v12.4s, v15.4s, v26.4s +mla v3.4S, v29.4S, v31.s[0] +add v15.4s, v15.4s, v26.4s +mul v25.4S, v25.4S,v17.s[3] +sub v26.4s, v14.4s, v2.4s +mul v8.4S, v8.4S,v17.s[3] +add v14.4s, v14.4s, v2.4s +mla v25.4S, v20.4S, v31.s[0] +sub v20.4s, v19.4s, v22.4s +mla v8.4S, v21.4S, v31.s[0] +add v19.4s, v19.4s, v22.4s +sqrdmulh v11.4S, v15.4S, v9.s[2] +mul v15.4S, v15.4S,v0.s[2] +sqrdmulh v17.4S, v12.4S, v9.s[3] +sub v22.4s, v6.4s, v10.4s +mul v12.4S, v12.4S,v0.s[3] +add v6.4s, v6.4s, v10.4s +sqrdmulh v10.4S, v20.4S, v9.s[1] +sub v21.4s, v18.4s, v3.4s +mul v20.4S, v20.4S,v0.s[1] +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v19.4S, v9.s[0] +sub v2.4s, v24.4s, v25.4s +mul v19.4S, v19.4S,v0.s[0] +add v24.4s, v24.4s, v25.4s +mla v15.4S, v11.4S, v31.s[0] +sub v11.4s, v28.4s, v8.4s +sqrdmulh v25.4S, v18.4S, v7.s[0] +add v28.4s, v28.4s, v8.4s +mla v12.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v21.4S, v7.s[1] +mla v20.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v28.4S, v7.s[2] +mla v19.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v11.4S, v7.s[3] +mul v18.4S, v18.4S,v23.s[0] +sub v8.4s, v13.4s, v15.4s +str q8, [x0, #336] +mul v21.4S, v21.4S,v23.s[1] +add v13.4s, v13.4s, v15.4s +str q13, [x0, #272] +mla v18.4S, v25.4S, v31.s[0] +sub v25.4s, v16.4s, v12.4s +str q25, [x0, #464] +mla v21.4S, v17.4S, v31.s[0] +add v16.4s, 
v16.4s, v12.4s +str q16, [x0, #400] +mul v28.4S, v28.4S,v23.s[2] +sub v16.4s, v26.4s, v20.4s +str q16, [x0, #208] +mul v11.4S, v11.4S,v23.s[3] +add v26.4s, v26.4s, v20.4s +str q26, [x0, #144] +mla v28.4S, v10.4S, v31.s[0] +sub v10.4s, v14.4s, v19.4s +str q10, [x0, #80] +mla v11.4S, v3.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +str q14, [x0, #16] +sub v7.4s, v6.4s, v18.4s +str q7, [x0, #592] +add v6.4s, v6.4s, v18.4s +str q6, [x0, #528] +sub v6.4s, v22.4s, v21.4s +str q6, [x0, #720] +add v22.4s, v22.4s, v21.4s +str q22, [x0, #656] +sub v22.4s, v24.4s, v28.4s +str q22, [x0, #848] +add v24.4s, v24.4s, v28.4s +str q24, [x0, #784] +sub v24.4s, v2.4s, v11.4s +str q24, [x0, #976] +add v2.4s, v2.4s, v11.4s +str q2, [x0, #912] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q30, [x17, #+160] +ldr q1, [x17, #+176] +ldr q27, [x17, #+192] +ldr q29, [x17, #+208] +ldr q8, [x17, #+224] +ldr q15, [x17, #+240] +ldr q13, [x0, #32] +ldr q25, [x0, #48] +ldr q17, [x0, #0] +ldr q12, [x0, #16] +sqrdmulh v16.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v16.4S, v31.s[0] +sub v16.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +sqrdmulh v13.4S, v25.4S, v5.s[0] +mul v25.4S, v25.4S,v4.s[0] +mla v25.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v25.4s +add v12.4s, v12.4s, v25.4s +sqrdmulh v25.4S, v12.4S, v5.s[1] +mul v12.4S, v12.4S,v4.s[1] +mla v12.4S, v25.4S, v31.s[0] +sub v25.4s, v17.4s, v12.4s +add v17.4s, v17.4s, v12.4s +sqrdmulh v12.4S, v13.4S, v5.s[2] +mul v13.4S, v13.4S,v4.s[2] +mla v13.4S, v12.4S, v31.s[0] +sub v12.4s, v16.4s, v13.4s +add v16.4s, v16.4s, v13.4s +trn1 v13.4S, v17.4S, v25.4S +trn2 v20.4S, v17.4S, v25.4S +trn1 v26.4S, v16.4S, v12.4S +trn2 v10.4S, v16.4S, v12.4S +trn2 v16.2D, v13.2D, v26.2D +trn2 v12.2D, v20.2D, v10.2D +trn1 v17.2D, v13.2D, v26.2D +trn1 v25.2D, v20.2D, v10.2D +sqrdmulh v10.4S, v16.4S, v1.4S +mul v16.4S, v16.4S,v30.4S +mla v16.4S, v10.4S, v31.s[0] +sub v10.4s, v17.4s, v16.4s +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v1.4S +mul v12.4S, 
v12.4S,v30.4S +mla v12.4S, v16.4S, v31.s[0] +sub v16.4s, v25.4s, v12.4s +add v25.4s, v25.4s, v12.4s +sqrdmulh v12.4S, v25.4S, v29.4S +mul v25.4S, v25.4S,v27.4S +mla v25.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v25.4s +add v17.4s, v17.4s, v25.4s +sqrdmulh v25.4S, v16.4S, v15.4S +mul v16.4S, v16.4S,v8.4S +mla v16.4S, v25.4S, v31.s[0] +sub v25.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +str q17, [x0, #0] +str q12, [x0, #16] +str q10, [x0, #32] +str q25, [x0, #48] +ldr q25, [x17, #+256] +ldr q10, [x17, #+272] +ldr q12, [x17, #+288] +ldr q17, [x17, #+304] +ldr q16, [x17, #+320] +ldr q20, [x17, #+336] +ldr q26, [x17, #+352] +ldr q13, [x17, #+368] +ldr q15, [x0, #96] +ldr q8, [x0, #112] +ldr q29, [x0, #64] +ldr q27, [x0, #80] +sqrdmulh v1.4S, v15.4S, v10.s[0] +mul v15.4S, v15.4S,v25.s[0] +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v29.4s, v15.4s +add v29.4s, v29.4s, v15.4s +sqrdmulh v15.4S, v8.4S, v10.s[0] +mul v8.4S, v8.4S,v25.s[0] +mla v8.4S, v15.4S, v31.s[0] +sub v15.4s, v27.4s, v8.4s +add v27.4s, v27.4s, v8.4s +sqrdmulh v8.4S, v27.4S, v10.s[1] +mul v27.4S, v27.4S,v25.s[1] +mla v27.4S, v8.4S, v31.s[0] +sub v8.4s, v29.4s, v27.4s +add v29.4s, v29.4s, v27.4s +sqrdmulh v27.4S, v15.4S, v10.s[2] +mul v15.4S, v15.4S,v25.s[2] +mla v15.4S, v27.4S, v31.s[0] +sub v27.4s, v1.4s, v15.4s +add v1.4s, v1.4s, v15.4s +trn1 v15.4S, v29.4S, v8.4S +trn2 v30.4S, v29.4S, v8.4S +trn1 v5.4S, v1.4S, v27.4S +trn2 v4.4S, v1.4S, v27.4S +trn2 v1.2D, v15.2D, v5.2D +trn2 v27.2D, v30.2D, v4.2D +trn1 v29.2D, v15.2D, v5.2D +trn1 v8.2D, v30.2D, v4.2D +sqrdmulh v4.4S, v1.4S, v17.4S +mul v1.4S, v1.4S,v12.4S +mla v1.4S, v4.4S, v31.s[0] +sub v4.4s, v29.4s, v1.4s +add v29.4s, v29.4s, v1.4s +sqrdmulh v1.4S, v27.4S, v17.4S +mul v27.4S, v27.4S,v12.4S +mla v27.4S, v1.4S, v31.s[0] +sub v1.4s, v8.4s, v27.4s +add v8.4s, v8.4s, v27.4s +sqrdmulh v27.4S, v8.4S, v20.4S +mul v8.4S, v8.4S,v16.4S +mla v8.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v8.4s +add v29.4s, v29.4s, v8.4s +sqrdmulh v8.4S, v1.4S, v13.4S +mul v1.4S, 
v1.4S,v26.4S +mla v1.4S, v8.4S, v31.s[0] +sub v8.4s, v4.4s, v1.4s +add v4.4s, v4.4s, v1.4s +str q29, [x0, #64] +str q27, [x0, #80] +str q4, [x0, #96] +str q8, [x0, #112] +ldr q8, [x17, #+384] +ldr q4, [x17, #+400] +ldr q27, [x17, #+416] +ldr q29, [x17, #+432] +ldr q1, [x17, #+448] +ldr q30, [x17, #+464] +ldr q5, [x17, #+480] +ldr q15, [x17, #+496] +ldr q13, [x0, #160] +ldr q26, [x0, #176] +ldr q20, [x0, #128] +ldr q16, [x0, #144] +sqrdmulh v17.4S, v13.4S, v4.s[0] +mul v13.4S, v13.4S,v8.s[0] +mla v13.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v13.4s +add v20.4s, v20.4s, v13.4s +sqrdmulh v13.4S, v26.4S, v4.s[0] +mul v26.4S, v26.4S,v8.s[0] +mla v26.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v26.4s +add v16.4s, v16.4s, v26.4s +sqrdmulh v26.4S, v16.4S, v4.s[1] +mul v16.4S, v16.4S,v8.s[1] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v20.4s, v16.4s +add v20.4s, v20.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v4.s[2] +mul v13.4S, v13.4S,v8.s[2] +mla v13.4S, v16.4S, v31.s[0] +sub v16.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +trn1 v13.4S, v20.4S, v26.4S +trn2 v12.4S, v20.4S, v26.4S +trn1 v10.4S, v17.4S, v16.4S +trn2 v25.4S, v17.4S, v16.4S +trn2 v17.2D, v13.2D, v10.2D +trn2 v16.2D, v12.2D, v25.2D +trn1 v20.2D, v13.2D, v10.2D +trn1 v26.2D, v12.2D, v25.2D +sqrdmulh v25.4S, v17.4S, v29.4S +mul v17.4S, v17.4S,v27.4S +mla v17.4S, v25.4S, v31.s[0] +sub v25.4s, v20.4s, v17.4s +add v20.4s, v20.4s, v17.4s +sqrdmulh v17.4S, v16.4S, v29.4S +mul v16.4S, v16.4S,v27.4S +mla v16.4S, v17.4S, v31.s[0] +sub v17.4s, v26.4s, v16.4s +add v26.4s, v26.4s, v16.4s +sqrdmulh v16.4S, v26.4S, v30.4S +mul v26.4S, v26.4S,v1.4S +mla v26.4S, v16.4S, v31.s[0] +sub v16.4s, v20.4s, v26.4s +add v20.4s, v20.4s, v26.4s +sqrdmulh v26.4S, v17.4S, v15.4S +mul v17.4S, v17.4S,v5.4S +mla v17.4S, v26.4S, v31.s[0] +sub v26.4s, v25.4s, v17.4s +add v25.4s, v25.4s, v17.4s +str q20, [x0, #128] +str q16, [x0, #144] +str q25, [x0, #160] +str q26, [x0, #176] +ldr q26, [x17, #+512] +ldr q25, [x17, #+528] +ldr q16, [x17, #+544] +ldr 
q20, [x17, #+560] +ldr q17, [x17, #+576] +ldr q12, [x17, #+592] +ldr q10, [x17, #+608] +ldr q13, [x17, #+624] +ldr q15, [x0, #224] +ldr q5, [x0, #240] +ldr q30, [x0, #192] +ldr q1, [x0, #208] +sqrdmulh v29.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +mla v15.4S, v29.4S, v31.s[0] +sub v29.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +sqrdmulh v15.4S, v5.4S, v25.s[0] +mul v5.4S, v5.4S,v26.s[0] +mla v5.4S, v15.4S, v31.s[0] +sub v15.4s, v1.4s, v5.4s +add v1.4s, v1.4s, v5.4s +sqrdmulh v5.4S, v1.4S, v25.s[1] +mul v1.4S, v1.4S,v26.s[1] +mla v1.4S, v5.4S, v31.s[0] +sub v5.4s, v30.4s, v1.4s +add v30.4s, v30.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v25.s[2] +mul v15.4S, v15.4S,v26.s[2] +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v29.4s, v15.4s +add v29.4s, v29.4s, v15.4s +trn1 v15.4S, v30.4S, v5.4S +trn2 v27.4S, v30.4S, v5.4S +trn1 v4.4S, v29.4S, v1.4S +trn2 v8.4S, v29.4S, v1.4S +trn2 v29.2D, v15.2D, v4.2D +trn2 v1.2D, v27.2D, v8.2D +trn1 v30.2D, v15.2D, v4.2D +trn1 v5.2D, v27.2D, v8.2D +sqrdmulh v8.4S, v29.4S, v20.4S +mul v29.4S, v29.4S,v16.4S +mla v29.4S, v8.4S, v31.s[0] +sub v8.4s, v30.4s, v29.4s +add v30.4s, v30.4s, v29.4s +sqrdmulh v29.4S, v1.4S, v20.4S +mul v1.4S, v1.4S,v16.4S +mla v1.4S, v29.4S, v31.s[0] +sub v29.4s, v5.4s, v1.4s +add v5.4s, v5.4s, v1.4s +sqrdmulh v1.4S, v5.4S, v12.4S +mul v5.4S, v5.4S,v17.4S +mla v5.4S, v1.4S, v31.s[0] +sub v1.4s, v30.4s, v5.4s +add v30.4s, v30.4s, v5.4s +sqrdmulh v5.4S, v29.4S, v13.4S +mul v29.4S, v29.4S,v10.4S +mla v29.4S, v5.4S, v31.s[0] +sub v5.4s, v8.4s, v29.4s +add v8.4s, v8.4s, v29.4s +str q30, [x0, #192] +str q1, [x0, #208] +str q8, [x0, #224] +str q5, [x0, #240] +ldr q5, [x17, #+640] +ldr q8, [x17, #+656] +ldr q1, [x17, #+672] +ldr q30, [x17, #+688] +ldr q29, [x17, #+704] +ldr q27, [x17, #+720] +ldr q4, [x17, #+736] +ldr q15, [x17, #+752] +ldr q13, [x0, #288] +ldr q10, [x0, #304] +ldr q12, [x0, #256] +ldr q17, [x0, #272] +sqrdmulh v20.4S, v13.4S, v8.s[0] +mul v13.4S, v13.4S,v5.s[0] +mla v13.4S, v20.4S, v31.s[0] +sub v20.4s, 
v12.4s, v13.4s +add v12.4s, v12.4s, v13.4s +sqrdmulh v13.4S, v10.4S, v8.s[0] +mul v10.4S, v10.4S,v5.s[0] +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v10.4s +add v17.4s, v17.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v8.s[1] +mul v17.4S, v17.4S,v5.s[1] +mla v17.4S, v10.4S, v31.s[0] +sub v10.4s, v12.4s, v17.4s +add v12.4s, v12.4s, v17.4s +sqrdmulh v17.4S, v13.4S, v8.s[2] +mul v13.4S, v13.4S,v5.s[2] +mla v13.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v13.4s +add v20.4s, v20.4s, v13.4s +trn1 v13.4S, v12.4S, v10.4S +trn2 v16.4S, v12.4S, v10.4S +trn1 v25.4S, v20.4S, v17.4S +trn2 v26.4S, v20.4S, v17.4S +trn2 v20.2D, v13.2D, v25.2D +trn2 v17.2D, v16.2D, v26.2D +trn1 v12.2D, v13.2D, v25.2D +trn1 v10.2D, v16.2D, v26.2D +sqrdmulh v26.4S, v20.4S, v30.4S +mul v20.4S, v20.4S,v1.4S +mla v20.4S, v26.4S, v31.s[0] +sub v26.4s, v12.4s, v20.4s +add v12.4s, v12.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v30.4S +mul v17.4S, v17.4S,v1.4S +mla v17.4S, v20.4S, v31.s[0] +sub v20.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v10.4S, v27.4S +mul v10.4S, v10.4S,v29.4S +mla v10.4S, v17.4S, v31.s[0] +sub v17.4s, v12.4s, v10.4s +add v12.4s, v12.4s, v10.4s +sqrdmulh v10.4S, v20.4S, v15.4S +mul v20.4S, v20.4S,v4.4S +mla v20.4S, v10.4S, v31.s[0] +sub v10.4s, v26.4s, v20.4s +add v26.4s, v26.4s, v20.4s +str q12, [x0, #256] +str q17, [x0, #272] +str q26, [x0, #288] +str q10, [x0, #304] +ldr q10, [x17, #+768] +ldr q26, [x17, #+784] +ldr q17, [x17, #+800] +ldr q12, [x17, #+816] +ldr q20, [x17, #+832] +ldr q16, [x17, #+848] +ldr q25, [x17, #+864] +ldr q13, [x17, #+880] +ldr q15, [x0, #352] +ldr q4, [x0, #368] +ldr q27, [x0, #320] +ldr q29, [x0, #336] +sqrdmulh v30.4S, v15.4S, v26.s[0] +mul v15.4S, v15.4S,v10.s[0] +mla v15.4S, v30.4S, v31.s[0] +sub v30.4s, v27.4s, v15.4s +add v27.4s, v27.4s, v15.4s +sqrdmulh v15.4S, v4.4S, v26.s[0] +mul v4.4S, v4.4S,v10.s[0] +mla v4.4S, v15.4S, v31.s[0] +sub v15.4s, v29.4s, v4.4s +add v29.4s, v29.4s, v4.4s +sqrdmulh v4.4S, v29.4S, v26.s[1] +mul v29.4S, 
v29.4S,v10.s[1] +mla v29.4S, v4.4S, v31.s[0] +sub v4.4s, v27.4s, v29.4s +add v27.4s, v27.4s, v29.4s +sqrdmulh v29.4S, v15.4S, v26.s[2] +mul v15.4S, v15.4S,v10.s[2] +mla v15.4S, v29.4S, v31.s[0] +sub v29.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +trn1 v15.4S, v27.4S, v4.4S +trn2 v1.4S, v27.4S, v4.4S +trn1 v8.4S, v30.4S, v29.4S +trn2 v5.4S, v30.4S, v29.4S +trn2 v30.2D, v15.2D, v8.2D +trn2 v29.2D, v1.2D, v5.2D +trn1 v27.2D, v15.2D, v8.2D +trn1 v4.2D, v1.2D, v5.2D +sqrdmulh v5.4S, v30.4S, v12.4S +mul v30.4S, v30.4S,v17.4S +mla v30.4S, v5.4S, v31.s[0] +sub v5.4s, v27.4s, v30.4s +add v27.4s, v27.4s, v30.4s +sqrdmulh v30.4S, v29.4S, v12.4S +mul v29.4S, v29.4S,v17.4S +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v4.4s, v29.4s +add v4.4s, v4.4s, v29.4s +sqrdmulh v29.4S, v4.4S, v16.4S +mul v4.4S, v4.4S,v20.4S +mla v4.4S, v29.4S, v31.s[0] +sub v29.4s, v27.4s, v4.4s +add v27.4s, v27.4s, v4.4s +sqrdmulh v4.4S, v30.4S, v13.4S +mul v30.4S, v30.4S,v25.4S +mla v30.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v30.4s +add v5.4s, v5.4s, v30.4s +str q27, [x0, #320] +str q29, [x0, #336] +str q5, [x0, #352] +str q4, [x0, #368] +ldr q4, [x17, #+896] +ldr q5, [x17, #+912] +ldr q29, [x17, #+928] +ldr q27, [x17, #+944] +ldr q30, [x17, #+960] +ldr q1, [x17, #+976] +ldr q8, [x17, #+992] +ldr q15, [x17, #+1008] +ldr q13, [x0, #416] +ldr q25, [x0, #432] +ldr q16, [x0, #384] +ldr q20, [x0, #400] +sqrdmulh v12.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v12.4S, v31.s[0] +sub v12.4s, v16.4s, v13.4s +add v16.4s, v16.4s, v13.4s +sqrdmulh v13.4S, v25.4S, v5.s[0] +mul v25.4S, v25.4S,v4.s[0] +mla v25.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v25.4s +add v20.4s, v20.4s, v25.4s +sqrdmulh v25.4S, v20.4S, v5.s[1] +mul v20.4S, v20.4S,v4.s[1] +mla v20.4S, v25.4S, v31.s[0] +sub v25.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v13.4S, v5.s[2] +mul v13.4S, v13.4S,v4.s[2] +mla v13.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v13.4s +add v12.4s, v12.4s, v13.4s +trn1 v13.4S, v16.4S, 
v25.4S +trn2 v17.4S, v16.4S, v25.4S +trn1 v26.4S, v12.4S, v20.4S +trn2 v10.4S, v12.4S, v20.4S +trn2 v12.2D, v13.2D, v26.2D +trn2 v20.2D, v17.2D, v10.2D +trn1 v16.2D, v13.2D, v26.2D +trn1 v25.2D, v17.2D, v10.2D +sqrdmulh v10.4S, v12.4S, v27.4S +mul v12.4S, v12.4S,v29.4S +mla v12.4S, v10.4S, v31.s[0] +sub v10.4s, v16.4s, v12.4s +add v16.4s, v16.4s, v12.4s +sqrdmulh v12.4S, v20.4S, v27.4S +mul v20.4S, v20.4S,v29.4S +mla v20.4S, v12.4S, v31.s[0] +sub v12.4s, v25.4s, v20.4s +add v25.4s, v25.4s, v20.4s +sqrdmulh v20.4S, v25.4S, v1.4S +mul v25.4S, v25.4S,v30.4S +mla v25.4S, v20.4S, v31.s[0] +sub v20.4s, v16.4s, v25.4s +add v16.4s, v16.4s, v25.4s +sqrdmulh v25.4S, v12.4S, v15.4S +mul v12.4S, v12.4S,v8.4S +mla v12.4S, v25.4S, v31.s[0] +sub v25.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +str q16, [x0, #384] +str q20, [x0, #400] +str q10, [x0, #416] +str q25, [x0, #432] +ldr q25, [x17, #+1024] +ldr q10, [x17, #+1040] +ldr q20, [x17, #+1056] +ldr q16, [x17, #+1072] +ldr q12, [x17, #+1088] +ldr q17, [x17, #+1104] +ldr q26, [x17, #+1120] +ldr q13, [x17, #+1136] +ldr q15, [x0, #480] +ldr q8, [x0, #496] +ldr q1, [x0, #448] +ldr q30, [x0, #464] +sqrdmulh v27.4S, v15.4S, v10.s[0] +mul v15.4S, v15.4S,v25.s[0] +mla v15.4S, v27.4S, v31.s[0] +sub v27.4s, v1.4s, v15.4s +add v1.4s, v1.4s, v15.4s +sqrdmulh v15.4S, v8.4S, v10.s[0] +mul v8.4S, v8.4S,v25.s[0] +mla v8.4S, v15.4S, v31.s[0] +sub v15.4s, v30.4s, v8.4s +add v30.4s, v30.4s, v8.4s +sqrdmulh v8.4S, v30.4S, v10.s[1] +mul v30.4S, v30.4S,v25.s[1] +mla v30.4S, v8.4S, v31.s[0] +sub v8.4s, v1.4s, v30.4s +add v1.4s, v1.4s, v30.4s +sqrdmulh v30.4S, v15.4S, v10.s[2] +mul v15.4S, v15.4S,v25.s[2] +mla v15.4S, v30.4S, v31.s[0] +sub v30.4s, v27.4s, v15.4s +add v27.4s, v27.4s, v15.4s +trn1 v15.4S, v1.4S, v8.4S +trn2 v29.4S, v1.4S, v8.4S +trn1 v5.4S, v27.4S, v30.4S +trn2 v4.4S, v27.4S, v30.4S +trn2 v27.2D, v15.2D, v5.2D +trn2 v30.2D, v29.2D, v4.2D +trn1 v1.2D, v15.2D, v5.2D +trn1 v8.2D, v29.2D, v4.2D +sqrdmulh v4.4S, v27.4S, v16.4S +mul v27.4S, 
v27.4S,v20.4S +mla v27.4S, v4.4S, v31.s[0] +sub v4.4s, v1.4s, v27.4s +add v1.4s, v1.4s, v27.4s +sqrdmulh v27.4S, v30.4S, v16.4S +mul v30.4S, v30.4S,v20.4S +mla v30.4S, v27.4S, v31.s[0] +sub v27.4s, v8.4s, v30.4s +add v8.4s, v8.4s, v30.4s +sqrdmulh v30.4S, v8.4S, v17.4S +mul v8.4S, v8.4S,v12.4S +mla v8.4S, v30.4S, v31.s[0] +sub v30.4s, v1.4s, v8.4s +add v1.4s, v1.4s, v8.4s +sqrdmulh v8.4S, v27.4S, v13.4S +mul v27.4S, v27.4S,v26.4S +mla v27.4S, v8.4S, v31.s[0] +sub v8.4s, v4.4s, v27.4s +add v4.4s, v4.4s, v27.4s +str q1, [x0, #448] +str q30, [x0, #464] +str q4, [x0, #480] +str q8, [x0, #496] +ldr q8, [x17, #+1152] +ldr q4, [x17, #+1168] +ldr q30, [x17, #+1184] +ldr q1, [x17, #+1200] +ldr q27, [x17, #+1216] +ldr q29, [x17, #+1232] +ldr q5, [x17, #+1248] +ldr q15, [x17, #+1264] +ldr q13, [x0, #544] +ldr q26, [x0, #560] +ldr q17, [x0, #512] +ldr q12, [x0, #528] +sqrdmulh v16.4S, v13.4S, v4.s[0] +mul v13.4S, v13.4S,v8.s[0] +mla v13.4S, v16.4S, v31.s[0] +sub v16.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +sqrdmulh v13.4S, v26.4S, v4.s[0] +mul v26.4S, v26.4S,v8.s[0] +mla v26.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v26.4s +add v12.4s, v12.4s, v26.4s +sqrdmulh v26.4S, v12.4S, v4.s[1] +mul v12.4S, v12.4S,v8.s[1] +mla v12.4S, v26.4S, v31.s[0] +sub v26.4s, v17.4s, v12.4s +add v17.4s, v17.4s, v12.4s +sqrdmulh v12.4S, v13.4S, v4.s[2] +mul v13.4S, v13.4S,v8.s[2] +mla v13.4S, v12.4S, v31.s[0] +sub v12.4s, v16.4s, v13.4s +add v16.4s, v16.4s, v13.4s +trn1 v13.4S, v17.4S, v26.4S +trn2 v20.4S, v17.4S, v26.4S +trn1 v10.4S, v16.4S, v12.4S +trn2 v25.4S, v16.4S, v12.4S +trn2 v16.2D, v13.2D, v10.2D +trn2 v12.2D, v20.2D, v25.2D +trn1 v17.2D, v13.2D, v10.2D +trn1 v26.2D, v20.2D, v25.2D +sqrdmulh v25.4S, v16.4S, v1.4S +mul v16.4S, v16.4S,v30.4S +mla v16.4S, v25.4S, v31.s[0] +sub v25.4s, v17.4s, v16.4s +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v1.4S +mul v12.4S, v12.4S,v30.4S +mla v12.4S, v16.4S, v31.s[0] +sub v16.4s, v26.4s, v12.4s +add v26.4s, v26.4s, v12.4s +sqrdmulh v12.4S, 
v26.4S, v29.4S +mul v26.4S, v26.4S,v27.4S +mla v26.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v26.4s +add v17.4s, v17.4s, v26.4s +sqrdmulh v26.4S, v16.4S, v15.4S +mul v16.4S, v16.4S,v5.4S +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v25.4s, v16.4s +add v25.4s, v25.4s, v16.4s +str q17, [x0, #512] +str q12, [x0, #528] +str q25, [x0, #544] +str q26, [x0, #560] +ldr q26, [x17, #+1280] +ldr q25, [x17, #+1296] +ldr q12, [x17, #+1312] +ldr q17, [x17, #+1328] +ldr q16, [x17, #+1344] +ldr q20, [x17, #+1360] +ldr q10, [x17, #+1376] +ldr q13, [x17, #+1392] +ldr q15, [x0, #608] +ldr q5, [x0, #624] +ldr q29, [x0, #576] +ldr q27, [x0, #592] +sqrdmulh v1.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v29.4s, v15.4s +add v29.4s, v29.4s, v15.4s +sqrdmulh v15.4S, v5.4S, v25.s[0] +mul v5.4S, v5.4S,v26.s[0] +mla v5.4S, v15.4S, v31.s[0] +sub v15.4s, v27.4s, v5.4s +add v27.4s, v27.4s, v5.4s +sqrdmulh v5.4S, v27.4S, v25.s[1] +mul v27.4S, v27.4S,v26.s[1] +mla v27.4S, v5.4S, v31.s[0] +sub v5.4s, v29.4s, v27.4s +add v29.4s, v29.4s, v27.4s +sqrdmulh v27.4S, v15.4S, v25.s[2] +mul v15.4S, v15.4S,v26.s[2] +mla v15.4S, v27.4S, v31.s[0] +sub v27.4s, v1.4s, v15.4s +add v1.4s, v1.4s, v15.4s +trn1 v15.4S, v29.4S, v5.4S +trn2 v30.4S, v29.4S, v5.4S +trn1 v4.4S, v1.4S, v27.4S +trn2 v8.4S, v1.4S, v27.4S +trn2 v1.2D, v15.2D, v4.2D +trn2 v27.2D, v30.2D, v8.2D +trn1 v29.2D, v15.2D, v4.2D +trn1 v5.2D, v30.2D, v8.2D +sqrdmulh v8.4S, v1.4S, v17.4S +mul v1.4S, v1.4S,v12.4S +mla v1.4S, v8.4S, v31.s[0] +sub v8.4s, v29.4s, v1.4s +add v29.4s, v29.4s, v1.4s +sqrdmulh v1.4S, v27.4S, v17.4S +mul v27.4S, v27.4S,v12.4S +mla v27.4S, v1.4S, v31.s[0] +sub v1.4s, v5.4s, v27.4s +add v5.4s, v5.4s, v27.4s +sqrdmulh v27.4S, v5.4S, v20.4S +mul v5.4S, v5.4S,v16.4S +mla v5.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v5.4s +add v29.4s, v29.4s, v5.4s +sqrdmulh v5.4S, v1.4S, v13.4S +mul v1.4S, v1.4S,v10.4S +mla v1.4S, v5.4S, v31.s[0] +sub v5.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +str q29, 
[x0, #576] +str q27, [x0, #592] +str q8, [x0, #608] +str q5, [x0, #624] +ldr q5, [x17, #+1408] +ldr q8, [x17, #+1424] +ldr q27, [x17, #+1440] +ldr q29, [x17, #+1456] +ldr q1, [x17, #+1472] +ldr q30, [x17, #+1488] +ldr q4, [x17, #+1504] +ldr q15, [x17, #+1520] +ldr q13, [x0, #672] +ldr q10, [x0, #688] +ldr q20, [x0, #640] +ldr q16, [x0, #656] +sqrdmulh v17.4S, v13.4S, v8.s[0] +mul v13.4S, v13.4S,v5.s[0] +mla v13.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v13.4s +add v20.4s, v20.4s, v13.4s +sqrdmulh v13.4S, v10.4S, v8.s[0] +mul v10.4S, v10.4S,v5.s[0] +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v10.4s +add v16.4s, v16.4s, v10.4s +sqrdmulh v10.4S, v16.4S, v8.s[1] +mul v16.4S, v16.4S,v5.s[1] +mla v16.4S, v10.4S, v31.s[0] +sub v10.4s, v20.4s, v16.4s +add v20.4s, v20.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v8.s[2] +mul v13.4S, v13.4S,v5.s[2] +mla v13.4S, v16.4S, v31.s[0] +sub v16.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +trn1 v13.4S, v20.4S, v10.4S +trn2 v12.4S, v20.4S, v10.4S +trn1 v25.4S, v17.4S, v16.4S +trn2 v26.4S, v17.4S, v16.4S +trn2 v17.2D, v13.2D, v25.2D +trn2 v16.2D, v12.2D, v26.2D +trn1 v20.2D, v13.2D, v25.2D +trn1 v10.2D, v12.2D, v26.2D +sqrdmulh v26.4S, v17.4S, v29.4S +mul v17.4S, v17.4S,v27.4S +mla v17.4S, v26.4S, v31.s[0] +sub v26.4s, v20.4s, v17.4s +add v20.4s, v20.4s, v17.4s +sqrdmulh v17.4S, v16.4S, v29.4S +mul v16.4S, v16.4S,v27.4S +mla v16.4S, v17.4S, v31.s[0] +sub v17.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sqrdmulh v16.4S, v10.4S, v30.4S +mul v10.4S, v10.4S,v1.4S +mla v10.4S, v16.4S, v31.s[0] +sub v16.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v15.4S +mul v17.4S, v17.4S,v4.4S +mla v17.4S, v10.4S, v31.s[0] +sub v10.4s, v26.4s, v17.4s +add v26.4s, v26.4s, v17.4s +str q20, [x0, #640] +str q16, [x0, #656] +str q26, [x0, #672] +str q10, [x0, #688] +ldr q10, [x17, #+1536] +ldr q26, [x17, #+1552] +ldr q16, [x17, #+1568] +ldr q20, [x17, #+1584] +ldr q17, [x17, #+1600] +ldr q12, [x17, #+1616] +ldr q25, [x17, 
#+1632] +ldr q13, [x17, #+1648] +ldr q15, [x0, #736] +ldr q4, [x0, #752] +ldr q30, [x0, #704] +ldr q1, [x0, #720] +sqrdmulh v29.4S, v15.4S, v26.s[0] +mul v15.4S, v15.4S,v10.s[0] +mla v15.4S, v29.4S, v31.s[0] +sub v29.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +sqrdmulh v15.4S, v4.4S, v26.s[0] +mul v4.4S, v4.4S,v10.s[0] +mla v4.4S, v15.4S, v31.s[0] +sub v15.4s, v1.4s, v4.4s +add v1.4s, v1.4s, v4.4s +sqrdmulh v4.4S, v1.4S, v26.s[1] +mul v1.4S, v1.4S,v10.s[1] +mla v1.4S, v4.4S, v31.s[0] +sub v4.4s, v30.4s, v1.4s +add v30.4s, v30.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v26.s[2] +mul v15.4S, v15.4S,v10.s[2] +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v29.4s, v15.4s +add v29.4s, v29.4s, v15.4s +trn1 v15.4S, v30.4S, v4.4S +trn2 v27.4S, v30.4S, v4.4S +trn1 v8.4S, v29.4S, v1.4S +trn2 v5.4S, v29.4S, v1.4S +trn2 v29.2D, v15.2D, v8.2D +trn2 v1.2D, v27.2D, v5.2D +trn1 v30.2D, v15.2D, v8.2D +trn1 v4.2D, v27.2D, v5.2D +sqrdmulh v5.4S, v29.4S, v20.4S +mul v29.4S, v29.4S,v16.4S +mla v29.4S, v5.4S, v31.s[0] +sub v5.4s, v30.4s, v29.4s +add v30.4s, v30.4s, v29.4s +sqrdmulh v29.4S, v1.4S, v20.4S +mul v1.4S, v1.4S,v16.4S +mla v1.4S, v29.4S, v31.s[0] +sub v29.4s, v4.4s, v1.4s +add v4.4s, v4.4s, v1.4s +sqrdmulh v1.4S, v4.4S, v12.4S +mul v4.4S, v4.4S,v17.4S +mla v4.4S, v1.4S, v31.s[0] +sub v1.4s, v30.4s, v4.4s +add v30.4s, v30.4s, v4.4s +sqrdmulh v4.4S, v29.4S, v13.4S +mul v29.4S, v29.4S,v25.4S +mla v29.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v29.4s +add v5.4s, v5.4s, v29.4s +str q30, [x0, #704] +str q1, [x0, #720] +str q5, [x0, #736] +str q4, [x0, #752] +ldr q4, [x17, #+1664] +ldr q5, [x17, #+1680] +ldr q1, [x17, #+1696] +ldr q30, [x17, #+1712] +ldr q29, [x17, #+1728] +ldr q27, [x17, #+1744] +ldr q8, [x17, #+1760] +ldr q15, [x17, #+1776] +ldr q13, [x0, #800] +ldr q25, [x0, #816] +ldr q12, [x0, #768] +ldr q17, [x0, #784] +sqrdmulh v20.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v13.4s +add v12.4s, v12.4s, v13.4s +sqrdmulh v13.4S, v25.4S, 
v5.s[0] +mul v25.4S, v25.4S,v4.s[0] +mla v25.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v25.4s +add v17.4s, v17.4s, v25.4s +sqrdmulh v25.4S, v17.4S, v5.s[1] +mul v17.4S, v17.4S,v4.s[1] +mla v17.4S, v25.4S, v31.s[0] +sub v25.4s, v12.4s, v17.4s +add v12.4s, v12.4s, v17.4s +sqrdmulh v17.4S, v13.4S, v5.s[2] +mul v13.4S, v13.4S,v4.s[2] +mla v13.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v13.4s +add v20.4s, v20.4s, v13.4s +trn1 v13.4S, v12.4S, v25.4S +trn2 v16.4S, v12.4S, v25.4S +trn1 v26.4S, v20.4S, v17.4S +trn2 v10.4S, v20.4S, v17.4S +trn2 v20.2D, v13.2D, v26.2D +trn2 v17.2D, v16.2D, v10.2D +trn1 v12.2D, v13.2D, v26.2D +trn1 v25.2D, v16.2D, v10.2D +sqrdmulh v10.4S, v20.4S, v30.4S +mul v20.4S, v20.4S,v1.4S +mla v20.4S, v10.4S, v31.s[0] +sub v10.4s, v12.4s, v20.4s +add v12.4s, v12.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v30.4S +mul v17.4S, v17.4S,v1.4S +mla v17.4S, v20.4S, v31.s[0] +sub v20.4s, v25.4s, v17.4s +add v25.4s, v25.4s, v17.4s +sqrdmulh v17.4S, v25.4S, v27.4S +mul v25.4S, v25.4S,v29.4S +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v12.4s, v25.4s +add v12.4s, v12.4s, v25.4s +sqrdmulh v25.4S, v20.4S, v15.4S +mul v20.4S, v20.4S,v8.4S +mla v20.4S, v25.4S, v31.s[0] +sub v25.4s, v10.4s, v20.4s +add v10.4s, v10.4s, v20.4s +str q12, [x0, #768] +str q17, [x0, #784] +str q10, [x0, #800] +str q25, [x0, #816] +ldr q25, [x17, #+1792] +ldr q10, [x17, #+1808] +ldr q17, [x17, #+1824] +ldr q12, [x17, #+1840] +ldr q20, [x17, #+1856] +ldr q16, [x17, #+1872] +ldr q26, [x17, #+1888] +ldr q13, [x17, #+1904] +ldr q15, [x0, #864] +ldr q8, [x0, #880] +ldr q27, [x0, #832] +ldr q29, [x0, #848] +sqrdmulh v30.4S, v15.4S, v10.s[0] +mul v15.4S, v15.4S,v25.s[0] +mla v15.4S, v30.4S, v31.s[0] +sub v30.4s, v27.4s, v15.4s +add v27.4s, v27.4s, v15.4s +sqrdmulh v15.4S, v8.4S, v10.s[0] +mul v8.4S, v8.4S,v25.s[0] +mla v8.4S, v15.4S, v31.s[0] +sub v15.4s, v29.4s, v8.4s +add v29.4s, v29.4s, v8.4s +sqrdmulh v8.4S, v29.4S, v10.s[1] +mul v29.4S, v29.4S,v25.s[1] +mla v29.4S, v8.4S, v31.s[0] +sub v8.4s, v27.4s, 
v29.4s +add v27.4s, v27.4s, v29.4s +sqrdmulh v29.4S, v15.4S, v10.s[2] +mul v15.4S, v15.4S,v25.s[2] +mla v15.4S, v29.4S, v31.s[0] +sub v29.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +trn1 v15.4S, v27.4S, v8.4S +trn2 v1.4S, v27.4S, v8.4S +trn1 v5.4S, v30.4S, v29.4S +trn2 v4.4S, v30.4S, v29.4S +trn2 v30.2D, v15.2D, v5.2D +trn2 v29.2D, v1.2D, v4.2D +trn1 v27.2D, v15.2D, v5.2D +trn1 v8.2D, v1.2D, v4.2D +sqrdmulh v4.4S, v30.4S, v12.4S +mul v30.4S, v30.4S,v17.4S +mla v30.4S, v4.4S, v31.s[0] +sub v4.4s, v27.4s, v30.4s +add v27.4s, v27.4s, v30.4s +sqrdmulh v30.4S, v29.4S, v12.4S +mul v29.4S, v29.4S,v17.4S +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v8.4s, v29.4s +add v8.4s, v8.4s, v29.4s +sqrdmulh v29.4S, v8.4S, v16.4S +mul v8.4S, v8.4S,v20.4S +mla v8.4S, v29.4S, v31.s[0] +sub v29.4s, v27.4s, v8.4s +add v27.4s, v27.4s, v8.4s +sqrdmulh v8.4S, v30.4S, v13.4S +mul v30.4S, v30.4S,v26.4S +mla v30.4S, v8.4S, v31.s[0] +sub v8.4s, v4.4s, v30.4s +add v4.4s, v4.4s, v30.4s +str q27, [x0, #832] +str q29, [x0, #848] +str q4, [x0, #864] +str q8, [x0, #880] +ldr q8, [x17, #+1920] +ldr q4, [x17, #+1936] +ldr q29, [x17, #+1952] +ldr q27, [x17, #+1968] +ldr q30, [x17, #+1984] +ldr q1, [x17, #+2000] +ldr q5, [x17, #+2016] +ldr q15, [x17, #+2032] +ldr q13, [x0, #928] +ldr q26, [x0, #944] +ldr q16, [x0, #896] +ldr q20, [x0, #912] +sqrdmulh v12.4S, v13.4S, v4.s[0] +mul v13.4S, v13.4S,v8.s[0] +mla v13.4S, v12.4S, v31.s[0] +sub v12.4s, v16.4s, v13.4s +add v16.4s, v16.4s, v13.4s +sqrdmulh v13.4S, v26.4S, v4.s[0] +mul v26.4S, v26.4S,v8.s[0] +mla v26.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v26.4s +add v20.4s, v20.4s, v26.4s +sqrdmulh v26.4S, v20.4S, v4.s[1] +mul v20.4S, v20.4S,v8.s[1] +mla v20.4S, v26.4S, v31.s[0] +sub v26.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v13.4S, v4.s[2] +mul v13.4S, v13.4S,v8.s[2] +mla v13.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v13.4s +add v12.4s, v12.4s, v13.4s +trn1 v13.4S, v16.4S, v26.4S +trn2 v17.4S, v16.4S, v26.4S +trn1 v10.4S, v12.4S, 
v20.4S +trn2 v25.4S, v12.4S, v20.4S +trn2 v12.2D, v13.2D, v10.2D +trn2 v20.2D, v17.2D, v25.2D +trn1 v16.2D, v13.2D, v10.2D +trn1 v26.2D, v17.2D, v25.2D +sqrdmulh v25.4S, v12.4S, v27.4S +mul v12.4S, v12.4S,v29.4S +mla v12.4S, v25.4S, v31.s[0] +sub v25.4s, v16.4s, v12.4s +add v16.4s, v16.4s, v12.4s +sqrdmulh v12.4S, v20.4S, v27.4S +mul v20.4S, v20.4S,v29.4S +mla v20.4S, v12.4S, v31.s[0] +sub v12.4s, v26.4s, v20.4s +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v26.4S, v1.4S +mul v26.4S, v26.4S,v30.4S +mla v26.4S, v20.4S, v31.s[0] +sub v20.4s, v16.4s, v26.4s +add v16.4s, v16.4s, v26.4s +sqrdmulh v26.4S, v12.4S, v15.4S +mul v12.4S, v12.4S,v5.4S +mla v12.4S, v26.4S, v31.s[0] +sub v26.4s, v25.4s, v12.4s +add v25.4s, v25.4s, v12.4s +str q16, [x0, #896] +str q20, [x0, #912] +str q25, [x0, #928] +str q26, [x0, #944] +ldr q26, [x17, #+2048] +ldr q25, [x17, #+2064] +ldr q20, [x17, #+2080] +ldr q16, [x17, #+2096] +ldr q12, [x17, #+2112] +ldr q17, [x17, #+2128] +ldr q10, [x17, #+2144] +ldr q13, [x17, #+2160] +ldr q15, [x0, #992] +ldr q5, [x0, #1008] +ldr q1, [x0, #960] +ldr q30, [x0, #976] +sqrdmulh v27.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +mla v15.4S, v27.4S, v31.s[0] +sub v27.4s, v1.4s, v15.4s +add v1.4s, v1.4s, v15.4s +sqrdmulh v15.4S, v5.4S, v25.s[0] +mul v5.4S, v5.4S,v26.s[0] +mla v5.4S, v15.4S, v31.s[0] +sub v15.4s, v30.4s, v5.4s +add v30.4s, v30.4s, v5.4s +sqrdmulh v5.4S, v30.4S, v25.s[1] +mul v30.4S, v30.4S,v26.s[1] +mla v30.4S, v5.4S, v31.s[0] +sub v5.4s, v1.4s, v30.4s +add v1.4s, v1.4s, v30.4s +sqrdmulh v30.4S, v15.4S, v25.s[2] +mul v15.4S, v15.4S,v26.s[2] +mla v15.4S, v30.4S, v31.s[0] +sub v30.4s, v27.4s, v15.4s +add v27.4s, v27.4s, v15.4s +trn1 v15.4S, v1.4S, v5.4S +trn2 v29.4S, v1.4S, v5.4S +trn1 v4.4S, v27.4S, v30.4S +trn2 v8.4S, v27.4S, v30.4S +trn2 v27.2D, v15.2D, v4.2D +trn2 v30.2D, v29.2D, v8.2D +trn1 v1.2D, v15.2D, v4.2D +trn1 v5.2D, v29.2D, v8.2D +sqrdmulh v8.4S, v27.4S, v16.4S +mul v27.4S, v27.4S,v20.4S +mla v27.4S, v8.4S, v31.s[0] +sub v8.4s, 
v1.4s, v27.4s +add v1.4s, v1.4s, v27.4s +sqrdmulh v27.4S, v30.4S, v16.4S +mul v30.4S, v30.4S,v20.4S +mla v30.4S, v27.4S, v31.s[0] +sub v27.4s, v5.4s, v30.4s +add v5.4s, v5.4s, v30.4s +sqrdmulh v30.4S, v5.4S, v17.4S +mul v5.4S, v5.4S,v12.4S +mla v5.4S, v30.4S, v31.s[0] +sub v30.4s, v1.4s, v5.4s +add v1.4s, v1.4s, v5.4s +sqrdmulh v5.4S, v27.4S, v13.4S +mul v27.4S, v27.4S,v10.4S +mla v27.4S, v5.4S, v31.s[0] +sub v5.4s, v8.4s, v27.4s +add v8.4s, v8.4s, v27.4s +str q1, [x0, #960] +str q30, [x0, #976] +str q8, [x0, #992] +str q5, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_12_0.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_12_0.s new file mode 100644 index 0000000..01a8251 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_12_0.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. 
+/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None 
+.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 
7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 
// Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 
+.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // 
Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, 
block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 
839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 +.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, 
block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 
104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 
// Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_12_0 +.global _ntt_u32_full_neon_asm_var_4_4_12_0 +ntt_u32_full_neon_asm_var_4_4_12_0: +_ntt_u32_full_neon_asm_var_4_4_12_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x0, #928] +ldr q29, [x17, #+0] +ldr q28, [x17, #+16] +sqrdmulh v27.4S, v30.4S, v28.s[0] +mul v30.4S, v30.4S,v29.s[0] +ldr q26, [x0, #992] +sqrdmulh v25.4S, v26.4S, v28.s[0] +mul v26.4S, v26.4S,v29.s[0] +ldr q24, [x0, #800] +sqrdmulh v23.4S, v24.4S, v28.s[0] +mul v24.4S, v24.4S,v29.s[0] +ldr q22, [x0, #864] +sqrdmulh v21.4S, v22.4S, v28.s[0] +mul v22.4S, 
v22.4S,v29.s[0] +ldr q20, [x0, #544] +mla v30.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v20.4S, v28.s[0] +ldr q19, [x0, #608] +mla v26.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v19.4S, v28.s[0] +ldr q18, [x0, #672] +mla v24.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v18.4S, v28.s[0] +ldr q17, [x0, #736] +mla v22.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v17.4S, v28.s[0] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +mul v20.4S, v20.4S,v29.s[0] +sub v2.4s, v16.4s, v30.4s +mul v19.4S, v19.4S,v29.s[0] +add v16.4s, v16.4s, v30.4s +ldr q30, [x0, #288] +ldr q1, [x0, #352] +mla v20.4S, v27.4S, v31.s[0] +sub v27.4s, v3.4s, v26.4s +mla v19.4S, v25.4S, v31.s[0] +add v3.4s, v3.4s, v26.4s +ldr q26, [x0, #32] +ldr q25, [x0, #96] +mul v18.4S, v18.4S,v29.s[0] +sub v0.4s, v30.4s, v24.4s +mul v17.4S, v17.4S,v29.s[0] +add v30.4s, v30.4s, v24.4s +ldr q24, [x0, #160] +ldr q15, [x0, #224] +mla v18.4S, v23.4S, v31.s[0] +sub v23.4s, v1.4s, v22.4s +mla v17.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v16.4S, v28.s[1] +mul v16.4S, v16.4S,v29.s[1] +sqrdmulh v21.4S, v3.4S, v28.s[1] +sub v14.4s, v26.4s, v20.4s +mul v3.4S, v3.4S,v29.s[1] +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v30.4S, v28.s[1] +sub v13.4s, v25.4s, v19.4s +mul v30.4S, v30.4S,v29.s[1] +add v25.4s, v25.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v28.s[1] +sub v12.4s, v24.4s, v18.4s +mul v1.4S, v1.4S,v29.s[1] +add v24.4s, v24.4s, v18.4s +mla v16.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v17.4s +sqrdmulh v18.4S, v2.4S, v28.s[2] +add v15.4s, v15.4s, v17.4s +mla v3.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v27.4S, v28.s[2] +mla v30.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v0.4S, v28.s[2] +mla v1.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v23.4S, v28.s[2] +ldr q17, [x17, #+32] +ldr q11, [x17, #+48] +mul v2.4S, v2.4S,v29.s[2] +sub v10.4s, v24.4s, v16.4s +mul v27.4S, v27.4S,v29.s[2] +add v24.4s, v24.4s, v16.4s +mla v2.4S, v18.4S, v31.s[0] +sub v18.4s, v15.4s, v3.4s +mla v27.4S, v21.4S, v31.s[0] +add v15.4s, v15.4s, v3.4s +mul v0.4S, 
v0.4S,v29.s[2] +sub v3.4s, v26.4s, v30.4s +mul v23.4S, v23.4S,v29.s[2] +add v26.4s, v26.4s, v30.4s +mla v0.4S, v20.4S, v31.s[0] +sub v20.4s, v25.4s, v1.4s +mla v23.4S, v19.4S, v31.s[0] +add v25.4s, v25.4s, v1.4s +sqrdmulh v1.4S, v10.4S, v11.s[1] +mul v10.4S, v10.4S,v17.s[1] +sqrdmulh v19.4S, v18.4S, v11.s[1] +sub v30.4s, v12.4s, v2.4s +mul v18.4S, v18.4S,v17.s[1] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v24.4S, v11.s[0] +sub v21.4s, v22.4s, v27.4s +mul v24.4S, v24.4S,v17.s[0] +add v22.4s, v22.4s, v27.4s +sqrdmulh v27.4S, v15.4S, v11.s[0] +sub v16.4s, v14.4s, v0.4s +mul v15.4S, v15.4S,v17.s[0] +add v14.4s, v14.4s, v0.4s +ldr q0, [x17, #+64] +ldr q9, [x17, #+80] +mla v10.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v23.4s +sqrdmulh v8.4S, v12.4S, v11.s[2] +add v13.4s, v13.4s, v23.4s +mla v18.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v22.4S, v11.s[2] +mla v24.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v30.4S, v11.s[3] +mla v15.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v21.4S, v11.s[3] +ldr q23, [x17, #+96] +ldr q7, [x17, #+112] +mul v12.4S, v12.4S,v17.s[2] +sub v6.4s, v3.4s, v10.4s +mul v22.4S, v22.4S,v17.s[2] +add v3.4s, v3.4s, v10.4s +mla v12.4S, v8.4S, v31.s[0] +sub v8.4s, v20.4s, v18.4s +mla v22.4S, v19.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +mul v30.4S, v30.4S,v17.s[3] +sub v18.4s, v26.4s, v24.4s +mul v21.4S, v21.4S,v17.s[3] +add v26.4s, v26.4s, v24.4s +mla v30.4S, v2.4S, v31.s[0] +sub v2.4s, v25.4s, v15.4s +mla v21.4S, v27.4S, v31.s[0] +add v25.4s, v25.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v9.s[2] +mul v20.4S, v20.4S,v0.s[2] +sqrdmulh v27.4S, v8.4S, v9.s[3] +sub v24.4s, v14.4s, v12.4s +mul v8.4S, v8.4S,v0.s[3] +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v9.s[1] +sub v19.4s, v13.4s, v22.4s +mul v2.4S, v2.4S,v0.s[1] +add v13.4s, v13.4s, v22.4s +sqrdmulh v22.4S, v25.4S, v9.s[0] +sub v10.4s, v16.4s, v30.4s +mul v25.4S, v25.4S,v0.s[0] +add v16.4s, v16.4s, v30.4s +mla v20.4S, v15.4S, v31.s[0] +sub v15.4s, v1.4s, v21.4s +sqrdmulh v30.4S, v13.4S, v7.s[0] +add v1.4s, v1.4s, 
v21.4s +mla v8.4S, v27.4S, v31.s[0] +sub v27.4s, v3.4s, v20.4s +sqrdmulh v21.4S, v19.4S, v7.s[1] +add v3.4s, v3.4s, v20.4s +mla v2.4S, v12.4S, v31.s[0] +sub v12.4s, v6.4s, v8.4s +sqrdmulh v20.4S, v1.4S, v7.s[2] +add v6.4s, v6.4s, v8.4s +mla v25.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v2.4s +sqrdmulh v8.4S, v15.4S, v7.s[3] +add v18.4s, v18.4s, v2.4s +mul v13.4S, v13.4S,v23.s[0] +sub v2.4s, v26.4s, v25.4s +mul v19.4S, v19.4S,v23.s[1] +add v26.4s, v26.4s, v25.4s +mla v13.4S, v30.4S, v31.s[0] +str q27, [x0, #352] +mla v19.4S, v21.4S, v31.s[0] +str q3, [x0, #288] +mul v1.4S, v1.4S,v23.s[2] +str q12, [x0, #480] +mul v15.4S, v15.4S,v23.s[3] +str q6, [x0, #416] +mla v1.4S, v20.4S, v31.s[0] +str q22, [x0, #224] +mla v15.4S, v8.4S, v31.s[0] +str q18, [x0, #160] +ldr q18, [x0, #944] +sqrdmulh v8.4S, v18.4S, v28.s[0] +str q2, [x0, #96] +mul v18.4S, v18.4S,v29.s[0] +str q26, [x0, #32] +ldr q26, [x0, #1008] +sqrdmulh v2.4S, v26.4S, v28.s[0] +sub v22.4s, v14.4s, v13.4s +str q22, [x0, #608] +mul v26.4S, v26.4S,v29.s[0] +add v14.4s, v14.4s, v13.4s +ldr q13, [x0, #816] +sqrdmulh v22.4S, v13.4S, v28.s[0] +sub v20.4s, v24.4s, v19.4s +str q14, [x0, #544] +mul v13.4S, v13.4S,v29.s[0] +add v24.4s, v24.4s, v19.4s +ldr q19, [x0, #880] +sqrdmulh v14.4S, v19.4S, v28.s[0] +sub v6.4s, v16.4s, v1.4s +str q20, [x0, #736] +mul v19.4S, v19.4S,v29.s[0] +add v16.4s, v16.4s, v1.4s +ldr q1, [x0, #560] +mla v18.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v15.4s +str q24, [x0, #672] +sqrdmulh v24.4S, v1.4S, v28.s[0] +add v10.4s, v10.4s, v15.4s +ldr q15, [x0, #624] +mla v26.4S, v2.4S, v31.s[0] +str q6, [x0, #864] +sqrdmulh v6.4S, v15.4S, v28.s[0] +ldr q2, [x0, #688] +mla v13.4S, v22.4S, v31.s[0] +str q16, [x0, #800] +sqrdmulh v16.4S, v2.4S, v28.s[0] +ldr q22, [x0, #752] +mla v19.4S, v14.4S, v31.s[0] +str q8, [x0, #992] +sqrdmulh v8.4S, v22.4S, v28.s[0] +ldr q14, [x0, #432] +ldr q20, [x0, #496] +mul v1.4S, v1.4S,v29.s[0] +sub v12.4s, v14.4s, v18.4s +str q10, [x0, #928] +mul v15.4S, v15.4S,v29.s[0] +add 
v14.4s, v14.4s, v18.4s +ldr q18, [x0, #304] +ldr q10, [x0, #368] +mla v1.4S, v24.4S, v31.s[0] +sub v24.4s, v20.4s, v26.4s +mla v15.4S, v6.4S, v31.s[0] +add v20.4s, v20.4s, v26.4s +ldr q26, [x0, #48] +ldr q6, [x0, #112] +mul v2.4S, v2.4S,v29.s[0] +sub v3.4s, v18.4s, v13.4s +mul v22.4S, v22.4S,v29.s[0] +add v18.4s, v18.4s, v13.4s +ldr q13, [x0, #176] +ldr q21, [x0, #240] +mla v2.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v19.4s +mla v22.4S, v8.4S, v31.s[0] +add v10.4s, v10.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v28.s[1] +mul v14.4S, v14.4S,v29.s[1] +sqrdmulh v8.4S, v20.4S, v28.s[1] +sub v27.4s, v26.4s, v1.4s +mul v20.4S, v20.4S,v29.s[1] +add v26.4s, v26.4s, v1.4s +sqrdmulh v1.4S, v18.4S, v28.s[1] +sub v30.4s, v6.4s, v15.4s +mul v18.4S, v18.4S,v29.s[1] +add v6.4s, v6.4s, v15.4s +sqrdmulh v15.4S, v10.4S, v28.s[1] +sub v25.4s, v13.4s, v2.4s +mul v10.4S, v10.4S,v29.s[1] +add v13.4s, v13.4s, v2.4s +mla v14.4S, v19.4S, v31.s[0] +sub v19.4s, v21.4s, v22.4s +sqrdmulh v2.4S, v12.4S, v28.s[2] +add v21.4s, v21.4s, v22.4s +mla v20.4S, v8.4S, v31.s[0] +sqrdmulh v8.4S, v24.4S, v28.s[2] +mla v18.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v3.4S, v28.s[2] +mla v10.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v16.4S, v28.s[2] +mul v12.4S, v12.4S,v29.s[2] +sub v22.4s, v13.4s, v14.4s +mul v24.4S, v24.4S,v29.s[2] +add v13.4s, v13.4s, v14.4s +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v21.4s, v20.4s +mla v24.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v20.4s +mul v3.4S, v3.4S,v29.s[2] +sub v20.4s, v26.4s, v18.4s +mul v16.4S, v16.4S,v29.s[2] +add v26.4s, v26.4s, v18.4s +mla v3.4S, v1.4S, v31.s[0] +sub v1.4s, v6.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add v6.4s, v6.4s, v10.4s +sqrdmulh v10.4S, v22.4S, v11.s[1] +mul v22.4S, v22.4S,v17.s[1] +sqrdmulh v15.4S, v2.4S, v11.s[1] +sub v18.4s, v25.4s, v12.4s +mul v2.4S, v2.4S,v17.s[1] +add v25.4s, v25.4s, v12.4s +sqrdmulh v12.4S, v13.4S, v11.s[0] +sub v8.4s, v19.4s, v24.4s +mul v13.4S, v13.4S,v17.s[0] +add v19.4s, v19.4s, v24.4s +sqrdmulh v24.4S, v21.4S, v11.s[0] +sub 
v14.4s, v27.4s, v3.4s +mul v21.4S, v21.4S,v17.s[0] +add v27.4s, v27.4s, v3.4s +mla v22.4S, v10.4S, v31.s[0] +sub v10.4s, v30.4s, v16.4s +sqrdmulh v3.4S, v25.4S, v11.s[2] +add v30.4s, v30.4s, v16.4s +mla v2.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v19.4S, v11.s[2] +mla v13.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v18.4S, v11.s[3] +mla v21.4S, v24.4S, v31.s[0] +sqrdmulh v24.4S, v8.4S, v11.s[3] +mul v25.4S, v25.4S,v17.s[2] +sub v16.4s, v20.4s, v22.4s +mul v19.4S, v19.4S,v17.s[2] +add v20.4s, v20.4s, v22.4s +mla v25.4S, v3.4S, v31.s[0] +sub v3.4s, v1.4s, v2.4s +mla v19.4S, v15.4S, v31.s[0] +add v1.4s, v1.4s, v2.4s +mul v18.4S, v18.4S,v17.s[3] +sub v2.4s, v26.4s, v13.4s +mul v8.4S, v8.4S,v17.s[3] +add v26.4s, v26.4s, v13.4s +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v6.4s, v21.4s +mla v8.4S, v24.4S, v31.s[0] +add v6.4s, v6.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v9.s[2] +mul v1.4S, v1.4S,v0.s[2] +sqrdmulh v24.4S, v3.4S, v9.s[3] +sub v13.4s, v27.4s, v25.4s +mul v3.4S, v3.4S,v0.s[3] +add v27.4s, v27.4s, v25.4s +sqrdmulh v25.4S, v12.4S, v9.s[1] +sub v15.4s, v30.4s, v19.4s +mul v12.4S, v12.4S,v0.s[1] +add v30.4s, v30.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v9.s[0] +sub v22.4s, v14.4s, v18.4s +mul v6.4S, v6.4S,v0.s[0] +add v14.4s, v14.4s, v18.4s +mla v1.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v8.4s +sqrdmulh v18.4S, v30.4S, v7.s[0] +add v10.4s, v10.4s, v8.4s +mla v3.4S, v24.4S, v31.s[0] +sub v24.4s, v20.4s, v1.4s +sqrdmulh v8.4S, v15.4S, v7.s[1] +add v20.4s, v20.4s, v1.4s +mla v12.4S, v25.4S, v31.s[0] +sub v25.4s, v16.4s, v3.4s +sqrdmulh v1.4S, v10.4S, v7.s[2] +add v16.4s, v16.4s, v3.4s +mla v6.4S, v19.4S, v31.s[0] +sub v19.4s, v2.4s, v12.4s +sqrdmulh v3.4S, v21.4S, v7.s[3] +add v2.4s, v2.4s, v12.4s +mul v30.4S, v30.4S,v23.s[0] +sub v12.4s, v26.4s, v6.4s +mul v15.4S, v15.4S,v23.s[1] +add v26.4s, v26.4s, v6.4s +mla v30.4S, v18.4S, v31.s[0] +str q24, [x0, #368] +mla v15.4S, v8.4S, v31.s[0] +str q20, [x0, #304] +mul v10.4S, v10.4S,v23.s[2] +str q25, [x0, #496] +mul v21.4S, 
v21.4S,v23.s[3] +str q16, [x0, #432] +mla v10.4S, v1.4S, v31.s[0] +str q19, [x0, #240] +mla v21.4S, v3.4S, v31.s[0] +str q2, [x0, #176] +ldr q2, [x0, #896] +sqrdmulh v3.4S, v2.4S, v28.s[0] +str q12, [x0, #112] +mul v2.4S, v2.4S,v29.s[0] +str q26, [x0, #48] +ldr q26, [x0, #960] +sqrdmulh v12.4S, v26.4S, v28.s[0] +sub v19.4s, v27.4s, v30.4s +str q19, [x0, #624] +mul v26.4S, v26.4S,v29.s[0] +add v27.4s, v27.4s, v30.4s +ldr q30, [x0, #768] +sqrdmulh v19.4S, v30.4S, v28.s[0] +sub v1.4s, v13.4s, v15.4s +str q27, [x0, #560] +mul v30.4S, v30.4S,v29.s[0] +add v13.4s, v13.4s, v15.4s +ldr q15, [x0, #832] +sqrdmulh v27.4S, v15.4S, v28.s[0] +sub v16.4s, v14.4s, v10.4s +str q1, [x0, #752] +mul v15.4S, v15.4S,v29.s[0] +add v14.4s, v14.4s, v10.4s +ldr q10, [x0, #512] +mla v2.4S, v3.4S, v31.s[0] +sub v3.4s, v22.4s, v21.4s +str q13, [x0, #688] +sqrdmulh v13.4S, v10.4S, v28.s[0] +add v22.4s, v22.4s, v21.4s +ldr q21, [x0, #576] +mla v26.4S, v12.4S, v31.s[0] +str q16, [x0, #880] +sqrdmulh v16.4S, v21.4S, v28.s[0] +ldr q12, [x0, #640] +mla v30.4S, v19.4S, v31.s[0] +str q14, [x0, #816] +sqrdmulh v14.4S, v12.4S, v28.s[0] +ldr q19, [x0, #704] +mla v15.4S, v27.4S, v31.s[0] +str q3, [x0, #1008] +sqrdmulh v3.4S, v19.4S, v28.s[0] +ldr q27, [x0, #384] +ldr q1, [x0, #448] +mul v10.4S, v10.4S,v29.s[0] +sub v25.4s, v27.4s, v2.4s +str q22, [x0, #944] +mul v21.4S, v21.4S,v29.s[0] +add v27.4s, v27.4s, v2.4s +ldr q2, [x0, #256] +ldr q22, [x0, #320] +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v1.4s, v26.4s +mla v21.4S, v16.4S, v31.s[0] +add v1.4s, v1.4s, v26.4s +ldr q26, [x0, #0] +ldr q16, [x0, #64] +mul v12.4S, v12.4S,v29.s[0] +sub v20.4s, v2.4s, v30.4s +mul v19.4S, v19.4S,v29.s[0] +add v2.4s, v2.4s, v30.4s +ldr q30, [x0, #128] +ldr q8, [x0, #192] +mla v12.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v15.4s +mla v19.4S, v3.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v27.4S, v28.s[1] +mul v27.4S, v27.4S,v29.s[1] +sqrdmulh v3.4S, v1.4S, v28.s[1] +sub v24.4s, v26.4s, v10.4s +mul v1.4S, 
v1.4S,v29.s[1] +add v26.4s, v26.4s, v10.4s +sqrdmulh v10.4S, v2.4S, v28.s[1] +sub v18.4s, v16.4s, v21.4s +mul v2.4S, v2.4S,v29.s[1] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v28.s[1] +sub v6.4s, v30.4s, v12.4s +mul v22.4S, v22.4S,v29.s[1] +add v30.4s, v30.4s, v12.4s +mla v27.4S, v15.4S, v31.s[0] +sub v15.4s, v8.4s, v19.4s +sqrdmulh v12.4S, v25.4S, v28.s[2] +add v8.4s, v8.4s, v19.4s +mla v1.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v13.4S, v28.s[2] +mla v2.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v20.4S, v28.s[2] +mla v22.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v14.4S, v28.s[2] +mul v25.4S, v25.4S,v29.s[2] +sub v19.4s, v30.4s, v27.4s +mul v13.4S, v13.4S,v29.s[2] +add v30.4s, v30.4s, v27.4s +mla v25.4S, v12.4S, v31.s[0] +sub v12.4s, v8.4s, v1.4s +mla v13.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v1.4s +mul v20.4S, v20.4S,v29.s[2] +sub v1.4s, v26.4s, v2.4s +mul v14.4S, v14.4S,v29.s[2] +add v26.4s, v26.4s, v2.4s +mla v20.4S, v10.4S, v31.s[0] +sub v10.4s, v16.4s, v22.4s +mla v14.4S, v21.4S, v31.s[0] +add v16.4s, v16.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v11.s[1] +mul v19.4S, v19.4S,v17.s[1] +sqrdmulh v21.4S, v12.4S, v11.s[1] +sub v2.4s, v6.4s, v25.4s +mul v12.4S, v12.4S,v17.s[1] +add v6.4s, v6.4s, v25.4s +sqrdmulh v25.4S, v30.4S, v11.s[0] +sub v3.4s, v15.4s, v13.4s +mul v30.4S, v30.4S,v17.s[0] +add v15.4s, v15.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v11.s[0] +sub v27.4s, v24.4s, v20.4s +mul v8.4S, v8.4S,v17.s[0] +add v24.4s, v24.4s, v20.4s +mla v19.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v14.4s +sqrdmulh v20.4S, v6.4S, v11.s[2] +add v18.4s, v18.4s, v14.4s +mla v12.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v15.4S, v11.s[2] +mla v30.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v2.4S, v11.s[3] +mla v8.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v3.4S, v11.s[3] +mul v6.4S, v6.4S,v17.s[2] +sub v14.4s, v1.4s, v19.4s +mul v15.4S, v15.4S,v17.s[2] +add v1.4s, v1.4s, v19.4s +mla v6.4S, v20.4S, v31.s[0] +sub v20.4s, v10.4s, v12.4s +mla v15.4S, v21.4S, v31.s[0] +add v10.4s, v10.4s, v12.4s +mul 
v2.4S, v2.4S,v17.s[3] +sub v12.4s, v26.4s, v30.4s +mul v3.4S, v3.4S,v17.s[3] +add v26.4s, v26.4s, v30.4s +mla v2.4S, v25.4S, v31.s[0] +sub v25.4s, v16.4s, v8.4s +mla v3.4S, v13.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v10.4S, v9.s[2] +mul v10.4S, v10.4S,v0.s[2] +sqrdmulh v13.4S, v20.4S, v9.s[3] +sub v30.4s, v24.4s, v6.4s +mul v20.4S, v20.4S,v0.s[3] +add v24.4s, v24.4s, v6.4s +sqrdmulh v6.4S, v25.4S, v9.s[1] +sub v21.4s, v18.4s, v15.4s +mul v25.4S, v25.4S,v0.s[1] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v9.s[0] +sub v19.4s, v27.4s, v2.4s +mul v16.4S, v16.4S,v0.s[0] +add v27.4s, v27.4s, v2.4s +mla v10.4S, v8.4S, v31.s[0] +sub v8.4s, v22.4s, v3.4s +sqrdmulh v2.4S, v18.4S, v7.s[0] +add v22.4s, v22.4s, v3.4s +mla v20.4S, v13.4S, v31.s[0] +sub v13.4s, v1.4s, v10.4s +sqrdmulh v3.4S, v21.4S, v7.s[1] +add v1.4s, v1.4s, v10.4s +mla v25.4S, v6.4S, v31.s[0] +sub v6.4s, v14.4s, v20.4s +sqrdmulh v10.4S, v22.4S, v7.s[2] +add v14.4s, v14.4s, v20.4s +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v12.4s, v25.4s +sqrdmulh v20.4S, v8.4S, v7.s[3] +add v12.4s, v12.4s, v25.4s +mul v18.4S, v18.4S,v23.s[0] +sub v25.4s, v26.4s, v16.4s +mul v21.4S, v21.4S,v23.s[1] +add v26.4s, v26.4s, v16.4s +mla v18.4S, v2.4S, v31.s[0] +str q13, [x0, #320] +mla v21.4S, v3.4S, v31.s[0] +str q1, [x0, #256] +mul v22.4S, v22.4S,v23.s[2] +str q6, [x0, #448] +mul v8.4S, v8.4S,v23.s[3] +str q14, [x0, #384] +mla v22.4S, v10.4S, v31.s[0] +str q15, [x0, #192] +mla v8.4S, v20.4S, v31.s[0] +str q12, [x0, #128] +ldr q12, [x0, #912] +sqrdmulh v20.4S, v12.4S, v28.s[0] +str q25, [x0, #64] +mul v12.4S, v12.4S,v29.s[0] +str q26, [x0, #0] +ldr q26, [x0, #976] +sqrdmulh v25.4S, v26.4S, v28.s[0] +sub v15.4s, v24.4s, v18.4s +str q15, [x0, #576] +mul v26.4S, v26.4S,v29.s[0] +add v24.4s, v24.4s, v18.4s +ldr q18, [x0, #784] +sqrdmulh v15.4S, v18.4S, v28.s[0] +sub v10.4s, v30.4s, v21.4s +str q24, [x0, #512] +mul v18.4S, v18.4S,v29.s[0] +add v30.4s, v30.4s, v21.4s +ldr q21, [x0, #848] +sqrdmulh v24.4S, 
v21.4S, v28.s[0] +sub v14.4s, v27.4s, v22.4s +str q10, [x0, #704] +mul v21.4S, v21.4S,v29.s[0] +add v27.4s, v27.4s, v22.4s +ldr q22, [x0, #528] +mla v12.4S, v20.4S, v31.s[0] +sub v20.4s, v19.4s, v8.4s +str q30, [x0, #640] +sqrdmulh v30.4S, v22.4S, v28.s[0] +add v19.4s, v19.4s, v8.4s +ldr q8, [x0, #592] +mla v26.4S, v25.4S, v31.s[0] +str q14, [x0, #832] +sqrdmulh v14.4S, v8.4S, v28.s[0] +ldr q25, [x0, #656] +mla v18.4S, v15.4S, v31.s[0] +str q27, [x0, #768] +sqrdmulh v27.4S, v25.4S, v28.s[0] +ldr q15, [x0, #720] +mla v21.4S, v24.4S, v31.s[0] +str q20, [x0, #960] +sqrdmulh v20.4S, v15.4S, v28.s[0] +ldr q24, [x0, #400] +ldr q10, [x0, #464] +mul v22.4S, v22.4S,v29.s[0] +sub v6.4s, v24.4s, v12.4s +str q19, [x0, #896] +mul v8.4S, v8.4S,v29.s[0] +add v24.4s, v24.4s, v12.4s +ldr q12, [x0, #272] +ldr q19, [x0, #336] +mla v22.4S, v30.4S, v31.s[0] +sub v30.4s, v10.4s, v26.4s +mla v8.4S, v14.4S, v31.s[0] +add v10.4s, v10.4s, v26.4s +ldr q26, [x0, #16] +ldr q14, [x0, #80] +mul v25.4S, v25.4S,v29.s[0] +sub v1.4s, v12.4s, v18.4s +mul v15.4S, v15.4S,v29.s[0] +add v12.4s, v12.4s, v18.4s +ldr q18, [x0, #144] +ldr q3, [x0, #208] +mla v25.4S, v27.4S, v31.s[0] +sub v27.4s, v19.4s, v21.4s +mla v15.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +sqrdmulh v21.4S, v24.4S, v28.s[1] +mul v24.4S, v24.4S,v29.s[1] +sqrdmulh v20.4S, v10.4S, v28.s[1] +sub v13.4s, v26.4s, v22.4s +mul v10.4S, v10.4S,v29.s[1] +add v26.4s, v26.4s, v22.4s +sqrdmulh v22.4S, v12.4S, v28.s[1] +sub v2.4s, v14.4s, v8.4s +mul v12.4S, v12.4S,v29.s[1] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v28.s[1] +sub v16.4s, v18.4s, v25.4s +mul v19.4S, v19.4S,v29.s[1] +add v18.4s, v18.4s, v25.4s +mla v24.4S, v21.4S, v31.s[0] +sub v21.4s, v3.4s, v15.4s +sqrdmulh v25.4S, v6.4S, v28.s[2] +add v3.4s, v3.4s, v15.4s +mla v10.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v30.4S, v28.s[2] +mla v12.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v1.4S, v28.s[2] +mla v19.4S, v8.4S, v31.s[0] +sqrdmulh v8.4S, v27.4S, v28.s[2] +mul v6.4S, v6.4S,v29.s[2] 
+sub v15.4s, v18.4s, v24.4s +mul v30.4S, v30.4S,v29.s[2] +add v18.4s, v18.4s, v24.4s +mla v6.4S, v25.4S, v31.s[0] +sub v25.4s, v3.4s, v10.4s +mla v30.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v10.4s +mul v1.4S, v1.4S,v29.s[2] +sub v10.4s, v26.4s, v12.4s +mul v27.4S, v27.4S,v29.s[2] +add v26.4s, v26.4s, v12.4s +mla v1.4S, v22.4S, v31.s[0] +sub v22.4s, v14.4s, v19.4s +mla v27.4S, v8.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v28.4S, v15.4S, v11.s[1] +mul v15.4S, v15.4S,v17.s[1] +sqrdmulh v29.4S, v25.4S, v11.s[1] +sub v19.4s, v16.4s, v6.4s +mul v25.4S, v25.4S,v17.s[1] +add v16.4s, v16.4s, v6.4s +sqrdmulh v6.4S, v18.4S, v11.s[0] +sub v8.4s, v21.4s, v30.4s +mul v18.4S, v18.4S,v17.s[0] +add v21.4s, v21.4s, v30.4s +sqrdmulh v30.4S, v3.4S, v11.s[0] +sub v12.4s, v13.4s, v1.4s +mul v3.4S, v3.4S,v17.s[0] +add v13.4s, v13.4s, v1.4s +mla v15.4S, v28.4S, v31.s[0] +sub v28.4s, v2.4s, v27.4s +sqrdmulh v1.4S, v16.4S, v11.s[2] +add v2.4s, v2.4s, v27.4s +mla v25.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v21.4S, v11.s[2] +mla v18.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v19.4S, v11.s[3] +mla v3.4S, v30.4S, v31.s[0] +sqrdmulh v30.4S, v8.4S, v11.s[3] +mul v16.4S, v16.4S,v17.s[2] +sub v27.4s, v10.4s, v15.4s +mul v21.4S, v21.4S,v17.s[2] +add v10.4s, v10.4s, v15.4s +mla v16.4S, v1.4S, v31.s[0] +sub v1.4s, v22.4s, v25.4s +mla v21.4S, v29.4S, v31.s[0] +add v22.4s, v22.4s, v25.4s +mul v19.4S, v19.4S,v17.s[3] +sub v25.4s, v26.4s, v18.4s +mul v8.4S, v8.4S,v17.s[3] +add v26.4s, v26.4s, v18.4s +mla v19.4S, v6.4S, v31.s[0] +sub v6.4s, v14.4s, v3.4s +mla v8.4S, v30.4S, v31.s[0] +add v14.4s, v14.4s, v3.4s +sqrdmulh v11.4S, v22.4S, v9.s[2] +mul v22.4S, v22.4S,v0.s[2] +sqrdmulh v17.4S, v1.4S, v9.s[3] +sub v3.4s, v13.4s, v16.4s +mul v1.4S, v1.4S,v0.s[3] +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v6.4S, v9.s[1] +sub v30.4s, v2.4s, v21.4s +mul v6.4S, v6.4S,v0.s[1] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v9.s[0] +sub v18.4s, v12.4s, v19.4s +mul v14.4S, v14.4S,v0.s[0] +add v12.4s, v12.4s, 
v19.4s +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v28.4s, v8.4s +sqrdmulh v9.4S, v2.4S, v7.s[0] +add v28.4s, v28.4s, v8.4s +mla v1.4S, v17.4S, v31.s[0] +sub v17.4s, v10.4s, v22.4s +sqrdmulh v8.4S, v30.4S, v7.s[1] +add v10.4s, v10.4s, v22.4s +mla v6.4S, v16.4S, v31.s[0] +sub v16.4s, v27.4s, v1.4s +sqrdmulh v22.4S, v28.4S, v7.s[2] +add v27.4s, v27.4s, v1.4s +mla v14.4S, v21.4S, v31.s[0] +sub v21.4s, v25.4s, v6.4s +sqrdmulh v1.4S, v11.4S, v7.s[3] +add v25.4s, v25.4s, v6.4s +mul v2.4S, v2.4S,v23.s[0] +sub v6.4s, v26.4s, v14.4s +mul v30.4S, v30.4S,v23.s[1] +add v26.4s, v26.4s, v14.4s +mla v2.4S, v9.4S, v31.s[0] +str q17, [x0, #336] +mla v30.4S, v8.4S, v31.s[0] +str q10, [x0, #272] +mul v28.4S, v28.4S,v23.s[2] +str q16, [x0, #464] +mul v11.4S, v11.4S,v23.s[3] +str q27, [x0, #400] +mla v28.4S, v22.4S, v31.s[0] +str q21, [x0, #208] +mla v11.4S, v1.4S, v31.s[0] +str q25, [x0, #144] +str q6, [x0, #80] +str q26, [x0, #16] +sub v26.4s, v13.4s, v2.4s +str q26, [x0, #592] +add v13.4s, v13.4s, v2.4s +sub v2.4s, v3.4s, v30.4s +str q13, [x0, #528] +add v3.4s, v3.4s, v30.4s +sub v30.4s, v12.4s, v28.4s +str q2, [x0, #720] +add v12.4s, v12.4s, v28.4s +sub v28.4s, v18.4s, v11.4s +str q3, [x0, #656] +add v18.4s, v18.4s, v11.4s +str q30, [x0, #848] +str q12, [x0, #784] +str q28, [x0, #976] +str q18, [x0, #912] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q24, [x17, #+160] +ldr q20, [x17, #+176] +ldr q15, [x17, #+192] +ldr q29, [x17, #+208] +ldr q19, [x17, #+224] +ldr q0, [x17, #+240] +ldr q14, [x0, #32] +ldr q9, [x0, #48] +ldr q17, [x0, #0] +ldr q8, [x0, #16] +sqrdmulh v10.4S, v14.4S, v5.s[0] +mul v14.4S, v14.4S,v4.s[0] +mla v14.4S, v10.4S, v31.s[0] +sub v10.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +sqrdmulh v14.4S, v9.4S, v5.s[0] +mul v9.4S, v9.4S,v4.s[0] +mla v9.4S, v14.4S, v31.s[0] +sub v14.4s, v8.4s, v9.4s +add v8.4s, v8.4s, v9.4s +sqrdmulh v9.4S, v8.4S, v5.s[1] +mul v8.4S, v8.4S,v4.s[1] +mla v8.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v8.4s +add v17.4s, v17.4s, v8.4s 
+sqrdmulh v8.4S, v14.4S, v5.s[2] +mul v14.4S, v14.4S,v4.s[2] +mla v14.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v14.4s +add v10.4s, v10.4s, v14.4s +trn1 v14.4S, v17.4S, v9.4S +trn2 v16.4S, v17.4S, v9.4S +trn1 v27.4S, v10.4S, v8.4S +trn2 v22.4S, v10.4S, v8.4S +trn2 v10.2D, v14.2D, v27.2D +trn2 v8.2D, v16.2D, v22.2D +trn1 v17.2D, v14.2D, v27.2D +trn1 v9.2D, v16.2D, v22.2D +sqrdmulh v22.4S, v10.4S, v20.4S +mul v10.4S, v10.4S,v24.4S +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v17.4s, v10.4s +add v17.4s, v17.4s, v10.4s +sqrdmulh v10.4S, v8.4S, v20.4S +mul v8.4S, v8.4S,v24.4S +mla v8.4S, v10.4S, v31.s[0] +sub v10.4s, v9.4s, v8.4s +add v9.4s, v9.4s, v8.4s +sqrdmulh v8.4S, v9.4S, v29.4S +mul v9.4S, v9.4S,v15.4S +mla v9.4S, v8.4S, v31.s[0] +sub v8.4s, v17.4s, v9.4s +add v17.4s, v17.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v0.4S +mul v10.4S, v10.4S,v19.4S +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v22.4s, v10.4s +add v22.4s, v22.4s, v10.4s +str q17, [x0, #0] +str q8, [x0, #16] +str q22, [x0, #32] +str q9, [x0, #48] +ldr q9, [x17, #+256] +ldr q22, [x17, #+272] +ldr q8, [x17, #+288] +ldr q17, [x17, #+304] +ldr q10, [x17, #+320] +ldr q16, [x17, #+336] +ldr q27, [x17, #+352] +ldr q14, [x17, #+368] +ldr q0, [x0, #96] +ldr q19, [x0, #112] +ldr q29, [x0, #64] +ldr q15, [x0, #80] +sqrdmulh v20.4S, v0.4S, v22.s[0] +mul v0.4S, v0.4S,v9.s[0] +mla v0.4S, v20.4S, v31.s[0] +sub v20.4s, v29.4s, v0.4s +add v29.4s, v29.4s, v0.4s +sqrdmulh v0.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v9.s[0] +mla v19.4S, v0.4S, v31.s[0] +sub v0.4s, v15.4s, v19.4s +add v15.4s, v15.4s, v19.4s +sqrdmulh v19.4S, v15.4S, v22.s[1] +mul v15.4S, v15.4S,v9.s[1] +mla v15.4S, v19.4S, v31.s[0] +sub v19.4s, v29.4s, v15.4s +add v29.4s, v29.4s, v15.4s +sqrdmulh v15.4S, v0.4S, v22.s[2] +mul v0.4S, v0.4S,v9.s[2] +mla v0.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v0.4s +add v20.4s, v20.4s, v0.4s +trn1 v0.4S, v29.4S, v19.4S +trn2 v24.4S, v29.4S, v19.4S +trn1 v5.4S, v20.4S, v15.4S +trn2 v4.4S, v20.4S, v15.4S +trn2 v20.2D, v0.2D, v5.2D +trn2 
v15.2D, v24.2D, v4.2D +trn1 v29.2D, v0.2D, v5.2D +trn1 v19.2D, v24.2D, v4.2D +sqrdmulh v4.4S, v20.4S, v17.4S +mul v20.4S, v20.4S,v8.4S +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v29.4s, v20.4s +add v29.4s, v29.4s, v20.4s +sqrdmulh v20.4S, v15.4S, v17.4S +mul v15.4S, v15.4S,v8.4S +mla v15.4S, v20.4S, v31.s[0] +sub v20.4s, v19.4s, v15.4s +add v19.4s, v19.4s, v15.4s +sqrdmulh v15.4S, v19.4S, v16.4S +mul v19.4S, v19.4S,v10.4S +mla v19.4S, v15.4S, v31.s[0] +sub v15.4s, v29.4s, v19.4s +add v29.4s, v29.4s, v19.4s +sqrdmulh v19.4S, v20.4S, v14.4S +mul v20.4S, v20.4S,v27.4S +mla v20.4S, v19.4S, v31.s[0] +sub v19.4s, v4.4s, v20.4s +add v4.4s, v4.4s, v20.4s +str q29, [x0, #64] +str q15, [x0, #80] +str q4, [x0, #96] +str q19, [x0, #112] +ldr q19, [x17, #+384] +ldr q4, [x17, #+400] +ldr q15, [x17, #+416] +ldr q29, [x17, #+432] +ldr q20, [x17, #+448] +ldr q24, [x17, #+464] +ldr q5, [x17, #+480] +ldr q0, [x17, #+496] +ldr q14, [x0, #160] +ldr q27, [x0, #176] +ldr q16, [x0, #128] +ldr q10, [x0, #144] +sqrdmulh v17.4S, v14.4S, v4.s[0] +mul v14.4S, v14.4S,v19.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v16.4s, v14.4s +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v27.4S, v4.s[0] +mul v27.4S, v27.4S,v19.s[0] +mla v27.4S, v14.4S, v31.s[0] +sub v14.4s, v10.4s, v27.4s +add v10.4s, v10.4s, v27.4s +sqrdmulh v27.4S, v10.4S, v4.s[1] +mul v10.4S, v10.4S,v19.s[1] +mla v10.4S, v27.4S, v31.s[0] +sub v27.4s, v16.4s, v10.4s +add v16.4s, v16.4s, v10.4s +sqrdmulh v10.4S, v14.4S, v4.s[2] +mul v14.4S, v14.4S,v19.s[2] +mla v14.4S, v10.4S, v31.s[0] +sub v10.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +trn1 v14.4S, v16.4S, v27.4S +trn2 v8.4S, v16.4S, v27.4S +trn1 v22.4S, v17.4S, v10.4S +trn2 v9.4S, v17.4S, v10.4S +trn2 v17.2D, v14.2D, v22.2D +trn2 v10.2D, v8.2D, v9.2D +trn1 v16.2D, v14.2D, v22.2D +trn1 v27.2D, v8.2D, v9.2D +sqrdmulh v9.4S, v17.4S, v29.4S +mul v17.4S, v17.4S,v15.4S +mla v17.4S, v9.4S, v31.s[0] +sub v9.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v10.4S, v29.4S 
+mul v10.4S, v10.4S,v15.4S +mla v10.4S, v17.4S, v31.s[0] +sub v17.4s, v27.4s, v10.4s +add v27.4s, v27.4s, v10.4s +sqrdmulh v10.4S, v27.4S, v24.4S +mul v27.4S, v27.4S,v20.4S +mla v27.4S, v10.4S, v31.s[0] +sub v10.4s, v16.4s, v27.4s +add v16.4s, v16.4s, v27.4s +sqrdmulh v27.4S, v17.4S, v0.4S +mul v17.4S, v17.4S,v5.4S +mla v17.4S, v27.4S, v31.s[0] +sub v27.4s, v9.4s, v17.4s +add v9.4s, v9.4s, v17.4s +str q16, [x0, #128] +str q10, [x0, #144] +str q9, [x0, #160] +str q27, [x0, #176] +ldr q27, [x17, #+512] +ldr q9, [x17, #+528] +ldr q10, [x17, #+544] +ldr q16, [x17, #+560] +ldr q17, [x17, #+576] +ldr q8, [x17, #+592] +ldr q22, [x17, #+608] +ldr q14, [x17, #+624] +ldr q0, [x0, #224] +ldr q5, [x0, #240] +ldr q24, [x0, #192] +ldr q20, [x0, #208] +sqrdmulh v29.4S, v0.4S, v9.s[0] +mul v0.4S, v0.4S,v27.s[0] +mla v0.4S, v29.4S, v31.s[0] +sub v29.4s, v24.4s, v0.4s +add v24.4s, v24.4s, v0.4s +sqrdmulh v0.4S, v5.4S, v9.s[0] +mul v5.4S, v5.4S,v27.s[0] +mla v5.4S, v0.4S, v31.s[0] +sub v0.4s, v20.4s, v5.4s +add v20.4s, v20.4s, v5.4s +sqrdmulh v5.4S, v20.4S, v9.s[1] +mul v20.4S, v20.4S,v27.s[1] +mla v20.4S, v5.4S, v31.s[0] +sub v5.4s, v24.4s, v20.4s +add v24.4s, v24.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v9.s[2] +mul v0.4S, v0.4S,v27.s[2] +mla v0.4S, v20.4S, v31.s[0] +sub v20.4s, v29.4s, v0.4s +add v29.4s, v29.4s, v0.4s +trn1 v0.4S, v24.4S, v5.4S +trn2 v15.4S, v24.4S, v5.4S +trn1 v4.4S, v29.4S, v20.4S +trn2 v19.4S, v29.4S, v20.4S +trn2 v29.2D, v0.2D, v4.2D +trn2 v20.2D, v15.2D, v19.2D +trn1 v24.2D, v0.2D, v4.2D +trn1 v5.2D, v15.2D, v19.2D +sqrdmulh v19.4S, v29.4S, v16.4S +mul v29.4S, v29.4S,v10.4S +mla v29.4S, v19.4S, v31.s[0] +sub v19.4s, v24.4s, v29.4s +add v24.4s, v24.4s, v29.4s +sqrdmulh v29.4S, v20.4S, v16.4S +mul v20.4S, v20.4S,v10.4S +mla v20.4S, v29.4S, v31.s[0] +sub v29.4s, v5.4s, v20.4s +add v5.4s, v5.4s, v20.4s +sqrdmulh v20.4S, v5.4S, v8.4S +mul v5.4S, v5.4S,v17.4S +mla v5.4S, v20.4S, v31.s[0] +sub v20.4s, v24.4s, v5.4s +add v24.4s, v24.4s, v5.4s +sqrdmulh v5.4S, v29.4S, 
v14.4S +mul v29.4S, v29.4S,v22.4S +mla v29.4S, v5.4S, v31.s[0] +sub v5.4s, v19.4s, v29.4s +add v19.4s, v19.4s, v29.4s +str q24, [x0, #192] +str q20, [x0, #208] +str q19, [x0, #224] +str q5, [x0, #240] +ldr q5, [x17, #+640] +ldr q19, [x17, #+656] +ldr q20, [x17, #+672] +ldr q24, [x17, #+688] +ldr q29, [x17, #+704] +ldr q15, [x17, #+720] +ldr q4, [x17, #+736] +ldr q0, [x17, #+752] +ldr q14, [x0, #288] +ldr q22, [x0, #304] +ldr q8, [x0, #256] +ldr q17, [x0, #272] +sqrdmulh v16.4S, v14.4S, v19.s[0] +mul v14.4S, v14.4S,v5.s[0] +mla v14.4S, v16.4S, v31.s[0] +sub v16.4s, v8.4s, v14.4s +add v8.4s, v8.4s, v14.4s +sqrdmulh v14.4S, v22.4S, v19.s[0] +mul v22.4S, v22.4S,v5.s[0] +mla v22.4S, v14.4S, v31.s[0] +sub v14.4s, v17.4s, v22.4s +add v17.4s, v17.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v19.s[1] +mul v17.4S, v17.4S,v5.s[1] +mla v17.4S, v22.4S, v31.s[0] +sub v22.4s, v8.4s, v17.4s +add v8.4s, v8.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v19.s[2] +mul v14.4S, v14.4S,v5.s[2] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v16.4s, v14.4s +add v16.4s, v16.4s, v14.4s +trn1 v14.4S, v8.4S, v22.4S +trn2 v10.4S, v8.4S, v22.4S +trn1 v9.4S, v16.4S, v17.4S +trn2 v27.4S, v16.4S, v17.4S +trn2 v16.2D, v14.2D, v9.2D +trn2 v17.2D, v10.2D, v27.2D +trn1 v8.2D, v14.2D, v9.2D +trn1 v22.2D, v10.2D, v27.2D +sqrdmulh v27.4S, v16.4S, v24.4S +mul v16.4S, v16.4S,v20.4S +mla v16.4S, v27.4S, v31.s[0] +sub v27.4s, v8.4s, v16.4s +add v8.4s, v8.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v24.4S +mul v17.4S, v17.4S,v20.4S +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v22.4s, v17.4s +add v22.4s, v22.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v15.4S +mul v22.4S, v22.4S,v29.4S +mla v22.4S, v17.4S, v31.s[0] +sub v17.4s, v8.4s, v22.4s +add v8.4s, v8.4s, v22.4s +sqrdmulh v22.4S, v16.4S, v0.4S +mul v16.4S, v16.4S,v4.4S +mla v16.4S, v22.4S, v31.s[0] +sub v22.4s, v27.4s, v16.4s +add v27.4s, v27.4s, v16.4s +str q8, [x0, #256] +str q17, [x0, #272] +str q27, [x0, #288] +str q22, [x0, #304] +ldr q22, [x17, #+768] +ldr q27, [x17, #+784] +ldr q17, 
[x17, #+800] +ldr q8, [x17, #+816] +ldr q16, [x17, #+832] +ldr q10, [x17, #+848] +ldr q9, [x17, #+864] +ldr q14, [x17, #+880] +ldr q0, [x0, #352] +ldr q4, [x0, #368] +ldr q15, [x0, #320] +ldr q29, [x0, #336] +sqrdmulh v24.4S, v0.4S, v27.s[0] +mul v0.4S, v0.4S,v22.s[0] +mla v0.4S, v24.4S, v31.s[0] +sub v24.4s, v15.4s, v0.4s +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v4.4S, v27.s[0] +mul v4.4S, v4.4S,v22.s[0] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v29.4s, v4.4s +add v29.4s, v29.4s, v4.4s +sqrdmulh v4.4S, v29.4S, v27.s[1] +mul v29.4S, v29.4S,v22.s[1] +mla v29.4S, v4.4S, v31.s[0] +sub v4.4s, v15.4s, v29.4s +add v15.4s, v15.4s, v29.4s +sqrdmulh v29.4S, v0.4S, v27.s[2] +mul v0.4S, v0.4S,v22.s[2] +mla v0.4S, v29.4S, v31.s[0] +sub v29.4s, v24.4s, v0.4s +add v24.4s, v24.4s, v0.4s +trn1 v0.4S, v15.4S, v4.4S +trn2 v20.4S, v15.4S, v4.4S +trn1 v19.4S, v24.4S, v29.4S +trn2 v5.4S, v24.4S, v29.4S +trn2 v24.2D, v0.2D, v19.2D +trn2 v29.2D, v20.2D, v5.2D +trn1 v15.2D, v0.2D, v19.2D +trn1 v4.2D, v20.2D, v5.2D +sqrdmulh v5.4S, v24.4S, v8.4S +mul v24.4S, v24.4S,v17.4S +mla v24.4S, v5.4S, v31.s[0] +sub v5.4s, v15.4s, v24.4s +add v15.4s, v15.4s, v24.4s +sqrdmulh v24.4S, v29.4S, v8.4S +mul v29.4S, v29.4S,v17.4S +mla v29.4S, v24.4S, v31.s[0] +sub v24.4s, v4.4s, v29.4s +add v4.4s, v4.4s, v29.4s +sqrdmulh v29.4S, v4.4S, v10.4S +mul v4.4S, v4.4S,v16.4S +mla v4.4S, v29.4S, v31.s[0] +sub v29.4s, v15.4s, v4.4s +add v15.4s, v15.4s, v4.4s +sqrdmulh v4.4S, v24.4S, v14.4S +mul v24.4S, v24.4S,v9.4S +mla v24.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v24.4s +add v5.4s, v5.4s, v24.4s +str q15, [x0, #320] +str q29, [x0, #336] +str q5, [x0, #352] +str q4, [x0, #368] +ldr q4, [x17, #+896] +ldr q5, [x17, #+912] +ldr q29, [x17, #+928] +ldr q15, [x17, #+944] +ldr q24, [x17, #+960] +ldr q20, [x17, #+976] +ldr q19, [x17, #+992] +ldr q0, [x17, #+1008] +ldr q14, [x0, #416] +ldr q9, [x0, #432] +ldr q10, [x0, #384] +ldr q16, [x0, #400] +sqrdmulh v8.4S, v14.4S, v5.s[0] +mul v14.4S, v14.4S,v4.s[0] +mla v14.4S, v8.4S, 
v31.s[0] +sub v8.4s, v10.4s, v14.4s +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v9.4S, v5.s[0] +mul v9.4S, v9.4S,v4.s[0] +mla v9.4S, v14.4S, v31.s[0] +sub v14.4s, v16.4s, v9.4s +add v16.4s, v16.4s, v9.4s +sqrdmulh v9.4S, v16.4S, v5.s[1] +mul v16.4S, v16.4S,v4.s[1] +mla v16.4S, v9.4S, v31.s[0] +sub v9.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sqrdmulh v16.4S, v14.4S, v5.s[2] +mul v14.4S, v14.4S,v4.s[2] +mla v14.4S, v16.4S, v31.s[0] +sub v16.4s, v8.4s, v14.4s +add v8.4s, v8.4s, v14.4s +trn1 v14.4S, v10.4S, v9.4S +trn2 v17.4S, v10.4S, v9.4S +trn1 v27.4S, v8.4S, v16.4S +trn2 v22.4S, v8.4S, v16.4S +trn2 v8.2D, v14.2D, v27.2D +trn2 v16.2D, v17.2D, v22.2D +trn1 v10.2D, v14.2D, v27.2D +trn1 v9.2D, v17.2D, v22.2D +sqrdmulh v22.4S, v8.4S, v15.4S +mul v8.4S, v8.4S,v29.4S +mla v8.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +sqrdmulh v8.4S, v16.4S, v15.4S +mul v16.4S, v16.4S,v29.4S +mla v16.4S, v8.4S, v31.s[0] +sub v8.4s, v9.4s, v16.4s +add v9.4s, v9.4s, v16.4s +sqrdmulh v16.4S, v9.4S, v20.4S +mul v9.4S, v9.4S,v24.4S +mla v9.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v9.4s +add v10.4s, v10.4s, v9.4s +sqrdmulh v9.4S, v8.4S, v0.4S +mul v8.4S, v8.4S,v19.4S +mla v8.4S, v9.4S, v31.s[0] +sub v9.4s, v22.4s, v8.4s +add v22.4s, v22.4s, v8.4s +str q10, [x0, #384] +str q16, [x0, #400] +str q22, [x0, #416] +str q9, [x0, #432] +ldr q9, [x17, #+1024] +ldr q22, [x17, #+1040] +ldr q16, [x17, #+1056] +ldr q10, [x17, #+1072] +ldr q8, [x17, #+1088] +ldr q17, [x17, #+1104] +ldr q27, [x17, #+1120] +ldr q14, [x17, #+1136] +ldr q0, [x0, #480] +ldr q19, [x0, #496] +ldr q20, [x0, #448] +ldr q24, [x0, #464] +sqrdmulh v15.4S, v0.4S, v22.s[0] +mul v0.4S, v0.4S,v9.s[0] +mla v0.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v0.4s +add v20.4s, v20.4s, v0.4s +sqrdmulh v0.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v9.s[0] +mla v19.4S, v0.4S, v31.s[0] +sub v0.4s, v24.4s, v19.4s +add v24.4s, v24.4s, v19.4s +sqrdmulh v19.4S, v24.4S, v22.s[1] +mul v24.4S, v24.4S,v9.s[1] +mla v24.4S, 
v19.4S, v31.s[0] +sub v19.4s, v20.4s, v24.4s +add v20.4s, v20.4s, v24.4s +sqrdmulh v24.4S, v0.4S, v22.s[2] +mul v0.4S, v0.4S,v9.s[2] +mla v0.4S, v24.4S, v31.s[0] +sub v24.4s, v15.4s, v0.4s +add v15.4s, v15.4s, v0.4s +trn1 v0.4S, v20.4S, v19.4S +trn2 v29.4S, v20.4S, v19.4S +trn1 v5.4S, v15.4S, v24.4S +trn2 v4.4S, v15.4S, v24.4S +trn2 v15.2D, v0.2D, v5.2D +trn2 v24.2D, v29.2D, v4.2D +trn1 v20.2D, v0.2D, v5.2D +trn1 v19.2D, v29.2D, v4.2D +sqrdmulh v4.4S, v15.4S, v10.4S +mul v15.4S, v15.4S,v16.4S +mla v15.4S, v4.4S, v31.s[0] +sub v4.4s, v20.4s, v15.4s +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v24.4S, v10.4S +mul v24.4S, v24.4S,v16.4S +mla v24.4S, v15.4S, v31.s[0] +sub v15.4s, v19.4s, v24.4s +add v19.4s, v19.4s, v24.4s +sqrdmulh v24.4S, v19.4S, v17.4S +mul v19.4S, v19.4S,v8.4S +mla v19.4S, v24.4S, v31.s[0] +sub v24.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +sqrdmulh v19.4S, v15.4S, v14.4S +mul v15.4S, v15.4S,v27.4S +mla v15.4S, v19.4S, v31.s[0] +sub v19.4s, v4.4s, v15.4s +add v4.4s, v4.4s, v15.4s +str q20, [x0, #448] +str q24, [x0, #464] +str q4, [x0, #480] +str q19, [x0, #496] +ldr q19, [x17, #+1152] +ldr q4, [x17, #+1168] +ldr q24, [x17, #+1184] +ldr q20, [x17, #+1200] +ldr q15, [x17, #+1216] +ldr q29, [x17, #+1232] +ldr q5, [x17, #+1248] +ldr q0, [x17, #+1264] +ldr q14, [x0, #544] +ldr q27, [x0, #560] +ldr q17, [x0, #512] +ldr q8, [x0, #528] +sqrdmulh v10.4S, v14.4S, v4.s[0] +mul v14.4S, v14.4S,v19.s[0] +mla v14.4S, v10.4S, v31.s[0] +sub v10.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +sqrdmulh v14.4S, v27.4S, v4.s[0] +mul v27.4S, v27.4S,v19.s[0] +mla v27.4S, v14.4S, v31.s[0] +sub v14.4s, v8.4s, v27.4s +add v8.4s, v8.4s, v27.4s +sqrdmulh v27.4S, v8.4S, v4.s[1] +mul v8.4S, v8.4S,v19.s[1] +mla v8.4S, v27.4S, v31.s[0] +sub v27.4s, v17.4s, v8.4s +add v17.4s, v17.4s, v8.4s +sqrdmulh v8.4S, v14.4S, v4.s[2] +mul v14.4S, v14.4S,v19.s[2] +mla v14.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v14.4s +add v10.4s, v10.4s, v14.4s +trn1 v14.4S, v17.4S, v27.4S +trn2 v16.4S, 
v17.4S, v27.4S +trn1 v22.4S, v10.4S, v8.4S +trn2 v9.4S, v10.4S, v8.4S +trn2 v10.2D, v14.2D, v22.2D +trn2 v8.2D, v16.2D, v9.2D +trn1 v17.2D, v14.2D, v22.2D +trn1 v27.2D, v16.2D, v9.2D +sqrdmulh v9.4S, v10.4S, v20.4S +mul v10.4S, v10.4S,v24.4S +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v10.4s +add v17.4s, v17.4s, v10.4s +sqrdmulh v10.4S, v8.4S, v20.4S +mul v8.4S, v8.4S,v24.4S +mla v8.4S, v10.4S, v31.s[0] +sub v10.4s, v27.4s, v8.4s +add v27.4s, v27.4s, v8.4s +sqrdmulh v8.4S, v27.4S, v29.4S +mul v27.4S, v27.4S,v15.4S +mla v27.4S, v8.4S, v31.s[0] +sub v8.4s, v17.4s, v27.4s +add v17.4s, v17.4s, v27.4s +sqrdmulh v27.4S, v10.4S, v0.4S +mul v10.4S, v10.4S,v5.4S +mla v10.4S, v27.4S, v31.s[0] +sub v27.4s, v9.4s, v10.4s +add v9.4s, v9.4s, v10.4s +str q17, [x0, #512] +str q8, [x0, #528] +str q9, [x0, #544] +str q27, [x0, #560] +ldr q27, [x17, #+1280] +ldr q9, [x17, #+1296] +ldr q8, [x17, #+1312] +ldr q17, [x17, #+1328] +ldr q10, [x17, #+1344] +ldr q16, [x17, #+1360] +ldr q22, [x17, #+1376] +ldr q14, [x17, #+1392] +ldr q0, [x0, #608] +ldr q5, [x0, #624] +ldr q29, [x0, #576] +ldr q15, [x0, #592] +sqrdmulh v20.4S, v0.4S, v9.s[0] +mul v0.4S, v0.4S,v27.s[0] +mla v0.4S, v20.4S, v31.s[0] +sub v20.4s, v29.4s, v0.4s +add v29.4s, v29.4s, v0.4s +sqrdmulh v0.4S, v5.4S, v9.s[0] +mul v5.4S, v5.4S,v27.s[0] +mla v5.4S, v0.4S, v31.s[0] +sub v0.4s, v15.4s, v5.4s +add v15.4s, v15.4s, v5.4s +sqrdmulh v5.4S, v15.4S, v9.s[1] +mul v15.4S, v15.4S,v27.s[1] +mla v15.4S, v5.4S, v31.s[0] +sub v5.4s, v29.4s, v15.4s +add v29.4s, v29.4s, v15.4s +sqrdmulh v15.4S, v0.4S, v9.s[2] +mul v0.4S, v0.4S,v27.s[2] +mla v0.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v0.4s +add v20.4s, v20.4s, v0.4s +trn1 v0.4S, v29.4S, v5.4S +trn2 v24.4S, v29.4S, v5.4S +trn1 v4.4S, v20.4S, v15.4S +trn2 v19.4S, v20.4S, v15.4S +trn2 v20.2D, v0.2D, v4.2D +trn2 v15.2D, v24.2D, v19.2D +trn1 v29.2D, v0.2D, v4.2D +trn1 v5.2D, v24.2D, v19.2D +sqrdmulh v19.4S, v20.4S, v17.4S +mul v20.4S, v20.4S,v8.4S +mla v20.4S, v19.4S, v31.s[0] +sub 
v19.4s, v29.4s, v20.4s +add v29.4s, v29.4s, v20.4s +sqrdmulh v20.4S, v15.4S, v17.4S +mul v15.4S, v15.4S,v8.4S +mla v15.4S, v20.4S, v31.s[0] +sub v20.4s, v5.4s, v15.4s +add v5.4s, v5.4s, v15.4s +sqrdmulh v15.4S, v5.4S, v16.4S +mul v5.4S, v5.4S,v10.4S +mla v5.4S, v15.4S, v31.s[0] +sub v15.4s, v29.4s, v5.4s +add v29.4s, v29.4s, v5.4s +sqrdmulh v5.4S, v20.4S, v14.4S +mul v20.4S, v20.4S,v22.4S +mla v20.4S, v5.4S, v31.s[0] +sub v5.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +str q29, [x0, #576] +str q15, [x0, #592] +str q19, [x0, #608] +str q5, [x0, #624] +ldr q5, [x17, #+1408] +ldr q19, [x17, #+1424] +ldr q15, [x17, #+1440] +ldr q29, [x17, #+1456] +ldr q20, [x17, #+1472] +ldr q24, [x17, #+1488] +ldr q4, [x17, #+1504] +ldr q0, [x17, #+1520] +ldr q14, [x0, #672] +ldr q22, [x0, #688] +ldr q16, [x0, #640] +ldr q10, [x0, #656] +sqrdmulh v17.4S, v14.4S, v19.s[0] +mul v14.4S, v14.4S,v5.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v16.4s, v14.4s +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v22.4S, v19.s[0] +mul v22.4S, v22.4S,v5.s[0] +mla v22.4S, v14.4S, v31.s[0] +sub v14.4s, v10.4s, v22.4s +add v10.4s, v10.4s, v22.4s +sqrdmulh v22.4S, v10.4S, v19.s[1] +mul v10.4S, v10.4S,v5.s[1] +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v16.4s, v10.4s +add v16.4s, v16.4s, v10.4s +sqrdmulh v10.4S, v14.4S, v19.s[2] +mul v14.4S, v14.4S,v5.s[2] +mla v14.4S, v10.4S, v31.s[0] +sub v10.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +trn1 v14.4S, v16.4S, v22.4S +trn2 v8.4S, v16.4S, v22.4S +trn1 v9.4S, v17.4S, v10.4S +trn2 v27.4S, v17.4S, v10.4S +trn2 v17.2D, v14.2D, v9.2D +trn2 v10.2D, v8.2D, v27.2D +trn1 v16.2D, v14.2D, v9.2D +trn1 v22.2D, v8.2D, v27.2D +sqrdmulh v27.4S, v17.4S, v29.4S +mul v17.4S, v17.4S,v15.4S +mla v17.4S, v27.4S, v31.s[0] +sub v27.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v10.4S, v29.4S +mul v10.4S, v10.4S,v15.4S +mla v10.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v10.4s +add v22.4s, v22.4s, v10.4s +sqrdmulh v10.4S, v22.4S, v24.4S +mul v22.4S, 
v22.4S,v20.4S +mla v22.4S, v10.4S, v31.s[0] +sub v10.4s, v16.4s, v22.4s +add v16.4s, v16.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v0.4S +mul v17.4S, v17.4S,v4.4S +mla v17.4S, v22.4S, v31.s[0] +sub v22.4s, v27.4s, v17.4s +add v27.4s, v27.4s, v17.4s +str q16, [x0, #640] +str q10, [x0, #656] +str q27, [x0, #672] +str q22, [x0, #688] +ldr q22, [x17, #+1536] +ldr q27, [x17, #+1552] +ldr q10, [x17, #+1568] +ldr q16, [x17, #+1584] +ldr q17, [x17, #+1600] +ldr q8, [x17, #+1616] +ldr q9, [x17, #+1632] +ldr q14, [x17, #+1648] +ldr q0, [x0, #736] +ldr q4, [x0, #752] +ldr q24, [x0, #704] +ldr q20, [x0, #720] +sqrdmulh v29.4S, v0.4S, v27.s[0] +mul v0.4S, v0.4S,v22.s[0] +mla v0.4S, v29.4S, v31.s[0] +sub v29.4s, v24.4s, v0.4s +add v24.4s, v24.4s, v0.4s +sqrdmulh v0.4S, v4.4S, v27.s[0] +mul v4.4S, v4.4S,v22.s[0] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v20.4s, v4.4s +add v20.4s, v20.4s, v4.4s +sqrdmulh v4.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v22.s[1] +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v24.4s, v20.4s +add v24.4s, v24.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v27.s[2] +mul v0.4S, v0.4S,v22.s[2] +mla v0.4S, v20.4S, v31.s[0] +sub v20.4s, v29.4s, v0.4s +add v29.4s, v29.4s, v0.4s +trn1 v0.4S, v24.4S, v4.4S +trn2 v15.4S, v24.4S, v4.4S +trn1 v19.4S, v29.4S, v20.4S +trn2 v5.4S, v29.4S, v20.4S +trn2 v29.2D, v0.2D, v19.2D +trn2 v20.2D, v15.2D, v5.2D +trn1 v24.2D, v0.2D, v19.2D +trn1 v4.2D, v15.2D, v5.2D +sqrdmulh v5.4S, v29.4S, v16.4S +mul v29.4S, v29.4S,v10.4S +mla v29.4S, v5.4S, v31.s[0] +sub v5.4s, v24.4s, v29.4s +add v24.4s, v24.4s, v29.4s +sqrdmulh v29.4S, v20.4S, v16.4S +mul v20.4S, v20.4S,v10.4S +mla v20.4S, v29.4S, v31.s[0] +sub v29.4s, v4.4s, v20.4s +add v4.4s, v4.4s, v20.4s +sqrdmulh v20.4S, v4.4S, v8.4S +mul v4.4S, v4.4S,v17.4S +mla v4.4S, v20.4S, v31.s[0] +sub v20.4s, v24.4s, v4.4s +add v24.4s, v24.4s, v4.4s +sqrdmulh v4.4S, v29.4S, v14.4S +mul v29.4S, v29.4S,v9.4S +mla v29.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v29.4s +add v5.4s, v5.4s, v29.4s +str q24, [x0, #704] +str q20, [x0, 
#720] +str q5, [x0, #736] +str q4, [x0, #752] +ldr q4, [x17, #+1664] +ldr q5, [x17, #+1680] +ldr q20, [x17, #+1696] +ldr q24, [x17, #+1712] +ldr q29, [x17, #+1728] +ldr q15, [x17, #+1744] +ldr q19, [x17, #+1760] +ldr q0, [x17, #+1776] +ldr q14, [x0, #800] +ldr q9, [x0, #816] +ldr q8, [x0, #768] +ldr q17, [x0, #784] +sqrdmulh v16.4S, v14.4S, v5.s[0] +mul v14.4S, v14.4S,v4.s[0] +mla v14.4S, v16.4S, v31.s[0] +sub v16.4s, v8.4s, v14.4s +add v8.4s, v8.4s, v14.4s +sqrdmulh v14.4S, v9.4S, v5.s[0] +mul v9.4S, v9.4S,v4.s[0] +mla v9.4S, v14.4S, v31.s[0] +sub v14.4s, v17.4s, v9.4s +add v17.4s, v17.4s, v9.4s +sqrdmulh v9.4S, v17.4S, v5.s[1] +mul v17.4S, v17.4S,v4.s[1] +mla v17.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v17.4s +add v8.4s, v8.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v5.s[2] +mul v14.4S, v14.4S,v4.s[2] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v16.4s, v14.4s +add v16.4s, v16.4s, v14.4s +trn1 v14.4S, v8.4S, v9.4S +trn2 v10.4S, v8.4S, v9.4S +trn1 v27.4S, v16.4S, v17.4S +trn2 v22.4S, v16.4S, v17.4S +trn2 v16.2D, v14.2D, v27.2D +trn2 v17.2D, v10.2D, v22.2D +trn1 v8.2D, v14.2D, v27.2D +trn1 v9.2D, v10.2D, v22.2D +sqrdmulh v22.4S, v16.4S, v24.4S +mul v16.4S, v16.4S,v20.4S +mla v16.4S, v22.4S, v31.s[0] +sub v22.4s, v8.4s, v16.4s +add v8.4s, v8.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v24.4S +mul v17.4S, v17.4S,v20.4S +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v9.4s, v17.4s +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v9.4S, v15.4S +mul v9.4S, v9.4S,v29.4S +mla v9.4S, v17.4S, v31.s[0] +sub v17.4s, v8.4s, v9.4s +add v8.4s, v8.4s, v9.4s +sqrdmulh v9.4S, v16.4S, v0.4S +mul v16.4S, v16.4S,v19.4S +mla v16.4S, v9.4S, v31.s[0] +sub v9.4s, v22.4s, v16.4s +add v22.4s, v22.4s, v16.4s +str q8, [x0, #768] +str q17, [x0, #784] +str q22, [x0, #800] +str q9, [x0, #816] +ldr q9, [x17, #+1792] +ldr q22, [x17, #+1808] +ldr q17, [x17, #+1824] +ldr q8, [x17, #+1840] +ldr q16, [x17, #+1856] +ldr q10, [x17, #+1872] +ldr q27, [x17, #+1888] +ldr q14, [x17, #+1904] +ldr q0, [x0, #864] +ldr q19, [x0, #880] 
+ldr q15, [x0, #832] +ldr q29, [x0, #848] +sqrdmulh v24.4S, v0.4S, v22.s[0] +mul v0.4S, v0.4S,v9.s[0] +mla v0.4S, v24.4S, v31.s[0] +sub v24.4s, v15.4s, v0.4s +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v9.s[0] +mla v19.4S, v0.4S, v31.s[0] +sub v0.4s, v29.4s, v19.4s +add v29.4s, v29.4s, v19.4s +sqrdmulh v19.4S, v29.4S, v22.s[1] +mul v29.4S, v29.4S,v9.s[1] +mla v29.4S, v19.4S, v31.s[0] +sub v19.4s, v15.4s, v29.4s +add v15.4s, v15.4s, v29.4s +sqrdmulh v29.4S, v0.4S, v22.s[2] +mul v0.4S, v0.4S,v9.s[2] +mla v0.4S, v29.4S, v31.s[0] +sub v29.4s, v24.4s, v0.4s +add v24.4s, v24.4s, v0.4s +trn1 v0.4S, v15.4S, v19.4S +trn2 v20.4S, v15.4S, v19.4S +trn1 v5.4S, v24.4S, v29.4S +trn2 v4.4S, v24.4S, v29.4S +trn2 v24.2D, v0.2D, v5.2D +trn2 v29.2D, v20.2D, v4.2D +trn1 v15.2D, v0.2D, v5.2D +trn1 v19.2D, v20.2D, v4.2D +sqrdmulh v4.4S, v24.4S, v8.4S +mul v24.4S, v24.4S,v17.4S +mla v24.4S, v4.4S, v31.s[0] +sub v4.4s, v15.4s, v24.4s +add v15.4s, v15.4s, v24.4s +sqrdmulh v24.4S, v29.4S, v8.4S +mul v29.4S, v29.4S,v17.4S +mla v29.4S, v24.4S, v31.s[0] +sub v24.4s, v19.4s, v29.4s +add v19.4s, v19.4s, v29.4s +sqrdmulh v29.4S, v19.4S, v10.4S +mul v19.4S, v19.4S,v16.4S +mla v19.4S, v29.4S, v31.s[0] +sub v29.4s, v15.4s, v19.4s +add v15.4s, v15.4s, v19.4s +sqrdmulh v19.4S, v24.4S, v14.4S +mul v24.4S, v24.4S,v27.4S +mla v24.4S, v19.4S, v31.s[0] +sub v19.4s, v4.4s, v24.4s +add v4.4s, v4.4s, v24.4s +str q15, [x0, #832] +str q29, [x0, #848] +str q4, [x0, #864] +str q19, [x0, #880] +ldr q19, [x17, #+1920] +ldr q4, [x17, #+1936] +ldr q29, [x17, #+1952] +ldr q15, [x17, #+1968] +ldr q24, [x17, #+1984] +ldr q20, [x17, #+2000] +ldr q5, [x17, #+2016] +ldr q0, [x17, #+2032] +ldr q14, [x0, #928] +ldr q27, [x0, #944] +ldr q10, [x0, #896] +ldr q16, [x0, #912] +sqrdmulh v8.4S, v14.4S, v4.s[0] +mul v14.4S, v14.4S,v19.s[0] +mla v14.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v14.4s +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v27.4S, v4.s[0] +mul v27.4S, v27.4S,v19.s[0] +mla v27.4S, 
v14.4S, v31.s[0] +sub v14.4s, v16.4s, v27.4s +add v16.4s, v16.4s, v27.4s +sqrdmulh v27.4S, v16.4S, v4.s[1] +mul v16.4S, v16.4S,v19.s[1] +mla v16.4S, v27.4S, v31.s[0] +sub v27.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sqrdmulh v16.4S, v14.4S, v4.s[2] +mul v14.4S, v14.4S,v19.s[2] +mla v14.4S, v16.4S, v31.s[0] +sub v16.4s, v8.4s, v14.4s +add v8.4s, v8.4s, v14.4s +trn1 v14.4S, v10.4S, v27.4S +trn2 v17.4S, v10.4S, v27.4S +trn1 v22.4S, v8.4S, v16.4S +trn2 v9.4S, v8.4S, v16.4S +trn2 v8.2D, v14.2D, v22.2D +trn2 v16.2D, v17.2D, v9.2D +trn1 v10.2D, v14.2D, v22.2D +trn1 v27.2D, v17.2D, v9.2D +sqrdmulh v9.4S, v8.4S, v15.4S +mul v8.4S, v8.4S,v29.4S +mla v8.4S, v9.4S, v31.s[0] +sub v9.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +sqrdmulh v8.4S, v16.4S, v15.4S +mul v16.4S, v16.4S,v29.4S +mla v16.4S, v8.4S, v31.s[0] +sub v8.4s, v27.4s, v16.4s +add v27.4s, v27.4s, v16.4s +sqrdmulh v16.4S, v27.4S, v20.4S +mul v27.4S, v27.4S,v24.4S +mla v27.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v27.4s +add v10.4s, v10.4s, v27.4s +sqrdmulh v27.4S, v8.4S, v0.4S +mul v8.4S, v8.4S,v5.4S +mla v8.4S, v27.4S, v31.s[0] +sub v27.4s, v9.4s, v8.4s +add v9.4s, v9.4s, v8.4s +str q10, [x0, #896] +str q16, [x0, #912] +str q9, [x0, #928] +str q27, [x0, #944] +ldr q27, [x17, #+2048] +ldr q9, [x17, #+2064] +ldr q16, [x17, #+2080] +ldr q10, [x17, #+2096] +ldr q8, [x17, #+2112] +ldr q17, [x17, #+2128] +ldr q22, [x17, #+2144] +ldr q14, [x17, #+2160] +ldr q0, [x0, #992] +ldr q5, [x0, #1008] +ldr q20, [x0, #960] +ldr q24, [x0, #976] +sqrdmulh v15.4S, v0.4S, v9.s[0] +mul v0.4S, v0.4S,v27.s[0] +mla v0.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v0.4s +add v20.4s, v20.4s, v0.4s +sqrdmulh v0.4S, v5.4S, v9.s[0] +mul v5.4S, v5.4S,v27.s[0] +mla v5.4S, v0.4S, v31.s[0] +sub v0.4s, v24.4s, v5.4s +add v24.4s, v24.4s, v5.4s +sqrdmulh v5.4S, v24.4S, v9.s[1] +mul v24.4S, v24.4S,v27.s[1] +mla v24.4S, v5.4S, v31.s[0] +sub v5.4s, v20.4s, v24.4s +add v20.4s, v20.4s, v24.4s +sqrdmulh v24.4S, v0.4S, v9.s[2] +mul v0.4S, v0.4S,v27.s[2] 
+mla v0.4S, v24.4S, v31.s[0] +sub v24.4s, v15.4s, v0.4s +add v15.4s, v15.4s, v0.4s +trn1 v0.4S, v20.4S, v5.4S +trn2 v29.4S, v20.4S, v5.4S +trn1 v4.4S, v15.4S, v24.4S +trn2 v19.4S, v15.4S, v24.4S +trn2 v15.2D, v0.2D, v4.2D +trn2 v24.2D, v29.2D, v19.2D +trn1 v20.2D, v0.2D, v4.2D +trn1 v5.2D, v29.2D, v19.2D +sqrdmulh v19.4S, v15.4S, v10.4S +mul v15.4S, v15.4S,v16.4S +mla v15.4S, v19.4S, v31.s[0] +sub v19.4s, v20.4s, v15.4s +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v24.4S, v10.4S +mul v24.4S, v24.4S,v16.4S +mla v24.4S, v15.4S, v31.s[0] +sub v15.4s, v5.4s, v24.4s +add v5.4s, v5.4s, v24.4s +sqrdmulh v24.4S, v5.4S, v17.4S +mul v5.4S, v5.4S,v8.4S +mla v5.4S, v24.4S, v31.s[0] +sub v24.4s, v20.4s, v5.4s +add v20.4s, v20.4s, v5.4s +sqrdmulh v5.4S, v15.4S, v14.4S +mul v15.4S, v15.4S,v22.4S +mla v15.4S, v5.4S, v31.s[0] +sub v5.4s, v19.4s, v15.4s +add v19.4s, v19.4s, v15.4s +str q20, [x0, #960] +str q24, [x0, #976] +str q19, [x0, #992] +str q5, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_13_0.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_13_0.s new file mode 100644 index 0000000..34599c2 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_13_0.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including 
without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: // 16-byte constant block; the routine below references it with ASM_LOAD (x17, modulus) followed by ldr q31, [x17] +.word -33556993 // lane 0: negated modulus, i.e. minus 33556993 (the same 33556993 that appears in this file name) +.word 0 // lanes 1 to 3 are zero, filling the block out to four 32-bit words +.word 0 +.word 0 +.align 6 // NOTE(review): on AArch64 gas this should request a 2^6 = 64-byte boundary for the table that follows; confirm for the target assembler +roots_merged: // Twiddle table laid out in groups of 4 words: per the layer/block tags, each group of 4 values is followed by 4 companion words for the same entries. NOTE(review): the companions look like round(value * 2^31 / 33556993), presumably the sqrdmulh operands; confirm against the generator +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896
// Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 
7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 
+.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // 
Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 
+.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 
1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, 
block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 +.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 
9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 
// Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 
+.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_13_0 +.global _ntt_u32_full_neon_asm_var_4_4_13_0 +ntt_u32_full_neon_asm_var_4_4_13_0: +_ntt_u32_full_neon_asm_var_4_4_13_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] 
+ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #928] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #992] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q18, [x0, #800] +sqrdmulh v17.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +ldr q16, [x0, #864] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +ldr q2, [x0, #544] +mla v22.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v2.4S, v29.s[0] +ldr q1, [x0, #608] +mla v20.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v1.4S, v29.s[0] +ldr q0, [x0, #672] +mla v18.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v0.4S, v29.s[0] +ldr q15, [x0, #736] +mla v16.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v15.4S, v29.s[0] +ldr q14, [x0, #416] +ldr q13, [x0, #480] +mul v2.4S, v2.4S,v30.s[0] +sub v12.4s, v14.4s, v22.4s +mul v1.4S, v1.4S,v30.s[0] +add v14.4s, v14.4s, v22.4s +ldr q22, [x0, #288] +ldr q11, [x0, #352] +mla v2.4S, v21.4S, v31.s[0] +sub v21.4s, v13.4s, v20.4s +mla v1.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v20.4s +ldr q20, [x0, #32] +ldr q19, [x0, #96] +mul v0.4S, v0.4S,v30.s[0] +sub v10.4s, v22.4s, v18.4s +mul v15.4S, v15.4S,v30.s[0] +add v22.4s, v22.4s, v18.4s +ldr q18, [x0, #160] +ldr q9, [x0, #224] +mla v0.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v16.4s +mla v15.4S, v3.4S, v31.s[0] +add v11.4s, v11.4s, v16.4s +sqrdmulh v16.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +sqrdmulh v3.4S, v13.4S, v29.s[1] +sub v8.4s, v20.4s, v2.4s +mul v13.4S, v13.4S,v30.s[1] +add v20.4s, v20.4s, v2.4s +sqrdmulh v2.4S, v22.4S, v29.s[1] +sub v7.4s, v19.4s, v1.4s +mul v22.4S, v22.4S,v30.s[1] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v11.4S, v29.s[1] +sub v6.4s, v18.4s, v0.4s +mul v11.4S, v11.4S,v30.s[1] +add v18.4s, v18.4s, v0.4s +mla v14.4S, v16.4S, v31.s[0] +sub v16.4s, v9.4s, v15.4s +sqrdmulh v0.4S, v12.4S, v29.s[2] 
+add v9.4s, v9.4s, v15.4s +mla v13.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v21.4S, v29.s[2] +mla v22.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v10.4S, v29.s[2] +mla v11.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v17.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +sub v15.4s, v18.4s, v14.4s +mul v21.4S, v21.4S,v30.s[2] +add v18.4s, v18.4s, v14.4s +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v9.4s, v13.4s +mla v21.4S, v3.4S, v31.s[0] +add v9.4s, v9.4s, v13.4s +mul v10.4S, v10.4S,v30.s[2] +sub v13.4s, v20.4s, v22.4s +mul v17.4S, v17.4S,v30.s[2] +add v20.4s, v20.4s, v22.4s +mla v10.4S, v2.4S, v31.s[0] +sub v2.4s, v19.4s, v11.4s +mla v17.4S, v1.4S, v31.s[0] +add v19.4s, v19.4s, v11.4s +sqrdmulh v11.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +sqrdmulh v1.4S, v0.4S, v27.s[1] +sub v22.4s, v6.4s, v12.4s +mul v0.4S, v0.4S,v28.s[1] +add v6.4s, v6.4s, v12.4s +sqrdmulh v12.4S, v18.4S, v27.s[0] +sub v3.4s, v16.4s, v21.4s +mul v18.4S, v18.4S,v28.s[0] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v9.4S, v27.s[0] +sub v14.4s, v8.4s, v10.4s +mul v9.4S, v9.4S,v28.s[0] +add v8.4s, v8.4s, v10.4s +mla v15.4S, v11.4S, v31.s[0] +sub v11.4s, v7.4s, v17.4s +sqrdmulh v10.4S, v6.4S, v27.s[2] +add v7.4s, v7.4s, v17.4s +mla v0.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v16.4S, v27.s[2] +mla v18.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v22.4S, v27.s[3] +mla v9.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v3.4S, v27.s[3] +mul v6.4S, v6.4S,v28.s[2] +sub v17.4s, v13.4s, v15.4s +mul v16.4S, v16.4S,v28.s[2] +add v13.4s, v13.4s, v15.4s +mla v6.4S, v10.4S, v31.s[0] +sub v10.4s, v2.4s, v0.4s +mla v16.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v0.4s +mul v22.4S, v22.4S,v28.s[3] +sub v0.4s, v20.4s, v18.4s +mul v3.4S, v3.4S,v28.s[3] +add v20.4s, v20.4s, v18.4s +mla v22.4S, v12.4S, v31.s[0] +sub v12.4s, v19.4s, v9.4s +mla v3.4S, v21.4S, v31.s[0] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v2.4S, v25.s[2] +mul v2.4S, v2.4S,v26.s[2] +sqrdmulh v21.4S, v10.4S, v25.s[3] +sub v18.4s, v8.4s, v6.4s +mul v10.4S, v10.4S,v26.s[3] +add v8.4s, v8.4s, 
v6.4s +sqrdmulh v6.4S, v12.4S, v25.s[1] +sub v1.4s, v7.4s, v16.4s +mul v12.4S, v12.4S,v26.s[1] +add v7.4s, v7.4s, v16.4s +sqrdmulh v16.4S, v19.4S, v25.s[0] +sub v15.4s, v14.4s, v22.4s +mul v19.4S, v19.4S,v26.s[0] +add v14.4s, v14.4s, v22.4s +mla v2.4S, v9.4S, v31.s[0] +sub v9.4s, v11.4s, v3.4s +sqrdmulh v22.4S, v7.4S, v23.s[0] +add v11.4s, v11.4s, v3.4s +mla v10.4S, v21.4S, v31.s[0] +sub v21.4s, v13.4s, v2.4s +sqrdmulh v3.4S, v1.4S, v23.s[1] +add v13.4s, v13.4s, v2.4s +mla v12.4S, v6.4S, v31.s[0] +sub v6.4s, v17.4s, v10.4s +sqrdmulh v2.4S, v11.4S, v23.s[2] +add v17.4s, v17.4s, v10.4s +mla v19.4S, v16.4S, v31.s[0] +sub v16.4s, v0.4s, v12.4s +sqrdmulh v10.4S, v9.4S, v23.s[3] +add v0.4s, v0.4s, v12.4s +mul v7.4S, v7.4S,v24.s[0] +sub v12.4s, v20.4s, v19.4s +mul v1.4S, v1.4S,v24.s[1] +add v20.4s, v20.4s, v19.4s +mla v7.4S, v22.4S, v31.s[0] +str q21, [x0, #352] +mla v1.4S, v3.4S, v31.s[0] +str q13, [x0, #288] +mul v11.4S, v11.4S,v24.s[2] +str q6, [x0, #480] +mul v9.4S, v9.4S,v24.s[3] +str q17, [x0, #416] +mla v11.4S, v2.4S, v31.s[0] +str q16, [x0, #224] +mla v9.4S, v10.4S, v31.s[0] +str q0, [x0, #160] +ldr q0, [x0, #944] +sqrdmulh v10.4S, v0.4S, v29.s[0] +str q12, [x0, #96] +mul v0.4S, v0.4S,v30.s[0] +str q20, [x0, #32] +ldr q20, [x0, #1008] +sqrdmulh v12.4S, v20.4S, v29.s[0] +sub v16.4s, v8.4s, v7.4s +str q16, [x0, #608] +mul v20.4S, v20.4S,v30.s[0] +add v8.4s, v8.4s, v7.4s +ldr q7, [x0, #816] +sqrdmulh v16.4S, v7.4S, v29.s[0] +sub v2.4s, v18.4s, v1.4s +str q8, [x0, #544] +mul v7.4S, v7.4S,v30.s[0] +add v18.4s, v18.4s, v1.4s +ldr q1, [x0, #880] +sqrdmulh v8.4S, v1.4S, v29.s[0] +sub v17.4s, v14.4s, v11.4s +str q2, [x0, #736] +mul v1.4S, v1.4S,v30.s[0] +add v14.4s, v14.4s, v11.4s +ldr q11, [x0, #560] +mla v0.4S, v10.4S, v31.s[0] +sub v10.4s, v15.4s, v9.4s +str q18, [x0, #672] +sqrdmulh v18.4S, v11.4S, v29.s[0] +add v15.4s, v15.4s, v9.4s +ldr q9, [x0, #624] +mla v20.4S, v12.4S, v31.s[0] +str q17, [x0, #864] +sqrdmulh v17.4S, v9.4S, v29.s[0] +ldr q12, [x0, #688] +mla v7.4S, 
v16.4S, v31.s[0] +str q14, [x0, #800] +sqrdmulh v14.4S, v12.4S, v29.s[0] +ldr q16, [x0, #752] +mla v1.4S, v8.4S, v31.s[0] +str q10, [x0, #992] +sqrdmulh v10.4S, v16.4S, v29.s[0] +ldr q8, [x0, #432] +ldr q2, [x0, #496] +mul v11.4S, v11.4S,v30.s[0] +sub v6.4s, v8.4s, v0.4s +str q15, [x0, #928] +mul v9.4S, v9.4S,v30.s[0] +add v8.4s, v8.4s, v0.4s +ldr q0, [x0, #304] +ldr q15, [x0, #368] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v2.4s, v20.4s +mla v9.4S, v17.4S, v31.s[0] +add v2.4s, v2.4s, v20.4s +ldr q20, [x0, #48] +ldr q17, [x0, #112] +mul v12.4S, v12.4S,v30.s[0] +sub v13.4s, v0.4s, v7.4s +mul v16.4S, v16.4S,v30.s[0] +add v0.4s, v0.4s, v7.4s +ldr q7, [x0, #176] +ldr q3, [x0, #240] +mla v12.4S, v14.4S, v31.s[0] +sub v14.4s, v15.4s, v1.4s +mla v16.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sqrdmulh v10.4S, v2.4S, v29.s[1] +sub v21.4s, v20.4s, v11.4s +mul v2.4S, v2.4S,v30.s[1] +add v20.4s, v20.4s, v11.4s +sqrdmulh v11.4S, v0.4S, v29.s[1] +sub v22.4s, v17.4s, v9.4s +mul v0.4S, v0.4S,v30.s[1] +add v17.4s, v17.4s, v9.4s +sqrdmulh v9.4S, v15.4S, v29.s[1] +sub v19.4s, v7.4s, v12.4s +mul v15.4S, v15.4S,v30.s[1] +add v7.4s, v7.4s, v12.4s +mla v8.4S, v1.4S, v31.s[0] +sub v1.4s, v3.4s, v16.4s +sqrdmulh v12.4S, v6.4S, v29.s[2] +add v3.4s, v3.4s, v16.4s +mla v2.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v18.4S, v29.s[2] +mla v0.4S, v11.4S, v31.s[0] +sqrdmulh v11.4S, v13.4S, v29.s[2] +mla v15.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v14.4S, v29.s[2] +mul v6.4S, v6.4S,v30.s[2] +sub v16.4s, v7.4s, v8.4s +mul v18.4S, v18.4S,v30.s[2] +add v7.4s, v7.4s, v8.4s +mla v6.4S, v12.4S, v31.s[0] +sub v12.4s, v3.4s, v2.4s +mla v18.4S, v10.4S, v31.s[0] +add v3.4s, v3.4s, v2.4s +mul v13.4S, v13.4S,v30.s[2] +sub v2.4s, v20.4s, v0.4s +mul v14.4S, v14.4S,v30.s[2] +add v20.4s, v20.4s, v0.4s +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v17.4s, v15.4s +mla v14.4S, v9.4S, v31.s[0] +add v17.4s, v17.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v27.s[1] +mul 
v16.4S, v16.4S,v28.s[1] +sqrdmulh v9.4S, v12.4S, v27.s[1] +sub v0.4s, v19.4s, v6.4s +mul v12.4S, v12.4S,v28.s[1] +add v19.4s, v19.4s, v6.4s +sqrdmulh v6.4S, v7.4S, v27.s[0] +sub v10.4s, v1.4s, v18.4s +mul v7.4S, v7.4S,v28.s[0] +add v1.4s, v1.4s, v18.4s +sqrdmulh v18.4S, v3.4S, v27.s[0] +sub v8.4s, v21.4s, v13.4s +mul v3.4S, v3.4S,v28.s[0] +add v21.4s, v21.4s, v13.4s +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v22.4s, v14.4s +sqrdmulh v13.4S, v19.4S, v27.s[2] +add v22.4s, v22.4s, v14.4s +mla v12.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v1.4S, v27.s[2] +mla v7.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v0.4S, v27.s[3] +mla v3.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v10.4S, v27.s[3] +mul v19.4S, v19.4S,v28.s[2] +sub v14.4s, v2.4s, v16.4s +mul v1.4S, v1.4S,v28.s[2] +add v2.4s, v2.4s, v16.4s +mla v19.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v12.4s +mla v1.4S, v9.4S, v31.s[0] +add v11.4s, v11.4s, v12.4s +mul v0.4S, v0.4S,v28.s[3] +sub v12.4s, v20.4s, v7.4s +mul v10.4S, v10.4S,v28.s[3] +add v20.4s, v20.4s, v7.4s +mla v0.4S, v6.4S, v31.s[0] +sub v6.4s, v17.4s, v3.4s +mla v10.4S, v18.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v25.s[2] +mul v11.4S, v11.4S,v26.s[2] +sqrdmulh v18.4S, v13.4S, v25.s[3] +sub v7.4s, v21.4s, v19.4s +mul v13.4S, v13.4S,v26.s[3] +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v25.s[1] +sub v9.4s, v22.4s, v1.4s +mul v6.4S, v6.4S,v26.s[1] +add v22.4s, v22.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v25.s[0] +sub v16.4s, v8.4s, v0.4s +mul v17.4S, v17.4S,v26.s[0] +add v8.4s, v8.4s, v0.4s +mla v11.4S, v3.4S, v31.s[0] +sub v3.4s, v15.4s, v10.4s +sqrdmulh v0.4S, v22.4S, v23.s[0] +add v15.4s, v15.4s, v10.4s +mla v13.4S, v18.4S, v31.s[0] +sub v18.4s, v2.4s, v11.4s +sqrdmulh v10.4S, v9.4S, v23.s[1] +add v2.4s, v2.4s, v11.4s +mla v6.4S, v19.4S, v31.s[0] +sub v19.4s, v14.4s, v13.4s +sqrdmulh v11.4S, v15.4S, v23.s[2] +add v14.4s, v14.4s, v13.4s +mla v17.4S, v1.4S, v31.s[0] +sub v1.4s, v12.4s, v6.4s +sqrdmulh v13.4S, v3.4S, v23.s[3] +add v12.4s, v12.4s, 
v6.4s +mul v22.4S, v22.4S,v24.s[0] +sub v6.4s, v20.4s, v17.4s +mul v9.4S, v9.4S,v24.s[1] +add v20.4s, v20.4s, v17.4s +mla v22.4S, v0.4S, v31.s[0] +str q18, [x0, #368] +mla v9.4S, v10.4S, v31.s[0] +str q2, [x0, #304] +mul v15.4S, v15.4S,v24.s[2] +str q19, [x0, #496] +mul v3.4S, v3.4S,v24.s[3] +str q14, [x0, #432] +mla v15.4S, v11.4S, v31.s[0] +str q1, [x0, #240] +mla v3.4S, v13.4S, v31.s[0] +str q12, [x0, #176] +ldr q12, [x0, #896] +sqrdmulh v13.4S, v12.4S, v29.s[0] +str q6, [x0, #112] +mul v12.4S, v12.4S,v30.s[0] +str q20, [x0, #48] +ldr q20, [x0, #960] +sqrdmulh v6.4S, v20.4S, v29.s[0] +sub v1.4s, v21.4s, v22.4s +str q1, [x0, #624] +mul v20.4S, v20.4S,v30.s[0] +add v21.4s, v21.4s, v22.4s +ldr q22, [x0, #768] +sqrdmulh v1.4S, v22.4S, v29.s[0] +sub v11.4s, v7.4s, v9.4s +str q21, [x0, #560] +mul v22.4S, v22.4S,v30.s[0] +add v7.4s, v7.4s, v9.4s +ldr q9, [x0, #832] +sqrdmulh v21.4S, v9.4S, v29.s[0] +sub v14.4s, v8.4s, v15.4s +str q11, [x0, #752] +mul v9.4S, v9.4S,v30.s[0] +add v8.4s, v8.4s, v15.4s +ldr q15, [x0, #512] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v3.4s +str q7, [x0, #688] +sqrdmulh v7.4S, v15.4S, v29.s[0] +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #576] +mla v20.4S, v6.4S, v31.s[0] +str q14, [x0, #880] +sqrdmulh v14.4S, v3.4S, v29.s[0] +ldr q6, [x0, #640] +mla v22.4S, v1.4S, v31.s[0] +str q8, [x0, #816] +sqrdmulh v8.4S, v6.4S, v29.s[0] +ldr q1, [x0, #704] +mla v9.4S, v21.4S, v31.s[0] +str q13, [x0, #1008] +sqrdmulh v13.4S, v1.4S, v29.s[0] +ldr q21, [x0, #384] +ldr q11, [x0, #448] +mul v15.4S, v15.4S,v30.s[0] +sub v19.4s, v21.4s, v12.4s +str q16, [x0, #944] +mul v3.4S, v3.4S,v30.s[0] +add v21.4s, v21.4s, v12.4s +ldr q12, [x0, #256] +ldr q16, [x0, #320] +mla v15.4S, v7.4S, v31.s[0] +sub v7.4s, v11.4s, v20.4s +mla v3.4S, v14.4S, v31.s[0] +add v11.4s, v11.4s, v20.4s +ldr q20, [x0, #0] +ldr q14, [x0, #64] +mul v6.4S, v6.4S,v30.s[0] +sub v2.4s, v12.4s, v22.4s +mul v1.4S, v1.4S,v30.s[0] +add v12.4s, v12.4s, v22.4s +ldr q22, [x0, #128] +ldr q10, [x0, #192] 
+mla v6.4S, v8.4S, v31.s[0] +sub v8.4s, v16.4s, v9.4s +mla v1.4S, v13.4S, v31.s[0] +add v16.4s, v16.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sqrdmulh v13.4S, v11.4S, v29.s[1] +sub v18.4s, v20.4s, v15.4s +mul v11.4S, v11.4S,v30.s[1] +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v12.4S, v29.s[1] +sub v0.4s, v14.4s, v3.4s +mul v12.4S, v12.4S,v30.s[1] +add v14.4s, v14.4s, v3.4s +sqrdmulh v3.4S, v16.4S, v29.s[1] +sub v17.4s, v22.4s, v6.4s +mul v16.4S, v16.4S,v30.s[1] +add v22.4s, v22.4s, v6.4s +mla v21.4S, v9.4S, v31.s[0] +sub v9.4s, v10.4s, v1.4s +sqrdmulh v6.4S, v19.4S, v29.s[2] +add v10.4s, v10.4s, v1.4s +mla v11.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v7.4S, v29.s[2] +mla v12.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v2.4S, v29.s[2] +mla v16.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v8.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v1.4s, v22.4s, v21.4s +mul v7.4S, v7.4S,v30.s[2] +add v22.4s, v22.4s, v21.4s +mla v19.4S, v6.4S, v31.s[0] +sub v6.4s, v10.4s, v11.4s +mla v7.4S, v13.4S, v31.s[0] +add v10.4s, v10.4s, v11.4s +mul v2.4S, v2.4S,v30.s[2] +sub v11.4s, v20.4s, v12.4s +mul v8.4S, v8.4S,v30.s[2] +add v20.4s, v20.4s, v12.4s +mla v2.4S, v15.4S, v31.s[0] +sub v15.4s, v14.4s, v16.4s +mla v8.4S, v3.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +sqrdmulh v16.4S, v1.4S, v27.s[1] +mul v1.4S, v1.4S,v28.s[1] +sqrdmulh v3.4S, v6.4S, v27.s[1] +sub v12.4s, v17.4s, v19.4s +mul v6.4S, v6.4S,v28.s[1] +add v17.4s, v17.4s, v19.4s +sqrdmulh v19.4S, v22.4S, v27.s[0] +sub v13.4s, v9.4s, v7.4s +mul v22.4S, v22.4S,v28.s[0] +add v9.4s, v9.4s, v7.4s +sqrdmulh v7.4S, v10.4S, v27.s[0] +sub v21.4s, v18.4s, v2.4s +mul v10.4S, v10.4S,v28.s[0] +add v18.4s, v18.4s, v2.4s +mla v1.4S, v16.4S, v31.s[0] +sub v16.4s, v0.4s, v8.4s +sqrdmulh v2.4S, v17.4S, v27.s[2] +add v0.4s, v0.4s, v8.4s +mla v6.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v9.4S, v27.s[2] +mla v22.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v12.4S, v27.s[3] +mla v10.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v13.4S, v27.s[3] +mul 
v17.4S, v17.4S,v28.s[2] +sub v8.4s, v11.4s, v1.4s +mul v9.4S, v9.4S,v28.s[2] +add v11.4s, v11.4s, v1.4s +mla v17.4S, v2.4S, v31.s[0] +sub v2.4s, v15.4s, v6.4s +mla v9.4S, v3.4S, v31.s[0] +add v15.4s, v15.4s, v6.4s +mul v12.4S, v12.4S,v28.s[3] +sub v6.4s, v20.4s, v22.4s +mul v13.4S, v13.4S,v28.s[3] +add v20.4s, v20.4s, v22.4s +mla v12.4S, v19.4S, v31.s[0] +sub v19.4s, v14.4s, v10.4s +mla v13.4S, v7.4S, v31.s[0] +add v14.4s, v14.4s, v10.4s +sqrdmulh v10.4S, v15.4S, v25.s[2] +mul v15.4S, v15.4S,v26.s[2] +sqrdmulh v7.4S, v2.4S, v25.s[3] +sub v22.4s, v18.4s, v17.4s +mul v2.4S, v2.4S,v26.s[3] +add v18.4s, v18.4s, v17.4s +sqrdmulh v17.4S, v19.4S, v25.s[1] +sub v3.4s, v0.4s, v9.4s +mul v19.4S, v19.4S,v26.s[1] +add v0.4s, v0.4s, v9.4s +sqrdmulh v9.4S, v14.4S, v25.s[0] +sub v1.4s, v21.4s, v12.4s +mul v14.4S, v14.4S,v26.s[0] +add v21.4s, v21.4s, v12.4s +mla v15.4S, v10.4S, v31.s[0] +sub v10.4s, v16.4s, v13.4s +sqrdmulh v12.4S, v0.4S, v23.s[0] +add v16.4s, v16.4s, v13.4s +mla v2.4S, v7.4S, v31.s[0] +sub v7.4s, v11.4s, v15.4s +sqrdmulh v13.4S, v3.4S, v23.s[1] +add v11.4s, v11.4s, v15.4s +mla v19.4S, v17.4S, v31.s[0] +sub v17.4s, v8.4s, v2.4s +sqrdmulh v15.4S, v16.4S, v23.s[2] +add v8.4s, v8.4s, v2.4s +mla v14.4S, v9.4S, v31.s[0] +sub v9.4s, v6.4s, v19.4s +sqrdmulh v2.4S, v10.4S, v23.s[3] +add v6.4s, v6.4s, v19.4s +mul v0.4S, v0.4S,v24.s[0] +sub v19.4s, v20.4s, v14.4s +mul v3.4S, v3.4S,v24.s[1] +add v20.4s, v20.4s, v14.4s +mla v0.4S, v12.4S, v31.s[0] +str q7, [x0, #320] +mla v3.4S, v13.4S, v31.s[0] +str q11, [x0, #256] +mul v16.4S, v16.4S,v24.s[2] +str q17, [x0, #448] +mul v10.4S, v10.4S,v24.s[3] +str q8, [x0, #384] +mla v16.4S, v15.4S, v31.s[0] +str q9, [x0, #192] +mla v10.4S, v2.4S, v31.s[0] +str q6, [x0, #128] +ldr q6, [x0, #912] +sqrdmulh v2.4S, v6.4S, v29.s[0] +str q19, [x0, #64] +mul v6.4S, v6.4S,v30.s[0] +str q20, [x0, #0] +ldr q20, [x0, #976] +sqrdmulh v19.4S, v20.4S, v29.s[0] +sub v9.4s, v18.4s, v0.4s +str q9, [x0, #576] +mul v20.4S, v20.4S,v30.s[0] +add v18.4s, v18.4s, 
v0.4s +ldr q0, [x0, #784] +sqrdmulh v9.4S, v0.4S, v29.s[0] +sub v15.4s, v22.4s, v3.4s +str q18, [x0, #512] +mul v0.4S, v0.4S,v30.s[0] +add v22.4s, v22.4s, v3.4s +ldr q3, [x0, #848] +sqrdmulh v18.4S, v3.4S, v29.s[0] +sub v8.4s, v21.4s, v16.4s +str q15, [x0, #704] +mul v3.4S, v3.4S,v30.s[0] +add v21.4s, v21.4s, v16.4s +ldr q16, [x0, #528] +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v1.4s, v10.4s +str q22, [x0, #640] +sqrdmulh v22.4S, v16.4S, v29.s[0] +add v1.4s, v1.4s, v10.4s +ldr q10, [x0, #592] +mla v20.4S, v19.4S, v31.s[0] +str q8, [x0, #832] +sqrdmulh v8.4S, v10.4S, v29.s[0] +ldr q19, [x0, #656] +mla v0.4S, v9.4S, v31.s[0] +str q21, [x0, #768] +sqrdmulh v21.4S, v19.4S, v29.s[0] +ldr q9, [x0, #720] +mla v3.4S, v18.4S, v31.s[0] +str q2, [x0, #960] +sqrdmulh v2.4S, v9.4S, v29.s[0] +ldr q18, [x0, #400] +ldr q15, [x0, #464] +mul v16.4S, v16.4S,v30.s[0] +sub v17.4s, v18.4s, v6.4s +str q1, [x0, #896] +mul v10.4S, v10.4S,v30.s[0] +add v18.4s, v18.4s, v6.4s +ldr q6, [x0, #272] +ldr q1, [x0, #336] +mla v16.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v20.4s +mla v10.4S, v8.4S, v31.s[0] +add v15.4s, v15.4s, v20.4s +ldr q20, [x0, #16] +ldr q8, [x0, #80] +mul v19.4S, v19.4S,v30.s[0] +sub v11.4s, v6.4s, v0.4s +mul v9.4S, v9.4S,v30.s[0] +add v6.4s, v6.4s, v0.4s +ldr q0, [x0, #144] +ldr q13, [x0, #208] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v1.4s, v3.4s +mla v9.4S, v2.4S, v31.s[0] +add v1.4s, v1.4s, v3.4s +sqrdmulh v3.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sqrdmulh v2.4S, v15.4S, v29.s[1] +sub v7.4s, v20.4s, v16.4s +mul v15.4S, v15.4S,v30.s[1] +add v20.4s, v20.4s, v16.4s +sqrdmulh v16.4S, v6.4S, v29.s[1] +sub v12.4s, v8.4s, v10.4s +mul v6.4S, v6.4S,v30.s[1] +add v8.4s, v8.4s, v10.4s +sqrdmulh v10.4S, v1.4S, v29.s[1] +sub v14.4s, v0.4s, v19.4s +mul v1.4S, v1.4S,v30.s[1] +add v0.4s, v0.4s, v19.4s +mla v18.4S, v3.4S, v31.s[0] +sub v3.4s, v13.4s, v9.4s +sqrdmulh v19.4S, v17.4S, v29.s[2] +add v13.4s, v13.4s, v9.4s +mla v15.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v22.4S, 
v29.s[2] +mla v6.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v11.4S, v29.s[2] +mla v1.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v21.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v9.4s, v0.4s, v18.4s +mul v22.4S, v22.4S,v30.s[2] +add v0.4s, v0.4s, v18.4s +mla v17.4S, v19.4S, v31.s[0] +sub v19.4s, v13.4s, v15.4s +mla v22.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v15.4s +mul v11.4S, v11.4S,v30.s[2] +sub v15.4s, v20.4s, v6.4s +mul v21.4S, v21.4S,v30.s[2] +add v20.4s, v20.4s, v6.4s +mla v11.4S, v16.4S, v31.s[0] +sub v16.4s, v8.4s, v1.4s +mla v21.4S, v10.4S, v31.s[0] +add v8.4s, v8.4s, v1.4s +sqrdmulh v29.4S, v9.4S, v27.s[1] +mul v9.4S, v9.4S,v28.s[1] +sqrdmulh v30.4S, v19.4S, v27.s[1] +sub v1.4s, v14.4s, v17.4s +mul v19.4S, v19.4S,v28.s[1] +add v14.4s, v14.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[0] +sub v10.4s, v3.4s, v22.4s +mul v0.4S, v0.4S,v28.s[0] +add v3.4s, v3.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v27.s[0] +sub v6.4s, v7.4s, v11.4s +mul v13.4S, v13.4S,v28.s[0] +add v7.4s, v7.4s, v11.4s +mla v9.4S, v29.4S, v31.s[0] +sub v29.4s, v12.4s, v21.4s +sqrdmulh v11.4S, v14.4S, v27.s[2] +add v12.4s, v12.4s, v21.4s +mla v19.4S, v30.4S, v31.s[0] +sqrdmulh v30.4S, v3.4S, v27.s[2] +mla v0.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v1.4S, v27.s[3] +mla v13.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v10.4S, v27.s[3] +mul v14.4S, v14.4S,v28.s[2] +sub v21.4s, v15.4s, v9.4s +mul v3.4S, v3.4S,v28.s[2] +add v15.4s, v15.4s, v9.4s +mla v14.4S, v11.4S, v31.s[0] +sub v11.4s, v16.4s, v19.4s +mla v3.4S, v30.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +mul v1.4S, v1.4S,v28.s[3] +sub v19.4s, v20.4s, v0.4s +mul v10.4S, v10.4S,v28.s[3] +add v20.4s, v20.4s, v0.4s +mla v1.4S, v17.4S, v31.s[0] +sub v17.4s, v8.4s, v13.4s +mla v10.4S, v22.4S, v31.s[0] +add v8.4s, v8.4s, v13.4s +sqrdmulh v27.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sqrdmulh v28.4S, v11.4S, v25.s[3] +sub v13.4s, v7.4s, v14.4s +mul v11.4S, v11.4S,v26.s[3] +add v7.4s, v7.4s, v14.4s +sqrdmulh v14.4S, v17.4S, v25.s[1] +sub v22.4s, v12.4s, v3.4s 
+mul v17.4S, v17.4S,v26.s[1] +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v8.4S, v25.s[0] +sub v0.4s, v6.4s, v1.4s +mul v8.4S, v8.4S,v26.s[0] +add v6.4s, v6.4s, v1.4s +mla v16.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v10.4s +sqrdmulh v25.4S, v12.4S, v23.s[0] +add v29.4s, v29.4s, v10.4s +mla v11.4S, v28.4S, v31.s[0] +sub v28.4s, v15.4s, v16.4s +sqrdmulh v10.4S, v22.4S, v23.s[1] +add v15.4s, v15.4s, v16.4s +mla v17.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v11.4s +sqrdmulh v16.4S, v29.4S, v23.s[2] +add v21.4s, v21.4s, v11.4s +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v19.4s, v17.4s +sqrdmulh v11.4S, v27.4S, v23.s[3] +add v19.4s, v19.4s, v17.4s +mul v12.4S, v12.4S,v24.s[0] +sub v17.4s, v20.4s, v8.4s +mul v22.4S, v22.4S,v24.s[1] +add v20.4s, v20.4s, v8.4s +mla v12.4S, v25.4S, v31.s[0] +str q28, [x0, #336] +mla v22.4S, v10.4S, v31.s[0] +str q15, [x0, #272] +mul v29.4S, v29.4S,v24.s[2] +str q14, [x0, #464] +mul v27.4S, v27.4S,v24.s[3] +str q21, [x0, #400] +mla v29.4S, v16.4S, v31.s[0] +str q3, [x0, #208] +mla v27.4S, v11.4S, v31.s[0] +str q19, [x0, #144] +str q17, [x0, #80] +str q20, [x0, #16] +sub v20.4s, v7.4s, v12.4s +str q20, [x0, #592] +add v7.4s, v7.4s, v12.4s +sub v12.4s, v13.4s, v22.4s +str q7, [x0, #528] +add v13.4s, v13.4s, v22.4s +sub v22.4s, v6.4s, v29.4s +str q12, [x0, #720] +add v6.4s, v6.4s, v29.4s +sub v29.4s, v0.4s, v27.4s +str q13, [x0, #656] +add v0.4s, v0.4s, v27.4s +str q22, [x0, #848] +str q6, [x0, #784] +str q29, [x0, #976] +str q0, [x0, #912] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q18, [x17, #+160] +ldr q2, [x17, #+176] +ldr q9, [x17, #+192] +ldr q30, [x17, #+208] +ldr q1, [x17, #+224] +ldr q26, [x17, #+240] +ldr q8, [x0, #32] +ldr q25, [x0, #48] +ldr q28, [x0, #0] +ldr q10, [x0, #16] +sqrdmulh v15.4S, v8.4S, v5.s[0] +mul v8.4S, v8.4S,v4.s[0] +mla v8.4S, v15.4S, v31.s[0] +sub v15.4s, v28.4s, v8.4s +add v28.4s, v28.4s, v8.4s +sqrdmulh v8.4S, v25.4S, v5.s[0] +mul v25.4S, v25.4S,v4.s[0] +mla v25.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v25.4s 
+add v10.4s, v10.4s, v25.4s +sqrdmulh v25.4S, v10.4S, v5.s[1] +mul v10.4S, v10.4S,v4.s[1] +mla v10.4S, v25.4S, v31.s[0] +sub v25.4s, v28.4s, v10.4s +add v28.4s, v28.4s, v10.4s +sqrdmulh v10.4S, v8.4S, v5.s[2] +mul v8.4S, v8.4S,v4.s[2] +mla v8.4S, v10.4S, v31.s[0] +sub v10.4s, v15.4s, v8.4s +add v15.4s, v15.4s, v8.4s +trn1 v8.4S, v28.4S, v25.4S +trn2 v14.4S, v28.4S, v25.4S +trn1 v21.4S, v15.4S, v10.4S +trn2 v16.4S, v15.4S, v10.4S +trn2 v15.2D, v8.2D, v21.2D +trn2 v10.2D, v14.2D, v16.2D +trn1 v28.2D, v8.2D, v21.2D +trn1 v25.2D, v14.2D, v16.2D +sqrdmulh v16.4S, v15.4S, v2.4S +mul v15.4S, v15.4S,v18.4S +mla v15.4S, v16.4S, v31.s[0] +sub v16.4s, v28.4s, v15.4s +add v28.4s, v28.4s, v15.4s +sqrdmulh v15.4S, v10.4S, v2.4S +mul v10.4S, v10.4S,v18.4S +mla v10.4S, v15.4S, v31.s[0] +sub v15.4s, v25.4s, v10.4s +add v25.4s, v25.4s, v10.4s +sqrdmulh v10.4S, v25.4S, v30.4S +mul v25.4S, v25.4S,v9.4S +mla v25.4S, v10.4S, v31.s[0] +sub v10.4s, v28.4s, v25.4s +add v28.4s, v28.4s, v25.4s +sqrdmulh v25.4S, v15.4S, v26.4S +mul v15.4S, v15.4S,v1.4S +mla v15.4S, v25.4S, v31.s[0] +sub v25.4s, v16.4s, v15.4s +add v16.4s, v16.4s, v15.4s +str q28, [x0, #0] +str q10, [x0, #16] +str q16, [x0, #32] +str q25, [x0, #48] +ldr q25, [x17, #+256] +ldr q16, [x17, #+272] +ldr q10, [x17, #+288] +ldr q28, [x17, #+304] +ldr q15, [x17, #+320] +ldr q14, [x17, #+336] +ldr q21, [x17, #+352] +ldr q8, [x17, #+368] +ldr q26, [x0, #96] +ldr q1, [x0, #112] +ldr q30, [x0, #64] +ldr q9, [x0, #80] +sqrdmulh v2.4S, v26.4S, v16.s[0] +mul v26.4S, v26.4S,v25.s[0] +mla v26.4S, v2.4S, v31.s[0] +sub v2.4s, v30.4s, v26.4s +add v30.4s, v30.4s, v26.4s +sqrdmulh v26.4S, v1.4S, v16.s[0] +mul v1.4S, v1.4S,v25.s[0] +mla v1.4S, v26.4S, v31.s[0] +sub v26.4s, v9.4s, v1.4s +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v9.4S, v16.s[1] +mul v9.4S, v9.4S,v25.s[1] +mla v9.4S, v1.4S, v31.s[0] +sub v1.4s, v30.4s, v9.4s +add v30.4s, v30.4s, v9.4s +sqrdmulh v9.4S, v26.4S, v16.s[2] +mul v26.4S, v26.4S,v25.s[2] +mla v26.4S, v9.4S, v31.s[0] +sub 
v9.4s, v2.4s, v26.4s +add v2.4s, v2.4s, v26.4s +trn1 v26.4S, v30.4S, v1.4S +trn2 v18.4S, v30.4S, v1.4S +trn1 v5.4S, v2.4S, v9.4S +trn2 v4.4S, v2.4S, v9.4S +trn2 v2.2D, v26.2D, v5.2D +trn2 v9.2D, v18.2D, v4.2D +trn1 v30.2D, v26.2D, v5.2D +trn1 v1.2D, v18.2D, v4.2D +sqrdmulh v4.4S, v2.4S, v28.4S +mul v2.4S, v2.4S,v10.4S +mla v2.4S, v4.4S, v31.s[0] +sub v4.4s, v30.4s, v2.4s +add v30.4s, v30.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v28.4S +mul v9.4S, v9.4S,v10.4S +mla v9.4S, v2.4S, v31.s[0] +sub v2.4s, v1.4s, v9.4s +add v1.4s, v1.4s, v9.4s +sqrdmulh v9.4S, v1.4S, v14.4S +mul v1.4S, v1.4S,v15.4S +mla v1.4S, v9.4S, v31.s[0] +sub v9.4s, v30.4s, v1.4s +add v30.4s, v30.4s, v1.4s +sqrdmulh v1.4S, v2.4S, v8.4S +mul v2.4S, v2.4S,v21.4S +mla v2.4S, v1.4S, v31.s[0] +sub v1.4s, v4.4s, v2.4s +add v4.4s, v4.4s, v2.4s +str q30, [x0, #64] +str q9, [x0, #80] +str q4, [x0, #96] +str q1, [x0, #112] +ldr q1, [x17, #+384] +ldr q4, [x17, #+400] +ldr q9, [x17, #+416] +ldr q30, [x17, #+432] +ldr q2, [x17, #+448] +ldr q18, [x17, #+464] +ldr q5, [x17, #+480] +ldr q26, [x17, #+496] +ldr q8, [x0, #160] +ldr q21, [x0, #176] +ldr q14, [x0, #128] +ldr q15, [x0, #144] +sqrdmulh v28.4S, v8.4S, v4.s[0] +mul v8.4S, v8.4S,v1.s[0] +mla v8.4S, v28.4S, v31.s[0] +sub v28.4s, v14.4s, v8.4s +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v21.4S, v4.s[0] +mul v21.4S, v21.4S,v1.s[0] +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v15.4s, v21.4s +add v15.4s, v15.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v4.s[1] +mul v15.4S, v15.4S,v1.s[1] +mla v15.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v15.4s +add v14.4s, v14.4s, v15.4s +sqrdmulh v15.4S, v8.4S, v4.s[2] +mul v8.4S, v8.4S,v1.s[2] +mla v8.4S, v15.4S, v31.s[0] +sub v15.4s, v28.4s, v8.4s +add v28.4s, v28.4s, v8.4s +trn1 v8.4S, v14.4S, v21.4S +trn2 v10.4S, v14.4S, v21.4S +trn1 v16.4S, v28.4S, v15.4S +trn2 v25.4S, v28.4S, v15.4S +trn2 v28.2D, v8.2D, v16.2D +trn2 v15.2D, v10.2D, v25.2D +trn1 v14.2D, v8.2D, v16.2D +trn1 v21.2D, v10.2D, v25.2D +sqrdmulh v25.4S, v28.4S, v30.4S +mul v28.4S, 
v28.4S,v9.4S +mla v28.4S, v25.4S, v31.s[0] +sub v25.4s, v14.4s, v28.4s +add v14.4s, v14.4s, v28.4s +sqrdmulh v28.4S, v15.4S, v30.4S +mul v15.4S, v15.4S,v9.4S +mla v15.4S, v28.4S, v31.s[0] +sub v28.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +sqrdmulh v15.4S, v21.4S, v18.4S +mul v21.4S, v21.4S,v2.4S +mla v21.4S, v15.4S, v31.s[0] +sub v15.4s, v14.4s, v21.4s +add v14.4s, v14.4s, v21.4s +sqrdmulh v21.4S, v28.4S, v26.4S +mul v28.4S, v28.4S,v5.4S +mla v28.4S, v21.4S, v31.s[0] +sub v21.4s, v25.4s, v28.4s +add v25.4s, v25.4s, v28.4s +str q14, [x0, #128] +str q15, [x0, #144] +str q25, [x0, #160] +str q21, [x0, #176] +ldr q21, [x17, #+512] +ldr q25, [x17, #+528] +ldr q15, [x17, #+544] +ldr q14, [x17, #+560] +ldr q28, [x17, #+576] +ldr q10, [x17, #+592] +ldr q16, [x17, #+608] +ldr q8, [x17, #+624] +ldr q26, [x0, #224] +ldr q5, [x0, #240] +ldr q18, [x0, #192] +ldr q2, [x0, #208] +sqrdmulh v30.4S, v26.4S, v25.s[0] +mul v26.4S, v26.4S,v21.s[0] +mla v26.4S, v30.4S, v31.s[0] +sub v30.4s, v18.4s, v26.4s +add v18.4s, v18.4s, v26.4s +sqrdmulh v26.4S, v5.4S, v25.s[0] +mul v5.4S, v5.4S,v21.s[0] +mla v5.4S, v26.4S, v31.s[0] +sub v26.4s, v2.4s, v5.4s +add v2.4s, v2.4s, v5.4s +sqrdmulh v5.4S, v2.4S, v25.s[1] +mul v2.4S, v2.4S,v21.s[1] +mla v2.4S, v5.4S, v31.s[0] +sub v5.4s, v18.4s, v2.4s +add v18.4s, v18.4s, v2.4s +sqrdmulh v2.4S, v26.4S, v25.s[2] +mul v26.4S, v26.4S,v21.s[2] +mla v26.4S, v2.4S, v31.s[0] +sub v2.4s, v30.4s, v26.4s +add v30.4s, v30.4s, v26.4s +trn1 v26.4S, v18.4S, v5.4S +trn2 v9.4S, v18.4S, v5.4S +trn1 v4.4S, v30.4S, v2.4S +trn2 v1.4S, v30.4S, v2.4S +trn2 v30.2D, v26.2D, v4.2D +trn2 v2.2D, v9.2D, v1.2D +trn1 v18.2D, v26.2D, v4.2D +trn1 v5.2D, v9.2D, v1.2D +sqrdmulh v1.4S, v30.4S, v14.4S +mul v30.4S, v30.4S,v15.4S +mla v30.4S, v1.4S, v31.s[0] +sub v1.4s, v18.4s, v30.4s +add v18.4s, v18.4s, v30.4s +sqrdmulh v30.4S, v2.4S, v14.4S +mul v2.4S, v2.4S,v15.4S +mla v2.4S, v30.4S, v31.s[0] +sub v30.4s, v5.4s, v2.4s +add v5.4s, v5.4s, v2.4s +sqrdmulh v2.4S, v5.4S, v10.4S +mul 
v5.4S, v5.4S,v28.4S +mla v5.4S, v2.4S, v31.s[0] +sub v2.4s, v18.4s, v5.4s +add v18.4s, v18.4s, v5.4s +sqrdmulh v5.4S, v30.4S, v8.4S +mul v30.4S, v30.4S,v16.4S +mla v30.4S, v5.4S, v31.s[0] +sub v5.4s, v1.4s, v30.4s +add v1.4s, v1.4s, v30.4s +str q18, [x0, #192] +str q2, [x0, #208] +str q1, [x0, #224] +str q5, [x0, #240] +ldr q5, [x17, #+640] +ldr q1, [x17, #+656] +ldr q2, [x17, #+672] +ldr q18, [x17, #+688] +ldr q30, [x17, #+704] +ldr q9, [x17, #+720] +ldr q4, [x17, #+736] +ldr q26, [x17, #+752] +ldr q8, [x0, #288] +ldr q16, [x0, #304] +ldr q10, [x0, #256] +ldr q28, [x0, #272] +sqrdmulh v14.4S, v8.4S, v1.s[0] +mul v8.4S, v8.4S,v5.s[0] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +sqrdmulh v8.4S, v16.4S, v1.s[0] +mul v16.4S, v16.4S,v5.s[0] +mla v16.4S, v8.4S, v31.s[0] +sub v8.4s, v28.4s, v16.4s +add v28.4s, v28.4s, v16.4s +sqrdmulh v16.4S, v28.4S, v1.s[1] +mul v28.4S, v28.4S,v5.s[1] +mla v28.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v28.4s +add v10.4s, v10.4s, v28.4s +sqrdmulh v28.4S, v8.4S, v1.s[2] +mul v8.4S, v8.4S,v5.s[2] +mla v8.4S, v28.4S, v31.s[0] +sub v28.4s, v14.4s, v8.4s +add v14.4s, v14.4s, v8.4s +trn1 v8.4S, v10.4S, v16.4S +trn2 v15.4S, v10.4S, v16.4S +trn1 v25.4S, v14.4S, v28.4S +trn2 v21.4S, v14.4S, v28.4S +trn2 v14.2D, v8.2D, v25.2D +trn2 v28.2D, v15.2D, v21.2D +trn1 v10.2D, v8.2D, v25.2D +trn1 v16.2D, v15.2D, v21.2D +sqrdmulh v21.4S, v14.4S, v18.4S +mul v14.4S, v14.4S,v2.4S +mla v14.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v14.4s +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v28.4S, v18.4S +mul v28.4S, v28.4S,v2.4S +mla v28.4S, v14.4S, v31.s[0] +sub v14.4s, v16.4s, v28.4s +add v16.4s, v16.4s, v28.4s +sqrdmulh v28.4S, v16.4S, v9.4S +mul v16.4S, v16.4S,v30.4S +mla v16.4S, v28.4S, v31.s[0] +sub v28.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sqrdmulh v16.4S, v14.4S, v26.4S +mul v14.4S, v14.4S,v4.4S +mla v14.4S, v16.4S, v31.s[0] +sub v16.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +str q10, [x0, #256] +str q28, 
[x0, #272] +str q21, [x0, #288] +str q16, [x0, #304] +ldr q16, [x17, #+768] +ldr q21, [x17, #+784] +ldr q28, [x17, #+800] +ldr q10, [x17, #+816] +ldr q14, [x17, #+832] +ldr q15, [x17, #+848] +ldr q25, [x17, #+864] +ldr q8, [x17, #+880] +ldr q26, [x0, #352] +ldr q4, [x0, #368] +ldr q9, [x0, #320] +ldr q30, [x0, #336] +sqrdmulh v18.4S, v26.4S, v21.s[0] +mul v26.4S, v26.4S,v16.s[0] +mla v26.4S, v18.4S, v31.s[0] +sub v18.4s, v9.4s, v26.4s +add v9.4s, v9.4s, v26.4s +sqrdmulh v26.4S, v4.4S, v21.s[0] +mul v4.4S, v4.4S,v16.s[0] +mla v4.4S, v26.4S, v31.s[0] +sub v26.4s, v30.4s, v4.4s +add v30.4s, v30.4s, v4.4s +sqrdmulh v4.4S, v30.4S, v21.s[1] +mul v30.4S, v30.4S,v16.s[1] +mla v30.4S, v4.4S, v31.s[0] +sub v4.4s, v9.4s, v30.4s +add v9.4s, v9.4s, v30.4s +sqrdmulh v30.4S, v26.4S, v21.s[2] +mul v26.4S, v26.4S,v16.s[2] +mla v26.4S, v30.4S, v31.s[0] +sub v30.4s, v18.4s, v26.4s +add v18.4s, v18.4s, v26.4s +trn1 v26.4S, v9.4S, v4.4S +trn2 v2.4S, v9.4S, v4.4S +trn1 v1.4S, v18.4S, v30.4S +trn2 v5.4S, v18.4S, v30.4S +trn2 v18.2D, v26.2D, v1.2D +trn2 v30.2D, v2.2D, v5.2D +trn1 v9.2D, v26.2D, v1.2D +trn1 v4.2D, v2.2D, v5.2D +sqrdmulh v5.4S, v18.4S, v10.4S +mul v18.4S, v18.4S,v28.4S +mla v18.4S, v5.4S, v31.s[0] +sub v5.4s, v9.4s, v18.4s +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v30.4S, v10.4S +mul v30.4S, v30.4S,v28.4S +mla v30.4S, v18.4S, v31.s[0] +sub v18.4s, v4.4s, v30.4s +add v4.4s, v4.4s, v30.4s +sqrdmulh v30.4S, v4.4S, v15.4S +mul v4.4S, v4.4S,v14.4S +mla v4.4S, v30.4S, v31.s[0] +sub v30.4s, v9.4s, v4.4s +add v9.4s, v9.4s, v4.4s +sqrdmulh v4.4S, v18.4S, v8.4S +mul v18.4S, v18.4S,v25.4S +mla v18.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v18.4s +add v5.4s, v5.4s, v18.4s +str q9, [x0, #320] +str q30, [x0, #336] +str q5, [x0, #352] +str q4, [x0, #368] +ldr q4, [x17, #+896] +ldr q5, [x17, #+912] +ldr q30, [x17, #+928] +ldr q9, [x17, #+944] +ldr q18, [x17, #+960] +ldr q2, [x17, #+976] +ldr q1, [x17, #+992] +ldr q26, [x17, #+1008] +ldr q8, [x0, #416] +ldr q25, [x0, #432] +ldr q15, [x0, 
#384] +ldr q14, [x0, #400] +sqrdmulh v10.4S, v8.4S, v5.s[0] +mul v8.4S, v8.4S,v4.s[0] +mla v8.4S, v10.4S, v31.s[0] +sub v10.4s, v15.4s, v8.4s +add v15.4s, v15.4s, v8.4s +sqrdmulh v8.4S, v25.4S, v5.s[0] +mul v25.4S, v25.4S,v4.s[0] +mla v25.4S, v8.4S, v31.s[0] +sub v8.4s, v14.4s, v25.4s +add v14.4s, v14.4s, v25.4s +sqrdmulh v25.4S, v14.4S, v5.s[1] +mul v14.4S, v14.4S,v4.s[1] +mla v14.4S, v25.4S, v31.s[0] +sub v25.4s, v15.4s, v14.4s +add v15.4s, v15.4s, v14.4s +sqrdmulh v14.4S, v8.4S, v5.s[2] +mul v8.4S, v8.4S,v4.s[2] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +trn1 v8.4S, v15.4S, v25.4S +trn2 v28.4S, v15.4S, v25.4S +trn1 v21.4S, v10.4S, v14.4S +trn2 v16.4S, v10.4S, v14.4S +trn2 v10.2D, v8.2D, v21.2D +trn2 v14.2D, v28.2D, v16.2D +trn1 v15.2D, v8.2D, v21.2D +trn1 v25.2D, v28.2D, v16.2D +sqrdmulh v16.4S, v10.4S, v9.4S +mul v10.4S, v10.4S,v30.4S +mla v10.4S, v16.4S, v31.s[0] +sub v16.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +sqrdmulh v10.4S, v14.4S, v9.4S +mul v14.4S, v14.4S,v30.4S +mla v14.4S, v10.4S, v31.s[0] +sub v10.4s, v25.4s, v14.4s +add v25.4s, v25.4s, v14.4s +sqrdmulh v14.4S, v25.4S, v2.4S +mul v25.4S, v25.4S,v18.4S +mla v25.4S, v14.4S, v31.s[0] +sub v14.4s, v15.4s, v25.4s +add v15.4s, v15.4s, v25.4s +sqrdmulh v25.4S, v10.4S, v26.4S +mul v10.4S, v10.4S,v1.4S +mla v10.4S, v25.4S, v31.s[0] +sub v25.4s, v16.4s, v10.4s +add v16.4s, v16.4s, v10.4s +str q15, [x0, #384] +str q14, [x0, #400] +str q16, [x0, #416] +str q25, [x0, #432] +ldr q25, [x17, #+1024] +ldr q16, [x17, #+1040] +ldr q14, [x17, #+1056] +ldr q15, [x17, #+1072] +ldr q10, [x17, #+1088] +ldr q28, [x17, #+1104] +ldr q21, [x17, #+1120] +ldr q8, [x17, #+1136] +ldr q26, [x0, #480] +ldr q1, [x0, #496] +ldr q2, [x0, #448] +ldr q18, [x0, #464] +sqrdmulh v9.4S, v26.4S, v16.s[0] +mul v26.4S, v26.4S,v25.s[0] +mla v26.4S, v9.4S, v31.s[0] +sub v9.4s, v2.4s, v26.4s +add v2.4s, v2.4s, v26.4s +sqrdmulh v26.4S, v1.4S, v16.s[0] +mul v1.4S, v1.4S,v25.s[0] +mla v1.4S, v26.4S, 
v31.s[0] +sub v26.4s, v18.4s, v1.4s +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v18.4S, v16.s[1] +mul v18.4S, v18.4S,v25.s[1] +mla v18.4S, v1.4S, v31.s[0] +sub v1.4s, v2.4s, v18.4s +add v2.4s, v2.4s, v18.4s +sqrdmulh v18.4S, v26.4S, v16.s[2] +mul v26.4S, v26.4S,v25.s[2] +mla v26.4S, v18.4S, v31.s[0] +sub v18.4s, v9.4s, v26.4s +add v9.4s, v9.4s, v26.4s +trn1 v26.4S, v2.4S, v1.4S +trn2 v30.4S, v2.4S, v1.4S +trn1 v5.4S, v9.4S, v18.4S +trn2 v4.4S, v9.4S, v18.4S +trn2 v9.2D, v26.2D, v5.2D +trn2 v18.2D, v30.2D, v4.2D +trn1 v2.2D, v26.2D, v5.2D +trn1 v1.2D, v30.2D, v4.2D +sqrdmulh v4.4S, v9.4S, v15.4S +mul v9.4S, v9.4S,v14.4S +mla v9.4S, v4.4S, v31.s[0] +sub v4.4s, v2.4s, v9.4s +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v18.4S, v15.4S +mul v18.4S, v18.4S,v14.4S +mla v18.4S, v9.4S, v31.s[0] +sub v9.4s, v1.4s, v18.4s +add v1.4s, v1.4s, v18.4s +sqrdmulh v18.4S, v1.4S, v28.4S +mul v1.4S, v1.4S,v10.4S +mla v1.4S, v18.4S, v31.s[0] +sub v18.4s, v2.4s, v1.4s +add v2.4s, v2.4s, v1.4s +sqrdmulh v1.4S, v9.4S, v8.4S +mul v9.4S, v9.4S,v21.4S +mla v9.4S, v1.4S, v31.s[0] +sub v1.4s, v4.4s, v9.4s +add v4.4s, v4.4s, v9.4s +str q2, [x0, #448] +str q18, [x0, #464] +str q4, [x0, #480] +str q1, [x0, #496] +ldr q1, [x17, #+1152] +ldr q4, [x17, #+1168] +ldr q18, [x17, #+1184] +ldr q2, [x17, #+1200] +ldr q9, [x17, #+1216] +ldr q30, [x17, #+1232] +ldr q5, [x17, #+1248] +ldr q26, [x17, #+1264] +ldr q8, [x0, #544] +ldr q21, [x0, #560] +ldr q28, [x0, #512] +ldr q10, [x0, #528] +sqrdmulh v15.4S, v8.4S, v4.s[0] +mul v8.4S, v8.4S,v1.s[0] +mla v8.4S, v15.4S, v31.s[0] +sub v15.4s, v28.4s, v8.4s +add v28.4s, v28.4s, v8.4s +sqrdmulh v8.4S, v21.4S, v4.s[0] +mul v21.4S, v21.4S,v1.s[0] +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v10.4S, v4.s[1] +mul v10.4S, v10.4S,v1.s[1] +mla v10.4S, v21.4S, v31.s[0] +sub v21.4s, v28.4s, v10.4s +add v28.4s, v28.4s, v10.4s +sqrdmulh v10.4S, v8.4S, v4.s[2] +mul v8.4S, v8.4S,v1.s[2] +mla v8.4S, v10.4S, v31.s[0] +sub 
v10.4s, v15.4s, v8.4s +add v15.4s, v15.4s, v8.4s +trn1 v8.4S, v28.4S, v21.4S +trn2 v14.4S, v28.4S, v21.4S +trn1 v16.4S, v15.4S, v10.4S +trn2 v25.4S, v15.4S, v10.4S +trn2 v15.2D, v8.2D, v16.2D +trn2 v10.2D, v14.2D, v25.2D +trn1 v28.2D, v8.2D, v16.2D +trn1 v21.2D, v14.2D, v25.2D +sqrdmulh v25.4S, v15.4S, v2.4S +mul v15.4S, v15.4S,v18.4S +mla v15.4S, v25.4S, v31.s[0] +sub v25.4s, v28.4s, v15.4s +add v28.4s, v28.4s, v15.4s +sqrdmulh v15.4S, v10.4S, v2.4S +mul v10.4S, v10.4S,v18.4S +mla v10.4S, v15.4S, v31.s[0] +sub v15.4s, v21.4s, v10.4s +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v21.4S, v30.4S +mul v21.4S, v21.4S,v9.4S +mla v21.4S, v10.4S, v31.s[0] +sub v10.4s, v28.4s, v21.4s +add v28.4s, v28.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v26.4S +mul v15.4S, v15.4S,v5.4S +mla v15.4S, v21.4S, v31.s[0] +sub v21.4s, v25.4s, v15.4s +add v25.4s, v25.4s, v15.4s +str q28, [x0, #512] +str q10, [x0, #528] +str q25, [x0, #544] +str q21, [x0, #560] +ldr q21, [x17, #+1280] +ldr q25, [x17, #+1296] +ldr q10, [x17, #+1312] +ldr q28, [x17, #+1328] +ldr q15, [x17, #+1344] +ldr q14, [x17, #+1360] +ldr q16, [x17, #+1376] +ldr q8, [x17, #+1392] +ldr q26, [x0, #608] +ldr q5, [x0, #624] +ldr q30, [x0, #576] +ldr q9, [x0, #592] +sqrdmulh v2.4S, v26.4S, v25.s[0] +mul v26.4S, v26.4S,v21.s[0] +mla v26.4S, v2.4S, v31.s[0] +sub v2.4s, v30.4s, v26.4s +add v30.4s, v30.4s, v26.4s +sqrdmulh v26.4S, v5.4S, v25.s[0] +mul v5.4S, v5.4S,v21.s[0] +mla v5.4S, v26.4S, v31.s[0] +sub v26.4s, v9.4s, v5.4s +add v9.4s, v9.4s, v5.4s +sqrdmulh v5.4S, v9.4S, v25.s[1] +mul v9.4S, v9.4S,v21.s[1] +mla v9.4S, v5.4S, v31.s[0] +sub v5.4s, v30.4s, v9.4s +add v30.4s, v30.4s, v9.4s +sqrdmulh v9.4S, v26.4S, v25.s[2] +mul v26.4S, v26.4S,v21.s[2] +mla v26.4S, v9.4S, v31.s[0] +sub v9.4s, v2.4s, v26.4s +add v2.4s, v2.4s, v26.4s +trn1 v26.4S, v30.4S, v5.4S +trn2 v18.4S, v30.4S, v5.4S +trn1 v4.4S, v2.4S, v9.4S +trn2 v1.4S, v2.4S, v9.4S +trn2 v2.2D, v26.2D, v4.2D +trn2 v9.2D, v18.2D, v1.2D +trn1 v30.2D, v26.2D, v4.2D +trn1 v5.2D, v18.2D, 
v1.2D +sqrdmulh v1.4S, v2.4S, v28.4S +mul v2.4S, v2.4S,v10.4S +mla v2.4S, v1.4S, v31.s[0] +sub v1.4s, v30.4s, v2.4s +add v30.4s, v30.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v28.4S +mul v9.4S, v9.4S,v10.4S +mla v9.4S, v2.4S, v31.s[0] +sub v2.4s, v5.4s, v9.4s +add v5.4s, v5.4s, v9.4s +sqrdmulh v9.4S, v5.4S, v14.4S +mul v5.4S, v5.4S,v15.4S +mla v5.4S, v9.4S, v31.s[0] +sub v9.4s, v30.4s, v5.4s +add v30.4s, v30.4s, v5.4s +sqrdmulh v5.4S, v2.4S, v8.4S +mul v2.4S, v2.4S,v16.4S +mla v2.4S, v5.4S, v31.s[0] +sub v5.4s, v1.4s, v2.4s +add v1.4s, v1.4s, v2.4s +str q30, [x0, #576] +str q9, [x0, #592] +str q1, [x0, #608] +str q5, [x0, #624] +ldr q5, [x17, #+1408] +ldr q1, [x17, #+1424] +ldr q9, [x17, #+1440] +ldr q30, [x17, #+1456] +ldr q2, [x17, #+1472] +ldr q18, [x17, #+1488] +ldr q4, [x17, #+1504] +ldr q26, [x17, #+1520] +ldr q8, [x0, #672] +ldr q16, [x0, #688] +ldr q14, [x0, #640] +ldr q15, [x0, #656] +sqrdmulh v28.4S, v8.4S, v1.s[0] +mul v8.4S, v8.4S,v5.s[0] +mla v8.4S, v28.4S, v31.s[0] +sub v28.4s, v14.4s, v8.4s +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v16.4S, v1.s[0] +mul v16.4S, v16.4S,v5.s[0] +mla v16.4S, v8.4S, v31.s[0] +sub v8.4s, v15.4s, v16.4s +add v15.4s, v15.4s, v16.4s +sqrdmulh v16.4S, v15.4S, v1.s[1] +mul v15.4S, v15.4S,v5.s[1] +mla v15.4S, v16.4S, v31.s[0] +sub v16.4s, v14.4s, v15.4s +add v14.4s, v14.4s, v15.4s +sqrdmulh v15.4S, v8.4S, v1.s[2] +mul v8.4S, v8.4S,v5.s[2] +mla v8.4S, v15.4S, v31.s[0] +sub v15.4s, v28.4s, v8.4s +add v28.4s, v28.4s, v8.4s +trn1 v8.4S, v14.4S, v16.4S +trn2 v10.4S, v14.4S, v16.4S +trn1 v25.4S, v28.4S, v15.4S +trn2 v21.4S, v28.4S, v15.4S +trn2 v28.2D, v8.2D, v25.2D +trn2 v15.2D, v10.2D, v21.2D +trn1 v14.2D, v8.2D, v25.2D +trn1 v16.2D, v10.2D, v21.2D +sqrdmulh v21.4S, v28.4S, v30.4S +mul v28.4S, v28.4S,v9.4S +mla v28.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v28.4s +add v14.4s, v14.4s, v28.4s +sqrdmulh v28.4S, v15.4S, v30.4S +mul v15.4S, v15.4S,v9.4S +mla v15.4S, v28.4S, v31.s[0] +sub v28.4s, v16.4s, v15.4s +add v16.4s, v16.4s, v15.4s 
+sqrdmulh v15.4S, v16.4S, v18.4S +mul v16.4S, v16.4S,v2.4S +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v14.4s, v16.4s +add v14.4s, v14.4s, v16.4s +sqrdmulh v16.4S, v28.4S, v26.4S +mul v28.4S, v28.4S,v4.4S +mla v28.4S, v16.4S, v31.s[0] +sub v16.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +str q14, [x0, #640] +str q15, [x0, #656] +str q21, [x0, #672] +str q16, [x0, #688] +ldr q16, [x17, #+1536] +ldr q21, [x17, #+1552] +ldr q15, [x17, #+1568] +ldr q14, [x17, #+1584] +ldr q28, [x17, #+1600] +ldr q10, [x17, #+1616] +ldr q25, [x17, #+1632] +ldr q8, [x17, #+1648] +ldr q26, [x0, #736] +ldr q4, [x0, #752] +ldr q18, [x0, #704] +ldr q2, [x0, #720] +sqrdmulh v30.4S, v26.4S, v21.s[0] +mul v26.4S, v26.4S,v16.s[0] +mla v26.4S, v30.4S, v31.s[0] +sub v30.4s, v18.4s, v26.4s +add v18.4s, v18.4s, v26.4s +sqrdmulh v26.4S, v4.4S, v21.s[0] +mul v4.4S, v4.4S,v16.s[0] +mla v4.4S, v26.4S, v31.s[0] +sub v26.4s, v2.4s, v4.4s +add v2.4s, v2.4s, v4.4s +sqrdmulh v4.4S, v2.4S, v21.s[1] +mul v2.4S, v2.4S,v16.s[1] +mla v2.4S, v4.4S, v31.s[0] +sub v4.4s, v18.4s, v2.4s +add v18.4s, v18.4s, v2.4s +sqrdmulh v2.4S, v26.4S, v21.s[2] +mul v26.4S, v26.4S,v16.s[2] +mla v26.4S, v2.4S, v31.s[0] +sub v2.4s, v30.4s, v26.4s +add v30.4s, v30.4s, v26.4s +trn1 v26.4S, v18.4S, v4.4S +trn2 v9.4S, v18.4S, v4.4S +trn1 v1.4S, v30.4S, v2.4S +trn2 v5.4S, v30.4S, v2.4S +trn2 v30.2D, v26.2D, v1.2D +trn2 v2.2D, v9.2D, v5.2D +trn1 v18.2D, v26.2D, v1.2D +trn1 v4.2D, v9.2D, v5.2D +sqrdmulh v5.4S, v30.4S, v14.4S +mul v30.4S, v30.4S,v15.4S +mla v30.4S, v5.4S, v31.s[0] +sub v5.4s, v18.4s, v30.4s +add v18.4s, v18.4s, v30.4s +sqrdmulh v30.4S, v2.4S, v14.4S +mul v2.4S, v2.4S,v15.4S +mla v2.4S, v30.4S, v31.s[0] +sub v30.4s, v4.4s, v2.4s +add v4.4s, v4.4s, v2.4s +sqrdmulh v2.4S, v4.4S, v10.4S +mul v4.4S, v4.4S,v28.4S +mla v4.4S, v2.4S, v31.s[0] +sub v2.4s, v18.4s, v4.4s +add v18.4s, v18.4s, v4.4s +sqrdmulh v4.4S, v30.4S, v8.4S +mul v30.4S, v30.4S,v25.4S +mla v30.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v30.4s +add v5.4s, v5.4s, v30.4s 
+str q18, [x0, #704] +str q2, [x0, #720] +str q5, [x0, #736] +str q4, [x0, #752] +ldr q4, [x17, #+1664] +ldr q5, [x17, #+1680] +ldr q2, [x17, #+1696] +ldr q18, [x17, #+1712] +ldr q30, [x17, #+1728] +ldr q9, [x17, #+1744] +ldr q1, [x17, #+1760] +ldr q26, [x17, #+1776] +ldr q8, [x0, #800] +ldr q25, [x0, #816] +ldr q10, [x0, #768] +ldr q28, [x0, #784] +sqrdmulh v14.4S, v8.4S, v5.s[0] +mul v8.4S, v8.4S,v4.s[0] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +sqrdmulh v8.4S, v25.4S, v5.s[0] +mul v25.4S, v25.4S,v4.s[0] +mla v25.4S, v8.4S, v31.s[0] +sub v8.4s, v28.4s, v25.4s +add v28.4s, v28.4s, v25.4s +sqrdmulh v25.4S, v28.4S, v5.s[1] +mul v28.4S, v28.4S,v4.s[1] +mla v28.4S, v25.4S, v31.s[0] +sub v25.4s, v10.4s, v28.4s +add v10.4s, v10.4s, v28.4s +sqrdmulh v28.4S, v8.4S, v5.s[2] +mul v8.4S, v8.4S,v4.s[2] +mla v8.4S, v28.4S, v31.s[0] +sub v28.4s, v14.4s, v8.4s +add v14.4s, v14.4s, v8.4s +trn1 v8.4S, v10.4S, v25.4S +trn2 v15.4S, v10.4S, v25.4S +trn1 v21.4S, v14.4S, v28.4S +trn2 v16.4S, v14.4S, v28.4S +trn2 v14.2D, v8.2D, v21.2D +trn2 v28.2D, v15.2D, v16.2D +trn1 v10.2D, v8.2D, v21.2D +trn1 v25.2D, v15.2D, v16.2D +sqrdmulh v16.4S, v14.4S, v18.4S +mul v14.4S, v14.4S,v2.4S +mla v14.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v14.4s +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v28.4S, v18.4S +mul v28.4S, v28.4S,v2.4S +mla v28.4S, v14.4S, v31.s[0] +sub v14.4s, v25.4s, v28.4s +add v25.4s, v25.4s, v28.4s +sqrdmulh v28.4S, v25.4S, v9.4S +mul v25.4S, v25.4S,v30.4S +mla v25.4S, v28.4S, v31.s[0] +sub v28.4s, v10.4s, v25.4s +add v10.4s, v10.4s, v25.4s +sqrdmulh v25.4S, v14.4S, v26.4S +mul v14.4S, v14.4S,v1.4S +mla v14.4S, v25.4S, v31.s[0] +sub v25.4s, v16.4s, v14.4s +add v16.4s, v16.4s, v14.4s +str q10, [x0, #768] +str q28, [x0, #784] +str q16, [x0, #800] +str q25, [x0, #816] +ldr q25, [x17, #+1792] +ldr q16, [x17, #+1808] +ldr q28, [x17, #+1824] +ldr q10, [x17, #+1840] +ldr q14, [x17, #+1856] +ldr q15, [x17, #+1872] +ldr q21, [x17, #+1888] +ldr q8, 
[x17, #+1904] +ldr q26, [x0, #864] +ldr q1, [x0, #880] +ldr q9, [x0, #832] +ldr q30, [x0, #848] +sqrdmulh v18.4S, v26.4S, v16.s[0] +mul v26.4S, v26.4S,v25.s[0] +mla v26.4S, v18.4S, v31.s[0] +sub v18.4s, v9.4s, v26.4s +add v9.4s, v9.4s, v26.4s +sqrdmulh v26.4S, v1.4S, v16.s[0] +mul v1.4S, v1.4S,v25.s[0] +mla v1.4S, v26.4S, v31.s[0] +sub v26.4s, v30.4s, v1.4s +add v30.4s, v30.4s, v1.4s +sqrdmulh v1.4S, v30.4S, v16.s[1] +mul v30.4S, v30.4S,v25.s[1] +mla v30.4S, v1.4S, v31.s[0] +sub v1.4s, v9.4s, v30.4s +add v9.4s, v9.4s, v30.4s +sqrdmulh v30.4S, v26.4S, v16.s[2] +mul v26.4S, v26.4S,v25.s[2] +mla v26.4S, v30.4S, v31.s[0] +sub v30.4s, v18.4s, v26.4s +add v18.4s, v18.4s, v26.4s +trn1 v26.4S, v9.4S, v1.4S +trn2 v2.4S, v9.4S, v1.4S +trn1 v5.4S, v18.4S, v30.4S +trn2 v4.4S, v18.4S, v30.4S +trn2 v18.2D, v26.2D, v5.2D +trn2 v30.2D, v2.2D, v4.2D +trn1 v9.2D, v26.2D, v5.2D +trn1 v1.2D, v2.2D, v4.2D +sqrdmulh v4.4S, v18.4S, v10.4S +mul v18.4S, v18.4S,v28.4S +mla v18.4S, v4.4S, v31.s[0] +sub v4.4s, v9.4s, v18.4s +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v30.4S, v10.4S +mul v30.4S, v30.4S,v28.4S +mla v30.4S, v18.4S, v31.s[0] +sub v18.4s, v1.4s, v30.4s +add v1.4s, v1.4s, v30.4s +sqrdmulh v30.4S, v1.4S, v15.4S +mul v1.4S, v1.4S,v14.4S +mla v1.4S, v30.4S, v31.s[0] +sub v30.4s, v9.4s, v1.4s +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v18.4S, v8.4S +mul v18.4S, v18.4S,v21.4S +mla v18.4S, v1.4S, v31.s[0] +sub v1.4s, v4.4s, v18.4s +add v4.4s, v4.4s, v18.4s +str q9, [x0, #832] +str q30, [x0, #848] +str q4, [x0, #864] +str q1, [x0, #880] +ldr q1, [x17, #+1920] +ldr q4, [x17, #+1936] +ldr q30, [x17, #+1952] +ldr q9, [x17, #+1968] +ldr q18, [x17, #+1984] +ldr q2, [x17, #+2000] +ldr q5, [x17, #+2016] +ldr q26, [x17, #+2032] +ldr q8, [x0, #928] +ldr q21, [x0, #944] +ldr q15, [x0, #896] +ldr q14, [x0, #912] +sqrdmulh v10.4S, v8.4S, v4.s[0] +mul v8.4S, v8.4S,v1.s[0] +mla v8.4S, v10.4S, v31.s[0] +sub v10.4s, v15.4s, v8.4s +add v15.4s, v15.4s, v8.4s +sqrdmulh v8.4S, v21.4S, v4.s[0] +mul v21.4S, 
v21.4S,v1.s[0] +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v14.4s, v21.4s +add v14.4s, v14.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v4.s[1] +mul v14.4S, v14.4S,v1.s[1] +mla v14.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v14.4s +add v15.4s, v15.4s, v14.4s +sqrdmulh v14.4S, v8.4S, v4.s[2] +mul v8.4S, v8.4S,v1.s[2] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +trn1 v8.4S, v15.4S, v21.4S +trn2 v28.4S, v15.4S, v21.4S +trn1 v16.4S, v10.4S, v14.4S +trn2 v25.4S, v10.4S, v14.4S +trn2 v10.2D, v8.2D, v16.2D +trn2 v14.2D, v28.2D, v25.2D +trn1 v15.2D, v8.2D, v16.2D +trn1 v21.2D, v28.2D, v25.2D +sqrdmulh v25.4S, v10.4S, v9.4S +mul v10.4S, v10.4S,v30.4S +mla v10.4S, v25.4S, v31.s[0] +sub v25.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +sqrdmulh v10.4S, v14.4S, v9.4S +mul v14.4S, v14.4S,v30.4S +mla v14.4S, v10.4S, v31.s[0] +sub v10.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v2.4S +mul v21.4S, v21.4S,v18.4S +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v15.4s, v21.4s +add v15.4s, v15.4s, v21.4s +sqrdmulh v21.4S, v10.4S, v26.4S +mul v10.4S, v10.4S,v5.4S +mla v10.4S, v21.4S, v31.s[0] +sub v21.4s, v25.4s, v10.4s +add v25.4s, v25.4s, v10.4s +str q15, [x0, #896] +str q14, [x0, #912] +str q25, [x0, #928] +str q21, [x0, #944] +ldr q21, [x17, #+2048] +ldr q25, [x17, #+2064] +ldr q14, [x17, #+2080] +ldr q15, [x17, #+2096] +ldr q10, [x17, #+2112] +ldr q28, [x17, #+2128] +ldr q16, [x17, #+2144] +ldr q8, [x17, #+2160] +ldr q26, [x0, #992] +ldr q5, [x0, #1008] +ldr q2, [x0, #960] +ldr q18, [x0, #976] +sqrdmulh v9.4S, v26.4S, v25.s[0] +mul v26.4S, v26.4S,v21.s[0] +mla v26.4S, v9.4S, v31.s[0] +sub v9.4s, v2.4s, v26.4s +add v2.4s, v2.4s, v26.4s +sqrdmulh v26.4S, v5.4S, v25.s[0] +mul v5.4S, v5.4S,v21.s[0] +mla v5.4S, v26.4S, v31.s[0] +sub v26.4s, v18.4s, v5.4s +add v18.4s, v18.4s, v5.4s +sqrdmulh v5.4S, v18.4S, v25.s[1] +mul v18.4S, v18.4S,v21.s[1] +mla v18.4S, v5.4S, v31.s[0] +sub v5.4s, v2.4s, v18.4s +add v2.4s, v2.4s, v18.4s +sqrdmulh 
v18.4S, v26.4S, v25.s[2] +mul v26.4S, v26.4S,v21.s[2] +mla v26.4S, v18.4S, v31.s[0] +sub v18.4s, v9.4s, v26.4s +add v9.4s, v9.4s, v26.4s +trn1 v26.4S, v2.4S, v5.4S +trn2 v30.4S, v2.4S, v5.4S +trn1 v4.4S, v9.4S, v18.4S +trn2 v1.4S, v9.4S, v18.4S +trn2 v9.2D, v26.2D, v4.2D +trn2 v18.2D, v30.2D, v1.2D +trn1 v2.2D, v26.2D, v4.2D +trn1 v5.2D, v30.2D, v1.2D +sqrdmulh v1.4S, v9.4S, v15.4S +mul v9.4S, v9.4S,v14.4S +mla v9.4S, v1.4S, v31.s[0] +sub v1.4s, v2.4s, v9.4s +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v18.4S, v15.4S +mul v18.4S, v18.4S,v14.4S +mla v18.4S, v9.4S, v31.s[0] +sub v9.4s, v5.4s, v18.4s +add v5.4s, v5.4s, v18.4s +sqrdmulh v18.4S, v5.4S, v28.4S +mul v5.4S, v5.4S,v10.4S +mla v5.4S, v18.4S, v31.s[0] +sub v18.4s, v2.4s, v5.4s +add v2.4s, v2.4s, v5.4s +sqrdmulh v5.4S, v9.4S, v8.4S +mul v9.4S, v9.4S,v16.4S +mla v9.4S, v5.4S, v31.s[0] +sub v5.4s, v1.4s, v9.4s +add v1.4s, v1.4s, v9.4s +str q2, [x0, #960] +str q18, [x0, #976] +str q1, [x0, #992] +str q5, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_14_0.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_14_0.s new file mode 100644 index 0000000..dae0130 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_14_0.s @@ -0,0 +1,2506 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without 
restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, 
block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // 
Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 
// Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 
37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // 
Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, 
block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 
29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 +.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, 
block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 
+.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // 
Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_14_0 +.global _ntt_u32_full_neon_asm_var_4_4_14_0 +ntt_u32_full_neon_asm_var_4_4_14_0: +_ntt_u32_full_neon_asm_var_4_4_14_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, 
modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x0, #928] +ldr q29, [x17, #+0] +ldr q28, [x17, #+16] +sqrdmulh v27.4S, v30.4S, v28.s[0] +mul v30.4S, v30.4S,v29.s[0] +ldr q26, [x0, #992] +sqrdmulh v25.4S, v26.4S, v28.s[0] +mul v26.4S, v26.4S,v29.s[0] +ldr q24, [x0, #800] +sqrdmulh v23.4S, v24.4S, v28.s[0] +mul v24.4S, v24.4S,v29.s[0] +ldr q22, [x0, #864] +sqrdmulh v21.4S, v22.4S, v28.s[0] +mul v22.4S, v22.4S,v29.s[0] +ldr q20, [x0, #544] +mla v30.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v20.4S, v28.s[0] +ldr q19, [x0, #608] +mla v26.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v19.4S, v28.s[0] +nop +ldr q18, [x0, #672] +mla v24.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v18.4S, v28.s[0] +nop +ldr q17, [x0, #736] +mla v22.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v17.4S, v28.s[0] +nop +ldr q16, [x0, #416] +ldr q3, [x0, #480] +mul v20.4S, v20.4S,v29.s[0] +sub v2.4s, v16.4s, v30.4s +mul v19.4S, v19.4S,v29.s[0] +add v16.4s, v16.4s, v30.4s +ldr q30, [x0, #288] +ldr q1, [x0, #352] +mla v20.4S, v27.4S, v31.s[0] +sub v27.4s, v3.4s, v26.4s +mla v19.4S, v25.4S, v31.s[0] +add v3.4s, v3.4s, v26.4s +ldr q26, [x0, #32] +ldr q25, [x0, #96] +mul v18.4S, v18.4S,v29.s[0] +sub v0.4s, v30.4s, v24.4s +mul v17.4S, v17.4S,v29.s[0] +add v30.4s, v30.4s, v24.4s +ldr q24, [x0, #160] +ldr q15, [x0, #224] +mla v18.4S, v23.4S, v31.s[0] +sub v23.4s, v1.4s, v22.4s +mla v17.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v16.4S, v28.s[1] +nop +mul v16.4S, v16.4S,v29.s[1] +nop +sqrdmulh v21.4S, v3.4S, v28.s[1] +sub v14.4s, v26.4s, v20.4s +mul v3.4S, v3.4S,v29.s[1] +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v30.4S, v28.s[1] +sub v13.4s, v25.4s, v19.4s +mul v30.4S, v30.4S,v29.s[1] +add v25.4s, v25.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v28.s[1] +sub v12.4s, v24.4s, v18.4s +mul v1.4S, v1.4S,v29.s[1] +add v24.4s, v24.4s, v18.4s +mla v16.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v17.4s +sqrdmulh v18.4S, v2.4S, v28.s[2] +add v15.4s, v15.4s, v17.4s +mla v3.4S, v21.4S, v31.s[0] +nop 
+sqrdmulh v21.4S, v27.4S, v28.s[2] +nop +mla v30.4S, v20.4S, v31.s[0] +nop +sqrdmulh v20.4S, v0.4S, v28.s[2] +nop +mla v1.4S, v19.4S, v31.s[0] +nop +sqrdmulh v19.4S, v23.4S, v28.s[2] +nop +ldr q17, [x17, #+32] +ldr q11, [x17, #+48] +mul v2.4S, v2.4S,v29.s[2] +sub v10.4s, v24.4s, v16.4s +mul v27.4S, v27.4S,v29.s[2] +add v24.4s, v24.4s, v16.4s +mla v2.4S, v18.4S, v31.s[0] +sub v18.4s, v15.4s, v3.4s +mla v27.4S, v21.4S, v31.s[0] +add v15.4s, v15.4s, v3.4s +mul v0.4S, v0.4S,v29.s[2] +sub v3.4s, v26.4s, v30.4s +mul v23.4S, v23.4S,v29.s[2] +add v26.4s, v26.4s, v30.4s +mla v0.4S, v20.4S, v31.s[0] +sub v20.4s, v25.4s, v1.4s +mla v23.4S, v19.4S, v31.s[0] +add v25.4s, v25.4s, v1.4s +sqrdmulh v1.4S, v10.4S, v11.s[1] +nop +mul v10.4S, v10.4S,v17.s[1] +nop +sqrdmulh v19.4S, v18.4S, v11.s[1] +sub v30.4s, v12.4s, v2.4s +mul v18.4S, v18.4S,v17.s[1] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v24.4S, v11.s[0] +sub v21.4s, v22.4s, v27.4s +mul v24.4S, v24.4S,v17.s[0] +add v22.4s, v22.4s, v27.4s +sqrdmulh v27.4S, v15.4S, v11.s[0] +sub v16.4s, v14.4s, v0.4s +mul v15.4S, v15.4S,v17.s[0] +add v14.4s, v14.4s, v0.4s +ldr q0, [x17, #+64] +ldr q9, [x17, #+80] +mla v10.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v23.4s +sqrdmulh v8.4S, v12.4S, v11.s[2] +add v13.4s, v13.4s, v23.4s +mla v18.4S, v19.4S, v31.s[0] +nop +sqrdmulh v19.4S, v22.4S, v11.s[2] +nop +mla v24.4S, v2.4S, v31.s[0] +nop +sqrdmulh v2.4S, v30.4S, v11.s[3] +nop +mla v15.4S, v27.4S, v31.s[0] +nop +sqrdmulh v27.4S, v21.4S, v11.s[3] +nop +ldr q23, [x17, #+96] +ldr q7, [x17, #+112] +mul v12.4S, v12.4S,v17.s[2] +sub v6.4s, v3.4s, v10.4s +mul v22.4S, v22.4S,v17.s[2] +add v3.4s, v3.4s, v10.4s +mla v12.4S, v8.4S, v31.s[0] +sub v8.4s, v20.4s, v18.4s +mla v22.4S, v19.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +mul v30.4S, v30.4S,v17.s[3] +sub v18.4s, v26.4s, v24.4s +mul v21.4S, v21.4S,v17.s[3] +add v26.4s, v26.4s, v24.4s +mla v30.4S, v2.4S, v31.s[0] +sub v2.4s, v25.4s, v15.4s +mla v21.4S, v27.4S, v31.s[0] +add v25.4s, v25.4s, v15.4s +sqrdmulh 
v15.4S, v8.4S, v9.s[3] +nop +mul v8.4S, v8.4S,v0.s[3] +nop +sqrdmulh v27.4S, v20.4S, v9.s[2] +sub v24.4s, v14.4s, v12.4s +mul v20.4S, v20.4S,v0.s[2] +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v9.s[1] +sub v19.4s, v13.4s, v22.4s +mul v2.4S, v2.4S,v0.s[1] +add v13.4s, v13.4s, v22.4s +sqrdmulh v22.4S, v25.4S, v9.s[0] +sub v10.4s, v16.4s, v30.4s +mul v25.4S, v25.4S,v0.s[0] +add v16.4s, v16.4s, v30.4s +mla v8.4S, v15.4S, v31.s[0] +sub v15.4s, v1.4s, v21.4s +sqrdmulh v30.4S, v13.4S, v7.s[0] +add v1.4s, v1.4s, v21.4s +mla v20.4S, v27.4S, v31.s[0] +sub v27.4s, v6.4s, v8.4s +sqrdmulh v21.4S, v19.4S, v7.s[1] +add v6.4s, v6.4s, v8.4s +mla v2.4S, v12.4S, v31.s[0] +sub v12.4s, v3.4s, v20.4s +sqrdmulh v8.4S, v1.4S, v7.s[2] +add v3.4s, v3.4s, v20.4s +mla v25.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v2.4s +sqrdmulh v20.4S, v15.4S, v7.s[3] +add v18.4s, v18.4s, v2.4s +mul v13.4S, v13.4S,v23.s[0] +sub v2.4s, v26.4s, v25.4s +mul v19.4S, v19.4S,v23.s[1] +add v26.4s, v26.4s, v25.4s +mla v13.4S, v30.4S, v31.s[0] +str q12, [x0, #352] +mla v19.4S, v21.4S, v31.s[0] +str q3, [x0, #288] +mul v1.4S, v1.4S,v23.s[2] +str q27, [x0, #480] +mul v15.4S, v15.4S,v23.s[3] +str q6, [x0, #416] +mla v1.4S, v8.4S, v31.s[0] +str q22, [x0, #224] +mla v15.4S, v20.4S, v31.s[0] +str q18, [x0, #160] +ldr q18, [x0, #944] +sqrdmulh v20.4S, v18.4S, v28.s[0] +str q2, [x0, #96] +mul v18.4S, v18.4S,v29.s[0] +str q26, [x0, #32] +ldr q26, [x0, #1008] +sqrdmulh v2.4S, v26.4S, v28.s[0] +sub v22.4s, v14.4s, v13.4s +str q22, [x0, #608] +mul v26.4S, v26.4S,v29.s[0] +add v14.4s, v14.4s, v13.4s +ldr q13, [x0, #816] +sqrdmulh v22.4S, v13.4S, v28.s[0] +sub v8.4s, v24.4s, v19.4s +str q14, [x0, #544] +mul v13.4S, v13.4S,v29.s[0] +add v24.4s, v24.4s, v19.4s +ldr q19, [x0, #880] +sqrdmulh v14.4S, v19.4S, v28.s[0] +sub v6.4s, v16.4s, v1.4s +str q8, [x0, #736] +mul v19.4S, v19.4S,v29.s[0] +add v16.4s, v16.4s, v1.4s +ldr q1, [x0, #560] +mla v18.4S, v20.4S, v31.s[0] +sub v20.4s, v10.4s, v15.4s +str q24, [x0, #672] +sqrdmulh 
v24.4S, v1.4S, v28.s[0] +add v10.4s, v10.4s, v15.4s +ldr q15, [x0, #624] +mla v26.4S, v2.4S, v31.s[0] +str q6, [x0, #864] +sqrdmulh v6.4S, v15.4S, v28.s[0] +nop +ldr q2, [x0, #688] +mla v13.4S, v22.4S, v31.s[0] +str q16, [x0, #800] +sqrdmulh v16.4S, v2.4S, v28.s[0] +nop +ldr q22, [x0, #752] +mla v19.4S, v14.4S, v31.s[0] +str q20, [x0, #992] +sqrdmulh v20.4S, v22.4S, v28.s[0] +nop +ldr q14, [x0, #432] +ldr q8, [x0, #496] +mul v1.4S, v1.4S,v29.s[0] +sub v27.4s, v14.4s, v18.4s +str q10, [x0, #928] +mul v15.4S, v15.4S,v29.s[0] +add v14.4s, v14.4s, v18.4s +ldr q18, [x0, #304] +ldr q10, [x0, #368] +mla v1.4S, v24.4S, v31.s[0] +sub v24.4s, v8.4s, v26.4s +mla v15.4S, v6.4S, v31.s[0] +add v8.4s, v8.4s, v26.4s +ldr q26, [x0, #48] +ldr q6, [x0, #112] +mul v2.4S, v2.4S,v29.s[0] +sub v3.4s, v18.4s, v13.4s +mul v22.4S, v22.4S,v29.s[0] +add v18.4s, v18.4s, v13.4s +ldr q13, [x0, #176] +ldr q21, [x0, #240] +mla v2.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v19.4s +mla v22.4S, v20.4S, v31.s[0] +add v10.4s, v10.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v28.s[1] +nop +mul v14.4S, v14.4S,v29.s[1] +nop +sqrdmulh v20.4S, v8.4S, v28.s[1] +sub v12.4s, v26.4s, v1.4s +mul v8.4S, v8.4S,v29.s[1] +add v26.4s, v26.4s, v1.4s +sqrdmulh v1.4S, v18.4S, v28.s[1] +sub v30.4s, v6.4s, v15.4s +mul v18.4S, v18.4S,v29.s[1] +add v6.4s, v6.4s, v15.4s +sqrdmulh v15.4S, v10.4S, v28.s[1] +sub v25.4s, v13.4s, v2.4s +mul v10.4S, v10.4S,v29.s[1] +add v13.4s, v13.4s, v2.4s +mla v14.4S, v19.4S, v31.s[0] +sub v19.4s, v21.4s, v22.4s +sqrdmulh v2.4S, v27.4S, v28.s[2] +add v21.4s, v21.4s, v22.4s +mla v8.4S, v20.4S, v31.s[0] +nop +sqrdmulh v20.4S, v24.4S, v28.s[2] +nop +mla v18.4S, v1.4S, v31.s[0] +nop +sqrdmulh v1.4S, v3.4S, v28.s[2] +nop +mla v10.4S, v15.4S, v31.s[0] +nop +sqrdmulh v15.4S, v16.4S, v28.s[2] +nop +mul v27.4S, v27.4S,v29.s[2] +sub v22.4s, v13.4s, v14.4s +mul v24.4S, v24.4S,v29.s[2] +add v13.4s, v13.4s, v14.4s +mla v27.4S, v2.4S, v31.s[0] +sub v2.4s, v21.4s, v8.4s +mla v24.4S, v20.4S, v31.s[0] +add v21.4s, 
v21.4s, v8.4s +mul v3.4S, v3.4S,v29.s[2] +sub v8.4s, v26.4s, v18.4s +mul v16.4S, v16.4S,v29.s[2] +add v26.4s, v26.4s, v18.4s +mla v3.4S, v1.4S, v31.s[0] +sub v1.4s, v6.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add v6.4s, v6.4s, v10.4s +sqrdmulh v10.4S, v22.4S, v11.s[1] +nop +mul v22.4S, v22.4S,v17.s[1] +nop +sqrdmulh v15.4S, v2.4S, v11.s[1] +sub v18.4s, v25.4s, v27.4s +mul v2.4S, v2.4S,v17.s[1] +add v25.4s, v25.4s, v27.4s +sqrdmulh v27.4S, v13.4S, v11.s[0] +sub v20.4s, v19.4s, v24.4s +mul v13.4S, v13.4S,v17.s[0] +add v19.4s, v19.4s, v24.4s +sqrdmulh v24.4S, v21.4S, v11.s[0] +sub v14.4s, v12.4s, v3.4s +mul v21.4S, v21.4S,v17.s[0] +add v12.4s, v12.4s, v3.4s +mla v22.4S, v10.4S, v31.s[0] +sub v10.4s, v30.4s, v16.4s +sqrdmulh v3.4S, v25.4S, v11.s[2] +add v30.4s, v30.4s, v16.4s +mla v2.4S, v15.4S, v31.s[0] +nop +sqrdmulh v15.4S, v19.4S, v11.s[2] +nop +mla v13.4S, v27.4S, v31.s[0] +nop +sqrdmulh v27.4S, v18.4S, v11.s[3] +nop +mla v21.4S, v24.4S, v31.s[0] +nop +sqrdmulh v24.4S, v20.4S, v11.s[3] +nop +mul v25.4S, v25.4S,v17.s[2] +sub v16.4s, v8.4s, v22.4s +mul v19.4S, v19.4S,v17.s[2] +add v8.4s, v8.4s, v22.4s +mla v25.4S, v3.4S, v31.s[0] +sub v3.4s, v1.4s, v2.4s +mla v19.4S, v15.4S, v31.s[0] +add v1.4s, v1.4s, v2.4s +mul v18.4S, v18.4S,v17.s[3] +sub v2.4s, v26.4s, v13.4s +mul v20.4S, v20.4S,v17.s[3] +add v26.4s, v26.4s, v13.4s +mla v18.4S, v27.4S, v31.s[0] +sub v27.4s, v6.4s, v21.4s +mla v20.4S, v24.4S, v31.s[0] +add v6.4s, v6.4s, v21.4s +sqrdmulh v21.4S, v3.4S, v9.s[3] +nop +mul v3.4S, v3.4S,v0.s[3] +nop +sqrdmulh v24.4S, v1.4S, v9.s[2] +sub v13.4s, v12.4s, v25.4s +mul v1.4S, v1.4S,v0.s[2] +add v12.4s, v12.4s, v25.4s +sqrdmulh v25.4S, v27.4S, v9.s[1] +sub v15.4s, v30.4s, v19.4s +mul v27.4S, v27.4S,v0.s[1] +add v30.4s, v30.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v9.s[0] +sub v22.4s, v14.4s, v18.4s +mul v6.4S, v6.4S,v0.s[0] +add v14.4s, v14.4s, v18.4s +mla v3.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v20.4s +sqrdmulh v18.4S, v30.4S, v7.s[0] +add v10.4s, v10.4s, v20.4s +mla 
v1.4S, v24.4S, v31.s[0] +sub v24.4s, v16.4s, v3.4s +sqrdmulh v20.4S, v15.4S, v7.s[1] +add v16.4s, v16.4s, v3.4s +mla v27.4S, v25.4S, v31.s[0] +sub v25.4s, v8.4s, v1.4s +sqrdmulh v3.4S, v10.4S, v7.s[2] +add v8.4s, v8.4s, v1.4s +mla v6.4S, v19.4S, v31.s[0] +sub v19.4s, v2.4s, v27.4s +sqrdmulh v1.4S, v21.4S, v7.s[3] +add v2.4s, v2.4s, v27.4s +mul v30.4S, v30.4S,v23.s[0] +sub v27.4s, v26.4s, v6.4s +mul v15.4S, v15.4S,v23.s[1] +add v26.4s, v26.4s, v6.4s +mla v30.4S, v18.4S, v31.s[0] +str q25, [x0, #368] +mla v15.4S, v20.4S, v31.s[0] +str q8, [x0, #304] +mul v10.4S, v10.4S,v23.s[2] +str q24, [x0, #496] +mul v21.4S, v21.4S,v23.s[3] +str q16, [x0, #432] +mla v10.4S, v3.4S, v31.s[0] +str q19, [x0, #240] +mla v21.4S, v1.4S, v31.s[0] +str q2, [x0, #176] +ldr q2, [x0, #896] +sqrdmulh v1.4S, v2.4S, v28.s[0] +str q27, [x0, #112] +mul v2.4S, v2.4S,v29.s[0] +str q26, [x0, #48] +ldr q26, [x0, #960] +sqrdmulh v27.4S, v26.4S, v28.s[0] +sub v19.4s, v12.4s, v30.4s +str q19, [x0, #624] +mul v26.4S, v26.4S,v29.s[0] +add v12.4s, v12.4s, v30.4s +ldr q30, [x0, #768] +sqrdmulh v19.4S, v30.4S, v28.s[0] +sub v3.4s, v13.4s, v15.4s +str q12, [x0, #560] +mul v30.4S, v30.4S,v29.s[0] +add v13.4s, v13.4s, v15.4s +ldr q15, [x0, #832] +sqrdmulh v12.4S, v15.4S, v28.s[0] +sub v16.4s, v14.4s, v10.4s +str q3, [x0, #752] +mul v15.4S, v15.4S,v29.s[0] +add v14.4s, v14.4s, v10.4s +ldr q10, [x0, #512] +mla v2.4S, v1.4S, v31.s[0] +sub v1.4s, v22.4s, v21.4s +str q13, [x0, #688] +sqrdmulh v13.4S, v10.4S, v28.s[0] +add v22.4s, v22.4s, v21.4s +ldr q21, [x0, #576] +mla v26.4S, v27.4S, v31.s[0] +str q16, [x0, #880] +sqrdmulh v16.4S, v21.4S, v28.s[0] +nop +ldr q27, [x0, #640] +mla v30.4S, v19.4S, v31.s[0] +str q14, [x0, #816] +sqrdmulh v14.4S, v27.4S, v28.s[0] +nop +ldr q19, [x0, #704] +mla v15.4S, v12.4S, v31.s[0] +str q1, [x0, #1008] +sqrdmulh v1.4S, v19.4S, v28.s[0] +nop +ldr q12, [x0, #384] +ldr q3, [x0, #448] +mul v10.4S, v10.4S,v29.s[0] +sub v24.4s, v12.4s, v2.4s +str q22, [x0, #944] +mul v21.4S, v21.4S,v29.s[0] 
+add v12.4s, v12.4s, v2.4s +ldr q2, [x0, #256] +ldr q22, [x0, #320] +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v3.4s, v26.4s +mla v21.4S, v16.4S, v31.s[0] +add v3.4s, v3.4s, v26.4s +ldr q26, [x0, #0] +ldr q16, [x0, #64] +mul v27.4S, v27.4S,v29.s[0] +sub v8.4s, v2.4s, v30.4s +mul v19.4S, v19.4S,v29.s[0] +add v2.4s, v2.4s, v30.4s +ldr q30, [x0, #128] +ldr q20, [x0, #192] +mla v27.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v15.4s +mla v19.4S, v1.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v12.4S, v28.s[1] +nop +mul v12.4S, v12.4S,v29.s[1] +nop +sqrdmulh v1.4S, v3.4S, v28.s[1] +sub v25.4s, v26.4s, v10.4s +mul v3.4S, v3.4S,v29.s[1] +add v26.4s, v26.4s, v10.4s +sqrdmulh v10.4S, v2.4S, v28.s[1] +sub v18.4s, v16.4s, v21.4s +mul v2.4S, v2.4S,v29.s[1] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v28.s[1] +sub v6.4s, v30.4s, v27.4s +mul v22.4S, v22.4S,v29.s[1] +add v30.4s, v30.4s, v27.4s +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v19.4s +sqrdmulh v27.4S, v24.4S, v28.s[2] +add v20.4s, v20.4s, v19.4s +mla v3.4S, v1.4S, v31.s[0] +nop +sqrdmulh v1.4S, v13.4S, v28.s[2] +nop +mla v2.4S, v10.4S, v31.s[0] +nop +sqrdmulh v10.4S, v8.4S, v28.s[2] +nop +mla v22.4S, v21.4S, v31.s[0] +nop +sqrdmulh v21.4S, v14.4S, v28.s[2] +nop +mul v24.4S, v24.4S,v29.s[2] +sub v19.4s, v30.4s, v12.4s +mul v13.4S, v13.4S,v29.s[2] +add v30.4s, v30.4s, v12.4s +mla v24.4S, v27.4S, v31.s[0] +sub v27.4s, v20.4s, v3.4s +mla v13.4S, v1.4S, v31.s[0] +add v20.4s, v20.4s, v3.4s +mul v8.4S, v8.4S,v29.s[2] +sub v3.4s, v26.4s, v2.4s +mul v14.4S, v14.4S,v29.s[2] +add v26.4s, v26.4s, v2.4s +mla v8.4S, v10.4S, v31.s[0] +sub v10.4s, v16.4s, v22.4s +mla v14.4S, v21.4S, v31.s[0] +add v16.4s, v16.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v11.s[1] +nop +mul v19.4S, v19.4S,v17.s[1] +nop +sqrdmulh v21.4S, v27.4S, v11.s[1] +sub v2.4s, v6.4s, v24.4s +mul v27.4S, v27.4S,v17.s[1] +add v6.4s, v6.4s, v24.4s +sqrdmulh v24.4S, v30.4S, v11.s[0] +sub v1.4s, v15.4s, v13.4s +mul v30.4S, v30.4S,v17.s[0] +add v15.4s, 
v15.4s, v13.4s +sqrdmulh v13.4S, v20.4S, v11.s[0] +sub v12.4s, v25.4s, v8.4s +mul v20.4S, v20.4S,v17.s[0] +add v25.4s, v25.4s, v8.4s +mla v19.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v14.4s +sqrdmulh v8.4S, v6.4S, v11.s[2] +add v18.4s, v18.4s, v14.4s +mla v27.4S, v21.4S, v31.s[0] +nop +sqrdmulh v21.4S, v15.4S, v11.s[2] +nop +mla v30.4S, v24.4S, v31.s[0] +nop +sqrdmulh v24.4S, v2.4S, v11.s[3] +nop +mla v20.4S, v13.4S, v31.s[0] +nop +sqrdmulh v13.4S, v1.4S, v11.s[3] +nop +mul v6.4S, v6.4S,v17.s[2] +sub v14.4s, v3.4s, v19.4s +mul v15.4S, v15.4S,v17.s[2] +add v3.4s, v3.4s, v19.4s +mla v6.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v27.4s +mla v15.4S, v21.4S, v31.s[0] +add v10.4s, v10.4s, v27.4s +mul v2.4S, v2.4S,v17.s[3] +sub v27.4s, v26.4s, v30.4s +mul v1.4S, v1.4S,v17.s[3] +add v26.4s, v26.4s, v30.4s +mla v2.4S, v24.4S, v31.s[0] +sub v24.4s, v16.4s, v20.4s +mla v1.4S, v13.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v9.s[3] +nop +mul v8.4S, v8.4S,v0.s[3] +nop +sqrdmulh v13.4S, v10.4S, v9.s[2] +sub v30.4s, v25.4s, v6.4s +mul v10.4S, v10.4S,v0.s[2] +add v25.4s, v25.4s, v6.4s +sqrdmulh v6.4S, v24.4S, v9.s[1] +sub v21.4s, v18.4s, v15.4s +mul v24.4S, v24.4S,v0.s[1] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v9.s[0] +sub v19.4s, v12.4s, v2.4s +mul v16.4S, v16.4S,v0.s[0] +add v12.4s, v12.4s, v2.4s +mla v8.4S, v20.4S, v31.s[0] +sub v20.4s, v22.4s, v1.4s +sqrdmulh v2.4S, v18.4S, v7.s[0] +add v22.4s, v22.4s, v1.4s +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v14.4s, v8.4s +sqrdmulh v1.4S, v21.4S, v7.s[1] +add v14.4s, v14.4s, v8.4s +mla v24.4S, v6.4S, v31.s[0] +sub v6.4s, v3.4s, v10.4s +sqrdmulh v8.4S, v22.4S, v7.s[2] +add v3.4s, v3.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v27.4s, v24.4s +sqrdmulh v10.4S, v20.4S, v7.s[3] +add v27.4s, v27.4s, v24.4s +mul v18.4S, v18.4S,v23.s[0] +sub v24.4s, v26.4s, v16.4s +mul v21.4S, v21.4S,v23.s[1] +add v26.4s, v26.4s, v16.4s +mla v18.4S, v2.4S, v31.s[0] +str q6, [x0, #320] +mla v21.4S, v1.4S, v31.s[0] 
+str q3, [x0, #256] +mul v22.4S, v22.4S,v23.s[2] +str q13, [x0, #448] +mul v20.4S, v20.4S,v23.s[3] +str q14, [x0, #384] +mla v22.4S, v8.4S, v31.s[0] +str q15, [x0, #192] +mla v20.4S, v10.4S, v31.s[0] +str q27, [x0, #128] +ldr q27, [x0, #912] +sqrdmulh v10.4S, v27.4S, v28.s[0] +str q24, [x0, #64] +mul v27.4S, v27.4S,v29.s[0] +str q26, [x0, #0] +ldr q26, [x0, #976] +sqrdmulh v24.4S, v26.4S, v28.s[0] +sub v15.4s, v25.4s, v18.4s +str q15, [x0, #576] +mul v26.4S, v26.4S,v29.s[0] +add v25.4s, v25.4s, v18.4s +ldr q18, [x0, #784] +sqrdmulh v15.4S, v18.4S, v28.s[0] +sub v8.4s, v30.4s, v21.4s +str q25, [x0, #512] +mul v18.4S, v18.4S,v29.s[0] +add v30.4s, v30.4s, v21.4s +ldr q21, [x0, #848] +sqrdmulh v25.4S, v21.4S, v28.s[0] +sub v14.4s, v12.4s, v22.4s +str q8, [x0, #704] +mul v21.4S, v21.4S,v29.s[0] +add v12.4s, v12.4s, v22.4s +ldr q22, [x0, #528] +mla v27.4S, v10.4S, v31.s[0] +sub v10.4s, v19.4s, v20.4s +str q30, [x0, #640] +sqrdmulh v30.4S, v22.4S, v28.s[0] +add v19.4s, v19.4s, v20.4s +ldr q20, [x0, #592] +mla v26.4S, v24.4S, v31.s[0] +str q14, [x0, #832] +sqrdmulh v14.4S, v20.4S, v28.s[0] +nop +ldr q24, [x0, #656] +mla v18.4S, v15.4S, v31.s[0] +str q12, [x0, #768] +sqrdmulh v12.4S, v24.4S, v28.s[0] +nop +ldr q15, [x0, #720] +mla v21.4S, v25.4S, v31.s[0] +str q10, [x0, #960] +sqrdmulh v10.4S, v15.4S, v28.s[0] +nop +ldr q25, [x0, #400] +ldr q8, [x0, #464] +mul v22.4S, v22.4S,v29.s[0] +sub v13.4s, v25.4s, v27.4s +str q19, [x0, #896] +mul v20.4S, v20.4S,v29.s[0] +add v25.4s, v25.4s, v27.4s +ldr q27, [x0, #272] +ldr q19, [x0, #336] +mla v22.4S, v30.4S, v31.s[0] +sub v30.4s, v8.4s, v26.4s +mla v20.4S, v14.4S, v31.s[0] +add v8.4s, v8.4s, v26.4s +ldr q26, [x0, #16] +ldr q14, [x0, #80] +mul v24.4S, v24.4S,v29.s[0] +sub v3.4s, v27.4s, v18.4s +mul v15.4S, v15.4S,v29.s[0] +add v27.4s, v27.4s, v18.4s +ldr q18, [x0, #144] +ldr q1, [x0, #208] +mla v24.4S, v12.4S, v31.s[0] +sub v12.4s, v19.4s, v21.4s +mla v15.4S, v10.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +sqrdmulh v21.4S, v25.4S, 
v28.s[1] +nop +mul v25.4S, v25.4S,v29.s[1] +nop +sqrdmulh v10.4S, v8.4S, v28.s[1] +sub v6.4s, v26.4s, v22.4s +mul v8.4S, v8.4S,v29.s[1] +add v26.4s, v26.4s, v22.4s +sqrdmulh v22.4S, v27.4S, v28.s[1] +sub v2.4s, v14.4s, v20.4s +mul v27.4S, v27.4S,v29.s[1] +add v14.4s, v14.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v28.s[1] +sub v16.4s, v18.4s, v24.4s +mul v19.4S, v19.4S,v29.s[1] +add v18.4s, v18.4s, v24.4s +mla v25.4S, v21.4S, v31.s[0] +sub v21.4s, v1.4s, v15.4s +sqrdmulh v24.4S, v13.4S, v28.s[2] +add v1.4s, v1.4s, v15.4s +mla v8.4S, v10.4S, v31.s[0] +nop +sqrdmulh v10.4S, v30.4S, v28.s[2] +nop +mla v27.4S, v22.4S, v31.s[0] +nop +sqrdmulh v22.4S, v3.4S, v28.s[2] +nop +mla v19.4S, v20.4S, v31.s[0] +nop +sqrdmulh v20.4S, v12.4S, v28.s[2] +nop +mul v13.4S, v13.4S,v29.s[2] +sub v15.4s, v18.4s, v25.4s +mul v30.4S, v30.4S,v29.s[2] +add v18.4s, v18.4s, v25.4s +mla v13.4S, v24.4S, v31.s[0] +sub v24.4s, v1.4s, v8.4s +mla v30.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +mul v3.4S, v3.4S,v29.s[2] +sub v8.4s, v26.4s, v27.4s +mul v12.4S, v12.4S,v29.s[2] +add v26.4s, v26.4s, v27.4s +mla v3.4S, v22.4S, v31.s[0] +sub v22.4s, v14.4s, v19.4s +mla v12.4S, v20.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v28.4S, v15.4S, v11.s[1] +nop +mul v15.4S, v15.4S,v17.s[1] +nop +sqrdmulh v29.4S, v24.4S, v11.s[1] +sub v19.4s, v16.4s, v13.4s +mul v24.4S, v24.4S,v17.s[1] +add v16.4s, v16.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v11.s[0] +sub v20.4s, v21.4s, v30.4s +mul v18.4S, v18.4S,v17.s[0] +add v21.4s, v21.4s, v30.4s +sqrdmulh v30.4S, v1.4S, v11.s[0] +sub v27.4s, v6.4s, v3.4s +mul v1.4S, v1.4S,v17.s[0] +add v6.4s, v6.4s, v3.4s +mla v15.4S, v28.4S, v31.s[0] +sub v28.4s, v2.4s, v12.4s +sqrdmulh v3.4S, v16.4S, v11.s[2] +add v2.4s, v2.4s, v12.4s +mla v24.4S, v29.4S, v31.s[0] +nop +sqrdmulh v29.4S, v21.4S, v11.s[2] +nop +mla v18.4S, v13.4S, v31.s[0] +nop +sqrdmulh v13.4S, v19.4S, v11.s[3] +nop +mla v1.4S, v30.4S, v31.s[0] +nop +sqrdmulh v30.4S, v20.4S, v11.s[3] +nop +mul v16.4S, v16.4S,v17.s[2] +sub 
v12.4s, v8.4s, v15.4s +mul v21.4S, v21.4S,v17.s[2] +add v8.4s, v8.4s, v15.4s +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v22.4s, v24.4s +mla v21.4S, v29.4S, v31.s[0] +add v22.4s, v22.4s, v24.4s +mul v19.4S, v19.4S,v17.s[3] +sub v24.4s, v26.4s, v18.4s +mul v20.4S, v20.4S,v17.s[3] +add v26.4s, v26.4s, v18.4s +mla v19.4S, v13.4S, v31.s[0] +sub v13.4s, v14.4s, v1.4s +mla v20.4S, v30.4S, v31.s[0] +add v14.4s, v14.4s, v1.4s +sqrdmulh v11.4S, v3.4S, v9.s[3] +nop +mul v3.4S, v3.4S,v0.s[3] +nop +sqrdmulh v17.4S, v22.4S, v9.s[2] +sub v1.4s, v6.4s, v16.4s +mul v22.4S, v22.4S,v0.s[2] +add v6.4s, v6.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v9.s[1] +sub v30.4s, v2.4s, v21.4s +mul v13.4S, v13.4S,v0.s[1] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v9.s[0] +sub v18.4s, v27.4s, v19.4s +mul v14.4S, v14.4S,v0.s[0] +add v27.4s, v27.4s, v19.4s +mla v3.4S, v11.4S, v31.s[0] +sub v11.4s, v28.4s, v20.4s +sqrdmulh v9.4S, v2.4S, v7.s[0] +add v28.4s, v28.4s, v20.4s +mla v22.4S, v17.4S, v31.s[0] +sub v17.4s, v12.4s, v3.4s +sqrdmulh v20.4S, v30.4S, v7.s[1] +add v12.4s, v12.4s, v3.4s +mla v13.4S, v16.4S, v31.s[0] +sub v16.4s, v8.4s, v22.4s +sqrdmulh v3.4S, v28.4S, v7.s[2] +add v8.4s, v8.4s, v22.4s +mla v14.4S, v21.4S, v31.s[0] +sub v21.4s, v24.4s, v13.4s +sqrdmulh v22.4S, v11.4S, v7.s[3] +add v24.4s, v24.4s, v13.4s +mul v2.4S, v2.4S,v23.s[0] +sub v13.4s, v26.4s, v14.4s +mul v30.4S, v30.4S,v23.s[1] +add v26.4s, v26.4s, v14.4s +mla v2.4S, v9.4S, v31.s[0] +str q16, [x0, #336] +mla v30.4S, v20.4S, v31.s[0] +str q8, [x0, #272] +mul v28.4S, v28.4S,v23.s[2] +str q17, [x0, #464] +mul v11.4S, v11.4S,v23.s[3] +str q12, [x0, #400] +mla v28.4S, v3.4S, v31.s[0] +str q21, [x0, #208] +mla v11.4S, v22.4S, v31.s[0] +str q24, [x0, #144] +str q13, [x0, #80] +str q26, [x0, #16] +sub v26.4s, v6.4s, v2.4s +str q26, [x0, #592] +add v6.4s, v6.4s, v2.4s +sub v2.4s, v1.4s, v30.4s +str q6, [x0, #528] +add v1.4s, v1.4s, v30.4s +sub v30.4s, v27.4s, v28.4s +str q2, [x0, #720] +add v27.4s, v27.4s, v28.4s +sub v28.4s, v18.4s, 
v11.4s +str q1, [x0, #656] +add v18.4s, v18.4s, v11.4s +str q30, [x0, #848] +str q27, [x0, #784] +str q28, [x0, #976] +str q18, [x0, #912] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q25, [x17, #+160] +ldr q10, [x17, #+176] +ldr q15, [x17, #+192] +ldr q29, [x17, #+208] +ldr q19, [x17, #+224] +ldr q0, [x17, #+240] +ldr q14, [x0, #32] +ldr q9, [x0, #48] +ldr q16, [x0, #0] +ldr q20, [x0, #16] +sqrdmulh v8.4S, v14.4S, v5.s[0] +mul v14.4S, v14.4S,v4.s[0] +mla v14.4S, v8.4S, v31.s[0] +sub v8.4s, v16.4s, v14.4s +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v9.4S, v5.s[0] +mul v9.4S, v9.4S,v4.s[0] +mla v9.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v9.4s +add v20.4s, v20.4s, v9.4s +sqrdmulh v9.4S, v20.4S, v5.s[1] +mul v20.4S, v20.4S,v4.s[1] +mla v20.4S, v9.4S, v31.s[0] +sub v9.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v14.4S, v5.s[2] +mul v14.4S, v14.4S,v4.s[2] +mla v14.4S, v20.4S, v31.s[0] +sub v20.4s, v8.4s, v14.4s +add v8.4s, v8.4s, v14.4s +trn1 v14.4S, v16.4S, v9.4S +trn2 v17.4S, v16.4S, v9.4S +trn1 v12.4S, v8.4S, v20.4S +trn2 v3.4S, v8.4S, v20.4S +trn2 v8.2D, v14.2D, v12.2D +trn2 v20.2D, v17.2D, v3.2D +trn1 v16.2D, v14.2D, v12.2D +trn1 v9.2D, v17.2D, v3.2D +sqrdmulh v3.4S, v8.4S, v10.4S +mul v8.4S, v8.4S,v25.4S +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v16.4s, v8.4s +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v20.4S, v10.4S +mul v20.4S, v20.4S,v25.4S +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v9.4s, v20.4s +add v9.4s, v9.4s, v20.4s +sqrdmulh v20.4S, v9.4S, v29.4S +mul v9.4S, v9.4S,v15.4S +mla v9.4S, v20.4S, v31.s[0] +sub v20.4s, v16.4s, v9.4s +add v16.4s, v16.4s, v9.4s +sqrdmulh v9.4S, v8.4S, v0.4S +mul v8.4S, v8.4S,v19.4S +mla v8.4S, v9.4S, v31.s[0] +sub v9.4s, v3.4s, v8.4s +add v3.4s, v3.4s, v8.4s +str q16, [x0, #0] +str q20, [x0, #16] +str q3, [x0, #32] +str q9, [x0, #48] +ldr q9, [x17, #+256] +ldr q3, [x17, #+272] +ldr q20, [x17, #+288] +ldr q16, [x17, #+304] +ldr q8, [x17, #+320] +ldr q17, [x17, #+336] +ldr q12, [x17, #+352] +ldr q14, [x17, 
#+368] +ldr q0, [x0, #96] +ldr q19, [x0, #112] +ldr q29, [x0, #64] +ldr q15, [x0, #80] +sqrdmulh v10.4S, v0.4S, v3.s[0] +mul v0.4S, v0.4S,v9.s[0] +mla v0.4S, v10.4S, v31.s[0] +sub v10.4s, v29.4s, v0.4s +add v29.4s, v29.4s, v0.4s +sqrdmulh v0.4S, v19.4S, v3.s[0] +mul v19.4S, v19.4S,v9.s[0] +mla v19.4S, v0.4S, v31.s[0] +sub v0.4s, v15.4s, v19.4s +add v15.4s, v15.4s, v19.4s +sqrdmulh v19.4S, v15.4S, v3.s[1] +mul v15.4S, v15.4S,v9.s[1] +mla v15.4S, v19.4S, v31.s[0] +sub v19.4s, v29.4s, v15.4s +add v29.4s, v29.4s, v15.4s +sqrdmulh v15.4S, v0.4S, v3.s[2] +mul v0.4S, v0.4S,v9.s[2] +mla v0.4S, v15.4S, v31.s[0] +sub v15.4s, v10.4s, v0.4s +add v10.4s, v10.4s, v0.4s +trn1 v0.4S, v29.4S, v19.4S +trn2 v25.4S, v29.4S, v19.4S +trn1 v5.4S, v10.4S, v15.4S +trn2 v4.4S, v10.4S, v15.4S +trn2 v10.2D, v0.2D, v5.2D +trn2 v15.2D, v25.2D, v4.2D +trn1 v29.2D, v0.2D, v5.2D +trn1 v19.2D, v25.2D, v4.2D +sqrdmulh v4.4S, v10.4S, v16.4S +mul v10.4S, v10.4S,v20.4S +mla v10.4S, v4.4S, v31.s[0] +sub v4.4s, v29.4s, v10.4s +add v29.4s, v29.4s, v10.4s +sqrdmulh v10.4S, v15.4S, v16.4S +mul v15.4S, v15.4S,v20.4S +mla v15.4S, v10.4S, v31.s[0] +sub v10.4s, v19.4s, v15.4s +add v19.4s, v19.4s, v15.4s +sqrdmulh v15.4S, v19.4S, v17.4S +mul v19.4S, v19.4S,v8.4S +mla v19.4S, v15.4S, v31.s[0] +sub v15.4s, v29.4s, v19.4s +add v29.4s, v29.4s, v19.4s +sqrdmulh v19.4S, v10.4S, v14.4S +mul v10.4S, v10.4S,v12.4S +mla v10.4S, v19.4S, v31.s[0] +sub v19.4s, v4.4s, v10.4s +add v4.4s, v4.4s, v10.4s +str q29, [x0, #64] +str q15, [x0, #80] +str q4, [x0, #96] +str q19, [x0, #112] +ldr q19, [x17, #+384] +ldr q4, [x17, #+400] +ldr q15, [x17, #+416] +ldr q29, [x17, #+432] +ldr q10, [x17, #+448] +ldr q25, [x17, #+464] +ldr q5, [x17, #+480] +ldr q0, [x17, #+496] +ldr q14, [x0, #160] +ldr q12, [x0, #176] +ldr q17, [x0, #128] +ldr q8, [x0, #144] +sqrdmulh v16.4S, v14.4S, v4.s[0] +mul v14.4S, v14.4S,v19.s[0] +mla v14.4S, v16.4S, v31.s[0] +sub v16.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +sqrdmulh v14.4S, v12.4S, v4.s[0] +mul 
v12.4S, v12.4S,v19.s[0] +mla v12.4S, v14.4S, v31.s[0] +sub v14.4s, v8.4s, v12.4s +add v8.4s, v8.4s, v12.4s +sqrdmulh v12.4S, v8.4S, v4.s[1] +mul v8.4S, v8.4S,v19.s[1] +mla v8.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v8.4s +add v17.4s, v17.4s, v8.4s +sqrdmulh v8.4S, v14.4S, v4.s[2] +mul v14.4S, v14.4S,v19.s[2] +mla v14.4S, v8.4S, v31.s[0] +sub v8.4s, v16.4s, v14.4s +add v16.4s, v16.4s, v14.4s +trn1 v14.4S, v17.4S, v12.4S +trn2 v20.4S, v17.4S, v12.4S +trn1 v3.4S, v16.4S, v8.4S +trn2 v9.4S, v16.4S, v8.4S +trn2 v16.2D, v14.2D, v3.2D +trn2 v8.2D, v20.2D, v9.2D +trn1 v17.2D, v14.2D, v3.2D +trn1 v12.2D, v20.2D, v9.2D +sqrdmulh v9.4S, v16.4S, v29.4S +mul v16.4S, v16.4S,v15.4S +mla v16.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v16.4s +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v29.4S +mul v8.4S, v8.4S,v15.4S +mla v8.4S, v16.4S, v31.s[0] +sub v16.4s, v12.4s, v8.4s +add v12.4s, v12.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v25.4S +mul v12.4S, v12.4S,v10.4S +mla v12.4S, v8.4S, v31.s[0] +sub v8.4s, v17.4s, v12.4s +add v17.4s, v17.4s, v12.4s +sqrdmulh v12.4S, v16.4S, v0.4S +mul v16.4S, v16.4S,v5.4S +mla v16.4S, v12.4S, v31.s[0] +sub v12.4s, v9.4s, v16.4s +add v9.4s, v9.4s, v16.4s +str q17, [x0, #128] +str q8, [x0, #144] +str q9, [x0, #160] +str q12, [x0, #176] +ldr q12, [x17, #+512] +ldr q9, [x17, #+528] +ldr q8, [x17, #+544] +ldr q17, [x17, #+560] +ldr q16, [x17, #+576] +ldr q20, [x17, #+592] +ldr q3, [x17, #+608] +ldr q14, [x17, #+624] +ldr q0, [x0, #224] +ldr q5, [x0, #240] +ldr q25, [x0, #192] +ldr q10, [x0, #208] +sqrdmulh v29.4S, v0.4S, v9.s[0] +mul v0.4S, v0.4S,v12.s[0] +mla v0.4S, v29.4S, v31.s[0] +sub v29.4s, v25.4s, v0.4s +add v25.4s, v25.4s, v0.4s +sqrdmulh v0.4S, v5.4S, v9.s[0] +mul v5.4S, v5.4S,v12.s[0] +mla v5.4S, v0.4S, v31.s[0] +sub v0.4s, v10.4s, v5.4s +add v10.4s, v10.4s, v5.4s +sqrdmulh v5.4S, v10.4S, v9.s[1] +mul v10.4S, v10.4S,v12.s[1] +mla v10.4S, v5.4S, v31.s[0] +sub v5.4s, v25.4s, v10.4s +add v25.4s, v25.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v9.s[2] +mul 
v0.4S, v0.4S,v12.s[2] +mla v0.4S, v10.4S, v31.s[0] +sub v10.4s, v29.4s, v0.4s +add v29.4s, v29.4s, v0.4s +trn1 v0.4S, v25.4S, v5.4S +trn2 v15.4S, v25.4S, v5.4S +trn1 v4.4S, v29.4S, v10.4S +trn2 v19.4S, v29.4S, v10.4S +trn2 v29.2D, v0.2D, v4.2D +trn2 v10.2D, v15.2D, v19.2D +trn1 v25.2D, v0.2D, v4.2D +trn1 v5.2D, v15.2D, v19.2D +sqrdmulh v19.4S, v29.4S, v17.4S +mul v29.4S, v29.4S,v8.4S +mla v29.4S, v19.4S, v31.s[0] +sub v19.4s, v25.4s, v29.4s +add v25.4s, v25.4s, v29.4s +sqrdmulh v29.4S, v10.4S, v17.4S +mul v10.4S, v10.4S,v8.4S +mla v10.4S, v29.4S, v31.s[0] +sub v29.4s, v5.4s, v10.4s +add v5.4s, v5.4s, v10.4s +sqrdmulh v10.4S, v5.4S, v20.4S +mul v5.4S, v5.4S,v16.4S +mla v5.4S, v10.4S, v31.s[0] +sub v10.4s, v25.4s, v5.4s +add v25.4s, v25.4s, v5.4s +sqrdmulh v5.4S, v29.4S, v14.4S +mul v29.4S, v29.4S,v3.4S +mla v29.4S, v5.4S, v31.s[0] +sub v5.4s, v19.4s, v29.4s +add v19.4s, v19.4s, v29.4s +str q25, [x0, #192] +str q10, [x0, #208] +str q19, [x0, #224] +str q5, [x0, #240] +ldr q5, [x17, #+640] +ldr q19, [x17, #+656] +ldr q10, [x17, #+672] +ldr q25, [x17, #+688] +ldr q29, [x17, #+704] +ldr q15, [x17, #+720] +ldr q4, [x17, #+736] +ldr q0, [x17, #+752] +ldr q14, [x0, #288] +ldr q3, [x0, #304] +ldr q20, [x0, #256] +ldr q16, [x0, #272] +sqrdmulh v17.4S, v14.4S, v19.s[0] +mul v14.4S, v14.4S,v5.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v14.4s +add v20.4s, v20.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v19.s[0] +mul v3.4S, v3.4S,v5.s[0] +mla v3.4S, v14.4S, v31.s[0] +sub v14.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v16.4S, v19.s[1] +mul v16.4S, v16.4S,v5.s[1] +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v16.4s +add v20.4s, v20.4s, v16.4s +sqrdmulh v16.4S, v14.4S, v19.s[2] +mul v14.4S, v14.4S,v5.s[2] +mla v14.4S, v16.4S, v31.s[0] +sub v16.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +trn1 v14.4S, v20.4S, v3.4S +trn2 v8.4S, v20.4S, v3.4S +trn1 v9.4S, v17.4S, v16.4S +trn2 v12.4S, v17.4S, v16.4S +trn2 v17.2D, v14.2D, v9.2D +trn2 v16.2D, v8.2D, v12.2D 
+trn1 v20.2D, v14.2D, v9.2D +trn1 v3.2D, v8.2D, v12.2D +sqrdmulh v12.4S, v17.4S, v25.4S +mul v17.4S, v17.4S,v10.4S +mla v17.4S, v12.4S, v31.s[0] +sub v12.4s, v20.4s, v17.4s +add v20.4s, v20.4s, v17.4s +sqrdmulh v17.4S, v16.4S, v25.4S +mul v16.4S, v16.4S,v10.4S +mla v16.4S, v17.4S, v31.s[0] +sub v17.4s, v3.4s, v16.4s +add v3.4s, v3.4s, v16.4s +sqrdmulh v16.4S, v3.4S, v15.4S +mul v3.4S, v3.4S,v29.4S +mla v3.4S, v16.4S, v31.s[0] +sub v16.4s, v20.4s, v3.4s +add v20.4s, v20.4s, v3.4s +sqrdmulh v3.4S, v17.4S, v0.4S +mul v17.4S, v17.4S,v4.4S +mla v17.4S, v3.4S, v31.s[0] +sub v3.4s, v12.4s, v17.4s +add v12.4s, v12.4s, v17.4s +str q20, [x0, #256] +str q16, [x0, #272] +str q12, [x0, #288] +str q3, [x0, #304] +ldr q3, [x17, #+768] +ldr q12, [x17, #+784] +ldr q16, [x17, #+800] +ldr q20, [x17, #+816] +ldr q17, [x17, #+832] +ldr q8, [x17, #+848] +ldr q9, [x17, #+864] +ldr q14, [x17, #+880] +ldr q0, [x0, #352] +ldr q4, [x0, #368] +ldr q15, [x0, #320] +ldr q29, [x0, #336] +sqrdmulh v25.4S, v0.4S, v12.s[0] +mul v0.4S, v0.4S,v3.s[0] +mla v0.4S, v25.4S, v31.s[0] +sub v25.4s, v15.4s, v0.4s +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v4.4S, v12.s[0] +mul v4.4S, v4.4S,v3.s[0] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v29.4s, v4.4s +add v29.4s, v29.4s, v4.4s +sqrdmulh v4.4S, v29.4S, v12.s[1] +mul v29.4S, v29.4S,v3.s[1] +mla v29.4S, v4.4S, v31.s[0] +sub v4.4s, v15.4s, v29.4s +add v15.4s, v15.4s, v29.4s +sqrdmulh v29.4S, v0.4S, v12.s[2] +mul v0.4S, v0.4S,v3.s[2] +mla v0.4S, v29.4S, v31.s[0] +sub v29.4s, v25.4s, v0.4s +add v25.4s, v25.4s, v0.4s +trn1 v0.4S, v15.4S, v4.4S +trn2 v10.4S, v15.4S, v4.4S +trn1 v19.4S, v25.4S, v29.4S +trn2 v5.4S, v25.4S, v29.4S +trn2 v25.2D, v0.2D, v19.2D +trn2 v29.2D, v10.2D, v5.2D +trn1 v15.2D, v0.2D, v19.2D +trn1 v4.2D, v10.2D, v5.2D +sqrdmulh v5.4S, v25.4S, v20.4S +mul v25.4S, v25.4S,v16.4S +mla v25.4S, v5.4S, v31.s[0] +sub v5.4s, v15.4s, v25.4s +add v15.4s, v15.4s, v25.4s +sqrdmulh v25.4S, v29.4S, v20.4S +mul v29.4S, v29.4S,v16.4S +mla v29.4S, v25.4S, v31.s[0] 
+sub v25.4s, v4.4s, v29.4s +add v4.4s, v4.4s, v29.4s +sqrdmulh v29.4S, v4.4S, v8.4S +mul v4.4S, v4.4S,v17.4S +mla v4.4S, v29.4S, v31.s[0] +sub v29.4s, v15.4s, v4.4s +add v15.4s, v15.4s, v4.4s +sqrdmulh v4.4S, v25.4S, v14.4S +mul v25.4S, v25.4S,v9.4S +mla v25.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v25.4s +add v5.4s, v5.4s, v25.4s +str q15, [x0, #320] +str q29, [x0, #336] +str q5, [x0, #352] +str q4, [x0, #368] +ldr q4, [x17, #+896] +ldr q5, [x17, #+912] +ldr q29, [x17, #+928] +ldr q15, [x17, #+944] +ldr q25, [x17, #+960] +ldr q10, [x17, #+976] +ldr q19, [x17, #+992] +ldr q0, [x17, #+1008] +ldr q14, [x0, #416] +ldr q9, [x0, #432] +ldr q8, [x0, #384] +ldr q17, [x0, #400] +sqrdmulh v20.4S, v14.4S, v5.s[0] +mul v14.4S, v14.4S,v4.s[0] +mla v14.4S, v20.4S, v31.s[0] +sub v20.4s, v8.4s, v14.4s +add v8.4s, v8.4s, v14.4s +sqrdmulh v14.4S, v9.4S, v5.s[0] +mul v9.4S, v9.4S,v4.s[0] +mla v9.4S, v14.4S, v31.s[0] +sub v14.4s, v17.4s, v9.4s +add v17.4s, v17.4s, v9.4s +sqrdmulh v9.4S, v17.4S, v5.s[1] +mul v17.4S, v17.4S,v4.s[1] +mla v17.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v17.4s +add v8.4s, v8.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v5.s[2] +mul v14.4S, v14.4S,v4.s[2] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v14.4s +add v20.4s, v20.4s, v14.4s +trn1 v14.4S, v8.4S, v9.4S +trn2 v16.4S, v8.4S, v9.4S +trn1 v12.4S, v20.4S, v17.4S +trn2 v3.4S, v20.4S, v17.4S +trn2 v20.2D, v14.2D, v12.2D +trn2 v17.2D, v16.2D, v3.2D +trn1 v8.2D, v14.2D, v12.2D +trn1 v9.2D, v16.2D, v3.2D +sqrdmulh v3.4S, v20.4S, v15.4S +mul v20.4S, v20.4S,v29.4S +mla v20.4S, v3.4S, v31.s[0] +sub v3.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v15.4S +mul v17.4S, v17.4S,v29.4S +mla v17.4S, v20.4S, v31.s[0] +sub v20.4s, v9.4s, v17.4s +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v9.4S, v10.4S +mul v9.4S, v9.4S,v25.4S +mla v9.4S, v17.4S, v31.s[0] +sub v17.4s, v8.4s, v9.4s +add v8.4s, v8.4s, v9.4s +sqrdmulh v9.4S, v20.4S, v0.4S +mul v20.4S, v20.4S,v19.4S +mla v20.4S, v9.4S, v31.s[0] +sub v9.4s, 
v3.4s, v20.4s +add v3.4s, v3.4s, v20.4s +str q8, [x0, #384] +str q17, [x0, #400] +str q3, [x0, #416] +str q9, [x0, #432] +ldr q9, [x17, #+1024] +ldr q3, [x17, #+1040] +ldr q17, [x17, #+1056] +ldr q8, [x17, #+1072] +ldr q20, [x17, #+1088] +ldr q16, [x17, #+1104] +ldr q12, [x17, #+1120] +ldr q14, [x17, #+1136] +ldr q0, [x0, #480] +ldr q19, [x0, #496] +ldr q10, [x0, #448] +ldr q25, [x0, #464] +sqrdmulh v15.4S, v0.4S, v3.s[0] +mul v0.4S, v0.4S,v9.s[0] +mla v0.4S, v15.4S, v31.s[0] +sub v15.4s, v10.4s, v0.4s +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v19.4S, v3.s[0] +mul v19.4S, v19.4S,v9.s[0] +mla v19.4S, v0.4S, v31.s[0] +sub v0.4s, v25.4s, v19.4s +add v25.4s, v25.4s, v19.4s +sqrdmulh v19.4S, v25.4S, v3.s[1] +mul v25.4S, v25.4S,v9.s[1] +mla v25.4S, v19.4S, v31.s[0] +sub v19.4s, v10.4s, v25.4s +add v10.4s, v10.4s, v25.4s +sqrdmulh v25.4S, v0.4S, v3.s[2] +mul v0.4S, v0.4S,v9.s[2] +mla v0.4S, v25.4S, v31.s[0] +sub v25.4s, v15.4s, v0.4s +add v15.4s, v15.4s, v0.4s +trn1 v0.4S, v10.4S, v19.4S +trn2 v29.4S, v10.4S, v19.4S +trn1 v5.4S, v15.4S, v25.4S +trn2 v4.4S, v15.4S, v25.4S +trn2 v15.2D, v0.2D, v5.2D +trn2 v25.2D, v29.2D, v4.2D +trn1 v10.2D, v0.2D, v5.2D +trn1 v19.2D, v29.2D, v4.2D +sqrdmulh v4.4S, v15.4S, v8.4S +mul v15.4S, v15.4S,v17.4S +mla v15.4S, v4.4S, v31.s[0] +sub v4.4s, v10.4s, v15.4s +add v10.4s, v10.4s, v15.4s +sqrdmulh v15.4S, v25.4S, v8.4S +mul v25.4S, v25.4S,v17.4S +mla v25.4S, v15.4S, v31.s[0] +sub v15.4s, v19.4s, v25.4s +add v19.4s, v19.4s, v25.4s +sqrdmulh v25.4S, v19.4S, v16.4S +mul v19.4S, v19.4S,v20.4S +mla v19.4S, v25.4S, v31.s[0] +sub v25.4s, v10.4s, v19.4s +add v10.4s, v10.4s, v19.4s +sqrdmulh v19.4S, v15.4S, v14.4S +mul v15.4S, v15.4S,v12.4S +mla v15.4S, v19.4S, v31.s[0] +sub v19.4s, v4.4s, v15.4s +add v4.4s, v4.4s, v15.4s +str q10, [x0, #448] +str q25, [x0, #464] +str q4, [x0, #480] +str q19, [x0, #496] +ldr q19, [x17, #+1152] +ldr q4, [x17, #+1168] +ldr q25, [x17, #+1184] +ldr q10, [x17, #+1200] +ldr q15, [x17, #+1216] +ldr q29, [x17, #+1232] +ldr 
q5, [x17, #+1248] +ldr q0, [x17, #+1264] +ldr q14, [x0, #544] +ldr q12, [x0, #560] +ldr q16, [x0, #512] +ldr q20, [x0, #528] +sqrdmulh v8.4S, v14.4S, v4.s[0] +mul v14.4S, v14.4S,v19.s[0] +mla v14.4S, v8.4S, v31.s[0] +sub v8.4s, v16.4s, v14.4s +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v12.4S, v4.s[0] +mul v12.4S, v12.4S,v19.s[0] +mla v12.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +sqrdmulh v12.4S, v20.4S, v4.s[1] +mul v20.4S, v20.4S,v19.s[1] +mla v20.4S, v12.4S, v31.s[0] +sub v12.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v14.4S, v4.s[2] +mul v14.4S, v14.4S,v19.s[2] +mla v14.4S, v20.4S, v31.s[0] +sub v20.4s, v8.4s, v14.4s +add v8.4s, v8.4s, v14.4s +trn1 v14.4S, v16.4S, v12.4S +trn2 v17.4S, v16.4S, v12.4S +trn1 v3.4S, v8.4S, v20.4S +trn2 v9.4S, v8.4S, v20.4S +trn2 v8.2D, v14.2D, v3.2D +trn2 v20.2D, v17.2D, v9.2D +trn1 v16.2D, v14.2D, v3.2D +trn1 v12.2D, v17.2D, v9.2D +sqrdmulh v9.4S, v8.4S, v10.4S +mul v8.4S, v8.4S,v25.4S +mla v8.4S, v9.4S, v31.s[0] +sub v9.4s, v16.4s, v8.4s +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v20.4S, v10.4S +mul v20.4S, v20.4S,v25.4S +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v12.4s, v20.4s +add v12.4s, v12.4s, v20.4s +sqrdmulh v20.4S, v12.4S, v29.4S +mul v12.4S, v12.4S,v15.4S +mla v12.4S, v20.4S, v31.s[0] +sub v20.4s, v16.4s, v12.4s +add v16.4s, v16.4s, v12.4s +sqrdmulh v12.4S, v8.4S, v0.4S +mul v8.4S, v8.4S,v5.4S +mla v8.4S, v12.4S, v31.s[0] +sub v12.4s, v9.4s, v8.4s +add v9.4s, v9.4s, v8.4s +str q16, [x0, #512] +str q20, [x0, #528] +str q9, [x0, #544] +str q12, [x0, #560] +ldr q12, [x17, #+1280] +ldr q9, [x17, #+1296] +ldr q20, [x17, #+1312] +ldr q16, [x17, #+1328] +ldr q8, [x17, #+1344] +ldr q17, [x17, #+1360] +ldr q3, [x17, #+1376] +ldr q14, [x17, #+1392] +ldr q0, [x0, #608] +ldr q5, [x0, #624] +ldr q29, [x0, #576] +ldr q15, [x0, #592] +sqrdmulh v10.4S, v0.4S, v9.s[0] +mul v0.4S, v0.4S,v12.s[0] +mla v0.4S, v10.4S, v31.s[0] +sub v10.4s, v29.4s, v0.4s +add v29.4s, v29.4s, v0.4s 
+sqrdmulh v0.4S, v5.4S, v9.s[0] +mul v5.4S, v5.4S,v12.s[0] +mla v5.4S, v0.4S, v31.s[0] +sub v0.4s, v15.4s, v5.4s +add v15.4s, v15.4s, v5.4s +sqrdmulh v5.4S, v15.4S, v9.s[1] +mul v15.4S, v15.4S,v12.s[1] +mla v15.4S, v5.4S, v31.s[0] +sub v5.4s, v29.4s, v15.4s +add v29.4s, v29.4s, v15.4s +sqrdmulh v15.4S, v0.4S, v9.s[2] +mul v0.4S, v0.4S,v12.s[2] +mla v0.4S, v15.4S, v31.s[0] +sub v15.4s, v10.4s, v0.4s +add v10.4s, v10.4s, v0.4s +trn1 v0.4S, v29.4S, v5.4S +trn2 v25.4S, v29.4S, v5.4S +trn1 v4.4S, v10.4S, v15.4S +trn2 v19.4S, v10.4S, v15.4S +trn2 v10.2D, v0.2D, v4.2D +trn2 v15.2D, v25.2D, v19.2D +trn1 v29.2D, v0.2D, v4.2D +trn1 v5.2D, v25.2D, v19.2D +sqrdmulh v19.4S, v10.4S, v16.4S +mul v10.4S, v10.4S,v20.4S +mla v10.4S, v19.4S, v31.s[0] +sub v19.4s, v29.4s, v10.4s +add v29.4s, v29.4s, v10.4s +sqrdmulh v10.4S, v15.4S, v16.4S +mul v15.4S, v15.4S,v20.4S +mla v15.4S, v10.4S, v31.s[0] +sub v10.4s, v5.4s, v15.4s +add v5.4s, v5.4s, v15.4s +sqrdmulh v15.4S, v5.4S, v17.4S +mul v5.4S, v5.4S,v8.4S +mla v5.4S, v15.4S, v31.s[0] +sub v15.4s, v29.4s, v5.4s +add v29.4s, v29.4s, v5.4s +sqrdmulh v5.4S, v10.4S, v14.4S +mul v10.4S, v10.4S,v3.4S +mla v10.4S, v5.4S, v31.s[0] +sub v5.4s, v19.4s, v10.4s +add v19.4s, v19.4s, v10.4s +str q29, [x0, #576] +str q15, [x0, #592] +str q19, [x0, #608] +str q5, [x0, #624] +ldr q5, [x17, #+1408] +ldr q19, [x17, #+1424] +ldr q15, [x17, #+1440] +ldr q29, [x17, #+1456] +ldr q10, [x17, #+1472] +ldr q25, [x17, #+1488] +ldr q4, [x17, #+1504] +ldr q0, [x17, #+1520] +ldr q14, [x0, #672] +ldr q3, [x0, #688] +ldr q17, [x0, #640] +ldr q8, [x0, #656] +sqrdmulh v16.4S, v14.4S, v19.s[0] +mul v14.4S, v14.4S,v5.s[0] +mla v14.4S, v16.4S, v31.s[0] +sub v16.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v19.s[0] +mul v3.4S, v3.4S,v5.s[0] +mla v3.4S, v14.4S, v31.s[0] +sub v14.4s, v8.4s, v3.4s +add v8.4s, v8.4s, v3.4s +sqrdmulh v3.4S, v8.4S, v19.s[1] +mul v8.4S, v8.4S,v5.s[1] +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v17.4s, v8.4s +add v17.4s, v17.4s, 
v8.4s +sqrdmulh v8.4S, v14.4S, v19.s[2] +mul v14.4S, v14.4S,v5.s[2] +mla v14.4S, v8.4S, v31.s[0] +sub v8.4s, v16.4s, v14.4s +add v16.4s, v16.4s, v14.4s +trn1 v14.4S, v17.4S, v3.4S +trn2 v20.4S, v17.4S, v3.4S +trn1 v9.4S, v16.4S, v8.4S +trn2 v12.4S, v16.4S, v8.4S +trn2 v16.2D, v14.2D, v9.2D +trn2 v8.2D, v20.2D, v12.2D +trn1 v17.2D, v14.2D, v9.2D +trn1 v3.2D, v20.2D, v12.2D +sqrdmulh v12.4S, v16.4S, v29.4S +mul v16.4S, v16.4S,v15.4S +mla v16.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v16.4s +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v29.4S +mul v8.4S, v8.4S,v15.4S +mla v8.4S, v16.4S, v31.s[0] +sub v16.4s, v3.4s, v8.4s +add v3.4s, v3.4s, v8.4s +sqrdmulh v8.4S, v3.4S, v25.4S +mul v3.4S, v3.4S,v10.4S +mla v3.4S, v8.4S, v31.s[0] +sub v8.4s, v17.4s, v3.4s +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v16.4S, v0.4S +mul v16.4S, v16.4S,v4.4S +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v12.4s, v16.4s +add v12.4s, v12.4s, v16.4s +str q17, [x0, #640] +str q8, [x0, #656] +str q12, [x0, #672] +str q3, [x0, #688] +ldr q3, [x17, #+1536] +ldr q12, [x17, #+1552] +ldr q8, [x17, #+1568] +ldr q17, [x17, #+1584] +ldr q16, [x17, #+1600] +ldr q20, [x17, #+1616] +ldr q9, [x17, #+1632] +ldr q14, [x17, #+1648] +ldr q0, [x0, #736] +ldr q4, [x0, #752] +ldr q25, [x0, #704] +ldr q10, [x0, #720] +sqrdmulh v29.4S, v0.4S, v12.s[0] +mul v0.4S, v0.4S,v3.s[0] +mla v0.4S, v29.4S, v31.s[0] +sub v29.4s, v25.4s, v0.4s +add v25.4s, v25.4s, v0.4s +sqrdmulh v0.4S, v4.4S, v12.s[0] +mul v4.4S, v4.4S,v3.s[0] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v10.4s, v4.4s +add v10.4s, v10.4s, v4.4s +sqrdmulh v4.4S, v10.4S, v12.s[1] +mul v10.4S, v10.4S,v3.s[1] +mla v10.4S, v4.4S, v31.s[0] +sub v4.4s, v25.4s, v10.4s +add v25.4s, v25.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v12.s[2] +mul v0.4S, v0.4S,v3.s[2] +mla v0.4S, v10.4S, v31.s[0] +sub v10.4s, v29.4s, v0.4s +add v29.4s, v29.4s, v0.4s +trn1 v0.4S, v25.4S, v4.4S +trn2 v15.4S, v25.4S, v4.4S +trn1 v19.4S, v29.4S, v10.4S +trn2 v5.4S, v29.4S, v10.4S +trn2 v29.2D, v0.2D, 
v19.2D +trn2 v10.2D, v15.2D, v5.2D +trn1 v25.2D, v0.2D, v19.2D +trn1 v4.2D, v15.2D, v5.2D +sqrdmulh v5.4S, v29.4S, v17.4S +mul v29.4S, v29.4S,v8.4S +mla v29.4S, v5.4S, v31.s[0] +sub v5.4s, v25.4s, v29.4s +add v25.4s, v25.4s, v29.4s +sqrdmulh v29.4S, v10.4S, v17.4S +mul v10.4S, v10.4S,v8.4S +mla v10.4S, v29.4S, v31.s[0] +sub v29.4s, v4.4s, v10.4s +add v4.4s, v4.4s, v10.4s +sqrdmulh v10.4S, v4.4S, v20.4S +mul v4.4S, v4.4S,v16.4S +mla v4.4S, v10.4S, v31.s[0] +sub v10.4s, v25.4s, v4.4s +add v25.4s, v25.4s, v4.4s +sqrdmulh v4.4S, v29.4S, v14.4S +mul v29.4S, v29.4S,v9.4S +mla v29.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v29.4s +add v5.4s, v5.4s, v29.4s +str q25, [x0, #704] +str q10, [x0, #720] +str q5, [x0, #736] +str q4, [x0, #752] +ldr q4, [x17, #+1664] +ldr q5, [x17, #+1680] +ldr q10, [x17, #+1696] +ldr q25, [x17, #+1712] +ldr q29, [x17, #+1728] +ldr q15, [x17, #+1744] +ldr q19, [x17, #+1760] +ldr q0, [x17, #+1776] +ldr q14, [x0, #800] +ldr q9, [x0, #816] +ldr q20, [x0, #768] +ldr q16, [x0, #784] +sqrdmulh v17.4S, v14.4S, v5.s[0] +mul v14.4S, v14.4S,v4.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v14.4s +add v20.4s, v20.4s, v14.4s +sqrdmulh v14.4S, v9.4S, v5.s[0] +mul v9.4S, v9.4S,v4.s[0] +mla v9.4S, v14.4S, v31.s[0] +sub v14.4s, v16.4s, v9.4s +add v16.4s, v16.4s, v9.4s +sqrdmulh v9.4S, v16.4S, v5.s[1] +mul v16.4S, v16.4S,v4.s[1] +mla v16.4S, v9.4S, v31.s[0] +sub v9.4s, v20.4s, v16.4s +add v20.4s, v20.4s, v16.4s +sqrdmulh v16.4S, v14.4S, v5.s[2] +mul v14.4S, v14.4S,v4.s[2] +mla v14.4S, v16.4S, v31.s[0] +sub v16.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +trn1 v14.4S, v20.4S, v9.4S +trn2 v8.4S, v20.4S, v9.4S +trn1 v12.4S, v17.4S, v16.4S +trn2 v3.4S, v17.4S, v16.4S +trn2 v17.2D, v14.2D, v12.2D +trn2 v16.2D, v8.2D, v3.2D +trn1 v20.2D, v14.2D, v12.2D +trn1 v9.2D, v8.2D, v3.2D +sqrdmulh v3.4S, v17.4S, v25.4S +mul v17.4S, v17.4S,v10.4S +mla v17.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v17.4s +add v20.4s, v20.4s, v17.4s +sqrdmulh v17.4S, v16.4S, v25.4S +mul 
v16.4S, v16.4S,v10.4S +mla v16.4S, v17.4S, v31.s[0] +sub v17.4s, v9.4s, v16.4s +add v9.4s, v9.4s, v16.4s +sqrdmulh v16.4S, v9.4S, v15.4S +mul v9.4S, v9.4S,v29.4S +mla v9.4S, v16.4S, v31.s[0] +sub v16.4s, v20.4s, v9.4s +add v20.4s, v20.4s, v9.4s +sqrdmulh v9.4S, v17.4S, v0.4S +mul v17.4S, v17.4S,v19.4S +mla v17.4S, v9.4S, v31.s[0] +sub v9.4s, v3.4s, v17.4s +add v3.4s, v3.4s, v17.4s +str q20, [x0, #768] +str q16, [x0, #784] +str q3, [x0, #800] +str q9, [x0, #816] +ldr q9, [x17, #+1792] +ldr q3, [x17, #+1808] +ldr q16, [x17, #+1824] +ldr q20, [x17, #+1840] +ldr q17, [x17, #+1856] +ldr q8, [x17, #+1872] +ldr q12, [x17, #+1888] +ldr q14, [x17, #+1904] +ldr q0, [x0, #864] +ldr q19, [x0, #880] +ldr q15, [x0, #832] +ldr q29, [x0, #848] +sqrdmulh v25.4S, v0.4S, v3.s[0] +mul v0.4S, v0.4S,v9.s[0] +mla v0.4S, v25.4S, v31.s[0] +sub v25.4s, v15.4s, v0.4s +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v19.4S, v3.s[0] +mul v19.4S, v19.4S,v9.s[0] +mla v19.4S, v0.4S, v31.s[0] +sub v0.4s, v29.4s, v19.4s +add v29.4s, v29.4s, v19.4s +sqrdmulh v19.4S, v29.4S, v3.s[1] +mul v29.4S, v29.4S,v9.s[1] +mla v29.4S, v19.4S, v31.s[0] +sub v19.4s, v15.4s, v29.4s +add v15.4s, v15.4s, v29.4s +sqrdmulh v29.4S, v0.4S, v3.s[2] +mul v0.4S, v0.4S,v9.s[2] +mla v0.4S, v29.4S, v31.s[0] +sub v29.4s, v25.4s, v0.4s +add v25.4s, v25.4s, v0.4s +trn1 v0.4S, v15.4S, v19.4S +trn2 v10.4S, v15.4S, v19.4S +trn1 v5.4S, v25.4S, v29.4S +trn2 v4.4S, v25.4S, v29.4S +trn2 v25.2D, v0.2D, v5.2D +trn2 v29.2D, v10.2D, v4.2D +trn1 v15.2D, v0.2D, v5.2D +trn1 v19.2D, v10.2D, v4.2D +sqrdmulh v4.4S, v25.4S, v20.4S +mul v25.4S, v25.4S,v16.4S +mla v25.4S, v4.4S, v31.s[0] +sub v4.4s, v15.4s, v25.4s +add v15.4s, v15.4s, v25.4s +sqrdmulh v25.4S, v29.4S, v20.4S +mul v29.4S, v29.4S,v16.4S +mla v29.4S, v25.4S, v31.s[0] +sub v25.4s, v19.4s, v29.4s +add v19.4s, v19.4s, v29.4s +sqrdmulh v29.4S, v19.4S, v8.4S +mul v19.4S, v19.4S,v17.4S +mla v19.4S, v29.4S, v31.s[0] +sub v29.4s, v15.4s, v19.4s +add v15.4s, v15.4s, v19.4s +sqrdmulh v19.4S, v25.4S, 
v14.4S +mul v25.4S, v25.4S,v12.4S +mla v25.4S, v19.4S, v31.s[0] +sub v19.4s, v4.4s, v25.4s +add v4.4s, v4.4s, v25.4s +str q15, [x0, #832] +str q29, [x0, #848] +str q4, [x0, #864] +str q19, [x0, #880] +ldr q19, [x17, #+1920] +ldr q4, [x17, #+1936] +ldr q29, [x17, #+1952] +ldr q15, [x17, #+1968] +ldr q25, [x17, #+1984] +ldr q10, [x17, #+2000] +ldr q5, [x17, #+2016] +ldr q0, [x17, #+2032] +ldr q14, [x0, #928] +ldr q12, [x0, #944] +ldr q8, [x0, #896] +ldr q17, [x0, #912] +sqrdmulh v20.4S, v14.4S, v4.s[0] +mul v14.4S, v14.4S,v19.s[0] +mla v14.4S, v20.4S, v31.s[0] +sub v20.4s, v8.4s, v14.4s +add v8.4s, v8.4s, v14.4s +sqrdmulh v14.4S, v12.4S, v4.s[0] +mul v12.4S, v12.4S,v19.s[0] +mla v12.4S, v14.4S, v31.s[0] +sub v14.4s, v17.4s, v12.4s +add v17.4s, v17.4s, v12.4s +sqrdmulh v12.4S, v17.4S, v4.s[1] +mul v17.4S, v17.4S,v19.s[1] +mla v17.4S, v12.4S, v31.s[0] +sub v12.4s, v8.4s, v17.4s +add v8.4s, v8.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v4.s[2] +mul v14.4S, v14.4S,v19.s[2] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v14.4s +add v20.4s, v20.4s, v14.4s +trn1 v14.4S, v8.4S, v12.4S +trn2 v16.4S, v8.4S, v12.4S +trn1 v3.4S, v20.4S, v17.4S +trn2 v9.4S, v20.4S, v17.4S +trn2 v20.2D, v14.2D, v3.2D +trn2 v17.2D, v16.2D, v9.2D +trn1 v8.2D, v14.2D, v3.2D +trn1 v12.2D, v16.2D, v9.2D +sqrdmulh v9.4S, v20.4S, v15.4S +mul v20.4S, v20.4S,v29.4S +mla v20.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v15.4S +mul v17.4S, v17.4S,v29.4S +mla v17.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v17.4s +add v12.4s, v12.4s, v17.4s +sqrdmulh v17.4S, v12.4S, v10.4S +mul v12.4S, v12.4S,v25.4S +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v8.4s, v12.4s +add v8.4s, v8.4s, v12.4s +sqrdmulh v12.4S, v20.4S, v0.4S +mul v20.4S, v20.4S,v5.4S +mla v20.4S, v12.4S, v31.s[0] +sub v12.4s, v9.4s, v20.4s +add v9.4s, v9.4s, v20.4s +str q8, [x0, #896] +str q17, [x0, #912] +str q9, [x0, #928] +str q12, [x0, #944] +ldr q12, [x17, #+2048] +ldr q9, [x17, #+2064] +ldr q17, [x17, 
#+2080] +ldr q8, [x17, #+2096] +ldr q20, [x17, #+2112] +ldr q16, [x17, #+2128] +ldr q3, [x17, #+2144] +ldr q14, [x17, #+2160] +ldr q0, [x0, #992] +ldr q5, [x0, #1008] +ldr q10, [x0, #960] +ldr q25, [x0, #976] +sqrdmulh v15.4S, v0.4S, v9.s[0] +mul v0.4S, v0.4S,v12.s[0] +mla v0.4S, v15.4S, v31.s[0] +sub v15.4s, v10.4s, v0.4s +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v5.4S, v9.s[0] +mul v5.4S, v5.4S,v12.s[0] +mla v5.4S, v0.4S, v31.s[0] +sub v0.4s, v25.4s, v5.4s +add v25.4s, v25.4s, v5.4s +sqrdmulh v5.4S, v25.4S, v9.s[1] +mul v25.4S, v25.4S,v12.s[1] +mla v25.4S, v5.4S, v31.s[0] +sub v5.4s, v10.4s, v25.4s +add v10.4s, v10.4s, v25.4s +sqrdmulh v25.4S, v0.4S, v9.s[2] +mul v0.4S, v0.4S,v12.s[2] +mla v0.4S, v25.4S, v31.s[0] +sub v25.4s, v15.4s, v0.4s +add v15.4s, v15.4s, v0.4s +trn1 v0.4S, v10.4S, v5.4S +trn2 v29.4S, v10.4S, v5.4S +trn1 v4.4S, v15.4S, v25.4S +trn2 v19.4S, v15.4S, v25.4S +trn2 v15.2D, v0.2D, v4.2D +trn2 v25.2D, v29.2D, v19.2D +trn1 v10.2D, v0.2D, v4.2D +trn1 v5.2D, v29.2D, v19.2D +sqrdmulh v19.4S, v15.4S, v8.4S +mul v15.4S, v15.4S,v17.4S +mla v15.4S, v19.4S, v31.s[0] +sub v19.4s, v10.4s, v15.4s +add v10.4s, v10.4s, v15.4s +sqrdmulh v15.4S, v25.4S, v8.4S +mul v25.4S, v25.4S,v17.4S +mla v25.4S, v15.4S, v31.s[0] +sub v15.4s, v5.4s, v25.4s +add v5.4s, v5.4s, v25.4s +sqrdmulh v25.4S, v5.4S, v16.4S +mul v5.4S, v5.4S,v20.4S +mla v5.4S, v25.4S, v31.s[0] +sub v25.4s, v10.4s, v5.4s +add v10.4s, v10.4s, v5.4s +sqrdmulh v5.4S, v15.4S, v14.4S +mul v15.4S, v15.4S,v3.4S +mla v15.4S, v5.4S, v31.s[0] +sub v5.4s, v19.4s, v15.4s +add v19.4s, v19.4s, v15.4s +str q10, [x0, #960] +str q25, [x0, #976] +str q19, [x0, #992] +str q5, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add 
sp, sp, #(16*5+16) +ret + +// Line count: 2476 +// Instruction count: 2472 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_15_0.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_15_0.s new file mode 100644 index 0000000..eda1068 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_15_0.s @@ -0,0 +1,2506 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 
1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 
6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 
+.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, 
block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 
31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 
26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 
+.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // 
Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 
28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 
608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_15_0 +.global _ntt_u32_full_neon_asm_var_4_4_15_0 +ntt_u32_full_neon_asm_var_4_4_15_0: +_ntt_u32_full_neon_asm_var_4_4_15_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x0, #992] +ldr q29, [x17, #+0] +ldr q28, [x17, #+16] +sqrdmulh v27.4S, v30.4S, v28.s[0] +mul v30.4S, v30.4S,v29.s[0] +ldr q26, [x0, #928] +sqrdmulh v25.4S, v26.4S, v28.s[0] +mul v26.4S, v26.4S,v29.s[0] +ldr q24, [x0, #864] +sqrdmulh v23.4S, v24.4S, v28.s[0] +mul v24.4S, v24.4S,v29.s[0] +ldr q22, [x0, #800] +sqrdmulh v21.4S, v22.4S, v28.s[0] +mul v22.4S, v22.4S,v29.s[0] +ldr q20, [x0, #736] +mla v30.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v20.4S, v28.s[0] +ldr q19, [x0, #672] +mla v26.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v19.4S, v28.s[0] +nop +ldr q18, [x0, #608] +mla v24.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v18.4S, v28.s[0] +nop +ldr q17, [x0, #544] +mla v22.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v17.4S, v28.s[0] +nop +ldr q16, [x0, #480] +ldr q3, [x0, #416] +mul v20.4S, v20.4S,v29.s[0] +sub v2.4s, v16.4s, v30.4s +mul v19.4S, v19.4S,v29.s[0] +add v16.4s, v16.4s, v30.4s +ldr q30, [x0, #352] +ldr q1, [x0, #288] +mla v20.4S, v27.4S, 
v31.s[0] +sub v27.4s, v3.4s, v26.4s +mla v19.4S, v25.4S, v31.s[0] +add v3.4s, v3.4s, v26.4s +ldr q26, [x0, #224] +ldr q25, [x0, #160] +mul v18.4S, v18.4S,v29.s[0] +sub v0.4s, v30.4s, v24.4s +mul v17.4S, v17.4S,v29.s[0] +add v30.4s, v30.4s, v24.4s +ldr q24, [x0, #96] +ldr q15, [x0, #32] +mla v18.4S, v23.4S, v31.s[0] +sub v23.4s, v1.4s, v22.4s +mla v17.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v2.4S, v28.s[2] +nop +mul v2.4S, v2.4S,v29.s[2] +nop +sqrdmulh v21.4S, v27.4S, v28.s[2] +sub v14.4s, v26.4s, v20.4s +mul v27.4S, v27.4S,v29.s[2] +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v28.s[2] +sub v13.4s, v25.4s, v19.4s +mul v0.4S, v0.4S,v29.s[2] +add v25.4s, v25.4s, v19.4s +sqrdmulh v19.4S, v23.4S, v28.s[2] +sub v12.4s, v24.4s, v18.4s +mul v23.4S, v23.4S,v29.s[2] +add v24.4s, v24.4s, v18.4s +mla v2.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v17.4s +sqrdmulh v18.4S, v16.4S, v28.s[1] +add v15.4s, v15.4s, v17.4s +mla v27.4S, v21.4S, v31.s[0] +nop +sqrdmulh v21.4S, v3.4S, v28.s[1] +nop +mla v0.4S, v20.4S, v31.s[0] +nop +sqrdmulh v20.4S, v30.4S, v28.s[1] +nop +mla v23.4S, v19.4S, v31.s[0] +nop +sqrdmulh v19.4S, v1.4S, v28.s[1] +nop +ldr q17, [x17, #+32] +ldr q11, [x17, #+48] +mul v16.4S, v16.4S,v29.s[1] +sub v10.4s, v14.4s, v2.4s +mul v3.4S, v3.4S,v29.s[1] +add v14.4s, v14.4s, v2.4s +mla v16.4S, v18.4S, v31.s[0] +sub v18.4s, v13.4s, v27.4s +mla v3.4S, v21.4S, v31.s[0] +add v13.4s, v13.4s, v27.4s +mul v30.4S, v30.4S,v29.s[1] +sub v27.4s, v12.4s, v0.4s +mul v1.4S, v1.4S,v29.s[1] +add v12.4s, v12.4s, v0.4s +mla v30.4S, v20.4S, v31.s[0] +sub v20.4s, v22.4s, v23.4s +mla v1.4S, v19.4S, v31.s[0] +add v22.4s, v22.4s, v23.4s +sqrdmulh v23.4S, v10.4S, v11.s[3] +nop +mul v10.4S, v10.4S,v17.s[3] +nop +sqrdmulh v19.4S, v18.4S, v11.s[3] +sub v0.4s, v26.4s, v16.4s +mul v18.4S, v18.4S,v17.s[3] +add v26.4s, v26.4s, v16.4s +sqrdmulh v16.4S, v14.4S, v11.s[2] +sub v21.4s, v25.4s, v3.4s +mul v14.4S, v14.4S,v17.s[2] +add v25.4s, v25.4s, v3.4s +sqrdmulh v3.4S, v13.4S, 
v11.s[2] +sub v2.4s, v24.4s, v30.4s +mul v13.4S, v13.4S,v17.s[2] +add v24.4s, v24.4s, v30.4s +ldr q30, [x17, #+96] +ldr q9, [x17, #+112] +mla v10.4S, v23.4S, v31.s[0] +sub v23.4s, v15.4s, v1.4s +sqrdmulh v8.4S, v0.4S, v11.s[1] +add v15.4s, v15.4s, v1.4s +mla v18.4S, v19.4S, v31.s[0] +nop +sqrdmulh v19.4S, v21.4S, v11.s[1] +nop +mla v14.4S, v16.4S, v31.s[0] +nop +sqrdmulh v16.4S, v26.4S, v11.s[0] +nop +mla v13.4S, v3.4S, v31.s[0] +nop +sqrdmulh v3.4S, v25.4S, v11.s[0] +nop +ldr q1, [x17, #+64] +ldr q7, [x17, #+80] +mul v0.4S, v0.4S,v17.s[1] +sub v6.4s, v27.4s, v10.4s +mul v21.4S, v21.4S,v17.s[1] +add v27.4s, v27.4s, v10.4s +mla v0.4S, v8.4S, v31.s[0] +sub v8.4s, v20.4s, v18.4s +mla v21.4S, v19.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +mul v26.4S, v26.4S,v17.s[0] +sub v18.4s, v12.4s, v14.4s +mul v25.4S, v25.4S,v17.s[0] +add v12.4s, v12.4s, v14.4s +mla v26.4S, v16.4S, v31.s[0] +sub v16.4s, v22.4s, v13.4s +mla v25.4S, v3.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v6.4S, v9.s[3] +nop +mul v6.4S, v6.4S,v30.s[3] +nop +sqrdmulh v3.4S, v27.4S, v9.s[2] +sub v14.4s, v2.4s, v0.4s +mul v27.4S, v27.4S,v30.s[2] +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v18.4S, v9.s[1] +sub v19.4s, v23.4s, v21.4s +mul v18.4S, v18.4S,v30.s[1] +add v23.4s, v23.4s, v21.4s +sqrdmulh v21.4S, v12.4S, v9.s[0] +sub v10.4s, v24.4s, v26.4s +mul v12.4S, v12.4S,v30.s[0] +add v24.4s, v24.4s, v26.4s +mla v6.4S, v13.4S, v31.s[0] +sub v13.4s, v15.4s, v25.4s +sqrdmulh v26.4S, v14.4S, v7.s[3] +add v15.4s, v15.4s, v25.4s +mla v27.4S, v3.4S, v31.s[0] +sub v3.4s, v8.4s, v6.4s +sqrdmulh v25.4S, v2.4S, v7.s[2] +add v8.4s, v8.4s, v6.4s +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v20.4s, v27.4s +sqrdmulh v6.4S, v10.4S, v7.s[1] +add v20.4s, v20.4s, v27.4s +mla v12.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v18.4s +sqrdmulh v27.4S, v24.4S, v7.s[0] +add v16.4s, v16.4s, v18.4s +mul v14.4S, v14.4S,v1.s[3] +sub v18.4s, v22.4s, v12.4s +mul v2.4S, v2.4S,v1.s[2] +add v22.4s, v22.4s, v12.4s +mla v14.4S, v26.4S, v31.s[0] 
+str q3, [x0, #992] +mla v2.4S, v25.4S, v31.s[0] +str q8, [x0, #928] +mul v10.4S, v10.4S,v1.s[1] +str q0, [x0, #864] +mul v24.4S, v24.4S,v1.s[0] +str q20, [x0, #800] +mla v10.4S, v6.4S, v31.s[0] +str q21, [x0, #736] +mla v24.4S, v27.4S, v31.s[0] +str q16, [x0, #672] +ldr q16, [x0, #1008] +sqrdmulh v27.4S, v16.4S, v28.s[0] +str q18, [x0, #608] +mul v16.4S, v16.4S,v29.s[0] +str q22, [x0, #544] +ldr q22, [x0, #944] +sqrdmulh v18.4S, v22.4S, v28.s[0] +sub v21.4s, v19.4s, v14.4s +str q21, [x0, #480] +mul v22.4S, v22.4S,v29.s[0] +add v19.4s, v19.4s, v14.4s +ldr q14, [x0, #880] +sqrdmulh v21.4S, v14.4S, v28.s[0] +sub v6.4s, v23.4s, v2.4s +str q19, [x0, #416] +mul v14.4S, v14.4S,v29.s[0] +add v23.4s, v23.4s, v2.4s +ldr q2, [x0, #816] +sqrdmulh v19.4S, v2.4S, v28.s[0] +sub v20.4s, v13.4s, v10.4s +str q6, [x0, #352] +mul v2.4S, v2.4S,v29.s[0] +add v13.4s, v13.4s, v10.4s +ldr q10, [x0, #752] +mla v16.4S, v27.4S, v31.s[0] +sub v27.4s, v15.4s, v24.4s +str q23, [x0, #288] +sqrdmulh v23.4S, v10.4S, v28.s[0] +add v15.4s, v15.4s, v24.4s +ldr q24, [x0, #688] +mla v22.4S, v18.4S, v31.s[0] +str q20, [x0, #224] +sqrdmulh v20.4S, v24.4S, v28.s[0] +nop +ldr q18, [x0, #624] +mla v14.4S, v21.4S, v31.s[0] +str q13, [x0, #160] +sqrdmulh v13.4S, v18.4S, v28.s[0] +nop +ldr q21, [x0, #560] +mla v2.4S, v19.4S, v31.s[0] +str q27, [x0, #96] +sqrdmulh v27.4S, v21.4S, v28.s[0] +nop +ldr q19, [x0, #496] +ldr q6, [x0, #432] +mul v10.4S, v10.4S,v29.s[0] +sub v0.4s, v19.4s, v16.4s +str q15, [x0, #32] +mul v24.4S, v24.4S,v29.s[0] +add v19.4s, v19.4s, v16.4s +ldr q16, [x0, #368] +ldr q15, [x0, #304] +mla v10.4S, v23.4S, v31.s[0] +sub v23.4s, v6.4s, v22.4s +mla v24.4S, v20.4S, v31.s[0] +add v6.4s, v6.4s, v22.4s +ldr q22, [x0, #240] +ldr q20, [x0, #176] +mul v18.4S, v18.4S,v29.s[0] +sub v8.4s, v16.4s, v14.4s +mul v21.4S, v21.4S,v29.s[0] +add v16.4s, v16.4s, v14.4s +ldr q14, [x0, #112] +ldr q25, [x0, #48] +mla v18.4S, v13.4S, v31.s[0] +sub v13.4s, v15.4s, v2.4s +mla v21.4S, v27.4S, v31.s[0] +add v15.4s, 
v15.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v28.s[2] +nop +mul v0.4S, v0.4S,v29.s[2] +nop +sqrdmulh v27.4S, v23.4S, v28.s[2] +sub v3.4s, v22.4s, v10.4s +mul v23.4S, v23.4S,v29.s[2] +add v22.4s, v22.4s, v10.4s +sqrdmulh v10.4S, v8.4S, v28.s[2] +sub v26.4s, v20.4s, v24.4s +mul v8.4S, v8.4S,v29.s[2] +add v20.4s, v20.4s, v24.4s +sqrdmulh v24.4S, v13.4S, v28.s[2] +sub v12.4s, v14.4s, v18.4s +mul v13.4S, v13.4S,v29.s[2] +add v14.4s, v14.4s, v18.4s +mla v0.4S, v2.4S, v31.s[0] +sub v2.4s, v25.4s, v21.4s +sqrdmulh v18.4S, v19.4S, v28.s[1] +add v25.4s, v25.4s, v21.4s +mla v23.4S, v27.4S, v31.s[0] +nop +sqrdmulh v27.4S, v6.4S, v28.s[1] +nop +mla v8.4S, v10.4S, v31.s[0] +nop +sqrdmulh v10.4S, v16.4S, v28.s[1] +nop +mla v13.4S, v24.4S, v31.s[0] +nop +sqrdmulh v24.4S, v15.4S, v28.s[1] +nop +mul v19.4S, v19.4S,v29.s[1] +sub v21.4s, v3.4s, v0.4s +mul v6.4S, v6.4S,v29.s[1] +add v3.4s, v3.4s, v0.4s +mla v19.4S, v18.4S, v31.s[0] +sub v18.4s, v26.4s, v23.4s +mla v6.4S, v27.4S, v31.s[0] +add v26.4s, v26.4s, v23.4s +mul v16.4S, v16.4S,v29.s[1] +sub v23.4s, v12.4s, v8.4s +mul v15.4S, v15.4S,v29.s[1] +add v12.4s, v12.4s, v8.4s +mla v16.4S, v10.4S, v31.s[0] +sub v10.4s, v2.4s, v13.4s +mla v15.4S, v24.4S, v31.s[0] +add v2.4s, v2.4s, v13.4s +sqrdmulh v13.4S, v21.4S, v11.s[3] +nop +mul v21.4S, v21.4S,v17.s[3] +nop +sqrdmulh v24.4S, v18.4S, v11.s[3] +sub v8.4s, v22.4s, v19.4s +mul v18.4S, v18.4S,v17.s[3] +add v22.4s, v22.4s, v19.4s +sqrdmulh v19.4S, v3.4S, v11.s[2] +sub v27.4s, v20.4s, v6.4s +mul v3.4S, v3.4S,v17.s[2] +add v20.4s, v20.4s, v6.4s +sqrdmulh v6.4S, v26.4S, v11.s[2] +sub v0.4s, v14.4s, v16.4s +mul v26.4S, v26.4S,v17.s[2] +add v14.4s, v14.4s, v16.4s +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v25.4s, v15.4s +sqrdmulh v16.4S, v8.4S, v11.s[1] +add v25.4s, v25.4s, v15.4s +mla v18.4S, v24.4S, v31.s[0] +nop +sqrdmulh v24.4S, v27.4S, v11.s[1] +nop +mla v3.4S, v19.4S, v31.s[0] +nop +sqrdmulh v19.4S, v22.4S, v11.s[0] +nop +mla v26.4S, v6.4S, v31.s[0] +nop +sqrdmulh v6.4S, v20.4S, v11.s[0] +nop 
+mul v8.4S, v8.4S,v17.s[1] +sub v15.4s, v23.4s, v21.4s +mul v27.4S, v27.4S,v17.s[1] +add v23.4s, v23.4s, v21.4s +mla v8.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v18.4s +mla v27.4S, v24.4S, v31.s[0] +add v10.4s, v10.4s, v18.4s +mul v22.4S, v22.4S,v17.s[0] +sub v18.4s, v12.4s, v3.4s +mul v20.4S, v20.4S,v17.s[0] +add v12.4s, v12.4s, v3.4s +mla v22.4S, v19.4S, v31.s[0] +sub v19.4s, v2.4s, v26.4s +mla v20.4S, v6.4S, v31.s[0] +add v2.4s, v2.4s, v26.4s +sqrdmulh v26.4S, v15.4S, v9.s[3] +nop +mul v15.4S, v15.4S,v30.s[3] +nop +sqrdmulh v6.4S, v23.4S, v9.s[2] +sub v3.4s, v0.4s, v8.4s +mul v23.4S, v23.4S,v30.s[2] +add v0.4s, v0.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v9.s[1] +sub v24.4s, v13.4s, v27.4s +mul v18.4S, v18.4S,v30.s[1] +add v13.4s, v13.4s, v27.4s +sqrdmulh v27.4S, v12.4S, v9.s[0] +sub v21.4s, v14.4s, v22.4s +mul v12.4S, v12.4S,v30.s[0] +add v14.4s, v14.4s, v22.4s +mla v15.4S, v26.4S, v31.s[0] +sub v26.4s, v25.4s, v20.4s +sqrdmulh v22.4S, v3.4S, v7.s[3] +add v25.4s, v25.4s, v20.4s +mla v23.4S, v6.4S, v31.s[0] +sub v6.4s, v16.4s, v15.4s +sqrdmulh v20.4S, v0.4S, v7.s[2] +add v16.4s, v16.4s, v15.4s +mla v18.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v23.4s +sqrdmulh v15.4S, v21.4S, v7.s[1] +add v10.4s, v10.4s, v23.4s +mla v12.4S, v27.4S, v31.s[0] +sub v27.4s, v19.4s, v18.4s +sqrdmulh v23.4S, v14.4S, v7.s[0] +add v19.4s, v19.4s, v18.4s +mul v3.4S, v3.4S,v1.s[3] +sub v18.4s, v2.4s, v12.4s +mul v0.4S, v0.4S,v1.s[2] +add v2.4s, v2.4s, v12.4s +mla v3.4S, v22.4S, v31.s[0] +str q6, [x0, #1008] +mla v0.4S, v20.4S, v31.s[0] +str q16, [x0, #944] +mul v21.4S, v21.4S,v1.s[1] +str q8, [x0, #880] +mul v14.4S, v14.4S,v1.s[0] +str q10, [x0, #816] +mla v21.4S, v15.4S, v31.s[0] +str q27, [x0, #752] +mla v14.4S, v23.4S, v31.s[0] +str q19, [x0, #688] +ldr q19, [x0, #960] +sqrdmulh v23.4S, v19.4S, v28.s[0] +str q18, [x0, #624] +mul v19.4S, v19.4S,v29.s[0] +str q2, [x0, #560] +ldr q2, [x0, #896] +sqrdmulh v18.4S, v2.4S, v28.s[0] +sub v27.4s, v24.4s, v3.4s +str q27, [x0, #496] +mul v2.4S, 
v2.4S,v29.s[0] +add v24.4s, v24.4s, v3.4s +ldr q3, [x0, #832] +sqrdmulh v27.4S, v3.4S, v28.s[0] +sub v15.4s, v13.4s, v0.4s +str q24, [x0, #432] +mul v3.4S, v3.4S,v29.s[0] +add v13.4s, v13.4s, v0.4s +ldr q0, [x0, #768] +sqrdmulh v24.4S, v0.4S, v28.s[0] +sub v10.4s, v26.4s, v21.4s +str q15, [x0, #368] +mul v0.4S, v0.4S,v29.s[0] +add v26.4s, v26.4s, v21.4s +ldr q21, [x0, #704] +mla v19.4S, v23.4S, v31.s[0] +sub v23.4s, v25.4s, v14.4s +str q13, [x0, #304] +sqrdmulh v13.4S, v21.4S, v28.s[0] +add v25.4s, v25.4s, v14.4s +ldr q14, [x0, #640] +mla v2.4S, v18.4S, v31.s[0] +str q10, [x0, #240] +sqrdmulh v10.4S, v14.4S, v28.s[0] +nop +ldr q18, [x0, #576] +mla v3.4S, v27.4S, v31.s[0] +str q26, [x0, #176] +sqrdmulh v26.4S, v18.4S, v28.s[0] +nop +ldr q27, [x0, #512] +mla v0.4S, v24.4S, v31.s[0] +str q23, [x0, #112] +sqrdmulh v23.4S, v27.4S, v28.s[0] +nop +ldr q24, [x0, #448] +ldr q15, [x0, #384] +mul v21.4S, v21.4S,v29.s[0] +sub v8.4s, v24.4s, v19.4s +str q25, [x0, #48] +mul v14.4S, v14.4S,v29.s[0] +add v24.4s, v24.4s, v19.4s +ldr q19, [x0, #320] +ldr q25, [x0, #256] +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v15.4s, v2.4s +mla v14.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v2.4s +ldr q2, [x0, #192] +ldr q10, [x0, #128] +mul v18.4S, v18.4S,v29.s[0] +sub v16.4s, v19.4s, v3.4s +mul v27.4S, v27.4S,v29.s[0] +add v19.4s, v19.4s, v3.4s +ldr q3, [x0, #64] +ldr q20, [x0, #0] +mla v18.4S, v26.4S, v31.s[0] +sub v26.4s, v25.4s, v0.4s +mla v27.4S, v23.4S, v31.s[0] +add v25.4s, v25.4s, v0.4s +sqrdmulh v0.4S, v8.4S, v28.s[2] +nop +mul v8.4S, v8.4S,v29.s[2] +nop +sqrdmulh v23.4S, v13.4S, v28.s[2] +sub v6.4s, v2.4s, v21.4s +mul v13.4S, v13.4S,v29.s[2] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v16.4S, v28.s[2] +sub v22.4s, v10.4s, v14.4s +mul v16.4S, v16.4S,v29.s[2] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v26.4S, v28.s[2] +sub v12.4s, v3.4s, v18.4s +mul v26.4S, v26.4S,v29.s[2] +add v3.4s, v3.4s, v18.4s +mla v8.4S, v0.4S, v31.s[0] +sub v0.4s, v20.4s, v27.4s +sqrdmulh v18.4S, v24.4S, 
v28.s[1] +add v20.4s, v20.4s, v27.4s +mla v13.4S, v23.4S, v31.s[0] +nop +sqrdmulh v23.4S, v15.4S, v28.s[1] +nop +mla v16.4S, v21.4S, v31.s[0] +nop +sqrdmulh v21.4S, v19.4S, v28.s[1] +nop +mla v26.4S, v14.4S, v31.s[0] +nop +sqrdmulh v14.4S, v25.4S, v28.s[1] +nop +mul v24.4S, v24.4S,v29.s[1] +sub v27.4s, v6.4s, v8.4s +mul v15.4S, v15.4S,v29.s[1] +add v6.4s, v6.4s, v8.4s +mla v24.4S, v18.4S, v31.s[0] +sub v18.4s, v22.4s, v13.4s +mla v15.4S, v23.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +mul v19.4S, v19.4S,v29.s[1] +sub v13.4s, v12.4s, v16.4s +mul v25.4S, v25.4S,v29.s[1] +add v12.4s, v12.4s, v16.4s +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v26.4s +mla v25.4S, v14.4S, v31.s[0] +add v0.4s, v0.4s, v26.4s +sqrdmulh v26.4S, v27.4S, v11.s[3] +nop +mul v27.4S, v27.4S,v17.s[3] +nop +sqrdmulh v14.4S, v18.4S, v11.s[3] +sub v16.4s, v2.4s, v24.4s +mul v18.4S, v18.4S,v17.s[3] +add v2.4s, v2.4s, v24.4s +sqrdmulh v24.4S, v6.4S, v11.s[2] +sub v23.4s, v10.4s, v15.4s +mul v6.4S, v6.4S,v17.s[2] +add v10.4s, v10.4s, v15.4s +sqrdmulh v15.4S, v22.4S, v11.s[2] +sub v8.4s, v3.4s, v19.4s +mul v22.4S, v22.4S,v17.s[2] +add v3.4s, v3.4s, v19.4s +mla v27.4S, v26.4S, v31.s[0] +sub v26.4s, v20.4s, v25.4s +sqrdmulh v19.4S, v16.4S, v11.s[1] +add v20.4s, v20.4s, v25.4s +mla v18.4S, v14.4S, v31.s[0] +nop +sqrdmulh v14.4S, v23.4S, v11.s[1] +nop +mla v6.4S, v24.4S, v31.s[0] +nop +sqrdmulh v24.4S, v2.4S, v11.s[0] +nop +mla v22.4S, v15.4S, v31.s[0] +nop +sqrdmulh v15.4S, v10.4S, v11.s[0] +nop +mul v16.4S, v16.4S,v17.s[1] +sub v25.4s, v13.4s, v27.4s +mul v23.4S, v23.4S,v17.s[1] +add v13.4s, v13.4s, v27.4s +mla v16.4S, v19.4S, v31.s[0] +sub v19.4s, v21.4s, v18.4s +mla v23.4S, v14.4S, v31.s[0] +add v21.4s, v21.4s, v18.4s +mul v2.4S, v2.4S,v17.s[0] +sub v18.4s, v12.4s, v6.4s +mul v10.4S, v10.4S,v17.s[0] +add v12.4s, v12.4s, v6.4s +mla v2.4S, v24.4S, v31.s[0] +sub v24.4s, v0.4s, v22.4s +mla v10.4S, v15.4S, v31.s[0] +add v0.4s, v0.4s, v22.4s +sqrdmulh v22.4S, v25.4S, v9.s[3] +nop +mul v25.4S, 
v25.4S,v30.s[3] +nop +sqrdmulh v15.4S, v13.4S, v9.s[2] +sub v6.4s, v8.4s, v16.4s +mul v13.4S, v13.4S,v30.s[2] +add v8.4s, v8.4s, v16.4s +sqrdmulh v16.4S, v18.4S, v9.s[1] +sub v14.4s, v26.4s, v23.4s +mul v18.4S, v18.4S,v30.s[1] +add v26.4s, v26.4s, v23.4s +sqrdmulh v23.4S, v12.4S, v9.s[0] +sub v27.4s, v3.4s, v2.4s +mul v12.4S, v12.4S,v30.s[0] +add v3.4s, v3.4s, v2.4s +mla v25.4S, v22.4S, v31.s[0] +sub v22.4s, v20.4s, v10.4s +sqrdmulh v2.4S, v6.4S, v7.s[3] +add v20.4s, v20.4s, v10.4s +mla v13.4S, v15.4S, v31.s[0] +sub v15.4s, v19.4s, v25.4s +sqrdmulh v10.4S, v8.4S, v7.s[2] +add v19.4s, v19.4s, v25.4s +mla v18.4S, v16.4S, v31.s[0] +sub v16.4s, v21.4s, v13.4s +sqrdmulh v25.4S, v27.4S, v7.s[1] +add v21.4s, v21.4s, v13.4s +mla v12.4S, v23.4S, v31.s[0] +sub v23.4s, v24.4s, v18.4s +sqrdmulh v13.4S, v3.4S, v7.s[0] +add v24.4s, v24.4s, v18.4s +mul v6.4S, v6.4S,v1.s[3] +sub v18.4s, v0.4s, v12.4s +mul v8.4S, v8.4S,v1.s[2] +add v0.4s, v0.4s, v12.4s +mla v6.4S, v2.4S, v31.s[0] +str q15, [x0, #960] +mla v8.4S, v10.4S, v31.s[0] +str q19, [x0, #896] +mul v27.4S, v27.4S,v1.s[1] +str q16, [x0, #832] +mul v3.4S, v3.4S,v1.s[0] +str q21, [x0, #768] +mla v27.4S, v25.4S, v31.s[0] +str q23, [x0, #704] +mla v3.4S, v13.4S, v31.s[0] +str q24, [x0, #640] +ldr q24, [x0, #976] +sqrdmulh v13.4S, v24.4S, v28.s[0] +str q18, [x0, #576] +mul v24.4S, v24.4S,v29.s[0] +str q0, [x0, #512] +ldr q0, [x0, #912] +sqrdmulh v18.4S, v0.4S, v28.s[0] +sub v23.4s, v14.4s, v6.4s +str q23, [x0, #448] +mul v0.4S, v0.4S,v29.s[0] +add v14.4s, v14.4s, v6.4s +ldr q6, [x0, #848] +sqrdmulh v23.4S, v6.4S, v28.s[0] +sub v25.4s, v26.4s, v8.4s +str q14, [x0, #384] +mul v6.4S, v6.4S,v29.s[0] +add v26.4s, v26.4s, v8.4s +ldr q8, [x0, #784] +sqrdmulh v14.4S, v8.4S, v28.s[0] +sub v21.4s, v22.4s, v27.4s +str q25, [x0, #320] +mul v8.4S, v8.4S,v29.s[0] +add v22.4s, v22.4s, v27.4s +ldr q27, [x0, #720] +mla v24.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v3.4s +str q26, [x0, #256] +sqrdmulh v26.4S, v27.4S, v28.s[0] +add v20.4s, v20.4s, 
v3.4s +ldr q3, [x0, #656] +mla v0.4S, v18.4S, v31.s[0] +str q21, [x0, #192] +sqrdmulh v21.4S, v3.4S, v28.s[0] +nop +ldr q18, [x0, #592] +mla v6.4S, v23.4S, v31.s[0] +str q22, [x0, #128] +sqrdmulh v22.4S, v18.4S, v28.s[0] +nop +ldr q23, [x0, #528] +mla v8.4S, v14.4S, v31.s[0] +str q13, [x0, #64] +sqrdmulh v13.4S, v23.4S, v28.s[0] +nop +ldr q14, [x0, #464] +ldr q25, [x0, #400] +mul v27.4S, v27.4S,v29.s[0] +sub v16.4s, v14.4s, v24.4s +str q20, [x0, #0] +mul v3.4S, v3.4S,v29.s[0] +add v14.4s, v14.4s, v24.4s +ldr q24, [x0, #336] +ldr q20, [x0, #272] +mla v27.4S, v26.4S, v31.s[0] +sub v26.4s, v25.4s, v0.4s +mla v3.4S, v21.4S, v31.s[0] +add v25.4s, v25.4s, v0.4s +ldr q0, [x0, #208] +ldr q21, [x0, #144] +mul v18.4S, v18.4S,v29.s[0] +sub v19.4s, v24.4s, v6.4s +mul v23.4S, v23.4S,v29.s[0] +add v24.4s, v24.4s, v6.4s +ldr q6, [x0, #80] +ldr q10, [x0, #16] +mla v18.4S, v22.4S, v31.s[0] +sub v22.4s, v20.4s, v8.4s +mla v23.4S, v13.4S, v31.s[0] +add v20.4s, v20.4s, v8.4s +sqrdmulh v8.4S, v16.4S, v28.s[2] +nop +mul v16.4S, v16.4S,v29.s[2] +nop +sqrdmulh v13.4S, v26.4S, v28.s[2] +sub v15.4s, v0.4s, v27.4s +mul v26.4S, v26.4S,v29.s[2] +add v0.4s, v0.4s, v27.4s +sqrdmulh v27.4S, v19.4S, v28.s[2] +sub v2.4s, v21.4s, v3.4s +mul v19.4S, v19.4S,v29.s[2] +add v21.4s, v21.4s, v3.4s +sqrdmulh v3.4S, v22.4S, v28.s[2] +sub v12.4s, v6.4s, v18.4s +mul v22.4S, v22.4S,v29.s[2] +add v6.4s, v6.4s, v18.4s +mla v16.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v23.4s +sqrdmulh v18.4S, v14.4S, v28.s[1] +add v10.4s, v10.4s, v23.4s +mla v26.4S, v13.4S, v31.s[0] +nop +sqrdmulh v13.4S, v25.4S, v28.s[1] +nop +mla v19.4S, v27.4S, v31.s[0] +nop +sqrdmulh v27.4S, v24.4S, v28.s[1] +nop +mla v22.4S, v3.4S, v31.s[0] +nop +sqrdmulh v3.4S, v20.4S, v28.s[1] +nop +mul v14.4S, v14.4S,v29.s[1] +sub v23.4s, v15.4s, v16.4s +mul v25.4S, v25.4S,v29.s[1] +add v15.4s, v15.4s, v16.4s +mla v14.4S, v18.4S, v31.s[0] +sub v18.4s, v2.4s, v26.4s +mla v25.4S, v13.4S, v31.s[0] +add v2.4s, v2.4s, v26.4s +mul v24.4S, v24.4S,v29.s[1] +sub 
v26.4s, v12.4s, v19.4s +mul v20.4S, v20.4S,v29.s[1] +add v12.4s, v12.4s, v19.4s +mla v24.4S, v27.4S, v31.s[0] +sub v27.4s, v8.4s, v22.4s +mla v20.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v22.4s +sqrdmulh v28.4S, v23.4S, v11.s[3] +nop +mul v23.4S, v23.4S,v17.s[3] +nop +sqrdmulh v29.4S, v18.4S, v11.s[3] +sub v22.4s, v0.4s, v14.4s +mul v18.4S, v18.4S,v17.s[3] +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v11.s[2] +sub v3.4s, v21.4s, v25.4s +mul v15.4S, v15.4S,v17.s[2] +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v2.4S, v11.s[2] +sub v19.4s, v6.4s, v24.4s +mul v2.4S, v2.4S,v17.s[2] +add v6.4s, v6.4s, v24.4s +mla v23.4S, v28.4S, v31.s[0] +sub v28.4s, v10.4s, v20.4s +sqrdmulh v24.4S, v22.4S, v11.s[1] +add v10.4s, v10.4s, v20.4s +mla v18.4S, v29.4S, v31.s[0] +nop +sqrdmulh v29.4S, v3.4S, v11.s[1] +nop +mla v15.4S, v14.4S, v31.s[0] +nop +sqrdmulh v14.4S, v0.4S, v11.s[0] +nop +mla v2.4S, v25.4S, v31.s[0] +nop +sqrdmulh v25.4S, v21.4S, v11.s[0] +nop +mul v22.4S, v22.4S,v17.s[1] +sub v20.4s, v26.4s, v23.4s +mul v3.4S, v3.4S,v17.s[1] +add v26.4s, v26.4s, v23.4s +mla v22.4S, v24.4S, v31.s[0] +sub v24.4s, v27.4s, v18.4s +mla v3.4S, v29.4S, v31.s[0] +add v27.4s, v27.4s, v18.4s +mul v0.4S, v0.4S,v17.s[0] +sub v18.4s, v12.4s, v15.4s +mul v21.4S, v21.4S,v17.s[0] +add v12.4s, v12.4s, v15.4s +mla v0.4S, v14.4S, v31.s[0] +sub v14.4s, v8.4s, v2.4s +mla v21.4S, v25.4S, v31.s[0] +add v8.4s, v8.4s, v2.4s +sqrdmulh v11.4S, v20.4S, v9.s[3] +nop +mul v20.4S, v20.4S,v30.s[3] +nop +sqrdmulh v17.4S, v26.4S, v9.s[2] +sub v2.4s, v19.4s, v22.4s +mul v26.4S, v26.4S,v30.s[2] +add v19.4s, v19.4s, v22.4s +sqrdmulh v22.4S, v18.4S, v9.s[1] +sub v25.4s, v28.4s, v3.4s +mul v18.4S, v18.4S,v30.s[1] +add v28.4s, v28.4s, v3.4s +sqrdmulh v3.4S, v12.4S, v9.s[0] +sub v15.4s, v6.4s, v0.4s +mul v12.4S, v12.4S,v30.s[0] +add v6.4s, v6.4s, v0.4s +mla v20.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v21.4s +sqrdmulh v9.4S, v2.4S, v7.s[3] +add v10.4s, v10.4s, v21.4s +mla v26.4S, v17.4S, v31.s[0] +sub v17.4s, v24.4s, 
v20.4s +sqrdmulh v21.4S, v19.4S, v7.s[2] +add v24.4s, v24.4s, v20.4s +mla v18.4S, v22.4S, v31.s[0] +sub v22.4s, v27.4s, v26.4s +sqrdmulh v20.4S, v15.4S, v7.s[1] +add v27.4s, v27.4s, v26.4s +mla v12.4S, v3.4S, v31.s[0] +sub v3.4s, v14.4s, v18.4s +sqrdmulh v26.4S, v6.4S, v7.s[0] +add v14.4s, v14.4s, v18.4s +mul v2.4S, v2.4S,v1.s[3] +sub v18.4s, v8.4s, v12.4s +mul v19.4S, v19.4S,v1.s[2] +add v8.4s, v8.4s, v12.4s +mla v2.4S, v9.4S, v31.s[0] +str q17, [x0, #976] +mla v19.4S, v21.4S, v31.s[0] +str q24, [x0, #912] +mul v15.4S, v15.4S,v1.s[1] +str q22, [x0, #848] +mul v6.4S, v6.4S,v1.s[0] +str q27, [x0, #784] +mla v15.4S, v20.4S, v31.s[0] +str q3, [x0, #720] +mla v6.4S, v26.4S, v31.s[0] +str q14, [x0, #656] +str q18, [x0, #592] +str q8, [x0, #528] +sub v8.4s, v25.4s, v2.4s +str q8, [x0, #464] +add v25.4s, v25.4s, v2.4s +sub v2.4s, v28.4s, v19.4s +str q25, [x0, #400] +add v28.4s, v28.4s, v19.4s +sub v19.4s, v11.4s, v15.4s +str q2, [x0, #336] +add v11.4s, v11.4s, v15.4s +sub v15.4s, v10.4s, v6.4s +str q28, [x0, #272] +add v10.4s, v10.4s, v6.4s +str q19, [x0, #208] +str q11, [x0, #144] +str q15, [x0, #80] +str q10, [x0, #16] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q16, [x17, #+160] +ldr q13, [x17, #+176] +ldr q23, [x17, #+192] +ldr q29, [x17, #+208] +ldr q0, [x17, #+224] +ldr q30, [x17, #+240] +ldr q12, [x0, #32] +ldr q9, [x0, #48] +ldr q17, [x0, #0] +ldr q21, [x0, #16] +sqrdmulh v24.4S, v12.4S, v5.s[0] +mul v12.4S, v12.4S,v4.s[0] +mla v12.4S, v24.4S, v31.s[0] +sub v24.4s, v17.4s, v12.4s +add v17.4s, v17.4s, v12.4s +sqrdmulh v12.4S, v9.4S, v5.s[0] +mul v9.4S, v9.4S,v4.s[0] +mla v9.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v9.4s +add v21.4s, v21.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v12.4S, v5.s[2] +mul v12.4S, v12.4S,v4.s[2] +mla v12.4S, v21.4S, v31.s[0] +sub v21.4s, v24.4s, v12.4s +add v24.4s, v24.4s, v12.4s +trn1 v12.4S, v17.4S, 
v9.4S +trn2 v22.4S, v17.4S, v9.4S +trn1 v27.4S, v24.4S, v21.4S +trn2 v20.4S, v24.4S, v21.4S +trn2 v24.2D, v12.2D, v27.2D +trn2 v21.2D, v22.2D, v20.2D +trn1 v17.2D, v12.2D, v27.2D +trn1 v9.2D, v22.2D, v20.2D +sqrdmulh v20.4S, v24.4S, v13.4S +mul v24.4S, v24.4S,v16.4S +mla v24.4S, v20.4S, v31.s[0] +sub v20.4s, v17.4s, v24.4s +add v17.4s, v17.4s, v24.4s +sqrdmulh v24.4S, v21.4S, v13.4S +mul v21.4S, v21.4S,v16.4S +mla v21.4S, v24.4S, v31.s[0] +sub v24.4s, v9.4s, v21.4s +add v9.4s, v9.4s, v21.4s +sqrdmulh v21.4S, v9.4S, v29.4S +mul v9.4S, v9.4S,v23.4S +mla v9.4S, v21.4S, v31.s[0] +sub v21.4s, v17.4s, v9.4s +add v17.4s, v17.4s, v9.4s +sqrdmulh v9.4S, v24.4S, v30.4S +mul v24.4S, v24.4S,v0.4S +mla v24.4S, v9.4S, v31.s[0] +sub v9.4s, v20.4s, v24.4s +add v20.4s, v20.4s, v24.4s +str q17, [x0, #0] +str q21, [x0, #16] +str q20, [x0, #32] +str q9, [x0, #48] +ldr q9, [x17, #+256] +ldr q20, [x17, #+272] +ldr q21, [x17, #+288] +ldr q17, [x17, #+304] +ldr q24, [x17, #+320] +ldr q22, [x17, #+336] +ldr q27, [x17, #+352] +ldr q12, [x17, #+368] +ldr q30, [x0, #96] +ldr q0, [x0, #112] +ldr q29, [x0, #64] +ldr q23, [x0, #80] +sqrdmulh v13.4S, v30.4S, v20.s[0] +mul v30.4S, v30.4S,v9.s[0] +mla v30.4S, v13.4S, v31.s[0] +sub v13.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +sqrdmulh v30.4S, v0.4S, v20.s[0] +mul v0.4S, v0.4S,v9.s[0] +mla v0.4S, v30.4S, v31.s[0] +sub v30.4s, v23.4s, v0.4s +add v23.4s, v23.4s, v0.4s +sqrdmulh v0.4S, v23.4S, v20.s[1] +mul v23.4S, v23.4S,v9.s[1] +mla v23.4S, v0.4S, v31.s[0] +sub v0.4s, v29.4s, v23.4s +add v29.4s, v29.4s, v23.4s +sqrdmulh v23.4S, v30.4S, v20.s[2] +mul v30.4S, v30.4S,v9.s[2] +mla v30.4S, v23.4S, v31.s[0] +sub v23.4s, v13.4s, v30.4s +add v13.4s, v13.4s, v30.4s +trn1 v30.4S, v29.4S, v0.4S +trn2 v16.4S, v29.4S, v0.4S +trn1 v5.4S, v13.4S, v23.4S +trn2 v4.4S, v13.4S, v23.4S +trn2 v13.2D, v30.2D, v5.2D +trn2 v23.2D, v16.2D, v4.2D +trn1 v29.2D, v30.2D, v5.2D +trn1 v0.2D, v16.2D, v4.2D +sqrdmulh v4.4S, v13.4S, v17.4S +mul v13.4S, v13.4S,v21.4S +mla 
v13.4S, v4.4S, v31.s[0] +sub v4.4s, v29.4s, v13.4s +add v29.4s, v29.4s, v13.4s +sqrdmulh v13.4S, v23.4S, v17.4S +mul v23.4S, v23.4S,v21.4S +mla v23.4S, v13.4S, v31.s[0] +sub v13.4s, v0.4s, v23.4s +add v0.4s, v0.4s, v23.4s +sqrdmulh v23.4S, v0.4S, v22.4S +mul v0.4S, v0.4S,v24.4S +mla v0.4S, v23.4S, v31.s[0] +sub v23.4s, v29.4s, v0.4s +add v29.4s, v29.4s, v0.4s +sqrdmulh v0.4S, v13.4S, v12.4S +mul v13.4S, v13.4S,v27.4S +mla v13.4S, v0.4S, v31.s[0] +sub v0.4s, v4.4s, v13.4s +add v4.4s, v4.4s, v13.4s +str q29, [x0, #64] +str q23, [x0, #80] +str q4, [x0, #96] +str q0, [x0, #112] +ldr q0, [x17, #+384] +ldr q4, [x17, #+400] +ldr q23, [x17, #+416] +ldr q29, [x17, #+432] +ldr q13, [x17, #+448] +ldr q16, [x17, #+464] +ldr q5, [x17, #+480] +ldr q30, [x17, #+496] +ldr q12, [x0, #160] +ldr q27, [x0, #176] +ldr q22, [x0, #128] +ldr q24, [x0, #144] +sqrdmulh v17.4S, v12.4S, v4.s[0] +mul v12.4S, v12.4S,v0.s[0] +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v27.4S, v4.s[0] +mul v27.4S, v27.4S,v0.s[0] +mla v27.4S, v12.4S, v31.s[0] +sub v12.4s, v24.4s, v27.4s +add v24.4s, v24.4s, v27.4s +sqrdmulh v27.4S, v24.4S, v4.s[1] +mul v24.4S, v24.4S,v0.s[1] +mla v24.4S, v27.4S, v31.s[0] +sub v27.4s, v22.4s, v24.4s +add v22.4s, v22.4s, v24.4s +sqrdmulh v24.4S, v12.4S, v4.s[2] +mul v12.4S, v12.4S,v0.s[2] +mla v12.4S, v24.4S, v31.s[0] +sub v24.4s, v17.4s, v12.4s +add v17.4s, v17.4s, v12.4s +trn1 v12.4S, v22.4S, v27.4S +trn2 v21.4S, v22.4S, v27.4S +trn1 v20.4S, v17.4S, v24.4S +trn2 v9.4S, v17.4S, v24.4S +trn2 v17.2D, v12.2D, v20.2D +trn2 v24.2D, v21.2D, v9.2D +trn1 v22.2D, v12.2D, v20.2D +trn1 v27.2D, v21.2D, v9.2D +sqrdmulh v9.4S, v17.4S, v29.4S +mul v17.4S, v17.4S,v23.4S +mla v17.4S, v9.4S, v31.s[0] +sub v9.4s, v22.4s, v17.4s +add v22.4s, v22.4s, v17.4s +sqrdmulh v17.4S, v24.4S, v29.4S +mul v24.4S, v24.4S,v23.4S +mla v24.4S, v17.4S, v31.s[0] +sub v17.4s, v27.4s, v24.4s +add v27.4s, v27.4s, v24.4s +sqrdmulh v24.4S, v27.4S, v16.4S +mul 
v27.4S, v27.4S,v13.4S +mla v27.4S, v24.4S, v31.s[0] +sub v24.4s, v22.4s, v27.4s +add v22.4s, v22.4s, v27.4s +sqrdmulh v27.4S, v17.4S, v30.4S +mul v17.4S, v17.4S,v5.4S +mla v17.4S, v27.4S, v31.s[0] +sub v27.4s, v9.4s, v17.4s +add v9.4s, v9.4s, v17.4s +str q22, [x0, #128] +str q24, [x0, #144] +str q9, [x0, #160] +str q27, [x0, #176] +ldr q27, [x17, #+512] +ldr q9, [x17, #+528] +ldr q24, [x17, #+544] +ldr q22, [x17, #+560] +ldr q17, [x17, #+576] +ldr q21, [x17, #+592] +ldr q20, [x17, #+608] +ldr q12, [x17, #+624] +ldr q30, [x0, #224] +ldr q5, [x0, #240] +ldr q16, [x0, #192] +ldr q13, [x0, #208] +sqrdmulh v29.4S, v30.4S, v9.s[0] +mul v30.4S, v30.4S,v27.s[0] +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v16.4s, v30.4s +add v16.4s, v16.4s, v30.4s +sqrdmulh v30.4S, v5.4S, v9.s[0] +mul v5.4S, v5.4S,v27.s[0] +mla v5.4S, v30.4S, v31.s[0] +sub v30.4s, v13.4s, v5.4s +add v13.4s, v13.4s, v5.4s +sqrdmulh v5.4S, v13.4S, v9.s[1] +mul v13.4S, v13.4S,v27.s[1] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v16.4s, v13.4s +add v16.4s, v16.4s, v13.4s +sqrdmulh v13.4S, v30.4S, v9.s[2] +mul v30.4S, v30.4S,v27.s[2] +mla v30.4S, v13.4S, v31.s[0] +sub v13.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +trn1 v30.4S, v16.4S, v5.4S +trn2 v23.4S, v16.4S, v5.4S +trn1 v4.4S, v29.4S, v13.4S +trn2 v0.4S, v29.4S, v13.4S +trn2 v29.2D, v30.2D, v4.2D +trn2 v13.2D, v23.2D, v0.2D +trn1 v16.2D, v30.2D, v4.2D +trn1 v5.2D, v23.2D, v0.2D +sqrdmulh v0.4S, v29.4S, v22.4S +mul v29.4S, v29.4S,v24.4S +mla v29.4S, v0.4S, v31.s[0] +sub v0.4s, v16.4s, v29.4s +add v16.4s, v16.4s, v29.4s +sqrdmulh v29.4S, v13.4S, v22.4S +mul v13.4S, v13.4S,v24.4S +mla v13.4S, v29.4S, v31.s[0] +sub v29.4s, v5.4s, v13.4s +add v5.4s, v5.4s, v13.4s +sqrdmulh v13.4S, v5.4S, v21.4S +mul v5.4S, v5.4S,v17.4S +mla v5.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v5.4s +add v16.4s, v16.4s, v5.4s +sqrdmulh v5.4S, v29.4S, v12.4S +mul v29.4S, v29.4S,v20.4S +mla v29.4S, v5.4S, v31.s[0] +sub v5.4s, v0.4s, v29.4s +add v0.4s, v0.4s, v29.4s +str q16, [x0, #192] 
+str q13, [x0, #208] +str q0, [x0, #224] +str q5, [x0, #240] +ldr q5, [x17, #+640] +ldr q0, [x17, #+656] +ldr q13, [x17, #+672] +ldr q16, [x17, #+688] +ldr q29, [x17, #+704] +ldr q23, [x17, #+720] +ldr q4, [x17, #+736] +ldr q30, [x17, #+752] +ldr q12, [x0, #288] +ldr q20, [x0, #304] +ldr q21, [x0, #256] +ldr q17, [x0, #272] +sqrdmulh v22.4S, v12.4S, v0.s[0] +mul v12.4S, v12.4S,v5.s[0] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v20.4S, v0.s[0] +mul v20.4S, v20.4S,v5.s[0] +mla v20.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v20.4s +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v0.s[1] +mul v17.4S, v17.4S,v5.s[1] +mla v17.4S, v20.4S, v31.s[0] +sub v20.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v12.4S, v0.s[2] +mul v12.4S, v12.4S,v5.s[2] +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +trn1 v12.4S, v21.4S, v20.4S +trn2 v24.4S, v21.4S, v20.4S +trn1 v9.4S, v22.4S, v17.4S +trn2 v27.4S, v22.4S, v17.4S +trn2 v22.2D, v12.2D, v9.2D +trn2 v17.2D, v24.2D, v27.2D +trn1 v21.2D, v12.2D, v9.2D +trn1 v20.2D, v24.2D, v27.2D +sqrdmulh v27.4S, v22.4S, v16.4S +mul v22.4S, v22.4S,v13.4S +mla v22.4S, v27.4S, v31.s[0] +sub v27.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v16.4S +mul v17.4S, v17.4S,v13.4S +mla v17.4S, v22.4S, v31.s[0] +sub v22.4s, v20.4s, v17.4s +add v20.4s, v20.4s, v17.4s +sqrdmulh v17.4S, v20.4S, v23.4S +mul v20.4S, v20.4S,v29.4S +mla v20.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v22.4S, v30.4S +mul v22.4S, v22.4S,v4.4S +mla v22.4S, v20.4S, v31.s[0] +sub v20.4s, v27.4s, v22.4s +add v27.4s, v27.4s, v22.4s +str q21, [x0, #256] +str q17, [x0, #272] +str q27, [x0, #288] +str q20, [x0, #304] +ldr q20, [x17, #+768] +ldr q27, [x17, #+784] +ldr q17, [x17, #+800] +ldr q21, [x17, #+816] +ldr q22, [x17, #+832] +ldr q24, [x17, #+848] +ldr q9, [x17, #+864] +ldr q12, [x17, #+880] 
+ldr q30, [x0, #352] +ldr q4, [x0, #368] +ldr q23, [x0, #320] +ldr q29, [x0, #336] +sqrdmulh v16.4S, v30.4S, v27.s[0] +mul v30.4S, v30.4S,v20.s[0] +mla v30.4S, v16.4S, v31.s[0] +sub v16.4s, v23.4s, v30.4s +add v23.4s, v23.4s, v30.4s +sqrdmulh v30.4S, v4.4S, v27.s[0] +mul v4.4S, v4.4S,v20.s[0] +mla v4.4S, v30.4S, v31.s[0] +sub v30.4s, v29.4s, v4.4s +add v29.4s, v29.4s, v4.4s +sqrdmulh v4.4S, v29.4S, v27.s[1] +mul v29.4S, v29.4S,v20.s[1] +mla v29.4S, v4.4S, v31.s[0] +sub v4.4s, v23.4s, v29.4s +add v23.4s, v23.4s, v29.4s +sqrdmulh v29.4S, v30.4S, v27.s[2] +mul v30.4S, v30.4S,v20.s[2] +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v16.4s, v30.4s +add v16.4s, v16.4s, v30.4s +trn1 v30.4S, v23.4S, v4.4S +trn2 v13.4S, v23.4S, v4.4S +trn1 v0.4S, v16.4S, v29.4S +trn2 v5.4S, v16.4S, v29.4S +trn2 v16.2D, v30.2D, v0.2D +trn2 v29.2D, v13.2D, v5.2D +trn1 v23.2D, v30.2D, v0.2D +trn1 v4.2D, v13.2D, v5.2D +sqrdmulh v5.4S, v16.4S, v21.4S +mul v16.4S, v16.4S,v17.4S +mla v16.4S, v5.4S, v31.s[0] +sub v5.4s, v23.4s, v16.4s +add v23.4s, v23.4s, v16.4s +sqrdmulh v16.4S, v29.4S, v21.4S +mul v29.4S, v29.4S,v17.4S +mla v29.4S, v16.4S, v31.s[0] +sub v16.4s, v4.4s, v29.4s +add v4.4s, v4.4s, v29.4s +sqrdmulh v29.4S, v4.4S, v24.4S +mul v4.4S, v4.4S,v22.4S +mla v4.4S, v29.4S, v31.s[0] +sub v29.4s, v23.4s, v4.4s +add v23.4s, v23.4s, v4.4s +sqrdmulh v4.4S, v16.4S, v12.4S +mul v16.4S, v16.4S,v9.4S +mla v16.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v16.4s +add v5.4s, v5.4s, v16.4s +str q23, [x0, #320] +str q29, [x0, #336] +str q5, [x0, #352] +str q4, [x0, #368] +ldr q4, [x17, #+896] +ldr q5, [x17, #+912] +ldr q29, [x17, #+928] +ldr q23, [x17, #+944] +ldr q16, [x17, #+960] +ldr q13, [x17, #+976] +ldr q0, [x17, #+992] +ldr q30, [x17, #+1008] +ldr q12, [x0, #416] +ldr q9, [x0, #432] +ldr q24, [x0, #384] +ldr q22, [x0, #400] +sqrdmulh v21.4S, v12.4S, v5.s[0] +mul v12.4S, v12.4S,v4.s[0] +mla v12.4S, v21.4S, v31.s[0] +sub v21.4s, v24.4s, v12.4s +add v24.4s, v24.4s, v12.4s +sqrdmulh v12.4S, v9.4S, v5.s[0] +mul 
v9.4S, v9.4S,v4.s[0] +mla v9.4S, v12.4S, v31.s[0] +sub v12.4s, v22.4s, v9.4s +add v22.4s, v22.4s, v9.4s +sqrdmulh v9.4S, v22.4S, v5.s[1] +mul v22.4S, v22.4S,v4.s[1] +mla v22.4S, v9.4S, v31.s[0] +sub v9.4s, v24.4s, v22.4s +add v24.4s, v24.4s, v22.4s +sqrdmulh v22.4S, v12.4S, v5.s[2] +mul v12.4S, v12.4S,v4.s[2] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +trn1 v12.4S, v24.4S, v9.4S +trn2 v17.4S, v24.4S, v9.4S +trn1 v27.4S, v21.4S, v22.4S +trn2 v20.4S, v21.4S, v22.4S +trn2 v21.2D, v12.2D, v27.2D +trn2 v22.2D, v17.2D, v20.2D +trn1 v24.2D, v12.2D, v27.2D +trn1 v9.2D, v17.2D, v20.2D +sqrdmulh v20.4S, v21.4S, v23.4S +mul v21.4S, v21.4S,v29.4S +mla v21.4S, v20.4S, v31.s[0] +sub v20.4s, v24.4s, v21.4s +add v24.4s, v24.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v23.4S +mul v22.4S, v22.4S,v29.4S +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v9.4s, v22.4s +add v9.4s, v9.4s, v22.4s +sqrdmulh v22.4S, v9.4S, v13.4S +mul v9.4S, v9.4S,v16.4S +mla v9.4S, v22.4S, v31.s[0] +sub v22.4s, v24.4s, v9.4s +add v24.4s, v24.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v30.4S +mul v21.4S, v21.4S,v0.4S +mla v21.4S, v9.4S, v31.s[0] +sub v9.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +str q24, [x0, #384] +str q22, [x0, #400] +str q20, [x0, #416] +str q9, [x0, #432] +ldr q9, [x17, #+1024] +ldr q20, [x17, #+1040] +ldr q22, [x17, #+1056] +ldr q24, [x17, #+1072] +ldr q21, [x17, #+1088] +ldr q17, [x17, #+1104] +ldr q27, [x17, #+1120] +ldr q12, [x17, #+1136] +ldr q30, [x0, #480] +ldr q0, [x0, #496] +ldr q13, [x0, #448] +ldr q16, [x0, #464] +sqrdmulh v23.4S, v30.4S, v20.s[0] +mul v30.4S, v30.4S,v9.s[0] +mla v30.4S, v23.4S, v31.s[0] +sub v23.4s, v13.4s, v30.4s +add v13.4s, v13.4s, v30.4s +sqrdmulh v30.4S, v0.4S, v20.s[0] +mul v0.4S, v0.4S,v9.s[0] +mla v0.4S, v30.4S, v31.s[0] +sub v30.4s, v16.4s, v0.4s +add v16.4s, v16.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v20.s[1] +mul v16.4S, v16.4S,v9.s[1] +mla v16.4S, v0.4S, v31.s[0] +sub v0.4s, v13.4s, v16.4s +add v13.4s, v13.4s, v16.4s 
+sqrdmulh v16.4S, v30.4S, v20.s[2] +mul v30.4S, v30.4S,v9.s[2] +mla v30.4S, v16.4S, v31.s[0] +sub v16.4s, v23.4s, v30.4s +add v23.4s, v23.4s, v30.4s +trn1 v30.4S, v13.4S, v0.4S +trn2 v29.4S, v13.4S, v0.4S +trn1 v5.4S, v23.4S, v16.4S +trn2 v4.4S, v23.4S, v16.4S +trn2 v23.2D, v30.2D, v5.2D +trn2 v16.2D, v29.2D, v4.2D +trn1 v13.2D, v30.2D, v5.2D +trn1 v0.2D, v29.2D, v4.2D +sqrdmulh v4.4S, v23.4S, v24.4S +mul v23.4S, v23.4S,v22.4S +mla v23.4S, v4.4S, v31.s[0] +sub v4.4s, v13.4s, v23.4s +add v13.4s, v13.4s, v23.4s +sqrdmulh v23.4S, v16.4S, v24.4S +mul v16.4S, v16.4S,v22.4S +mla v16.4S, v23.4S, v31.s[0] +sub v23.4s, v0.4s, v16.4s +add v0.4s, v0.4s, v16.4s +sqrdmulh v16.4S, v0.4S, v17.4S +mul v0.4S, v0.4S,v21.4S +mla v0.4S, v16.4S, v31.s[0] +sub v16.4s, v13.4s, v0.4s +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v23.4S, v12.4S +mul v23.4S, v23.4S,v27.4S +mla v23.4S, v0.4S, v31.s[0] +sub v0.4s, v4.4s, v23.4s +add v4.4s, v4.4s, v23.4s +str q13, [x0, #448] +str q16, [x0, #464] +str q4, [x0, #480] +str q0, [x0, #496] +ldr q0, [x17, #+1152] +ldr q4, [x17, #+1168] +ldr q16, [x17, #+1184] +ldr q13, [x17, #+1200] +ldr q23, [x17, #+1216] +ldr q29, [x17, #+1232] +ldr q5, [x17, #+1248] +ldr q30, [x17, #+1264] +ldr q12, [x0, #544] +ldr q27, [x0, #560] +ldr q17, [x0, #512] +ldr q21, [x0, #528] +sqrdmulh v24.4S, v12.4S, v4.s[0] +mul v12.4S, v12.4S,v0.s[0] +mla v12.4S, v24.4S, v31.s[0] +sub v24.4s, v17.4s, v12.4s +add v17.4s, v17.4s, v12.4s +sqrdmulh v12.4S, v27.4S, v4.s[0] +mul v27.4S, v27.4S,v0.s[0] +mla v27.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v27.4s +add v21.4s, v21.4s, v27.4s +sqrdmulh v27.4S, v21.4S, v4.s[1] +mul v21.4S, v21.4S,v0.s[1] +mla v21.4S, v27.4S, v31.s[0] +sub v27.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v12.4S, v4.s[2] +mul v12.4S, v12.4S,v0.s[2] +mla v12.4S, v21.4S, v31.s[0] +sub v21.4s, v24.4s, v12.4s +add v24.4s, v24.4s, v12.4s +trn1 v12.4S, v17.4S, v27.4S +trn2 v22.4S, v17.4S, v27.4S +trn1 v20.4S, v24.4S, v21.4S +trn2 v9.4S, v24.4S, v21.4S 
+trn2 v24.2D, v12.2D, v20.2D +trn2 v21.2D, v22.2D, v9.2D +trn1 v17.2D, v12.2D, v20.2D +trn1 v27.2D, v22.2D, v9.2D +sqrdmulh v9.4S, v24.4S, v13.4S +mul v24.4S, v24.4S,v16.4S +mla v24.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v24.4s +add v17.4s, v17.4s, v24.4s +sqrdmulh v24.4S, v21.4S, v13.4S +mul v21.4S, v21.4S,v16.4S +mla v21.4S, v24.4S, v31.s[0] +sub v24.4s, v27.4s, v21.4s +add v27.4s, v27.4s, v21.4s +sqrdmulh v21.4S, v27.4S, v29.4S +mul v27.4S, v27.4S,v23.4S +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v17.4s, v27.4s +add v17.4s, v17.4s, v27.4s +sqrdmulh v27.4S, v24.4S, v30.4S +mul v24.4S, v24.4S,v5.4S +mla v24.4S, v27.4S, v31.s[0] +sub v27.4s, v9.4s, v24.4s +add v9.4s, v9.4s, v24.4s +str q17, [x0, #512] +str q21, [x0, #528] +str q9, [x0, #544] +str q27, [x0, #560] +ldr q27, [x17, #+1280] +ldr q9, [x17, #+1296] +ldr q21, [x17, #+1312] +ldr q17, [x17, #+1328] +ldr q24, [x17, #+1344] +ldr q22, [x17, #+1360] +ldr q20, [x17, #+1376] +ldr q12, [x17, #+1392] +ldr q30, [x0, #608] +ldr q5, [x0, #624] +ldr q29, [x0, #576] +ldr q23, [x0, #592] +sqrdmulh v13.4S, v30.4S, v9.s[0] +mul v30.4S, v30.4S,v27.s[0] +mla v30.4S, v13.4S, v31.s[0] +sub v13.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +sqrdmulh v30.4S, v5.4S, v9.s[0] +mul v5.4S, v5.4S,v27.s[0] +mla v5.4S, v30.4S, v31.s[0] +sub v30.4s, v23.4s, v5.4s +add v23.4s, v23.4s, v5.4s +sqrdmulh v5.4S, v23.4S, v9.s[1] +mul v23.4S, v23.4S,v27.s[1] +mla v23.4S, v5.4S, v31.s[0] +sub v5.4s, v29.4s, v23.4s +add v29.4s, v29.4s, v23.4s +sqrdmulh v23.4S, v30.4S, v9.s[2] +mul v30.4S, v30.4S,v27.s[2] +mla v30.4S, v23.4S, v31.s[0] +sub v23.4s, v13.4s, v30.4s +add v13.4s, v13.4s, v30.4s +trn1 v30.4S, v29.4S, v5.4S +trn2 v16.4S, v29.4S, v5.4S +trn1 v4.4S, v13.4S, v23.4S +trn2 v0.4S, v13.4S, v23.4S +trn2 v13.2D, v30.2D, v4.2D +trn2 v23.2D, v16.2D, v0.2D +trn1 v29.2D, v30.2D, v4.2D +trn1 v5.2D, v16.2D, v0.2D +sqrdmulh v0.4S, v13.4S, v17.4S +mul v13.4S, v13.4S,v21.4S +mla v13.4S, v0.4S, v31.s[0] +sub v0.4s, v29.4s, v13.4s +add v29.4s, v29.4s, 
v13.4s +sqrdmulh v13.4S, v23.4S, v17.4S +mul v23.4S, v23.4S,v21.4S +mla v23.4S, v13.4S, v31.s[0] +sub v13.4s, v5.4s, v23.4s +add v5.4s, v5.4s, v23.4s +sqrdmulh v23.4S, v5.4S, v22.4S +mul v5.4S, v5.4S,v24.4S +mla v5.4S, v23.4S, v31.s[0] +sub v23.4s, v29.4s, v5.4s +add v29.4s, v29.4s, v5.4s +sqrdmulh v5.4S, v13.4S, v12.4S +mul v13.4S, v13.4S,v20.4S +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v0.4s, v13.4s +add v0.4s, v0.4s, v13.4s +str q29, [x0, #576] +str q23, [x0, #592] +str q0, [x0, #608] +str q5, [x0, #624] +ldr q5, [x17, #+1408] +ldr q0, [x17, #+1424] +ldr q23, [x17, #+1440] +ldr q29, [x17, #+1456] +ldr q13, [x17, #+1472] +ldr q16, [x17, #+1488] +ldr q4, [x17, #+1504] +ldr q30, [x17, #+1520] +ldr q12, [x0, #672] +ldr q20, [x0, #688] +ldr q22, [x0, #640] +ldr q24, [x0, #656] +sqrdmulh v17.4S, v12.4S, v0.s[0] +mul v12.4S, v12.4S,v5.s[0] +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v20.4S, v0.s[0] +mul v20.4S, v20.4S,v5.s[0] +mla v20.4S, v12.4S, v31.s[0] +sub v12.4s, v24.4s, v20.4s +add v24.4s, v24.4s, v20.4s +sqrdmulh v20.4S, v24.4S, v0.s[1] +mul v24.4S, v24.4S,v5.s[1] +mla v24.4S, v20.4S, v31.s[0] +sub v20.4s, v22.4s, v24.4s +add v22.4s, v22.4s, v24.4s +sqrdmulh v24.4S, v12.4S, v0.s[2] +mul v12.4S, v12.4S,v5.s[2] +mla v12.4S, v24.4S, v31.s[0] +sub v24.4s, v17.4s, v12.4s +add v17.4s, v17.4s, v12.4s +trn1 v12.4S, v22.4S, v20.4S +trn2 v21.4S, v22.4S, v20.4S +trn1 v9.4S, v17.4S, v24.4S +trn2 v27.4S, v17.4S, v24.4S +trn2 v17.2D, v12.2D, v9.2D +trn2 v24.2D, v21.2D, v27.2D +trn1 v22.2D, v12.2D, v9.2D +trn1 v20.2D, v21.2D, v27.2D +sqrdmulh v27.4S, v17.4S, v29.4S +mul v17.4S, v17.4S,v23.4S +mla v17.4S, v27.4S, v31.s[0] +sub v27.4s, v22.4s, v17.4s +add v22.4s, v22.4s, v17.4s +sqrdmulh v17.4S, v24.4S, v29.4S +mul v24.4S, v24.4S,v23.4S +mla v24.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v24.4s +add v20.4s, v20.4s, v24.4s +sqrdmulh v24.4S, v20.4S, v16.4S +mul v20.4S, v20.4S,v13.4S +mla v20.4S, v24.4S, v31.s[0] +sub 
v24.4s, v22.4s, v20.4s +add v22.4s, v22.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v30.4S +mul v17.4S, v17.4S,v4.4S +mla v17.4S, v20.4S, v31.s[0] +sub v20.4s, v27.4s, v17.4s +add v27.4s, v27.4s, v17.4s +str q22, [x0, #640] +str q24, [x0, #656] +str q27, [x0, #672] +str q20, [x0, #688] +ldr q20, [x17, #+1536] +ldr q27, [x17, #+1552] +ldr q24, [x17, #+1568] +ldr q22, [x17, #+1584] +ldr q17, [x17, #+1600] +ldr q21, [x17, #+1616] +ldr q9, [x17, #+1632] +ldr q12, [x17, #+1648] +ldr q30, [x0, #736] +ldr q4, [x0, #752] +ldr q16, [x0, #704] +ldr q13, [x0, #720] +sqrdmulh v29.4S, v30.4S, v27.s[0] +mul v30.4S, v30.4S,v20.s[0] +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v16.4s, v30.4s +add v16.4s, v16.4s, v30.4s +sqrdmulh v30.4S, v4.4S, v27.s[0] +mul v4.4S, v4.4S,v20.s[0] +mla v4.4S, v30.4S, v31.s[0] +sub v30.4s, v13.4s, v4.4s +add v13.4s, v13.4s, v4.4s +sqrdmulh v4.4S, v13.4S, v27.s[1] +mul v13.4S, v13.4S,v20.s[1] +mla v13.4S, v4.4S, v31.s[0] +sub v4.4s, v16.4s, v13.4s +add v16.4s, v16.4s, v13.4s +sqrdmulh v13.4S, v30.4S, v27.s[2] +mul v30.4S, v30.4S,v20.s[2] +mla v30.4S, v13.4S, v31.s[0] +sub v13.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +trn1 v30.4S, v16.4S, v4.4S +trn2 v23.4S, v16.4S, v4.4S +trn1 v0.4S, v29.4S, v13.4S +trn2 v5.4S, v29.4S, v13.4S +trn2 v29.2D, v30.2D, v0.2D +trn2 v13.2D, v23.2D, v5.2D +trn1 v16.2D, v30.2D, v0.2D +trn1 v4.2D, v23.2D, v5.2D +sqrdmulh v5.4S, v29.4S, v22.4S +mul v29.4S, v29.4S,v24.4S +mla v29.4S, v5.4S, v31.s[0] +sub v5.4s, v16.4s, v29.4s +add v16.4s, v16.4s, v29.4s +sqrdmulh v29.4S, v13.4S, v22.4S +mul v13.4S, v13.4S,v24.4S +mla v13.4S, v29.4S, v31.s[0] +sub v29.4s, v4.4s, v13.4s +add v4.4s, v4.4s, v13.4s +sqrdmulh v13.4S, v4.4S, v21.4S +mul v4.4S, v4.4S,v17.4S +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v4.4s +add v16.4s, v16.4s, v4.4s +sqrdmulh v4.4S, v29.4S, v12.4S +mul v29.4S, v29.4S,v9.4S +mla v29.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v29.4s +add v5.4s, v5.4s, v29.4s +str q16, [x0, #704] +str q13, [x0, #720] +str q5, [x0, #736] +str 
q4, [x0, #752] +ldr q4, [x17, #+1664] +ldr q5, [x17, #+1680] +ldr q13, [x17, #+1696] +ldr q16, [x17, #+1712] +ldr q29, [x17, #+1728] +ldr q23, [x17, #+1744] +ldr q0, [x17, #+1760] +ldr q30, [x17, #+1776] +ldr q12, [x0, #800] +ldr q9, [x0, #816] +ldr q21, [x0, #768] +ldr q17, [x0, #784] +sqrdmulh v22.4S, v12.4S, v5.s[0] +mul v12.4S, v12.4S,v4.s[0] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v9.4S, v5.s[0] +mul v9.4S, v9.4S,v4.s[0] +mla v9.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v9.4s +add v17.4s, v17.4s, v9.4s +sqrdmulh v9.4S, v17.4S, v5.s[1] +mul v17.4S, v17.4S,v4.s[1] +mla v17.4S, v9.4S, v31.s[0] +sub v9.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v12.4S, v5.s[2] +mul v12.4S, v12.4S,v4.s[2] +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +trn1 v12.4S, v21.4S, v9.4S +trn2 v24.4S, v21.4S, v9.4S +trn1 v27.4S, v22.4S, v17.4S +trn2 v20.4S, v22.4S, v17.4S +trn2 v22.2D, v12.2D, v27.2D +trn2 v17.2D, v24.2D, v20.2D +trn1 v21.2D, v12.2D, v27.2D +trn1 v9.2D, v24.2D, v20.2D +sqrdmulh v20.4S, v22.4S, v16.4S +mul v22.4S, v22.4S,v13.4S +mla v22.4S, v20.4S, v31.s[0] +sub v20.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v16.4S +mul v17.4S, v17.4S,v13.4S +mla v17.4S, v22.4S, v31.s[0] +sub v22.4s, v9.4s, v17.4s +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v9.4S, v23.4S +mul v9.4S, v9.4S,v29.4S +mla v9.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v9.4s +add v21.4s, v21.4s, v9.4s +sqrdmulh v9.4S, v22.4S, v30.4S +mul v22.4S, v22.4S,v0.4S +mla v22.4S, v9.4S, v31.s[0] +sub v9.4s, v20.4s, v22.4s +add v20.4s, v20.4s, v22.4s +str q21, [x0, #768] +str q17, [x0, #784] +str q20, [x0, #800] +str q9, [x0, #816] +ldr q9, [x17, #+1792] +ldr q20, [x17, #+1808] +ldr q17, [x17, #+1824] +ldr q21, [x17, #+1840] +ldr q22, [x17, #+1856] +ldr q24, [x17, #+1872] +ldr q27, [x17, #+1888] +ldr q12, [x17, #+1904] +ldr q30, [x0, #864] +ldr q0, [x0, #880] +ldr q23, [x0, 
#832] +ldr q29, [x0, #848] +sqrdmulh v16.4S, v30.4S, v20.s[0] +mul v30.4S, v30.4S,v9.s[0] +mla v30.4S, v16.4S, v31.s[0] +sub v16.4s, v23.4s, v30.4s +add v23.4s, v23.4s, v30.4s +sqrdmulh v30.4S, v0.4S, v20.s[0] +mul v0.4S, v0.4S,v9.s[0] +mla v0.4S, v30.4S, v31.s[0] +sub v30.4s, v29.4s, v0.4s +add v29.4s, v29.4s, v0.4s +sqrdmulh v0.4S, v29.4S, v20.s[1] +mul v29.4S, v29.4S,v9.s[1] +mla v29.4S, v0.4S, v31.s[0] +sub v0.4s, v23.4s, v29.4s +add v23.4s, v23.4s, v29.4s +sqrdmulh v29.4S, v30.4S, v20.s[2] +mul v30.4S, v30.4S,v9.s[2] +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v16.4s, v30.4s +add v16.4s, v16.4s, v30.4s +trn1 v30.4S, v23.4S, v0.4S +trn2 v13.4S, v23.4S, v0.4S +trn1 v5.4S, v16.4S, v29.4S +trn2 v4.4S, v16.4S, v29.4S +trn2 v16.2D, v30.2D, v5.2D +trn2 v29.2D, v13.2D, v4.2D +trn1 v23.2D, v30.2D, v5.2D +trn1 v0.2D, v13.2D, v4.2D +sqrdmulh v4.4S, v16.4S, v21.4S +mul v16.4S, v16.4S,v17.4S +mla v16.4S, v4.4S, v31.s[0] +sub v4.4s, v23.4s, v16.4s +add v23.4s, v23.4s, v16.4s +sqrdmulh v16.4S, v29.4S, v21.4S +mul v29.4S, v29.4S,v17.4S +mla v29.4S, v16.4S, v31.s[0] +sub v16.4s, v0.4s, v29.4s +add v0.4s, v0.4s, v29.4s +sqrdmulh v29.4S, v0.4S, v24.4S +mul v0.4S, v0.4S,v22.4S +mla v0.4S, v29.4S, v31.s[0] +sub v29.4s, v23.4s, v0.4s +add v23.4s, v23.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v12.4S +mul v16.4S, v16.4S,v27.4S +mla v16.4S, v0.4S, v31.s[0] +sub v0.4s, v4.4s, v16.4s +add v4.4s, v4.4s, v16.4s +str q23, [x0, #832] +str q29, [x0, #848] +str q4, [x0, #864] +str q0, [x0, #880] +ldr q0, [x17, #+1920] +ldr q4, [x17, #+1936] +ldr q29, [x17, #+1952] +ldr q23, [x17, #+1968] +ldr q16, [x17, #+1984] +ldr q13, [x17, #+2000] +ldr q5, [x17, #+2016] +ldr q30, [x17, #+2032] +ldr q12, [x0, #928] +ldr q27, [x0, #944] +ldr q24, [x0, #896] +ldr q22, [x0, #912] +sqrdmulh v21.4S, v12.4S, v4.s[0] +mul v12.4S, v12.4S,v0.s[0] +mla v12.4S, v21.4S, v31.s[0] +sub v21.4s, v24.4s, v12.4s +add v24.4s, v24.4s, v12.4s +sqrdmulh v12.4S, v27.4S, v4.s[0] +mul v27.4S, v27.4S,v0.s[0] +mla v27.4S, v12.4S, v31.s[0] 
+sub v12.4s, v22.4s, v27.4s +add v22.4s, v22.4s, v27.4s +sqrdmulh v27.4S, v22.4S, v4.s[1] +mul v22.4S, v22.4S,v0.s[1] +mla v22.4S, v27.4S, v31.s[0] +sub v27.4s, v24.4s, v22.4s +add v24.4s, v24.4s, v22.4s +sqrdmulh v22.4S, v12.4S, v4.s[2] +mul v12.4S, v12.4S,v0.s[2] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +trn1 v12.4S, v24.4S, v27.4S +trn2 v17.4S, v24.4S, v27.4S +trn1 v20.4S, v21.4S, v22.4S +trn2 v9.4S, v21.4S, v22.4S +trn2 v21.2D, v12.2D, v20.2D +trn2 v22.2D, v17.2D, v9.2D +trn1 v24.2D, v12.2D, v20.2D +trn1 v27.2D, v17.2D, v9.2D +sqrdmulh v9.4S, v21.4S, v23.4S +mul v21.4S, v21.4S,v29.4S +mla v21.4S, v9.4S, v31.s[0] +sub v9.4s, v24.4s, v21.4s +add v24.4s, v24.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v23.4S +mul v22.4S, v22.4S,v29.4S +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v27.4s, v22.4s +add v27.4s, v27.4s, v22.4s +sqrdmulh v22.4S, v27.4S, v13.4S +mul v27.4S, v27.4S,v16.4S +mla v27.4S, v22.4S, v31.s[0] +sub v22.4s, v24.4s, v27.4s +add v24.4s, v24.4s, v27.4s +sqrdmulh v27.4S, v21.4S, v30.4S +mul v21.4S, v21.4S,v5.4S +mla v21.4S, v27.4S, v31.s[0] +sub v27.4s, v9.4s, v21.4s +add v9.4s, v9.4s, v21.4s +str q24, [x0, #896] +str q22, [x0, #912] +str q9, [x0, #928] +str q27, [x0, #944] +ldr q27, [x17, #+2048] +ldr q9, [x17, #+2064] +ldr q22, [x17, #+2080] +ldr q24, [x17, #+2096] +ldr q21, [x17, #+2112] +ldr q17, [x17, #+2128] +ldr q20, [x17, #+2144] +ldr q12, [x17, #+2160] +ldr q30, [x0, #992] +ldr q5, [x0, #1008] +ldr q13, [x0, #960] +ldr q16, [x0, #976] +sqrdmulh v23.4S, v30.4S, v9.s[0] +mul v30.4S, v30.4S,v27.s[0] +mla v30.4S, v23.4S, v31.s[0] +sub v23.4s, v13.4s, v30.4s +add v13.4s, v13.4s, v30.4s +sqrdmulh v30.4S, v5.4S, v9.s[0] +mul v5.4S, v5.4S,v27.s[0] +mla v5.4S, v30.4S, v31.s[0] +sub v30.4s, v16.4s, v5.4s +add v16.4s, v16.4s, v5.4s +sqrdmulh v5.4S, v16.4S, v9.s[1] +mul v16.4S, v16.4S,v27.s[1] +mla v16.4S, v5.4S, v31.s[0] +sub v5.4s, v13.4s, v16.4s +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v30.4S, v9.s[2] +mul 
v30.4S, v30.4S,v27.s[2] +mla v30.4S, v16.4S, v31.s[0] +sub v16.4s, v23.4s, v30.4s +add v23.4s, v23.4s, v30.4s +trn1 v30.4S, v13.4S, v5.4S +trn2 v29.4S, v13.4S, v5.4S +trn1 v4.4S, v23.4S, v16.4S +trn2 v0.4S, v23.4S, v16.4S +trn2 v23.2D, v30.2D, v4.2D +trn2 v16.2D, v29.2D, v0.2D +trn1 v13.2D, v30.2D, v4.2D +trn1 v5.2D, v29.2D, v0.2D +sqrdmulh v0.4S, v23.4S, v24.4S +mul v23.4S, v23.4S,v22.4S +mla v23.4S, v0.4S, v31.s[0] +sub v0.4s, v13.4s, v23.4s +add v13.4s, v13.4s, v23.4s +sqrdmulh v23.4S, v16.4S, v24.4S +mul v16.4S, v16.4S,v22.4S +mla v16.4S, v23.4S, v31.s[0] +sub v23.4s, v5.4s, v16.4s +add v5.4s, v5.4s, v16.4s +sqrdmulh v16.4S, v5.4S, v17.4S +mul v5.4S, v5.4S,v21.4S +mla v5.4S, v16.4S, v31.s[0] +sub v16.4s, v13.4s, v5.4s +add v13.4s, v13.4s, v5.4s +sqrdmulh v5.4S, v23.4S, v12.4S +mul v23.4S, v23.4S,v20.4S +mla v23.4S, v5.4S, v31.s[0] +sub v5.4s, v0.4s, v23.4s +add v0.4s, v0.4s, v23.4s +str q13, [x0, #960] +str q16, [x0, #976] +str q0, [x0, #992] +str q5, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2476 +// Instruction count: 2472 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_16_0.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_16_0.s new file mode 100644 index 0000000..2b07129 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_16_0.s @@ -0,0 +1,2506 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without 
restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, 
block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // 
Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 
// Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 
37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // 
Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, 
block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 
29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 +.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, 
block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 
+.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // 
Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_16_0 +.global _ntt_u32_full_neon_asm_var_4_4_16_0 +ntt_u32_full_neon_asm_var_4_4_16_0: +_ntt_u32_full_neon_asm_var_4_4_16_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, 
modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x0, #992] +ldr q29, [x17, #+0] +ldr q28, [x17, #+16] +sqrdmulh v27.4S, v30.4S, v28.s[0] +mul v30.4S, v30.4S,v29.s[0] +ldr q26, [x0, #928] +sqrdmulh v25.4S, v26.4S, v28.s[0] +mul v26.4S, v26.4S,v29.s[0] +ldr q24, [x0, #864] +sqrdmulh v23.4S, v24.4S, v28.s[0] +mul v24.4S, v24.4S,v29.s[0] +ldr q22, [x0, #800] +sqrdmulh v21.4S, v22.4S, v28.s[0] +mul v22.4S, v22.4S,v29.s[0] +ldr q20, [x0, #736] +mla v30.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v20.4S, v28.s[0] +ldr q19, [x0, #672] +mla v26.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v19.4S, v28.s[0] +nop +ldr q18, [x0, #608] +mla v24.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v18.4S, v28.s[0] +nop +ldr q17, [x0, #544] +mla v22.4S, v21.4S, v31.s[0] +nop +sqrdmulh v21.4S, v17.4S, v28.s[0] +ldr q16, [x0, #480] +ldr q3, [x0, #416] +mul v20.4S, v20.4S,v29.s[0] +sub v2.4s, v16.4s, v30.4s +mul v19.4S, v19.4S,v29.s[0] +add v16.4s, v16.4s, v30.4s +ldr q30, [x0, #352] +ldr q1, [x0, #288] +mla v20.4S, v27.4S, v31.s[0] +sub v27.4s, v3.4s, v26.4s +mla v19.4S, v25.4S, v31.s[0] +add v3.4s, v3.4s, v26.4s +ldr q26, [x0, #224] +ldr q25, [x0, #160] +mul v18.4S, v18.4S,v29.s[0] +sub v0.4s, v30.4s, v24.4s +mul v17.4S, v17.4S,v29.s[0] +add v30.4s, v30.4s, v24.4s +ldr q24, [x0, #96] +ldr q15, [x0, #32] +mla v18.4S, v23.4S, v31.s[0] +sub v23.4s, v1.4s, v22.4s +mla v17.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v2.4S, v28.s[2] +nop +mul v2.4S, v2.4S,v29.s[2] +nop +sqrdmulh v21.4S, v27.4S, v28.s[2] +sub v14.4s, v26.4s, v20.4s +mul v27.4S, v27.4S,v29.s[2] +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v28.s[1] +sub v13.4s, v25.4s, v19.4s +mul v16.4S, v16.4S,v29.s[1] +add v25.4s, v25.4s, v19.4s +sqrdmulh v19.4S, v3.4S, v28.s[1] +sub v12.4s, v24.4s, v18.4s +mul v3.4S, v3.4S,v29.s[1] +add v24.4s, v24.4s, v18.4s +mla v2.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v17.4s +sqrdmulh v18.4S, v0.4S, v28.s[2] +add v15.4s, v15.4s, v17.4s +mla v27.4S, v21.4S, v31.s[0] +nop 
+sqrdmulh v21.4S, v23.4S, v28.s[2] +nop +mla v16.4S, v20.4S, v31.s[0] +nop +sqrdmulh v20.4S, v30.4S, v28.s[1] +nop +mla v3.4S, v19.4S, v31.s[0] +nop +sqrdmulh v19.4S, v1.4S, v28.s[1] +nop +ldr q17, [x17, #+32] +ldr q11, [x17, #+48] +mul v0.4S, v0.4S,v29.s[2] +sub v10.4s, v14.4s, v2.4s +mul v23.4S, v23.4S,v29.s[2] +add v14.4s, v14.4s, v2.4s +mla v0.4S, v18.4S, v31.s[0] +sub v18.4s, v13.4s, v27.4s +mla v23.4S, v21.4S, v31.s[0] +add v13.4s, v13.4s, v27.4s +mul v30.4S, v30.4S,v29.s[1] +sub v27.4s, v26.4s, v16.4s +mul v1.4S, v1.4S,v29.s[1] +add v26.4s, v26.4s, v16.4s +mla v30.4S, v20.4S, v31.s[0] +sub v20.4s, v25.4s, v3.4s +mla v1.4S, v19.4S, v31.s[0] +add v25.4s, v25.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v11.s[3] +nop +mul v10.4S, v10.4S,v17.s[3] +nop +sqrdmulh v19.4S, v14.4S, v11.s[2] +sub v16.4s, v12.4s, v0.4s +mul v14.4S, v14.4S,v17.s[2] +add v12.4s, v12.4s, v0.4s +sqrdmulh v0.4S, v27.4S, v11.s[1] +sub v21.4s, v22.4s, v23.4s +mul v27.4S, v27.4S,v17.s[1] +add v22.4s, v22.4s, v23.4s +sqrdmulh v23.4S, v26.4S, v11.s[0] +sub v2.4s, v24.4s, v30.4s +mul v26.4S, v26.4S,v17.s[0] +add v24.4s, v24.4s, v30.4s +ldr q30, [x17, #+96] +ldr q9, [x17, #+112] +mla v10.4S, v3.4S, v31.s[0] +sub v3.4s, v15.4s, v1.4s +sqrdmulh v8.4S, v18.4S, v11.s[3] +add v15.4s, v15.4s, v1.4s +mla v14.4S, v19.4S, v31.s[0] +nop +sqrdmulh v19.4S, v13.4S, v11.s[2] +nop +mla v27.4S, v0.4S, v31.s[0] +nop +sqrdmulh v0.4S, v20.4S, v11.s[1] +nop +mla v26.4S, v23.4S, v31.s[0] +nop +sqrdmulh v23.4S, v25.4S, v11.s[0] +nop +ldr q1, [x17, #+64] +ldr q7, [x17, #+80] +mul v18.4S, v18.4S,v17.s[3] +sub v6.4s, v16.4s, v10.4s +mul v13.4S, v13.4S,v17.s[2] +add v16.4s, v16.4s, v10.4s +mla v18.4S, v8.4S, v31.s[0] +sub v8.4s, v12.4s, v14.4s +mla v13.4S, v19.4S, v31.s[0] +add v12.4s, v12.4s, v14.4s +mul v20.4S, v20.4S,v17.s[1] +sub v14.4s, v2.4s, v27.4s +mul v25.4S, v25.4S,v17.s[0] +add v2.4s, v2.4s, v27.4s +mla v20.4S, v0.4S, v31.s[0] +sub v0.4s, v24.4s, v26.4s +mla v25.4S, v23.4S, v31.s[0] +add v24.4s, v24.4s, v26.4s +sqrdmulh 
v26.4S, v6.4S, v9.s[3] +nop +mul v6.4S, v6.4S,v30.s[3] +nop +sqrdmulh v23.4S, v16.4S, v9.s[2] +sub v27.4s, v21.4s, v18.4s +mul v16.4S, v16.4S,v30.s[2] +add v21.4s, v21.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v9.s[1] +sub v19.4s, v22.4s, v13.4s +mul v8.4S, v8.4S,v30.s[1] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v12.4S, v9.s[0] +sub v10.4s, v3.4s, v20.4s +mul v12.4S, v12.4S,v30.s[0] +add v3.4s, v3.4s, v20.4s +mla v6.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v25.4s +sqrdmulh v20.4S, v14.4S, v7.s[3] +add v15.4s, v15.4s, v25.4s +mla v16.4S, v23.4S, v31.s[0] +sub v23.4s, v27.4s, v6.4s +sqrdmulh v25.4S, v2.4S, v7.s[2] +add v27.4s, v27.4s, v6.4s +mla v8.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v16.4s +sqrdmulh v6.4S, v0.4S, v7.s[1] +add v21.4s, v21.4s, v16.4s +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v19.4s, v8.4s +sqrdmulh v16.4S, v24.4S, v7.s[0] +add v19.4s, v19.4s, v8.4s +mul v14.4S, v14.4S,v1.s[3] +sub v8.4s, v22.4s, v12.4s +mul v2.4S, v2.4S,v1.s[2] +add v22.4s, v22.4s, v12.4s +mla v14.4S, v20.4S, v31.s[0] +str q23, [x0, #992] +mla v2.4S, v25.4S, v31.s[0] +str q27, [x0, #928] +mul v0.4S, v0.4S,v1.s[1] +str q18, [x0, #864] +mul v24.4S, v24.4S,v1.s[0] +str q21, [x0, #800] +mla v0.4S, v6.4S, v31.s[0] +str q13, [x0, #736] +mla v24.4S, v16.4S, v31.s[0] +str q19, [x0, #672] +ldr q19, [x0, #1008] +sqrdmulh v16.4S, v19.4S, v28.s[0] +str q8, [x0, #608] +mul v19.4S, v19.4S,v29.s[0] +str q22, [x0, #544] +ldr q22, [x0, #944] +sqrdmulh v8.4S, v22.4S, v28.s[0] +sub v13.4s, v10.4s, v14.4s +str q13, [x0, #480] +mul v22.4S, v22.4S,v29.s[0] +add v10.4s, v10.4s, v14.4s +ldr q14, [x0, #880] +sqrdmulh v13.4S, v14.4S, v28.s[0] +sub v6.4s, v3.4s, v2.4s +str q10, [x0, #416] +mul v14.4S, v14.4S,v29.s[0] +add v3.4s, v3.4s, v2.4s +ldr q2, [x0, #816] +sqrdmulh v10.4S, v2.4S, v28.s[0] +sub v21.4s, v26.4s, v0.4s +str q6, [x0, #352] +mul v2.4S, v2.4S,v29.s[0] +add v26.4s, v26.4s, v0.4s +ldr q0, [x0, #752] +mla v19.4S, v16.4S, v31.s[0] +sub v16.4s, v15.4s, v24.4s +str q3, [x0, #288] +sqrdmulh 
v3.4S, v0.4S, v28.s[0] +add v15.4s, v15.4s, v24.4s +ldr q24, [x0, #688] +mla v22.4S, v8.4S, v31.s[0] +str q21, [x0, #224] +sqrdmulh v21.4S, v24.4S, v28.s[0] +nop +ldr q8, [x0, #624] +mla v14.4S, v13.4S, v31.s[0] +str q26, [x0, #160] +sqrdmulh v26.4S, v8.4S, v28.s[0] +nop +ldr q13, [x0, #560] +mla v2.4S, v10.4S, v31.s[0] +nop +sqrdmulh v10.4S, v13.4S, v28.s[0] +str q16, [x0, #96] +ldr q16, [x0, #496] +ldr q6, [x0, #432] +mul v0.4S, v0.4S,v29.s[0] +sub v18.4s, v16.4s, v19.4s +str q15, [x0, #32] +mul v24.4S, v24.4S,v29.s[0] +add v16.4s, v16.4s, v19.4s +ldr q19, [x0, #368] +ldr q15, [x0, #304] +mla v0.4S, v3.4S, v31.s[0] +sub v3.4s, v6.4s, v22.4s +mla v24.4S, v21.4S, v31.s[0] +add v6.4s, v6.4s, v22.4s +ldr q22, [x0, #240] +ldr q21, [x0, #176] +mul v8.4S, v8.4S,v29.s[0] +sub v27.4s, v19.4s, v14.4s +mul v13.4S, v13.4S,v29.s[0] +add v19.4s, v19.4s, v14.4s +ldr q14, [x0, #112] +ldr q25, [x0, #48] +mla v8.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v2.4s +mla v13.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v2.4s +sqrdmulh v2.4S, v18.4S, v28.s[2] +nop +mul v18.4S, v18.4S,v29.s[2] +nop +sqrdmulh v10.4S, v3.4S, v28.s[2] +sub v23.4s, v22.4s, v0.4s +mul v3.4S, v3.4S,v29.s[2] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v28.s[1] +sub v20.4s, v21.4s, v24.4s +mul v16.4S, v16.4S,v29.s[1] +add v21.4s, v21.4s, v24.4s +sqrdmulh v24.4S, v6.4S, v28.s[1] +sub v12.4s, v14.4s, v8.4s +mul v6.4S, v6.4S,v29.s[1] +add v14.4s, v14.4s, v8.4s +mla v18.4S, v2.4S, v31.s[0] +sub v2.4s, v25.4s, v13.4s +sqrdmulh v8.4S, v27.4S, v28.s[2] +add v25.4s, v25.4s, v13.4s +mla v3.4S, v10.4S, v31.s[0] +nop +sqrdmulh v10.4S, v26.4S, v28.s[2] +nop +mla v16.4S, v0.4S, v31.s[0] +nop +sqrdmulh v0.4S, v19.4S, v28.s[1] +nop +mla v6.4S, v24.4S, v31.s[0] +nop +sqrdmulh v24.4S, v15.4S, v28.s[1] +nop +mul v27.4S, v27.4S,v29.s[2] +sub v13.4s, v23.4s, v18.4s +mul v26.4S, v26.4S,v29.s[2] +add v23.4s, v23.4s, v18.4s +mla v27.4S, v8.4S, v31.s[0] +sub v8.4s, v20.4s, v3.4s +mla v26.4S, v10.4S, v31.s[0] +add v20.4s, v20.4s, 
v3.4s +mul v19.4S, v19.4S,v29.s[1] +sub v3.4s, v22.4s, v16.4s +mul v15.4S, v15.4S,v29.s[1] +add v22.4s, v22.4s, v16.4s +mla v19.4S, v0.4S, v31.s[0] +sub v0.4s, v21.4s, v6.4s +mla v15.4S, v24.4S, v31.s[0] +add v21.4s, v21.4s, v6.4s +sqrdmulh v6.4S, v13.4S, v11.s[3] +nop +mul v13.4S, v13.4S,v17.s[3] +nop +sqrdmulh v24.4S, v23.4S, v11.s[2] +sub v16.4s, v12.4s, v27.4s +mul v23.4S, v23.4S,v17.s[2] +add v12.4s, v12.4s, v27.4s +sqrdmulh v27.4S, v3.4S, v11.s[1] +sub v10.4s, v2.4s, v26.4s +mul v3.4S, v3.4S,v17.s[1] +add v2.4s, v2.4s, v26.4s +sqrdmulh v26.4S, v22.4S, v11.s[0] +sub v18.4s, v14.4s, v19.4s +mul v22.4S, v22.4S,v17.s[0] +add v14.4s, v14.4s, v19.4s +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v25.4s, v15.4s +sqrdmulh v19.4S, v8.4S, v11.s[3] +add v25.4s, v25.4s, v15.4s +mla v23.4S, v24.4S, v31.s[0] +nop +sqrdmulh v24.4S, v20.4S, v11.s[2] +nop +mla v3.4S, v27.4S, v31.s[0] +nop +sqrdmulh v27.4S, v0.4S, v11.s[1] +nop +mla v22.4S, v26.4S, v31.s[0] +nop +sqrdmulh v26.4S, v21.4S, v11.s[0] +nop +mul v8.4S, v8.4S,v17.s[3] +sub v15.4s, v16.4s, v13.4s +mul v20.4S, v20.4S,v17.s[2] +add v16.4s, v16.4s, v13.4s +mla v8.4S, v19.4S, v31.s[0] +sub v19.4s, v12.4s, v23.4s +mla v20.4S, v24.4S, v31.s[0] +add v12.4s, v12.4s, v23.4s +mul v0.4S, v0.4S,v17.s[1] +sub v23.4s, v18.4s, v3.4s +mul v21.4S, v21.4S,v17.s[0] +add v18.4s, v18.4s, v3.4s +mla v0.4S, v27.4S, v31.s[0] +sub v27.4s, v14.4s, v22.4s +mla v21.4S, v26.4S, v31.s[0] +add v14.4s, v14.4s, v22.4s +sqrdmulh v22.4S, v15.4S, v9.s[3] +nop +mul v15.4S, v15.4S,v30.s[3] +nop +sqrdmulh v26.4S, v16.4S, v9.s[2] +sub v3.4s, v10.4s, v8.4s +mul v16.4S, v16.4S,v30.s[2] +add v10.4s, v10.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v9.s[1] +sub v24.4s, v2.4s, v20.4s +mul v19.4S, v19.4S,v30.s[1] +add v2.4s, v2.4s, v20.4s +sqrdmulh v20.4S, v12.4S, v9.s[0] +sub v13.4s, v6.4s, v0.4s +mul v12.4S, v12.4S,v30.s[0] +add v6.4s, v6.4s, v0.4s +mla v15.4S, v22.4S, v31.s[0] +sub v22.4s, v25.4s, v21.4s +sqrdmulh v0.4S, v23.4S, v7.s[3] +add v25.4s, v25.4s, v21.4s +mla 
v16.4S, v26.4S, v31.s[0] +sub v26.4s, v3.4s, v15.4s +sqrdmulh v21.4S, v18.4S, v7.s[2] +add v3.4s, v3.4s, v15.4s +mla v19.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v16.4s +sqrdmulh v15.4S, v27.4S, v7.s[1] +add v10.4s, v10.4s, v16.4s +mla v12.4S, v20.4S, v31.s[0] +sub v20.4s, v24.4s, v19.4s +sqrdmulh v16.4S, v14.4S, v7.s[0] +add v24.4s, v24.4s, v19.4s +mul v23.4S, v23.4S,v1.s[3] +sub v19.4s, v2.4s, v12.4s +mul v18.4S, v18.4S,v1.s[2] +add v2.4s, v2.4s, v12.4s +mla v23.4S, v0.4S, v31.s[0] +str q26, [x0, #1008] +mla v18.4S, v21.4S, v31.s[0] +str q3, [x0, #944] +mul v27.4S, v27.4S,v1.s[1] +str q8, [x0, #880] +mul v14.4S, v14.4S,v1.s[0] +str q10, [x0, #816] +mla v27.4S, v15.4S, v31.s[0] +str q20, [x0, #752] +mla v14.4S, v16.4S, v31.s[0] +str q24, [x0, #688] +ldr q24, [x0, #960] +sqrdmulh v16.4S, v24.4S, v28.s[0] +str q19, [x0, #624] +mul v24.4S, v24.4S,v29.s[0] +str q2, [x0, #560] +ldr q2, [x0, #896] +sqrdmulh v19.4S, v2.4S, v28.s[0] +sub v20.4s, v13.4s, v23.4s +str q20, [x0, #496] +mul v2.4S, v2.4S,v29.s[0] +add v13.4s, v13.4s, v23.4s +ldr q23, [x0, #832] +sqrdmulh v20.4S, v23.4S, v28.s[0] +sub v15.4s, v6.4s, v18.4s +str q13, [x0, #432] +mul v23.4S, v23.4S,v29.s[0] +add v6.4s, v6.4s, v18.4s +ldr q18, [x0, #768] +sqrdmulh v13.4S, v18.4S, v28.s[0] +sub v10.4s, v22.4s, v27.4s +str q15, [x0, #368] +mul v18.4S, v18.4S,v29.s[0] +add v22.4s, v22.4s, v27.4s +ldr q27, [x0, #704] +mla v24.4S, v16.4S, v31.s[0] +sub v16.4s, v25.4s, v14.4s +str q6, [x0, #304] +sqrdmulh v6.4S, v27.4S, v28.s[0] +add v25.4s, v25.4s, v14.4s +ldr q14, [x0, #640] +mla v2.4S, v19.4S, v31.s[0] +str q10, [x0, #240] +sqrdmulh v10.4S, v14.4S, v28.s[0] +nop +ldr q19, [x0, #576] +mla v23.4S, v20.4S, v31.s[0] +str q22, [x0, #176] +sqrdmulh v22.4S, v19.4S, v28.s[0] +nop +ldr q20, [x0, #512] +mla v18.4S, v13.4S, v31.s[0] +nop +sqrdmulh v13.4S, v20.4S, v28.s[0] +str q16, [x0, #112] +ldr q16, [x0, #448] +ldr q15, [x0, #384] +mul v27.4S, v27.4S,v29.s[0] +sub v8.4s, v16.4s, v24.4s +str q25, [x0, #48] +mul v14.4S, 
v14.4S,v29.s[0] +add v16.4s, v16.4s, v24.4s +ldr q24, [x0, #320] +ldr q25, [x0, #256] +mla v27.4S, v6.4S, v31.s[0] +sub v6.4s, v15.4s, v2.4s +mla v14.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v2.4s +ldr q2, [x0, #192] +ldr q10, [x0, #128] +mul v19.4S, v19.4S,v29.s[0] +sub v3.4s, v24.4s, v23.4s +mul v20.4S, v20.4S,v29.s[0] +add v24.4s, v24.4s, v23.4s +ldr q23, [x0, #64] +ldr q21, [x0, #0] +mla v19.4S, v22.4S, v31.s[0] +sub v22.4s, v25.4s, v18.4s +mla v20.4S, v13.4S, v31.s[0] +add v25.4s, v25.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v28.s[2] +nop +mul v8.4S, v8.4S,v29.s[2] +nop +sqrdmulh v13.4S, v6.4S, v28.s[2] +sub v26.4s, v2.4s, v27.4s +mul v6.4S, v6.4S,v29.s[2] +add v2.4s, v2.4s, v27.4s +sqrdmulh v27.4S, v16.4S, v28.s[1] +sub v0.4s, v10.4s, v14.4s +mul v16.4S, v16.4S,v29.s[1] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v28.s[1] +sub v12.4s, v23.4s, v19.4s +mul v15.4S, v15.4S,v29.s[1] +add v23.4s, v23.4s, v19.4s +mla v8.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v20.4s +sqrdmulh v19.4S, v3.4S, v28.s[2] +add v21.4s, v21.4s, v20.4s +mla v6.4S, v13.4S, v31.s[0] +nop +sqrdmulh v13.4S, v22.4S, v28.s[2] +nop +mla v16.4S, v27.4S, v31.s[0] +nop +sqrdmulh v27.4S, v24.4S, v28.s[1] +nop +mla v15.4S, v14.4S, v31.s[0] +nop +sqrdmulh v14.4S, v25.4S, v28.s[1] +nop +mul v3.4S, v3.4S,v29.s[2] +sub v20.4s, v26.4s, v8.4s +mul v22.4S, v22.4S,v29.s[2] +add v26.4s, v26.4s, v8.4s +mla v3.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v6.4s +mla v22.4S, v13.4S, v31.s[0] +add v0.4s, v0.4s, v6.4s +mul v24.4S, v24.4S,v29.s[1] +sub v6.4s, v2.4s, v16.4s +mul v25.4S, v25.4S,v29.s[1] +add v2.4s, v2.4s, v16.4s +mla v24.4S, v27.4S, v31.s[0] +sub v27.4s, v10.4s, v15.4s +mla v25.4S, v14.4S, v31.s[0] +add v10.4s, v10.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v11.s[3] +nop +mul v20.4S, v20.4S,v17.s[3] +nop +sqrdmulh v14.4S, v26.4S, v11.s[2] +sub v16.4s, v12.4s, v3.4s +mul v26.4S, v26.4S,v17.s[2] +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v6.4S, v11.s[1] +sub v13.4s, v18.4s, v22.4s +mul v6.4S, v6.4S,v17.s[1] 
+add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v2.4S, v11.s[0] +sub v8.4s, v23.4s, v24.4s +mul v2.4S, v2.4S,v17.s[0] +add v23.4s, v23.4s, v24.4s +mla v20.4S, v15.4S, v31.s[0] +sub v15.4s, v21.4s, v25.4s +sqrdmulh v24.4S, v19.4S, v11.s[3] +add v21.4s, v21.4s, v25.4s +mla v26.4S, v14.4S, v31.s[0] +nop +sqrdmulh v14.4S, v0.4S, v11.s[2] +nop +mla v6.4S, v3.4S, v31.s[0] +nop +sqrdmulh v3.4S, v27.4S, v11.s[1] +nop +mla v2.4S, v22.4S, v31.s[0] +nop +sqrdmulh v22.4S, v10.4S, v11.s[0] +nop +mul v19.4S, v19.4S,v17.s[3] +sub v25.4s, v16.4s, v20.4s +mul v0.4S, v0.4S,v17.s[2] +add v16.4s, v16.4s, v20.4s +mla v19.4S, v24.4S, v31.s[0] +sub v24.4s, v12.4s, v26.4s +mla v0.4S, v14.4S, v31.s[0] +add v12.4s, v12.4s, v26.4s +mul v27.4S, v27.4S,v17.s[1] +sub v26.4s, v8.4s, v6.4s +mul v10.4S, v10.4S,v17.s[0] +add v8.4s, v8.4s, v6.4s +mla v27.4S, v3.4S, v31.s[0] +sub v3.4s, v23.4s, v2.4s +mla v10.4S, v22.4S, v31.s[0] +add v23.4s, v23.4s, v2.4s +sqrdmulh v2.4S, v25.4S, v9.s[3] +nop +mul v25.4S, v25.4S,v30.s[3] +nop +sqrdmulh v22.4S, v16.4S, v9.s[2] +sub v6.4s, v13.4s, v19.4s +mul v16.4S, v16.4S,v30.s[2] +add v13.4s, v13.4s, v19.4s +sqrdmulh v19.4S, v24.4S, v9.s[1] +sub v14.4s, v18.4s, v0.4s +mul v24.4S, v24.4S,v30.s[1] +add v18.4s, v18.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v9.s[0] +sub v20.4s, v15.4s, v27.4s +mul v12.4S, v12.4S,v30.s[0] +add v15.4s, v15.4s, v27.4s +mla v25.4S, v2.4S, v31.s[0] +sub v2.4s, v21.4s, v10.4s +sqrdmulh v27.4S, v26.4S, v7.s[3] +add v21.4s, v21.4s, v10.4s +mla v16.4S, v22.4S, v31.s[0] +sub v22.4s, v6.4s, v25.4s +sqrdmulh v10.4S, v8.4S, v7.s[2] +add v6.4s, v6.4s, v25.4s +mla v24.4S, v19.4S, v31.s[0] +sub v19.4s, v13.4s, v16.4s +sqrdmulh v25.4S, v3.4S, v7.s[1] +add v13.4s, v13.4s, v16.4s +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v14.4s, v24.4s +sqrdmulh v16.4S, v23.4S, v7.s[0] +add v14.4s, v14.4s, v24.4s +mul v26.4S, v26.4S,v1.s[3] +sub v24.4s, v18.4s, v12.4s +mul v8.4S, v8.4S,v1.s[2] +add v18.4s, v18.4s, v12.4s +mla v26.4S, v27.4S, v31.s[0] +str q22, [x0, #960] +mla 
v8.4S, v10.4S, v31.s[0] +str q6, [x0, #896] +mul v3.4S, v3.4S,v1.s[1] +str q19, [x0, #832] +mul v23.4S, v23.4S,v1.s[0] +str q13, [x0, #768] +mla v3.4S, v25.4S, v31.s[0] +str q0, [x0, #704] +mla v23.4S, v16.4S, v31.s[0] +str q14, [x0, #640] +ldr q14, [x0, #976] +sqrdmulh v16.4S, v14.4S, v28.s[0] +str q24, [x0, #576] +mul v14.4S, v14.4S,v29.s[0] +str q18, [x0, #512] +ldr q18, [x0, #912] +sqrdmulh v24.4S, v18.4S, v28.s[0] +sub v0.4s, v20.4s, v26.4s +str q0, [x0, #448] +mul v18.4S, v18.4S,v29.s[0] +add v20.4s, v20.4s, v26.4s +ldr q26, [x0, #848] +sqrdmulh v0.4S, v26.4S, v28.s[0] +sub v25.4s, v15.4s, v8.4s +str q20, [x0, #384] +mul v26.4S, v26.4S,v29.s[0] +add v15.4s, v15.4s, v8.4s +ldr q8, [x0, #784] +sqrdmulh v20.4S, v8.4S, v28.s[0] +sub v13.4s, v2.4s, v3.4s +str q25, [x0, #320] +mul v8.4S, v8.4S,v29.s[0] +add v2.4s, v2.4s, v3.4s +ldr q3, [x0, #720] +mla v14.4S, v16.4S, v31.s[0] +sub v16.4s, v21.4s, v23.4s +str q15, [x0, #256] +sqrdmulh v15.4S, v3.4S, v28.s[0] +add v21.4s, v21.4s, v23.4s +ldr q23, [x0, #656] +mla v18.4S, v24.4S, v31.s[0] +str q13, [x0, #192] +sqrdmulh v13.4S, v23.4S, v28.s[0] +nop +ldr q24, [x0, #592] +mla v26.4S, v0.4S, v31.s[0] +str q2, [x0, #128] +sqrdmulh v2.4S, v24.4S, v28.s[0] +nop +ldr q0, [x0, #528] +mla v8.4S, v20.4S, v31.s[0] +nop +sqrdmulh v20.4S, v0.4S, v28.s[0] +str q16, [x0, #64] +ldr q16, [x0, #464] +ldr q25, [x0, #400] +mul v3.4S, v3.4S,v29.s[0] +sub v19.4s, v16.4s, v14.4s +str q21, [x0, #0] +mul v23.4S, v23.4S,v29.s[0] +add v16.4s, v16.4s, v14.4s +ldr q14, [x0, #336] +ldr q21, [x0, #272] +mla v3.4S, v15.4S, v31.s[0] +sub v15.4s, v25.4s, v18.4s +mla v23.4S, v13.4S, v31.s[0] +add v25.4s, v25.4s, v18.4s +ldr q18, [x0, #208] +ldr q13, [x0, #144] +mul v24.4S, v24.4S,v29.s[0] +sub v6.4s, v14.4s, v26.4s +mul v0.4S, v0.4S,v29.s[0] +add v14.4s, v14.4s, v26.4s +ldr q26, [x0, #80] +ldr q10, [x0, #16] +mla v24.4S, v2.4S, v31.s[0] +sub v2.4s, v21.4s, v8.4s +mla v0.4S, v20.4S, v31.s[0] +add v21.4s, v21.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v28.s[2] 
+nop +mul v19.4S, v19.4S,v29.s[2] +nop +sqrdmulh v20.4S, v15.4S, v28.s[2] +sub v22.4s, v18.4s, v3.4s +mul v15.4S, v15.4S,v29.s[2] +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v16.4S, v28.s[1] +sub v27.4s, v13.4s, v23.4s +mul v16.4S, v16.4S,v29.s[1] +add v13.4s, v13.4s, v23.4s +sqrdmulh v23.4S, v25.4S, v28.s[1] +sub v12.4s, v26.4s, v24.4s +mul v25.4S, v25.4S,v29.s[1] +add v26.4s, v26.4s, v24.4s +mla v19.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v0.4s +sqrdmulh v24.4S, v6.4S, v28.s[2] +add v10.4s, v10.4s, v0.4s +mla v15.4S, v20.4S, v31.s[0] +nop +sqrdmulh v20.4S, v2.4S, v28.s[2] +nop +mla v16.4S, v3.4S, v31.s[0] +nop +sqrdmulh v3.4S, v14.4S, v28.s[1] +nop +mla v25.4S, v23.4S, v31.s[0] +nop +sqrdmulh v23.4S, v21.4S, v28.s[1] +nop +mul v6.4S, v6.4S,v29.s[2] +sub v0.4s, v22.4s, v19.4s +mul v2.4S, v2.4S,v29.s[2] +add v22.4s, v22.4s, v19.4s +mla v6.4S, v24.4S, v31.s[0] +sub v24.4s, v27.4s, v15.4s +mla v2.4S, v20.4S, v31.s[0] +add v27.4s, v27.4s, v15.4s +mul v14.4S, v14.4S,v29.s[1] +sub v15.4s, v18.4s, v16.4s +mul v21.4S, v21.4S,v29.s[1] +add v18.4s, v18.4s, v16.4s +mla v14.4S, v3.4S, v31.s[0] +sub v3.4s, v13.4s, v25.4s +mla v21.4S, v23.4S, v31.s[0] +add v13.4s, v13.4s, v25.4s +sqrdmulh v28.4S, v0.4S, v11.s[3] +nop +mul v0.4S, v0.4S,v17.s[3] +nop +sqrdmulh v29.4S, v22.4S, v11.s[2] +sub v25.4s, v12.4s, v6.4s +mul v22.4S, v22.4S,v17.s[2] +add v12.4s, v12.4s, v6.4s +sqrdmulh v6.4S, v15.4S, v11.s[1] +sub v23.4s, v8.4s, v2.4s +mul v15.4S, v15.4S,v17.s[1] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v18.4S, v11.s[0] +sub v16.4s, v26.4s, v14.4s +mul v18.4S, v18.4S,v17.s[0] +add v26.4s, v26.4s, v14.4s +mla v0.4S, v28.4S, v31.s[0] +sub v28.4s, v10.4s, v21.4s +sqrdmulh v14.4S, v24.4S, v11.s[3] +add v10.4s, v10.4s, v21.4s +mla v22.4S, v29.4S, v31.s[0] +nop +sqrdmulh v29.4S, v27.4S, v11.s[2] +nop +mla v15.4S, v6.4S, v31.s[0] +nop +sqrdmulh v6.4S, v3.4S, v11.s[1] +nop +mla v18.4S, v2.4S, v31.s[0] +nop +sqrdmulh v2.4S, v13.4S, v11.s[0] +nop +mul v24.4S, v24.4S,v17.s[3] +sub v21.4s, v25.4s, 
v0.4s +mul v27.4S, v27.4S,v17.s[2] +add v25.4s, v25.4s, v0.4s +mla v24.4S, v14.4S, v31.s[0] +sub v14.4s, v12.4s, v22.4s +mla v27.4S, v29.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s +mul v3.4S, v3.4S,v17.s[1] +sub v22.4s, v16.4s, v15.4s +mul v13.4S, v13.4S,v17.s[0] +add v16.4s, v16.4s, v15.4s +mla v3.4S, v6.4S, v31.s[0] +sub v6.4s, v26.4s, v18.4s +mla v13.4S, v2.4S, v31.s[0] +add v26.4s, v26.4s, v18.4s +sqrdmulh v11.4S, v21.4S, v9.s[3] +nop +mul v21.4S, v21.4S,v30.s[3] +nop +sqrdmulh v17.4S, v25.4S, v9.s[2] +sub v18.4s, v23.4s, v24.4s +mul v25.4S, v25.4S,v30.s[2] +add v23.4s, v23.4s, v24.4s +sqrdmulh v24.4S, v14.4S, v9.s[1] +sub v2.4s, v8.4s, v27.4s +mul v14.4S, v14.4S,v30.s[1] +add v8.4s, v8.4s, v27.4s +sqrdmulh v27.4S, v12.4S, v9.s[0] +sub v15.4s, v28.4s, v3.4s +mul v12.4S, v12.4S,v30.s[0] +add v28.4s, v28.4s, v3.4s +mla v21.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v13.4s +sqrdmulh v9.4S, v22.4S, v7.s[3] +add v10.4s, v10.4s, v13.4s +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v18.4s, v21.4s +sqrdmulh v13.4S, v16.4S, v7.s[2] +add v18.4s, v18.4s, v21.4s +mla v14.4S, v24.4S, v31.s[0] +sub v24.4s, v23.4s, v25.4s +sqrdmulh v21.4S, v6.4S, v7.s[1] +add v23.4s, v23.4s, v25.4s +mla v12.4S, v27.4S, v31.s[0] +sub v27.4s, v2.4s, v14.4s +sqrdmulh v25.4S, v26.4S, v7.s[0] +add v2.4s, v2.4s, v14.4s +mul v22.4S, v22.4S,v1.s[3] +sub v14.4s, v8.4s, v12.4s +mul v16.4S, v16.4S,v1.s[2] +add v8.4s, v8.4s, v12.4s +mla v22.4S, v9.4S, v31.s[0] +str q17, [x0, #976] +mla v16.4S, v13.4S, v31.s[0] +str q18, [x0, #912] +mul v6.4S, v6.4S,v1.s[1] +str q24, [x0, #848] +mul v26.4S, v26.4S,v1.s[0] +str q23, [x0, #784] +mla v6.4S, v21.4S, v31.s[0] +str q27, [x0, #720] +mla v26.4S, v25.4S, v31.s[0] +str q2, [x0, #656] +str q14, [x0, #592] +str q8, [x0, #528] +sub v8.4s, v15.4s, v22.4s +str q8, [x0, #464] +add v15.4s, v15.4s, v22.4s +sub v22.4s, v28.4s, v16.4s +str q15, [x0, #400] +add v28.4s, v28.4s, v16.4s +sub v16.4s, v11.4s, v6.4s +str q22, [x0, #336] +add v11.4s, v11.4s, v6.4s +sub v6.4s, v10.4s, 
v26.4s +str q28, [x0, #272] +add v10.4s, v10.4s, v26.4s +str q16, [x0, #208] +str q11, [x0, #144] +str q6, [x0, #80] +str q10, [x0, #16] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q19, [x17, #+160] +ldr q20, [x17, #+176] +ldr q0, [x17, #+192] +ldr q29, [x17, #+208] +ldr q3, [x17, #+224] +ldr q30, [x17, #+240] +ldr q12, [x0, #32] +ldr q9, [x0, #48] +ldr q17, [x0, #0] +ldr q13, [x0, #16] +sqrdmulh v18.4S, v12.4S, v5.s[0] +mul v12.4S, v12.4S,v4.s[0] +mla v12.4S, v18.4S, v31.s[0] +sub v18.4s, v17.4s, v12.4s +add v17.4s, v17.4s, v12.4s +sqrdmulh v12.4S, v9.4S, v5.s[0] +mul v9.4S, v9.4S,v4.s[0] +mla v9.4S, v12.4S, v31.s[0] +sub v12.4s, v13.4s, v9.4s +add v13.4s, v13.4s, v9.4s +sqrdmulh v9.4S, v13.4S, v5.s[1] +mul v13.4S, v13.4S,v4.s[1] +mla v13.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +sqrdmulh v13.4S, v12.4S, v5.s[2] +mul v12.4S, v12.4S,v4.s[2] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v18.4s, v12.4s +add v18.4s, v18.4s, v12.4s +trn1 v12.4S, v17.4S, v9.4S +trn2 v24.4S, v17.4S, v9.4S +trn1 v23.4S, v18.4S, v13.4S +trn2 v21.4S, v18.4S, v13.4S +trn2 v18.2D, v12.2D, v23.2D +trn2 v13.2D, v24.2D, v21.2D +trn1 v17.2D, v12.2D, v23.2D +trn1 v9.2D, v24.2D, v21.2D +sqrdmulh v21.4S, v18.4S, v20.4S +mul v18.4S, v18.4S,v19.4S +mla v18.4S, v21.4S, v31.s[0] +sub v21.4s, v17.4s, v18.4s +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v13.4S, v20.4S +mul v13.4S, v13.4S,v19.4S +mla v13.4S, v18.4S, v31.s[0] +sub v18.4s, v9.4s, v13.4s +add v9.4s, v9.4s, v13.4s +sqrdmulh v13.4S, v9.4S, v29.4S +mul v9.4S, v9.4S,v0.4S +mla v9.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v9.4s +add v17.4s, v17.4s, v9.4s +sqrdmulh v9.4S, v18.4S, v30.4S +mul v18.4S, v18.4S,v3.4S +mla v18.4S, v9.4S, v31.s[0] +sub v9.4s, v21.4s, v18.4s +add v21.4s, v21.4s, v18.4s +str q17, [x0, #0] +str q13, [x0, #16] +str q21, [x0, #32] +str q9, [x0, #48] +ldr q9, [x17, #+256] +ldr q21, [x17, #+272] +ldr q13, [x17, #+288] +ldr q17, [x17, #+304] +ldr q18, [x17, #+320] +ldr q24, [x17, #+336] +ldr 
q23, [x17, #+352] +ldr q12, [x17, #+368] +ldr q30, [x0, #96] +ldr q3, [x0, #112] +ldr q29, [x0, #64] +ldr q0, [x0, #80] +sqrdmulh v20.4S, v30.4S, v21.s[0] +mul v30.4S, v30.4S,v9.s[0] +mla v30.4S, v20.4S, v31.s[0] +sub v20.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +sqrdmulh v30.4S, v3.4S, v21.s[0] +mul v3.4S, v3.4S,v9.s[0] +mla v3.4S, v30.4S, v31.s[0] +sub v30.4s, v0.4s, v3.4s +add v0.4s, v0.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v21.s[1] +mul v0.4S, v0.4S,v9.s[1] +mla v0.4S, v3.4S, v31.s[0] +sub v3.4s, v29.4s, v0.4s +add v29.4s, v29.4s, v0.4s +sqrdmulh v0.4S, v30.4S, v21.s[2] +mul v30.4S, v30.4S,v9.s[2] +mla v30.4S, v0.4S, v31.s[0] +sub v0.4s, v20.4s, v30.4s +add v20.4s, v20.4s, v30.4s +trn1 v30.4S, v29.4S, v3.4S +trn2 v19.4S, v29.4S, v3.4S +trn1 v5.4S, v20.4S, v0.4S +trn2 v4.4S, v20.4S, v0.4S +trn2 v20.2D, v30.2D, v5.2D +trn2 v0.2D, v19.2D, v4.2D +trn1 v29.2D, v30.2D, v5.2D +trn1 v3.2D, v19.2D, v4.2D +sqrdmulh v4.4S, v20.4S, v17.4S +mul v20.4S, v20.4S,v13.4S +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v29.4s, v20.4s +add v29.4s, v29.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v17.4S +mul v0.4S, v0.4S,v13.4S +mla v0.4S, v20.4S, v31.s[0] +sub v20.4s, v3.4s, v0.4s +add v3.4s, v3.4s, v0.4s +sqrdmulh v0.4S, v3.4S, v24.4S +mul v3.4S, v3.4S,v18.4S +mla v3.4S, v0.4S, v31.s[0] +sub v0.4s, v29.4s, v3.4s +add v29.4s, v29.4s, v3.4s +sqrdmulh v3.4S, v20.4S, v12.4S +mul v20.4S, v20.4S,v23.4S +mla v20.4S, v3.4S, v31.s[0] +sub v3.4s, v4.4s, v20.4s +add v4.4s, v4.4s, v20.4s +str q29, [x0, #64] +str q0, [x0, #80] +str q4, [x0, #96] +str q3, [x0, #112] +ldr q3, [x17, #+384] +ldr q4, [x17, #+400] +ldr q0, [x17, #+416] +ldr q29, [x17, #+432] +ldr q20, [x17, #+448] +ldr q19, [x17, #+464] +ldr q5, [x17, #+480] +ldr q30, [x17, #+496] +ldr q12, [x0, #160] +ldr q23, [x0, #176] +ldr q24, [x0, #128] +ldr q18, [x0, #144] +sqrdmulh v17.4S, v12.4S, v4.s[0] +mul v12.4S, v12.4S,v3.s[0] +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v24.4s, v12.4s +add v24.4s, v24.4s, v12.4s +sqrdmulh v12.4S, v23.4S, v4.s[0] 
+mul v23.4S, v23.4S,v3.s[0] +mla v23.4S, v12.4S, v31.s[0] +sub v12.4s, v18.4s, v23.4s +add v18.4s, v18.4s, v23.4s +sqrdmulh v23.4S, v18.4S, v4.s[1] +mul v18.4S, v18.4S,v3.s[1] +mla v18.4S, v23.4S, v31.s[0] +sub v23.4s, v24.4s, v18.4s +add v24.4s, v24.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v4.s[2] +mul v12.4S, v12.4S,v3.s[2] +mla v12.4S, v18.4S, v31.s[0] +sub v18.4s, v17.4s, v12.4s +add v17.4s, v17.4s, v12.4s +trn1 v12.4S, v24.4S, v23.4S +trn2 v13.4S, v24.4S, v23.4S +trn1 v21.4S, v17.4S, v18.4S +trn2 v9.4S, v17.4S, v18.4S +trn2 v17.2D, v12.2D, v21.2D +trn2 v18.2D, v13.2D, v9.2D +trn1 v24.2D, v12.2D, v21.2D +trn1 v23.2D, v13.2D, v9.2D +sqrdmulh v9.4S, v17.4S, v29.4S +mul v17.4S, v17.4S,v0.4S +mla v17.4S, v9.4S, v31.s[0] +sub v9.4s, v24.4s, v17.4s +add v24.4s, v24.4s, v17.4s +sqrdmulh v17.4S, v18.4S, v29.4S +mul v18.4S, v18.4S,v0.4S +mla v18.4S, v17.4S, v31.s[0] +sub v17.4s, v23.4s, v18.4s +add v23.4s, v23.4s, v18.4s +sqrdmulh v18.4S, v23.4S, v19.4S +mul v23.4S, v23.4S,v20.4S +mla v23.4S, v18.4S, v31.s[0] +sub v18.4s, v24.4s, v23.4s +add v24.4s, v24.4s, v23.4s +sqrdmulh v23.4S, v17.4S, v30.4S +mul v17.4S, v17.4S,v5.4S +mla v17.4S, v23.4S, v31.s[0] +sub v23.4s, v9.4s, v17.4s +add v9.4s, v9.4s, v17.4s +str q24, [x0, #128] +str q18, [x0, #144] +str q9, [x0, #160] +str q23, [x0, #176] +ldr q23, [x17, #+512] +ldr q9, [x17, #+528] +ldr q18, [x17, #+544] +ldr q24, [x17, #+560] +ldr q17, [x17, #+576] +ldr q13, [x17, #+592] +ldr q21, [x17, #+608] +ldr q12, [x17, #+624] +ldr q30, [x0, #224] +ldr q5, [x0, #240] +ldr q19, [x0, #192] +ldr q20, [x0, #208] +sqrdmulh v29.4S, v30.4S, v9.s[0] +mul v30.4S, v30.4S,v23.s[0] +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v19.4s, v30.4s +add v19.4s, v19.4s, v30.4s +sqrdmulh v30.4S, v5.4S, v9.s[0] +mul v5.4S, v5.4S,v23.s[0] +mla v5.4S, v30.4S, v31.s[0] +sub v30.4s, v20.4s, v5.4s +add v20.4s, v20.4s, v5.4s +sqrdmulh v5.4S, v20.4S, v9.s[1] +mul v20.4S, v20.4S,v23.s[1] +mla v20.4S, v5.4S, v31.s[0] +sub v5.4s, v19.4s, v20.4s +add v19.4s, v19.4s, 
v20.4s +sqrdmulh v20.4S, v30.4S, v9.s[2] +mul v30.4S, v30.4S,v23.s[2] +mla v30.4S, v20.4S, v31.s[0] +sub v20.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +trn1 v30.4S, v19.4S, v5.4S +trn2 v0.4S, v19.4S, v5.4S +trn1 v4.4S, v29.4S, v20.4S +trn2 v3.4S, v29.4S, v20.4S +trn2 v29.2D, v30.2D, v4.2D +trn2 v20.2D, v0.2D, v3.2D +trn1 v19.2D, v30.2D, v4.2D +trn1 v5.2D, v0.2D, v3.2D +sqrdmulh v3.4S, v29.4S, v24.4S +mul v29.4S, v29.4S,v18.4S +mla v29.4S, v3.4S, v31.s[0] +sub v3.4s, v19.4s, v29.4s +add v19.4s, v19.4s, v29.4s +sqrdmulh v29.4S, v20.4S, v24.4S +mul v20.4S, v20.4S,v18.4S +mla v20.4S, v29.4S, v31.s[0] +sub v29.4s, v5.4s, v20.4s +add v5.4s, v5.4s, v20.4s +sqrdmulh v20.4S, v5.4S, v13.4S +mul v5.4S, v5.4S,v17.4S +mla v5.4S, v20.4S, v31.s[0] +sub v20.4s, v19.4s, v5.4s +add v19.4s, v19.4s, v5.4s +sqrdmulh v5.4S, v29.4S, v12.4S +mul v29.4S, v29.4S,v21.4S +mla v29.4S, v5.4S, v31.s[0] +sub v5.4s, v3.4s, v29.4s +add v3.4s, v3.4s, v29.4s +str q19, [x0, #192] +str q20, [x0, #208] +str q3, [x0, #224] +str q5, [x0, #240] +ldr q5, [x17, #+640] +ldr q3, [x17, #+656] +ldr q20, [x17, #+672] +ldr q19, [x17, #+688] +ldr q29, [x17, #+704] +ldr q0, [x17, #+720] +ldr q4, [x17, #+736] +ldr q30, [x17, #+752] +ldr q12, [x0, #288] +ldr q21, [x0, #304] +ldr q13, [x0, #256] +ldr q17, [x0, #272] +sqrdmulh v24.4S, v12.4S, v3.s[0] +mul v12.4S, v12.4S,v5.s[0] +mla v12.4S, v24.4S, v31.s[0] +sub v24.4s, v13.4s, v12.4s +add v13.4s, v13.4s, v12.4s +sqrdmulh v12.4S, v21.4S, v3.s[0] +mul v21.4S, v21.4S,v5.s[0] +mla v21.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v3.s[1] +mul v17.4S, v17.4S,v5.s[1] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v13.4s, v17.4s +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v12.4S, v3.s[2] +mul v12.4S, v12.4S,v5.s[2] +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v24.4s, v12.4s +add v24.4s, v24.4s, v12.4s +trn1 v12.4S, v13.4S, v21.4S +trn2 v18.4S, v13.4S, v21.4S +trn1 v9.4S, v24.4S, v17.4S +trn2 v23.4S, v24.4S, v17.4S 
+trn2 v24.2D, v12.2D, v9.2D +trn2 v17.2D, v18.2D, v23.2D +trn1 v13.2D, v12.2D, v9.2D +trn1 v21.2D, v18.2D, v23.2D +sqrdmulh v23.4S, v24.4S, v19.4S +mul v24.4S, v24.4S,v20.4S +mla v24.4S, v23.4S, v31.s[0] +sub v23.4s, v13.4s, v24.4s +add v13.4s, v13.4s, v24.4s +sqrdmulh v24.4S, v17.4S, v19.4S +mul v17.4S, v17.4S,v20.4S +mla v17.4S, v24.4S, v31.s[0] +sub v24.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v0.4S +mul v21.4S, v21.4S,v29.4S +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v24.4S, v30.4S +mul v24.4S, v24.4S,v4.4S +mla v24.4S, v21.4S, v31.s[0] +sub v21.4s, v23.4s, v24.4s +add v23.4s, v23.4s, v24.4s +str q13, [x0, #256] +str q17, [x0, #272] +str q23, [x0, #288] +str q21, [x0, #304] +ldr q21, [x17, #+768] +ldr q23, [x17, #+784] +ldr q17, [x17, #+800] +ldr q13, [x17, #+816] +ldr q24, [x17, #+832] +ldr q18, [x17, #+848] +ldr q9, [x17, #+864] +ldr q12, [x17, #+880] +ldr q30, [x0, #352] +ldr q4, [x0, #368] +ldr q0, [x0, #320] +ldr q29, [x0, #336] +sqrdmulh v19.4S, v30.4S, v23.s[0] +mul v30.4S, v30.4S,v21.s[0] +mla v30.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v30.4s +add v0.4s, v0.4s, v30.4s +sqrdmulh v30.4S, v4.4S, v23.s[0] +mul v4.4S, v4.4S,v21.s[0] +mla v4.4S, v30.4S, v31.s[0] +sub v30.4s, v29.4s, v4.4s +add v29.4s, v29.4s, v4.4s +sqrdmulh v4.4S, v29.4S, v23.s[1] +mul v29.4S, v29.4S,v21.s[1] +mla v29.4S, v4.4S, v31.s[0] +sub v4.4s, v0.4s, v29.4s +add v0.4s, v0.4s, v29.4s +sqrdmulh v29.4S, v30.4S, v23.s[2] +mul v30.4S, v30.4S,v21.s[2] +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v19.4s, v30.4s +add v19.4s, v19.4s, v30.4s +trn1 v30.4S, v0.4S, v4.4S +trn2 v20.4S, v0.4S, v4.4S +trn1 v3.4S, v19.4S, v29.4S +trn2 v5.4S, v19.4S, v29.4S +trn2 v19.2D, v30.2D, v3.2D +trn2 v29.2D, v20.2D, v5.2D +trn1 v0.2D, v30.2D, v3.2D +trn1 v4.2D, v20.2D, v5.2D +sqrdmulh v5.4S, v19.4S, v13.4S +mul v19.4S, v19.4S,v17.4S +mla v19.4S, v5.4S, v31.s[0] +sub v5.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh 
v19.4S, v29.4S, v13.4S +mul v29.4S, v29.4S,v17.4S +mla v29.4S, v19.4S, v31.s[0] +sub v19.4s, v4.4s, v29.4s +add v4.4s, v4.4s, v29.4s +sqrdmulh v29.4S, v4.4S, v18.4S +mul v4.4S, v4.4S,v24.4S +mla v4.4S, v29.4S, v31.s[0] +sub v29.4s, v0.4s, v4.4s +add v0.4s, v0.4s, v4.4s +sqrdmulh v4.4S, v19.4S, v12.4S +mul v19.4S, v19.4S,v9.4S +mla v19.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v19.4s +add v5.4s, v5.4s, v19.4s +str q0, [x0, #320] +str q29, [x0, #336] +str q5, [x0, #352] +str q4, [x0, #368] +ldr q4, [x17, #+896] +ldr q5, [x17, #+912] +ldr q29, [x17, #+928] +ldr q0, [x17, #+944] +ldr q19, [x17, #+960] +ldr q20, [x17, #+976] +ldr q3, [x17, #+992] +ldr q30, [x17, #+1008] +ldr q12, [x0, #416] +ldr q9, [x0, #432] +ldr q18, [x0, #384] +ldr q24, [x0, #400] +sqrdmulh v13.4S, v12.4S, v5.s[0] +mul v12.4S, v12.4S,v4.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v18.4s, v12.4s +add v18.4s, v18.4s, v12.4s +sqrdmulh v12.4S, v9.4S, v5.s[0] +mul v9.4S, v9.4S,v4.s[0] +mla v9.4S, v12.4S, v31.s[0] +sub v12.4s, v24.4s, v9.4s +add v24.4s, v24.4s, v9.4s +sqrdmulh v9.4S, v24.4S, v5.s[1] +mul v24.4S, v24.4S,v4.s[1] +mla v24.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v24.4s +add v18.4s, v18.4s, v24.4s +sqrdmulh v24.4S, v12.4S, v5.s[2] +mul v12.4S, v12.4S,v4.s[2] +mla v12.4S, v24.4S, v31.s[0] +sub v24.4s, v13.4s, v12.4s +add v13.4s, v13.4s, v12.4s +trn1 v12.4S, v18.4S, v9.4S +trn2 v17.4S, v18.4S, v9.4S +trn1 v23.4S, v13.4S, v24.4S +trn2 v21.4S, v13.4S, v24.4S +trn2 v13.2D, v12.2D, v23.2D +trn2 v24.2D, v17.2D, v21.2D +trn1 v18.2D, v12.2D, v23.2D +trn1 v9.2D, v17.2D, v21.2D +sqrdmulh v21.4S, v13.4S, v0.4S +mul v13.4S, v13.4S,v29.4S +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v18.4s, v13.4s +add v18.4s, v18.4s, v13.4s +sqrdmulh v13.4S, v24.4S, v0.4S +mul v24.4S, v24.4S,v29.4S +mla v24.4S, v13.4S, v31.s[0] +sub v13.4s, v9.4s, v24.4s +add v9.4s, v9.4s, v24.4s +sqrdmulh v24.4S, v9.4S, v20.4S +mul v9.4S, v9.4S,v19.4S +mla v9.4S, v24.4S, v31.s[0] +sub v24.4s, v18.4s, v9.4s +add v18.4s, v18.4s, v9.4s 
+sqrdmulh v9.4S, v13.4S, v30.4S +mul v13.4S, v13.4S,v3.4S +mla v13.4S, v9.4S, v31.s[0] +sub v9.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +str q18, [x0, #384] +str q24, [x0, #400] +str q21, [x0, #416] +str q9, [x0, #432] +ldr q9, [x17, #+1024] +ldr q21, [x17, #+1040] +ldr q24, [x17, #+1056] +ldr q18, [x17, #+1072] +ldr q13, [x17, #+1088] +ldr q17, [x17, #+1104] +ldr q23, [x17, #+1120] +ldr q12, [x17, #+1136] +ldr q30, [x0, #480] +ldr q3, [x0, #496] +ldr q20, [x0, #448] +ldr q19, [x0, #464] +sqrdmulh v0.4S, v30.4S, v21.s[0] +mul v30.4S, v30.4S,v9.s[0] +mla v30.4S, v0.4S, v31.s[0] +sub v0.4s, v20.4s, v30.4s +add v20.4s, v20.4s, v30.4s +sqrdmulh v30.4S, v3.4S, v21.s[0] +mul v3.4S, v3.4S,v9.s[0] +mla v3.4S, v30.4S, v31.s[0] +sub v30.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sqrdmulh v3.4S, v19.4S, v21.s[1] +mul v19.4S, v19.4S,v9.s[1] +mla v19.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +sqrdmulh v19.4S, v30.4S, v21.s[2] +mul v30.4S, v30.4S,v9.s[2] +mla v30.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v30.4s +add v0.4s, v0.4s, v30.4s +trn1 v30.4S, v20.4S, v3.4S +trn2 v29.4S, v20.4S, v3.4S +trn1 v5.4S, v0.4S, v19.4S +trn2 v4.4S, v0.4S, v19.4S +trn2 v0.2D, v30.2D, v5.2D +trn2 v19.2D, v29.2D, v4.2D +trn1 v20.2D, v30.2D, v5.2D +trn1 v3.2D, v29.2D, v4.2D +sqrdmulh v4.4S, v0.4S, v18.4S +mul v0.4S, v0.4S,v24.4S +mla v0.4S, v4.4S, v31.s[0] +sub v4.4s, v20.4s, v0.4s +add v20.4s, v20.4s, v0.4s +sqrdmulh v0.4S, v19.4S, v18.4S +mul v19.4S, v19.4S,v24.4S +mla v19.4S, v0.4S, v31.s[0] +sub v0.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v3.4S, v17.4S +mul v3.4S, v3.4S,v13.4S +mla v3.4S, v19.4S, v31.s[0] +sub v19.4s, v20.4s, v3.4s +add v20.4s, v20.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v12.4S +mul v0.4S, v0.4S,v23.4S +mla v0.4S, v3.4S, v31.s[0] +sub v3.4s, v4.4s, v0.4s +add v4.4s, v4.4s, v0.4s +str q20, [x0, #448] +str q19, [x0, #464] +str q4, [x0, #480] +str q3, [x0, #496] +ldr q3, [x17, #+1152] +ldr q4, [x17, #+1168] +ldr q19, [x17, 
#+1184] +ldr q20, [x17, #+1200] +ldr q0, [x17, #+1216] +ldr q29, [x17, #+1232] +ldr q5, [x17, #+1248] +ldr q30, [x17, #+1264] +ldr q12, [x0, #544] +ldr q23, [x0, #560] +ldr q17, [x0, #512] +ldr q13, [x0, #528] +sqrdmulh v18.4S, v12.4S, v4.s[0] +mul v12.4S, v12.4S,v3.s[0] +mla v12.4S, v18.4S, v31.s[0] +sub v18.4s, v17.4s, v12.4s +add v17.4s, v17.4s, v12.4s +sqrdmulh v12.4S, v23.4S, v4.s[0] +mul v23.4S, v23.4S,v3.s[0] +mla v23.4S, v12.4S, v31.s[0] +sub v12.4s, v13.4s, v23.4s +add v13.4s, v13.4s, v23.4s +sqrdmulh v23.4S, v13.4S, v4.s[1] +mul v13.4S, v13.4S,v3.s[1] +mla v13.4S, v23.4S, v31.s[0] +sub v23.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +sqrdmulh v13.4S, v12.4S, v4.s[2] +mul v12.4S, v12.4S,v3.s[2] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v18.4s, v12.4s +add v18.4s, v18.4s, v12.4s +trn1 v12.4S, v17.4S, v23.4S +trn2 v24.4S, v17.4S, v23.4S +trn1 v21.4S, v18.4S, v13.4S +trn2 v9.4S, v18.4S, v13.4S +trn2 v18.2D, v12.2D, v21.2D +trn2 v13.2D, v24.2D, v9.2D +trn1 v17.2D, v12.2D, v21.2D +trn1 v23.2D, v24.2D, v9.2D +sqrdmulh v9.4S, v18.4S, v20.4S +mul v18.4S, v18.4S,v19.4S +mla v18.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v18.4s +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v13.4S, v20.4S +mul v13.4S, v13.4S,v19.4S +mla v13.4S, v18.4S, v31.s[0] +sub v18.4s, v23.4s, v13.4s +add v23.4s, v23.4s, v13.4s +sqrdmulh v13.4S, v23.4S, v29.4S +mul v23.4S, v23.4S,v0.4S +mla v23.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v23.4s +add v17.4s, v17.4s, v23.4s +sqrdmulh v23.4S, v18.4S, v30.4S +mul v18.4S, v18.4S,v5.4S +mla v18.4S, v23.4S, v31.s[0] +sub v23.4s, v9.4s, v18.4s +add v9.4s, v9.4s, v18.4s +str q17, [x0, #512] +str q13, [x0, #528] +str q9, [x0, #544] +str q23, [x0, #560] +ldr q23, [x17, #+1280] +ldr q9, [x17, #+1296] +ldr q13, [x17, #+1312] +ldr q17, [x17, #+1328] +ldr q18, [x17, #+1344] +ldr q24, [x17, #+1360] +ldr q21, [x17, #+1376] +ldr q12, [x17, #+1392] +ldr q30, [x0, #608] +ldr q5, [x0, #624] +ldr q29, [x0, #576] +ldr q0, [x0, #592] +sqrdmulh v20.4S, v30.4S, v9.s[0] 
+mul v30.4S, v30.4S,v23.s[0] +mla v30.4S, v20.4S, v31.s[0] +sub v20.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +sqrdmulh v30.4S, v5.4S, v9.s[0] +mul v5.4S, v5.4S,v23.s[0] +mla v5.4S, v30.4S, v31.s[0] +sub v30.4s, v0.4s, v5.4s +add v0.4s, v0.4s, v5.4s +sqrdmulh v5.4S, v0.4S, v9.s[1] +mul v0.4S, v0.4S,v23.s[1] +mla v0.4S, v5.4S, v31.s[0] +sub v5.4s, v29.4s, v0.4s +add v29.4s, v29.4s, v0.4s +sqrdmulh v0.4S, v30.4S, v9.s[2] +mul v30.4S, v30.4S,v23.s[2] +mla v30.4S, v0.4S, v31.s[0] +sub v0.4s, v20.4s, v30.4s +add v20.4s, v20.4s, v30.4s +trn1 v30.4S, v29.4S, v5.4S +trn2 v19.4S, v29.4S, v5.4S +trn1 v4.4S, v20.4S, v0.4S +trn2 v3.4S, v20.4S, v0.4S +trn2 v20.2D, v30.2D, v4.2D +trn2 v0.2D, v19.2D, v3.2D +trn1 v29.2D, v30.2D, v4.2D +trn1 v5.2D, v19.2D, v3.2D +sqrdmulh v3.4S, v20.4S, v17.4S +mul v20.4S, v20.4S,v13.4S +mla v20.4S, v3.4S, v31.s[0] +sub v3.4s, v29.4s, v20.4s +add v29.4s, v29.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v17.4S +mul v0.4S, v0.4S,v13.4S +mla v0.4S, v20.4S, v31.s[0] +sub v20.4s, v5.4s, v0.4s +add v5.4s, v5.4s, v0.4s +sqrdmulh v0.4S, v5.4S, v24.4S +mul v5.4S, v5.4S,v18.4S +mla v5.4S, v0.4S, v31.s[0] +sub v0.4s, v29.4s, v5.4s +add v29.4s, v29.4s, v5.4s +sqrdmulh v5.4S, v20.4S, v12.4S +mul v20.4S, v20.4S,v21.4S +mla v20.4S, v5.4S, v31.s[0] +sub v5.4s, v3.4s, v20.4s +add v3.4s, v3.4s, v20.4s +str q29, [x0, #576] +str q0, [x0, #592] +str q3, [x0, #608] +str q5, [x0, #624] +ldr q5, [x17, #+1408] +ldr q3, [x17, #+1424] +ldr q0, [x17, #+1440] +ldr q29, [x17, #+1456] +ldr q20, [x17, #+1472] +ldr q19, [x17, #+1488] +ldr q4, [x17, #+1504] +ldr q30, [x17, #+1520] +ldr q12, [x0, #672] +ldr q21, [x0, #688] +ldr q24, [x0, #640] +ldr q18, [x0, #656] +sqrdmulh v17.4S, v12.4S, v3.s[0] +mul v12.4S, v12.4S,v5.s[0] +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v24.4s, v12.4s +add v24.4s, v24.4s, v12.4s +sqrdmulh v12.4S, v21.4S, v3.s[0] +mul v21.4S, v21.4S,v5.s[0] +mla v21.4S, v12.4S, v31.s[0] +sub v12.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v18.4S, 
v3.s[1] +mul v18.4S, v18.4S,v5.s[1] +mla v18.4S, v21.4S, v31.s[0] +sub v21.4s, v24.4s, v18.4s +add v24.4s, v24.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v3.s[2] +mul v12.4S, v12.4S,v5.s[2] +mla v12.4S, v18.4S, v31.s[0] +sub v18.4s, v17.4s, v12.4s +add v17.4s, v17.4s, v12.4s +trn1 v12.4S, v24.4S, v21.4S +trn2 v13.4S, v24.4S, v21.4S +trn1 v9.4S, v17.4S, v18.4S +trn2 v23.4S, v17.4S, v18.4S +trn2 v17.2D, v12.2D, v9.2D +trn2 v18.2D, v13.2D, v23.2D +trn1 v24.2D, v12.2D, v9.2D +trn1 v21.2D, v13.2D, v23.2D +sqrdmulh v23.4S, v17.4S, v29.4S +mul v17.4S, v17.4S,v0.4S +mla v17.4S, v23.4S, v31.s[0] +sub v23.4s, v24.4s, v17.4s +add v24.4s, v24.4s, v17.4s +sqrdmulh v17.4S, v18.4S, v29.4S +mul v18.4S, v18.4S,v0.4S +mla v18.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v18.4s +add v21.4s, v21.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v19.4S +mul v21.4S, v21.4S,v20.4S +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v24.4s, v21.4s +add v24.4s, v24.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v30.4S +mul v17.4S, v17.4S,v4.4S +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v23.4s, v17.4s +add v23.4s, v23.4s, v17.4s +str q24, [x0, #640] +str q18, [x0, #656] +str q23, [x0, #672] +str q21, [x0, #688] +ldr q21, [x17, #+1536] +ldr q23, [x17, #+1552] +ldr q18, [x17, #+1568] +ldr q24, [x17, #+1584] +ldr q17, [x17, #+1600] +ldr q13, [x17, #+1616] +ldr q9, [x17, #+1632] +ldr q12, [x17, #+1648] +ldr q30, [x0, #736] +ldr q4, [x0, #752] +ldr q19, [x0, #704] +ldr q20, [x0, #720] +sqrdmulh v29.4S, v30.4S, v23.s[0] +mul v30.4S, v30.4S,v21.s[0] +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v19.4s, v30.4s +add v19.4s, v19.4s, v30.4s +sqrdmulh v30.4S, v4.4S, v23.s[0] +mul v4.4S, v4.4S,v21.s[0] +mla v4.4S, v30.4S, v31.s[0] +sub v30.4s, v20.4s, v4.4s +add v20.4s, v20.4s, v4.4s +sqrdmulh v4.4S, v20.4S, v23.s[1] +mul v20.4S, v20.4S,v21.s[1] +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +sqrdmulh v20.4S, v30.4S, v23.s[2] +mul v30.4S, v30.4S,v21.s[2] +mla v30.4S, v20.4S, v31.s[0] +sub v20.4s, v29.4s, 
v30.4s +add v29.4s, v29.4s, v30.4s +trn1 v30.4S, v19.4S, v4.4S +trn2 v0.4S, v19.4S, v4.4S +trn1 v3.4S, v29.4S, v20.4S +trn2 v5.4S, v29.4S, v20.4S +trn2 v29.2D, v30.2D, v3.2D +trn2 v20.2D, v0.2D, v5.2D +trn1 v19.2D, v30.2D, v3.2D +trn1 v4.2D, v0.2D, v5.2D +sqrdmulh v5.4S, v29.4S, v24.4S +mul v29.4S, v29.4S,v18.4S +mla v29.4S, v5.4S, v31.s[0] +sub v5.4s, v19.4s, v29.4s +add v19.4s, v19.4s, v29.4s +sqrdmulh v29.4S, v20.4S, v24.4S +mul v20.4S, v20.4S,v18.4S +mla v20.4S, v29.4S, v31.s[0] +sub v29.4s, v4.4s, v20.4s +add v4.4s, v4.4s, v20.4s +sqrdmulh v20.4S, v4.4S, v13.4S +mul v4.4S, v4.4S,v17.4S +mla v4.4S, v20.4S, v31.s[0] +sub v20.4s, v19.4s, v4.4s +add v19.4s, v19.4s, v4.4s +sqrdmulh v4.4S, v29.4S, v12.4S +mul v29.4S, v29.4S,v9.4S +mla v29.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v29.4s +add v5.4s, v5.4s, v29.4s +str q19, [x0, #704] +str q20, [x0, #720] +str q5, [x0, #736] +str q4, [x0, #752] +ldr q4, [x17, #+1664] +ldr q5, [x17, #+1680] +ldr q20, [x17, #+1696] +ldr q19, [x17, #+1712] +ldr q29, [x17, #+1728] +ldr q0, [x17, #+1744] +ldr q3, [x17, #+1760] +ldr q30, [x17, #+1776] +ldr q12, [x0, #800] +ldr q9, [x0, #816] +ldr q13, [x0, #768] +ldr q17, [x0, #784] +sqrdmulh v24.4S, v12.4S, v5.s[0] +mul v12.4S, v12.4S,v4.s[0] +mla v12.4S, v24.4S, v31.s[0] +sub v24.4s, v13.4s, v12.4s +add v13.4s, v13.4s, v12.4s +sqrdmulh v12.4S, v9.4S, v5.s[0] +mul v9.4S, v9.4S,v4.s[0] +mla v9.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v9.4s +add v17.4s, v17.4s, v9.4s +sqrdmulh v9.4S, v17.4S, v5.s[1] +mul v17.4S, v17.4S,v4.s[1] +mla v17.4S, v9.4S, v31.s[0] +sub v9.4s, v13.4s, v17.4s +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v12.4S, v5.s[2] +mul v12.4S, v12.4S,v4.s[2] +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v24.4s, v12.4s +add v24.4s, v24.4s, v12.4s +trn1 v12.4S, v13.4S, v9.4S +trn2 v18.4S, v13.4S, v9.4S +trn1 v23.4S, v24.4S, v17.4S +trn2 v21.4S, v24.4S, v17.4S +trn2 v24.2D, v12.2D, v23.2D +trn2 v17.2D, v18.2D, v21.2D +trn1 v13.2D, v12.2D, v23.2D +trn1 v9.2D, v18.2D, v21.2D +sqrdmulh 
v21.4S, v24.4S, v19.4S +mul v24.4S, v24.4S,v20.4S +mla v24.4S, v21.4S, v31.s[0] +sub v21.4s, v13.4s, v24.4s +add v13.4s, v13.4s, v24.4s +sqrdmulh v24.4S, v17.4S, v19.4S +mul v17.4S, v17.4S,v20.4S +mla v17.4S, v24.4S, v31.s[0] +sub v24.4s, v9.4s, v17.4s +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v9.4S, v0.4S +mul v9.4S, v9.4S,v29.4S +mla v9.4S, v17.4S, v31.s[0] +sub v17.4s, v13.4s, v9.4s +add v13.4s, v13.4s, v9.4s +sqrdmulh v9.4S, v24.4S, v30.4S +mul v24.4S, v24.4S,v3.4S +mla v24.4S, v9.4S, v31.s[0] +sub v9.4s, v21.4s, v24.4s +add v21.4s, v21.4s, v24.4s +str q13, [x0, #768] +str q17, [x0, #784] +str q21, [x0, #800] +str q9, [x0, #816] +ldr q9, [x17, #+1792] +ldr q21, [x17, #+1808] +ldr q17, [x17, #+1824] +ldr q13, [x17, #+1840] +ldr q24, [x17, #+1856] +ldr q18, [x17, #+1872] +ldr q23, [x17, #+1888] +ldr q12, [x17, #+1904] +ldr q30, [x0, #864] +ldr q3, [x0, #880] +ldr q0, [x0, #832] +ldr q29, [x0, #848] +sqrdmulh v19.4S, v30.4S, v21.s[0] +mul v30.4S, v30.4S,v9.s[0] +mla v30.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v30.4s +add v0.4s, v0.4s, v30.4s +sqrdmulh v30.4S, v3.4S, v21.s[0] +mul v3.4S, v3.4S,v9.s[0] +mla v3.4S, v30.4S, v31.s[0] +sub v30.4s, v29.4s, v3.4s +add v29.4s, v29.4s, v3.4s +sqrdmulh v3.4S, v29.4S, v21.s[1] +mul v29.4S, v29.4S,v9.s[1] +mla v29.4S, v3.4S, v31.s[0] +sub v3.4s, v0.4s, v29.4s +add v0.4s, v0.4s, v29.4s +sqrdmulh v29.4S, v30.4S, v21.s[2] +mul v30.4S, v30.4S,v9.s[2] +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v19.4s, v30.4s +add v19.4s, v19.4s, v30.4s +trn1 v30.4S, v0.4S, v3.4S +trn2 v20.4S, v0.4S, v3.4S +trn1 v5.4S, v19.4S, v29.4S +trn2 v4.4S, v19.4S, v29.4S +trn2 v19.2D, v30.2D, v5.2D +trn2 v29.2D, v20.2D, v4.2D +trn1 v0.2D, v30.2D, v5.2D +trn1 v3.2D, v20.2D, v4.2D +sqrdmulh v4.4S, v19.4S, v13.4S +mul v19.4S, v19.4S,v17.4S +mla v19.4S, v4.4S, v31.s[0] +sub v4.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v29.4S, v13.4S +mul v29.4S, v29.4S,v17.4S +mla v29.4S, v19.4S, v31.s[0] +sub v19.4s, v3.4s, v29.4s +add v3.4s, v3.4s, v29.4s 
+sqrdmulh v29.4S, v3.4S, v18.4S +mul v3.4S, v3.4S,v24.4S +mla v3.4S, v29.4S, v31.s[0] +sub v29.4s, v0.4s, v3.4s +add v0.4s, v0.4s, v3.4s +sqrdmulh v3.4S, v19.4S, v12.4S +mul v19.4S, v19.4S,v23.4S +mla v19.4S, v3.4S, v31.s[0] +sub v3.4s, v4.4s, v19.4s +add v4.4s, v4.4s, v19.4s +str q0, [x0, #832] +str q29, [x0, #848] +str q4, [x0, #864] +str q3, [x0, #880] +ldr q3, [x17, #+1920] +ldr q4, [x17, #+1936] +ldr q29, [x17, #+1952] +ldr q0, [x17, #+1968] +ldr q19, [x17, #+1984] +ldr q20, [x17, #+2000] +ldr q5, [x17, #+2016] +ldr q30, [x17, #+2032] +ldr q12, [x0, #928] +ldr q23, [x0, #944] +ldr q18, [x0, #896] +ldr q24, [x0, #912] +sqrdmulh v13.4S, v12.4S, v4.s[0] +mul v12.4S, v12.4S,v3.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v18.4s, v12.4s +add v18.4s, v18.4s, v12.4s +sqrdmulh v12.4S, v23.4S, v4.s[0] +mul v23.4S, v23.4S,v3.s[0] +mla v23.4S, v12.4S, v31.s[0] +sub v12.4s, v24.4s, v23.4s +add v24.4s, v24.4s, v23.4s +sqrdmulh v23.4S, v24.4S, v4.s[1] +mul v24.4S, v24.4S,v3.s[1] +mla v24.4S, v23.4S, v31.s[0] +sub v23.4s, v18.4s, v24.4s +add v18.4s, v18.4s, v24.4s +sqrdmulh v24.4S, v12.4S, v4.s[2] +mul v12.4S, v12.4S,v3.s[2] +mla v12.4S, v24.4S, v31.s[0] +sub v24.4s, v13.4s, v12.4s +add v13.4s, v13.4s, v12.4s +trn1 v12.4S, v18.4S, v23.4S +trn2 v17.4S, v18.4S, v23.4S +trn1 v21.4S, v13.4S, v24.4S +trn2 v9.4S, v13.4S, v24.4S +trn2 v13.2D, v12.2D, v21.2D +trn2 v24.2D, v17.2D, v9.2D +trn1 v18.2D, v12.2D, v21.2D +trn1 v23.2D, v17.2D, v9.2D +sqrdmulh v9.4S, v13.4S, v0.4S +mul v13.4S, v13.4S,v29.4S +mla v13.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v13.4s +add v18.4s, v18.4s, v13.4s +sqrdmulh v13.4S, v24.4S, v0.4S +mul v24.4S, v24.4S,v29.4S +mla v24.4S, v13.4S, v31.s[0] +sub v13.4s, v23.4s, v24.4s +add v23.4s, v23.4s, v24.4s +sqrdmulh v24.4S, v23.4S, v20.4S +mul v23.4S, v23.4S,v19.4S +mla v23.4S, v24.4S, v31.s[0] +sub v24.4s, v18.4s, v23.4s +add v18.4s, v18.4s, v23.4s +sqrdmulh v23.4S, v13.4S, v30.4S +mul v13.4S, v13.4S,v5.4S +mla v13.4S, v23.4S, v31.s[0] +sub v23.4s, v9.4s, v13.4s 
+add v9.4s, v9.4s, v13.4s +str q18, [x0, #896] +str q24, [x0, #912] +str q9, [x0, #928] +str q23, [x0, #944] +ldr q23, [x17, #+2048] +ldr q9, [x17, #+2064] +ldr q24, [x17, #+2080] +ldr q18, [x17, #+2096] +ldr q13, [x17, #+2112] +ldr q17, [x17, #+2128] +ldr q21, [x17, #+2144] +ldr q12, [x17, #+2160] +ldr q30, [x0, #992] +ldr q5, [x0, #1008] +ldr q20, [x0, #960] +ldr q19, [x0, #976] +sqrdmulh v0.4S, v30.4S, v9.s[0] +mul v30.4S, v30.4S,v23.s[0] +mla v30.4S, v0.4S, v31.s[0] +sub v0.4s, v20.4s, v30.4s +add v20.4s, v20.4s, v30.4s +sqrdmulh v30.4S, v5.4S, v9.s[0] +mul v5.4S, v5.4S,v23.s[0] +mla v5.4S, v30.4S, v31.s[0] +sub v30.4s, v19.4s, v5.4s +add v19.4s, v19.4s, v5.4s +sqrdmulh v5.4S, v19.4S, v9.s[1] +mul v19.4S, v19.4S,v23.s[1] +mla v19.4S, v5.4S, v31.s[0] +sub v5.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +sqrdmulh v19.4S, v30.4S, v9.s[2] +mul v30.4S, v30.4S,v23.s[2] +mla v30.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v30.4s +add v0.4s, v0.4s, v30.4s +trn1 v30.4S, v20.4S, v5.4S +trn2 v29.4S, v20.4S, v5.4S +trn1 v4.4S, v0.4S, v19.4S +trn2 v3.4S, v0.4S, v19.4S +trn2 v0.2D, v30.2D, v4.2D +trn2 v19.2D, v29.2D, v3.2D +trn1 v20.2D, v30.2D, v4.2D +trn1 v5.2D, v29.2D, v3.2D +sqrdmulh v3.4S, v0.4S, v18.4S +mul v0.4S, v0.4S,v24.4S +mla v0.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v0.4s +add v20.4s, v20.4s, v0.4s +sqrdmulh v0.4S, v19.4S, v18.4S +mul v19.4S, v19.4S,v24.4S +mla v19.4S, v0.4S, v31.s[0] +sub v0.4s, v5.4s, v19.4s +add v5.4s, v5.4s, v19.4s +sqrdmulh v19.4S, v5.4S, v17.4S +mul v5.4S, v5.4S,v13.4S +mla v5.4S, v19.4S, v31.s[0] +sub v19.4s, v20.4s, v5.4s +add v20.4s, v20.4s, v5.4s +sqrdmulh v5.4S, v0.4S, v12.4S +mul v0.4S, v0.4S,v21.4S +mla v0.4S, v5.4S, v31.s[0] +sub v5.4s, v3.4s, v0.4s +add v3.4s, v3.4s, v0.4s +str q20, [x0, #960] +str q19, [x0, #976] +str q3, [x0, #992] +str q5, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs 
+ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2476 +// Instruction count: 2472 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_17_0.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_17_0.s new file mode 100644 index 0000000..eeedda8 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_17_0.s @@ -0,0 +1,2486 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 
1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 
6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 
+.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, 
block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 
31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 
26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 
+.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // 
Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 
28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 
608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_17_0 +.global _ntt_u32_full_neon_asm_var_4_4_17_0 +ntt_u32_full_neon_asm_var_4_4_17_0: +_ntt_u32_full_neon_asm_var_4_4_17_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x0, #992] +sqrdmulh v27.4S, v28.4S, v29.s[0] +mul v28.4S, v28.4S,v30.s[0] +ldr q26, [x0, #928] +sqrdmulh v25.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +ldr q24, [x0, #864] +sqrdmulh v23.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +ldr q22, [x0, #800] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #736] +mla v28.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v20.4S, v29.s[0] +ldr q19, [x0, #672] +mla v26.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v19.4S, v29.s[0] +ldr q18, [x0, #608] +mla v24.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v18.4S, v29.s[0] +ldr q17, [x0, #544] +mla v22.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v17.4S, v29.s[0] +ldr q16, [x0, #480] +ldr q3, [x0, #416] +mul v20.4S, v20.4S,v30.s[0] +sub v2.4s, v16.4s, v28.4s +mul v19.4S, v19.4S,v30.s[0] +add v16.4s, v16.4s, v28.4s +ldr q28, [x0, #352] +ldr q1, [x0, #288] +mla v20.4S, v27.4S, v31.s[0] +sub 
v27.4s, v3.4s, v26.4s +mla v19.4S, v25.4S, v31.s[0] +add v3.4s, v3.4s, v26.4s +ldr q26, [x0, #224] +ldr q25, [x0, #160] +mul v18.4S, v18.4S,v30.s[0] +sub v0.4s, v28.4s, v24.4s +mul v17.4S, v17.4S,v30.s[0] +add v28.4s, v28.4s, v24.4s +ldr q24, [x0, #96] +ldr q15, [x0, #32] +mla v18.4S, v23.4S, v31.s[0] +sub v23.4s, v1.4s, v22.4s +mla v17.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v2.4S, v29.s[2] +nop +mul v2.4S, v2.4S,v30.s[2] +nop +sqrdmulh v21.4S, v27.4S, v29.s[2] +sub v14.4s, v26.4s, v20.4s +mul v27.4S, v27.4S,v30.s[2] +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v29.s[1] +sub v13.4s, v25.4s, v19.4s +mul v16.4S, v16.4S,v30.s[1] +add v25.4s, v25.4s, v19.4s +sqrdmulh v19.4S, v3.4S, v29.s[1] +sub v12.4s, v24.4s, v18.4s +mul v3.4S, v3.4S,v30.s[1] +add v24.4s, v24.4s, v18.4s +mla v2.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v17.4s +sqrdmulh v18.4S, v0.4S, v29.s[2] +add v15.4s, v15.4s, v17.4s +mla v27.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v23.4S, v29.s[2] +nop +mla v16.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v28.4S, v29.s[1] +nop +mla v3.4S, v19.4S, v31.s[0] +nop +sqrdmulh v19.4S, v1.4S, v29.s[1] +nop +ldr q17, [x17, #+32] +ldr q11, [x17, #+48] +mul v0.4S, v0.4S,v30.s[2] +sub v10.4s, v14.4s, v2.4s +mul v23.4S, v23.4S,v30.s[2] +add v14.4s, v14.4s, v2.4s +mla v0.4S, v18.4S, v31.s[0] +sub v18.4s, v13.4s, v27.4s +mla v23.4S, v21.4S, v31.s[0] +add v13.4s, v13.4s, v27.4s +mul v28.4S, v28.4S,v30.s[1] +sub v27.4s, v26.4s, v16.4s +mul v1.4S, v1.4S,v30.s[1] +add v26.4s, v26.4s, v16.4s +mla v28.4S, v20.4S, v31.s[0] +sub v20.4s, v25.4s, v3.4s +mla v1.4S, v19.4S, v31.s[0] +add v25.4s, v25.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v11.s[3] +nop +mul v10.4S, v10.4S,v17.s[3] +nop +sqrdmulh v19.4S, v14.4S, v11.s[2] +sub v16.4s, v12.4s, v0.4s +mul v14.4S, v14.4S,v17.s[2] +add v12.4s, v12.4s, v0.4s +sqrdmulh v0.4S, v27.4S, v11.s[1] +sub v21.4s, v22.4s, v23.4s +mul v27.4S, v27.4S,v17.s[1] +add v22.4s, v22.4s, v23.4s +sqrdmulh v23.4S, v26.4S, v11.s[0] +sub v2.4s, 
v24.4s, v28.4s +mul v26.4S, v26.4S,v17.s[0] +add v24.4s, v24.4s, v28.4s +ldr q28, [x17, #+96] +ldr q9, [x17, #+112] +mla v10.4S, v3.4S, v31.s[0] +sub v3.4s, v15.4s, v1.4s +sqrdmulh v8.4S, v18.4S, v11.s[3] +add v15.4s, v15.4s, v1.4s +mla v14.4S, v19.4S, v31.s[0] +nop +sqrdmulh v19.4S, v13.4S, v11.s[2] +nop +mla v27.4S, v0.4S, v31.s[0] +nop +sqrdmulh v0.4S, v20.4S, v11.s[1] +nop +mla v26.4S, v23.4S, v31.s[0] +nop +sqrdmulh v23.4S, v25.4S, v11.s[0] +nop +ldr q1, [x17, #+64] +ldr q7, [x17, #+80] +mul v18.4S, v18.4S,v17.s[3] +sub v6.4s, v16.4s, v10.4s +mul v13.4S, v13.4S,v17.s[2] +add v16.4s, v16.4s, v10.4s +mla v18.4S, v8.4S, v31.s[0] +sub v8.4s, v12.4s, v14.4s +mla v13.4S, v19.4S, v31.s[0] +add v12.4s, v12.4s, v14.4s +mul v20.4S, v20.4S,v17.s[1] +sub v14.4s, v2.4s, v27.4s +mul v25.4S, v25.4S,v17.s[0] +add v2.4s, v2.4s, v27.4s +mla v20.4S, v0.4S, v31.s[0] +sub v0.4s, v24.4s, v26.4s +mla v25.4S, v23.4S, v31.s[0] +add v24.4s, v24.4s, v26.4s +sqrdmulh v26.4S, v6.4S, v9.s[3] +nop +mul v6.4S, v6.4S,v28.s[3] +nop +sqrdmulh v23.4S, v16.4S, v9.s[2] +sub v27.4s, v21.4s, v18.4s +mul v16.4S, v16.4S,v28.s[2] +add v21.4s, v21.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v9.s[1] +sub v19.4s, v22.4s, v13.4s +mul v8.4S, v8.4S,v28.s[1] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v12.4S, v9.s[0] +sub v10.4s, v3.4s, v20.4s +mul v12.4S, v12.4S,v28.s[0] +add v3.4s, v3.4s, v20.4s +mla v6.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v25.4s +sqrdmulh v20.4S, v14.4S, v7.s[3] +add v15.4s, v15.4s, v25.4s +mla v16.4S, v23.4S, v31.s[0] +sub v23.4s, v27.4s, v6.4s +sqrdmulh v25.4S, v2.4S, v7.s[2] +add v27.4s, v27.4s, v6.4s +mla v8.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v16.4s +sqrdmulh v6.4S, v0.4S, v7.s[1] +add v21.4s, v21.4s, v16.4s +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v19.4s, v8.4s +sqrdmulh v16.4S, v24.4S, v7.s[0] +add v19.4s, v19.4s, v8.4s +mul v14.4S, v14.4S,v1.s[3] +sub v8.4s, v22.4s, v12.4s +mul v2.4S, v2.4S,v1.s[2] +add v22.4s, v22.4s, v12.4s +mla v14.4S, v20.4S, v31.s[0] +str q23, [x0, 
#992] +mla v2.4S, v25.4S, v31.s[0] +str q27, [x0, #928] +mul v0.4S, v0.4S,v1.s[1] +str q18, [x0, #864] +mul v24.4S, v24.4S,v1.s[0] +str q21, [x0, #800] +mla v0.4S, v6.4S, v31.s[0] +str q13, [x0, #736] +mla v24.4S, v16.4S, v31.s[0] +str q19, [x0, #672] +ldr q19, [x0, #1008] +sqrdmulh v16.4S, v19.4S, v29.s[0] +str q8, [x0, #608] +mul v19.4S, v19.4S,v30.s[0] +sub v8.4s, v10.4s, v14.4s +ldr q13, [x0, #944] +sqrdmulh v6.4S, v13.4S, v29.s[0] +str q22, [x0, #544] +mul v13.4S, v13.4S,v30.s[0] +add v10.4s, v10.4s, v14.4s +ldr q14, [x0, #880] +sqrdmulh v22.4S, v14.4S, v29.s[0] +str q8, [x0, #480] +mul v14.4S, v14.4S,v30.s[0] +sub v8.4s, v3.4s, v2.4s +ldr q21, [x0, #816] +sqrdmulh v18.4S, v21.4S, v29.s[0] +str q10, [x0, #416] +mul v21.4S, v21.4S,v30.s[0] +add v3.4s, v3.4s, v2.4s +ldr q2, [x0, #752] +mla v19.4S, v16.4S, v31.s[0] +str q8, [x0, #352] +sqrdmulh v8.4S, v2.4S, v29.s[0] +sub v16.4s, v26.4s, v0.4s +ldr q10, [x0, #688] +mla v13.4S, v6.4S, v31.s[0] +str q3, [x0, #288] +sqrdmulh v3.4S, v10.4S, v29.s[0] +add v26.4s, v26.4s, v0.4s +ldr q0, [x0, #624] +mla v14.4S, v22.4S, v31.s[0] +str q16, [x0, #224] +sqrdmulh v16.4S, v0.4S, v29.s[0] +sub v22.4s, v15.4s, v24.4s +ldr q6, [x0, #560] +mla v21.4S, v18.4S, v31.s[0] +str q26, [x0, #160] +sqrdmulh v26.4S, v6.4S, v29.s[0] +add v15.4s, v15.4s, v24.4s +ldr q24, [x0, #496] +ldr q18, [x0, #432] +mul v2.4S, v2.4S,v30.s[0] +sub v27.4s, v24.4s, v19.4s +mul v10.4S, v10.4S,v30.s[0] +add v24.4s, v24.4s, v19.4s +ldr q19, [x0, #368] +ldr q25, [x0, #304] +mla v2.4S, v8.4S, v31.s[0] +sub v8.4s, v18.4s, v13.4s +mla v10.4S, v3.4S, v31.s[0] +add v18.4s, v18.4s, v13.4s +ldr q13, [x0, #240] +ldr q3, [x0, #176] +mul v0.4S, v0.4S,v30.s[0] +sub v23.4s, v19.4s, v14.4s +mul v6.4S, v6.4S,v30.s[0] +add v19.4s, v19.4s, v14.4s +ldr q14, [x0, #112] +ldr q20, [x0, #48] +mla v0.4S, v16.4S, v31.s[0] +sub v16.4s, v25.4s, v21.4s +mla v6.4S, v26.4S, v31.s[0] +add v25.4s, v25.4s, v21.4s +sqrdmulh v21.4S, v27.4S, v29.s[2] +nop +mul v27.4S, v27.4S,v30.s[2] +nop 
+sqrdmulh v26.4S, v8.4S, v29.s[2] +sub v12.4s, v13.4s, v2.4s +mul v8.4S, v8.4S,v30.s[2] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v24.4S, v29.s[1] +sub v5.4s, v3.4s, v10.4s +mul v24.4S, v24.4S,v30.s[1] +add v3.4s, v3.4s, v10.4s +sqrdmulh v10.4S, v18.4S, v29.s[1] +sub v4.4s, v14.4s, v0.4s +mul v18.4S, v18.4S,v30.s[1] +add v14.4s, v14.4s, v0.4s +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v6.4s +sqrdmulh v0.4S, v23.4S, v29.s[2] +add v20.4s, v20.4s, v6.4s +mla v8.4S, v26.4S, v31.s[0] +str q22, [x0, #96] +sqrdmulh v22.4S, v16.4S, v29.s[2] +nop +mla v24.4S, v2.4S, v31.s[0] +str q15, [x0, #32] +sqrdmulh v15.4S, v19.4S, v29.s[1] +nop +mla v18.4S, v10.4S, v31.s[0] +nop +sqrdmulh v10.4S, v25.4S, v29.s[1] +nop +mul v23.4S, v23.4S,v30.s[2] +sub v2.4s, v12.4s, v27.4s +mul v16.4S, v16.4S,v30.s[2] +add v12.4s, v12.4s, v27.4s +mla v23.4S, v0.4S, v31.s[0] +sub v0.4s, v5.4s, v8.4s +mla v16.4S, v22.4S, v31.s[0] +add v5.4s, v5.4s, v8.4s +mul v19.4S, v19.4S,v30.s[1] +sub v8.4s, v13.4s, v24.4s +mul v25.4S, v25.4S,v30.s[1] +add v13.4s, v13.4s, v24.4s +mla v19.4S, v15.4S, v31.s[0] +sub v15.4s, v3.4s, v18.4s +mla v25.4S, v10.4S, v31.s[0] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v11.s[3] +nop +mul v2.4S, v2.4S,v17.s[3] +nop +sqrdmulh v10.4S, v12.4S, v11.s[2] +sub v24.4s, v4.4s, v23.4s +mul v12.4S, v12.4S,v17.s[2] +add v4.4s, v4.4s, v23.4s +sqrdmulh v23.4S, v8.4S, v11.s[1] +sub v22.4s, v21.4s, v16.4s +mul v8.4S, v8.4S,v17.s[1] +add v21.4s, v21.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v11.s[0] +sub v27.4s, v14.4s, v19.4s +mul v13.4S, v13.4S,v17.s[0] +add v14.4s, v14.4s, v19.4s +mla v2.4S, v18.4S, v31.s[0] +sub v18.4s, v20.4s, v25.4s +sqrdmulh v19.4S, v0.4S, v11.s[3] +add v20.4s, v20.4s, v25.4s +mla v12.4S, v10.4S, v31.s[0] +nop +sqrdmulh v10.4S, v5.4S, v11.s[2] +nop +mla v8.4S, v23.4S, v31.s[0] +nop +sqrdmulh v23.4S, v15.4S, v11.s[1] +nop +mla v13.4S, v16.4S, v31.s[0] +nop +sqrdmulh v16.4S, v3.4S, v11.s[0] +nop +mul v0.4S, v0.4S,v17.s[3] +sub v25.4s, v24.4s, v2.4s +mul v5.4S, 
v5.4S,v17.s[2] +add v24.4s, v24.4s, v2.4s +mla v0.4S, v19.4S, v31.s[0] +sub v19.4s, v4.4s, v12.4s +mla v5.4S, v10.4S, v31.s[0] +add v4.4s, v4.4s, v12.4s +mul v15.4S, v15.4S,v17.s[1] +sub v12.4s, v27.4s, v8.4s +mul v3.4S, v3.4S,v17.s[0] +add v27.4s, v27.4s, v8.4s +mla v15.4S, v23.4S, v31.s[0] +sub v23.4s, v14.4s, v13.4s +mla v3.4S, v16.4S, v31.s[0] +add v14.4s, v14.4s, v13.4s +sqrdmulh v13.4S, v25.4S, v9.s[3] +nop +mul v25.4S, v25.4S,v28.s[3] +nop +sqrdmulh v16.4S, v24.4S, v9.s[2] +sub v8.4s, v22.4s, v0.4s +mul v24.4S, v24.4S,v28.s[2] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v19.4S, v9.s[1] +sub v10.4s, v21.4s, v5.4s +mul v19.4S, v19.4S,v28.s[1] +add v21.4s, v21.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v9.s[0] +sub v2.4s, v18.4s, v15.4s +mul v4.4S, v4.4S,v28.s[0] +add v18.4s, v18.4s, v15.4s +mla v25.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v3.4s +sqrdmulh v15.4S, v12.4S, v7.s[3] +add v20.4s, v20.4s, v3.4s +mla v24.4S, v16.4S, v31.s[0] +sub v16.4s, v8.4s, v25.4s +sqrdmulh v3.4S, v27.4S, v7.s[2] +add v8.4s, v8.4s, v25.4s +mla v19.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v24.4s +sqrdmulh v25.4S, v23.4S, v7.s[1] +add v22.4s, v22.4s, v24.4s +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v10.4s, v19.4s +sqrdmulh v24.4S, v14.4S, v7.s[0] +add v10.4s, v10.4s, v19.4s +mul v12.4S, v12.4S,v1.s[3] +sub v19.4s, v21.4s, v4.4s +mul v27.4S, v27.4S,v1.s[2] +add v21.4s, v21.4s, v4.4s +mla v12.4S, v15.4S, v31.s[0] +str q16, [x0, #1008] +mla v27.4S, v3.4S, v31.s[0] +str q8, [x0, #944] +mul v23.4S, v23.4S,v1.s[1] +str q0, [x0, #880] +mul v14.4S, v14.4S,v1.s[0] +str q22, [x0, #816] +mla v23.4S, v25.4S, v31.s[0] +str q5, [x0, #752] +mla v14.4S, v24.4S, v31.s[0] +str q10, [x0, #688] +ldr q10, [x0, #960] +sqrdmulh v24.4S, v10.4S, v29.s[0] +str q19, [x0, #624] +mul v10.4S, v10.4S,v30.s[0] +sub v19.4s, v2.4s, v12.4s +ldr q5, [x0, #896] +sqrdmulh v25.4S, v5.4S, v29.s[0] +str q21, [x0, #560] +mul v5.4S, v5.4S,v30.s[0] +add v2.4s, v2.4s, v12.4s +ldr q12, [x0, #832] +sqrdmulh v21.4S, v12.4S, v29.s[0] +str 
q19, [x0, #496] +mul v12.4S, v12.4S,v30.s[0] +sub v19.4s, v18.4s, v27.4s +ldr q22, [x0, #768] +sqrdmulh v0.4S, v22.4S, v29.s[0] +str q2, [x0, #432] +mul v22.4S, v22.4S,v30.s[0] +add v18.4s, v18.4s, v27.4s +ldr q27, [x0, #704] +mla v10.4S, v24.4S, v31.s[0] +str q19, [x0, #368] +sqrdmulh v19.4S, v27.4S, v29.s[0] +sub v24.4s, v13.4s, v23.4s +ldr q2, [x0, #640] +mla v5.4S, v25.4S, v31.s[0] +str q18, [x0, #304] +sqrdmulh v18.4S, v2.4S, v29.s[0] +add v13.4s, v13.4s, v23.4s +ldr q23, [x0, #576] +mla v12.4S, v21.4S, v31.s[0] +str q24, [x0, #240] +sqrdmulh v24.4S, v23.4S, v29.s[0] +sub v21.4s, v20.4s, v14.4s +ldr q25, [x0, #512] +mla v22.4S, v0.4S, v31.s[0] +str q13, [x0, #176] +sqrdmulh v13.4S, v25.4S, v29.s[0] +add v20.4s, v20.4s, v14.4s +ldr q14, [x0, #448] +ldr q0, [x0, #384] +mul v27.4S, v27.4S,v30.s[0] +sub v8.4s, v14.4s, v10.4s +mul v2.4S, v2.4S,v30.s[0] +add v14.4s, v14.4s, v10.4s +ldr q10, [x0, #320] +ldr q3, [x0, #256] +mla v27.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v5.4s +mla v2.4S, v18.4S, v31.s[0] +add v0.4s, v0.4s, v5.4s +ldr q5, [x0, #192] +ldr q18, [x0, #128] +mul v23.4S, v23.4S,v30.s[0] +sub v16.4s, v10.4s, v12.4s +mul v25.4S, v25.4S,v30.s[0] +add v10.4s, v10.4s, v12.4s +ldr q12, [x0, #64] +ldr q15, [x0, #0] +mla v23.4S, v24.4S, v31.s[0] +sub v24.4s, v3.4s, v22.4s +mla v25.4S, v13.4S, v31.s[0] +add v3.4s, v3.4s, v22.4s +sqrdmulh v22.4S, v8.4S, v29.s[2] +nop +mul v8.4S, v8.4S,v30.s[2] +nop +sqrdmulh v13.4S, v19.4S, v29.s[2] +sub v4.4s, v5.4s, v27.4s +mul v19.4S, v19.4S,v30.s[2] +add v5.4s, v5.4s, v27.4s +sqrdmulh v27.4S, v14.4S, v29.s[1] +sub v26.4s, v18.4s, v2.4s +mul v14.4S, v14.4S,v30.s[1] +add v18.4s, v18.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v29.s[1] +sub v6.4s, v12.4s, v23.4s +mul v0.4S, v0.4S,v30.s[1] +add v12.4s, v12.4s, v23.4s +mla v8.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v25.4s +sqrdmulh v23.4S, v16.4S, v29.s[2] +add v15.4s, v15.4s, v25.4s +mla v19.4S, v13.4S, v31.s[0] +str q21, [x0, #112] +sqrdmulh v21.4S, v24.4S, v29.s[2] +nop +mla v14.4S, 
v27.4S, v31.s[0] +str q20, [x0, #48] +sqrdmulh v20.4S, v10.4S, v29.s[1] +nop +mla v0.4S, v2.4S, v31.s[0] +nop +sqrdmulh v2.4S, v3.4S, v29.s[1] +nop +mul v16.4S, v16.4S,v30.s[2] +sub v27.4s, v4.4s, v8.4s +mul v24.4S, v24.4S,v30.s[2] +add v4.4s, v4.4s, v8.4s +mla v16.4S, v23.4S, v31.s[0] +sub v23.4s, v26.4s, v19.4s +mla v24.4S, v21.4S, v31.s[0] +add v26.4s, v26.4s, v19.4s +mul v10.4S, v10.4S,v30.s[1] +sub v19.4s, v5.4s, v14.4s +mul v3.4S, v3.4S,v30.s[1] +add v5.4s, v5.4s, v14.4s +mla v10.4S, v20.4S, v31.s[0] +sub v20.4s, v18.4s, v0.4s +mla v3.4S, v2.4S, v31.s[0] +add v18.4s, v18.4s, v0.4s +sqrdmulh v0.4S, v27.4S, v11.s[3] +nop +mul v27.4S, v27.4S,v17.s[3] +nop +sqrdmulh v2.4S, v4.4S, v11.s[2] +sub v14.4s, v6.4s, v16.4s +mul v4.4S, v4.4S,v17.s[2] +add v6.4s, v6.4s, v16.4s +sqrdmulh v16.4S, v19.4S, v11.s[1] +sub v21.4s, v22.4s, v24.4s +mul v19.4S, v19.4S,v17.s[1] +add v22.4s, v22.4s, v24.4s +sqrdmulh v24.4S, v5.4S, v11.s[0] +sub v8.4s, v12.4s, v10.4s +mul v5.4S, v5.4S,v17.s[0] +add v12.4s, v12.4s, v10.4s +mla v27.4S, v0.4S, v31.s[0] +sub v0.4s, v15.4s, v3.4s +sqrdmulh v10.4S, v23.4S, v11.s[3] +add v15.4s, v15.4s, v3.4s +mla v4.4S, v2.4S, v31.s[0] +nop +sqrdmulh v2.4S, v26.4S, v11.s[2] +nop +mla v19.4S, v16.4S, v31.s[0] +nop +sqrdmulh v16.4S, v20.4S, v11.s[1] +nop +mla v5.4S, v24.4S, v31.s[0] +nop +sqrdmulh v24.4S, v18.4S, v11.s[0] +nop +mul v23.4S, v23.4S,v17.s[3] +sub v3.4s, v14.4s, v27.4s +mul v26.4S, v26.4S,v17.s[2] +add v14.4s, v14.4s, v27.4s +mla v23.4S, v10.4S, v31.s[0] +sub v10.4s, v6.4s, v4.4s +mla v26.4S, v2.4S, v31.s[0] +add v6.4s, v6.4s, v4.4s +mul v20.4S, v20.4S,v17.s[1] +sub v4.4s, v8.4s, v19.4s +mul v18.4S, v18.4S,v17.s[0] +add v8.4s, v8.4s, v19.4s +mla v20.4S, v16.4S, v31.s[0] +sub v16.4s, v12.4s, v5.4s +mla v18.4S, v24.4S, v31.s[0] +add v12.4s, v12.4s, v5.4s +sqrdmulh v5.4S, v3.4S, v9.s[3] +nop +mul v3.4S, v3.4S,v28.s[3] +nop +sqrdmulh v24.4S, v14.4S, v9.s[2] +sub v19.4s, v21.4s, v23.4s +mul v14.4S, v14.4S,v28.s[2] +add v21.4s, v21.4s, v23.4s +sqrdmulh 
v23.4S, v10.4S, v9.s[1] +sub v2.4s, v22.4s, v26.4s +mul v10.4S, v10.4S,v28.s[1] +add v22.4s, v22.4s, v26.4s +sqrdmulh v26.4S, v6.4S, v9.s[0] +sub v27.4s, v0.4s, v20.4s +mul v6.4S, v6.4S,v28.s[0] +add v0.4s, v0.4s, v20.4s +mla v3.4S, v5.4S, v31.s[0] +sub v5.4s, v15.4s, v18.4s +sqrdmulh v20.4S, v4.4S, v7.s[3] +add v15.4s, v15.4s, v18.4s +mla v14.4S, v24.4S, v31.s[0] +sub v24.4s, v19.4s, v3.4s +sqrdmulh v18.4S, v8.4S, v7.s[2] +add v19.4s, v19.4s, v3.4s +mla v10.4S, v23.4S, v31.s[0] +sub v23.4s, v21.4s, v14.4s +sqrdmulh v3.4S, v16.4S, v7.s[1] +add v21.4s, v21.4s, v14.4s +mla v6.4S, v26.4S, v31.s[0] +sub v26.4s, v2.4s, v10.4s +sqrdmulh v14.4S, v12.4S, v7.s[0] +add v2.4s, v2.4s, v10.4s +mul v4.4S, v4.4S,v1.s[3] +sub v10.4s, v22.4s, v6.4s +mul v8.4S, v8.4S,v1.s[2] +add v22.4s, v22.4s, v6.4s +mla v4.4S, v20.4S, v31.s[0] +str q24, [x0, #960] +mla v8.4S, v18.4S, v31.s[0] +str q19, [x0, #896] +mul v16.4S, v16.4S,v1.s[1] +str q23, [x0, #832] +mul v12.4S, v12.4S,v1.s[0] +str q21, [x0, #768] +mla v16.4S, v3.4S, v31.s[0] +str q26, [x0, #704] +mla v12.4S, v14.4S, v31.s[0] +str q2, [x0, #640] +ldr q2, [x0, #976] +sqrdmulh v14.4S, v2.4S, v29.s[0] +str q10, [x0, #576] +mul v2.4S, v2.4S,v30.s[0] +sub v10.4s, v27.4s, v4.4s +ldr q26, [x0, #912] +sqrdmulh v3.4S, v26.4S, v29.s[0] +str q22, [x0, #512] +mul v26.4S, v26.4S,v30.s[0] +add v27.4s, v27.4s, v4.4s +ldr q4, [x0, #848] +sqrdmulh v22.4S, v4.4S, v29.s[0] +str q10, [x0, #448] +mul v4.4S, v4.4S,v30.s[0] +sub v10.4s, v0.4s, v8.4s +ldr q21, [x0, #784] +sqrdmulh v23.4S, v21.4S, v29.s[0] +str q27, [x0, #384] +mul v21.4S, v21.4S,v30.s[0] +add v0.4s, v0.4s, v8.4s +ldr q8, [x0, #720] +mla v2.4S, v14.4S, v31.s[0] +str q10, [x0, #320] +sqrdmulh v10.4S, v8.4S, v29.s[0] +sub v14.4s, v5.4s, v16.4s +ldr q27, [x0, #656] +mla v26.4S, v3.4S, v31.s[0] +str q0, [x0, #256] +sqrdmulh v0.4S, v27.4S, v29.s[0] +add v5.4s, v5.4s, v16.4s +ldr q16, [x0, #592] +mla v4.4S, v22.4S, v31.s[0] +str q14, [x0, #192] +sqrdmulh v14.4S, v16.4S, v29.s[0] +sub v22.4s, 
v15.4s, v12.4s +ldr q3, [x0, #528] +mla v21.4S, v23.4S, v31.s[0] +str q5, [x0, #128] +sqrdmulh v5.4S, v3.4S, v29.s[0] +add v15.4s, v15.4s, v12.4s +ldr q12, [x0, #464] +ldr q23, [x0, #400] +mul v8.4S, v8.4S,v30.s[0] +sub v19.4s, v12.4s, v2.4s +mul v27.4S, v27.4S,v30.s[0] +add v12.4s, v12.4s, v2.4s +ldr q2, [x0, #336] +ldr q18, [x0, #272] +mla v8.4S, v10.4S, v31.s[0] +sub v10.4s, v23.4s, v26.4s +mla v27.4S, v0.4S, v31.s[0] +add v23.4s, v23.4s, v26.4s +ldr q26, [x0, #208] +ldr q0, [x0, #144] +mul v16.4S, v16.4S,v30.s[0] +sub v24.4s, v2.4s, v4.4s +mul v3.4S, v3.4S,v30.s[0] +add v2.4s, v2.4s, v4.4s +ldr q4, [x0, #80] +ldr q20, [x0, #16] +mla v16.4S, v14.4S, v31.s[0] +sub v14.4s, v18.4s, v21.4s +mla v3.4S, v5.4S, v31.s[0] +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v29.s[2] +nop +mul v19.4S, v19.4S,v30.s[2] +nop +sqrdmulh v5.4S, v10.4S, v29.s[2] +sub v6.4s, v26.4s, v8.4s +mul v10.4S, v10.4S,v30.s[2] +add v26.4s, v26.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v29.s[1] +sub v13.4s, v0.4s, v27.4s +mul v12.4S, v12.4S,v30.s[1] +add v0.4s, v0.4s, v27.4s +sqrdmulh v27.4S, v23.4S, v29.s[1] +sub v25.4s, v4.4s, v16.4s +mul v23.4S, v23.4S,v30.s[1] +add v4.4s, v4.4s, v16.4s +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v3.4s +sqrdmulh v16.4S, v24.4S, v29.s[2] +add v20.4s, v20.4s, v3.4s +mla v10.4S, v5.4S, v31.s[0] +str q22, [x0, #64] +sqrdmulh v22.4S, v14.4S, v29.s[2] +nop +mla v12.4S, v8.4S, v31.s[0] +str q15, [x0, #0] +sqrdmulh v15.4S, v2.4S, v29.s[1] +nop +mla v23.4S, v27.4S, v31.s[0] +nop +sqrdmulh v27.4S, v18.4S, v29.s[1] +nop +mul v24.4S, v24.4S,v30.s[2] +sub v8.4s, v6.4s, v19.4s +mul v14.4S, v14.4S,v30.s[2] +add v6.4s, v6.4s, v19.4s +mla v24.4S, v16.4S, v31.s[0] +sub v16.4s, v13.4s, v10.4s +mla v14.4S, v22.4S, v31.s[0] +add v13.4s, v13.4s, v10.4s +mul v2.4S, v2.4S,v30.s[1] +sub v10.4s, v26.4s, v12.4s +mul v18.4S, v18.4S,v30.s[1] +add v26.4s, v26.4s, v12.4s +mla v2.4S, v15.4S, v31.s[0] +sub v15.4s, v0.4s, v23.4s +mla v18.4S, v27.4S, v31.s[0] +add v0.4s, v0.4s, v23.4s 
+sqrdmulh v29.4S, v8.4S, v11.s[3] +nop +mul v8.4S, v8.4S,v17.s[3] +nop +sqrdmulh v30.4S, v6.4S, v11.s[2] +sub v23.4s, v25.4s, v24.4s +mul v6.4S, v6.4S,v17.s[2] +add v25.4s, v25.4s, v24.4s +sqrdmulh v24.4S, v10.4S, v11.s[1] +sub v27.4s, v21.4s, v14.4s +mul v10.4S, v10.4S,v17.s[1] +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v26.4S, v11.s[0] +sub v12.4s, v4.4s, v2.4s +mul v26.4S, v26.4S,v17.s[0] +add v4.4s, v4.4s, v2.4s +mla v8.4S, v29.4S, v31.s[0] +sub v29.4s, v20.4s, v18.4s +sqrdmulh v2.4S, v16.4S, v11.s[3] +add v20.4s, v20.4s, v18.4s +mla v6.4S, v30.4S, v31.s[0] +nop +sqrdmulh v30.4S, v13.4S, v11.s[2] +nop +mla v10.4S, v24.4S, v31.s[0] +nop +sqrdmulh v24.4S, v15.4S, v11.s[1] +nop +mla v26.4S, v14.4S, v31.s[0] +nop +sqrdmulh v14.4S, v0.4S, v11.s[0] +nop +mul v16.4S, v16.4S,v17.s[3] +sub v18.4s, v23.4s, v8.4s +mul v13.4S, v13.4S,v17.s[2] +add v23.4s, v23.4s, v8.4s +mla v16.4S, v2.4S, v31.s[0] +sub v2.4s, v25.4s, v6.4s +mla v13.4S, v30.4S, v31.s[0] +add v25.4s, v25.4s, v6.4s +mul v15.4S, v15.4S,v17.s[1] +sub v6.4s, v12.4s, v10.4s +mul v0.4S, v0.4S,v17.s[0] +add v12.4s, v12.4s, v10.4s +mla v15.4S, v24.4S, v31.s[0] +sub v24.4s, v4.4s, v26.4s +mla v0.4S, v14.4S, v31.s[0] +add v4.4s, v4.4s, v26.4s +sqrdmulh v11.4S, v18.4S, v9.s[3] +nop +mul v18.4S, v18.4S,v28.s[3] +nop +sqrdmulh v17.4S, v23.4S, v9.s[2] +sub v26.4s, v27.4s, v16.4s +mul v23.4S, v23.4S,v28.s[2] +add v27.4s, v27.4s, v16.4s +sqrdmulh v16.4S, v2.4S, v9.s[1] +sub v14.4s, v21.4s, v13.4s +mul v2.4S, v2.4S,v28.s[1] +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v25.4S, v9.s[0] +sub v10.4s, v29.4s, v15.4s +mul v25.4S, v25.4S,v28.s[0] +add v29.4s, v29.4s, v15.4s +mla v18.4S, v11.4S, v31.s[0] +sub v11.4s, v20.4s, v0.4s +sqrdmulh v9.4S, v6.4S, v7.s[3] +add v20.4s, v20.4s, v0.4s +mla v23.4S, v17.4S, v31.s[0] +sub v17.4s, v26.4s, v18.4s +sqrdmulh v0.4S, v12.4S, v7.s[2] +add v26.4s, v26.4s, v18.4s +mla v2.4S, v16.4S, v31.s[0] +sub v16.4s, v27.4s, v23.4s +sqrdmulh v18.4S, v24.4S, v7.s[1] +add v27.4s, v27.4s, v23.4s +mla 
v25.4S, v13.4S, v31.s[0] +sub v13.4s, v14.4s, v2.4s +sqrdmulh v23.4S, v4.4S, v7.s[0] +add v14.4s, v14.4s, v2.4s +mul v6.4S, v6.4S,v1.s[3] +sub v2.4s, v21.4s, v25.4s +mul v12.4S, v12.4S,v1.s[2] +add v21.4s, v21.4s, v25.4s +mla v6.4S, v9.4S, v31.s[0] +str q17, [x0, #976] +mla v12.4S, v0.4S, v31.s[0] +str q26, [x0, #912] +mul v24.4S, v24.4S,v1.s[1] +str q16, [x0, #848] +mul v4.4S, v4.4S,v1.s[0] +str q27, [x0, #784] +mla v24.4S, v18.4S, v31.s[0] +str q13, [x0, #720] +mla v4.4S, v23.4S, v31.s[0] +str q14, [x0, #656] +str q2, [x0, #592] +sub v2.4s, v10.4s, v6.4s +str q21, [x0, #528] +add v10.4s, v10.4s, v6.4s +str q2, [x0, #464] +sub v2.4s, v29.4s, v12.4s +str q10, [x0, #400] +add v29.4s, v29.4s, v12.4s +str q2, [x0, #336] +sub v2.4s, v11.4s, v24.4s +str q29, [x0, #272] +add v11.4s, v11.4s, v24.4s +str q2, [x0, #208] +sub v2.4s, v20.4s, v4.4s +str q11, [x0, #144] +add v20.4s, v20.4s, v4.4s +str q2, [x0, #80] +str q20, [x0, #16] +ldr q3, [x17, #+128] +ldr q5, [x17, #+144] +ldr q19, [x17, #+160] +ldr q22, [x17, #+176] +ldr q8, [x17, #+192] +ldr q30, [x17, #+208] +ldr q15, [x17, #+224] +ldr q28, [x17, #+240] +ldr q25, [x0, #32] +ldr q9, [x0, #48] +ldr q17, [x0, #0] +ldr q0, [x0, #16] +sqrdmulh v26.4S, v25.4S, v5.s[0] +mul v25.4S, v25.4S,v3.s[0] +mla v25.4S, v26.4S, v31.s[0] +sub v26.4s, v17.4s, v25.4s +add v17.4s, v17.4s, v25.4s +sqrdmulh v25.4S, v9.4S, v5.s[0] +mul v9.4S, v9.4S,v3.s[0] +mla v9.4S, v25.4S, v31.s[0] +sub v25.4s, v0.4s, v9.4s +add v0.4s, v0.4s, v9.4s +sqrdmulh v9.4S, v0.4S, v5.s[1] +mul v0.4S, v0.4S,v3.s[1] +mla v0.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v0.4s +add v17.4s, v17.4s, v0.4s +sqrdmulh v0.4S, v25.4S, v5.s[2] +mul v25.4S, v25.4S,v3.s[2] +mla v25.4S, v0.4S, v31.s[0] +sub v0.4s, v26.4s, v25.4s +add v26.4s, v26.4s, v25.4s +trn1 v25.4S, v17.4S, v9.4S +trn2 v16.4S, v17.4S, v9.4S +trn1 v27.4S, v26.4S, v0.4S +trn2 v18.4S, v26.4S, v0.4S +trn2 v26.2D, v25.2D, v27.2D +trn2 v0.2D, v16.2D, v18.2D +trn1 v17.2D, v25.2D, v27.2D +trn1 v9.2D, v16.2D, v18.2D 
+sqrdmulh v18.4S, v26.4S, v22.4S +mul v26.4S, v26.4S,v19.4S +mla v26.4S, v18.4S, v31.s[0] +sub v18.4s, v17.4s, v26.4s +add v17.4s, v17.4s, v26.4s +sqrdmulh v26.4S, v0.4S, v22.4S +mul v0.4S, v0.4S,v19.4S +mla v0.4S, v26.4S, v31.s[0] +sub v26.4s, v9.4s, v0.4s +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v9.4S, v30.4S +mul v9.4S, v9.4S,v8.4S +mla v9.4S, v0.4S, v31.s[0] +sub v0.4s, v17.4s, v9.4s +add v17.4s, v17.4s, v9.4s +sqrdmulh v9.4S, v26.4S, v28.4S +mul v26.4S, v26.4S,v15.4S +mla v26.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v26.4s +add v18.4s, v18.4s, v26.4s +str q17, [x0, #0] +str q0, [x0, #16] +str q18, [x0, #32] +str q9, [x0, #48] +ldr q9, [x17, #+256] +ldr q18, [x17, #+272] +ldr q0, [x17, #+288] +ldr q17, [x17, #+304] +ldr q26, [x17, #+320] +ldr q16, [x17, #+336] +ldr q27, [x17, #+352] +ldr q25, [x17, #+368] +ldr q28, [x0, #96] +ldr q15, [x0, #112] +ldr q30, [x0, #64] +ldr q8, [x0, #80] +sqrdmulh v22.4S, v28.4S, v18.s[0] +mul v28.4S, v28.4S,v9.s[0] +mla v28.4S, v22.4S, v31.s[0] +sub v22.4s, v30.4s, v28.4s +add v30.4s, v30.4s, v28.4s +sqrdmulh v28.4S, v15.4S, v18.s[0] +mul v15.4S, v15.4S,v9.s[0] +mla v15.4S, v28.4S, v31.s[0] +sub v28.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +sqrdmulh v15.4S, v8.4S, v18.s[1] +mul v8.4S, v8.4S,v9.s[1] +mla v8.4S, v15.4S, v31.s[0] +sub v15.4s, v30.4s, v8.4s +add v30.4s, v30.4s, v8.4s +sqrdmulh v8.4S, v28.4S, v18.s[2] +mul v28.4S, v28.4S,v9.s[2] +mla v28.4S, v8.4S, v31.s[0] +sub v8.4s, v22.4s, v28.4s +add v22.4s, v22.4s, v28.4s +trn1 v28.4S, v30.4S, v15.4S +trn2 v19.4S, v30.4S, v15.4S +trn1 v5.4S, v22.4S, v8.4S +trn2 v3.4S, v22.4S, v8.4S +trn2 v22.2D, v28.2D, v5.2D +trn2 v8.2D, v19.2D, v3.2D +trn1 v30.2D, v28.2D, v5.2D +trn1 v15.2D, v19.2D, v3.2D +sqrdmulh v3.4S, v22.4S, v17.4S +mul v22.4S, v22.4S,v0.4S +mla v22.4S, v3.4S, v31.s[0] +sub v3.4s, v30.4s, v22.4s +add v30.4s, v30.4s, v22.4s +sqrdmulh v22.4S, v8.4S, v17.4S +mul v8.4S, v8.4S,v0.4S +mla v8.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v8.4s +add v15.4s, v15.4s, v8.4s +sqrdmulh 
v8.4S, v15.4S, v16.4S +mul v15.4S, v15.4S,v26.4S +mla v15.4S, v8.4S, v31.s[0] +sub v8.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +sqrdmulh v15.4S, v22.4S, v25.4S +mul v22.4S, v22.4S,v27.4S +mla v22.4S, v15.4S, v31.s[0] +sub v15.4s, v3.4s, v22.4s +add v3.4s, v3.4s, v22.4s +str q30, [x0, #64] +str q8, [x0, #80] +str q3, [x0, #96] +str q15, [x0, #112] +ldr q15, [x17, #+384] +ldr q3, [x17, #+400] +ldr q8, [x17, #+416] +ldr q30, [x17, #+432] +ldr q22, [x17, #+448] +ldr q19, [x17, #+464] +ldr q5, [x17, #+480] +ldr q28, [x17, #+496] +ldr q25, [x0, #160] +ldr q27, [x0, #176] +ldr q16, [x0, #128] +ldr q26, [x0, #144] +sqrdmulh v17.4S, v25.4S, v3.s[0] +mul v25.4S, v25.4S,v15.s[0] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v16.4s, v25.4s +add v16.4s, v16.4s, v25.4s +sqrdmulh v25.4S, v27.4S, v3.s[0] +mul v27.4S, v27.4S,v15.s[0] +mla v27.4S, v25.4S, v31.s[0] +sub v25.4s, v26.4s, v27.4s +add v26.4s, v26.4s, v27.4s +sqrdmulh v27.4S, v26.4S, v3.s[1] +mul v26.4S, v26.4S,v15.s[1] +mla v26.4S, v27.4S, v31.s[0] +sub v27.4s, v16.4s, v26.4s +add v16.4s, v16.4s, v26.4s +sqrdmulh v26.4S, v25.4S, v3.s[2] +mul v25.4S, v25.4S,v15.s[2] +mla v25.4S, v26.4S, v31.s[0] +sub v26.4s, v17.4s, v25.4s +add v17.4s, v17.4s, v25.4s +trn1 v25.4S, v16.4S, v27.4S +trn2 v0.4S, v16.4S, v27.4S +trn1 v18.4S, v17.4S, v26.4S +trn2 v9.4S, v17.4S, v26.4S +trn2 v17.2D, v25.2D, v18.2D +trn2 v26.2D, v0.2D, v9.2D +trn1 v16.2D, v25.2D, v18.2D +trn1 v27.2D, v0.2D, v9.2D +sqrdmulh v9.4S, v17.4S, v30.4S +mul v17.4S, v17.4S,v8.4S +mla v17.4S, v9.4S, v31.s[0] +sub v9.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v26.4S, v30.4S +mul v26.4S, v26.4S,v8.4S +mla v26.4S, v17.4S, v31.s[0] +sub v17.4s, v27.4s, v26.4s +add v27.4s, v27.4s, v26.4s +sqrdmulh v26.4S, v27.4S, v19.4S +mul v27.4S, v27.4S,v22.4S +mla v27.4S, v26.4S, v31.s[0] +sub v26.4s, v16.4s, v27.4s +add v16.4s, v16.4s, v27.4s +sqrdmulh v27.4S, v17.4S, v28.4S +mul v17.4S, v17.4S,v5.4S +mla v17.4S, v27.4S, v31.s[0] +sub v27.4s, v9.4s, v17.4s +add 
v9.4s, v9.4s, v17.4s +str q16, [x0, #128] +str q26, [x0, #144] +str q9, [x0, #160] +str q27, [x0, #176] +ldr q27, [x17, #+512] +ldr q9, [x17, #+528] +ldr q26, [x17, #+544] +ldr q16, [x17, #+560] +ldr q17, [x17, #+576] +ldr q0, [x17, #+592] +ldr q18, [x17, #+608] +ldr q25, [x17, #+624] +ldr q28, [x0, #224] +ldr q5, [x0, #240] +ldr q19, [x0, #192] +ldr q22, [x0, #208] +sqrdmulh v30.4S, v28.4S, v9.s[0] +mul v28.4S, v28.4S,v27.s[0] +mla v28.4S, v30.4S, v31.s[0] +sub v30.4s, v19.4s, v28.4s +add v19.4s, v19.4s, v28.4s +sqrdmulh v28.4S, v5.4S, v9.s[0] +mul v5.4S, v5.4S,v27.s[0] +mla v5.4S, v28.4S, v31.s[0] +sub v28.4s, v22.4s, v5.4s +add v22.4s, v22.4s, v5.4s +sqrdmulh v5.4S, v22.4S, v9.s[1] +mul v22.4S, v22.4S,v27.s[1] +mla v22.4S, v5.4S, v31.s[0] +sub v5.4s, v19.4s, v22.4s +add v19.4s, v19.4s, v22.4s +sqrdmulh v22.4S, v28.4S, v9.s[2] +mul v28.4S, v28.4S,v27.s[2] +mla v28.4S, v22.4S, v31.s[0] +sub v22.4s, v30.4s, v28.4s +add v30.4s, v30.4s, v28.4s +trn1 v28.4S, v19.4S, v5.4S +trn2 v8.4S, v19.4S, v5.4S +trn1 v3.4S, v30.4S, v22.4S +trn2 v15.4S, v30.4S, v22.4S +trn2 v30.2D, v28.2D, v3.2D +trn2 v22.2D, v8.2D, v15.2D +trn1 v19.2D, v28.2D, v3.2D +trn1 v5.2D, v8.2D, v15.2D +sqrdmulh v15.4S, v30.4S, v16.4S +mul v30.4S, v30.4S,v26.4S +mla v30.4S, v15.4S, v31.s[0] +sub v15.4s, v19.4s, v30.4s +add v19.4s, v19.4s, v30.4s +sqrdmulh v30.4S, v22.4S, v16.4S +mul v22.4S, v22.4S,v26.4S +mla v22.4S, v30.4S, v31.s[0] +sub v30.4s, v5.4s, v22.4s +add v5.4s, v5.4s, v22.4s +sqrdmulh v22.4S, v5.4S, v0.4S +mul v5.4S, v5.4S,v17.4S +mla v5.4S, v22.4S, v31.s[0] +sub v22.4s, v19.4s, v5.4s +add v19.4s, v19.4s, v5.4s +sqrdmulh v5.4S, v30.4S, v25.4S +mul v30.4S, v30.4S,v18.4S +mla v30.4S, v5.4S, v31.s[0] +sub v5.4s, v15.4s, v30.4s +add v15.4s, v15.4s, v30.4s +str q19, [x0, #192] +str q22, [x0, #208] +str q15, [x0, #224] +str q5, [x0, #240] +ldr q5, [x17, #+640] +ldr q15, [x17, #+656] +ldr q22, [x17, #+672] +ldr q19, [x17, #+688] +ldr q30, [x17, #+704] +ldr q8, [x17, #+720] +ldr q3, [x17, #+736] +ldr 
q28, [x17, #+752] +ldr q25, [x0, #288] +ldr q18, [x0, #304] +ldr q0, [x0, #256] +ldr q17, [x0, #272] +sqrdmulh v16.4S, v25.4S, v15.s[0] +mul v25.4S, v25.4S,v5.s[0] +mla v25.4S, v16.4S, v31.s[0] +sub v16.4s, v0.4s, v25.4s +add v0.4s, v0.4s, v25.4s +sqrdmulh v25.4S, v18.4S, v15.s[0] +mul v18.4S, v18.4S,v5.s[0] +mla v18.4S, v25.4S, v31.s[0] +sub v25.4s, v17.4s, v18.4s +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v17.4S, v15.s[1] +mul v17.4S, v17.4S,v5.s[1] +mla v17.4S, v18.4S, v31.s[0] +sub v18.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v25.4S, v15.s[2] +mul v25.4S, v25.4S,v5.s[2] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v16.4s, v25.4s +add v16.4s, v16.4s, v25.4s +trn1 v25.4S, v0.4S, v18.4S +trn2 v26.4S, v0.4S, v18.4S +trn1 v9.4S, v16.4S, v17.4S +trn2 v27.4S, v16.4S, v17.4S +trn2 v16.2D, v25.2D, v9.2D +trn2 v17.2D, v26.2D, v27.2D +trn1 v0.2D, v25.2D, v9.2D +trn1 v18.2D, v26.2D, v27.2D +sqrdmulh v27.4S, v16.4S, v19.4S +mul v16.4S, v16.4S,v22.4S +mla v16.4S, v27.4S, v31.s[0] +sub v27.4s, v0.4s, v16.4s +add v0.4s, v0.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v19.4S +mul v17.4S, v17.4S,v22.4S +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v18.4s, v17.4s +add v18.4s, v18.4s, v17.4s +sqrdmulh v17.4S, v18.4S, v8.4S +mul v18.4S, v18.4S,v30.4S +mla v18.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v18.4s +add v0.4s, v0.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v28.4S +mul v16.4S, v16.4S,v3.4S +mla v16.4S, v18.4S, v31.s[0] +sub v18.4s, v27.4s, v16.4s +add v27.4s, v27.4s, v16.4s +str q0, [x0, #256] +str q17, [x0, #272] +str q27, [x0, #288] +str q18, [x0, #304] +ldr q18, [x17, #+768] +ldr q27, [x17, #+784] +ldr q17, [x17, #+800] +ldr q0, [x17, #+816] +ldr q16, [x17, #+832] +ldr q26, [x17, #+848] +ldr q9, [x17, #+864] +ldr q25, [x17, #+880] +ldr q28, [x0, #352] +ldr q3, [x0, #368] +ldr q8, [x0, #320] +ldr q30, [x0, #336] +sqrdmulh v19.4S, v28.4S, v27.s[0] +mul v28.4S, v28.4S,v18.s[0] +mla v28.4S, v19.4S, v31.s[0] +sub v19.4s, v8.4s, v28.4s +add v8.4s, v8.4s, v28.4s +sqrdmulh 
v28.4S, v3.4S, v27.s[0] +mul v3.4S, v3.4S,v18.s[0] +mla v3.4S, v28.4S, v31.s[0] +sub v28.4s, v30.4s, v3.4s +add v30.4s, v30.4s, v3.4s +sqrdmulh v3.4S, v30.4S, v27.s[1] +mul v30.4S, v30.4S,v18.s[1] +mla v30.4S, v3.4S, v31.s[0] +sub v3.4s, v8.4s, v30.4s +add v8.4s, v8.4s, v30.4s +sqrdmulh v30.4S, v28.4S, v27.s[2] +mul v28.4S, v28.4S,v18.s[2] +mla v28.4S, v30.4S, v31.s[0] +sub v30.4s, v19.4s, v28.4s +add v19.4s, v19.4s, v28.4s +trn1 v28.4S, v8.4S, v3.4S +trn2 v22.4S, v8.4S, v3.4S +trn1 v15.4S, v19.4S, v30.4S +trn2 v5.4S, v19.4S, v30.4S +trn2 v19.2D, v28.2D, v15.2D +trn2 v30.2D, v22.2D, v5.2D +trn1 v8.2D, v28.2D, v15.2D +trn1 v3.2D, v22.2D, v5.2D +sqrdmulh v5.4S, v19.4S, v0.4S +mul v19.4S, v19.4S,v17.4S +mla v19.4S, v5.4S, v31.s[0] +sub v5.4s, v8.4s, v19.4s +add v8.4s, v8.4s, v19.4s +sqrdmulh v19.4S, v30.4S, v0.4S +mul v30.4S, v30.4S,v17.4S +mla v30.4S, v19.4S, v31.s[0] +sub v19.4s, v3.4s, v30.4s +add v3.4s, v3.4s, v30.4s +sqrdmulh v30.4S, v3.4S, v26.4S +mul v3.4S, v3.4S,v16.4S +mla v3.4S, v30.4S, v31.s[0] +sub v30.4s, v8.4s, v3.4s +add v8.4s, v8.4s, v3.4s +sqrdmulh v3.4S, v19.4S, v25.4S +mul v19.4S, v19.4S,v9.4S +mla v19.4S, v3.4S, v31.s[0] +sub v3.4s, v5.4s, v19.4s +add v5.4s, v5.4s, v19.4s +str q8, [x0, #320] +str q30, [x0, #336] +str q5, [x0, #352] +str q3, [x0, #368] +ldr q3, [x17, #+896] +ldr q5, [x17, #+912] +ldr q30, [x17, #+928] +ldr q8, [x17, #+944] +ldr q19, [x17, #+960] +ldr q22, [x17, #+976] +ldr q15, [x17, #+992] +ldr q28, [x17, #+1008] +ldr q25, [x0, #416] +ldr q9, [x0, #432] +ldr q26, [x0, #384] +ldr q16, [x0, #400] +sqrdmulh v0.4S, v25.4S, v5.s[0] +mul v25.4S, v25.4S,v3.s[0] +mla v25.4S, v0.4S, v31.s[0] +sub v0.4s, v26.4s, v25.4s +add v26.4s, v26.4s, v25.4s +sqrdmulh v25.4S, v9.4S, v5.s[0] +mul v9.4S, v9.4S,v3.s[0] +mla v9.4S, v25.4S, v31.s[0] +sub v25.4s, v16.4s, v9.4s +add v16.4s, v16.4s, v9.4s +sqrdmulh v9.4S, v16.4S, v5.s[1] +mul v16.4S, v16.4S,v3.s[1] +mla v16.4S, v9.4S, v31.s[0] +sub v9.4s, v26.4s, v16.4s +add v26.4s, v26.4s, v16.4s +sqrdmulh 
v16.4S, v25.4S, v5.s[2] +mul v25.4S, v25.4S,v3.s[2] +mla v25.4S, v16.4S, v31.s[0] +sub v16.4s, v0.4s, v25.4s +add v0.4s, v0.4s, v25.4s +trn1 v25.4S, v26.4S, v9.4S +trn2 v17.4S, v26.4S, v9.4S +trn1 v27.4S, v0.4S, v16.4S +trn2 v18.4S, v0.4S, v16.4S +trn2 v0.2D, v25.2D, v27.2D +trn2 v16.2D, v17.2D, v18.2D +trn1 v26.2D, v25.2D, v27.2D +trn1 v9.2D, v17.2D, v18.2D +sqrdmulh v18.4S, v0.4S, v8.4S +mul v0.4S, v0.4S,v30.4S +mla v0.4S, v18.4S, v31.s[0] +sub v18.4s, v26.4s, v0.4s +add v26.4s, v26.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v8.4S +mul v16.4S, v16.4S,v30.4S +mla v16.4S, v0.4S, v31.s[0] +sub v0.4s, v9.4s, v16.4s +add v9.4s, v9.4s, v16.4s +sqrdmulh v16.4S, v9.4S, v22.4S +mul v9.4S, v9.4S,v19.4S +mla v9.4S, v16.4S, v31.s[0] +sub v16.4s, v26.4s, v9.4s +add v26.4s, v26.4s, v9.4s +sqrdmulh v9.4S, v0.4S, v28.4S +mul v0.4S, v0.4S,v15.4S +mla v0.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v0.4s +add v18.4s, v18.4s, v0.4s +str q26, [x0, #384] +str q16, [x0, #400] +str q18, [x0, #416] +str q9, [x0, #432] +ldr q9, [x17, #+1024] +ldr q18, [x17, #+1040] +ldr q16, [x17, #+1056] +ldr q26, [x17, #+1072] +ldr q0, [x17, #+1088] +ldr q17, [x17, #+1104] +ldr q27, [x17, #+1120] +ldr q25, [x17, #+1136] +ldr q28, [x0, #480] +ldr q15, [x0, #496] +ldr q22, [x0, #448] +ldr q19, [x0, #464] +sqrdmulh v8.4S, v28.4S, v18.s[0] +mul v28.4S, v28.4S,v9.s[0] +mla v28.4S, v8.4S, v31.s[0] +sub v8.4s, v22.4s, v28.4s +add v22.4s, v22.4s, v28.4s +sqrdmulh v28.4S, v15.4S, v18.s[0] +mul v15.4S, v15.4S,v9.s[0] +mla v15.4S, v28.4S, v31.s[0] +sub v28.4s, v19.4s, v15.4s +add v19.4s, v19.4s, v15.4s +sqrdmulh v15.4S, v19.4S, v18.s[1] +mul v19.4S, v19.4S,v9.s[1] +mla v19.4S, v15.4S, v31.s[0] +sub v15.4s, v22.4s, v19.4s +add v22.4s, v22.4s, v19.4s +sqrdmulh v19.4S, v28.4S, v18.s[2] +mul v28.4S, v28.4S,v9.s[2] +mla v28.4S, v19.4S, v31.s[0] +sub v19.4s, v8.4s, v28.4s +add v8.4s, v8.4s, v28.4s +trn1 v28.4S, v22.4S, v15.4S +trn2 v30.4S, v22.4S, v15.4S +trn1 v5.4S, v8.4S, v19.4S +trn2 v3.4S, v8.4S, v19.4S +trn2 v8.2D, v28.2D, 
v5.2D +trn2 v19.2D, v30.2D, v3.2D +trn1 v22.2D, v28.2D, v5.2D +trn1 v15.2D, v30.2D, v3.2D +sqrdmulh v3.4S, v8.4S, v26.4S +mul v8.4S, v8.4S,v16.4S +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v22.4s, v8.4s +add v22.4s, v22.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v26.4S +mul v19.4S, v19.4S,v16.4S +mla v19.4S, v8.4S, v31.s[0] +sub v8.4s, v15.4s, v19.4s +add v15.4s, v15.4s, v19.4s +sqrdmulh v19.4S, v15.4S, v17.4S +mul v15.4S, v15.4S,v0.4S +mla v15.4S, v19.4S, v31.s[0] +sub v19.4s, v22.4s, v15.4s +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v8.4S, v25.4S +mul v8.4S, v8.4S,v27.4S +mla v8.4S, v15.4S, v31.s[0] +sub v15.4s, v3.4s, v8.4s +add v3.4s, v3.4s, v8.4s +str q22, [x0, #448] +str q19, [x0, #464] +str q3, [x0, #480] +str q15, [x0, #496] +ldr q15, [x17, #+1152] +ldr q3, [x17, #+1168] +ldr q19, [x17, #+1184] +ldr q22, [x17, #+1200] +ldr q8, [x17, #+1216] +ldr q30, [x17, #+1232] +ldr q5, [x17, #+1248] +ldr q28, [x17, #+1264] +ldr q25, [x0, #544] +ldr q27, [x0, #560] +ldr q17, [x0, #512] +ldr q0, [x0, #528] +sqrdmulh v26.4S, v25.4S, v3.s[0] +mul v25.4S, v25.4S,v15.s[0] +mla v25.4S, v26.4S, v31.s[0] +sub v26.4s, v17.4s, v25.4s +add v17.4s, v17.4s, v25.4s +sqrdmulh v25.4S, v27.4S, v3.s[0] +mul v27.4S, v27.4S,v15.s[0] +mla v27.4S, v25.4S, v31.s[0] +sub v25.4s, v0.4s, v27.4s +add v0.4s, v0.4s, v27.4s +sqrdmulh v27.4S, v0.4S, v3.s[1] +mul v0.4S, v0.4S,v15.s[1] +mla v0.4S, v27.4S, v31.s[0] +sub v27.4s, v17.4s, v0.4s +add v17.4s, v17.4s, v0.4s +sqrdmulh v0.4S, v25.4S, v3.s[2] +mul v25.4S, v25.4S,v15.s[2] +mla v25.4S, v0.4S, v31.s[0] +sub v0.4s, v26.4s, v25.4s +add v26.4s, v26.4s, v25.4s +trn1 v25.4S, v17.4S, v27.4S +trn2 v16.4S, v17.4S, v27.4S +trn1 v18.4S, v26.4S, v0.4S +trn2 v9.4S, v26.4S, v0.4S +trn2 v26.2D, v25.2D, v18.2D +trn2 v0.2D, v16.2D, v9.2D +trn1 v17.2D, v25.2D, v18.2D +trn1 v27.2D, v16.2D, v9.2D +sqrdmulh v9.4S, v26.4S, v22.4S +mul v26.4S, v26.4S,v19.4S +mla v26.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v26.4s +add v17.4s, v17.4s, v26.4s +sqrdmulh v26.4S, v0.4S, v22.4S +mul 
v0.4S, v0.4S,v19.4S +mla v0.4S, v26.4S, v31.s[0] +sub v26.4s, v27.4s, v0.4s +add v27.4s, v27.4s, v0.4s +sqrdmulh v0.4S, v27.4S, v30.4S +mul v27.4S, v27.4S,v8.4S +mla v27.4S, v0.4S, v31.s[0] +sub v0.4s, v17.4s, v27.4s +add v17.4s, v17.4s, v27.4s +sqrdmulh v27.4S, v26.4S, v28.4S +mul v26.4S, v26.4S,v5.4S +mla v26.4S, v27.4S, v31.s[0] +sub v27.4s, v9.4s, v26.4s +add v9.4s, v9.4s, v26.4s +str q17, [x0, #512] +str q0, [x0, #528] +str q9, [x0, #544] +str q27, [x0, #560] +ldr q27, [x17, #+1280] +ldr q9, [x17, #+1296] +ldr q0, [x17, #+1312] +ldr q17, [x17, #+1328] +ldr q26, [x17, #+1344] +ldr q16, [x17, #+1360] +ldr q18, [x17, #+1376] +ldr q25, [x17, #+1392] +ldr q28, [x0, #608] +ldr q5, [x0, #624] +ldr q30, [x0, #576] +ldr q8, [x0, #592] +sqrdmulh v22.4S, v28.4S, v9.s[0] +mul v28.4S, v28.4S,v27.s[0] +mla v28.4S, v22.4S, v31.s[0] +sub v22.4s, v30.4s, v28.4s +add v30.4s, v30.4s, v28.4s +sqrdmulh v28.4S, v5.4S, v9.s[0] +mul v5.4S, v5.4S,v27.s[0] +mla v5.4S, v28.4S, v31.s[0] +sub v28.4s, v8.4s, v5.4s +add v8.4s, v8.4s, v5.4s +sqrdmulh v5.4S, v8.4S, v9.s[1] +mul v8.4S, v8.4S,v27.s[1] +mla v8.4S, v5.4S, v31.s[0] +sub v5.4s, v30.4s, v8.4s +add v30.4s, v30.4s, v8.4s +sqrdmulh v8.4S, v28.4S, v9.s[2] +mul v28.4S, v28.4S,v27.s[2] +mla v28.4S, v8.4S, v31.s[0] +sub v8.4s, v22.4s, v28.4s +add v22.4s, v22.4s, v28.4s +trn1 v28.4S, v30.4S, v5.4S +trn2 v19.4S, v30.4S, v5.4S +trn1 v3.4S, v22.4S, v8.4S +trn2 v15.4S, v22.4S, v8.4S +trn2 v22.2D, v28.2D, v3.2D +trn2 v8.2D, v19.2D, v15.2D +trn1 v30.2D, v28.2D, v3.2D +trn1 v5.2D, v19.2D, v15.2D +sqrdmulh v15.4S, v22.4S, v17.4S +mul v22.4S, v22.4S,v0.4S +mla v22.4S, v15.4S, v31.s[0] +sub v15.4s, v30.4s, v22.4s +add v30.4s, v30.4s, v22.4s +sqrdmulh v22.4S, v8.4S, v17.4S +mul v8.4S, v8.4S,v0.4S +mla v8.4S, v22.4S, v31.s[0] +sub v22.4s, v5.4s, v8.4s +add v5.4s, v5.4s, v8.4s +sqrdmulh v8.4S, v5.4S, v16.4S +mul v5.4S, v5.4S,v26.4S +mla v5.4S, v8.4S, v31.s[0] +sub v8.4s, v30.4s, v5.4s +add v30.4s, v30.4s, v5.4s +sqrdmulh v5.4S, v22.4S, v25.4S +mul 
v22.4S, v22.4S,v18.4S +mla v22.4S, v5.4S, v31.s[0] +sub v5.4s, v15.4s, v22.4s +add v15.4s, v15.4s, v22.4s +str q30, [x0, #576] +str q8, [x0, #592] +str q15, [x0, #608] +str q5, [x0, #624] +ldr q5, [x17, #+1408] +ldr q15, [x17, #+1424] +ldr q8, [x17, #+1440] +ldr q30, [x17, #+1456] +ldr q22, [x17, #+1472] +ldr q19, [x17, #+1488] +ldr q3, [x17, #+1504] +ldr q28, [x17, #+1520] +ldr q25, [x0, #672] +ldr q18, [x0, #688] +ldr q16, [x0, #640] +ldr q26, [x0, #656] +sqrdmulh v17.4S, v25.4S, v15.s[0] +mul v25.4S, v25.4S,v5.s[0] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v16.4s, v25.4s +add v16.4s, v16.4s, v25.4s +sqrdmulh v25.4S, v18.4S, v15.s[0] +mul v18.4S, v18.4S,v5.s[0] +mla v18.4S, v25.4S, v31.s[0] +sub v25.4s, v26.4s, v18.4s +add v26.4s, v26.4s, v18.4s +sqrdmulh v18.4S, v26.4S, v15.s[1] +mul v26.4S, v26.4S,v5.s[1] +mla v26.4S, v18.4S, v31.s[0] +sub v18.4s, v16.4s, v26.4s +add v16.4s, v16.4s, v26.4s +sqrdmulh v26.4S, v25.4S, v15.s[2] +mul v25.4S, v25.4S,v5.s[2] +mla v25.4S, v26.4S, v31.s[0] +sub v26.4s, v17.4s, v25.4s +add v17.4s, v17.4s, v25.4s +trn1 v25.4S, v16.4S, v18.4S +trn2 v0.4S, v16.4S, v18.4S +trn1 v9.4S, v17.4S, v26.4S +trn2 v27.4S, v17.4S, v26.4S +trn2 v17.2D, v25.2D, v9.2D +trn2 v26.2D, v0.2D, v27.2D +trn1 v16.2D, v25.2D, v9.2D +trn1 v18.2D, v0.2D, v27.2D +sqrdmulh v27.4S, v17.4S, v30.4S +mul v17.4S, v17.4S,v8.4S +mla v17.4S, v27.4S, v31.s[0] +sub v27.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v26.4S, v30.4S +mul v26.4S, v26.4S,v8.4S +mla v26.4S, v17.4S, v31.s[0] +sub v17.4s, v18.4s, v26.4s +add v18.4s, v18.4s, v26.4s +sqrdmulh v26.4S, v18.4S, v19.4S +mul v18.4S, v18.4S,v22.4S +mla v18.4S, v26.4S, v31.s[0] +sub v26.4s, v16.4s, v18.4s +add v16.4s, v16.4s, v18.4s +sqrdmulh v18.4S, v17.4S, v28.4S +mul v17.4S, v17.4S,v3.4S +mla v17.4S, v18.4S, v31.s[0] +sub v18.4s, v27.4s, v17.4s +add v27.4s, v27.4s, v17.4s +str q16, [x0, #640] +str q26, [x0, #656] +str q27, [x0, #672] +str q18, [x0, #688] +ldr q18, [x17, #+1536] +ldr q27, [x17, #+1552] 
+ldr q26, [x17, #+1568] +ldr q16, [x17, #+1584] +ldr q17, [x17, #+1600] +ldr q0, [x17, #+1616] +ldr q9, [x17, #+1632] +ldr q25, [x17, #+1648] +ldr q28, [x0, #736] +ldr q3, [x0, #752] +ldr q19, [x0, #704] +ldr q22, [x0, #720] +sqrdmulh v30.4S, v28.4S, v27.s[0] +mul v28.4S, v28.4S,v18.s[0] +mla v28.4S, v30.4S, v31.s[0] +sub v30.4s, v19.4s, v28.4s +add v19.4s, v19.4s, v28.4s +sqrdmulh v28.4S, v3.4S, v27.s[0] +mul v3.4S, v3.4S,v18.s[0] +mla v3.4S, v28.4S, v31.s[0] +sub v28.4s, v22.4s, v3.4s +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v22.4S, v27.s[1] +mul v22.4S, v22.4S,v18.s[1] +mla v22.4S, v3.4S, v31.s[0] +sub v3.4s, v19.4s, v22.4s +add v19.4s, v19.4s, v22.4s +sqrdmulh v22.4S, v28.4S, v27.s[2] +mul v28.4S, v28.4S,v18.s[2] +mla v28.4S, v22.4S, v31.s[0] +sub v22.4s, v30.4s, v28.4s +add v30.4s, v30.4s, v28.4s +trn1 v28.4S, v19.4S, v3.4S +trn2 v8.4S, v19.4S, v3.4S +trn1 v15.4S, v30.4S, v22.4S +trn2 v5.4S, v30.4S, v22.4S +trn2 v30.2D, v28.2D, v15.2D +trn2 v22.2D, v8.2D, v5.2D +trn1 v19.2D, v28.2D, v15.2D +trn1 v3.2D, v8.2D, v5.2D +sqrdmulh v5.4S, v30.4S, v16.4S +mul v30.4S, v30.4S,v26.4S +mla v30.4S, v5.4S, v31.s[0] +sub v5.4s, v19.4s, v30.4s +add v19.4s, v19.4s, v30.4s +sqrdmulh v30.4S, v22.4S, v16.4S +mul v22.4S, v22.4S,v26.4S +mla v22.4S, v30.4S, v31.s[0] +sub v30.4s, v3.4s, v22.4s +add v3.4s, v3.4s, v22.4s +sqrdmulh v22.4S, v3.4S, v0.4S +mul v3.4S, v3.4S,v17.4S +mla v3.4S, v22.4S, v31.s[0] +sub v22.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sqrdmulh v3.4S, v30.4S, v25.4S +mul v30.4S, v30.4S,v9.4S +mla v30.4S, v3.4S, v31.s[0] +sub v3.4s, v5.4s, v30.4s +add v5.4s, v5.4s, v30.4s +str q19, [x0, #704] +str q22, [x0, #720] +str q5, [x0, #736] +str q3, [x0, #752] +ldr q3, [x17, #+1664] +ldr q5, [x17, #+1680] +ldr q22, [x17, #+1696] +ldr q19, [x17, #+1712] +ldr q30, [x17, #+1728] +ldr q8, [x17, #+1744] +ldr q15, [x17, #+1760] +ldr q28, [x17, #+1776] +ldr q25, [x0, #800] +ldr q9, [x0, #816] +ldr q0, [x0, #768] +ldr q17, [x0, #784] +sqrdmulh v16.4S, v25.4S, v5.s[0] +mul 
v25.4S, v25.4S,v3.s[0] +mla v25.4S, v16.4S, v31.s[0] +sub v16.4s, v0.4s, v25.4s +add v0.4s, v0.4s, v25.4s +sqrdmulh v25.4S, v9.4S, v5.s[0] +mul v9.4S, v9.4S,v3.s[0] +mla v9.4S, v25.4S, v31.s[0] +sub v25.4s, v17.4s, v9.4s +add v17.4s, v17.4s, v9.4s +sqrdmulh v9.4S, v17.4S, v5.s[1] +mul v17.4S, v17.4S,v3.s[1] +mla v17.4S, v9.4S, v31.s[0] +sub v9.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v25.4S, v5.s[2] +mul v25.4S, v25.4S,v3.s[2] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v16.4s, v25.4s +add v16.4s, v16.4s, v25.4s +trn1 v25.4S, v0.4S, v9.4S +trn2 v26.4S, v0.4S, v9.4S +trn1 v27.4S, v16.4S, v17.4S +trn2 v18.4S, v16.4S, v17.4S +trn2 v16.2D, v25.2D, v27.2D +trn2 v17.2D, v26.2D, v18.2D +trn1 v0.2D, v25.2D, v27.2D +trn1 v9.2D, v26.2D, v18.2D +sqrdmulh v18.4S, v16.4S, v19.4S +mul v16.4S, v16.4S,v22.4S +mla v16.4S, v18.4S, v31.s[0] +sub v18.4s, v0.4s, v16.4s +add v0.4s, v0.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v19.4S +mul v17.4S, v17.4S,v22.4S +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v9.4s, v17.4s +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v9.4S, v8.4S +mul v9.4S, v9.4S,v30.4S +mla v9.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v9.4s +add v0.4s, v0.4s, v9.4s +sqrdmulh v9.4S, v16.4S, v28.4S +mul v16.4S, v16.4S,v15.4S +mla v16.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v16.4s +add v18.4s, v18.4s, v16.4s +str q0, [x0, #768] +str q17, [x0, #784] +str q18, [x0, #800] +str q9, [x0, #816] +ldr q9, [x17, #+1792] +ldr q18, [x17, #+1808] +ldr q17, [x17, #+1824] +ldr q0, [x17, #+1840] +ldr q16, [x17, #+1856] +ldr q26, [x17, #+1872] +ldr q27, [x17, #+1888] +ldr q25, [x17, #+1904] +ldr q28, [x0, #864] +ldr q15, [x0, #880] +ldr q8, [x0, #832] +ldr q30, [x0, #848] +sqrdmulh v19.4S, v28.4S, v18.s[0] +mul v28.4S, v28.4S,v9.s[0] +mla v28.4S, v19.4S, v31.s[0] +sub v19.4s, v8.4s, v28.4s +add v8.4s, v8.4s, v28.4s +sqrdmulh v28.4S, v15.4S, v18.s[0] +mul v15.4S, v15.4S,v9.s[0] +mla v15.4S, v28.4S, v31.s[0] +sub v28.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +sqrdmulh v15.4S, 
v30.4S, v18.s[1] +mul v30.4S, v30.4S,v9.s[1] +mla v30.4S, v15.4S, v31.s[0] +sub v15.4s, v8.4s, v30.4s +add v8.4s, v8.4s, v30.4s +sqrdmulh v30.4S, v28.4S, v18.s[2] +mul v28.4S, v28.4S,v9.s[2] +mla v28.4S, v30.4S, v31.s[0] +sub v30.4s, v19.4s, v28.4s +add v19.4s, v19.4s, v28.4s +trn1 v28.4S, v8.4S, v15.4S +trn2 v22.4S, v8.4S, v15.4S +trn1 v5.4S, v19.4S, v30.4S +trn2 v3.4S, v19.4S, v30.4S +trn2 v19.2D, v28.2D, v5.2D +trn2 v30.2D, v22.2D, v3.2D +trn1 v8.2D, v28.2D, v5.2D +trn1 v15.2D, v22.2D, v3.2D +sqrdmulh v3.4S, v19.4S, v0.4S +mul v19.4S, v19.4S,v17.4S +mla v19.4S, v3.4S, v31.s[0] +sub v3.4s, v8.4s, v19.4s +add v8.4s, v8.4s, v19.4s +sqrdmulh v19.4S, v30.4S, v0.4S +mul v30.4S, v30.4S,v17.4S +mla v30.4S, v19.4S, v31.s[0] +sub v19.4s, v15.4s, v30.4s +add v15.4s, v15.4s, v30.4s +sqrdmulh v30.4S, v15.4S, v26.4S +mul v15.4S, v15.4S,v16.4S +mla v15.4S, v30.4S, v31.s[0] +sub v30.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +sqrdmulh v15.4S, v19.4S, v25.4S +mul v19.4S, v19.4S,v27.4S +mla v19.4S, v15.4S, v31.s[0] +sub v15.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +str q8, [x0, #832] +str q30, [x0, #848] +str q3, [x0, #864] +str q15, [x0, #880] +ldr q15, [x17, #+1920] +ldr q3, [x17, #+1936] +ldr q30, [x17, #+1952] +ldr q8, [x17, #+1968] +ldr q19, [x17, #+1984] +ldr q22, [x17, #+2000] +ldr q5, [x17, #+2016] +ldr q28, [x17, #+2032] +ldr q25, [x0, #928] +ldr q27, [x0, #944] +ldr q26, [x0, #896] +ldr q16, [x0, #912] +sqrdmulh v0.4S, v25.4S, v3.s[0] +mul v25.4S, v25.4S,v15.s[0] +mla v25.4S, v0.4S, v31.s[0] +sub v0.4s, v26.4s, v25.4s +add v26.4s, v26.4s, v25.4s +sqrdmulh v25.4S, v27.4S, v3.s[0] +mul v27.4S, v27.4S,v15.s[0] +mla v27.4S, v25.4S, v31.s[0] +sub v25.4s, v16.4s, v27.4s +add v16.4s, v16.4s, v27.4s +sqrdmulh v27.4S, v16.4S, v3.s[1] +mul v16.4S, v16.4S,v15.s[1] +mla v16.4S, v27.4S, v31.s[0] +sub v27.4s, v26.4s, v16.4s +add v26.4s, v26.4s, v16.4s +sqrdmulh v16.4S, v25.4S, v3.s[2] +mul v25.4S, v25.4S,v15.s[2] +mla v25.4S, v16.4S, v31.s[0] +sub v16.4s, v0.4s, v25.4s +add 
v0.4s, v0.4s, v25.4s +trn1 v25.4S, v26.4S, v27.4S +trn2 v17.4S, v26.4S, v27.4S +trn1 v18.4S, v0.4S, v16.4S +trn2 v9.4S, v0.4S, v16.4S +trn2 v0.2D, v25.2D, v18.2D +trn2 v16.2D, v17.2D, v9.2D +trn1 v26.2D, v25.2D, v18.2D +trn1 v27.2D, v17.2D, v9.2D +sqrdmulh v9.4S, v0.4S, v8.4S +mul v0.4S, v0.4S,v30.4S +mla v0.4S, v9.4S, v31.s[0] +sub v9.4s, v26.4s, v0.4s +add v26.4s, v26.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v8.4S +mul v16.4S, v16.4S,v30.4S +mla v16.4S, v0.4S, v31.s[0] +sub v0.4s, v27.4s, v16.4s +add v27.4s, v27.4s, v16.4s +sqrdmulh v16.4S, v27.4S, v22.4S +mul v27.4S, v27.4S,v19.4S +mla v27.4S, v16.4S, v31.s[0] +sub v16.4s, v26.4s, v27.4s +add v26.4s, v26.4s, v27.4s +sqrdmulh v27.4S, v0.4S, v28.4S +mul v0.4S, v0.4S,v5.4S +mla v0.4S, v27.4S, v31.s[0] +sub v27.4s, v9.4s, v0.4s +add v9.4s, v9.4s, v0.4s +str q26, [x0, #896] +str q16, [x0, #912] +str q9, [x0, #928] +str q27, [x0, #944] +ldr q27, [x17, #+2048] +ldr q9, [x17, #+2064] +ldr q16, [x17, #+2080] +ldr q26, [x17, #+2096] +ldr q0, [x17, #+2112] +ldr q17, [x17, #+2128] +ldr q18, [x17, #+2144] +ldr q25, [x17, #+2160] +ldr q28, [x0, #992] +ldr q5, [x0, #1008] +ldr q22, [x0, #960] +ldr q19, [x0, #976] +sqrdmulh v8.4S, v28.4S, v9.s[0] +mul v28.4S, v28.4S,v27.s[0] +mla v28.4S, v8.4S, v31.s[0] +sub v8.4s, v22.4s, v28.4s +add v22.4s, v22.4s, v28.4s +sqrdmulh v28.4S, v5.4S, v9.s[0] +mul v5.4S, v5.4S,v27.s[0] +mla v5.4S, v28.4S, v31.s[0] +sub v28.4s, v19.4s, v5.4s +add v19.4s, v19.4s, v5.4s +sqrdmulh v5.4S, v19.4S, v9.s[1] +mul v19.4S, v19.4S,v27.s[1] +mla v19.4S, v5.4S, v31.s[0] +sub v5.4s, v22.4s, v19.4s +add v22.4s, v22.4s, v19.4s +sqrdmulh v19.4S, v28.4S, v9.s[2] +mul v28.4S, v28.4S,v27.s[2] +mla v28.4S, v19.4S, v31.s[0] +sub v19.4s, v8.4s, v28.4s +add v8.4s, v8.4s, v28.4s +trn1 v28.4S, v22.4S, v5.4S +trn2 v30.4S, v22.4S, v5.4S +trn1 v3.4S, v8.4S, v19.4S +trn2 v15.4S, v8.4S, v19.4S +trn2 v8.2D, v28.2D, v3.2D +trn2 v19.2D, v30.2D, v15.2D +trn1 v22.2D, v28.2D, v3.2D +trn1 v5.2D, v30.2D, v15.2D +sqrdmulh v15.4S, v8.4S, v26.4S 
+mul v8.4S, v8.4S,v16.4S +mla v8.4S, v15.4S, v31.s[0] +sub v15.4s, v22.4s, v8.4s +add v22.4s, v22.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v26.4S +mul v19.4S, v19.4S,v16.4S +mla v19.4S, v8.4S, v31.s[0] +sub v8.4s, v5.4s, v19.4s +add v5.4s, v5.4s, v19.4s +sqrdmulh v19.4S, v5.4S, v17.4S +mul v5.4S, v5.4S,v0.4S +mla v5.4S, v19.4S, v31.s[0] +sub v19.4s, v22.4s, v5.4s +add v22.4s, v22.4s, v5.4s +sqrdmulh v5.4S, v8.4S, v25.4S +mul v8.4S, v8.4S,v18.4S +mla v8.4S, v5.4S, v31.s[0] +sub v5.4s, v15.4s, v8.4s +add v15.4s, v15.4s, v8.4s +str q22, [x0, #960] +str q19, [x0, #976] +str q15, [x0, #992] +str q5, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2456 +// Instruction count: 2452 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_18_0.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_18_0.s new file mode 100644 index 0000000..5783747 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_18_0.s @@ -0,0 +1,2486 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or 
substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include <hal_env.h> +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 
+.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 
7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 
// Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 
+.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 
// Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 
6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 
305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 +.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, 
block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 
55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // 
Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_18_0 +.global _ntt_u32_full_neon_asm_var_4_4_18_0 +ntt_u32_full_neon_asm_var_4_4_18_0: +_ntt_u32_full_neon_asm_var_4_4_18_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x0, #992] +sqrdmulh v27.4S, v28.4S, v29.s[0] +mul v28.4S, v28.4S,v30.s[0] +ldr q26, [x0, #928] +sqrdmulh v25.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +ldr q24, [x0, #864] +sqrdmulh v23.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +ldr q22, [x0, #800] +sqrdmulh 
v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #736] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mla v28.4S, v27.4S, v31.s[0] +ldr q27, [x0, #672] +sqrdmulh v18.4S, v27.4S, v29.s[0] +mla v26.4S, v25.4S, v31.s[0] +ldr q25, [x0, #608] +sqrdmulh v17.4S, v25.4S, v29.s[0] +mla v24.4S, v23.4S, v31.s[0] +ldr q23, [x0, #544] +sqrdmulh v16.4S, v23.4S, v29.s[0] +mla v22.4S, v21.4S, v31.s[0] +ldr q21, [x0, #480] +ldr q3, [x0, #416] +mul v27.4S, v27.4S,v30.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v2.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +ldr q28, [x0, #352] +ldr q1, [x0, #288] +mla v27.4S, v18.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +sub v19.4s, v3.4s, v26.4s +add v3.4s, v3.4s, v26.4s +ldr q26, [x0, #224] +ldr q18, [x0, #160] +mul v23.4S, v23.4S,v30.s[0] +mul v25.4S, v25.4S,v30.s[0] +sub v0.4s, v28.4s, v24.4s +add v28.4s, v28.4s, v24.4s +ldr q24, [x0, #96] +ldr q15, [x0, #32] +mla v23.4S, v16.4S, v31.s[0] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v16.4s, v26.4s, v20.4s +nop +sqrdmulh v14.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +add v26.4s, v26.4s, v20.4s +nop +sqrdmulh v20.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v13.4s, v18.4s, v27.4s +add v18.4s, v18.4s, v27.4s +sqrdmulh v27.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v12.4s, v24.4s, v25.4s +add v24.4s, v24.4s, v25.4s +sqrdmulh v25.4S, v0.4S, v29.s[2] +mla v2.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v23.4s +sqrdmulh v11.4S, v17.4S, v29.s[2] +mla v19.4S, v14.4S, v31.s[0] +add v15.4s, v15.4s, v23.4s +nop +sqrdmulh v23.4S, v28.4S, v29.s[1] +mla v21.4S, v20.4S, v31.s[0] +nop +sqrdmulh v20.4S, v1.4S, v29.s[1] +mla v3.4S, v27.4S, v31.s[0] +nop +nop +ldr q27, [x17, #+32] +ldr q14, [x17, #+48] +mul v17.4S, v17.4S,v30.s[2] +mul v0.4S, v0.4S,v30.s[2] +sub v10.4s, v16.4s, v2.4s +add v16.4s, v16.4s, v2.4s +mla v17.4S, v11.4S, v31.s[0] +mla v0.4S, v25.4S, v31.s[0] +sub v25.4s, 
v13.4s, v19.4s +add v13.4s, v13.4s, v19.4s +mul v1.4S, v1.4S,v30.s[1] +mul v28.4S, v28.4S,v30.s[1] +sub v19.4s, v26.4s, v21.4s +add v26.4s, v26.4s, v21.4s +mla v1.4S, v20.4S, v31.s[0] +mla v28.4S, v23.4S, v31.s[0] +sub v23.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v14.s[3] +mul v10.4S, v10.4S,v27.s[3] +nop +nop +sqrdmulh v20.4S, v16.4S, v14.s[2] +mul v16.4S, v16.4S,v27.s[2] +sub v21.4s, v12.4s, v0.4s +add v12.4s, v12.4s, v0.4s +sqrdmulh v0.4S, v19.4S, v14.s[1] +mul v19.4S, v19.4S,v27.s[1] +sub v11.4s, v22.4s, v17.4s +add v22.4s, v22.4s, v17.4s +sqrdmulh v17.4S, v26.4S, v14.s[0] +mul v26.4S, v26.4S,v27.s[0] +sub v2.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +ldr q28, [x17, #+96] +ldr q9, [x17, #+112] +sqrdmulh v8.4S, v25.4S, v14.s[3] +mla v10.4S, v3.4S, v31.s[0] +sub v3.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v14.s[2] +mla v16.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v23.4S, v14.s[1] +mla v19.4S, v0.4S, v31.s[0] +nop +nop +sqrdmulh v0.4S, v18.4S, v14.s[0] +mla v26.4S, v17.4S, v31.s[0] +nop +nop +ldr q17, [x17, #+64] +ldr q7, [x17, #+80] +mul v13.4S, v13.4S,v27.s[2] +mul v25.4S, v25.4S,v27.s[3] +sub v6.4s, v21.4s, v10.4s +add v21.4s, v21.4s, v10.4s +mla v13.4S, v1.4S, v31.s[0] +mla v25.4S, v8.4S, v31.s[0] +sub v8.4s, v12.4s, v16.4s +add v12.4s, v12.4s, v16.4s +mul v18.4S, v18.4S,v27.s[0] +mul v23.4S, v23.4S,v27.s[1] +sub v16.4s, v2.4s, v19.4s +add v2.4s, v2.4s, v19.4s +mla v18.4S, v0.4S, v31.s[0] +mla v23.4S, v20.4S, v31.s[0] +sub v20.4s, v24.4s, v26.4s +add v24.4s, v24.4s, v26.4s +sqrdmulh v26.4S, v6.4S, v9.s[3] +mul v6.4S, v6.4S,v28.s[3] +nop +nop +sqrdmulh v0.4S, v21.4S, v9.s[2] +mul v21.4S, v21.4S,v28.s[2] +sub v19.4s, v11.4s, v25.4s +add v11.4s, v11.4s, v25.4s +sqrdmulh v25.4S, v8.4S, v9.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v1.4s, v22.4s, v13.4s +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v12.4S, v9.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v10.4s, v3.4s, v23.4s +add v3.4s, v3.4s, v23.4s 
+sqrdmulh v23.4S, v16.4S, v7.s[3] +mla v6.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v7.s[2] +mla v21.4S, v0.4S, v31.s[0] +sub v0.4s, v19.4s, v6.4s +str q0, [x0, #992] +sqrdmulh v0.4S, v20.4S, v7.s[1] +mla v8.4S, v25.4S, v31.s[0] +add v19.4s, v19.4s, v6.4s +str q19, [x0, #928] +sqrdmulh v19.4S, v24.4S, v7.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v21.4s +str q13, [x0, #864] +mul v2.4S, v2.4S,v17.s[2] +mul v16.4S, v16.4S,v17.s[3] +add v11.4s, v11.4s, v21.4s +sub v21.4s, v1.4s, v8.4s +mla v2.4S, v18.4S, v31.s[0] +mla v16.4S, v23.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +str q11, [x0, #800] +mul v24.4S, v24.4S,v17.s[0] +mul v20.4S, v20.4S,v17.s[1] +sub v11.4s, v22.4s, v12.4s +str q21, [x0, #736] +mla v24.4S, v19.4S, v31.s[0] +mla v20.4S, v0.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +str q1, [x0, #672] +ldr q1, [x0, #1008] +sqrdmulh v12.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +str q11, [x0, #608] +sub v11.4s, v10.4s, v16.4s +ldr q0, [x0, #944] +sqrdmulh v19.4S, v0.4S, v29.s[0] +mul v0.4S, v0.4S,v30.s[0] +str q22, [x0, #544] +add v10.4s, v10.4s, v16.4s +ldr q16, [x0, #880] +sqrdmulh v22.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +str q11, [x0, #480] +sub v11.4s, v3.4s, v2.4s +ldr q21, [x0, #816] +sqrdmulh v8.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +str q10, [x0, #416] +add v3.4s, v3.4s, v2.4s +ldr q2, [x0, #752] +sqrdmulh v10.4S, v2.4S, v29.s[0] +mla v1.4S, v12.4S, v31.s[0] +str q11, [x0, #352] +sub v11.4s, v26.4s, v20.4s +ldr q12, [x0, #688] +sqrdmulh v23.4S, v12.4S, v29.s[0] +mla v0.4S, v19.4S, v31.s[0] +str q3, [x0, #288] +add v26.4s, v26.4s, v20.4s +ldr q20, [x0, #624] +sqrdmulh v3.4S, v20.4S, v29.s[0] +mla v16.4S, v22.4S, v31.s[0] +str q11, [x0, #224] +sub v11.4s, v15.4s, v24.4s +ldr q22, [x0, #560] +sqrdmulh v19.4S, v22.4S, v29.s[0] +mla v21.4S, v8.4S, v31.s[0] +str q26, [x0, #160] +add v15.4s, v15.4s, v24.4s +ldr q24, [x0, #496] +ldr q26, [x0, #432] +mul v12.4S, 
v12.4S,v30.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v8.4s, v24.4s, v1.4s +add v24.4s, v24.4s, v1.4s +ldr q1, [x0, #368] +ldr q18, [x0, #304] +mla v12.4S, v23.4S, v31.s[0] +mla v2.4S, v10.4S, v31.s[0] +sub v10.4s, v26.4s, v0.4s +add v26.4s, v26.4s, v0.4s +ldr q0, [x0, #240] +ldr q23, [x0, #176] +mul v22.4S, v22.4S,v30.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v13.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +ldr q16, [x0, #112] +ldr q6, [x0, #48] +mla v22.4S, v19.4S, v31.s[0] +mla v20.4S, v3.4S, v31.s[0] +sub v3.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v19.4s, v0.4s, v2.4s +nop +sqrdmulh v25.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +add v0.4s, v0.4s, v2.4s +nop +sqrdmulh v2.4S, v24.4S, v29.s[1] +mul v24.4S, v24.4S,v30.s[1] +sub v5.4s, v23.4s, v12.4s +add v23.4s, v23.4s, v12.4s +sqrdmulh v12.4S, v26.4S, v29.s[1] +mul v26.4S, v26.4S,v30.s[1] +sub v4.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v13.4S, v29.s[2] +mla v8.4S, v21.4S, v31.s[0] +sub v21.4s, v6.4s, v22.4s +str q11, [x0, #96] +sqrdmulh v11.4S, v3.4S, v29.s[2] +mla v10.4S, v25.4S, v31.s[0] +add v6.4s, v6.4s, v22.4s +nop +sqrdmulh v22.4S, v1.4S, v29.s[1] +mla v24.4S, v2.4S, v31.s[0] +str q15, [x0, #32] +nop +sqrdmulh v15.4S, v18.4S, v29.s[1] +mla v26.4S, v12.4S, v31.s[0] +nop +nop +mul v3.4S, v3.4S,v30.s[2] +mul v13.4S, v13.4S,v30.s[2] +sub v12.4s, v19.4s, v8.4s +add v19.4s, v19.4s, v8.4s +mla v3.4S, v11.4S, v31.s[0] +mla v13.4S, v20.4S, v31.s[0] +sub v20.4s, v5.4s, v10.4s +add v5.4s, v5.4s, v10.4s +mul v18.4S, v18.4S,v30.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v10.4s, v0.4s, v24.4s +add v0.4s, v0.4s, v24.4s +mla v18.4S, v15.4S, v31.s[0] +mla v1.4S, v22.4S, v31.s[0] +sub v22.4s, v23.4s, v26.4s +add v23.4s, v23.4s, v26.4s +sqrdmulh v26.4S, v12.4S, v14.s[3] +mul v12.4S, v12.4S,v27.s[3] +nop +nop +sqrdmulh v15.4S, v19.4S, v14.s[2] +mul v19.4S, v19.4S,v27.s[2] +sub v24.4s, v4.4s, v13.4s +add v4.4s, v4.4s, v13.4s +sqrdmulh 
v13.4S, v10.4S, v14.s[1] +mul v10.4S, v10.4S,v27.s[1] +sub v11.4s, v21.4s, v3.4s +add v21.4s, v21.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v14.s[0] +mul v0.4S, v0.4S,v27.s[0] +sub v8.4s, v16.4s, v1.4s +add v16.4s, v16.4s, v1.4s +sqrdmulh v1.4S, v20.4S, v14.s[3] +mla v12.4S, v26.4S, v31.s[0] +sub v26.4s, v6.4s, v18.4s +add v6.4s, v6.4s, v18.4s +sqrdmulh v18.4S, v5.4S, v14.s[2] +mla v19.4S, v15.4S, v31.s[0] +nop +nop +sqrdmulh v15.4S, v22.4S, v14.s[1] +mla v10.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v23.4S, v14.s[0] +mla v0.4S, v3.4S, v31.s[0] +nop +nop +mul v5.4S, v5.4S,v27.s[2] +mul v20.4S, v20.4S,v27.s[3] +sub v3.4s, v24.4s, v12.4s +add v24.4s, v24.4s, v12.4s +mla v5.4S, v18.4S, v31.s[0] +mla v20.4S, v1.4S, v31.s[0] +sub v1.4s, v4.4s, v19.4s +add v4.4s, v4.4s, v19.4s +mul v23.4S, v23.4S,v27.s[0] +mul v22.4S, v22.4S,v27.s[1] +sub v19.4s, v8.4s, v10.4s +add v8.4s, v8.4s, v10.4s +mla v23.4S, v13.4S, v31.s[0] +mla v22.4S, v15.4S, v31.s[0] +sub v15.4s, v16.4s, v0.4s +add v16.4s, v16.4s, v0.4s +sqrdmulh v0.4S, v3.4S, v9.s[3] +mul v3.4S, v3.4S,v28.s[3] +nop +nop +sqrdmulh v13.4S, v24.4S, v9.s[2] +mul v24.4S, v24.4S,v28.s[2] +sub v10.4s, v11.4s, v20.4s +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v1.4S, v9.s[1] +mul v1.4S, v1.4S,v28.s[1] +sub v18.4s, v21.4s, v5.4s +add v21.4s, v21.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v9.s[0] +mul v4.4S, v4.4S,v28.s[0] +sub v12.4s, v26.4s, v22.4s +add v26.4s, v26.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v7.s[3] +mla v3.4S, v0.4S, v31.s[0] +sub v0.4s, v6.4s, v23.4s +add v6.4s, v6.4s, v23.4s +sqrdmulh v23.4S, v8.4S, v7.s[2] +mla v24.4S, v13.4S, v31.s[0] +sub v13.4s, v10.4s, v3.4s +str q13, [x0, #1008] +sqrdmulh v13.4S, v15.4S, v7.s[1] +mla v1.4S, v20.4S, v31.s[0] +add v10.4s, v10.4s, v3.4s +str q10, [x0, #944] +sqrdmulh v10.4S, v16.4S, v7.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v24.4s +str q5, [x0, #880] +mul v8.4S, v8.4S,v17.s[2] +mul v19.4S, v19.4S,v17.s[3] +add v11.4s, v11.4s, v24.4s +sub v24.4s, v18.4s, v1.4s +mla v8.4S, v23.4S, 
v31.s[0] +mla v19.4S, v22.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +str q11, [x0, #816] +mul v16.4S, v16.4S,v17.s[0] +mul v15.4S, v15.4S,v17.s[1] +sub v11.4s, v21.4s, v4.4s +str q24, [x0, #752] +mla v16.4S, v10.4S, v31.s[0] +mla v15.4S, v13.4S, v31.s[0] +add v21.4s, v21.4s, v4.4s +str q18, [x0, #688] +ldr q18, [x0, #960] +sqrdmulh v4.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +str q11, [x0, #624] +sub v11.4s, v12.4s, v19.4s +ldr q13, [x0, #896] +sqrdmulh v10.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +str q21, [x0, #560] +add v12.4s, v12.4s, v19.4s +ldr q19, [x0, #832] +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +str q11, [x0, #496] +sub v11.4s, v26.4s, v8.4s +ldr q24, [x0, #768] +sqrdmulh v1.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +str q12, [x0, #432] +add v26.4s, v26.4s, v8.4s +ldr q8, [x0, #704] +sqrdmulh v12.4S, v8.4S, v29.s[0] +mla v18.4S, v4.4S, v31.s[0] +str q11, [x0, #368] +sub v11.4s, v0.4s, v15.4s +ldr q4, [x0, #640] +sqrdmulh v22.4S, v4.4S, v29.s[0] +mla v13.4S, v10.4S, v31.s[0] +str q26, [x0, #304] +add v0.4s, v0.4s, v15.4s +ldr q15, [x0, #576] +sqrdmulh v26.4S, v15.4S, v29.s[0] +mla v19.4S, v21.4S, v31.s[0] +str q11, [x0, #240] +sub v11.4s, v6.4s, v16.4s +ldr q21, [x0, #512] +sqrdmulh v10.4S, v21.4S, v29.s[0] +mla v24.4S, v1.4S, v31.s[0] +str q0, [x0, #176] +add v6.4s, v6.4s, v16.4s +ldr q16, [x0, #448] +ldr q0, [x0, #384] +mul v4.4S, v4.4S,v30.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v1.4s, v16.4s, v18.4s +add v16.4s, v16.4s, v18.4s +ldr q18, [x0, #320] +ldr q23, [x0, #256] +mla v4.4S, v22.4S, v31.s[0] +mla v8.4S, v12.4S, v31.s[0] +sub v12.4s, v0.4s, v13.4s +add v0.4s, v0.4s, v13.4s +ldr q13, [x0, #192] +ldr q22, [x0, #128] +mul v21.4S, v21.4S,v30.s[0] +mul v15.4S, v15.4S,v30.s[0] +sub v5.4s, v18.4s, v19.4s +add v18.4s, v18.4s, v19.4s +ldr q19, [x0, #64] +ldr q3, [x0, #0] +mla v21.4S, v10.4S, v31.s[0] +mla v15.4S, v26.4S, v31.s[0] +sub v26.4s, v23.4s, v24.4s +add v23.4s, v23.4s, v24.4s +sqrdmulh v24.4S, v1.4S, 
v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +sub v10.4s, v13.4s, v8.4s +nop +sqrdmulh v20.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +add v13.4s, v13.4s, v8.4s +nop +sqrdmulh v8.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v2.4s, v22.4s, v4.4s +add v22.4s, v22.4s, v4.4s +sqrdmulh v4.4S, v0.4S, v29.s[1] +mul v0.4S, v0.4S,v30.s[1] +sub v25.4s, v19.4s, v15.4s +add v19.4s, v19.4s, v15.4s +sqrdmulh v15.4S, v5.4S, v29.s[2] +mla v1.4S, v24.4S, v31.s[0] +sub v24.4s, v3.4s, v21.4s +str q11, [x0, #112] +sqrdmulh v11.4S, v26.4S, v29.s[2] +mla v12.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v21.4s +nop +sqrdmulh v21.4S, v18.4S, v29.s[1] +mla v16.4S, v8.4S, v31.s[0] +str q6, [x0, #48] +nop +sqrdmulh v6.4S, v23.4S, v29.s[1] +mla v0.4S, v4.4S, v31.s[0] +nop +nop +mul v26.4S, v26.4S,v30.s[2] +mul v5.4S, v5.4S,v30.s[2] +sub v4.4s, v10.4s, v1.4s +add v10.4s, v10.4s, v1.4s +mla v26.4S, v11.4S, v31.s[0] +mla v5.4S, v15.4S, v31.s[0] +sub v15.4s, v2.4s, v12.4s +add v2.4s, v2.4s, v12.4s +mul v23.4S, v23.4S,v30.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v12.4s, v13.4s, v16.4s +add v13.4s, v13.4s, v16.4s +mla v23.4S, v6.4S, v31.s[0] +mla v18.4S, v21.4S, v31.s[0] +sub v21.4s, v22.4s, v0.4s +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v4.4S, v14.s[3] +mul v4.4S, v4.4S,v27.s[3] +nop +nop +sqrdmulh v6.4S, v10.4S, v14.s[2] +mul v10.4S, v10.4S,v27.s[2] +sub v16.4s, v25.4s, v5.4s +add v25.4s, v25.4s, v5.4s +sqrdmulh v5.4S, v12.4S, v14.s[1] +mul v12.4S, v12.4S,v27.s[1] +sub v11.4s, v24.4s, v26.4s +add v24.4s, v24.4s, v26.4s +sqrdmulh v26.4S, v13.4S, v14.s[0] +mul v13.4S, v13.4S,v27.s[0] +sub v1.4s, v19.4s, v18.4s +add v19.4s, v19.4s, v18.4s +sqrdmulh v18.4S, v15.4S, v14.s[3] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v3.4s, v23.4s +add v3.4s, v3.4s, v23.4s +sqrdmulh v23.4S, v2.4S, v14.s[2] +mla v10.4S, v6.4S, v31.s[0] +nop +nop +sqrdmulh v6.4S, v21.4S, v14.s[1] +mla v12.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v22.4S, v14.s[0] +mla v13.4S, v26.4S, v31.s[0] +nop +nop +mul v2.4S, v2.4S,v27.s[2] 
+mul v15.4S, v15.4S,v27.s[3] +sub v26.4s, v16.4s, v4.4s +add v16.4s, v16.4s, v4.4s +mla v2.4S, v23.4S, v31.s[0] +mla v15.4S, v18.4S, v31.s[0] +sub v18.4s, v25.4s, v10.4s +add v25.4s, v25.4s, v10.4s +mul v22.4S, v22.4S,v27.s[0] +mul v21.4S, v21.4S,v27.s[1] +sub v10.4s, v1.4s, v12.4s +add v1.4s, v1.4s, v12.4s +mla v22.4S, v5.4S, v31.s[0] +mla v21.4S, v6.4S, v31.s[0] +sub v6.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +sqrdmulh v13.4S, v26.4S, v9.s[3] +mul v26.4S, v26.4S,v28.s[3] +nop +nop +sqrdmulh v5.4S, v16.4S, v9.s[2] +mul v16.4S, v16.4S,v28.s[2] +sub v12.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v18.4S, v9.s[1] +mul v18.4S, v18.4S,v28.s[1] +sub v23.4s, v24.4s, v2.4s +add v24.4s, v24.4s, v2.4s +sqrdmulh v2.4S, v25.4S, v9.s[0] +mul v25.4S, v25.4S,v28.s[0] +sub v4.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +sqrdmulh v21.4S, v10.4S, v7.s[3] +mla v26.4S, v13.4S, v31.s[0] +sub v13.4s, v3.4s, v22.4s +add v3.4s, v3.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v7.s[2] +mla v16.4S, v5.4S, v31.s[0] +sub v5.4s, v12.4s, v26.4s +str q5, [x0, #960] +sqrdmulh v5.4S, v6.4S, v7.s[1] +mla v18.4S, v15.4S, v31.s[0] +add v12.4s, v12.4s, v26.4s +str q12, [x0, #896] +sqrdmulh v12.4S, v19.4S, v7.s[0] +mla v25.4S, v2.4S, v31.s[0] +sub v2.4s, v11.4s, v16.4s +str q2, [x0, #832] +mul v1.4S, v1.4S,v17.s[2] +mul v10.4S, v10.4S,v17.s[3] +add v11.4s, v11.4s, v16.4s +sub v16.4s, v23.4s, v18.4s +mla v1.4S, v22.4S, v31.s[0] +mla v10.4S, v21.4S, v31.s[0] +add v23.4s, v23.4s, v18.4s +str q11, [x0, #768] +mul v19.4S, v19.4S,v17.s[0] +mul v6.4S, v6.4S,v17.s[1] +sub v11.4s, v24.4s, v25.4s +str q16, [x0, #704] +mla v19.4S, v12.4S, v31.s[0] +mla v6.4S, v5.4S, v31.s[0] +add v24.4s, v24.4s, v25.4s +str q23, [x0, #640] +ldr q23, [x0, #976] +sqrdmulh v25.4S, v23.4S, v29.s[0] +mul v23.4S, v23.4S,v30.s[0] +str q11, [x0, #576] +sub v11.4s, v4.4s, v10.4s +ldr q5, [x0, #912] +sqrdmulh v12.4S, v5.4S, v29.s[0] +mul v5.4S, v5.4S,v30.s[0] +str q24, [x0, #512] +add v4.4s, v4.4s, v10.4s +ldr q10, [x0, 
#848] +sqrdmulh v24.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +str q11, [x0, #448] +sub v11.4s, v0.4s, v1.4s +ldr q16, [x0, #784] +sqrdmulh v18.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +str q4, [x0, #384] +add v0.4s, v0.4s, v1.4s +ldr q1, [x0, #720] +sqrdmulh v4.4S, v1.4S, v29.s[0] +mla v23.4S, v25.4S, v31.s[0] +str q11, [x0, #320] +sub v11.4s, v13.4s, v6.4s +ldr q25, [x0, #656] +sqrdmulh v21.4S, v25.4S, v29.s[0] +mla v5.4S, v12.4S, v31.s[0] +str q0, [x0, #256] +add v13.4s, v13.4s, v6.4s +ldr q6, [x0, #592] +sqrdmulh v0.4S, v6.4S, v29.s[0] +mla v10.4S, v24.4S, v31.s[0] +str q11, [x0, #192] +sub v11.4s, v3.4s, v19.4s +ldr q24, [x0, #528] +sqrdmulh v12.4S, v24.4S, v29.s[0] +mla v16.4S, v18.4S, v31.s[0] +str q13, [x0, #128] +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #464] +ldr q13, [x0, #400] +mul v25.4S, v25.4S,v30.s[0] +mul v1.4S, v1.4S,v30.s[0] +sub v18.4s, v19.4s, v23.4s +add v19.4s, v19.4s, v23.4s +ldr q23, [x0, #336] +ldr q22, [x0, #272] +mla v25.4S, v21.4S, v31.s[0] +mla v1.4S, v4.4S, v31.s[0] +sub v4.4s, v13.4s, v5.4s +add v13.4s, v13.4s, v5.4s +ldr q5, [x0, #208] +ldr q21, [x0, #144] +mul v24.4S, v24.4S,v30.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v2.4s, v23.4s, v10.4s +add v23.4s, v23.4s, v10.4s +ldr q10, [x0, #80] +ldr q26, [x0, #16] +mla v24.4S, v12.4S, v31.s[0] +mla v6.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v16.4s +add v22.4s, v22.4s, v16.4s +sqrdmulh v16.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v12.4s, v5.4s, v1.4s +nop +sqrdmulh v15.4S, v4.4S, v29.s[2] +mul v4.4S, v4.4S,v30.s[2] +add v5.4s, v5.4s, v1.4s +nop +sqrdmulh v1.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +sub v8.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v20.4s, v10.4s, v6.4s +add v10.4s, v10.4s, v6.4s +sqrdmulh v6.4S, v2.4S, v29.s[2] +mla v18.4S, v16.4S, v31.s[0] +sub v16.4s, v26.4s, v24.4s +str q11, [x0, #64] +sqrdmulh v11.4S, v0.4S, v29.s[2] +mla v4.4S, v15.4S, v31.s[0] +add v26.4s, 
v26.4s, v24.4s +nop +sqrdmulh v24.4S, v23.4S, v29.s[1] +mla v19.4S, v1.4S, v31.s[0] +str q3, [x0, #0] +nop +sqrdmulh v3.4S, v22.4S, v29.s[1] +mla v13.4S, v25.4S, v31.s[0] +nop +nop +mul v0.4S, v0.4S,v30.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v25.4s, v12.4s, v18.4s +add v12.4s, v12.4s, v18.4s +mla v0.4S, v11.4S, v31.s[0] +mla v2.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v4.4s +add v8.4s, v8.4s, v4.4s +mul v22.4S, v22.4S,v30.s[1] +mul v23.4S, v23.4S,v30.s[1] +sub v4.4s, v5.4s, v19.4s +add v5.4s, v5.4s, v19.4s +mla v22.4S, v3.4S, v31.s[0] +mla v23.4S, v24.4S, v31.s[0] +sub v24.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v29.4S, v25.4S, v14.s[3] +mul v25.4S, v25.4S,v27.s[3] +nop +nop +sqrdmulh v30.4S, v12.4S, v14.s[2] +mul v12.4S, v12.4S,v27.s[2] +sub v13.4s, v20.4s, v2.4s +add v20.4s, v20.4s, v2.4s +sqrdmulh v2.4S, v4.4S, v14.s[1] +mul v4.4S, v4.4S,v27.s[1] +sub v3.4s, v16.4s, v0.4s +add v16.4s, v16.4s, v0.4s +sqrdmulh v0.4S, v5.4S, v14.s[0] +mul v5.4S, v5.4S,v27.s[0] +sub v19.4s, v10.4s, v23.4s +add v10.4s, v10.4s, v23.4s +sqrdmulh v23.4S, v6.4S, v14.s[3] +mla v25.4S, v29.4S, v31.s[0] +sub v29.4s, v26.4s, v22.4s +add v26.4s, v26.4s, v22.4s +sqrdmulh v22.4S, v8.4S, v14.s[2] +mla v12.4S, v30.4S, v31.s[0] +nop +nop +sqrdmulh v30.4S, v24.4S, v14.s[1] +mla v4.4S, v2.4S, v31.s[0] +nop +nop +sqrdmulh v2.4S, v21.4S, v14.s[0] +mla v5.4S, v0.4S, v31.s[0] +nop +nop +mul v8.4S, v8.4S,v27.s[2] +mul v6.4S, v6.4S,v27.s[3] +sub v0.4s, v13.4s, v25.4s +add v13.4s, v13.4s, v25.4s +mla v8.4S, v22.4S, v31.s[0] +mla v6.4S, v23.4S, v31.s[0] +sub v23.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +mul v21.4S, v21.4S,v27.s[0] +mul v24.4S, v24.4S,v27.s[1] +sub v12.4s, v19.4s, v4.4s +add v19.4s, v19.4s, v4.4s +mla v21.4S, v2.4S, v31.s[0] +mla v24.4S, v30.4S, v31.s[0] +sub v30.4s, v10.4s, v5.4s +add v10.4s, v10.4s, v5.4s +sqrdmulh v14.4S, v0.4S, v9.s[3] +mul v0.4S, v0.4S,v28.s[3] +nop +nop +sqrdmulh v27.4S, v13.4S, v9.s[2] +mul v13.4S, v13.4S,v28.s[2] +sub v5.4s, v3.4s, v6.4s +add 
v3.4s, v3.4s, v6.4s +sqrdmulh v6.4S, v23.4S, v9.s[1] +mul v23.4S, v23.4S,v28.s[1] +sub v2.4s, v16.4s, v8.4s +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v20.4S, v9.s[0] +mul v20.4S, v20.4S,v28.s[0] +sub v4.4s, v29.4s, v24.4s +add v29.4s, v29.4s, v24.4s +sqrdmulh v9.4S, v12.4S, v7.s[3] +mla v0.4S, v14.4S, v31.s[0] +sub v14.4s, v26.4s, v21.4s +add v26.4s, v26.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v7.s[2] +mla v13.4S, v27.4S, v31.s[0] +sub v27.4s, v5.4s, v0.4s +str q27, [x0, #976] +sqrdmulh v27.4S, v30.4S, v7.s[1] +mla v23.4S, v6.4S, v31.s[0] +add v5.4s, v5.4s, v0.4s +str q5, [x0, #912] +sqrdmulh v5.4S, v10.4S, v7.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v3.4s, v13.4s +str q8, [x0, #848] +mul v19.4S, v19.4S,v17.s[2] +mul v12.4S, v12.4S,v17.s[3] +add v3.4s, v3.4s, v13.4s +sub v13.4s, v2.4s, v23.4s +mla v19.4S, v21.4S, v31.s[0] +mla v12.4S, v9.4S, v31.s[0] +add v2.4s, v2.4s, v23.4s +str q3, [x0, #784] +mul v10.4S, v10.4S,v17.s[0] +mul v30.4S, v30.4S,v17.s[1] +sub v3.4s, v16.4s, v20.4s +str q13, [x0, #720] +mla v10.4S, v5.4S, v31.s[0] +mla v30.4S, v27.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +str q2, [x0, #656] +str q3, [x0, #592] +sub v3.4s, v4.4s, v12.4s +str q16, [x0, #528] +add v4.4s, v4.4s, v12.4s +str q3, [x0, #464] +sub v3.4s, v29.4s, v19.4s +str q4, [x0, #400] +add v29.4s, v29.4s, v19.4s +str q3, [x0, #336] +sub v3.4s, v14.4s, v30.4s +str q29, [x0, #272] +add v14.4s, v14.4s, v30.4s +str q3, [x0, #208] +sub v3.4s, v26.4s, v10.4s +str q14, [x0, #144] +add v26.4s, v26.4s, v10.4s +str q3, [x0, #80] +str q26, [x0, #16] +ldr q15, [x17, #+128] +ldr q1, [x17, #+144] +ldr q18, [x17, #+160] +ldr q11, [x17, #+176] +ldr q25, [x17, #+192] +ldr q22, [x17, #+208] +ldr q24, [x17, #+224] +ldr q28, [x17, #+240] +ldr q6, [x0, #32] +ldr q0, [x0, #48] +ldr q8, [x0, #0] +ldr q21, [x0, #16] +sqrdmulh v9.4S, v6.4S, v1.s[0] +mul v6.4S, v6.4S,v15.s[0] +mla v6.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v6.4s +add v8.4s, v8.4s, v6.4s +sqrdmulh v6.4S, v0.4S, v1.s[0] +mul v0.4S, v0.4S,v15.s[0] 
+mla v0.4S, v6.4S, v31.s[0] +sub v6.4s, v21.4s, v0.4s +add v21.4s, v21.4s, v0.4s +sqrdmulh v0.4S, v21.4S, v1.s[1] +mul v21.4S, v21.4S,v15.s[1] +mla v21.4S, v0.4S, v31.s[0] +sub v0.4s, v8.4s, v21.4s +add v8.4s, v8.4s, v21.4s +sqrdmulh v21.4S, v6.4S, v1.s[2] +mul v6.4S, v6.4S,v15.s[2] +mla v6.4S, v21.4S, v31.s[0] +sub v21.4s, v9.4s, v6.4s +add v9.4s, v9.4s, v6.4s +trn1 v6.4S, v8.4S, v0.4S +trn2 v23.4S, v8.4S, v0.4S +trn1 v13.4S, v9.4S, v21.4S +trn2 v5.4S, v9.4S, v21.4S +trn2 v9.2D, v6.2D, v13.2D +trn2 v21.2D, v23.2D, v5.2D +trn1 v8.2D, v6.2D, v13.2D +trn1 v0.2D, v23.2D, v5.2D +sqrdmulh v5.4S, v9.4S, v11.4S +mul v9.4S, v9.4S,v18.4S +mla v9.4S, v5.4S, v31.s[0] +sub v5.4s, v8.4s, v9.4s +add v8.4s, v8.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v11.4S +mul v21.4S, v21.4S,v18.4S +mla v21.4S, v9.4S, v31.s[0] +sub v9.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v22.4S +mul v0.4S, v0.4S,v25.4S +mla v0.4S, v21.4S, v31.s[0] +sub v21.4s, v8.4s, v0.4s +add v8.4s, v8.4s, v0.4s +sqrdmulh v0.4S, v9.4S, v28.4S +mul v9.4S, v9.4S,v24.4S +mla v9.4S, v0.4S, v31.s[0] +sub v0.4s, v5.4s, v9.4s +add v5.4s, v5.4s, v9.4s +str q8, [x0, #0] +str q21, [x0, #16] +str q5, [x0, #32] +str q0, [x0, #48] +ldr q0, [x17, #+256] +ldr q5, [x17, #+272] +ldr q21, [x17, #+288] +ldr q8, [x17, #+304] +ldr q9, [x17, #+320] +ldr q23, [x17, #+336] +ldr q13, [x17, #+352] +ldr q6, [x17, #+368] +ldr q28, [x0, #96] +ldr q24, [x0, #112] +ldr q22, [x0, #64] +ldr q25, [x0, #80] +sqrdmulh v11.4S, v28.4S, v5.s[0] +mul v28.4S, v28.4S,v0.s[0] +mla v28.4S, v11.4S, v31.s[0] +sub v11.4s, v22.4s, v28.4s +add v22.4s, v22.4s, v28.4s +sqrdmulh v28.4S, v24.4S, v5.s[0] +mul v24.4S, v24.4S,v0.s[0] +mla v24.4S, v28.4S, v31.s[0] +sub v28.4s, v25.4s, v24.4s +add v25.4s, v25.4s, v24.4s +sqrdmulh v24.4S, v25.4S, v5.s[1] +mul v25.4S, v25.4S,v0.s[1] +mla v25.4S, v24.4S, v31.s[0] +sub v24.4s, v22.4s, v25.4s +add v22.4s, v22.4s, v25.4s +sqrdmulh v25.4S, v28.4S, v5.s[2] +mul v28.4S, v28.4S,v0.s[2] +mla v28.4S, v25.4S, v31.s[0] 
+sub v25.4s, v11.4s, v28.4s +add v11.4s, v11.4s, v28.4s +trn1 v28.4S, v22.4S, v24.4S +trn2 v18.4S, v22.4S, v24.4S +trn1 v1.4S, v11.4S, v25.4S +trn2 v15.4S, v11.4S, v25.4S +trn2 v11.2D, v28.2D, v1.2D +trn2 v25.2D, v18.2D, v15.2D +trn1 v22.2D, v28.2D, v1.2D +trn1 v24.2D, v18.2D, v15.2D +sqrdmulh v15.4S, v11.4S, v8.4S +mul v11.4S, v11.4S,v21.4S +mla v11.4S, v15.4S, v31.s[0] +sub v15.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v25.4S, v8.4S +mul v25.4S, v25.4S,v21.4S +mla v25.4S, v11.4S, v31.s[0] +sub v11.4s, v24.4s, v25.4s +add v24.4s, v24.4s, v25.4s +sqrdmulh v25.4S, v24.4S, v23.4S +mul v24.4S, v24.4S,v9.4S +mla v24.4S, v25.4S, v31.s[0] +sub v25.4s, v22.4s, v24.4s +add v22.4s, v22.4s, v24.4s +sqrdmulh v24.4S, v11.4S, v6.4S +mul v11.4S, v11.4S,v13.4S +mla v11.4S, v24.4S, v31.s[0] +sub v24.4s, v15.4s, v11.4s +add v15.4s, v15.4s, v11.4s +str q22, [x0, #64] +str q25, [x0, #80] +str q15, [x0, #96] +str q24, [x0, #112] +ldr q24, [x17, #+384] +ldr q15, [x17, #+400] +ldr q25, [x17, #+416] +ldr q22, [x17, #+432] +ldr q11, [x17, #+448] +ldr q18, [x17, #+464] +ldr q1, [x17, #+480] +ldr q28, [x17, #+496] +ldr q6, [x0, #160] +ldr q13, [x0, #176] +ldr q23, [x0, #128] +ldr q9, [x0, #144] +sqrdmulh v8.4S, v6.4S, v15.s[0] +mul v6.4S, v6.4S,v24.s[0] +mla v6.4S, v8.4S, v31.s[0] +sub v8.4s, v23.4s, v6.4s +add v23.4s, v23.4s, v6.4s +sqrdmulh v6.4S, v13.4S, v15.s[0] +mul v13.4S, v13.4S,v24.s[0] +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v13.4s +add v9.4s, v9.4s, v13.4s +sqrdmulh v13.4S, v9.4S, v15.s[1] +mul v9.4S, v9.4S,v24.s[1] +mla v9.4S, v13.4S, v31.s[0] +sub v13.4s, v23.4s, v9.4s +add v23.4s, v23.4s, v9.4s +sqrdmulh v9.4S, v6.4S, v15.s[2] +mul v6.4S, v6.4S,v24.s[2] +mla v6.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v6.4s +add v8.4s, v8.4s, v6.4s +trn1 v6.4S, v23.4S, v13.4S +trn2 v21.4S, v23.4S, v13.4S +trn1 v5.4S, v8.4S, v9.4S +trn2 v0.4S, v8.4S, v9.4S +trn2 v8.2D, v6.2D, v5.2D +trn2 v9.2D, v21.2D, v0.2D +trn1 v23.2D, v6.2D, v5.2D +trn1 v13.2D, v21.2D, v0.2D 
+sqrdmulh v0.4S, v8.4S, v22.4S +mul v8.4S, v8.4S,v25.4S +mla v8.4S, v0.4S, v31.s[0] +sub v0.4s, v23.4s, v8.4s +add v23.4s, v23.4s, v8.4s +sqrdmulh v8.4S, v9.4S, v22.4S +mul v9.4S, v9.4S,v25.4S +mla v9.4S, v8.4S, v31.s[0] +sub v8.4s, v13.4s, v9.4s +add v13.4s, v13.4s, v9.4s +sqrdmulh v9.4S, v13.4S, v18.4S +mul v13.4S, v13.4S,v11.4S +mla v13.4S, v9.4S, v31.s[0] +sub v9.4s, v23.4s, v13.4s +add v23.4s, v23.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v28.4S +mul v8.4S, v8.4S,v1.4S +mla v8.4S, v13.4S, v31.s[0] +sub v13.4s, v0.4s, v8.4s +add v0.4s, v0.4s, v8.4s +str q23, [x0, #128] +str q9, [x0, #144] +str q0, [x0, #160] +str q13, [x0, #176] +ldr q13, [x17, #+512] +ldr q0, [x17, #+528] +ldr q9, [x17, #+544] +ldr q23, [x17, #+560] +ldr q8, [x17, #+576] +ldr q21, [x17, #+592] +ldr q5, [x17, #+608] +ldr q6, [x17, #+624] +ldr q28, [x0, #224] +ldr q1, [x0, #240] +ldr q18, [x0, #192] +ldr q11, [x0, #208] +sqrdmulh v22.4S, v28.4S, v0.s[0] +mul v28.4S, v28.4S,v13.s[0] +mla v28.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v28.4s +add v18.4s, v18.4s, v28.4s +sqrdmulh v28.4S, v1.4S, v0.s[0] +mul v1.4S, v1.4S,v13.s[0] +mla v1.4S, v28.4S, v31.s[0] +sub v28.4s, v11.4s, v1.4s +add v11.4s, v11.4s, v1.4s +sqrdmulh v1.4S, v11.4S, v0.s[1] +mul v11.4S, v11.4S,v13.s[1] +mla v11.4S, v1.4S, v31.s[0] +sub v1.4s, v18.4s, v11.4s +add v18.4s, v18.4s, v11.4s +sqrdmulh v11.4S, v28.4S, v0.s[2] +mul v28.4S, v28.4S,v13.s[2] +mla v28.4S, v11.4S, v31.s[0] +sub v11.4s, v22.4s, v28.4s +add v22.4s, v22.4s, v28.4s +trn1 v28.4S, v18.4S, v1.4S +trn2 v25.4S, v18.4S, v1.4S +trn1 v15.4S, v22.4S, v11.4S +trn2 v24.4S, v22.4S, v11.4S +trn2 v22.2D, v28.2D, v15.2D +trn2 v11.2D, v25.2D, v24.2D +trn1 v18.2D, v28.2D, v15.2D +trn1 v1.2D, v25.2D, v24.2D +sqrdmulh v24.4S, v22.4S, v23.4S +mul v22.4S, v22.4S,v9.4S +mla v22.4S, v24.4S, v31.s[0] +sub v24.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v11.4S, v23.4S +mul v11.4S, v11.4S,v9.4S +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v11.4s +add v1.4s, v1.4s, v11.4s 
+sqrdmulh v11.4S, v1.4S, v21.4S +mul v1.4S, v1.4S,v8.4S +mla v1.4S, v11.4S, v31.s[0] +sub v11.4s, v18.4s, v1.4s +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v22.4S, v6.4S +mul v22.4S, v22.4S,v5.4S +mla v22.4S, v1.4S, v31.s[0] +sub v1.4s, v24.4s, v22.4s +add v24.4s, v24.4s, v22.4s +str q18, [x0, #192] +str q11, [x0, #208] +str q24, [x0, #224] +str q1, [x0, #240] +ldr q1, [x17, #+640] +ldr q24, [x17, #+656] +ldr q11, [x17, #+672] +ldr q18, [x17, #+688] +ldr q22, [x17, #+704] +ldr q25, [x17, #+720] +ldr q15, [x17, #+736] +ldr q28, [x17, #+752] +ldr q6, [x0, #288] +ldr q5, [x0, #304] +ldr q21, [x0, #256] +ldr q8, [x0, #272] +sqrdmulh v23.4S, v6.4S, v24.s[0] +mul v6.4S, v6.4S,v1.s[0] +mla v6.4S, v23.4S, v31.s[0] +sub v23.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +sqrdmulh v6.4S, v5.4S, v24.s[0] +mul v5.4S, v5.4S,v1.s[0] +mla v5.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v5.4s +add v8.4s, v8.4s, v5.4s +sqrdmulh v5.4S, v8.4S, v24.s[1] +mul v8.4S, v8.4S,v1.s[1] +mla v8.4S, v5.4S, v31.s[0] +sub v5.4s, v21.4s, v8.4s +add v21.4s, v21.4s, v8.4s +sqrdmulh v8.4S, v6.4S, v24.s[2] +mul v6.4S, v6.4S,v1.s[2] +mla v6.4S, v8.4S, v31.s[0] +sub v8.4s, v23.4s, v6.4s +add v23.4s, v23.4s, v6.4s +trn1 v6.4S, v21.4S, v5.4S +trn2 v9.4S, v21.4S, v5.4S +trn1 v0.4S, v23.4S, v8.4S +trn2 v13.4S, v23.4S, v8.4S +trn2 v23.2D, v6.2D, v0.2D +trn2 v8.2D, v9.2D, v13.2D +trn1 v21.2D, v6.2D, v0.2D +trn1 v5.2D, v9.2D, v13.2D +sqrdmulh v13.4S, v23.4S, v18.4S +mul v23.4S, v23.4S,v11.4S +mla v23.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v23.4s +add v21.4s, v21.4s, v23.4s +sqrdmulh v23.4S, v8.4S, v18.4S +mul v8.4S, v8.4S,v11.4S +mla v8.4S, v23.4S, v31.s[0] +sub v23.4s, v5.4s, v8.4s +add v5.4s, v5.4s, v8.4s +sqrdmulh v8.4S, v5.4S, v25.4S +mul v5.4S, v5.4S,v22.4S +mla v5.4S, v8.4S, v31.s[0] +sub v8.4s, v21.4s, v5.4s +add v21.4s, v21.4s, v5.4s +sqrdmulh v5.4S, v23.4S, v28.4S +mul v23.4S, v23.4S,v15.4S +mla v23.4S, v5.4S, v31.s[0] +sub v5.4s, v13.4s, v23.4s +add v13.4s, v13.4s, v23.4s +str q21, [x0, #256] +str q8, 
[x0, #272] +str q13, [x0, #288] +str q5, [x0, #304] +ldr q5, [x17, #+768] +ldr q13, [x17, #+784] +ldr q8, [x17, #+800] +ldr q21, [x17, #+816] +ldr q23, [x17, #+832] +ldr q9, [x17, #+848] +ldr q0, [x17, #+864] +ldr q6, [x17, #+880] +ldr q28, [x0, #352] +ldr q15, [x0, #368] +ldr q25, [x0, #320] +ldr q22, [x0, #336] +sqrdmulh v18.4S, v28.4S, v13.s[0] +mul v28.4S, v28.4S,v5.s[0] +mla v28.4S, v18.4S, v31.s[0] +sub v18.4s, v25.4s, v28.4s +add v25.4s, v25.4s, v28.4s +sqrdmulh v28.4S, v15.4S, v13.s[0] +mul v15.4S, v15.4S,v5.s[0] +mla v15.4S, v28.4S, v31.s[0] +sub v28.4s, v22.4s, v15.4s +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v22.4S, v13.s[1] +mul v22.4S, v22.4S,v5.s[1] +mla v22.4S, v15.4S, v31.s[0] +sub v15.4s, v25.4s, v22.4s +add v25.4s, v25.4s, v22.4s +sqrdmulh v22.4S, v28.4S, v13.s[2] +mul v28.4S, v28.4S,v5.s[2] +mla v28.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v28.4s +add v18.4s, v18.4s, v28.4s +trn1 v28.4S, v25.4S, v15.4S +trn2 v11.4S, v25.4S, v15.4S +trn1 v24.4S, v18.4S, v22.4S +trn2 v1.4S, v18.4S, v22.4S +trn2 v18.2D, v28.2D, v24.2D +trn2 v22.2D, v11.2D, v1.2D +trn1 v25.2D, v28.2D, v24.2D +trn1 v15.2D, v11.2D, v1.2D +sqrdmulh v1.4S, v18.4S, v21.4S +mul v18.4S, v18.4S,v8.4S +mla v18.4S, v1.4S, v31.s[0] +sub v1.4s, v25.4s, v18.4s +add v25.4s, v25.4s, v18.4s +sqrdmulh v18.4S, v22.4S, v21.4S +mul v22.4S, v22.4S,v8.4S +mla v22.4S, v18.4S, v31.s[0] +sub v18.4s, v15.4s, v22.4s +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v15.4S, v9.4S +mul v15.4S, v15.4S,v23.4S +mla v15.4S, v22.4S, v31.s[0] +sub v22.4s, v25.4s, v15.4s +add v25.4s, v25.4s, v15.4s +sqrdmulh v15.4S, v18.4S, v6.4S +mul v18.4S, v18.4S,v0.4S +mla v18.4S, v15.4S, v31.s[0] +sub v15.4s, v1.4s, v18.4s +add v1.4s, v1.4s, v18.4s +str q25, [x0, #320] +str q22, [x0, #336] +str q1, [x0, #352] +str q15, [x0, #368] +ldr q15, [x17, #+896] +ldr q1, [x17, #+912] +ldr q22, [x17, #+928] +ldr q25, [x17, #+944] +ldr q18, [x17, #+960] +ldr q11, [x17, #+976] +ldr q24, [x17, #+992] +ldr q28, [x17, #+1008] +ldr q6, [x0, #416] 
+ldr q0, [x0, #432] +ldr q9, [x0, #384] +ldr q23, [x0, #400] +sqrdmulh v21.4S, v6.4S, v1.s[0] +mul v6.4S, v6.4S,v15.s[0] +mla v6.4S, v21.4S, v31.s[0] +sub v21.4s, v9.4s, v6.4s +add v9.4s, v9.4s, v6.4s +sqrdmulh v6.4S, v0.4S, v1.s[0] +mul v0.4S, v0.4S,v15.s[0] +mla v0.4S, v6.4S, v31.s[0] +sub v6.4s, v23.4s, v0.4s +add v23.4s, v23.4s, v0.4s +sqrdmulh v0.4S, v23.4S, v1.s[1] +mul v23.4S, v23.4S,v15.s[1] +mla v23.4S, v0.4S, v31.s[0] +sub v0.4s, v9.4s, v23.4s +add v9.4s, v9.4s, v23.4s +sqrdmulh v23.4S, v6.4S, v1.s[2] +mul v6.4S, v6.4S,v15.s[2] +mla v6.4S, v23.4S, v31.s[0] +sub v23.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +trn1 v6.4S, v9.4S, v0.4S +trn2 v8.4S, v9.4S, v0.4S +trn1 v13.4S, v21.4S, v23.4S +trn2 v5.4S, v21.4S, v23.4S +trn2 v21.2D, v6.2D, v13.2D +trn2 v23.2D, v8.2D, v5.2D +trn1 v9.2D, v6.2D, v13.2D +trn1 v0.2D, v8.2D, v5.2D +sqrdmulh v5.4S, v21.4S, v25.4S +mul v21.4S, v21.4S,v22.4S +mla v21.4S, v5.4S, v31.s[0] +sub v5.4s, v9.4s, v21.4s +add v9.4s, v9.4s, v21.4s +sqrdmulh v21.4S, v23.4S, v25.4S +mul v23.4S, v23.4S,v22.4S +mla v23.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v23.4s +add v0.4s, v0.4s, v23.4s +sqrdmulh v23.4S, v0.4S, v11.4S +mul v0.4S, v0.4S,v18.4S +mla v0.4S, v23.4S, v31.s[0] +sub v23.4s, v9.4s, v0.4s +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v21.4S, v28.4S +mul v21.4S, v21.4S,v24.4S +mla v21.4S, v0.4S, v31.s[0] +sub v0.4s, v5.4s, v21.4s +add v5.4s, v5.4s, v21.4s +str q9, [x0, #384] +str q23, [x0, #400] +str q5, [x0, #416] +str q0, [x0, #432] +ldr q0, [x17, #+1024] +ldr q5, [x17, #+1040] +ldr q23, [x17, #+1056] +ldr q9, [x17, #+1072] +ldr q21, [x17, #+1088] +ldr q8, [x17, #+1104] +ldr q13, [x17, #+1120] +ldr q6, [x17, #+1136] +ldr q28, [x0, #480] +ldr q24, [x0, #496] +ldr q11, [x0, #448] +ldr q18, [x0, #464] +sqrdmulh v25.4S, v28.4S, v5.s[0] +mul v28.4S, v28.4S,v0.s[0] +mla v28.4S, v25.4S, v31.s[0] +sub v25.4s, v11.4s, v28.4s +add v11.4s, v11.4s, v28.4s +sqrdmulh v28.4S, v24.4S, v5.s[0] +mul v24.4S, v24.4S,v0.s[0] +mla v24.4S, v28.4S, v31.s[0] 
+sub v28.4s, v18.4s, v24.4s +add v18.4s, v18.4s, v24.4s +sqrdmulh v24.4S, v18.4S, v5.s[1] +mul v18.4S, v18.4S,v0.s[1] +mla v18.4S, v24.4S, v31.s[0] +sub v24.4s, v11.4s, v18.4s +add v11.4s, v11.4s, v18.4s +sqrdmulh v18.4S, v28.4S, v5.s[2] +mul v28.4S, v28.4S,v0.s[2] +mla v28.4S, v18.4S, v31.s[0] +sub v18.4s, v25.4s, v28.4s +add v25.4s, v25.4s, v28.4s +trn1 v28.4S, v11.4S, v24.4S +trn2 v22.4S, v11.4S, v24.4S +trn1 v1.4S, v25.4S, v18.4S +trn2 v15.4S, v25.4S, v18.4S +trn2 v25.2D, v28.2D, v1.2D +trn2 v18.2D, v22.2D, v15.2D +trn1 v11.2D, v28.2D, v1.2D +trn1 v24.2D, v22.2D, v15.2D +sqrdmulh v15.4S, v25.4S, v9.4S +mul v25.4S, v25.4S,v23.4S +mla v25.4S, v15.4S, v31.s[0] +sub v15.4s, v11.4s, v25.4s +add v11.4s, v11.4s, v25.4s +sqrdmulh v25.4S, v18.4S, v9.4S +mul v18.4S, v18.4S,v23.4S +mla v18.4S, v25.4S, v31.s[0] +sub v25.4s, v24.4s, v18.4s +add v24.4s, v24.4s, v18.4s +sqrdmulh v18.4S, v24.4S, v8.4S +mul v24.4S, v24.4S,v21.4S +mla v24.4S, v18.4S, v31.s[0] +sub v18.4s, v11.4s, v24.4s +add v11.4s, v11.4s, v24.4s +sqrdmulh v24.4S, v25.4S, v6.4S +mul v25.4S, v25.4S,v13.4S +mla v25.4S, v24.4S, v31.s[0] +sub v24.4s, v15.4s, v25.4s +add v15.4s, v15.4s, v25.4s +str q11, [x0, #448] +str q18, [x0, #464] +str q15, [x0, #480] +str q24, [x0, #496] +ldr q24, [x17, #+1152] +ldr q15, [x17, #+1168] +ldr q18, [x17, #+1184] +ldr q11, [x17, #+1200] +ldr q25, [x17, #+1216] +ldr q22, [x17, #+1232] +ldr q1, [x17, #+1248] +ldr q28, [x17, #+1264] +ldr q6, [x0, #544] +ldr q13, [x0, #560] +ldr q8, [x0, #512] +ldr q21, [x0, #528] +sqrdmulh v9.4S, v6.4S, v15.s[0] +mul v6.4S, v6.4S,v24.s[0] +mla v6.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v6.4s +add v8.4s, v8.4s, v6.4s +sqrdmulh v6.4S, v13.4S, v15.s[0] +mul v13.4S, v13.4S,v24.s[0] +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v21.4S, v15.s[1] +mul v21.4S, v21.4S,v24.s[1] +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v8.4s, v21.4s +add v8.4s, v8.4s, v21.4s +sqrdmulh v21.4S, v6.4S, v15.s[2] +mul v6.4S, 
v6.4S,v24.s[2] +mla v6.4S, v21.4S, v31.s[0] +sub v21.4s, v9.4s, v6.4s +add v9.4s, v9.4s, v6.4s +trn1 v6.4S, v8.4S, v13.4S +trn2 v23.4S, v8.4S, v13.4S +trn1 v5.4S, v9.4S, v21.4S +trn2 v0.4S, v9.4S, v21.4S +trn2 v9.2D, v6.2D, v5.2D +trn2 v21.2D, v23.2D, v0.2D +trn1 v8.2D, v6.2D, v5.2D +trn1 v13.2D, v23.2D, v0.2D +sqrdmulh v0.4S, v9.4S, v11.4S +mul v9.4S, v9.4S,v18.4S +mla v9.4S, v0.4S, v31.s[0] +sub v0.4s, v8.4s, v9.4s +add v8.4s, v8.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v11.4S +mul v21.4S, v21.4S,v18.4S +mla v21.4S, v9.4S, v31.s[0] +sub v9.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v22.4S +mul v13.4S, v13.4S,v25.4S +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +sqrdmulh v13.4S, v9.4S, v28.4S +mul v9.4S, v9.4S,v1.4S +mla v9.4S, v13.4S, v31.s[0] +sub v13.4s, v0.4s, v9.4s +add v0.4s, v0.4s, v9.4s +str q8, [x0, #512] +str q21, [x0, #528] +str q0, [x0, #544] +str q13, [x0, #560] +ldr q13, [x17, #+1280] +ldr q0, [x17, #+1296] +ldr q21, [x17, #+1312] +ldr q8, [x17, #+1328] +ldr q9, [x17, #+1344] +ldr q23, [x17, #+1360] +ldr q5, [x17, #+1376] +ldr q6, [x17, #+1392] +ldr q28, [x0, #608] +ldr q1, [x0, #624] +ldr q22, [x0, #576] +ldr q25, [x0, #592] +sqrdmulh v11.4S, v28.4S, v0.s[0] +mul v28.4S, v28.4S,v13.s[0] +mla v28.4S, v11.4S, v31.s[0] +sub v11.4s, v22.4s, v28.4s +add v22.4s, v22.4s, v28.4s +sqrdmulh v28.4S, v1.4S, v0.s[0] +mul v1.4S, v1.4S,v13.s[0] +mla v1.4S, v28.4S, v31.s[0] +sub v28.4s, v25.4s, v1.4s +add v25.4s, v25.4s, v1.4s +sqrdmulh v1.4S, v25.4S, v0.s[1] +mul v25.4S, v25.4S,v13.s[1] +mla v25.4S, v1.4S, v31.s[0] +sub v1.4s, v22.4s, v25.4s +add v22.4s, v22.4s, v25.4s +sqrdmulh v25.4S, v28.4S, v0.s[2] +mul v28.4S, v28.4S,v13.s[2] +mla v28.4S, v25.4S, v31.s[0] +sub v25.4s, v11.4s, v28.4s +add v11.4s, v11.4s, v28.4s +trn1 v28.4S, v22.4S, v1.4S +trn2 v18.4S, v22.4S, v1.4S +trn1 v15.4S, v11.4S, v25.4S +trn2 v24.4S, v11.4S, v25.4S +trn2 v11.2D, v28.2D, v15.2D +trn2 v25.2D, v18.2D, v24.2D +trn1 v22.2D, 
v28.2D, v15.2D +trn1 v1.2D, v18.2D, v24.2D +sqrdmulh v24.4S, v11.4S, v8.4S +mul v11.4S, v11.4S,v21.4S +mla v11.4S, v24.4S, v31.s[0] +sub v24.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v25.4S, v8.4S +mul v25.4S, v25.4S,v21.4S +mla v25.4S, v11.4S, v31.s[0] +sub v11.4s, v1.4s, v25.4s +add v1.4s, v1.4s, v25.4s +sqrdmulh v25.4S, v1.4S, v23.4S +mul v1.4S, v1.4S,v9.4S +mla v1.4S, v25.4S, v31.s[0] +sub v25.4s, v22.4s, v1.4s +add v22.4s, v22.4s, v1.4s +sqrdmulh v1.4S, v11.4S, v6.4S +mul v11.4S, v11.4S,v5.4S +mla v11.4S, v1.4S, v31.s[0] +sub v1.4s, v24.4s, v11.4s +add v24.4s, v24.4s, v11.4s +str q22, [x0, #576] +str q25, [x0, #592] +str q24, [x0, #608] +str q1, [x0, #624] +ldr q1, [x17, #+1408] +ldr q24, [x17, #+1424] +ldr q25, [x17, #+1440] +ldr q22, [x17, #+1456] +ldr q11, [x17, #+1472] +ldr q18, [x17, #+1488] +ldr q15, [x17, #+1504] +ldr q28, [x17, #+1520] +ldr q6, [x0, #672] +ldr q5, [x0, #688] +ldr q23, [x0, #640] +ldr q9, [x0, #656] +sqrdmulh v8.4S, v6.4S, v24.s[0] +mul v6.4S, v6.4S,v1.s[0] +mla v6.4S, v8.4S, v31.s[0] +sub v8.4s, v23.4s, v6.4s +add v23.4s, v23.4s, v6.4s +sqrdmulh v6.4S, v5.4S, v24.s[0] +mul v5.4S, v5.4S,v1.s[0] +mla v5.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v5.4s +add v9.4s, v9.4s, v5.4s +sqrdmulh v5.4S, v9.4S, v24.s[1] +mul v9.4S, v9.4S,v1.s[1] +mla v9.4S, v5.4S, v31.s[0] +sub v5.4s, v23.4s, v9.4s +add v23.4s, v23.4s, v9.4s +sqrdmulh v9.4S, v6.4S, v24.s[2] +mul v6.4S, v6.4S,v1.s[2] +mla v6.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v6.4s +add v8.4s, v8.4s, v6.4s +trn1 v6.4S, v23.4S, v5.4S +trn2 v21.4S, v23.4S, v5.4S +trn1 v0.4S, v8.4S, v9.4S +trn2 v13.4S, v8.4S, v9.4S +trn2 v8.2D, v6.2D, v0.2D +trn2 v9.2D, v21.2D, v13.2D +trn1 v23.2D, v6.2D, v0.2D +trn1 v5.2D, v21.2D, v13.2D +sqrdmulh v13.4S, v8.4S, v22.4S +mul v8.4S, v8.4S,v25.4S +mla v8.4S, v13.4S, v31.s[0] +sub v13.4s, v23.4s, v8.4s +add v23.4s, v23.4s, v8.4s +sqrdmulh v8.4S, v9.4S, v22.4S +mul v9.4S, v9.4S,v25.4S +mla v9.4S, v8.4S, v31.s[0] +sub v8.4s, v5.4s, v9.4s +add v5.4s, 
v5.4s, v9.4s +sqrdmulh v9.4S, v5.4S, v18.4S +mul v5.4S, v5.4S,v11.4S +mla v5.4S, v9.4S, v31.s[0] +sub v9.4s, v23.4s, v5.4s +add v23.4s, v23.4s, v5.4s +sqrdmulh v5.4S, v8.4S, v28.4S +mul v8.4S, v8.4S,v15.4S +mla v8.4S, v5.4S, v31.s[0] +sub v5.4s, v13.4s, v8.4s +add v13.4s, v13.4s, v8.4s +str q23, [x0, #640] +str q9, [x0, #656] +str q13, [x0, #672] +str q5, [x0, #688] +ldr q5, [x17, #+1536] +ldr q13, [x17, #+1552] +ldr q9, [x17, #+1568] +ldr q23, [x17, #+1584] +ldr q8, [x17, #+1600] +ldr q21, [x17, #+1616] +ldr q0, [x17, #+1632] +ldr q6, [x17, #+1648] +ldr q28, [x0, #736] +ldr q15, [x0, #752] +ldr q18, [x0, #704] +ldr q11, [x0, #720] +sqrdmulh v22.4S, v28.4S, v13.s[0] +mul v28.4S, v28.4S,v5.s[0] +mla v28.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v28.4s +add v18.4s, v18.4s, v28.4s +sqrdmulh v28.4S, v15.4S, v13.s[0] +mul v15.4S, v15.4S,v5.s[0] +mla v15.4S, v28.4S, v31.s[0] +sub v28.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v13.s[1] +mul v11.4S, v11.4S,v5.s[1] +mla v11.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v11.4s +add v18.4s, v18.4s, v11.4s +sqrdmulh v11.4S, v28.4S, v13.s[2] +mul v28.4S, v28.4S,v5.s[2] +mla v28.4S, v11.4S, v31.s[0] +sub v11.4s, v22.4s, v28.4s +add v22.4s, v22.4s, v28.4s +trn1 v28.4S, v18.4S, v15.4S +trn2 v25.4S, v18.4S, v15.4S +trn1 v24.4S, v22.4S, v11.4S +trn2 v1.4S, v22.4S, v11.4S +trn2 v22.2D, v28.2D, v24.2D +trn2 v11.2D, v25.2D, v1.2D +trn1 v18.2D, v28.2D, v24.2D +trn1 v15.2D, v25.2D, v1.2D +sqrdmulh v1.4S, v22.4S, v23.4S +mul v22.4S, v22.4S,v9.4S +mla v22.4S, v1.4S, v31.s[0] +sub v1.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v11.4S, v23.4S +mul v11.4S, v11.4S,v9.4S +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v11.4s +add v15.4s, v15.4s, v11.4s +sqrdmulh v11.4S, v15.4S, v21.4S +mul v15.4S, v15.4S,v8.4S +mla v15.4S, v11.4S, v31.s[0] +sub v11.4s, v18.4s, v15.4s +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v22.4S, v6.4S +mul v22.4S, v22.4S,v0.4S +mla v22.4S, v15.4S, v31.s[0] +sub v15.4s, 
v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +str q18, [x0, #704] +str q11, [x0, #720] +str q1, [x0, #736] +str q15, [x0, #752] +ldr q15, [x17, #+1664] +ldr q1, [x17, #+1680] +ldr q11, [x17, #+1696] +ldr q18, [x17, #+1712] +ldr q22, [x17, #+1728] +ldr q25, [x17, #+1744] +ldr q24, [x17, #+1760] +ldr q28, [x17, #+1776] +ldr q6, [x0, #800] +ldr q0, [x0, #816] +ldr q21, [x0, #768] +ldr q8, [x0, #784] +sqrdmulh v23.4S, v6.4S, v1.s[0] +mul v6.4S, v6.4S,v15.s[0] +mla v6.4S, v23.4S, v31.s[0] +sub v23.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +sqrdmulh v6.4S, v0.4S, v1.s[0] +mul v0.4S, v0.4S,v15.s[0] +mla v0.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v0.4s +add v8.4s, v8.4s, v0.4s +sqrdmulh v0.4S, v8.4S, v1.s[1] +mul v8.4S, v8.4S,v15.s[1] +mla v8.4S, v0.4S, v31.s[0] +sub v0.4s, v21.4s, v8.4s +add v21.4s, v21.4s, v8.4s +sqrdmulh v8.4S, v6.4S, v1.s[2] +mul v6.4S, v6.4S,v15.s[2] +mla v6.4S, v8.4S, v31.s[0] +sub v8.4s, v23.4s, v6.4s +add v23.4s, v23.4s, v6.4s +trn1 v6.4S, v21.4S, v0.4S +trn2 v9.4S, v21.4S, v0.4S +trn1 v13.4S, v23.4S, v8.4S +trn2 v5.4S, v23.4S, v8.4S +trn2 v23.2D, v6.2D, v13.2D +trn2 v8.2D, v9.2D, v5.2D +trn1 v21.2D, v6.2D, v13.2D +trn1 v0.2D, v9.2D, v5.2D +sqrdmulh v5.4S, v23.4S, v18.4S +mul v23.4S, v23.4S,v11.4S +mla v23.4S, v5.4S, v31.s[0] +sub v5.4s, v21.4s, v23.4s +add v21.4s, v21.4s, v23.4s +sqrdmulh v23.4S, v8.4S, v18.4S +mul v8.4S, v8.4S,v11.4S +mla v8.4S, v23.4S, v31.s[0] +sub v23.4s, v0.4s, v8.4s +add v0.4s, v0.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v25.4S +mul v0.4S, v0.4S,v22.4S +mla v0.4S, v8.4S, v31.s[0] +sub v8.4s, v21.4s, v0.4s +add v21.4s, v21.4s, v0.4s +sqrdmulh v0.4S, v23.4S, v28.4S +mul v23.4S, v23.4S,v24.4S +mla v23.4S, v0.4S, v31.s[0] +sub v0.4s, v5.4s, v23.4s +add v5.4s, v5.4s, v23.4s +str q21, [x0, #768] +str q8, [x0, #784] +str q5, [x0, #800] +str q0, [x0, #816] +ldr q0, [x17, #+1792] +ldr q5, [x17, #+1808] +ldr q8, [x17, #+1824] +ldr q21, [x17, #+1840] +ldr q23, [x17, #+1856] +ldr q9, [x17, #+1872] +ldr q13, [x17, #+1888] +ldr q6, [x17, #+1904] 
+ldr q28, [x0, #864] +ldr q24, [x0, #880] +ldr q25, [x0, #832] +ldr q22, [x0, #848] +sqrdmulh v18.4S, v28.4S, v5.s[0] +mul v28.4S, v28.4S,v0.s[0] +mla v28.4S, v18.4S, v31.s[0] +sub v18.4s, v25.4s, v28.4s +add v25.4s, v25.4s, v28.4s +sqrdmulh v28.4S, v24.4S, v5.s[0] +mul v24.4S, v24.4S,v0.s[0] +mla v24.4S, v28.4S, v31.s[0] +sub v28.4s, v22.4s, v24.4s +add v22.4s, v22.4s, v24.4s +sqrdmulh v24.4S, v22.4S, v5.s[1] +mul v22.4S, v22.4S,v0.s[1] +mla v22.4S, v24.4S, v31.s[0] +sub v24.4s, v25.4s, v22.4s +add v25.4s, v25.4s, v22.4s +sqrdmulh v22.4S, v28.4S, v5.s[2] +mul v28.4S, v28.4S,v0.s[2] +mla v28.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v28.4s +add v18.4s, v18.4s, v28.4s +trn1 v28.4S, v25.4S, v24.4S +trn2 v11.4S, v25.4S, v24.4S +trn1 v1.4S, v18.4S, v22.4S +trn2 v15.4S, v18.4S, v22.4S +trn2 v18.2D, v28.2D, v1.2D +trn2 v22.2D, v11.2D, v15.2D +trn1 v25.2D, v28.2D, v1.2D +trn1 v24.2D, v11.2D, v15.2D +sqrdmulh v15.4S, v18.4S, v21.4S +mul v18.4S, v18.4S,v8.4S +mla v18.4S, v15.4S, v31.s[0] +sub v15.4s, v25.4s, v18.4s +add v25.4s, v25.4s, v18.4s +sqrdmulh v18.4S, v22.4S, v21.4S +mul v22.4S, v22.4S,v8.4S +mla v22.4S, v18.4S, v31.s[0] +sub v18.4s, v24.4s, v22.4s +add v24.4s, v24.4s, v22.4s +sqrdmulh v22.4S, v24.4S, v9.4S +mul v24.4S, v24.4S,v23.4S +mla v24.4S, v22.4S, v31.s[0] +sub v22.4s, v25.4s, v24.4s +add v25.4s, v25.4s, v24.4s +sqrdmulh v24.4S, v18.4S, v6.4S +mul v18.4S, v18.4S,v13.4S +mla v18.4S, v24.4S, v31.s[0] +sub v24.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +str q25, [x0, #832] +str q22, [x0, #848] +str q15, [x0, #864] +str q24, [x0, #880] +ldr q24, [x17, #+1920] +ldr q15, [x17, #+1936] +ldr q22, [x17, #+1952] +ldr q25, [x17, #+1968] +ldr q18, [x17, #+1984] +ldr q11, [x17, #+2000] +ldr q1, [x17, #+2016] +ldr q28, [x17, #+2032] +ldr q6, [x0, #928] +ldr q13, [x0, #944] +ldr q9, [x0, #896] +ldr q23, [x0, #912] +sqrdmulh v21.4S, v6.4S, v15.s[0] +mul v6.4S, v6.4S,v24.s[0] +mla v6.4S, v21.4S, v31.s[0] +sub v21.4s, v9.4s, v6.4s +add v9.4s, v9.4s, v6.4s +sqrdmulh v6.4S, 
v13.4S, v15.s[0] +mul v13.4S, v13.4S,v24.s[0] +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v23.4s, v13.4s +add v23.4s, v23.4s, v13.4s +sqrdmulh v13.4S, v23.4S, v15.s[1] +mul v23.4S, v23.4S,v24.s[1] +mla v23.4S, v13.4S, v31.s[0] +sub v13.4s, v9.4s, v23.4s +add v9.4s, v9.4s, v23.4s +sqrdmulh v23.4S, v6.4S, v15.s[2] +mul v6.4S, v6.4S,v24.s[2] +mla v6.4S, v23.4S, v31.s[0] +sub v23.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +trn1 v6.4S, v9.4S, v13.4S +trn2 v8.4S, v9.4S, v13.4S +trn1 v5.4S, v21.4S, v23.4S +trn2 v0.4S, v21.4S, v23.4S +trn2 v21.2D, v6.2D, v5.2D +trn2 v23.2D, v8.2D, v0.2D +trn1 v9.2D, v6.2D, v5.2D +trn1 v13.2D, v8.2D, v0.2D +sqrdmulh v0.4S, v21.4S, v25.4S +mul v21.4S, v21.4S,v22.4S +mla v21.4S, v0.4S, v31.s[0] +sub v0.4s, v9.4s, v21.4s +add v9.4s, v9.4s, v21.4s +sqrdmulh v21.4S, v23.4S, v25.4S +mul v23.4S, v23.4S,v22.4S +mla v23.4S, v21.4S, v31.s[0] +sub v21.4s, v13.4s, v23.4s +add v13.4s, v13.4s, v23.4s +sqrdmulh v23.4S, v13.4S, v11.4S +mul v13.4S, v13.4S,v18.4S +mla v13.4S, v23.4S, v31.s[0] +sub v23.4s, v9.4s, v13.4s +add v9.4s, v9.4s, v13.4s +sqrdmulh v13.4S, v21.4S, v28.4S +mul v21.4S, v21.4S,v1.4S +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +str q9, [x0, #896] +str q23, [x0, #912] +str q0, [x0, #928] +str q13, [x0, #944] +ldr q13, [x17, #+2048] +ldr q0, [x17, #+2064] +ldr q23, [x17, #+2080] +ldr q9, [x17, #+2096] +ldr q21, [x17, #+2112] +ldr q8, [x17, #+2128] +ldr q5, [x17, #+2144] +ldr q6, [x17, #+2160] +ldr q28, [x0, #992] +ldr q1, [x0, #1008] +ldr q11, [x0, #960] +ldr q18, [x0, #976] +sqrdmulh v25.4S, v28.4S, v0.s[0] +mul v28.4S, v28.4S,v13.s[0] +mla v28.4S, v25.4S, v31.s[0] +sub v25.4s, v11.4s, v28.4s +add v11.4s, v11.4s, v28.4s +sqrdmulh v28.4S, v1.4S, v0.s[0] +mul v1.4S, v1.4S,v13.s[0] +mla v1.4S, v28.4S, v31.s[0] +sub v28.4s, v18.4s, v1.4s +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v18.4S, v0.s[1] +mul v18.4S, v18.4S,v13.s[1] +mla v18.4S, v1.4S, v31.s[0] +sub v1.4s, v11.4s, v18.4s +add v11.4s, v11.4s, v18.4s 
+sqrdmulh v18.4S, v28.4S, v0.s[2] +mul v28.4S, v28.4S,v13.s[2] +mla v28.4S, v18.4S, v31.s[0] +sub v18.4s, v25.4s, v28.4s +add v25.4s, v25.4s, v28.4s +trn1 v28.4S, v11.4S, v1.4S +trn2 v22.4S, v11.4S, v1.4S +trn1 v15.4S, v25.4S, v18.4S +trn2 v24.4S, v25.4S, v18.4S +trn2 v25.2D, v28.2D, v15.2D +trn2 v18.2D, v22.2D, v24.2D +trn1 v11.2D, v28.2D, v15.2D +trn1 v1.2D, v22.2D, v24.2D +sqrdmulh v24.4S, v25.4S, v9.4S +mul v25.4S, v25.4S,v23.4S +mla v25.4S, v24.4S, v31.s[0] +sub v24.4s, v11.4s, v25.4s +add v11.4s, v11.4s, v25.4s +sqrdmulh v25.4S, v18.4S, v9.4S +mul v18.4S, v18.4S,v23.4S +mla v18.4S, v25.4S, v31.s[0] +sub v25.4s, v1.4s, v18.4s +add v1.4s, v1.4s, v18.4s +sqrdmulh v18.4S, v1.4S, v8.4S +mul v1.4S, v1.4S,v21.4S +mla v1.4S, v18.4S, v31.s[0] +sub v18.4s, v11.4s, v1.4s +add v11.4s, v11.4s, v1.4s +sqrdmulh v1.4S, v25.4S, v6.4S +mul v25.4S, v25.4S,v5.4S +mla v25.4S, v1.4S, v31.s[0] +sub v1.4s, v24.4s, v25.4s +add v24.4s, v24.4s, v25.4s +str q11, [x0, #960] +str q18, [x0, #976] +str q24, [x0, #992] +str q1, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2456 +// Instruction count: 2452 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_1_0.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_1_0.s new file mode 100644 index 0000000..d6c8d9d --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_1_0.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the 
"Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 
1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 
6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 
+.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, 
block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 
31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 
26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 
+.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // 
Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 
28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 
608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_1_0 +.global _ntt_u32_full_neon_asm_var_4_4_1_0 +ntt_u32_full_neon_asm_var_4_4_1_0: +_ntt_u32_full_neon_asm_var_4_4_1_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +ldr q2, [x0, #544] +ldr q1, [x0, #608] +ldr q0, [x0, #672] +ldr q15, [x0, #736] +ldr q14, [x0, #32] +ldr q13, [x0, #96] +ldr q12, [x0, #160] +ldr q11, [x0, #224] +sqrdmulh v10.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sqrdmulh v9.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +mla v22.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v21.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +mla v20.4S, v10.4S, v31.s[0] +sub v10.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s 
+sqrdmulh v21.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +mla v19.4S, v22.4S, v31.s[0] +sub v22.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +mla v2.4S, v21.4S, v31.s[0] +sub v21.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v0.4S, v29.s[0] +mul v0.4S, v0.4S,v30.s[0] +mla v1.4S, v20.4S, v31.s[0] +sub v20.4s, v14.4s, v2.4s +add v14.4s, v14.4s, v2.4s +sqrdmulh v2.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +mla v0.4S, v19.4S, v31.s[0] +sub v19.4s, v13.4s, v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v12.4s, v0.4s +add v12.4s, v12.4s, v0.4s +sqrdmulh v0.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v16.4S, v1.4S, v31.s[0] +sub v1.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +mla v3.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v16.4s +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +mla v18.4S, v15.4S, v31.s[0] +sub v15.4s, v11.4s, v3.4s +add v11.4s, v11.4s, v3.4s +sqrdmulh v3.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v14.4s, v18.4s +add v14.4s, v14.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +mla v22.4S, v3.4S, v31.s[0] +sub v3.4s, v13.4s, v17.4s +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v2.4s, v22.4s +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +mla v9.4S, v17.4S, v31.s[0] +sub v17.4s, v1.4s, v21.4s +add v1.4s, v1.4s, v21.4s +sqrdmulh v21.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v20.4s, v9.4s +add v20.4s, v20.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v27.s[0] +mul v11.4S, v11.4S,v28.s[0] +mla v12.4S, v21.4S, v31.s[0] +sub 
v21.4s, v19.4s, v10.4s +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +mla v11.4S, v9.4S, v31.s[0] +sub v9.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +mla v0.4S, v10.4S, v31.s[0] +sub v10.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +mla v15.4S, v12.4S, v31.s[0] +sub v12.4s, v16.4s, v0.4s +add v16.4s, v16.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +mla v2.4S, v11.4S, v31.s[0] +sub v11.4s, v3.4s, v15.4s +add v3.4s, v3.4s, v15.4s +sqrdmulh v15.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +mla v1.4S, v0.4S, v31.s[0] +sub v0.4s, v20.4s, v2.4s +add v20.4s, v20.4s, v2.4s +sqrdmulh v2.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +mla v18.4S, v15.4S, v31.s[0] +sub v15.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v25.s[0] +mul v13.4S, v13.4S,v26.s[0] +mla v17.4S, v2.4S, v31.s[0] +sub v2.4s, v22.4s, v18.4s +add v22.4s, v22.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v25.s[1] +mul v10.4S, v10.4S,v26.s[1] +mla v13.4S, v1.4S, v31.s[0] +sub v1.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v25.s[2] +mul v3.4S, v3.4S,v26.s[2] +mla v10.4S, v18.4S, v31.s[0] +sub v18.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +sqrdmulh v13.4S, v11.4S, v25.s[3] +mul v11.4S, v11.4S,v26.s[3] +mla v3.4S, v17.4S, v31.s[0] +sub v17.4s, v9.4s, v10.4s +add v9.4s, v9.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v23.s[0] +mul v19.4S, v19.4S,v24.s[0] +mla v11.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v15.4S, v23.s[1] +mul v15.4S, v15.4S,v24.s[1] +mla v19.4S, v10.4S, v31.s[0] +sub v10.4s, v12.4s, v11.4s +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v23.s[2] +mul v21.4S, v21.4S,v24.s[2] +mla v15.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v23.s[3] +mul 
v1.4S, v1.4S,v24.s[3] +mla v21.4S, v11.4S, v31.s[0] +sub v11.4s, v0.4s, v15.4s +add v0.4s, v0.4s, v15.4s +mla v1.4S, v19.4S, v31.s[0] +sub v19.4s, v22.4s, v21.4s +add v22.4s, v22.4s, v21.4s +sub v21.4s, v2.4s, v1.4s +add v2.4s, v2.4s, v1.4s +str q14, [x0, #32] +str q18, [x0, #96] +str q9, [x0, #160] +str q17, [x0, #224] +str q16, [x0, #288] +str q13, [x0, #352] +str q12, [x0, #416] +str q10, [x0, #480] +str q20, [x0, #544] +str q3, [x0, #608] +str q0, [x0, #672] +str q11, [x0, #736] +str q22, [x0, #800] +str q19, [x0, #864] +str q2, [x0, #928] +str q21, [x0, #992] +ldr q21, [x0, #816] +ldr q2, [x0, #880] +ldr q19, [x0, #944] +ldr q22, [x0, #1008] +ldr q11, [x0, #304] +ldr q0, [x0, #368] +ldr q3, [x0, #432] +ldr q20, [x0, #496] +ldr q10, [x0, #560] +ldr q12, [x0, #624] +ldr q13, [x0, #688] +ldr q16, [x0, #752] +ldr q17, [x0, #48] +ldr q9, [x0, #112] +ldr q18, [x0, #176] +ldr q14, [x0, #240] +sqrdmulh v1.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sqrdmulh v15.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +mla v21.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +mla v2.4S, v15.4S, v31.s[0] +sub v15.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +mla v19.4S, v1.4S, v31.s[0] +sub v1.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +mla v10.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v22.4s +add v20.4s, v20.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +mla v12.4S, v19.4S, v31.s[0] +sub v19.4s, v17.4s, v10.4s +add v17.4s, v17.4s, v10.4s +sqrdmulh v10.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v9.4s, v12.4s +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] 
+mla v16.4S, v10.4S, v31.s[0] +sub v10.4s, v18.4s, v13.4s +add v18.4s, v18.4s, v13.4s +sqrdmulh v13.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +mla v3.4S, v12.4S, v31.s[0] +sub v12.4s, v14.4s, v16.4s +add v14.4s, v14.4s, v16.4s +sqrdmulh v16.4S, v11.4S, v29.s[1] +mul v11.4S, v11.4S,v30.s[1] +mla v20.4S, v13.4S, v31.s[0] +sub v13.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v29.s[1] +mul v0.4S, v0.4S,v30.s[1] +mla v11.4S, v16.4S, v31.s[0] +sub v16.4s, v14.4s, v20.4s +add v14.4s, v14.4s, v20.4s +sqrdmulh v20.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +mla v0.4S, v3.4S, v31.s[0] +sub v3.4s, v17.4s, v11.4s +add v17.4s, v17.4s, v11.4s +sqrdmulh v11.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +mla v21.4S, v20.4S, v31.s[0] +sub v20.4s, v9.4s, v0.4s +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +mla v2.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +mla v15.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v2.4s +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v18.4S, v27.s[0] +mul v18.4S, v18.4S,v28.s[0] +mla v1.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v15.4s +add v19.4s, v19.4s, v15.4s +sqrdmulh v15.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +mla v18.4S, v2.4S, v31.s[0] +sub v2.4s, v22.4s, v1.4s +add v22.4s, v22.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v27.s[1] +mul v13.4S, v13.4S,v28.s[1] +mla v14.4S, v15.4S, v31.s[0] +sub v15.4s, v17.4s, v18.4s +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +mla v13.4S, v1.4S, v31.s[0] +sub v1.4s, v9.4s, v14.4s +add v9.4s, v9.4s, v14.4s +sqrdmulh v14.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +mla v16.4S, v18.4S, v31.s[0] +sub v18.4s, v3.4s, v13.4s +add v3.4s, v3.4s, v13.4s +sqrdmulh v13.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +mla v10.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v16.4s +add v20.4s, v20.4s, v16.4s 
+sqrdmulh v16.4S, v11.4S, v27.s[3] +mul v11.4S, v11.4S,v28.s[3] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v19.4s, v10.4s +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v27.s[3] +mul v0.4S, v0.4S,v28.s[3] +mla v11.4S, v16.4S, v31.s[0] +sub v16.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v9.4S, v25.s[0] +mul v9.4S, v9.4S,v26.s[0] +mla v0.4S, v10.4S, v31.s[0] +sub v10.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v1.4S, v25.s[1] +mul v1.4S, v1.4S,v26.s[1] +mla v9.4S, v12.4S, v31.s[0] +sub v12.4s, v2.4s, v0.4s +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v20.4S, v25.s[2] +mul v20.4S, v20.4S,v26.s[2] +mla v1.4S, v11.4S, v31.s[0] +sub v11.4s, v17.4s, v9.4s +add v17.4s, v17.4s, v9.4s +sqrdmulh v9.4S, v14.4S, v25.s[3] +mul v14.4S, v14.4S,v26.s[3] +mla v20.4S, v0.4S, v31.s[0] +sub v0.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s +sqrdmulh v1.4S, v22.4S, v23.s[0] +mul v22.4S, v22.4S,v24.s[0] +mla v14.4S, v9.4S, v31.s[0] +sub v9.4s, v3.4s, v20.4s +add v3.4s, v3.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v23.s[1] +mul v16.4S, v16.4S,v24.s[1] +mla v22.4S, v1.4S, v31.s[0] +sub v1.4s, v18.4s, v14.4s +add v18.4s, v18.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v23.s[2] +mul v2.4S, v2.4S,v24.s[2] +mla v16.4S, v20.4S, v31.s[0] +sub v20.4s, v19.4s, v22.4s +add v19.4s, v19.4s, v22.4s +sqrdmulh v22.4S, v12.4S, v23.s[3] +mul v12.4S, v12.4S,v24.s[3] +mla v2.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v16.4s +add v13.4s, v13.4s, v16.4s +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v2.4s +add v21.4s, v21.4s, v2.4s +sub v2.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +str q17, [x0, #48] +str q11, [x0, #112] +str q15, [x0, #176] +str q0, [x0, #240] +str q3, [x0, #304] +str q9, [x0, #368] +str q18, [x0, #432] +str q1, [x0, #496] +str q19, [x0, #560] +str q20, [x0, #624] +str q13, [x0, #688] +str q14, [x0, #752] +str q21, [x0, #816] +str q22, [x0, #880] +str q10, [x0, #944] +str q2, [x0, #1008] +ldr q2, [x0, #768] +ldr q10, [x0, #832] +ldr q22, [x0, #896] +ldr 
q21, [x0, #960] +ldr q14, [x0, #256] +ldr q13, [x0, #320] +ldr q20, [x0, #384] +ldr q19, [x0, #448] +ldr q1, [x0, #512] +ldr q18, [x0, #576] +ldr q9, [x0, #640] +ldr q3, [x0, #704] +ldr q0, [x0, #0] +ldr q15, [x0, #64] +ldr q11, [x0, #128] +ldr q17, [x0, #192] +sqrdmulh v12.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sqrdmulh v16.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +mla v2.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +mla v10.4S, v16.4S, v31.s[0] +sub v16.4s, v14.4s, v2.4s +add v14.4s, v14.4s, v2.4s +sqrdmulh v2.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +mla v22.4S, v12.4S, v31.s[0] +sub v12.4s, v13.4s, v10.4s +add v13.4s, v13.4s, v10.4s +sqrdmulh v10.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +mla v21.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v22.4s +add v20.4s, v20.4s, v22.4s +sqrdmulh v22.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +mla v1.4S, v10.4S, v31.s[0] +sub v10.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +sqrdmulh v21.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +mla v18.4S, v22.4S, v31.s[0] +sub v22.4s, v0.4s, v1.4s +add v0.4s, v0.4s, v1.4s +sqrdmulh v1.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +mla v9.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +mla v3.4S, v1.4S, v31.s[0] +sub v1.4s, v11.4s, v9.4s +add v11.4s, v11.4s, v9.4s +sqrdmulh v9.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +mla v20.4S, v18.4S, v31.s[0] +sub v18.4s, v17.4s, v3.4s +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +mla v19.4S, v9.4S, v31.s[0] +sub v9.4s, v11.4s, v20.4s +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +mla v14.4S, v3.4S, v31.s[0] +sub v3.4s, v17.4s, v19.4s +add v17.4s, v17.4s, v19.4s +sqrdmulh v19.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +mla v13.4S, v20.4S, v31.s[0] +sub v20.4s, v0.4s, v14.4s 
+add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +mla v2.4S, v19.4S, v31.s[0] +sub v19.4s, v15.4s, v13.4s +add v15.4s, v15.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +mla v10.4S, v14.4S, v31.s[0] +sub v14.4s, v1.4s, v2.4s +add v1.4s, v1.4s, v2.4s +sqrdmulh v2.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +mla v16.4S, v13.4S, v31.s[0] +sub v13.4s, v18.4s, v10.4s +add v18.4s, v18.4s, v10.4s +sqrdmulh v10.4S, v11.4S, v27.s[0] +mul v11.4S, v11.4S,v28.s[0] +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v22.4s, v16.4s +add v22.4s, v22.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +mla v11.4S, v10.4S, v31.s[0] +sub v10.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v9.4S, v27.s[1] +mul v9.4S, v9.4S,v28.s[1] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v0.4s, v11.4s +add v0.4s, v0.4s, v11.4s +sqrdmulh v11.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +mla v9.4S, v12.4S, v31.s[0] +sub v12.4s, v15.4s, v17.4s +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +mla v3.4S, v11.4S, v31.s[0] +sub v11.4s, v20.4s, v9.4s +add v20.4s, v20.4s, v9.4s +sqrdmulh v9.4S, v18.4S, v27.s[2] +mul v18.4S, v18.4S,v28.s[2] +mla v1.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v27.s[3] +mul v14.4S, v14.4S,v28.s[3] +mla v18.4S, v9.4S, v31.s[0] +sub v9.4s, v22.4s, v1.4s +add v22.4s, v22.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v27.s[3] +mul v13.4S, v13.4S,v28.s[3] +mla v14.4S, v3.4S, v31.s[0] +sub v3.4s, v21.4s, v18.4s +add v21.4s, v21.4s, v18.4s +sqrdmulh v18.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +mla v13.4S, v1.4S, v31.s[0] +sub v1.4s, v2.4s, v14.4s +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +mla v15.4S, v18.4S, v31.s[0] +sub v18.4s, v10.4s, v13.4s +add v10.4s, v10.4s, v13.4s +sqrdmulh v13.4S, v19.4S, v25.s[2] +mul v19.4S, v19.4S,v26.s[2] 
+mla v12.4S, v14.4S, v31.s[0] +sub v14.4s, v0.4s, v15.4s +add v0.4s, v0.4s, v15.4s +sqrdmulh v15.4S, v17.4S, v25.s[3] +mul v17.4S, v17.4S,v26.s[3] +mla v19.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v12.4s +add v16.4s, v16.4s, v12.4s +sqrdmulh v12.4S, v21.4S, v23.s[0] +mul v21.4S, v21.4S,v24.s[0] +mla v17.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +sqrdmulh v19.4S, v3.4S, v23.s[1] +mul v3.4S, v3.4S,v24.s[1] +mla v21.4S, v12.4S, v31.s[0] +sub v12.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v10.4S, v23.s[2] +mul v10.4S, v10.4S,v24.s[2] +mla v3.4S, v19.4S, v31.s[0] +sub v19.4s, v22.4s, v21.4s +add v22.4s, v22.4s, v21.4s +sqrdmulh v21.4S, v18.4S, v23.s[3] +mul v18.4S, v18.4S,v24.s[3] +mla v10.4S, v17.4S, v31.s[0] +sub v17.4s, v9.4s, v3.4s +add v9.4s, v9.4s, v3.4s +mla v18.4S, v21.4S, v31.s[0] +sub v21.4s, v2.4s, v10.4s +add v2.4s, v2.4s, v10.4s +sub v10.4s, v1.4s, v18.4s +add v1.4s, v1.4s, v18.4s +str q0, [x0, #0] +str q14, [x0, #64] +str q16, [x0, #128] +str q13, [x0, #192] +str q20, [x0, #256] +str q15, [x0, #320] +str q11, [x0, #384] +str q12, [x0, #448] +str q22, [x0, #512] +str q19, [x0, #576] +str q9, [x0, #640] +str q17, [x0, #704] +str q2, [x0, #768] +str q21, [x0, #832] +str q1, [x0, #896] +str q10, [x0, #960] +ldr q10, [x0, #784] +ldr q1, [x0, #848] +ldr q21, [x0, #912] +ldr q2, [x0, #976] +ldr q17, [x0, #272] +ldr q9, [x0, #336] +ldr q19, [x0, #400] +ldr q22, [x0, #464] +ldr q12, [x0, #528] +ldr q11, [x0, #592] +ldr q15, [x0, #656] +ldr q20, [x0, #720] +ldr q13, [x0, #16] +ldr q16, [x0, #80] +ldr q14, [x0, #144] +ldr q0, [x0, #208] +sqrdmulh v18.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +sqrdmulh v3.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +mla v10.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +mla v1.4S, v3.4S, v31.s[0] +sub v3.4s, v17.4s, v10.4s +add v17.4s, v17.4s, v10.4s +sqrdmulh v10.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +mla v21.4S, 
v18.4S, v31.s[0] +sub v18.4s, v9.4s, v1.4s +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +mla v2.4S, v10.4S, v31.s[0] +sub v10.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +sqrdmulh v21.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +mla v12.4S, v1.4S, v31.s[0] +sub v1.4s, v22.4s, v2.4s +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +mla v11.4S, v21.4S, v31.s[0] +sub v21.4s, v13.4s, v12.4s +add v13.4s, v13.4s, v12.4s +sqrdmulh v12.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v16.4s, v11.4s +add v16.4s, v16.4s, v11.4s +sqrdmulh v11.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +mla v20.4S, v12.4S, v31.s[0] +sub v12.4s, v14.4s, v15.4s +add v14.4s, v14.4s, v15.4s +sqrdmulh v15.4S, v22.4S, v29.s[1] +mul v22.4S, v22.4S,v30.s[1] +mla v19.4S, v11.4S, v31.s[0] +sub v11.4s, v0.4s, v20.4s +add v0.4s, v0.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +mla v22.4S, v15.4S, v31.s[0] +sub v15.4s, v14.4s, v19.4s +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v9.4S, v29.s[1] +mul v9.4S, v9.4S,v30.s[1] +mla v17.4S, v20.4S, v31.s[0] +sub v20.4s, v0.4s, v22.4s +add v0.4s, v0.4s, v22.4s +sqrdmulh v22.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +mla v9.4S, v19.4S, v31.s[0] +sub v19.4s, v13.4s, v17.4s +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v16.4s, v9.4s +add v16.4s, v16.4s, v9.4s +sqrdmulh v9.4S, v3.4S, v29.s[2] +mul v3.4S, v3.4S,v30.s[2] +mla v1.4S, v17.4S, v31.s[0] +sub v17.4s, v12.4s, v10.4s +add v12.4s, v12.4s, v10.4s +sqrdmulh v10.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +mla v3.4S, v9.4S, v31.s[0] +sub v9.4s, v11.4s, v1.4s +add v11.4s, v11.4s, v1.4s +sqrdmulh v1.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +mla v18.4S, v10.4S, v31.s[0] +sub v10.4s, v21.4s, v3.4s +add v21.4s, v21.4s, v3.4s +sqrdmulh v3.4S, 
v0.4S, v27.s[0] +mul v0.4S, v0.4S,v28.s[0] +mla v14.4S, v1.4S, v31.s[0] +sub v1.4s, v2.4s, v18.4s +add v2.4s, v2.4s, v18.4s +sqrdmulh v18.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +mla v0.4S, v3.4S, v31.s[0] +sub v3.4s, v13.4s, v14.4s +add v13.4s, v13.4s, v14.4s +sqrdmulh v14.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +mla v15.4S, v18.4S, v31.s[0] +sub v18.4s, v16.4s, v0.4s +add v16.4s, v16.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +mla v20.4S, v14.4S, v31.s[0] +sub v14.4s, v19.4s, v15.4s +add v19.4s, v19.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v20.4s +add v22.4s, v22.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +mla v11.4S, v15.4S, v31.s[0] +sub v15.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v9.4S, v27.s[3] +mul v9.4S, v9.4S,v28.s[3] +mla v17.4S, v20.4S, v31.s[0] +sub v20.4s, v2.4s, v11.4s +add v2.4s, v2.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v25.s[0] +mul v16.4S, v16.4S,v26.s[0] +mla v9.4S, v12.4S, v31.s[0] +sub v12.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v18.4S, v25.s[1] +mul v18.4S, v18.4S,v26.s[1] +mla v16.4S, v11.4S, v31.s[0] +sub v11.4s, v1.4s, v9.4s +add v1.4s, v1.4s, v9.4s +sqrdmulh v9.4S, v22.4S, v25.s[2] +mul v22.4S, v22.4S,v26.s[2] +mla v18.4S, v17.4S, v31.s[0] +sub v17.4s, v13.4s, v16.4s +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v0.4S, v25.s[3] +mul v0.4S, v0.4S,v26.s[3] +mla v22.4S, v9.4S, v31.s[0] +sub v9.4s, v3.4s, v18.4s +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v23.s[0] +mul v2.4S, v2.4S,v24.s[0] +mla v0.4S, v16.4S, v31.s[0] +sub v16.4s, v19.4s, v22.4s +add v19.4s, v19.4s, v22.4s +sqrdmulh v22.4S, v20.4S, v23.s[1] +mul v20.4S, v20.4S,v24.s[1] +mla v2.4S, v18.4S, v31.s[0] +sub v18.4s, v14.4s, v0.4s +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v23.s[2] +mul v1.4S, v1.4S,v24.s[2] +mla v20.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, 
v2.4s +add v21.4s, v21.4s, v2.4s +sqrdmulh v2.4S, v11.4S, v23.s[3] +mul v11.4S, v11.4S,v24.s[3] +mla v1.4S, v0.4S, v31.s[0] +sub v0.4s, v15.4s, v20.4s +add v15.4s, v15.4s, v20.4s +mla v11.4S, v2.4S, v31.s[0] +sub v2.4s, v10.4s, v1.4s +add v10.4s, v10.4s, v1.4s +sub v1.4s, v12.4s, v11.4s +add v12.4s, v12.4s, v11.4s +str q13, [x0, #16] +str q17, [x0, #80] +str q3, [x0, #144] +str q9, [x0, #208] +str q19, [x0, #272] +str q16, [x0, #336] +str q14, [x0, #400] +str q18, [x0, #464] +str q21, [x0, #528] +str q22, [x0, #592] +str q15, [x0, #656] +str q0, [x0, #720] +str q10, [x0, #784] +str q2, [x0, #848] +str q12, [x0, #912] +str q1, [x0, #976] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q6, [x17, #+160] +ldr q7, [x17, #+176] +ldr q8, [x17, #+192] +ldr q20, [x17, #+208] +ldr q11, [x17, #+224] +ldr q13, [x17, #+240] +ldr q17, [x0, #32] +ldr q3, [x0, #48] +ldr q9, [x0, #0] +ldr q19, [x0, #16] +sqrdmulh v16.4S, v17.4S, v5.s[0] +mul v17.4S, v17.4S,v4.s[0] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v9.4s, v17.4s +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v5.s[0] +mul v3.4S, v3.4S,v4.s[0] +mla v3.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sqrdmulh v3.4S, v19.4S, v5.s[1] +mul v19.4S, v19.4S,v4.s[1] +mla v19.4S, v3.4S, v31.s[0] +sub v3.4s, v9.4s, v19.4s +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v17.4S, v5.s[2] +mul v17.4S, v17.4S,v4.s[2] +mla v17.4S, v19.4S, v31.s[0] +sub v19.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +trn1 v17.4S, v9.4S, v3.4S +trn2 v14.4S, v9.4S, v3.4S +trn1 v18.4S, v16.4S, v19.4S +trn2 v21.4S, v16.4S, v19.4S +trn2 v16.2D, v17.2D, v18.2D +trn2 v19.2D, v14.2D, v21.2D +trn1 v9.2D, v17.2D, v18.2D +trn1 v3.2D, v14.2D, v21.2D +sqrdmulh v21.4S, v16.4S, v7.4S +mul v16.4S, v16.4S,v6.4S +mla v16.4S, v21.4S, v31.s[0] +sub v21.4s, v9.4s, v16.4s +add v9.4s, v9.4s, v16.4s +sqrdmulh v16.4S, v19.4S, v7.4S +mul v19.4S, v19.4S,v6.4S +mla v19.4S, v16.4S, v31.s[0] +sub v16.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh 
v19.4S, v3.4S, v20.4S +mul v3.4S, v3.4S,v8.4S +mla v3.4S, v19.4S, v31.s[0] +sub v19.4s, v9.4s, v3.4s +add v9.4s, v9.4s, v3.4s +sqrdmulh v3.4S, v16.4S, v13.4S +mul v16.4S, v16.4S,v11.4S +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v21.4s, v16.4s +add v21.4s, v21.4s, v16.4s +str q9, [x0, #0] +str q19, [x0, #16] +str q21, [x0, #32] +str q3, [x0, #48] +ldr q3, [x17, #+256] +ldr q21, [x17, #+272] +ldr q19, [x17, #+288] +ldr q9, [x17, #+304] +ldr q16, [x17, #+320] +ldr q14, [x17, #+336] +ldr q18, [x17, #+352] +ldr q17, [x17, #+368] +ldr q13, [x0, #96] +ldr q11, [x0, #112] +ldr q20, [x0, #64] +ldr q8, [x0, #80] +sqrdmulh v7.4S, v13.4S, v21.s[0] +mul v13.4S, v13.4S,v3.s[0] +mla v13.4S, v7.4S, v31.s[0] +sub v7.4s, v20.4s, v13.4s +add v20.4s, v20.4s, v13.4s +sqrdmulh v13.4S, v11.4S, v21.s[0] +mul v11.4S, v11.4S,v3.s[0] +mla v11.4S, v13.4S, v31.s[0] +sub v13.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +sqrdmulh v11.4S, v8.4S, v21.s[1] +mul v8.4S, v8.4S,v3.s[1] +mla v8.4S, v11.4S, v31.s[0] +sub v11.4s, v20.4s, v8.4s +add v20.4s, v20.4s, v8.4s +sqrdmulh v8.4S, v13.4S, v21.s[2] +mul v13.4S, v13.4S,v3.s[2] +mla v13.4S, v8.4S, v31.s[0] +sub v8.4s, v7.4s, v13.4s +add v7.4s, v7.4s, v13.4s +trn1 v13.4S, v20.4S, v11.4S +trn2 v6.4S, v20.4S, v11.4S +trn1 v5.4S, v7.4S, v8.4S +trn2 v4.4S, v7.4S, v8.4S +trn2 v7.2D, v13.2D, v5.2D +trn2 v8.2D, v6.2D, v4.2D +trn1 v20.2D, v13.2D, v5.2D +trn1 v11.2D, v6.2D, v4.2D +sqrdmulh v4.4S, v7.4S, v9.4S +mul v7.4S, v7.4S,v19.4S +mla v7.4S, v4.4S, v31.s[0] +sub v4.4s, v20.4s, v7.4s +add v20.4s, v20.4s, v7.4s +sqrdmulh v7.4S, v8.4S, v9.4S +mul v8.4S, v8.4S,v19.4S +mla v8.4S, v7.4S, v31.s[0] +sub v7.4s, v11.4s, v8.4s +add v11.4s, v11.4s, v8.4s +sqrdmulh v8.4S, v11.4S, v14.4S +mul v11.4S, v11.4S,v16.4S +mla v11.4S, v8.4S, v31.s[0] +sub v8.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +sqrdmulh v11.4S, v7.4S, v17.4S +mul v7.4S, v7.4S,v18.4S +mla v7.4S, v11.4S, v31.s[0] +sub v11.4s, v4.4s, v7.4s +add v4.4s, v4.4s, v7.4s +str q20, [x0, #64] +str q8, [x0, #80] +str 
q4, [x0, #96] +str q11, [x0, #112] +ldr q11, [x17, #+384] +ldr q4, [x17, #+400] +ldr q8, [x17, #+416] +ldr q20, [x17, #+432] +ldr q7, [x17, #+448] +ldr q6, [x17, #+464] +ldr q5, [x17, #+480] +ldr q13, [x17, #+496] +ldr q17, [x0, #160] +ldr q18, [x0, #176] +ldr q14, [x0, #128] +ldr q16, [x0, #144] +sqrdmulh v9.4S, v17.4S, v4.s[0] +mul v17.4S, v17.4S,v11.s[0] +mla v17.4S, v9.4S, v31.s[0] +sub v9.4s, v14.4s, v17.4s +add v14.4s, v14.4s, v17.4s +sqrdmulh v17.4S, v18.4S, v4.s[0] +mul v18.4S, v18.4S,v11.s[0] +mla v18.4S, v17.4S, v31.s[0] +sub v17.4s, v16.4s, v18.4s +add v16.4s, v16.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v4.s[1] +mul v16.4S, v16.4S,v11.s[1] +mla v16.4S, v18.4S, v31.s[0] +sub v18.4s, v14.4s, v16.4s +add v14.4s, v14.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v4.s[2] +mul v17.4S, v17.4S,v11.s[2] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v9.4s, v17.4s +add v9.4s, v9.4s, v17.4s +trn1 v17.4S, v14.4S, v18.4S +trn2 v19.4S, v14.4S, v18.4S +trn1 v21.4S, v9.4S, v16.4S +trn2 v3.4S, v9.4S, v16.4S +trn2 v9.2D, v17.2D, v21.2D +trn2 v16.2D, v19.2D, v3.2D +trn1 v14.2D, v17.2D, v21.2D +trn1 v18.2D, v19.2D, v3.2D +sqrdmulh v3.4S, v9.4S, v20.4S +mul v9.4S, v9.4S,v8.4S +mla v9.4S, v3.4S, v31.s[0] +sub v3.4s, v14.4s, v9.4s +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v16.4S, v20.4S +mul v16.4S, v16.4S,v8.4S +mla v16.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v16.4s +add v18.4s, v18.4s, v16.4s +sqrdmulh v16.4S, v18.4S, v6.4S +mul v18.4S, v18.4S,v7.4S +mla v18.4S, v16.4S, v31.s[0] +sub v16.4s, v14.4s, v18.4s +add v14.4s, v14.4s, v18.4s +sqrdmulh v18.4S, v9.4S, v13.4S +mul v9.4S, v9.4S,v5.4S +mla v9.4S, v18.4S, v31.s[0] +sub v18.4s, v3.4s, v9.4s +add v3.4s, v3.4s, v9.4s +str q14, [x0, #128] +str q16, [x0, #144] +str q3, [x0, #160] +str q18, [x0, #176] +ldr q18, [x17, #+512] +ldr q3, [x17, #+528] +ldr q16, [x17, #+544] +ldr q14, [x17, #+560] +ldr q9, [x17, #+576] +ldr q19, [x17, #+592] +ldr q21, [x17, #+608] +ldr q17, [x17, #+624] +ldr q13, [x0, #224] +ldr q5, [x0, #240] +ldr q6, [x0, #192] 
+ldr q7, [x0, #208] +sqrdmulh v20.4S, v13.4S, v3.s[0] +mul v13.4S, v13.4S,v18.s[0] +mla v13.4S, v20.4S, v31.s[0] +sub v20.4s, v6.4s, v13.4s +add v6.4s, v6.4s, v13.4s +sqrdmulh v13.4S, v5.4S, v3.s[0] +mul v5.4S, v5.4S,v18.s[0] +mla v5.4S, v13.4S, v31.s[0] +sub v13.4s, v7.4s, v5.4s +add v7.4s, v7.4s, v5.4s +sqrdmulh v5.4S, v7.4S, v3.s[1] +mul v7.4S, v7.4S,v18.s[1] +mla v7.4S, v5.4S, v31.s[0] +sub v5.4s, v6.4s, v7.4s +add v6.4s, v6.4s, v7.4s +sqrdmulh v7.4S, v13.4S, v3.s[2] +mul v13.4S, v13.4S,v18.s[2] +mla v13.4S, v7.4S, v31.s[0] +sub v7.4s, v20.4s, v13.4s +add v20.4s, v20.4s, v13.4s +trn1 v13.4S, v6.4S, v5.4S +trn2 v8.4S, v6.4S, v5.4S +trn1 v4.4S, v20.4S, v7.4S +trn2 v11.4S, v20.4S, v7.4S +trn2 v20.2D, v13.2D, v4.2D +trn2 v7.2D, v8.2D, v11.2D +trn1 v6.2D, v13.2D, v4.2D +trn1 v5.2D, v8.2D, v11.2D +sqrdmulh v11.4S, v20.4S, v14.4S +mul v20.4S, v20.4S,v16.4S +mla v20.4S, v11.4S, v31.s[0] +sub v11.4s, v6.4s, v20.4s +add v6.4s, v6.4s, v20.4s +sqrdmulh v20.4S, v7.4S, v14.4S +mul v7.4S, v7.4S,v16.4S +mla v7.4S, v20.4S, v31.s[0] +sub v20.4s, v5.4s, v7.4s +add v5.4s, v5.4s, v7.4s +sqrdmulh v7.4S, v5.4S, v19.4S +mul v5.4S, v5.4S,v9.4S +mla v5.4S, v7.4S, v31.s[0] +sub v7.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v20.4S, v17.4S +mul v20.4S, v20.4S,v21.4S +mla v20.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v20.4s +add v11.4s, v11.4s, v20.4s +str q6, [x0, #192] +str q7, [x0, #208] +str q11, [x0, #224] +str q5, [x0, #240] +ldr q5, [x17, #+640] +ldr q11, [x17, #+656] +ldr q7, [x17, #+672] +ldr q6, [x17, #+688] +ldr q20, [x17, #+704] +ldr q8, [x17, #+720] +ldr q4, [x17, #+736] +ldr q13, [x17, #+752] +ldr q17, [x0, #288] +ldr q21, [x0, #304] +ldr q19, [x0, #256] +ldr q9, [x0, #272] +sqrdmulh v14.4S, v17.4S, v11.s[0] +mul v17.4S, v17.4S,v5.s[0] +mla v17.4S, v14.4S, v31.s[0] +sub v14.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v11.s[0] +mul v21.4S, v21.4S,v5.s[0] +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v9.4s, v21.4s +add v9.4s, v9.4s, v21.4s 
+sqrdmulh v21.4S, v9.4S, v11.s[1] +mul v9.4S, v9.4S,v5.s[1] +mla v9.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v9.4s +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v17.4S, v11.s[2] +mul v17.4S, v17.4S,v5.s[2] +mla v17.4S, v9.4S, v31.s[0] +sub v9.4s, v14.4s, v17.4s +add v14.4s, v14.4s, v17.4s +trn1 v17.4S, v19.4S, v21.4S +trn2 v16.4S, v19.4S, v21.4S +trn1 v3.4S, v14.4S, v9.4S +trn2 v18.4S, v14.4S, v9.4S +trn2 v14.2D, v17.2D, v3.2D +trn2 v9.2D, v16.2D, v18.2D +trn1 v19.2D, v17.2D, v3.2D +trn1 v21.2D, v16.2D, v18.2D +sqrdmulh v18.4S, v14.4S, v6.4S +mul v14.4S, v14.4S,v7.4S +mla v14.4S, v18.4S, v31.s[0] +sub v18.4s, v19.4s, v14.4s +add v19.4s, v19.4s, v14.4s +sqrdmulh v14.4S, v9.4S, v6.4S +mul v9.4S, v9.4S,v7.4S +mla v9.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v9.4s +add v21.4s, v21.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v8.4S +mul v21.4S, v21.4S,v20.4S +mla v21.4S, v9.4S, v31.s[0] +sub v9.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v13.4S +mul v14.4S, v14.4S,v4.4S +mla v14.4S, v21.4S, v31.s[0] +sub v21.4s, v18.4s, v14.4s +add v18.4s, v18.4s, v14.4s +str q19, [x0, #256] +str q9, [x0, #272] +str q18, [x0, #288] +str q21, [x0, #304] +ldr q21, [x17, #+768] +ldr q18, [x17, #+784] +ldr q9, [x17, #+800] +ldr q19, [x17, #+816] +ldr q14, [x17, #+832] +ldr q16, [x17, #+848] +ldr q3, [x17, #+864] +ldr q17, [x17, #+880] +ldr q13, [x0, #352] +ldr q4, [x0, #368] +ldr q8, [x0, #320] +ldr q20, [x0, #336] +sqrdmulh v6.4S, v13.4S, v18.s[0] +mul v13.4S, v13.4S,v21.s[0] +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +sqrdmulh v13.4S, v4.4S, v18.s[0] +mul v4.4S, v4.4S,v21.s[0] +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v4.4s +add v20.4s, v20.4s, v4.4s +sqrdmulh v4.4S, v20.4S, v18.s[1] +mul v20.4S, v20.4S,v21.s[1] +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +sqrdmulh v20.4S, v13.4S, v18.s[2] +mul v13.4S, v13.4S,v21.s[2] +mla v13.4S, v20.4S, v31.s[0] +sub v20.4s, v6.4s, v13.4s +add v6.4s, 
v6.4s, v13.4s +trn1 v13.4S, v8.4S, v4.4S +trn2 v7.4S, v8.4S, v4.4S +trn1 v11.4S, v6.4S, v20.4S +trn2 v5.4S, v6.4S, v20.4S +trn2 v6.2D, v13.2D, v11.2D +trn2 v20.2D, v7.2D, v5.2D +trn1 v8.2D, v13.2D, v11.2D +trn1 v4.2D, v7.2D, v5.2D +sqrdmulh v5.4S, v6.4S, v19.4S +mul v6.4S, v6.4S,v9.4S +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v8.4s, v6.4s +add v8.4s, v8.4s, v6.4s +sqrdmulh v6.4S, v20.4S, v19.4S +mul v20.4S, v20.4S,v9.4S +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v4.4s, v20.4s +add v4.4s, v4.4s, v20.4s +sqrdmulh v20.4S, v4.4S, v16.4S +mul v4.4S, v4.4S,v14.4S +mla v4.4S, v20.4S, v31.s[0] +sub v20.4s, v8.4s, v4.4s +add v8.4s, v8.4s, v4.4s +sqrdmulh v4.4S, v6.4S, v17.4S +mul v6.4S, v6.4S,v3.4S +mla v6.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v6.4s +add v5.4s, v5.4s, v6.4s +str q8, [x0, #320] +str q20, [x0, #336] +str q5, [x0, #352] +str q4, [x0, #368] +ldr q4, [x17, #+896] +ldr q5, [x17, #+912] +ldr q20, [x17, #+928] +ldr q8, [x17, #+944] +ldr q6, [x17, #+960] +ldr q7, [x17, #+976] +ldr q11, [x17, #+992] +ldr q13, [x17, #+1008] +ldr q17, [x0, #416] +ldr q3, [x0, #432] +ldr q16, [x0, #384] +ldr q14, [x0, #400] +sqrdmulh v19.4S, v17.4S, v5.s[0] +mul v17.4S, v17.4S,v4.s[0] +mla v17.4S, v19.4S, v31.s[0] +sub v19.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v5.s[0] +mul v3.4S, v3.4S,v4.s[0] +mla v3.4S, v17.4S, v31.s[0] +sub v17.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v5.s[1] +mul v14.4S, v14.4S,v4.s[1] +mla v14.4S, v3.4S, v31.s[0] +sub v3.4s, v16.4s, v14.4s +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v17.4S, v5.s[2] +mul v17.4S, v17.4S,v4.s[2] +mla v17.4S, v14.4S, v31.s[0] +sub v14.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +trn1 v17.4S, v16.4S, v3.4S +trn2 v9.4S, v16.4S, v3.4S +trn1 v18.4S, v19.4S, v14.4S +trn2 v21.4S, v19.4S, v14.4S +trn2 v19.2D, v17.2D, v18.2D +trn2 v14.2D, v9.2D, v21.2D +trn1 v16.2D, v17.2D, v18.2D +trn1 v3.2D, v9.2D, v21.2D +sqrdmulh v21.4S, v19.4S, v8.4S +mul v19.4S, v19.4S,v20.4S +mla v19.4S, 
v21.4S, v31.s[0] +sub v21.4s, v16.4s, v19.4s +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v8.4S +mul v14.4S, v14.4S,v20.4S +mla v14.4S, v19.4S, v31.s[0] +sub v19.4s, v3.4s, v14.4s +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v7.4S +mul v3.4S, v3.4S,v6.4S +mla v3.4S, v14.4S, v31.s[0] +sub v14.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v19.4S, v13.4S +mul v19.4S, v19.4S,v11.4S +mla v19.4S, v3.4S, v31.s[0] +sub v3.4s, v21.4s, v19.4s +add v21.4s, v21.4s, v19.4s +str q16, [x0, #384] +str q14, [x0, #400] +str q21, [x0, #416] +str q3, [x0, #432] +ldr q3, [x17, #+1024] +ldr q21, [x17, #+1040] +ldr q14, [x17, #+1056] +ldr q16, [x17, #+1072] +ldr q19, [x17, #+1088] +ldr q9, [x17, #+1104] +ldr q18, [x17, #+1120] +ldr q17, [x17, #+1136] +ldr q13, [x0, #480] +ldr q11, [x0, #496] +ldr q7, [x0, #448] +ldr q6, [x0, #464] +sqrdmulh v8.4S, v13.4S, v21.s[0] +mul v13.4S, v13.4S,v3.s[0] +mla v13.4S, v8.4S, v31.s[0] +sub v8.4s, v7.4s, v13.4s +add v7.4s, v7.4s, v13.4s +sqrdmulh v13.4S, v11.4S, v21.s[0] +mul v11.4S, v11.4S,v3.s[0] +mla v11.4S, v13.4S, v31.s[0] +sub v13.4s, v6.4s, v11.4s +add v6.4s, v6.4s, v11.4s +sqrdmulh v11.4S, v6.4S, v21.s[1] +mul v6.4S, v6.4S,v3.s[1] +mla v6.4S, v11.4S, v31.s[0] +sub v11.4s, v7.4s, v6.4s +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v13.4S, v21.s[2] +mul v13.4S, v13.4S,v3.s[2] +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +trn1 v13.4S, v7.4S, v11.4S +trn2 v20.4S, v7.4S, v11.4S +trn1 v5.4S, v8.4S, v6.4S +trn2 v4.4S, v8.4S, v6.4S +trn2 v8.2D, v13.2D, v5.2D +trn2 v6.2D, v20.2D, v4.2D +trn1 v7.2D, v13.2D, v5.2D +trn1 v11.2D, v20.2D, v4.2D +sqrdmulh v4.4S, v8.4S, v16.4S +mul v8.4S, v8.4S,v14.4S +mla v8.4S, v4.4S, v31.s[0] +sub v4.4s, v7.4s, v8.4s +add v7.4s, v7.4s, v8.4s +sqrdmulh v8.4S, v6.4S, v16.4S +mul v6.4S, v6.4S,v14.4S +mla v6.4S, v8.4S, v31.s[0] +sub v8.4s, v11.4s, v6.4s +add v11.4s, v11.4s, v6.4s +sqrdmulh v6.4S, v11.4S, v9.4S +mul v11.4S, v11.4S,v19.4S +mla v11.4S, v6.4S, v31.s[0] 
+sub v6.4s, v7.4s, v11.4s +add v7.4s, v7.4s, v11.4s +sqrdmulh v11.4S, v8.4S, v17.4S +mul v8.4S, v8.4S,v18.4S +mla v8.4S, v11.4S, v31.s[0] +sub v11.4s, v4.4s, v8.4s +add v4.4s, v4.4s, v8.4s +str q7, [x0, #448] +str q6, [x0, #464] +str q4, [x0, #480] +str q11, [x0, #496] +ldr q11, [x17, #+1152] +ldr q4, [x17, #+1168] +ldr q6, [x17, #+1184] +ldr q7, [x17, #+1200] +ldr q8, [x17, #+1216] +ldr q20, [x17, #+1232] +ldr q5, [x17, #+1248] +ldr q13, [x17, #+1264] +ldr q17, [x0, #544] +ldr q18, [x0, #560] +ldr q9, [x0, #512] +ldr q19, [x0, #528] +sqrdmulh v16.4S, v17.4S, v4.s[0] +mul v17.4S, v17.4S,v11.s[0] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v9.4s, v17.4s +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v18.4S, v4.s[0] +mul v18.4S, v18.4S,v11.s[0] +mla v18.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v18.4s +add v19.4s, v19.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v4.s[1] +mul v19.4S, v19.4S,v11.s[1] +mla v19.4S, v18.4S, v31.s[0] +sub v18.4s, v9.4s, v19.4s +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v17.4S, v4.s[2] +mul v17.4S, v17.4S,v11.s[2] +mla v17.4S, v19.4S, v31.s[0] +sub v19.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +trn1 v17.4S, v9.4S, v18.4S +trn2 v14.4S, v9.4S, v18.4S +trn1 v21.4S, v16.4S, v19.4S +trn2 v3.4S, v16.4S, v19.4S +trn2 v16.2D, v17.2D, v21.2D +trn2 v19.2D, v14.2D, v3.2D +trn1 v9.2D, v17.2D, v21.2D +trn1 v18.2D, v14.2D, v3.2D +sqrdmulh v3.4S, v16.4S, v7.4S +mul v16.4S, v16.4S,v6.4S +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v9.4s, v16.4s +add v9.4s, v9.4s, v16.4s +sqrdmulh v16.4S, v19.4S, v7.4S +mul v19.4S, v19.4S,v6.4S +mla v19.4S, v16.4S, v31.s[0] +sub v16.4s, v18.4s, v19.4s +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v20.4S +mul v18.4S, v18.4S,v8.4S +mla v18.4S, v19.4S, v31.s[0] +sub v19.4s, v9.4s, v18.4s +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v13.4S +mul v16.4S, v16.4S,v5.4S +mla v16.4S, v18.4S, v31.s[0] +sub v18.4s, v3.4s, v16.4s +add v3.4s, v3.4s, v16.4s +str q9, [x0, #512] +str q19, [x0, #528] +str q3, [x0, #544] +str q18, [x0, 
#560] +ldr q18, [x17, #+1280] +ldr q3, [x17, #+1296] +ldr q19, [x17, #+1312] +ldr q9, [x17, #+1328] +ldr q16, [x17, #+1344] +ldr q14, [x17, #+1360] +ldr q21, [x17, #+1376] +ldr q17, [x17, #+1392] +ldr q13, [x0, #608] +ldr q5, [x0, #624] +ldr q20, [x0, #576] +ldr q8, [x0, #592] +sqrdmulh v7.4S, v13.4S, v3.s[0] +mul v13.4S, v13.4S,v18.s[0] +mla v13.4S, v7.4S, v31.s[0] +sub v7.4s, v20.4s, v13.4s +add v20.4s, v20.4s, v13.4s +sqrdmulh v13.4S, v5.4S, v3.s[0] +mul v5.4S, v5.4S,v18.s[0] +mla v5.4S, v13.4S, v31.s[0] +sub v13.4s, v8.4s, v5.4s +add v8.4s, v8.4s, v5.4s +sqrdmulh v5.4S, v8.4S, v3.s[1] +mul v8.4S, v8.4S,v18.s[1] +mla v8.4S, v5.4S, v31.s[0] +sub v5.4s, v20.4s, v8.4s +add v20.4s, v20.4s, v8.4s +sqrdmulh v8.4S, v13.4S, v3.s[2] +mul v13.4S, v13.4S,v18.s[2] +mla v13.4S, v8.4S, v31.s[0] +sub v8.4s, v7.4s, v13.4s +add v7.4s, v7.4s, v13.4s +trn1 v13.4S, v20.4S, v5.4S +trn2 v6.4S, v20.4S, v5.4S +trn1 v4.4S, v7.4S, v8.4S +trn2 v11.4S, v7.4S, v8.4S +trn2 v7.2D, v13.2D, v4.2D +trn2 v8.2D, v6.2D, v11.2D +trn1 v20.2D, v13.2D, v4.2D +trn1 v5.2D, v6.2D, v11.2D +sqrdmulh v11.4S, v7.4S, v9.4S +mul v7.4S, v7.4S,v19.4S +mla v7.4S, v11.4S, v31.s[0] +sub v11.4s, v20.4s, v7.4s +add v20.4s, v20.4s, v7.4s +sqrdmulh v7.4S, v8.4S, v9.4S +mul v8.4S, v8.4S,v19.4S +mla v8.4S, v7.4S, v31.s[0] +sub v7.4s, v5.4s, v8.4s +add v5.4s, v5.4s, v8.4s +sqrdmulh v8.4S, v5.4S, v14.4S +mul v5.4S, v5.4S,v16.4S +mla v5.4S, v8.4S, v31.s[0] +sub v8.4s, v20.4s, v5.4s +add v20.4s, v20.4s, v5.4s +sqrdmulh v5.4S, v7.4S, v17.4S +mul v7.4S, v7.4S,v21.4S +mla v7.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v7.4s +add v11.4s, v11.4s, v7.4s +str q20, [x0, #576] +str q8, [x0, #592] +str q11, [x0, #608] +str q5, [x0, #624] +ldr q5, [x17, #+1408] +ldr q11, [x17, #+1424] +ldr q8, [x17, #+1440] +ldr q20, [x17, #+1456] +ldr q7, [x17, #+1472] +ldr q6, [x17, #+1488] +ldr q4, [x17, #+1504] +ldr q13, [x17, #+1520] +ldr q17, [x0, #672] +ldr q21, [x0, #688] +ldr q14, [x0, #640] +ldr q16, [x0, #656] +sqrdmulh v9.4S, v17.4S, v11.s[0] 
+mul v17.4S, v17.4S,v5.s[0] +mla v17.4S, v9.4S, v31.s[0] +sub v9.4s, v14.4s, v17.4s +add v14.4s, v14.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v11.s[0] +mul v21.4S, v21.4S,v5.s[0] +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v16.4s, v21.4s +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v16.4S, v11.s[1] +mul v16.4S, v16.4S,v5.s[1] +mla v16.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v16.4s +add v14.4s, v14.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v11.s[2] +mul v17.4S, v17.4S,v5.s[2] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v9.4s, v17.4s +add v9.4s, v9.4s, v17.4s +trn1 v17.4S, v14.4S, v21.4S +trn2 v19.4S, v14.4S, v21.4S +trn1 v3.4S, v9.4S, v16.4S +trn2 v18.4S, v9.4S, v16.4S +trn2 v9.2D, v17.2D, v3.2D +trn2 v16.2D, v19.2D, v18.2D +trn1 v14.2D, v17.2D, v3.2D +trn1 v21.2D, v19.2D, v18.2D +sqrdmulh v18.4S, v9.4S, v20.4S +mul v9.4S, v9.4S,v8.4S +mla v9.4S, v18.4S, v31.s[0] +sub v18.4s, v14.4s, v9.4s +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v16.4S, v20.4S +mul v16.4S, v16.4S,v8.4S +mla v16.4S, v9.4S, v31.s[0] +sub v9.4s, v21.4s, v16.4s +add v21.4s, v21.4s, v16.4s +sqrdmulh v16.4S, v21.4S, v6.4S +mul v21.4S, v21.4S,v7.4S +mla v21.4S, v16.4S, v31.s[0] +sub v16.4s, v14.4s, v21.4s +add v14.4s, v14.4s, v21.4s +sqrdmulh v21.4S, v9.4S, v13.4S +mul v9.4S, v9.4S,v4.4S +mla v9.4S, v21.4S, v31.s[0] +sub v21.4s, v18.4s, v9.4s +add v18.4s, v18.4s, v9.4s +str q14, [x0, #640] +str q16, [x0, #656] +str q18, [x0, #672] +str q21, [x0, #688] +ldr q21, [x17, #+1536] +ldr q18, [x17, #+1552] +ldr q16, [x17, #+1568] +ldr q14, [x17, #+1584] +ldr q9, [x17, #+1600] +ldr q19, [x17, #+1616] +ldr q3, [x17, #+1632] +ldr q17, [x17, #+1648] +ldr q13, [x0, #736] +ldr q4, [x0, #752] +ldr q6, [x0, #704] +ldr q7, [x0, #720] +sqrdmulh v20.4S, v13.4S, v18.s[0] +mul v13.4S, v13.4S,v21.s[0] +mla v13.4S, v20.4S, v31.s[0] +sub v20.4s, v6.4s, v13.4s +add v6.4s, v6.4s, v13.4s +sqrdmulh v13.4S, v4.4S, v18.s[0] +mul v4.4S, v4.4S,v21.s[0] +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v7.4s, v4.4s +add v7.4s, v7.4s, v4.4s +sqrdmulh 
v4.4S, v7.4S, v18.s[1] +mul v7.4S, v7.4S,v21.s[1] +mla v7.4S, v4.4S, v31.s[0] +sub v4.4s, v6.4s, v7.4s +add v6.4s, v6.4s, v7.4s +sqrdmulh v7.4S, v13.4S, v18.s[2] +mul v13.4S, v13.4S,v21.s[2] +mla v13.4S, v7.4S, v31.s[0] +sub v7.4s, v20.4s, v13.4s +add v20.4s, v20.4s, v13.4s +trn1 v13.4S, v6.4S, v4.4S +trn2 v8.4S, v6.4S, v4.4S +trn1 v11.4S, v20.4S, v7.4S +trn2 v5.4S, v20.4S, v7.4S +trn2 v20.2D, v13.2D, v11.2D +trn2 v7.2D, v8.2D, v5.2D +trn1 v6.2D, v13.2D, v11.2D +trn1 v4.2D, v8.2D, v5.2D +sqrdmulh v5.4S, v20.4S, v14.4S +mul v20.4S, v20.4S,v16.4S +mla v20.4S, v5.4S, v31.s[0] +sub v5.4s, v6.4s, v20.4s +add v6.4s, v6.4s, v20.4s +sqrdmulh v20.4S, v7.4S, v14.4S +mul v7.4S, v7.4S,v16.4S +mla v7.4S, v20.4S, v31.s[0] +sub v20.4s, v4.4s, v7.4s +add v4.4s, v4.4s, v7.4s +sqrdmulh v7.4S, v4.4S, v19.4S +mul v4.4S, v4.4S,v9.4S +mla v4.4S, v7.4S, v31.s[0] +sub v7.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +sqrdmulh v4.4S, v20.4S, v17.4S +mul v20.4S, v20.4S,v3.4S +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v20.4s +add v5.4s, v5.4s, v20.4s +str q6, [x0, #704] +str q7, [x0, #720] +str q5, [x0, #736] +str q4, [x0, #752] +ldr q4, [x17, #+1664] +ldr q5, [x17, #+1680] +ldr q7, [x17, #+1696] +ldr q6, [x17, #+1712] +ldr q20, [x17, #+1728] +ldr q8, [x17, #+1744] +ldr q11, [x17, #+1760] +ldr q13, [x17, #+1776] +ldr q17, [x0, #800] +ldr q3, [x0, #816] +ldr q19, [x0, #768] +ldr q9, [x0, #784] +sqrdmulh v14.4S, v17.4S, v5.s[0] +mul v17.4S, v17.4S,v4.s[0] +mla v17.4S, v14.4S, v31.s[0] +sub v14.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v5.s[0] +mul v3.4S, v3.4S,v4.s[0] +mla v3.4S, v17.4S, v31.s[0] +sub v17.4s, v9.4s, v3.4s +add v9.4s, v9.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v5.s[1] +mul v9.4S, v9.4S,v4.s[1] +mla v9.4S, v3.4S, v31.s[0] +sub v3.4s, v19.4s, v9.4s +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v17.4S, v5.s[2] +mul v17.4S, v17.4S,v4.s[2] +mla v17.4S, v9.4S, v31.s[0] +sub v9.4s, v14.4s, v17.4s +add v14.4s, v14.4s, v17.4s +trn1 v17.4S, v19.4S, v3.4S +trn2 
v16.4S, v19.4S, v3.4S +trn1 v18.4S, v14.4S, v9.4S +trn2 v21.4S, v14.4S, v9.4S +trn2 v14.2D, v17.2D, v18.2D +trn2 v9.2D, v16.2D, v21.2D +trn1 v19.2D, v17.2D, v18.2D +trn1 v3.2D, v16.2D, v21.2D +sqrdmulh v21.4S, v14.4S, v6.4S +mul v14.4S, v14.4S,v7.4S +mla v14.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v14.4s +add v19.4s, v19.4s, v14.4s +sqrdmulh v14.4S, v9.4S, v6.4S +mul v9.4S, v9.4S,v7.4S +mla v9.4S, v14.4S, v31.s[0] +sub v14.4s, v3.4s, v9.4s +add v3.4s, v3.4s, v9.4s +sqrdmulh v9.4S, v3.4S, v8.4S +mul v3.4S, v3.4S,v20.4S +mla v3.4S, v9.4S, v31.s[0] +sub v9.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v13.4S +mul v14.4S, v14.4S,v11.4S +mla v14.4S, v3.4S, v31.s[0] +sub v3.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +str q19, [x0, #768] +str q9, [x0, #784] +str q21, [x0, #800] +str q3, [x0, #816] +ldr q3, [x17, #+1792] +ldr q21, [x17, #+1808] +ldr q9, [x17, #+1824] +ldr q19, [x17, #+1840] +ldr q14, [x17, #+1856] +ldr q16, [x17, #+1872] +ldr q18, [x17, #+1888] +ldr q17, [x17, #+1904] +ldr q13, [x0, #864] +ldr q11, [x0, #880] +ldr q8, [x0, #832] +ldr q20, [x0, #848] +sqrdmulh v6.4S, v13.4S, v21.s[0] +mul v13.4S, v13.4S,v3.s[0] +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +sqrdmulh v13.4S, v11.4S, v21.s[0] +mul v11.4S, v11.4S,v3.s[0] +mla v11.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +sqrdmulh v11.4S, v20.4S, v21.s[1] +mul v20.4S, v20.4S,v3.s[1] +mla v20.4S, v11.4S, v31.s[0] +sub v11.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +sqrdmulh v20.4S, v13.4S, v21.s[2] +mul v13.4S, v13.4S,v3.s[2] +mla v13.4S, v20.4S, v31.s[0] +sub v20.4s, v6.4s, v13.4s +add v6.4s, v6.4s, v13.4s +trn1 v13.4S, v8.4S, v11.4S +trn2 v7.4S, v8.4S, v11.4S +trn1 v5.4S, v6.4S, v20.4S +trn2 v4.4S, v6.4S, v20.4S +trn2 v6.2D, v13.2D, v5.2D +trn2 v20.2D, v7.2D, v4.2D +trn1 v8.2D, v13.2D, v5.2D +trn1 v11.2D, v7.2D, v4.2D +sqrdmulh v4.4S, v6.4S, v19.4S +mul v6.4S, v6.4S,v9.4S +mla v6.4S, v4.4S, v31.s[0] +sub 
v4.4s, v8.4s, v6.4s +add v8.4s, v8.4s, v6.4s +sqrdmulh v6.4S, v20.4S, v19.4S +mul v20.4S, v20.4S,v9.4S +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v11.4s, v20.4s +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v11.4S, v16.4S +mul v11.4S, v11.4S,v14.4S +mla v11.4S, v20.4S, v31.s[0] +sub v20.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +sqrdmulh v11.4S, v6.4S, v17.4S +mul v6.4S, v6.4S,v18.4S +mla v6.4S, v11.4S, v31.s[0] +sub v11.4s, v4.4s, v6.4s +add v4.4s, v4.4s, v6.4s +str q8, [x0, #832] +str q20, [x0, #848] +str q4, [x0, #864] +str q11, [x0, #880] +ldr q11, [x17, #+1920] +ldr q4, [x17, #+1936] +ldr q20, [x17, #+1952] +ldr q8, [x17, #+1968] +ldr q6, [x17, #+1984] +ldr q7, [x17, #+2000] +ldr q5, [x17, #+2016] +ldr q13, [x17, #+2032] +ldr q17, [x0, #928] +ldr q18, [x0, #944] +ldr q16, [x0, #896] +ldr q14, [x0, #912] +sqrdmulh v19.4S, v17.4S, v4.s[0] +mul v17.4S, v17.4S,v11.s[0] +mla v17.4S, v19.4S, v31.s[0] +sub v19.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v18.4S, v4.s[0] +mul v18.4S, v18.4S,v11.s[0] +mla v18.4S, v17.4S, v31.s[0] +sub v17.4s, v14.4s, v18.4s +add v14.4s, v14.4s, v18.4s +sqrdmulh v18.4S, v14.4S, v4.s[1] +mul v14.4S, v14.4S,v11.s[1] +mla v14.4S, v18.4S, v31.s[0] +sub v18.4s, v16.4s, v14.4s +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v17.4S, v4.s[2] +mul v17.4S, v17.4S,v11.s[2] +mla v17.4S, v14.4S, v31.s[0] +sub v14.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +trn1 v17.4S, v16.4S, v18.4S +trn2 v9.4S, v16.4S, v18.4S +trn1 v21.4S, v19.4S, v14.4S +trn2 v3.4S, v19.4S, v14.4S +trn2 v19.2D, v17.2D, v21.2D +trn2 v14.2D, v9.2D, v3.2D +trn1 v16.2D, v17.2D, v21.2D +trn1 v18.2D, v9.2D, v3.2D +sqrdmulh v3.4S, v19.4S, v8.4S +mul v19.4S, v19.4S,v20.4S +mla v19.4S, v3.4S, v31.s[0] +sub v3.4s, v16.4s, v19.4s +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v8.4S +mul v14.4S, v14.4S,v20.4S +mla v14.4S, v19.4S, v31.s[0] +sub v19.4s, v18.4s, v14.4s +add v18.4s, v18.4s, v14.4s +sqrdmulh v14.4S, v18.4S, v7.4S +mul v18.4S, v18.4S,v6.4S +mla v18.4S, 
v14.4S, v31.s[0] +sub v14.4s, v16.4s, v18.4s +add v16.4s, v16.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v13.4S +mul v19.4S, v19.4S,v5.4S +mla v19.4S, v18.4S, v31.s[0] +sub v18.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +str q16, [x0, #896] +str q14, [x0, #912] +str q3, [x0, #928] +str q18, [x0, #944] +ldr q18, [x17, #+2048] +ldr q3, [x17, #+2064] +ldr q14, [x17, #+2080] +ldr q16, [x17, #+2096] +ldr q19, [x17, #+2112] +ldr q9, [x17, #+2128] +ldr q21, [x17, #+2144] +ldr q17, [x17, #+2160] +ldr q13, [x0, #992] +ldr q5, [x0, #1008] +ldr q7, [x0, #960] +ldr q6, [x0, #976] +sqrdmulh v8.4S, v13.4S, v3.s[0] +mul v13.4S, v13.4S,v18.s[0] +mla v13.4S, v8.4S, v31.s[0] +sub v8.4s, v7.4s, v13.4s +add v7.4s, v7.4s, v13.4s +sqrdmulh v13.4S, v5.4S, v3.s[0] +mul v5.4S, v5.4S,v18.s[0] +mla v5.4S, v13.4S, v31.s[0] +sub v13.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v6.4S, v3.s[1] +mul v6.4S, v6.4S,v18.s[1] +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v7.4s, v6.4s +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v13.4S, v3.s[2] +mul v13.4S, v13.4S,v18.s[2] +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +trn1 v13.4S, v7.4S, v5.4S +trn2 v20.4S, v7.4S, v5.4S +trn1 v4.4S, v8.4S, v6.4S +trn2 v11.4S, v8.4S, v6.4S +trn2 v8.2D, v13.2D, v4.2D +trn2 v6.2D, v20.2D, v11.2D +trn1 v7.2D, v13.2D, v4.2D +trn1 v5.2D, v20.2D, v11.2D +sqrdmulh v11.4S, v8.4S, v16.4S +mul v8.4S, v8.4S,v14.4S +mla v8.4S, v11.4S, v31.s[0] +sub v11.4s, v7.4s, v8.4s +add v7.4s, v7.4s, v8.4s +sqrdmulh v8.4S, v6.4S, v16.4S +mul v6.4S, v6.4S,v14.4S +mla v6.4S, v8.4S, v31.s[0] +sub v8.4s, v5.4s, v6.4s +add v5.4s, v5.4s, v6.4s +sqrdmulh v6.4S, v5.4S, v9.4S +mul v5.4S, v5.4S,v19.4S +mla v5.4S, v6.4S, v31.s[0] +sub v6.4s, v7.4s, v5.4s +add v7.4s, v7.4s, v5.4s +sqrdmulh v5.4S, v8.4S, v17.4S +mul v8.4S, v8.4S,v21.4S +mla v8.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v8.4s +add v11.4s, v11.4s, v8.4s +str q7, [x0, #960] +str q6, [x0, #976] +str q11, [x0, #992] +str q5, [x0, #1008] +// Restore NEON vector 
registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_2_0.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_2_0.s new file mode 100644 index 0000000..1f606a8 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_2_0.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// +#include <hal_env.h> // NOTE(review): header name restored by inference; it must define ASM_LOAD, which this file uses but never defines - confirm +modulus: // loaded into q31; the visible code reads only lane 0 (v31.s[0]) as the mla operand +.word -33556993 // -q for q = 33556993 +.word 0 +.word 0 +.word 0 +.align 6 // power-of-two form: 2^6 = 64-byte alignment for the table below +roots_merged: // 4-word rows in pairs: twiddles w (mul operand), then the matching sqrdmulh operands (about w*2^31/q) +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word
1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 
6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 
+.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, 
block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 
31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 
26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 
+.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // 
Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 
28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 
608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_2_0 +.global _ntt_u32_full_neon_asm_var_4_4_2_0 +ntt_u32_full_neon_asm_var_4_4_2_0: +_ntt_u32_full_neon_asm_var_4_4_2_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +sqrdmulh v2.4S, v22.4S, v29.s[0] +ldr q1, [x0, #544] +mul v22.4S, v22.4S,v30.s[0] +ldr q0, [x0, #608] +sqrdmulh v15.4S, v21.4S, v29.s[0] +ldr q14, [x0, #672] +mul v21.4S, v21.4S,v30.s[0] +ldr q13, [x0, #736] +mla v22.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q12, [x0, #32] +mla v21.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q11, [x0, #96] +mla v20.4S, v2.4S, v31.s[0] +sub v2.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v29.s[0] +ldr q10, 
[x0, #160] +mul v1.4S, v1.4S,v30.s[0] +mla v19.4S, v22.4S, v31.s[0] +sub v22.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v29.s[0] +ldr q9, [x0, #224] +mul v0.4S, v0.4S,v30.s[0] +mla v1.4S, v21.4S, v31.s[0] +sub v21.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +mla v0.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v1.4s +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +mla v14.4S, v19.4S, v31.s[0] +sub v19.4s, v11.4s, v0.4s +add v11.4s, v11.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +mla v13.4S, v1.4S, v31.s[0] +sub v1.4s, v10.4s, v14.4s +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v16.4S, v0.4S, v31.s[0] +sub v0.4s, v9.4s, v13.4s +add v9.4s, v9.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +mla v3.4S, v14.4S, v31.s[0] +sub v14.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +mla v18.4S, v13.4S, v31.s[0] +sub v13.4s, v9.4s, v3.4s +add v9.4s, v9.4s, v3.4s +sqrdmulh v3.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v12.4s, v18.4s +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +mla v22.4S, v3.4S, v31.s[0] +sub v3.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +mla v15.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +sqrdmulh v21.4S, v10.4S, v27.s[0] +mul v10.4S, v10.4S,v28.s[0] +mla v2.4S, v22.4S, v31.s[0] +sub v22.4s, v20.4s, v15.4s +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v9.4S, v27.s[0] +mul v9.4S, v9.4S,v28.s[0] +mla v10.4S, v21.4S, v31.s[0] +sub 
v21.4s, v19.4s, v2.4s +add v19.4s, v19.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v27.s[1] +mul v14.4S, v14.4S,v28.s[1] +mla v9.4S, v15.4S, v31.s[0] +sub v15.4s, v12.4s, v10.4s +add v12.4s, v12.4s, v10.4s +sqrdmulh v10.4S, v13.4S, v27.s[1] +mul v13.4S, v13.4S,v28.s[1] +mla v14.4S, v2.4S, v31.s[0] +sub v2.4s, v11.4s, v9.4s +add v11.4s, v11.4s, v9.4s +sqrdmulh v9.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +mla v13.4S, v10.4S, v31.s[0] +sub v10.4s, v16.4s, v14.4s +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v0.4S, v27.s[2] +mul v0.4S, v0.4S,v28.s[2] +mla v1.4S, v9.4S, v31.s[0] +sub v9.4s, v3.4s, v13.4s +add v3.4s, v3.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +mla v0.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v1.4s +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +mla v18.4S, v13.4S, v31.s[0] +sub v13.4s, v19.4s, v0.4s +add v19.4s, v19.4s, v0.4s +sqrdmulh v0.4S, v11.4S, v25.s[0] +mul v11.4S, v11.4S,v26.s[0] +mla v17.4S, v1.4S, v31.s[0] +sub v1.4s, v22.4s, v18.4s +add v22.4s, v22.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v25.s[1] +mul v2.4S, v2.4S,v26.s[1] +mla v11.4S, v0.4S, v31.s[0] +sub v0.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v25.s[2] +mul v3.4S, v3.4S,v26.s[2] +mla v2.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v11.4s +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v9.4S, v25.s[3] +mul v9.4S, v9.4S,v26.s[3] +mla v3.4S, v17.4S, v31.s[0] +sub v17.4s, v15.4s, v2.4s +add v15.4s, v15.4s, v2.4s +str q12, [x0, #32] +sqrdmulh v12.4S, v19.4S, v23.s[0] +str q18, [x0, #96] +mul v19.4S, v19.4S,v24.s[0] +ldr q18, [x0, #816] +mla v9.4S, v11.4S, v31.s[0] +ldr q11, [x0, #880] +sub v2.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +str q15, [x0, #160] +sqrdmulh v15.4S, v13.4S, v23.s[1] +str q17, [x0, #224] +mul v13.4S, v13.4S,v24.s[1] +ldr q17, [x0, #944] +mla v19.4S, v12.4S, v31.s[0] +ldr q12, [x0, #1008] +sub v3.4s, v10.4s, v9.4s +add v10.4s, v10.4s, v9.4s +str q16, [x0, #288] +sqrdmulh 
v16.4S, v21.4S, v23.s[2] +str q2, [x0, #352] +mul v21.4S, v21.4S,v24.s[2] +ldr q2, [x0, #304] +mla v13.4S, v15.4S, v31.s[0] +ldr q15, [x0, #368] +sub v9.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +str q10, [x0, #416] +sqrdmulh v10.4S, v0.4S, v23.s[3] +str q3, [x0, #480] +mul v0.4S, v0.4S,v24.s[3] +ldr q3, [x0, #432] +mla v21.4S, v16.4S, v31.s[0] +ldr q16, [x0, #496] +sub v19.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +str q20, [x0, #544] +sqrdmulh v20.4S, v18.4S, v29.s[0] +str q9, [x0, #608] +ldr q9, [x0, #560] +mul v18.4S, v18.4S,v30.s[0] +ldr q13, [x0, #624] +mla v0.4S, v10.4S, v31.s[0] +sub v10.4s, v22.4s, v21.4s +add v22.4s, v22.4s, v21.4s +str q14, [x0, #672] +sqrdmulh v14.4S, v11.4S, v29.s[0] +str q19, [x0, #736] +ldr q19, [x0, #688] +mul v11.4S, v11.4S,v30.s[0] +ldr q21, [x0, #752] +mla v18.4S, v20.4S, v31.s[0] +sub v20.4s, v1.4s, v0.4s +add v1.4s, v1.4s, v0.4s +str q22, [x0, #800] +sqrdmulh v22.4S, v17.4S, v29.s[0] +str q10, [x0, #864] +mul v17.4S, v17.4S,v30.s[0] +ldr q10, [x0, #48] +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v2.4s, v18.4s +add v2.4s, v2.4s, v18.4s +str q1, [x0, #928] +sqrdmulh v1.4S, v12.4S, v29.s[0] +str q20, [x0, #992] +mul v12.4S, v12.4S,v30.s[0] +ldr q20, [x0, #112] +mla v17.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v11.4s +add v15.4s, v15.4s, v11.4s +sqrdmulh v11.4S, v9.4S, v29.s[0] +ldr q18, [x0, #176] +mul v9.4S, v9.4S,v30.s[0] +mla v12.4S, v1.4S, v31.s[0] +sub v1.4s, v3.4s, v17.4s +add v3.4s, v3.4s, v17.4s +sqrdmulh v17.4S, v13.4S, v29.s[0] +ldr q0, [x0, #240] +mul v13.4S, v13.4S,v30.s[0] +mla v9.4S, v11.4S, v31.s[0] +sub v11.4s, v16.4s, v12.4s +add v16.4s, v16.4s, v12.4s +sqrdmulh v12.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +mla v13.4S, v17.4S, v31.s[0] +sub v17.4s, v10.4s, v9.4s +add v10.4s, v10.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +mla v19.4S, v12.4S, v31.s[0] +sub v12.4s, v20.4s, v13.4s +add v20.4s, v20.4s, v13.4s +sqrdmulh v13.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] 
+mla v21.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v19.4s +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +mla v3.4S, v13.4S, v31.s[0] +sub v13.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +sqrdmulh v21.4S, v2.4S, v29.s[1] +mul v2.4S, v2.4S,v30.s[1] +mla v16.4S, v19.4S, v31.s[0] +sub v19.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +mla v2.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v16.4s +add v0.4s, v0.4s, v16.4s +sqrdmulh v16.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +mla v15.4S, v3.4S, v31.s[0] +sub v3.4s, v10.4s, v2.4s +add v10.4s, v10.4s, v2.4s +sqrdmulh v2.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +mla v1.4S, v16.4S, v31.s[0] +sub v16.4s, v20.4s, v15.4s +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +mla v11.4S, v2.4S, v31.s[0] +sub v2.4s, v9.4s, v1.4s +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +mla v14.4S, v15.4S, v31.s[0] +sub v15.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v18.4S, v27.s[0] +mul v18.4S, v18.4S,v28.s[0] +mla v22.4S, v1.4S, v31.s[0] +sub v1.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +sqrdmulh v14.4S, v0.4S, v27.s[0] +mul v0.4S, v0.4S,v28.s[0] +mla v18.4S, v11.4S, v31.s[0] +sub v11.4s, v12.4s, v22.4s +add v12.4s, v12.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +mla v0.4S, v14.4S, v31.s[0] +sub v14.4s, v10.4s, v18.4s +add v10.4s, v10.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v27.s[1] +mul v21.4S, v21.4S,v28.s[1] +mla v19.4S, v22.4S, v31.s[0] +sub v22.4s, v20.4s, v0.4s +add v20.4s, v20.4s, v0.4s +sqrdmulh v0.4S, v9.4S, v27.s[2] +mul v9.4S, v9.4S,v28.s[2] +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +mla v9.4S, v0.4S, v31.s[0] +sub v0.4s, v16.4s, v21.4s +add v16.4s, v16.4s, v21.4s +sqrdmulh 
v21.4S, v2.4S, v27.s[3] +mul v2.4S, v2.4S,v28.s[3] +mla v13.4S, v19.4S, v31.s[0] +sub v19.4s, v17.4s, v9.4s +add v17.4s, v17.4s, v9.4s +sqrdmulh v9.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +mla v2.4S, v21.4S, v31.s[0] +sub v21.4s, v12.4s, v13.4s +add v12.4s, v12.4s, v13.4s +sqrdmulh v13.4S, v20.4S, v25.s[0] +mul v20.4S, v20.4S,v26.s[0] +mla v15.4S, v9.4S, v31.s[0] +sub v9.4s, v1.4s, v2.4s +add v1.4s, v1.4s, v2.4s +sqrdmulh v2.4S, v22.4S, v25.s[1] +mul v22.4S, v22.4S,v26.s[1] +mla v20.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +mla v22.4S, v2.4S, v31.s[0] +sub v2.4s, v10.4s, v20.4s +add v10.4s, v10.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v25.s[3] +mul v0.4S, v0.4S,v26.s[3] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +str q10, [x0, #48] +sqrdmulh v10.4S, v12.4S, v23.s[0] +str q2, [x0, #112] +mul v12.4S, v12.4S,v24.s[0] +ldr q2, [x0, #768] +mla v0.4S, v20.4S, v31.s[0] +ldr q20, [x0, #832] +sub v22.4s, v3.4s, v16.4s +add v3.4s, v3.4s, v16.4s +str q14, [x0, #176] +sqrdmulh v14.4S, v21.4S, v23.s[1] +str q15, [x0, #240] +mul v21.4S, v21.4S,v24.s[1] +ldr q15, [x0, #896] +mla v12.4S, v10.4S, v31.s[0] +ldr q10, [x0, #960] +sub v16.4s, v18.4s, v0.4s +add v18.4s, v18.4s, v0.4s +str q3, [x0, #304] +sqrdmulh v3.4S, v11.4S, v23.s[2] +str q22, [x0, #368] +mul v11.4S, v11.4S,v24.s[2] +ldr q22, [x0, #256] +mla v21.4S, v14.4S, v31.s[0] +ldr q14, [x0, #320] +sub v0.4s, v17.4s, v12.4s +add v17.4s, v17.4s, v12.4s +str q18, [x0, #432] +sqrdmulh v18.4S, v13.4S, v23.s[3] +str q16, [x0, #496] +mul v13.4S, v13.4S,v24.s[3] +ldr q16, [x0, #384] +mla v11.4S, v3.4S, v31.s[0] +ldr q3, [x0, #448] +sub v12.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +str q17, [x0, #560] +sqrdmulh v17.4S, v2.4S, v29.s[0] +str q0, [x0, #624] +ldr q0, [x0, #512] +mul v2.4S, v2.4S,v30.s[0] +ldr q21, [x0, #576] +mla v13.4S, v18.4S, v31.s[0] +sub v18.4s, v1.4s, v11.4s +add 
v1.4s, v1.4s, v11.4s +str q19, [x0, #688] +sqrdmulh v19.4S, v20.4S, v29.s[0] +str q12, [x0, #752] +ldr q12, [x0, #640] +mul v20.4S, v20.4S,v30.s[0] +ldr q11, [x0, #704] +mla v2.4S, v17.4S, v31.s[0] +sub v17.4s, v9.4s, v13.4s +add v9.4s, v9.4s, v13.4s +str q1, [x0, #816] +sqrdmulh v1.4S, v15.4S, v29.s[0] +str q18, [x0, #880] +mul v15.4S, v15.4S,v30.s[0] +ldr q18, [x0, #0] +mla v20.4S, v19.4S, v31.s[0] +sub v19.4s, v22.4s, v2.4s +add v22.4s, v22.4s, v2.4s +str q9, [x0, #944] +sqrdmulh v9.4S, v10.4S, v29.s[0] +str q17, [x0, #1008] +mul v10.4S, v10.4S,v30.s[0] +ldr q17, [x0, #64] +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v14.4s, v20.4s +add v14.4s, v14.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v29.s[0] +ldr q2, [x0, #128] +mul v0.4S, v0.4S,v30.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v16.4s, v15.4s +add v16.4s, v16.4s, v15.4s +sqrdmulh v15.4S, v21.4S, v29.s[0] +ldr q13, [x0, #192] +mul v21.4S, v21.4S,v30.s[0] +mla v0.4S, v20.4S, v31.s[0] +sub v20.4s, v3.4s, v10.4s +add v3.4s, v3.4s, v10.4s +sqrdmulh v10.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +mla v21.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v0.4s +add v18.4s, v18.4s, v0.4s +sqrdmulh v0.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +mla v12.4S, v10.4S, v31.s[0] +sub v10.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +mla v11.4S, v0.4S, v31.s[0] +sub v0.4s, v2.4s, v12.4s +add v2.4s, v2.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v16.4S, v21.4S, v31.s[0] +sub v21.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v29.s[1] +mul v22.4S, v22.4S,v30.s[1] +mla v3.4S, v12.4S, v31.s[0] +sub v12.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v13.4s, v3.4s +add v13.4s, v13.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +mla v14.4S, v16.4S, v31.s[0] +sub v16.4s, v18.4s, v22.4s 
+add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +mla v9.4S, v3.4S, v31.s[0] +sub v3.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +sqrdmulh v14.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +mla v20.4S, v22.4S, v31.s[0] +sub v22.4s, v0.4s, v9.4s +add v0.4s, v0.4s, v9.4s +sqrdmulh v9.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +mla v19.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +mla v1.4S, v9.4S, v31.s[0] +sub v9.4s, v15.4s, v19.4s +add v15.4s, v15.4s, v19.4s +sqrdmulh v19.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +mla v2.4S, v20.4S, v31.s[0] +sub v20.4s, v10.4s, v1.4s +add v10.4s, v10.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v27.s[1] +mul v12.4S, v12.4S,v28.s[1] +mla v13.4S, v19.4S, v31.s[0] +sub v19.4s, v18.4s, v2.4s +add v18.4s, v18.4s, v2.4s +sqrdmulh v2.4S, v11.4S, v27.s[1] +mul v11.4S, v11.4S,v28.s[1] +mla v12.4S, v1.4S, v31.s[0] +sub v1.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +sqrdmulh v13.4S, v0.4S, v27.s[2] +mul v0.4S, v0.4S,v28.s[2] +mla v11.4S, v2.4S, v31.s[0] +sub v2.4s, v16.4s, v12.4s +add v16.4s, v16.4s, v12.4s +sqrdmulh v12.4S, v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +mla v0.4S, v13.4S, v31.s[0] +sub v13.4s, v3.4s, v11.4s +add v3.4s, v3.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v27.s[3] +mul v22.4S, v22.4S,v28.s[3] +mla v21.4S, v12.4S, v31.s[0] +sub v12.4s, v15.4s, v0.4s +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v14.4S, v27.s[3] +mul v14.4S, v14.4S,v28.s[3] +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v25.s[0] +mul v17.4S, v17.4S,v26.s[0] +mla v14.4S, v0.4S, v31.s[0] +sub v0.4s, v9.4s, v22.4s +add v9.4s, v9.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v25.s[1] +mul v1.4S, v1.4S,v26.s[1] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v14.4s +add v20.4s, v20.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v25.s[2] +mul v3.4S, v3.4S,v26.s[2] +mla 
v1.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v17.4s +add v18.4s, v18.4s, v17.4s +sqrdmulh v17.4S, v13.4S, v25.s[3] +mul v13.4S, v13.4S,v26.s[3] +mla v3.4S, v14.4S, v31.s[0] +sub v14.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +str q18, [x0, #0] +sqrdmulh v18.4S, v10.4S, v23.s[0] +str q22, [x0, #64] +mul v10.4S, v10.4S,v24.s[0] +ldr q22, [x0, #784] +mla v13.4S, v17.4S, v31.s[0] +ldr q17, [x0, #848] +sub v1.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +str q19, [x0, #128] +sqrdmulh v19.4S, v11.4S, v23.s[1] +str q14, [x0, #192] +mul v11.4S, v11.4S,v24.s[1] +ldr q14, [x0, #912] +mla v10.4S, v18.4S, v31.s[0] +ldr q18, [x0, #976] +sub v3.4s, v2.4s, v13.4s +add v2.4s, v2.4s, v13.4s +str q16, [x0, #256] +sqrdmulh v16.4S, v20.4S, v23.s[2] +str q1, [x0, #320] +mul v20.4S, v20.4S,v24.s[2] +ldr q1, [x0, #272] +mla v11.4S, v19.4S, v31.s[0] +ldr q19, [x0, #336] +sub v13.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +str q2, [x0, #384] +sqrdmulh v2.4S, v21.4S, v23.s[3] +str q3, [x0, #448] +mul v21.4S, v21.4S,v24.s[3] +ldr q3, [x0, #400] +mla v20.4S, v16.4S, v31.s[0] +ldr q16, [x0, #464] +sub v10.4s, v12.4s, v11.4s +add v12.4s, v12.4s, v11.4s +str q15, [x0, #512] +sqrdmulh v15.4S, v22.4S, v29.s[0] +str q13, [x0, #576] +ldr q13, [x0, #528] +mul v22.4S, v22.4S,v30.s[0] +ldr q11, [x0, #592] +mla v21.4S, v2.4S, v31.s[0] +sub v2.4s, v9.4s, v20.4s +add v9.4s, v9.4s, v20.4s +str q12, [x0, #640] +sqrdmulh v12.4S, v17.4S, v29.s[0] +str q10, [x0, #704] +ldr q10, [x0, #656] +mul v17.4S, v17.4S,v30.s[0] +ldr q20, [x0, #720] +mla v22.4S, v15.4S, v31.s[0] +sub v15.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +str q9, [x0, #768] +sqrdmulh v9.4S, v14.4S, v29.s[0] +str q2, [x0, #832] +mul v14.4S, v14.4S,v30.s[0] +ldr q2, [x0, #16] +mla v17.4S, v12.4S, v31.s[0] +sub v12.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +str q0, [x0, #896] +sqrdmulh v0.4S, v18.4S, v29.s[0] +str q15, [x0, #960] +mul v18.4S, v18.4S,v30.s[0] +ldr q15, [x0, #80] +mla v14.4S, v9.4S, v31.s[0] +sub v9.4s, v19.4s, v17.4s +add 
v19.4s, v19.4s, v17.4s +sqrdmulh v17.4S, v13.4S, v29.s[0] +ldr q22, [x0, #144] +mul v13.4S, v13.4S,v30.s[0] +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v3.4s, v14.4s +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v29.s[0] +ldr q21, [x0, #208] +mul v11.4S, v11.4S,v30.s[0] +mla v13.4S, v17.4S, v31.s[0] +sub v17.4s, v16.4s, v18.4s +add v16.4s, v16.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v2.4s, v13.4s +add v2.4s, v2.4s, v13.4s +sqrdmulh v13.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v10.4S, v18.4S, v31.s[0] +sub v18.4s, v15.4s, v11.4s +add v15.4s, v15.4s, v11.4s +sqrdmulh v11.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v20.4S, v13.4S, v31.s[0] +sub v13.4s, v22.4s, v10.4s +add v22.4s, v22.4s, v10.4s +sqrdmulh v10.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +mla v3.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v1.4S, v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +mla v16.4S, v10.4S, v31.s[0] +sub v10.4s, v22.4s, v3.4s +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +mla v1.4S, v20.4S, v31.s[0] +sub v20.4s, v21.4s, v16.4s +add v21.4s, v21.4s, v16.4s +sqrdmulh v16.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +mla v19.4S, v3.4S, v31.s[0] +sub v3.4s, v2.4s, v1.4s +add v2.4s, v2.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +mla v0.4S, v16.4S, v31.s[0] +sub v16.4s, v15.4s, v19.4s +add v15.4s, v15.4s, v19.4s +sqrdmulh v19.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +mla v17.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v0.4s +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +mla v12.4S, v19.4S, v31.s[0] +sub v19.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +mla v9.4S, v0.4S, v31.s[0] +sub v0.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v21.4S, 
v27.s[0] +mul v21.4S, v21.4S,v28.s[0] +mla v22.4S, v17.4S, v31.s[0] +sub v17.4s, v18.4s, v9.4s +add v18.4s, v18.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v27.s[1] +mul v10.4S, v10.4S,v28.s[1] +mla v21.4S, v12.4S, v31.s[0] +sub v12.4s, v2.4s, v22.4s +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v15.4s, v21.4s +add v15.4s, v15.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +mla v20.4S, v22.4S, v31.s[0] +sub v22.4s, v3.4s, v10.4s +add v3.4s, v3.4s, v10.4s +sqrdmulh v10.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v1.4S, v27.s[3] +mul v1.4S, v1.4S,v28.s[3] +mla v11.4S, v10.4S, v31.s[0] +sub v10.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +sqrdmulh v13.4S, v19.4S, v27.s[3] +mul v19.4S, v19.4S,v28.s[3] +mla v1.4S, v20.4S, v31.s[0] +sub v20.4s, v18.4s, v11.4s +add v18.4s, v18.4s, v11.4s +sqrdmulh v11.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +mla v19.4S, v13.4S, v31.s[0] +sub v13.4s, v0.4s, v1.4s +add v0.4s, v0.4s, v1.4s +sqrdmulh v1.4S, v9.4S, v25.s[1] +mul v9.4S, v9.4S,v26.s[1] +mla v15.4S, v11.4S, v31.s[0] +sub v11.4s, v17.4s, v19.4s +add v17.4s, v17.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +mla v9.4S, v1.4S, v31.s[0] +sub v1.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +sqrdmulh v15.4S, v21.4S, v25.s[3] +mul v21.4S, v21.4S,v26.s[3] +mla v16.4S, v19.4S, v31.s[0] +sub v19.4s, v12.4s, v9.4s +add v12.4s, v12.4s, v9.4s +str q2, [x0, #16] +sqrdmulh v2.4S, v18.4S, v23.s[0] +str q1, [x0, #80] +mul v18.4S, v18.4S,v24.s[0] +mla v21.4S, v15.4S, v31.s[0] +sub v15.4s, v3.4s, v16.4s +add v3.4s, v3.4s, v16.4s +str q12, [x0, #144] +sqrdmulh v12.4S, v20.4S, v23.s[1] +str q19, [x0, #208] +mul v20.4S, v20.4S,v24.s[1] +mla v18.4S, v2.4S, v31.s[0] +sub v2.4s, v22.4s, v21.4s +add v22.4s, v22.4s, v21.4s +str q3, [x0, #272] +sqrdmulh 
v3.4S, v17.4S, v23.s[2] +str q15, [x0, #336] +mul v17.4S, v17.4S,v24.s[2] +mla v20.4S, v12.4S, v31.s[0] +sub v12.4s, v14.4s, v18.4s +add v14.4s, v14.4s, v18.4s +str q22, [x0, #400] +sqrdmulh v22.4S, v11.4S, v23.s[3] +str q2, [x0, #464] +mul v11.4S, v11.4S,v24.s[3] +mla v17.4S, v3.4S, v31.s[0] +sub v3.4s, v10.4s, v20.4s +add v10.4s, v10.4s, v20.4s +str q14, [x0, #528] +str q12, [x0, #592] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +str q10, [x0, #656] +str q3, [x0, #720] +sub v3.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +str q0, [x0, #784] +str q22, [x0, #848] +str q13, [x0, #912] +str q3, [x0, #976] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q6, [x17, #+160] +ldr q7, [x17, #+176] +ldr q8, [x17, #+192] +ldr q9, [x17, #+208] +ldr q1, [x17, #+224] +ldr q16, [x17, #+240] +ldr q19, [x0, #32] +ldr q21, [x0, #48] +ldr q15, [x0, #0] +ldr q18, [x0, #16] +sqrdmulh v2.4S, v19.4S, v5.s[0] +mul v19.4S, v19.4S,v4.s[0] +mla v19.4S, v2.4S, v31.s[0] +sub v2.4s, v15.4s, v19.4s +add v15.4s, v15.4s, v19.4s +sqrdmulh v19.4S, v21.4S, v5.s[0] +mul v21.4S, v21.4S,v4.s[0] +mla v21.4S, v19.4S, v31.s[0] +sub v19.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v18.4S, v5.s[1] +mul v18.4S, v18.4S,v4.s[1] +mla v18.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v5.s[2] +mul v19.4S, v19.4S,v4.s[2] +mla v19.4S, v18.4S, v31.s[0] +sub v18.4s, v2.4s, v19.4s +add v2.4s, v2.4s, v19.4s +trn1 v19.4S, v15.4S, v21.4S +trn2 v20.4S, v15.4S, v21.4S +trn1 v14.4S, v2.4S, v18.4S +trn2 v12.4S, v2.4S, v18.4S +trn2 v2.2D, v19.2D, v14.2D +trn2 v18.2D, v20.2D, v12.2D +trn1 v15.2D, v19.2D, v14.2D +trn1 v21.2D, v20.2D, v12.2D +sqrdmulh v12.4S, v2.4S, v7.4S +mul v2.4S, v2.4S,v6.4S +mla v2.4S, v12.4S, v31.s[0] +sub v12.4s, v15.4s, v2.4s +add v15.4s, v15.4s, v2.4s +sqrdmulh v2.4S, v18.4S, v7.4S +mul v18.4S, v18.4S,v6.4S +mla v18.4S, v2.4S, v31.s[0] +sub v2.4s, v21.4s, v18.4s +add v21.4s, v21.4s, 
v18.4s +sqrdmulh v18.4S, v21.4S, v9.4S +mul v21.4S, v21.4S,v8.4S +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v15.4s, v21.4s +add v15.4s, v15.4s, v21.4s +sqrdmulh v21.4S, v2.4S, v16.4S +mul v2.4S, v2.4S,v1.4S +mla v2.4S, v21.4S, v31.s[0] +sub v21.4s, v12.4s, v2.4s +add v12.4s, v12.4s, v2.4s +str q15, [x0, #0] +str q18, [x0, #16] +str q12, [x0, #32] +str q21, [x0, #48] +ldr q21, [x17, #+256] +ldr q12, [x17, #+272] +ldr q18, [x17, #+288] +ldr q15, [x17, #+304] +ldr q2, [x17, #+320] +ldr q20, [x17, #+336] +ldr q14, [x17, #+352] +ldr q19, [x17, #+368] +ldr q16, [x0, #96] +ldr q1, [x0, #112] +ldr q9, [x0, #64] +ldr q8, [x0, #80] +sqrdmulh v7.4S, v16.4S, v12.s[0] +mul v16.4S, v16.4S,v21.s[0] +mla v16.4S, v7.4S, v31.s[0] +sub v7.4s, v9.4s, v16.4s +add v9.4s, v9.4s, v16.4s +sqrdmulh v16.4S, v1.4S, v12.s[0] +mul v1.4S, v1.4S,v21.s[0] +mla v1.4S, v16.4S, v31.s[0] +sub v16.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v12.s[1] +mul v8.4S, v8.4S,v21.s[1] +mla v8.4S, v1.4S, v31.s[0] +sub v1.4s, v9.4s, v8.4s +add v9.4s, v9.4s, v8.4s +sqrdmulh v8.4S, v16.4S, v12.s[2] +mul v16.4S, v16.4S,v21.s[2] +mla v16.4S, v8.4S, v31.s[0] +sub v8.4s, v7.4s, v16.4s +add v7.4s, v7.4s, v16.4s +trn1 v16.4S, v9.4S, v1.4S +trn2 v6.4S, v9.4S, v1.4S +trn1 v5.4S, v7.4S, v8.4S +trn2 v4.4S, v7.4S, v8.4S +trn2 v7.2D, v16.2D, v5.2D +trn2 v8.2D, v6.2D, v4.2D +trn1 v9.2D, v16.2D, v5.2D +trn1 v1.2D, v6.2D, v4.2D +sqrdmulh v4.4S, v7.4S, v15.4S +mul v7.4S, v7.4S,v18.4S +mla v7.4S, v4.4S, v31.s[0] +sub v4.4s, v9.4s, v7.4s +add v9.4s, v9.4s, v7.4s +sqrdmulh v7.4S, v8.4S, v15.4S +mul v8.4S, v8.4S,v18.4S +mla v8.4S, v7.4S, v31.s[0] +sub v7.4s, v1.4s, v8.4s +add v1.4s, v1.4s, v8.4s +sqrdmulh v8.4S, v1.4S, v20.4S +mul v1.4S, v1.4S,v2.4S +mla v1.4S, v8.4S, v31.s[0] +sub v8.4s, v9.4s, v1.4s +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v7.4S, v19.4S +mul v7.4S, v7.4S,v14.4S +mla v7.4S, v1.4S, v31.s[0] +sub v1.4s, v4.4s, v7.4s +add v4.4s, v4.4s, v7.4s +str q9, [x0, #64] +str q8, [x0, #80] +str q4, [x0, #96] 
+str q1, [x0, #112] +ldr q1, [x17, #+384] +ldr q4, [x17, #+400] +ldr q8, [x17, #+416] +ldr q9, [x17, #+432] +ldr q7, [x17, #+448] +ldr q6, [x17, #+464] +ldr q5, [x17, #+480] +ldr q16, [x17, #+496] +ldr q19, [x0, #160] +ldr q14, [x0, #176] +ldr q20, [x0, #128] +ldr q2, [x0, #144] +sqrdmulh v15.4S, v19.4S, v4.s[0] +mul v19.4S, v19.4S,v1.s[0] +mla v19.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v4.s[0] +mul v14.4S, v14.4S,v1.s[0] +mla v14.4S, v19.4S, v31.s[0] +sub v19.4s, v2.4s, v14.4s +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v4.s[1] +mul v2.4S, v2.4S,v1.s[1] +mla v2.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v2.4s +add v20.4s, v20.4s, v2.4s +sqrdmulh v2.4S, v19.4S, v4.s[2] +mul v19.4S, v19.4S,v1.s[2] +mla v19.4S, v2.4S, v31.s[0] +sub v2.4s, v15.4s, v19.4s +add v15.4s, v15.4s, v19.4s +trn1 v19.4S, v20.4S, v14.4S +trn2 v18.4S, v20.4S, v14.4S +trn1 v12.4S, v15.4S, v2.4S +trn2 v21.4S, v15.4S, v2.4S +trn2 v15.2D, v19.2D, v12.2D +trn2 v2.2D, v18.2D, v21.2D +trn1 v20.2D, v19.2D, v12.2D +trn1 v14.2D, v18.2D, v21.2D +sqrdmulh v21.4S, v15.4S, v9.4S +mul v15.4S, v15.4S,v8.4S +mla v15.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v15.4s +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v9.4S +mul v2.4S, v2.4S,v8.4S +mla v2.4S, v15.4S, v31.s[0] +sub v15.4s, v14.4s, v2.4s +add v14.4s, v14.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v6.4S +mul v14.4S, v14.4S,v7.4S +mla v14.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v14.4s +add v20.4s, v20.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v16.4S +mul v15.4S, v15.4S,v5.4S +mla v15.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +str q20, [x0, #128] +str q2, [x0, #144] +str q21, [x0, #160] +str q14, [x0, #176] +ldr q14, [x17, #+512] +ldr q21, [x17, #+528] +ldr q2, [x17, #+544] +ldr q20, [x17, #+560] +ldr q15, [x17, #+576] +ldr q18, [x17, #+592] +ldr q12, [x17, #+608] +ldr q19, [x17, #+624] +ldr q16, [x0, #224] +ldr q5, [x0, #240] +ldr q6, [x0, #192] +ldr q7, [x0, 
#208] +sqrdmulh v9.4S, v16.4S, v21.s[0] +mul v16.4S, v16.4S,v14.s[0] +mla v16.4S, v9.4S, v31.s[0] +sub v9.4s, v6.4s, v16.4s +add v6.4s, v6.4s, v16.4s +sqrdmulh v16.4S, v5.4S, v21.s[0] +mul v5.4S, v5.4S,v14.s[0] +mla v5.4S, v16.4S, v31.s[0] +sub v16.4s, v7.4s, v5.4s +add v7.4s, v7.4s, v5.4s +sqrdmulh v5.4S, v7.4S, v21.s[1] +mul v7.4S, v7.4S,v14.s[1] +mla v7.4S, v5.4S, v31.s[0] +sub v5.4s, v6.4s, v7.4s +add v6.4s, v6.4s, v7.4s +sqrdmulh v7.4S, v16.4S, v21.s[2] +mul v16.4S, v16.4S,v14.s[2] +mla v16.4S, v7.4S, v31.s[0] +sub v7.4s, v9.4s, v16.4s +add v9.4s, v9.4s, v16.4s +trn1 v16.4S, v6.4S, v5.4S +trn2 v8.4S, v6.4S, v5.4S +trn1 v4.4S, v9.4S, v7.4S +trn2 v1.4S, v9.4S, v7.4S +trn2 v9.2D, v16.2D, v4.2D +trn2 v7.2D, v8.2D, v1.2D +trn1 v6.2D, v16.2D, v4.2D +trn1 v5.2D, v8.2D, v1.2D +sqrdmulh v1.4S, v9.4S, v20.4S +mul v9.4S, v9.4S,v2.4S +mla v9.4S, v1.4S, v31.s[0] +sub v1.4s, v6.4s, v9.4s +add v6.4s, v6.4s, v9.4s +sqrdmulh v9.4S, v7.4S, v20.4S +mul v7.4S, v7.4S,v2.4S +mla v7.4S, v9.4S, v31.s[0] +sub v9.4s, v5.4s, v7.4s +add v5.4s, v5.4s, v7.4s +sqrdmulh v7.4S, v5.4S, v18.4S +mul v5.4S, v5.4S,v15.4S +mla v5.4S, v7.4S, v31.s[0] +sub v7.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v9.4S, v19.4S +mul v9.4S, v9.4S,v12.4S +mla v9.4S, v5.4S, v31.s[0] +sub v5.4s, v1.4s, v9.4s +add v1.4s, v1.4s, v9.4s +str q6, [x0, #192] +str q7, [x0, #208] +str q1, [x0, #224] +str q5, [x0, #240] +ldr q5, [x17, #+640] +ldr q1, [x17, #+656] +ldr q7, [x17, #+672] +ldr q6, [x17, #+688] +ldr q9, [x17, #+704] +ldr q8, [x17, #+720] +ldr q4, [x17, #+736] +ldr q16, [x17, #+752] +ldr q19, [x0, #288] +ldr q12, [x0, #304] +ldr q18, [x0, #256] +ldr q15, [x0, #272] +sqrdmulh v20.4S, v19.4S, v1.s[0] +mul v19.4S, v19.4S,v5.s[0] +mla v19.4S, v20.4S, v31.s[0] +sub v20.4s, v18.4s, v19.4s +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v12.4S, v1.s[0] +mul v12.4S, v12.4S,v5.s[0] +mla v12.4S, v19.4S, v31.s[0] +sub v19.4s, v15.4s, v12.4s +add v15.4s, v15.4s, v12.4s +sqrdmulh v12.4S, v15.4S, v1.s[1] +mul 
v15.4S, v15.4S,v5.s[1] +mla v15.4S, v12.4S, v31.s[0] +sub v12.4s, v18.4s, v15.4s +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v19.4S, v1.s[2] +mul v19.4S, v19.4S,v5.s[2] +mla v19.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +trn1 v19.4S, v18.4S, v12.4S +trn2 v2.4S, v18.4S, v12.4S +trn1 v21.4S, v20.4S, v15.4S +trn2 v14.4S, v20.4S, v15.4S +trn2 v20.2D, v19.2D, v21.2D +trn2 v15.2D, v2.2D, v14.2D +trn1 v18.2D, v19.2D, v21.2D +trn1 v12.2D, v2.2D, v14.2D +sqrdmulh v14.4S, v20.4S, v6.4S +mul v20.4S, v20.4S,v7.4S +mla v20.4S, v14.4S, v31.s[0] +sub v14.4s, v18.4s, v20.4s +add v18.4s, v18.4s, v20.4s +sqrdmulh v20.4S, v15.4S, v6.4S +mul v15.4S, v15.4S,v7.4S +mla v15.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v15.4s +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v12.4S, v8.4S +mul v12.4S, v12.4S,v9.4S +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v12.4s +add v18.4s, v18.4s, v12.4s +sqrdmulh v12.4S, v20.4S, v16.4S +mul v20.4S, v20.4S,v4.4S +mla v20.4S, v12.4S, v31.s[0] +sub v12.4s, v14.4s, v20.4s +add v14.4s, v14.4s, v20.4s +str q18, [x0, #256] +str q15, [x0, #272] +str q14, [x0, #288] +str q12, [x0, #304] +ldr q12, [x17, #+768] +ldr q14, [x17, #+784] +ldr q15, [x17, #+800] +ldr q18, [x17, #+816] +ldr q20, [x17, #+832] +ldr q2, [x17, #+848] +ldr q21, [x17, #+864] +ldr q19, [x17, #+880] +ldr q16, [x0, #352] +ldr q4, [x0, #368] +ldr q8, [x0, #320] +ldr q9, [x0, #336] +sqrdmulh v6.4S, v16.4S, v14.s[0] +mul v16.4S, v16.4S,v12.s[0] +mla v16.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v16.4s +add v8.4s, v8.4s, v16.4s +sqrdmulh v16.4S, v4.4S, v14.s[0] +mul v4.4S, v4.4S,v12.s[0] +mla v4.4S, v16.4S, v31.s[0] +sub v16.4s, v9.4s, v4.4s +add v9.4s, v9.4s, v4.4s +sqrdmulh v4.4S, v9.4S, v14.s[1] +mul v9.4S, v9.4S,v12.s[1] +mla v9.4S, v4.4S, v31.s[0] +sub v4.4s, v8.4s, v9.4s +add v8.4s, v8.4s, v9.4s +sqrdmulh v9.4S, v16.4S, v14.s[2] +mul v16.4S, v16.4S,v12.s[2] +mla v16.4S, v9.4S, v31.s[0] +sub v9.4s, v6.4s, v16.4s +add v6.4s, v6.4s, v16.4s +trn1 v16.4S, v8.4S, 
v4.4S +trn2 v7.4S, v8.4S, v4.4S +trn1 v1.4S, v6.4S, v9.4S +trn2 v5.4S, v6.4S, v9.4S +trn2 v6.2D, v16.2D, v1.2D +trn2 v9.2D, v7.2D, v5.2D +trn1 v8.2D, v16.2D, v1.2D +trn1 v4.2D, v7.2D, v5.2D +sqrdmulh v5.4S, v6.4S, v18.4S +mul v6.4S, v6.4S,v15.4S +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v8.4s, v6.4s +add v8.4s, v8.4s, v6.4s +sqrdmulh v6.4S, v9.4S, v18.4S +mul v9.4S, v9.4S,v15.4S +mla v9.4S, v6.4S, v31.s[0] +sub v6.4s, v4.4s, v9.4s +add v4.4s, v4.4s, v9.4s +sqrdmulh v9.4S, v4.4S, v2.4S +mul v4.4S, v4.4S,v20.4S +mla v4.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v4.4s +add v8.4s, v8.4s, v4.4s +sqrdmulh v4.4S, v6.4S, v19.4S +mul v6.4S, v6.4S,v21.4S +mla v6.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v6.4s +add v5.4s, v5.4s, v6.4s +str q8, [x0, #320] +str q9, [x0, #336] +str q5, [x0, #352] +str q4, [x0, #368] +ldr q4, [x17, #+896] +ldr q5, [x17, #+912] +ldr q9, [x17, #+928] +ldr q8, [x17, #+944] +ldr q6, [x17, #+960] +ldr q7, [x17, #+976] +ldr q1, [x17, #+992] +ldr q16, [x17, #+1008] +ldr q19, [x0, #416] +ldr q21, [x0, #432] +ldr q2, [x0, #384] +ldr q20, [x0, #400] +sqrdmulh v18.4S, v19.4S, v5.s[0] +mul v19.4S, v19.4S,v4.s[0] +mla v19.4S, v18.4S, v31.s[0] +sub v18.4s, v2.4s, v19.4s +add v2.4s, v2.4s, v19.4s +sqrdmulh v19.4S, v21.4S, v5.s[0] +mul v21.4S, v21.4S,v4.s[0] +mla v21.4S, v19.4S, v31.s[0] +sub v19.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v20.4S, v5.s[1] +mul v20.4S, v20.4S,v4.s[1] +mla v20.4S, v21.4S, v31.s[0] +sub v21.4s, v2.4s, v20.4s +add v2.4s, v2.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v5.s[2] +mul v19.4S, v19.4S,v4.s[2] +mla v19.4S, v20.4S, v31.s[0] +sub v20.4s, v18.4s, v19.4s +add v18.4s, v18.4s, v19.4s +trn1 v19.4S, v2.4S, v21.4S +trn2 v15.4S, v2.4S, v21.4S +trn1 v14.4S, v18.4S, v20.4S +trn2 v12.4S, v18.4S, v20.4S +trn2 v18.2D, v19.2D, v14.2D +trn2 v20.2D, v15.2D, v12.2D +trn1 v2.2D, v19.2D, v14.2D +trn1 v21.2D, v15.2D, v12.2D +sqrdmulh v12.4S, v18.4S, v8.4S +mul v18.4S, v18.4S,v9.4S +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v2.4s, v18.4s 
+add v2.4s, v2.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v8.4S +mul v20.4S, v20.4S,v9.4S +mla v20.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v20.4S, v31.s[0] +sub v20.4s, v2.4s, v21.4s +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v18.4S, v16.4S +mul v18.4S, v18.4S,v1.4S +mla v18.4S, v21.4S, v31.s[0] +sub v21.4s, v12.4s, v18.4s +add v12.4s, v12.4s, v18.4s +str q2, [x0, #384] +str q20, [x0, #400] +str q12, [x0, #416] +str q21, [x0, #432] +ldr q21, [x17, #+1024] +ldr q12, [x17, #+1040] +ldr q20, [x17, #+1056] +ldr q2, [x17, #+1072] +ldr q18, [x17, #+1088] +ldr q15, [x17, #+1104] +ldr q14, [x17, #+1120] +ldr q19, [x17, #+1136] +ldr q16, [x0, #480] +ldr q1, [x0, #496] +ldr q7, [x0, #448] +ldr q6, [x0, #464] +sqrdmulh v8.4S, v16.4S, v12.s[0] +mul v16.4S, v16.4S,v21.s[0] +mla v16.4S, v8.4S, v31.s[0] +sub v8.4s, v7.4s, v16.4s +add v7.4s, v7.4s, v16.4s +sqrdmulh v16.4S, v1.4S, v12.s[0] +mul v1.4S, v1.4S,v21.s[0] +mla v1.4S, v16.4S, v31.s[0] +sub v16.4s, v6.4s, v1.4s +add v6.4s, v6.4s, v1.4s +sqrdmulh v1.4S, v6.4S, v12.s[1] +mul v6.4S, v6.4S,v21.s[1] +mla v6.4S, v1.4S, v31.s[0] +sub v1.4s, v7.4s, v6.4s +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v16.4S, v12.s[2] +mul v16.4S, v16.4S,v21.s[2] +mla v16.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v16.4s +add v8.4s, v8.4s, v16.4s +trn1 v16.4S, v7.4S, v1.4S +trn2 v9.4S, v7.4S, v1.4S +trn1 v5.4S, v8.4S, v6.4S +trn2 v4.4S, v8.4S, v6.4S +trn2 v8.2D, v16.2D, v5.2D +trn2 v6.2D, v9.2D, v4.2D +trn1 v7.2D, v16.2D, v5.2D +trn1 v1.2D, v9.2D, v4.2D +sqrdmulh v4.4S, v8.4S, v2.4S +mul v8.4S, v8.4S,v20.4S +mla v8.4S, v4.4S, v31.s[0] +sub v4.4s, v7.4s, v8.4s +add v7.4s, v7.4s, v8.4s +sqrdmulh v8.4S, v6.4S, v2.4S +mul v6.4S, v6.4S,v20.4S +mla v6.4S, v8.4S, v31.s[0] +sub v8.4s, v1.4s, v6.4s +add v1.4s, v1.4s, v6.4s +sqrdmulh v6.4S, v1.4S, v15.4S +mul v1.4S, v1.4S,v18.4S +mla v1.4S, v6.4S, v31.s[0] +sub v6.4s, v7.4s, v1.4s +add v7.4s, v7.4s, v1.4s +sqrdmulh 
v1.4S, v8.4S, v19.4S +mul v8.4S, v8.4S,v14.4S +mla v8.4S, v1.4S, v31.s[0] +sub v1.4s, v4.4s, v8.4s +add v4.4s, v4.4s, v8.4s +str q7, [x0, #448] +str q6, [x0, #464] +str q4, [x0, #480] +str q1, [x0, #496] +ldr q1, [x17, #+1152] +ldr q4, [x17, #+1168] +ldr q6, [x17, #+1184] +ldr q7, [x17, #+1200] +ldr q8, [x17, #+1216] +ldr q9, [x17, #+1232] +ldr q5, [x17, #+1248] +ldr q16, [x17, #+1264] +ldr q19, [x0, #544] +ldr q14, [x0, #560] +ldr q15, [x0, #512] +ldr q18, [x0, #528] +sqrdmulh v2.4S, v19.4S, v4.s[0] +mul v19.4S, v19.4S,v1.s[0] +mla v19.4S, v2.4S, v31.s[0] +sub v2.4s, v15.4s, v19.4s +add v15.4s, v15.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v4.s[0] +mul v14.4S, v14.4S,v1.s[0] +mla v14.4S, v19.4S, v31.s[0] +sub v19.4s, v18.4s, v14.4s +add v18.4s, v18.4s, v14.4s +sqrdmulh v14.4S, v18.4S, v4.s[1] +mul v18.4S, v18.4S,v1.s[1] +mla v18.4S, v14.4S, v31.s[0] +sub v14.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v4.s[2] +mul v19.4S, v19.4S,v1.s[2] +mla v19.4S, v18.4S, v31.s[0] +sub v18.4s, v2.4s, v19.4s +add v2.4s, v2.4s, v19.4s +trn1 v19.4S, v15.4S, v14.4S +trn2 v20.4S, v15.4S, v14.4S +trn1 v12.4S, v2.4S, v18.4S +trn2 v21.4S, v2.4S, v18.4S +trn2 v2.2D, v19.2D, v12.2D +trn2 v18.2D, v20.2D, v21.2D +trn1 v15.2D, v19.2D, v12.2D +trn1 v14.2D, v20.2D, v21.2D +sqrdmulh v21.4S, v2.4S, v7.4S +mul v2.4S, v2.4S,v6.4S +mla v2.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v2.4s +add v15.4s, v15.4s, v2.4s +sqrdmulh v2.4S, v18.4S, v7.4S +mul v18.4S, v18.4S,v6.4S +mla v18.4S, v2.4S, v31.s[0] +sub v2.4s, v14.4s, v18.4s +add v14.4s, v14.4s, v18.4s +sqrdmulh v18.4S, v14.4S, v9.4S +mul v14.4S, v14.4S,v8.4S +mla v14.4S, v18.4S, v31.s[0] +sub v18.4s, v15.4s, v14.4s +add v15.4s, v15.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v16.4S +mul v2.4S, v2.4S,v5.4S +mla v2.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v2.4s +add v21.4s, v21.4s, v2.4s +str q15, [x0, #512] +str q18, [x0, #528] +str q21, [x0, #544] +str q14, [x0, #560] +ldr q14, [x17, #+1280] +ldr q21, [x17, #+1296] +ldr q18, [x17, 
#+1312] +ldr q15, [x17, #+1328] +ldr q2, [x17, #+1344] +ldr q20, [x17, #+1360] +ldr q12, [x17, #+1376] +ldr q19, [x17, #+1392] +ldr q16, [x0, #608] +ldr q5, [x0, #624] +ldr q9, [x0, #576] +ldr q8, [x0, #592] +sqrdmulh v7.4S, v16.4S, v21.s[0] +mul v16.4S, v16.4S,v14.s[0] +mla v16.4S, v7.4S, v31.s[0] +sub v7.4s, v9.4s, v16.4s +add v9.4s, v9.4s, v16.4s +sqrdmulh v16.4S, v5.4S, v21.s[0] +mul v5.4S, v5.4S,v14.s[0] +mla v5.4S, v16.4S, v31.s[0] +sub v16.4s, v8.4s, v5.4s +add v8.4s, v8.4s, v5.4s +sqrdmulh v5.4S, v8.4S, v21.s[1] +mul v8.4S, v8.4S,v14.s[1] +mla v8.4S, v5.4S, v31.s[0] +sub v5.4s, v9.4s, v8.4s +add v9.4s, v9.4s, v8.4s +sqrdmulh v8.4S, v16.4S, v21.s[2] +mul v16.4S, v16.4S,v14.s[2] +mla v16.4S, v8.4S, v31.s[0] +sub v8.4s, v7.4s, v16.4s +add v7.4s, v7.4s, v16.4s +trn1 v16.4S, v9.4S, v5.4S +trn2 v6.4S, v9.4S, v5.4S +trn1 v4.4S, v7.4S, v8.4S +trn2 v1.4S, v7.4S, v8.4S +trn2 v7.2D, v16.2D, v4.2D +trn2 v8.2D, v6.2D, v1.2D +trn1 v9.2D, v16.2D, v4.2D +trn1 v5.2D, v6.2D, v1.2D +sqrdmulh v1.4S, v7.4S, v15.4S +mul v7.4S, v7.4S,v18.4S +mla v7.4S, v1.4S, v31.s[0] +sub v1.4s, v9.4s, v7.4s +add v9.4s, v9.4s, v7.4s +sqrdmulh v7.4S, v8.4S, v15.4S +mul v8.4S, v8.4S,v18.4S +mla v8.4S, v7.4S, v31.s[0] +sub v7.4s, v5.4s, v8.4s +add v5.4s, v5.4s, v8.4s +sqrdmulh v8.4S, v5.4S, v20.4S +mul v5.4S, v5.4S,v2.4S +mla v5.4S, v8.4S, v31.s[0] +sub v8.4s, v9.4s, v5.4s +add v9.4s, v9.4s, v5.4s +sqrdmulh v5.4S, v7.4S, v19.4S +mul v7.4S, v7.4S,v12.4S +mla v7.4S, v5.4S, v31.s[0] +sub v5.4s, v1.4s, v7.4s +add v1.4s, v1.4s, v7.4s +str q9, [x0, #576] +str q8, [x0, #592] +str q1, [x0, #608] +str q5, [x0, #624] +ldr q5, [x17, #+1408] +ldr q1, [x17, #+1424] +ldr q8, [x17, #+1440] +ldr q9, [x17, #+1456] +ldr q7, [x17, #+1472] +ldr q6, [x17, #+1488] +ldr q4, [x17, #+1504] +ldr q16, [x17, #+1520] +ldr q19, [x0, #672] +ldr q12, [x0, #688] +ldr q20, [x0, #640] +ldr q2, [x0, #656] +sqrdmulh v15.4S, v19.4S, v1.s[0] +mul v19.4S, v19.4S,v5.s[0] +mla v19.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v19.4s +add 
v20.4s, v20.4s, v19.4s +sqrdmulh v19.4S, v12.4S, v1.s[0] +mul v12.4S, v12.4S,v5.s[0] +mla v12.4S, v19.4S, v31.s[0] +sub v19.4s, v2.4s, v12.4s +add v2.4s, v2.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v1.s[1] +mul v2.4S, v2.4S,v5.s[1] +mla v2.4S, v12.4S, v31.s[0] +sub v12.4s, v20.4s, v2.4s +add v20.4s, v20.4s, v2.4s +sqrdmulh v2.4S, v19.4S, v1.s[2] +mul v19.4S, v19.4S,v5.s[2] +mla v19.4S, v2.4S, v31.s[0] +sub v2.4s, v15.4s, v19.4s +add v15.4s, v15.4s, v19.4s +trn1 v19.4S, v20.4S, v12.4S +trn2 v18.4S, v20.4S, v12.4S +trn1 v21.4S, v15.4S, v2.4S +trn2 v14.4S, v15.4S, v2.4S +trn2 v15.2D, v19.2D, v21.2D +trn2 v2.2D, v18.2D, v14.2D +trn1 v20.2D, v19.2D, v21.2D +trn1 v12.2D, v18.2D, v14.2D +sqrdmulh v14.4S, v15.4S, v9.4S +mul v15.4S, v15.4S,v8.4S +mla v15.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v15.4s +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v9.4S +mul v2.4S, v2.4S,v8.4S +mla v2.4S, v15.4S, v31.s[0] +sub v15.4s, v12.4s, v2.4s +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v12.4S, v6.4S +mul v12.4S, v12.4S,v7.4S +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +sqrdmulh v12.4S, v15.4S, v16.4S +mul v15.4S, v15.4S,v4.4S +mla v15.4S, v12.4S, v31.s[0] +sub v12.4s, v14.4s, v15.4s +add v14.4s, v14.4s, v15.4s +str q20, [x0, #640] +str q2, [x0, #656] +str q14, [x0, #672] +str q12, [x0, #688] +ldr q12, [x17, #+1536] +ldr q14, [x17, #+1552] +ldr q2, [x17, #+1568] +ldr q20, [x17, #+1584] +ldr q15, [x17, #+1600] +ldr q18, [x17, #+1616] +ldr q21, [x17, #+1632] +ldr q19, [x17, #+1648] +ldr q16, [x0, #736] +ldr q4, [x0, #752] +ldr q6, [x0, #704] +ldr q7, [x0, #720] +sqrdmulh v9.4S, v16.4S, v14.s[0] +mul v16.4S, v16.4S,v12.s[0] +mla v16.4S, v9.4S, v31.s[0] +sub v9.4s, v6.4s, v16.4s +add v6.4s, v6.4s, v16.4s +sqrdmulh v16.4S, v4.4S, v14.s[0] +mul v4.4S, v4.4S,v12.s[0] +mla v4.4S, v16.4S, v31.s[0] +sub v16.4s, v7.4s, v4.4s +add v7.4s, v7.4s, v4.4s +sqrdmulh v4.4S, v7.4S, v14.s[1] +mul v7.4S, v7.4S,v12.s[1] +mla v7.4S, v4.4S, v31.s[0] +sub v4.4s, v6.4s, 
v7.4s +add v6.4s, v6.4s, v7.4s +sqrdmulh v7.4S, v16.4S, v14.s[2] +mul v16.4S, v16.4S,v12.s[2] +mla v16.4S, v7.4S, v31.s[0] +sub v7.4s, v9.4s, v16.4s +add v9.4s, v9.4s, v16.4s +trn1 v16.4S, v6.4S, v4.4S +trn2 v8.4S, v6.4S, v4.4S +trn1 v1.4S, v9.4S, v7.4S +trn2 v5.4S, v9.4S, v7.4S +trn2 v9.2D, v16.2D, v1.2D +trn2 v7.2D, v8.2D, v5.2D +trn1 v6.2D, v16.2D, v1.2D +trn1 v4.2D, v8.2D, v5.2D +sqrdmulh v5.4S, v9.4S, v20.4S +mul v9.4S, v9.4S,v2.4S +mla v9.4S, v5.4S, v31.s[0] +sub v5.4s, v6.4s, v9.4s +add v6.4s, v6.4s, v9.4s +sqrdmulh v9.4S, v7.4S, v20.4S +mul v7.4S, v7.4S,v2.4S +mla v7.4S, v9.4S, v31.s[0] +sub v9.4s, v4.4s, v7.4s +add v4.4s, v4.4s, v7.4s +sqrdmulh v7.4S, v4.4S, v18.4S +mul v4.4S, v4.4S,v15.4S +mla v4.4S, v7.4S, v31.s[0] +sub v7.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +sqrdmulh v4.4S, v9.4S, v19.4S +mul v9.4S, v9.4S,v21.4S +mla v9.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v9.4s +add v5.4s, v5.4s, v9.4s +str q6, [x0, #704] +str q7, [x0, #720] +str q5, [x0, #736] +str q4, [x0, #752] +ldr q4, [x17, #+1664] +ldr q5, [x17, #+1680] +ldr q7, [x17, #+1696] +ldr q6, [x17, #+1712] +ldr q9, [x17, #+1728] +ldr q8, [x17, #+1744] +ldr q1, [x17, #+1760] +ldr q16, [x17, #+1776] +ldr q19, [x0, #800] +ldr q21, [x0, #816] +ldr q18, [x0, #768] +ldr q15, [x0, #784] +sqrdmulh v20.4S, v19.4S, v5.s[0] +mul v19.4S, v19.4S,v4.s[0] +mla v19.4S, v20.4S, v31.s[0] +sub v20.4s, v18.4s, v19.4s +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v21.4S, v5.s[0] +mul v21.4S, v21.4S,v4.s[0] +mla v21.4S, v19.4S, v31.s[0] +sub v19.4s, v15.4s, v21.4s +add v15.4s, v15.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v5.s[1] +mul v15.4S, v15.4S,v4.s[1] +mla v15.4S, v21.4S, v31.s[0] +sub v21.4s, v18.4s, v15.4s +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v19.4S, v5.s[2] +mul v19.4S, v19.4S,v4.s[2] +mla v19.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +trn1 v19.4S, v18.4S, v21.4S +trn2 v2.4S, v18.4S, v21.4S +trn1 v14.4S, v20.4S, v15.4S +trn2 v12.4S, v20.4S, v15.4S +trn2 v20.2D, v19.2D, 
v14.2D +trn2 v15.2D, v2.2D, v12.2D +trn1 v18.2D, v19.2D, v14.2D +trn1 v21.2D, v2.2D, v12.2D +sqrdmulh v12.4S, v20.4S, v6.4S +mul v20.4S, v20.4S,v7.4S +mla v20.4S, v12.4S, v31.s[0] +sub v12.4s, v18.4s, v20.4s +add v18.4s, v18.4s, v20.4s +sqrdmulh v20.4S, v15.4S, v6.4S +mul v15.4S, v15.4S,v7.4S +mla v15.4S, v20.4S, v31.s[0] +sub v20.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +sqrdmulh v15.4S, v21.4S, v8.4S +mul v21.4S, v21.4S,v9.4S +mla v21.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v20.4S, v16.4S +mul v20.4S, v20.4S,v1.4S +mla v20.4S, v21.4S, v31.s[0] +sub v21.4s, v12.4s, v20.4s +add v12.4s, v12.4s, v20.4s +str q18, [x0, #768] +str q15, [x0, #784] +str q12, [x0, #800] +str q21, [x0, #816] +ldr q21, [x17, #+1792] +ldr q12, [x17, #+1808] +ldr q15, [x17, #+1824] +ldr q18, [x17, #+1840] +ldr q20, [x17, #+1856] +ldr q2, [x17, #+1872] +ldr q14, [x17, #+1888] +ldr q19, [x17, #+1904] +ldr q16, [x0, #864] +ldr q1, [x0, #880] +ldr q8, [x0, #832] +ldr q9, [x0, #848] +sqrdmulh v6.4S, v16.4S, v12.s[0] +mul v16.4S, v16.4S,v21.s[0] +mla v16.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v16.4s +add v8.4s, v8.4s, v16.4s +sqrdmulh v16.4S, v1.4S, v12.s[0] +mul v1.4S, v1.4S,v21.s[0] +mla v1.4S, v16.4S, v31.s[0] +sub v16.4s, v9.4s, v1.4s +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v9.4S, v12.s[1] +mul v9.4S, v9.4S,v21.s[1] +mla v9.4S, v1.4S, v31.s[0] +sub v1.4s, v8.4s, v9.4s +add v8.4s, v8.4s, v9.4s +sqrdmulh v9.4S, v16.4S, v12.s[2] +mul v16.4S, v16.4S,v21.s[2] +mla v16.4S, v9.4S, v31.s[0] +sub v9.4s, v6.4s, v16.4s +add v6.4s, v6.4s, v16.4s +trn1 v16.4S, v8.4S, v1.4S +trn2 v7.4S, v8.4S, v1.4S +trn1 v5.4S, v6.4S, v9.4S +trn2 v4.4S, v6.4S, v9.4S +trn2 v6.2D, v16.2D, v5.2D +trn2 v9.2D, v7.2D, v4.2D +trn1 v8.2D, v16.2D, v5.2D +trn1 v1.2D, v7.2D, v4.2D +sqrdmulh v4.4S, v6.4S, v18.4S +mul v6.4S, v6.4S,v15.4S +mla v6.4S, v4.4S, v31.s[0] +sub v4.4s, v8.4s, v6.4s +add v8.4s, v8.4s, v6.4s +sqrdmulh v6.4S, v9.4S, v18.4S +mul v9.4S, v9.4S,v15.4S +mla 
v9.4S, v6.4S, v31.s[0] +sub v6.4s, v1.4s, v9.4s +add v1.4s, v1.4s, v9.4s +sqrdmulh v9.4S, v1.4S, v2.4S +mul v1.4S, v1.4S,v20.4S +mla v1.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +sqrdmulh v1.4S, v6.4S, v19.4S +mul v6.4S, v6.4S,v14.4S +mla v6.4S, v1.4S, v31.s[0] +sub v1.4s, v4.4s, v6.4s +add v4.4s, v4.4s, v6.4s +str q8, [x0, #832] +str q9, [x0, #848] +str q4, [x0, #864] +str q1, [x0, #880] +ldr q1, [x17, #+1920] +ldr q4, [x17, #+1936] +ldr q9, [x17, #+1952] +ldr q8, [x17, #+1968] +ldr q6, [x17, #+1984] +ldr q7, [x17, #+2000] +ldr q5, [x17, #+2016] +ldr q16, [x17, #+2032] +ldr q19, [x0, #928] +ldr q14, [x0, #944] +ldr q2, [x0, #896] +ldr q20, [x0, #912] +sqrdmulh v18.4S, v19.4S, v4.s[0] +mul v19.4S, v19.4S,v1.s[0] +mla v19.4S, v18.4S, v31.s[0] +sub v18.4s, v2.4s, v19.4s +add v2.4s, v2.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v4.s[0] +mul v14.4S, v14.4S,v1.s[0] +mla v14.4S, v19.4S, v31.s[0] +sub v19.4s, v20.4s, v14.4s +add v20.4s, v20.4s, v14.4s +sqrdmulh v14.4S, v20.4S, v4.s[1] +mul v20.4S, v20.4S,v1.s[1] +mla v20.4S, v14.4S, v31.s[0] +sub v14.4s, v2.4s, v20.4s +add v2.4s, v2.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v4.s[2] +mul v19.4S, v19.4S,v1.s[2] +mla v19.4S, v20.4S, v31.s[0] +sub v20.4s, v18.4s, v19.4s +add v18.4s, v18.4s, v19.4s +trn1 v19.4S, v2.4S, v14.4S +trn2 v15.4S, v2.4S, v14.4S +trn1 v12.4S, v18.4S, v20.4S +trn2 v21.4S, v18.4S, v20.4S +trn2 v18.2D, v19.2D, v12.2D +trn2 v20.2D, v15.2D, v21.2D +trn1 v2.2D, v19.2D, v12.2D +trn1 v14.2D, v15.2D, v21.2D +sqrdmulh v21.4S, v18.4S, v8.4S +mul v18.4S, v18.4S,v9.4S +mla v18.4S, v21.4S, v31.s[0] +sub v21.4s, v2.4s, v18.4s +add v2.4s, v2.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v8.4S +mul v20.4S, v20.4S,v9.4S +mla v20.4S, v18.4S, v31.s[0] +sub v18.4s, v14.4s, v20.4s +add v14.4s, v14.4s, v20.4s +sqrdmulh v20.4S, v14.4S, v7.4S +mul v14.4S, v14.4S,v6.4S +mla v14.4S, v20.4S, v31.s[0] +sub v20.4s, v2.4s, v14.4s +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v18.4S, v16.4S +mul v18.4S, v18.4S,v5.4S +mla 
v18.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v18.4s +add v21.4s, v21.4s, v18.4s +str q2, [x0, #896] +str q20, [x0, #912] +str q21, [x0, #928] +str q14, [x0, #944] +ldr q14, [x17, #+2048] +ldr q21, [x17, #+2064] +ldr q20, [x17, #+2080] +ldr q2, [x17, #+2096] +ldr q18, [x17, #+2112] +ldr q15, [x17, #+2128] +ldr q12, [x17, #+2144] +ldr q19, [x17, #+2160] +ldr q16, [x0, #992] +ldr q5, [x0, #1008] +ldr q7, [x0, #960] +ldr q6, [x0, #976] +sqrdmulh v8.4S, v16.4S, v21.s[0] +mul v16.4S, v16.4S,v14.s[0] +mla v16.4S, v8.4S, v31.s[0] +sub v8.4s, v7.4s, v16.4s +add v7.4s, v7.4s, v16.4s +sqrdmulh v16.4S, v5.4S, v21.s[0] +mul v5.4S, v5.4S,v14.s[0] +mla v5.4S, v16.4S, v31.s[0] +sub v16.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v6.4S, v21.s[1] +mul v6.4S, v6.4S,v14.s[1] +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v7.4s, v6.4s +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v16.4S, v21.s[2] +mul v16.4S, v16.4S,v14.s[2] +mla v16.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v16.4s +add v8.4s, v8.4s, v16.4s +trn1 v16.4S, v7.4S, v5.4S +trn2 v9.4S, v7.4S, v5.4S +trn1 v4.4S, v8.4S, v6.4S +trn2 v1.4S, v8.4S, v6.4S +trn2 v8.2D, v16.2D, v4.2D +trn2 v6.2D, v9.2D, v1.2D +trn1 v7.2D, v16.2D, v4.2D +trn1 v5.2D, v9.2D, v1.2D +sqrdmulh v1.4S, v8.4S, v2.4S +mul v8.4S, v8.4S,v20.4S +mla v8.4S, v1.4S, v31.s[0] +sub v1.4s, v7.4s, v8.4s +add v7.4s, v7.4s, v8.4s +sqrdmulh v8.4S, v6.4S, v2.4S +mul v6.4S, v6.4S,v20.4S +mla v6.4S, v8.4S, v31.s[0] +sub v8.4s, v5.4s, v6.4s +add v5.4s, v5.4s, v6.4s +sqrdmulh v6.4S, v5.4S, v15.4S +mul v5.4S, v5.4S,v18.4S +mla v5.4S, v6.4S, v31.s[0] +sub v6.4s, v7.4s, v5.4s +add v7.4s, v7.4s, v5.4s +sqrdmulh v5.4S, v8.4S, v19.4S +mul v8.4S, v8.4S,v12.4S +mla v8.4S, v5.4S, v31.s[0] +sub v5.4s, v1.4s, v8.4s +add v1.4s, v1.4s, v8.4s +str q7, [x0, #960] +str q6, [x0, #976] +str q1, [x0, #992] +str q5, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// 
Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_0.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_0.s new file mode 100644 index 0000000..d3538e3 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_0.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 
1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 
6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 
+.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, 
block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 
31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 
26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 
+.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // 
Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 
28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 
608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_3_0 +.global _ntt_u32_full_neon_asm_var_4_4_3_0 +ntt_u32_full_neon_asm_var_4_4_3_0: +_ntt_u32_full_neon_asm_var_4_4_3_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +sqrdmulh v2.4S, v22.4S, v29.s[0] +ldr q1, [x0, #544] +mul v22.4S, v22.4S,v30.s[0] +ldr q0, [x0, #608] +sqrdmulh v15.4S, v21.4S, v29.s[0] +ldr q14, [x0, #672] +mul v21.4S, v21.4S,v30.s[0] +ldr q13, [x0, #736] +mla v22.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q12, [x0, #32] +sub v11.4s, v18.4s, v22.4s +mla v21.4S, v15.4S, v31.s[0] +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q15, [x0, #96] +sub v10.4s, v17.4s, v21.4s +mla v20.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v29.s[0] +ldr q2, 
[x0, #160] +mul v1.4S, v1.4S,v30.s[0] +sub v9.4s, v16.4s, v20.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v29.s[0] +ldr q22, [x0, #224] +mul v0.4S, v0.4S,v30.s[0] +sub v8.4s, v3.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v12.4s, v1.4s +mla v0.4S, v20.4S, v31.s[0] +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +sub v20.4s, v15.4s, v0.4s +mla v14.4S, v19.4S, v31.s[0] +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v19.4s, v2.4s, v14.4s +mla v13.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v1.4s, v22.4s, v13.4s +mla v16.4S, v0.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v0.4s, v2.4s, v16.4s +mla v3.4S, v14.4S, v31.s[0] +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v22.4s, v3.4s +mla v18.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v13.4s, v12.4s, v18.4s +mla v17.4S, v16.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v16.4s, v15.4s, v17.4s +mla v9.4S, v3.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v3.4s, v19.4s, v9.4s +mla v8.4S, v18.4S, v31.s[0] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v8.4s +mla v11.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v17.4s, v21.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +sub v9.4s, v20.4s, v10.4s +mla v2.4S, v8.4S, 
v31.s[0] +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v8.4s, v12.4s, v2.4s +mla v22.4S, v11.4S, v31.s[0] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v27.s[1] +mul v14.4S, v14.4S,v28.s[1] +sub v11.4s, v15.4s, v22.4s +mla v0.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v28.s[2] +sub v10.4s, v13.4s, v0.4s +mla v14.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v2.4s, v16.4s, v14.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +sub v22.4s, v21.4s, v19.4s +mla v1.4S, v0.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v0.4s, v20.4s, v1.4s +mla v3.4S, v14.4S, v31.s[0] +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +sub v14.4s, v17.4s, v3.4s +mla v18.4S, v19.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v25.s[1] +mul v11.4S, v11.4S,v26.s[1] +sub v19.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v1.4s, v12.4s, v15.4s +mla v11.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v25.s[3] +mul v2.4S, v2.4S,v26.s[3] +sub v3.4s, v8.4s, v11.4s +mla v16.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v11.4s +str q12, [x0, #32] +sqrdmulh v12.4S, v20.4S, v23.s[0] +str q1, [x0, #96] +mul v20.4S, v20.4S,v24.s[0] +ldr q1, [x0, #816] +sub v11.4s, v13.4s, v16.4s +ldr q18, [x0, #880] +mla v2.4S, v15.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +str q8, [x0, #160] +sqrdmulh v8.4S, v0.4S, v23.s[1] +str q3, [x0, #224] +mul v0.4S, v0.4S,v24.s[1] +ldr q3, [x0, #944] +sub v16.4s, v10.4s, v2.4s +ldr q15, [x0, #1008] +mla v20.4S, v12.4S, v31.s[0] +add v10.4s, v10.4s, v2.4s +str q13, [x0, #288] +sqrdmulh v13.4S, v9.4S, 
v23.s[2] +str q11, [x0, #352] +mul v9.4S, v9.4S,v24.s[2] +ldr q11, [x0, #304] +sub v2.4s, v21.4s, v20.4s +ldr q12, [x0, #368] +mla v0.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v20.4s +str q10, [x0, #416] +sqrdmulh v10.4S, v19.4S, v23.s[3] +str q16, [x0, #480] +mul v19.4S, v19.4S,v24.s[3] +ldr q16, [x0, #432] +sub v20.4s, v22.4s, v0.4s +ldr q8, [x0, #496] +mla v9.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +str q21, [x0, #544] +sqrdmulh v21.4S, v1.4S, v29.s[0] +str q2, [x0, #608] +ldr q2, [x0, #560] +mul v1.4S, v1.4S,v30.s[0] +ldr q0, [x0, #624] +sub v13.4s, v17.4s, v9.4s +mla v19.4S, v10.4S, v31.s[0] +add v17.4s, v17.4s, v9.4s +str q22, [x0, #672] +sqrdmulh v22.4S, v18.4S, v29.s[0] +str q20, [x0, #736] +ldr q20, [x0, #688] +mul v18.4S, v18.4S,v30.s[0] +ldr q9, [x0, #752] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +str q17, [x0, #800] +sqrdmulh v17.4S, v3.4S, v29.s[0] +str q13, [x0, #864] +mul v3.4S, v3.4S,v30.s[0] +ldr q13, [x0, #48] +sub v19.4s, v11.4s, v1.4s +mla v18.4S, v22.4S, v31.s[0] +add v11.4s, v11.4s, v1.4s +str q14, [x0, #928] +sqrdmulh v14.4S, v15.4S, v29.s[0] +str q10, [x0, #992] +mul v15.4S, v15.4S,v30.s[0] +ldr q10, [x0, #112] +sub v1.4s, v12.4s, v18.4s +mla v3.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v29.s[0] +ldr q17, [x0, #176] +mul v2.4S, v2.4S,v30.s[0] +sub v22.4s, v16.4s, v3.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v29.s[0] +ldr q14, [x0, #240] +mul v0.4S, v0.4S,v30.s[0] +sub v21.4s, v8.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v18.4s, v13.4s, v2.4s +mla v0.4S, v3.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +sub v3.4s, v10.4s, v0.4s +mla v20.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v15.4s, v17.4s, v20.4s 
+mla v9.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v2.4s, v14.4s, v9.4s +mla v16.4S, v0.4S, v31.s[0] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v29.s[1] +mul v11.4S, v11.4S,v30.s[1] +sub v0.4s, v17.4s, v16.4s +mla v8.4S, v20.4S, v31.s[0] +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v29.s[1] +mul v12.4S, v12.4S,v30.s[1] +sub v20.4s, v14.4s, v8.4s +mla v11.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v9.4s, v13.4s, v11.4s +mla v12.4S, v16.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v16.4s, v10.4s, v12.4s +mla v22.4S, v8.4S, v31.s[0] +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v8.4s, v15.4s, v22.4s +mla v21.4S, v11.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +sub v11.4s, v2.4s, v21.4s +mla v19.4S, v12.4S, v31.s[0] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +sub v12.4s, v18.4s, v19.4s +mla v1.4S, v22.4S, v31.s[0] +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v22.4s, v3.4s, v1.4s +mla v17.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v21.4s, v13.4s, v17.4s +mla v14.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +sub v19.4s, v10.4s, v14.4s +mla v0.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v27.s[2] +mul v15.4S, v15.4S,v28.s[2] +sub v1.4s, v9.4s, v0.4s +mla v20.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v17.4s, v16.4s, v20.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v27.s[3] +mul 
v8.4S, v8.4S,v28.s[3] +sub v14.4s, v18.4s, v15.4s +mla v2.4S, v0.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[3] +mul v11.4S, v11.4S,v28.s[3] +sub v0.4s, v3.4s, v2.4s +mla v8.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +sub v20.4s, v12.4s, v8.4s +mla v11.4S, v15.4S, v31.s[0] +add v12.4s, v12.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v15.4s, v22.4s, v11.4s +mla v10.4S, v2.4S, v31.s[0] +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v2.4s, v13.4s, v10.4s +mla v19.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v25.s[3] +mul v17.4S, v17.4S,v26.s[3] +sub v8.4s, v21.4s, v19.4s +mla v16.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +str q13, [x0, #48] +sqrdmulh v13.4S, v3.4S, v23.s[0] +str q2, [x0, #112] +mul v3.4S, v3.4S,v24.s[0] +ldr q2, [x0, #768] +sub v19.4s, v9.4s, v16.4s +ldr q11, [x0, #832] +mla v17.4S, v10.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +str q21, [x0, #176] +sqrdmulh v21.4S, v0.4S, v23.s[1] +str q8, [x0, #240] +mul v0.4S, v0.4S,v24.s[1] +ldr q8, [x0, #896] +sub v16.4s, v1.4s, v17.4s +ldr q10, [x0, #960] +mla v3.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +str q9, [x0, #304] +sqrdmulh v9.4S, v22.4S, v23.s[2] +str q19, [x0, #368] +mul v22.4S, v22.4S,v24.s[2] +ldr q19, [x0, #256] +sub v17.4s, v18.4s, v3.4s +ldr q13, [x0, #320] +mla v0.4S, v21.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +str q1, [x0, #432] +sqrdmulh v1.4S, v15.4S, v23.s[3] +str q16, [x0, #496] +mul v15.4S, v15.4S,v24.s[3] +ldr q16, [x0, #384] +sub v3.4s, v14.4s, v0.4s +ldr q21, [x0, #448] +mla v22.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +str q18, [x0, #560] +sqrdmulh v18.4S, v2.4S, v29.s[0] +str q17, [x0, #624] +ldr q17, [x0, #512] +mul v2.4S, v2.4S,v30.s[0] +ldr q0, [x0, #576] +sub v9.4s, v12.4s, v22.4s +mla v15.4S, v1.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s +str q14, [x0, #688] 
+sqrdmulh v14.4S, v11.4S, v29.s[0] +str q3, [x0, #752] +ldr q3, [x0, #640] +mul v11.4S, v11.4S,v30.s[0] +ldr q22, [x0, #704] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +str q12, [x0, #816] +sqrdmulh v12.4S, v8.4S, v29.s[0] +str q9, [x0, #880] +mul v8.4S, v8.4S,v30.s[0] +ldr q9, [x0, #0] +sub v15.4s, v19.4s, v2.4s +mla v11.4S, v14.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +str q20, [x0, #944] +sqrdmulh v20.4S, v10.4S, v29.s[0] +str q1, [x0, #1008] +mul v10.4S, v10.4S,v30.s[0] +ldr q1, [x0, #64] +sub v2.4s, v13.4s, v11.4s +mla v8.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v29.s[0] +ldr q12, [x0, #128] +mul v17.4S, v17.4S,v30.s[0] +sub v14.4s, v16.4s, v8.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v29.s[0] +ldr q20, [x0, #192] +mul v0.4S, v0.4S,v30.s[0] +sub v18.4s, v21.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +sub v11.4s, v9.4s, v17.4s +mla v0.4S, v8.4S, v31.s[0] +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v8.4s, v1.4s, v0.4s +mla v3.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v10.4s, v12.4s, v3.4s +mla v22.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v17.4s, v20.4s, v22.4s +mla v16.4S, v0.4S, v31.s[0] +add v20.4s, v20.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +sub v0.4s, v12.4s, v16.4s +mla v21.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v3.4s, v20.4s, v21.4s +mla v19.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v22.4s, v9.4s, v19.4s +mla v13.4S, v16.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, 
v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v16.4s, v1.4s, v13.4s +mla v14.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v21.4s, v10.4s, v14.4s +mla v18.4S, v19.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v19.4s, v17.4s, v18.4s +mla v15.4S, v13.4S, v31.s[0] +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v13.4s, v11.4s, v15.4s +mla v2.4S, v14.4S, v31.s[0] +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v27.s[0] +mul v20.4S, v20.4S,v28.s[0] +sub v14.4s, v8.4s, v2.4s +mla v12.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v18.4s, v9.4s, v12.4s +mla v20.4S, v15.4S, v31.s[0] +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +sub v15.4s, v1.4s, v20.4s +mla v0.4S, v2.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +sub v2.4s, v22.4s, v0.4s +mla v3.4S, v12.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v12.4s, v16.4s, v3.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v11.4s, v10.4s +mla v17.4S, v0.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v27.s[3] +mul v19.4S, v19.4S,v28.s[3] +sub v0.4s, v8.4s, v17.4s +mla v21.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v17.4s +sqrdmulh v17.4S, v1.4S, v25.s[0] +mul v1.4S, v1.4S,v26.s[0] +sub v3.4s, v13.4s, v21.4s +mla v19.4S, v10.4S, v31.s[0] +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v25.s[1] +mul v15.4S, v15.4S,v26.s[1] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v17.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v17.4s, v9.4s, v1.4s +mla v15.4S, v21.4S, 
v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +sub v21.4s, v18.4s, v15.4s +mla v16.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +str q9, [x0, #0] +sqrdmulh v9.4S, v8.4S, v23.s[0] +str q17, [x0, #64] +mul v8.4S, v8.4S,v24.s[0] +ldr q17, [x0, #784] +sub v15.4s, v22.4s, v16.4s +ldr q19, [x0, #848] +mla v12.4S, v1.4S, v31.s[0] +add v22.4s, v22.4s, v16.4s +str q18, [x0, #128] +sqrdmulh v18.4S, v0.4S, v23.s[1] +str q21, [x0, #192] +mul v0.4S, v0.4S,v24.s[1] +ldr q21, [x0, #912] +sub v16.4s, v2.4s, v12.4s +ldr q1, [x0, #976] +mla v8.4S, v9.4S, v31.s[0] +add v2.4s, v2.4s, v12.4s +str q22, [x0, #256] +sqrdmulh v22.4S, v14.4S, v23.s[2] +str q15, [x0, #320] +mul v14.4S, v14.4S,v24.s[2] +ldr q15, [x0, #272] +sub v12.4s, v11.4s, v8.4s +ldr q9, [x0, #336] +mla v0.4S, v18.4S, v31.s[0] +add v11.4s, v11.4s, v8.4s +str q2, [x0, #384] +sqrdmulh v2.4S, v10.4S, v23.s[3] +str q16, [x0, #448] +mul v10.4S, v10.4S,v24.s[3] +ldr q16, [x0, #400] +sub v8.4s, v20.4s, v0.4s +ldr q18, [x0, #464] +mla v14.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v0.4s +str q11, [x0, #512] +sqrdmulh v11.4S, v17.4S, v29.s[0] +str q12, [x0, #576] +ldr q12, [x0, #528] +mul v17.4S, v17.4S,v30.s[0] +ldr q0, [x0, #592] +sub v22.4s, v13.4s, v14.4s +mla v10.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v14.4s +str q20, [x0, #640] +sqrdmulh v20.4S, v19.4S, v29.s[0] +str q8, [x0, #704] +ldr q8, [x0, #656] +mul v19.4S, v19.4S,v30.s[0] +ldr q14, [x0, #720] +sub v2.4s, v3.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v3.4s, v3.4s, v10.4s +str q13, [x0, #768] +sqrdmulh v13.4S, v21.4S, v29.s[0] +str q22, [x0, #832] +mul v21.4S, v21.4S,v30.s[0] +ldr q22, [x0, #16] +sub v10.4s, v15.4s, v17.4s +mla v19.4S, v20.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +str q3, [x0, #896] +sqrdmulh v3.4S, v1.4S, v29.s[0] +str q2, [x0, #960] +mul v1.4S, v1.4S,v30.s[0] +ldr q2, [x0, #80] +sub v17.4s, v9.4s, v19.4s +mla v21.4S, v13.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v12.4S, 
v29.s[0] +ldr q13, [x0, #144] +mul v12.4S, v12.4S,v30.s[0] +sub v20.4s, v16.4s, v21.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v29.s[0] +ldr q3, [x0, #208] +mul v0.4S, v0.4S,v30.s[0] +sub v11.4s, v18.4s, v1.4s +mla v12.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v19.4s, v22.4s, v12.4s +mla v0.4S, v21.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v2.4s, v0.4s +mla v8.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v1.4s, v13.4s, v8.4s +mla v14.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v12.4s, v3.4s, v14.4s +mla v16.4S, v0.4S, v31.s[0] +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +sub v0.4s, v13.4s, v16.4s +mla v18.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v9.4S, v29.s[1] +mul v9.4S, v9.4S,v30.s[1] +sub v8.4s, v3.4s, v18.4s +mla v15.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v14.4s, v22.4s, v15.4s +mla v9.4S, v16.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v16.4s, v2.4s, v9.4s +mla v20.4S, v18.4S, v31.s[0] +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v20.4s +mla v11.4S, v15.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v15.4s, v12.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +sub v9.4s, v19.4s, v10.4s +mla v17.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v27.s[0] +mul v3.4S, v3.4S,v28.s[0] +sub v20.4s, v21.4s, 
v17.4s +mla v13.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v11.4s, v22.4s, v13.4s +mla v3.4S, v10.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v10.4s, v2.4s, v3.4s +mla v0.4S, v17.4S, v31.s[0] +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v17.4s, v14.4s, v0.4s +mla v8.4S, v13.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +sub v13.4s, v16.4s, v8.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v3.4s, v19.4s, v1.4s +mla v12.4S, v0.4S, v31.s[0] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v0.4s, v21.4s, v12.4s +mla v18.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v25.s[1] +mul v10.4S, v10.4S,v26.s[1] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v12.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v12.4s, v22.4s, v2.4s +mla v10.4S, v18.4S, v31.s[0] +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v13.4S, v25.s[3] +mul v13.4S, v13.4S,v26.s[3] +sub v18.4s, v11.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +str q22, [x0, #16] +sqrdmulh v22.4S, v21.4S, v23.s[0] +str q12, [x0, #80] +mul v21.4S, v21.4S,v24.s[0] +sub v12.4s, v14.4s, v16.4s +mla v13.4S, v2.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +str q11, [x0, #144] +sqrdmulh v11.4S, v0.4S, v23.s[1] +str q18, [x0, #208] +mul v0.4S, v0.4S,v24.s[1] +sub v18.4s, v17.4s, v13.4s +mla v21.4S, v22.4S, v31.s[0] +add v17.4s, v17.4s, v13.4s +str q14, [x0, #272] +sqrdmulh v14.4S, v20.4S, v23.s[2] +str q12, [x0, #336] +mul v20.4S, 
v20.4S,v24.s[2] +sub v12.4s, v19.4s, v21.4s +mla v0.4S, v11.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +str q17, [x0, #400] +sqrdmulh v17.4S, v1.4S, v23.s[3] +str q18, [x0, #464] +mul v1.4S, v1.4S,v24.s[3] +sub v18.4s, v3.4s, v0.4s +mla v20.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v0.4s +str q19, [x0, #528] +str q12, [x0, #592] +sub v12.4s, v9.4s, v20.4s +mla v1.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v20.4s +str q3, [x0, #656] +str q18, [x0, #720] +sub v18.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +str q9, [x0, #784] +str q12, [x0, #848] +str q8, [x0, #912] +str q18, [x0, #976] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q6, [x17, #+160] +ldr q7, [x17, #+176] +ldr q15, [x17, #+192] +ldr q10, [x17, #+208] +ldr q2, [x17, #+224] +ldr q16, [x17, #+240] +ldr q22, [x0, #32] +ldr q13, [x0, #48] +ldr q11, [x0, #0] +ldr q21, [x0, #16] +sqrdmulh v14.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v5.s[2] +mul v22.4S, v22.4S,v4.s[2] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +trn1 v22.4S, v11.4S, v13.4S +trn2 v0.4S, v11.4S, v13.4S +trn1 v19.4S, v14.4S, v21.4S +trn2 v17.4S, v14.4S, v21.4S +trn2 v14.2D, v22.2D, v19.2D +trn2 v21.2D, v0.2D, v17.2D +trn1 v11.2D, v22.2D, v19.2D +trn1 v13.2D, v0.2D, v17.2D +sqrdmulh v17.4S, v14.4S, v7.4S +mul v14.4S, v14.4S,v6.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v10.4S +mul v13.4S, 
v13.4S,v15.4S +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v14.4S, v16.4S +mul v14.4S, v14.4S,v2.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +str q11, [x0, #0] +str q21, [x0, #16] +str q17, [x0, #32] +str q13, [x0, #48] +ldr q13, [x17, #+256] +ldr q17, [x17, #+272] +ldr q21, [x17, #+288] +ldr q11, [x17, #+304] +ldr q14, [x17, #+320] +ldr q0, [x17, #+336] +ldr q19, [x17, #+352] +ldr q22, [x17, #+368] +ldr q16, [x0, #96] +ldr q2, [x0, #112] +ldr q10, [x0, #64] +ldr q15, [x0, #80] +sqrdmulh v7.4S, v16.4S, v17.s[0] +mul v16.4S, v16.4S,v13.s[0] +mla v16.4S, v7.4S, v31.s[0] +sub v7.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sqrdmulh v16.4S, v2.4S, v17.s[0] +mul v2.4S, v2.4S,v13.s[0] +mla v2.4S, v16.4S, v31.s[0] +sub v16.4s, v15.4s, v2.4s +add v15.4s, v15.4s, v2.4s +sqrdmulh v2.4S, v15.4S, v17.s[1] +mul v15.4S, v15.4S,v13.s[1] +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v10.4s, v15.4s +add v10.4s, v10.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v17.s[2] +mul v16.4S, v16.4S,v13.s[2] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v7.4s, v16.4s +add v7.4s, v7.4s, v16.4s +trn1 v16.4S, v10.4S, v2.4S +trn2 v6.4S, v10.4S, v2.4S +trn1 v5.4S, v7.4S, v15.4S +trn2 v4.4S, v7.4S, v15.4S +trn2 v7.2D, v16.2D, v5.2D +trn2 v15.2D, v6.2D, v4.2D +trn1 v10.2D, v16.2D, v5.2D +trn1 v2.2D, v6.2D, v4.2D +sqrdmulh v4.4S, v7.4S, v11.4S +mul v7.4S, v7.4S,v21.4S +mla v7.4S, v4.4S, v31.s[0] +sub v4.4s, v10.4s, v7.4s +add v10.4s, v10.4s, v7.4s +sqrdmulh v7.4S, v15.4S, v11.4S +mul v15.4S, v15.4S,v21.4S +mla v15.4S, v7.4S, v31.s[0] +sub v7.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v0.4S +mul v2.4S, v2.4S,v14.4S +mla v2.4S, v15.4S, v31.s[0] +sub v15.4s, v10.4s, v2.4s +add v10.4s, v10.4s, v2.4s +sqrdmulh v2.4S, v7.4S, v22.4S +mul v7.4S, v7.4S,v19.4S +mla v7.4S, v2.4S, v31.s[0] +sub v2.4s, v4.4s, v7.4s +add v4.4s, v4.4s, v7.4s +str q10, [x0, #64] +str q15, [x0, #80] +str q4, [x0, #96] 
+str q2, [x0, #112] +ldr q2, [x17, #+384] +ldr q4, [x17, #+400] +ldr q15, [x17, #+416] +ldr q10, [x17, #+432] +ldr q7, [x17, #+448] +ldr q6, [x17, #+464] +ldr q5, [x17, #+480] +ldr q16, [x17, #+496] +ldr q22, [x0, #160] +ldr q19, [x0, #176] +ldr q0, [x0, #128] +ldr q14, [x0, #144] +sqrdmulh v11.4S, v22.4S, v4.s[0] +mul v22.4S, v22.4S,v2.s[0] +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v0.4s, v22.4s +add v0.4s, v0.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v4.s[0] +mul v19.4S, v19.4S,v2.s[0] +mla v19.4S, v22.4S, v31.s[0] +sub v22.4s, v14.4s, v19.4s +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v4.s[1] +mul v14.4S, v14.4S,v2.s[1] +mla v14.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v14.4s +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v22.4S, v4.s[2] +mul v22.4S, v22.4S,v2.s[2] +mla v22.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +trn1 v22.4S, v0.4S, v19.4S +trn2 v21.4S, v0.4S, v19.4S +trn1 v17.4S, v11.4S, v14.4S +trn2 v13.4S, v11.4S, v14.4S +trn2 v11.2D, v22.2D, v17.2D +trn2 v14.2D, v21.2D, v13.2D +trn1 v0.2D, v22.2D, v17.2D +trn1 v19.2D, v21.2D, v13.2D +sqrdmulh v13.4S, v11.4S, v10.4S +mul v11.4S, v11.4S,v15.4S +mla v11.4S, v13.4S, v31.s[0] +sub v13.4s, v0.4s, v11.4s +add v0.4s, v0.4s, v11.4s +sqrdmulh v11.4S, v14.4S, v10.4S +mul v14.4S, v14.4S,v15.4S +mla v14.4S, v11.4S, v31.s[0] +sub v11.4s, v19.4s, v14.4s +add v19.4s, v19.4s, v14.4s +sqrdmulh v14.4S, v19.4S, v6.4S +mul v19.4S, v19.4S,v7.4S +mla v19.4S, v14.4S, v31.s[0] +sub v14.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v11.4S, v16.4S +mul v11.4S, v11.4S,v5.4S +mla v11.4S, v19.4S, v31.s[0] +sub v19.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +str q0, [x0, #128] +str q14, [x0, #144] +str q13, [x0, #160] +str q19, [x0, #176] +ldr q19, [x17, #+512] +ldr q13, [x17, #+528] +ldr q14, [x17, #+544] +ldr q0, [x17, #+560] +ldr q11, [x17, #+576] +ldr q21, [x17, #+592] +ldr q17, [x17, #+608] +ldr q22, [x17, #+624] +ldr q16, [x0, #224] +ldr q5, [x0, #240] +ldr q6, [x0, #192] 
+ldr q7, [x0, #208] +sqrdmulh v10.4S, v16.4S, v13.s[0] +mul v16.4S, v16.4S,v19.s[0] +mla v16.4S, v10.4S, v31.s[0] +sub v10.4s, v6.4s, v16.4s +add v6.4s, v6.4s, v16.4s +sqrdmulh v16.4S, v5.4S, v13.s[0] +mul v5.4S, v5.4S,v19.s[0] +mla v5.4S, v16.4S, v31.s[0] +sub v16.4s, v7.4s, v5.4s +add v7.4s, v7.4s, v5.4s +sqrdmulh v5.4S, v7.4S, v13.s[1] +mul v7.4S, v7.4S,v19.s[1] +mla v7.4S, v5.4S, v31.s[0] +sub v5.4s, v6.4s, v7.4s +add v6.4s, v6.4s, v7.4s +sqrdmulh v7.4S, v16.4S, v13.s[2] +mul v16.4S, v16.4S,v19.s[2] +mla v16.4S, v7.4S, v31.s[0] +sub v7.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +trn1 v16.4S, v6.4S, v5.4S +trn2 v15.4S, v6.4S, v5.4S +trn1 v4.4S, v10.4S, v7.4S +trn2 v2.4S, v10.4S, v7.4S +trn2 v10.2D, v16.2D, v4.2D +trn2 v7.2D, v15.2D, v2.2D +trn1 v6.2D, v16.2D, v4.2D +trn1 v5.2D, v15.2D, v2.2D +sqrdmulh v2.4S, v10.4S, v0.4S +mul v10.4S, v10.4S,v14.4S +mla v10.4S, v2.4S, v31.s[0] +sub v2.4s, v6.4s, v10.4s +add v6.4s, v6.4s, v10.4s +sqrdmulh v10.4S, v7.4S, v0.4S +mul v7.4S, v7.4S,v14.4S +mla v7.4S, v10.4S, v31.s[0] +sub v10.4s, v5.4s, v7.4s +add v5.4s, v5.4s, v7.4s +sqrdmulh v7.4S, v5.4S, v21.4S +mul v5.4S, v5.4S,v11.4S +mla v5.4S, v7.4S, v31.s[0] +sub v7.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v10.4S, v22.4S +mul v10.4S, v10.4S,v17.4S +mla v10.4S, v5.4S, v31.s[0] +sub v5.4s, v2.4s, v10.4s +add v2.4s, v2.4s, v10.4s +str q6, [x0, #192] +str q7, [x0, #208] +str q2, [x0, #224] +str q5, [x0, #240] +ldr q5, [x17, #+640] +ldr q2, [x17, #+656] +ldr q7, [x17, #+672] +ldr q6, [x17, #+688] +ldr q10, [x17, #+704] +ldr q15, [x17, #+720] +ldr q4, [x17, #+736] +ldr q16, [x17, #+752] +ldr q22, [x0, #288] +ldr q17, [x0, #304] +ldr q21, [x0, #256] +ldr q11, [x0, #272] +sqrdmulh v0.4S, v22.4S, v2.s[0] +mul v22.4S, v22.4S,v5.s[0] +mla v22.4S, v0.4S, v31.s[0] +sub v0.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v2.s[0] +mul v17.4S, v17.4S,v5.s[0] +mla v17.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s 
+sqrdmulh v17.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v5.s[1] +mla v11.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v2.s[2] +mul v22.4S, v22.4S,v5.s[2] +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v0.4s, v22.4s +add v0.4s, v0.4s, v22.4s +trn1 v22.4S, v21.4S, v17.4S +trn2 v14.4S, v21.4S, v17.4S +trn1 v13.4S, v0.4S, v11.4S +trn2 v19.4S, v0.4S, v11.4S +trn2 v0.2D, v22.2D, v13.2D +trn2 v11.2D, v14.2D, v19.2D +trn1 v21.2D, v22.2D, v13.2D +trn1 v17.2D, v14.2D, v19.2D +sqrdmulh v19.4S, v0.4S, v6.4S +mul v0.4S, v0.4S,v7.4S +mla v0.4S, v19.4S, v31.s[0] +sub v19.4s, v21.4s, v0.4s +add v21.4s, v21.4s, v0.4s +sqrdmulh v0.4S, v11.4S, v6.4S +mul v11.4S, v11.4S,v7.4S +mla v11.4S, v0.4S, v31.s[0] +sub v0.4s, v17.4s, v11.4s +add v17.4s, v17.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v15.4S +mul v17.4S, v17.4S,v10.4S +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v16.4S +mul v0.4S, v0.4S,v4.4S +mla v0.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v0.4s +add v19.4s, v19.4s, v0.4s +str q21, [x0, #256] +str q11, [x0, #272] +str q19, [x0, #288] +str q17, [x0, #304] +ldr q17, [x17, #+768] +ldr q19, [x17, #+784] +ldr q11, [x17, #+800] +ldr q21, [x17, #+816] +ldr q0, [x17, #+832] +ldr q14, [x17, #+848] +ldr q13, [x17, #+864] +ldr q22, [x17, #+880] +ldr q16, [x0, #352] +ldr q4, [x0, #368] +ldr q15, [x0, #320] +ldr q10, [x0, #336] +sqrdmulh v6.4S, v16.4S, v19.s[0] +mul v16.4S, v16.4S,v17.s[0] +mla v16.4S, v6.4S, v31.s[0] +sub v6.4s, v15.4s, v16.4s +add v15.4s, v15.4s, v16.4s +sqrdmulh v16.4S, v4.4S, v19.s[0] +mul v4.4S, v4.4S,v17.s[0] +mla v4.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v4.4s +add v10.4s, v10.4s, v4.4s +sqrdmulh v4.4S, v10.4S, v19.s[1] +mul v10.4S, v10.4S,v17.s[1] +mla v10.4S, v4.4S, v31.s[0] +sub v4.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +sqrdmulh v10.4S, v16.4S, v19.s[2] +mul v16.4S, v16.4S,v17.s[2] +mla v16.4S, v10.4S, v31.s[0] +sub v10.4s, v6.4s, v16.4s +add 
v6.4s, v6.4s, v16.4s +trn1 v16.4S, v15.4S, v4.4S +trn2 v7.4S, v15.4S, v4.4S +trn1 v2.4S, v6.4S, v10.4S +trn2 v5.4S, v6.4S, v10.4S +trn2 v6.2D, v16.2D, v2.2D +trn2 v10.2D, v7.2D, v5.2D +trn1 v15.2D, v16.2D, v2.2D +trn1 v4.2D, v7.2D, v5.2D +sqrdmulh v5.4S, v6.4S, v21.4S +mul v6.4S, v6.4S,v11.4S +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v15.4s, v6.4s +add v15.4s, v15.4s, v6.4s +sqrdmulh v6.4S, v10.4S, v21.4S +mul v10.4S, v10.4S,v11.4S +mla v10.4S, v6.4S, v31.s[0] +sub v6.4s, v4.4s, v10.4s +add v4.4s, v4.4s, v10.4s +sqrdmulh v10.4S, v4.4S, v14.4S +mul v4.4S, v4.4S,v0.4S +mla v4.4S, v10.4S, v31.s[0] +sub v10.4s, v15.4s, v4.4s +add v15.4s, v15.4s, v4.4s +sqrdmulh v4.4S, v6.4S, v22.4S +mul v6.4S, v6.4S,v13.4S +mla v6.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v6.4s +add v5.4s, v5.4s, v6.4s +str q15, [x0, #320] +str q10, [x0, #336] +str q5, [x0, #352] +str q4, [x0, #368] +ldr q4, [x17, #+896] +ldr q5, [x17, #+912] +ldr q10, [x17, #+928] +ldr q15, [x17, #+944] +ldr q6, [x17, #+960] +ldr q7, [x17, #+976] +ldr q2, [x17, #+992] +ldr q16, [x17, #+1008] +ldr q22, [x0, #416] +ldr q13, [x0, #432] +ldr q14, [x0, #384] +ldr q0, [x0, #400] +sqrdmulh v21.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v0.4s, v13.4s +add v0.4s, v0.4s, v13.4s +sqrdmulh v13.4S, v0.4S, v5.s[1] +mul v0.4S, v0.4S,v4.s[1] +mla v0.4S, v13.4S, v31.s[0] +sub v13.4s, v14.4s, v0.4s +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v22.4S, v5.s[2] +mul v22.4S, v22.4S,v4.s[2] +mla v22.4S, v0.4S, v31.s[0] +sub v0.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +trn1 v22.4S, v14.4S, v13.4S +trn2 v11.4S, v14.4S, v13.4S +trn1 v19.4S, v21.4S, v0.4S +trn2 v17.4S, v21.4S, v0.4S +trn2 v21.2D, v22.2D, v19.2D +trn2 v0.2D, v11.2D, v17.2D +trn1 v14.2D, v22.2D, v19.2D +trn1 v13.2D, v11.2D, v17.2D +sqrdmulh v17.4S, v21.4S, v15.4S +mul v21.4S, 
v21.4S,v10.4S +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v14.4s, v21.4s +add v14.4s, v14.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v15.4S +mul v0.4S, v0.4S,v10.4S +mla v0.4S, v21.4S, v31.s[0] +sub v21.4s, v13.4s, v0.4s +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v13.4S, v7.4S +mul v13.4S, v13.4S,v6.4S +mla v13.4S, v0.4S, v31.s[0] +sub v0.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +sqrdmulh v13.4S, v21.4S, v16.4S +mul v21.4S, v21.4S,v2.4S +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +str q14, [x0, #384] +str q0, [x0, #400] +str q17, [x0, #416] +str q13, [x0, #432] +ldr q13, [x17, #+1024] +ldr q17, [x17, #+1040] +ldr q0, [x17, #+1056] +ldr q14, [x17, #+1072] +ldr q21, [x17, #+1088] +ldr q11, [x17, #+1104] +ldr q19, [x17, #+1120] +ldr q22, [x17, #+1136] +ldr q16, [x0, #480] +ldr q2, [x0, #496] +ldr q7, [x0, #448] +ldr q6, [x0, #464] +sqrdmulh v15.4S, v16.4S, v17.s[0] +mul v16.4S, v16.4S,v13.s[0] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v7.4s, v16.4s +add v7.4s, v7.4s, v16.4s +sqrdmulh v16.4S, v2.4S, v17.s[0] +mul v2.4S, v2.4S,v13.s[0] +mla v2.4S, v16.4S, v31.s[0] +sub v16.4s, v6.4s, v2.4s +add v6.4s, v6.4s, v2.4s +sqrdmulh v2.4S, v6.4S, v17.s[1] +mul v6.4S, v6.4S,v13.s[1] +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v7.4s, v6.4s +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v16.4S, v17.s[2] +mul v16.4S, v16.4S,v13.s[2] +mla v16.4S, v6.4S, v31.s[0] +sub v6.4s, v15.4s, v16.4s +add v15.4s, v15.4s, v16.4s +trn1 v16.4S, v7.4S, v2.4S +trn2 v10.4S, v7.4S, v2.4S +trn1 v5.4S, v15.4S, v6.4S +trn2 v4.4S, v15.4S, v6.4S +trn2 v15.2D, v16.2D, v5.2D +trn2 v6.2D, v10.2D, v4.2D +trn1 v7.2D, v16.2D, v5.2D +trn1 v2.2D, v10.2D, v4.2D +sqrdmulh v4.4S, v15.4S, v14.4S +mul v15.4S, v15.4S,v0.4S +mla v15.4S, v4.4S, v31.s[0] +sub v4.4s, v7.4s, v15.4s +add v7.4s, v7.4s, v15.4s +sqrdmulh v15.4S, v6.4S, v14.4S +mul v6.4S, v6.4S,v0.4S +mla v6.4S, v15.4S, v31.s[0] +sub v15.4s, v2.4s, v6.4s +add v2.4s, v2.4s, v6.4s +sqrdmulh v6.4S, v2.4S, v11.4S +mul v2.4S, 
v2.4S,v21.4S +mla v2.4S, v6.4S, v31.s[0] +sub v6.4s, v7.4s, v2.4s +add v7.4s, v7.4s, v2.4s +sqrdmulh v2.4S, v15.4S, v22.4S +mul v15.4S, v15.4S,v19.4S +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v4.4s, v15.4s +add v4.4s, v4.4s, v15.4s +str q7, [x0, #448] +str q6, [x0, #464] +str q4, [x0, #480] +str q2, [x0, #496] +ldr q2, [x17, #+1152] +ldr q4, [x17, #+1168] +ldr q6, [x17, #+1184] +ldr q7, [x17, #+1200] +ldr q15, [x17, #+1216] +ldr q10, [x17, #+1232] +ldr q5, [x17, #+1248] +ldr q16, [x17, #+1264] +ldr q22, [x0, #544] +ldr q19, [x0, #560] +ldr q11, [x0, #512] +ldr q21, [x0, #528] +sqrdmulh v14.4S, v22.4S, v4.s[0] +mul v22.4S, v22.4S,v2.s[0] +mla v22.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v4.s[0] +mul v19.4S, v19.4S,v2.s[0] +mla v19.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v19.4s +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v21.4S, v4.s[1] +mul v21.4S, v21.4S,v2.s[1] +mla v21.4S, v19.4S, v31.s[0] +sub v19.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v4.s[2] +mul v22.4S, v22.4S,v2.s[2] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +trn1 v22.4S, v11.4S, v19.4S +trn2 v0.4S, v11.4S, v19.4S +trn1 v17.4S, v14.4S, v21.4S +trn2 v13.4S, v14.4S, v21.4S +trn2 v14.2D, v22.2D, v17.2D +trn2 v21.2D, v0.2D, v13.2D +trn1 v11.2D, v22.2D, v17.2D +trn1 v19.2D, v0.2D, v13.2D +sqrdmulh v13.4S, v14.4S, v7.4S +mul v14.4S, v14.4S,v6.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v10.4S +mul v19.4S, v19.4S,v15.4S +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v19.4s +add v11.4s, v11.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v16.4S +mul v14.4S, v14.4S,v5.4S +mla v14.4S, v19.4S, v31.s[0] +sub v19.4s, v13.4s, v14.4s +add v13.4s, v13.4s, v14.4s +str q11, [x0, 
#512] +str q21, [x0, #528] +str q13, [x0, #544] +str q19, [x0, #560] +ldr q19, [x17, #+1280] +ldr q13, [x17, #+1296] +ldr q21, [x17, #+1312] +ldr q11, [x17, #+1328] +ldr q14, [x17, #+1344] +ldr q0, [x17, #+1360] +ldr q17, [x17, #+1376] +ldr q22, [x17, #+1392] +ldr q16, [x0, #608] +ldr q5, [x0, #624] +ldr q10, [x0, #576] +ldr q15, [x0, #592] +sqrdmulh v7.4S, v16.4S, v13.s[0] +mul v16.4S, v16.4S,v19.s[0] +mla v16.4S, v7.4S, v31.s[0] +sub v7.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sqrdmulh v16.4S, v5.4S, v13.s[0] +mul v5.4S, v5.4S,v19.s[0] +mla v5.4S, v16.4S, v31.s[0] +sub v16.4s, v15.4s, v5.4s +add v15.4s, v15.4s, v5.4s +sqrdmulh v5.4S, v15.4S, v13.s[1] +mul v15.4S, v15.4S,v19.s[1] +mla v15.4S, v5.4S, v31.s[0] +sub v5.4s, v10.4s, v15.4s +add v10.4s, v10.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v13.s[2] +mul v16.4S, v16.4S,v19.s[2] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v7.4s, v16.4s +add v7.4s, v7.4s, v16.4s +trn1 v16.4S, v10.4S, v5.4S +trn2 v6.4S, v10.4S, v5.4S +trn1 v4.4S, v7.4S, v15.4S +trn2 v2.4S, v7.4S, v15.4S +trn2 v7.2D, v16.2D, v4.2D +trn2 v15.2D, v6.2D, v2.2D +trn1 v10.2D, v16.2D, v4.2D +trn1 v5.2D, v6.2D, v2.2D +sqrdmulh v2.4S, v7.4S, v11.4S +mul v7.4S, v7.4S,v21.4S +mla v7.4S, v2.4S, v31.s[0] +sub v2.4s, v10.4s, v7.4s +add v10.4s, v10.4s, v7.4s +sqrdmulh v7.4S, v15.4S, v11.4S +mul v15.4S, v15.4S,v21.4S +mla v15.4S, v7.4S, v31.s[0] +sub v7.4s, v5.4s, v15.4s +add v5.4s, v5.4s, v15.4s +sqrdmulh v15.4S, v5.4S, v0.4S +mul v5.4S, v5.4S,v14.4S +mla v5.4S, v15.4S, v31.s[0] +sub v15.4s, v10.4s, v5.4s +add v10.4s, v10.4s, v5.4s +sqrdmulh v5.4S, v7.4S, v22.4S +mul v7.4S, v7.4S,v17.4S +mla v7.4S, v5.4S, v31.s[0] +sub v5.4s, v2.4s, v7.4s +add v2.4s, v2.4s, v7.4s +str q10, [x0, #576] +str q15, [x0, #592] +str q2, [x0, #608] +str q5, [x0, #624] +ldr q5, [x17, #+1408] +ldr q2, [x17, #+1424] +ldr q15, [x17, #+1440] +ldr q10, [x17, #+1456] +ldr q7, [x17, #+1472] +ldr q6, [x17, #+1488] +ldr q4, [x17, #+1504] +ldr q16, [x17, #+1520] +ldr q22, [x0, #672] +ldr q17, [x0, 
#688] +ldr q0, [x0, #640] +ldr q14, [x0, #656] +sqrdmulh v11.4S, v22.4S, v2.s[0] +mul v22.4S, v22.4S,v5.s[0] +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v0.4s, v22.4s +add v0.4s, v0.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v2.s[0] +mul v17.4S, v17.4S,v5.s[0] +mla v17.4S, v22.4S, v31.s[0] +sub v22.4s, v14.4s, v17.4s +add v14.4s, v14.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v2.s[1] +mul v14.4S, v14.4S,v5.s[1] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v14.4s +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v22.4S, v2.s[2] +mul v22.4S, v22.4S,v5.s[2] +mla v22.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +trn1 v22.4S, v0.4S, v17.4S +trn2 v21.4S, v0.4S, v17.4S +trn1 v13.4S, v11.4S, v14.4S +trn2 v19.4S, v11.4S, v14.4S +trn2 v11.2D, v22.2D, v13.2D +trn2 v14.2D, v21.2D, v19.2D +trn1 v0.2D, v22.2D, v13.2D +trn1 v17.2D, v21.2D, v19.2D +sqrdmulh v19.4S, v11.4S, v10.4S +mul v11.4S, v11.4S,v15.4S +mla v11.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v11.4s +add v0.4s, v0.4s, v11.4s +sqrdmulh v11.4S, v14.4S, v10.4S +mul v14.4S, v14.4S,v15.4S +mla v14.4S, v11.4S, v31.s[0] +sub v11.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +sqrdmulh v14.4S, v17.4S, v6.4S +mul v17.4S, v17.4S,v7.4S +mla v17.4S, v14.4S, v31.s[0] +sub v14.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v16.4S +mul v11.4S, v11.4S,v4.4S +mla v11.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v11.4s +add v19.4s, v19.4s, v11.4s +str q0, [x0, #640] +str q14, [x0, #656] +str q19, [x0, #672] +str q17, [x0, #688] +ldr q17, [x17, #+1536] +ldr q19, [x17, #+1552] +ldr q14, [x17, #+1568] +ldr q0, [x17, #+1584] +ldr q11, [x17, #+1600] +ldr q21, [x17, #+1616] +ldr q13, [x17, #+1632] +ldr q22, [x17, #+1648] +ldr q16, [x0, #736] +ldr q4, [x0, #752] +ldr q6, [x0, #704] +ldr q7, [x0, #720] +sqrdmulh v10.4S, v16.4S, v19.s[0] +mul v16.4S, v16.4S,v17.s[0] +mla v16.4S, v10.4S, v31.s[0] +sub v10.4s, v6.4s, v16.4s +add v6.4s, v6.4s, v16.4s +sqrdmulh v16.4S, v4.4S, v19.s[0] +mul v4.4S, v4.4S,v17.s[0] 
+mla v4.4S, v16.4S, v31.s[0] +sub v16.4s, v7.4s, v4.4s +add v7.4s, v7.4s, v4.4s +sqrdmulh v4.4S, v7.4S, v19.s[1] +mul v7.4S, v7.4S,v17.s[1] +mla v7.4S, v4.4S, v31.s[0] +sub v4.4s, v6.4s, v7.4s +add v6.4s, v6.4s, v7.4s +sqrdmulh v7.4S, v16.4S, v19.s[2] +mul v16.4S, v16.4S,v17.s[2] +mla v16.4S, v7.4S, v31.s[0] +sub v7.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +trn1 v16.4S, v6.4S, v4.4S +trn2 v15.4S, v6.4S, v4.4S +trn1 v2.4S, v10.4S, v7.4S +trn2 v5.4S, v10.4S, v7.4S +trn2 v10.2D, v16.2D, v2.2D +trn2 v7.2D, v15.2D, v5.2D +trn1 v6.2D, v16.2D, v2.2D +trn1 v4.2D, v15.2D, v5.2D +sqrdmulh v5.4S, v10.4S, v0.4S +mul v10.4S, v10.4S,v14.4S +mla v10.4S, v5.4S, v31.s[0] +sub v5.4s, v6.4s, v10.4s +add v6.4s, v6.4s, v10.4s +sqrdmulh v10.4S, v7.4S, v0.4S +mul v7.4S, v7.4S,v14.4S +mla v7.4S, v10.4S, v31.s[0] +sub v10.4s, v4.4s, v7.4s +add v4.4s, v4.4s, v7.4s +sqrdmulh v7.4S, v4.4S, v21.4S +mul v4.4S, v4.4S,v11.4S +mla v4.4S, v7.4S, v31.s[0] +sub v7.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +sqrdmulh v4.4S, v10.4S, v22.4S +mul v10.4S, v10.4S,v13.4S +mla v10.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v10.4s +add v5.4s, v5.4s, v10.4s +str q6, [x0, #704] +str q7, [x0, #720] +str q5, [x0, #736] +str q4, [x0, #752] +ldr q4, [x17, #+1664] +ldr q5, [x17, #+1680] +ldr q7, [x17, #+1696] +ldr q6, [x17, #+1712] +ldr q10, [x17, #+1728] +ldr q15, [x17, #+1744] +ldr q2, [x17, #+1760] +ldr q16, [x17, #+1776] +ldr q22, [x0, #800] +ldr q13, [x0, #816] +ldr q21, [x0, #768] +ldr q11, [x0, #784] +sqrdmulh v0.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v0.4S, v31.s[0] +sub v0.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v11.4S, v5.s[1] +mul v11.4S, v11.4S,v4.s[1] +mla v11.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v5.s[2] +mul v22.4S, v22.4S,v4.s[2] +mla v22.4S, 
v11.4S, v31.s[0] +sub v11.4s, v0.4s, v22.4s +add v0.4s, v0.4s, v22.4s +trn1 v22.4S, v21.4S, v13.4S +trn2 v14.4S, v21.4S, v13.4S +trn1 v19.4S, v0.4S, v11.4S +trn2 v17.4S, v0.4S, v11.4S +trn2 v0.2D, v22.2D, v19.2D +trn2 v11.2D, v14.2D, v17.2D +trn1 v21.2D, v22.2D, v19.2D +trn1 v13.2D, v14.2D, v17.2D +sqrdmulh v17.4S, v0.4S, v6.4S +mul v0.4S, v0.4S,v7.4S +mla v0.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v0.4s +add v21.4s, v21.4s, v0.4s +sqrdmulh v0.4S, v11.4S, v6.4S +mul v11.4S, v11.4S,v7.4S +mla v11.4S, v0.4S, v31.s[0] +sub v0.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v15.4S +mul v13.4S, v13.4S,v10.4S +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v0.4S, v16.4S +mul v0.4S, v0.4S,v2.4S +mla v0.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v0.4s +add v17.4s, v17.4s, v0.4s +str q21, [x0, #768] +str q11, [x0, #784] +str q17, [x0, #800] +str q13, [x0, #816] +ldr q13, [x17, #+1792] +ldr q17, [x17, #+1808] +ldr q11, [x17, #+1824] +ldr q21, [x17, #+1840] +ldr q0, [x17, #+1856] +ldr q14, [x17, #+1872] +ldr q19, [x17, #+1888] +ldr q22, [x17, #+1904] +ldr q16, [x0, #864] +ldr q2, [x0, #880] +ldr q15, [x0, #832] +ldr q10, [x0, #848] +sqrdmulh v6.4S, v16.4S, v17.s[0] +mul v16.4S, v16.4S,v13.s[0] +mla v16.4S, v6.4S, v31.s[0] +sub v6.4s, v15.4s, v16.4s +add v15.4s, v15.4s, v16.4s +sqrdmulh v16.4S, v2.4S, v17.s[0] +mul v2.4S, v2.4S,v13.s[0] +mla v2.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v2.4s +add v10.4s, v10.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v17.s[1] +mul v10.4S, v10.4S,v13.s[1] +mla v10.4S, v2.4S, v31.s[0] +sub v2.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +sqrdmulh v10.4S, v16.4S, v17.s[2] +mul v16.4S, v16.4S,v13.s[2] +mla v16.4S, v10.4S, v31.s[0] +sub v10.4s, v6.4s, v16.4s +add v6.4s, v6.4s, v16.4s +trn1 v16.4S, v15.4S, v2.4S +trn2 v7.4S, v15.4S, v2.4S +trn1 v5.4S, v6.4S, v10.4S +trn2 v4.4S, v6.4S, v10.4S +trn2 v6.2D, v16.2D, v5.2D +trn2 v10.2D, v7.2D, v4.2D +trn1 v15.2D, v16.2D, v5.2D 
+trn1 v2.2D, v7.2D, v4.2D +sqrdmulh v4.4S, v6.4S, v21.4S +mul v6.4S, v6.4S,v11.4S +mla v6.4S, v4.4S, v31.s[0] +sub v4.4s, v15.4s, v6.4s +add v15.4s, v15.4s, v6.4s +sqrdmulh v6.4S, v10.4S, v21.4S +mul v10.4S, v10.4S,v11.4S +mla v10.4S, v6.4S, v31.s[0] +sub v6.4s, v2.4s, v10.4s +add v2.4s, v2.4s, v10.4s +sqrdmulh v10.4S, v2.4S, v14.4S +mul v2.4S, v2.4S,v0.4S +mla v2.4S, v10.4S, v31.s[0] +sub v10.4s, v15.4s, v2.4s +add v15.4s, v15.4s, v2.4s +sqrdmulh v2.4S, v6.4S, v22.4S +mul v6.4S, v6.4S,v19.4S +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v4.4s, v6.4s +add v4.4s, v4.4s, v6.4s +str q15, [x0, #832] +str q10, [x0, #848] +str q4, [x0, #864] +str q2, [x0, #880] +ldr q2, [x17, #+1920] +ldr q4, [x17, #+1936] +ldr q10, [x17, #+1952] +ldr q15, [x17, #+1968] +ldr q6, [x17, #+1984] +ldr q7, [x17, #+2000] +ldr q5, [x17, #+2016] +ldr q16, [x17, #+2032] +ldr q22, [x0, #928] +ldr q19, [x0, #944] +ldr q14, [x0, #896] +ldr q0, [x0, #912] +sqrdmulh v21.4S, v22.4S, v4.s[0] +mul v22.4S, v22.4S,v2.s[0] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v4.s[0] +mul v19.4S, v19.4S,v2.s[0] +mla v19.4S, v22.4S, v31.s[0] +sub v22.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v0.4S, v4.s[1] +mul v0.4S, v0.4S,v2.s[1] +mla v0.4S, v19.4S, v31.s[0] +sub v19.4s, v14.4s, v0.4s +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v22.4S, v4.s[2] +mul v22.4S, v22.4S,v2.s[2] +mla v22.4S, v0.4S, v31.s[0] +sub v0.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +trn1 v22.4S, v14.4S, v19.4S +trn2 v11.4S, v14.4S, v19.4S +trn1 v17.4S, v21.4S, v0.4S +trn2 v13.4S, v21.4S, v0.4S +trn2 v21.2D, v22.2D, v17.2D +trn2 v0.2D, v11.2D, v13.2D +trn1 v14.2D, v22.2D, v17.2D +trn1 v19.2D, v11.2D, v13.2D +sqrdmulh v13.4S, v21.4S, v15.4S +mul v21.4S, v21.4S,v10.4S +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v14.4s, v21.4s +add v14.4s, v14.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v15.4S +mul v0.4S, v0.4S,v10.4S +mla v0.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v0.4s 
+add v19.4s, v19.4s, v0.4s +sqrdmulh v0.4S, v19.4S, v7.4S +mul v19.4S, v19.4S,v6.4S +mla v19.4S, v0.4S, v31.s[0] +sub v0.4s, v14.4s, v19.4s +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v21.4S, v16.4S +mul v21.4S, v21.4S,v5.4S +mla v21.4S, v19.4S, v31.s[0] +sub v19.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +str q14, [x0, #896] +str q0, [x0, #912] +str q13, [x0, #928] +str q19, [x0, #944] +ldr q19, [x17, #+2048] +ldr q13, [x17, #+2064] +ldr q0, [x17, #+2080] +ldr q14, [x17, #+2096] +ldr q21, [x17, #+2112] +ldr q11, [x17, #+2128] +ldr q17, [x17, #+2144] +ldr q22, [x17, #+2160] +ldr q16, [x0, #992] +ldr q5, [x0, #1008] +ldr q7, [x0, #960] +ldr q6, [x0, #976] +sqrdmulh v15.4S, v16.4S, v13.s[0] +mul v16.4S, v16.4S,v19.s[0] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v7.4s, v16.4s +add v7.4s, v7.4s, v16.4s +sqrdmulh v16.4S, v5.4S, v13.s[0] +mul v5.4S, v5.4S,v19.s[0] +mla v5.4S, v16.4S, v31.s[0] +sub v16.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v6.4S, v13.s[1] +mul v6.4S, v6.4S,v19.s[1] +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v7.4s, v6.4s +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v16.4S, v13.s[2] +mul v16.4S, v16.4S,v19.s[2] +mla v16.4S, v6.4S, v31.s[0] +sub v6.4s, v15.4s, v16.4s +add v15.4s, v15.4s, v16.4s +trn1 v16.4S, v7.4S, v5.4S +trn2 v10.4S, v7.4S, v5.4S +trn1 v4.4S, v15.4S, v6.4S +trn2 v2.4S, v15.4S, v6.4S +trn2 v15.2D, v16.2D, v4.2D +trn2 v6.2D, v10.2D, v2.2D +trn1 v7.2D, v16.2D, v4.2D +trn1 v5.2D, v10.2D, v2.2D +sqrdmulh v2.4S, v15.4S, v14.4S +mul v15.4S, v15.4S,v0.4S +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v7.4s, v15.4s +add v7.4s, v7.4s, v15.4s +sqrdmulh v15.4S, v6.4S, v14.4S +mul v6.4S, v6.4S,v0.4S +mla v6.4S, v15.4S, v31.s[0] +sub v15.4s, v5.4s, v6.4s +add v5.4s, v5.4s, v6.4s +sqrdmulh v6.4S, v5.4S, v11.4S +mul v5.4S, v5.4S,v21.4S +mla v5.4S, v6.4S, v31.s[0] +sub v6.4s, v7.4s, v5.4s +add v7.4s, v7.4s, v5.4s +sqrdmulh v5.4S, v15.4S, v22.4S +mul v15.4S, v15.4S,v17.4S +mla v15.4S, v5.4S, v31.s[0] +sub v5.4s, v2.4s, v15.4s +add v2.4s, 
v2.4s, v15.4s +str q7, [x0, #960] +str q6, [x0, #976] +str q2, [x0, #992] +str q5, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_0.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_0.s new file mode 100644 index 0000000..dadc45d --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_0.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 
11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // 
Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 
1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 
23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 
19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 
// Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 
1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 +.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // 
Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 
7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // 
Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_3_z2_0 +.global _ntt_u32_full_neon_asm_var_4_4_3_z2_0 +ntt_u32_full_neon_asm_var_4_4_3_z2_0: +_ntt_u32_full_neon_asm_var_4_4_3_z2_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +sqrdmulh v2.4S, v22.4S, v29.s[0] +ldr q1, [x0, #544] +mul v22.4S, v22.4S,v30.s[0] +ldr q0, [x0, #608] +sqrdmulh v15.4S, v21.4S, v29.s[0] +ldr q14, [x0, #672] +mul v21.4S, v21.4S,v30.s[0] +ldr q13, [x0, #736] +mla v22.4S, v2.4S, v31.s[0] 
+sqrdmulh v2.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q12, [x0, #32] +sub v11.4s, v18.4s, v22.4s +mla v21.4S, v15.4S, v31.s[0] +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q15, [x0, #96] +sub v10.4s, v17.4s, v21.4s +mla v20.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v29.s[0] +ldr q2, [x0, #160] +mul v1.4S, v1.4S,v30.s[0] +sub v9.4s, v16.4s, v20.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v29.s[0] +ldr q22, [x0, #224] +mul v0.4S, v0.4S,v30.s[0] +sub v8.4s, v3.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v12.4s, v1.4s +mla v0.4S, v20.4S, v31.s[0] +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +sub v20.4s, v15.4s, v0.4s +mla v14.4S, v19.4S, v31.s[0] +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v19.4s, v2.4s, v14.4s +mla v13.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v1.4s, v22.4s, v13.4s +mla v16.4S, v0.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v0.4s, v2.4s, v16.4s +mla v3.4S, v14.4S, v31.s[0] +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v22.4s, v3.4s +mla v18.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v13.4s, v12.4s, v18.4s +mla v17.4S, v16.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v16.4s, v15.4s, v17.4s +mla v9.4S, v3.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v3.4s, v19.4s, v9.4s +mla v8.4S, v18.4S, v31.s[0] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, 
v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v8.4s +mla v11.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v17.4s, v21.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +sub v9.4s, v20.4s, v10.4s +mla v2.4S, v8.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v8.4s, v12.4s, v2.4s +mla v22.4S, v11.4S, v31.s[0] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v27.s[1] +mul v14.4S, v14.4S,v28.s[1] +sub v11.4s, v15.4s, v22.4s +mla v0.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v28.s[2] +sub v10.4s, v13.4s, v0.4s +mla v14.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v2.4s, v16.4s, v14.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +sub v22.4s, v21.4s, v19.4s +mla v1.4S, v0.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v0.4s, v20.4s, v1.4s +mla v3.4S, v14.4S, v31.s[0] +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +sub v14.4s, v17.4s, v3.4s +mla v18.4S, v19.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v25.s[1] +mul v11.4S, v11.4S,v26.s[1] +sub v19.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v1.4s, v12.4s, v15.4s +mla v11.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v25.s[3] +mul v2.4S, v2.4S,v26.s[3] +sub v3.4s, v8.4s, v11.4s +mla v16.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v11.4s +str q12, [x0, #32] +sqrdmulh v12.4S, v20.4S, v23.s[0] +str q1, [x0, #96] +mul v20.4S, v20.4S,v24.s[0] +ldr q1, [x0, #816] 
+sub v11.4s, v13.4s, v16.4s +ldr q18, [x0, #880] +mla v2.4S, v15.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +str q8, [x0, #160] +sqrdmulh v8.4S, v0.4S, v23.s[1] +str q3, [x0, #224] +mul v0.4S, v0.4S,v24.s[1] +ldr q3, [x0, #944] +sub v16.4s, v10.4s, v2.4s +ldr q15, [x0, #1008] +mla v20.4S, v12.4S, v31.s[0] +add v10.4s, v10.4s, v2.4s +str q13, [x0, #288] +sqrdmulh v13.4S, v9.4S, v23.s[2] +str q11, [x0, #352] +mul v9.4S, v9.4S,v24.s[2] +ldr q11, [x0, #304] +sub v2.4s, v21.4s, v20.4s +ldr q12, [x0, #368] +mla v0.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v20.4s +str q10, [x0, #416] +sqrdmulh v10.4S, v19.4S, v23.s[3] +str q16, [x0, #480] +mul v19.4S, v19.4S,v24.s[3] +ldr q16, [x0, #432] +sub v20.4s, v22.4s, v0.4s +ldr q8, [x0, #496] +mla v9.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +str q21, [x0, #544] +sqrdmulh v21.4S, v1.4S, v29.s[0] +str q2, [x0, #608] +ldr q2, [x0, #560] +mul v1.4S, v1.4S,v30.s[0] +ldr q0, [x0, #624] +sub v13.4s, v17.4s, v9.4s +mla v19.4S, v10.4S, v31.s[0] +add v17.4s, v17.4s, v9.4s +str q22, [x0, #672] +sqrdmulh v22.4S, v18.4S, v29.s[0] +str q20, [x0, #736] +ldr q20, [x0, #688] +mul v18.4S, v18.4S,v30.s[0] +ldr q9, [x0, #752] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +str q17, [x0, #800] +sqrdmulh v17.4S, v3.4S, v29.s[0] +str q13, [x0, #864] +mul v3.4S, v3.4S,v30.s[0] +ldr q13, [x0, #48] +sub v19.4s, v11.4s, v1.4s +mla v18.4S, v22.4S, v31.s[0] +add v11.4s, v11.4s, v1.4s +str q14, [x0, #928] +sqrdmulh v14.4S, v15.4S, v29.s[0] +str q10, [x0, #992] +mul v15.4S, v15.4S,v30.s[0] +ldr q10, [x0, #112] +sub v1.4s, v12.4s, v18.4s +mla v3.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v29.s[0] +ldr q17, [x0, #176] +mul v2.4S, v2.4S,v30.s[0] +sub v22.4s, v16.4s, v3.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v29.s[0] +ldr q14, [x0, #240] +mul v0.4S, v0.4S,v30.s[0] +sub v21.4s, v8.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v15.4s 
+sqrdmulh v15.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v18.4s, v13.4s, v2.4s +mla v0.4S, v3.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +sub v3.4s, v10.4s, v0.4s +mla v20.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v15.4s, v17.4s, v20.4s +mla v9.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v2.4s, v14.4s, v9.4s +mla v16.4S, v0.4S, v31.s[0] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v29.s[1] +mul v11.4S, v11.4S,v30.s[1] +sub v0.4s, v17.4s, v16.4s +mla v8.4S, v20.4S, v31.s[0] +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v29.s[1] +mul v12.4S, v12.4S,v30.s[1] +sub v20.4s, v14.4s, v8.4s +mla v11.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v9.4s, v13.4s, v11.4s +mla v12.4S, v16.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v16.4s, v10.4s, v12.4s +mla v22.4S, v8.4S, v31.s[0] +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v8.4s, v15.4s, v22.4s +mla v21.4S, v11.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +sub v11.4s, v2.4s, v21.4s +mla v19.4S, v12.4S, v31.s[0] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +sub v12.4s, v18.4s, v19.4s +mla v1.4S, v22.4S, v31.s[0] +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v22.4s, v3.4s, v1.4s +mla v17.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v21.4s, v13.4s, v17.4s +mla v14.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +sub v19.4s, v10.4s, v14.4s +mla 
v0.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v27.s[2] +mul v15.4S, v15.4S,v28.s[2] +sub v1.4s, v9.4s, v0.4s +mla v20.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v17.4s, v16.4s, v20.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v14.4s, v18.4s, v15.4s +mla v2.4S, v0.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[3] +mul v11.4S, v11.4S,v28.s[3] +sub v0.4s, v3.4s, v2.4s +mla v8.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +sub v20.4s, v12.4s, v8.4s +mla v11.4S, v15.4S, v31.s[0] +add v12.4s, v12.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v15.4s, v22.4s, v11.4s +mla v10.4S, v2.4S, v31.s[0] +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v2.4s, v13.4s, v10.4s +mla v19.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v25.s[3] +mul v17.4S, v17.4S,v26.s[3] +sub v8.4s, v21.4s, v19.4s +mla v16.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +str q13, [x0, #48] +sqrdmulh v13.4S, v3.4S, v23.s[0] +str q2, [x0, #112] +mul v3.4S, v3.4S,v24.s[0] +ldr q2, [x0, #768] +sub v19.4s, v9.4s, v16.4s +ldr q11, [x0, #832] +mla v17.4S, v10.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +str q21, [x0, #176] +sqrdmulh v21.4S, v0.4S, v23.s[1] +str q8, [x0, #240] +mul v0.4S, v0.4S,v24.s[1] +ldr q8, [x0, #896] +sub v16.4s, v1.4s, v17.4s +ldr q10, [x0, #960] +mla v3.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +str q9, [x0, #304] +sqrdmulh v9.4S, v22.4S, v23.s[2] +str q19, [x0, #368] +mul v22.4S, v22.4S,v24.s[2] +ldr q19, [x0, #256] +sub v17.4s, v18.4s, v3.4s +ldr q13, [x0, #320] +mla v0.4S, v21.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +str q1, [x0, #432] +sqrdmulh v1.4S, v15.4S, v23.s[3] +str q16, [x0, #496] +mul v15.4S, 
v15.4S,v24.s[3] +ldr q16, [x0, #384] +sub v3.4s, v14.4s, v0.4s +ldr q21, [x0, #448] +mla v22.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +str q18, [x0, #560] +sqrdmulh v18.4S, v2.4S, v29.s[0] +str q17, [x0, #624] +ldr q17, [x0, #512] +mul v2.4S, v2.4S,v30.s[0] +ldr q0, [x0, #576] +sub v9.4s, v12.4s, v22.4s +mla v15.4S, v1.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s +str q14, [x0, #688] +sqrdmulh v14.4S, v11.4S, v29.s[0] +str q3, [x0, #752] +ldr q3, [x0, #640] +mul v11.4S, v11.4S,v30.s[0] +ldr q22, [x0, #704] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +str q12, [x0, #816] +sqrdmulh v12.4S, v8.4S, v29.s[0] +str q9, [x0, #880] +mul v8.4S, v8.4S,v30.s[0] +ldr q9, [x0, #0] +sub v15.4s, v19.4s, v2.4s +mla v11.4S, v14.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +str q20, [x0, #944] +sqrdmulh v20.4S, v10.4S, v29.s[0] +str q1, [x0, #1008] +mul v10.4S, v10.4S,v30.s[0] +ldr q1, [x0, #64] +sub v2.4s, v13.4s, v11.4s +mla v8.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v29.s[0] +ldr q12, [x0, #128] +mul v17.4S, v17.4S,v30.s[0] +sub v14.4s, v16.4s, v8.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v29.s[0] +ldr q20, [x0, #192] +mul v0.4S, v0.4S,v30.s[0] +sub v18.4s, v21.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +sub v11.4s, v9.4s, v17.4s +mla v0.4S, v8.4S, v31.s[0] +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v8.4s, v1.4s, v0.4s +mla v3.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v10.4s, v12.4s, v3.4s +mla v22.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v17.4s, v20.4s, v22.4s +mla v16.4S, v0.4S, v31.s[0] +add v20.4s, v20.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +sub 
v0.4s, v12.4s, v16.4s +mla v21.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v3.4s, v20.4s, v21.4s +mla v19.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v22.4s, v9.4s, v19.4s +mla v13.4S, v16.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v16.4s, v1.4s, v13.4s +mla v14.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v21.4s, v10.4s, v14.4s +mla v18.4S, v19.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v19.4s, v17.4s, v18.4s +mla v15.4S, v13.4S, v31.s[0] +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v13.4s, v11.4s, v15.4s +mla v2.4S, v14.4S, v31.s[0] +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v27.s[0] +mul v20.4S, v20.4S,v28.s[0] +sub v14.4s, v8.4s, v2.4s +mla v12.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v18.4s, v9.4s, v12.4s +mla v20.4S, v15.4S, v31.s[0] +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +sub v15.4s, v1.4s, v20.4s +mla v0.4S, v2.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +sub v2.4s, v22.4s, v0.4s +mla v3.4S, v12.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v12.4s, v16.4s, v3.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v11.4s, v10.4s +mla v17.4S, v0.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v27.s[3] +mul v19.4S, v19.4S,v28.s[3] +sub v0.4s, v8.4s, v17.4s +mla v21.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v17.4s +sqrdmulh 
v17.4S, v1.4S, v25.s[0] +mul v1.4S, v1.4S,v26.s[0] +sub v3.4s, v13.4s, v21.4s +mla v19.4S, v10.4S, v31.s[0] +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v25.s[1] +mul v15.4S, v15.4S,v26.s[1] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v17.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v17.4s, v9.4s, v1.4s +mla v15.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +sub v21.4s, v18.4s, v15.4s +mla v16.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +str q9, [x0, #0] +sqrdmulh v9.4S, v8.4S, v23.s[0] +str q17, [x0, #64] +mul v8.4S, v8.4S,v24.s[0] +ldr q17, [x0, #784] +sub v15.4s, v22.4s, v16.4s +ldr q19, [x0, #848] +mla v12.4S, v1.4S, v31.s[0] +add v22.4s, v22.4s, v16.4s +str q18, [x0, #128] +sqrdmulh v18.4S, v0.4S, v23.s[1] +str q21, [x0, #192] +mul v0.4S, v0.4S,v24.s[1] +ldr q21, [x0, #912] +sub v16.4s, v2.4s, v12.4s +ldr q1, [x0, #976] +mla v8.4S, v9.4S, v31.s[0] +add v2.4s, v2.4s, v12.4s +str q22, [x0, #256] +sqrdmulh v22.4S, v14.4S, v23.s[2] +str q15, [x0, #320] +mul v14.4S, v14.4S,v24.s[2] +ldr q15, [x0, #272] +sub v12.4s, v11.4s, v8.4s +ldr q9, [x0, #336] +mla v0.4S, v18.4S, v31.s[0] +add v11.4s, v11.4s, v8.4s +str q2, [x0, #384] +sqrdmulh v2.4S, v10.4S, v23.s[3] +str q16, [x0, #448] +mul v10.4S, v10.4S,v24.s[3] +ldr q16, [x0, #400] +sub v8.4s, v20.4s, v0.4s +ldr q18, [x0, #464] +mla v14.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v0.4s +str q11, [x0, #512] +sqrdmulh v11.4S, v17.4S, v29.s[0] +str q12, [x0, #576] +ldr q12, [x0, #528] +mul v17.4S, v17.4S,v30.s[0] +ldr q0, [x0, #592] +sub v22.4s, v13.4s, v14.4s +mla v10.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v14.4s +str q20, [x0, #640] +sqrdmulh v20.4S, v19.4S, v29.s[0] +str q8, [x0, #704] +ldr q8, [x0, #656] +mul v19.4S, v19.4S,v30.s[0] +ldr q14, [x0, #720] +sub v2.4s, v3.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v3.4s, v3.4s, v10.4s +str q13, [x0, #768] +sqrdmulh v13.4S, v21.4S, 
v29.s[0] +str q22, [x0, #832] +mul v21.4S, v21.4S,v30.s[0] +ldr q22, [x0, #16] +sub v10.4s, v15.4s, v17.4s +mla v19.4S, v20.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +str q3, [x0, #896] +sqrdmulh v3.4S, v1.4S, v29.s[0] +str q2, [x0, #960] +mul v1.4S, v1.4S,v30.s[0] +ldr q2, [x0, #80] +sub v17.4s, v9.4s, v19.4s +mla v21.4S, v13.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v12.4S, v29.s[0] +ldr q13, [x0, #144] +mul v12.4S, v12.4S,v30.s[0] +sub v20.4s, v16.4s, v21.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v29.s[0] +ldr q3, [x0, #208] +mul v0.4S, v0.4S,v30.s[0] +sub v11.4s, v18.4s, v1.4s +mla v12.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v19.4s, v22.4s, v12.4s +mla v0.4S, v21.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v2.4s, v0.4s +mla v8.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v1.4s, v13.4s, v8.4s +mla v14.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v12.4s, v3.4s, v14.4s +mla v16.4S, v0.4S, v31.s[0] +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +sub v0.4s, v13.4s, v16.4s +mla v18.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v9.4S, v29.s[1] +mul v9.4S, v9.4S,v30.s[1] +sub v8.4s, v3.4s, v18.4s +mla v15.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v14.4s, v22.4s, v15.4s +mla v9.4S, v16.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v16.4s, v2.4s, v9.4s +mla v20.4S, v18.4S, v31.s[0] +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v20.4s +mla v11.4S, v15.4S, v31.s[0] +add v1.4s, 
v1.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v15.4s, v12.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +sub v9.4s, v19.4s, v10.4s +mla v17.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v27.s[0] +mul v3.4S, v3.4S,v28.s[0] +sub v20.4s, v21.4s, v17.4s +mla v13.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v11.4s, v22.4s, v13.4s +mla v3.4S, v10.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v10.4s, v2.4s, v3.4s +mla v0.4S, v17.4S, v31.s[0] +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v17.4s, v14.4s, v0.4s +mla v8.4S, v13.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +sub v13.4s, v16.4s, v8.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v3.4s, v19.4s, v1.4s +mla v12.4S, v0.4S, v31.s[0] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v0.4s, v21.4s, v12.4s +mla v18.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v25.s[1] +mul v10.4S, v10.4S,v26.s[1] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v12.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v12.4s, v22.4s, v2.4s +mla v10.4S, v18.4S, v31.s[0] +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v13.4S, v25.s[3] +mul v13.4S, v13.4S,v26.s[3] +sub v18.4s, v11.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +str q22, [x0, #16] +sqrdmulh v22.4S, v21.4S, v23.s[0] +str q12, [x0, #80] +mul 
v21.4S, v21.4S,v24.s[0] +sub v12.4s, v14.4s, v16.4s +mla v13.4S, v2.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +str q11, [x0, #144] +sqrdmulh v11.4S, v0.4S, v23.s[1] +str q18, [x0, #208] +mul v0.4S, v0.4S,v24.s[1] +sub v18.4s, v17.4s, v13.4s +mla v21.4S, v22.4S, v31.s[0] +add v17.4s, v17.4s, v13.4s +str q14, [x0, #272] +sqrdmulh v14.4S, v20.4S, v23.s[2] +str q12, [x0, #336] +mul v20.4S, v20.4S,v24.s[2] +sub v12.4s, v19.4s, v21.4s +mla v0.4S, v11.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +str q17, [x0, #400] +sqrdmulh v17.4S, v1.4S, v23.s[3] +str q18, [x0, #464] +mul v1.4S, v1.4S,v24.s[3] +sub v18.4s, v3.4s, v0.4s +mla v20.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v0.4s +str q19, [x0, #528] +str q12, [x0, #592] +sub v12.4s, v9.4s, v20.4s +mla v1.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v20.4s +str q3, [x0, #656] +str q18, [x0, #720] +sub v18.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +str q9, [x0, #784] +str q12, [x0, #848] +str q8, [x0, #912] +str q18, [x0, #976] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q6, [x17, #+160] +ldr q7, [x17, #+176] +ldr q15, [x17, #+192] +ldr q10, [x17, #+208] +ldr q2, [x17, #+224] +ldr q16, [x17, #+240] +ldr q22, [x0, #32] +ldr q13, [x0, #48] +ldr q11, [x0, #0] +ldr q21, [x0, #16] +sqrdmulh v14.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v5.s[2] +mul v22.4S, v22.4S,v4.s[2] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +trn1 v22.4S, v11.4S, v13.4S +trn2 v0.4S, v11.4S, v13.4S +trn1 v19.4S, v14.4S, v21.4S +trn2 v17.4S, v14.4S, v21.4S +trn2 v14.2D, v22.2D, v19.2D +trn2 v21.2D, v0.2D, v17.2D +trn1 
v11.2D, v22.2D, v19.2D +trn1 v13.2D, v0.2D, v17.2D +sqrdmulh v17.4S, v14.4S, v7.4S +mul v14.4S, v14.4S,v6.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v10.4S +mul v13.4S, v13.4S,v15.4S +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v14.4S, v16.4S +mul v14.4S, v14.4S,v2.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +str q11, [x0, #0] +str q21, [x0, #16] +str q17, [x0, #32] +str q13, [x0, #48] +ldr q16, [x17, #+1152] +ldr q2, [x17, #+1168] +ldr q10, [x17, #+1184] +ldr q15, [x17, #+1200] +ldr q7, [x17, #+1216] +ldr q6, [x17, #+1232] +ldr q5, [x17, #+1248] +ldr q4, [x17, #+1264] +ldr q13, [x0, #544] +ldr q17, [x0, #560] +ldr q21, [x0, #512] +ldr q11, [x0, #528] +sqrdmulh v14.4S, v13.4S, v2.s[0] +mul v13.4S, v13.4S,v16.s[0] +mla v13.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v2.s[0] +mul v17.4S, v17.4S,v16.s[0] +mla v17.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v16.s[1] +mla v11.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v2.s[2] +mul v13.4S, v13.4S,v16.s[2] +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +trn1 v13.4S, v21.4S, v17.4S +trn2 v0.4S, v21.4S, v17.4S +trn1 v19.4S, v14.4S, v11.4S +trn2 v22.4S, v14.4S, v11.4S +trn2 v14.2D, v13.2D, v19.2D +trn2 v11.2D, v0.2D, v22.2D +trn1 v21.2D, v13.2D, v19.2D +trn1 v17.2D, v0.2D, v22.2D +sqrdmulh v22.4S, v14.4S, v15.4S +mul v14.4S, v14.4S,v10.4S +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v15.4S +mul v11.4S, 
v11.4S,v10.4S +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v17.4s, v11.4s +add v17.4s, v17.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v6.4S +mul v17.4S, v17.4S,v7.4S +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v4.4S +mul v14.4S, v14.4S,v5.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v14.4s +add v22.4s, v22.4s, v14.4s +str q21, [x0, #512] +str q11, [x0, #528] +str q22, [x0, #544] +str q17, [x0, #560] +ldr q4, [x17, #+256] +ldr q5, [x17, #+272] +ldr q6, [x17, #+288] +ldr q7, [x17, #+304] +ldr q15, [x17, #+320] +ldr q10, [x17, #+336] +ldr q2, [x17, #+352] +ldr q16, [x17, #+368] +ldr q17, [x0, #96] +ldr q22, [x0, #112] +ldr q11, [x0, #64] +ldr q21, [x0, #80] +sqrdmulh v14.4S, v17.4S, v5.s[0] +mul v17.4S, v17.4S,v4.s[0] +mla v17.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v5.s[2] +mul v17.4S, v17.4S,v4.s[2] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v17.4s +add v14.4s, v14.4s, v17.4s +trn1 v17.4S, v11.4S, v22.4S +trn2 v0.4S, v11.4S, v22.4S +trn1 v19.4S, v14.4S, v21.4S +trn2 v13.4S, v14.4S, v21.4S +trn2 v14.2D, v17.2D, v19.2D +trn2 v21.2D, v0.2D, v13.2D +trn1 v11.2D, v17.2D, v19.2D +trn1 v22.2D, v0.2D, v13.2D +sqrdmulh v13.4S, v14.4S, v7.4S +mul v14.4S, v14.4S,v6.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v21.4s +add v22.4s, v22.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v10.4S +mul v22.4S, v22.4S,v15.4S +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh 
v22.4S, v14.4S, v16.4S +mul v14.4S, v14.4S,v2.4S +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v13.4s, v14.4s +add v13.4s, v13.4s, v14.4s +str q11, [x0, #64] +str q21, [x0, #80] +str q13, [x0, #96] +str q22, [x0, #112] +ldr q16, [x17, #+1280] +ldr q2, [x17, #+1296] +ldr q10, [x17, #+1312] +ldr q15, [x17, #+1328] +ldr q7, [x17, #+1344] +ldr q6, [x17, #+1360] +ldr q5, [x17, #+1376] +ldr q4, [x17, #+1392] +ldr q22, [x0, #608] +ldr q13, [x0, #624] +ldr q21, [x0, #576] +ldr q11, [x0, #592] +sqrdmulh v14.4S, v22.4S, v2.s[0] +mul v22.4S, v22.4S,v16.s[0] +mla v22.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v2.s[0] +mul v13.4S, v13.4S,v16.s[0] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v16.s[1] +mla v11.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v2.s[2] +mul v22.4S, v22.4S,v16.s[2] +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +trn1 v22.4S, v21.4S, v13.4S +trn2 v0.4S, v21.4S, v13.4S +trn1 v19.4S, v14.4S, v11.4S +trn2 v17.4S, v14.4S, v11.4S +trn2 v14.2D, v22.2D, v19.2D +trn2 v11.2D, v0.2D, v17.2D +trn1 v21.2D, v22.2D, v19.2D +trn1 v13.2D, v0.2D, v17.2D +sqrdmulh v17.4S, v14.4S, v15.4S +mul v14.4S, v14.4S,v10.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v15.4S +mul v11.4S, v11.4S,v10.4S +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v6.4S +mul v13.4S, v13.4S,v7.4S +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v14.4S, v4.4S +mul v14.4S, v14.4S,v5.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +str q21, [x0, #576] +str q11, [x0, #592] +str q17, [x0, #608] +str q13, [x0, #624] +ldr q4, [x17, #+384] 
+ldr q5, [x17, #+400] +ldr q6, [x17, #+416] +ldr q7, [x17, #+432] +ldr q15, [x17, #+448] +ldr q10, [x17, #+464] +ldr q2, [x17, #+480] +ldr q16, [x17, #+496] +ldr q13, [x0, #160] +ldr q17, [x0, #176] +ldr q11, [x0, #128] +ldr q21, [x0, #144] +sqrdmulh v14.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v5.s[0] +mul v17.4S, v17.4S,v4.s[0] +mla v17.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v5.s[2] +mul v13.4S, v13.4S,v4.s[2] +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +trn1 v13.4S, v11.4S, v17.4S +trn2 v0.4S, v11.4S, v17.4S +trn1 v19.4S, v14.4S, v21.4S +trn2 v22.4S, v14.4S, v21.4S +trn2 v14.2D, v13.2D, v19.2D +trn2 v21.2D, v0.2D, v22.2D +trn1 v11.2D, v13.2D, v19.2D +trn1 v17.2D, v0.2D, v22.2D +sqrdmulh v22.4S, v14.4S, v7.4S +mul v14.4S, v14.4S,v6.4S +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v10.4S +mul v17.4S, v17.4S,v15.4S +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v16.4S +mul v14.4S, v14.4S,v2.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v14.4s +add v22.4s, v22.4s, v14.4s +str q11, [x0, #128] +str q21, [x0, #144] +str q22, [x0, #160] +str q17, [x0, #176] +ldr q16, [x17, #+1408] +ldr q2, [x17, #+1424] +ldr q10, [x17, #+1440] +ldr q15, [x17, #+1456] +ldr q7, [x17, #+1472] +ldr q6, [x17, #+1488] +ldr q5, [x17, #+1504] +ldr q4, [x17, #+1520] +ldr q17, [x0, #672] +ldr q22, [x0, #688] +ldr q21, [x0, #640] +ldr q11, [x0, #656] 
+sqrdmulh v14.4S, v17.4S, v2.s[0] +mul v17.4S, v17.4S,v16.s[0] +mla v17.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v2.s[0] +mul v22.4S, v22.4S,v16.s[0] +mla v22.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v16.s[1] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v2.s[2] +mul v17.4S, v17.4S,v16.s[2] +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v14.4s, v17.4s +add v14.4s, v14.4s, v17.4s +trn1 v17.4S, v21.4S, v22.4S +trn2 v0.4S, v21.4S, v22.4S +trn1 v19.4S, v14.4S, v11.4S +trn2 v13.4S, v14.4S, v11.4S +trn2 v14.2D, v17.2D, v19.2D +trn2 v11.2D, v0.2D, v13.2D +trn1 v21.2D, v17.2D, v19.2D +trn1 v22.2D, v0.2D, v13.2D +sqrdmulh v13.4S, v14.4S, v15.4S +mul v14.4S, v14.4S,v10.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v15.4S +mul v11.4S, v11.4S,v10.4S +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v6.4S +mul v22.4S, v22.4S,v7.4S +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v14.4S, v4.4S +mul v14.4S, v14.4S,v5.4S +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v13.4s, v14.4s +add v13.4s, v13.4s, v14.4s +str q21, [x0, #640] +str q11, [x0, #656] +str q13, [x0, #672] +str q22, [x0, #688] +ldr q4, [x17, #+512] +ldr q5, [x17, #+528] +ldr q6, [x17, #+544] +ldr q7, [x17, #+560] +ldr q15, [x17, #+576] +ldr q10, [x17, #+592] +ldr q2, [x17, #+608] +ldr q16, [x17, #+624] +ldr q22, [x0, #224] +ldr q13, [x0, #240] +ldr q11, [x0, #192] +ldr q21, [x0, #208] +sqrdmulh v14.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v22.4S, v31.s[0] +sub 
v22.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v5.s[2] +mul v22.4S, v22.4S,v4.s[2] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +trn1 v22.4S, v11.4S, v13.4S +trn2 v0.4S, v11.4S, v13.4S +trn1 v19.4S, v14.4S, v21.4S +trn2 v17.4S, v14.4S, v21.4S +trn2 v14.2D, v22.2D, v19.2D +trn2 v21.2D, v0.2D, v17.2D +trn1 v11.2D, v22.2D, v19.2D +trn1 v13.2D, v0.2D, v17.2D +sqrdmulh v17.4S, v14.4S, v7.4S +mul v14.4S, v14.4S,v6.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v10.4S +mul v13.4S, v13.4S,v15.4S +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v14.4S, v16.4S +mul v14.4S, v14.4S,v2.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +str q11, [x0, #192] +str q21, [x0, #208] +str q17, [x0, #224] +str q13, [x0, #240] +ldr q16, [x17, #+1536] +ldr q2, [x17, #+1552] +ldr q10, [x17, #+1568] +ldr q15, [x17, #+1584] +ldr q7, [x17, #+1600] +ldr q6, [x17, #+1616] +ldr q5, [x17, #+1632] +ldr q4, [x17, #+1648] +ldr q13, [x0, #736] +ldr q17, [x0, #752] +ldr q21, [x0, #704] +ldr q11, [x0, #720] +sqrdmulh v14.4S, v13.4S, v2.s[0] +mul v13.4S, v13.4S,v16.s[0] +mla v13.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v2.s[0] +mul v17.4S, v17.4S,v16.s[0] +mla v17.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v16.s[1] +mla v11.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v2.s[2] +mul 
v13.4S, v13.4S,v16.s[2] +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +trn1 v13.4S, v21.4S, v17.4S +trn2 v0.4S, v21.4S, v17.4S +trn1 v19.4S, v14.4S, v11.4S +trn2 v22.4S, v14.4S, v11.4S +trn2 v14.2D, v13.2D, v19.2D +trn2 v11.2D, v0.2D, v22.2D +trn1 v21.2D, v13.2D, v19.2D +trn1 v17.2D, v0.2D, v22.2D +sqrdmulh v22.4S, v14.4S, v15.4S +mul v14.4S, v14.4S,v10.4S +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v15.4S +mul v11.4S, v11.4S,v10.4S +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v17.4s, v11.4s +add v17.4s, v17.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v6.4S +mul v17.4S, v17.4S,v7.4S +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v4.4S +mul v14.4S, v14.4S,v5.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v14.4s +add v22.4s, v22.4s, v14.4s +str q21, [x0, #704] +str q11, [x0, #720] +str q22, [x0, #736] +str q17, [x0, #752] +ldr q4, [x17, #+640] +ldr q5, [x17, #+656] +ldr q6, [x17, #+672] +ldr q7, [x17, #+688] +ldr q15, [x17, #+704] +ldr q10, [x17, #+720] +ldr q2, [x17, #+736] +ldr q16, [x17, #+752] +ldr q17, [x0, #288] +ldr q22, [x0, #304] +ldr q11, [x0, #256] +ldr q21, [x0, #272] +sqrdmulh v14.4S, v17.4S, v5.s[0] +mul v17.4S, v17.4S,v4.s[0] +mla v17.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v5.s[2] +mul v17.4S, v17.4S,v4.s[2] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v17.4s +add v14.4s, v14.4s, v17.4s +trn1 v17.4S, v11.4S, v22.4S +trn2 v0.4S, v11.4S, v22.4S +trn1 v19.4S, v14.4S, v21.4S +trn2 v13.4S, v14.4S, v21.4S +trn2 v14.2D, v17.2D, 
v19.2D +trn2 v21.2D, v0.2D, v13.2D +trn1 v11.2D, v17.2D, v19.2D +trn1 v22.2D, v0.2D, v13.2D +sqrdmulh v13.4S, v14.4S, v7.4S +mul v14.4S, v14.4S,v6.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v21.4s +add v22.4s, v22.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v10.4S +mul v22.4S, v22.4S,v15.4S +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v14.4S, v16.4S +mul v14.4S, v14.4S,v2.4S +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v13.4s, v14.4s +add v13.4s, v13.4s, v14.4s +str q11, [x0, #256] +str q21, [x0, #272] +str q13, [x0, #288] +str q22, [x0, #304] +ldr q16, [x17, #+1664] +ldr q2, [x17, #+1680] +ldr q10, [x17, #+1696] +ldr q15, [x17, #+1712] +ldr q7, [x17, #+1728] +ldr q6, [x17, #+1744] +ldr q5, [x17, #+1760] +ldr q4, [x17, #+1776] +ldr q22, [x0, #800] +ldr q13, [x0, #816] +ldr q21, [x0, #768] +ldr q11, [x0, #784] +sqrdmulh v14.4S, v22.4S, v2.s[0] +mul v22.4S, v22.4S,v16.s[0] +mla v22.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v2.s[0] +mul v13.4S, v13.4S,v16.s[0] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v16.s[1] +mla v11.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v2.s[2] +mul v22.4S, v22.4S,v16.s[2] +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +trn1 v22.4S, v21.4S, v13.4S +trn2 v0.4S, v21.4S, v13.4S +trn1 v19.4S, v14.4S, v11.4S +trn2 v17.4S, v14.4S, v11.4S +trn2 v14.2D, v22.2D, v19.2D +trn2 v11.2D, v0.2D, v17.2D +trn1 v21.2D, v22.2D, v19.2D +trn1 v13.2D, v0.2D, v17.2D +sqrdmulh v17.4S, v14.4S, v15.4S +mul v14.4S, v14.4S,v10.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s 
+sqrdmulh v14.4S, v11.4S, v15.4S +mul v11.4S, v11.4S,v10.4S +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v6.4S +mul v13.4S, v13.4S,v7.4S +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v14.4S, v4.4S +mul v14.4S, v14.4S,v5.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +str q21, [x0, #768] +str q11, [x0, #784] +str q17, [x0, #800] +str q13, [x0, #816] +ldr q4, [x17, #+768] +ldr q5, [x17, #+784] +ldr q6, [x17, #+800] +ldr q7, [x17, #+816] +ldr q15, [x17, #+832] +ldr q10, [x17, #+848] +ldr q2, [x17, #+864] +ldr q16, [x17, #+880] +ldr q13, [x0, #352] +ldr q17, [x0, #368] +ldr q11, [x0, #320] +ldr q21, [x0, #336] +sqrdmulh v14.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v5.s[0] +mul v17.4S, v17.4S,v4.s[0] +mla v17.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v5.s[2] +mul v13.4S, v13.4S,v4.s[2] +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +trn1 v13.4S, v11.4S, v17.4S +trn2 v0.4S, v11.4S, v17.4S +trn1 v19.4S, v14.4S, v21.4S +trn2 v22.4S, v14.4S, v21.4S +trn2 v14.2D, v13.2D, v19.2D +trn2 v21.2D, v0.2D, v22.2D +trn1 v11.2D, v13.2D, v19.2D +trn1 v17.2D, v0.2D, v22.2D +sqrdmulh v22.4S, v14.4S, v7.4S +mul v14.4S, v14.4S,v6.4S +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v10.4S +mul v17.4S, v17.4S,v15.4S +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, 
v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v16.4S +mul v14.4S, v14.4S,v2.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v14.4s +add v22.4s, v22.4s, v14.4s +str q11, [x0, #320] +str q21, [x0, #336] +str q22, [x0, #352] +str q17, [x0, #368] +ldr q16, [x17, #+1792] +ldr q2, [x17, #+1808] +ldr q10, [x17, #+1824] +ldr q15, [x17, #+1840] +ldr q7, [x17, #+1856] +ldr q6, [x17, #+1872] +ldr q5, [x17, #+1888] +ldr q4, [x17, #+1904] +ldr q17, [x0, #864] +ldr q22, [x0, #880] +ldr q21, [x0, #832] +ldr q11, [x0, #848] +sqrdmulh v14.4S, v17.4S, v2.s[0] +mul v17.4S, v17.4S,v16.s[0] +mla v17.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v2.s[0] +mul v22.4S, v22.4S,v16.s[0] +mla v22.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v16.s[1] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v2.s[2] +mul v17.4S, v17.4S,v16.s[2] +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v14.4s, v17.4s +add v14.4s, v14.4s, v17.4s +trn1 v17.4S, v21.4S, v22.4S +trn2 v0.4S, v21.4S, v22.4S +trn1 v19.4S, v14.4S, v11.4S +trn2 v13.4S, v14.4S, v11.4S +trn2 v14.2D, v17.2D, v19.2D +trn2 v11.2D, v0.2D, v13.2D +trn1 v21.2D, v17.2D, v19.2D +trn1 v22.2D, v0.2D, v13.2D +sqrdmulh v13.4S, v14.4S, v15.4S +mul v14.4S, v14.4S,v10.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v15.4S +mul v11.4S, v11.4S,v10.4S +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v6.4S +mul v22.4S, v22.4S,v7.4S +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v14.4S, v4.4S +mul v14.4S, v14.4S,v5.4S +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v13.4s, v14.4s +add v13.4s, v13.4s, v14.4s +str q21, [x0, #832] +str q11, [x0, #848] +str q13, 
[x0, #864] +str q22, [x0, #880] +ldr q4, [x17, #+896] +ldr q5, [x17, #+912] +ldr q6, [x17, #+928] +ldr q7, [x17, #+944] +ldr q15, [x17, #+960] +ldr q10, [x17, #+976] +ldr q2, [x17, #+992] +ldr q16, [x17, #+1008] +ldr q22, [x0, #416] +ldr q13, [x0, #432] +ldr q11, [x0, #384] +ldr q21, [x0, #400] +sqrdmulh v14.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v5.s[2] +mul v22.4S, v22.4S,v4.s[2] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +trn1 v22.4S, v11.4S, v13.4S +trn2 v0.4S, v11.4S, v13.4S +trn1 v19.4S, v14.4S, v21.4S +trn2 v17.4S, v14.4S, v21.4S +trn2 v14.2D, v22.2D, v19.2D +trn2 v21.2D, v0.2D, v17.2D +trn1 v11.2D, v22.2D, v19.2D +trn1 v13.2D, v0.2D, v17.2D +sqrdmulh v17.4S, v14.4S, v7.4S +mul v14.4S, v14.4S,v6.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v10.4S +mul v13.4S, v13.4S,v15.4S +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v14.4S, v16.4S +mul v14.4S, v14.4S,v2.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +str q11, [x0, #384] +str q21, [x0, #400] +str q17, [x0, #416] +str q13, [x0, #432] +ldr q16, [x17, #+1920] +ldr q2, [x17, #+1936] +ldr q10, [x17, #+1952] +ldr q15, [x17, #+1968] +ldr q7, [x17, #+1984] +ldr q6, [x17, #+2000] +ldr q5, [x17, #+2016] +ldr q4, [x17, #+2032] +ldr q13, [x0, #928] +ldr q17, 
[x0, #944] +ldr q21, [x0, #896] +ldr q11, [x0, #912] +sqrdmulh v14.4S, v13.4S, v2.s[0] +mul v13.4S, v13.4S,v16.s[0] +mla v13.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v2.s[0] +mul v17.4S, v17.4S,v16.s[0] +mla v17.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v16.s[1] +mla v11.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v2.s[2] +mul v13.4S, v13.4S,v16.s[2] +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +trn1 v13.4S, v21.4S, v17.4S +trn2 v0.4S, v21.4S, v17.4S +trn1 v19.4S, v14.4S, v11.4S +trn2 v22.4S, v14.4S, v11.4S +trn2 v14.2D, v13.2D, v19.2D +trn2 v11.2D, v0.2D, v22.2D +trn1 v21.2D, v13.2D, v19.2D +trn1 v17.2D, v0.2D, v22.2D +sqrdmulh v22.4S, v14.4S, v15.4S +mul v14.4S, v14.4S,v10.4S +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v15.4S +mul v11.4S, v11.4S,v10.4S +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v17.4s, v11.4s +add v17.4s, v17.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v6.4S +mul v17.4S, v17.4S,v7.4S +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v4.4S +mul v14.4S, v14.4S,v5.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v14.4s +add v22.4s, v22.4s, v14.4s +str q21, [x0, #896] +str q11, [x0, #912] +str q22, [x0, #928] +str q17, [x0, #944] +ldr q4, [x17, #+1024] +ldr q5, [x17, #+1040] +ldr q6, [x17, #+1056] +ldr q7, [x17, #+1072] +ldr q15, [x17, #+1088] +ldr q10, [x17, #+1104] +ldr q2, [x17, #+1120] +ldr q16, [x17, #+1136] +ldr q17, [x0, #480] +ldr q22, [x0, #496] +ldr q11, [x0, #448] +ldr q21, [x0, #464] +sqrdmulh v14.4S, v17.4S, v5.s[0] +mul v17.4S, v17.4S,v4.s[0] +mla v17.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v5.s[0] +mul 
v22.4S, v22.4S,v4.s[0] +mla v22.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v5.s[2] +mul v17.4S, v17.4S,v4.s[2] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v17.4s +add v14.4s, v14.4s, v17.4s +trn1 v17.4S, v11.4S, v22.4S +trn2 v0.4S, v11.4S, v22.4S +trn1 v19.4S, v14.4S, v21.4S +trn2 v13.4S, v14.4S, v21.4S +trn2 v14.2D, v17.2D, v19.2D +trn2 v21.2D, v0.2D, v13.2D +trn1 v11.2D, v17.2D, v19.2D +trn1 v22.2D, v0.2D, v13.2D +sqrdmulh v13.4S, v14.4S, v7.4S +mul v14.4S, v14.4S,v6.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v21.4s +add v22.4s, v22.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v10.4S +mul v22.4S, v22.4S,v15.4S +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v14.4S, v16.4S +mul v14.4S, v14.4S,v2.4S +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v13.4s, v14.4s +add v13.4s, v13.4s, v14.4s +str q11, [x0, #448] +str q21, [x0, #464] +str q13, [x0, #480] +str q22, [x0, #496] +ldr q16, [x17, #+2048] +ldr q2, [x17, #+2064] +ldr q10, [x17, #+2080] +ldr q15, [x17, #+2096] +ldr q7, [x17, #+2112] +ldr q6, [x17, #+2128] +ldr q5, [x17, #+2144] +ldr q4, [x17, #+2160] +ldr q22, [x0, #992] +ldr q13, [x0, #1008] +ldr q21, [x0, #960] +ldr q11, [x0, #976] +sqrdmulh v14.4S, v22.4S, v2.s[0] +mul v22.4S, v22.4S,v16.s[0] +mla v22.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v2.s[0] +mul v13.4S, v13.4S,v16.s[0] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v16.s[1] +mla v11.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v11.4s +add 
v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v2.s[2] +mul v22.4S, v22.4S,v16.s[2] +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +trn1 v22.4S, v21.4S, v13.4S +trn2 v0.4S, v21.4S, v13.4S +trn1 v19.4S, v14.4S, v11.4S +trn2 v17.4S, v14.4S, v11.4S +trn2 v14.2D, v22.2D, v19.2D +trn2 v11.2D, v0.2D, v17.2D +trn1 v21.2D, v22.2D, v19.2D +trn1 v13.2D, v0.2D, v17.2D +sqrdmulh v17.4S, v14.4S, v15.4S +mul v14.4S, v14.4S,v10.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v15.4S +mul v11.4S, v11.4S,v10.4S +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v6.4S +mul v13.4S, v13.4S,v7.4S +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v14.4S, v4.4S +mul v14.4S, v14.4S,v5.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +str q21, [x0, #960] +str q11, [x0, #976] +str q17, [x0, #992] +str q13, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_1.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_1.s new file mode 100644 index 0000000..e51be66 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_1.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software 
and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: // word 0 holds -33556993 (the file name gives 33556993 as the modulus), words 1-3 are zero; the routine below loads this quadword into v31 and uses v31.s[0] as its mla multiplier +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 // power-of-two form on AArch64, i.e. 64-byte alignment for the table that follows +roots_merged: // layout: groups of four words, each followed by four companion words carrying the same Layer/block tags; the visible part of the routine below feeds the first group to mul and the companion group to sqrdmulh +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word
1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 
6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 
+.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, 
block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 
31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 
26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 
+.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // 
Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 
28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 
608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_3_z2_1 +.global _ntt_u32_full_neon_asm_var_4_4_3_z2_1 +ntt_u32_full_neon_asm_var_4_4_3_z2_1: +_ntt_u32_full_neon_asm_var_4_4_3_z2_1: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +sqrdmulh v2.4S, v22.4S, v29.s[0] +ldr q1, [x0, #544] +mul v22.4S, v22.4S,v30.s[0] +ldr q0, [x0, #608] +sqrdmulh v15.4S, v21.4S, v29.s[0] +ldr q14, [x0, #672] +mul v21.4S, v21.4S,v30.s[0] +ldr q13, [x0, #736] +mla v22.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q12, [x0, #32] +sub v11.4s, v18.4s, v22.4s +mla v21.4S, v15.4S, v31.s[0] +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q15, [x0, #96] +sub v10.4s, v17.4s, v21.4s +mla v20.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v1.4S, 
v29.s[0] +ldr q2, [x0, #160] +mul v1.4S, v1.4S,v30.s[0] +sub v9.4s, v16.4s, v20.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v29.s[0] +ldr q22, [x0, #224] +mul v0.4S, v0.4S,v30.s[0] +sub v8.4s, v3.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v12.4s, v1.4s +mla v0.4S, v20.4S, v31.s[0] +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +sub v20.4s, v15.4s, v0.4s +mla v14.4S, v19.4S, v31.s[0] +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v19.4s, v2.4s, v14.4s +mla v13.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v1.4s, v22.4s, v13.4s +mla v16.4S, v0.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v0.4s, v2.4s, v16.4s +mla v3.4S, v14.4S, v31.s[0] +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v22.4s, v3.4s +mla v18.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v13.4s, v12.4s, v18.4s +mla v17.4S, v16.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v16.4s, v15.4s, v17.4s +mla v9.4S, v3.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v3.4s, v19.4s, v9.4s +mla v8.4S, v18.4S, v31.s[0] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v8.4s +mla v11.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v17.4s, v21.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +sub v9.4s, v20.4s, v10.4s 
+mla v2.4S, v8.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v8.4s, v12.4s, v2.4s +mla v22.4S, v11.4S, v31.s[0] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v27.s[1] +mul v14.4S, v14.4S,v28.s[1] +sub v11.4s, v15.4s, v22.4s +mla v0.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v28.s[2] +sub v10.4s, v13.4s, v0.4s +mla v14.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v2.4s, v16.4s, v14.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +sub v22.4s, v21.4s, v19.4s +mla v1.4S, v0.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v0.4s, v20.4s, v1.4s +mla v3.4S, v14.4S, v31.s[0] +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +sub v14.4s, v17.4s, v3.4s +mla v18.4S, v19.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v25.s[1] +mul v11.4S, v11.4S,v26.s[1] +sub v19.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v1.4s, v12.4s, v15.4s +mla v11.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v25.s[3] +mul v2.4S, v2.4S,v26.s[3] +sub v3.4s, v8.4s, v11.4s +mla v16.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v11.4s +str q12, [x0, #32] +sqrdmulh v12.4S, v20.4S, v23.s[0] +str q1, [x0, #96] +mul v20.4S, v20.4S,v24.s[0] +ldr q1, [x0, #816] +sub v11.4s, v13.4s, v16.4s +ldr q18, [x0, #880] +mla v2.4S, v15.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +str q8, [x0, #160] +sqrdmulh v8.4S, v0.4S, v23.s[1] +str q3, [x0, #224] +mul v0.4S, v0.4S,v24.s[1] +ldr q3, [x0, #944] +sub v16.4s, v10.4s, v2.4s +ldr q15, [x0, #1008] +mla v20.4S, v12.4S, v31.s[0] +add v10.4s, v10.4s, v2.4s +str q13, [x0, #288] +sqrdmulh 
v13.4S, v9.4S, v23.s[2] +str q11, [x0, #352] +mul v9.4S, v9.4S,v24.s[2] +ldr q11, [x0, #304] +sub v2.4s, v21.4s, v20.4s +ldr q12, [x0, #368] +mla v0.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v20.4s +str q10, [x0, #416] +sqrdmulh v10.4S, v19.4S, v23.s[3] +str q16, [x0, #480] +mul v19.4S, v19.4S,v24.s[3] +ldr q16, [x0, #432] +sub v20.4s, v22.4s, v0.4s +ldr q8, [x0, #496] +mla v9.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +str q21, [x0, #544] +sqrdmulh v21.4S, v1.4S, v29.s[0] +str q2, [x0, #608] +ldr q2, [x0, #560] +mul v1.4S, v1.4S,v30.s[0] +ldr q0, [x0, #624] +sub v13.4s, v17.4s, v9.4s +mla v19.4S, v10.4S, v31.s[0] +add v17.4s, v17.4s, v9.4s +str q22, [x0, #672] +sqrdmulh v22.4S, v18.4S, v29.s[0] +str q20, [x0, #736] +ldr q20, [x0, #688] +mul v18.4S, v18.4S,v30.s[0] +ldr q9, [x0, #752] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +str q17, [x0, #800] +sqrdmulh v17.4S, v3.4S, v29.s[0] +str q13, [x0, #864] +mul v3.4S, v3.4S,v30.s[0] +ldr q13, [x0, #48] +sub v19.4s, v11.4s, v1.4s +mla v18.4S, v22.4S, v31.s[0] +add v11.4s, v11.4s, v1.4s +str q14, [x0, #928] +sqrdmulh v14.4S, v15.4S, v29.s[0] +str q10, [x0, #992] +mul v15.4S, v15.4S,v30.s[0] +ldr q10, [x0, #112] +sub v1.4s, v12.4s, v18.4s +mla v3.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v29.s[0] +ldr q17, [x0, #176] +mul v2.4S, v2.4S,v30.s[0] +sub v22.4s, v16.4s, v3.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v29.s[0] +ldr q14, [x0, #240] +mul v0.4S, v0.4S,v30.s[0] +sub v21.4s, v8.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v18.4s, v13.4s, v2.4s +mla v0.4S, v3.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +sub v3.4s, v10.4s, v0.4s +mla v20.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v15.4s, 
v17.4s, v20.4s +mla v9.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v2.4s, v14.4s, v9.4s +mla v16.4S, v0.4S, v31.s[0] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v29.s[1] +mul v11.4S, v11.4S,v30.s[1] +sub v0.4s, v17.4s, v16.4s +mla v8.4S, v20.4S, v31.s[0] +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v29.s[1] +mul v12.4S, v12.4S,v30.s[1] +sub v20.4s, v14.4s, v8.4s +mla v11.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v9.4s, v13.4s, v11.4s +mla v12.4S, v16.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v16.4s, v10.4s, v12.4s +mla v22.4S, v8.4S, v31.s[0] +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v8.4s, v15.4s, v22.4s +mla v21.4S, v11.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +sub v11.4s, v2.4s, v21.4s +mla v19.4S, v12.4S, v31.s[0] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +sub v12.4s, v18.4s, v19.4s +mla v1.4S, v22.4S, v31.s[0] +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v22.4s, v3.4s, v1.4s +mla v17.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v21.4s, v13.4s, v17.4s +mla v14.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +sub v19.4s, v10.4s, v14.4s +mla v0.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v27.s[2] +mul v15.4S, v15.4S,v28.s[2] +sub v1.4s, v9.4s, v0.4s +mla v20.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v17.4s, v16.4s, v20.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, 
v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v14.4s, v18.4s, v15.4s +mla v2.4S, v0.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[3] +mul v11.4S, v11.4S,v28.s[3] +sub v0.4s, v3.4s, v2.4s +mla v8.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +sub v20.4s, v12.4s, v8.4s +mla v11.4S, v15.4S, v31.s[0] +add v12.4s, v12.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v15.4s, v22.4s, v11.4s +mla v10.4S, v2.4S, v31.s[0] +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v2.4s, v13.4s, v10.4s +mla v19.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v25.s[3] +mul v17.4S, v17.4S,v26.s[3] +sub v8.4s, v21.4s, v19.4s +mla v16.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +str q13, [x0, #48] +sqrdmulh v13.4S, v3.4S, v23.s[0] +str q2, [x0, #112] +mul v3.4S, v3.4S,v24.s[0] +ldr q2, [x0, #768] +sub v19.4s, v9.4s, v16.4s +ldr q11, [x0, #832] +mla v17.4S, v10.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +str q21, [x0, #176] +sqrdmulh v21.4S, v0.4S, v23.s[1] +str q8, [x0, #240] +mul v0.4S, v0.4S,v24.s[1] +ldr q8, [x0, #896] +sub v16.4s, v1.4s, v17.4s +ldr q10, [x0, #960] +mla v3.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +str q9, [x0, #304] +sqrdmulh v9.4S, v22.4S, v23.s[2] +str q19, [x0, #368] +mul v22.4S, v22.4S,v24.s[2] +ldr q19, [x0, #256] +sub v17.4s, v18.4s, v3.4s +ldr q13, [x0, #320] +mla v0.4S, v21.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +str q1, [x0, #432] +sqrdmulh v1.4S, v15.4S, v23.s[3] +str q16, [x0, #496] +mul v15.4S, v15.4S,v24.s[3] +ldr q16, [x0, #384] +sub v3.4s, v14.4s, v0.4s +ldr q21, [x0, #448] +mla v22.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +str q18, [x0, #560] +sqrdmulh v18.4S, v2.4S, v29.s[0] +str q17, [x0, #624] +ldr q17, [x0, #512] +mul v2.4S, v2.4S,v30.s[0] +ldr q0, [x0, #576] +sub v9.4s, v12.4s, v22.4s +mla v15.4S, v1.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s 
+str q14, [x0, #688] +sqrdmulh v14.4S, v11.4S, v29.s[0] +str q3, [x0, #752] +ldr q3, [x0, #640] +mul v11.4S, v11.4S,v30.s[0] +ldr q22, [x0, #704] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +str q12, [x0, #816] +sqrdmulh v12.4S, v8.4S, v29.s[0] +str q9, [x0, #880] +mul v8.4S, v8.4S,v30.s[0] +ldr q9, [x0, #0] +sub v15.4s, v19.4s, v2.4s +mla v11.4S, v14.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +str q20, [x0, #944] +sqrdmulh v20.4S, v10.4S, v29.s[0] +str q1, [x0, #1008] +mul v10.4S, v10.4S,v30.s[0] +ldr q1, [x0, #64] +sub v2.4s, v13.4s, v11.4s +mla v8.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v29.s[0] +ldr q12, [x0, #128] +mul v17.4S, v17.4S,v30.s[0] +sub v14.4s, v16.4s, v8.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v29.s[0] +ldr q20, [x0, #192] +mul v0.4S, v0.4S,v30.s[0] +sub v18.4s, v21.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +sub v11.4s, v9.4s, v17.4s +mla v0.4S, v8.4S, v31.s[0] +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v8.4s, v1.4s, v0.4s +mla v3.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v10.4s, v12.4s, v3.4s +mla v22.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v17.4s, v20.4s, v22.4s +mla v16.4S, v0.4S, v31.s[0] +add v20.4s, v20.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +sub v0.4s, v12.4s, v16.4s +mla v21.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v3.4s, v20.4s, v21.4s +mla v19.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v22.4s, v9.4s, v19.4s +mla v13.4S, v16.4S, v31.s[0] +add v9.4s, v9.4s, 
v19.4s +sqrdmulh v19.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v16.4s, v1.4s, v13.4s +mla v14.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v21.4s, v10.4s, v14.4s +mla v18.4S, v19.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v19.4s, v17.4s, v18.4s +mla v15.4S, v13.4S, v31.s[0] +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v13.4s, v11.4s, v15.4s +mla v2.4S, v14.4S, v31.s[0] +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v27.s[0] +mul v20.4S, v20.4S,v28.s[0] +sub v14.4s, v8.4s, v2.4s +mla v12.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v18.4s, v9.4s, v12.4s +mla v20.4S, v15.4S, v31.s[0] +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +sub v15.4s, v1.4s, v20.4s +mla v0.4S, v2.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +sub v2.4s, v22.4s, v0.4s +mla v3.4S, v12.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v12.4s, v16.4s, v3.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v11.4s, v10.4s +mla v17.4S, v0.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v27.s[3] +mul v19.4S, v19.4S,v28.s[3] +sub v0.4s, v8.4s, v17.4s +mla v21.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v17.4s +sqrdmulh v17.4S, v1.4S, v25.s[0] +mul v1.4S, v1.4S,v26.s[0] +sub v3.4s, v13.4s, v21.4s +mla v19.4S, v10.4S, v31.s[0] +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v25.s[1] +mul v15.4S, v15.4S,v26.s[1] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v17.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v17.4s, v9.4s, 
v1.4s +mla v15.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +sub v21.4s, v18.4s, v15.4s +mla v16.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +str q9, [x0, #0] +sqrdmulh v9.4S, v8.4S, v23.s[0] +str q17, [x0, #64] +mul v8.4S, v8.4S,v24.s[0] +ldr q17, [x0, #784] +sub v15.4s, v22.4s, v16.4s +ldr q19, [x0, #848] +mla v12.4S, v1.4S, v31.s[0] +add v22.4s, v22.4s, v16.4s +str q18, [x0, #128] +sqrdmulh v18.4S, v0.4S, v23.s[1] +str q21, [x0, #192] +mul v0.4S, v0.4S,v24.s[1] +ldr q21, [x0, #912] +sub v16.4s, v2.4s, v12.4s +ldr q1, [x0, #976] +mla v8.4S, v9.4S, v31.s[0] +add v2.4s, v2.4s, v12.4s +str q22, [x0, #256] +sqrdmulh v22.4S, v14.4S, v23.s[2] +str q15, [x0, #320] +mul v14.4S, v14.4S,v24.s[2] +ldr q15, [x0, #272] +sub v12.4s, v11.4s, v8.4s +ldr q9, [x0, #336] +mla v0.4S, v18.4S, v31.s[0] +add v11.4s, v11.4s, v8.4s +str q2, [x0, #384] +sqrdmulh v2.4S, v10.4S, v23.s[3] +str q16, [x0, #448] +mul v10.4S, v10.4S,v24.s[3] +ldr q16, [x0, #400] +sub v8.4s, v20.4s, v0.4s +ldr q18, [x0, #464] +mla v14.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v0.4s +str q11, [x0, #512] +sqrdmulh v11.4S, v17.4S, v29.s[0] +str q12, [x0, #576] +ldr q12, [x0, #528] +mul v17.4S, v17.4S,v30.s[0] +ldr q0, [x0, #592] +sub v22.4s, v13.4s, v14.4s +mla v10.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v14.4s +str q20, [x0, #640] +sqrdmulh v20.4S, v19.4S, v29.s[0] +str q8, [x0, #704] +ldr q8, [x0, #656] +mul v19.4S, v19.4S,v30.s[0] +ldr q14, [x0, #720] +sub v2.4s, v3.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v3.4s, v3.4s, v10.4s +str q13, [x0, #768] +sqrdmulh v13.4S, v21.4S, v29.s[0] +str q22, [x0, #832] +mul v21.4S, v21.4S,v30.s[0] +ldr q22, [x0, #16] +sub v10.4s, v15.4s, v17.4s +mla v19.4S, v20.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +str q3, [x0, #896] +sqrdmulh v3.4S, v1.4S, v29.s[0] +str q2, [x0, #960] +mul v1.4S, v1.4S,v30.s[0] +ldr q2, [x0, #80] +sub v17.4s, v9.4s, v19.4s +mla v21.4S, v13.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s 
+sqrdmulh v19.4S, v12.4S, v29.s[0] +ldr q13, [x0, #144] +mul v12.4S, v12.4S,v30.s[0] +sub v20.4s, v16.4s, v21.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v29.s[0] +ldr q3, [x0, #208] +mul v0.4S, v0.4S,v30.s[0] +sub v11.4s, v18.4s, v1.4s +mla v12.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v19.4s, v22.4s, v12.4s +mla v0.4S, v21.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v2.4s, v0.4s +mla v8.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v1.4s, v13.4s, v8.4s +mla v14.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v12.4s, v3.4s, v14.4s +mla v16.4S, v0.4S, v31.s[0] +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +sub v0.4s, v13.4s, v16.4s +mla v18.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v9.4S, v29.s[1] +mul v9.4S, v9.4S,v30.s[1] +sub v8.4s, v3.4s, v18.4s +mla v15.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v14.4s, v22.4s, v15.4s +mla v9.4S, v16.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v16.4s, v2.4s, v9.4s +mla v20.4S, v18.4S, v31.s[0] +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v20.4s +mla v11.4S, v15.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v15.4s, v12.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +sub v9.4s, v19.4s, v10.4s +mla v17.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v27.s[0] +mul v3.4S, 
v3.4S,v28.s[0] +sub v20.4s, v21.4s, v17.4s +mla v13.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v11.4s, v22.4s, v13.4s +mla v3.4S, v10.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v10.4s, v2.4s, v3.4s +mla v0.4S, v17.4S, v31.s[0] +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v17.4s, v14.4s, v0.4s +mla v8.4S, v13.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +sub v13.4s, v16.4s, v8.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v3.4s, v19.4s, v1.4s +mla v12.4S, v0.4S, v31.s[0] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v0.4s, v21.4s, v12.4s +mla v18.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v25.s[1] +mul v10.4S, v10.4S,v26.s[1] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v12.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v12.4s, v22.4s, v2.4s +mla v10.4S, v18.4S, v31.s[0] +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v13.4S, v25.s[3] +mul v13.4S, v13.4S,v26.s[3] +sub v18.4s, v11.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +str q22, [x0, #16] +sqrdmulh v22.4S, v21.4S, v23.s[0] +str q12, [x0, #80] +mul v21.4S, v21.4S,v24.s[0] +sub v12.4s, v14.4s, v16.4s +mla v13.4S, v2.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +str q11, [x0, #144] +sqrdmulh v11.4S, v0.4S, v23.s[1] +str q18, [x0, #208] +mul v0.4S, v0.4S,v24.s[1] +sub v18.4s, v17.4s, v13.4s +mla v21.4S, v22.4S, v31.s[0] +add v17.4s, v17.4s, v13.4s +str q14, [x0, #272] +sqrdmulh v14.4S, v20.4S, v23.s[2] +str q12, 
[x0, #336] +mul v20.4S, v20.4S,v24.s[2] +sub v12.4s, v19.4s, v21.4s +mla v0.4S, v11.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +str q17, [x0, #400] +sqrdmulh v17.4S, v1.4S, v23.s[3] +str q18, [x0, #464] +mul v1.4S, v1.4S,v24.s[3] +sub v18.4s, v3.4s, v0.4s +mla v20.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v0.4s +str q19, [x0, #528] +str q12, [x0, #592] +sub v12.4s, v9.4s, v20.4s +mla v1.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v20.4s +str q3, [x0, #656] +str q18, [x0, #720] +sub v18.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +str q9, [x0, #784] +str q12, [x0, #848] +str q8, [x0, #912] +str q18, [x0, #976] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q6, [x17, #+160] +ldr q7, [x17, #+176] +ldr q15, [x17, #+192] +ldr q10, [x17, #+208] +ldr q2, [x17, #+224] +ldr q16, [x17, #+240] +ldr q22, [x0, #32] +ldr q13, [x0, #48] +ldr q11, [x0, #0] +ldr q21, [x0, #16] +ldr q14, [x17, #+1152] +ldr q0, [x17, #+1168] +ldr q19, [x17, #+1184] +ldr q17, [x17, #+1200] +ldr q20, [x17, #+1216] +ldr q3, [x17, #+1232] +ldr q1, [x17, #+1248] +ldr q9, [x17, #+1264] +ldr q12, [x0, #544] +ldr q8, [x0, #560] +ldr q18, [x0, #512] +ldr q30, [x0, #528] +sqrdmulh v29.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v29.4S, v31.s[0] +sub v29.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v12.4S, v0.s[0] +mul v12.4S, v12.4S,v14.s[0] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v12.4s +add v18.4s, v18.4s, v12.4s +sqrdmulh v12.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v0.s[0] +mul v8.4S, v8.4S,v14.s[0] +mla v8.4S, v13.4S, v31.s[0] +sub v13.4s, v30.4s, v8.4s +add v30.4s, v30.4s, v8.4s +sqrdmulh v8.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v30.4S, v0.s[1] +mul v30.4S, v30.4S,v14.s[1] +mla v30.4S, v21.4S, v31.s[0] +sub v21.4s, v18.4s, v30.4s +add 
v18.4s, v18.4s, v30.4s +sqrdmulh v30.4S, v12.4S, v5.s[2] +mul v12.4S, v12.4S,v4.s[2] +mla v12.4S, v30.4S, v31.s[0] +sub v30.4s, v29.4s, v12.4s +add v29.4s, v29.4s, v12.4s +sqrdmulh v12.4S, v13.4S, v0.s[2] +mul v13.4S, v13.4S,v14.s[2] +mla v13.4S, v12.4S, v31.s[0] +sub v12.4s, v22.4s, v13.4s +add v22.4s, v22.4s, v13.4s +trn1 v13.4S, v11.4S, v8.4S +trn2 v28.4S, v11.4S, v8.4S +trn1 v27.4S, v29.4S, v30.4S +trn2 v26.4S, v29.4S, v30.4S +trn2 v29.2D, v13.2D, v27.2D +trn2 v30.2D, v28.2D, v26.2D +trn1 v11.2D, v13.2D, v27.2D +trn1 v8.2D, v28.2D, v26.2D +trn1 v26.4S, v18.4S, v21.4S +trn2 v28.4S, v18.4S, v21.4S +trn1 v27.4S, v22.4S, v12.4S +trn2 v13.4S, v22.4S, v12.4S +trn2 v22.2D, v26.2D, v27.2D +trn2 v12.2D, v28.2D, v13.2D +trn1 v18.2D, v26.2D, v27.2D +trn1 v21.2D, v28.2D, v13.2D +sqrdmulh v13.4S, v29.4S, v7.4S +mul v29.4S, v29.4S,v6.4S +mla v29.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v29.4s +add v11.4s, v11.4s, v29.4s +sqrdmulh v29.4S, v22.4S, v17.4S +mul v22.4S, v22.4S,v19.4S +mla v22.4S, v29.4S, v31.s[0] +sub v29.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v30.4S, v7.4S +mul v30.4S, v30.4S,v6.4S +mla v30.4S, v22.4S, v31.s[0] +sub v22.4s, v8.4s, v30.4s +add v8.4s, v8.4s, v30.4s +sqrdmulh v30.4S, v12.4S, v17.4S +mul v12.4S, v12.4S,v19.4S +mla v12.4S, v30.4S, v31.s[0] +sub v30.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v8.4S, v10.4S +mul v8.4S, v8.4S,v15.4S +mla v8.4S, v12.4S, v31.s[0] +sub v12.4s, v11.4s, v8.4s +add v11.4s, v11.4s, v8.4s +sqrdmulh v8.4S, v21.4S, v3.4S +mul v21.4S, v21.4S,v20.4S +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v16.4S +mul v22.4S, v22.4S,v2.4S +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v13.4s, v22.4s +add v13.4s, v13.4s, v22.4s +sqrdmulh v22.4S, v30.4S, v9.4S +mul v30.4S, v30.4S,v1.4S +mla v30.4S, v22.4S, v31.s[0] +sub v22.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +str q11, [x0, #0] +str q12, [x0, #16] +str q13, [x0, #32] +str q21, 
[x0, #48] +str q18, [x0, #512] +str q8, [x0, #528] +str q29, [x0, #544] +str q22, [x0, #560] +ldr q9, [x17, #+256] +ldr q1, [x17, #+272] +ldr q3, [x17, #+288] +ldr q20, [x17, #+304] +ldr q17, [x17, #+320] +ldr q19, [x17, #+336] +ldr q0, [x17, #+352] +ldr q14, [x17, #+368] +ldr q22, [x0, #96] +ldr q29, [x0, #112] +ldr q8, [x0, #64] +ldr q18, [x0, #80] +ldr q16, [x17, #+1280] +ldr q2, [x17, #+1296] +ldr q10, [x17, #+1312] +ldr q15, [x17, #+1328] +ldr q7, [x17, #+1344] +ldr q6, [x17, #+1360] +ldr q5, [x17, #+1376] +ldr q4, [x17, #+1392] +ldr q21, [x0, #608] +ldr q13, [x0, #624] +ldr q12, [x0, #576] +ldr q11, [x0, #592] +sqrdmulh v30.4S, v22.4S, v1.s[0] +mul v22.4S, v22.4S,v9.s[0] +mla v22.4S, v30.4S, v31.s[0] +sub v30.4s, v8.4s, v22.4s +add v8.4s, v8.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v2.s[0] +mul v21.4S, v21.4S,v16.s[0] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +sqrdmulh v21.4S, v29.4S, v1.s[0] +mul v29.4S, v29.4S,v9.s[0] +mla v29.4S, v21.4S, v31.s[0] +sub v21.4s, v18.4s, v29.4s +add v18.4s, v18.4s, v29.4s +sqrdmulh v29.4S, v13.4S, v2.s[0] +mul v13.4S, v13.4S,v16.s[0] +mla v13.4S, v29.4S, v31.s[0] +sub v29.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v1.s[1] +mul v18.4S, v18.4S,v9.s[1] +mla v18.4S, v13.4S, v31.s[0] +sub v13.4s, v8.4s, v18.4s +add v8.4s, v8.4s, v18.4s +sqrdmulh v18.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v16.s[1] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v11.4s +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v1.s[2] +mul v21.4S, v21.4S,v9.s[2] +mla v21.4S, v11.4S, v31.s[0] +sub v11.4s, v30.4s, v21.4s +add v30.4s, v30.4s, v21.4s +sqrdmulh v21.4S, v29.4S, v2.s[2] +mul v29.4S, v29.4S,v16.s[2] +mla v29.4S, v21.4S, v31.s[0] +sub v21.4s, v22.4s, v29.4s +add v22.4s, v22.4s, v29.4s +trn1 v29.4S, v8.4S, v13.4S +trn2 v28.4S, v8.4S, v13.4S +trn1 v27.4S, v30.4S, v11.4S +trn2 v26.4S, v30.4S, v11.4S +trn2 v30.2D, v29.2D, v27.2D +trn2 v11.2D, v28.2D, v26.2D +trn1 v8.2D, v29.2D, 
v27.2D +trn1 v13.2D, v28.2D, v26.2D +trn1 v26.4S, v12.4S, v18.4S +trn2 v28.4S, v12.4S, v18.4S +trn1 v27.4S, v22.4S, v21.4S +trn2 v29.4S, v22.4S, v21.4S +trn2 v22.2D, v26.2D, v27.2D +trn2 v21.2D, v28.2D, v29.2D +trn1 v12.2D, v26.2D, v27.2D +trn1 v18.2D, v28.2D, v29.2D +sqrdmulh v29.4S, v30.4S, v20.4S +mul v30.4S, v30.4S,v3.4S +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v8.4s, v30.4s +add v8.4s, v8.4s, v30.4s +sqrdmulh v30.4S, v22.4S, v15.4S +mul v22.4S, v22.4S,v10.4S +mla v22.4S, v30.4S, v31.s[0] +sub v30.4s, v12.4s, v22.4s +add v12.4s, v12.4s, v22.4s +sqrdmulh v22.4S, v11.4S, v20.4S +mul v11.4S, v11.4S,v3.4S +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v15.4S +mul v21.4S, v21.4S,v10.4S +mla v21.4S, v11.4S, v31.s[0] +sub v11.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v19.4S +mul v13.4S, v13.4S,v17.4S +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v6.4S +mul v18.4S, v18.4S,v7.4S +mla v18.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v18.4s +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v22.4S, v14.4S +mul v22.4S, v22.4S,v0.4S +mla v22.4S, v18.4S, v31.s[0] +sub v18.4s, v29.4s, v22.4s +add v29.4s, v29.4s, v22.4s +sqrdmulh v22.4S, v11.4S, v4.4S +mul v11.4S, v11.4S,v5.4S +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v30.4s, v11.4s +add v30.4s, v30.4s, v11.4s +str q8, [x0, #64] +str q21, [x0, #80] +str q29, [x0, #96] +str q18, [x0, #112] +str q12, [x0, #576] +str q13, [x0, #592] +str q30, [x0, #608] +str q22, [x0, #624] +ldr q4, [x17, #+384] +ldr q5, [x17, #+400] +ldr q6, [x17, #+416] +ldr q7, [x17, #+432] +ldr q15, [x17, #+448] +ldr q10, [x17, #+464] +ldr q2, [x17, #+480] +ldr q16, [x17, #+496] +ldr q22, [x0, #160] +ldr q30, [x0, #176] +ldr q13, [x0, #128] +ldr q12, [x0, #144] +ldr q14, [x17, #+1408] +ldr q0, [x17, #+1424] +ldr q19, [x17, #+1440] +ldr q17, [x17, #+1456] +ldr q20, [x17, #+1472] +ldr q3, [x17, #+1488] +ldr 
q1, [x17, #+1504] +ldr q9, [x17, #+1520] +ldr q18, [x0, #672] +ldr q29, [x0, #688] +ldr q21, [x0, #640] +ldr q8, [x0, #656] +sqrdmulh v11.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v13.4s, v22.4s +add v13.4s, v13.4s, v22.4s +sqrdmulh v22.4S, v18.4S, v0.s[0] +mul v18.4S, v18.4S,v14.s[0] +mla v18.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v18.4s +add v21.4s, v21.4s, v18.4s +sqrdmulh v18.4S, v30.4S, v5.s[0] +mul v30.4S, v30.4S,v4.s[0] +mla v30.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v30.4s +add v12.4s, v12.4s, v30.4s +sqrdmulh v30.4S, v29.4S, v0.s[0] +mul v29.4S, v29.4S,v14.s[0] +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v8.4s, v29.4s +add v8.4s, v8.4s, v29.4s +sqrdmulh v29.4S, v12.4S, v5.s[1] +mul v12.4S, v12.4S,v4.s[1] +mla v12.4S, v29.4S, v31.s[0] +sub v29.4s, v13.4s, v12.4s +add v13.4s, v13.4s, v12.4s +sqrdmulh v12.4S, v8.4S, v0.s[1] +mul v8.4S, v8.4S,v14.s[1] +mla v8.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v8.4s +add v21.4s, v21.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v5.s[2] +mul v18.4S, v18.4S,v4.s[2] +mla v18.4S, v8.4S, v31.s[0] +sub v8.4s, v11.4s, v18.4s +add v11.4s, v11.4s, v18.4s +sqrdmulh v18.4S, v30.4S, v0.s[2] +mul v30.4S, v30.4S,v14.s[2] +mla v30.4S, v18.4S, v31.s[0] +sub v18.4s, v22.4s, v30.4s +add v22.4s, v22.4s, v30.4s +trn1 v30.4S, v13.4S, v29.4S +trn2 v28.4S, v13.4S, v29.4S +trn1 v27.4S, v11.4S, v8.4S +trn2 v26.4S, v11.4S, v8.4S +trn2 v11.2D, v30.2D, v27.2D +trn2 v8.2D, v28.2D, v26.2D +trn1 v13.2D, v30.2D, v27.2D +trn1 v29.2D, v28.2D, v26.2D +trn1 v26.4S, v21.4S, v12.4S +trn2 v28.4S, v21.4S, v12.4S +trn1 v27.4S, v22.4S, v18.4S +trn2 v30.4S, v22.4S, v18.4S +trn2 v22.2D, v26.2D, v27.2D +trn2 v18.2D, v28.2D, v30.2D +trn1 v21.2D, v26.2D, v27.2D +trn1 v12.2D, v28.2D, v30.2D +sqrdmulh v30.4S, v11.4S, v7.4S +mul v11.4S, v11.4S,v6.4S +mla v11.4S, v30.4S, v31.s[0] +sub v30.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v17.4S +mul v22.4S, v22.4S,v19.4S +mla v22.4S, v11.4S, v31.s[0] +sub 
v11.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v8.4S, v7.4S +mul v8.4S, v8.4S,v6.4S +mla v8.4S, v22.4S, v31.s[0] +sub v22.4s, v29.4s, v8.4s +add v29.4s, v29.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v17.4S +mul v18.4S, v18.4S,v19.4S +mla v18.4S, v8.4S, v31.s[0] +sub v8.4s, v12.4s, v18.4s +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v29.4S, v10.4S +mul v29.4S, v29.4S,v15.4S +mla v29.4S, v18.4S, v31.s[0] +sub v18.4s, v13.4s, v29.4s +add v13.4s, v13.4s, v29.4s +sqrdmulh v29.4S, v12.4S, v3.4S +mul v12.4S, v12.4S,v20.4S +mla v12.4S, v29.4S, v31.s[0] +sub v29.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v22.4S, v16.4S +mul v22.4S, v22.4S,v2.4S +mla v22.4S, v12.4S, v31.s[0] +sub v12.4s, v30.4s, v22.4s +add v30.4s, v30.4s, v22.4s +sqrdmulh v22.4S, v8.4S, v9.4S +mul v8.4S, v8.4S,v1.4S +mla v8.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v8.4s +add v11.4s, v11.4s, v8.4s +str q13, [x0, #128] +str q18, [x0, #144] +str q30, [x0, #160] +str q12, [x0, #176] +str q21, [x0, #640] +str q29, [x0, #656] +str q11, [x0, #672] +str q22, [x0, #688] +ldr q9, [x17, #+512] +ldr q1, [x17, #+528] +ldr q3, [x17, #+544] +ldr q20, [x17, #+560] +ldr q17, [x17, #+576] +ldr q19, [x17, #+592] +ldr q0, [x17, #+608] +ldr q14, [x17, #+624] +ldr q22, [x0, #224] +ldr q11, [x0, #240] +ldr q29, [x0, #192] +ldr q21, [x0, #208] +ldr q16, [x17, #+1536] +ldr q2, [x17, #+1552] +ldr q10, [x17, #+1568] +ldr q15, [x17, #+1584] +ldr q7, [x17, #+1600] +ldr q6, [x17, #+1616] +ldr q5, [x17, #+1632] +ldr q4, [x17, #+1648] +ldr q12, [x0, #736] +ldr q30, [x0, #752] +ldr q18, [x0, #704] +ldr q13, [x0, #720] +sqrdmulh v8.4S, v22.4S, v1.s[0] +mul v22.4S, v22.4S,v9.s[0] +mla v22.4S, v8.4S, v31.s[0] +sub v8.4s, v29.4s, v22.4s +add v29.4s, v29.4s, v22.4s +sqrdmulh v22.4S, v12.4S, v2.s[0] +mul v12.4S, v12.4S,v16.s[0] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v12.4s +add v18.4s, v18.4s, v12.4s +sqrdmulh v12.4S, v11.4S, v1.s[0] +mul v11.4S, v11.4S,v9.s[0] +mla v11.4S, v12.4S, v31.s[0] +sub 
v12.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v30.4S, v2.s[0] +mul v30.4S, v30.4S,v16.s[0] +mla v30.4S, v11.4S, v31.s[0] +sub v11.4s, v13.4s, v30.4s +add v13.4s, v13.4s, v30.4s +sqrdmulh v30.4S, v21.4S, v1.s[1] +mul v21.4S, v21.4S,v9.s[1] +mla v21.4S, v30.4S, v31.s[0] +sub v30.4s, v29.4s, v21.4s +add v29.4s, v29.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v2.s[1] +mul v13.4S, v13.4S,v16.s[1] +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v18.4s, v13.4s +add v18.4s, v18.4s, v13.4s +sqrdmulh v13.4S, v12.4S, v1.s[2] +mul v12.4S, v12.4S,v9.s[2] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v8.4s, v12.4s +add v8.4s, v8.4s, v12.4s +sqrdmulh v12.4S, v11.4S, v2.s[2] +mul v11.4S, v11.4S,v16.s[2] +mla v11.4S, v12.4S, v31.s[0] +sub v12.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +trn1 v11.4S, v29.4S, v30.4S +trn2 v28.4S, v29.4S, v30.4S +trn1 v27.4S, v8.4S, v13.4S +trn2 v26.4S, v8.4S, v13.4S +trn2 v8.2D, v11.2D, v27.2D +trn2 v13.2D, v28.2D, v26.2D +trn1 v29.2D, v11.2D, v27.2D +trn1 v30.2D, v28.2D, v26.2D +trn1 v26.4S, v18.4S, v21.4S +trn2 v28.4S, v18.4S, v21.4S +trn1 v27.4S, v22.4S, v12.4S +trn2 v11.4S, v22.4S, v12.4S +trn2 v22.2D, v26.2D, v27.2D +trn2 v12.2D, v28.2D, v11.2D +trn1 v18.2D, v26.2D, v27.2D +trn1 v21.2D, v28.2D, v11.2D +sqrdmulh v11.4S, v8.4S, v20.4S +mul v8.4S, v8.4S,v3.4S +mla v8.4S, v11.4S, v31.s[0] +sub v11.4s, v29.4s, v8.4s +add v29.4s, v29.4s, v8.4s +sqrdmulh v8.4S, v22.4S, v15.4S +mul v22.4S, v22.4S,v10.4S +mla v22.4S, v8.4S, v31.s[0] +sub v8.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v20.4S +mul v13.4S, v13.4S,v3.4S +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v30.4s, v13.4s +add v30.4s, v30.4s, v13.4s +sqrdmulh v13.4S, v12.4S, v15.4S +mul v12.4S, v12.4S,v10.4S +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v30.4S, v19.4S +mul v30.4S, v30.4S,v17.4S +mla v30.4S, v12.4S, v31.s[0] +sub v12.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +sqrdmulh v30.4S, v21.4S, 
v6.4S +mul v21.4S, v21.4S,v7.4S +mla v21.4S, v30.4S, v31.s[0] +sub v30.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v14.4S +mul v22.4S, v22.4S,v0.4S +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v4.4S +mul v13.4S, v13.4S,v5.4S +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +str q29, [x0, #192] +str q12, [x0, #208] +str q11, [x0, #224] +str q21, [x0, #240] +str q18, [x0, #704] +str q30, [x0, #720] +str q8, [x0, #736] +str q22, [x0, #752] +ldr q4, [x17, #+640] +ldr q5, [x17, #+656] +ldr q6, [x17, #+672] +ldr q7, [x17, #+688] +ldr q15, [x17, #+704] +ldr q10, [x17, #+720] +ldr q2, [x17, #+736] +ldr q16, [x17, #+752] +ldr q22, [x0, #288] +ldr q8, [x0, #304] +ldr q30, [x0, #256] +ldr q18, [x0, #272] +ldr q14, [x17, #+1664] +ldr q0, [x17, #+1680] +ldr q19, [x17, #+1696] +ldr q17, [x17, #+1712] +ldr q20, [x17, #+1728] +ldr q3, [x17, #+1744] +ldr q1, [x17, #+1760] +ldr q9, [x17, #+1776] +ldr q21, [x0, #800] +ldr q11, [x0, #816] +ldr q12, [x0, #768] +ldr q29, [x0, #784] +sqrdmulh v13.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v13.4S, v31.s[0] +sub v13.4s, v30.4s, v22.4s +add v30.4s, v30.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v0.s[0] +mul v21.4S, v21.4S,v14.s[0] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +sqrdmulh v21.4S, v8.4S, v5.s[0] +mul v8.4S, v8.4S,v4.s[0] +mla v8.4S, v21.4S, v31.s[0] +sub v21.4s, v18.4s, v8.4s +add v18.4s, v18.4s, v8.4s +sqrdmulh v8.4S, v11.4S, v0.s[0] +mul v11.4S, v11.4S,v14.s[0] +mla v11.4S, v8.4S, v31.s[0] +sub v8.4s, v29.4s, v11.4s +add v29.4s, v29.4s, v11.4s +sqrdmulh v11.4S, v18.4S, v5.s[1] +mul v18.4S, v18.4S,v4.s[1] +mla v18.4S, v11.4S, v31.s[0] +sub v11.4s, v30.4s, v18.4s +add v30.4s, v30.4s, v18.4s +sqrdmulh v18.4S, v29.4S, v0.s[1] +mul v29.4S, v29.4S,v14.s[1] +mla v29.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v29.4s +add v12.4s, v12.4s, v29.4s +sqrdmulh 
v29.4S, v21.4S, v5.s[2] +mul v21.4S, v21.4S,v4.s[2] +mla v21.4S, v29.4S, v31.s[0] +sub v29.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v8.4S, v0.s[2] +mul v8.4S, v8.4S,v14.s[2] +mla v8.4S, v21.4S, v31.s[0] +sub v21.4s, v22.4s, v8.4s +add v22.4s, v22.4s, v8.4s +trn1 v8.4S, v30.4S, v11.4S +trn2 v28.4S, v30.4S, v11.4S +trn1 v27.4S, v13.4S, v29.4S +trn2 v26.4S, v13.4S, v29.4S +trn2 v13.2D, v8.2D, v27.2D +trn2 v29.2D, v28.2D, v26.2D +trn1 v30.2D, v8.2D, v27.2D +trn1 v11.2D, v28.2D, v26.2D +trn1 v26.4S, v12.4S, v18.4S +trn2 v28.4S, v12.4S, v18.4S +trn1 v27.4S, v22.4S, v21.4S +trn2 v8.4S, v22.4S, v21.4S +trn2 v22.2D, v26.2D, v27.2D +trn2 v21.2D, v28.2D, v8.2D +trn1 v12.2D, v26.2D, v27.2D +trn1 v18.2D, v28.2D, v8.2D +sqrdmulh v8.4S, v13.4S, v7.4S +mul v13.4S, v13.4S,v6.4S +mla v13.4S, v8.4S, v31.s[0] +sub v8.4s, v30.4s, v13.4s +add v30.4s, v30.4s, v13.4s +sqrdmulh v13.4S, v22.4S, v17.4S +mul v22.4S, v22.4S,v19.4S +mla v22.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v22.4s +add v12.4s, v12.4s, v22.4s +sqrdmulh v22.4S, v29.4S, v7.4S +mul v29.4S, v29.4S,v6.4S +mla v29.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v29.4s +add v11.4s, v11.4s, v29.4s +sqrdmulh v29.4S, v21.4S, v17.4S +mul v21.4S, v21.4S,v19.4S +mla v21.4S, v29.4S, v31.s[0] +sub v29.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v11.4S, v10.4S +mul v11.4S, v11.4S,v15.4S +mla v11.4S, v21.4S, v31.s[0] +sub v21.4s, v30.4s, v11.4s +add v30.4s, v30.4s, v11.4s +sqrdmulh v11.4S, v18.4S, v3.4S +mul v18.4S, v18.4S,v20.4S +mla v18.4S, v11.4S, v31.s[0] +sub v11.4s, v12.4s, v18.4s +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v22.4S, v16.4S +mul v22.4S, v22.4S,v2.4S +mla v22.4S, v18.4S, v31.s[0] +sub v18.4s, v8.4s, v22.4s +add v8.4s, v8.4s, v22.4s +sqrdmulh v22.4S, v29.4S, v9.4S +mul v29.4S, v29.4S,v1.4S +mla v29.4S, v22.4S, v31.s[0] +sub v22.4s, v13.4s, v29.4s +add v13.4s, v13.4s, v29.4s +str q30, [x0, #256] +str q21, [x0, #272] +str q8, [x0, #288] +str q18, [x0, #304] +str q12, [x0, #768] 
+str q11, [x0, #784] +str q13, [x0, #800] +str q22, [x0, #816] +ldr q9, [x17, #+768] +ldr q1, [x17, #+784] +ldr q3, [x17, #+800] +ldr q20, [x17, #+816] +ldr q17, [x17, #+832] +ldr q19, [x17, #+848] +ldr q0, [x17, #+864] +ldr q14, [x17, #+880] +ldr q22, [x0, #352] +ldr q13, [x0, #368] +ldr q11, [x0, #320] +ldr q12, [x0, #336] +ldr q16, [x17, #+1792] +ldr q2, [x17, #+1808] +ldr q10, [x17, #+1824] +ldr q15, [x17, #+1840] +ldr q7, [x17, #+1856] +ldr q6, [x17, #+1872] +ldr q5, [x17, #+1888] +ldr q4, [x17, #+1904] +ldr q18, [x0, #864] +ldr q8, [x0, #880] +ldr q21, [x0, #832] +ldr q30, [x0, #848] +sqrdmulh v29.4S, v22.4S, v1.s[0] +mul v22.4S, v22.4S,v9.s[0] +mla v22.4S, v29.4S, v31.s[0] +sub v29.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v18.4S, v2.s[0] +mul v18.4S, v18.4S,v16.s[0] +mla v18.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v18.4s +add v21.4s, v21.4s, v18.4s +sqrdmulh v18.4S, v13.4S, v1.s[0] +mul v13.4S, v13.4S,v9.s[0] +mla v13.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v13.4s +add v12.4s, v12.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v2.s[0] +mul v8.4S, v8.4S,v16.s[0] +mla v8.4S, v13.4S, v31.s[0] +sub v13.4s, v30.4s, v8.4s +add v30.4s, v30.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v1.s[1] +mul v12.4S, v12.4S,v9.s[1] +mla v12.4S, v8.4S, v31.s[0] +sub v8.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +sqrdmulh v12.4S, v30.4S, v2.s[1] +mul v30.4S, v30.4S,v16.s[1] +mla v30.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v30.4s +add v21.4s, v21.4s, v30.4s +sqrdmulh v30.4S, v18.4S, v1.s[2] +mul v18.4S, v18.4S,v9.s[2] +mla v18.4S, v30.4S, v31.s[0] +sub v30.4s, v29.4s, v18.4s +add v29.4s, v29.4s, v18.4s +sqrdmulh v18.4S, v13.4S, v2.s[2] +mul v13.4S, v13.4S,v16.s[2] +mla v13.4S, v18.4S, v31.s[0] +sub v18.4s, v22.4s, v13.4s +add v22.4s, v22.4s, v13.4s +trn1 v13.4S, v11.4S, v8.4S +trn2 v28.4S, v11.4S, v8.4S +trn1 v27.4S, v29.4S, v30.4S +trn2 v26.4S, v29.4S, v30.4S +trn2 v29.2D, v13.2D, v27.2D +trn2 v30.2D, v28.2D, v26.2D +trn1 v11.2D, v13.2D, v27.2D +trn1 v8.2D, v28.2D, 
v26.2D +trn1 v26.4S, v21.4S, v12.4S +trn2 v28.4S, v21.4S, v12.4S +trn1 v27.4S, v22.4S, v18.4S +trn2 v13.4S, v22.4S, v18.4S +trn2 v22.2D, v26.2D, v27.2D +trn2 v18.2D, v28.2D, v13.2D +trn1 v21.2D, v26.2D, v27.2D +trn1 v12.2D, v28.2D, v13.2D +sqrdmulh v13.4S, v29.4S, v20.4S +mul v29.4S, v29.4S,v3.4S +mla v29.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v29.4s +add v11.4s, v11.4s, v29.4s +sqrdmulh v29.4S, v22.4S, v15.4S +mul v22.4S, v22.4S,v10.4S +mla v22.4S, v29.4S, v31.s[0] +sub v29.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v30.4S, v20.4S +mul v30.4S, v30.4S,v3.4S +mla v30.4S, v22.4S, v31.s[0] +sub v22.4s, v8.4s, v30.4s +add v8.4s, v8.4s, v30.4s +sqrdmulh v30.4S, v18.4S, v15.4S +mul v18.4S, v18.4S,v10.4S +mla v18.4S, v30.4S, v31.s[0] +sub v30.4s, v12.4s, v18.4s +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v19.4S +mul v8.4S, v8.4S,v17.4S +mla v8.4S, v18.4S, v31.s[0] +sub v18.4s, v11.4s, v8.4s +add v11.4s, v11.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v6.4S +mul v12.4S, v12.4S,v7.4S +mla v12.4S, v8.4S, v31.s[0] +sub v8.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v22.4S, v14.4S +mul v22.4S, v22.4S,v0.4S +mla v22.4S, v12.4S, v31.s[0] +sub v12.4s, v13.4s, v22.4s +add v13.4s, v13.4s, v22.4s +sqrdmulh v22.4S, v30.4S, v4.4S +mul v30.4S, v30.4S,v5.4S +mla v30.4S, v22.4S, v31.s[0] +sub v22.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +str q11, [x0, #320] +str q18, [x0, #336] +str q13, [x0, #352] +str q12, [x0, #368] +str q21, [x0, #832] +str q8, [x0, #848] +str q29, [x0, #864] +str q22, [x0, #880] +ldr q4, [x17, #+896] +ldr q5, [x17, #+912] +ldr q6, [x17, #+928] +ldr q7, [x17, #+944] +ldr q15, [x17, #+960] +ldr q10, [x17, #+976] +ldr q2, [x17, #+992] +ldr q16, [x17, #+1008] +ldr q22, [x0, #416] +ldr q29, [x0, #432] +ldr q8, [x0, #384] +ldr q21, [x0, #400] +ldr q14, [x17, #+1920] +ldr q0, [x17, #+1936] +ldr q19, [x17, #+1952] +ldr q17, [x17, #+1968] +ldr q20, [x17, #+1984] +ldr q3, [x17, #+2000] +ldr q1, [x17, #+2016] +ldr q9, [x17, 
#+2032] +ldr q12, [x0, #928] +ldr q13, [x0, #944] +ldr q18, [x0, #896] +ldr q11, [x0, #912] +sqrdmulh v30.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v30.4S, v31.s[0] +sub v30.4s, v8.4s, v22.4s +add v8.4s, v8.4s, v22.4s +sqrdmulh v22.4S, v12.4S, v0.s[0] +mul v12.4S, v12.4S,v14.s[0] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v12.4s +add v18.4s, v18.4s, v12.4s +sqrdmulh v12.4S, v29.4S, v5.s[0] +mul v29.4S, v29.4S,v4.s[0] +mla v29.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v29.4s +add v21.4s, v21.4s, v29.4s +sqrdmulh v29.4S, v13.4S, v0.s[0] +mul v13.4S, v13.4S,v14.s[0] +mla v13.4S, v29.4S, v31.s[0] +sub v29.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v8.4s, v21.4s +add v8.4s, v8.4s, v21.4s +sqrdmulh v21.4S, v11.4S, v0.s[1] +mul v11.4S, v11.4S,v14.s[1] +mla v11.4S, v21.4S, v31.s[0] +sub v21.4s, v18.4s, v11.4s +add v18.4s, v18.4s, v11.4s +sqrdmulh v11.4S, v12.4S, v5.s[2] +mul v12.4S, v12.4S,v4.s[2] +mla v12.4S, v11.4S, v31.s[0] +sub v11.4s, v30.4s, v12.4s +add v30.4s, v30.4s, v12.4s +sqrdmulh v12.4S, v29.4S, v0.s[2] +mul v29.4S, v29.4S,v14.s[2] +mla v29.4S, v12.4S, v31.s[0] +sub v12.4s, v22.4s, v29.4s +add v22.4s, v22.4s, v29.4s +trn1 v29.4S, v8.4S, v13.4S +trn2 v28.4S, v8.4S, v13.4S +trn1 v27.4S, v30.4S, v11.4S +trn2 v26.4S, v30.4S, v11.4S +trn2 v30.2D, v29.2D, v27.2D +trn2 v11.2D, v28.2D, v26.2D +trn1 v8.2D, v29.2D, v27.2D +trn1 v13.2D, v28.2D, v26.2D +trn1 v26.4S, v18.4S, v21.4S +trn2 v28.4S, v18.4S, v21.4S +trn1 v27.4S, v22.4S, v12.4S +trn2 v29.4S, v22.4S, v12.4S +trn2 v22.2D, v26.2D, v27.2D +trn2 v12.2D, v28.2D, v29.2D +trn1 v18.2D, v26.2D, v27.2D +trn1 v21.2D, v28.2D, v29.2D +sqrdmulh v29.4S, v30.4S, v7.4S +mul v30.4S, v30.4S,v6.4S +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v8.4s, v30.4s +add v8.4s, v8.4s, v30.4s +sqrdmulh v30.4S, v22.4S, v17.4S +mul v22.4S, v22.4S,v19.4S +mla v22.4S, v30.4S, v31.s[0] +sub v30.4s, v18.4s, v22.4s +add 
v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v11.4S, v7.4S +mul v11.4S, v11.4S,v6.4S +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v12.4S, v17.4S +mul v12.4S, v12.4S,v19.4S +mla v12.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v13.4S, v10.4S +mul v13.4S, v13.4S,v15.4S +mla v13.4S, v12.4S, v31.s[0] +sub v12.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +sqrdmulh v13.4S, v21.4S, v3.4S +mul v21.4S, v21.4S,v20.4S +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v16.4S +mul v22.4S, v22.4S,v2.4S +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v29.4s, v22.4s +add v29.4s, v29.4s, v22.4s +sqrdmulh v22.4S, v11.4S, v9.4S +mul v11.4S, v11.4S,v1.4S +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v30.4s, v11.4s +add v30.4s, v30.4s, v11.4s +str q8, [x0, #384] +str q12, [x0, #400] +str q29, [x0, #416] +str q21, [x0, #432] +str q18, [x0, #896] +str q13, [x0, #912] +str q30, [x0, #928] +str q22, [x0, #944] +ldr q9, [x17, #+1024] +ldr q1, [x17, #+1040] +ldr q3, [x17, #+1056] +ldr q20, [x17, #+1072] +ldr q17, [x17, #+1088] +ldr q19, [x17, #+1104] +ldr q0, [x17, #+1120] +ldr q14, [x17, #+1136] +ldr q22, [x0, #480] +ldr q30, [x0, #496] +ldr q13, [x0, #448] +ldr q18, [x0, #464] +ldr q16, [x17, #+2048] +ldr q2, [x17, #+2064] +ldr q10, [x17, #+2080] +ldr q15, [x17, #+2096] +ldr q7, [x17, #+2112] +ldr q6, [x17, #+2128] +ldr q5, [x17, #+2144] +ldr q4, [x17, #+2160] +ldr q21, [x0, #992] +ldr q29, [x0, #1008] +ldr q12, [x0, #960] +ldr q8, [x0, #976] +sqrdmulh v11.4S, v22.4S, v1.s[0] +mul v22.4S, v22.4S,v9.s[0] +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v13.4s, v22.4s +add v13.4s, v13.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v2.s[0] +mul v21.4S, v21.4S,v16.s[0] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +sqrdmulh v21.4S, v30.4S, v1.s[0] +mul v30.4S, v30.4S,v9.s[0] +mla v30.4S, v21.4S, v31.s[0] +sub 
v21.4s, v18.4s, v30.4s +add v18.4s, v18.4s, v30.4s +sqrdmulh v30.4S, v29.4S, v2.s[0] +mul v29.4S, v29.4S,v16.s[0] +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v8.4s, v29.4s +add v8.4s, v8.4s, v29.4s +sqrdmulh v29.4S, v18.4S, v1.s[1] +mul v18.4S, v18.4S,v9.s[1] +mla v18.4S, v29.4S, v31.s[0] +sub v29.4s, v13.4s, v18.4s +add v13.4s, v13.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v2.s[1] +mul v8.4S, v8.4S,v16.s[1] +mla v8.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v8.4s +add v12.4s, v12.4s, v8.4s +sqrdmulh v8.4S, v21.4S, v1.s[2] +mul v21.4S, v21.4S,v9.s[2] +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v30.4S, v2.s[2] +mul v30.4S, v30.4S,v16.s[2] +mla v30.4S, v21.4S, v31.s[0] +sub v21.4s, v22.4s, v30.4s +add v22.4s, v22.4s, v30.4s +trn1 v30.4S, v13.4S, v29.4S +trn2 v28.4S, v13.4S, v29.4S +trn1 v27.4S, v11.4S, v8.4S +trn2 v26.4S, v11.4S, v8.4S +trn2 v11.2D, v30.2D, v27.2D +trn2 v8.2D, v28.2D, v26.2D +trn1 v13.2D, v30.2D, v27.2D +trn1 v29.2D, v28.2D, v26.2D +trn1 v26.4S, v12.4S, v18.4S +trn2 v28.4S, v12.4S, v18.4S +trn1 v27.4S, v22.4S, v21.4S +trn2 v30.4S, v22.4S, v21.4S +trn2 v22.2D, v26.2D, v27.2D +trn2 v21.2D, v28.2D, v30.2D +trn1 v12.2D, v26.2D, v27.2D +trn1 v18.2D, v28.2D, v30.2D +sqrdmulh v30.4S, v11.4S, v20.4S +mul v11.4S, v11.4S,v3.4S +mla v11.4S, v30.4S, v31.s[0] +sub v30.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v15.4S +mul v22.4S, v22.4S,v10.4S +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v12.4s, v22.4s +add v12.4s, v12.4s, v22.4s +sqrdmulh v22.4S, v8.4S, v20.4S +mul v8.4S, v8.4S,v3.4S +mla v8.4S, v22.4S, v31.s[0] +sub v22.4s, v29.4s, v8.4s +add v29.4s, v29.4s, v8.4s +sqrdmulh v8.4S, v21.4S, v15.4S +mul v21.4S, v21.4S,v10.4S +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v29.4S, v19.4S +mul v29.4S, v29.4S,v17.4S +mla v29.4S, v21.4S, v31.s[0] +sub v21.4s, v13.4s, v29.4s +add v13.4s, v13.4s, v29.4s +sqrdmulh v29.4S, v18.4S, v6.4S +mul 
v18.4S, v18.4S,v7.4S +mla v18.4S, v29.4S, v31.s[0] +sub v29.4s, v12.4s, v18.4s +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v22.4S, v14.4S +mul v22.4S, v22.4S,v0.4S +mla v22.4S, v18.4S, v31.s[0] +sub v18.4s, v30.4s, v22.4s +add v30.4s, v30.4s, v22.4s +sqrdmulh v22.4S, v8.4S, v4.4S +mul v8.4S, v8.4S,v5.4S +mla v8.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v8.4s +add v11.4s, v11.4s, v8.4s +str q13, [x0, #448] +str q21, [x0, #464] +str q30, [x0, #480] +str q18, [x0, #496] +str q12, [x0, #960] +str q29, [x0, #976] +str q11, [x0, #992] +str q22, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_2.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_2.s new file mode 100644 index 0000000..61444d0 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_2.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. 
+/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include // NOTE(review): the header name is missing on this line (looks stripped during extraction); restore it from the generator output. The ASM_LOAD macro used by the routine in this file is presumably defined there. +modulus: // Modulus constant: one 16-byte vector that the routine in this file loads whole into q31. +.word -33556993 // Lane 0: the negated modulus; the visible code uses it as v31.s[0], the multiplier of every mla step. +.word 0 // Lanes 1-3 are zero; only v31.s[0] is referenced in the visible part of the routine. +.word 0 +.word 0 +.align 6 // Aligns the table below; on AArch64 the .align argument is a power of two, so this is 64 bytes. +roots_merged: // Twiddle table laid out as alternating 4-word groups: four root words, then four companion words carrying the same Layer/block tags. The routine multiplies by the root group with mul and by the companion group with sqrdmulh. The companions look like round(root * 2^31 / 33556993) - TODO confirm against the generator. +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None 
+.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 
7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 
// Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 
+.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // 
Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, 
block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 
839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 +.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, 
block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 
104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 
// Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_3_z2_2 +.global _ntt_u32_full_neon_asm_var_4_4_3_z2_2 +ntt_u32_full_neon_asm_var_4_4_3_z2_2: +_ntt_u32_full_neon_asm_var_4_4_3_z2_2: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +sqrdmulh 
v2.4S, v22.4S, v29.s[0] +ldr q1, [x0, #544] +mul v22.4S, v22.4S,v30.s[0] +ldr q0, [x0, #608] +sqrdmulh v15.4S, v21.4S, v29.s[0] +ldr q14, [x0, #672] +mul v21.4S, v21.4S,v30.s[0] +ldr q13, [x0, #736] +mla v22.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q12, [x0, #32] +sub v11.4s, v18.4s, v22.4s +mla v21.4S, v15.4S, v31.s[0] +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q15, [x0, #96] +sub v10.4s, v17.4s, v21.4s +mla v20.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v29.s[0] +ldr q2, [x0, #160] +mul v1.4S, v1.4S,v30.s[0] +sub v9.4s, v16.4s, v20.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v29.s[0] +ldr q22, [x0, #224] +mul v0.4S, v0.4S,v30.s[0] +sub v8.4s, v3.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v12.4s, v1.4s +mla v0.4S, v20.4S, v31.s[0] +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +sub v20.4s, v15.4s, v0.4s +mla v14.4S, v19.4S, v31.s[0] +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v19.4s, v2.4s, v14.4s +mla v13.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v1.4s, v22.4s, v13.4s +mla v16.4S, v0.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v0.4s, v2.4s, v16.4s +mla v3.4S, v14.4S, v31.s[0] +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v22.4s, v3.4s +mla v18.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v13.4s, v12.4s, v18.4s +mla v17.4S, v16.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v16.4s, v15.4s, 
v17.4s +mla v9.4S, v3.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v3.4s, v19.4s, v9.4s +mla v8.4S, v18.4S, v31.s[0] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v8.4s +mla v11.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v17.4s, v21.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +sub v9.4s, v20.4s, v10.4s +mla v2.4S, v8.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v8.4s, v12.4s, v2.4s +mla v22.4S, v11.4S, v31.s[0] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v27.s[1] +mul v14.4S, v14.4S,v28.s[1] +sub v11.4s, v15.4s, v22.4s +mla v0.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v28.s[2] +sub v10.4s, v13.4s, v0.4s +mla v14.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v2.4s, v16.4s, v14.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +sub v22.4s, v21.4s, v19.4s +mla v1.4S, v0.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v0.4s, v20.4s, v1.4s +mla v3.4S, v14.4S, v31.s[0] +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +sub v14.4s, v17.4s, v3.4s +mla v18.4S, v19.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v25.s[1] +mul v11.4S, v11.4S,v26.s[1] +sub v19.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v1.4s, v12.4s, v15.4s +mla v11.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v25.s[3] +mul 
v2.4S, v2.4S,v26.s[3] +sub v3.4s, v8.4s, v11.4s +mla v16.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v11.4s +str q12, [x0, #32] +sqrdmulh v12.4S, v20.4S, v23.s[0] +str q1, [x0, #96] +mul v20.4S, v20.4S,v24.s[0] +ldr q1, [x0, #816] +sub v11.4s, v13.4s, v16.4s +ldr q18, [x0, #880] +mla v2.4S, v15.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +str q8, [x0, #160] +sqrdmulh v8.4S, v0.4S, v23.s[1] +str q3, [x0, #224] +mul v0.4S, v0.4S,v24.s[1] +ldr q3, [x0, #944] +sub v16.4s, v10.4s, v2.4s +ldr q15, [x0, #1008] +mla v20.4S, v12.4S, v31.s[0] +add v10.4s, v10.4s, v2.4s +str q13, [x0, #288] +sqrdmulh v13.4S, v9.4S, v23.s[2] +str q11, [x0, #352] +mul v9.4S, v9.4S,v24.s[2] +ldr q11, [x0, #304] +sub v2.4s, v21.4s, v20.4s +ldr q12, [x0, #368] +mla v0.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v20.4s +str q10, [x0, #416] +sqrdmulh v10.4S, v19.4S, v23.s[3] +str q16, [x0, #480] +mul v19.4S, v19.4S,v24.s[3] +ldr q16, [x0, #432] +sub v20.4s, v22.4s, v0.4s +ldr q8, [x0, #496] +mla v9.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +str q21, [x0, #544] +sqrdmulh v21.4S, v1.4S, v29.s[0] +str q2, [x0, #608] +ldr q2, [x0, #560] +mul v1.4S, v1.4S,v30.s[0] +ldr q0, [x0, #624] +sub v13.4s, v17.4s, v9.4s +mla v19.4S, v10.4S, v31.s[0] +add v17.4s, v17.4s, v9.4s +str q22, [x0, #672] +sqrdmulh v22.4S, v18.4S, v29.s[0] +str q20, [x0, #736] +ldr q20, [x0, #688] +mul v18.4S, v18.4S,v30.s[0] +ldr q9, [x0, #752] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +str q17, [x0, #800] +sqrdmulh v17.4S, v3.4S, v29.s[0] +str q13, [x0, #864] +mul v3.4S, v3.4S,v30.s[0] +ldr q13, [x0, #48] +sub v19.4s, v11.4s, v1.4s +mla v18.4S, v22.4S, v31.s[0] +add v11.4s, v11.4s, v1.4s +str q14, [x0, #928] +sqrdmulh v14.4S, v15.4S, v29.s[0] +str q10, [x0, #992] +mul v15.4S, v15.4S,v30.s[0] +ldr q10, [x0, #112] +sub v1.4s, v12.4s, v18.4s +mla v3.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v29.s[0] +ldr q17, [x0, #176] +mul v2.4S, v2.4S,v30.s[0] +sub v22.4s, v16.4s, 
v3.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v29.s[0] +ldr q14, [x0, #240] +mul v0.4S, v0.4S,v30.s[0] +sub v21.4s, v8.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v18.4s, v13.4s, v2.4s +mla v0.4S, v3.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +sub v3.4s, v10.4s, v0.4s +mla v20.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v15.4s, v17.4s, v20.4s +mla v9.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v2.4s, v14.4s, v9.4s +mla v16.4S, v0.4S, v31.s[0] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v29.s[1] +mul v11.4S, v11.4S,v30.s[1] +sub v0.4s, v17.4s, v16.4s +mla v8.4S, v20.4S, v31.s[0] +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v29.s[1] +mul v12.4S, v12.4S,v30.s[1] +sub v20.4s, v14.4s, v8.4s +mla v11.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v9.4s, v13.4s, v11.4s +mla v12.4S, v16.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v16.4s, v10.4s, v12.4s +mla v22.4S, v8.4S, v31.s[0] +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v8.4s, v15.4s, v22.4s +mla v21.4S, v11.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +sub v11.4s, v2.4s, v21.4s +mla v19.4S, v12.4S, v31.s[0] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +sub v12.4s, v18.4s, v19.4s +mla v1.4S, v22.4S, v31.s[0] +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v22.4s, v3.4s, v1.4s +mla v17.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v1.4s +sqrdmulh v1.4S, 
v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v21.4s, v13.4s, v17.4s +mla v14.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +sub v19.4s, v10.4s, v14.4s +mla v0.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v27.s[2] +mul v15.4S, v15.4S,v28.s[2] +sub v1.4s, v9.4s, v0.4s +mla v20.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v17.4s, v16.4s, v20.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v14.4s, v18.4s, v15.4s +mla v2.4S, v0.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[3] +mul v11.4S, v11.4S,v28.s[3] +sub v0.4s, v3.4s, v2.4s +mla v8.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +sub v20.4s, v12.4s, v8.4s +mla v11.4S, v15.4S, v31.s[0] +add v12.4s, v12.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v15.4s, v22.4s, v11.4s +mla v10.4S, v2.4S, v31.s[0] +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v2.4s, v13.4s, v10.4s +mla v19.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v25.s[3] +mul v17.4S, v17.4S,v26.s[3] +sub v8.4s, v21.4s, v19.4s +mla v16.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +str q13, [x0, #48] +sqrdmulh v13.4S, v3.4S, v23.s[0] +str q2, [x0, #112] +mul v3.4S, v3.4S,v24.s[0] +ldr q2, [x0, #768] +sub v19.4s, v9.4s, v16.4s +ldr q11, [x0, #832] +mla v17.4S, v10.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +str q21, [x0, #176] +sqrdmulh v21.4S, v0.4S, v23.s[1] +str q8, [x0, #240] +mul v0.4S, v0.4S,v24.s[1] +ldr q8, [x0, #896] +sub v16.4s, v1.4s, v17.4s +ldr q10, [x0, #960] +mla v3.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +str q9, [x0, #304] +sqrdmulh v9.4S, v22.4S, v23.s[2] +str q19, [x0, #368] +mul v22.4S, 
v22.4S,v24.s[2] +ldr q19, [x0, #256] +sub v17.4s, v18.4s, v3.4s +ldr q13, [x0, #320] +mla v0.4S, v21.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +str q1, [x0, #432] +sqrdmulh v1.4S, v15.4S, v23.s[3] +str q16, [x0, #496] +mul v15.4S, v15.4S,v24.s[3] +ldr q16, [x0, #384] +sub v3.4s, v14.4s, v0.4s +ldr q21, [x0, #448] +mla v22.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +str q18, [x0, #560] +sqrdmulh v18.4S, v2.4S, v29.s[0] +str q17, [x0, #624] +ldr q17, [x0, #512] +mul v2.4S, v2.4S,v30.s[0] +ldr q0, [x0, #576] +sub v9.4s, v12.4s, v22.4s +mla v15.4S, v1.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s +str q14, [x0, #688] +sqrdmulh v14.4S, v11.4S, v29.s[0] +str q3, [x0, #752] +ldr q3, [x0, #640] +mul v11.4S, v11.4S,v30.s[0] +ldr q22, [x0, #704] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +str q12, [x0, #816] +sqrdmulh v12.4S, v8.4S, v29.s[0] +str q9, [x0, #880] +mul v8.4S, v8.4S,v30.s[0] +ldr q9, [x0, #0] +sub v15.4s, v19.4s, v2.4s +mla v11.4S, v14.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +str q20, [x0, #944] +sqrdmulh v20.4S, v10.4S, v29.s[0] +str q1, [x0, #1008] +mul v10.4S, v10.4S,v30.s[0] +ldr q1, [x0, #64] +sub v2.4s, v13.4s, v11.4s +mla v8.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v29.s[0] +ldr q12, [x0, #128] +mul v17.4S, v17.4S,v30.s[0] +sub v14.4s, v16.4s, v8.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v29.s[0] +ldr q20, [x0, #192] +mul v0.4S, v0.4S,v30.s[0] +sub v18.4s, v21.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +sub v11.4s, v9.4s, v17.4s +mla v0.4S, v8.4S, v31.s[0] +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v8.4s, v1.4s, v0.4s +mla v3.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v10.4s, v12.4s, v3.4s +mla v22.4S, v17.4S, v31.s[0] +add v12.4s, 
v12.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v17.4s, v20.4s, v22.4s +mla v16.4S, v0.4S, v31.s[0] +add v20.4s, v20.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +sub v0.4s, v12.4s, v16.4s +mla v21.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v3.4s, v20.4s, v21.4s +mla v19.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v22.4s, v9.4s, v19.4s +mla v13.4S, v16.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v16.4s, v1.4s, v13.4s +mla v14.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v21.4s, v10.4s, v14.4s +mla v18.4S, v19.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v19.4s, v17.4s, v18.4s +mla v15.4S, v13.4S, v31.s[0] +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v13.4s, v11.4s, v15.4s +mla v2.4S, v14.4S, v31.s[0] +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v27.s[0] +mul v20.4S, v20.4S,v28.s[0] +sub v14.4s, v8.4s, v2.4s +mla v12.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v18.4s, v9.4s, v12.4s +mla v20.4S, v15.4S, v31.s[0] +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +sub v15.4s, v1.4s, v20.4s +mla v0.4S, v2.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +sub v2.4s, v22.4s, v0.4s +mla v3.4S, v12.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v12.4s, v16.4s, v3.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, 
v11.4s, v10.4s +mla v17.4S, v0.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v27.s[3] +mul v19.4S, v19.4S,v28.s[3] +sub v0.4s, v8.4s, v17.4s +mla v21.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v17.4s +sqrdmulh v17.4S, v1.4S, v25.s[0] +mul v1.4S, v1.4S,v26.s[0] +sub v3.4s, v13.4s, v21.4s +mla v19.4S, v10.4S, v31.s[0] +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v25.s[1] +mul v15.4S, v15.4S,v26.s[1] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v17.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v17.4s, v9.4s, v1.4s +mla v15.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +sub v21.4s, v18.4s, v15.4s +mla v16.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +str q9, [x0, #0] +sqrdmulh v9.4S, v8.4S, v23.s[0] +str q17, [x0, #64] +mul v8.4S, v8.4S,v24.s[0] +ldr q17, [x0, #784] +sub v15.4s, v22.4s, v16.4s +ldr q19, [x0, #848] +mla v12.4S, v1.4S, v31.s[0] +add v22.4s, v22.4s, v16.4s +str q18, [x0, #128] +sqrdmulh v18.4S, v0.4S, v23.s[1] +str q21, [x0, #192] +mul v0.4S, v0.4S,v24.s[1] +ldr q21, [x0, #912] +sub v16.4s, v2.4s, v12.4s +ldr q1, [x0, #976] +mla v8.4S, v9.4S, v31.s[0] +add v2.4s, v2.4s, v12.4s +str q22, [x0, #256] +sqrdmulh v22.4S, v14.4S, v23.s[2] +str q15, [x0, #320] +mul v14.4S, v14.4S,v24.s[2] +ldr q15, [x0, #272] +sub v12.4s, v11.4s, v8.4s +ldr q9, [x0, #336] +mla v0.4S, v18.4S, v31.s[0] +add v11.4s, v11.4s, v8.4s +str q2, [x0, #384] +sqrdmulh v2.4S, v10.4S, v23.s[3] +str q16, [x0, #448] +mul v10.4S, v10.4S,v24.s[3] +ldr q16, [x0, #400] +sub v8.4s, v20.4s, v0.4s +ldr q18, [x0, #464] +mla v14.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v0.4s +str q11, [x0, #512] +sqrdmulh v11.4S, v17.4S, v29.s[0] +str q12, [x0, #576] +ldr q12, [x0, #528] +mul v17.4S, v17.4S,v30.s[0] +ldr q0, [x0, #592] +sub v22.4s, v13.4s, v14.4s +mla v10.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v14.4s +str q20, [x0, #640] +sqrdmulh v20.4S, v19.4S, 
v29.s[0] +str q8, [x0, #704] +ldr q8, [x0, #656] +mul v19.4S, v19.4S,v30.s[0] +ldr q14, [x0, #720] +sub v2.4s, v3.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v3.4s, v3.4s, v10.4s +str q13, [x0, #768] +sqrdmulh v13.4S, v21.4S, v29.s[0] +str q22, [x0, #832] +mul v21.4S, v21.4S,v30.s[0] +ldr q22, [x0, #16] +sub v10.4s, v15.4s, v17.4s +mla v19.4S, v20.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +str q3, [x0, #896] +sqrdmulh v3.4S, v1.4S, v29.s[0] +str q2, [x0, #960] +mul v1.4S, v1.4S,v30.s[0] +ldr q2, [x0, #80] +sub v17.4s, v9.4s, v19.4s +mla v21.4S, v13.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v12.4S, v29.s[0] +ldr q13, [x0, #144] +mul v12.4S, v12.4S,v30.s[0] +sub v20.4s, v16.4s, v21.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v29.s[0] +ldr q3, [x0, #208] +mul v0.4S, v0.4S,v30.s[0] +sub v11.4s, v18.4s, v1.4s +mla v12.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v19.4s, v22.4s, v12.4s +mla v0.4S, v21.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v2.4s, v0.4s +mla v8.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v1.4s, v13.4s, v8.4s +mla v14.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v12.4s, v3.4s, v14.4s +mla v16.4S, v0.4S, v31.s[0] +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +sub v0.4s, v13.4s, v16.4s +mla v18.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v9.4S, v29.s[1] +mul v9.4S, v9.4S,v30.s[1] +sub v8.4s, v3.4s, v18.4s +mla v15.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v14.4s, v22.4s, v15.4s +mla v9.4S, v16.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v29.s[2] +mul v11.4S, 
v11.4S,v30.s[2] +sub v16.4s, v2.4s, v9.4s +mla v20.4S, v18.4S, v31.s[0] +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v20.4s +mla v11.4S, v15.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v15.4s, v12.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +sub v9.4s, v19.4s, v10.4s +mla v17.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v27.s[0] +mul v3.4S, v3.4S,v28.s[0] +sub v20.4s, v21.4s, v17.4s +mla v13.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v11.4s, v22.4s, v13.4s +mla v3.4S, v10.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v10.4s, v2.4s, v3.4s +mla v0.4S, v17.4S, v31.s[0] +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v17.4s, v14.4s, v0.4s +mla v8.4S, v13.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +sub v13.4s, v16.4s, v8.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v3.4s, v19.4s, v1.4s +mla v12.4S, v0.4S, v31.s[0] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v0.4s, v21.4s, v12.4s +mla v18.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v25.s[1] +mul v10.4S, v10.4S,v26.s[1] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v12.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v12.4s, v22.4s, v2.4s +mla v10.4S, v18.4S, v31.s[0] +add v22.4s, v22.4s, v2.4s 
+sqrdmulh v2.4S, v13.4S, v25.s[3] +mul v13.4S, v13.4S,v26.s[3] +sub v18.4s, v11.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +str q22, [x0, #16] +sqrdmulh v22.4S, v21.4S, v23.s[0] +str q12, [x0, #80] +mul v21.4S, v21.4S,v24.s[0] +sub v12.4s, v14.4s, v16.4s +mla v13.4S, v2.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +str q11, [x0, #144] +sqrdmulh v11.4S, v0.4S, v23.s[1] +str q18, [x0, #208] +mul v0.4S, v0.4S,v24.s[1] +sub v18.4s, v17.4s, v13.4s +mla v21.4S, v22.4S, v31.s[0] +add v17.4s, v17.4s, v13.4s +str q14, [x0, #272] +sqrdmulh v14.4S, v20.4S, v23.s[2] +str q12, [x0, #336] +mul v20.4S, v20.4S,v24.s[2] +sub v12.4s, v19.4s, v21.4s +mla v0.4S, v11.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +str q17, [x0, #400] +sqrdmulh v17.4S, v1.4S, v23.s[3] +str q18, [x0, #464] +mul v1.4S, v1.4S,v24.s[3] +sub v18.4s, v3.4s, v0.4s +mla v20.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v0.4s +str q19, [x0, #528] +str q12, [x0, #592] +sub v12.4s, v9.4s, v20.4s +mla v1.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v20.4s +str q3, [x0, #656] +str q18, [x0, #720] +sub v18.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +str q9, [x0, #784] +str q12, [x0, #848] +str q8, [x0, #912] +str q18, [x0, #976] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q6, [x17, #+160] +ldr q7, [x17, #+176] +ldr q15, [x17, #+192] +ldr q10, [x17, #+208] +ldr q2, [x17, #+224] +ldr q16, [x17, #+240] +ldr q22, [x0, #32] +ldr q13, [x0, #48] +ldr q11, [x0, #0] +ldr q21, [x0, #16] +ldr q14, [x17, #+1152] +ldr q0, [x17, #+1168] +ldr q19, [x17, #+1184] +ldr q17, [x17, #+1200] +ldr q20, [x17, #+1216] +ldr q3, [x17, #+1232] +ldr q1, [x17, #+1248] +ldr q9, [x17, #+1264] +ldr q12, [x0, #544] +ldr q8, [x0, #560] +ldr q18, [x0, #512] +ldr q30, [x0, #528] +sqrdmulh v29.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v29.4S, v31.s[0] +sub v29.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v13.4s 
+add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v12.4S, v0.s[0] +mul v12.4S, v12.4S,v14.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v18.4s, v12.4s +add v18.4s, v18.4s, v12.4s +sqrdmulh v12.4S, v8.4S, v0.s[0] +mul v8.4S, v8.4S,v14.s[0] +mla v8.4S, v12.4S, v31.s[0] +sub v12.4s, v30.4s, v8.4s +add v30.4s, v30.4s, v8.4s +sqrdmulh v8.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v5.s[2] +mul v22.4S, v22.4S,v4.s[2] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v29.4s, v22.4s +add v29.4s, v29.4s, v22.4s +sqrdmulh v22.4S, v30.4S, v0.s[1] +mul v30.4S, v30.4S,v14.s[1] +mla v30.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v30.4s +add v18.4s, v18.4s, v30.4s +sqrdmulh v30.4S, v12.4S, v0.s[2] +mul v12.4S, v12.4S,v14.s[2] +mla v12.4S, v30.4S, v31.s[0] +sub v30.4s, v13.4s, v12.4s +add v13.4s, v13.4s, v12.4s +trn1 v12.4S, v11.4S, v8.4S +trn2 v28.4S, v11.4S, v8.4S +trn1 v27.4S, v29.4S, v21.4S +trn2 v26.4S, v29.4S, v21.4S +trn2 v29.2D, v12.2D, v27.2D +trn2 v21.2D, v28.2D, v26.2D +trn1 v11.2D, v12.2D, v27.2D +trn1 v8.2D, v28.2D, v26.2D +trn1 v26.4S, v18.4S, v22.4S +trn2 v28.4S, v18.4S, v22.4S +trn1 v27.4S, v13.4S, v30.4S +trn2 v12.4S, v13.4S, v30.4S +trn2 v13.2D, v26.2D, v27.2D +trn2 v30.2D, v28.2D, v12.2D +trn1 v18.2D, v26.2D, v27.2D +trn1 v22.2D, v28.2D, v12.2D +sqrdmulh v12.4S, v29.4S, v7.4S +mul v29.4S, v29.4S,v6.4S +mla v29.4S, v12.4S, v31.s[0] +sub v12.4s, v11.4s, v29.4s +add v11.4s, v11.4s, v29.4s +sqrdmulh v29.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v29.4S, v31.s[0] +sub v29.4s, v8.4s, v21.4s +add v8.4s, v8.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v17.4S +mul v13.4S, v13.4S,v19.4S +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v18.4s, v13.4s +add v18.4s, v18.4s, v13.4s +sqrdmulh v13.4S, v30.4S, v17.4S +mul v30.4S, v30.4S,v19.4S +mla v30.4S, v13.4S, v31.s[0] +sub v13.4s, v22.4s, v30.4s +add v22.4s, v22.4s, v30.4s +sqrdmulh v30.4S, v8.4S, v10.4S +mul v8.4S, 
v8.4S,v15.4S +mla v8.4S, v30.4S, v31.s[0] +sub v30.4s, v11.4s, v8.4s +add v11.4s, v11.4s, v8.4s +sqrdmulh v8.4S, v29.4S, v16.4S +mul v29.4S, v29.4S,v2.4S +mla v29.4S, v8.4S, v31.s[0] +sub v8.4s, v12.4s, v29.4s +add v12.4s, v12.4s, v29.4s +sqrdmulh v29.4S, v22.4S, v3.4S +mul v22.4S, v22.4S,v20.4S +mla v22.4S, v29.4S, v31.s[0] +sub v29.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v9.4S +mul v13.4S, v13.4S,v1.4S +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +str q11, [x0, #0] +str q30, [x0, #16] +str q12, [x0, #32] +str q8, [x0, #48] +str q18, [x0, #512] +str q29, [x0, #528] +str q21, [x0, #544] +str q22, [x0, #560] +ldr q9, [x17, #+256] +ldr q1, [x17, #+272] +ldr q3, [x17, #+288] +ldr q20, [x17, #+304] +ldr q17, [x17, #+320] +ldr q19, [x17, #+336] +ldr q0, [x17, #+352] +ldr q14, [x17, #+368] +ldr q22, [x0, #96] +ldr q21, [x0, #112] +ldr q29, [x0, #64] +ldr q18, [x0, #80] +ldr q16, [x17, #+1280] +ldr q2, [x17, #+1296] +ldr q10, [x17, #+1312] +ldr q15, [x17, #+1328] +ldr q7, [x17, #+1344] +ldr q6, [x17, #+1360] +ldr q5, [x17, #+1376] +ldr q4, [x17, #+1392] +ldr q8, [x0, #608] +ldr q12, [x0, #624] +ldr q30, [x0, #576] +ldr q11, [x0, #592] +sqrdmulh v13.4S, v22.4S, v1.s[0] +mul v22.4S, v22.4S,v9.s[0] +mla v22.4S, v13.4S, v31.s[0] +sub v13.4s, v29.4s, v22.4s +add v29.4s, v29.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v1.s[0] +mul v21.4S, v21.4S,v9.s[0] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v8.4S, v2.s[0] +mul v8.4S, v8.4S,v16.s[0] +mla v8.4S, v21.4S, v31.s[0] +sub v21.4s, v30.4s, v8.4s +add v30.4s, v30.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v2.s[0] +mul v12.4S, v12.4S,v16.s[0] +mla v12.4S, v8.4S, v31.s[0] +sub v8.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +sqrdmulh v12.4S, v18.4S, v1.s[1] +mul v18.4S, v18.4S,v9.s[1] +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v29.4s, v18.4s +add v29.4s, v29.4s, v18.4s +sqrdmulh v18.4S, v22.4S, v1.s[2] +mul v22.4S, 
v22.4S,v9.s[2] +mla v22.4S, v18.4S, v31.s[0] +sub v18.4s, v13.4s, v22.4s +add v13.4s, v13.4s, v22.4s +sqrdmulh v22.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v16.s[1] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v30.4s, v11.4s +add v30.4s, v30.4s, v11.4s +sqrdmulh v11.4S, v8.4S, v2.s[2] +mul v8.4S, v8.4S,v16.s[2] +mla v8.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v8.4s +add v21.4s, v21.4s, v8.4s +trn1 v8.4S, v29.4S, v12.4S +trn2 v28.4S, v29.4S, v12.4S +trn1 v27.4S, v13.4S, v18.4S +trn2 v26.4S, v13.4S, v18.4S +trn2 v13.2D, v8.2D, v27.2D +trn2 v18.2D, v28.2D, v26.2D +trn1 v29.2D, v8.2D, v27.2D +trn1 v12.2D, v28.2D, v26.2D +trn1 v26.4S, v30.4S, v22.4S +trn2 v28.4S, v30.4S, v22.4S +trn1 v27.4S, v21.4S, v11.4S +trn2 v8.4S, v21.4S, v11.4S +trn2 v21.2D, v26.2D, v27.2D +trn2 v11.2D, v28.2D, v8.2D +trn1 v30.2D, v26.2D, v27.2D +trn1 v22.2D, v28.2D, v8.2D +sqrdmulh v8.4S, v13.4S, v20.4S +mul v13.4S, v13.4S,v3.4S +mla v13.4S, v8.4S, v31.s[0] +sub v8.4s, v29.4s, v13.4s +add v29.4s, v29.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v20.4S +mul v18.4S, v18.4S,v3.4S +mla v18.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v18.4s +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v15.4S +mul v21.4S, v21.4S,v10.4S +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v30.4s, v21.4s +add v30.4s, v30.4s, v21.4s +sqrdmulh v21.4S, v11.4S, v15.4S +mul v11.4S, v11.4S,v10.4S +mla v11.4S, v21.4S, v31.s[0] +sub v21.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v12.4S, v19.4S +mul v12.4S, v12.4S,v17.4S +mla v12.4S, v11.4S, v31.s[0] +sub v11.4s, v29.4s, v12.4s +add v29.4s, v29.4s, v12.4s +sqrdmulh v12.4S, v13.4S, v14.4S +mul v13.4S, v13.4S,v0.4S +mla v13.4S, v12.4S, v31.s[0] +sub v12.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +sqrdmulh v13.4S, v22.4S, v6.4S +mul v22.4S, v22.4S,v7.4S +mla v22.4S, v13.4S, v31.s[0] +sub v13.4s, v30.4s, v22.4s +add v30.4s, v30.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v4.4S +mul v21.4S, v21.4S,v5.4S +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v21.4s +add v18.4s, v18.4s, 
v21.4s +str q29, [x0, #64] +str q11, [x0, #80] +str q8, [x0, #96] +str q12, [x0, #112] +str q30, [x0, #576] +str q13, [x0, #592] +str q18, [x0, #608] +str q22, [x0, #624] +ldr q4, [x17, #+384] +ldr q5, [x17, #+400] +ldr q6, [x17, #+416] +ldr q7, [x17, #+432] +ldr q15, [x17, #+448] +ldr q10, [x17, #+464] +ldr q2, [x17, #+480] +ldr q16, [x17, #+496] +ldr q22, [x0, #160] +ldr q18, [x0, #176] +ldr q13, [x0, #128] +ldr q30, [x0, #144] +ldr q14, [x17, #+1408] +ldr q0, [x17, #+1424] +ldr q19, [x17, #+1440] +ldr q17, [x17, #+1456] +ldr q20, [x17, #+1472] +ldr q3, [x17, #+1488] +ldr q1, [x17, #+1504] +ldr q9, [x17, #+1520] +ldr q12, [x0, #672] +ldr q8, [x0, #688] +ldr q11, [x0, #640] +ldr q29, [x0, #656] +sqrdmulh v21.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v13.4s, v22.4s +add v13.4s, v13.4s, v22.4s +sqrdmulh v22.4S, v18.4S, v5.s[0] +mul v18.4S, v18.4S,v4.s[0] +mla v18.4S, v22.4S, v31.s[0] +sub v22.4s, v30.4s, v18.4s +add v30.4s, v30.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v0.s[0] +mul v12.4S, v12.4S,v14.s[0] +mla v12.4S, v18.4S, v31.s[0] +sub v18.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +sqrdmulh v12.4S, v8.4S, v0.s[0] +mul v8.4S, v8.4S,v14.s[0] +mla v8.4S, v12.4S, v31.s[0] +sub v12.4s, v29.4s, v8.4s +add v29.4s, v29.4s, v8.4s +sqrdmulh v8.4S, v30.4S, v5.s[1] +mul v30.4S, v30.4S,v4.s[1] +mla v30.4S, v8.4S, v31.s[0] +sub v8.4s, v13.4s, v30.4s +add v13.4s, v13.4s, v30.4s +sqrdmulh v30.4S, v22.4S, v5.s[2] +mul v22.4S, v22.4S,v4.s[2] +mla v22.4S, v30.4S, v31.s[0] +sub v30.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v29.4S, v0.s[1] +mul v29.4S, v29.4S,v14.s[1] +mla v29.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v29.4s +add v11.4s, v11.4s, v29.4s +sqrdmulh v29.4S, v12.4S, v0.s[2] +mul v12.4S, v12.4S,v14.s[2] +mla v12.4S, v29.4S, v31.s[0] +sub v29.4s, v18.4s, v12.4s +add v18.4s, v18.4s, v12.4s +trn1 v12.4S, v13.4S, v8.4S +trn2 v28.4S, v13.4S, v8.4S +trn1 v27.4S, v21.4S, v30.4S +trn2 v26.4S, v21.4S, v30.4S 
+trn2 v21.2D, v12.2D, v27.2D +trn2 v30.2D, v28.2D, v26.2D +trn1 v13.2D, v12.2D, v27.2D +trn1 v8.2D, v28.2D, v26.2D +trn1 v26.4S, v11.4S, v22.4S +trn2 v28.4S, v11.4S, v22.4S +trn1 v27.4S, v18.4S, v29.4S +trn2 v12.4S, v18.4S, v29.4S +trn2 v18.2D, v26.2D, v27.2D +trn2 v29.2D, v28.2D, v12.2D +trn1 v11.2D, v26.2D, v27.2D +trn1 v22.2D, v28.2D, v12.2D +sqrdmulh v12.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v12.4S, v31.s[0] +sub v12.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v30.4S, v7.4S +mul v30.4S, v30.4S,v6.4S +mla v30.4S, v21.4S, v31.s[0] +sub v21.4s, v8.4s, v30.4s +add v8.4s, v8.4s, v30.4s +sqrdmulh v30.4S, v18.4S, v17.4S +mul v18.4S, v18.4S,v19.4S +mla v18.4S, v30.4S, v31.s[0] +sub v30.4s, v11.4s, v18.4s +add v11.4s, v11.4s, v18.4s +sqrdmulh v18.4S, v29.4S, v17.4S +mul v29.4S, v29.4S,v19.4S +mla v29.4S, v18.4S, v31.s[0] +sub v18.4s, v22.4s, v29.4s +add v22.4s, v22.4s, v29.4s +sqrdmulh v29.4S, v8.4S, v10.4S +mul v8.4S, v8.4S,v15.4S +mla v8.4S, v29.4S, v31.s[0] +sub v29.4s, v13.4s, v8.4s +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v21.4S, v16.4S +mul v21.4S, v21.4S,v2.4S +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v3.4S +mul v22.4S, v22.4S,v20.4S +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v18.4S, v9.4S +mul v18.4S, v18.4S,v1.4S +mla v18.4S, v22.4S, v31.s[0] +sub v22.4s, v30.4s, v18.4s +add v30.4s, v30.4s, v18.4s +str q13, [x0, #128] +str q29, [x0, #144] +str q12, [x0, #160] +str q8, [x0, #176] +str q11, [x0, #640] +str q21, [x0, #656] +str q30, [x0, #672] +str q22, [x0, #688] +ldr q9, [x17, #+512] +ldr q1, [x17, #+528] +ldr q3, [x17, #+544] +ldr q20, [x17, #+560] +ldr q17, [x17, #+576] +ldr q19, [x17, #+592] +ldr q0, [x17, #+608] +ldr q14, [x17, #+624] +ldr q22, [x0, #224] +ldr q30, [x0, #240] +ldr q21, [x0, #192] +ldr q11, [x0, #208] +ldr q16, [x17, #+1536] +ldr q2, [x17, #+1552] +ldr q10, [x17, #+1568] 
+ldr q15, [x17, #+1584] +ldr q7, [x17, #+1600] +ldr q6, [x17, #+1616] +ldr q5, [x17, #+1632] +ldr q4, [x17, #+1648] +ldr q8, [x0, #736] +ldr q12, [x0, #752] +ldr q29, [x0, #704] +ldr q13, [x0, #720] +sqrdmulh v18.4S, v22.4S, v1.s[0] +mul v22.4S, v22.4S,v9.s[0] +mla v22.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v30.4S, v1.s[0] +mul v30.4S, v30.4S,v9.s[0] +mla v30.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v30.4s +add v11.4s, v11.4s, v30.4s +sqrdmulh v30.4S, v8.4S, v2.s[0] +mul v8.4S, v8.4S,v16.s[0] +mla v8.4S, v30.4S, v31.s[0] +sub v30.4s, v29.4s, v8.4s +add v29.4s, v29.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v2.s[0] +mul v12.4S, v12.4S,v16.s[0] +mla v12.4S, v8.4S, v31.s[0] +sub v8.4s, v13.4s, v12.4s +add v13.4s, v13.4s, v12.4s +sqrdmulh v12.4S, v11.4S, v1.s[1] +mul v11.4S, v11.4S,v9.s[1] +mla v11.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v1.s[2] +mul v22.4S, v22.4S,v9.s[2] +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v2.s[1] +mul v13.4S, v13.4S,v16.s[1] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v29.4s, v13.4s +add v29.4s, v29.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v2.s[2] +mul v8.4S, v8.4S,v16.s[2] +mla v8.4S, v13.4S, v31.s[0] +sub v13.4s, v30.4s, v8.4s +add v30.4s, v30.4s, v8.4s +trn1 v8.4S, v21.4S, v12.4S +trn2 v28.4S, v21.4S, v12.4S +trn1 v27.4S, v18.4S, v11.4S +trn2 v26.4S, v18.4S, v11.4S +trn2 v18.2D, v8.2D, v27.2D +trn2 v11.2D, v28.2D, v26.2D +trn1 v21.2D, v8.2D, v27.2D +trn1 v12.2D, v28.2D, v26.2D +trn1 v26.4S, v29.4S, v22.4S +trn2 v28.4S, v29.4S, v22.4S +trn1 v27.4S, v30.4S, v13.4S +trn2 v8.4S, v30.4S, v13.4S +trn2 v30.2D, v26.2D, v27.2D +trn2 v13.2D, v28.2D, v8.2D +trn1 v29.2D, v26.2D, v27.2D +trn1 v22.2D, v28.2D, v8.2D +sqrdmulh v8.4S, v18.4S, v20.4S +mul v18.4S, v18.4S,v3.4S +mla v18.4S, v8.4S, v31.s[0] +sub v8.4s, v21.4s, v18.4s +add v21.4s, v21.4s, v18.4s +sqrdmulh v18.4S, v11.4S, 
v20.4S +mul v11.4S, v11.4S,v3.4S +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v11.4s +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v30.4S, v15.4S +mul v30.4S, v30.4S,v10.4S +mla v30.4S, v11.4S, v31.s[0] +sub v11.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +sqrdmulh v30.4S, v13.4S, v15.4S +mul v13.4S, v13.4S,v10.4S +mla v13.4S, v30.4S, v31.s[0] +sub v30.4s, v22.4s, v13.4s +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v12.4S, v19.4S +mul v12.4S, v12.4S,v17.4S +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v18.4S, v14.4S +mul v18.4S, v18.4S,v0.4S +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v8.4s, v18.4s +add v8.4s, v8.4s, v18.4s +sqrdmulh v18.4S, v22.4S, v6.4S +mul v22.4S, v22.4S,v7.4S +mla v22.4S, v18.4S, v31.s[0] +sub v18.4s, v29.4s, v22.4s +add v29.4s, v29.4s, v22.4s +sqrdmulh v22.4S, v30.4S, v4.4S +mul v30.4S, v30.4S,v5.4S +mla v30.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v30.4s +add v11.4s, v11.4s, v30.4s +str q21, [x0, #192] +str q13, [x0, #208] +str q8, [x0, #224] +str q12, [x0, #240] +str q29, [x0, #704] +str q18, [x0, #720] +str q11, [x0, #736] +str q22, [x0, #752] +ldr q4, [x17, #+640] +ldr q5, [x17, #+656] +ldr q6, [x17, #+672] +ldr q7, [x17, #+688] +ldr q15, [x17, #+704] +ldr q10, [x17, #+720] +ldr q2, [x17, #+736] +ldr q16, [x17, #+752] +ldr q22, [x0, #288] +ldr q11, [x0, #304] +ldr q18, [x0, #256] +ldr q29, [x0, #272] +ldr q14, [x17, #+1664] +ldr q0, [x17, #+1680] +ldr q19, [x17, #+1696] +ldr q17, [x17, #+1712] +ldr q20, [x17, #+1728] +ldr q3, [x17, #+1744] +ldr q1, [x17, #+1760] +ldr q9, [x17, #+1776] +ldr q12, [x0, #800] +ldr q8, [x0, #816] +ldr q13, [x0, #768] +ldr q21, [x0, #784] +sqrdmulh v30.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v30.4S, v31.s[0] +sub v30.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v11.4S, v5.s[0] +mul v11.4S, v11.4S,v4.s[0] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v29.4s, v11.4s +add v29.4s, v29.4s, v11.4s +sqrdmulh 
v11.4S, v12.4S, v0.s[0] +mul v12.4S, v12.4S,v14.s[0] +mla v12.4S, v11.4S, v31.s[0] +sub v11.4s, v13.4s, v12.4s +add v13.4s, v13.4s, v12.4s +sqrdmulh v12.4S, v8.4S, v0.s[0] +mul v8.4S, v8.4S,v14.s[0] +mla v8.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v8.4s +add v21.4s, v21.4s, v8.4s +sqrdmulh v8.4S, v29.4S, v5.s[1] +mul v29.4S, v29.4S,v4.s[1] +mla v29.4S, v8.4S, v31.s[0] +sub v8.4s, v18.4s, v29.4s +add v18.4s, v18.4s, v29.4s +sqrdmulh v29.4S, v22.4S, v5.s[2] +mul v22.4S, v22.4S,v4.s[2] +mla v22.4S, v29.4S, v31.s[0] +sub v29.4s, v30.4s, v22.4s +add v30.4s, v30.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v0.s[1] +mul v21.4S, v21.4S,v14.s[1] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v12.4S, v0.s[2] +mul v12.4S, v12.4S,v14.s[2] +mla v12.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +trn1 v12.4S, v18.4S, v8.4S +trn2 v28.4S, v18.4S, v8.4S +trn1 v27.4S, v30.4S, v29.4S +trn2 v26.4S, v30.4S, v29.4S +trn2 v30.2D, v12.2D, v27.2D +trn2 v29.2D, v28.2D, v26.2D +trn1 v18.2D, v12.2D, v27.2D +trn1 v8.2D, v28.2D, v26.2D +trn1 v26.4S, v13.4S, v22.4S +trn2 v28.4S, v13.4S, v22.4S +trn1 v27.4S, v11.4S, v21.4S +trn2 v12.4S, v11.4S, v21.4S +trn2 v11.2D, v26.2D, v27.2D +trn2 v21.2D, v28.2D, v12.2D +trn1 v13.2D, v26.2D, v27.2D +trn1 v22.2D, v28.2D, v12.2D +sqrdmulh v12.4S, v30.4S, v7.4S +mul v30.4S, v30.4S,v6.4S +mla v30.4S, v12.4S, v31.s[0] +sub v12.4s, v18.4s, v30.4s +add v18.4s, v18.4s, v30.4s +sqrdmulh v30.4S, v29.4S, v7.4S +mul v29.4S, v29.4S,v6.4S +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v8.4s, v29.4s +add v8.4s, v8.4s, v29.4s +sqrdmulh v29.4S, v11.4S, v17.4S +mul v11.4S, v11.4S,v19.4S +mla v11.4S, v29.4S, v31.s[0] +sub v29.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v17.4S +mul v21.4S, v21.4S,v19.4S +mla v21.4S, v11.4S, v31.s[0] +sub v11.4s, v22.4s, v21.4s +add v22.4s, v22.4s, v21.4s +sqrdmulh v21.4S, v8.4S, v10.4S +mul v8.4S, v8.4S,v15.4S +mla v8.4S, v21.4S, v31.s[0] 
+sub v21.4s, v18.4s, v8.4s +add v18.4s, v18.4s, v8.4s +sqrdmulh v8.4S, v30.4S, v16.4S +mul v30.4S, v30.4S,v2.4S +mla v30.4S, v8.4S, v31.s[0] +sub v8.4s, v12.4s, v30.4s +add v12.4s, v12.4s, v30.4s +sqrdmulh v30.4S, v22.4S, v3.4S +mul v22.4S, v22.4S,v20.4S +mla v22.4S, v30.4S, v31.s[0] +sub v30.4s, v13.4s, v22.4s +add v13.4s, v13.4s, v22.4s +sqrdmulh v22.4S, v11.4S, v9.4S +mul v11.4S, v11.4S,v1.4S +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v29.4s, v11.4s +add v29.4s, v29.4s, v11.4s +str q18, [x0, #256] +str q21, [x0, #272] +str q12, [x0, #288] +str q8, [x0, #304] +str q13, [x0, #768] +str q30, [x0, #784] +str q29, [x0, #800] +str q22, [x0, #816] +ldr q9, [x17, #+768] +ldr q1, [x17, #+784] +ldr q3, [x17, #+800] +ldr q20, [x17, #+816] +ldr q17, [x17, #+832] +ldr q19, [x17, #+848] +ldr q0, [x17, #+864] +ldr q14, [x17, #+880] +ldr q22, [x0, #352] +ldr q29, [x0, #368] +ldr q30, [x0, #320] +ldr q13, [x0, #336] +ldr q16, [x17, #+1792] +ldr q2, [x17, #+1808] +ldr q10, [x17, #+1824] +ldr q15, [x17, #+1840] +ldr q7, [x17, #+1856] +ldr q6, [x17, #+1872] +ldr q5, [x17, #+1888] +ldr q4, [x17, #+1904] +ldr q8, [x0, #864] +ldr q12, [x0, #880] +ldr q21, [x0, #832] +ldr q18, [x0, #848] +sqrdmulh v11.4S, v22.4S, v1.s[0] +mul v22.4S, v22.4S,v9.s[0] +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v30.4s, v22.4s +add v30.4s, v30.4s, v22.4s +sqrdmulh v22.4S, v29.4S, v1.s[0] +mul v29.4S, v29.4S,v9.s[0] +mla v29.4S, v22.4S, v31.s[0] +sub v22.4s, v13.4s, v29.4s +add v13.4s, v13.4s, v29.4s +sqrdmulh v29.4S, v8.4S, v2.s[0] +mul v8.4S, v8.4S,v16.s[0] +mla v8.4S, v29.4S, v31.s[0] +sub v29.4s, v21.4s, v8.4s +add v21.4s, v21.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v2.s[0] +mul v12.4S, v12.4S,v16.s[0] +mla v12.4S, v8.4S, v31.s[0] +sub v8.4s, v18.4s, v12.4s +add v18.4s, v18.4s, v12.4s +sqrdmulh v12.4S, v13.4S, v1.s[1] +mul v13.4S, v13.4S,v9.s[1] +mla v13.4S, v12.4S, v31.s[0] +sub v12.4s, v30.4s, v13.4s +add v30.4s, v30.4s, v13.4s +sqrdmulh v13.4S, v22.4S, v1.s[2] +mul v22.4S, v22.4S,v9.s[2] +mla v22.4S, 
v13.4S, v31.s[0] +sub v13.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v18.4S, v2.s[1] +mul v18.4S, v18.4S,v16.s[1] +mla v18.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v18.4s +add v21.4s, v21.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v2.s[2] +mul v8.4S, v8.4S,v16.s[2] +mla v8.4S, v18.4S, v31.s[0] +sub v18.4s, v29.4s, v8.4s +add v29.4s, v29.4s, v8.4s +trn1 v8.4S, v30.4S, v12.4S +trn2 v28.4S, v30.4S, v12.4S +trn1 v27.4S, v11.4S, v13.4S +trn2 v26.4S, v11.4S, v13.4S +trn2 v11.2D, v8.2D, v27.2D +trn2 v13.2D, v28.2D, v26.2D +trn1 v30.2D, v8.2D, v27.2D +trn1 v12.2D, v28.2D, v26.2D +trn1 v26.4S, v21.4S, v22.4S +trn2 v28.4S, v21.4S, v22.4S +trn1 v27.4S, v29.4S, v18.4S +trn2 v8.4S, v29.4S, v18.4S +trn2 v29.2D, v26.2D, v27.2D +trn2 v18.2D, v28.2D, v8.2D +trn1 v21.2D, v26.2D, v27.2D +trn1 v22.2D, v28.2D, v8.2D +sqrdmulh v8.4S, v11.4S, v20.4S +mul v11.4S, v11.4S,v3.4S +mla v11.4S, v8.4S, v31.s[0] +sub v8.4s, v30.4s, v11.4s +add v30.4s, v30.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v20.4S +mul v13.4S, v13.4S,v3.4S +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v12.4s, v13.4s +add v12.4s, v12.4s, v13.4s +sqrdmulh v13.4S, v29.4S, v15.4S +mul v29.4S, v29.4S,v10.4S +mla v29.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v29.4s +add v21.4s, v21.4s, v29.4s +sqrdmulh v29.4S, v18.4S, v15.4S +mul v18.4S, v18.4S,v10.4S +mla v18.4S, v29.4S, v31.s[0] +sub v29.4s, v22.4s, v18.4s +add v22.4s, v22.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v19.4S +mul v12.4S, v12.4S,v17.4S +mla v12.4S, v18.4S, v31.s[0] +sub v18.4s, v30.4s, v12.4s +add v30.4s, v30.4s, v12.4s +sqrdmulh v12.4S, v11.4S, v14.4S +mul v11.4S, v11.4S,v0.4S +mla v11.4S, v12.4S, v31.s[0] +sub v12.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v6.4S +mul v22.4S, v22.4S,v7.4S +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v29.4S, v4.4S +mul v29.4S, v29.4S,v5.4S +mla v29.4S, v22.4S, v31.s[0] +sub v22.4s, v13.4s, v29.4s +add v13.4s, v13.4s, v29.4s +str q30, [x0, #320] 
+str q18, [x0, #336] +str q8, [x0, #352] +str q12, [x0, #368] +str q21, [x0, #832] +str q11, [x0, #848] +str q13, [x0, #864] +str q22, [x0, #880] +ldr q4, [x17, #+896] +ldr q5, [x17, #+912] +ldr q6, [x17, #+928] +ldr q7, [x17, #+944] +ldr q15, [x17, #+960] +ldr q10, [x17, #+976] +ldr q2, [x17, #+992] +ldr q16, [x17, #+1008] +ldr q22, [x0, #416] +ldr q13, [x0, #432] +ldr q11, [x0, #384] +ldr q21, [x0, #400] +ldr q14, [x17, #+1920] +ldr q0, [x17, #+1936] +ldr q19, [x17, #+1952] +ldr q17, [x17, #+1968] +ldr q20, [x17, #+1984] +ldr q3, [x17, #+2000] +ldr q1, [x17, #+2016] +ldr q9, [x17, #+2032] +ldr q12, [x0, #928] +ldr q8, [x0, #944] +ldr q18, [x0, #896] +ldr q30, [x0, #912] +sqrdmulh v29.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v29.4S, v31.s[0] +sub v29.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v12.4S, v0.s[0] +mul v12.4S, v12.4S,v14.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v18.4s, v12.4s +add v18.4s, v18.4s, v12.4s +sqrdmulh v12.4S, v8.4S, v0.s[0] +mul v8.4S, v8.4S,v14.s[0] +mla v8.4S, v12.4S, v31.s[0] +sub v12.4s, v30.4s, v8.4s +add v30.4s, v30.4s, v8.4s +sqrdmulh v8.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v5.s[2] +mul v22.4S, v22.4S,v4.s[2] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v29.4s, v22.4s +add v29.4s, v29.4s, v22.4s +sqrdmulh v22.4S, v30.4S, v0.s[1] +mul v30.4S, v30.4S,v14.s[1] +mla v30.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v30.4s +add v18.4s, v18.4s, v30.4s +sqrdmulh v30.4S, v12.4S, v0.s[2] +mul v12.4S, v12.4S,v14.s[2] +mla v12.4S, v30.4S, v31.s[0] +sub v30.4s, v13.4s, v12.4s +add v13.4s, v13.4s, v12.4s +trn1 v12.4S, v11.4S, v8.4S +trn2 v28.4S, v11.4S, v8.4S +trn1 v27.4S, v29.4S, v21.4S +trn2 v26.4S, v29.4S, v21.4S +trn2 v29.2D, v12.2D, 
v27.2D +trn2 v21.2D, v28.2D, v26.2D +trn1 v11.2D, v12.2D, v27.2D +trn1 v8.2D, v28.2D, v26.2D +trn1 v26.4S, v18.4S, v22.4S +trn2 v28.4S, v18.4S, v22.4S +trn1 v27.4S, v13.4S, v30.4S +trn2 v12.4S, v13.4S, v30.4S +trn2 v13.2D, v26.2D, v27.2D +trn2 v30.2D, v28.2D, v12.2D +trn1 v18.2D, v26.2D, v27.2D +trn1 v22.2D, v28.2D, v12.2D +sqrdmulh v12.4S, v29.4S, v7.4S +mul v29.4S, v29.4S,v6.4S +mla v29.4S, v12.4S, v31.s[0] +sub v12.4s, v11.4s, v29.4s +add v11.4s, v11.4s, v29.4s +sqrdmulh v29.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v29.4S, v31.s[0] +sub v29.4s, v8.4s, v21.4s +add v8.4s, v8.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v17.4S +mul v13.4S, v13.4S,v19.4S +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v18.4s, v13.4s +add v18.4s, v18.4s, v13.4s +sqrdmulh v13.4S, v30.4S, v17.4S +mul v30.4S, v30.4S,v19.4S +mla v30.4S, v13.4S, v31.s[0] +sub v13.4s, v22.4s, v30.4s +add v22.4s, v22.4s, v30.4s +sqrdmulh v30.4S, v8.4S, v10.4S +mul v8.4S, v8.4S,v15.4S +mla v8.4S, v30.4S, v31.s[0] +sub v30.4s, v11.4s, v8.4s +add v11.4s, v11.4s, v8.4s +sqrdmulh v8.4S, v29.4S, v16.4S +mul v29.4S, v29.4S,v2.4S +mla v29.4S, v8.4S, v31.s[0] +sub v8.4s, v12.4s, v29.4s +add v12.4s, v12.4s, v29.4s +sqrdmulh v29.4S, v22.4S, v3.4S +mul v22.4S, v22.4S,v20.4S +mla v22.4S, v29.4S, v31.s[0] +sub v29.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v9.4S +mul v13.4S, v13.4S,v1.4S +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +str q11, [x0, #384] +str q30, [x0, #400] +str q12, [x0, #416] +str q8, [x0, #432] +str q18, [x0, #896] +str q29, [x0, #912] +str q21, [x0, #928] +str q22, [x0, #944] +ldr q9, [x17, #+1024] +ldr q1, [x17, #+1040] +ldr q3, [x17, #+1056] +ldr q20, [x17, #+1072] +ldr q17, [x17, #+1088] +ldr q19, [x17, #+1104] +ldr q0, [x17, #+1120] +ldr q14, [x17, #+1136] +ldr q22, [x0, #480] +ldr q21, [x0, #496] +ldr q29, [x0, #448] +ldr q18, [x0, #464] +ldr q16, [x17, #+2048] +ldr q2, [x17, #+2064] +ldr q10, [x17, #+2080] +ldr q15, [x17, 
#+2096] +ldr q7, [x17, #+2112] +ldr q6, [x17, #+2128] +ldr q5, [x17, #+2144] +ldr q4, [x17, #+2160] +ldr q8, [x0, #992] +ldr q12, [x0, #1008] +ldr q30, [x0, #960] +ldr q11, [x0, #976] +sqrdmulh v13.4S, v22.4S, v1.s[0] +mul v22.4S, v22.4S,v9.s[0] +mla v22.4S, v13.4S, v31.s[0] +sub v13.4s, v29.4s, v22.4s +add v29.4s, v29.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v1.s[0] +mul v21.4S, v21.4S,v9.s[0] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v8.4S, v2.s[0] +mul v8.4S, v8.4S,v16.s[0] +mla v8.4S, v21.4S, v31.s[0] +sub v21.4s, v30.4s, v8.4s +add v30.4s, v30.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v2.s[0] +mul v12.4S, v12.4S,v16.s[0] +mla v12.4S, v8.4S, v31.s[0] +sub v8.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +sqrdmulh v12.4S, v18.4S, v1.s[1] +mul v18.4S, v18.4S,v9.s[1] +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v29.4s, v18.4s +add v29.4s, v29.4s, v18.4s +sqrdmulh v18.4S, v22.4S, v1.s[2] +mul v22.4S, v22.4S,v9.s[2] +mla v22.4S, v18.4S, v31.s[0] +sub v18.4s, v13.4s, v22.4s +add v13.4s, v13.4s, v22.4s +sqrdmulh v22.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v16.s[1] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v30.4s, v11.4s +add v30.4s, v30.4s, v11.4s +sqrdmulh v11.4S, v8.4S, v2.s[2] +mul v8.4S, v8.4S,v16.s[2] +mla v8.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v8.4s +add v21.4s, v21.4s, v8.4s +trn1 v8.4S, v29.4S, v12.4S +trn2 v28.4S, v29.4S, v12.4S +trn1 v27.4S, v13.4S, v18.4S +trn2 v26.4S, v13.4S, v18.4S +trn2 v13.2D, v8.2D, v27.2D +trn2 v18.2D, v28.2D, v26.2D +trn1 v29.2D, v8.2D, v27.2D +trn1 v12.2D, v28.2D, v26.2D +trn1 v26.4S, v30.4S, v22.4S +trn2 v28.4S, v30.4S, v22.4S +trn1 v27.4S, v21.4S, v11.4S +trn2 v8.4S, v21.4S, v11.4S +trn2 v21.2D, v26.2D, v27.2D +trn2 v11.2D, v28.2D, v8.2D +trn1 v30.2D, v26.2D, v27.2D +trn1 v22.2D, v28.2D, v8.2D +sqrdmulh v8.4S, v13.4S, v20.4S +mul v13.4S, v13.4S,v3.4S +mla v13.4S, v8.4S, v31.s[0] +sub v8.4s, v29.4s, v13.4s +add v29.4s, v29.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v20.4S +mul 
v18.4S, v18.4S,v3.4S +mla v18.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v18.4s +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v15.4S +mul v21.4S, v21.4S,v10.4S +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v30.4s, v21.4s +add v30.4s, v30.4s, v21.4s +sqrdmulh v21.4S, v11.4S, v15.4S +mul v11.4S, v11.4S,v10.4S +mla v11.4S, v21.4S, v31.s[0] +sub v21.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v12.4S, v19.4S +mul v12.4S, v12.4S,v17.4S +mla v12.4S, v11.4S, v31.s[0] +sub v11.4s, v29.4s, v12.4s +add v29.4s, v29.4s, v12.4s +sqrdmulh v12.4S, v13.4S, v14.4S +mul v13.4S, v13.4S,v0.4S +mla v13.4S, v12.4S, v31.s[0] +sub v12.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +sqrdmulh v13.4S, v22.4S, v6.4S +mul v22.4S, v22.4S,v7.4S +mla v22.4S, v13.4S, v31.s[0] +sub v13.4s, v30.4s, v22.4s +add v30.4s, v30.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v4.4S +mul v21.4S, v21.4S,v5.4S +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +str q29, [x0, #448] +str q11, [x0, #464] +str q8, [x0, #480] +str q12, [x0, #496] +str q30, [x0, #960] +str q13, [x0, #976] +str q18, [x0, #992] +str q22, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_3.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_3.s new file mode 100644 index 0000000..097a1c9 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_3.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby 
granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include // NOTE(review): the include operand is missing in this copy - TODO restore it; ASM_LOAD used by the routine below presumably comes from that header +modulus: // one 16-byte vector: word 0 is -33556993 (the modulus, negated), words 1-3 are zero; the routine below loads it into q31 and feeds v31.s[0] to its mla instructions +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 // 2^6 = 64-byte alignment for the table that follows +roots_merged: // constants read by the routine below as 16-byte groups (ldr q); a group of four tagged values is followed by four companion words carrying the same Layer/block tags. In the code visible here the first group is the mul operand and its companion group the sqrdmulh operand. Companions look like value * 2^31 / 33556993 (e.g. 17702291 -> 1132860160) - TODO confirm rounding. Zero words tagged "Layer None" presumably pad a group to four words +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word
1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 
6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 
+.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, 
block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 
31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 
26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 
+.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // 
Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 
28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 
608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_3_z2_3 +.global _ntt_u32_full_neon_asm_var_4_4_3_z2_3 +ntt_u32_full_neon_asm_var_4_4_3_z2_3: +_ntt_u32_full_neon_asm_var_4_4_3_z2_3: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +sqrdmulh v2.4S, v22.4S, v29.s[0] +ldr q1, [x0, #544] +mul v22.4S, v22.4S,v30.s[0] +ldr q0, [x0, #608] +sqrdmulh v15.4S, v21.4S, v29.s[0] +ldr q14, [x0, #672] +mul v21.4S, v21.4S,v30.s[0] +ldr q13, [x0, #736] +mla v22.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q12, [x0, #32] +sub v11.4s, v18.4s, v22.4s +mla v21.4S, v15.4S, v31.s[0] +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q15, [x0, #96] +sub v10.4s, v17.4s, v21.4s +mla v20.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v1.4S, 
v29.s[0] +ldr q2, [x0, #160] +mul v1.4S, v1.4S,v30.s[0] +sub v9.4s, v16.4s, v20.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v29.s[0] +ldr q22, [x0, #224] +mul v0.4S, v0.4S,v30.s[0] +sub v8.4s, v3.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v12.4s, v1.4s +mla v0.4S, v20.4S, v31.s[0] +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +sub v20.4s, v15.4s, v0.4s +mla v14.4S, v19.4S, v31.s[0] +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v19.4s, v2.4s, v14.4s +mla v13.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v1.4s, v22.4s, v13.4s +mla v16.4S, v0.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v0.4s, v2.4s, v16.4s +mla v3.4S, v14.4S, v31.s[0] +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v22.4s, v3.4s +mla v18.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v13.4s, v12.4s, v18.4s +mla v17.4S, v16.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v16.4s, v15.4s, v17.4s +mla v9.4S, v3.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v3.4s, v19.4s, v9.4s +mla v8.4S, v18.4S, v31.s[0] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v8.4s +mla v11.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v17.4s, v21.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +sub v9.4s, v20.4s, v10.4s 
+mla v2.4S, v8.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v8.4s, v12.4s, v2.4s +mla v22.4S, v11.4S, v31.s[0] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v27.s[1] +mul v14.4S, v14.4S,v28.s[1] +sub v11.4s, v15.4s, v22.4s +mla v0.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v28.s[2] +sub v10.4s, v13.4s, v0.4s +mla v14.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v2.4s, v16.4s, v14.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +sub v22.4s, v21.4s, v19.4s +mla v1.4S, v0.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v0.4s, v20.4s, v1.4s +mla v3.4S, v14.4S, v31.s[0] +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +sub v14.4s, v17.4s, v3.4s +mla v18.4S, v19.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v25.s[1] +mul v11.4S, v11.4S,v26.s[1] +sub v19.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v1.4s, v12.4s, v15.4s +mla v11.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v25.s[3] +mul v2.4S, v2.4S,v26.s[3] +sub v3.4s, v8.4s, v11.4s +mla v16.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v11.4s +str q12, [x0, #32] +sqrdmulh v12.4S, v20.4S, v23.s[0] +str q1, [x0, #96] +mul v20.4S, v20.4S,v24.s[0] +ldr q1, [x0, #816] +sub v11.4s, v13.4s, v16.4s +ldr q18, [x0, #880] +mla v2.4S, v15.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +str q8, [x0, #160] +sqrdmulh v8.4S, v0.4S, v23.s[1] +str q3, [x0, #224] +mul v0.4S, v0.4S,v24.s[1] +ldr q3, [x0, #944] +sub v16.4s, v10.4s, v2.4s +ldr q15, [x0, #1008] +mla v20.4S, v12.4S, v31.s[0] +add v10.4s, v10.4s, v2.4s +str q13, [x0, #288] +sqrdmulh 
v13.4S, v9.4S, v23.s[2] +str q11, [x0, #352] +mul v9.4S, v9.4S,v24.s[2] +ldr q11, [x0, #304] +sub v2.4s, v21.4s, v20.4s +ldr q12, [x0, #368] +mla v0.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v20.4s +str q10, [x0, #416] +sqrdmulh v10.4S, v19.4S, v23.s[3] +str q16, [x0, #480] +mul v19.4S, v19.4S,v24.s[3] +ldr q16, [x0, #432] +sub v20.4s, v22.4s, v0.4s +ldr q8, [x0, #496] +mla v9.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +str q21, [x0, #544] +sqrdmulh v21.4S, v1.4S, v29.s[0] +str q2, [x0, #608] +ldr q2, [x0, #560] +mul v1.4S, v1.4S,v30.s[0] +ldr q0, [x0, #624] +sub v13.4s, v17.4s, v9.4s +mla v19.4S, v10.4S, v31.s[0] +add v17.4s, v17.4s, v9.4s +str q22, [x0, #672] +sqrdmulh v22.4S, v18.4S, v29.s[0] +str q20, [x0, #736] +ldr q20, [x0, #688] +mul v18.4S, v18.4S,v30.s[0] +ldr q9, [x0, #752] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +str q17, [x0, #800] +sqrdmulh v17.4S, v3.4S, v29.s[0] +str q13, [x0, #864] +mul v3.4S, v3.4S,v30.s[0] +ldr q13, [x0, #48] +sub v19.4s, v11.4s, v1.4s +mla v18.4S, v22.4S, v31.s[0] +add v11.4s, v11.4s, v1.4s +str q14, [x0, #928] +sqrdmulh v14.4S, v15.4S, v29.s[0] +str q10, [x0, #992] +mul v15.4S, v15.4S,v30.s[0] +ldr q10, [x0, #112] +sub v1.4s, v12.4s, v18.4s +mla v3.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v29.s[0] +ldr q17, [x0, #176] +mul v2.4S, v2.4S,v30.s[0] +sub v22.4s, v16.4s, v3.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v29.s[0] +ldr q14, [x0, #240] +mul v0.4S, v0.4S,v30.s[0] +sub v21.4s, v8.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v18.4s, v13.4s, v2.4s +mla v0.4S, v3.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +sub v3.4s, v10.4s, v0.4s +mla v20.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v15.4s, 
v17.4s, v20.4s +mla v9.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v2.4s, v14.4s, v9.4s +mla v16.4S, v0.4S, v31.s[0] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v29.s[1] +mul v11.4S, v11.4S,v30.s[1] +sub v0.4s, v17.4s, v16.4s +mla v8.4S, v20.4S, v31.s[0] +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v29.s[1] +mul v12.4S, v12.4S,v30.s[1] +sub v20.4s, v14.4s, v8.4s +mla v11.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v9.4s, v13.4s, v11.4s +mla v12.4S, v16.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v16.4s, v10.4s, v12.4s +mla v22.4S, v8.4S, v31.s[0] +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v8.4s, v15.4s, v22.4s +mla v21.4S, v11.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +sub v11.4s, v2.4s, v21.4s +mla v19.4S, v12.4S, v31.s[0] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +sub v12.4s, v18.4s, v19.4s +mla v1.4S, v22.4S, v31.s[0] +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v22.4s, v3.4s, v1.4s +mla v17.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v21.4s, v13.4s, v17.4s +mla v14.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +sub v19.4s, v10.4s, v14.4s +mla v0.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v27.s[2] +mul v15.4S, v15.4S,v28.s[2] +sub v1.4s, v9.4s, v0.4s +mla v20.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v17.4s, v16.4s, v20.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, 
v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v14.4s, v18.4s, v15.4s +mla v2.4S, v0.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[3] +mul v11.4S, v11.4S,v28.s[3] +sub v0.4s, v3.4s, v2.4s +mla v8.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +sub v20.4s, v12.4s, v8.4s +mla v11.4S, v15.4S, v31.s[0] +add v12.4s, v12.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v15.4s, v22.4s, v11.4s +mla v10.4S, v2.4S, v31.s[0] +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v2.4s, v13.4s, v10.4s +mla v19.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v25.s[3] +mul v17.4S, v17.4S,v26.s[3] +sub v8.4s, v21.4s, v19.4s +mla v16.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +str q13, [x0, #48] +sqrdmulh v13.4S, v3.4S, v23.s[0] +str q2, [x0, #112] +mul v3.4S, v3.4S,v24.s[0] +ldr q2, [x0, #768] +sub v19.4s, v9.4s, v16.4s +ldr q11, [x0, #832] +mla v17.4S, v10.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +str q21, [x0, #176] +sqrdmulh v21.4S, v0.4S, v23.s[1] +str q8, [x0, #240] +mul v0.4S, v0.4S,v24.s[1] +ldr q8, [x0, #896] +sub v16.4s, v1.4s, v17.4s +ldr q10, [x0, #960] +mla v3.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +str q9, [x0, #304] +sqrdmulh v9.4S, v22.4S, v23.s[2] +str q19, [x0, #368] +mul v22.4S, v22.4S,v24.s[2] +ldr q19, [x0, #256] +sub v17.4s, v18.4s, v3.4s +ldr q13, [x0, #320] +mla v0.4S, v21.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +str q1, [x0, #432] +sqrdmulh v1.4S, v15.4S, v23.s[3] +str q16, [x0, #496] +mul v15.4S, v15.4S,v24.s[3] +ldr q16, [x0, #384] +sub v3.4s, v14.4s, v0.4s +ldr q21, [x0, #448] +mla v22.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +str q18, [x0, #560] +sqrdmulh v18.4S, v2.4S, v29.s[0] +str q17, [x0, #624] +ldr q17, [x0, #512] +mul v2.4S, v2.4S,v30.s[0] +ldr q0, [x0, #576] +sub v9.4s, v12.4s, v22.4s +mla v15.4S, v1.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s 
+str q14, [x0, #688] +sqrdmulh v14.4S, v11.4S, v29.s[0] +str q3, [x0, #752] +ldr q3, [x0, #640] +mul v11.4S, v11.4S,v30.s[0] +ldr q22, [x0, #704] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +str q12, [x0, #816] +sqrdmulh v12.4S, v8.4S, v29.s[0] +str q9, [x0, #880] +mul v8.4S, v8.4S,v30.s[0] +ldr q9, [x0, #0] +sub v15.4s, v19.4s, v2.4s +mla v11.4S, v14.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +str q20, [x0, #944] +sqrdmulh v20.4S, v10.4S, v29.s[0] +str q1, [x0, #1008] +mul v10.4S, v10.4S,v30.s[0] +ldr q1, [x0, #64] +sub v2.4s, v13.4s, v11.4s +mla v8.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v29.s[0] +ldr q12, [x0, #128] +mul v17.4S, v17.4S,v30.s[0] +sub v14.4s, v16.4s, v8.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v29.s[0] +ldr q20, [x0, #192] +mul v0.4S, v0.4S,v30.s[0] +sub v18.4s, v21.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +sub v11.4s, v9.4s, v17.4s +mla v0.4S, v8.4S, v31.s[0] +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v8.4s, v1.4s, v0.4s +mla v3.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v10.4s, v12.4s, v3.4s +mla v22.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v17.4s, v20.4s, v22.4s +mla v16.4S, v0.4S, v31.s[0] +add v20.4s, v20.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +sub v0.4s, v12.4s, v16.4s +mla v21.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v3.4s, v20.4s, v21.4s +mla v19.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v22.4s, v9.4s, v19.4s +mla v13.4S, v16.4S, v31.s[0] +add v9.4s, v9.4s, 
v19.4s +sqrdmulh v19.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v16.4s, v1.4s, v13.4s +mla v14.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v21.4s, v10.4s, v14.4s +mla v18.4S, v19.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v19.4s, v17.4s, v18.4s +mla v15.4S, v13.4S, v31.s[0] +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v13.4s, v11.4s, v15.4s +mla v2.4S, v14.4S, v31.s[0] +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v27.s[0] +mul v20.4S, v20.4S,v28.s[0] +sub v14.4s, v8.4s, v2.4s +mla v12.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v18.4s, v9.4s, v12.4s +mla v20.4S, v15.4S, v31.s[0] +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +sub v15.4s, v1.4s, v20.4s +mla v0.4S, v2.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +sub v2.4s, v22.4s, v0.4s +mla v3.4S, v12.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v12.4s, v16.4s, v3.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v11.4s, v10.4s +mla v17.4S, v0.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v27.s[3] +mul v19.4S, v19.4S,v28.s[3] +sub v0.4s, v8.4s, v17.4s +mla v21.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v17.4s +sqrdmulh v17.4S, v1.4S, v25.s[0] +mul v1.4S, v1.4S,v26.s[0] +sub v3.4s, v13.4s, v21.4s +mla v19.4S, v10.4S, v31.s[0] +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v25.s[1] +mul v15.4S, v15.4S,v26.s[1] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v17.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v17.4s, v9.4s, 
v1.4s +mla v15.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +sub v21.4s, v18.4s, v15.4s +mla v16.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +str q9, [x0, #0] +sqrdmulh v9.4S, v8.4S, v23.s[0] +str q17, [x0, #64] +mul v8.4S, v8.4S,v24.s[0] +ldr q17, [x0, #784] +sub v15.4s, v22.4s, v16.4s +ldr q19, [x0, #848] +mla v12.4S, v1.4S, v31.s[0] +add v22.4s, v22.4s, v16.4s +str q18, [x0, #128] +sqrdmulh v18.4S, v0.4S, v23.s[1] +str q21, [x0, #192] +mul v0.4S, v0.4S,v24.s[1] +ldr q21, [x0, #912] +sub v16.4s, v2.4s, v12.4s +ldr q1, [x0, #976] +mla v8.4S, v9.4S, v31.s[0] +add v2.4s, v2.4s, v12.4s +str q22, [x0, #256] +sqrdmulh v22.4S, v14.4S, v23.s[2] +str q15, [x0, #320] +mul v14.4S, v14.4S,v24.s[2] +ldr q15, [x0, #272] +sub v12.4s, v11.4s, v8.4s +ldr q9, [x0, #336] +mla v0.4S, v18.4S, v31.s[0] +add v11.4s, v11.4s, v8.4s +str q2, [x0, #384] +sqrdmulh v2.4S, v10.4S, v23.s[3] +str q16, [x0, #448] +mul v10.4S, v10.4S,v24.s[3] +ldr q16, [x0, #400] +sub v8.4s, v20.4s, v0.4s +ldr q18, [x0, #464] +mla v14.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v0.4s +str q11, [x0, #512] +sqrdmulh v11.4S, v17.4S, v29.s[0] +str q12, [x0, #576] +ldr q12, [x0, #528] +mul v17.4S, v17.4S,v30.s[0] +ldr q0, [x0, #592] +sub v22.4s, v13.4s, v14.4s +mla v10.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v14.4s +str q20, [x0, #640] +sqrdmulh v20.4S, v19.4S, v29.s[0] +str q8, [x0, #704] +ldr q8, [x0, #656] +mul v19.4S, v19.4S,v30.s[0] +ldr q14, [x0, #720] +sub v2.4s, v3.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v3.4s, v3.4s, v10.4s +str q13, [x0, #768] +sqrdmulh v13.4S, v21.4S, v29.s[0] +str q22, [x0, #832] +mul v21.4S, v21.4S,v30.s[0] +ldr q22, [x0, #16] +sub v10.4s, v15.4s, v17.4s +mla v19.4S, v20.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +str q3, [x0, #896] +sqrdmulh v3.4S, v1.4S, v29.s[0] +str q2, [x0, #960] +mul v1.4S, v1.4S,v30.s[0] +ldr q2, [x0, #80] +sub v17.4s, v9.4s, v19.4s +mla v21.4S, v13.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s 
+sqrdmulh v19.4S, v12.4S, v29.s[0] +ldr q13, [x0, #144] +mul v12.4S, v12.4S,v30.s[0] +sub v20.4s, v16.4s, v21.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v29.s[0] +ldr q3, [x0, #208] +mul v0.4S, v0.4S,v30.s[0] +sub v11.4s, v18.4s, v1.4s +mla v12.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v19.4s, v22.4s, v12.4s +mla v0.4S, v21.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v2.4s, v0.4s +mla v8.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v1.4s, v13.4s, v8.4s +mla v14.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v12.4s, v3.4s, v14.4s +mla v16.4S, v0.4S, v31.s[0] +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +sub v0.4s, v13.4s, v16.4s +mla v18.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v9.4S, v29.s[1] +mul v9.4S, v9.4S,v30.s[1] +sub v8.4s, v3.4s, v18.4s +mla v15.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v14.4s, v22.4s, v15.4s +mla v9.4S, v16.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v16.4s, v2.4s, v9.4s +mla v20.4S, v18.4S, v31.s[0] +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v20.4s +mla v11.4S, v15.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v15.4s, v12.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +sub v9.4s, v19.4s, v10.4s +mla v17.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v27.s[0] +mul v3.4S, 
v3.4S,v28.s[0] +sub v20.4s, v21.4s, v17.4s +mla v13.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v11.4s, v22.4s, v13.4s +mla v3.4S, v10.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v10.4s, v2.4s, v3.4s +mla v0.4S, v17.4S, v31.s[0] +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v17.4s, v14.4s, v0.4s +mla v8.4S, v13.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +sub v13.4s, v16.4s, v8.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v3.4s, v19.4s, v1.4s +mla v12.4S, v0.4S, v31.s[0] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v0.4s, v21.4s, v12.4s +mla v18.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v25.s[1] +mul v10.4S, v10.4S,v26.s[1] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v12.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v12.4s, v22.4s, v2.4s +mla v10.4S, v18.4S, v31.s[0] +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v13.4S, v25.s[3] +mul v13.4S, v13.4S,v26.s[3] +sub v18.4s, v11.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +str q22, [x0, #16] +sqrdmulh v22.4S, v21.4S, v23.s[0] +str q12, [x0, #80] +mul v21.4S, v21.4S,v24.s[0] +sub v12.4s, v14.4s, v16.4s +mla v13.4S, v2.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +str q11, [x0, #144] +sqrdmulh v11.4S, v0.4S, v23.s[1] +str q18, [x0, #208] +mul v0.4S, v0.4S,v24.s[1] +sub v18.4s, v17.4s, v13.4s +mla v21.4S, v22.4S, v31.s[0] +add v17.4s, v17.4s, v13.4s +str q14, [x0, #272] +sqrdmulh v14.4S, v20.4S, v23.s[2] +str q12, 
[x0, #336] +mul v20.4S, v20.4S,v24.s[2] +sub v12.4s, v19.4s, v21.4s +mla v0.4S, v11.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +str q17, [x0, #400] +sqrdmulh v17.4S, v1.4S, v23.s[3] +str q18, [x0, #464] +mul v1.4S, v1.4S,v24.s[3] +sub v18.4s, v3.4s, v0.4s +mla v20.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v0.4s +str q19, [x0, #528] +str q12, [x0, #592] +sub v12.4s, v9.4s, v20.4s +mla v1.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v20.4s +str q3, [x0, #656] +str q18, [x0, #720] +sub v18.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +str q9, [x0, #784] +str q12, [x0, #848] +str q8, [x0, #912] +str q18, [x0, #976] +ldr q4, [x0, #32] +ldr q5, [x0, #48] +ldr q6, [x17, #+128] +ldr q7, [x17, #+144] +ldr q15, [x0, #0] +ldr q10, [x0, #16] +ldr q2, [x17, #+1152] +ldr q16, [x17, #+1168] +sqrdmulh v22.4S, v4.4S, v7.s[0] +ldr q13, [x0, #544] +mul v4.4S, v4.4S,v6.s[0] +ldr q11, [x0, #560] +mla v4.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v4.4s +add v15.4s, v15.4s, v4.4s +sqrdmulh v4.4S, v5.4S, v7.s[0] +ldr q21, [x0, #512] +mul v5.4S, v5.4S,v6.s[0] +ldr q14, [x0, #528] +mla v5.4S, v4.4S, v31.s[0] +sub v4.4s, v10.4s, v5.4s +add v10.4s, v10.4s, v5.4s +sqrdmulh v5.4S, v13.4S, v16.s[0] +mul v13.4S, v13.4S,v2.s[0] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v11.4S, v16.s[0] +mul v11.4S, v11.4S,v2.s[0] +mla v11.4S, v13.4S, v31.s[0] +sub v13.4s, v14.4s, v11.4s +add v14.4s, v14.4s, v11.4s +sqrdmulh v11.4S, v10.4S, v7.s[1] +mul v10.4S, v10.4S,v6.s[1] +mla v10.4S, v11.4S, v31.s[0] +sub v11.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +sqrdmulh v10.4S, v4.4S, v7.s[2] +mul v4.4S, v4.4S,v6.s[2] +mla v4.4S, v10.4S, v31.s[0] +sub v10.4s, v22.4s, v4.4s +add v22.4s, v22.4s, v4.4s +sqrdmulh v4.4S, v14.4S, v16.s[1] +mul v14.4S, v14.4S,v2.s[1] +mla v14.4S, v4.4S, v31.s[0] +sub v4.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v13.4S, v16.s[2] +mul v13.4S, v13.4S,v2.s[2] +mla v13.4S, v14.4S, v31.s[0] +sub v14.4s, v5.4s, v13.4s +add 
v5.4s, v5.4s, v13.4s +trn1 v13.4S, v15.4S, v11.4S +trn2 v0.4S, v15.4S, v11.4S +trn1 v19.4S, v22.4S, v10.4S +trn2 v17.4S, v22.4S, v10.4S +trn2 v22.2D, v13.2D, v19.2D +trn2 v10.2D, v0.2D, v17.2D +trn1 v15.2D, v13.2D, v19.2D +trn1 v11.2D, v0.2D, v17.2D +ldr q17, [x17, #+160] +ldr q0, [x17, #+176] +trn1 v19.4S, v21.4S, v4.4S +trn2 v13.4S, v21.4S, v4.4S +trn1 v20.4S, v5.4S, v14.4S +trn2 v3.4S, v5.4S, v14.4S +trn2 v5.2D, v19.2D, v20.2D +trn2 v14.2D, v13.2D, v3.2D +trn1 v21.2D, v19.2D, v20.2D +trn1 v4.2D, v13.2D, v3.2D +ldr q3, [x17, #+1184] +ldr q13, [x17, #+1200] +sqrdmulh v20.4S, v22.4S, v0.4S +mul v22.4S, v22.4S,v17.4S +mla v22.4S, v20.4S, v31.s[0] +sub v20.4s, v15.4s, v22.4s +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v10.4S, v0.4S +mul v10.4S, v10.4S,v17.4S +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v10.4s +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v5.4S, v13.4S +mul v5.4S, v5.4S,v3.4S +mla v5.4S, v10.4S, v31.s[0] +sub v10.4s, v21.4s, v5.4s +add v21.4s, v21.4s, v5.4s +ldr q5, [x17, #+192] +ldr q19, [x17, #+208] +sqrdmulh v1.4S, v14.4S, v13.4S +mul v14.4S, v14.4S,v3.4S +mla v14.4S, v1.4S, v31.s[0] +sub v1.4s, v4.4s, v14.4s +add v4.4s, v4.4s, v14.4s +ldr q14, [x17, #+224] +ldr q9, [x17, #+240] +sqrdmulh v12.4S, v11.4S, v19.4S +mul v11.4S, v11.4S,v5.4S +mla v11.4S, v12.4S, v31.s[0] +sub v12.4s, v15.4s, v11.4s +add v15.4s, v15.4s, v11.4s +ldr q11, [x17, #+1216] +ldr q8, [x17, #+1232] +sqrdmulh v18.4S, v22.4S, v9.4S +mul v22.4S, v22.4S,v14.4S +mla v22.4S, v18.4S, v31.s[0] +sub v18.4s, v20.4s, v22.4s +add v20.4s, v20.4s, v22.4s +ldr q22, [x17, #+1248] +ldr q30, [x17, #+1264] +sqrdmulh v29.4S, v4.4S, v8.4S +ldr q28, [x0, #96] +mul v4.4S, v4.4S,v11.4S +mla v4.4S, v29.4S, v31.s[0] +sub v29.4s, v21.4s, v4.4s +add v21.4s, v21.4s, v4.4s +sqrdmulh v4.4S, v1.4S, v30.4S +ldr q27, [x0, #112] +mul v1.4S, v1.4S,v22.4S +mla v1.4S, v4.4S, v31.s[0] +sub v4.4s, v10.4s, v1.4s +add v10.4s, v10.4s, v1.4s +str q15, [x0, #0] +str q12, [x0, #16] +str q20, [x0, #32] +str q18, [x0, 
#48] +str q21, [x0, #512] +str q29, [x0, #528] +str q10, [x0, #544] +str q4, [x0, #560] +ldr q30, [x17, #+256] +ldr q22, [x17, #+272] +ldr q8, [x0, #64] +ldr q11, [x0, #80] +ldr q13, [x17, #+1280] +ldr q3, [x17, #+1296] +sqrdmulh v16.4S, v28.4S, v22.s[0] +ldr q2, [x0, #608] +mul v28.4S, v28.4S,v30.s[0] +ldr q4, [x0, #624] +mla v28.4S, v16.4S, v31.s[0] +sub v16.4s, v8.4s, v28.4s +add v8.4s, v8.4s, v28.4s +sqrdmulh v28.4S, v27.4S, v22.s[0] +ldr q10, [x0, #576] +mul v27.4S, v27.4S,v30.s[0] +ldr q29, [x0, #592] +mla v27.4S, v28.4S, v31.s[0] +sub v28.4s, v11.4s, v27.4s +add v11.4s, v11.4s, v27.4s +sqrdmulh v27.4S, v2.4S, v3.s[0] +mul v2.4S, v2.4S,v13.s[0] +mla v2.4S, v27.4S, v31.s[0] +sub v27.4s, v10.4s, v2.4s +add v10.4s, v10.4s, v2.4s +sqrdmulh v2.4S, v4.4S, v3.s[0] +mul v4.4S, v4.4S,v13.s[0] +mla v4.4S, v2.4S, v31.s[0] +sub v2.4s, v29.4s, v4.4s +add v29.4s, v29.4s, v4.4s +sqrdmulh v4.4S, v11.4S, v22.s[1] +mul v11.4S, v11.4S,v30.s[1] +mla v11.4S, v4.4S, v31.s[0] +sub v4.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +sqrdmulh v11.4S, v28.4S, v22.s[2] +mul v28.4S, v28.4S,v30.s[2] +mla v28.4S, v11.4S, v31.s[0] +sub v11.4s, v16.4s, v28.4s +add v16.4s, v16.4s, v28.4s +sqrdmulh v28.4S, v29.4S, v3.s[1] +mul v29.4S, v29.4S,v13.s[1] +mla v29.4S, v28.4S, v31.s[0] +sub v28.4s, v10.4s, v29.4s +add v10.4s, v10.4s, v29.4s +sqrdmulh v29.4S, v2.4S, v3.s[2] +mul v2.4S, v2.4S,v13.s[2] +mla v2.4S, v29.4S, v31.s[0] +sub v29.4s, v27.4s, v2.4s +add v27.4s, v27.4s, v2.4s +trn1 v2.4S, v8.4S, v4.4S +trn2 v21.4S, v8.4S, v4.4S +trn1 v9.4S, v16.4S, v11.4S +trn2 v14.4S, v16.4S, v11.4S +trn2 v16.2D, v2.2D, v9.2D +trn2 v11.2D, v21.2D, v14.2D +trn1 v8.2D, v2.2D, v9.2D +trn1 v4.2D, v21.2D, v14.2D +ldr q14, [x17, #+288] +ldr q21, [x17, #+304] +trn1 v9.4S, v10.4S, v28.4S +trn2 v2.4S, v10.4S, v28.4S +trn1 v19.4S, v27.4S, v29.4S +trn2 v5.4S, v27.4S, v29.4S +trn2 v27.2D, v9.2D, v19.2D +trn2 v29.2D, v2.2D, v5.2D +trn1 v10.2D, v9.2D, v19.2D +trn1 v28.2D, v2.2D, v5.2D +ldr q5, [x17, #+1312] +ldr q2, [x17, 
#+1328] +sqrdmulh v19.4S, v16.4S, v21.4S +mul v16.4S, v16.4S,v14.4S +mla v16.4S, v19.4S, v31.s[0] +sub v19.4s, v8.4s, v16.4s +add v8.4s, v8.4s, v16.4s +sqrdmulh v16.4S, v11.4S, v21.4S +mul v11.4S, v11.4S,v14.4S +mla v11.4S, v16.4S, v31.s[0] +sub v16.4s, v4.4s, v11.4s +add v4.4s, v4.4s, v11.4s +sqrdmulh v11.4S, v27.4S, v2.4S +mul v27.4S, v27.4S,v5.4S +mla v27.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v27.4s +add v10.4s, v10.4s, v27.4s +ldr q27, [x17, #+320] +ldr q9, [x17, #+336] +sqrdmulh v0.4S, v29.4S, v2.4S +mul v29.4S, v29.4S,v5.4S +mla v29.4S, v0.4S, v31.s[0] +sub v0.4s, v28.4s, v29.4s +add v28.4s, v28.4s, v29.4s +ldr q29, [x17, #+352] +ldr q17, [x17, #+368] +sqrdmulh v7.4S, v4.4S, v9.4S +mul v4.4S, v4.4S,v27.4S +mla v4.4S, v7.4S, v31.s[0] +sub v7.4s, v8.4s, v4.4s +add v8.4s, v8.4s, v4.4s +ldr q4, [x17, #+1344] +ldr q6, [x17, #+1360] +sqrdmulh v18.4S, v16.4S, v17.4S +mul v16.4S, v16.4S,v29.4S +mla v16.4S, v18.4S, v31.s[0] +sub v18.4s, v19.4s, v16.4s +add v19.4s, v19.4s, v16.4s +ldr q16, [x17, #+1376] +ldr q20, [x17, #+1392] +sqrdmulh v12.4S, v28.4S, v6.4S +ldr q15, [x0, #160] +mul v28.4S, v28.4S,v4.4S +mla v28.4S, v12.4S, v31.s[0] +sub v12.4s, v10.4s, v28.4s +add v10.4s, v10.4s, v28.4s +sqrdmulh v28.4S, v0.4S, v20.4S +ldr q1, [x0, #176] +mul v0.4S, v0.4S,v16.4S +mla v0.4S, v28.4S, v31.s[0] +sub v28.4s, v11.4s, v0.4s +add v11.4s, v11.4s, v0.4s +str q8, [x0, #64] +str q7, [x0, #80] +str q19, [x0, #96] +str q18, [x0, #112] +str q10, [x0, #576] +str q12, [x0, #592] +str q11, [x0, #608] +str q28, [x0, #624] +ldr q20, [x17, #+384] +ldr q16, [x17, #+400] +ldr q6, [x0, #128] +ldr q4, [x0, #144] +ldr q2, [x17, #+1408] +ldr q5, [x17, #+1424] +sqrdmulh v3.4S, v15.4S, v16.s[0] +ldr q13, [x0, #672] +mul v15.4S, v15.4S,v20.s[0] +ldr q28, [x0, #688] +mla v15.4S, v3.4S, v31.s[0] +sub v3.4s, v6.4s, v15.4s +add v6.4s, v6.4s, v15.4s +sqrdmulh v15.4S, v1.4S, v16.s[0] +ldr q11, [x0, #640] +mul v1.4S, v1.4S,v20.s[0] +ldr q12, [x0, #656] +mla v1.4S, v15.4S, v31.s[0] +sub v15.4s, 
v4.4s, v1.4s +add v4.4s, v4.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v2.s[0] +mla v13.4S, v1.4S, v31.s[0] +sub v1.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v28.4S, v5.s[0] +mul v28.4S, v28.4S,v2.s[0] +mla v28.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v28.4s +add v12.4s, v12.4s, v28.4s +sqrdmulh v28.4S, v4.4S, v16.s[1] +mul v4.4S, v4.4S,v20.s[1] +mla v4.4S, v28.4S, v31.s[0] +sub v28.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +sqrdmulh v4.4S, v15.4S, v16.s[2] +mul v15.4S, v15.4S,v20.s[2] +mla v15.4S, v4.4S, v31.s[0] +sub v4.4s, v3.4s, v15.4s +add v3.4s, v3.4s, v15.4s +sqrdmulh v15.4S, v12.4S, v5.s[1] +mul v12.4S, v12.4S,v2.s[1] +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +sqrdmulh v12.4S, v13.4S, v5.s[2] +mul v13.4S, v13.4S,v2.s[2] +mla v13.4S, v12.4S, v31.s[0] +sub v12.4s, v1.4s, v13.4s +add v1.4s, v1.4s, v13.4s +trn1 v13.4S, v6.4S, v28.4S +trn2 v10.4S, v6.4S, v28.4S +trn1 v17.4S, v3.4S, v4.4S +trn2 v29.4S, v3.4S, v4.4S +trn2 v3.2D, v13.2D, v17.2D +trn2 v4.2D, v10.2D, v29.2D +trn1 v6.2D, v13.2D, v17.2D +trn1 v28.2D, v10.2D, v29.2D +ldr q29, [x17, #+416] +ldr q10, [x17, #+432] +trn1 v17.4S, v11.4S, v15.4S +trn2 v13.4S, v11.4S, v15.4S +trn1 v9.4S, v1.4S, v12.4S +trn2 v27.4S, v1.4S, v12.4S +trn2 v1.2D, v17.2D, v9.2D +trn2 v12.2D, v13.2D, v27.2D +trn1 v11.2D, v17.2D, v9.2D +trn1 v15.2D, v13.2D, v27.2D +ldr q27, [x17, #+1440] +ldr q13, [x17, #+1456] +sqrdmulh v9.4S, v3.4S, v10.4S +mul v3.4S, v3.4S,v29.4S +mla v3.4S, v9.4S, v31.s[0] +sub v9.4s, v6.4s, v3.4s +add v6.4s, v6.4s, v3.4s +sqrdmulh v3.4S, v4.4S, v10.4S +mul v4.4S, v4.4S,v29.4S +mla v4.4S, v3.4S, v31.s[0] +sub v3.4s, v28.4s, v4.4s +add v28.4s, v28.4s, v4.4s +sqrdmulh v4.4S, v1.4S, v13.4S +mul v1.4S, v1.4S,v27.4S +mla v1.4S, v4.4S, v31.s[0] +sub v4.4s, v11.4s, v1.4s +add v11.4s, v11.4s, v1.4s +ldr q1, [x17, #+448] +ldr q17, [x17, #+464] +sqrdmulh v21.4S, v12.4S, v13.4S +mul v12.4S, v12.4S,v27.4S +mla v12.4S, v21.4S, v31.s[0] +sub 
v21.4s, v15.4s, v12.4s +add v15.4s, v15.4s, v12.4s +ldr q12, [x17, #+480] +ldr q14, [x17, #+496] +sqrdmulh v22.4S, v28.4S, v17.4S +mul v28.4S, v28.4S,v1.4S +mla v28.4S, v22.4S, v31.s[0] +sub v22.4s, v6.4s, v28.4s +add v6.4s, v6.4s, v28.4s +ldr q28, [x17, #+1472] +ldr q30, [x17, #+1488] +sqrdmulh v18.4S, v3.4S, v14.4S +mul v3.4S, v3.4S,v12.4S +mla v3.4S, v18.4S, v31.s[0] +sub v18.4s, v9.4s, v3.4s +add v9.4s, v9.4s, v3.4s +ldr q3, [x17, #+1504] +ldr q19, [x17, #+1520] +sqrdmulh v7.4S, v15.4S, v30.4S +ldr q8, [x0, #224] +mul v15.4S, v15.4S,v28.4S +mla v15.4S, v7.4S, v31.s[0] +sub v7.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v21.4S, v19.4S +ldr q0, [x0, #240] +mul v21.4S, v21.4S,v3.4S +mla v21.4S, v15.4S, v31.s[0] +sub v15.4s, v4.4s, v21.4s +add v4.4s, v4.4s, v21.4s +str q6, [x0, #128] +str q22, [x0, #144] +str q9, [x0, #160] +str q18, [x0, #176] +str q11, [x0, #640] +str q7, [x0, #656] +str q4, [x0, #672] +str q15, [x0, #688] +ldr q19, [x17, #+512] +ldr q3, [x17, #+528] +ldr q30, [x0, #192] +ldr q28, [x0, #208] +ldr q13, [x17, #+1536] +ldr q27, [x17, #+1552] +sqrdmulh v5.4S, v8.4S, v3.s[0] +ldr q2, [x0, #736] +mul v8.4S, v8.4S,v19.s[0] +ldr q15, [x0, #752] +mla v8.4S, v5.4S, v31.s[0] +sub v5.4s, v30.4s, v8.4s +add v30.4s, v30.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v3.s[0] +ldr q4, [x0, #704] +mul v0.4S, v0.4S,v19.s[0] +ldr q7, [x0, #720] +mla v0.4S, v8.4S, v31.s[0] +sub v8.4s, v28.4s, v0.4s +add v28.4s, v28.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v13.s[0] +mla v2.4S, v0.4S, v31.s[0] +sub v0.4s, v4.4s, v2.4s +add v4.4s, v4.4s, v2.4s +sqrdmulh v2.4S, v15.4S, v27.s[0] +mul v15.4S, v15.4S,v13.s[0] +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v7.4s, v15.4s +add v7.4s, v7.4s, v15.4s +sqrdmulh v15.4S, v28.4S, v3.s[1] +mul v28.4S, v28.4S,v19.s[1] +mla v28.4S, v15.4S, v31.s[0] +sub v15.4s, v30.4s, v28.4s +add v30.4s, v30.4s, v28.4s +sqrdmulh v28.4S, v8.4S, v3.s[2] +mul v8.4S, v8.4S,v19.s[2] +mla v8.4S, v28.4S, v31.s[0] +sub v28.4s, v5.4s, 
v8.4s +add v5.4s, v5.4s, v8.4s +sqrdmulh v8.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v13.s[1] +mla v7.4S, v8.4S, v31.s[0] +sub v8.4s, v4.4s, v7.4s +add v4.4s, v4.4s, v7.4s +sqrdmulh v7.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v13.s[2] +mla v2.4S, v7.4S, v31.s[0] +sub v7.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +trn1 v2.4S, v30.4S, v15.4S +trn2 v11.4S, v30.4S, v15.4S +trn1 v14.4S, v5.4S, v28.4S +trn2 v12.4S, v5.4S, v28.4S +trn2 v5.2D, v2.2D, v14.2D +trn2 v28.2D, v11.2D, v12.2D +trn1 v30.2D, v2.2D, v14.2D +trn1 v15.2D, v11.2D, v12.2D +ldr q12, [x17, #+544] +ldr q11, [x17, #+560] +trn1 v14.4S, v4.4S, v8.4S +trn2 v2.4S, v4.4S, v8.4S +trn1 v17.4S, v0.4S, v7.4S +trn2 v1.4S, v0.4S, v7.4S +trn2 v0.2D, v14.2D, v17.2D +trn2 v7.2D, v2.2D, v1.2D +trn1 v4.2D, v14.2D, v17.2D +trn1 v8.2D, v2.2D, v1.2D +ldr q1, [x17, #+1568] +ldr q2, [x17, #+1584] +sqrdmulh v17.4S, v5.4S, v11.4S +mul v5.4S, v5.4S,v12.4S +mla v5.4S, v17.4S, v31.s[0] +sub v17.4s, v30.4s, v5.4s +add v30.4s, v30.4s, v5.4s +sqrdmulh v5.4S, v28.4S, v11.4S +mul v28.4S, v28.4S,v12.4S +mla v28.4S, v5.4S, v31.s[0] +sub v5.4s, v15.4s, v28.4s +add v15.4s, v15.4s, v28.4s +sqrdmulh v28.4S, v0.4S, v2.4S +mul v0.4S, v0.4S,v1.4S +mla v0.4S, v28.4S, v31.s[0] +sub v28.4s, v4.4s, v0.4s +add v4.4s, v4.4s, v0.4s +ldr q0, [x17, #+576] +ldr q14, [x17, #+592] +sqrdmulh v10.4S, v7.4S, v2.4S +mul v7.4S, v7.4S,v1.4S +mla v7.4S, v10.4S, v31.s[0] +sub v10.4s, v8.4s, v7.4s +add v8.4s, v8.4s, v7.4s +ldr q7, [x17, #+608] +ldr q29, [x17, #+624] +sqrdmulh v16.4S, v15.4S, v14.4S +mul v15.4S, v15.4S,v0.4S +mla v15.4S, v16.4S, v31.s[0] +sub v16.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +ldr q15, [x17, #+1600] +ldr q20, [x17, #+1616] +sqrdmulh v18.4S, v5.4S, v29.4S +mul v5.4S, v5.4S,v7.4S +mla v5.4S, v18.4S, v31.s[0] +sub v18.4s, v17.4s, v5.4s +add v17.4s, v17.4s, v5.4s +ldr q5, [x17, #+1632] +ldr q9, [x17, #+1648] +sqrdmulh v22.4S, v8.4S, v20.4S +ldr q6, [x0, #288] +mul v8.4S, v8.4S,v15.4S +mla v8.4S, v22.4S, v31.s[0] +sub v22.4s, v4.4s, v8.4s +add 
v4.4s, v4.4s, v8.4s +sqrdmulh v8.4S, v10.4S, v9.4S +ldr q21, [x0, #304] +mul v10.4S, v10.4S,v5.4S +mla v10.4S, v8.4S, v31.s[0] +sub v8.4s, v28.4s, v10.4s +add v28.4s, v28.4s, v10.4s +str q30, [x0, #192] +str q16, [x0, #208] +str q17, [x0, #224] +str q18, [x0, #240] +str q4, [x0, #704] +str q22, [x0, #720] +str q28, [x0, #736] +str q8, [x0, #752] +ldr q9, [x17, #+640] +ldr q5, [x17, #+656] +ldr q20, [x0, #256] +ldr q15, [x0, #272] +ldr q2, [x17, #+1664] +ldr q1, [x17, #+1680] +sqrdmulh v27.4S, v6.4S, v5.s[0] +ldr q13, [x0, #800] +mul v6.4S, v6.4S,v9.s[0] +ldr q8, [x0, #816] +mla v6.4S, v27.4S, v31.s[0] +sub v27.4s, v20.4s, v6.4s +add v20.4s, v20.4s, v6.4s +sqrdmulh v6.4S, v21.4S, v5.s[0] +ldr q28, [x0, #768] +mul v21.4S, v21.4S,v9.s[0] +ldr q22, [x0, #784] +mla v21.4S, v6.4S, v31.s[0] +sub v6.4s, v15.4s, v21.4s +add v15.4s, v15.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v1.s[0] +mul v13.4S, v13.4S,v2.s[0] +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v28.4s, v13.4s +add v28.4s, v28.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v1.s[0] +mul v8.4S, v8.4S,v2.s[0] +mla v8.4S, v13.4S, v31.s[0] +sub v13.4s, v22.4s, v8.4s +add v22.4s, v22.4s, v8.4s +sqrdmulh v8.4S, v15.4S, v5.s[1] +mul v15.4S, v15.4S,v9.s[1] +mla v15.4S, v8.4S, v31.s[0] +sub v8.4s, v20.4s, v15.4s +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v6.4S, v5.s[2] +mul v6.4S, v6.4S,v9.s[2] +mla v6.4S, v15.4S, v31.s[0] +sub v15.4s, v27.4s, v6.4s +add v27.4s, v27.4s, v6.4s +sqrdmulh v6.4S, v22.4S, v1.s[1] +mul v22.4S, v22.4S,v2.s[1] +mla v22.4S, v6.4S, v31.s[0] +sub v6.4s, v28.4s, v22.4s +add v28.4s, v28.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v1.s[2] +mul v13.4S, v13.4S,v2.s[2] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +trn1 v13.4S, v20.4S, v8.4S +trn2 v4.4S, v20.4S, v8.4S +trn1 v29.4S, v27.4S, v15.4S +trn2 v7.4S, v27.4S, v15.4S +trn2 v27.2D, v13.2D, v29.2D +trn2 v15.2D, v4.2D, v7.2D +trn1 v20.2D, v13.2D, v29.2D +trn1 v8.2D, v4.2D, v7.2D +ldr q7, [x17, #+672] +ldr q4, [x17, #+688] +trn1 
v29.4S, v28.4S, v6.4S +trn2 v13.4S, v28.4S, v6.4S +trn1 v14.4S, v21.4S, v22.4S +trn2 v0.4S, v21.4S, v22.4S +trn2 v21.2D, v29.2D, v14.2D +trn2 v22.2D, v13.2D, v0.2D +trn1 v28.2D, v29.2D, v14.2D +trn1 v6.2D, v13.2D, v0.2D +ldr q0, [x17, #+1696] +ldr q13, [x17, #+1712] +sqrdmulh v14.4S, v27.4S, v4.4S +mul v27.4S, v27.4S,v7.4S +mla v27.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v27.4s +add v20.4s, v20.4s, v27.4s +sqrdmulh v27.4S, v15.4S, v4.4S +mul v15.4S, v15.4S,v7.4S +mla v15.4S, v27.4S, v31.s[0] +sub v27.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +sqrdmulh v15.4S, v21.4S, v13.4S +mul v21.4S, v21.4S,v0.4S +mla v21.4S, v15.4S, v31.s[0] +sub v15.4s, v28.4s, v21.4s +add v28.4s, v28.4s, v21.4s +ldr q21, [x17, #+704] +ldr q29, [x17, #+720] +sqrdmulh v11.4S, v22.4S, v13.4S +mul v22.4S, v22.4S,v0.4S +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v6.4s, v22.4s +add v6.4s, v6.4s, v22.4s +ldr q22, [x17, #+736] +ldr q12, [x17, #+752] +sqrdmulh v3.4S, v8.4S, v29.4S +mul v8.4S, v8.4S,v21.4S +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v8.4s +add v20.4s, v20.4s, v8.4s +ldr q8, [x17, #+1728] +ldr q19, [x17, #+1744] +sqrdmulh v18.4S, v27.4S, v12.4S +mul v27.4S, v27.4S,v22.4S +mla v27.4S, v18.4S, v31.s[0] +sub v18.4s, v14.4s, v27.4s +add v14.4s, v14.4s, v27.4s +ldr q27, [x17, #+1760] +ldr q17, [x17, #+1776] +sqrdmulh v16.4S, v6.4S, v19.4S +ldr q30, [x0, #352] +mul v6.4S, v6.4S,v8.4S +mla v6.4S, v16.4S, v31.s[0] +sub v16.4s, v28.4s, v6.4s +add v28.4s, v28.4s, v6.4s +sqrdmulh v6.4S, v11.4S, v17.4S +ldr q10, [x0, #368] +mul v11.4S, v11.4S,v27.4S +mla v11.4S, v6.4S, v31.s[0] +sub v6.4s, v15.4s, v11.4s +add v15.4s, v15.4s, v11.4s +str q20, [x0, #256] +str q3, [x0, #272] +str q14, [x0, #288] +str q18, [x0, #304] +str q28, [x0, #768] +str q16, [x0, #784] +str q15, [x0, #800] +str q6, [x0, #816] +ldr q17, [x17, #+768] +ldr q27, [x17, #+784] +ldr q19, [x0, #320] +ldr q8, [x0, #336] +ldr q13, [x17, #+1792] +ldr q0, [x17, #+1808] +sqrdmulh v1.4S, v30.4S, v27.s[0] +ldr q2, [x0, #864] +mul v30.4S, 
v30.4S,v17.s[0] +ldr q6, [x0, #880] +mla v30.4S, v1.4S, v31.s[0] +sub v1.4s, v19.4s, v30.4s +add v19.4s, v19.4s, v30.4s +sqrdmulh v30.4S, v10.4S, v27.s[0] +ldr q15, [x0, #832] +mul v10.4S, v10.4S,v17.s[0] +ldr q16, [x0, #848] +mla v10.4S, v30.4S, v31.s[0] +sub v30.4s, v8.4s, v10.4s +add v8.4s, v8.4s, v10.4s +sqrdmulh v10.4S, v2.4S, v0.s[0] +mul v2.4S, v2.4S,v13.s[0] +mla v2.4S, v10.4S, v31.s[0] +sub v10.4s, v15.4s, v2.4s +add v15.4s, v15.4s, v2.4s +sqrdmulh v2.4S, v6.4S, v0.s[0] +mul v6.4S, v6.4S,v13.s[0] +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v16.4s, v6.4s +add v16.4s, v16.4s, v6.4s +sqrdmulh v6.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v17.s[1] +mla v8.4S, v6.4S, v31.s[0] +sub v6.4s, v19.4s, v8.4s +add v19.4s, v19.4s, v8.4s +sqrdmulh v8.4S, v30.4S, v27.s[2] +mul v30.4S, v30.4S,v17.s[2] +mla v30.4S, v8.4S, v31.s[0] +sub v8.4s, v1.4s, v30.4s +add v1.4s, v1.4s, v30.4s +sqrdmulh v30.4S, v16.4S, v0.s[1] +mul v16.4S, v16.4S,v13.s[1] +mla v16.4S, v30.4S, v31.s[0] +sub v30.4s, v15.4s, v16.4s +add v15.4s, v15.4s, v16.4s +sqrdmulh v16.4S, v2.4S, v0.s[2] +mul v2.4S, v2.4S,v13.s[2] +mla v2.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v2.4s +add v10.4s, v10.4s, v2.4s +trn1 v2.4S, v19.4S, v6.4S +trn2 v28.4S, v19.4S, v6.4S +trn1 v12.4S, v1.4S, v8.4S +trn2 v22.4S, v1.4S, v8.4S +trn2 v1.2D, v2.2D, v12.2D +trn2 v8.2D, v28.2D, v22.2D +trn1 v19.2D, v2.2D, v12.2D +trn1 v6.2D, v28.2D, v22.2D +ldr q22, [x17, #+800] +ldr q28, [x17, #+816] +trn1 v12.4S, v15.4S, v30.4S +trn2 v2.4S, v15.4S, v30.4S +trn1 v29.4S, v10.4S, v16.4S +trn2 v21.4S, v10.4S, v16.4S +trn2 v10.2D, v12.2D, v29.2D +trn2 v16.2D, v2.2D, v21.2D +trn1 v15.2D, v12.2D, v29.2D +trn1 v30.2D, v2.2D, v21.2D +ldr q21, [x17, #+1824] +ldr q2, [x17, #+1840] +sqrdmulh v29.4S, v1.4S, v28.4S +mul v1.4S, v1.4S,v22.4S +mla v1.4S, v29.4S, v31.s[0] +sub v29.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v28.4S +mul v8.4S, v8.4S,v22.4S +mla v8.4S, v1.4S, v31.s[0] +sub v1.4s, v6.4s, v8.4s +add v6.4s, v6.4s, v8.4s +sqrdmulh 
v8.4S, v10.4S, v2.4S +mul v10.4S, v10.4S,v21.4S +mla v10.4S, v8.4S, v31.s[0] +sub v8.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +ldr q10, [x17, #+832] +ldr q12, [x17, #+848] +sqrdmulh v4.4S, v16.4S, v2.4S +mul v16.4S, v16.4S,v21.4S +mla v16.4S, v4.4S, v31.s[0] +sub v4.4s, v30.4s, v16.4s +add v30.4s, v30.4s, v16.4s +ldr q16, [x17, #+864] +ldr q7, [x17, #+880] +sqrdmulh v5.4S, v6.4S, v12.4S +mul v6.4S, v6.4S,v10.4S +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v19.4s, v6.4s +add v19.4s, v19.4s, v6.4s +ldr q6, [x17, #+1856] +ldr q9, [x17, #+1872] +sqrdmulh v18.4S, v1.4S, v7.4S +mul v1.4S, v1.4S,v16.4S +mla v1.4S, v18.4S, v31.s[0] +sub v18.4s, v29.4s, v1.4s +add v29.4s, v29.4s, v1.4s +ldr q1, [x17, #+1888] +ldr q14, [x17, #+1904] +sqrdmulh v3.4S, v30.4S, v9.4S +ldr q20, [x0, #416] +mul v30.4S, v30.4S,v6.4S +mla v30.4S, v3.4S, v31.s[0] +sub v3.4s, v15.4s, v30.4s +add v15.4s, v15.4s, v30.4s +sqrdmulh v30.4S, v4.4S, v14.4S +ldr q11, [x0, #432] +mul v4.4S, v4.4S,v1.4S +mla v4.4S, v30.4S, v31.s[0] +sub v30.4s, v8.4s, v4.4s +add v8.4s, v8.4s, v4.4s +str q19, [x0, #320] +str q5, [x0, #336] +str q29, [x0, #352] +str q18, [x0, #368] +str q15, [x0, #832] +str q3, [x0, #848] +str q8, [x0, #864] +str q30, [x0, #880] +ldr q14, [x17, #+896] +ldr q1, [x17, #+912] +ldr q9, [x0, #384] +ldr q6, [x0, #400] +ldr q2, [x17, #+1920] +ldr q21, [x17, #+1936] +sqrdmulh v0.4S, v20.4S, v1.s[0] +ldr q13, [x0, #928] +mul v20.4S, v20.4S,v14.s[0] +ldr q30, [x0, #944] +mla v20.4S, v0.4S, v31.s[0] +sub v0.4s, v9.4s, v20.4s +add v9.4s, v9.4s, v20.4s +sqrdmulh v20.4S, v11.4S, v1.s[0] +ldr q8, [x0, #896] +mul v11.4S, v11.4S,v14.s[0] +ldr q3, [x0, #912] +mla v11.4S, v20.4S, v31.s[0] +sub v20.4s, v6.4s, v11.4s +add v6.4s, v6.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v21.s[0] +mul v13.4S, v13.4S,v2.s[0] +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +sqrdmulh v13.4S, v30.4S, v21.s[0] +mul v30.4S, v30.4S,v2.s[0] +mla v30.4S, v13.4S, v31.s[0] +sub v13.4s, v3.4s, v30.4s +add v3.4s, 
v3.4s, v30.4s +sqrdmulh v30.4S, v6.4S, v1.s[1] +mul v6.4S, v6.4S,v14.s[1] +mla v6.4S, v30.4S, v31.s[0] +sub v30.4s, v9.4s, v6.4s +add v9.4s, v9.4s, v6.4s +sqrdmulh v6.4S, v20.4S, v1.s[2] +mul v20.4S, v20.4S,v14.s[2] +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v20.4s +add v0.4s, v0.4s, v20.4s +sqrdmulh v20.4S, v3.4S, v21.s[1] +mul v3.4S, v3.4S,v2.s[1] +mla v3.4S, v20.4S, v31.s[0] +sub v20.4s, v8.4s, v3.4s +add v8.4s, v8.4s, v3.4s +sqrdmulh v3.4S, v13.4S, v21.s[2] +mul v13.4S, v13.4S,v2.s[2] +mla v13.4S, v3.4S, v31.s[0] +sub v3.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +trn1 v13.4S, v9.4S, v30.4S +trn2 v15.4S, v9.4S, v30.4S +trn1 v7.4S, v0.4S, v6.4S +trn2 v16.4S, v0.4S, v6.4S +trn2 v0.2D, v13.2D, v7.2D +trn2 v6.2D, v15.2D, v16.2D +trn1 v9.2D, v13.2D, v7.2D +trn1 v30.2D, v15.2D, v16.2D +ldr q16, [x17, #+928] +ldr q15, [x17, #+944] +trn1 v7.4S, v8.4S, v20.4S +trn2 v13.4S, v8.4S, v20.4S +trn1 v12.4S, v11.4S, v3.4S +trn2 v10.4S, v11.4S, v3.4S +trn2 v11.2D, v7.2D, v12.2D +trn2 v3.2D, v13.2D, v10.2D +trn1 v8.2D, v7.2D, v12.2D +trn1 v20.2D, v13.2D, v10.2D +ldr q10, [x17, #+1952] +ldr q13, [x17, #+1968] +sqrdmulh v12.4S, v0.4S, v15.4S +mul v0.4S, v0.4S,v16.4S +mla v0.4S, v12.4S, v31.s[0] +sub v12.4s, v9.4s, v0.4s +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v6.4S, v15.4S +mul v6.4S, v6.4S,v16.4S +mla v6.4S, v0.4S, v31.s[0] +sub v0.4s, v30.4s, v6.4s +add v30.4s, v30.4s, v6.4s +sqrdmulh v6.4S, v11.4S, v13.4S +mul v11.4S, v11.4S,v10.4S +mla v11.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +ldr q11, [x17, #+960] +ldr q7, [x17, #+976] +sqrdmulh v28.4S, v3.4S, v13.4S +mul v3.4S, v3.4S,v10.4S +mla v3.4S, v28.4S, v31.s[0] +sub v28.4s, v20.4s, v3.4s +add v20.4s, v20.4s, v3.4s +ldr q3, [x17, #+992] +ldr q22, [x17, #+1008] +sqrdmulh v27.4S, v30.4S, v7.4S +mul v30.4S, v30.4S,v11.4S +mla v30.4S, v27.4S, v31.s[0] +sub v27.4s, v9.4s, v30.4s +add v9.4s, v9.4s, v30.4s +ldr q30, [x17, #+1984] +ldr q17, [x17, #+2000] +sqrdmulh v18.4S, v0.4S, v22.4S +mul v0.4S, 
v0.4S,v3.4S +mla v0.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v0.4s +add v12.4s, v12.4s, v0.4s +ldr q0, [x17, #+2016] +ldr q29, [x17, #+2032] +sqrdmulh v5.4S, v20.4S, v17.4S +ldr q19, [x0, #480] +mul v20.4S, v20.4S,v30.4S +mla v20.4S, v5.4S, v31.s[0] +sub v5.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +sqrdmulh v20.4S, v28.4S, v29.4S +ldr q4, [x0, #496] +mul v28.4S, v28.4S,v0.4S +mla v28.4S, v20.4S, v31.s[0] +sub v20.4s, v6.4s, v28.4s +add v6.4s, v6.4s, v28.4s +str q9, [x0, #384] +str q27, [x0, #400] +str q12, [x0, #416] +str q18, [x0, #432] +str q8, [x0, #896] +str q5, [x0, #912] +str q6, [x0, #928] +str q20, [x0, #944] +ldr q29, [x17, #+1024] +ldr q0, [x17, #+1040] +ldr q17, [x0, #448] +ldr q30, [x0, #464] +ldr q13, [x17, #+2048] +ldr q10, [x17, #+2064] +sqrdmulh v21.4S, v19.4S, v0.s[0] +ldr q2, [x0, #992] +mul v19.4S, v19.4S,v29.s[0] +ldr q20, [x0, #1008] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v17.4s, v19.4s +add v17.4s, v17.4s, v19.4s +sqrdmulh v19.4S, v4.4S, v0.s[0] +ldr q6, [x0, #960] +mul v4.4S, v4.4S,v29.s[0] +ldr q5, [x0, #976] +mla v4.4S, v19.4S, v31.s[0] +sub v19.4s, v30.4s, v4.4s +add v30.4s, v30.4s, v4.4s +sqrdmulh v4.4S, v2.4S, v10.s[0] +mul v2.4S, v2.4S,v13.s[0] +mla v2.4S, v4.4S, v31.s[0] +sub v4.4s, v6.4s, v2.4s +add v6.4s, v6.4s, v2.4s +sqrdmulh v2.4S, v20.4S, v10.s[0] +mul v20.4S, v20.4S,v13.s[0] +mla v20.4S, v2.4S, v31.s[0] +sub v2.4s, v5.4s, v20.4s +add v5.4s, v5.4s, v20.4s +sqrdmulh v20.4S, v30.4S, v0.s[1] +mul v30.4S, v30.4S,v29.s[1] +mla v30.4S, v20.4S, v31.s[0] +sub v20.4s, v17.4s, v30.4s +add v17.4s, v17.4s, v30.4s +sqrdmulh v30.4S, v19.4S, v0.s[2] +mul v19.4S, v19.4S,v29.s[2] +mla v19.4S, v30.4S, v31.s[0] +sub v30.4s, v21.4s, v19.4s +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v5.4S, v10.s[1] +mul v5.4S, v5.4S,v13.s[1] +mla v5.4S, v19.4S, v31.s[0] +sub v19.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v2.4S, v10.s[2] +mul v2.4S, v2.4S,v13.s[2] +mla v2.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v2.4s +add v4.4s, v4.4s, v2.4s 
+trn1 v2.4S, v17.4S, v20.4S +trn2 v8.4S, v17.4S, v20.4S +trn1 v22.4S, v21.4S, v30.4S +trn2 v3.4S, v21.4S, v30.4S +trn2 v21.2D, v2.2D, v22.2D +trn2 v30.2D, v8.2D, v3.2D +trn1 v17.2D, v2.2D, v22.2D +trn1 v20.2D, v8.2D, v3.2D +ldr q3, [x17, #+1056] +ldr q8, [x17, #+1072] +trn1 v22.4S, v6.4S, v19.4S +trn2 v2.4S, v6.4S, v19.4S +trn1 v7.4S, v4.4S, v5.4S +trn2 v11.4S, v4.4S, v5.4S +trn2 v4.2D, v22.2D, v7.2D +trn2 v5.2D, v2.2D, v11.2D +trn1 v6.2D, v22.2D, v7.2D +trn1 v19.2D, v2.2D, v11.2D +ldr q11, [x17, #+2080] +ldr q2, [x17, #+2096] +sqrdmulh v7.4S, v21.4S, v8.4S +mul v21.4S, v21.4S,v3.4S +mla v21.4S, v7.4S, v31.s[0] +sub v7.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v30.4S, v8.4S +mul v30.4S, v30.4S,v3.4S +mla v30.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v30.4s +add v20.4s, v20.4s, v30.4s +sqrdmulh v30.4S, v4.4S, v2.4S +mul v4.4S, v4.4S,v11.4S +mla v4.4S, v30.4S, v31.s[0] +sub v30.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +ldr q4, [x17, #+1088] +ldr q22, [x17, #+1104] +sqrdmulh v15.4S, v5.4S, v2.4S +mul v5.4S, v5.4S,v11.4S +mla v5.4S, v15.4S, v31.s[0] +sub v15.4s, v19.4s, v5.4s +add v19.4s, v19.4s, v5.4s +ldr q5, [x17, #+1120] +ldr q16, [x17, #+1136] +sqrdmulh v1.4S, v20.4S, v22.4S +mul v20.4S, v20.4S,v4.4S +mla v20.4S, v1.4S, v31.s[0] +sub v1.4s, v17.4s, v20.4s +add v17.4s, v17.4s, v20.4s +ldr q20, [x17, #+2112] +ldr q14, [x17, #+2128] +sqrdmulh v18.4S, v21.4S, v16.4S +mul v21.4S, v21.4S,v5.4S +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v7.4s, v21.4s +add v7.4s, v7.4s, v21.4s +ldr q21, [x17, #+2144] +ldr q12, [x17, #+2160] +sqrdmulh v27.4S, v19.4S, v14.4S +mul v19.4S, v19.4S,v20.4S +mla v19.4S, v27.4S, v31.s[0] +sub v27.4s, v6.4s, v19.4s +add v6.4s, v6.4s, v19.4s +sqrdmulh v19.4S, v15.4S, v12.4S +mul v15.4S, v15.4S,v21.4S +mla v15.4S, v19.4S, v31.s[0] +sub v19.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +str q17, [x0, #448] +str q1, [x0, #464] +str q7, [x0, #480] +str q18, [x0, #496] +str q6, [x0, #960] +str q27, [x0, #976] +str q30, [x0, #992] 
+str q19, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_4.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_4.s new file mode 100644 index 0000000..b9b1089 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_4.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 
11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // 
Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 
1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 
23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 
19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 
// Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 
1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 +.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // 
Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 
7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // 
Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_3_z2_4 +.global _ntt_u32_full_neon_asm_var_4_4_3_z2_4 +ntt_u32_full_neon_asm_var_4_4_3_z2_4: +_ntt_u32_full_neon_asm_var_4_4_3_z2_4: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +sqrdmulh v2.4S, v22.4S, v29.s[0] +ldr q1, [x0, #544] +mul v22.4S, v22.4S,v30.s[0] +ldr q0, [x0, #608] +sqrdmulh v15.4S, v21.4S, v29.s[0] +ldr q14, [x0, #672] +mul v21.4S, v21.4S,v30.s[0] +ldr q13, [x0, #736] +mla v22.4S, v2.4S, v31.s[0] 
+sqrdmulh v2.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q12, [x0, #32] +sub v11.4s, v18.4s, v22.4s +mla v21.4S, v15.4S, v31.s[0] +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q15, [x0, #96] +sub v10.4s, v17.4s, v21.4s +mla v20.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v29.s[0] +ldr q2, [x0, #160] +mul v1.4S, v1.4S,v30.s[0] +sub v9.4s, v16.4s, v20.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v29.s[0] +ldr q22, [x0, #224] +mul v0.4S, v0.4S,v30.s[0] +sub v8.4s, v3.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v12.4s, v1.4s +mla v0.4S, v20.4S, v31.s[0] +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +sub v20.4s, v15.4s, v0.4s +mla v14.4S, v19.4S, v31.s[0] +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v19.4s, v2.4s, v14.4s +mla v13.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v1.4s, v22.4s, v13.4s +mla v16.4S, v0.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v0.4s, v2.4s, v16.4s +mla v3.4S, v14.4S, v31.s[0] +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v22.4s, v3.4s +mla v18.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v13.4s, v12.4s, v18.4s +mla v17.4S, v16.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v16.4s, v15.4s, v17.4s +mla v9.4S, v3.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v3.4s, v19.4s, v9.4s +mla v8.4S, v18.4S, v31.s[0] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, 
v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v8.4s +mla v11.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v17.4s, v21.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +sub v9.4s, v20.4s, v10.4s +mla v2.4S, v8.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v8.4s, v12.4s, v2.4s +mla v22.4S, v11.4S, v31.s[0] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v27.s[1] +mul v14.4S, v14.4S,v28.s[1] +sub v11.4s, v15.4s, v22.4s +mla v0.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v28.s[2] +sub v10.4s, v13.4s, v0.4s +mla v14.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v2.4s, v16.4s, v14.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +sub v22.4s, v21.4s, v19.4s +mla v1.4S, v0.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v0.4s, v20.4s, v1.4s +mla v3.4S, v14.4S, v31.s[0] +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +sub v14.4s, v17.4s, v3.4s +mla v18.4S, v19.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v25.s[1] +mul v11.4S, v11.4S,v26.s[1] +sub v19.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v1.4s, v12.4s, v15.4s +mla v11.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v25.s[3] +mul v2.4S, v2.4S,v26.s[3] +sub v3.4s, v8.4s, v11.4s +mla v16.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v11.4s +str q12, [x0, #32] +sqrdmulh v12.4S, v20.4S, v23.s[0] +str q1, [x0, #96] +mul v20.4S, v20.4S,v24.s[0] +ldr q1, [x0, #816] 
+sub v11.4s, v13.4s, v16.4s +ldr q18, [x0, #880] +mla v2.4S, v15.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +str q8, [x0, #160] +sqrdmulh v8.4S, v0.4S, v23.s[1] +str q3, [x0, #224] +mul v0.4S, v0.4S,v24.s[1] +ldr q3, [x0, #944] +sub v16.4s, v10.4s, v2.4s +ldr q15, [x0, #1008] +mla v20.4S, v12.4S, v31.s[0] +add v10.4s, v10.4s, v2.4s +str q13, [x0, #288] +sqrdmulh v13.4S, v9.4S, v23.s[2] +str q11, [x0, #352] +mul v9.4S, v9.4S,v24.s[2] +ldr q11, [x0, #304] +sub v2.4s, v21.4s, v20.4s +ldr q12, [x0, #368] +mla v0.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v20.4s +str q10, [x0, #416] +sqrdmulh v10.4S, v19.4S, v23.s[3] +str q16, [x0, #480] +mul v19.4S, v19.4S,v24.s[3] +ldr q16, [x0, #432] +sub v20.4s, v22.4s, v0.4s +ldr q8, [x0, #496] +mla v9.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +str q21, [x0, #544] +sqrdmulh v21.4S, v1.4S, v29.s[0] +str q2, [x0, #608] +ldr q2, [x0, #560] +mul v1.4S, v1.4S,v30.s[0] +ldr q0, [x0, #624] +sub v13.4s, v17.4s, v9.4s +mla v19.4S, v10.4S, v31.s[0] +add v17.4s, v17.4s, v9.4s +str q22, [x0, #672] +sqrdmulh v22.4S, v18.4S, v29.s[0] +str q20, [x0, #736] +ldr q20, [x0, #688] +mul v18.4S, v18.4S,v30.s[0] +ldr q9, [x0, #752] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +str q17, [x0, #800] +sqrdmulh v17.4S, v3.4S, v29.s[0] +str q13, [x0, #864] +mul v3.4S, v3.4S,v30.s[0] +ldr q13, [x0, #48] +sub v19.4s, v11.4s, v1.4s +mla v18.4S, v22.4S, v31.s[0] +add v11.4s, v11.4s, v1.4s +str q14, [x0, #928] +sqrdmulh v14.4S, v15.4S, v29.s[0] +str q10, [x0, #992] +mul v15.4S, v15.4S,v30.s[0] +ldr q10, [x0, #112] +sub v1.4s, v12.4s, v18.4s +mla v3.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v29.s[0] +ldr q17, [x0, #176] +mul v2.4S, v2.4S,v30.s[0] +sub v22.4s, v16.4s, v3.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v29.s[0] +ldr q14, [x0, #240] +mul v0.4S, v0.4S,v30.s[0] +sub v21.4s, v8.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v15.4s 
+sqrdmulh v15.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v18.4s, v13.4s, v2.4s +mla v0.4S, v3.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +sub v3.4s, v10.4s, v0.4s +mla v20.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v15.4s, v17.4s, v20.4s +mla v9.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v2.4s, v14.4s, v9.4s +mla v16.4S, v0.4S, v31.s[0] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v29.s[1] +mul v11.4S, v11.4S,v30.s[1] +sub v0.4s, v17.4s, v16.4s +mla v8.4S, v20.4S, v31.s[0] +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v29.s[1] +mul v12.4S, v12.4S,v30.s[1] +sub v20.4s, v14.4s, v8.4s +mla v11.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v9.4s, v13.4s, v11.4s +mla v12.4S, v16.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v16.4s, v10.4s, v12.4s +mla v22.4S, v8.4S, v31.s[0] +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v8.4s, v15.4s, v22.4s +mla v21.4S, v11.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +sub v11.4s, v2.4s, v21.4s +mla v19.4S, v12.4S, v31.s[0] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +sub v12.4s, v18.4s, v19.4s +mla v1.4S, v22.4S, v31.s[0] +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v22.4s, v3.4s, v1.4s +mla v17.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v21.4s, v13.4s, v17.4s +mla v14.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +sub v19.4s, v10.4s, v14.4s +mla 
v0.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v27.s[2] +mul v15.4S, v15.4S,v28.s[2] +sub v1.4s, v9.4s, v0.4s +mla v20.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v17.4s, v16.4s, v20.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v14.4s, v18.4s, v15.4s +mla v2.4S, v0.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[3] +mul v11.4S, v11.4S,v28.s[3] +sub v0.4s, v3.4s, v2.4s +mla v8.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +sub v20.4s, v12.4s, v8.4s +mla v11.4S, v15.4S, v31.s[0] +add v12.4s, v12.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v15.4s, v22.4s, v11.4s +mla v10.4S, v2.4S, v31.s[0] +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v2.4s, v13.4s, v10.4s +mla v19.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v25.s[3] +mul v17.4S, v17.4S,v26.s[3] +sub v8.4s, v21.4s, v19.4s +mla v16.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +str q13, [x0, #48] +sqrdmulh v13.4S, v3.4S, v23.s[0] +str q2, [x0, #112] +mul v3.4S, v3.4S,v24.s[0] +ldr q2, [x0, #768] +sub v19.4s, v9.4s, v16.4s +ldr q11, [x0, #832] +mla v17.4S, v10.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +str q21, [x0, #176] +sqrdmulh v21.4S, v0.4S, v23.s[1] +str q8, [x0, #240] +mul v0.4S, v0.4S,v24.s[1] +ldr q8, [x0, #896] +sub v16.4s, v1.4s, v17.4s +ldr q10, [x0, #960] +mla v3.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +str q9, [x0, #304] +sqrdmulh v9.4S, v22.4S, v23.s[2] +str q19, [x0, #368] +mul v22.4S, v22.4S,v24.s[2] +ldr q19, [x0, #256] +sub v17.4s, v18.4s, v3.4s +ldr q13, [x0, #320] +mla v0.4S, v21.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +str q1, [x0, #432] +sqrdmulh v1.4S, v15.4S, v23.s[3] +str q16, [x0, #496] +mul v15.4S, 
v15.4S,v24.s[3] +ldr q16, [x0, #384] +sub v3.4s, v14.4s, v0.4s +ldr q21, [x0, #448] +mla v22.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +str q18, [x0, #560] +sqrdmulh v18.4S, v2.4S, v29.s[0] +str q17, [x0, #624] +ldr q17, [x0, #512] +mul v2.4S, v2.4S,v30.s[0] +ldr q0, [x0, #576] +sub v9.4s, v12.4s, v22.4s +mla v15.4S, v1.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s +str q14, [x0, #688] +sqrdmulh v14.4S, v11.4S, v29.s[0] +str q3, [x0, #752] +ldr q3, [x0, #640] +mul v11.4S, v11.4S,v30.s[0] +ldr q22, [x0, #704] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +str q12, [x0, #816] +sqrdmulh v12.4S, v8.4S, v29.s[0] +str q9, [x0, #880] +mul v8.4S, v8.4S,v30.s[0] +ldr q9, [x0, #0] +sub v15.4s, v19.4s, v2.4s +mla v11.4S, v14.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +str q20, [x0, #944] +sqrdmulh v20.4S, v10.4S, v29.s[0] +str q1, [x0, #1008] +mul v10.4S, v10.4S,v30.s[0] +ldr q1, [x0, #64] +sub v2.4s, v13.4s, v11.4s +mla v8.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v29.s[0] +ldr q12, [x0, #128] +mul v17.4S, v17.4S,v30.s[0] +sub v14.4s, v16.4s, v8.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v29.s[0] +ldr q20, [x0, #192] +mul v0.4S, v0.4S,v30.s[0] +sub v18.4s, v21.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +sub v11.4s, v9.4s, v17.4s +mla v0.4S, v8.4S, v31.s[0] +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v8.4s, v1.4s, v0.4s +mla v3.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v10.4s, v12.4s, v3.4s +mla v22.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v17.4s, v20.4s, v22.4s +mla v16.4S, v0.4S, v31.s[0] +add v20.4s, v20.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +sub 
v0.4s, v12.4s, v16.4s +mla v21.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v3.4s, v20.4s, v21.4s +mla v19.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v22.4s, v9.4s, v19.4s +mla v13.4S, v16.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v16.4s, v1.4s, v13.4s +mla v14.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v21.4s, v10.4s, v14.4s +mla v18.4S, v19.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v19.4s, v17.4s, v18.4s +mla v15.4S, v13.4S, v31.s[0] +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v13.4s, v11.4s, v15.4s +mla v2.4S, v14.4S, v31.s[0] +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v27.s[0] +mul v20.4S, v20.4S,v28.s[0] +sub v14.4s, v8.4s, v2.4s +mla v12.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v18.4s, v9.4s, v12.4s +mla v20.4S, v15.4S, v31.s[0] +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +sub v15.4s, v1.4s, v20.4s +mla v0.4S, v2.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +sub v2.4s, v22.4s, v0.4s +mla v3.4S, v12.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v12.4s, v16.4s, v3.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v11.4s, v10.4s +mla v17.4S, v0.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v27.s[3] +mul v19.4S, v19.4S,v28.s[3] +sub v0.4s, v8.4s, v17.4s +mla v21.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v17.4s +sqrdmulh 
v17.4S, v1.4S, v25.s[0] +mul v1.4S, v1.4S,v26.s[0] +sub v3.4s, v13.4s, v21.4s +mla v19.4S, v10.4S, v31.s[0] +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v25.s[1] +mul v15.4S, v15.4S,v26.s[1] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v17.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v17.4s, v9.4s, v1.4s +mla v15.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +sub v21.4s, v18.4s, v15.4s +mla v16.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +str q9, [x0, #0] +sqrdmulh v9.4S, v8.4S, v23.s[0] +str q17, [x0, #64] +mul v8.4S, v8.4S,v24.s[0] +ldr q17, [x0, #784] +sub v15.4s, v22.4s, v16.4s +ldr q19, [x0, #848] +mla v12.4S, v1.4S, v31.s[0] +add v22.4s, v22.4s, v16.4s +str q18, [x0, #128] +sqrdmulh v18.4S, v0.4S, v23.s[1] +str q21, [x0, #192] +mul v0.4S, v0.4S,v24.s[1] +ldr q21, [x0, #912] +sub v16.4s, v2.4s, v12.4s +ldr q1, [x0, #976] +mla v8.4S, v9.4S, v31.s[0] +add v2.4s, v2.4s, v12.4s +str q22, [x0, #256] +sqrdmulh v22.4S, v14.4S, v23.s[2] +str q15, [x0, #320] +mul v14.4S, v14.4S,v24.s[2] +ldr q15, [x0, #272] +sub v12.4s, v11.4s, v8.4s +ldr q9, [x0, #336] +mla v0.4S, v18.4S, v31.s[0] +add v11.4s, v11.4s, v8.4s +str q2, [x0, #384] +sqrdmulh v2.4S, v10.4S, v23.s[3] +str q16, [x0, #448] +mul v10.4S, v10.4S,v24.s[3] +ldr q16, [x0, #400] +sub v8.4s, v20.4s, v0.4s +ldr q18, [x0, #464] +mla v14.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v0.4s +str q11, [x0, #512] +sqrdmulh v11.4S, v17.4S, v29.s[0] +str q12, [x0, #576] +ldr q12, [x0, #528] +mul v17.4S, v17.4S,v30.s[0] +ldr q0, [x0, #592] +sub v22.4s, v13.4s, v14.4s +mla v10.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v14.4s +str q20, [x0, #640] +sqrdmulh v20.4S, v19.4S, v29.s[0] +str q8, [x0, #704] +ldr q8, [x0, #656] +mul v19.4S, v19.4S,v30.s[0] +ldr q14, [x0, #720] +sub v2.4s, v3.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v3.4s, v3.4s, v10.4s +str q13, [x0, #768] +sqrdmulh v13.4S, v21.4S, 
v29.s[0] +str q22, [x0, #832] +mul v21.4S, v21.4S,v30.s[0] +ldr q22, [x0, #16] +sub v10.4s, v15.4s, v17.4s +mla v19.4S, v20.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +str q3, [x0, #896] +sqrdmulh v3.4S, v1.4S, v29.s[0] +str q2, [x0, #960] +mul v1.4S, v1.4S,v30.s[0] +ldr q2, [x0, #80] +sub v17.4s, v9.4s, v19.4s +mla v21.4S, v13.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v12.4S, v29.s[0] +ldr q13, [x0, #144] +mul v12.4S, v12.4S,v30.s[0] +sub v20.4s, v16.4s, v21.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v29.s[0] +ldr q3, [x0, #208] +mul v0.4S, v0.4S,v30.s[0] +sub v11.4s, v18.4s, v1.4s +mla v12.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v19.4s, v22.4s, v12.4s +mla v0.4S, v21.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v2.4s, v0.4s +mla v8.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v1.4s, v13.4s, v8.4s +mla v14.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v12.4s, v3.4s, v14.4s +mla v16.4S, v0.4S, v31.s[0] +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +sub v0.4s, v13.4s, v16.4s +mla v18.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v9.4S, v29.s[1] +mul v9.4S, v9.4S,v30.s[1] +sub v8.4s, v3.4s, v18.4s +mla v15.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v14.4s, v22.4s, v15.4s +mla v9.4S, v16.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v16.4s, v2.4s, v9.4s +mla v20.4S, v18.4S, v31.s[0] +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v20.4s +mla v11.4S, v15.4S, v31.s[0] +add v1.4s, 
v1.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v15.4s, v12.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +sub v9.4s, v19.4s, v10.4s +mla v17.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v27.s[0] +mul v3.4S, v3.4S,v28.s[0] +sub v20.4s, v21.4s, v17.4s +mla v13.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v11.4s, v22.4s, v13.4s +mla v3.4S, v10.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v10.4s, v2.4s, v3.4s +mla v0.4S, v17.4S, v31.s[0] +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v17.4s, v14.4s, v0.4s +mla v8.4S, v13.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +sub v13.4s, v16.4s, v8.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v3.4s, v19.4s, v1.4s +mla v12.4S, v0.4S, v31.s[0] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v0.4s, v21.4s, v12.4s +mla v18.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v25.s[1] +mul v10.4S, v10.4S,v26.s[1] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v12.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v12.4s, v22.4s, v2.4s +mla v10.4S, v18.4S, v31.s[0] +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v13.4S, v25.s[3] +mul v13.4S, v13.4S,v26.s[3] +sub v18.4s, v11.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +str q22, [x0, #16] +sqrdmulh v22.4S, v21.4S, v23.s[0] +str q12, [x0, #80] +mul 
v21.4S, v21.4S,v24.s[0] +sub v12.4s, v14.4s, v16.4s +mla v13.4S, v2.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +str q11, [x0, #144] +sqrdmulh v11.4S, v0.4S, v23.s[1] +str q18, [x0, #208] +mul v0.4S, v0.4S,v24.s[1] +sub v18.4s, v17.4s, v13.4s +mla v21.4S, v22.4S, v31.s[0] +add v17.4s, v17.4s, v13.4s +str q14, [x0, #272] +sqrdmulh v14.4S, v20.4S, v23.s[2] +str q12, [x0, #336] +mul v20.4S, v20.4S,v24.s[2] +sub v12.4s, v19.4s, v21.4s +mla v0.4S, v11.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +str q17, [x0, #400] +sqrdmulh v17.4S, v1.4S, v23.s[3] +str q18, [x0, #464] +mul v1.4S, v1.4S,v24.s[3] +sub v18.4s, v3.4s, v0.4s +mla v20.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v0.4s +str q19, [x0, #528] +str q12, [x0, #592] +sub v12.4s, v9.4s, v20.4s +mla v1.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v20.4s +str q3, [x0, #656] +str q18, [x0, #720] +sub v18.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +str q9, [x0, #784] +str q12, [x0, #848] +str q8, [x0, #912] +str q18, [x0, #976] +ldr q4, [x0, #32] +ldr q5, [x0, #48] +ldr q6, [x17, #+128] +ldr q7, [x17, #+144] +ldr q15, [x0, #0] +ldr q10, [x0, #16] +ldr q2, [x17, #+1152] +ldr q16, [x17, #+1168] +sqrdmulh v22.4S, v4.4S, v7.s[0] +ldr q13, [x0, #544] +mul v4.4S, v4.4S,v6.s[0] +ldr q11, [x0, #560] +mla v4.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v4.4s +add v15.4s, v15.4s, v4.4s +sqrdmulh v4.4S, v5.4S, v7.s[0] +ldr q21, [x0, #512] +mul v5.4S, v5.4S,v6.s[0] +ldr q14, [x0, #528] +mla v5.4S, v4.4S, v31.s[0] +sub v4.4s, v10.4s, v5.4s +add v10.4s, v10.4s, v5.4s +sqrdmulh v5.4S, v13.4S, v16.s[0] +mul v13.4S, v13.4S,v2.s[0] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v11.4S, v16.s[0] +mul v11.4S, v11.4S,v2.s[0] +mla v11.4S, v13.4S, v31.s[0] +sub v13.4s, v14.4s, v11.4s +add v14.4s, v14.4s, v11.4s +sqrdmulh v11.4S, v10.4S, v7.s[1] +mul v10.4S, v10.4S,v6.s[1] +mla v10.4S, v11.4S, v31.s[0] +sub v11.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +sqrdmulh v10.4S, v4.4S, v7.s[2] +mul v4.4S, 
v4.4S,v6.s[2] +mla v4.4S, v10.4S, v31.s[0] +sub v10.4s, v22.4s, v4.4s +add v22.4s, v22.4s, v4.4s +sqrdmulh v4.4S, v14.4S, v16.s[1] +mul v14.4S, v14.4S,v2.s[1] +mla v14.4S, v4.4S, v31.s[0] +sub v4.4s, v21.4s, v14.4s +trn1 v0.4S, v15.4S, v11.4S +trn2 v19.4S, v15.4S, v11.4S +add v21.4s, v21.4s, v14.4s +trn1 v14.4S, v22.4S, v10.4S +trn2 v17.4S, v22.4S, v10.4S +sqrdmulh v20.4S, v13.4S, v16.s[2] +ldr q3, [x17, #+160] +mul v13.4S, v13.4S,v2.s[2] +ldr q1, [x17, #+176] +mla v13.4S, v20.4S, v31.s[0] +trn2 v22.2D, v0.2D, v14.2D +trn2 v10.2D, v19.2D, v17.2D +sub v20.4s, v5.4s, v13.4s +trn1 v15.2D, v0.2D, v14.2D +trn1 v11.2D, v19.2D, v17.2D +add v5.4s, v5.4s, v13.4s +sqrdmulh v13.4S, v22.4S, v1.4S +mul v22.4S, v22.4S,v3.4S +mla v22.4S, v13.4S, v31.s[0] +trn1 v13.4S, v21.4S, v4.4S +trn2 v17.4S, v21.4S, v4.4S +sub v19.4s, v15.4s, v22.4s +trn1 v14.4S, v5.4S, v20.4S +trn2 v0.4S, v5.4S, v20.4S +add v15.4s, v15.4s, v22.4s +trn2 v5.2D, v13.2D, v14.2D +trn2 v20.2D, v17.2D, v0.2D +sqrdmulh v22.4S, v10.4S, v1.4S +trn1 v21.2D, v13.2D, v14.2D +trn1 v4.2D, v17.2D, v0.2D +ldr q0, [x17, #+1184] +ldr q17, [x17, #+1200] +mul v10.4S, v10.4S,v3.4S +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v10.4s +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v5.4S, v17.4S +mul v5.4S, v5.4S,v0.4S +mla v5.4S, v10.4S, v31.s[0] +sub v10.4s, v21.4s, v5.4s +add v21.4s, v21.4s, v5.4s +ldr q5, [x17, #+192] +ldr q14, [x17, #+208] +sqrdmulh v13.4S, v20.4S, v17.4S +mul v20.4S, v20.4S,v0.4S +mla v20.4S, v13.4S, v31.s[0] +sub v13.4s, v4.4s, v20.4s +add v4.4s, v4.4s, v20.4s +ldr q20, [x17, #+224] +ldr q9, [x17, #+240] +sqrdmulh v12.4S, v11.4S, v14.4S +mul v11.4S, v11.4S,v5.4S +mla v11.4S, v12.4S, v31.s[0] +sub v12.4s, v15.4s, v11.4s +add v15.4s, v15.4s, v11.4s +ldr q11, [x17, #+1216] +ldr q8, [x17, #+1232] +sqrdmulh v18.4S, v22.4S, v9.4S +mul v22.4S, v22.4S,v20.4S +mla v22.4S, v18.4S, v31.s[0] +sub v18.4s, v19.4s, v22.4s +add v19.4s, v19.4s, v22.4s +ldr q22, [x17, #+1248] +ldr q30, [x17, #+1264] +sqrdmulh v29.4S, 
v4.4S, v8.4S +ldr q28, [x0, #96] +mul v4.4S, v4.4S,v11.4S +mla v4.4S, v29.4S, v31.s[0] +sub v29.4s, v21.4s, v4.4s +add v21.4s, v21.4s, v4.4s +sqrdmulh v4.4S, v13.4S, v30.4S +ldr q27, [x0, #112] +mul v13.4S, v13.4S,v22.4S +mla v13.4S, v4.4S, v31.s[0] +sub v4.4s, v10.4s, v13.4s +add v10.4s, v10.4s, v13.4s +str q15, [x0, #0] +str q12, [x0, #16] +str q19, [x0, #32] +str q18, [x0, #48] +str q21, [x0, #512] +str q29, [x0, #528] +str q10, [x0, #544] +str q4, [x0, #560] +ldr q30, [x17, #+256] +ldr q22, [x17, #+272] +ldr q8, [x0, #64] +ldr q11, [x0, #80] +ldr q17, [x17, #+1280] +ldr q0, [x17, #+1296] +sqrdmulh v16.4S, v28.4S, v22.s[0] +ldr q2, [x0, #608] +mul v28.4S, v28.4S,v30.s[0] +ldr q4, [x0, #624] +mla v28.4S, v16.4S, v31.s[0] +sub v16.4s, v8.4s, v28.4s +add v8.4s, v8.4s, v28.4s +sqrdmulh v28.4S, v27.4S, v22.s[0] +ldr q10, [x0, #576] +mul v27.4S, v27.4S,v30.s[0] +ldr q29, [x0, #592] +mla v27.4S, v28.4S, v31.s[0] +sub v28.4s, v11.4s, v27.4s +add v11.4s, v11.4s, v27.4s +sqrdmulh v27.4S, v2.4S, v0.s[0] +mul v2.4S, v2.4S,v17.s[0] +mla v2.4S, v27.4S, v31.s[0] +sub v27.4s, v10.4s, v2.4s +add v10.4s, v10.4s, v2.4s +sqrdmulh v2.4S, v4.4S, v0.s[0] +mul v4.4S, v4.4S,v17.s[0] +mla v4.4S, v2.4S, v31.s[0] +sub v2.4s, v29.4s, v4.4s +add v29.4s, v29.4s, v4.4s +sqrdmulh v4.4S, v11.4S, v22.s[1] +mul v11.4S, v11.4S,v30.s[1] +mla v11.4S, v4.4S, v31.s[0] +sub v4.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +sqrdmulh v11.4S, v28.4S, v22.s[2] +mul v28.4S, v28.4S,v30.s[2] +mla v28.4S, v11.4S, v31.s[0] +sub v11.4s, v16.4s, v28.4s +add v16.4s, v16.4s, v28.4s +sqrdmulh v28.4S, v29.4S, v0.s[1] +mul v29.4S, v29.4S,v17.s[1] +mla v29.4S, v28.4S, v31.s[0] +sub v28.4s, v10.4s, v29.4s +trn1 v21.4S, v8.4S, v4.4S +trn2 v9.4S, v8.4S, v4.4S +add v10.4s, v10.4s, v29.4s +trn1 v29.4S, v16.4S, v11.4S +trn2 v20.4S, v16.4S, v11.4S +sqrdmulh v14.4S, v2.4S, v0.s[2] +ldr q5, [x17, #+288] +mul v2.4S, v2.4S,v17.s[2] +ldr q1, [x17, #+304] +mla v2.4S, v14.4S, v31.s[0] +trn2 v16.2D, v21.2D, v29.2D +trn2 v11.2D, v9.2D, 
v20.2D +sub v14.4s, v27.4s, v2.4s +trn1 v8.2D, v21.2D, v29.2D +trn1 v4.2D, v9.2D, v20.2D +add v27.4s, v27.4s, v2.4s +sqrdmulh v2.4S, v16.4S, v1.4S +mul v16.4S, v16.4S,v5.4S +mla v16.4S, v2.4S, v31.s[0] +trn1 v2.4S, v10.4S, v28.4S +trn2 v20.4S, v10.4S, v28.4S +sub v9.4s, v8.4s, v16.4s +trn1 v29.4S, v27.4S, v14.4S +trn2 v21.4S, v27.4S, v14.4S +add v8.4s, v8.4s, v16.4s +trn2 v27.2D, v2.2D, v29.2D +trn2 v14.2D, v20.2D, v21.2D +sqrdmulh v16.4S, v11.4S, v1.4S +trn1 v10.2D, v2.2D, v29.2D +trn1 v28.2D, v20.2D, v21.2D +ldr q21, [x17, #+1312] +ldr q20, [x17, #+1328] +mul v11.4S, v11.4S,v5.4S +mla v11.4S, v16.4S, v31.s[0] +sub v16.4s, v4.4s, v11.4s +add v4.4s, v4.4s, v11.4s +sqrdmulh v11.4S, v27.4S, v20.4S +mul v27.4S, v27.4S,v21.4S +mla v27.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v27.4s +add v10.4s, v10.4s, v27.4s +ldr q27, [x17, #+320] +ldr q29, [x17, #+336] +sqrdmulh v2.4S, v14.4S, v20.4S +mul v14.4S, v14.4S,v21.4S +mla v14.4S, v2.4S, v31.s[0] +sub v2.4s, v28.4s, v14.4s +add v28.4s, v28.4s, v14.4s +ldr q14, [x17, #+352] +ldr q3, [x17, #+368] +sqrdmulh v7.4S, v4.4S, v29.4S +mul v4.4S, v4.4S,v27.4S +mla v4.4S, v7.4S, v31.s[0] +sub v7.4s, v8.4s, v4.4s +add v8.4s, v8.4s, v4.4s +ldr q4, [x17, #+1344] +ldr q6, [x17, #+1360] +sqrdmulh v18.4S, v16.4S, v3.4S +mul v16.4S, v16.4S,v14.4S +mla v16.4S, v18.4S, v31.s[0] +sub v18.4s, v9.4s, v16.4s +add v9.4s, v9.4s, v16.4s +ldr q16, [x17, #+1376] +ldr q19, [x17, #+1392] +sqrdmulh v12.4S, v28.4S, v6.4S +ldr q15, [x0, #160] +mul v28.4S, v28.4S,v4.4S +mla v28.4S, v12.4S, v31.s[0] +sub v12.4s, v10.4s, v28.4s +add v10.4s, v10.4s, v28.4s +sqrdmulh v28.4S, v2.4S, v19.4S +ldr q13, [x0, #176] +mul v2.4S, v2.4S,v16.4S +mla v2.4S, v28.4S, v31.s[0] +sub v28.4s, v11.4s, v2.4s +add v11.4s, v11.4s, v2.4s +str q8, [x0, #64] +str q7, [x0, #80] +str q9, [x0, #96] +str q18, [x0, #112] +str q10, [x0, #576] +str q12, [x0, #592] +str q11, [x0, #608] +str q28, [x0, #624] +ldr q19, [x17, #+384] +ldr q16, [x17, #+400] +ldr q6, [x0, #128] +ldr q4, [x0, #144] 
+ldr q20, [x17, #+1408] +ldr q21, [x17, #+1424] +sqrdmulh v0.4S, v15.4S, v16.s[0] +ldr q17, [x0, #672] +mul v15.4S, v15.4S,v19.s[0] +ldr q28, [x0, #688] +mla v15.4S, v0.4S, v31.s[0] +sub v0.4s, v6.4s, v15.4s +add v6.4s, v6.4s, v15.4s +sqrdmulh v15.4S, v13.4S, v16.s[0] +ldr q11, [x0, #640] +mul v13.4S, v13.4S,v19.s[0] +ldr q12, [x0, #656] +mla v13.4S, v15.4S, v31.s[0] +sub v15.4s, v4.4s, v13.4s +add v4.4s, v4.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v21.s[0] +mul v17.4S, v17.4S,v20.s[0] +mla v17.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v28.4S, v21.s[0] +mul v28.4S, v28.4S,v20.s[0] +mla v28.4S, v17.4S, v31.s[0] +sub v17.4s, v12.4s, v28.4s +add v12.4s, v12.4s, v28.4s +sqrdmulh v28.4S, v4.4S, v16.s[1] +mul v4.4S, v4.4S,v19.s[1] +mla v4.4S, v28.4S, v31.s[0] +sub v28.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +sqrdmulh v4.4S, v15.4S, v16.s[2] +mul v15.4S, v15.4S,v19.s[2] +mla v15.4S, v4.4S, v31.s[0] +sub v4.4s, v0.4s, v15.4s +add v0.4s, v0.4s, v15.4s +sqrdmulh v15.4S, v12.4S, v21.s[1] +mul v12.4S, v12.4S,v20.s[1] +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v11.4s, v12.4s +trn1 v10.4S, v6.4S, v28.4S +trn2 v3.4S, v6.4S, v28.4S +add v11.4s, v11.4s, v12.4s +trn1 v12.4S, v0.4S, v4.4S +trn2 v14.4S, v0.4S, v4.4S +sqrdmulh v29.4S, v17.4S, v21.s[2] +ldr q27, [x17, #+416] +mul v17.4S, v17.4S,v20.s[2] +ldr q1, [x17, #+432] +mla v17.4S, v29.4S, v31.s[0] +trn2 v0.2D, v10.2D, v12.2D +trn2 v4.2D, v3.2D, v14.2D +sub v29.4s, v13.4s, v17.4s +trn1 v6.2D, v10.2D, v12.2D +trn1 v28.2D, v3.2D, v14.2D +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v1.4S +mul v0.4S, v0.4S,v27.4S +mla v0.4S, v17.4S, v31.s[0] +trn1 v17.4S, v11.4S, v15.4S +trn2 v14.4S, v11.4S, v15.4S +sub v3.4s, v6.4s, v0.4s +trn1 v12.4S, v13.4S, v29.4S +trn2 v10.4S, v13.4S, v29.4S +add v6.4s, v6.4s, v0.4s +trn2 v13.2D, v17.2D, v12.2D +trn2 v29.2D, v14.2D, v10.2D +sqrdmulh v0.4S, v4.4S, v1.4S +trn1 v11.2D, v17.2D, v12.2D +trn1 v15.2D, v14.2D, v10.2D +ldr q10, [x17, #+1440] +ldr 
q14, [x17, #+1456] +mul v4.4S, v4.4S,v27.4S +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v28.4s, v4.4s +add v28.4s, v28.4s, v4.4s +sqrdmulh v4.4S, v13.4S, v14.4S +mul v13.4S, v13.4S,v10.4S +mla v13.4S, v4.4S, v31.s[0] +sub v4.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +ldr q13, [x17, #+448] +ldr q12, [x17, #+464] +sqrdmulh v17.4S, v29.4S, v14.4S +mul v29.4S, v29.4S,v10.4S +mla v29.4S, v17.4S, v31.s[0] +sub v17.4s, v15.4s, v29.4s +add v15.4s, v15.4s, v29.4s +ldr q29, [x17, #+480] +ldr q5, [x17, #+496] +sqrdmulh v22.4S, v28.4S, v12.4S +mul v28.4S, v28.4S,v13.4S +mla v28.4S, v22.4S, v31.s[0] +sub v22.4s, v6.4s, v28.4s +add v6.4s, v6.4s, v28.4s +ldr q28, [x17, #+1472] +ldr q30, [x17, #+1488] +sqrdmulh v18.4S, v0.4S, v5.4S +mul v0.4S, v0.4S,v29.4S +mla v0.4S, v18.4S, v31.s[0] +sub v18.4s, v3.4s, v0.4s +add v3.4s, v3.4s, v0.4s +ldr q0, [x17, #+1504] +ldr q9, [x17, #+1520] +sqrdmulh v7.4S, v15.4S, v30.4S +ldr q8, [x0, #224] +mul v15.4S, v15.4S,v28.4S +mla v15.4S, v7.4S, v31.s[0] +sub v7.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v17.4S, v9.4S +ldr q2, [x0, #240] +mul v17.4S, v17.4S,v0.4S +mla v17.4S, v15.4S, v31.s[0] +sub v15.4s, v4.4s, v17.4s +add v4.4s, v4.4s, v17.4s +str q6, [x0, #128] +str q22, [x0, #144] +str q3, [x0, #160] +str q18, [x0, #176] +str q11, [x0, #640] +str q7, [x0, #656] +str q4, [x0, #672] +str q15, [x0, #688] +ldr q9, [x17, #+512] +ldr q0, [x17, #+528] +ldr q30, [x0, #192] +ldr q28, [x0, #208] +ldr q14, [x17, #+1536] +ldr q10, [x17, #+1552] +sqrdmulh v21.4S, v8.4S, v0.s[0] +ldr q20, [x0, #736] +mul v8.4S, v8.4S,v9.s[0] +ldr q15, [x0, #752] +mla v8.4S, v21.4S, v31.s[0] +sub v21.4s, v30.4s, v8.4s +add v30.4s, v30.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v0.s[0] +ldr q4, [x0, #704] +mul v2.4S, v2.4S,v9.s[0] +ldr q7, [x0, #720] +mla v2.4S, v8.4S, v31.s[0] +sub v8.4s, v28.4s, v2.4s +add v28.4s, v28.4s, v2.4s +sqrdmulh v2.4S, v20.4S, v10.s[0] +mul v20.4S, v20.4S,v14.s[0] +mla v20.4S, v2.4S, v31.s[0] +sub v2.4s, v4.4s, v20.4s +add v4.4s, v4.4s, 
v20.4s +sqrdmulh v20.4S, v15.4S, v10.s[0] +mul v15.4S, v15.4S,v14.s[0] +mla v15.4S, v20.4S, v31.s[0] +sub v20.4s, v7.4s, v15.4s +add v7.4s, v7.4s, v15.4s +sqrdmulh v15.4S, v28.4S, v0.s[1] +mul v28.4S, v28.4S,v9.s[1] +mla v28.4S, v15.4S, v31.s[0] +sub v15.4s, v30.4s, v28.4s +add v30.4s, v30.4s, v28.4s +sqrdmulh v28.4S, v8.4S, v0.s[2] +mul v8.4S, v8.4S,v9.s[2] +mla v8.4S, v28.4S, v31.s[0] +sub v28.4s, v21.4s, v8.4s +add v21.4s, v21.4s, v8.4s +sqrdmulh v8.4S, v7.4S, v10.s[1] +mul v7.4S, v7.4S,v14.s[1] +mla v7.4S, v8.4S, v31.s[0] +sub v8.4s, v4.4s, v7.4s +trn1 v11.4S, v30.4S, v15.4S +trn2 v5.4S, v30.4S, v15.4S +add v4.4s, v4.4s, v7.4s +trn1 v7.4S, v21.4S, v28.4S +trn2 v29.4S, v21.4S, v28.4S +sqrdmulh v12.4S, v20.4S, v10.s[2] +ldr q13, [x17, #+544] +mul v20.4S, v20.4S,v14.s[2] +ldr q1, [x17, #+560] +mla v20.4S, v12.4S, v31.s[0] +trn2 v21.2D, v11.2D, v7.2D +trn2 v28.2D, v5.2D, v29.2D +sub v12.4s, v2.4s, v20.4s +trn1 v30.2D, v11.2D, v7.2D +trn1 v15.2D, v5.2D, v29.2D +add v2.4s, v2.4s, v20.4s +sqrdmulh v20.4S, v21.4S, v1.4S +mul v21.4S, v21.4S,v13.4S +mla v21.4S, v20.4S, v31.s[0] +trn1 v20.4S, v4.4S, v8.4S +trn2 v29.4S, v4.4S, v8.4S +sub v5.4s, v30.4s, v21.4s +trn1 v7.4S, v2.4S, v12.4S +trn2 v11.4S, v2.4S, v12.4S +add v30.4s, v30.4s, v21.4s +trn2 v2.2D, v20.2D, v7.2D +trn2 v12.2D, v29.2D, v11.2D +sqrdmulh v21.4S, v28.4S, v1.4S +trn1 v4.2D, v20.2D, v7.2D +trn1 v8.2D, v29.2D, v11.2D +ldr q11, [x17, #+1568] +ldr q29, [x17, #+1584] +mul v28.4S, v28.4S,v13.4S +mla v28.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v28.4s +add v15.4s, v15.4s, v28.4s +sqrdmulh v28.4S, v2.4S, v29.4S +mul v2.4S, v2.4S,v11.4S +mla v2.4S, v28.4S, v31.s[0] +sub v28.4s, v4.4s, v2.4s +add v4.4s, v4.4s, v2.4s +ldr q2, [x17, #+576] +ldr q7, [x17, #+592] +sqrdmulh v20.4S, v12.4S, v29.4S +mul v12.4S, v12.4S,v11.4S +mla v12.4S, v20.4S, v31.s[0] +sub v20.4s, v8.4s, v12.4s +add v8.4s, v8.4s, v12.4s +ldr q12, [x17, #+608] +ldr q27, [x17, #+624] +sqrdmulh v16.4S, v15.4S, v7.4S +mul v15.4S, v15.4S,v2.4S +mla v15.4S, 
v16.4S, v31.s[0] +sub v16.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +ldr q15, [x17, #+1600] +ldr q19, [x17, #+1616] +sqrdmulh v18.4S, v21.4S, v27.4S +mul v21.4S, v21.4S,v12.4S +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v5.4s, v21.4s +add v5.4s, v5.4s, v21.4s +ldr q21, [x17, #+1632] +ldr q3, [x17, #+1648] +sqrdmulh v22.4S, v8.4S, v19.4S +ldr q6, [x0, #288] +mul v8.4S, v8.4S,v15.4S +mla v8.4S, v22.4S, v31.s[0] +sub v22.4s, v4.4s, v8.4s +add v4.4s, v4.4s, v8.4s +sqrdmulh v8.4S, v20.4S, v3.4S +ldr q17, [x0, #304] +mul v20.4S, v20.4S,v21.4S +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v28.4s, v20.4s +add v28.4s, v28.4s, v20.4s +str q30, [x0, #192] +str q16, [x0, #208] +str q5, [x0, #224] +str q18, [x0, #240] +str q4, [x0, #704] +str q22, [x0, #720] +str q28, [x0, #736] +str q8, [x0, #752] +ldr q3, [x17, #+640] +ldr q21, [x17, #+656] +ldr q19, [x0, #256] +ldr q15, [x0, #272] +ldr q29, [x17, #+1664] +ldr q11, [x17, #+1680] +sqrdmulh v10.4S, v6.4S, v21.s[0] +ldr q14, [x0, #800] +mul v6.4S, v6.4S,v3.s[0] +ldr q8, [x0, #816] +mla v6.4S, v10.4S, v31.s[0] +sub v10.4s, v19.4s, v6.4s +add v19.4s, v19.4s, v6.4s +sqrdmulh v6.4S, v17.4S, v21.s[0] +ldr q28, [x0, #768] +mul v17.4S, v17.4S,v3.s[0] +ldr q22, [x0, #784] +mla v17.4S, v6.4S, v31.s[0] +sub v6.4s, v15.4s, v17.4s +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v11.s[0] +mul v14.4S, v14.4S,v29.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v28.4s, v14.4s +add v28.4s, v28.4s, v14.4s +sqrdmulh v14.4S, v8.4S, v11.s[0] +mul v8.4S, v8.4S,v29.s[0] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v8.4s +add v22.4s, v22.4s, v8.4s +sqrdmulh v8.4S, v15.4S, v21.s[1] +mul v15.4S, v15.4S,v3.s[1] +mla v15.4S, v8.4S, v31.s[0] +sub v8.4s, v19.4s, v15.4s +add v19.4s, v19.4s, v15.4s +sqrdmulh v15.4S, v6.4S, v21.s[2] +mul v6.4S, v6.4S,v3.s[2] +mla v6.4S, v15.4S, v31.s[0] +sub v15.4s, v10.4s, v6.4s +add v10.4s, v10.4s, v6.4s +sqrdmulh v6.4S, v22.4S, v11.s[1] +mul v22.4S, v22.4S,v29.s[1] +mla v22.4S, v6.4S, v31.s[0] +sub v6.4s, v28.4s, 
v22.4s +trn1 v4.4S, v19.4S, v8.4S +trn2 v27.4S, v19.4S, v8.4S +add v28.4s, v28.4s, v22.4s +trn1 v22.4S, v10.4S, v15.4S +trn2 v12.4S, v10.4S, v15.4S +sqrdmulh v7.4S, v14.4S, v11.s[2] +ldr q2, [x17, #+672] +mul v14.4S, v14.4S,v29.s[2] +ldr q1, [x17, #+688] +mla v14.4S, v7.4S, v31.s[0] +trn2 v10.2D, v4.2D, v22.2D +trn2 v15.2D, v27.2D, v12.2D +sub v7.4s, v17.4s, v14.4s +trn1 v19.2D, v4.2D, v22.2D +trn1 v8.2D, v27.2D, v12.2D +add v17.4s, v17.4s, v14.4s +sqrdmulh v14.4S, v10.4S, v1.4S +mul v10.4S, v10.4S,v2.4S +mla v10.4S, v14.4S, v31.s[0] +trn1 v14.4S, v28.4S, v6.4S +trn2 v12.4S, v28.4S, v6.4S +sub v27.4s, v19.4s, v10.4s +trn1 v22.4S, v17.4S, v7.4S +trn2 v4.4S, v17.4S, v7.4S +add v19.4s, v19.4s, v10.4s +trn2 v17.2D, v14.2D, v22.2D +trn2 v7.2D, v12.2D, v4.2D +sqrdmulh v10.4S, v15.4S, v1.4S +trn1 v28.2D, v14.2D, v22.2D +trn1 v6.2D, v12.2D, v4.2D +ldr q4, [x17, #+1696] +ldr q12, [x17, #+1712] +mul v15.4S, v15.4S,v2.4S +mla v15.4S, v10.4S, v31.s[0] +sub v10.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +sqrdmulh v15.4S, v17.4S, v12.4S +mul v17.4S, v17.4S,v4.4S +mla v17.4S, v15.4S, v31.s[0] +sub v15.4s, v28.4s, v17.4s +add v28.4s, v28.4s, v17.4s +ldr q17, [x17, #+704] +ldr q22, [x17, #+720] +sqrdmulh v14.4S, v7.4S, v12.4S +mul v7.4S, v7.4S,v4.4S +mla v7.4S, v14.4S, v31.s[0] +sub v14.4s, v6.4s, v7.4s +add v6.4s, v6.4s, v7.4s +ldr q7, [x17, #+736] +ldr q13, [x17, #+752] +sqrdmulh v0.4S, v8.4S, v22.4S +mul v8.4S, v8.4S,v17.4S +mla v8.4S, v0.4S, v31.s[0] +sub v0.4s, v19.4s, v8.4s +add v19.4s, v19.4s, v8.4s +ldr q8, [x17, #+1728] +ldr q9, [x17, #+1744] +sqrdmulh v18.4S, v10.4S, v13.4S +mul v10.4S, v10.4S,v7.4S +mla v10.4S, v18.4S, v31.s[0] +sub v18.4s, v27.4s, v10.4s +add v27.4s, v27.4s, v10.4s +ldr q10, [x17, #+1760] +ldr q5, [x17, #+1776] +sqrdmulh v16.4S, v6.4S, v9.4S +ldr q30, [x0, #352] +mul v6.4S, v6.4S,v8.4S +mla v6.4S, v16.4S, v31.s[0] +sub v16.4s, v28.4s, v6.4s +add v28.4s, v28.4s, v6.4s +sqrdmulh v6.4S, v14.4S, v5.4S +ldr q20, [x0, #368] +mul v14.4S, v14.4S,v10.4S +mla 
v14.4S, v6.4S, v31.s[0] +sub v6.4s, v15.4s, v14.4s +add v15.4s, v15.4s, v14.4s +str q19, [x0, #256] +str q0, [x0, #272] +str q27, [x0, #288] +str q18, [x0, #304] +str q28, [x0, #768] +str q16, [x0, #784] +str q15, [x0, #800] +str q6, [x0, #816] +ldr q5, [x17, #+768] +ldr q10, [x17, #+784] +ldr q9, [x0, #320] +ldr q8, [x0, #336] +ldr q12, [x17, #+1792] +ldr q4, [x17, #+1808] +sqrdmulh v11.4S, v30.4S, v10.s[0] +ldr q29, [x0, #864] +mul v30.4S, v30.4S,v5.s[0] +ldr q6, [x0, #880] +mla v30.4S, v11.4S, v31.s[0] +sub v11.4s, v9.4s, v30.4s +add v9.4s, v9.4s, v30.4s +sqrdmulh v30.4S, v20.4S, v10.s[0] +ldr q15, [x0, #832] +mul v20.4S, v20.4S,v5.s[0] +ldr q16, [x0, #848] +mla v20.4S, v30.4S, v31.s[0] +sub v30.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +sqrdmulh v20.4S, v29.4S, v4.s[0] +mul v29.4S, v29.4S,v12.s[0] +mla v29.4S, v20.4S, v31.s[0] +sub v20.4s, v15.4s, v29.4s +add v15.4s, v15.4s, v29.4s +sqrdmulh v29.4S, v6.4S, v4.s[0] +mul v6.4S, v6.4S,v12.s[0] +mla v6.4S, v29.4S, v31.s[0] +sub v29.4s, v16.4s, v6.4s +add v16.4s, v16.4s, v6.4s +sqrdmulh v6.4S, v8.4S, v10.s[1] +mul v8.4S, v8.4S,v5.s[1] +mla v8.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v8.4s +add v9.4s, v9.4s, v8.4s +sqrdmulh v8.4S, v30.4S, v10.s[2] +mul v30.4S, v30.4S,v5.s[2] +mla v30.4S, v8.4S, v31.s[0] +sub v8.4s, v11.4s, v30.4s +add v11.4s, v11.4s, v30.4s +sqrdmulh v30.4S, v16.4S, v4.s[1] +mul v16.4S, v16.4S,v12.s[1] +mla v16.4S, v30.4S, v31.s[0] +sub v30.4s, v15.4s, v16.4s +trn1 v28.4S, v9.4S, v6.4S +trn2 v13.4S, v9.4S, v6.4S +add v15.4s, v15.4s, v16.4s +trn1 v16.4S, v11.4S, v8.4S +trn2 v7.4S, v11.4S, v8.4S +sqrdmulh v22.4S, v29.4S, v4.s[2] +ldr q17, [x17, #+800] +mul v29.4S, v29.4S,v12.s[2] +ldr q1, [x17, #+816] +mla v29.4S, v22.4S, v31.s[0] +trn2 v11.2D, v28.2D, v16.2D +trn2 v8.2D, v13.2D, v7.2D +sub v22.4s, v20.4s, v29.4s +trn1 v9.2D, v28.2D, v16.2D +trn1 v6.2D, v13.2D, v7.2D +add v20.4s, v20.4s, v29.4s +sqrdmulh v29.4S, v11.4S, v1.4S +mul v11.4S, v11.4S,v17.4S +mla v11.4S, v29.4S, v31.s[0] +trn1 v29.4S, 
v15.4S, v30.4S +trn2 v7.4S, v15.4S, v30.4S +sub v13.4s, v9.4s, v11.4s +trn1 v16.4S, v20.4S, v22.4S +trn2 v28.4S, v20.4S, v22.4S +add v9.4s, v9.4s, v11.4s +trn2 v20.2D, v29.2D, v16.2D +trn2 v22.2D, v7.2D, v28.2D +sqrdmulh v11.4S, v8.4S, v1.4S +trn1 v15.2D, v29.2D, v16.2D +trn1 v30.2D, v7.2D, v28.2D +ldr q28, [x17, #+1824] +ldr q7, [x17, #+1840] +mul v8.4S, v8.4S,v17.4S +mla v8.4S, v11.4S, v31.s[0] +sub v11.4s, v6.4s, v8.4s +add v6.4s, v6.4s, v8.4s +sqrdmulh v8.4S, v20.4S, v7.4S +mul v20.4S, v20.4S,v28.4S +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v15.4s, v20.4s +add v15.4s, v15.4s, v20.4s +ldr q20, [x17, #+832] +ldr q16, [x17, #+848] +sqrdmulh v29.4S, v22.4S, v7.4S +mul v22.4S, v22.4S,v28.4S +mla v22.4S, v29.4S, v31.s[0] +sub v29.4s, v30.4s, v22.4s +add v30.4s, v30.4s, v22.4s +ldr q22, [x17, #+864] +ldr q2, [x17, #+880] +sqrdmulh v21.4S, v6.4S, v16.4S +mul v6.4S, v6.4S,v20.4S +mla v6.4S, v21.4S, v31.s[0] +sub v21.4s, v9.4s, v6.4s +add v9.4s, v9.4s, v6.4s +ldr q6, [x17, #+1856] +ldr q3, [x17, #+1872] +sqrdmulh v18.4S, v11.4S, v2.4S +mul v11.4S, v11.4S,v22.4S +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +ldr q11, [x17, #+1888] +ldr q27, [x17, #+1904] +sqrdmulh v0.4S, v30.4S, v3.4S +ldr q19, [x0, #416] +mul v30.4S, v30.4S,v6.4S +mla v30.4S, v0.4S, v31.s[0] +sub v0.4s, v15.4s, v30.4s +add v15.4s, v15.4s, v30.4s +sqrdmulh v30.4S, v29.4S, v27.4S +ldr q14, [x0, #432] +mul v29.4S, v29.4S,v11.4S +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v8.4s, v29.4s +add v8.4s, v8.4s, v29.4s +str q9, [x0, #320] +str q21, [x0, #336] +str q13, [x0, #352] +str q18, [x0, #368] +str q15, [x0, #832] +str q0, [x0, #848] +str q8, [x0, #864] +str q30, [x0, #880] +ldr q27, [x17, #+896] +ldr q11, [x17, #+912] +ldr q3, [x0, #384] +ldr q6, [x0, #400] +ldr q7, [x17, #+1920] +ldr q28, [x17, #+1936] +sqrdmulh v4.4S, v19.4S, v11.s[0] +ldr q12, [x0, #928] +mul v19.4S, v19.4S,v27.s[0] +ldr q30, [x0, #944] +mla v19.4S, v4.4S, v31.s[0] +sub v4.4s, v3.4s, v19.4s +add 
v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v11.s[0] +ldr q8, [x0, #896] +mul v14.4S, v14.4S,v27.s[0] +ldr q0, [x0, #912] +mla v14.4S, v19.4S, v31.s[0] +sub v19.4s, v6.4s, v14.4s +add v6.4s, v6.4s, v14.4s +sqrdmulh v14.4S, v12.4S, v28.s[0] +mul v12.4S, v12.4S,v7.s[0] +mla v12.4S, v14.4S, v31.s[0] +sub v14.4s, v8.4s, v12.4s +add v8.4s, v8.4s, v12.4s +sqrdmulh v12.4S, v30.4S, v28.s[0] +mul v30.4S, v30.4S,v7.s[0] +mla v30.4S, v12.4S, v31.s[0] +sub v12.4s, v0.4s, v30.4s +add v0.4s, v0.4s, v30.4s +sqrdmulh v30.4S, v6.4S, v11.s[1] +mul v6.4S, v6.4S,v27.s[1] +mla v6.4S, v30.4S, v31.s[0] +sub v30.4s, v3.4s, v6.4s +add v3.4s, v3.4s, v6.4s +sqrdmulh v6.4S, v19.4S, v11.s[2] +mul v19.4S, v19.4S,v27.s[2] +mla v19.4S, v6.4S, v31.s[0] +sub v6.4s, v4.4s, v19.4s +add v4.4s, v4.4s, v19.4s +sqrdmulh v19.4S, v0.4S, v28.s[1] +mul v0.4S, v0.4S,v7.s[1] +mla v0.4S, v19.4S, v31.s[0] +sub v19.4s, v8.4s, v0.4s +trn1 v15.4S, v3.4S, v30.4S +trn2 v2.4S, v3.4S, v30.4S +add v8.4s, v8.4s, v0.4s +trn1 v0.4S, v4.4S, v6.4S +trn2 v22.4S, v4.4S, v6.4S +sqrdmulh v16.4S, v12.4S, v28.s[2] +ldr q20, [x17, #+928] +mul v12.4S, v12.4S,v7.s[2] +ldr q1, [x17, #+944] +mla v12.4S, v16.4S, v31.s[0] +trn2 v4.2D, v15.2D, v0.2D +trn2 v6.2D, v2.2D, v22.2D +sub v16.4s, v14.4s, v12.4s +trn1 v3.2D, v15.2D, v0.2D +trn1 v30.2D, v2.2D, v22.2D +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v4.4S, v1.4S +mul v4.4S, v4.4S,v20.4S +mla v4.4S, v12.4S, v31.s[0] +trn1 v12.4S, v8.4S, v19.4S +trn2 v22.4S, v8.4S, v19.4S +sub v2.4s, v3.4s, v4.4s +trn1 v0.4S, v14.4S, v16.4S +trn2 v15.4S, v14.4S, v16.4S +add v3.4s, v3.4s, v4.4s +trn2 v14.2D, v12.2D, v0.2D +trn2 v16.2D, v22.2D, v15.2D +sqrdmulh v4.4S, v6.4S, v1.4S +trn1 v8.2D, v12.2D, v0.2D +trn1 v19.2D, v22.2D, v15.2D +ldr q15, [x17, #+1952] +ldr q22, [x17, #+1968] +mul v6.4S, v6.4S,v20.4S +mla v6.4S, v4.4S, v31.s[0] +sub v4.4s, v30.4s, v6.4s +add v30.4s, v30.4s, v6.4s +sqrdmulh v6.4S, v14.4S, v22.4S +mul v14.4S, v14.4S,v15.4S +mla v14.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v14.4s +add 
v8.4s, v8.4s, v14.4s +ldr q14, [x17, #+960] +ldr q0, [x17, #+976] +sqrdmulh v12.4S, v16.4S, v22.4S +mul v16.4S, v16.4S,v15.4S +mla v16.4S, v12.4S, v31.s[0] +sub v12.4s, v19.4s, v16.4s +add v19.4s, v19.4s, v16.4s +ldr q16, [x17, #+992] +ldr q17, [x17, #+1008] +sqrdmulh v10.4S, v30.4S, v0.4S +mul v30.4S, v30.4S,v14.4S +mla v30.4S, v10.4S, v31.s[0] +sub v10.4s, v3.4s, v30.4s +add v3.4s, v3.4s, v30.4s +ldr q30, [x17, #+1984] +ldr q5, [x17, #+2000] +sqrdmulh v18.4S, v4.4S, v17.4S +mul v4.4S, v4.4S,v16.4S +mla v4.4S, v18.4S, v31.s[0] +sub v18.4s, v2.4s, v4.4s +add v2.4s, v2.4s, v4.4s +ldr q4, [x17, #+2016] +ldr q13, [x17, #+2032] +sqrdmulh v21.4S, v19.4S, v5.4S +ldr q9, [x0, #480] +mul v19.4S, v19.4S,v30.4S +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v8.4s, v19.4s +add v8.4s, v8.4s, v19.4s +sqrdmulh v19.4S, v12.4S, v13.4S +ldr q29, [x0, #496] +mul v12.4S, v12.4S,v4.4S +mla v12.4S, v19.4S, v31.s[0] +sub v19.4s, v6.4s, v12.4s +add v6.4s, v6.4s, v12.4s +str q3, [x0, #384] +str q10, [x0, #400] +str q2, [x0, #416] +str q18, [x0, #432] +str q8, [x0, #896] +str q21, [x0, #912] +str q6, [x0, #928] +str q19, [x0, #944] +ldr q13, [x17, #+1024] +ldr q4, [x17, #+1040] +ldr q5, [x0, #448] +ldr q30, [x0, #464] +ldr q22, [x17, #+2048] +ldr q15, [x17, #+2064] +sqrdmulh v28.4S, v9.4S, v4.s[0] +ldr q7, [x0, #992] +mul v9.4S, v9.4S,v13.s[0] +ldr q19, [x0, #1008] +mla v9.4S, v28.4S, v31.s[0] +sub v28.4s, v5.4s, v9.4s +add v5.4s, v5.4s, v9.4s +sqrdmulh v9.4S, v29.4S, v4.s[0] +ldr q6, [x0, #960] +mul v29.4S, v29.4S,v13.s[0] +ldr q21, [x0, #976] +mla v29.4S, v9.4S, v31.s[0] +sub v9.4s, v30.4s, v29.4s +add v30.4s, v30.4s, v29.4s +sqrdmulh v29.4S, v7.4S, v15.s[0] +mul v7.4S, v7.4S,v22.s[0] +mla v7.4S, v29.4S, v31.s[0] +sub v29.4s, v6.4s, v7.4s +add v6.4s, v6.4s, v7.4s +sqrdmulh v7.4S, v19.4S, v15.s[0] +mul v19.4S, v19.4S,v22.s[0] +mla v19.4S, v7.4S, v31.s[0] +sub v7.4s, v21.4s, v19.4s +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v30.4S, v4.s[1] +mul v30.4S, v30.4S,v13.s[1] +mla v30.4S, v19.4S, 
v31.s[0] +sub v19.4s, v5.4s, v30.4s +add v5.4s, v5.4s, v30.4s +sqrdmulh v30.4S, v9.4S, v4.s[2] +mul v9.4S, v9.4S,v13.s[2] +mla v9.4S, v30.4S, v31.s[0] +sub v30.4s, v28.4s, v9.4s +add v28.4s, v28.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v15.s[1] +mul v21.4S, v21.4S,v22.s[1] +mla v21.4S, v9.4S, v31.s[0] +sub v9.4s, v6.4s, v21.4s +trn1 v8.4S, v5.4S, v19.4S +trn2 v17.4S, v5.4S, v19.4S +add v6.4s, v6.4s, v21.4s +trn1 v21.4S, v28.4S, v30.4S +trn2 v16.4S, v28.4S, v30.4S +sqrdmulh v0.4S, v7.4S, v15.s[2] +ldr q14, [x17, #+1056] +mul v7.4S, v7.4S,v22.s[2] +ldr q1, [x17, #+1072] +mla v7.4S, v0.4S, v31.s[0] +trn2 v28.2D, v8.2D, v21.2D +trn2 v30.2D, v17.2D, v16.2D +sub v0.4s, v29.4s, v7.4s +trn1 v5.2D, v8.2D, v21.2D +trn1 v19.2D, v17.2D, v16.2D +add v29.4s, v29.4s, v7.4s +sqrdmulh v7.4S, v28.4S, v1.4S +mul v28.4S, v28.4S,v14.4S +mla v28.4S, v7.4S, v31.s[0] +trn1 v7.4S, v6.4S, v9.4S +trn2 v16.4S, v6.4S, v9.4S +sub v17.4s, v5.4s, v28.4s +trn1 v21.4S, v29.4S, v0.4S +trn2 v8.4S, v29.4S, v0.4S +add v5.4s, v5.4s, v28.4s +trn2 v29.2D, v7.2D, v21.2D +trn2 v0.2D, v16.2D, v8.2D +sqrdmulh v28.4S, v30.4S, v1.4S +trn1 v6.2D, v7.2D, v21.2D +trn1 v9.2D, v16.2D, v8.2D +ldr q8, [x17, #+2080] +ldr q16, [x17, #+2096] +mul v30.4S, v30.4S,v14.4S +mla v30.4S, v28.4S, v31.s[0] +sub v28.4s, v19.4s, v30.4s +add v19.4s, v19.4s, v30.4s +sqrdmulh v30.4S, v29.4S, v16.4S +mul v29.4S, v29.4S,v8.4S +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v6.4s, v29.4s +add v6.4s, v6.4s, v29.4s +ldr q29, [x17, #+1088] +ldr q21, [x17, #+1104] +sqrdmulh v7.4S, v0.4S, v16.4S +mul v0.4S, v0.4S,v8.4S +mla v0.4S, v7.4S, v31.s[0] +sub v7.4s, v9.4s, v0.4s +add v9.4s, v9.4s, v0.4s +ldr q0, [x17, #+1120] +ldr q20, [x17, #+1136] +sqrdmulh v11.4S, v19.4S, v21.4S +mul v19.4S, v19.4S,v29.4S +mla v19.4S, v11.4S, v31.s[0] +sub v11.4s, v5.4s, v19.4s +add v5.4s, v5.4s, v19.4s +ldr q19, [x17, #+2112] +ldr q27, [x17, #+2128] +sqrdmulh v18.4S, v28.4S, v20.4S +mul v28.4S, v28.4S,v0.4S +mla v28.4S, v18.4S, v31.s[0] +sub v18.4s, v17.4s, v28.4s +add 
v17.4s, v17.4s, v28.4s +ldr q28, [x17, #+2144] +ldr q2, [x17, #+2160] +sqrdmulh v10.4S, v9.4S, v27.4S +mul v9.4S, v9.4S,v19.4S +mla v9.4S, v10.4S, v31.s[0] +sub v10.4s, v6.4s, v9.4s +add v6.4s, v6.4s, v9.4s +sqrdmulh v9.4S, v7.4S, v2.4S +mul v7.4S, v7.4S,v28.4S +mla v7.4S, v9.4S, v31.s[0] +sub v9.4s, v30.4s, v7.4s +add v30.4s, v30.4s, v7.4s +str q5, [x0, #448] +str q11, [x0, #464] +str q17, [x0, #480] +str q18, [x0, #496] +str q6, [x0, #960] +str q10, [x0, #976] +str q30, [x0, #992] +str q9, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_5.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_5.s new file mode 100644 index 0000000..64b3010 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_5.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. 
+/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None 
+.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 
7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 
// Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 
+.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // 
Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, 
block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 
839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 +.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, 
block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 
104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 
// Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_3_z2_5 +.global _ntt_u32_full_neon_asm_var_4_4_3_z2_5 +ntt_u32_full_neon_asm_var_4_4_3_z2_5: +_ntt_u32_full_neon_asm_var_4_4_3_z2_5: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +sqrdmulh 
v2.4S, v22.4S, v29.s[0] +ldr q1, [x0, #544] +mul v22.4S, v22.4S,v30.s[0] +ldr q0, [x0, #608] +sqrdmulh v15.4S, v21.4S, v29.s[0] +ldr q14, [x0, #672] +mul v21.4S, v21.4S,v30.s[0] +ldr q13, [x0, #736] +mla v22.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q12, [x0, #32] +sub v11.4s, v18.4s, v22.4s +mla v21.4S, v15.4S, v31.s[0] +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q15, [x0, #96] +sub v10.4s, v17.4s, v21.4s +mla v20.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v29.s[0] +ldr q2, [x0, #160] +mul v1.4S, v1.4S,v30.s[0] +sub v9.4s, v16.4s, v20.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v29.s[0] +ldr q22, [x0, #224] +mul v0.4S, v0.4S,v30.s[0] +sub v8.4s, v3.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v12.4s, v1.4s +mla v0.4S, v20.4S, v31.s[0] +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +sub v20.4s, v15.4s, v0.4s +mla v14.4S, v19.4S, v31.s[0] +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v19.4s, v2.4s, v14.4s +mla v13.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v1.4s, v22.4s, v13.4s +mla v16.4S, v0.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v0.4s, v2.4s, v16.4s +mla v3.4S, v14.4S, v31.s[0] +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v22.4s, v3.4s +mla v18.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v13.4s, v12.4s, v18.4s +mla v17.4S, v16.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v16.4s, v15.4s, 
v17.4s +mla v9.4S, v3.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v3.4s, v19.4s, v9.4s +mla v8.4S, v18.4S, v31.s[0] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v8.4s +mla v11.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v17.4s, v21.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +sub v9.4s, v20.4s, v10.4s +mla v2.4S, v8.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v8.4s, v12.4s, v2.4s +mla v22.4S, v11.4S, v31.s[0] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v27.s[1] +mul v14.4S, v14.4S,v28.s[1] +sub v11.4s, v15.4s, v22.4s +mla v0.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v28.s[2] +sub v10.4s, v13.4s, v0.4s +mla v14.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v2.4s, v16.4s, v14.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +sub v22.4s, v21.4s, v19.4s +mla v1.4S, v0.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v0.4s, v20.4s, v1.4s +mla v3.4S, v14.4S, v31.s[0] +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +sub v14.4s, v17.4s, v3.4s +mla v18.4S, v19.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v25.s[1] +mul v11.4S, v11.4S,v26.s[1] +sub v19.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v1.4s, v12.4s, v15.4s +mla v11.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v25.s[3] +mul 
v2.4S, v2.4S,v26.s[3] +sub v3.4s, v8.4s, v11.4s +mla v16.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v11.4s +str q12, [x0, #32] +sqrdmulh v12.4S, v20.4S, v23.s[0] +str q1, [x0, #96] +mul v20.4S, v20.4S,v24.s[0] +ldr q1, [x0, #816] +sub v11.4s, v13.4s, v16.4s +ldr q18, [x0, #880] +mla v2.4S, v15.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +str q8, [x0, #160] +sqrdmulh v8.4S, v0.4S, v23.s[1] +str q3, [x0, #224] +mul v0.4S, v0.4S,v24.s[1] +ldr q3, [x0, #944] +sub v16.4s, v10.4s, v2.4s +ldr q15, [x0, #1008] +mla v20.4S, v12.4S, v31.s[0] +add v10.4s, v10.4s, v2.4s +str q13, [x0, #288] +sqrdmulh v13.4S, v9.4S, v23.s[2] +str q11, [x0, #352] +mul v9.4S, v9.4S,v24.s[2] +ldr q11, [x0, #304] +sub v2.4s, v21.4s, v20.4s +ldr q12, [x0, #368] +mla v0.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v20.4s +str q10, [x0, #416] +sqrdmulh v10.4S, v19.4S, v23.s[3] +str q16, [x0, #480] +mul v19.4S, v19.4S,v24.s[3] +ldr q16, [x0, #432] +sub v20.4s, v22.4s, v0.4s +ldr q8, [x0, #496] +mla v9.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +str q21, [x0, #544] +sqrdmulh v21.4S, v1.4S, v29.s[0] +str q2, [x0, #608] +ldr q2, [x0, #560] +mul v1.4S, v1.4S,v30.s[0] +ldr q0, [x0, #624] +sub v13.4s, v17.4s, v9.4s +mla v19.4S, v10.4S, v31.s[0] +add v17.4s, v17.4s, v9.4s +str q22, [x0, #672] +sqrdmulh v22.4S, v18.4S, v29.s[0] +str q20, [x0, #736] +ldr q20, [x0, #688] +mul v18.4S, v18.4S,v30.s[0] +ldr q9, [x0, #752] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +str q17, [x0, #800] +sqrdmulh v17.4S, v3.4S, v29.s[0] +str q13, [x0, #864] +mul v3.4S, v3.4S,v30.s[0] +ldr q13, [x0, #48] +sub v19.4s, v11.4s, v1.4s +mla v18.4S, v22.4S, v31.s[0] +add v11.4s, v11.4s, v1.4s +str q14, [x0, #928] +sqrdmulh v14.4S, v15.4S, v29.s[0] +str q10, [x0, #992] +mul v15.4S, v15.4S,v30.s[0] +ldr q10, [x0, #112] +sub v1.4s, v12.4s, v18.4s +mla v3.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v29.s[0] +ldr q17, [x0, #176] +mul v2.4S, v2.4S,v30.s[0] +sub v22.4s, v16.4s, 
v3.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v29.s[0] +ldr q14, [x0, #240] +mul v0.4S, v0.4S,v30.s[0] +sub v21.4s, v8.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v18.4s, v13.4s, v2.4s +mla v0.4S, v3.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +sub v3.4s, v10.4s, v0.4s +mla v20.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v15.4s, v17.4s, v20.4s +mla v9.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v2.4s, v14.4s, v9.4s +mla v16.4S, v0.4S, v31.s[0] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v29.s[1] +mul v11.4S, v11.4S,v30.s[1] +sub v0.4s, v17.4s, v16.4s +mla v8.4S, v20.4S, v31.s[0] +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v29.s[1] +mul v12.4S, v12.4S,v30.s[1] +sub v20.4s, v14.4s, v8.4s +mla v11.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v9.4s, v13.4s, v11.4s +mla v12.4S, v16.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v16.4s, v10.4s, v12.4s +mla v22.4S, v8.4S, v31.s[0] +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v8.4s, v15.4s, v22.4s +mla v21.4S, v11.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +sub v11.4s, v2.4s, v21.4s +mla v19.4S, v12.4S, v31.s[0] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +sub v12.4s, v18.4s, v19.4s +mla v1.4S, v22.4S, v31.s[0] +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v22.4s, v3.4s, v1.4s +mla v17.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v1.4s +sqrdmulh v1.4S, 
v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v21.4s, v13.4s, v17.4s +mla v14.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +sub v19.4s, v10.4s, v14.4s +mla v0.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v27.s[2] +mul v15.4S, v15.4S,v28.s[2] +sub v1.4s, v9.4s, v0.4s +mla v20.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v17.4s, v16.4s, v20.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v14.4s, v18.4s, v15.4s +mla v2.4S, v0.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[3] +mul v11.4S, v11.4S,v28.s[3] +sub v0.4s, v3.4s, v2.4s +mla v8.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +sub v20.4s, v12.4s, v8.4s +mla v11.4S, v15.4S, v31.s[0] +add v12.4s, v12.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v15.4s, v22.4s, v11.4s +mla v10.4S, v2.4S, v31.s[0] +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v2.4s, v13.4s, v10.4s +mla v19.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v25.s[3] +mul v17.4S, v17.4S,v26.s[3] +sub v8.4s, v21.4s, v19.4s +mla v16.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +str q13, [x0, #48] +sqrdmulh v13.4S, v3.4S, v23.s[0] +str q2, [x0, #112] +mul v3.4S, v3.4S,v24.s[0] +ldr q2, [x0, #768] +sub v19.4s, v9.4s, v16.4s +ldr q11, [x0, #832] +mla v17.4S, v10.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +str q21, [x0, #176] +sqrdmulh v21.4S, v0.4S, v23.s[1] +str q8, [x0, #240] +mul v0.4S, v0.4S,v24.s[1] +ldr q8, [x0, #896] +sub v16.4s, v1.4s, v17.4s +ldr q10, [x0, #960] +mla v3.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +str q9, [x0, #304] +sqrdmulh v9.4S, v22.4S, v23.s[2] +str q19, [x0, #368] +mul v22.4S, 
v22.4S,v24.s[2] +ldr q19, [x0, #256] +sub v17.4s, v18.4s, v3.4s +ldr q13, [x0, #320] +mla v0.4S, v21.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +str q1, [x0, #432] +sqrdmulh v1.4S, v15.4S, v23.s[3] +str q16, [x0, #496] +mul v15.4S, v15.4S,v24.s[3] +ldr q16, [x0, #384] +sub v3.4s, v14.4s, v0.4s +ldr q21, [x0, #448] +mla v22.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +str q18, [x0, #560] +sqrdmulh v18.4S, v2.4S, v29.s[0] +str q17, [x0, #624] +ldr q17, [x0, #512] +mul v2.4S, v2.4S,v30.s[0] +ldr q0, [x0, #576] +sub v9.4s, v12.4s, v22.4s +mla v15.4S, v1.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s +str q14, [x0, #688] +sqrdmulh v14.4S, v11.4S, v29.s[0] +str q3, [x0, #752] +ldr q3, [x0, #640] +mul v11.4S, v11.4S,v30.s[0] +ldr q22, [x0, #704] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +str q12, [x0, #816] +sqrdmulh v12.4S, v8.4S, v29.s[0] +str q9, [x0, #880] +mul v8.4S, v8.4S,v30.s[0] +ldr q9, [x0, #0] +sub v15.4s, v19.4s, v2.4s +mla v11.4S, v14.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +str q20, [x0, #944] +sqrdmulh v20.4S, v10.4S, v29.s[0] +str q1, [x0, #1008] +mul v10.4S, v10.4S,v30.s[0] +ldr q1, [x0, #64] +sub v2.4s, v13.4s, v11.4s +mla v8.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v29.s[0] +ldr q12, [x0, #128] +mul v17.4S, v17.4S,v30.s[0] +sub v14.4s, v16.4s, v8.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v29.s[0] +ldr q20, [x0, #192] +mul v0.4S, v0.4S,v30.s[0] +sub v18.4s, v21.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +sub v11.4s, v9.4s, v17.4s +mla v0.4S, v8.4S, v31.s[0] +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v8.4s, v1.4s, v0.4s +mla v3.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v10.4s, v12.4s, v3.4s +mla v22.4S, v17.4S, v31.s[0] +add v12.4s, 
v12.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v17.4s, v20.4s, v22.4s +mla v16.4S, v0.4S, v31.s[0] +add v20.4s, v20.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +sub v0.4s, v12.4s, v16.4s +mla v21.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v3.4s, v20.4s, v21.4s +mla v19.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v22.4s, v9.4s, v19.4s +mla v13.4S, v16.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v16.4s, v1.4s, v13.4s +mla v14.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v21.4s, v10.4s, v14.4s +mla v18.4S, v19.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v19.4s, v17.4s, v18.4s +mla v15.4S, v13.4S, v31.s[0] +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v13.4s, v11.4s, v15.4s +mla v2.4S, v14.4S, v31.s[0] +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v27.s[0] +mul v20.4S, v20.4S,v28.s[0] +sub v14.4s, v8.4s, v2.4s +mla v12.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v18.4s, v9.4s, v12.4s +mla v20.4S, v15.4S, v31.s[0] +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +sub v15.4s, v1.4s, v20.4s +mla v0.4S, v2.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +sub v2.4s, v22.4s, v0.4s +mla v3.4S, v12.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v12.4s, v16.4s, v3.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, 
v11.4s, v10.4s +mla v17.4S, v0.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v27.s[3] +mul v19.4S, v19.4S,v28.s[3] +sub v0.4s, v8.4s, v17.4s +mla v21.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v17.4s +sqrdmulh v17.4S, v1.4S, v25.s[0] +mul v1.4S, v1.4S,v26.s[0] +sub v3.4s, v13.4s, v21.4s +mla v19.4S, v10.4S, v31.s[0] +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v25.s[1] +mul v15.4S, v15.4S,v26.s[1] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v17.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v17.4s, v9.4s, v1.4s +mla v15.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +sub v21.4s, v18.4s, v15.4s +mla v16.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +str q9, [x0, #0] +sqrdmulh v9.4S, v8.4S, v23.s[0] +str q17, [x0, #64] +mul v8.4S, v8.4S,v24.s[0] +ldr q17, [x0, #784] +sub v15.4s, v22.4s, v16.4s +ldr q19, [x0, #848] +mla v12.4S, v1.4S, v31.s[0] +add v22.4s, v22.4s, v16.4s +str q18, [x0, #128] +sqrdmulh v18.4S, v0.4S, v23.s[1] +str q21, [x0, #192] +mul v0.4S, v0.4S,v24.s[1] +ldr q21, [x0, #912] +sub v16.4s, v2.4s, v12.4s +ldr q1, [x0, #976] +mla v8.4S, v9.4S, v31.s[0] +add v2.4s, v2.4s, v12.4s +str q22, [x0, #256] +sqrdmulh v22.4S, v14.4S, v23.s[2] +str q15, [x0, #320] +mul v14.4S, v14.4S,v24.s[2] +ldr q15, [x0, #272] +sub v12.4s, v11.4s, v8.4s +ldr q9, [x0, #336] +mla v0.4S, v18.4S, v31.s[0] +add v11.4s, v11.4s, v8.4s +str q2, [x0, #384] +sqrdmulh v2.4S, v10.4S, v23.s[3] +str q16, [x0, #448] +mul v10.4S, v10.4S,v24.s[3] +ldr q16, [x0, #400] +sub v8.4s, v20.4s, v0.4s +ldr q18, [x0, #464] +mla v14.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v0.4s +str q11, [x0, #512] +sqrdmulh v11.4S, v17.4S, v29.s[0] +str q12, [x0, #576] +ldr q12, [x0, #528] +mul v17.4S, v17.4S,v30.s[0] +ldr q0, [x0, #592] +sub v22.4s, v13.4s, v14.4s +mla v10.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v14.4s +str q20, [x0, #640] +sqrdmulh v20.4S, v19.4S, 
v29.s[0] +str q8, [x0, #704] +ldr q8, [x0, #656] +mul v19.4S, v19.4S,v30.s[0] +ldr q14, [x0, #720] +sub v2.4s, v3.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v3.4s, v3.4s, v10.4s +str q13, [x0, #768] +sqrdmulh v13.4S, v21.4S, v29.s[0] +str q22, [x0, #832] +mul v21.4S, v21.4S,v30.s[0] +ldr q22, [x0, #16] +sub v10.4s, v15.4s, v17.4s +mla v19.4S, v20.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +str q3, [x0, #896] +sqrdmulh v3.4S, v1.4S, v29.s[0] +str q2, [x0, #960] +mul v1.4S, v1.4S,v30.s[0] +ldr q2, [x0, #80] +sub v17.4s, v9.4s, v19.4s +mla v21.4S, v13.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v12.4S, v29.s[0] +ldr q13, [x0, #144] +mul v12.4S, v12.4S,v30.s[0] +sub v20.4s, v16.4s, v21.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v29.s[0] +ldr q3, [x0, #208] +mul v0.4S, v0.4S,v30.s[0] +sub v11.4s, v18.4s, v1.4s +mla v12.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v19.4s, v22.4s, v12.4s +mla v0.4S, v21.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v2.4s, v0.4s +mla v8.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v1.4s, v13.4s, v8.4s +mla v14.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v12.4s, v3.4s, v14.4s +mla v16.4S, v0.4S, v31.s[0] +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +sub v0.4s, v13.4s, v16.4s +mla v18.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v9.4S, v29.s[1] +mul v9.4S, v9.4S,v30.s[1] +sub v8.4s, v3.4s, v18.4s +mla v15.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v14.4s, v22.4s, v15.4s +mla v9.4S, v16.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v29.s[2] +mul v11.4S, 
v11.4S,v30.s[2] +sub v16.4s, v2.4s, v9.4s +mla v20.4S, v18.4S, v31.s[0] +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v20.4s +mla v11.4S, v15.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v15.4s, v12.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +sub v9.4s, v19.4s, v10.4s +mla v17.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v27.s[0] +mul v3.4S, v3.4S,v28.s[0] +sub v20.4s, v21.4s, v17.4s +mla v13.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v11.4s, v22.4s, v13.4s +mla v3.4S, v10.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v10.4s, v2.4s, v3.4s +mla v0.4S, v17.4S, v31.s[0] +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v17.4s, v14.4s, v0.4s +mla v8.4S, v13.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +sub v13.4s, v16.4s, v8.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v3.4s, v19.4s, v1.4s +mla v12.4S, v0.4S, v31.s[0] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v0.4s, v21.4s, v12.4s +mla v18.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v25.s[1] +mul v10.4S, v10.4S,v26.s[1] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v12.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v12.4s, v22.4s, v2.4s +mla v10.4S, v18.4S, v31.s[0] +add v22.4s, v22.4s, v2.4s 
+sqrdmulh v2.4S, v13.4S, v25.s[3] +mul v13.4S, v13.4S,v26.s[3] +sub v18.4s, v11.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +str q22, [x0, #16] +sqrdmulh v22.4S, v21.4S, v23.s[0] +str q12, [x0, #80] +mul v21.4S, v21.4S,v24.s[0] +sub v12.4s, v14.4s, v16.4s +mla v13.4S, v2.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +str q11, [x0, #144] +sqrdmulh v11.4S, v0.4S, v23.s[1] +str q18, [x0, #208] +mul v0.4S, v0.4S,v24.s[1] +sub v18.4s, v17.4s, v13.4s +mla v21.4S, v22.4S, v31.s[0] +add v17.4s, v17.4s, v13.4s +str q14, [x0, #272] +sqrdmulh v14.4S, v20.4S, v23.s[2] +str q12, [x0, #336] +mul v20.4S, v20.4S,v24.s[2] +sub v12.4s, v19.4s, v21.4s +mla v0.4S, v11.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +str q17, [x0, #400] +sqrdmulh v17.4S, v1.4S, v23.s[3] +str q18, [x0, #464] +mul v1.4S, v1.4S,v24.s[3] +sub v18.4s, v3.4s, v0.4s +mla v20.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v0.4s +str q19, [x0, #528] +str q12, [x0, #592] +sub v12.4s, v9.4s, v20.4s +mla v1.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v20.4s +str q3, [x0, #656] +str q18, [x0, #720] +sub v18.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +str q9, [x0, #784] +str q12, [x0, #848] +str q8, [x0, #912] +str q18, [x0, #976] +ldr q4, [x0, #32] +ldr q5, [x0, #48] +ldr q6, [x17, #+128] +ldr q7, [x17, #+144] +ldr q15, [x0, #0] +ldr q10, [x0, #16] +ldr q2, [x17, #+1152] +ldr q16, [x17, #+1168] +sqrdmulh v22.4S, v4.4S, v7.s[0] +ldr q13, [x0, #544] +mul v4.4S, v4.4S,v6.s[0] +ldr q11, [x0, #560] +sqrdmulh v21.4S, v5.4S, v7.s[0] +ldr q14, [x0, #512] +mul v5.4S, v5.4S,v6.s[0] +ldr q0, [x0, #528] +mla v4.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v13.4S, v16.s[0] +mul v13.4S, v13.4S,v2.s[0] +mla v5.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v4.4s +add v15.4s, v15.4s, v4.4s +sqrdmulh v4.4S, v11.4S, v16.s[0] +mul v11.4S, v11.4S,v2.s[0] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v5.4s +add v10.4s, v10.4s, v5.4s +sqrdmulh v5.4S, v10.4S, v7.s[1] +mul v10.4S, v10.4S,v6.s[1] +mla v11.4S, v4.4S, v31.s[0] +sub v4.4s, v14.4s, 
v13.4s +add v14.4s, v14.4s, v13.4s +sqrdmulh v13.4S, v22.4S, v7.s[2] +mul v22.4S, v22.4S,v6.s[2] +mla v10.4S, v5.4S, v31.s[0] +sub v5.4s, v0.4s, v11.4s +add v0.4s, v0.4s, v11.4s +sqrdmulh v11.4S, v0.4S, v16.s[1] +mul v0.4S, v0.4S,v2.s[1] +mla v22.4S, v13.4S, v31.s[0] +sub v13.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +sqrdmulh v10.4S, v5.4S, v16.s[2] +mul v5.4S, v5.4S,v2.s[2] +mla v0.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +trn1 v22.4S, v15.4S, v13.4S +trn2 v19.4S, v15.4S, v13.4S +trn1 v17.4S, v21.4S, v11.4S +trn2 v20.4S, v21.4S, v11.4S +trn2 v21.2D, v22.2D, v17.2D +trn2 v11.2D, v19.2D, v20.2D +trn1 v15.2D, v22.2D, v17.2D +trn1 v13.2D, v19.2D, v20.2D +ldr q20, [x17, #+160] +ldr q19, [x17, #+176] +sqrdmulh v17.4S, v21.4S, v19.4S +mul v21.4S, v21.4S,v20.4S +mla v5.4S, v10.4S, v31.s[0] +sub v10.4s, v14.4s, v0.4s +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v11.4S, v19.4S +mul v11.4S, v11.4S,v20.4S +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v4.4s, v5.4s +add v4.4s, v4.4s, v5.4s +trn1 v5.4S, v14.4S, v10.4S +trn2 v22.4S, v14.4S, v10.4S +trn1 v3.4S, v4.4S, v17.4S +trn2 v1.4S, v4.4S, v17.4S +trn2 v4.2D, v5.2D, v3.2D +trn2 v17.2D, v22.2D, v1.2D +trn1 v14.2D, v5.2D, v3.2D +trn1 v10.2D, v22.2D, v1.2D +ldr q1, [x17, #+1184] +ldr q22, [x17, #+1200] +sqrdmulh v3.4S, v4.4S, v22.4S +ldr q5, [x17, #+192] +ldr q9, [x17, #+208] +mul v4.4S, v4.4S,v1.4S +mla v11.4S, v0.4S, v31.s[0] +sub v0.4s, v15.4s, v21.4s +add v15.4s, v15.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v22.4S +ldr q12, [x17, #+224] +ldr q8, [x17, #+240] +mul v17.4S, v17.4S,v1.4S +mla v4.4S, v3.4S, v31.s[0] +sub v3.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v9.4S +ldr q18, [x17, #+1216] +ldr q30, [x17, #+1232] +mul v13.4S, v13.4S,v5.4S +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v4.4s +add v14.4s, v14.4s, v4.4s +sqrdmulh v4.4S, v3.4S, v8.4S +ldr q29, [x17, #+1248] +ldr q28, [x17, #+1264] +mul v3.4S, v3.4S,v12.4S +mla v13.4S, v11.4S, v31.s[0] +sub 
v11.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v10.4S, v30.4S +ldr q27, [x0, #96] +mul v10.4S, v10.4S,v18.4S +mla v3.4S, v4.4S, v31.s[0] +sub v4.4s, v15.4s, v13.4s +add v15.4s, v15.4s, v13.4s +sqrdmulh v13.4S, v11.4S, v28.4S +ldr q26, [x0, #112] +mul v11.4S, v11.4S,v29.4S +mla v10.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v3.4s +add v0.4s, v0.4s, v3.4s +mla v11.4S, v13.4S, v31.s[0] +sub v13.4s, v14.4s, v10.4s +add v14.4s, v14.4s, v10.4s +sub v10.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +str q15, [x0, #0] +str q4, [x0, #16] +str q0, [x0, #32] +str q17, [x0, #48] +str q14, [x0, #512] +str q13, [x0, #528] +str q21, [x0, #544] +str q10, [x0, #560] +ldr q28, [x17, #+256] +ldr q29, [x17, #+272] +ldr q30, [x0, #64] +ldr q18, [x0, #80] +ldr q22, [x17, #+1280] +ldr q1, [x17, #+1296] +sqrdmulh v16.4S, v27.4S, v29.s[0] +ldr q2, [x0, #608] +mul v27.4S, v27.4S,v28.s[0] +ldr q10, [x0, #624] +sqrdmulh v21.4S, v26.4S, v29.s[0] +ldr q13, [x0, #576] +mul v26.4S, v26.4S,v28.s[0] +ldr q14, [x0, #592] +mla v27.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v2.4S, v1.s[0] +mul v2.4S, v2.4S,v22.s[0] +mla v26.4S, v21.4S, v31.s[0] +sub v21.4s, v30.4s, v27.4s +add v30.4s, v30.4s, v27.4s +sqrdmulh v27.4S, v10.4S, v1.s[0] +mul v10.4S, v10.4S,v22.s[0] +mla v2.4S, v16.4S, v31.s[0] +sub v16.4s, v18.4s, v26.4s +add v18.4s, v18.4s, v26.4s +sqrdmulh v26.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v28.s[1] +mla v10.4S, v27.4S, v31.s[0] +sub v27.4s, v13.4s, v2.4s +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v28.s[2] +mla v18.4S, v26.4S, v31.s[0] +sub v26.4s, v14.4s, v10.4s +add v14.4s, v14.4s, v10.4s +sqrdmulh v10.4S, v14.4S, v1.s[1] +mul v14.4S, v14.4S,v22.s[1] +mla v16.4S, v2.4S, v31.s[0] +sub v2.4s, v30.4s, v18.4s +add v30.4s, v30.4s, v18.4s +sqrdmulh v18.4S, v26.4S, v1.s[2] +mul v26.4S, v26.4S,v22.s[2] +mla v14.4S, v10.4S, v31.s[0] +sub v10.4s, v21.4s, v16.4s +add v21.4s, v21.4s, v16.4s +trn1 v16.4S, v30.4S, v2.4S +trn2 v8.4S, v30.4S, v2.4S 
+trn1 v12.4S, v21.4S, v10.4S +trn2 v9.4S, v21.4S, v10.4S +trn2 v21.2D, v16.2D, v12.2D +trn2 v10.2D, v8.2D, v9.2D +trn1 v30.2D, v16.2D, v12.2D +trn1 v2.2D, v8.2D, v9.2D +ldr q9, [x17, #+288] +ldr q8, [x17, #+304] +sqrdmulh v12.4S, v21.4S, v8.4S +mul v21.4S, v21.4S,v9.4S +mla v26.4S, v18.4S, v31.s[0] +sub v18.4s, v13.4s, v14.4s +add v13.4s, v13.4s, v14.4s +sqrdmulh v14.4S, v10.4S, v8.4S +mul v10.4S, v10.4S,v9.4S +mla v21.4S, v12.4S, v31.s[0] +sub v12.4s, v27.4s, v26.4s +add v27.4s, v27.4s, v26.4s +trn1 v26.4S, v13.4S, v18.4S +trn2 v16.4S, v13.4S, v18.4S +trn1 v5.4S, v27.4S, v12.4S +trn2 v19.4S, v27.4S, v12.4S +trn2 v27.2D, v26.2D, v5.2D +trn2 v12.2D, v16.2D, v19.2D +trn1 v13.2D, v26.2D, v5.2D +trn1 v18.2D, v16.2D, v19.2D +ldr q19, [x17, #+1312] +ldr q16, [x17, #+1328] +sqrdmulh v5.4S, v27.4S, v16.4S +ldr q26, [x17, #+320] +ldr q20, [x17, #+336] +mul v27.4S, v27.4S,v19.4S +mla v10.4S, v14.4S, v31.s[0] +sub v14.4s, v30.4s, v21.4s +add v30.4s, v30.4s, v21.4s +sqrdmulh v21.4S, v12.4S, v16.4S +ldr q7, [x17, #+352] +ldr q6, [x17, #+368] +mul v12.4S, v12.4S,v19.4S +mla v27.4S, v5.4S, v31.s[0] +sub v5.4s, v2.4s, v10.4s +add v2.4s, v2.4s, v10.4s +sqrdmulh v10.4S, v2.4S, v20.4S +ldr q17, [x17, #+1344] +ldr q0, [x17, #+1360] +mul v2.4S, v2.4S,v26.4S +mla v12.4S, v21.4S, v31.s[0] +sub v21.4s, v13.4s, v27.4s +add v13.4s, v13.4s, v27.4s +sqrdmulh v27.4S, v5.4S, v6.4S +ldr q4, [x17, #+1376] +ldr q15, [x17, #+1392] +mul v5.4S, v5.4S,v7.4S +mla v2.4S, v10.4S, v31.s[0] +sub v10.4s, v18.4s, v12.4s +add v18.4s, v18.4s, v12.4s +sqrdmulh v12.4S, v18.4S, v0.4S +ldr q11, [x0, #160] +mul v18.4S, v18.4S,v17.4S +mla v5.4S, v27.4S, v31.s[0] +sub v27.4s, v30.4s, v2.4s +add v30.4s, v30.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v15.4S +ldr q3, [x0, #176] +mul v10.4S, v10.4S,v4.4S +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v14.4s, v5.4s +add v14.4s, v14.4s, v5.4s +mla v10.4S, v2.4S, v31.s[0] +sub v2.4s, v13.4s, v18.4s +add v13.4s, v13.4s, v18.4s +sub v18.4s, v21.4s, v10.4s +add v21.4s, v21.4s, v10.4s +str 
q30, [x0, #64] +str q27, [x0, #80] +str q14, [x0, #96] +str q12, [x0, #112] +str q13, [x0, #576] +str q2, [x0, #592] +str q21, [x0, #608] +str q18, [x0, #624] +ldr q15, [x17, #+384] +ldr q4, [x17, #+400] +ldr q0, [x0, #128] +ldr q17, [x0, #144] +ldr q16, [x17, #+1408] +ldr q19, [x17, #+1424] +sqrdmulh v1.4S, v11.4S, v4.s[0] +ldr q22, [x0, #672] +mul v11.4S, v11.4S,v15.s[0] +ldr q18, [x0, #688] +sqrdmulh v21.4S, v3.4S, v4.s[0] +ldr q2, [x0, #640] +mul v3.4S, v3.4S,v15.s[0] +ldr q13, [x0, #656] +mla v11.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v22.4S, v19.s[0] +mul v22.4S, v22.4S,v16.s[0] +mla v3.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v11.4s +add v0.4s, v0.4s, v11.4s +sqrdmulh v11.4S, v18.4S, v19.s[0] +mul v18.4S, v18.4S,v16.s[0] +mla v22.4S, v1.4S, v31.s[0] +sub v1.4s, v17.4s, v3.4s +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v17.4S, v4.s[1] +mul v17.4S, v17.4S,v15.s[1] +mla v18.4S, v11.4S, v31.s[0] +sub v11.4s, v2.4s, v22.4s +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v4.s[2] +mul v1.4S, v1.4S,v15.s[2] +mla v17.4S, v3.4S, v31.s[0] +sub v3.4s, v13.4s, v18.4s +add v13.4s, v13.4s, v18.4s +sqrdmulh v18.4S, v13.4S, v19.s[1] +mul v13.4S, v13.4S,v16.s[1] +mla v1.4S, v22.4S, v31.s[0] +sub v22.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v19.s[2] +mul v3.4S, v3.4S,v16.s[2] +mla v13.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v1.4s +add v21.4s, v21.4s, v1.4s +trn1 v1.4S, v0.4S, v22.4S +trn2 v6.4S, v0.4S, v22.4S +trn1 v7.4S, v21.4S, v18.4S +trn2 v20.4S, v21.4S, v18.4S +trn2 v21.2D, v1.2D, v7.2D +trn2 v18.2D, v6.2D, v20.2D +trn1 v0.2D, v1.2D, v7.2D +trn1 v22.2D, v6.2D, v20.2D +ldr q20, [x17, #+416] +ldr q6, [x17, #+432] +sqrdmulh v7.4S, v21.4S, v6.4S +mul v21.4S, v21.4S,v20.4S +mla v3.4S, v17.4S, v31.s[0] +sub v17.4s, v2.4s, v13.4s +add v2.4s, v2.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v6.4S +mul v18.4S, v18.4S,v20.4S +mla v21.4S, v7.4S, v31.s[0] +sub v7.4s, v11.4s, v3.4s +add v11.4s, v11.4s, v3.4s +trn1 v3.4S, v2.4S, v17.4S +trn2 v1.4S, v2.4S, v17.4S 
+trn1 v26.4S, v11.4S, v7.4S +trn2 v8.4S, v11.4S, v7.4S +trn2 v11.2D, v3.2D, v26.2D +trn2 v7.2D, v1.2D, v8.2D +trn1 v2.2D, v3.2D, v26.2D +trn1 v17.2D, v1.2D, v8.2D +ldr q8, [x17, #+1440] +ldr q1, [x17, #+1456] +sqrdmulh v26.4S, v11.4S, v1.4S +ldr q3, [x17, #+448] +ldr q9, [x17, #+464] +mul v11.4S, v11.4S,v8.4S +mla v18.4S, v13.4S, v31.s[0] +sub v13.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +sqrdmulh v21.4S, v7.4S, v1.4S +ldr q29, [x17, #+480] +ldr q28, [x17, #+496] +mul v7.4S, v7.4S,v8.4S +mla v11.4S, v26.4S, v31.s[0] +sub v26.4s, v22.4s, v18.4s +add v22.4s, v22.4s, v18.4s +sqrdmulh v18.4S, v22.4S, v9.4S +ldr q12, [x17, #+1472] +ldr q14, [x17, #+1488] +mul v22.4S, v22.4S,v3.4S +mla v7.4S, v21.4S, v31.s[0] +sub v21.4s, v2.4s, v11.4s +add v2.4s, v2.4s, v11.4s +sqrdmulh v11.4S, v26.4S, v28.4S +ldr q27, [x17, #+1504] +ldr q30, [x17, #+1520] +mul v26.4S, v26.4S,v29.4S +mla v22.4S, v18.4S, v31.s[0] +sub v18.4s, v17.4s, v7.4s +add v17.4s, v17.4s, v7.4s +sqrdmulh v7.4S, v17.4S, v14.4S +ldr q10, [x0, #224] +mul v17.4S, v17.4S,v12.4S +mla v26.4S, v11.4S, v31.s[0] +sub v11.4s, v0.4s, v22.4s +add v0.4s, v0.4s, v22.4s +sqrdmulh v22.4S, v18.4S, v30.4S +ldr q5, [x0, #240] +mul v18.4S, v18.4S,v27.4S +mla v17.4S, v7.4S, v31.s[0] +sub v7.4s, v13.4s, v26.4s +add v13.4s, v13.4s, v26.4s +mla v18.4S, v22.4S, v31.s[0] +sub v22.4s, v2.4s, v17.4s +add v2.4s, v2.4s, v17.4s +sub v17.4s, v21.4s, v18.4s +add v21.4s, v21.4s, v18.4s +str q0, [x0, #128] +str q11, [x0, #144] +str q13, [x0, #160] +str q7, [x0, #176] +str q2, [x0, #640] +str q22, [x0, #656] +str q21, [x0, #672] +str q17, [x0, #688] +ldr q30, [x17, #+512] +ldr q27, [x17, #+528] +ldr q14, [x0, #192] +ldr q12, [x0, #208] +ldr q1, [x17, #+1536] +ldr q8, [x17, #+1552] +sqrdmulh v19.4S, v10.4S, v27.s[0] +ldr q16, [x0, #736] +mul v10.4S, v10.4S,v30.s[0] +ldr q17, [x0, #752] +sqrdmulh v21.4S, v5.4S, v27.s[0] +ldr q22, [x0, #704] +mul v5.4S, v5.4S,v30.s[0] +ldr q2, [x0, #720] +mla v10.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v16.4S, v8.s[0] 
+mul v16.4S, v16.4S,v1.s[0] +mla v5.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v10.4s +add v14.4s, v14.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v8.s[0] +mul v17.4S, v17.4S,v1.s[0] +mla v16.4S, v19.4S, v31.s[0] +sub v19.4s, v12.4s, v5.4s +add v12.4s, v12.4s, v5.4s +sqrdmulh v5.4S, v12.4S, v27.s[1] +mul v12.4S, v12.4S,v30.s[1] +mla v17.4S, v10.4S, v31.s[0] +sub v10.4s, v22.4s, v16.4s +add v22.4s, v22.4s, v16.4s +sqrdmulh v16.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v30.s[2] +mla v12.4S, v5.4S, v31.s[0] +sub v5.4s, v2.4s, v17.4s +add v2.4s, v2.4s, v17.4s +sqrdmulh v17.4S, v2.4S, v8.s[1] +mul v2.4S, v2.4S,v1.s[1] +mla v19.4S, v16.4S, v31.s[0] +sub v16.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v5.4S, v8.s[2] +mul v5.4S, v5.4S,v1.s[2] +mla v2.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v19.4s +add v21.4s, v21.4s, v19.4s +trn1 v19.4S, v14.4S, v16.4S +trn2 v28.4S, v14.4S, v16.4S +trn1 v29.4S, v21.4S, v17.4S +trn2 v9.4S, v21.4S, v17.4S +trn2 v21.2D, v19.2D, v29.2D +trn2 v17.2D, v28.2D, v9.2D +trn1 v14.2D, v19.2D, v29.2D +trn1 v16.2D, v28.2D, v9.2D +ldr q9, [x17, #+544] +ldr q28, [x17, #+560] +sqrdmulh v29.4S, v21.4S, v28.4S +mul v21.4S, v21.4S,v9.4S +mla v5.4S, v12.4S, v31.s[0] +sub v12.4s, v22.4s, v2.4s +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v17.4S, v28.4S +mul v17.4S, v17.4S,v9.4S +mla v21.4S, v29.4S, v31.s[0] +sub v29.4s, v10.4s, v5.4s +add v10.4s, v10.4s, v5.4s +trn1 v5.4S, v22.4S, v12.4S +trn2 v19.4S, v22.4S, v12.4S +trn1 v3.4S, v10.4S, v29.4S +trn2 v6.4S, v10.4S, v29.4S +trn2 v10.2D, v5.2D, v3.2D +trn2 v29.2D, v19.2D, v6.2D +trn1 v22.2D, v5.2D, v3.2D +trn1 v12.2D, v19.2D, v6.2D +ldr q6, [x17, #+1568] +ldr q19, [x17, #+1584] +sqrdmulh v3.4S, v10.4S, v19.4S +ldr q5, [x17, #+576] +ldr q20, [x17, #+592] +mul v10.4S, v10.4S,v6.4S +mla v17.4S, v2.4S, v31.s[0] +sub v2.4s, v14.4s, v21.4s +add v14.4s, v14.4s, v21.4s +sqrdmulh v21.4S, v29.4S, v19.4S +ldr q4, [x17, #+608] +ldr q15, [x17, #+624] +mul v29.4S, v29.4S,v6.4S +mla v10.4S, v3.4S, v31.s[0] +sub 
v3.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v16.4S, v20.4S +ldr q7, [x17, #+1600] +ldr q13, [x17, #+1616] +mul v16.4S, v16.4S,v5.4S +mla v29.4S, v21.4S, v31.s[0] +sub v21.4s, v22.4s, v10.4s +add v22.4s, v22.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v15.4S +ldr q11, [x17, #+1632] +ldr q0, [x17, #+1648] +mul v3.4S, v3.4S,v4.4S +mla v16.4S, v17.4S, v31.s[0] +sub v17.4s, v12.4s, v29.4s +add v12.4s, v12.4s, v29.4s +sqrdmulh v29.4S, v12.4S, v13.4S +ldr q18, [x0, #288] +mul v12.4S, v12.4S,v7.4S +mla v3.4S, v10.4S, v31.s[0] +sub v10.4s, v14.4s, v16.4s +add v14.4s, v14.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v0.4S +ldr q26, [x0, #304] +mul v17.4S, v17.4S,v11.4S +mla v12.4S, v29.4S, v31.s[0] +sub v29.4s, v2.4s, v3.4s +add v2.4s, v2.4s, v3.4s +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sub v12.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +str q14, [x0, #192] +str q10, [x0, #208] +str q2, [x0, #224] +str q29, [x0, #240] +str q22, [x0, #704] +str q16, [x0, #720] +str q21, [x0, #736] +str q12, [x0, #752] +ldr q0, [x17, #+640] +ldr q11, [x17, #+656] +ldr q13, [x0, #256] +ldr q7, [x0, #272] +ldr q19, [x17, #+1664] +ldr q6, [x17, #+1680] +sqrdmulh v8.4S, v18.4S, v11.s[0] +ldr q1, [x0, #800] +mul v18.4S, v18.4S,v0.s[0] +ldr q12, [x0, #816] +sqrdmulh v21.4S, v26.4S, v11.s[0] +ldr q16, [x0, #768] +mul v26.4S, v26.4S,v0.s[0] +ldr q22, [x0, #784] +mla v18.4S, v8.4S, v31.s[0] +sqrdmulh v8.4S, v1.4S, v6.s[0] +mul v1.4S, v1.4S,v19.s[0] +mla v26.4S, v21.4S, v31.s[0] +sub v21.4s, v13.4s, v18.4s +add v13.4s, v13.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v6.s[0] +mul v12.4S, v12.4S,v19.s[0] +mla v1.4S, v8.4S, v31.s[0] +sub v8.4s, v7.4s, v26.4s +add v7.4s, v7.4s, v26.4s +sqrdmulh v26.4S, v7.4S, v11.s[1] +mul v7.4S, v7.4S,v0.s[1] +mla v12.4S, v18.4S, v31.s[0] +sub v18.4s, v16.4s, v1.4s +add v16.4s, v16.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v11.s[2] +mul v8.4S, v8.4S,v0.s[2] +mla v7.4S, v26.4S, v31.s[0] +sub v26.4s, v22.4s, v12.4s +add v22.4s, 
v22.4s, v12.4s +sqrdmulh v12.4S, v22.4S, v6.s[1] +mul v22.4S, v22.4S,v19.s[1] +mla v8.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +sqrdmulh v7.4S, v26.4S, v6.s[2] +mul v26.4S, v26.4S,v19.s[2] +mla v22.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v8.4s +add v21.4s, v21.4s, v8.4s +trn1 v8.4S, v13.4S, v1.4S +trn2 v15.4S, v13.4S, v1.4S +trn1 v4.4S, v21.4S, v12.4S +trn2 v20.4S, v21.4S, v12.4S +trn2 v21.2D, v8.2D, v4.2D +trn2 v12.2D, v15.2D, v20.2D +trn1 v13.2D, v8.2D, v4.2D +trn1 v1.2D, v15.2D, v20.2D +ldr q20, [x17, #+672] +ldr q15, [x17, #+688] +sqrdmulh v4.4S, v21.4S, v15.4S +mul v21.4S, v21.4S,v20.4S +mla v26.4S, v7.4S, v31.s[0] +sub v7.4s, v16.4s, v22.4s +add v16.4s, v16.4s, v22.4s +sqrdmulh v22.4S, v12.4S, v15.4S +mul v12.4S, v12.4S,v20.4S +mla v21.4S, v4.4S, v31.s[0] +sub v4.4s, v18.4s, v26.4s +add v18.4s, v18.4s, v26.4s +trn1 v26.4S, v16.4S, v7.4S +trn2 v8.4S, v16.4S, v7.4S +trn1 v5.4S, v18.4S, v4.4S +trn2 v28.4S, v18.4S, v4.4S +trn2 v18.2D, v26.2D, v5.2D +trn2 v4.2D, v8.2D, v28.2D +trn1 v16.2D, v26.2D, v5.2D +trn1 v7.2D, v8.2D, v28.2D +ldr q28, [x17, #+1696] +ldr q8, [x17, #+1712] +sqrdmulh v5.4S, v18.4S, v8.4S +ldr q26, [x17, #+704] +ldr q9, [x17, #+720] +mul v18.4S, v18.4S,v28.4S +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v4.4S, v8.4S +ldr q27, [x17, #+736] +ldr q30, [x17, #+752] +mul v4.4S, v4.4S,v28.4S +mla v18.4S, v5.4S, v31.s[0] +sub v5.4s, v1.4s, v12.4s +add v1.4s, v1.4s, v12.4s +sqrdmulh v12.4S, v1.4S, v9.4S +ldr q29, [x17, #+1728] +ldr q2, [x17, #+1744] +mul v1.4S, v1.4S,v26.4S +mla v4.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v18.4s +add v16.4s, v16.4s, v18.4s +sqrdmulh v18.4S, v5.4S, v30.4S +ldr q10, [x17, #+1760] +ldr q14, [x17, #+1776] +mul v5.4S, v5.4S,v27.4S +mla v1.4S, v12.4S, v31.s[0] +sub v12.4s, v7.4s, v4.4s +add v7.4s, v7.4s, v4.4s +sqrdmulh v4.4S, v7.4S, v2.4S +ldr q17, [x0, #352] +mul v7.4S, v7.4S,v29.4S +mla v5.4S, v18.4S, v31.s[0] +sub v18.4s, v13.4s, 
v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v14.4S +ldr q3, [x0, #368] +mul v12.4S, v12.4S,v10.4S +mla v7.4S, v4.4S, v31.s[0] +sub v4.4s, v22.4s, v5.4s +add v22.4s, v22.4s, v5.4s +mla v12.4S, v1.4S, v31.s[0] +sub v1.4s, v16.4s, v7.4s +add v16.4s, v16.4s, v7.4s +sub v7.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +str q13, [x0, #256] +str q18, [x0, #272] +str q22, [x0, #288] +str q4, [x0, #304] +str q16, [x0, #768] +str q1, [x0, #784] +str q21, [x0, #800] +str q7, [x0, #816] +ldr q14, [x17, #+768] +ldr q10, [x17, #+784] +ldr q2, [x0, #320] +ldr q29, [x0, #336] +ldr q8, [x17, #+1792] +ldr q28, [x17, #+1808] +sqrdmulh v6.4S, v17.4S, v10.s[0] +ldr q19, [x0, #864] +mul v17.4S, v17.4S,v14.s[0] +ldr q7, [x0, #880] +sqrdmulh v21.4S, v3.4S, v10.s[0] +ldr q1, [x0, #832] +mul v3.4S, v3.4S,v14.s[0] +ldr q16, [x0, #848] +mla v17.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v19.4S, v28.s[0] +mul v19.4S, v19.4S,v8.s[0] +mla v3.4S, v21.4S, v31.s[0] +sub v21.4s, v2.4s, v17.4s +add v2.4s, v2.4s, v17.4s +sqrdmulh v17.4S, v7.4S, v28.s[0] +mul v7.4S, v7.4S,v8.s[0] +mla v19.4S, v6.4S, v31.s[0] +sub v6.4s, v29.4s, v3.4s +add v29.4s, v29.4s, v3.4s +sqrdmulh v3.4S, v29.4S, v10.s[1] +mul v29.4S, v29.4S,v14.s[1] +mla v7.4S, v17.4S, v31.s[0] +sub v17.4s, v1.4s, v19.4s +add v1.4s, v1.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v10.s[2] +mul v6.4S, v6.4S,v14.s[2] +mla v29.4S, v3.4S, v31.s[0] +sub v3.4s, v16.4s, v7.4s +add v16.4s, v16.4s, v7.4s +sqrdmulh v7.4S, v16.4S, v28.s[1] +mul v16.4S, v16.4S,v8.s[1] +mla v6.4S, v19.4S, v31.s[0] +sub v19.4s, v2.4s, v29.4s +add v2.4s, v2.4s, v29.4s +sqrdmulh v29.4S, v3.4S, v28.s[2] +mul v3.4S, v3.4S,v8.s[2] +mla v16.4S, v7.4S, v31.s[0] +sub v7.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +trn1 v6.4S, v2.4S, v19.4S +trn2 v30.4S, v2.4S, v19.4S +trn1 v27.4S, v21.4S, v7.4S +trn2 v9.4S, v21.4S, v7.4S +trn2 v21.2D, v6.2D, v27.2D +trn2 v7.2D, v30.2D, v9.2D +trn1 v2.2D, v6.2D, v27.2D +trn1 v19.2D, v30.2D, v9.2D +ldr q9, [x17, #+800] +ldr q30, [x17, #+816] +sqrdmulh 
v27.4S, v21.4S, v30.4S +mul v21.4S, v21.4S,v9.4S +mla v3.4S, v29.4S, v31.s[0] +sub v29.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +sqrdmulh v16.4S, v7.4S, v30.4S +mul v7.4S, v7.4S,v9.4S +mla v21.4S, v27.4S, v31.s[0] +sub v27.4s, v17.4s, v3.4s +add v17.4s, v17.4s, v3.4s +trn1 v3.4S, v1.4S, v29.4S +trn2 v6.4S, v1.4S, v29.4S +trn1 v26.4S, v17.4S, v27.4S +trn2 v15.4S, v17.4S, v27.4S +trn2 v17.2D, v3.2D, v26.2D +trn2 v27.2D, v6.2D, v15.2D +trn1 v1.2D, v3.2D, v26.2D +trn1 v29.2D, v6.2D, v15.2D +ldr q15, [x17, #+1824] +ldr q6, [x17, #+1840] +sqrdmulh v26.4S, v17.4S, v6.4S +ldr q3, [x17, #+832] +ldr q20, [x17, #+848] +mul v17.4S, v17.4S,v15.4S +mla v7.4S, v16.4S, v31.s[0] +sub v16.4s, v2.4s, v21.4s +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v27.4S, v6.4S +ldr q11, [x17, #+864] +ldr q0, [x17, #+880] +mul v27.4S, v27.4S,v15.4S +mla v17.4S, v26.4S, v31.s[0] +sub v26.4s, v19.4s, v7.4s +add v19.4s, v19.4s, v7.4s +sqrdmulh v7.4S, v19.4S, v20.4S +ldr q4, [x17, #+1856] +ldr q22, [x17, #+1872] +mul v19.4S, v19.4S,v3.4S +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v1.4s, v17.4s +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v26.4S, v0.4S +ldr q18, [x17, #+1888] +ldr q13, [x17, #+1904] +mul v26.4S, v26.4S,v11.4S +mla v19.4S, v7.4S, v31.s[0] +sub v7.4s, v29.4s, v27.4s +add v29.4s, v29.4s, v27.4s +sqrdmulh v27.4S, v29.4S, v22.4S +ldr q12, [x0, #416] +mul v29.4S, v29.4S,v4.4S +mla v26.4S, v17.4S, v31.s[0] +sub v17.4s, v2.4s, v19.4s +add v2.4s, v2.4s, v19.4s +sqrdmulh v19.4S, v7.4S, v13.4S +ldr q5, [x0, #432] +mul v7.4S, v7.4S,v18.4S +mla v29.4S, v27.4S, v31.s[0] +sub v27.4s, v16.4s, v26.4s +add v16.4s, v16.4s, v26.4s +mla v7.4S, v19.4S, v31.s[0] +sub v19.4s, v1.4s, v29.4s +add v1.4s, v1.4s, v29.4s +sub v29.4s, v21.4s, v7.4s +add v21.4s, v21.4s, v7.4s +str q2, [x0, #320] +str q17, [x0, #336] +str q16, [x0, #352] +str q27, [x0, #368] +str q1, [x0, #832] +str q19, [x0, #848] +str q21, [x0, #864] +str q29, [x0, #880] +ldr q13, [x17, #+896] +ldr q18, [x17, #+912] +ldr q22, [x0, #384] +ldr q4, 
[x0, #400] +ldr q6, [x17, #+1920] +ldr q15, [x17, #+1936] +sqrdmulh v28.4S, v12.4S, v18.s[0] +ldr q8, [x0, #928] +mul v12.4S, v12.4S,v13.s[0] +ldr q29, [x0, #944] +sqrdmulh v21.4S, v5.4S, v18.s[0] +ldr q19, [x0, #896] +mul v5.4S, v5.4S,v13.s[0] +ldr q1, [x0, #912] +mla v12.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v8.4S, v15.s[0] +mul v8.4S, v8.4S,v6.s[0] +mla v5.4S, v21.4S, v31.s[0] +sub v21.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v29.4S, v15.s[0] +mul v29.4S, v29.4S,v6.s[0] +mla v8.4S, v28.4S, v31.s[0] +sub v28.4s, v4.4s, v5.4s +add v4.4s, v4.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v18.s[1] +mul v4.4S, v4.4S,v13.s[1] +mla v29.4S, v12.4S, v31.s[0] +sub v12.4s, v19.4s, v8.4s +add v19.4s, v19.4s, v8.4s +sqrdmulh v8.4S, v28.4S, v18.s[2] +mul v28.4S, v28.4S,v13.s[2] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v1.4s, v29.4s +add v1.4s, v1.4s, v29.4s +sqrdmulh v29.4S, v1.4S, v15.s[1] +mul v1.4S, v1.4S,v6.s[1] +mla v28.4S, v8.4S, v31.s[0] +sub v8.4s, v22.4s, v4.4s +add v22.4s, v22.4s, v4.4s +sqrdmulh v4.4S, v5.4S, v15.s[2] +mul v5.4S, v5.4S,v6.s[2] +mla v1.4S, v29.4S, v31.s[0] +sub v29.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +trn1 v28.4S, v22.4S, v8.4S +trn2 v0.4S, v22.4S, v8.4S +trn1 v11.4S, v21.4S, v29.4S +trn2 v20.4S, v21.4S, v29.4S +trn2 v21.2D, v28.2D, v11.2D +trn2 v29.2D, v0.2D, v20.2D +trn1 v22.2D, v28.2D, v11.2D +trn1 v8.2D, v0.2D, v20.2D +ldr q20, [x17, #+928] +ldr q0, [x17, #+944] +sqrdmulh v11.4S, v21.4S, v0.4S +mul v21.4S, v21.4S,v20.4S +mla v5.4S, v4.4S, v31.s[0] +sub v4.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v29.4S, v0.4S +mul v29.4S, v29.4S,v20.4S +mla v21.4S, v11.4S, v31.s[0] +sub v11.4s, v12.4s, v5.4s +add v12.4s, v12.4s, v5.4s +trn1 v5.4S, v19.4S, v4.4S +trn2 v28.4S, v19.4S, v4.4S +trn1 v3.4S, v12.4S, v11.4S +trn2 v30.4S, v12.4S, v11.4S +trn2 v12.2D, v5.2D, v3.2D +trn2 v11.2D, v28.2D, v30.2D +trn1 v19.2D, v5.2D, v3.2D +trn1 v4.2D, v28.2D, v30.2D +ldr q30, [x17, #+1952] +ldr q28, [x17, #+1968] +sqrdmulh v3.4S, 
v12.4S, v28.4S +ldr q5, [x17, #+960] +ldr q9, [x17, #+976] +mul v12.4S, v12.4S,v30.4S +mla v29.4S, v1.4S, v31.s[0] +sub v1.4s, v22.4s, v21.4s +add v22.4s, v22.4s, v21.4s +sqrdmulh v21.4S, v11.4S, v28.4S +ldr q10, [x17, #+992] +ldr q14, [x17, #+1008] +mul v11.4S, v11.4S,v30.4S +mla v12.4S, v3.4S, v31.s[0] +sub v3.4s, v8.4s, v29.4s +add v8.4s, v8.4s, v29.4s +sqrdmulh v29.4S, v8.4S, v9.4S +ldr q27, [x17, #+1984] +ldr q16, [x17, #+2000] +mul v8.4S, v8.4S,v5.4S +mla v11.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v12.4s +add v19.4s, v19.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v14.4S +ldr q17, [x17, #+2016] +ldr q2, [x17, #+2032] +mul v3.4S, v3.4S,v10.4S +mla v8.4S, v29.4S, v31.s[0] +sub v29.4s, v4.4s, v11.4s +add v4.4s, v4.4s, v11.4s +sqrdmulh v11.4S, v4.4S, v16.4S +ldr q7, [x0, #480] +mul v4.4S, v4.4S,v27.4S +mla v3.4S, v12.4S, v31.s[0] +sub v12.4s, v22.4s, v8.4s +add v22.4s, v22.4s, v8.4s +sqrdmulh v8.4S, v29.4S, v2.4S +ldr q26, [x0, #496] +mul v29.4S, v29.4S,v17.4S +mla v4.4S, v11.4S, v31.s[0] +sub v11.4s, v1.4s, v3.4s +add v1.4s, v1.4s, v3.4s +mla v29.4S, v8.4S, v31.s[0] +sub v8.4s, v19.4s, v4.4s +add v19.4s, v19.4s, v4.4s +sub v4.4s, v21.4s, v29.4s +add v21.4s, v21.4s, v29.4s +str q22, [x0, #384] +str q12, [x0, #400] +str q1, [x0, #416] +str q11, [x0, #432] +str q19, [x0, #896] +str q8, [x0, #912] +str q21, [x0, #928] +str q4, [x0, #944] +ldr q2, [x17, #+1024] +ldr q17, [x17, #+1040] +ldr q16, [x0, #448] +ldr q27, [x0, #464] +ldr q28, [x17, #+2048] +ldr q30, [x17, #+2064] +sqrdmulh v15.4S, v7.4S, v17.s[0] +ldr q6, [x0, #992] +mul v7.4S, v7.4S,v2.s[0] +ldr q4, [x0, #1008] +sqrdmulh v21.4S, v26.4S, v17.s[0] +ldr q8, [x0, #960] +mul v26.4S, v26.4S,v2.s[0] +ldr q19, [x0, #976] +mla v7.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v6.4S, v30.s[0] +mul v6.4S, v6.4S,v28.s[0] +mla v26.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v7.4s +add v16.4s, v16.4s, v7.4s +sqrdmulh v7.4S, v4.4S, v30.s[0] +mul v4.4S, v4.4S,v28.s[0] +mla v6.4S, v15.4S, v31.s[0] +sub v15.4s, v27.4s, v26.4s +add v27.4s, 
v27.4s, v26.4s +sqrdmulh v26.4S, v27.4S, v17.s[1] +mul v27.4S, v27.4S,v2.s[1] +mla v4.4S, v7.4S, v31.s[0] +sub v7.4s, v8.4s, v6.4s +add v8.4s, v8.4s, v6.4s +sqrdmulh v6.4S, v15.4S, v17.s[2] +mul v15.4S, v15.4S,v2.s[2] +mla v27.4S, v26.4S, v31.s[0] +sub v26.4s, v19.4s, v4.4s +add v19.4s, v19.4s, v4.4s +sqrdmulh v4.4S, v19.4S, v30.s[1] +mul v19.4S, v19.4S,v28.s[1] +mla v15.4S, v6.4S, v31.s[0] +sub v6.4s, v16.4s, v27.4s +add v16.4s, v16.4s, v27.4s +sqrdmulh v27.4S, v26.4S, v30.s[2] +mul v26.4S, v26.4S,v28.s[2] +mla v19.4S, v4.4S, v31.s[0] +sub v4.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +trn1 v15.4S, v16.4S, v6.4S +trn2 v14.4S, v16.4S, v6.4S +trn1 v10.4S, v21.4S, v4.4S +trn2 v9.4S, v21.4S, v4.4S +trn2 v21.2D, v15.2D, v10.2D +trn2 v4.2D, v14.2D, v9.2D +trn1 v16.2D, v15.2D, v10.2D +trn1 v6.2D, v14.2D, v9.2D +ldr q9, [x17, #+1056] +ldr q14, [x17, #+1072] +sqrdmulh v10.4S, v21.4S, v14.4S +mul v21.4S, v21.4S,v9.4S +mla v26.4S, v27.4S, v31.s[0] +sub v27.4s, v8.4s, v19.4s +add v8.4s, v8.4s, v19.4s +sqrdmulh v19.4S, v4.4S, v14.4S +mul v4.4S, v4.4S,v9.4S +mla v21.4S, v10.4S, v31.s[0] +sub v10.4s, v7.4s, v26.4s +add v7.4s, v7.4s, v26.4s +trn1 v26.4S, v8.4S, v27.4S +trn2 v15.4S, v8.4S, v27.4S +trn1 v5.4S, v7.4S, v10.4S +trn2 v0.4S, v7.4S, v10.4S +trn2 v7.2D, v26.2D, v5.2D +trn2 v10.2D, v15.2D, v0.2D +trn1 v8.2D, v26.2D, v5.2D +trn1 v27.2D, v15.2D, v0.2D +ldr q0, [x17, #+2080] +ldr q15, [x17, #+2096] +sqrdmulh v5.4S, v7.4S, v15.4S +ldr q26, [x17, #+1088] +ldr q20, [x17, #+1104] +mul v7.4S, v7.4S,v0.4S +mla v4.4S, v19.4S, v31.s[0] +sub v19.4s, v16.4s, v21.4s +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v10.4S, v15.4S +ldr q18, [x17, #+1120] +ldr q13, [x17, #+1136] +mul v10.4S, v10.4S,v0.4S +mla v7.4S, v5.4S, v31.s[0] +sub v5.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +sqrdmulh v4.4S, v6.4S, v20.4S +ldr q11, [x17, #+2112] +ldr q1, [x17, #+2128] +mul v6.4S, v6.4S,v26.4S +mla v10.4S, v21.4S, v31.s[0] +sub v21.4s, v8.4s, v7.4s +add v8.4s, v8.4s, v7.4s +sqrdmulh v7.4S, v5.4S, 
v13.4S +ldr q12, [x17, #+2144] +ldr q22, [x17, #+2160] +mul v5.4S, v5.4S,v18.4S +mla v6.4S, v4.4S, v31.s[0] +sub v4.4s, v27.4s, v10.4s +add v27.4s, v27.4s, v10.4s +sqrdmulh v10.4S, v27.4S, v1.4S +mul v27.4S, v27.4S,v11.4S +mla v5.4S, v7.4S, v31.s[0] +sub v7.4s, v16.4s, v6.4s +add v16.4s, v16.4s, v6.4s +sqrdmulh v6.4S, v4.4S, v22.4S +mul v4.4S, v4.4S,v12.4S +mla v27.4S, v10.4S, v31.s[0] +sub v10.4s, v19.4s, v5.4s +add v19.4s, v19.4s, v5.4s +mla v4.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v27.4s +add v8.4s, v8.4s, v27.4s +sub v27.4s, v21.4s, v4.4s +add v21.4s, v21.4s, v4.4s +str q16, [x0, #448] +str q7, [x0, #464] +str q19, [x0, #480] +str q10, [x0, #496] +str q8, [x0, #960] +str q6, [x0, #976] +str q21, [x0, #992] +str q27, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z4_0.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z4_0.s new file mode 100644 index 0000000..7675702 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z4_0.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, 
subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 
+.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, 
block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 
6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 
+.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // 
Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 
35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 
4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 +.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, 
block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 
+.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 
8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_3_z4_0 +.global _ntt_u32_full_neon_asm_var_4_4_3_z4_0 +ntt_u32_full_neon_asm_var_4_4_3_z4_0: +_ntt_u32_full_neon_asm_var_4_4_3_z4_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, 
#+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +sqrdmulh v2.4S, v22.4S, v29.s[0] +ldr q1, [x0, #544] +mul v22.4S, v22.4S,v30.s[0] +ldr q0, [x0, #608] +sqrdmulh v15.4S, v21.4S, v29.s[0] +ldr q14, [x0, #672] +mul v21.4S, v21.4S,v30.s[0] +ldr q13, [x0, #736] +mla v22.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q12, [x0, #32] +sub v11.4s, v18.4s, v22.4s +mla v21.4S, v15.4S, v31.s[0] +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q15, [x0, #96] +sub v10.4s, v17.4s, v21.4s +mla v20.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v29.s[0] +ldr q2, [x0, #160] +mul v1.4S, v1.4S,v30.s[0] +sub v9.4s, v16.4s, v20.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v29.s[0] +ldr q22, [x0, #224] +mul v0.4S, v0.4S,v30.s[0] +sub v8.4s, v3.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v12.4s, v1.4s +mla v0.4S, v20.4S, v31.s[0] +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +sub v20.4s, v15.4s, v0.4s +mla v14.4S, v19.4S, v31.s[0] +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v19.4s, v2.4s, v14.4s +mla v13.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v1.4s, v22.4s, v13.4s +mla v16.4S, v0.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v0.4s, v2.4s, v16.4s +mla v3.4S, v14.4S, v31.s[0] +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v22.4s, v3.4s +mla v18.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul 
v9.4S, v9.4S,v30.s[2] +sub v13.4s, v12.4s, v18.4s +mla v17.4S, v16.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v16.4s, v15.4s, v17.4s +mla v9.4S, v3.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v3.4s, v19.4s, v9.4s +mla v8.4S, v18.4S, v31.s[0] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v8.4s +mla v11.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v17.4s, v21.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +sub v9.4s, v20.4s, v10.4s +mla v2.4S, v8.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v8.4s, v12.4s, v2.4s +mla v22.4S, v11.4S, v31.s[0] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v27.s[1] +mul v14.4S, v14.4S,v28.s[1] +sub v11.4s, v15.4s, v22.4s +mla v0.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v28.s[2] +sub v10.4s, v13.4s, v0.4s +mla v14.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v2.4s, v16.4s, v14.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +sub v22.4s, v21.4s, v19.4s +mla v1.4S, v0.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v0.4s, v20.4s, v1.4s +mla v3.4S, v14.4S, v31.s[0] +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +sub v14.4s, v17.4s, v3.4s +mla v18.4S, v19.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v25.s[1] +mul v11.4S, v11.4S,v26.s[1] +sub v19.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, 
v18.4s +sqrdmulh v18.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v1.4s, v12.4s, v15.4s +mla v11.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v25.s[3] +mul v2.4S, v2.4S,v26.s[3] +sub v3.4s, v8.4s, v11.4s +mla v16.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v11.4s +str q12, [x0, #32] +sqrdmulh v12.4S, v20.4S, v23.s[0] +str q1, [x0, #96] +mul v20.4S, v20.4S,v24.s[0] +ldr q1, [x0, #816] +sub v11.4s, v13.4s, v16.4s +ldr q18, [x0, #880] +mla v2.4S, v15.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +str q8, [x0, #160] +sqrdmulh v8.4S, v0.4S, v23.s[1] +str q3, [x0, #224] +mul v0.4S, v0.4S,v24.s[1] +ldr q3, [x0, #944] +sub v16.4s, v10.4s, v2.4s +ldr q15, [x0, #1008] +mla v20.4S, v12.4S, v31.s[0] +add v10.4s, v10.4s, v2.4s +str q13, [x0, #288] +sqrdmulh v13.4S, v9.4S, v23.s[2] +str q11, [x0, #352] +mul v9.4S, v9.4S,v24.s[2] +ldr q11, [x0, #304] +sub v2.4s, v21.4s, v20.4s +ldr q12, [x0, #368] +mla v0.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v20.4s +str q10, [x0, #416] +sqrdmulh v10.4S, v19.4S, v23.s[3] +str q16, [x0, #480] +mul v19.4S, v19.4S,v24.s[3] +ldr q16, [x0, #432] +sub v20.4s, v22.4s, v0.4s +ldr q8, [x0, #496] +mla v9.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +str q21, [x0, #544] +sqrdmulh v21.4S, v1.4S, v29.s[0] +str q2, [x0, #608] +ldr q2, [x0, #560] +mul v1.4S, v1.4S,v30.s[0] +ldr q0, [x0, #624] +sub v13.4s, v17.4s, v9.4s +mla v19.4S, v10.4S, v31.s[0] +add v17.4s, v17.4s, v9.4s +str q22, [x0, #672] +sqrdmulh v22.4S, v18.4S, v29.s[0] +str q20, [x0, #736] +ldr q20, [x0, #688] +mul v18.4S, v18.4S,v30.s[0] +ldr q9, [x0, #752] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +str q17, [x0, #800] +sqrdmulh v17.4S, v3.4S, v29.s[0] +str q13, [x0, #864] +mul v3.4S, v3.4S,v30.s[0] +ldr q13, [x0, #48] +sub v19.4s, v11.4s, v1.4s +mla v18.4S, v22.4S, v31.s[0] +add v11.4s, v11.4s, v1.4s +str q14, [x0, #928] +sqrdmulh v14.4S, v15.4S, v29.s[0] +str q10, [x0, #992] +mul v15.4S, v15.4S,v30.s[0] +ldr q10, [x0, 
#112] +sub v1.4s, v12.4s, v18.4s +mla v3.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v29.s[0] +ldr q17, [x0, #176] +mul v2.4S, v2.4S,v30.s[0] +sub v22.4s, v16.4s, v3.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v29.s[0] +ldr q14, [x0, #240] +mul v0.4S, v0.4S,v30.s[0] +sub v21.4s, v8.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v18.4s, v13.4s, v2.4s +mla v0.4S, v3.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +sub v3.4s, v10.4s, v0.4s +mla v20.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v15.4s, v17.4s, v20.4s +mla v9.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v2.4s, v14.4s, v9.4s +mla v16.4S, v0.4S, v31.s[0] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v29.s[1] +mul v11.4S, v11.4S,v30.s[1] +sub v0.4s, v17.4s, v16.4s +mla v8.4S, v20.4S, v31.s[0] +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v29.s[1] +mul v12.4S, v12.4S,v30.s[1] +sub v20.4s, v14.4s, v8.4s +mla v11.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v9.4s, v13.4s, v11.4s +mla v12.4S, v16.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v16.4s, v10.4s, v12.4s +mla v22.4S, v8.4S, v31.s[0] +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v8.4s, v15.4s, v22.4s +mla v21.4S, v11.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +sub v11.4s, v2.4s, v21.4s +mla v19.4S, v12.4S, v31.s[0] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +sub v12.4s, v18.4s, v19.4s +mla v1.4S, v22.4S, 
v31.s[0] +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v22.4s, v3.4s, v1.4s +mla v17.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v21.4s, v13.4s, v17.4s +mla v14.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +sub v19.4s, v10.4s, v14.4s +mla v0.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v27.s[2] +mul v15.4S, v15.4S,v28.s[2] +sub v1.4s, v9.4s, v0.4s +mla v20.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v17.4s, v16.4s, v20.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v14.4s, v18.4s, v15.4s +mla v2.4S, v0.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[3] +mul v11.4S, v11.4S,v28.s[3] +sub v0.4s, v3.4s, v2.4s +mla v8.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +sub v20.4s, v12.4s, v8.4s +mla v11.4S, v15.4S, v31.s[0] +add v12.4s, v12.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v15.4s, v22.4s, v11.4s +mla v10.4S, v2.4S, v31.s[0] +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v2.4s, v13.4s, v10.4s +mla v19.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v25.s[3] +mul v17.4S, v17.4S,v26.s[3] +sub v8.4s, v21.4s, v19.4s +mla v16.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +str q13, [x0, #48] +sqrdmulh v13.4S, v3.4S, v23.s[0] +str q2, [x0, #112] +mul v3.4S, v3.4S,v24.s[0] +ldr q2, [x0, #768] +sub v19.4s, v9.4s, v16.4s +ldr q11, [x0, #832] +mla v17.4S, v10.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +str q21, [x0, #176] +sqrdmulh v21.4S, v0.4S, v23.s[1] +str q8, [x0, #240] +mul v0.4S, v0.4S,v24.s[1] +ldr q8, [x0, #896] +sub 
v16.4s, v1.4s, v17.4s +ldr q10, [x0, #960] +mla v3.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +str q9, [x0, #304] +sqrdmulh v9.4S, v22.4S, v23.s[2] +str q19, [x0, #368] +mul v22.4S, v22.4S,v24.s[2] +ldr q19, [x0, #256] +sub v17.4s, v18.4s, v3.4s +ldr q13, [x0, #320] +mla v0.4S, v21.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +str q1, [x0, #432] +sqrdmulh v1.4S, v15.4S, v23.s[3] +str q16, [x0, #496] +mul v15.4S, v15.4S,v24.s[3] +ldr q16, [x0, #384] +sub v3.4s, v14.4s, v0.4s +ldr q21, [x0, #448] +mla v22.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +str q18, [x0, #560] +sqrdmulh v18.4S, v2.4S, v29.s[0] +str q17, [x0, #624] +ldr q17, [x0, #512] +mul v2.4S, v2.4S,v30.s[0] +ldr q0, [x0, #576] +sub v9.4s, v12.4s, v22.4s +mla v15.4S, v1.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s +str q14, [x0, #688] +sqrdmulh v14.4S, v11.4S, v29.s[0] +str q3, [x0, #752] +ldr q3, [x0, #640] +mul v11.4S, v11.4S,v30.s[0] +ldr q22, [x0, #704] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +str q12, [x0, #816] +sqrdmulh v12.4S, v8.4S, v29.s[0] +str q9, [x0, #880] +mul v8.4S, v8.4S,v30.s[0] +ldr q9, [x0, #0] +sub v15.4s, v19.4s, v2.4s +mla v11.4S, v14.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +str q20, [x0, #944] +sqrdmulh v20.4S, v10.4S, v29.s[0] +str q1, [x0, #1008] +mul v10.4S, v10.4S,v30.s[0] +ldr q1, [x0, #64] +sub v2.4s, v13.4s, v11.4s +mla v8.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v29.s[0] +ldr q12, [x0, #128] +mul v17.4S, v17.4S,v30.s[0] +sub v14.4s, v16.4s, v8.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v29.s[0] +ldr q20, [x0, #192] +mul v0.4S, v0.4S,v30.s[0] +sub v18.4s, v21.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +sub v11.4s, v9.4s, v17.4s +mla v0.4S, v8.4S, v31.s[0] +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v8.4s, v1.4s, v0.4s +mla 
v3.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v10.4s, v12.4s, v3.4s +mla v22.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v17.4s, v20.4s, v22.4s +mla v16.4S, v0.4S, v31.s[0] +add v20.4s, v20.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +sub v0.4s, v12.4s, v16.4s +mla v21.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v3.4s, v20.4s, v21.4s +mla v19.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v22.4s, v9.4s, v19.4s +mla v13.4S, v16.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v16.4s, v1.4s, v13.4s +mla v14.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v21.4s, v10.4s, v14.4s +mla v18.4S, v19.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v19.4s, v17.4s, v18.4s +mla v15.4S, v13.4S, v31.s[0] +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v13.4s, v11.4s, v15.4s +mla v2.4S, v14.4S, v31.s[0] +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v27.s[0] +mul v20.4S, v20.4S,v28.s[0] +sub v14.4s, v8.4s, v2.4s +mla v12.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v18.4s, v9.4s, v12.4s +mla v20.4S, v15.4S, v31.s[0] +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +sub v15.4s, v1.4s, v20.4s +mla v0.4S, v2.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +sub v2.4s, v22.4s, v0.4s +mla v3.4S, v12.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[2] +mul 
v17.4S, v17.4S,v28.s[2] +sub v12.4s, v16.4s, v3.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v11.4s, v10.4s +mla v17.4S, v0.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v27.s[3] +mul v19.4S, v19.4S,v28.s[3] +sub v0.4s, v8.4s, v17.4s +mla v21.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v17.4s +sqrdmulh v17.4S, v1.4S, v25.s[0] +mul v1.4S, v1.4S,v26.s[0] +sub v3.4s, v13.4s, v21.4s +mla v19.4S, v10.4S, v31.s[0] +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v25.s[1] +mul v15.4S, v15.4S,v26.s[1] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v17.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v17.4s, v9.4s, v1.4s +mla v15.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +sub v21.4s, v18.4s, v15.4s +mla v16.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +str q9, [x0, #0] +sqrdmulh v9.4S, v8.4S, v23.s[0] +str q17, [x0, #64] +mul v8.4S, v8.4S,v24.s[0] +ldr q17, [x0, #784] +sub v15.4s, v22.4s, v16.4s +ldr q19, [x0, #848] +mla v12.4S, v1.4S, v31.s[0] +add v22.4s, v22.4s, v16.4s +str q18, [x0, #128] +sqrdmulh v18.4S, v0.4S, v23.s[1] +str q21, [x0, #192] +mul v0.4S, v0.4S,v24.s[1] +ldr q21, [x0, #912] +sub v16.4s, v2.4s, v12.4s +ldr q1, [x0, #976] +mla v8.4S, v9.4S, v31.s[0] +add v2.4s, v2.4s, v12.4s +str q22, [x0, #256] +sqrdmulh v22.4S, v14.4S, v23.s[2] +str q15, [x0, #320] +mul v14.4S, v14.4S,v24.s[2] +ldr q15, [x0, #272] +sub v12.4s, v11.4s, v8.4s +ldr q9, [x0, #336] +mla v0.4S, v18.4S, v31.s[0] +add v11.4s, v11.4s, v8.4s +str q2, [x0, #384] +sqrdmulh v2.4S, v10.4S, v23.s[3] +str q16, [x0, #448] +mul v10.4S, v10.4S,v24.s[3] +ldr q16, [x0, #400] +sub v8.4s, v20.4s, v0.4s +ldr q18, [x0, #464] +mla v14.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v0.4s +str q11, [x0, #512] +sqrdmulh v11.4S, v17.4S, v29.s[0] +str q12, [x0, #576] +ldr q12, [x0, #528] 
+mul v17.4S, v17.4S,v30.s[0] +ldr q0, [x0, #592] +sub v22.4s, v13.4s, v14.4s +mla v10.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v14.4s +str q20, [x0, #640] +sqrdmulh v20.4S, v19.4S, v29.s[0] +str q8, [x0, #704] +ldr q8, [x0, #656] +mul v19.4S, v19.4S,v30.s[0] +ldr q14, [x0, #720] +sub v2.4s, v3.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v3.4s, v3.4s, v10.4s +str q13, [x0, #768] +sqrdmulh v13.4S, v21.4S, v29.s[0] +str q22, [x0, #832] +mul v21.4S, v21.4S,v30.s[0] +ldr q22, [x0, #16] +sub v10.4s, v15.4s, v17.4s +mla v19.4S, v20.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +str q3, [x0, #896] +sqrdmulh v3.4S, v1.4S, v29.s[0] +str q2, [x0, #960] +mul v1.4S, v1.4S,v30.s[0] +ldr q2, [x0, #80] +sub v17.4s, v9.4s, v19.4s +mla v21.4S, v13.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v12.4S, v29.s[0] +ldr q13, [x0, #144] +mul v12.4S, v12.4S,v30.s[0] +sub v20.4s, v16.4s, v21.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v29.s[0] +ldr q3, [x0, #208] +mul v0.4S, v0.4S,v30.s[0] +sub v11.4s, v18.4s, v1.4s +mla v12.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v19.4s, v22.4s, v12.4s +mla v0.4S, v21.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v2.4s, v0.4s +mla v8.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v1.4s, v13.4s, v8.4s +mla v14.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v12.4s, v3.4s, v14.4s +mla v16.4S, v0.4S, v31.s[0] +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +sub v0.4s, v13.4s, v16.4s +mla v18.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v9.4S, v29.s[1] +mul v9.4S, v9.4S,v30.s[1] +sub v8.4s, v3.4s, v18.4s +mla v15.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, 
v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v14.4s, v22.4s, v15.4s +mla v9.4S, v16.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v16.4s, v2.4s, v9.4s +mla v20.4S, v18.4S, v31.s[0] +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v20.4s +mla v11.4S, v15.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v15.4s, v12.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +sub v9.4s, v19.4s, v10.4s +mla v17.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v27.s[0] +mul v3.4S, v3.4S,v28.s[0] +sub v20.4s, v21.4s, v17.4s +mla v13.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v11.4s, v22.4s, v13.4s +mla v3.4S, v10.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v10.4s, v2.4s, v3.4s +mla v0.4S, v17.4S, v31.s[0] +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v17.4s, v14.4s, v0.4s +mla v8.4S, v13.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +sub v13.4s, v16.4s, v8.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v3.4s, v19.4s, v1.4s +mla v12.4S, v0.4S, v31.s[0] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v0.4s, v21.4s, v12.4s +mla v18.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v25.s[1] +mul v10.4S, v10.4S,v26.s[1] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v12.4S, v31.s[0] 
+add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v12.4s, v22.4s, v2.4s +mla v10.4S, v18.4S, v31.s[0] +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v13.4S, v25.s[3] +mul v13.4S, v13.4S,v26.s[3] +sub v18.4s, v11.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +str q22, [x0, #16] +sqrdmulh v22.4S, v21.4S, v23.s[0] +str q12, [x0, #80] +mul v21.4S, v21.4S,v24.s[0] +sub v12.4s, v14.4s, v16.4s +mla v13.4S, v2.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +str q11, [x0, #144] +sqrdmulh v11.4S, v0.4S, v23.s[1] +str q18, [x0, #208] +mul v0.4S, v0.4S,v24.s[1] +sub v18.4s, v17.4s, v13.4s +mla v21.4S, v22.4S, v31.s[0] +add v17.4s, v17.4s, v13.4s +str q14, [x0, #272] +sqrdmulh v14.4S, v20.4S, v23.s[2] +str q12, [x0, #336] +mul v20.4S, v20.4S,v24.s[2] +sub v12.4s, v19.4s, v21.4s +mla v0.4S, v11.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +str q17, [x0, #400] +sqrdmulh v17.4S, v1.4S, v23.s[3] +str q18, [x0, #464] +mul v1.4S, v1.4S,v24.s[3] +sub v18.4s, v3.4s, v0.4s +mla v20.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v0.4s +str q19, [x0, #528] +str q12, [x0, #592] +sub v12.4s, v9.4s, v20.4s +mla v1.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v20.4s +str q3, [x0, #656] +str q18, [x0, #720] +sub v18.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +str q9, [x0, #784] +str q12, [x0, #848] +str q8, [x0, #912] +str q18, [x0, #976] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q6, [x17, #+160] +ldr q7, [x17, #+176] +ldr q15, [x17, #+192] +ldr q10, [x17, #+208] +ldr q2, [x17, #+224] +ldr q16, [x17, #+240] +ldr q22, [x0, #32] +ldr q13, [x0, #48] +ldr q11, [x0, #0] +ldr q21, [x0, #16] +sqrdmulh v14.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla 
v21.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v5.s[2] +mul v22.4S, v22.4S,v4.s[2] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +trn1 v22.4S, v11.4S, v13.4S +trn2 v0.4S, v11.4S, v13.4S +trn1 v19.4S, v14.4S, v21.4S +trn2 v17.4S, v14.4S, v21.4S +trn2 v14.2D, v22.2D, v19.2D +trn2 v21.2D, v0.2D, v17.2D +trn1 v11.2D, v22.2D, v19.2D +trn1 v13.2D, v0.2D, v17.2D +sqrdmulh v17.4S, v14.4S, v7.4S +mul v14.4S, v14.4S,v6.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v10.4S +mul v13.4S, v13.4S,v15.4S +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v14.4S, v16.4S +mul v14.4S, v14.4S,v2.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +str q11, [x0, #0] +str q21, [x0, #16] +str q17, [x0, #32] +str q13, [x0, #48] +ldr q16, [x17, #+256] +ldr q2, [x17, #+272] +ldr q10, [x17, #+288] +ldr q15, [x17, #+304] +ldr q7, [x17, #+320] +ldr q6, [x17, #+336] +ldr q5, [x17, #+352] +ldr q4, [x17, #+368] +ldr q13, [x0, #96] +ldr q17, [x0, #112] +ldr q21, [x0, #64] +ldr q11, [x0, #80] +sqrdmulh v14.4S, v13.4S, v2.s[0] +mul v13.4S, v13.4S,v16.s[0] +mla v13.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v2.s[0] +mul v17.4S, v17.4S,v16.s[0] +mla v17.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v16.s[1] +mla v11.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v2.s[2] +mul v13.4S, v13.4S,v16.s[2] +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +trn1 v13.4S, v21.4S, 
v17.4S +trn2 v0.4S, v21.4S, v17.4S +trn1 v19.4S, v14.4S, v11.4S +trn2 v22.4S, v14.4S, v11.4S +trn2 v14.2D, v13.2D, v19.2D +trn2 v11.2D, v0.2D, v22.2D +trn1 v21.2D, v13.2D, v19.2D +trn1 v17.2D, v0.2D, v22.2D +sqrdmulh v22.4S, v14.4S, v15.4S +mul v14.4S, v14.4S,v10.4S +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v15.4S +mul v11.4S, v11.4S,v10.4S +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v17.4s, v11.4s +add v17.4s, v17.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v6.4S +mul v17.4S, v17.4S,v7.4S +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v4.4S +mul v14.4S, v14.4S,v5.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v14.4s +add v22.4s, v22.4s, v14.4s +str q21, [x0, #64] +str q11, [x0, #80] +str q22, [x0, #96] +str q17, [x0, #112] +ldr q4, [x17, #+384] +ldr q5, [x17, #+400] +ldr q6, [x17, #+416] +ldr q7, [x17, #+432] +ldr q15, [x17, #+448] +ldr q10, [x17, #+464] +ldr q2, [x17, #+480] +ldr q16, [x17, #+496] +ldr q17, [x0, #160] +ldr q22, [x0, #176] +ldr q11, [x0, #128] +ldr q21, [x0, #144] +sqrdmulh v14.4S, v17.4S, v5.s[0] +mul v17.4S, v17.4S,v4.s[0] +mla v17.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v5.s[2] +mul v17.4S, v17.4S,v4.s[2] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v17.4s +add v14.4s, v14.4s, v17.4s +trn1 v17.4S, v11.4S, v22.4S +trn2 v0.4S, v11.4S, v22.4S +trn1 v19.4S, v14.4S, v21.4S +trn2 v13.4S, v14.4S, v21.4S +trn2 v14.2D, v17.2D, v19.2D +trn2 v21.2D, v0.2D, v13.2D +trn1 v11.2D, v17.2D, v19.2D +trn1 v22.2D, v0.2D, v13.2D +sqrdmulh v13.4S, v14.4S, v7.4S +mul v14.4S, 
v14.4S,v6.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v21.4s +add v22.4s, v22.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v10.4S +mul v22.4S, v22.4S,v15.4S +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v14.4S, v16.4S +mul v14.4S, v14.4S,v2.4S +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v13.4s, v14.4s +add v13.4s, v13.4s, v14.4s +str q11, [x0, #128] +str q21, [x0, #144] +str q13, [x0, #160] +str q22, [x0, #176] +ldr q16, [x17, #+512] +ldr q2, [x17, #+528] +ldr q10, [x17, #+544] +ldr q15, [x17, #+560] +ldr q7, [x17, #+576] +ldr q6, [x17, #+592] +ldr q5, [x17, #+608] +ldr q4, [x17, #+624] +ldr q22, [x0, #224] +ldr q13, [x0, #240] +ldr q21, [x0, #192] +ldr q11, [x0, #208] +sqrdmulh v14.4S, v22.4S, v2.s[0] +mul v22.4S, v22.4S,v16.s[0] +mla v22.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v2.s[0] +mul v13.4S, v13.4S,v16.s[0] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v16.s[1] +mla v11.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v2.s[2] +mul v22.4S, v22.4S,v16.s[2] +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +trn1 v22.4S, v21.4S, v13.4S +trn2 v0.4S, v21.4S, v13.4S +trn1 v19.4S, v14.4S, v11.4S +trn2 v17.4S, v14.4S, v11.4S +trn2 v14.2D, v22.2D, v19.2D +trn2 v11.2D, v0.2D, v17.2D +trn1 v21.2D, v22.2D, v19.2D +trn1 v13.2D, v0.2D, v17.2D +sqrdmulh v17.4S, v14.4S, v15.4S +mul v14.4S, v14.4S,v10.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v15.4S +mul v11.4S, v11.4S,v10.4S +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s 
+sqrdmulh v11.4S, v13.4S, v6.4S +mul v13.4S, v13.4S,v7.4S +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v14.4S, v4.4S +mul v14.4S, v14.4S,v5.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +str q21, [x0, #192] +str q11, [x0, #208] +str q17, [x0, #224] +str q13, [x0, #240] +ldr q4, [x17, #+640] +ldr q5, [x17, #+656] +ldr q6, [x17, #+672] +ldr q7, [x17, #+688] +ldr q15, [x17, #+704] +ldr q10, [x17, #+720] +ldr q2, [x17, #+736] +ldr q16, [x17, #+752] +ldr q13, [x0, #288] +ldr q17, [x0, #304] +ldr q11, [x0, #256] +ldr q21, [x0, #272] +sqrdmulh v14.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v5.s[0] +mul v17.4S, v17.4S,v4.s[0] +mla v17.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v5.s[2] +mul v13.4S, v13.4S,v4.s[2] +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +trn1 v13.4S, v11.4S, v17.4S +trn2 v0.4S, v11.4S, v17.4S +trn1 v19.4S, v14.4S, v21.4S +trn2 v22.4S, v14.4S, v21.4S +trn2 v14.2D, v13.2D, v19.2D +trn2 v21.2D, v0.2D, v22.2D +trn1 v11.2D, v13.2D, v19.2D +trn1 v17.2D, v0.2D, v22.2D +sqrdmulh v22.4S, v14.4S, v7.4S +mul v14.4S, v14.4S,v6.4S +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v10.4S +mul v17.4S, v17.4S,v15.4S +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v16.4S +mul v14.4S, v14.4S,v2.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, 
v14.4s +add v22.4s, v22.4s, v14.4s +str q11, [x0, #256] +str q21, [x0, #272] +str q22, [x0, #288] +str q17, [x0, #304] +ldr q16, [x17, #+768] +ldr q2, [x17, #+784] +ldr q10, [x17, #+800] +ldr q15, [x17, #+816] +ldr q7, [x17, #+832] +ldr q6, [x17, #+848] +ldr q5, [x17, #+864] +ldr q4, [x17, #+880] +ldr q17, [x0, #352] +ldr q22, [x0, #368] +ldr q21, [x0, #320] +ldr q11, [x0, #336] +sqrdmulh v14.4S, v17.4S, v2.s[0] +mul v17.4S, v17.4S,v16.s[0] +mla v17.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v2.s[0] +mul v22.4S, v22.4S,v16.s[0] +mla v22.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v16.s[1] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v2.s[2] +mul v17.4S, v17.4S,v16.s[2] +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v14.4s, v17.4s +add v14.4s, v14.4s, v17.4s +trn1 v17.4S, v21.4S, v22.4S +trn2 v0.4S, v21.4S, v22.4S +trn1 v19.4S, v14.4S, v11.4S +trn2 v13.4S, v14.4S, v11.4S +trn2 v14.2D, v17.2D, v19.2D +trn2 v11.2D, v0.2D, v13.2D +trn1 v21.2D, v17.2D, v19.2D +trn1 v22.2D, v0.2D, v13.2D +sqrdmulh v13.4S, v14.4S, v15.4S +mul v14.4S, v14.4S,v10.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v15.4S +mul v11.4S, v11.4S,v10.4S +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v6.4S +mul v22.4S, v22.4S,v7.4S +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v14.4S, v4.4S +mul v14.4S, v14.4S,v5.4S +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v13.4s, v14.4s +add v13.4s, v13.4s, v14.4s +str q21, [x0, #320] +str q11, [x0, #336] +str q13, [x0, #352] +str q22, [x0, #368] +ldr q4, [x17, #+896] +ldr q5, [x17, #+912] +ldr q6, [x17, #+928] +ldr q7, [x17, #+944] +ldr q15, [x17, #+960] +ldr q10, [x17, 
#+976] +ldr q2, [x17, #+992] +ldr q16, [x17, #+1008] +ldr q22, [x0, #416] +ldr q13, [x0, #432] +ldr q11, [x0, #384] +ldr q21, [x0, #400] +sqrdmulh v14.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v5.s[2] +mul v22.4S, v22.4S,v4.s[2] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +trn1 v22.4S, v11.4S, v13.4S +trn2 v0.4S, v11.4S, v13.4S +trn1 v19.4S, v14.4S, v21.4S +trn2 v17.4S, v14.4S, v21.4S +trn2 v14.2D, v22.2D, v19.2D +trn2 v21.2D, v0.2D, v17.2D +trn1 v11.2D, v22.2D, v19.2D +trn1 v13.2D, v0.2D, v17.2D +sqrdmulh v17.4S, v14.4S, v7.4S +mul v14.4S, v14.4S,v6.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v10.4S +mul v13.4S, v13.4S,v15.4S +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v14.4S, v16.4S +mul v14.4S, v14.4S,v2.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +str q11, [x0, #384] +str q21, [x0, #400] +str q17, [x0, #416] +str q13, [x0, #432] +ldr q16, [x17, #+1024] +ldr q2, [x17, #+1040] +ldr q10, [x17, #+1056] +ldr q15, [x17, #+1072] +ldr q7, [x17, #+1088] +ldr q6, [x17, #+1104] +ldr q5, [x17, #+1120] +ldr q4, [x17, #+1136] +ldr q13, [x0, #480] +ldr q17, [x0, #496] +ldr q21, [x0, #448] +ldr q11, [x0, #464] +sqrdmulh v14.4S, v13.4S, v2.s[0] +mul v13.4S, v13.4S,v16.s[0] +mla v13.4S, v14.4S, v31.s[0] +sub v14.4s, 
v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v2.s[0] +mul v17.4S, v17.4S,v16.s[0] +mla v17.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v16.s[1] +mla v11.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v2.s[2] +mul v13.4S, v13.4S,v16.s[2] +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +trn1 v13.4S, v21.4S, v17.4S +trn2 v0.4S, v21.4S, v17.4S +trn1 v19.4S, v14.4S, v11.4S +trn2 v22.4S, v14.4S, v11.4S +trn2 v14.2D, v13.2D, v19.2D +trn2 v11.2D, v0.2D, v22.2D +trn1 v21.2D, v13.2D, v19.2D +trn1 v17.2D, v0.2D, v22.2D +sqrdmulh v22.4S, v14.4S, v15.4S +mul v14.4S, v14.4S,v10.4S +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v15.4S +mul v11.4S, v11.4S,v10.4S +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v17.4s, v11.4s +add v17.4s, v17.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v6.4S +mul v17.4S, v17.4S,v7.4S +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v4.4S +mul v14.4S, v14.4S,v5.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v14.4s +add v22.4s, v22.4s, v14.4s +str q21, [x0, #448] +str q11, [x0, #464] +str q22, [x0, #480] +str q17, [x0, #496] +ldr q4, [x17, #+1152] +ldr q5, [x17, #+1168] +ldr q6, [x17, #+1184] +ldr q7, [x17, #+1200] +ldr q15, [x17, #+1216] +ldr q10, [x17, #+1232] +ldr q2, [x17, #+1248] +ldr q16, [x17, #+1264] +ldr q17, [x0, #544] +ldr q22, [x0, #560] +ldr q11, [x0, #512] +ldr q21, [x0, #528] +sqrdmulh v14.4S, v17.4S, v5.s[0] +mul v17.4S, v17.4S,v4.s[0] +mla v17.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v5.s[1] +mul v21.4S, 
v21.4S,v4.s[1] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v5.s[2] +mul v17.4S, v17.4S,v4.s[2] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v17.4s +add v14.4s, v14.4s, v17.4s +trn1 v17.4S, v11.4S, v22.4S +trn2 v0.4S, v11.4S, v22.4S +trn1 v19.4S, v14.4S, v21.4S +trn2 v13.4S, v14.4S, v21.4S +trn2 v14.2D, v17.2D, v19.2D +trn2 v21.2D, v0.2D, v13.2D +trn1 v11.2D, v17.2D, v19.2D +trn1 v22.2D, v0.2D, v13.2D +sqrdmulh v13.4S, v14.4S, v7.4S +mul v14.4S, v14.4S,v6.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v21.4s +add v22.4s, v22.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v10.4S +mul v22.4S, v22.4S,v15.4S +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v14.4S, v16.4S +mul v14.4S, v14.4S,v2.4S +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v13.4s, v14.4s +add v13.4s, v13.4s, v14.4s +str q11, [x0, #512] +str q21, [x0, #528] +str q13, [x0, #544] +str q22, [x0, #560] +ldr q16, [x17, #+1280] +ldr q2, [x17, #+1296] +ldr q10, [x17, #+1312] +ldr q15, [x17, #+1328] +ldr q7, [x17, #+1344] +ldr q6, [x17, #+1360] +ldr q5, [x17, #+1376] +ldr q4, [x17, #+1392] +ldr q22, [x0, #608] +ldr q13, [x0, #624] +ldr q21, [x0, #576] +ldr q11, [x0, #592] +sqrdmulh v14.4S, v22.4S, v2.s[0] +mul v22.4S, v22.4S,v16.s[0] +mla v22.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v2.s[0] +mul v13.4S, v13.4S,v16.s[0] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v16.s[1] +mla v11.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v2.s[2] +mul v22.4S, v22.4S,v16.s[2] +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v14.4s, v22.4s +add v14.4s, 
v14.4s, v22.4s +trn1 v22.4S, v21.4S, v13.4S +trn2 v0.4S, v21.4S, v13.4S +trn1 v19.4S, v14.4S, v11.4S +trn2 v17.4S, v14.4S, v11.4S +trn2 v14.2D, v22.2D, v19.2D +trn2 v11.2D, v0.2D, v17.2D +trn1 v21.2D, v22.2D, v19.2D +trn1 v13.2D, v0.2D, v17.2D +sqrdmulh v17.4S, v14.4S, v15.4S +mul v14.4S, v14.4S,v10.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v15.4S +mul v11.4S, v11.4S,v10.4S +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v6.4S +mul v13.4S, v13.4S,v7.4S +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v14.4S, v4.4S +mul v14.4S, v14.4S,v5.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +str q21, [x0, #576] +str q11, [x0, #592] +str q17, [x0, #608] +str q13, [x0, #624] +ldr q4, [x17, #+1408] +ldr q5, [x17, #+1424] +ldr q6, [x17, #+1440] +ldr q7, [x17, #+1456] +ldr q15, [x17, #+1472] +ldr q10, [x17, #+1488] +ldr q2, [x17, #+1504] +ldr q16, [x17, #+1520] +ldr q13, [x0, #672] +ldr q17, [x0, #688] +ldr q11, [x0, #640] +ldr q21, [x0, #656] +sqrdmulh v14.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v5.s[0] +mul v17.4S, v17.4S,v4.s[0] +mla v17.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v5.s[2] +mul v13.4S, v13.4S,v4.s[2] +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +trn1 v13.4S, v11.4S, v17.4S +trn2 v0.4S, v11.4S, v17.4S +trn1 v19.4S, v14.4S, v21.4S +trn2 v22.4S, v14.4S, v21.4S +trn2 v14.2D, v13.2D, v19.2D +trn2 v21.2D, v0.2D, v22.2D +trn1 v11.2D, v13.2D, v19.2D +trn1 v17.2D, v0.2D, v22.2D 
+sqrdmulh v22.4S, v14.4S, v7.4S +mul v14.4S, v14.4S,v6.4S +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v10.4S +mul v17.4S, v17.4S,v15.4S +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v16.4S +mul v14.4S, v14.4S,v2.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v14.4s +add v22.4s, v22.4s, v14.4s +str q11, [x0, #640] +str q21, [x0, #656] +str q22, [x0, #672] +str q17, [x0, #688] +ldr q16, [x17, #+1536] +ldr q2, [x17, #+1552] +ldr q10, [x17, #+1568] +ldr q15, [x17, #+1584] +ldr q7, [x17, #+1600] +ldr q6, [x17, #+1616] +ldr q5, [x17, #+1632] +ldr q4, [x17, #+1648] +ldr q17, [x0, #736] +ldr q22, [x0, #752] +ldr q21, [x0, #704] +ldr q11, [x0, #720] +sqrdmulh v14.4S, v17.4S, v2.s[0] +mul v17.4S, v17.4S,v16.s[0] +mla v17.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v2.s[0] +mul v22.4S, v22.4S,v16.s[0] +mla v22.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v16.s[1] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v2.s[2] +mul v17.4S, v17.4S,v16.s[2] +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v14.4s, v17.4s +add v14.4s, v14.4s, v17.4s +trn1 v17.4S, v21.4S, v22.4S +trn2 v0.4S, v21.4S, v22.4S +trn1 v19.4S, v14.4S, v11.4S +trn2 v13.4S, v14.4S, v11.4S +trn2 v14.2D, v17.2D, v19.2D +trn2 v11.2D, v0.2D, v13.2D +trn1 v21.2D, v17.2D, v19.2D +trn1 v22.2D, v0.2D, v13.2D +sqrdmulh v13.4S, v14.4S, v15.4S +mul v14.4S, v14.4S,v10.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v15.4S +mul v11.4S, v11.4S,v10.4S +mla v11.4S, v14.4S, v31.s[0] +sub 
v14.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v6.4S +mul v22.4S, v22.4S,v7.4S +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v14.4S, v4.4S +mul v14.4S, v14.4S,v5.4S +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v13.4s, v14.4s +add v13.4s, v13.4s, v14.4s +str q21, [x0, #704] +str q11, [x0, #720] +str q13, [x0, #736] +str q22, [x0, #752] +ldr q4, [x17, #+1664] +ldr q5, [x17, #+1680] +ldr q6, [x17, #+1696] +ldr q7, [x17, #+1712] +ldr q15, [x17, #+1728] +ldr q10, [x17, #+1744] +ldr q2, [x17, #+1760] +ldr q16, [x17, #+1776] +ldr q22, [x0, #800] +ldr q13, [x0, #816] +ldr q11, [x0, #768] +ldr q21, [x0, #784] +sqrdmulh v14.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v5.s[2] +mul v22.4S, v22.4S,v4.s[2] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +trn1 v22.4S, v11.4S, v13.4S +trn2 v0.4S, v11.4S, v13.4S +trn1 v19.4S, v14.4S, v21.4S +trn2 v17.4S, v14.4S, v21.4S +trn2 v14.2D, v22.2D, v19.2D +trn2 v21.2D, v0.2D, v17.2D +trn1 v11.2D, v22.2D, v19.2D +trn1 v13.2D, v0.2D, v17.2D +sqrdmulh v17.4S, v14.4S, v7.4S +mul v14.4S, v14.4S,v6.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v10.4S +mul v13.4S, v13.4S,v15.4S +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v14.4S, v16.4S +mul v14.4S, 
v14.4S,v2.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +str q11, [x0, #768] +str q21, [x0, #784] +str q17, [x0, #800] +str q13, [x0, #816] +ldr q16, [x17, #+1792] +ldr q2, [x17, #+1808] +ldr q10, [x17, #+1824] +ldr q15, [x17, #+1840] +ldr q7, [x17, #+1856] +ldr q6, [x17, #+1872] +ldr q5, [x17, #+1888] +ldr q4, [x17, #+1904] +ldr q13, [x0, #864] +ldr q17, [x0, #880] +ldr q21, [x0, #832] +ldr q11, [x0, #848] +sqrdmulh v14.4S, v13.4S, v2.s[0] +mul v13.4S, v13.4S,v16.s[0] +mla v13.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v2.s[0] +mul v17.4S, v17.4S,v16.s[0] +mla v17.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v16.s[1] +mla v11.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v2.s[2] +mul v13.4S, v13.4S,v16.s[2] +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +trn1 v13.4S, v21.4S, v17.4S +trn2 v0.4S, v21.4S, v17.4S +trn1 v19.4S, v14.4S, v11.4S +trn2 v22.4S, v14.4S, v11.4S +trn2 v14.2D, v13.2D, v19.2D +trn2 v11.2D, v0.2D, v22.2D +trn1 v21.2D, v13.2D, v19.2D +trn1 v17.2D, v0.2D, v22.2D +sqrdmulh v22.4S, v14.4S, v15.4S +mul v14.4S, v14.4S,v10.4S +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v15.4S +mul v11.4S, v11.4S,v10.4S +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v17.4s, v11.4s +add v17.4s, v17.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v6.4S +mul v17.4S, v17.4S,v7.4S +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v4.4S +mul v14.4S, v14.4S,v5.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v14.4s +add v22.4s, v22.4s, v14.4s +str q21, [x0, #832] +str q11, [x0, #848] +str q22, [x0, #864] +str q17, [x0, #880] +ldr q4, [x17, #+1920] +ldr q5, [x17, #+1936] +ldr q6, 
[x17, #+1952] +ldr q7, [x17, #+1968] +ldr q15, [x17, #+1984] +ldr q10, [x17, #+2000] +ldr q2, [x17, #+2016] +ldr q16, [x17, #+2032] +ldr q17, [x0, #928] +ldr q22, [x0, #944] +ldr q11, [x0, #896] +ldr q21, [x0, #912] +sqrdmulh v14.4S, v17.4S, v5.s[0] +mul v17.4S, v17.4S,v4.s[0] +mla v17.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v5.s[2] +mul v17.4S, v17.4S,v4.s[2] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v17.4s +add v14.4s, v14.4s, v17.4s +trn1 v17.4S, v11.4S, v22.4S +trn2 v0.4S, v11.4S, v22.4S +trn1 v19.4S, v14.4S, v21.4S +trn2 v13.4S, v14.4S, v21.4S +trn2 v14.2D, v17.2D, v19.2D +trn2 v21.2D, v0.2D, v13.2D +trn1 v11.2D, v17.2D, v19.2D +trn1 v22.2D, v0.2D, v13.2D +sqrdmulh v13.4S, v14.4S, v7.4S +mul v14.4S, v14.4S,v6.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v21.4s +add v22.4s, v22.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v10.4S +mul v22.4S, v22.4S,v15.4S +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v14.4S, v16.4S +mul v14.4S, v14.4S,v2.4S +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v13.4s, v14.4s +add v13.4s, v13.4s, v14.4s +str q11, [x0, #896] +str q21, [x0, #912] +str q13, [x0, #928] +str q22, [x0, #944] +ldr q16, [x17, #+2048] +ldr q2, [x17, #+2064] +ldr q10, [x17, #+2080] +ldr q15, [x17, #+2096] +ldr q7, [x17, #+2112] +ldr q6, [x17, #+2128] +ldr q5, [x17, #+2144] +ldr q4, [x17, #+2160] +ldr q22, [x0, #992] +ldr q13, [x0, #1008] +ldr q21, [x0, #960] +ldr q11, [x0, #976] +sqrdmulh v14.4S, v22.4S, 
v2.s[0] +mul v22.4S, v22.4S,v16.s[0] +mla v22.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v2.s[0] +mul v13.4S, v13.4S,v16.s[0] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v16.s[1] +mla v11.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v2.s[2] +mul v22.4S, v22.4S,v16.s[2] +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +trn1 v22.4S, v21.4S, v13.4S +trn2 v0.4S, v21.4S, v13.4S +trn1 v19.4S, v14.4S, v11.4S +trn2 v17.4S, v14.4S, v11.4S +trn2 v14.2D, v22.2D, v19.2D +trn2 v11.2D, v0.2D, v17.2D +trn1 v21.2D, v22.2D, v19.2D +trn1 v13.2D, v0.2D, v17.2D +sqrdmulh v17.4S, v14.4S, v15.4S +mul v14.4S, v14.4S,v10.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v15.4S +mul v11.4S, v11.4S,v10.4S +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v6.4S +mul v13.4S, v13.4S,v7.4S +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v14.4S, v4.4S +mul v14.4S, v14.4S,v5.4S +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +str q21, [x0, #960] +str q11, [x0, #976] +str q17, [x0, #992] +str q13, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z4_1.s 
b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z4_1.s new file mode 100644 index 0000000..02d26b2 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z4_1.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 
1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 
6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 
+.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, 
block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 
31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 
26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 
+.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // 
Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 
28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 
608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_3_z4_1 +.global _ntt_u32_full_neon_asm_var_4_4_3_z4_1 +ntt_u32_full_neon_asm_var_4_4_3_z4_1: +_ntt_u32_full_neon_asm_var_4_4_3_z4_1: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +sqrdmulh v2.4S, v22.4S, v29.s[0] +ldr q1, [x0, #544] +mul v22.4S, v22.4S,v30.s[0] +ldr q0, [x0, #608] +sqrdmulh v15.4S, v21.4S, v29.s[0] +ldr q14, [x0, #672] +mul v21.4S, v21.4S,v30.s[0] +ldr q13, [x0, #736] +mla v22.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q12, [x0, #32] +sub v11.4s, v18.4s, v22.4s +mla v21.4S, v15.4S, v31.s[0] +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q15, [x0, #96] +sub v10.4s, v17.4s, v21.4s +mla v20.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v1.4S, 
v29.s[0] +ldr q2, [x0, #160] +mul v1.4S, v1.4S,v30.s[0] +sub v9.4s, v16.4s, v20.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v29.s[0] +ldr q22, [x0, #224] +mul v0.4S, v0.4S,v30.s[0] +sub v8.4s, v3.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v12.4s, v1.4s +mla v0.4S, v20.4S, v31.s[0] +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +sub v20.4s, v15.4s, v0.4s +mla v14.4S, v19.4S, v31.s[0] +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v19.4s, v2.4s, v14.4s +mla v13.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v1.4s, v22.4s, v13.4s +mla v16.4S, v0.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v0.4s, v2.4s, v16.4s +mla v3.4S, v14.4S, v31.s[0] +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v22.4s, v3.4s +mla v18.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v13.4s, v12.4s, v18.4s +mla v17.4S, v16.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v16.4s, v15.4s, v17.4s +mla v9.4S, v3.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v3.4s, v19.4s, v9.4s +mla v8.4S, v18.4S, v31.s[0] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v8.4s +mla v11.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v17.4s, v21.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +sub v9.4s, v20.4s, v10.4s 
+mla v2.4S, v8.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v8.4s, v12.4s, v2.4s +mla v22.4S, v11.4S, v31.s[0] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v27.s[1] +mul v14.4S, v14.4S,v28.s[1] +sub v11.4s, v15.4s, v22.4s +mla v0.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v28.s[2] +sub v10.4s, v13.4s, v0.4s +mla v14.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v2.4s, v16.4s, v14.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +sub v22.4s, v21.4s, v19.4s +mla v1.4S, v0.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v0.4s, v20.4s, v1.4s +mla v3.4S, v14.4S, v31.s[0] +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +sub v14.4s, v17.4s, v3.4s +mla v18.4S, v19.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v25.s[1] +mul v11.4S, v11.4S,v26.s[1] +sub v19.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v1.4s, v12.4s, v15.4s +mla v11.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v25.s[3] +mul v2.4S, v2.4S,v26.s[3] +sub v3.4s, v8.4s, v11.4s +mla v16.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v11.4s +str q12, [x0, #32] +sqrdmulh v12.4S, v20.4S, v23.s[0] +str q1, [x0, #96] +mul v20.4S, v20.4S,v24.s[0] +ldr q1, [x0, #816] +sub v11.4s, v13.4s, v16.4s +ldr q18, [x0, #880] +mla v2.4S, v15.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +str q8, [x0, #160] +sqrdmulh v8.4S, v0.4S, v23.s[1] +str q3, [x0, #224] +mul v0.4S, v0.4S,v24.s[1] +ldr q3, [x0, #944] +sub v16.4s, v10.4s, v2.4s +ldr q15, [x0, #1008] +mla v20.4S, v12.4S, v31.s[0] +add v10.4s, v10.4s, v2.4s +str q13, [x0, #288] +sqrdmulh 
v13.4S, v9.4S, v23.s[2] +str q11, [x0, #352] +mul v9.4S, v9.4S,v24.s[2] +ldr q11, [x0, #304] +sub v2.4s, v21.4s, v20.4s +ldr q12, [x0, #368] +mla v0.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v20.4s +str q10, [x0, #416] +sqrdmulh v10.4S, v19.4S, v23.s[3] +str q16, [x0, #480] +mul v19.4S, v19.4S,v24.s[3] +ldr q16, [x0, #432] +sub v20.4s, v22.4s, v0.4s +ldr q8, [x0, #496] +mla v9.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +str q21, [x0, #544] +sqrdmulh v21.4S, v1.4S, v29.s[0] +str q2, [x0, #608] +ldr q2, [x0, #560] +mul v1.4S, v1.4S,v30.s[0] +ldr q0, [x0, #624] +sub v13.4s, v17.4s, v9.4s +mla v19.4S, v10.4S, v31.s[0] +add v17.4s, v17.4s, v9.4s +str q22, [x0, #672] +sqrdmulh v22.4S, v18.4S, v29.s[0] +str q20, [x0, #736] +ldr q20, [x0, #688] +mul v18.4S, v18.4S,v30.s[0] +ldr q9, [x0, #752] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +str q17, [x0, #800] +sqrdmulh v17.4S, v3.4S, v29.s[0] +str q13, [x0, #864] +mul v3.4S, v3.4S,v30.s[0] +ldr q13, [x0, #48] +sub v19.4s, v11.4s, v1.4s +mla v18.4S, v22.4S, v31.s[0] +add v11.4s, v11.4s, v1.4s +str q14, [x0, #928] +sqrdmulh v14.4S, v15.4S, v29.s[0] +str q10, [x0, #992] +mul v15.4S, v15.4S,v30.s[0] +ldr q10, [x0, #112] +sub v1.4s, v12.4s, v18.4s +mla v3.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v29.s[0] +ldr q17, [x0, #176] +mul v2.4S, v2.4S,v30.s[0] +sub v22.4s, v16.4s, v3.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v29.s[0] +ldr q14, [x0, #240] +mul v0.4S, v0.4S,v30.s[0] +sub v21.4s, v8.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v18.4s, v13.4s, v2.4s +mla v0.4S, v3.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +sub v3.4s, v10.4s, v0.4s +mla v20.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v15.4s, 
v17.4s, v20.4s +mla v9.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v2.4s, v14.4s, v9.4s +mla v16.4S, v0.4S, v31.s[0] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v29.s[1] +mul v11.4S, v11.4S,v30.s[1] +sub v0.4s, v17.4s, v16.4s +mla v8.4S, v20.4S, v31.s[0] +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v29.s[1] +mul v12.4S, v12.4S,v30.s[1] +sub v20.4s, v14.4s, v8.4s +mla v11.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v9.4s, v13.4s, v11.4s +mla v12.4S, v16.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v16.4s, v10.4s, v12.4s +mla v22.4S, v8.4S, v31.s[0] +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v8.4s, v15.4s, v22.4s +mla v21.4S, v11.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +sub v11.4s, v2.4s, v21.4s +mla v19.4S, v12.4S, v31.s[0] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +sub v12.4s, v18.4s, v19.4s +mla v1.4S, v22.4S, v31.s[0] +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v22.4s, v3.4s, v1.4s +mla v17.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v21.4s, v13.4s, v17.4s +mla v14.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +sub v19.4s, v10.4s, v14.4s +mla v0.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v27.s[2] +mul v15.4S, v15.4S,v28.s[2] +sub v1.4s, v9.4s, v0.4s +mla v20.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v17.4s, v16.4s, v20.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, 
v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v14.4s, v18.4s, v15.4s +mla v2.4S, v0.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[3] +mul v11.4S, v11.4S,v28.s[3] +sub v0.4s, v3.4s, v2.4s +mla v8.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +sub v20.4s, v12.4s, v8.4s +mla v11.4S, v15.4S, v31.s[0] +add v12.4s, v12.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v15.4s, v22.4s, v11.4s +mla v10.4S, v2.4S, v31.s[0] +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v2.4s, v13.4s, v10.4s +mla v19.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v25.s[3] +mul v17.4S, v17.4S,v26.s[3] +sub v8.4s, v21.4s, v19.4s +mla v16.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +str q13, [x0, #48] +sqrdmulh v13.4S, v3.4S, v23.s[0] +str q2, [x0, #112] +mul v3.4S, v3.4S,v24.s[0] +ldr q2, [x0, #768] +sub v19.4s, v9.4s, v16.4s +ldr q11, [x0, #832] +mla v17.4S, v10.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +str q21, [x0, #176] +sqrdmulh v21.4S, v0.4S, v23.s[1] +str q8, [x0, #240] +mul v0.4S, v0.4S,v24.s[1] +ldr q8, [x0, #896] +sub v16.4s, v1.4s, v17.4s +ldr q10, [x0, #960] +mla v3.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +str q9, [x0, #304] +sqrdmulh v9.4S, v22.4S, v23.s[2] +str q19, [x0, #368] +mul v22.4S, v22.4S,v24.s[2] +ldr q19, [x0, #256] +sub v17.4s, v18.4s, v3.4s +ldr q13, [x0, #320] +mla v0.4S, v21.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +str q1, [x0, #432] +sqrdmulh v1.4S, v15.4S, v23.s[3] +str q16, [x0, #496] +mul v15.4S, v15.4S,v24.s[3] +ldr q16, [x0, #384] +sub v3.4s, v14.4s, v0.4s +ldr q21, [x0, #448] +mla v22.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +str q18, [x0, #560] +sqrdmulh v18.4S, v2.4S, v29.s[0] +str q17, [x0, #624] +ldr q17, [x0, #512] +mul v2.4S, v2.4S,v30.s[0] +ldr q0, [x0, #576] +sub v9.4s, v12.4s, v22.4s +mla v15.4S, v1.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s 
+str q14, [x0, #688] +sqrdmulh v14.4S, v11.4S, v29.s[0] +str q3, [x0, #752] +ldr q3, [x0, #640] +mul v11.4S, v11.4S,v30.s[0] +ldr q22, [x0, #704] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +str q12, [x0, #816] +sqrdmulh v12.4S, v8.4S, v29.s[0] +str q9, [x0, #880] +mul v8.4S, v8.4S,v30.s[0] +ldr q9, [x0, #0] +sub v15.4s, v19.4s, v2.4s +mla v11.4S, v14.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +str q20, [x0, #944] +sqrdmulh v20.4S, v10.4S, v29.s[0] +str q1, [x0, #1008] +mul v10.4S, v10.4S,v30.s[0] +ldr q1, [x0, #64] +sub v2.4s, v13.4s, v11.4s +mla v8.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v29.s[0] +ldr q12, [x0, #128] +mul v17.4S, v17.4S,v30.s[0] +sub v14.4s, v16.4s, v8.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v29.s[0] +ldr q20, [x0, #192] +mul v0.4S, v0.4S,v30.s[0] +sub v18.4s, v21.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +sub v11.4s, v9.4s, v17.4s +mla v0.4S, v8.4S, v31.s[0] +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v8.4s, v1.4s, v0.4s +mla v3.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v10.4s, v12.4s, v3.4s +mla v22.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v17.4s, v20.4s, v22.4s +mla v16.4S, v0.4S, v31.s[0] +add v20.4s, v20.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +sub v0.4s, v12.4s, v16.4s +mla v21.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v3.4s, v20.4s, v21.4s +mla v19.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v22.4s, v9.4s, v19.4s +mla v13.4S, v16.4S, v31.s[0] +add v9.4s, v9.4s, 
v19.4s +sqrdmulh v19.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v16.4s, v1.4s, v13.4s +mla v14.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v21.4s, v10.4s, v14.4s +mla v18.4S, v19.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v19.4s, v17.4s, v18.4s +mla v15.4S, v13.4S, v31.s[0] +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v13.4s, v11.4s, v15.4s +mla v2.4S, v14.4S, v31.s[0] +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v27.s[0] +mul v20.4S, v20.4S,v28.s[0] +sub v14.4s, v8.4s, v2.4s +mla v12.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v18.4s, v9.4s, v12.4s +mla v20.4S, v15.4S, v31.s[0] +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +sub v15.4s, v1.4s, v20.4s +mla v0.4S, v2.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +sub v2.4s, v22.4s, v0.4s +mla v3.4S, v12.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v12.4s, v16.4s, v3.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v11.4s, v10.4s +mla v17.4S, v0.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v27.s[3] +mul v19.4S, v19.4S,v28.s[3] +sub v0.4s, v8.4s, v17.4s +mla v21.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v17.4s +sqrdmulh v17.4S, v1.4S, v25.s[0] +mul v1.4S, v1.4S,v26.s[0] +sub v3.4s, v13.4s, v21.4s +mla v19.4S, v10.4S, v31.s[0] +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v25.s[1] +mul v15.4S, v15.4S,v26.s[1] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v17.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v17.4s, v9.4s, 
v1.4s +mla v15.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +sub v21.4s, v18.4s, v15.4s +mla v16.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +str q9, [x0, #0] +sqrdmulh v9.4S, v8.4S, v23.s[0] +str q17, [x0, #64] +mul v8.4S, v8.4S,v24.s[0] +ldr q17, [x0, #784] +sub v15.4s, v22.4s, v16.4s +ldr q19, [x0, #848] +mla v12.4S, v1.4S, v31.s[0] +add v22.4s, v22.4s, v16.4s +str q18, [x0, #128] +sqrdmulh v18.4S, v0.4S, v23.s[1] +str q21, [x0, #192] +mul v0.4S, v0.4S,v24.s[1] +ldr q21, [x0, #912] +sub v16.4s, v2.4s, v12.4s +ldr q1, [x0, #976] +mla v8.4S, v9.4S, v31.s[0] +add v2.4s, v2.4s, v12.4s +str q22, [x0, #256] +sqrdmulh v22.4S, v14.4S, v23.s[2] +str q15, [x0, #320] +mul v14.4S, v14.4S,v24.s[2] +ldr q15, [x0, #272] +sub v12.4s, v11.4s, v8.4s +ldr q9, [x0, #336] +mla v0.4S, v18.4S, v31.s[0] +add v11.4s, v11.4s, v8.4s +str q2, [x0, #384] +sqrdmulh v2.4S, v10.4S, v23.s[3] +str q16, [x0, #448] +mul v10.4S, v10.4S,v24.s[3] +ldr q16, [x0, #400] +sub v8.4s, v20.4s, v0.4s +ldr q18, [x0, #464] +mla v14.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v0.4s +str q11, [x0, #512] +sqrdmulh v11.4S, v17.4S, v29.s[0] +str q12, [x0, #576] +ldr q12, [x0, #528] +mul v17.4S, v17.4S,v30.s[0] +ldr q0, [x0, #592] +sub v22.4s, v13.4s, v14.4s +mla v10.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v14.4s +str q20, [x0, #640] +sqrdmulh v20.4S, v19.4S, v29.s[0] +str q8, [x0, #704] +ldr q8, [x0, #656] +mul v19.4S, v19.4S,v30.s[0] +ldr q14, [x0, #720] +sub v2.4s, v3.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v3.4s, v3.4s, v10.4s +str q13, [x0, #768] +sqrdmulh v13.4S, v21.4S, v29.s[0] +str q22, [x0, #832] +mul v21.4S, v21.4S,v30.s[0] +ldr q22, [x0, #16] +sub v10.4s, v15.4s, v17.4s +mla v19.4S, v20.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +str q3, [x0, #896] +sqrdmulh v3.4S, v1.4S, v29.s[0] +str q2, [x0, #960] +mul v1.4S, v1.4S,v30.s[0] +ldr q2, [x0, #80] +sub v17.4s, v9.4s, v19.4s +mla v21.4S, v13.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s 
+sqrdmulh v19.4S, v12.4S, v29.s[0] +ldr q13, [x0, #144] +mul v12.4S, v12.4S,v30.s[0] +sub v20.4s, v16.4s, v21.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v29.s[0] +ldr q3, [x0, #208] +mul v0.4S, v0.4S,v30.s[0] +sub v11.4s, v18.4s, v1.4s +mla v12.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v19.4s, v22.4s, v12.4s +mla v0.4S, v21.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v2.4s, v0.4s +mla v8.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v1.4s, v13.4s, v8.4s +mla v14.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v12.4s, v3.4s, v14.4s +mla v16.4S, v0.4S, v31.s[0] +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +sub v0.4s, v13.4s, v16.4s +mla v18.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v9.4S, v29.s[1] +mul v9.4S, v9.4S,v30.s[1] +sub v8.4s, v3.4s, v18.4s +mla v15.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v14.4s, v22.4s, v15.4s +mla v9.4S, v16.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v16.4s, v2.4s, v9.4s +mla v20.4S, v18.4S, v31.s[0] +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v20.4s +mla v11.4S, v15.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v15.4s, v12.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +sub v9.4s, v19.4s, v10.4s +mla v17.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v27.s[0] +mul v3.4S, 
v3.4S,v28.s[0] +sub v20.4s, v21.4s, v17.4s +mla v13.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v11.4s, v22.4s, v13.4s +mla v3.4S, v10.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v10.4s, v2.4s, v3.4s +mla v0.4S, v17.4S, v31.s[0] +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v17.4s, v14.4s, v0.4s +mla v8.4S, v13.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +sub v13.4s, v16.4s, v8.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v3.4s, v19.4s, v1.4s +mla v12.4S, v0.4S, v31.s[0] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v0.4s, v21.4s, v12.4s +mla v18.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v25.s[1] +mul v10.4S, v10.4S,v26.s[1] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v12.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v12.4s, v22.4s, v2.4s +mla v10.4S, v18.4S, v31.s[0] +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v13.4S, v25.s[3] +mul v13.4S, v13.4S,v26.s[3] +sub v18.4s, v11.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +str q22, [x0, #16] +sqrdmulh v22.4S, v21.4S, v23.s[0] +str q12, [x0, #80] +mul v21.4S, v21.4S,v24.s[0] +sub v12.4s, v14.4s, v16.4s +mla v13.4S, v2.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +str q11, [x0, #144] +sqrdmulh v11.4S, v0.4S, v23.s[1] +str q18, [x0, #208] +mul v0.4S, v0.4S,v24.s[1] +sub v18.4s, v17.4s, v13.4s +mla v21.4S, v22.4S, v31.s[0] +add v17.4s, v17.4s, v13.4s +str q14, [x0, #272] +sqrdmulh v14.4S, v20.4S, v23.s[2] +str q12, 
[x0, #336] +mul v20.4S, v20.4S,v24.s[2] +sub v12.4s, v19.4s, v21.4s +mla v0.4S, v11.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +str q17, [x0, #400] +sqrdmulh v17.4S, v1.4S, v23.s[3] +str q18, [x0, #464] +mul v1.4S, v1.4S,v24.s[3] +sub v18.4s, v3.4s, v0.4s +mla v20.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v0.4s +str q19, [x0, #528] +str q12, [x0, #592] +sub v12.4s, v9.4s, v20.4s +mla v1.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v20.4s +str q3, [x0, #656] +str q18, [x0, #720] +sub v18.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +str q9, [x0, #784] +str q12, [x0, #848] +str q8, [x0, #912] +str q18, [x0, #976] +ldr q4, [x0, #32] +ldr q5, [x0, #48] +ldr q6, [x0, #0] +ldr q7, [x0, #16] +ldr q15, [x0, #96] +ldr q10, [x0, #112] +ldr q2, [x0, #64] +ldr q16, [x0, #80] +ldr q22, [x0, #160] +ldr q13, [x0, #176] +ldr q11, [x0, #128] +ldr q21, [x0, #144] +ldr q14, [x0, #224] +ldr q0, [x0, #240] +ldr q19, [x0, #192] +ldr q17, [x0, #208] +ldr q20, [x17, #+128] +ldr q3, [x17, #+144] +ldr q1, [x17, #+256] +ldr q9, [x17, #+272] +ldr q12, [x17, #+384] +ldr q8, [x17, #+400] +ldr q18, [x17, #+512] +ldr q30, [x17, #+528] +sqrdmulh v29.4S, v4.4S, v3.s[0] +mul v4.4S, v4.4S,v20.s[0] +sqrdmulh v28.4S, v5.4S, v3.s[0] +mul v5.4S, v5.4S,v20.s[0] +mla v4.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v15.4S, v9.s[0] +mul v15.4S, v15.4S,v1.s[0] +mla v5.4S, v28.4S, v31.s[0] +sub v28.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +sqrdmulh v4.4S, v10.4S, v9.s[0] +mul v10.4S, v10.4S,v1.s[0] +mla v15.4S, v29.4S, v31.s[0] +sub v29.4s, v7.4s, v5.4s +add v7.4s, v7.4s, v5.4s +sqrdmulh v5.4S, v7.4S, v3.s[1] +mul v7.4S, v7.4S,v20.s[1] +mla v10.4S, v4.4S, v31.s[0] +sub v4.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +sqrdmulh v15.4S, v29.4S, v3.s[2] +mul v29.4S, v29.4S,v20.s[2] +mla v7.4S, v5.4S, v31.s[0] +sub v5.4s, v16.4s, v10.4s +add v16.4s, v16.4s, v10.4s +sqrdmulh v10.4S, v16.4S, v9.s[1] +mul v16.4S, v16.4S,v1.s[1] +mla v29.4S, v15.4S, v31.s[0] +sub v15.4s, v6.4s, v7.4s +add v6.4s, v6.4s, v7.4s +sqrdmulh v3.4S, v5.4S, 
v9.s[2] +mul v5.4S, v5.4S,v1.s[2] +mla v16.4S, v10.4S, v31.s[0] +sub v10.4s, v28.4s, v29.4s +add v28.4s, v28.4s, v29.4s +sqrdmulh v29.4S, v22.4S, v8.s[0] +mul v22.4S, v22.4S,v12.s[0] +mla v5.4S, v3.4S, v31.s[0] +sub v3.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v9.4S, v13.4S, v8.s[0] +mul v13.4S, v13.4S,v12.s[0] +mla v22.4S, v29.4S, v31.s[0] +sub v29.4s, v4.4s, v5.4s +add v4.4s, v4.4s, v5.4s +sqrdmulh v5.4S, v14.4S, v30.s[0] +mul v14.4S, v14.4S,v18.s[0] +mla v13.4S, v9.4S, v31.s[0] +sub v9.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v0.4S, v30.s[0] +mul v0.4S, v0.4S,v18.s[0] +mla v14.4S, v5.4S, v31.s[0] +sub v5.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v21.4S, v8.s[1] +mul v21.4S, v21.4S,v12.s[1] +mla v0.4S, v22.4S, v31.s[0] +sub v22.4s, v19.4s, v14.4s +add v19.4s, v19.4s, v14.4s +sqrdmulh v14.4S, v5.4S, v8.s[2] +mul v5.4S, v5.4S,v12.s[2] +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v0.4s +add v17.4s, v17.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v30.s[1] +mul v17.4S, v17.4S,v18.s[1] +mla v5.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v8.4S, v13.4S, v30.s[2] +mul v13.4S, v13.4S,v18.s[2] +mla v17.4S, v0.4S, v31.s[0] +sub v0.4s, v9.4s, v5.4s +add v9.4s, v9.4s, v5.4s +mla v13.4S, v8.4S, v31.s[0] +sub v8.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +sub v30.4s, v22.4s, v13.4s +add v22.4s, v22.4s, v13.4s +trn1 v13.4S, v6.4S, v15.4S +trn2 v18.4S, v6.4S, v15.4S +trn1 v17.4S, v28.4S, v10.4S +trn2 v5.4S, v28.4S, v10.4S +trn2 v28.2D, v13.2D, v17.2D +trn2 v10.2D, v18.2D, v5.2D +trn1 v6.2D, v13.2D, v17.2D +trn1 v15.2D, v18.2D, v5.2D +trn1 v5.4S, v2.4S, v3.4S +trn2 v18.4S, v2.4S, v3.4S +trn1 v17.4S, v4.4S, v29.4S +trn2 v13.4S, v4.4S, v29.4S +trn2 v4.2D, v5.2D, v17.2D +trn2 v29.2D, v18.2D, v13.2D +trn1 v2.2D, v5.2D, v17.2D +trn1 v3.2D, v18.2D, v13.2D +trn1 v13.4S, v11.4S, v14.4S +trn2 v18.4S, v11.4S, v14.4S +trn1 v17.4S, v9.4S, v0.4S +trn2 v5.4S, v9.4S, v0.4S +trn2 v9.2D, v13.2D, 
v17.2D +trn2 v0.2D, v18.2D, v5.2D +trn1 v11.2D, v13.2D, v17.2D +trn1 v14.2D, v18.2D, v5.2D +trn1 v5.4S, v19.4S, v8.4S +trn2 v18.4S, v19.4S, v8.4S +trn1 v17.4S, v22.4S, v30.4S +trn2 v13.4S, v22.4S, v30.4S +trn2 v22.2D, v5.2D, v17.2D +trn2 v30.2D, v18.2D, v13.2D +trn1 v19.2D, v5.2D, v17.2D +trn1 v8.2D, v18.2D, v13.2D +ldr q13, [x17, #+160] +ldr q18, [x17, #+176] +sqrdmulh v17.4S, v28.4S, v18.4S +mul v28.4S, v28.4S,v13.4S +sqrdmulh v5.4S, v10.4S, v18.4S +mul v10.4S, v10.4S,v13.4S +mla v28.4S, v17.4S, v31.s[0] +ldr q17, [x17, #+288] +ldr q18, [x17, #+304] +sqrdmulh v13.4S, v4.4S, v18.4S +mul v4.4S, v4.4S,v17.4S +mla v10.4S, v5.4S, v31.s[0] +sub v5.4s, v6.4s, v28.4s +add v6.4s, v6.4s, v28.4s +sqrdmulh v28.4S, v29.4S, v18.4S +mul v29.4S, v29.4S,v17.4S +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +ldr q10, [x17, #+192] +ldr q18, [x17, #+208] +sqrdmulh v17.4S, v15.4S, v18.4S +mul v15.4S, v15.4S,v10.4S +mla v29.4S, v28.4S, v31.s[0] +sub v28.4s, v2.4s, v4.4s +add v2.4s, v2.4s, v4.4s +ldr q4, [x17, #+224] +ldr q18, [x17, #+240] +sqrdmulh v10.4S, v13.4S, v18.4S +mul v13.4S, v13.4S,v4.4S +mla v15.4S, v17.4S, v31.s[0] +sub v17.4s, v3.4s, v29.4s +add v3.4s, v3.4s, v29.4s +ldr q29, [x17, #+320] +ldr q18, [x17, #+336] +sqrdmulh v4.4S, v3.4S, v18.4S +mul v3.4S, v3.4S,v29.4S +mla v13.4S, v10.4S, v31.s[0] +sub v10.4s, v6.4s, v15.4s +add v6.4s, v6.4s, v15.4s +ldr q15, [x17, #+352] +ldr q18, [x17, #+368] +sqrdmulh v29.4S, v17.4S, v18.4S +mul v17.4S, v17.4S,v15.4S +mla v3.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v13.4s +add v5.4s, v5.4s, v13.4s +mla v17.4S, v29.4S, v31.s[0] +sub v29.4s, v2.4s, v3.4s +add v2.4s, v2.4s, v3.4s +sub v3.4s, v28.4s, v17.4s +add v28.4s, v28.4s, v17.4s +str q6, [x0, #0] +str q10, [x0, #16] +str q5, [x0, #32] +str q4, [x0, #48] +str q2, [x0, #64] +str q29, [x0, #80] +str q28, [x0, #96] +str q3, [x0, #112] +ldr q3, [x17, #+416] +ldr q28, [x17, #+432] +sqrdmulh v29.4S, v9.4S, v28.4S +mul v9.4S, v9.4S,v3.4S +sqrdmulh v2.4S, 
v0.4S, v28.4S +mul v0.4S, v0.4S,v3.4S +mla v9.4S, v29.4S, v31.s[0] +ldr q29, [x17, #+544] +ldr q28, [x17, #+560] +sqrdmulh v3.4S, v22.4S, v28.4S +mul v22.4S, v22.4S,v29.4S +mla v0.4S, v2.4S, v31.s[0] +sub v2.4s, v11.4s, v9.4s +add v11.4s, v11.4s, v9.4s +sqrdmulh v9.4S, v30.4S, v28.4S +mul v30.4S, v30.4S,v29.4S +mla v22.4S, v3.4S, v31.s[0] +sub v3.4s, v14.4s, v0.4s +add v14.4s, v14.4s, v0.4s +ldr q0, [x17, #+448] +ldr q28, [x17, #+464] +sqrdmulh v29.4S, v14.4S, v28.4S +mul v14.4S, v14.4S,v0.4S +mla v30.4S, v9.4S, v31.s[0] +sub v9.4s, v19.4s, v22.4s +add v19.4s, v19.4s, v22.4s +ldr q22, [x17, #+480] +ldr q28, [x17, #+496] +sqrdmulh v0.4S, v3.4S, v28.4S +mul v3.4S, v3.4S,v22.4S +mla v14.4S, v29.4S, v31.s[0] +sub v29.4s, v8.4s, v30.4s +add v8.4s, v8.4s, v30.4s +ldr q30, [x17, #+576] +ldr q28, [x17, #+592] +sqrdmulh v22.4S, v8.4S, v28.4S +mul v8.4S, v8.4S,v30.4S +mla v3.4S, v0.4S, v31.s[0] +sub v0.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +ldr q14, [x17, #+608] +ldr q28, [x17, #+624] +sqrdmulh v30.4S, v29.4S, v28.4S +mul v29.4S, v29.4S,v14.4S +mla v8.4S, v22.4S, v31.s[0] +sub v22.4s, v2.4s, v3.4s +add v2.4s, v2.4s, v3.4s +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v19.4s, v8.4s +add v19.4s, v19.4s, v8.4s +sub v8.4s, v9.4s, v29.4s +add v9.4s, v9.4s, v29.4s +str q11, [x0, #128] +str q0, [x0, #144] +str q2, [x0, #160] +str q22, [x0, #176] +str q19, [x0, #192] +str q30, [x0, #208] +str q9, [x0, #224] +str q8, [x0, #240] +ldr q8, [x0, #288] +ldr q9, [x0, #304] +ldr q30, [x0, #256] +ldr q19, [x0, #272] +ldr q22, [x0, #352] +ldr q2, [x0, #368] +ldr q0, [x0, #320] +ldr q11, [x0, #336] +ldr q29, [x0, #416] +ldr q3, [x0, #432] +ldr q28, [x0, #384] +ldr q14, [x0, #400] +ldr q4, [x0, #480] +ldr q5, [x0, #496] +ldr q10, [x0, #448] +ldr q6, [x0, #464] +ldr q17, [x17, #+640] +ldr q13, [x17, #+656] +ldr q18, [x17, #+768] +ldr q15, [x17, #+784] +ldr q12, [x17, #+896] +ldr q21, [x17, #+912] +ldr q1, [x17, #+1024] +ldr q16, [x17, #+1040] +sqrdmulh v20.4S, v8.4S, v13.s[0] +mul v8.4S, 
v8.4S,v17.s[0] +sqrdmulh v7.4S, v9.4S, v13.s[0] +mul v9.4S, v9.4S,v17.s[0] +mla v8.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v22.4S, v15.s[0] +mul v22.4S, v22.4S,v18.s[0] +mla v9.4S, v7.4S, v31.s[0] +sub v7.4s, v30.4s, v8.4s +add v30.4s, v30.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v15.s[0] +mul v2.4S, v2.4S,v18.s[0] +mla v22.4S, v20.4S, v31.s[0] +sub v20.4s, v19.4s, v9.4s +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v19.4S, v13.s[1] +mul v19.4S, v19.4S,v17.s[1] +mla v2.4S, v8.4S, v31.s[0] +sub v8.4s, v0.4s, v22.4s +add v0.4s, v0.4s, v22.4s +sqrdmulh v22.4S, v20.4S, v13.s[2] +mul v20.4S, v20.4S,v17.s[2] +mla v19.4S, v9.4S, v31.s[0] +sub v9.4s, v11.4s, v2.4s +add v11.4s, v11.4s, v2.4s +sqrdmulh v2.4S, v11.4S, v15.s[1] +mul v11.4S, v11.4S,v18.s[1] +mla v20.4S, v22.4S, v31.s[0] +sub v22.4s, v30.4s, v19.4s +add v30.4s, v30.4s, v19.4s +sqrdmulh v13.4S, v9.4S, v15.s[2] +mul v9.4S, v9.4S,v18.s[2] +mla v11.4S, v2.4S, v31.s[0] +sub v2.4s, v7.4s, v20.4s +add v7.4s, v7.4s, v20.4s +sqrdmulh v20.4S, v29.4S, v21.s[0] +mul v29.4S, v29.4S,v12.s[0] +mla v9.4S, v13.4S, v31.s[0] +sub v13.4s, v0.4s, v11.4s +add v0.4s, v0.4s, v11.4s +sqrdmulh v15.4S, v3.4S, v21.s[0] +mul v3.4S, v3.4S,v12.s[0] +mla v29.4S, v20.4S, v31.s[0] +sub v20.4s, v8.4s, v9.4s +add v8.4s, v8.4s, v9.4s +sqrdmulh v9.4S, v4.4S, v16.s[0] +mul v4.4S, v4.4S,v1.s[0] +mla v3.4S, v15.4S, v31.s[0] +sub v15.4s, v28.4s, v29.4s +add v28.4s, v28.4s, v29.4s +sqrdmulh v29.4S, v5.4S, v16.s[0] +mul v5.4S, v5.4S,v1.s[0] +mla v4.4S, v9.4S, v31.s[0] +sub v9.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v21.s[1] +mul v14.4S, v14.4S,v12.s[1] +mla v5.4S, v29.4S, v31.s[0] +sub v29.4s, v10.4s, v4.4s +add v10.4s, v10.4s, v4.4s +sqrdmulh v4.4S, v9.4S, v21.s[2] +mul v9.4S, v9.4S,v12.s[2] +mla v14.4S, v3.4S, v31.s[0] +sub v3.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v6.4S, v16.s[1] +mul v6.4S, v6.4S,v1.s[1] +mla v9.4S, v4.4S, v31.s[0] +sub v4.4s, v28.4s, v14.4s +add v28.4s, v28.4s, v14.4s +sqrdmulh v21.4S, v3.4S, 
v16.s[2] +mul v3.4S, v3.4S,v1.s[2] +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v15.4s, v9.4s +add v15.4s, v15.4s, v9.4s +mla v3.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v6.4s +add v10.4s, v10.4s, v6.4s +sub v16.4s, v29.4s, v3.4s +add v29.4s, v29.4s, v3.4s +trn1 v3.4S, v30.4S, v22.4S +trn2 v1.4S, v30.4S, v22.4S +trn1 v6.4S, v7.4S, v2.4S +trn2 v9.4S, v7.4S, v2.4S +trn2 v7.2D, v3.2D, v6.2D +trn2 v2.2D, v1.2D, v9.2D +trn1 v30.2D, v3.2D, v6.2D +trn1 v22.2D, v1.2D, v9.2D +trn1 v9.4S, v0.4S, v13.4S +trn2 v1.4S, v0.4S, v13.4S +trn1 v6.4S, v8.4S, v20.4S +trn2 v3.4S, v8.4S, v20.4S +trn2 v8.2D, v9.2D, v6.2D +trn2 v20.2D, v1.2D, v3.2D +trn1 v0.2D, v9.2D, v6.2D +trn1 v13.2D, v1.2D, v3.2D +trn1 v3.4S, v28.4S, v4.4S +trn2 v1.4S, v28.4S, v4.4S +trn1 v6.4S, v15.4S, v5.4S +trn2 v9.4S, v15.4S, v5.4S +trn2 v15.2D, v3.2D, v6.2D +trn2 v5.2D, v1.2D, v9.2D +trn1 v28.2D, v3.2D, v6.2D +trn1 v4.2D, v1.2D, v9.2D +trn1 v9.4S, v10.4S, v21.4S +trn2 v1.4S, v10.4S, v21.4S +trn1 v6.4S, v29.4S, v16.4S +trn2 v3.4S, v29.4S, v16.4S +trn2 v29.2D, v9.2D, v6.2D +trn2 v16.2D, v1.2D, v3.2D +trn1 v10.2D, v9.2D, v6.2D +trn1 v21.2D, v1.2D, v3.2D +ldr q3, [x17, #+672] +ldr q1, [x17, #+688] +sqrdmulh v6.4S, v7.4S, v1.4S +mul v7.4S, v7.4S,v3.4S +sqrdmulh v9.4S, v2.4S, v1.4S +mul v2.4S, v2.4S,v3.4S +mla v7.4S, v6.4S, v31.s[0] +ldr q6, [x17, #+800] +ldr q1, [x17, #+816] +sqrdmulh v3.4S, v8.4S, v1.4S +mul v8.4S, v8.4S,v6.4S +mla v2.4S, v9.4S, v31.s[0] +sub v9.4s, v30.4s, v7.4s +add v30.4s, v30.4s, v7.4s +sqrdmulh v7.4S, v20.4S, v1.4S +mul v20.4S, v20.4S,v6.4S +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v22.4s, v2.4s +add v22.4s, v22.4s, v2.4s +ldr q2, [x17, #+704] +ldr q1, [x17, #+720] +sqrdmulh v6.4S, v22.4S, v1.4S +mul v22.4S, v22.4S,v2.4S +mla v20.4S, v7.4S, v31.s[0] +sub v7.4s, v0.4s, v8.4s +add v0.4s, v0.4s, v8.4s +ldr q8, [x17, #+736] +ldr q1, [x17, #+752] +sqrdmulh v2.4S, v3.4S, v1.4S +mul v3.4S, v3.4S,v8.4S +mla v22.4S, v6.4S, v31.s[0] +sub v6.4s, v13.4s, v20.4s +add v13.4s, v13.4s, v20.4s +ldr q20, [x17, #+832] 
+ldr q1, [x17, #+848] +sqrdmulh v8.4S, v13.4S, v1.4S +mul v13.4S, v13.4S,v20.4S +mla v3.4S, v2.4S, v31.s[0] +sub v2.4s, v30.4s, v22.4s +add v30.4s, v30.4s, v22.4s +ldr q22, [x17, #+864] +ldr q1, [x17, #+880] +sqrdmulh v20.4S, v6.4S, v1.4S +mul v6.4S, v6.4S,v22.4S +mla v13.4S, v8.4S, v31.s[0] +sub v8.4s, v9.4s, v3.4s +add v9.4s, v9.4s, v3.4s +mla v6.4S, v20.4S, v31.s[0] +sub v20.4s, v0.4s, v13.4s +add v0.4s, v0.4s, v13.4s +sub v13.4s, v7.4s, v6.4s +add v7.4s, v7.4s, v6.4s +str q30, [x0, #256] +str q2, [x0, #272] +str q9, [x0, #288] +str q8, [x0, #304] +str q0, [x0, #320] +str q20, [x0, #336] +str q7, [x0, #352] +str q13, [x0, #368] +ldr q13, [x17, #+928] +ldr q7, [x17, #+944] +sqrdmulh v20.4S, v15.4S, v7.4S +mul v15.4S, v15.4S,v13.4S +sqrdmulh v0.4S, v5.4S, v7.4S +mul v5.4S, v5.4S,v13.4S +mla v15.4S, v20.4S, v31.s[0] +ldr q20, [x17, #+1056] +ldr q7, [x17, #+1072] +sqrdmulh v13.4S, v29.4S, v7.4S +mul v29.4S, v29.4S,v20.4S +mla v5.4S, v0.4S, v31.s[0] +sub v0.4s, v28.4s, v15.4s +add v28.4s, v28.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v7.4S +mul v16.4S, v16.4S,v20.4S +mla v29.4S, v13.4S, v31.s[0] +sub v13.4s, v4.4s, v5.4s +add v4.4s, v4.4s, v5.4s +ldr q5, [x17, #+960] +ldr q7, [x17, #+976] +sqrdmulh v20.4S, v4.4S, v7.4S +mul v4.4S, v4.4S,v5.4S +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v10.4s, v29.4s +add v10.4s, v10.4s, v29.4s +ldr q29, [x17, #+992] +ldr q7, [x17, #+1008] +sqrdmulh v5.4S, v13.4S, v7.4S +mul v13.4S, v13.4S,v29.4S +mla v4.4S, v20.4S, v31.s[0] +sub v20.4s, v21.4s, v16.4s +add v21.4s, v21.4s, v16.4s +ldr q16, [x17, #+1088] +ldr q7, [x17, #+1104] +sqrdmulh v29.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v16.4S +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v28.4s, v4.4s +add v28.4s, v28.4s, v4.4s +ldr q4, [x17, #+1120] +ldr q7, [x17, #+1136] +sqrdmulh v16.4S, v20.4S, v7.4S +mul v20.4S, v20.4S,v4.4S +mla v21.4S, v29.4S, v31.s[0] +sub v29.4s, v0.4s, v13.4s +add v0.4s, v0.4s, v13.4s +mla v20.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sub 
v21.4s, v15.4s, v20.4s +add v15.4s, v15.4s, v20.4s +str q28, [x0, #384] +str q5, [x0, #400] +str q0, [x0, #416] +str q29, [x0, #432] +str q10, [x0, #448] +str q16, [x0, #464] +str q15, [x0, #480] +str q21, [x0, #496] +ldr q21, [x0, #544] +ldr q15, [x0, #560] +ldr q16, [x0, #512] +ldr q10, [x0, #528] +ldr q29, [x0, #608] +ldr q0, [x0, #624] +ldr q5, [x0, #576] +ldr q28, [x0, #592] +ldr q20, [x0, #672] +ldr q13, [x0, #688] +ldr q7, [x0, #640] +ldr q4, [x0, #656] +ldr q8, [x0, #736] +ldr q9, [x0, #752] +ldr q2, [x0, #704] +ldr q30, [x0, #720] +ldr q6, [x17, #+1152] +ldr q3, [x17, #+1168] +ldr q1, [x17, #+1280] +ldr q22, [x17, #+1296] +ldr q12, [x17, #+1408] +ldr q14, [x17, #+1424] +ldr q18, [x17, #+1536] +ldr q11, [x17, #+1552] +sqrdmulh v17.4S, v21.4S, v3.s[0] +mul v21.4S, v21.4S,v6.s[0] +sqrdmulh v19.4S, v15.4S, v3.s[0] +mul v15.4S, v15.4S,v6.s[0] +mla v21.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v29.4S, v22.s[0] +mul v29.4S, v29.4S,v1.s[0] +mla v15.4S, v19.4S, v31.s[0] +sub v19.4s, v16.4s, v21.4s +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v22.s[0] +mul v0.4S, v0.4S,v1.s[0] +mla v29.4S, v17.4S, v31.s[0] +sub v17.4s, v10.4s, v15.4s +add v10.4s, v10.4s, v15.4s +sqrdmulh v15.4S, v10.4S, v3.s[1] +mul v10.4S, v10.4S,v6.s[1] +mla v0.4S, v21.4S, v31.s[0] +sub v21.4s, v5.4s, v29.4s +add v5.4s, v5.4s, v29.4s +sqrdmulh v29.4S, v17.4S, v3.s[2] +mul v17.4S, v17.4S,v6.s[2] +mla v10.4S, v15.4S, v31.s[0] +sub v15.4s, v28.4s, v0.4s +add v28.4s, v28.4s, v0.4s +sqrdmulh v0.4S, v28.4S, v22.s[1] +mul v28.4S, v28.4S,v1.s[1] +mla v17.4S, v29.4S, v31.s[0] +sub v29.4s, v16.4s, v10.4s +add v16.4s, v16.4s, v10.4s +sqrdmulh v3.4S, v15.4S, v22.s[2] +mul v15.4S, v15.4S,v1.s[2] +mla v28.4S, v0.4S, v31.s[0] +sub v0.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +sqrdmulh v17.4S, v20.4S, v14.s[0] +mul v20.4S, v20.4S,v12.s[0] +mla v15.4S, v3.4S, v31.s[0] +sub v3.4s, v5.4s, v28.4s +add v5.4s, v5.4s, v28.4s +sqrdmulh v22.4S, v13.4S, v14.s[0] +mul v13.4S, v13.4S,v12.s[0] +mla v20.4S, v17.4S, 
v31.s[0] +sub v17.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +sqrdmulh v15.4S, v8.4S, v11.s[0] +mul v8.4S, v8.4S,v18.s[0] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v7.4s, v20.4s +add v7.4s, v7.4s, v20.4s +sqrdmulh v20.4S, v9.4S, v11.s[0] +mul v9.4S, v9.4S,v18.s[0] +mla v8.4S, v15.4S, v31.s[0] +sub v15.4s, v4.4s, v13.4s +add v4.4s, v4.4s, v13.4s +sqrdmulh v13.4S, v4.4S, v14.s[1] +mul v4.4S, v4.4S,v12.s[1] +mla v9.4S, v20.4S, v31.s[0] +sub v20.4s, v2.4s, v8.4s +add v2.4s, v2.4s, v8.4s +sqrdmulh v8.4S, v15.4S, v14.s[2] +mul v15.4S, v15.4S,v12.s[2] +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v30.4s, v9.4s +add v30.4s, v30.4s, v9.4s +sqrdmulh v9.4S, v30.4S, v11.s[1] +mul v30.4S, v30.4S,v18.s[1] +mla v15.4S, v8.4S, v31.s[0] +sub v8.4s, v7.4s, v4.4s +add v7.4s, v7.4s, v4.4s +sqrdmulh v14.4S, v13.4S, v11.s[2] +mul v13.4S, v13.4S,v18.s[2] +mla v30.4S, v9.4S, v31.s[0] +sub v9.4s, v22.4s, v15.4s +add v22.4s, v22.4s, v15.4s +mla v13.4S, v14.4S, v31.s[0] +sub v14.4s, v2.4s, v30.4s +add v2.4s, v2.4s, v30.4s +sub v11.4s, v20.4s, v13.4s +add v20.4s, v20.4s, v13.4s +trn1 v13.4S, v16.4S, v29.4S +trn2 v18.4S, v16.4S, v29.4S +trn1 v30.4S, v19.4S, v0.4S +trn2 v15.4S, v19.4S, v0.4S +trn2 v19.2D, v13.2D, v30.2D +trn2 v0.2D, v18.2D, v15.2D +trn1 v16.2D, v13.2D, v30.2D +trn1 v29.2D, v18.2D, v15.2D +trn1 v15.4S, v5.4S, v3.4S +trn2 v18.4S, v5.4S, v3.4S +trn1 v30.4S, v21.4S, v17.4S +trn2 v13.4S, v21.4S, v17.4S +trn2 v21.2D, v15.2D, v30.2D +trn2 v17.2D, v18.2D, v13.2D +trn1 v5.2D, v15.2D, v30.2D +trn1 v3.2D, v18.2D, v13.2D +trn1 v13.4S, v7.4S, v8.4S +trn2 v18.4S, v7.4S, v8.4S +trn1 v30.4S, v22.4S, v9.4S +trn2 v15.4S, v22.4S, v9.4S +trn2 v22.2D, v13.2D, v30.2D +trn2 v9.2D, v18.2D, v15.2D +trn1 v7.2D, v13.2D, v30.2D +trn1 v8.2D, v18.2D, v15.2D +trn1 v15.4S, v2.4S, v14.4S +trn2 v18.4S, v2.4S, v14.4S +trn1 v30.4S, v20.4S, v11.4S +trn2 v13.4S, v20.4S, v11.4S +trn2 v20.2D, v15.2D, v30.2D +trn2 v11.2D, v18.2D, v13.2D +trn1 v2.2D, v15.2D, v30.2D +trn1 v14.2D, v18.2D, v13.2D +ldr q13, [x17, #+1184] 
+ldr q18, [x17, #+1200] +sqrdmulh v30.4S, v19.4S, v18.4S +mul v19.4S, v19.4S,v13.4S +sqrdmulh v15.4S, v0.4S, v18.4S +mul v0.4S, v0.4S,v13.4S +mla v19.4S, v30.4S, v31.s[0] +ldr q30, [x17, #+1312] +ldr q18, [x17, #+1328] +sqrdmulh v13.4S, v21.4S, v18.4S +mul v21.4S, v21.4S,v30.4S +mla v0.4S, v15.4S, v31.s[0] +sub v15.4s, v16.4s, v19.4s +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v17.4S, v18.4S +mul v17.4S, v17.4S,v30.4S +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v29.4s, v0.4s +add v29.4s, v29.4s, v0.4s +ldr q0, [x17, #+1216] +ldr q18, [x17, #+1232] +sqrdmulh v30.4S, v29.4S, v18.4S +mul v29.4S, v29.4S,v0.4S +mla v17.4S, v19.4S, v31.s[0] +sub v19.4s, v5.4s, v21.4s +add v5.4s, v5.4s, v21.4s +ldr q21, [x17, #+1248] +ldr q18, [x17, #+1264] +sqrdmulh v0.4S, v13.4S, v18.4S +mul v13.4S, v13.4S,v21.4S +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v3.4s, v17.4s +add v3.4s, v3.4s, v17.4s +ldr q17, [x17, #+1344] +ldr q18, [x17, #+1360] +sqrdmulh v21.4S, v3.4S, v18.4S +mul v3.4S, v3.4S,v17.4S +mla v13.4S, v0.4S, v31.s[0] +sub v0.4s, v16.4s, v29.4s +add v16.4s, v16.4s, v29.4s +ldr q29, [x17, #+1376] +ldr q18, [x17, #+1392] +sqrdmulh v17.4S, v30.4S, v18.4S +mul v30.4S, v30.4S,v29.4S +mla v3.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v13.4s +add v15.4s, v15.4s, v13.4s +mla v30.4S, v17.4S, v31.s[0] +sub v17.4s, v5.4s, v3.4s +add v5.4s, v5.4s, v3.4s +sub v3.4s, v19.4s, v30.4s +add v19.4s, v19.4s, v30.4s +str q16, [x0, #512] +str q0, [x0, #528] +str q15, [x0, #544] +str q21, [x0, #560] +str q5, [x0, #576] +str q17, [x0, #592] +str q19, [x0, #608] +str q3, [x0, #624] +ldr q3, [x17, #+1440] +ldr q19, [x17, #+1456] +sqrdmulh v17.4S, v22.4S, v19.4S +mul v22.4S, v22.4S,v3.4S +sqrdmulh v5.4S, v9.4S, v19.4S +mul v9.4S, v9.4S,v3.4S +mla v22.4S, v17.4S, v31.s[0] +ldr q17, [x17, #+1568] +ldr q19, [x17, #+1584] +sqrdmulh v3.4S, v20.4S, v19.4S +mul v20.4S, v20.4S,v17.4S +mla v9.4S, v5.4S, v31.s[0] +sub v5.4s, v7.4s, v22.4s +add v7.4s, v7.4s, v22.4s +sqrdmulh v22.4S, v11.4S, v19.4S +mul v11.4S, 
v11.4S,v17.4S +mla v20.4S, v3.4S, v31.s[0] +sub v3.4s, v8.4s, v9.4s +add v8.4s, v8.4s, v9.4s +ldr q9, [x17, #+1472] +ldr q19, [x17, #+1488] +sqrdmulh v17.4S, v8.4S, v19.4S +mul v8.4S, v8.4S,v9.4S +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v2.4s, v20.4s +add v2.4s, v2.4s, v20.4s +ldr q20, [x17, #+1504] +ldr q19, [x17, #+1520] +sqrdmulh v9.4S, v3.4S, v19.4S +mul v3.4S, v3.4S,v20.4S +mla v8.4S, v17.4S, v31.s[0] +sub v17.4s, v14.4s, v11.4s +add v14.4s, v14.4s, v11.4s +ldr q11, [x17, #+1600] +ldr q19, [x17, #+1616] +sqrdmulh v20.4S, v14.4S, v19.4S +mul v14.4S, v14.4S,v11.4S +mla v3.4S, v9.4S, v31.s[0] +sub v9.4s, v7.4s, v8.4s +add v7.4s, v7.4s, v8.4s +ldr q8, [x17, #+1632] +ldr q19, [x17, #+1648] +sqrdmulh v11.4S, v17.4S, v19.4S +mul v17.4S, v17.4S,v8.4S +mla v14.4S, v20.4S, v31.s[0] +sub v20.4s, v5.4s, v3.4s +add v5.4s, v5.4s, v3.4s +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v2.4s, v14.4s +add v2.4s, v2.4s, v14.4s +sub v14.4s, v22.4s, v17.4s +add v22.4s, v22.4s, v17.4s +str q7, [x0, #640] +str q9, [x0, #656] +str q5, [x0, #672] +str q20, [x0, #688] +str q2, [x0, #704] +str q11, [x0, #720] +str q22, [x0, #736] +str q14, [x0, #752] +ldr q14, [x0, #800] +ldr q22, [x0, #816] +ldr q11, [x0, #768] +ldr q2, [x0, #784] +ldr q20, [x0, #864] +ldr q5, [x0, #880] +ldr q9, [x0, #832] +ldr q7, [x0, #848] +ldr q17, [x0, #928] +ldr q3, [x0, #944] +ldr q19, [x0, #896] +ldr q8, [x0, #912] +ldr q21, [x0, #992] +ldr q15, [x0, #1008] +ldr q0, [x0, #960] +ldr q16, [x0, #976] +ldr q30, [x17, #+1664] +ldr q13, [x17, #+1680] +ldr q18, [x17, #+1792] +ldr q29, [x17, #+1808] +ldr q12, [x17, #+1920] +ldr q4, [x17, #+1936] +ldr q1, [x17, #+2048] +ldr q28, [x17, #+2064] +sqrdmulh v6.4S, v14.4S, v13.s[0] +mul v14.4S, v14.4S,v30.s[0] +sqrdmulh v10.4S, v22.4S, v13.s[0] +mul v22.4S, v22.4S,v30.s[0] +mla v14.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v18.s[0] +mla v22.4S, v10.4S, v31.s[0] +sub v10.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v5.4S, 
v29.s[0] +mul v5.4S, v5.4S,v18.s[0] +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v2.4s, v22.4s +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v2.4S, v13.s[1] +mul v2.4S, v2.4S,v30.s[1] +mla v5.4S, v14.4S, v31.s[0] +sub v14.4s, v9.4s, v20.4s +add v9.4s, v9.4s, v20.4s +sqrdmulh v20.4S, v6.4S, v13.s[2] +mul v6.4S, v6.4S,v30.s[2] +mla v2.4S, v22.4S, v31.s[0] +sub v22.4s, v7.4s, v5.4s +add v7.4s, v7.4s, v5.4s +sqrdmulh v5.4S, v7.4S, v29.s[1] +mul v7.4S, v7.4S,v18.s[1] +mla v6.4S, v20.4S, v31.s[0] +sub v20.4s, v11.4s, v2.4s +add v11.4s, v11.4s, v2.4s +sqrdmulh v13.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v18.s[2] +mla v7.4S, v5.4S, v31.s[0] +sub v5.4s, v10.4s, v6.4s +add v10.4s, v10.4s, v6.4s +sqrdmulh v6.4S, v17.4S, v4.s[0] +mul v17.4S, v17.4S,v12.s[0] +mla v22.4S, v13.4S, v31.s[0] +sub v13.4s, v9.4s, v7.4s +add v9.4s, v9.4s, v7.4s +sqrdmulh v29.4S, v3.4S, v4.s[0] +mul v3.4S, v3.4S,v12.s[0] +mla v17.4S, v6.4S, v31.s[0] +sub v6.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v28.s[0] +mul v21.4S, v21.4S,v1.s[0] +mla v3.4S, v29.4S, v31.s[0] +sub v29.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +sqrdmulh v17.4S, v15.4S, v28.s[0] +mul v15.4S, v15.4S,v1.s[0] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v8.4s, v3.4s +add v8.4s, v8.4s, v3.4s +sqrdmulh v3.4S, v8.4S, v4.s[1] +mul v8.4S, v8.4S,v12.s[1] +mla v15.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v4.s[2] +mul v22.4S, v22.4S,v12.s[2] +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v16.4s, v15.4s +add v16.4s, v16.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v28.s[1] +mul v16.4S, v16.4S,v1.s[1] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v8.4s +add v19.4s, v19.4s, v8.4s +sqrdmulh v4.4S, v3.4S, v28.s[2] +mul v3.4S, v3.4S,v1.s[2] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v29.4s, v22.4s +add v29.4s, v29.4s, v22.4s +mla v3.4S, v4.4S, v31.s[0] +sub v4.4s, v0.4s, v16.4s +add v0.4s, v0.4s, v16.4s +sub v28.4s, v17.4s, v3.4s +add v17.4s, v17.4s, v3.4s +trn1 v3.4S, 
v11.4S, v20.4S +trn2 v1.4S, v11.4S, v20.4S +trn1 v16.4S, v10.4S, v5.4S +trn2 v22.4S, v10.4S, v5.4S +trn2 v10.2D, v3.2D, v16.2D +trn2 v5.2D, v1.2D, v22.2D +trn1 v11.2D, v3.2D, v16.2D +trn1 v20.2D, v1.2D, v22.2D +trn1 v22.4S, v9.4S, v13.4S +trn2 v1.4S, v9.4S, v13.4S +trn1 v16.4S, v14.4S, v6.4S +trn2 v3.4S, v14.4S, v6.4S +trn2 v14.2D, v22.2D, v16.2D +trn2 v6.2D, v1.2D, v3.2D +trn1 v9.2D, v22.2D, v16.2D +trn1 v13.2D, v1.2D, v3.2D +trn1 v3.4S, v19.4S, v21.4S +trn2 v1.4S, v19.4S, v21.4S +trn1 v16.4S, v29.4S, v15.4S +trn2 v22.4S, v29.4S, v15.4S +trn2 v29.2D, v3.2D, v16.2D +trn2 v15.2D, v1.2D, v22.2D +trn1 v19.2D, v3.2D, v16.2D +trn1 v21.2D, v1.2D, v22.2D +trn1 v22.4S, v0.4S, v4.4S +trn2 v1.4S, v0.4S, v4.4S +trn1 v16.4S, v17.4S, v28.4S +trn2 v3.4S, v17.4S, v28.4S +trn2 v17.2D, v22.2D, v16.2D +trn2 v28.2D, v1.2D, v3.2D +trn1 v0.2D, v22.2D, v16.2D +trn1 v4.2D, v1.2D, v3.2D +ldr q3, [x17, #+1696] +ldr q1, [x17, #+1712] +sqrdmulh v16.4S, v10.4S, v1.4S +mul v10.4S, v10.4S,v3.4S +sqrdmulh v22.4S, v5.4S, v1.4S +mul v5.4S, v5.4S,v3.4S +mla v10.4S, v16.4S, v31.s[0] +ldr q16, [x17, #+1824] +ldr q1, [x17, #+1840] +sqrdmulh v3.4S, v14.4S, v1.4S +mul v14.4S, v14.4S,v16.4S +mla v5.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v10.4s +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v6.4S, v1.4S +mul v6.4S, v6.4S,v16.4S +mla v14.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v5.4s +add v20.4s, v20.4s, v5.4s +ldr q5, [x17, #+1728] +ldr q1, [x17, #+1744] +sqrdmulh v16.4S, v20.4S, v1.4S +mul v20.4S, v20.4S,v5.4S +mla v6.4S, v10.4S, v31.s[0] +sub v10.4s, v9.4s, v14.4s +add v9.4s, v9.4s, v14.4s +ldr q14, [x17, #+1760] +ldr q1, [x17, #+1776] +sqrdmulh v5.4S, v3.4S, v1.4S +mul v3.4S, v3.4S,v14.4S +mla v20.4S, v16.4S, v31.s[0] +sub v16.4s, v13.4s, v6.4s +add v13.4s, v13.4s, v6.4s +ldr q6, [x17, #+1856] +ldr q1, [x17, #+1872] +sqrdmulh v14.4S, v13.4S, v1.4S +mul v13.4S, v13.4S,v6.4S +mla v3.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v20.4s +add v11.4s, v11.4s, v20.4s +ldr q20, [x17, #+1888] +ldr q1, [x17, #+1904] 
+sqrdmulh v6.4S, v16.4S, v1.4S +mul v16.4S, v16.4S,v20.4S +mla v13.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v3.4s +add v22.4s, v22.4s, v3.4s +mla v16.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v13.4s +add v9.4s, v9.4s, v13.4s +sub v13.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +str q11, [x0, #768] +str q5, [x0, #784] +str q22, [x0, #800] +str q14, [x0, #816] +str q9, [x0, #832] +str q6, [x0, #848] +str q10, [x0, #864] +str q13, [x0, #880] +ldr q13, [x17, #+1952] +ldr q10, [x17, #+1968] +sqrdmulh v6.4S, v29.4S, v10.4S +mul v29.4S, v29.4S,v13.4S +sqrdmulh v9.4S, v15.4S, v10.4S +mul v15.4S, v15.4S,v13.4S +mla v29.4S, v6.4S, v31.s[0] +ldr q6, [x17, #+2080] +ldr q10, [x17, #+2096] +sqrdmulh v13.4S, v17.4S, v10.4S +mul v17.4S, v17.4S,v6.4S +mla v15.4S, v9.4S, v31.s[0] +sub v9.4s, v19.4s, v29.4s +add v19.4s, v19.4s, v29.4s +sqrdmulh v29.4S, v28.4S, v10.4S +mul v28.4S, v28.4S,v6.4S +mla v17.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +ldr q15, [x17, #+1984] +ldr q10, [x17, #+2000] +sqrdmulh v6.4S, v21.4S, v10.4S +mul v21.4S, v21.4S,v15.4S +mla v28.4S, v29.4S, v31.s[0] +sub v29.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +ldr q17, [x17, #+2016] +ldr q10, [x17, #+2032] +sqrdmulh v15.4S, v13.4S, v10.4S +mul v13.4S, v13.4S,v17.4S +mla v21.4S, v6.4S, v31.s[0] +sub v6.4s, v4.4s, v28.4s +add v4.4s, v4.4s, v28.4s +ldr q28, [x17, #+2112] +ldr q10, [x17, #+2128] +sqrdmulh v17.4S, v4.4S, v10.4S +mul v4.4S, v4.4S,v28.4S +mla v13.4S, v15.4S, v31.s[0] +sub v15.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +ldr q21, [x17, #+2144] +ldr q10, [x17, #+2160] +sqrdmulh v28.4S, v6.4S, v10.4S +mul v6.4S, v6.4S,v21.4S +mla v4.4S, v17.4S, v31.s[0] +sub v17.4s, v9.4s, v13.4s +add v9.4s, v9.4s, v13.4s +mla v6.4S, v28.4S, v31.s[0] +sub v28.4s, v0.4s, v4.4s +add v0.4s, v0.4s, v4.4s +sub v4.4s, v29.4s, v6.4s +add v29.4s, v29.4s, v6.4s +str q19, [x0, #896] +str q15, [x0, #912] +str q9, [x0, #928] +str q17, [x0, #944] +str q0, [x0, #960] +str q28, [x0, #976] +str q29, 
[x0, #992] +str q4, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z4_2.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z4_2.s new file mode 100644 index 0000000..302342b --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z4_2.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 
11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // 
Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 
1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 
23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 
19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 
// Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 
1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 +.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // 
Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 
7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // 
Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_3_z4_2 +.global _ntt_u32_full_neon_asm_var_4_4_3_z4_2 +ntt_u32_full_neon_asm_var_4_4_3_z4_2: +_ntt_u32_full_neon_asm_var_4_4_3_z4_2: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +sqrdmulh v2.4S, v22.4S, v29.s[0] +ldr q1, [x0, #544] +mul v22.4S, v22.4S,v30.s[0] +ldr q0, [x0, #608] +sqrdmulh v15.4S, v21.4S, v29.s[0] +ldr q14, [x0, #672] +mul v21.4S, v21.4S,v30.s[0] +ldr q13, [x0, #736] +mla v22.4S, v2.4S, v31.s[0] 
+sqrdmulh v2.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q12, [x0, #32] +sub v11.4s, v18.4s, v22.4s +mla v21.4S, v15.4S, v31.s[0] +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q15, [x0, #96] +sub v10.4s, v17.4s, v21.4s +mla v20.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v29.s[0] +ldr q2, [x0, #160] +mul v1.4S, v1.4S,v30.s[0] +sub v9.4s, v16.4s, v20.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v29.s[0] +ldr q22, [x0, #224] +mul v0.4S, v0.4S,v30.s[0] +sub v8.4s, v3.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v12.4s, v1.4s +mla v0.4S, v20.4S, v31.s[0] +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +sub v20.4s, v15.4s, v0.4s +mla v14.4S, v19.4S, v31.s[0] +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v19.4s, v2.4s, v14.4s +mla v13.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v1.4s, v22.4s, v13.4s +mla v16.4S, v0.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v0.4s, v2.4s, v16.4s +mla v3.4S, v14.4S, v31.s[0] +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v22.4s, v3.4s +mla v18.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v13.4s, v12.4s, v18.4s +mla v17.4S, v16.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v16.4s, v15.4s, v17.4s +mla v9.4S, v3.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v3.4s, v19.4s, v9.4s +mla v8.4S, v18.4S, v31.s[0] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, 
v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v8.4s +mla v11.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v17.4s, v21.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +sub v9.4s, v20.4s, v10.4s +mla v2.4S, v8.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v8.4s, v12.4s, v2.4s +mla v22.4S, v11.4S, v31.s[0] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v27.s[1] +mul v14.4S, v14.4S,v28.s[1] +sub v11.4s, v15.4s, v22.4s +mla v0.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v28.s[2] +sub v10.4s, v13.4s, v0.4s +mla v14.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v2.4s, v16.4s, v14.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +sub v22.4s, v21.4s, v19.4s +mla v1.4S, v0.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v0.4s, v20.4s, v1.4s +mla v3.4S, v14.4S, v31.s[0] +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +sub v14.4s, v17.4s, v3.4s +mla v18.4S, v19.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v25.s[1] +mul v11.4S, v11.4S,v26.s[1] +sub v19.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v1.4s, v12.4s, v15.4s +mla v11.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v25.s[3] +mul v2.4S, v2.4S,v26.s[3] +sub v3.4s, v8.4s, v11.4s +mla v16.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v11.4s +str q12, [x0, #32] +sqrdmulh v12.4S, v20.4S, v23.s[0] +str q1, [x0, #96] +mul v20.4S, v20.4S,v24.s[0] +ldr q1, [x0, #816] 
+sub v11.4s, v13.4s, v16.4s +ldr q18, [x0, #880] +mla v2.4S, v15.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +str q8, [x0, #160] +sqrdmulh v8.4S, v0.4S, v23.s[1] +str q3, [x0, #224] +mul v0.4S, v0.4S,v24.s[1] +ldr q3, [x0, #944] +sub v16.4s, v10.4s, v2.4s +ldr q15, [x0, #1008] +mla v20.4S, v12.4S, v31.s[0] +add v10.4s, v10.4s, v2.4s +str q13, [x0, #288] +sqrdmulh v13.4S, v9.4S, v23.s[2] +str q11, [x0, #352] +mul v9.4S, v9.4S,v24.s[2] +ldr q11, [x0, #304] +sub v2.4s, v21.4s, v20.4s +ldr q12, [x0, #368] +mla v0.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v20.4s +str q10, [x0, #416] +sqrdmulh v10.4S, v19.4S, v23.s[3] +str q16, [x0, #480] +mul v19.4S, v19.4S,v24.s[3] +ldr q16, [x0, #432] +sub v20.4s, v22.4s, v0.4s +ldr q8, [x0, #496] +mla v9.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +str q21, [x0, #544] +sqrdmulh v21.4S, v1.4S, v29.s[0] +str q2, [x0, #608] +ldr q2, [x0, #560] +mul v1.4S, v1.4S,v30.s[0] +ldr q0, [x0, #624] +sub v13.4s, v17.4s, v9.4s +mla v19.4S, v10.4S, v31.s[0] +add v17.4s, v17.4s, v9.4s +str q22, [x0, #672] +sqrdmulh v22.4S, v18.4S, v29.s[0] +str q20, [x0, #736] +ldr q20, [x0, #688] +mul v18.4S, v18.4S,v30.s[0] +ldr q9, [x0, #752] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +str q17, [x0, #800] +sqrdmulh v17.4S, v3.4S, v29.s[0] +str q13, [x0, #864] +mul v3.4S, v3.4S,v30.s[0] +ldr q13, [x0, #48] +sub v19.4s, v11.4s, v1.4s +mla v18.4S, v22.4S, v31.s[0] +add v11.4s, v11.4s, v1.4s +str q14, [x0, #928] +sqrdmulh v14.4S, v15.4S, v29.s[0] +str q10, [x0, #992] +mul v15.4S, v15.4S,v30.s[0] +ldr q10, [x0, #112] +sub v1.4s, v12.4s, v18.4s +mla v3.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v29.s[0] +ldr q17, [x0, #176] +mul v2.4S, v2.4S,v30.s[0] +sub v22.4s, v16.4s, v3.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v29.s[0] +ldr q14, [x0, #240] +mul v0.4S, v0.4S,v30.s[0] +sub v21.4s, v8.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v15.4s 
+sqrdmulh v15.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v18.4s, v13.4s, v2.4s +mla v0.4S, v3.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +sub v3.4s, v10.4s, v0.4s +mla v20.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v15.4s, v17.4s, v20.4s +mla v9.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v2.4s, v14.4s, v9.4s +mla v16.4S, v0.4S, v31.s[0] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v29.s[1] +mul v11.4S, v11.4S,v30.s[1] +sub v0.4s, v17.4s, v16.4s +mla v8.4S, v20.4S, v31.s[0] +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v29.s[1] +mul v12.4S, v12.4S,v30.s[1] +sub v20.4s, v14.4s, v8.4s +mla v11.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v9.4s, v13.4s, v11.4s +mla v12.4S, v16.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v16.4s, v10.4s, v12.4s +mla v22.4S, v8.4S, v31.s[0] +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v8.4s, v15.4s, v22.4s +mla v21.4S, v11.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +sub v11.4s, v2.4s, v21.4s +mla v19.4S, v12.4S, v31.s[0] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +sub v12.4s, v18.4s, v19.4s +mla v1.4S, v22.4S, v31.s[0] +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v22.4s, v3.4s, v1.4s +mla v17.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v21.4s, v13.4s, v17.4s +mla v14.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +sub v19.4s, v10.4s, v14.4s +mla 
v0.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v27.s[2] +mul v15.4S, v15.4S,v28.s[2] +sub v1.4s, v9.4s, v0.4s +mla v20.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v17.4s, v16.4s, v20.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v14.4s, v18.4s, v15.4s +mla v2.4S, v0.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[3] +mul v11.4S, v11.4S,v28.s[3] +sub v0.4s, v3.4s, v2.4s +mla v8.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +sub v20.4s, v12.4s, v8.4s +mla v11.4S, v15.4S, v31.s[0] +add v12.4s, v12.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v15.4s, v22.4s, v11.4s +mla v10.4S, v2.4S, v31.s[0] +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v2.4s, v13.4s, v10.4s +mla v19.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v25.s[3] +mul v17.4S, v17.4S,v26.s[3] +sub v8.4s, v21.4s, v19.4s +mla v16.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +str q13, [x0, #48] +sqrdmulh v13.4S, v3.4S, v23.s[0] +str q2, [x0, #112] +mul v3.4S, v3.4S,v24.s[0] +ldr q2, [x0, #768] +sub v19.4s, v9.4s, v16.4s +ldr q11, [x0, #832] +mla v17.4S, v10.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +str q21, [x0, #176] +sqrdmulh v21.4S, v0.4S, v23.s[1] +str q8, [x0, #240] +mul v0.4S, v0.4S,v24.s[1] +ldr q8, [x0, #896] +sub v16.4s, v1.4s, v17.4s +ldr q10, [x0, #960] +mla v3.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +str q9, [x0, #304] +sqrdmulh v9.4S, v22.4S, v23.s[2] +str q19, [x0, #368] +mul v22.4S, v22.4S,v24.s[2] +ldr q19, [x0, #256] +sub v17.4s, v18.4s, v3.4s +ldr q13, [x0, #320] +mla v0.4S, v21.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +str q1, [x0, #432] +sqrdmulh v1.4S, v15.4S, v23.s[3] +str q16, [x0, #496] +mul v15.4S, 
v15.4S,v24.s[3] +ldr q16, [x0, #384] +sub v3.4s, v14.4s, v0.4s +ldr q21, [x0, #448] +mla v22.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +str q18, [x0, #560] +sqrdmulh v18.4S, v2.4S, v29.s[0] +str q17, [x0, #624] +ldr q17, [x0, #512] +mul v2.4S, v2.4S,v30.s[0] +ldr q0, [x0, #576] +sub v9.4s, v12.4s, v22.4s +mla v15.4S, v1.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s +str q14, [x0, #688] +sqrdmulh v14.4S, v11.4S, v29.s[0] +str q3, [x0, #752] +ldr q3, [x0, #640] +mul v11.4S, v11.4S,v30.s[0] +ldr q22, [x0, #704] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +str q12, [x0, #816] +sqrdmulh v12.4S, v8.4S, v29.s[0] +str q9, [x0, #880] +mul v8.4S, v8.4S,v30.s[0] +ldr q9, [x0, #0] +sub v15.4s, v19.4s, v2.4s +mla v11.4S, v14.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +str q20, [x0, #944] +sqrdmulh v20.4S, v10.4S, v29.s[0] +str q1, [x0, #1008] +mul v10.4S, v10.4S,v30.s[0] +ldr q1, [x0, #64] +sub v2.4s, v13.4s, v11.4s +mla v8.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v29.s[0] +ldr q12, [x0, #128] +mul v17.4S, v17.4S,v30.s[0] +sub v14.4s, v16.4s, v8.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v29.s[0] +ldr q20, [x0, #192] +mul v0.4S, v0.4S,v30.s[0] +sub v18.4s, v21.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +sub v11.4s, v9.4s, v17.4s +mla v0.4S, v8.4S, v31.s[0] +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v8.4s, v1.4s, v0.4s +mla v3.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v10.4s, v12.4s, v3.4s +mla v22.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v17.4s, v20.4s, v22.4s +mla v16.4S, v0.4S, v31.s[0] +add v20.4s, v20.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +sub 
v0.4s, v12.4s, v16.4s +mla v21.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v3.4s, v20.4s, v21.4s +mla v19.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v22.4s, v9.4s, v19.4s +mla v13.4S, v16.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v16.4s, v1.4s, v13.4s +mla v14.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v21.4s, v10.4s, v14.4s +mla v18.4S, v19.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v19.4s, v17.4s, v18.4s +mla v15.4S, v13.4S, v31.s[0] +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v13.4s, v11.4s, v15.4s +mla v2.4S, v14.4S, v31.s[0] +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v27.s[0] +mul v20.4S, v20.4S,v28.s[0] +sub v14.4s, v8.4s, v2.4s +mla v12.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v18.4s, v9.4s, v12.4s +mla v20.4S, v15.4S, v31.s[0] +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +sub v15.4s, v1.4s, v20.4s +mla v0.4S, v2.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +sub v2.4s, v22.4s, v0.4s +mla v3.4S, v12.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v12.4s, v16.4s, v3.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v11.4s, v10.4s +mla v17.4S, v0.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v27.s[3] +mul v19.4S, v19.4S,v28.s[3] +sub v0.4s, v8.4s, v17.4s +mla v21.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v17.4s +sqrdmulh 
v17.4S, v1.4S, v25.s[0] +mul v1.4S, v1.4S,v26.s[0] +sub v3.4s, v13.4s, v21.4s +mla v19.4S, v10.4S, v31.s[0] +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v25.s[1] +mul v15.4S, v15.4S,v26.s[1] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v17.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v17.4s, v9.4s, v1.4s +mla v15.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +sub v21.4s, v18.4s, v15.4s +mla v16.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +str q9, [x0, #0] +sqrdmulh v9.4S, v8.4S, v23.s[0] +str q17, [x0, #64] +mul v8.4S, v8.4S,v24.s[0] +ldr q17, [x0, #784] +sub v15.4s, v22.4s, v16.4s +ldr q19, [x0, #848] +mla v12.4S, v1.4S, v31.s[0] +add v22.4s, v22.4s, v16.4s +str q18, [x0, #128] +sqrdmulh v18.4S, v0.4S, v23.s[1] +str q21, [x0, #192] +mul v0.4S, v0.4S,v24.s[1] +ldr q21, [x0, #912] +sub v16.4s, v2.4s, v12.4s +ldr q1, [x0, #976] +mla v8.4S, v9.4S, v31.s[0] +add v2.4s, v2.4s, v12.4s +str q22, [x0, #256] +sqrdmulh v22.4S, v14.4S, v23.s[2] +str q15, [x0, #320] +mul v14.4S, v14.4S,v24.s[2] +ldr q15, [x0, #272] +sub v12.4s, v11.4s, v8.4s +ldr q9, [x0, #336] +mla v0.4S, v18.4S, v31.s[0] +add v11.4s, v11.4s, v8.4s +str q2, [x0, #384] +sqrdmulh v2.4S, v10.4S, v23.s[3] +str q16, [x0, #448] +mul v10.4S, v10.4S,v24.s[3] +ldr q16, [x0, #400] +sub v8.4s, v20.4s, v0.4s +ldr q18, [x0, #464] +mla v14.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v0.4s +str q11, [x0, #512] +sqrdmulh v11.4S, v17.4S, v29.s[0] +str q12, [x0, #576] +ldr q12, [x0, #528] +mul v17.4S, v17.4S,v30.s[0] +ldr q0, [x0, #592] +sub v22.4s, v13.4s, v14.4s +mla v10.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v14.4s +str q20, [x0, #640] +sqrdmulh v20.4S, v19.4S, v29.s[0] +str q8, [x0, #704] +ldr q8, [x0, #656] +mul v19.4S, v19.4S,v30.s[0] +ldr q14, [x0, #720] +sub v2.4s, v3.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v3.4s, v3.4s, v10.4s +str q13, [x0, #768] +sqrdmulh v13.4S, v21.4S, 
v29.s[0] +str q22, [x0, #832] +mul v21.4S, v21.4S,v30.s[0] +ldr q22, [x0, #16] +sub v10.4s, v15.4s, v17.4s +mla v19.4S, v20.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +str q3, [x0, #896] +sqrdmulh v3.4S, v1.4S, v29.s[0] +str q2, [x0, #960] +mul v1.4S, v1.4S,v30.s[0] +ldr q2, [x0, #80] +sub v17.4s, v9.4s, v19.4s +mla v21.4S, v13.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v12.4S, v29.s[0] +ldr q13, [x0, #144] +mul v12.4S, v12.4S,v30.s[0] +sub v20.4s, v16.4s, v21.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v29.s[0] +ldr q3, [x0, #208] +mul v0.4S, v0.4S,v30.s[0] +sub v11.4s, v18.4s, v1.4s +mla v12.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v19.4s, v22.4s, v12.4s +mla v0.4S, v21.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v2.4s, v0.4s +mla v8.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v1.4s, v13.4s, v8.4s +mla v14.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v12.4s, v3.4s, v14.4s +mla v16.4S, v0.4S, v31.s[0] +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +sub v0.4s, v13.4s, v16.4s +mla v18.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v9.4S, v29.s[1] +mul v9.4S, v9.4S,v30.s[1] +sub v8.4s, v3.4s, v18.4s +mla v15.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v14.4s, v22.4s, v15.4s +mla v9.4S, v16.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v16.4s, v2.4s, v9.4s +mla v20.4S, v18.4S, v31.s[0] +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v20.4s +mla v11.4S, v15.4S, v31.s[0] +add v1.4s, 
v1.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v15.4s, v12.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +sub v9.4s, v19.4s, v10.4s +mla v17.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v27.s[0] +mul v3.4S, v3.4S,v28.s[0] +sub v20.4s, v21.4s, v17.4s +mla v13.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v11.4s, v22.4s, v13.4s +mla v3.4S, v10.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v10.4s, v2.4s, v3.4s +mla v0.4S, v17.4S, v31.s[0] +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v17.4s, v14.4s, v0.4s +mla v8.4S, v13.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +sub v13.4s, v16.4s, v8.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v3.4s, v19.4s, v1.4s +mla v12.4S, v0.4S, v31.s[0] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v0.4s, v21.4s, v12.4s +mla v18.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v25.s[1] +mul v10.4S, v10.4S,v26.s[1] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v12.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v12.4s, v22.4s, v2.4s +mla v10.4S, v18.4S, v31.s[0] +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v13.4S, v25.s[3] +mul v13.4S, v13.4S,v26.s[3] +sub v18.4s, v11.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +str q22, [x0, #16] +sqrdmulh v22.4S, v21.4S, v23.s[0] +str q12, [x0, #80] +mul 
v21.4S, v21.4S,v24.s[0] +sub v12.4s, v14.4s, v16.4s +mla v13.4S, v2.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +str q11, [x0, #144] +sqrdmulh v11.4S, v0.4S, v23.s[1] +str q18, [x0, #208] +mul v0.4S, v0.4S,v24.s[1] +sub v18.4s, v17.4s, v13.4s +mla v21.4S, v22.4S, v31.s[0] +add v17.4s, v17.4s, v13.4s +str q14, [x0, #272] +sqrdmulh v14.4S, v20.4S, v23.s[2] +str q12, [x0, #336] +mul v20.4S, v20.4S,v24.s[2] +sub v12.4s, v19.4s, v21.4s +mla v0.4S, v11.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +str q17, [x0, #400] +sqrdmulh v17.4S, v1.4S, v23.s[3] +str q18, [x0, #464] +mul v1.4S, v1.4S,v24.s[3] +sub v18.4s, v3.4s, v0.4s +mla v20.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v0.4s +str q19, [x0, #528] +str q12, [x0, #592] +sub v12.4s, v9.4s, v20.4s +mla v1.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v20.4s +str q3, [x0, #656] +str q18, [x0, #720] +sub v18.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +str q9, [x0, #784] +str q12, [x0, #848] +str q8, [x0, #912] +str q18, [x0, #976] +ldr q4, [x0, #32] +ldr q5, [x0, #48] +ldr q6, [x0, #0] +ldr q7, [x0, #16] +ldr q15, [x0, #96] +ldr q10, [x0, #112] +ldr q2, [x0, #64] +ldr q16, [x0, #80] +ldr q22, [x0, #160] +ldr q13, [x0, #176] +ldr q11, [x0, #128] +ldr q21, [x0, #144] +ldr q14, [x0, #224] +ldr q0, [x0, #240] +ldr q19, [x0, #192] +ldr q17, [x0, #208] +ldr q20, [x17, #+128] +ldr q3, [x17, #+144] +ldr q1, [x17, #+256] +ldr q9, [x17, #+272] +ldr q12, [x17, #+384] +ldr q8, [x17, #+400] +ldr q18, [x17, #+512] +ldr q30, [x17, #+528] +sqrdmulh v29.4S, v4.4S, v3.s[0] +mul v4.4S, v4.4S,v20.s[0] +sqrdmulh v28.4S, v5.4S, v3.s[0] +mul v5.4S, v5.4S,v20.s[0] +mla v4.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v15.4S, v9.s[0] +mul v15.4S, v15.4S,v1.s[0] +mla v5.4S, v28.4S, v31.s[0] +sub v28.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +sqrdmulh v4.4S, v10.4S, v9.s[0] +mul v10.4S, v10.4S,v1.s[0] +mla v15.4S, v29.4S, v31.s[0] +sub v29.4s, v7.4s, v5.4s +add v7.4s, v7.4s, v5.4s +sqrdmulh v5.4S, v7.4S, v3.s[1] +mul v7.4S, v7.4S,v20.s[1] +mla v10.4S, v4.4S, v31.s[0] +sub 
v4.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +sqrdmulh v15.4S, v29.4S, v3.s[2] +mul v29.4S, v29.4S,v20.s[2] +mla v7.4S, v5.4S, v31.s[0] +sub v5.4s, v16.4s, v10.4s +add v16.4s, v16.4s, v10.4s +sqrdmulh v10.4S, v16.4S, v9.s[1] +mul v16.4S, v16.4S,v1.s[1] +mla v29.4S, v15.4S, v31.s[0] +sub v15.4s, v6.4s, v7.4s +add v6.4s, v6.4s, v7.4s +sqrdmulh v3.4S, v5.4S, v9.s[2] +mul v5.4S, v5.4S,v1.s[2] +mla v16.4S, v10.4S, v31.s[0] +sub v10.4s, v28.4s, v29.4s +add v28.4s, v28.4s, v29.4s +sqrdmulh v29.4S, v22.4S, v8.s[0] +mul v22.4S, v22.4S,v12.s[0] +trn1 v20.4S, v6.4S, v15.4S +trn2 v7.4S, v6.4S, v15.4S +mla v5.4S, v3.4S, v31.s[0] +sub v3.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v9.4S, v13.4S, v8.s[0] +mul v13.4S, v13.4S,v12.s[0] +trn1 v1.4S, v28.4S, v10.4S +trn2 v16.4S, v28.4S, v10.4S +mla v22.4S, v29.4S, v31.s[0] +sub v29.4s, v4.4s, v5.4s +add v4.4s, v4.4s, v5.4s +sqrdmulh v5.4S, v14.4S, v30.s[0] +mul v14.4S, v14.4S,v18.s[0] +trn2 v28.2D, v20.2D, v1.2D +trn2 v10.2D, v7.2D, v16.2D +mla v13.4S, v9.4S, v31.s[0] +sub v9.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v0.4S, v30.s[0] +mul v0.4S, v0.4S,v18.s[0] +trn1 v6.2D, v20.2D, v1.2D +trn1 v15.2D, v7.2D, v16.2D +mla v14.4S, v5.4S, v31.s[0] +sub v5.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v21.4S, v8.s[1] +mul v21.4S, v21.4S,v12.s[1] +trn1 v16.4S, v2.4S, v3.4S +trn2 v7.4S, v2.4S, v3.4S +mla v0.4S, v22.4S, v31.s[0] +sub v22.4s, v19.4s, v14.4s +add v19.4s, v19.4s, v14.4s +sqrdmulh v14.4S, v5.4S, v8.s[2] +mul v5.4S, v5.4S,v12.s[2] +trn1 v1.4S, v4.4S, v29.4S +trn2 v20.4S, v4.4S, v29.4S +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v0.4s +add v17.4s, v17.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v30.s[1] +mul v17.4S, v17.4S,v18.s[1] +trn2 v4.2D, v16.2D, v1.2D +trn2 v29.2D, v7.2D, v20.2D +mla v5.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v8.4S, v13.4S, v30.s[2] +mul v13.4S, v13.4S,v18.s[2] +trn1 v2.2D, v16.2D, v1.2D +trn1 v3.2D, v7.2D, 
v20.2D +mla v17.4S, v0.4S, v31.s[0] +sub v0.4s, v9.4s, v5.4s +add v9.4s, v9.4s, v5.4s +mla v13.4S, v8.4S, v31.s[0] +sub v8.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +sub v30.4s, v22.4s, v13.4s +add v22.4s, v22.4s, v13.4s +ldr q13, [x17, #+160] +ldr q18, [x17, #+176] +sqrdmulh v17.4S, v28.4S, v18.4S +mul v28.4S, v28.4S,v13.4S +trn1 v5.4S, v11.4S, v14.4S +trn2 v20.4S, v11.4S, v14.4S +sqrdmulh v7.4S, v10.4S, v18.4S +mul v10.4S, v10.4S,v13.4S +trn1 v1.4S, v9.4S, v0.4S +trn2 v16.4S, v9.4S, v0.4S +mla v28.4S, v17.4S, v31.s[0] +ldr q17, [x17, #+288] +ldr q18, [x17, #+304] +sqrdmulh v13.4S, v4.4S, v18.4S +mul v4.4S, v4.4S,v17.4S +trn2 v9.2D, v5.2D, v1.2D +trn2 v0.2D, v20.2D, v16.2D +mla v10.4S, v7.4S, v31.s[0] +sub v7.4s, v6.4s, v28.4s +add v6.4s, v6.4s, v28.4s +sqrdmulh v28.4S, v29.4S, v18.4S +mul v29.4S, v29.4S,v17.4S +trn1 v11.2D, v5.2D, v1.2D +trn1 v14.2D, v20.2D, v16.2D +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +ldr q10, [x17, #+192] +ldr q16, [x17, #+208] +sqrdmulh v20.4S, v15.4S, v16.4S +mul v15.4S, v15.4S,v10.4S +trn1 v16.4S, v19.4S, v8.4S +trn2 v10.4S, v19.4S, v8.4S +mla v29.4S, v28.4S, v31.s[0] +sub v28.4s, v2.4s, v4.4s +add v2.4s, v2.4s, v4.4s +ldr q4, [x17, #+224] +ldr q1, [x17, #+240] +sqrdmulh v5.4S, v13.4S, v1.4S +mul v13.4S, v13.4S,v4.4S +trn1 v1.4S, v22.4S, v30.4S +trn2 v4.4S, v22.4S, v30.4S +mla v15.4S, v20.4S, v31.s[0] +sub v20.4s, v3.4s, v29.4s +add v3.4s, v3.4s, v29.4s +ldr q29, [x17, #+320] +ldr q18, [x17, #+336] +sqrdmulh v17.4S, v3.4S, v18.4S +mul v3.4S, v3.4S,v29.4S +trn2 v22.2D, v16.2D, v1.2D +trn2 v30.2D, v10.2D, v4.2D +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v6.4s, v15.4s +add v6.4s, v6.4s, v15.4s +ldr q15, [x17, #+352] +ldr q18, [x17, #+368] +sqrdmulh v29.4S, v20.4S, v18.4S +mul v20.4S, v20.4S,v15.4S +trn1 v19.2D, v16.2D, v1.2D +trn1 v8.2D, v10.2D, v4.2D +mla v3.4S, v17.4S, v31.s[0] +sub v17.4s, v7.4s, v13.4s +add v7.4s, v7.4s, v13.4s +mla v20.4S, v29.4S, v31.s[0] +sub v29.4s, v2.4s, v3.4s +add 
v2.4s, v2.4s, v3.4s +sub v3.4s, v28.4s, v20.4s +add v28.4s, v28.4s, v20.4s +str q6, [x0, #0] +str q5, [x0, #16] +str q7, [x0, #32] +str q17, [x0, #48] +str q2, [x0, #64] +str q29, [x0, #80] +str q28, [x0, #96] +str q3, [x0, #112] +ldr q3, [x17, #+416] +ldr q28, [x17, #+432] +sqrdmulh v29.4S, v9.4S, v28.4S +mul v9.4S, v9.4S,v3.4S +sqrdmulh v2.4S, v0.4S, v28.4S +mul v0.4S, v0.4S,v3.4S +mla v9.4S, v29.4S, v31.s[0] +ldr q29, [x17, #+544] +ldr q28, [x17, #+560] +sqrdmulh v3.4S, v22.4S, v28.4S +mul v22.4S, v22.4S,v29.4S +mla v0.4S, v2.4S, v31.s[0] +sub v2.4s, v11.4s, v9.4s +add v11.4s, v11.4s, v9.4s +sqrdmulh v9.4S, v30.4S, v28.4S +mul v30.4S, v30.4S,v29.4S +mla v22.4S, v3.4S, v31.s[0] +sub v3.4s, v14.4s, v0.4s +add v14.4s, v14.4s, v0.4s +ldr q0, [x17, #+448] +ldr q28, [x17, #+464] +sqrdmulh v29.4S, v14.4S, v28.4S +mul v14.4S, v14.4S,v0.4S +mla v30.4S, v9.4S, v31.s[0] +sub v9.4s, v19.4s, v22.4s +add v19.4s, v19.4s, v22.4s +ldr q22, [x17, #+480] +ldr q28, [x17, #+496] +sqrdmulh v0.4S, v3.4S, v28.4S +mul v3.4S, v3.4S,v22.4S +mla v14.4S, v29.4S, v31.s[0] +sub v29.4s, v8.4s, v30.4s +add v8.4s, v8.4s, v30.4s +ldr q30, [x17, #+576] +ldr q28, [x17, #+592] +sqrdmulh v22.4S, v8.4S, v28.4S +mul v8.4S, v8.4S,v30.4S +mla v3.4S, v0.4S, v31.s[0] +sub v0.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +ldr q14, [x17, #+608] +ldr q28, [x17, #+624] +sqrdmulh v30.4S, v29.4S, v28.4S +mul v29.4S, v29.4S,v14.4S +mla v8.4S, v22.4S, v31.s[0] +sub v22.4s, v2.4s, v3.4s +add v2.4s, v2.4s, v3.4s +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v19.4s, v8.4s +add v19.4s, v19.4s, v8.4s +sub v8.4s, v9.4s, v29.4s +add v9.4s, v9.4s, v29.4s +str q11, [x0, #128] +str q0, [x0, #144] +str q2, [x0, #160] +str q22, [x0, #176] +str q19, [x0, #192] +str q30, [x0, #208] +str q9, [x0, #224] +str q8, [x0, #240] +ldr q8, [x0, #288] +ldr q9, [x0, #304] +ldr q30, [x0, #256] +ldr q19, [x0, #272] +ldr q22, [x0, #352] +ldr q2, [x0, #368] +ldr q0, [x0, #320] +ldr q11, [x0, #336] +ldr q29, [x0, #416] +ldr q3, [x0, #432] +ldr 
q28, [x0, #384] +ldr q14, [x0, #400] +ldr q17, [x0, #480] +ldr q7, [x0, #496] +ldr q5, [x0, #448] +ldr q6, [x0, #464] +ldr q20, [x17, #+640] +ldr q13, [x17, #+656] +ldr q4, [x17, #+768] +ldr q10, [x17, #+784] +ldr q1, [x17, #+896] +ldr q16, [x17, #+912] +ldr q18, [x17, #+1024] +ldr q15, [x17, #+1040] +sqrdmulh v12.4S, v8.4S, v13.s[0] +mul v8.4S, v8.4S,v20.s[0] +sqrdmulh v21.4S, v9.4S, v13.s[0] +mul v9.4S, v9.4S,v20.s[0] +mla v8.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v22.4S, v10.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v9.4S, v21.4S, v31.s[0] +sub v21.4s, v30.4s, v8.4s +add v30.4s, v30.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v10.s[0] +mul v2.4S, v2.4S,v4.s[0] +mla v22.4S, v12.4S, v31.s[0] +sub v12.4s, v19.4s, v9.4s +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v19.4S, v13.s[1] +mul v19.4S, v19.4S,v20.s[1] +mla v2.4S, v8.4S, v31.s[0] +sub v8.4s, v0.4s, v22.4s +add v0.4s, v0.4s, v22.4s +sqrdmulh v22.4S, v12.4S, v13.s[2] +mul v12.4S, v12.4S,v20.s[2] +mla v19.4S, v9.4S, v31.s[0] +sub v9.4s, v11.4s, v2.4s +add v11.4s, v11.4s, v2.4s +sqrdmulh v2.4S, v11.4S, v10.s[1] +mul v11.4S, v11.4S,v4.s[1] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v30.4s, v19.4s +add v30.4s, v30.4s, v19.4s +sqrdmulh v13.4S, v9.4S, v10.s[2] +mul v9.4S, v9.4S,v4.s[2] +mla v11.4S, v2.4S, v31.s[0] +sub v2.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v29.4S, v16.s[0] +mul v29.4S, v29.4S,v1.s[0] +trn1 v20.4S, v30.4S, v22.4S +trn2 v19.4S, v30.4S, v22.4S +mla v9.4S, v13.4S, v31.s[0] +sub v13.4s, v0.4s, v11.4s +add v0.4s, v0.4s, v11.4s +sqrdmulh v10.4S, v3.4S, v16.s[0] +mul v3.4S, v3.4S,v1.s[0] +trn1 v4.4S, v21.4S, v2.4S +trn2 v11.4S, v21.4S, v2.4S +mla v29.4S, v12.4S, v31.s[0] +sub v12.4s, v8.4s, v9.4s +add v8.4s, v8.4s, v9.4s +sqrdmulh v9.4S, v17.4S, v15.s[0] +mul v17.4S, v17.4S,v18.s[0] +trn2 v21.2D, v20.2D, v4.2D +trn2 v2.2D, v19.2D, v11.2D +mla v3.4S, v10.4S, v31.s[0] +sub v10.4s, v28.4s, v29.4s +add v28.4s, v28.4s, v29.4s +sqrdmulh v29.4S, v7.4S, v15.s[0] +mul v7.4S, v7.4S,v18.s[0] +trn1 
v30.2D, v20.2D, v4.2D +trn1 v22.2D, v19.2D, v11.2D +mla v17.4S, v9.4S, v31.s[0] +sub v9.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v16.s[1] +mul v14.4S, v14.4S,v1.s[1] +trn1 v11.4S, v0.4S, v13.4S +trn2 v19.4S, v0.4S, v13.4S +mla v7.4S, v29.4S, v31.s[0] +sub v29.4s, v5.4s, v17.4s +add v5.4s, v5.4s, v17.4s +sqrdmulh v17.4S, v9.4S, v16.s[2] +mul v9.4S, v9.4S,v1.s[2] +trn1 v4.4S, v8.4S, v12.4S +trn2 v20.4S, v8.4S, v12.4S +mla v14.4S, v3.4S, v31.s[0] +sub v3.4s, v6.4s, v7.4s +add v6.4s, v6.4s, v7.4s +sqrdmulh v7.4S, v6.4S, v15.s[1] +mul v6.4S, v6.4S,v18.s[1] +trn2 v8.2D, v11.2D, v4.2D +trn2 v12.2D, v19.2D, v20.2D +mla v9.4S, v17.4S, v31.s[0] +sub v17.4s, v28.4s, v14.4s +add v28.4s, v28.4s, v14.4s +sqrdmulh v16.4S, v3.4S, v15.s[2] +mul v3.4S, v3.4S,v18.s[2] +trn1 v0.2D, v11.2D, v4.2D +trn1 v13.2D, v19.2D, v20.2D +mla v6.4S, v7.4S, v31.s[0] +sub v7.4s, v10.4s, v9.4s +add v10.4s, v10.4s, v9.4s +mla v3.4S, v16.4S, v31.s[0] +sub v16.4s, v5.4s, v6.4s +add v5.4s, v5.4s, v6.4s +sub v15.4s, v29.4s, v3.4s +add v29.4s, v29.4s, v3.4s +ldr q3, [x17, #+672] +ldr q18, [x17, #+688] +sqrdmulh v6.4S, v21.4S, v18.4S +mul v21.4S, v21.4S,v3.4S +trn1 v9.4S, v28.4S, v17.4S +trn2 v20.4S, v28.4S, v17.4S +sqrdmulh v19.4S, v2.4S, v18.4S +mul v2.4S, v2.4S,v3.4S +trn1 v4.4S, v10.4S, v7.4S +trn2 v11.4S, v10.4S, v7.4S +mla v21.4S, v6.4S, v31.s[0] +ldr q6, [x17, #+800] +ldr q18, [x17, #+816] +sqrdmulh v3.4S, v8.4S, v18.4S +mul v8.4S, v8.4S,v6.4S +trn2 v10.2D, v9.2D, v4.2D +trn2 v7.2D, v20.2D, v11.2D +mla v2.4S, v19.4S, v31.s[0] +sub v19.4s, v30.4s, v21.4s +add v30.4s, v30.4s, v21.4s +sqrdmulh v21.4S, v12.4S, v18.4S +mul v12.4S, v12.4S,v6.4S +trn1 v28.2D, v9.2D, v4.2D +trn1 v17.2D, v20.2D, v11.2D +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v22.4s, v2.4s +add v22.4s, v22.4s, v2.4s +ldr q2, [x17, #+704] +ldr q11, [x17, #+720] +sqrdmulh v20.4S, v22.4S, v11.4S +mul v22.4S, v22.4S,v2.4S +trn1 v11.4S, v5.4S, v16.4S +trn2 v2.4S, v5.4S, v16.4S +mla v12.4S, v21.4S, v31.s[0] +sub v21.4s, 
v0.4s, v8.4s +add v0.4s, v0.4s, v8.4s +ldr q8, [x17, #+736] +ldr q4, [x17, #+752] +sqrdmulh v9.4S, v3.4S, v4.4S +mul v3.4S, v3.4S,v8.4S +trn1 v4.4S, v29.4S, v15.4S +trn2 v8.4S, v29.4S, v15.4S +mla v22.4S, v20.4S, v31.s[0] +sub v20.4s, v13.4s, v12.4s +add v13.4s, v13.4s, v12.4s +ldr q12, [x17, #+832] +ldr q18, [x17, #+848] +sqrdmulh v6.4S, v13.4S, v18.4S +mul v13.4S, v13.4S,v12.4S +trn2 v29.2D, v11.2D, v4.2D +trn2 v15.2D, v2.2D, v8.2D +mla v3.4S, v9.4S, v31.s[0] +sub v9.4s, v30.4s, v22.4s +add v30.4s, v30.4s, v22.4s +ldr q22, [x17, #+864] +ldr q18, [x17, #+880] +sqrdmulh v12.4S, v20.4S, v18.4S +mul v20.4S, v20.4S,v22.4S +trn1 v5.2D, v11.2D, v4.2D +trn1 v16.2D, v2.2D, v8.2D +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +mla v20.4S, v12.4S, v31.s[0] +sub v12.4s, v0.4s, v13.4s +add v0.4s, v0.4s, v13.4s +sub v13.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +str q30, [x0, #256] +str q9, [x0, #272] +str q19, [x0, #288] +str q6, [x0, #304] +str q0, [x0, #320] +str q12, [x0, #336] +str q21, [x0, #352] +str q13, [x0, #368] +ldr q13, [x17, #+928] +ldr q21, [x17, #+944] +sqrdmulh v12.4S, v10.4S, v21.4S +mul v10.4S, v10.4S,v13.4S +sqrdmulh v0.4S, v7.4S, v21.4S +mul v7.4S, v7.4S,v13.4S +mla v10.4S, v12.4S, v31.s[0] +ldr q12, [x17, #+1056] +ldr q21, [x17, #+1072] +sqrdmulh v13.4S, v29.4S, v21.4S +mul v29.4S, v29.4S,v12.4S +mla v7.4S, v0.4S, v31.s[0] +sub v0.4s, v28.4s, v10.4s +add v28.4s, v28.4s, v10.4s +sqrdmulh v10.4S, v15.4S, v21.4S +mul v15.4S, v15.4S,v12.4S +mla v29.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v7.4s +add v17.4s, v17.4s, v7.4s +ldr q7, [x17, #+960] +ldr q21, [x17, #+976] +sqrdmulh v12.4S, v17.4S, v21.4S +mul v17.4S, v17.4S,v7.4S +mla v15.4S, v10.4S, v31.s[0] +sub v10.4s, v5.4s, v29.4s +add v5.4s, v5.4s, v29.4s +ldr q29, [x17, #+992] +ldr q21, [x17, #+1008] +sqrdmulh v7.4S, v13.4S, v21.4S +mul v13.4S, v13.4S,v29.4S +mla v17.4S, v12.4S, v31.s[0] +sub v12.4s, v16.4s, v15.4s +add v16.4s, v16.4s, v15.4s +ldr q15, [x17, #+1088] 
+ldr q21, [x17, #+1104] +sqrdmulh v29.4S, v16.4S, v21.4S +mul v16.4S, v16.4S,v15.4S +mla v13.4S, v7.4S, v31.s[0] +sub v7.4s, v28.4s, v17.4s +add v28.4s, v28.4s, v17.4s +ldr q17, [x17, #+1120] +ldr q21, [x17, #+1136] +sqrdmulh v15.4S, v12.4S, v21.4S +mul v12.4S, v12.4S,v17.4S +mla v16.4S, v29.4S, v31.s[0] +sub v29.4s, v0.4s, v13.4s +add v0.4s, v0.4s, v13.4s +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v5.4s, v16.4s +add v5.4s, v5.4s, v16.4s +sub v16.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +str q28, [x0, #384] +str q7, [x0, #400] +str q0, [x0, #416] +str q29, [x0, #432] +str q5, [x0, #448] +str q15, [x0, #464] +str q10, [x0, #480] +str q16, [x0, #496] +ldr q16, [x0, #544] +ldr q10, [x0, #560] +ldr q15, [x0, #512] +ldr q5, [x0, #528] +ldr q29, [x0, #608] +ldr q0, [x0, #624] +ldr q7, [x0, #576] +ldr q28, [x0, #592] +ldr q12, [x0, #672] +ldr q13, [x0, #688] +ldr q21, [x0, #640] +ldr q17, [x0, #656] +ldr q6, [x0, #736] +ldr q19, [x0, #752] +ldr q9, [x0, #704] +ldr q30, [x0, #720] +ldr q20, [x17, #+1152] +ldr q3, [x17, #+1168] +ldr q8, [x17, #+1280] +ldr q2, [x17, #+1296] +ldr q4, [x17, #+1408] +ldr q11, [x17, #+1424] +ldr q18, [x17, #+1536] +ldr q22, [x17, #+1552] +sqrdmulh v1.4S, v16.4S, v3.s[0] +mul v16.4S, v16.4S,v20.s[0] +sqrdmulh v14.4S, v10.4S, v3.s[0] +mul v10.4S, v10.4S,v20.s[0] +mla v16.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v29.4S, v2.s[0] +mul v29.4S, v29.4S,v8.s[0] +mla v10.4S, v14.4S, v31.s[0] +sub v14.4s, v15.4s, v16.4s +add v15.4s, v15.4s, v16.4s +sqrdmulh v16.4S, v0.4S, v2.s[0] +mul v0.4S, v0.4S,v8.s[0] +mla v29.4S, v1.4S, v31.s[0] +sub v1.4s, v5.4s, v10.4s +add v5.4s, v5.4s, v10.4s +sqrdmulh v10.4S, v5.4S, v3.s[1] +mul v5.4S, v5.4S,v20.s[1] +mla v0.4S, v16.4S, v31.s[0] +sub v16.4s, v7.4s, v29.4s +add v7.4s, v7.4s, v29.4s +sqrdmulh v29.4S, v1.4S, v3.s[2] +mul v1.4S, v1.4S,v20.s[2] +mla v5.4S, v10.4S, v31.s[0] +sub v10.4s, v28.4s, v0.4s +add v28.4s, v28.4s, v0.4s +sqrdmulh v0.4S, v28.4S, v2.s[1] +mul v28.4S, v28.4S,v8.s[1] +mla v1.4S, v29.4S, v31.s[0] 
+sub v29.4s, v15.4s, v5.4s +add v15.4s, v15.4s, v5.4s +sqrdmulh v3.4S, v10.4S, v2.s[2] +mul v10.4S, v10.4S,v8.s[2] +mla v28.4S, v0.4S, v31.s[0] +sub v0.4s, v14.4s, v1.4s +add v14.4s, v14.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v11.s[0] +mul v12.4S, v12.4S,v4.s[0] +trn1 v20.4S, v15.4S, v29.4S +trn2 v5.4S, v15.4S, v29.4S +mla v10.4S, v3.4S, v31.s[0] +sub v3.4s, v7.4s, v28.4s +add v7.4s, v7.4s, v28.4s +sqrdmulh v2.4S, v13.4S, v11.s[0] +mul v13.4S, v13.4S,v4.s[0] +trn1 v8.4S, v14.4S, v0.4S +trn2 v28.4S, v14.4S, v0.4S +mla v12.4S, v1.4S, v31.s[0] +sub v1.4s, v16.4s, v10.4s +add v16.4s, v16.4s, v10.4s +sqrdmulh v10.4S, v6.4S, v22.s[0] +mul v6.4S, v6.4S,v18.s[0] +trn2 v14.2D, v20.2D, v8.2D +trn2 v0.2D, v5.2D, v28.2D +mla v13.4S, v2.4S, v31.s[0] +sub v2.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v18.s[0] +trn1 v15.2D, v20.2D, v8.2D +trn1 v29.2D, v5.2D, v28.2D +mla v6.4S, v10.4S, v31.s[0] +sub v10.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v11.s[1] +mul v17.4S, v17.4S,v4.s[1] +trn1 v28.4S, v7.4S, v3.4S +trn2 v5.4S, v7.4S, v3.4S +mla v19.4S, v12.4S, v31.s[0] +sub v12.4s, v9.4s, v6.4s +add v9.4s, v9.4s, v6.4s +sqrdmulh v6.4S, v10.4S, v11.s[2] +mul v10.4S, v10.4S,v4.s[2] +trn1 v8.4S, v16.4S, v1.4S +trn2 v20.4S, v16.4S, v1.4S +mla v17.4S, v13.4S, v31.s[0] +sub v13.4s, v30.4s, v19.4s +add v30.4s, v30.4s, v19.4s +sqrdmulh v19.4S, v30.4S, v22.s[1] +mul v30.4S, v30.4S,v18.s[1] +trn2 v16.2D, v28.2D, v8.2D +trn2 v1.2D, v5.2D, v20.2D +mla v10.4S, v6.4S, v31.s[0] +sub v6.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v11.4S, v13.4S, v22.s[2] +mul v13.4S, v13.4S,v18.s[2] +trn1 v7.2D, v28.2D, v8.2D +trn1 v3.2D, v5.2D, v20.2D +mla v30.4S, v19.4S, v31.s[0] +sub v19.4s, v2.4s, v10.4s +add v2.4s, v2.4s, v10.4s +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v9.4s, v30.4s +add v9.4s, v9.4s, v30.4s +sub v22.4s, v12.4s, v13.4s +add v12.4s, v12.4s, v13.4s +ldr q13, [x17, #+1184] +ldr q18, [x17, #+1200] 
+sqrdmulh v30.4S, v14.4S, v18.4S +mul v14.4S, v14.4S,v13.4S +trn1 v10.4S, v21.4S, v6.4S +trn2 v20.4S, v21.4S, v6.4S +sqrdmulh v5.4S, v0.4S, v18.4S +mul v0.4S, v0.4S,v13.4S +trn1 v8.4S, v2.4S, v19.4S +trn2 v28.4S, v2.4S, v19.4S +mla v14.4S, v30.4S, v31.s[0] +ldr q30, [x17, #+1312] +ldr q18, [x17, #+1328] +sqrdmulh v13.4S, v16.4S, v18.4S +mul v16.4S, v16.4S,v30.4S +trn2 v2.2D, v10.2D, v8.2D +trn2 v19.2D, v20.2D, v28.2D +mla v0.4S, v5.4S, v31.s[0] +sub v5.4s, v15.4s, v14.4s +add v15.4s, v15.4s, v14.4s +sqrdmulh v14.4S, v1.4S, v18.4S +mul v1.4S, v1.4S,v30.4S +trn1 v21.2D, v10.2D, v8.2D +trn1 v6.2D, v20.2D, v28.2D +mla v16.4S, v13.4S, v31.s[0] +sub v13.4s, v29.4s, v0.4s +add v29.4s, v29.4s, v0.4s +ldr q0, [x17, #+1216] +ldr q28, [x17, #+1232] +sqrdmulh v20.4S, v29.4S, v28.4S +mul v29.4S, v29.4S,v0.4S +trn1 v28.4S, v9.4S, v11.4S +trn2 v0.4S, v9.4S, v11.4S +mla v1.4S, v14.4S, v31.s[0] +sub v14.4s, v7.4s, v16.4s +add v7.4s, v7.4s, v16.4s +ldr q16, [x17, #+1248] +ldr q8, [x17, #+1264] +sqrdmulh v10.4S, v13.4S, v8.4S +mul v13.4S, v13.4S,v16.4S +trn1 v8.4S, v12.4S, v22.4S +trn2 v16.4S, v12.4S, v22.4S +mla v29.4S, v20.4S, v31.s[0] +sub v20.4s, v3.4s, v1.4s +add v3.4s, v3.4s, v1.4s +ldr q1, [x17, #+1344] +ldr q18, [x17, #+1360] +sqrdmulh v30.4S, v3.4S, v18.4S +mul v3.4S, v3.4S,v1.4S +trn2 v12.2D, v28.2D, v8.2D +trn2 v22.2D, v0.2D, v16.2D +mla v13.4S, v10.4S, v31.s[0] +sub v10.4s, v15.4s, v29.4s +add v15.4s, v15.4s, v29.4s +ldr q29, [x17, #+1376] +ldr q18, [x17, #+1392] +sqrdmulh v1.4S, v20.4S, v18.4S +mul v20.4S, v20.4S,v29.4S +trn1 v9.2D, v28.2D, v8.2D +trn1 v11.2D, v0.2D, v16.2D +mla v3.4S, v30.4S, v31.s[0] +sub v30.4s, v5.4s, v13.4s +add v5.4s, v5.4s, v13.4s +mla v20.4S, v1.4S, v31.s[0] +sub v1.4s, v7.4s, v3.4s +add v7.4s, v7.4s, v3.4s +sub v3.4s, v14.4s, v20.4s +add v14.4s, v14.4s, v20.4s +str q15, [x0, #512] +str q10, [x0, #528] +str q5, [x0, #544] +str q30, [x0, #560] +str q7, [x0, #576] +str q1, [x0, #592] +str q14, [x0, #608] +str q3, [x0, #624] +ldr q3, [x17, #+1440] 
+ldr q14, [x17, #+1456] +sqrdmulh v1.4S, v2.4S, v14.4S +mul v2.4S, v2.4S,v3.4S +sqrdmulh v7.4S, v19.4S, v14.4S +mul v19.4S, v19.4S,v3.4S +mla v2.4S, v1.4S, v31.s[0] +ldr q1, [x17, #+1568] +ldr q14, [x17, #+1584] +sqrdmulh v3.4S, v12.4S, v14.4S +mul v12.4S, v12.4S,v1.4S +mla v19.4S, v7.4S, v31.s[0] +sub v7.4s, v21.4s, v2.4s +add v21.4s, v21.4s, v2.4s +sqrdmulh v2.4S, v22.4S, v14.4S +mul v22.4S, v22.4S,v1.4S +mla v12.4S, v3.4S, v31.s[0] +sub v3.4s, v6.4s, v19.4s +add v6.4s, v6.4s, v19.4s +ldr q19, [x17, #+1472] +ldr q14, [x17, #+1488] +sqrdmulh v1.4S, v6.4S, v14.4S +mul v6.4S, v6.4S,v19.4S +mla v22.4S, v2.4S, v31.s[0] +sub v2.4s, v9.4s, v12.4s +add v9.4s, v9.4s, v12.4s +ldr q12, [x17, #+1504] +ldr q14, [x17, #+1520] +sqrdmulh v19.4S, v3.4S, v14.4S +mul v3.4S, v3.4S,v12.4S +mla v6.4S, v1.4S, v31.s[0] +sub v1.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +ldr q22, [x17, #+1600] +ldr q14, [x17, #+1616] +sqrdmulh v12.4S, v11.4S, v14.4S +mul v11.4S, v11.4S,v22.4S +mla v3.4S, v19.4S, v31.s[0] +sub v19.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +ldr q6, [x17, #+1632] +ldr q14, [x17, #+1648] +sqrdmulh v22.4S, v1.4S, v14.4S +mul v1.4S, v1.4S,v6.4S +mla v11.4S, v12.4S, v31.s[0] +sub v12.4s, v7.4s, v3.4s +add v7.4s, v7.4s, v3.4s +mla v1.4S, v22.4S, v31.s[0] +sub v22.4s, v9.4s, v11.4s +add v9.4s, v9.4s, v11.4s +sub v11.4s, v2.4s, v1.4s +add v2.4s, v2.4s, v1.4s +str q21, [x0, #640] +str q19, [x0, #656] +str q7, [x0, #672] +str q12, [x0, #688] +str q9, [x0, #704] +str q22, [x0, #720] +str q2, [x0, #736] +str q11, [x0, #752] +ldr q11, [x0, #800] +ldr q2, [x0, #816] +ldr q22, [x0, #768] +ldr q9, [x0, #784] +ldr q12, [x0, #864] +ldr q7, [x0, #880] +ldr q19, [x0, #832] +ldr q21, [x0, #848] +ldr q1, [x0, #928] +ldr q3, [x0, #944] +ldr q14, [x0, #896] +ldr q6, [x0, #912] +ldr q30, [x0, #992] +ldr q5, [x0, #1008] +ldr q10, [x0, #960] +ldr q15, [x0, #976] +ldr q20, [x17, #+1664] +ldr q13, [x17, #+1680] +ldr q16, [x17, #+1792] +ldr q0, [x17, #+1808] +ldr q8, [x17, #+1920] +ldr q28, 
[x17, #+1936] +ldr q18, [x17, #+2048] +ldr q29, [x17, #+2064] +sqrdmulh v4.4S, v11.4S, v13.s[0] +mul v11.4S, v11.4S,v20.s[0] +sqrdmulh v17.4S, v2.4S, v13.s[0] +mul v2.4S, v2.4S,v20.s[0] +mla v11.4S, v4.4S, v31.s[0] +sqrdmulh v4.4S, v12.4S, v0.s[0] +mul v12.4S, v12.4S,v16.s[0] +mla v2.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v7.4S, v0.s[0] +mul v7.4S, v7.4S,v16.s[0] +mla v12.4S, v4.4S, v31.s[0] +sub v4.4s, v9.4s, v2.4s +add v9.4s, v9.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v13.s[1] +mul v9.4S, v9.4S,v20.s[1] +mla v7.4S, v11.4S, v31.s[0] +sub v11.4s, v19.4s, v12.4s +add v19.4s, v19.4s, v12.4s +sqrdmulh v12.4S, v4.4S, v13.s[2] +mul v4.4S, v4.4S,v20.s[2] +mla v9.4S, v2.4S, v31.s[0] +sub v2.4s, v21.4s, v7.4s +add v21.4s, v21.4s, v7.4s +sqrdmulh v7.4S, v21.4S, v0.s[1] +mul v21.4S, v21.4S,v16.s[1] +mla v4.4S, v12.4S, v31.s[0] +sub v12.4s, v22.4s, v9.4s +add v22.4s, v22.4s, v9.4s +sqrdmulh v13.4S, v2.4S, v0.s[2] +mul v2.4S, v2.4S,v16.s[2] +mla v21.4S, v7.4S, v31.s[0] +sub v7.4s, v17.4s, v4.4s +add v17.4s, v17.4s, v4.4s +sqrdmulh v4.4S, v1.4S, v28.s[0] +mul v1.4S, v1.4S,v8.s[0] +trn1 v20.4S, v22.4S, v12.4S +trn2 v9.4S, v22.4S, v12.4S +mla v2.4S, v13.4S, v31.s[0] +sub v13.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +sqrdmulh v0.4S, v3.4S, v28.s[0] +mul v3.4S, v3.4S,v8.s[0] +trn1 v16.4S, v17.4S, v7.4S +trn2 v21.4S, v17.4S, v7.4S +mla v1.4S, v4.4S, v31.s[0] +sub v4.4s, v11.4s, v2.4s +add v11.4s, v11.4s, v2.4s +sqrdmulh v2.4S, v30.4S, v29.s[0] +mul v30.4S, v30.4S,v18.s[0] +trn2 v17.2D, v20.2D, v16.2D +trn2 v7.2D, v9.2D, v21.2D +mla v3.4S, v0.4S, v31.s[0] +sub v0.4s, v14.4s, v1.4s +add v14.4s, v14.4s, v1.4s +sqrdmulh v1.4S, v5.4S, v29.s[0] +mul v5.4S, v5.4S,v18.s[0] +trn1 v22.2D, v20.2D, v16.2D +trn1 v12.2D, v9.2D, v21.2D +mla v30.4S, v2.4S, v31.s[0] +sub v2.4s, v6.4s, v3.4s +add v6.4s, v6.4s, v3.4s +sqrdmulh v3.4S, v6.4S, v28.s[1] +mul v6.4S, v6.4S,v8.s[1] +trn1 v21.4S, v19.4S, v13.4S +trn2 v9.4S, v19.4S, v13.4S +mla v5.4S, 
v1.4S, v31.s[0] +sub v1.4s, v10.4s, v30.4s +add v10.4s, v10.4s, v30.4s +sqrdmulh v30.4S, v2.4S, v28.s[2] +mul v2.4S, v2.4S,v8.s[2] +trn1 v16.4S, v11.4S, v4.4S +trn2 v20.4S, v11.4S, v4.4S +mla v6.4S, v3.4S, v31.s[0] +sub v3.4s, v15.4s, v5.4s +add v15.4s, v15.4s, v5.4s +sqrdmulh v5.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v18.s[1] +trn2 v11.2D, v21.2D, v16.2D +trn2 v4.2D, v9.2D, v20.2D +mla v2.4S, v30.4S, v31.s[0] +sub v30.4s, v14.4s, v6.4s +add v14.4s, v14.4s, v6.4s +sqrdmulh v28.4S, v3.4S, v29.s[2] +mul v3.4S, v3.4S,v18.s[2] +trn1 v19.2D, v21.2D, v16.2D +trn1 v13.2D, v9.2D, v20.2D +mla v15.4S, v5.4S, v31.s[0] +sub v5.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +mla v3.4S, v28.4S, v31.s[0] +sub v28.4s, v10.4s, v15.4s +add v10.4s, v10.4s, v15.4s +sub v29.4s, v1.4s, v3.4s +add v1.4s, v1.4s, v3.4s +ldr q3, [x17, #+1696] +ldr q18, [x17, #+1712] +sqrdmulh v15.4S, v17.4S, v18.4S +mul v17.4S, v17.4S,v3.4S +trn1 v2.4S, v14.4S, v30.4S +trn2 v20.4S, v14.4S, v30.4S +sqrdmulh v9.4S, v7.4S, v18.4S +mul v7.4S, v7.4S,v3.4S +trn1 v16.4S, v0.4S, v5.4S +trn2 v21.4S, v0.4S, v5.4S +mla v17.4S, v15.4S, v31.s[0] +ldr q15, [x17, #+1824] +ldr q18, [x17, #+1840] +sqrdmulh v3.4S, v11.4S, v18.4S +mul v11.4S, v11.4S,v15.4S +trn2 v0.2D, v2.2D, v16.2D +trn2 v5.2D, v20.2D, v21.2D +mla v7.4S, v9.4S, v31.s[0] +sub v9.4s, v22.4s, v17.4s +add v22.4s, v22.4s, v17.4s +sqrdmulh v17.4S, v4.4S, v18.4S +mul v4.4S, v4.4S,v15.4S +trn1 v14.2D, v2.2D, v16.2D +trn1 v30.2D, v20.2D, v21.2D +mla v11.4S, v3.4S, v31.s[0] +sub v3.4s, v12.4s, v7.4s +add v12.4s, v12.4s, v7.4s +ldr q7, [x17, #+1728] +ldr q21, [x17, #+1744] +sqrdmulh v20.4S, v12.4S, v21.4S +mul v12.4S, v12.4S,v7.4S +trn1 v21.4S, v10.4S, v28.4S +trn2 v7.4S, v10.4S, v28.4S +mla v4.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v11.4s +add v19.4s, v19.4s, v11.4s +ldr q11, [x17, #+1760] +ldr q16, [x17, #+1776] +sqrdmulh v2.4S, v3.4S, v16.4S +mul v3.4S, v3.4S,v11.4S +trn1 v16.4S, v1.4S, v29.4S +trn2 v11.4S, v1.4S, v29.4S +mla v12.4S, v20.4S, v31.s[0] +sub v20.4s, 
v13.4s, v4.4s +add v13.4s, v13.4s, v4.4s +ldr q4, [x17, #+1856] +ldr q18, [x17, #+1872] +sqrdmulh v15.4S, v13.4S, v18.4S +mul v13.4S, v13.4S,v4.4S +trn2 v1.2D, v21.2D, v16.2D +trn2 v29.2D, v7.2D, v11.2D +mla v3.4S, v2.4S, v31.s[0] +sub v2.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +ldr q12, [x17, #+1888] +ldr q18, [x17, #+1904] +sqrdmulh v4.4S, v20.4S, v18.4S +mul v20.4S, v20.4S,v12.4S +trn1 v10.2D, v21.2D, v16.2D +trn1 v28.2D, v7.2D, v11.2D +mla v13.4S, v15.4S, v31.s[0] +sub v15.4s, v9.4s, v3.4s +add v9.4s, v9.4s, v3.4s +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +sub v13.4s, v17.4s, v20.4s +add v17.4s, v17.4s, v20.4s +str q22, [x0, #768] +str q2, [x0, #784] +str q9, [x0, #800] +str q15, [x0, #816] +str q19, [x0, #832] +str q4, [x0, #848] +str q17, [x0, #864] +str q13, [x0, #880] +ldr q13, [x17, #+1952] +ldr q17, [x17, #+1968] +sqrdmulh v4.4S, v0.4S, v17.4S +mul v0.4S, v0.4S,v13.4S +sqrdmulh v19.4S, v5.4S, v17.4S +mul v5.4S, v5.4S,v13.4S +mla v0.4S, v4.4S, v31.s[0] +ldr q4, [x17, #+2080] +ldr q17, [x17, #+2096] +sqrdmulh v13.4S, v1.4S, v17.4S +mul v1.4S, v1.4S,v4.4S +mla v5.4S, v19.4S, v31.s[0] +sub v19.4s, v14.4s, v0.4s +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v29.4S, v17.4S +mul v29.4S, v29.4S,v4.4S +mla v1.4S, v13.4S, v31.s[0] +sub v13.4s, v30.4s, v5.4s +add v30.4s, v30.4s, v5.4s +ldr q5, [x17, #+1984] +ldr q17, [x17, #+2000] +sqrdmulh v4.4S, v30.4S, v17.4S +mul v30.4S, v30.4S,v5.4S +mla v29.4S, v0.4S, v31.s[0] +sub v0.4s, v10.4s, v1.4s +add v10.4s, v10.4s, v1.4s +ldr q1, [x17, #+2016] +ldr q17, [x17, #+2032] +sqrdmulh v5.4S, v13.4S, v17.4S +mul v13.4S, v13.4S,v1.4S +mla v30.4S, v4.4S, v31.s[0] +sub v4.4s, v28.4s, v29.4s +add v28.4s, v28.4s, v29.4s +ldr q29, [x17, #+2112] +ldr q17, [x17, #+2128] +sqrdmulh v1.4S, v28.4S, v17.4S +mul v28.4S, v28.4S,v29.4S +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v14.4s, v30.4s +add v14.4s, v14.4s, v30.4s +ldr q30, [x17, #+2144] +ldr q17, [x17, #+2160] +sqrdmulh v29.4S, v4.4S, v17.4S 
+mul v4.4S, v4.4S,v30.4S +mla v28.4S, v1.4S, v31.s[0] +sub v1.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +mla v4.4S, v29.4S, v31.s[0] +sub v29.4s, v10.4s, v28.4s +add v10.4s, v10.4s, v28.4s +sub v28.4s, v0.4s, v4.4s +add v0.4s, v0.4s, v4.4s +str q14, [x0, #896] +str q5, [x0, #912] +str q19, [x0, #928] +str q1, [x0, #944] +str q10, [x0, #960] +str q29, [x0, #976] +str q0, [x0, #992] +str q28, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z4_3.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z4_3.s new file mode 100644 index 0000000..6810a7b --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z4_3.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. 
+/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None 
+.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 
7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 
// Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 
+.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // 
Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, 
block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 
839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 +.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, 
block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 
104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 
// Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_3_z4_3 +.global _ntt_u32_full_neon_asm_var_4_4_3_z4_3 +ntt_u32_full_neon_asm_var_4_4_3_z4_3: +_ntt_u32_full_neon_asm_var_4_4_3_z4_3: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +sqrdmulh 
v2.4S, v22.4S, v29.s[0] +ldr q1, [x0, #544] +mul v22.4S, v22.4S,v30.s[0] +ldr q0, [x0, #608] +sqrdmulh v15.4S, v21.4S, v29.s[0] +ldr q14, [x0, #672] +mul v21.4S, v21.4S,v30.s[0] +ldr q13, [x0, #736] +mla v22.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q12, [x0, #32] +sub v11.4s, v18.4s, v22.4s +mla v21.4S, v15.4S, v31.s[0] +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q15, [x0, #96] +sub v10.4s, v17.4s, v21.4s +mla v20.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v29.s[0] +ldr q2, [x0, #160] +mul v1.4S, v1.4S,v30.s[0] +sub v9.4s, v16.4s, v20.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v29.s[0] +ldr q22, [x0, #224] +mul v0.4S, v0.4S,v30.s[0] +sub v8.4s, v3.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v12.4s, v1.4s +mla v0.4S, v20.4S, v31.s[0] +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +sub v20.4s, v15.4s, v0.4s +mla v14.4S, v19.4S, v31.s[0] +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v19.4s, v2.4s, v14.4s +mla v13.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v1.4s, v22.4s, v13.4s +mla v16.4S, v0.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v0.4s, v2.4s, v16.4s +mla v3.4S, v14.4S, v31.s[0] +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v22.4s, v3.4s +mla v18.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v13.4s, v12.4s, v18.4s +mla v17.4S, v16.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v16.4s, v15.4s, 
v17.4s +mla v9.4S, v3.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v3.4s, v19.4s, v9.4s +mla v8.4S, v18.4S, v31.s[0] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v8.4s +mla v11.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v17.4s, v21.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +sub v9.4s, v20.4s, v10.4s +mla v2.4S, v8.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v8.4s, v12.4s, v2.4s +mla v22.4S, v11.4S, v31.s[0] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v27.s[1] +mul v14.4S, v14.4S,v28.s[1] +sub v11.4s, v15.4s, v22.4s +mla v0.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v28.s[2] +sub v10.4s, v13.4s, v0.4s +mla v14.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v2.4s, v16.4s, v14.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +sub v22.4s, v21.4s, v19.4s +mla v1.4S, v0.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v0.4s, v20.4s, v1.4s +mla v3.4S, v14.4S, v31.s[0] +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +sub v14.4s, v17.4s, v3.4s +mla v18.4S, v19.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v25.s[1] +mul v11.4S, v11.4S,v26.s[1] +sub v19.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v1.4s, v12.4s, v15.4s +mla v11.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v25.s[3] +mul 
v2.4S, v2.4S,v26.s[3] +sub v3.4s, v8.4s, v11.4s +mla v16.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v11.4s +str q12, [x0, #32] +sqrdmulh v12.4S, v20.4S, v23.s[0] +str q1, [x0, #96] +mul v20.4S, v20.4S,v24.s[0] +ldr q1, [x0, #816] +sub v11.4s, v13.4s, v16.4s +ldr q18, [x0, #880] +mla v2.4S, v15.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +str q8, [x0, #160] +sqrdmulh v8.4S, v0.4S, v23.s[1] +str q3, [x0, #224] +mul v0.4S, v0.4S,v24.s[1] +ldr q3, [x0, #944] +sub v16.4s, v10.4s, v2.4s +ldr q15, [x0, #1008] +mla v20.4S, v12.4S, v31.s[0] +add v10.4s, v10.4s, v2.4s +str q13, [x0, #288] +sqrdmulh v13.4S, v9.4S, v23.s[2] +str q11, [x0, #352] +mul v9.4S, v9.4S,v24.s[2] +ldr q11, [x0, #304] +sub v2.4s, v21.4s, v20.4s +ldr q12, [x0, #368] +mla v0.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v20.4s +str q10, [x0, #416] +sqrdmulh v10.4S, v19.4S, v23.s[3] +str q16, [x0, #480] +mul v19.4S, v19.4S,v24.s[3] +ldr q16, [x0, #432] +sub v20.4s, v22.4s, v0.4s +ldr q8, [x0, #496] +mla v9.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +str q21, [x0, #544] +sqrdmulh v21.4S, v1.4S, v29.s[0] +str q2, [x0, #608] +ldr q2, [x0, #560] +mul v1.4S, v1.4S,v30.s[0] +ldr q0, [x0, #624] +sub v13.4s, v17.4s, v9.4s +mla v19.4S, v10.4S, v31.s[0] +add v17.4s, v17.4s, v9.4s +str q22, [x0, #672] +sqrdmulh v22.4S, v18.4S, v29.s[0] +str q20, [x0, #736] +ldr q20, [x0, #688] +mul v18.4S, v18.4S,v30.s[0] +ldr q9, [x0, #752] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +str q17, [x0, #800] +sqrdmulh v17.4S, v3.4S, v29.s[0] +str q13, [x0, #864] +mul v3.4S, v3.4S,v30.s[0] +ldr q13, [x0, #48] +sub v19.4s, v11.4s, v1.4s +mla v18.4S, v22.4S, v31.s[0] +add v11.4s, v11.4s, v1.4s +str q14, [x0, #928] +sqrdmulh v14.4S, v15.4S, v29.s[0] +str q10, [x0, #992] +mul v15.4S, v15.4S,v30.s[0] +ldr q10, [x0, #112] +sub v1.4s, v12.4s, v18.4s +mla v3.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v29.s[0] +ldr q17, [x0, #176] +mul v2.4S, v2.4S,v30.s[0] +sub v22.4s, v16.4s, 
v3.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v29.s[0] +ldr q14, [x0, #240] +mul v0.4S, v0.4S,v30.s[0] +sub v21.4s, v8.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v18.4s, v13.4s, v2.4s +mla v0.4S, v3.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +sub v3.4s, v10.4s, v0.4s +mla v20.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v15.4s, v17.4s, v20.4s +mla v9.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v2.4s, v14.4s, v9.4s +mla v16.4S, v0.4S, v31.s[0] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v29.s[1] +mul v11.4S, v11.4S,v30.s[1] +sub v0.4s, v17.4s, v16.4s +mla v8.4S, v20.4S, v31.s[0] +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v29.s[1] +mul v12.4S, v12.4S,v30.s[1] +sub v20.4s, v14.4s, v8.4s +mla v11.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v9.4s, v13.4s, v11.4s +mla v12.4S, v16.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v16.4s, v10.4s, v12.4s +mla v22.4S, v8.4S, v31.s[0] +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v8.4s, v15.4s, v22.4s +mla v21.4S, v11.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +sub v11.4s, v2.4s, v21.4s +mla v19.4S, v12.4S, v31.s[0] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +sub v12.4s, v18.4s, v19.4s +mla v1.4S, v22.4S, v31.s[0] +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v22.4s, v3.4s, v1.4s +mla v17.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v1.4s +sqrdmulh v1.4S, 
v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v21.4s, v13.4s, v17.4s +mla v14.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +sub v19.4s, v10.4s, v14.4s +mla v0.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v27.s[2] +mul v15.4S, v15.4S,v28.s[2] +sub v1.4s, v9.4s, v0.4s +mla v20.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v17.4s, v16.4s, v20.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v14.4s, v18.4s, v15.4s +mla v2.4S, v0.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[3] +mul v11.4S, v11.4S,v28.s[3] +sub v0.4s, v3.4s, v2.4s +mla v8.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +sub v20.4s, v12.4s, v8.4s +mla v11.4S, v15.4S, v31.s[0] +add v12.4s, v12.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v15.4s, v22.4s, v11.4s +mla v10.4S, v2.4S, v31.s[0] +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v2.4s, v13.4s, v10.4s +mla v19.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v25.s[3] +mul v17.4S, v17.4S,v26.s[3] +sub v8.4s, v21.4s, v19.4s +mla v16.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +str q13, [x0, #48] +sqrdmulh v13.4S, v3.4S, v23.s[0] +str q2, [x0, #112] +mul v3.4S, v3.4S,v24.s[0] +ldr q2, [x0, #768] +sub v19.4s, v9.4s, v16.4s +ldr q11, [x0, #832] +mla v17.4S, v10.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +str q21, [x0, #176] +sqrdmulh v21.4S, v0.4S, v23.s[1] +str q8, [x0, #240] +mul v0.4S, v0.4S,v24.s[1] +ldr q8, [x0, #896] +sub v16.4s, v1.4s, v17.4s +ldr q10, [x0, #960] +mla v3.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +str q9, [x0, #304] +sqrdmulh v9.4S, v22.4S, v23.s[2] +str q19, [x0, #368] +mul v22.4S, 
v22.4S,v24.s[2] +ldr q19, [x0, #256] +sub v17.4s, v18.4s, v3.4s +ldr q13, [x0, #320] +mla v0.4S, v21.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +str q1, [x0, #432] +sqrdmulh v1.4S, v15.4S, v23.s[3] +str q16, [x0, #496] +mul v15.4S, v15.4S,v24.s[3] +ldr q16, [x0, #384] +sub v3.4s, v14.4s, v0.4s +ldr q21, [x0, #448] +mla v22.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +str q18, [x0, #560] +sqrdmulh v18.4S, v2.4S, v29.s[0] +str q17, [x0, #624] +ldr q17, [x0, #512] +mul v2.4S, v2.4S,v30.s[0] +ldr q0, [x0, #576] +sub v9.4s, v12.4s, v22.4s +mla v15.4S, v1.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s +str q14, [x0, #688] +sqrdmulh v14.4S, v11.4S, v29.s[0] +str q3, [x0, #752] +ldr q3, [x0, #640] +mul v11.4S, v11.4S,v30.s[0] +ldr q22, [x0, #704] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +str q12, [x0, #816] +sqrdmulh v12.4S, v8.4S, v29.s[0] +str q9, [x0, #880] +mul v8.4S, v8.4S,v30.s[0] +ldr q9, [x0, #0] +sub v15.4s, v19.4s, v2.4s +mla v11.4S, v14.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +str q20, [x0, #944] +sqrdmulh v20.4S, v10.4S, v29.s[0] +str q1, [x0, #1008] +mul v10.4S, v10.4S,v30.s[0] +ldr q1, [x0, #64] +sub v2.4s, v13.4s, v11.4s +mla v8.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v29.s[0] +ldr q12, [x0, #128] +mul v17.4S, v17.4S,v30.s[0] +sub v14.4s, v16.4s, v8.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v29.s[0] +ldr q20, [x0, #192] +mul v0.4S, v0.4S,v30.s[0] +sub v18.4s, v21.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +sub v11.4s, v9.4s, v17.4s +mla v0.4S, v8.4S, v31.s[0] +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v8.4s, v1.4s, v0.4s +mla v3.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v10.4s, v12.4s, v3.4s +mla v22.4S, v17.4S, v31.s[0] +add v12.4s, 
v12.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v17.4s, v20.4s, v22.4s +mla v16.4S, v0.4S, v31.s[0] +add v20.4s, v20.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +sub v0.4s, v12.4s, v16.4s +mla v21.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v3.4s, v20.4s, v21.4s +mla v19.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v22.4s, v9.4s, v19.4s +mla v13.4S, v16.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v16.4s, v1.4s, v13.4s +mla v14.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v21.4s, v10.4s, v14.4s +mla v18.4S, v19.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v19.4s, v17.4s, v18.4s +mla v15.4S, v13.4S, v31.s[0] +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v13.4s, v11.4s, v15.4s +mla v2.4S, v14.4S, v31.s[0] +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v27.s[0] +mul v20.4S, v20.4S,v28.s[0] +sub v14.4s, v8.4s, v2.4s +mla v12.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v18.4s, v9.4s, v12.4s +mla v20.4S, v15.4S, v31.s[0] +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +sub v15.4s, v1.4s, v20.4s +mla v0.4S, v2.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +sub v2.4s, v22.4s, v0.4s +mla v3.4S, v12.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v12.4s, v16.4s, v3.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, 
v11.4s, v10.4s +mla v17.4S, v0.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v27.s[3] +mul v19.4S, v19.4S,v28.s[3] +sub v0.4s, v8.4s, v17.4s +mla v21.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v17.4s +sqrdmulh v17.4S, v1.4S, v25.s[0] +mul v1.4S, v1.4S,v26.s[0] +sub v3.4s, v13.4s, v21.4s +mla v19.4S, v10.4S, v31.s[0] +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v25.s[1] +mul v15.4S, v15.4S,v26.s[1] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v17.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v17.4s, v9.4s, v1.4s +mla v15.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +sub v21.4s, v18.4s, v15.4s +mla v16.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +str q9, [x0, #0] +sqrdmulh v9.4S, v8.4S, v23.s[0] +str q17, [x0, #64] +mul v8.4S, v8.4S,v24.s[0] +ldr q17, [x0, #784] +sub v15.4s, v22.4s, v16.4s +ldr q19, [x0, #848] +mla v12.4S, v1.4S, v31.s[0] +add v22.4s, v22.4s, v16.4s +str q18, [x0, #128] +sqrdmulh v18.4S, v0.4S, v23.s[1] +str q21, [x0, #192] +mul v0.4S, v0.4S,v24.s[1] +ldr q21, [x0, #912] +sub v16.4s, v2.4s, v12.4s +ldr q1, [x0, #976] +mla v8.4S, v9.4S, v31.s[0] +add v2.4s, v2.4s, v12.4s +str q22, [x0, #256] +sqrdmulh v22.4S, v14.4S, v23.s[2] +str q15, [x0, #320] +mul v14.4S, v14.4S,v24.s[2] +ldr q15, [x0, #272] +sub v12.4s, v11.4s, v8.4s +ldr q9, [x0, #336] +mla v0.4S, v18.4S, v31.s[0] +add v11.4s, v11.4s, v8.4s +str q2, [x0, #384] +sqrdmulh v2.4S, v10.4S, v23.s[3] +str q16, [x0, #448] +mul v10.4S, v10.4S,v24.s[3] +ldr q16, [x0, #400] +sub v8.4s, v20.4s, v0.4s +ldr q18, [x0, #464] +mla v14.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v0.4s +str q11, [x0, #512] +sqrdmulh v11.4S, v17.4S, v29.s[0] +str q12, [x0, #576] +ldr q12, [x0, #528] +mul v17.4S, v17.4S,v30.s[0] +ldr q0, [x0, #592] +sub v22.4s, v13.4s, v14.4s +mla v10.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v14.4s +str q20, [x0, #640] +sqrdmulh v20.4S, v19.4S, 
v29.s[0] +str q8, [x0, #704] +ldr q8, [x0, #656] +mul v19.4S, v19.4S,v30.s[0] +ldr q14, [x0, #720] +sub v2.4s, v3.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v3.4s, v3.4s, v10.4s +str q13, [x0, #768] +sqrdmulh v13.4S, v21.4S, v29.s[0] +str q22, [x0, #832] +mul v21.4S, v21.4S,v30.s[0] +ldr q22, [x0, #16] +sub v10.4s, v15.4s, v17.4s +mla v19.4S, v20.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +str q3, [x0, #896] +sqrdmulh v3.4S, v1.4S, v29.s[0] +str q2, [x0, #960] +mul v1.4S, v1.4S,v30.s[0] +ldr q2, [x0, #80] +sub v17.4s, v9.4s, v19.4s +mla v21.4S, v13.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v12.4S, v29.s[0] +ldr q13, [x0, #144] +mul v12.4S, v12.4S,v30.s[0] +sub v20.4s, v16.4s, v21.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v29.s[0] +ldr q3, [x0, #208] +mul v0.4S, v0.4S,v30.s[0] +sub v11.4s, v18.4s, v1.4s +mla v12.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v19.4s, v22.4s, v12.4s +mla v0.4S, v21.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v2.4s, v0.4s +mla v8.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v1.4s, v13.4s, v8.4s +mla v14.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v12.4s, v3.4s, v14.4s +mla v16.4S, v0.4S, v31.s[0] +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +sub v0.4s, v13.4s, v16.4s +mla v18.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v9.4S, v29.s[1] +mul v9.4S, v9.4S,v30.s[1] +sub v8.4s, v3.4s, v18.4s +mla v15.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v14.4s, v22.4s, v15.4s +mla v9.4S, v16.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v29.s[2] +mul v11.4S, 
v11.4S,v30.s[2] +sub v16.4s, v2.4s, v9.4s +mla v20.4S, v18.4S, v31.s[0] +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v20.4s +mla v11.4S, v15.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v15.4s, v12.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +sub v9.4s, v19.4s, v10.4s +mla v17.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v27.s[0] +mul v3.4S, v3.4S,v28.s[0] +sub v20.4s, v21.4s, v17.4s +mla v13.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v11.4s, v22.4s, v13.4s +mla v3.4S, v10.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v10.4s, v2.4s, v3.4s +mla v0.4S, v17.4S, v31.s[0] +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v17.4s, v14.4s, v0.4s +mla v8.4S, v13.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +sub v13.4s, v16.4s, v8.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v3.4s, v19.4s, v1.4s +mla v12.4S, v0.4S, v31.s[0] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v0.4s, v21.4s, v12.4s +mla v18.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v25.s[1] +mul v10.4S, v10.4S,v26.s[1] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v12.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v12.4s, v22.4s, v2.4s +mla v10.4S, v18.4S, v31.s[0] +add v22.4s, v22.4s, v2.4s 
+sqrdmulh v2.4S, v13.4S, v25.s[3] +mul v13.4S, v13.4S,v26.s[3] +sub v18.4s, v11.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +str q22, [x0, #16] +sqrdmulh v22.4S, v21.4S, v23.s[0] +str q12, [x0, #80] +mul v21.4S, v21.4S,v24.s[0] +sub v12.4s, v14.4s, v16.4s +mla v13.4S, v2.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +str q11, [x0, #144] +sqrdmulh v11.4S, v0.4S, v23.s[1] +str q18, [x0, #208] +mul v0.4S, v0.4S,v24.s[1] +sub v18.4s, v17.4s, v13.4s +mla v21.4S, v22.4S, v31.s[0] +add v17.4s, v17.4s, v13.4s +str q14, [x0, #272] +sqrdmulh v14.4S, v20.4S, v23.s[2] +str q12, [x0, #336] +mul v20.4S, v20.4S,v24.s[2] +sub v12.4s, v19.4s, v21.4s +mla v0.4S, v11.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +str q17, [x0, #400] +sqrdmulh v17.4S, v1.4S, v23.s[3] +str q18, [x0, #464] +mul v1.4S, v1.4S,v24.s[3] +sub v18.4s, v3.4s, v0.4s +mla v20.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v0.4s +str q19, [x0, #528] +str q12, [x0, #592] +sub v12.4s, v9.4s, v20.4s +mla v1.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v20.4s +str q3, [x0, #656] +str q18, [x0, #720] +sub v18.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +str q9, [x0, #784] +str q12, [x0, #848] +str q8, [x0, #912] +str q18, [x0, #976] +ldr q4, [x0, #32] +ldr q5, [x0, #48] +ldr q6, [x17, #+128] +ldr q7, [x17, #+144] +ldr q15, [x17, #+256] +ldr q10, [x0, #96] +ldr q2, [x17, #+272] +ldr q16, [x0, #112] +sqrdmulh v22.4S, v4.4S, v7.s[0] +mul v4.4S, v4.4S,v6.s[0] +sqrdmulh v13.4S, v5.4S, v7.s[0] +mul v5.4S, v5.4S,v6.s[0] +mla v4.4S, v22.4S, v31.s[0] +ldr q22, [x0, #0] +sqrdmulh v11.4S, v10.4S, v2.s[0] +ldr q21, [x0, #16] +mul v10.4S, v10.4S,v15.s[0] +mla v5.4S, v13.4S, v31.s[0] +sub v13.4s, v22.4s, v4.4s +add v22.4s, v22.4s, v4.4s +sqrdmulh v4.4S, v16.4S, v2.s[0] +ldr q14, [x0, #160] +mul v16.4S, v16.4S,v15.s[0] +ldr q0, [x0, #176] +mla v10.4S, v11.4S, v31.s[0] +ldr q11, [x0, #64] +sub v19.4s, v21.4s, v5.4s +add v21.4s, v21.4s, v5.4s +sqrdmulh v5.4S, v21.4S, v7.s[1] +ldr q17, [x0, #128] +mul v21.4S, v21.4S,v6.s[1] +ldr q20, 
[x0, #144] +mla v16.4S, v4.4S, v31.s[0] +ldr q4, [x0, #80] +sub v3.4s, v11.4s, v10.4s +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v7.s[2] +ldr q1, [x17, #+384] +mul v19.4S, v19.4S,v6.s[2] +ldr q9, [x17, #+400] +mla v21.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v16.4s +add v4.4s, v4.4s, v16.4s +sqrdmulh v16.4S, v4.4S, v2.s[1] +ldr q12, [x0, #224] +mul v4.4S, v4.4S,v15.s[1] +ldr q8, [x0, #240] +mla v19.4S, v10.4S, v31.s[0] +sub v10.4s, v22.4s, v21.4s +add v22.4s, v22.4s, v21.4s +sqrdmulh v7.4S, v5.4S, v2.s[2] +ldr q6, [x0, #192] +mul v5.4S, v5.4S,v15.s[2] +ldr q21, [x0, #208] +mla v4.4S, v16.4S, v31.s[0] +sub v16.4s, v13.4s, v19.4s +add v13.4s, v13.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v9.s[0] +ldr q18, [x17, #+512] +mul v14.4S, v14.4S,v1.s[0] +ldr q30, [x17, #+528] +trn1 v29.4S, v22.4S, v10.4S +trn2 v28.4S, v22.4S, v10.4S +mla v5.4S, v7.4S, v31.s[0] +sub v7.4s, v11.4s, v4.4s +add v11.4s, v11.4s, v4.4s +sqrdmulh v2.4S, v0.4S, v9.s[0] +mul v0.4S, v0.4S,v1.s[0] +trn1 v15.4S, v13.4S, v16.4S +trn2 v4.4S, v13.4S, v16.4S +mla v14.4S, v19.4S, v31.s[0] +sub v19.4s, v3.4s, v5.4s +add v3.4s, v3.4s, v5.4s +sqrdmulh v5.4S, v12.4S, v30.s[0] +mul v12.4S, v12.4S,v18.s[0] +trn2 v13.2D, v29.2D, v15.2D +trn2 v16.2D, v28.2D, v4.2D +mla v0.4S, v2.4S, v31.s[0] +sub v2.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +sqrdmulh v14.4S, v8.4S, v30.s[0] +mul v8.4S, v8.4S,v18.s[0] +trn1 v22.2D, v29.2D, v15.2D +trn1 v10.2D, v28.2D, v4.2D +mla v12.4S, v5.4S, v31.s[0] +sub v5.4s, v20.4s, v0.4s +add v20.4s, v20.4s, v0.4s +sqrdmulh v0.4S, v20.4S, v9.s[1] +mul v20.4S, v20.4S,v1.s[1] +trn1 v4.4S, v11.4S, v7.4S +trn2 v28.4S, v11.4S, v7.4S +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v6.4s, v12.4s +add v6.4s, v6.4s, v12.4s +sqrdmulh v12.4S, v5.4S, v9.s[2] +mul v5.4S, v5.4S,v1.s[2] +trn1 v15.4S, v3.4S, v19.4S +trn2 v29.4S, v3.4S, v19.4S +mla v20.4S, v0.4S, v31.s[0] +sub v0.4s, v21.4s, v8.4s +add v21.4s, v21.4s, v8.4s +sqrdmulh v8.4S, v21.4S, v30.s[1] +mul v21.4S, v21.4S,v18.s[1] +trn2 v3.2D, v4.2D, 
v15.2D +trn2 v19.2D, v28.2D, v29.2D +mla v5.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v20.4s +add v17.4s, v17.4s, v20.4s +sqrdmulh v9.4S, v0.4S, v30.s[2] +mul v0.4S, v0.4S,v18.s[2] +trn1 v11.2D, v4.2D, v15.2D +trn1 v7.2D, v28.2D, v29.2D +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v2.4s, v5.4s +add v2.4s, v2.4s, v5.4s +mla v0.4S, v9.4S, v31.s[0] +sub v9.4s, v6.4s, v21.4s +add v6.4s, v6.4s, v21.4s +sub v30.4s, v14.4s, v0.4s +add v14.4s, v14.4s, v0.4s +ldr q0, [x17, #+160] +ldr q18, [x17, #+176] +sqrdmulh v21.4S, v13.4S, v18.4S +mul v13.4S, v13.4S,v0.4S +trn1 v5.4S, v17.4S, v12.4S +trn2 v29.4S, v17.4S, v12.4S +sqrdmulh v28.4S, v16.4S, v18.4S +mul v16.4S, v16.4S,v0.4S +trn1 v18.4S, v2.4S, v8.4S +trn2 v0.4S, v2.4S, v8.4S +mla v13.4S, v21.4S, v31.s[0] +ldr q21, [x17, #+288] +ldr q15, [x17, #+304] +sqrdmulh v4.4S, v3.4S, v15.4S +mul v3.4S, v3.4S,v21.4S +trn2 v2.2D, v5.2D, v18.2D +trn2 v8.2D, v29.2D, v0.2D +mla v16.4S, v28.4S, v31.s[0] +sub v28.4s, v22.4s, v13.4s +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v19.4S, v15.4S +mul v19.4S, v19.4S,v21.4S +trn1 v17.2D, v5.2D, v18.2D +trn1 v12.2D, v29.2D, v0.2D +mla v3.4S, v4.4S, v31.s[0] +sub v4.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +ldr q16, [x17, #+192] +ldr q0, [x17, #+208] +sqrdmulh v29.4S, v10.4S, v0.4S +mul v10.4S, v10.4S,v16.4S +trn1 v0.4S, v6.4S, v9.4S +trn2 v16.4S, v6.4S, v9.4S +mla v19.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v3.4s +add v11.4s, v11.4s, v3.4s +ldr q3, [x17, #+224] +ldr q18, [x17, #+240] +sqrdmulh v5.4S, v4.4S, v18.4S +mul v4.4S, v4.4S,v3.4S +trn1 v18.4S, v14.4S, v30.4S +trn2 v3.4S, v14.4S, v30.4S +mla v10.4S, v29.4S, v31.s[0] +sub v29.4s, v7.4s, v19.4s +add v7.4s, v7.4s, v19.4s +ldr q19, [x17, #+320] +ldr q15, [x17, #+336] +sqrdmulh v21.4S, v7.4S, v15.4S +mul v7.4S, v7.4S,v19.4S +trn2 v14.2D, v0.2D, v18.2D +trn2 v30.2D, v16.2D, v3.2D +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v22.4s, v10.4s +add v22.4s, v22.4s, v10.4s +ldr q10, [x17, #+352] +ldr q15, [x17, #+368] +sqrdmulh v19.4S, v29.4S, v15.4S 
+mul v29.4S, v29.4S,v10.4S +trn1 v6.2D, v0.2D, v18.2D +trn1 v9.2D, v16.2D, v3.2D +mla v7.4S, v21.4S, v31.s[0] +sub v21.4s, v28.4s, v4.4s +add v28.4s, v28.4s, v4.4s +mla v29.4S, v19.4S, v31.s[0] +sub v19.4s, v11.4s, v7.4s +add v11.4s, v11.4s, v7.4s +sub v7.4s, v13.4s, v29.4s +add v13.4s, v13.4s, v29.4s +ldr q29, [x17, #+416] +ldr q4, [x17, #+432] +sqrdmulh v3.4S, v2.4S, v4.4S +mul v2.4S, v2.4S,v29.4S +str q22, [x0, #0] +sqrdmulh v22.4S, v8.4S, v4.4S +str q5, [x0, #16] +mul v8.4S, v8.4S,v29.4S +str q28, [x0, #32] +mla v2.4S, v3.4S, v31.s[0] +ldr q3, [x17, #+544] +ldr q28, [x17, #+560] +sqrdmulh v4.4S, v14.4S, v28.4S +str q11, [x0, #64] +mul v14.4S, v14.4S,v3.4S +str q21, [x0, #48] +mla v8.4S, v22.4S, v31.s[0] +str q19, [x0, #80] +sub v19.4s, v17.4s, v2.4s +add v17.4s, v17.4s, v2.4s +sqrdmulh v2.4S, v30.4S, v28.4S +mul v30.4S, v30.4S,v3.4S +str q13, [x0, #96] +mla v14.4S, v4.4S, v31.s[0] +sub v4.4s, v12.4s, v8.4s +add v12.4s, v12.4s, v8.4s +ldr q8, [x17, #+448] +ldr q13, [x17, #+464] +sqrdmulh v28.4S, v12.4S, v13.4S +mul v12.4S, v12.4S,v8.4S +str q7, [x0, #112] +mla v30.4S, v2.4S, v31.s[0] +sub v2.4s, v6.4s, v14.4s +add v6.4s, v6.4s, v14.4s +ldr q14, [x17, #+480] +ldr q7, [x17, #+496] +sqrdmulh v13.4S, v4.4S, v7.4S +mul v4.4S, v4.4S,v14.4S +mla v12.4S, v28.4S, v31.s[0] +sub v28.4s, v9.4s, v30.4s +add v9.4s, v9.4s, v30.4s +ldr q30, [x17, #+576] +ldr q7, [x17, #+592] +sqrdmulh v14.4S, v9.4S, v7.4S +mul v9.4S, v9.4S,v30.4S +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v12.4s +add v17.4s, v17.4s, v12.4s +ldr q12, [x17, #+608] +ldr q7, [x17, #+624] +sqrdmulh v30.4S, v28.4S, v7.4S +mul v28.4S, v28.4S,v12.4S +ldr q7, [x0, #288] +mla v9.4S, v14.4S, v31.s[0] +ldr q14, [x0, #304] +sub v12.4s, v19.4s, v4.4s +ldr q8, [x17, #+640] +add v19.4s, v19.4s, v4.4s +ldr q4, [x17, #+656] +mla v28.4S, v30.4S, v31.s[0] +ldr q30, [x17, #+768] +sub v3.4s, v6.4s, v9.4s +ldr q22, [x0, #352] +add v6.4s, v6.4s, v9.4s +ldr q9, [x17, #+784] +sub v21.4s, v2.4s, v28.4s +ldr q11, [x0, #368] +add 
v2.4s, v2.4s, v28.4s +sqrdmulh v28.4S, v7.4S, v4.s[0] +mul v7.4S, v7.4S,v8.s[0] +sqrdmulh v29.4S, v14.4S, v4.s[0] +mul v14.4S, v14.4S,v8.s[0] +str q17, [x0, #128] +str q13, [x0, #144] +str q19, [x0, #160] +str q12, [x0, #176] +mla v7.4S, v28.4S, v31.s[0] +ldr q28, [x0, #256] +sqrdmulh v12.4S, v22.4S, v9.s[0] +ldr q19, [x0, #272] +mul v22.4S, v22.4S,v30.s[0] +str q6, [x0, #192] +str q3, [x0, #208] +str q2, [x0, #224] +str q21, [x0, #240] +mla v14.4S, v29.4S, v31.s[0] +sub v29.4s, v28.4s, v7.4s +add v28.4s, v28.4s, v7.4s +sqrdmulh v7.4S, v11.4S, v9.s[0] +ldr q21, [x0, #416] +mul v11.4S, v11.4S,v30.s[0] +ldr q2, [x0, #432] +mla v22.4S, v12.4S, v31.s[0] +ldr q12, [x0, #320] +sub v3.4s, v19.4s, v14.4s +add v19.4s, v19.4s, v14.4s +sqrdmulh v14.4S, v19.4S, v4.s[1] +ldr q6, [x0, #384] +mul v19.4S, v19.4S,v8.s[1] +ldr q13, [x0, #400] +mla v11.4S, v7.4S, v31.s[0] +ldr q7, [x0, #336] +sub v17.4s, v12.4s, v22.4s +add v12.4s, v12.4s, v22.4s +sqrdmulh v22.4S, v3.4S, v4.s[2] +ldr q5, [x17, #+896] +mul v3.4S, v3.4S,v8.s[2] +ldr q16, [x17, #+912] +mla v19.4S, v14.4S, v31.s[0] +sub v14.4s, v7.4s, v11.4s +add v7.4s, v7.4s, v11.4s +sqrdmulh v11.4S, v7.4S, v9.s[1] +ldr q18, [x0, #480] +mul v7.4S, v7.4S,v30.s[1] +ldr q0, [x0, #496] +mla v3.4S, v22.4S, v31.s[0] +sub v22.4s, v28.4s, v19.4s +add v28.4s, v28.4s, v19.4s +sqrdmulh v4.4S, v14.4S, v9.s[2] +ldr q8, [x0, #448] +mul v14.4S, v14.4S,v30.s[2] +ldr q19, [x0, #464] +mla v7.4S, v11.4S, v31.s[0] +sub v11.4s, v29.4s, v3.4s +add v29.4s, v29.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v16.s[0] +ldr q15, [x17, #+1024] +mul v21.4S, v21.4S,v5.s[0] +ldr q10, [x17, #+1040] +trn1 v1.4S, v28.4S, v22.4S +trn2 v20.4S, v28.4S, v22.4S +mla v14.4S, v4.4S, v31.s[0] +sub v4.4s, v12.4s, v7.4s +add v12.4s, v12.4s, v7.4s +sqrdmulh v9.4S, v2.4S, v16.s[0] +mul v2.4S, v2.4S,v5.s[0] +trn1 v30.4S, v29.4S, v11.4S +trn2 v7.4S, v29.4S, v11.4S +mla v21.4S, v3.4S, v31.s[0] +sub v3.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +sqrdmulh v14.4S, v18.4S, v10.s[0] +mul v18.4S, 
v18.4S,v15.s[0] +trn2 v29.2D, v1.2D, v30.2D +trn2 v11.2D, v20.2D, v7.2D +mla v2.4S, v9.4S, v31.s[0] +sub v9.4s, v6.4s, v21.4s +add v6.4s, v6.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v10.s[0] +mul v0.4S, v0.4S,v15.s[0] +trn1 v28.2D, v1.2D, v30.2D +trn1 v22.2D, v20.2D, v7.2D +mla v18.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v2.4s +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v13.4S, v16.s[1] +mul v13.4S, v13.4S,v5.s[1] +trn1 v7.4S, v12.4S, v4.4S +trn2 v20.4S, v12.4S, v4.4S +mla v0.4S, v21.4S, v31.s[0] +sub v21.4s, v8.4s, v18.4s +add v8.4s, v8.4s, v18.4s +sqrdmulh v18.4S, v14.4S, v16.s[2] +mul v14.4S, v14.4S,v5.s[2] +trn1 v30.4S, v17.4S, v3.4S +trn2 v1.4S, v17.4S, v3.4S +mla v13.4S, v2.4S, v31.s[0] +sub v2.4s, v19.4s, v0.4s +add v19.4s, v19.4s, v0.4s +sqrdmulh v0.4S, v19.4S, v10.s[1] +mul v19.4S, v19.4S,v15.s[1] +trn2 v17.2D, v7.2D, v30.2D +trn2 v3.2D, v20.2D, v1.2D +mla v14.4S, v18.4S, v31.s[0] +sub v18.4s, v6.4s, v13.4s +add v6.4s, v6.4s, v13.4s +sqrdmulh v16.4S, v2.4S, v10.s[2] +mul v2.4S, v2.4S,v15.s[2] +trn1 v12.2D, v7.2D, v30.2D +trn1 v4.2D, v20.2D, v1.2D +mla v19.4S, v0.4S, v31.s[0] +sub v0.4s, v9.4s, v14.4s +add v9.4s, v9.4s, v14.4s +mla v2.4S, v16.4S, v31.s[0] +sub v16.4s, v8.4s, v19.4s +add v8.4s, v8.4s, v19.4s +sub v10.4s, v21.4s, v2.4s +add v21.4s, v21.4s, v2.4s +ldr q2, [x17, #+672] +ldr q15, [x17, #+688] +sqrdmulh v19.4S, v29.4S, v15.4S +mul v29.4S, v29.4S,v2.4S +trn1 v14.4S, v6.4S, v18.4S +trn2 v1.4S, v6.4S, v18.4S +sqrdmulh v20.4S, v11.4S, v15.4S +mul v11.4S, v11.4S,v2.4S +trn1 v15.4S, v9.4S, v0.4S +trn2 v2.4S, v9.4S, v0.4S +mla v29.4S, v19.4S, v31.s[0] +ldr q19, [x17, #+800] +ldr q30, [x17, #+816] +sqrdmulh v7.4S, v17.4S, v30.4S +mul v17.4S, v17.4S,v19.4S +trn2 v9.2D, v14.2D, v15.2D +trn2 v0.2D, v1.2D, v2.2D +mla v11.4S, v20.4S, v31.s[0] +sub v20.4s, v28.4s, v29.4s +add v28.4s, v28.4s, v29.4s +sqrdmulh v29.4S, v3.4S, v30.4S +mul v3.4S, v3.4S,v19.4S +trn1 v6.2D, v14.2D, v15.2D +trn1 v18.2D, v1.2D, v2.2D +mla v17.4S, v7.4S, v31.s[0] +sub v7.4s, v22.4s, v11.4s 
+add v22.4s, v22.4s, v11.4s +ldr q11, [x17, #+704] +ldr q2, [x17, #+720] +sqrdmulh v1.4S, v22.4S, v2.4S +mul v22.4S, v22.4S,v11.4S +trn1 v2.4S, v8.4S, v16.4S +trn2 v11.4S, v8.4S, v16.4S +mla v3.4S, v29.4S, v31.s[0] +sub v29.4s, v12.4s, v17.4s +add v12.4s, v12.4s, v17.4s +ldr q17, [x17, #+736] +ldr q15, [x17, #+752] +sqrdmulh v14.4S, v7.4S, v15.4S +mul v7.4S, v7.4S,v17.4S +trn1 v15.4S, v21.4S, v10.4S +trn2 v17.4S, v21.4S, v10.4S +mla v22.4S, v1.4S, v31.s[0] +sub v1.4s, v4.4s, v3.4s +add v4.4s, v4.4s, v3.4s +ldr q3, [x17, #+832] +ldr q30, [x17, #+848] +sqrdmulh v19.4S, v4.4S, v30.4S +mul v4.4S, v4.4S,v3.4S +trn2 v21.2D, v2.2D, v15.2D +trn2 v10.2D, v11.2D, v17.2D +mla v7.4S, v14.4S, v31.s[0] +sub v14.4s, v28.4s, v22.4s +add v28.4s, v28.4s, v22.4s +ldr q22, [x17, #+864] +ldr q30, [x17, #+880] +sqrdmulh v3.4S, v1.4S, v30.4S +mul v1.4S, v1.4S,v22.4S +trn1 v8.2D, v2.2D, v15.2D +trn1 v16.2D, v11.2D, v17.2D +mla v4.4S, v19.4S, v31.s[0] +sub v19.4s, v20.4s, v7.4s +add v20.4s, v20.4s, v7.4s +mla v1.4S, v3.4S, v31.s[0] +sub v3.4s, v12.4s, v4.4s +add v12.4s, v12.4s, v4.4s +sub v4.4s, v29.4s, v1.4s +add v29.4s, v29.4s, v1.4s +ldr q1, [x17, #+928] +ldr q7, [x17, #+944] +sqrdmulh v17.4S, v9.4S, v7.4S +mul v9.4S, v9.4S,v1.4S +str q28, [x0, #256] +sqrdmulh v28.4S, v0.4S, v7.4S +str q14, [x0, #272] +mul v0.4S, v0.4S,v1.4S +str q20, [x0, #288] +mla v9.4S, v17.4S, v31.s[0] +ldr q17, [x17, #+1056] +ldr q20, [x17, #+1072] +sqrdmulh v7.4S, v21.4S, v20.4S +str q12, [x0, #320] +mul v21.4S, v21.4S,v17.4S +str q19, [x0, #304] +mla v0.4S, v28.4S, v31.s[0] +str q3, [x0, #336] +sub v3.4s, v6.4s, v9.4s +add v6.4s, v6.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v20.4S +mul v10.4S, v10.4S,v17.4S +str q29, [x0, #352] +mla v21.4S, v7.4S, v31.s[0] +sub v7.4s, v18.4s, v0.4s +add v18.4s, v18.4s, v0.4s +ldr q0, [x17, #+960] +ldr q29, [x17, #+976] +sqrdmulh v20.4S, v18.4S, v29.4S +mul v18.4S, v18.4S,v0.4S +str q4, [x0, #368] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v21.4s +add v8.4s, v8.4s, v21.4s +ldr q21, 
[x17, #+992] +ldr q4, [x17, #+1008] +sqrdmulh v29.4S, v7.4S, v4.4S +mul v7.4S, v7.4S,v21.4S +mla v18.4S, v20.4S, v31.s[0] +sub v20.4s, v16.4s, v10.4s +add v16.4s, v16.4s, v10.4s +ldr q10, [x17, #+1088] +ldr q4, [x17, #+1104] +sqrdmulh v21.4S, v16.4S, v4.4S +mul v16.4S, v16.4S,v10.4S +mla v7.4S, v29.4S, v31.s[0] +sub v29.4s, v6.4s, v18.4s +add v6.4s, v6.4s, v18.4s +ldr q18, [x17, #+1120] +ldr q4, [x17, #+1136] +sqrdmulh v10.4S, v20.4S, v4.4S +mul v20.4S, v20.4S,v18.4S +ldr q4, [x0, #544] +mla v16.4S, v21.4S, v31.s[0] +ldr q21, [x0, #560] +sub v18.4s, v3.4s, v7.4s +ldr q0, [x17, #+1152] +add v3.4s, v3.4s, v7.4s +ldr q7, [x17, #+1168] +mla v20.4S, v10.4S, v31.s[0] +ldr q10, [x17, #+1280] +sub v17.4s, v8.4s, v16.4s +ldr q28, [x0, #608] +add v8.4s, v8.4s, v16.4s +ldr q16, [x17, #+1296] +sub v19.4s, v9.4s, v20.4s +ldr q12, [x0, #624] +add v9.4s, v9.4s, v20.4s +sqrdmulh v20.4S, v4.4S, v7.s[0] +mul v4.4S, v4.4S,v0.s[0] +sqrdmulh v1.4S, v21.4S, v7.s[0] +mul v21.4S, v21.4S,v0.s[0] +str q6, [x0, #384] +str q29, [x0, #400] +str q3, [x0, #416] +str q18, [x0, #432] +mla v4.4S, v20.4S, v31.s[0] +ldr q20, [x0, #512] +sqrdmulh v18.4S, v28.4S, v16.s[0] +ldr q3, [x0, #528] +mul v28.4S, v28.4S,v10.s[0] +str q8, [x0, #448] +str q17, [x0, #464] +str q9, [x0, #480] +str q19, [x0, #496] +mla v21.4S, v1.4S, v31.s[0] +sub v1.4s, v20.4s, v4.4s +add v20.4s, v20.4s, v4.4s +sqrdmulh v4.4S, v12.4S, v16.s[0] +ldr q19, [x0, #672] +mul v12.4S, v12.4S,v10.s[0] +ldr q9, [x0, #688] +mla v28.4S, v18.4S, v31.s[0] +ldr q18, [x0, #576] +sub v17.4s, v3.4s, v21.4s +add v3.4s, v3.4s, v21.4s +sqrdmulh v21.4S, v3.4S, v7.s[1] +ldr q8, [x0, #640] +mul v3.4S, v3.4S,v0.s[1] +ldr q29, [x0, #656] +mla v12.4S, v4.4S, v31.s[0] +ldr q4, [x0, #592] +sub v6.4s, v18.4s, v28.4s +add v18.4s, v18.4s, v28.4s +sqrdmulh v28.4S, v17.4S, v7.s[2] +ldr q14, [x17, #+1408] +mul v17.4S, v17.4S,v0.s[2] +ldr q11, [x17, #+1424] +mla v3.4S, v21.4S, v31.s[0] +sub v21.4s, v4.4s, v12.4s +add v4.4s, v4.4s, v12.4s +sqrdmulh v12.4S, v4.4S, 
v16.s[1] +ldr q15, [x0, #736] +mul v4.4S, v4.4S,v10.s[1] +ldr q2, [x0, #752] +mla v17.4S, v28.4S, v31.s[0] +sub v28.4s, v20.4s, v3.4s +add v20.4s, v20.4s, v3.4s +sqrdmulh v7.4S, v21.4S, v16.s[2] +ldr q0, [x0, #704] +mul v21.4S, v21.4S,v10.s[2] +ldr q3, [x0, #720] +mla v4.4S, v12.4S, v31.s[0] +sub v12.4s, v1.4s, v17.4s +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v19.4S, v11.s[0] +ldr q30, [x17, #+1536] +mul v19.4S, v19.4S,v14.s[0] +ldr q22, [x17, #+1552] +trn1 v5.4S, v20.4S, v28.4S +trn2 v13.4S, v20.4S, v28.4S +mla v21.4S, v7.4S, v31.s[0] +sub v7.4s, v18.4s, v4.4s +add v18.4s, v18.4s, v4.4s +sqrdmulh v16.4S, v9.4S, v11.s[0] +mul v9.4S, v9.4S,v14.s[0] +trn1 v10.4S, v1.4S, v12.4S +trn2 v4.4S, v1.4S, v12.4S +mla v19.4S, v17.4S, v31.s[0] +sub v17.4s, v6.4s, v21.4s +add v6.4s, v6.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v22.s[0] +mul v15.4S, v15.4S,v30.s[0] +trn2 v1.2D, v5.2D, v10.2D +trn2 v12.2D, v13.2D, v4.2D +mla v9.4S, v16.4S, v31.s[0] +sub v16.4s, v8.4s, v19.4s +add v8.4s, v8.4s, v19.4s +sqrdmulh v19.4S, v2.4S, v22.s[0] +mul v2.4S, v2.4S,v30.s[0] +trn1 v20.2D, v5.2D, v10.2D +trn1 v28.2D, v13.2D, v4.2D +mla v15.4S, v21.4S, v31.s[0] +sub v21.4s, v29.4s, v9.4s +add v29.4s, v29.4s, v9.4s +sqrdmulh v9.4S, v29.4S, v11.s[1] +mul v29.4S, v29.4S,v14.s[1] +trn1 v4.4S, v18.4S, v7.4S +trn2 v13.4S, v18.4S, v7.4S +mla v2.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v15.4s +add v0.4s, v0.4s, v15.4s +sqrdmulh v15.4S, v21.4S, v11.s[2] +mul v21.4S, v21.4S,v14.s[2] +trn1 v10.4S, v6.4S, v17.4S +trn2 v5.4S, v6.4S, v17.4S +mla v29.4S, v9.4S, v31.s[0] +sub v9.4s, v3.4s, v2.4s +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v22.s[1] +mul v3.4S, v3.4S,v30.s[1] +trn2 v6.2D, v4.2D, v10.2D +trn2 v17.2D, v13.2D, v5.2D +mla v21.4S, v15.4S, v31.s[0] +sub v15.4s, v8.4s, v29.4s +add v8.4s, v8.4s, v29.4s +sqrdmulh v11.4S, v9.4S, v22.s[2] +mul v9.4S, v9.4S,v30.s[2] +trn1 v18.2D, v4.2D, v10.2D +trn1 v7.2D, v13.2D, v5.2D +mla v3.4S, v2.4S, v31.s[0] +sub v2.4s, v16.4s, v21.4s +add v16.4s, v16.4s, v21.4s +mla 
v9.4S, v11.4S, v31.s[0] +sub v11.4s, v0.4s, v3.4s +add v0.4s, v0.4s, v3.4s +sub v22.4s, v19.4s, v9.4s +add v19.4s, v19.4s, v9.4s +ldr q9, [x17, #+1184] +ldr q30, [x17, #+1200] +sqrdmulh v3.4S, v1.4S, v30.4S +mul v1.4S, v1.4S,v9.4S +trn1 v21.4S, v8.4S, v15.4S +trn2 v5.4S, v8.4S, v15.4S +sqrdmulh v13.4S, v12.4S, v30.4S +mul v12.4S, v12.4S,v9.4S +trn1 v30.4S, v16.4S, v2.4S +trn2 v9.4S, v16.4S, v2.4S +mla v1.4S, v3.4S, v31.s[0] +ldr q3, [x17, #+1312] +ldr q10, [x17, #+1328] +sqrdmulh v4.4S, v6.4S, v10.4S +mul v6.4S, v6.4S,v3.4S +trn2 v16.2D, v21.2D, v30.2D +trn2 v2.2D, v5.2D, v9.2D +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v1.4s +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v10.4S +mul v17.4S, v17.4S,v3.4S +trn1 v8.2D, v21.2D, v30.2D +trn1 v15.2D, v5.2D, v9.2D +mla v6.4S, v4.4S, v31.s[0] +sub v4.4s, v28.4s, v12.4s +add v28.4s, v28.4s, v12.4s +ldr q12, [x17, #+1216] +ldr q9, [x17, #+1232] +sqrdmulh v5.4S, v28.4S, v9.4S +mul v28.4S, v28.4S,v12.4S +trn1 v9.4S, v0.4S, v11.4S +trn2 v12.4S, v0.4S, v11.4S +mla v17.4S, v1.4S, v31.s[0] +sub v1.4s, v18.4s, v6.4s +add v18.4s, v18.4s, v6.4s +ldr q6, [x17, #+1248] +ldr q30, [x17, #+1264] +sqrdmulh v21.4S, v4.4S, v30.4S +mul v4.4S, v4.4S,v6.4S +trn1 v30.4S, v19.4S, v22.4S +trn2 v6.4S, v19.4S, v22.4S +mla v28.4S, v5.4S, v31.s[0] +sub v5.4s, v7.4s, v17.4s +add v7.4s, v7.4s, v17.4s +ldr q17, [x17, #+1344] +ldr q10, [x17, #+1360] +sqrdmulh v3.4S, v7.4S, v10.4S +mul v7.4S, v7.4S,v17.4S +trn2 v19.2D, v9.2D, v30.2D +trn2 v22.2D, v12.2D, v6.2D +mla v4.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v28.4s +add v20.4s, v20.4s, v28.4s +ldr q28, [x17, #+1376] +ldr q10, [x17, #+1392] +sqrdmulh v17.4S, v5.4S, v10.4S +mul v5.4S, v5.4S,v28.4S +trn1 v0.2D, v9.2D, v30.2D +trn1 v11.2D, v12.2D, v6.2D +mla v7.4S, v3.4S, v31.s[0] +sub v3.4s, v13.4s, v4.4s +add v13.4s, v13.4s, v4.4s +mla v5.4S, v17.4S, v31.s[0] +sub v17.4s, v18.4s, v7.4s +add v18.4s, v18.4s, v7.4s +sub v7.4s, v1.4s, v5.4s +add v1.4s, v1.4s, v5.4s +ldr q5, [x17, #+1440] +ldr q4, 
[x17, #+1456] +sqrdmulh v6.4S, v16.4S, v4.4S +mul v16.4S, v16.4S,v5.4S +str q20, [x0, #512] +sqrdmulh v20.4S, v2.4S, v4.4S +str q21, [x0, #528] +mul v2.4S, v2.4S,v5.4S +str q13, [x0, #544] +mla v16.4S, v6.4S, v31.s[0] +ldr q6, [x17, #+1568] +ldr q13, [x17, #+1584] +sqrdmulh v4.4S, v19.4S, v13.4S +str q18, [x0, #576] +mul v19.4S, v19.4S,v6.4S +str q3, [x0, #560] +mla v2.4S, v20.4S, v31.s[0] +str q17, [x0, #592] +sub v17.4s, v8.4s, v16.4s +add v8.4s, v8.4s, v16.4s +sqrdmulh v16.4S, v22.4S, v13.4S +mul v22.4S, v22.4S,v6.4S +str q1, [x0, #608] +mla v19.4S, v4.4S, v31.s[0] +sub v4.4s, v15.4s, v2.4s +add v15.4s, v15.4s, v2.4s +ldr q2, [x17, #+1472] +ldr q1, [x17, #+1488] +sqrdmulh v13.4S, v15.4S, v1.4S +mul v15.4S, v15.4S,v2.4S +str q7, [x0, #624] +mla v22.4S, v16.4S, v31.s[0] +sub v16.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +ldr q19, [x17, #+1504] +ldr q7, [x17, #+1520] +sqrdmulh v1.4S, v4.4S, v7.4S +mul v4.4S, v4.4S,v19.4S +mla v15.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +ldr q22, [x17, #+1600] +ldr q7, [x17, #+1616] +sqrdmulh v19.4S, v11.4S, v7.4S +mul v11.4S, v11.4S,v22.4S +mla v4.4S, v1.4S, v31.s[0] +sub v1.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +ldr q15, [x17, #+1632] +ldr q7, [x17, #+1648] +sqrdmulh v22.4S, v13.4S, v7.4S +mul v13.4S, v13.4S,v15.4S +ldr q7, [x0, #800] +mla v11.4S, v19.4S, v31.s[0] +ldr q19, [x0, #816] +sub v15.4s, v17.4s, v4.4s +ldr q2, [x17, #+1664] +add v17.4s, v17.4s, v4.4s +ldr q4, [x17, #+1680] +mla v13.4S, v22.4S, v31.s[0] +ldr q22, [x17, #+1792] +sub v6.4s, v0.4s, v11.4s +ldr q20, [x0, #864] +add v0.4s, v0.4s, v11.4s +ldr q11, [x17, #+1808] +sub v3.4s, v16.4s, v13.4s +ldr q18, [x0, #880] +add v16.4s, v16.4s, v13.4s +sqrdmulh v13.4S, v7.4S, v4.s[0] +mul v7.4S, v7.4S,v2.s[0] +sqrdmulh v5.4S, v19.4S, v4.s[0] +mul v19.4S, v19.4S,v2.s[0] +str q8, [x0, #640] +str q1, [x0, #656] +str q17, [x0, #672] +str q15, [x0, #688] +mla v7.4S, v13.4S, v31.s[0] +ldr q13, [x0, #768] +sqrdmulh v15.4S, v20.4S, v11.s[0] 
+ldr q17, [x0, #784] +mul v20.4S, v20.4S,v22.s[0] +str q0, [x0, #704] +str q6, [x0, #720] +str q16, [x0, #736] +str q3, [x0, #752] +mla v19.4S, v5.4S, v31.s[0] +sub v5.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +sqrdmulh v7.4S, v18.4S, v11.s[0] +ldr q3, [x0, #928] +mul v18.4S, v18.4S,v22.s[0] +ldr q16, [x0, #944] +mla v20.4S, v15.4S, v31.s[0] +ldr q15, [x0, #832] +sub v6.4s, v17.4s, v19.4s +add v17.4s, v17.4s, v19.4s +sqrdmulh v19.4S, v17.4S, v4.s[1] +ldr q0, [x0, #896] +mul v17.4S, v17.4S,v2.s[1] +ldr q1, [x0, #912] +mla v18.4S, v7.4S, v31.s[0] +ldr q7, [x0, #848] +sub v8.4s, v15.4s, v20.4s +add v15.4s, v15.4s, v20.4s +sqrdmulh v20.4S, v6.4S, v4.s[2] +ldr q21, [x17, #+1920] +mul v6.4S, v6.4S,v2.s[2] +ldr q12, [x17, #+1936] +mla v17.4S, v19.4S, v31.s[0] +sub v19.4s, v7.4s, v18.4s +add v7.4s, v7.4s, v18.4s +sqrdmulh v18.4S, v7.4S, v11.s[1] +ldr q30, [x0, #992] +mul v7.4S, v7.4S,v22.s[1] +ldr q9, [x0, #1008] +mla v6.4S, v20.4S, v31.s[0] +sub v20.4s, v13.4s, v17.4s +add v13.4s, v13.4s, v17.4s +sqrdmulh v4.4S, v19.4S, v11.s[2] +ldr q2, [x0, #960] +mul v19.4S, v19.4S,v22.s[2] +ldr q17, [x0, #976] +mla v7.4S, v18.4S, v31.s[0] +sub v18.4s, v5.4s, v6.4s +add v5.4s, v5.4s, v6.4s +sqrdmulh v6.4S, v3.4S, v12.s[0] +ldr q10, [x17, #+2048] +mul v3.4S, v3.4S,v21.s[0] +ldr q28, [x17, #+2064] +trn1 v14.4S, v13.4S, v20.4S +trn2 v29.4S, v13.4S, v20.4S +mla v19.4S, v4.4S, v31.s[0] +sub v4.4s, v15.4s, v7.4s +add v15.4s, v15.4s, v7.4s +sqrdmulh v11.4S, v16.4S, v12.s[0] +mul v16.4S, v16.4S,v21.s[0] +trn1 v22.4S, v5.4S, v18.4S +trn2 v7.4S, v5.4S, v18.4S +mla v3.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v19.4s +add v8.4s, v8.4s, v19.4s +sqrdmulh v19.4S, v30.4S, v28.s[0] +mul v30.4S, v30.4S,v10.s[0] +trn2 v5.2D, v14.2D, v22.2D +trn2 v18.2D, v29.2D, v7.2D +mla v16.4S, v11.4S, v31.s[0] +sub v11.4s, v0.4s, v3.4s +add v0.4s, v0.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v28.s[0] +mul v9.4S, v9.4S,v10.s[0] +trn1 v13.2D, v14.2D, v22.2D +trn1 v20.2D, v29.2D, v7.2D +mla v30.4S, v19.4S, v31.s[0] +sub v19.4s, 
v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +sqrdmulh v16.4S, v1.4S, v12.s[1] +mul v1.4S, v1.4S,v21.s[1] +trn1 v7.4S, v15.4S, v4.4S +trn2 v29.4S, v15.4S, v4.4S +mla v9.4S, v3.4S, v31.s[0] +sub v3.4s, v2.4s, v30.4s +add v2.4s, v2.4s, v30.4s +sqrdmulh v30.4S, v19.4S, v12.s[2] +mul v19.4S, v19.4S,v21.s[2] +trn1 v22.4S, v8.4S, v6.4S +trn2 v14.4S, v8.4S, v6.4S +mla v1.4S, v16.4S, v31.s[0] +sub v16.4s, v17.4s, v9.4s +add v17.4s, v17.4s, v9.4s +sqrdmulh v9.4S, v17.4S, v28.s[1] +mul v17.4S, v17.4S,v10.s[1] +trn2 v8.2D, v7.2D, v22.2D +trn2 v6.2D, v29.2D, v14.2D +mla v19.4S, v30.4S, v31.s[0] +sub v30.4s, v0.4s, v1.4s +add v0.4s, v0.4s, v1.4s +sqrdmulh v12.4S, v16.4S, v28.s[2] +mul v16.4S, v16.4S,v10.s[2] +trn1 v15.2D, v7.2D, v22.2D +trn1 v4.2D, v29.2D, v14.2D +mla v17.4S, v9.4S, v31.s[0] +sub v9.4s, v11.4s, v19.4s +add v11.4s, v11.4s, v19.4s +mla v16.4S, v12.4S, v31.s[0] +sub v12.4s, v2.4s, v17.4s +add v2.4s, v2.4s, v17.4s +sub v28.4s, v3.4s, v16.4s +add v3.4s, v3.4s, v16.4s +ldr q16, [x17, #+1696] +ldr q10, [x17, #+1712] +sqrdmulh v17.4S, v5.4S, v10.4S +mul v5.4S, v5.4S,v16.4S +trn1 v19.4S, v0.4S, v30.4S +trn2 v14.4S, v0.4S, v30.4S +sqrdmulh v29.4S, v18.4S, v10.4S +mul v18.4S, v18.4S,v16.4S +trn1 v10.4S, v11.4S, v9.4S +trn2 v16.4S, v11.4S, v9.4S +mla v5.4S, v17.4S, v31.s[0] +ldr q17, [x17, #+1824] +ldr q22, [x17, #+1840] +sqrdmulh v7.4S, v8.4S, v22.4S +mul v8.4S, v8.4S,v17.4S +trn2 v11.2D, v19.2D, v10.2D +trn2 v9.2D, v14.2D, v16.2D +mla v18.4S, v29.4S, v31.s[0] +sub v29.4s, v13.4s, v5.4s +add v13.4s, v13.4s, v5.4s +sqrdmulh v5.4S, v6.4S, v22.4S +mul v6.4S, v6.4S,v17.4S +trn1 v0.2D, v19.2D, v10.2D +trn1 v30.2D, v14.2D, v16.2D +mla v8.4S, v7.4S, v31.s[0] +sub v7.4s, v20.4s, v18.4s +add v20.4s, v20.4s, v18.4s +ldr q18, [x17, #+1728] +ldr q16, [x17, #+1744] +sqrdmulh v14.4S, v20.4S, v16.4S +mul v20.4S, v20.4S,v18.4S +trn1 v16.4S, v2.4S, v12.4S +trn2 v18.4S, v2.4S, v12.4S +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v15.4s, v8.4s +add v15.4s, v15.4s, v8.4s +ldr q8, [x17, #+1760] +ldr 
q10, [x17, #+1776] +sqrdmulh v19.4S, v7.4S, v10.4S +mul v7.4S, v7.4S,v8.4S +trn1 v10.4S, v3.4S, v28.4S +trn2 v8.4S, v3.4S, v28.4S +mla v20.4S, v14.4S, v31.s[0] +sub v14.4s, v4.4s, v6.4s +add v4.4s, v4.4s, v6.4s +ldr q6, [x17, #+1856] +ldr q22, [x17, #+1872] +sqrdmulh v17.4S, v4.4S, v22.4S +mul v4.4S, v4.4S,v6.4S +trn2 v3.2D, v16.2D, v10.2D +trn2 v28.2D, v18.2D, v8.2D +mla v7.4S, v19.4S, v31.s[0] +sub v19.4s, v13.4s, v20.4s +add v13.4s, v13.4s, v20.4s +ldr q20, [x17, #+1888] +ldr q22, [x17, #+1904] +sqrdmulh v6.4S, v14.4S, v22.4S +mul v14.4S, v14.4S,v20.4S +trn1 v2.2D, v16.2D, v10.2D +trn1 v12.2D, v18.2D, v8.2D +mla v4.4S, v17.4S, v31.s[0] +sub v17.4s, v29.4s, v7.4s +add v29.4s, v29.4s, v7.4s +mla v14.4S, v6.4S, v31.s[0] +sub v6.4s, v15.4s, v4.4s +add v15.4s, v15.4s, v4.4s +sub v4.4s, v5.4s, v14.4s +add v5.4s, v5.4s, v14.4s +ldr q14, [x17, #+1952] +ldr q7, [x17, #+1968] +sqrdmulh v8.4S, v11.4S, v7.4S +mul v11.4S, v11.4S,v14.4S +str q13, [x0, #768] +sqrdmulh v13.4S, v9.4S, v7.4S +str q19, [x0, #784] +mul v9.4S, v9.4S,v14.4S +str q29, [x0, #800] +mla v11.4S, v8.4S, v31.s[0] +ldr q8, [x17, #+2080] +ldr q29, [x17, #+2096] +sqrdmulh v7.4S, v3.4S, v29.4S +str q15, [x0, #832] +mul v3.4S, v3.4S,v8.4S +str q17, [x0, #816] +mla v9.4S, v13.4S, v31.s[0] +str q6, [x0, #848] +sub v6.4s, v0.4s, v11.4s +add v0.4s, v0.4s, v11.4s +sqrdmulh v11.4S, v28.4S, v29.4S +mul v28.4S, v28.4S,v8.4S +str q5, [x0, #864] +mla v3.4S, v7.4S, v31.s[0] +sub v7.4s, v30.4s, v9.4s +add v30.4s, v30.4s, v9.4s +ldr q9, [x17, #+1984] +ldr q5, [x17, #+2000] +sqrdmulh v29.4S, v30.4S, v5.4S +mul v30.4S, v30.4S,v9.4S +str q4, [x0, #880] +mla v28.4S, v11.4S, v31.s[0] +sub v11.4s, v2.4s, v3.4s +add v2.4s, v2.4s, v3.4s +ldr q3, [x17, #+2016] +ldr q4, [x17, #+2032] +sqrdmulh v5.4S, v7.4S, v4.4S +mul v7.4S, v7.4S,v3.4S +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v12.4s, v28.4s +add v12.4s, v12.4s, v28.4s +ldr q28, [x17, #+2112] +ldr q4, [x17, #+2128] +sqrdmulh v3.4S, v12.4S, v4.4S +mul v12.4S, v12.4S,v28.4S +mla 
v7.4S, v5.4S, v31.s[0] +sub v5.4s, v0.4s, v30.4s +add v0.4s, v0.4s, v30.4s +ldr q30, [x17, #+2144] +ldr q4, [x17, #+2160] +sqrdmulh v28.4S, v29.4S, v4.4S +mul v29.4S, v29.4S,v30.4S +mla v12.4S, v3.4S, v31.s[0] +sub v3.4s, v6.4s, v7.4s +add v6.4s, v6.4s, v7.4s +mla v29.4S, v28.4S, v31.s[0] +sub v28.4s, v2.4s, v12.4s +add v2.4s, v2.4s, v12.4s +sub v12.4s, v11.4s, v29.4s +add v11.4s, v11.4s, v29.4s +str q0, [x0, #896] +str q5, [x0, #912] +str q6, [x0, #928] +str q3, [x0, #944] +str q2, [x0, #960] +str q28, [x0, #976] +str q11, [x0, #992] +str q12, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z4_4.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z4_4.s new file mode 100644 index 0000000..39f413d --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z4_4.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. 
+/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None 
+.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 
7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 
// Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 
+.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // 
Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, 
block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 
839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 +.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, 
block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 
104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 
// Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_3_z4_4 +.global _ntt_u32_full_neon_asm_var_4_4_3_z4_4 +ntt_u32_full_neon_asm_var_4_4_3_z4_4: +_ntt_u32_full_neon_asm_var_4_4_3_z4_4: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +sqrdmulh 
v2.4S, v22.4S, v29.s[0] +ldr q1, [x0, #544] +mul v22.4S, v22.4S,v30.s[0] +ldr q0, [x0, #608] +sqrdmulh v15.4S, v21.4S, v29.s[0] +ldr q14, [x0, #672] +mul v21.4S, v21.4S,v30.s[0] +ldr q13, [x0, #736] +mla v22.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q12, [x0, #32] +sub v11.4s, v18.4s, v22.4s +mla v21.4S, v15.4S, v31.s[0] +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q15, [x0, #96] +sub v10.4s, v17.4s, v21.4s +mla v20.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v29.s[0] +ldr q2, [x0, #160] +mul v1.4S, v1.4S,v30.s[0] +sub v9.4s, v16.4s, v20.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v29.s[0] +ldr q22, [x0, #224] +mul v0.4S, v0.4S,v30.s[0] +sub v8.4s, v3.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v12.4s, v1.4s +mla v0.4S, v20.4S, v31.s[0] +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +sub v20.4s, v15.4s, v0.4s +mla v14.4S, v19.4S, v31.s[0] +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v19.4s, v2.4s, v14.4s +mla v13.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v1.4s, v22.4s, v13.4s +mla v16.4S, v0.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v0.4s, v2.4s, v16.4s +mla v3.4S, v14.4S, v31.s[0] +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v22.4s, v3.4s +mla v18.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v13.4s, v12.4s, v18.4s +mla v17.4S, v16.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v16.4s, v15.4s, 
v17.4s +mla v9.4S, v3.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v3.4s, v19.4s, v9.4s +mla v8.4S, v18.4S, v31.s[0] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v8.4s +mla v11.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v17.4s, v21.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +sub v9.4s, v20.4s, v10.4s +mla v2.4S, v8.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v8.4s, v12.4s, v2.4s +mla v22.4S, v11.4S, v31.s[0] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v27.s[1] +mul v14.4S, v14.4S,v28.s[1] +sub v11.4s, v15.4s, v22.4s +mla v0.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v28.s[2] +sub v10.4s, v13.4s, v0.4s +mla v14.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v2.4s, v16.4s, v14.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +sub v22.4s, v21.4s, v19.4s +mla v1.4S, v0.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v0.4s, v20.4s, v1.4s +mla v3.4S, v14.4S, v31.s[0] +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +sub v14.4s, v17.4s, v3.4s +mla v18.4S, v19.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v25.s[1] +mul v11.4S, v11.4S,v26.s[1] +sub v19.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v1.4s, v12.4s, v15.4s +mla v11.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v25.s[3] +mul 
v2.4S, v2.4S,v26.s[3] +sub v3.4s, v8.4s, v11.4s +mla v16.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v11.4s +str q12, [x0, #32] +sqrdmulh v12.4S, v20.4S, v23.s[0] +str q1, [x0, #96] +mul v20.4S, v20.4S,v24.s[0] +ldr q1, [x0, #816] +sub v11.4s, v13.4s, v16.4s +ldr q18, [x0, #880] +mla v2.4S, v15.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +str q8, [x0, #160] +sqrdmulh v8.4S, v0.4S, v23.s[1] +str q3, [x0, #224] +mul v0.4S, v0.4S,v24.s[1] +ldr q3, [x0, #944] +sub v16.4s, v10.4s, v2.4s +ldr q15, [x0, #1008] +mla v20.4S, v12.4S, v31.s[0] +add v10.4s, v10.4s, v2.4s +str q13, [x0, #288] +sqrdmulh v13.4S, v9.4S, v23.s[2] +str q11, [x0, #352] +mul v9.4S, v9.4S,v24.s[2] +ldr q11, [x0, #304] +sub v2.4s, v21.4s, v20.4s +ldr q12, [x0, #368] +mla v0.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v20.4s +str q10, [x0, #416] +sqrdmulh v10.4S, v19.4S, v23.s[3] +str q16, [x0, #480] +mul v19.4S, v19.4S,v24.s[3] +ldr q16, [x0, #432] +sub v20.4s, v22.4s, v0.4s +ldr q8, [x0, #496] +mla v9.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +str q21, [x0, #544] +sqrdmulh v21.4S, v1.4S, v29.s[0] +str q2, [x0, #608] +ldr q2, [x0, #560] +mul v1.4S, v1.4S,v30.s[0] +ldr q0, [x0, #624] +sub v13.4s, v17.4s, v9.4s +mla v19.4S, v10.4S, v31.s[0] +add v17.4s, v17.4s, v9.4s +str q22, [x0, #672] +sqrdmulh v22.4S, v18.4S, v29.s[0] +str q20, [x0, #736] +ldr q20, [x0, #688] +mul v18.4S, v18.4S,v30.s[0] +ldr q9, [x0, #752] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +str q17, [x0, #800] +sqrdmulh v17.4S, v3.4S, v29.s[0] +str q13, [x0, #864] +mul v3.4S, v3.4S,v30.s[0] +ldr q13, [x0, #48] +sub v19.4s, v11.4s, v1.4s +mla v18.4S, v22.4S, v31.s[0] +add v11.4s, v11.4s, v1.4s +str q14, [x0, #928] +sqrdmulh v14.4S, v15.4S, v29.s[0] +str q10, [x0, #992] +mul v15.4S, v15.4S,v30.s[0] +ldr q10, [x0, #112] +sub v1.4s, v12.4s, v18.4s +mla v3.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v29.s[0] +ldr q17, [x0, #176] +mul v2.4S, v2.4S,v30.s[0] +sub v22.4s, v16.4s, 
v3.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v29.s[0] +ldr q14, [x0, #240] +mul v0.4S, v0.4S,v30.s[0] +sub v21.4s, v8.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v18.4s, v13.4s, v2.4s +mla v0.4S, v3.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +sub v3.4s, v10.4s, v0.4s +mla v20.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v15.4s, v17.4s, v20.4s +mla v9.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v2.4s, v14.4s, v9.4s +mla v16.4S, v0.4S, v31.s[0] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v29.s[1] +mul v11.4S, v11.4S,v30.s[1] +sub v0.4s, v17.4s, v16.4s +mla v8.4S, v20.4S, v31.s[0] +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v29.s[1] +mul v12.4S, v12.4S,v30.s[1] +sub v20.4s, v14.4s, v8.4s +mla v11.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v9.4s, v13.4s, v11.4s +mla v12.4S, v16.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v16.4s, v10.4s, v12.4s +mla v22.4S, v8.4S, v31.s[0] +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v8.4s, v15.4s, v22.4s +mla v21.4S, v11.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +sub v11.4s, v2.4s, v21.4s +mla v19.4S, v12.4S, v31.s[0] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +sub v12.4s, v18.4s, v19.4s +mla v1.4S, v22.4S, v31.s[0] +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v22.4s, v3.4s, v1.4s +mla v17.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v1.4s +sqrdmulh v1.4S, 
v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v21.4s, v13.4s, v17.4s +mla v14.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +sub v19.4s, v10.4s, v14.4s +mla v0.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v27.s[2] +mul v15.4S, v15.4S,v28.s[2] +sub v1.4s, v9.4s, v0.4s +mla v20.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v17.4s, v16.4s, v20.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v14.4s, v18.4s, v15.4s +mla v2.4S, v0.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[3] +mul v11.4S, v11.4S,v28.s[3] +sub v0.4s, v3.4s, v2.4s +mla v8.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +sub v20.4s, v12.4s, v8.4s +mla v11.4S, v15.4S, v31.s[0] +add v12.4s, v12.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v15.4s, v22.4s, v11.4s +mla v10.4S, v2.4S, v31.s[0] +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v2.4s, v13.4s, v10.4s +mla v19.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v25.s[3] +mul v17.4S, v17.4S,v26.s[3] +sub v8.4s, v21.4s, v19.4s +mla v16.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +str q13, [x0, #48] +sqrdmulh v13.4S, v3.4S, v23.s[0] +str q2, [x0, #112] +mul v3.4S, v3.4S,v24.s[0] +ldr q2, [x0, #768] +sub v19.4s, v9.4s, v16.4s +ldr q11, [x0, #832] +mla v17.4S, v10.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +str q21, [x0, #176] +sqrdmulh v21.4S, v0.4S, v23.s[1] +str q8, [x0, #240] +mul v0.4S, v0.4S,v24.s[1] +ldr q8, [x0, #896] +sub v16.4s, v1.4s, v17.4s +ldr q10, [x0, #960] +mla v3.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +str q9, [x0, #304] +sqrdmulh v9.4S, v22.4S, v23.s[2] +str q19, [x0, #368] +mul v22.4S, 
v22.4S,v24.s[2] +ldr q19, [x0, #256] +sub v17.4s, v18.4s, v3.4s +ldr q13, [x0, #320] +mla v0.4S, v21.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +str q1, [x0, #432] +sqrdmulh v1.4S, v15.4S, v23.s[3] +str q16, [x0, #496] +mul v15.4S, v15.4S,v24.s[3] +ldr q16, [x0, #384] +sub v3.4s, v14.4s, v0.4s +ldr q21, [x0, #448] +mla v22.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +str q18, [x0, #560] +sqrdmulh v18.4S, v2.4S, v29.s[0] +str q17, [x0, #624] +ldr q17, [x0, #512] +mul v2.4S, v2.4S,v30.s[0] +ldr q0, [x0, #576] +sub v9.4s, v12.4s, v22.4s +mla v15.4S, v1.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s +str q14, [x0, #688] +sqrdmulh v14.4S, v11.4S, v29.s[0] +str q3, [x0, #752] +ldr q3, [x0, #640] +mul v11.4S, v11.4S,v30.s[0] +ldr q22, [x0, #704] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +str q12, [x0, #816] +sqrdmulh v12.4S, v8.4S, v29.s[0] +str q9, [x0, #880] +mul v8.4S, v8.4S,v30.s[0] +ldr q9, [x0, #0] +sub v15.4s, v19.4s, v2.4s +mla v11.4S, v14.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +str q20, [x0, #944] +sqrdmulh v20.4S, v10.4S, v29.s[0] +str q1, [x0, #1008] +mul v10.4S, v10.4S,v30.s[0] +ldr q1, [x0, #64] +sub v2.4s, v13.4s, v11.4s +mla v8.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v29.s[0] +ldr q12, [x0, #128] +mul v17.4S, v17.4S,v30.s[0] +sub v14.4s, v16.4s, v8.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v29.s[0] +ldr q20, [x0, #192] +mul v0.4S, v0.4S,v30.s[0] +sub v18.4s, v21.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +sub v11.4s, v9.4s, v17.4s +mla v0.4S, v8.4S, v31.s[0] +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v8.4s, v1.4s, v0.4s +mla v3.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v10.4s, v12.4s, v3.4s +mla v22.4S, v17.4S, v31.s[0] +add v12.4s, 
v12.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v17.4s, v20.4s, v22.4s +mla v16.4S, v0.4S, v31.s[0] +add v20.4s, v20.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +sub v0.4s, v12.4s, v16.4s +mla v21.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v3.4s, v20.4s, v21.4s +mla v19.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v22.4s, v9.4s, v19.4s +mla v13.4S, v16.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v16.4s, v1.4s, v13.4s +mla v14.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v21.4s, v10.4s, v14.4s +mla v18.4S, v19.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v19.4s, v17.4s, v18.4s +mla v15.4S, v13.4S, v31.s[0] +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v13.4s, v11.4s, v15.4s +mla v2.4S, v14.4S, v31.s[0] +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v27.s[0] +mul v20.4S, v20.4S,v28.s[0] +sub v14.4s, v8.4s, v2.4s +mla v12.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v18.4s, v9.4s, v12.4s +mla v20.4S, v15.4S, v31.s[0] +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +sub v15.4s, v1.4s, v20.4s +mla v0.4S, v2.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +sub v2.4s, v22.4s, v0.4s +mla v3.4S, v12.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v12.4s, v16.4s, v3.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, 
v11.4s, v10.4s +mla v17.4S, v0.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v27.s[3] +mul v19.4S, v19.4S,v28.s[3] +sub v0.4s, v8.4s, v17.4s +mla v21.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v17.4s +sqrdmulh v17.4S, v1.4S, v25.s[0] +mul v1.4S, v1.4S,v26.s[0] +sub v3.4s, v13.4s, v21.4s +mla v19.4S, v10.4S, v31.s[0] +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v25.s[1] +mul v15.4S, v15.4S,v26.s[1] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v17.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v17.4s, v9.4s, v1.4s +mla v15.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +sub v21.4s, v18.4s, v15.4s +mla v16.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +str q9, [x0, #0] +sqrdmulh v9.4S, v8.4S, v23.s[0] +str q17, [x0, #64] +mul v8.4S, v8.4S,v24.s[0] +ldr q17, [x0, #784] +sub v15.4s, v22.4s, v16.4s +ldr q19, [x0, #848] +mla v12.4S, v1.4S, v31.s[0] +add v22.4s, v22.4s, v16.4s +str q18, [x0, #128] +sqrdmulh v18.4S, v0.4S, v23.s[1] +str q21, [x0, #192] +mul v0.4S, v0.4S,v24.s[1] +ldr q21, [x0, #912] +sub v16.4s, v2.4s, v12.4s +ldr q1, [x0, #976] +mla v8.4S, v9.4S, v31.s[0] +add v2.4s, v2.4s, v12.4s +str q22, [x0, #256] +sqrdmulh v22.4S, v14.4S, v23.s[2] +str q15, [x0, #320] +mul v14.4S, v14.4S,v24.s[2] +ldr q15, [x0, #272] +sub v12.4s, v11.4s, v8.4s +ldr q9, [x0, #336] +mla v0.4S, v18.4S, v31.s[0] +add v11.4s, v11.4s, v8.4s +str q2, [x0, #384] +sqrdmulh v2.4S, v10.4S, v23.s[3] +str q16, [x0, #448] +mul v10.4S, v10.4S,v24.s[3] +ldr q16, [x0, #400] +sub v8.4s, v20.4s, v0.4s +ldr q18, [x0, #464] +mla v14.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v0.4s +str q11, [x0, #512] +sqrdmulh v11.4S, v17.4S, v29.s[0] +str q12, [x0, #576] +ldr q12, [x0, #528] +mul v17.4S, v17.4S,v30.s[0] +ldr q0, [x0, #592] +sub v22.4s, v13.4s, v14.4s +mla v10.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v14.4s +str q20, [x0, #640] +sqrdmulh v20.4S, v19.4S, 
v29.s[0] +str q8, [x0, #704] +ldr q8, [x0, #656] +mul v19.4S, v19.4S,v30.s[0] +ldr q14, [x0, #720] +sub v2.4s, v3.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v3.4s, v3.4s, v10.4s +str q13, [x0, #768] +sqrdmulh v13.4S, v21.4S, v29.s[0] +str q22, [x0, #832] +mul v21.4S, v21.4S,v30.s[0] +ldr q22, [x0, #16] +sub v10.4s, v15.4s, v17.4s +mla v19.4S, v20.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +str q3, [x0, #896] +sqrdmulh v3.4S, v1.4S, v29.s[0] +str q2, [x0, #960] +mul v1.4S, v1.4S,v30.s[0] +ldr q2, [x0, #80] +sub v17.4s, v9.4s, v19.4s +mla v21.4S, v13.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v12.4S, v29.s[0] +ldr q13, [x0, #144] +mul v12.4S, v12.4S,v30.s[0] +sub v20.4s, v16.4s, v21.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v29.s[0] +ldr q3, [x0, #208] +mul v0.4S, v0.4S,v30.s[0] +sub v11.4s, v18.4s, v1.4s +mla v12.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v19.4s, v22.4s, v12.4s +mla v0.4S, v21.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v2.4s, v0.4s +mla v8.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v1.4s, v13.4s, v8.4s +mla v14.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v12.4s, v3.4s, v14.4s +mla v16.4S, v0.4S, v31.s[0] +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +sub v0.4s, v13.4s, v16.4s +mla v18.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v9.4S, v29.s[1] +mul v9.4S, v9.4S,v30.s[1] +sub v8.4s, v3.4s, v18.4s +mla v15.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v14.4s, v22.4s, v15.4s +mla v9.4S, v16.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v29.s[2] +mul v11.4S, 
v11.4S,v30.s[2] +sub v16.4s, v2.4s, v9.4s +mla v20.4S, v18.4S, v31.s[0] +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v20.4s +mla v11.4S, v15.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v15.4s, v12.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +sub v9.4s, v19.4s, v10.4s +mla v17.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v27.s[0] +mul v3.4S, v3.4S,v28.s[0] +sub v20.4s, v21.4s, v17.4s +mla v13.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v11.4s, v22.4s, v13.4s +mla v3.4S, v10.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v10.4s, v2.4s, v3.4s +mla v0.4S, v17.4S, v31.s[0] +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v17.4s, v14.4s, v0.4s +mla v8.4S, v13.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +sub v13.4s, v16.4s, v8.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v3.4s, v19.4s, v1.4s +mla v12.4S, v0.4S, v31.s[0] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v0.4s, v21.4s, v12.4s +mla v18.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v25.s[1] +mul v10.4S, v10.4S,v26.s[1] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v12.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v12.4s, v22.4s, v2.4s +mla v10.4S, v18.4S, v31.s[0] +add v22.4s, v22.4s, v2.4s 
+sqrdmulh v2.4S, v13.4S, v25.s[3] +mul v13.4S, v13.4S,v26.s[3] +sub v18.4s, v11.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +str q22, [x0, #16] +sqrdmulh v22.4S, v21.4S, v23.s[0] +str q12, [x0, #80] +mul v21.4S, v21.4S,v24.s[0] +sub v12.4s, v14.4s, v16.4s +mla v13.4S, v2.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +str q11, [x0, #144] +sqrdmulh v11.4S, v0.4S, v23.s[1] +str q18, [x0, #208] +mul v0.4S, v0.4S,v24.s[1] +sub v18.4s, v17.4s, v13.4s +mla v21.4S, v22.4S, v31.s[0] +add v17.4s, v17.4s, v13.4s +str q14, [x0, #272] +sqrdmulh v14.4S, v20.4S, v23.s[2] +str q12, [x0, #336] +mul v20.4S, v20.4S,v24.s[2] +sub v12.4s, v19.4s, v21.4s +mla v0.4S, v11.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +str q17, [x0, #400] +sqrdmulh v17.4S, v1.4S, v23.s[3] +str q18, [x0, #464] +mul v1.4S, v1.4S,v24.s[3] +sub v18.4s, v3.4s, v0.4s +mla v20.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v0.4s +str q19, [x0, #528] +str q12, [x0, #592] +sub v12.4s, v9.4s, v20.4s +mla v1.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v20.4s +str q3, [x0, #656] +str q18, [x0, #720] +sub v18.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +str q9, [x0, #784] +str q12, [x0, #848] +str q8, [x0, #912] +str q18, [x0, #976] +ldr q4, [x0, #32] +ldr q5, [x0, #48] +ldr q6, [x17, #+128] +ldr q7, [x17, #+144] +ldr q15, [x17, #+256] +ldr q10, [x0, #96] +ldr q2, [x17, #+272] +ldr q16, [x0, #112] +sqrdmulh v22.4S, v4.4S, v7.s[0] +mul v4.4S, v4.4S,v6.s[0] +sqrdmulh v13.4S, v5.4S, v7.s[0] +mul v5.4S, v5.4S,v6.s[0] +mla v4.4S, v22.4S, v31.s[0] +ldr q22, [x0, #0] +sqrdmulh v11.4S, v10.4S, v2.s[0] +ldr q21, [x0, #16] +mul v10.4S, v10.4S,v15.s[0] +mla v5.4S, v13.4S, v31.s[0] +sub v13.4s, v22.4s, v4.4s +add v22.4s, v22.4s, v4.4s +sqrdmulh v4.4S, v16.4S, v2.s[0] +ldr q14, [x0, #160] +mul v16.4S, v16.4S,v15.s[0] +ldr q0, [x0, #176] +mla v10.4S, v11.4S, v31.s[0] +ldr q11, [x0, #64] +sub v19.4s, v21.4s, v5.4s +add v21.4s, v21.4s, v5.4s +sqrdmulh v5.4S, v21.4S, v7.s[1] +ldr q17, [x0, #128] +mul v21.4S, v21.4S,v6.s[1] +ldr q20, 
[x0, #144] +mla v16.4S, v4.4S, v31.s[0] +ldr q4, [x0, #80] +sub v3.4s, v11.4s, v10.4s +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v7.s[2] +ldr q1, [x17, #+384] +mul v19.4S, v19.4S,v6.s[2] +ldr q9, [x17, #+400] +mla v21.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v16.4s +add v4.4s, v4.4s, v16.4s +sqrdmulh v16.4S, v4.4S, v2.s[1] +ldr q12, [x0, #224] +mul v4.4S, v4.4S,v15.s[1] +ldr q8, [x0, #240] +mla v19.4S, v10.4S, v31.s[0] +sub v10.4s, v22.4s, v21.4s +add v22.4s, v22.4s, v21.4s +sqrdmulh v7.4S, v5.4S, v2.s[2] +ldr q6, [x0, #192] +mul v5.4S, v5.4S,v15.s[2] +ldr q21, [x0, #208] +mla v4.4S, v16.4S, v31.s[0] +sub v16.4s, v13.4s, v19.4s +add v13.4s, v13.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v9.s[0] +ldr q18, [x17, #+512] +mul v14.4S, v14.4S,v1.s[0] +ldr q30, [x17, #+528] +trn1 v29.4S, v22.4S, v10.4S +trn2 v28.4S, v22.4S, v10.4S +mla v5.4S, v7.4S, v31.s[0] +sub v7.4s, v11.4s, v4.4s +add v11.4s, v11.4s, v4.4s +sqrdmulh v2.4S, v0.4S, v9.s[0] +mul v0.4S, v0.4S,v1.s[0] +trn1 v15.4S, v13.4S, v16.4S +trn2 v4.4S, v13.4S, v16.4S +mla v14.4S, v19.4S, v31.s[0] +sub v19.4s, v3.4s, v5.4s +add v3.4s, v3.4s, v5.4s +sqrdmulh v5.4S, v12.4S, v30.s[0] +mul v12.4S, v12.4S,v18.s[0] +trn2 v13.2D, v29.2D, v15.2D +trn2 v16.2D, v28.2D, v4.2D +mla v0.4S, v2.4S, v31.s[0] +sub v2.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +sqrdmulh v14.4S, v8.4S, v30.s[0] +mul v8.4S, v8.4S,v18.s[0] +trn1 v22.2D, v29.2D, v15.2D +trn1 v10.2D, v28.2D, v4.2D +mla v12.4S, v5.4S, v31.s[0] +sub v5.4s, v20.4s, v0.4s +add v20.4s, v20.4s, v0.4s +sqrdmulh v0.4S, v20.4S, v9.s[1] +mul v20.4S, v20.4S,v1.s[1] +trn1 v4.4S, v11.4S, v7.4S +trn2 v28.4S, v11.4S, v7.4S +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v6.4s, v12.4s +add v6.4s, v6.4s, v12.4s +sqrdmulh v12.4S, v5.4S, v9.s[2] +mul v5.4S, v5.4S,v1.s[2] +trn1 v15.4S, v3.4S, v19.4S +trn2 v29.4S, v3.4S, v19.4S +ldr q27, [x17, #+160] +ldr q26, [x17, #+176] +mla v20.4S, v0.4S, v31.s[0] +sub v0.4s, v21.4s, v8.4s +add v21.4s, v21.4s, v8.4s +sqrdmulh v8.4S, v21.4S, v30.s[1] +mul 
v21.4S, v21.4S,v18.s[1] +trn2 v3.2D, v4.2D, v15.2D +trn2 v19.2D, v28.2D, v29.2D +mla v5.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v20.4s +add v17.4s, v17.4s, v20.4s +sqrdmulh v9.4S, v0.4S, v30.s[2] +mul v0.4S, v0.4S,v18.s[2] +trn1 v11.2D, v4.2D, v15.2D +trn1 v7.2D, v28.2D, v29.2D +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v2.4s, v5.4s +add v2.4s, v2.4s, v5.4s +sqrdmulh v5.4S, v13.4S, v26.4S +mul v13.4S, v13.4S,v27.4S +mla v0.4S, v9.4S, v31.s[0] +sub v9.4s, v6.4s, v21.4s +add v6.4s, v6.4s, v21.4s +ldr q30, [x17, #+288] +ldr q18, [x17, #+304] +sqrdmulh v21.4S, v16.4S, v26.4S +mul v16.4S, v16.4S,v27.4S +trn1 v29.4S, v17.4S, v12.4S +trn2 v28.4S, v17.4S, v12.4S +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v14.4s, v0.4s +add v14.4s, v14.4s, v0.4s +ldr q26, [x17, #+192] +ldr q27, [x17, #+208] +trn1 v0.4S, v2.4S, v8.4S +trn2 v15.4S, v2.4S, v8.4S +sqrdmulh v4.4S, v3.4S, v18.4S +mul v3.4S, v3.4S,v30.4S +trn2 v2.2D, v29.2D, v0.2D +trn2 v8.2D, v28.2D, v15.2D +ldr q1, [x17, #+224] +ldr q20, [x17, #+240] +mla v16.4S, v21.4S, v31.s[0] +sub v21.4s, v22.4s, v13.4s +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v19.4S, v18.4S +mul v19.4S, v19.4S,v30.4S +trn1 v17.2D, v29.2D, v0.2D +trn1 v12.2D, v28.2D, v15.2D +ldr q15, [x17, #+320] +ldr q28, [x17, #+336] +mla v3.4S, v4.4S, v31.s[0] +sub v4.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sqrdmulh v16.4S, v10.4S, v27.4S +mul v10.4S, v10.4S,v26.4S +trn1 v27.4S, v6.4S, v9.4S +trn2 v26.4S, v6.4S, v9.4S +ldr q0, [x17, #+352] +ldr q29, [x17, #+368] +mla v19.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v3.4s +add v11.4s, v11.4s, v3.4s +sqrdmulh v3.4S, v4.4S, v20.4S +mul v4.4S, v4.4S,v1.4S +trn1 v20.4S, v14.4S, v5.4S +trn2 v1.4S, v14.4S, v5.4S +mla v10.4S, v16.4S, v31.s[0] +sub v16.4s, v7.4s, v19.4s +add v7.4s, v7.4s, v19.4s +sqrdmulh v19.4S, v7.4S, v28.4S +mul v7.4S, v7.4S,v15.4S +ldr q18, [x17, #+416] +ldr q30, [x17, #+432] +trn2 v14.2D, v27.2D, v20.2D +trn2 v5.2D, v26.2D, v1.2D +mla v4.4S, v3.4S, v31.s[0] +sub v3.4s, v22.4s, v10.4s +add v22.4s, 
v22.4s, v10.4s +sqrdmulh v10.4S, v16.4S, v29.4S +mul v16.4S, v16.4S,v0.4S +trn1 v6.2D, v27.2D, v20.2D +trn1 v9.2D, v26.2D, v1.2D +mla v7.4S, v19.4S, v31.s[0] +sub v19.4s, v21.4s, v4.4s +add v21.4s, v21.4s, v4.4s +sqrdmulh v4.4S, v2.4S, v30.4S +ldr q1, [x17, #+544] +ldr q26, [x17, #+560] +mul v2.4S, v2.4S,v18.4S +str q22, [x0, #0] +str q3, [x0, #16] +mla v16.4S, v10.4S, v31.s[0] +sub v10.4s, v11.4s, v7.4s +add v11.4s, v11.4s, v7.4s +sqrdmulh v7.4S, v8.4S, v30.4S +mul v8.4S, v8.4S,v18.4S +str q21, [x0, #32] +mla v2.4S, v4.4S, v31.s[0] +sub v4.4s, v13.4s, v16.4s +add v13.4s, v13.4s, v16.4s +ldr q16, [x17, #+448] +ldr q21, [x17, #+464] +sqrdmulh v30.4S, v14.4S, v26.4S +str q11, [x0, #64] +mul v14.4S, v14.4S,v1.4S +str q19, [x0, #48] +mla v8.4S, v7.4S, v31.s[0] +str q10, [x0, #80] +sub v10.4s, v17.4s, v2.4s +add v17.4s, v17.4s, v2.4s +ldr q2, [x17, #+480] +ldr q7, [x17, #+496] +sqrdmulh v19.4S, v5.4S, v26.4S +mul v5.4S, v5.4S,v1.4S +str q13, [x0, #96] +mla v14.4S, v30.4S, v31.s[0] +sub v30.4s, v12.4s, v8.4s +add v12.4s, v12.4s, v8.4s +ldr q8, [x17, #+576] +ldr q13, [x17, #+592] +sqrdmulh v26.4S, v12.4S, v21.4S +mul v12.4S, v12.4S,v16.4S +ldr q21, [x17, #+608] +ldr q16, [x17, #+624] +str q4, [x0, #112] +mla v5.4S, v19.4S, v31.s[0] +sub v19.4s, v6.4s, v14.4s +add v6.4s, v6.4s, v14.4s +sqrdmulh v14.4S, v30.4S, v7.4S +mul v30.4S, v30.4S,v2.4S +mla v12.4S, v26.4S, v31.s[0] +sub v26.4s, v9.4s, v5.4s +add v9.4s, v9.4s, v5.4s +sqrdmulh v5.4S, v9.4S, v13.4S +mul v9.4S, v9.4S,v8.4S +ldr q13, [x0, #288] +mla v30.4S, v14.4S, v31.s[0] +ldr q14, [x0, #304] +sub v8.4s, v17.4s, v12.4s +add v17.4s, v17.4s, v12.4s +sqrdmulh v12.4S, v26.4S, v16.4S +ldr q7, [x17, #+640] +mul v26.4S, v26.4S,v21.4S +ldr q16, [x17, #+656] +mla v9.4S, v5.4S, v31.s[0] +ldr q5, [x17, #+768] +sub v21.4s, v10.4s, v30.4s +ldr q2, [x0, #352] +add v10.4s, v10.4s, v30.4s +ldr q30, [x17, #+784] +ldr q4, [x0, #368] +sqrdmulh v1.4S, v13.4S, v16.s[0] +mul v13.4S, v13.4S,v7.s[0] +mla v26.4S, v12.4S, v31.s[0] +sub v12.4s, 
v6.4s, v9.4s +add v6.4s, v6.4s, v9.4s +sqrdmulh v9.4S, v14.4S, v16.s[0] +mul v14.4S, v14.4S,v7.s[0] +str q17, [x0, #128] +str q8, [x0, #144] +str q10, [x0, #160] +str q21, [x0, #176] +mla v13.4S, v1.4S, v31.s[0] +sub v1.4s, v19.4s, v26.4s +add v19.4s, v19.4s, v26.4s +ldr q26, [x0, #256] +sqrdmulh v21.4S, v2.4S, v30.s[0] +ldr q10, [x0, #272] +mul v2.4S, v2.4S,v5.s[0] +str q6, [x0, #192] +str q12, [x0, #208] +str q19, [x0, #224] +str q1, [x0, #240] +mla v14.4S, v9.4S, v31.s[0] +sub v9.4s, v26.4s, v13.4s +add v26.4s, v26.4s, v13.4s +sqrdmulh v13.4S, v4.4S, v30.s[0] +ldr q1, [x0, #416] +mul v4.4S, v4.4S,v5.s[0] +ldr q19, [x0, #432] +mla v2.4S, v21.4S, v31.s[0] +ldr q21, [x0, #320] +sub v12.4s, v10.4s, v14.4s +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v10.4S, v16.s[1] +ldr q6, [x0, #384] +mul v10.4S, v10.4S,v7.s[1] +ldr q8, [x0, #400] +mla v4.4S, v13.4S, v31.s[0] +ldr q13, [x0, #336] +sub v17.4s, v21.4s, v2.4s +add v21.4s, v21.4s, v2.4s +sqrdmulh v2.4S, v12.4S, v16.s[2] +ldr q11, [x17, #+896] +mul v12.4S, v12.4S,v7.s[2] +ldr q18, [x17, #+912] +mla v10.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v4.4s +add v13.4s, v13.4s, v4.4s +sqrdmulh v4.4S, v13.4S, v30.s[1] +ldr q3, [x0, #480] +mul v13.4S, v13.4S,v5.s[1] +ldr q22, [x0, #496] +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v26.4s, v10.4s +add v26.4s, v26.4s, v10.4s +sqrdmulh v16.4S, v14.4S, v30.s[2] +ldr q7, [x0, #448] +mul v14.4S, v14.4S,v5.s[2] +ldr q10, [x0, #464] +mla v13.4S, v4.4S, v31.s[0] +sub v4.4s, v9.4s, v12.4s +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v1.4S, v18.s[0] +ldr q20, [x17, #+1024] +mul v1.4S, v1.4S,v11.s[0] +ldr q27, [x17, #+1040] +trn1 v29.4S, v26.4S, v2.4S +trn2 v0.4S, v26.4S, v2.4S +mla v14.4S, v16.4S, v31.s[0] +sub v16.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v30.4S, v19.4S, v18.s[0] +mul v19.4S, v19.4S,v11.s[0] +trn1 v5.4S, v9.4S, v4.4S +trn2 v13.4S, v9.4S, v4.4S +mla v1.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[0] 
+mul v3.4S, v3.4S,v20.s[0] +trn2 v9.2D, v29.2D, v5.2D +trn2 v4.2D, v0.2D, v13.2D +mla v19.4S, v30.4S, v31.s[0] +sub v30.4s, v6.4s, v1.4s +add v6.4s, v6.4s, v1.4s +sqrdmulh v1.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v20.s[0] +trn1 v26.2D, v29.2D, v5.2D +trn1 v2.2D, v0.2D, v13.2D +mla v3.4S, v14.4S, v31.s[0] +sub v14.4s, v8.4s, v19.4s +add v8.4s, v8.4s, v19.4s +sqrdmulh v19.4S, v8.4S, v18.s[1] +mul v8.4S, v8.4S,v11.s[1] +trn1 v13.4S, v21.4S, v16.4S +trn2 v0.4S, v21.4S, v16.4S +mla v22.4S, v1.4S, v31.s[0] +sub v1.4s, v7.4s, v3.4s +add v7.4s, v7.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v18.s[2] +mul v14.4S, v14.4S,v11.s[2] +trn1 v5.4S, v17.4S, v12.4S +trn2 v29.4S, v17.4S, v12.4S +ldr q28, [x17, #+672] +ldr q15, [x17, #+688] +mla v8.4S, v19.4S, v31.s[0] +sub v19.4s, v10.4s, v22.4s +add v10.4s, v10.4s, v22.4s +sqrdmulh v22.4S, v10.4S, v27.s[1] +mul v10.4S, v10.4S,v20.s[1] +trn2 v17.2D, v13.2D, v5.2D +trn2 v12.2D, v0.2D, v29.2D +mla v14.4S, v3.4S, v31.s[0] +sub v3.4s, v6.4s, v8.4s +add v6.4s, v6.4s, v8.4s +sqrdmulh v18.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v20.s[2] +trn1 v21.2D, v13.2D, v5.2D +trn1 v16.2D, v0.2D, v29.2D +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v30.4s, v14.4s +add v30.4s, v30.4s, v14.4s +sqrdmulh v14.4S, v9.4S, v15.4S +mul v9.4S, v9.4S,v28.4S +mla v19.4S, v18.4S, v31.s[0] +sub v18.4s, v7.4s, v10.4s +add v7.4s, v7.4s, v10.4s +ldr q27, [x17, #+800] +ldr q20, [x17, #+816] +sqrdmulh v10.4S, v4.4S, v15.4S +mul v4.4S, v4.4S,v28.4S +trn1 v29.4S, v6.4S, v3.4S +trn2 v0.4S, v6.4S, v3.4S +mla v9.4S, v14.4S, v31.s[0] +sub v14.4s, v1.4s, v19.4s +add v1.4s, v1.4s, v19.4s +ldr q15, [x17, #+704] +ldr q28, [x17, #+720] +trn1 v19.4S, v30.4S, v22.4S +trn2 v5.4S, v30.4S, v22.4S +sqrdmulh v13.4S, v17.4S, v20.4S +mul v17.4S, v17.4S,v27.4S +trn2 v30.2D, v29.2D, v19.2D +trn2 v22.2D, v0.2D, v5.2D +ldr q11, [x17, #+736] +ldr q8, [x17, #+752] +mla v4.4S, v10.4S, v31.s[0] +sub v10.4s, v26.4s, v9.4s +add v26.4s, v26.4s, v9.4s +sqrdmulh v9.4S, v12.4S, v20.4S +mul v12.4S, v12.4S,v27.4S 
+trn1 v6.2D, v29.2D, v19.2D +trn1 v3.2D, v0.2D, v5.2D +ldr q5, [x17, #+832] +ldr q0, [x17, #+848] +mla v17.4S, v13.4S, v31.s[0] +sub v13.4s, v2.4s, v4.4s +add v2.4s, v2.4s, v4.4s +sqrdmulh v4.4S, v2.4S, v28.4S +mul v2.4S, v2.4S,v15.4S +trn1 v28.4S, v7.4S, v18.4S +trn2 v15.4S, v7.4S, v18.4S +ldr q19, [x17, #+864] +ldr q29, [x17, #+880] +mla v12.4S, v9.4S, v31.s[0] +sub v9.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v13.4S, v8.4S +mul v13.4S, v13.4S,v11.4S +trn1 v8.4S, v1.4S, v14.4S +trn2 v11.4S, v1.4S, v14.4S +mla v2.4S, v4.4S, v31.s[0] +sub v4.4s, v16.4s, v12.4s +add v16.4s, v16.4s, v12.4s +sqrdmulh v12.4S, v16.4S, v0.4S +mul v16.4S, v16.4S,v5.4S +ldr q20, [x17, #+928] +ldr q27, [x17, #+944] +trn2 v1.2D, v28.2D, v8.2D +trn2 v14.2D, v15.2D, v11.2D +mla v13.4S, v17.4S, v31.s[0] +sub v17.4s, v26.4s, v2.4s +add v26.4s, v26.4s, v2.4s +sqrdmulh v2.4S, v4.4S, v29.4S +mul v4.4S, v4.4S,v19.4S +trn1 v7.2D, v28.2D, v8.2D +trn1 v18.2D, v15.2D, v11.2D +mla v16.4S, v12.4S, v31.s[0] +sub v12.4s, v10.4s, v13.4s +add v10.4s, v10.4s, v13.4s +sqrdmulh v13.4S, v30.4S, v27.4S +ldr q11, [x17, #+1056] +ldr q15, [x17, #+1072] +mul v30.4S, v30.4S,v20.4S +str q26, [x0, #256] +str q17, [x0, #272] +mla v4.4S, v2.4S, v31.s[0] +sub v2.4s, v21.4s, v16.4s +add v21.4s, v21.4s, v16.4s +sqrdmulh v16.4S, v22.4S, v27.4S +mul v22.4S, v22.4S,v20.4S +str q10, [x0, #288] +mla v30.4S, v13.4S, v31.s[0] +sub v13.4s, v9.4s, v4.4s +add v9.4s, v9.4s, v4.4s +ldr q4, [x17, #+960] +ldr q10, [x17, #+976] +sqrdmulh v27.4S, v1.4S, v15.4S +str q21, [x0, #320] +mul v1.4S, v1.4S,v11.4S +str q12, [x0, #304] +mla v22.4S, v16.4S, v31.s[0] +str q2, [x0, #336] +sub v2.4s, v6.4s, v30.4s +add v6.4s, v6.4s, v30.4s +ldr q30, [x17, #+992] +ldr q16, [x17, #+1008] +sqrdmulh v12.4S, v14.4S, v15.4S +mul v14.4S, v14.4S,v11.4S +str q9, [x0, #352] +mla v1.4S, v27.4S, v31.s[0] +sub v27.4s, v3.4s, v22.4s +add v3.4s, v3.4s, v22.4s +ldr q22, [x17, #+1088] +ldr q9, [x17, #+1104] +sqrdmulh v15.4S, v3.4S, v10.4S +mul 
v3.4S, v3.4S,v4.4S +ldr q10, [x17, #+1120] +ldr q4, [x17, #+1136] +str q13, [x0, #368] +mla v14.4S, v12.4S, v31.s[0] +sub v12.4s, v7.4s, v1.4s +add v7.4s, v7.4s, v1.4s +sqrdmulh v1.4S, v27.4S, v16.4S +mul v27.4S, v27.4S,v30.4S +mla v3.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v14.4s +add v18.4s, v18.4s, v14.4s +sqrdmulh v14.4S, v18.4S, v9.4S +mul v18.4S, v18.4S,v22.4S +ldr q9, [x0, #544] +mla v27.4S, v1.4S, v31.s[0] +ldr q1, [x0, #560] +sub v22.4s, v6.4s, v3.4s +add v6.4s, v6.4s, v3.4s +sqrdmulh v3.4S, v15.4S, v4.4S +ldr q16, [x17, #+1152] +mul v15.4S, v15.4S,v10.4S +ldr q4, [x17, #+1168] +mla v18.4S, v14.4S, v31.s[0] +ldr q14, [x17, #+1280] +sub v10.4s, v2.4s, v27.4s +ldr q30, [x0, #608] +add v2.4s, v2.4s, v27.4s +ldr q27, [x17, #+1296] +ldr q13, [x0, #624] +sqrdmulh v11.4S, v9.4S, v4.s[0] +mul v9.4S, v9.4S,v16.s[0] +mla v15.4S, v3.4S, v31.s[0] +sub v3.4s, v7.4s, v18.4s +add v7.4s, v7.4s, v18.4s +sqrdmulh v18.4S, v1.4S, v4.s[0] +mul v1.4S, v1.4S,v16.s[0] +str q6, [x0, #384] +str q22, [x0, #400] +str q2, [x0, #416] +str q10, [x0, #432] +mla v9.4S, v11.4S, v31.s[0] +sub v11.4s, v12.4s, v15.4s +add v12.4s, v12.4s, v15.4s +ldr q15, [x0, #512] +sqrdmulh v10.4S, v30.4S, v27.s[0] +ldr q2, [x0, #528] +mul v30.4S, v30.4S,v14.s[0] +str q7, [x0, #448] +str q3, [x0, #464] +str q12, [x0, #480] +str q11, [x0, #496] +mla v1.4S, v18.4S, v31.s[0] +sub v18.4s, v15.4s, v9.4s +add v15.4s, v15.4s, v9.4s +sqrdmulh v9.4S, v13.4S, v27.s[0] +ldr q11, [x0, #672] +mul v13.4S, v13.4S,v14.s[0] +ldr q12, [x0, #688] +mla v30.4S, v10.4S, v31.s[0] +ldr q10, [x0, #576] +sub v3.4s, v2.4s, v1.4s +add v2.4s, v2.4s, v1.4s +sqrdmulh v1.4S, v2.4S, v4.s[1] +ldr q7, [x0, #640] +mul v2.4S, v2.4S,v16.s[1] +ldr q22, [x0, #656] +mla v13.4S, v9.4S, v31.s[0] +ldr q9, [x0, #592] +sub v6.4s, v10.4s, v30.4s +add v10.4s, v10.4s, v30.4s +sqrdmulh v30.4S, v3.4S, v4.s[2] +ldr q21, [x17, #+1408] +mul v3.4S, v3.4S,v16.s[2] +ldr q20, [x17, #+1424] +mla v2.4S, v1.4S, v31.s[0] +sub v1.4s, v9.4s, v13.4s +add v9.4s, v9.4s, 
v13.4s +sqrdmulh v13.4S, v9.4S, v27.s[1] +ldr q17, [x0, #736] +mul v9.4S, v9.4S,v14.s[1] +ldr q26, [x0, #752] +mla v3.4S, v30.4S, v31.s[0] +sub v30.4s, v15.4s, v2.4s +add v15.4s, v15.4s, v2.4s +sqrdmulh v4.4S, v1.4S, v27.s[2] +ldr q16, [x0, #704] +mul v1.4S, v1.4S,v14.s[2] +ldr q2, [x0, #720] +mla v9.4S, v13.4S, v31.s[0] +sub v13.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v20.s[0] +ldr q8, [x17, #+1536] +mul v11.4S, v11.4S,v21.s[0] +ldr q28, [x17, #+1552] +trn1 v29.4S, v15.4S, v30.4S +trn2 v19.4S, v15.4S, v30.4S +mla v1.4S, v4.4S, v31.s[0] +sub v4.4s, v10.4s, v9.4s +add v10.4s, v10.4s, v9.4s +sqrdmulh v27.4S, v12.4S, v20.s[0] +mul v12.4S, v12.4S,v21.s[0] +trn1 v14.4S, v18.4S, v13.4S +trn2 v9.4S, v18.4S, v13.4S +mla v11.4S, v3.4S, v31.s[0] +sub v3.4s, v6.4s, v1.4s +add v6.4s, v6.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v28.s[0] +mul v17.4S, v17.4S,v8.s[0] +trn2 v18.2D, v29.2D, v14.2D +trn2 v13.2D, v19.2D, v9.2D +mla v12.4S, v27.4S, v31.s[0] +sub v27.4s, v7.4s, v11.4s +add v7.4s, v7.4s, v11.4s +sqrdmulh v11.4S, v26.4S, v28.s[0] +mul v26.4S, v26.4S,v8.s[0] +trn1 v15.2D, v29.2D, v14.2D +trn1 v30.2D, v19.2D, v9.2D +mla v17.4S, v1.4S, v31.s[0] +sub v1.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v22.4S, v20.s[1] +mul v22.4S, v22.4S,v21.s[1] +trn1 v9.4S, v10.4S, v4.4S +trn2 v19.4S, v10.4S, v4.4S +mla v26.4S, v11.4S, v31.s[0] +sub v11.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v1.4S, v20.s[2] +mul v1.4S, v1.4S,v21.s[2] +trn1 v14.4S, v6.4S, v3.4S +trn2 v29.4S, v6.4S, v3.4S +ldr q0, [x17, #+1184] +ldr q5, [x17, #+1200] +mla v22.4S, v12.4S, v31.s[0] +sub v12.4s, v2.4s, v26.4s +add v2.4s, v2.4s, v26.4s +sqrdmulh v26.4S, v2.4S, v28.s[1] +mul v2.4S, v2.4S,v8.s[1] +trn2 v6.2D, v9.2D, v14.2D +trn2 v3.2D, v19.2D, v29.2D +mla v1.4S, v17.4S, v31.s[0] +sub v17.4s, v7.4s, v22.4s +add v7.4s, v7.4s, v22.4s +sqrdmulh v20.4S, v12.4S, v28.s[2] +mul v12.4S, v12.4S,v8.s[2] +trn1 v10.2D, v9.2D, v14.2D +trn1 v4.2D, v19.2D, v29.2D 
+mla v2.4S, v26.4S, v31.s[0] +sub v26.4s, v27.4s, v1.4s +add v27.4s, v27.4s, v1.4s +sqrdmulh v1.4S, v18.4S, v5.4S +mul v18.4S, v18.4S,v0.4S +mla v12.4S, v20.4S, v31.s[0] +sub v20.4s, v16.4s, v2.4s +add v16.4s, v16.4s, v2.4s +ldr q28, [x17, #+1312] +ldr q8, [x17, #+1328] +sqrdmulh v2.4S, v13.4S, v5.4S +mul v13.4S, v13.4S,v0.4S +trn1 v29.4S, v7.4S, v17.4S +trn2 v19.4S, v7.4S, v17.4S +mla v18.4S, v1.4S, v31.s[0] +sub v1.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +ldr q5, [x17, #+1216] +ldr q0, [x17, #+1232] +trn1 v12.4S, v27.4S, v26.4S +trn2 v14.4S, v27.4S, v26.4S +sqrdmulh v9.4S, v6.4S, v8.4S +mul v6.4S, v6.4S,v28.4S +trn2 v27.2D, v29.2D, v12.2D +trn2 v26.2D, v19.2D, v14.2D +ldr q21, [x17, #+1248] +ldr q22, [x17, #+1264] +mla v13.4S, v2.4S, v31.s[0] +sub v2.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sqrdmulh v18.4S, v3.4S, v8.4S +mul v3.4S, v3.4S,v28.4S +trn1 v7.2D, v29.2D, v12.2D +trn1 v17.2D, v19.2D, v14.2D +ldr q14, [x17, #+1344] +ldr q19, [x17, #+1360] +mla v6.4S, v9.4S, v31.s[0] +sub v9.4s, v30.4s, v13.4s +add v30.4s, v30.4s, v13.4s +sqrdmulh v13.4S, v30.4S, v0.4S +mul v30.4S, v30.4S,v5.4S +trn1 v0.4S, v16.4S, v20.4S +trn2 v5.4S, v16.4S, v20.4S +ldr q12, [x17, #+1376] +ldr q29, [x17, #+1392] +mla v3.4S, v18.4S, v31.s[0] +sub v18.4s, v10.4s, v6.4s +add v10.4s, v10.4s, v6.4s +sqrdmulh v6.4S, v9.4S, v22.4S +mul v9.4S, v9.4S,v21.4S +trn1 v22.4S, v11.4S, v1.4S +trn2 v21.4S, v11.4S, v1.4S +mla v30.4S, v13.4S, v31.s[0] +sub v13.4s, v4.4s, v3.4s +add v4.4s, v4.4s, v3.4s +sqrdmulh v3.4S, v4.4S, v19.4S +mul v4.4S, v4.4S,v14.4S +ldr q8, [x17, #+1440] +ldr q28, [x17, #+1456] +trn2 v11.2D, v0.2D, v22.2D +trn2 v1.2D, v5.2D, v21.2D +mla v9.4S, v6.4S, v31.s[0] +sub v6.4s, v15.4s, v30.4s +add v15.4s, v15.4s, v30.4s +sqrdmulh v30.4S, v13.4S, v29.4S +mul v13.4S, v13.4S,v12.4S +trn1 v16.2D, v0.2D, v22.2D +trn1 v20.2D, v5.2D, v21.2D +mla v4.4S, v3.4S, v31.s[0] +sub v3.4s, v2.4s, v9.4s +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v27.4S, v28.4S +ldr q21, [x17, #+1568] +ldr q5, 
[x17, #+1584] +mul v27.4S, v27.4S,v8.4S +str q15, [x0, #512] +str q6, [x0, #528] +mla v13.4S, v30.4S, v31.s[0] +sub v30.4s, v10.4s, v4.4s +add v10.4s, v10.4s, v4.4s +sqrdmulh v4.4S, v26.4S, v28.4S +mul v26.4S, v26.4S,v8.4S +str q2, [x0, #544] +mla v27.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v13.4s +add v18.4s, v18.4s, v13.4s +ldr q13, [x17, #+1472] +ldr q2, [x17, #+1488] +sqrdmulh v28.4S, v11.4S, v5.4S +str q10, [x0, #576] +mul v11.4S, v11.4S,v21.4S +str q3, [x0, #560] +mla v26.4S, v4.4S, v31.s[0] +str q30, [x0, #592] +sub v30.4s, v7.4s, v27.4s +add v7.4s, v7.4s, v27.4s +ldr q27, [x17, #+1504] +ldr q4, [x17, #+1520] +sqrdmulh v3.4S, v1.4S, v5.4S +mul v1.4S, v1.4S,v21.4S +str q18, [x0, #608] +mla v11.4S, v28.4S, v31.s[0] +sub v28.4s, v17.4s, v26.4s +add v17.4s, v17.4s, v26.4s +ldr q26, [x17, #+1600] +ldr q18, [x17, #+1616] +sqrdmulh v5.4S, v17.4S, v2.4S +mul v17.4S, v17.4S,v13.4S +ldr q2, [x17, #+1632] +ldr q13, [x17, #+1648] +str q9, [x0, #624] +mla v1.4S, v3.4S, v31.s[0] +sub v3.4s, v16.4s, v11.4s +add v16.4s, v16.4s, v11.4s +sqrdmulh v11.4S, v28.4S, v4.4S +mul v28.4S, v28.4S,v27.4S +mla v17.4S, v5.4S, v31.s[0] +sub v5.4s, v20.4s, v1.4s +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v20.4S, v18.4S +mul v20.4S, v20.4S,v26.4S +ldr q18, [x0, #800] +mla v28.4S, v11.4S, v31.s[0] +ldr q11, [x0, #816] +sub v26.4s, v7.4s, v17.4s +add v7.4s, v7.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v13.4S +ldr q4, [x17, #+1664] +mul v5.4S, v5.4S,v2.4S +ldr q13, [x17, #+1680] +mla v20.4S, v1.4S, v31.s[0] +ldr q1, [x17, #+1792] +sub v2.4s, v30.4s, v28.4s +ldr q27, [x0, #864] +add v30.4s, v30.4s, v28.4s +ldr q28, [x17, #+1808] +ldr q9, [x0, #880] +sqrdmulh v21.4S, v18.4S, v13.s[0] +mul v18.4S, v18.4S,v4.s[0] +mla v5.4S, v17.4S, v31.s[0] +sub v17.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v11.4S, v13.s[0] +mul v11.4S, v11.4S,v4.s[0] +str q7, [x0, #640] +str q26, [x0, #656] +str q30, [x0, #672] +str q2, [x0, #688] +mla v18.4S, v21.4S, v31.s[0] +sub v21.4s, v3.4s, v5.4s +add 
v3.4s, v3.4s, v5.4s +ldr q5, [x0, #768] +sqrdmulh v2.4S, v27.4S, v28.s[0] +ldr q30, [x0, #784] +mul v27.4S, v27.4S,v1.s[0] +str q16, [x0, #704] +str q17, [x0, #720] +str q3, [x0, #736] +str q21, [x0, #752] +mla v11.4S, v20.4S, v31.s[0] +sub v20.4s, v5.4s, v18.4s +add v5.4s, v5.4s, v18.4s +sqrdmulh v18.4S, v9.4S, v28.s[0] +ldr q21, [x0, #928] +mul v9.4S, v9.4S,v1.s[0] +ldr q3, [x0, #944] +mla v27.4S, v2.4S, v31.s[0] +ldr q2, [x0, #832] +sub v17.4s, v30.4s, v11.4s +add v30.4s, v30.4s, v11.4s +sqrdmulh v11.4S, v30.4S, v13.s[1] +ldr q16, [x0, #896] +mul v30.4S, v30.4S,v4.s[1] +ldr q26, [x0, #912] +mla v9.4S, v18.4S, v31.s[0] +ldr q18, [x0, #848] +sub v7.4s, v2.4s, v27.4s +add v2.4s, v2.4s, v27.4s +sqrdmulh v27.4S, v17.4S, v13.s[2] +ldr q10, [x17, #+1920] +mul v17.4S, v17.4S,v4.s[2] +ldr q8, [x17, #+1936] +mla v30.4S, v11.4S, v31.s[0] +sub v11.4s, v18.4s, v9.4s +add v18.4s, v18.4s, v9.4s +sqrdmulh v9.4S, v18.4S, v28.s[1] +ldr q6, [x0, #992] +mul v18.4S, v18.4S,v1.s[1] +ldr q15, [x0, #1008] +mla v17.4S, v27.4S, v31.s[0] +sub v27.4s, v5.4s, v30.4s +add v5.4s, v5.4s, v30.4s +sqrdmulh v13.4S, v11.4S, v28.s[2] +ldr q4, [x0, #960] +mul v11.4S, v11.4S,v1.s[2] +ldr q30, [x0, #976] +mla v18.4S, v9.4S, v31.s[0] +sub v9.4s, v20.4s, v17.4s +add v20.4s, v20.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v8.s[0] +ldr q22, [x17, #+2048] +mul v21.4S, v21.4S,v10.s[0] +ldr q0, [x17, #+2064] +trn1 v29.4S, v5.4S, v27.4S +trn2 v12.4S, v5.4S, v27.4S +mla v11.4S, v13.4S, v31.s[0] +sub v13.4s, v2.4s, v18.4s +add v2.4s, v2.4s, v18.4s +sqrdmulh v28.4S, v3.4S, v8.s[0] +mul v3.4S, v3.4S,v10.s[0] +trn1 v1.4S, v20.4S, v9.4S +trn2 v18.4S, v20.4S, v9.4S +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v7.4s, v11.4s +add v7.4s, v7.4s, v11.4s +sqrdmulh v11.4S, v6.4S, v0.s[0] +mul v6.4S, v6.4S,v22.s[0] +trn2 v20.2D, v29.2D, v1.2D +trn2 v9.2D, v12.2D, v18.2D +mla v3.4S, v28.4S, v31.s[0] +sub v28.4s, v16.4s, v21.4s +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v0.s[0] +mul v15.4S, v15.4S,v22.s[0] +trn1 v5.2D, 
v29.2D, v1.2D +trn1 v27.2D, v12.2D, v18.2D +mla v6.4S, v11.4S, v31.s[0] +sub v11.4s, v26.4s, v3.4s +add v26.4s, v26.4s, v3.4s +sqrdmulh v3.4S, v26.4S, v8.s[1] +mul v26.4S, v26.4S,v10.s[1] +trn1 v18.4S, v2.4S, v13.4S +trn2 v12.4S, v2.4S, v13.4S +mla v15.4S, v21.4S, v31.s[0] +sub v21.4s, v4.4s, v6.4s +add v4.4s, v4.4s, v6.4s +sqrdmulh v6.4S, v11.4S, v8.s[2] +mul v11.4S, v11.4S,v10.s[2] +trn1 v1.4S, v7.4S, v17.4S +trn2 v29.4S, v7.4S, v17.4S +ldr q19, [x17, #+1696] +ldr q14, [x17, #+1712] +mla v26.4S, v3.4S, v31.s[0] +sub v3.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +sqrdmulh v15.4S, v30.4S, v0.s[1] +mul v30.4S, v30.4S,v22.s[1] +trn2 v7.2D, v18.2D, v1.2D +trn2 v17.2D, v12.2D, v29.2D +mla v11.4S, v6.4S, v31.s[0] +sub v6.4s, v16.4s, v26.4s +add v16.4s, v16.4s, v26.4s +sqrdmulh v8.4S, v3.4S, v0.s[2] +mul v3.4S, v3.4S,v22.s[2] +trn1 v2.2D, v18.2D, v1.2D +trn1 v13.2D, v12.2D, v29.2D +mla v30.4S, v15.4S, v31.s[0] +sub v15.4s, v28.4s, v11.4s +add v28.4s, v28.4s, v11.4s +sqrdmulh v11.4S, v20.4S, v14.4S +mul v20.4S, v20.4S,v19.4S +mla v3.4S, v8.4S, v31.s[0] +sub v8.4s, v4.4s, v30.4s +add v4.4s, v4.4s, v30.4s +ldr q0, [x17, #+1824] +ldr q22, [x17, #+1840] +sqrdmulh v30.4S, v9.4S, v14.4S +mul v9.4S, v9.4S,v19.4S +trn1 v29.4S, v16.4S, v6.4S +trn2 v12.4S, v16.4S, v6.4S +mla v20.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v3.4s +add v21.4s, v21.4s, v3.4s +ldr q14, [x17, #+1728] +ldr q19, [x17, #+1744] +trn1 v3.4S, v28.4S, v15.4S +trn2 v1.4S, v28.4S, v15.4S +sqrdmulh v18.4S, v7.4S, v22.4S +mul v7.4S, v7.4S,v0.4S +trn2 v28.2D, v29.2D, v3.2D +trn2 v15.2D, v12.2D, v1.2D +ldr q10, [x17, #+1760] +ldr q26, [x17, #+1776] +mla v9.4S, v30.4S, v31.s[0] +sub v30.4s, v5.4s, v20.4s +add v5.4s, v5.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v22.4S +mul v17.4S, v17.4S,v0.4S +trn1 v16.2D, v29.2D, v3.2D +trn1 v6.2D, v12.2D, v1.2D +ldr q1, [x17, #+1856] +ldr q12, [x17, #+1872] +mla v7.4S, v18.4S, v31.s[0] +sub v18.4s, v27.4s, v9.4s +add v27.4s, v27.4s, v9.4s +sqrdmulh v9.4S, v27.4S, v19.4S +mul v27.4S, 
v27.4S,v14.4S +trn1 v19.4S, v4.4S, v8.4S +trn2 v14.4S, v4.4S, v8.4S +ldr q3, [x17, #+1888] +ldr q29, [x17, #+1904] +mla v17.4S, v20.4S, v31.s[0] +sub v20.4s, v2.4s, v7.4s +add v2.4s, v2.4s, v7.4s +sqrdmulh v7.4S, v18.4S, v26.4S +mul v18.4S, v18.4S,v10.4S +trn1 v26.4S, v21.4S, v11.4S +trn2 v10.4S, v21.4S, v11.4S +mla v27.4S, v9.4S, v31.s[0] +sub v9.4s, v13.4s, v17.4s +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v13.4S, v12.4S +mul v13.4S, v13.4S,v1.4S +ldr q22, [x17, #+1952] +ldr q0, [x17, #+1968] +trn2 v21.2D, v19.2D, v26.2D +trn2 v11.2D, v14.2D, v10.2D +mla v18.4S, v7.4S, v31.s[0] +sub v7.4s, v5.4s, v27.4s +add v5.4s, v5.4s, v27.4s +sqrdmulh v27.4S, v9.4S, v29.4S +mul v9.4S, v9.4S,v3.4S +trn1 v4.2D, v19.2D, v26.2D +trn1 v8.2D, v14.2D, v10.2D +mla v13.4S, v17.4S, v31.s[0] +sub v17.4s, v30.4s, v18.4s +add v30.4s, v30.4s, v18.4s +sqrdmulh v18.4S, v28.4S, v0.4S +ldr q10, [x17, #+2080] +ldr q14, [x17, #+2096] +mul v28.4S, v28.4S,v22.4S +str q5, [x0, #768] +str q7, [x0, #784] +mla v9.4S, v27.4S, v31.s[0] +sub v27.4s, v2.4s, v13.4s +add v2.4s, v2.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v0.4S +mul v15.4S, v15.4S,v22.4S +str q30, [x0, #800] +mla v28.4S, v18.4S, v31.s[0] +sub v18.4s, v20.4s, v9.4s +add v20.4s, v20.4s, v9.4s +ldr q9, [x17, #+1984] +ldr q30, [x17, #+2000] +sqrdmulh v0.4S, v21.4S, v14.4S +str q2, [x0, #832] +mul v21.4S, v21.4S,v10.4S +str q17, [x0, #816] +mla v15.4S, v13.4S, v31.s[0] +str q27, [x0, #848] +sub v27.4s, v16.4s, v28.4s +add v16.4s, v16.4s, v28.4s +ldr q28, [x17, #+2016] +ldr q13, [x17, #+2032] +sqrdmulh v17.4S, v11.4S, v14.4S +mul v11.4S, v11.4S,v10.4S +str q20, [x0, #864] +mla v21.4S, v0.4S, v31.s[0] +sub v0.4s, v6.4s, v15.4s +add v6.4s, v6.4s, v15.4s +ldr q15, [x17, #+2112] +ldr q20, [x17, #+2128] +sqrdmulh v14.4S, v6.4S, v30.4S +mul v6.4S, v6.4S,v9.4S +ldr q30, [x17, #+2144] +ldr q9, [x17, #+2160] +str q18, [x0, #880] +mla v11.4S, v17.4S, v31.s[0] +sub v17.4s, v4.4s, v21.4s +add v4.4s, v4.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v13.4S +mul v0.4S, 
v0.4S,v28.4S +mla v6.4S, v14.4S, v31.s[0] +sub v14.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +sqrdmulh v11.4S, v8.4S, v20.4S +mul v8.4S, v8.4S,v15.4S +mla v0.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v6.4s +add v16.4s, v16.4s, v6.4s +sqrdmulh v6.4S, v14.4S, v9.4S +mul v14.4S, v14.4S,v30.4S +mla v8.4S, v11.4S, v31.s[0] +sub v11.4s, v27.4s, v0.4s +add v27.4s, v27.4s, v0.4s +mla v14.4S, v6.4S, v31.s[0] +sub v6.4s, v4.4s, v8.4s +add v4.4s, v4.4s, v8.4s +str q16, [x0, #896] +str q21, [x0, #912] +str q27, [x0, #928] +str q11, [x0, #944] +sub v11.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +str q4, [x0, #960] +str q6, [x0, #976] +str q17, [x0, #992] +str q11, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_4_0.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_4_0.s new file mode 100644 index 0000000..807dda1 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_4_0.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and 
this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 
9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 
14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, 
block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 
// Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, 
block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 
+.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer 
None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 +.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 
+.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 
1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // 
Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_4_0 +.global _ntt_u32_full_neon_asm_var_4_4_4_0 +ntt_u32_full_neon_asm_var_4_4_4_0: +_ntt_u32_full_neon_asm_var_4_4_4_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x0, #800] +ldr q29, [x0, #864] +ldr q28, [x0, #928] +ldr q27, [x0, #992] +ldr q26, [x0, #288] +ldr q25, [x0, #352] +ldr q24, [x0, #416] +ldr q23, [x0, #480] +ldr q22, [x0, #544] +ldr q21, [x0, #608] +ldr q20, [x0, #672] +ldr q19, [x0, #736] +ldr q18, 
[x0, #32] +ldr q17, [x0, #96] +ldr q16, [x0, #160] +ldr q3, [x0, #224] +ldr q2, [x17, #+0] +ldr q1, [x17, #+16] +ldr q0, [x17, #+32] +ldr q15, [x17, #+48] +ldr q14, [x17, #+64] +ldr q13, [x17, #+80] +ldr q12, [x17, #+96] +ldr q11, [x17, #+112] +sqrdmulh v10.4S, v30.4S, v1.s[0] +sqrdmulh v9.4S, v29.4S, v1.s[0] +sqrdmulh v8.4S, v28.4S, v1.s[0] +sqrdmulh v7.4S, v27.4S, v1.s[0] +mul v30.4S, v30.4S,v2.s[0] +mul v29.4S, v29.4S,v2.s[0] +mul v28.4S, v28.4S,v2.s[0] +mul v27.4S, v27.4S,v2.s[0] +mla v30.4S, v10.4S, v31.s[0] +mla v29.4S, v9.4S, v31.s[0] +mla v28.4S, v8.4S, v31.s[0] +mla v27.4S, v7.4S, v31.s[0] +sub v7.4s, v26.4s, v30.4s +sub v8.4s, v25.4s, v29.4s +sub v9.4s, v24.4s, v28.4s +sub v10.4s, v23.4s, v27.4s +add v26.4s, v26.4s, v30.4s +add v25.4s, v25.4s, v29.4s +add v24.4s, v24.4s, v28.4s +add v23.4s, v23.4s, v27.4s +sqrdmulh v27.4S, v22.4S, v1.s[0] +sqrdmulh v28.4S, v21.4S, v1.s[0] +sqrdmulh v29.4S, v20.4S, v1.s[0] +sqrdmulh v30.4S, v19.4S, v1.s[0] +mul v22.4S, v22.4S,v2.s[0] +mul v21.4S, v21.4S,v2.s[0] +mul v20.4S, v20.4S,v2.s[0] +mul v19.4S, v19.4S,v2.s[0] +mla v22.4S, v27.4S, v31.s[0] +mla v21.4S, v28.4S, v31.s[0] +mla v20.4S, v29.4S, v31.s[0] +mla v19.4S, v30.4S, v31.s[0] +sub v30.4s, v18.4s, v22.4s +sub v29.4s, v17.4s, v21.4s +sub v28.4s, v16.4s, v20.4s +sub v27.4s, v3.4s, v19.4s +add v18.4s, v18.4s, v22.4s +add v17.4s, v17.4s, v21.4s +add v16.4s, v16.4s, v20.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v24.4S, v1.s[1] +sqrdmulh v20.4S, v23.4S, v1.s[1] +sqrdmulh v21.4S, v26.4S, v1.s[1] +sqrdmulh v22.4S, v25.4S, v1.s[1] +mul v24.4S, v24.4S,v2.s[1] +mul v23.4S, v23.4S,v2.s[1] +mul v26.4S, v26.4S,v2.s[1] +mul v25.4S, v25.4S,v2.s[1] +mla v24.4S, v19.4S, v31.s[0] +mla v23.4S, v20.4S, v31.s[0] +mla v26.4S, v21.4S, v31.s[0] +mla v25.4S, v22.4S, v31.s[0] +sub v22.4s, v16.4s, v24.4s +sub v21.4s, v3.4s, v23.4s +sub v20.4s, v18.4s, v26.4s +sub v19.4s, v17.4s, v25.4s +add v16.4s, v16.4s, v24.4s +add v3.4s, v3.4s, v23.4s +add v18.4s, v18.4s, v26.4s +add v17.4s, v17.4s, 
v25.4s +sqrdmulh v25.4S, v9.4S, v1.s[2] +sqrdmulh v26.4S, v10.4S, v1.s[2] +sqrdmulh v23.4S, v7.4S, v1.s[2] +sqrdmulh v24.4S, v8.4S, v1.s[2] +mul v9.4S, v9.4S,v2.s[2] +mul v10.4S, v10.4S,v2.s[2] +mul v7.4S, v7.4S,v2.s[2] +mul v8.4S, v8.4S,v2.s[2] +mla v9.4S, v25.4S, v31.s[0] +mla v10.4S, v26.4S, v31.s[0] +mla v7.4S, v23.4S, v31.s[0] +mla v8.4S, v24.4S, v31.s[0] +sub v24.4s, v28.4s, v9.4s +sub v23.4s, v27.4s, v10.4s +sub v26.4s, v30.4s, v7.4s +sub v25.4s, v29.4s, v8.4s +add v28.4s, v28.4s, v9.4s +add v27.4s, v27.4s, v10.4s +add v30.4s, v30.4s, v7.4s +add v29.4s, v29.4s, v8.4s +sqrdmulh v8.4S, v16.4S, v15.s[0] +sqrdmulh v7.4S, v3.4S, v15.s[0] +sqrdmulh v10.4S, v22.4S, v15.s[1] +sqrdmulh v9.4S, v21.4S, v15.s[1] +mul v16.4S, v16.4S,v0.s[0] +mul v3.4S, v3.4S,v0.s[0] +mul v22.4S, v22.4S,v0.s[1] +mul v21.4S, v21.4S,v0.s[1] +mla v16.4S, v8.4S, v31.s[0] +mla v3.4S, v7.4S, v31.s[0] +mla v22.4S, v10.4S, v31.s[0] +mla v21.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v16.4s +sub v10.4s, v17.4s, v3.4s +sub v7.4s, v20.4s, v22.4s +sub v8.4s, v19.4s, v21.4s +add v18.4s, v18.4s, v16.4s +add v17.4s, v17.4s, v3.4s +add v20.4s, v20.4s, v22.4s +add v19.4s, v19.4s, v21.4s +sqrdmulh v21.4S, v28.4S, v15.s[2] +sqrdmulh v22.4S, v27.4S, v15.s[2] +sqrdmulh v3.4S, v24.4S, v15.s[3] +sqrdmulh v16.4S, v23.4S, v15.s[3] +mul v28.4S, v28.4S,v0.s[2] +mul v27.4S, v27.4S,v0.s[2] +mul v24.4S, v24.4S,v0.s[3] +mul v23.4S, v23.4S,v0.s[3] +mla v28.4S, v21.4S, v31.s[0] +mla v27.4S, v22.4S, v31.s[0] +mla v24.4S, v3.4S, v31.s[0] +mla v23.4S, v16.4S, v31.s[0] +sub v16.4s, v30.4s, v28.4s +sub v3.4s, v29.4s, v27.4s +sub v22.4s, v26.4s, v24.4s +sub v21.4s, v25.4s, v23.4s +add v30.4s, v30.4s, v28.4s +add v29.4s, v29.4s, v27.4s +add v26.4s, v26.4s, v24.4s +add v25.4s, v25.4s, v23.4s +sqrdmulh v23.4S, v17.4S, v13.s[0] +sqrdmulh v24.4S, v10.4S, v13.s[1] +sqrdmulh v27.4S, v19.4S, v13.s[2] +sqrdmulh v28.4S, v8.4S, v13.s[3] +mul v17.4S, v17.4S,v14.s[0] +mul v10.4S, v10.4S,v14.s[1] +mul v19.4S, v19.4S,v14.s[2] +mul v8.4S, 
v8.4S,v14.s[3] +mla v17.4S, v23.4S, v31.s[0] +mla v10.4S, v24.4S, v31.s[0] +mla v19.4S, v27.4S, v31.s[0] +mla v8.4S, v28.4S, v31.s[0] +sub v28.4s, v18.4s, v17.4s +sub v27.4s, v9.4s, v10.4s +sub v24.4s, v20.4s, v19.4s +sub v23.4s, v7.4s, v8.4s +add v18.4s, v18.4s, v17.4s +add v9.4s, v9.4s, v10.4s +add v20.4s, v20.4s, v19.4s +add v7.4s, v7.4s, v8.4s +sqrdmulh v8.4S, v29.4S, v11.s[0] +sqrdmulh v19.4S, v3.4S, v11.s[1] +sqrdmulh v10.4S, v25.4S, v11.s[2] +sqrdmulh v17.4S, v21.4S, v11.s[3] +mul v29.4S, v29.4S,v12.s[0] +mul v3.4S, v3.4S,v12.s[1] +mul v25.4S, v25.4S,v12.s[2] +mul v21.4S, v21.4S,v12.s[3] +mla v29.4S, v8.4S, v31.s[0] +mla v3.4S, v19.4S, v31.s[0] +mla v25.4S, v10.4S, v31.s[0] +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v30.4s, v29.4s +sub v10.4s, v16.4s, v3.4s +sub v19.4s, v26.4s, v25.4s +sub v8.4s, v22.4s, v21.4s +add v30.4s, v30.4s, v29.4s +add v16.4s, v16.4s, v3.4s +add v26.4s, v26.4s, v25.4s +add v22.4s, v22.4s, v21.4s +str q18, [x0, #32] +str q28, [x0, #96] +str q9, [x0, #160] +str q27, [x0, #224] +str q20, [x0, #288] +str q24, [x0, #352] +str q7, [x0, #416] +str q23, [x0, #480] +str q30, [x0, #544] +str q17, [x0, #608] +str q16, [x0, #672] +str q10, [x0, #736] +str q26, [x0, #800] +str q19, [x0, #864] +str q22, [x0, #928] +str q8, [x0, #992] +ldr q8, [x0, #816] +ldr q22, [x0, #880] +ldr q19, [x0, #944] +ldr q26, [x0, #1008] +ldr q10, [x0, #304] +ldr q16, [x0, #368] +ldr q17, [x0, #432] +ldr q30, [x0, #496] +ldr q23, [x0, #560] +ldr q7, [x0, #624] +ldr q24, [x0, #688] +ldr q20, [x0, #752] +ldr q27, [x0, #48] +ldr q9, [x0, #112] +ldr q28, [x0, #176] +ldr q18, [x0, #240] +sqrdmulh v21.4S, v8.4S, v1.s[0] +sqrdmulh v25.4S, v22.4S, v1.s[0] +sqrdmulh v3.4S, v19.4S, v1.s[0] +sqrdmulh v29.4S, v26.4S, v1.s[0] +mul v8.4S, v8.4S,v2.s[0] +mul v22.4S, v22.4S,v2.s[0] +mul v19.4S, v19.4S,v2.s[0] +mul v26.4S, v26.4S,v2.s[0] +mla v8.4S, v21.4S, v31.s[0] +mla v22.4S, v25.4S, v31.s[0] +mla v19.4S, v3.4S, v31.s[0] +mla v26.4S, v29.4S, v31.s[0] +sub v29.4s, v10.4s, v8.4s +sub 
v3.4s, v16.4s, v22.4s +sub v25.4s, v17.4s, v19.4s +sub v21.4s, v30.4s, v26.4s +add v10.4s, v10.4s, v8.4s +add v16.4s, v16.4s, v22.4s +add v17.4s, v17.4s, v19.4s +add v30.4s, v30.4s, v26.4s +sqrdmulh v26.4S, v23.4S, v1.s[0] +sqrdmulh v19.4S, v7.4S, v1.s[0] +sqrdmulh v22.4S, v24.4S, v1.s[0] +sqrdmulh v8.4S, v20.4S, v1.s[0] +mul v23.4S, v23.4S,v2.s[0] +mul v7.4S, v7.4S,v2.s[0] +mul v24.4S, v24.4S,v2.s[0] +mul v20.4S, v20.4S,v2.s[0] +mla v23.4S, v26.4S, v31.s[0] +mla v7.4S, v19.4S, v31.s[0] +mla v24.4S, v22.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v27.4s, v23.4s +sub v22.4s, v9.4s, v7.4s +sub v19.4s, v28.4s, v24.4s +sub v26.4s, v18.4s, v20.4s +add v27.4s, v27.4s, v23.4s +add v9.4s, v9.4s, v7.4s +add v28.4s, v28.4s, v24.4s +add v18.4s, v18.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v1.s[1] +sqrdmulh v24.4S, v30.4S, v1.s[1] +sqrdmulh v7.4S, v10.4S, v1.s[1] +sqrdmulh v23.4S, v16.4S, v1.s[1] +mul v17.4S, v17.4S,v2.s[1] +mul v30.4S, v30.4S,v2.s[1] +mul v10.4S, v10.4S,v2.s[1] +mul v16.4S, v16.4S,v2.s[1] +mla v17.4S, v20.4S, v31.s[0] +mla v30.4S, v24.4S, v31.s[0] +mla v10.4S, v7.4S, v31.s[0] +mla v16.4S, v23.4S, v31.s[0] +sub v23.4s, v28.4s, v17.4s +sub v7.4s, v18.4s, v30.4s +sub v24.4s, v27.4s, v10.4s +sub v20.4s, v9.4s, v16.4s +add v28.4s, v28.4s, v17.4s +add v18.4s, v18.4s, v30.4s +add v27.4s, v27.4s, v10.4s +add v9.4s, v9.4s, v16.4s +sqrdmulh v16.4S, v25.4S, v1.s[2] +sqrdmulh v10.4S, v21.4S, v1.s[2] +sqrdmulh v30.4S, v29.4S, v1.s[2] +sqrdmulh v17.4S, v3.4S, v1.s[2] +mul v25.4S, v25.4S,v2.s[2] +mul v21.4S, v21.4S,v2.s[2] +mul v29.4S, v29.4S,v2.s[2] +mul v3.4S, v3.4S,v2.s[2] +mla v25.4S, v16.4S, v31.s[0] +mla v21.4S, v10.4S, v31.s[0] +mla v29.4S, v30.4S, v31.s[0] +mla v3.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v25.4s +sub v30.4s, v26.4s, v21.4s +sub v10.4s, v8.4s, v29.4s +sub v16.4s, v22.4s, v3.4s +add v19.4s, v19.4s, v25.4s +add v26.4s, v26.4s, v21.4s +add v8.4s, v8.4s, v29.4s +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v28.4S, v15.s[0] +sqrdmulh v29.4S, v18.4S, 
v15.s[0] +sqrdmulh v21.4S, v23.4S, v15.s[1] +sqrdmulh v25.4S, v7.4S, v15.s[1] +mul v28.4S, v28.4S,v0.s[0] +mul v18.4S, v18.4S,v0.s[0] +mul v23.4S, v23.4S,v0.s[1] +mul v7.4S, v7.4S,v0.s[1] +mla v28.4S, v3.4S, v31.s[0] +mla v18.4S, v29.4S, v31.s[0] +mla v23.4S, v21.4S, v31.s[0] +mla v7.4S, v25.4S, v31.s[0] +sub v25.4s, v27.4s, v28.4s +sub v21.4s, v9.4s, v18.4s +sub v29.4s, v24.4s, v23.4s +sub v3.4s, v20.4s, v7.4s +add v27.4s, v27.4s, v28.4s +add v9.4s, v9.4s, v18.4s +add v24.4s, v24.4s, v23.4s +add v20.4s, v20.4s, v7.4s +sqrdmulh v7.4S, v19.4S, v15.s[2] +sqrdmulh v23.4S, v26.4S, v15.s[2] +sqrdmulh v18.4S, v17.4S, v15.s[3] +sqrdmulh v28.4S, v30.4S, v15.s[3] +mul v19.4S, v19.4S,v0.s[2] +mul v26.4S, v26.4S,v0.s[2] +mul v17.4S, v17.4S,v0.s[3] +mul v30.4S, v30.4S,v0.s[3] +mla v19.4S, v7.4S, v31.s[0] +mla v26.4S, v23.4S, v31.s[0] +mla v17.4S, v18.4S, v31.s[0] +mla v30.4S, v28.4S, v31.s[0] +sub v28.4s, v8.4s, v19.4s +sub v18.4s, v22.4s, v26.4s +sub v23.4s, v10.4s, v17.4s +sub v7.4s, v16.4s, v30.4s +add v8.4s, v8.4s, v19.4s +add v22.4s, v22.4s, v26.4s +add v10.4s, v10.4s, v17.4s +add v16.4s, v16.4s, v30.4s +sqrdmulh v30.4S, v9.4S, v13.s[0] +sqrdmulh v17.4S, v21.4S, v13.s[1] +sqrdmulh v26.4S, v20.4S, v13.s[2] +sqrdmulh v19.4S, v3.4S, v13.s[3] +mul v9.4S, v9.4S,v14.s[0] +mul v21.4S, v21.4S,v14.s[1] +mul v20.4S, v20.4S,v14.s[2] +mul v3.4S, v3.4S,v14.s[3] +mla v9.4S, v30.4S, v31.s[0] +mla v21.4S, v17.4S, v31.s[0] +mla v20.4S, v26.4S, v31.s[0] +mla v3.4S, v19.4S, v31.s[0] +sub v19.4s, v27.4s, v9.4s +sub v26.4s, v25.4s, v21.4s +sub v17.4s, v24.4s, v20.4s +sub v30.4s, v29.4s, v3.4s +add v27.4s, v27.4s, v9.4s +add v25.4s, v25.4s, v21.4s +add v24.4s, v24.4s, v20.4s +add v29.4s, v29.4s, v3.4s +sqrdmulh v3.4S, v22.4S, v11.s[0] +sqrdmulh v20.4S, v18.4S, v11.s[1] +sqrdmulh v21.4S, v16.4S, v11.s[2] +sqrdmulh v9.4S, v7.4S, v11.s[3] +mul v22.4S, v22.4S,v12.s[0] +mul v18.4S, v18.4S,v12.s[1] +mul v16.4S, v16.4S,v12.s[2] +mul v7.4S, v7.4S,v12.s[3] +mla v22.4S, v3.4S, v31.s[0] +mla v18.4S, 
v20.4S, v31.s[0] +mla v16.4S, v21.4S, v31.s[0] +mla v7.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v22.4s +sub v21.4s, v28.4s, v18.4s +sub v20.4s, v10.4s, v16.4s +sub v3.4s, v23.4s, v7.4s +add v8.4s, v8.4s, v22.4s +add v28.4s, v28.4s, v18.4s +add v10.4s, v10.4s, v16.4s +add v23.4s, v23.4s, v7.4s +str q27, [x0, #48] +str q19, [x0, #112] +str q25, [x0, #176] +str q26, [x0, #240] +str q24, [x0, #304] +str q17, [x0, #368] +str q29, [x0, #432] +str q30, [x0, #496] +str q8, [x0, #560] +str q9, [x0, #624] +str q28, [x0, #688] +str q21, [x0, #752] +str q10, [x0, #816] +str q20, [x0, #880] +str q23, [x0, #944] +str q3, [x0, #1008] +ldr q3, [x0, #768] +ldr q23, [x0, #832] +ldr q20, [x0, #896] +ldr q10, [x0, #960] +ldr q21, [x0, #256] +ldr q28, [x0, #320] +ldr q9, [x0, #384] +ldr q8, [x0, #448] +ldr q30, [x0, #512] +ldr q29, [x0, #576] +ldr q17, [x0, #640] +ldr q24, [x0, #704] +ldr q26, [x0, #0] +ldr q25, [x0, #64] +ldr q19, [x0, #128] +ldr q27, [x0, #192] +sqrdmulh v7.4S, v3.4S, v1.s[0] +sqrdmulh v16.4S, v23.4S, v1.s[0] +sqrdmulh v18.4S, v20.4S, v1.s[0] +sqrdmulh v22.4S, v10.4S, v1.s[0] +mul v3.4S, v3.4S,v2.s[0] +mul v23.4S, v23.4S,v2.s[0] +mul v20.4S, v20.4S,v2.s[0] +mul v10.4S, v10.4S,v2.s[0] +mla v3.4S, v7.4S, v31.s[0] +mla v23.4S, v16.4S, v31.s[0] +mla v20.4S, v18.4S, v31.s[0] +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v3.4s +sub v18.4s, v28.4s, v23.4s +sub v16.4s, v9.4s, v20.4s +sub v7.4s, v8.4s, v10.4s +add v21.4s, v21.4s, v3.4s +add v28.4s, v28.4s, v23.4s +add v9.4s, v9.4s, v20.4s +add v8.4s, v8.4s, v10.4s +sqrdmulh v10.4S, v30.4S, v1.s[0] +sqrdmulh v20.4S, v29.4S, v1.s[0] +sqrdmulh v23.4S, v17.4S, v1.s[0] +sqrdmulh v3.4S, v24.4S, v1.s[0] +mul v30.4S, v30.4S,v2.s[0] +mul v29.4S, v29.4S,v2.s[0] +mul v17.4S, v17.4S,v2.s[0] +mul v24.4S, v24.4S,v2.s[0] +mla v30.4S, v10.4S, v31.s[0] +mla v29.4S, v20.4S, v31.s[0] +mla v17.4S, v23.4S, v31.s[0] +mla v24.4S, v3.4S, v31.s[0] +sub v3.4s, v26.4s, v30.4s +sub v23.4s, v25.4s, v29.4s +sub v20.4s, v19.4s, v17.4s +sub v10.4s, 
v27.4s, v24.4s +add v26.4s, v26.4s, v30.4s +add v25.4s, v25.4s, v29.4s +add v19.4s, v19.4s, v17.4s +add v27.4s, v27.4s, v24.4s +sqrdmulh v24.4S, v9.4S, v1.s[1] +sqrdmulh v17.4S, v8.4S, v1.s[1] +sqrdmulh v29.4S, v21.4S, v1.s[1] +sqrdmulh v30.4S, v28.4S, v1.s[1] +mul v9.4S, v9.4S,v2.s[1] +mul v8.4S, v8.4S,v2.s[1] +mul v21.4S, v21.4S,v2.s[1] +mul v28.4S, v28.4S,v2.s[1] +mla v9.4S, v24.4S, v31.s[0] +mla v8.4S, v17.4S, v31.s[0] +mla v21.4S, v29.4S, v31.s[0] +mla v28.4S, v30.4S, v31.s[0] +sub v30.4s, v19.4s, v9.4s +sub v29.4s, v27.4s, v8.4s +sub v17.4s, v26.4s, v21.4s +sub v24.4s, v25.4s, v28.4s +add v19.4s, v19.4s, v9.4s +add v27.4s, v27.4s, v8.4s +add v26.4s, v26.4s, v21.4s +add v25.4s, v25.4s, v28.4s +sqrdmulh v28.4S, v16.4S, v1.s[2] +sqrdmulh v21.4S, v7.4S, v1.s[2] +sqrdmulh v8.4S, v22.4S, v1.s[2] +sqrdmulh v9.4S, v18.4S, v1.s[2] +mul v16.4S, v16.4S,v2.s[2] +mul v7.4S, v7.4S,v2.s[2] +mul v22.4S, v22.4S,v2.s[2] +mul v18.4S, v18.4S,v2.s[2] +mla v16.4S, v28.4S, v31.s[0] +mla v7.4S, v21.4S, v31.s[0] +mla v22.4S, v8.4S, v31.s[0] +mla v18.4S, v9.4S, v31.s[0] +sub v9.4s, v20.4s, v16.4s +sub v8.4s, v10.4s, v7.4s +sub v21.4s, v3.4s, v22.4s +sub v28.4s, v23.4s, v18.4s +add v20.4s, v20.4s, v16.4s +add v10.4s, v10.4s, v7.4s +add v3.4s, v3.4s, v22.4s +add v23.4s, v23.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v15.s[0] +sqrdmulh v22.4S, v27.4S, v15.s[0] +sqrdmulh v7.4S, v30.4S, v15.s[1] +sqrdmulh v16.4S, v29.4S, v15.s[1] +mul v19.4S, v19.4S,v0.s[0] +mul v27.4S, v27.4S,v0.s[0] +mul v30.4S, v30.4S,v0.s[1] +mul v29.4S, v29.4S,v0.s[1] +mla v19.4S, v18.4S, v31.s[0] +mla v27.4S, v22.4S, v31.s[0] +mla v30.4S, v7.4S, v31.s[0] +mla v29.4S, v16.4S, v31.s[0] +sub v16.4s, v26.4s, v19.4s +sub v7.4s, v25.4s, v27.4s +sub v22.4s, v17.4s, v30.4s +sub v18.4s, v24.4s, v29.4s +add v26.4s, v26.4s, v19.4s +add v25.4s, v25.4s, v27.4s +add v17.4s, v17.4s, v30.4s +add v24.4s, v24.4s, v29.4s +sqrdmulh v29.4S, v20.4S, v15.s[2] +sqrdmulh v30.4S, v10.4S, v15.s[2] +sqrdmulh v27.4S, v9.4S, v15.s[3] +sqrdmulh v19.4S, 
v8.4S, v15.s[3] +mul v20.4S, v20.4S,v0.s[2] +mul v10.4S, v10.4S,v0.s[2] +mul v9.4S, v9.4S,v0.s[3] +mul v8.4S, v8.4S,v0.s[3] +mla v20.4S, v29.4S, v31.s[0] +mla v10.4S, v30.4S, v31.s[0] +mla v9.4S, v27.4S, v31.s[0] +mla v8.4S, v19.4S, v31.s[0] +sub v19.4s, v3.4s, v20.4s +sub v27.4s, v23.4s, v10.4s +sub v30.4s, v21.4s, v9.4s +sub v29.4s, v28.4s, v8.4s +add v3.4s, v3.4s, v20.4s +add v23.4s, v23.4s, v10.4s +add v21.4s, v21.4s, v9.4s +add v28.4s, v28.4s, v8.4s +sqrdmulh v8.4S, v25.4S, v13.s[0] +sqrdmulh v9.4S, v7.4S, v13.s[1] +sqrdmulh v10.4S, v24.4S, v13.s[2] +sqrdmulh v20.4S, v18.4S, v13.s[3] +mul v25.4S, v25.4S,v14.s[0] +mul v7.4S, v7.4S,v14.s[1] +mul v24.4S, v24.4S,v14.s[2] +mul v18.4S, v18.4S,v14.s[3] +mla v25.4S, v8.4S, v31.s[0] +mla v7.4S, v9.4S, v31.s[0] +mla v24.4S, v10.4S, v31.s[0] +mla v18.4S, v20.4S, v31.s[0] +sub v20.4s, v26.4s, v25.4s +sub v10.4s, v16.4s, v7.4s +sub v9.4s, v17.4s, v24.4s +sub v8.4s, v22.4s, v18.4s +add v26.4s, v26.4s, v25.4s +add v16.4s, v16.4s, v7.4s +add v17.4s, v17.4s, v24.4s +add v22.4s, v22.4s, v18.4s +sqrdmulh v18.4S, v23.4S, v11.s[0] +sqrdmulh v24.4S, v27.4S, v11.s[1] +sqrdmulh v7.4S, v28.4S, v11.s[2] +sqrdmulh v25.4S, v29.4S, v11.s[3] +mul v23.4S, v23.4S,v12.s[0] +mul v27.4S, v27.4S,v12.s[1] +mul v28.4S, v28.4S,v12.s[2] +mul v29.4S, v29.4S,v12.s[3] +mla v23.4S, v18.4S, v31.s[0] +mla v27.4S, v24.4S, v31.s[0] +mla v28.4S, v7.4S, v31.s[0] +mla v29.4S, v25.4S, v31.s[0] +sub v25.4s, v3.4s, v23.4s +sub v7.4s, v19.4s, v27.4s +sub v24.4s, v21.4s, v28.4s +sub v18.4s, v30.4s, v29.4s +add v3.4s, v3.4s, v23.4s +add v19.4s, v19.4s, v27.4s +add v21.4s, v21.4s, v28.4s +add v30.4s, v30.4s, v29.4s +str q26, [x0, #0] +str q20, [x0, #64] +str q16, [x0, #128] +str q10, [x0, #192] +str q17, [x0, #256] +str q9, [x0, #320] +str q22, [x0, #384] +str q8, [x0, #448] +str q3, [x0, #512] +str q25, [x0, #576] +str q19, [x0, #640] +str q7, [x0, #704] +str q21, [x0, #768] +str q24, [x0, #832] +str q30, [x0, #896] +str q18, [x0, #960] +ldr q18, [x0, #784] +ldr 
q30, [x0, #848] +ldr q24, [x0, #912] +ldr q21, [x0, #976] +ldr q7, [x0, #272] +ldr q19, [x0, #336] +ldr q25, [x0, #400] +ldr q3, [x0, #464] +ldr q8, [x0, #528] +ldr q22, [x0, #592] +ldr q9, [x0, #656] +ldr q17, [x0, #720] +ldr q10, [x0, #16] +ldr q16, [x0, #80] +ldr q20, [x0, #144] +ldr q26, [x0, #208] +sqrdmulh v29.4S, v18.4S, v1.s[0] +sqrdmulh v28.4S, v30.4S, v1.s[0] +sqrdmulh v27.4S, v24.4S, v1.s[0] +sqrdmulh v23.4S, v21.4S, v1.s[0] +mul v18.4S, v18.4S,v2.s[0] +mul v30.4S, v30.4S,v2.s[0] +mul v24.4S, v24.4S,v2.s[0] +mul v21.4S, v21.4S,v2.s[0] +mla v18.4S, v29.4S, v31.s[0] +mla v30.4S, v28.4S, v31.s[0] +mla v24.4S, v27.4S, v31.s[0] +mla v21.4S, v23.4S, v31.s[0] +sub v23.4s, v7.4s, v18.4s +sub v27.4s, v19.4s, v30.4s +sub v28.4s, v25.4s, v24.4s +sub v29.4s, v3.4s, v21.4s +add v7.4s, v7.4s, v18.4s +add v19.4s, v19.4s, v30.4s +add v25.4s, v25.4s, v24.4s +add v3.4s, v3.4s, v21.4s +sqrdmulh v21.4S, v8.4S, v1.s[0] +sqrdmulh v24.4S, v22.4S, v1.s[0] +sqrdmulh v30.4S, v9.4S, v1.s[0] +sqrdmulh v18.4S, v17.4S, v1.s[0] +mul v8.4S, v8.4S,v2.s[0] +mul v22.4S, v22.4S,v2.s[0] +mul v9.4S, v9.4S,v2.s[0] +mul v17.4S, v17.4S,v2.s[0] +mla v8.4S, v21.4S, v31.s[0] +mla v22.4S, v24.4S, v31.s[0] +mla v9.4S, v30.4S, v31.s[0] +mla v17.4S, v18.4S, v31.s[0] +sub v18.4s, v10.4s, v8.4s +sub v30.4s, v16.4s, v22.4s +sub v24.4s, v20.4s, v9.4s +sub v21.4s, v26.4s, v17.4s +add v10.4s, v10.4s, v8.4s +add v16.4s, v16.4s, v22.4s +add v20.4s, v20.4s, v9.4s +add v26.4s, v26.4s, v17.4s +sqrdmulh v17.4S, v25.4S, v1.s[1] +sqrdmulh v9.4S, v3.4S, v1.s[1] +sqrdmulh v22.4S, v7.4S, v1.s[1] +sqrdmulh v8.4S, v19.4S, v1.s[1] +mul v25.4S, v25.4S,v2.s[1] +mul v3.4S, v3.4S,v2.s[1] +mul v7.4S, v7.4S,v2.s[1] +mul v19.4S, v19.4S,v2.s[1] +mla v25.4S, v17.4S, v31.s[0] +mla v3.4S, v9.4S, v31.s[0] +mla v7.4S, v22.4S, v31.s[0] +mla v19.4S, v8.4S, v31.s[0] +sub v8.4s, v20.4s, v25.4s +sub v22.4s, v26.4s, v3.4s +sub v9.4s, v10.4s, v7.4s +sub v17.4s, v16.4s, v19.4s +add v20.4s, v20.4s, v25.4s +add v26.4s, v26.4s, v3.4s +add 
v10.4s, v10.4s, v7.4s +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v28.4S, v1.s[2] +sqrdmulh v7.4S, v29.4S, v1.s[2] +sqrdmulh v3.4S, v23.4S, v1.s[2] +sqrdmulh v25.4S, v27.4S, v1.s[2] +mul v28.4S, v28.4S,v2.s[2] +mul v29.4S, v29.4S,v2.s[2] +mul v23.4S, v23.4S,v2.s[2] +mul v27.4S, v27.4S,v2.s[2] +mla v28.4S, v19.4S, v31.s[0] +mla v29.4S, v7.4S, v31.s[0] +mla v23.4S, v3.4S, v31.s[0] +mla v27.4S, v25.4S, v31.s[0] +sub v25.4s, v24.4s, v28.4s +sub v3.4s, v21.4s, v29.4s +sub v7.4s, v18.4s, v23.4s +sub v19.4s, v30.4s, v27.4s +add v24.4s, v24.4s, v28.4s +add v21.4s, v21.4s, v29.4s +add v18.4s, v18.4s, v23.4s +add v30.4s, v30.4s, v27.4s +sqrdmulh v27.4S, v20.4S, v15.s[0] +sqrdmulh v23.4S, v26.4S, v15.s[0] +sqrdmulh v29.4S, v8.4S, v15.s[1] +sqrdmulh v28.4S, v22.4S, v15.s[1] +mul v20.4S, v20.4S,v0.s[0] +mul v26.4S, v26.4S,v0.s[0] +mul v8.4S, v8.4S,v0.s[1] +mul v22.4S, v22.4S,v0.s[1] +mla v20.4S, v27.4S, v31.s[0] +mla v26.4S, v23.4S, v31.s[0] +mla v8.4S, v29.4S, v31.s[0] +mla v22.4S, v28.4S, v31.s[0] +sub v28.4s, v10.4s, v20.4s +sub v29.4s, v16.4s, v26.4s +sub v23.4s, v9.4s, v8.4s +sub v27.4s, v17.4s, v22.4s +add v10.4s, v10.4s, v20.4s +add v16.4s, v16.4s, v26.4s +add v9.4s, v9.4s, v8.4s +add v17.4s, v17.4s, v22.4s +sqrdmulh v22.4S, v24.4S, v15.s[2] +sqrdmulh v8.4S, v21.4S, v15.s[2] +sqrdmulh v26.4S, v25.4S, v15.s[3] +sqrdmulh v20.4S, v3.4S, v15.s[3] +mul v24.4S, v24.4S,v0.s[2] +mul v21.4S, v21.4S,v0.s[2] +mul v25.4S, v25.4S,v0.s[3] +mul v3.4S, v3.4S,v0.s[3] +mla v24.4S, v22.4S, v31.s[0] +mla v21.4S, v8.4S, v31.s[0] +mla v25.4S, v26.4S, v31.s[0] +mla v3.4S, v20.4S, v31.s[0] +sub v20.4s, v18.4s, v24.4s +sub v26.4s, v30.4s, v21.4s +sub v8.4s, v7.4s, v25.4s +sub v22.4s, v19.4s, v3.4s +add v18.4s, v18.4s, v24.4s +add v30.4s, v30.4s, v21.4s +add v7.4s, v7.4s, v25.4s +add v19.4s, v19.4s, v3.4s +sqrdmulh v3.4S, v16.4S, v13.s[0] +sqrdmulh v25.4S, v29.4S, v13.s[1] +sqrdmulh v21.4S, v17.4S, v13.s[2] +sqrdmulh v24.4S, v27.4S, v13.s[3] +mul v16.4S, v16.4S,v14.s[0] +mul v29.4S, 
v29.4S,v14.s[1] +mul v17.4S, v17.4S,v14.s[2] +mul v27.4S, v27.4S,v14.s[3] +mla v16.4S, v3.4S, v31.s[0] +mla v29.4S, v25.4S, v31.s[0] +mla v17.4S, v21.4S, v31.s[0] +mla v27.4S, v24.4S, v31.s[0] +sub v24.4s, v10.4s, v16.4s +sub v21.4s, v28.4s, v29.4s +sub v25.4s, v9.4s, v17.4s +sub v3.4s, v23.4s, v27.4s +add v10.4s, v10.4s, v16.4s +add v28.4s, v28.4s, v29.4s +add v9.4s, v9.4s, v17.4s +add v23.4s, v23.4s, v27.4s +sqrdmulh v27.4S, v30.4S, v11.s[0] +sqrdmulh v17.4S, v26.4S, v11.s[1] +sqrdmulh v29.4S, v19.4S, v11.s[2] +sqrdmulh v16.4S, v22.4S, v11.s[3] +mul v30.4S, v30.4S,v12.s[0] +mul v26.4S, v26.4S,v12.s[1] +mul v19.4S, v19.4S,v12.s[2] +mul v22.4S, v22.4S,v12.s[3] +mla v30.4S, v27.4S, v31.s[0] +mla v26.4S, v17.4S, v31.s[0] +mla v19.4S, v29.4S, v31.s[0] +mla v22.4S, v16.4S, v31.s[0] +sub v16.4s, v18.4s, v30.4s +sub v29.4s, v20.4s, v26.4s +sub v17.4s, v7.4s, v19.4s +sub v27.4s, v8.4s, v22.4s +add v18.4s, v18.4s, v30.4s +add v20.4s, v20.4s, v26.4s +add v7.4s, v7.4s, v19.4s +add v8.4s, v8.4s, v22.4s +str q10, [x0, #16] +str q24, [x0, #80] +str q28, [x0, #144] +str q21, [x0, #208] +str q9, [x0, #272] +str q25, [x0, #336] +str q23, [x0, #400] +str q3, [x0, #464] +str q18, [x0, #528] +str q16, [x0, #592] +str q20, [x0, #656] +str q29, [x0, #720] +str q7, [x0, #784] +str q17, [x0, #848] +str q8, [x0, #912] +str q27, [x0, #976] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q6, [x17, #+160] +ldr q30, [x17, #+176] +ldr q26, [x17, #+192] +ldr q19, [x17, #+208] +ldr q22, [x17, #+224] +ldr q10, [x17, #+240] +ldr q24, [x0, #32] +ldr q28, [x0, #48] +ldr q21, [x0, #0] +ldr q9, [x0, #16] +sqrdmulh v25.4S, v24.4S, v5.s[0] +mul v24.4S, v24.4S,v4.s[0] +mla v24.4S, v25.4S, v31.s[0] +sub v25.4s, v21.4s, v24.4s +add v21.4s, v21.4s, v24.4s +sqrdmulh v24.4S, v28.4S, v5.s[0] +mul v28.4S, v28.4S,v4.s[0] +mla v28.4S, v24.4S, v31.s[0] +sub v24.4s, v9.4s, v28.4s +add v9.4s, v9.4s, v28.4s +sqrdmulh v28.4S, v9.4S, v5.s[1] +mul v9.4S, v9.4S,v4.s[1] +mla v9.4S, v28.4S, v31.s[0] +sub v28.4s, v21.4s, 
v9.4s +add v21.4s, v21.4s, v9.4s +sqrdmulh v9.4S, v24.4S, v5.s[2] +mul v24.4S, v24.4S,v4.s[2] +mla v24.4S, v9.4S, v31.s[0] +sub v9.4s, v25.4s, v24.4s +add v25.4s, v25.4s, v24.4s +trn1 v24.4S, v21.4S, v28.4S +trn2 v23.4S, v21.4S, v28.4S +trn1 v3.4S, v25.4S, v9.4S +trn2 v18.4S, v25.4S, v9.4S +trn2 v25.2D, v24.2D, v3.2D +trn2 v9.2D, v23.2D, v18.2D +trn1 v21.2D, v24.2D, v3.2D +trn1 v28.2D, v23.2D, v18.2D +sqrdmulh v18.4S, v25.4S, v30.4S +mul v25.4S, v25.4S,v6.4S +mla v25.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v9.4S, v30.4S +mul v9.4S, v9.4S,v6.4S +mla v9.4S, v25.4S, v31.s[0] +sub v25.4s, v28.4s, v9.4s +add v28.4s, v28.4s, v9.4s +sqrdmulh v9.4S, v28.4S, v19.4S +mul v28.4S, v28.4S,v26.4S +mla v28.4S, v9.4S, v31.s[0] +sub v9.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +sqrdmulh v28.4S, v25.4S, v10.4S +mul v25.4S, v25.4S,v22.4S +mla v25.4S, v28.4S, v31.s[0] +sub v28.4s, v18.4s, v25.4s +add v18.4s, v18.4s, v25.4s +str q21, [x0, #0] +str q9, [x0, #16] +str q18, [x0, #32] +str q28, [x0, #48] +ldr q28, [x17, #+256] +ldr q18, [x17, #+272] +ldr q9, [x17, #+288] +ldr q21, [x17, #+304] +ldr q25, [x17, #+320] +ldr q23, [x17, #+336] +ldr q3, [x17, #+352] +ldr q24, [x17, #+368] +ldr q10, [x0, #96] +ldr q22, [x0, #112] +ldr q19, [x0, #64] +ldr q26, [x0, #80] +sqrdmulh v30.4S, v10.4S, v18.s[0] +mul v10.4S, v10.4S,v28.s[0] +mla v10.4S, v30.4S, v31.s[0] +sub v30.4s, v19.4s, v10.4s +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v22.4S, v18.s[0] +mul v22.4S, v22.4S,v28.s[0] +mla v22.4S, v10.4S, v31.s[0] +sub v10.4s, v26.4s, v22.4s +add v26.4s, v26.4s, v22.4s +sqrdmulh v22.4S, v26.4S, v18.s[1] +mul v26.4S, v26.4S,v28.s[1] +mla v26.4S, v22.4S, v31.s[0] +sub v22.4s, v19.4s, v26.4s +add v19.4s, v19.4s, v26.4s +sqrdmulh v26.4S, v10.4S, v18.s[2] +mul v10.4S, v10.4S,v28.s[2] +mla v10.4S, v26.4S, v31.s[0] +sub v26.4s, v30.4s, v10.4s +add v30.4s, v30.4s, v10.4s +trn1 v10.4S, v19.4S, v22.4S +trn2 v6.4S, v19.4S, v22.4S +trn1 v5.4S, v30.4S, 
v26.4S +trn2 v4.4S, v30.4S, v26.4S +trn2 v30.2D, v10.2D, v5.2D +trn2 v26.2D, v6.2D, v4.2D +trn1 v19.2D, v10.2D, v5.2D +trn1 v22.2D, v6.2D, v4.2D +sqrdmulh v4.4S, v30.4S, v21.4S +mul v30.4S, v30.4S,v9.4S +mla v30.4S, v4.4S, v31.s[0] +sub v4.4s, v19.4s, v30.4s +add v19.4s, v19.4s, v30.4s +sqrdmulh v30.4S, v26.4S, v21.4S +mul v26.4S, v26.4S,v9.4S +mla v26.4S, v30.4S, v31.s[0] +sub v30.4s, v22.4s, v26.4s +add v22.4s, v22.4s, v26.4s +sqrdmulh v26.4S, v22.4S, v23.4S +mul v22.4S, v22.4S,v25.4S +mla v22.4S, v26.4S, v31.s[0] +sub v26.4s, v19.4s, v22.4s +add v19.4s, v19.4s, v22.4s +sqrdmulh v22.4S, v30.4S, v24.4S +mul v30.4S, v30.4S,v3.4S +mla v30.4S, v22.4S, v31.s[0] +sub v22.4s, v4.4s, v30.4s +add v4.4s, v4.4s, v30.4s +str q19, [x0, #64] +str q26, [x0, #80] +str q4, [x0, #96] +str q22, [x0, #112] +ldr q22, [x17, #+384] +ldr q4, [x17, #+400] +ldr q26, [x17, #+416] +ldr q19, [x17, #+432] +ldr q30, [x17, #+448] +ldr q6, [x17, #+464] +ldr q5, [x17, #+480] +ldr q10, [x17, #+496] +ldr q24, [x0, #160] +ldr q3, [x0, #176] +ldr q23, [x0, #128] +ldr q25, [x0, #144] +sqrdmulh v21.4S, v24.4S, v4.s[0] +mul v24.4S, v24.4S,v22.s[0] +mla v24.4S, v21.4S, v31.s[0] +sub v21.4s, v23.4s, v24.4s +add v23.4s, v23.4s, v24.4s +sqrdmulh v24.4S, v3.4S, v4.s[0] +mul v3.4S, v3.4S,v22.s[0] +mla v3.4S, v24.4S, v31.s[0] +sub v24.4s, v25.4s, v3.4s +add v25.4s, v25.4s, v3.4s +sqrdmulh v3.4S, v25.4S, v4.s[1] +mul v25.4S, v25.4S,v22.s[1] +mla v25.4S, v3.4S, v31.s[0] +sub v3.4s, v23.4s, v25.4s +add v23.4s, v23.4s, v25.4s +sqrdmulh v25.4S, v24.4S, v4.s[2] +mul v24.4S, v24.4S,v22.s[2] +mla v24.4S, v25.4S, v31.s[0] +sub v25.4s, v21.4s, v24.4s +add v21.4s, v21.4s, v24.4s +trn1 v24.4S, v23.4S, v3.4S +trn2 v9.4S, v23.4S, v3.4S +trn1 v18.4S, v21.4S, v25.4S +trn2 v28.4S, v21.4S, v25.4S +trn2 v21.2D, v24.2D, v18.2D +trn2 v25.2D, v9.2D, v28.2D +trn1 v23.2D, v24.2D, v18.2D +trn1 v3.2D, v9.2D, v28.2D +sqrdmulh v28.4S, v21.4S, v19.4S +mul v21.4S, v21.4S,v26.4S +mla v21.4S, v28.4S, v31.s[0] +sub v28.4s, v23.4s, v21.4s +add 
v23.4s, v23.4s, v21.4s +sqrdmulh v21.4S, v25.4S, v19.4S +mul v25.4S, v25.4S,v26.4S +mla v25.4S, v21.4S, v31.s[0] +sub v21.4s, v3.4s, v25.4s +add v3.4s, v3.4s, v25.4s +sqrdmulh v25.4S, v3.4S, v6.4S +mul v3.4S, v3.4S,v30.4S +mla v3.4S, v25.4S, v31.s[0] +sub v25.4s, v23.4s, v3.4s +add v23.4s, v23.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v10.4S +mul v21.4S, v21.4S,v5.4S +mla v21.4S, v3.4S, v31.s[0] +sub v3.4s, v28.4s, v21.4s +add v28.4s, v28.4s, v21.4s +str q23, [x0, #128] +str q25, [x0, #144] +str q28, [x0, #160] +str q3, [x0, #176] +ldr q3, [x17, #+512] +ldr q28, [x17, #+528] +ldr q25, [x17, #+544] +ldr q23, [x17, #+560] +ldr q21, [x17, #+576] +ldr q9, [x17, #+592] +ldr q18, [x17, #+608] +ldr q24, [x17, #+624] +ldr q10, [x0, #224] +ldr q5, [x0, #240] +ldr q6, [x0, #192] +ldr q30, [x0, #208] +sqrdmulh v19.4S, v10.4S, v28.s[0] +mul v10.4S, v10.4S,v3.s[0] +mla v10.4S, v19.4S, v31.s[0] +sub v19.4s, v6.4s, v10.4s +add v6.4s, v6.4s, v10.4s +sqrdmulh v10.4S, v5.4S, v28.s[0] +mul v5.4S, v5.4S,v3.s[0] +mla v5.4S, v10.4S, v31.s[0] +sub v10.4s, v30.4s, v5.4s +add v30.4s, v30.4s, v5.4s +sqrdmulh v5.4S, v30.4S, v28.s[1] +mul v30.4S, v30.4S,v3.s[1] +mla v30.4S, v5.4S, v31.s[0] +sub v5.4s, v6.4s, v30.4s +add v6.4s, v6.4s, v30.4s +sqrdmulh v30.4S, v10.4S, v28.s[2] +mul v10.4S, v10.4S,v3.s[2] +mla v10.4S, v30.4S, v31.s[0] +sub v30.4s, v19.4s, v10.4s +add v19.4s, v19.4s, v10.4s +trn1 v10.4S, v6.4S, v5.4S +trn2 v26.4S, v6.4S, v5.4S +trn1 v4.4S, v19.4S, v30.4S +trn2 v22.4S, v19.4S, v30.4S +trn2 v19.2D, v10.2D, v4.2D +trn2 v30.2D, v26.2D, v22.2D +trn1 v6.2D, v10.2D, v4.2D +trn1 v5.2D, v26.2D, v22.2D +sqrdmulh v22.4S, v19.4S, v23.4S +mul v19.4S, v19.4S,v25.4S +mla v19.4S, v22.4S, v31.s[0] +sub v22.4s, v6.4s, v19.4s +add v6.4s, v6.4s, v19.4s +sqrdmulh v19.4S, v30.4S, v23.4S +mul v30.4S, v30.4S,v25.4S +mla v30.4S, v19.4S, v31.s[0] +sub v19.4s, v5.4s, v30.4s +add v5.4s, v5.4s, v30.4s +sqrdmulh v30.4S, v5.4S, v9.4S +mul v5.4S, v5.4S,v21.4S +mla v5.4S, v30.4S, v31.s[0] +sub v30.4s, v6.4s, v5.4s +add 
v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v19.4S, v24.4S +mul v19.4S, v19.4S,v18.4S +mla v19.4S, v5.4S, v31.s[0] +sub v5.4s, v22.4s, v19.4s +add v22.4s, v22.4s, v19.4s +str q6, [x0, #192] +str q30, [x0, #208] +str q22, [x0, #224] +str q5, [x0, #240] +ldr q5, [x17, #+640] +ldr q22, [x17, #+656] +ldr q30, [x17, #+672] +ldr q6, [x17, #+688] +ldr q19, [x17, #+704] +ldr q26, [x17, #+720] +ldr q4, [x17, #+736] +ldr q10, [x17, #+752] +ldr q24, [x0, #288] +ldr q18, [x0, #304] +ldr q9, [x0, #256] +ldr q21, [x0, #272] +sqrdmulh v23.4S, v24.4S, v22.s[0] +mul v24.4S, v24.4S,v5.s[0] +mla v24.4S, v23.4S, v31.s[0] +sub v23.4s, v9.4s, v24.4s +add v9.4s, v9.4s, v24.4s +sqrdmulh v24.4S, v18.4S, v22.s[0] +mul v18.4S, v18.4S,v5.s[0] +mla v18.4S, v24.4S, v31.s[0] +sub v24.4s, v21.4s, v18.4s +add v21.4s, v21.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v22.s[1] +mul v21.4S, v21.4S,v5.s[1] +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v9.4s, v21.4s +add v9.4s, v9.4s, v21.4s +sqrdmulh v21.4S, v24.4S, v22.s[2] +mul v24.4S, v24.4S,v5.s[2] +mla v24.4S, v21.4S, v31.s[0] +sub v21.4s, v23.4s, v24.4s +add v23.4s, v23.4s, v24.4s +trn1 v24.4S, v9.4S, v18.4S +trn2 v25.4S, v9.4S, v18.4S +trn1 v28.4S, v23.4S, v21.4S +trn2 v3.4S, v23.4S, v21.4S +trn2 v23.2D, v24.2D, v28.2D +trn2 v21.2D, v25.2D, v3.2D +trn1 v9.2D, v24.2D, v28.2D +trn1 v18.2D, v25.2D, v3.2D +sqrdmulh v3.4S, v23.4S, v6.4S +mul v23.4S, v23.4S,v30.4S +mla v23.4S, v3.4S, v31.s[0] +sub v3.4s, v9.4s, v23.4s +add v9.4s, v9.4s, v23.4s +sqrdmulh v23.4S, v21.4S, v6.4S +mul v21.4S, v21.4S,v30.4S +mla v21.4S, v23.4S, v31.s[0] +sub v23.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v18.4S, v26.4S +mul v18.4S, v18.4S,v19.4S +mla v18.4S, v21.4S, v31.s[0] +sub v21.4s, v9.4s, v18.4s +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v23.4S, v10.4S +mul v23.4S, v23.4S,v4.4S +mla v23.4S, v18.4S, v31.s[0] +sub v18.4s, v3.4s, v23.4s +add v3.4s, v3.4s, v23.4s +str q9, [x0, #256] +str q21, [x0, #272] +str q3, [x0, #288] +str q18, [x0, #304] +ldr q18, [x17, #+768] 
+ldr q3, [x17, #+784] +ldr q21, [x17, #+800] +ldr q9, [x17, #+816] +ldr q23, [x17, #+832] +ldr q25, [x17, #+848] +ldr q28, [x17, #+864] +ldr q24, [x17, #+880] +ldr q10, [x0, #352] +ldr q4, [x0, #368] +ldr q26, [x0, #320] +ldr q19, [x0, #336] +sqrdmulh v6.4S, v10.4S, v3.s[0] +mul v10.4S, v10.4S,v18.s[0] +mla v10.4S, v6.4S, v31.s[0] +sub v6.4s, v26.4s, v10.4s +add v26.4s, v26.4s, v10.4s +sqrdmulh v10.4S, v4.4S, v3.s[0] +mul v4.4S, v4.4S,v18.s[0] +mla v4.4S, v10.4S, v31.s[0] +sub v10.4s, v19.4s, v4.4s +add v19.4s, v19.4s, v4.4s +sqrdmulh v4.4S, v19.4S, v3.s[1] +mul v19.4S, v19.4S,v18.s[1] +mla v19.4S, v4.4S, v31.s[0] +sub v4.4s, v26.4s, v19.4s +add v26.4s, v26.4s, v19.4s +sqrdmulh v19.4S, v10.4S, v3.s[2] +mul v10.4S, v10.4S,v18.s[2] +mla v10.4S, v19.4S, v31.s[0] +sub v19.4s, v6.4s, v10.4s +add v6.4s, v6.4s, v10.4s +trn1 v10.4S, v26.4S, v4.4S +trn2 v30.4S, v26.4S, v4.4S +trn1 v22.4S, v6.4S, v19.4S +trn2 v5.4S, v6.4S, v19.4S +trn2 v6.2D, v10.2D, v22.2D +trn2 v19.2D, v30.2D, v5.2D +trn1 v26.2D, v10.2D, v22.2D +trn1 v4.2D, v30.2D, v5.2D +sqrdmulh v5.4S, v6.4S, v9.4S +mul v6.4S, v6.4S,v21.4S +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v26.4s, v6.4s +add v26.4s, v26.4s, v6.4s +sqrdmulh v6.4S, v19.4S, v9.4S +mul v19.4S, v19.4S,v21.4S +mla v19.4S, v6.4S, v31.s[0] +sub v6.4s, v4.4s, v19.4s +add v4.4s, v4.4s, v19.4s +sqrdmulh v19.4S, v4.4S, v25.4S +mul v4.4S, v4.4S,v23.4S +mla v4.4S, v19.4S, v31.s[0] +sub v19.4s, v26.4s, v4.4s +add v26.4s, v26.4s, v4.4s +sqrdmulh v4.4S, v6.4S, v24.4S +mul v6.4S, v6.4S,v28.4S +mla v6.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v6.4s +add v5.4s, v5.4s, v6.4s +str q26, [x0, #320] +str q19, [x0, #336] +str q5, [x0, #352] +str q4, [x0, #368] +ldr q4, [x17, #+896] +ldr q5, [x17, #+912] +ldr q19, [x17, #+928] +ldr q26, [x17, #+944] +ldr q6, [x17, #+960] +ldr q30, [x17, #+976] +ldr q22, [x17, #+992] +ldr q10, [x17, #+1008] +ldr q24, [x0, #416] +ldr q28, [x0, #432] +ldr q25, [x0, #384] +ldr q23, [x0, #400] +sqrdmulh v9.4S, v24.4S, v5.s[0] +mul v24.4S, 
v24.4S,v4.s[0] +mla v24.4S, v9.4S, v31.s[0] +sub v9.4s, v25.4s, v24.4s +add v25.4s, v25.4s, v24.4s +sqrdmulh v24.4S, v28.4S, v5.s[0] +mul v28.4S, v28.4S,v4.s[0] +mla v28.4S, v24.4S, v31.s[0] +sub v24.4s, v23.4s, v28.4s +add v23.4s, v23.4s, v28.4s +sqrdmulh v28.4S, v23.4S, v5.s[1] +mul v23.4S, v23.4S,v4.s[1] +mla v23.4S, v28.4S, v31.s[0] +sub v28.4s, v25.4s, v23.4s +add v25.4s, v25.4s, v23.4s +sqrdmulh v23.4S, v24.4S, v5.s[2] +mul v24.4S, v24.4S,v4.s[2] +mla v24.4S, v23.4S, v31.s[0] +sub v23.4s, v9.4s, v24.4s +add v9.4s, v9.4s, v24.4s +trn1 v24.4S, v25.4S, v28.4S +trn2 v21.4S, v25.4S, v28.4S +trn1 v3.4S, v9.4S, v23.4S +trn2 v18.4S, v9.4S, v23.4S +trn2 v9.2D, v24.2D, v3.2D +trn2 v23.2D, v21.2D, v18.2D +trn1 v25.2D, v24.2D, v3.2D +trn1 v28.2D, v21.2D, v18.2D +sqrdmulh v18.4S, v9.4S, v26.4S +mul v9.4S, v9.4S,v19.4S +mla v9.4S, v18.4S, v31.s[0] +sub v18.4s, v25.4s, v9.4s +add v25.4s, v25.4s, v9.4s +sqrdmulh v9.4S, v23.4S, v26.4S +mul v23.4S, v23.4S,v19.4S +mla v23.4S, v9.4S, v31.s[0] +sub v9.4s, v28.4s, v23.4s +add v28.4s, v28.4s, v23.4s +sqrdmulh v23.4S, v28.4S, v30.4S +mul v28.4S, v28.4S,v6.4S +mla v28.4S, v23.4S, v31.s[0] +sub v23.4s, v25.4s, v28.4s +add v25.4s, v25.4s, v28.4s +sqrdmulh v28.4S, v9.4S, v10.4S +mul v9.4S, v9.4S,v22.4S +mla v9.4S, v28.4S, v31.s[0] +sub v28.4s, v18.4s, v9.4s +add v18.4s, v18.4s, v9.4s +str q25, [x0, #384] +str q23, [x0, #400] +str q18, [x0, #416] +str q28, [x0, #432] +ldr q28, [x17, #+1024] +ldr q18, [x17, #+1040] +ldr q23, [x17, #+1056] +ldr q25, [x17, #+1072] +ldr q9, [x17, #+1088] +ldr q21, [x17, #+1104] +ldr q3, [x17, #+1120] +ldr q24, [x17, #+1136] +ldr q10, [x0, #480] +ldr q22, [x0, #496] +ldr q30, [x0, #448] +ldr q6, [x0, #464] +sqrdmulh v26.4S, v10.4S, v18.s[0] +mul v10.4S, v10.4S,v28.s[0] +mla v10.4S, v26.4S, v31.s[0] +sub v26.4s, v30.4s, v10.4s +add v30.4s, v30.4s, v10.4s +sqrdmulh v10.4S, v22.4S, v18.s[0] +mul v22.4S, v22.4S,v28.s[0] +mla v22.4S, v10.4S, v31.s[0] +sub v10.4s, v6.4s, v22.4s +add v6.4s, v6.4s, v22.4s +sqrdmulh 
v22.4S, v6.4S, v18.s[1] +mul v6.4S, v6.4S,v28.s[1] +mla v6.4S, v22.4S, v31.s[0] +sub v22.4s, v30.4s, v6.4s +add v30.4s, v30.4s, v6.4s +sqrdmulh v6.4S, v10.4S, v18.s[2] +mul v10.4S, v10.4S,v28.s[2] +mla v10.4S, v6.4S, v31.s[0] +sub v6.4s, v26.4s, v10.4s +add v26.4s, v26.4s, v10.4s +trn1 v10.4S, v30.4S, v22.4S +trn2 v19.4S, v30.4S, v22.4S +trn1 v5.4S, v26.4S, v6.4S +trn2 v4.4S, v26.4S, v6.4S +trn2 v26.2D, v10.2D, v5.2D +trn2 v6.2D, v19.2D, v4.2D +trn1 v30.2D, v10.2D, v5.2D +trn1 v22.2D, v19.2D, v4.2D +sqrdmulh v4.4S, v26.4S, v25.4S +mul v26.4S, v26.4S,v23.4S +mla v26.4S, v4.4S, v31.s[0] +sub v4.4s, v30.4s, v26.4s +add v30.4s, v30.4s, v26.4s +sqrdmulh v26.4S, v6.4S, v25.4S +mul v6.4S, v6.4S,v23.4S +mla v6.4S, v26.4S, v31.s[0] +sub v26.4s, v22.4s, v6.4s +add v22.4s, v22.4s, v6.4s +sqrdmulh v6.4S, v22.4S, v21.4S +mul v22.4S, v22.4S,v9.4S +mla v22.4S, v6.4S, v31.s[0] +sub v6.4s, v30.4s, v22.4s +add v30.4s, v30.4s, v22.4s +sqrdmulh v22.4S, v26.4S, v24.4S +mul v26.4S, v26.4S,v3.4S +mla v26.4S, v22.4S, v31.s[0] +sub v22.4s, v4.4s, v26.4s +add v4.4s, v4.4s, v26.4s +str q30, [x0, #448] +str q6, [x0, #464] +str q4, [x0, #480] +str q22, [x0, #496] +ldr q22, [x17, #+1152] +ldr q4, [x17, #+1168] +ldr q6, [x17, #+1184] +ldr q30, [x17, #+1200] +ldr q26, [x17, #+1216] +ldr q19, [x17, #+1232] +ldr q5, [x17, #+1248] +ldr q10, [x17, #+1264] +ldr q24, [x0, #544] +ldr q3, [x0, #560] +ldr q21, [x0, #512] +ldr q9, [x0, #528] +sqrdmulh v25.4S, v24.4S, v4.s[0] +mul v24.4S, v24.4S,v22.s[0] +mla v24.4S, v25.4S, v31.s[0] +sub v25.4s, v21.4s, v24.4s +add v21.4s, v21.4s, v24.4s +sqrdmulh v24.4S, v3.4S, v4.s[0] +mul v3.4S, v3.4S,v22.s[0] +mla v3.4S, v24.4S, v31.s[0] +sub v24.4s, v9.4s, v3.4s +add v9.4s, v9.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v4.s[1] +mul v9.4S, v9.4S,v22.s[1] +mla v9.4S, v3.4S, v31.s[0] +sub v3.4s, v21.4s, v9.4s +add v21.4s, v21.4s, v9.4s +sqrdmulh v9.4S, v24.4S, v4.s[2] +mul v24.4S, v24.4S,v22.s[2] +mla v24.4S, v9.4S, v31.s[0] +sub v9.4s, v25.4s, v24.4s +add v25.4s, v25.4s, v24.4s 
+trn1 v24.4S, v21.4S, v3.4S +trn2 v23.4S, v21.4S, v3.4S +trn1 v18.4S, v25.4S, v9.4S +trn2 v28.4S, v25.4S, v9.4S +trn2 v25.2D, v24.2D, v18.2D +trn2 v9.2D, v23.2D, v28.2D +trn1 v21.2D, v24.2D, v18.2D +trn1 v3.2D, v23.2D, v28.2D +sqrdmulh v28.4S, v25.4S, v30.4S +mul v25.4S, v25.4S,v6.4S +mla v25.4S, v28.4S, v31.s[0] +sub v28.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v9.4S, v30.4S +mul v9.4S, v9.4S,v6.4S +mla v9.4S, v25.4S, v31.s[0] +sub v25.4s, v3.4s, v9.4s +add v3.4s, v3.4s, v9.4s +sqrdmulh v9.4S, v3.4S, v19.4S +mul v3.4S, v3.4S,v26.4S +mla v3.4S, v9.4S, v31.s[0] +sub v9.4s, v21.4s, v3.4s +add v21.4s, v21.4s, v3.4s +sqrdmulh v3.4S, v25.4S, v10.4S +mul v25.4S, v25.4S,v5.4S +mla v25.4S, v3.4S, v31.s[0] +sub v3.4s, v28.4s, v25.4s +add v28.4s, v28.4s, v25.4s +str q21, [x0, #512] +str q9, [x0, #528] +str q28, [x0, #544] +str q3, [x0, #560] +ldr q3, [x17, #+1280] +ldr q28, [x17, #+1296] +ldr q9, [x17, #+1312] +ldr q21, [x17, #+1328] +ldr q25, [x17, #+1344] +ldr q23, [x17, #+1360] +ldr q18, [x17, #+1376] +ldr q24, [x17, #+1392] +ldr q10, [x0, #608] +ldr q5, [x0, #624] +ldr q19, [x0, #576] +ldr q26, [x0, #592] +sqrdmulh v30.4S, v10.4S, v28.s[0] +mul v10.4S, v10.4S,v3.s[0] +mla v10.4S, v30.4S, v31.s[0] +sub v30.4s, v19.4s, v10.4s +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v5.4S, v28.s[0] +mul v5.4S, v5.4S,v3.s[0] +mla v5.4S, v10.4S, v31.s[0] +sub v10.4s, v26.4s, v5.4s +add v26.4s, v26.4s, v5.4s +sqrdmulh v5.4S, v26.4S, v28.s[1] +mul v26.4S, v26.4S,v3.s[1] +mla v26.4S, v5.4S, v31.s[0] +sub v5.4s, v19.4s, v26.4s +add v19.4s, v19.4s, v26.4s +sqrdmulh v26.4S, v10.4S, v28.s[2] +mul v10.4S, v10.4S,v3.s[2] +mla v10.4S, v26.4S, v31.s[0] +sub v26.4s, v30.4s, v10.4s +add v30.4s, v30.4s, v10.4s +trn1 v10.4S, v19.4S, v5.4S +trn2 v6.4S, v19.4S, v5.4S +trn1 v4.4S, v30.4S, v26.4S +trn2 v22.4S, v30.4S, v26.4S +trn2 v30.2D, v10.2D, v4.2D +trn2 v26.2D, v6.2D, v22.2D +trn1 v19.2D, v10.2D, v4.2D +trn1 v5.2D, v6.2D, v22.2D +sqrdmulh v22.4S, v30.4S, v21.4S +mul v30.4S, 
v30.4S,v9.4S +mla v30.4S, v22.4S, v31.s[0] +sub v22.4s, v19.4s, v30.4s +add v19.4s, v19.4s, v30.4s +sqrdmulh v30.4S, v26.4S, v21.4S +mul v26.4S, v26.4S,v9.4S +mla v26.4S, v30.4S, v31.s[0] +sub v30.4s, v5.4s, v26.4s +add v5.4s, v5.4s, v26.4s +sqrdmulh v26.4S, v5.4S, v23.4S +mul v5.4S, v5.4S,v25.4S +mla v5.4S, v26.4S, v31.s[0] +sub v26.4s, v19.4s, v5.4s +add v19.4s, v19.4s, v5.4s +sqrdmulh v5.4S, v30.4S, v24.4S +mul v30.4S, v30.4S,v18.4S +mla v30.4S, v5.4S, v31.s[0] +sub v5.4s, v22.4s, v30.4s +add v22.4s, v22.4s, v30.4s +str q19, [x0, #576] +str q26, [x0, #592] +str q22, [x0, #608] +str q5, [x0, #624] +ldr q5, [x17, #+1408] +ldr q22, [x17, #+1424] +ldr q26, [x17, #+1440] +ldr q19, [x17, #+1456] +ldr q30, [x17, #+1472] +ldr q6, [x17, #+1488] +ldr q4, [x17, #+1504] +ldr q10, [x17, #+1520] +ldr q24, [x0, #672] +ldr q18, [x0, #688] +ldr q23, [x0, #640] +ldr q25, [x0, #656] +sqrdmulh v21.4S, v24.4S, v22.s[0] +mul v24.4S, v24.4S,v5.s[0] +mla v24.4S, v21.4S, v31.s[0] +sub v21.4s, v23.4s, v24.4s +add v23.4s, v23.4s, v24.4s +sqrdmulh v24.4S, v18.4S, v22.s[0] +mul v18.4S, v18.4S,v5.s[0] +mla v18.4S, v24.4S, v31.s[0] +sub v24.4s, v25.4s, v18.4s +add v25.4s, v25.4s, v18.4s +sqrdmulh v18.4S, v25.4S, v22.s[1] +mul v25.4S, v25.4S,v5.s[1] +mla v25.4S, v18.4S, v31.s[0] +sub v18.4s, v23.4s, v25.4s +add v23.4s, v23.4s, v25.4s +sqrdmulh v25.4S, v24.4S, v22.s[2] +mul v24.4S, v24.4S,v5.s[2] +mla v24.4S, v25.4S, v31.s[0] +sub v25.4s, v21.4s, v24.4s +add v21.4s, v21.4s, v24.4s +trn1 v24.4S, v23.4S, v18.4S +trn2 v9.4S, v23.4S, v18.4S +trn1 v28.4S, v21.4S, v25.4S +trn2 v3.4S, v21.4S, v25.4S +trn2 v21.2D, v24.2D, v28.2D +trn2 v25.2D, v9.2D, v3.2D +trn1 v23.2D, v24.2D, v28.2D +trn1 v18.2D, v9.2D, v3.2D +sqrdmulh v3.4S, v21.4S, v19.4S +mul v21.4S, v21.4S,v26.4S +mla v21.4S, v3.4S, v31.s[0] +sub v3.4s, v23.4s, v21.4s +add v23.4s, v23.4s, v21.4s +sqrdmulh v21.4S, v25.4S, v19.4S +mul v25.4S, v25.4S,v26.4S +mla v25.4S, v21.4S, v31.s[0] +sub v21.4s, v18.4s, v25.4s +add v18.4s, v18.4s, v25.4s 
+sqrdmulh v25.4S, v18.4S, v6.4S +mul v18.4S, v18.4S,v30.4S +mla v18.4S, v25.4S, v31.s[0] +sub v25.4s, v23.4s, v18.4s +add v23.4s, v23.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v10.4S +mul v21.4S, v21.4S,v4.4S +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v3.4s, v21.4s +add v3.4s, v3.4s, v21.4s +str q23, [x0, #640] +str q25, [x0, #656] +str q3, [x0, #672] +str q18, [x0, #688] +ldr q18, [x17, #+1536] +ldr q3, [x17, #+1552] +ldr q25, [x17, #+1568] +ldr q23, [x17, #+1584] +ldr q21, [x17, #+1600] +ldr q9, [x17, #+1616] +ldr q28, [x17, #+1632] +ldr q24, [x17, #+1648] +ldr q10, [x0, #736] +ldr q4, [x0, #752] +ldr q6, [x0, #704] +ldr q30, [x0, #720] +sqrdmulh v19.4S, v10.4S, v3.s[0] +mul v10.4S, v10.4S,v18.s[0] +mla v10.4S, v19.4S, v31.s[0] +sub v19.4s, v6.4s, v10.4s +add v6.4s, v6.4s, v10.4s +sqrdmulh v10.4S, v4.4S, v3.s[0] +mul v4.4S, v4.4S,v18.s[0] +mla v4.4S, v10.4S, v31.s[0] +sub v10.4s, v30.4s, v4.4s +add v30.4s, v30.4s, v4.4s +sqrdmulh v4.4S, v30.4S, v3.s[1] +mul v30.4S, v30.4S,v18.s[1] +mla v30.4S, v4.4S, v31.s[0] +sub v4.4s, v6.4s, v30.4s +add v6.4s, v6.4s, v30.4s +sqrdmulh v30.4S, v10.4S, v3.s[2] +mul v10.4S, v10.4S,v18.s[2] +mla v10.4S, v30.4S, v31.s[0] +sub v30.4s, v19.4s, v10.4s +add v19.4s, v19.4s, v10.4s +trn1 v10.4S, v6.4S, v4.4S +trn2 v26.4S, v6.4S, v4.4S +trn1 v22.4S, v19.4S, v30.4S +trn2 v5.4S, v19.4S, v30.4S +trn2 v19.2D, v10.2D, v22.2D +trn2 v30.2D, v26.2D, v5.2D +trn1 v6.2D, v10.2D, v22.2D +trn1 v4.2D, v26.2D, v5.2D +sqrdmulh v5.4S, v19.4S, v23.4S +mul v19.4S, v19.4S,v25.4S +mla v19.4S, v5.4S, v31.s[0] +sub v5.4s, v6.4s, v19.4s +add v6.4s, v6.4s, v19.4s +sqrdmulh v19.4S, v30.4S, v23.4S +mul v30.4S, v30.4S,v25.4S +mla v30.4S, v19.4S, v31.s[0] +sub v19.4s, v4.4s, v30.4s +add v4.4s, v4.4s, v30.4s +sqrdmulh v30.4S, v4.4S, v9.4S +mul v4.4S, v4.4S,v21.4S +mla v4.4S, v30.4S, v31.s[0] +sub v30.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +sqrdmulh v4.4S, v19.4S, v24.4S +mul v19.4S, v19.4S,v28.4S +mla v19.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v19.4s +add v5.4s, v5.4s, 
v19.4s +str q6, [x0, #704] +str q30, [x0, #720] +str q5, [x0, #736] +str q4, [x0, #752] +ldr q4, [x17, #+1664] +ldr q5, [x17, #+1680] +ldr q30, [x17, #+1696] +ldr q6, [x17, #+1712] +ldr q19, [x17, #+1728] +ldr q26, [x17, #+1744] +ldr q22, [x17, #+1760] +ldr q10, [x17, #+1776] +ldr q24, [x0, #800] +ldr q28, [x0, #816] +ldr q9, [x0, #768] +ldr q21, [x0, #784] +sqrdmulh v23.4S, v24.4S, v5.s[0] +mul v24.4S, v24.4S,v4.s[0] +mla v24.4S, v23.4S, v31.s[0] +sub v23.4s, v9.4s, v24.4s +add v9.4s, v9.4s, v24.4s +sqrdmulh v24.4S, v28.4S, v5.s[0] +mul v28.4S, v28.4S,v4.s[0] +mla v28.4S, v24.4S, v31.s[0] +sub v24.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +sqrdmulh v28.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v28.4S, v31.s[0] +sub v28.4s, v9.4s, v21.4s +add v9.4s, v9.4s, v21.4s +sqrdmulh v21.4S, v24.4S, v5.s[2] +mul v24.4S, v24.4S,v4.s[2] +mla v24.4S, v21.4S, v31.s[0] +sub v21.4s, v23.4s, v24.4s +add v23.4s, v23.4s, v24.4s +trn1 v24.4S, v9.4S, v28.4S +trn2 v25.4S, v9.4S, v28.4S +trn1 v3.4S, v23.4S, v21.4S +trn2 v18.4S, v23.4S, v21.4S +trn2 v23.2D, v24.2D, v3.2D +trn2 v21.2D, v25.2D, v18.2D +trn1 v9.2D, v24.2D, v3.2D +trn1 v28.2D, v25.2D, v18.2D +sqrdmulh v18.4S, v23.4S, v6.4S +mul v23.4S, v23.4S,v30.4S +mla v23.4S, v18.4S, v31.s[0] +sub v18.4s, v9.4s, v23.4s +add v9.4s, v9.4s, v23.4s +sqrdmulh v23.4S, v21.4S, v6.4S +mul v21.4S, v21.4S,v30.4S +mla v21.4S, v23.4S, v31.s[0] +sub v23.4s, v28.4s, v21.4s +add v28.4s, v28.4s, v21.4s +sqrdmulh v21.4S, v28.4S, v26.4S +mul v28.4S, v28.4S,v19.4S +mla v28.4S, v21.4S, v31.s[0] +sub v21.4s, v9.4s, v28.4s +add v9.4s, v9.4s, v28.4s +sqrdmulh v28.4S, v23.4S, v10.4S +mul v23.4S, v23.4S,v22.4S +mla v23.4S, v28.4S, v31.s[0] +sub v28.4s, v18.4s, v23.4s +add v18.4s, v18.4s, v23.4s +str q9, [x0, #768] +str q21, [x0, #784] +str q18, [x0, #800] +str q28, [x0, #816] +ldr q28, [x17, #+1792] +ldr q18, [x17, #+1808] +ldr q21, [x17, #+1824] +ldr q9, [x17, #+1840] +ldr q23, [x17, #+1856] +ldr q25, [x17, #+1872] +ldr q3, [x17, #+1888] 
+ldr q24, [x17, #+1904] +ldr q10, [x0, #864] +ldr q22, [x0, #880] +ldr q26, [x0, #832] +ldr q19, [x0, #848] +sqrdmulh v6.4S, v10.4S, v18.s[0] +mul v10.4S, v10.4S,v28.s[0] +mla v10.4S, v6.4S, v31.s[0] +sub v6.4s, v26.4s, v10.4s +add v26.4s, v26.4s, v10.4s +sqrdmulh v10.4S, v22.4S, v18.s[0] +mul v22.4S, v22.4S,v28.s[0] +mla v22.4S, v10.4S, v31.s[0] +sub v10.4s, v19.4s, v22.4s +add v19.4s, v19.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v18.s[1] +mul v19.4S, v19.4S,v28.s[1] +mla v19.4S, v22.4S, v31.s[0] +sub v22.4s, v26.4s, v19.4s +add v26.4s, v26.4s, v19.4s +sqrdmulh v19.4S, v10.4S, v18.s[2] +mul v10.4S, v10.4S,v28.s[2] +mla v10.4S, v19.4S, v31.s[0] +sub v19.4s, v6.4s, v10.4s +add v6.4s, v6.4s, v10.4s +trn1 v10.4S, v26.4S, v22.4S +trn2 v30.4S, v26.4S, v22.4S +trn1 v5.4S, v6.4S, v19.4S +trn2 v4.4S, v6.4S, v19.4S +trn2 v6.2D, v10.2D, v5.2D +trn2 v19.2D, v30.2D, v4.2D +trn1 v26.2D, v10.2D, v5.2D +trn1 v22.2D, v30.2D, v4.2D +sqrdmulh v4.4S, v6.4S, v9.4S +mul v6.4S, v6.4S,v21.4S +mla v6.4S, v4.4S, v31.s[0] +sub v4.4s, v26.4s, v6.4s +add v26.4s, v26.4s, v6.4s +sqrdmulh v6.4S, v19.4S, v9.4S +mul v19.4S, v19.4S,v21.4S +mla v19.4S, v6.4S, v31.s[0] +sub v6.4s, v22.4s, v19.4s +add v22.4s, v22.4s, v19.4s +sqrdmulh v19.4S, v22.4S, v25.4S +mul v22.4S, v22.4S,v23.4S +mla v22.4S, v19.4S, v31.s[0] +sub v19.4s, v26.4s, v22.4s +add v26.4s, v26.4s, v22.4s +sqrdmulh v22.4S, v6.4S, v24.4S +mul v6.4S, v6.4S,v3.4S +mla v6.4S, v22.4S, v31.s[0] +sub v22.4s, v4.4s, v6.4s +add v4.4s, v4.4s, v6.4s +str q26, [x0, #832] +str q19, [x0, #848] +str q4, [x0, #864] +str q22, [x0, #880] +ldr q22, [x17, #+1920] +ldr q4, [x17, #+1936] +ldr q19, [x17, #+1952] +ldr q26, [x17, #+1968] +ldr q6, [x17, #+1984] +ldr q30, [x17, #+2000] +ldr q5, [x17, #+2016] +ldr q10, [x17, #+2032] +ldr q24, [x0, #928] +ldr q3, [x0, #944] +ldr q25, [x0, #896] +ldr q23, [x0, #912] +sqrdmulh v9.4S, v24.4S, v4.s[0] +mul v24.4S, v24.4S,v22.s[0] +mla v24.4S, v9.4S, v31.s[0] +sub v9.4s, v25.4s, v24.4s +add v25.4s, v25.4s, v24.4s +sqrdmulh 
v24.4S, v3.4S, v4.s[0] +mul v3.4S, v3.4S,v22.s[0] +mla v3.4S, v24.4S, v31.s[0] +sub v24.4s, v23.4s, v3.4s +add v23.4s, v23.4s, v3.4s +sqrdmulh v3.4S, v23.4S, v4.s[1] +mul v23.4S, v23.4S,v22.s[1] +mla v23.4S, v3.4S, v31.s[0] +sub v3.4s, v25.4s, v23.4s +add v25.4s, v25.4s, v23.4s +sqrdmulh v23.4S, v24.4S, v4.s[2] +mul v24.4S, v24.4S,v22.s[2] +mla v24.4S, v23.4S, v31.s[0] +sub v23.4s, v9.4s, v24.4s +add v9.4s, v9.4s, v24.4s +trn1 v24.4S, v25.4S, v3.4S +trn2 v21.4S, v25.4S, v3.4S +trn1 v18.4S, v9.4S, v23.4S +trn2 v28.4S, v9.4S, v23.4S +trn2 v9.2D, v24.2D, v18.2D +trn2 v23.2D, v21.2D, v28.2D +trn1 v25.2D, v24.2D, v18.2D +trn1 v3.2D, v21.2D, v28.2D +sqrdmulh v28.4S, v9.4S, v26.4S +mul v9.4S, v9.4S,v19.4S +mla v9.4S, v28.4S, v31.s[0] +sub v28.4s, v25.4s, v9.4s +add v25.4s, v25.4s, v9.4s +sqrdmulh v9.4S, v23.4S, v26.4S +mul v23.4S, v23.4S,v19.4S +mla v23.4S, v9.4S, v31.s[0] +sub v9.4s, v3.4s, v23.4s +add v3.4s, v3.4s, v23.4s +sqrdmulh v23.4S, v3.4S, v30.4S +mul v3.4S, v3.4S,v6.4S +mla v3.4S, v23.4S, v31.s[0] +sub v23.4s, v25.4s, v3.4s +add v25.4s, v25.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v10.4S +mul v9.4S, v9.4S,v5.4S +mla v9.4S, v3.4S, v31.s[0] +sub v3.4s, v28.4s, v9.4s +add v28.4s, v28.4s, v9.4s +str q25, [x0, #896] +str q23, [x0, #912] +str q28, [x0, #928] +str q3, [x0, #944] +ldr q3, [x17, #+2048] +ldr q28, [x17, #+2064] +ldr q23, [x17, #+2080] +ldr q25, [x17, #+2096] +ldr q9, [x17, #+2112] +ldr q21, [x17, #+2128] +ldr q18, [x17, #+2144] +ldr q24, [x17, #+2160] +ldr q10, [x0, #992] +ldr q5, [x0, #1008] +ldr q30, [x0, #960] +ldr q6, [x0, #976] +sqrdmulh v26.4S, v10.4S, v28.s[0] +mul v10.4S, v10.4S,v3.s[0] +mla v10.4S, v26.4S, v31.s[0] +sub v26.4s, v30.4s, v10.4s +add v30.4s, v30.4s, v10.4s +sqrdmulh v10.4S, v5.4S, v28.s[0] +mul v5.4S, v5.4S,v3.s[0] +mla v5.4S, v10.4S, v31.s[0] +sub v10.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v6.4S, v28.s[1] +mul v6.4S, v6.4S,v3.s[1] +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v30.4s, v6.4s +add v30.4s, v30.4s, v6.4s 
+sqrdmulh v6.4S, v10.4S, v28.s[2] +mul v10.4S, v10.4S,v3.s[2] +mla v10.4S, v6.4S, v31.s[0] +sub v6.4s, v26.4s, v10.4s +add v26.4s, v26.4s, v10.4s +trn1 v10.4S, v30.4S, v5.4S +trn2 v19.4S, v30.4S, v5.4S +trn1 v4.4S, v26.4S, v6.4S +trn2 v22.4S, v26.4S, v6.4S +trn2 v26.2D, v10.2D, v4.2D +trn2 v6.2D, v19.2D, v22.2D +trn1 v30.2D, v10.2D, v4.2D +trn1 v5.2D, v19.2D, v22.2D +sqrdmulh v22.4S, v26.4S, v25.4S +mul v26.4S, v26.4S,v23.4S +mla v26.4S, v22.4S, v31.s[0] +sub v22.4s, v30.4s, v26.4s +add v30.4s, v30.4s, v26.4s +sqrdmulh v26.4S, v6.4S, v25.4S +mul v6.4S, v6.4S,v23.4S +mla v6.4S, v26.4S, v31.s[0] +sub v26.4s, v5.4s, v6.4s +add v5.4s, v5.4s, v6.4s +sqrdmulh v6.4S, v5.4S, v21.4S +mul v5.4S, v5.4S,v9.4S +mla v5.4S, v6.4S, v31.s[0] +sub v6.4s, v30.4s, v5.4s +add v30.4s, v30.4s, v5.4s +sqrdmulh v5.4S, v26.4S, v24.4S +mul v26.4S, v26.4S,v18.4S +mla v26.4S, v5.4S, v31.s[0] +sub v5.4s, v22.4s, v26.4s +add v22.4s, v22.4s, v26.4s +str q30, [x0, #960] +str q6, [x0, #976] +str q22, [x0, #992] +str q5, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_5_0.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_5_0.s new file mode 100644 index 0000000..c47ecf1 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_5_0.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal 
+/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 
1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 
6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 
+.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, 
block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 
31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 
26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 
+.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // 
Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 
28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 
608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_5_0 +.global _ntt_u32_full_neon_asm_var_4_4_5_0 +ntt_u32_full_neon_asm_var_4_4_5_0: +_ntt_u32_full_neon_asm_var_4_4_5_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x0, #800] +ldr q29, [x0, #864] +ldr q28, [x0, #928] +ldr q27, [x0, #992] +ldr q26, [x0, #288] +ldr q25, [x0, #352] +ldr q24, [x0, #416] +ldr q23, [x0, #480] +ldr q22, [x0, #544] +ldr q21, [x0, #608] +ldr q20, [x0, #672] +ldr q19, [x0, #736] +ldr q18, [x0, #32] +ldr q17, [x0, #96] +ldr q16, [x0, #160] +ldr q3, [x0, #224] +ldr q2, [x17, #+0] +ldr q1, [x17, #+16] +ldr q0, [x17, #+32] +ldr q15, [x17, #+48] +ldr q14, [x17, #+64] +ldr q13, [x17, #+80] +ldr q12, [x17, #+96] +ldr q11, [x17, #+112] +sqrdmulh v10.4S, v30.4S, v1.s[0] +mul v30.4S, v30.4S,v2.s[0] +sqrdmulh v9.4S, v29.4S, v1.s[0] +mul v29.4S, v29.4S,v2.s[0] +sqrdmulh v8.4S, v28.4S, v1.s[0] +mul v28.4S, v28.4S,v2.s[0] +sqrdmulh v7.4S, v27.4S, v1.s[0] +mul v27.4S, v27.4S,v2.s[0] +mla v30.4S, v10.4S, v31.s[0] +mla v29.4S, v9.4S, v31.s[0] +mla v28.4S, v8.4S, v31.s[0] +mla v27.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v22.4S, v1.s[0] +mul v22.4S, v22.4S,v2.s[0] +sqrdmulh v8.4S, v21.4S, v1.s[0] 
+mul v21.4S, v21.4S,v2.s[0] +sqrdmulh v9.4S, v20.4S, v1.s[0] +mul v20.4S, v20.4S,v2.s[0] +sqrdmulh v10.4S, v19.4S, v1.s[0] +mul v19.4S, v19.4S,v2.s[0] +mla v22.4S, v7.4S, v31.s[0] +mla v21.4S, v8.4S, v31.s[0] +mla v20.4S, v9.4S, v31.s[0] +mla v19.4S, v10.4S, v31.s[0] +sub v10.4s, v26.4s, v30.4s +add v26.4s, v26.4s, v30.4s +sub v30.4s, v25.4s, v29.4s +add v25.4s, v25.4s, v29.4s +sub v29.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +sub v28.4s, v23.4s, v27.4s +add v23.4s, v23.4s, v27.4s +sub v27.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +sub v22.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sub v21.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +sub v20.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v24.4S, v1.s[1] +mul v24.4S, v24.4S,v2.s[1] +sqrdmulh v9.4S, v23.4S, v1.s[1] +mul v23.4S, v23.4S,v2.s[1] +sqrdmulh v8.4S, v26.4S, v1.s[1] +mul v26.4S, v26.4S,v2.s[1] +sqrdmulh v7.4S, v25.4S, v1.s[1] +mul v25.4S, v25.4S,v2.s[1] +mla v24.4S, v19.4S, v31.s[0] +mla v23.4S, v9.4S, v31.s[0] +mla v26.4S, v8.4S, v31.s[0] +mla v25.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v29.4S, v1.s[2] +mul v29.4S, v29.4S,v2.s[2] +sqrdmulh v8.4S, v28.4S, v1.s[2] +mul v28.4S, v28.4S,v2.s[2] +sqrdmulh v9.4S, v10.4S, v1.s[2] +mul v10.4S, v10.4S,v2.s[2] +sqrdmulh v19.4S, v30.4S, v1.s[2] +mul v30.4S, v30.4S,v2.s[2] +mla v29.4S, v7.4S, v31.s[0] +mla v28.4S, v8.4S, v31.s[0] +mla v10.4S, v9.4S, v31.s[0] +mla v30.4S, v19.4S, v31.s[0] +sub v19.4s, v16.4s, v24.4s +add v16.4s, v16.4s, v24.4s +sub v24.4s, v3.4s, v23.4s +add v3.4s, v3.4s, v23.4s +sub v23.4s, v18.4s, v26.4s +add v18.4s, v18.4s, v26.4s +sub v26.4s, v17.4s, v25.4s +add v17.4s, v17.4s, v25.4s +sub v25.4s, v21.4s, v29.4s +add v21.4s, v21.4s, v29.4s +sub v29.4s, v20.4s, v28.4s +add v20.4s, v20.4s, v28.4s +sub v28.4s, v27.4s, v10.4s +add v27.4s, v27.4s, v10.4s +sub v10.4s, v22.4s, v30.4s +add v22.4s, v22.4s, v30.4s +sqrdmulh v30.4S, v16.4S, v15.s[0] +mul v16.4S, v16.4S,v0.s[0] +sqrdmulh v9.4S, v3.4S, v15.s[0] +mul v3.4S, 
v3.4S,v0.s[0] +sqrdmulh v8.4S, v19.4S, v15.s[1] +mul v19.4S, v19.4S,v0.s[1] +sqrdmulh v7.4S, v24.4S, v15.s[1] +mul v24.4S, v24.4S,v0.s[1] +mla v16.4S, v30.4S, v31.s[0] +mla v3.4S, v9.4S, v31.s[0] +mla v19.4S, v8.4S, v31.s[0] +mla v24.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v21.4S, v15.s[2] +mul v21.4S, v21.4S,v0.s[2] +sqrdmulh v8.4S, v20.4S, v15.s[2] +mul v20.4S, v20.4S,v0.s[2] +sqrdmulh v9.4S, v25.4S, v15.s[3] +mul v25.4S, v25.4S,v0.s[3] +sqrdmulh v30.4S, v29.4S, v15.s[3] +mul v29.4S, v29.4S,v0.s[3] +mla v21.4S, v7.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +mla v25.4S, v9.4S, v31.s[0] +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v18.4s, v16.4s +add v18.4s, v18.4s, v16.4s +sub v16.4s, v17.4s, v3.4s +add v17.4s, v17.4s, v3.4s +sub v3.4s, v23.4s, v19.4s +add v23.4s, v23.4s, v19.4s +sub v19.4s, v26.4s, v24.4s +add v26.4s, v26.4s, v24.4s +sub v24.4s, v27.4s, v21.4s +add v27.4s, v27.4s, v21.4s +sub v21.4s, v22.4s, v20.4s +add v22.4s, v22.4s, v20.4s +sub v20.4s, v28.4s, v25.4s +add v28.4s, v28.4s, v25.4s +sub v25.4s, v10.4s, v29.4s +add v10.4s, v10.4s, v29.4s +sqrdmulh v29.4S, v17.4S, v13.s[0] +mul v17.4S, v17.4S,v14.s[0] +sqrdmulh v9.4S, v16.4S, v13.s[1] +mul v16.4S, v16.4S,v14.s[1] +sqrdmulh v8.4S, v26.4S, v13.s[2] +mul v26.4S, v26.4S,v14.s[2] +sqrdmulh v7.4S, v19.4S, v13.s[3] +mul v19.4S, v19.4S,v14.s[3] +mla v17.4S, v29.4S, v31.s[0] +mla v16.4S, v9.4S, v31.s[0] +mla v26.4S, v8.4S, v31.s[0] +mla v19.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v22.4S, v11.s[0] +mul v22.4S, v22.4S,v12.s[0] +sqrdmulh v8.4S, v21.4S, v11.s[1] +mul v21.4S, v21.4S,v12.s[1] +sqrdmulh v9.4S, v10.4S, v11.s[2] +mul v10.4S, v10.4S,v12.s[2] +sqrdmulh v29.4S, v25.4S, v11.s[3] +mul v25.4S, v25.4S,v12.s[3] +mla v22.4S, v7.4S, v31.s[0] +mla v21.4S, v8.4S, v31.s[0] +mla v10.4S, v9.4S, v31.s[0] +mla v25.4S, v29.4S, v31.s[0] +sub v29.4s, v18.4s, v17.4s +add v18.4s, v18.4s, v17.4s +sub v17.4s, v30.4s, v16.4s +add v30.4s, v30.4s, v16.4s +sub v16.4s, v23.4s, v26.4s +add v23.4s, v23.4s, v26.4s +sub v26.4s, v3.4s, v19.4s 
+add v3.4s, v3.4s, v19.4s +sub v19.4s, v27.4s, v22.4s +add v27.4s, v27.4s, v22.4s +sub v22.4s, v24.4s, v21.4s +add v24.4s, v24.4s, v21.4s +sub v21.4s, v28.4s, v10.4s +add v28.4s, v28.4s, v10.4s +sub v10.4s, v20.4s, v25.4s +add v20.4s, v20.4s, v25.4s +str q18, [x0, #32] +str q29, [x0, #96] +str q30, [x0, #160] +str q17, [x0, #224] +str q23, [x0, #288] +str q16, [x0, #352] +str q3, [x0, #416] +str q26, [x0, #480] +str q27, [x0, #544] +str q19, [x0, #608] +str q24, [x0, #672] +str q22, [x0, #736] +str q28, [x0, #800] +str q21, [x0, #864] +str q20, [x0, #928] +str q10, [x0, #992] +ldr q10, [x0, #816] +ldr q20, [x0, #880] +ldr q21, [x0, #944] +ldr q28, [x0, #1008] +ldr q22, [x0, #304] +ldr q24, [x0, #368] +ldr q19, [x0, #432] +ldr q27, [x0, #496] +ldr q26, [x0, #560] +ldr q3, [x0, #624] +ldr q16, [x0, #688] +ldr q23, [x0, #752] +ldr q17, [x0, #48] +ldr q30, [x0, #112] +ldr q29, [x0, #176] +ldr q18, [x0, #240] +sqrdmulh v25.4S, v10.4S, v1.s[0] +mul v10.4S, v10.4S,v2.s[0] +sqrdmulh v9.4S, v20.4S, v1.s[0] +mul v20.4S, v20.4S,v2.s[0] +sqrdmulh v8.4S, v21.4S, v1.s[0] +mul v21.4S, v21.4S,v2.s[0] +sqrdmulh v7.4S, v28.4S, v1.s[0] +mul v28.4S, v28.4S,v2.s[0] +mla v10.4S, v25.4S, v31.s[0] +mla v20.4S, v9.4S, v31.s[0] +mla v21.4S, v8.4S, v31.s[0] +mla v28.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v26.4S, v1.s[0] +mul v26.4S, v26.4S,v2.s[0] +sqrdmulh v8.4S, v3.4S, v1.s[0] +mul v3.4S, v3.4S,v2.s[0] +sqrdmulh v9.4S, v16.4S, v1.s[0] +mul v16.4S, v16.4S,v2.s[0] +sqrdmulh v25.4S, v23.4S, v1.s[0] +mul v23.4S, v23.4S,v2.s[0] +mla v26.4S, v7.4S, v31.s[0] +mla v3.4S, v8.4S, v31.s[0] +mla v16.4S, v9.4S, v31.s[0] +mla v23.4S, v25.4S, v31.s[0] +sub v25.4s, v22.4s, v10.4s +add v22.4s, v22.4s, v10.4s +sub v10.4s, v24.4s, v20.4s +add v24.4s, v24.4s, v20.4s +sub v20.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +sub v21.4s, v27.4s, v28.4s +add v27.4s, v27.4s, v28.4s +sub v28.4s, v17.4s, v26.4s +add v17.4s, v17.4s, v26.4s +sub v26.4s, v30.4s, v3.4s +add v30.4s, v30.4s, v3.4s +sub v3.4s, v29.4s, v16.4s 
+add v29.4s, v29.4s, v16.4s +sub v16.4s, v18.4s, v23.4s +add v18.4s, v18.4s, v23.4s +sqrdmulh v23.4S, v19.4S, v1.s[1] +mul v19.4S, v19.4S,v2.s[1] +sqrdmulh v9.4S, v27.4S, v1.s[1] +mul v27.4S, v27.4S,v2.s[1] +sqrdmulh v8.4S, v22.4S, v1.s[1] +mul v22.4S, v22.4S,v2.s[1] +sqrdmulh v7.4S, v24.4S, v1.s[1] +mul v24.4S, v24.4S,v2.s[1] +mla v19.4S, v23.4S, v31.s[0] +mla v27.4S, v9.4S, v31.s[0] +mla v22.4S, v8.4S, v31.s[0] +mla v24.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v20.4S, v1.s[2] +mul v20.4S, v20.4S,v2.s[2] +sqrdmulh v8.4S, v21.4S, v1.s[2] +mul v21.4S, v21.4S,v2.s[2] +sqrdmulh v9.4S, v25.4S, v1.s[2] +mul v25.4S, v25.4S,v2.s[2] +sqrdmulh v23.4S, v10.4S, v1.s[2] +mul v10.4S, v10.4S,v2.s[2] +mla v20.4S, v7.4S, v31.s[0] +mla v21.4S, v8.4S, v31.s[0] +mla v25.4S, v9.4S, v31.s[0] +mla v10.4S, v23.4S, v31.s[0] +sub v23.4s, v29.4s, v19.4s +add v29.4s, v29.4s, v19.4s +sub v19.4s, v18.4s, v27.4s +add v18.4s, v18.4s, v27.4s +sub v27.4s, v17.4s, v22.4s +add v17.4s, v17.4s, v22.4s +sub v22.4s, v30.4s, v24.4s +add v30.4s, v30.4s, v24.4s +sub v24.4s, v3.4s, v20.4s +add v3.4s, v3.4s, v20.4s +sub v20.4s, v16.4s, v21.4s +add v16.4s, v16.4s, v21.4s +sub v21.4s, v28.4s, v25.4s +add v28.4s, v28.4s, v25.4s +sub v25.4s, v26.4s, v10.4s +add v26.4s, v26.4s, v10.4s +sqrdmulh v10.4S, v29.4S, v15.s[0] +mul v29.4S, v29.4S,v0.s[0] +sqrdmulh v9.4S, v18.4S, v15.s[0] +mul v18.4S, v18.4S,v0.s[0] +sqrdmulh v8.4S, v23.4S, v15.s[1] +mul v23.4S, v23.4S,v0.s[1] +sqrdmulh v7.4S, v19.4S, v15.s[1] +mul v19.4S, v19.4S,v0.s[1] +mla v29.4S, v10.4S, v31.s[0] +mla v18.4S, v9.4S, v31.s[0] +mla v23.4S, v8.4S, v31.s[0] +mla v19.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v3.4S, v15.s[2] +mul v3.4S, v3.4S,v0.s[2] +sqrdmulh v8.4S, v16.4S, v15.s[2] +mul v16.4S, v16.4S,v0.s[2] +sqrdmulh v9.4S, v24.4S, v15.s[3] +mul v24.4S, v24.4S,v0.s[3] +sqrdmulh v10.4S, v20.4S, v15.s[3] +mul v20.4S, v20.4S,v0.s[3] +mla v3.4S, v7.4S, v31.s[0] +mla v16.4S, v8.4S, v31.s[0] +mla v24.4S, v9.4S, v31.s[0] +mla v20.4S, v10.4S, v31.s[0] +sub v10.4s, 
v17.4s, v29.4s +add v17.4s, v17.4s, v29.4s +sub v29.4s, v30.4s, v18.4s +add v30.4s, v30.4s, v18.4s +sub v18.4s, v27.4s, v23.4s +add v27.4s, v27.4s, v23.4s +sub v23.4s, v22.4s, v19.4s +add v22.4s, v22.4s, v19.4s +sub v19.4s, v28.4s, v3.4s +add v28.4s, v28.4s, v3.4s +sub v3.4s, v26.4s, v16.4s +add v26.4s, v26.4s, v16.4s +sub v16.4s, v21.4s, v24.4s +add v21.4s, v21.4s, v24.4s +sub v24.4s, v25.4s, v20.4s +add v25.4s, v25.4s, v20.4s +sqrdmulh v20.4S, v30.4S, v13.s[0] +mul v30.4S, v30.4S,v14.s[0] +sqrdmulh v9.4S, v29.4S, v13.s[1] +mul v29.4S, v29.4S,v14.s[1] +sqrdmulh v8.4S, v22.4S, v13.s[2] +mul v22.4S, v22.4S,v14.s[2] +sqrdmulh v7.4S, v23.4S, v13.s[3] +mul v23.4S, v23.4S,v14.s[3] +mla v30.4S, v20.4S, v31.s[0] +mla v29.4S, v9.4S, v31.s[0] +mla v22.4S, v8.4S, v31.s[0] +mla v23.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v26.4S, v11.s[0] +mul v26.4S, v26.4S,v12.s[0] +sqrdmulh v8.4S, v3.4S, v11.s[1] +mul v3.4S, v3.4S,v12.s[1] +sqrdmulh v9.4S, v25.4S, v11.s[2] +mul v25.4S, v25.4S,v12.s[2] +sqrdmulh v20.4S, v24.4S, v11.s[3] +mul v24.4S, v24.4S,v12.s[3] +mla v26.4S, v7.4S, v31.s[0] +mla v3.4S, v8.4S, v31.s[0] +mla v25.4S, v9.4S, v31.s[0] +mla v24.4S, v20.4S, v31.s[0] +sub v20.4s, v17.4s, v30.4s +add v17.4s, v17.4s, v30.4s +sub v30.4s, v10.4s, v29.4s +add v10.4s, v10.4s, v29.4s +sub v29.4s, v27.4s, v22.4s +add v27.4s, v27.4s, v22.4s +sub v22.4s, v18.4s, v23.4s +add v18.4s, v18.4s, v23.4s +sub v23.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sub v26.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sub v3.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sub v25.4s, v16.4s, v24.4s +add v16.4s, v16.4s, v24.4s +str q17, [x0, #48] +str q20, [x0, #112] +str q10, [x0, #176] +str q30, [x0, #240] +str q27, [x0, #304] +str q29, [x0, #368] +str q18, [x0, #432] +str q22, [x0, #496] +str q28, [x0, #560] +str q23, [x0, #624] +str q19, [x0, #688] +str q26, [x0, #752] +str q21, [x0, #816] +str q3, [x0, #880] +str q16, [x0, #944] +str q25, [x0, #1008] +ldr q25, [x0, #768] +ldr q16, [x0, #832] +ldr q3, 
[x0, #896] +ldr q21, [x0, #960] +ldr q26, [x0, #256] +ldr q19, [x0, #320] +ldr q23, [x0, #384] +ldr q28, [x0, #448] +ldr q22, [x0, #512] +ldr q18, [x0, #576] +ldr q29, [x0, #640] +ldr q27, [x0, #704] +ldr q30, [x0, #0] +ldr q10, [x0, #64] +ldr q20, [x0, #128] +ldr q17, [x0, #192] +sqrdmulh v24.4S, v25.4S, v1.s[0] +mul v25.4S, v25.4S,v2.s[0] +sqrdmulh v9.4S, v16.4S, v1.s[0] +mul v16.4S, v16.4S,v2.s[0] +sqrdmulh v8.4S, v3.4S, v1.s[0] +mul v3.4S, v3.4S,v2.s[0] +sqrdmulh v7.4S, v21.4S, v1.s[0] +mul v21.4S, v21.4S,v2.s[0] +mla v25.4S, v24.4S, v31.s[0] +mla v16.4S, v9.4S, v31.s[0] +mla v3.4S, v8.4S, v31.s[0] +mla v21.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v22.4S, v1.s[0] +mul v22.4S, v22.4S,v2.s[0] +sqrdmulh v8.4S, v18.4S, v1.s[0] +mul v18.4S, v18.4S,v2.s[0] +sqrdmulh v9.4S, v29.4S, v1.s[0] +mul v29.4S, v29.4S,v2.s[0] +sqrdmulh v24.4S, v27.4S, v1.s[0] +mul v27.4S, v27.4S,v2.s[0] +mla v22.4S, v7.4S, v31.s[0] +mla v18.4S, v8.4S, v31.s[0] +mla v29.4S, v9.4S, v31.s[0] +mla v27.4S, v24.4S, v31.s[0] +sub v24.4s, v26.4s, v25.4s +add v26.4s, v26.4s, v25.4s +sub v25.4s, v19.4s, v16.4s +add v19.4s, v19.4s, v16.4s +sub v16.4s, v23.4s, v3.4s +add v23.4s, v23.4s, v3.4s +sub v3.4s, v28.4s, v21.4s +add v28.4s, v28.4s, v21.4s +sub v21.4s, v30.4s, v22.4s +add v30.4s, v30.4s, v22.4s +sub v22.4s, v10.4s, v18.4s +add v10.4s, v10.4s, v18.4s +sub v18.4s, v20.4s, v29.4s +add v20.4s, v20.4s, v29.4s +sub v29.4s, v17.4s, v27.4s +add v17.4s, v17.4s, v27.4s +sqrdmulh v27.4S, v23.4S, v1.s[1] +mul v23.4S, v23.4S,v2.s[1] +sqrdmulh v9.4S, v28.4S, v1.s[1] +mul v28.4S, v28.4S,v2.s[1] +sqrdmulh v8.4S, v26.4S, v1.s[1] +mul v26.4S, v26.4S,v2.s[1] +sqrdmulh v7.4S, v19.4S, v1.s[1] +mul v19.4S, v19.4S,v2.s[1] +mla v23.4S, v27.4S, v31.s[0] +mla v28.4S, v9.4S, v31.s[0] +mla v26.4S, v8.4S, v31.s[0] +mla v19.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v16.4S, v1.s[2] +mul v16.4S, v16.4S,v2.s[2] +sqrdmulh v8.4S, v3.4S, v1.s[2] +mul v3.4S, v3.4S,v2.s[2] +sqrdmulh v9.4S, v24.4S, v1.s[2] +mul v24.4S, v24.4S,v2.s[2] +sqrdmulh 
v27.4S, v25.4S, v1.s[2] +mul v25.4S, v25.4S,v2.s[2] +mla v16.4S, v7.4S, v31.s[0] +mla v3.4S, v8.4S, v31.s[0] +mla v24.4S, v9.4S, v31.s[0] +mla v25.4S, v27.4S, v31.s[0] +sub v27.4s, v20.4s, v23.4s +add v20.4s, v20.4s, v23.4s +sub v23.4s, v17.4s, v28.4s +add v17.4s, v17.4s, v28.4s +sub v28.4s, v30.4s, v26.4s +add v30.4s, v30.4s, v26.4s +sub v26.4s, v10.4s, v19.4s +add v10.4s, v10.4s, v19.4s +sub v19.4s, v18.4s, v16.4s +add v18.4s, v18.4s, v16.4s +sub v16.4s, v29.4s, v3.4s +add v29.4s, v29.4s, v3.4s +sub v3.4s, v21.4s, v24.4s +add v21.4s, v21.4s, v24.4s +sub v24.4s, v22.4s, v25.4s +add v22.4s, v22.4s, v25.4s +sqrdmulh v25.4S, v20.4S, v15.s[0] +mul v20.4S, v20.4S,v0.s[0] +sqrdmulh v9.4S, v17.4S, v15.s[0] +mul v17.4S, v17.4S,v0.s[0] +sqrdmulh v8.4S, v27.4S, v15.s[1] +mul v27.4S, v27.4S,v0.s[1] +sqrdmulh v7.4S, v23.4S, v15.s[1] +mul v23.4S, v23.4S,v0.s[1] +mla v20.4S, v25.4S, v31.s[0] +mla v17.4S, v9.4S, v31.s[0] +mla v27.4S, v8.4S, v31.s[0] +mla v23.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v18.4S, v15.s[2] +mul v18.4S, v18.4S,v0.s[2] +sqrdmulh v8.4S, v29.4S, v15.s[2] +mul v29.4S, v29.4S,v0.s[2] +sqrdmulh v9.4S, v19.4S, v15.s[3] +mul v19.4S, v19.4S,v0.s[3] +sqrdmulh v25.4S, v16.4S, v15.s[3] +mul v16.4S, v16.4S,v0.s[3] +mla v18.4S, v7.4S, v31.s[0] +mla v29.4S, v8.4S, v31.s[0] +mla v19.4S, v9.4S, v31.s[0] +mla v16.4S, v25.4S, v31.s[0] +sub v25.4s, v30.4s, v20.4s +add v30.4s, v30.4s, v20.4s +sub v20.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sub v17.4s, v28.4s, v27.4s +add v28.4s, v28.4s, v27.4s +sub v27.4s, v26.4s, v23.4s +add v26.4s, v26.4s, v23.4s +sub v23.4s, v21.4s, v18.4s +add v21.4s, v21.4s, v18.4s +sub v18.4s, v22.4s, v29.4s +add v22.4s, v22.4s, v29.4s +sub v29.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sub v19.4s, v24.4s, v16.4s +add v24.4s, v24.4s, v16.4s +sqrdmulh v16.4S, v10.4S, v13.s[0] +mul v10.4S, v10.4S,v14.s[0] +sqrdmulh v9.4S, v20.4S, v13.s[1] +mul v20.4S, v20.4S,v14.s[1] +sqrdmulh v8.4S, v26.4S, v13.s[2] +mul v26.4S, v26.4S,v14.s[2] +sqrdmulh v7.4S, 
v27.4S, v13.s[3] +mul v27.4S, v27.4S,v14.s[3] +mla v10.4S, v16.4S, v31.s[0] +mla v20.4S, v9.4S, v31.s[0] +mla v26.4S, v8.4S, v31.s[0] +mla v27.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v22.4S, v11.s[0] +mul v22.4S, v22.4S,v12.s[0] +sqrdmulh v8.4S, v18.4S, v11.s[1] +mul v18.4S, v18.4S,v12.s[1] +sqrdmulh v9.4S, v24.4S, v11.s[2] +mul v24.4S, v24.4S,v12.s[2] +sqrdmulh v16.4S, v19.4S, v11.s[3] +mul v19.4S, v19.4S,v12.s[3] +mla v22.4S, v7.4S, v31.s[0] +mla v18.4S, v8.4S, v31.s[0] +mla v24.4S, v9.4S, v31.s[0] +mla v19.4S, v16.4S, v31.s[0] +sub v16.4s, v30.4s, v10.4s +add v30.4s, v30.4s, v10.4s +sub v10.4s, v25.4s, v20.4s +add v25.4s, v25.4s, v20.4s +sub v20.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sub v26.4s, v17.4s, v27.4s +add v17.4s, v17.4s, v27.4s +sub v27.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sub v22.4s, v23.4s, v18.4s +add v23.4s, v23.4s, v18.4s +sub v18.4s, v3.4s, v24.4s +add v3.4s, v3.4s, v24.4s +sub v24.4s, v29.4s, v19.4s +add v29.4s, v29.4s, v19.4s +str q30, [x0, #0] +str q16, [x0, #64] +str q25, [x0, #128] +str q10, [x0, #192] +str q28, [x0, #256] +str q20, [x0, #320] +str q17, [x0, #384] +str q26, [x0, #448] +str q21, [x0, #512] +str q27, [x0, #576] +str q23, [x0, #640] +str q22, [x0, #704] +str q3, [x0, #768] +str q18, [x0, #832] +str q29, [x0, #896] +str q24, [x0, #960] +ldr q24, [x0, #784] +ldr q29, [x0, #848] +ldr q18, [x0, #912] +ldr q3, [x0, #976] +ldr q22, [x0, #272] +ldr q23, [x0, #336] +ldr q27, [x0, #400] +ldr q21, [x0, #464] +ldr q26, [x0, #528] +ldr q17, [x0, #592] +ldr q20, [x0, #656] +ldr q28, [x0, #720] +ldr q10, [x0, #16] +ldr q25, [x0, #80] +ldr q16, [x0, #144] +ldr q30, [x0, #208] +sqrdmulh v19.4S, v24.4S, v1.s[0] +mul v24.4S, v24.4S,v2.s[0] +sqrdmulh v9.4S, v29.4S, v1.s[0] +mul v29.4S, v29.4S,v2.s[0] +sqrdmulh v8.4S, v18.4S, v1.s[0] +mul v18.4S, v18.4S,v2.s[0] +sqrdmulh v7.4S, v3.4S, v1.s[0] +mul v3.4S, v3.4S,v2.s[0] +mla v24.4S, v19.4S, v31.s[0] +mla v29.4S, v9.4S, v31.s[0] +mla v18.4S, v8.4S, v31.s[0] +mla v3.4S, v7.4S, 
v31.s[0] +sqrdmulh v7.4S, v26.4S, v1.s[0] +mul v26.4S, v26.4S,v2.s[0] +sqrdmulh v8.4S, v17.4S, v1.s[0] +mul v17.4S, v17.4S,v2.s[0] +sqrdmulh v9.4S, v20.4S, v1.s[0] +mul v20.4S, v20.4S,v2.s[0] +sqrdmulh v19.4S, v28.4S, v1.s[0] +mul v28.4S, v28.4S,v2.s[0] +mla v26.4S, v7.4S, v31.s[0] +mla v17.4S, v8.4S, v31.s[0] +mla v20.4S, v9.4S, v31.s[0] +mla v28.4S, v19.4S, v31.s[0] +sub v19.4s, v22.4s, v24.4s +add v22.4s, v22.4s, v24.4s +sub v24.4s, v23.4s, v29.4s +add v23.4s, v23.4s, v29.4s +sub v29.4s, v27.4s, v18.4s +add v27.4s, v27.4s, v18.4s +sub v18.4s, v21.4s, v3.4s +add v21.4s, v21.4s, v3.4s +sub v3.4s, v10.4s, v26.4s +add v10.4s, v10.4s, v26.4s +sub v26.4s, v25.4s, v17.4s +add v25.4s, v25.4s, v17.4s +sub v17.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +sub v20.4s, v30.4s, v28.4s +add v30.4s, v30.4s, v28.4s +sqrdmulh v28.4S, v27.4S, v1.s[1] +mul v27.4S, v27.4S,v2.s[1] +sqrdmulh v9.4S, v21.4S, v1.s[1] +mul v21.4S, v21.4S,v2.s[1] +sqrdmulh v8.4S, v22.4S, v1.s[1] +mul v22.4S, v22.4S,v2.s[1] +sqrdmulh v7.4S, v23.4S, v1.s[1] +mul v23.4S, v23.4S,v2.s[1] +mla v27.4S, v28.4S, v31.s[0] +mla v21.4S, v9.4S, v31.s[0] +mla v22.4S, v8.4S, v31.s[0] +mla v23.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v29.4S, v1.s[2] +mul v29.4S, v29.4S,v2.s[2] +sqrdmulh v8.4S, v18.4S, v1.s[2] +mul v18.4S, v18.4S,v2.s[2] +sqrdmulh v9.4S, v19.4S, v1.s[2] +mul v19.4S, v19.4S,v2.s[2] +sqrdmulh v28.4S, v24.4S, v1.s[2] +mul v24.4S, v24.4S,v2.s[2] +mla v29.4S, v7.4S, v31.s[0] +mla v18.4S, v8.4S, v31.s[0] +mla v19.4S, v9.4S, v31.s[0] +mla v24.4S, v28.4S, v31.s[0] +sub v28.4s, v16.4s, v27.4s +add v16.4s, v16.4s, v27.4s +sub v27.4s, v30.4s, v21.4s +add v30.4s, v30.4s, v21.4s +sub v21.4s, v10.4s, v22.4s +add v10.4s, v10.4s, v22.4s +sub v22.4s, v25.4s, v23.4s +add v25.4s, v25.4s, v23.4s +sub v23.4s, v17.4s, v29.4s +add v17.4s, v17.4s, v29.4s +sub v29.4s, v20.4s, v18.4s +add v20.4s, v20.4s, v18.4s +sub v18.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sub v19.4s, v26.4s, v24.4s +add v26.4s, v26.4s, v24.4s +sqrdmulh 
v24.4S, v16.4S, v15.s[0] +mul v16.4S, v16.4S,v0.s[0] +sqrdmulh v9.4S, v30.4S, v15.s[0] +mul v30.4S, v30.4S,v0.s[0] +sqrdmulh v8.4S, v28.4S, v15.s[1] +mul v28.4S, v28.4S,v0.s[1] +sqrdmulh v7.4S, v27.4S, v15.s[1] +mul v27.4S, v27.4S,v0.s[1] +mla v16.4S, v24.4S, v31.s[0] +mla v30.4S, v9.4S, v31.s[0] +mla v28.4S, v8.4S, v31.s[0] +mla v27.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v17.4S, v15.s[2] +mul v17.4S, v17.4S,v0.s[2] +sqrdmulh v8.4S, v20.4S, v15.s[2] +mul v20.4S, v20.4S,v0.s[2] +sqrdmulh v9.4S, v23.4S, v15.s[3] +mul v23.4S, v23.4S,v0.s[3] +sqrdmulh v24.4S, v29.4S, v15.s[3] +mul v29.4S, v29.4S,v0.s[3] +mla v17.4S, v7.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +mla v23.4S, v9.4S, v31.s[0] +mla v29.4S, v24.4S, v31.s[0] +sub v24.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sub v16.4s, v25.4s, v30.4s +add v25.4s, v25.4s, v30.4s +sub v30.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +sub v28.4s, v22.4s, v27.4s +add v22.4s, v22.4s, v27.4s +sub v27.4s, v3.4s, v17.4s +add v3.4s, v3.4s, v17.4s +sub v17.4s, v26.4s, v20.4s +add v26.4s, v26.4s, v20.4s +sub v20.4s, v18.4s, v23.4s +add v18.4s, v18.4s, v23.4s +sub v23.4s, v19.4s, v29.4s +add v19.4s, v19.4s, v29.4s +sqrdmulh v29.4S, v25.4S, v13.s[0] +mul v25.4S, v25.4S,v14.s[0] +sqrdmulh v9.4S, v16.4S, v13.s[1] +mul v16.4S, v16.4S,v14.s[1] +sqrdmulh v8.4S, v22.4S, v13.s[2] +mul v22.4S, v22.4S,v14.s[2] +sqrdmulh v7.4S, v28.4S, v13.s[3] +mul v28.4S, v28.4S,v14.s[3] +mla v25.4S, v29.4S, v31.s[0] +mla v16.4S, v9.4S, v31.s[0] +mla v22.4S, v8.4S, v31.s[0] +mla v28.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v26.4S, v11.s[0] +mul v26.4S, v26.4S,v12.s[0] +sqrdmulh v8.4S, v17.4S, v11.s[1] +mul v17.4S, v17.4S,v12.s[1] +sqrdmulh v9.4S, v19.4S, v11.s[2] +mul v19.4S, v19.4S,v12.s[2] +sqrdmulh v29.4S, v23.4S, v11.s[3] +mul v23.4S, v23.4S,v12.s[3] +mla v26.4S, v7.4S, v31.s[0] +mla v17.4S, v8.4S, v31.s[0] +mla v19.4S, v9.4S, v31.s[0] +mla v23.4S, v29.4S, v31.s[0] +sub v29.4s, v10.4s, v25.4s +add v10.4s, v10.4s, v25.4s +sub v25.4s, v24.4s, v16.4s +add 
v24.4s, v24.4s, v16.4s +sub v16.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sub v22.4s, v30.4s, v28.4s +add v30.4s, v30.4s, v28.4s +sub v28.4s, v3.4s, v26.4s +add v3.4s, v3.4s, v26.4s +sub v26.4s, v27.4s, v17.4s +add v27.4s, v27.4s, v17.4s +sub v17.4s, v18.4s, v19.4s +add v18.4s, v18.4s, v19.4s +sub v19.4s, v20.4s, v23.4s +add v20.4s, v20.4s, v23.4s +str q10, [x0, #16] +str q29, [x0, #80] +str q24, [x0, #144] +str q25, [x0, #208] +str q21, [x0, #272] +str q16, [x0, #336] +str q30, [x0, #400] +str q22, [x0, #464] +str q3, [x0, #528] +str q28, [x0, #592] +str q27, [x0, #656] +str q26, [x0, #720] +str q18, [x0, #784] +str q17, [x0, #848] +str q20, [x0, #912] +str q19, [x0, #976] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q6, [x17, #+160] +ldr q7, [x17, #+176] +ldr q8, [x17, #+192] +ldr q9, [x17, #+208] +ldr q23, [x17, #+224] +ldr q10, [x17, #+240] +ldr q29, [x0, #32] +ldr q24, [x0, #48] +ldr q25, [x0, #0] +ldr q21, [x0, #16] +sqrdmulh v16.4S, v29.4S, v5.s[0] +mul v29.4S, v29.4S,v4.s[0] +mla v29.4S, v16.4S, v31.s[0] +sub v16.4s, v25.4s, v29.4s +add v25.4s, v25.4s, v29.4s +sqrdmulh v29.4S, v24.4S, v5.s[0] +mul v24.4S, v24.4S,v4.s[0] +mla v24.4S, v29.4S, v31.s[0] +sub v29.4s, v21.4s, v24.4s +add v21.4s, v21.4s, v24.4s +sqrdmulh v24.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v24.4S, v31.s[0] +sub v24.4s, v25.4s, v21.4s +add v25.4s, v25.4s, v21.4s +sqrdmulh v21.4S, v29.4S, v5.s[2] +mul v29.4S, v29.4S,v4.s[2] +mla v29.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v29.4s +add v16.4s, v16.4s, v29.4s +trn1 v29.4S, v25.4S, v24.4S +trn2 v30.4S, v25.4S, v24.4S +trn1 v22.4S, v16.4S, v21.4S +trn2 v3.4S, v16.4S, v21.4S +trn2 v16.2D, v29.2D, v22.2D +trn2 v21.2D, v30.2D, v3.2D +trn1 v25.2D, v29.2D, v22.2D +trn1 v24.2D, v30.2D, v3.2D +sqrdmulh v3.4S, v16.4S, v7.4S +mul v16.4S, v16.4S,v6.4S +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v25.4s, v16.4s +add v25.4s, v25.4s, v16.4s +sqrdmulh v16.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v16.4S, v31.s[0] 
+sub v16.4s, v24.4s, v21.4s +add v24.4s, v24.4s, v21.4s +sqrdmulh v21.4S, v24.4S, v9.4S +mul v24.4S, v24.4S,v8.4S +mla v24.4S, v21.4S, v31.s[0] +sub v21.4s, v25.4s, v24.4s +add v25.4s, v25.4s, v24.4s +sqrdmulh v24.4S, v16.4S, v10.4S +mul v16.4S, v16.4S,v23.4S +mla v16.4S, v24.4S, v31.s[0] +sub v24.4s, v3.4s, v16.4s +add v3.4s, v3.4s, v16.4s +str q25, [x0, #0] +str q21, [x0, #16] +str q3, [x0, #32] +str q24, [x0, #48] +ldr q24, [x17, #+256] +ldr q3, [x17, #+272] +ldr q21, [x17, #+288] +ldr q25, [x17, #+304] +ldr q16, [x17, #+320] +ldr q30, [x17, #+336] +ldr q22, [x17, #+352] +ldr q29, [x17, #+368] +ldr q10, [x0, #96] +ldr q23, [x0, #112] +ldr q9, [x0, #64] +ldr q8, [x0, #80] +sqrdmulh v7.4S, v10.4S, v3.s[0] +mul v10.4S, v10.4S,v24.s[0] +mla v10.4S, v7.4S, v31.s[0] +sub v7.4s, v9.4s, v10.4s +add v9.4s, v9.4s, v10.4s +sqrdmulh v10.4S, v23.4S, v3.s[0] +mul v23.4S, v23.4S,v24.s[0] +mla v23.4S, v10.4S, v31.s[0] +sub v10.4s, v8.4s, v23.4s +add v8.4s, v8.4s, v23.4s +sqrdmulh v23.4S, v8.4S, v3.s[1] +mul v8.4S, v8.4S,v24.s[1] +mla v8.4S, v23.4S, v31.s[0] +sub v23.4s, v9.4s, v8.4s +add v9.4s, v9.4s, v8.4s +sqrdmulh v8.4S, v10.4S, v3.s[2] +mul v10.4S, v10.4S,v24.s[2] +mla v10.4S, v8.4S, v31.s[0] +sub v8.4s, v7.4s, v10.4s +add v7.4s, v7.4s, v10.4s +trn1 v10.4S, v9.4S, v23.4S +trn2 v6.4S, v9.4S, v23.4S +trn1 v5.4S, v7.4S, v8.4S +trn2 v4.4S, v7.4S, v8.4S +trn2 v7.2D, v10.2D, v5.2D +trn2 v8.2D, v6.2D, v4.2D +trn1 v9.2D, v10.2D, v5.2D +trn1 v23.2D, v6.2D, v4.2D +sqrdmulh v4.4S, v7.4S, v25.4S +mul v7.4S, v7.4S,v21.4S +mla v7.4S, v4.4S, v31.s[0] +sub v4.4s, v9.4s, v7.4s +add v9.4s, v9.4s, v7.4s +sqrdmulh v7.4S, v8.4S, v25.4S +mul v8.4S, v8.4S,v21.4S +mla v8.4S, v7.4S, v31.s[0] +sub v7.4s, v23.4s, v8.4s +add v23.4s, v23.4s, v8.4s +sqrdmulh v8.4S, v23.4S, v30.4S +mul v23.4S, v23.4S,v16.4S +mla v23.4S, v8.4S, v31.s[0] +sub v8.4s, v9.4s, v23.4s +add v9.4s, v9.4s, v23.4s +sqrdmulh v23.4S, v7.4S, v29.4S +mul v7.4S, v7.4S,v22.4S +mla v7.4S, v23.4S, v31.s[0] +sub v23.4s, v4.4s, v7.4s +add 
v4.4s, v4.4s, v7.4s +str q9, [x0, #64] +str q8, [x0, #80] +str q4, [x0, #96] +str q23, [x0, #112] +ldr q23, [x17, #+384] +ldr q4, [x17, #+400] +ldr q8, [x17, #+416] +ldr q9, [x17, #+432] +ldr q7, [x17, #+448] +ldr q6, [x17, #+464] +ldr q5, [x17, #+480] +ldr q10, [x17, #+496] +ldr q29, [x0, #160] +ldr q22, [x0, #176] +ldr q30, [x0, #128] +ldr q16, [x0, #144] +sqrdmulh v25.4S, v29.4S, v4.s[0] +mul v29.4S, v29.4S,v23.s[0] +mla v29.4S, v25.4S, v31.s[0] +sub v25.4s, v30.4s, v29.4s +add v30.4s, v30.4s, v29.4s +sqrdmulh v29.4S, v22.4S, v4.s[0] +mul v22.4S, v22.4S,v23.s[0] +mla v22.4S, v29.4S, v31.s[0] +sub v29.4s, v16.4s, v22.4s +add v16.4s, v16.4s, v22.4s +sqrdmulh v22.4S, v16.4S, v4.s[1] +mul v16.4S, v16.4S,v23.s[1] +mla v16.4S, v22.4S, v31.s[0] +sub v22.4s, v30.4s, v16.4s +add v30.4s, v30.4s, v16.4s +sqrdmulh v16.4S, v29.4S, v4.s[2] +mul v29.4S, v29.4S,v23.s[2] +mla v29.4S, v16.4S, v31.s[0] +sub v16.4s, v25.4s, v29.4s +add v25.4s, v25.4s, v29.4s +trn1 v29.4S, v30.4S, v22.4S +trn2 v21.4S, v30.4S, v22.4S +trn1 v3.4S, v25.4S, v16.4S +trn2 v24.4S, v25.4S, v16.4S +trn2 v25.2D, v29.2D, v3.2D +trn2 v16.2D, v21.2D, v24.2D +trn1 v30.2D, v29.2D, v3.2D +trn1 v22.2D, v21.2D, v24.2D +sqrdmulh v24.4S, v25.4S, v9.4S +mul v25.4S, v25.4S,v8.4S +mla v25.4S, v24.4S, v31.s[0] +sub v24.4s, v30.4s, v25.4s +add v30.4s, v30.4s, v25.4s +sqrdmulh v25.4S, v16.4S, v9.4S +mul v16.4S, v16.4S,v8.4S +mla v16.4S, v25.4S, v31.s[0] +sub v25.4s, v22.4s, v16.4s +add v22.4s, v22.4s, v16.4s +sqrdmulh v16.4S, v22.4S, v6.4S +mul v22.4S, v22.4S,v7.4S +mla v22.4S, v16.4S, v31.s[0] +sub v16.4s, v30.4s, v22.4s +add v30.4s, v30.4s, v22.4s +sqrdmulh v22.4S, v25.4S, v10.4S +mul v25.4S, v25.4S,v5.4S +mla v25.4S, v22.4S, v31.s[0] +sub v22.4s, v24.4s, v25.4s +add v24.4s, v24.4s, v25.4s +str q30, [x0, #128] +str q16, [x0, #144] +str q24, [x0, #160] +str q22, [x0, #176] +ldr q22, [x17, #+512] +ldr q24, [x17, #+528] +ldr q16, [x17, #+544] +ldr q30, [x17, #+560] +ldr q25, [x17, #+576] +ldr q21, [x17, #+592] +ldr q3, [x17, 
#+608] +ldr q29, [x17, #+624] +ldr q10, [x0, #224] +ldr q5, [x0, #240] +ldr q6, [x0, #192] +ldr q7, [x0, #208] +sqrdmulh v9.4S, v10.4S, v24.s[0] +mul v10.4S, v10.4S,v22.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v6.4s, v10.4s +add v6.4s, v6.4s, v10.4s +sqrdmulh v10.4S, v5.4S, v24.s[0] +mul v5.4S, v5.4S,v22.s[0] +mla v5.4S, v10.4S, v31.s[0] +sub v10.4s, v7.4s, v5.4s +add v7.4s, v7.4s, v5.4s +sqrdmulh v5.4S, v7.4S, v24.s[1] +mul v7.4S, v7.4S,v22.s[1] +mla v7.4S, v5.4S, v31.s[0] +sub v5.4s, v6.4s, v7.4s +add v6.4s, v6.4s, v7.4s +sqrdmulh v7.4S, v10.4S, v24.s[2] +mul v10.4S, v10.4S,v22.s[2] +mla v10.4S, v7.4S, v31.s[0] +sub v7.4s, v9.4s, v10.4s +add v9.4s, v9.4s, v10.4s +trn1 v10.4S, v6.4S, v5.4S +trn2 v8.4S, v6.4S, v5.4S +trn1 v4.4S, v9.4S, v7.4S +trn2 v23.4S, v9.4S, v7.4S +trn2 v9.2D, v10.2D, v4.2D +trn2 v7.2D, v8.2D, v23.2D +trn1 v6.2D, v10.2D, v4.2D +trn1 v5.2D, v8.2D, v23.2D +sqrdmulh v23.4S, v9.4S, v30.4S +mul v9.4S, v9.4S,v16.4S +mla v9.4S, v23.4S, v31.s[0] +sub v23.4s, v6.4s, v9.4s +add v6.4s, v6.4s, v9.4s +sqrdmulh v9.4S, v7.4S, v30.4S +mul v7.4S, v7.4S,v16.4S +mla v7.4S, v9.4S, v31.s[0] +sub v9.4s, v5.4s, v7.4s +add v5.4s, v5.4s, v7.4s +sqrdmulh v7.4S, v5.4S, v21.4S +mul v5.4S, v5.4S,v25.4S +mla v5.4S, v7.4S, v31.s[0] +sub v7.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v9.4S, v29.4S +mul v9.4S, v9.4S,v3.4S +mla v9.4S, v5.4S, v31.s[0] +sub v5.4s, v23.4s, v9.4s +add v23.4s, v23.4s, v9.4s +str q6, [x0, #192] +str q7, [x0, #208] +str q23, [x0, #224] +str q5, [x0, #240] +ldr q5, [x17, #+640] +ldr q23, [x17, #+656] +ldr q7, [x17, #+672] +ldr q6, [x17, #+688] +ldr q9, [x17, #+704] +ldr q8, [x17, #+720] +ldr q4, [x17, #+736] +ldr q10, [x17, #+752] +ldr q29, [x0, #288] +ldr q3, [x0, #304] +ldr q21, [x0, #256] +ldr q25, [x0, #272] +sqrdmulh v30.4S, v29.4S, v23.s[0] +mul v29.4S, v29.4S,v5.s[0] +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v21.4s, v29.4s +add v21.4s, v21.4s, v29.4s +sqrdmulh v29.4S, v3.4S, v23.s[0] +mul v3.4S, v3.4S,v5.s[0] +mla v3.4S, 
v29.4S, v31.s[0] +sub v29.4s, v25.4s, v3.4s +add v25.4s, v25.4s, v3.4s +sqrdmulh v3.4S, v25.4S, v23.s[1] +mul v25.4S, v25.4S,v5.s[1] +mla v25.4S, v3.4S, v31.s[0] +sub v3.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v29.4S, v23.s[2] +mul v29.4S, v29.4S,v5.s[2] +mla v29.4S, v25.4S, v31.s[0] +sub v25.4s, v30.4s, v29.4s +add v30.4s, v30.4s, v29.4s +trn1 v29.4S, v21.4S, v3.4S +trn2 v16.4S, v21.4S, v3.4S +trn1 v24.4S, v30.4S, v25.4S +trn2 v22.4S, v30.4S, v25.4S +trn2 v30.2D, v29.2D, v24.2D +trn2 v25.2D, v16.2D, v22.2D +trn1 v21.2D, v29.2D, v24.2D +trn1 v3.2D, v16.2D, v22.2D +sqrdmulh v22.4S, v30.4S, v6.4S +mul v30.4S, v30.4S,v7.4S +mla v30.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v30.4s +add v21.4s, v21.4s, v30.4s +sqrdmulh v30.4S, v25.4S, v6.4S +mul v25.4S, v25.4S,v7.4S +mla v25.4S, v30.4S, v31.s[0] +sub v30.4s, v3.4s, v25.4s +add v3.4s, v3.4s, v25.4s +sqrdmulh v25.4S, v3.4S, v8.4S +mul v3.4S, v3.4S,v9.4S +mla v3.4S, v25.4S, v31.s[0] +sub v25.4s, v21.4s, v3.4s +add v21.4s, v21.4s, v3.4s +sqrdmulh v3.4S, v30.4S, v10.4S +mul v30.4S, v30.4S,v4.4S +mla v30.4S, v3.4S, v31.s[0] +sub v3.4s, v22.4s, v30.4s +add v22.4s, v22.4s, v30.4s +str q21, [x0, #256] +str q25, [x0, #272] +str q22, [x0, #288] +str q3, [x0, #304] +ldr q3, [x17, #+768] +ldr q22, [x17, #+784] +ldr q25, [x17, #+800] +ldr q21, [x17, #+816] +ldr q30, [x17, #+832] +ldr q16, [x17, #+848] +ldr q24, [x17, #+864] +ldr q29, [x17, #+880] +ldr q10, [x0, #352] +ldr q4, [x0, #368] +ldr q8, [x0, #320] +ldr q9, [x0, #336] +sqrdmulh v6.4S, v10.4S, v22.s[0] +mul v10.4S, v10.4S,v3.s[0] +mla v10.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v10.4s +add v8.4s, v8.4s, v10.4s +sqrdmulh v10.4S, v4.4S, v22.s[0] +mul v4.4S, v4.4S,v3.s[0] +mla v4.4S, v10.4S, v31.s[0] +sub v10.4s, v9.4s, v4.4s +add v9.4s, v9.4s, v4.4s +sqrdmulh v4.4S, v9.4S, v22.s[1] +mul v9.4S, v9.4S,v3.s[1] +mla v9.4S, v4.4S, v31.s[0] +sub v4.4s, v8.4s, v9.4s +add v8.4s, v8.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v22.s[2] +mul v10.4S, v10.4S,v3.s[2] +mla 
v10.4S, v9.4S, v31.s[0] +sub v9.4s, v6.4s, v10.4s +add v6.4s, v6.4s, v10.4s +trn1 v10.4S, v8.4S, v4.4S +trn2 v7.4S, v8.4S, v4.4S +trn1 v23.4S, v6.4S, v9.4S +trn2 v5.4S, v6.4S, v9.4S +trn2 v6.2D, v10.2D, v23.2D +trn2 v9.2D, v7.2D, v5.2D +trn1 v8.2D, v10.2D, v23.2D +trn1 v4.2D, v7.2D, v5.2D +sqrdmulh v5.4S, v6.4S, v21.4S +mul v6.4S, v6.4S,v25.4S +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v8.4s, v6.4s +add v8.4s, v8.4s, v6.4s +sqrdmulh v6.4S, v9.4S, v21.4S +mul v9.4S, v9.4S,v25.4S +mla v9.4S, v6.4S, v31.s[0] +sub v6.4s, v4.4s, v9.4s +add v4.4s, v4.4s, v9.4s +sqrdmulh v9.4S, v4.4S, v16.4S +mul v4.4S, v4.4S,v30.4S +mla v4.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v4.4s +add v8.4s, v8.4s, v4.4s +sqrdmulh v4.4S, v6.4S, v29.4S +mul v6.4S, v6.4S,v24.4S +mla v6.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v6.4s +add v5.4s, v5.4s, v6.4s +str q8, [x0, #320] +str q9, [x0, #336] +str q5, [x0, #352] +str q4, [x0, #368] +ldr q4, [x17, #+896] +ldr q5, [x17, #+912] +ldr q9, [x17, #+928] +ldr q8, [x17, #+944] +ldr q6, [x17, #+960] +ldr q7, [x17, #+976] +ldr q23, [x17, #+992] +ldr q10, [x17, #+1008] +ldr q29, [x0, #416] +ldr q24, [x0, #432] +ldr q16, [x0, #384] +ldr q30, [x0, #400] +sqrdmulh v21.4S, v29.4S, v5.s[0] +mul v29.4S, v29.4S,v4.s[0] +mla v29.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v29.4s +add v16.4s, v16.4s, v29.4s +sqrdmulh v29.4S, v24.4S, v5.s[0] +mul v24.4S, v24.4S,v4.s[0] +mla v24.4S, v29.4S, v31.s[0] +sub v29.4s, v30.4s, v24.4s +add v30.4s, v30.4s, v24.4s +sqrdmulh v24.4S, v30.4S, v5.s[1] +mul v30.4S, v30.4S,v4.s[1] +mla v30.4S, v24.4S, v31.s[0] +sub v24.4s, v16.4s, v30.4s +add v16.4s, v16.4s, v30.4s +sqrdmulh v30.4S, v29.4S, v5.s[2] +mul v29.4S, v29.4S,v4.s[2] +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v21.4s, v29.4s +add v21.4s, v21.4s, v29.4s +trn1 v29.4S, v16.4S, v24.4S +trn2 v25.4S, v16.4S, v24.4S +trn1 v22.4S, v21.4S, v30.4S +trn2 v3.4S, v21.4S, v30.4S +trn2 v21.2D, v29.2D, v22.2D +trn2 v30.2D, v25.2D, v3.2D +trn1 v16.2D, v29.2D, v22.2D +trn1 v24.2D, v25.2D, v3.2D +sqrdmulh 
v3.4S, v21.4S, v8.4S +mul v21.4S, v21.4S,v9.4S +mla v21.4S, v3.4S, v31.s[0] +sub v3.4s, v16.4s, v21.4s +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v30.4S, v8.4S +mul v30.4S, v30.4S,v9.4S +mla v30.4S, v21.4S, v31.s[0] +sub v21.4s, v24.4s, v30.4s +add v24.4s, v24.4s, v30.4s +sqrdmulh v30.4S, v24.4S, v7.4S +mul v24.4S, v24.4S,v6.4S +mla v24.4S, v30.4S, v31.s[0] +sub v30.4s, v16.4s, v24.4s +add v16.4s, v16.4s, v24.4s +sqrdmulh v24.4S, v21.4S, v10.4S +mul v21.4S, v21.4S,v23.4S +mla v21.4S, v24.4S, v31.s[0] +sub v24.4s, v3.4s, v21.4s +add v3.4s, v3.4s, v21.4s +str q16, [x0, #384] +str q30, [x0, #400] +str q3, [x0, #416] +str q24, [x0, #432] +ldr q24, [x17, #+1024] +ldr q3, [x17, #+1040] +ldr q30, [x17, #+1056] +ldr q16, [x17, #+1072] +ldr q21, [x17, #+1088] +ldr q25, [x17, #+1104] +ldr q22, [x17, #+1120] +ldr q29, [x17, #+1136] +ldr q10, [x0, #480] +ldr q23, [x0, #496] +ldr q7, [x0, #448] +ldr q6, [x0, #464] +sqrdmulh v8.4S, v10.4S, v3.s[0] +mul v10.4S, v10.4S,v24.s[0] +mla v10.4S, v8.4S, v31.s[0] +sub v8.4s, v7.4s, v10.4s +add v7.4s, v7.4s, v10.4s +sqrdmulh v10.4S, v23.4S, v3.s[0] +mul v23.4S, v23.4S,v24.s[0] +mla v23.4S, v10.4S, v31.s[0] +sub v10.4s, v6.4s, v23.4s +add v6.4s, v6.4s, v23.4s +sqrdmulh v23.4S, v6.4S, v3.s[1] +mul v6.4S, v6.4S,v24.s[1] +mla v6.4S, v23.4S, v31.s[0] +sub v23.4s, v7.4s, v6.4s +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v10.4S, v3.s[2] +mul v10.4S, v10.4S,v24.s[2] +mla v10.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v10.4s +add v8.4s, v8.4s, v10.4s +trn1 v10.4S, v7.4S, v23.4S +trn2 v9.4S, v7.4S, v23.4S +trn1 v5.4S, v8.4S, v6.4S +trn2 v4.4S, v8.4S, v6.4S +trn2 v8.2D, v10.2D, v5.2D +trn2 v6.2D, v9.2D, v4.2D +trn1 v7.2D, v10.2D, v5.2D +trn1 v23.2D, v9.2D, v4.2D +sqrdmulh v4.4S, v8.4S, v16.4S +mul v8.4S, v8.4S,v30.4S +mla v8.4S, v4.4S, v31.s[0] +sub v4.4s, v7.4s, v8.4s +add v7.4s, v7.4s, v8.4s +sqrdmulh v8.4S, v6.4S, v16.4S +mul v6.4S, v6.4S,v30.4S +mla v6.4S, v8.4S, v31.s[0] +sub v8.4s, v23.4s, v6.4s +add v23.4s, v23.4s, v6.4s +sqrdmulh v6.4S, v23.4S, 
v25.4S +mul v23.4S, v23.4S,v21.4S +mla v23.4S, v6.4S, v31.s[0] +sub v6.4s, v7.4s, v23.4s +add v7.4s, v7.4s, v23.4s +sqrdmulh v23.4S, v8.4S, v29.4S +mul v8.4S, v8.4S,v22.4S +mla v8.4S, v23.4S, v31.s[0] +sub v23.4s, v4.4s, v8.4s +add v4.4s, v4.4s, v8.4s +str q7, [x0, #448] +str q6, [x0, #464] +str q4, [x0, #480] +str q23, [x0, #496] +ldr q23, [x17, #+1152] +ldr q4, [x17, #+1168] +ldr q6, [x17, #+1184] +ldr q7, [x17, #+1200] +ldr q8, [x17, #+1216] +ldr q9, [x17, #+1232] +ldr q5, [x17, #+1248] +ldr q10, [x17, #+1264] +ldr q29, [x0, #544] +ldr q22, [x0, #560] +ldr q25, [x0, #512] +ldr q21, [x0, #528] +sqrdmulh v16.4S, v29.4S, v4.s[0] +mul v29.4S, v29.4S,v23.s[0] +mla v29.4S, v16.4S, v31.s[0] +sub v16.4s, v25.4s, v29.4s +add v25.4s, v25.4s, v29.4s +sqrdmulh v29.4S, v22.4S, v4.s[0] +mul v22.4S, v22.4S,v23.s[0] +mla v22.4S, v29.4S, v31.s[0] +sub v29.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v4.s[1] +mul v21.4S, v21.4S,v23.s[1] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v25.4s, v21.4s +add v25.4s, v25.4s, v21.4s +sqrdmulh v21.4S, v29.4S, v4.s[2] +mul v29.4S, v29.4S,v23.s[2] +mla v29.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v29.4s +add v16.4s, v16.4s, v29.4s +trn1 v29.4S, v25.4S, v22.4S +trn2 v30.4S, v25.4S, v22.4S +trn1 v3.4S, v16.4S, v21.4S +trn2 v24.4S, v16.4S, v21.4S +trn2 v16.2D, v29.2D, v3.2D +trn2 v21.2D, v30.2D, v24.2D +trn1 v25.2D, v29.2D, v3.2D +trn1 v22.2D, v30.2D, v24.2D +sqrdmulh v24.4S, v16.4S, v7.4S +mul v16.4S, v16.4S,v6.4S +mla v16.4S, v24.4S, v31.s[0] +sub v24.4s, v25.4s, v16.4s +add v25.4s, v25.4s, v16.4s +sqrdmulh v16.4S, v21.4S, v7.4S +mul v21.4S, v21.4S,v6.4S +mla v21.4S, v16.4S, v31.s[0] +sub v16.4s, v22.4s, v21.4s +add v22.4s, v22.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v9.4S +mul v22.4S, v22.4S,v8.4S +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v25.4s, v22.4s +add v25.4s, v25.4s, v22.4s +sqrdmulh v22.4S, v16.4S, v10.4S +mul v16.4S, v16.4S,v5.4S +mla v16.4S, v22.4S, v31.s[0] +sub v22.4s, v24.4s, v16.4s +add v24.4s, v24.4s, 
v16.4s +str q25, [x0, #512] +str q21, [x0, #528] +str q24, [x0, #544] +str q22, [x0, #560] +ldr q22, [x17, #+1280] +ldr q24, [x17, #+1296] +ldr q21, [x17, #+1312] +ldr q25, [x17, #+1328] +ldr q16, [x17, #+1344] +ldr q30, [x17, #+1360] +ldr q3, [x17, #+1376] +ldr q29, [x17, #+1392] +ldr q10, [x0, #608] +ldr q5, [x0, #624] +ldr q9, [x0, #576] +ldr q8, [x0, #592] +sqrdmulh v7.4S, v10.4S, v24.s[0] +mul v10.4S, v10.4S,v22.s[0] +mla v10.4S, v7.4S, v31.s[0] +sub v7.4s, v9.4s, v10.4s +add v9.4s, v9.4s, v10.4s +sqrdmulh v10.4S, v5.4S, v24.s[0] +mul v5.4S, v5.4S,v22.s[0] +mla v5.4S, v10.4S, v31.s[0] +sub v10.4s, v8.4s, v5.4s +add v8.4s, v8.4s, v5.4s +sqrdmulh v5.4S, v8.4S, v24.s[1] +mul v8.4S, v8.4S,v22.s[1] +mla v8.4S, v5.4S, v31.s[0] +sub v5.4s, v9.4s, v8.4s +add v9.4s, v9.4s, v8.4s +sqrdmulh v8.4S, v10.4S, v24.s[2] +mul v10.4S, v10.4S,v22.s[2] +mla v10.4S, v8.4S, v31.s[0] +sub v8.4s, v7.4s, v10.4s +add v7.4s, v7.4s, v10.4s +trn1 v10.4S, v9.4S, v5.4S +trn2 v6.4S, v9.4S, v5.4S +trn1 v4.4S, v7.4S, v8.4S +trn2 v23.4S, v7.4S, v8.4S +trn2 v7.2D, v10.2D, v4.2D +trn2 v8.2D, v6.2D, v23.2D +trn1 v9.2D, v10.2D, v4.2D +trn1 v5.2D, v6.2D, v23.2D +sqrdmulh v23.4S, v7.4S, v25.4S +mul v7.4S, v7.4S,v21.4S +mla v7.4S, v23.4S, v31.s[0] +sub v23.4s, v9.4s, v7.4s +add v9.4s, v9.4s, v7.4s +sqrdmulh v7.4S, v8.4S, v25.4S +mul v8.4S, v8.4S,v21.4S +mla v8.4S, v7.4S, v31.s[0] +sub v7.4s, v5.4s, v8.4s +add v5.4s, v5.4s, v8.4s +sqrdmulh v8.4S, v5.4S, v30.4S +mul v5.4S, v5.4S,v16.4S +mla v5.4S, v8.4S, v31.s[0] +sub v8.4s, v9.4s, v5.4s +add v9.4s, v9.4s, v5.4s +sqrdmulh v5.4S, v7.4S, v29.4S +mul v7.4S, v7.4S,v3.4S +mla v7.4S, v5.4S, v31.s[0] +sub v5.4s, v23.4s, v7.4s +add v23.4s, v23.4s, v7.4s +str q9, [x0, #576] +str q8, [x0, #592] +str q23, [x0, #608] +str q5, [x0, #624] +ldr q5, [x17, #+1408] +ldr q23, [x17, #+1424] +ldr q8, [x17, #+1440] +ldr q9, [x17, #+1456] +ldr q7, [x17, #+1472] +ldr q6, [x17, #+1488] +ldr q4, [x17, #+1504] +ldr q10, [x17, #+1520] +ldr q29, [x0, #672] +ldr q3, [x0, #688] +ldr 
q30, [x0, #640] +ldr q16, [x0, #656] +sqrdmulh v25.4S, v29.4S, v23.s[0] +mul v29.4S, v29.4S,v5.s[0] +mla v29.4S, v25.4S, v31.s[0] +sub v25.4s, v30.4s, v29.4s +add v30.4s, v30.4s, v29.4s +sqrdmulh v29.4S, v3.4S, v23.s[0] +mul v3.4S, v3.4S,v5.s[0] +mla v3.4S, v29.4S, v31.s[0] +sub v29.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v16.4S, v23.s[1] +mul v16.4S, v16.4S,v5.s[1] +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v30.4s, v16.4s +add v30.4s, v30.4s, v16.4s +sqrdmulh v16.4S, v29.4S, v23.s[2] +mul v29.4S, v29.4S,v5.s[2] +mla v29.4S, v16.4S, v31.s[0] +sub v16.4s, v25.4s, v29.4s +add v25.4s, v25.4s, v29.4s +trn1 v29.4S, v30.4S, v3.4S +trn2 v21.4S, v30.4S, v3.4S +trn1 v24.4S, v25.4S, v16.4S +trn2 v22.4S, v25.4S, v16.4S +trn2 v25.2D, v29.2D, v24.2D +trn2 v16.2D, v21.2D, v22.2D +trn1 v30.2D, v29.2D, v24.2D +trn1 v3.2D, v21.2D, v22.2D +sqrdmulh v22.4S, v25.4S, v9.4S +mul v25.4S, v25.4S,v8.4S +mla v25.4S, v22.4S, v31.s[0] +sub v22.4s, v30.4s, v25.4s +add v30.4s, v30.4s, v25.4s +sqrdmulh v25.4S, v16.4S, v9.4S +mul v16.4S, v16.4S,v8.4S +mla v16.4S, v25.4S, v31.s[0] +sub v25.4s, v3.4s, v16.4s +add v3.4s, v3.4s, v16.4s +sqrdmulh v16.4S, v3.4S, v6.4S +mul v3.4S, v3.4S,v7.4S +mla v3.4S, v16.4S, v31.s[0] +sub v16.4s, v30.4s, v3.4s +add v30.4s, v30.4s, v3.4s +sqrdmulh v3.4S, v25.4S, v10.4S +mul v25.4S, v25.4S,v4.4S +mla v25.4S, v3.4S, v31.s[0] +sub v3.4s, v22.4s, v25.4s +add v22.4s, v22.4s, v25.4s +str q30, [x0, #640] +str q16, [x0, #656] +str q22, [x0, #672] +str q3, [x0, #688] +ldr q3, [x17, #+1536] +ldr q22, [x17, #+1552] +ldr q16, [x17, #+1568] +ldr q30, [x17, #+1584] +ldr q25, [x17, #+1600] +ldr q21, [x17, #+1616] +ldr q24, [x17, #+1632] +ldr q29, [x17, #+1648] +ldr q10, [x0, #736] +ldr q4, [x0, #752] +ldr q6, [x0, #704] +ldr q7, [x0, #720] +sqrdmulh v9.4S, v10.4S, v22.s[0] +mul v10.4S, v10.4S,v3.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v6.4s, v10.4s +add v6.4s, v6.4s, v10.4s +sqrdmulh v10.4S, v4.4S, v22.s[0] +mul v4.4S, v4.4S,v3.s[0] +mla v4.4S, v10.4S, 
v31.s[0] +sub v10.4s, v7.4s, v4.4s +add v7.4s, v7.4s, v4.4s +sqrdmulh v4.4S, v7.4S, v22.s[1] +mul v7.4S, v7.4S,v3.s[1] +mla v7.4S, v4.4S, v31.s[0] +sub v4.4s, v6.4s, v7.4s +add v6.4s, v6.4s, v7.4s +sqrdmulh v7.4S, v10.4S, v22.s[2] +mul v10.4S, v10.4S,v3.s[2] +mla v10.4S, v7.4S, v31.s[0] +sub v7.4s, v9.4s, v10.4s +add v9.4s, v9.4s, v10.4s +trn1 v10.4S, v6.4S, v4.4S +trn2 v8.4S, v6.4S, v4.4S +trn1 v23.4S, v9.4S, v7.4S +trn2 v5.4S, v9.4S, v7.4S +trn2 v9.2D, v10.2D, v23.2D +trn2 v7.2D, v8.2D, v5.2D +trn1 v6.2D, v10.2D, v23.2D +trn1 v4.2D, v8.2D, v5.2D +sqrdmulh v5.4S, v9.4S, v30.4S +mul v9.4S, v9.4S,v16.4S +mla v9.4S, v5.4S, v31.s[0] +sub v5.4s, v6.4s, v9.4s +add v6.4s, v6.4s, v9.4s +sqrdmulh v9.4S, v7.4S, v30.4S +mul v7.4S, v7.4S,v16.4S +mla v7.4S, v9.4S, v31.s[0] +sub v9.4s, v4.4s, v7.4s +add v4.4s, v4.4s, v7.4s +sqrdmulh v7.4S, v4.4S, v21.4S +mul v4.4S, v4.4S,v25.4S +mla v4.4S, v7.4S, v31.s[0] +sub v7.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +sqrdmulh v4.4S, v9.4S, v29.4S +mul v9.4S, v9.4S,v24.4S +mla v9.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v9.4s +add v5.4s, v5.4s, v9.4s +str q6, [x0, #704] +str q7, [x0, #720] +str q5, [x0, #736] +str q4, [x0, #752] +ldr q4, [x17, #+1664] +ldr q5, [x17, #+1680] +ldr q7, [x17, #+1696] +ldr q6, [x17, #+1712] +ldr q9, [x17, #+1728] +ldr q8, [x17, #+1744] +ldr q23, [x17, #+1760] +ldr q10, [x17, #+1776] +ldr q29, [x0, #800] +ldr q24, [x0, #816] +ldr q21, [x0, #768] +ldr q25, [x0, #784] +sqrdmulh v30.4S, v29.4S, v5.s[0] +mul v29.4S, v29.4S,v4.s[0] +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v21.4s, v29.4s +add v21.4s, v21.4s, v29.4s +sqrdmulh v29.4S, v24.4S, v5.s[0] +mul v24.4S, v24.4S,v4.s[0] +mla v24.4S, v29.4S, v31.s[0] +sub v29.4s, v25.4s, v24.4s +add v25.4s, v25.4s, v24.4s +sqrdmulh v24.4S, v25.4S, v5.s[1] +mul v25.4S, v25.4S,v4.s[1] +mla v25.4S, v24.4S, v31.s[0] +sub v24.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v29.4S, v5.s[2] +mul v29.4S, v29.4S,v4.s[2] +mla v29.4S, v25.4S, v31.s[0] +sub v25.4s, v30.4s, 
v29.4s +add v30.4s, v30.4s, v29.4s +trn1 v29.4S, v21.4S, v24.4S +trn2 v16.4S, v21.4S, v24.4S +trn1 v22.4S, v30.4S, v25.4S +trn2 v3.4S, v30.4S, v25.4S +trn2 v30.2D, v29.2D, v22.2D +trn2 v25.2D, v16.2D, v3.2D +trn1 v21.2D, v29.2D, v22.2D +trn1 v24.2D, v16.2D, v3.2D +sqrdmulh v3.4S, v30.4S, v6.4S +mul v30.4S, v30.4S,v7.4S +mla v30.4S, v3.4S, v31.s[0] +sub v3.4s, v21.4s, v30.4s +add v21.4s, v21.4s, v30.4s +sqrdmulh v30.4S, v25.4S, v6.4S +mul v25.4S, v25.4S,v7.4S +mla v25.4S, v30.4S, v31.s[0] +sub v30.4s, v24.4s, v25.4s +add v24.4s, v24.4s, v25.4s +sqrdmulh v25.4S, v24.4S, v8.4S +mul v24.4S, v24.4S,v9.4S +mla v24.4S, v25.4S, v31.s[0] +sub v25.4s, v21.4s, v24.4s +add v21.4s, v21.4s, v24.4s +sqrdmulh v24.4S, v30.4S, v10.4S +mul v30.4S, v30.4S,v23.4S +mla v30.4S, v24.4S, v31.s[0] +sub v24.4s, v3.4s, v30.4s +add v3.4s, v3.4s, v30.4s +str q21, [x0, #768] +str q25, [x0, #784] +str q3, [x0, #800] +str q24, [x0, #816] +ldr q24, [x17, #+1792] +ldr q3, [x17, #+1808] +ldr q25, [x17, #+1824] +ldr q21, [x17, #+1840] +ldr q30, [x17, #+1856] +ldr q16, [x17, #+1872] +ldr q22, [x17, #+1888] +ldr q29, [x17, #+1904] +ldr q10, [x0, #864] +ldr q23, [x0, #880] +ldr q8, [x0, #832] +ldr q9, [x0, #848] +sqrdmulh v6.4S, v10.4S, v3.s[0] +mul v10.4S, v10.4S,v24.s[0] +mla v10.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v10.4s +add v8.4s, v8.4s, v10.4s +sqrdmulh v10.4S, v23.4S, v3.s[0] +mul v23.4S, v23.4S,v24.s[0] +mla v23.4S, v10.4S, v31.s[0] +sub v10.4s, v9.4s, v23.4s +add v9.4s, v9.4s, v23.4s +sqrdmulh v23.4S, v9.4S, v3.s[1] +mul v9.4S, v9.4S,v24.s[1] +mla v9.4S, v23.4S, v31.s[0] +sub v23.4s, v8.4s, v9.4s +add v8.4s, v8.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v3.s[2] +mul v10.4S, v10.4S,v24.s[2] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v6.4s, v10.4s +add v6.4s, v6.4s, v10.4s +trn1 v10.4S, v8.4S, v23.4S +trn2 v7.4S, v8.4S, v23.4S +trn1 v5.4S, v6.4S, v9.4S +trn2 v4.4S, v6.4S, v9.4S +trn2 v6.2D, v10.2D, v5.2D +trn2 v9.2D, v7.2D, v4.2D +trn1 v8.2D, v10.2D, v5.2D +trn1 v23.2D, v7.2D, v4.2D +sqrdmulh v4.4S, 
v6.4S, v21.4S +mul v6.4S, v6.4S,v25.4S +mla v6.4S, v4.4S, v31.s[0] +sub v4.4s, v8.4s, v6.4s +add v8.4s, v8.4s, v6.4s +sqrdmulh v6.4S, v9.4S, v21.4S +mul v9.4S, v9.4S,v25.4S +mla v9.4S, v6.4S, v31.s[0] +sub v6.4s, v23.4s, v9.4s +add v23.4s, v23.4s, v9.4s +sqrdmulh v9.4S, v23.4S, v16.4S +mul v23.4S, v23.4S,v30.4S +mla v23.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v23.4s +add v8.4s, v8.4s, v23.4s +sqrdmulh v23.4S, v6.4S, v29.4S +mul v6.4S, v6.4S,v22.4S +mla v6.4S, v23.4S, v31.s[0] +sub v23.4s, v4.4s, v6.4s +add v4.4s, v4.4s, v6.4s +str q8, [x0, #832] +str q9, [x0, #848] +str q4, [x0, #864] +str q23, [x0, #880] +ldr q23, [x17, #+1920] +ldr q4, [x17, #+1936] +ldr q9, [x17, #+1952] +ldr q8, [x17, #+1968] +ldr q6, [x17, #+1984] +ldr q7, [x17, #+2000] +ldr q5, [x17, #+2016] +ldr q10, [x17, #+2032] +ldr q29, [x0, #928] +ldr q22, [x0, #944] +ldr q16, [x0, #896] +ldr q30, [x0, #912] +sqrdmulh v21.4S, v29.4S, v4.s[0] +mul v29.4S, v29.4S,v23.s[0] +mla v29.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v29.4s +add v16.4s, v16.4s, v29.4s +sqrdmulh v29.4S, v22.4S, v4.s[0] +mul v22.4S, v22.4S,v23.s[0] +mla v22.4S, v29.4S, v31.s[0] +sub v29.4s, v30.4s, v22.4s +add v30.4s, v30.4s, v22.4s +sqrdmulh v22.4S, v30.4S, v4.s[1] +mul v30.4S, v30.4S,v23.s[1] +mla v30.4S, v22.4S, v31.s[0] +sub v22.4s, v16.4s, v30.4s +add v16.4s, v16.4s, v30.4s +sqrdmulh v30.4S, v29.4S, v4.s[2] +mul v29.4S, v29.4S,v23.s[2] +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v21.4s, v29.4s +add v21.4s, v21.4s, v29.4s +trn1 v29.4S, v16.4S, v22.4S +trn2 v25.4S, v16.4S, v22.4S +trn1 v3.4S, v21.4S, v30.4S +trn2 v24.4S, v21.4S, v30.4S +trn2 v21.2D, v29.2D, v3.2D +trn2 v30.2D, v25.2D, v24.2D +trn1 v16.2D, v29.2D, v3.2D +trn1 v22.2D, v25.2D, v24.2D +sqrdmulh v24.4S, v21.4S, v8.4S +mul v21.4S, v21.4S,v9.4S +mla v21.4S, v24.4S, v31.s[0] +sub v24.4s, v16.4s, v21.4s +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v30.4S, v8.4S +mul v30.4S, v30.4S,v9.4S +mla v30.4S, v21.4S, v31.s[0] +sub v21.4s, v22.4s, v30.4s +add v22.4s, v22.4s, v30.4s 
+sqrdmulh v30.4S, v22.4S, v7.4S +mul v22.4S, v22.4S,v6.4S +mla v22.4S, v30.4S, v31.s[0] +sub v30.4s, v16.4s, v22.4s +add v16.4s, v16.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v10.4S +mul v21.4S, v21.4S,v5.4S +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v24.4s, v21.4s +add v24.4s, v24.4s, v21.4s +str q16, [x0, #896] +str q30, [x0, #912] +str q24, [x0, #928] +str q22, [x0, #944] +ldr q22, [x17, #+2048] +ldr q24, [x17, #+2064] +ldr q30, [x17, #+2080] +ldr q16, [x17, #+2096] +ldr q21, [x17, #+2112] +ldr q25, [x17, #+2128] +ldr q3, [x17, #+2144] +ldr q29, [x17, #+2160] +ldr q10, [x0, #992] +ldr q5, [x0, #1008] +ldr q7, [x0, #960] +ldr q6, [x0, #976] +sqrdmulh v8.4S, v10.4S, v24.s[0] +mul v10.4S, v10.4S,v22.s[0] +mla v10.4S, v8.4S, v31.s[0] +sub v8.4s, v7.4s, v10.4s +add v7.4s, v7.4s, v10.4s +sqrdmulh v10.4S, v5.4S, v24.s[0] +mul v5.4S, v5.4S,v22.s[0] +mla v5.4S, v10.4S, v31.s[0] +sub v10.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v6.4S, v24.s[1] +mul v6.4S, v6.4S,v22.s[1] +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v7.4s, v6.4s +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v10.4S, v24.s[2] +mul v10.4S, v10.4S,v22.s[2] +mla v10.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v10.4s +add v8.4s, v8.4s, v10.4s +trn1 v10.4S, v7.4S, v5.4S +trn2 v9.4S, v7.4S, v5.4S +trn1 v4.4S, v8.4S, v6.4S +trn2 v23.4S, v8.4S, v6.4S +trn2 v8.2D, v10.2D, v4.2D +trn2 v6.2D, v9.2D, v23.2D +trn1 v7.2D, v10.2D, v4.2D +trn1 v5.2D, v9.2D, v23.2D +sqrdmulh v23.4S, v8.4S, v16.4S +mul v8.4S, v8.4S,v30.4S +mla v8.4S, v23.4S, v31.s[0] +sub v23.4s, v7.4s, v8.4s +add v7.4s, v7.4s, v8.4s +sqrdmulh v8.4S, v6.4S, v16.4S +mul v6.4S, v6.4S,v30.4S +mla v6.4S, v8.4S, v31.s[0] +sub v8.4s, v5.4s, v6.4s +add v5.4s, v5.4s, v6.4s +sqrdmulh v6.4S, v5.4S, v25.4S +mul v5.4S, v5.4S,v21.4S +mla v5.4S, v6.4S, v31.s[0] +sub v6.4s, v7.4s, v5.4s +add v7.4s, v7.4s, v5.4s +sqrdmulh v5.4S, v8.4S, v29.4S +mul v8.4S, v8.4S,v3.4S +mla v8.4S, v5.4S, v31.s[0] +sub v5.4s, v23.4s, v8.4s +add v23.4s, v23.4s, v8.4s +str q7, [x0, #960] +str q6, 
[x0, #976] +str q23, [x0, #992] +str q5, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_6_0.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_6_0.s new file mode 100644 index 0000000..b0b458f --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_6_0.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 
11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // 
Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 
1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 
23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 
19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 
// Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 
1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 +.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // 
Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 
7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // 
Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_6_0 +.global _ntt_u32_full_neon_asm_var_4_4_6_0 +ntt_u32_full_neon_asm_var_4_4_6_0: +_ntt_u32_full_neon_asm_var_4_4_6_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x0, #800] +ldr q29, [x0, #864] +ldr q28, [x0, #928] +ldr q27, [x0, #992] +ldr q26, [x0, #288] +ldr q25, [x0, #352] +ldr q24, [x0, #416] +ldr q23, [x0, #480] +ldr q22, [x17, #+0] +ldr q21, [x17, #+16] +ldr q20, [x17, #+32] +ldr q19, [x17, #+48] +ldr q18, [x17, #+64] +ldr q17, [x17, #+80] +ldr q16, [x17, #+96] +ldr q3, [x17, #+112] +sqrdmulh v2.4S, v30.4S, v21.s[0] +ldr q1, [x0, #544] +ldr q0, [x0, #608] +mul v30.4S, v30.4S,v22.s[0] +ldr q15, [x0, #672] +ldr q14, [x0, #736] +sqrdmulh v13.4S, v29.4S, v21.s[0] +ldr q12, [x0, #32] +mul v29.4S, v29.4S,v22.s[0] +ldr q11, [x0, #96] 
+sqrdmulh v10.4S, v28.4S, v21.s[0] +ldr q9, [x0, #160] +mul v28.4S, v28.4S,v22.s[0] +ldr q8, [x0, #224] +sqrdmulh v7.4S, v27.4S, v21.s[0] +mul v27.4S, v27.4S,v22.s[0] +mla v30.4S, v2.4S, v31.s[0] +mla v29.4S, v13.4S, v31.s[0] +mla v28.4S, v10.4S, v31.s[0] +mla v27.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v1.4S, v21.s[0] +mul v1.4S, v1.4S,v22.s[0] +sub v10.4s, v26.4s, v30.4s +add v26.4s, v26.4s, v30.4s +sqrdmulh v30.4S, v0.4S, v21.s[0] +mul v0.4S, v0.4S,v22.s[0] +sub v13.4s, v25.4s, v29.4s +add v25.4s, v25.4s, v29.4s +sqrdmulh v29.4S, v15.4S, v21.s[0] +mul v15.4S, v15.4S,v22.s[0] +sub v2.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +sqrdmulh v28.4S, v14.4S, v21.s[0] +mul v14.4S, v14.4S,v22.s[0] +mla v1.4S, v7.4S, v31.s[0] +sub v7.4s, v23.4s, v27.4s +mla v0.4S, v30.4S, v31.s[0] +add v23.4s, v23.4s, v27.4s +mla v15.4S, v29.4S, v31.s[0] +mla v14.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v24.4S, v21.s[1] +mul v24.4S, v24.4S,v22.s[1] +sub v29.4s, v12.4s, v1.4s +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v23.4S, v21.s[1] +mul v23.4S, v23.4S,v22.s[1] +sub v27.4s, v11.4s, v0.4s +add v11.4s, v11.4s, v0.4s +sqrdmulh v0.4S, v26.4S, v21.s[1] +mul v26.4S, v26.4S,v22.s[1] +sub v30.4s, v9.4s, v15.4s +add v9.4s, v9.4s, v15.4s +sqrdmulh v15.4S, v25.4S, v21.s[1] +mul v25.4S, v25.4S,v22.s[1] +mla v24.4S, v28.4S, v31.s[0] +sub v28.4s, v8.4s, v14.4s +add v8.4s, v8.4s, v14.4s +mla v23.4S, v1.4S, v31.s[0] +mla v26.4S, v0.4S, v31.s[0] +mla v25.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v2.4S, v21.s[2] +mul v2.4S, v2.4S,v22.s[2] +sub v0.4s, v9.4s, v24.4s +add v9.4s, v9.4s, v24.4s +sqrdmulh v24.4S, v7.4S, v21.s[2] +mul v7.4S, v7.4S,v22.s[2] +sub v1.4s, v8.4s, v23.4s +add v8.4s, v8.4s, v23.4s +sqrdmulh v23.4S, v10.4S, v21.s[2] +mul v10.4S, v10.4S,v22.s[2] +sub v14.4s, v12.4s, v26.4s +add v12.4s, v12.4s, v26.4s +sqrdmulh v26.4S, v13.4S, v21.s[2] +mul v13.4S, v13.4S,v22.s[2] +mla v2.4S, v15.4S, v31.s[0] +sub v15.4s, v11.4s, v25.4s +mla v7.4S, v24.4S, v31.s[0] +add v11.4s, v11.4s, v25.4s +mla v10.4S, 
v23.4S, v31.s[0] +mla v13.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v9.4S, v19.s[0] +mul v9.4S, v9.4S,v20.s[0] +sub v23.4s, v30.4s, v2.4s +add v30.4s, v30.4s, v2.4s +sqrdmulh v2.4S, v8.4S, v19.s[0] +mul v8.4S, v8.4S,v20.s[0] +sub v25.4s, v28.4s, v7.4s +add v28.4s, v28.4s, v7.4s +sqrdmulh v7.4S, v0.4S, v19.s[1] +mul v0.4S, v0.4S,v20.s[1] +sub v24.4s, v29.4s, v10.4s +add v29.4s, v29.4s, v10.4s +sqrdmulh v10.4S, v1.4S, v19.s[1] +mul v1.4S, v1.4S,v20.s[1] +mla v9.4S, v26.4S, v31.s[0] +sub v26.4s, v27.4s, v13.4s +add v27.4s, v27.4s, v13.4s +mla v8.4S, v2.4S, v31.s[0] +mla v0.4S, v7.4S, v31.s[0] +mla v1.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v30.4S, v19.s[2] +mul v30.4S, v30.4S,v20.s[2] +sub v7.4s, v12.4s, v9.4s +add v12.4s, v12.4s, v9.4s +sqrdmulh v9.4S, v28.4S, v19.s[2] +mul v28.4S, v28.4S,v20.s[2] +sub v2.4s, v11.4s, v8.4s +add v11.4s, v11.4s, v8.4s +sqrdmulh v8.4S, v23.4S, v19.s[3] +mul v23.4S, v23.4S,v20.s[3] +sub v13.4s, v14.4s, v0.4s +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v25.4S, v19.s[3] +mul v25.4S, v25.4S,v20.s[3] +mla v30.4S, v10.4S, v31.s[0] +sub v10.4s, v15.4s, v1.4s +mla v28.4S, v9.4S, v31.s[0] +add v15.4s, v15.4s, v1.4s +mla v23.4S, v8.4S, v31.s[0] +mla v25.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v11.4S, v17.s[0] +mul v11.4S, v11.4S,v18.s[0] +sub v8.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +sqrdmulh v30.4S, v2.4S, v17.s[1] +mul v2.4S, v2.4S,v18.s[1] +sub v1.4s, v27.4s, v28.4s +add v27.4s, v27.4s, v28.4s +sqrdmulh v28.4S, v15.4S, v17.s[2] +mul v15.4S, v15.4S,v18.s[2] +sub v9.4s, v24.4s, v23.4s +add v24.4s, v24.4s, v23.4s +sqrdmulh v23.4S, v10.4S, v17.s[3] +mul v10.4S, v10.4S,v18.s[3] +mla v11.4S, v0.4S, v31.s[0] +sub v0.4s, v26.4s, v25.4s +add v26.4s, v26.4s, v25.4s +mla v2.4S, v30.4S, v31.s[0] +mla v15.4S, v28.4S, v31.s[0] +mla v10.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v27.4S, v3.s[0] +mul v27.4S, v27.4S,v16.s[0] +sub v28.4s, v12.4s, v11.4s +add v12.4s, v12.4s, v11.4s +str q12, [x0, #32] +str q28, [x0, #96] +sqrdmulh v28.4S, v1.4S, v3.s[1] +mul 
v1.4S, v1.4S,v16.s[1] +ldr q12, [x0, #816] +ldr q11, [x0, #880] +sub v30.4s, v7.4s, v2.4s +add v7.4s, v7.4s, v2.4s +str q7, [x0, #160] +str q30, [x0, #224] +sqrdmulh v30.4S, v26.4S, v3.s[2] +mul v26.4S, v26.4S,v16.s[2] +ldr q7, [x0, #944] +ldr q2, [x0, #1008] +sub v25.4s, v14.4s, v15.4s +add v14.4s, v14.4s, v15.4s +str q14, [x0, #288] +str q25, [x0, #352] +sqrdmulh v25.4S, v0.4S, v3.s[3] +mul v0.4S, v0.4S,v16.s[3] +ldr q14, [x0, #304] +ldr q15, [x0, #368] +mla v27.4S, v23.4S, v31.s[0] +sub v23.4s, v13.4s, v10.4s +mla v1.4S, v28.4S, v31.s[0] +add v13.4s, v13.4s, v10.4s +str q13, [x0, #416] +str q23, [x0, #480] +mla v26.4S, v30.4S, v31.s[0] +ldr q30, [x0, #432] +ldr q23, [x0, #496] +mla v0.4S, v25.4S, v31.s[0] +sub v25.4s, v29.4s, v27.4s +add v29.4s, v29.4s, v27.4s +sub v27.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +sub v1.4s, v24.4s, v26.4s +add v24.4s, v24.4s, v26.4s +str q29, [x0, #544] +str q25, [x0, #608] +str q8, [x0, #672] +str q27, [x0, #736] +str q24, [x0, #800] +str q1, [x0, #864] +sqrdmulh v1.4S, v12.4S, v21.s[0] +ldr q24, [x0, #560] +ldr q27, [x0, #624] +mul v12.4S, v12.4S,v22.s[0] +sub v8.4s, v9.4s, v0.4s +add v9.4s, v9.4s, v0.4s +str q9, [x0, #928] +str q8, [x0, #992] +ldr q8, [x0, #688] +ldr q9, [x0, #752] +sqrdmulh v0.4S, v11.4S, v21.s[0] +ldr q25, [x0, #48] +mul v11.4S, v11.4S,v22.s[0] +ldr q29, [x0, #112] +sqrdmulh v26.4S, v7.4S, v21.s[0] +ldr q13, [x0, #176] +mul v7.4S, v7.4S,v22.s[0] +ldr q10, [x0, #240] +sqrdmulh v28.4S, v2.4S, v21.s[0] +mul v2.4S, v2.4S,v22.s[0] +mla v12.4S, v1.4S, v31.s[0] +mla v11.4S, v0.4S, v31.s[0] +mla v7.4S, v26.4S, v31.s[0] +mla v2.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v24.4S, v21.s[0] +mul v24.4S, v24.4S,v22.s[0] +sub v26.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v27.4S, v21.s[0] +mul v27.4S, v27.4S,v22.s[0] +sub v0.4s, v15.4s, v11.4s +add v15.4s, v15.4s, v11.4s +sqrdmulh v11.4S, v8.4S, v21.s[0] +mul v8.4S, v8.4S,v22.s[0] +sub v1.4s, v30.4s, v7.4s +add v30.4s, v30.4s, v7.4s +sqrdmulh v7.4S, v9.4S, 
v21.s[0] +mul v9.4S, v9.4S,v22.s[0] +mla v24.4S, v28.4S, v31.s[0] +sub v28.4s, v23.4s, v2.4s +mla v27.4S, v12.4S, v31.s[0] +add v23.4s, v23.4s, v2.4s +mla v8.4S, v11.4S, v31.s[0] +mla v9.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v30.4S, v21.s[1] +mul v30.4S, v30.4S,v22.s[1] +sub v11.4s, v25.4s, v24.4s +add v25.4s, v25.4s, v24.4s +sqrdmulh v24.4S, v23.4S, v21.s[1] +mul v23.4S, v23.4S,v22.s[1] +sub v2.4s, v29.4s, v27.4s +add v29.4s, v29.4s, v27.4s +sqrdmulh v27.4S, v14.4S, v21.s[1] +mul v14.4S, v14.4S,v22.s[1] +sub v12.4s, v13.4s, v8.4s +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v15.4S, v21.s[1] +mul v15.4S, v15.4S,v22.s[1] +mla v30.4S, v7.4S, v31.s[0] +sub v7.4s, v10.4s, v9.4s +add v10.4s, v10.4s, v9.4s +mla v23.4S, v24.4S, v31.s[0] +mla v14.4S, v27.4S, v31.s[0] +mla v15.4S, v8.4S, v31.s[0] +sqrdmulh v8.4S, v1.4S, v21.s[2] +mul v1.4S, v1.4S,v22.s[2] +sub v27.4s, v13.4s, v30.4s +add v13.4s, v13.4s, v30.4s +sqrdmulh v30.4S, v28.4S, v21.s[2] +mul v28.4S, v28.4S,v22.s[2] +sub v24.4s, v10.4s, v23.4s +add v10.4s, v10.4s, v23.4s +sqrdmulh v23.4S, v26.4S, v21.s[2] +mul v26.4S, v26.4S,v22.s[2] +sub v9.4s, v25.4s, v14.4s +add v25.4s, v25.4s, v14.4s +sqrdmulh v14.4S, v0.4S, v21.s[2] +mul v0.4S, v0.4S,v22.s[2] +mla v1.4S, v8.4S, v31.s[0] +sub v8.4s, v29.4s, v15.4s +mla v28.4S, v30.4S, v31.s[0] +add v29.4s, v29.4s, v15.4s +mla v26.4S, v23.4S, v31.s[0] +mla v0.4S, v14.4S, v31.s[0] +sqrdmulh v14.4S, v13.4S, v19.s[0] +mul v13.4S, v13.4S,v20.s[0] +sub v23.4s, v12.4s, v1.4s +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v10.4S, v19.s[0] +mul v10.4S, v10.4S,v20.s[0] +sub v15.4s, v7.4s, v28.4s +add v7.4s, v7.4s, v28.4s +sqrdmulh v28.4S, v27.4S, v19.s[1] +mul v27.4S, v27.4S,v20.s[1] +sub v30.4s, v11.4s, v26.4s +add v11.4s, v11.4s, v26.4s +sqrdmulh v26.4S, v24.4S, v19.s[1] +mul v24.4S, v24.4S,v20.s[1] +mla v13.4S, v14.4S, v31.s[0] +sub v14.4s, v2.4s, v0.4s +add v2.4s, v2.4s, v0.4s +mla v10.4S, v1.4S, v31.s[0] +mla v27.4S, v28.4S, v31.s[0] +mla v24.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v12.4S, 
v19.s[2] +mul v12.4S, v12.4S,v20.s[2] +sub v28.4s, v25.4s, v13.4s +add v25.4s, v25.4s, v13.4s +sqrdmulh v13.4S, v7.4S, v19.s[2] +mul v7.4S, v7.4S,v20.s[2] +sub v1.4s, v29.4s, v10.4s +add v29.4s, v29.4s, v10.4s +sqrdmulh v10.4S, v23.4S, v19.s[3] +mul v23.4S, v23.4S,v20.s[3] +sub v0.4s, v9.4s, v27.4s +add v9.4s, v9.4s, v27.4s +sqrdmulh v27.4S, v15.4S, v19.s[3] +mul v15.4S, v15.4S,v20.s[3] +mla v12.4S, v26.4S, v31.s[0] +sub v26.4s, v8.4s, v24.4s +mla v7.4S, v13.4S, v31.s[0] +add v8.4s, v8.4s, v24.4s +mla v23.4S, v10.4S, v31.s[0] +mla v15.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v29.4S, v17.s[0] +mul v29.4S, v29.4S,v18.s[0] +sub v10.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +sqrdmulh v12.4S, v1.4S, v17.s[1] +mul v1.4S, v1.4S,v18.s[1] +sub v24.4s, v2.4s, v7.4s +add v2.4s, v2.4s, v7.4s +sqrdmulh v7.4S, v8.4S, v17.s[2] +mul v8.4S, v8.4S,v18.s[2] +sub v13.4s, v30.4s, v23.4s +add v30.4s, v30.4s, v23.4s +sqrdmulh v23.4S, v26.4S, v17.s[3] +mul v26.4S, v26.4S,v18.s[3] +mla v29.4S, v27.4S, v31.s[0] +sub v27.4s, v14.4s, v15.4s +add v14.4s, v14.4s, v15.4s +mla v1.4S, v12.4S, v31.s[0] +mla v8.4S, v7.4S, v31.s[0] +mla v26.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v2.4S, v3.s[0] +mul v2.4S, v2.4S,v16.s[0] +sub v7.4s, v25.4s, v29.4s +add v25.4s, v25.4s, v29.4s +str q25, [x0, #48] +str q7, [x0, #112] +sqrdmulh v7.4S, v24.4S, v3.s[1] +mul v24.4S, v24.4S,v16.s[1] +ldr q25, [x0, #768] +ldr q29, [x0, #832] +sub v12.4s, v28.4s, v1.4s +add v28.4s, v28.4s, v1.4s +str q28, [x0, #176] +str q12, [x0, #240] +sqrdmulh v12.4S, v14.4S, v3.s[2] +mul v14.4S, v14.4S,v16.s[2] +ldr q28, [x0, #896] +ldr q1, [x0, #960] +sub v15.4s, v9.4s, v8.4s +add v9.4s, v9.4s, v8.4s +str q9, [x0, #304] +str q15, [x0, #368] +sqrdmulh v15.4S, v27.4S, v3.s[3] +mul v27.4S, v27.4S,v16.s[3] +ldr q9, [x0, #256] +ldr q8, [x0, #320] +mla v2.4S, v23.4S, v31.s[0] +sub v23.4s, v0.4s, v26.4s +mla v24.4S, v7.4S, v31.s[0] +add v0.4s, v0.4s, v26.4s +str q0, [x0, #432] +str q23, [x0, #496] +mla v14.4S, v12.4S, v31.s[0] +ldr q12, [x0, 
#384] +ldr q23, [x0, #448] +mla v27.4S, v15.4S, v31.s[0] +sub v15.4s, v11.4s, v2.4s +add v11.4s, v11.4s, v2.4s +sub v2.4s, v10.4s, v24.4s +add v10.4s, v10.4s, v24.4s +sub v24.4s, v30.4s, v14.4s +add v30.4s, v30.4s, v14.4s +str q11, [x0, #560] +str q15, [x0, #624] +str q10, [x0, #688] +str q2, [x0, #752] +str q30, [x0, #816] +str q24, [x0, #880] +sqrdmulh v24.4S, v25.4S, v21.s[0] +ldr q30, [x0, #512] +ldr q2, [x0, #576] +mul v25.4S, v25.4S,v22.s[0] +sub v10.4s, v13.4s, v27.4s +add v13.4s, v13.4s, v27.4s +str q13, [x0, #944] +str q10, [x0, #1008] +ldr q10, [x0, #640] +ldr q13, [x0, #704] +sqrdmulh v27.4S, v29.4S, v21.s[0] +ldr q15, [x0, #0] +mul v29.4S, v29.4S,v22.s[0] +ldr q11, [x0, #64] +sqrdmulh v14.4S, v28.4S, v21.s[0] +ldr q0, [x0, #128] +mul v28.4S, v28.4S,v22.s[0] +ldr q26, [x0, #192] +sqrdmulh v7.4S, v1.4S, v21.s[0] +mul v1.4S, v1.4S,v22.s[0] +mla v25.4S, v24.4S, v31.s[0] +mla v29.4S, v27.4S, v31.s[0] +mla v28.4S, v14.4S, v31.s[0] +mla v1.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v30.4S, v21.s[0] +mul v30.4S, v30.4S,v22.s[0] +sub v14.4s, v9.4s, v25.4s +add v9.4s, v9.4s, v25.4s +sqrdmulh v25.4S, v2.4S, v21.s[0] +mul v2.4S, v2.4S,v22.s[0] +sub v27.4s, v8.4s, v29.4s +add v8.4s, v8.4s, v29.4s +sqrdmulh v29.4S, v10.4S, v21.s[0] +mul v10.4S, v10.4S,v22.s[0] +sub v24.4s, v12.4s, v28.4s +add v12.4s, v12.4s, v28.4s +sqrdmulh v28.4S, v13.4S, v21.s[0] +mul v13.4S, v13.4S,v22.s[0] +mla v30.4S, v7.4S, v31.s[0] +sub v7.4s, v23.4s, v1.4s +mla v2.4S, v25.4S, v31.s[0] +add v23.4s, v23.4s, v1.4s +mla v10.4S, v29.4S, v31.s[0] +mla v13.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v12.4S, v21.s[1] +mul v12.4S, v12.4S,v22.s[1] +sub v29.4s, v15.4s, v30.4s +add v15.4s, v15.4s, v30.4s +sqrdmulh v30.4S, v23.4S, v21.s[1] +mul v23.4S, v23.4S,v22.s[1] +sub v1.4s, v11.4s, v2.4s +add v11.4s, v11.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v21.s[1] +mul v9.4S, v9.4S,v22.s[1] +sub v25.4s, v0.4s, v10.4s +add v0.4s, v0.4s, v10.4s +sqrdmulh v10.4S, v8.4S, v21.s[1] +mul v8.4S, v8.4S,v22.s[1] +mla v12.4S, v28.4S, 
v31.s[0] +sub v28.4s, v26.4s, v13.4s +add v26.4s, v26.4s, v13.4s +mla v23.4S, v30.4S, v31.s[0] +mla v9.4S, v2.4S, v31.s[0] +mla v8.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v24.4S, v21.s[2] +mul v24.4S, v24.4S,v22.s[2] +sub v2.4s, v0.4s, v12.4s +add v0.4s, v0.4s, v12.4s +sqrdmulh v12.4S, v7.4S, v21.s[2] +mul v7.4S, v7.4S,v22.s[2] +sub v30.4s, v26.4s, v23.4s +add v26.4s, v26.4s, v23.4s +sqrdmulh v23.4S, v14.4S, v21.s[2] +mul v14.4S, v14.4S,v22.s[2] +sub v13.4s, v15.4s, v9.4s +add v15.4s, v15.4s, v9.4s +sqrdmulh v9.4S, v27.4S, v21.s[2] +mul v27.4S, v27.4S,v22.s[2] +mla v24.4S, v10.4S, v31.s[0] +sub v10.4s, v11.4s, v8.4s +mla v7.4S, v12.4S, v31.s[0] +add v11.4s, v11.4s, v8.4s +mla v14.4S, v23.4S, v31.s[0] +mla v27.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v0.4S, v19.s[0] +mul v0.4S, v0.4S,v20.s[0] +sub v23.4s, v25.4s, v24.4s +add v25.4s, v25.4s, v24.4s +sqrdmulh v24.4S, v26.4S, v19.s[0] +mul v26.4S, v26.4S,v20.s[0] +sub v8.4s, v28.4s, v7.4s +add v28.4s, v28.4s, v7.4s +sqrdmulh v7.4S, v2.4S, v19.s[1] +mul v2.4S, v2.4S,v20.s[1] +sub v12.4s, v29.4s, v14.4s +add v29.4s, v29.4s, v14.4s +sqrdmulh v14.4S, v30.4S, v19.s[1] +mul v30.4S, v30.4S,v20.s[1] +mla v0.4S, v9.4S, v31.s[0] +sub v9.4s, v1.4s, v27.4s +add v1.4s, v1.4s, v27.4s +mla v26.4S, v24.4S, v31.s[0] +mla v2.4S, v7.4S, v31.s[0] +mla v30.4S, v14.4S, v31.s[0] +sqrdmulh v14.4S, v25.4S, v19.s[2] +mul v25.4S, v25.4S,v20.s[2] +sub v7.4s, v15.4s, v0.4s +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v28.4S, v19.s[2] +mul v28.4S, v28.4S,v20.s[2] +sub v24.4s, v11.4s, v26.4s +add v11.4s, v11.4s, v26.4s +sqrdmulh v26.4S, v23.4S, v19.s[3] +mul v23.4S, v23.4S,v20.s[3] +sub v27.4s, v13.4s, v2.4s +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v8.4S, v19.s[3] +mul v8.4S, v8.4S,v20.s[3] +mla v25.4S, v14.4S, v31.s[0] +sub v14.4s, v10.4s, v30.4s +mla v28.4S, v0.4S, v31.s[0] +add v10.4s, v10.4s, v30.4s +mla v23.4S, v26.4S, v31.s[0] +mla v8.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v11.4S, v17.s[0] +mul v11.4S, v11.4S,v18.s[0] +sub v26.4s, v29.4s, v25.4s 
+add v29.4s, v29.4s, v25.4s +sqrdmulh v25.4S, v24.4S, v17.s[1] +mul v24.4S, v24.4S,v18.s[1] +sub v30.4s, v1.4s, v28.4s +add v1.4s, v1.4s, v28.4s +sqrdmulh v28.4S, v10.4S, v17.s[2] +mul v10.4S, v10.4S,v18.s[2] +sub v0.4s, v12.4s, v23.4s +add v12.4s, v12.4s, v23.4s +sqrdmulh v23.4S, v14.4S, v17.s[3] +mul v14.4S, v14.4S,v18.s[3] +mla v11.4S, v2.4S, v31.s[0] +sub v2.4s, v9.4s, v8.4s +add v9.4s, v9.4s, v8.4s +mla v24.4S, v25.4S, v31.s[0] +mla v10.4S, v28.4S, v31.s[0] +mla v14.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v1.4S, v3.s[0] +mul v1.4S, v1.4S,v16.s[0] +sub v28.4s, v15.4s, v11.4s +add v15.4s, v15.4s, v11.4s +str q15, [x0, #0] +str q28, [x0, #64] +sqrdmulh v28.4S, v30.4S, v3.s[1] +mul v30.4S, v30.4S,v16.s[1] +ldr q15, [x0, #784] +ldr q11, [x0, #848] +sub v25.4s, v7.4s, v24.4s +add v7.4s, v7.4s, v24.4s +str q7, [x0, #128] +str q25, [x0, #192] +sqrdmulh v25.4S, v9.4S, v3.s[2] +mul v9.4S, v9.4S,v16.s[2] +ldr q7, [x0, #912] +ldr q24, [x0, #976] +sub v8.4s, v13.4s, v10.4s +add v13.4s, v13.4s, v10.4s +str q13, [x0, #256] +str q8, [x0, #320] +sqrdmulh v8.4S, v2.4S, v3.s[3] +mul v2.4S, v2.4S,v16.s[3] +ldr q13, [x0, #272] +ldr q10, [x0, #336] +mla v1.4S, v23.4S, v31.s[0] +sub v23.4s, v27.4s, v14.4s +mla v30.4S, v28.4S, v31.s[0] +add v27.4s, v27.4s, v14.4s +str q27, [x0, #384] +str q23, [x0, #448] +mla v9.4S, v25.4S, v31.s[0] +ldr q25, [x0, #400] +ldr q23, [x0, #464] +mla v2.4S, v8.4S, v31.s[0] +sub v8.4s, v29.4s, v1.4s +add v29.4s, v29.4s, v1.4s +sub v1.4s, v26.4s, v30.4s +add v26.4s, v26.4s, v30.4s +sub v30.4s, v12.4s, v9.4s +add v12.4s, v12.4s, v9.4s +str q29, [x0, #512] +str q8, [x0, #576] +str q26, [x0, #640] +str q1, [x0, #704] +str q12, [x0, #768] +str q30, [x0, #832] +sqrdmulh v30.4S, v15.4S, v21.s[0] +ldr q12, [x0, #528] +ldr q1, [x0, #592] +mul v15.4S, v15.4S,v22.s[0] +sub v26.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +str q0, [x0, #896] +str q26, [x0, #960] +ldr q26, [x0, #656] +ldr q0, [x0, #720] +sqrdmulh v2.4S, v11.4S, v21.s[0] +ldr q8, [x0, #16] +mul v11.4S, 
v11.4S,v22.s[0] +ldr q29, [x0, #80] +sqrdmulh v9.4S, v7.4S, v21.s[0] +ldr q27, [x0, #144] +mul v7.4S, v7.4S,v22.s[0] +ldr q14, [x0, #208] +sqrdmulh v28.4S, v24.4S, v21.s[0] +mul v24.4S, v24.4S,v22.s[0] +mla v15.4S, v30.4S, v31.s[0] +mla v11.4S, v2.4S, v31.s[0] +mla v7.4S, v9.4S, v31.s[0] +mla v24.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v12.4S, v21.s[0] +mul v12.4S, v12.4S,v22.s[0] +sub v9.4s, v13.4s, v15.4s +add v13.4s, v13.4s, v15.4s +sqrdmulh v15.4S, v1.4S, v21.s[0] +mul v1.4S, v1.4S,v22.s[0] +sub v2.4s, v10.4s, v11.4s +add v10.4s, v10.4s, v11.4s +sqrdmulh v11.4S, v26.4S, v21.s[0] +mul v26.4S, v26.4S,v22.s[0] +sub v30.4s, v25.4s, v7.4s +add v25.4s, v25.4s, v7.4s +sqrdmulh v7.4S, v0.4S, v21.s[0] +mul v0.4S, v0.4S,v22.s[0] +mla v12.4S, v28.4S, v31.s[0] +sub v28.4s, v23.4s, v24.4s +mla v1.4S, v15.4S, v31.s[0] +add v23.4s, v23.4s, v24.4s +mla v26.4S, v11.4S, v31.s[0] +mla v0.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v25.4S, v21.s[1] +mul v25.4S, v25.4S,v22.s[1] +sub v11.4s, v8.4s, v12.4s +add v8.4s, v8.4s, v12.4s +sqrdmulh v12.4S, v23.4S, v21.s[1] +mul v23.4S, v23.4S,v22.s[1] +sub v24.4s, v29.4s, v1.4s +add v29.4s, v29.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v21.s[1] +mul v13.4S, v13.4S,v22.s[1] +sub v15.4s, v27.4s, v26.4s +add v27.4s, v27.4s, v26.4s +sqrdmulh v26.4S, v10.4S, v21.s[1] +mul v10.4S, v10.4S,v22.s[1] +mla v25.4S, v7.4S, v31.s[0] +sub v7.4s, v14.4s, v0.4s +add v14.4s, v14.4s, v0.4s +mla v23.4S, v12.4S, v31.s[0] +mla v13.4S, v1.4S, v31.s[0] +mla v10.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v30.4S, v21.s[2] +mul v30.4S, v30.4S,v22.s[2] +sub v1.4s, v27.4s, v25.4s +add v27.4s, v27.4s, v25.4s +sqrdmulh v25.4S, v28.4S, v21.s[2] +mul v28.4S, v28.4S,v22.s[2] +sub v12.4s, v14.4s, v23.4s +add v14.4s, v14.4s, v23.4s +sqrdmulh v23.4S, v9.4S, v21.s[2] +mul v9.4S, v9.4S,v22.s[2] +sub v0.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +sqrdmulh v13.4S, v2.4S, v21.s[2] +mul v2.4S, v2.4S,v22.s[2] +mla v30.4S, v26.4S, v31.s[0] +sub v26.4s, v29.4s, v10.4s +mla v28.4S, v25.4S, v31.s[0] +add 
v29.4s, v29.4s, v10.4s +mla v9.4S, v23.4S, v31.s[0] +mla v2.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v27.4S, v19.s[0] +mul v27.4S, v27.4S,v20.s[0] +sub v23.4s, v15.4s, v30.4s +add v15.4s, v15.4s, v30.4s +sqrdmulh v30.4S, v14.4S, v19.s[0] +mul v14.4S, v14.4S,v20.s[0] +sub v10.4s, v7.4s, v28.4s +add v7.4s, v7.4s, v28.4s +sqrdmulh v28.4S, v1.4S, v19.s[1] +mul v1.4S, v1.4S,v20.s[1] +sub v25.4s, v11.4s, v9.4s +add v11.4s, v11.4s, v9.4s +sqrdmulh v9.4S, v12.4S, v19.s[1] +mul v12.4S, v12.4S,v20.s[1] +mla v27.4S, v13.4S, v31.s[0] +sub v13.4s, v24.4s, v2.4s +add v24.4s, v24.4s, v2.4s +mla v14.4S, v30.4S, v31.s[0] +mla v1.4S, v28.4S, v31.s[0] +mla v12.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v15.4S, v19.s[2] +mul v15.4S, v15.4S,v20.s[2] +sub v28.4s, v8.4s, v27.4s +add v8.4s, v8.4s, v27.4s +sqrdmulh v27.4S, v7.4S, v19.s[2] +mul v7.4S, v7.4S,v20.s[2] +sub v30.4s, v29.4s, v14.4s +add v29.4s, v29.4s, v14.4s +sqrdmulh v14.4S, v23.4S, v19.s[3] +mul v23.4S, v23.4S,v20.s[3] +sub v2.4s, v0.4s, v1.4s +add v0.4s, v0.4s, v1.4s +sqrdmulh v1.4S, v10.4S, v19.s[3] +mul v10.4S, v10.4S,v20.s[3] +mla v15.4S, v9.4S, v31.s[0] +sub v9.4s, v26.4s, v12.4s +mla v7.4S, v27.4S, v31.s[0] +add v26.4s, v26.4s, v12.4s +mla v23.4S, v14.4S, v31.s[0] +mla v10.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v29.4S, v17.s[0] +mul v29.4S, v29.4S,v18.s[0] +sub v14.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v30.4S, v17.s[1] +mul v30.4S, v30.4S,v18.s[1] +sub v12.4s, v24.4s, v7.4s +add v24.4s, v24.4s, v7.4s +sqrdmulh v7.4S, v26.4S, v17.s[2] +mul v26.4S, v26.4S,v18.s[2] +sub v27.4s, v25.4s, v23.4s +add v25.4s, v25.4s, v23.4s +sqrdmulh v23.4S, v9.4S, v17.s[3] +mul v9.4S, v9.4S,v18.s[3] +mla v29.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v10.4s +add v13.4s, v13.4s, v10.4s +mla v30.4S, v15.4S, v31.s[0] +mla v26.4S, v7.4S, v31.s[0] +mla v9.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v24.4S, v3.s[0] +mul v24.4S, v24.4S,v16.s[0] +sub v7.4s, v8.4s, v29.4s +add v8.4s, v8.4s, v29.4s +str q8, [x0, #16] +str q7, [x0, #80] 
+sqrdmulh v7.4S, v12.4S, v3.s[1] +mul v12.4S, v12.4S,v16.s[1] +sub v8.4s, v28.4s, v30.4s +add v28.4s, v28.4s, v30.4s +str q28, [x0, #144] +str q8, [x0, #208] +sqrdmulh v8.4S, v13.4S, v3.s[2] +mul v13.4S, v13.4S,v16.s[2] +sub v28.4s, v0.4s, v26.4s +add v0.4s, v0.4s, v26.4s +str q0, [x0, #272] +str q28, [x0, #336] +sqrdmulh v28.4S, v1.4S, v3.s[3] +mul v1.4S, v1.4S,v16.s[3] +mla v24.4S, v23.4S, v31.s[0] +sub v23.4s, v2.4s, v9.4s +mla v12.4S, v7.4S, v31.s[0] +add v2.4s, v2.4s, v9.4s +str q2, [x0, #400] +str q23, [x0, #464] +mla v13.4S, v8.4S, v31.s[0] +mla v1.4S, v28.4S, v31.s[0] +sub v28.4s, v11.4s, v24.4s +add v11.4s, v11.4s, v24.4s +sub v24.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +sub v12.4s, v25.4s, v13.4s +add v25.4s, v25.4s, v13.4s +str q11, [x0, #528] +str q28, [x0, #592] +str q14, [x0, #656] +str q24, [x0, #720] +str q25, [x0, #784] +str q12, [x0, #848] +sub v3.4s, v27.4s, v1.4s +add v27.4s, v27.4s, v1.4s +str q27, [x0, #912] +str q3, [x0, #976] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q6, [x17, #+160] +ldr q10, [x17, #+176] +ldr q15, [x17, #+192] +ldr q29, [x17, #+208] +ldr q30, [x17, #+224] +ldr q26, [x17, #+240] +ldr q0, [x0, #32] +ldr q7, [x0, #48] +ldr q9, [x0, #0] +ldr q2, [x0, #16] +sqrdmulh v23.4S, v0.4S, v5.s[0] +mul v0.4S, v0.4S,v4.s[0] +mla v0.4S, v23.4S, v31.s[0] +sub v23.4s, v9.4s, v0.4s +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v7.4S, v5.s[0] +mul v7.4S, v7.4S,v4.s[0] +mla v7.4S, v0.4S, v31.s[0] +sub v0.4s, v2.4s, v7.4s +add v2.4s, v2.4s, v7.4s +sqrdmulh v7.4S, v2.4S, v5.s[1] +mul v2.4S, v2.4S,v4.s[1] +mla v2.4S, v7.4S, v31.s[0] +sub v7.4s, v9.4s, v2.4s +add v9.4s, v9.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v5.s[2] +mul v0.4S, v0.4S,v4.s[2] +mla v0.4S, v2.4S, v31.s[0] +sub v2.4s, v23.4s, v0.4s +add v23.4s, v23.4s, v0.4s +trn1 v0.4S, v9.4S, v7.4S +trn2 v8.4S, v9.4S, v7.4S +trn1 v13.4S, v23.4S, v2.4S +trn2 v11.4S, v23.4S, v2.4S +trn2 v23.2D, v0.2D, v13.2D +trn2 v2.2D, v8.2D, v11.2D +trn1 v9.2D, v0.2D, v13.2D +trn1 v7.2D, v8.2D, v11.2D 
+sqrdmulh v11.4S, v23.4S, v10.4S +mul v23.4S, v23.4S,v6.4S +mla v23.4S, v11.4S, v31.s[0] +sub v11.4s, v9.4s, v23.4s +add v9.4s, v9.4s, v23.4s +sqrdmulh v23.4S, v2.4S, v10.4S +mul v2.4S, v2.4S,v6.4S +mla v2.4S, v23.4S, v31.s[0] +sub v23.4s, v7.4s, v2.4s +add v7.4s, v7.4s, v2.4s +sqrdmulh v2.4S, v7.4S, v29.4S +mul v7.4S, v7.4S,v15.4S +mla v7.4S, v2.4S, v31.s[0] +sub v2.4s, v9.4s, v7.4s +add v9.4s, v9.4s, v7.4s +sqrdmulh v7.4S, v23.4S, v26.4S +mul v23.4S, v23.4S,v30.4S +mla v23.4S, v7.4S, v31.s[0] +sub v7.4s, v11.4s, v23.4s +add v11.4s, v11.4s, v23.4s +str q9, [x0, #0] +str q2, [x0, #16] +str q11, [x0, #32] +str q7, [x0, #48] +ldr q7, [x17, #+256] +ldr q11, [x17, #+272] +ldr q2, [x17, #+288] +ldr q9, [x17, #+304] +ldr q23, [x17, #+320] +ldr q8, [x17, #+336] +ldr q13, [x17, #+352] +ldr q0, [x17, #+368] +ldr q26, [x0, #96] +ldr q30, [x0, #112] +ldr q29, [x0, #64] +ldr q15, [x0, #80] +sqrdmulh v10.4S, v26.4S, v11.s[0] +mul v26.4S, v26.4S,v7.s[0] +mla v26.4S, v10.4S, v31.s[0] +sub v10.4s, v29.4s, v26.4s +add v29.4s, v29.4s, v26.4s +sqrdmulh v26.4S, v30.4S, v11.s[0] +mul v30.4S, v30.4S,v7.s[0] +mla v30.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v30.4s +add v15.4s, v15.4s, v30.4s +sqrdmulh v30.4S, v15.4S, v11.s[1] +mul v15.4S, v15.4S,v7.s[1] +mla v15.4S, v30.4S, v31.s[0] +sub v30.4s, v29.4s, v15.4s +add v29.4s, v29.4s, v15.4s +sqrdmulh v15.4S, v26.4S, v11.s[2] +mul v26.4S, v26.4S,v7.s[2] +mla v26.4S, v15.4S, v31.s[0] +sub v15.4s, v10.4s, v26.4s +add v10.4s, v10.4s, v26.4s +trn1 v26.4S, v29.4S, v30.4S +trn2 v6.4S, v29.4S, v30.4S +trn1 v5.4S, v10.4S, v15.4S +trn2 v4.4S, v10.4S, v15.4S +trn2 v10.2D, v26.2D, v5.2D +trn2 v15.2D, v6.2D, v4.2D +trn1 v29.2D, v26.2D, v5.2D +trn1 v30.2D, v6.2D, v4.2D +sqrdmulh v4.4S, v10.4S, v9.4S +mul v10.4S, v10.4S,v2.4S +mla v10.4S, v4.4S, v31.s[0] +sub v4.4s, v29.4s, v10.4s +add v29.4s, v29.4s, v10.4s +sqrdmulh v10.4S, v15.4S, v9.4S +mul v15.4S, v15.4S,v2.4S +mla v15.4S, v10.4S, v31.s[0] +sub v10.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s 
+sqrdmulh v15.4S, v30.4S, v8.4S +mul v30.4S, v30.4S,v23.4S +mla v30.4S, v15.4S, v31.s[0] +sub v15.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +sqrdmulh v30.4S, v10.4S, v0.4S +mul v10.4S, v10.4S,v13.4S +mla v10.4S, v30.4S, v31.s[0] +sub v30.4s, v4.4s, v10.4s +add v4.4s, v4.4s, v10.4s +str q29, [x0, #64] +str q15, [x0, #80] +str q4, [x0, #96] +str q30, [x0, #112] +ldr q30, [x17, #+384] +ldr q4, [x17, #+400] +ldr q15, [x17, #+416] +ldr q29, [x17, #+432] +ldr q10, [x17, #+448] +ldr q6, [x17, #+464] +ldr q5, [x17, #+480] +ldr q26, [x17, #+496] +ldr q0, [x0, #160] +ldr q13, [x0, #176] +ldr q8, [x0, #128] +ldr q23, [x0, #144] +sqrdmulh v9.4S, v0.4S, v4.s[0] +mul v0.4S, v0.4S,v30.s[0] +mla v0.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v0.4s +add v8.4s, v8.4s, v0.4s +sqrdmulh v0.4S, v13.4S, v4.s[0] +mul v13.4S, v13.4S,v30.s[0] +mla v13.4S, v0.4S, v31.s[0] +sub v0.4s, v23.4s, v13.4s +add v23.4s, v23.4s, v13.4s +sqrdmulh v13.4S, v23.4S, v4.s[1] +mul v23.4S, v23.4S,v30.s[1] +mla v23.4S, v13.4S, v31.s[0] +sub v13.4s, v8.4s, v23.4s +add v8.4s, v8.4s, v23.4s +sqrdmulh v23.4S, v0.4S, v4.s[2] +mul v0.4S, v0.4S,v30.s[2] +mla v0.4S, v23.4S, v31.s[0] +sub v23.4s, v9.4s, v0.4s +add v9.4s, v9.4s, v0.4s +trn1 v0.4S, v8.4S, v13.4S +trn2 v2.4S, v8.4S, v13.4S +trn1 v11.4S, v9.4S, v23.4S +trn2 v7.4S, v9.4S, v23.4S +trn2 v9.2D, v0.2D, v11.2D +trn2 v23.2D, v2.2D, v7.2D +trn1 v8.2D, v0.2D, v11.2D +trn1 v13.2D, v2.2D, v7.2D +sqrdmulh v7.4S, v9.4S, v29.4S +mul v9.4S, v9.4S,v15.4S +mla v9.4S, v7.4S, v31.s[0] +sub v7.4s, v8.4s, v9.4s +add v8.4s, v8.4s, v9.4s +sqrdmulh v9.4S, v23.4S, v29.4S +mul v23.4S, v23.4S,v15.4S +mla v23.4S, v9.4S, v31.s[0] +sub v9.4s, v13.4s, v23.4s +add v13.4s, v13.4s, v23.4s +sqrdmulh v23.4S, v13.4S, v6.4S +mul v13.4S, v13.4S,v10.4S +mla v13.4S, v23.4S, v31.s[0] +sub v23.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +sqrdmulh v13.4S, v9.4S, v26.4S +mul v9.4S, v9.4S,v5.4S +mla v9.4S, v13.4S, v31.s[0] +sub v13.4s, v7.4s, v9.4s +add v7.4s, v7.4s, v9.4s +str q8, [x0, #128] +str 
q23, [x0, #144] +str q7, [x0, #160] +str q13, [x0, #176] +ldr q13, [x17, #+512] +ldr q7, [x17, #+528] +ldr q23, [x17, #+544] +ldr q8, [x17, #+560] +ldr q9, [x17, #+576] +ldr q2, [x17, #+592] +ldr q11, [x17, #+608] +ldr q0, [x17, #+624] +ldr q26, [x0, #224] +ldr q5, [x0, #240] +ldr q6, [x0, #192] +ldr q10, [x0, #208] +sqrdmulh v29.4S, v26.4S, v7.s[0] +mul v26.4S, v26.4S,v13.s[0] +mla v26.4S, v29.4S, v31.s[0] +sub v29.4s, v6.4s, v26.4s +add v6.4s, v6.4s, v26.4s +sqrdmulh v26.4S, v5.4S, v7.s[0] +mul v5.4S, v5.4S,v13.s[0] +mla v5.4S, v26.4S, v31.s[0] +sub v26.4s, v10.4s, v5.4s +add v10.4s, v10.4s, v5.4s +sqrdmulh v5.4S, v10.4S, v7.s[1] +mul v10.4S, v10.4S,v13.s[1] +mla v10.4S, v5.4S, v31.s[0] +sub v5.4s, v6.4s, v10.4s +add v6.4s, v6.4s, v10.4s +sqrdmulh v10.4S, v26.4S, v7.s[2] +mul v26.4S, v26.4S,v13.s[2] +mla v26.4S, v10.4S, v31.s[0] +sub v10.4s, v29.4s, v26.4s +add v29.4s, v29.4s, v26.4s +trn1 v26.4S, v6.4S, v5.4S +trn2 v15.4S, v6.4S, v5.4S +trn1 v4.4S, v29.4S, v10.4S +trn2 v30.4S, v29.4S, v10.4S +trn2 v29.2D, v26.2D, v4.2D +trn2 v10.2D, v15.2D, v30.2D +trn1 v6.2D, v26.2D, v4.2D +trn1 v5.2D, v15.2D, v30.2D +sqrdmulh v30.4S, v29.4S, v8.4S +mul v29.4S, v29.4S,v23.4S +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v6.4s, v29.4s +add v6.4s, v6.4s, v29.4s +sqrdmulh v29.4S, v10.4S, v8.4S +mul v10.4S, v10.4S,v23.4S +mla v10.4S, v29.4S, v31.s[0] +sub v29.4s, v5.4s, v10.4s +add v5.4s, v5.4s, v10.4s +sqrdmulh v10.4S, v5.4S, v2.4S +mul v5.4S, v5.4S,v9.4S +mla v5.4S, v10.4S, v31.s[0] +sub v10.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v29.4S, v0.4S +mul v29.4S, v29.4S,v11.4S +mla v29.4S, v5.4S, v31.s[0] +sub v5.4s, v30.4s, v29.4s +add v30.4s, v30.4s, v29.4s +str q6, [x0, #192] +str q10, [x0, #208] +str q30, [x0, #224] +str q5, [x0, #240] +ldr q5, [x17, #+640] +ldr q30, [x17, #+656] +ldr q10, [x17, #+672] +ldr q6, [x17, #+688] +ldr q29, [x17, #+704] +ldr q15, [x17, #+720] +ldr q4, [x17, #+736] +ldr q26, [x17, #+752] +ldr q0, [x0, #288] +ldr q11, [x0, #304] +ldr q2, 
[x0, #256] +ldr q9, [x0, #272] +sqrdmulh v8.4S, v0.4S, v30.s[0] +mul v0.4S, v0.4S,v5.s[0] +mla v0.4S, v8.4S, v31.s[0] +sub v8.4s, v2.4s, v0.4s +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v11.4S, v30.s[0] +mul v11.4S, v11.4S,v5.s[0] +mla v11.4S, v0.4S, v31.s[0] +sub v0.4s, v9.4s, v11.4s +add v9.4s, v9.4s, v11.4s +sqrdmulh v11.4S, v9.4S, v30.s[1] +mul v9.4S, v9.4S,v5.s[1] +mla v9.4S, v11.4S, v31.s[0] +sub v11.4s, v2.4s, v9.4s +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v0.4S, v30.s[2] +mul v0.4S, v0.4S,v5.s[2] +mla v0.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v0.4s +add v8.4s, v8.4s, v0.4s +trn1 v0.4S, v2.4S, v11.4S +trn2 v23.4S, v2.4S, v11.4S +trn1 v7.4S, v8.4S, v9.4S +trn2 v13.4S, v8.4S, v9.4S +trn2 v8.2D, v0.2D, v7.2D +trn2 v9.2D, v23.2D, v13.2D +trn1 v2.2D, v0.2D, v7.2D +trn1 v11.2D, v23.2D, v13.2D +sqrdmulh v13.4S, v8.4S, v6.4S +mul v8.4S, v8.4S,v10.4S +mla v8.4S, v13.4S, v31.s[0] +sub v13.4s, v2.4s, v8.4s +add v2.4s, v2.4s, v8.4s +sqrdmulh v8.4S, v9.4S, v6.4S +mul v9.4S, v9.4S,v10.4S +mla v9.4S, v8.4S, v31.s[0] +sub v8.4s, v11.4s, v9.4s +add v11.4s, v11.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v15.4S +mul v11.4S, v11.4S,v29.4S +mla v11.4S, v9.4S, v31.s[0] +sub v9.4s, v2.4s, v11.4s +add v2.4s, v2.4s, v11.4s +sqrdmulh v11.4S, v8.4S, v26.4S +mul v8.4S, v8.4S,v4.4S +mla v8.4S, v11.4S, v31.s[0] +sub v11.4s, v13.4s, v8.4s +add v13.4s, v13.4s, v8.4s +str q2, [x0, #256] +str q9, [x0, #272] +str q13, [x0, #288] +str q11, [x0, #304] +ldr q11, [x17, #+768] +ldr q13, [x17, #+784] +ldr q9, [x17, #+800] +ldr q2, [x17, #+816] +ldr q8, [x17, #+832] +ldr q23, [x17, #+848] +ldr q7, [x17, #+864] +ldr q0, [x17, #+880] +ldr q26, [x0, #352] +ldr q4, [x0, #368] +ldr q15, [x0, #320] +ldr q29, [x0, #336] +sqrdmulh v6.4S, v26.4S, v13.s[0] +mul v26.4S, v26.4S,v11.s[0] +mla v26.4S, v6.4S, v31.s[0] +sub v6.4s, v15.4s, v26.4s +add v15.4s, v15.4s, v26.4s +sqrdmulh v26.4S, v4.4S, v13.s[0] +mul v4.4S, v4.4S,v11.s[0] +mla v4.4S, v26.4S, v31.s[0] +sub v26.4s, v29.4s, v4.4s +add v29.4s, v29.4s, v4.4s 
+sqrdmulh v4.4S, v29.4S, v13.s[1] +mul v29.4S, v29.4S,v11.s[1] +mla v29.4S, v4.4S, v31.s[0] +sub v4.4s, v15.4s, v29.4s +add v15.4s, v15.4s, v29.4s +sqrdmulh v29.4S, v26.4S, v13.s[2] +mul v26.4S, v26.4S,v11.s[2] +mla v26.4S, v29.4S, v31.s[0] +sub v29.4s, v6.4s, v26.4s +add v6.4s, v6.4s, v26.4s +trn1 v26.4S, v15.4S, v4.4S +trn2 v10.4S, v15.4S, v4.4S +trn1 v30.4S, v6.4S, v29.4S +trn2 v5.4S, v6.4S, v29.4S +trn2 v6.2D, v26.2D, v30.2D +trn2 v29.2D, v10.2D, v5.2D +trn1 v15.2D, v26.2D, v30.2D +trn1 v4.2D, v10.2D, v5.2D +sqrdmulh v5.4S, v6.4S, v2.4S +mul v6.4S, v6.4S,v9.4S +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v15.4s, v6.4s +add v15.4s, v15.4s, v6.4s +sqrdmulh v6.4S, v29.4S, v2.4S +mul v29.4S, v29.4S,v9.4S +mla v29.4S, v6.4S, v31.s[0] +sub v6.4s, v4.4s, v29.4s +add v4.4s, v4.4s, v29.4s +sqrdmulh v29.4S, v4.4S, v23.4S +mul v4.4S, v4.4S,v8.4S +mla v4.4S, v29.4S, v31.s[0] +sub v29.4s, v15.4s, v4.4s +add v15.4s, v15.4s, v4.4s +sqrdmulh v4.4S, v6.4S, v0.4S +mul v6.4S, v6.4S,v7.4S +mla v6.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v6.4s +add v5.4s, v5.4s, v6.4s +str q15, [x0, #320] +str q29, [x0, #336] +str q5, [x0, #352] +str q4, [x0, #368] +ldr q4, [x17, #+896] +ldr q5, [x17, #+912] +ldr q29, [x17, #+928] +ldr q15, [x17, #+944] +ldr q6, [x17, #+960] +ldr q10, [x17, #+976] +ldr q30, [x17, #+992] +ldr q26, [x17, #+1008] +ldr q0, [x0, #416] +ldr q7, [x0, #432] +ldr q23, [x0, #384] +ldr q8, [x0, #400] +sqrdmulh v2.4S, v0.4S, v5.s[0] +mul v0.4S, v0.4S,v4.s[0] +mla v0.4S, v2.4S, v31.s[0] +sub v2.4s, v23.4s, v0.4s +add v23.4s, v23.4s, v0.4s +sqrdmulh v0.4S, v7.4S, v5.s[0] +mul v7.4S, v7.4S,v4.s[0] +mla v7.4S, v0.4S, v31.s[0] +sub v0.4s, v8.4s, v7.4s +add v8.4s, v8.4s, v7.4s +sqrdmulh v7.4S, v8.4S, v5.s[1] +mul v8.4S, v8.4S,v4.s[1] +mla v8.4S, v7.4S, v31.s[0] +sub v7.4s, v23.4s, v8.4s +add v23.4s, v23.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v5.s[2] +mul v0.4S, v0.4S,v4.s[2] +mla v0.4S, v8.4S, v31.s[0] +sub v8.4s, v2.4s, v0.4s +add v2.4s, v2.4s, v0.4s +trn1 v0.4S, v23.4S, v7.4S +trn2 v9.4S, 
v23.4S, v7.4S +trn1 v13.4S, v2.4S, v8.4S +trn2 v11.4S, v2.4S, v8.4S +trn2 v2.2D, v0.2D, v13.2D +trn2 v8.2D, v9.2D, v11.2D +trn1 v23.2D, v0.2D, v13.2D +trn1 v7.2D, v9.2D, v11.2D +sqrdmulh v11.4S, v2.4S, v15.4S +mul v2.4S, v2.4S,v29.4S +mla v2.4S, v11.4S, v31.s[0] +sub v11.4s, v23.4s, v2.4s +add v23.4s, v23.4s, v2.4s +sqrdmulh v2.4S, v8.4S, v15.4S +mul v8.4S, v8.4S,v29.4S +mla v8.4S, v2.4S, v31.s[0] +sub v2.4s, v7.4s, v8.4s +add v7.4s, v7.4s, v8.4s +sqrdmulh v8.4S, v7.4S, v10.4S +mul v7.4S, v7.4S,v6.4S +mla v7.4S, v8.4S, v31.s[0] +sub v8.4s, v23.4s, v7.4s +add v23.4s, v23.4s, v7.4s +sqrdmulh v7.4S, v2.4S, v26.4S +mul v2.4S, v2.4S,v30.4S +mla v2.4S, v7.4S, v31.s[0] +sub v7.4s, v11.4s, v2.4s +add v11.4s, v11.4s, v2.4s +str q23, [x0, #384] +str q8, [x0, #400] +str q11, [x0, #416] +str q7, [x0, #432] +ldr q7, [x17, #+1024] +ldr q11, [x17, #+1040] +ldr q8, [x17, #+1056] +ldr q23, [x17, #+1072] +ldr q2, [x17, #+1088] +ldr q9, [x17, #+1104] +ldr q13, [x17, #+1120] +ldr q0, [x17, #+1136] +ldr q26, [x0, #480] +ldr q30, [x0, #496] +ldr q10, [x0, #448] +ldr q6, [x0, #464] +sqrdmulh v15.4S, v26.4S, v11.s[0] +mul v26.4S, v26.4S,v7.s[0] +mla v26.4S, v15.4S, v31.s[0] +sub v15.4s, v10.4s, v26.4s +add v10.4s, v10.4s, v26.4s +sqrdmulh v26.4S, v30.4S, v11.s[0] +mul v30.4S, v30.4S,v7.s[0] +mla v30.4S, v26.4S, v31.s[0] +sub v26.4s, v6.4s, v30.4s +add v6.4s, v6.4s, v30.4s +sqrdmulh v30.4S, v6.4S, v11.s[1] +mul v6.4S, v6.4S,v7.s[1] +mla v6.4S, v30.4S, v31.s[0] +sub v30.4s, v10.4s, v6.4s +add v10.4s, v10.4s, v6.4s +sqrdmulh v6.4S, v26.4S, v11.s[2] +mul v26.4S, v26.4S,v7.s[2] +mla v26.4S, v6.4S, v31.s[0] +sub v6.4s, v15.4s, v26.4s +add v15.4s, v15.4s, v26.4s +trn1 v26.4S, v10.4S, v30.4S +trn2 v29.4S, v10.4S, v30.4S +trn1 v5.4S, v15.4S, v6.4S +trn2 v4.4S, v15.4S, v6.4S +trn2 v15.2D, v26.2D, v5.2D +trn2 v6.2D, v29.2D, v4.2D +trn1 v10.2D, v26.2D, v5.2D +trn1 v30.2D, v29.2D, v4.2D +sqrdmulh v4.4S, v15.4S, v23.4S +mul v15.4S, v15.4S,v8.4S +mla v15.4S, v4.4S, v31.s[0] +sub v4.4s, v10.4s, v15.4s 
+add v10.4s, v10.4s, v15.4s +sqrdmulh v15.4S, v6.4S, v23.4S +mul v6.4S, v6.4S,v8.4S +mla v6.4S, v15.4S, v31.s[0] +sub v15.4s, v30.4s, v6.4s +add v30.4s, v30.4s, v6.4s +sqrdmulh v6.4S, v30.4S, v9.4S +mul v30.4S, v30.4S,v2.4S +mla v30.4S, v6.4S, v31.s[0] +sub v6.4s, v10.4s, v30.4s +add v10.4s, v10.4s, v30.4s +sqrdmulh v30.4S, v15.4S, v0.4S +mul v15.4S, v15.4S,v13.4S +mla v15.4S, v30.4S, v31.s[0] +sub v30.4s, v4.4s, v15.4s +add v4.4s, v4.4s, v15.4s +str q10, [x0, #448] +str q6, [x0, #464] +str q4, [x0, #480] +str q30, [x0, #496] +ldr q30, [x17, #+1152] +ldr q4, [x17, #+1168] +ldr q6, [x17, #+1184] +ldr q10, [x17, #+1200] +ldr q15, [x17, #+1216] +ldr q29, [x17, #+1232] +ldr q5, [x17, #+1248] +ldr q26, [x17, #+1264] +ldr q0, [x0, #544] +ldr q13, [x0, #560] +ldr q9, [x0, #512] +ldr q2, [x0, #528] +sqrdmulh v23.4S, v0.4S, v4.s[0] +mul v0.4S, v0.4S,v30.s[0] +mla v0.4S, v23.4S, v31.s[0] +sub v23.4s, v9.4s, v0.4s +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v13.4S, v4.s[0] +mul v13.4S, v13.4S,v30.s[0] +mla v13.4S, v0.4S, v31.s[0] +sub v0.4s, v2.4s, v13.4s +add v2.4s, v2.4s, v13.4s +sqrdmulh v13.4S, v2.4S, v4.s[1] +mul v2.4S, v2.4S,v30.s[1] +mla v2.4S, v13.4S, v31.s[0] +sub v13.4s, v9.4s, v2.4s +add v9.4s, v9.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v4.s[2] +mul v0.4S, v0.4S,v30.s[2] +mla v0.4S, v2.4S, v31.s[0] +sub v2.4s, v23.4s, v0.4s +add v23.4s, v23.4s, v0.4s +trn1 v0.4S, v9.4S, v13.4S +trn2 v8.4S, v9.4S, v13.4S +trn1 v11.4S, v23.4S, v2.4S +trn2 v7.4S, v23.4S, v2.4S +trn2 v23.2D, v0.2D, v11.2D +trn2 v2.2D, v8.2D, v7.2D +trn1 v9.2D, v0.2D, v11.2D +trn1 v13.2D, v8.2D, v7.2D +sqrdmulh v7.4S, v23.4S, v10.4S +mul v23.4S, v23.4S,v6.4S +mla v23.4S, v7.4S, v31.s[0] +sub v7.4s, v9.4s, v23.4s +add v9.4s, v9.4s, v23.4s +sqrdmulh v23.4S, v2.4S, v10.4S +mul v2.4S, v2.4S,v6.4S +mla v2.4S, v23.4S, v31.s[0] +sub v23.4s, v13.4s, v2.4s +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v13.4S, v29.4S +mul v13.4S, v13.4S,v15.4S +mla v13.4S, v2.4S, v31.s[0] +sub v2.4s, v9.4s, v13.4s +add v9.4s, v9.4s, v13.4s 
+sqrdmulh v13.4S, v23.4S, v26.4S +mul v23.4S, v23.4S,v5.4S +mla v23.4S, v13.4S, v31.s[0] +sub v13.4s, v7.4s, v23.4s +add v7.4s, v7.4s, v23.4s +str q9, [x0, #512] +str q2, [x0, #528] +str q7, [x0, #544] +str q13, [x0, #560] +ldr q13, [x17, #+1280] +ldr q7, [x17, #+1296] +ldr q2, [x17, #+1312] +ldr q9, [x17, #+1328] +ldr q23, [x17, #+1344] +ldr q8, [x17, #+1360] +ldr q11, [x17, #+1376] +ldr q0, [x17, #+1392] +ldr q26, [x0, #608] +ldr q5, [x0, #624] +ldr q29, [x0, #576] +ldr q15, [x0, #592] +sqrdmulh v10.4S, v26.4S, v7.s[0] +mul v26.4S, v26.4S,v13.s[0] +mla v26.4S, v10.4S, v31.s[0] +sub v10.4s, v29.4s, v26.4s +add v29.4s, v29.4s, v26.4s +sqrdmulh v26.4S, v5.4S, v7.s[0] +mul v5.4S, v5.4S,v13.s[0] +mla v5.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v5.4s +add v15.4s, v15.4s, v5.4s +sqrdmulh v5.4S, v15.4S, v7.s[1] +mul v15.4S, v15.4S,v13.s[1] +mla v15.4S, v5.4S, v31.s[0] +sub v5.4s, v29.4s, v15.4s +add v29.4s, v29.4s, v15.4s +sqrdmulh v15.4S, v26.4S, v7.s[2] +mul v26.4S, v26.4S,v13.s[2] +mla v26.4S, v15.4S, v31.s[0] +sub v15.4s, v10.4s, v26.4s +add v10.4s, v10.4s, v26.4s +trn1 v26.4S, v29.4S, v5.4S +trn2 v6.4S, v29.4S, v5.4S +trn1 v4.4S, v10.4S, v15.4S +trn2 v30.4S, v10.4S, v15.4S +trn2 v10.2D, v26.2D, v4.2D +trn2 v15.2D, v6.2D, v30.2D +trn1 v29.2D, v26.2D, v4.2D +trn1 v5.2D, v6.2D, v30.2D +sqrdmulh v30.4S, v10.4S, v9.4S +mul v10.4S, v10.4S,v2.4S +mla v10.4S, v30.4S, v31.s[0] +sub v30.4s, v29.4s, v10.4s +add v29.4s, v29.4s, v10.4s +sqrdmulh v10.4S, v15.4S, v9.4S +mul v15.4S, v15.4S,v2.4S +mla v15.4S, v10.4S, v31.s[0] +sub v10.4s, v5.4s, v15.4s +add v5.4s, v5.4s, v15.4s +sqrdmulh v15.4S, v5.4S, v8.4S +mul v5.4S, v5.4S,v23.4S +mla v5.4S, v15.4S, v31.s[0] +sub v15.4s, v29.4s, v5.4s +add v29.4s, v29.4s, v5.4s +sqrdmulh v5.4S, v10.4S, v0.4S +mul v10.4S, v10.4S,v11.4S +mla v10.4S, v5.4S, v31.s[0] +sub v5.4s, v30.4s, v10.4s +add v30.4s, v30.4s, v10.4s +str q29, [x0, #576] +str q15, [x0, #592] +str q30, [x0, #608] +str q5, [x0, #624] +ldr q5, [x17, #+1408] +ldr q30, [x17, #+1424] 
+ldr q15, [x17, #+1440] +ldr q29, [x17, #+1456] +ldr q10, [x17, #+1472] +ldr q6, [x17, #+1488] +ldr q4, [x17, #+1504] +ldr q26, [x17, #+1520] +ldr q0, [x0, #672] +ldr q11, [x0, #688] +ldr q8, [x0, #640] +ldr q23, [x0, #656] +sqrdmulh v9.4S, v0.4S, v30.s[0] +mul v0.4S, v0.4S,v5.s[0] +mla v0.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v0.4s +add v8.4s, v8.4s, v0.4s +sqrdmulh v0.4S, v11.4S, v30.s[0] +mul v11.4S, v11.4S,v5.s[0] +mla v11.4S, v0.4S, v31.s[0] +sub v0.4s, v23.4s, v11.4s +add v23.4s, v23.4s, v11.4s +sqrdmulh v11.4S, v23.4S, v30.s[1] +mul v23.4S, v23.4S,v5.s[1] +mla v23.4S, v11.4S, v31.s[0] +sub v11.4s, v8.4s, v23.4s +add v8.4s, v8.4s, v23.4s +sqrdmulh v23.4S, v0.4S, v30.s[2] +mul v0.4S, v0.4S,v5.s[2] +mla v0.4S, v23.4S, v31.s[0] +sub v23.4s, v9.4s, v0.4s +add v9.4s, v9.4s, v0.4s +trn1 v0.4S, v8.4S, v11.4S +trn2 v2.4S, v8.4S, v11.4S +trn1 v7.4S, v9.4S, v23.4S +trn2 v13.4S, v9.4S, v23.4S +trn2 v9.2D, v0.2D, v7.2D +trn2 v23.2D, v2.2D, v13.2D +trn1 v8.2D, v0.2D, v7.2D +trn1 v11.2D, v2.2D, v13.2D +sqrdmulh v13.4S, v9.4S, v29.4S +mul v9.4S, v9.4S,v15.4S +mla v9.4S, v13.4S, v31.s[0] +sub v13.4s, v8.4s, v9.4s +add v8.4s, v8.4s, v9.4s +sqrdmulh v9.4S, v23.4S, v29.4S +mul v23.4S, v23.4S,v15.4S +mla v23.4S, v9.4S, v31.s[0] +sub v9.4s, v11.4s, v23.4s +add v11.4s, v11.4s, v23.4s +sqrdmulh v23.4S, v11.4S, v6.4S +mul v11.4S, v11.4S,v10.4S +mla v11.4S, v23.4S, v31.s[0] +sub v23.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +sqrdmulh v11.4S, v9.4S, v26.4S +mul v9.4S, v9.4S,v4.4S +mla v9.4S, v11.4S, v31.s[0] +sub v11.4s, v13.4s, v9.4s +add v13.4s, v13.4s, v9.4s +str q8, [x0, #640] +str q23, [x0, #656] +str q13, [x0, #672] +str q11, [x0, #688] +ldr q11, [x17, #+1536] +ldr q13, [x17, #+1552] +ldr q23, [x17, #+1568] +ldr q8, [x17, #+1584] +ldr q9, [x17, #+1600] +ldr q2, [x17, #+1616] +ldr q7, [x17, #+1632] +ldr q0, [x17, #+1648] +ldr q26, [x0, #736] +ldr q4, [x0, #752] +ldr q6, [x0, #704] +ldr q10, [x0, #720] +sqrdmulh v29.4S, v26.4S, v13.s[0] +mul v26.4S, v26.4S,v11.s[0] +mla v26.4S, 
v29.4S, v31.s[0] +sub v29.4s, v6.4s, v26.4s +add v6.4s, v6.4s, v26.4s +sqrdmulh v26.4S, v4.4S, v13.s[0] +mul v4.4S, v4.4S,v11.s[0] +mla v4.4S, v26.4S, v31.s[0] +sub v26.4s, v10.4s, v4.4s +add v10.4s, v10.4s, v4.4s +sqrdmulh v4.4S, v10.4S, v13.s[1] +mul v10.4S, v10.4S,v11.s[1] +mla v10.4S, v4.4S, v31.s[0] +sub v4.4s, v6.4s, v10.4s +add v6.4s, v6.4s, v10.4s +sqrdmulh v10.4S, v26.4S, v13.s[2] +mul v26.4S, v26.4S,v11.s[2] +mla v26.4S, v10.4S, v31.s[0] +sub v10.4s, v29.4s, v26.4s +add v29.4s, v29.4s, v26.4s +trn1 v26.4S, v6.4S, v4.4S +trn2 v15.4S, v6.4S, v4.4S +trn1 v30.4S, v29.4S, v10.4S +trn2 v5.4S, v29.4S, v10.4S +trn2 v29.2D, v26.2D, v30.2D +trn2 v10.2D, v15.2D, v5.2D +trn1 v6.2D, v26.2D, v30.2D +trn1 v4.2D, v15.2D, v5.2D +sqrdmulh v5.4S, v29.4S, v8.4S +mul v29.4S, v29.4S,v23.4S +mla v29.4S, v5.4S, v31.s[0] +sub v5.4s, v6.4s, v29.4s +add v6.4s, v6.4s, v29.4s +sqrdmulh v29.4S, v10.4S, v8.4S +mul v10.4S, v10.4S,v23.4S +mla v10.4S, v29.4S, v31.s[0] +sub v29.4s, v4.4s, v10.4s +add v4.4s, v4.4s, v10.4s +sqrdmulh v10.4S, v4.4S, v2.4S +mul v4.4S, v4.4S,v9.4S +mla v4.4S, v10.4S, v31.s[0] +sub v10.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +sqrdmulh v4.4S, v29.4S, v0.4S +mul v29.4S, v29.4S,v7.4S +mla v29.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v29.4s +add v5.4s, v5.4s, v29.4s +str q6, [x0, #704] +str q10, [x0, #720] +str q5, [x0, #736] +str q4, [x0, #752] +ldr q4, [x17, #+1664] +ldr q5, [x17, #+1680] +ldr q10, [x17, #+1696] +ldr q6, [x17, #+1712] +ldr q29, [x17, #+1728] +ldr q15, [x17, #+1744] +ldr q30, [x17, #+1760] +ldr q26, [x17, #+1776] +ldr q0, [x0, #800] +ldr q7, [x0, #816] +ldr q2, [x0, #768] +ldr q9, [x0, #784] +sqrdmulh v8.4S, v0.4S, v5.s[0] +mul v0.4S, v0.4S,v4.s[0] +mla v0.4S, v8.4S, v31.s[0] +sub v8.4s, v2.4s, v0.4s +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v7.4S, v5.s[0] +mul v7.4S, v7.4S,v4.s[0] +mla v7.4S, v0.4S, v31.s[0] +sub v0.4s, v9.4s, v7.4s +add v9.4s, v9.4s, v7.4s +sqrdmulh v7.4S, v9.4S, v5.s[1] +mul v9.4S, v9.4S,v4.s[1] +mla v9.4S, v7.4S, v31.s[0] +sub 
v7.4s, v2.4s, v9.4s +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v0.4S, v5.s[2] +mul v0.4S, v0.4S,v4.s[2] +mla v0.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v0.4s +add v8.4s, v8.4s, v0.4s +trn1 v0.4S, v2.4S, v7.4S +trn2 v23.4S, v2.4S, v7.4S +trn1 v13.4S, v8.4S, v9.4S +trn2 v11.4S, v8.4S, v9.4S +trn2 v8.2D, v0.2D, v13.2D +trn2 v9.2D, v23.2D, v11.2D +trn1 v2.2D, v0.2D, v13.2D +trn1 v7.2D, v23.2D, v11.2D +sqrdmulh v11.4S, v8.4S, v6.4S +mul v8.4S, v8.4S,v10.4S +mla v8.4S, v11.4S, v31.s[0] +sub v11.4s, v2.4s, v8.4s +add v2.4s, v2.4s, v8.4s +sqrdmulh v8.4S, v9.4S, v6.4S +mul v9.4S, v9.4S,v10.4S +mla v9.4S, v8.4S, v31.s[0] +sub v8.4s, v7.4s, v9.4s +add v7.4s, v7.4s, v9.4s +sqrdmulh v9.4S, v7.4S, v15.4S +mul v7.4S, v7.4S,v29.4S +mla v7.4S, v9.4S, v31.s[0] +sub v9.4s, v2.4s, v7.4s +add v2.4s, v2.4s, v7.4s +sqrdmulh v7.4S, v8.4S, v26.4S +mul v8.4S, v8.4S,v30.4S +mla v8.4S, v7.4S, v31.s[0] +sub v7.4s, v11.4s, v8.4s +add v11.4s, v11.4s, v8.4s +str q2, [x0, #768] +str q9, [x0, #784] +str q11, [x0, #800] +str q7, [x0, #816] +ldr q7, [x17, #+1792] +ldr q11, [x17, #+1808] +ldr q9, [x17, #+1824] +ldr q2, [x17, #+1840] +ldr q8, [x17, #+1856] +ldr q23, [x17, #+1872] +ldr q13, [x17, #+1888] +ldr q0, [x17, #+1904] +ldr q26, [x0, #864] +ldr q30, [x0, #880] +ldr q15, [x0, #832] +ldr q29, [x0, #848] +sqrdmulh v6.4S, v26.4S, v11.s[0] +mul v26.4S, v26.4S,v7.s[0] +mla v26.4S, v6.4S, v31.s[0] +sub v6.4s, v15.4s, v26.4s +add v15.4s, v15.4s, v26.4s +sqrdmulh v26.4S, v30.4S, v11.s[0] +mul v30.4S, v30.4S,v7.s[0] +mla v30.4S, v26.4S, v31.s[0] +sub v26.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +sqrdmulh v30.4S, v29.4S, v11.s[1] +mul v29.4S, v29.4S,v7.s[1] +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v15.4s, v29.4s +add v15.4s, v15.4s, v29.4s +sqrdmulh v29.4S, v26.4S, v11.s[2] +mul v26.4S, v26.4S,v7.s[2] +mla v26.4S, v29.4S, v31.s[0] +sub v29.4s, v6.4s, v26.4s +add v6.4s, v6.4s, v26.4s +trn1 v26.4S, v15.4S, v30.4S +trn2 v10.4S, v15.4S, v30.4S +trn1 v5.4S, v6.4S, v29.4S +trn2 v4.4S, v6.4S, v29.4S +trn2 
v6.2D, v26.2D, v5.2D +trn2 v29.2D, v10.2D, v4.2D +trn1 v15.2D, v26.2D, v5.2D +trn1 v30.2D, v10.2D, v4.2D +sqrdmulh v4.4S, v6.4S, v2.4S +mul v6.4S, v6.4S,v9.4S +mla v6.4S, v4.4S, v31.s[0] +sub v4.4s, v15.4s, v6.4s +add v15.4s, v15.4s, v6.4s +sqrdmulh v6.4S, v29.4S, v2.4S +mul v29.4S, v29.4S,v9.4S +mla v29.4S, v6.4S, v31.s[0] +sub v6.4s, v30.4s, v29.4s +add v30.4s, v30.4s, v29.4s +sqrdmulh v29.4S, v30.4S, v23.4S +mul v30.4S, v30.4S,v8.4S +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v15.4s, v30.4s +add v15.4s, v15.4s, v30.4s +sqrdmulh v30.4S, v6.4S, v0.4S +mul v6.4S, v6.4S,v13.4S +mla v6.4S, v30.4S, v31.s[0] +sub v30.4s, v4.4s, v6.4s +add v4.4s, v4.4s, v6.4s +str q15, [x0, #832] +str q29, [x0, #848] +str q4, [x0, #864] +str q30, [x0, #880] +ldr q30, [x17, #+1920] +ldr q4, [x17, #+1936] +ldr q29, [x17, #+1952] +ldr q15, [x17, #+1968] +ldr q6, [x17, #+1984] +ldr q10, [x17, #+2000] +ldr q5, [x17, #+2016] +ldr q26, [x17, #+2032] +ldr q0, [x0, #928] +ldr q13, [x0, #944] +ldr q23, [x0, #896] +ldr q8, [x0, #912] +sqrdmulh v2.4S, v0.4S, v4.s[0] +mul v0.4S, v0.4S,v30.s[0] +mla v0.4S, v2.4S, v31.s[0] +sub v2.4s, v23.4s, v0.4s +add v23.4s, v23.4s, v0.4s +sqrdmulh v0.4S, v13.4S, v4.s[0] +mul v13.4S, v13.4S,v30.s[0] +mla v13.4S, v0.4S, v31.s[0] +sub v0.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v4.s[1] +mul v8.4S, v8.4S,v30.s[1] +mla v8.4S, v13.4S, v31.s[0] +sub v13.4s, v23.4s, v8.4s +add v23.4s, v23.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v4.s[2] +mul v0.4S, v0.4S,v30.s[2] +mla v0.4S, v8.4S, v31.s[0] +sub v8.4s, v2.4s, v0.4s +add v2.4s, v2.4s, v0.4s +trn1 v0.4S, v23.4S, v13.4S +trn2 v9.4S, v23.4S, v13.4S +trn1 v11.4S, v2.4S, v8.4S +trn2 v7.4S, v2.4S, v8.4S +trn2 v2.2D, v0.2D, v11.2D +trn2 v8.2D, v9.2D, v7.2D +trn1 v23.2D, v0.2D, v11.2D +trn1 v13.2D, v9.2D, v7.2D +sqrdmulh v7.4S, v2.4S, v15.4S +mul v2.4S, v2.4S,v29.4S +mla v2.4S, v7.4S, v31.s[0] +sub v7.4s, v23.4s, v2.4s +add v23.4s, v23.4s, v2.4s +sqrdmulh v2.4S, v8.4S, v15.4S +mul v8.4S, v8.4S,v29.4S +mla 
v8.4S, v2.4S, v31.s[0] +sub v2.4s, v13.4s, v8.4s +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v13.4S, v10.4S +mul v13.4S, v13.4S,v6.4S +mla v13.4S, v8.4S, v31.s[0] +sub v8.4s, v23.4s, v13.4s +add v23.4s, v23.4s, v13.4s +sqrdmulh v13.4S, v2.4S, v26.4S +mul v2.4S, v2.4S,v5.4S +mla v2.4S, v13.4S, v31.s[0] +sub v13.4s, v7.4s, v2.4s +add v7.4s, v7.4s, v2.4s +str q23, [x0, #896] +str q8, [x0, #912] +str q7, [x0, #928] +str q13, [x0, #944] +ldr q13, [x17, #+2048] +ldr q7, [x17, #+2064] +ldr q8, [x17, #+2080] +ldr q23, [x17, #+2096] +ldr q2, [x17, #+2112] +ldr q9, [x17, #+2128] +ldr q11, [x17, #+2144] +ldr q0, [x17, #+2160] +ldr q26, [x0, #992] +ldr q5, [x0, #1008] +ldr q10, [x0, #960] +ldr q6, [x0, #976] +sqrdmulh v15.4S, v26.4S, v7.s[0] +mul v26.4S, v26.4S,v13.s[0] +mla v26.4S, v15.4S, v31.s[0] +sub v15.4s, v10.4s, v26.4s +add v10.4s, v10.4s, v26.4s +sqrdmulh v26.4S, v5.4S, v7.s[0] +mul v5.4S, v5.4S,v13.s[0] +mla v5.4S, v26.4S, v31.s[0] +sub v26.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v6.4S, v7.s[1] +mul v6.4S, v6.4S,v13.s[1] +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v10.4s, v6.4s +add v10.4s, v10.4s, v6.4s +sqrdmulh v6.4S, v26.4S, v7.s[2] +mul v26.4S, v26.4S,v13.s[2] +mla v26.4S, v6.4S, v31.s[0] +sub v6.4s, v15.4s, v26.4s +add v15.4s, v15.4s, v26.4s +trn1 v26.4S, v10.4S, v5.4S +trn2 v29.4S, v10.4S, v5.4S +trn1 v4.4S, v15.4S, v6.4S +trn2 v30.4S, v15.4S, v6.4S +trn2 v15.2D, v26.2D, v4.2D +trn2 v6.2D, v29.2D, v30.2D +trn1 v10.2D, v26.2D, v4.2D +trn1 v5.2D, v29.2D, v30.2D +sqrdmulh v30.4S, v15.4S, v23.4S +mul v15.4S, v15.4S,v8.4S +mla v15.4S, v30.4S, v31.s[0] +sub v30.4s, v10.4s, v15.4s +add v10.4s, v10.4s, v15.4s +sqrdmulh v15.4S, v6.4S, v23.4S +mul v6.4S, v6.4S,v8.4S +mla v6.4S, v15.4S, v31.s[0] +sub v15.4s, v5.4s, v6.4s +add v5.4s, v5.4s, v6.4s +sqrdmulh v6.4S, v5.4S, v9.4S +mul v5.4S, v5.4S,v2.4S +mla v5.4S, v6.4S, v31.s[0] +sub v6.4s, v10.4s, v5.4s +add v10.4s, v10.4s, v5.4s +sqrdmulh v5.4S, v15.4S, v0.4S +mul v15.4S, v15.4S,v11.4S +mla v15.4S, v5.4S, 
v31.s[0] +sub v5.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +str q10, [x0, #960] +str q6, [x0, #976] +str q30, [x0, #992] +str q5, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_7_0.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_7_0.s new file mode 100644 index 0000000..1628189 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_7_0.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 
11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // 
Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 
1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 
23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 
19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 
// Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 
1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 +.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // 
Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 
7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // 
Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_7_0 +.global _ntt_u32_full_neon_asm_var_4_4_7_0 +ntt_u32_full_neon_asm_var_4_4_7_0: +_ntt_u32_full_neon_asm_var_4_4_7_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #928] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #992] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q18, [x0, #800] +sqrdmulh v17.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +ldr q16, [x0, #864] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v22.4S, v21.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +mla v18.4S, 
v17.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +ldr q3, [x0, #544] +sqrdmulh v17.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q19, [x0, #608] +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q2, [x0, #672] +ldr q1, [x0, #416] +sqrdmulh v0.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v15.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #736] +ldr q14, [x0, #480] +sqrdmulh v13.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v12.4s, v14.4s, v20.4s +add v14.4s, v14.4s, v20.4s +ldr q20, [x0, #288] +mla v3.4S, v17.4S, v31.s[0] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v18.4s +mla v2.4S, v0.4S, v31.s[0] +mla v22.4S, v13.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +ldr q18, [x0, #352] +sqrdmulh v13.4S, v1.4S, v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v0.4s, v18.4s, v16.4s +sqrdmulh v17.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +add v18.4s, v18.4s, v16.4s +ldr q16, [x0, #32] +sqrdmulh v11.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v10.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #96] +sqrdmulh v9.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v8.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #160] +mla v1.4S, v13.4S, v31.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v2.4s +mla v20.4S, v11.4S, v31.s[0] +mla v18.4S, v9.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +ldr q2, [x0, #224] +sqrdmulh v9.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v11.4s, v2.4s, v22.4s +sqrdmulh v13.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v7.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +sub v6.4s, v2.4s, v14.4s +add v2.4s, v2.4s, v14.4s +mla v15.4S, v9.4S, v31.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v20.4s +mla v21.4S, v22.4S, v31.s[0] +mla v0.4S, v1.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh 
v20.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +sub v1.4s, v3.4s, v18.4s +sqrdmulh v22.4S, v6.4S, v27.s[1] +mul v6.4S, v6.4S,v28.s[1] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v27.s[0] +mul v19.4S, v19.4S,v28.s[0] +sub v9.4s, v17.4s, v15.4s +add v17.4s, v17.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v14.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +mla v7.4S, v20.4S, v31.s[0] +mla v6.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v21.4s +mla v19.4S, v18.4S, v31.s[0] +mla v2.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v15.4s, v8.4s, v0.4s +sqrdmulh v18.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +add v8.4s, v8.4s, v0.4s +sqrdmulh v0.4S, v9.4S, v27.s[3] +mul v9.4S, v9.4S,v28.s[3] +sub v20.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[3] +mul v14.4S, v14.4S,v28.s[3] +sub v12.4s, v1.4s, v6.4s +add v1.4s, v1.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v16.4s, v19.4s +mla v9.4S, v0.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v25.s[2] +mul v1.4S, v1.4S,v26.s[2] +sub v7.4s, v3.4s, v2.4s +sqrdmulh v0.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v7.4S, v25.s[1] +mul v7.4S, v7.4S,v26.s[1] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v25.s[0] +mul v3.4S, v3.4S,v26.s[0] +sub v6.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v1.4S, v19.4S, v31.s[0] +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v9.4s +mla v7.4S, v2.4S, v31.s[0] +mla v3.4S, v17.4S, v31.s[0] +add v22.4s, v22.4s, v9.4s +sqrdmulh v9.4S, v8.4S, v23.s[0] +mul v8.4S, v8.4S,v24.s[0] +sub v17.4s, v15.4s, v14.4s +sqrdmulh v2.4S, v6.4S, v23.s[1] +mul v6.4S, v6.4S,v24.s[1] +add v15.4s, v15.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v23.s[2] +mul v15.4S, v15.4S,v24.s[2] +sub v19.4s, v13.4s, v1.4s +add 
v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +sub v11.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +mla v8.4S, v9.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v18.4s, v7.4s +str q13, [x0, #288] +mla v15.4S, v14.4S, v31.s[0] +mla v17.4S, v1.4S, v31.s[0] +add v18.4s, v18.4s, v7.4s +str q19, [x0, #352] +ldr q19, [x0, #944] +sqrdmulh v7.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v1.4s, v16.4s, v3.4s +str q20, [x0, #416] +ldr q20, [x0, #1008] +sqrdmulh v14.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v3.4s +str q11, [x0, #480] +ldr q11, [x0, #816] +sqrdmulh v3.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +sub v13.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +ldr q8, [x0, #880] +sqrdmulh v9.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v12.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +mla v19.4S, v7.4S, v31.s[0] +mla v20.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v15.4s +str q18, [x0, #160] +mla v11.4S, v3.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +str q2, [x0, #224] +ldr q2, [x0, #560] +sqrdmulh v15.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v9.4s, v0.4s, v17.4s +str q16, [x0, #32] +ldr q16, [x0, #624] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +add v0.4s, v0.4s, v17.4s +str q1, [x0, #96] +ldr q1, [x0, #688] +ldr q17, [x0, #432] +sqrdmulh v18.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +sub v7.4s, v17.4s, v19.4s +add v17.4s, v17.4s, v19.4s +ldr q19, [x0, #752] +ldr q6, [x0, #496] +sqrdmulh v5.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v4.4s, v6.4s, v20.4s +add v6.4s, v6.4s, v20.4s +ldr q20, [x0, #304] +mla v2.4S, v15.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v11.4s +str q10, [x0, #544] +mla v1.4S, v18.4S, v31.s[0] +mla v19.4S, v5.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q13, [x0, #608] +ldr q13, [x0, #368] +sqrdmulh v11.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub 
v5.4s, v13.4s, v8.4s +str q21, [x0, #672] +sqrdmulh v21.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +add v13.4s, v13.4s, v8.4s +str q12, [x0, #736] +ldr q12, [x0, #48] +sqrdmulh v8.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v18.4s, v12.4s, v2.4s +add v12.4s, v12.4s, v2.4s +ldr q2, [x0, #112] +sqrdmulh v10.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v15.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +ldr q16, [x0, #176] +mla v17.4S, v11.4S, v31.s[0] +mla v6.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v1.4s +str q22, [x0, #800] +mla v20.4S, v8.4S, v31.s[0] +mla v13.4S, v10.4S, v31.s[0] +add v16.4s, v16.4s, v1.4s +str q14, [x0, #864] +ldr q14, [x0, #240] +sqrdmulh v1.4S, v7.4S, v29.s[2] +mul v7.4S, v7.4S,v30.s[2] +sub v10.4s, v14.4s, v19.4s +str q0, [x0, #928] +sqrdmulh v0.4S, v4.4S, v29.s[2] +mul v4.4S, v4.4S,v30.s[2] +add v14.4s, v14.4s, v19.4s +str q9, [x0, #992] +sqrdmulh v9.4S, v3.4S, v29.s[2] +mul v3.4S, v3.4S,v30.s[2] +sub v19.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v29.s[2] +mul v5.4S, v5.4S,v30.s[2] +sub v8.4s, v14.4s, v6.4s +add v14.4s, v14.4s, v6.4s +mla v7.4S, v1.4S, v31.s[0] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v20.4s +mla v3.4S, v9.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +sub v17.4s, v2.4s, v13.4s +sqrdmulh v9.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +add v2.4s, v2.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v27.s[0] +mul v16.4S, v16.4S,v28.s[0] +sub v1.4s, v21.4s, v7.4s +add v21.4s, v21.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v6.4s, v10.4s, v4.4s +add v10.4s, v10.4s, v4.4s +mla v19.4S, v20.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v3.4s +mla v16.4S, v13.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +sub v7.4s, v15.4s, v5.4s +sqrdmulh v13.4S, v10.4S, v27.s[2] 
+mul v10.4S, v10.4S,v28.s[2] +add v15.4s, v15.4s, v5.4s +sqrdmulh v5.4S, v1.4S, v27.s[3] +mul v1.4S, v1.4S,v28.s[3] +sub v20.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[3] +mul v6.4S, v6.4S,v28.s[3] +sub v4.4s, v17.4s, v8.4s +add v17.4s, v17.4s, v8.4s +mla v21.4S, v3.4S, v31.s[0] +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v16.4s +mla v1.4S, v5.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v25.s[2] +mul v17.4S, v17.4S,v26.s[2] +sub v19.4s, v2.4s, v14.4s +sqrdmulh v5.4S, v4.4S, v25.s[3] +mul v4.4S, v4.4S,v26.s[3] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v3.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +mla v17.4S, v16.4S, v31.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v9.4s, v1.4s +mla v19.4S, v14.4S, v31.s[0] +mla v2.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v23.s[0] +mul v15.4S, v15.4S,v24.s[0] +sub v21.4s, v7.4s, v6.4s +sqrdmulh v14.4S, v8.4S, v23.s[1] +mul v8.4S, v8.4S,v24.s[1] +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v7.4S, v23.s[2] +mul v7.4S, v7.4S,v24.s[2] +sub v16.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v23.s[3] +mul v21.4S, v21.4S,v24.s[3] +sub v10.4s, v20.4s, v4.4s +add v20.4s, v20.4s, v4.4s +mla v15.4S, v1.4S, v31.s[0] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v19.4s +str q0, [x0, #304] +mla v7.4S, v6.4S, v31.s[0] +mla v21.4S, v17.4S, v31.s[0] +add v13.4s, v13.4s, v19.4s +str q16, [x0, #368] +ldr q16, [x0, #896] +sqrdmulh v19.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v17.4s, v12.4s, v2.4s +str q20, [x0, #432] +ldr q20, [x0, #960] +sqrdmulh v6.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v12.4s, v12.4s, v2.4s +str q10, [x0, #496] +ldr q10, [x0, #768] +sqrdmulh v2.4S, v10.4S, v29.s[0] +mul v10.4S, 
v10.4S,v30.s[0] +sub v0.4s, v18.4s, v15.4s +add v18.4s, v18.4s, v15.4s +ldr q15, [x0, #832] +sqrdmulh v1.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +sub v4.4s, v3.4s, v8.4s +add v3.4s, v3.4s, v8.4s +mla v16.4S, v19.4S, v31.s[0] +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v7.4s +str q13, [x0, #176] +mla v10.4S, v2.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v7.4s +str q14, [x0, #240] +ldr q14, [x0, #512] +sqrdmulh v7.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v1.4s, v5.4s, v21.4s +str q12, [x0, #48] +ldr q12, [x0, #576] +sqrdmulh v2.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +add v5.4s, v5.4s, v21.4s +str q17, [x0, #112] +ldr q17, [x0, #640] +ldr q21, [x0, #384] +sqrdmulh v13.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] +sub v19.4s, v21.4s, v16.4s +add v21.4s, v21.4s, v16.4s +ldr q16, [x0, #704] +ldr q8, [x0, #448] +sqrdmulh v22.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v11.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +ldr q20, [x0, #256] +mla v14.4S, v7.4S, v31.s[0] +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v10.4s +str q18, [x0, #560] +mla v17.4S, v13.4S, v31.s[0] +mla v16.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +str q0, [x0, #624] +ldr q0, [x0, #320] +sqrdmulh v10.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v22.4s, v0.4s, v15.4s +str q3, [x0, #688] +sqrdmulh v3.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +add v0.4s, v0.4s, v15.4s +str q4, [x0, #752] +ldr q4, [x0, #0] +sqrdmulh v15.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v13.4s, v4.4s, v14.4s +add v4.4s, v4.4s, v14.4s +ldr q14, [x0, #64] +sqrdmulh v18.4S, v0.4S, v29.s[1] +mul v0.4S, v0.4S,v30.s[1] +sub v7.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +ldr q12, [x0, #128] +mla v21.4S, v10.4S, v31.s[0] +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v12.4s, v17.4s +str q9, [x0, #816] +mla v20.4S, v15.4S, v31.s[0] +mla v0.4S, v18.4S, v31.s[0] +add v12.4s, v12.4s, v17.4s +str q6, [x0, #880] +ldr q6, [x0, #192] 
+sqrdmulh v17.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v18.4s, v6.4s, v16.4s +str q5, [x0, #944] +sqrdmulh v5.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +add v6.4s, v6.4s, v16.4s +str q1, [x0, #1008] +sqrdmulh v1.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v16.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v15.4s, v6.4s, v8.4s +add v6.4s, v6.4s, v8.4s +mla v19.4S, v17.4S, v31.s[0] +mla v11.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v20.4s +mla v2.4S, v1.4S, v31.s[0] +mla v22.4S, v21.4S, v31.s[0] +add v4.4s, v4.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +sub v21.4s, v14.4s, v0.4s +sqrdmulh v1.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v17.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[0] +mul v6.4S, v6.4S,v28.s[0] +sub v8.4s, v18.4s, v11.4s +add v18.4s, v18.4s, v11.4s +mla v16.4S, v20.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v2.4s +mla v12.4S, v0.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v27.s[2] +mul v3.4S, v3.4S,v28.s[2] +sub v19.4s, v7.4s, v22.4s +sqrdmulh v0.4S, v18.4S, v27.s[2] +mul v18.4S, v18.4S,v28.s[2] +add v7.4s, v7.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +sub v20.4s, v5.4s, v16.4s +add v5.4s, v5.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v11.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +mla v3.4S, v2.4S, v31.s[0] +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v4.4s, v12.4s +mla v17.4S, v22.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v4.4s, v4.4s, v12.4s +sqrdmulh v12.4S, v21.4S, v25.s[2] +mul v21.4S, v21.4S,v26.s[2] +sub v16.4s, v14.4s, v6.4s +sqrdmulh v22.4S, v11.4S, v25.s[3] +mul v11.4S, v11.4S,v26.s[3] +add v14.4s, v14.4s, v6.4s +sqrdmulh v6.4S, v16.4S, v25.s[1] 
+mul v16.4S, v16.4S,v26.s[1] +sub v2.4s, v13.4s, v3.4s +add v13.4s, v13.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v25.s[0] +mul v14.4S, v14.4S,v26.s[0] +sub v15.4s, v7.4s, v18.4s +add v7.4s, v7.4s, v18.4s +mla v21.4S, v12.4S, v31.s[0] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v17.4s +mla v16.4S, v6.4S, v31.s[0] +mla v14.4S, v3.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v7.4S, v23.s[0] +mul v7.4S, v7.4S,v24.s[0] +sub v3.4s, v19.4s, v8.4s +sqrdmulh v6.4S, v15.4S, v23.s[1] +mul v15.4S, v15.4S,v24.s[1] +add v19.4s, v19.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v23.s[2] +mul v19.4S, v19.4S,v24.s[2] +sub v12.4s, v5.4s, v21.4s +add v5.4s, v5.4s, v21.4s +sqrdmulh v21.4S, v3.4S, v23.s[3] +mul v3.4S, v3.4S,v24.s[3] +sub v18.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +mla v7.4S, v17.4S, v31.s[0] +mla v15.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v16.4s +str q5, [x0, #256] +mla v19.4S, v8.4S, v31.s[0] +mla v3.4S, v21.4S, v31.s[0] +add v0.4s, v0.4s, v16.4s +str q12, [x0, #320] +ldr q12, [x0, #912] +sqrdmulh v16.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v21.4s, v4.4s, v14.4s +str q20, [x0, #384] +ldr q20, [x0, #976] +sqrdmulh v8.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v4.4s, v4.4s, v14.4s +str q18, [x0, #448] +ldr q18, [x0, #784] +sqrdmulh v14.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +sub v5.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +ldr q7, [x0, #848] +sqrdmulh v17.4S, v7.4S, v29.s[0] +mul v7.4S, v7.4S,v30.s[0] +sub v11.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +mla v12.4S, v16.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v1.4s, v19.4s +str q0, [x0, #128] +mla v18.4S, v14.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v19.4s +str q6, [x0, #192] +ldr q6, [x0, #528] +sqrdmulh v19.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v17.4s, v22.4s, v3.4s +str q4, [x0, #0] +ldr q4, [x0, #592] +sqrdmulh v14.4S, v4.4S, v29.s[0] +mul v4.4S, v4.4S,v30.s[0] +add v22.4s, v22.4s, v3.4s +str q21, [x0, #64] +ldr q21, 
[x0, #656] +ldr q3, [x0, #400] +sqrdmulh v0.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v16.4s, v3.4s, v12.4s +add v3.4s, v3.4s, v12.4s +ldr q12, [x0, #720] +ldr q15, [x0, #464] +sqrdmulh v9.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v10.4s, v15.4s, v20.4s +add v15.4s, v15.4s, v20.4s +ldr q20, [x0, #272] +mla v6.4S, v19.4S, v31.s[0] +mla v4.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v18.4s +str q13, [x0, #512] +mla v21.4S, v0.4S, v31.s[0] +mla v12.4S, v9.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +str q5, [x0, #576] +ldr q5, [x0, #336] +sqrdmulh v18.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v9.4s, v5.4s, v7.4s +str q2, [x0, #640] +sqrdmulh v2.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +add v5.4s, v5.4s, v7.4s +str q11, [x0, #704] +ldr q11, [x0, #16] +sqrdmulh v7.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v0.4s, v11.4s, v6.4s +add v11.4s, v11.4s, v6.4s +ldr q6, [x0, #80] +sqrdmulh v13.4S, v5.4S, v29.s[1] +mul v5.4S, v5.4S,v30.s[1] +sub v19.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +ldr q4, [x0, #144] +mla v3.4S, v18.4S, v31.s[0] +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v4.4s, v21.4s +str q1, [x0, #768] +mla v20.4S, v7.4S, v31.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v4.4s, v4.4s, v21.4s +str q8, [x0, #832] +ldr q8, [x0, #208] +sqrdmulh v21.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +sub v13.4s, v8.4s, v12.4s +str q22, [x0, #896] +sqrdmulh v22.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +add v8.4s, v8.4s, v12.4s +str q17, [x0, #960] +sqrdmulh v17.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v12.4s, v4.4s, v3.4s +add v4.4s, v4.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v7.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +mla v16.4S, v21.4S, v31.s[0] +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v20.4s +mla v14.4S, v17.4S, v31.s[0] +mla v9.4S, v3.4S, v31.s[0] +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v12.4S, v27.s[1] +mul v12.4S, v12.4S,v28.s[1] +sub v3.4s, v6.4s, 
v5.4s +sqrdmulh v17.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v27.s[0] +mul v4.4S, v4.4S,v28.s[0] +sub v21.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[0] +mul v8.4S, v8.4S,v28.s[0] +sub v15.4s, v13.4s, v10.4s +add v13.4s, v13.4s, v10.4s +mla v12.4S, v20.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v14.4s +mla v4.4S, v5.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v16.4s, v19.4s, v9.4s +sqrdmulh v5.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v10.4s, v3.4s, v7.4s +add v3.4s, v3.4s, v7.4s +mla v2.4S, v14.4S, v31.s[0] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v4.4s +mla v21.4S, v9.4S, v31.s[0] +mla v15.4S, v12.4S, v31.s[0] +add v11.4s, v11.4s, v4.4s +sqrdmulh v4.4S, v3.4S, v25.s[2] +mul v3.4S, v3.4S,v26.s[2] +sub v12.4s, v6.4s, v8.4s +sqrdmulh v9.4S, v10.4S, v25.s[3] +mul v10.4S, v10.4S,v26.s[3] +add v6.4s, v6.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +sub v14.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v6.4S, v25.s[0] +mul v6.4S, v6.4S,v26.s[0] +sub v7.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +mla v3.4S, v4.4S, v31.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v21.4s +mla v12.4S, v8.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v23.s[0] +mul v19.4S, v19.4S,v24.s[0] +sub v2.4s, v16.4s, v15.4s +sqrdmulh v8.4S, v7.4S, v23.s[1] +mul v7.4S, v7.4S,v24.s[1] +add v16.4s, v16.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +sub v4.4s, v22.4s, v3.4s +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v2.4S, v23.s[3] +mul v2.4S, 
v2.4S,v24.s[3] +sub v13.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +mla v19.4S, v21.4S, v31.s[0] +mla v7.4S, v8.4S, v31.s[0] +sub v8.4s, v5.4s, v12.4s +str q22, [x0, #272] +mla v16.4S, v15.4S, v31.s[0] +mla v2.4S, v3.4S, v31.s[0] +add v5.4s, v5.4s, v12.4s +str q4, [x0, #336] +sub v23.4s, v11.4s, v6.4s +str q20, [x0, #400] +add v11.4s, v11.4s, v6.4s +str q13, [x0, #464] +sub v13.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sub v19.4s, v14.4s, v7.4s +add v14.4s, v14.4s, v7.4s +sub v7.4s, v17.4s, v16.4s +str q5, [x0, #144] +add v17.4s, v17.4s, v16.4s +str q8, [x0, #208] +sub v8.4s, v9.4s, v2.4s +str q11, [x0, #16] +add v9.4s, v9.4s, v2.4s +str q23, [x0, #80] +str q0, [x0, #528] +str q13, [x0, #592] +str q14, [x0, #656] +str q19, [x0, #720] +str q17, [x0, #784] +str q7, [x0, #848] +str q9, [x0, #912] +str q8, [x0, #976] +ldr q18, [x17, #+128] +ldr q1, [x17, #+144] +ldr q10, [x17, #+160] +ldr q21, [x17, #+176] +ldr q22, [x17, #+192] +ldr q15, [x17, #+208] +ldr q3, [x17, #+224] +ldr q12, [x17, #+240] +ldr q4, [x0, #32] +ldr q30, [x0, #48] +ldr q29, [x0, #0] +ldr q28, [x0, #16] +sqrdmulh v27.4S, v4.4S, v1.s[0] +mul v4.4S, v4.4S,v18.s[0] +mla v4.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v4.4s +add v29.4s, v29.4s, v4.4s +sqrdmulh v4.4S, v30.4S, v1.s[0] +mul v30.4S, v30.4S,v18.s[0] +mla v30.4S, v4.4S, v31.s[0] +sub v4.4s, v28.4s, v30.4s +add v28.4s, v28.4s, v30.4s +sqrdmulh v30.4S, v28.4S, v1.s[1] +mul v28.4S, v28.4S,v18.s[1] +mla v28.4S, v30.4S, v31.s[0] +sub v30.4s, v29.4s, v28.4s +add v29.4s, v29.4s, v28.4s +sqrdmulh v28.4S, v4.4S, v1.s[2] +mul v4.4S, v4.4S,v18.s[2] +mla v4.4S, v28.4S, v31.s[0] +sub v28.4s, v27.4s, v4.4s +add v27.4s, v27.4s, v4.4s +trn1 v4.4S, v29.4S, v30.4S +trn2 v26.4S, v29.4S, v30.4S +trn1 v25.4S, v27.4S, v28.4S +trn2 v24.4S, v27.4S, v28.4S +trn2 v27.2D, v4.2D, v25.2D +trn2 v28.2D, v26.2D, v24.2D +trn1 v29.2D, v4.2D, v25.2D +trn1 v30.2D, v26.2D, v24.2D +sqrdmulh v24.4S, v27.4S, v21.4S +mul v27.4S, v27.4S,v10.4S +mla v27.4S, v24.4S, v31.s[0] +sub 
v24.4s, v29.4s, v27.4s +add v29.4s, v29.4s, v27.4s +sqrdmulh v27.4S, v28.4S, v21.4S +mul v28.4S, v28.4S,v10.4S +mla v28.4S, v27.4S, v31.s[0] +sub v27.4s, v30.4s, v28.4s +add v30.4s, v30.4s, v28.4s +sqrdmulh v28.4S, v30.4S, v15.4S +mul v30.4S, v30.4S,v22.4S +mla v30.4S, v28.4S, v31.s[0] +sub v28.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +sqrdmulh v30.4S, v27.4S, v12.4S +mul v27.4S, v27.4S,v3.4S +mla v27.4S, v30.4S, v31.s[0] +sub v30.4s, v24.4s, v27.4s +add v24.4s, v24.4s, v27.4s +str q29, [x0, #0] +str q28, [x0, #16] +str q24, [x0, #32] +str q30, [x0, #48] +ldr q30, [x17, #+256] +ldr q24, [x17, #+272] +ldr q28, [x17, #+288] +ldr q29, [x17, #+304] +ldr q27, [x17, #+320] +ldr q26, [x17, #+336] +ldr q25, [x17, #+352] +ldr q4, [x17, #+368] +ldr q12, [x0, #96] +ldr q3, [x0, #112] +ldr q15, [x0, #64] +ldr q22, [x0, #80] +sqrdmulh v21.4S, v12.4S, v24.s[0] +mul v12.4S, v12.4S,v30.s[0] +mla v12.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v12.4s +add v15.4s, v15.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v24.s[0] +mul v3.4S, v3.4S,v30.s[0] +mla v3.4S, v12.4S, v31.s[0] +sub v12.4s, v22.4s, v3.4s +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v22.4S, v24.s[1] +mul v22.4S, v22.4S,v30.s[1] +mla v22.4S, v3.4S, v31.s[0] +sub v3.4s, v15.4s, v22.4s +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v12.4S, v24.s[2] +mul v12.4S, v12.4S,v30.s[2] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +trn1 v12.4S, v15.4S, v3.4S +trn2 v10.4S, v15.4S, v3.4S +trn1 v1.4S, v21.4S, v22.4S +trn2 v18.4S, v21.4S, v22.4S +trn2 v21.2D, v12.2D, v1.2D +trn2 v22.2D, v10.2D, v18.2D +trn1 v15.2D, v12.2D, v1.2D +trn1 v3.2D, v10.2D, v18.2D +sqrdmulh v18.4S, v21.4S, v29.4S +mul v21.4S, v21.4S,v28.4S +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v15.4s, v21.4s +add v15.4s, v15.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.4S +mul v22.4S, v22.4S,v28.4S +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v3.4s, v22.4s +add v3.4s, v3.4s, v22.4s +sqrdmulh v22.4S, v3.4S, v26.4S +mul v3.4S, v3.4S,v27.4S +mla 
v3.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v3.4s +add v15.4s, v15.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v4.4S +mul v21.4S, v21.4S,v25.4S +mla v21.4S, v3.4S, v31.s[0] +sub v3.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +str q15, [x0, #64] +str q22, [x0, #80] +str q18, [x0, #96] +str q3, [x0, #112] +ldr q3, [x17, #+384] +ldr q18, [x17, #+400] +ldr q22, [x17, #+416] +ldr q15, [x17, #+432] +ldr q21, [x17, #+448] +ldr q10, [x17, #+464] +ldr q1, [x17, #+480] +ldr q12, [x17, #+496] +ldr q4, [x0, #160] +ldr q25, [x0, #176] +ldr q26, [x0, #128] +ldr q27, [x0, #144] +sqrdmulh v29.4S, v4.4S, v18.s[0] +mul v4.4S, v4.4S,v3.s[0] +mla v4.4S, v29.4S, v31.s[0] +sub v29.4s, v26.4s, v4.4s +add v26.4s, v26.4s, v4.4s +sqrdmulh v4.4S, v25.4S, v18.s[0] +mul v25.4S, v25.4S,v3.s[0] +mla v25.4S, v4.4S, v31.s[0] +sub v4.4s, v27.4s, v25.4s +add v27.4s, v27.4s, v25.4s +sqrdmulh v25.4S, v27.4S, v18.s[1] +mul v27.4S, v27.4S,v3.s[1] +mla v27.4S, v25.4S, v31.s[0] +sub v25.4s, v26.4s, v27.4s +add v26.4s, v26.4s, v27.4s +sqrdmulh v27.4S, v4.4S, v18.s[2] +mul v4.4S, v4.4S,v3.s[2] +mla v4.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v4.4s +add v29.4s, v29.4s, v4.4s +trn1 v4.4S, v26.4S, v25.4S +trn2 v28.4S, v26.4S, v25.4S +trn1 v24.4S, v29.4S, v27.4S +trn2 v30.4S, v29.4S, v27.4S +trn2 v29.2D, v4.2D, v24.2D +trn2 v27.2D, v28.2D, v30.2D +trn1 v26.2D, v4.2D, v24.2D +trn1 v25.2D, v28.2D, v30.2D +sqrdmulh v30.4S, v29.4S, v15.4S +mul v29.4S, v29.4S,v22.4S +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v26.4s, v29.4s +add v26.4s, v26.4s, v29.4s +sqrdmulh v29.4S, v27.4S, v15.4S +mul v27.4S, v27.4S,v22.4S +mla v27.4S, v29.4S, v31.s[0] +sub v29.4s, v25.4s, v27.4s +add v25.4s, v25.4s, v27.4s +sqrdmulh v27.4S, v25.4S, v10.4S +mul v25.4S, v25.4S,v21.4S +mla v25.4S, v27.4S, v31.s[0] +sub v27.4s, v26.4s, v25.4s +add v26.4s, v26.4s, v25.4s +sqrdmulh v25.4S, v29.4S, v12.4S +mul v29.4S, v29.4S,v1.4S +mla v29.4S, v25.4S, v31.s[0] +sub v25.4s, v30.4s, v29.4s +add v30.4s, v30.4s, v29.4s +str q26, [x0, #128] +str q27, [x0, #144] 
+str q30, [x0, #160] +str q25, [x0, #176] +ldr q25, [x17, #+512] +ldr q30, [x17, #+528] +ldr q27, [x17, #+544] +ldr q26, [x17, #+560] +ldr q29, [x17, #+576] +ldr q28, [x17, #+592] +ldr q24, [x17, #+608] +ldr q4, [x17, #+624] +ldr q12, [x0, #224] +ldr q1, [x0, #240] +ldr q10, [x0, #192] +ldr q21, [x0, #208] +sqrdmulh v15.4S, v12.4S, v30.s[0] +mul v12.4S, v12.4S,v25.s[0] +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v1.4S, v30.s[0] +mul v1.4S, v1.4S,v25.s[0] +mla v1.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v1.4s +add v21.4s, v21.4s, v1.4s +sqrdmulh v1.4S, v21.4S, v30.s[1] +mul v21.4S, v21.4S,v25.s[1] +mla v21.4S, v1.4S, v31.s[0] +sub v1.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v12.4S, v30.s[2] +mul v12.4S, v12.4S,v25.s[2] +mla v12.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v12.4s +add v15.4s, v15.4s, v12.4s +trn1 v12.4S, v10.4S, v1.4S +trn2 v22.4S, v10.4S, v1.4S +trn1 v18.4S, v15.4S, v21.4S +trn2 v3.4S, v15.4S, v21.4S +trn2 v15.2D, v12.2D, v18.2D +trn2 v21.2D, v22.2D, v3.2D +trn1 v10.2D, v12.2D, v18.2D +trn1 v1.2D, v22.2D, v3.2D +sqrdmulh v3.4S, v15.4S, v26.4S +mul v15.4S, v15.4S,v27.4S +mla v15.4S, v3.4S, v31.s[0] +sub v3.4s, v10.4s, v15.4s +add v10.4s, v10.4s, v15.4s +sqrdmulh v15.4S, v21.4S, v26.4S +mul v21.4S, v21.4S,v27.4S +mla v21.4S, v15.4S, v31.s[0] +sub v15.4s, v1.4s, v21.4s +add v1.4s, v1.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v28.4S +mul v1.4S, v1.4S,v29.4S +mla v1.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v1.4s +add v10.4s, v10.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v4.4S +mul v15.4S, v15.4S,v24.4S +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v3.4s, v15.4s +add v3.4s, v3.4s, v15.4s +str q10, [x0, #192] +str q21, [x0, #208] +str q3, [x0, #224] +str q1, [x0, #240] +ldr q1, [x17, #+640] +ldr q3, [x17, #+656] +ldr q21, [x17, #+672] +ldr q10, [x17, #+688] +ldr q15, [x17, #+704] +ldr q22, [x17, #+720] +ldr q18, [x17, #+736] +ldr q12, [x17, #+752] +ldr q4, [x0, #288] +ldr q24, [x0, #304] +ldr 
q28, [x0, #256] +ldr q29, [x0, #272] +sqrdmulh v26.4S, v4.4S, v3.s[0] +mul v4.4S, v4.4S,v1.s[0] +mla v4.4S, v26.4S, v31.s[0] +sub v26.4s, v28.4s, v4.4s +add v28.4s, v28.4s, v4.4s +sqrdmulh v4.4S, v24.4S, v3.s[0] +mul v24.4S, v24.4S,v1.s[0] +mla v24.4S, v4.4S, v31.s[0] +sub v4.4s, v29.4s, v24.4s +add v29.4s, v29.4s, v24.4s +sqrdmulh v24.4S, v29.4S, v3.s[1] +mul v29.4S, v29.4S,v1.s[1] +mla v29.4S, v24.4S, v31.s[0] +sub v24.4s, v28.4s, v29.4s +add v28.4s, v28.4s, v29.4s +sqrdmulh v29.4S, v4.4S, v3.s[2] +mul v4.4S, v4.4S,v1.s[2] +mla v4.4S, v29.4S, v31.s[0] +sub v29.4s, v26.4s, v4.4s +add v26.4s, v26.4s, v4.4s +trn1 v4.4S, v28.4S, v24.4S +trn2 v27.4S, v28.4S, v24.4S +trn1 v30.4S, v26.4S, v29.4S +trn2 v25.4S, v26.4S, v29.4S +trn2 v26.2D, v4.2D, v30.2D +trn2 v29.2D, v27.2D, v25.2D +trn1 v28.2D, v4.2D, v30.2D +trn1 v24.2D, v27.2D, v25.2D +sqrdmulh v25.4S, v26.4S, v10.4S +mul v26.4S, v26.4S,v21.4S +mla v26.4S, v25.4S, v31.s[0] +sub v25.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sqrdmulh v26.4S, v29.4S, v10.4S +mul v29.4S, v29.4S,v21.4S +mla v29.4S, v26.4S, v31.s[0] +sub v26.4s, v24.4s, v29.4s +add v24.4s, v24.4s, v29.4s +sqrdmulh v29.4S, v24.4S, v22.4S +mul v24.4S, v24.4S,v15.4S +mla v24.4S, v29.4S, v31.s[0] +sub v29.4s, v28.4s, v24.4s +add v28.4s, v28.4s, v24.4s +sqrdmulh v24.4S, v26.4S, v12.4S +mul v26.4S, v26.4S,v18.4S +mla v26.4S, v24.4S, v31.s[0] +sub v24.4s, v25.4s, v26.4s +add v25.4s, v25.4s, v26.4s +str q28, [x0, #256] +str q29, [x0, #272] +str q25, [x0, #288] +str q24, [x0, #304] +ldr q24, [x17, #+768] +ldr q25, [x17, #+784] +ldr q29, [x17, #+800] +ldr q28, [x17, #+816] +ldr q26, [x17, #+832] +ldr q27, [x17, #+848] +ldr q30, [x17, #+864] +ldr q4, [x17, #+880] +ldr q12, [x0, #352] +ldr q18, [x0, #368] +ldr q22, [x0, #320] +ldr q15, [x0, #336] +sqrdmulh v10.4S, v12.4S, v25.s[0] +mul v12.4S, v12.4S,v24.s[0] +mla v12.4S, v10.4S, v31.s[0] +sub v10.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v18.4S, v25.s[0] +mul v18.4S, v18.4S,v24.s[0] +mla 
v18.4S, v12.4S, v31.s[0] +sub v12.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sqrdmulh v18.4S, v15.4S, v25.s[1] +mul v15.4S, v15.4S,v24.s[1] +mla v15.4S, v18.4S, v31.s[0] +sub v18.4s, v22.4s, v15.4s +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v12.4S, v25.s[2] +mul v12.4S, v12.4S,v24.s[2] +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +trn1 v12.4S, v22.4S, v18.4S +trn2 v21.4S, v22.4S, v18.4S +trn1 v3.4S, v10.4S, v15.4S +trn2 v1.4S, v10.4S, v15.4S +trn2 v10.2D, v12.2D, v3.2D +trn2 v15.2D, v21.2D, v1.2D +trn1 v22.2D, v12.2D, v3.2D +trn1 v18.2D, v21.2D, v1.2D +sqrdmulh v1.4S, v10.4S, v28.4S +mul v10.4S, v10.4S,v29.4S +mla v10.4S, v1.4S, v31.s[0] +sub v1.4s, v22.4s, v10.4s +add v22.4s, v22.4s, v10.4s +sqrdmulh v10.4S, v15.4S, v28.4S +mul v15.4S, v15.4S,v29.4S +mla v15.4S, v10.4S, v31.s[0] +sub v10.4s, v18.4s, v15.4s +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v18.4S, v27.4S +mul v18.4S, v18.4S,v26.4S +mla v18.4S, v15.4S, v31.s[0] +sub v15.4s, v22.4s, v18.4s +add v22.4s, v22.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v4.4S +mul v10.4S, v10.4S,v30.4S +mla v10.4S, v18.4S, v31.s[0] +sub v18.4s, v1.4s, v10.4s +add v1.4s, v1.4s, v10.4s +str q22, [x0, #320] +str q15, [x0, #336] +str q1, [x0, #352] +str q18, [x0, #368] +ldr q18, [x17, #+896] +ldr q1, [x17, #+912] +ldr q15, [x17, #+928] +ldr q22, [x17, #+944] +ldr q10, [x17, #+960] +ldr q21, [x17, #+976] +ldr q3, [x17, #+992] +ldr q12, [x17, #+1008] +ldr q4, [x0, #416] +ldr q30, [x0, #432] +ldr q27, [x0, #384] +ldr q26, [x0, #400] +sqrdmulh v28.4S, v4.4S, v1.s[0] +mul v4.4S, v4.4S,v18.s[0] +mla v4.4S, v28.4S, v31.s[0] +sub v28.4s, v27.4s, v4.4s +add v27.4s, v27.4s, v4.4s +sqrdmulh v4.4S, v30.4S, v1.s[0] +mul v30.4S, v30.4S,v18.s[0] +mla v30.4S, v4.4S, v31.s[0] +sub v4.4s, v26.4s, v30.4s +add v26.4s, v26.4s, v30.4s +sqrdmulh v30.4S, v26.4S, v1.s[1] +mul v26.4S, v26.4S,v18.s[1] +mla v26.4S, v30.4S, v31.s[0] +sub v30.4s, v27.4s, v26.4s +add v27.4s, v27.4s, v26.4s +sqrdmulh v26.4S, v4.4S, 
v1.s[2] +mul v4.4S, v4.4S,v18.s[2] +mla v4.4S, v26.4S, v31.s[0] +sub v26.4s, v28.4s, v4.4s +add v28.4s, v28.4s, v4.4s +trn1 v4.4S, v27.4S, v30.4S +trn2 v29.4S, v27.4S, v30.4S +trn1 v25.4S, v28.4S, v26.4S +trn2 v24.4S, v28.4S, v26.4S +trn2 v28.2D, v4.2D, v25.2D +trn2 v26.2D, v29.2D, v24.2D +trn1 v27.2D, v4.2D, v25.2D +trn1 v30.2D, v29.2D, v24.2D +sqrdmulh v24.4S, v28.4S, v22.4S +mul v28.4S, v28.4S,v15.4S +mla v28.4S, v24.4S, v31.s[0] +sub v24.4s, v27.4s, v28.4s +add v27.4s, v27.4s, v28.4s +sqrdmulh v28.4S, v26.4S, v22.4S +mul v26.4S, v26.4S,v15.4S +mla v26.4S, v28.4S, v31.s[0] +sub v28.4s, v30.4s, v26.4s +add v30.4s, v30.4s, v26.4s +sqrdmulh v26.4S, v30.4S, v21.4S +mul v30.4S, v30.4S,v10.4S +mla v30.4S, v26.4S, v31.s[0] +sub v26.4s, v27.4s, v30.4s +add v27.4s, v27.4s, v30.4s +sqrdmulh v30.4S, v28.4S, v12.4S +mul v28.4S, v28.4S,v3.4S +mla v28.4S, v30.4S, v31.s[0] +sub v30.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +str q27, [x0, #384] +str q26, [x0, #400] +str q24, [x0, #416] +str q30, [x0, #432] +ldr q30, [x17, #+1024] +ldr q24, [x17, #+1040] +ldr q26, [x17, #+1056] +ldr q27, [x17, #+1072] +ldr q28, [x17, #+1088] +ldr q29, [x17, #+1104] +ldr q25, [x17, #+1120] +ldr q4, [x17, #+1136] +ldr q12, [x0, #480] +ldr q3, [x0, #496] +ldr q21, [x0, #448] +ldr q10, [x0, #464] +sqrdmulh v22.4S, v12.4S, v24.s[0] +mul v12.4S, v12.4S,v30.s[0] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v24.s[0] +mul v3.4S, v3.4S,v30.s[0] +mla v3.4S, v12.4S, v31.s[0] +sub v12.4s, v10.4s, v3.4s +add v10.4s, v10.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v24.s[1] +mul v10.4S, v10.4S,v30.s[1] +mla v10.4S, v3.4S, v31.s[0] +sub v3.4s, v21.4s, v10.4s +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v12.4S, v24.s[2] +mul v12.4S, v12.4S,v30.s[2] +mla v12.4S, v10.4S, v31.s[0] +sub v10.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +trn1 v12.4S, v21.4S, v3.4S +trn2 v15.4S, v21.4S, v3.4S +trn1 v1.4S, v22.4S, v10.4S +trn2 v18.4S, v22.4S, v10.4S +trn2 
v22.2D, v12.2D, v1.2D +trn2 v10.2D, v15.2D, v18.2D +trn1 v21.2D, v12.2D, v1.2D +trn1 v3.2D, v15.2D, v18.2D +sqrdmulh v18.4S, v22.4S, v27.4S +mul v22.4S, v22.4S,v26.4S +mla v22.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v10.4S, v27.4S +mul v10.4S, v10.4S,v26.4S +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v3.4s, v10.4s +add v3.4s, v3.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v29.4S +mul v3.4S, v3.4S,v28.4S +mla v3.4S, v10.4S, v31.s[0] +sub v10.4s, v21.4s, v3.4s +add v21.4s, v21.4s, v3.4s +sqrdmulh v3.4S, v22.4S, v4.4S +mul v22.4S, v22.4S,v25.4S +mla v22.4S, v3.4S, v31.s[0] +sub v3.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +str q21, [x0, #448] +str q10, [x0, #464] +str q18, [x0, #480] +str q3, [x0, #496] +ldr q3, [x17, #+1152] +ldr q18, [x17, #+1168] +ldr q10, [x17, #+1184] +ldr q21, [x17, #+1200] +ldr q22, [x17, #+1216] +ldr q15, [x17, #+1232] +ldr q1, [x17, #+1248] +ldr q12, [x17, #+1264] +ldr q4, [x0, #544] +ldr q25, [x0, #560] +ldr q29, [x0, #512] +ldr q28, [x0, #528] +sqrdmulh v27.4S, v4.4S, v18.s[0] +mul v4.4S, v4.4S,v3.s[0] +mla v4.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v4.4s +add v29.4s, v29.4s, v4.4s +sqrdmulh v4.4S, v25.4S, v18.s[0] +mul v25.4S, v25.4S,v3.s[0] +mla v25.4S, v4.4S, v31.s[0] +sub v4.4s, v28.4s, v25.4s +add v28.4s, v28.4s, v25.4s +sqrdmulh v25.4S, v28.4S, v18.s[1] +mul v28.4S, v28.4S,v3.s[1] +mla v28.4S, v25.4S, v31.s[0] +sub v25.4s, v29.4s, v28.4s +add v29.4s, v29.4s, v28.4s +sqrdmulh v28.4S, v4.4S, v18.s[2] +mul v4.4S, v4.4S,v3.s[2] +mla v4.4S, v28.4S, v31.s[0] +sub v28.4s, v27.4s, v4.4s +add v27.4s, v27.4s, v4.4s +trn1 v4.4S, v29.4S, v25.4S +trn2 v26.4S, v29.4S, v25.4S +trn1 v24.4S, v27.4S, v28.4S +trn2 v30.4S, v27.4S, v28.4S +trn2 v27.2D, v4.2D, v24.2D +trn2 v28.2D, v26.2D, v30.2D +trn1 v29.2D, v4.2D, v24.2D +trn1 v25.2D, v26.2D, v30.2D +sqrdmulh v30.4S, v27.4S, v21.4S +mul v27.4S, v27.4S,v10.4S +mla v27.4S, v30.4S, v31.s[0] +sub v30.4s, v29.4s, v27.4s +add v29.4s, v29.4s, v27.4s +sqrdmulh 
v27.4S, v28.4S, v21.4S +mul v28.4S, v28.4S,v10.4S +mla v28.4S, v27.4S, v31.s[0] +sub v27.4s, v25.4s, v28.4s +add v25.4s, v25.4s, v28.4s +sqrdmulh v28.4S, v25.4S, v15.4S +mul v25.4S, v25.4S,v22.4S +mla v25.4S, v28.4S, v31.s[0] +sub v28.4s, v29.4s, v25.4s +add v29.4s, v29.4s, v25.4s +sqrdmulh v25.4S, v27.4S, v12.4S +mul v27.4S, v27.4S,v1.4S +mla v27.4S, v25.4S, v31.s[0] +sub v25.4s, v30.4s, v27.4s +add v30.4s, v30.4s, v27.4s +str q29, [x0, #512] +str q28, [x0, #528] +str q30, [x0, #544] +str q25, [x0, #560] +ldr q25, [x17, #+1280] +ldr q30, [x17, #+1296] +ldr q28, [x17, #+1312] +ldr q29, [x17, #+1328] +ldr q27, [x17, #+1344] +ldr q26, [x17, #+1360] +ldr q24, [x17, #+1376] +ldr q4, [x17, #+1392] +ldr q12, [x0, #608] +ldr q1, [x0, #624] +ldr q15, [x0, #576] +ldr q22, [x0, #592] +sqrdmulh v21.4S, v12.4S, v30.s[0] +mul v12.4S, v12.4S,v25.s[0] +mla v12.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v12.4s +add v15.4s, v15.4s, v12.4s +sqrdmulh v12.4S, v1.4S, v30.s[0] +mul v1.4S, v1.4S,v25.s[0] +mla v1.4S, v12.4S, v31.s[0] +sub v12.4s, v22.4s, v1.4s +add v22.4s, v22.4s, v1.4s +sqrdmulh v1.4S, v22.4S, v30.s[1] +mul v22.4S, v22.4S,v25.s[1] +mla v22.4S, v1.4S, v31.s[0] +sub v1.4s, v15.4s, v22.4s +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v12.4S, v30.s[2] +mul v12.4S, v12.4S,v25.s[2] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +trn1 v12.4S, v15.4S, v1.4S +trn2 v10.4S, v15.4S, v1.4S +trn1 v18.4S, v21.4S, v22.4S +trn2 v3.4S, v21.4S, v22.4S +trn2 v21.2D, v12.2D, v18.2D +trn2 v22.2D, v10.2D, v3.2D +trn1 v15.2D, v12.2D, v18.2D +trn1 v1.2D, v10.2D, v3.2D +sqrdmulh v3.4S, v21.4S, v29.4S +mul v21.4S, v21.4S,v28.4S +mla v21.4S, v3.4S, v31.s[0] +sub v3.4s, v15.4s, v21.4s +add v15.4s, v15.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.4S +mul v22.4S, v22.4S,v28.4S +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v26.4S +mul v1.4S, v1.4S,v27.4S +mla v1.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, 
v1.4s +add v15.4s, v15.4s, v1.4s +sqrdmulh v1.4S, v21.4S, v4.4S +mul v21.4S, v21.4S,v24.4S +mla v21.4S, v1.4S, v31.s[0] +sub v1.4s, v3.4s, v21.4s +add v3.4s, v3.4s, v21.4s +str q15, [x0, #576] +str q22, [x0, #592] +str q3, [x0, #608] +str q1, [x0, #624] +ldr q1, [x17, #+1408] +ldr q3, [x17, #+1424] +ldr q22, [x17, #+1440] +ldr q15, [x17, #+1456] +ldr q21, [x17, #+1472] +ldr q10, [x17, #+1488] +ldr q18, [x17, #+1504] +ldr q12, [x17, #+1520] +ldr q4, [x0, #672] +ldr q24, [x0, #688] +ldr q26, [x0, #640] +ldr q27, [x0, #656] +sqrdmulh v29.4S, v4.4S, v3.s[0] +mul v4.4S, v4.4S,v1.s[0] +mla v4.4S, v29.4S, v31.s[0] +sub v29.4s, v26.4s, v4.4s +add v26.4s, v26.4s, v4.4s +sqrdmulh v4.4S, v24.4S, v3.s[0] +mul v24.4S, v24.4S,v1.s[0] +mla v24.4S, v4.4S, v31.s[0] +sub v4.4s, v27.4s, v24.4s +add v27.4s, v27.4s, v24.4s +sqrdmulh v24.4S, v27.4S, v3.s[1] +mul v27.4S, v27.4S,v1.s[1] +mla v27.4S, v24.4S, v31.s[0] +sub v24.4s, v26.4s, v27.4s +add v26.4s, v26.4s, v27.4s +sqrdmulh v27.4S, v4.4S, v3.s[2] +mul v4.4S, v4.4S,v1.s[2] +mla v4.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v4.4s +add v29.4s, v29.4s, v4.4s +trn1 v4.4S, v26.4S, v24.4S +trn2 v28.4S, v26.4S, v24.4S +trn1 v30.4S, v29.4S, v27.4S +trn2 v25.4S, v29.4S, v27.4S +trn2 v29.2D, v4.2D, v30.2D +trn2 v27.2D, v28.2D, v25.2D +trn1 v26.2D, v4.2D, v30.2D +trn1 v24.2D, v28.2D, v25.2D +sqrdmulh v25.4S, v29.4S, v15.4S +mul v29.4S, v29.4S,v22.4S +mla v29.4S, v25.4S, v31.s[0] +sub v25.4s, v26.4s, v29.4s +add v26.4s, v26.4s, v29.4s +sqrdmulh v29.4S, v27.4S, v15.4S +mul v27.4S, v27.4S,v22.4S +mla v27.4S, v29.4S, v31.s[0] +sub v29.4s, v24.4s, v27.4s +add v24.4s, v24.4s, v27.4s +sqrdmulh v27.4S, v24.4S, v10.4S +mul v24.4S, v24.4S,v21.4S +mla v24.4S, v27.4S, v31.s[0] +sub v27.4s, v26.4s, v24.4s +add v26.4s, v26.4s, v24.4s +sqrdmulh v24.4S, v29.4S, v12.4S +mul v29.4S, v29.4S,v18.4S +mla v29.4S, v24.4S, v31.s[0] +sub v24.4s, v25.4s, v29.4s +add v25.4s, v25.4s, v29.4s +str q26, [x0, #640] +str q27, [x0, #656] +str q25, [x0, #672] +str q24, [x0, 
#688] +ldr q24, [x17, #+1536] +ldr q25, [x17, #+1552] +ldr q27, [x17, #+1568] +ldr q26, [x17, #+1584] +ldr q29, [x17, #+1600] +ldr q28, [x17, #+1616] +ldr q30, [x17, #+1632] +ldr q4, [x17, #+1648] +ldr q12, [x0, #736] +ldr q18, [x0, #752] +ldr q10, [x0, #704] +ldr q21, [x0, #720] +sqrdmulh v15.4S, v12.4S, v25.s[0] +mul v12.4S, v12.4S,v24.s[0] +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v18.4S, v25.s[0] +mul v18.4S, v18.4S,v24.s[0] +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v18.4s +add v21.4s, v21.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v25.s[1] +mul v21.4S, v21.4S,v24.s[1] +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v12.4S, v25.s[2] +mul v12.4S, v12.4S,v24.s[2] +mla v12.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v12.4s +add v15.4s, v15.4s, v12.4s +trn1 v12.4S, v10.4S, v18.4S +trn2 v22.4S, v10.4S, v18.4S +trn1 v3.4S, v15.4S, v21.4S +trn2 v1.4S, v15.4S, v21.4S +trn2 v15.2D, v12.2D, v3.2D +trn2 v21.2D, v22.2D, v1.2D +trn1 v10.2D, v12.2D, v3.2D +trn1 v18.2D, v22.2D, v1.2D +sqrdmulh v1.4S, v15.4S, v26.4S +mul v15.4S, v15.4S,v27.4S +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v10.4s, v15.4s +add v10.4s, v10.4s, v15.4s +sqrdmulh v15.4S, v21.4S, v26.4S +mul v21.4S, v21.4S,v27.4S +mla v21.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v18.4S, v28.4S +mul v18.4S, v18.4S,v29.4S +mla v18.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v18.4s +add v10.4s, v10.4s, v18.4s +sqrdmulh v18.4S, v15.4S, v4.4S +mul v15.4S, v15.4S,v30.4S +mla v15.4S, v18.4S, v31.s[0] +sub v18.4s, v1.4s, v15.4s +add v1.4s, v1.4s, v15.4s +str q10, [x0, #704] +str q21, [x0, #720] +str q1, [x0, #736] +str q18, [x0, #752] +ldr q18, [x17, #+1664] +ldr q1, [x17, #+1680] +ldr q21, [x17, #+1696] +ldr q10, [x17, #+1712] +ldr q15, [x17, #+1728] +ldr q22, [x17, #+1744] +ldr q3, [x17, #+1760] +ldr q12, [x17, #+1776] +ldr q4, [x0, #800] +ldr q30, [x0, #816] 
+ldr q28, [x0, #768] +ldr q29, [x0, #784] +sqrdmulh v26.4S, v4.4S, v1.s[0] +mul v4.4S, v4.4S,v18.s[0] +mla v4.4S, v26.4S, v31.s[0] +sub v26.4s, v28.4s, v4.4s +add v28.4s, v28.4s, v4.4s +sqrdmulh v4.4S, v30.4S, v1.s[0] +mul v30.4S, v30.4S,v18.s[0] +mla v30.4S, v4.4S, v31.s[0] +sub v4.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +sqrdmulh v30.4S, v29.4S, v1.s[1] +mul v29.4S, v29.4S,v18.s[1] +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v28.4s, v29.4s +add v28.4s, v28.4s, v29.4s +sqrdmulh v29.4S, v4.4S, v1.s[2] +mul v4.4S, v4.4S,v18.s[2] +mla v4.4S, v29.4S, v31.s[0] +sub v29.4s, v26.4s, v4.4s +add v26.4s, v26.4s, v4.4s +trn1 v4.4S, v28.4S, v30.4S +trn2 v27.4S, v28.4S, v30.4S +trn1 v25.4S, v26.4S, v29.4S +trn2 v24.4S, v26.4S, v29.4S +trn2 v26.2D, v4.2D, v25.2D +trn2 v29.2D, v27.2D, v24.2D +trn1 v28.2D, v4.2D, v25.2D +trn1 v30.2D, v27.2D, v24.2D +sqrdmulh v24.4S, v26.4S, v10.4S +mul v26.4S, v26.4S,v21.4S +mla v26.4S, v24.4S, v31.s[0] +sub v24.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sqrdmulh v26.4S, v29.4S, v10.4S +mul v29.4S, v29.4S,v21.4S +mla v29.4S, v26.4S, v31.s[0] +sub v26.4s, v30.4s, v29.4s +add v30.4s, v30.4s, v29.4s +sqrdmulh v29.4S, v30.4S, v22.4S +mul v30.4S, v30.4S,v15.4S +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v28.4s, v30.4s +add v28.4s, v28.4s, v30.4s +sqrdmulh v30.4S, v26.4S, v12.4S +mul v26.4S, v26.4S,v3.4S +mla v26.4S, v30.4S, v31.s[0] +sub v30.4s, v24.4s, v26.4s +add v24.4s, v24.4s, v26.4s +str q28, [x0, #768] +str q29, [x0, #784] +str q24, [x0, #800] +str q30, [x0, #816] +ldr q30, [x17, #+1792] +ldr q24, [x17, #+1808] +ldr q29, [x17, #+1824] +ldr q28, [x17, #+1840] +ldr q26, [x17, #+1856] +ldr q27, [x17, #+1872] +ldr q25, [x17, #+1888] +ldr q4, [x17, #+1904] +ldr q12, [x0, #864] +ldr q3, [x0, #880] +ldr q22, [x0, #832] +ldr q15, [x0, #848] +sqrdmulh v10.4S, v12.4S, v24.s[0] +mul v12.4S, v12.4S,v30.s[0] +mla v12.4S, v10.4S, v31.s[0] +sub v10.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v24.s[0] +mul v3.4S, 
v3.4S,v30.s[0] +mla v3.4S, v12.4S, v31.s[0] +sub v12.4s, v15.4s, v3.4s +add v15.4s, v15.4s, v3.4s +sqrdmulh v3.4S, v15.4S, v24.s[1] +mul v15.4S, v15.4S,v30.s[1] +mla v15.4S, v3.4S, v31.s[0] +sub v3.4s, v22.4s, v15.4s +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v12.4S, v24.s[2] +mul v12.4S, v12.4S,v30.s[2] +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +trn1 v12.4S, v22.4S, v3.4S +trn2 v21.4S, v22.4S, v3.4S +trn1 v1.4S, v10.4S, v15.4S +trn2 v18.4S, v10.4S, v15.4S +trn2 v10.2D, v12.2D, v1.2D +trn2 v15.2D, v21.2D, v18.2D +trn1 v22.2D, v12.2D, v1.2D +trn1 v3.2D, v21.2D, v18.2D +sqrdmulh v18.4S, v10.4S, v28.4S +mul v10.4S, v10.4S,v29.4S +mla v10.4S, v18.4S, v31.s[0] +sub v18.4s, v22.4s, v10.4s +add v22.4s, v22.4s, v10.4s +sqrdmulh v10.4S, v15.4S, v28.4S +mul v15.4S, v15.4S,v29.4S +mla v15.4S, v10.4S, v31.s[0] +sub v10.4s, v3.4s, v15.4s +add v3.4s, v3.4s, v15.4s +sqrdmulh v15.4S, v3.4S, v27.4S +mul v3.4S, v3.4S,v26.4S +mla v3.4S, v15.4S, v31.s[0] +sub v15.4s, v22.4s, v3.4s +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v4.4S +mul v10.4S, v10.4S,v25.4S +mla v10.4S, v3.4S, v31.s[0] +sub v3.4s, v18.4s, v10.4s +add v18.4s, v18.4s, v10.4s +str q22, [x0, #832] +str q15, [x0, #848] +str q18, [x0, #864] +str q3, [x0, #880] +ldr q3, [x17, #+1920] +ldr q18, [x17, #+1936] +ldr q15, [x17, #+1952] +ldr q22, [x17, #+1968] +ldr q10, [x17, #+1984] +ldr q21, [x17, #+2000] +ldr q1, [x17, #+2016] +ldr q12, [x17, #+2032] +ldr q4, [x0, #928] +ldr q25, [x0, #944] +ldr q27, [x0, #896] +ldr q26, [x0, #912] +sqrdmulh v28.4S, v4.4S, v18.s[0] +mul v4.4S, v4.4S,v3.s[0] +mla v4.4S, v28.4S, v31.s[0] +sub v28.4s, v27.4s, v4.4s +add v27.4s, v27.4s, v4.4s +sqrdmulh v4.4S, v25.4S, v18.s[0] +mul v25.4S, v25.4S,v3.s[0] +mla v25.4S, v4.4S, v31.s[0] +sub v4.4s, v26.4s, v25.4s +add v26.4s, v26.4s, v25.4s +sqrdmulh v25.4S, v26.4S, v18.s[1] +mul v26.4S, v26.4S,v3.s[1] +mla v26.4S, v25.4S, v31.s[0] +sub v25.4s, v27.4s, v26.4s +add v27.4s, v27.4s, v26.4s +sqrdmulh 
v26.4S, v4.4S, v18.s[2] +mul v4.4S, v4.4S,v3.s[2] +mla v4.4S, v26.4S, v31.s[0] +sub v26.4s, v28.4s, v4.4s +add v28.4s, v28.4s, v4.4s +trn1 v4.4S, v27.4S, v25.4S +trn2 v29.4S, v27.4S, v25.4S +trn1 v24.4S, v28.4S, v26.4S +trn2 v30.4S, v28.4S, v26.4S +trn2 v28.2D, v4.2D, v24.2D +trn2 v26.2D, v29.2D, v30.2D +trn1 v27.2D, v4.2D, v24.2D +trn1 v25.2D, v29.2D, v30.2D +sqrdmulh v30.4S, v28.4S, v22.4S +mul v28.4S, v28.4S,v15.4S +mla v28.4S, v30.4S, v31.s[0] +sub v30.4s, v27.4s, v28.4s +add v27.4s, v27.4s, v28.4s +sqrdmulh v28.4S, v26.4S, v22.4S +mul v26.4S, v26.4S,v15.4S +mla v26.4S, v28.4S, v31.s[0] +sub v28.4s, v25.4s, v26.4s +add v25.4s, v25.4s, v26.4s +sqrdmulh v26.4S, v25.4S, v21.4S +mul v25.4S, v25.4S,v10.4S +mla v25.4S, v26.4S, v31.s[0] +sub v26.4s, v27.4s, v25.4s +add v27.4s, v27.4s, v25.4s +sqrdmulh v25.4S, v28.4S, v12.4S +mul v28.4S, v28.4S,v1.4S +mla v28.4S, v25.4S, v31.s[0] +sub v25.4s, v30.4s, v28.4s +add v30.4s, v30.4s, v28.4s +str q27, [x0, #896] +str q26, [x0, #912] +str q30, [x0, #928] +str q25, [x0, #944] +ldr q25, [x17, #+2048] +ldr q30, [x17, #+2064] +ldr q26, [x17, #+2080] +ldr q27, [x17, #+2096] +ldr q28, [x17, #+2112] +ldr q29, [x17, #+2128] +ldr q24, [x17, #+2144] +ldr q4, [x17, #+2160] +ldr q12, [x0, #992] +ldr q1, [x0, #1008] +ldr q21, [x0, #960] +ldr q10, [x0, #976] +sqrdmulh v22.4S, v12.4S, v30.s[0] +mul v12.4S, v12.4S,v25.s[0] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v1.4S, v30.s[0] +mul v1.4S, v1.4S,v25.s[0] +mla v1.4S, v12.4S, v31.s[0] +sub v12.4s, v10.4s, v1.4s +add v10.4s, v10.4s, v1.4s +sqrdmulh v1.4S, v10.4S, v30.s[1] +mul v10.4S, v10.4S,v25.s[1] +mla v10.4S, v1.4S, v31.s[0] +sub v1.4s, v21.4s, v10.4s +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v12.4S, v30.s[2] +mul v12.4S, v12.4S,v25.s[2] +mla v12.4S, v10.4S, v31.s[0] +sub v10.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +trn1 v12.4S, v21.4S, v1.4S +trn2 v15.4S, v21.4S, v1.4S +trn1 v18.4S, v22.4S, v10.4S +trn2 v3.4S, 
v22.4S, v10.4S +trn2 v22.2D, v12.2D, v18.2D +trn2 v10.2D, v15.2D, v3.2D +trn1 v21.2D, v12.2D, v18.2D +trn1 v1.2D, v15.2D, v3.2D +sqrdmulh v3.4S, v22.4S, v27.4S +mul v22.4S, v22.4S,v26.4S +mla v22.4S, v3.4S, v31.s[0] +sub v3.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v10.4S, v27.4S +mul v10.4S, v10.4S,v26.4S +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v10.4s +add v1.4s, v1.4s, v10.4s +sqrdmulh v10.4S, v1.4S, v29.4S +mul v1.4S, v1.4S,v28.4S +mla v1.4S, v10.4S, v31.s[0] +sub v10.4s, v21.4s, v1.4s +add v21.4s, v21.4s, v1.4s +sqrdmulh v1.4S, v22.4S, v4.4S +mul v22.4S, v22.4S,v24.4S +mla v22.4S, v1.4S, v31.s[0] +sub v1.4s, v3.4s, v22.4s +add v3.4s, v3.4s, v22.4s +str q21, [x0, #960] +str q10, [x0, #976] +str q3, [x0, #992] +str q1, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_8_0.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_8_0.s new file mode 100644 index 0000000..761b80d --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_8_0.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// 
furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 // Stored negated: -q for q = 33556993. NOTE(review): presumably the v31.s[0] operand of the mla reduction steps; confirm against the routine's prologue +.word 0 // Zero padding: with the two zero words that follow, the constant occupies 4 words (16 bytes) +.word 0 +.word 0 +.align 6 // Align the table that follows to 2^6 = 64 bytes +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // 
Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 
1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 
+.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 
19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, 
block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 
+.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 
// Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 +.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 
+.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 // Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 
28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 
// Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_8_0 +.global _ntt_u32_full_neon_asm_var_4_4_8_0 +ntt_u32_full_neon_asm_var_4_4_8_0: +_ntt_u32_full_neon_asm_var_4_4_8_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, 
#+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #928] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #992] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q18, [x0, #800] +sqrdmulh v17.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +ldr q16, [x0, #864] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v22.4S, v21.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +mla v18.4S, v17.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +ldr q3, [x0, #544] +sqrdmulh v17.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q19, [x0, #608] +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q2, [x0, #672] +ldr q1, [x0, #416] +sqrdmulh v0.4S, v2.4S, v29.s[0] +sub v15.4s, v1.4s, v22.4s +mul v2.4S, v2.4S,v30.s[0] +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #736] +ldr q14, [x0, #480] +sqrdmulh v13.4S, v22.4S, v29.s[0] +sub v12.4s, v14.4s, v20.4s +mul v22.4S, v22.4S,v30.s[0] +add v14.4s, v14.4s, v20.4s +ldr q20, [x0, #288] +mla v3.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v18.4s +mla v19.4S, v21.4S, v31.s[0] +mla v2.4S, v0.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +mla v22.4S, v13.4S, v31.s[0] +ldr q13, [x0, #352] +sqrdmulh v18.4S, v1.4S, v29.s[1] +sub v0.4s, v13.4s, v16.4s +mul v1.4S, v1.4S,v30.s[1] +sqrdmulh v21.4S, v14.4S, v29.s[1] +add v13.4s, v13.4s, v16.4s +mul v14.4S, v14.4S,v30.s[1] +ldr q16, [x0, #32] +sqrdmulh v11.4S, v20.4S, v29.s[1] +sub v10.4s, v16.4s, v3.4s +mul v20.4S, v20.4S,v30.s[1] +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #96] +sqrdmulh v9.4S, v13.4S, v29.s[1] +sub v8.4s, v3.4s, v19.4s +mul v13.4S, v13.4S,v30.s[1] +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #160] +mla v1.4S, v18.4S, v31.s[0] +sub v18.4s, v19.4s, v2.4s +mla v14.4S, v21.4S, v31.s[0] +mla v20.4S, v11.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +mla v13.4S, v9.4S, v31.s[0] +ldr q9, [x0, #224] +sqrdmulh v2.4S, v15.4S, v29.s[2] +sub v11.4s, v9.4s, v22.4s +mul v15.4S, v15.4S,v30.s[2] +sqrdmulh v21.4S, v12.4S, v29.s[2] +add v9.4s, v9.4s, v22.4s 
+mul v12.4S, v12.4S,v30.s[2] +sqrdmulh v22.4S, v17.4S, v29.s[2] +sub v7.4s, v19.4s, v1.4s +mul v17.4S, v17.4S,v30.s[2] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[2] +sub v6.4s, v9.4s, v14.4s +mul v0.4S, v0.4S,v30.s[2] +add v9.4s, v9.4s, v14.4s +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v16.4s, v20.4s +mla v12.4S, v21.4S, v31.s[0] +mla v17.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +mla v0.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v7.4S, v27.s[1] +sub v20.4s, v3.4s, v13.4s +mul v7.4S, v7.4S,v28.s[1] +sqrdmulh v22.4S, v6.4S, v27.s[1] +add v3.4s, v3.4s, v13.4s +mul v6.4S, v6.4S,v28.s[1] +sqrdmulh v13.4S, v19.4S, v27.s[0] +sub v21.4s, v18.4s, v15.4s +mul v19.4S, v19.4S,v28.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v9.4S, v27.s[0] +sub v14.4s, v11.4s, v12.4s +mul v9.4S, v9.4S,v28.s[0] +add v11.4s, v11.4s, v12.4s +mla v7.4S, v1.4S, v31.s[0] +sub v1.4s, v10.4s, v17.4s +mla v6.4S, v22.4S, v31.s[0] +mla v19.4S, v13.4S, v31.s[0] +add v10.4s, v10.4s, v17.4s +mla v9.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v18.4S, v27.s[2] +sub v17.4s, v8.4s, v0.4s +mul v18.4S, v18.4S,v28.s[2] +sqrdmulh v13.4S, v11.4S, v27.s[2] +add v8.4s, v8.4s, v0.4s +mul v11.4S, v11.4S,v28.s[2] +sqrdmulh v0.4S, v21.4S, v27.s[3] +sub v22.4s, v2.4s, v7.4s +mul v21.4S, v21.4S,v28.s[3] +add v2.4s, v2.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[3] +sub v12.4s, v20.4s, v6.4s +mul v14.4S, v14.4S,v28.s[3] +add v20.4s, v20.4s, v6.4s +mla v18.4S, v15.4S, v31.s[0] +sub v15.4s, v16.4s, v19.4s +mla v11.4S, v13.4S, v31.s[0] +mla v21.4S, v0.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +mla v14.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v20.4S, v25.s[2] +sub v19.4s, v3.4s, v9.4s +mul v20.4S, v20.4S,v26.s[2] +sqrdmulh v0.4S, v12.4S, v25.s[3] +add v3.4s, v3.4s, v9.4s +mul v12.4S, v12.4S,v26.s[3] +sqrdmulh v9.4S, v19.4S, v25.s[1] +sub v13.4s, v10.4s, v18.4s +mul v19.4S, v19.4S,v26.s[1] +add v10.4s, v10.4s, v18.4s +sqrdmulh v18.4S, v3.4S, v25.s[0] +sub v6.4s, v8.4s, v11.4s +mul v3.4S, v3.4S,v26.s[0] +add v8.4s, v8.4s, 
v11.4s +mla v20.4S, v7.4S, v31.s[0] +sub v7.4s, v1.4s, v21.4s +mla v12.4S, v0.4S, v31.s[0] +mla v19.4S, v9.4S, v31.s[0] +add v1.4s, v1.4s, v21.4s +mla v3.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v8.4S, v23.s[0] +sub v21.4s, v17.4s, v14.4s +mul v8.4S, v8.4S,v24.s[0] +sqrdmulh v9.4S, v6.4S, v23.s[1] +add v17.4s, v17.4s, v14.4s +mul v6.4S, v6.4S,v24.s[1] +sqrdmulh v14.4S, v17.4S, v23.s[2] +sub v0.4s, v2.4s, v20.4s +mul v17.4S, v17.4S,v24.s[2] +add v2.4s, v2.4s, v20.4s +sqrdmulh v20.4S, v21.4S, v23.s[3] +sub v11.4s, v22.4s, v12.4s +mul v21.4S, v21.4S,v24.s[3] +add v22.4s, v22.4s, v12.4s +mla v8.4S, v18.4S, v31.s[0] +sub v18.4s, v15.4s, v19.4s +mla v6.4S, v9.4S, v31.s[0] +str q2, [x0, #288] +mla v17.4S, v14.4S, v31.s[0] +add v15.4s, v15.4s, v19.4s +mla v21.4S, v20.4S, v31.s[0] +str q0, [x0, #352] +ldr q0, [x0, #944] +sqrdmulh v20.4S, v0.4S, v29.s[0] +sub v19.4s, v16.4s, v3.4s +mul v0.4S, v0.4S,v30.s[0] +str q22, [x0, #416] +ldr q22, [x0, #1008] +sqrdmulh v14.4S, v22.4S, v29.s[0] +add v16.4s, v16.4s, v3.4s +mul v22.4S, v22.4S,v30.s[0] +str q11, [x0, #480] +ldr q11, [x0, #816] +sqrdmulh v3.4S, v11.4S, v29.s[0] +sub v2.4s, v10.4s, v8.4s +mul v11.4S, v11.4S,v30.s[0] +add v10.4s, v10.4s, v8.4s +ldr q8, [x0, #880] +sqrdmulh v9.4S, v8.4S, v29.s[0] +sub v12.4s, v13.4s, v6.4s +mul v8.4S, v8.4S,v30.s[0] +add v13.4s, v13.4s, v6.4s +mla v0.4S, v20.4S, v31.s[0] +sub v20.4s, v1.4s, v17.4s +mla v22.4S, v14.4S, v31.s[0] +str q15, [x0, #160] +mla v11.4S, v3.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +mla v8.4S, v9.4S, v31.s[0] +str q18, [x0, #224] +ldr q18, [x0, #560] +sqrdmulh v9.4S, v18.4S, v29.s[0] +sub v17.4s, v7.4s, v21.4s +mul v18.4S, v18.4S,v30.s[0] +str q16, [x0, #32] +ldr q16, [x0, #624] +sqrdmulh v3.4S, v16.4S, v29.s[0] +add v7.4s, v7.4s, v21.4s +mul v16.4S, v16.4S,v30.s[0] +str q19, [x0, #96] +ldr q19, [x0, #688] +ldr q21, [x0, #432] +sqrdmulh v15.4S, v19.4S, v29.s[0] +sub v14.4s, v21.4s, v0.4s +mul v19.4S, v19.4S,v30.s[0] +add v21.4s, v21.4s, v0.4s +ldr q0, [x0, #752] +ldr q6, 
[x0, #496] +sqrdmulh v5.4S, v0.4S, v29.s[0] +sub v4.4s, v6.4s, v22.4s +mul v0.4S, v0.4S,v30.s[0] +add v6.4s, v6.4s, v22.4s +ldr q22, [x0, #304] +mla v18.4S, v9.4S, v31.s[0] +sub v9.4s, v22.4s, v11.4s +mla v16.4S, v3.4S, v31.s[0] +str q10, [x0, #544] +mla v19.4S, v15.4S, v31.s[0] +add v22.4s, v22.4s, v11.4s +mla v0.4S, v5.4S, v31.s[0] +str q2, [x0, #608] +ldr q2, [x0, #368] +sqrdmulh v5.4S, v21.4S, v29.s[1] +sub v11.4s, v2.4s, v8.4s +mul v21.4S, v21.4S,v30.s[1] +str q13, [x0, #672] +sqrdmulh v13.4S, v6.4S, v29.s[1] +add v2.4s, v2.4s, v8.4s +mul v6.4S, v6.4S,v30.s[1] +str q12, [x0, #736] +ldr q12, [x0, #48] +sqrdmulh v8.4S, v22.4S, v29.s[1] +sub v15.4s, v12.4s, v18.4s +mul v22.4S, v22.4S,v30.s[1] +add v12.4s, v12.4s, v18.4s +ldr q18, [x0, #112] +sqrdmulh v10.4S, v2.4S, v29.s[1] +sub v3.4s, v18.4s, v16.4s +mul v2.4S, v2.4S,v30.s[1] +add v18.4s, v18.4s, v16.4s +ldr q16, [x0, #176] +mla v21.4S, v5.4S, v31.s[0] +sub v5.4s, v16.4s, v19.4s +mla v6.4S, v13.4S, v31.s[0] +str q1, [x0, #800] +mla v22.4S, v8.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +mla v2.4S, v10.4S, v31.s[0] +str q20, [x0, #864] +ldr q20, [x0, #240] +sqrdmulh v10.4S, v14.4S, v29.s[2] +sub v19.4s, v20.4s, v0.4s +mul v14.4S, v14.4S,v30.s[2] +str q7, [x0, #928] +sqrdmulh v7.4S, v4.4S, v29.s[2] +add v20.4s, v20.4s, v0.4s +mul v4.4S, v4.4S,v30.s[2] +str q17, [x0, #992] +sqrdmulh v17.4S, v9.4S, v29.s[2] +sub v0.4s, v16.4s, v21.4s +mul v9.4S, v9.4S,v30.s[2] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v11.4S, v29.s[2] +sub v8.4s, v20.4s, v6.4s +mul v11.4S, v11.4S,v30.s[2] +add v20.4s, v20.4s, v6.4s +mla v14.4S, v10.4S, v31.s[0] +sub v10.4s, v12.4s, v22.4s +mla v4.4S, v7.4S, v31.s[0] +mla v9.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s +mla v11.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v0.4S, v27.s[1] +sub v22.4s, v18.4s, v2.4s +mul v0.4S, v0.4S,v28.s[1] +sqrdmulh v17.4S, v8.4S, v27.s[1] +add v18.4s, v18.4s, v2.4s +mul v8.4S, v8.4S,v28.s[1] +sqrdmulh v2.4S, v16.4S, v27.s[0] +sub v7.4s, v5.4s, v14.4s +mul v16.4S, 
v16.4S,v28.s[0] +add v5.4s, v5.4s, v14.4s +sqrdmulh v14.4S, v20.4S, v27.s[0] +sub v6.4s, v19.4s, v4.4s +mul v20.4S, v20.4S,v28.s[0] +add v19.4s, v19.4s, v4.4s +mla v0.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v9.4s +mla v8.4S, v17.4S, v31.s[0] +mla v16.4S, v2.4S, v31.s[0] +add v15.4s, v15.4s, v9.4s +mla v20.4S, v14.4S, v31.s[0] +sqrdmulh v14.4S, v5.4S, v27.s[2] +sub v9.4s, v3.4s, v11.4s +mul v5.4S, v5.4S,v28.s[2] +sqrdmulh v2.4S, v19.4S, v27.s[2] +add v3.4s, v3.4s, v11.4s +mul v19.4S, v19.4S,v28.s[2] +sqrdmulh v11.4S, v7.4S, v27.s[3] +sub v17.4s, v10.4s, v0.4s +mul v7.4S, v7.4S,v28.s[3] +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v6.4S, v27.s[3] +sub v4.4s, v22.4s, v8.4s +mul v6.4S, v6.4S,v28.s[3] +add v22.4s, v22.4s, v8.4s +mla v5.4S, v14.4S, v31.s[0] +sub v14.4s, v12.4s, v16.4s +mla v19.4S, v2.4S, v31.s[0] +mla v7.4S, v11.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +mla v6.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v22.4S, v25.s[2] +sub v16.4s, v18.4s, v20.4s +mul v22.4S, v22.4S,v26.s[2] +sqrdmulh v11.4S, v4.4S, v25.s[3] +add v18.4s, v18.4s, v20.4s +mul v4.4S, v4.4S,v26.s[3] +sqrdmulh v20.4S, v16.4S, v25.s[1] +sub v2.4s, v15.4s, v5.4s +mul v16.4S, v16.4S,v26.s[1] +add v15.4s, v15.4s, v5.4s +sqrdmulh v5.4S, v18.4S, v25.s[0] +sub v8.4s, v3.4s, v19.4s +mul v18.4S, v18.4S,v26.s[0] +add v3.4s, v3.4s, v19.4s +mla v22.4S, v0.4S, v31.s[0] +sub v0.4s, v21.4s, v7.4s +mla v4.4S, v11.4S, v31.s[0] +mla v16.4S, v20.4S, v31.s[0] +add v21.4s, v21.4s, v7.4s +mla v18.4S, v5.4S, v31.s[0] +sqrdmulh v5.4S, v3.4S, v23.s[0] +sub v7.4s, v9.4s, v6.4s +mul v3.4S, v3.4S,v24.s[0] +sqrdmulh v20.4S, v8.4S, v23.s[1] +add v9.4s, v9.4s, v6.4s +mul v8.4S, v8.4S,v24.s[1] +sqrdmulh v6.4S, v9.4S, v23.s[2] +sub v11.4s, v10.4s, v22.4s +mul v9.4S, v9.4S,v24.s[2] +add v10.4s, v10.4s, v22.4s +sqrdmulh v22.4S, v7.4S, v23.s[3] +sub v19.4s, v17.4s, v4.4s +mul v7.4S, v7.4S,v24.s[3] +add v17.4s, v17.4s, v4.4s +mla v3.4S, v5.4S, v31.s[0] +sub v5.4s, v14.4s, v16.4s +mla v8.4S, v20.4S, v31.s[0] +str q10, [x0, #304] +mla 
v9.4S, v6.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +mla v7.4S, v22.4S, v31.s[0] +str q11, [x0, #368] +ldr q11, [x0, #896] +sqrdmulh v22.4S, v11.4S, v29.s[0] +sub v16.4s, v12.4s, v18.4s +mul v11.4S, v11.4S,v30.s[0] +str q17, [x0, #432] +ldr q17, [x0, #960] +sqrdmulh v6.4S, v17.4S, v29.s[0] +add v12.4s, v12.4s, v18.4s +mul v17.4S, v17.4S,v30.s[0] +str q19, [x0, #496] +ldr q19, [x0, #768] +sqrdmulh v18.4S, v19.4S, v29.s[0] +sub v10.4s, v15.4s, v3.4s +mul v19.4S, v19.4S,v30.s[0] +add v15.4s, v15.4s, v3.4s +ldr q3, [x0, #832] +sqrdmulh v20.4S, v3.4S, v29.s[0] +sub v4.4s, v2.4s, v8.4s +mul v3.4S, v3.4S,v30.s[0] +add v2.4s, v2.4s, v8.4s +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v9.4s +mla v17.4S, v6.4S, v31.s[0] +str q14, [x0, #176] +mla v19.4S, v18.4S, v31.s[0] +add v21.4s, v21.4s, v9.4s +mla v3.4S, v20.4S, v31.s[0] +str q5, [x0, #240] +ldr q5, [x0, #512] +sqrdmulh v20.4S, v5.4S, v29.s[0] +sub v9.4s, v0.4s, v7.4s +mul v5.4S, v5.4S,v30.s[0] +str q12, [x0, #48] +ldr q12, [x0, #576] +sqrdmulh v18.4S, v12.4S, v29.s[0] +add v0.4s, v0.4s, v7.4s +mul v12.4S, v12.4S,v30.s[0] +str q16, [x0, #112] +ldr q16, [x0, #640] +ldr q7, [x0, #384] +sqrdmulh v14.4S, v16.4S, v29.s[0] +sub v6.4s, v7.4s, v11.4s +mul v16.4S, v16.4S,v30.s[0] +add v7.4s, v7.4s, v11.4s +ldr q11, [x0, #704] +ldr q8, [x0, #448] +sqrdmulh v1.4S, v11.4S, v29.s[0] +sub v13.4s, v8.4s, v17.4s +mul v11.4S, v11.4S,v30.s[0] +add v8.4s, v8.4s, v17.4s +ldr q17, [x0, #256] +mla v5.4S, v20.4S, v31.s[0] +sub v20.4s, v17.4s, v19.4s +mla v12.4S, v18.4S, v31.s[0] +str q15, [x0, #560] +mla v16.4S, v14.4S, v31.s[0] +add v17.4s, v17.4s, v19.4s +mla v11.4S, v1.4S, v31.s[0] +str q10, [x0, #624] +ldr q10, [x0, #320] +sqrdmulh v1.4S, v7.4S, v29.s[1] +sub v19.4s, v10.4s, v3.4s +mul v7.4S, v7.4S,v30.s[1] +str q2, [x0, #688] +sqrdmulh v2.4S, v8.4S, v29.s[1] +add v10.4s, v10.4s, v3.4s +mul v8.4S, v8.4S,v30.s[1] +str q4, [x0, #752] +ldr q4, [x0, #0] +sqrdmulh v3.4S, v17.4S, v29.s[1] +sub v14.4s, v4.4s, v5.4s +mul v17.4S, 
v17.4S,v30.s[1] +add v4.4s, v4.4s, v5.4s +ldr q5, [x0, #64] +sqrdmulh v15.4S, v10.4S, v29.s[1] +sub v18.4s, v5.4s, v12.4s +mul v10.4S, v10.4S,v30.s[1] +add v5.4s, v5.4s, v12.4s +ldr q12, [x0, #128] +mla v7.4S, v1.4S, v31.s[0] +sub v1.4s, v12.4s, v16.4s +mla v8.4S, v2.4S, v31.s[0] +str q21, [x0, #816] +mla v17.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +mla v10.4S, v15.4S, v31.s[0] +str q22, [x0, #880] +ldr q22, [x0, #192] +sqrdmulh v15.4S, v6.4S, v29.s[2] +sub v16.4s, v22.4s, v11.4s +mul v6.4S, v6.4S,v30.s[2] +str q0, [x0, #944] +sqrdmulh v0.4S, v13.4S, v29.s[2] +add v22.4s, v22.4s, v11.4s +mul v13.4S, v13.4S,v30.s[2] +str q9, [x0, #1008] +sqrdmulh v9.4S, v20.4S, v29.s[2] +sub v11.4s, v12.4s, v7.4s +mul v20.4S, v20.4S,v30.s[2] +add v12.4s, v12.4s, v7.4s +sqrdmulh v7.4S, v19.4S, v29.s[2] +sub v3.4s, v22.4s, v8.4s +mul v19.4S, v19.4S,v30.s[2] +add v22.4s, v22.4s, v8.4s +mla v6.4S, v15.4S, v31.s[0] +sub v15.4s, v4.4s, v17.4s +mla v13.4S, v0.4S, v31.s[0] +mla v20.4S, v9.4S, v31.s[0] +add v4.4s, v4.4s, v17.4s +mla v19.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v11.4S, v27.s[1] +sub v17.4s, v5.4s, v10.4s +mul v11.4S, v11.4S,v28.s[1] +sqrdmulh v9.4S, v3.4S, v27.s[1] +add v5.4s, v5.4s, v10.4s +mul v3.4S, v3.4S,v28.s[1] +sqrdmulh v10.4S, v12.4S, v27.s[0] +sub v0.4s, v1.4s, v6.4s +mul v12.4S, v12.4S,v28.s[0] +add v1.4s, v1.4s, v6.4s +sqrdmulh v6.4S, v22.4S, v27.s[0] +sub v8.4s, v16.4s, v13.4s +mul v22.4S, v22.4S,v28.s[0] +add v16.4s, v16.4s, v13.4s +mla v11.4S, v7.4S, v31.s[0] +sub v7.4s, v14.4s, v20.4s +mla v3.4S, v9.4S, v31.s[0] +mla v12.4S, v10.4S, v31.s[0] +add v14.4s, v14.4s, v20.4s +mla v22.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v1.4S, v27.s[2] +sub v20.4s, v18.4s, v19.4s +mul v1.4S, v1.4S,v28.s[2] +sqrdmulh v10.4S, v16.4S, v27.s[2] +add v18.4s, v18.4s, v19.4s +mul v16.4S, v16.4S,v28.s[2] +sqrdmulh v19.4S, v0.4S, v27.s[3] +sub v9.4s, v15.4s, v11.4s +mul v0.4S, v0.4S,v28.s[3] +add v15.4s, v15.4s, v11.4s +sqrdmulh v11.4S, v8.4S, v27.s[3] +sub v13.4s, v17.4s, v3.4s +mul 
v8.4S, v8.4S,v28.s[3] +add v17.4s, v17.4s, v3.4s +mla v1.4S, v6.4S, v31.s[0] +sub v6.4s, v4.4s, v12.4s +mla v16.4S, v10.4S, v31.s[0] +mla v0.4S, v19.4S, v31.s[0] +add v4.4s, v4.4s, v12.4s +mla v8.4S, v11.4S, v31.s[0] +sqrdmulh v11.4S, v17.4S, v25.s[2] +sub v12.4s, v5.4s, v22.4s +mul v17.4S, v17.4S,v26.s[2] +sqrdmulh v19.4S, v13.4S, v25.s[3] +add v5.4s, v5.4s, v22.4s +mul v13.4S, v13.4S,v26.s[3] +sqrdmulh v22.4S, v12.4S, v25.s[1] +sub v10.4s, v14.4s, v1.4s +mul v12.4S, v12.4S,v26.s[1] +add v14.4s, v14.4s, v1.4s +sqrdmulh v1.4S, v5.4S, v25.s[0] +sub v3.4s, v18.4s, v16.4s +mul v5.4S, v5.4S,v26.s[0] +add v18.4s, v18.4s, v16.4s +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v7.4s, v0.4s +mla v13.4S, v19.4S, v31.s[0] +mla v12.4S, v22.4S, v31.s[0] +add v7.4s, v7.4s, v0.4s +mla v5.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v18.4S, v23.s[0] +sub v0.4s, v20.4s, v8.4s +mul v18.4S, v18.4S,v24.s[0] +sqrdmulh v22.4S, v3.4S, v23.s[1] +add v20.4s, v20.4s, v8.4s +mul v3.4S, v3.4S,v24.s[1] +sqrdmulh v8.4S, v20.4S, v23.s[2] +sub v19.4s, v15.4s, v17.4s +mul v20.4S, v20.4S,v24.s[2] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v23.s[3] +sub v16.4s, v9.4s, v13.4s +mul v0.4S, v0.4S,v24.s[3] +add v9.4s, v9.4s, v13.4s +mla v18.4S, v1.4S, v31.s[0] +sub v1.4s, v6.4s, v12.4s +mla v3.4S, v22.4S, v31.s[0] +str q15, [x0, #256] +mla v20.4S, v8.4S, v31.s[0] +add v6.4s, v6.4s, v12.4s +mla v0.4S, v17.4S, v31.s[0] +str q19, [x0, #320] +ldr q19, [x0, #912] +sqrdmulh v17.4S, v19.4S, v29.s[0] +sub v12.4s, v4.4s, v5.4s +mul v19.4S, v19.4S,v30.s[0] +str q9, [x0, #384] +ldr q9, [x0, #976] +sqrdmulh v8.4S, v9.4S, v29.s[0] +add v4.4s, v4.4s, v5.4s +mul v9.4S, v9.4S,v30.s[0] +str q16, [x0, #448] +ldr q16, [x0, #784] +sqrdmulh v5.4S, v16.4S, v29.s[0] +sub v15.4s, v14.4s, v18.4s +mul v16.4S, v16.4S,v30.s[0] +add v14.4s, v14.4s, v18.4s +ldr q18, [x0, #848] +sqrdmulh v22.4S, v18.4S, v29.s[0] +sub v13.4s, v10.4s, v3.4s +mul v18.4S, v18.4S,v30.s[0] +add v10.4s, v10.4s, v3.4s +mla v19.4S, v17.4S, v31.s[0] +sub v17.4s, 
v7.4s, v20.4s +mla v9.4S, v8.4S, v31.s[0] +str q6, [x0, #128] +mla v16.4S, v5.4S, v31.s[0] +add v7.4s, v7.4s, v20.4s +mla v18.4S, v22.4S, v31.s[0] +str q1, [x0, #192] +ldr q1, [x0, #528] +sqrdmulh v22.4S, v1.4S, v29.s[0] +sub v20.4s, v11.4s, v0.4s +mul v1.4S, v1.4S,v30.s[0] +str q4, [x0, #0] +ldr q4, [x0, #592] +sqrdmulh v5.4S, v4.4S, v29.s[0] +add v11.4s, v11.4s, v0.4s +mul v4.4S, v4.4S,v30.s[0] +str q12, [x0, #64] +ldr q12, [x0, #656] +ldr q0, [x0, #400] +sqrdmulh v6.4S, v12.4S, v29.s[0] +sub v8.4s, v0.4s, v19.4s +mul v12.4S, v12.4S,v30.s[0] +add v0.4s, v0.4s, v19.4s +ldr q19, [x0, #720] +ldr q3, [x0, #464] +sqrdmulh v21.4S, v19.4S, v29.s[0] +sub v2.4s, v3.4s, v9.4s +mul v19.4S, v19.4S,v30.s[0] +add v3.4s, v3.4s, v9.4s +ldr q9, [x0, #272] +mla v1.4S, v22.4S, v31.s[0] +sub v22.4s, v9.4s, v16.4s +mla v4.4S, v5.4S, v31.s[0] +str q14, [x0, #512] +mla v12.4S, v6.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +mla v19.4S, v21.4S, v31.s[0] +str q15, [x0, #576] +ldr q15, [x0, #336] +sqrdmulh v21.4S, v0.4S, v29.s[1] +sub v16.4s, v15.4s, v18.4s +mul v0.4S, v0.4S,v30.s[1] +str q10, [x0, #640] +sqrdmulh v10.4S, v3.4S, v29.s[1] +add v15.4s, v15.4s, v18.4s +mul v3.4S, v3.4S,v30.s[1] +str q13, [x0, #704] +ldr q13, [x0, #16] +sqrdmulh v18.4S, v9.4S, v29.s[1] +sub v6.4s, v13.4s, v1.4s +mul v9.4S, v9.4S,v30.s[1] +add v13.4s, v13.4s, v1.4s +ldr q1, [x0, #80] +sqrdmulh v14.4S, v15.4S, v29.s[1] +sub v5.4s, v1.4s, v4.4s +mul v15.4S, v15.4S,v30.s[1] +add v1.4s, v1.4s, v4.4s +ldr q4, [x0, #144] +mla v0.4S, v21.4S, v31.s[0] +sub v21.4s, v4.4s, v12.4s +mla v3.4S, v10.4S, v31.s[0] +str q7, [x0, #768] +mla v9.4S, v18.4S, v31.s[0] +add v4.4s, v4.4s, v12.4s +mla v15.4S, v14.4S, v31.s[0] +str q17, [x0, #832] +ldr q17, [x0, #208] +sqrdmulh v14.4S, v8.4S, v29.s[2] +sub v12.4s, v17.4s, v19.4s +mul v8.4S, v8.4S,v30.s[2] +str q11, [x0, #896] +sqrdmulh v11.4S, v2.4S, v29.s[2] +add v17.4s, v17.4s, v19.4s +mul v2.4S, v2.4S,v30.s[2] +str q20, [x0, #960] +sqrdmulh v20.4S, v22.4S, v29.s[2] +sub v19.4s, v4.4s, 
v0.4s +mul v22.4S, v22.4S,v30.s[2] +add v4.4s, v4.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[2] +sub v18.4s, v17.4s, v3.4s +mul v16.4S, v16.4S,v30.s[2] +add v17.4s, v17.4s, v3.4s +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v9.4s +mla v2.4S, v11.4S, v31.s[0] +mla v22.4S, v20.4S, v31.s[0] +add v13.4s, v13.4s, v9.4s +mla v16.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v19.4S, v27.s[1] +sub v9.4s, v1.4s, v15.4s +mul v19.4S, v19.4S,v28.s[1] +sqrdmulh v20.4S, v18.4S, v27.s[1] +add v1.4s, v1.4s, v15.4s +mul v18.4S, v18.4S,v28.s[1] +sqrdmulh v15.4S, v4.4S, v27.s[0] +sub v11.4s, v21.4s, v8.4s +mul v4.4S, v4.4S,v28.s[0] +add v21.4s, v21.4s, v8.4s +sqrdmulh v8.4S, v17.4S, v27.s[0] +sub v3.4s, v12.4s, v2.4s +mul v17.4S, v17.4S,v28.s[0] +add v12.4s, v12.4s, v2.4s +mla v19.4S, v0.4S, v31.s[0] +sub v0.4s, v6.4s, v22.4s +mla v18.4S, v20.4S, v31.s[0] +mla v4.4S, v15.4S, v31.s[0] +add v6.4s, v6.4s, v22.4s +mla v17.4S, v8.4S, v31.s[0] +sqrdmulh v8.4S, v21.4S, v27.s[2] +sub v22.4s, v5.4s, v16.4s +mul v21.4S, v21.4S,v28.s[2] +sqrdmulh v15.4S, v12.4S, v27.s[2] +add v5.4s, v5.4s, v16.4s +mul v12.4S, v12.4S,v28.s[2] +sqrdmulh v16.4S, v11.4S, v27.s[3] +sub v20.4s, v14.4s, v19.4s +mul v11.4S, v11.4S,v28.s[3] +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v3.4S, v27.s[3] +sub v2.4s, v9.4s, v18.4s +mul v3.4S, v3.4S,v28.s[3] +add v9.4s, v9.4s, v18.4s +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v13.4s, v4.4s +mla v12.4S, v15.4S, v31.s[0] +mla v11.4S, v16.4S, v31.s[0] +add v13.4s, v13.4s, v4.4s +mla v3.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v9.4S, v25.s[2] +sub v4.4s, v1.4s, v17.4s +mul v9.4S, v9.4S,v26.s[2] +sqrdmulh v16.4S, v2.4S, v25.s[3] +add v1.4s, v1.4s, v17.4s +mul v2.4S, v2.4S,v26.s[3] +sqrdmulh v17.4S, v4.4S, v25.s[1] +sub v15.4s, v6.4s, v21.4s +mul v4.4S, v4.4S,v26.s[1] +add v6.4s, v6.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v25.s[0] +sub v18.4s, v5.4s, v12.4s +mul v1.4S, v1.4S,v26.s[0] +add v5.4s, v5.4s, v12.4s +mla v9.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v11.4s +mla v2.4S, v16.4S, v31.s[0] 
+mla v4.4S, v17.4S, v31.s[0] +add v0.4s, v0.4s, v11.4s +mla v1.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v5.4S, v23.s[0] +sub v11.4s, v22.4s, v3.4s +mul v5.4S, v5.4S,v24.s[0] +sqrdmulh v17.4S, v18.4S, v23.s[1] +add v22.4s, v22.4s, v3.4s +mul v18.4S, v18.4S,v24.s[1] +sqrdmulh v3.4S, v22.4S, v23.s[2] +sub v16.4s, v14.4s, v9.4s +mul v22.4S, v22.4S,v24.s[2] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v23.s[3] +sub v12.4s, v20.4s, v2.4s +mul v11.4S, v11.4S,v24.s[3] +add v20.4s, v20.4s, v2.4s +mla v5.4S, v21.4S, v31.s[0] +sub v21.4s, v8.4s, v4.4s +mla v18.4S, v17.4S, v31.s[0] +str q14, [x0, #272] +mla v22.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v4.4s +mla v11.4S, v9.4S, v31.s[0] +str q16, [x0, #336] +sub v23.4s, v13.4s, v1.4s +str q20, [x0, #400] +add v13.4s, v13.4s, v1.4s +str q12, [x0, #464] +sub v12.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sub v5.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sub v18.4s, v0.4s, v22.4s +str q8, [x0, #144] +add v0.4s, v0.4s, v22.4s +str q21, [x0, #208] +sub v21.4s, v19.4s, v11.4s +str q13, [x0, #16] +add v19.4s, v19.4s, v11.4s +str q23, [x0, #80] +str q6, [x0, #528] +str q12, [x0, #592] +str q15, [x0, #656] +str q5, [x0, #720] +str q0, [x0, #784] +str q18, [x0, #848] +str q19, [x0, #912] +str q21, [x0, #976] +ldr q10, [x17, #+128] +ldr q7, [x17, #+144] +ldr q2, [x17, #+160] +ldr q17, [x17, #+176] +ldr q14, [x17, #+192] +ldr q3, [x17, #+208] +ldr q4, [x17, #+224] +ldr q9, [x17, #+240] +ldr q16, [x0, #32] +ldr q30, [x0, #48] +ldr q29, [x0, #0] +ldr q28, [x0, #16] +sqrdmulh v27.4S, v16.4S, v7.s[0] +mul v16.4S, v16.4S,v10.s[0] +mla v16.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v16.4s +add v29.4s, v29.4s, v16.4s +sqrdmulh v16.4S, v30.4S, v7.s[0] +mul v30.4S, v30.4S,v10.s[0] +mla v30.4S, v16.4S, v31.s[0] +sub v16.4s, v28.4s, v30.4s +add v28.4s, v28.4s, v30.4s +sqrdmulh v30.4S, v28.4S, v7.s[1] +mul v28.4S, v28.4S,v10.s[1] +mla v28.4S, v30.4S, v31.s[0] +sub v30.4s, v29.4s, v28.4s +add v29.4s, v29.4s, v28.4s +sqrdmulh v28.4S, v16.4S, 
v7.s[2] +mul v16.4S, v16.4S,v10.s[2] +mla v16.4S, v28.4S, v31.s[0] +sub v28.4s, v27.4s, v16.4s +add v27.4s, v27.4s, v16.4s +trn1 v16.4S, v29.4S, v30.4S +trn2 v26.4S, v29.4S, v30.4S +trn1 v25.4S, v27.4S, v28.4S +trn2 v24.4S, v27.4S, v28.4S +trn2 v27.2D, v16.2D, v25.2D +trn2 v28.2D, v26.2D, v24.2D +trn1 v29.2D, v16.2D, v25.2D +trn1 v30.2D, v26.2D, v24.2D +sqrdmulh v24.4S, v27.4S, v17.4S +mul v27.4S, v27.4S,v2.4S +mla v27.4S, v24.4S, v31.s[0] +sub v24.4s, v29.4s, v27.4s +add v29.4s, v29.4s, v27.4s +sqrdmulh v27.4S, v28.4S, v17.4S +mul v28.4S, v28.4S,v2.4S +mla v28.4S, v27.4S, v31.s[0] +sub v27.4s, v30.4s, v28.4s +add v30.4s, v30.4s, v28.4s +sqrdmulh v28.4S, v30.4S, v3.4S +mul v30.4S, v30.4S,v14.4S +mla v30.4S, v28.4S, v31.s[0] +sub v28.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +sqrdmulh v30.4S, v27.4S, v9.4S +mul v27.4S, v27.4S,v4.4S +mla v27.4S, v30.4S, v31.s[0] +sub v30.4s, v24.4s, v27.4s +add v24.4s, v24.4s, v27.4s +str q29, [x0, #0] +str q28, [x0, #16] +str q24, [x0, #32] +str q30, [x0, #48] +ldr q30, [x17, #+256] +ldr q24, [x17, #+272] +ldr q28, [x17, #+288] +ldr q29, [x17, #+304] +ldr q27, [x17, #+320] +ldr q26, [x17, #+336] +ldr q25, [x17, #+352] +ldr q16, [x17, #+368] +ldr q9, [x0, #96] +ldr q4, [x0, #112] +ldr q3, [x0, #64] +ldr q14, [x0, #80] +sqrdmulh v17.4S, v9.4S, v24.s[0] +mul v9.4S, v9.4S,v30.s[0] +mla v9.4S, v17.4S, v31.s[0] +sub v17.4s, v3.4s, v9.4s +add v3.4s, v3.4s, v9.4s +sqrdmulh v9.4S, v4.4S, v24.s[0] +mul v4.4S, v4.4S,v30.s[0] +mla v4.4S, v9.4S, v31.s[0] +sub v9.4s, v14.4s, v4.4s +add v14.4s, v14.4s, v4.4s +sqrdmulh v4.4S, v14.4S, v24.s[1] +mul v14.4S, v14.4S,v30.s[1] +mla v14.4S, v4.4S, v31.s[0] +sub v4.4s, v3.4s, v14.4s +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v9.4S, v24.s[2] +mul v9.4S, v9.4S,v30.s[2] +mla v9.4S, v14.4S, v31.s[0] +sub v14.4s, v17.4s, v9.4s +add v17.4s, v17.4s, v9.4s +trn1 v9.4S, v3.4S, v4.4S +trn2 v2.4S, v3.4S, v4.4S +trn1 v7.4S, v17.4S, v14.4S +trn2 v10.4S, v17.4S, v14.4S +trn2 v17.2D, v9.2D, v7.2D +trn2 v14.2D, 
v2.2D, v10.2D +trn1 v3.2D, v9.2D, v7.2D +trn1 v4.2D, v2.2D, v10.2D +sqrdmulh v10.4S, v17.4S, v29.4S +mul v17.4S, v17.4S,v28.4S +mla v17.4S, v10.4S, v31.s[0] +sub v10.4s, v3.4s, v17.4s +add v3.4s, v3.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v29.4S +mul v14.4S, v14.4S,v28.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v4.4s, v14.4s +add v4.4s, v4.4s, v14.4s +sqrdmulh v14.4S, v4.4S, v26.4S +mul v4.4S, v4.4S,v27.4S +mla v4.4S, v14.4S, v31.s[0] +sub v14.4s, v3.4s, v4.4s +add v3.4s, v3.4s, v4.4s +sqrdmulh v4.4S, v17.4S, v16.4S +mul v17.4S, v17.4S,v25.4S +mla v17.4S, v4.4S, v31.s[0] +sub v4.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +str q3, [x0, #64] +str q14, [x0, #80] +str q10, [x0, #96] +str q4, [x0, #112] +ldr q4, [x17, #+384] +ldr q10, [x17, #+400] +ldr q14, [x17, #+416] +ldr q3, [x17, #+432] +ldr q17, [x17, #+448] +ldr q2, [x17, #+464] +ldr q7, [x17, #+480] +ldr q9, [x17, #+496] +ldr q16, [x0, #160] +ldr q25, [x0, #176] +ldr q26, [x0, #128] +ldr q27, [x0, #144] +sqrdmulh v29.4S, v16.4S, v10.s[0] +mul v16.4S, v16.4S,v4.s[0] +mla v16.4S, v29.4S, v31.s[0] +sub v29.4s, v26.4s, v16.4s +add v26.4s, v26.4s, v16.4s +sqrdmulh v16.4S, v25.4S, v10.s[0] +mul v25.4S, v25.4S,v4.s[0] +mla v25.4S, v16.4S, v31.s[0] +sub v16.4s, v27.4s, v25.4s +add v27.4s, v27.4s, v25.4s +sqrdmulh v25.4S, v27.4S, v10.s[1] +mul v27.4S, v27.4S,v4.s[1] +mla v27.4S, v25.4S, v31.s[0] +sub v25.4s, v26.4s, v27.4s +add v26.4s, v26.4s, v27.4s +sqrdmulh v27.4S, v16.4S, v10.s[2] +mul v16.4S, v16.4S,v4.s[2] +mla v16.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v16.4s +add v29.4s, v29.4s, v16.4s +trn1 v16.4S, v26.4S, v25.4S +trn2 v28.4S, v26.4S, v25.4S +trn1 v24.4S, v29.4S, v27.4S +trn2 v30.4S, v29.4S, v27.4S +trn2 v29.2D, v16.2D, v24.2D +trn2 v27.2D, v28.2D, v30.2D +trn1 v26.2D, v16.2D, v24.2D +trn1 v25.2D, v28.2D, v30.2D +sqrdmulh v30.4S, v29.4S, v3.4S +mul v29.4S, v29.4S,v14.4S +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v26.4s, v29.4s +add v26.4s, v26.4s, v29.4s +sqrdmulh v29.4S, v27.4S, v3.4S +mul v27.4S, 
v27.4S,v14.4S +mla v27.4S, v29.4S, v31.s[0] +sub v29.4s, v25.4s, v27.4s +add v25.4s, v25.4s, v27.4s +sqrdmulh v27.4S, v25.4S, v2.4S +mul v25.4S, v25.4S,v17.4S +mla v25.4S, v27.4S, v31.s[0] +sub v27.4s, v26.4s, v25.4s +add v26.4s, v26.4s, v25.4s +sqrdmulh v25.4S, v29.4S, v9.4S +mul v29.4S, v29.4S,v7.4S +mla v29.4S, v25.4S, v31.s[0] +sub v25.4s, v30.4s, v29.4s +add v30.4s, v30.4s, v29.4s +str q26, [x0, #128] +str q27, [x0, #144] +str q30, [x0, #160] +str q25, [x0, #176] +ldr q25, [x17, #+512] +ldr q30, [x17, #+528] +ldr q27, [x17, #+544] +ldr q26, [x17, #+560] +ldr q29, [x17, #+576] +ldr q28, [x17, #+592] +ldr q24, [x17, #+608] +ldr q16, [x17, #+624] +ldr q9, [x0, #224] +ldr q7, [x0, #240] +ldr q2, [x0, #192] +ldr q17, [x0, #208] +sqrdmulh v3.4S, v9.4S, v30.s[0] +mul v9.4S, v9.4S,v25.s[0] +mla v9.4S, v3.4S, v31.s[0] +sub v3.4s, v2.4s, v9.4s +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v7.4S, v30.s[0] +mul v7.4S, v7.4S,v25.s[0] +mla v7.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v7.4s +add v17.4s, v17.4s, v7.4s +sqrdmulh v7.4S, v17.4S, v30.s[1] +mul v17.4S, v17.4S,v25.s[1] +mla v17.4S, v7.4S, v31.s[0] +sub v7.4s, v2.4s, v17.4s +add v2.4s, v2.4s, v17.4s +sqrdmulh v17.4S, v9.4S, v30.s[2] +mul v9.4S, v9.4S,v25.s[2] +mla v9.4S, v17.4S, v31.s[0] +sub v17.4s, v3.4s, v9.4s +add v3.4s, v3.4s, v9.4s +trn1 v9.4S, v2.4S, v7.4S +trn2 v14.4S, v2.4S, v7.4S +trn1 v10.4S, v3.4S, v17.4S +trn2 v4.4S, v3.4S, v17.4S +trn2 v3.2D, v9.2D, v10.2D +trn2 v17.2D, v14.2D, v4.2D +trn1 v2.2D, v9.2D, v10.2D +trn1 v7.2D, v14.2D, v4.2D +sqrdmulh v4.4S, v3.4S, v26.4S +mul v3.4S, v3.4S,v27.4S +mla v3.4S, v4.4S, v31.s[0] +sub v4.4s, v2.4s, v3.4s +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v17.4S, v26.4S +mul v17.4S, v17.4S,v27.4S +mla v17.4S, v3.4S, v31.s[0] +sub v3.4s, v7.4s, v17.4s +add v7.4s, v7.4s, v17.4s +sqrdmulh v17.4S, v7.4S, v28.4S +mul v7.4S, v7.4S,v29.4S +mla v7.4S, v17.4S, v31.s[0] +sub v17.4s, v2.4s, v7.4s +add v2.4s, v2.4s, v7.4s +sqrdmulh v7.4S, v3.4S, v16.4S +mul v3.4S, v3.4S,v24.4S +mla v3.4S, 
v7.4S, v31.s[0] +sub v7.4s, v4.4s, v3.4s +add v4.4s, v4.4s, v3.4s +str q2, [x0, #192] +str q17, [x0, #208] +str q4, [x0, #224] +str q7, [x0, #240] +ldr q7, [x17, #+640] +ldr q4, [x17, #+656] +ldr q17, [x17, #+672] +ldr q2, [x17, #+688] +ldr q3, [x17, #+704] +ldr q14, [x17, #+720] +ldr q10, [x17, #+736] +ldr q9, [x17, #+752] +ldr q16, [x0, #288] +ldr q24, [x0, #304] +ldr q28, [x0, #256] +ldr q29, [x0, #272] +sqrdmulh v26.4S, v16.4S, v4.s[0] +mul v16.4S, v16.4S,v7.s[0] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v28.4s, v16.4s +add v28.4s, v28.4s, v16.4s +sqrdmulh v16.4S, v24.4S, v4.s[0] +mul v24.4S, v24.4S,v7.s[0] +mla v24.4S, v16.4S, v31.s[0] +sub v16.4s, v29.4s, v24.4s +add v29.4s, v29.4s, v24.4s +sqrdmulh v24.4S, v29.4S, v4.s[1] +mul v29.4S, v29.4S,v7.s[1] +mla v29.4S, v24.4S, v31.s[0] +sub v24.4s, v28.4s, v29.4s +add v28.4s, v28.4s, v29.4s +sqrdmulh v29.4S, v16.4S, v4.s[2] +mul v16.4S, v16.4S,v7.s[2] +mla v16.4S, v29.4S, v31.s[0] +sub v29.4s, v26.4s, v16.4s +add v26.4s, v26.4s, v16.4s +trn1 v16.4S, v28.4S, v24.4S +trn2 v27.4S, v28.4S, v24.4S +trn1 v30.4S, v26.4S, v29.4S +trn2 v25.4S, v26.4S, v29.4S +trn2 v26.2D, v16.2D, v30.2D +trn2 v29.2D, v27.2D, v25.2D +trn1 v28.2D, v16.2D, v30.2D +trn1 v24.2D, v27.2D, v25.2D +sqrdmulh v25.4S, v26.4S, v2.4S +mul v26.4S, v26.4S,v17.4S +mla v26.4S, v25.4S, v31.s[0] +sub v25.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sqrdmulh v26.4S, v29.4S, v2.4S +mul v29.4S, v29.4S,v17.4S +mla v29.4S, v26.4S, v31.s[0] +sub v26.4s, v24.4s, v29.4s +add v24.4s, v24.4s, v29.4s +sqrdmulh v29.4S, v24.4S, v14.4S +mul v24.4S, v24.4S,v3.4S +mla v24.4S, v29.4S, v31.s[0] +sub v29.4s, v28.4s, v24.4s +add v28.4s, v28.4s, v24.4s +sqrdmulh v24.4S, v26.4S, v9.4S +mul v26.4S, v26.4S,v10.4S +mla v26.4S, v24.4S, v31.s[0] +sub v24.4s, v25.4s, v26.4s +add v25.4s, v25.4s, v26.4s +str q28, [x0, #256] +str q29, [x0, #272] +str q25, [x0, #288] +str q24, [x0, #304] +ldr q24, [x17, #+768] +ldr q25, [x17, #+784] +ldr q29, [x17, #+800] +ldr q28, [x17, #+816] +ldr q26, 
[x17, #+832] +ldr q27, [x17, #+848] +ldr q30, [x17, #+864] +ldr q16, [x17, #+880] +ldr q9, [x0, #352] +ldr q10, [x0, #368] +ldr q14, [x0, #320] +ldr q3, [x0, #336] +sqrdmulh v2.4S, v9.4S, v25.s[0] +mul v9.4S, v9.4S,v24.s[0] +mla v9.4S, v2.4S, v31.s[0] +sub v2.4s, v14.4s, v9.4s +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v24.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v3.4s, v10.4s +add v3.4s, v3.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v25.s[1] +mul v3.4S, v3.4S,v24.s[1] +mla v3.4S, v10.4S, v31.s[0] +sub v10.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v25.s[2] +mul v9.4S, v9.4S,v24.s[2] +mla v9.4S, v3.4S, v31.s[0] +sub v3.4s, v2.4s, v9.4s +add v2.4s, v2.4s, v9.4s +trn1 v9.4S, v14.4S, v10.4S +trn2 v17.4S, v14.4S, v10.4S +trn1 v4.4S, v2.4S, v3.4S +trn2 v7.4S, v2.4S, v3.4S +trn2 v2.2D, v9.2D, v4.2D +trn2 v3.2D, v17.2D, v7.2D +trn1 v14.2D, v9.2D, v4.2D +trn1 v10.2D, v17.2D, v7.2D +sqrdmulh v7.4S, v2.4S, v28.4S +mul v2.4S, v2.4S,v29.4S +mla v2.4S, v7.4S, v31.s[0] +sub v7.4s, v14.4s, v2.4s +add v14.4s, v14.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v28.4S +mul v3.4S, v3.4S,v29.4S +mla v3.4S, v2.4S, v31.s[0] +sub v2.4s, v10.4s, v3.4s +add v10.4s, v10.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v27.4S +mul v10.4S, v10.4S,v26.4S +mla v10.4S, v3.4S, v31.s[0] +sub v3.4s, v14.4s, v10.4s +add v14.4s, v14.4s, v10.4s +sqrdmulh v10.4S, v2.4S, v16.4S +mul v2.4S, v2.4S,v30.4S +mla v2.4S, v10.4S, v31.s[0] +sub v10.4s, v7.4s, v2.4s +add v7.4s, v7.4s, v2.4s +str q14, [x0, #320] +str q3, [x0, #336] +str q7, [x0, #352] +str q10, [x0, #368] +ldr q10, [x17, #+896] +ldr q7, [x17, #+912] +ldr q3, [x17, #+928] +ldr q14, [x17, #+944] +ldr q2, [x17, #+960] +ldr q17, [x17, #+976] +ldr q4, [x17, #+992] +ldr q9, [x17, #+1008] +ldr q16, [x0, #416] +ldr q30, [x0, #432] +ldr q27, [x0, #384] +ldr q26, [x0, #400] +sqrdmulh v28.4S, v16.4S, v7.s[0] +mul v16.4S, v16.4S,v10.s[0] +mla v16.4S, v28.4S, v31.s[0] +sub v28.4s, v27.4s, v16.4s +add v27.4s, v27.4s, v16.4s 
+sqrdmulh v16.4S, v30.4S, v7.s[0] +mul v30.4S, v30.4S,v10.s[0] +mla v30.4S, v16.4S, v31.s[0] +sub v16.4s, v26.4s, v30.4s +add v26.4s, v26.4s, v30.4s +sqrdmulh v30.4S, v26.4S, v7.s[1] +mul v26.4S, v26.4S,v10.s[1] +mla v26.4S, v30.4S, v31.s[0] +sub v30.4s, v27.4s, v26.4s +add v27.4s, v27.4s, v26.4s +sqrdmulh v26.4S, v16.4S, v7.s[2] +mul v16.4S, v16.4S,v10.s[2] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v28.4s, v16.4s +add v28.4s, v28.4s, v16.4s +trn1 v16.4S, v27.4S, v30.4S +trn2 v29.4S, v27.4S, v30.4S +trn1 v25.4S, v28.4S, v26.4S +trn2 v24.4S, v28.4S, v26.4S +trn2 v28.2D, v16.2D, v25.2D +trn2 v26.2D, v29.2D, v24.2D +trn1 v27.2D, v16.2D, v25.2D +trn1 v30.2D, v29.2D, v24.2D +sqrdmulh v24.4S, v28.4S, v14.4S +mul v28.4S, v28.4S,v3.4S +mla v28.4S, v24.4S, v31.s[0] +sub v24.4s, v27.4s, v28.4s +add v27.4s, v27.4s, v28.4s +sqrdmulh v28.4S, v26.4S, v14.4S +mul v26.4S, v26.4S,v3.4S +mla v26.4S, v28.4S, v31.s[0] +sub v28.4s, v30.4s, v26.4s +add v30.4s, v30.4s, v26.4s +sqrdmulh v26.4S, v30.4S, v17.4S +mul v30.4S, v30.4S,v2.4S +mla v30.4S, v26.4S, v31.s[0] +sub v26.4s, v27.4s, v30.4s +add v27.4s, v27.4s, v30.4s +sqrdmulh v30.4S, v28.4S, v9.4S +mul v28.4S, v28.4S,v4.4S +mla v28.4S, v30.4S, v31.s[0] +sub v30.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +str q27, [x0, #384] +str q26, [x0, #400] +str q24, [x0, #416] +str q30, [x0, #432] +ldr q30, [x17, #+1024] +ldr q24, [x17, #+1040] +ldr q26, [x17, #+1056] +ldr q27, [x17, #+1072] +ldr q28, [x17, #+1088] +ldr q29, [x17, #+1104] +ldr q25, [x17, #+1120] +ldr q16, [x17, #+1136] +ldr q9, [x0, #480] +ldr q4, [x0, #496] +ldr q17, [x0, #448] +ldr q2, [x0, #464] +sqrdmulh v14.4S, v9.4S, v24.s[0] +mul v9.4S, v9.4S,v30.s[0] +mla v9.4S, v14.4S, v31.s[0] +sub v14.4s, v17.4s, v9.4s +add v17.4s, v17.4s, v9.4s +sqrdmulh v9.4S, v4.4S, v24.s[0] +mul v4.4S, v4.4S,v30.s[0] +mla v4.4S, v9.4S, v31.s[0] +sub v9.4s, v2.4s, v4.4s +add v2.4s, v2.4s, v4.4s +sqrdmulh v4.4S, v2.4S, v24.s[1] +mul v2.4S, v2.4S,v30.s[1] +mla v2.4S, v4.4S, v31.s[0] +sub v4.4s, 
v17.4s, v2.4s +add v17.4s, v17.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v24.s[2] +mul v9.4S, v9.4S,v30.s[2] +mla v9.4S, v2.4S, v31.s[0] +sub v2.4s, v14.4s, v9.4s +add v14.4s, v14.4s, v9.4s +trn1 v9.4S, v17.4S, v4.4S +trn2 v3.4S, v17.4S, v4.4S +trn1 v7.4S, v14.4S, v2.4S +trn2 v10.4S, v14.4S, v2.4S +trn2 v14.2D, v9.2D, v7.2D +trn2 v2.2D, v3.2D, v10.2D +trn1 v17.2D, v9.2D, v7.2D +trn1 v4.2D, v3.2D, v10.2D +sqrdmulh v10.4S, v14.4S, v27.4S +mul v14.4S, v14.4S,v26.4S +mla v14.4S, v10.4S, v31.s[0] +sub v10.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v27.4S +mul v2.4S, v2.4S,v26.4S +mla v2.4S, v14.4S, v31.s[0] +sub v14.4s, v4.4s, v2.4s +add v4.4s, v4.4s, v2.4s +sqrdmulh v2.4S, v4.4S, v29.4S +mul v4.4S, v4.4S,v28.4S +mla v4.4S, v2.4S, v31.s[0] +sub v2.4s, v17.4s, v4.4s +add v17.4s, v17.4s, v4.4s +sqrdmulh v4.4S, v14.4S, v16.4S +mul v14.4S, v14.4S,v25.4S +mla v14.4S, v4.4S, v31.s[0] +sub v4.4s, v10.4s, v14.4s +add v10.4s, v10.4s, v14.4s +str q17, [x0, #448] +str q2, [x0, #464] +str q10, [x0, #480] +str q4, [x0, #496] +ldr q4, [x17, #+1152] +ldr q10, [x17, #+1168] +ldr q2, [x17, #+1184] +ldr q17, [x17, #+1200] +ldr q14, [x17, #+1216] +ldr q3, [x17, #+1232] +ldr q7, [x17, #+1248] +ldr q9, [x17, #+1264] +ldr q16, [x0, #544] +ldr q25, [x0, #560] +ldr q29, [x0, #512] +ldr q28, [x0, #528] +sqrdmulh v27.4S, v16.4S, v10.s[0] +mul v16.4S, v16.4S,v4.s[0] +mla v16.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v16.4s +add v29.4s, v29.4s, v16.4s +sqrdmulh v16.4S, v25.4S, v10.s[0] +mul v25.4S, v25.4S,v4.s[0] +mla v25.4S, v16.4S, v31.s[0] +sub v16.4s, v28.4s, v25.4s +add v28.4s, v28.4s, v25.4s +sqrdmulh v25.4S, v28.4S, v10.s[1] +mul v28.4S, v28.4S,v4.s[1] +mla v28.4S, v25.4S, v31.s[0] +sub v25.4s, v29.4s, v28.4s +add v29.4s, v29.4s, v28.4s +sqrdmulh v28.4S, v16.4S, v10.s[2] +mul v16.4S, v16.4S,v4.s[2] +mla v16.4S, v28.4S, v31.s[0] +sub v28.4s, v27.4s, v16.4s +add v27.4s, v27.4s, v16.4s +trn1 v16.4S, v29.4S, v25.4S +trn2 v26.4S, v29.4S, v25.4S +trn1 v24.4S, v27.4S, v28.4S 
+trn2 v30.4S, v27.4S, v28.4S +trn2 v27.2D, v16.2D, v24.2D +trn2 v28.2D, v26.2D, v30.2D +trn1 v29.2D, v16.2D, v24.2D +trn1 v25.2D, v26.2D, v30.2D +sqrdmulh v30.4S, v27.4S, v17.4S +mul v27.4S, v27.4S,v2.4S +mla v27.4S, v30.4S, v31.s[0] +sub v30.4s, v29.4s, v27.4s +add v29.4s, v29.4s, v27.4s +sqrdmulh v27.4S, v28.4S, v17.4S +mul v28.4S, v28.4S,v2.4S +mla v28.4S, v27.4S, v31.s[0] +sub v27.4s, v25.4s, v28.4s +add v25.4s, v25.4s, v28.4s +sqrdmulh v28.4S, v25.4S, v3.4S +mul v25.4S, v25.4S,v14.4S +mla v25.4S, v28.4S, v31.s[0] +sub v28.4s, v29.4s, v25.4s +add v29.4s, v29.4s, v25.4s +sqrdmulh v25.4S, v27.4S, v9.4S +mul v27.4S, v27.4S,v7.4S +mla v27.4S, v25.4S, v31.s[0] +sub v25.4s, v30.4s, v27.4s +add v30.4s, v30.4s, v27.4s +str q29, [x0, #512] +str q28, [x0, #528] +str q30, [x0, #544] +str q25, [x0, #560] +ldr q25, [x17, #+1280] +ldr q30, [x17, #+1296] +ldr q28, [x17, #+1312] +ldr q29, [x17, #+1328] +ldr q27, [x17, #+1344] +ldr q26, [x17, #+1360] +ldr q24, [x17, #+1376] +ldr q16, [x17, #+1392] +ldr q9, [x0, #608] +ldr q7, [x0, #624] +ldr q3, [x0, #576] +ldr q14, [x0, #592] +sqrdmulh v17.4S, v9.4S, v30.s[0] +mul v9.4S, v9.4S,v25.s[0] +mla v9.4S, v17.4S, v31.s[0] +sub v17.4s, v3.4s, v9.4s +add v3.4s, v3.4s, v9.4s +sqrdmulh v9.4S, v7.4S, v30.s[0] +mul v7.4S, v7.4S,v25.s[0] +mla v7.4S, v9.4S, v31.s[0] +sub v9.4s, v14.4s, v7.4s +add v14.4s, v14.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v30.s[1] +mul v14.4S, v14.4S,v25.s[1] +mla v14.4S, v7.4S, v31.s[0] +sub v7.4s, v3.4s, v14.4s +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v9.4S, v30.s[2] +mul v9.4S, v9.4S,v25.s[2] +mla v9.4S, v14.4S, v31.s[0] +sub v14.4s, v17.4s, v9.4s +add v17.4s, v17.4s, v9.4s +trn1 v9.4S, v3.4S, v7.4S +trn2 v2.4S, v3.4S, v7.4S +trn1 v10.4S, v17.4S, v14.4S +trn2 v4.4S, v17.4S, v14.4S +trn2 v17.2D, v9.2D, v10.2D +trn2 v14.2D, v2.2D, v4.2D +trn1 v3.2D, v9.2D, v10.2D +trn1 v7.2D, v2.2D, v4.2D +sqrdmulh v4.4S, v17.4S, v29.4S +mul v17.4S, v17.4S,v28.4S +mla v17.4S, v4.4S, v31.s[0] +sub v4.4s, v3.4s, v17.4s +add v3.4s, 
v3.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v29.4S +mul v14.4S, v14.4S,v28.4S +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v7.4s, v14.4s +add v7.4s, v7.4s, v14.4s +sqrdmulh v14.4S, v7.4S, v26.4S +mul v7.4S, v7.4S,v27.4S +mla v7.4S, v14.4S, v31.s[0] +sub v14.4s, v3.4s, v7.4s +add v3.4s, v3.4s, v7.4s +sqrdmulh v7.4S, v17.4S, v16.4S +mul v17.4S, v17.4S,v24.4S +mla v17.4S, v7.4S, v31.s[0] +sub v7.4s, v4.4s, v17.4s +add v4.4s, v4.4s, v17.4s +str q3, [x0, #576] +str q14, [x0, #592] +str q4, [x0, #608] +str q7, [x0, #624] +ldr q7, [x17, #+1408] +ldr q4, [x17, #+1424] +ldr q14, [x17, #+1440] +ldr q3, [x17, #+1456] +ldr q17, [x17, #+1472] +ldr q2, [x17, #+1488] +ldr q10, [x17, #+1504] +ldr q9, [x17, #+1520] +ldr q16, [x0, #672] +ldr q24, [x0, #688] +ldr q26, [x0, #640] +ldr q27, [x0, #656] +sqrdmulh v29.4S, v16.4S, v4.s[0] +mul v16.4S, v16.4S,v7.s[0] +mla v16.4S, v29.4S, v31.s[0] +sub v29.4s, v26.4s, v16.4s +add v26.4s, v26.4s, v16.4s +sqrdmulh v16.4S, v24.4S, v4.s[0] +mul v24.4S, v24.4S,v7.s[0] +mla v24.4S, v16.4S, v31.s[0] +sub v16.4s, v27.4s, v24.4s +add v27.4s, v27.4s, v24.4s +sqrdmulh v24.4S, v27.4S, v4.s[1] +mul v27.4S, v27.4S,v7.s[1] +mla v27.4S, v24.4S, v31.s[0] +sub v24.4s, v26.4s, v27.4s +add v26.4s, v26.4s, v27.4s +sqrdmulh v27.4S, v16.4S, v4.s[2] +mul v16.4S, v16.4S,v7.s[2] +mla v16.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v16.4s +add v29.4s, v29.4s, v16.4s +trn1 v16.4S, v26.4S, v24.4S +trn2 v28.4S, v26.4S, v24.4S +trn1 v30.4S, v29.4S, v27.4S +trn2 v25.4S, v29.4S, v27.4S +trn2 v29.2D, v16.2D, v30.2D +trn2 v27.2D, v28.2D, v25.2D +trn1 v26.2D, v16.2D, v30.2D +trn1 v24.2D, v28.2D, v25.2D +sqrdmulh v25.4S, v29.4S, v3.4S +mul v29.4S, v29.4S,v14.4S +mla v29.4S, v25.4S, v31.s[0] +sub v25.4s, v26.4s, v29.4s +add v26.4s, v26.4s, v29.4s +sqrdmulh v29.4S, v27.4S, v3.4S +mul v27.4S, v27.4S,v14.4S +mla v27.4S, v29.4S, v31.s[0] +sub v29.4s, v24.4s, v27.4s +add v24.4s, v24.4s, v27.4s +sqrdmulh v27.4S, v24.4S, v2.4S +mul v24.4S, v24.4S,v17.4S +mla v24.4S, v27.4S, v31.s[0] +sub 
v27.4s, v26.4s, v24.4s +add v26.4s, v26.4s, v24.4s +sqrdmulh v24.4S, v29.4S, v9.4S +mul v29.4S, v29.4S,v10.4S +mla v29.4S, v24.4S, v31.s[0] +sub v24.4s, v25.4s, v29.4s +add v25.4s, v25.4s, v29.4s +str q26, [x0, #640] +str q27, [x0, #656] +str q25, [x0, #672] +str q24, [x0, #688] +ldr q24, [x17, #+1536] +ldr q25, [x17, #+1552] +ldr q27, [x17, #+1568] +ldr q26, [x17, #+1584] +ldr q29, [x17, #+1600] +ldr q28, [x17, #+1616] +ldr q30, [x17, #+1632] +ldr q16, [x17, #+1648] +ldr q9, [x0, #736] +ldr q10, [x0, #752] +ldr q2, [x0, #704] +ldr q17, [x0, #720] +sqrdmulh v3.4S, v9.4S, v25.s[0] +mul v9.4S, v9.4S,v24.s[0] +mla v9.4S, v3.4S, v31.s[0] +sub v3.4s, v2.4s, v9.4s +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v24.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v10.4s +add v17.4s, v17.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v25.s[1] +mul v17.4S, v17.4S,v24.s[1] +mla v17.4S, v10.4S, v31.s[0] +sub v10.4s, v2.4s, v17.4s +add v2.4s, v2.4s, v17.4s +sqrdmulh v17.4S, v9.4S, v25.s[2] +mul v9.4S, v9.4S,v24.s[2] +mla v9.4S, v17.4S, v31.s[0] +sub v17.4s, v3.4s, v9.4s +add v3.4s, v3.4s, v9.4s +trn1 v9.4S, v2.4S, v10.4S +trn2 v14.4S, v2.4S, v10.4S +trn1 v4.4S, v3.4S, v17.4S +trn2 v7.4S, v3.4S, v17.4S +trn2 v3.2D, v9.2D, v4.2D +trn2 v17.2D, v14.2D, v7.2D +trn1 v2.2D, v9.2D, v4.2D +trn1 v10.2D, v14.2D, v7.2D +sqrdmulh v7.4S, v3.4S, v26.4S +mul v3.4S, v3.4S,v27.4S +mla v3.4S, v7.4S, v31.s[0] +sub v7.4s, v2.4s, v3.4s +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v17.4S, v26.4S +mul v17.4S, v17.4S,v27.4S +mla v17.4S, v3.4S, v31.s[0] +sub v3.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v10.4S, v28.4S +mul v10.4S, v10.4S,v29.4S +mla v10.4S, v17.4S, v31.s[0] +sub v17.4s, v2.4s, v10.4s +add v2.4s, v2.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v16.4S +mul v3.4S, v3.4S,v30.4S +mla v3.4S, v10.4S, v31.s[0] +sub v10.4s, v7.4s, v3.4s +add v7.4s, v7.4s, v3.4s +str q2, [x0, #704] +str q17, [x0, #720] +str q7, [x0, #736] +str q10, [x0, #752] +ldr q10, [x17, 
#+1664] +ldr q7, [x17, #+1680] +ldr q17, [x17, #+1696] +ldr q2, [x17, #+1712] +ldr q3, [x17, #+1728] +ldr q14, [x17, #+1744] +ldr q4, [x17, #+1760] +ldr q9, [x17, #+1776] +ldr q16, [x0, #800] +ldr q30, [x0, #816] +ldr q28, [x0, #768] +ldr q29, [x0, #784] +sqrdmulh v26.4S, v16.4S, v7.s[0] +mul v16.4S, v16.4S,v10.s[0] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v28.4s, v16.4s +add v28.4s, v28.4s, v16.4s +sqrdmulh v16.4S, v30.4S, v7.s[0] +mul v30.4S, v30.4S,v10.s[0] +mla v30.4S, v16.4S, v31.s[0] +sub v16.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +sqrdmulh v30.4S, v29.4S, v7.s[1] +mul v29.4S, v29.4S,v10.s[1] +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v28.4s, v29.4s +add v28.4s, v28.4s, v29.4s +sqrdmulh v29.4S, v16.4S, v7.s[2] +mul v16.4S, v16.4S,v10.s[2] +mla v16.4S, v29.4S, v31.s[0] +sub v29.4s, v26.4s, v16.4s +add v26.4s, v26.4s, v16.4s +trn1 v16.4S, v28.4S, v30.4S +trn2 v27.4S, v28.4S, v30.4S +trn1 v25.4S, v26.4S, v29.4S +trn2 v24.4S, v26.4S, v29.4S +trn2 v26.2D, v16.2D, v25.2D +trn2 v29.2D, v27.2D, v24.2D +trn1 v28.2D, v16.2D, v25.2D +trn1 v30.2D, v27.2D, v24.2D +sqrdmulh v24.4S, v26.4S, v2.4S +mul v26.4S, v26.4S,v17.4S +mla v26.4S, v24.4S, v31.s[0] +sub v24.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sqrdmulh v26.4S, v29.4S, v2.4S +mul v29.4S, v29.4S,v17.4S +mla v29.4S, v26.4S, v31.s[0] +sub v26.4s, v30.4s, v29.4s +add v30.4s, v30.4s, v29.4s +sqrdmulh v29.4S, v30.4S, v14.4S +mul v30.4S, v30.4S,v3.4S +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v28.4s, v30.4s +add v28.4s, v28.4s, v30.4s +sqrdmulh v30.4S, v26.4S, v9.4S +mul v26.4S, v26.4S,v4.4S +mla v26.4S, v30.4S, v31.s[0] +sub v30.4s, v24.4s, v26.4s +add v24.4s, v24.4s, v26.4s +str q28, [x0, #768] +str q29, [x0, #784] +str q24, [x0, #800] +str q30, [x0, #816] +ldr q30, [x17, #+1792] +ldr q24, [x17, #+1808] +ldr q29, [x17, #+1824] +ldr q28, [x17, #+1840] +ldr q26, [x17, #+1856] +ldr q27, [x17, #+1872] +ldr q25, [x17, #+1888] +ldr q16, [x17, #+1904] +ldr q9, [x0, #864] +ldr q4, [x0, #880] +ldr q14, [x0, #832] 
+ldr q3, [x0, #848] +sqrdmulh v2.4S, v9.4S, v24.s[0] +mul v9.4S, v9.4S,v30.s[0] +mla v9.4S, v2.4S, v31.s[0] +sub v2.4s, v14.4s, v9.4s +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v4.4S, v24.s[0] +mul v4.4S, v4.4S,v30.s[0] +mla v4.4S, v9.4S, v31.s[0] +sub v9.4s, v3.4s, v4.4s +add v3.4s, v3.4s, v4.4s +sqrdmulh v4.4S, v3.4S, v24.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v3.4S, v4.4S, v31.s[0] +sub v4.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v24.s[2] +mul v9.4S, v9.4S,v30.s[2] +mla v9.4S, v3.4S, v31.s[0] +sub v3.4s, v2.4s, v9.4s +add v2.4s, v2.4s, v9.4s +trn1 v9.4S, v14.4S, v4.4S +trn2 v17.4S, v14.4S, v4.4S +trn1 v7.4S, v2.4S, v3.4S +trn2 v10.4S, v2.4S, v3.4S +trn2 v2.2D, v9.2D, v7.2D +trn2 v3.2D, v17.2D, v10.2D +trn1 v14.2D, v9.2D, v7.2D +trn1 v4.2D, v17.2D, v10.2D +sqrdmulh v10.4S, v2.4S, v28.4S +mul v2.4S, v2.4S,v29.4S +mla v2.4S, v10.4S, v31.s[0] +sub v10.4s, v14.4s, v2.4s +add v14.4s, v14.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v28.4S +mul v3.4S, v3.4S,v29.4S +mla v3.4S, v2.4S, v31.s[0] +sub v2.4s, v4.4s, v3.4s +add v4.4s, v4.4s, v3.4s +sqrdmulh v3.4S, v4.4S, v27.4S +mul v4.4S, v4.4S,v26.4S +mla v4.4S, v3.4S, v31.s[0] +sub v3.4s, v14.4s, v4.4s +add v14.4s, v14.4s, v4.4s +sqrdmulh v4.4S, v2.4S, v16.4S +mul v2.4S, v2.4S,v25.4S +mla v2.4S, v4.4S, v31.s[0] +sub v4.4s, v10.4s, v2.4s +add v10.4s, v10.4s, v2.4s +str q14, [x0, #832] +str q3, [x0, #848] +str q10, [x0, #864] +str q4, [x0, #880] +ldr q4, [x17, #+1920] +ldr q10, [x17, #+1936] +ldr q3, [x17, #+1952] +ldr q14, [x17, #+1968] +ldr q2, [x17, #+1984] +ldr q17, [x17, #+2000] +ldr q7, [x17, #+2016] +ldr q9, [x17, #+2032] +ldr q16, [x0, #928] +ldr q25, [x0, #944] +ldr q27, [x0, #896] +ldr q26, [x0, #912] +sqrdmulh v28.4S, v16.4S, v10.s[0] +mul v16.4S, v16.4S,v4.s[0] +mla v16.4S, v28.4S, v31.s[0] +sub v28.4s, v27.4s, v16.4s +add v27.4s, v27.4s, v16.4s +sqrdmulh v16.4S, v25.4S, v10.s[0] +mul v25.4S, v25.4S,v4.s[0] +mla v25.4S, v16.4S, v31.s[0] +sub v16.4s, v26.4s, v25.4s +add v26.4s, v26.4s, v25.4s 
+sqrdmulh v25.4S, v26.4S, v10.s[1] +mul v26.4S, v26.4S,v4.s[1] +mla v26.4S, v25.4S, v31.s[0] +sub v25.4s, v27.4s, v26.4s +add v27.4s, v27.4s, v26.4s +sqrdmulh v26.4S, v16.4S, v10.s[2] +mul v16.4S, v16.4S,v4.s[2] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v28.4s, v16.4s +add v28.4s, v28.4s, v16.4s +trn1 v16.4S, v27.4S, v25.4S +trn2 v29.4S, v27.4S, v25.4S +trn1 v24.4S, v28.4S, v26.4S +trn2 v30.4S, v28.4S, v26.4S +trn2 v28.2D, v16.2D, v24.2D +trn2 v26.2D, v29.2D, v30.2D +trn1 v27.2D, v16.2D, v24.2D +trn1 v25.2D, v29.2D, v30.2D +sqrdmulh v30.4S, v28.4S, v14.4S +mul v28.4S, v28.4S,v3.4S +mla v28.4S, v30.4S, v31.s[0] +sub v30.4s, v27.4s, v28.4s +add v27.4s, v27.4s, v28.4s +sqrdmulh v28.4S, v26.4S, v14.4S +mul v26.4S, v26.4S,v3.4S +mla v26.4S, v28.4S, v31.s[0] +sub v28.4s, v25.4s, v26.4s +add v25.4s, v25.4s, v26.4s +sqrdmulh v26.4S, v25.4S, v17.4S +mul v25.4S, v25.4S,v2.4S +mla v25.4S, v26.4S, v31.s[0] +sub v26.4s, v27.4s, v25.4s +add v27.4s, v27.4s, v25.4s +sqrdmulh v25.4S, v28.4S, v9.4S +mul v28.4S, v28.4S,v7.4S +mla v28.4S, v25.4S, v31.s[0] +sub v25.4s, v30.4s, v28.4s +add v30.4s, v30.4s, v28.4s +str q27, [x0, #896] +str q26, [x0, #912] +str q30, [x0, #928] +str q25, [x0, #944] +ldr q25, [x17, #+2048] +ldr q30, [x17, #+2064] +ldr q26, [x17, #+2080] +ldr q27, [x17, #+2096] +ldr q28, [x17, #+2112] +ldr q29, [x17, #+2128] +ldr q24, [x17, #+2144] +ldr q16, [x17, #+2160] +ldr q9, [x0, #992] +ldr q7, [x0, #1008] +ldr q17, [x0, #960] +ldr q2, [x0, #976] +sqrdmulh v14.4S, v9.4S, v30.s[0] +mul v9.4S, v9.4S,v25.s[0] +mla v9.4S, v14.4S, v31.s[0] +sub v14.4s, v17.4s, v9.4s +add v17.4s, v17.4s, v9.4s +sqrdmulh v9.4S, v7.4S, v30.s[0] +mul v7.4S, v7.4S,v25.s[0] +mla v7.4S, v9.4S, v31.s[0] +sub v9.4s, v2.4s, v7.4s +add v2.4s, v2.4s, v7.4s +sqrdmulh v7.4S, v2.4S, v30.s[1] +mul v2.4S, v2.4S,v25.s[1] +mla v2.4S, v7.4S, v31.s[0] +sub v7.4s, v17.4s, v2.4s +add v17.4s, v17.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v30.s[2] +mul v9.4S, v9.4S,v25.s[2] +mla v9.4S, v2.4S, v31.s[0] +sub v2.4s, v14.4s, 
v9.4s +add v14.4s, v14.4s, v9.4s +trn1 v9.4S, v17.4S, v7.4S +trn2 v3.4S, v17.4S, v7.4S +trn1 v10.4S, v14.4S, v2.4S +trn2 v4.4S, v14.4S, v2.4S +trn2 v14.2D, v9.2D, v10.2D +trn2 v2.2D, v3.2D, v4.2D +trn1 v17.2D, v9.2D, v10.2D +trn1 v7.2D, v3.2D, v4.2D +sqrdmulh v4.4S, v14.4S, v27.4S +mul v14.4S, v14.4S,v26.4S +mla v14.4S, v4.4S, v31.s[0] +sub v4.4s, v17.4s, v14.4s +add v17.4s, v17.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v27.4S +mul v2.4S, v2.4S,v26.4S +mla v2.4S, v14.4S, v31.s[0] +sub v14.4s, v7.4s, v2.4s +add v7.4s, v7.4s, v2.4s +sqrdmulh v2.4S, v7.4S, v29.4S +mul v7.4S, v7.4S,v28.4S +mla v7.4S, v2.4S, v31.s[0] +sub v2.4s, v17.4s, v7.4s +add v17.4s, v17.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v16.4S +mul v14.4S, v14.4S,v24.4S +mla v14.4S, v7.4S, v31.s[0] +sub v7.4s, v4.4s, v14.4s +add v4.4s, v4.4s, v14.4s +str q17, [x0, #960] +str q2, [x0, #976] +str q4, [x0, #992] +str q7, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_9_0.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_9_0.s new file mode 100644 index 0000000..df18890 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_9_0.s @@ -0,0 +1,2422 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, 
distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 
// Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 26036764 // Layer 6, block 0 +.word 7065381 // Layer 6, block 1 +.word 11280567 // Layer 6, block 2 +.word 19695786 // Layer 6, block 3 +.word 1666225723 // Layer 6, block 0 +.word 452149874 // Layer 6, block 1 +.word 721901190 // Layer 6, block 2 +.word 1260434103 // Layer 6, block 3 +.word 28678040 // Layer 7, block 0 +.word 5637166 // Layer 7, block 2 +.word 18759424 // Layer 7, block 4 +.word 8648030 // Layer 7, block 6 +.word 1835254486 // Layer 7, block 0 +.word 360751090 // Layer 7, block 2 +.word 1200511508 // Layer 7, block 4 +.word 553431680 // Layer 7, block 6 +.word 7232147 // Layer 7, block 1 +.word 7430689 // Layer 7, block 3 +.word 14819378 // Layer 7, block 5 +.word 22112339 // Layer 7, block 7 +.word 462822084 // Layer 7, block 1 +.word 475527802 // Layer 7, block 3 +.word 948367809 // Layer 7, block 5 +.word 1415081692 // Layer 7, block 7 +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14834498 // Layer 6, block 4 +.word 22861321 // Layer 6, block 5 +.word 23033862 // Layer 6, block 6 +.word 32211066 // Layer 6, block 7 +.word 949335415 // Layer 6, block 4 +.word 1463012881 // Layer 6, block 5 +.word 1474054663 // Layer 6, block 6 +.word 2061350894 // Layer 6, block 7 +.word 7103825 // Layer 7, block 8 +.word 24338119 // Layer 7, block 10 +.word 
6674394 // Layer 7, block 12 +.word 3716128 // Layer 7, block 14 +.word 454610102 // Layer 7, block 8 +.word 1557520740 // Layer 7, block 10 +.word 427128616 // Layer 7, block 12 +.word 237814041 // Layer 7, block 14 +.word 18577393 // Layer 7, block 9 +.word 17042091 // Layer 7, block 11 +.word 6574213 // Layer 7, block 13 +.word 24666803 // Layer 7, block 15 +.word 1188862414 // Layer 7, block 9 +.word 1090610585 // Layer 7, block 11 +.word 420717521 // Layer 7, block 13 +.word 1578554911 // Layer 7, block 15 +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 11253846 // Layer 6, block 8 +.word 16151303 // Layer 6, block 9 +.word 1821442 // Layer 6, block 10 +.word 23358663 // Layer 6, block 11 +.word 720191176 // Layer 6, block 8 +.word 1033604503 // Layer 6, block 9 +.word 116563391 // Layer 6, block 10 +.word 1494840340 // Layer 6, block 11 +.word 32787475 // Layer 7, block 16 +.word 8269259 // Layer 7, block 18 +.word 20826321 // Layer 7, block 20 +.word 21194054 // Layer 7, block 22 +.word 2098238255 // Layer 7, block 16 +.word 529192186 // Layer 7, block 18 +.word 1332782821 // Layer 7, block 20 +.word 1356315937 // Layer 7, block 22 +.word 28400654 // Layer 7, block 17 +.word 31090287 // Layer 7, block 19 +.word 26776841 // Layer 7, block 21 +.word 22281074 // Layer 7, block 23 +.word 1817503137 // Layer 7, block 17 +.word 1989626512 // Layer 7, block 19 +.word 1713587037 // Layer 7, block 21 +.word 1425879908 // Layer 7, block 23 +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None 
+.word 20504641 // Layer 6, block 12 +.word 7735096 // Layer 6, block 13 +.word 29463916 // Layer 6, block 14 +.word 23172067 // Layer 6, block 15 +.word 1312196872 // Layer 6, block 12 +.word 495008363 // Layer 6, block 13 +.word 1885546712 // Layer 6, block 14 +.word 1482899108 // Layer 6, block 15 +.word 1953000 // Layer 7, block 24 +.word 12766243 // Layer 7, block 26 +.word 16292342 // Layer 7, block 28 +.word 25143337 // Layer 7, block 30 +.word 124982461 // Layer 7, block 24 +.word 816977197 // Layer 7, block 26 +.word 1042630311 // Layer 7, block 28 +.word 1609050759 // Layer 7, block 30 +.word 12486848 // Layer 7, block 25 +.word 31556661 // Layer 7, block 27 +.word 28330310 // Layer 7, block 29 +.word 15137961 // Layer 7, block 31 +.word 799097282 // Layer 7, block 25 +.word 2019472170 // Layer 7, block 27 +.word 1813001465 // Layer 7, block 29 +.word 968755565 // Layer 7, block 31 +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 18663828 // Layer 6, block 16 +.word 25765932 // Layer 6, block 17 +.word 11779122 // Layer 6, block 18 +.word 29112305 // Layer 6, block 19 +.word 1194393831 // Layer 6, block 16 +.word 1648893798 // Layer 6, block 17 +.word 753806275 // Layer 6, block 18 +.word 1863045325 // Layer 6, block 19 +.word 33163184 // Layer 7, block 32 +.word 11550623 // Layer 7, block 34 +.word 25375595 // Layer 7, block 36 +.word 18254638 // Layer 7, block 38 +.word 2122281795 // Layer 7, block 32 +.word 739183455 // Layer 7, block 34 +.word 1623914137 // Layer 7, block 36 +.word 1168207670 // Layer 7, block 38 +.word 9551359 // Layer 7, block 33 +.word 33257316 // Layer 7, block 35 +.word 10387700 // Layer 7, block 37 +.word 4263629 // Layer 7, block 39 +.word 611240324 // Layer 7, block 33 +.word 
2128305784 // Layer 7, block 35 +.word 664762063 // Layer 7, block 37 +.word 272851431 // Layer 7, block 39 +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 596073 // Layer 6, block 20 +.word 29039358 // Layer 6, block 21 +.word 6760262 // Layer 6, block 22 +.word 2228887 // Layer 6, block 23 +.word 38145761 // Layer 6, block 20 +.word 1858377074 // Layer 6, block 21 +.word 432623749 // Layer 6, block 22 +.word 142637881 // Layer 6, block 23 +.word 25929180 // Layer 7, block 40 +.word 23508428 // Layer 7, block 42 +.word 22560727 // Layer 7, block 44 +.word 29457393 // Layer 7, block 46 +.word 1659340873 // Layer 7, block 40 +.word 1504424569 // Layer 7, block 42 +.word 1443776334 // Layer 7, block 44 +.word 1885129272 // Layer 7, block 46 +.word 17371159 // Layer 7, block 41 +.word 11558208 // Layer 7, block 43 +.word 15755637 // Layer 7, block 45 +.word 20740787 // Layer 7, block 47 +.word 1111669329 // Layer 7, block 41 +.word 739668858 // Layer 7, block 43 +.word 1008283812 // Layer 7, block 45 +.word 1327309063 // Layer 7, block 47 +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 13624329 // Layer 6, block 24 +.word 9838349 // Layer 6, block 25 +.word 6934560 // Layer 6, block 26 +.word 11310234 // Layer 6, block 27 +.word 871890510 // Layer 6, block 24 +.word 629606282 // Layer 6, block 25 +.word 443777969 // Layer 6, block 26 +.word 723799733 // Layer 6, block 27 +.word 3153984 // Layer 7, block 48 +.word 15599806 // Layer 7, block 50 +.word 23484790 // Layer 7, 
block 52 +.word 30174454 // Layer 7, block 54 +.word 201839571 // Layer 7, block 48 +.word 998311389 // Layer 7, block 50 +.word 1502911852 // Layer 7, block 52 +.word 1931017673 // Layer 7, block 54 +.word 13598070 // Layer 7, block 49 +.word 31454003 // Layer 7, block 51 +.word 20506260 // Layer 7, block 53 +.word 5928435 // Layer 7, block 55 +.word 870210062 // Layer 7, block 49 +.word 2012902560 // Layer 7, block 51 +.word 1312300480 // Layer 7, block 53 +.word 379390883 // Layer 7, block 55 +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 32798516 // Layer 6, block 28 +.word 9911360 // Layer 6, block 29 +.word 32443170 // Layer 6, block 30 +.word 31293482 // Layer 6, block 31 +.word 2098944825 // Layer 6, block 28 +.word 634278629 // Layer 6, block 29 +.word 2076204416 // Layer 6, block 30 +.word 2002630000 // Layer 6, block 31 +.word 26013877 // Layer 7, block 56 +.word 22928950 // Layer 7, block 58 +.word 24547058 // Layer 7, block 60 +.word 21082546 // Layer 7, block 62 +.word 1664761067 // Layer 7, block 56 +.word 1467340807 // Layer 7, block 58 +.word 1570891816 // Layer 7, block 60 +.word 1349179970 // Layer 7, block 62 +.word 21864746 // Layer 7, block 57 +.word 27678266 // Layer 7, block 59 +.word 30695887 // Layer 7, block 61 +.word 31772478 // Layer 7, block 63 +.word 1399236949 // Layer 7, block 57 +.word 1771273834 // Layer 7, block 59 +.word 1964386839 // Layer 7, block 61 +.word 2033283404 // Layer 7, block 63 +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None 
+.word 2853776 // Layer 6, block 32 +.word 31645959 // Layer 6, block 33 +.word 29723614 // Layer 6, block 34 +.word 31813171 // Layer 6, block 35 +.word 182627725 // Layer 6, block 32 +.word 2025186806 // Layer 6, block 33 +.word 1902166116 // Layer 6, block 34 +.word 2035887557 // Layer 6, block 35 +.word 30377953 // Layer 7, block 64 +.word 4924837 // Layer 7, block 66 +.word 11362575 // Layer 7, block 68 +.word 31398766 // Layer 7, block 70 +.word 1944040616 // Layer 7, block 64 +.word 315165513 // Layer 7, block 66 +.word 727149301 // Layer 7, block 68 +.word 2009367662 // Layer 7, block 70 +.word 27689101 // Layer 7, block 65 +.word 31229525 // Layer 7, block 67 +.word 6544948 // Layer 7, block 69 +.word 13728247 // Layer 7, block 71 +.word 1771967221 // Layer 7, block 65 +.word 1998537064 // Layer 7, block 67 +.word 418844704 // Layer 7, block 69 +.word 878540754 // Layer 7, block 71 +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9116920 // Layer 6, block 36 +.word 26449800 // Layer 6, block 37 +.word 27173300 // Layer 6, block 38 +.word 1574249 // Layer 6, block 39 +.word 583438350 // Layer 6, block 36 +.word 1692658010 // Layer 6, block 37 +.word 1738958476 // Layer 6, block 38 +.word 100744247 // Layer 6, block 39 +.word 6510145 // Layer 7, block 72 +.word 760999 // Layer 7, block 74 +.word 1634503 // Layer 7, block 76 +.word 29546109 // Layer 7, block 78 +.word 416617482 // Layer 7, block 72 +.word 48700219 // Layer 7, block 74 +.word 104600209 // Layer 7, block 76 +.word 1890806663 // Layer 7, block 78 +.word 2195232 // Layer 7, block 73 +.word 4465852 // Layer 7, block 75 +.word 31203102 // Layer 7, block 77 +.word 29916743 // Layer 7, block 79 +.word 140484126 // Layer 7, block 73 +.word 285792715 // 
Layer 7, block 75 +.word 1996846121 // Layer 7, block 77 +.word 1914525428 // Layer 7, block 79 +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29172999 // Layer 6, block 40 +.word 16825951 // Layer 6, block 41 +.word 11592382 // Layer 6, block 42 +.word 2671395 // Layer 6, block 43 +.word 1866929445 // Layer 6, block 40 +.word 1076778680 // Layer 6, block 41 +.word 741855827 // Layer 6, block 42 +.word 170956232 // Layer 6, block 43 +.word 14579779 // Layer 7, block 80 +.word 24263513 // Layer 7, block 82 +.word 4646776 // Layer 7, block 84 +.word 69049 // Layer 7, block 86 +.word 933034643 // Layer 7, block 80 +.word 1552746321 // Layer 7, block 82 +.word 297370968 // Layer 7, block 84 +.word 4418799 // Layer 7, block 86 +.word 33263488 // Layer 7, block 81 +.word 22493246 // Layer 7, block 83 +.word 22009979 // Layer 7, block 85 +.word 12021234 // Layer 7, block 87 +.word 2128700762 // Layer 7, block 81 +.word 1439457879 // Layer 7, block 83 +.word 1408531152 // Layer 7, block 85 +.word 769300260 // Layer 7, block 87 +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 15720958 // Layer 6, block 44 +.word 4876619 // Layer 6, block 45 +.word 9370171 // Layer 6, block 46 +.word 2197027 // Layer 6, block 47 +.word 1006064525 // Layer 6, block 44 +.word 312079797 // Layer 6, block 45 +.word 599645177 // Layer 6, block 46 +.word 140598997 // Layer 6, block 47 +.word 16117282 // Layer 7, block 88 +.word 9635661 // Layer 7, block 90 +.word 9117520 // Layer 7, block 92 +.word 
3506913 // Layer 7, block 94 +.word 1031427326 // Layer 7, block 88 +.word 616635240 // Layer 7, block 90 +.word 583476747 // Layer 7, block 92 +.word 224425303 // Layer 7, block 94 +.word 20014407 // Layer 7, block 89 +.word 25893988 // Layer 7, block 91 +.word 10257619 // Layer 7, block 93 +.word 24501669 // Layer 7, block 95 +.word 1280824291 // Layer 7, block 89 +.word 1657088757 // Layer 7, block 91 +.word 656437514 // Layer 7, block 93 +.word 1567987141 // Layer 7, block 95 +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 23467272 // Layer 6, block 48 +.word 11944835 // Layer 6, block 49 +.word 29768154 // Layer 6, block 50 +.word 3189790 // Layer 6, block 51 +.word 1501790786 // Layer 6, block 48 +.word 764411097 // Layer 6, block 49 +.word 1905016458 // Layer 6, block 50 +.word 204130980 // Layer 6, block 51 +.word 28559032 // Layer 7, block 96 +.word 20151609 // Layer 7, block 98 +.word 11645481 // Layer 7, block 100 +.word 16402437 // Layer 7, block 102 +.word 1827638556 // Layer 7, block 96 +.word 1289604549 // Layer 7, block 98 +.word 745253903 // Layer 7, block 100 +.word 1049675853 // Layer 7, block 102 +.word 1005359 // Layer 7, block 97 +.word 19130139 // Layer 7, block 99 +.word 11690281 // Layer 7, block 101 +.word 5461508 // Layer 7, block 103 +.word 64338065 // Layer 7, block 97 +.word 1224235458 // Layer 7, block 99 +.word 748120885 // Layer 7, block 101 +.word 349509836 // Layer 7, block 103 +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 4898455 
// Layer 6, block 52 +.word 22059944 // Layer 6, block 53 +.word 20315246 // Layer 6, block 54 +.word 28615767 // Layer 6, block 55 +.word 313477194 // Layer 6, block 52 +.word 1411728668 // Layer 6, block 53 +.word 1300076517 // Layer 6, block 54 +.word 1831269319 // Layer 6, block 55 +.word 6226096 // Layer 7, block 104 +.word 14029790 // Layer 7, block 106 +.word 7729000 // Layer 7, block 108 +.word 13958531 // Layer 7, block 110 +.word 398439734 // Layer 7, block 104 +.word 897838034 // Layer 7, block 106 +.word 494618249 // Layer 7, block 108 +.word 893277806 // Layer 7, block 110 +.word 31755058 // Layer 7, block 105 +.word 26102744 // Layer 7, block 107 +.word 19175904 // Layer 7, block 109 +.word 19472238 // Layer 7, block 111 +.word 2032168609 // Layer 7, block 105 +.word 1670448121 // Layer 7, block 107 +.word 1227164194 // Layer 7, block 109 +.word 1246128123 // Layer 7, block 111 +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 17302560 // Layer 6, block 56 +.word 8630188 // Layer 6, block 57 +.word 13744680 // Layer 6, block 58 +.word 31890906 // Layer 6, block 59 +.word 1107279328 // Layer 6, block 56 +.word 552289879 // Layer 6, block 57 +.word 879592386 // Layer 6, block 58 +.word 2040862218 // Layer 6, block 59 +.word 4735938 // Layer 7, block 112 +.word 26671657 // Layer 7, block 114 +.word 25810971 // Layer 7, block 116 +.word 25578690 // Layer 7, block 118 +.word 303076900 // Layer 7, block 112 +.word 1706855774 // Layer 7, block 114 +.word 1651776074 // Layer 7, block 116 +.word 1636911225 // Layer 7, block 118 +.word 6957373 // Layer 7, block 113 +.word 25381712 // Layer 7, block 115 +.word 27780827 // Layer 7, block 117 +.word 28062311 // Layer 7, block 119 +.word 445237890 // Layer 7, block 
113 +.word 1624305595 // Layer 7, block 115 +.word 1777837237 // Layer 7, block 117 +.word 1795850838 // Layer 7, block 119 +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 26150922 // Layer 6, block 60 +.word 29525906 // Layer 6, block 61 +.word 23080870 // Layer 6, block 62 +.word 1636987 // Layer 6, block 63 +.word 1673531278 // Layer 6, block 60 +.word 1889513769 // Layer 6, block 61 +.word 1477062945 // Layer 6, block 62 +.word 104759172 // Layer 6, block 63 +.word 10674616 // Layer 7, block 120 +.word 9508293 // Layer 7, block 122 +.word 4274200 // Layer 7, block 124 +.word 10066304 // Layer 7, block 126 +.word 683123285 // Layer 7, block 120 +.word 608484310 // Layer 7, block 122 +.word 273527923 // Layer 7, block 124 +.word 644194289 // Layer 7, block 126 +.word 26473446 // Layer 7, block 121 +.word 14853570 // Layer 7, block 123 +.word 32427548 // Layer 7, block 125 +.word 16598340 // Layer 7, block 127 +.word 1694171239 // Layer 7, block 121 +.word 950555930 // Layer 7, block 123 +.word 2075204685 // Layer 7, block 125 +.word 1062212688 // Layer 7, block 127 +.text +.global ntt_u32_full_neon_asm_var_4_4_9_0 +.global _ntt_u32_full_neon_asm_var_4_4_9_0 +ntt_u32_full_neon_asm_var_4_4_9_0: +_ntt_u32_full_neon_asm_var_4_4_9_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr 
q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #928] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #992] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q18, [x0, #800] +sqrdmulh v17.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +ldr q16, [x0, #864] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v22.4S, v21.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +mla v18.4S, v17.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +ldr q3, [x0, #544] +sqrdmulh v17.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q19, [x0, #608] +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q2, [x0, #672] +ldr q1, [x0, #416] +sqrdmulh v0.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v15.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #736] +ldr q14, [x0, #480] +sqrdmulh v13.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v12.4s, v14.4s, v20.4s +add v14.4s, v14.4s, v20.4s +ldr q20, [x0, #288] +mla v3.4S, v17.4S, v31.s[0] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v18.4s +mla v2.4S, v0.4S, v31.s[0] +mla v22.4S, v13.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +ldr q18, [x0, #352] +sqrdmulh v13.4S, v1.4S, v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v0.4s, v18.4s, v16.4s +sqrdmulh v17.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +add v18.4s, v18.4s, v16.4s +ldr q16, [x0, #32] +sqrdmulh v11.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v10.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #96] +sqrdmulh v9.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v8.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #160] +mla v1.4S, v13.4S, v31.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v2.4s +mla v20.4S, v11.4S, v31.s[0] +mla v18.4S, v9.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +ldr q2, [x0, #224] +sqrdmulh v9.4S, v15.4S, v29.s[2] +mul v15.4S, 
v15.4S,v30.s[2] +sub v11.4s, v2.4s, v22.4s +sqrdmulh v13.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v7.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +sub v6.4s, v2.4s, v14.4s +add v2.4s, v2.4s, v14.4s +mla v15.4S, v9.4S, v31.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v20.4s +mla v21.4S, v22.4S, v31.s[0] +mla v0.4S, v1.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +sub v1.4s, v3.4s, v18.4s +sqrdmulh v22.4S, v6.4S, v27.s[1] +mul v6.4S, v6.4S,v28.s[1] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v27.s[0] +mul v19.4S, v19.4S,v28.s[0] +sub v9.4s, v17.4s, v15.4s +add v17.4s, v17.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v14.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +mla v7.4S, v20.4S, v31.s[0] +mla v6.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v21.4s +mla v19.4S, v18.4S, v31.s[0] +mla v2.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v15.4s, v8.4s, v0.4s +sqrdmulh v18.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +add v8.4s, v8.4s, v0.4s +sqrdmulh v0.4S, v9.4S, v27.s[3] +mul v9.4S, v9.4S,v28.s[3] +sub v20.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[3] +mul v14.4S, v14.4S,v28.s[3] +sub v12.4s, v1.4s, v6.4s +add v1.4s, v1.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v16.4s, v19.4s +mla v9.4S, v0.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v25.s[2] +mul v1.4S, v1.4S,v26.s[2] +sub v7.4s, v3.4s, v2.4s +sqrdmulh v0.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v7.4S, v25.s[1] +mul v7.4S, v7.4S,v26.s[1] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh 
v17.4S, v3.4S, v25.s[0] +mul v3.4S, v3.4S,v26.s[0] +sub v6.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v1.4S, v19.4S, v31.s[0] +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v9.4s +mla v7.4S, v2.4S, v31.s[0] +mla v3.4S, v17.4S, v31.s[0] +add v22.4s, v22.4s, v9.4s +sqrdmulh v9.4S, v8.4S, v23.s[0] +mul v8.4S, v8.4S,v24.s[0] +sub v17.4s, v15.4s, v14.4s +sqrdmulh v2.4S, v6.4S, v23.s[1] +mul v6.4S, v6.4S,v24.s[1] +add v15.4s, v15.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v23.s[2] +mul v15.4S, v15.4S,v24.s[2] +sub v19.4s, v13.4s, v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +sub v11.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +mla v8.4S, v9.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v18.4s, v7.4s +str q13, [x0, #288] +mla v15.4S, v14.4S, v31.s[0] +mla v17.4S, v1.4S, v31.s[0] +add v18.4s, v18.4s, v7.4s +str q19, [x0, #352] +ldr q19, [x0, #944] +sqrdmulh v7.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +str q20, [x0, #416] +sub v20.4s, v16.4s, v3.4s +ldr q1, [x0, #1008] +sqrdmulh v14.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +str q11, [x0, #480] +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #816] +sqrdmulh v11.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +sub v13.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +ldr q8, [x0, #880] +sqrdmulh v9.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v12.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +mla v19.4S, v7.4S, v31.s[0] +mla v1.4S, v14.4S, v31.s[0] +str q18, [x0, #160] +sub v18.4s, v22.4s, v15.4s +mla v3.4S, v11.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +str q2, [x0, #224] +add v22.4s, v22.4s, v15.4s +ldr q15, [x0, #560] +sqrdmulh v2.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +str q16, [x0, #32] +sub v16.4s, v0.4s, v17.4s +ldr q9, [x0, #624] +sqrdmulh v11.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +str q20, [x0, #96] +add v0.4s, v0.4s, v17.4s +ldr q17, [x0, #688] +ldr q20, [x0, #432] +sqrdmulh v14.4S, v17.4S, v29.s[0] +mul v17.4S, 
v17.4S,v30.s[0] +sub v7.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +ldr q19, [x0, #752] +ldr q6, [x0, #496] +sqrdmulh v5.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v4.4s, v6.4s, v1.4s +add v6.4s, v6.4s, v1.4s +ldr q1, [x0, #304] +mla v15.4S, v2.4S, v31.s[0] +mla v9.4S, v11.4S, v31.s[0] +str q10, [x0, #544] +sub v10.4s, v1.4s, v3.4s +mla v17.4S, v14.4S, v31.s[0] +mla v19.4S, v5.4S, v31.s[0] +str q13, [x0, #608] +add v1.4s, v1.4s, v3.4s +ldr q3, [x0, #368] +sqrdmulh v13.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +str q21, [x0, #672] +sub v21.4s, v3.4s, v8.4s +sqrdmulh v5.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +str q12, [x0, #736] +add v3.4s, v3.4s, v8.4s +ldr q8, [x0, #48] +sqrdmulh v12.4S, v1.4S, v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v14.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +ldr q15, [x0, #112] +sqrdmulh v11.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v2.4s, v15.4s, v9.4s +add v15.4s, v15.4s, v9.4s +ldr q9, [x0, #176] +mla v20.4S, v13.4S, v31.s[0] +mla v6.4S, v5.4S, v31.s[0] +str q22, [x0, #800] +sub v22.4s, v9.4s, v17.4s +mla v1.4S, v12.4S, v31.s[0] +mla v3.4S, v11.4S, v31.s[0] +str q18, [x0, #864] +add v9.4s, v9.4s, v17.4s +ldr q17, [x0, #240] +sqrdmulh v18.4S, v7.4S, v29.s[2] +mul v7.4S, v7.4S,v30.s[2] +str q0, [x0, #928] +sub v0.4s, v17.4s, v19.4s +sqrdmulh v11.4S, v4.4S, v29.s[2] +mul v4.4S, v4.4S,v30.s[2] +str q16, [x0, #992] +add v17.4s, v17.4s, v19.4s +sqrdmulh v19.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v16.4s, v9.4s, v20.4s +add v9.4s, v9.4s, v20.4s +sqrdmulh v20.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v12.4s, v17.4s, v6.4s +add v17.4s, v17.4s, v6.4s +mla v7.4S, v18.4S, v31.s[0] +mla v4.4S, v11.4S, v31.s[0] +sub v11.4s, v8.4s, v1.4s +mla v10.4S, v19.4S, v31.s[0] +mla v21.4S, v20.4S, v31.s[0] +add v8.4s, v8.4s, v1.4s +sqrdmulh v1.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +sub v20.4s, v15.4s, v3.4s +sqrdmulh v19.4S, v12.4S, v27.s[1] +mul v12.4S, v12.4S,v28.s[1] +add 
v15.4s, v15.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v27.s[0] +mul v9.4S, v9.4S,v28.s[0] +sub v18.4s, v22.4s, v7.4s +add v22.4s, v22.4s, v7.4s +sqrdmulh v7.4S, v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +sub v6.4s, v0.4s, v4.4s +add v0.4s, v0.4s, v4.4s +mla v16.4S, v1.4S, v31.s[0] +mla v12.4S, v19.4S, v31.s[0] +sub v19.4s, v14.4s, v10.4s +mla v9.4S, v3.4S, v31.s[0] +mla v17.4S, v7.4S, v31.s[0] +add v14.4s, v14.4s, v10.4s +sqrdmulh v10.4S, v22.4S, v27.s[2] +mul v22.4S, v22.4S,v28.s[2] +sub v7.4s, v2.4s, v21.4s +sqrdmulh v3.4S, v0.4S, v27.s[2] +mul v0.4S, v0.4S,v28.s[2] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v1.4s, v11.4s, v16.4s +add v11.4s, v11.4s, v16.4s +sqrdmulh v16.4S, v6.4S, v27.s[3] +mul v6.4S, v6.4S,v28.s[3] +sub v4.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +mla v22.4S, v10.4S, v31.s[0] +mla v0.4S, v3.4S, v31.s[0] +sub v3.4s, v8.4s, v9.4s +mla v18.4S, v21.4S, v31.s[0] +mla v6.4S, v16.4S, v31.s[0] +add v8.4s, v8.4s, v9.4s +sqrdmulh v9.4S, v20.4S, v25.s[2] +mul v20.4S, v20.4S,v26.s[2] +sub v16.4s, v15.4s, v17.4s +sqrdmulh v21.4S, v4.4S, v25.s[3] +mul v4.4S, v4.4S,v26.s[3] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v16.4S, v25.s[1] +mul v16.4S, v16.4S,v26.s[1] +sub v10.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +sqrdmulh v22.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +sub v12.4s, v2.4s, v0.4s +add v2.4s, v2.4s, v0.4s +mla v20.4S, v9.4S, v31.s[0] +mla v4.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v18.4s +mla v16.4S, v17.4S, v31.s[0] +mla v15.4S, v22.4S, v31.s[0] +add v19.4s, v19.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v23.s[0] +mul v2.4S, v2.4S,v24.s[0] +sub v22.4s, v7.4s, v6.4s +sqrdmulh v17.4S, v12.4S, v23.s[1] +mul v12.4S, v12.4S,v24.s[1] +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v7.4S, v23.s[2] +mul v7.4S, v7.4S,v24.s[2] +sub v9.4s, v11.4s, v20.4s +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v22.4S, v23.s[3] +mul v22.4S, v22.4S,v24.s[3] +sub v0.4s, v1.4s, v4.4s +add v1.4s, v1.4s, v4.4s +mla 
v2.4S, v18.4S, v31.s[0] +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v3.4s, v16.4s +str q11, [x0, #304] +mla v7.4S, v6.4S, v31.s[0] +mla v22.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v16.4s +str q9, [x0, #368] +ldr q9, [x0, #896] +sqrdmulh v16.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +str q1, [x0, #432] +sub v1.4s, v8.4s, v15.4s +ldr q20, [x0, #960] +sqrdmulh v6.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +str q0, [x0, #496] +add v8.4s, v8.4s, v15.4s +ldr q15, [x0, #768] +sqrdmulh v0.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +sub v11.4s, v14.4s, v2.4s +add v14.4s, v14.4s, v2.4s +ldr q2, [x0, #832] +sqrdmulh v18.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v4.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +mla v9.4S, v16.4S, v31.s[0] +mla v20.4S, v6.4S, v31.s[0] +str q3, [x0, #176] +sub v3.4s, v19.4s, v7.4s +mla v15.4S, v0.4S, v31.s[0] +mla v2.4S, v18.4S, v31.s[0] +str q17, [x0, #240] +add v19.4s, v19.4s, v7.4s +ldr q7, [x0, #512] +sqrdmulh v17.4S, v7.4S, v29.s[0] +mul v7.4S, v7.4S,v30.s[0] +str q8, [x0, #48] +sub v8.4s, v21.4s, v22.4s +ldr q18, [x0, #576] +sqrdmulh v0.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +str q1, [x0, #112] +add v21.4s, v21.4s, v22.4s +ldr q22, [x0, #640] +ldr q1, [x0, #384] +sqrdmulh v6.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v16.4s, v1.4s, v9.4s +add v1.4s, v1.4s, v9.4s +ldr q9, [x0, #704] +ldr q12, [x0, #448] +sqrdmulh v5.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +sub v13.4s, v12.4s, v20.4s +add v12.4s, v12.4s, v20.4s +ldr q20, [x0, #256] +mla v7.4S, v17.4S, v31.s[0] +mla v18.4S, v0.4S, v31.s[0] +str q14, [x0, #560] +sub v14.4s, v20.4s, v15.4s +mla v22.4S, v6.4S, v31.s[0] +mla v9.4S, v5.4S, v31.s[0] +str q11, [x0, #624] +add v20.4s, v20.4s, v15.4s +ldr q15, [x0, #320] +sqrdmulh v11.4S, v1.4S, v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +str q10, [x0, #688] +sub v10.4s, v15.4s, v2.4s +sqrdmulh v5.4S, v12.4S, v29.s[1] +mul v12.4S, v12.4S,v30.s[1] +str q4, [x0, #752] +add v15.4s, v15.4s, v2.4s +ldr 
q2, [x0, #0] +sqrdmulh v4.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v6.4s, v2.4s, v7.4s +add v2.4s, v2.4s, v7.4s +ldr q7, [x0, #64] +sqrdmulh v0.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +sub v17.4s, v7.4s, v18.4s +add v7.4s, v7.4s, v18.4s +ldr q18, [x0, #128] +mla v1.4S, v11.4S, v31.s[0] +mla v12.4S, v5.4S, v31.s[0] +str q19, [x0, #816] +sub v19.4s, v18.4s, v22.4s +mla v20.4S, v4.4S, v31.s[0] +mla v15.4S, v0.4S, v31.s[0] +str q3, [x0, #880] +add v18.4s, v18.4s, v22.4s +ldr q22, [x0, #192] +sqrdmulh v3.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +str q21, [x0, #944] +sub v21.4s, v22.4s, v9.4s +sqrdmulh v0.4S, v13.4S, v29.s[2] +mul v13.4S, v13.4S,v30.s[2] +str q8, [x0, #1008] +add v22.4s, v22.4s, v9.4s +sqrdmulh v9.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v8.4s, v18.4s, v1.4s +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v4.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +mla v16.4S, v3.4S, v31.s[0] +mla v13.4S, v0.4S, v31.s[0] +sub v0.4s, v2.4s, v20.4s +mla v14.4S, v9.4S, v31.s[0] +mla v10.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v1.4s, v7.4s, v15.4s +sqrdmulh v9.4S, v4.4S, v27.s[1] +mul v4.4S, v4.4S,v28.s[1] +add v7.4s, v7.4s, v15.4s +sqrdmulh v15.4S, v18.4S, v27.s[0] +mul v18.4S, v18.4S,v28.s[0] +sub v3.4s, v19.4s, v16.4s +add v19.4s, v19.4s, v16.4s +sqrdmulh v16.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +sub v12.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +mla v8.4S, v20.4S, v31.s[0] +mla v4.4S, v9.4S, v31.s[0] +sub v9.4s, v6.4s, v14.4s +mla v18.4S, v15.4S, v31.s[0] +mla v22.4S, v16.4S, v31.s[0] +add v6.4s, v6.4s, v14.4s +sqrdmulh v14.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v28.s[2] +sub v16.4s, v17.4s, v10.4s +sqrdmulh v15.4S, v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +add v17.4s, v17.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +sub v20.4s, v0.4s, v8.4s +add 
v0.4s, v0.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v27.s[3] +mul v12.4S, v12.4S,v28.s[3] +sub v13.4s, v1.4s, v4.4s +add v1.4s, v1.4s, v4.4s +mla v19.4S, v14.4S, v31.s[0] +mla v21.4S, v15.4S, v31.s[0] +sub v15.4s, v2.4s, v18.4s +mla v3.4S, v10.4S, v31.s[0] +mla v12.4S, v8.4S, v31.s[0] +add v2.4s, v2.4s, v18.4s +sqrdmulh v18.4S, v1.4S, v25.s[2] +mul v1.4S, v1.4S,v26.s[2] +sub v8.4s, v7.4s, v22.4s +sqrdmulh v10.4S, v13.4S, v25.s[3] +mul v13.4S, v13.4S,v26.s[3] +add v7.4s, v7.4s, v22.4s +sqrdmulh v22.4S, v8.4S, v25.s[1] +mul v8.4S, v8.4S,v26.s[1] +sub v14.4s, v6.4s, v19.4s +add v6.4s, v6.4s, v19.4s +sqrdmulh v19.4S, v7.4S, v25.s[0] +mul v7.4S, v7.4S,v26.s[0] +sub v4.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +mla v1.4S, v18.4S, v31.s[0] +mla v13.4S, v10.4S, v31.s[0] +sub v10.4s, v9.4s, v3.4s +mla v8.4S, v22.4S, v31.s[0] +mla v7.4S, v19.4S, v31.s[0] +add v9.4s, v9.4s, v3.4s +sqrdmulh v3.4S, v17.4S, v23.s[0] +mul v17.4S, v17.4S,v24.s[0] +sub v19.4s, v16.4s, v12.4s +sqrdmulh v22.4S, v4.4S, v23.s[1] +mul v4.4S, v4.4S,v24.s[1] +add v16.4s, v16.4s, v12.4s +sqrdmulh v12.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +sub v18.4s, v0.4s, v1.4s +add v0.4s, v0.4s, v1.4s +sqrdmulh v1.4S, v19.4S, v23.s[3] +mul v19.4S, v19.4S,v24.s[3] +sub v21.4s, v20.4s, v13.4s +add v20.4s, v20.4s, v13.4s +mla v17.4S, v3.4S, v31.s[0] +mla v4.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v8.4s +str q0, [x0, #256] +mla v16.4S, v12.4S, v31.s[0] +mla v19.4S, v1.4S, v31.s[0] +add v15.4s, v15.4s, v8.4s +str q18, [x0, #320] +ldr q18, [x0, #912] +sqrdmulh v8.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +str q20, [x0, #384] +sub v20.4s, v2.4s, v7.4s +ldr q1, [x0, #976] +sqrdmulh v12.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +str q21, [x0, #448] +add v2.4s, v2.4s, v7.4s +ldr q7, [x0, #784] +sqrdmulh v21.4S, v7.4S, v29.s[0] +mul v7.4S, v7.4S,v30.s[0] +sub v0.4s, v6.4s, v17.4s +add v6.4s, v6.4s, v17.4s +ldr q17, [x0, #848] +sqrdmulh v3.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] +sub v13.4s, 
v14.4s, v4.4s +add v14.4s, v14.4s, v4.4s +mla v18.4S, v8.4S, v31.s[0] +mla v1.4S, v12.4S, v31.s[0] +str q15, [x0, #128] +sub v15.4s, v9.4s, v16.4s +mla v7.4S, v21.4S, v31.s[0] +mla v17.4S, v3.4S, v31.s[0] +str q22, [x0, #192] +add v9.4s, v9.4s, v16.4s +ldr q16, [x0, #528] +sqrdmulh v22.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +str q2, [x0, #0] +sub v2.4s, v10.4s, v19.4s +ldr q3, [x0, #592] +sqrdmulh v21.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +str q20, [x0, #64] +add v10.4s, v10.4s, v19.4s +ldr q19, [x0, #656] +ldr q20, [x0, #400] +sqrdmulh v12.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v8.4s, v20.4s, v18.4s +add v20.4s, v20.4s, v18.4s +ldr q18, [x0, #720] +ldr q4, [x0, #464] +sqrdmulh v5.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +sub v11.4s, v4.4s, v1.4s +add v4.4s, v4.4s, v1.4s +ldr q1, [x0, #272] +mla v16.4S, v22.4S, v31.s[0] +mla v3.4S, v21.4S, v31.s[0] +str q6, [x0, #512] +sub v6.4s, v1.4s, v7.4s +mla v19.4S, v12.4S, v31.s[0] +mla v18.4S, v5.4S, v31.s[0] +str q0, [x0, #576] +add v1.4s, v1.4s, v7.4s +ldr q7, [x0, #336] +sqrdmulh v0.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +str q14, [x0, #640] +sub v14.4s, v7.4s, v17.4s +sqrdmulh v5.4S, v4.4S, v29.s[1] +mul v4.4S, v4.4S,v30.s[1] +str q13, [x0, #704] +add v7.4s, v7.4s, v17.4s +ldr q17, [x0, #16] +sqrdmulh v13.4S, v1.4S, v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v12.4s, v17.4s, v16.4s +add v17.4s, v17.4s, v16.4s +ldr q16, [x0, #80] +sqrdmulh v21.4S, v7.4S, v29.s[1] +mul v7.4S, v7.4S,v30.s[1] +sub v22.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #144] +mla v20.4S, v0.4S, v31.s[0] +mla v4.4S, v5.4S, v31.s[0] +str q9, [x0, #768] +sub v9.4s, v3.4s, v19.4s +mla v1.4S, v13.4S, v31.s[0] +mla v7.4S, v21.4S, v31.s[0] +str q15, [x0, #832] +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #208] +sqrdmulh v15.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +str q10, [x0, #896] +sub v10.4s, v19.4s, v18.4s +sqrdmulh v21.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +str q2, 
[x0, #960] +add v19.4s, v19.4s, v18.4s +sqrdmulh v18.4S, v6.4S, v29.s[2] +mul v6.4S, v6.4S,v30.s[2] +sub v2.4s, v3.4s, v20.4s +add v3.4s, v3.4s, v20.4s +sqrdmulh v20.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v13.4s, v19.4s, v4.4s +add v19.4s, v19.4s, v4.4s +mla v8.4S, v15.4S, v31.s[0] +mla v11.4S, v21.4S, v31.s[0] +sub v21.4s, v17.4s, v1.4s +mla v6.4S, v18.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +add v17.4s, v17.4s, v1.4s +sqrdmulh v1.4S, v2.4S, v27.s[1] +mul v2.4S, v2.4S,v28.s[1] +sub v20.4s, v16.4s, v7.4s +sqrdmulh v18.4S, v13.4S, v27.s[1] +mul v13.4S, v13.4S,v28.s[1] +add v16.4s, v16.4s, v7.4s +sqrdmulh v7.4S, v3.4S, v27.s[0] +mul v3.4S, v3.4S,v28.s[0] +sub v15.4s, v9.4s, v8.4s +add v9.4s, v9.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v27.s[0] +mul v19.4S, v19.4S,v28.s[0] +sub v4.4s, v10.4s, v11.4s +add v10.4s, v10.4s, v11.4s +mla v2.4S, v1.4S, v31.s[0] +mla v13.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v6.4s +mla v3.4S, v7.4S, v31.s[0] +mla v19.4S, v8.4S, v31.s[0] +add v12.4s, v12.4s, v6.4s +sqrdmulh v6.4S, v9.4S, v27.s[2] +mul v9.4S, v9.4S,v28.s[2] +sub v8.4s, v22.4s, v14.4s +sqrdmulh v7.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +add v22.4s, v22.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v1.4s, v21.4s, v2.4s +add v21.4s, v21.4s, v2.4s +sqrdmulh v2.4S, v4.4S, v27.s[3] +mul v4.4S, v4.4S,v28.s[3] +sub v11.4s, v20.4s, v13.4s +add v20.4s, v20.4s, v13.4s +mla v9.4S, v6.4S, v31.s[0] +mla v10.4S, v7.4S, v31.s[0] +sub v7.4s, v17.4s, v3.4s +mla v15.4S, v14.4S, v31.s[0] +mla v4.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v20.4S, v25.s[2] +mul v20.4S, v20.4S,v26.s[2] +sub v2.4s, v16.4s, v19.4s +sqrdmulh v14.4S, v11.4S, v25.s[3] +mul v11.4S, v11.4S,v26.s[3] +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v2.4S, v25.s[1] +mul v2.4S, v2.4S,v26.s[1] +sub v6.4s, v12.4s, v9.4s +add v12.4s, v12.4s, v9.4s +sqrdmulh v9.4S, v16.4S, v25.s[0] +mul v16.4S, v16.4S,v26.s[0] +sub v13.4s, v22.4s, v10.4s +add v22.4s, 
v22.4s, v10.4s +mla v20.4S, v3.4S, v31.s[0] +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v18.4s, v15.4s +mla v2.4S, v19.4S, v31.s[0] +mla v16.4S, v9.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v22.4S, v23.s[0] +mul v22.4S, v22.4S,v24.s[0] +sub v9.4s, v8.4s, v4.4s +sqrdmulh v19.4S, v13.4S, v23.s[1] +mul v13.4S, v13.4S,v24.s[1] +add v8.4s, v8.4s, v4.4s +sqrdmulh v4.4S, v8.4S, v23.s[2] +mul v8.4S, v8.4S,v24.s[2] +sub v3.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v9.4S, v23.s[3] +mul v9.4S, v9.4S,v24.s[3] +sub v10.4s, v1.4s, v11.4s +add v1.4s, v1.4s, v11.4s +mla v22.4S, v15.4S, v31.s[0] +mla v13.4S, v19.4S, v31.s[0] +sub v19.4s, v7.4s, v2.4s +str q21, [x0, #272] +mla v8.4S, v4.4S, v31.s[0] +mla v9.4S, v20.4S, v31.s[0] +add v7.4s, v7.4s, v2.4s +str q3, [x0, #336] +str q1, [x0, #400] +sub v1.4s, v17.4s, v16.4s +str q10, [x0, #464] +add v17.4s, v17.4s, v16.4s +sub v16.4s, v12.4s, v22.4s +add v12.4s, v12.4s, v22.4s +sub v22.4s, v6.4s, v13.4s +add v6.4s, v6.4s, v13.4s +str q7, [x0, #144] +sub v7.4s, v18.4s, v8.4s +str q19, [x0, #208] +add v18.4s, v18.4s, v8.4s +str q17, [x0, #16] +sub v17.4s, v14.4s, v9.4s +str q1, [x0, #80] +add v14.4s, v14.4s, v9.4s +str q12, [x0, #528] +str q16, [x0, #592] +str q6, [x0, #656] +str q22, [x0, #720] +str q18, [x0, #784] +str q7, [x0, #848] +str q14, [x0, #912] +str q17, [x0, #976] +ldr q0, [x17, #+128] +ldr q5, [x17, #+144] +ldr q11, [x17, #+160] +ldr q15, [x17, #+176] +ldr q21, [x17, #+192] +ldr q4, [x17, #+208] +ldr q20, [x17, #+224] +ldr q2, [x17, #+240] +ldr q3, [x0, #32] +ldr q30, [x0, #48] +ldr q29, [x0, #0] +ldr q28, [x0, #16] +sqrdmulh v27.4S, v3.4S, v5.s[0] +mul v3.4S, v3.4S,v0.s[0] +mla v3.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v3.4s +add v29.4s, v29.4s, v3.4s +sqrdmulh v3.4S, v30.4S, v5.s[0] +mul v30.4S, v30.4S,v0.s[0] +mla v30.4S, v3.4S, v31.s[0] +sub v3.4s, v28.4s, v30.4s +add v28.4s, v28.4s, v30.4s +sqrdmulh v30.4S, v28.4S, v5.s[1] +mul v28.4S, v28.4S,v0.s[1] +mla v28.4S, v30.4S, v31.s[0] 
+sub v30.4s, v29.4s, v28.4s +add v29.4s, v29.4s, v28.4s +sqrdmulh v28.4S, v3.4S, v5.s[2] +mul v3.4S, v3.4S,v0.s[2] +mla v3.4S, v28.4S, v31.s[0] +sub v28.4s, v27.4s, v3.4s +add v27.4s, v27.4s, v3.4s +trn1 v3.4S, v29.4S, v30.4S +trn2 v26.4S, v29.4S, v30.4S +trn1 v25.4S, v27.4S, v28.4S +trn2 v24.4S, v27.4S, v28.4S +trn2 v27.2D, v3.2D, v25.2D +trn2 v28.2D, v26.2D, v24.2D +trn1 v29.2D, v3.2D, v25.2D +trn1 v30.2D, v26.2D, v24.2D +sqrdmulh v24.4S, v27.4S, v15.4S +mul v27.4S, v27.4S,v11.4S +mla v27.4S, v24.4S, v31.s[0] +sub v24.4s, v29.4s, v27.4s +add v29.4s, v29.4s, v27.4s +sqrdmulh v27.4S, v28.4S, v15.4S +mul v28.4S, v28.4S,v11.4S +mla v28.4S, v27.4S, v31.s[0] +sub v27.4s, v30.4s, v28.4s +add v30.4s, v30.4s, v28.4s +sqrdmulh v28.4S, v30.4S, v4.4S +mul v30.4S, v30.4S,v21.4S +mla v30.4S, v28.4S, v31.s[0] +sub v28.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +sqrdmulh v30.4S, v27.4S, v2.4S +mul v27.4S, v27.4S,v20.4S +mla v27.4S, v30.4S, v31.s[0] +sub v30.4s, v24.4s, v27.4s +add v24.4s, v24.4s, v27.4s +str q29, [x0, #0] +str q28, [x0, #16] +str q24, [x0, #32] +str q30, [x0, #48] +ldr q30, [x17, #+256] +ldr q24, [x17, #+272] +ldr q28, [x17, #+288] +ldr q29, [x17, #+304] +ldr q27, [x17, #+320] +ldr q26, [x17, #+336] +ldr q25, [x17, #+352] +ldr q3, [x17, #+368] +ldr q2, [x0, #96] +ldr q20, [x0, #112] +ldr q4, [x0, #64] +ldr q21, [x0, #80] +sqrdmulh v15.4S, v2.4S, v24.s[0] +mul v2.4S, v2.4S,v30.s[0] +mla v2.4S, v15.4S, v31.s[0] +sub v15.4s, v4.4s, v2.4s +add v4.4s, v4.4s, v2.4s +sqrdmulh v2.4S, v20.4S, v24.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v20.4S, v2.4S, v31.s[0] +sub v2.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v21.4S, v24.s[1] +mul v21.4S, v21.4S,v30.s[1] +mla v21.4S, v20.4S, v31.s[0] +sub v20.4s, v4.4s, v21.4s +add v4.4s, v4.4s, v21.4s +sqrdmulh v21.4S, v2.4S, v24.s[2] +mul v2.4S, v2.4S,v30.s[2] +mla v2.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v2.4s +add v15.4s, v15.4s, v2.4s +trn1 v2.4S, v4.4S, v20.4S +trn2 v11.4S, v4.4S, v20.4S +trn1 v5.4S, 
v15.4S, v21.4S +trn2 v0.4S, v15.4S, v21.4S +trn2 v15.2D, v2.2D, v5.2D +trn2 v21.2D, v11.2D, v0.2D +trn1 v4.2D, v2.2D, v5.2D +trn1 v20.2D, v11.2D, v0.2D +sqrdmulh v0.4S, v15.4S, v29.4S +mul v15.4S, v15.4S,v28.4S +mla v15.4S, v0.4S, v31.s[0] +sub v0.4s, v4.4s, v15.4s +add v4.4s, v4.4s, v15.4s +sqrdmulh v15.4S, v21.4S, v29.4S +mul v21.4S, v21.4S,v28.4S +mla v21.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v20.4S, v26.4S +mul v20.4S, v20.4S,v27.4S +mla v20.4S, v21.4S, v31.s[0] +sub v21.4s, v4.4s, v20.4s +add v4.4s, v4.4s, v20.4s +sqrdmulh v20.4S, v15.4S, v3.4S +mul v15.4S, v15.4S,v25.4S +mla v15.4S, v20.4S, v31.s[0] +sub v20.4s, v0.4s, v15.4s +add v0.4s, v0.4s, v15.4s +str q4, [x0, #64] +str q21, [x0, #80] +str q0, [x0, #96] +str q20, [x0, #112] +ldr q20, [x17, #+384] +ldr q0, [x17, #+400] +ldr q21, [x17, #+416] +ldr q4, [x17, #+432] +ldr q15, [x17, #+448] +ldr q11, [x17, #+464] +ldr q5, [x17, #+480] +ldr q2, [x17, #+496] +ldr q3, [x0, #160] +ldr q25, [x0, #176] +ldr q26, [x0, #128] +ldr q27, [x0, #144] +sqrdmulh v29.4S, v3.4S, v0.s[0] +mul v3.4S, v3.4S,v20.s[0] +mla v3.4S, v29.4S, v31.s[0] +sub v29.4s, v26.4s, v3.4s +add v26.4s, v26.4s, v3.4s +sqrdmulh v3.4S, v25.4S, v0.s[0] +mul v25.4S, v25.4S,v20.s[0] +mla v25.4S, v3.4S, v31.s[0] +sub v3.4s, v27.4s, v25.4s +add v27.4s, v27.4s, v25.4s +sqrdmulh v25.4S, v27.4S, v0.s[1] +mul v27.4S, v27.4S,v20.s[1] +mla v27.4S, v25.4S, v31.s[0] +sub v25.4s, v26.4s, v27.4s +add v26.4s, v26.4s, v27.4s +sqrdmulh v27.4S, v3.4S, v0.s[2] +mul v3.4S, v3.4S,v20.s[2] +mla v3.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v3.4s +add v29.4s, v29.4s, v3.4s +trn1 v3.4S, v26.4S, v25.4S +trn2 v28.4S, v26.4S, v25.4S +trn1 v24.4S, v29.4S, v27.4S +trn2 v30.4S, v29.4S, v27.4S +trn2 v29.2D, v3.2D, v24.2D +trn2 v27.2D, v28.2D, v30.2D +trn1 v26.2D, v3.2D, v24.2D +trn1 v25.2D, v28.2D, v30.2D +sqrdmulh v30.4S, v29.4S, v4.4S +mul v29.4S, v29.4S,v21.4S +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v26.4s, v29.4s +add 
v26.4s, v26.4s, v29.4s +sqrdmulh v29.4S, v27.4S, v4.4S +mul v27.4S, v27.4S,v21.4S +mla v27.4S, v29.4S, v31.s[0] +sub v29.4s, v25.4s, v27.4s +add v25.4s, v25.4s, v27.4s +sqrdmulh v27.4S, v25.4S, v11.4S +mul v25.4S, v25.4S,v15.4S +mla v25.4S, v27.4S, v31.s[0] +sub v27.4s, v26.4s, v25.4s +add v26.4s, v26.4s, v25.4s +sqrdmulh v25.4S, v29.4S, v2.4S +mul v29.4S, v29.4S,v5.4S +mla v29.4S, v25.4S, v31.s[0] +sub v25.4s, v30.4s, v29.4s +add v30.4s, v30.4s, v29.4s +str q26, [x0, #128] +str q27, [x0, #144] +str q30, [x0, #160] +str q25, [x0, #176] +ldr q25, [x17, #+512] +ldr q30, [x17, #+528] +ldr q27, [x17, #+544] +ldr q26, [x17, #+560] +ldr q29, [x17, #+576] +ldr q28, [x17, #+592] +ldr q24, [x17, #+608] +ldr q3, [x17, #+624] +ldr q2, [x0, #224] +ldr q5, [x0, #240] +ldr q11, [x0, #192] +ldr q15, [x0, #208] +sqrdmulh v4.4S, v2.4S, v30.s[0] +mul v2.4S, v2.4S,v25.s[0] +mla v2.4S, v4.4S, v31.s[0] +sub v4.4s, v11.4s, v2.4s +add v11.4s, v11.4s, v2.4s +sqrdmulh v2.4S, v5.4S, v30.s[0] +mul v5.4S, v5.4S,v25.s[0] +mla v5.4S, v2.4S, v31.s[0] +sub v2.4s, v15.4s, v5.4s +add v15.4s, v15.4s, v5.4s +sqrdmulh v5.4S, v15.4S, v30.s[1] +mul v15.4S, v15.4S,v25.s[1] +mla v15.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v30.s[2] +mul v2.4S, v2.4S,v25.s[2] +mla v2.4S, v15.4S, v31.s[0] +sub v15.4s, v4.4s, v2.4s +add v4.4s, v4.4s, v2.4s +trn1 v2.4S, v11.4S, v5.4S +trn2 v21.4S, v11.4S, v5.4S +trn1 v0.4S, v4.4S, v15.4S +trn2 v20.4S, v4.4S, v15.4S +trn2 v4.2D, v2.2D, v0.2D +trn2 v15.2D, v21.2D, v20.2D +trn1 v11.2D, v2.2D, v0.2D +trn1 v5.2D, v21.2D, v20.2D +sqrdmulh v20.4S, v4.4S, v26.4S +mul v4.4S, v4.4S,v27.4S +mla v4.4S, v20.4S, v31.s[0] +sub v20.4s, v11.4s, v4.4s +add v11.4s, v11.4s, v4.4s +sqrdmulh v4.4S, v15.4S, v26.4S +mul v15.4S, v15.4S,v27.4S +mla v15.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v15.4s +add v5.4s, v5.4s, v15.4s +sqrdmulh v15.4S, v5.4S, v28.4S +mul v5.4S, v5.4S,v29.4S +mla v5.4S, v15.4S, v31.s[0] +sub v15.4s, v11.4s, v5.4s +add 
v11.4s, v11.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v3.4S +mul v4.4S, v4.4S,v24.4S +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v20.4s, v4.4s +add v20.4s, v20.4s, v4.4s +str q11, [x0, #192] +str q15, [x0, #208] +str q20, [x0, #224] +str q5, [x0, #240] +ldr q5, [x17, #+640] +ldr q20, [x17, #+656] +ldr q15, [x17, #+672] +ldr q11, [x17, #+688] +ldr q4, [x17, #+704] +ldr q21, [x17, #+720] +ldr q0, [x17, #+736] +ldr q2, [x17, #+752] +ldr q3, [x0, #288] +ldr q24, [x0, #304] +ldr q28, [x0, #256] +ldr q29, [x0, #272] +sqrdmulh v26.4S, v3.4S, v20.s[0] +mul v3.4S, v3.4S,v5.s[0] +mla v3.4S, v26.4S, v31.s[0] +sub v26.4s, v28.4s, v3.4s +add v28.4s, v28.4s, v3.4s +sqrdmulh v3.4S, v24.4S, v20.s[0] +mul v24.4S, v24.4S,v5.s[0] +mla v24.4S, v3.4S, v31.s[0] +sub v3.4s, v29.4s, v24.4s +add v29.4s, v29.4s, v24.4s +sqrdmulh v24.4S, v29.4S, v20.s[1] +mul v29.4S, v29.4S,v5.s[1] +mla v29.4S, v24.4S, v31.s[0] +sub v24.4s, v28.4s, v29.4s +add v28.4s, v28.4s, v29.4s +sqrdmulh v29.4S, v3.4S, v20.s[2] +mul v3.4S, v3.4S,v5.s[2] +mla v3.4S, v29.4S, v31.s[0] +sub v29.4s, v26.4s, v3.4s +add v26.4s, v26.4s, v3.4s +trn1 v3.4S, v28.4S, v24.4S +trn2 v27.4S, v28.4S, v24.4S +trn1 v30.4S, v26.4S, v29.4S +trn2 v25.4S, v26.4S, v29.4S +trn2 v26.2D, v3.2D, v30.2D +trn2 v29.2D, v27.2D, v25.2D +trn1 v28.2D, v3.2D, v30.2D +trn1 v24.2D, v27.2D, v25.2D +sqrdmulh v25.4S, v26.4S, v11.4S +mul v26.4S, v26.4S,v15.4S +mla v26.4S, v25.4S, v31.s[0] +sub v25.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sqrdmulh v26.4S, v29.4S, v11.4S +mul v29.4S, v29.4S,v15.4S +mla v29.4S, v26.4S, v31.s[0] +sub v26.4s, v24.4s, v29.4s +add v24.4s, v24.4s, v29.4s +sqrdmulh v29.4S, v24.4S, v21.4S +mul v24.4S, v24.4S,v4.4S +mla v24.4S, v29.4S, v31.s[0] +sub v29.4s, v28.4s, v24.4s +add v28.4s, v28.4s, v24.4s +sqrdmulh v24.4S, v26.4S, v2.4S +mul v26.4S, v26.4S,v0.4S +mla v26.4S, v24.4S, v31.s[0] +sub v24.4s, v25.4s, v26.4s +add v25.4s, v25.4s, v26.4s +str q28, [x0, #256] +str q29, [x0, #272] +str q25, [x0, #288] +str q24, [x0, #304] +ldr q24, [x17, 
#+768] +ldr q25, [x17, #+784] +ldr q29, [x17, #+800] +ldr q28, [x17, #+816] +ldr q26, [x17, #+832] +ldr q27, [x17, #+848] +ldr q30, [x17, #+864] +ldr q3, [x17, #+880] +ldr q2, [x0, #352] +ldr q0, [x0, #368] +ldr q21, [x0, #320] +ldr q4, [x0, #336] +sqrdmulh v11.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v24.s[0] +mla v2.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v2.4s +add v21.4s, v21.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v25.s[0] +mul v0.4S, v0.4S,v24.s[0] +mla v0.4S, v2.4S, v31.s[0] +sub v2.4s, v4.4s, v0.4s +add v4.4s, v4.4s, v0.4s +sqrdmulh v0.4S, v4.4S, v25.s[1] +mul v4.4S, v4.4S,v24.s[1] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v21.4s, v4.4s +add v21.4s, v21.4s, v4.4s +sqrdmulh v4.4S, v2.4S, v25.s[2] +mul v2.4S, v2.4S,v24.s[2] +mla v2.4S, v4.4S, v31.s[0] +sub v4.4s, v11.4s, v2.4s +add v11.4s, v11.4s, v2.4s +trn1 v2.4S, v21.4S, v0.4S +trn2 v15.4S, v21.4S, v0.4S +trn1 v20.4S, v11.4S, v4.4S +trn2 v5.4S, v11.4S, v4.4S +trn2 v11.2D, v2.2D, v20.2D +trn2 v4.2D, v15.2D, v5.2D +trn1 v21.2D, v2.2D, v20.2D +trn1 v0.2D, v15.2D, v5.2D +sqrdmulh v5.4S, v11.4S, v28.4S +mul v11.4S, v11.4S,v29.4S +mla v11.4S, v5.4S, v31.s[0] +sub v5.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v4.4S, v28.4S +mul v4.4S, v4.4S,v29.4S +mla v4.4S, v11.4S, v31.s[0] +sub v11.4s, v0.4s, v4.4s +add v0.4s, v0.4s, v4.4s +sqrdmulh v4.4S, v0.4S, v27.4S +mul v0.4S, v0.4S,v26.4S +mla v0.4S, v4.4S, v31.s[0] +sub v4.4s, v21.4s, v0.4s +add v21.4s, v21.4s, v0.4s +sqrdmulh v0.4S, v11.4S, v3.4S +mul v11.4S, v11.4S,v30.4S +mla v11.4S, v0.4S, v31.s[0] +sub v0.4s, v5.4s, v11.4s +add v5.4s, v5.4s, v11.4s +str q21, [x0, #320] +str q4, [x0, #336] +str q5, [x0, #352] +str q0, [x0, #368] +ldr q0, [x17, #+896] +ldr q5, [x17, #+912] +ldr q4, [x17, #+928] +ldr q21, [x17, #+944] +ldr q11, [x17, #+960] +ldr q15, [x17, #+976] +ldr q20, [x17, #+992] +ldr q2, [x17, #+1008] +ldr q3, [x0, #416] +ldr q30, [x0, #432] +ldr q27, [x0, #384] +ldr q26, [x0, #400] +sqrdmulh v28.4S, v3.4S, v5.s[0] +mul v3.4S, v3.4S,v0.s[0] +mla 
v3.4S, v28.4S, v31.s[0] +sub v28.4s, v27.4s, v3.4s +add v27.4s, v27.4s, v3.4s +sqrdmulh v3.4S, v30.4S, v5.s[0] +mul v30.4S, v30.4S,v0.s[0] +mla v30.4S, v3.4S, v31.s[0] +sub v3.4s, v26.4s, v30.4s +add v26.4s, v26.4s, v30.4s +sqrdmulh v30.4S, v26.4S, v5.s[1] +mul v26.4S, v26.4S,v0.s[1] +mla v26.4S, v30.4S, v31.s[0] +sub v30.4s, v27.4s, v26.4s +add v27.4s, v27.4s, v26.4s +sqrdmulh v26.4S, v3.4S, v5.s[2] +mul v3.4S, v3.4S,v0.s[2] +mla v3.4S, v26.4S, v31.s[0] +sub v26.4s, v28.4s, v3.4s +add v28.4s, v28.4s, v3.4s +trn1 v3.4S, v27.4S, v30.4S +trn2 v29.4S, v27.4S, v30.4S +trn1 v25.4S, v28.4S, v26.4S +trn2 v24.4S, v28.4S, v26.4S +trn2 v28.2D, v3.2D, v25.2D +trn2 v26.2D, v29.2D, v24.2D +trn1 v27.2D, v3.2D, v25.2D +trn1 v30.2D, v29.2D, v24.2D +sqrdmulh v24.4S, v28.4S, v21.4S +mul v28.4S, v28.4S,v4.4S +mla v28.4S, v24.4S, v31.s[0] +sub v24.4s, v27.4s, v28.4s +add v27.4s, v27.4s, v28.4s +sqrdmulh v28.4S, v26.4S, v21.4S +mul v26.4S, v26.4S,v4.4S +mla v26.4S, v28.4S, v31.s[0] +sub v28.4s, v30.4s, v26.4s +add v30.4s, v30.4s, v26.4s +sqrdmulh v26.4S, v30.4S, v15.4S +mul v30.4S, v30.4S,v11.4S +mla v30.4S, v26.4S, v31.s[0] +sub v26.4s, v27.4s, v30.4s +add v27.4s, v27.4s, v30.4s +sqrdmulh v30.4S, v28.4S, v2.4S +mul v28.4S, v28.4S,v20.4S +mla v28.4S, v30.4S, v31.s[0] +sub v30.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +str q27, [x0, #384] +str q26, [x0, #400] +str q24, [x0, #416] +str q30, [x0, #432] +ldr q30, [x17, #+1024] +ldr q24, [x17, #+1040] +ldr q26, [x17, #+1056] +ldr q27, [x17, #+1072] +ldr q28, [x17, #+1088] +ldr q29, [x17, #+1104] +ldr q25, [x17, #+1120] +ldr q3, [x17, #+1136] +ldr q2, [x0, #480] +ldr q20, [x0, #496] +ldr q15, [x0, #448] +ldr q11, [x0, #464] +sqrdmulh v21.4S, v2.4S, v24.s[0] +mul v2.4S, v2.4S,v30.s[0] +mla v2.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v2.4s +add v15.4s, v15.4s, v2.4s +sqrdmulh v2.4S, v20.4S, v24.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v20.4S, v2.4S, v31.s[0] +sub v2.4s, v11.4s, v20.4s +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v11.4S, 
v24.s[1] +mul v11.4S, v11.4S,v30.s[1] +mla v11.4S, v20.4S, v31.s[0] +sub v20.4s, v15.4s, v11.4s +add v15.4s, v15.4s, v11.4s +sqrdmulh v11.4S, v2.4S, v24.s[2] +mul v2.4S, v2.4S,v30.s[2] +mla v2.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v2.4s +add v21.4s, v21.4s, v2.4s +trn1 v2.4S, v15.4S, v20.4S +trn2 v4.4S, v15.4S, v20.4S +trn1 v5.4S, v21.4S, v11.4S +trn2 v0.4S, v21.4S, v11.4S +trn2 v21.2D, v2.2D, v5.2D +trn2 v11.2D, v4.2D, v0.2D +trn1 v15.2D, v2.2D, v5.2D +trn1 v20.2D, v4.2D, v0.2D +sqrdmulh v0.4S, v21.4S, v27.4S +mul v21.4S, v21.4S,v26.4S +mla v21.4S, v0.4S, v31.s[0] +sub v0.4s, v15.4s, v21.4s +add v15.4s, v15.4s, v21.4s +sqrdmulh v21.4S, v11.4S, v27.4S +mul v11.4S, v11.4S,v26.4S +mla v11.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +sqrdmulh v11.4S, v20.4S, v29.4S +mul v20.4S, v20.4S,v28.4S +mla v20.4S, v11.4S, v31.s[0] +sub v11.4s, v15.4s, v20.4s +add v15.4s, v15.4s, v20.4s +sqrdmulh v20.4S, v21.4S, v3.4S +mul v21.4S, v21.4S,v25.4S +mla v21.4S, v20.4S, v31.s[0] +sub v20.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +str q15, [x0, #448] +str q11, [x0, #464] +str q0, [x0, #480] +str q20, [x0, #496] +ldr q20, [x17, #+1152] +ldr q0, [x17, #+1168] +ldr q11, [x17, #+1184] +ldr q15, [x17, #+1200] +ldr q21, [x17, #+1216] +ldr q4, [x17, #+1232] +ldr q5, [x17, #+1248] +ldr q2, [x17, #+1264] +ldr q3, [x0, #544] +ldr q25, [x0, #560] +ldr q29, [x0, #512] +ldr q28, [x0, #528] +sqrdmulh v27.4S, v3.4S, v0.s[0] +mul v3.4S, v3.4S,v20.s[0] +mla v3.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v3.4s +add v29.4s, v29.4s, v3.4s +sqrdmulh v3.4S, v25.4S, v0.s[0] +mul v25.4S, v25.4S,v20.s[0] +mla v25.4S, v3.4S, v31.s[0] +sub v3.4s, v28.4s, v25.4s +add v28.4s, v28.4s, v25.4s +sqrdmulh v25.4S, v28.4S, v0.s[1] +mul v28.4S, v28.4S,v20.s[1] +mla v28.4S, v25.4S, v31.s[0] +sub v25.4s, v29.4s, v28.4s +add v29.4s, v29.4s, v28.4s +sqrdmulh v28.4S, v3.4S, v0.s[2] +mul v3.4S, v3.4S,v20.s[2] +mla v3.4S, v28.4S, v31.s[0] +sub v28.4s, v27.4s, v3.4s +add v27.4s, v27.4s, v3.4s 
+trn1 v3.4S, v29.4S, v25.4S +trn2 v26.4S, v29.4S, v25.4S +trn1 v24.4S, v27.4S, v28.4S +trn2 v30.4S, v27.4S, v28.4S +trn2 v27.2D, v3.2D, v24.2D +trn2 v28.2D, v26.2D, v30.2D +trn1 v29.2D, v3.2D, v24.2D +trn1 v25.2D, v26.2D, v30.2D +sqrdmulh v30.4S, v27.4S, v15.4S +mul v27.4S, v27.4S,v11.4S +mla v27.4S, v30.4S, v31.s[0] +sub v30.4s, v29.4s, v27.4s +add v29.4s, v29.4s, v27.4s +sqrdmulh v27.4S, v28.4S, v15.4S +mul v28.4S, v28.4S,v11.4S +mla v28.4S, v27.4S, v31.s[0] +sub v27.4s, v25.4s, v28.4s +add v25.4s, v25.4s, v28.4s +sqrdmulh v28.4S, v25.4S, v4.4S +mul v25.4S, v25.4S,v21.4S +mla v25.4S, v28.4S, v31.s[0] +sub v28.4s, v29.4s, v25.4s +add v29.4s, v29.4s, v25.4s +sqrdmulh v25.4S, v27.4S, v2.4S +mul v27.4S, v27.4S,v5.4S +mla v27.4S, v25.4S, v31.s[0] +sub v25.4s, v30.4s, v27.4s +add v30.4s, v30.4s, v27.4s +str q29, [x0, #512] +str q28, [x0, #528] +str q30, [x0, #544] +str q25, [x0, #560] +ldr q25, [x17, #+1280] +ldr q30, [x17, #+1296] +ldr q28, [x17, #+1312] +ldr q29, [x17, #+1328] +ldr q27, [x17, #+1344] +ldr q26, [x17, #+1360] +ldr q24, [x17, #+1376] +ldr q3, [x17, #+1392] +ldr q2, [x0, #608] +ldr q5, [x0, #624] +ldr q4, [x0, #576] +ldr q21, [x0, #592] +sqrdmulh v15.4S, v2.4S, v30.s[0] +mul v2.4S, v2.4S,v25.s[0] +mla v2.4S, v15.4S, v31.s[0] +sub v15.4s, v4.4s, v2.4s +add v4.4s, v4.4s, v2.4s +sqrdmulh v2.4S, v5.4S, v30.s[0] +mul v5.4S, v5.4S,v25.s[0] +mla v5.4S, v2.4S, v31.s[0] +sub v2.4s, v21.4s, v5.4s +add v21.4s, v21.4s, v5.4s +sqrdmulh v5.4S, v21.4S, v30.s[1] +mul v21.4S, v21.4S,v25.s[1] +mla v21.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v21.4s +add v4.4s, v4.4s, v21.4s +sqrdmulh v21.4S, v2.4S, v30.s[2] +mul v2.4S, v2.4S,v25.s[2] +mla v2.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v2.4s +add v15.4s, v15.4s, v2.4s +trn1 v2.4S, v4.4S, v5.4S +trn2 v11.4S, v4.4S, v5.4S +trn1 v0.4S, v15.4S, v21.4S +trn2 v20.4S, v15.4S, v21.4S +trn2 v15.2D, v2.2D, v0.2D +trn2 v21.2D, v11.2D, v20.2D +trn1 v4.2D, v2.2D, v0.2D +trn1 v5.2D, v11.2D, v20.2D +sqrdmulh v20.4S, v15.4S, v29.4S +mul 
v15.4S, v15.4S,v28.4S +mla v15.4S, v20.4S, v31.s[0] +sub v20.4s, v4.4s, v15.4s +add v4.4s, v4.4s, v15.4s +sqrdmulh v15.4S, v21.4S, v29.4S +mul v21.4S, v21.4S,v28.4S +mla v21.4S, v15.4S, v31.s[0] +sub v15.4s, v5.4s, v21.4s +add v5.4s, v5.4s, v21.4s +sqrdmulh v21.4S, v5.4S, v26.4S +mul v5.4S, v5.4S,v27.4S +mla v5.4S, v21.4S, v31.s[0] +sub v21.4s, v4.4s, v5.4s +add v4.4s, v4.4s, v5.4s +sqrdmulh v5.4S, v15.4S, v3.4S +mul v15.4S, v15.4S,v24.4S +mla v15.4S, v5.4S, v31.s[0] +sub v5.4s, v20.4s, v15.4s +add v20.4s, v20.4s, v15.4s +str q4, [x0, #576] +str q21, [x0, #592] +str q20, [x0, #608] +str q5, [x0, #624] +ldr q5, [x17, #+1408] +ldr q20, [x17, #+1424] +ldr q21, [x17, #+1440] +ldr q4, [x17, #+1456] +ldr q15, [x17, #+1472] +ldr q11, [x17, #+1488] +ldr q0, [x17, #+1504] +ldr q2, [x17, #+1520] +ldr q3, [x0, #672] +ldr q24, [x0, #688] +ldr q26, [x0, #640] +ldr q27, [x0, #656] +sqrdmulh v29.4S, v3.4S, v20.s[0] +mul v3.4S, v3.4S,v5.s[0] +mla v3.4S, v29.4S, v31.s[0] +sub v29.4s, v26.4s, v3.4s +add v26.4s, v26.4s, v3.4s +sqrdmulh v3.4S, v24.4S, v20.s[0] +mul v24.4S, v24.4S,v5.s[0] +mla v24.4S, v3.4S, v31.s[0] +sub v3.4s, v27.4s, v24.4s +add v27.4s, v27.4s, v24.4s +sqrdmulh v24.4S, v27.4S, v20.s[1] +mul v27.4S, v27.4S,v5.s[1] +mla v27.4S, v24.4S, v31.s[0] +sub v24.4s, v26.4s, v27.4s +add v26.4s, v26.4s, v27.4s +sqrdmulh v27.4S, v3.4S, v20.s[2] +mul v3.4S, v3.4S,v5.s[2] +mla v3.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v3.4s +add v29.4s, v29.4s, v3.4s +trn1 v3.4S, v26.4S, v24.4S +trn2 v28.4S, v26.4S, v24.4S +trn1 v30.4S, v29.4S, v27.4S +trn2 v25.4S, v29.4S, v27.4S +trn2 v29.2D, v3.2D, v30.2D +trn2 v27.2D, v28.2D, v25.2D +trn1 v26.2D, v3.2D, v30.2D +trn1 v24.2D, v28.2D, v25.2D +sqrdmulh v25.4S, v29.4S, v4.4S +mul v29.4S, v29.4S,v21.4S +mla v29.4S, v25.4S, v31.s[0] +sub v25.4s, v26.4s, v29.4s +add v26.4s, v26.4s, v29.4s +sqrdmulh v29.4S, v27.4S, v4.4S +mul v27.4S, v27.4S,v21.4S +mla v27.4S, v29.4S, v31.s[0] +sub v29.4s, v24.4s, v27.4s +add v24.4s, v24.4s, v27.4s +sqrdmulh v27.4S, 
v24.4S, v11.4S +mul v24.4S, v24.4S,v15.4S +mla v24.4S, v27.4S, v31.s[0] +sub v27.4s, v26.4s, v24.4s +add v26.4s, v26.4s, v24.4s +sqrdmulh v24.4S, v29.4S, v2.4S +mul v29.4S, v29.4S,v0.4S +mla v29.4S, v24.4S, v31.s[0] +sub v24.4s, v25.4s, v29.4s +add v25.4s, v25.4s, v29.4s +str q26, [x0, #640] +str q27, [x0, #656] +str q25, [x0, #672] +str q24, [x0, #688] +ldr q24, [x17, #+1536] +ldr q25, [x17, #+1552] +ldr q27, [x17, #+1568] +ldr q26, [x17, #+1584] +ldr q29, [x17, #+1600] +ldr q28, [x17, #+1616] +ldr q30, [x17, #+1632] +ldr q3, [x17, #+1648] +ldr q2, [x0, #736] +ldr q0, [x0, #752] +ldr q11, [x0, #704] +ldr q15, [x0, #720] +sqrdmulh v4.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v24.s[0] +mla v2.4S, v4.4S, v31.s[0] +sub v4.4s, v11.4s, v2.4s +add v11.4s, v11.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v25.s[0] +mul v0.4S, v0.4S,v24.s[0] +mla v0.4S, v2.4S, v31.s[0] +sub v2.4s, v15.4s, v0.4s +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v15.4S, v25.s[1] +mul v15.4S, v15.4S,v24.s[1] +mla v15.4S, v0.4S, v31.s[0] +sub v0.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v25.s[2] +mul v2.4S, v2.4S,v24.s[2] +mla v2.4S, v15.4S, v31.s[0] +sub v15.4s, v4.4s, v2.4s +add v4.4s, v4.4s, v2.4s +trn1 v2.4S, v11.4S, v0.4S +trn2 v21.4S, v11.4S, v0.4S +trn1 v20.4S, v4.4S, v15.4S +trn2 v5.4S, v4.4S, v15.4S +trn2 v4.2D, v2.2D, v20.2D +trn2 v15.2D, v21.2D, v5.2D +trn1 v11.2D, v2.2D, v20.2D +trn1 v0.2D, v21.2D, v5.2D +sqrdmulh v5.4S, v4.4S, v26.4S +mul v4.4S, v4.4S,v27.4S +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v4.4s +add v11.4s, v11.4s, v4.4s +sqrdmulh v4.4S, v15.4S, v26.4S +mul v15.4S, v15.4S,v27.4S +mla v15.4S, v4.4S, v31.s[0] +sub v4.4s, v0.4s, v15.4s +add v0.4s, v0.4s, v15.4s +sqrdmulh v15.4S, v0.4S, v28.4S +mul v0.4S, v0.4S,v29.4S +mla v0.4S, v15.4S, v31.s[0] +sub v15.4s, v11.4s, v0.4s +add v11.4s, v11.4s, v0.4s +sqrdmulh v0.4S, v4.4S, v3.4S +mul v4.4S, v4.4S,v30.4S +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v5.4s, v4.4s +add v5.4s, v5.4s, v4.4s +str q11, [x0, #704] +str q15, 
[x0, #720] +str q5, [x0, #736] +str q0, [x0, #752] +ldr q0, [x17, #+1664] +ldr q5, [x17, #+1680] +ldr q15, [x17, #+1696] +ldr q11, [x17, #+1712] +ldr q4, [x17, #+1728] +ldr q21, [x17, #+1744] +ldr q20, [x17, #+1760] +ldr q2, [x17, #+1776] +ldr q3, [x0, #800] +ldr q30, [x0, #816] +ldr q28, [x0, #768] +ldr q29, [x0, #784] +sqrdmulh v26.4S, v3.4S, v5.s[0] +mul v3.4S, v3.4S,v0.s[0] +mla v3.4S, v26.4S, v31.s[0] +sub v26.4s, v28.4s, v3.4s +add v28.4s, v28.4s, v3.4s +sqrdmulh v3.4S, v30.4S, v5.s[0] +mul v30.4S, v30.4S,v0.s[0] +mla v30.4S, v3.4S, v31.s[0] +sub v3.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +sqrdmulh v30.4S, v29.4S, v5.s[1] +mul v29.4S, v29.4S,v0.s[1] +mla v29.4S, v30.4S, v31.s[0] +sub v30.4s, v28.4s, v29.4s +add v28.4s, v28.4s, v29.4s +sqrdmulh v29.4S, v3.4S, v5.s[2] +mul v3.4S, v3.4S,v0.s[2] +mla v3.4S, v29.4S, v31.s[0] +sub v29.4s, v26.4s, v3.4s +add v26.4s, v26.4s, v3.4s +trn1 v3.4S, v28.4S, v30.4S +trn2 v27.4S, v28.4S, v30.4S +trn1 v25.4S, v26.4S, v29.4S +trn2 v24.4S, v26.4S, v29.4S +trn2 v26.2D, v3.2D, v25.2D +trn2 v29.2D, v27.2D, v24.2D +trn1 v28.2D, v3.2D, v25.2D +trn1 v30.2D, v27.2D, v24.2D +sqrdmulh v24.4S, v26.4S, v11.4S +mul v26.4S, v26.4S,v15.4S +mla v26.4S, v24.4S, v31.s[0] +sub v24.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sqrdmulh v26.4S, v29.4S, v11.4S +mul v29.4S, v29.4S,v15.4S +mla v29.4S, v26.4S, v31.s[0] +sub v26.4s, v30.4s, v29.4s +add v30.4s, v30.4s, v29.4s +sqrdmulh v29.4S, v30.4S, v21.4S +mul v30.4S, v30.4S,v4.4S +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v28.4s, v30.4s +add v28.4s, v28.4s, v30.4s +sqrdmulh v30.4S, v26.4S, v2.4S +mul v26.4S, v26.4S,v20.4S +mla v26.4S, v30.4S, v31.s[0] +sub v30.4s, v24.4s, v26.4s +add v24.4s, v24.4s, v26.4s +str q28, [x0, #768] +str q29, [x0, #784] +str q24, [x0, #800] +str q30, [x0, #816] +ldr q30, [x17, #+1792] +ldr q24, [x17, #+1808] +ldr q29, [x17, #+1824] +ldr q28, [x17, #+1840] +ldr q26, [x17, #+1856] +ldr q27, [x17, #+1872] +ldr q25, [x17, #+1888] +ldr q3, [x17, #+1904] +ldr q2, [x0, 
#864] +ldr q20, [x0, #880] +ldr q21, [x0, #832] +ldr q4, [x0, #848] +sqrdmulh v11.4S, v2.4S, v24.s[0] +mul v2.4S, v2.4S,v30.s[0] +mla v2.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v2.4s +add v21.4s, v21.4s, v2.4s +sqrdmulh v2.4S, v20.4S, v24.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v20.4S, v2.4S, v31.s[0] +sub v2.4s, v4.4s, v20.4s +add v4.4s, v4.4s, v20.4s +sqrdmulh v20.4S, v4.4S, v24.s[1] +mul v4.4S, v4.4S,v30.s[1] +mla v4.4S, v20.4S, v31.s[0] +sub v20.4s, v21.4s, v4.4s +add v21.4s, v21.4s, v4.4s +sqrdmulh v4.4S, v2.4S, v24.s[2] +mul v2.4S, v2.4S,v30.s[2] +mla v2.4S, v4.4S, v31.s[0] +sub v4.4s, v11.4s, v2.4s +add v11.4s, v11.4s, v2.4s +trn1 v2.4S, v21.4S, v20.4S +trn2 v15.4S, v21.4S, v20.4S +trn1 v5.4S, v11.4S, v4.4S +trn2 v0.4S, v11.4S, v4.4S +trn2 v11.2D, v2.2D, v5.2D +trn2 v4.2D, v15.2D, v0.2D +trn1 v21.2D, v2.2D, v5.2D +trn1 v20.2D, v15.2D, v0.2D +sqrdmulh v0.4S, v11.4S, v28.4S +mul v11.4S, v11.4S,v29.4S +mla v11.4S, v0.4S, v31.s[0] +sub v0.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v4.4S, v28.4S +mul v4.4S, v4.4S,v29.4S +mla v4.4S, v11.4S, v31.s[0] +sub v11.4s, v20.4s, v4.4s +add v20.4s, v20.4s, v4.4s +sqrdmulh v4.4S, v20.4S, v27.4S +mul v20.4S, v20.4S,v26.4S +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v11.4S, v3.4S +mul v11.4S, v11.4S,v25.4S +mla v11.4S, v20.4S, v31.s[0] +sub v20.4s, v0.4s, v11.4s +add v0.4s, v0.4s, v11.4s +str q21, [x0, #832] +str q4, [x0, #848] +str q0, [x0, #864] +str q20, [x0, #880] +ldr q20, [x17, #+1920] +ldr q0, [x17, #+1936] +ldr q4, [x17, #+1952] +ldr q21, [x17, #+1968] +ldr q11, [x17, #+1984] +ldr q15, [x17, #+2000] +ldr q5, [x17, #+2016] +ldr q2, [x17, #+2032] +ldr q3, [x0, #928] +ldr q25, [x0, #944] +ldr q27, [x0, #896] +ldr q26, [x0, #912] +sqrdmulh v28.4S, v3.4S, v0.s[0] +mul v3.4S, v3.4S,v20.s[0] +mla v3.4S, v28.4S, v31.s[0] +sub v28.4s, v27.4s, v3.4s +add v27.4s, v27.4s, v3.4s +sqrdmulh v3.4S, v25.4S, v0.s[0] +mul v25.4S, v25.4S,v20.s[0] +mla v25.4S, 
v3.4S, v31.s[0] +sub v3.4s, v26.4s, v25.4s +add v26.4s, v26.4s, v25.4s +sqrdmulh v25.4S, v26.4S, v0.s[1] +mul v26.4S, v26.4S,v20.s[1] +mla v26.4S, v25.4S, v31.s[0] +sub v25.4s, v27.4s, v26.4s +add v27.4s, v27.4s, v26.4s +sqrdmulh v26.4S, v3.4S, v0.s[2] +mul v3.4S, v3.4S,v20.s[2] +mla v3.4S, v26.4S, v31.s[0] +sub v26.4s, v28.4s, v3.4s +add v28.4s, v28.4s, v3.4s +trn1 v3.4S, v27.4S, v25.4S +trn2 v29.4S, v27.4S, v25.4S +trn1 v24.4S, v28.4S, v26.4S +trn2 v30.4S, v28.4S, v26.4S +trn2 v28.2D, v3.2D, v24.2D +trn2 v26.2D, v29.2D, v30.2D +trn1 v27.2D, v3.2D, v24.2D +trn1 v25.2D, v29.2D, v30.2D +sqrdmulh v30.4S, v28.4S, v21.4S +mul v28.4S, v28.4S,v4.4S +mla v28.4S, v30.4S, v31.s[0] +sub v30.4s, v27.4s, v28.4s +add v27.4s, v27.4s, v28.4s +sqrdmulh v28.4S, v26.4S, v21.4S +mul v26.4S, v26.4S,v4.4S +mla v26.4S, v28.4S, v31.s[0] +sub v28.4s, v25.4s, v26.4s +add v25.4s, v25.4s, v26.4s +sqrdmulh v26.4S, v25.4S, v15.4S +mul v25.4S, v25.4S,v11.4S +mla v25.4S, v26.4S, v31.s[0] +sub v26.4s, v27.4s, v25.4s +add v27.4s, v27.4s, v25.4s +sqrdmulh v25.4S, v28.4S, v2.4S +mul v28.4S, v28.4S,v5.4S +mla v28.4S, v25.4S, v31.s[0] +sub v25.4s, v30.4s, v28.4s +add v30.4s, v30.4s, v28.4s +str q27, [x0, #896] +str q26, [x0, #912] +str q30, [x0, #928] +str q25, [x0, #944] +ldr q25, [x17, #+2048] +ldr q30, [x17, #+2064] +ldr q26, [x17, #+2080] +ldr q27, [x17, #+2096] +ldr q28, [x17, #+2112] +ldr q29, [x17, #+2128] +ldr q24, [x17, #+2144] +ldr q3, [x17, #+2160] +ldr q2, [x0, #992] +ldr q5, [x0, #1008] +ldr q15, [x0, #960] +ldr q11, [x0, #976] +sqrdmulh v21.4S, v2.4S, v30.s[0] +mul v2.4S, v2.4S,v25.s[0] +mla v2.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v2.4s +add v15.4s, v15.4s, v2.4s +sqrdmulh v2.4S, v5.4S, v30.s[0] +mul v5.4S, v5.4S,v25.s[0] +mla v5.4S, v2.4S, v31.s[0] +sub v2.4s, v11.4s, v5.4s +add v11.4s, v11.4s, v5.4s +sqrdmulh v5.4S, v11.4S, v30.s[1] +mul v11.4S, v11.4S,v25.s[1] +mla v11.4S, v5.4S, v31.s[0] +sub v5.4s, v15.4s, v11.4s +add v15.4s, v15.4s, v11.4s +sqrdmulh v11.4S, v2.4S, v30.s[2] 
+mul v2.4S, v2.4S,v25.s[2] +mla v2.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v2.4s +add v21.4s, v21.4s, v2.4s +trn1 v2.4S, v15.4S, v5.4S +trn2 v4.4S, v15.4S, v5.4S +trn1 v0.4S, v21.4S, v11.4S +trn2 v20.4S, v21.4S, v11.4S +trn2 v21.2D, v2.2D, v0.2D +trn2 v11.2D, v4.2D, v20.2D +trn1 v15.2D, v2.2D, v0.2D +trn1 v5.2D, v4.2D, v20.2D +sqrdmulh v20.4S, v21.4S, v27.4S +mul v21.4S, v21.4S,v26.4S +mla v21.4S, v20.4S, v31.s[0] +sub v20.4s, v15.4s, v21.4s +add v15.4s, v15.4s, v21.4s +sqrdmulh v21.4S, v11.4S, v27.4S +mul v11.4S, v11.4S,v26.4S +mla v11.4S, v21.4S, v31.s[0] +sub v21.4s, v5.4s, v11.4s +add v5.4s, v5.4s, v11.4s +sqrdmulh v11.4S, v5.4S, v29.4S +mul v5.4S, v5.4S,v28.4S +mla v5.4S, v11.4S, v31.s[0] +sub v11.4s, v15.4s, v5.4s +add v15.4s, v15.4s, v5.4s +sqrdmulh v5.4S, v21.4S, v3.4S +mul v21.4S, v21.4S,v24.4S +mla v21.4S, v5.4S, v31.s[0] +sub v5.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +str q15, [x0, #960] +str q11, [x0, #976] +str q20, [x0, #992] +str q5, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2392 +// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_0.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_0.s new file mode 100644 index 0000000..10f383d --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_0.s @@ -0,0 +1,1474 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the 
Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 23825509 // Layer 4, block 0 +.word 27028662 // Layer 4, block 1 +.word 0 // Layer None, block None +.word 1307297022 // Layer 3, block 0 +.word 1524716204 // Layer 4, block 0 +.word 1729702351 // Layer 4, block 1 +.word 0 // Layer None, block None +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 14626653 // Layer 3, block 1 +.word 14833295 // Layer 4, block 2 +.word 2138810 // Layer 4, block 3 +.word 0 // Layer None, block None +.word 936034350 // Layer 3, block 1 +.word 949258429 // Layer 4, block 2 +.word 136873393 // Layer 4, block 3 +.word 0 // Layer None, block None +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 29737761 // Layer 3, block 2 +.word 6490403 // Layer 4, block 4 +.word 19648405 // Layer 4, block 5 +.word 0 // Layer None, block None +.word 1903071454 
// Layer 3, block 2 +.word 415354091 // Layer 4, block 4 +.word 1257401950 // Layer 4, block 5 +.word 0 // Layer None, block None +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 30285189 // Layer 3, block 3 +.word 31254932 // Layer 4, block 6 +.word 26362414 // Layer 4, block 7 +.word 0 // Layer None, block None +.word 1938104173 // Layer 3, block 3 +.word 2000162988 // Layer 4, block 6 +.word 1687065733 // Layer 4, block 7 +.word 0 // Layer None, block None +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 21289485 // Layer 3, block 4 +.word 572895 // Layer 4, block 8 +.word 26691971 // Layer 4, block 9 +.word 0 // Layer None, block None +.word 1362423055 // Layer 3, block 4 +.word 36662482 // Layer 4, block 8 +.word 1708155771 // Layer 4, block 9 +.word 0 // Layer None, block None +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 9914896 // Layer 3, block 5 +.word 9249292 // Layer 4, block 10 +.word 29292862 // Layer 4, block 11 +.word 0 // Layer None, block None +.word 634504916 // Layer 3, block 5 +.word 591909511 // Layer 4, block 10 +.word 1874600091 // Layer 4, block 11 +.word 0 // Layer None, block None +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 25384023 // Layer 
5, block 22 +.word 10905370 // Layer 5, block 23 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 22603682 // Layer 3, block 6 +.word 8247799 // Layer 4, block 12 +.word 5086187 // Layer 4, block 13 +.word 0 // Layer None, block None +.word 1446525244 // Layer 3, block 6 +.word 527818851 // Layer 4, block 12 +.word 325491125 // Layer 4, block 13 +.word 0 // Layer None, block None +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 16204162 // Layer 3, block 7 +.word 28113639 // Layer 4, block 14 +.word 8471290 // Layer 4, block 15 +.word 0 // Layer None, block None +.word 1036987221 // Layer 3, block 7 +.word 1799135579 // Layer 4, block 14 +.word 542121183 // Layer 4, block 15 +.word 0 // Layer None, block None +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.text +.global ntt_u32_incomplete_neon_asm_var_3_3_0 +.global _ntt_u32_incomplete_neon_asm_var_3_3_0 +ntt_u32_incomplete_neon_asm_var_3_3_0: +_ntt_u32_incomplete_neon_asm_var_3_3_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] 
+ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x0, #960] +ldr q25, [x0, #832] +ldr q24, [x0, #576] +ldr q23, [x0, #704] +ldr q22, [x0, #448] +ldr q21, [x0, #320] +ldr q20, [x0, #64] +ldr q19, [x0, #192] +sqrdmulh v18.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +mla v26.4S, v18.4S, v31.s[0] +sub v18.4s, v22.4s, v26.4s +add v22.4s, v22.4s, v26.4s +sqrdmulh v26.4S, v25.4S, v29.s[0] +mul v25.4S, v25.4S,v30.s[0] +mla v25.4S, v26.4S, v31.s[0] +sub v26.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +mla v24.4S, v25.4S, v31.s[0] +sub v25.4s, v20.4s, v24.4s +add v20.4s, v20.4s, v24.4s +sqrdmulh v24.4S, v23.4S, v29.s[0] +mul v23.4S, v23.4S,v30.s[0] +mla v23.4S, v24.4S, v31.s[0] +sub v24.4s, v19.4s, v23.4s +add v19.4s, v19.4s, v23.4s +sqrdmulh v23.4S, v22.4S, v29.s[1] +mul v22.4S, v22.4S,v30.s[1] +mla v22.4S, v23.4S, v31.s[0] +sub v23.4s, v19.4s, v22.4s +add v19.4s, v19.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +mla v18.4S, v21.4S, v31.s[0] +sub v21.4s, v24.4s, v18.4s +add v24.4s, v24.4s, v18.4s +sqrdmulh v18.4S, v26.4S, v29.s[2] +mul v26.4S, v26.4S,v30.s[2] +mla v26.4S, v18.4S, v31.s[0] +sub v18.4s, v25.4s, v26.4s +add v25.4s, v25.4s, v26.4s +sqrdmulh v26.4S, v19.4S, v27.s[0] +mul v19.4S, v19.4S,v28.s[0] +mla v19.4S, v26.4S, v31.s[0] +sub v26.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +str q20, [x0, #64] +str q26, [x0, #192] +sqrdmulh v26.4S, v23.4S, v27.s[1] +mul v23.4S, v23.4S,v28.s[1] +mla v23.4S, v26.4S, v31.s[0] +sub v26.4s, v22.4s, v23.4s +add v22.4s, v22.4s, v23.4s +str q22, [x0, #320] +str q26, [x0, #448] +sqrdmulh v26.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +mla v21.4S, v26.4S, v31.s[0] +sub v26.4s, v18.4s, v21.4s +add 
v18.4s, v18.4s, v21.4s +str q18, [x0, #832] +str q26, [x0, #960] +sqrdmulh v26.4S, v24.4S, v27.s[2] +mul v24.4S, v24.4S,v28.s[2] +mla v24.4S, v26.4S, v31.s[0] +sub v26.4s, v25.4s, v24.4s +add v25.4s, v25.4s, v24.4s +str q25, [x0, #576] +str q26, [x0, #704] +ldr q26, [x0, #976] +ldr q25, [x0, #848] +ldr q24, [x0, #592] +ldr q18, [x0, #720] +ldr q21, [x0, #464] +ldr q22, [x0, #336] +ldr q23, [x0, #80] +ldr q20, [x0, #208] +sqrdmulh v19.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +mla v26.4S, v19.4S, v31.s[0] +sub v19.4s, v21.4s, v26.4s +add v21.4s, v21.4s, v26.4s +sqrdmulh v26.4S, v25.4S, v29.s[0] +mul v25.4S, v25.4S,v30.s[0] +mla v25.4S, v26.4S, v31.s[0] +sub v26.4s, v22.4s, v25.4s +add v22.4s, v22.4s, v25.4s +sqrdmulh v25.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +mla v24.4S, v25.4S, v31.s[0] +sub v25.4s, v23.4s, v24.4s +add v23.4s, v23.4s, v24.4s +sqrdmulh v24.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +mla v18.4S, v24.4S, v31.s[0] +sub v24.4s, v20.4s, v18.4s +add v20.4s, v20.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.s[1] +mul v22.4S, v22.4S,v30.s[1] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v23.4s, v22.4s +add v23.4s, v23.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +mla v19.4S, v22.4S, v31.s[0] +sub v22.4s, v24.4s, v19.4s +add v24.4s, v24.4s, v19.4s +sqrdmulh v19.4S, v26.4S, v29.s[2] +mul v26.4S, v26.4S,v30.s[2] +mla v26.4S, v19.4S, v31.s[0] +sub v19.4s, v25.4s, v26.4s +add v25.4s, v25.4s, v26.4s +sqrdmulh v26.4S, v20.4S, v27.s[0] +mul v20.4S, v20.4S,v28.s[0] +mla v20.4S, v26.4S, v31.s[0] +sub v26.4s, v23.4s, v20.4s +add v23.4s, v23.4s, v20.4s +str q23, [x0, #80] +str q26, [x0, #208] +sqrdmulh v26.4S, v18.4S, v27.s[1] +mul v18.4S, v18.4S,v28.s[1] +mla v18.4S, v26.4S, v31.s[0] +sub v26.4s, v21.4s, v18.4s +add v21.4s, v21.4s, v18.4s +str q21, [x0, #336] +str q26, 
[x0, #464] +sqrdmulh v26.4S, v22.4S, v27.s[3] +mul v22.4S, v22.4S,v28.s[3] +mla v22.4S, v26.4S, v31.s[0] +sub v26.4s, v19.4s, v22.4s +add v19.4s, v19.4s, v22.4s +str q19, [x0, #848] +str q26, [x0, #976] +sqrdmulh v26.4S, v24.4S, v27.s[2] +mul v24.4S, v24.4S,v28.s[2] +mla v24.4S, v26.4S, v31.s[0] +sub v26.4s, v25.4s, v24.4s +add v25.4s, v25.4s, v24.4s +str q25, [x0, #592] +str q26, [x0, #720] +ldr q26, [x0, #992] +ldr q25, [x0, #864] +ldr q24, [x0, #608] +ldr q19, [x0, #736] +ldr q22, [x0, #480] +ldr q21, [x0, #352] +ldr q18, [x0, #96] +ldr q23, [x0, #224] +sqrdmulh v20.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +mla v26.4S, v20.4S, v31.s[0] +sub v20.4s, v22.4s, v26.4s +add v22.4s, v22.4s, v26.4s +sqrdmulh v26.4S, v25.4S, v29.s[0] +mul v25.4S, v25.4S,v30.s[0] +mla v25.4S, v26.4S, v31.s[0] +sub v26.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +mla v24.4S, v25.4S, v31.s[0] +sub v25.4s, v18.4s, v24.4s +add v18.4s, v18.4s, v24.4s +sqrdmulh v24.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +mla v19.4S, v24.4S, v31.s[0] +sub v24.4s, v23.4s, v19.4s +add v23.4s, v23.4s, v19.4s +sqrdmulh v19.4S, v22.4S, v29.s[1] +mul v22.4S, v22.4S,v30.s[1] +mla v22.4S, v19.4S, v31.s[0] +sub v19.4s, v23.4s, v22.4s +add v23.4s, v23.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +mla v20.4S, v21.4S, v31.s[0] +sub v21.4s, v24.4s, v20.4s +add v24.4s, v24.4s, v20.4s +sqrdmulh v20.4S, v26.4S, v29.s[2] +mul v26.4S, v26.4S,v30.s[2] +mla v26.4S, v20.4S, v31.s[0] +sub v20.4s, v25.4s, v26.4s +add v25.4s, v25.4s, v26.4s +sqrdmulh v26.4S, v23.4S, v27.s[0] +mul v23.4S, v23.4S,v28.s[0] +mla v23.4S, v26.4S, v31.s[0] +sub v26.4s, v18.4s, v23.4s +add v18.4s, v18.4s, v23.4s +str q18, [x0, #96] +str q26, [x0, #224] +sqrdmulh v26.4S, v19.4S, v27.s[1] +mul 
v19.4S, v19.4S,v28.s[1] +mla v19.4S, v26.4S, v31.s[0] +sub v26.4s, v22.4s, v19.4s +add v22.4s, v22.4s, v19.4s +str q22, [x0, #352] +str q26, [x0, #480] +sqrdmulh v26.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +mla v21.4S, v26.4S, v31.s[0] +sub v26.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +str q20, [x0, #864] +str q26, [x0, #992] +sqrdmulh v26.4S, v24.4S, v27.s[2] +mul v24.4S, v24.4S,v28.s[2] +mla v24.4S, v26.4S, v31.s[0] +sub v26.4s, v25.4s, v24.4s +add v25.4s, v25.4s, v24.4s +str q25, [x0, #608] +str q26, [x0, #736] +ldr q26, [x0, #1008] +ldr q25, [x0, #880] +ldr q24, [x0, #624] +ldr q20, [x0, #752] +ldr q21, [x0, #496] +ldr q22, [x0, #368] +ldr q19, [x0, #112] +ldr q18, [x0, #240] +sqrdmulh v23.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +mla v26.4S, v23.4S, v31.s[0] +sub v23.4s, v21.4s, v26.4s +add v21.4s, v21.4s, v26.4s +sqrdmulh v26.4S, v25.4S, v29.s[0] +mul v25.4S, v25.4S,v30.s[0] +mla v25.4S, v26.4S, v31.s[0] +sub v26.4s, v22.4s, v25.4s +add v22.4s, v22.4s, v25.4s +sqrdmulh v25.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +mla v24.4S, v25.4S, v31.s[0] +sub v25.4s, v19.4s, v24.4s +add v19.4s, v19.4s, v24.4s +sqrdmulh v24.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v20.4S, v24.4S, v31.s[0] +sub v24.4s, v18.4s, v20.4s +add v18.4s, v18.4s, v20.4s +sqrdmulh v20.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +mla v21.4S, v20.4S, v31.s[0] +sub v20.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.s[1] +mul v22.4S, v22.4S,v30.s[1] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v22.4s +add v19.4s, v19.4s, v22.4s +sqrdmulh v22.4S, v23.4S, v29.s[2] +mul v23.4S, v23.4S,v30.s[2] +mla v23.4S, v22.4S, v31.s[0] +sub v22.4s, v24.4s, v23.4s +add v24.4s, v24.4s, v23.4s +sqrdmulh v23.4S, v26.4S, v29.s[2] +mul v26.4S, v26.4S,v30.s[2] +mla v26.4S, v23.4S, v31.s[0] +sub v23.4s, v25.4s, v26.4s +add v25.4s, v25.4s, v26.4s +sqrdmulh v26.4S, v18.4S, v27.s[0] +mul v18.4S, v18.4S,v28.s[0] +mla v18.4S, v26.4S, v31.s[0] 
+sub v26.4s, v19.4s, v18.4s +add v19.4s, v19.4s, v18.4s +str q19, [x0, #112] +str q26, [x0, #240] +sqrdmulh v26.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +mla v20.4S, v26.4S, v31.s[0] +sub v26.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +str q21, [x0, #368] +str q26, [x0, #496] +sqrdmulh v26.4S, v22.4S, v27.s[3] +mul v22.4S, v22.4S,v28.s[3] +mla v22.4S, v26.4S, v31.s[0] +sub v26.4s, v23.4s, v22.4s +add v23.4s, v23.4s, v22.4s +str q23, [x0, #880] +str q26, [x0, #1008] +sqrdmulh v26.4S, v24.4S, v27.s[2] +mul v24.4S, v24.4S,v28.s[2] +mla v24.4S, v26.4S, v31.s[0] +sub v26.4s, v25.4s, v24.4s +add v25.4s, v25.4s, v24.4s +str q25, [x0, #624] +str q26, [x0, #752] +ldr q26, [x0, #896] +ldr q25, [x0, #768] +ldr q24, [x0, #512] +ldr q23, [x0, #640] +ldr q22, [x0, #384] +ldr q21, [x0, #256] +ldr q20, [x0, #0] +ldr q19, [x0, #128] +sqrdmulh v18.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +mla v26.4S, v18.4S, v31.s[0] +sub v18.4s, v22.4s, v26.4s +add v22.4s, v22.4s, v26.4s +sqrdmulh v26.4S, v25.4S, v29.s[0] +mul v25.4S, v25.4S,v30.s[0] +mla v25.4S, v26.4S, v31.s[0] +sub v26.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +mla v24.4S, v25.4S, v31.s[0] +sub v25.4s, v20.4s, v24.4s +add v20.4s, v20.4s, v24.4s +sqrdmulh v24.4S, v23.4S, v29.s[0] +mul v23.4S, v23.4S,v30.s[0] +mla v23.4S, v24.4S, v31.s[0] +sub v24.4s, v19.4s, v23.4s +add v19.4s, v19.4s, v23.4s +sqrdmulh v23.4S, v22.4S, v29.s[1] +mul v22.4S, v22.4S,v30.s[1] +mla v22.4S, v23.4S, v31.s[0] +sub v23.4s, v19.4s, v22.4s +add v19.4s, v19.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +mla v18.4S, v21.4S, v31.s[0] +sub v21.4s, v24.4s, v18.4s +add v24.4s, v24.4s, v18.4s +sqrdmulh v18.4S, v26.4S, v29.s[2] +mul v26.4S, v26.4S,v30.s[2] +mla v26.4S, v18.4S, v31.s[0] +sub v18.4s, 
v25.4s, v26.4s +add v25.4s, v25.4s, v26.4s +sqrdmulh v26.4S, v19.4S, v27.s[0] +mul v19.4S, v19.4S,v28.s[0] +mla v19.4S, v26.4S, v31.s[0] +sub v26.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +str q20, [x0, #0] +str q26, [x0, #128] +sqrdmulh v26.4S, v23.4S, v27.s[1] +mul v23.4S, v23.4S,v28.s[1] +mla v23.4S, v26.4S, v31.s[0] +sub v26.4s, v22.4s, v23.4s +add v22.4s, v22.4s, v23.4s +str q22, [x0, #256] +str q26, [x0, #384] +sqrdmulh v26.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +mla v21.4S, v26.4S, v31.s[0] +sub v26.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +str q18, [x0, #768] +str q26, [x0, #896] +sqrdmulh v26.4S, v24.4S, v27.s[2] +mul v24.4S, v24.4S,v28.s[2] +mla v24.4S, v26.4S, v31.s[0] +sub v26.4s, v25.4s, v24.4s +add v25.4s, v25.4s, v24.4s +str q25, [x0, #512] +str q26, [x0, #640] +ldr q26, [x0, #912] +ldr q25, [x0, #784] +ldr q24, [x0, #528] +ldr q18, [x0, #656] +ldr q21, [x0, #400] +ldr q22, [x0, #272] +ldr q23, [x0, #16] +ldr q20, [x0, #144] +sqrdmulh v19.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +mla v26.4S, v19.4S, v31.s[0] +sub v19.4s, v21.4s, v26.4s +add v21.4s, v21.4s, v26.4s +sqrdmulh v26.4S, v25.4S, v29.s[0] +mul v25.4S, v25.4S,v30.s[0] +mla v25.4S, v26.4S, v31.s[0] +sub v26.4s, v22.4s, v25.4s +add v22.4s, v22.4s, v25.4s +sqrdmulh v25.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +mla v24.4S, v25.4S, v31.s[0] +sub v25.4s, v23.4s, v24.4s +add v23.4s, v23.4s, v24.4s +sqrdmulh v24.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +mla v18.4S, v24.4S, v31.s[0] +sub v24.4s, v20.4s, v18.4s +add v20.4s, v20.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.s[1] +mul v22.4S, v22.4S,v30.s[1] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v23.4s, v22.4s +add v23.4s, v23.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +mla v19.4S, v22.4S, v31.s[0] +sub v22.4s, v24.4s, v19.4s 
+add v24.4s, v24.4s, v19.4s +sqrdmulh v19.4S, v26.4S, v29.s[2] +mul v26.4S, v26.4S,v30.s[2] +mla v26.4S, v19.4S, v31.s[0] +sub v19.4s, v25.4s, v26.4s +add v25.4s, v25.4s, v26.4s +sqrdmulh v26.4S, v20.4S, v27.s[0] +mul v20.4S, v20.4S,v28.s[0] +mla v20.4S, v26.4S, v31.s[0] +sub v26.4s, v23.4s, v20.4s +add v23.4s, v23.4s, v20.4s +str q23, [x0, #16] +str q26, [x0, #144] +sqrdmulh v26.4S, v18.4S, v27.s[1] +mul v18.4S, v18.4S,v28.s[1] +mla v18.4S, v26.4S, v31.s[0] +sub v26.4s, v21.4s, v18.4s +add v21.4s, v21.4s, v18.4s +str q21, [x0, #272] +str q26, [x0, #400] +sqrdmulh v26.4S, v22.4S, v27.s[3] +mul v22.4S, v22.4S,v28.s[3] +mla v22.4S, v26.4S, v31.s[0] +sub v26.4s, v19.4s, v22.4s +add v19.4s, v19.4s, v22.4s +str q19, [x0, #784] +str q26, [x0, #912] +sqrdmulh v26.4S, v24.4S, v27.s[2] +mul v24.4S, v24.4S,v28.s[2] +mla v24.4S, v26.4S, v31.s[0] +sub v26.4s, v25.4s, v24.4s +add v25.4s, v25.4s, v24.4s +str q25, [x0, #528] +str q26, [x0, #656] +ldr q26, [x0, #928] +ldr q25, [x0, #800] +ldr q24, [x0, #544] +ldr q19, [x0, #672] +ldr q22, [x0, #416] +ldr q21, [x0, #288] +ldr q18, [x0, #32] +ldr q23, [x0, #160] +sqrdmulh v20.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +mla v26.4S, v20.4S, v31.s[0] +sub v20.4s, v22.4s, v26.4s +add v22.4s, v22.4s, v26.4s +sqrdmulh v26.4S, v25.4S, v29.s[0] +mul v25.4S, v25.4S,v30.s[0] +mla v25.4S, v26.4S, v31.s[0] +sub v26.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +mla v24.4S, v25.4S, v31.s[0] +sub v25.4s, v18.4s, v24.4s +add v18.4s, v18.4s, v24.4s +sqrdmulh v24.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +mla v19.4S, v24.4S, v31.s[0] +sub v24.4s, v23.4s, v19.4s +add v23.4s, v23.4s, v19.4s +sqrdmulh v19.4S, v22.4S, v29.s[1] +mul v22.4S, v22.4S,v30.s[1] +mla v22.4S, v19.4S, v31.s[0] +sub v19.4s, v23.4s, v22.4s +add v23.4s, v23.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v21.4s +add v18.4s, 
v18.4s, v21.4s +sqrdmulh v21.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +mla v20.4S, v21.4S, v31.s[0] +sub v21.4s, v24.4s, v20.4s +add v24.4s, v24.4s, v20.4s +sqrdmulh v20.4S, v26.4S, v29.s[2] +mul v26.4S, v26.4S,v30.s[2] +mla v26.4S, v20.4S, v31.s[0] +sub v20.4s, v25.4s, v26.4s +add v25.4s, v25.4s, v26.4s +sqrdmulh v26.4S, v23.4S, v27.s[0] +mul v23.4S, v23.4S,v28.s[0] +mla v23.4S, v26.4S, v31.s[0] +sub v26.4s, v18.4s, v23.4s +add v18.4s, v18.4s, v23.4s +str q18, [x0, #32] +str q26, [x0, #160] +sqrdmulh v26.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +mla v19.4S, v26.4S, v31.s[0] +sub v26.4s, v22.4s, v19.4s +add v22.4s, v22.4s, v19.4s +str q22, [x0, #288] +str q26, [x0, #416] +sqrdmulh v26.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +mla v21.4S, v26.4S, v31.s[0] +sub v26.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +str q20, [x0, #800] +str q26, [x0, #928] +sqrdmulh v26.4S, v24.4S, v27.s[2] +mul v24.4S, v24.4S,v28.s[2] +mla v24.4S, v26.4S, v31.s[0] +sub v26.4s, v25.4s, v24.4s +add v25.4s, v25.4s, v24.4s +str q25, [x0, #544] +str q26, [x0, #672] +ldr q26, [x0, #944] +ldr q25, [x0, #816] +ldr q24, [x0, #560] +ldr q20, [x0, #688] +ldr q21, [x0, #432] +ldr q22, [x0, #304] +ldr q19, [x0, #48] +ldr q18, [x0, #176] +sqrdmulh v23.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +mla v26.4S, v23.4S, v31.s[0] +sub v23.4s, v21.4s, v26.4s +add v21.4s, v21.4s, v26.4s +sqrdmulh v26.4S, v25.4S, v29.s[0] +mul v25.4S, v25.4S,v30.s[0] +mla v25.4S, v26.4S, v31.s[0] +sub v26.4s, v22.4s, v25.4s +add v22.4s, v22.4s, v25.4s +sqrdmulh v25.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +mla v24.4S, v25.4S, v31.s[0] +sub v25.4s, v19.4s, v24.4s +add v19.4s, v19.4s, v24.4s +sqrdmulh v24.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v20.4S, v24.4S, v31.s[0] +sub v24.4s, v18.4s, v20.4s +add v18.4s, v18.4s, v20.4s +sqrdmulh v20.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +mla v21.4S, v20.4S, v31.s[0] +sub v20.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s 
+sqrdmulh v21.4S, v22.4S, v29.s[1] +mul v22.4S, v22.4S,v30.s[1] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v22.4s +add v19.4s, v19.4s, v22.4s +sqrdmulh v22.4S, v23.4S, v29.s[2] +mul v23.4S, v23.4S,v30.s[2] +mla v23.4S, v22.4S, v31.s[0] +sub v22.4s, v24.4s, v23.4s +add v24.4s, v24.4s, v23.4s +sqrdmulh v23.4S, v26.4S, v29.s[2] +mul v26.4S, v26.4S,v30.s[2] +mla v26.4S, v23.4S, v31.s[0] +sub v23.4s, v25.4s, v26.4s +add v25.4s, v25.4s, v26.4s +sqrdmulh v26.4S, v18.4S, v27.s[0] +mul v18.4S, v18.4S,v28.s[0] +mla v18.4S, v26.4S, v31.s[0] +sub v26.4s, v19.4s, v18.4s +add v19.4s, v19.4s, v18.4s +str q19, [x0, #48] +str q26, [x0, #176] +sqrdmulh v26.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +mla v20.4S, v26.4S, v31.s[0] +sub v26.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +str q21, [x0, #304] +str q26, [x0, #432] +sqrdmulh v26.4S, v22.4S, v27.s[3] +mul v22.4S, v22.4S,v28.s[3] +mla v22.4S, v26.4S, v31.s[0] +sub v26.4s, v23.4s, v22.4s +add v23.4s, v23.4s, v22.4s +str q23, [x0, #816] +str q26, [x0, #944] +sqrdmulh v26.4S, v24.4S, v27.s[2] +mul v24.4S, v24.4S,v28.s[2] +mla v24.4S, v26.4S, v31.s[0] +sub v26.4s, v25.4s, v24.4s +add v25.4s, v25.4s, v24.4s +str q25, [x0, #560] +str q26, [x0, #688] +ldr q4, [x17, #+64] +ldr q5, [x17, #+80] +ldr q6, [x17, #+96] +ldr q7, [x17, #+112] +ldr q8, [x0, #112] +ldr q9, [x0, #96] +ldr q10, [x0, #64] +ldr q11, [x0, #80] +ldr q12, [x0, #48] +ldr q13, [x0, #32] +ldr q14, [x0, #0] +ldr q15, [x0, #16] +sqrdmulh v0.4S, v8.4S, v5.s[0] +mul v8.4S, v8.4S,v4.s[0] +mla v8.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v8.4s +add v12.4s, v12.4s, v8.4s +sqrdmulh v8.4S, v9.4S, v5.s[0] +mul v9.4S, v9.4S,v4.s[0] +mla v9.4S, v8.4S, v31.s[0] +sub v8.4s, v13.4s, v9.4s +add v13.4s, v13.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v5.s[0] +mul v10.4S, v10.4S,v4.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v14.4s, v10.4s +add v14.4s, v14.4s, v10.4s +sqrdmulh v10.4S, v11.4S, v5.s[0] +mul v11.4S, v11.4S,v4.s[0] +mla v11.4S, v10.4S, v31.s[0] +sub v10.4s, v15.4s, 
v11.4s +add v15.4s, v15.4s, v11.4s +sqrdmulh v11.4S, v12.4S, v5.s[1] +mul v12.4S, v12.4S,v4.s[1] +mla v12.4S, v11.4S, v31.s[0] +sub v11.4s, v15.4s, v12.4s +add v15.4s, v15.4s, v12.4s +ldr q12, [x17, #+128] +ldr q1, [x17, #+144] +ldr q2, [x17, #+160] +ldr q3, [x17, #+176] +sqrdmulh v16.4S, v13.4S, v5.s[1] +mul v13.4S, v13.4S,v4.s[1] +mla v13.4S, v16.4S, v31.s[0] +sub v16.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +sqrdmulh v13.4S, v0.4S, v5.s[2] +mul v0.4S, v0.4S,v4.s[2] +mla v0.4S, v13.4S, v31.s[0] +sub v13.4s, v10.4s, v0.4s +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v8.4S, v5.s[2] +mul v8.4S, v8.4S,v4.s[2] +mla v8.4S, v0.4S, v31.s[0] +sub v0.4s, v9.4s, v8.4s +add v9.4s, v9.4s, v8.4s +sqrdmulh v8.4S, v15.4S, v7.s[0] +mul v15.4S, v15.4S,v6.s[0] +mla v15.4S, v8.4S, v31.s[0] +sub v8.4s, v14.4s, v15.4s +add v14.4s, v14.4s, v15.4s +str q14, [x0, #0] +str q8, [x0, #16] +sqrdmulh v8.4S, v11.4S, v7.s[1] +mul v11.4S, v11.4S,v6.s[1] +mla v11.4S, v8.4S, v31.s[0] +sub v8.4s, v16.4s, v11.4s +add v16.4s, v16.4s, v11.4s +str q16, [x0, #32] +str q8, [x0, #48] +sqrdmulh v8.4S, v13.4S, v7.s[3] +mul v13.4S, v13.4S,v6.s[3] +mla v13.4S, v8.4S, v31.s[0] +sub v8.4s, v0.4s, v13.4s +add v0.4s, v0.4s, v13.4s +str q0, [x0, #96] +str q8, [x0, #112] +sqrdmulh v8.4S, v10.4S, v7.s[2] +mul v10.4S, v10.4S,v6.s[2] +mla v10.4S, v8.4S, v31.s[0] +sub v8.4s, v9.4s, v10.4s +add v9.4s, v9.4s, v10.4s +str q9, [x0, #64] +str q8, [x0, #80] +ldr q8, [x0, #240] +ldr q9, [x0, #224] +ldr q10, [x0, #192] +ldr q0, [x0, #208] +ldr q13, [x0, #176] +ldr q16, [x0, #160] +ldr q11, [x0, #128] +ldr q14, [x0, #144] +sqrdmulh v15.4S, v8.4S, v1.s[0] +mul v8.4S, v8.4S,v12.s[0] +mla v8.4S, v15.4S, v31.s[0] +sub v15.4s, v13.4s, v8.4s +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v9.4S, v1.s[0] +mul v9.4S, v9.4S,v12.s[0] +mla v9.4S, v8.4S, v31.s[0] +sub v8.4s, v16.4s, v9.4s +add v16.4s, v16.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v1.s[0] +mul v10.4S, v10.4S,v12.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v11.4s, v10.4s +add 
v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v1.s[0] +mul v0.4S, v0.4S,v12.s[0] +mla v0.4S, v10.4S, v31.s[0] +sub v10.4s, v14.4s, v0.4s +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v13.4S, v1.s[1] +mul v13.4S, v13.4S,v12.s[1] +mla v13.4S, v0.4S, v31.s[0] +sub v0.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +ldr q13, [x17, #+192] +ldr q17, [x17, #+208] +ldr q18, [x17, #+224] +ldr q19, [x17, #+240] +sqrdmulh v20.4S, v16.4S, v1.s[1] +mul v16.4S, v16.4S,v12.s[1] +mla v16.4S, v20.4S, v31.s[0] +sub v20.4s, v11.4s, v16.4s +add v11.4s, v11.4s, v16.4s +sqrdmulh v16.4S, v15.4S, v1.s[2] +mul v15.4S, v15.4S,v12.s[2] +mla v15.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v15.4s +add v10.4s, v10.4s, v15.4s +sqrdmulh v15.4S, v8.4S, v1.s[2] +mul v8.4S, v8.4S,v12.s[2] +mla v8.4S, v15.4S, v31.s[0] +sub v15.4s, v9.4s, v8.4s +add v9.4s, v9.4s, v8.4s +sqrdmulh v8.4S, v14.4S, v3.s[0] +mul v14.4S, v14.4S,v2.s[0] +mla v14.4S, v8.4S, v31.s[0] +sub v8.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +str q11, [x0, #128] +str q8, [x0, #144] +sqrdmulh v8.4S, v0.4S, v3.s[1] +mul v0.4S, v0.4S,v2.s[1] +mla v0.4S, v8.4S, v31.s[0] +sub v8.4s, v20.4s, v0.4s +add v20.4s, v20.4s, v0.4s +str q20, [x0, #160] +str q8, [x0, #176] +sqrdmulh v8.4S, v16.4S, v3.s[3] +mul v16.4S, v16.4S,v2.s[3] +mla v16.4S, v8.4S, v31.s[0] +sub v8.4s, v15.4s, v16.4s +add v15.4s, v15.4s, v16.4s +str q15, [x0, #224] +str q8, [x0, #240] +sqrdmulh v8.4S, v10.4S, v3.s[2] +mul v10.4S, v10.4S,v2.s[2] +mla v10.4S, v8.4S, v31.s[0] +sub v8.4s, v9.4s, v10.4s +add v9.4s, v9.4s, v10.4s +str q9, [x0, #192] +str q8, [x0, #208] +ldr q7, [x0, #368] +ldr q6, [x0, #352] +ldr q5, [x0, #320] +ldr q4, [x0, #336] +ldr q8, [x0, #304] +ldr q9, [x0, #288] +ldr q10, [x0, #256] +ldr q15, [x0, #272] +sqrdmulh v16.4S, v7.4S, v17.s[0] +mul v7.4S, v7.4S,v13.s[0] +mla v7.4S, v16.4S, v31.s[0] +sub v16.4s, v8.4s, v7.4s +add v8.4s, v8.4s, v7.4s +sqrdmulh v7.4S, v6.4S, v17.s[0] +mul v6.4S, v6.4S,v13.s[0] +mla v6.4S, v7.4S, v31.s[0] +sub v7.4s, v9.4s, v6.4s +add 
v9.4s, v9.4s, v6.4s +sqrdmulh v6.4S, v5.4S, v17.s[0] +mul v5.4S, v5.4S,v13.s[0] +mla v5.4S, v6.4S, v31.s[0] +sub v6.4s, v10.4s, v5.4s +add v10.4s, v10.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v17.s[0] +mul v4.4S, v4.4S,v13.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v15.4s, v4.4s +add v15.4s, v15.4s, v4.4s +sqrdmulh v4.4S, v8.4S, v17.s[1] +mul v8.4S, v8.4S,v13.s[1] +mla v8.4S, v4.4S, v31.s[0] +sub v4.4s, v15.4s, v8.4s +add v15.4s, v15.4s, v8.4s +ldr q8, [x17, #+256] +ldr q20, [x17, #+272] +ldr q0, [x17, #+288] +ldr q11, [x17, #+304] +sqrdmulh v14.4S, v9.4S, v17.s[1] +mul v9.4S, v9.4S,v13.s[1] +mla v9.4S, v14.4S, v31.s[0] +sub v14.4s, v10.4s, v9.4s +add v10.4s, v10.4s, v9.4s +sqrdmulh v9.4S, v16.4S, v17.s[2] +mul v16.4S, v16.4S,v13.s[2] +mla v16.4S, v9.4S, v31.s[0] +sub v9.4s, v5.4s, v16.4s +add v5.4s, v5.4s, v16.4s +sqrdmulh v16.4S, v7.4S, v17.s[2] +mul v7.4S, v7.4S,v13.s[2] +mla v7.4S, v16.4S, v31.s[0] +sub v16.4s, v6.4s, v7.4s +add v6.4s, v6.4s, v7.4s +sqrdmulh v7.4S, v15.4S, v19.s[0] +mul v15.4S, v15.4S,v18.s[0] +mla v15.4S, v7.4S, v31.s[0] +sub v7.4s, v10.4s, v15.4s +add v10.4s, v10.4s, v15.4s +str q10, [x0, #256] +str q7, [x0, #272] +sqrdmulh v7.4S, v4.4S, v19.s[1] +mul v4.4S, v4.4S,v18.s[1] +mla v4.4S, v7.4S, v31.s[0] +sub v7.4s, v14.4s, v4.4s +add v14.4s, v14.4s, v4.4s +str q14, [x0, #288] +str q7, [x0, #304] +sqrdmulh v7.4S, v9.4S, v19.s[3] +mul v9.4S, v9.4S,v18.s[3] +mla v9.4S, v7.4S, v31.s[0] +sub v7.4s, v16.4s, v9.4s +add v16.4s, v16.4s, v9.4s +str q16, [x0, #352] +str q7, [x0, #368] +sqrdmulh v7.4S, v5.4S, v19.s[2] +mul v5.4S, v5.4S,v18.s[2] +mla v5.4S, v7.4S, v31.s[0] +sub v7.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +str q6, [x0, #320] +str q7, [x0, #336] +ldr q3, [x0, #496] +ldr q2, [x0, #480] +ldr q1, [x0, #448] +ldr q12, [x0, #464] +ldr q7, [x0, #432] +ldr q6, [x0, #416] +ldr q5, [x0, #384] +ldr q16, [x0, #400] +sqrdmulh v9.4S, v3.4S, v20.s[0] +mul v3.4S, v3.4S,v8.s[0] +mla v3.4S, v9.4S, v31.s[0] +sub v9.4s, v7.4s, v3.4s +add v7.4s, v7.4s, v3.4s 
+sqrdmulh v3.4S, v2.4S, v20.s[0] +mul v2.4S, v2.4S,v8.s[0] +mla v2.4S, v3.4S, v31.s[0] +sub v3.4s, v6.4s, v2.4s +add v6.4s, v6.4s, v2.4s +sqrdmulh v2.4S, v1.4S, v20.s[0] +mul v1.4S, v1.4S,v8.s[0] +mla v1.4S, v2.4S, v31.s[0] +sub v2.4s, v5.4s, v1.4s +add v5.4s, v5.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v20.s[0] +mul v12.4S, v12.4S,v8.s[0] +mla v12.4S, v1.4S, v31.s[0] +sub v1.4s, v16.4s, v12.4s +add v16.4s, v16.4s, v12.4s +sqrdmulh v12.4S, v7.4S, v20.s[1] +mul v7.4S, v7.4S,v8.s[1] +mla v7.4S, v12.4S, v31.s[0] +sub v12.4s, v16.4s, v7.4s +add v16.4s, v16.4s, v7.4s +ldr q7, [x17, #+320] +ldr q14, [x17, #+336] +ldr q4, [x17, #+352] +ldr q10, [x17, #+368] +sqrdmulh v15.4S, v6.4S, v20.s[1] +mul v6.4S, v6.4S,v8.s[1] +mla v6.4S, v15.4S, v31.s[0] +sub v15.4s, v5.4s, v6.4s +add v5.4s, v5.4s, v6.4s +sqrdmulh v6.4S, v9.4S, v20.s[2] +mul v9.4S, v9.4S,v8.s[2] +mla v9.4S, v6.4S, v31.s[0] +sub v6.4s, v1.4s, v9.4s +add v1.4s, v1.4s, v9.4s +sqrdmulh v9.4S, v3.4S, v20.s[2] +mul v3.4S, v3.4S,v8.s[2] +mla v3.4S, v9.4S, v31.s[0] +sub v9.4s, v2.4s, v3.4s +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v16.4S, v11.s[0] +mul v16.4S, v16.4S,v0.s[0] +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v5.4s, v16.4s +add v5.4s, v5.4s, v16.4s +str q5, [x0, #384] +str q3, [x0, #400] +sqrdmulh v3.4S, v12.4S, v11.s[1] +mul v12.4S, v12.4S,v0.s[1] +mla v12.4S, v3.4S, v31.s[0] +sub v3.4s, v15.4s, v12.4s +add v15.4s, v15.4s, v12.4s +str q15, [x0, #416] +str q3, [x0, #432] +sqrdmulh v3.4S, v6.4S, v11.s[3] +mul v6.4S, v6.4S,v0.s[3] +mla v6.4S, v3.4S, v31.s[0] +sub v3.4s, v9.4s, v6.4s +add v9.4s, v9.4s, v6.4s +str q9, [x0, #480] +str q3, [x0, #496] +sqrdmulh v3.4S, v1.4S, v11.s[2] +mul v1.4S, v1.4S,v0.s[2] +mla v1.4S, v3.4S, v31.s[0] +sub v3.4s, v2.4s, v1.4s +add v2.4s, v2.4s, v1.4s +str q2, [x0, #448] +str q3, [x0, #464] +ldr q19, [x0, #624] +ldr q18, [x0, #608] +ldr q17, [x0, #576] +ldr q13, [x0, #592] +ldr q3, [x0, #560] +ldr q2, [x0, #544] +ldr q1, [x0, #512] +ldr q9, [x0, #528] +sqrdmulh v6.4S, v19.4S, v14.s[0] +mul 
v19.4S, v19.4S,v7.s[0] +mla v19.4S, v6.4S, v31.s[0] +sub v6.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v14.s[0] +mul v18.4S, v18.4S,v7.s[0] +mla v18.4S, v19.4S, v31.s[0] +sub v19.4s, v2.4s, v18.4s +add v2.4s, v2.4s, v18.4s +sqrdmulh v18.4S, v17.4S, v14.s[0] +mul v17.4S, v17.4S,v7.s[0] +mla v17.4S, v18.4S, v31.s[0] +sub v18.4s, v1.4s, v17.4s +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v13.4S, v14.s[0] +mul v13.4S, v13.4S,v7.s[0] +mla v13.4S, v17.4S, v31.s[0] +sub v17.4s, v9.4s, v13.4s +add v9.4s, v9.4s, v13.4s +sqrdmulh v13.4S, v3.4S, v14.s[1] +mul v3.4S, v3.4S,v7.s[1] +mla v3.4S, v13.4S, v31.s[0] +sub v13.4s, v9.4s, v3.4s +add v9.4s, v9.4s, v3.4s +ldr q3, [x17, #+384] +ldr q15, [x17, #+400] +ldr q12, [x17, #+416] +ldr q5, [x17, #+432] +sqrdmulh v16.4S, v2.4S, v14.s[1] +mul v2.4S, v2.4S,v7.s[1] +mla v2.4S, v16.4S, v31.s[0] +sub v16.4s, v1.4s, v2.4s +add v1.4s, v1.4s, v2.4s +sqrdmulh v2.4S, v6.4S, v14.s[2] +mul v6.4S, v6.4S,v7.s[2] +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v17.4s, v6.4s +add v17.4s, v17.4s, v6.4s +sqrdmulh v6.4S, v19.4S, v14.s[2] +mul v19.4S, v19.4S,v7.s[2] +mla v19.4S, v6.4S, v31.s[0] +sub v6.4s, v18.4s, v19.4s +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v9.4S, v10.s[0] +mul v9.4S, v9.4S,v4.s[0] +mla v9.4S, v19.4S, v31.s[0] +sub v19.4s, v1.4s, v9.4s +add v1.4s, v1.4s, v9.4s +str q1, [x0, #512] +str q19, [x0, #528] +sqrdmulh v19.4S, v13.4S, v10.s[1] +mul v13.4S, v13.4S,v4.s[1] +mla v13.4S, v19.4S, v31.s[0] +sub v19.4s, v16.4s, v13.4s +add v16.4s, v16.4s, v13.4s +str q16, [x0, #544] +str q19, [x0, #560] +sqrdmulh v19.4S, v2.4S, v10.s[3] +mul v2.4S, v2.4S,v4.s[3] +mla v2.4S, v19.4S, v31.s[0] +sub v19.4s, v6.4s, v2.4s +add v6.4s, v6.4s, v2.4s +str q6, [x0, #608] +str q19, [x0, #624] +sqrdmulh v19.4S, v17.4S, v10.s[2] +mul v17.4S, v17.4S,v4.s[2] +mla v17.4S, v19.4S, v31.s[0] +sub v19.4s, v18.4s, v17.4s +add v18.4s, v18.4s, v17.4s +str q18, [x0, #576] +str q19, [x0, #592] +ldr q11, [x0, #752] +ldr q0, [x0, #736] +ldr q20, [x0, 
#704] +ldr q8, [x0, #720] +ldr q19, [x0, #688] +ldr q18, [x0, #672] +ldr q17, [x0, #640] +ldr q6, [x0, #656] +sqrdmulh v2.4S, v11.4S, v15.s[0] +mul v11.4S, v11.4S,v3.s[0] +mla v11.4S, v2.4S, v31.s[0] +sub v2.4s, v19.4s, v11.4s +add v19.4s, v19.4s, v11.4s +sqrdmulh v11.4S, v0.4S, v15.s[0] +mul v0.4S, v0.4S,v3.s[0] +mla v0.4S, v11.4S, v31.s[0] +sub v11.4s, v18.4s, v0.4s +add v18.4s, v18.4s, v0.4s +sqrdmulh v0.4S, v20.4S, v15.s[0] +mul v20.4S, v20.4S,v3.s[0] +mla v20.4S, v0.4S, v31.s[0] +sub v0.4s, v17.4s, v20.4s +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v15.s[0] +mul v8.4S, v8.4S,v3.s[0] +mla v8.4S, v20.4S, v31.s[0] +sub v20.4s, v6.4s, v8.4s +add v6.4s, v6.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v15.s[1] +mul v19.4S, v19.4S,v3.s[1] +mla v19.4S, v8.4S, v31.s[0] +sub v8.4s, v6.4s, v19.4s +add v6.4s, v6.4s, v19.4s +ldr q19, [x17, #+448] +ldr q16, [x17, #+464] +ldr q13, [x17, #+480] +ldr q1, [x17, #+496] +sqrdmulh v9.4S, v18.4S, v15.s[1] +mul v18.4S, v18.4S,v3.s[1] +mla v18.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v18.4s +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v15.s[2] +mul v2.4S, v2.4S,v3.s[2] +mla v2.4S, v18.4S, v31.s[0] +sub v18.4s, v20.4s, v2.4s +add v20.4s, v20.4s, v2.4s +sqrdmulh v2.4S, v11.4S, v15.s[2] +mul v11.4S, v11.4S,v3.s[2] +mla v11.4S, v2.4S, v31.s[0] +sub v2.4s, v0.4s, v11.4s +add v0.4s, v0.4s, v11.4s +sqrdmulh v11.4S, v6.4S, v5.s[0] +mul v6.4S, v6.4S,v12.s[0] +mla v6.4S, v11.4S, v31.s[0] +sub v11.4s, v17.4s, v6.4s +add v17.4s, v17.4s, v6.4s +str q17, [x0, #640] +str q11, [x0, #656] +sqrdmulh v11.4S, v8.4S, v5.s[1] +mul v8.4S, v8.4S,v12.s[1] +mla v8.4S, v11.4S, v31.s[0] +sub v11.4s, v9.4s, v8.4s +add v9.4s, v9.4s, v8.4s +str q9, [x0, #672] +str q11, [x0, #688] +sqrdmulh v11.4S, v18.4S, v5.s[3] +mul v18.4S, v18.4S,v12.s[3] +mla v18.4S, v11.4S, v31.s[0] +sub v11.4s, v2.4s, v18.4s +add v2.4s, v2.4s, v18.4s +str q2, [x0, #736] +str q11, [x0, #752] +sqrdmulh v11.4S, v20.4S, v5.s[2] +mul v20.4S, v20.4S,v12.s[2] +mla v20.4S, v11.4S, v31.s[0] 
+sub v11.4s, v0.4s, v20.4s +add v0.4s, v0.4s, v20.4s +str q0, [x0, #704] +str q11, [x0, #720] +ldr q10, [x0, #880] +ldr q4, [x0, #864] +ldr q14, [x0, #832] +ldr q7, [x0, #848] +ldr q11, [x0, #816] +ldr q0, [x0, #800] +ldr q20, [x0, #768] +ldr q2, [x0, #784] +sqrdmulh v18.4S, v10.4S, v16.s[0] +mul v10.4S, v10.4S,v19.s[0] +mla v10.4S, v18.4S, v31.s[0] +sub v18.4s, v11.4s, v10.4s +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v4.4S, v16.s[0] +mul v4.4S, v4.4S,v19.s[0] +mla v4.4S, v10.4S, v31.s[0] +sub v10.4s, v0.4s, v4.4s +add v0.4s, v0.4s, v4.4s +sqrdmulh v4.4S, v14.4S, v16.s[0] +mul v14.4S, v14.4S,v19.s[0] +mla v14.4S, v4.4S, v31.s[0] +sub v4.4s, v20.4s, v14.4s +add v20.4s, v20.4s, v14.4s +sqrdmulh v14.4S, v7.4S, v16.s[0] +mul v7.4S, v7.4S,v19.s[0] +mla v7.4S, v14.4S, v31.s[0] +sub v14.4s, v2.4s, v7.4s +add v2.4s, v2.4s, v7.4s +sqrdmulh v7.4S, v11.4S, v16.s[1] +mul v11.4S, v11.4S,v19.s[1] +mla v11.4S, v7.4S, v31.s[0] +sub v7.4s, v2.4s, v11.4s +add v2.4s, v2.4s, v11.4s +ldr q11, [x17, #+512] +ldr q9, [x17, #+528] +ldr q8, [x17, #+544] +ldr q17, [x17, #+560] +sqrdmulh v6.4S, v0.4S, v16.s[1] +mul v0.4S, v0.4S,v19.s[1] +mla v0.4S, v6.4S, v31.s[0] +sub v6.4s, v20.4s, v0.4s +add v20.4s, v20.4s, v0.4s +sqrdmulh v0.4S, v18.4S, v16.s[2] +mul v18.4S, v18.4S,v19.s[2] +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v14.4s, v18.4s +add v14.4s, v14.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v16.s[2] +mul v10.4S, v10.4S,v19.s[2] +mla v10.4S, v18.4S, v31.s[0] +sub v18.4s, v4.4s, v10.4s +add v4.4s, v4.4s, v10.4s +sqrdmulh v10.4S, v2.4S, v1.s[0] +mul v2.4S, v2.4S,v13.s[0] +mla v2.4S, v10.4S, v31.s[0] +sub v10.4s, v20.4s, v2.4s +add v20.4s, v20.4s, v2.4s +str q20, [x0, #768] +str q10, [x0, #784] +sqrdmulh v10.4S, v7.4S, v1.s[1] +mul v7.4S, v7.4S,v13.s[1] +mla v7.4S, v10.4S, v31.s[0] +sub v10.4s, v6.4s, v7.4s +add v6.4s, v6.4s, v7.4s +str q6, [x0, #800] +str q10, [x0, #816] +sqrdmulh v10.4S, v0.4S, v1.s[3] +mul v0.4S, v0.4S,v13.s[3] +mla v0.4S, v10.4S, v31.s[0] +sub v10.4s, v18.4s, v0.4s +add 
v18.4s, v18.4s, v0.4s +str q18, [x0, #864] +str q10, [x0, #880] +sqrdmulh v10.4S, v14.4S, v1.s[2] +mul v14.4S, v14.4S,v13.s[2] +mla v14.4S, v10.4S, v31.s[0] +sub v10.4s, v4.4s, v14.4s +add v4.4s, v4.4s, v14.4s +str q4, [x0, #832] +str q10, [x0, #848] +ldr q5, [x0, #1008] +ldr q12, [x0, #992] +ldr q15, [x0, #960] +ldr q3, [x0, #976] +ldr q10, [x0, #944] +ldr q4, [x0, #928] +ldr q14, [x0, #896] +ldr q18, [x0, #912] +sqrdmulh v0.4S, v5.4S, v9.s[0] +mul v5.4S, v5.4S,v11.s[0] +mla v5.4S, v0.4S, v31.s[0] +sub v0.4s, v10.4s, v5.4s +add v10.4s, v10.4s, v5.4s +sqrdmulh v5.4S, v12.4S, v9.s[0] +mul v12.4S, v12.4S,v11.s[0] +mla v12.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v12.4s +add v4.4s, v4.4s, v12.4s +sqrdmulh v12.4S, v15.4S, v9.s[0] +mul v15.4S, v15.4S,v11.s[0] +mla v15.4S, v12.4S, v31.s[0] +sub v12.4s, v14.4s, v15.4s +add v14.4s, v14.4s, v15.4s +sqrdmulh v15.4S, v3.4S, v9.s[0] +mul v3.4S, v3.4S,v11.s[0] +mla v3.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v9.s[1] +mul v10.4S, v10.4S,v11.s[1] +mla v10.4S, v3.4S, v31.s[0] +sub v3.4s, v18.4s, v10.4s +add v18.4s, v18.4s, v10.4s +sqrdmulh v10.4S, v4.4S, v9.s[1] +mul v4.4S, v4.4S,v11.s[1] +mla v4.4S, v10.4S, v31.s[0] +sub v10.4s, v14.4s, v4.4s +add v14.4s, v14.4s, v4.4s +sqrdmulh v4.4S, v0.4S, v9.s[2] +mul v0.4S, v0.4S,v11.s[2] +mla v0.4S, v4.4S, v31.s[0] +sub v4.4s, v15.4s, v0.4s +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v5.4S, v9.s[2] +mul v5.4S, v5.4S,v11.s[2] +mla v5.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v5.4s +add v12.4s, v12.4s, v5.4s +sqrdmulh v5.4S, v18.4S, v17.s[0] +mul v18.4S, v18.4S,v8.s[0] +mla v18.4S, v5.4S, v31.s[0] +sub v5.4s, v14.4s, v18.4s +add v14.4s, v14.4s, v18.4s +str q14, [x0, #896] +str q5, [x0, #912] +sqrdmulh v5.4S, v3.4S, v17.s[1] +mul v3.4S, v3.4S,v8.s[1] +mla v3.4S, v5.4S, v31.s[0] +sub v5.4s, v10.4s, v3.4s +add v10.4s, v10.4s, v3.4s +str q10, [x0, #928] +str q5, [x0, #944] +sqrdmulh v5.4S, v4.4S, v17.s[3] +mul v4.4S, v4.4S,v8.s[3] +mla v4.4S, 
v5.4S, v31.s[0] +sub v5.4s, v0.4s, v4.4s +add v0.4s, v0.4s, v4.4s +str q0, [x0, #992] +str q5, [x0, #1008] +sqrdmulh v5.4S, v15.4S, v17.s[2] +mul v15.4S, v15.4S,v8.s[2] +mla v15.4S, v5.4S, v31.s[0] +sub v5.4s, v12.4s, v15.4s +add v12.4s, v12.4s, v15.4s +str q12, [x0, #960] +str q5, [x0, #976] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1444 +// Instruction count: 1440 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_1.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_1.s new file mode 100644 index 0000000..bef74a7 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_1.s @@ -0,0 +1,1474 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 23825509 // Layer 4, block 0 +.word 27028662 // Layer 4, block 1 +.word 0 // Layer None, block None +.word 1307297022 // Layer 3, block 0 +.word 1524716204 // Layer 4, block 0 +.word 1729702351 // Layer 4, block 1 +.word 0 // Layer None, block None +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 14626653 // Layer 3, block 1 +.word 14833295 // Layer 4, block 2 +.word 2138810 // Layer 4, block 3 +.word 0 // Layer None, block None +.word 936034350 // Layer 3, block 1 +.word 949258429 // Layer 4, block 2 +.word 136873393 // Layer 4, block 3 +.word 0 // Layer None, block None +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 5705868 
// Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 29737761 // Layer 3, block 2 +.word 6490403 // Layer 4, block 4 +.word 19648405 // Layer 4, block 5 +.word 0 // Layer None, block None +.word 1903071454 // Layer 3, block 2 +.word 415354091 // Layer 4, block 4 +.word 1257401950 // Layer 4, block 5 +.word 0 // Layer None, block None +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 30285189 // Layer 3, block 3 +.word 31254932 // Layer 4, block 6 +.word 26362414 // Layer 4, block 7 +.word 0 // Layer None, block None +.word 1938104173 // Layer 3, block 3 +.word 2000162988 // Layer 4, block 6 +.word 1687065733 // Layer 4, block 7 +.word 0 // Layer None, block None +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 21289485 // Layer 3, block 4 +.word 572895 // Layer 4, block 8 +.word 26691971 // Layer 4, block 9 +.word 0 // Layer None, block None +.word 1362423055 // Layer 3, block 4 +.word 36662482 // Layer 4, block 8 +.word 1708155771 // Layer 4, block 9 +.word 0 // Layer None, block None +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 9914896 // Layer 
3, block 5 +.word 9249292 // Layer 4, block 10 +.word 29292862 // Layer 4, block 11 +.word 0 // Layer None, block None +.word 634504916 // Layer 3, block 5 +.word 591909511 // Layer 4, block 10 +.word 1874600091 // Layer 4, block 11 +.word 0 // Layer None, block None +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 22603682 // Layer 3, block 6 +.word 8247799 // Layer 4, block 12 +.word 5086187 // Layer 4, block 13 +.word 0 // Layer None, block None +.word 1446525244 // Layer 3, block 6 +.word 527818851 // Layer 4, block 12 +.word 325491125 // Layer 4, block 13 +.word 0 // Layer None, block None +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 16204162 // Layer 3, block 7 +.word 28113639 // Layer 4, block 14 +.word 8471290 // Layer 4, block 15 +.word 0 // Layer None, block None +.word 1036987221 // Layer 3, block 7 +.word 1799135579 // Layer 4, block 14 +.word 542121183 // Layer 4, block 15 +.word 0 // Layer None, block None +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.text +.global ntt_u32_incomplete_neon_asm_var_3_3_1 +.global _ntt_u32_incomplete_neon_asm_var_3_3_1 +ntt_u32_incomplete_neon_asm_var_3_3_1: +_ntt_u32_incomplete_neon_asm_var_3_3_1: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] 
+stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x0, #960] +ldr q25, [x0, #832] +sqrdmulh v24.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +ldr q23, [x0, #576] +ldr q22, [x0, #704] +sqrdmulh v21.4S, v25.4S, v29.s[0] +mla v26.4S, v24.4S, v31.s[0] +mul v25.4S, v25.4S,v30.s[0] +ldr q24, [x0, #448] +ldr q20, [x0, #320] +sqrdmulh v19.4S, v23.4S, v29.s[0] +sub v18.4s, v24.4s, v26.4s +mla v25.4S, v21.4S, v31.s[0] +mul v23.4S, v23.4S,v30.s[0] +add v24.4s, v24.4s, v26.4s +ldr q26, [x0, #64] +ldr q21, [x0, #192] +sqrdmulh v17.4S, v22.4S, v29.s[0] +sub v16.4s, v20.4s, v25.4s +mla v23.4S, v19.4S, v31.s[0] +mul v22.4S, v22.4S,v30.s[0] +add v20.4s, v20.4s, v25.4s +sqrdmulh v25.4S, v24.4S, v29.s[1] +sub v19.4s, v26.4s, v23.4s +mla v22.4S, v17.4S, v31.s[0] +mul v24.4S, v24.4S,v30.s[1] +add v26.4s, v26.4s, v23.4s +sqrdmulh v23.4S, v20.4S, v29.s[1] +sub v17.4s, v21.4s, v22.4s +mla v24.4S, v25.4S, v31.s[0] +mul v20.4S, v20.4S,v30.s[1] +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v18.4S, v29.s[2] +sub v25.4s, v21.4s, v24.4s +mla v20.4S, v23.4S, v31.s[0] +mul v18.4S, v18.4S,v30.s[2] +add v21.4s, v21.4s, v24.4s +sqrdmulh v24.4S, v16.4S, v29.s[2] +sub v23.4s, v26.4s, v20.4s +mla v18.4S, v22.4S, v31.s[0] +mul v16.4S, v16.4S,v30.s[2] +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v21.4S, v27.s[0] +sub v22.4s, v17.4s, v18.4s +mla v16.4S, v24.4S, v31.s[0] +mul v21.4S, v21.4S,v28.s[0] +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v25.4S, v27.s[1] +sub v24.4s, v19.4s, v16.4s +mla v21.4S, v20.4S, v31.s[0] +mul v25.4S, v25.4S,v28.s[1] +add v19.4s, v19.4s, v16.4s 
+sqrdmulh v16.4S, v22.4S, v27.s[3] +sub v20.4s, v26.4s, v21.4s +mla v25.4S, v18.4S, v31.s[0] +mul v22.4S, v22.4S,v28.s[3] +add v26.4s, v26.4s, v21.4s +str q26, [x0, #64] +str q20, [x0, #192] +sqrdmulh v20.4S, v17.4S, v27.s[2] +sub v26.4s, v23.4s, v25.4s +mla v22.4S, v16.4S, v31.s[0] +mul v17.4S, v17.4S,v28.s[2] +add v23.4s, v23.4s, v25.4s +str q23, [x0, #320] +str q26, [x0, #448] +ldr q26, [x0, #976] +ldr q23, [x0, #848] +sqrdmulh v25.4S, v26.4S, v29.s[0] +sub v16.4s, v24.4s, v22.4s +mla v17.4S, v20.4S, v31.s[0] +mul v26.4S, v26.4S,v30.s[0] +add v24.4s, v24.4s, v22.4s +str q24, [x0, #832] +str q16, [x0, #960] +ldr q16, [x0, #592] +ldr q24, [x0, #720] +sqrdmulh v22.4S, v23.4S, v29.s[0] +sub v20.4s, v19.4s, v17.4s +mla v26.4S, v25.4S, v31.s[0] +mul v23.4S, v23.4S,v30.s[0] +add v19.4s, v19.4s, v17.4s +str q19, [x0, #576] +str q20, [x0, #704] +ldr q20, [x0, #464] +ldr q19, [x0, #336] +sqrdmulh v17.4S, v16.4S, v29.s[0] +sub v25.4s, v20.4s, v26.4s +mla v23.4S, v22.4S, v31.s[0] +mul v16.4S, v16.4S,v30.s[0] +add v20.4s, v20.4s, v26.4s +ldr q26, [x0, #80] +ldr q22, [x0, #208] +sqrdmulh v21.4S, v24.4S, v29.s[0] +sub v18.4s, v19.4s, v23.4s +mla v16.4S, v17.4S, v31.s[0] +mul v24.4S, v24.4S,v30.s[0] +add v19.4s, v19.4s, v23.4s +sqrdmulh v23.4S, v20.4S, v29.s[1] +sub v17.4s, v26.4s, v16.4s +mla v24.4S, v21.4S, v31.s[0] +mul v20.4S, v20.4S,v30.s[1] +add v26.4s, v26.4s, v16.4s +sqrdmulh v16.4S, v19.4S, v29.s[1] +sub v21.4s, v22.4s, v24.4s +mla v20.4S, v23.4S, v31.s[0] +mul v19.4S, v19.4S,v30.s[1] +add v22.4s, v22.4s, v24.4s +sqrdmulh v24.4S, v25.4S, v29.s[2] +sub v23.4s, v22.4s, v20.4s +mla v19.4S, v16.4S, v31.s[0] +mul v25.4S, v25.4S,v30.s[2] +add v22.4s, v22.4s, v20.4s +sqrdmulh v20.4S, v18.4S, v29.s[2] +sub v16.4s, v26.4s, v19.4s +mla v25.4S, v24.4S, v31.s[0] +mul v18.4S, v18.4S,v30.s[2] +add v26.4s, v26.4s, v19.4s +sqrdmulh v19.4S, v22.4S, v27.s[0] +sub v24.4s, v21.4s, v25.4s +mla v18.4S, v20.4S, v31.s[0] +mul v22.4S, v22.4S,v28.s[0] +add v21.4s, v21.4s, v25.4s +sqrdmulh 
v25.4S, v23.4S, v27.s[1] +sub v20.4s, v17.4s, v18.4s +mla v22.4S, v19.4S, v31.s[0] +mul v23.4S, v23.4S,v28.s[1] +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v24.4S, v27.s[3] +sub v19.4s, v26.4s, v22.4s +mla v23.4S, v25.4S, v31.s[0] +mul v24.4S, v24.4S,v28.s[3] +add v26.4s, v26.4s, v22.4s +str q26, [x0, #80] +str q19, [x0, #208] +sqrdmulh v19.4S, v21.4S, v27.s[2] +sub v26.4s, v16.4s, v23.4s +mla v24.4S, v18.4S, v31.s[0] +mul v21.4S, v21.4S,v28.s[2] +add v16.4s, v16.4s, v23.4s +str q16, [x0, #336] +str q26, [x0, #464] +ldr q26, [x0, #992] +ldr q16, [x0, #864] +sqrdmulh v23.4S, v26.4S, v29.s[0] +sub v18.4s, v20.4s, v24.4s +mla v21.4S, v19.4S, v31.s[0] +mul v26.4S, v26.4S,v30.s[0] +add v20.4s, v20.4s, v24.4s +str q20, [x0, #848] +str q18, [x0, #976] +ldr q18, [x0, #608] +ldr q20, [x0, #736] +sqrdmulh v24.4S, v16.4S, v29.s[0] +sub v19.4s, v17.4s, v21.4s +mla v26.4S, v23.4S, v31.s[0] +mul v16.4S, v16.4S,v30.s[0] +add v17.4s, v17.4s, v21.4s +str q17, [x0, #592] +str q19, [x0, #720] +ldr q19, [x0, #480] +ldr q17, [x0, #352] +sqrdmulh v21.4S, v18.4S, v29.s[0] +sub v23.4s, v19.4s, v26.4s +mla v16.4S, v24.4S, v31.s[0] +mul v18.4S, v18.4S,v30.s[0] +add v19.4s, v19.4s, v26.4s +ldr q26, [x0, #96] +ldr q24, [x0, #224] +sqrdmulh v22.4S, v20.4S, v29.s[0] +sub v25.4s, v17.4s, v16.4s +mla v18.4S, v21.4S, v31.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v19.4S, v29.s[1] +sub v21.4s, v26.4s, v18.4s +mla v20.4S, v22.4S, v31.s[0] +mul v19.4S, v19.4S,v30.s[1] +add v26.4s, v26.4s, v18.4s +sqrdmulh v18.4S, v17.4S, v29.s[1] +sub v22.4s, v24.4s, v20.4s +mla v19.4S, v16.4S, v31.s[0] +mul v17.4S, v17.4S,v30.s[1] +add v24.4s, v24.4s, v20.4s +sqrdmulh v20.4S, v23.4S, v29.s[2] +sub v16.4s, v24.4s, v19.4s +mla v17.4S, v18.4S, v31.s[0] +mul v23.4S, v23.4S,v30.s[2] +add v24.4s, v24.4s, v19.4s +sqrdmulh v19.4S, v25.4S, v29.s[2] +sub v18.4s, v26.4s, v17.4s +mla v23.4S, v20.4S, v31.s[0] +mul v25.4S, v25.4S,v30.s[2] +add v26.4s, v26.4s, v17.4s +sqrdmulh v17.4S, v24.4S, 
v27.s[0] +sub v20.4s, v22.4s, v23.4s +mla v25.4S, v19.4S, v31.s[0] +mul v24.4S, v24.4S,v28.s[0] +add v22.4s, v22.4s, v23.4s +sqrdmulh v23.4S, v16.4S, v27.s[1] +sub v19.4s, v21.4s, v25.4s +mla v24.4S, v17.4S, v31.s[0] +mul v16.4S, v16.4S,v28.s[1] +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v20.4S, v27.s[3] +sub v17.4s, v26.4s, v24.4s +mla v16.4S, v23.4S, v31.s[0] +mul v20.4S, v20.4S,v28.s[3] +add v26.4s, v26.4s, v24.4s +str q26, [x0, #96] +str q17, [x0, #224] +sqrdmulh v17.4S, v22.4S, v27.s[2] +sub v26.4s, v18.4s, v16.4s +mla v20.4S, v25.4S, v31.s[0] +mul v22.4S, v22.4S,v28.s[2] +add v18.4s, v18.4s, v16.4s +str q18, [x0, #352] +str q26, [x0, #480] +ldr q26, [x0, #1008] +ldr q18, [x0, #880] +sqrdmulh v16.4S, v26.4S, v29.s[0] +sub v25.4s, v19.4s, v20.4s +mla v22.4S, v17.4S, v31.s[0] +mul v26.4S, v26.4S,v30.s[0] +add v19.4s, v19.4s, v20.4s +str q19, [x0, #864] +str q25, [x0, #992] +ldr q25, [x0, #624] +ldr q19, [x0, #752] +sqrdmulh v20.4S, v18.4S, v29.s[0] +sub v17.4s, v21.4s, v22.4s +mla v26.4S, v16.4S, v31.s[0] +mul v18.4S, v18.4S,v30.s[0] +add v21.4s, v21.4s, v22.4s +str q21, [x0, #608] +str q17, [x0, #736] +ldr q17, [x0, #496] +ldr q21, [x0, #368] +sqrdmulh v22.4S, v25.4S, v29.s[0] +sub v16.4s, v17.4s, v26.4s +mla v18.4S, v20.4S, v31.s[0] +mul v25.4S, v25.4S,v30.s[0] +add v17.4s, v17.4s, v26.4s +ldr q26, [x0, #112] +ldr q20, [x0, #240] +sqrdmulh v24.4S, v19.4S, v29.s[0] +sub v23.4s, v21.4s, v18.4s +mla v25.4S, v22.4S, v31.s[0] +mul v19.4S, v19.4S,v30.s[0] +add v21.4s, v21.4s, v18.4s +sqrdmulh v18.4S, v17.4S, v29.s[1] +sub v22.4s, v26.4s, v25.4s +mla v19.4S, v24.4S, v31.s[0] +mul v17.4S, v17.4S,v30.s[1] +add v26.4s, v26.4s, v25.4s +sqrdmulh v25.4S, v21.4S, v29.s[1] +sub v24.4s, v20.4s, v19.4s +mla v17.4S, v18.4S, v31.s[0] +mul v21.4S, v21.4S,v30.s[1] +add v20.4s, v20.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v29.s[2] +sub v18.4s, v20.4s, v17.4s +mla v21.4S, v25.4S, v31.s[0] +mul v16.4S, v16.4S,v30.s[2] +add v20.4s, v20.4s, v17.4s +sqrdmulh v17.4S, v23.4S, v29.s[2] +sub 
v25.4s, v26.4s, v21.4s +mla v16.4S, v19.4S, v31.s[0] +mul v23.4S, v23.4S,v30.s[2] +add v26.4s, v26.4s, v21.4s +sqrdmulh v21.4S, v20.4S, v27.s[0] +sub v19.4s, v24.4s, v16.4s +mla v23.4S, v17.4S, v31.s[0] +mul v20.4S, v20.4S,v28.s[0] +add v24.4s, v24.4s, v16.4s +sqrdmulh v16.4S, v18.4S, v27.s[1] +sub v17.4s, v22.4s, v23.4s +mla v20.4S, v21.4S, v31.s[0] +mul v18.4S, v18.4S,v28.s[1] +add v22.4s, v22.4s, v23.4s +sqrdmulh v23.4S, v19.4S, v27.s[3] +sub v21.4s, v26.4s, v20.4s +mla v18.4S, v16.4S, v31.s[0] +mul v19.4S, v19.4S,v28.s[3] +add v26.4s, v26.4s, v20.4s +str q26, [x0, #112] +str q21, [x0, #240] +sqrdmulh v21.4S, v24.4S, v27.s[2] +sub v26.4s, v25.4s, v18.4s +mla v19.4S, v23.4S, v31.s[0] +mul v24.4S, v24.4S,v28.s[2] +add v25.4s, v25.4s, v18.4s +str q25, [x0, #368] +str q26, [x0, #496] +ldr q26, [x0, #896] +ldr q25, [x0, #768] +sqrdmulh v18.4S, v26.4S, v29.s[0] +sub v23.4s, v17.4s, v19.4s +mla v24.4S, v21.4S, v31.s[0] +mul v26.4S, v26.4S,v30.s[0] +add v17.4s, v17.4s, v19.4s +str q17, [x0, #880] +str q23, [x0, #1008] +ldr q23, [x0, #512] +ldr q17, [x0, #640] +sqrdmulh v19.4S, v25.4S, v29.s[0] +sub v21.4s, v22.4s, v24.4s +mla v26.4S, v18.4S, v31.s[0] +mul v25.4S, v25.4S,v30.s[0] +add v22.4s, v22.4s, v24.4s +str q22, [x0, #624] +str q21, [x0, #752] +ldr q21, [x0, #384] +ldr q22, [x0, #256] +sqrdmulh v24.4S, v23.4S, v29.s[0] +sub v18.4s, v21.4s, v26.4s +mla v25.4S, v19.4S, v31.s[0] +mul v23.4S, v23.4S,v30.s[0] +add v21.4s, v21.4s, v26.4s +ldr q26, [x0, #0] +ldr q19, [x0, #128] +sqrdmulh v20.4S, v17.4S, v29.s[0] +sub v16.4s, v22.4s, v25.4s +mla v23.4S, v24.4S, v31.s[0] +mul v17.4S, v17.4S,v30.s[0] +add v22.4s, v22.4s, v25.4s +sqrdmulh v25.4S, v21.4S, v29.s[1] +sub v24.4s, v26.4s, v23.4s +mla v17.4S, v20.4S, v31.s[0] +mul v21.4S, v21.4S,v30.s[1] +add v26.4s, v26.4s, v23.4s +sqrdmulh v23.4S, v22.4S, v29.s[1] +sub v20.4s, v19.4s, v17.4s +mla v21.4S, v25.4S, v31.s[0] +mul v22.4S, v22.4S,v30.s[1] +add v19.4s, v19.4s, v17.4s +sqrdmulh v17.4S, v18.4S, v29.s[2] +sub v25.4s, 
v19.4s, v21.4s +mla v22.4S, v23.4S, v31.s[0] +mul v18.4S, v18.4S,v30.s[2] +add v19.4s, v19.4s, v21.4s +sqrdmulh v21.4S, v16.4S, v29.s[2] +sub v23.4s, v26.4s, v22.4s +mla v18.4S, v17.4S, v31.s[0] +mul v16.4S, v16.4S,v30.s[2] +add v26.4s, v26.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v27.s[0] +sub v17.4s, v20.4s, v18.4s +mla v16.4S, v21.4S, v31.s[0] +mul v19.4S, v19.4S,v28.s[0] +add v20.4s, v20.4s, v18.4s +sqrdmulh v18.4S, v25.4S, v27.s[1] +sub v21.4s, v24.4s, v16.4s +mla v19.4S, v22.4S, v31.s[0] +mul v25.4S, v25.4S,v28.s[1] +add v24.4s, v24.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v27.s[3] +sub v22.4s, v26.4s, v19.4s +mla v25.4S, v18.4S, v31.s[0] +mul v17.4S, v17.4S,v28.s[3] +add v26.4s, v26.4s, v19.4s +str q26, [x0, #0] +str q22, [x0, #128] +sqrdmulh v22.4S, v20.4S, v27.s[2] +sub v26.4s, v23.4s, v25.4s +mla v17.4S, v16.4S, v31.s[0] +mul v20.4S, v20.4S,v28.s[2] +add v23.4s, v23.4s, v25.4s +str q23, [x0, #256] +str q26, [x0, #384] +ldr q26, [x0, #912] +ldr q23, [x0, #784] +sqrdmulh v25.4S, v26.4S, v29.s[0] +sub v16.4s, v21.4s, v17.4s +mla v20.4S, v22.4S, v31.s[0] +mul v26.4S, v26.4S,v30.s[0] +add v21.4s, v21.4s, v17.4s +str q21, [x0, #768] +str q16, [x0, #896] +ldr q16, [x0, #528] +ldr q21, [x0, #656] +sqrdmulh v17.4S, v23.4S, v29.s[0] +sub v22.4s, v24.4s, v20.4s +mla v26.4S, v25.4S, v31.s[0] +mul v23.4S, v23.4S,v30.s[0] +add v24.4s, v24.4s, v20.4s +str q24, [x0, #512] +str q22, [x0, #640] +ldr q22, [x0, #400] +ldr q24, [x0, #272] +sqrdmulh v20.4S, v16.4S, v29.s[0] +sub v25.4s, v22.4s, v26.4s +mla v23.4S, v17.4S, v31.s[0] +mul v16.4S, v16.4S,v30.s[0] +add v22.4s, v22.4s, v26.4s +ldr q26, [x0, #16] +ldr q17, [x0, #144] +sqrdmulh v19.4S, v21.4S, v29.s[0] +sub v18.4s, v24.4s, v23.4s +mla v16.4S, v20.4S, v31.s[0] +mul v21.4S, v21.4S,v30.s[0] +add v24.4s, v24.4s, v23.4s +sqrdmulh v23.4S, v22.4S, v29.s[1] +sub v20.4s, v26.4s, v16.4s +mla v21.4S, v19.4S, v31.s[0] +mul v22.4S, v22.4S,v30.s[1] +add v26.4s, v26.4s, v16.4s +sqrdmulh v16.4S, v24.4S, v29.s[1] +sub v19.4s, v17.4s, v21.4s 
+mla v22.4S, v23.4S, v31.s[0] +mul v24.4S, v24.4S,v30.s[1] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v25.4S, v29.s[2] +sub v23.4s, v17.4s, v22.4s +mla v24.4S, v16.4S, v31.s[0] +mul v25.4S, v25.4S,v30.s[2] +add v17.4s, v17.4s, v22.4s +sqrdmulh v22.4S, v18.4S, v29.s[2] +sub v16.4s, v26.4s, v24.4s +mla v25.4S, v21.4S, v31.s[0] +mul v18.4S, v18.4S,v30.s[2] +add v26.4s, v26.4s, v24.4s +sqrdmulh v24.4S, v17.4S, v27.s[0] +sub v21.4s, v19.4s, v25.4s +mla v18.4S, v22.4S, v31.s[0] +mul v17.4S, v17.4S,v28.s[0] +add v19.4s, v19.4s, v25.4s +sqrdmulh v25.4S, v23.4S, v27.s[1] +sub v22.4s, v20.4s, v18.4s +mla v17.4S, v24.4S, v31.s[0] +mul v23.4S, v23.4S,v28.s[1] +add v20.4s, v20.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v27.s[3] +sub v24.4s, v26.4s, v17.4s +mla v23.4S, v25.4S, v31.s[0] +mul v21.4S, v21.4S,v28.s[3] +add v26.4s, v26.4s, v17.4s +str q26, [x0, #16] +str q24, [x0, #144] +sqrdmulh v24.4S, v19.4S, v27.s[2] +sub v26.4s, v16.4s, v23.4s +mla v21.4S, v18.4S, v31.s[0] +mul v19.4S, v19.4S,v28.s[2] +add v16.4s, v16.4s, v23.4s +str q16, [x0, #272] +str q26, [x0, #400] +ldr q26, [x0, #928] +ldr q16, [x0, #800] +sqrdmulh v23.4S, v26.4S, v29.s[0] +sub v18.4s, v22.4s, v21.4s +mla v19.4S, v24.4S, v31.s[0] +mul v26.4S, v26.4S,v30.s[0] +add v22.4s, v22.4s, v21.4s +str q22, [x0, #784] +str q18, [x0, #912] +ldr q18, [x0, #544] +ldr q22, [x0, #672] +sqrdmulh v21.4S, v16.4S, v29.s[0] +sub v24.4s, v20.4s, v19.4s +mla v26.4S, v23.4S, v31.s[0] +mul v16.4S, v16.4S,v30.s[0] +add v20.4s, v20.4s, v19.4s +str q20, [x0, #528] +str q24, [x0, #656] +ldr q24, [x0, #416] +ldr q20, [x0, #288] +sqrdmulh v19.4S, v18.4S, v29.s[0] +sub v23.4s, v24.4s, v26.4s +mla v16.4S, v21.4S, v31.s[0] +mul v18.4S, v18.4S,v30.s[0] +add v24.4s, v24.4s, v26.4s +ldr q26, [x0, #32] +ldr q21, [x0, #160] +sqrdmulh v17.4S, v22.4S, v29.s[0] +sub v25.4s, v20.4s, v16.4s +mla v18.4S, v19.4S, v31.s[0] +mul v22.4S, v22.4S,v30.s[0] +add v20.4s, v20.4s, v16.4s +sqrdmulh v16.4S, v24.4S, v29.s[1] +sub v19.4s, v26.4s, v18.4s +mla v22.4S, 
v17.4S, v31.s[0] +mul v24.4S, v24.4S,v30.s[1] +add v26.4s, v26.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[1] +sub v17.4s, v21.4s, v22.4s +mla v24.4S, v16.4S, v31.s[0] +mul v20.4S, v20.4S,v30.s[1] +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v23.4S, v29.s[2] +sub v16.4s, v21.4s, v24.4s +mla v20.4S, v18.4S, v31.s[0] +mul v23.4S, v23.4S,v30.s[2] +add v21.4s, v21.4s, v24.4s +sqrdmulh v24.4S, v25.4S, v29.s[2] +sub v18.4s, v26.4s, v20.4s +mla v23.4S, v22.4S, v31.s[0] +mul v25.4S, v25.4S,v30.s[2] +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v21.4S, v27.s[0] +sub v22.4s, v17.4s, v23.4s +mla v25.4S, v24.4S, v31.s[0] +mul v21.4S, v21.4S,v28.s[0] +add v17.4s, v17.4s, v23.4s +sqrdmulh v23.4S, v16.4S, v27.s[1] +sub v24.4s, v19.4s, v25.4s +mla v21.4S, v20.4S, v31.s[0] +mul v16.4S, v16.4S,v28.s[1] +add v19.4s, v19.4s, v25.4s +sqrdmulh v25.4S, v22.4S, v27.s[3] +sub v20.4s, v26.4s, v21.4s +mla v16.4S, v23.4S, v31.s[0] +mul v22.4S, v22.4S,v28.s[3] +add v26.4s, v26.4s, v21.4s +str q26, [x0, #32] +str q20, [x0, #160] +sqrdmulh v20.4S, v17.4S, v27.s[2] +sub v26.4s, v18.4s, v16.4s +mla v22.4S, v25.4S, v31.s[0] +mul v17.4S, v17.4S,v28.s[2] +add v18.4s, v18.4s, v16.4s +str q18, [x0, #288] +str q26, [x0, #416] +ldr q26, [x0, #944] +ldr q18, [x0, #816] +sqrdmulh v16.4S, v26.4S, v29.s[0] +sub v25.4s, v24.4s, v22.4s +mla v17.4S, v20.4S, v31.s[0] +mul v26.4S, v26.4S,v30.s[0] +add v24.4s, v24.4s, v22.4s +str q24, [x0, #800] +str q25, [x0, #928] +ldr q25, [x0, #560] +ldr q24, [x0, #688] +sqrdmulh v22.4S, v18.4S, v29.s[0] +sub v20.4s, v19.4s, v17.4s +mla v26.4S, v16.4S, v31.s[0] +mul v18.4S, v18.4S,v30.s[0] +add v19.4s, v19.4s, v17.4s +str q19, [x0, #544] +str q20, [x0, #672] +ldr q20, [x0, #432] +ldr q19, [x0, #304] +sqrdmulh v17.4S, v25.4S, v29.s[0] +sub v16.4s, v20.4s, v26.4s +mla v18.4S, v22.4S, v31.s[0] +mul v25.4S, v25.4S,v30.s[0] +add v20.4s, v20.4s, v26.4s +ldr q26, [x0, #48] +ldr q22, [x0, #176] +sqrdmulh v21.4S, v24.4S, v29.s[0] +sub v23.4s, v19.4s, v18.4s +mla v25.4S, v17.4S, 
v31.s[0] +mul v24.4S, v24.4S,v30.s[0] +add v19.4s, v19.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[1] +sub v17.4s, v26.4s, v25.4s +mla v24.4S, v21.4S, v31.s[0] +mul v20.4S, v20.4S,v30.s[1] +add v26.4s, v26.4s, v25.4s +sqrdmulh v25.4S, v19.4S, v29.s[1] +sub v21.4s, v22.4s, v24.4s +mla v20.4S, v18.4S, v31.s[0] +mul v19.4S, v19.4S,v30.s[1] +add v22.4s, v22.4s, v24.4s +sqrdmulh v24.4S, v16.4S, v29.s[2] +sub v18.4s, v22.4s, v20.4s +mla v19.4S, v25.4S, v31.s[0] +mul v16.4S, v16.4S,v30.s[2] +add v22.4s, v22.4s, v20.4s +sqrdmulh v20.4S, v23.4S, v29.s[2] +sub v25.4s, v26.4s, v19.4s +mla v16.4S, v24.4S, v31.s[0] +mul v23.4S, v23.4S,v30.s[2] +add v26.4s, v26.4s, v19.4s +sqrdmulh v19.4S, v22.4S, v27.s[0] +sub v24.4s, v21.4s, v16.4s +mla v23.4S, v20.4S, v31.s[0] +mul v22.4S, v22.4S,v28.s[0] +add v21.4s, v21.4s, v16.4s +sqrdmulh v16.4S, v18.4S, v27.s[1] +sub v20.4s, v17.4s, v23.4s +mla v22.4S, v19.4S, v31.s[0] +mul v18.4S, v18.4S,v28.s[1] +add v17.4s, v17.4s, v23.4s +sqrdmulh v23.4S, v24.4S, v27.s[3] +sub v19.4s, v26.4s, v22.4s +mla v18.4S, v16.4S, v31.s[0] +mul v24.4S, v24.4S,v28.s[3] +add v26.4s, v26.4s, v22.4s +str q26, [x0, #48] +str q19, [x0, #176] +sqrdmulh v19.4S, v21.4S, v27.s[2] +sub v26.4s, v25.4s, v18.4s +mla v24.4S, v23.4S, v31.s[0] +mul v21.4S, v21.4S,v28.s[2] +add v25.4s, v25.4s, v18.4s +str q25, [x0, #304] +str q26, [x0, #432] +sub v26.4s, v20.4s, v24.4s +mla v21.4S, v19.4S, v31.s[0] +add v20.4s, v20.4s, v24.4s +str q20, [x0, #816] +str q26, [x0, #944] +sub v26.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +str q17, [x0, #560] +str q26, [x0, #688] +ldr q4, [x17, #+64] +ldr q5, [x17, #+80] +ldr q6, [x17, #+96] +ldr q7, [x17, #+112] +ldr q8, [x0, #112] +ldr q9, [x0, #96] +sqrdmulh v10.4S, v8.4S, v5.s[0] +mul v8.4S, v8.4S,v4.s[0] +ldr q11, [x0, #64] +ldr q12, [x0, #80] +sqrdmulh v13.4S, v9.4S, v5.s[0] +mla v8.4S, v10.4S, v31.s[0] +mul v9.4S, v9.4S,v4.s[0] +ldr q10, [x0, #48] +ldr q14, [x0, #32] +sqrdmulh v15.4S, v11.4S, v5.s[0] +sub v0.4s, v10.4s, v8.4s +mla v9.4S, 
v13.4S, v31.s[0] +mul v11.4S, v11.4S,v4.s[0] +add v10.4s, v10.4s, v8.4s +ldr q8, [x0, #0] +ldr q13, [x0, #16] +sqrdmulh v1.4S, v12.4S, v5.s[0] +sub v2.4s, v14.4s, v9.4s +mla v11.4S, v15.4S, v31.s[0] +mul v12.4S, v12.4S,v4.s[0] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v5.s[1] +sub v15.4s, v8.4s, v11.4s +mla v12.4S, v1.4S, v31.s[0] +mul v10.4S, v10.4S,v4.s[1] +add v8.4s, v8.4s, v11.4s +sqrdmulh v11.4S, v14.4S, v5.s[1] +sub v1.4s, v13.4s, v12.4s +mla v10.4S, v9.4S, v31.s[0] +mul v14.4S, v14.4S,v4.s[1] +add v13.4s, v13.4s, v12.4s +sqrdmulh v12.4S, v0.4S, v5.s[2] +sub v9.4s, v13.4s, v10.4s +mla v14.4S, v11.4S, v31.s[0] +mul v0.4S, v0.4S,v4.s[2] +add v13.4s, v13.4s, v10.4s +ldr q10, [x17, #+128] +ldr q11, [x17, #+144] +ldr q3, [x17, #+160] +ldr q16, [x17, #+176] +sqrdmulh v22.4S, v2.4S, v5.s[2] +sub v23.4s, v8.4s, v14.4s +mla v0.4S, v12.4S, v31.s[0] +mul v2.4S, v2.4S,v4.s[2] +add v8.4s, v8.4s, v14.4s +sqrdmulh v14.4S, v13.4S, v7.s[0] +sub v12.4s, v1.4s, v0.4s +mla v2.4S, v22.4S, v31.s[0] +mul v13.4S, v13.4S,v6.s[0] +add v1.4s, v1.4s, v0.4s +sqrdmulh v0.4S, v9.4S, v7.s[1] +sub v22.4s, v15.4s, v2.4s +mla v13.4S, v14.4S, v31.s[0] +mul v9.4S, v9.4S,v6.s[1] +add v15.4s, v15.4s, v2.4s +sqrdmulh v2.4S, v12.4S, v7.s[3] +sub v14.4s, v8.4s, v13.4s +mla v9.4S, v0.4S, v31.s[0] +mul v12.4S, v12.4S,v6.s[3] +add v8.4s, v8.4s, v13.4s +str q8, [x0, #0] +str q14, [x0, #16] +sqrdmulh v14.4S, v1.4S, v7.s[2] +sub v8.4s, v23.4s, v9.4s +mla v12.4S, v2.4S, v31.s[0] +mul v1.4S, v1.4S,v6.s[2] +add v23.4s, v23.4s, v9.4s +str q23, [x0, #32] +str q8, [x0, #48] +ldr q8, [x0, #240] +ldr q23, [x0, #224] +sqrdmulh v9.4S, v8.4S, v11.s[0] +sub v2.4s, v22.4s, v12.4s +mla v1.4S, v14.4S, v31.s[0] +mul v8.4S, v8.4S,v10.s[0] +add v22.4s, v22.4s, v12.4s +str q22, [x0, #96] +str q2, [x0, #112] +ldr q2, [x0, #192] +ldr q22, [x0, #208] +sqrdmulh v12.4S, v23.4S, v11.s[0] +sub v14.4s, v15.4s, v1.4s +mla v8.4S, v9.4S, v31.s[0] +mul v23.4S, v23.4S,v10.s[0] +add v15.4s, v15.4s, v1.4s +str q15, [x0, #64] +str 
q14, [x0, #80] +ldr q7, [x0, #176] +ldr q6, [x0, #160] +sqrdmulh v5.4S, v2.4S, v11.s[0] +sub v4.4s, v7.4s, v8.4s +mla v23.4S, v12.4S, v31.s[0] +mul v2.4S, v2.4S,v10.s[0] +add v7.4s, v7.4s, v8.4s +ldr q8, [x0, #128] +ldr q12, [x0, #144] +sqrdmulh v14.4S, v22.4S, v11.s[0] +sub v15.4s, v6.4s, v23.4s +mla v2.4S, v5.4S, v31.s[0] +mul v22.4S, v22.4S,v10.s[0] +add v6.4s, v6.4s, v23.4s +sqrdmulh v23.4S, v7.4S, v11.s[1] +sub v5.4s, v8.4s, v2.4s +mla v22.4S, v14.4S, v31.s[0] +mul v7.4S, v7.4S,v10.s[1] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v6.4S, v11.s[1] +sub v14.4s, v12.4s, v22.4s +mla v7.4S, v23.4S, v31.s[0] +mul v6.4S, v6.4S,v10.s[1] +add v12.4s, v12.4s, v22.4s +sqrdmulh v22.4S, v4.4S, v11.s[2] +sub v23.4s, v12.4s, v7.4s +mla v6.4S, v2.4S, v31.s[0] +mul v4.4S, v4.4S,v10.s[2] +add v12.4s, v12.4s, v7.4s +ldr q7, [x17, #+192] +ldr q2, [x17, #+208] +ldr q1, [x17, #+224] +ldr q9, [x17, #+240] +sqrdmulh v13.4S, v15.4S, v11.s[2] +sub v0.4s, v8.4s, v6.4s +mla v4.4S, v22.4S, v31.s[0] +mul v15.4S, v15.4S,v10.s[2] +add v8.4s, v8.4s, v6.4s +sqrdmulh v6.4S, v12.4S, v16.s[0] +sub v22.4s, v14.4s, v4.4s +mla v15.4S, v13.4S, v31.s[0] +mul v12.4S, v12.4S,v3.s[0] +add v14.4s, v14.4s, v4.4s +sqrdmulh v4.4S, v23.4S, v16.s[1] +sub v13.4s, v5.4s, v15.4s +mla v12.4S, v6.4S, v31.s[0] +mul v23.4S, v23.4S,v3.s[1] +add v5.4s, v5.4s, v15.4s +sqrdmulh v15.4S, v22.4S, v16.s[3] +sub v6.4s, v8.4s, v12.4s +mla v23.4S, v4.4S, v31.s[0] +mul v22.4S, v22.4S,v3.s[3] +add v8.4s, v8.4s, v12.4s +str q8, [x0, #128] +str q6, [x0, #144] +sqrdmulh v6.4S, v14.4S, v16.s[2] +sub v8.4s, v0.4s, v23.4s +mla v22.4S, v15.4S, v31.s[0] +mul v14.4S, v14.4S,v3.s[2] +add v0.4s, v0.4s, v23.4s +str q0, [x0, #160] +str q8, [x0, #176] +ldr q8, [x0, #368] +ldr q0, [x0, #352] +sqrdmulh v23.4S, v8.4S, v2.s[0] +sub v15.4s, v13.4s, v22.4s +mla v14.4S, v6.4S, v31.s[0] +mul v8.4S, v8.4S,v7.s[0] +add v13.4s, v13.4s, v22.4s +str q13, [x0, #224] +str q15, [x0, #240] +ldr q15, [x0, #320] +ldr q13, [x0, #336] +sqrdmulh v22.4S, v0.4S, v2.s[0] 
+sub v6.4s, v5.4s, v14.4s +mla v8.4S, v23.4S, v31.s[0] +mul v0.4S, v0.4S,v7.s[0] +add v5.4s, v5.4s, v14.4s +str q5, [x0, #192] +str q6, [x0, #208] +ldr q16, [x0, #304] +ldr q3, [x0, #288] +sqrdmulh v11.4S, v15.4S, v2.s[0] +sub v10.4s, v16.4s, v8.4s +mla v0.4S, v22.4S, v31.s[0] +mul v15.4S, v15.4S,v7.s[0] +add v16.4s, v16.4s, v8.4s +ldr q8, [x0, #256] +ldr q22, [x0, #272] +sqrdmulh v6.4S, v13.4S, v2.s[0] +sub v5.4s, v3.4s, v0.4s +mla v15.4S, v11.4S, v31.s[0] +mul v13.4S, v13.4S,v7.s[0] +add v3.4s, v3.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v2.s[1] +sub v11.4s, v8.4s, v15.4s +mla v13.4S, v6.4S, v31.s[0] +mul v16.4S, v16.4S,v7.s[1] +add v8.4s, v8.4s, v15.4s +sqrdmulh v15.4S, v3.4S, v2.s[1] +sub v6.4s, v22.4s, v13.4s +mla v16.4S, v0.4S, v31.s[0] +mul v3.4S, v3.4S,v7.s[1] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v10.4S, v2.s[2] +sub v0.4s, v22.4s, v16.4s +mla v3.4S, v15.4S, v31.s[0] +mul v10.4S, v10.4S,v7.s[2] +add v22.4s, v22.4s, v16.4s +ldr q16, [x17, #+256] +ldr q15, [x17, #+272] +ldr q14, [x17, #+288] +ldr q23, [x17, #+304] +sqrdmulh v12.4S, v5.4S, v2.s[2] +sub v4.4s, v8.4s, v3.4s +mla v10.4S, v13.4S, v31.s[0] +mul v5.4S, v5.4S,v7.s[2] +add v8.4s, v8.4s, v3.4s +sqrdmulh v3.4S, v22.4S, v9.s[0] +sub v13.4s, v6.4s, v10.4s +mla v5.4S, v12.4S, v31.s[0] +mul v22.4S, v22.4S,v1.s[0] +add v6.4s, v6.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v9.s[1] +sub v12.4s, v11.4s, v5.4s +mla v22.4S, v3.4S, v31.s[0] +mul v0.4S, v0.4S,v1.s[1] +add v11.4s, v11.4s, v5.4s +sqrdmulh v5.4S, v13.4S, v9.s[3] +sub v3.4s, v8.4s, v22.4s +mla v0.4S, v10.4S, v31.s[0] +mul v13.4S, v13.4S,v1.s[3] +add v8.4s, v8.4s, v22.4s +str q8, [x0, #256] +str q3, [x0, #272] +sqrdmulh v3.4S, v6.4S, v9.s[2] +sub v8.4s, v4.4s, v0.4s +mla v13.4S, v5.4S, v31.s[0] +mul v6.4S, v6.4S,v1.s[2] +add v4.4s, v4.4s, v0.4s +str q4, [x0, #288] +str q8, [x0, #304] +ldr q8, [x0, #496] +ldr q4, [x0, #480] +sqrdmulh v0.4S, v8.4S, v15.s[0] +sub v5.4s, v12.4s, v13.4s +mla v6.4S, v3.4S, v31.s[0] +mul v8.4S, v8.4S,v16.s[0] +add v12.4s, v12.4s, 
v13.4s +str q12, [x0, #352] +str q5, [x0, #368] +ldr q5, [x0, #448] +ldr q12, [x0, #464] +sqrdmulh v13.4S, v4.4S, v15.s[0] +sub v3.4s, v11.4s, v6.4s +mla v8.4S, v0.4S, v31.s[0] +mul v4.4S, v4.4S,v16.s[0] +add v11.4s, v11.4s, v6.4s +str q11, [x0, #320] +str q3, [x0, #336] +ldr q9, [x0, #432] +ldr q1, [x0, #416] +sqrdmulh v2.4S, v5.4S, v15.s[0] +sub v7.4s, v9.4s, v8.4s +mla v4.4S, v13.4S, v31.s[0] +mul v5.4S, v5.4S,v16.s[0] +add v9.4s, v9.4s, v8.4s +ldr q8, [x0, #384] +ldr q13, [x0, #400] +sqrdmulh v3.4S, v12.4S, v15.s[0] +sub v11.4s, v1.4s, v4.4s +mla v5.4S, v2.4S, v31.s[0] +mul v12.4S, v12.4S,v16.s[0] +add v1.4s, v1.4s, v4.4s +sqrdmulh v4.4S, v9.4S, v15.s[1] +sub v2.4s, v8.4s, v5.4s +mla v12.4S, v3.4S, v31.s[0] +mul v9.4S, v9.4S,v16.s[1] +add v8.4s, v8.4s, v5.4s +sqrdmulh v5.4S, v1.4S, v15.s[1] +sub v3.4s, v13.4s, v12.4s +mla v9.4S, v4.4S, v31.s[0] +mul v1.4S, v1.4S,v16.s[1] +add v13.4s, v13.4s, v12.4s +sqrdmulh v12.4S, v7.4S, v15.s[2] +sub v4.4s, v13.4s, v9.4s +mla v1.4S, v5.4S, v31.s[0] +mul v7.4S, v7.4S,v16.s[2] +add v13.4s, v13.4s, v9.4s +ldr q9, [x17, #+320] +ldr q5, [x17, #+336] +ldr q6, [x17, #+352] +ldr q0, [x17, #+368] +sqrdmulh v22.4S, v11.4S, v15.s[2] +sub v10.4s, v8.4s, v1.4s +mla v7.4S, v12.4S, v31.s[0] +mul v11.4S, v11.4S,v16.s[2] +add v8.4s, v8.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v23.s[0] +sub v12.4s, v3.4s, v7.4s +mla v11.4S, v22.4S, v31.s[0] +mul v13.4S, v13.4S,v14.s[0] +add v3.4s, v3.4s, v7.4s +sqrdmulh v7.4S, v4.4S, v23.s[1] +sub v22.4s, v2.4s, v11.4s +mla v13.4S, v1.4S, v31.s[0] +mul v4.4S, v4.4S,v14.s[1] +add v2.4s, v2.4s, v11.4s +sqrdmulh v11.4S, v12.4S, v23.s[3] +sub v1.4s, v8.4s, v13.4s +mla v4.4S, v7.4S, v31.s[0] +mul v12.4S, v12.4S,v14.s[3] +add v8.4s, v8.4s, v13.4s +str q8, [x0, #384] +str q1, [x0, #400] +sqrdmulh v1.4S, v3.4S, v23.s[2] +sub v8.4s, v10.4s, v4.4s +mla v12.4S, v11.4S, v31.s[0] +mul v3.4S, v3.4S,v14.s[2] +add v10.4s, v10.4s, v4.4s +str q10, [x0, #416] +str q8, [x0, #432] +ldr q8, [x0, #624] +ldr q10, [x0, #608] +sqrdmulh 
v4.4S, v8.4S, v5.s[0] +sub v11.4s, v22.4s, v12.4s +mla v3.4S, v1.4S, v31.s[0] +mul v8.4S, v8.4S,v9.s[0] +add v22.4s, v22.4s, v12.4s +str q22, [x0, #480] +str q11, [x0, #496] +ldr q11, [x0, #576] +ldr q22, [x0, #592] +sqrdmulh v12.4S, v10.4S, v5.s[0] +sub v1.4s, v2.4s, v3.4s +mla v8.4S, v4.4S, v31.s[0] +mul v10.4S, v10.4S,v9.s[0] +add v2.4s, v2.4s, v3.4s +str q2, [x0, #448] +str q1, [x0, #464] +ldr q23, [x0, #560] +ldr q14, [x0, #544] +sqrdmulh v15.4S, v11.4S, v5.s[0] +sub v16.4s, v23.4s, v8.4s +mla v10.4S, v12.4S, v31.s[0] +mul v11.4S, v11.4S,v9.s[0] +add v23.4s, v23.4s, v8.4s +ldr q8, [x0, #512] +ldr q12, [x0, #528] +sqrdmulh v1.4S, v22.4S, v5.s[0] +sub v2.4s, v14.4s, v10.4s +mla v11.4S, v15.4S, v31.s[0] +mul v22.4S, v22.4S,v9.s[0] +add v14.4s, v14.4s, v10.4s +sqrdmulh v10.4S, v23.4S, v5.s[1] +sub v15.4s, v8.4s, v11.4s +mla v22.4S, v1.4S, v31.s[0] +mul v23.4S, v23.4S,v9.s[1] +add v8.4s, v8.4s, v11.4s +sqrdmulh v11.4S, v14.4S, v5.s[1] +sub v1.4s, v12.4s, v22.4s +mla v23.4S, v10.4S, v31.s[0] +mul v14.4S, v14.4S,v9.s[1] +add v12.4s, v12.4s, v22.4s +sqrdmulh v22.4S, v16.4S, v5.s[2] +sub v10.4s, v12.4s, v23.4s +mla v14.4S, v11.4S, v31.s[0] +mul v16.4S, v16.4S,v9.s[2] +add v12.4s, v12.4s, v23.4s +ldr q23, [x17, #+384] +ldr q11, [x17, #+400] +ldr q3, [x17, #+416] +ldr q4, [x17, #+432] +sqrdmulh v13.4S, v2.4S, v5.s[2] +sub v7.4s, v8.4s, v14.4s +mla v16.4S, v22.4S, v31.s[0] +mul v2.4S, v2.4S,v9.s[2] +add v8.4s, v8.4s, v14.4s +sqrdmulh v14.4S, v12.4S, v0.s[0] +sub v22.4s, v1.4s, v16.4s +mla v2.4S, v13.4S, v31.s[0] +mul v12.4S, v12.4S,v6.s[0] +add v1.4s, v1.4s, v16.4s +sqrdmulh v16.4S, v10.4S, v0.s[1] +sub v13.4s, v15.4s, v2.4s +mla v12.4S, v14.4S, v31.s[0] +mul v10.4S, v10.4S,v6.s[1] +add v15.4s, v15.4s, v2.4s +sqrdmulh v2.4S, v22.4S, v0.s[3] +sub v14.4s, v8.4s, v12.4s +mla v10.4S, v16.4S, v31.s[0] +mul v22.4S, v22.4S,v6.s[3] +add v8.4s, v8.4s, v12.4s +str q8, [x0, #512] +str q14, [x0, #528] +sqrdmulh v14.4S, v1.4S, v0.s[2] +sub v8.4s, v7.4s, v10.4s +mla v22.4S, v2.4S, 
v31.s[0] +mul v1.4S, v1.4S,v6.s[2] +add v7.4s, v7.4s, v10.4s +str q7, [x0, #544] +str q8, [x0, #560] +ldr q8, [x0, #752] +ldr q7, [x0, #736] +sqrdmulh v10.4S, v8.4S, v11.s[0] +sub v2.4s, v13.4s, v22.4s +mla v1.4S, v14.4S, v31.s[0] +mul v8.4S, v8.4S,v23.s[0] +add v13.4s, v13.4s, v22.4s +str q13, [x0, #608] +str q2, [x0, #624] +ldr q2, [x0, #704] +ldr q13, [x0, #720] +sqrdmulh v22.4S, v7.4S, v11.s[0] +sub v14.4s, v15.4s, v1.4s +mla v8.4S, v10.4S, v31.s[0] +mul v7.4S, v7.4S,v23.s[0] +add v15.4s, v15.4s, v1.4s +str q15, [x0, #576] +str q14, [x0, #592] +ldr q0, [x0, #688] +ldr q6, [x0, #672] +sqrdmulh v5.4S, v2.4S, v11.s[0] +sub v9.4s, v0.4s, v8.4s +mla v7.4S, v22.4S, v31.s[0] +mul v2.4S, v2.4S,v23.s[0] +add v0.4s, v0.4s, v8.4s +ldr q8, [x0, #640] +ldr q22, [x0, #656] +sqrdmulh v14.4S, v13.4S, v11.s[0] +sub v15.4s, v6.4s, v7.4s +mla v2.4S, v5.4S, v31.s[0] +mul v13.4S, v13.4S,v23.s[0] +add v6.4s, v6.4s, v7.4s +sqrdmulh v7.4S, v0.4S, v11.s[1] +sub v5.4s, v8.4s, v2.4s +mla v13.4S, v14.4S, v31.s[0] +mul v0.4S, v0.4S,v23.s[1] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v6.4S, v11.s[1] +sub v14.4s, v22.4s, v13.4s +mla v0.4S, v7.4S, v31.s[0] +mul v6.4S, v6.4S,v23.s[1] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v9.4S, v11.s[2] +sub v7.4s, v22.4s, v0.4s +mla v6.4S, v2.4S, v31.s[0] +mul v9.4S, v9.4S,v23.s[2] +add v22.4s, v22.4s, v0.4s +ldr q0, [x17, #+448] +ldr q2, [x17, #+464] +ldr q1, [x17, #+480] +ldr q10, [x17, #+496] +sqrdmulh v12.4S, v15.4S, v11.s[2] +sub v16.4s, v8.4s, v6.4s +mla v9.4S, v13.4S, v31.s[0] +mul v15.4S, v15.4S,v23.s[2] +add v8.4s, v8.4s, v6.4s +sqrdmulh v6.4S, v22.4S, v4.s[0] +sub v13.4s, v14.4s, v9.4s +mla v15.4S, v12.4S, v31.s[0] +mul v22.4S, v22.4S,v3.s[0] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v7.4S, v4.s[1] +sub v12.4s, v5.4s, v15.4s +mla v22.4S, v6.4S, v31.s[0] +mul v7.4S, v7.4S,v3.s[1] +add v5.4s, v5.4s, v15.4s +sqrdmulh v15.4S, v13.4S, v4.s[3] +sub v6.4s, v8.4s, v22.4s +mla v7.4S, v9.4S, v31.s[0] +mul v13.4S, v13.4S,v3.s[3] +add v8.4s, v8.4s, 
v22.4s +str q8, [x0, #640] +str q6, [x0, #656] +sqrdmulh v6.4S, v14.4S, v4.s[2] +sub v8.4s, v16.4s, v7.4s +mla v13.4S, v15.4S, v31.s[0] +mul v14.4S, v14.4S,v3.s[2] +add v16.4s, v16.4s, v7.4s +str q16, [x0, #672] +str q8, [x0, #688] +ldr q8, [x0, #880] +ldr q16, [x0, #864] +sqrdmulh v7.4S, v8.4S, v2.s[0] +sub v15.4s, v12.4s, v13.4s +mla v14.4S, v6.4S, v31.s[0] +mul v8.4S, v8.4S,v0.s[0] +add v12.4s, v12.4s, v13.4s +str q12, [x0, #736] +str q15, [x0, #752] +ldr q15, [x0, #832] +ldr q12, [x0, #848] +sqrdmulh v13.4S, v16.4S, v2.s[0] +sub v6.4s, v5.4s, v14.4s +mla v8.4S, v7.4S, v31.s[0] +mul v16.4S, v16.4S,v0.s[0] +add v5.4s, v5.4s, v14.4s +str q5, [x0, #704] +str q6, [x0, #720] +ldr q4, [x0, #816] +ldr q3, [x0, #800] +sqrdmulh v11.4S, v15.4S, v2.s[0] +sub v23.4s, v4.4s, v8.4s +mla v16.4S, v13.4S, v31.s[0] +mul v15.4S, v15.4S,v0.s[0] +add v4.4s, v4.4s, v8.4s +ldr q8, [x0, #768] +ldr q13, [x0, #784] +sqrdmulh v6.4S, v12.4S, v2.s[0] +sub v5.4s, v3.4s, v16.4s +mla v15.4S, v11.4S, v31.s[0] +mul v12.4S, v12.4S,v0.s[0] +add v3.4s, v3.4s, v16.4s +sqrdmulh v16.4S, v4.4S, v2.s[1] +sub v11.4s, v8.4s, v15.4s +mla v12.4S, v6.4S, v31.s[0] +mul v4.4S, v4.4S,v0.s[1] +add v8.4s, v8.4s, v15.4s +sqrdmulh v15.4S, v3.4S, v2.s[1] +sub v6.4s, v13.4s, v12.4s +mla v4.4S, v16.4S, v31.s[0] +mul v3.4S, v3.4S,v0.s[1] +add v13.4s, v13.4s, v12.4s +sqrdmulh v12.4S, v23.4S, v2.s[2] +sub v16.4s, v13.4s, v4.4s +mla v3.4S, v15.4S, v31.s[0] +mul v23.4S, v23.4S,v0.s[2] +add v13.4s, v13.4s, v4.4s +ldr q4, [x17, #+512] +ldr q15, [x17, #+528] +ldr q14, [x17, #+544] +ldr q7, [x17, #+560] +sqrdmulh v22.4S, v5.4S, v2.s[2] +sub v9.4s, v8.4s, v3.4s +mla v23.4S, v12.4S, v31.s[0] +mul v5.4S, v5.4S,v0.s[2] +add v8.4s, v8.4s, v3.4s +sqrdmulh v3.4S, v13.4S, v10.s[0] +sub v12.4s, v6.4s, v23.4s +mla v5.4S, v22.4S, v31.s[0] +mul v13.4S, v13.4S,v1.s[0] +add v6.4s, v6.4s, v23.4s +sqrdmulh v23.4S, v16.4S, v10.s[1] +sub v22.4s, v11.4s, v5.4s +mla v13.4S, v3.4S, v31.s[0] +mul v16.4S, v16.4S,v1.s[1] +add v11.4s, v11.4s, v5.4s 
+sqrdmulh v5.4S, v12.4S, v10.s[3] +sub v3.4s, v8.4s, v13.4s +mla v16.4S, v23.4S, v31.s[0] +mul v12.4S, v12.4S,v1.s[3] +add v8.4s, v8.4s, v13.4s +str q8, [x0, #768] +str q3, [x0, #784] +sqrdmulh v3.4S, v6.4S, v10.s[2] +sub v8.4s, v9.4s, v16.4s +mla v12.4S, v5.4S, v31.s[0] +mul v6.4S, v6.4S,v1.s[2] +add v9.4s, v9.4s, v16.4s +str q9, [x0, #800] +str q8, [x0, #816] +ldr q8, [x0, #1008] +ldr q9, [x0, #992] +sqrdmulh v16.4S, v8.4S, v15.s[0] +sub v5.4s, v22.4s, v12.4s +mla v6.4S, v3.4S, v31.s[0] +mul v8.4S, v8.4S,v4.s[0] +add v22.4s, v22.4s, v12.4s +str q22, [x0, #864] +str q5, [x0, #880] +ldr q5, [x0, #960] +ldr q22, [x0, #976] +sqrdmulh v12.4S, v9.4S, v15.s[0] +sub v3.4s, v11.4s, v6.4s +mla v8.4S, v16.4S, v31.s[0] +mul v9.4S, v9.4S,v4.s[0] +add v11.4s, v11.4s, v6.4s +str q11, [x0, #832] +str q3, [x0, #848] +ldr q10, [x0, #944] +ldr q1, [x0, #928] +sqrdmulh v2.4S, v5.4S, v15.s[0] +sub v0.4s, v10.4s, v8.4s +mla v9.4S, v12.4S, v31.s[0] +mul v5.4S, v5.4S,v4.s[0] +add v10.4s, v10.4s, v8.4s +ldr q8, [x0, #896] +ldr q12, [x0, #912] +sqrdmulh v3.4S, v22.4S, v15.s[0] +sub v11.4s, v1.4s, v9.4s +mla v5.4S, v2.4S, v31.s[0] +mul v22.4S, v22.4S,v4.s[0] +add v1.4s, v1.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v15.s[1] +sub v2.4s, v8.4s, v5.4s +mla v22.4S, v3.4S, v31.s[0] +mul v10.4S, v10.4S,v4.s[1] +add v8.4s, v8.4s, v5.4s +sqrdmulh v5.4S, v1.4S, v15.s[1] +sub v3.4s, v12.4s, v22.4s +mla v10.4S, v9.4S, v31.s[0] +mul v1.4S, v1.4S,v4.s[1] +add v12.4s, v12.4s, v22.4s +sqrdmulh v22.4S, v0.4S, v15.s[2] +sub v9.4s, v12.4s, v10.4s +mla v1.4S, v5.4S, v31.s[0] +mul v0.4S, v0.4S,v4.s[2] +add v12.4s, v12.4s, v10.4s +sqrdmulh v10.4S, v11.4S, v15.s[2] +sub v5.4s, v8.4s, v1.4s +mla v0.4S, v22.4S, v31.s[0] +mul v11.4S, v11.4S,v4.s[2] +add v8.4s, v8.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v7.s[0] +sub v22.4s, v3.4s, v0.4s +mla v11.4S, v10.4S, v31.s[0] +mul v12.4S, v12.4S,v14.s[0] +add v3.4s, v3.4s, v0.4s +sqrdmulh v0.4S, v9.4S, v7.s[1] +sub v10.4s, v2.4s, v11.4s +mla v12.4S, v1.4S, v31.s[0] +mul v9.4S, 
v9.4S,v14.s[1] +add v2.4s, v2.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v7.s[3] +sub v1.4s, v8.4s, v12.4s +mla v9.4S, v0.4S, v31.s[0] +mul v22.4S, v22.4S,v14.s[3] +add v8.4s, v8.4s, v12.4s +str q8, [x0, #896] +str q1, [x0, #912] +sqrdmulh v1.4S, v3.4S, v7.s[2] +sub v8.4s, v5.4s, v9.4s +mla v22.4S, v11.4S, v31.s[0] +mul v3.4S, v3.4S,v14.s[2] +add v5.4s, v5.4s, v9.4s +str q5, [x0, #928] +str q8, [x0, #944] +sub v8.4s, v10.4s, v22.4s +mla v3.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v22.4s +str q10, [x0, #992] +str q8, [x0, #1008] +sub v8.4s, v2.4s, v3.4s +add v2.4s, v2.4s, v3.4s +str q2, [x0, #960] +str q8, [x0, #976] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1444 +// Instruction count: 1440 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_2.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_2.s new file mode 100644 index 0000000..622ba55 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_2.s @@ -0,0 +1,1474 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all 
+/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include // NOTE(review): the include target is missing in this copy; ASM_LOAD used by the routine below is not defined in this file and presumably comes from that header - restore it from the generator output +modulus: // 16-byte constant fetched by the routine below with "ldr q31, [x17]"; lane 0 is the value used as v31.s[0] in its mla instructions, lanes 1-3 are zero +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 // NOTE(review): presumably a power-of-two request (2^6 = 64 bytes) on this target - confirm for the assembler in use +roots_merged: // constant table read 16 bytes at a time through x17; each 4-word group is followed by a 4-word group of companion constants carrying the same layer/block tags; in the loads visible below the first group (q30, q28) is the mul operand and the second (q29, q27) the sqrdmulh operand; "Layer None" entries are zero padding; NOTE(review): the companion values look like roughly value * 2^31 / 33556993 - confirm against the generator +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 23825509 // Layer 4, block 0 +.word 27028662 // Layer 4, block 1 +.word 0 // Layer None, block None +.word 1307297022 // Layer 3, block 0 +.word 1524716204 // Layer 4, block 0 +.word 1729702351 // Layer 4, block 1 +.word 0 // Layer None, block None +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 14626653 // Layer 3, block 1 +.word 14833295 // Layer 4, block 2 +.word 2138810 // Layer 4, 
block 3 +.word 0 // Layer None, block None +.word 936034350 // Layer 3, block 1 +.word 949258429 // Layer 4, block 2 +.word 136873393 // Layer 4, block 3 +.word 0 // Layer None, block None +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 29737761 // Layer 3, block 2 +.word 6490403 // Layer 4, block 4 +.word 19648405 // Layer 4, block 5 +.word 0 // Layer None, block None +.word 1903071454 // Layer 3, block 2 +.word 415354091 // Layer 4, block 4 +.word 1257401950 // Layer 4, block 5 +.word 0 // Layer None, block None +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 30285189 // Layer 3, block 3 +.word 31254932 // Layer 4, block 6 +.word 26362414 // Layer 4, block 7 +.word 0 // Layer None, block None +.word 1938104173 // Layer 3, block 3 +.word 2000162988 // Layer 4, block 6 +.word 1687065733 // Layer 4, block 7 +.word 0 // Layer None, block None +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 21289485 // Layer 3, block 4 +.word 572895 // Layer 4, block 8 +.word 26691971 // Layer 4, block 9 +.word 0 // Layer None, block None +.word 1362423055 // Layer 3, block 4 +.word 36662482 // Layer 4, block 8 +.word 1708155771 // Layer 4, block 9 +.word 0 // Layer None, block None +.word 23713020 // Layer 5, block 16 +.word 
19537976 // Layer 5, block 17 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 9914896 // Layer 3, block 5 +.word 9249292 // Layer 4, block 10 +.word 29292862 // Layer 4, block 11 +.word 0 // Layer None, block None +.word 634504916 // Layer 3, block 5 +.word 591909511 // Layer 4, block 10 +.word 1874600091 // Layer 4, block 11 +.word 0 // Layer None, block None +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 22603682 // Layer 3, block 6 +.word 8247799 // Layer 4, block 12 +.word 5086187 // Layer 4, block 13 +.word 0 // Layer None, block None +.word 1446525244 // Layer 3, block 6 +.word 527818851 // Layer 4, block 12 +.word 325491125 // Layer 4, block 13 +.word 0 // Layer None, block None +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 16204162 // Layer 3, block 7 +.word 28113639 // Layer 4, block 14 +.word 8471290 // Layer 4, block 15 +.word 0 // Layer None, block None +.word 1036987221 // Layer 3, block 7 +.word 1799135579 // Layer 4, block 14 +.word 542121183 // Layer 4, block 15 +.word 0 // Layer None, block None +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 604481480 // Layer 5, block 30 
+.word 50865814 // Layer 5, block 31 +.text +.global ntt_u32_incomplete_neon_asm_var_3_3_2 +.global _ntt_u32_incomplete_neon_asm_var_3_3_2 +ntt_u32_incomplete_neon_asm_var_3_3_2: +_ntt_u32_incomplete_neon_asm_var_3_3_2: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x0, #960] +ldr q25, [x0, #832] +sqrdmulh v24.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +ldr q23, [x0, #576] +sqrdmulh v22.4S, v25.4S, v29.s[0] +mul v25.4S, v25.4S,v30.s[0] +ldr q21, [x0, #704] +mla v26.4S, v24.4S, v31.s[0] +sqrdmulh v24.4S, v23.4S, v29.s[0] +mul v23.4S, v23.4S,v30.s[0] +ldr q20, [x0, #448] +mla v25.4S, v22.4S, v31.s[0] +sub v22.4s, v20.4s, v26.4s +add v20.4s, v20.4s, v26.4s +sqrdmulh v26.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +ldr q19, [x0, #320] +mla v23.4S, v24.4S, v31.s[0] +sub v24.4s, v19.4s, v25.4s +add v19.4s, v19.4s, v25.4s +sqrdmulh v25.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +ldr q18, [x0, #64] +mla v21.4S, v26.4S, v31.s[0] +sub v26.4s, v18.4s, v23.4s +add v18.4s, v18.4s, v23.4s +sqrdmulh v23.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +ldr q17, [x0, #192] +mla v20.4S, v25.4S, v31.s[0] +sub v25.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +mla v19.4S, v23.4S, v31.s[0] +sub v23.4s, v17.4s, v20.4s +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v24.4S, v29.s[2] +mul v24.4S, v24.4S,v30.s[2] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v18.4s, v19.4s +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, 
v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +mla v24.4S, v20.4S, v31.s[0] +sub v20.4s, v25.4s, v22.4s +add v25.4s, v25.4s, v22.4s +sqrdmulh v22.4S, v23.4S, v27.s[1] +mul v23.4S, v23.4S,v28.s[1] +mla v17.4S, v19.4S, v31.s[0] +sub v19.4s, v26.4s, v24.4s +add v26.4s, v26.4s, v24.4s +sqrdmulh v24.4S, v20.4S, v27.s[3] +mul v20.4S, v20.4S,v28.s[3] +ldr q16, [x0, #976] +mla v23.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v17.4s +add v18.4s, v18.4s, v17.4s +sqrdmulh v17.4S, v25.4S, v27.s[2] +mul v25.4S, v25.4S,v28.s[2] +ldr q3, [x0, #848] +mla v20.4S, v24.4S, v31.s[0] +sub v24.4s, v21.4s, v23.4s +add v21.4s, v21.4s, v23.4s +sqrdmulh v23.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +ldr q2, [x0, #592] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +sqrdmulh v20.4S, v3.4S, v29.s[0] +str q18, [x0, #64] +mul v3.4S, v3.4S,v30.s[0] +ldr q18, [x0, #720] +mla v16.4S, v23.4S, v31.s[0] +sub v23.4s, v26.4s, v25.4s +add v26.4s, v26.4s, v25.4s +sqrdmulh v25.4S, v2.4S, v29.s[0] +str q22, [x0, #192] +mul v2.4S, v2.4S,v30.s[0] +ldr q22, [x0, #464] +mla v3.4S, v20.4S, v31.s[0] +sub v20.4s, v22.4s, v16.4s +add v22.4s, v22.4s, v16.4s +sqrdmulh v16.4S, v18.4S, v29.s[0] +str q21, [x0, #320] +mul v18.4S, v18.4S,v30.s[0] +ldr q21, [x0, #336] +mla v2.4S, v25.4S, v31.s[0] +sub v25.4s, v21.4s, v3.4s +add v21.4s, v21.4s, v3.4s +sqrdmulh v3.4S, v22.4S, v29.s[1] +str q24, [x0, #448] +mul v22.4S, v22.4S,v30.s[1] +ldr q24, [x0, #80] +mla v18.4S, v16.4S, v31.s[0] +sub v16.4s, v24.4s, v2.4s +add v24.4s, v24.4s, v2.4s +sqrdmulh v2.4S, v21.4S, v29.s[1] +str q19, [x0, #832] +mul v21.4S, v21.4S,v30.s[1] +ldr q19, [x0, #208] +mla v22.4S, v3.4S, v31.s[0] +sub v3.4s, v19.4s, v18.4s +add v19.4s, v19.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[2] +str q17, [x0, #960] +mul v20.4S, v20.4S,v30.s[2] +mla v21.4S, v2.4S, v31.s[0] +sub v2.4s, v19.4s, v22.4s +add v19.4s, v19.4s, v22.4s +sqrdmulh v22.4S, v25.4S, v29.s[2] +str q26, [x0, #576] +mul v25.4S, v25.4S,v30.s[2] +mla 
v20.4S, v18.4S, v31.s[0] +sub v18.4s, v24.4s, v21.4s +add v24.4s, v24.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v27.s[0] +str q23, [x0, #704] +mul v19.4S, v19.4S,v28.s[0] +mla v25.4S, v22.4S, v31.s[0] +sub v22.4s, v3.4s, v20.4s +add v3.4s, v3.4s, v20.4s +sqrdmulh v20.4S, v2.4S, v27.s[1] +mul v2.4S, v2.4S,v28.s[1] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v25.4s +add v16.4s, v16.4s, v25.4s +sqrdmulh v25.4S, v22.4S, v27.s[3] +mul v22.4S, v22.4S,v28.s[3] +ldr q23, [x0, #992] +mla v2.4S, v20.4S, v31.s[0] +sub v20.4s, v24.4s, v19.4s +add v24.4s, v24.4s, v19.4s +sqrdmulh v19.4S, v3.4S, v27.s[2] +mul v3.4S, v3.4S,v28.s[2] +ldr q26, [x0, #864] +mla v22.4S, v25.4S, v31.s[0] +sub v25.4s, v18.4s, v2.4s +add v18.4s, v18.4s, v2.4s +sqrdmulh v2.4S, v23.4S, v29.s[0] +mul v23.4S, v23.4S,v30.s[0] +ldr q17, [x0, #608] +mla v3.4S, v19.4S, v31.s[0] +sub v19.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v26.4S, v29.s[0] +str q24, [x0, #80] +mul v26.4S, v26.4S,v30.s[0] +ldr q24, [x0, #736] +mla v23.4S, v2.4S, v31.s[0] +sub v2.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v17.4S, v29.s[0] +str q20, [x0, #208] +mul v17.4S, v17.4S,v30.s[0] +ldr q20, [x0, #480] +mla v26.4S, v22.4S, v31.s[0] +sub v22.4s, v20.4s, v23.4s +add v20.4s, v20.4s, v23.4s +sqrdmulh v23.4S, v24.4S, v29.s[0] +str q18, [x0, #336] +mul v24.4S, v24.4S,v30.s[0] +ldr q18, [x0, #352] +mla v17.4S, v3.4S, v31.s[0] +sub v3.4s, v18.4s, v26.4s +add v18.4s, v18.4s, v26.4s +sqrdmulh v26.4S, v20.4S, v29.s[1] +str q25, [x0, #464] +mul v20.4S, v20.4S,v30.s[1] +ldr q25, [x0, #96] +mla v24.4S, v23.4S, v31.s[0] +sub v23.4s, v25.4s, v17.4s +add v25.4s, v25.4s, v17.4s +sqrdmulh v17.4S, v18.4S, v29.s[1] +str q21, [x0, #848] +mul v18.4S, v18.4S,v30.s[1] +ldr q21, [x0, #224] +mla v20.4S, v26.4S, v31.s[0] +sub v26.4s, v21.4s, v24.4s +add v21.4s, v21.4s, v24.4s +sqrdmulh v24.4S, v22.4S, v29.s[2] +str q19, [x0, #976] +mul v22.4S, v22.4S,v30.s[2] +mla v18.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v20.4s +add 
v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v3.4S, v29.s[2] +str q16, [x0, #592] +mul v3.4S, v3.4S,v30.s[2] +mla v22.4S, v24.4S, v31.s[0] +sub v24.4s, v25.4s, v18.4s +add v25.4s, v25.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v27.s[0] +str q2, [x0, #720] +mul v21.4S, v21.4S,v28.s[0] +mla v3.4S, v20.4S, v31.s[0] +sub v20.4s, v26.4s, v22.4s +add v26.4s, v26.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v27.s[1] +mul v17.4S, v17.4S,v28.s[1] +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v23.4s, v3.4s +add v23.4s, v23.4s, v3.4s +sqrdmulh v3.4S, v20.4S, v27.s[3] +mul v20.4S, v20.4S,v28.s[3] +ldr q2, [x0, #1008] +mla v17.4S, v22.4S, v31.s[0] +sub v22.4s, v25.4s, v21.4s +add v25.4s, v25.4s, v21.4s +sqrdmulh v21.4S, v26.4S, v27.s[2] +mul v26.4S, v26.4S,v28.s[2] +ldr q16, [x0, #880] +mla v20.4S, v3.4S, v31.s[0] +sub v3.4s, v24.4s, v17.4s +add v24.4s, v24.4s, v17.4s +sqrdmulh v17.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +ldr q19, [x0, #624] +mla v26.4S, v21.4S, v31.s[0] +sub v21.4s, v18.4s, v20.4s +add v18.4s, v18.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v29.s[0] +str q25, [x0, #96] +mul v16.4S, v16.4S,v30.s[0] +ldr q25, [x0, #752] +mla v2.4S, v17.4S, v31.s[0] +sub v17.4s, v23.4s, v26.4s +add v23.4s, v23.4s, v26.4s +sqrdmulh v26.4S, v19.4S, v29.s[0] +str q22, [x0, #224] +mul v19.4S, v19.4S,v30.s[0] +ldr q22, [x0, #496] +mla v16.4S, v20.4S, v31.s[0] +sub v20.4s, v22.4s, v2.4s +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v25.4S, v29.s[0] +str q24, [x0, #352] +mul v25.4S, v25.4S,v30.s[0] +ldr q24, [x0, #368] +mla v19.4S, v26.4S, v31.s[0] +sub v26.4s, v24.4s, v16.4s +add v24.4s, v24.4s, v16.4s +sqrdmulh v16.4S, v22.4S, v29.s[1] +str q3, [x0, #480] +mul v22.4S, v22.4S,v30.s[1] +ldr q3, [x0, #112] +mla v25.4S, v2.4S, v31.s[0] +sub v2.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v24.4S, v29.s[1] +str q18, [x0, #864] +mul v24.4S, v24.4S,v30.s[1] +ldr q18, [x0, #240] +mla v22.4S, v16.4S, v31.s[0] +sub v16.4s, v18.4s, v25.4s +add v18.4s, v18.4s, v25.4s +sqrdmulh v25.4S, v20.4S, v29.s[2] 
+str q21, [x0, #992] +mul v20.4S, v20.4S,v30.s[2] +mla v24.4S, v19.4S, v31.s[0] +sub v19.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v26.4S, v29.s[2] +str q23, [x0, #608] +mul v26.4S, v26.4S,v30.s[2] +mla v20.4S, v25.4S, v31.s[0] +sub v25.4s, v3.4s, v24.4s +add v3.4s, v3.4s, v24.4s +sqrdmulh v24.4S, v18.4S, v27.s[0] +str q17, [x0, #736] +mul v18.4S, v18.4S,v28.s[0] +mla v26.4S, v22.4S, v31.s[0] +sub v22.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +mla v18.4S, v24.4S, v31.s[0] +sub v24.4s, v2.4s, v26.4s +add v2.4s, v2.4s, v26.4s +sqrdmulh v26.4S, v22.4S, v27.s[3] +mul v22.4S, v22.4S,v28.s[3] +ldr q17, [x0, #896] +mla v19.4S, v20.4S, v31.s[0] +sub v20.4s, v3.4s, v18.4s +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v27.s[2] +mul v16.4S, v16.4S,v28.s[2] +ldr q23, [x0, #768] +mla v22.4S, v26.4S, v31.s[0] +sub v26.4s, v25.4s, v19.4s +add v25.4s, v25.4s, v19.4s +sqrdmulh v19.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] +ldr q21, [x0, #512] +mla v16.4S, v18.4S, v31.s[0] +sub v18.4s, v24.4s, v22.4s +add v24.4s, v24.4s, v22.4s +sqrdmulh v22.4S, v23.4S, v29.s[0] +str q3, [x0, #112] +mul v23.4S, v23.4S,v30.s[0] +ldr q3, [x0, #640] +mla v17.4S, v19.4S, v31.s[0] +sub v19.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v21.4S, v29.s[0] +str q20, [x0, #240] +mul v21.4S, v21.4S,v30.s[0] +ldr q20, [x0, #384] +mla v23.4S, v22.4S, v31.s[0] +sub v22.4s, v20.4s, v17.4s +add v20.4s, v20.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v29.s[0] +str q25, [x0, #368] +mul v3.4S, v3.4S,v30.s[0] +ldr q25, [x0, #256] +mla v21.4S, v16.4S, v31.s[0] +sub v16.4s, v25.4s, v23.4s +add v25.4s, v25.4s, v23.4s +sqrdmulh v23.4S, v20.4S, v29.s[1] +str q26, [x0, #496] +mul v20.4S, v20.4S,v30.s[1] +ldr q26, [x0, #0] +mla v3.4S, v17.4S, v31.s[0] +sub v17.4s, v26.4s, v21.4s +add v26.4s, v26.4s, v21.4s +sqrdmulh v21.4S, v25.4S, v29.s[1] +str q24, [x0, #880] +mul v25.4S, v25.4S,v30.s[1] +ldr q24, [x0, #128] +mla 
v20.4S, v23.4S, v31.s[0] +sub v23.4s, v24.4s, v3.4s +add v24.4s, v24.4s, v3.4s +sqrdmulh v3.4S, v22.4S, v29.s[2] +str q18, [x0, #1008] +mul v22.4S, v22.4S,v30.s[2] +mla v25.4S, v21.4S, v31.s[0] +sub v21.4s, v24.4s, v20.4s +add v24.4s, v24.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v29.s[2] +str q2, [x0, #624] +mul v16.4S, v16.4S,v30.s[2] +mla v22.4S, v3.4S, v31.s[0] +sub v3.4s, v26.4s, v25.4s +add v26.4s, v26.4s, v25.4s +sqrdmulh v25.4S, v24.4S, v27.s[0] +str q19, [x0, #752] +mul v24.4S, v24.4S,v28.s[0] +mla v16.4S, v20.4S, v31.s[0] +sub v20.4s, v23.4s, v22.4s +add v23.4s, v23.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v27.s[1] +mul v21.4S, v21.4S,v28.s[1] +mla v24.4S, v25.4S, v31.s[0] +sub v25.4s, v17.4s, v16.4s +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v20.4S, v27.s[3] +mul v20.4S, v20.4S,v28.s[3] +ldr q19, [x0, #912] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v26.4s, v24.4s +add v26.4s, v26.4s, v24.4s +sqrdmulh v24.4S, v23.4S, v27.s[2] +mul v23.4S, v23.4S,v28.s[2] +ldr q2, [x0, #784] +mla v20.4S, v16.4S, v31.s[0] +sub v16.4s, v3.4s, v21.4s +add v3.4s, v3.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q18, [x0, #528] +mla v23.4S, v24.4S, v31.s[0] +sub v24.4s, v25.4s, v20.4s +add v25.4s, v25.4s, v20.4s +sqrdmulh v20.4S, v2.4S, v29.s[0] +str q26, [x0, #0] +mul v2.4S, v2.4S,v30.s[0] +ldr q26, [x0, #656] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v17.4s, v23.4s +add v17.4s, v17.4s, v23.4s +sqrdmulh v23.4S, v18.4S, v29.s[0] +str q22, [x0, #128] +mul v18.4S, v18.4S,v30.s[0] +ldr q22, [x0, #400] +mla v2.4S, v20.4S, v31.s[0] +sub v20.4s, v22.4s, v19.4s +add v22.4s, v22.4s, v19.4s +sqrdmulh v19.4S, v26.4S, v29.s[0] +str q3, [x0, #256] +mul v26.4S, v26.4S,v30.s[0] +ldr q3, [x0, #272] +mla v18.4S, v23.4S, v31.s[0] +sub v23.4s, v3.4s, v2.4s +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v22.4S, v29.s[1] +str q16, [x0, #384] +mul v22.4S, v22.4S,v30.s[1] +ldr q16, [x0, #16] +mla v26.4S, v19.4S, v31.s[0] +sub v19.4s, v16.4s, v18.4s +add v16.4s, v16.4s, 
v18.4s +sqrdmulh v18.4S, v3.4S, v29.s[1] +str q25, [x0, #768] +mul v3.4S, v3.4S,v30.s[1] +ldr q25, [x0, #144] +mla v22.4S, v2.4S, v31.s[0] +sub v2.4s, v25.4s, v26.4s +add v25.4s, v25.4s, v26.4s +sqrdmulh v26.4S, v20.4S, v29.s[2] +str q24, [x0, #896] +mul v20.4S, v20.4S,v30.s[2] +mla v3.4S, v18.4S, v31.s[0] +sub v18.4s, v25.4s, v22.4s +add v25.4s, v25.4s, v22.4s +sqrdmulh v22.4S, v23.4S, v29.s[2] +str q17, [x0, #512] +mul v23.4S, v23.4S,v30.s[2] +mla v20.4S, v26.4S, v31.s[0] +sub v26.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v25.4S, v27.s[0] +str q21, [x0, #640] +mul v25.4S, v25.4S,v28.s[0] +mla v23.4S, v22.4S, v31.s[0] +sub v22.4s, v2.4s, v20.4s +add v2.4s, v2.4s, v20.4s +sqrdmulh v20.4S, v18.4S, v27.s[1] +mul v18.4S, v18.4S,v28.s[1] +mla v25.4S, v3.4S, v31.s[0] +sub v3.4s, v19.4s, v23.4s +add v19.4s, v19.4s, v23.4s +sqrdmulh v23.4S, v22.4S, v27.s[3] +mul v22.4S, v22.4S,v28.s[3] +ldr q21, [x0, #928] +mla v18.4S, v20.4S, v31.s[0] +sub v20.4s, v16.4s, v25.4s +add v16.4s, v16.4s, v25.4s +sqrdmulh v25.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +ldr q17, [x0, #800] +mla v22.4S, v23.4S, v31.s[0] +sub v23.4s, v26.4s, v18.4s +add v26.4s, v26.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +ldr q24, [x0, #544] +mla v2.4S, v25.4S, v31.s[0] +sub v25.4s, v3.4s, v22.4s +add v3.4s, v3.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v29.s[0] +str q16, [x0, #16] +mul v17.4S, v17.4S,v30.s[0] +ldr q16, [x0, #672] +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v19.4s, v2.4s +add v19.4s, v19.4s, v2.4s +sqrdmulh v2.4S, v24.4S, v29.s[0] +str q20, [x0, #144] +mul v24.4S, v24.4S,v30.s[0] +ldr q20, [x0, #416] +mla v17.4S, v22.4S, v31.s[0] +sub v22.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v16.4S, v29.s[0] +str q26, [x0, #272] +mul v16.4S, v16.4S,v30.s[0] +ldr q26, [x0, #288] +mla v24.4S, v2.4S, v31.s[0] +sub v2.4s, v26.4s, v17.4s +add v26.4s, v26.4s, v17.4s +sqrdmulh v17.4S, v20.4S, v29.s[1] +str q23, [x0, #400] +mul v20.4S, 
v20.4S,v30.s[1] +ldr q23, [x0, #32] +mla v16.4S, v21.4S, v31.s[0] +sub v21.4s, v23.4s, v24.4s +add v23.4s, v23.4s, v24.4s +sqrdmulh v24.4S, v26.4S, v29.s[1] +str q3, [x0, #784] +mul v26.4S, v26.4S,v30.s[1] +ldr q3, [x0, #160] +mla v20.4S, v17.4S, v31.s[0] +sub v17.4s, v3.4s, v16.4s +add v3.4s, v3.4s, v16.4s +sqrdmulh v16.4S, v22.4S, v29.s[2] +str q25, [x0, #912] +mul v22.4S, v22.4S,v30.s[2] +mla v26.4S, v24.4S, v31.s[0] +sub v24.4s, v3.4s, v20.4s +add v3.4s, v3.4s, v20.4s +sqrdmulh v20.4S, v2.4S, v29.s[2] +str q19, [x0, #528] +mul v2.4S, v2.4S,v30.s[2] +mla v22.4S, v16.4S, v31.s[0] +sub v16.4s, v23.4s, v26.4s +add v23.4s, v23.4s, v26.4s +sqrdmulh v26.4S, v3.4S, v27.s[0] +str q18, [x0, #656] +mul v3.4S, v3.4S,v28.s[0] +mla v2.4S, v20.4S, v31.s[0] +sub v20.4s, v17.4s, v22.4s +add v17.4s, v17.4s, v22.4s +sqrdmulh v22.4S, v24.4S, v27.s[1] +mul v24.4S, v24.4S,v28.s[1] +mla v3.4S, v26.4S, v31.s[0] +sub v26.4s, v21.4s, v2.4s +add v21.4s, v21.4s, v2.4s +sqrdmulh v2.4S, v20.4S, v27.s[3] +mul v20.4S, v20.4S,v28.s[3] +ldr q18, [x0, #944] +mla v24.4S, v22.4S, v31.s[0] +sub v22.4s, v23.4s, v3.4s +add v23.4s, v23.4s, v3.4s +sqrdmulh v3.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +ldr q19, [x0, #816] +mla v20.4S, v2.4S, v31.s[0] +sub v2.4s, v16.4s, v24.4s +add v16.4s, v16.4s, v24.4s +sqrdmulh v24.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +ldr q25, [x0, #560] +mla v17.4S, v3.4S, v31.s[0] +sub v3.4s, v26.4s, v20.4s +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v29.s[0] +str q23, [x0, #32] +mul v19.4S, v19.4S,v30.s[0] +ldr q23, [x0, #688] +mla v18.4S, v24.4S, v31.s[0] +sub v24.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v25.4S, v29.s[0] +str q22, [x0, #160] +mul v25.4S, v25.4S,v30.s[0] +ldr q22, [x0, #432] +mla v19.4S, v20.4S, v31.s[0] +sub v20.4s, v22.4s, v18.4s +add v22.4s, v22.4s, v18.4s +sqrdmulh v18.4S, v23.4S, v29.s[0] +str q16, [x0, #288] +mul v23.4S, v23.4S,v30.s[0] +ldr q16, [x0, #304] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, 
v16.4s, v19.4s +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v22.4S, v29.s[1] +str q2, [x0, #416] +mul v22.4S, v22.4S,v30.s[1] +ldr q2, [x0, #48] +mla v23.4S, v18.4S, v31.s[0] +sub v18.4s, v2.4s, v25.4s +add v2.4s, v2.4s, v25.4s +sqrdmulh v25.4S, v16.4S, v29.s[1] +str q26, [x0, #800] +mul v16.4S, v16.4S,v30.s[1] +ldr q26, [x0, #176] +mla v22.4S, v19.4S, v31.s[0] +sub v19.4s, v26.4s, v23.4s +add v26.4s, v26.4s, v23.4s +sqrdmulh v23.4S, v20.4S, v29.s[2] +str q3, [x0, #928] +mul v20.4S, v20.4S,v30.s[2] +mla v16.4S, v25.4S, v31.s[0] +sub v25.4s, v26.4s, v22.4s +add v26.4s, v26.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v29.s[2] +str q21, [x0, #544] +mul v17.4S, v17.4S,v30.s[2] +mla v20.4S, v23.4S, v31.s[0] +sub v23.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v26.4S, v27.s[0] +str q24, [x0, #672] +mul v26.4S, v26.4S,v28.s[0] +mla v17.4S, v22.4S, v31.s[0] +sub v22.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +sqrdmulh v20.4S, v25.4S, v27.s[1] +mul v25.4S, v25.4S,v28.s[1] +mla v26.4S, v16.4S, v31.s[0] +sub v16.4s, v18.4s, v17.4s +add v18.4s, v18.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v27.s[3] +mul v22.4S, v22.4S,v28.s[3] +mla v25.4S, v20.4S, v31.s[0] +sub v20.4s, v2.4s, v26.4s +add v2.4s, v2.4s, v26.4s +sqrdmulh v26.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v28.s[2] +mla v22.4S, v17.4S, v31.s[0] +sub v17.4s, v23.4s, v25.4s +add v23.4s, v23.4s, v25.4s +mla v19.4S, v26.4S, v31.s[0] +sub v26.4s, v16.4s, v22.4s +add v16.4s, v16.4s, v22.4s +str q2, [x0, #48] +sub v2.4s, v18.4s, v19.4s +add v18.4s, v18.4s, v19.4s +str q20, [x0, #176] +str q23, [x0, #304] +str q17, [x0, #432] +str q16, [x0, #816] +str q26, [x0, #944] +str q18, [x0, #560] +str q2, [x0, #688] +ldr q4, [x17, #+64] +ldr q5, [x17, #+80] +ldr q6, [x17, #+96] +ldr q7, [x17, #+112] +ldr q8, [x0, #112] +ldr q9, [x0, #96] +sqrdmulh v10.4S, v8.4S, v5.s[0] +mul v8.4S, v8.4S,v4.s[0] +ldr q11, [x0, #64] +sqrdmulh v12.4S, v9.4S, v5.s[0] +mul v9.4S, v9.4S,v4.s[0] +ldr q13, [x0, #80] +mla v8.4S, v10.4S, v31.s[0] 
+sqrdmulh v10.4S, v11.4S, v5.s[0] +mul v11.4S, v11.4S,v4.s[0] +ldr q14, [x0, #48] +mla v9.4S, v12.4S, v31.s[0] +sub v12.4s, v14.4s, v8.4s +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +ldr q15, [x0, #32] +mla v11.4S, v10.4S, v31.s[0] +sub v10.4s, v15.4s, v9.4s +add v15.4s, v15.4s, v9.4s +sqrdmulh v9.4S, v14.4S, v5.s[1] +mul v14.4S, v14.4S,v4.s[1] +ldr q0, [x0, #0] +mla v13.4S, v8.4S, v31.s[0] +sub v8.4s, v0.4s, v11.4s +add v0.4s, v0.4s, v11.4s +sqrdmulh v11.4S, v15.4S, v5.s[1] +mul v15.4S, v15.4S,v4.s[1] +ldr q1, [x0, #16] +mla v14.4S, v9.4S, v31.s[0] +sub v9.4s, v1.4s, v13.4s +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v12.4S, v5.s[2] +mul v12.4S, v12.4S,v4.s[2] +mla v15.4S, v11.4S, v31.s[0] +sub v11.4s, v1.4s, v14.4s +add v1.4s, v1.4s, v14.4s +sqrdmulh v14.4S, v10.4S, v5.s[2] +mul v10.4S, v10.4S,v4.s[2] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v0.4s, v15.4s +add v0.4s, v0.4s, v15.4s +ldr q15, [x17, #+128] +sqrdmulh v3.4S, v1.4S, v7.s[0] +mul v1.4S, v1.4S,v6.s[0] +mla v10.4S, v14.4S, v31.s[0] +sub v14.4s, v9.4s, v12.4s +add v9.4s, v9.4s, v12.4s +ldr q12, [x17, #+144] +sqrdmulh v21.4S, v11.4S, v7.s[1] +mul v11.4S, v11.4S,v6.s[1] +mla v1.4S, v3.4S, v31.s[0] +sub v3.4s, v8.4s, v10.4s +add v8.4s, v8.4s, v10.4s +ldr q10, [x17, #+160] +ldr q24, [x17, #+176] +sqrdmulh v25.4S, v14.4S, v7.s[3] +mul v14.4S, v14.4S,v6.s[3] +ldr q22, [x0, #240] +mla v11.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v1.4s +add v0.4s, v0.4s, v1.4s +sqrdmulh v1.4S, v9.4S, v7.s[2] +mul v9.4S, v9.4S,v6.s[2] +ldr q19, [x0, #224] +mla v14.4S, v25.4S, v31.s[0] +sub v25.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v12.s[0] +mul v22.4S, v22.4S,v15.s[0] +ldr q30, [x0, #192] +mla v9.4S, v1.4S, v31.s[0] +sub v1.4s, v3.4s, v14.4s +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v19.4S, v12.s[0] +str q0, [x0, #0] +mul v19.4S, v19.4S,v15.s[0] +ldr q0, [x0, #208] +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v8.4s, v9.4s +add v8.4s, v8.4s, v9.4s 
+sqrdmulh v7.4S, v30.4S, v12.s[0] +str q21, [x0, #16] +mul v30.4S, v30.4S,v15.s[0] +ldr q21, [x0, #176] +mla v19.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v22.4s +add v21.4s, v21.4s, v22.4s +sqrdmulh v22.4S, v0.4S, v12.s[0] +str q13, [x0, #32] +mul v0.4S, v0.4S,v15.s[0] +ldr q13, [x0, #160] +mla v30.4S, v7.4S, v31.s[0] +sub v7.4s, v13.4s, v19.4s +add v13.4s, v13.4s, v19.4s +sqrdmulh v19.4S, v21.4S, v12.s[1] +str q25, [x0, #48] +mul v21.4S, v21.4S,v15.s[1] +ldr q25, [x0, #128] +mla v0.4S, v22.4S, v31.s[0] +sub v22.4s, v25.4s, v30.4s +add v25.4s, v25.4s, v30.4s +sqrdmulh v30.4S, v13.4S, v12.s[1] +str q3, [x0, #96] +mul v13.4S, v13.4S,v15.s[1] +ldr q3, [x0, #144] +mla v21.4S, v19.4S, v31.s[0] +sub v19.4s, v3.4s, v0.4s +add v3.4s, v3.4s, v0.4s +sqrdmulh v0.4S, v14.4S, v12.s[2] +str q1, [x0, #112] +mul v14.4S, v14.4S,v15.s[2] +mla v13.4S, v30.4S, v31.s[0] +sub v30.4s, v3.4s, v21.4s +add v3.4s, v3.4s, v21.4s +sqrdmulh v21.4S, v7.4S, v12.s[2] +str q8, [x0, #64] +mul v7.4S, v7.4S,v15.s[2] +mla v14.4S, v0.4S, v31.s[0] +sub v0.4s, v25.4s, v13.4s +add v25.4s, v25.4s, v13.4s +ldr q13, [x17, #+192] +sqrdmulh v8.4S, v3.4S, v24.s[0] +str q11, [x0, #80] +mul v3.4S, v3.4S,v10.s[0] +mla v7.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v14.4s +add v19.4s, v19.4s, v14.4s +ldr q14, [x17, #+208] +sqrdmulh v11.4S, v30.4S, v24.s[1] +mul v30.4S, v30.4S,v10.s[1] +mla v3.4S, v8.4S, v31.s[0] +sub v8.4s, v22.4s, v7.4s +add v22.4s, v22.4s, v7.4s +ldr q7, [x17, #+224] +ldr q1, [x17, #+240] +sqrdmulh v6.4S, v21.4S, v24.s[3] +mul v21.4S, v21.4S,v10.s[3] +ldr q5, [x0, #368] +mla v30.4S, v11.4S, v31.s[0] +sub v11.4s, v25.4s, v3.4s +add v25.4s, v25.4s, v3.4s +sqrdmulh v3.4S, v19.4S, v24.s[2] +mul v19.4S, v19.4S,v10.s[2] +ldr q4, [x0, #352] +mla v21.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v30.4s +add v0.4s, v0.4s, v30.4s +sqrdmulh v30.4S, v5.4S, v14.s[0] +mul v5.4S, v5.4S,v13.s[0] +ldr q9, [x0, #320] +mla v19.4S, v3.4S, v31.s[0] +sub v3.4s, v8.4s, v21.4s +add v8.4s, v8.4s, v21.4s +sqrdmulh v21.4S, v4.4S, 
v14.s[0] +str q25, [x0, #128] +mul v4.4S, v4.4S,v13.s[0] +ldr q25, [x0, #336] +mla v5.4S, v30.4S, v31.s[0] +sub v30.4s, v22.4s, v19.4s +add v22.4s, v22.4s, v19.4s +sqrdmulh v24.4S, v9.4S, v14.s[0] +str q11, [x0, #144] +mul v9.4S, v9.4S,v13.s[0] +ldr q11, [x0, #304] +mla v4.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v5.4s +add v11.4s, v11.4s, v5.4s +sqrdmulh v5.4S, v25.4S, v14.s[0] +str q0, [x0, #160] +mul v25.4S, v25.4S,v13.s[0] +ldr q0, [x0, #288] +mla v9.4S, v24.4S, v31.s[0] +sub v24.4s, v0.4s, v4.4s +add v0.4s, v0.4s, v4.4s +sqrdmulh v4.4S, v11.4S, v14.s[1] +str q6, [x0, #176] +mul v11.4S, v11.4S,v13.s[1] +ldr q6, [x0, #256] +mla v25.4S, v5.4S, v31.s[0] +sub v5.4s, v6.4s, v9.4s +add v6.4s, v6.4s, v9.4s +sqrdmulh v9.4S, v0.4S, v14.s[1] +str q8, [x0, #224] +mul v0.4S, v0.4S,v13.s[1] +ldr q8, [x0, #272] +mla v11.4S, v4.4S, v31.s[0] +sub v4.4s, v8.4s, v25.4s +add v8.4s, v8.4s, v25.4s +sqrdmulh v25.4S, v21.4S, v14.s[2] +str q3, [x0, #240] +mul v21.4S, v21.4S,v13.s[2] +mla v0.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +sqrdmulh v11.4S, v24.4S, v14.s[2] +str q22, [x0, #192] +mul v24.4S, v24.4S,v13.s[2] +mla v21.4S, v25.4S, v31.s[0] +sub v25.4s, v6.4s, v0.4s +add v6.4s, v6.4s, v0.4s +ldr q0, [x17, #+256] +sqrdmulh v22.4S, v8.4S, v1.s[0] +str q30, [x0, #208] +mul v8.4S, v8.4S,v7.s[0] +mla v24.4S, v11.4S, v31.s[0] +sub v11.4s, v4.4s, v21.4s +add v4.4s, v4.4s, v21.4s +ldr q21, [x17, #+272] +sqrdmulh v30.4S, v9.4S, v1.s[1] +mul v9.4S, v9.4S,v7.s[1] +mla v8.4S, v22.4S, v31.s[0] +sub v22.4s, v5.4s, v24.4s +add v5.4s, v5.4s, v24.4s +ldr q24, [x17, #+288] +ldr q3, [x17, #+304] +sqrdmulh v10.4S, v11.4S, v1.s[3] +mul v11.4S, v11.4S,v7.s[3] +ldr q12, [x0, #496] +mla v9.4S, v30.4S, v31.s[0] +sub v30.4s, v6.4s, v8.4s +add v6.4s, v6.4s, v8.4s +sqrdmulh v8.4S, v4.4S, v1.s[2] +mul v4.4S, v4.4S,v7.s[2] +ldr q15, [x0, #480] +mla v11.4S, v10.4S, v31.s[0] +sub v10.4s, v25.4s, v9.4s +add v25.4s, v25.4s, v9.4s +sqrdmulh v9.4S, v12.4S, v21.s[0] +mul v12.4S, 
v12.4S,v0.s[0] +ldr q19, [x0, #448] +mla v4.4S, v8.4S, v31.s[0] +sub v8.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v15.4S, v21.s[0] +str q6, [x0, #256] +mul v15.4S, v15.4S,v0.s[0] +ldr q6, [x0, #464] +mla v12.4S, v9.4S, v31.s[0] +sub v9.4s, v5.4s, v4.4s +add v5.4s, v5.4s, v4.4s +sqrdmulh v1.4S, v19.4S, v21.s[0] +str q30, [x0, #272] +mul v19.4S, v19.4S,v0.s[0] +ldr q30, [x0, #432] +mla v15.4S, v11.4S, v31.s[0] +sub v11.4s, v30.4s, v12.4s +add v30.4s, v30.4s, v12.4s +sqrdmulh v12.4S, v6.4S, v21.s[0] +str q25, [x0, #288] +mul v6.4S, v6.4S,v0.s[0] +ldr q25, [x0, #416] +mla v19.4S, v1.4S, v31.s[0] +sub v1.4s, v25.4s, v15.4s +add v25.4s, v25.4s, v15.4s +sqrdmulh v15.4S, v30.4S, v21.s[1] +str q10, [x0, #304] +mul v30.4S, v30.4S,v0.s[1] +ldr q10, [x0, #384] +mla v6.4S, v12.4S, v31.s[0] +sub v12.4s, v10.4s, v19.4s +add v10.4s, v10.4s, v19.4s +sqrdmulh v19.4S, v25.4S, v21.s[1] +str q22, [x0, #352] +mul v25.4S, v25.4S,v0.s[1] +ldr q22, [x0, #400] +mla v30.4S, v15.4S, v31.s[0] +sub v15.4s, v22.4s, v6.4s +add v22.4s, v22.4s, v6.4s +sqrdmulh v6.4S, v11.4S, v21.s[2] +str q8, [x0, #368] +mul v11.4S, v11.4S,v0.s[2] +mla v25.4S, v19.4S, v31.s[0] +sub v19.4s, v22.4s, v30.4s +add v22.4s, v22.4s, v30.4s +sqrdmulh v30.4S, v1.4S, v21.s[2] +str q5, [x0, #320] +mul v1.4S, v1.4S,v0.s[2] +mla v11.4S, v6.4S, v31.s[0] +sub v6.4s, v10.4s, v25.4s +add v10.4s, v10.4s, v25.4s +ldr q25, [x17, #+320] +sqrdmulh v5.4S, v22.4S, v3.s[0] +str q9, [x0, #336] +mul v22.4S, v22.4S,v24.s[0] +mla v1.4S, v30.4S, v31.s[0] +sub v30.4s, v15.4s, v11.4s +add v15.4s, v15.4s, v11.4s +ldr q11, [x17, #+336] +sqrdmulh v9.4S, v19.4S, v3.s[1] +mul v19.4S, v19.4S,v24.s[1] +mla v22.4S, v5.4S, v31.s[0] +sub v5.4s, v12.4s, v1.4s +add v12.4s, v12.4s, v1.4s +ldr q1, [x17, #+352] +ldr q8, [x17, #+368] +sqrdmulh v7.4S, v30.4S, v3.s[3] +mul v30.4S, v30.4S,v24.s[3] +ldr q14, [x0, #624] +mla v19.4S, v9.4S, v31.s[0] +sub v9.4s, v10.4s, v22.4s +add v10.4s, v10.4s, v22.4s +sqrdmulh v22.4S, v15.4S, v3.s[2] +mul 
v15.4S, v15.4S,v24.s[2] +ldr q13, [x0, #608] +mla v30.4S, v7.4S, v31.s[0] +sub v7.4s, v6.4s, v19.4s +add v6.4s, v6.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v11.s[0] +mul v14.4S, v14.4S,v25.s[0] +ldr q4, [x0, #576] +mla v15.4S, v22.4S, v31.s[0] +sub v22.4s, v5.4s, v30.4s +add v5.4s, v5.4s, v30.4s +sqrdmulh v30.4S, v13.4S, v11.s[0] +str q10, [x0, #384] +mul v13.4S, v13.4S,v25.s[0] +ldr q10, [x0, #592] +mla v14.4S, v19.4S, v31.s[0] +sub v19.4s, v12.4s, v15.4s +add v12.4s, v12.4s, v15.4s +sqrdmulh v3.4S, v4.4S, v11.s[0] +str q9, [x0, #400] +mul v4.4S, v4.4S,v25.s[0] +ldr q9, [x0, #560] +mla v13.4S, v30.4S, v31.s[0] +sub v30.4s, v9.4s, v14.4s +add v9.4s, v9.4s, v14.4s +sqrdmulh v14.4S, v10.4S, v11.s[0] +str q6, [x0, #416] +mul v10.4S, v10.4S,v25.s[0] +ldr q6, [x0, #544] +mla v4.4S, v3.4S, v31.s[0] +sub v3.4s, v6.4s, v13.4s +add v6.4s, v6.4s, v13.4s +sqrdmulh v13.4S, v9.4S, v11.s[1] +str q7, [x0, #432] +mul v9.4S, v9.4S,v25.s[1] +ldr q7, [x0, #512] +mla v10.4S, v14.4S, v31.s[0] +sub v14.4s, v7.4s, v4.4s +add v7.4s, v7.4s, v4.4s +sqrdmulh v4.4S, v6.4S, v11.s[1] +str q5, [x0, #480] +mul v6.4S, v6.4S,v25.s[1] +ldr q5, [x0, #528] +mla v9.4S, v13.4S, v31.s[0] +sub v13.4s, v5.4s, v10.4s +add v5.4s, v5.4s, v10.4s +sqrdmulh v10.4S, v30.4S, v11.s[2] +str q22, [x0, #496] +mul v30.4S, v30.4S,v25.s[2] +mla v6.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v9.4s +add v5.4s, v5.4s, v9.4s +sqrdmulh v9.4S, v3.4S, v11.s[2] +str q12, [x0, #448] +mul v3.4S, v3.4S,v25.s[2] +mla v30.4S, v10.4S, v31.s[0] +sub v10.4s, v7.4s, v6.4s +add v7.4s, v7.4s, v6.4s +ldr q6, [x17, #+384] +sqrdmulh v12.4S, v5.4S, v8.s[0] +str q19, [x0, #464] +mul v5.4S, v5.4S,v1.s[0] +mla v3.4S, v9.4S, v31.s[0] +sub v9.4s, v13.4s, v30.4s +add v13.4s, v13.4s, v30.4s +ldr q30, [x17, #+400] +sqrdmulh v19.4S, v4.4S, v8.s[1] +mul v4.4S, v4.4S,v1.s[1] +mla v5.4S, v12.4S, v31.s[0] +sub v12.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +ldr q3, [x17, #+416] +ldr q22, [x17, #+432] +sqrdmulh v24.4S, v9.4S, v8.s[3] +mul v9.4S, v9.4S,v1.s[3] +ldr 
q21, [x0, #752] +mla v4.4S, v19.4S, v31.s[0] +sub v19.4s, v7.4s, v5.4s +add v7.4s, v7.4s, v5.4s +sqrdmulh v5.4S, v13.4S, v8.s[2] +mul v13.4S, v13.4S,v1.s[2] +ldr q0, [x0, #736] +mla v9.4S, v24.4S, v31.s[0] +sub v24.4s, v10.4s, v4.4s +add v10.4s, v10.4s, v4.4s +sqrdmulh v4.4S, v21.4S, v30.s[0] +mul v21.4S, v21.4S,v6.s[0] +ldr q15, [x0, #704] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v12.4s, v9.4s +add v12.4s, v12.4s, v9.4s +sqrdmulh v9.4S, v0.4S, v30.s[0] +str q7, [x0, #512] +mul v0.4S, v0.4S,v6.s[0] +ldr q7, [x0, #720] +mla v21.4S, v4.4S, v31.s[0] +sub v4.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +sqrdmulh v8.4S, v15.4S, v30.s[0] +str q19, [x0, #528] +mul v15.4S, v15.4S,v6.s[0] +ldr q19, [x0, #688] +mla v0.4S, v9.4S, v31.s[0] +sub v9.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +sqrdmulh v21.4S, v7.4S, v30.s[0] +str q10, [x0, #544] +mul v7.4S, v7.4S,v6.s[0] +ldr q10, [x0, #672] +mla v15.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v0.4s +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v19.4S, v30.s[1] +str q24, [x0, #560] +mul v19.4S, v19.4S,v6.s[1] +ldr q24, [x0, #640] +mla v7.4S, v21.4S, v31.s[0] +sub v21.4s, v24.4s, v15.4s +add v24.4s, v24.4s, v15.4s +sqrdmulh v15.4S, v10.4S, v30.s[1] +str q12, [x0, #608] +mul v10.4S, v10.4S,v6.s[1] +ldr q12, [x0, #656] +mla v19.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v7.4s +add v12.4s, v12.4s, v7.4s +sqrdmulh v7.4S, v9.4S, v30.s[2] +str q5, [x0, #624] +mul v9.4S, v9.4S,v6.s[2] +mla v10.4S, v15.4S, v31.s[0] +sub v15.4s, v12.4s, v19.4s +add v12.4s, v12.4s, v19.4s +sqrdmulh v19.4S, v8.4S, v30.s[2] +str q14, [x0, #576] +mul v8.4S, v8.4S,v6.s[2] +mla v9.4S, v7.4S, v31.s[0] +sub v7.4s, v24.4s, v10.4s +add v24.4s, v24.4s, v10.4s +ldr q10, [x17, #+448] +sqrdmulh v14.4S, v12.4S, v22.s[0] +str q4, [x0, #592] +mul v12.4S, v12.4S,v3.s[0] +mla v8.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v9.4s +add v0.4s, v0.4s, v9.4s +ldr q9, [x17, #+464] +sqrdmulh v4.4S, v15.4S, v22.s[1] +mul v15.4S, v15.4S,v3.s[1] +mla v12.4S, v14.4S, v31.s[0] +sub v14.4s, 
v21.4s, v8.4s +add v21.4s, v21.4s, v8.4s +ldr q8, [x17, #+480] +ldr q5, [x17, #+496] +sqrdmulh v1.4S, v19.4S, v22.s[3] +mul v19.4S, v19.4S,v3.s[3] +ldr q11, [x0, #880] +mla v15.4S, v4.4S, v31.s[0] +sub v4.4s, v24.4s, v12.4s +add v24.4s, v24.4s, v12.4s +sqrdmulh v12.4S, v0.4S, v22.s[2] +mul v0.4S, v0.4S,v3.s[2] +ldr q25, [x0, #864] +mla v19.4S, v1.4S, v31.s[0] +sub v1.4s, v7.4s, v15.4s +add v7.4s, v7.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v9.s[0] +mul v11.4S, v11.4S,v10.s[0] +ldr q13, [x0, #832] +mla v0.4S, v12.4S, v31.s[0] +sub v12.4s, v14.4s, v19.4s +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v25.4S, v9.s[0] +str q24, [x0, #640] +mul v25.4S, v25.4S,v10.s[0] +ldr q24, [x0, #848] +mla v11.4S, v15.4S, v31.s[0] +sub v15.4s, v21.4s, v0.4s +add v21.4s, v21.4s, v0.4s +sqrdmulh v22.4S, v13.4S, v9.s[0] +str q4, [x0, #656] +mul v13.4S, v13.4S,v10.s[0] +ldr q4, [x0, #816] +mla v25.4S, v19.4S, v31.s[0] +sub v19.4s, v4.4s, v11.4s +add v4.4s, v4.4s, v11.4s +sqrdmulh v11.4S, v24.4S, v9.s[0] +str q7, [x0, #672] +mul v24.4S, v24.4S,v10.s[0] +ldr q7, [x0, #800] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v7.4s, v25.4s +add v7.4s, v7.4s, v25.4s +sqrdmulh v25.4S, v4.4S, v9.s[1] +str q1, [x0, #688] +mul v4.4S, v4.4S,v10.s[1] +ldr q1, [x0, #768] +mla v24.4S, v11.4S, v31.s[0] +sub v11.4s, v1.4s, v13.4s +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v7.4S, v9.s[1] +str q14, [x0, #736] +mul v7.4S, v7.4S,v10.s[1] +ldr q14, [x0, #784] +mla v4.4S, v25.4S, v31.s[0] +sub v25.4s, v14.4s, v24.4s +add v14.4s, v14.4s, v24.4s +sqrdmulh v24.4S, v19.4S, v9.s[2] +str q12, [x0, #752] +mul v19.4S, v19.4S,v10.s[2] +mla v7.4S, v13.4S, v31.s[0] +sub v13.4s, v14.4s, v4.4s +add v14.4s, v14.4s, v4.4s +sqrdmulh v4.4S, v22.4S, v9.s[2] +str q21, [x0, #704] +mul v22.4S, v22.4S,v10.s[2] +mla v19.4S, v24.4S, v31.s[0] +sub v24.4s, v1.4s, v7.4s +add v1.4s, v1.4s, v7.4s +ldr q7, [x17, #+512] +sqrdmulh v21.4S, v14.4S, v5.s[0] +str q15, [x0, #720] +mul v14.4S, v14.4S,v8.s[0] +mla v22.4S, v4.4S, v31.s[0] +sub v4.4s, v25.4s, 
v19.4s +add v25.4s, v25.4s, v19.4s +ldr q19, [x17, #+528] +sqrdmulh v15.4S, v13.4S, v5.s[1] +mul v13.4S, v13.4S,v8.s[1] +mla v14.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +ldr q22, [x17, #+544] +ldr q12, [x17, #+560] +sqrdmulh v3.4S, v4.4S, v5.s[3] +mul v4.4S, v4.4S,v8.s[3] +ldr q30, [x0, #1008] +mla v13.4S, v15.4S, v31.s[0] +sub v15.4s, v1.4s, v14.4s +add v1.4s, v1.4s, v14.4s +sqrdmulh v14.4S, v25.4S, v5.s[2] +mul v25.4S, v25.4S,v8.s[2] +ldr q6, [x0, #992] +mla v4.4S, v3.4S, v31.s[0] +sub v3.4s, v24.4s, v13.4s +add v24.4s, v24.4s, v13.4s +sqrdmulh v13.4S, v30.4S, v19.s[0] +mul v30.4S, v30.4S,v7.s[0] +ldr q0, [x0, #960] +mla v25.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v4.4s +add v21.4s, v21.4s, v4.4s +sqrdmulh v4.4S, v6.4S, v19.s[0] +str q1, [x0, #768] +mul v6.4S, v6.4S,v7.s[0] +ldr q1, [x0, #976] +mla v30.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v25.4s +add v11.4s, v11.4s, v25.4s +sqrdmulh v5.4S, v0.4S, v19.s[0] +str q15, [x0, #784] +mul v0.4S, v0.4S,v7.s[0] +ldr q15, [x0, #944] +mla v6.4S, v4.4S, v31.s[0] +sub v4.4s, v15.4s, v30.4s +add v15.4s, v15.4s, v30.4s +sqrdmulh v30.4S, v1.4S, v19.s[0] +str q24, [x0, #800] +mul v1.4S, v1.4S,v7.s[0] +ldr q24, [x0, #928] +mla v0.4S, v5.4S, v31.s[0] +sub v5.4s, v24.4s, v6.4s +add v24.4s, v24.4s, v6.4s +sqrdmulh v6.4S, v15.4S, v19.s[1] +str q3, [x0, #816] +mul v15.4S, v15.4S,v7.s[1] +ldr q3, [x0, #896] +mla v1.4S, v30.4S, v31.s[0] +sub v30.4s, v3.4s, v0.4s +add v3.4s, v3.4s, v0.4s +sqrdmulh v0.4S, v24.4S, v19.s[1] +str q21, [x0, #864] +mul v24.4S, v24.4S,v7.s[1] +ldr q21, [x0, #912] +mla v15.4S, v6.4S, v31.s[0] +sub v6.4s, v21.4s, v1.4s +add v21.4s, v21.4s, v1.4s +sqrdmulh v1.4S, v4.4S, v19.s[2] +str q14, [x0, #880] +mul v4.4S, v4.4S,v7.s[2] +mla v24.4S, v0.4S, v31.s[0] +sub v0.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +sqrdmulh v15.4S, v5.4S, v19.s[2] +str q11, [x0, #832] +mul v5.4S, v5.4S,v7.s[2] +mla v4.4S, v1.4S, v31.s[0] +sub v1.4s, v3.4s, v24.4s +add v3.4s, v3.4s, v24.4s 
+sqrdmulh v24.4S, v21.4S, v12.s[0] +str q13, [x0, #848] +mul v21.4S, v21.4S,v22.s[0] +mla v5.4S, v15.4S, v31.s[0] +sub v15.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +sqrdmulh v4.4S, v0.4S, v12.s[1] +mul v0.4S, v0.4S,v22.s[1] +mla v21.4S, v24.4S, v31.s[0] +sub v24.4s, v30.4s, v5.4s +add v30.4s, v30.4s, v5.4s +sqrdmulh v5.4S, v15.4S, v12.s[3] +mul v15.4S, v15.4S,v22.s[3] +mla v0.4S, v4.4S, v31.s[0] +sub v4.4s, v3.4s, v21.4s +add v3.4s, v3.4s, v21.4s +sqrdmulh v21.4S, v6.4S, v12.s[2] +mul v6.4S, v6.4S,v22.s[2] +mla v15.4S, v5.4S, v31.s[0] +sub v5.4s, v1.4s, v0.4s +add v1.4s, v1.4s, v0.4s +mla v6.4S, v21.4S, v31.s[0] +sub v21.4s, v24.4s, v15.4s +add v24.4s, v24.4s, v15.4s +str q3, [x0, #896] +sub v3.4s, v30.4s, v6.4s +add v30.4s, v30.4s, v6.4s +str q4, [x0, #912] +str q1, [x0, #928] +str q5, [x0, #944] +str q24, [x0, #992] +str q21, [x0, #1008] +str q30, [x0, #960] +str q3, [x0, #976] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1444 +// Instruction count: 1440 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_3.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_3.s new file mode 100644 index 0000000..eebf2a2 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_3.s @@ -0,0 +1,1474 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, 
merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 23825509 // Layer 4, block 0 +.word 27028662 // Layer 4, block 1 +.word 0 // Layer None, block None +.word 1307297022 // Layer 3, block 0 +.word 1524716204 // Layer 4, block 0 +.word 1729702351 // Layer 4, block 1 +.word 0 // Layer None, block None +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 341080 // Layer 5, block 2 +.word 
21220783 // Layer 5, block 3 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 14626653 // Layer 3, block 1 +.word 14833295 // Layer 4, block 2 +.word 2138810 // Layer 4, block 3 +.word 0 // Layer None, block None +.word 936034350 // Layer 3, block 1 +.word 949258429 // Layer 4, block 2 +.word 136873393 // Layer 4, block 3 +.word 0 // Layer None, block None +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 29737761 // Layer 3, block 2 +.word 6490403 // Layer 4, block 4 +.word 19648405 // Layer 4, block 5 +.word 0 // Layer None, block None +.word 1903071454 // Layer 3, block 2 +.word 415354091 // Layer 4, block 4 +.word 1257401950 // Layer 4, block 5 +.word 0 // Layer None, block None +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 30285189 // Layer 3, block 3 +.word 31254932 // Layer 4, block 6 +.word 26362414 // Layer 4, block 7 +.word 0 // Layer None, block None +.word 1938104173 // Layer 3, block 3 +.word 2000162988 // Layer 4, block 6 +.word 1687065733 // Layer 4, block 7 +.word 0 // Layer None, block None +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 21289485 // Layer 3, block 4 +.word 572895 // Layer 4, 
block 8 +.word 26691971 // Layer 4, block 9 +.word 0 // Layer None, block None +.word 1362423055 // Layer 3, block 4 +.word 36662482 // Layer 4, block 8 +.word 1708155771 // Layer 4, block 9 +.word 0 // Layer None, block None +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 9914896 // Layer 3, block 5 +.word 9249292 // Layer 4, block 10 +.word 29292862 // Layer 4, block 11 +.word 0 // Layer None, block None +.word 634504916 // Layer 3, block 5 +.word 591909511 // Layer 4, block 10 +.word 1874600091 // Layer 4, block 11 +.word 0 // Layer None, block None +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 22603682 // Layer 3, block 6 +.word 8247799 // Layer 4, block 12 +.word 5086187 // Layer 4, block 13 +.word 0 // Layer None, block None +.word 1446525244 // Layer 3, block 6 +.word 527818851 // Layer 4, block 12 +.word 325491125 // Layer 4, block 13 +.word 0 // Layer None, block None +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 16204162 // Layer 3, block 7 +.word 28113639 // Layer 4, block 14 +.word 8471290 // Layer 4, block 15 +.word 0 // Layer None, block None +.word 1036987221 // Layer 3, block 7 +.word 1799135579 // Layer 4, block 14 +.word 542121183 // Layer 4, block 15 +.word 0 // Layer None, 
block None +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.text +.global ntt_u32_incomplete_neon_asm_var_3_3_3 +.global _ntt_u32_incomplete_neon_asm_var_3_3_3 +ntt_u32_incomplete_neon_asm_var_3_3_3: +_ntt_u32_incomplete_neon_asm_var_3_3_3: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x0, #960] +ldr q29, [x0, #832] +ldr q28, [x0, #576] +ldr q27, [x0, #704] +ldr q26, [x0, #448] +ldr q25, [x17, #+0] +ldr q24, [x17, #+16] +ldr q23, [x17, #+32] +ldr q22, [x17, #+48] +ldr q21, [x0, #320] +ldr q20, [x0, #64] +ldr q19, [x0, #192] +sqrdmulh v18.4S, v30.4S, v24.s[0] +mul v30.4S, v30.4S,v25.s[0] +sqrdmulh v17.4S, v29.4S, v24.s[0] +mul v29.4S, v29.4S,v25.s[0] +mla v30.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v28.4S, v24.s[0] +mul v28.4S, v28.4S,v25.s[0] +ldr q16, [x0, #976] +mla v29.4S, v17.4S, v31.s[0] +sub v17.4s, v26.4s, v30.4s +add v26.4s, v26.4s, v30.4s +sqrdmulh v30.4S, v27.4S, v24.s[0] +mul v27.4S, v27.4S,v25.s[0] +ldr q3, [x0, #848] +mla v28.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v29.4s +add v21.4s, v21.4s, v29.4s +sqrdmulh v29.4S, v26.4S, v24.s[1] +mul v26.4S, v26.4S,v25.s[1] +ldr q2, [x0, #592] +mla v27.4S, v30.4S, v31.s[0] +sub v30.4s, v20.4s, v28.4s +add v20.4s, v20.4s, v28.4s +sqrdmulh v28.4S, v21.4S, v24.s[1] +mul v21.4S, v21.4S,v25.s[1] +ldr q1, [x0, #720] +mla v26.4S, v29.4S, v31.s[0] +sub v29.4s, v19.4s, 
v27.4s +add v19.4s, v19.4s, v27.4s +sqrdmulh v27.4S, v17.4S, v24.s[2] +mul v17.4S, v17.4S,v25.s[2] +ldr q0, [x0, #464] +mla v21.4S, v28.4S, v31.s[0] +sub v28.4s, v19.4s, v26.4s +add v19.4s, v19.4s, v26.4s +sqrdmulh v26.4S, v18.4S, v24.s[2] +mul v18.4S, v18.4S,v25.s[2] +ldr q15, [x0, #336] +mla v17.4S, v27.4S, v31.s[0] +sub v27.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +ldr q14, [x0, #80] +mla v18.4S, v26.4S, v31.s[0] +sub v26.4s, v29.4s, v17.4s +add v29.4s, v29.4s, v17.4s +sqrdmulh v17.4S, v28.4S, v22.s[1] +mul v28.4S, v28.4S,v23.s[1] +ldr q13, [x0, #208] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v30.4s, v18.4s +add v30.4s, v30.4s, v18.4s +sqrdmulh v18.4S, v26.4S, v22.s[3] +mul v26.4S, v26.4S,v23.s[3] +mla v28.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +sqrdmulh v19.4S, v29.4S, v22.s[2] +mul v29.4S, v29.4S,v23.s[2] +mla v26.4S, v18.4S, v31.s[0] +sub v18.4s, v27.4s, v28.4s +add v27.4s, v27.4s, v28.4s +sqrdmulh v28.4S, v16.4S, v24.s[0] +mul v16.4S, v16.4S,v25.s[0] +mla v29.4S, v19.4S, v31.s[0] +sub v19.4s, v21.4s, v26.4s +add v21.4s, v21.4s, v26.4s +sqrdmulh v26.4S, v3.4S, v24.s[0] +mul v3.4S, v3.4S,v25.s[0] +mla v16.4S, v28.4S, v31.s[0] +sub v28.4s, v30.4s, v29.4s +add v30.4s, v30.4s, v29.4s +sqrdmulh v29.4S, v2.4S, v24.s[0] +mul v2.4S, v2.4S,v25.s[0] +ldr q12, [x0, #992] +mla v3.4S, v26.4S, v31.s[0] +sub v26.4s, v0.4s, v16.4s +add v0.4s, v0.4s, v16.4s +sqrdmulh v16.4S, v1.4S, v24.s[0] +mul v1.4S, v1.4S,v25.s[0] +ldr q11, [x0, #864] +mla v2.4S, v29.4S, v31.s[0] +sub v29.4s, v15.4s, v3.4s +add v15.4s, v15.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v24.s[1] +str q20, [x0, #64] +mul v0.4S, v0.4S,v25.s[1] +ldr q20, [x0, #608] +mla v1.4S, v16.4S, v31.s[0] +sub v16.4s, v14.4s, v2.4s +add v14.4s, v14.4s, v2.4s +sqrdmulh v2.4S, v15.4S, v24.s[1] +str q17, [x0, #192] +mul v15.4S, v15.4S,v25.s[1] +ldr q17, [x0, #736] +mla v0.4S, v3.4S, v31.s[0] +sub v3.4s, v13.4s, v1.4s +add v13.4s, 
v13.4s, v1.4s +sqrdmulh v1.4S, v26.4S, v24.s[2] +str q27, [x0, #320] +mul v26.4S, v26.4S,v25.s[2] +ldr q27, [x0, #480] +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v13.4s, v0.4s +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v29.4S, v24.s[2] +str q18, [x0, #448] +mul v29.4S, v29.4S,v25.s[2] +ldr q18, [x0, #352] +mla v26.4S, v1.4S, v31.s[0] +sub v1.4s, v14.4s, v15.4s +add v14.4s, v14.4s, v15.4s +sqrdmulh v15.4S, v13.4S, v22.s[0] +str q21, [x0, #832] +mul v13.4S, v13.4S,v23.s[0] +ldr q21, [x0, #96] +mla v29.4S, v0.4S, v31.s[0] +sub v0.4s, v3.4s, v26.4s +add v3.4s, v3.4s, v26.4s +sqrdmulh v26.4S, v2.4S, v22.s[1] +str q19, [x0, #960] +mul v2.4S, v2.4S,v23.s[1] +ldr q19, [x0, #224] +mla v13.4S, v15.4S, v31.s[0] +sub v15.4s, v16.4s, v29.4s +add v16.4s, v16.4s, v29.4s +sqrdmulh v29.4S, v0.4S, v22.s[3] +str q30, [x0, #576] +mul v0.4S, v0.4S,v23.s[3] +mla v2.4S, v26.4S, v31.s[0] +sub v26.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +sqrdmulh v13.4S, v3.4S, v22.s[2] +str q28, [x0, #704] +mul v3.4S, v3.4S,v23.s[2] +mla v0.4S, v29.4S, v31.s[0] +sub v29.4s, v1.4s, v2.4s +add v1.4s, v1.4s, v2.4s +sqrdmulh v2.4S, v12.4S, v24.s[0] +mul v12.4S, v12.4S,v25.s[0] +mla v3.4S, v13.4S, v31.s[0] +sub v13.4s, v15.4s, v0.4s +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v11.4S, v24.s[0] +mul v11.4S, v11.4S,v25.s[0] +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v20.4S, v24.s[0] +mul v20.4S, v20.4S,v25.s[0] +ldr q28, [x0, #1008] +mla v11.4S, v0.4S, v31.s[0] +sub v0.4s, v27.4s, v12.4s +add v27.4s, v27.4s, v12.4s +sqrdmulh v12.4S, v17.4S, v24.s[0] +mul v17.4S, v17.4S,v25.s[0] +ldr q30, [x0, #880] +mla v20.4S, v3.4S, v31.s[0] +sub v3.4s, v18.4s, v11.4s +add v18.4s, v18.4s, v11.4s +sqrdmulh v11.4S, v27.4S, v24.s[1] +str q14, [x0, #80] +mul v27.4S, v27.4S,v25.s[1] +ldr q14, [x0, #624] +mla v17.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v18.4S, v24.s[1] +str q26, [x0, #208] +mul v18.4S, v18.4S,v25.s[1] 
+ldr q26, [x0, #752] +mla v27.4S, v11.4S, v31.s[0] +sub v11.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v24.s[2] +str q1, [x0, #336] +mul v0.4S, v0.4S,v25.s[2] +ldr q1, [x0, #496] +mla v18.4S, v20.4S, v31.s[0] +sub v20.4s, v19.4s, v27.4s +add v19.4s, v19.4s, v27.4s +sqrdmulh v27.4S, v3.4S, v24.s[2] +str q29, [x0, #464] +mul v3.4S, v3.4S,v25.s[2] +ldr q29, [x0, #368] +mla v0.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v18.4s +add v21.4s, v21.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v22.s[0] +str q15, [x0, #848] +mul v19.4S, v19.4S,v23.s[0] +ldr q15, [x0, #112] +mla v3.4S, v27.4S, v31.s[0] +sub v27.4s, v11.4s, v0.4s +add v11.4s, v11.4s, v0.4s +sqrdmulh v0.4S, v20.4S, v22.s[1] +str q13, [x0, #976] +mul v20.4S, v20.4S,v23.s[1] +ldr q13, [x0, #240] +mla v19.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v3.4s +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v27.4S, v22.s[3] +str q16, [x0, #592] +mul v27.4S, v27.4S,v23.s[3] +mla v20.4S, v0.4S, v31.s[0] +sub v0.4s, v21.4s, v19.4s +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v11.4S, v22.s[2] +str q2, [x0, #720] +mul v11.4S, v11.4S,v23.s[2] +mla v27.4S, v3.4S, v31.s[0] +sub v3.4s, v17.4s, v20.4s +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v28.4S, v24.s[0] +mul v28.4S, v28.4S,v25.s[0] +mla v11.4S, v19.4S, v31.s[0] +sub v19.4s, v18.4s, v27.4s +add v18.4s, v18.4s, v27.4s +sqrdmulh v27.4S, v30.4S, v24.s[0] +mul v30.4S, v30.4S,v25.s[0] +mla v28.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v11.4s +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v14.4S, v24.s[0] +mul v14.4S, v14.4S,v25.s[0] +ldr q2, [x0, #896] +mla v30.4S, v27.4S, v31.s[0] +sub v27.4s, v1.4s, v28.4s +add v1.4s, v1.4s, v28.4s +sqrdmulh v28.4S, v26.4S, v24.s[0] +mul v26.4S, v26.4S,v25.s[0] +ldr q16, [x0, #768] +mla v14.4S, v11.4S, v31.s[0] +sub v11.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +sqrdmulh v30.4S, v1.4S, v24.s[1] +str q21, [x0, #96] +mul v1.4S, v1.4S,v25.s[1] +ldr q21, [x0, #512] +mla v26.4S, v28.4S, v31.s[0] +sub v28.4s, v15.4s, v14.4s +add 
v15.4s, v15.4s, v14.4s +sqrdmulh v14.4S, v29.4S, v24.s[1] +str q0, [x0, #224] +mul v29.4S, v29.4S,v25.s[1] +ldr q0, [x0, #640] +mla v1.4S, v30.4S, v31.s[0] +sub v30.4s, v13.4s, v26.4s +add v13.4s, v13.4s, v26.4s +sqrdmulh v26.4S, v27.4S, v24.s[2] +str q17, [x0, #352] +mul v27.4S, v27.4S,v25.s[2] +ldr q17, [x0, #384] +mla v29.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v11.4S, v24.s[2] +str q3, [x0, #480] +mul v11.4S, v11.4S,v25.s[2] +ldr q3, [x0, #256] +mla v27.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v29.4s +add v15.4s, v15.4s, v29.4s +sqrdmulh v29.4S, v13.4S, v22.s[0] +str q18, [x0, #864] +mul v13.4S, v13.4S,v23.s[0] +ldr q18, [x0, #0] +mla v11.4S, v1.4S, v31.s[0] +sub v1.4s, v30.4s, v27.4s +add v30.4s, v30.4s, v27.4s +sqrdmulh v27.4S, v14.4S, v22.s[1] +str q19, [x0, #992] +mul v14.4S, v14.4S,v23.s[1] +ldr q19, [x0, #128] +mla v13.4S, v29.4S, v31.s[0] +sub v29.4s, v28.4s, v11.4s +add v28.4s, v28.4s, v11.4s +sqrdmulh v11.4S, v1.4S, v22.s[3] +str q12, [x0, #608] +mul v1.4S, v1.4S,v23.s[3] +mla v14.4S, v27.4S, v31.s[0] +sub v27.4s, v15.4s, v13.4s +add v15.4s, v15.4s, v13.4s +sqrdmulh v13.4S, v30.4S, v22.s[2] +str q20, [x0, #736] +mul v30.4S, v30.4S,v23.s[2] +mla v1.4S, v11.4S, v31.s[0] +sub v11.4s, v26.4s, v14.4s +add v26.4s, v26.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v24.s[0] +mul v2.4S, v2.4S,v25.s[0] +mla v30.4S, v13.4S, v31.s[0] +sub v13.4s, v29.4s, v1.4s +add v29.4s, v29.4s, v1.4s +sqrdmulh v1.4S, v16.4S, v24.s[0] +mul v16.4S, v16.4S,v25.s[0] +mla v2.4S, v14.4S, v31.s[0] +sub v14.4s, v28.4s, v30.4s +add v28.4s, v28.4s, v30.4s +sqrdmulh v30.4S, v21.4S, v24.s[0] +mul v21.4S, v21.4S,v25.s[0] +ldr q20, [x0, #912] +mla v16.4S, v1.4S, v31.s[0] +sub v1.4s, v17.4s, v2.4s +add v17.4s, v17.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v24.s[0] +mul v0.4S, v0.4S,v25.s[0] +ldr q12, [x0, #784] +mla v21.4S, v30.4S, v31.s[0] +sub v30.4s, v3.4s, v16.4s +add v3.4s, v3.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v24.s[1] +str q15, [x0, #112] +mul 
v17.4S, v17.4S,v25.s[1] +ldr q15, [x0, #528] +mla v0.4S, v2.4S, v31.s[0] +sub v2.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v3.4S, v24.s[1] +str q27, [x0, #240] +mul v3.4S, v3.4S,v25.s[1] +ldr q27, [x0, #656] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v19.4s, v0.4s +add v19.4s, v19.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v24.s[2] +str q26, [x0, #368] +mul v1.4S, v1.4S,v25.s[2] +ldr q26, [x0, #400] +mla v3.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +sqrdmulh v17.4S, v30.4S, v24.s[2] +str q11, [x0, #496] +mul v30.4S, v30.4S,v25.s[2] +ldr q11, [x0, #272] +mla v1.4S, v0.4S, v31.s[0] +sub v0.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v19.4S, v22.s[0] +str q29, [x0, #880] +mul v19.4S, v19.4S,v23.s[0] +ldr q29, [x0, #16] +mla v30.4S, v17.4S, v31.s[0] +sub v17.4s, v16.4s, v1.4s +add v16.4s, v16.4s, v1.4s +sqrdmulh v1.4S, v21.4S, v22.s[1] +str q13, [x0, #1008] +mul v21.4S, v21.4S,v23.s[1] +ldr q13, [x0, #144] +mla v19.4S, v3.4S, v31.s[0] +sub v3.4s, v2.4s, v30.4s +add v2.4s, v2.4s, v30.4s +sqrdmulh v30.4S, v17.4S, v22.s[3] +str q28, [x0, #624] +mul v17.4S, v17.4S,v23.s[3] +mla v21.4S, v1.4S, v31.s[0] +sub v1.4s, v18.4s, v19.4s +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v22.s[2] +str q14, [x0, #752] +mul v16.4S, v16.4S,v23.s[2] +mla v17.4S, v30.4S, v31.s[0] +sub v30.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +sqrdmulh v21.4S, v20.4S, v24.s[0] +mul v20.4S, v20.4S,v25.s[0] +mla v16.4S, v19.4S, v31.s[0] +sub v19.4s, v3.4s, v17.4s +add v3.4s, v3.4s, v17.4s +sqrdmulh v17.4S, v12.4S, v24.s[0] +mul v12.4S, v12.4S,v25.s[0] +mla v20.4S, v21.4S, v31.s[0] +sub v21.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v15.4S, v24.s[0] +mul v15.4S, v15.4S,v25.s[0] +ldr q14, [x0, #928] +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v26.4s, v20.4s +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v27.4S, v24.s[0] +mul v27.4S, v27.4S,v25.s[0] +ldr q28, [x0, #800] +mla v15.4S, v16.4S, v31.s[0] +sub v16.4s, 
v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +sqrdmulh v12.4S, v26.4S, v24.s[1] +str q18, [x0, #0] +mul v26.4S, v26.4S,v25.s[1] +ldr q18, [x0, #544] +mla v27.4S, v20.4S, v31.s[0] +sub v20.4s, v29.4s, v15.4s +add v29.4s, v29.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v24.s[1] +str q1, [x0, #128] +mul v11.4S, v11.4S,v25.s[1] +ldr q1, [x0, #672] +mla v26.4S, v12.4S, v31.s[0] +sub v12.4s, v13.4s, v27.4s +add v13.4s, v13.4s, v27.4s +sqrdmulh v27.4S, v17.4S, v24.s[2] +str q0, [x0, #256] +mul v17.4S, v17.4S,v25.s[2] +ldr q0, [x0, #416] +mla v11.4S, v15.4S, v31.s[0] +sub v15.4s, v13.4s, v26.4s +add v13.4s, v13.4s, v26.4s +sqrdmulh v26.4S, v16.4S, v24.s[2] +str q30, [x0, #384] +mul v16.4S, v16.4S,v25.s[2] +ldr q30, [x0, #288] +mla v17.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v11.4s +add v29.4s, v29.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v22.s[0] +str q3, [x0, #768] +mul v13.4S, v13.4S,v23.s[0] +ldr q3, [x0, #32] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v12.4s, v17.4s +add v12.4s, v12.4s, v17.4s +sqrdmulh v17.4S, v15.4S, v22.s[1] +str q19, [x0, #896] +mul v15.4S, v15.4S,v23.s[1] +ldr q19, [x0, #160] +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v20.4s, v16.4s +add v20.4s, v20.4s, v16.4s +sqrdmulh v16.4S, v26.4S, v22.s[3] +str q2, [x0, #512] +mul v26.4S, v26.4S,v23.s[3] +mla v15.4S, v17.4S, v31.s[0] +sub v17.4s, v29.4s, v13.4s +add v29.4s, v29.4s, v13.4s +sqrdmulh v13.4S, v12.4S, v22.s[2] +str q21, [x0, #640] +mul v12.4S, v12.4S,v23.s[2] +mla v26.4S, v16.4S, v31.s[0] +sub v16.4s, v27.4s, v15.4s +add v27.4s, v27.4s, v15.4s +sqrdmulh v15.4S, v14.4S, v24.s[0] +mul v14.4S, v14.4S,v25.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v26.4s +add v11.4s, v11.4s, v26.4s +sqrdmulh v26.4S, v28.4S, v24.s[0] +mul v28.4S, v28.4S,v25.s[0] +mla v14.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +sqrdmulh v12.4S, v18.4S, v24.s[0] +mul v18.4S, v18.4S,v25.s[0] +ldr q21, [x0, #944] +mla v28.4S, v26.4S, v31.s[0] +sub v26.4s, v0.4s, v14.4s +add v0.4s, v0.4s, v14.4s 
+sqrdmulh v14.4S, v1.4S, v24.s[0] +mul v1.4S, v1.4S,v25.s[0] +ldr q2, [x0, #816] +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v30.4s, v28.4s +add v30.4s, v30.4s, v28.4s +sqrdmulh v28.4S, v0.4S, v24.s[1] +str q29, [x0, #16] +mul v0.4S, v0.4S,v25.s[1] +ldr q29, [x0, #560] +mla v1.4S, v14.4S, v31.s[0] +sub v14.4s, v3.4s, v18.4s +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v30.4S, v24.s[1] +str q17, [x0, #144] +mul v30.4S, v30.4S,v25.s[1] +ldr q17, [x0, #688] +mla v0.4S, v28.4S, v31.s[0] +sub v28.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v26.4S, v24.s[2] +str q27, [x0, #272] +mul v26.4S, v26.4S,v25.s[2] +ldr q27, [x0, #432] +mla v30.4S, v18.4S, v31.s[0] +sub v18.4s, v19.4s, v0.4s +add v19.4s, v19.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v24.s[2] +str q16, [x0, #400] +mul v12.4S, v12.4S,v25.s[2] +ldr q16, [x0, #304] +mla v26.4S, v1.4S, v31.s[0] +sub v1.4s, v3.4s, v30.4s +add v3.4s, v3.4s, v30.4s +sqrdmulh v30.4S, v19.4S, v22.s[0] +str q11, [x0, #784] +mul v19.4S, v19.4S,v23.s[0] +ldr q11, [x0, #48] +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sqrdmulh v26.4S, v18.4S, v22.s[1] +str q13, [x0, #912] +mul v18.4S, v18.4S,v23.s[1] +ldr q13, [x0, #176] +mla v19.4S, v30.4S, v31.s[0] +sub v30.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v0.4S, v22.s[3] +str q20, [x0, #528] +mul v0.4S, v0.4S,v23.s[3] +mla v18.4S, v26.4S, v31.s[0] +sub v26.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v28.4S, v22.s[2] +str q15, [x0, #656] +mul v28.4S, v28.4S,v23.s[2] +mla v0.4S, v12.4S, v31.s[0] +sub v12.4s, v1.4s, v18.4s +add v1.4s, v1.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v24.s[0] +mul v21.4S, v21.4S,v25.s[0] +mla v28.4S, v19.4S, v31.s[0] +sub v19.4s, v30.4s, v0.4s +add v30.4s, v30.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v24.s[0] +mul v2.4S, v2.4S,v25.s[0] +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v14.4s, v28.4s +add v14.4s, v14.4s, v28.4s +sqrdmulh v28.4S, v29.4S, v24.s[0] +mul v29.4S, v29.4S,v25.s[0] +mla v2.4S, 
v0.4S, v31.s[0] +sub v0.4s, v27.4s, v21.4s +add v27.4s, v27.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v24.s[0] +mul v17.4S, v17.4S,v25.s[0] +mla v29.4S, v28.4S, v31.s[0] +sub v28.4s, v16.4s, v2.4s +add v16.4s, v16.4s, v2.4s +sqrdmulh v2.4S, v27.4S, v24.s[1] +str q3, [x0, #32] +mul v27.4S, v27.4S,v25.s[1] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v29.4s +add v11.4s, v11.4s, v29.4s +sqrdmulh v29.4S, v16.4S, v24.s[1] +str q26, [x0, #160] +mul v16.4S, v16.4S,v25.s[1] +mla v27.4S, v2.4S, v31.s[0] +sub v2.4s, v13.4s, v17.4s +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v24.s[2] +str q1, [x0, #288] +mul v0.4S, v0.4S,v25.s[2] +mla v16.4S, v29.4S, v31.s[0] +sub v29.4s, v13.4s, v27.4s +add v13.4s, v13.4s, v27.4s +sqrdmulh v27.4S, v28.4S, v24.s[2] +str q12, [x0, #416] +mul v28.4S, v28.4S,v25.s[2] +mla v0.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v16.4s +add v11.4s, v11.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v22.s[0] +str q30, [x0, #800] +mul v13.4S, v13.4S,v23.s[0] +mla v28.4S, v27.4S, v31.s[0] +sub v27.4s, v2.4s, v0.4s +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v29.4S, v22.s[1] +str q19, [x0, #928] +mul v29.4S, v29.4S,v23.s[1] +mla v13.4S, v16.4S, v31.s[0] +sub v16.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +sqrdmulh v28.4S, v27.4S, v22.s[3] +str q14, [x0, #544] +mul v27.4S, v27.4S,v23.s[3] +mla v29.4S, v0.4S, v31.s[0] +sub v0.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v2.4S, v22.s[2] +str q18, [x0, #672] +mul v2.4S, v2.4S,v23.s[2] +mla v27.4S, v28.4S, v31.s[0] +sub v28.4s, v17.4s, v29.4s +add v17.4s, v17.4s, v29.4s +mla v2.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v27.4s +add v16.4s, v16.4s, v27.4s +sub v27.4s, v21.4s, v2.4s +add v21.4s, v21.4s, v2.4s +str q11, [x0, #48] +str q0, [x0, #176] +str q17, [x0, #304] +str q28, [x0, #432] +str q16, [x0, #816] +str q13, [x0, #944] +str q21, [x0, #560] +str q27, [x0, #688] +ldr q4, [x0, #112] +ldr q5, [x0, #96] +ldr q6, [x0, #64] +ldr q7, [x0, #80] +ldr q8, [x0, #48] +ldr q9, [x17, #+64] +ldr q10, 
[x17, #+80] +ldr q20, [x17, #+96] +ldr q15, [x17, #+112] +ldr q3, [x0, #32] +ldr q26, [x0, #0] +ldr q1, [x0, #16] +sqrdmulh v12.4S, v4.4S, v10.s[0] +mul v4.4S, v4.4S,v9.s[0] +sqrdmulh v30.4S, v5.4S, v10.s[0] +mul v5.4S, v5.4S,v9.s[0] +mla v4.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v6.4S, v10.s[0] +mul v6.4S, v6.4S,v9.s[0] +ldr q19, [x0, #240] +mla v5.4S, v30.4S, v31.s[0] +sub v30.4s, v8.4s, v4.4s +add v8.4s, v8.4s, v4.4s +sqrdmulh v4.4S, v7.4S, v10.s[0] +mul v7.4S, v7.4S,v9.s[0] +ldr q14, [x0, #224] +mla v6.4S, v12.4S, v31.s[0] +sub v12.4s, v3.4s, v5.4s +add v3.4s, v3.4s, v5.4s +sqrdmulh v5.4S, v8.4S, v10.s[1] +mul v8.4S, v8.4S,v9.s[1] +ldr q18, [x0, #192] +mla v7.4S, v4.4S, v31.s[0] +sub v4.4s, v26.4s, v6.4s +add v26.4s, v26.4s, v6.4s +sqrdmulh v6.4S, v3.4S, v10.s[1] +mul v3.4S, v3.4S,v9.s[1] +ldr q29, [x0, #208] +mla v8.4S, v5.4S, v31.s[0] +sub v5.4s, v1.4s, v7.4s +add v1.4s, v1.4s, v7.4s +sqrdmulh v7.4S, v30.4S, v10.s[2] +mul v30.4S, v30.4S,v9.s[2] +ldr q2, [x0, #176] +mla v3.4S, v6.4S, v31.s[0] +sub v6.4s, v1.4s, v8.4s +add v1.4s, v1.4s, v8.4s +ldr q8, [x17, #+128] +ldr q25, [x17, #+144] +ldr q24, [x17, #+160] +ldr q23, [x17, #+176] +sqrdmulh v22.4S, v12.4S, v10.s[2] +mul v12.4S, v12.4S,v9.s[2] +ldr q11, [x0, #160] +mla v30.4S, v7.4S, v31.s[0] +sub v7.4s, v26.4s, v3.4s +add v26.4s, v26.4s, v3.4s +sqrdmulh v3.4S, v1.4S, v15.s[0] +mul v1.4S, v1.4S,v20.s[0] +ldr q0, [x0, #128] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v5.4s, v30.4s +add v5.4s, v5.4s, v30.4s +sqrdmulh v30.4S, v6.4S, v15.s[1] +mul v6.4S, v6.4S,v20.s[1] +ldr q17, [x0, #144] +mla v1.4S, v3.4S, v31.s[0] +sub v3.4s, v4.4s, v12.4s +add v4.4s, v4.4s, v12.4s +sqrdmulh v12.4S, v22.4S, v15.s[3] +mul v22.4S, v22.4S,v20.s[3] +mla v6.4S, v30.4S, v31.s[0] +sub v30.4s, v26.4s, v1.4s +add v26.4s, v26.4s, v1.4s +sqrdmulh v1.4S, v5.4S, v15.s[2] +mul v5.4S, v5.4S,v20.s[2] +mla v22.4S, v12.4S, v31.s[0] +sub v12.4s, v7.4s, v6.4s +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v19.4S, v25.s[0] +mul v19.4S, v19.4S,v8.s[0] +mla 
v5.4S, v1.4S, v31.s[0] +sub v1.4s, v3.4s, v22.4s +add v3.4s, v3.4s, v22.4s +sqrdmulh v22.4S, v14.4S, v25.s[0] +mul v14.4S, v14.4S,v8.s[0] +mla v19.4S, v6.4S, v31.s[0] +sub v6.4s, v4.4s, v5.4s +add v4.4s, v4.4s, v5.4s +sqrdmulh v15.4S, v18.4S, v25.s[0] +mul v18.4S, v18.4S,v8.s[0] +ldr q20, [x0, #368] +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v2.4s, v19.4s +add v2.4s, v2.4s, v19.4s +sqrdmulh v19.4S, v29.4S, v25.s[0] +mul v29.4S, v29.4S,v8.s[0] +ldr q10, [x0, #352] +mla v18.4S, v15.4S, v31.s[0] +sub v15.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v25.s[1] +str q26, [x0, #0] +mul v2.4S, v2.4S,v8.s[1] +ldr q26, [x0, #320] +mla v29.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v18.4s +add v0.4s, v0.4s, v18.4s +sqrdmulh v18.4S, v11.4S, v25.s[1] +str q30, [x0, #16] +mul v11.4S, v11.4S,v8.s[1] +ldr q30, [x0, #336] +mla v2.4S, v14.4S, v31.s[0] +sub v14.4s, v17.4s, v29.4s +add v17.4s, v17.4s, v29.4s +sqrdmulh v29.4S, v22.4S, v25.s[2] +str q7, [x0, #32] +mul v22.4S, v22.4S,v8.s[2] +ldr q7, [x0, #304] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v17.4s, v2.4s +add v17.4s, v17.4s, v2.4s +ldr q2, [x17, #+192] +ldr q9, [x17, #+208] +ldr q5, [x17, #+224] +ldr q28, [x17, #+240] +sqrdmulh v16.4S, v15.4S, v25.s[2] +str q12, [x0, #48] +mul v15.4S, v15.4S,v8.s[2] +ldr q12, [x0, #288] +mla v22.4S, v29.4S, v31.s[0] +sub v29.4s, v0.4s, v11.4s +add v0.4s, v0.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v23.s[0] +str q3, [x0, #96] +mul v17.4S, v17.4S,v24.s[0] +ldr q3, [x0, #256] +mla v15.4S, v16.4S, v31.s[0] +sub v16.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +sqrdmulh v22.4S, v18.4S, v23.s[1] +str q1, [x0, #112] +mul v18.4S, v18.4S,v24.s[1] +ldr q1, [x0, #272] +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v19.4s, v15.4s +add v19.4s, v19.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v23.s[3] +str q4, [x0, #64] +mul v16.4S, v16.4S,v24.s[3] +mla v18.4S, v22.4S, v31.s[0] +sub v22.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v23.s[2] +str q6, [x0, #80] +mul v14.4S, 
v14.4S,v24.s[2] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v29.4s, v18.4s +add v29.4s, v29.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v9.s[0] +mul v20.4S, v20.4S,v2.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v16.4s +add v11.4s, v11.4s, v16.4s +sqrdmulh v16.4S, v10.4S, v9.s[0] +mul v10.4S, v10.4S,v2.s[0] +mla v20.4S, v18.4S, v31.s[0] +sub v18.4s, v19.4s, v14.4s +add v19.4s, v19.4s, v14.4s +sqrdmulh v23.4S, v26.4S, v9.s[0] +mul v26.4S, v26.4S,v2.s[0] +ldr q24, [x0, #496] +mla v10.4S, v16.4S, v31.s[0] +sub v16.4s, v7.4s, v20.4s +add v7.4s, v7.4s, v20.4s +sqrdmulh v20.4S, v30.4S, v9.s[0] +mul v30.4S, v30.4S,v2.s[0] +ldr q25, [x0, #480] +mla v26.4S, v23.4S, v31.s[0] +sub v23.4s, v12.4s, v10.4s +add v12.4s, v12.4s, v10.4s +sqrdmulh v10.4S, v7.4S, v9.s[1] +str q0, [x0, #128] +mul v7.4S, v7.4S,v2.s[1] +ldr q0, [x0, #448] +mla v30.4S, v20.4S, v31.s[0] +sub v20.4s, v3.4s, v26.4s +add v3.4s, v3.4s, v26.4s +sqrdmulh v26.4S, v12.4S, v9.s[1] +str q22, [x0, #144] +mul v12.4S, v12.4S,v2.s[1] +ldr q22, [x0, #464] +mla v7.4S, v10.4S, v31.s[0] +sub v10.4s, v1.4s, v30.4s +add v1.4s, v1.4s, v30.4s +sqrdmulh v30.4S, v16.4S, v9.s[2] +str q29, [x0, #160] +mul v16.4S, v16.4S,v2.s[2] +ldr q29, [x0, #432] +mla v12.4S, v26.4S, v31.s[0] +sub v26.4s, v1.4s, v7.4s +add v1.4s, v1.4s, v7.4s +ldr q7, [x17, #+256] +ldr q8, [x17, #+272] +ldr q14, [x17, #+288] +ldr q6, [x17, #+304] +sqrdmulh v4.4S, v23.4S, v9.s[2] +str q15, [x0, #176] +mul v23.4S, v23.4S,v2.s[2] +ldr q15, [x0, #416] +mla v16.4S, v30.4S, v31.s[0] +sub v30.4s, v3.4s, v12.4s +add v3.4s, v3.4s, v12.4s +sqrdmulh v12.4S, v1.4S, v28.s[0] +str q11, [x0, #224] +mul v1.4S, v1.4S,v5.s[0] +ldr q11, [x0, #384] +mla v23.4S, v4.4S, v31.s[0] +sub v4.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sqrdmulh v16.4S, v26.4S, v28.s[1] +str q17, [x0, #240] +mul v26.4S, v26.4S,v5.s[1] +ldr q17, [x0, #400] +mla v1.4S, v12.4S, v31.s[0] +sub v12.4s, v20.4s, v23.4s +add v20.4s, v20.4s, v23.4s +sqrdmulh v23.4S, v4.4S, v28.s[3] +str q19, [x0, #192] +mul 
v4.4S, v4.4S,v5.s[3] +mla v26.4S, v16.4S, v31.s[0] +sub v16.4s, v3.4s, v1.4s +add v3.4s, v3.4s, v1.4s +sqrdmulh v1.4S, v10.4S, v28.s[2] +str q18, [x0, #208] +mul v10.4S, v10.4S,v5.s[2] +mla v4.4S, v23.4S, v31.s[0] +sub v23.4s, v30.4s, v26.4s +add v30.4s, v30.4s, v26.4s +sqrdmulh v26.4S, v24.4S, v8.s[0] +mul v24.4S, v24.4S,v7.s[0] +mla v10.4S, v1.4S, v31.s[0] +sub v1.4s, v12.4s, v4.4s +add v12.4s, v12.4s, v4.4s +sqrdmulh v4.4S, v25.4S, v8.s[0] +mul v25.4S, v25.4S,v7.s[0] +mla v24.4S, v26.4S, v31.s[0] +sub v26.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +sqrdmulh v28.4S, v0.4S, v8.s[0] +mul v0.4S, v0.4S,v7.s[0] +ldr q5, [x0, #624] +mla v25.4S, v4.4S, v31.s[0] +sub v4.4s, v29.4s, v24.4s +add v29.4s, v29.4s, v24.4s +sqrdmulh v24.4S, v22.4S, v8.s[0] +mul v22.4S, v22.4S,v7.s[0] +ldr q9, [x0, #608] +mla v0.4S, v28.4S, v31.s[0] +sub v28.4s, v15.4s, v25.4s +add v15.4s, v15.4s, v25.4s +sqrdmulh v25.4S, v29.4S, v8.s[1] +str q3, [x0, #256] +mul v29.4S, v29.4S,v7.s[1] +ldr q3, [x0, #576] +mla v22.4S, v24.4S, v31.s[0] +sub v24.4s, v11.4s, v0.4s +add v11.4s, v11.4s, v0.4s +sqrdmulh v0.4S, v15.4S, v8.s[1] +str q16, [x0, #272] +mul v15.4S, v15.4S,v7.s[1] +ldr q16, [x0, #592] +mla v29.4S, v25.4S, v31.s[0] +sub v25.4s, v17.4s, v22.4s +add v17.4s, v17.4s, v22.4s +sqrdmulh v22.4S, v4.4S, v8.s[2] +str q30, [x0, #288] +mul v4.4S, v4.4S,v7.s[2] +ldr q30, [x0, #560] +mla v15.4S, v0.4S, v31.s[0] +sub v0.4s, v17.4s, v29.4s +add v17.4s, v17.4s, v29.4s +ldr q29, [x17, #+320] +ldr q2, [x17, #+336] +ldr q10, [x17, #+352] +ldr q18, [x17, #+368] +sqrdmulh v19.4S, v28.4S, v8.s[2] +str q23, [x0, #304] +mul v28.4S, v28.4S,v7.s[2] +ldr q23, [x0, #544] +mla v4.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v17.4S, v6.s[0] +str q12, [x0, #352] +mul v17.4S, v17.4S,v14.s[0] +ldr q12, [x0, #512] +mla v28.4S, v19.4S, v31.s[0] +sub v19.4s, v25.4s, v4.4s +add v25.4s, v25.4s, v4.4s +sqrdmulh v4.4S, v0.4S, v6.s[1] +str q1, [x0, #368] +mul v0.4S, v0.4S,v14.s[1] 
+ldr q1, [x0, #528] +mla v17.4S, v15.4S, v31.s[0] +sub v15.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +sqrdmulh v28.4S, v19.4S, v6.s[3] +str q20, [x0, #320] +mul v19.4S, v19.4S,v14.s[3] +mla v0.4S, v4.4S, v31.s[0] +sub v4.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v25.4S, v6.s[2] +str q26, [x0, #336] +mul v25.4S, v25.4S,v14.s[2] +mla v19.4S, v28.4S, v31.s[0] +sub v28.4s, v22.4s, v0.4s +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v5.4S, v2.s[0] +mul v5.4S, v5.4S,v29.s[0] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v15.4s, v19.4s +add v15.4s, v15.4s, v19.4s +sqrdmulh v19.4S, v9.4S, v2.s[0] +mul v9.4S, v9.4S,v29.s[0] +mla v5.4S, v0.4S, v31.s[0] +sub v0.4s, v24.4s, v25.4s +add v24.4s, v24.4s, v25.4s +sqrdmulh v6.4S, v3.4S, v2.s[0] +mul v3.4S, v3.4S,v29.s[0] +ldr q14, [x0, #752] +mla v9.4S, v19.4S, v31.s[0] +sub v19.4s, v30.4s, v5.4s +add v30.4s, v30.4s, v5.4s +sqrdmulh v5.4S, v16.4S, v2.s[0] +mul v16.4S, v16.4S,v29.s[0] +ldr q8, [x0, #736] +mla v3.4S, v6.4S, v31.s[0] +sub v6.4s, v23.4s, v9.4s +add v23.4s, v23.4s, v9.4s +sqrdmulh v9.4S, v30.4S, v2.s[1] +str q11, [x0, #384] +mul v30.4S, v30.4S,v29.s[1] +ldr q11, [x0, #704] +mla v16.4S, v5.4S, v31.s[0] +sub v5.4s, v12.4s, v3.4s +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v23.4S, v2.s[1] +str q4, [x0, #400] +mul v23.4S, v23.4S,v29.s[1] +ldr q4, [x0, #720] +mla v30.4S, v9.4S, v31.s[0] +sub v9.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +sqrdmulh v16.4S, v19.4S, v2.s[2] +str q22, [x0, #416] +mul v19.4S, v19.4S,v29.s[2] +ldr q22, [x0, #688] +mla v23.4S, v3.4S, v31.s[0] +sub v3.4s, v1.4s, v30.4s +add v1.4s, v1.4s, v30.4s +ldr q30, [x17, #+384] +ldr q7, [x17, #+400] +ldr q25, [x17, #+416] +ldr q26, [x17, #+432] +sqrdmulh v20.4S, v6.4S, v2.s[2] +str q28, [x0, #432] +mul v6.4S, v6.4S,v29.s[2] +ldr q28, [x0, #672] +mla v19.4S, v16.4S, v31.s[0] +sub v16.4s, v12.4s, v23.4s +add v12.4s, v12.4s, v23.4s +sqrdmulh v23.4S, v1.4S, v18.s[0] +str q15, [x0, #480] +mul v1.4S, v1.4S,v10.s[0] +ldr q15, [x0, #640] +mla 
v6.4S, v20.4S, v31.s[0] +sub v20.4s, v9.4s, v19.4s +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v3.4S, v18.s[1] +str q17, [x0, #496] +mul v3.4S, v3.4S,v10.s[1] +ldr q17, [x0, #656] +mla v1.4S, v23.4S, v31.s[0] +sub v23.4s, v5.4s, v6.4s +add v5.4s, v5.4s, v6.4s +sqrdmulh v6.4S, v20.4S, v18.s[3] +str q24, [x0, #448] +mul v20.4S, v20.4S,v10.s[3] +mla v3.4S, v19.4S, v31.s[0] +sub v19.4s, v12.4s, v1.4s +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v9.4S, v18.s[2] +str q0, [x0, #464] +mul v9.4S, v9.4S,v10.s[2] +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v7.s[0] +mul v14.4S, v14.4S,v30.s[0] +mla v9.4S, v1.4S, v31.s[0] +sub v1.4s, v23.4s, v20.4s +add v23.4s, v23.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v7.s[0] +mul v8.4S, v8.4S,v30.s[0] +mla v14.4S, v3.4S, v31.s[0] +sub v3.4s, v5.4s, v9.4s +add v5.4s, v5.4s, v9.4s +sqrdmulh v18.4S, v11.4S, v7.s[0] +mul v11.4S, v11.4S,v30.s[0] +ldr q10, [x0, #880] +mla v8.4S, v20.4S, v31.s[0] +sub v20.4s, v22.4s, v14.4s +add v22.4s, v22.4s, v14.4s +sqrdmulh v14.4S, v4.4S, v7.s[0] +mul v4.4S, v4.4S,v30.s[0] +ldr q2, [x0, #864] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v28.4s, v8.4s +add v28.4s, v28.4s, v8.4s +sqrdmulh v8.4S, v22.4S, v7.s[1] +str q12, [x0, #512] +mul v22.4S, v22.4S,v30.s[1] +ldr q12, [x0, #832] +mla v4.4S, v14.4S, v31.s[0] +sub v14.4s, v15.4s, v11.4s +add v15.4s, v15.4s, v11.4s +sqrdmulh v11.4S, v28.4S, v7.s[1] +str q19, [x0, #528] +mul v28.4S, v28.4S,v30.s[1] +ldr q19, [x0, #848] +mla v22.4S, v8.4S, v31.s[0] +sub v8.4s, v17.4s, v4.4s +add v17.4s, v17.4s, v4.4s +sqrdmulh v4.4S, v20.4S, v7.s[2] +str q16, [x0, #544] +mul v20.4S, v20.4S,v30.s[2] +ldr q16, [x0, #816] +mla v28.4S, v11.4S, v31.s[0] +sub v11.4s, v17.4s, v22.4s +add v17.4s, v17.4s, v22.4s +ldr q22, [x17, #+448] +ldr q29, [x17, #+464] +ldr q9, [x17, #+480] +ldr q0, [x17, #+496] +sqrdmulh v24.4S, v18.4S, v7.s[2] +str q6, [x0, #560] +mul v18.4S, v18.4S,v30.s[2] +ldr q6, [x0, #800] +mla v20.4S, v4.4S, v31.s[0] +sub 
v4.4s, v15.4s, v28.4s +add v15.4s, v15.4s, v28.4s +sqrdmulh v28.4S, v17.4S, v26.s[0] +str q23, [x0, #608] +mul v17.4S, v17.4S,v25.s[0] +ldr q23, [x0, #768] +mla v18.4S, v24.4S, v31.s[0] +sub v24.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +sqrdmulh v20.4S, v11.4S, v26.s[1] +str q1, [x0, #624] +mul v11.4S, v11.4S,v25.s[1] +ldr q1, [x0, #784] +mla v17.4S, v28.4S, v31.s[0] +sub v28.4s, v14.4s, v18.4s +add v14.4s, v14.4s, v18.4s +sqrdmulh v18.4S, v24.4S, v26.s[3] +str q5, [x0, #576] +mul v24.4S, v24.4S,v25.s[3] +mla v11.4S, v20.4S, v31.s[0] +sub v20.4s, v15.4s, v17.4s +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v8.4S, v26.s[2] +str q3, [x0, #592] +mul v8.4S, v8.4S,v25.s[2] +mla v24.4S, v18.4S, v31.s[0] +sub v18.4s, v4.4s, v11.4s +add v4.4s, v4.4s, v11.4s +sqrdmulh v11.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v22.s[0] +mla v8.4S, v17.4S, v31.s[0] +sub v17.4s, v28.4s, v24.4s +add v28.4s, v28.4s, v24.4s +sqrdmulh v24.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v22.s[0] +mla v10.4S, v11.4S, v31.s[0] +sub v11.4s, v14.4s, v8.4s +add v14.4s, v14.4s, v8.4s +sqrdmulh v26.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v22.s[0] +ldr q25, [x0, #1008] +mla v2.4S, v24.4S, v31.s[0] +sub v24.4s, v16.4s, v10.4s +add v16.4s, v16.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v22.s[0] +ldr q7, [x0, #992] +mla v12.4S, v26.4S, v31.s[0] +sub v26.4s, v6.4s, v2.4s +add v6.4s, v6.4s, v2.4s +sqrdmulh v2.4S, v16.4S, v29.s[1] +str q15, [x0, #640] +mul v16.4S, v16.4S,v22.s[1] +ldr q15, [x0, #960] +mla v19.4S, v10.4S, v31.s[0] +sub v10.4s, v23.4s, v12.4s +add v23.4s, v23.4s, v12.4s +sqrdmulh v12.4S, v6.4S, v29.s[1] +str q20, [x0, #656] +mul v6.4S, v6.4S,v22.s[1] +ldr q20, [x0, #976] +mla v16.4S, v2.4S, v31.s[0] +sub v2.4s, v1.4s, v19.4s +add v1.4s, v1.4s, v19.4s +sqrdmulh v19.4S, v24.4S, v29.s[2] +str q4, [x0, #672] +mul v24.4S, v24.4S,v22.s[2] +ldr q4, [x0, #944] +mla v6.4S, v12.4S, v31.s[0] +sub v12.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +ldr q16, [x17, #+512] +ldr q30, [x17, #+528] 
+ldr q8, [x17, #+544] +ldr q3, [x17, #+560] +sqrdmulh v5.4S, v26.4S, v29.s[2] +str q18, [x0, #688] +mul v26.4S, v26.4S,v22.s[2] +ldr q18, [x0, #928] +mla v24.4S, v19.4S, v31.s[0] +sub v19.4s, v23.4s, v6.4s +add v23.4s, v23.4s, v6.4s +sqrdmulh v6.4S, v1.4S, v0.s[0] +str q28, [x0, #736] +mul v1.4S, v1.4S,v9.s[0] +ldr q28, [x0, #896] +mla v26.4S, v5.4S, v31.s[0] +sub v5.4s, v2.4s, v24.4s +add v2.4s, v2.4s, v24.4s +sqrdmulh v24.4S, v12.4S, v0.s[1] +str q17, [x0, #752] +mul v12.4S, v12.4S,v9.s[1] +ldr q17, [x0, #912] +mla v1.4S, v6.4S, v31.s[0] +sub v6.4s, v10.4s, v26.4s +add v10.4s, v10.4s, v26.4s +sqrdmulh v26.4S, v5.4S, v0.s[3] +str q14, [x0, #704] +mul v5.4S, v5.4S,v9.s[3] +mla v12.4S, v24.4S, v31.s[0] +sub v24.4s, v23.4s, v1.4s +add v23.4s, v23.4s, v1.4s +sqrdmulh v1.4S, v2.4S, v0.s[2] +str q11, [x0, #720] +mul v2.4S, v2.4S,v9.s[2] +mla v5.4S, v26.4S, v31.s[0] +sub v26.4s, v19.4s, v12.4s +add v19.4s, v19.4s, v12.4s +sqrdmulh v12.4S, v25.4S, v30.s[0] +mul v25.4S, v25.4S,v16.s[0] +mla v2.4S, v1.4S, v31.s[0] +sub v1.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v7.4S, v30.s[0] +mul v7.4S, v7.4S,v16.s[0] +mla v25.4S, v12.4S, v31.s[0] +sub v12.4s, v10.4s, v2.4s +add v10.4s, v10.4s, v2.4s +sqrdmulh v0.4S, v15.4S, v30.s[0] +mul v15.4S, v15.4S,v16.s[0] +mla v7.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v25.4s +add v4.4s, v4.4s, v25.4s +sqrdmulh v25.4S, v20.4S, v30.s[0] +mul v20.4S, v20.4S,v16.s[0] +mla v15.4S, v0.4S, v31.s[0] +sub v0.4s, v18.4s, v7.4s +add v18.4s, v18.4s, v7.4s +sqrdmulh v7.4S, v4.4S, v30.s[1] +str q23, [x0, #768] +mul v4.4S, v4.4S,v16.s[1] +mla v20.4S, v25.4S, v31.s[0] +sub v25.4s, v28.4s, v15.4s +add v28.4s, v28.4s, v15.4s +sqrdmulh v15.4S, v18.4S, v30.s[1] +str q24, [x0, #784] +mul v18.4S, v18.4S,v16.s[1] +mla v4.4S, v7.4S, v31.s[0] +sub v7.4s, v17.4s, v20.4s +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v5.4S, v30.s[2] +str q19, [x0, #800] +mul v5.4S, v5.4S,v16.s[2] +mla v18.4S, v15.4S, v31.s[0] +sub v15.4s, v17.4s, v4.4s +add v17.4s, v17.4s, 
v4.4s +sqrdmulh v4.4S, v0.4S, v30.s[2] +str q26, [x0, #816] +mul v0.4S, v0.4S,v16.s[2] +mla v5.4S, v20.4S, v31.s[0] +sub v20.4s, v28.4s, v18.4s +add v28.4s, v28.4s, v18.4s +sqrdmulh v18.4S, v17.4S, v3.s[0] +str q6, [x0, #864] +mul v17.4S, v17.4S,v8.s[0] +mla v0.4S, v4.4S, v31.s[0] +sub v4.4s, v7.4s, v5.4s +add v7.4s, v7.4s, v5.4s +sqrdmulh v5.4S, v15.4S, v3.s[1] +str q1, [x0, #880] +mul v15.4S, v15.4S,v8.s[1] +mla v17.4S, v18.4S, v31.s[0] +sub v18.4s, v25.4s, v0.4s +add v25.4s, v25.4s, v0.4s +sqrdmulh v0.4S, v4.4S, v3.s[3] +str q10, [x0, #832] +mul v4.4S, v4.4S,v8.s[3] +mla v15.4S, v5.4S, v31.s[0] +sub v5.4s, v28.4s, v17.4s +add v28.4s, v28.4s, v17.4s +sqrdmulh v17.4S, v7.4S, v3.s[2] +str q12, [x0, #848] +mul v7.4S, v7.4S,v8.s[2] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v20.4s, v15.4s +add v20.4s, v20.4s, v15.4s +mla v7.4S, v17.4S, v31.s[0] +sub v17.4s, v18.4s, v4.4s +add v18.4s, v18.4s, v4.4s +sub v4.4s, v25.4s, v7.4s +add v25.4s, v25.4s, v7.4s +str q28, [x0, #896] +str q5, [x0, #912] +str q20, [x0, #928] +str q0, [x0, #944] +str q18, [x0, #992] +str q17, [x0, #1008] +str q25, [x0, #960] +str q4, [x0, #976] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1444 +// Instruction count: 1440 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_4.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_4.s new file mode 100644 index 0000000..ca19281 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_4.s @@ -0,0 +1,1474 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of 
charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+///
+
+#include <hal_env.h> // NOTE(review): this copy had a bare #include with no operand, which the preprocessor rejects; hal_env.h is assumed to be the header that defines ASM_LOAD - confirm against the generator output
+modulus: // 16 bytes loaded into q31 by the routine below: lane 0 = -33556993, lanes 1-3 = 0
+.word -33556993 // negated modulus; it is the v31.s[0] operand of every mla in the code visible here
+.word 0
+.word 0
+.word 0
+.align 6 // NOTE(review): on AArch64 assemblers the operand is a power of two, i.e. 64-byte alignment - confirm for the toolchain in use
+roots_merged: // Twiddle table read through x17 as 16-byte quadwords. Quadwords alternate: first four lanes consumed by mul, then the four same-labelled lanes consumed by sqrdmulh. Words tagged Layer None are zero padding.
+.word 17702291 // Layer 0, block 0
+.word 3260327 // Layer 1, block 0
+.word 14579576 // Layer 1, block 1
+.word 0 // Layer None, block None
+.word 1132860160 // Layer 0, block 0
+.word 208645003 // Layer 1, block 0
+.word 933021652 // Layer 1, block 1
+.word 0 // Layer None, block None
+.word 6733847 // Layer 2, block 0
+.word 12909577 // Layer 2, block 1
+.word 14745691 // Layer 2, block 2
+.word 13512548 // Layer 2, block 3
+.word 430933318 // Layer 2, block 0
+.word 826149873 // Layer 2, block 1
+.word 943652201 // Layer 2, block 2
+.word 864737072 // Layer 2, block 3
+.word 20428075 // Layer 3, block 0
+.word 23825509 // Layer 4, block 0
+.word 27028662 // Layer 4, block 1
+.word 0 // Layer None, block None
+.word 1307297022 // Layer 3, block 0
+.word 1524716204 // Layer 4, block 0
+.word 1729702351 // Layer 4, block 1
+.word 0 // Layer None, block None
+.word 9010590 // Layer 5, block 0
+.word 20699126 // Layer 5, block 1
+.word 341080 // Layer 5, block 2
+.word 21220783 // Layer 5, block 3
+.word 576633749 // Layer 5, block 0
+.word 1324642962 // Layer 5, block 1
+.word 21827454 // Layer 5, block 2
+.word 1358026462 // Layer 5, block 3
+.word 14626653 // Layer 3, block 1
+.word 14833295 // Layer 4, block 2
+.word 2138810 // Layer 4, block 3
+.word 0 // Layer None, block None
+.word 936034350 // Layer 3, block 1
+.word 949258429 // Layer 4, block 2
+.word 136873393 // Layer 4, block 3
+.word 0 // Layer None, block None
+.word 25331745 // Layer 5, block 4
+.word 5289426 // Layer 5, block 5
+.word 5705868 // Layer 5, block 6
+.word 17686665 // Layer 5, block 7
+.word 1621107951 // Layer 5, block 4
+.word 338497429 // Layer 5, block 5
+.word 365147683 // Layer 5, block 6
+.word 1131860172 // Layer 5, block 7
+.word 29737761 // Layer 3, block 2
+.word 6490403 // Layer 4, block 4
+.word 19648405 // Layer 4, block 5
+.word 0 // Layer None, block None
+.word 1903071454 // Layer 3, block 2
+.word 415354091 // Layer 4, block 4
+.word 1257401950 // Layer 4, block 5
+.word 0 // Layer None, block None
+.word 9106105 // Layer 5, block 8
+.word 18817700 // Layer 5, block 9
+.word 1579445 // Layer 5, block 10
+.word 7769916 // Layer 5, block 11
+.word 582746243 // Layer 5, block 8
+.word 1204240888 // Layer 5, block 9
+.word 101076765 // Layer 5, block 10
+.word 497236673 // Layer 5, block 11
+.word 30285189 // Layer 3, block 3
+.word 31254932 // Layer 4, block 6
+.word 26362414 // Layer 4, block 7
+.word 0 // Layer None, block None
+.word 1938104173 // Layer 3, block 3
+.word 2000162988 // Layer 4, block 6
+.word 1687065733 // Layer 4, block 7
+.word 0 // Layer None, block None
+.word 21843119 // Layer 5, block 12
+.word 11828796 // Layer 5, block 13
+.word 19828530 // Layer 5, block 14
+.word 33201112 // Layer 5, block 15
+.word 1397852927 // Layer 5, block 12
+.word 756985168 // Layer 5, block 13
+.word 1268929071 // Layer 5, block 14
+.word 2124709002 // Layer 5, block 15
+.word 21289485 // Layer 3, block 4
+.word 572895 // Layer 4, block 8
+.word 26691971 // Layer 4, block 9
+.word 0 // Layer None, block None
+.word 1362423055 // Layer 3, block 4
+.word 36662482 // Layer 4, block 8
+.word 1708155771 // Layer 4, block 9
+.word 0 // Layer None, block None
+.word 23713020 // Layer 5, block 16
+.word 19537976 // Layer 5, block 17
+.word 8285889 // Layer 5, block 18
+.word 24690028 // Layer 5, block 19
+.word 1517517457 // Layer 5, block 16
+.word 1250335034 // Layer 5, block 17
+.word 530256425 // Layer 5, block 18
+.word 1580041197 // Layer 5, block 19
+.word 9914896 // Layer 3, block 5
+.word 9249292 // Layer 4, block 10
+.word 29292862 // Layer 4, block 11
+.word 0 // Layer None, block None
+.word 634504916 // Layer 3, block 5
+.word 591909511 // Layer 4, block 10
+.word 1874600091 // Layer 4, block 11
+.word 0 // Layer None, block None
+.word 4778209 // Layer 5, block 20
+.word 13113327 // Layer 5, block 21
+.word 25384023 // Layer 5, block 22
+.word 10905370 // Layer 5, block 23
+.word 305782038 // Layer 5, block 20
+.word 839188878 // Layer 5, block 21
+.word 1624453488 // Layer 5, block 22
+.word 697890414 // Layer 5, block 23
+.word 22603682 // Layer 3, block 6
+.word 8247799 // Layer 4, block 12
+.word 5086187 // Layer 4, block 13
+.word 0 // Layer None, block None
+.word 1446525244 // Layer 3, block 6
+.word 527818851 // Layer 4, block 12
+.word 325491125 // Layer 4, block 13
+.word 0 // Layer None, block None
+.word 16167867 // Layer 5, block 24
+.word 22046437 // Layer 5, block 25
+.word 656361 // Layer 5, block 26
+.word 18153794 // Layer 5, block 27
+.word 1034664519 // Layer 5, block 24
+.word 1410864286 // Layer 5, block 25
+.word 42003898 // Layer 5, block 26
+.word 1161754147 // Layer 5, block 27
+.word 16204162 // Layer 3, block 7
+.word 28113639 // Layer 4, block 14
+.word 8471290 // Layer 4, block 15
+.word 0 // Layer None, block None
+.word 1036987221 // Layer 3, block 7
+.word 1799135579 // Layer 4, block 14
+.word 542121183 // Layer 4, block 15
+.word 0 // Layer None, block None
+.word 3732072 // Layer 5, block 28
+.word 22126384 // Layer 5, block 29
+.word 9445744 // Layer 5, block 30
+.word 794839 // Layer 5, block 31
+.word 238834379 // Layer 5, block 28
+.word 1415980503 // Layer 5, block 29
+.word 604481480 // Layer 5, block 30
+.word 50865814 // Layer 5, block 31
+.text
+.global ntt_u32_incomplete_neon_asm_var_3_3_4
+.global _ntt_u32_incomplete_neon_asm_var_3_3_4
+ntt_u32_incomplete_neon_asm_var_3_3_4:
+_ntt_u32_incomplete_neon_asm_var_3_3_4:
+// Save GPRs
+sub sp, sp, #(16*5+16)
+stp x19, x20, [sp, #16*0]
+stp x19, x20, [sp, #16*0]
+stp x21, x22, [sp, #16*1]
+stp x23, x24, [sp, #16*2]
+stp x25, x26, [sp, #16*3]
+stp x27, x28, [sp, #16*4]
+str x29, [sp, #16*5]
+// Save NEON vector registers
+sub sp, sp, #(16*4)
+stp d8, d9, [sp, #16*0]
+stp d10, d11, [sp, #16*1]
+stp d12, d13, [sp, #16*2]
+stp d14, d15, [sp, #16*3]
+ASM_LOAD (x17, modulus)
+ldr q31, [x17]
+ASM_LOAD(x17, roots_merged) +ldr q30, [x0, #960] +ldr q29, [x0, #832] +ldr q28, [x0, #576] +ldr q27, [x0, #704] +ldr q26, [x0, #448] +ldr q25, [x17, #+0] +ldr q24, [x17, #+16] +ldr q23, [x17, #+32] +ldr q22, [x17, #+48] +ldr q21, [x0, #320] +ldr q20, [x0, #64] +ldr q19, [x0, #192] +sqrdmulh v18.4S, v30.4S, v24.s[0] +mul v30.4S, v30.4S,v25.s[0] +sqrdmulh v17.4S, v29.4S, v24.s[0] +mla v30.4S, v18.4S, v31.s[0] +mul v29.4S, v29.4S,v25.s[0] +sqrdmulh v18.4S, v28.4S, v24.s[0] +mla v29.4S, v17.4S, v31.s[0] +ldr q17, [x0, #976] +mul v28.4S, v28.4S,v25.s[0] +sub v16.4s, v26.4s, v30.4s +add v26.4s, v26.4s, v30.4s +sqrdmulh v30.4S, v27.4S, v24.s[0] +mla v28.4S, v18.4S, v31.s[0] +ldr q18, [x0, #848] +mul v27.4S, v27.4S,v25.s[0] +sub v3.4s, v21.4s, v29.4s +add v21.4s, v21.4s, v29.4s +sqrdmulh v29.4S, v26.4S, v24.s[1] +mla v27.4S, v30.4S, v31.s[0] +ldr q30, [x0, #592] +mul v26.4S, v26.4S,v25.s[1] +sub v2.4s, v20.4s, v28.4s +add v20.4s, v20.4s, v28.4s +sqrdmulh v28.4S, v21.4S, v24.s[1] +mla v26.4S, v29.4S, v31.s[0] +ldr q29, [x0, #720] +mul v21.4S, v21.4S,v25.s[1] +sub v1.4s, v19.4s, v27.4s +add v19.4s, v19.4s, v27.4s +sqrdmulh v27.4S, v16.4S, v24.s[2] +mla v21.4S, v28.4S, v31.s[0] +ldr q28, [x0, #464] +mul v16.4S, v16.4S,v25.s[2] +sub v0.4s, v19.4s, v26.4s +add v19.4s, v19.4s, v26.4s +sqrdmulh v26.4S, v3.4S, v24.s[2] +mla v16.4S, v27.4S, v31.s[0] +ldr q27, [x0, #336] +mul v3.4S, v3.4S,v25.s[2] +sub v15.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v22.s[0] +mla v3.4S, v26.4S, v31.s[0] +ldr q26, [x0, #80] +mul v19.4S, v19.4S,v23.s[0] +sub v14.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +sqrdmulh v16.4S, v0.4S, v22.s[1] +mla v19.4S, v21.4S, v31.s[0] +ldr q21, [x0, #208] +mul v0.4S, v0.4S,v23.s[1] +sub v13.4s, v2.4s, v3.4s +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v22.s[3] +mla v0.4S, v16.4S, v31.s[0] +mul v14.4S, v14.4S,v23.s[3] +sub v16.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v22.s[2] +mla v14.4S, v3.4S, v31.s[0] 
+mul v1.4S, v1.4S,v23.s[2] +sub v3.4s, v15.4s, v0.4s +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v24.s[0] +mla v1.4S, v19.4S, v31.s[0] +mul v17.4S, v17.4S,v25.s[0] +sub v19.4s, v13.4s, v14.4s +add v13.4s, v13.4s, v14.4s +sqrdmulh v14.4S, v18.4S, v24.s[0] +mla v17.4S, v0.4S, v31.s[0] +mul v18.4S, v18.4S,v25.s[0] +sub v0.4s, v2.4s, v1.4s +add v2.4s, v2.4s, v1.4s +sqrdmulh v1.4S, v30.4S, v24.s[0] +mla v18.4S, v14.4S, v31.s[0] +ldr q14, [x0, #992] +mul v30.4S, v30.4S,v25.s[0] +sub v12.4s, v28.4s, v17.4s +add v28.4s, v28.4s, v17.4s +sqrdmulh v17.4S, v29.4S, v24.s[0] +mla v30.4S, v1.4S, v31.s[0] +ldr q1, [x0, #864] +mul v29.4S, v29.4S,v25.s[0] +sub v11.4s, v27.4s, v18.4s +add v27.4s, v27.4s, v18.4s +sqrdmulh v18.4S, v28.4S, v24.s[1] +str q20, [x0, #64] +mla v29.4S, v17.4S, v31.s[0] +ldr q17, [x0, #608] +mul v28.4S, v28.4S,v25.s[1] +sub v20.4s, v26.4s, v30.4s +add v26.4s, v26.4s, v30.4s +sqrdmulh v30.4S, v27.4S, v24.s[1] +str q16, [x0, #192] +mla v28.4S, v18.4S, v31.s[0] +ldr q18, [x0, #736] +mul v27.4S, v27.4S,v25.s[1] +sub v16.4s, v21.4s, v29.4s +add v21.4s, v21.4s, v29.4s +sqrdmulh v29.4S, v12.4S, v24.s[2] +str q15, [x0, #320] +mla v27.4S, v30.4S, v31.s[0] +ldr q30, [x0, #480] +mul v12.4S, v12.4S,v25.s[2] +sub v15.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +sqrdmulh v28.4S, v11.4S, v24.s[2] +str q3, [x0, #448] +mla v12.4S, v29.4S, v31.s[0] +ldr q29, [x0, #352] +mul v11.4S, v11.4S,v25.s[2] +sub v3.4s, v26.4s, v27.4s +add v26.4s, v26.4s, v27.4s +sqrdmulh v27.4S, v21.4S, v22.s[0] +str q13, [x0, #832] +mla v11.4S, v28.4S, v31.s[0] +ldr q28, [x0, #96] +mul v21.4S, v21.4S,v23.s[0] +sub v13.4s, v16.4s, v12.4s +add v16.4s, v16.4s, v12.4s +sqrdmulh v12.4S, v15.4S, v22.s[1] +str q19, [x0, #960] +mla v21.4S, v27.4S, v31.s[0] +ldr q27, [x0, #224] +mul v15.4S, v15.4S,v23.s[1] +sub v19.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v22.s[3] +str q2, [x0, #576] +mla v15.4S, v12.4S, v31.s[0] +mul v13.4S, v13.4S,v23.s[3] +sub v12.4s, v26.4s, v21.4s 
+add v26.4s, v26.4s, v21.4s +sqrdmulh v21.4S, v16.4S, v22.s[2] +str q0, [x0, #704] +mla v13.4S, v11.4S, v31.s[0] +mul v16.4S, v16.4S,v23.s[2] +sub v11.4s, v3.4s, v15.4s +add v3.4s, v3.4s, v15.4s +sqrdmulh v15.4S, v14.4S, v24.s[0] +mla v16.4S, v21.4S, v31.s[0] +mul v14.4S, v14.4S,v25.s[0] +sub v21.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +sqrdmulh v13.4S, v1.4S, v24.s[0] +mla v14.4S, v15.4S, v31.s[0] +mul v1.4S, v1.4S,v25.s[0] +sub v15.4s, v20.4s, v16.4s +add v20.4s, v20.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v24.s[0] +mla v1.4S, v13.4S, v31.s[0] +ldr q13, [x0, #1008] +mul v17.4S, v17.4S,v25.s[0] +sub v0.4s, v30.4s, v14.4s +add v30.4s, v30.4s, v14.4s +sqrdmulh v14.4S, v18.4S, v24.s[0] +mla v17.4S, v16.4S, v31.s[0] +ldr q16, [x0, #880] +mul v18.4S, v18.4S,v25.s[0] +sub v2.4s, v29.4s, v1.4s +add v29.4s, v29.4s, v1.4s +sqrdmulh v1.4S, v30.4S, v24.s[1] +str q26, [x0, #80] +mla v18.4S, v14.4S, v31.s[0] +ldr q14, [x0, #624] +mul v30.4S, v30.4S,v25.s[1] +sub v26.4s, v28.4s, v17.4s +add v28.4s, v28.4s, v17.4s +sqrdmulh v17.4S, v29.4S, v24.s[1] +str q12, [x0, #208] +mla v30.4S, v1.4S, v31.s[0] +ldr q1, [x0, #752] +mul v29.4S, v29.4S,v25.s[1] +sub v12.4s, v27.4s, v18.4s +add v27.4s, v27.4s, v18.4s +sqrdmulh v18.4S, v0.4S, v24.s[2] +str q3, [x0, #336] +mla v29.4S, v17.4S, v31.s[0] +ldr q17, [x0, #496] +mul v0.4S, v0.4S,v25.s[2] +sub v3.4s, v27.4s, v30.4s +add v27.4s, v27.4s, v30.4s +sqrdmulh v30.4S, v2.4S, v24.s[2] +str q11, [x0, #464] +mla v0.4S, v18.4S, v31.s[0] +ldr q18, [x0, #368] +mul v2.4S, v2.4S,v25.s[2] +sub v11.4s, v28.4s, v29.4s +add v28.4s, v28.4s, v29.4s +sqrdmulh v29.4S, v27.4S, v22.s[0] +str q19, [x0, #848] +mla v2.4S, v30.4S, v31.s[0] +ldr q30, [x0, #112] +mul v27.4S, v27.4S,v23.s[0] +sub v19.4s, v12.4s, v0.4s +add v12.4s, v12.4s, v0.4s +sqrdmulh v0.4S, v3.4S, v22.s[1] +str q21, [x0, #976] +mla v27.4S, v29.4S, v31.s[0] +ldr q29, [x0, #240] +mul v3.4S, v3.4S,v23.s[1] +sub v21.4s, v26.4s, v2.4s +add v26.4s, v26.4s, v2.4s +sqrdmulh v2.4S, v19.4S, v22.s[3] +str 
q20, [x0, #592] +mla v3.4S, v0.4S, v31.s[0] +mul v19.4S, v19.4S,v23.s[3] +sub v0.4s, v28.4s, v27.4s +add v28.4s, v28.4s, v27.4s +sqrdmulh v27.4S, v12.4S, v22.s[2] +str q15, [x0, #720] +mla v19.4S, v2.4S, v31.s[0] +mul v12.4S, v12.4S,v23.s[2] +sub v2.4s, v11.4s, v3.4s +add v11.4s, v11.4s, v3.4s +sqrdmulh v3.4S, v13.4S, v24.s[0] +mla v12.4S, v27.4S, v31.s[0] +mul v13.4S, v13.4S,v25.s[0] +sub v27.4s, v21.4s, v19.4s +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v24.s[0] +mla v13.4S, v3.4S, v31.s[0] +mul v16.4S, v16.4S,v25.s[0] +sub v3.4s, v26.4s, v12.4s +add v26.4s, v26.4s, v12.4s +sqrdmulh v12.4S, v14.4S, v24.s[0] +mla v16.4S, v19.4S, v31.s[0] +ldr q19, [x0, #896] +mul v14.4S, v14.4S,v25.s[0] +sub v15.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +sqrdmulh v13.4S, v1.4S, v24.s[0] +mla v14.4S, v12.4S, v31.s[0] +ldr q12, [x0, #768] +mul v1.4S, v1.4S,v25.s[0] +sub v20.4s, v18.4s, v16.4s +add v18.4s, v18.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v24.s[1] +str q28, [x0, #96] +mla v1.4S, v13.4S, v31.s[0] +ldr q13, [x0, #512] +mul v17.4S, v17.4S,v25.s[1] +sub v28.4s, v30.4s, v14.4s +add v30.4s, v30.4s, v14.4s +sqrdmulh v14.4S, v18.4S, v24.s[1] +str q0, [x0, #224] +mla v17.4S, v16.4S, v31.s[0] +ldr q16, [x0, #640] +mul v18.4S, v18.4S,v25.s[1] +sub v0.4s, v29.4s, v1.4s +add v29.4s, v29.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v24.s[2] +str q11, [x0, #352] +mla v18.4S, v14.4S, v31.s[0] +ldr q14, [x0, #384] +mul v15.4S, v15.4S,v25.s[2] +sub v11.4s, v29.4s, v17.4s +add v29.4s, v29.4s, v17.4s +sqrdmulh v17.4S, v20.4S, v24.s[2] +str q2, [x0, #480] +mla v15.4S, v1.4S, v31.s[0] +ldr q1, [x0, #256] +mul v20.4S, v20.4S,v25.s[2] +sub v2.4s, v30.4s, v18.4s +add v30.4s, v30.4s, v18.4s +sqrdmulh v18.4S, v29.4S, v22.s[0] +str q21, [x0, #864] +mla v20.4S, v17.4S, v31.s[0] +ldr q17, [x0, #0] +mul v29.4S, v29.4S,v23.s[0] +sub v21.4s, v0.4s, v15.4s +add v0.4s, v0.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v22.s[1] +str q27, [x0, #992] +mla v29.4S, v18.4S, v31.s[0] +ldr q18, [x0, #128] +mul v11.4S, 
v11.4S,v23.s[1] +sub v27.4s, v28.4s, v20.4s +add v28.4s, v28.4s, v20.4s +sqrdmulh v20.4S, v21.4S, v22.s[3] +str q26, [x0, #608] +mla v11.4S, v15.4S, v31.s[0] +mul v21.4S, v21.4S,v23.s[3] +sub v15.4s, v30.4s, v29.4s +add v30.4s, v30.4s, v29.4s +sqrdmulh v29.4S, v0.4S, v22.s[2] +str q3, [x0, #736] +mla v21.4S, v20.4S, v31.s[0] +mul v0.4S, v0.4S,v23.s[2] +sub v20.4s, v2.4s, v11.4s +add v2.4s, v2.4s, v11.4s +sqrdmulh v11.4S, v19.4S, v24.s[0] +mla v0.4S, v29.4S, v31.s[0] +mul v19.4S, v19.4S,v25.s[0] +sub v29.4s, v27.4s, v21.4s +add v27.4s, v27.4s, v21.4s +sqrdmulh v21.4S, v12.4S, v24.s[0] +mla v19.4S, v11.4S, v31.s[0] +mul v12.4S, v12.4S,v25.s[0] +sub v11.4s, v28.4s, v0.4s +add v28.4s, v28.4s, v0.4s +sqrdmulh v0.4S, v13.4S, v24.s[0] +mla v12.4S, v21.4S, v31.s[0] +ldr q21, [x0, #912] +mul v13.4S, v13.4S,v25.s[0] +sub v3.4s, v14.4s, v19.4s +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v24.s[0] +mla v13.4S, v0.4S, v31.s[0] +ldr q0, [x0, #784] +mul v16.4S, v16.4S,v25.s[0] +sub v26.4s, v1.4s, v12.4s +add v1.4s, v1.4s, v12.4s +sqrdmulh v12.4S, v14.4S, v24.s[1] +str q30, [x0, #112] +mla v16.4S, v19.4S, v31.s[0] +ldr q19, [x0, #528] +mul v14.4S, v14.4S,v25.s[1] +sub v30.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +sqrdmulh v13.4S, v1.4S, v24.s[1] +str q15, [x0, #240] +mla v14.4S, v12.4S, v31.s[0] +ldr q12, [x0, #656] +mul v1.4S, v1.4S,v25.s[1] +sub v15.4s, v18.4s, v16.4s +add v18.4s, v18.4s, v16.4s +sqrdmulh v16.4S, v3.4S, v24.s[2] +str q2, [x0, #368] +mla v1.4S, v13.4S, v31.s[0] +ldr q13, [x0, #400] +mul v3.4S, v3.4S,v25.s[2] +sub v2.4s, v18.4s, v14.4s +add v18.4s, v18.4s, v14.4s +sqrdmulh v14.4S, v26.4S, v24.s[2] +str q20, [x0, #496] +mla v3.4S, v16.4S, v31.s[0] +ldr q16, [x0, #272] +mul v26.4S, v26.4S,v25.s[2] +sub v20.4s, v17.4s, v1.4s +add v17.4s, v17.4s, v1.4s +sqrdmulh v1.4S, v18.4S, v22.s[0] +str q27, [x0, #880] +mla v26.4S, v14.4S, v31.s[0] +ldr q14, [x0, #16] +mul v18.4S, v18.4S,v23.s[0] +sub v27.4s, v15.4s, v3.4s +add v15.4s, v15.4s, v3.4s +sqrdmulh v3.4S, 
v2.4S, v22.s[1] +str q29, [x0, #1008] +mla v18.4S, v1.4S, v31.s[0] +ldr q1, [x0, #144] +mul v2.4S, v2.4S,v23.s[1] +sub v29.4s, v30.4s, v26.4s +add v30.4s, v30.4s, v26.4s +sqrdmulh v26.4S, v27.4S, v22.s[3] +str q28, [x0, #624] +mla v2.4S, v3.4S, v31.s[0] +mul v27.4S, v27.4S,v23.s[3] +sub v3.4s, v17.4s, v18.4s +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v15.4S, v22.s[2] +str q11, [x0, #752] +mla v27.4S, v26.4S, v31.s[0] +mul v15.4S, v15.4S,v23.s[2] +sub v26.4s, v20.4s, v2.4s +add v20.4s, v20.4s, v2.4s +sqrdmulh v2.4S, v21.4S, v24.s[0] +mla v15.4S, v18.4S, v31.s[0] +mul v21.4S, v21.4S,v25.s[0] +sub v18.4s, v29.4s, v27.4s +add v29.4s, v29.4s, v27.4s +sqrdmulh v27.4S, v0.4S, v24.s[0] +mla v21.4S, v2.4S, v31.s[0] +mul v0.4S, v0.4S,v25.s[0] +sub v2.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +sqrdmulh v15.4S, v19.4S, v24.s[0] +mla v0.4S, v27.4S, v31.s[0] +ldr q27, [x0, #928] +mul v19.4S, v19.4S,v25.s[0] +sub v11.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v12.4S, v24.s[0] +mla v19.4S, v15.4S, v31.s[0] +ldr q15, [x0, #800] +mul v12.4S, v12.4S,v25.s[0] +sub v28.4s, v16.4s, v0.4s +add v16.4s, v16.4s, v0.4s +sqrdmulh v0.4S, v13.4S, v24.s[1] +str q17, [x0, #0] +mla v12.4S, v21.4S, v31.s[0] +ldr q21, [x0, #544] +mul v13.4S, v13.4S,v25.s[1] +sub v17.4s, v14.4s, v19.4s +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v24.s[1] +str q3, [x0, #128] +mla v13.4S, v0.4S, v31.s[0] +ldr q0, [x0, #672] +mul v16.4S, v16.4S,v25.s[1] +sub v3.4s, v1.4s, v12.4s +add v1.4s, v1.4s, v12.4s +sqrdmulh v12.4S, v11.4S, v24.s[2] +str q20, [x0, #256] +mla v16.4S, v19.4S, v31.s[0] +ldr q19, [x0, #416] +mul v11.4S, v11.4S,v25.s[2] +sub v20.4s, v1.4s, v13.4s +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v28.4S, v24.s[2] +str q26, [x0, #384] +mla v11.4S, v12.4S, v31.s[0] +ldr q12, [x0, #288] +mul v28.4S, v28.4S,v25.s[2] +sub v26.4s, v14.4s, v16.4s +add v14.4s, v14.4s, v16.4s +sqrdmulh v16.4S, v1.4S, v22.s[0] +str q29, [x0, #768] +mla v28.4S, v13.4S, v31.s[0] +ldr q13, [x0, #32] 
+mul v1.4S, v1.4S,v23.s[0] +sub v29.4s, v3.4s, v11.4s +add v3.4s, v3.4s, v11.4s +sqrdmulh v11.4S, v20.4S, v22.s[1] +str q18, [x0, #896] +mla v1.4S, v16.4S, v31.s[0] +ldr q16, [x0, #160] +mul v20.4S, v20.4S,v23.s[1] +sub v18.4s, v17.4s, v28.4s +add v17.4s, v17.4s, v28.4s +sqrdmulh v28.4S, v29.4S, v22.s[3] +str q30, [x0, #512] +mla v20.4S, v11.4S, v31.s[0] +mul v29.4S, v29.4S,v23.s[3] +sub v11.4s, v14.4s, v1.4s +add v14.4s, v14.4s, v1.4s +sqrdmulh v1.4S, v3.4S, v22.s[2] +str q2, [x0, #640] +mla v29.4S, v28.4S, v31.s[0] +mul v3.4S, v3.4S,v23.s[2] +sub v28.4s, v26.4s, v20.4s +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v27.4S, v24.s[0] +mla v3.4S, v1.4S, v31.s[0] +mul v27.4S, v27.4S,v25.s[0] +sub v1.4s, v18.4s, v29.4s +add v18.4s, v18.4s, v29.4s +sqrdmulh v29.4S, v15.4S, v24.s[0] +mla v27.4S, v20.4S, v31.s[0] +mul v15.4S, v15.4S,v25.s[0] +sub v20.4s, v17.4s, v3.4s +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v24.s[0] +mla v15.4S, v29.4S, v31.s[0] +ldr q29, [x0, #944] +mul v21.4S, v21.4S,v25.s[0] +sub v2.4s, v19.4s, v27.4s +add v19.4s, v19.4s, v27.4s +sqrdmulh v27.4S, v0.4S, v24.s[0] +mla v21.4S, v3.4S, v31.s[0] +ldr q3, [x0, #816] +mul v0.4S, v0.4S,v25.s[0] +sub v30.4s, v12.4s, v15.4s +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v19.4S, v24.s[1] +str q14, [x0, #16] +mla v0.4S, v27.4S, v31.s[0] +ldr q27, [x0, #560] +mul v19.4S, v19.4S,v25.s[1] +sub v14.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v12.4S, v24.s[1] +str q11, [x0, #144] +mla v19.4S, v15.4S, v31.s[0] +ldr q15, [x0, #688] +mul v12.4S, v12.4S,v25.s[1] +sub v11.4s, v16.4s, v0.4s +add v16.4s, v16.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v24.s[2] +str q26, [x0, #272] +mla v12.4S, v21.4S, v31.s[0] +ldr q21, [x0, #432] +mul v2.4S, v2.4S,v25.s[2] +sub v26.4s, v16.4s, v19.4s +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v30.4S, v24.s[2] +str q28, [x0, #400] +mla v2.4S, v0.4S, v31.s[0] +ldr q0, [x0, #304] +mul v30.4S, v30.4S,v25.s[2] +sub v28.4s, v13.4s, v12.4s +add v13.4s, v13.4s, v12.4s 
+sqrdmulh v12.4S, v16.4S, v22.s[0] +str q18, [x0, #784] +mla v30.4S, v19.4S, v31.s[0] +ldr q19, [x0, #48] +mul v16.4S, v16.4S,v23.s[0] +sub v18.4s, v11.4s, v2.4s +add v11.4s, v11.4s, v2.4s +sqrdmulh v2.4S, v26.4S, v22.s[1] +str q1, [x0, #912] +mla v16.4S, v12.4S, v31.s[0] +ldr q12, [x0, #176] +mul v26.4S, v26.4S,v23.s[1] +sub v1.4s, v14.4s, v30.4s +add v14.4s, v14.4s, v30.4s +sqrdmulh v30.4S, v18.4S, v22.s[3] +str q17, [x0, #528] +mla v26.4S, v2.4S, v31.s[0] +mul v18.4S, v18.4S,v23.s[3] +sub v2.4s, v13.4s, v16.4s +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v11.4S, v22.s[2] +str q20, [x0, #656] +mla v18.4S, v30.4S, v31.s[0] +mul v11.4S, v11.4S,v23.s[2] +sub v30.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sqrdmulh v26.4S, v29.4S, v24.s[0] +mla v11.4S, v16.4S, v31.s[0] +mul v29.4S, v29.4S,v25.s[0] +sub v16.4s, v1.4s, v18.4s +add v1.4s, v1.4s, v18.4s +sqrdmulh v18.4S, v3.4S, v24.s[0] +mla v29.4S, v26.4S, v31.s[0] +mul v3.4S, v3.4S,v25.s[0] +sub v26.4s, v14.4s, v11.4s +add v14.4s, v14.4s, v11.4s +sqrdmulh v11.4S, v27.4S, v24.s[0] +mla v3.4S, v18.4S, v31.s[0] +mul v27.4S, v27.4S,v25.s[0] +sub v18.4s, v21.4s, v29.4s +add v21.4s, v21.4s, v29.4s +sqrdmulh v29.4S, v15.4S, v24.s[0] +mla v27.4S, v11.4S, v31.s[0] +mul v15.4S, v15.4S,v25.s[0] +sub v11.4s, v0.4s, v3.4s +add v0.4s, v0.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v24.s[1] +str q13, [x0, #32] +mla v15.4S, v29.4S, v31.s[0] +mul v21.4S, v21.4S,v25.s[1] +sub v29.4s, v19.4s, v27.4s +add v19.4s, v19.4s, v27.4s +sqrdmulh v27.4S, v0.4S, v24.s[1] +str q2, [x0, #160] +mla v21.4S, v3.4S, v31.s[0] +mul v0.4S, v0.4S,v25.s[1] +sub v3.4s, v12.4s, v15.4s +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v18.4S, v24.s[2] +str q28, [x0, #288] +mla v0.4S, v27.4S, v31.s[0] +mul v18.4S, v18.4S,v25.s[2] +sub v27.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +sqrdmulh v21.4S, v11.4S, v24.s[2] +str q30, [x0, #416] +mla v18.4S, v15.4S, v31.s[0] +mul v11.4S, v11.4S,v25.s[2] +sub v15.4s, v19.4s, v0.4s +add v19.4s, v19.4s, v0.4s +sqrdmulh v0.4S, 
v12.4S, v22.s[0] +str q1, [x0, #800] +mla v11.4S, v21.4S, v31.s[0] +mul v12.4S, v12.4S,v23.s[0] +sub v21.4s, v3.4s, v18.4s +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v27.4S, v22.s[1] +str q16, [x0, #928] +mla v12.4S, v0.4S, v31.s[0] +mul v27.4S, v27.4S,v23.s[1] +sub v0.4s, v29.4s, v11.4s +add v29.4s, v29.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v22.s[3] +str q14, [x0, #544] +mla v27.4S, v18.4S, v31.s[0] +mul v21.4S, v21.4S,v23.s[3] +sub v18.4s, v19.4s, v12.4s +add v19.4s, v19.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v22.s[2] +str q26, [x0, #672] +mla v21.4S, v11.4S, v31.s[0] +mul v3.4S, v3.4S,v23.s[2] +sub v11.4s, v15.4s, v27.4s +add v15.4s, v15.4s, v27.4s +mla v3.4S, v12.4S, v31.s[0] +sub v12.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +sub v21.4s, v29.4s, v3.4s +add v29.4s, v29.4s, v3.4s +str q19, [x0, #48] +str q18, [x0, #176] +str q15, [x0, #304] +str q11, [x0, #432] +str q0, [x0, #816] +str q12, [x0, #944] +str q29, [x0, #560] +str q21, [x0, #688] +ldr q4, [x0, #112] +ldr q5, [x0, #96] +ldr q6, [x0, #64] +ldr q7, [x0, #80] +ldr q8, [x0, #48] +ldr q9, [x17, #+64] +ldr q10, [x17, #+80] +ldr q17, [x17, #+96] +ldr q20, [x17, #+112] +ldr q13, [x0, #32] +ldr q2, [x0, #0] +ldr q28, [x0, #16] +sqrdmulh v30.4S, v4.4S, v10.s[0] +mul v4.4S, v4.4S,v9.s[0] +sqrdmulh v1.4S, v5.4S, v10.s[0] +mla v4.4S, v30.4S, v31.s[0] +mul v5.4S, v5.4S,v9.s[0] +sqrdmulh v30.4S, v6.4S, v10.s[0] +mla v5.4S, v1.4S, v31.s[0] +ldr q1, [x0, #240] +mul v6.4S, v6.4S,v9.s[0] +sub v16.4s, v8.4s, v4.4s +add v8.4s, v8.4s, v4.4s +sqrdmulh v4.4S, v7.4S, v10.s[0] +mla v6.4S, v30.4S, v31.s[0] +ldr q30, [x0, #224] +mul v7.4S, v7.4S,v9.s[0] +sub v14.4s, v13.4s, v5.4s +add v13.4s, v13.4s, v5.4s +sqrdmulh v5.4S, v8.4S, v10.s[1] +mla v7.4S, v4.4S, v31.s[0] +ldr q4, [x0, #192] +mul v8.4S, v8.4S,v9.s[1] +sub v26.4s, v2.4s, v6.4s +add v2.4s, v2.4s, v6.4s +sqrdmulh v6.4S, v13.4S, v10.s[1] +mla v8.4S, v5.4S, v31.s[0] +ldr q5, [x0, #208] +mul v13.4S, v13.4S,v9.s[1] +sub v27.4s, v28.4s, v7.4s +add v28.4s, v28.4s, v7.4s 
+sqrdmulh v7.4S, v16.4S, v10.s[2] +mla v13.4S, v6.4S, v31.s[0] +ldr q6, [x0, #176] +mul v16.4S, v16.4S,v9.s[2] +sub v3.4s, v28.4s, v8.4s +add v28.4s, v28.4s, v8.4s +ldr q8, [x17, #+128] +ldr q25, [x17, #+144] +ldr q24, [x17, #+160] +ldr q23, [x17, #+176] +sqrdmulh v22.4S, v14.4S, v10.s[2] +mla v16.4S, v7.4S, v31.s[0] +ldr q7, [x0, #160] +mul v14.4S, v14.4S,v9.s[2] +sub v19.4s, v2.4s, v13.4s +add v2.4s, v2.4s, v13.4s +sqrdmulh v13.4S, v28.4S, v20.s[0] +mla v14.4S, v22.4S, v31.s[0] +ldr q22, [x0, #128] +mul v28.4S, v28.4S,v17.s[0] +sub v18.4s, v27.4s, v16.4s +add v27.4s, v27.4s, v16.4s +sqrdmulh v16.4S, v3.4S, v20.s[1] +mla v28.4S, v13.4S, v31.s[0] +ldr q13, [x0, #144] +mul v3.4S, v3.4S,v17.s[1] +sub v15.4s, v26.4s, v14.4s +add v26.4s, v26.4s, v14.4s +sqrdmulh v14.4S, v18.4S, v20.s[3] +mla v3.4S, v16.4S, v31.s[0] +mul v18.4S, v18.4S,v17.s[3] +sub v16.4s, v2.4s, v28.4s +add v2.4s, v2.4s, v28.4s +sqrdmulh v28.4S, v27.4S, v20.s[2] +mla v18.4S, v14.4S, v31.s[0] +mul v27.4S, v27.4S,v17.s[2] +sub v14.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sqrdmulh v3.4S, v1.4S, v25.s[0] +mla v27.4S, v28.4S, v31.s[0] +mul v1.4S, v1.4S,v8.s[0] +sub v28.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sqrdmulh v18.4S, v30.4S, v25.s[0] +mla v1.4S, v3.4S, v31.s[0] +mul v30.4S, v30.4S,v8.s[0] +sub v3.4s, v26.4s, v27.4s +add v26.4s, v26.4s, v27.4s +sqrdmulh v20.4S, v4.4S, v25.s[0] +mla v30.4S, v18.4S, v31.s[0] +ldr q18, [x0, #368] +mul v4.4S, v4.4S,v8.s[0] +sub v17.4s, v6.4s, v1.4s +add v6.4s, v6.4s, v1.4s +sqrdmulh v1.4S, v5.4S, v25.s[0] +mla v4.4S, v20.4S, v31.s[0] +ldr q20, [x0, #352] +mul v5.4S, v5.4S,v8.s[0] +sub v10.4s, v7.4s, v30.4s +add v7.4s, v7.4s, v30.4s +sqrdmulh v30.4S, v6.4S, v25.s[1] +str q2, [x0, #0] +mla v5.4S, v1.4S, v31.s[0] +ldr q1, [x0, #320] +mul v6.4S, v6.4S,v8.s[1] +sub v2.4s, v22.4s, v4.4s +add v22.4s, v22.4s, v4.4s +sqrdmulh v4.4S, v7.4S, v25.s[1] +str q16, [x0, #16] +mla v6.4S, v30.4S, v31.s[0] +ldr q30, [x0, #336] +mul v7.4S, v7.4S,v8.s[1] +sub v16.4s, v13.4s, 
v5.4s +add v13.4s, v13.4s, v5.4s +sqrdmulh v5.4S, v17.4S, v25.s[2] +str q19, [x0, #32] +mla v7.4S, v4.4S, v31.s[0] +ldr q4, [x0, #304] +mul v17.4S, v17.4S,v8.s[2] +sub v19.4s, v13.4s, v6.4s +add v13.4s, v13.4s, v6.4s +ldr q6, [x17, #+192] +ldr q9, [x17, #+208] +ldr q27, [x17, #+224] +ldr q11, [x17, #+240] +sqrdmulh v0.4S, v10.4S, v25.s[2] +str q14, [x0, #48] +mla v17.4S, v5.4S, v31.s[0] +ldr q5, [x0, #288] +mul v10.4S, v10.4S,v8.s[2] +sub v14.4s, v22.4s, v7.4s +add v22.4s, v22.4s, v7.4s +sqrdmulh v7.4S, v13.4S, v23.s[0] +str q15, [x0, #96] +mla v10.4S, v0.4S, v31.s[0] +ldr q0, [x0, #256] +mul v13.4S, v13.4S,v24.s[0] +sub v15.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v19.4S, v23.s[1] +str q28, [x0, #112] +mla v13.4S, v7.4S, v31.s[0] +ldr q7, [x0, #272] +mul v19.4S, v19.4S,v24.s[1] +sub v28.4s, v2.4s, v10.4s +add v2.4s, v2.4s, v10.4s +sqrdmulh v10.4S, v15.4S, v23.s[3] +str q26, [x0, #64] +mla v19.4S, v17.4S, v31.s[0] +mul v15.4S, v15.4S,v24.s[3] +sub v17.4s, v22.4s, v13.4s +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v23.s[2] +str q3, [x0, #80] +mla v15.4S, v10.4S, v31.s[0] +mul v16.4S, v16.4S,v24.s[2] +sub v10.4s, v14.4s, v19.4s +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v9.s[0] +mla v16.4S, v13.4S, v31.s[0] +mul v18.4S, v18.4S,v6.s[0] +sub v13.4s, v28.4s, v15.4s +add v28.4s, v28.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v9.s[0] +mla v18.4S, v19.4S, v31.s[0] +mul v20.4S, v20.4S,v6.s[0] +sub v19.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v23.4S, v1.4S, v9.s[0] +mla v20.4S, v15.4S, v31.s[0] +ldr q15, [x0, #496] +mul v1.4S, v1.4S,v6.s[0] +sub v24.4s, v4.4s, v18.4s +add v4.4s, v4.4s, v18.4s +sqrdmulh v18.4S, v30.4S, v9.s[0] +mla v1.4S, v23.4S, v31.s[0] +ldr q23, [x0, #480] +mul v30.4S, v30.4S,v6.s[0] +sub v25.4s, v5.4s, v20.4s +add v5.4s, v5.4s, v20.4s +sqrdmulh v20.4S, v4.4S, v9.s[1] +str q22, [x0, #128] +mla v30.4S, v18.4S, v31.s[0] +ldr q18, [x0, #448] +mul v4.4S, v4.4S,v6.s[1] +sub v22.4s, v0.4s, v1.4s +add v0.4s, 
v0.4s, v1.4s +sqrdmulh v1.4S, v5.4S, v9.s[1] +str q17, [x0, #144] +mla v4.4S, v20.4S, v31.s[0] +ldr q20, [x0, #464] +mul v5.4S, v5.4S,v6.s[1] +sub v17.4s, v7.4s, v30.4s +add v7.4s, v7.4s, v30.4s +sqrdmulh v30.4S, v24.4S, v9.s[2] +str q14, [x0, #160] +mla v5.4S, v1.4S, v31.s[0] +ldr q1, [x0, #432] +mul v24.4S, v24.4S,v6.s[2] +sub v14.4s, v7.4s, v4.4s +add v7.4s, v7.4s, v4.4s +ldr q4, [x17, #+256] +ldr q8, [x17, #+272] +ldr q16, [x17, #+288] +ldr q3, [x17, #+304] +sqrdmulh v26.4S, v25.4S, v9.s[2] +str q10, [x0, #176] +mla v24.4S, v30.4S, v31.s[0] +ldr q30, [x0, #416] +mul v25.4S, v25.4S,v6.s[2] +sub v10.4s, v0.4s, v5.4s +add v0.4s, v0.4s, v5.4s +sqrdmulh v5.4S, v7.4S, v11.s[0] +str q28, [x0, #224] +mla v25.4S, v26.4S, v31.s[0] +ldr q26, [x0, #384] +mul v7.4S, v7.4S,v27.s[0] +sub v28.4s, v17.4s, v24.4s +add v17.4s, v17.4s, v24.4s +sqrdmulh v24.4S, v14.4S, v11.s[1] +str q13, [x0, #240] +mla v7.4S, v5.4S, v31.s[0] +ldr q5, [x0, #400] +mul v14.4S, v14.4S,v27.s[1] +sub v13.4s, v22.4s, v25.4s +add v22.4s, v22.4s, v25.4s +sqrdmulh v25.4S, v28.4S, v11.s[3] +str q2, [x0, #192] +mla v14.4S, v24.4S, v31.s[0] +mul v28.4S, v28.4S,v27.s[3] +sub v24.4s, v0.4s, v7.4s +add v0.4s, v0.4s, v7.4s +sqrdmulh v7.4S, v17.4S, v11.s[2] +str q19, [x0, #208] +mla v28.4S, v25.4S, v31.s[0] +mul v17.4S, v17.4S,v27.s[2] +sub v25.4s, v10.4s, v14.4s +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v8.s[0] +mla v17.4S, v7.4S, v31.s[0] +mul v15.4S, v15.4S,v4.s[0] +sub v7.4s, v13.4s, v28.4s +add v13.4s, v13.4s, v28.4s +sqrdmulh v28.4S, v23.4S, v8.s[0] +mla v15.4S, v14.4S, v31.s[0] +mul v23.4S, v23.4S,v4.s[0] +sub v14.4s, v22.4s, v17.4s +add v22.4s, v22.4s, v17.4s +sqrdmulh v11.4S, v18.4S, v8.s[0] +mla v23.4S, v28.4S, v31.s[0] +ldr q28, [x0, #624] +mul v18.4S, v18.4S,v4.s[0] +sub v27.4s, v1.4s, v15.4s +add v1.4s, v1.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v8.s[0] +mla v18.4S, v11.4S, v31.s[0] +ldr q11, [x0, #608] +mul v20.4S, v20.4S,v4.s[0] +sub v9.4s, v30.4s, v23.4s +add v30.4s, v30.4s, v23.4s 
+sqrdmulh v23.4S, v1.4S, v8.s[1] +str q0, [x0, #256] +mla v20.4S, v15.4S, v31.s[0] +ldr q15, [x0, #576] +mul v1.4S, v1.4S,v4.s[1] +sub v0.4s, v26.4s, v18.4s +add v26.4s, v26.4s, v18.4s +sqrdmulh v18.4S, v30.4S, v8.s[1] +str q24, [x0, #272] +mla v1.4S, v23.4S, v31.s[0] +ldr q23, [x0, #592] +mul v30.4S, v30.4S,v4.s[1] +sub v24.4s, v5.4s, v20.4s +add v5.4s, v5.4s, v20.4s +sqrdmulh v20.4S, v27.4S, v8.s[2] +str q10, [x0, #288] +mla v30.4S, v18.4S, v31.s[0] +ldr q18, [x0, #560] +mul v27.4S, v27.4S,v4.s[2] +sub v10.4s, v5.4s, v1.4s +add v5.4s, v5.4s, v1.4s +ldr q1, [x17, #+320] +ldr q6, [x17, #+336] +ldr q17, [x17, #+352] +ldr q19, [x17, #+368] +sqrdmulh v2.4S, v9.4S, v8.s[2] +str q25, [x0, #304] +mla v27.4S, v20.4S, v31.s[0] +ldr q20, [x0, #544] +mul v9.4S, v9.4S,v4.s[2] +sub v25.4s, v26.4s, v30.4s +add v26.4s, v26.4s, v30.4s +sqrdmulh v30.4S, v5.4S, v3.s[0] +str q13, [x0, #352] +mla v9.4S, v2.4S, v31.s[0] +ldr q2, [x0, #512] +mul v5.4S, v5.4S,v16.s[0] +sub v13.4s, v24.4s, v27.4s +add v24.4s, v24.4s, v27.4s +sqrdmulh v27.4S, v10.4S, v3.s[1] +str q7, [x0, #368] +mla v5.4S, v30.4S, v31.s[0] +ldr q30, [x0, #528] +mul v10.4S, v10.4S,v16.s[1] +sub v7.4s, v0.4s, v9.4s +add v0.4s, v0.4s, v9.4s +sqrdmulh v9.4S, v13.4S, v3.s[3] +str q22, [x0, #320] +mla v10.4S, v27.4S, v31.s[0] +mul v13.4S, v13.4S,v16.s[3] +sub v27.4s, v26.4s, v5.4s +add v26.4s, v26.4s, v5.4s +sqrdmulh v5.4S, v24.4S, v3.s[2] +str q14, [x0, #336] +mla v13.4S, v9.4S, v31.s[0] +mul v24.4S, v24.4S,v16.s[2] +sub v9.4s, v25.4s, v10.4s +add v25.4s, v25.4s, v10.4s +sqrdmulh v10.4S, v28.4S, v6.s[0] +mla v24.4S, v5.4S, v31.s[0] +mul v28.4S, v28.4S,v1.s[0] +sub v5.4s, v7.4s, v13.4s +add v7.4s, v7.4s, v13.4s +sqrdmulh v13.4S, v11.4S, v6.s[0] +mla v28.4S, v10.4S, v31.s[0] +mul v11.4S, v11.4S,v1.s[0] +sub v10.4s, v0.4s, v24.4s +add v0.4s, v0.4s, v24.4s +sqrdmulh v3.4S, v15.4S, v6.s[0] +mla v11.4S, v13.4S, v31.s[0] +ldr q13, [x0, #752] +mul v15.4S, v15.4S,v1.s[0] +sub v16.4s, v18.4s, v28.4s +add v18.4s, v18.4s, v28.4s +sqrdmulh 
v28.4S, v23.4S, v6.s[0] +mla v15.4S, v3.4S, v31.s[0] +ldr q3, [x0, #736] +mul v23.4S, v23.4S,v1.s[0] +sub v8.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +sqrdmulh v11.4S, v18.4S, v6.s[1] +str q26, [x0, #384] +mla v23.4S, v28.4S, v31.s[0] +ldr q28, [x0, #704] +mul v18.4S, v18.4S,v1.s[1] +sub v26.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v6.s[1] +str q27, [x0, #400] +mla v18.4S, v11.4S, v31.s[0] +ldr q11, [x0, #720] +mul v20.4S, v20.4S,v1.s[1] +sub v27.4s, v30.4s, v23.4s +add v30.4s, v30.4s, v23.4s +sqrdmulh v23.4S, v16.4S, v6.s[2] +str q25, [x0, #416] +mla v20.4S, v15.4S, v31.s[0] +ldr q15, [x0, #688] +mul v16.4S, v16.4S,v1.s[2] +sub v25.4s, v30.4s, v18.4s +add v30.4s, v30.4s, v18.4s +ldr q18, [x17, #+384] +ldr q4, [x17, #+400] +ldr q24, [x17, #+416] +ldr q14, [x17, #+432] +sqrdmulh v22.4S, v8.4S, v6.s[2] +str q9, [x0, #432] +mla v16.4S, v23.4S, v31.s[0] +ldr q23, [x0, #672] +mul v8.4S, v8.4S,v1.s[2] +sub v9.4s, v2.4s, v20.4s +add v2.4s, v2.4s, v20.4s +sqrdmulh v20.4S, v30.4S, v19.s[0] +str q7, [x0, #480] +mla v8.4S, v22.4S, v31.s[0] +ldr q22, [x0, #640] +mul v30.4S, v30.4S,v17.s[0] +sub v7.4s, v27.4s, v16.4s +add v27.4s, v27.4s, v16.4s +sqrdmulh v16.4S, v25.4S, v19.s[1] +str q5, [x0, #496] +mla v30.4S, v20.4S, v31.s[0] +ldr q20, [x0, #656] +mul v25.4S, v25.4S,v17.s[1] +sub v5.4s, v26.4s, v8.4s +add v26.4s, v26.4s, v8.4s +sqrdmulh v8.4S, v7.4S, v19.s[3] +str q0, [x0, #448] +mla v25.4S, v16.4S, v31.s[0] +mul v7.4S, v7.4S,v17.s[3] +sub v16.4s, v2.4s, v30.4s +add v2.4s, v2.4s, v30.4s +sqrdmulh v30.4S, v27.4S, v19.s[2] +str q10, [x0, #464] +mla v7.4S, v8.4S, v31.s[0] +mul v27.4S, v27.4S,v17.s[2] +sub v8.4s, v9.4s, v25.4s +add v9.4s, v9.4s, v25.4s +sqrdmulh v25.4S, v13.4S, v4.s[0] +mla v27.4S, v30.4S, v31.s[0] +mul v13.4S, v13.4S,v18.s[0] +sub v30.4s, v5.4s, v7.4s +add v5.4s, v5.4s, v7.4s +sqrdmulh v7.4S, v3.4S, v4.s[0] +mla v13.4S, v25.4S, v31.s[0] +mul v3.4S, v3.4S,v18.s[0] +sub v25.4s, v26.4s, v27.4s +add v26.4s, v26.4s, v27.4s +sqrdmulh 
v19.4S, v28.4S, v4.s[0] +mla v3.4S, v7.4S, v31.s[0] +ldr q7, [x0, #880] +mul v28.4S, v28.4S,v18.s[0] +sub v17.4s, v15.4s, v13.4s +add v15.4s, v15.4s, v13.4s +sqrdmulh v13.4S, v11.4S, v4.s[0] +mla v28.4S, v19.4S, v31.s[0] +ldr q19, [x0, #864] +mul v11.4S, v11.4S,v18.s[0] +sub v6.4s, v23.4s, v3.4s +add v23.4s, v23.4s, v3.4s +sqrdmulh v3.4S, v15.4S, v4.s[1] +str q2, [x0, #512] +mla v11.4S, v13.4S, v31.s[0] +ldr q13, [x0, #832] +mul v15.4S, v15.4S,v18.s[1] +sub v2.4s, v22.4s, v28.4s +add v22.4s, v22.4s, v28.4s +sqrdmulh v28.4S, v23.4S, v4.s[1] +str q16, [x0, #528] +mla v15.4S, v3.4S, v31.s[0] +ldr q3, [x0, #848] +mul v23.4S, v23.4S,v18.s[1] +sub v16.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v4.s[2] +str q9, [x0, #544] +mla v23.4S, v28.4S, v31.s[0] +ldr q28, [x0, #816] +mul v17.4S, v17.4S,v18.s[2] +sub v9.4s, v20.4s, v15.4s +add v20.4s, v20.4s, v15.4s +ldr q15, [x17, #+448] +ldr q1, [x17, #+464] +ldr q27, [x17, #+480] +ldr q10, [x17, #+496] +sqrdmulh v0.4S, v6.4S, v4.s[2] +str q8, [x0, #560] +mla v17.4S, v11.4S, v31.s[0] +ldr q11, [x0, #800] +mul v6.4S, v6.4S,v18.s[2] +sub v8.4s, v22.4s, v23.4s +add v22.4s, v22.4s, v23.4s +sqrdmulh v23.4S, v20.4S, v14.s[0] +str q5, [x0, #608] +mla v6.4S, v0.4S, v31.s[0] +ldr q0, [x0, #768] +mul v20.4S, v20.4S,v24.s[0] +sub v5.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v9.4S, v14.s[1] +str q30, [x0, #624] +mla v20.4S, v23.4S, v31.s[0] +ldr q23, [x0, #784] +mul v9.4S, v9.4S,v24.s[1] +sub v30.4s, v2.4s, v6.4s +add v2.4s, v2.4s, v6.4s +sqrdmulh v6.4S, v5.4S, v14.s[3] +str q26, [x0, #576] +mla v9.4S, v17.4S, v31.s[0] +mul v5.4S, v5.4S,v24.s[3] +sub v17.4s, v22.4s, v20.4s +add v22.4s, v22.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v14.s[2] +str q25, [x0, #592] +mla v5.4S, v6.4S, v31.s[0] +mul v16.4S, v16.4S,v24.s[2] +sub v6.4s, v8.4s, v9.4s +add v8.4s, v8.4s, v9.4s +sqrdmulh v9.4S, v7.4S, v1.s[0] +mla v16.4S, v20.4S, v31.s[0] +mul v7.4S, v7.4S,v15.s[0] +sub v20.4s, v30.4s, v5.4s +add v30.4s, 
v30.4s, v5.4s +sqrdmulh v5.4S, v19.4S, v1.s[0] +mla v7.4S, v9.4S, v31.s[0] +mul v19.4S, v19.4S,v15.s[0] +sub v9.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v14.4S, v13.4S, v1.s[0] +mla v19.4S, v5.4S, v31.s[0] +ldr q5, [x0, #1008] +mul v13.4S, v13.4S,v15.s[0] +sub v24.4s, v28.4s, v7.4s +add v28.4s, v28.4s, v7.4s +sqrdmulh v7.4S, v3.4S, v1.s[0] +mla v13.4S, v14.4S, v31.s[0] +ldr q14, [x0, #992] +mul v3.4S, v3.4S,v15.s[0] +sub v4.4s, v11.4s, v19.4s +add v11.4s, v11.4s, v19.4s +sqrdmulh v19.4S, v28.4S, v1.s[1] +str q22, [x0, #640] +mla v3.4S, v7.4S, v31.s[0] +ldr q7, [x0, #960] +mul v28.4S, v28.4S,v15.s[1] +sub v22.4s, v0.4s, v13.4s +add v0.4s, v0.4s, v13.4s +sqrdmulh v13.4S, v11.4S, v1.s[1] +str q17, [x0, #656] +mla v28.4S, v19.4S, v31.s[0] +ldr q19, [x0, #976] +mul v11.4S, v11.4S,v15.s[1] +sub v17.4s, v23.4s, v3.4s +add v23.4s, v23.4s, v3.4s +sqrdmulh v3.4S, v24.4S, v1.s[2] +str q8, [x0, #672] +mla v11.4S, v13.4S, v31.s[0] +ldr q13, [x0, #944] +mul v24.4S, v24.4S,v15.s[2] +sub v8.4s, v23.4s, v28.4s +add v23.4s, v23.4s, v28.4s +ldr q28, [x17, #+512] +ldr q18, [x17, #+528] +ldr q16, [x17, #+544] +ldr q25, [x17, #+560] +sqrdmulh v26.4S, v4.4S, v1.s[2] +str q6, [x0, #688] +mla v24.4S, v3.4S, v31.s[0] +ldr q3, [x0, #928] +mul v4.4S, v4.4S,v15.s[2] +sub v6.4s, v0.4s, v11.4s +add v0.4s, v0.4s, v11.4s +sqrdmulh v11.4S, v23.4S, v10.s[0] +str q30, [x0, #736] +mla v4.4S, v26.4S, v31.s[0] +ldr q26, [x0, #896] +mul v23.4S, v23.4S,v27.s[0] +sub v30.4s, v17.4s, v24.4s +add v17.4s, v17.4s, v24.4s +sqrdmulh v24.4S, v8.4S, v10.s[1] +str q20, [x0, #752] +mla v23.4S, v11.4S, v31.s[0] +ldr q11, [x0, #912] +mul v8.4S, v8.4S,v27.s[1] +sub v20.4s, v22.4s, v4.4s +add v22.4s, v22.4s, v4.4s +sqrdmulh v4.4S, v30.4S, v10.s[3] +str q2, [x0, #704] +mla v8.4S, v24.4S, v31.s[0] +mul v30.4S, v30.4S,v27.s[3] +sub v24.4s, v0.4s, v23.4s +add v0.4s, v0.4s, v23.4s +sqrdmulh v23.4S, v17.4S, v10.s[2] +str q9, [x0, #720] +mla v30.4S, v4.4S, v31.s[0] +mul v17.4S, v17.4S,v27.s[2] +sub v4.4s, v6.4s, 
v8.4s +add v6.4s, v6.4s, v8.4s +sqrdmulh v8.4S, v5.4S, v18.s[0] +mla v17.4S, v23.4S, v31.s[0] +mul v5.4S, v5.4S,v28.s[0] +sub v23.4s, v20.4s, v30.4s +add v20.4s, v20.4s, v30.4s +sqrdmulh v30.4S, v14.4S, v18.s[0] +mla v5.4S, v8.4S, v31.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v8.4s, v22.4s, v17.4s +add v22.4s, v22.4s, v17.4s +sqrdmulh v10.4S, v7.4S, v18.s[0] +mla v14.4S, v30.4S, v31.s[0] +mul v7.4S, v7.4S,v28.s[0] +sub v30.4s, v13.4s, v5.4s +add v13.4s, v13.4s, v5.4s +sqrdmulh v5.4S, v19.4S, v18.s[0] +mla v7.4S, v10.4S, v31.s[0] +mul v19.4S, v19.4S,v28.s[0] +sub v10.4s, v3.4s, v14.4s +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v13.4S, v18.s[1] +str q0, [x0, #768] +mla v19.4S, v5.4S, v31.s[0] +mul v13.4S, v13.4S,v28.s[1] +sub v5.4s, v26.4s, v7.4s +add v26.4s, v26.4s, v7.4s +sqrdmulh v7.4S, v3.4S, v18.s[1] +str q24, [x0, #784] +mla v13.4S, v14.4S, v31.s[0] +mul v3.4S, v3.4S,v28.s[1] +sub v14.4s, v11.4s, v19.4s +add v11.4s, v11.4s, v19.4s +sqrdmulh v19.4S, v30.4S, v18.s[2] +str q6, [x0, #800] +mla v3.4S, v7.4S, v31.s[0] +mul v30.4S, v30.4S,v28.s[2] +sub v7.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v10.4S, v18.s[2] +str q4, [x0, #816] +mla v30.4S, v19.4S, v31.s[0] +mul v10.4S, v10.4S,v28.s[2] +sub v19.4s, v26.4s, v3.4s +add v26.4s, v26.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v25.s[0] +str q20, [x0, #864] +mla v10.4S, v13.4S, v31.s[0] +mul v11.4S, v11.4S,v16.s[0] +sub v13.4s, v14.4s, v30.4s +add v14.4s, v14.4s, v30.4s +sqrdmulh v30.4S, v7.4S, v25.s[1] +str q23, [x0, #880] +mla v11.4S, v3.4S, v31.s[0] +mul v7.4S, v7.4S,v16.s[1] +sub v3.4s, v5.4s, v10.4s +add v5.4s, v5.4s, v10.4s +sqrdmulh v10.4S, v13.4S, v25.s[3] +str q22, [x0, #832] +mla v7.4S, v30.4S, v31.s[0] +mul v13.4S, v13.4S,v16.s[3] +sub v30.4s, v26.4s, v11.4s +add v26.4s, v26.4s, v11.4s +sqrdmulh v11.4S, v14.4S, v25.s[2] +str q8, [x0, #848] +mla v13.4S, v10.4S, v31.s[0] +mul v14.4S, v14.4S,v16.s[2] +sub v10.4s, v19.4s, v7.4s +add v19.4s, v19.4s, v7.4s +mla v14.4S, v11.4S, v31.s[0] +sub v11.4s, 
v3.4s, v13.4s +add v3.4s, v3.4s, v13.4s +sub v13.4s, v5.4s, v14.4s +add v5.4s, v5.4s, v14.4s +str q26, [x0, #896] +str q30, [x0, #912] +str q19, [x0, #928] +str q10, [x0, #944] +str q3, [x0, #992] +str q11, [x0, #1008] +str q5, [x0, #960] +str q13, [x0, #976] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1444 +// Instruction count: 1440 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_5.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_5.s new file mode 100644 index 0000000..2225ee5 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_5.s @@ -0,0 +1,1474 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: // 16-byte constant block; the routine below loads it into q31 via ASM_LOAD and ldr +.word -33556993 // lane 0: -q with q = 33556993; referenced as v31.s[0] by the mla instructions of the routine below +.word 0 // lanes 1-3: zero padding up to the 16 bytes read by ldr q31 +.word 0 +.word 0 +.align 6 // power-of-two form: aligns the table below to 2^6 = 64 bytes +roots_merged: // Twiddle table in rows of 4 words (16 bytes, one q register); rows come in pairs that repeat the same Layer/block tags. In the visible code rows 0 and 2 are the by-element operands of mul, rows 1 and 3 those of sqrdmulh. Entries tagged Layer None are zero padding. NOTE(review): each second row looks like round(root * 2^31 / q) for the row above; TODO confirm against the generator. +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 23825509 // Layer 4, block 0 +.word 27028662 // Layer 4, block 1 +.word 0 // Layer None, block None +.word 1307297022 // Layer 3, block 0 +.word 1524716204 // Layer 4, block 0 +.word 1729702351 // Layer 4, block 1 +.word 0 // Layer None, block None +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 14626653 // Layer 3, block 1 +.word 14833295 // Layer 4, block 2 +.word 2138810 // Layer 4, block 3 +.word 0 // Layer None, block None +.word 936034350 // Layer 3, block 1 +.word 949258429 // Layer 4, block 2 +.word 136873393 // Layer 4, block 3 +.word 0 // Layer None, block None +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 5705868
// Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 29737761 // Layer 3, block 2 +.word 6490403 // Layer 4, block 4 +.word 19648405 // Layer 4, block 5 +.word 0 // Layer None, block None +.word 1903071454 // Layer 3, block 2 +.word 415354091 // Layer 4, block 4 +.word 1257401950 // Layer 4, block 5 +.word 0 // Layer None, block None +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 30285189 // Layer 3, block 3 +.word 31254932 // Layer 4, block 6 +.word 26362414 // Layer 4, block 7 +.word 0 // Layer None, block None +.word 1938104173 // Layer 3, block 3 +.word 2000162988 // Layer 4, block 6 +.word 1687065733 // Layer 4, block 7 +.word 0 // Layer None, block None +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 21289485 // Layer 3, block 4 +.word 572895 // Layer 4, block 8 +.word 26691971 // Layer 4, block 9 +.word 0 // Layer None, block None +.word 1362423055 // Layer 3, block 4 +.word 36662482 // Layer 4, block 8 +.word 1708155771 // Layer 4, block 9 +.word 0 // Layer None, block None +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 9914896 // Layer
3, block 5 +.word 9249292 // Layer 4, block 10 +.word 29292862 // Layer 4, block 11 +.word 0 // Layer None, block None +.word 634504916 // Layer 3, block 5 +.word 591909511 // Layer 4, block 10 +.word 1874600091 // Layer 4, block 11 +.word 0 // Layer None, block None +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 22603682 // Layer 3, block 6 +.word 8247799 // Layer 4, block 12 +.word 5086187 // Layer 4, block 13 +.word 0 // Layer None, block None +.word 1446525244 // Layer 3, block 6 +.word 527818851 // Layer 4, block 12 +.word 325491125 // Layer 4, block 13 +.word 0 // Layer None, block None +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 16204162 // Layer 3, block 7 +.word 28113639 // Layer 4, block 14 +.word 8471290 // Layer 4, block 15 +.word 0 // Layer None, block None +.word 1036987221 // Layer 3, block 7 +.word 1799135579 // Layer 4, block 14 +.word 542121183 // Layer 4, block 15 +.word 0 // Layer None, block None +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.text +.global ntt_u32_incomplete_neon_asm_var_3_3_5 +.global _ntt_u32_incomplete_neon_asm_var_3_3_5 +ntt_u32_incomplete_neon_asm_var_3_3_5: +_ntt_u32_incomplete_neon_asm_var_3_3_5: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0]
+stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x0, #960] +ldr q29, [x0, #832] +ldr q28, [x0, #576] +ldr q27, [x0, #704] +ldr q26, [x0, #448] +ldr q25, [x17, #+0] +ldr q24, [x17, #+16] +ldr q23, [x17, #+32] +ldr q22, [x17, #+48] +ldr q21, [x0, #320] +ldr q20, [x0, #64] +ldr q19, [x0, #192] +sqrdmulh v18.4S, v30.4S, v24.s[0] +sqrdmulh v17.4S, v29.4S, v24.s[0] +mul v30.4S, v30.4S,v25.s[0] +mla v30.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v28.4S, v24.s[0] +ldr q16, [x0, #976] +mul v29.4S, v29.4S,v25.s[0] +mla v29.4S, v17.4S, v31.s[0] +sub v17.4s, v26.4s, v30.4s +add v26.4s, v26.4s, v30.4s +sqrdmulh v30.4S, v27.4S, v24.s[0] +ldr q3, [x0, #848] +mul v28.4S, v28.4S,v25.s[0] +mla v28.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v29.4s +add v21.4s, v21.4s, v29.4s +sqrdmulh v29.4S, v26.4S, v24.s[1] +ldr q2, [x0, #592] +mul v27.4S, v27.4S,v25.s[0] +mla v27.4S, v30.4S, v31.s[0] +sub v30.4s, v20.4s, v28.4s +add v20.4s, v20.4s, v28.4s +sqrdmulh v28.4S, v21.4S, v24.s[1] +ldr q1, [x0, #720] +mul v26.4S, v26.4S,v25.s[1] +mla v26.4S, v29.4S, v31.s[0] +sub v29.4s, v19.4s, v27.4s +add v19.4s, v19.4s, v27.4s +sqrdmulh v27.4S, v17.4S, v24.s[2] +ldr q0, [x0, #464] +mul v21.4S, v21.4S,v25.s[1] +mla v21.4S, v28.4S, v31.s[0] +sub v28.4s, v19.4s, v26.4s +add v19.4s, v19.4s, v26.4s +sqrdmulh v26.4S, v18.4S, v24.s[2] +ldr q15, [x0, #336] +mul v17.4S, v17.4S,v25.s[2] +mla v17.4S, v27.4S, v31.s[0] +sub v27.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v22.s[0] +ldr q14, [x0, #80] +mul v18.4S, v18.4S,v25.s[2] +mla v18.4S, v26.4S, v31.s[0] +sub v26.4s, v29.4s, v17.4s +add v29.4s, v29.4s, v17.4s +sqrdmulh v17.4S, v28.4S, v22.s[1] +ldr 
q13, [x0, #208] +mul v19.4S, v19.4S,v23.s[0] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v30.4s, v18.4s +add v30.4s, v30.4s, v18.4s +sqrdmulh v18.4S, v26.4S, v22.s[3] +mul v28.4S, v28.4S,v23.s[1] +mla v28.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +sqrdmulh v19.4S, v29.4S, v22.s[2] +mul v26.4S, v26.4S,v23.s[3] +mla v26.4S, v18.4S, v31.s[0] +sub v18.4s, v27.4s, v28.4s +add v27.4s, v27.4s, v28.4s +sqrdmulh v28.4S, v16.4S, v24.s[0] +mul v29.4S, v29.4S,v23.s[2] +mla v29.4S, v19.4S, v31.s[0] +sub v19.4s, v21.4s, v26.4s +add v21.4s, v21.4s, v26.4s +sqrdmulh v26.4S, v3.4S, v24.s[0] +mul v16.4S, v16.4S,v25.s[0] +mla v16.4S, v28.4S, v31.s[0] +sub v28.4s, v30.4s, v29.4s +add v30.4s, v30.4s, v29.4s +sqrdmulh v29.4S, v2.4S, v24.s[0] +ldr q12, [x0, #992] +mul v3.4S, v3.4S,v25.s[0] +mla v3.4S, v26.4S, v31.s[0] +sub v26.4s, v0.4s, v16.4s +add v0.4s, v0.4s, v16.4s +sqrdmulh v16.4S, v1.4S, v24.s[0] +ldr q11, [x0, #864] +mul v2.4S, v2.4S,v25.s[0] +mla v2.4S, v29.4S, v31.s[0] +sub v29.4s, v15.4s, v3.4s +add v15.4s, v15.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v24.s[1] +str q20, [x0, #64] +ldr q20, [x0, #608] +mul v1.4S, v1.4S,v25.s[0] +mla v1.4S, v16.4S, v31.s[0] +sub v16.4s, v14.4s, v2.4s +add v14.4s, v14.4s, v2.4s +sqrdmulh v2.4S, v15.4S, v24.s[1] +str q17, [x0, #192] +ldr q17, [x0, #736] +mul v0.4S, v0.4S,v25.s[1] +mla v0.4S, v3.4S, v31.s[0] +sub v3.4s, v13.4s, v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v26.4S, v24.s[2] +str q27, [x0, #320] +ldr q27, [x0, #480] +mul v15.4S, v15.4S,v25.s[1] +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v13.4s, v0.4s +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v29.4S, v24.s[2] +str q18, [x0, #448] +ldr q18, [x0, #352] +mul v26.4S, v26.4S,v25.s[2] +mla v26.4S, v1.4S, v31.s[0] +sub v1.4s, v14.4s, v15.4s +add v14.4s, v14.4s, v15.4s +sqrdmulh v15.4S, v13.4S, v22.s[0] +str q21, [x0, #832] +ldr q21, [x0, #96] +mul v29.4S, v29.4S,v25.s[2] +mla v29.4S, v0.4S, v31.s[0] +sub v0.4s, v3.4s, v26.4s +add v3.4s, v3.4s, v26.4s +sqrdmulh 
v26.4S, v2.4S, v22.s[1] +str q19, [x0, #960] +ldr q19, [x0, #224] +mul v13.4S, v13.4S,v23.s[0] +mla v13.4S, v15.4S, v31.s[0] +sub v15.4s, v16.4s, v29.4s +add v16.4s, v16.4s, v29.4s +sqrdmulh v29.4S, v0.4S, v22.s[3] +str q30, [x0, #576] +mul v2.4S, v2.4S,v23.s[1] +mla v2.4S, v26.4S, v31.s[0] +sub v26.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +sqrdmulh v13.4S, v3.4S, v22.s[2] +str q28, [x0, #704] +mul v0.4S, v0.4S,v23.s[3] +mla v0.4S, v29.4S, v31.s[0] +sub v29.4s, v1.4s, v2.4s +add v1.4s, v1.4s, v2.4s +sqrdmulh v2.4S, v12.4S, v24.s[0] +mul v3.4S, v3.4S,v23.s[2] +mla v3.4S, v13.4S, v31.s[0] +sub v13.4s, v15.4s, v0.4s +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v11.4S, v24.s[0] +mul v12.4S, v12.4S,v25.s[0] +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v20.4S, v24.s[0] +ldr q28, [x0, #1008] +mul v11.4S, v11.4S,v25.s[0] +mla v11.4S, v0.4S, v31.s[0] +sub v0.4s, v27.4s, v12.4s +add v27.4s, v27.4s, v12.4s +sqrdmulh v12.4S, v17.4S, v24.s[0] +ldr q30, [x0, #880] +mul v20.4S, v20.4S,v25.s[0] +mla v20.4S, v3.4S, v31.s[0] +sub v3.4s, v18.4s, v11.4s +add v18.4s, v18.4s, v11.4s +sqrdmulh v11.4S, v27.4S, v24.s[1] +str q14, [x0, #80] +ldr q14, [x0, #624] +mul v17.4S, v17.4S,v25.s[0] +mla v17.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v18.4S, v24.s[1] +str q26, [x0, #208] +ldr q26, [x0, #752] +mul v27.4S, v27.4S,v25.s[1] +mla v27.4S, v11.4S, v31.s[0] +sub v11.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v24.s[2] +str q1, [x0, #336] +ldr q1, [x0, #496] +mul v18.4S, v18.4S,v25.s[1] +mla v18.4S, v20.4S, v31.s[0] +sub v20.4s, v19.4s, v27.4s +add v19.4s, v19.4s, v27.4s +sqrdmulh v27.4S, v3.4S, v24.s[2] +str q29, [x0, #464] +ldr q29, [x0, #368] +mul v0.4S, v0.4S,v25.s[2] +mla v0.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v18.4s +add v21.4s, v21.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v22.s[0] +str q15, [x0, #848] +ldr q15, [x0, #112] +mul v3.4S, 
v3.4S,v25.s[2] +mla v3.4S, v27.4S, v31.s[0] +sub v27.4s, v11.4s, v0.4s +add v11.4s, v11.4s, v0.4s +sqrdmulh v0.4S, v20.4S, v22.s[1] +str q13, [x0, #976] +ldr q13, [x0, #240] +mul v19.4S, v19.4S,v23.s[0] +mla v19.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v3.4s +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v27.4S, v22.s[3] +str q16, [x0, #592] +mul v20.4S, v20.4S,v23.s[1] +mla v20.4S, v0.4S, v31.s[0] +sub v0.4s, v21.4s, v19.4s +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v11.4S, v22.s[2] +str q2, [x0, #720] +mul v27.4S, v27.4S,v23.s[3] +mla v27.4S, v3.4S, v31.s[0] +sub v3.4s, v17.4s, v20.4s +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v28.4S, v24.s[0] +mul v11.4S, v11.4S,v23.s[2] +mla v11.4S, v19.4S, v31.s[0] +sub v19.4s, v18.4s, v27.4s +add v18.4s, v18.4s, v27.4s +sqrdmulh v27.4S, v30.4S, v24.s[0] +mul v28.4S, v28.4S,v25.s[0] +mla v28.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v11.4s +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v14.4S, v24.s[0] +ldr q2, [x0, #896] +mul v30.4S, v30.4S,v25.s[0] +mla v30.4S, v27.4S, v31.s[0] +sub v27.4s, v1.4s, v28.4s +add v1.4s, v1.4s, v28.4s +sqrdmulh v28.4S, v26.4S, v24.s[0] +ldr q16, [x0, #768] +mul v14.4S, v14.4S,v25.s[0] +mla v14.4S, v11.4S, v31.s[0] +sub v11.4s, v29.4s, v30.4s +add v29.4s, v29.4s, v30.4s +sqrdmulh v30.4S, v1.4S, v24.s[1] +str q21, [x0, #96] +ldr q21, [x0, #512] +mul v26.4S, v26.4S,v25.s[0] +mla v26.4S, v28.4S, v31.s[0] +sub v28.4s, v15.4s, v14.4s +add v15.4s, v15.4s, v14.4s +sqrdmulh v14.4S, v29.4S, v24.s[1] +str q0, [x0, #224] +ldr q0, [x0, #640] +mul v1.4S, v1.4S,v25.s[1] +mla v1.4S, v30.4S, v31.s[0] +sub v30.4s, v13.4s, v26.4s +add v13.4s, v13.4s, v26.4s +sqrdmulh v26.4S, v27.4S, v24.s[2] +str q17, [x0, #352] +ldr q17, [x0, #384] +mul v29.4S, v29.4S,v25.s[1] +mla v29.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v11.4S, v24.s[2] +str q3, [x0, #480] +ldr q3, [x0, #256] +mul v27.4S, v27.4S,v25.s[2] +mla v27.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v29.4s +add 
v15.4s, v15.4s, v29.4s +sqrdmulh v29.4S, v13.4S, v22.s[0] +str q18, [x0, #864] +ldr q18, [x0, #0] +mul v11.4S, v11.4S,v25.s[2] +mla v11.4S, v1.4S, v31.s[0] +sub v1.4s, v30.4s, v27.4s +add v30.4s, v30.4s, v27.4s +sqrdmulh v27.4S, v14.4S, v22.s[1] +str q19, [x0, #992] +ldr q19, [x0, #128] +mul v13.4S, v13.4S,v23.s[0] +mla v13.4S, v29.4S, v31.s[0] +sub v29.4s, v28.4s, v11.4s +add v28.4s, v28.4s, v11.4s +sqrdmulh v11.4S, v1.4S, v22.s[3] +str q12, [x0, #608] +mul v14.4S, v14.4S,v23.s[1] +mla v14.4S, v27.4S, v31.s[0] +sub v27.4s, v15.4s, v13.4s +add v15.4s, v15.4s, v13.4s +sqrdmulh v13.4S, v30.4S, v22.s[2] +str q20, [x0, #736] +mul v1.4S, v1.4S,v23.s[3] +mla v1.4S, v11.4S, v31.s[0] +sub v11.4s, v26.4s, v14.4s +add v26.4s, v26.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v24.s[0] +mul v30.4S, v30.4S,v23.s[2] +mla v30.4S, v13.4S, v31.s[0] +sub v13.4s, v29.4s, v1.4s +add v29.4s, v29.4s, v1.4s +sqrdmulh v1.4S, v16.4S, v24.s[0] +mul v2.4S, v2.4S,v25.s[0] +mla v2.4S, v14.4S, v31.s[0] +sub v14.4s, v28.4s, v30.4s +add v28.4s, v28.4s, v30.4s +sqrdmulh v30.4S, v21.4S, v24.s[0] +ldr q20, [x0, #912] +mul v16.4S, v16.4S,v25.s[0] +mla v16.4S, v1.4S, v31.s[0] +sub v1.4s, v17.4s, v2.4s +add v17.4s, v17.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v24.s[0] +ldr q12, [x0, #784] +mul v21.4S, v21.4S,v25.s[0] +mla v21.4S, v30.4S, v31.s[0] +sub v30.4s, v3.4s, v16.4s +add v3.4s, v3.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v24.s[1] +str q15, [x0, #112] +ldr q15, [x0, #528] +mul v0.4S, v0.4S,v25.s[0] +mla v0.4S, v2.4S, v31.s[0] +sub v2.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v3.4S, v24.s[1] +str q27, [x0, #240] +ldr q27, [x0, #656] +mul v17.4S, v17.4S,v25.s[1] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v19.4s, v0.4s +add v19.4s, v19.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v24.s[2] +str q26, [x0, #368] +ldr q26, [x0, #400] +mul v3.4S, v3.4S,v25.s[1] +mla v3.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +sqrdmulh v17.4S, v30.4S, v24.s[2] +str q11, [x0, #496] +ldr q11, 
[x0, #272] +mul v1.4S, v1.4S,v25.s[2] +mla v1.4S, v0.4S, v31.s[0] +sub v0.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v19.4S, v22.s[0] +str q29, [x0, #880] +ldr q29, [x0, #16] +mul v30.4S, v30.4S,v25.s[2] +mla v30.4S, v17.4S, v31.s[0] +sub v17.4s, v16.4s, v1.4s +add v16.4s, v16.4s, v1.4s +sqrdmulh v1.4S, v21.4S, v22.s[1] +str q13, [x0, #1008] +ldr q13, [x0, #144] +mul v19.4S, v19.4S,v23.s[0] +mla v19.4S, v3.4S, v31.s[0] +sub v3.4s, v2.4s, v30.4s +add v2.4s, v2.4s, v30.4s +sqrdmulh v30.4S, v17.4S, v22.s[3] +str q28, [x0, #624] +mul v21.4S, v21.4S,v23.s[1] +mla v21.4S, v1.4S, v31.s[0] +sub v1.4s, v18.4s, v19.4s +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v22.s[2] +str q14, [x0, #752] +mul v17.4S, v17.4S,v23.s[3] +mla v17.4S, v30.4S, v31.s[0] +sub v30.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +sqrdmulh v21.4S, v20.4S, v24.s[0] +mul v16.4S, v16.4S,v23.s[2] +mla v16.4S, v19.4S, v31.s[0] +sub v19.4s, v3.4s, v17.4s +add v3.4s, v3.4s, v17.4s +sqrdmulh v17.4S, v12.4S, v24.s[0] +mul v20.4S, v20.4S,v25.s[0] +mla v20.4S, v21.4S, v31.s[0] +sub v21.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v15.4S, v24.s[0] +ldr q14, [x0, #928] +mul v12.4S, v12.4S,v25.s[0] +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v26.4s, v20.4s +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v27.4S, v24.s[0] +ldr q28, [x0, #800] +mul v15.4S, v15.4S,v25.s[0] +mla v15.4S, v16.4S, v31.s[0] +sub v16.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +sqrdmulh v12.4S, v26.4S, v24.s[1] +str q18, [x0, #0] +ldr q18, [x0, #544] +mul v27.4S, v27.4S,v25.s[0] +mla v27.4S, v20.4S, v31.s[0] +sub v20.4s, v29.4s, v15.4s +add v29.4s, v29.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v24.s[1] +str q1, [x0, #128] +ldr q1, [x0, #672] +mul v26.4S, v26.4S,v25.s[1] +mla v26.4S, v12.4S, v31.s[0] +sub v12.4s, v13.4s, v27.4s +add v13.4s, v13.4s, v27.4s +sqrdmulh v27.4S, v17.4S, v24.s[2] +str q0, [x0, #256] +ldr q0, [x0, #416] +mul v11.4S, v11.4S,v25.s[1] +mla v11.4S, v15.4S, v31.s[0] +sub v15.4s, 
v13.4s, v26.4s +add v13.4s, v13.4s, v26.4s +sqrdmulh v26.4S, v16.4S, v24.s[2] +str q30, [x0, #384] +ldr q30, [x0, #288] +mul v17.4S, v17.4S,v25.s[2] +mla v17.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v11.4s +add v29.4s, v29.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v22.s[0] +str q3, [x0, #768] +ldr q3, [x0, #32] +mul v16.4S, v16.4S,v25.s[2] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v12.4s, v17.4s +add v12.4s, v12.4s, v17.4s +sqrdmulh v17.4S, v15.4S, v22.s[1] +str q19, [x0, #896] +ldr q19, [x0, #160] +mul v13.4S, v13.4S,v23.s[0] +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v20.4s, v16.4s +add v20.4s, v20.4s, v16.4s +sqrdmulh v16.4S, v26.4S, v22.s[3] +str q2, [x0, #512] +mul v15.4S, v15.4S,v23.s[1] +mla v15.4S, v17.4S, v31.s[0] +sub v17.4s, v29.4s, v13.4s +add v29.4s, v29.4s, v13.4s +sqrdmulh v13.4S, v12.4S, v22.s[2] +str q21, [x0, #640] +mul v26.4S, v26.4S,v23.s[3] +mla v26.4S, v16.4S, v31.s[0] +sub v16.4s, v27.4s, v15.4s +add v27.4s, v27.4s, v15.4s +sqrdmulh v15.4S, v14.4S, v24.s[0] +mul v12.4S, v12.4S,v23.s[2] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v26.4s +add v11.4s, v11.4s, v26.4s +sqrdmulh v26.4S, v28.4S, v24.s[0] +mul v14.4S, v14.4S,v25.s[0] +mla v14.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +sqrdmulh v12.4S, v18.4S, v24.s[0] +ldr q21, [x0, #944] +mul v28.4S, v28.4S,v25.s[0] +mla v28.4S, v26.4S, v31.s[0] +sub v26.4s, v0.4s, v14.4s +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v1.4S, v24.s[0] +ldr q2, [x0, #816] +mul v18.4S, v18.4S,v25.s[0] +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v30.4s, v28.4s +add v30.4s, v30.4s, v28.4s +sqrdmulh v28.4S, v0.4S, v24.s[1] +str q29, [x0, #16] +ldr q29, [x0, #560] +mul v1.4S, v1.4S,v25.s[0] +mla v1.4S, v14.4S, v31.s[0] +sub v14.4s, v3.4s, v18.4s +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v30.4S, v24.s[1] +str q17, [x0, #144] +ldr q17, [x0, #688] +mul v0.4S, v0.4S,v25.s[1] +mla v0.4S, v28.4S, v31.s[0] +sub v28.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v26.4S, v24.s[2] 
+str q27, [x0, #272] +ldr q27, [x0, #432] +mul v30.4S, v30.4S,v25.s[1] +mla v30.4S, v18.4S, v31.s[0] +sub v18.4s, v19.4s, v0.4s +add v19.4s, v19.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v24.s[2] +str q16, [x0, #400] +ldr q16, [x0, #304] +mul v26.4S, v26.4S,v25.s[2] +mla v26.4S, v1.4S, v31.s[0] +sub v1.4s, v3.4s, v30.4s +add v3.4s, v3.4s, v30.4s +sqrdmulh v30.4S, v19.4S, v22.s[0] +str q11, [x0, #784] +ldr q11, [x0, #48] +mul v12.4S, v12.4S,v25.s[2] +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sqrdmulh v26.4S, v18.4S, v22.s[1] +str q13, [x0, #912] +ldr q13, [x0, #176] +mul v19.4S, v19.4S,v23.s[0] +mla v19.4S, v30.4S, v31.s[0] +sub v30.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v0.4S, v22.s[3] +str q20, [x0, #528] +mul v18.4S, v18.4S,v23.s[1] +mla v18.4S, v26.4S, v31.s[0] +sub v26.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v28.4S, v22.s[2] +str q15, [x0, #656] +mul v0.4S, v0.4S,v23.s[3] +mla v0.4S, v12.4S, v31.s[0] +sub v12.4s, v1.4s, v18.4s +add v1.4s, v1.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v24.s[0] +mul v28.4S, v28.4S,v23.s[2] +mla v28.4S, v19.4S, v31.s[0] +sub v19.4s, v30.4s, v0.4s +add v30.4s, v30.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v24.s[0] +mul v21.4S, v21.4S,v25.s[0] +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v14.4s, v28.4s +add v14.4s, v14.4s, v28.4s +sqrdmulh v28.4S, v29.4S, v24.s[0] +mul v2.4S, v2.4S,v25.s[0] +mla v2.4S, v0.4S, v31.s[0] +sub v0.4s, v27.4s, v21.4s +add v27.4s, v27.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v24.s[0] +mul v29.4S, v29.4S,v25.s[0] +mla v29.4S, v28.4S, v31.s[0] +sub v28.4s, v16.4s, v2.4s +add v16.4s, v16.4s, v2.4s +sqrdmulh v2.4S, v27.4S, v24.s[1] +str q3, [x0, #32] +mul v17.4S, v17.4S,v25.s[0] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v29.4s +add v11.4s, v11.4s, v29.4s +sqrdmulh v29.4S, v16.4S, v24.s[1] +str q26, [x0, #160] +mul v27.4S, v27.4S,v25.s[1] +mla v27.4S, v2.4S, v31.s[0] +sub v2.4s, v13.4s, v17.4s +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, 
v0.4S, v24.s[2] +str q1, [x0, #288] +mul v16.4S, v16.4S,v25.s[1] +mla v16.4S, v29.4S, v31.s[0] +sub v29.4s, v13.4s, v27.4s +add v13.4s, v13.4s, v27.4s +sqrdmulh v27.4S, v28.4S, v24.s[2] +str q12, [x0, #416] +mul v0.4S, v0.4S,v25.s[2] +mla v0.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v16.4s +add v11.4s, v11.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v22.s[0] +str q30, [x0, #800] +mul v28.4S, v28.4S,v25.s[2] +mla v28.4S, v27.4S, v31.s[0] +sub v27.4s, v2.4s, v0.4s +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v29.4S, v22.s[1] +str q19, [x0, #928] +mul v13.4S, v13.4S,v23.s[0] +mla v13.4S, v16.4S, v31.s[0] +sub v16.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +sqrdmulh v28.4S, v27.4S, v22.s[3] +str q14, [x0, #544] +mul v29.4S, v29.4S,v23.s[1] +mla v29.4S, v0.4S, v31.s[0] +sub v0.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v2.4S, v22.s[2] +str q18, [x0, #672] +mul v27.4S, v27.4S,v23.s[3] +mla v27.4S, v28.4S, v31.s[0] +sub v28.4s, v17.4s, v29.4s +add v17.4s, v17.4s, v29.4s +mul v2.4S, v2.4S,v23.s[2] +mla v2.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v27.4s +add v16.4s, v16.4s, v27.4s +sub v27.4s, v21.4s, v2.4s +add v21.4s, v21.4s, v2.4s +str q11, [x0, #48] +str q0, [x0, #176] +str q17, [x0, #304] +str q28, [x0, #432] +str q16, [x0, #816] +str q13, [x0, #944] +str q21, [x0, #560] +str q27, [x0, #688] +ldr q4, [x0, #112] +ldr q5, [x0, #96] +ldr q6, [x0, #64] +ldr q7, [x0, #80] +ldr q8, [x0, #48] +ldr q9, [x17, #+64] +ldr q10, [x17, #+80] +ldr q20, [x17, #+96] +ldr q15, [x17, #+112] +ldr q3, [x0, #32] +ldr q26, [x0, #0] +ldr q1, [x0, #16] +sqrdmulh v12.4S, v4.4S, v10.s[0] +sqrdmulh v30.4S, v5.4S, v10.s[0] +mul v4.4S, v4.4S,v9.s[0] +mla v4.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v6.4S, v10.s[0] +ldr q19, [x0, #240] +mul v5.4S, v5.4S,v9.s[0] +mla v5.4S, v30.4S, v31.s[0] +sub v30.4s, v8.4s, v4.4s +add v8.4s, v8.4s, v4.4s +sqrdmulh v4.4S, v7.4S, v10.s[0] +ldr q14, [x0, #224] +mul v6.4S, v6.4S,v9.s[0] +mla v6.4S, v12.4S, v31.s[0] +sub v12.4s, v3.4s, v5.4s +add v3.4s, 
v3.4s, v5.4s +sqrdmulh v5.4S, v8.4S, v10.s[1] +ldr q18, [x0, #192] +mul v7.4S, v7.4S,v9.s[0] +mla v7.4S, v4.4S, v31.s[0] +sub v4.4s, v26.4s, v6.4s +add v26.4s, v26.4s, v6.4s +sqrdmulh v6.4S, v3.4S, v10.s[1] +ldr q29, [x0, #208] +mul v8.4S, v8.4S,v9.s[1] +mla v8.4S, v5.4S, v31.s[0] +sub v5.4s, v1.4s, v7.4s +add v1.4s, v1.4s, v7.4s +sqrdmulh v7.4S, v30.4S, v10.s[2] +ldr q2, [x0, #176] +mul v3.4S, v3.4S,v9.s[1] +mla v3.4S, v6.4S, v31.s[0] +sub v6.4s, v1.4s, v8.4s +add v1.4s, v1.4s, v8.4s +ldr q8, [x17, #+128] +ldr q25, [x17, #+144] +ldr q24, [x17, #+160] +ldr q23, [x17, #+176] +sqrdmulh v22.4S, v12.4S, v10.s[2] +ldr q11, [x0, #160] +mul v30.4S, v30.4S,v9.s[2] +mla v30.4S, v7.4S, v31.s[0] +sub v7.4s, v26.4s, v3.4s +add v26.4s, v26.4s, v3.4s +sqrdmulh v3.4S, v1.4S, v15.s[0] +ldr q0, [x0, #128] +mul v12.4S, v12.4S,v9.s[2] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v5.4s, v30.4s +add v5.4s, v5.4s, v30.4s +sqrdmulh v30.4S, v6.4S, v15.s[1] +ldr q17, [x0, #144] +mul v1.4S, v1.4S,v20.s[0] +mla v1.4S, v3.4S, v31.s[0] +sub v3.4s, v4.4s, v12.4s +add v4.4s, v4.4s, v12.4s +sqrdmulh v12.4S, v22.4S, v15.s[3] +mul v6.4S, v6.4S,v20.s[1] +mla v6.4S, v30.4S, v31.s[0] +sub v30.4s, v26.4s, v1.4s +add v26.4s, v26.4s, v1.4s +sqrdmulh v1.4S, v5.4S, v15.s[2] +mul v22.4S, v22.4S,v20.s[3] +mla v22.4S, v12.4S, v31.s[0] +sub v12.4s, v7.4s, v6.4s +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v19.4S, v25.s[0] +mul v5.4S, v5.4S,v20.s[2] +mla v5.4S, v1.4S, v31.s[0] +sub v1.4s, v3.4s, v22.4s +add v3.4s, v3.4s, v22.4s +sqrdmulh v22.4S, v14.4S, v25.s[0] +mul v19.4S, v19.4S,v8.s[0] +mla v19.4S, v6.4S, v31.s[0] +sub v6.4s, v4.4s, v5.4s +add v4.4s, v4.4s, v5.4s +sqrdmulh v15.4S, v18.4S, v25.s[0] +ldr q20, [x0, #368] +mul v14.4S, v14.4S,v8.s[0] +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v2.4s, v19.4s +add v2.4s, v2.4s, v19.4s +sqrdmulh v19.4S, v29.4S, v25.s[0] +ldr q10, [x0, #352] +mul v18.4S, v18.4S,v8.s[0] +mla v18.4S, v15.4S, v31.s[0] +sub v15.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, 
v2.4S, v25.s[1] +str q26, [x0, #0] +ldr q26, [x0, #320] +mul v29.4S, v29.4S,v8.s[0] +mla v29.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v18.4s +add v0.4s, v0.4s, v18.4s +sqrdmulh v18.4S, v11.4S, v25.s[1] +str q30, [x0, #16] +ldr q30, [x0, #336] +mul v2.4S, v2.4S,v8.s[1] +mla v2.4S, v14.4S, v31.s[0] +sub v14.4s, v17.4s, v29.4s +add v17.4s, v17.4s, v29.4s +sqrdmulh v29.4S, v22.4S, v25.s[2] +str q7, [x0, #32] +ldr q7, [x0, #304] +mul v11.4S, v11.4S,v8.s[1] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v17.4s, v2.4s +add v17.4s, v17.4s, v2.4s +ldr q2, [x17, #+192] +ldr q9, [x17, #+208] +ldr q5, [x17, #+224] +ldr q28, [x17, #+240] +sqrdmulh v16.4S, v15.4S, v25.s[2] +str q12, [x0, #48] +ldr q12, [x0, #288] +mul v22.4S, v22.4S,v8.s[2] +mla v22.4S, v29.4S, v31.s[0] +sub v29.4s, v0.4s, v11.4s +add v0.4s, v0.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v23.s[0] +str q3, [x0, #96] +ldr q3, [x0, #256] +mul v15.4S, v15.4S,v8.s[2] +mla v15.4S, v16.4S, v31.s[0] +sub v16.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +sqrdmulh v22.4S, v18.4S, v23.s[1] +str q1, [x0, #112] +ldr q1, [x0, #272] +mul v17.4S, v17.4S,v24.s[0] +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v19.4s, v15.4s +add v19.4s, v19.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v23.s[3] +str q4, [x0, #64] +mul v18.4S, v18.4S,v24.s[1] +mla v18.4S, v22.4S, v31.s[0] +sub v22.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v23.s[2] +str q6, [x0, #80] +mul v16.4S, v16.4S,v24.s[3] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v29.4s, v18.4s +add v29.4s, v29.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v9.s[0] +mul v14.4S, v14.4S,v24.s[2] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v16.4s +add v11.4s, v11.4s, v16.4s +sqrdmulh v16.4S, v10.4S, v9.s[0] +mul v20.4S, v20.4S,v2.s[0] +mla v20.4S, v18.4S, v31.s[0] +sub v18.4s, v19.4s, v14.4s +add v19.4s, v19.4s, v14.4s +sqrdmulh v23.4S, v26.4S, v9.s[0] +ldr q24, [x0, #496] +mul v10.4S, v10.4S,v2.s[0] +mla v10.4S, v16.4S, v31.s[0] +sub v16.4s, v7.4s, v20.4s +add v7.4s, v7.4s, v20.4s 
+sqrdmulh v20.4S, v30.4S, v9.s[0] +ldr q25, [x0, #480] +mul v26.4S, v26.4S,v2.s[0] +mla v26.4S, v23.4S, v31.s[0] +sub v23.4s, v12.4s, v10.4s +add v12.4s, v12.4s, v10.4s +sqrdmulh v10.4S, v7.4S, v9.s[1] +str q0, [x0, #128] +ldr q0, [x0, #448] +mul v30.4S, v30.4S,v2.s[0] +mla v30.4S, v20.4S, v31.s[0] +sub v20.4s, v3.4s, v26.4s +add v3.4s, v3.4s, v26.4s +sqrdmulh v26.4S, v12.4S, v9.s[1] +str q22, [x0, #144] +ldr q22, [x0, #464] +mul v7.4S, v7.4S,v2.s[1] +mla v7.4S, v10.4S, v31.s[0] +sub v10.4s, v1.4s, v30.4s +add v1.4s, v1.4s, v30.4s +sqrdmulh v30.4S, v16.4S, v9.s[2] +str q29, [x0, #160] +ldr q29, [x0, #432] +mul v12.4S, v12.4S,v2.s[1] +mla v12.4S, v26.4S, v31.s[0] +sub v26.4s, v1.4s, v7.4s +add v1.4s, v1.4s, v7.4s +ldr q7, [x17, #+256] +ldr q8, [x17, #+272] +ldr q14, [x17, #+288] +ldr q6, [x17, #+304] +sqrdmulh v4.4S, v23.4S, v9.s[2] +str q15, [x0, #176] +ldr q15, [x0, #416] +mul v16.4S, v16.4S,v2.s[2] +mla v16.4S, v30.4S, v31.s[0] +sub v30.4s, v3.4s, v12.4s +add v3.4s, v3.4s, v12.4s +sqrdmulh v12.4S, v1.4S, v28.s[0] +str q11, [x0, #224] +ldr q11, [x0, #384] +mul v23.4S, v23.4S,v2.s[2] +mla v23.4S, v4.4S, v31.s[0] +sub v4.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sqrdmulh v16.4S, v26.4S, v28.s[1] +str q17, [x0, #240] +ldr q17, [x0, #400] +mul v1.4S, v1.4S,v5.s[0] +mla v1.4S, v12.4S, v31.s[0] +sub v12.4s, v20.4s, v23.4s +add v20.4s, v20.4s, v23.4s +sqrdmulh v23.4S, v4.4S, v28.s[3] +str q19, [x0, #192] +mul v26.4S, v26.4S,v5.s[1] +mla v26.4S, v16.4S, v31.s[0] +sub v16.4s, v3.4s, v1.4s +add v3.4s, v3.4s, v1.4s +sqrdmulh v1.4S, v10.4S, v28.s[2] +str q18, [x0, #208] +mul v4.4S, v4.4S,v5.s[3] +mla v4.4S, v23.4S, v31.s[0] +sub v23.4s, v30.4s, v26.4s +add v30.4s, v30.4s, v26.4s +sqrdmulh v26.4S, v24.4S, v8.s[0] +mul v10.4S, v10.4S,v5.s[2] +mla v10.4S, v1.4S, v31.s[0] +sub v1.4s, v12.4s, v4.4s +add v12.4s, v12.4s, v4.4s +sqrdmulh v4.4S, v25.4S, v8.s[0] +mul v24.4S, v24.4S,v7.s[0] +mla v24.4S, v26.4S, v31.s[0] +sub v26.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s 
+sqrdmulh v28.4S, v0.4S, v8.s[0] +ldr q5, [x0, #624] +mul v25.4S, v25.4S,v7.s[0] +mla v25.4S, v4.4S, v31.s[0] +sub v4.4s, v29.4s, v24.4s +add v29.4s, v29.4s, v24.4s +sqrdmulh v24.4S, v22.4S, v8.s[0] +ldr q9, [x0, #608] +mul v0.4S, v0.4S,v7.s[0] +mla v0.4S, v28.4S, v31.s[0] +sub v28.4s, v15.4s, v25.4s +add v15.4s, v15.4s, v25.4s +sqrdmulh v25.4S, v29.4S, v8.s[1] +str q3, [x0, #256] +ldr q3, [x0, #576] +mul v22.4S, v22.4S,v7.s[0] +mla v22.4S, v24.4S, v31.s[0] +sub v24.4s, v11.4s, v0.4s +add v11.4s, v11.4s, v0.4s +sqrdmulh v0.4S, v15.4S, v8.s[1] +str q16, [x0, #272] +ldr q16, [x0, #592] +mul v29.4S, v29.4S,v7.s[1] +mla v29.4S, v25.4S, v31.s[0] +sub v25.4s, v17.4s, v22.4s +add v17.4s, v17.4s, v22.4s +sqrdmulh v22.4S, v4.4S, v8.s[2] +str q30, [x0, #288] +ldr q30, [x0, #560] +mul v15.4S, v15.4S,v7.s[1] +mla v15.4S, v0.4S, v31.s[0] +sub v0.4s, v17.4s, v29.4s +add v17.4s, v17.4s, v29.4s +ldr q29, [x17, #+320] +ldr q2, [x17, #+336] +ldr q10, [x17, #+352] +ldr q18, [x17, #+368] +sqrdmulh v19.4S, v28.4S, v8.s[2] +str q23, [x0, #304] +ldr q23, [x0, #544] +mul v4.4S, v4.4S,v7.s[2] +mla v4.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v17.4S, v6.s[0] +str q12, [x0, #352] +ldr q12, [x0, #512] +mul v28.4S, v28.4S,v7.s[2] +mla v28.4S, v19.4S, v31.s[0] +sub v19.4s, v25.4s, v4.4s +add v25.4s, v25.4s, v4.4s +sqrdmulh v4.4S, v0.4S, v6.s[1] +str q1, [x0, #368] +ldr q1, [x0, #528] +mul v17.4S, v17.4S,v14.s[0] +mla v17.4S, v15.4S, v31.s[0] +sub v15.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +sqrdmulh v28.4S, v19.4S, v6.s[3] +str q20, [x0, #320] +mul v0.4S, v0.4S,v14.s[1] +mla v0.4S, v4.4S, v31.s[0] +sub v4.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v25.4S, v6.s[2] +str q26, [x0, #336] +mul v19.4S, v19.4S,v14.s[3] +mla v19.4S, v28.4S, v31.s[0] +sub v28.4s, v22.4s, v0.4s +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v5.4S, v2.s[0] +mul v25.4S, v25.4S,v14.s[2] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v15.4s, v19.4s 
+add v15.4s, v15.4s, v19.4s +sqrdmulh v19.4S, v9.4S, v2.s[0] +mul v5.4S, v5.4S,v29.s[0] +mla v5.4S, v0.4S, v31.s[0] +sub v0.4s, v24.4s, v25.4s +add v24.4s, v24.4s, v25.4s +sqrdmulh v6.4S, v3.4S, v2.s[0] +ldr q14, [x0, #752] +mul v9.4S, v9.4S,v29.s[0] +mla v9.4S, v19.4S, v31.s[0] +sub v19.4s, v30.4s, v5.4s +add v30.4s, v30.4s, v5.4s +sqrdmulh v5.4S, v16.4S, v2.s[0] +ldr q8, [x0, #736] +mul v3.4S, v3.4S,v29.s[0] +mla v3.4S, v6.4S, v31.s[0] +sub v6.4s, v23.4s, v9.4s +add v23.4s, v23.4s, v9.4s +sqrdmulh v9.4S, v30.4S, v2.s[1] +str q11, [x0, #384] +ldr q11, [x0, #704] +mul v16.4S, v16.4S,v29.s[0] +mla v16.4S, v5.4S, v31.s[0] +sub v5.4s, v12.4s, v3.4s +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v23.4S, v2.s[1] +str q4, [x0, #400] +ldr q4, [x0, #720] +mul v30.4S, v30.4S,v29.s[1] +mla v30.4S, v9.4S, v31.s[0] +sub v9.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +sqrdmulh v16.4S, v19.4S, v2.s[2] +str q22, [x0, #416] +ldr q22, [x0, #688] +mul v23.4S, v23.4S,v29.s[1] +mla v23.4S, v3.4S, v31.s[0] +sub v3.4s, v1.4s, v30.4s +add v1.4s, v1.4s, v30.4s +ldr q30, [x17, #+384] +ldr q7, [x17, #+400] +ldr q25, [x17, #+416] +ldr q26, [x17, #+432] +sqrdmulh v20.4S, v6.4S, v2.s[2] +str q28, [x0, #432] +ldr q28, [x0, #672] +mul v19.4S, v19.4S,v29.s[2] +mla v19.4S, v16.4S, v31.s[0] +sub v16.4s, v12.4s, v23.4s +add v12.4s, v12.4s, v23.4s +sqrdmulh v23.4S, v1.4S, v18.s[0] +str q15, [x0, #480] +ldr q15, [x0, #640] +mul v6.4S, v6.4S,v29.s[2] +mla v6.4S, v20.4S, v31.s[0] +sub v20.4s, v9.4s, v19.4s +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v3.4S, v18.s[1] +str q17, [x0, #496] +ldr q17, [x0, #656] +mul v1.4S, v1.4S,v10.s[0] +mla v1.4S, v23.4S, v31.s[0] +sub v23.4s, v5.4s, v6.4s +add v5.4s, v5.4s, v6.4s +sqrdmulh v6.4S, v20.4S, v18.s[3] +str q24, [x0, #448] +mul v3.4S, v3.4S,v10.s[1] +mla v3.4S, v19.4S, v31.s[0] +sub v19.4s, v12.4s, v1.4s +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v9.4S, v18.s[2] +str q0, [x0, #464] +mul v20.4S, v20.4S,v10.s[3] +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v16.4s, 
v3.4s +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v7.s[0] +mul v9.4S, v9.4S,v10.s[2] +mla v9.4S, v1.4S, v31.s[0] +sub v1.4s, v23.4s, v20.4s +add v23.4s, v23.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v7.s[0] +mul v14.4S, v14.4S,v30.s[0] +mla v14.4S, v3.4S, v31.s[0] +sub v3.4s, v5.4s, v9.4s +add v5.4s, v5.4s, v9.4s +sqrdmulh v18.4S, v11.4S, v7.s[0] +ldr q10, [x0, #880] +mul v8.4S, v8.4S,v30.s[0] +mla v8.4S, v20.4S, v31.s[0] +sub v20.4s, v22.4s, v14.4s +add v22.4s, v22.4s, v14.4s +sqrdmulh v14.4S, v4.4S, v7.s[0] +ldr q2, [x0, #864] +mul v11.4S, v11.4S,v30.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v28.4s, v8.4s +add v28.4s, v28.4s, v8.4s +sqrdmulh v8.4S, v22.4S, v7.s[1] +str q12, [x0, #512] +ldr q12, [x0, #832] +mul v4.4S, v4.4S,v30.s[0] +mla v4.4S, v14.4S, v31.s[0] +sub v14.4s, v15.4s, v11.4s +add v15.4s, v15.4s, v11.4s +sqrdmulh v11.4S, v28.4S, v7.s[1] +str q19, [x0, #528] +ldr q19, [x0, #848] +mul v22.4S, v22.4S,v30.s[1] +mla v22.4S, v8.4S, v31.s[0] +sub v8.4s, v17.4s, v4.4s +add v17.4s, v17.4s, v4.4s +sqrdmulh v4.4S, v20.4S, v7.s[2] +str q16, [x0, #544] +ldr q16, [x0, #816] +mul v28.4S, v28.4S,v30.s[1] +mla v28.4S, v11.4S, v31.s[0] +sub v11.4s, v17.4s, v22.4s +add v17.4s, v17.4s, v22.4s +ldr q22, [x17, #+448] +ldr q29, [x17, #+464] +ldr q9, [x17, #+480] +ldr q0, [x17, #+496] +sqrdmulh v24.4S, v18.4S, v7.s[2] +str q6, [x0, #560] +ldr q6, [x0, #800] +mul v20.4S, v20.4S,v30.s[2] +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v15.4s, v28.4s +add v15.4s, v15.4s, v28.4s +sqrdmulh v28.4S, v17.4S, v26.s[0] +str q23, [x0, #608] +ldr q23, [x0, #768] +mul v18.4S, v18.4S,v30.s[2] +mla v18.4S, v24.4S, v31.s[0] +sub v24.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +sqrdmulh v20.4S, v11.4S, v26.s[1] +str q1, [x0, #624] +ldr q1, [x0, #784] +mul v17.4S, v17.4S,v25.s[0] +mla v17.4S, v28.4S, v31.s[0] +sub v28.4s, v14.4s, v18.4s +add v14.4s, v14.4s, v18.4s +sqrdmulh v18.4S, v24.4S, v26.s[3] +str q5, [x0, #576] +mul v11.4S, v11.4S,v25.s[1] +mla v11.4S, v20.4S, v31.s[0] +sub v20.4s, 
v15.4s, v17.4s +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v8.4S, v26.s[2] +str q3, [x0, #592] +mul v24.4S, v24.4S,v25.s[3] +mla v24.4S, v18.4S, v31.s[0] +sub v18.4s, v4.4s, v11.4s +add v4.4s, v4.4s, v11.4s +sqrdmulh v11.4S, v10.4S, v29.s[0] +mul v8.4S, v8.4S,v25.s[2] +mla v8.4S, v17.4S, v31.s[0] +sub v17.4s, v28.4s, v24.4s +add v28.4s, v28.4s, v24.4s +sqrdmulh v24.4S, v2.4S, v29.s[0] +mul v10.4S, v10.4S,v22.s[0] +mla v10.4S, v11.4S, v31.s[0] +sub v11.4s, v14.4s, v8.4s +add v14.4s, v14.4s, v8.4s +sqrdmulh v26.4S, v12.4S, v29.s[0] +ldr q25, [x0, #1008] +mul v2.4S, v2.4S,v22.s[0] +mla v2.4S, v24.4S, v31.s[0] +sub v24.4s, v16.4s, v10.4s +add v16.4s, v16.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v29.s[0] +ldr q7, [x0, #992] +mul v12.4S, v12.4S,v22.s[0] +mla v12.4S, v26.4S, v31.s[0] +sub v26.4s, v6.4s, v2.4s +add v6.4s, v6.4s, v2.4s +sqrdmulh v2.4S, v16.4S, v29.s[1] +str q15, [x0, #640] +ldr q15, [x0, #960] +mul v19.4S, v19.4S,v22.s[0] +mla v19.4S, v10.4S, v31.s[0] +sub v10.4s, v23.4s, v12.4s +add v23.4s, v23.4s, v12.4s +sqrdmulh v12.4S, v6.4S, v29.s[1] +str q20, [x0, #656] +ldr q20, [x0, #976] +mul v16.4S, v16.4S,v22.s[1] +mla v16.4S, v2.4S, v31.s[0] +sub v2.4s, v1.4s, v19.4s +add v1.4s, v1.4s, v19.4s +sqrdmulh v19.4S, v24.4S, v29.s[2] +str q4, [x0, #672] +ldr q4, [x0, #944] +mul v6.4S, v6.4S,v22.s[1] +mla v6.4S, v12.4S, v31.s[0] +sub v12.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +ldr q16, [x17, #+512] +ldr q30, [x17, #+528] +ldr q8, [x17, #+544] +ldr q3, [x17, #+560] +sqrdmulh v5.4S, v26.4S, v29.s[2] +str q18, [x0, #688] +ldr q18, [x0, #928] +mul v24.4S, v24.4S,v22.s[2] +mla v24.4S, v19.4S, v31.s[0] +sub v19.4s, v23.4s, v6.4s +add v23.4s, v23.4s, v6.4s +sqrdmulh v6.4S, v1.4S, v0.s[0] +str q28, [x0, #736] +ldr q28, [x0, #896] +mul v26.4S, v26.4S,v22.s[2] +mla v26.4S, v5.4S, v31.s[0] +sub v5.4s, v2.4s, v24.4s +add v2.4s, v2.4s, v24.4s +sqrdmulh v24.4S, v12.4S, v0.s[1] +str q17, [x0, #752] +ldr q17, [x0, #912] +mul v1.4S, v1.4S,v9.s[0] +mla v1.4S, v6.4S, v31.s[0] +sub 
v6.4s, v10.4s, v26.4s +add v10.4s, v10.4s, v26.4s +sqrdmulh v26.4S, v5.4S, v0.s[3] +str q14, [x0, #704] +mul v12.4S, v12.4S,v9.s[1] +mla v12.4S, v24.4S, v31.s[0] +sub v24.4s, v23.4s, v1.4s +add v23.4s, v23.4s, v1.4s +sqrdmulh v1.4S, v2.4S, v0.s[2] +str q11, [x0, #720] +mul v5.4S, v5.4S,v9.s[3] +mla v5.4S, v26.4S, v31.s[0] +sub v26.4s, v19.4s, v12.4s +add v19.4s, v19.4s, v12.4s +sqrdmulh v12.4S, v25.4S, v30.s[0] +mul v2.4S, v2.4S,v9.s[2] +mla v2.4S, v1.4S, v31.s[0] +sub v1.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v7.4S, v30.s[0] +mul v25.4S, v25.4S,v16.s[0] +mla v25.4S, v12.4S, v31.s[0] +sub v12.4s, v10.4s, v2.4s +add v10.4s, v10.4s, v2.4s +sqrdmulh v0.4S, v15.4S, v30.s[0] +mul v7.4S, v7.4S,v16.s[0] +mla v7.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v25.4s +add v4.4s, v4.4s, v25.4s +sqrdmulh v25.4S, v20.4S, v30.s[0] +mul v15.4S, v15.4S,v16.s[0] +mla v15.4S, v0.4S, v31.s[0] +sub v0.4s, v18.4s, v7.4s +add v18.4s, v18.4s, v7.4s +sqrdmulh v7.4S, v4.4S, v30.s[1] +str q23, [x0, #768] +mul v20.4S, v20.4S,v16.s[0] +mla v20.4S, v25.4S, v31.s[0] +sub v25.4s, v28.4s, v15.4s +add v28.4s, v28.4s, v15.4s +sqrdmulh v15.4S, v18.4S, v30.s[1] +str q24, [x0, #784] +mul v4.4S, v4.4S,v16.s[1] +mla v4.4S, v7.4S, v31.s[0] +sub v7.4s, v17.4s, v20.4s +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v5.4S, v30.s[2] +str q19, [x0, #800] +mul v18.4S, v18.4S,v16.s[1] +mla v18.4S, v15.4S, v31.s[0] +sub v15.4s, v17.4s, v4.4s +add v17.4s, v17.4s, v4.4s +sqrdmulh v4.4S, v0.4S, v30.s[2] +str q26, [x0, #816] +mul v5.4S, v5.4S,v16.s[2] +mla v5.4S, v20.4S, v31.s[0] +sub v20.4s, v28.4s, v18.4s +add v28.4s, v28.4s, v18.4s +sqrdmulh v18.4S, v17.4S, v3.s[0] +str q6, [x0, #864] +mul v0.4S, v0.4S,v16.s[2] +mla v0.4S, v4.4S, v31.s[0] +sub v4.4s, v7.4s, v5.4s +add v7.4s, v7.4s, v5.4s +sqrdmulh v5.4S, v15.4S, v3.s[1] +str q1, [x0, #880] +mul v17.4S, v17.4S,v8.s[0] +mla v17.4S, v18.4S, v31.s[0] +sub v18.4s, v25.4s, v0.4s +add v25.4s, v25.4s, v0.4s +sqrdmulh v0.4S, v4.4S, v3.s[3] +str q10, [x0, #832] 
+mul v15.4S, v15.4S,v8.s[1] +mla v15.4S, v5.4S, v31.s[0] +sub v5.4s, v28.4s, v17.4s +add v28.4s, v28.4s, v17.4s +sqrdmulh v17.4S, v7.4S, v3.s[2] +str q12, [x0, #848] +mul v4.4S, v4.4S,v8.s[3] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v20.4s, v15.4s +add v20.4s, v20.4s, v15.4s +mul v7.4S, v7.4S,v8.s[2] +mla v7.4S, v17.4S, v31.s[0] +sub v17.4s, v18.4s, v4.4s +add v18.4s, v18.4s, v4.4s +sub v4.4s, v25.4s, v7.4s +add v25.4s, v25.4s, v7.4s +str q28, [x0, #896] +str q5, [x0, #912] +str q20, [x0, #928] +str q0, [x0, #944] +str q18, [x0, #992] +str q17, [x0, #1008] +str q25, [x0, #960] +str q4, [x0, #976] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1444 +// Instruction count: 1440 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_0_0.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_0_0.s new file mode 100644 index 0000000..209eccf --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_0_0.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// 
copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer
5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, 
block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None 
+.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global ntt_u32_incomplete_neon_asm_var_4_2_0_0 +.global _ntt_u32_incomplete_neon_asm_var_4_2_0_0 +ntt_u32_incomplete_neon_asm_var_4_2_0_0: +_ntt_u32_incomplete_neon_asm_var_4_2_0_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +ldr q2, [x0, #544] +ldr q1, [x0, #608] +ldr q0, [x0, #672] +ldr q15, [x0, #736] +ldr q14, [x0, #32] +ldr q13, [x0, #96] +ldr q12, [x0, #160] +ldr q11, [x0, #224] +sqrdmulh v10.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +mla v22.4S, v10.4S, v31.s[0] +sub v10.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v17.4s, 
v21.4s +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v20.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +mla v19.4S, v20.4S, v31.s[0] +sub v20.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +mla v2.4S, v19.4S, v31.s[0] +sub v19.4s, v14.4s, v2.4s +add v14.4s, v14.4s, v2.4s +sqrdmulh v2.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +mla v1.4S, v2.4S, v31.s[0] +sub v2.4s, v13.4s, v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[0] +mul v0.4S, v0.4S,v30.s[0] +mla v0.4S, v1.4S, v31.s[0] +sub v1.4s, v12.4s, v0.4s +add v12.4s, v12.4s, v0.4s +sqrdmulh v0.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +mla v15.4S, v0.4S, v31.s[0] +sub v0.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v12.4s, v16.4s +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v3.4S, v16.4S, v31.s[0] +sub v16.4s, v11.4s, v3.4s +add v11.4s, v11.4s, v3.4s +sqrdmulh v3.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +mla v18.4S, v3.4S, v31.s[0] +sub v3.4s, v14.4s, v18.4s +add v14.4s, v14.4s, v18.4s +sqrdmulh v18.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +mla v17.4S, v18.4S, v31.s[0] +sub v18.4s, v13.4s, v17.4s +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v1.4s, v21.4s +add v1.4s, v1.4s, v21.4s +sqrdmulh v21.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +mla v20.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v20.4s +add v0.4s, v0.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +mla v10.4S, v20.4S, v31.s[0] +sub v20.4s, v19.4s, v10.4s +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v22.4S, v29.s[2] +mul v22.4S, 
v22.4S,v30.s[2] +mla v22.4S, v10.4S, v31.s[0] +sub v10.4s, v2.4s, v22.4s +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v11.4S, v27.s[0] +mul v11.4S, v11.4S,v28.s[0] +mla v11.4S, v12.4S, v31.s[0] +sub v12.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +mla v15.4S, v11.4S, v31.s[0] +sub v11.4s, v3.4s, v15.4s +add v3.4s, v3.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v16.4s +add v18.4s, v18.4s, v16.4s +sqrdmulh v16.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +mla v1.4S, v16.4S, v31.s[0] +sub v16.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v27.s[2] +mul v0.4S, v0.4S,v28.s[2] +mla v0.4S, v1.4S, v31.s[0] +sub v1.4s, v2.4s, v0.4s +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +mla v17.4S, v0.4S, v31.s[0] +sub v0.4s, v20.4s, v17.4s +add v20.4s, v20.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v25.s[0] +mul v13.4S, v13.4S,v26.s[0] +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +sqrdmulh v13.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v18.4S, v25.s[2] +mul v18.4S, v18.4S,v26.s[2] +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v3.4s, v18.4s +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v15.4S, v25.s[3] +mul v15.4S, v15.4S,v26.s[3] +mla v15.4S, v18.4S, v31.s[0] +sub v18.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v23.s[0] +mul v2.4S, v2.4S,v24.s[0] +mla v2.4S, v15.4S, v31.s[0] +sub v15.4s, v19.4s, v2.4s +add 
v19.4s, v19.4s, v2.4s +sqrdmulh v2.4S, v1.4S, v23.s[1] +mul v1.4S, v1.4S,v24.s[1] +mla v1.4S, v2.4S, v31.s[0] +sub v2.4s, v16.4s, v1.4s +add v16.4s, v16.4s, v1.4s +sqrdmulh v1.4S, v10.4S, v23.s[2] +mul v10.4S, v10.4S,v24.s[2] +mla v10.4S, v1.4S, v31.s[0] +sub v1.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +mla v17.4S, v10.4S, v31.s[0] +sub v10.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +str q14, [x0, #32] +str q21, [x0, #96] +str q22, [x0, #160] +str q13, [x0, #224] +str q3, [x0, #288] +str q12, [x0, #352] +str q11, [x0, #416] +str q18, [x0, #480] +str q19, [x0, #544] +str q15, [x0, #608] +str q16, [x0, #672] +str q2, [x0, #736] +str q20, [x0, #800] +str q1, [x0, #864] +str q0, [x0, #928] +str q10, [x0, #992] +ldr q10, [x0, #816] +ldr q0, [x0, #880] +ldr q1, [x0, #944] +ldr q20, [x0, #1008] +ldr q2, [x0, #304] +ldr q16, [x0, #368] +ldr q15, [x0, #432] +ldr q19, [x0, #496] +ldr q18, [x0, #560] +ldr q11, [x0, #624] +ldr q12, [x0, #688] +ldr q3, [x0, #752] +ldr q13, [x0, #48] +ldr q22, [x0, #112] +ldr q21, [x0, #176] +ldr q14, [x0, #240] +sqrdmulh v17.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +mla v10.4S, v17.4S, v31.s[0] +sub v17.4s, v2.4s, v10.4s +add v2.4s, v2.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v29.s[0] +mul v0.4S, v0.4S,v30.s[0] +mla v0.4S, v10.4S, v31.s[0] +sub v10.4s, v16.4s, v0.4s +add v16.4s, v16.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +mla v1.4S, v0.4S, v31.s[0] +sub v0.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s +sqrdmulh v1.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v20.4S, v1.4S, v31.s[0] +sub v1.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +sqrdmulh v20.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +mla v18.4S, v20.4S, v31.s[0] +sub v20.4s, v13.4s, v18.4s +add v13.4s, v13.4s, v18.4s +sqrdmulh v18.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s 
+sqrdmulh v11.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +mla v12.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +mla v3.4S, v12.4S, v31.s[0] +sub v12.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sqrdmulh v3.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +mla v15.4S, v3.4S, v31.s[0] +sub v3.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +sqrdmulh v15.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +mla v19.4S, v15.4S, v31.s[0] +sub v15.4s, v14.4s, v19.4s +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v2.4S, v29.s[1] +mul v2.4S, v2.4S,v30.s[1] +mla v2.4S, v19.4S, v31.s[0] +sub v19.4s, v13.4s, v2.4s +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +mla v16.4S, v2.4S, v31.s[0] +sub v2.4s, v22.4s, v16.4s +add v22.4s, v22.4s, v16.4s +sqrdmulh v16.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +mla v0.4S, v16.4S, v31.s[0] +sub v16.4s, v11.4s, v0.4s +add v11.4s, v11.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +mla v1.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v1.4s +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +mla v17.4S, v1.4S, v31.s[0] +sub v1.4s, v20.4s, v17.4s +add v20.4s, v20.4s, v17.4s +sqrdmulh v17.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +mla v10.4S, v17.4S, v31.s[0] +sub v17.4s, v18.4s, v10.4s +add v18.4s, v18.4s, v10.4s +sqrdmulh v10.4S, v21.4S, v27.s[0] +mul v21.4S, v21.4S,v28.s[0] +mla v21.4S, v10.4S, v31.s[0] +sub v10.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +mla v14.4S, v21.4S, v31.s[0] +sub v21.4s, v22.4s, v14.4s +add v22.4s, v22.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +mla v3.4S, v14.4S, v31.s[0] +sub v14.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sqrdmulh v3.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +mla v15.4S, v3.4S, v31.s[0] 
+sub v3.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +mla v11.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +sqrdmulh v11.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +mla v12.4S, v11.4S, v31.s[0] +sub v11.4s, v18.4s, v12.4s +add v18.4s, v18.4s, v12.4s +sqrdmulh v12.4S, v16.4S, v27.s[3] +mul v16.4S, v16.4S,v28.s[3] +mla v16.4S, v12.4S, v31.s[0] +sub v12.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +sqrdmulh v16.4S, v0.4S, v27.s[3] +mul v0.4S, v0.4S,v28.s[3] +mla v0.4S, v16.4S, v31.s[0] +sub v16.4s, v17.4s, v0.4s +add v17.4s, v17.4s, v0.4s +sqrdmulh v0.4S, v22.4S, v25.s[0] +mul v22.4S, v22.4S,v26.s[0] +mla v22.4S, v0.4S, v31.s[0] +sub v0.4s, v13.4s, v22.4s +add v13.4s, v13.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v25.s[1] +mul v21.4S, v21.4S,v26.s[1] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v2.4S, v25.s[2] +mul v2.4S, v2.4S,v26.s[2] +mla v2.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v2.4s +add v19.4s, v19.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v25.s[3] +mul v3.4S, v3.4S,v26.s[3] +mla v3.4S, v2.4S, v31.s[0] +sub v2.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sqrdmulh v3.4S, v18.4S, v23.s[0] +mul v18.4S, v18.4S,v24.s[0] +mla v18.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v18.4s +add v20.4s, v20.4s, v18.4s +sqrdmulh v18.4S, v11.4S, v23.s[1] +mul v11.4S, v11.4S,v24.s[1] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v15.4s, v11.4s +add v15.4s, v15.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v23.s[2] +mul v17.4S, v17.4S,v24.s[2] +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v1.4s, v17.4s +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v16.4S, v23.s[3] +mul v16.4S, v16.4S,v24.s[3] +mla v16.4S, v17.4S, v31.s[0] +sub v17.4s, v12.4s, v16.4s +add v12.4s, v12.4s, v16.4s +str q13, [x0, #48] +str q0, [x0, #112] +str q10, [x0, #176] +str q22, [x0, #240] +str q19, [x0, #304] +str q21, [x0, #368] +str q14, [x0, #432] +str q2, [x0, #496] +str q20, [x0, 
#560] +str q3, [x0, #624] +str q15, [x0, #688] +str q18, [x0, #752] +str q1, [x0, #816] +str q11, [x0, #880] +str q12, [x0, #944] +str q17, [x0, #1008] +ldr q17, [x0, #768] +ldr q12, [x0, #832] +ldr q11, [x0, #896] +ldr q1, [x0, #960] +ldr q18, [x0, #256] +ldr q15, [x0, #320] +ldr q3, [x0, #384] +ldr q20, [x0, #448] +ldr q2, [x0, #512] +ldr q14, [x0, #576] +ldr q21, [x0, #640] +ldr q19, [x0, #704] +ldr q22, [x0, #0] +ldr q10, [x0, #64] +ldr q0, [x0, #128] +ldr q13, [x0, #192] +sqrdmulh v16.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v18.4s, v17.4s +add v18.4s, v18.4s, v17.4s +sqrdmulh v17.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v15.4s, v12.4s +add v15.4s, v15.4s, v12.4s +sqrdmulh v12.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +mla v11.4S, v12.4S, v31.s[0] +sub v12.4s, v3.4s, v11.4s +add v3.4s, v3.4s, v11.4s +sqrdmulh v11.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +mla v1.4S, v11.4S, v31.s[0] +sub v11.4s, v20.4s, v1.4s +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +mla v2.4S, v1.4S, v31.s[0] +sub v1.4s, v22.4s, v2.4s +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +mla v14.4S, v2.4S, v31.s[0] +sub v2.4s, v10.4s, v14.4s +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v13.4s, v19.4s +add v13.4s, v13.4s, v19.4s +sqrdmulh v19.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v3.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v3.4s +add v0.4s, v0.4s, v3.4s +sqrdmulh v3.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +mla v20.4S, v3.4S, v31.s[0] +sub v3.4s, v13.4s, v20.4s +add v13.4s, v13.4s, v20.4s +sqrdmulh v20.4S, v18.4S, v29.s[1] +mul v18.4S, 
v18.4S,v30.s[1] +mla v18.4S, v20.4S, v31.s[0] +sub v20.4s, v22.4s, v18.4s +add v22.4s, v22.4s, v18.4s +sqrdmulh v18.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +mla v15.4S, v18.4S, v31.s[0] +sub v18.4s, v10.4s, v15.4s +add v10.4s, v10.4s, v15.4s +sqrdmulh v15.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +mla v11.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +mla v16.4S, v11.4S, v31.s[0] +sub v11.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v2.4s, v17.4s +add v2.4s, v2.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[0] +mul v0.4S, v0.4S,v28.s[0] +mla v0.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v0.4s +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +mla v13.4S, v0.4S, v31.s[0] +sub v0.4s, v10.4s, v13.4s +add v10.4s, v10.4s, v13.4s +sqrdmulh v13.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +mla v19.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +sqrdmulh v19.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +mla v3.4S, v19.4S, v31.s[0] +sub v19.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v27.s[2] +mul v14.4S, v14.4S,v28.s[2] +mla v14.4S, v3.4S, v31.s[0] +sub v3.4s, v1.4s, v14.4s +add v1.4s, v1.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v2.4s, v21.4s +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +mla v15.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v12.4S, v27.s[3] +mul v12.4S, v12.4S,v28.s[3] +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v16.4s, v12.4s +add 
v16.4s, v16.4s, v12.4s +sqrdmulh v12.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +mla v10.4S, v12.4S, v31.s[0] +sub v12.4s, v22.4s, v10.4s +add v22.4s, v22.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v25.s[1] +mul v0.4S, v0.4S,v26.s[1] +mla v0.4S, v10.4S, v31.s[0] +sub v10.4s, v17.4s, v0.4s +add v17.4s, v17.4s, v0.4s +sqrdmulh v0.4S, v18.4S, v25.s[2] +mul v18.4S, v18.4S,v26.s[2] +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v20.4s, v18.4s +add v20.4s, v20.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v25.s[3] +mul v19.4S, v19.4S,v26.s[3] +mla v19.4S, v18.4S, v31.s[0] +sub v18.4s, v13.4s, v19.4s +add v13.4s, v13.4s, v19.4s +sqrdmulh v19.4S, v2.4S, v23.s[0] +mul v2.4S, v2.4S,v24.s[0] +mla v2.4S, v19.4S, v31.s[0] +sub v19.4s, v1.4s, v2.4s +add v1.4s, v1.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v23.s[1] +mul v14.4S, v14.4S,v24.s[1] +mla v14.4S, v2.4S, v31.s[0] +sub v2.4s, v3.4s, v14.4s +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +mla v16.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v16.4s +add v11.4s, v11.4s, v16.4s +sqrdmulh v16.4S, v15.4S, v23.s[3] +mul v15.4S, v15.4S,v24.s[3] +mla v15.4S, v16.4S, v31.s[0] +sub v16.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +str q22, [x0, #0] +str q12, [x0, #64] +str q17, [x0, #128] +str q10, [x0, #192] +str q20, [x0, #256] +str q0, [x0, #320] +str q13, [x0, #384] +str q18, [x0, #448] +str q1, [x0, #512] +str q19, [x0, #576] +str q3, [x0, #640] +str q2, [x0, #704] +str q11, [x0, #768] +str q14, [x0, #832] +str q21, [x0, #896] +str q16, [x0, #960] +ldr q16, [x0, #784] +ldr q21, [x0, #848] +ldr q14, [x0, #912] +ldr q11, [x0, #976] +ldr q2, [x0, #272] +ldr q3, [x0, #336] +ldr q19, [x0, #400] +ldr q1, [x0, #464] +ldr q18, [x0, #528] +ldr q13, [x0, #592] +ldr q0, [x0, #656] +ldr q20, [x0, #720] +ldr q10, [x0, #16] +ldr q17, [x0, #80] +ldr q12, [x0, #144] +ldr q22, [x0, #208] +sqrdmulh v15.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v2.4s, v16.4s +add v2.4s, v2.4s, 
v16.4s +sqrdmulh v16.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +mla v21.4S, v16.4S, v31.s[0] +sub v16.4s, v3.4s, v21.4s +add v3.4s, v3.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +mla v14.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v14.4s +add v19.4s, v19.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v1.4s, v11.4s +add v1.4s, v1.4s, v11.4s +sqrdmulh v11.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +mla v18.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v18.4s +add v10.4s, v10.4s, v18.4s +sqrdmulh v18.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +mla v13.4S, v18.4S, v31.s[0] +sub v18.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +sqrdmulh v13.4S, v0.4S, v29.s[0] +mul v0.4S, v0.4S,v30.s[0] +mla v0.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v0.4s +add v12.4s, v12.4s, v0.4s +sqrdmulh v0.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v20.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v20.4s +add v22.4s, v22.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +mla v19.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v19.4s +add v12.4s, v12.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +mla v1.4S, v19.4S, v31.s[0] +sub v19.4s, v22.4s, v1.4s +add v22.4s, v22.4s, v1.4s +sqrdmulh v1.4S, v2.4S, v29.s[1] +mul v2.4S, v2.4S,v30.s[1] +mla v2.4S, v1.4S, v31.s[0] +sub v1.4s, v10.4s, v2.4s +add v10.4s, v10.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v3.4S, v2.4S, v31.s[0] +sub v2.4s, v17.4s, v3.4s +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +mla v21.4S, v3.4S, v31.s[0] +sub v3.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +mla v14.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v14.4s +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +mla v15.4S, v14.4S, 
v31.s[0] +sub v14.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v16.4s +add v18.4s, v18.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +mla v12.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +mla v22.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v22.4s +add v17.4s, v17.4s, v22.4s +sqrdmulh v22.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +mla v20.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v20.4s +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +mla v19.4S, v20.4S, v31.s[0] +sub v20.4s, v2.4s, v19.4s +add v2.4s, v2.4s, v19.4s +sqrdmulh v19.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +mla v13.4S, v19.4S, v31.s[0] +sub v19.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v0.4S, v27.s[2] +mul v0.4S, v0.4S,v28.s[2] +mla v0.4S, v13.4S, v31.s[0] +sub v13.4s, v18.4s, v0.4s +add v18.4s, v18.4s, v0.4s +sqrdmulh v0.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +mla v3.4S, v0.4S, v31.s[0] +sub v0.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +mla v21.4S, v3.4S, v31.s[0] +sub v3.4s, v15.4s, v21.4s +add v15.4s, v15.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v25.s[0] +mul v17.4S, v17.4S,v26.s[0] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v16.4s, v12.4s +add v16.4s, v16.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[2] +mul v2.4S, v2.4S,v26.s[2] +mla v2.4S, v12.4S, v31.s[0] +sub v12.4s, v1.4s, v2.4s +add v1.4s, v1.4s, v2.4s +sqrdmulh v2.4S, v20.4S, v25.s[3] +mul v20.4S, v20.4S,v26.s[3] +mla v20.4S, v2.4S, v31.s[0] +sub v2.4s, v22.4s, v20.4s +add v22.4s, v22.4s, v20.4s +sqrdmulh v20.4S, 
v18.4S, v23.s[0] +mul v18.4S, v18.4S,v24.s[0] +mla v18.4S, v20.4S, v31.s[0] +sub v20.4s, v11.4s, v18.4s +add v11.4s, v11.4s, v18.4s +sqrdmulh v18.4S, v13.4S, v23.s[1] +mul v13.4S, v13.4S,v24.s[1] +mla v13.4S, v18.4S, v31.s[0] +sub v18.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v23.s[2] +mul v15.4S, v15.4S,v24.s[2] +mla v15.4S, v13.4S, v31.s[0] +sub v13.4s, v14.4s, v15.4s +add v14.4s, v14.4s, v15.4s +sqrdmulh v15.4S, v3.4S, v23.s[3] +mul v3.4S, v3.4S,v24.s[3] +mla v3.4S, v15.4S, v31.s[0] +sub v15.4s, v0.4s, v3.4s +add v0.4s, v0.4s, v3.4s +str q10, [x0, #16] +str q21, [x0, #80] +str q16, [x0, #144] +str q17, [x0, #208] +str q1, [x0, #272] +str q12, [x0, #336] +str q22, [x0, #400] +str q2, [x0, #464] +str q11, [x0, #528] +str q20, [x0, #592] +str q19, [x0, #656] +str q18, [x0, #720] +str q14, [x0, #784] +str q13, [x0, #848] +str q0, [x0, #912] +str q15, [x0, #976] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q6, [x0, #32] +ldr q7, [x0, #48] +ldr q8, [x0, #0] +ldr q9, [x0, #16] +sqrdmulh v3.4S, v6.4S, v5.s[0] +mul v6.4S, v6.4S,v4.s[0] +mla v6.4S, v3.4S, v31.s[0] +sub v3.4s, v8.4s, v6.4s +add v8.4s, v8.4s, v6.4s +sqrdmulh v6.4S, v7.4S, v5.s[0] +mul v7.4S, v7.4S,v4.s[0] +mla v7.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v7.4s +add v9.4s, v9.4s, v7.4s +ldr q7, [x17, #+160] +ldr q10, [x17, #+176] +sqrdmulh v21.4S, v9.4S, v5.s[1] +mul v9.4S, v9.4S,v4.s[1] +mla v9.4S, v21.4S, v31.s[0] +sub v21.4s, v8.4s, v9.4s +add v8.4s, v8.4s, v9.4s +sqrdmulh v9.4S, v6.4S, v5.s[2] +mul v6.4S, v6.4S,v4.s[2] +mla v6.4S, v9.4S, v31.s[0] +sub v9.4s, v3.4s, v6.4s +add v3.4s, v3.4s, v6.4s +str q8, [x0, #0] +str q21, [x0, #16] +str q3, [x0, #32] +str q9, [x0, #48] +ldr q5, [x0, #96] +ldr q4, [x0, #112] +ldr q9, [x0, #64] +ldr q3, [x0, #80] +sqrdmulh v21.4S, v5.4S, v10.s[0] +mul v5.4S, v5.4S,v7.s[0] +mla v5.4S, v21.4S, v31.s[0] +sub v21.4s, v9.4s, v5.4s +add v9.4s, v9.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v10.s[0] +mul v4.4S, v4.4S,v7.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub 
v5.4s, v3.4s, v4.4s +add v3.4s, v3.4s, v4.4s +ldr q4, [x17, #+192] +ldr q8, [x17, #+208] +sqrdmulh v6.4S, v3.4S, v10.s[1] +mul v3.4S, v3.4S,v7.s[1] +mla v3.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v3.4s +add v9.4s, v9.4s, v3.4s +sqrdmulh v3.4S, v5.4S, v10.s[2] +mul v5.4S, v5.4S,v7.s[2] +mla v5.4S, v3.4S, v31.s[0] +sub v3.4s, v21.4s, v5.4s +add v21.4s, v21.4s, v5.4s +str q9, [x0, #64] +str q6, [x0, #80] +str q21, [x0, #96] +str q3, [x0, #112] +ldr q10, [x0, #160] +ldr q7, [x0, #176] +ldr q3, [x0, #128] +ldr q21, [x0, #144] +sqrdmulh v6.4S, v10.4S, v8.s[0] +mul v10.4S, v10.4S,v4.s[0] +mla v10.4S, v6.4S, v31.s[0] +sub v6.4s, v3.4s, v10.4s +add v3.4s, v3.4s, v10.4s +sqrdmulh v10.4S, v7.4S, v8.s[0] +mul v7.4S, v7.4S,v4.s[0] +mla v7.4S, v10.4S, v31.s[0] +sub v10.4s, v21.4s, v7.4s +add v21.4s, v21.4s, v7.4s +ldr q7, [x17, #+224] +ldr q9, [x17, #+240] +sqrdmulh v5.4S, v21.4S, v8.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v5.4S, v31.s[0] +sub v5.4s, v3.4s, v21.4s +add v3.4s, v3.4s, v21.4s +sqrdmulh v21.4S, v10.4S, v8.s[2] +mul v10.4S, v10.4S,v4.s[2] +mla v10.4S, v21.4S, v31.s[0] +sub v21.4s, v6.4s, v10.4s +add v6.4s, v6.4s, v10.4s +str q3, [x0, #128] +str q5, [x0, #144] +str q6, [x0, #160] +str q21, [x0, #176] +ldr q8, [x0, #224] +ldr q4, [x0, #240] +ldr q21, [x0, #192] +ldr q6, [x0, #208] +sqrdmulh v5.4S, v8.4S, v9.s[0] +mul v8.4S, v8.4S,v7.s[0] +mla v8.4S, v5.4S, v31.s[0] +sub v5.4s, v21.4s, v8.4s +add v21.4s, v21.4s, v8.4s +sqrdmulh v8.4S, v4.4S, v9.s[0] +mul v4.4S, v4.4S,v7.s[0] +mla v4.4S, v8.4S, v31.s[0] +sub v8.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +ldr q4, [x17, #+256] +ldr q3, [x17, #+272] +sqrdmulh v10.4S, v6.4S, v9.s[1] +mul v6.4S, v6.4S,v7.s[1] +mla v6.4S, v10.4S, v31.s[0] +sub v10.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +sqrdmulh v6.4S, v8.4S, v9.s[2] +mul v8.4S, v8.4S,v7.s[2] +mla v8.4S, v6.4S, v31.s[0] +sub v6.4s, v5.4s, v8.4s +add v5.4s, v5.4s, v8.4s +str q21, [x0, #192] +str q10, [x0, #208] +str q5, [x0, #224] +str q6, [x0, #240] +ldr q9, [x0, 
#288] +ldr q7, [x0, #304] +ldr q6, [x0, #256] +ldr q5, [x0, #272] +sqrdmulh v10.4S, v9.4S, v3.s[0] +mul v9.4S, v9.4S,v4.s[0] +mla v9.4S, v10.4S, v31.s[0] +sub v10.4s, v6.4s, v9.4s +add v6.4s, v6.4s, v9.4s +sqrdmulh v9.4S, v7.4S, v3.s[0] +mul v7.4S, v7.4S,v4.s[0] +mla v7.4S, v9.4S, v31.s[0] +sub v9.4s, v5.4s, v7.4s +add v5.4s, v5.4s, v7.4s +ldr q7, [x17, #+288] +ldr q21, [x17, #+304] +sqrdmulh v8.4S, v5.4S, v3.s[1] +mul v5.4S, v5.4S,v4.s[1] +mla v5.4S, v8.4S, v31.s[0] +sub v8.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v9.4S, v3.s[2] +mul v9.4S, v9.4S,v4.s[2] +mla v9.4S, v5.4S, v31.s[0] +sub v5.4s, v10.4s, v9.4s +add v10.4s, v10.4s, v9.4s +str q6, [x0, #256] +str q8, [x0, #272] +str q10, [x0, #288] +str q5, [x0, #304] +ldr q3, [x0, #352] +ldr q4, [x0, #368] +ldr q5, [x0, #320] +ldr q10, [x0, #336] +sqrdmulh v8.4S, v3.4S, v21.s[0] +mul v3.4S, v3.4S,v7.s[0] +mla v3.4S, v8.4S, v31.s[0] +sub v8.4s, v5.4s, v3.4s +add v5.4s, v5.4s, v3.4s +sqrdmulh v3.4S, v4.4S, v21.s[0] +mul v4.4S, v4.4S,v7.s[0] +mla v4.4S, v3.4S, v31.s[0] +sub v3.4s, v10.4s, v4.4s +add v10.4s, v10.4s, v4.4s +ldr q4, [x17, #+320] +ldr q6, [x17, #+336] +sqrdmulh v9.4S, v10.4S, v21.s[1] +mul v10.4S, v10.4S,v7.s[1] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v5.4s, v10.4s +add v5.4s, v5.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v21.s[2] +mul v3.4S, v3.4S,v7.s[2] +mla v3.4S, v10.4S, v31.s[0] +sub v10.4s, v8.4s, v3.4s +add v8.4s, v8.4s, v3.4s +str q5, [x0, #320] +str q9, [x0, #336] +str q8, [x0, #352] +str q10, [x0, #368] +ldr q21, [x0, #416] +ldr q7, [x0, #432] +ldr q10, [x0, #384] +ldr q8, [x0, #400] +sqrdmulh v9.4S, v21.4S, v6.s[0] +mul v21.4S, v21.4S,v4.s[0] +mla v21.4S, v9.4S, v31.s[0] +sub v9.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v7.4S, v6.s[0] +mul v7.4S, v7.4S,v4.s[0] +mla v7.4S, v21.4S, v31.s[0] +sub v21.4s, v8.4s, v7.4s +add v8.4s, v8.4s, v7.4s +ldr q7, [x17, #+352] +ldr q5, [x17, #+368] +sqrdmulh v3.4S, v8.4S, v6.s[1] +mul v8.4S, v8.4S,v4.s[1] +mla v8.4S, v3.4S, 
v31.s[0] +sub v3.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +sqrdmulh v8.4S, v21.4S, v6.s[2] +mul v21.4S, v21.4S,v4.s[2] +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v9.4s, v21.4s +add v9.4s, v9.4s, v21.4s +str q10, [x0, #384] +str q3, [x0, #400] +str q9, [x0, #416] +str q8, [x0, #432] +ldr q6, [x0, #480] +ldr q4, [x0, #496] +ldr q8, [x0, #448] +ldr q9, [x0, #464] +sqrdmulh v3.4S, v6.4S, v5.s[0] +mul v6.4S, v6.4S,v7.s[0] +mla v6.4S, v3.4S, v31.s[0] +sub v3.4s, v8.4s, v6.4s +add v8.4s, v8.4s, v6.4s +sqrdmulh v6.4S, v4.4S, v5.s[0] +mul v4.4S, v4.4S,v7.s[0] +mla v4.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v4.4s +add v9.4s, v9.4s, v4.4s +ldr q4, [x17, #+384] +ldr q10, [x17, #+400] +sqrdmulh v21.4S, v9.4S, v5.s[1] +mul v9.4S, v9.4S,v7.s[1] +mla v9.4S, v21.4S, v31.s[0] +sub v21.4s, v8.4s, v9.4s +add v8.4s, v8.4s, v9.4s +sqrdmulh v9.4S, v6.4S, v5.s[2] +mul v6.4S, v6.4S,v7.s[2] +mla v6.4S, v9.4S, v31.s[0] +sub v9.4s, v3.4s, v6.4s +add v3.4s, v3.4s, v6.4s +str q8, [x0, #448] +str q21, [x0, #464] +str q3, [x0, #480] +str q9, [x0, #496] +ldr q5, [x0, #544] +ldr q7, [x0, #560] +ldr q9, [x0, #512] +ldr q3, [x0, #528] +sqrdmulh v21.4S, v5.4S, v10.s[0] +mul v5.4S, v5.4S,v4.s[0] +mla v5.4S, v21.4S, v31.s[0] +sub v21.4s, v9.4s, v5.4s +add v9.4s, v9.4s, v5.4s +sqrdmulh v5.4S, v7.4S, v10.s[0] +mul v7.4S, v7.4S,v4.s[0] +mla v7.4S, v5.4S, v31.s[0] +sub v5.4s, v3.4s, v7.4s +add v3.4s, v3.4s, v7.4s +ldr q7, [x17, #+416] +ldr q8, [x17, #+432] +sqrdmulh v6.4S, v3.4S, v10.s[1] +mul v3.4S, v3.4S,v4.s[1] +mla v3.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v3.4s +add v9.4s, v9.4s, v3.4s +sqrdmulh v3.4S, v5.4S, v10.s[2] +mul v5.4S, v5.4S,v4.s[2] +mla v5.4S, v3.4S, v31.s[0] +sub v3.4s, v21.4s, v5.4s +add v21.4s, v21.4s, v5.4s +str q9, [x0, #512] +str q6, [x0, #528] +str q21, [x0, #544] +str q3, [x0, #560] +ldr q10, [x0, #608] +ldr q4, [x0, #624] +ldr q3, [x0, #576] +ldr q21, [x0, #592] +sqrdmulh v6.4S, v10.4S, v8.s[0] +mul v10.4S, v10.4S,v7.s[0] +mla v10.4S, v6.4S, v31.s[0] +sub v6.4s, v3.4s, v10.4s +add 
v3.4s, v3.4s, v10.4s +sqrdmulh v10.4S, v4.4S, v8.s[0] +mul v4.4S, v4.4S,v7.s[0] +mla v4.4S, v10.4S, v31.s[0] +sub v10.4s, v21.4s, v4.4s +add v21.4s, v21.4s, v4.4s +ldr q4, [x17, #+448] +ldr q9, [x17, #+464] +sqrdmulh v5.4S, v21.4S, v8.s[1] +mul v21.4S, v21.4S,v7.s[1] +mla v21.4S, v5.4S, v31.s[0] +sub v5.4s, v3.4s, v21.4s +add v3.4s, v3.4s, v21.4s +sqrdmulh v21.4S, v10.4S, v8.s[2] +mul v10.4S, v10.4S,v7.s[2] +mla v10.4S, v21.4S, v31.s[0] +sub v21.4s, v6.4s, v10.4s +add v6.4s, v6.4s, v10.4s +str q3, [x0, #576] +str q5, [x0, #592] +str q6, [x0, #608] +str q21, [x0, #624] +ldr q8, [x0, #672] +ldr q7, [x0, #688] +ldr q21, [x0, #640] +ldr q6, [x0, #656] +sqrdmulh v5.4S, v8.4S, v9.s[0] +mul v8.4S, v8.4S,v4.s[0] +mla v8.4S, v5.4S, v31.s[0] +sub v5.4s, v21.4s, v8.4s +add v21.4s, v21.4s, v8.4s +sqrdmulh v8.4S, v7.4S, v9.s[0] +mul v7.4S, v7.4S,v4.s[0] +mla v7.4S, v8.4S, v31.s[0] +sub v8.4s, v6.4s, v7.4s +add v6.4s, v6.4s, v7.4s +ldr q7, [x17, #+480] +ldr q3, [x17, #+496] +sqrdmulh v10.4S, v6.4S, v9.s[1] +mul v6.4S, v6.4S,v4.s[1] +mla v6.4S, v10.4S, v31.s[0] +sub v10.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +sqrdmulh v6.4S, v8.4S, v9.s[2] +mul v8.4S, v8.4S,v4.s[2] +mla v8.4S, v6.4S, v31.s[0] +sub v6.4s, v5.4s, v8.4s +add v5.4s, v5.4s, v8.4s +str q21, [x0, #640] +str q10, [x0, #656] +str q5, [x0, #672] +str q6, [x0, #688] +ldr q9, [x0, #736] +ldr q4, [x0, #752] +ldr q6, [x0, #704] +ldr q5, [x0, #720] +sqrdmulh v10.4S, v9.4S, v3.s[0] +mul v9.4S, v9.4S,v7.s[0] +mla v9.4S, v10.4S, v31.s[0] +sub v10.4s, v6.4s, v9.4s +add v6.4s, v6.4s, v9.4s +sqrdmulh v9.4S, v4.4S, v3.s[0] +mul v4.4S, v4.4S,v7.s[0] +mla v4.4S, v9.4S, v31.s[0] +sub v9.4s, v5.4s, v4.4s +add v5.4s, v5.4s, v4.4s +ldr q4, [x17, #+512] +ldr q21, [x17, #+528] +sqrdmulh v8.4S, v5.4S, v3.s[1] +mul v5.4S, v5.4S,v7.s[1] +mla v5.4S, v8.4S, v31.s[0] +sub v8.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v9.4S, v3.s[2] +mul v9.4S, v9.4S,v7.s[2] +mla v9.4S, v5.4S, v31.s[0] +sub v5.4s, v10.4s, v9.4s +add v10.4s, 
v10.4s, v9.4s +str q6, [x0, #704] +str q8, [x0, #720] +str q10, [x0, #736] +str q5, [x0, #752] +ldr q3, [x0, #800] +ldr q7, [x0, #816] +ldr q5, [x0, #768] +ldr q10, [x0, #784] +sqrdmulh v8.4S, v3.4S, v21.s[0] +mul v3.4S, v3.4S,v4.s[0] +mla v3.4S, v8.4S, v31.s[0] +sub v8.4s, v5.4s, v3.4s +add v5.4s, v5.4s, v3.4s +sqrdmulh v3.4S, v7.4S, v21.s[0] +mul v7.4S, v7.4S,v4.s[0] +mla v7.4S, v3.4S, v31.s[0] +sub v3.4s, v10.4s, v7.4s +add v10.4s, v10.4s, v7.4s +ldr q7, [x17, #+544] +ldr q6, [x17, #+560] +sqrdmulh v9.4S, v10.4S, v21.s[1] +mul v10.4S, v10.4S,v4.s[1] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v5.4s, v10.4s +add v5.4s, v5.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v21.s[2] +mul v3.4S, v3.4S,v4.s[2] +mla v3.4S, v10.4S, v31.s[0] +sub v10.4s, v8.4s, v3.4s +add v8.4s, v8.4s, v3.4s +str q5, [x0, #768] +str q9, [x0, #784] +str q8, [x0, #800] +str q10, [x0, #816] +ldr q21, [x0, #864] +ldr q4, [x0, #880] +ldr q10, [x0, #832] +ldr q8, [x0, #848] +sqrdmulh v9.4S, v21.4S, v6.s[0] +mul v21.4S, v21.4S,v7.s[0] +mla v21.4S, v9.4S, v31.s[0] +sub v9.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v4.4S, v6.s[0] +mul v4.4S, v4.4S,v7.s[0] +mla v4.4S, v21.4S, v31.s[0] +sub v21.4s, v8.4s, v4.4s +add v8.4s, v8.4s, v4.4s +ldr q4, [x17, #+576] +ldr q5, [x17, #+592] +sqrdmulh v3.4S, v8.4S, v6.s[1] +mul v8.4S, v8.4S,v7.s[1] +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +sqrdmulh v8.4S, v21.4S, v6.s[2] +mul v21.4S, v21.4S,v7.s[2] +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v9.4s, v21.4s +add v9.4s, v9.4s, v21.4s +str q10, [x0, #832] +str q3, [x0, #848] +str q9, [x0, #864] +str q8, [x0, #880] +ldr q6, [x0, #928] +ldr q7, [x0, #944] +ldr q8, [x0, #896] +ldr q9, [x0, #912] +sqrdmulh v3.4S, v6.4S, v5.s[0] +mul v6.4S, v6.4S,v4.s[0] +mla v6.4S, v3.4S, v31.s[0] +sub v3.4s, v8.4s, v6.4s +add v8.4s, v8.4s, v6.4s +sqrdmulh v6.4S, v7.4S, v5.s[0] +mul v7.4S, v7.4S,v4.s[0] +mla v7.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v7.4s +add v9.4s, v9.4s, v7.4s +ldr q7, [x17, 
#+608] +ldr q10, [x17, #+624] +sqrdmulh v21.4S, v9.4S, v5.s[1] +mul v9.4S, v9.4S,v4.s[1] +mla v9.4S, v21.4S, v31.s[0] +sub v21.4s, v8.4s, v9.4s +add v8.4s, v8.4s, v9.4s +sqrdmulh v9.4S, v6.4S, v5.s[2] +mul v6.4S, v6.4S,v4.s[2] +mla v6.4S, v9.4S, v31.s[0] +sub v9.4s, v3.4s, v6.4s +add v3.4s, v3.4s, v6.4s +str q8, [x0, #896] +str q21, [x0, #912] +str q3, [x0, #928] +str q9, [x0, #944] +ldr q5, [x0, #992] +ldr q4, [x0, #1008] +ldr q9, [x0, #960] +ldr q3, [x0, #976] +sqrdmulh v21.4S, v5.4S, v10.s[0] +mul v5.4S, v5.4S,v7.s[0] +mla v5.4S, v21.4S, v31.s[0] +sub v21.4s, v9.4s, v5.4s +add v9.4s, v9.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v10.s[0] +mul v4.4S, v4.4S,v7.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v3.4s, v4.4s +add v3.4s, v3.4s, v4.4s +sqrdmulh v4.4S, v3.4S, v10.s[1] +mul v3.4S, v3.4S,v7.s[1] +mla v3.4S, v4.4S, v31.s[0] +sub v4.4s, v9.4s, v3.4s +add v9.4s, v9.4s, v3.4s +sqrdmulh v3.4S, v5.4S, v10.s[2] +mul v5.4S, v5.4S,v7.s[2] +mla v5.4S, v3.4S, v31.s[0] +sub v3.4s, v21.4s, v5.4s +add v21.4s, v21.4s, v5.4s +str q9, [x0, #960] +str q4, [x0, #976] +str q21, [x0, #992] +str q3, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_0_z4_0.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_0_z4_0.s new file mode 100644 index 0000000..846ba45 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_0_z4_0.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, 
to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// +#include // NOTE(review): the header operand is missing on this line; ASM_LOAD, used by the function below, is not defined anywhere in this file, so it presumably comes from that header - restore the name before assembling +modulus: // Negated modulus, -33556993, in the first word, followed by three zero words so the 16 bytes load as one q register; the function below loads it into q31 and, in the code visible here, only reads lane 0 as the mla operand +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 // presumably 64-byte alignment (power-of-two exponent form of .align) - confirm for the target assembler +roots_merged: // Table of NTT roots, read through x17 in 16-byte groups of four words. Groups come in pairs: in the function below the first group of a pair is the mul operand and the second, tagged with the same layer and block, is the sqrdmulh operand; the second-group values look like round(root * 2^31 / 33556993) - TODO confirm against the generator. Entries tagged "Layer None, block None" are zero words filling the fourth lane of groups that carry only three roots. +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global 
ntt_u32_incomplete_neon_asm_var_4_2_0_z4_0 +.global _ntt_u32_incomplete_neon_asm_var_4_2_0_z4_0 +ntt_u32_incomplete_neon_asm_var_4_2_0_z4_0: +_ntt_u32_incomplete_neon_asm_var_4_2_0_z4_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +ldr q2, [x0, #544] +ldr q1, [x0, #608] +ldr q0, [x0, #672] +ldr q15, [x0, #736] +ldr q14, [x0, #32] +ldr q13, [x0, #96] +ldr q12, [x0, #160] +ldr q11, [x0, #224] +sqrdmulh v10.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +mla v22.4S, v10.4S, v31.s[0] +sub v10.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v20.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +mla v19.4S, v20.4S, v31.s[0] +sub v20.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +mla v2.4S, v19.4S, v31.s[0] +sub v19.4s, v14.4s, v2.4s +add v14.4s, v14.4s, v2.4s +sqrdmulh v2.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +mla v1.4S, v2.4S, v31.s[0] +sub v2.4s, v13.4s, v1.4s 
+add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[0] +mul v0.4S, v0.4S,v30.s[0] +mla v0.4S, v1.4S, v31.s[0] +sub v1.4s, v12.4s, v0.4s +add v12.4s, v12.4s, v0.4s +sqrdmulh v0.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +mla v15.4S, v0.4S, v31.s[0] +sub v0.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v12.4s, v16.4s +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v3.4S, v16.4S, v31.s[0] +sub v16.4s, v11.4s, v3.4s +add v11.4s, v11.4s, v3.4s +sqrdmulh v3.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +mla v18.4S, v3.4S, v31.s[0] +sub v3.4s, v14.4s, v18.4s +add v14.4s, v14.4s, v18.4s +sqrdmulh v18.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +mla v17.4S, v18.4S, v31.s[0] +sub v18.4s, v13.4s, v17.4s +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v1.4s, v21.4s +add v1.4s, v1.4s, v21.4s +sqrdmulh v21.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +mla v20.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v20.4s +add v0.4s, v0.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +mla v10.4S, v20.4S, v31.s[0] +sub v20.4s, v19.4s, v10.4s +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +mla v22.4S, v10.4S, v31.s[0] +sub v10.4s, v2.4s, v22.4s +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v11.4S, v27.s[0] +mul v11.4S, v11.4S,v28.s[0] +mla v11.4S, v12.4S, v31.s[0] +sub v12.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +mla v15.4S, v11.4S, v31.s[0] +sub v11.4s, v3.4s, v15.4s +add v3.4s, v3.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v27.s[1] +mul v16.4S, 
v16.4S,v28.s[1] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v16.4s +add v18.4s, v18.4s, v16.4s +sqrdmulh v16.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +mla v1.4S, v16.4S, v31.s[0] +sub v16.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v27.s[2] +mul v0.4S, v0.4S,v28.s[2] +mla v0.4S, v1.4S, v31.s[0] +sub v1.4s, v2.4s, v0.4s +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +mla v17.4S, v0.4S, v31.s[0] +sub v0.4s, v20.4s, v17.4s +add v20.4s, v20.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v25.s[0] +mul v13.4S, v13.4S,v26.s[0] +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +sqrdmulh v13.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v18.4S, v25.s[2] +mul v18.4S, v18.4S,v26.s[2] +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v3.4s, v18.4s +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v15.4S, v25.s[3] +mul v15.4S, v15.4S,v26.s[3] +mla v15.4S, v18.4S, v31.s[0] +sub v18.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v23.s[0] +mul v2.4S, v2.4S,v24.s[0] +mla v2.4S, v15.4S, v31.s[0] +sub v15.4s, v19.4s, v2.4s +add v19.4s, v19.4s, v2.4s +sqrdmulh v2.4S, v1.4S, v23.s[1] +mul v1.4S, v1.4S,v24.s[1] +mla v1.4S, v2.4S, v31.s[0] +sub v2.4s, v16.4s, v1.4s +add v16.4s, v16.4s, v1.4s +sqrdmulh v1.4S, v10.4S, v23.s[2] +mul v10.4S, v10.4S,v24.s[2] +mla v10.4S, v1.4S, v31.s[0] +sub v1.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +mla v17.4S, v10.4S, v31.s[0] +sub v10.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +str q14, [x0, #32] +str q21, [x0, #96] +str q22, [x0, #160] +str q13, [x0, #224] +str q3, [x0, #288] +str q12, [x0, #352] +str q11, [x0, 
#416] +str q18, [x0, #480] +str q19, [x0, #544] +str q15, [x0, #608] +str q16, [x0, #672] +str q2, [x0, #736] +str q20, [x0, #800] +str q1, [x0, #864] +str q0, [x0, #928] +str q10, [x0, #992] +ldr q10, [x0, #816] +ldr q0, [x0, #880] +ldr q1, [x0, #944] +ldr q20, [x0, #1008] +ldr q2, [x0, #304] +ldr q16, [x0, #368] +ldr q15, [x0, #432] +ldr q19, [x0, #496] +ldr q18, [x0, #560] +ldr q11, [x0, #624] +ldr q12, [x0, #688] +ldr q3, [x0, #752] +ldr q13, [x0, #48] +ldr q22, [x0, #112] +ldr q21, [x0, #176] +ldr q14, [x0, #240] +sqrdmulh v17.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +mla v10.4S, v17.4S, v31.s[0] +sub v17.4s, v2.4s, v10.4s +add v2.4s, v2.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v29.s[0] +mul v0.4S, v0.4S,v30.s[0] +mla v0.4S, v10.4S, v31.s[0] +sub v10.4s, v16.4s, v0.4s +add v16.4s, v16.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +mla v1.4S, v0.4S, v31.s[0] +sub v0.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s +sqrdmulh v1.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v20.4S, v1.4S, v31.s[0] +sub v1.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +sqrdmulh v20.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +mla v18.4S, v20.4S, v31.s[0] +sub v20.4s, v13.4s, v18.4s +add v13.4s, v13.4s, v18.4s +sqrdmulh v18.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +mla v12.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +mla v3.4S, v12.4S, v31.s[0] +sub v12.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sqrdmulh v3.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +mla v15.4S, v3.4S, v31.s[0] +sub v3.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +sqrdmulh v15.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +mla v19.4S, v15.4S, v31.s[0] +sub v15.4s, v14.4s, v19.4s +add v14.4s, v14.4s, v19.4s 
+sqrdmulh v19.4S, v2.4S, v29.s[1] +mul v2.4S, v2.4S,v30.s[1] +mla v2.4S, v19.4S, v31.s[0] +sub v19.4s, v13.4s, v2.4s +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +mla v16.4S, v2.4S, v31.s[0] +sub v2.4s, v22.4s, v16.4s +add v22.4s, v22.4s, v16.4s +sqrdmulh v16.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +mla v0.4S, v16.4S, v31.s[0] +sub v16.4s, v11.4s, v0.4s +add v11.4s, v11.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +mla v1.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v1.4s +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +mla v17.4S, v1.4S, v31.s[0] +sub v1.4s, v20.4s, v17.4s +add v20.4s, v20.4s, v17.4s +sqrdmulh v17.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +mla v10.4S, v17.4S, v31.s[0] +sub v17.4s, v18.4s, v10.4s +add v18.4s, v18.4s, v10.4s +sqrdmulh v10.4S, v21.4S, v27.s[0] +mul v21.4S, v21.4S,v28.s[0] +mla v21.4S, v10.4S, v31.s[0] +sub v10.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +mla v14.4S, v21.4S, v31.s[0] +sub v21.4s, v22.4s, v14.4s +add v22.4s, v22.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +mla v3.4S, v14.4S, v31.s[0] +sub v14.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sqrdmulh v3.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +mla v15.4S, v3.4S, v31.s[0] +sub v3.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +mla v11.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +sqrdmulh v11.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +mla v12.4S, v11.4S, v31.s[0] +sub v11.4s, v18.4s, v12.4s +add v18.4s, v18.4s, v12.4s +sqrdmulh v12.4S, v16.4S, v27.s[3] +mul v16.4S, v16.4S,v28.s[3] +mla v16.4S, v12.4S, v31.s[0] +sub v12.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +sqrdmulh v16.4S, v0.4S, v27.s[3] +mul v0.4S, v0.4S,v28.s[3] +mla v0.4S, v16.4S, v31.s[0] 
+sub v16.4s, v17.4s, v0.4s +add v17.4s, v17.4s, v0.4s +sqrdmulh v0.4S, v22.4S, v25.s[0] +mul v22.4S, v22.4S,v26.s[0] +mla v22.4S, v0.4S, v31.s[0] +sub v0.4s, v13.4s, v22.4s +add v13.4s, v13.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v25.s[1] +mul v21.4S, v21.4S,v26.s[1] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v2.4S, v25.s[2] +mul v2.4S, v2.4S,v26.s[2] +mla v2.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v2.4s +add v19.4s, v19.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v25.s[3] +mul v3.4S, v3.4S,v26.s[3] +mla v3.4S, v2.4S, v31.s[0] +sub v2.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sqrdmulh v3.4S, v18.4S, v23.s[0] +mul v18.4S, v18.4S,v24.s[0] +mla v18.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v18.4s +add v20.4s, v20.4s, v18.4s +sqrdmulh v18.4S, v11.4S, v23.s[1] +mul v11.4S, v11.4S,v24.s[1] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v15.4s, v11.4s +add v15.4s, v15.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v23.s[2] +mul v17.4S, v17.4S,v24.s[2] +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v1.4s, v17.4s +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v16.4S, v23.s[3] +mul v16.4S, v16.4S,v24.s[3] +mla v16.4S, v17.4S, v31.s[0] +sub v17.4s, v12.4s, v16.4s +add v12.4s, v12.4s, v16.4s +str q13, [x0, #48] +str q0, [x0, #112] +str q10, [x0, #176] +str q22, [x0, #240] +str q19, [x0, #304] +str q21, [x0, #368] +str q14, [x0, #432] +str q2, [x0, #496] +str q20, [x0, #560] +str q3, [x0, #624] +str q15, [x0, #688] +str q18, [x0, #752] +str q1, [x0, #816] +str q11, [x0, #880] +str q12, [x0, #944] +str q17, [x0, #1008] +ldr q17, [x0, #768] +ldr q12, [x0, #832] +ldr q11, [x0, #896] +ldr q1, [x0, #960] +ldr q18, [x0, #256] +ldr q15, [x0, #320] +ldr q3, [x0, #384] +ldr q20, [x0, #448] +ldr q2, [x0, #512] +ldr q14, [x0, #576] +ldr q21, [x0, #640] +ldr q19, [x0, #704] +ldr q22, [x0, #0] +ldr q10, [x0, #64] +ldr q0, [x0, #128] +ldr q13, [x0, #192] +sqrdmulh v16.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, 
v18.4s, v17.4s +add v18.4s, v18.4s, v17.4s +sqrdmulh v17.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v15.4s, v12.4s +add v15.4s, v15.4s, v12.4s +sqrdmulh v12.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +mla v11.4S, v12.4S, v31.s[0] +sub v12.4s, v3.4s, v11.4s +add v3.4s, v3.4s, v11.4s +sqrdmulh v11.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +mla v1.4S, v11.4S, v31.s[0] +sub v11.4s, v20.4s, v1.4s +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +mla v2.4S, v1.4S, v31.s[0] +sub v1.4s, v22.4s, v2.4s +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +mla v14.4S, v2.4S, v31.s[0] +sub v2.4s, v10.4s, v14.4s +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v13.4s, v19.4s +add v13.4s, v13.4s, v19.4s +sqrdmulh v19.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v3.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v3.4s +add v0.4s, v0.4s, v3.4s +sqrdmulh v3.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +mla v20.4S, v3.4S, v31.s[0] +sub v3.4s, v13.4s, v20.4s +add v13.4s, v13.4s, v20.4s +sqrdmulh v20.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +mla v18.4S, v20.4S, v31.s[0] +sub v20.4s, v22.4s, v18.4s +add v22.4s, v22.4s, v18.4s +sqrdmulh v18.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +mla v15.4S, v18.4S, v31.s[0] +sub v18.4s, v10.4s, v15.4s +add v10.4s, v10.4s, v15.4s +sqrdmulh v15.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +mla v11.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v29.s[2] +mul 
v16.4S, v16.4S,v30.s[2] +mla v16.4S, v11.4S, v31.s[0] +sub v11.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v2.4s, v17.4s +add v2.4s, v2.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[0] +mul v0.4S, v0.4S,v28.s[0] +mla v0.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v0.4s +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +mla v13.4S, v0.4S, v31.s[0] +sub v0.4s, v10.4s, v13.4s +add v10.4s, v10.4s, v13.4s +sqrdmulh v13.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +mla v19.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +sqrdmulh v19.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +mla v3.4S, v19.4S, v31.s[0] +sub v19.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v27.s[2] +mul v14.4S, v14.4S,v28.s[2] +mla v14.4S, v3.4S, v31.s[0] +sub v3.4s, v1.4s, v14.4s +add v1.4s, v1.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v2.4s, v21.4s +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +mla v15.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v12.4S, v27.s[3] +mul v12.4S, v12.4S,v28.s[3] +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v16.4s, v12.4s +add v16.4s, v16.4s, v12.4s +sqrdmulh v12.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +mla v10.4S, v12.4S, v31.s[0] +sub v12.4s, v22.4s, v10.4s +add v22.4s, v22.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v25.s[1] +mul v0.4S, v0.4S,v26.s[1] +mla v0.4S, v10.4S, v31.s[0] +sub v10.4s, v17.4s, v0.4s +add v17.4s, v17.4s, v0.4s +sqrdmulh v0.4S, v18.4S, v25.s[2] +mul v18.4S, v18.4S,v26.s[2] +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v20.4s, v18.4s +add v20.4s, v20.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v25.s[3] +mul v19.4S, v19.4S,v26.s[3] +mla v19.4S, v18.4S, v31.s[0] +sub v18.4s, v13.4s, v19.4s +add 
v13.4s, v13.4s, v19.4s +sqrdmulh v19.4S, v2.4S, v23.s[0] +mul v2.4S, v2.4S,v24.s[0] +mla v2.4S, v19.4S, v31.s[0] +sub v19.4s, v1.4s, v2.4s +add v1.4s, v1.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v23.s[1] +mul v14.4S, v14.4S,v24.s[1] +mla v14.4S, v2.4S, v31.s[0] +sub v2.4s, v3.4s, v14.4s +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +mla v16.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v16.4s +add v11.4s, v11.4s, v16.4s +sqrdmulh v16.4S, v15.4S, v23.s[3] +mul v15.4S, v15.4S,v24.s[3] +mla v15.4S, v16.4S, v31.s[0] +sub v16.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +str q22, [x0, #0] +str q12, [x0, #64] +str q17, [x0, #128] +str q10, [x0, #192] +str q20, [x0, #256] +str q0, [x0, #320] +str q13, [x0, #384] +str q18, [x0, #448] +str q1, [x0, #512] +str q19, [x0, #576] +str q3, [x0, #640] +str q2, [x0, #704] +str q11, [x0, #768] +str q14, [x0, #832] +str q21, [x0, #896] +str q16, [x0, #960] +ldr q16, [x0, #784] +ldr q21, [x0, #848] +ldr q14, [x0, #912] +ldr q11, [x0, #976] +ldr q2, [x0, #272] +ldr q3, [x0, #336] +ldr q19, [x0, #400] +ldr q1, [x0, #464] +ldr q18, [x0, #528] +ldr q13, [x0, #592] +ldr q0, [x0, #656] +ldr q20, [x0, #720] +ldr q10, [x0, #16] +ldr q17, [x0, #80] +ldr q12, [x0, #144] +ldr q22, [x0, #208] +sqrdmulh v15.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +mla v21.4S, v16.4S, v31.s[0] +sub v16.4s, v3.4s, v21.4s +add v3.4s, v3.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +mla v14.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v14.4s +add v19.4s, v19.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v1.4s, v11.4s +add v1.4s, v1.4s, v11.4s +sqrdmulh v11.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +mla v18.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v18.4s +add v10.4s, 
v10.4s, v18.4s +sqrdmulh v18.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +mla v13.4S, v18.4S, v31.s[0] +sub v18.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +sqrdmulh v13.4S, v0.4S, v29.s[0] +mul v0.4S, v0.4S,v30.s[0] +mla v0.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v0.4s +add v12.4s, v12.4s, v0.4s +sqrdmulh v0.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v20.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v20.4s +add v22.4s, v22.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +mla v19.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v19.4s +add v12.4s, v12.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +mla v1.4S, v19.4S, v31.s[0] +sub v19.4s, v22.4s, v1.4s +add v22.4s, v22.4s, v1.4s +sqrdmulh v1.4S, v2.4S, v29.s[1] +mul v2.4S, v2.4S,v30.s[1] +mla v2.4S, v1.4S, v31.s[0] +sub v1.4s, v10.4s, v2.4s +add v10.4s, v10.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v3.4S, v2.4S, v31.s[0] +sub v2.4s, v17.4s, v3.4s +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +mla v21.4S, v3.4S, v31.s[0] +sub v3.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +mla v14.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v14.4s +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +mla v15.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v16.4s +add v18.4s, v18.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +mla v12.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +mla v22.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v22.4s +add v17.4s, v17.4s, v22.4s +sqrdmulh v22.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +mla 
v20.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v20.4s +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +mla v19.4S, v20.4S, v31.s[0] +sub v20.4s, v2.4s, v19.4s +add v2.4s, v2.4s, v19.4s +sqrdmulh v19.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +mla v13.4S, v19.4S, v31.s[0] +sub v19.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v0.4S, v27.s[2] +mul v0.4S, v0.4S,v28.s[2] +mla v0.4S, v13.4S, v31.s[0] +sub v13.4s, v18.4s, v0.4s +add v18.4s, v18.4s, v0.4s +sqrdmulh v0.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +mla v3.4S, v0.4S, v31.s[0] +sub v0.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +mla v21.4S, v3.4S, v31.s[0] +sub v3.4s, v15.4s, v21.4s +add v15.4s, v15.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v25.s[0] +mul v17.4S, v17.4S,v26.s[0] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v16.4s, v12.4s +add v16.4s, v16.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[2] +mul v2.4S, v2.4S,v26.s[2] +mla v2.4S, v12.4S, v31.s[0] +sub v12.4s, v1.4s, v2.4s +add v1.4s, v1.4s, v2.4s +sqrdmulh v2.4S, v20.4S, v25.s[3] +mul v20.4S, v20.4S,v26.s[3] +mla v20.4S, v2.4S, v31.s[0] +sub v2.4s, v22.4s, v20.4s +add v22.4s, v22.4s, v20.4s +sqrdmulh v20.4S, v18.4S, v23.s[0] +mul v18.4S, v18.4S,v24.s[0] +mla v18.4S, v20.4S, v31.s[0] +sub v20.4s, v11.4s, v18.4s +add v11.4s, v11.4s, v18.4s +sqrdmulh v18.4S, v13.4S, v23.s[1] +mul v13.4S, v13.4S,v24.s[1] +mla v13.4S, v18.4S, v31.s[0] +sub v18.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v23.s[2] +mul v15.4S, v15.4S,v24.s[2] +mla v15.4S, v13.4S, v31.s[0] +sub v13.4s, v14.4s, v15.4s +add v14.4s, v14.4s, v15.4s +sqrdmulh v15.4S, v3.4S, v23.s[3] +mul v3.4S, v3.4S,v24.s[3] +mla v3.4S, v15.4S, v31.s[0] +sub v15.4s, v0.4s, v3.4s +add v0.4s, v0.4s, v3.4s +str q10, 
[x0, #16] +str q21, [x0, #80] +str q16, [x0, #144] +str q17, [x0, #208] +str q1, [x0, #272] +str q12, [x0, #336] +str q22, [x0, #400] +str q2, [x0, #464] +str q11, [x0, #528] +str q20, [x0, #592] +str q19, [x0, #656] +str q18, [x0, #720] +str q14, [x0, #784] +str q13, [x0, #848] +str q0, [x0, #912] +str q15, [x0, #976] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q6, [x17, #+160] +ldr q7, [x17, #+176] +ldr q8, [x17, #+192] +ldr q9, [x17, #+208] +ldr q3, [x17, #+224] +ldr q10, [x17, #+240] +ldr q21, [x0, #32] +ldr q16, [x0, #48] +ldr q17, [x0, #0] +ldr q1, [x0, #16] +sqrdmulh v12.4S, v21.4S, v5.s[0] +mul v21.4S, v21.4S,v4.s[0] +mla v21.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v16.4S, v5.s[0] +mul v16.4S, v16.4S,v4.s[0] +mla v16.4S, v21.4S, v31.s[0] +sub v21.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +ldr q16, [x17, #+256] +ldr q22, [x17, #+272] +sqrdmulh v2.4S, v1.4S, v5.s[1] +mul v1.4S, v1.4S,v4.s[1] +mla v1.4S, v2.4S, v31.s[0] +sub v2.4s, v17.4s, v1.4s +add v17.4s, v17.4s, v1.4s +sqrdmulh v1.4S, v21.4S, v5.s[2] +mul v21.4S, v21.4S,v4.s[2] +mla v21.4S, v1.4S, v31.s[0] +sub v1.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +str q17, [x0, #0] +str q2, [x0, #16] +str q12, [x0, #32] +str q1, [x0, #48] +ldr q1, [x0, #96] +ldr q12, [x0, #112] +ldr q2, [x0, #64] +ldr q17, [x0, #80] +sqrdmulh v21.4S, v1.4S, v7.s[0] +mul v1.4S, v1.4S,v6.s[0] +mla v1.4S, v21.4S, v31.s[0] +sub v21.4s, v2.4s, v1.4s +add v2.4s, v2.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v7.s[0] +mul v12.4S, v12.4S,v6.s[0] +mla v12.4S, v1.4S, v31.s[0] +sub v1.4s, v17.4s, v12.4s +add v17.4s, v17.4s, v12.4s +ldr q12, [x17, #+288] +ldr q11, [x17, #+304] +sqrdmulh v20.4S, v17.4S, v7.s[1] +mul v17.4S, v17.4S,v6.s[1] +mla v17.4S, v20.4S, v31.s[0] +sub v20.4s, v2.4s, v17.4s +add v2.4s, v2.4s, v17.4s +sqrdmulh v17.4S, v1.4S, v7.s[2] +mul v1.4S, v1.4S,v6.s[2] +mla v1.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v1.4s +add v21.4s, v21.4s, v1.4s +str q2, [x0, #64] +str 
q20, [x0, #80] +str q21, [x0, #96] +str q17, [x0, #112] +ldr q17, [x0, #160] +ldr q21, [x0, #176] +ldr q20, [x0, #128] +ldr q2, [x0, #144] +sqrdmulh v1.4S, v17.4S, v9.s[0] +mul v17.4S, v17.4S,v8.s[0] +mla v17.4S, v1.4S, v31.s[0] +sub v1.4s, v20.4s, v17.4s +add v20.4s, v20.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v9.s[0] +mul v21.4S, v21.4S,v8.s[0] +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v2.4s, v21.4s +add v2.4s, v2.4s, v21.4s +ldr q21, [x17, #+320] +ldr q19, [x17, #+336] +sqrdmulh v18.4S, v2.4S, v9.s[1] +mul v2.4S, v2.4S,v8.s[1] +mla v2.4S, v18.4S, v31.s[0] +sub v18.4s, v20.4s, v2.4s +add v20.4s, v20.4s, v2.4s +sqrdmulh v2.4S, v17.4S, v9.s[2] +mul v17.4S, v17.4S,v8.s[2] +mla v17.4S, v2.4S, v31.s[0] +sub v2.4s, v1.4s, v17.4s +add v1.4s, v1.4s, v17.4s +str q20, [x0, #128] +str q18, [x0, #144] +str q1, [x0, #160] +str q2, [x0, #176] +ldr q2, [x0, #224] +ldr q1, [x0, #240] +ldr q18, [x0, #192] +ldr q20, [x0, #208] +sqrdmulh v17.4S, v2.4S, v10.s[0] +mul v2.4S, v2.4S,v3.s[0] +mla v2.4S, v17.4S, v31.s[0] +sub v17.4s, v18.4s, v2.4s +add v18.4s, v18.4s, v2.4s +sqrdmulh v2.4S, v1.4S, v10.s[0] +mul v1.4S, v1.4S,v3.s[0] +mla v1.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v1.4s +add v20.4s, v20.4s, v1.4s +ldr q1, [x17, #+352] +ldr q14, [x17, #+368] +sqrdmulh v13.4S, v20.4S, v10.s[1] +mul v20.4S, v20.4S,v3.s[1] +mla v20.4S, v13.4S, v31.s[0] +sub v13.4s, v18.4s, v20.4s +add v18.4s, v18.4s, v20.4s +sqrdmulh v20.4S, v2.4S, v10.s[2] +mul v2.4S, v2.4S,v3.s[2] +mla v2.4S, v20.4S, v31.s[0] +sub v20.4s, v17.4s, v2.4s +add v17.4s, v17.4s, v2.4s +str q18, [x0, #192] +str q13, [x0, #208] +str q17, [x0, #224] +str q20, [x0, #240] +ldr q20, [x0, #288] +ldr q17, [x0, #304] +ldr q13, [x0, #256] +ldr q18, [x0, #272] +sqrdmulh v2.4S, v20.4S, v22.s[0] +mul v20.4S, v20.4S,v16.s[0] +mla v20.4S, v2.4S, v31.s[0] +sub v2.4s, v13.4s, v20.4s +add v13.4s, v13.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v22.s[0] +mul v17.4S, v17.4S,v16.s[0] +mla v17.4S, v20.4S, v31.s[0] +sub v20.4s, v18.4s, v17.4s +add v18.4s, 
v18.4s, v17.4s +ldr q17, [x17, #+384] +ldr q0, [x17, #+400] +sqrdmulh v15.4S, v18.4S, v22.s[1] +mul v18.4S, v18.4S,v16.s[1] +mla v18.4S, v15.4S, v31.s[0] +sub v15.4s, v13.4s, v18.4s +add v13.4s, v13.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v22.s[2] +mul v20.4S, v20.4S,v16.s[2] +mla v20.4S, v18.4S, v31.s[0] +sub v18.4s, v2.4s, v20.4s +add v2.4s, v2.4s, v20.4s +str q13, [x0, #256] +str q15, [x0, #272] +str q2, [x0, #288] +str q18, [x0, #304] +ldr q5, [x0, #352] +ldr q4, [x0, #368] +ldr q18, [x0, #320] +ldr q2, [x0, #336] +sqrdmulh v15.4S, v5.4S, v11.s[0] +mul v5.4S, v5.4S,v12.s[0] +mla v5.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v5.4s +add v18.4s, v18.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v11.s[0] +mul v4.4S, v4.4S,v12.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v2.4s, v4.4s +add v2.4s, v2.4s, v4.4s +ldr q4, [x17, #+416] +ldr q13, [x17, #+432] +sqrdmulh v20.4S, v2.4S, v11.s[1] +mul v2.4S, v2.4S,v12.s[1] +mla v2.4S, v20.4S, v31.s[0] +sub v20.4s, v18.4s, v2.4s +add v18.4s, v18.4s, v2.4s +sqrdmulh v2.4S, v5.4S, v11.s[2] +mul v5.4S, v5.4S,v12.s[2] +mla v5.4S, v2.4S, v31.s[0] +sub v2.4s, v15.4s, v5.4s +add v15.4s, v15.4s, v5.4s +str q18, [x0, #320] +str q20, [x0, #336] +str q15, [x0, #352] +str q2, [x0, #368] +ldr q7, [x0, #416] +ldr q6, [x0, #432] +ldr q2, [x0, #384] +ldr q15, [x0, #400] +sqrdmulh v20.4S, v7.4S, v19.s[0] +mul v7.4S, v7.4S,v21.s[0] +mla v7.4S, v20.4S, v31.s[0] +sub v20.4s, v2.4s, v7.4s +add v2.4s, v2.4s, v7.4s +sqrdmulh v7.4S, v6.4S, v19.s[0] +mul v6.4S, v6.4S,v21.s[0] +mla v6.4S, v7.4S, v31.s[0] +sub v7.4s, v15.4s, v6.4s +add v15.4s, v15.4s, v6.4s +ldr q6, [x17, #+448] +ldr q18, [x17, #+464] +sqrdmulh v5.4S, v15.4S, v19.s[1] +mul v15.4S, v15.4S,v21.s[1] +mla v15.4S, v5.4S, v31.s[0] +sub v5.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +sqrdmulh v15.4S, v7.4S, v19.s[2] +mul v7.4S, v7.4S,v21.s[2] +mla v7.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v7.4s +add v20.4s, v20.4s, v7.4s +str q2, [x0, #384] +str q5, [x0, #400] +str q20, [x0, #416] +str q15, [x0, #432] +ldr 
q9, [x0, #480] +ldr q8, [x0, #496] +ldr q15, [x0, #448] +ldr q20, [x0, #464] +sqrdmulh v5.4S, v9.4S, v14.s[0] +mul v9.4S, v9.4S,v1.s[0] +mla v9.4S, v5.4S, v31.s[0] +sub v5.4s, v15.4s, v9.4s +add v15.4s, v15.4s, v9.4s +sqrdmulh v9.4S, v8.4S, v14.s[0] +mul v8.4S, v8.4S,v1.s[0] +mla v8.4S, v9.4S, v31.s[0] +sub v9.4s, v20.4s, v8.4s +add v20.4s, v20.4s, v8.4s +ldr q8, [x17, #+480] +ldr q2, [x17, #+496] +sqrdmulh v7.4S, v20.4S, v14.s[1] +mul v20.4S, v20.4S,v1.s[1] +mla v20.4S, v7.4S, v31.s[0] +sub v7.4s, v15.4s, v20.4s +add v15.4s, v15.4s, v20.4s +sqrdmulh v20.4S, v9.4S, v14.s[2] +mul v9.4S, v9.4S,v1.s[2] +mla v9.4S, v20.4S, v31.s[0] +sub v20.4s, v5.4s, v9.4s +add v5.4s, v5.4s, v9.4s +str q15, [x0, #448] +str q7, [x0, #464] +str q5, [x0, #480] +str q20, [x0, #496] +ldr q10, [x0, #544] +ldr q3, [x0, #560] +ldr q20, [x0, #512] +ldr q5, [x0, #528] +sqrdmulh v7.4S, v10.4S, v0.s[0] +mul v10.4S, v10.4S,v17.s[0] +mla v10.4S, v7.4S, v31.s[0] +sub v7.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v0.s[0] +mul v3.4S, v3.4S,v17.s[0] +mla v3.4S, v10.4S, v31.s[0] +sub v10.4s, v5.4s, v3.4s +add v5.4s, v5.4s, v3.4s +ldr q3, [x17, #+512] +ldr q15, [x17, #+528] +sqrdmulh v9.4S, v5.4S, v0.s[1] +mul v5.4S, v5.4S,v17.s[1] +mla v5.4S, v9.4S, v31.s[0] +sub v9.4s, v20.4s, v5.4s +add v20.4s, v20.4s, v5.4s +sqrdmulh v5.4S, v10.4S, v0.s[2] +mul v10.4S, v10.4S,v17.s[2] +mla v10.4S, v5.4S, v31.s[0] +sub v5.4s, v7.4s, v10.4s +add v7.4s, v7.4s, v10.4s +str q20, [x0, #512] +str q9, [x0, #528] +str q7, [x0, #544] +str q5, [x0, #560] +ldr q22, [x0, #608] +ldr q16, [x0, #624] +ldr q5, [x0, #576] +ldr q7, [x0, #592] +sqrdmulh v9.4S, v22.4S, v13.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v9.4S, v31.s[0] +sub v9.4s, v5.4s, v22.4s +add v5.4s, v5.4s, v22.4s +sqrdmulh v22.4S, v16.4S, v13.s[0] +mul v16.4S, v16.4S,v4.s[0] +mla v16.4S, v22.4S, v31.s[0] +sub v22.4s, v7.4s, v16.4s +add v7.4s, v7.4s, v16.4s +ldr q16, [x17, #+544] +ldr q20, [x17, #+560] +sqrdmulh v10.4S, v7.4S, v13.s[1] 
+mul v7.4S, v7.4S,v4.s[1] +mla v7.4S, v10.4S, v31.s[0] +sub v10.4s, v5.4s, v7.4s +add v5.4s, v5.4s, v7.4s +sqrdmulh v7.4S, v22.4S, v13.s[2] +mul v22.4S, v22.4S,v4.s[2] +mla v22.4S, v7.4S, v31.s[0] +sub v7.4s, v9.4s, v22.4s +add v9.4s, v9.4s, v22.4s +str q5, [x0, #576] +str q10, [x0, #592] +str q9, [x0, #608] +str q7, [x0, #624] +ldr q11, [x0, #672] +ldr q12, [x0, #688] +ldr q7, [x0, #640] +ldr q9, [x0, #656] +sqrdmulh v10.4S, v11.4S, v18.s[0] +mul v11.4S, v11.4S,v6.s[0] +mla v11.4S, v10.4S, v31.s[0] +sub v10.4s, v7.4s, v11.4s +add v7.4s, v7.4s, v11.4s +sqrdmulh v11.4S, v12.4S, v18.s[0] +mul v12.4S, v12.4S,v6.s[0] +mla v12.4S, v11.4S, v31.s[0] +sub v11.4s, v9.4s, v12.4s +add v9.4s, v9.4s, v12.4s +ldr q12, [x17, #+576] +ldr q5, [x17, #+592] +sqrdmulh v22.4S, v9.4S, v18.s[1] +mul v9.4S, v9.4S,v6.s[1] +mla v9.4S, v22.4S, v31.s[0] +sub v22.4s, v7.4s, v9.4s +add v7.4s, v7.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v18.s[2] +mul v11.4S, v11.4S,v6.s[2] +mla v11.4S, v9.4S, v31.s[0] +sub v9.4s, v10.4s, v11.4s +add v10.4s, v10.4s, v11.4s +str q7, [x0, #640] +str q22, [x0, #656] +str q10, [x0, #672] +str q9, [x0, #688] +ldr q19, [x0, #736] +ldr q21, [x0, #752] +ldr q9, [x0, #704] +ldr q10, [x0, #720] +sqrdmulh v22.4S, v19.4S, v2.s[0] +mul v19.4S, v19.4S,v8.s[0] +mla v19.4S, v22.4S, v31.s[0] +sub v22.4s, v9.4s, v19.4s +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v21.4S, v2.s[0] +mul v21.4S, v21.4S,v8.s[0] +mla v21.4S, v19.4S, v31.s[0] +sub v19.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +ldr q21, [x17, #+608] +ldr q7, [x17, #+624] +sqrdmulh v11.4S, v10.4S, v2.s[1] +mul v10.4S, v10.4S,v8.s[1] +mla v10.4S, v11.4S, v31.s[0] +sub v11.4s, v9.4s, v10.4s +add v9.4s, v9.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v2.s[2] +mul v19.4S, v19.4S,v8.s[2] +mla v19.4S, v10.4S, v31.s[0] +sub v10.4s, v22.4s, v19.4s +add v22.4s, v22.4s, v19.4s +str q9, [x0, #704] +str q11, [x0, #720] +str q22, [x0, #736] +str q10, [x0, #752] +ldr q14, [x0, #800] +ldr q1, [x0, #816] +ldr q10, [x0, #768] +ldr q22, [x0, #784] 
+sqrdmulh v11.4S, v14.4S, v15.s[0] +mul v14.4S, v14.4S,v3.s[0] +mla v14.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v14.4s +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v1.4S, v15.s[0] +mul v1.4S, v1.4S,v3.s[0] +mla v1.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v1.4s +add v22.4s, v22.4s, v1.4s +sqrdmulh v1.4S, v22.4S, v15.s[1] +mul v22.4S, v22.4S,v3.s[1] +mla v22.4S, v1.4S, v31.s[0] +sub v1.4s, v10.4s, v22.4s +add v10.4s, v10.4s, v22.4s +sqrdmulh v22.4S, v14.4S, v15.s[2] +mul v14.4S, v14.4S,v3.s[2] +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +str q10, [x0, #768] +str q1, [x0, #784] +str q11, [x0, #800] +str q22, [x0, #816] +ldr q0, [x0, #864] +ldr q17, [x0, #880] +ldr q22, [x0, #832] +ldr q11, [x0, #848] +sqrdmulh v1.4S, v0.4S, v20.s[0] +mul v0.4S, v0.4S,v16.s[0] +mla v0.4S, v1.4S, v31.s[0] +sub v1.4s, v22.4s, v0.4s +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v20.s[0] +mul v17.4S, v17.4S,v16.s[0] +mla v17.4S, v0.4S, v31.s[0] +sub v0.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v20.s[1] +mul v11.4S, v11.4S,v16.s[1] +mla v11.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v0.4S, v20.s[2] +mul v0.4S, v0.4S,v16.s[2] +mla v0.4S, v11.4S, v31.s[0] +sub v11.4s, v1.4s, v0.4s +add v1.4s, v1.4s, v0.4s +str q22, [x0, #832] +str q17, [x0, #848] +str q1, [x0, #864] +str q11, [x0, #880] +ldr q13, [x0, #928] +ldr q4, [x0, #944] +ldr q11, [x0, #896] +ldr q1, [x0, #912] +sqrdmulh v17.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v12.s[0] +mla v13.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v4.4S, v5.s[0] +mul v4.4S, v4.4S,v12.s[0] +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v1.4s, v4.4s +add v1.4s, v1.4s, v4.4s +sqrdmulh v4.4S, v1.4S, v5.s[1] +mul v1.4S, v1.4S,v12.s[1] +mla v1.4S, v4.4S, v31.s[0] +sub v4.4s, v11.4s, v1.4s +add v11.4s, v11.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v5.s[2] +mul v13.4S, v13.4S,v12.s[2] +mla 
v13.4S, v1.4S, v31.s[0] +sub v1.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +str q11, [x0, #896] +str q4, [x0, #912] +str q17, [x0, #928] +str q1, [x0, #944] +ldr q18, [x0, #992] +ldr q6, [x0, #1008] +ldr q1, [x0, #960] +ldr q17, [x0, #976] +sqrdmulh v4.4S, v18.4S, v7.s[0] +mul v18.4S, v18.4S,v21.s[0] +mla v18.4S, v4.4S, v31.s[0] +sub v4.4s, v1.4s, v18.4s +add v1.4s, v1.4s, v18.4s +sqrdmulh v18.4S, v6.4S, v7.s[0] +mul v6.4S, v6.4S,v21.s[0] +mla v6.4S, v18.4S, v31.s[0] +sub v18.4s, v17.4s, v6.4s +add v17.4s, v17.4s, v6.4s +sqrdmulh v6.4S, v17.4S, v7.s[1] +mul v17.4S, v17.4S,v21.s[1] +mla v17.4S, v6.4S, v31.s[0] +sub v6.4s, v1.4s, v17.4s +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v18.4S, v7.s[2] +mul v18.4S, v18.4S,v21.s[2] +mla v18.4S, v17.4S, v31.s[0] +sub v17.4s, v4.4s, v18.4s +add v4.4s, v4.4s, v18.4s +str q1, [x0, #960] +str q6, [x0, #976] +str q4, [x0, #992] +str q17, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_0_z4_16.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_0_z4_16.s new file mode 100644 index 0000000..1443403 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_0_z4_16.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights 
+/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 
22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 
21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // 
Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global ntt_u32_incomplete_neon_asm_var_4_2_0_z4_16 +.global _ntt_u32_incomplete_neon_asm_var_4_2_0_z4_16 +ntt_u32_incomplete_neon_asm_var_4_2_0_z4_16: +_ntt_u32_incomplete_neon_asm_var_4_2_0_z4_16: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +ldr q2, [x0, #544] +ldr q1, [x0, #608] +ldr q0, [x0, #672] +ldr q15, [x0, #736] +ldr q14, [x0, 
#32] +ldr q13, [x0, #96] +ldr q12, [x0, #160] +ldr q11, [x0, #224] +sqrdmulh v10.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +mla v22.4S, v10.4S, v31.s[0] +sub v10.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v20.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +mla v19.4S, v20.4S, v31.s[0] +sub v20.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +mla v2.4S, v19.4S, v31.s[0] +sub v19.4s, v14.4s, v2.4s +add v14.4s, v14.4s, v2.4s +sqrdmulh v2.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +mla v1.4S, v2.4S, v31.s[0] +sub v2.4s, v13.4s, v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[0] +mul v0.4S, v0.4S,v30.s[0] +mla v0.4S, v1.4S, v31.s[0] +sub v1.4s, v12.4s, v0.4s +add v12.4s, v12.4s, v0.4s +sqrdmulh v0.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +mla v15.4S, v0.4S, v31.s[0] +sub v0.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v12.4s, v16.4s +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v3.4S, v16.4S, v31.s[0] +sub v16.4s, v11.4s, v3.4s +add v11.4s, v11.4s, v3.4s +sqrdmulh v3.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +mla v18.4S, v3.4S, v31.s[0] +sub v3.4s, v14.4s, v18.4s +add v14.4s, v14.4s, v18.4s +sqrdmulh v18.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +mla v17.4S, v18.4S, v31.s[0] +sub v18.4s, v13.4s, v17.4s +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v1.4s, v21.4s +add v1.4s, v1.4s, v21.4s +sqrdmulh v21.4S, 
v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +mla v20.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v20.4s +add v0.4s, v0.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +mla v10.4S, v20.4S, v31.s[0] +sub v20.4s, v19.4s, v10.4s +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +mla v22.4S, v10.4S, v31.s[0] +sub v10.4s, v2.4s, v22.4s +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v11.4S, v27.s[0] +mul v11.4S, v11.4S,v28.s[0] +mla v11.4S, v12.4S, v31.s[0] +sub v12.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +mla v15.4S, v11.4S, v31.s[0] +sub v11.4s, v3.4s, v15.4s +add v3.4s, v3.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v16.4s +add v18.4s, v18.4s, v16.4s +sqrdmulh v16.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +mla v1.4S, v16.4S, v31.s[0] +sub v16.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v27.s[2] +mul v0.4S, v0.4S,v28.s[2] +mla v0.4S, v1.4S, v31.s[0] +sub v1.4s, v2.4s, v0.4s +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +mla v17.4S, v0.4S, v31.s[0] +sub v0.4s, v20.4s, v17.4s +add v20.4s, v20.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v25.s[0] +mul v13.4S, v13.4S,v26.s[0] +mla v13.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +sqrdmulh v13.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v18.4S, v25.s[2] +mul v18.4S, v18.4S,v26.s[2] +mla v18.4S, v12.4S, v31.s[0] 
+sub v12.4s, v3.4s, v18.4s +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v15.4S, v25.s[3] +mul v15.4S, v15.4S,v26.s[3] +mla v15.4S, v18.4S, v31.s[0] +sub v18.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v23.s[0] +mul v2.4S, v2.4S,v24.s[0] +mla v2.4S, v15.4S, v31.s[0] +sub v15.4s, v19.4s, v2.4s +add v19.4s, v19.4s, v2.4s +sqrdmulh v2.4S, v1.4S, v23.s[1] +mul v1.4S, v1.4S,v24.s[1] +mla v1.4S, v2.4S, v31.s[0] +sub v2.4s, v16.4s, v1.4s +add v16.4s, v16.4s, v1.4s +sqrdmulh v1.4S, v10.4S, v23.s[2] +mul v10.4S, v10.4S,v24.s[2] +mla v10.4S, v1.4S, v31.s[0] +sub v1.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +mla v17.4S, v10.4S, v31.s[0] +sub v10.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +str q14, [x0, #32] +str q21, [x0, #96] +str q22, [x0, #160] +str q13, [x0, #224] +str q3, [x0, #288] +str q12, [x0, #352] +str q11, [x0, #416] +str q18, [x0, #480] +str q19, [x0, #544] +str q15, [x0, #608] +str q16, [x0, #672] +str q2, [x0, #736] +str q20, [x0, #800] +str q1, [x0, #864] +str q0, [x0, #928] +str q10, [x0, #992] +ldr q10, [x0, #816] +ldr q0, [x0, #880] +ldr q1, [x0, #944] +ldr q20, [x0, #1008] +ldr q2, [x0, #304] +ldr q16, [x0, #368] +ldr q15, [x0, #432] +ldr q19, [x0, #496] +ldr q18, [x0, #560] +ldr q11, [x0, #624] +ldr q12, [x0, #688] +ldr q3, [x0, #752] +ldr q13, [x0, #48] +ldr q22, [x0, #112] +ldr q21, [x0, #176] +ldr q14, [x0, #240] +sqrdmulh v17.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +mla v10.4S, v17.4S, v31.s[0] +sub v17.4s, v2.4s, v10.4s +add v2.4s, v2.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v29.s[0] +mul v0.4S, v0.4S,v30.s[0] +mla v0.4S, v10.4S, v31.s[0] +sub v10.4s, v16.4s, v0.4s +add v16.4s, v16.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +mla v1.4S, v0.4S, v31.s[0] +sub v0.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s +sqrdmulh v1.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v20.4S, v1.4S, v31.s[0] +sub v1.4s, v19.4s, v20.4s +add 
v19.4s, v19.4s, v20.4s +sqrdmulh v20.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +mla v18.4S, v20.4S, v31.s[0] +sub v20.4s, v13.4s, v18.4s +add v13.4s, v13.4s, v18.4s +sqrdmulh v18.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +mla v12.4S, v11.4S, v31.s[0] +sub v11.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +mla v3.4S, v12.4S, v31.s[0] +sub v12.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sqrdmulh v3.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +mla v15.4S, v3.4S, v31.s[0] +sub v3.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +sqrdmulh v15.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +mla v19.4S, v15.4S, v31.s[0] +sub v15.4s, v14.4s, v19.4s +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v2.4S, v29.s[1] +mul v2.4S, v2.4S,v30.s[1] +mla v2.4S, v19.4S, v31.s[0] +sub v19.4s, v13.4s, v2.4s +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +mla v16.4S, v2.4S, v31.s[0] +sub v2.4s, v22.4s, v16.4s +add v22.4s, v22.4s, v16.4s +sqrdmulh v16.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +mla v0.4S, v16.4S, v31.s[0] +sub v16.4s, v11.4s, v0.4s +add v11.4s, v11.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +mla v1.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v1.4s +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +mla v17.4S, v1.4S, v31.s[0] +sub v1.4s, v20.4s, v17.4s +add v20.4s, v20.4s, v17.4s +sqrdmulh v17.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +mla v10.4S, v17.4S, v31.s[0] +sub v17.4s, v18.4s, v10.4s +add v18.4s, v18.4s, v10.4s +sqrdmulh v10.4S, v21.4S, v27.s[0] +mul v21.4S, v21.4S,v28.s[0] +mla v21.4S, v10.4S, v31.s[0] +sub v10.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v27.s[0] +mul v14.4S, 
v14.4S,v28.s[0] +mla v14.4S, v21.4S, v31.s[0] +sub v21.4s, v22.4s, v14.4s +add v22.4s, v22.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +mla v3.4S, v14.4S, v31.s[0] +sub v14.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sqrdmulh v3.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +mla v15.4S, v3.4S, v31.s[0] +sub v3.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +mla v11.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +sqrdmulh v11.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +mla v12.4S, v11.4S, v31.s[0] +sub v11.4s, v18.4s, v12.4s +add v18.4s, v18.4s, v12.4s +sqrdmulh v12.4S, v16.4S, v27.s[3] +mul v16.4S, v16.4S,v28.s[3] +mla v16.4S, v12.4S, v31.s[0] +sub v12.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +sqrdmulh v16.4S, v0.4S, v27.s[3] +mul v0.4S, v0.4S,v28.s[3] +mla v0.4S, v16.4S, v31.s[0] +sub v16.4s, v17.4s, v0.4s +add v17.4s, v17.4s, v0.4s +sqrdmulh v0.4S, v22.4S, v25.s[0] +mul v22.4S, v22.4S,v26.s[0] +mla v22.4S, v0.4S, v31.s[0] +sub v0.4s, v13.4s, v22.4s +add v13.4s, v13.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v25.s[1] +mul v21.4S, v21.4S,v26.s[1] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v2.4S, v25.s[2] +mul v2.4S, v2.4S,v26.s[2] +mla v2.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v2.4s +add v19.4s, v19.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v25.s[3] +mul v3.4S, v3.4S,v26.s[3] +mla v3.4S, v2.4S, v31.s[0] +sub v2.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sqrdmulh v3.4S, v18.4S, v23.s[0] +mul v18.4S, v18.4S,v24.s[0] +mla v18.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v18.4s +add v20.4s, v20.4s, v18.4s +sqrdmulh v18.4S, v11.4S, v23.s[1] +mul v11.4S, v11.4S,v24.s[1] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v15.4s, v11.4s +add v15.4s, v15.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v23.s[2] +mul v17.4S, v17.4S,v24.s[2] +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v1.4s, v17.4s +add v1.4s, v1.4s, 
v17.4s +sqrdmulh v17.4S, v16.4S, v23.s[3] +mul v16.4S, v16.4S,v24.s[3] +mla v16.4S, v17.4S, v31.s[0] +sub v17.4s, v12.4s, v16.4s +add v12.4s, v12.4s, v16.4s +str q13, [x0, #48] +str q0, [x0, #112] +str q10, [x0, #176] +str q22, [x0, #240] +str q19, [x0, #304] +str q21, [x0, #368] +str q14, [x0, #432] +str q2, [x0, #496] +str q20, [x0, #560] +str q3, [x0, #624] +str q15, [x0, #688] +str q18, [x0, #752] +str q1, [x0, #816] +str q11, [x0, #880] +str q12, [x0, #944] +str q17, [x0, #1008] +ldr q17, [x0, #768] +ldr q12, [x0, #832] +ldr q11, [x0, #896] +ldr q1, [x0, #960] +ldr q18, [x0, #256] +ldr q15, [x0, #320] +ldr q3, [x0, #384] +ldr q20, [x0, #448] +ldr q2, [x0, #512] +ldr q14, [x0, #576] +ldr q21, [x0, #640] +ldr q19, [x0, #704] +ldr q22, [x0, #0] +ldr q10, [x0, #64] +ldr q0, [x0, #128] +ldr q13, [x0, #192] +sqrdmulh v16.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v18.4s, v17.4s +add v18.4s, v18.4s, v17.4s +sqrdmulh v17.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v15.4s, v12.4s +add v15.4s, v15.4s, v12.4s +sqrdmulh v12.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +mla v11.4S, v12.4S, v31.s[0] +sub v12.4s, v3.4s, v11.4s +add v3.4s, v3.4s, v11.4s +sqrdmulh v11.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +mla v1.4S, v11.4S, v31.s[0] +sub v11.4s, v20.4s, v1.4s +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +mla v2.4S, v1.4S, v31.s[0] +sub v1.4s, v22.4s, v2.4s +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +mla v14.4S, v2.4S, v31.s[0] +sub v2.4s, v10.4s, v14.4s +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +mla v21.4S, v14.4S, v31.s[0] +sub v14.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v13.4s, v19.4s +add v13.4s, v13.4s, v19.4s 
+sqrdmulh v19.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v3.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v3.4s +add v0.4s, v0.4s, v3.4s +sqrdmulh v3.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +mla v20.4S, v3.4S, v31.s[0] +sub v3.4s, v13.4s, v20.4s +add v13.4s, v13.4s, v20.4s +sqrdmulh v20.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +mla v18.4S, v20.4S, v31.s[0] +sub v20.4s, v22.4s, v18.4s +add v22.4s, v22.4s, v18.4s +sqrdmulh v18.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +mla v15.4S, v18.4S, v31.s[0] +sub v18.4s, v10.4s, v15.4s +add v10.4s, v10.4s, v15.4s +sqrdmulh v15.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +mla v11.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +mla v16.4S, v11.4S, v31.s[0] +sub v11.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v2.4s, v17.4s +add v2.4s, v2.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[0] +mul v0.4S, v0.4S,v28.s[0] +mla v0.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v0.4s +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +mla v13.4S, v0.4S, v31.s[0] +sub v0.4s, v10.4s, v13.4s +add v10.4s, v10.4s, v13.4s +sqrdmulh v13.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +mla v19.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +sqrdmulh v19.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +mla v3.4S, v19.4S, v31.s[0] +sub v19.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v27.s[2] +mul v14.4S, v14.4S,v28.s[2] +mla v14.4S, v3.4S, v31.s[0] +sub v3.4s, v1.4s, v14.4s +add v1.4s, v1.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +mla v21.4S, v14.4S, 
v31.s[0] +sub v14.4s, v2.4s, v21.4s +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +mla v15.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v12.4S, v27.s[3] +mul v12.4S, v12.4S,v28.s[3] +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v16.4s, v12.4s +add v16.4s, v16.4s, v12.4s +sqrdmulh v12.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +mla v10.4S, v12.4S, v31.s[0] +sub v12.4s, v22.4s, v10.4s +add v22.4s, v22.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v25.s[1] +mul v0.4S, v0.4S,v26.s[1] +mla v0.4S, v10.4S, v31.s[0] +sub v10.4s, v17.4s, v0.4s +add v17.4s, v17.4s, v0.4s +sqrdmulh v0.4S, v18.4S, v25.s[2] +mul v18.4S, v18.4S,v26.s[2] +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v20.4s, v18.4s +add v20.4s, v20.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v25.s[3] +mul v19.4S, v19.4S,v26.s[3] +mla v19.4S, v18.4S, v31.s[0] +sub v18.4s, v13.4s, v19.4s +add v13.4s, v13.4s, v19.4s +sqrdmulh v19.4S, v2.4S, v23.s[0] +mul v2.4S, v2.4S,v24.s[0] +mla v2.4S, v19.4S, v31.s[0] +sub v19.4s, v1.4s, v2.4s +add v1.4s, v1.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v23.s[1] +mul v14.4S, v14.4S,v24.s[1] +mla v14.4S, v2.4S, v31.s[0] +sub v2.4s, v3.4s, v14.4s +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +mla v16.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v16.4s +add v11.4s, v11.4s, v16.4s +sqrdmulh v16.4S, v15.4S, v23.s[3] +mul v15.4S, v15.4S,v24.s[3] +mla v15.4S, v16.4S, v31.s[0] +sub v16.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +str q22, [x0, #0] +str q12, [x0, #64] +str q17, [x0, #128] +str q10, [x0, #192] +str q20, [x0, #256] +str q0, [x0, #320] +str q13, [x0, #384] +str q18, [x0, #448] +str q1, [x0, #512] +str q19, [x0, #576] +str q3, [x0, #640] +str q2, [x0, #704] +str q11, [x0, #768] +str q14, [x0, #832] +str q21, [x0, #896] +str q16, [x0, #960] +ldr q16, [x0, #784] +ldr q21, [x0, #848] +ldr q14, [x0, #912] +ldr q11, [x0, #976] +ldr q2, [x0, #272] +ldr q3, [x0, #336] +ldr 
q19, [x0, #400] +ldr q1, [x0, #464] +ldr q18, [x0, #528] +ldr q13, [x0, #592] +ldr q0, [x0, #656] +ldr q20, [x0, #720] +ldr q10, [x0, #16] +ldr q17, [x0, #80] +ldr q12, [x0, #144] +ldr q22, [x0, #208] +sqrdmulh v15.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +mla v21.4S, v16.4S, v31.s[0] +sub v16.4s, v3.4s, v21.4s +add v3.4s, v3.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +mla v14.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v14.4s +add v19.4s, v19.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v1.4s, v11.4s +add v1.4s, v1.4s, v11.4s +sqrdmulh v11.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +mla v18.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v18.4s +add v10.4s, v10.4s, v18.4s +sqrdmulh v18.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +mla v13.4S, v18.4S, v31.s[0] +sub v18.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +sqrdmulh v13.4S, v0.4S, v29.s[0] +mul v0.4S, v0.4S,v30.s[0] +mla v0.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v0.4s +add v12.4s, v12.4s, v0.4s +sqrdmulh v0.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v20.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v20.4s +add v22.4s, v22.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +mla v19.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v19.4s +add v12.4s, v12.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +mla v1.4S, v19.4S, v31.s[0] +sub v19.4s, v22.4s, v1.4s +add v22.4s, v22.4s, v1.4s +sqrdmulh v1.4S, v2.4S, v29.s[1] +mul v2.4S, v2.4S,v30.s[1] +mla v2.4S, v1.4S, v31.s[0] +sub v1.4s, v10.4s, v2.4s +add v10.4s, v10.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v3.4S, v2.4S, v31.s[0] +sub v2.4s, v17.4s, v3.4s +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[2] +mul 
v21.4S, v21.4S,v30.s[2] +mla v21.4S, v3.4S, v31.s[0] +sub v3.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +mla v14.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v14.4s +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +mla v15.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v16.4s +add v18.4s, v18.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +mla v12.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +mla v22.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v22.4s +add v17.4s, v17.4s, v22.4s +sqrdmulh v22.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +mla v20.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v20.4s +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +mla v19.4S, v20.4S, v31.s[0] +sub v20.4s, v2.4s, v19.4s +add v2.4s, v2.4s, v19.4s +sqrdmulh v19.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +mla v13.4S, v19.4S, v31.s[0] +sub v19.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v0.4S, v27.s[2] +mul v0.4S, v0.4S,v28.s[2] +mla v0.4S, v13.4S, v31.s[0] +sub v13.4s, v18.4s, v0.4s +add v18.4s, v18.4s, v0.4s +sqrdmulh v0.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +mla v3.4S, v0.4S, v31.s[0] +sub v0.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +mla v21.4S, v3.4S, v31.s[0] +sub v3.4s, v15.4s, v21.4s +add v15.4s, v15.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v25.s[0] +mul v17.4S, v17.4S,v26.s[0] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v16.4s, 
v12.4s +add v16.4s, v16.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[2] +mul v2.4S, v2.4S,v26.s[2] +mla v2.4S, v12.4S, v31.s[0] +sub v12.4s, v1.4s, v2.4s +add v1.4s, v1.4s, v2.4s +sqrdmulh v2.4S, v20.4S, v25.s[3] +mul v20.4S, v20.4S,v26.s[3] +mla v20.4S, v2.4S, v31.s[0] +sub v2.4s, v22.4s, v20.4s +add v22.4s, v22.4s, v20.4s +sqrdmulh v20.4S, v18.4S, v23.s[0] +mul v18.4S, v18.4S,v24.s[0] +mla v18.4S, v20.4S, v31.s[0] +sub v20.4s, v11.4s, v18.4s +add v11.4s, v11.4s, v18.4s +sqrdmulh v18.4S, v13.4S, v23.s[1] +mul v13.4S, v13.4S,v24.s[1] +mla v13.4S, v18.4S, v31.s[0] +sub v18.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v23.s[2] +mul v15.4S, v15.4S,v24.s[2] +mla v15.4S, v13.4S, v31.s[0] +sub v13.4s, v14.4s, v15.4s +add v14.4s, v14.4s, v15.4s +sqrdmulh v15.4S, v3.4S, v23.s[3] +mul v3.4S, v3.4S,v24.s[3] +mla v3.4S, v15.4S, v31.s[0] +sub v15.4s, v0.4s, v3.4s +add v0.4s, v0.4s, v3.4s +str q10, [x0, #16] +str q21, [x0, #80] +str q16, [x0, #144] +str q17, [x0, #208] +str q1, [x0, #272] +str q12, [x0, #336] +str q22, [x0, #400] +str q2, [x0, #464] +str q11, [x0, #528] +str q20, [x0, #592] +str q19, [x0, #656] +str q18, [x0, #720] +str q14, [x0, #784] +str q13, [x0, #848] +str q0, [x0, #912] +str q15, [x0, #976] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q6, [x17, #+160] +ldr q7, [x17, #+176] +ldr q8, [x17, #+192] +ldr q9, [x17, #+208] +ldr q3, [x17, #+224] +ldr q10, [x17, #+240] +ldr q21, [x0, #32] +ldr q16, [x0, #48] +ldr q17, [x0, #0] +ldr q1, [x0, #16] +ldr q12, [x17, #+256] +ldr q22, [x17, #+272] +sqrdmulh v2.4S, v21.4S, v5.s[0] +mul v21.4S, v21.4S,v4.s[0] +mla v21.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v16.4S, v5.s[0] +mul v16.4S, v16.4S,v4.s[0] +mla v16.4S, v2.4S, v31.s[0] +sub v2.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sub v21.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +sqrdmulh v16.4S, v1.4S, v5.s[1] +mul v1.4S, v1.4S,v4.s[1] +mla v1.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v21.4S, v5.s[2] +mul v21.4S, v21.4S,v4.s[2] +mla v21.4S, 
v16.4S, v31.s[0] +sub v16.4s, v17.4s, v1.4s +add v17.4s, v17.4s, v1.4s +sub v1.4s, v2.4s, v21.4s +add v2.4s, v2.4s, v21.4s +str q17, [x0, #0] +str q16, [x0, #16] +str q2, [x0, #32] +str q1, [x0, #48] +ldr q1, [x0, #96] +ldr q2, [x0, #112] +ldr q16, [x0, #64] +ldr q17, [x0, #80] +ldr q21, [x17, #+288] +ldr q11, [x17, #+304] +sqrdmulh v20.4S, v1.4S, v7.s[0] +mul v1.4S, v1.4S,v6.s[0] +mla v1.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v2.4S, v7.s[0] +mul v2.4S, v2.4S,v6.s[0] +mla v2.4S, v20.4S, v31.s[0] +sub v20.4s, v16.4s, v1.4s +add v16.4s, v16.4s, v1.4s +sub v1.4s, v17.4s, v2.4s +add v17.4s, v17.4s, v2.4s +sqrdmulh v2.4S, v17.4S, v7.s[1] +mul v17.4S, v17.4S,v6.s[1] +mla v17.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v1.4S, v7.s[2] +mul v1.4S, v1.4S,v6.s[2] +mla v1.4S, v2.4S, v31.s[0] +sub v2.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sub v17.4s, v20.4s, v1.4s +add v20.4s, v20.4s, v1.4s +str q16, [x0, #64] +str q2, [x0, #80] +str q20, [x0, #96] +str q17, [x0, #112] +ldr q17, [x0, #160] +ldr q20, [x0, #176] +ldr q2, [x0, #128] +ldr q16, [x0, #144] +ldr q1, [x17, #+320] +ldr q19, [x17, #+336] +sqrdmulh v18.4S, v17.4S, v9.s[0] +mul v17.4S, v17.4S,v8.s[0] +mla v17.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v20.4S, v9.s[0] +mul v20.4S, v20.4S,v8.s[0] +mla v20.4S, v18.4S, v31.s[0] +sub v18.4s, v2.4s, v17.4s +add v2.4s, v2.4s, v17.4s +sub v17.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v9.s[1] +mul v16.4S, v16.4S,v8.s[1] +mla v16.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v17.4S, v9.s[2] +mul v17.4S, v17.4S,v8.s[2] +mla v17.4S, v20.4S, v31.s[0] +sub v20.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sub v16.4s, v18.4s, v17.4s +add v18.4s, v18.4s, v17.4s +str q2, [x0, #128] +str q20, [x0, #144] +str q18, [x0, #160] +str q16, [x0, #176] +ldr q16, [x0, #224] +ldr q18, [x0, #240] +ldr q20, [x0, #192] +ldr q2, [x0, #208] +ldr q17, [x17, #+352] +ldr q14, [x17, #+368] +sqrdmulh v13.4S, v16.4S, v10.s[0] +mul v16.4S, v16.4S,v3.s[0] +mla v16.4S, v13.4S, v31.s[0] 
+sqrdmulh v13.4S, v18.4S, v10.s[0] +mul v18.4S, v18.4S,v3.s[0] +mla v18.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v16.4s +add v20.4s, v20.4s, v16.4s +sub v16.4s, v2.4s, v18.4s +add v2.4s, v2.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v10.s[1] +mul v2.4S, v2.4S,v3.s[1] +mla v2.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v16.4S, v10.s[2] +mul v16.4S, v16.4S,v3.s[2] +mla v16.4S, v18.4S, v31.s[0] +sub v18.4s, v20.4s, v2.4s +add v20.4s, v20.4s, v2.4s +sub v2.4s, v13.4s, v16.4s +add v13.4s, v13.4s, v16.4s +str q20, [x0, #192] +str q18, [x0, #208] +str q13, [x0, #224] +str q2, [x0, #240] +ldr q2, [x0, #288] +ldr q13, [x0, #304] +ldr q18, [x0, #256] +ldr q20, [x0, #272] +ldr q16, [x17, #+384] +ldr q0, [x17, #+400] +sqrdmulh v15.4S, v2.4S, v22.s[0] +mul v2.4S, v2.4S,v12.s[0] +mla v2.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v13.4S, v22.s[0] +mul v13.4S, v13.4S,v12.s[0] +mla v13.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v2.4s +add v18.4s, v18.4s, v2.4s +sub v2.4s, v20.4s, v13.4s +add v20.4s, v20.4s, v13.4s +sqrdmulh v13.4S, v20.4S, v22.s[1] +mul v20.4S, v20.4S,v12.s[1] +mla v20.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v2.4S, v22.s[2] +mul v2.4S, v2.4S,v12.s[2] +mla v2.4S, v13.4S, v31.s[0] +sub v13.4s, v18.4s, v20.4s +add v18.4s, v18.4s, v20.4s +sub v20.4s, v15.4s, v2.4s +add v15.4s, v15.4s, v2.4s +str q18, [x0, #256] +str q13, [x0, #272] +str q15, [x0, #288] +str q20, [x0, #304] +ldr q5, [x0, #352] +ldr q4, [x0, #368] +ldr q20, [x0, #320] +ldr q15, [x0, #336] +ldr q13, [x17, #+416] +ldr q18, [x17, #+432] +sqrdmulh v2.4S, v5.4S, v11.s[0] +mul v5.4S, v5.4S,v21.s[0] +mla v5.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v4.4S, v11.s[0] +mul v4.4S, v4.4S,v21.s[0] +mla v4.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v5.4s +add v20.4s, v20.4s, v5.4s +sub v5.4s, v15.4s, v4.4s +add v15.4s, v15.4s, v4.4s +sqrdmulh v4.4S, v15.4S, v11.s[1] +mul v15.4S, v15.4S,v21.s[1] +mla v15.4S, v4.4S, v31.s[0] +sqrdmulh v4.4S, v5.4S, v11.s[2] +mul v5.4S, v5.4S,v21.s[2] +mla v5.4S, v4.4S, v31.s[0] +sub v4.4s, v20.4s, v15.4s +add 
v20.4s, v20.4s, v15.4s +sub v15.4s, v2.4s, v5.4s +add v2.4s, v2.4s, v5.4s +str q20, [x0, #320] +str q4, [x0, #336] +str q2, [x0, #352] +str q15, [x0, #368] +ldr q7, [x0, #416] +ldr q6, [x0, #432] +ldr q15, [x0, #384] +ldr q2, [x0, #400] +ldr q4, [x17, #+448] +ldr q20, [x17, #+464] +sqrdmulh v5.4S, v7.4S, v19.s[0] +mul v7.4S, v7.4S,v1.s[0] +mla v7.4S, v5.4S, v31.s[0] +sqrdmulh v5.4S, v6.4S, v19.s[0] +mul v6.4S, v6.4S,v1.s[0] +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v15.4s, v7.4s +add v15.4s, v15.4s, v7.4s +sub v7.4s, v2.4s, v6.4s +add v2.4s, v2.4s, v6.4s +sqrdmulh v6.4S, v2.4S, v19.s[1] +mul v2.4S, v2.4S,v1.s[1] +mla v2.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v7.4S, v19.s[2] +mul v7.4S, v7.4S,v1.s[2] +mla v7.4S, v6.4S, v31.s[0] +sub v6.4s, v15.4s, v2.4s +add v15.4s, v15.4s, v2.4s +sub v2.4s, v5.4s, v7.4s +add v5.4s, v5.4s, v7.4s +str q15, [x0, #384] +str q6, [x0, #400] +str q5, [x0, #416] +str q2, [x0, #432] +ldr q9, [x0, #480] +ldr q8, [x0, #496] +ldr q2, [x0, #448] +ldr q5, [x0, #464] +ldr q6, [x17, #+480] +ldr q15, [x17, #+496] +sqrdmulh v7.4S, v9.4S, v14.s[0] +mul v9.4S, v9.4S,v17.s[0] +mla v9.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v8.4S, v14.s[0] +mul v8.4S, v8.4S,v17.s[0] +mla v8.4S, v7.4S, v31.s[0] +sub v7.4s, v2.4s, v9.4s +add v2.4s, v2.4s, v9.4s +sub v9.4s, v5.4s, v8.4s +add v5.4s, v5.4s, v8.4s +sqrdmulh v8.4S, v5.4S, v14.s[1] +mul v5.4S, v5.4S,v17.s[1] +mla v5.4S, v8.4S, v31.s[0] +sqrdmulh v8.4S, v9.4S, v14.s[2] +mul v9.4S, v9.4S,v17.s[2] +mla v9.4S, v8.4S, v31.s[0] +sub v8.4s, v2.4s, v5.4s +add v2.4s, v2.4s, v5.4s +sub v5.4s, v7.4s, v9.4s +add v7.4s, v7.4s, v9.4s +str q2, [x0, #448] +str q8, [x0, #464] +str q7, [x0, #480] +str q5, [x0, #496] +ldr q10, [x0, #544] +ldr q3, [x0, #560] +ldr q5, [x0, #512] +ldr q7, [x0, #528] +ldr q8, [x17, #+512] +ldr q2, [x17, #+528] +sqrdmulh v9.4S, v10.4S, v0.s[0] +mul v10.4S, v10.4S,v16.s[0] +mla v10.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v3.4S, v0.s[0] +mul v3.4S, v3.4S,v16.s[0] +mla v3.4S, v9.4S, v31.s[0] +sub v9.4s, v5.4s, 
v10.4s +add v5.4s, v5.4s, v10.4s +sub v10.4s, v7.4s, v3.4s +add v7.4s, v7.4s, v3.4s +sqrdmulh v3.4S, v7.4S, v0.s[1] +mul v7.4S, v7.4S,v16.s[1] +mla v7.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v10.4S, v0.s[2] +mul v10.4S, v10.4S,v16.s[2] +mla v10.4S, v3.4S, v31.s[0] +sub v3.4s, v5.4s, v7.4s +add v5.4s, v5.4s, v7.4s +sub v7.4s, v9.4s, v10.4s +add v9.4s, v9.4s, v10.4s +str q5, [x0, #512] +str q3, [x0, #528] +str q9, [x0, #544] +str q7, [x0, #560] +ldr q22, [x0, #608] +ldr q12, [x0, #624] +ldr q7, [x0, #576] +ldr q9, [x0, #592] +ldr q3, [x17, #+544] +ldr q5, [x17, #+560] +sqrdmulh v10.4S, v22.4S, v18.s[0] +mul v22.4S, v22.4S,v13.s[0] +mla v22.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v12.4S, v18.s[0] +mul v12.4S, v12.4S,v13.s[0] +mla v12.4S, v10.4S, v31.s[0] +sub v10.4s, v7.4s, v22.4s +add v7.4s, v7.4s, v22.4s +sub v22.4s, v9.4s, v12.4s +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v9.4S, v18.s[1] +mul v9.4S, v9.4S,v13.s[1] +mla v9.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v22.4S, v18.s[2] +mul v22.4S, v22.4S,v13.s[2] +mla v22.4S, v12.4S, v31.s[0] +sub v12.4s, v7.4s, v9.4s +add v7.4s, v7.4s, v9.4s +sub v9.4s, v10.4s, v22.4s +add v10.4s, v10.4s, v22.4s +str q7, [x0, #576] +str q12, [x0, #592] +str q10, [x0, #608] +str q9, [x0, #624] +ldr q11, [x0, #672] +ldr q21, [x0, #688] +ldr q9, [x0, #640] +ldr q10, [x0, #656] +ldr q12, [x17, #+576] +ldr q7, [x17, #+592] +sqrdmulh v22.4S, v11.4S, v20.s[0] +mul v11.4S, v11.4S,v4.s[0] +mla v11.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v21.4S, v20.s[0] +mul v21.4S, v21.4S,v4.s[0] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v9.4s, v11.4s +add v9.4s, v9.4s, v11.4s +sub v11.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v10.4S, v20.s[1] +mul v10.4S, v10.4S,v4.s[1] +mla v10.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v11.4S, v20.s[2] +mul v11.4S, v11.4S,v4.s[2] +mla v11.4S, v21.4S, v31.s[0] +sub v21.4s, v9.4s, v10.4s +add v9.4s, v9.4s, v10.4s +sub v10.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +str q9, [x0, #640] +str q21, [x0, #656] +str 
q22, [x0, #672] +str q10, [x0, #688] +ldr q19, [x0, #736] +ldr q1, [x0, #752] +ldr q10, [x0, #704] +ldr q22, [x0, #720] +ldr q21, [x17, #+608] +ldr q9, [x17, #+624] +sqrdmulh v11.4S, v19.4S, v15.s[0] +mul v19.4S, v19.4S,v6.s[0] +mla v19.4S, v11.4S, v31.s[0] +sqrdmulh v11.4S, v1.4S, v15.s[0] +mul v1.4S, v1.4S,v6.s[0] +mla v1.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v19.4s +add v10.4s, v10.4s, v19.4s +sub v19.4s, v22.4s, v1.4s +add v22.4s, v22.4s, v1.4s +sqrdmulh v1.4S, v22.4S, v15.s[1] +mul v22.4S, v22.4S,v6.s[1] +mla v22.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v19.4S, v15.s[2] +mul v19.4S, v19.4S,v6.s[2] +mla v19.4S, v1.4S, v31.s[0] +sub v1.4s, v10.4s, v22.4s +add v10.4s, v10.4s, v22.4s +sub v22.4s, v11.4s, v19.4s +add v11.4s, v11.4s, v19.4s +str q10, [x0, #704] +str q1, [x0, #720] +str q11, [x0, #736] +str q22, [x0, #752] +ldr q14, [x0, #800] +ldr q17, [x0, #816] +ldr q22, [x0, #768] +ldr q11, [x0, #784] +sqrdmulh v1.4S, v14.4S, v2.s[0] +mul v14.4S, v14.4S,v8.s[0] +mla v14.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v17.4S, v2.s[0] +mul v17.4S, v17.4S,v8.s[0] +mla v17.4S, v1.4S, v31.s[0] +sub v1.4s, v22.4s, v14.4s +add v22.4s, v22.4s, v14.4s +sub v14.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v8.s[1] +mla v11.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v14.4S, v2.s[2] +mul v14.4S, v14.4S,v8.s[2] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +sub v11.4s, v1.4s, v14.4s +add v1.4s, v1.4s, v14.4s +str q22, [x0, #768] +str q17, [x0, #784] +str q1, [x0, #800] +str q11, [x0, #816] +ldr q0, [x0, #864] +ldr q16, [x0, #880] +ldr q11, [x0, #832] +ldr q1, [x0, #848] +sqrdmulh v17.4S, v0.4S, v5.s[0] +mul v0.4S, v0.4S,v3.s[0] +mla v0.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v16.4S, v5.s[0] +mul v16.4S, v16.4S,v3.s[0] +mla v16.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v0.4s +add v11.4s, v11.4s, v0.4s +sub v0.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +sqrdmulh v16.4S, v1.4S, v5.s[1] +mul v1.4S, 
v1.4S,v3.s[1] +mla v1.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v0.4S, v5.s[2] +mul v0.4S, v0.4S,v3.s[2] +mla v0.4S, v16.4S, v31.s[0] +sub v16.4s, v11.4s, v1.4s +add v11.4s, v11.4s, v1.4s +sub v1.4s, v17.4s, v0.4s +add v17.4s, v17.4s, v0.4s +str q11, [x0, #832] +str q16, [x0, #848] +str q17, [x0, #864] +str q1, [x0, #880] +ldr q18, [x0, #928] +ldr q13, [x0, #944] +ldr q1, [x0, #896] +ldr q17, [x0, #912] +sqrdmulh v16.4S, v18.4S, v7.s[0] +mul v18.4S, v18.4S,v12.s[0] +mla v18.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v13.4S, v7.s[0] +mul v13.4S, v13.4S,v12.s[0] +mla v13.4S, v16.4S, v31.s[0] +sub v16.4s, v1.4s, v18.4s +add v1.4s, v1.4s, v18.4s +sub v18.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v7.s[1] +mul v17.4S, v17.4S,v12.s[1] +mla v17.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v18.4S, v7.s[2] +mul v18.4S, v18.4S,v12.s[2] +mla v18.4S, v13.4S, v31.s[0] +sub v13.4s, v1.4s, v17.4s +add v1.4s, v1.4s, v17.4s +sub v17.4s, v16.4s, v18.4s +add v16.4s, v16.4s, v18.4s +str q1, [x0, #896] +str q13, [x0, #912] +str q16, [x0, #928] +str q17, [x0, #944] +ldr q20, [x0, #992] +ldr q4, [x0, #1008] +ldr q17, [x0, #960] +ldr q16, [x0, #976] +sqrdmulh v13.4S, v20.4S, v9.s[0] +mul v20.4S, v20.4S,v21.s[0] +mla v20.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v4.4S, v9.s[0] +mul v4.4S, v4.4S,v21.s[0] +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v20.4s +add v17.4s, v17.4s, v20.4s +sub v20.4s, v16.4s, v4.4s +add v16.4s, v16.4s, v4.4s +sqrdmulh v4.4S, v16.4S, v9.s[1] +mul v16.4S, v16.4S,v21.s[1] +mla v16.4S, v4.4S, v31.s[0] +sqrdmulh v4.4S, v20.4S, v9.s[2] +mul v20.4S, v20.4S,v21.s[2] +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v17.4s, v16.4s +add v17.4s, v17.4s, v16.4s +sub v16.4s, v13.4s, v20.4s +add v13.4s, v13.4s, v20.4s +str q17, [x0, #960] +str q4, [x0, #976] +str q13, [x0, #992] +str q16, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// 
Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_10_z4_7.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_10_z4_7.s new file mode 100644 index 0000000..7075abc --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_10_z4_7.s @@ -0,0 +1,1558 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +// NOTE(review): the header name after #include is missing in this copy. ASM_LOAD, used by the routine below, is not defined in this file and presumably comes from that header; restore the name from upstream. +// modulus: one 16-byte vector whose lane 0 is -33556993 and whose other lanes are 0. The routine below loads it into q31 and passes v31.s[0] to the mla instructions visible here. +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +// roots_merged: constant table read through x17 by the routine below (its first 128 bytes go to q30..q23). +// Layout: groups of four words. Each group of root words is followed by a group of companion words carrying the same layer/block labels. +// The code uses the root groups as mul operands (v30, v28, v26, v24) and the companion groups as sqrdmulh operands (v29, v27, v25, v23). +// NOTE(review): companion values look like round(root * 2^31 / 33556993); confirm against the generator. +// Words labelled Layer None, block None are zeros that pad a group to four words. +// NOTE(review): .align 6 presumably requests 2^6 = 64-byte alignment (power-of-two form); confirm for the target assembler. +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global
ntt_u32_incomplete_neon_asm_var_4_2_10_z4_7 +.global _ntt_u32_incomplete_neon_asm_var_4_2_10_z4_7 +ntt_u32_incomplete_neon_asm_var_4_2_10_z4_7: +_ntt_u32_incomplete_neon_asm_var_4_2_10_z4_7: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #928] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #992] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q18, [x0, #800] +sqrdmulh v17.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +ldr q16, [x0, #864] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v22.4S, v21.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +mla v18.4S, v17.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +ldr q3, [x0, #544] +sqrdmulh v17.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q19, [x0, #608] +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q2, [x0, #672] +ldr q1, [x0, #416] +sqrdmulh v0.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v15.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #736] +ldr q14, [x0, #480] +sqrdmulh v13.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v12.4s, v14.4s, v20.4s +add v14.4s, v14.4s, v20.4s +ldr q20, [x0, #288] +mla v3.4S, v17.4S, v31.s[0] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v18.4s +mla v2.4S, v0.4S, v31.s[0] +mla v22.4S, v13.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +ldr q18, [x0, #352] +sqrdmulh v13.4S, v1.4S, 
v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v0.4s, v18.4s, v16.4s +sqrdmulh v17.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +add v18.4s, v18.4s, v16.4s +ldr q16, [x0, #32] +sqrdmulh v11.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v10.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #96] +sqrdmulh v9.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v8.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #160] +mla v1.4S, v13.4S, v31.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v2.4s +mla v20.4S, v11.4S, v31.s[0] +mla v18.4S, v9.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +ldr q2, [x0, #224] +sqrdmulh v9.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v11.4s, v2.4s, v22.4s +sqrdmulh v13.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v7.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +sub v6.4s, v2.4s, v14.4s +add v2.4s, v2.4s, v14.4s +mla v15.4S, v9.4S, v31.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v20.4s +nop +mla v21.4S, v22.4S, v31.s[0] +mla v0.4S, v1.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +nop +sqrdmulh v20.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +sub v1.4s, v3.4s, v18.4s +nop +sqrdmulh v22.4S, v6.4S, v27.s[1] +mul v6.4S, v6.4S,v28.s[1] +add v3.4s, v3.4s, v18.4s +nop +sqrdmulh v18.4S, v19.4S, v27.s[0] +mul v19.4S, v19.4S,v28.s[0] +sub v9.4s, v17.4s, v15.4s +add v17.4s, v17.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v14.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +mla v7.4S, v20.4S, v31.s[0] +mla v6.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v21.4s +nop +mla v19.4S, v18.4S, v31.s[0] +mla v2.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v21.4s +nop +sqrdmulh v21.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v15.4s, v8.4s, v0.4s +nop +sqrdmulh v18.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +add 
v8.4s, v8.4s, v0.4s +nop +sqrdmulh v0.4S, v9.4S, v27.s[3] +mul v9.4S, v9.4S,v28.s[3] +sub v20.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[3] +mul v14.4S, v14.4S,v28.s[3] +sub v12.4s, v1.4s, v6.4s +add v1.4s, v1.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v16.4s, v19.4s +nop +mla v9.4S, v0.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +nop +sqrdmulh v19.4S, v1.4S, v25.s[2] +mul v1.4S, v1.4S,v26.s[2] +sub v7.4s, v3.4s, v2.4s +nop +sqrdmulh v0.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +add v3.4s, v3.4s, v2.4s +nop +sqrdmulh v2.4S, v7.4S, v25.s[1] +mul v7.4S, v7.4S,v26.s[1] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v25.s[0] +mul v3.4S, v3.4S,v26.s[0] +sub v6.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v1.4S, v19.4S, v31.s[0] +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v9.4s +nop +mla v7.4S, v2.4S, v31.s[0] +mla v3.4S, v17.4S, v31.s[0] +add v22.4s, v22.4s, v9.4s +nop +sqrdmulh v9.4S, v8.4S, v23.s[0] +mul v8.4S, v8.4S,v24.s[0] +sub v17.4s, v15.4s, v14.4s +nop +sqrdmulh v2.4S, v6.4S, v23.s[1] +mul v6.4S, v6.4S,v24.s[1] +add v15.4s, v15.4s, v14.4s +nop +sqrdmulh v14.4S, v15.4S, v23.s[2] +mul v15.4S, v15.4S,v24.s[2] +sub v19.4s, v13.4s, v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +sub v11.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +mla v8.4S, v9.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v18.4s, v7.4s +str q13, [x0, #288] +mla v15.4S, v14.4S, v31.s[0] +mla v17.4S, v1.4S, v31.s[0] +add v18.4s, v18.4s, v7.4s +str q19, [x0, #352] +ldr q19, [x0, #944] +sqrdmulh v7.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v1.4s, v16.4s, v3.4s +str q20, [x0, #416] +ldr q20, [x0, #1008] +sqrdmulh v14.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v3.4s +str q11, [x0, #480] +ldr q11, [x0, #816] +sqrdmulh v3.4S, v11.4S, v29.s[0] +mul v11.4S, 
v11.4S,v30.s[0] +sub v13.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +ldr q8, [x0, #880] +sqrdmulh v9.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v12.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +mla v19.4S, v7.4S, v31.s[0] +mla v20.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v15.4s +str q18, [x0, #160] +mla v11.4S, v3.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +str q2, [x0, #224] +ldr q2, [x0, #560] +sqrdmulh v15.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v9.4s, v0.4s, v17.4s +str q16, [x0, #32] +ldr q16, [x0, #624] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +add v0.4s, v0.4s, v17.4s +str q1, [x0, #96] +ldr q1, [x0, #688] +ldr q17, [x0, #432] +sqrdmulh v18.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +sub v7.4s, v17.4s, v19.4s +add v17.4s, v17.4s, v19.4s +ldr q19, [x0, #752] +ldr q6, [x0, #496] +sqrdmulh v5.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v4.4s, v6.4s, v20.4s +add v6.4s, v6.4s, v20.4s +ldr q20, [x0, #304] +mla v2.4S, v15.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v11.4s +str q10, [x0, #544] +mla v1.4S, v18.4S, v31.s[0] +mla v19.4S, v5.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q13, [x0, #608] +ldr q13, [x0, #368] +sqrdmulh v11.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v5.4s, v13.4s, v8.4s +str q21, [x0, #672] +sqrdmulh v21.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +add v13.4s, v13.4s, v8.4s +str q12, [x0, #736] +ldr q12, [x0, #48] +sqrdmulh v8.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v18.4s, v12.4s, v2.4s +add v12.4s, v12.4s, v2.4s +ldr q2, [x0, #112] +sqrdmulh v10.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v15.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +ldr q16, [x0, #176] +mla v17.4S, v11.4S, v31.s[0] +mla v6.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v1.4s +str q22, [x0, #800] +mla v20.4S, v8.4S, v31.s[0] +mla v13.4S, v10.4S, v31.s[0] +add v16.4s, v16.4s, v1.4s +str q14, [x0, #864] +ldr q14, [x0, #240] 
+sqrdmulh v1.4S, v7.4S, v29.s[2] +mul v7.4S, v7.4S,v30.s[2] +sub v10.4s, v14.4s, v19.4s +str q0, [x0, #928] +sqrdmulh v0.4S, v4.4S, v29.s[2] +mul v4.4S, v4.4S,v30.s[2] +add v14.4s, v14.4s, v19.4s +str q9, [x0, #992] +sqrdmulh v9.4S, v3.4S, v29.s[2] +mul v3.4S, v3.4S,v30.s[2] +sub v19.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v29.s[2] +mul v5.4S, v5.4S,v30.s[2] +sub v8.4s, v14.4s, v6.4s +add v14.4s, v14.4s, v6.4s +mla v7.4S, v1.4S, v31.s[0] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v20.4s +nop +mla v3.4S, v9.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v20.4s +nop +sqrdmulh v20.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +sub v17.4s, v2.4s, v13.4s +nop +sqrdmulh v9.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +add v2.4s, v2.4s, v13.4s +nop +sqrdmulh v13.4S, v16.4S, v27.s[0] +mul v16.4S, v16.4S,v28.s[0] +sub v1.4s, v21.4s, v7.4s +add v21.4s, v21.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v6.4s, v10.4s, v4.4s +add v10.4s, v10.4s, v4.4s +mla v19.4S, v20.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v3.4s +nop +mla v16.4S, v13.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +nop +sqrdmulh v3.4S, v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +sub v7.4s, v15.4s, v5.4s +nop +sqrdmulh v13.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +add v15.4s, v15.4s, v5.4s +nop +sqrdmulh v5.4S, v1.4S, v27.s[3] +mul v1.4S, v1.4S,v28.s[3] +sub v20.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[3] +mul v6.4S, v6.4S,v28.s[3] +sub v4.4s, v17.4s, v8.4s +add v17.4s, v17.4s, v8.4s +mla v21.4S, v3.4S, v31.s[0] +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v16.4s +nop +mla v1.4S, v5.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +nop +sqrdmulh v16.4S, v17.4S, v25.s[2] +mul v17.4S, v17.4S,v26.s[2] +sub v19.4s, v2.4s, v14.4s +nop +sqrdmulh v5.4S, v4.4S, v25.s[3] +mul v4.4S, v4.4S,v26.s[3] +add v2.4s, v2.4s, 
v14.4s +nop +sqrdmulh v14.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v3.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +mla v17.4S, v16.4S, v31.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v9.4s, v1.4s +nop +mla v19.4S, v14.4S, v31.s[0] +mla v2.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +nop +sqrdmulh v1.4S, v15.4S, v23.s[0] +mul v15.4S, v15.4S,v24.s[0] +sub v21.4s, v7.4s, v6.4s +nop +sqrdmulh v14.4S, v8.4S, v23.s[1] +mul v8.4S, v8.4S,v24.s[1] +add v7.4s, v7.4s, v6.4s +nop +sqrdmulh v6.4S, v7.4S, v23.s[2] +mul v7.4S, v7.4S,v24.s[2] +sub v16.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v23.s[3] +mul v21.4S, v21.4S,v24.s[3] +sub v10.4s, v20.4s, v4.4s +add v20.4s, v20.4s, v4.4s +mla v15.4S, v1.4S, v31.s[0] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v19.4s +str q0, [x0, #304] +mla v7.4S, v6.4S, v31.s[0] +mla v21.4S, v17.4S, v31.s[0] +add v13.4s, v13.4s, v19.4s +str q16, [x0, #368] +ldr q16, [x0, #896] +sqrdmulh v19.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v17.4s, v12.4s, v2.4s +str q20, [x0, #432] +ldr q20, [x0, #960] +sqrdmulh v6.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v12.4s, v12.4s, v2.4s +str q10, [x0, #496] +ldr q10, [x0, #768] +sqrdmulh v2.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +sub v0.4s, v18.4s, v15.4s +add v18.4s, v18.4s, v15.4s +ldr q15, [x0, #832] +sqrdmulh v1.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +sub v4.4s, v3.4s, v8.4s +add v3.4s, v3.4s, v8.4s +mla v16.4S, v19.4S, v31.s[0] +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v7.4s +str q13, [x0, #176] +mla v10.4S, v2.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v7.4s +str q14, [x0, #240] +ldr q14, [x0, #512] +sqrdmulh v7.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v1.4s, v5.4s, v21.4s +str q12, [x0, #48] +ldr q12, [x0, #576] +sqrdmulh v2.4S, v12.4S, v29.s[0] +mul v12.4S, 
v12.4S,v30.s[0] +add v5.4s, v5.4s, v21.4s +str q17, [x0, #112] +ldr q17, [x0, #640] +ldr q21, [x0, #384] +sqrdmulh v13.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] +sub v19.4s, v21.4s, v16.4s +add v21.4s, v21.4s, v16.4s +ldr q16, [x0, #704] +ldr q8, [x0, #448] +sqrdmulh v22.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v11.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +ldr q20, [x0, #256] +mla v14.4S, v7.4S, v31.s[0] +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v10.4s +str q18, [x0, #560] +mla v17.4S, v13.4S, v31.s[0] +mla v16.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +str q0, [x0, #624] +ldr q0, [x0, #320] +sqrdmulh v10.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v22.4s, v0.4s, v15.4s +str q3, [x0, #688] +sqrdmulh v3.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +add v0.4s, v0.4s, v15.4s +str q4, [x0, #752] +ldr q4, [x0, #0] +sqrdmulh v15.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v13.4s, v4.4s, v14.4s +add v4.4s, v4.4s, v14.4s +ldr q14, [x0, #64] +sqrdmulh v18.4S, v0.4S, v29.s[1] +mul v0.4S, v0.4S,v30.s[1] +sub v7.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +ldr q12, [x0, #128] +mla v21.4S, v10.4S, v31.s[0] +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v12.4s, v17.4s +str q9, [x0, #816] +mla v20.4S, v15.4S, v31.s[0] +mla v0.4S, v18.4S, v31.s[0] +add v12.4s, v12.4s, v17.4s +str q6, [x0, #880] +ldr q6, [x0, #192] +sqrdmulh v17.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v18.4s, v6.4s, v16.4s +str q5, [x0, #944] +sqrdmulh v5.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +add v6.4s, v6.4s, v16.4s +str q1, [x0, #1008] +sqrdmulh v1.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v16.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v15.4s, v6.4s, v8.4s +add v6.4s, v6.4s, v8.4s +mla v19.4S, v17.4S, v31.s[0] +mla v11.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v20.4s +nop +mla v2.4S, v1.4S, v31.s[0] +mla v22.4S, v21.4S, v31.s[0] +add v4.4s, v4.4s, 
v20.4s +nop +sqrdmulh v20.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +sub v21.4s, v14.4s, v0.4s +nop +sqrdmulh v1.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +add v14.4s, v14.4s, v0.4s +nop +sqrdmulh v0.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v17.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[0] +mul v6.4S, v6.4S,v28.s[0] +sub v8.4s, v18.4s, v11.4s +add v18.4s, v18.4s, v11.4s +mla v16.4S, v20.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v2.4s +nop +mla v12.4S, v0.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +nop +sqrdmulh v2.4S, v3.4S, v27.s[2] +mul v3.4S, v3.4S,v28.s[2] +sub v19.4s, v7.4s, v22.4s +nop +sqrdmulh v0.4S, v18.4S, v27.s[2] +mul v18.4S, v18.4S,v28.s[2] +add v7.4s, v7.4s, v22.4s +nop +sqrdmulh v22.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +sub v20.4s, v5.4s, v16.4s +add v5.4s, v5.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v11.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +mla v3.4S, v2.4S, v31.s[0] +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v4.4s, v12.4s +nop +mla v17.4S, v22.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v4.4s, v4.4s, v12.4s +nop +sqrdmulh v12.4S, v21.4S, v25.s[2] +mul v21.4S, v21.4S,v26.s[2] +sub v16.4s, v14.4s, v6.4s +nop +sqrdmulh v22.4S, v11.4S, v25.s[3] +mul v11.4S, v11.4S,v26.s[3] +add v14.4s, v14.4s, v6.4s +nop +sqrdmulh v6.4S, v16.4S, v25.s[1] +mul v16.4S, v16.4S,v26.s[1] +sub v2.4s, v13.4s, v3.4s +add v13.4s, v13.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v25.s[0] +mul v14.4S, v14.4S,v26.s[0] +sub v15.4s, v7.4s, v18.4s +add v7.4s, v7.4s, v18.4s +mla v21.4S, v12.4S, v31.s[0] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v17.4s +nop +mla v16.4S, v6.4S, v31.s[0] +mla v14.4S, v3.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +nop +sqrdmulh v17.4S, v7.4S, v23.s[0] +mul v7.4S, v7.4S,v24.s[0] +sub v3.4s, v19.4s, v8.4s +nop +sqrdmulh v6.4S, v15.4S, v23.s[1] +mul v15.4S, v15.4S,v24.s[1] +add v19.4s, v19.4s, v8.4s 
+nop +sqrdmulh v8.4S, v19.4S, v23.s[2] +mul v19.4S, v19.4S,v24.s[2] +sub v12.4s, v5.4s, v21.4s +add v5.4s, v5.4s, v21.4s +sqrdmulh v21.4S, v3.4S, v23.s[3] +mul v3.4S, v3.4S,v24.s[3] +sub v18.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +mla v7.4S, v17.4S, v31.s[0] +mla v15.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v16.4s +str q5, [x0, #256] +mla v19.4S, v8.4S, v31.s[0] +mla v3.4S, v21.4S, v31.s[0] +add v0.4s, v0.4s, v16.4s +str q12, [x0, #320] +ldr q12, [x0, #912] +sqrdmulh v16.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v21.4s, v4.4s, v14.4s +str q20, [x0, #384] +ldr q20, [x0, #976] +sqrdmulh v8.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v4.4s, v4.4s, v14.4s +str q18, [x0, #448] +ldr q18, [x0, #784] +sqrdmulh v14.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +sub v5.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +ldr q7, [x0, #848] +sqrdmulh v17.4S, v7.4S, v29.s[0] +mul v7.4S, v7.4S,v30.s[0] +sub v11.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +mla v12.4S, v16.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v1.4s, v19.4s +str q0, [x0, #128] +mla v18.4S, v14.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v19.4s +str q6, [x0, #192] +ldr q6, [x0, #528] +sqrdmulh v19.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v17.4s, v22.4s, v3.4s +str q4, [x0, #0] +ldr q4, [x0, #592] +sqrdmulh v14.4S, v4.4S, v29.s[0] +mul v4.4S, v4.4S,v30.s[0] +add v22.4s, v22.4s, v3.4s +str q21, [x0, #64] +ldr q21, [x0, #656] +ldr q3, [x0, #400] +sqrdmulh v0.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v16.4s, v3.4s, v12.4s +add v3.4s, v3.4s, v12.4s +ldr q12, [x0, #720] +ldr q15, [x0, #464] +sqrdmulh v9.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v10.4s, v15.4s, v20.4s +add v15.4s, v15.4s, v20.4s +ldr q20, [x0, #272] +mla v6.4S, v19.4S, v31.s[0] +mla v4.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v18.4s +str q13, [x0, #512] +mla v21.4S, v0.4S, v31.s[0] +mla v12.4S, v9.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +str q5, [x0, #576] 
+ldr q5, [x0, #336] +sqrdmulh v18.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v9.4s, v5.4s, v7.4s +str q2, [x0, #640] +sqrdmulh v2.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +add v5.4s, v5.4s, v7.4s +str q11, [x0, #704] +ldr q11, [x0, #16] +sqrdmulh v7.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v0.4s, v11.4s, v6.4s +add v11.4s, v11.4s, v6.4s +ldr q6, [x0, #80] +sqrdmulh v13.4S, v5.4S, v29.s[1] +mul v5.4S, v5.4S,v30.s[1] +sub v19.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +ldr q4, [x0, #144] +mla v3.4S, v18.4S, v31.s[0] +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v4.4s, v21.4s +str q1, [x0, #768] +mla v20.4S, v7.4S, v31.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v4.4s, v4.4s, v21.4s +str q8, [x0, #832] +ldr q8, [x0, #208] +sqrdmulh v21.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +sub v13.4s, v8.4s, v12.4s +str q22, [x0, #896] +sqrdmulh v22.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +add v8.4s, v8.4s, v12.4s +str q17, [x0, #960] +sqrdmulh v17.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v12.4s, v4.4s, v3.4s +add v4.4s, v4.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v7.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +mla v16.4S, v21.4S, v31.s[0] +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v20.4s +nop +mla v14.4S, v17.4S, v31.s[0] +mla v9.4S, v3.4S, v31.s[0] +add v11.4s, v11.4s, v20.4s +nop +sqrdmulh v20.4S, v12.4S, v27.s[1] +mul v12.4S, v12.4S,v28.s[1] +sub v3.4s, v6.4s, v5.4s +nop +sqrdmulh v17.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +add v6.4s, v6.4s, v5.4s +nop +sqrdmulh v5.4S, v4.4S, v27.s[0] +mul v4.4S, v4.4S,v28.s[0] +sub v21.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[0] +mul v8.4S, v8.4S,v28.s[0] +sub v15.4s, v13.4s, v10.4s +add v13.4s, v13.4s, v10.4s +mla v12.4S, v20.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v14.4s +nop +mla v4.4S, v5.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v0.4s, v0.4s, v14.4s +nop +sqrdmulh v14.4S, 
v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v16.4s, v19.4s, v9.4s +nop +sqrdmulh v5.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +add v19.4s, v19.4s, v9.4s +nop +sqrdmulh v9.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v10.4s, v3.4s, v7.4s +add v3.4s, v3.4s, v7.4s +mla v2.4S, v14.4S, v31.s[0] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v4.4s +nop +mla v21.4S, v9.4S, v31.s[0] +mla v15.4S, v12.4S, v31.s[0] +add v11.4s, v11.4s, v4.4s +nop +sqrdmulh v4.4S, v3.4S, v25.s[2] +mul v3.4S, v3.4S,v26.s[2] +sub v12.4s, v6.4s, v8.4s +nop +sqrdmulh v9.4S, v10.4S, v25.s[3] +mul v10.4S, v10.4S,v26.s[3] +add v6.4s, v6.4s, v8.4s +nop +sqrdmulh v8.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +sub v14.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v6.4S, v25.s[0] +mul v6.4S, v6.4S,v26.s[0] +sub v7.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +mla v3.4S, v4.4S, v31.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v21.4s +nop +mla v12.4S, v8.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +nop +sqrdmulh v21.4S, v19.4S, v23.s[0] +mul v19.4S, v19.4S,v24.s[0] +sub v2.4s, v16.4s, v15.4s +nop +sqrdmulh v8.4S, v7.4S, v23.s[1] +mul v7.4S, v7.4S,v24.s[1] +add v16.4s, v16.4s, v15.4s +nop +sqrdmulh v15.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +sub v4.4s, v22.4s, v3.4s +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v2.4S, v23.s[3] +mul v2.4S, v2.4S,v24.s[3] +sub v13.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +mla v19.4S, v21.4S, v31.s[0] +mla v7.4S, v8.4S, v31.s[0] +sub v8.4s, v5.4s, v12.4s +str q22, [x0, #272] +mla v16.4S, v15.4S, v31.s[0] +mla v2.4S, v3.4S, v31.s[0] +add v5.4s, v5.4s, v12.4s +str q4, [x0, #336] +sub v23.4s, v11.4s, v6.4s +str q20, [x0, #400] +add v11.4s, v11.4s, v6.4s +str q13, [x0, #464] +sub v13.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sub v19.4s, v14.4s, v7.4s +add v14.4s, 
v14.4s, v7.4s +sub v7.4s, v17.4s, v16.4s +str q5, [x0, #144] +add v17.4s, v17.4s, v16.4s +str q8, [x0, #208] +sub v8.4s, v9.4s, v2.4s +str q11, [x0, #16] +add v9.4s, v9.4s, v2.4s +str q23, [x0, #80] +str q0, [x0, #528] +str q13, [x0, #592] +str q14, [x0, #656] +str q19, [x0, #720] +str q17, [x0, #784] +str q7, [x0, #848] +str q9, [x0, #912] +str q8, [x0, #976] +ldr q18, [x0, #224] +ldr q1, [x0, #160] +ldr q10, [x0, #32] +ldr q21, [x17, #+128] +ldr q22, [x17, #+144] +sqrdmulh v15.4S, v10.4S, v22.s[0] +mul v10.4S, v10.4S,v21.s[0] +ldr q3, [x0, #48] +sqrdmulh v12.4S, v3.4S, v22.s[0] +mul v3.4S, v3.4S,v21.s[0] +ldr q4, [x17, #+160] +ldr q30, [x17, #+176] +ldr q29, [x0, #96] +sqrdmulh v28.4S, v29.4S, v30.s[0] +mul v29.4S, v29.4S,v4.s[0] +ldr q27, [x0, #112] +sqrdmulh v26.4S, v27.4S, v30.s[0] +mul v27.4S, v27.4S,v4.s[0] +ldr q25, [x17, #+192] +ldr q24, [x17, #+208] +mla v10.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v1.4S, v24.s[0] +ldr q20, [x0, #176] +mla v3.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v20.4S, v24.s[0] +ldr q6, [x17, #+224] +ldr q5, [x17, #+240] +mla v29.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v18.4S, v5.s[0] +ldr q16, [x0, #240] +mla v27.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v16.4S, v5.s[0] +ldr q11, [x0, #0] +ldr q2, [x0, #128] +mul v1.4S, v1.4S,v25.s[0] +sub v23.4s, v11.4s, v10.4s +ldr q0, [x0, #16] +mul v20.4S, v20.4S,v25.s[0] +add v11.4s, v11.4s, v10.4s +ldr q10, [x0, #144] +mla v1.4S, v15.4S, v31.s[0] +sub v15.4s, v0.4s, v3.4s +ldr q13, [x0, #64] +mla v20.4S, v12.4S, v31.s[0] +add v0.4s, v0.4s, v3.4s +ldr q3, [x0, #192] +mul v18.4S, v18.4S,v6.s[0] +sub v12.4s, v13.4s, v29.4s +ldr q14, [x0, #80] +mul v16.4S, v16.4S,v6.s[0] +add v13.4s, v13.4s, v29.4s +ldr q29, [x0, #208] +mla v18.4S, v28.4S, v31.s[0] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v14.4s, v27.4s +sqrdmulh v28.4S, v0.4S, v22.s[1] +add v14.4s, v14.4s, v27.4s +mul v0.4S, v0.4S,v21.s[1] +sqrdmulh v27.4S, v15.4S, v22.s[2] +sub v19.4s, v2.4s, v1.4s +mul v15.4S, v15.4S,v21.s[2] +add v2.4s, v2.4s, v1.4s 
+sqrdmulh v22.4S, v14.4S, v30.s[1] +sub v21.4s, v10.4s, v20.4s +mul v14.4S, v14.4S,v4.s[1] +add v10.4s, v10.4s, v20.4s +sqrdmulh v20.4S, v26.4S, v30.s[2] +sub v1.4s, v3.4s, v18.4s +mul v26.4S, v26.4S,v4.s[2] +add v3.4s, v3.4s, v18.4s +mla v0.4S, v28.4S, v31.s[0] +sub v28.4s, v29.4s, v16.4s +ldr q30, [x0, #480] +sqrdmulh v4.4S, v10.4S, v24.s[1] +add v29.4s, v29.4s, v16.4s +mla v15.4S, v27.4S, v31.s[0] +ldr q27, [x0, #416] +sqrdmulh v16.4S, v21.4S, v24.s[2] +sub v18.4s, v11.4s, v0.4s +mla v14.4S, v22.4S, v31.s[0] +ldr q22, [x0, #288] +sqrdmulh v17.4S, v29.4S, v5.s[1] +add v11.4s, v11.4s, v0.4s +str q18, [x0, #16] +mla v26.4S, v20.4S, v31.s[0] +ldr q20, [x17, #+256] +ldr q18, [x17, #+272] +sqrdmulh v0.4S, v28.4S, v5.s[2] +sub v7.4s, v23.4s, v15.4s +str q11, [x0, #0] +mul v10.4S, v10.4S,v25.s[1] +add v23.4s, v23.4s, v15.4s +mul v21.4S, v21.4S,v25.s[2] +str q7, [x0, #48] +mla v10.4S, v4.4S, v31.s[0] +sub v4.4s, v13.4s, v14.4s +mla v21.4S, v16.4S, v31.s[0] +str q23, [x0, #32] +mul v29.4S, v29.4S,v6.s[1] +str q4, [x0, #80] +mul v28.4S, v28.4S,v6.s[2] +add v13.4s, v13.4s, v14.4s +str q13, [x0, #64] +mla v29.4S, v17.4S, v31.s[0] +sub v17.4s, v12.4s, v26.4s +str q17, [x0, #112] +mla v28.4S, v0.4S, v31.s[0] +add v12.4s, v12.4s, v26.4s +str q12, [x0, #96] +sqrdmulh v5.4S, v22.4S, v18.s[0] +sub v6.4s, v2.4s, v10.4s +mul v22.4S, v22.4S,v20.s[0] +str q6, [x0, #144] +ldr q6, [x0, #304] +sqrdmulh v12.4S, v6.4S, v18.s[0] +add v2.4s, v2.4s, v10.4s +mul v6.4S, v6.4S,v20.s[0] +str q2, [x0, #128] +ldr q2, [x17, #+288] +ldr q10, [x17, #+304] +ldr q26, [x0, #352] +sqrdmulh v0.4S, v26.4S, v10.s[0] +sub v17.4s, v19.4s, v21.4s +mul v26.4S, v26.4S,v2.s[0] +str q17, [x0, #176] +ldr q17, [x0, #368] +sqrdmulh v13.4S, v17.4S, v10.s[0] +add v19.4s, v19.4s, v21.4s +mul v17.4S, v17.4S,v2.s[0] +str q19, [x0, #160] +ldr q19, [x17, #+320] +ldr q21, [x17, #+336] +mla v22.4S, v5.4S, v31.s[0] +sub v5.4s, v3.4s, v29.4s +sqrdmulh v14.4S, v27.4S, v21.s[0] +str q5, [x0, #208] +ldr q5, [x0, #432] +mla v6.4S, 
v12.4S, v31.s[0] +add v3.4s, v3.4s, v29.4s +sqrdmulh v29.4S, v5.4S, v21.s[0] +str q3, [x0, #192] +ldr q3, [x17, #+352] +ldr q12, [x17, #+368] +mla v26.4S, v0.4S, v31.s[0] +sub v0.4s, v1.4s, v28.4s +sqrdmulh v4.4S, v30.4S, v12.s[0] +str q0, [x0, #240] +ldr q0, [x0, #496] +mla v17.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, v28.4s +sqrdmulh v28.4S, v0.4S, v12.s[0] +str q1, [x0, #224] +ldr q1, [x0, #256] +ldr q13, [x0, #384] +mul v27.4S, v27.4S,v19.s[0] +sub v24.4s, v1.4s, v22.4s +ldr q25, [x0, #272] +mul v5.4S, v5.4S,v19.s[0] +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #400] +mla v27.4S, v14.4S, v31.s[0] +sub v14.4s, v25.4s, v6.4s +ldr q23, [x0, #320] +mla v5.4S, v29.4S, v31.s[0] +add v25.4s, v25.4s, v6.4s +ldr q6, [x0, #448] +mul v30.4S, v30.4S,v3.s[0] +sub v29.4s, v23.4s, v26.4s +ldr q16, [x0, #336] +mul v0.4S, v0.4S,v3.s[0] +add v23.4s, v23.4s, v26.4s +ldr q26, [x0, #464] +mla v30.4S, v4.4S, v31.s[0] +mla v0.4S, v28.4S, v31.s[0] +sub v28.4s, v16.4s, v17.4s +sqrdmulh v4.4S, v25.4S, v18.s[1] +add v16.4s, v16.4s, v17.4s +mul v25.4S, v25.4S,v20.s[1] +sqrdmulh v17.4S, v14.4S, v18.s[2] +sub v7.4s, v13.4s, v27.4s +mul v14.4S, v14.4S,v20.s[2] +add v13.4s, v13.4s, v27.4s +sqrdmulh v18.4S, v16.4S, v10.s[1] +sub v20.4s, v22.4s, v5.4s +mul v16.4S, v16.4S,v2.s[1] +add v22.4s, v22.4s, v5.4s +sqrdmulh v5.4S, v28.4S, v10.s[2] +sub v27.4s, v6.4s, v30.4s +mul v28.4S, v28.4S,v2.s[2] +add v6.4s, v6.4s, v30.4s +mla v25.4S, v4.4S, v31.s[0] +sub v4.4s, v26.4s, v0.4s +ldr q10, [x0, #736] +sqrdmulh v2.4S, v22.4S, v21.s[1] +add v26.4s, v26.4s, v0.4s +mla v14.4S, v17.4S, v31.s[0] +ldr q17, [x0, #672] +sqrdmulh v0.4S, v20.4S, v21.s[2] +sub v30.4s, v1.4s, v25.4s +mla v16.4S, v18.4S, v31.s[0] +ldr q18, [x0, #544] +sqrdmulh v15.4S, v26.4S, v12.s[1] +add v1.4s, v1.4s, v25.4s +str q30, [x0, #272] +mla v28.4S, v5.4S, v31.s[0] +ldr q5, [x17, #+384] +ldr q30, [x17, #+400] +sqrdmulh v25.4S, v4.4S, v12.s[2] +sub v11.4s, v24.4s, v14.4s +str q1, [x0, #256] +mul v22.4S, v22.4S,v19.s[1] +add v24.4s, v24.4s, 
v14.4s +mul v20.4S, v20.4S,v19.s[2] +str q11, [x0, #304] +mla v22.4S, v2.4S, v31.s[0] +sub v2.4s, v23.4s, v16.4s +mla v20.4S, v0.4S, v31.s[0] +str q24, [x0, #288] +mul v26.4S, v26.4S,v3.s[1] +str q2, [x0, #336] +mul v4.4S, v4.4S,v3.s[2] +add v23.4s, v23.4s, v16.4s +str q23, [x0, #320] +mla v26.4S, v15.4S, v31.s[0] +sub v15.4s, v29.4s, v28.4s +str q15, [x0, #368] +mla v4.4S, v25.4S, v31.s[0] +add v29.4s, v29.4s, v28.4s +str q29, [x0, #352] +sqrdmulh v12.4S, v18.4S, v30.s[0] +sub v3.4s, v13.4s, v22.4s +mul v18.4S, v18.4S,v5.s[0] +str q3, [x0, #400] +ldr q3, [x0, #560] +sqrdmulh v29.4S, v3.4S, v30.s[0] +add v13.4s, v13.4s, v22.4s +mul v3.4S, v3.4S,v5.s[0] +str q13, [x0, #384] +ldr q13, [x17, #+416] +ldr q22, [x17, #+432] +ldr q28, [x0, #608] +sqrdmulh v25.4S, v28.4S, v22.s[0] +sub v15.4s, v7.4s, v20.4s +mul v28.4S, v28.4S,v13.s[0] +str q15, [x0, #432] +ldr q15, [x0, #624] +sqrdmulh v23.4S, v15.4S, v22.s[0] +add v7.4s, v7.4s, v20.4s +mul v15.4S, v15.4S,v13.s[0] +str q7, [x0, #416] +ldr q7, [x17, #+448] +ldr q20, [x17, #+464] +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v6.4s, v26.4s +sqrdmulh v16.4S, v17.4S, v20.s[0] +str q12, [x0, #464] +ldr q12, [x0, #688] +mla v3.4S, v29.4S, v31.s[0] +add v6.4s, v6.4s, v26.4s +sqrdmulh v26.4S, v12.4S, v20.s[0] +str q6, [x0, #448] +ldr q6, [x17, #+480] +ldr q29, [x17, #+496] +mla v28.4S, v25.4S, v31.s[0] +sub v25.4s, v27.4s, v4.4s +sqrdmulh v2.4S, v10.4S, v29.s[0] +str q25, [x0, #496] +ldr q25, [x0, #752] +mla v15.4S, v23.4S, v31.s[0] +add v27.4s, v27.4s, v4.4s +sqrdmulh v4.4S, v25.4S, v29.s[0] +str q27, [x0, #480] +ldr q27, [x0, #512] +ldr q23, [x0, #640] +mul v17.4S, v17.4S,v7.s[0] +sub v21.4s, v27.4s, v18.4s +ldr q19, [x0, #528] +mul v12.4S, v12.4S,v7.s[0] +add v27.4s, v27.4s, v18.4s +ldr q18, [x0, #656] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v19.4s, v3.4s +ldr q24, [x0, #576] +mla v12.4S, v26.4S, v31.s[0] +add v19.4s, v19.4s, v3.4s +ldr q3, [x0, #704] +mul v10.4S, v10.4S,v6.s[0] +sub v26.4s, v24.4s, v28.4s +ldr q0, [x0, #592] 
+mul v25.4S, v25.4S,v6.s[0] +add v24.4s, v24.4s, v28.4s +ldr q28, [x0, #720] +mla v10.4S, v2.4S, v31.s[0] +mla v25.4S, v4.4S, v31.s[0] +sub v4.4s, v0.4s, v15.4s +sqrdmulh v2.4S, v19.4S, v30.s[1] +add v0.4s, v0.4s, v15.4s +mul v19.4S, v19.4S,v5.s[1] +sqrdmulh v15.4S, v16.4S, v30.s[2] +sub v11.4s, v23.4s, v17.4s +mul v16.4S, v16.4S,v5.s[2] +add v23.4s, v23.4s, v17.4s +sqrdmulh v30.4S, v0.4S, v22.s[1] +sub v5.4s, v18.4s, v12.4s +mul v0.4S, v0.4S,v13.s[1] +add v18.4s, v18.4s, v12.4s +sqrdmulh v12.4S, v4.4S, v22.s[2] +sub v17.4s, v3.4s, v10.4s +mul v4.4S, v4.4S,v13.s[2] +add v3.4s, v3.4s, v10.4s +mla v19.4S, v2.4S, v31.s[0] +sub v2.4s, v28.4s, v25.4s +ldr q22, [x0, #992] +sqrdmulh v13.4S, v18.4S, v20.s[1] +add v28.4s, v28.4s, v25.4s +mla v16.4S, v15.4S, v31.s[0] +ldr q15, [x0, #928] +sqrdmulh v25.4S, v5.4S, v20.s[2] +sub v10.4s, v27.4s, v19.4s +mla v0.4S, v30.4S, v31.s[0] +ldr q30, [x0, #800] +sqrdmulh v14.4S, v28.4S, v29.s[1] +add v27.4s, v27.4s, v19.4s +str q10, [x0, #528] +mla v4.4S, v12.4S, v31.s[0] +ldr q12, [x17, #+512] +ldr q10, [x17, #+528] +sqrdmulh v19.4S, v2.4S, v29.s[2] +sub v1.4s, v21.4s, v16.4s +str q27, [x0, #512] +mul v18.4S, v18.4S,v7.s[1] +add v21.4s, v21.4s, v16.4s +mul v5.4S, v5.4S,v7.s[2] +str q1, [x0, #560] +mla v18.4S, v13.4S, v31.s[0] +sub v13.4s, v24.4s, v0.4s +mla v5.4S, v25.4S, v31.s[0] +str q21, [x0, #544] +mul v28.4S, v28.4S,v6.s[1] +str q13, [x0, #592] +mul v2.4S, v2.4S,v6.s[2] +add v24.4s, v24.4s, v0.4s +str q24, [x0, #576] +mla v28.4S, v14.4S, v31.s[0] +sub v14.4s, v26.4s, v4.4s +str q14, [x0, #624] +mla v2.4S, v19.4S, v31.s[0] +add v26.4s, v26.4s, v4.4s +str q26, [x0, #608] +sqrdmulh v29.4S, v30.4S, v10.s[0] +sub v6.4s, v23.4s, v18.4s +mul v30.4S, v30.4S,v12.s[0] +str q6, [x0, #656] +ldr q6, [x0, #816] +sqrdmulh v26.4S, v6.4S, v10.s[0] +add v23.4s, v23.4s, v18.4s +mul v6.4S, v6.4S,v12.s[0] +str q23, [x0, #640] +ldr q23, [x17, #+544] +ldr q18, [x17, #+560] +ldr q4, [x0, #864] +sqrdmulh v19.4S, v4.4S, v18.s[0] +sub v14.4s, v11.4s, v5.4s 
+mul v4.4S, v4.4S,v23.s[0] +str q14, [x0, #688] +ldr q14, [x0, #880] +sqrdmulh v24.4S, v14.4S, v18.s[0] +add v11.4s, v11.4s, v5.4s +mul v14.4S, v14.4S,v23.s[0] +str q11, [x0, #672] +ldr q11, [x17, #+576] +ldr q5, [x17, #+592] +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v3.4s, v28.4s +sqrdmulh v0.4S, v15.4S, v5.s[0] +str q29, [x0, #720] +ldr q29, [x0, #944] +mla v6.4S, v26.4S, v31.s[0] +add v3.4s, v3.4s, v28.4s +sqrdmulh v28.4S, v29.4S, v5.s[0] +str q3, [x0, #704] +ldr q3, [x17, #+608] +ldr q26, [x17, #+624] +mla v4.4S, v19.4S, v31.s[0] +sub v19.4s, v17.4s, v2.4s +sqrdmulh v13.4S, v22.4S, v26.s[0] +str q19, [x0, #752] +ldr q19, [x0, #1008] +mla v14.4S, v24.4S, v31.s[0] +add v17.4s, v17.4s, v2.4s +sqrdmulh v2.4S, v19.4S, v26.s[0] +str q17, [x0, #736] +ldr q17, [x0, #768] +ldr q24, [x0, #896] +mul v15.4S, v15.4S,v11.s[0] +sub v20.4s, v17.4s, v30.4s +ldr q7, [x0, #784] +mul v29.4S, v29.4S,v11.s[0] +add v17.4s, v17.4s, v30.4s +ldr q30, [x0, #912] +mla v15.4S, v0.4S, v31.s[0] +sub v0.4s, v7.4s, v6.4s +ldr q21, [x0, #832] +mla v29.4S, v28.4S, v31.s[0] +add v7.4s, v7.4s, v6.4s +ldr q6, [x0, #960] +mul v22.4S, v22.4S,v3.s[0] +sub v28.4s, v21.4s, v4.4s +ldr q25, [x0, #848] +mul v19.4S, v19.4S,v3.s[0] +add v21.4s, v21.4s, v4.4s +ldr q4, [x0, #976] +mla v22.4S, v13.4S, v31.s[0] +mla v19.4S, v2.4S, v31.s[0] +sub v2.4s, v25.4s, v14.4s +sqrdmulh v13.4S, v7.4S, v10.s[1] +add v25.4s, v25.4s, v14.4s +mul v7.4S, v7.4S,v12.s[1] +sqrdmulh v14.4S, v0.4S, v10.s[2] +sub v1.4s, v24.4s, v15.4s +mul v0.4S, v0.4S,v12.s[2] +add v24.4s, v24.4s, v15.4s +sqrdmulh v10.4S, v25.4S, v18.s[1] +sub v12.4s, v30.4s, v29.4s +mul v25.4S, v25.4S,v23.s[1] +add v30.4s, v30.4s, v29.4s +sqrdmulh v29.4S, v2.4S, v18.s[2] +sub v15.4s, v6.4s, v22.4s +mul v2.4S, v2.4S,v23.s[2] +add v6.4s, v6.4s, v22.4s +mla v7.4S, v13.4S, v31.s[0] +sub v13.4s, v4.4s, v19.4s +sqrdmulh v18.4S, v30.4S, v5.s[1] +add v4.4s, v4.4s, v19.4s +mla v0.4S, v14.4S, v31.s[0] +sqrdmulh v14.4S, v12.4S, v5.s[2] +sub v19.4s, v17.4s, v7.4s +mla v25.4S, 
v10.4S, v31.s[0] +sqrdmulh v10.4S, v4.4S, v26.s[1] +add v17.4s, v17.4s, v7.4s +str q19, [x0, #784] +mla v2.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v13.4S, v26.s[2] +sub v19.4s, v20.4s, v0.4s +str q17, [x0, #768] +mul v30.4S, v30.4S,v11.s[1] +add v20.4s, v20.4s, v0.4s +mul v12.4S, v12.4S,v11.s[2] +str q19, [x0, #816] +mla v30.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v25.4s +mla v12.4S, v14.4S, v31.s[0] +str q20, [x0, #800] +mul v4.4S, v4.4S,v3.s[1] +str q18, [x0, #848] +mul v13.4S, v13.4S,v3.s[2] +add v21.4s, v21.4s, v25.4s +str q21, [x0, #832] +mla v4.4S, v10.4S, v31.s[0] +sub v10.4s, v28.4s, v2.4s +str q10, [x0, #880] +mla v13.4S, v29.4S, v31.s[0] +add v28.4s, v28.4s, v2.4s +str q28, [x0, #864] +sub v26.4s, v24.4s, v30.4s +str q26, [x0, #912] +add v24.4s, v24.4s, v30.4s +str q24, [x0, #896] +sub v24.4s, v1.4s, v12.4s +str q24, [x0, #944] +add v1.4s, v1.4s, v12.4s +str q1, [x0, #928] +sub v1.4s, v6.4s, v4.4s +str q1, [x0, #976] +add v6.4s, v6.4s, v4.4s +str q6, [x0, #960] +sub v6.4s, v15.4s, v13.4s +str q6, [x0, #1008] +add v15.4s, v15.4s, v13.4s +str q15, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1528 +// Instruction count: 1524 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_11_z4_7.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_11_z4_7.s new file mode 100644 index 0000000..025fcad --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_11_z4_7.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person 
obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// +#include +modulus: // one 16-byte vector: lane 0 holds -33556993, lanes 1-3 are zero; the routine below loads it into q31 and uses v31.s[0] as the multiplier of its mla steps +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: // constants are laid out as pairs of 4-word quads carrying the same Layer/block labels; the routine below loads a pair into two q registers (e.g. [x17,#0] and [x17,#16]) and indexes lanes of the first quad in mul and of the second quad in sqrdmulh; words labelled "Layer None" are zero padding. NOTE(review): the second quad of each pair looks like the first scaled by roughly 2^31/33556993 - confirm against the generator +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global 
ntt_u32_incomplete_neon_asm_var_4_2_11_z4_7 +.global _ntt_u32_incomplete_neon_asm_var_4_2_11_z4_7 +ntt_u32_incomplete_neon_asm_var_4_2_11_z4_7: +_ntt_u32_incomplete_neon_asm_var_4_2_11_z4_7: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x0, #928] +ldr q29, [x17, #+0] +ldr q28, [x17, #+16] +sqrdmulh v27.4S, v30.4S, v28.s[0] +mul v30.4S, v30.4S,v29.s[0] +ldr q26, [x0, #992] +sqrdmulh v25.4S, v26.4S, v28.s[0] +mul v26.4S, v26.4S,v29.s[0] +ldr q24, [x0, #800] +sqrdmulh v23.4S, v24.4S, v28.s[0] +mul v24.4S, v24.4S,v29.s[0] +ldr q22, [x0, #864] +sqrdmulh v21.4S, v22.4S, v28.s[0] +mul v22.4S, v22.4S,v29.s[0] +ldr q20, [x0, #544] +mla v30.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v20.4S, v28.s[0] +ldr q19, [x0, #608] +mla v26.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v19.4S, v28.s[0] +ldr q18, [x0, #672] +mla v24.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v18.4S, v28.s[0] +ldr q17, [x0, #736] +mla v22.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v17.4S, v28.s[0] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +mul v20.4S, v20.4S,v29.s[0] +sub v2.4s, v16.4s, v30.4s +mul v19.4S, v19.4S,v29.s[0] +add v16.4s, v16.4s, v30.4s +ldr q30, [x0, #288] +ldr q1, [x0, #352] +mla v20.4S, v27.4S, v31.s[0] +sub v27.4s, v3.4s, v26.4s +mla v19.4S, v25.4S, v31.s[0] +add v3.4s, v3.4s, v26.4s +ldr q26, [x0, #32] +ldr q25, [x0, #96] +mul v18.4S, v18.4S,v29.s[0] +sub v0.4s, v30.4s, v24.4s +mul v17.4S, v17.4S,v29.s[0] +add v30.4s, v30.4s, v24.4s +ldr q24, [x0, #160] +ldr q15, [x0, #224] +mla v18.4S, v23.4S, v31.s[0] +sub v23.4s, v1.4s, v22.4s +mla v17.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v22.4s +sqrdmulh 
v22.4S, v16.4S, v28.s[1] +mul v16.4S, v16.4S,v29.s[1] +sqrdmulh v21.4S, v3.4S, v28.s[1] +sub v14.4s, v26.4s, v20.4s +mul v3.4S, v3.4S,v29.s[1] +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v30.4S, v28.s[1] +sub v13.4s, v25.4s, v19.4s +mul v30.4S, v30.4S,v29.s[1] +add v25.4s, v25.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v28.s[1] +sub v12.4s, v24.4s, v18.4s +mul v1.4S, v1.4S,v29.s[1] +add v24.4s, v24.4s, v18.4s +mla v16.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v17.4s +sqrdmulh v18.4S, v2.4S, v28.s[2] +add v15.4s, v15.4s, v17.4s +mla v3.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v27.4S, v28.s[2] +mla v30.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v0.4S, v28.s[2] +mla v1.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v23.4S, v28.s[2] +ldr q17, [x17, #+32] +ldr q11, [x17, #+48] +mul v2.4S, v2.4S,v29.s[2] +sub v10.4s, v24.4s, v16.4s +mul v27.4S, v27.4S,v29.s[2] +add v24.4s, v24.4s, v16.4s +mla v2.4S, v18.4S, v31.s[0] +sub v18.4s, v15.4s, v3.4s +mla v27.4S, v21.4S, v31.s[0] +add v15.4s, v15.4s, v3.4s +mul v0.4S, v0.4S,v29.s[2] +sub v3.4s, v26.4s, v30.4s +mul v23.4S, v23.4S,v29.s[2] +add v26.4s, v26.4s, v30.4s +mla v0.4S, v20.4S, v31.s[0] +sub v20.4s, v25.4s, v1.4s +mla v23.4S, v19.4S, v31.s[0] +add v25.4s, v25.4s, v1.4s +sqrdmulh v1.4S, v10.4S, v11.s[1] +mul v10.4S, v10.4S,v17.s[1] +sqrdmulh v19.4S, v18.4S, v11.s[1] +sub v30.4s, v12.4s, v2.4s +mul v18.4S, v18.4S,v17.s[1] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v24.4S, v11.s[0] +sub v21.4s, v22.4s, v27.4s +mul v24.4S, v24.4S,v17.s[0] +add v22.4s, v22.4s, v27.4s +sqrdmulh v27.4S, v15.4S, v11.s[0] +sub v16.4s, v14.4s, v0.4s +mul v15.4S, v15.4S,v17.s[0] +add v14.4s, v14.4s, v0.4s +ldr q0, [x17, #+64] +ldr q9, [x17, #+80] +mla v10.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v23.4s +sqrdmulh v8.4S, v12.4S, v11.s[2] +add v13.4s, v13.4s, v23.4s +mla v18.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v22.4S, v11.s[2] +mla v24.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v30.4S, v11.s[3] +mla v15.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v21.4S, v11.s[3] +ldr q23, [x17, 
#+96] +ldr q7, [x17, #+112] +mul v12.4S, v12.4S,v17.s[2] +sub v6.4s, v3.4s, v10.4s +mul v22.4S, v22.4S,v17.s[2] +add v3.4s, v3.4s, v10.4s +mla v12.4S, v8.4S, v31.s[0] +sub v8.4s, v20.4s, v18.4s +mla v22.4S, v19.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +mul v30.4S, v30.4S,v17.s[3] +sub v18.4s, v26.4s, v24.4s +mul v21.4S, v21.4S,v17.s[3] +add v26.4s, v26.4s, v24.4s +mla v30.4S, v2.4S, v31.s[0] +sub v2.4s, v25.4s, v15.4s +mla v21.4S, v27.4S, v31.s[0] +add v25.4s, v25.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v9.s[2] +mul v20.4S, v20.4S,v0.s[2] +sqrdmulh v27.4S, v8.4S, v9.s[3] +sub v24.4s, v14.4s, v12.4s +mul v8.4S, v8.4S,v0.s[3] +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v9.s[1] +sub v19.4s, v13.4s, v22.4s +mul v2.4S, v2.4S,v0.s[1] +add v13.4s, v13.4s, v22.4s +sqrdmulh v22.4S, v25.4S, v9.s[0] +sub v10.4s, v16.4s, v30.4s +mul v25.4S, v25.4S,v0.s[0] +add v16.4s, v16.4s, v30.4s +mla v20.4S, v15.4S, v31.s[0] +sub v15.4s, v1.4s, v21.4s +sqrdmulh v30.4S, v13.4S, v7.s[0] +add v1.4s, v1.4s, v21.4s +mla v8.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v19.4S, v7.s[1] +mla v2.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v1.4S, v7.s[2] +mla v25.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v15.4S, v7.s[3] +mul v13.4S, v13.4S,v23.s[0] +sub v21.4s, v3.4s, v20.4s +str q21, [x0, #352] +mul v19.4S, v19.4S,v23.s[1] +add v3.4s, v3.4s, v20.4s +str q3, [x0, #288] +mla v13.4S, v30.4S, v31.s[0] +sub v30.4s, v6.4s, v8.4s +str q30, [x0, #480] +mla v19.4S, v27.4S, v31.s[0] +add v6.4s, v6.4s, v8.4s +str q6, [x0, #416] +mul v1.4S, v1.4S,v23.s[2] +sub v6.4s, v18.4s, v2.4s +str q6, [x0, #224] +mul v15.4S, v15.4S,v23.s[3] +add v18.4s, v18.4s, v2.4s +str q18, [x0, #160] +mla v1.4S, v12.4S, v31.s[0] +sub v12.4s, v26.4s, v25.4s +str q12, [x0, #96] +mla v15.4S, v22.4S, v31.s[0] +add v26.4s, v26.4s, v25.4s +str q26, [x0, #32] +ldr q26, [x0, #944] +sqrdmulh v25.4S, v26.4S, v28.s[0] +mul v26.4S, v26.4S,v29.s[0] +ldr q22, [x0, #1008] +sqrdmulh v12.4S, v22.4S, v28.s[0] +sub v18.4s, v14.4s, v13.4s +str q18, [x0, #608] 
+mul v22.4S, v22.4S,v29.s[0] +add v14.4s, v14.4s, v13.4s +str q14, [x0, #544] +ldr q14, [x0, #816] +sqrdmulh v13.4S, v14.4S, v28.s[0] +sub v18.4s, v24.4s, v19.4s +str q18, [x0, #736] +mul v14.4S, v14.4S,v29.s[0] +add v24.4s, v24.4s, v19.4s +str q24, [x0, #672] +ldr q24, [x0, #880] +sqrdmulh v19.4S, v24.4S, v28.s[0] +sub v18.4s, v16.4s, v1.4s +str q18, [x0, #864] +mul v24.4S, v24.4S,v29.s[0] +add v16.4s, v16.4s, v1.4s +str q16, [x0, #800] +ldr q16, [x0, #560] +mla v26.4S, v25.4S, v31.s[0] +sub v25.4s, v10.4s, v15.4s +str q25, [x0, #992] +sqrdmulh v25.4S, v16.4S, v28.s[0] +add v10.4s, v10.4s, v15.4s +str q10, [x0, #928] +ldr q10, [x0, #624] +mla v22.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v10.4S, v28.s[0] +ldr q15, [x0, #688] +mla v14.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v15.4S, v28.s[0] +ldr q1, [x0, #752] +mla v24.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v1.4S, v28.s[0] +ldr q18, [x0, #432] +ldr q2, [x0, #496] +mul v16.4S, v16.4S,v29.s[0] +sub v6.4s, v18.4s, v26.4s +mul v10.4S, v10.4S,v29.s[0] +add v18.4s, v18.4s, v26.4s +ldr q26, [x0, #304] +ldr q8, [x0, #368] +mla v16.4S, v25.4S, v31.s[0] +sub v25.4s, v2.4s, v22.4s +mla v10.4S, v12.4S, v31.s[0] +add v2.4s, v2.4s, v22.4s +ldr q22, [x0, #48] +ldr q12, [x0, #112] +mul v15.4S, v15.4S,v29.s[0] +sub v27.4s, v26.4s, v14.4s +mul v1.4S, v1.4S,v29.s[0] +add v26.4s, v26.4s, v14.4s +ldr q14, [x0, #176] +ldr q30, [x0, #240] +mla v15.4S, v13.4S, v31.s[0] +sub v13.4s, v8.4s, v24.4s +mla v1.4S, v19.4S, v31.s[0] +add v8.4s, v8.4s, v24.4s +sqrdmulh v24.4S, v18.4S, v28.s[1] +mul v18.4S, v18.4S,v29.s[1] +sqrdmulh v19.4S, v2.4S, v28.s[1] +sub v3.4s, v22.4s, v16.4s +mul v2.4S, v2.4S,v29.s[1] +add v22.4s, v22.4s, v16.4s +sqrdmulh v16.4S, v26.4S, v28.s[1] +sub v20.4s, v12.4s, v10.4s +mul v26.4S, v26.4S,v29.s[1] +add v12.4s, v12.4s, v10.4s +sqrdmulh v10.4S, v8.4S, v28.s[1] +sub v21.4s, v14.4s, v15.4s +mul v8.4S, v8.4S,v29.s[1] +add v14.4s, v14.4s, v15.4s +mla v18.4S, v24.4S, v31.s[0] +sub v24.4s, v30.4s, v1.4s +sqrdmulh v15.4S, v6.4S, 
v28.s[2] +add v30.4s, v30.4s, v1.4s +mla v2.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v25.4S, v28.s[2] +mla v26.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v27.4S, v28.s[2] +mla v8.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v13.4S, v28.s[2] +mul v6.4S, v6.4S,v29.s[2] +sub v1.4s, v14.4s, v18.4s +mul v25.4S, v25.4S,v29.s[2] +add v14.4s, v14.4s, v18.4s +mla v6.4S, v15.4S, v31.s[0] +sub v15.4s, v30.4s, v2.4s +mla v25.4S, v19.4S, v31.s[0] +add v30.4s, v30.4s, v2.4s +mul v27.4S, v27.4S,v29.s[2] +sub v2.4s, v22.4s, v26.4s +mul v13.4S, v13.4S,v29.s[2] +add v22.4s, v22.4s, v26.4s +mla v27.4S, v16.4S, v31.s[0] +sub v16.4s, v12.4s, v8.4s +mla v13.4S, v10.4S, v31.s[0] +add v12.4s, v12.4s, v8.4s +sqrdmulh v8.4S, v1.4S, v11.s[1] +mul v1.4S, v1.4S,v17.s[1] +sqrdmulh v10.4S, v15.4S, v11.s[1] +sub v26.4s, v21.4s, v6.4s +mul v15.4S, v15.4S,v17.s[1] +add v21.4s, v21.4s, v6.4s +sqrdmulh v6.4S, v14.4S, v11.s[0] +sub v19.4s, v24.4s, v25.4s +mul v14.4S, v14.4S,v17.s[0] +add v24.4s, v24.4s, v25.4s +sqrdmulh v25.4S, v30.4S, v11.s[0] +sub v18.4s, v3.4s, v27.4s +mul v30.4S, v30.4S,v17.s[0] +add v3.4s, v3.4s, v27.4s +mla v1.4S, v8.4S, v31.s[0] +sub v8.4s, v20.4s, v13.4s +sqrdmulh v27.4S, v21.4S, v11.s[2] +add v20.4s, v20.4s, v13.4s +mla v15.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v24.4S, v11.s[2] +mla v14.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v26.4S, v11.s[3] +mla v30.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v19.4S, v11.s[3] +mul v21.4S, v21.4S,v17.s[2] +sub v13.4s, v2.4s, v1.4s +mul v24.4S, v24.4S,v17.s[2] +add v2.4s, v2.4s, v1.4s +mla v21.4S, v27.4S, v31.s[0] +sub v27.4s, v16.4s, v15.4s +mla v24.4S, v10.4S, v31.s[0] +add v16.4s, v16.4s, v15.4s +mul v26.4S, v26.4S,v17.s[3] +sub v15.4s, v22.4s, v14.4s +mul v19.4S, v19.4S,v17.s[3] +add v22.4s, v22.4s, v14.4s +mla v26.4S, v6.4S, v31.s[0] +sub v6.4s, v12.4s, v30.4s +mla v19.4S, v25.4S, v31.s[0] +add v12.4s, v12.4s, v30.4s +sqrdmulh v30.4S, v16.4S, v9.s[2] +mul v16.4S, v16.4S,v0.s[2] +sqrdmulh v25.4S, v27.4S, v9.s[3] +sub v14.4s, v3.4s, v21.4s +mul v27.4S, 
v27.4S,v0.s[3] +add v3.4s, v3.4s, v21.4s +sqrdmulh v21.4S, v6.4S, v9.s[1] +sub v10.4s, v20.4s, v24.4s +mul v6.4S, v6.4S,v0.s[1] +add v20.4s, v20.4s, v24.4s +sqrdmulh v24.4S, v12.4S, v9.s[0] +sub v1.4s, v18.4s, v26.4s +mul v12.4S, v12.4S,v0.s[0] +add v18.4s, v18.4s, v26.4s +mla v16.4S, v30.4S, v31.s[0] +sub v30.4s, v8.4s, v19.4s +sqrdmulh v26.4S, v20.4S, v7.s[0] +add v8.4s, v8.4s, v19.4s +mla v27.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v10.4S, v7.s[1] +mla v6.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v8.4S, v7.s[2] +mla v12.4S, v24.4S, v31.s[0] +sqrdmulh v24.4S, v30.4S, v7.s[3] +mul v20.4S, v20.4S,v23.s[0] +sub v19.4s, v2.4s, v16.4s +str q19, [x0, #368] +mul v10.4S, v10.4S,v23.s[1] +add v2.4s, v2.4s, v16.4s +str q2, [x0, #304] +mla v20.4S, v26.4S, v31.s[0] +sub v26.4s, v13.4s, v27.4s +str q26, [x0, #496] +mla v10.4S, v25.4S, v31.s[0] +add v13.4s, v13.4s, v27.4s +str q13, [x0, #432] +mul v8.4S, v8.4S,v23.s[2] +sub v13.4s, v15.4s, v6.4s +str q13, [x0, #240] +mul v30.4S, v30.4S,v23.s[3] +add v15.4s, v15.4s, v6.4s +str q15, [x0, #176] +mla v8.4S, v21.4S, v31.s[0] +sub v21.4s, v22.4s, v12.4s +str q21, [x0, #112] +mla v30.4S, v24.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +str q22, [x0, #48] +ldr q22, [x0, #896] +sqrdmulh v12.4S, v22.4S, v28.s[0] +mul v22.4S, v22.4S,v29.s[0] +ldr q24, [x0, #960] +sqrdmulh v21.4S, v24.4S, v28.s[0] +sub v15.4s, v3.4s, v20.4s +str q15, [x0, #624] +mul v24.4S, v24.4S,v29.s[0] +add v3.4s, v3.4s, v20.4s +str q3, [x0, #560] +ldr q3, [x0, #768] +sqrdmulh v20.4S, v3.4S, v28.s[0] +sub v15.4s, v14.4s, v10.4s +str q15, [x0, #752] +mul v3.4S, v3.4S,v29.s[0] +add v14.4s, v14.4s, v10.4s +str q14, [x0, #688] +ldr q14, [x0, #832] +sqrdmulh v10.4S, v14.4S, v28.s[0] +sub v15.4s, v18.4s, v8.4s +str q15, [x0, #880] +mul v14.4S, v14.4S,v29.s[0] +add v18.4s, v18.4s, v8.4s +str q18, [x0, #816] +ldr q18, [x0, #512] +mla v22.4S, v12.4S, v31.s[0] +sub v12.4s, v1.4s, v30.4s +str q12, [x0, #1008] +sqrdmulh v12.4S, v18.4S, v28.s[0] +add v1.4s, v1.4s, v30.4s +str q1, [x0, 
#944] +ldr q1, [x0, #576] +mla v24.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v1.4S, v28.s[0] +ldr q30, [x0, #640] +mla v3.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v30.4S, v28.s[0] +ldr q8, [x0, #704] +mla v14.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v8.4S, v28.s[0] +ldr q15, [x0, #384] +ldr q6, [x0, #448] +mul v18.4S, v18.4S,v29.s[0] +sub v13.4s, v15.4s, v22.4s +mul v1.4S, v1.4S,v29.s[0] +add v15.4s, v15.4s, v22.4s +ldr q22, [x0, #256] +ldr q27, [x0, #320] +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v6.4s, v24.4s +mla v1.4S, v21.4S, v31.s[0] +add v6.4s, v6.4s, v24.4s +ldr q24, [x0, #0] +ldr q21, [x0, #64] +mul v30.4S, v30.4S,v29.s[0] +sub v25.4s, v22.4s, v3.4s +mul v8.4S, v8.4S,v29.s[0] +add v22.4s, v22.4s, v3.4s +ldr q3, [x0, #128] +ldr q26, [x0, #192] +mla v30.4S, v20.4S, v31.s[0] +sub v20.4s, v27.4s, v14.4s +mla v8.4S, v10.4S, v31.s[0] +add v27.4s, v27.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v28.s[1] +mul v15.4S, v15.4S,v29.s[1] +sqrdmulh v10.4S, v6.4S, v28.s[1] +sub v2.4s, v24.4s, v18.4s +mul v6.4S, v6.4S,v29.s[1] +add v24.4s, v24.4s, v18.4s +sqrdmulh v18.4S, v22.4S, v28.s[1] +sub v16.4s, v21.4s, v1.4s +mul v22.4S, v22.4S,v29.s[1] +add v21.4s, v21.4s, v1.4s +sqrdmulh v1.4S, v27.4S, v28.s[1] +sub v19.4s, v3.4s, v30.4s +mul v27.4S, v27.4S,v29.s[1] +add v3.4s, v3.4s, v30.4s +mla v15.4S, v14.4S, v31.s[0] +sub v14.4s, v26.4s, v8.4s +sqrdmulh v30.4S, v13.4S, v28.s[2] +add v26.4s, v26.4s, v8.4s +mla v6.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v12.4S, v28.s[2] +mla v22.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v25.4S, v28.s[2] +mla v27.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v20.4S, v28.s[2] +mul v13.4S, v13.4S,v29.s[2] +sub v8.4s, v3.4s, v15.4s +mul v12.4S, v12.4S,v29.s[2] +add v3.4s, v3.4s, v15.4s +mla v13.4S, v30.4S, v31.s[0] +sub v30.4s, v26.4s, v6.4s +mla v12.4S, v10.4S, v31.s[0] +add v26.4s, v26.4s, v6.4s +mul v25.4S, v25.4S,v29.s[2] +sub v6.4s, v24.4s, v22.4s +mul v20.4S, v20.4S,v29.s[2] +add v24.4s, v24.4s, v22.4s +mla v25.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v27.4s +mla 
v20.4S, v1.4S, v31.s[0] +add v21.4s, v21.4s, v27.4s +sqrdmulh v27.4S, v8.4S, v11.s[1] +mul v8.4S, v8.4S,v17.s[1] +sqrdmulh v1.4S, v30.4S, v11.s[1] +sub v22.4s, v19.4s, v13.4s +mul v30.4S, v30.4S,v17.s[1] +add v19.4s, v19.4s, v13.4s +sqrdmulh v13.4S, v3.4S, v11.s[0] +sub v10.4s, v14.4s, v12.4s +mul v3.4S, v3.4S,v17.s[0] +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v26.4S, v11.s[0] +sub v15.4s, v2.4s, v25.4s +mul v26.4S, v26.4S,v17.s[0] +add v2.4s, v2.4s, v25.4s +mla v8.4S, v27.4S, v31.s[0] +sub v27.4s, v16.4s, v20.4s +sqrdmulh v25.4S, v19.4S, v11.s[2] +add v16.4s, v16.4s, v20.4s +mla v30.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v14.4S, v11.s[2] +mla v3.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v22.4S, v11.s[3] +mla v26.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v10.4S, v11.s[3] +mul v19.4S, v19.4S,v17.s[2] +sub v20.4s, v6.4s, v8.4s +mul v14.4S, v14.4S,v17.s[2] +add v6.4s, v6.4s, v8.4s +mla v19.4S, v25.4S, v31.s[0] +sub v25.4s, v18.4s, v30.4s +mla v14.4S, v1.4S, v31.s[0] +add v18.4s, v18.4s, v30.4s +mul v22.4S, v22.4S,v17.s[3] +sub v30.4s, v24.4s, v3.4s +mul v10.4S, v10.4S,v17.s[3] +add v24.4s, v24.4s, v3.4s +mla v22.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v26.4s +mla v10.4S, v12.4S, v31.s[0] +add v21.4s, v21.4s, v26.4s +sqrdmulh v26.4S, v18.4S, v9.s[2] +mul v18.4S, v18.4S,v0.s[2] +sqrdmulh v12.4S, v25.4S, v9.s[3] +sub v3.4s, v2.4s, v19.4s +mul v25.4S, v25.4S,v0.s[3] +add v2.4s, v2.4s, v19.4s +sqrdmulh v19.4S, v13.4S, v9.s[1] +sub v1.4s, v16.4s, v14.4s +mul v13.4S, v13.4S,v0.s[1] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v9.s[0] +sub v8.4s, v15.4s, v22.4s +mul v21.4S, v21.4S,v0.s[0] +add v15.4s, v15.4s, v22.4s +mla v18.4S, v26.4S, v31.s[0] +sub v26.4s, v27.4s, v10.4s +sqrdmulh v22.4S, v16.4S, v7.s[0] +add v27.4s, v27.4s, v10.4s +mla v25.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v1.4S, v7.s[1] +mla v13.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v27.4S, v7.s[2] +mla v21.4S, v14.4S, v31.s[0] +sqrdmulh v14.4S, v26.4S, v7.s[3] +mul v16.4S, v16.4S,v23.s[0] +sub v10.4s, v6.4s, 
v18.4s +str q10, [x0, #320] +mul v1.4S, v1.4S,v23.s[1] +add v6.4s, v6.4s, v18.4s +str q6, [x0, #256] +mla v16.4S, v22.4S, v31.s[0] +sub v22.4s, v20.4s, v25.4s +str q22, [x0, #448] +mla v1.4S, v12.4S, v31.s[0] +add v20.4s, v20.4s, v25.4s +str q20, [x0, #384] +mul v27.4S, v27.4S,v23.s[2] +sub v20.4s, v30.4s, v13.4s +str q20, [x0, #192] +mul v26.4S, v26.4S,v23.s[3] +add v30.4s, v30.4s, v13.4s +str q30, [x0, #128] +mla v27.4S, v19.4S, v31.s[0] +sub v19.4s, v24.4s, v21.4s +str q19, [x0, #64] +mla v26.4S, v14.4S, v31.s[0] +add v24.4s, v24.4s, v21.4s +str q24, [x0, #0] +ldr q24, [x0, #912] +sqrdmulh v21.4S, v24.4S, v28.s[0] +mul v24.4S, v24.4S,v29.s[0] +ldr q14, [x0, #976] +sqrdmulh v19.4S, v14.4S, v28.s[0] +sub v30.4s, v2.4s, v16.4s +str q30, [x0, #576] +mul v14.4S, v14.4S,v29.s[0] +add v2.4s, v2.4s, v16.4s +str q2, [x0, #512] +ldr q2, [x0, #784] +sqrdmulh v16.4S, v2.4S, v28.s[0] +sub v30.4s, v3.4s, v1.4s +str q30, [x0, #704] +mul v2.4S, v2.4S,v29.s[0] +add v3.4s, v3.4s, v1.4s +str q3, [x0, #640] +ldr q3, [x0, #848] +sqrdmulh v1.4S, v3.4S, v28.s[0] +sub v30.4s, v15.4s, v27.4s +str q30, [x0, #832] +mul v3.4S, v3.4S,v29.s[0] +add v15.4s, v15.4s, v27.4s +str q15, [x0, #768] +ldr q15, [x0, #528] +mla v24.4S, v21.4S, v31.s[0] +sub v21.4s, v8.4s, v26.4s +str q21, [x0, #960] +sqrdmulh v21.4S, v15.4S, v28.s[0] +add v8.4s, v8.4s, v26.4s +str q8, [x0, #896] +ldr q8, [x0, #592] +mla v14.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v8.4S, v28.s[0] +ldr q26, [x0, #656] +mla v2.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v26.4S, v28.s[0] +ldr q27, [x0, #720] +mla v3.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v27.4S, v28.s[0] +ldr q30, [x0, #400] +ldr q13, [x0, #464] +mul v15.4S, v15.4S,v29.s[0] +sub v20.4s, v30.4s, v24.4s +mul v8.4S, v8.4S,v29.s[0] +add v30.4s, v30.4s, v24.4s +ldr q24, [x0, #272] +ldr q25, [x0, #336] +mla v15.4S, v21.4S, v31.s[0] +sub v21.4s, v13.4s, v14.4s +mla v8.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v14.4s +ldr q14, [x0, #16] +ldr q19, [x0, #80] +mul v26.4S, v26.4S,v29.s[0] 
+sub v12.4s, v24.4s, v2.4s +mul v27.4S, v27.4S,v29.s[0] +add v24.4s, v24.4s, v2.4s +ldr q2, [x0, #144] +ldr q22, [x0, #208] +mla v26.4S, v16.4S, v31.s[0] +sub v16.4s, v25.4s, v3.4s +mla v27.4S, v1.4S, v31.s[0] +add v25.4s, v25.4s, v3.4s +sqrdmulh v3.4S, v30.4S, v28.s[1] +mul v30.4S, v30.4S,v29.s[1] +sqrdmulh v1.4S, v13.4S, v28.s[1] +sub v6.4s, v14.4s, v15.4s +mul v13.4S, v13.4S,v29.s[1] +add v14.4s, v14.4s, v15.4s +sqrdmulh v15.4S, v24.4S, v28.s[1] +sub v18.4s, v19.4s, v8.4s +mul v24.4S, v24.4S,v29.s[1] +add v19.4s, v19.4s, v8.4s +sqrdmulh v8.4S, v25.4S, v28.s[1] +sub v10.4s, v2.4s, v26.4s +mul v25.4S, v25.4S,v29.s[1] +add v2.4s, v2.4s, v26.4s +mla v30.4S, v3.4S, v31.s[0] +sub v3.4s, v22.4s, v27.4s +sqrdmulh v26.4S, v20.4S, v28.s[2] +add v22.4s, v22.4s, v27.4s +mla v13.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v21.4S, v28.s[2] +mla v24.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v12.4S, v28.s[2] +mla v25.4S, v8.4S, v31.s[0] +sqrdmulh v8.4S, v16.4S, v28.s[2] +mul v20.4S, v20.4S,v29.s[2] +sub v27.4s, v2.4s, v30.4s +mul v21.4S, v21.4S,v29.s[2] +add v2.4s, v2.4s, v30.4s +mla v20.4S, v26.4S, v31.s[0] +sub v26.4s, v22.4s, v13.4s +mla v21.4S, v1.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +mul v12.4S, v12.4S,v29.s[2] +sub v13.4s, v14.4s, v24.4s +mul v16.4S, v16.4S,v29.s[2] +add v14.4s, v14.4s, v24.4s +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v19.4s, v25.4s +mla v16.4S, v8.4S, v31.s[0] +add v19.4s, v19.4s, v25.4s +sqrdmulh v28.4S, v27.4S, v11.s[1] +mul v27.4S, v27.4S,v17.s[1] +sqrdmulh v29.4S, v26.4S, v11.s[1] +sub v25.4s, v10.4s, v20.4s +mul v26.4S, v26.4S,v17.s[1] +add v10.4s, v10.4s, v20.4s +sqrdmulh v20.4S, v2.4S, v11.s[0] +sub v8.4s, v3.4s, v21.4s +mul v2.4S, v2.4S,v17.s[0] +add v3.4s, v3.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v11.s[0] +sub v24.4s, v6.4s, v12.4s +mul v22.4S, v22.4S,v17.s[0] +add v6.4s, v6.4s, v12.4s +mla v27.4S, v28.4S, v31.s[0] +sub v28.4s, v18.4s, v16.4s +sqrdmulh v12.4S, v10.4S, v11.s[2] +add v18.4s, v18.4s, v16.4s +mla v26.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, 
v3.4S, v11.s[2] +mla v2.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v25.4S, v11.s[3] +mla v22.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v8.4S, v11.s[3] +mul v10.4S, v10.4S,v17.s[2] +sub v16.4s, v13.4s, v27.4s +mul v3.4S, v3.4S,v17.s[2] +add v13.4s, v13.4s, v27.4s +mla v10.4S, v12.4S, v31.s[0] +sub v12.4s, v15.4s, v26.4s +mla v3.4S, v29.4S, v31.s[0] +add v15.4s, v15.4s, v26.4s +mul v25.4S, v25.4S,v17.s[3] +sub v26.4s, v14.4s, v2.4s +mul v8.4S, v8.4S,v17.s[3] +add v14.4s, v14.4s, v2.4s +mla v25.4S, v20.4S, v31.s[0] +sub v20.4s, v19.4s, v22.4s +mla v8.4S, v21.4S, v31.s[0] +add v19.4s, v19.4s, v22.4s +sqrdmulh v11.4S, v15.4S, v9.s[2] +mul v15.4S, v15.4S,v0.s[2] +sqrdmulh v17.4S, v12.4S, v9.s[3] +sub v22.4s, v6.4s, v10.4s +mul v12.4S, v12.4S,v0.s[3] +add v6.4s, v6.4s, v10.4s +sqrdmulh v10.4S, v20.4S, v9.s[1] +sub v21.4s, v18.4s, v3.4s +mul v20.4S, v20.4S,v0.s[1] +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v19.4S, v9.s[0] +sub v2.4s, v24.4s, v25.4s +mul v19.4S, v19.4S,v0.s[0] +add v24.4s, v24.4s, v25.4s +mla v15.4S, v11.4S, v31.s[0] +sub v11.4s, v28.4s, v8.4s +sqrdmulh v25.4S, v18.4S, v7.s[0] +add v28.4s, v28.4s, v8.4s +mla v12.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v21.4S, v7.s[1] +mla v20.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v28.4S, v7.s[2] +mla v19.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v11.4S, v7.s[3] +mul v18.4S, v18.4S,v23.s[0] +sub v8.4s, v13.4s, v15.4s +str q8, [x0, #336] +mul v21.4S, v21.4S,v23.s[1] +add v13.4s, v13.4s, v15.4s +str q13, [x0, #272] +mla v18.4S, v25.4S, v31.s[0] +sub v25.4s, v16.4s, v12.4s +str q25, [x0, #464] +mla v21.4S, v17.4S, v31.s[0] +add v16.4s, v16.4s, v12.4s +str q16, [x0, #400] +mul v28.4S, v28.4S,v23.s[2] +sub v16.4s, v26.4s, v20.4s +str q16, [x0, #208] +mul v11.4S, v11.4S,v23.s[3] +add v26.4s, v26.4s, v20.4s +str q26, [x0, #144] +mla v28.4S, v10.4S, v31.s[0] +sub v10.4s, v14.4s, v19.4s +str q10, [x0, #80] +mla v11.4S, v3.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +str q14, [x0, #16] +sub v7.4s, v6.4s, v18.4s +str q7, [x0, #592] +add v6.4s, 
v6.4s, v18.4s +str q6, [x0, #528] +sub v6.4s, v22.4s, v21.4s +str q6, [x0, #720] +add v22.4s, v22.4s, v21.4s +str q22, [x0, #656] +sub v22.4s, v24.4s, v28.4s +str q22, [x0, #848] +add v24.4s, v24.4s, v28.4s +str q24, [x0, #784] +sub v24.4s, v2.4s, v11.4s +str q24, [x0, #976] +add v2.4s, v2.4s, v11.4s +str q2, [x0, #912] +ldr q4, [x0, #224] +ldr q5, [x0, #160] +ldr q30, [x0, #32] +ldr q1, [x17, #+128] +ldr q27, [x17, #+144] +sqrdmulh v29.4S, v30.4S, v27.s[0] +mul v30.4S, v30.4S,v1.s[0] +ldr q8, [x0, #48] +sqrdmulh v15.4S, v8.4S, v27.s[0] +mul v8.4S, v8.4S,v1.s[0] +ldr q13, [x17, #+160] +ldr q25, [x17, #+176] +ldr q17, [x0, #96] +sqrdmulh v12.4S, v17.4S, v25.s[0] +mul v17.4S, v17.4S,v13.s[0] +ldr q16, [x0, #112] +sqrdmulh v20.4S, v16.4S, v25.s[0] +mul v16.4S, v16.4S,v13.s[0] +ldr q26, [x17, #+192] +ldr q10, [x17, #+208] +mla v30.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v5.4S, v10.s[0] +ldr q3, [x0, #176] +mla v8.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v3.4S, v10.s[0] +ldr q19, [x17, #+224] +ldr q14, [x17, #+240] +mla v17.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v4.4S, v14.s[0] +ldr q0, [x0, #240] +mla v16.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v0.4S, v14.s[0] +ldr q9, [x0, #0] +ldr q23, [x0, #128] +mul v5.4S, v5.4S,v26.s[0] +sub v7.4s, v9.4s, v30.4s +ldr q18, [x0, #16] +mul v3.4S, v3.4S,v26.s[0] +add v9.4s, v9.4s, v30.4s +ldr q30, [x0, #144] +mla v5.4S, v29.4S, v31.s[0] +sub v29.4s, v18.4s, v8.4s +ldr q6, [x0, #64] +mla v3.4S, v15.4S, v31.s[0] +add v18.4s, v18.4s, v8.4s +ldr q8, [x0, #192] +mul v4.4S, v4.4S,v19.s[0] +sub v15.4s, v6.4s, v17.4s +ldr q21, [x0, #80] +mul v0.4S, v0.4S,v19.s[0] +add v6.4s, v6.4s, v17.4s +ldr q17, [x0, #208] +mla v4.4S, v12.4S, v31.s[0] +mla v0.4S, v20.4S, v31.s[0] +sub v20.4s, v21.4s, v16.4s +sqrdmulh v12.4S, v18.4S, v27.s[1] +add v21.4s, v21.4s, v16.4s +mul v18.4S, v18.4S,v1.s[1] +sqrdmulh v16.4S, v29.4S, v27.s[2] +sub v22.4s, v23.4s, v5.4s +mul v29.4S, v29.4S,v1.s[2] +add v23.4s, v23.4s, v5.4s +sqrdmulh v27.4S, v21.4S, v25.s[1] +sub v1.4s, 
v30.4s, v3.4s +mul v21.4S, v21.4S,v13.s[1] +add v30.4s, v30.4s, v3.4s +sqrdmulh v3.4S, v20.4S, v25.s[2] +sub v5.4s, v8.4s, v4.4s +mul v20.4S, v20.4S,v13.s[2] +add v8.4s, v8.4s, v4.4s +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v0.4s +ldr q25, [x0, #480] +sqrdmulh v13.4S, v30.4S, v10.s[1] +add v17.4s, v17.4s, v0.4s +mla v29.4S, v16.4S, v31.s[0] +ldr q16, [x0, #416] +sqrdmulh v0.4S, v1.4S, v10.s[2] +sub v4.4s, v9.4s, v18.4s +mla v21.4S, v27.4S, v31.s[0] +ldr q27, [x0, #288] +sqrdmulh v28.4S, v17.4S, v14.s[1] +add v9.4s, v9.4s, v18.4s +str q4, [x0, #16] +mla v20.4S, v3.4S, v31.s[0] +ldr q3, [x17, #+256] +ldr q4, [x17, #+272] +sqrdmulh v18.4S, v12.4S, v14.s[2] +sub v24.4s, v7.4s, v29.4s +str q9, [x0, #0] +mul v30.4S, v30.4S,v26.s[1] +add v7.4s, v7.4s, v29.4s +mul v1.4S, v1.4S,v26.s[2] +str q24, [x0, #48] +mla v30.4S, v13.4S, v31.s[0] +sub v13.4s, v6.4s, v21.4s +mla v1.4S, v0.4S, v31.s[0] +str q7, [x0, #32] +mul v17.4S, v17.4S,v19.s[1] +str q13, [x0, #80] +mul v12.4S, v12.4S,v19.s[2] +add v6.4s, v6.4s, v21.4s +str q6, [x0, #64] +mla v17.4S, v28.4S, v31.s[0] +sub v28.4s, v15.4s, v20.4s +str q28, [x0, #112] +mla v12.4S, v18.4S, v31.s[0] +add v15.4s, v15.4s, v20.4s +str q15, [x0, #96] +sqrdmulh v14.4S, v27.4S, v4.s[0] +sub v19.4s, v23.4s, v30.4s +mul v27.4S, v27.4S,v3.s[0] +str q19, [x0, #144] +ldr q19, [x0, #304] +sqrdmulh v15.4S, v19.4S, v4.s[0] +add v23.4s, v23.4s, v30.4s +mul v19.4S, v19.4S,v3.s[0] +str q23, [x0, #128] +ldr q23, [x17, #+288] +ldr q30, [x17, #+304] +ldr q20, [x0, #352] +sqrdmulh v18.4S, v20.4S, v30.s[0] +sub v28.4s, v22.4s, v1.4s +mul v20.4S, v20.4S,v23.s[0] +str q28, [x0, #176] +ldr q28, [x0, #368] +sqrdmulh v6.4S, v28.4S, v30.s[0] +add v22.4s, v22.4s, v1.4s +mul v28.4S, v28.4S,v23.s[0] +str q22, [x0, #160] +ldr q22, [x17, #+320] +ldr q1, [x17, #+336] +mla v27.4S, v14.4S, v31.s[0] +sub v14.4s, v8.4s, v17.4s +sqrdmulh v21.4S, v16.4S, v1.s[0] +str q14, [x0, #208] +ldr q14, [x0, #432] +mla v19.4S, v15.4S, v31.s[0] +add v8.4s, v8.4s, v17.4s 
+sqrdmulh v17.4S, v14.4S, v1.s[0] +str q8, [x0, #192] +ldr q8, [x17, #+352] +ldr q15, [x17, #+368] +mla v20.4S, v18.4S, v31.s[0] +sub v18.4s, v5.4s, v12.4s +sqrdmulh v13.4S, v25.4S, v15.s[0] +str q18, [x0, #240] +ldr q18, [x0, #496] +mla v28.4S, v6.4S, v31.s[0] +add v5.4s, v5.4s, v12.4s +sqrdmulh v12.4S, v18.4S, v15.s[0] +str q5, [x0, #224] +ldr q5, [x0, #256] +ldr q6, [x0, #384] +mul v16.4S, v16.4S,v22.s[0] +sub v10.4s, v5.4s, v27.4s +ldr q26, [x0, #272] +mul v14.4S, v14.4S,v22.s[0] +add v5.4s, v5.4s, v27.4s +ldr q27, [x0, #400] +mla v16.4S, v21.4S, v31.s[0] +sub v21.4s, v26.4s, v19.4s +ldr q7, [x0, #320] +mla v14.4S, v17.4S, v31.s[0] +add v26.4s, v26.4s, v19.4s +ldr q19, [x0, #448] +mul v25.4S, v25.4S,v8.s[0] +sub v17.4s, v7.4s, v20.4s +ldr q0, [x0, #336] +mul v18.4S, v18.4S,v8.s[0] +add v7.4s, v7.4s, v20.4s +ldr q20, [x0, #464] +mla v25.4S, v13.4S, v31.s[0] +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v0.4s, v28.4s +sqrdmulh v13.4S, v26.4S, v4.s[1] +add v0.4s, v0.4s, v28.4s +mul v26.4S, v26.4S,v3.s[1] +sqrdmulh v28.4S, v21.4S, v4.s[2] +sub v24.4s, v6.4s, v16.4s +mul v21.4S, v21.4S,v3.s[2] +add v6.4s, v6.4s, v16.4s +sqrdmulh v4.4S, v0.4S, v30.s[1] +sub v3.4s, v27.4s, v14.4s +mul v0.4S, v0.4S,v23.s[1] +add v27.4s, v27.4s, v14.4s +sqrdmulh v14.4S, v12.4S, v30.s[2] +sub v16.4s, v19.4s, v25.4s +mul v12.4S, v12.4S,v23.s[2] +add v19.4s, v19.4s, v25.4s +mla v26.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v18.4s +ldr q30, [x0, #736] +sqrdmulh v23.4S, v27.4S, v1.s[1] +add v20.4s, v20.4s, v18.4s +mla v21.4S, v28.4S, v31.s[0] +ldr q28, [x0, #672] +sqrdmulh v18.4S, v3.4S, v1.s[2] +sub v25.4s, v5.4s, v26.4s +mla v0.4S, v4.4S, v31.s[0] +ldr q4, [x0, #544] +sqrdmulh v29.4S, v20.4S, v15.s[1] +add v5.4s, v5.4s, v26.4s +str q25, [x0, #272] +mla v12.4S, v14.4S, v31.s[0] +ldr q14, [x17, #+384] +ldr q25, [x17, #+400] +sqrdmulh v26.4S, v13.4S, v15.s[2] +sub v9.4s, v10.4s, v21.4s +str q5, [x0, #256] +mul v27.4S, v27.4S,v22.s[1] +add v10.4s, v10.4s, v21.4s +mul v3.4S, v3.4S,v22.s[2] +str 
q9, [x0, #304] +mla v27.4S, v23.4S, v31.s[0] +sub v23.4s, v7.4s, v0.4s +mla v3.4S, v18.4S, v31.s[0] +str q10, [x0, #288] +mul v20.4S, v20.4S,v8.s[1] +str q23, [x0, #336] +mul v13.4S, v13.4S,v8.s[2] +add v7.4s, v7.4s, v0.4s +str q7, [x0, #320] +mla v20.4S, v29.4S, v31.s[0] +sub v29.4s, v17.4s, v12.4s +str q29, [x0, #368] +mla v13.4S, v26.4S, v31.s[0] +add v17.4s, v17.4s, v12.4s +str q17, [x0, #352] +sqrdmulh v15.4S, v4.4S, v25.s[0] +sub v8.4s, v6.4s, v27.4s +mul v4.4S, v4.4S,v14.s[0] +str q8, [x0, #400] +ldr q8, [x0, #560] +sqrdmulh v17.4S, v8.4S, v25.s[0] +add v6.4s, v6.4s, v27.4s +mul v8.4S, v8.4S,v14.s[0] +str q6, [x0, #384] +ldr q6, [x17, #+416] +ldr q27, [x17, #+432] +ldr q12, [x0, #608] +sqrdmulh v26.4S, v12.4S, v27.s[0] +sub v29.4s, v24.4s, v3.4s +mul v12.4S, v12.4S,v6.s[0] +str q29, [x0, #432] +ldr q29, [x0, #624] +sqrdmulh v7.4S, v29.4S, v27.s[0] +add v24.4s, v24.4s, v3.4s +mul v29.4S, v29.4S,v6.s[0] +str q24, [x0, #416] +ldr q24, [x17, #+448] +ldr q3, [x17, #+464] +mla v4.4S, v15.4S, v31.s[0] +sub v15.4s, v19.4s, v20.4s +sqrdmulh v0.4S, v28.4S, v3.s[0] +str q15, [x0, #464] +ldr q15, [x0, #688] +mla v8.4S, v17.4S, v31.s[0] +add v19.4s, v19.4s, v20.4s +sqrdmulh v20.4S, v15.4S, v3.s[0] +str q19, [x0, #448] +ldr q19, [x17, #+480] +ldr q17, [x17, #+496] +mla v12.4S, v26.4S, v31.s[0] +sub v26.4s, v16.4s, v13.4s +sqrdmulh v23.4S, v30.4S, v17.s[0] +str q26, [x0, #496] +ldr q26, [x0, #752] +mla v29.4S, v7.4S, v31.s[0] +add v16.4s, v16.4s, v13.4s +sqrdmulh v13.4S, v26.4S, v17.s[0] +str q16, [x0, #480] +ldr q16, [x0, #512] +ldr q7, [x0, #640] +mul v28.4S, v28.4S,v24.s[0] +sub v1.4s, v16.4s, v4.4s +ldr q22, [x0, #528] +mul v15.4S, v15.4S,v24.s[0] +add v16.4s, v16.4s, v4.4s +ldr q4, [x0, #656] +mla v28.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v8.4s +ldr q10, [x0, #576] +mla v15.4S, v20.4S, v31.s[0] +add v22.4s, v22.4s, v8.4s +ldr q8, [x0, #704] +mul v30.4S, v30.4S,v19.s[0] +sub v20.4s, v10.4s, v12.4s +ldr q18, [x0, #592] +mul v26.4S, v26.4S,v19.s[0] +add v10.4s, v10.4s, 
v12.4s +ldr q12, [x0, #720] +mla v30.4S, v23.4S, v31.s[0] +mla v26.4S, v13.4S, v31.s[0] +sub v13.4s, v18.4s, v29.4s +sqrdmulh v23.4S, v22.4S, v25.s[1] +add v18.4s, v18.4s, v29.4s +mul v22.4S, v22.4S,v14.s[1] +sqrdmulh v29.4S, v0.4S, v25.s[2] +sub v9.4s, v7.4s, v28.4s +mul v0.4S, v0.4S,v14.s[2] +add v7.4s, v7.4s, v28.4s +sqrdmulh v25.4S, v18.4S, v27.s[1] +sub v14.4s, v4.4s, v15.4s +mul v18.4S, v18.4S,v6.s[1] +add v4.4s, v4.4s, v15.4s +sqrdmulh v15.4S, v13.4S, v27.s[2] +sub v28.4s, v8.4s, v30.4s +mul v13.4S, v13.4S,v6.s[2] +add v8.4s, v8.4s, v30.4s +mla v22.4S, v23.4S, v31.s[0] +sub v23.4s, v12.4s, v26.4s +ldr q27, [x0, #992] +sqrdmulh v6.4S, v4.4S, v3.s[1] +add v12.4s, v12.4s, v26.4s +mla v0.4S, v29.4S, v31.s[0] +ldr q29, [x0, #928] +sqrdmulh v26.4S, v14.4S, v3.s[2] +sub v30.4s, v16.4s, v22.4s +mla v18.4S, v25.4S, v31.s[0] +ldr q25, [x0, #800] +sqrdmulh v21.4S, v12.4S, v17.s[1] +add v16.4s, v16.4s, v22.4s +str q30, [x0, #528] +mla v13.4S, v15.4S, v31.s[0] +ldr q15, [x17, #+512] +ldr q30, [x17, #+528] +sqrdmulh v22.4S, v23.4S, v17.s[2] +sub v5.4s, v1.4s, v0.4s +str q16, [x0, #512] +mul v4.4S, v4.4S,v24.s[1] +add v1.4s, v1.4s, v0.4s +mul v14.4S, v14.4S,v24.s[2] +str q5, [x0, #560] +mla v4.4S, v6.4S, v31.s[0] +sub v6.4s, v10.4s, v18.4s +mla v14.4S, v26.4S, v31.s[0] +str q1, [x0, #544] +mul v12.4S, v12.4S,v19.s[1] +str q6, [x0, #592] +mul v23.4S, v23.4S,v19.s[2] +add v10.4s, v10.4s, v18.4s +str q10, [x0, #576] +mla v12.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v13.4s +str q21, [x0, #624] +mla v23.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v13.4s +str q20, [x0, #608] +sqrdmulh v17.4S, v25.4S, v30.s[0] +sub v19.4s, v7.4s, v4.4s +mul v25.4S, v25.4S,v15.s[0] +str q19, [x0, #656] +ldr q19, [x0, #816] +sqrdmulh v20.4S, v19.4S, v30.s[0] +add v7.4s, v7.4s, v4.4s +mul v19.4S, v19.4S,v15.s[0] +str q7, [x0, #640] +ldr q7, [x17, #+544] +ldr q4, [x17, #+560] +ldr q13, [x0, #864] +sqrdmulh v22.4S, v13.4S, v4.s[0] +sub v21.4s, v9.4s, v14.4s +mul v13.4S, v13.4S,v7.s[0] +str q21, [x0, 
#688] +ldr q21, [x0, #880] +sqrdmulh v10.4S, v21.4S, v4.s[0] +add v9.4s, v9.4s, v14.4s +mul v21.4S, v21.4S,v7.s[0] +str q9, [x0, #672] +ldr q9, [x17, #+576] +ldr q14, [x17, #+592] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v8.4s, v12.4s +sqrdmulh v18.4S, v29.4S, v14.s[0] +str q17, [x0, #720] +ldr q17, [x0, #944] +mla v19.4S, v20.4S, v31.s[0] +add v8.4s, v8.4s, v12.4s +sqrdmulh v12.4S, v17.4S, v14.s[0] +str q8, [x0, #704] +ldr q8, [x17, #+608] +ldr q20, [x17, #+624] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v28.4s, v23.4s +sqrdmulh v6.4S, v27.4S, v20.s[0] +str q22, [x0, #752] +ldr q22, [x0, #1008] +mla v21.4S, v10.4S, v31.s[0] +add v28.4s, v28.4s, v23.4s +sqrdmulh v23.4S, v22.4S, v20.s[0] +str q28, [x0, #736] +ldr q28, [x0, #768] +ldr q10, [x0, #896] +mul v29.4S, v29.4S,v9.s[0] +sub v3.4s, v28.4s, v25.4s +ldr q24, [x0, #784] +mul v17.4S, v17.4S,v9.s[0] +add v28.4s, v28.4s, v25.4s +ldr q25, [x0, #912] +mla v29.4S, v18.4S, v31.s[0] +sub v18.4s, v24.4s, v19.4s +ldr q1, [x0, #832] +mla v17.4S, v12.4S, v31.s[0] +add v24.4s, v24.4s, v19.4s +ldr q19, [x0, #960] +mul v27.4S, v27.4S,v8.s[0] +sub v12.4s, v1.4s, v13.4s +ldr q26, [x0, #848] +mul v22.4S, v22.4S,v8.s[0] +add v1.4s, v1.4s, v13.4s +ldr q13, [x0, #976] +mla v27.4S, v6.4S, v31.s[0] +mla v22.4S, v23.4S, v31.s[0] +sub v23.4s, v26.4s, v21.4s +sqrdmulh v6.4S, v24.4S, v30.s[1] +add v26.4s, v26.4s, v21.4s +mul v24.4S, v24.4S,v15.s[1] +sqrdmulh v21.4S, v18.4S, v30.s[2] +sub v5.4s, v10.4s, v29.4s +mul v18.4S, v18.4S,v15.s[2] +add v10.4s, v10.4s, v29.4s +sqrdmulh v30.4S, v26.4S, v4.s[1] +sub v15.4s, v25.4s, v17.4s +mul v26.4S, v26.4S,v7.s[1] +add v25.4s, v25.4s, v17.4s +sqrdmulh v17.4S, v23.4S, v4.s[2] +sub v29.4s, v19.4s, v27.4s +mul v23.4S, v23.4S,v7.s[2] +add v19.4s, v19.4s, v27.4s +mla v24.4S, v6.4S, v31.s[0] +sub v6.4s, v13.4s, v22.4s +sqrdmulh v4.4S, v25.4S, v14.s[1] +add v13.4s, v13.4s, v22.4s +mla v18.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v15.4S, v14.s[2] +sub v22.4s, v28.4s, v24.4s +mla v26.4S, v30.4S, v31.s[0] 
+sqrdmulh v30.4S, v13.4S, v20.s[1] +add v28.4s, v28.4s, v24.4s +str q22, [x0, #784] +mla v23.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v6.4S, v20.s[2] +sub v22.4s, v3.4s, v18.4s +str q28, [x0, #768] +mul v25.4S, v25.4S,v9.s[1] +add v3.4s, v3.4s, v18.4s +mul v15.4S, v15.4S,v9.s[2] +str q22, [x0, #816] +mla v25.4S, v4.4S, v31.s[0] +sub v4.4s, v1.4s, v26.4s +mla v15.4S, v21.4S, v31.s[0] +str q3, [x0, #800] +mul v13.4S, v13.4S,v8.s[1] +str q4, [x0, #848] +mul v6.4S, v6.4S,v8.s[2] +add v1.4s, v1.4s, v26.4s +str q1, [x0, #832] +mla v13.4S, v30.4S, v31.s[0] +sub v30.4s, v12.4s, v23.4s +str q30, [x0, #880] +mla v6.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v23.4s +str q12, [x0, #864] +sub v20.4s, v10.4s, v25.4s +str q20, [x0, #912] +add v10.4s, v10.4s, v25.4s +str q10, [x0, #896] +sub v10.4s, v5.4s, v15.4s +str q10, [x0, #944] +add v5.4s, v5.4s, v15.4s +str q5, [x0, #928] +sub v5.4s, v19.4s, v13.4s +str q5, [x0, #976] +add v19.4s, v19.4s, v13.4s +str q19, [x0, #960] +sub v19.4s, v29.4s, v6.4s +str q19, [x0, #1008] +add v29.4s, v29.4s, v6.4s +str q29, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_12_z4_7.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_12_z4_7.s new file mode 100644 index 0000000..b2cda5a --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_12_z4_7.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy 
+/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// +#include +modulus: // -q in lane 0 (q = 33556993), zero-padded to four words; the routine below loads these 16 bytes into v31 and uses v31.s[0] as the mla multiplier +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: // 4-word groups come in pairs with matching layer/block labels: the routine below uses the first group of a pair as the mul operand and the second as the sqrdmulh operand (presumably the root pre-scaled by 2^31/q; verify against the generator) +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global 
ntt_u32_incomplete_neon_asm_var_4_2_12_z4_7 +.global _ntt_u32_incomplete_neon_asm_var_4_2_12_z4_7 +ntt_u32_incomplete_neon_asm_var_4_2_12_z4_7: +_ntt_u32_incomplete_neon_asm_var_4_2_12_z4_7: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x0, #928] +ldr q29, [x17, #+0] +ldr q28, [x17, #+16] +sqrdmulh v27.4S, v30.4S, v28.s[0] +mul v30.4S, v30.4S,v29.s[0] +ldr q26, [x0, #992] +sqrdmulh v25.4S, v26.4S, v28.s[0] +mul v26.4S, v26.4S,v29.s[0] +ldr q24, [x0, #800] +sqrdmulh v23.4S, v24.4S, v28.s[0] +mul v24.4S, v24.4S,v29.s[0] +ldr q22, [x0, #864] +sqrdmulh v21.4S, v22.4S, v28.s[0] +mul v22.4S, v22.4S,v29.s[0] +ldr q20, [x0, #544] +mla v30.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v20.4S, v28.s[0] +ldr q19, [x0, #608] +mla v26.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v19.4S, v28.s[0] +ldr q18, [x0, #672] +mla v24.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v18.4S, v28.s[0] +ldr q17, [x0, #736] +mla v22.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v17.4S, v28.s[0] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +mul v20.4S, v20.4S,v29.s[0] +sub v2.4s, v16.4s, v30.4s +mul v19.4S, v19.4S,v29.s[0] +add v16.4s, v16.4s, v30.4s +ldr q30, [x0, #288] +ldr q1, [x0, #352] +mla v20.4S, v27.4S, v31.s[0] +sub v27.4s, v3.4s, v26.4s +mla v19.4S, v25.4S, v31.s[0] +add v3.4s, v3.4s, v26.4s +ldr q26, [x0, #32] +ldr q25, [x0, #96] +mul v18.4S, v18.4S,v29.s[0] +sub v0.4s, v30.4s, v24.4s +mul v17.4S, v17.4S,v29.s[0] +add v30.4s, v30.4s, v24.4s +ldr q24, [x0, #160] +ldr q15, [x0, #224] +mla v18.4S, v23.4S, v31.s[0] +sub v23.4s, v1.4s, v22.4s +mla v17.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v22.4s +sqrdmulh 
v22.4S, v16.4S, v28.s[1] +mul v16.4S, v16.4S,v29.s[1] +sqrdmulh v21.4S, v3.4S, v28.s[1] +sub v14.4s, v26.4s, v20.4s +mul v3.4S, v3.4S,v29.s[1] +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v30.4S, v28.s[1] +sub v13.4s, v25.4s, v19.4s +mul v30.4S, v30.4S,v29.s[1] +add v25.4s, v25.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v28.s[1] +sub v12.4s, v24.4s, v18.4s +mul v1.4S, v1.4S,v29.s[1] +add v24.4s, v24.4s, v18.4s +mla v16.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v17.4s +sqrdmulh v18.4S, v2.4S, v28.s[2] +add v15.4s, v15.4s, v17.4s +mla v3.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v27.4S, v28.s[2] +mla v30.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v0.4S, v28.s[2] +mla v1.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v23.4S, v28.s[2] +ldr q17, [x17, #+32] +ldr q11, [x17, #+48] +mul v2.4S, v2.4S,v29.s[2] +sub v10.4s, v24.4s, v16.4s +mul v27.4S, v27.4S,v29.s[2] +add v24.4s, v24.4s, v16.4s +mla v2.4S, v18.4S, v31.s[0] +sub v18.4s, v15.4s, v3.4s +mla v27.4S, v21.4S, v31.s[0] +add v15.4s, v15.4s, v3.4s +mul v0.4S, v0.4S,v29.s[2] +sub v3.4s, v26.4s, v30.4s +mul v23.4S, v23.4S,v29.s[2] +add v26.4s, v26.4s, v30.4s +mla v0.4S, v20.4S, v31.s[0] +sub v20.4s, v25.4s, v1.4s +mla v23.4S, v19.4S, v31.s[0] +add v25.4s, v25.4s, v1.4s +sqrdmulh v1.4S, v10.4S, v11.s[1] +mul v10.4S, v10.4S,v17.s[1] +sqrdmulh v19.4S, v18.4S, v11.s[1] +sub v30.4s, v12.4s, v2.4s +mul v18.4S, v18.4S,v17.s[1] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v24.4S, v11.s[0] +sub v21.4s, v22.4s, v27.4s +mul v24.4S, v24.4S,v17.s[0] +add v22.4s, v22.4s, v27.4s +sqrdmulh v27.4S, v15.4S, v11.s[0] +sub v16.4s, v14.4s, v0.4s +mul v15.4S, v15.4S,v17.s[0] +add v14.4s, v14.4s, v0.4s +ldr q0, [x17, #+64] +ldr q9, [x17, #+80] +mla v10.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v23.4s +sqrdmulh v8.4S, v12.4S, v11.s[2] +add v13.4s, v13.4s, v23.4s +mla v18.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v22.4S, v11.s[2] +mla v24.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v30.4S, v11.s[3] +mla v15.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v21.4S, v11.s[3] +ldr q23, [x17, 
#+96] +ldr q7, [x17, #+112] +mul v12.4S, v12.4S,v17.s[2] +sub v6.4s, v3.4s, v10.4s +mul v22.4S, v22.4S,v17.s[2] +add v3.4s, v3.4s, v10.4s +mla v12.4S, v8.4S, v31.s[0] +sub v8.4s, v20.4s, v18.4s +mla v22.4S, v19.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +mul v30.4S, v30.4S,v17.s[3] +sub v18.4s, v26.4s, v24.4s +mul v21.4S, v21.4S,v17.s[3] +add v26.4s, v26.4s, v24.4s +mla v30.4S, v2.4S, v31.s[0] +sub v2.4s, v25.4s, v15.4s +mla v21.4S, v27.4S, v31.s[0] +add v25.4s, v25.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v9.s[2] +mul v20.4S, v20.4S,v0.s[2] +sqrdmulh v27.4S, v8.4S, v9.s[3] +sub v24.4s, v14.4s, v12.4s +mul v8.4S, v8.4S,v0.s[3] +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v9.s[1] +sub v19.4s, v13.4s, v22.4s +mul v2.4S, v2.4S,v0.s[1] +add v13.4s, v13.4s, v22.4s +sqrdmulh v22.4S, v25.4S, v9.s[0] +sub v10.4s, v16.4s, v30.4s +mul v25.4S, v25.4S,v0.s[0] +add v16.4s, v16.4s, v30.4s +mla v20.4S, v15.4S, v31.s[0] +sub v15.4s, v1.4s, v21.4s +sqrdmulh v30.4S, v13.4S, v7.s[0] +add v1.4s, v1.4s, v21.4s +mla v8.4S, v27.4S, v31.s[0] +sub v27.4s, v3.4s, v20.4s +sqrdmulh v21.4S, v19.4S, v7.s[1] +add v3.4s, v3.4s, v20.4s +mla v2.4S, v12.4S, v31.s[0] +sub v12.4s, v6.4s, v8.4s +sqrdmulh v20.4S, v1.4S, v7.s[2] +add v6.4s, v6.4s, v8.4s +mla v25.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v2.4s +sqrdmulh v8.4S, v15.4S, v7.s[3] +add v18.4s, v18.4s, v2.4s +mul v13.4S, v13.4S,v23.s[0] +sub v2.4s, v26.4s, v25.4s +mul v19.4S, v19.4S,v23.s[1] +add v26.4s, v26.4s, v25.4s +mla v13.4S, v30.4S, v31.s[0] +str q27, [x0, #352] +mla v19.4S, v21.4S, v31.s[0] +str q3, [x0, #288] +mul v1.4S, v1.4S,v23.s[2] +str q12, [x0, #480] +mul v15.4S, v15.4S,v23.s[3] +str q6, [x0, #416] +mla v1.4S, v20.4S, v31.s[0] +str q22, [x0, #224] +mla v15.4S, v8.4S, v31.s[0] +str q18, [x0, #160] +ldr q18, [x0, #944] +sqrdmulh v8.4S, v18.4S, v28.s[0] +str q2, [x0, #96] +mul v18.4S, v18.4S,v29.s[0] +str q26, [x0, #32] +ldr q26, [x0, #1008] +sqrdmulh v2.4S, v26.4S, v28.s[0] +sub v22.4s, v14.4s, v13.4s +str q22, [x0, #608] +mul 
v26.4S, v26.4S,v29.s[0] +add v14.4s, v14.4s, v13.4s +ldr q13, [x0, #816] +sqrdmulh v22.4S, v13.4S, v28.s[0] +sub v20.4s, v24.4s, v19.4s +str q14, [x0, #544] +mul v13.4S, v13.4S,v29.s[0] +add v24.4s, v24.4s, v19.4s +ldr q19, [x0, #880] +sqrdmulh v14.4S, v19.4S, v28.s[0] +sub v6.4s, v16.4s, v1.4s +str q20, [x0, #736] +mul v19.4S, v19.4S,v29.s[0] +add v16.4s, v16.4s, v1.4s +ldr q1, [x0, #560] +mla v18.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v15.4s +str q24, [x0, #672] +sqrdmulh v24.4S, v1.4S, v28.s[0] +add v10.4s, v10.4s, v15.4s +ldr q15, [x0, #624] +mla v26.4S, v2.4S, v31.s[0] +str q6, [x0, #864] +sqrdmulh v6.4S, v15.4S, v28.s[0] +ldr q2, [x0, #688] +mla v13.4S, v22.4S, v31.s[0] +str q16, [x0, #800] +sqrdmulh v16.4S, v2.4S, v28.s[0] +ldr q22, [x0, #752] +mla v19.4S, v14.4S, v31.s[0] +str q8, [x0, #992] +sqrdmulh v8.4S, v22.4S, v28.s[0] +ldr q14, [x0, #432] +ldr q20, [x0, #496] +mul v1.4S, v1.4S,v29.s[0] +sub v12.4s, v14.4s, v18.4s +str q10, [x0, #928] +mul v15.4S, v15.4S,v29.s[0] +add v14.4s, v14.4s, v18.4s +ldr q18, [x0, #304] +ldr q10, [x0, #368] +mla v1.4S, v24.4S, v31.s[0] +sub v24.4s, v20.4s, v26.4s +mla v15.4S, v6.4S, v31.s[0] +add v20.4s, v20.4s, v26.4s +ldr q26, [x0, #48] +ldr q6, [x0, #112] +mul v2.4S, v2.4S,v29.s[0] +sub v3.4s, v18.4s, v13.4s +mul v22.4S, v22.4S,v29.s[0] +add v18.4s, v18.4s, v13.4s +ldr q13, [x0, #176] +ldr q21, [x0, #240] +mla v2.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v19.4s +mla v22.4S, v8.4S, v31.s[0] +add v10.4s, v10.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v28.s[1] +mul v14.4S, v14.4S,v29.s[1] +sqrdmulh v8.4S, v20.4S, v28.s[1] +sub v27.4s, v26.4s, v1.4s +mul v20.4S, v20.4S,v29.s[1] +add v26.4s, v26.4s, v1.4s +sqrdmulh v1.4S, v18.4S, v28.s[1] +sub v30.4s, v6.4s, v15.4s +mul v18.4S, v18.4S,v29.s[1] +add v6.4s, v6.4s, v15.4s +sqrdmulh v15.4S, v10.4S, v28.s[1] +sub v25.4s, v13.4s, v2.4s +mul v10.4S, v10.4S,v29.s[1] +add v13.4s, v13.4s, v2.4s +mla v14.4S, v19.4S, v31.s[0] +sub v19.4s, v21.4s, v22.4s +sqrdmulh v2.4S, v12.4S, v28.s[2] +add 
v21.4s, v21.4s, v22.4s +mla v20.4S, v8.4S, v31.s[0] +sqrdmulh v8.4S, v24.4S, v28.s[2] +mla v18.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v3.4S, v28.s[2] +mla v10.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v16.4S, v28.s[2] +mul v12.4S, v12.4S,v29.s[2] +sub v22.4s, v13.4s, v14.4s +mul v24.4S, v24.4S,v29.s[2] +add v13.4s, v13.4s, v14.4s +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v21.4s, v20.4s +mla v24.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v20.4s +mul v3.4S, v3.4S,v29.s[2] +sub v20.4s, v26.4s, v18.4s +mul v16.4S, v16.4S,v29.s[2] +add v26.4s, v26.4s, v18.4s +mla v3.4S, v1.4S, v31.s[0] +sub v1.4s, v6.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add v6.4s, v6.4s, v10.4s +sqrdmulh v10.4S, v22.4S, v11.s[1] +mul v22.4S, v22.4S,v17.s[1] +sqrdmulh v15.4S, v2.4S, v11.s[1] +sub v18.4s, v25.4s, v12.4s +mul v2.4S, v2.4S,v17.s[1] +add v25.4s, v25.4s, v12.4s +sqrdmulh v12.4S, v13.4S, v11.s[0] +sub v8.4s, v19.4s, v24.4s +mul v13.4S, v13.4S,v17.s[0] +add v19.4s, v19.4s, v24.4s +sqrdmulh v24.4S, v21.4S, v11.s[0] +sub v14.4s, v27.4s, v3.4s +mul v21.4S, v21.4S,v17.s[0] +add v27.4s, v27.4s, v3.4s +mla v22.4S, v10.4S, v31.s[0] +sub v10.4s, v30.4s, v16.4s +sqrdmulh v3.4S, v25.4S, v11.s[2] +add v30.4s, v30.4s, v16.4s +mla v2.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v19.4S, v11.s[2] +mla v13.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v18.4S, v11.s[3] +mla v21.4S, v24.4S, v31.s[0] +sqrdmulh v24.4S, v8.4S, v11.s[3] +mul v25.4S, v25.4S,v17.s[2] +sub v16.4s, v20.4s, v22.4s +mul v19.4S, v19.4S,v17.s[2] +add v20.4s, v20.4s, v22.4s +mla v25.4S, v3.4S, v31.s[0] +sub v3.4s, v1.4s, v2.4s +mla v19.4S, v15.4S, v31.s[0] +add v1.4s, v1.4s, v2.4s +mul v18.4S, v18.4S,v17.s[3] +sub v2.4s, v26.4s, v13.4s +mul v8.4S, v8.4S,v17.s[3] +add v26.4s, v26.4s, v13.4s +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v6.4s, v21.4s +mla v8.4S, v24.4S, v31.s[0] +add v6.4s, v6.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v9.s[2] +mul v1.4S, v1.4S,v0.s[2] +sqrdmulh v24.4S, v3.4S, v9.s[3] +sub v13.4s, v27.4s, v25.4s +mul v3.4S, v3.4S,v0.s[3] +add v27.4s, 
v27.4s, v25.4s +sqrdmulh v25.4S, v12.4S, v9.s[1] +sub v15.4s, v30.4s, v19.4s +mul v12.4S, v12.4S,v0.s[1] +add v30.4s, v30.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v9.s[0] +sub v22.4s, v14.4s, v18.4s +mul v6.4S, v6.4S,v0.s[0] +add v14.4s, v14.4s, v18.4s +mla v1.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v8.4s +sqrdmulh v18.4S, v30.4S, v7.s[0] +add v10.4s, v10.4s, v8.4s +mla v3.4S, v24.4S, v31.s[0] +sub v24.4s, v20.4s, v1.4s +sqrdmulh v8.4S, v15.4S, v7.s[1] +add v20.4s, v20.4s, v1.4s +mla v12.4S, v25.4S, v31.s[0] +sub v25.4s, v16.4s, v3.4s +sqrdmulh v1.4S, v10.4S, v7.s[2] +add v16.4s, v16.4s, v3.4s +mla v6.4S, v19.4S, v31.s[0] +sub v19.4s, v2.4s, v12.4s +sqrdmulh v3.4S, v21.4S, v7.s[3] +add v2.4s, v2.4s, v12.4s +mul v30.4S, v30.4S,v23.s[0] +sub v12.4s, v26.4s, v6.4s +mul v15.4S, v15.4S,v23.s[1] +add v26.4s, v26.4s, v6.4s +mla v30.4S, v18.4S, v31.s[0] +str q24, [x0, #368] +mla v15.4S, v8.4S, v31.s[0] +str q20, [x0, #304] +mul v10.4S, v10.4S,v23.s[2] +str q25, [x0, #496] +mul v21.4S, v21.4S,v23.s[3] +str q16, [x0, #432] +mla v10.4S, v1.4S, v31.s[0] +str q19, [x0, #240] +mla v21.4S, v3.4S, v31.s[0] +str q2, [x0, #176] +ldr q2, [x0, #896] +sqrdmulh v3.4S, v2.4S, v28.s[0] +str q12, [x0, #112] +mul v2.4S, v2.4S,v29.s[0] +str q26, [x0, #48] +ldr q26, [x0, #960] +sqrdmulh v12.4S, v26.4S, v28.s[0] +sub v19.4s, v27.4s, v30.4s +str q19, [x0, #624] +mul v26.4S, v26.4S,v29.s[0] +add v27.4s, v27.4s, v30.4s +ldr q30, [x0, #768] +sqrdmulh v19.4S, v30.4S, v28.s[0] +sub v1.4s, v13.4s, v15.4s +str q27, [x0, #560] +mul v30.4S, v30.4S,v29.s[0] +add v13.4s, v13.4s, v15.4s +ldr q15, [x0, #832] +sqrdmulh v27.4S, v15.4S, v28.s[0] +sub v16.4s, v14.4s, v10.4s +str q1, [x0, #752] +mul v15.4S, v15.4S,v29.s[0] +add v14.4s, v14.4s, v10.4s +ldr q10, [x0, #512] +mla v2.4S, v3.4S, v31.s[0] +sub v3.4s, v22.4s, v21.4s +str q13, [x0, #688] +sqrdmulh v13.4S, v10.4S, v28.s[0] +add v22.4s, v22.4s, v21.4s +ldr q21, [x0, #576] +mla v26.4S, v12.4S, v31.s[0] +str q16, [x0, #880] +sqrdmulh v16.4S, v21.4S, v28.s[0] 
+ldr q12, [x0, #640] +mla v30.4S, v19.4S, v31.s[0] +str q14, [x0, #816] +sqrdmulh v14.4S, v12.4S, v28.s[0] +ldr q19, [x0, #704] +mla v15.4S, v27.4S, v31.s[0] +str q3, [x0, #1008] +sqrdmulh v3.4S, v19.4S, v28.s[0] +ldr q27, [x0, #384] +ldr q1, [x0, #448] +mul v10.4S, v10.4S,v29.s[0] +sub v25.4s, v27.4s, v2.4s +str q22, [x0, #944] +mul v21.4S, v21.4S,v29.s[0] +add v27.4s, v27.4s, v2.4s +ldr q2, [x0, #256] +ldr q22, [x0, #320] +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v1.4s, v26.4s +mla v21.4S, v16.4S, v31.s[0] +add v1.4s, v1.4s, v26.4s +ldr q26, [x0, #0] +ldr q16, [x0, #64] +mul v12.4S, v12.4S,v29.s[0] +sub v20.4s, v2.4s, v30.4s +mul v19.4S, v19.4S,v29.s[0] +add v2.4s, v2.4s, v30.4s +ldr q30, [x0, #128] +ldr q8, [x0, #192] +mla v12.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v15.4s +mla v19.4S, v3.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v27.4S, v28.s[1] +mul v27.4S, v27.4S,v29.s[1] +sqrdmulh v3.4S, v1.4S, v28.s[1] +sub v24.4s, v26.4s, v10.4s +mul v1.4S, v1.4S,v29.s[1] +add v26.4s, v26.4s, v10.4s +sqrdmulh v10.4S, v2.4S, v28.s[1] +sub v18.4s, v16.4s, v21.4s +mul v2.4S, v2.4S,v29.s[1] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v28.s[1] +sub v6.4s, v30.4s, v12.4s +mul v22.4S, v22.4S,v29.s[1] +add v30.4s, v30.4s, v12.4s +mla v27.4S, v15.4S, v31.s[0] +sub v15.4s, v8.4s, v19.4s +sqrdmulh v12.4S, v25.4S, v28.s[2] +add v8.4s, v8.4s, v19.4s +mla v1.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v13.4S, v28.s[2] +mla v2.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v20.4S, v28.s[2] +mla v22.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v14.4S, v28.s[2] +mul v25.4S, v25.4S,v29.s[2] +sub v19.4s, v30.4s, v27.4s +mul v13.4S, v13.4S,v29.s[2] +add v30.4s, v30.4s, v27.4s +mla v25.4S, v12.4S, v31.s[0] +sub v12.4s, v8.4s, v1.4s +mla v13.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v1.4s +mul v20.4S, v20.4S,v29.s[2] +sub v1.4s, v26.4s, v2.4s +mul v14.4S, v14.4S,v29.s[2] +add v26.4s, v26.4s, v2.4s +mla v20.4S, v10.4S, v31.s[0] +sub v10.4s, v16.4s, v22.4s +mla v14.4S, v21.4S, v31.s[0] +add 
v16.4s, v16.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v11.s[1] +mul v19.4S, v19.4S,v17.s[1] +sqrdmulh v21.4S, v12.4S, v11.s[1] +sub v2.4s, v6.4s, v25.4s +mul v12.4S, v12.4S,v17.s[1] +add v6.4s, v6.4s, v25.4s +sqrdmulh v25.4S, v30.4S, v11.s[0] +sub v3.4s, v15.4s, v13.4s +mul v30.4S, v30.4S,v17.s[0] +add v15.4s, v15.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v11.s[0] +sub v27.4s, v24.4s, v20.4s +mul v8.4S, v8.4S,v17.s[0] +add v24.4s, v24.4s, v20.4s +mla v19.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v14.4s +sqrdmulh v20.4S, v6.4S, v11.s[2] +add v18.4s, v18.4s, v14.4s +mla v12.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v15.4S, v11.s[2] +mla v30.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v2.4S, v11.s[3] +mla v8.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v3.4S, v11.s[3] +mul v6.4S, v6.4S,v17.s[2] +sub v14.4s, v1.4s, v19.4s +mul v15.4S, v15.4S,v17.s[2] +add v1.4s, v1.4s, v19.4s +mla v6.4S, v20.4S, v31.s[0] +sub v20.4s, v10.4s, v12.4s +mla v15.4S, v21.4S, v31.s[0] +add v10.4s, v10.4s, v12.4s +mul v2.4S, v2.4S,v17.s[3] +sub v12.4s, v26.4s, v30.4s +mul v3.4S, v3.4S,v17.s[3] +add v26.4s, v26.4s, v30.4s +mla v2.4S, v25.4S, v31.s[0] +sub v25.4s, v16.4s, v8.4s +mla v3.4S, v13.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v10.4S, v9.s[2] +mul v10.4S, v10.4S,v0.s[2] +sqrdmulh v13.4S, v20.4S, v9.s[3] +sub v30.4s, v24.4s, v6.4s +mul v20.4S, v20.4S,v0.s[3] +add v24.4s, v24.4s, v6.4s +sqrdmulh v6.4S, v25.4S, v9.s[1] +sub v21.4s, v18.4s, v15.4s +mul v25.4S, v25.4S,v0.s[1] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v9.s[0] +sub v19.4s, v27.4s, v2.4s +mul v16.4S, v16.4S,v0.s[0] +add v27.4s, v27.4s, v2.4s +mla v10.4S, v8.4S, v31.s[0] +sub v8.4s, v22.4s, v3.4s +sqrdmulh v2.4S, v18.4S, v7.s[0] +add v22.4s, v22.4s, v3.4s +mla v20.4S, v13.4S, v31.s[0] +sub v13.4s, v1.4s, v10.4s +sqrdmulh v3.4S, v21.4S, v7.s[1] +add v1.4s, v1.4s, v10.4s +mla v25.4S, v6.4S, v31.s[0] +sub v6.4s, v14.4s, v20.4s +sqrdmulh v10.4S, v22.4S, v7.s[2] +add v14.4s, v14.4s, v20.4s +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, 
v12.4s, v25.4s +sqrdmulh v20.4S, v8.4S, v7.s[3] +add v12.4s, v12.4s, v25.4s +mul v18.4S, v18.4S,v23.s[0] +sub v25.4s, v26.4s, v16.4s +mul v21.4S, v21.4S,v23.s[1] +add v26.4s, v26.4s, v16.4s +mla v18.4S, v2.4S, v31.s[0] +str q13, [x0, #320] +mla v21.4S, v3.4S, v31.s[0] +str q1, [x0, #256] +mul v22.4S, v22.4S,v23.s[2] +str q6, [x0, #448] +mul v8.4S, v8.4S,v23.s[3] +str q14, [x0, #384] +mla v22.4S, v10.4S, v31.s[0] +str q15, [x0, #192] +mla v8.4S, v20.4S, v31.s[0] +str q12, [x0, #128] +ldr q12, [x0, #912] +sqrdmulh v20.4S, v12.4S, v28.s[0] +str q25, [x0, #64] +mul v12.4S, v12.4S,v29.s[0] +str q26, [x0, #0] +ldr q26, [x0, #976] +sqrdmulh v25.4S, v26.4S, v28.s[0] +sub v15.4s, v24.4s, v18.4s +str q15, [x0, #576] +mul v26.4S, v26.4S,v29.s[0] +add v24.4s, v24.4s, v18.4s +ldr q18, [x0, #784] +sqrdmulh v15.4S, v18.4S, v28.s[0] +sub v10.4s, v30.4s, v21.4s +str q24, [x0, #512] +mul v18.4S, v18.4S,v29.s[0] +add v30.4s, v30.4s, v21.4s +ldr q21, [x0, #848] +sqrdmulh v24.4S, v21.4S, v28.s[0] +sub v14.4s, v27.4s, v22.4s +str q10, [x0, #704] +mul v21.4S, v21.4S,v29.s[0] +add v27.4s, v27.4s, v22.4s +ldr q22, [x0, #528] +mla v12.4S, v20.4S, v31.s[0] +sub v20.4s, v19.4s, v8.4s +str q30, [x0, #640] +sqrdmulh v30.4S, v22.4S, v28.s[0] +add v19.4s, v19.4s, v8.4s +ldr q8, [x0, #592] +mla v26.4S, v25.4S, v31.s[0] +str q14, [x0, #832] +sqrdmulh v14.4S, v8.4S, v28.s[0] +ldr q25, [x0, #656] +mla v18.4S, v15.4S, v31.s[0] +str q27, [x0, #768] +sqrdmulh v27.4S, v25.4S, v28.s[0] +ldr q15, [x0, #720] +mla v21.4S, v24.4S, v31.s[0] +str q20, [x0, #960] +sqrdmulh v20.4S, v15.4S, v28.s[0] +ldr q24, [x0, #400] +ldr q10, [x0, #464] +mul v22.4S, v22.4S,v29.s[0] +sub v6.4s, v24.4s, v12.4s +str q19, [x0, #896] +mul v8.4S, v8.4S,v29.s[0] +add v24.4s, v24.4s, v12.4s +ldr q12, [x0, #272] +ldr q19, [x0, #336] +mla v22.4S, v30.4S, v31.s[0] +sub v30.4s, v10.4s, v26.4s +mla v8.4S, v14.4S, v31.s[0] +add v10.4s, v10.4s, v26.4s +ldr q26, [x0, #16] +ldr q14, [x0, #80] +mul v25.4S, v25.4S,v29.s[0] +sub v1.4s, v12.4s, 
v18.4s +mul v15.4S, v15.4S,v29.s[0] +add v12.4s, v12.4s, v18.4s +ldr q18, [x0, #144] +ldr q3, [x0, #208] +mla v25.4S, v27.4S, v31.s[0] +sub v27.4s, v19.4s, v21.4s +mla v15.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +sqrdmulh v21.4S, v24.4S, v28.s[1] +mul v24.4S, v24.4S,v29.s[1] +sqrdmulh v20.4S, v10.4S, v28.s[1] +sub v13.4s, v26.4s, v22.4s +mul v10.4S, v10.4S,v29.s[1] +add v26.4s, v26.4s, v22.4s +sqrdmulh v22.4S, v12.4S, v28.s[1] +sub v2.4s, v14.4s, v8.4s +mul v12.4S, v12.4S,v29.s[1] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v28.s[1] +sub v16.4s, v18.4s, v25.4s +mul v19.4S, v19.4S,v29.s[1] +add v18.4s, v18.4s, v25.4s +mla v24.4S, v21.4S, v31.s[0] +sub v21.4s, v3.4s, v15.4s +sqrdmulh v25.4S, v6.4S, v28.s[2] +add v3.4s, v3.4s, v15.4s +mla v10.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v30.4S, v28.s[2] +mla v12.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v1.4S, v28.s[2] +mla v19.4S, v8.4S, v31.s[0] +sqrdmulh v8.4S, v27.4S, v28.s[2] +mul v6.4S, v6.4S,v29.s[2] +sub v15.4s, v18.4s, v24.4s +mul v30.4S, v30.4S,v29.s[2] +add v18.4s, v18.4s, v24.4s +mla v6.4S, v25.4S, v31.s[0] +sub v25.4s, v3.4s, v10.4s +mla v30.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v10.4s +mul v1.4S, v1.4S,v29.s[2] +sub v10.4s, v26.4s, v12.4s +mul v27.4S, v27.4S,v29.s[2] +add v26.4s, v26.4s, v12.4s +mla v1.4S, v22.4S, v31.s[0] +sub v22.4s, v14.4s, v19.4s +mla v27.4S, v8.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v28.4S, v15.4S, v11.s[1] +mul v15.4S, v15.4S,v17.s[1] +sqrdmulh v29.4S, v25.4S, v11.s[1] +sub v19.4s, v16.4s, v6.4s +mul v25.4S, v25.4S,v17.s[1] +add v16.4s, v16.4s, v6.4s +sqrdmulh v6.4S, v18.4S, v11.s[0] +sub v8.4s, v21.4s, v30.4s +mul v18.4S, v18.4S,v17.s[0] +add v21.4s, v21.4s, v30.4s +sqrdmulh v30.4S, v3.4S, v11.s[0] +sub v12.4s, v13.4s, v1.4s +mul v3.4S, v3.4S,v17.s[0] +add v13.4s, v13.4s, v1.4s +mla v15.4S, v28.4S, v31.s[0] +sub v28.4s, v2.4s, v27.4s +sqrdmulh v1.4S, v16.4S, v11.s[2] +add v2.4s, v2.4s, v27.4s +mla v25.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v21.4S, v11.s[2] 
+mla v18.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v19.4S, v11.s[3] +mla v3.4S, v30.4S, v31.s[0] +sqrdmulh v30.4S, v8.4S, v11.s[3] +mul v16.4S, v16.4S,v17.s[2] +sub v27.4s, v10.4s, v15.4s +mul v21.4S, v21.4S,v17.s[2] +add v10.4s, v10.4s, v15.4s +mla v16.4S, v1.4S, v31.s[0] +sub v1.4s, v22.4s, v25.4s +mla v21.4S, v29.4S, v31.s[0] +add v22.4s, v22.4s, v25.4s +mul v19.4S, v19.4S,v17.s[3] +sub v25.4s, v26.4s, v18.4s +mul v8.4S, v8.4S,v17.s[3] +add v26.4s, v26.4s, v18.4s +mla v19.4S, v6.4S, v31.s[0] +sub v6.4s, v14.4s, v3.4s +mla v8.4S, v30.4S, v31.s[0] +add v14.4s, v14.4s, v3.4s +sqrdmulh v11.4S, v22.4S, v9.s[2] +mul v22.4S, v22.4S,v0.s[2] +sqrdmulh v17.4S, v1.4S, v9.s[3] +sub v3.4s, v13.4s, v16.4s +mul v1.4S, v1.4S,v0.s[3] +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v6.4S, v9.s[1] +sub v30.4s, v2.4s, v21.4s +mul v6.4S, v6.4S,v0.s[1] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v9.s[0] +sub v18.4s, v12.4s, v19.4s +mul v14.4S, v14.4S,v0.s[0] +add v12.4s, v12.4s, v19.4s +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v28.4s, v8.4s +sqrdmulh v9.4S, v2.4S, v7.s[0] +add v28.4s, v28.4s, v8.4s +mla v1.4S, v17.4S, v31.s[0] +sub v17.4s, v10.4s, v22.4s +sqrdmulh v8.4S, v30.4S, v7.s[1] +add v10.4s, v10.4s, v22.4s +mla v6.4S, v16.4S, v31.s[0] +sub v16.4s, v27.4s, v1.4s +sqrdmulh v22.4S, v28.4S, v7.s[2] +add v27.4s, v27.4s, v1.4s +mla v14.4S, v21.4S, v31.s[0] +sub v21.4s, v25.4s, v6.4s +sqrdmulh v1.4S, v11.4S, v7.s[3] +add v25.4s, v25.4s, v6.4s +mul v2.4S, v2.4S,v23.s[0] +sub v6.4s, v26.4s, v14.4s +mul v30.4S, v30.4S,v23.s[1] +add v26.4s, v26.4s, v14.4s +mla v2.4S, v9.4S, v31.s[0] +str q17, [x0, #336] +mla v30.4S, v8.4S, v31.s[0] +str q10, [x0, #272] +mul v28.4S, v28.4S,v23.s[2] +str q16, [x0, #464] +mul v11.4S, v11.4S,v23.s[3] +str q27, [x0, #400] +mla v28.4S, v22.4S, v31.s[0] +str q21, [x0, #208] +mla v11.4S, v1.4S, v31.s[0] +str q25, [x0, #144] +str q6, [x0, #80] +str q26, [x0, #16] +sub v26.4s, v13.4s, v2.4s +str q26, [x0, #592] +add v13.4s, v13.4s, v2.4s +sub v2.4s, v3.4s, 
v30.4s +str q13, [x0, #528] +add v3.4s, v3.4s, v30.4s +sub v30.4s, v12.4s, v28.4s +str q2, [x0, #720] +add v12.4s, v12.4s, v28.4s +sub v28.4s, v18.4s, v11.4s +str q3, [x0, #656] +add v18.4s, v18.4s, v11.4s +str q30, [x0, #848] +str q12, [x0, #784] +str q28, [x0, #976] +str q18, [x0, #912] +ldr q4, [x0, #224] +ldr q5, [x0, #160] +ldr q24, [x0, #32] +ldr q20, [x17, #+128] +ldr q15, [x17, #+144] +sqrdmulh v29.4S, v24.4S, v15.s[0] +mul v24.4S, v24.4S,v20.s[0] +ldr q19, [x0, #48] +sqrdmulh v0.4S, v19.4S, v15.s[0] +mul v19.4S, v19.4S,v20.s[0] +ldr q14, [x17, #+160] +ldr q9, [x17, #+176] +ldr q17, [x0, #96] +sqrdmulh v8.4S, v17.4S, v9.s[0] +mul v17.4S, v17.4S,v14.s[0] +ldr q10, [x0, #112] +sqrdmulh v16.4S, v10.4S, v9.s[0] +mul v10.4S, v10.4S,v14.s[0] +ldr q27, [x17, #+192] +ldr q22, [x17, #+208] +mla v24.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v5.4S, v22.s[0] +ldr q21, [x0, #176] +mla v19.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v21.4S, v22.s[0] +ldr q1, [x17, #+224] +ldr q25, [x17, #+240] +mla v17.4S, v8.4S, v31.s[0] +sqrdmulh v8.4S, v4.4S, v25.s[0] +ldr q23, [x0, #240] +mla v10.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v23.4S, v25.s[0] +ldr q7, [x0, #0] +ldr q6, [x0, #128] +mul v5.4S, v5.4S,v27.s[0] +sub v26.4s, v7.4s, v24.4s +ldr q13, [x0, #16] +mul v21.4S, v21.4S,v27.s[0] +add v7.4s, v7.4s, v24.4s +ldr q24, [x0, #144] +mla v5.4S, v29.4S, v31.s[0] +sub v29.4s, v13.4s, v19.4s +ldr q2, [x0, #64] +mla v21.4S, v0.4S, v31.s[0] +add v13.4s, v13.4s, v19.4s +ldr q19, [x0, #192] +mul v4.4S, v4.4S,v1.s[0] +sub v0.4s, v2.4s, v17.4s +ldr q3, [x0, #80] +mul v23.4S, v23.4S,v1.s[0] +add v2.4s, v2.4s, v17.4s +ldr q17, [x0, #208] +mla v4.4S, v8.4S, v31.s[0] +mla v23.4S, v16.4S, v31.s[0] +sub v16.4s, v3.4s, v10.4s +sqrdmulh v8.4S, v13.4S, v15.s[1] +add v3.4s, v3.4s, v10.4s +mul v13.4S, v13.4S,v20.s[1] +sqrdmulh v10.4S, v29.4S, v15.s[2] +sub v11.4s, v6.4s, v5.4s +mul v29.4S, v29.4S,v20.s[2] +add v6.4s, v6.4s, v5.4s +sqrdmulh v15.4S, v3.4S, v9.s[1] +sub v20.4s, v24.4s, v21.4s +mul v3.4S, 
v3.4S,v14.s[1] +add v24.4s, v24.4s, v21.4s +sqrdmulh v21.4S, v16.4S, v9.s[2] +sub v5.4s, v19.4s, v4.4s +mul v16.4S, v16.4S,v14.s[2] +add v19.4s, v19.4s, v4.4s +mla v13.4S, v8.4S, v31.s[0] +sub v8.4s, v17.4s, v23.4s +ldr q9, [x0, #480] +sqrdmulh v14.4S, v24.4S, v22.s[1] +add v17.4s, v17.4s, v23.4s +mla v29.4S, v10.4S, v31.s[0] +ldr q10, [x0, #416] +sqrdmulh v23.4S, v20.4S, v22.s[2] +sub v4.4s, v7.4s, v13.4s +mla v3.4S, v15.4S, v31.s[0] +ldr q15, [x0, #288] +sqrdmulh v30.4S, v17.4S, v25.s[1] +add v7.4s, v7.4s, v13.4s +str q4, [x0, #16] +mla v16.4S, v21.4S, v31.s[0] +ldr q21, [x17, #+256] +ldr q4, [x17, #+272] +sqrdmulh v13.4S, v8.4S, v25.s[2] +sub v12.4s, v26.4s, v29.4s +str q7, [x0, #0] +mul v24.4S, v24.4S,v27.s[1] +add v26.4s, v26.4s, v29.4s +mul v20.4S, v20.4S,v27.s[2] +str q12, [x0, #48] +mla v24.4S, v14.4S, v31.s[0] +sub v14.4s, v2.4s, v3.4s +mla v20.4S, v23.4S, v31.s[0] +str q26, [x0, #32] +mul v17.4S, v17.4S,v1.s[1] +str q14, [x0, #80] +mul v8.4S, v8.4S,v1.s[2] +add v2.4s, v2.4s, v3.4s +str q2, [x0, #64] +mla v17.4S, v30.4S, v31.s[0] +sub v30.4s, v0.4s, v16.4s +str q30, [x0, #112] +mla v8.4S, v13.4S, v31.s[0] +add v0.4s, v0.4s, v16.4s +str q0, [x0, #96] +sqrdmulh v25.4S, v15.4S, v4.s[0] +sub v1.4s, v6.4s, v24.4s +mul v15.4S, v15.4S,v21.s[0] +str q1, [x0, #144] +ldr q1, [x0, #304] +sqrdmulh v0.4S, v1.4S, v4.s[0] +add v6.4s, v6.4s, v24.4s +mul v1.4S, v1.4S,v21.s[0] +str q6, [x0, #128] +ldr q6, [x17, #+288] +ldr q24, [x17, #+304] +ldr q16, [x0, #352] +sqrdmulh v13.4S, v16.4S, v24.s[0] +sub v30.4s, v11.4s, v20.4s +mul v16.4S, v16.4S,v6.s[0] +str q30, [x0, #176] +ldr q30, [x0, #368] +sqrdmulh v2.4S, v30.4S, v24.s[0] +add v11.4s, v11.4s, v20.4s +mul v30.4S, v30.4S,v6.s[0] +str q11, [x0, #160] +ldr q11, [x17, #+320] +ldr q20, [x17, #+336] +mla v15.4S, v25.4S, v31.s[0] +sub v25.4s, v19.4s, v17.4s +sqrdmulh v3.4S, v10.4S, v20.s[0] +str q25, [x0, #208] +ldr q25, [x0, #432] +mla v1.4S, v0.4S, v31.s[0] +add v19.4s, v19.4s, v17.4s +sqrdmulh v17.4S, v25.4S, v20.s[0] +str 
q19, [x0, #192] +ldr q19, [x17, #+352] +ldr q0, [x17, #+368] +mla v16.4S, v13.4S, v31.s[0] +sub v13.4s, v5.4s, v8.4s +sqrdmulh v14.4S, v9.4S, v0.s[0] +str q13, [x0, #240] +ldr q13, [x0, #496] +mla v30.4S, v2.4S, v31.s[0] +add v5.4s, v5.4s, v8.4s +sqrdmulh v8.4S, v13.4S, v0.s[0] +str q5, [x0, #224] +ldr q5, [x0, #256] +ldr q2, [x0, #384] +mul v10.4S, v10.4S,v11.s[0] +sub v22.4s, v5.4s, v15.4s +ldr q27, [x0, #272] +mul v25.4S, v25.4S,v11.s[0] +add v5.4s, v5.4s, v15.4s +ldr q15, [x0, #400] +mla v10.4S, v3.4S, v31.s[0] +sub v3.4s, v27.4s, v1.4s +ldr q26, [x0, #320] +mla v25.4S, v17.4S, v31.s[0] +add v27.4s, v27.4s, v1.4s +ldr q1, [x0, #448] +mul v9.4S, v9.4S,v19.s[0] +sub v17.4s, v26.4s, v16.4s +ldr q23, [x0, #336] +mul v13.4S, v13.4S,v19.s[0] +add v26.4s, v26.4s, v16.4s +ldr q16, [x0, #464] +mla v9.4S, v14.4S, v31.s[0] +mla v13.4S, v8.4S, v31.s[0] +sub v8.4s, v23.4s, v30.4s +sqrdmulh v14.4S, v27.4S, v4.s[1] +add v23.4s, v23.4s, v30.4s +mul v27.4S, v27.4S,v21.s[1] +sqrdmulh v30.4S, v3.4S, v4.s[2] +sub v12.4s, v2.4s, v10.4s +mul v3.4S, v3.4S,v21.s[2] +add v2.4s, v2.4s, v10.4s +sqrdmulh v4.4S, v23.4S, v24.s[1] +sub v21.4s, v15.4s, v25.4s +mul v23.4S, v23.4S,v6.s[1] +add v15.4s, v15.4s, v25.4s +sqrdmulh v25.4S, v8.4S, v24.s[2] +sub v10.4s, v1.4s, v9.4s +mul v8.4S, v8.4S,v6.s[2] +add v1.4s, v1.4s, v9.4s +mla v27.4S, v14.4S, v31.s[0] +sub v14.4s, v16.4s, v13.4s +ldr q24, [x0, #736] +sqrdmulh v6.4S, v15.4S, v20.s[1] +add v16.4s, v16.4s, v13.4s +mla v3.4S, v30.4S, v31.s[0] +ldr q30, [x0, #672] +sqrdmulh v13.4S, v21.4S, v20.s[2] +sub v9.4s, v5.4s, v27.4s +mla v23.4S, v4.4S, v31.s[0] +ldr q4, [x0, #544] +sqrdmulh v29.4S, v16.4S, v0.s[1] +add v5.4s, v5.4s, v27.4s +str q9, [x0, #272] +mla v8.4S, v25.4S, v31.s[0] +ldr q25, [x17, #+384] +ldr q9, [x17, #+400] +sqrdmulh v27.4S, v14.4S, v0.s[2] +sub v7.4s, v22.4s, v3.4s +str q5, [x0, #256] +mul v15.4S, v15.4S,v11.s[1] +add v22.4s, v22.4s, v3.4s +mul v21.4S, v21.4S,v11.s[2] +str q7, [x0, #304] +mla v15.4S, v6.4S, v31.s[0] +sub v6.4s, 
v26.4s, v23.4s +mla v21.4S, v13.4S, v31.s[0] +str q22, [x0, #288] +mul v16.4S, v16.4S,v19.s[1] +str q6, [x0, #336] +mul v14.4S, v14.4S,v19.s[2] +add v26.4s, v26.4s, v23.4s +str q26, [x0, #320] +mla v16.4S, v29.4S, v31.s[0] +sub v29.4s, v17.4s, v8.4s +str q29, [x0, #368] +mla v14.4S, v27.4S, v31.s[0] +add v17.4s, v17.4s, v8.4s +str q17, [x0, #352] +sqrdmulh v0.4S, v4.4S, v9.s[0] +sub v19.4s, v2.4s, v15.4s +mul v4.4S, v4.4S,v25.s[0] +str q19, [x0, #400] +ldr q19, [x0, #560] +sqrdmulh v17.4S, v19.4S, v9.s[0] +add v2.4s, v2.4s, v15.4s +mul v19.4S, v19.4S,v25.s[0] +str q2, [x0, #384] +ldr q2, [x17, #+416] +ldr q15, [x17, #+432] +ldr q8, [x0, #608] +sqrdmulh v27.4S, v8.4S, v15.s[0] +sub v29.4s, v12.4s, v21.4s +mul v8.4S, v8.4S,v2.s[0] +str q29, [x0, #432] +ldr q29, [x0, #624] +sqrdmulh v26.4S, v29.4S, v15.s[0] +add v12.4s, v12.4s, v21.4s +mul v29.4S, v29.4S,v2.s[0] +str q12, [x0, #416] +ldr q12, [x17, #+448] +ldr q21, [x17, #+464] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v1.4s, v16.4s +sqrdmulh v23.4S, v30.4S, v21.s[0] +str q0, [x0, #464] +ldr q0, [x0, #688] +mla v19.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v16.4s +sqrdmulh v16.4S, v0.4S, v21.s[0] +str q1, [x0, #448] +ldr q1, [x17, #+480] +ldr q17, [x17, #+496] +mla v8.4S, v27.4S, v31.s[0] +sub v27.4s, v10.4s, v14.4s +sqrdmulh v6.4S, v24.4S, v17.s[0] +str q27, [x0, #496] +ldr q27, [x0, #752] +mla v29.4S, v26.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v27.4S, v17.s[0] +str q10, [x0, #480] +ldr q10, [x0, #512] +ldr q26, [x0, #640] +mul v30.4S, v30.4S,v12.s[0] +sub v20.4s, v10.4s, v4.4s +ldr q11, [x0, #528] +mul v0.4S, v0.4S,v12.s[0] +add v10.4s, v10.4s, v4.4s +ldr q4, [x0, #656] +mla v30.4S, v23.4S, v31.s[0] +sub v23.4s, v11.4s, v19.4s +ldr q22, [x0, #576] +mla v0.4S, v16.4S, v31.s[0] +add v11.4s, v11.4s, v19.4s +ldr q19, [x0, #704] +mul v24.4S, v24.4S,v1.s[0] +sub v16.4s, v22.4s, v8.4s +ldr q13, [x0, #592] +mul v27.4S, v27.4S,v1.s[0] +add v22.4s, v22.4s, v8.4s +ldr q8, [x0, #720] +mla v24.4S, v6.4S, v31.s[0] 
+mla v27.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v29.4s +sqrdmulh v6.4S, v11.4S, v9.s[1] +add v13.4s, v13.4s, v29.4s +mul v11.4S, v11.4S,v25.s[1] +sqrdmulh v29.4S, v23.4S, v9.s[2] +sub v7.4s, v26.4s, v30.4s +mul v23.4S, v23.4S,v25.s[2] +add v26.4s, v26.4s, v30.4s +sqrdmulh v9.4S, v13.4S, v15.s[1] +sub v25.4s, v4.4s, v0.4s +mul v13.4S, v13.4S,v2.s[1] +add v4.4s, v4.4s, v0.4s +sqrdmulh v0.4S, v14.4S, v15.s[2] +sub v30.4s, v19.4s, v24.4s +mul v14.4S, v14.4S,v2.s[2] +add v19.4s, v19.4s, v24.4s +mla v11.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v27.4s +ldr q15, [x0, #992] +sqrdmulh v2.4S, v4.4S, v21.s[1] +add v8.4s, v8.4s, v27.4s +mla v23.4S, v29.4S, v31.s[0] +ldr q29, [x0, #928] +sqrdmulh v27.4S, v25.4S, v21.s[2] +sub v24.4s, v10.4s, v11.4s +mla v13.4S, v9.4S, v31.s[0] +ldr q9, [x0, #800] +sqrdmulh v3.4S, v8.4S, v17.s[1] +add v10.4s, v10.4s, v11.4s +str q24, [x0, #528] +mla v14.4S, v0.4S, v31.s[0] +ldr q0, [x17, #+512] +ldr q24, [x17, #+528] +sqrdmulh v11.4S, v6.4S, v17.s[2] +sub v5.4s, v20.4s, v23.4s +str q10, [x0, #512] +mul v4.4S, v4.4S,v12.s[1] +add v20.4s, v20.4s, v23.4s +mul v25.4S, v25.4S,v12.s[2] +str q5, [x0, #560] +mla v4.4S, v2.4S, v31.s[0] +sub v2.4s, v22.4s, v13.4s +mla v25.4S, v27.4S, v31.s[0] +str q20, [x0, #544] +mul v8.4S, v8.4S,v1.s[1] +str q2, [x0, #592] +mul v6.4S, v6.4S,v1.s[2] +add v22.4s, v22.4s, v13.4s +str q22, [x0, #576] +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v16.4s, v14.4s +str q3, [x0, #624] +mla v6.4S, v11.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +str q16, [x0, #608] +sqrdmulh v17.4S, v9.4S, v24.s[0] +sub v1.4s, v26.4s, v4.4s +mul v9.4S, v9.4S,v0.s[0] +str q1, [x0, #656] +ldr q1, [x0, #816] +sqrdmulh v16.4S, v1.4S, v24.s[0] +add v26.4s, v26.4s, v4.4s +mul v1.4S, v1.4S,v0.s[0] +str q26, [x0, #640] +ldr q26, [x17, #+544] +ldr q4, [x17, #+560] +ldr q14, [x0, #864] +sqrdmulh v11.4S, v14.4S, v4.s[0] +sub v3.4s, v7.4s, v25.4s +mul v14.4S, v14.4S,v26.s[0] +str q3, [x0, #688] +ldr q3, [x0, #880] +sqrdmulh v22.4S, v3.4S, v4.s[0] +add v7.4s, v7.4s, 
v25.4s +mul v3.4S, v3.4S,v26.s[0] +str q7, [x0, #672] +ldr q7, [x17, #+576] +ldr q25, [x17, #+592] +mla v9.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v8.4s +sqrdmulh v13.4S, v29.4S, v25.s[0] +str q17, [x0, #720] +ldr q17, [x0, #944] +mla v1.4S, v16.4S, v31.s[0] +add v19.4s, v19.4s, v8.4s +sqrdmulh v8.4S, v17.4S, v25.s[0] +str q19, [x0, #704] +ldr q19, [x17, #+608] +ldr q16, [x17, #+624] +mla v14.4S, v11.4S, v31.s[0] +sub v11.4s, v30.4s, v6.4s +sqrdmulh v2.4S, v15.4S, v16.s[0] +str q11, [x0, #752] +ldr q11, [x0, #1008] +mla v3.4S, v22.4S, v31.s[0] +add v30.4s, v30.4s, v6.4s +sqrdmulh v6.4S, v11.4S, v16.s[0] +str q30, [x0, #736] +ldr q30, [x0, #768] +ldr q22, [x0, #896] +mul v29.4S, v29.4S,v7.s[0] +sub v21.4s, v30.4s, v9.4s +ldr q12, [x0, #784] +mul v17.4S, v17.4S,v7.s[0] +add v30.4s, v30.4s, v9.4s +ldr q9, [x0, #912] +mla v29.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v1.4s +ldr q20, [x0, #832] +mla v17.4S, v8.4S, v31.s[0] +add v12.4s, v12.4s, v1.4s +ldr q1, [x0, #960] +mul v15.4S, v15.4S,v19.s[0] +sub v8.4s, v20.4s, v14.4s +ldr q27, [x0, #848] +mul v11.4S, v11.4S,v19.s[0] +add v20.4s, v20.4s, v14.4s +ldr q14, [x0, #976] +mla v15.4S, v2.4S, v31.s[0] +mla v11.4S, v6.4S, v31.s[0] +sub v6.4s, v27.4s, v3.4s +sqrdmulh v2.4S, v12.4S, v24.s[1] +add v27.4s, v27.4s, v3.4s +mul v12.4S, v12.4S,v0.s[1] +sqrdmulh v3.4S, v13.4S, v24.s[2] +sub v5.4s, v22.4s, v29.4s +mul v13.4S, v13.4S,v0.s[2] +add v22.4s, v22.4s, v29.4s +sqrdmulh v24.4S, v27.4S, v4.s[1] +sub v0.4s, v9.4s, v17.4s +mul v27.4S, v27.4S,v26.s[1] +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v6.4S, v4.s[2] +sub v29.4s, v1.4s, v15.4s +mul v6.4S, v6.4S,v26.s[2] +add v1.4s, v1.4s, v15.4s +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v14.4s, v11.4s +sqrdmulh v4.4S, v9.4S, v25.s[1] +add v14.4s, v14.4s, v11.4s +mla v13.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v0.4S, v25.s[2] +sub v11.4s, v30.4s, v12.4s +mla v27.4S, v24.4S, v31.s[0] +sqrdmulh v24.4S, v14.4S, v16.s[1] +add v30.4s, v30.4s, v12.4s +str q11, [x0, #784] +mla v6.4S, v17.4S, 
v31.s[0] +sqrdmulh v17.4S, v2.4S, v16.s[2] +sub v11.4s, v21.4s, v13.4s +str q30, [x0, #768] +mul v9.4S, v9.4S,v7.s[1] +add v21.4s, v21.4s, v13.4s +mul v0.4S, v0.4S,v7.s[2] +str q11, [x0, #816] +mla v9.4S, v4.4S, v31.s[0] +sub v4.4s, v20.4s, v27.4s +mla v0.4S, v3.4S, v31.s[0] +str q21, [x0, #800] +mul v14.4S, v14.4S,v19.s[1] +str q4, [x0, #848] +mul v2.4S, v2.4S,v19.s[2] +add v20.4s, v20.4s, v27.4s +str q20, [x0, #832] +mla v14.4S, v24.4S, v31.s[0] +sub v24.4s, v8.4s, v6.4s +str q24, [x0, #880] +mla v2.4S, v17.4S, v31.s[0] +add v8.4s, v8.4s, v6.4s +str q8, [x0, #864] +sub v16.4s, v22.4s, v9.4s +str q16, [x0, #912] +add v22.4s, v22.4s, v9.4s +str q22, [x0, #896] +sub v22.4s, v5.4s, v0.4s +str q22, [x0, #944] +add v5.4s, v5.4s, v0.4s +str q5, [x0, #928] +sub v5.4s, v1.4s, v14.4s +str q5, [x0, #976] +add v1.4s, v1.4s, v14.4s +str q1, [x0, #960] +sub v1.4s, v29.4s, v2.4s +str q1, [x0, #1008] +add v29.4s, v29.4s, v2.4s +str q29, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_13_z4_7.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_13_z4_7.s new file mode 100644 index 0000000..4533d8c --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_13_z4_7.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without 
restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, 
block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block 
None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 
+.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global ntt_u32_incomplete_neon_asm_var_4_2_13_z4_7 +.global _ntt_u32_incomplete_neon_asm_var_4_2_13_z4_7 +ntt_u32_incomplete_neon_asm_var_4_2_13_z4_7: +_ntt_u32_incomplete_neon_asm_var_4_2_13_z4_7: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #928] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #992] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q18, [x0, #800] +sqrdmulh v17.4S, 
v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +ldr q16, [x0, #864] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +ldr q2, [x0, #544] +mla v22.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v2.4S, v29.s[0] +ldr q1, [x0, #608] +mla v20.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v1.4S, v29.s[0] +ldr q0, [x0, #672] +mla v18.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v0.4S, v29.s[0] +ldr q15, [x0, #736] +mla v16.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v15.4S, v29.s[0] +ldr q14, [x0, #416] +ldr q13, [x0, #480] +mul v2.4S, v2.4S,v30.s[0] +sub v12.4s, v14.4s, v22.4s +mul v1.4S, v1.4S,v30.s[0] +add v14.4s, v14.4s, v22.4s +ldr q22, [x0, #288] +ldr q11, [x0, #352] +mla v2.4S, v21.4S, v31.s[0] +sub v21.4s, v13.4s, v20.4s +mla v1.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v20.4s +ldr q20, [x0, #32] +ldr q19, [x0, #96] +mul v0.4S, v0.4S,v30.s[0] +sub v10.4s, v22.4s, v18.4s +mul v15.4S, v15.4S,v30.s[0] +add v22.4s, v22.4s, v18.4s +ldr q18, [x0, #160] +ldr q9, [x0, #224] +mla v0.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v16.4s +mla v15.4S, v3.4S, v31.s[0] +add v11.4s, v11.4s, v16.4s +sqrdmulh v16.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +sqrdmulh v3.4S, v13.4S, v29.s[1] +sub v8.4s, v20.4s, v2.4s +mul v13.4S, v13.4S,v30.s[1] +add v20.4s, v20.4s, v2.4s +sqrdmulh v2.4S, v22.4S, v29.s[1] +sub v7.4s, v19.4s, v1.4s +mul v22.4S, v22.4S,v30.s[1] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v11.4S, v29.s[1] +sub v6.4s, v18.4s, v0.4s +mul v11.4S, v11.4S,v30.s[1] +add v18.4s, v18.4s, v0.4s +mla v14.4S, v16.4S, v31.s[0] +sub v16.4s, v9.4s, v15.4s +sqrdmulh v0.4S, v12.4S, v29.s[2] +add v9.4s, v9.4s, v15.4s +mla v13.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v21.4S, v29.s[2] +mla v22.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v10.4S, v29.s[2] +mla v11.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v17.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +sub v15.4s, v18.4s, v14.4s +mul v21.4S, v21.4S,v30.s[2] +add v18.4s, v18.4s, v14.4s +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v9.4s, v13.4s +mla v21.4S, v3.4S, v31.s[0] 
+add v9.4s, v9.4s, v13.4s +mul v10.4S, v10.4S,v30.s[2] +sub v13.4s, v20.4s, v22.4s +mul v17.4S, v17.4S,v30.s[2] +add v20.4s, v20.4s, v22.4s +mla v10.4S, v2.4S, v31.s[0] +sub v2.4s, v19.4s, v11.4s +mla v17.4S, v1.4S, v31.s[0] +add v19.4s, v19.4s, v11.4s +sqrdmulh v11.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +sqrdmulh v1.4S, v0.4S, v27.s[1] +sub v22.4s, v6.4s, v12.4s +mul v0.4S, v0.4S,v28.s[1] +add v6.4s, v6.4s, v12.4s +sqrdmulh v12.4S, v18.4S, v27.s[0] +sub v3.4s, v16.4s, v21.4s +mul v18.4S, v18.4S,v28.s[0] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v9.4S, v27.s[0] +sub v14.4s, v8.4s, v10.4s +mul v9.4S, v9.4S,v28.s[0] +add v8.4s, v8.4s, v10.4s +mla v15.4S, v11.4S, v31.s[0] +sub v11.4s, v7.4s, v17.4s +sqrdmulh v10.4S, v6.4S, v27.s[2] +add v7.4s, v7.4s, v17.4s +mla v0.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v16.4S, v27.s[2] +mla v18.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v22.4S, v27.s[3] +mla v9.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v3.4S, v27.s[3] +mul v6.4S, v6.4S,v28.s[2] +sub v17.4s, v13.4s, v15.4s +mul v16.4S, v16.4S,v28.s[2] +add v13.4s, v13.4s, v15.4s +mla v6.4S, v10.4S, v31.s[0] +sub v10.4s, v2.4s, v0.4s +mla v16.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v0.4s +mul v22.4S, v22.4S,v28.s[3] +sub v0.4s, v20.4s, v18.4s +mul v3.4S, v3.4S,v28.s[3] +add v20.4s, v20.4s, v18.4s +mla v22.4S, v12.4S, v31.s[0] +sub v12.4s, v19.4s, v9.4s +mla v3.4S, v21.4S, v31.s[0] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v2.4S, v25.s[2] +mul v2.4S, v2.4S,v26.s[2] +sqrdmulh v21.4S, v10.4S, v25.s[3] +sub v18.4s, v8.4s, v6.4s +mul v10.4S, v10.4S,v26.s[3] +add v8.4s, v8.4s, v6.4s +sqrdmulh v6.4S, v12.4S, v25.s[1] +sub v1.4s, v7.4s, v16.4s +mul v12.4S, v12.4S,v26.s[1] +add v7.4s, v7.4s, v16.4s +sqrdmulh v16.4S, v19.4S, v25.s[0] +sub v15.4s, v14.4s, v22.4s +mul v19.4S, v19.4S,v26.s[0] +add v14.4s, v14.4s, v22.4s +mla v2.4S, v9.4S, v31.s[0] +sub v9.4s, v11.4s, v3.4s +sqrdmulh v22.4S, v7.4S, v23.s[0] +add v11.4s, v11.4s, v3.4s +mla v10.4S, v21.4S, v31.s[0] +sub v21.4s, v13.4s, v2.4s 
+sqrdmulh v3.4S, v1.4S, v23.s[1] +add v13.4s, v13.4s, v2.4s +mla v12.4S, v6.4S, v31.s[0] +sub v6.4s, v17.4s, v10.4s +sqrdmulh v2.4S, v11.4S, v23.s[2] +add v17.4s, v17.4s, v10.4s +mla v19.4S, v16.4S, v31.s[0] +sub v16.4s, v0.4s, v12.4s +sqrdmulh v10.4S, v9.4S, v23.s[3] +add v0.4s, v0.4s, v12.4s +mul v7.4S, v7.4S,v24.s[0] +sub v12.4s, v20.4s, v19.4s +mul v1.4S, v1.4S,v24.s[1] +add v20.4s, v20.4s, v19.4s +mla v7.4S, v22.4S, v31.s[0] +str q21, [x0, #352] +mla v1.4S, v3.4S, v31.s[0] +str q13, [x0, #288] +mul v11.4S, v11.4S,v24.s[2] +str q6, [x0, #480] +mul v9.4S, v9.4S,v24.s[3] +str q17, [x0, #416] +mla v11.4S, v2.4S, v31.s[0] +str q16, [x0, #224] +mla v9.4S, v10.4S, v31.s[0] +str q0, [x0, #160] +ldr q0, [x0, #944] +sqrdmulh v10.4S, v0.4S, v29.s[0] +str q12, [x0, #96] +mul v0.4S, v0.4S,v30.s[0] +str q20, [x0, #32] +ldr q20, [x0, #1008] +sqrdmulh v12.4S, v20.4S, v29.s[0] +sub v16.4s, v8.4s, v7.4s +str q16, [x0, #608] +mul v20.4S, v20.4S,v30.s[0] +add v8.4s, v8.4s, v7.4s +ldr q7, [x0, #816] +sqrdmulh v16.4S, v7.4S, v29.s[0] +sub v2.4s, v18.4s, v1.4s +str q8, [x0, #544] +mul v7.4S, v7.4S,v30.s[0] +add v18.4s, v18.4s, v1.4s +ldr q1, [x0, #880] +sqrdmulh v8.4S, v1.4S, v29.s[0] +sub v17.4s, v14.4s, v11.4s +str q2, [x0, #736] +mul v1.4S, v1.4S,v30.s[0] +add v14.4s, v14.4s, v11.4s +ldr q11, [x0, #560] +mla v0.4S, v10.4S, v31.s[0] +sub v10.4s, v15.4s, v9.4s +str q18, [x0, #672] +sqrdmulh v18.4S, v11.4S, v29.s[0] +add v15.4s, v15.4s, v9.4s +ldr q9, [x0, #624] +mla v20.4S, v12.4S, v31.s[0] +str q17, [x0, #864] +sqrdmulh v17.4S, v9.4S, v29.s[0] +ldr q12, [x0, #688] +mla v7.4S, v16.4S, v31.s[0] +str q14, [x0, #800] +sqrdmulh v14.4S, v12.4S, v29.s[0] +ldr q16, [x0, #752] +mla v1.4S, v8.4S, v31.s[0] +str q10, [x0, #992] +sqrdmulh v10.4S, v16.4S, v29.s[0] +ldr q8, [x0, #432] +ldr q2, [x0, #496] +mul v11.4S, v11.4S,v30.s[0] +sub v6.4s, v8.4s, v0.4s +str q15, [x0, #928] +mul v9.4S, v9.4S,v30.s[0] +add v8.4s, v8.4s, v0.4s +ldr q0, [x0, #304] +ldr q15, [x0, #368] +mla v11.4S, v18.4S, 
v31.s[0] +sub v18.4s, v2.4s, v20.4s +mla v9.4S, v17.4S, v31.s[0] +add v2.4s, v2.4s, v20.4s +ldr q20, [x0, #48] +ldr q17, [x0, #112] +mul v12.4S, v12.4S,v30.s[0] +sub v13.4s, v0.4s, v7.4s +mul v16.4S, v16.4S,v30.s[0] +add v0.4s, v0.4s, v7.4s +ldr q7, [x0, #176] +ldr q3, [x0, #240] +mla v12.4S, v14.4S, v31.s[0] +sub v14.4s, v15.4s, v1.4s +mla v16.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sqrdmulh v10.4S, v2.4S, v29.s[1] +sub v21.4s, v20.4s, v11.4s +mul v2.4S, v2.4S,v30.s[1] +add v20.4s, v20.4s, v11.4s +sqrdmulh v11.4S, v0.4S, v29.s[1] +sub v22.4s, v17.4s, v9.4s +mul v0.4S, v0.4S,v30.s[1] +add v17.4s, v17.4s, v9.4s +sqrdmulh v9.4S, v15.4S, v29.s[1] +sub v19.4s, v7.4s, v12.4s +mul v15.4S, v15.4S,v30.s[1] +add v7.4s, v7.4s, v12.4s +mla v8.4S, v1.4S, v31.s[0] +sub v1.4s, v3.4s, v16.4s +sqrdmulh v12.4S, v6.4S, v29.s[2] +add v3.4s, v3.4s, v16.4s +mla v2.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v18.4S, v29.s[2] +mla v0.4S, v11.4S, v31.s[0] +sqrdmulh v11.4S, v13.4S, v29.s[2] +mla v15.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v14.4S, v29.s[2] +mul v6.4S, v6.4S,v30.s[2] +sub v16.4s, v7.4s, v8.4s +mul v18.4S, v18.4S,v30.s[2] +add v7.4s, v7.4s, v8.4s +mla v6.4S, v12.4S, v31.s[0] +sub v12.4s, v3.4s, v2.4s +mla v18.4S, v10.4S, v31.s[0] +add v3.4s, v3.4s, v2.4s +mul v13.4S, v13.4S,v30.s[2] +sub v2.4s, v20.4s, v0.4s +mul v14.4S, v14.4S,v30.s[2] +add v20.4s, v20.4s, v0.4s +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v17.4s, v15.4s +mla v14.4S, v9.4S, v31.s[0] +add v17.4s, v17.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +sqrdmulh v9.4S, v12.4S, v27.s[1] +sub v0.4s, v19.4s, v6.4s +mul v12.4S, v12.4S,v28.s[1] +add v19.4s, v19.4s, v6.4s +sqrdmulh v6.4S, v7.4S, v27.s[0] +sub v10.4s, v1.4s, v18.4s +mul v7.4S, v7.4S,v28.s[0] +add v1.4s, v1.4s, v18.4s +sqrdmulh v18.4S, v3.4S, v27.s[0] +sub v8.4s, v21.4s, v13.4s +mul v3.4S, v3.4S,v28.s[0] +add v21.4s, v21.4s, v13.4s +mla v16.4S, v15.4S, v31.s[0] +sub 
v15.4s, v22.4s, v14.4s +sqrdmulh v13.4S, v19.4S, v27.s[2] +add v22.4s, v22.4s, v14.4s +mla v12.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v1.4S, v27.s[2] +mla v7.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v0.4S, v27.s[3] +mla v3.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v10.4S, v27.s[3] +mul v19.4S, v19.4S,v28.s[2] +sub v14.4s, v2.4s, v16.4s +mul v1.4S, v1.4S,v28.s[2] +add v2.4s, v2.4s, v16.4s +mla v19.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v12.4s +mla v1.4S, v9.4S, v31.s[0] +add v11.4s, v11.4s, v12.4s +mul v0.4S, v0.4S,v28.s[3] +sub v12.4s, v20.4s, v7.4s +mul v10.4S, v10.4S,v28.s[3] +add v20.4s, v20.4s, v7.4s +mla v0.4S, v6.4S, v31.s[0] +sub v6.4s, v17.4s, v3.4s +mla v10.4S, v18.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v25.s[2] +mul v11.4S, v11.4S,v26.s[2] +sqrdmulh v18.4S, v13.4S, v25.s[3] +sub v7.4s, v21.4s, v19.4s +mul v13.4S, v13.4S,v26.s[3] +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v25.s[1] +sub v9.4s, v22.4s, v1.4s +mul v6.4S, v6.4S,v26.s[1] +add v22.4s, v22.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v25.s[0] +sub v16.4s, v8.4s, v0.4s +mul v17.4S, v17.4S,v26.s[0] +add v8.4s, v8.4s, v0.4s +mla v11.4S, v3.4S, v31.s[0] +sub v3.4s, v15.4s, v10.4s +sqrdmulh v0.4S, v22.4S, v23.s[0] +add v15.4s, v15.4s, v10.4s +mla v13.4S, v18.4S, v31.s[0] +sub v18.4s, v2.4s, v11.4s +sqrdmulh v10.4S, v9.4S, v23.s[1] +add v2.4s, v2.4s, v11.4s +mla v6.4S, v19.4S, v31.s[0] +sub v19.4s, v14.4s, v13.4s +sqrdmulh v11.4S, v15.4S, v23.s[2] +add v14.4s, v14.4s, v13.4s +mla v17.4S, v1.4S, v31.s[0] +sub v1.4s, v12.4s, v6.4s +sqrdmulh v13.4S, v3.4S, v23.s[3] +add v12.4s, v12.4s, v6.4s +mul v22.4S, v22.4S,v24.s[0] +sub v6.4s, v20.4s, v17.4s +mul v9.4S, v9.4S,v24.s[1] +add v20.4s, v20.4s, v17.4s +mla v22.4S, v0.4S, v31.s[0] +str q18, [x0, #368] +mla v9.4S, v10.4S, v31.s[0] +str q2, [x0, #304] +mul v15.4S, v15.4S,v24.s[2] +str q19, [x0, #496] +mul v3.4S, v3.4S,v24.s[3] +str q14, [x0, #432] +mla v15.4S, v11.4S, v31.s[0] +str q1, [x0, #240] +mla v3.4S, v13.4S, v31.s[0] +str q12, [x0, 
#176] +ldr q12, [x0, #896] +sqrdmulh v13.4S, v12.4S, v29.s[0] +str q6, [x0, #112] +mul v12.4S, v12.4S,v30.s[0] +str q20, [x0, #48] +ldr q20, [x0, #960] +sqrdmulh v6.4S, v20.4S, v29.s[0] +sub v1.4s, v21.4s, v22.4s +str q1, [x0, #624] +mul v20.4S, v20.4S,v30.s[0] +add v21.4s, v21.4s, v22.4s +ldr q22, [x0, #768] +sqrdmulh v1.4S, v22.4S, v29.s[0] +sub v11.4s, v7.4s, v9.4s +str q21, [x0, #560] +mul v22.4S, v22.4S,v30.s[0] +add v7.4s, v7.4s, v9.4s +ldr q9, [x0, #832] +sqrdmulh v21.4S, v9.4S, v29.s[0] +sub v14.4s, v8.4s, v15.4s +str q11, [x0, #752] +mul v9.4S, v9.4S,v30.s[0] +add v8.4s, v8.4s, v15.4s +ldr q15, [x0, #512] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v3.4s +str q7, [x0, #688] +sqrdmulh v7.4S, v15.4S, v29.s[0] +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #576] +mla v20.4S, v6.4S, v31.s[0] +str q14, [x0, #880] +sqrdmulh v14.4S, v3.4S, v29.s[0] +ldr q6, [x0, #640] +mla v22.4S, v1.4S, v31.s[0] +str q8, [x0, #816] +sqrdmulh v8.4S, v6.4S, v29.s[0] +ldr q1, [x0, #704] +mla v9.4S, v21.4S, v31.s[0] +str q13, [x0, #1008] +sqrdmulh v13.4S, v1.4S, v29.s[0] +ldr q21, [x0, #384] +ldr q11, [x0, #448] +mul v15.4S, v15.4S,v30.s[0] +sub v19.4s, v21.4s, v12.4s +str q16, [x0, #944] +mul v3.4S, v3.4S,v30.s[0] +add v21.4s, v21.4s, v12.4s +ldr q12, [x0, #256] +ldr q16, [x0, #320] +mla v15.4S, v7.4S, v31.s[0] +sub v7.4s, v11.4s, v20.4s +mla v3.4S, v14.4S, v31.s[0] +add v11.4s, v11.4s, v20.4s +ldr q20, [x0, #0] +ldr q14, [x0, #64] +mul v6.4S, v6.4S,v30.s[0] +sub v2.4s, v12.4s, v22.4s +mul v1.4S, v1.4S,v30.s[0] +add v12.4s, v12.4s, v22.4s +ldr q22, [x0, #128] +ldr q10, [x0, #192] +mla v6.4S, v8.4S, v31.s[0] +sub v8.4s, v16.4s, v9.4s +mla v1.4S, v13.4S, v31.s[0] +add v16.4s, v16.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sqrdmulh v13.4S, v11.4S, v29.s[1] +sub v18.4s, v20.4s, v15.4s +mul v11.4S, v11.4S,v30.s[1] +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v12.4S, v29.s[1] +sub v0.4s, v14.4s, v3.4s +mul v12.4S, v12.4S,v30.s[1] +add v14.4s, v14.4s, 
v3.4s +sqrdmulh v3.4S, v16.4S, v29.s[1] +sub v17.4s, v22.4s, v6.4s +mul v16.4S, v16.4S,v30.s[1] +add v22.4s, v22.4s, v6.4s +mla v21.4S, v9.4S, v31.s[0] +sub v9.4s, v10.4s, v1.4s +sqrdmulh v6.4S, v19.4S, v29.s[2] +add v10.4s, v10.4s, v1.4s +mla v11.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v7.4S, v29.s[2] +mla v12.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v2.4S, v29.s[2] +mla v16.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v8.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v1.4s, v22.4s, v21.4s +mul v7.4S, v7.4S,v30.s[2] +add v22.4s, v22.4s, v21.4s +mla v19.4S, v6.4S, v31.s[0] +sub v6.4s, v10.4s, v11.4s +mla v7.4S, v13.4S, v31.s[0] +add v10.4s, v10.4s, v11.4s +mul v2.4S, v2.4S,v30.s[2] +sub v11.4s, v20.4s, v12.4s +mul v8.4S, v8.4S,v30.s[2] +add v20.4s, v20.4s, v12.4s +mla v2.4S, v15.4S, v31.s[0] +sub v15.4s, v14.4s, v16.4s +mla v8.4S, v3.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +sqrdmulh v16.4S, v1.4S, v27.s[1] +mul v1.4S, v1.4S,v28.s[1] +sqrdmulh v3.4S, v6.4S, v27.s[1] +sub v12.4s, v17.4s, v19.4s +mul v6.4S, v6.4S,v28.s[1] +add v17.4s, v17.4s, v19.4s +sqrdmulh v19.4S, v22.4S, v27.s[0] +sub v13.4s, v9.4s, v7.4s +mul v22.4S, v22.4S,v28.s[0] +add v9.4s, v9.4s, v7.4s +sqrdmulh v7.4S, v10.4S, v27.s[0] +sub v21.4s, v18.4s, v2.4s +mul v10.4S, v10.4S,v28.s[0] +add v18.4s, v18.4s, v2.4s +mla v1.4S, v16.4S, v31.s[0] +sub v16.4s, v0.4s, v8.4s +sqrdmulh v2.4S, v17.4S, v27.s[2] +add v0.4s, v0.4s, v8.4s +mla v6.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v9.4S, v27.s[2] +mla v22.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v12.4S, v27.s[3] +mla v10.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v13.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[2] +sub v8.4s, v11.4s, v1.4s +mul v9.4S, v9.4S,v28.s[2] +add v11.4s, v11.4s, v1.4s +mla v17.4S, v2.4S, v31.s[0] +sub v2.4s, v15.4s, v6.4s +mla v9.4S, v3.4S, v31.s[0] +add v15.4s, v15.4s, v6.4s +mul v12.4S, v12.4S,v28.s[3] +sub v6.4s, v20.4s, v22.4s +mul v13.4S, v13.4S,v28.s[3] +add v20.4s, v20.4s, v22.4s +mla v12.4S, v19.4S, v31.s[0] +sub v19.4s, v14.4s, v10.4s +mla v13.4S, v7.4S, 
v31.s[0] +add v14.4s, v14.4s, v10.4s +sqrdmulh v10.4S, v15.4S, v25.s[2] +mul v15.4S, v15.4S,v26.s[2] +sqrdmulh v7.4S, v2.4S, v25.s[3] +sub v22.4s, v18.4s, v17.4s +mul v2.4S, v2.4S,v26.s[3] +add v18.4s, v18.4s, v17.4s +sqrdmulh v17.4S, v19.4S, v25.s[1] +sub v3.4s, v0.4s, v9.4s +mul v19.4S, v19.4S,v26.s[1] +add v0.4s, v0.4s, v9.4s +sqrdmulh v9.4S, v14.4S, v25.s[0] +sub v1.4s, v21.4s, v12.4s +mul v14.4S, v14.4S,v26.s[0] +add v21.4s, v21.4s, v12.4s +mla v15.4S, v10.4S, v31.s[0] +sub v10.4s, v16.4s, v13.4s +sqrdmulh v12.4S, v0.4S, v23.s[0] +add v16.4s, v16.4s, v13.4s +mla v2.4S, v7.4S, v31.s[0] +sub v7.4s, v11.4s, v15.4s +sqrdmulh v13.4S, v3.4S, v23.s[1] +add v11.4s, v11.4s, v15.4s +mla v19.4S, v17.4S, v31.s[0] +sub v17.4s, v8.4s, v2.4s +sqrdmulh v15.4S, v16.4S, v23.s[2] +add v8.4s, v8.4s, v2.4s +mla v14.4S, v9.4S, v31.s[0] +sub v9.4s, v6.4s, v19.4s +sqrdmulh v2.4S, v10.4S, v23.s[3] +add v6.4s, v6.4s, v19.4s +mul v0.4S, v0.4S,v24.s[0] +sub v19.4s, v20.4s, v14.4s +mul v3.4S, v3.4S,v24.s[1] +add v20.4s, v20.4s, v14.4s +mla v0.4S, v12.4S, v31.s[0] +str q7, [x0, #320] +mla v3.4S, v13.4S, v31.s[0] +str q11, [x0, #256] +mul v16.4S, v16.4S,v24.s[2] +str q17, [x0, #448] +mul v10.4S, v10.4S,v24.s[3] +str q8, [x0, #384] +mla v16.4S, v15.4S, v31.s[0] +str q9, [x0, #192] +mla v10.4S, v2.4S, v31.s[0] +str q6, [x0, #128] +ldr q6, [x0, #912] +sqrdmulh v2.4S, v6.4S, v29.s[0] +str q19, [x0, #64] +mul v6.4S, v6.4S,v30.s[0] +str q20, [x0, #0] +ldr q20, [x0, #976] +sqrdmulh v19.4S, v20.4S, v29.s[0] +sub v9.4s, v18.4s, v0.4s +str q9, [x0, #576] +mul v20.4S, v20.4S,v30.s[0] +add v18.4s, v18.4s, v0.4s +ldr q0, [x0, #784] +sqrdmulh v9.4S, v0.4S, v29.s[0] +sub v15.4s, v22.4s, v3.4s +str q18, [x0, #512] +mul v0.4S, v0.4S,v30.s[0] +add v22.4s, v22.4s, v3.4s +ldr q3, [x0, #848] +sqrdmulh v18.4S, v3.4S, v29.s[0] +sub v8.4s, v21.4s, v16.4s +str q15, [x0, #704] +mul v3.4S, v3.4S,v30.s[0] +add v21.4s, v21.4s, v16.4s +ldr q16, [x0, #528] +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v1.4s, v10.4s +str q22, 
[x0, #640] +sqrdmulh v22.4S, v16.4S, v29.s[0] +add v1.4s, v1.4s, v10.4s +ldr q10, [x0, #592] +mla v20.4S, v19.4S, v31.s[0] +str q8, [x0, #832] +sqrdmulh v8.4S, v10.4S, v29.s[0] +ldr q19, [x0, #656] +mla v0.4S, v9.4S, v31.s[0] +str q21, [x0, #768] +sqrdmulh v21.4S, v19.4S, v29.s[0] +ldr q9, [x0, #720] +mla v3.4S, v18.4S, v31.s[0] +str q2, [x0, #960] +sqrdmulh v2.4S, v9.4S, v29.s[0] +ldr q18, [x0, #400] +ldr q15, [x0, #464] +mul v16.4S, v16.4S,v30.s[0] +sub v17.4s, v18.4s, v6.4s +str q1, [x0, #896] +mul v10.4S, v10.4S,v30.s[0] +add v18.4s, v18.4s, v6.4s +ldr q6, [x0, #272] +ldr q1, [x0, #336] +mla v16.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v20.4s +mla v10.4S, v8.4S, v31.s[0] +add v15.4s, v15.4s, v20.4s +ldr q20, [x0, #16] +ldr q8, [x0, #80] +mul v19.4S, v19.4S,v30.s[0] +sub v11.4s, v6.4s, v0.4s +mul v9.4S, v9.4S,v30.s[0] +add v6.4s, v6.4s, v0.4s +ldr q0, [x0, #144] +ldr q13, [x0, #208] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v1.4s, v3.4s +mla v9.4S, v2.4S, v31.s[0] +add v1.4s, v1.4s, v3.4s +sqrdmulh v3.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sqrdmulh v2.4S, v15.4S, v29.s[1] +sub v7.4s, v20.4s, v16.4s +mul v15.4S, v15.4S,v30.s[1] +add v20.4s, v20.4s, v16.4s +sqrdmulh v16.4S, v6.4S, v29.s[1] +sub v12.4s, v8.4s, v10.4s +mul v6.4S, v6.4S,v30.s[1] +add v8.4s, v8.4s, v10.4s +sqrdmulh v10.4S, v1.4S, v29.s[1] +sub v14.4s, v0.4s, v19.4s +mul v1.4S, v1.4S,v30.s[1] +add v0.4s, v0.4s, v19.4s +mla v18.4S, v3.4S, v31.s[0] +sub v3.4s, v13.4s, v9.4s +sqrdmulh v19.4S, v17.4S, v29.s[2] +add v13.4s, v13.4s, v9.4s +mla v15.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v22.4S, v29.s[2] +mla v6.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v11.4S, v29.s[2] +mla v1.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v21.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v9.4s, v0.4s, v18.4s +mul v22.4S, v22.4S,v30.s[2] +add v0.4s, v0.4s, v18.4s +mla v17.4S, v19.4S, v31.s[0] +sub v19.4s, v13.4s, v15.4s +mla v22.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v15.4s +mul v11.4S, v11.4S,v30.s[2] +sub v15.4s, 
v20.4s, v6.4s +mul v21.4S, v21.4S,v30.s[2] +add v20.4s, v20.4s, v6.4s +mla v11.4S, v16.4S, v31.s[0] +sub v16.4s, v8.4s, v1.4s +mla v21.4S, v10.4S, v31.s[0] +add v8.4s, v8.4s, v1.4s +sqrdmulh v29.4S, v9.4S, v27.s[1] +mul v9.4S, v9.4S,v28.s[1] +sqrdmulh v30.4S, v19.4S, v27.s[1] +sub v1.4s, v14.4s, v17.4s +mul v19.4S, v19.4S,v28.s[1] +add v14.4s, v14.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[0] +sub v10.4s, v3.4s, v22.4s +mul v0.4S, v0.4S,v28.s[0] +add v3.4s, v3.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v27.s[0] +sub v6.4s, v7.4s, v11.4s +mul v13.4S, v13.4S,v28.s[0] +add v7.4s, v7.4s, v11.4s +mla v9.4S, v29.4S, v31.s[0] +sub v29.4s, v12.4s, v21.4s +sqrdmulh v11.4S, v14.4S, v27.s[2] +add v12.4s, v12.4s, v21.4s +mla v19.4S, v30.4S, v31.s[0] +sqrdmulh v30.4S, v3.4S, v27.s[2] +mla v0.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v1.4S, v27.s[3] +mla v13.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v10.4S, v27.s[3] +mul v14.4S, v14.4S,v28.s[2] +sub v21.4s, v15.4s, v9.4s +mul v3.4S, v3.4S,v28.s[2] +add v15.4s, v15.4s, v9.4s +mla v14.4S, v11.4S, v31.s[0] +sub v11.4s, v16.4s, v19.4s +mla v3.4S, v30.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +mul v1.4S, v1.4S,v28.s[3] +sub v19.4s, v20.4s, v0.4s +mul v10.4S, v10.4S,v28.s[3] +add v20.4s, v20.4s, v0.4s +mla v1.4S, v17.4S, v31.s[0] +sub v17.4s, v8.4s, v13.4s +mla v10.4S, v22.4S, v31.s[0] +add v8.4s, v8.4s, v13.4s +sqrdmulh v27.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sqrdmulh v28.4S, v11.4S, v25.s[3] +sub v13.4s, v7.4s, v14.4s +mul v11.4S, v11.4S,v26.s[3] +add v7.4s, v7.4s, v14.4s +sqrdmulh v14.4S, v17.4S, v25.s[1] +sub v22.4s, v12.4s, v3.4s +mul v17.4S, v17.4S,v26.s[1] +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v8.4S, v25.s[0] +sub v0.4s, v6.4s, v1.4s +mul v8.4S, v8.4S,v26.s[0] +add v6.4s, v6.4s, v1.4s +mla v16.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v10.4s +sqrdmulh v25.4S, v12.4S, v23.s[0] +add v29.4s, v29.4s, v10.4s +mla v11.4S, v28.4S, v31.s[0] +sub v28.4s, v15.4s, v16.4s +sqrdmulh v10.4S, v22.4S, v23.s[1] +add v15.4s, v15.4s, 
v16.4s +mla v17.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v11.4s +sqrdmulh v16.4S, v29.4S, v23.s[2] +add v21.4s, v21.4s, v11.4s +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v19.4s, v17.4s +sqrdmulh v11.4S, v27.4S, v23.s[3] +add v19.4s, v19.4s, v17.4s +mul v12.4S, v12.4S,v24.s[0] +sub v17.4s, v20.4s, v8.4s +mul v22.4S, v22.4S,v24.s[1] +add v20.4s, v20.4s, v8.4s +mla v12.4S, v25.4S, v31.s[0] +str q28, [x0, #336] +mla v22.4S, v10.4S, v31.s[0] +str q15, [x0, #272] +mul v29.4S, v29.4S,v24.s[2] +str q14, [x0, #464] +mul v27.4S, v27.4S,v24.s[3] +str q21, [x0, #400] +mla v29.4S, v16.4S, v31.s[0] +str q3, [x0, #208] +mla v27.4S, v11.4S, v31.s[0] +str q19, [x0, #144] +str q17, [x0, #80] +str q20, [x0, #16] +sub v20.4s, v7.4s, v12.4s +str q20, [x0, #592] +add v7.4s, v7.4s, v12.4s +sub v12.4s, v13.4s, v22.4s +str q7, [x0, #528] +add v13.4s, v13.4s, v22.4s +sub v22.4s, v6.4s, v29.4s +str q12, [x0, #720] +add v6.4s, v6.4s, v29.4s +sub v29.4s, v0.4s, v27.4s +str q13, [x0, #656] +add v0.4s, v0.4s, v27.4s +str q22, [x0, #848] +str q6, [x0, #784] +str q29, [x0, #976] +str q0, [x0, #912] +ldr q4, [x0, #224] +ldr q5, [x0, #160] +ldr q18, [x0, #32] +ldr q2, [x17, #+128] +ldr q9, [x17, #+144] +sqrdmulh v30.4S, v18.4S, v9.s[0] +mul v18.4S, v18.4S,v2.s[0] +ldr q1, [x0, #48] +sqrdmulh v26.4S, v1.4S, v9.s[0] +mul v1.4S, v1.4S,v2.s[0] +ldr q8, [x17, #+160] +ldr q25, [x17, #+176] +ldr q28, [x0, #96] +sqrdmulh v10.4S, v28.4S, v25.s[0] +mul v28.4S, v28.4S,v8.s[0] +ldr q15, [x0, #112] +sqrdmulh v14.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v8.s[0] +ldr q21, [x17, #+192] +ldr q16, [x17, #+208] +mla v18.4S, v30.4S, v31.s[0] +sqrdmulh v30.4S, v5.4S, v16.s[0] +ldr q3, [x0, #176] +mla v1.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v3.4S, v16.s[0] +ldr q11, [x17, #+224] +ldr q19, [x17, #+240] +mla v28.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v4.4S, v19.s[0] +ldr q24, [x0, #240] +mla v15.4S, v14.4S, v31.s[0] +sqrdmulh v14.4S, v24.4S, v19.s[0] +ldr q23, [x0, #0] +ldr q17, [x0, #128] +mul v5.4S, v5.4S,v21.s[0] +sub 
v20.4s, v23.4s, v18.4s +ldr q7, [x0, #16] +mul v3.4S, v3.4S,v21.s[0] +add v23.4s, v23.4s, v18.4s +ldr q18, [x0, #144] +mla v5.4S, v30.4S, v31.s[0] +sub v30.4s, v7.4s, v1.4s +ldr q12, [x0, #64] +mla v3.4S, v26.4S, v31.s[0] +add v7.4s, v7.4s, v1.4s +ldr q1, [x0, #192] +mul v4.4S, v4.4S,v11.s[0] +sub v26.4s, v12.4s, v28.4s +ldr q13, [x0, #80] +mul v24.4S, v24.4S,v11.s[0] +add v12.4s, v12.4s, v28.4s +ldr q28, [x0, #208] +mla v4.4S, v10.4S, v31.s[0] +mla v24.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v15.4s +sqrdmulh v10.4S, v7.4S, v9.s[1] +add v13.4s, v13.4s, v15.4s +mul v7.4S, v7.4S,v2.s[1] +sqrdmulh v15.4S, v30.4S, v9.s[2] +sub v27.4s, v17.4s, v5.4s +mul v30.4S, v30.4S,v2.s[2] +add v17.4s, v17.4s, v5.4s +sqrdmulh v9.4S, v13.4S, v25.s[1] +sub v2.4s, v18.4s, v3.4s +mul v13.4S, v13.4S,v8.s[1] +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v25.s[2] +sub v5.4s, v1.4s, v4.4s +mul v14.4S, v14.4S,v8.s[2] +add v1.4s, v1.4s, v4.4s +mla v7.4S, v10.4S, v31.s[0] +sub v10.4s, v28.4s, v24.4s +ldr q25, [x0, #480] +sqrdmulh v8.4S, v18.4S, v16.s[1] +add v28.4s, v28.4s, v24.4s +mla v30.4S, v15.4S, v31.s[0] +ldr q15, [x0, #416] +sqrdmulh v24.4S, v2.4S, v16.s[2] +sub v4.4s, v23.4s, v7.4s +mla v13.4S, v9.4S, v31.s[0] +ldr q9, [x0, #288] +sqrdmulh v22.4S, v28.4S, v19.s[1] +add v23.4s, v23.4s, v7.4s +str q4, [x0, #16] +mla v14.4S, v3.4S, v31.s[0] +ldr q3, [x17, #+256] +ldr q4, [x17, #+272] +sqrdmulh v7.4S, v10.4S, v19.s[2] +sub v6.4s, v20.4s, v30.4s +str q23, [x0, #0] +mul v18.4S, v18.4S,v21.s[1] +add v20.4s, v20.4s, v30.4s +mul v2.4S, v2.4S,v21.s[2] +str q6, [x0, #48] +mla v18.4S, v8.4S, v31.s[0] +sub v8.4s, v12.4s, v13.4s +mla v2.4S, v24.4S, v31.s[0] +str q20, [x0, #32] +mul v28.4S, v28.4S,v11.s[1] +str q8, [x0, #80] +mul v10.4S, v10.4S,v11.s[2] +add v12.4s, v12.4s, v13.4s +str q12, [x0, #64] +mla v28.4S, v22.4S, v31.s[0] +sub v22.4s, v26.4s, v14.4s +str q22, [x0, #112] +mla v10.4S, v7.4S, v31.s[0] +add v26.4s, v26.4s, v14.4s +str q26, [x0, #96] +sqrdmulh v19.4S, v9.4S, v4.s[0] +sub 
v11.4s, v17.4s, v18.4s +mul v9.4S, v9.4S,v3.s[0] +str q11, [x0, #144] +ldr q11, [x0, #304] +sqrdmulh v26.4S, v11.4S, v4.s[0] +add v17.4s, v17.4s, v18.4s +mul v11.4S, v11.4S,v3.s[0] +str q17, [x0, #128] +ldr q17, [x17, #+288] +ldr q18, [x17, #+304] +ldr q14, [x0, #352] +sqrdmulh v7.4S, v14.4S, v18.s[0] +sub v22.4s, v27.4s, v2.4s +mul v14.4S, v14.4S,v17.s[0] +str q22, [x0, #176] +ldr q22, [x0, #368] +sqrdmulh v12.4S, v22.4S, v18.s[0] +add v27.4s, v27.4s, v2.4s +mul v22.4S, v22.4S,v17.s[0] +str q27, [x0, #160] +ldr q27, [x17, #+320] +ldr q2, [x17, #+336] +mla v9.4S, v19.4S, v31.s[0] +sub v19.4s, v1.4s, v28.4s +sqrdmulh v13.4S, v15.4S, v2.s[0] +str q19, [x0, #208] +ldr q19, [x0, #432] +mla v11.4S, v26.4S, v31.s[0] +add v1.4s, v1.4s, v28.4s +sqrdmulh v28.4S, v19.4S, v2.s[0] +str q1, [x0, #192] +ldr q1, [x17, #+352] +ldr q26, [x17, #+368] +mla v14.4S, v7.4S, v31.s[0] +sub v7.4s, v5.4s, v10.4s +sqrdmulh v8.4S, v25.4S, v26.s[0] +str q7, [x0, #240] +ldr q7, [x0, #496] +mla v22.4S, v12.4S, v31.s[0] +add v5.4s, v5.4s, v10.4s +sqrdmulh v10.4S, v7.4S, v26.s[0] +str q5, [x0, #224] +ldr q5, [x0, #256] +ldr q12, [x0, #384] +mul v15.4S, v15.4S,v27.s[0] +sub v16.4s, v5.4s, v9.4s +ldr q21, [x0, #272] +mul v19.4S, v19.4S,v27.s[0] +add v5.4s, v5.4s, v9.4s +ldr q9, [x0, #400] +mla v15.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v11.4s +ldr q20, [x0, #320] +mla v19.4S, v28.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +ldr q11, [x0, #448] +mul v25.4S, v25.4S,v1.s[0] +sub v28.4s, v20.4s, v14.4s +ldr q24, [x0, #336] +mul v7.4S, v7.4S,v1.s[0] +add v20.4s, v20.4s, v14.4s +ldr q14, [x0, #464] +mla v25.4S, v8.4S, v31.s[0] +mla v7.4S, v10.4S, v31.s[0] +sub v10.4s, v24.4s, v22.4s +sqrdmulh v8.4S, v21.4S, v4.s[1] +add v24.4s, v24.4s, v22.4s +mul v21.4S, v21.4S,v3.s[1] +sqrdmulh v22.4S, v13.4S, v4.s[2] +sub v6.4s, v12.4s, v15.4s +mul v13.4S, v13.4S,v3.s[2] +add v12.4s, v12.4s, v15.4s +sqrdmulh v4.4S, v24.4S, v18.s[1] +sub v3.4s, v9.4s, v19.4s +mul v24.4S, v24.4S,v17.s[1] +add v9.4s, v9.4s, v19.4s 
+sqrdmulh v19.4S, v10.4S, v18.s[2] +sub v15.4s, v11.4s, v25.4s +mul v10.4S, v10.4S,v17.s[2] +add v11.4s, v11.4s, v25.4s +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v14.4s, v7.4s +ldr q18, [x0, #736] +sqrdmulh v17.4S, v9.4S, v2.s[1] +add v14.4s, v14.4s, v7.4s +mla v13.4S, v22.4S, v31.s[0] +ldr q22, [x0, #672] +sqrdmulh v7.4S, v3.4S, v2.s[2] +sub v25.4s, v5.4s, v21.4s +mla v24.4S, v4.4S, v31.s[0] +ldr q4, [x0, #544] +sqrdmulh v30.4S, v14.4S, v26.s[1] +add v5.4s, v5.4s, v21.4s +str q25, [x0, #272] +mla v10.4S, v19.4S, v31.s[0] +ldr q19, [x17, #+384] +ldr q25, [x17, #+400] +sqrdmulh v21.4S, v8.4S, v26.s[2] +sub v23.4s, v16.4s, v13.4s +str q5, [x0, #256] +mul v9.4S, v9.4S,v27.s[1] +add v16.4s, v16.4s, v13.4s +mul v3.4S, v3.4S,v27.s[2] +str q23, [x0, #304] +mla v9.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v24.4s +mla v3.4S, v7.4S, v31.s[0] +str q16, [x0, #288] +mul v14.4S, v14.4S,v1.s[1] +str q17, [x0, #336] +mul v8.4S, v8.4S,v1.s[2] +add v20.4s, v20.4s, v24.4s +str q20, [x0, #320] +mla v14.4S, v30.4S, v31.s[0] +sub v30.4s, v28.4s, v10.4s +str q30, [x0, #368] +mla v8.4S, v21.4S, v31.s[0] +add v28.4s, v28.4s, v10.4s +str q28, [x0, #352] +sqrdmulh v26.4S, v4.4S, v25.s[0] +sub v1.4s, v12.4s, v9.4s +mul v4.4S, v4.4S,v19.s[0] +str q1, [x0, #400] +ldr q1, [x0, #560] +sqrdmulh v28.4S, v1.4S, v25.s[0] +add v12.4s, v12.4s, v9.4s +mul v1.4S, v1.4S,v19.s[0] +str q12, [x0, #384] +ldr q12, [x17, #+416] +ldr q9, [x17, #+432] +ldr q10, [x0, #608] +sqrdmulh v21.4S, v10.4S, v9.s[0] +sub v30.4s, v6.4s, v3.4s +mul v10.4S, v10.4S,v12.s[0] +str q30, [x0, #432] +ldr q30, [x0, #624] +sqrdmulh v20.4S, v30.4S, v9.s[0] +add v6.4s, v6.4s, v3.4s +mul v30.4S, v30.4S,v12.s[0] +str q6, [x0, #416] +ldr q6, [x17, #+448] +ldr q3, [x17, #+464] +mla v4.4S, v26.4S, v31.s[0] +sub v26.4s, v11.4s, v14.4s +sqrdmulh v24.4S, v22.4S, v3.s[0] +str q26, [x0, #464] +ldr q26, [x0, #688] +mla v1.4S, v28.4S, v31.s[0] +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v26.4S, v3.s[0] +str q11, [x0, #448] +ldr q11, [x17, #+480] 
+ldr q28, [x17, #+496] +mla v10.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v8.4s +sqrdmulh v17.4S, v18.4S, v28.s[0] +str q21, [x0, #496] +ldr q21, [x0, #752] +mla v30.4S, v20.4S, v31.s[0] +add v15.4s, v15.4s, v8.4s +sqrdmulh v8.4S, v21.4S, v28.s[0] +str q15, [x0, #480] +ldr q15, [x0, #512] +ldr q20, [x0, #640] +mul v22.4S, v22.4S,v6.s[0] +sub v2.4s, v15.4s, v4.4s +ldr q27, [x0, #528] +mul v26.4S, v26.4S,v6.s[0] +add v15.4s, v15.4s, v4.4s +ldr q4, [x0, #656] +mla v22.4S, v24.4S, v31.s[0] +sub v24.4s, v27.4s, v1.4s +ldr q16, [x0, #576] +mla v26.4S, v14.4S, v31.s[0] +add v27.4s, v27.4s, v1.4s +ldr q1, [x0, #704] +mul v18.4S, v18.4S,v11.s[0] +sub v14.4s, v16.4s, v10.4s +ldr q7, [x0, #592] +mul v21.4S, v21.4S,v11.s[0] +add v16.4s, v16.4s, v10.4s +ldr q10, [x0, #720] +mla v18.4S, v17.4S, v31.s[0] +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v7.4s, v30.4s +sqrdmulh v17.4S, v27.4S, v25.s[1] +add v7.4s, v7.4s, v30.4s +mul v27.4S, v27.4S,v19.s[1] +sqrdmulh v30.4S, v24.4S, v25.s[2] +sub v23.4s, v20.4s, v22.4s +mul v24.4S, v24.4S,v19.s[2] +add v20.4s, v20.4s, v22.4s +sqrdmulh v25.4S, v7.4S, v9.s[1] +sub v19.4s, v4.4s, v26.4s +mul v7.4S, v7.4S,v12.s[1] +add v4.4s, v4.4s, v26.4s +sqrdmulh v26.4S, v8.4S, v9.s[2] +sub v22.4s, v1.4s, v18.4s +mul v8.4S, v8.4S,v12.s[2] +add v1.4s, v1.4s, v18.4s +mla v27.4S, v17.4S, v31.s[0] +sub v17.4s, v10.4s, v21.4s +ldr q9, [x0, #992] +sqrdmulh v12.4S, v4.4S, v3.s[1] +add v10.4s, v10.4s, v21.4s +mla v24.4S, v30.4S, v31.s[0] +ldr q30, [x0, #928] +sqrdmulh v21.4S, v19.4S, v3.s[2] +sub v18.4s, v15.4s, v27.4s +mla v7.4S, v25.4S, v31.s[0] +ldr q25, [x0, #800] +sqrdmulh v13.4S, v10.4S, v28.s[1] +add v15.4s, v15.4s, v27.4s +str q18, [x0, #528] +mla v8.4S, v26.4S, v31.s[0] +ldr q26, [x17, #+512] +ldr q18, [x17, #+528] +sqrdmulh v27.4S, v17.4S, v28.s[2] +sub v5.4s, v2.4s, v24.4s +str q15, [x0, #512] +mul v4.4S, v4.4S,v6.s[1] +add v2.4s, v2.4s, v24.4s +mul v19.4S, v19.4S,v6.s[2] +str q5, [x0, #560] +mla v4.4S, v12.4S, v31.s[0] +sub v12.4s, v16.4s, v7.4s +mla 
v19.4S, v21.4S, v31.s[0] +str q2, [x0, #544] +mul v10.4S, v10.4S,v11.s[1] +str q12, [x0, #592] +mul v17.4S, v17.4S,v11.s[2] +add v16.4s, v16.4s, v7.4s +str q16, [x0, #576] +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v14.4s, v8.4s +str q13, [x0, #624] +mla v17.4S, v27.4S, v31.s[0] +add v14.4s, v14.4s, v8.4s +str q14, [x0, #608] +sqrdmulh v28.4S, v25.4S, v18.s[0] +sub v11.4s, v20.4s, v4.4s +mul v25.4S, v25.4S,v26.s[0] +str q11, [x0, #656] +ldr q11, [x0, #816] +sqrdmulh v14.4S, v11.4S, v18.s[0] +add v20.4s, v20.4s, v4.4s +mul v11.4S, v11.4S,v26.s[0] +str q20, [x0, #640] +ldr q20, [x17, #+544] +ldr q4, [x17, #+560] +ldr q8, [x0, #864] +sqrdmulh v27.4S, v8.4S, v4.s[0] +sub v13.4s, v23.4s, v19.4s +mul v8.4S, v8.4S,v20.s[0] +str q13, [x0, #688] +ldr q13, [x0, #880] +sqrdmulh v16.4S, v13.4S, v4.s[0] +add v23.4s, v23.4s, v19.4s +mul v13.4S, v13.4S,v20.s[0] +str q23, [x0, #672] +ldr q23, [x17, #+576] +ldr q19, [x17, #+592] +mla v25.4S, v28.4S, v31.s[0] +sub v28.4s, v1.4s, v10.4s +sqrdmulh v7.4S, v30.4S, v19.s[0] +str q28, [x0, #720] +ldr q28, [x0, #944] +mla v11.4S, v14.4S, v31.s[0] +add v1.4s, v1.4s, v10.4s +sqrdmulh v10.4S, v28.4S, v19.s[0] +str q1, [x0, #704] +ldr q1, [x17, #+608] +ldr q14, [x17, #+624] +mla v8.4S, v27.4S, v31.s[0] +sub v27.4s, v22.4s, v17.4s +sqrdmulh v12.4S, v9.4S, v14.s[0] +str q27, [x0, #752] +ldr q27, [x0, #1008] +mla v13.4S, v16.4S, v31.s[0] +add v22.4s, v22.4s, v17.4s +sqrdmulh v17.4S, v27.4S, v14.s[0] +str q22, [x0, #736] +ldr q22, [x0, #768] +ldr q16, [x0, #896] +mul v30.4S, v30.4S,v23.s[0] +sub v3.4s, v22.4s, v25.4s +ldr q6, [x0, #784] +mul v28.4S, v28.4S,v23.s[0] +add v22.4s, v22.4s, v25.4s +ldr q25, [x0, #912] +mla v30.4S, v7.4S, v31.s[0] +sub v7.4s, v6.4s, v11.4s +ldr q2, [x0, #832] +mla v28.4S, v10.4S, v31.s[0] +add v6.4s, v6.4s, v11.4s +ldr q11, [x0, #960] +mul v9.4S, v9.4S,v1.s[0] +sub v10.4s, v2.4s, v8.4s +ldr q21, [x0, #848] +mul v27.4S, v27.4S,v1.s[0] +add v2.4s, v2.4s, v8.4s +ldr q8, [x0, #976] +mla v9.4S, v12.4S, v31.s[0] +mla v27.4S, 
v17.4S, v31.s[0] +sub v17.4s, v21.4s, v13.4s +sqrdmulh v12.4S, v6.4S, v18.s[1] +add v21.4s, v21.4s, v13.4s +mul v6.4S, v6.4S,v26.s[1] +sqrdmulh v13.4S, v7.4S, v18.s[2] +sub v5.4s, v16.4s, v30.4s +mul v7.4S, v7.4S,v26.s[2] +add v16.4s, v16.4s, v30.4s +sqrdmulh v18.4S, v21.4S, v4.s[1] +sub v26.4s, v25.4s, v28.4s +mul v21.4S, v21.4S,v20.s[1] +add v25.4s, v25.4s, v28.4s +sqrdmulh v28.4S, v17.4S, v4.s[2] +sub v30.4s, v11.4s, v9.4s +mul v17.4S, v17.4S,v20.s[2] +add v11.4s, v11.4s, v9.4s +mla v6.4S, v12.4S, v31.s[0] +sub v12.4s, v8.4s, v27.4s +sqrdmulh v4.4S, v25.4S, v19.s[1] +add v8.4s, v8.4s, v27.4s +mla v7.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v26.4S, v19.s[2] +sub v27.4s, v22.4s, v6.4s +mla v21.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v8.4S, v14.s[1] +add v22.4s, v22.4s, v6.4s +str q27, [x0, #784] +mla v17.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v12.4S, v14.s[2] +sub v27.4s, v3.4s, v7.4s +str q22, [x0, #768] +mul v25.4S, v25.4S,v23.s[1] +add v3.4s, v3.4s, v7.4s +mul v26.4S, v26.4S,v23.s[2] +str q27, [x0, #816] +mla v25.4S, v4.4S, v31.s[0] +sub v4.4s, v2.4s, v21.4s +mla v26.4S, v13.4S, v31.s[0] +str q3, [x0, #800] +mul v8.4S, v8.4S,v1.s[1] +str q4, [x0, #848] +mul v12.4S, v12.4S,v1.s[2] +add v2.4s, v2.4s, v21.4s +str q2, [x0, #832] +mla v8.4S, v18.4S, v31.s[0] +sub v18.4s, v10.4s, v17.4s +str q18, [x0, #880] +mla v12.4S, v28.4S, v31.s[0] +add v10.4s, v10.4s, v17.4s +str q10, [x0, #864] +sub v14.4s, v16.4s, v25.4s +str q14, [x0, #912] +add v16.4s, v16.4s, v25.4s +str q16, [x0, #896] +sub v16.4s, v5.4s, v26.4s +str q16, [x0, #944] +add v5.4s, v5.4s, v26.4s +str q5, [x0, #928] +sub v5.4s, v11.4s, v8.4s +str q5, [x0, #976] +add v11.4s, v11.4s, v8.4s +str q11, [x0, #960] +sub v11.4s, v30.4s, v12.4s +str q11, [x0, #1008] +add v30.4s, v30.4s, v12.4s +str q30, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp 
x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_14_z4_7.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_14_z4_7.s new file mode 100644 index 0000000..2cf09d4 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_14_z4_7.s @@ -0,0 +1,1578 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include /* NOTE(review): the header name is missing after #include in this copy; presumably it is the header that provides the ASM_LOAD macro used by the routine below - confirm against upstream. */ +modulus: /* Constant vector for the routine below: lane 0 holds -33556993 and the remaining three words are zero, filling one 16-byte vector. The routine loads it into q31 and uses v31.s[0] as the multiplier of its mla instructions. */ +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: /* Constant table the routine below addresses through x17, 16 bytes (four words) at a time. Groups alternate: four values that the code uses as mul operands, then four matching companion values that it uses as sqrdmulh operands; the trailing comments name the layer/block each word belongs to. Zero words tagged "Layer None" appear to be padding that fills a group to four lanes. NOTE(review): each companion looks like round(value * 2^31 / 33556993) - confirm against the generator. */ +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global
ntt_u32_incomplete_neon_asm_var_4_2_14_z4_7 +.global _ntt_u32_incomplete_neon_asm_var_4_2_14_z4_7 +ntt_u32_incomplete_neon_asm_var_4_2_14_z4_7: +_ntt_u32_incomplete_neon_asm_var_4_2_14_z4_7: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x0, #928] +ldr q29, [x17, #+0] +ldr q28, [x17, #+16] +sqrdmulh v27.4S, v30.4S, v28.s[0] +mul v30.4S, v30.4S,v29.s[0] +ldr q26, [x0, #992] +sqrdmulh v25.4S, v26.4S, v28.s[0] +mul v26.4S, v26.4S,v29.s[0] +ldr q24, [x0, #800] +sqrdmulh v23.4S, v24.4S, v28.s[0] +mul v24.4S, v24.4S,v29.s[0] +ldr q22, [x0, #864] +sqrdmulh v21.4S, v22.4S, v28.s[0] +mul v22.4S, v22.4S,v29.s[0] +ldr q20, [x0, #544] +mla v30.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v20.4S, v28.s[0] +ldr q19, [x0, #608] +mla v26.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v19.4S, v28.s[0] +nop +ldr q18, [x0, #672] +mla v24.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v18.4S, v28.s[0] +nop +ldr q17, [x0, #736] +mla v22.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v17.4S, v28.s[0] +nop +ldr q16, [x0, #416] +ldr q3, [x0, #480] +mul v20.4S, v20.4S,v29.s[0] +sub v2.4s, v16.4s, v30.4s +mul v19.4S, v19.4S,v29.s[0] +add v16.4s, v16.4s, v30.4s +ldr q30, [x0, #288] +ldr q1, [x0, #352] +mla v20.4S, v27.4S, v31.s[0] +sub v27.4s, v3.4s, v26.4s +mla v19.4S, v25.4S, v31.s[0] +add v3.4s, v3.4s, v26.4s +ldr q26, [x0, #32] +ldr q25, [x0, #96] +mul v18.4S, v18.4S,v29.s[0] +sub v0.4s, v30.4s, v24.4s +mul v17.4S, v17.4S,v29.s[0] +add v30.4s, v30.4s, v24.4s +ldr q24, [x0, #160] +ldr q15, [x0, #224] +mla v18.4S, v23.4S, v31.s[0] +sub v23.4s, v1.4s, v22.4s +mla v17.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v22.4s 
+sqrdmulh v22.4S, v16.4S, v28.s[1] +nop +mul v16.4S, v16.4S,v29.s[1] +nop +sqrdmulh v21.4S, v3.4S, v28.s[1] +sub v14.4s, v26.4s, v20.4s +mul v3.4S, v3.4S,v29.s[1] +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v30.4S, v28.s[1] +sub v13.4s, v25.4s, v19.4s +mul v30.4S, v30.4S,v29.s[1] +add v25.4s, v25.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v28.s[1] +sub v12.4s, v24.4s, v18.4s +mul v1.4S, v1.4S,v29.s[1] +add v24.4s, v24.4s, v18.4s +mla v16.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v17.4s +sqrdmulh v18.4S, v2.4S, v28.s[2] +add v15.4s, v15.4s, v17.4s +mla v3.4S, v21.4S, v31.s[0] +nop +sqrdmulh v21.4S, v27.4S, v28.s[2] +nop +mla v30.4S, v20.4S, v31.s[0] +nop +sqrdmulh v20.4S, v0.4S, v28.s[2] +nop +mla v1.4S, v19.4S, v31.s[0] +nop +sqrdmulh v19.4S, v23.4S, v28.s[2] +nop +ldr q17, [x17, #+32] +ldr q11, [x17, #+48] +mul v2.4S, v2.4S,v29.s[2] +sub v10.4s, v24.4s, v16.4s +mul v27.4S, v27.4S,v29.s[2] +add v24.4s, v24.4s, v16.4s +mla v2.4S, v18.4S, v31.s[0] +sub v18.4s, v15.4s, v3.4s +mla v27.4S, v21.4S, v31.s[0] +add v15.4s, v15.4s, v3.4s +mul v0.4S, v0.4S,v29.s[2] +sub v3.4s, v26.4s, v30.4s +mul v23.4S, v23.4S,v29.s[2] +add v26.4s, v26.4s, v30.4s +mla v0.4S, v20.4S, v31.s[0] +sub v20.4s, v25.4s, v1.4s +mla v23.4S, v19.4S, v31.s[0] +add v25.4s, v25.4s, v1.4s +sqrdmulh v1.4S, v10.4S, v11.s[1] +nop +mul v10.4S, v10.4S,v17.s[1] +nop +sqrdmulh v19.4S, v18.4S, v11.s[1] +sub v30.4s, v12.4s, v2.4s +mul v18.4S, v18.4S,v17.s[1] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v24.4S, v11.s[0] +sub v21.4s, v22.4s, v27.4s +mul v24.4S, v24.4S,v17.s[0] +add v22.4s, v22.4s, v27.4s +sqrdmulh v27.4S, v15.4S, v11.s[0] +sub v16.4s, v14.4s, v0.4s +mul v15.4S, v15.4S,v17.s[0] +add v14.4s, v14.4s, v0.4s +ldr q0, [x17, #+64] +ldr q9, [x17, #+80] +mla v10.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v23.4s +sqrdmulh v8.4S, v12.4S, v11.s[2] +add v13.4s, v13.4s, v23.4s +mla v18.4S, v19.4S, v31.s[0] +nop +sqrdmulh v19.4S, v22.4S, v11.s[2] +nop +mla v24.4S, v2.4S, v31.s[0] +nop +sqrdmulh v2.4S, v30.4S, v11.s[3] +nop 
+mla v15.4S, v27.4S, v31.s[0] +nop +sqrdmulh v27.4S, v21.4S, v11.s[3] +nop +ldr q23, [x17, #+96] +ldr q7, [x17, #+112] +mul v12.4S, v12.4S,v17.s[2] +sub v6.4s, v3.4s, v10.4s +mul v22.4S, v22.4S,v17.s[2] +add v3.4s, v3.4s, v10.4s +mla v12.4S, v8.4S, v31.s[0] +sub v8.4s, v20.4s, v18.4s +mla v22.4S, v19.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +mul v30.4S, v30.4S,v17.s[3] +sub v18.4s, v26.4s, v24.4s +mul v21.4S, v21.4S,v17.s[3] +add v26.4s, v26.4s, v24.4s +mla v30.4S, v2.4S, v31.s[0] +sub v2.4s, v25.4s, v15.4s +mla v21.4S, v27.4S, v31.s[0] +add v25.4s, v25.4s, v15.4s +sqrdmulh v15.4S, v8.4S, v9.s[3] +nop +mul v8.4S, v8.4S,v0.s[3] +nop +sqrdmulh v27.4S, v20.4S, v9.s[2] +sub v24.4s, v14.4s, v12.4s +mul v20.4S, v20.4S,v0.s[2] +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v9.s[1] +sub v19.4s, v13.4s, v22.4s +mul v2.4S, v2.4S,v0.s[1] +add v13.4s, v13.4s, v22.4s +sqrdmulh v22.4S, v25.4S, v9.s[0] +sub v10.4s, v16.4s, v30.4s +mul v25.4S, v25.4S,v0.s[0] +add v16.4s, v16.4s, v30.4s +mla v8.4S, v15.4S, v31.s[0] +sub v15.4s, v1.4s, v21.4s +sqrdmulh v30.4S, v13.4S, v7.s[0] +add v1.4s, v1.4s, v21.4s +mla v20.4S, v27.4S, v31.s[0] +sub v27.4s, v6.4s, v8.4s +sqrdmulh v21.4S, v19.4S, v7.s[1] +add v6.4s, v6.4s, v8.4s +mla v2.4S, v12.4S, v31.s[0] +sub v12.4s, v3.4s, v20.4s +sqrdmulh v8.4S, v1.4S, v7.s[2] +add v3.4s, v3.4s, v20.4s +mla v25.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v2.4s +sqrdmulh v20.4S, v15.4S, v7.s[3] +add v18.4s, v18.4s, v2.4s +mul v13.4S, v13.4S,v23.s[0] +sub v2.4s, v26.4s, v25.4s +mul v19.4S, v19.4S,v23.s[1] +add v26.4s, v26.4s, v25.4s +mla v13.4S, v30.4S, v31.s[0] +str q12, [x0, #352] +mla v19.4S, v21.4S, v31.s[0] +str q3, [x0, #288] +mul v1.4S, v1.4S,v23.s[2] +str q27, [x0, #480] +mul v15.4S, v15.4S,v23.s[3] +str q6, [x0, #416] +mla v1.4S, v8.4S, v31.s[0] +str q22, [x0, #224] +mla v15.4S, v20.4S, v31.s[0] +str q18, [x0, #160] +ldr q18, [x0, #944] +sqrdmulh v20.4S, v18.4S, v28.s[0] +str q2, [x0, #96] +mul v18.4S, v18.4S,v29.s[0] +str q26, [x0, #32] +ldr q26, 
[x0, #1008] +sqrdmulh v2.4S, v26.4S, v28.s[0] +sub v22.4s, v14.4s, v13.4s +str q22, [x0, #608] +mul v26.4S, v26.4S,v29.s[0] +add v14.4s, v14.4s, v13.4s +ldr q13, [x0, #816] +sqrdmulh v22.4S, v13.4S, v28.s[0] +sub v8.4s, v24.4s, v19.4s +str q14, [x0, #544] +mul v13.4S, v13.4S,v29.s[0] +add v24.4s, v24.4s, v19.4s +ldr q19, [x0, #880] +sqrdmulh v14.4S, v19.4S, v28.s[0] +sub v6.4s, v16.4s, v1.4s +str q8, [x0, #736] +mul v19.4S, v19.4S,v29.s[0] +add v16.4s, v16.4s, v1.4s +ldr q1, [x0, #560] +mla v18.4S, v20.4S, v31.s[0] +sub v20.4s, v10.4s, v15.4s +str q24, [x0, #672] +sqrdmulh v24.4S, v1.4S, v28.s[0] +add v10.4s, v10.4s, v15.4s +ldr q15, [x0, #624] +mla v26.4S, v2.4S, v31.s[0] +str q6, [x0, #864] +sqrdmulh v6.4S, v15.4S, v28.s[0] +nop +ldr q2, [x0, #688] +mla v13.4S, v22.4S, v31.s[0] +str q16, [x0, #800] +sqrdmulh v16.4S, v2.4S, v28.s[0] +nop +ldr q22, [x0, #752] +mla v19.4S, v14.4S, v31.s[0] +str q20, [x0, #992] +sqrdmulh v20.4S, v22.4S, v28.s[0] +nop +ldr q14, [x0, #432] +ldr q8, [x0, #496] +mul v1.4S, v1.4S,v29.s[0] +sub v27.4s, v14.4s, v18.4s +str q10, [x0, #928] +mul v15.4S, v15.4S,v29.s[0] +add v14.4s, v14.4s, v18.4s +ldr q18, [x0, #304] +ldr q10, [x0, #368] +mla v1.4S, v24.4S, v31.s[0] +sub v24.4s, v8.4s, v26.4s +mla v15.4S, v6.4S, v31.s[0] +add v8.4s, v8.4s, v26.4s +ldr q26, [x0, #48] +ldr q6, [x0, #112] +mul v2.4S, v2.4S,v29.s[0] +sub v3.4s, v18.4s, v13.4s +mul v22.4S, v22.4S,v29.s[0] +add v18.4s, v18.4s, v13.4s +ldr q13, [x0, #176] +ldr q21, [x0, #240] +mla v2.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v19.4s +mla v22.4S, v20.4S, v31.s[0] +add v10.4s, v10.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v28.s[1] +nop +mul v14.4S, v14.4S,v29.s[1] +nop +sqrdmulh v20.4S, v8.4S, v28.s[1] +sub v12.4s, v26.4s, v1.4s +mul v8.4S, v8.4S,v29.s[1] +add v26.4s, v26.4s, v1.4s +sqrdmulh v1.4S, v18.4S, v28.s[1] +sub v30.4s, v6.4s, v15.4s +mul v18.4S, v18.4S,v29.s[1] +add v6.4s, v6.4s, v15.4s +sqrdmulh v15.4S, v10.4S, v28.s[1] +sub v25.4s, v13.4s, v2.4s +mul v10.4S, v10.4S,v29.s[1] +add 
v13.4s, v13.4s, v2.4s +mla v14.4S, v19.4S, v31.s[0] +sub v19.4s, v21.4s, v22.4s +sqrdmulh v2.4S, v27.4S, v28.s[2] +add v21.4s, v21.4s, v22.4s +mla v8.4S, v20.4S, v31.s[0] +nop +sqrdmulh v20.4S, v24.4S, v28.s[2] +nop +mla v18.4S, v1.4S, v31.s[0] +nop +sqrdmulh v1.4S, v3.4S, v28.s[2] +nop +mla v10.4S, v15.4S, v31.s[0] +nop +sqrdmulh v15.4S, v16.4S, v28.s[2] +nop +mul v27.4S, v27.4S,v29.s[2] +sub v22.4s, v13.4s, v14.4s +mul v24.4S, v24.4S,v29.s[2] +add v13.4s, v13.4s, v14.4s +mla v27.4S, v2.4S, v31.s[0] +sub v2.4s, v21.4s, v8.4s +mla v24.4S, v20.4S, v31.s[0] +add v21.4s, v21.4s, v8.4s +mul v3.4S, v3.4S,v29.s[2] +sub v8.4s, v26.4s, v18.4s +mul v16.4S, v16.4S,v29.s[2] +add v26.4s, v26.4s, v18.4s +mla v3.4S, v1.4S, v31.s[0] +sub v1.4s, v6.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add v6.4s, v6.4s, v10.4s +sqrdmulh v10.4S, v22.4S, v11.s[1] +nop +mul v22.4S, v22.4S,v17.s[1] +nop +sqrdmulh v15.4S, v2.4S, v11.s[1] +sub v18.4s, v25.4s, v27.4s +mul v2.4S, v2.4S,v17.s[1] +add v25.4s, v25.4s, v27.4s +sqrdmulh v27.4S, v13.4S, v11.s[0] +sub v20.4s, v19.4s, v24.4s +mul v13.4S, v13.4S,v17.s[0] +add v19.4s, v19.4s, v24.4s +sqrdmulh v24.4S, v21.4S, v11.s[0] +sub v14.4s, v12.4s, v3.4s +mul v21.4S, v21.4S,v17.s[0] +add v12.4s, v12.4s, v3.4s +mla v22.4S, v10.4S, v31.s[0] +sub v10.4s, v30.4s, v16.4s +sqrdmulh v3.4S, v25.4S, v11.s[2] +add v30.4s, v30.4s, v16.4s +mla v2.4S, v15.4S, v31.s[0] +nop +sqrdmulh v15.4S, v19.4S, v11.s[2] +nop +mla v13.4S, v27.4S, v31.s[0] +nop +sqrdmulh v27.4S, v18.4S, v11.s[3] +nop +mla v21.4S, v24.4S, v31.s[0] +nop +sqrdmulh v24.4S, v20.4S, v11.s[3] +nop +mul v25.4S, v25.4S,v17.s[2] +sub v16.4s, v8.4s, v22.4s +mul v19.4S, v19.4S,v17.s[2] +add v8.4s, v8.4s, v22.4s +mla v25.4S, v3.4S, v31.s[0] +sub v3.4s, v1.4s, v2.4s +mla v19.4S, v15.4S, v31.s[0] +add v1.4s, v1.4s, v2.4s +mul v18.4S, v18.4S,v17.s[3] +sub v2.4s, v26.4s, v13.4s +mul v20.4S, v20.4S,v17.s[3] +add v26.4s, v26.4s, v13.4s +mla v18.4S, v27.4S, v31.s[0] +sub v27.4s, v6.4s, v21.4s +mla v20.4S, v24.4S, 
v31.s[0] +add v6.4s, v6.4s, v21.4s +sqrdmulh v21.4S, v3.4S, v9.s[3] +nop +mul v3.4S, v3.4S,v0.s[3] +nop +sqrdmulh v24.4S, v1.4S, v9.s[2] +sub v13.4s, v12.4s, v25.4s +mul v1.4S, v1.4S,v0.s[2] +add v12.4s, v12.4s, v25.4s +sqrdmulh v25.4S, v27.4S, v9.s[1] +sub v15.4s, v30.4s, v19.4s +mul v27.4S, v27.4S,v0.s[1] +add v30.4s, v30.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v9.s[0] +sub v22.4s, v14.4s, v18.4s +mul v6.4S, v6.4S,v0.s[0] +add v14.4s, v14.4s, v18.4s +mla v3.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v20.4s +sqrdmulh v18.4S, v30.4S, v7.s[0] +add v10.4s, v10.4s, v20.4s +mla v1.4S, v24.4S, v31.s[0] +sub v24.4s, v16.4s, v3.4s +sqrdmulh v20.4S, v15.4S, v7.s[1] +add v16.4s, v16.4s, v3.4s +mla v27.4S, v25.4S, v31.s[0] +sub v25.4s, v8.4s, v1.4s +sqrdmulh v3.4S, v10.4S, v7.s[2] +add v8.4s, v8.4s, v1.4s +mla v6.4S, v19.4S, v31.s[0] +sub v19.4s, v2.4s, v27.4s +sqrdmulh v1.4S, v21.4S, v7.s[3] +add v2.4s, v2.4s, v27.4s +mul v30.4S, v30.4S,v23.s[0] +sub v27.4s, v26.4s, v6.4s +mul v15.4S, v15.4S,v23.s[1] +add v26.4s, v26.4s, v6.4s +mla v30.4S, v18.4S, v31.s[0] +str q25, [x0, #368] +mla v15.4S, v20.4S, v31.s[0] +str q8, [x0, #304] +mul v10.4S, v10.4S,v23.s[2] +str q24, [x0, #496] +mul v21.4S, v21.4S,v23.s[3] +str q16, [x0, #432] +mla v10.4S, v3.4S, v31.s[0] +str q19, [x0, #240] +mla v21.4S, v1.4S, v31.s[0] +str q2, [x0, #176] +ldr q2, [x0, #896] +sqrdmulh v1.4S, v2.4S, v28.s[0] +str q27, [x0, #112] +mul v2.4S, v2.4S,v29.s[0] +str q26, [x0, #48] +ldr q26, [x0, #960] +sqrdmulh v27.4S, v26.4S, v28.s[0] +sub v19.4s, v12.4s, v30.4s +str q19, [x0, #624] +mul v26.4S, v26.4S,v29.s[0] +add v12.4s, v12.4s, v30.4s +ldr q30, [x0, #768] +sqrdmulh v19.4S, v30.4S, v28.s[0] +sub v3.4s, v13.4s, v15.4s +str q12, [x0, #560] +mul v30.4S, v30.4S,v29.s[0] +add v13.4s, v13.4s, v15.4s +ldr q15, [x0, #832] +sqrdmulh v12.4S, v15.4S, v28.s[0] +sub v16.4s, v14.4s, v10.4s +str q3, [x0, #752] +mul v15.4S, v15.4S,v29.s[0] +add v14.4s, v14.4s, v10.4s +ldr q10, [x0, #512] +mla v2.4S, v1.4S, v31.s[0] +sub v1.4s, 
v22.4s, v21.4s +str q13, [x0, #688] +sqrdmulh v13.4S, v10.4S, v28.s[0] +add v22.4s, v22.4s, v21.4s +ldr q21, [x0, #576] +mla v26.4S, v27.4S, v31.s[0] +str q16, [x0, #880] +sqrdmulh v16.4S, v21.4S, v28.s[0] +nop +ldr q27, [x0, #640] +mla v30.4S, v19.4S, v31.s[0] +str q14, [x0, #816] +sqrdmulh v14.4S, v27.4S, v28.s[0] +nop +ldr q19, [x0, #704] +mla v15.4S, v12.4S, v31.s[0] +str q1, [x0, #1008] +sqrdmulh v1.4S, v19.4S, v28.s[0] +nop +ldr q12, [x0, #384] +ldr q3, [x0, #448] +mul v10.4S, v10.4S,v29.s[0] +sub v24.4s, v12.4s, v2.4s +str q22, [x0, #944] +mul v21.4S, v21.4S,v29.s[0] +add v12.4s, v12.4s, v2.4s +ldr q2, [x0, #256] +ldr q22, [x0, #320] +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v3.4s, v26.4s +mla v21.4S, v16.4S, v31.s[0] +add v3.4s, v3.4s, v26.4s +ldr q26, [x0, #0] +ldr q16, [x0, #64] +mul v27.4S, v27.4S,v29.s[0] +sub v8.4s, v2.4s, v30.4s +mul v19.4S, v19.4S,v29.s[0] +add v2.4s, v2.4s, v30.4s +ldr q30, [x0, #128] +ldr q20, [x0, #192] +mla v27.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v15.4s +mla v19.4S, v1.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v12.4S, v28.s[1] +nop +mul v12.4S, v12.4S,v29.s[1] +nop +sqrdmulh v1.4S, v3.4S, v28.s[1] +sub v25.4s, v26.4s, v10.4s +mul v3.4S, v3.4S,v29.s[1] +add v26.4s, v26.4s, v10.4s +sqrdmulh v10.4S, v2.4S, v28.s[1] +sub v18.4s, v16.4s, v21.4s +mul v2.4S, v2.4S,v29.s[1] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v28.s[1] +sub v6.4s, v30.4s, v27.4s +mul v22.4S, v22.4S,v29.s[1] +add v30.4s, v30.4s, v27.4s +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v19.4s +sqrdmulh v27.4S, v24.4S, v28.s[2] +add v20.4s, v20.4s, v19.4s +mla v3.4S, v1.4S, v31.s[0] +nop +sqrdmulh v1.4S, v13.4S, v28.s[2] +nop +mla v2.4S, v10.4S, v31.s[0] +nop +sqrdmulh v10.4S, v8.4S, v28.s[2] +nop +mla v22.4S, v21.4S, v31.s[0] +nop +sqrdmulh v21.4S, v14.4S, v28.s[2] +nop +mul v24.4S, v24.4S,v29.s[2] +sub v19.4s, v30.4s, v12.4s +mul v13.4S, v13.4S,v29.s[2] +add v30.4s, v30.4s, v12.4s +mla v24.4S, v27.4S, v31.s[0] +sub v27.4s, v20.4s, 
v3.4s +mla v13.4S, v1.4S, v31.s[0] +add v20.4s, v20.4s, v3.4s +mul v8.4S, v8.4S,v29.s[2] +sub v3.4s, v26.4s, v2.4s +mul v14.4S, v14.4S,v29.s[2] +add v26.4s, v26.4s, v2.4s +mla v8.4S, v10.4S, v31.s[0] +sub v10.4s, v16.4s, v22.4s +mla v14.4S, v21.4S, v31.s[0] +add v16.4s, v16.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v11.s[1] +nop +mul v19.4S, v19.4S,v17.s[1] +nop +sqrdmulh v21.4S, v27.4S, v11.s[1] +sub v2.4s, v6.4s, v24.4s +mul v27.4S, v27.4S,v17.s[1] +add v6.4s, v6.4s, v24.4s +sqrdmulh v24.4S, v30.4S, v11.s[0] +sub v1.4s, v15.4s, v13.4s +mul v30.4S, v30.4S,v17.s[0] +add v15.4s, v15.4s, v13.4s +sqrdmulh v13.4S, v20.4S, v11.s[0] +sub v12.4s, v25.4s, v8.4s +mul v20.4S, v20.4S,v17.s[0] +add v25.4s, v25.4s, v8.4s +mla v19.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v14.4s +sqrdmulh v8.4S, v6.4S, v11.s[2] +add v18.4s, v18.4s, v14.4s +mla v27.4S, v21.4S, v31.s[0] +nop +sqrdmulh v21.4S, v15.4S, v11.s[2] +nop +mla v30.4S, v24.4S, v31.s[0] +nop +sqrdmulh v24.4S, v2.4S, v11.s[3] +nop +mla v20.4S, v13.4S, v31.s[0] +nop +sqrdmulh v13.4S, v1.4S, v11.s[3] +nop +mul v6.4S, v6.4S,v17.s[2] +sub v14.4s, v3.4s, v19.4s +mul v15.4S, v15.4S,v17.s[2] +add v3.4s, v3.4s, v19.4s +mla v6.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v27.4s +mla v15.4S, v21.4S, v31.s[0] +add v10.4s, v10.4s, v27.4s +mul v2.4S, v2.4S,v17.s[3] +sub v27.4s, v26.4s, v30.4s +mul v1.4S, v1.4S,v17.s[3] +add v26.4s, v26.4s, v30.4s +mla v2.4S, v24.4S, v31.s[0] +sub v24.4s, v16.4s, v20.4s +mla v1.4S, v13.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v9.s[3] +nop +mul v8.4S, v8.4S,v0.s[3] +nop +sqrdmulh v13.4S, v10.4S, v9.s[2] +sub v30.4s, v25.4s, v6.4s +mul v10.4S, v10.4S,v0.s[2] +add v25.4s, v25.4s, v6.4s +sqrdmulh v6.4S, v24.4S, v9.s[1] +sub v21.4s, v18.4s, v15.4s +mul v24.4S, v24.4S,v0.s[1] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v9.s[0] +sub v19.4s, v12.4s, v2.4s +mul v16.4S, v16.4S,v0.s[0] +add v12.4s, v12.4s, v2.4s +mla v8.4S, v20.4S, v31.s[0] +sub v20.4s, v22.4s, v1.4s +sqrdmulh v2.4S, v18.4S, 
v7.s[0] +add v22.4s, v22.4s, v1.4s +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v14.4s, v8.4s +sqrdmulh v1.4S, v21.4S, v7.s[1] +add v14.4s, v14.4s, v8.4s +mla v24.4S, v6.4S, v31.s[0] +sub v6.4s, v3.4s, v10.4s +sqrdmulh v8.4S, v22.4S, v7.s[2] +add v3.4s, v3.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +sub v15.4s, v27.4s, v24.4s +sqrdmulh v10.4S, v20.4S, v7.s[3] +add v27.4s, v27.4s, v24.4s +mul v18.4S, v18.4S,v23.s[0] +sub v24.4s, v26.4s, v16.4s +mul v21.4S, v21.4S,v23.s[1] +add v26.4s, v26.4s, v16.4s +mla v18.4S, v2.4S, v31.s[0] +str q6, [x0, #320] +mla v21.4S, v1.4S, v31.s[0] +str q3, [x0, #256] +mul v22.4S, v22.4S,v23.s[2] +str q13, [x0, #448] +mul v20.4S, v20.4S,v23.s[3] +str q14, [x0, #384] +mla v22.4S, v8.4S, v31.s[0] +str q15, [x0, #192] +mla v20.4S, v10.4S, v31.s[0] +str q27, [x0, #128] +ldr q27, [x0, #912] +sqrdmulh v10.4S, v27.4S, v28.s[0] +str q24, [x0, #64] +mul v27.4S, v27.4S,v29.s[0] +str q26, [x0, #0] +ldr q26, [x0, #976] +sqrdmulh v24.4S, v26.4S, v28.s[0] +sub v15.4s, v25.4s, v18.4s +str q15, [x0, #576] +mul v26.4S, v26.4S,v29.s[0] +add v25.4s, v25.4s, v18.4s +ldr q18, [x0, #784] +sqrdmulh v15.4S, v18.4S, v28.s[0] +sub v8.4s, v30.4s, v21.4s +str q25, [x0, #512] +mul v18.4S, v18.4S,v29.s[0] +add v30.4s, v30.4s, v21.4s +ldr q21, [x0, #848] +sqrdmulh v25.4S, v21.4S, v28.s[0] +sub v14.4s, v12.4s, v22.4s +str q8, [x0, #704] +mul v21.4S, v21.4S,v29.s[0] +add v12.4s, v12.4s, v22.4s +ldr q22, [x0, #528] +mla v27.4S, v10.4S, v31.s[0] +sub v10.4s, v19.4s, v20.4s +str q30, [x0, #640] +sqrdmulh v30.4S, v22.4S, v28.s[0] +add v19.4s, v19.4s, v20.4s +ldr q20, [x0, #592] +mla v26.4S, v24.4S, v31.s[0] +str q14, [x0, #832] +sqrdmulh v14.4S, v20.4S, v28.s[0] +nop +ldr q24, [x0, #656] +mla v18.4S, v15.4S, v31.s[0] +str q12, [x0, #768] +sqrdmulh v12.4S, v24.4S, v28.s[0] +nop +ldr q15, [x0, #720] +mla v21.4S, v25.4S, v31.s[0] +str q10, [x0, #960] +sqrdmulh v10.4S, v15.4S, v28.s[0] +nop +ldr q25, [x0, #400] +ldr q8, [x0, #464] +mul v22.4S, v22.4S,v29.s[0] +sub v13.4s, v25.4s, 
v27.4s +str q19, [x0, #896] +mul v20.4S, v20.4S,v29.s[0] +add v25.4s, v25.4s, v27.4s +ldr q27, [x0, #272] +ldr q19, [x0, #336] +mla v22.4S, v30.4S, v31.s[0] +sub v30.4s, v8.4s, v26.4s +mla v20.4S, v14.4S, v31.s[0] +add v8.4s, v8.4s, v26.4s +ldr q26, [x0, #16] +ldr q14, [x0, #80] +mul v24.4S, v24.4S,v29.s[0] +sub v3.4s, v27.4s, v18.4s +mul v15.4S, v15.4S,v29.s[0] +add v27.4s, v27.4s, v18.4s +ldr q18, [x0, #144] +ldr q1, [x0, #208] +mla v24.4S, v12.4S, v31.s[0] +sub v12.4s, v19.4s, v21.4s +mla v15.4S, v10.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +sqrdmulh v21.4S, v25.4S, v28.s[1] +nop +mul v25.4S, v25.4S,v29.s[1] +nop +sqrdmulh v10.4S, v8.4S, v28.s[1] +sub v6.4s, v26.4s, v22.4s +mul v8.4S, v8.4S,v29.s[1] +add v26.4s, v26.4s, v22.4s +sqrdmulh v22.4S, v27.4S, v28.s[1] +sub v2.4s, v14.4s, v20.4s +mul v27.4S, v27.4S,v29.s[1] +add v14.4s, v14.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v28.s[1] +sub v16.4s, v18.4s, v24.4s +mul v19.4S, v19.4S,v29.s[1] +add v18.4s, v18.4s, v24.4s +mla v25.4S, v21.4S, v31.s[0] +sub v21.4s, v1.4s, v15.4s +sqrdmulh v24.4S, v13.4S, v28.s[2] +add v1.4s, v1.4s, v15.4s +mla v8.4S, v10.4S, v31.s[0] +nop +sqrdmulh v10.4S, v30.4S, v28.s[2] +nop +mla v27.4S, v22.4S, v31.s[0] +nop +sqrdmulh v22.4S, v3.4S, v28.s[2] +nop +mla v19.4S, v20.4S, v31.s[0] +nop +sqrdmulh v20.4S, v12.4S, v28.s[2] +nop +mul v13.4S, v13.4S,v29.s[2] +sub v15.4s, v18.4s, v25.4s +mul v30.4S, v30.4S,v29.s[2] +add v18.4s, v18.4s, v25.4s +mla v13.4S, v24.4S, v31.s[0] +sub v24.4s, v1.4s, v8.4s +mla v30.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +mul v3.4S, v3.4S,v29.s[2] +sub v8.4s, v26.4s, v27.4s +mul v12.4S, v12.4S,v29.s[2] +add v26.4s, v26.4s, v27.4s +mla v3.4S, v22.4S, v31.s[0] +sub v22.4s, v14.4s, v19.4s +mla v12.4S, v20.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v28.4S, v15.4S, v11.s[1] +nop +mul v15.4S, v15.4S,v17.s[1] +nop +sqrdmulh v29.4S, v24.4S, v11.s[1] +sub v19.4s, v16.4s, v13.4s +mul v24.4S, v24.4S,v17.s[1] +add v16.4s, v16.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v11.s[0] 
+sub v20.4s, v21.4s, v30.4s +mul v18.4S, v18.4S,v17.s[0] +add v21.4s, v21.4s, v30.4s +sqrdmulh v30.4S, v1.4S, v11.s[0] +sub v27.4s, v6.4s, v3.4s +mul v1.4S, v1.4S,v17.s[0] +add v6.4s, v6.4s, v3.4s +mla v15.4S, v28.4S, v31.s[0] +sub v28.4s, v2.4s, v12.4s +sqrdmulh v3.4S, v16.4S, v11.s[2] +add v2.4s, v2.4s, v12.4s +mla v24.4S, v29.4S, v31.s[0] +nop +sqrdmulh v29.4S, v21.4S, v11.s[2] +nop +mla v18.4S, v13.4S, v31.s[0] +nop +sqrdmulh v13.4S, v19.4S, v11.s[3] +nop +mla v1.4S, v30.4S, v31.s[0] +nop +sqrdmulh v30.4S, v20.4S, v11.s[3] +nop +mul v16.4S, v16.4S,v17.s[2] +sub v12.4s, v8.4s, v15.4s +mul v21.4S, v21.4S,v17.s[2] +add v8.4s, v8.4s, v15.4s +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v22.4s, v24.4s +mla v21.4S, v29.4S, v31.s[0] +add v22.4s, v22.4s, v24.4s +mul v19.4S, v19.4S,v17.s[3] +sub v24.4s, v26.4s, v18.4s +mul v20.4S, v20.4S,v17.s[3] +add v26.4s, v26.4s, v18.4s +mla v19.4S, v13.4S, v31.s[0] +sub v13.4s, v14.4s, v1.4s +mla v20.4S, v30.4S, v31.s[0] +add v14.4s, v14.4s, v1.4s +sqrdmulh v11.4S, v3.4S, v9.s[3] +nop +mul v3.4S, v3.4S,v0.s[3] +nop +sqrdmulh v17.4S, v22.4S, v9.s[2] +sub v1.4s, v6.4s, v16.4s +mul v22.4S, v22.4S,v0.s[2] +add v6.4s, v6.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v9.s[1] +sub v30.4s, v2.4s, v21.4s +mul v13.4S, v13.4S,v0.s[1] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v9.s[0] +sub v18.4s, v27.4s, v19.4s +mul v14.4S, v14.4S,v0.s[0] +add v27.4s, v27.4s, v19.4s +mla v3.4S, v11.4S, v31.s[0] +sub v11.4s, v28.4s, v20.4s +sqrdmulh v9.4S, v2.4S, v7.s[0] +add v28.4s, v28.4s, v20.4s +mla v22.4S, v17.4S, v31.s[0] +sub v17.4s, v12.4s, v3.4s +sqrdmulh v20.4S, v30.4S, v7.s[1] +add v12.4s, v12.4s, v3.4s +mla v13.4S, v16.4S, v31.s[0] +sub v16.4s, v8.4s, v22.4s +sqrdmulh v3.4S, v28.4S, v7.s[2] +add v8.4s, v8.4s, v22.4s +mla v14.4S, v21.4S, v31.s[0] +sub v21.4s, v24.4s, v13.4s +sqrdmulh v22.4S, v11.4S, v7.s[3] +add v24.4s, v24.4s, v13.4s +mul v2.4S, v2.4S,v23.s[0] +sub v13.4s, v26.4s, v14.4s +mul v30.4S, v30.4S,v23.s[1] +add v26.4s, v26.4s, v14.4s +mla 
v2.4S, v9.4S, v31.s[0] +str q16, [x0, #336] +mla v30.4S, v20.4S, v31.s[0] +str q8, [x0, #272] +mul v28.4S, v28.4S,v23.s[2] +str q17, [x0, #464] +mul v11.4S, v11.4S,v23.s[3] +str q12, [x0, #400] +mla v28.4S, v3.4S, v31.s[0] +str q21, [x0, #208] +mla v11.4S, v22.4S, v31.s[0] +str q24, [x0, #144] +str q13, [x0, #80] +str q26, [x0, #16] +sub v26.4s, v6.4s, v2.4s +str q26, [x0, #592] +add v6.4s, v6.4s, v2.4s +sub v2.4s, v1.4s, v30.4s +str q6, [x0, #528] +add v1.4s, v1.4s, v30.4s +sub v30.4s, v27.4s, v28.4s +str q2, [x0, #720] +add v27.4s, v27.4s, v28.4s +sub v28.4s, v18.4s, v11.4s +str q1, [x0, #656] +add v18.4s, v18.4s, v11.4s +str q30, [x0, #848] +str q27, [x0, #784] +str q28, [x0, #976] +str q18, [x0, #912] +ldr q4, [x0, #224] +ldr q5, [x0, #160] +ldr q25, [x0, #32] +ldr q10, [x17, #+128] +ldr q15, [x17, #+144] +sqrdmulh v29.4S, v25.4S, v15.s[0] +mul v25.4S, v25.4S,v10.s[0] +ldr q19, [x0, #48] +sqrdmulh v0.4S, v19.4S, v15.s[0] +mul v19.4S, v19.4S,v10.s[0] +ldr q14, [x17, #+160] +ldr q9, [x17, #+176] +ldr q16, [x0, #96] +sqrdmulh v20.4S, v16.4S, v9.s[0] +mul v16.4S, v16.4S,v14.s[0] +ldr q8, [x0, #112] +sqrdmulh v17.4S, v8.4S, v9.s[0] +mul v8.4S, v8.4S,v14.s[0] +ldr q12, [x17, #+192] +ldr q3, [x17, #+208] +mla v25.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v5.4S, v3.s[0] +ldr q21, [x0, #176] +mla v19.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v21.4S, v3.s[0] +ldr q22, [x17, #+224] +ldr q24, [x17, #+240] +mla v16.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v4.4S, v24.s[0] +ldr q23, [x0, #240] +mla v8.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v23.4S, v24.s[0] +ldr q7, [x0, #0] +ldr q13, [x0, #128] +mul v5.4S, v5.4S,v12.s[0] +sub v26.4s, v7.4s, v25.4s +ldr q6, [x0, #16] +mul v21.4S, v21.4S,v12.s[0] +add v7.4s, v7.4s, v25.4s +ldr q25, [x0, #144] +mla v5.4S, v29.4S, v31.s[0] +sub v29.4s, v6.4s, v19.4s +ldr q2, [x0, #64] +mla v21.4S, v0.4S, v31.s[0] +add v6.4s, v6.4s, v19.4s +ldr q19, [x0, #192] +mul v4.4S, v4.4S,v22.s[0] +sub v0.4s, v2.4s, v16.4s +ldr q1, [x0, #80] +mul v23.4S, 
v23.4S,v22.s[0] +add v2.4s, v2.4s, v16.4s +ldr q16, [x0, #208] +mla v4.4S, v20.4S, v31.s[0] +mla v23.4S, v17.4S, v31.s[0] +sub v17.4s, v1.4s, v8.4s +sqrdmulh v20.4S, v6.4S, v15.s[1] +add v1.4s, v1.4s, v8.4s +mul v6.4S, v6.4S,v10.s[1] +sqrdmulh v8.4S, v29.4S, v15.s[2] +sub v11.4s, v13.4s, v5.4s +mul v29.4S, v29.4S,v10.s[2] +add v13.4s, v13.4s, v5.4s +sqrdmulh v15.4S, v1.4S, v9.s[1] +sub v10.4s, v25.4s, v21.4s +mul v1.4S, v1.4S,v14.s[1] +add v25.4s, v25.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v9.s[2] +sub v5.4s, v19.4s, v4.4s +mul v17.4S, v17.4S,v14.s[2] +add v19.4s, v19.4s, v4.4s +mla v6.4S, v20.4S, v31.s[0] +sub v20.4s, v16.4s, v23.4s +ldr q9, [x0, #480] +sqrdmulh v14.4S, v25.4S, v3.s[1] +add v16.4s, v16.4s, v23.4s +mla v29.4S, v8.4S, v31.s[0] +ldr q8, [x0, #416] +sqrdmulh v23.4S, v10.4S, v3.s[2] +sub v4.4s, v7.4s, v6.4s +mla v1.4S, v15.4S, v31.s[0] +ldr q15, [x0, #288] +sqrdmulh v30.4S, v16.4S, v24.s[1] +add v7.4s, v7.4s, v6.4s +str q4, [x0, #16] +mla v17.4S, v21.4S, v31.s[0] +ldr q21, [x17, #+256] +ldr q4, [x17, #+272] +sqrdmulh v6.4S, v20.4S, v24.s[2] +sub v27.4s, v26.4s, v29.4s +str q7, [x0, #0] +mul v25.4S, v25.4S,v12.s[1] +add v26.4s, v26.4s, v29.4s +mul v10.4S, v10.4S,v12.s[2] +str q27, [x0, #48] +mla v25.4S, v14.4S, v31.s[0] +sub v14.4s, v2.4s, v1.4s +mla v10.4S, v23.4S, v31.s[0] +str q26, [x0, #32] +mul v16.4S, v16.4S,v22.s[1] +str q14, [x0, #80] +mul v20.4S, v20.4S,v22.s[2] +add v2.4s, v2.4s, v1.4s +str q2, [x0, #64] +mla v16.4S, v30.4S, v31.s[0] +sub v30.4s, v0.4s, v17.4s +str q30, [x0, #112] +mla v20.4S, v6.4S, v31.s[0] +add v0.4s, v0.4s, v17.4s +str q0, [x0, #96] +sqrdmulh v24.4S, v15.4S, v4.s[0] +sub v22.4s, v13.4s, v25.4s +mul v15.4S, v15.4S,v21.s[0] +str q22, [x0, #144] +ldr q22, [x0, #304] +sqrdmulh v0.4S, v22.4S, v4.s[0] +add v13.4s, v13.4s, v25.4s +mul v22.4S, v22.4S,v21.s[0] +str q13, [x0, #128] +ldr q13, [x17, #+288] +ldr q25, [x17, #+304] +ldr q17, [x0, #352] +sqrdmulh v6.4S, v17.4S, v25.s[0] +sub v30.4s, v11.4s, v10.4s +mul v17.4S, 
v17.4S,v13.s[0] +str q30, [x0, #176] +ldr q30, [x0, #368] +sqrdmulh v2.4S, v30.4S, v25.s[0] +add v11.4s, v11.4s, v10.4s +mul v30.4S, v30.4S,v13.s[0] +str q11, [x0, #160] +ldr q11, [x17, #+320] +ldr q10, [x17, #+336] +mla v15.4S, v24.4S, v31.s[0] +sub v24.4s, v19.4s, v16.4s +sqrdmulh v1.4S, v8.4S, v10.s[0] +str q24, [x0, #208] +ldr q24, [x0, #432] +mla v22.4S, v0.4S, v31.s[0] +add v19.4s, v19.4s, v16.4s +sqrdmulh v16.4S, v24.4S, v10.s[0] +str q19, [x0, #192] +ldr q19, [x17, #+352] +ldr q0, [x17, #+368] +mla v17.4S, v6.4S, v31.s[0] +sub v6.4s, v5.4s, v20.4s +sqrdmulh v14.4S, v9.4S, v0.s[0] +str q6, [x0, #240] +ldr q6, [x0, #496] +mla v30.4S, v2.4S, v31.s[0] +add v5.4s, v5.4s, v20.4s +sqrdmulh v20.4S, v6.4S, v0.s[0] +str q5, [x0, #224] +ldr q5, [x0, #256] +ldr q2, [x0, #384] +mul v8.4S, v8.4S,v11.s[0] +sub v3.4s, v5.4s, v15.4s +ldr q12, [x0, #272] +mul v24.4S, v24.4S,v11.s[0] +add v5.4s, v5.4s, v15.4s +ldr q15, [x0, #400] +mla v8.4S, v1.4S, v31.s[0] +sub v1.4s, v12.4s, v22.4s +ldr q26, [x0, #320] +mla v24.4S, v16.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s +ldr q22, [x0, #448] +mul v9.4S, v9.4S,v19.s[0] +sub v16.4s, v26.4s, v17.4s +ldr q23, [x0, #336] +mul v6.4S, v6.4S,v19.s[0] +add v26.4s, v26.4s, v17.4s +ldr q17, [x0, #464] +mla v9.4S, v14.4S, v31.s[0] +mla v6.4S, v20.4S, v31.s[0] +sub v20.4s, v23.4s, v30.4s +sqrdmulh v14.4S, v12.4S, v4.s[1] +add v23.4s, v23.4s, v30.4s +mul v12.4S, v12.4S,v21.s[1] +sqrdmulh v30.4S, v1.4S, v4.s[2] +sub v27.4s, v2.4s, v8.4s +mul v1.4S, v1.4S,v21.s[2] +add v2.4s, v2.4s, v8.4s +sqrdmulh v4.4S, v23.4S, v25.s[1] +sub v21.4s, v15.4s, v24.4s +mul v23.4S, v23.4S,v13.s[1] +add v15.4s, v15.4s, v24.4s +sqrdmulh v24.4S, v20.4S, v25.s[2] +sub v8.4s, v22.4s, v9.4s +mul v20.4S, v20.4S,v13.s[2] +add v22.4s, v22.4s, v9.4s +mla v12.4S, v14.4S, v31.s[0] +sub v14.4s, v17.4s, v6.4s +ldr q25, [x0, #736] +sqrdmulh v13.4S, v15.4S, v10.s[1] +add v17.4s, v17.4s, v6.4s +mla v1.4S, v30.4S, v31.s[0] +ldr q30, [x0, #672] +sqrdmulh v6.4S, v21.4S, v10.s[2] +sub v9.4s, 
v5.4s, v12.4s +mla v23.4S, v4.4S, v31.s[0] +ldr q4, [x0, #544] +sqrdmulh v29.4S, v17.4S, v0.s[1] +add v5.4s, v5.4s, v12.4s +str q9, [x0, #272] +mla v20.4S, v24.4S, v31.s[0] +ldr q24, [x17, #+384] +ldr q9, [x17, #+400] +sqrdmulh v12.4S, v14.4S, v0.s[2] +sub v7.4s, v3.4s, v1.4s +str q5, [x0, #256] +mul v15.4S, v15.4S,v11.s[1] +add v3.4s, v3.4s, v1.4s +mul v21.4S, v21.4S,v11.s[2] +str q7, [x0, #304] +mla v15.4S, v13.4S, v31.s[0] +sub v13.4s, v26.4s, v23.4s +mla v21.4S, v6.4S, v31.s[0] +str q3, [x0, #288] +mul v17.4S, v17.4S,v19.s[1] +str q13, [x0, #336] +mul v14.4S, v14.4S,v19.s[2] +add v26.4s, v26.4s, v23.4s +str q26, [x0, #320] +mla v17.4S, v29.4S, v31.s[0] +sub v29.4s, v16.4s, v20.4s +str q29, [x0, #368] +mla v14.4S, v12.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +str q16, [x0, #352] +sqrdmulh v0.4S, v4.4S, v9.s[0] +sub v19.4s, v2.4s, v15.4s +mul v4.4S, v4.4S,v24.s[0] +str q19, [x0, #400] +ldr q19, [x0, #560] +sqrdmulh v16.4S, v19.4S, v9.s[0] +add v2.4s, v2.4s, v15.4s +mul v19.4S, v19.4S,v24.s[0] +str q2, [x0, #384] +ldr q2, [x17, #+416] +ldr q15, [x17, #+432] +ldr q20, [x0, #608] +sqrdmulh v12.4S, v20.4S, v15.s[0] +sub v29.4s, v27.4s, v21.4s +mul v20.4S, v20.4S,v2.s[0] +str q29, [x0, #432] +ldr q29, [x0, #624] +sqrdmulh v26.4S, v29.4S, v15.s[0] +add v27.4s, v27.4s, v21.4s +mul v29.4S, v29.4S,v2.s[0] +str q27, [x0, #416] +ldr q27, [x17, #+448] +ldr q21, [x17, #+464] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v17.4s +sqrdmulh v23.4S, v30.4S, v21.s[0] +str q0, [x0, #464] +ldr q0, [x0, #688] +mla v19.4S, v16.4S, v31.s[0] +add v22.4s, v22.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v21.s[0] +str q22, [x0, #448] +ldr q22, [x17, #+480] +ldr q16, [x17, #+496] +mla v20.4S, v12.4S, v31.s[0] +sub v12.4s, v8.4s, v14.4s +sqrdmulh v13.4S, v25.4S, v16.s[0] +str q12, [x0, #496] +ldr q12, [x0, #752] +mla v29.4S, v26.4S, v31.s[0] +add v8.4s, v8.4s, v14.4s +sqrdmulh v14.4S, v12.4S, v16.s[0] +str q8, [x0, #480] +ldr q8, [x0, #512] +ldr q26, [x0, #640] +mul v30.4S, v30.4S,v27.s[0] +sub 
v10.4s, v8.4s, v4.4s +ldr q11, [x0, #528] +mul v0.4S, v0.4S,v27.s[0] +add v8.4s, v8.4s, v4.4s +ldr q4, [x0, #656] +mla v30.4S, v23.4S, v31.s[0] +sub v23.4s, v11.4s, v19.4s +ldr q3, [x0, #576] +mla v0.4S, v17.4S, v31.s[0] +add v11.4s, v11.4s, v19.4s +ldr q19, [x0, #704] +mul v25.4S, v25.4S,v22.s[0] +sub v17.4s, v3.4s, v20.4s +ldr q6, [x0, #592] +mul v12.4S, v12.4S,v22.s[0] +add v3.4s, v3.4s, v20.4s +ldr q20, [x0, #720] +mla v25.4S, v13.4S, v31.s[0] +mla v12.4S, v14.4S, v31.s[0] +sub v14.4s, v6.4s, v29.4s +sqrdmulh v13.4S, v11.4S, v9.s[1] +add v6.4s, v6.4s, v29.4s +mul v11.4S, v11.4S,v24.s[1] +sqrdmulh v29.4S, v23.4S, v9.s[2] +sub v7.4s, v26.4s, v30.4s +mul v23.4S, v23.4S,v24.s[2] +add v26.4s, v26.4s, v30.4s +sqrdmulh v9.4S, v6.4S, v15.s[1] +sub v24.4s, v4.4s, v0.4s +mul v6.4S, v6.4S,v2.s[1] +add v4.4s, v4.4s, v0.4s +sqrdmulh v0.4S, v14.4S, v15.s[2] +sub v30.4s, v19.4s, v25.4s +mul v14.4S, v14.4S,v2.s[2] +add v19.4s, v19.4s, v25.4s +mla v11.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v12.4s +ldr q15, [x0, #992] +sqrdmulh v2.4S, v4.4S, v21.s[1] +add v20.4s, v20.4s, v12.4s +mla v23.4S, v29.4S, v31.s[0] +ldr q29, [x0, #928] +sqrdmulh v12.4S, v24.4S, v21.s[2] +sub v25.4s, v8.4s, v11.4s +mla v6.4S, v9.4S, v31.s[0] +ldr q9, [x0, #800] +sqrdmulh v1.4S, v20.4S, v16.s[1] +add v8.4s, v8.4s, v11.4s +str q25, [x0, #528] +mla v14.4S, v0.4S, v31.s[0] +ldr q0, [x17, #+512] +ldr q25, [x17, #+528] +sqrdmulh v11.4S, v13.4S, v16.s[2] +sub v5.4s, v10.4s, v23.4s +str q8, [x0, #512] +mul v4.4S, v4.4S,v27.s[1] +add v10.4s, v10.4s, v23.4s +mul v24.4S, v24.4S,v27.s[2] +str q5, [x0, #560] +mla v4.4S, v2.4S, v31.s[0] +sub v2.4s, v3.4s, v6.4s +mla v24.4S, v12.4S, v31.s[0] +str q10, [x0, #544] +mul v20.4S, v20.4S,v22.s[1] +str q2, [x0, #592] +mul v13.4S, v13.4S,v22.s[2] +add v3.4s, v3.4s, v6.4s +str q3, [x0, #576] +mla v20.4S, v1.4S, v31.s[0] +sub v1.4s, v17.4s, v14.4s +str q1, [x0, #624] +mla v13.4S, v11.4S, v31.s[0] +add v17.4s, v17.4s, v14.4s +str q17, [x0, #608] +sqrdmulh v16.4S, v9.4S, 
v25.s[0] +sub v22.4s, v26.4s, v4.4s +mul v9.4S, v9.4S,v0.s[0] +str q22, [x0, #656] +ldr q22, [x0, #816] +sqrdmulh v17.4S, v22.4S, v25.s[0] +add v26.4s, v26.4s, v4.4s +mul v22.4S, v22.4S,v0.s[0] +str q26, [x0, #640] +ldr q26, [x17, #+544] +ldr q4, [x17, #+560] +ldr q14, [x0, #864] +sqrdmulh v11.4S, v14.4S, v4.s[0] +sub v1.4s, v7.4s, v24.4s +mul v14.4S, v14.4S,v26.s[0] +str q1, [x0, #688] +ldr q1, [x0, #880] +sqrdmulh v3.4S, v1.4S, v4.s[0] +add v7.4s, v7.4s, v24.4s +mul v1.4S, v1.4S,v26.s[0] +str q7, [x0, #672] +ldr q7, [x17, #+576] +ldr q24, [x17, #+592] +mla v9.4S, v16.4S, v31.s[0] +sub v16.4s, v19.4s, v20.4s +sqrdmulh v6.4S, v29.4S, v24.s[0] +str q16, [x0, #720] +ldr q16, [x0, #944] +mla v22.4S, v17.4S, v31.s[0] +add v19.4s, v19.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v24.s[0] +str q19, [x0, #704] +ldr q19, [x17, #+608] +ldr q17, [x17, #+624] +mla v14.4S, v11.4S, v31.s[0] +sub v11.4s, v30.4s, v13.4s +sqrdmulh v2.4S, v15.4S, v17.s[0] +str q11, [x0, #752] +ldr q11, [x0, #1008] +mla v1.4S, v3.4S, v31.s[0] +add v30.4s, v30.4s, v13.4s +sqrdmulh v13.4S, v11.4S, v17.s[0] +str q30, [x0, #736] +ldr q30, [x0, #768] +ldr q3, [x0, #896] +mul v29.4S, v29.4S,v7.s[0] +sub v21.4s, v30.4s, v9.4s +ldr q27, [x0, #784] +mul v16.4S, v16.4S,v7.s[0] +add v30.4s, v30.4s, v9.4s +ldr q9, [x0, #912] +mla v29.4S, v6.4S, v31.s[0] +sub v6.4s, v27.4s, v22.4s +ldr q10, [x0, #832] +mla v16.4S, v20.4S, v31.s[0] +add v27.4s, v27.4s, v22.4s +ldr q22, [x0, #960] +mul v15.4S, v15.4S,v19.s[0] +sub v20.4s, v10.4s, v14.4s +ldr q12, [x0, #848] +mul v11.4S, v11.4S,v19.s[0] +add v10.4s, v10.4s, v14.4s +ldr q14, [x0, #976] +mla v15.4S, v2.4S, v31.s[0] +mla v11.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v1.4s +sqrdmulh v2.4S, v27.4S, v25.s[1] +add v12.4s, v12.4s, v1.4s +mul v27.4S, v27.4S,v0.s[1] +sqrdmulh v1.4S, v6.4S, v25.s[2] +sub v5.4s, v3.4s, v29.4s +mul v6.4S, v6.4S,v0.s[2] +add v3.4s, v3.4s, v29.4s +sqrdmulh v25.4S, v12.4S, v4.s[1] +sub v0.4s, v9.4s, v16.4s +mul v12.4S, v12.4S,v26.s[1] +add v9.4s, v9.4s, 
v16.4s +sqrdmulh v16.4S, v13.4S, v4.s[2] +sub v29.4s, v22.4s, v15.4s +mul v13.4S, v13.4S,v26.s[2] +add v22.4s, v22.4s, v15.4s +mla v27.4S, v2.4S, v31.s[0] +sub v2.4s, v14.4s, v11.4s +sqrdmulh v4.4S, v9.4S, v24.s[1] +add v14.4s, v14.4s, v11.4s +mla v6.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v0.4S, v24.s[2] +sub v11.4s, v30.4s, v27.4s +mla v12.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v14.4S, v17.s[1] +add v30.4s, v30.4s, v27.4s +str q11, [x0, #784] +mla v13.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v2.4S, v17.s[2] +sub v11.4s, v21.4s, v6.4s +str q30, [x0, #768] +mul v9.4S, v9.4S,v7.s[1] +add v21.4s, v21.4s, v6.4s +mul v0.4S, v0.4S,v7.s[2] +str q11, [x0, #816] +mla v9.4S, v4.4S, v31.s[0] +sub v4.4s, v10.4s, v12.4s +mla v0.4S, v1.4S, v31.s[0] +str q21, [x0, #800] +mul v14.4S, v14.4S,v19.s[1] +str q4, [x0, #848] +mul v2.4S, v2.4S,v19.s[2] +add v10.4s, v10.4s, v12.4s +str q10, [x0, #832] +mla v14.4S, v25.4S, v31.s[0] +sub v25.4s, v20.4s, v13.4s +str q25, [x0, #880] +mla v2.4S, v16.4S, v31.s[0] +add v20.4s, v20.4s, v13.4s +str q20, [x0, #864] +sub v17.4s, v3.4s, v9.4s +str q17, [x0, #912] +add v3.4s, v3.4s, v9.4s +str q3, [x0, #896] +sub v3.4s, v5.4s, v0.4s +str q3, [x0, #944] +add v5.4s, v5.4s, v0.4s +str q5, [x0, #928] +sub v5.4s, v22.4s, v14.4s +str q5, [x0, #976] +add v22.4s, v22.4s, v14.4s +str q22, [x0, #960] +sub v22.4s, v29.4s, v2.4s +str q22, [x0, #1008] +add v29.4s, v29.4s, v2.4s +str q29, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1548 +// Instruction count: 1544 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_15_z4_7.s 
b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_15_z4_7.s new file mode 100644 index 0000000..cbb2ab0 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_15_z4_7.s @@ -0,0 +1,1578 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// +#include +modulus: // 16-byte constant; the routine below loads it into v31 and uses v31.s[0] as the multiplier of its mla instructions +.word -33556993 // lane 0: negated modulus, i.e. -q with q = 33556993 +.word 0 // zero filler, completes the 16-byte vector +.word 0 // zero filler +.word 0 // zero filler +.align 6 // advance to a 2^6 = 64-byte boundary before the root table +roots_merged: // Root table read through x17 in 16-byte rows. Each row of four roots is followed by a row of four companion constants, roughly root times 2^31 / q (e.g. 17702291 -> 1132860160). The routine feeds the plain root to mul and the companion to sqrdmulh. Entries tagged Layer None are zero filler for an unused fourth lane. +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global 
ntt_u32_incomplete_neon_asm_var_4_2_15_z4_7 +.global _ntt_u32_incomplete_neon_asm_var_4_2_15_z4_7 +ntt_u32_incomplete_neon_asm_var_4_2_15_z4_7: +_ntt_u32_incomplete_neon_asm_var_4_2_15_z4_7: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x0, #992] +ldr q29, [x17, #+0] +ldr q28, [x17, #+16] +sqrdmulh v27.4S, v30.4S, v28.s[0] +mul v30.4S, v30.4S,v29.s[0] +ldr q26, [x0, #928] +sqrdmulh v25.4S, v26.4S, v28.s[0] +mul v26.4S, v26.4S,v29.s[0] +ldr q24, [x0, #864] +sqrdmulh v23.4S, v24.4S, v28.s[0] +mul v24.4S, v24.4S,v29.s[0] +ldr q22, [x0, #800] +sqrdmulh v21.4S, v22.4S, v28.s[0] +mul v22.4S, v22.4S,v29.s[0] +ldr q20, [x0, #736] +mla v30.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v20.4S, v28.s[0] +ldr q19, [x0, #672] +mla v26.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v19.4S, v28.s[0] +nop +ldr q18, [x0, #608] +mla v24.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v18.4S, v28.s[0] +nop +ldr q17, [x0, #544] +mla v22.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v17.4S, v28.s[0] +nop +ldr q16, [x0, #480] +ldr q3, [x0, #416] +mul v20.4S, v20.4S,v29.s[0] +sub v2.4s, v16.4s, v30.4s +mul v19.4S, v19.4S,v29.s[0] +add v16.4s, v16.4s, v30.4s +ldr q30, [x0, #352] +ldr q1, [x0, #288] +mla v20.4S, v27.4S, v31.s[0] +sub v27.4s, v3.4s, v26.4s +mla v19.4S, v25.4S, v31.s[0] +add v3.4s, v3.4s, v26.4s +ldr q26, [x0, #224] +ldr q25, [x0, #160] +mul v18.4S, v18.4S,v29.s[0] +sub v0.4s, v30.4s, v24.4s +mul v17.4S, v17.4S,v29.s[0] +add v30.4s, v30.4s, v24.4s +ldr q24, [x0, #96] +ldr q15, [x0, #32] +mla v18.4S, v23.4S, v31.s[0] +sub v23.4s, v1.4s, v22.4s +mla v17.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v22.4s 
+sqrdmulh v22.4S, v2.4S, v28.s[2] +nop +mul v2.4S, v2.4S,v29.s[2] +nop +sqrdmulh v21.4S, v27.4S, v28.s[2] +sub v14.4s, v26.4s, v20.4s +mul v27.4S, v27.4S,v29.s[2] +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v28.s[2] +sub v13.4s, v25.4s, v19.4s +mul v0.4S, v0.4S,v29.s[2] +add v25.4s, v25.4s, v19.4s +sqrdmulh v19.4S, v23.4S, v28.s[2] +sub v12.4s, v24.4s, v18.4s +mul v23.4S, v23.4S,v29.s[2] +add v24.4s, v24.4s, v18.4s +mla v2.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v17.4s +sqrdmulh v18.4S, v16.4S, v28.s[1] +add v15.4s, v15.4s, v17.4s +mla v27.4S, v21.4S, v31.s[0] +nop +sqrdmulh v21.4S, v3.4S, v28.s[1] +nop +mla v0.4S, v20.4S, v31.s[0] +nop +sqrdmulh v20.4S, v30.4S, v28.s[1] +nop +mla v23.4S, v19.4S, v31.s[0] +nop +sqrdmulh v19.4S, v1.4S, v28.s[1] +nop +ldr q17, [x17, #+32] +ldr q11, [x17, #+48] +mul v16.4S, v16.4S,v29.s[1] +sub v10.4s, v14.4s, v2.4s +mul v3.4S, v3.4S,v29.s[1] +add v14.4s, v14.4s, v2.4s +mla v16.4S, v18.4S, v31.s[0] +sub v18.4s, v13.4s, v27.4s +mla v3.4S, v21.4S, v31.s[0] +add v13.4s, v13.4s, v27.4s +mul v30.4S, v30.4S,v29.s[1] +sub v27.4s, v12.4s, v0.4s +mul v1.4S, v1.4S,v29.s[1] +add v12.4s, v12.4s, v0.4s +mla v30.4S, v20.4S, v31.s[0] +sub v20.4s, v22.4s, v23.4s +mla v1.4S, v19.4S, v31.s[0] +add v22.4s, v22.4s, v23.4s +sqrdmulh v23.4S, v10.4S, v11.s[3] +nop +mul v10.4S, v10.4S,v17.s[3] +nop +sqrdmulh v19.4S, v18.4S, v11.s[3] +sub v0.4s, v26.4s, v16.4s +mul v18.4S, v18.4S,v17.s[3] +add v26.4s, v26.4s, v16.4s +sqrdmulh v16.4S, v14.4S, v11.s[2] +sub v21.4s, v25.4s, v3.4s +mul v14.4S, v14.4S,v17.s[2] +add v25.4s, v25.4s, v3.4s +sqrdmulh v3.4S, v13.4S, v11.s[2] +sub v2.4s, v24.4s, v30.4s +mul v13.4S, v13.4S,v17.s[2] +add v24.4s, v24.4s, v30.4s +ldr q30, [x17, #+96] +ldr q9, [x17, #+112] +mla v10.4S, v23.4S, v31.s[0] +sub v23.4s, v15.4s, v1.4s +sqrdmulh v8.4S, v0.4S, v11.s[1] +add v15.4s, v15.4s, v1.4s +mla v18.4S, v19.4S, v31.s[0] +nop +sqrdmulh v19.4S, v21.4S, v11.s[1] +nop +mla v14.4S, v16.4S, v31.s[0] +nop +sqrdmulh v16.4S, v26.4S, v11.s[0] 
+nop +mla v13.4S, v3.4S, v31.s[0] +nop +sqrdmulh v3.4S, v25.4S, v11.s[0] +nop +ldr q1, [x17, #+64] +ldr q7, [x17, #+80] +mul v0.4S, v0.4S,v17.s[1] +sub v6.4s, v27.4s, v10.4s +mul v21.4S, v21.4S,v17.s[1] +add v27.4s, v27.4s, v10.4s +mla v0.4S, v8.4S, v31.s[0] +sub v8.4s, v20.4s, v18.4s +mla v21.4S, v19.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +mul v26.4S, v26.4S,v17.s[0] +sub v18.4s, v12.4s, v14.4s +mul v25.4S, v25.4S,v17.s[0] +add v12.4s, v12.4s, v14.4s +mla v26.4S, v16.4S, v31.s[0] +sub v16.4s, v22.4s, v13.4s +mla v25.4S, v3.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v6.4S, v9.s[3] +nop +mul v6.4S, v6.4S,v30.s[3] +nop +sqrdmulh v3.4S, v27.4S, v9.s[2] +sub v14.4s, v2.4s, v0.4s +mul v27.4S, v27.4S,v30.s[2] +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v18.4S, v9.s[1] +sub v19.4s, v23.4s, v21.4s +mul v18.4S, v18.4S,v30.s[1] +add v23.4s, v23.4s, v21.4s +sqrdmulh v21.4S, v12.4S, v9.s[0] +sub v10.4s, v24.4s, v26.4s +mul v12.4S, v12.4S,v30.s[0] +add v24.4s, v24.4s, v26.4s +mla v6.4S, v13.4S, v31.s[0] +sub v13.4s, v15.4s, v25.4s +sqrdmulh v26.4S, v14.4S, v7.s[3] +add v15.4s, v15.4s, v25.4s +mla v27.4S, v3.4S, v31.s[0] +sub v3.4s, v8.4s, v6.4s +sqrdmulh v25.4S, v2.4S, v7.s[2] +add v8.4s, v8.4s, v6.4s +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v20.4s, v27.4s +sqrdmulh v6.4S, v10.4S, v7.s[1] +add v20.4s, v20.4s, v27.4s +mla v12.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v18.4s +sqrdmulh v27.4S, v24.4S, v7.s[0] +add v16.4s, v16.4s, v18.4s +mul v14.4S, v14.4S,v1.s[3] +sub v18.4s, v22.4s, v12.4s +mul v2.4S, v2.4S,v1.s[2] +add v22.4s, v22.4s, v12.4s +mla v14.4S, v26.4S, v31.s[0] +str q3, [x0, #992] +mla v2.4S, v25.4S, v31.s[0] +str q8, [x0, #928] +mul v10.4S, v10.4S,v1.s[1] +str q0, [x0, #864] +mul v24.4S, v24.4S,v1.s[0] +str q20, [x0, #800] +mla v10.4S, v6.4S, v31.s[0] +str q21, [x0, #736] +mla v24.4S, v27.4S, v31.s[0] +str q16, [x0, #672] +ldr q16, [x0, #1008] +sqrdmulh v27.4S, v16.4S, v28.s[0] +str q18, [x0, #608] +mul v16.4S, v16.4S,v29.s[0] +str q22, [x0, #544] 
+ldr q22, [x0, #944] +sqrdmulh v18.4S, v22.4S, v28.s[0] +sub v21.4s, v19.4s, v14.4s +str q21, [x0, #480] +mul v22.4S, v22.4S,v29.s[0] +add v19.4s, v19.4s, v14.4s +ldr q14, [x0, #880] +sqrdmulh v21.4S, v14.4S, v28.s[0] +sub v6.4s, v23.4s, v2.4s +str q19, [x0, #416] +mul v14.4S, v14.4S,v29.s[0] +add v23.4s, v23.4s, v2.4s +ldr q2, [x0, #816] +sqrdmulh v19.4S, v2.4S, v28.s[0] +sub v20.4s, v13.4s, v10.4s +str q6, [x0, #352] +mul v2.4S, v2.4S,v29.s[0] +add v13.4s, v13.4s, v10.4s +ldr q10, [x0, #752] +mla v16.4S, v27.4S, v31.s[0] +sub v27.4s, v15.4s, v24.4s +str q23, [x0, #288] +sqrdmulh v23.4S, v10.4S, v28.s[0] +add v15.4s, v15.4s, v24.4s +ldr q24, [x0, #688] +mla v22.4S, v18.4S, v31.s[0] +str q20, [x0, #224] +sqrdmulh v20.4S, v24.4S, v28.s[0] +nop +ldr q18, [x0, #624] +mla v14.4S, v21.4S, v31.s[0] +str q13, [x0, #160] +sqrdmulh v13.4S, v18.4S, v28.s[0] +nop +ldr q21, [x0, #560] +mla v2.4S, v19.4S, v31.s[0] +str q27, [x0, #96] +sqrdmulh v27.4S, v21.4S, v28.s[0] +nop +ldr q19, [x0, #496] +ldr q6, [x0, #432] +mul v10.4S, v10.4S,v29.s[0] +sub v0.4s, v19.4s, v16.4s +str q15, [x0, #32] +mul v24.4S, v24.4S,v29.s[0] +add v19.4s, v19.4s, v16.4s +ldr q16, [x0, #368] +ldr q15, [x0, #304] +mla v10.4S, v23.4S, v31.s[0] +sub v23.4s, v6.4s, v22.4s +mla v24.4S, v20.4S, v31.s[0] +add v6.4s, v6.4s, v22.4s +ldr q22, [x0, #240] +ldr q20, [x0, #176] +mul v18.4S, v18.4S,v29.s[0] +sub v8.4s, v16.4s, v14.4s +mul v21.4S, v21.4S,v29.s[0] +add v16.4s, v16.4s, v14.4s +ldr q14, [x0, #112] +ldr q25, [x0, #48] +mla v18.4S, v13.4S, v31.s[0] +sub v13.4s, v15.4s, v2.4s +mla v21.4S, v27.4S, v31.s[0] +add v15.4s, v15.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v28.s[2] +nop +mul v0.4S, v0.4S,v29.s[2] +nop +sqrdmulh v27.4S, v23.4S, v28.s[2] +sub v3.4s, v22.4s, v10.4s +mul v23.4S, v23.4S,v29.s[2] +add v22.4s, v22.4s, v10.4s +sqrdmulh v10.4S, v8.4S, v28.s[2] +sub v26.4s, v20.4s, v24.4s +mul v8.4S, v8.4S,v29.s[2] +add v20.4s, v20.4s, v24.4s +sqrdmulh v24.4S, v13.4S, v28.s[2] +sub v12.4s, v14.4s, v18.4s +mul v13.4S, 
v13.4S,v29.s[2] +add v14.4s, v14.4s, v18.4s +mla v0.4S, v2.4S, v31.s[0] +sub v2.4s, v25.4s, v21.4s +sqrdmulh v18.4S, v19.4S, v28.s[1] +add v25.4s, v25.4s, v21.4s +mla v23.4S, v27.4S, v31.s[0] +nop +sqrdmulh v27.4S, v6.4S, v28.s[1] +nop +mla v8.4S, v10.4S, v31.s[0] +nop +sqrdmulh v10.4S, v16.4S, v28.s[1] +nop +mla v13.4S, v24.4S, v31.s[0] +nop +sqrdmulh v24.4S, v15.4S, v28.s[1] +nop +mul v19.4S, v19.4S,v29.s[1] +sub v21.4s, v3.4s, v0.4s +mul v6.4S, v6.4S,v29.s[1] +add v3.4s, v3.4s, v0.4s +mla v19.4S, v18.4S, v31.s[0] +sub v18.4s, v26.4s, v23.4s +mla v6.4S, v27.4S, v31.s[0] +add v26.4s, v26.4s, v23.4s +mul v16.4S, v16.4S,v29.s[1] +sub v23.4s, v12.4s, v8.4s +mul v15.4S, v15.4S,v29.s[1] +add v12.4s, v12.4s, v8.4s +mla v16.4S, v10.4S, v31.s[0] +sub v10.4s, v2.4s, v13.4s +mla v15.4S, v24.4S, v31.s[0] +add v2.4s, v2.4s, v13.4s +sqrdmulh v13.4S, v21.4S, v11.s[3] +nop +mul v21.4S, v21.4S,v17.s[3] +nop +sqrdmulh v24.4S, v18.4S, v11.s[3] +sub v8.4s, v22.4s, v19.4s +mul v18.4S, v18.4S,v17.s[3] +add v22.4s, v22.4s, v19.4s +sqrdmulh v19.4S, v3.4S, v11.s[2] +sub v27.4s, v20.4s, v6.4s +mul v3.4S, v3.4S,v17.s[2] +add v20.4s, v20.4s, v6.4s +sqrdmulh v6.4S, v26.4S, v11.s[2] +sub v0.4s, v14.4s, v16.4s +mul v26.4S, v26.4S,v17.s[2] +add v14.4s, v14.4s, v16.4s +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v25.4s, v15.4s +sqrdmulh v16.4S, v8.4S, v11.s[1] +add v25.4s, v25.4s, v15.4s +mla v18.4S, v24.4S, v31.s[0] +nop +sqrdmulh v24.4S, v27.4S, v11.s[1] +nop +mla v3.4S, v19.4S, v31.s[0] +nop +sqrdmulh v19.4S, v22.4S, v11.s[0] +nop +mla v26.4S, v6.4S, v31.s[0] +nop +sqrdmulh v6.4S, v20.4S, v11.s[0] +nop +mul v8.4S, v8.4S,v17.s[1] +sub v15.4s, v23.4s, v21.4s +mul v27.4S, v27.4S,v17.s[1] +add v23.4s, v23.4s, v21.4s +mla v8.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v18.4s +mla v27.4S, v24.4S, v31.s[0] +add v10.4s, v10.4s, v18.4s +mul v22.4S, v22.4S,v17.s[0] +sub v18.4s, v12.4s, v3.4s +mul v20.4S, v20.4S,v17.s[0] +add v12.4s, v12.4s, v3.4s +mla v22.4S, v19.4S, v31.s[0] +sub v19.4s, v2.4s, v26.4s 
+mla v20.4S, v6.4S, v31.s[0] +add v2.4s, v2.4s, v26.4s +sqrdmulh v26.4S, v15.4S, v9.s[3] +nop +mul v15.4S, v15.4S,v30.s[3] +nop +sqrdmulh v6.4S, v23.4S, v9.s[2] +sub v3.4s, v0.4s, v8.4s +mul v23.4S, v23.4S,v30.s[2] +add v0.4s, v0.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v9.s[1] +sub v24.4s, v13.4s, v27.4s +mul v18.4S, v18.4S,v30.s[1] +add v13.4s, v13.4s, v27.4s +sqrdmulh v27.4S, v12.4S, v9.s[0] +sub v21.4s, v14.4s, v22.4s +mul v12.4S, v12.4S,v30.s[0] +add v14.4s, v14.4s, v22.4s +mla v15.4S, v26.4S, v31.s[0] +sub v26.4s, v25.4s, v20.4s +sqrdmulh v22.4S, v3.4S, v7.s[3] +add v25.4s, v25.4s, v20.4s +mla v23.4S, v6.4S, v31.s[0] +sub v6.4s, v16.4s, v15.4s +sqrdmulh v20.4S, v0.4S, v7.s[2] +add v16.4s, v16.4s, v15.4s +mla v18.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v23.4s +sqrdmulh v15.4S, v21.4S, v7.s[1] +add v10.4s, v10.4s, v23.4s +mla v12.4S, v27.4S, v31.s[0] +sub v27.4s, v19.4s, v18.4s +sqrdmulh v23.4S, v14.4S, v7.s[0] +add v19.4s, v19.4s, v18.4s +mul v3.4S, v3.4S,v1.s[3] +sub v18.4s, v2.4s, v12.4s +mul v0.4S, v0.4S,v1.s[2] +add v2.4s, v2.4s, v12.4s +mla v3.4S, v22.4S, v31.s[0] +str q6, [x0, #1008] +mla v0.4S, v20.4S, v31.s[0] +str q16, [x0, #944] +mul v21.4S, v21.4S,v1.s[1] +str q8, [x0, #880] +mul v14.4S, v14.4S,v1.s[0] +str q10, [x0, #816] +mla v21.4S, v15.4S, v31.s[0] +str q27, [x0, #752] +mla v14.4S, v23.4S, v31.s[0] +str q19, [x0, #688] +ldr q19, [x0, #960] +sqrdmulh v23.4S, v19.4S, v28.s[0] +str q18, [x0, #624] +mul v19.4S, v19.4S,v29.s[0] +str q2, [x0, #560] +ldr q2, [x0, #896] +sqrdmulh v18.4S, v2.4S, v28.s[0] +sub v27.4s, v24.4s, v3.4s +str q27, [x0, #496] +mul v2.4S, v2.4S,v29.s[0] +add v24.4s, v24.4s, v3.4s +ldr q3, [x0, #832] +sqrdmulh v27.4S, v3.4S, v28.s[0] +sub v15.4s, v13.4s, v0.4s +str q24, [x0, #432] +mul v3.4S, v3.4S,v29.s[0] +add v13.4s, v13.4s, v0.4s +ldr q0, [x0, #768] +sqrdmulh v24.4S, v0.4S, v28.s[0] +sub v10.4s, v26.4s, v21.4s +str q15, [x0, #368] +mul v0.4S, v0.4S,v29.s[0] +add v26.4s, v26.4s, v21.4s +ldr q21, [x0, #704] +mla v19.4S, v23.4S, 
v31.s[0] +sub v23.4s, v25.4s, v14.4s +str q13, [x0, #304] +sqrdmulh v13.4S, v21.4S, v28.s[0] +add v25.4s, v25.4s, v14.4s +ldr q14, [x0, #640] +mla v2.4S, v18.4S, v31.s[0] +str q10, [x0, #240] +sqrdmulh v10.4S, v14.4S, v28.s[0] +nop +ldr q18, [x0, #576] +mla v3.4S, v27.4S, v31.s[0] +str q26, [x0, #176] +sqrdmulh v26.4S, v18.4S, v28.s[0] +nop +ldr q27, [x0, #512] +mla v0.4S, v24.4S, v31.s[0] +str q23, [x0, #112] +sqrdmulh v23.4S, v27.4S, v28.s[0] +nop +ldr q24, [x0, #448] +ldr q15, [x0, #384] +mul v21.4S, v21.4S,v29.s[0] +sub v8.4s, v24.4s, v19.4s +str q25, [x0, #48] +mul v14.4S, v14.4S,v29.s[0] +add v24.4s, v24.4s, v19.4s +ldr q19, [x0, #320] +ldr q25, [x0, #256] +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v15.4s, v2.4s +mla v14.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v2.4s +ldr q2, [x0, #192] +ldr q10, [x0, #128] +mul v18.4S, v18.4S,v29.s[0] +sub v16.4s, v19.4s, v3.4s +mul v27.4S, v27.4S,v29.s[0] +add v19.4s, v19.4s, v3.4s +ldr q3, [x0, #64] +ldr q20, [x0, #0] +mla v18.4S, v26.4S, v31.s[0] +sub v26.4s, v25.4s, v0.4s +mla v27.4S, v23.4S, v31.s[0] +add v25.4s, v25.4s, v0.4s +sqrdmulh v0.4S, v8.4S, v28.s[2] +nop +mul v8.4S, v8.4S,v29.s[2] +nop +sqrdmulh v23.4S, v13.4S, v28.s[2] +sub v6.4s, v2.4s, v21.4s +mul v13.4S, v13.4S,v29.s[2] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v16.4S, v28.s[2] +sub v22.4s, v10.4s, v14.4s +mul v16.4S, v16.4S,v29.s[2] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v26.4S, v28.s[2] +sub v12.4s, v3.4s, v18.4s +mul v26.4S, v26.4S,v29.s[2] +add v3.4s, v3.4s, v18.4s +mla v8.4S, v0.4S, v31.s[0] +sub v0.4s, v20.4s, v27.4s +sqrdmulh v18.4S, v24.4S, v28.s[1] +add v20.4s, v20.4s, v27.4s +mla v13.4S, v23.4S, v31.s[0] +nop +sqrdmulh v23.4S, v15.4S, v28.s[1] +nop +mla v16.4S, v21.4S, v31.s[0] +nop +sqrdmulh v21.4S, v19.4S, v28.s[1] +nop +mla v26.4S, v14.4S, v31.s[0] +nop +sqrdmulh v14.4S, v25.4S, v28.s[1] +nop +mul v24.4S, v24.4S,v29.s[1] +sub v27.4s, v6.4s, v8.4s +mul v15.4S, v15.4S,v29.s[1] +add v6.4s, v6.4s, v8.4s +mla v24.4S, v18.4S, v31.s[0] +sub 
v18.4s, v22.4s, v13.4s +mla v15.4S, v23.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +mul v19.4S, v19.4S,v29.s[1] +sub v13.4s, v12.4s, v16.4s +mul v25.4S, v25.4S,v29.s[1] +add v12.4s, v12.4s, v16.4s +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v26.4s +mla v25.4S, v14.4S, v31.s[0] +add v0.4s, v0.4s, v26.4s +sqrdmulh v26.4S, v27.4S, v11.s[3] +nop +mul v27.4S, v27.4S,v17.s[3] +nop +sqrdmulh v14.4S, v18.4S, v11.s[3] +sub v16.4s, v2.4s, v24.4s +mul v18.4S, v18.4S,v17.s[3] +add v2.4s, v2.4s, v24.4s +sqrdmulh v24.4S, v6.4S, v11.s[2] +sub v23.4s, v10.4s, v15.4s +mul v6.4S, v6.4S,v17.s[2] +add v10.4s, v10.4s, v15.4s +sqrdmulh v15.4S, v22.4S, v11.s[2] +sub v8.4s, v3.4s, v19.4s +mul v22.4S, v22.4S,v17.s[2] +add v3.4s, v3.4s, v19.4s +mla v27.4S, v26.4S, v31.s[0] +sub v26.4s, v20.4s, v25.4s +sqrdmulh v19.4S, v16.4S, v11.s[1] +add v20.4s, v20.4s, v25.4s +mla v18.4S, v14.4S, v31.s[0] +nop +sqrdmulh v14.4S, v23.4S, v11.s[1] +nop +mla v6.4S, v24.4S, v31.s[0] +nop +sqrdmulh v24.4S, v2.4S, v11.s[0] +nop +mla v22.4S, v15.4S, v31.s[0] +nop +sqrdmulh v15.4S, v10.4S, v11.s[0] +nop +mul v16.4S, v16.4S,v17.s[1] +sub v25.4s, v13.4s, v27.4s +mul v23.4S, v23.4S,v17.s[1] +add v13.4s, v13.4s, v27.4s +mla v16.4S, v19.4S, v31.s[0] +sub v19.4s, v21.4s, v18.4s +mla v23.4S, v14.4S, v31.s[0] +add v21.4s, v21.4s, v18.4s +mul v2.4S, v2.4S,v17.s[0] +sub v18.4s, v12.4s, v6.4s +mul v10.4S, v10.4S,v17.s[0] +add v12.4s, v12.4s, v6.4s +mla v2.4S, v24.4S, v31.s[0] +sub v24.4s, v0.4s, v22.4s +mla v10.4S, v15.4S, v31.s[0] +add v0.4s, v0.4s, v22.4s +sqrdmulh v22.4S, v25.4S, v9.s[3] +nop +mul v25.4S, v25.4S,v30.s[3] +nop +sqrdmulh v15.4S, v13.4S, v9.s[2] +sub v6.4s, v8.4s, v16.4s +mul v13.4S, v13.4S,v30.s[2] +add v8.4s, v8.4s, v16.4s +sqrdmulh v16.4S, v18.4S, v9.s[1] +sub v14.4s, v26.4s, v23.4s +mul v18.4S, v18.4S,v30.s[1] +add v26.4s, v26.4s, v23.4s +sqrdmulh v23.4S, v12.4S, v9.s[0] +sub v27.4s, v3.4s, v2.4s +mul v12.4S, v12.4S,v30.s[0] +add v3.4s, v3.4s, v2.4s +mla v25.4S, v22.4S, v31.s[0] +sub v22.4s, v20.4s, 
v10.4s +sqrdmulh v2.4S, v6.4S, v7.s[3] +add v20.4s, v20.4s, v10.4s +mla v13.4S, v15.4S, v31.s[0] +sub v15.4s, v19.4s, v25.4s +sqrdmulh v10.4S, v8.4S, v7.s[2] +add v19.4s, v19.4s, v25.4s +mla v18.4S, v16.4S, v31.s[0] +sub v16.4s, v21.4s, v13.4s +sqrdmulh v25.4S, v27.4S, v7.s[1] +add v21.4s, v21.4s, v13.4s +mla v12.4S, v23.4S, v31.s[0] +sub v23.4s, v24.4s, v18.4s +sqrdmulh v13.4S, v3.4S, v7.s[0] +add v24.4s, v24.4s, v18.4s +mul v6.4S, v6.4S,v1.s[3] +sub v18.4s, v0.4s, v12.4s +mul v8.4S, v8.4S,v1.s[2] +add v0.4s, v0.4s, v12.4s +mla v6.4S, v2.4S, v31.s[0] +str q15, [x0, #960] +mla v8.4S, v10.4S, v31.s[0] +str q19, [x0, #896] +mul v27.4S, v27.4S,v1.s[1] +str q16, [x0, #832] +mul v3.4S, v3.4S,v1.s[0] +str q21, [x0, #768] +mla v27.4S, v25.4S, v31.s[0] +str q23, [x0, #704] +mla v3.4S, v13.4S, v31.s[0] +str q24, [x0, #640] +ldr q24, [x0, #976] +sqrdmulh v13.4S, v24.4S, v28.s[0] +str q18, [x0, #576] +mul v24.4S, v24.4S,v29.s[0] +str q0, [x0, #512] +ldr q0, [x0, #912] +sqrdmulh v18.4S, v0.4S, v28.s[0] +sub v23.4s, v14.4s, v6.4s +str q23, [x0, #448] +mul v0.4S, v0.4S,v29.s[0] +add v14.4s, v14.4s, v6.4s +ldr q6, [x0, #848] +sqrdmulh v23.4S, v6.4S, v28.s[0] +sub v25.4s, v26.4s, v8.4s +str q14, [x0, #384] +mul v6.4S, v6.4S,v29.s[0] +add v26.4s, v26.4s, v8.4s +ldr q8, [x0, #784] +sqrdmulh v14.4S, v8.4S, v28.s[0] +sub v21.4s, v22.4s, v27.4s +str q25, [x0, #320] +mul v8.4S, v8.4S,v29.s[0] +add v22.4s, v22.4s, v27.4s +ldr q27, [x0, #720] +mla v24.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v3.4s +str q26, [x0, #256] +sqrdmulh v26.4S, v27.4S, v28.s[0] +add v20.4s, v20.4s, v3.4s +ldr q3, [x0, #656] +mla v0.4S, v18.4S, v31.s[0] +str q21, [x0, #192] +sqrdmulh v21.4S, v3.4S, v28.s[0] +nop +ldr q18, [x0, #592] +mla v6.4S, v23.4S, v31.s[0] +str q22, [x0, #128] +sqrdmulh v22.4S, v18.4S, v28.s[0] +nop +ldr q23, [x0, #528] +mla v8.4S, v14.4S, v31.s[0] +str q13, [x0, #64] +sqrdmulh v13.4S, v23.4S, v28.s[0] +nop +ldr q14, [x0, #464] +ldr q25, [x0, #400] +mul v27.4S, v27.4S,v29.s[0] +sub v16.4s, 
v14.4s, v24.4s +str q20, [x0, #0] +mul v3.4S, v3.4S,v29.s[0] +add v14.4s, v14.4s, v24.4s +ldr q24, [x0, #336] +ldr q20, [x0, #272] +mla v27.4S, v26.4S, v31.s[0] +sub v26.4s, v25.4s, v0.4s +mla v3.4S, v21.4S, v31.s[0] +add v25.4s, v25.4s, v0.4s +ldr q0, [x0, #208] +ldr q21, [x0, #144] +mul v18.4S, v18.4S,v29.s[0] +sub v19.4s, v24.4s, v6.4s +mul v23.4S, v23.4S,v29.s[0] +add v24.4s, v24.4s, v6.4s +ldr q6, [x0, #80] +ldr q10, [x0, #16] +mla v18.4S, v22.4S, v31.s[0] +sub v22.4s, v20.4s, v8.4s +mla v23.4S, v13.4S, v31.s[0] +add v20.4s, v20.4s, v8.4s +sqrdmulh v8.4S, v16.4S, v28.s[2] +nop +mul v16.4S, v16.4S,v29.s[2] +nop +sqrdmulh v13.4S, v26.4S, v28.s[2] +sub v15.4s, v0.4s, v27.4s +mul v26.4S, v26.4S,v29.s[2] +add v0.4s, v0.4s, v27.4s +sqrdmulh v27.4S, v19.4S, v28.s[2] +sub v2.4s, v21.4s, v3.4s +mul v19.4S, v19.4S,v29.s[2] +add v21.4s, v21.4s, v3.4s +sqrdmulh v3.4S, v22.4S, v28.s[2] +sub v12.4s, v6.4s, v18.4s +mul v22.4S, v22.4S,v29.s[2] +add v6.4s, v6.4s, v18.4s +mla v16.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v23.4s +sqrdmulh v18.4S, v14.4S, v28.s[1] +add v10.4s, v10.4s, v23.4s +mla v26.4S, v13.4S, v31.s[0] +nop +sqrdmulh v13.4S, v25.4S, v28.s[1] +nop +mla v19.4S, v27.4S, v31.s[0] +nop +sqrdmulh v27.4S, v24.4S, v28.s[1] +nop +mla v22.4S, v3.4S, v31.s[0] +nop +sqrdmulh v3.4S, v20.4S, v28.s[1] +nop +mul v14.4S, v14.4S,v29.s[1] +sub v23.4s, v15.4s, v16.4s +mul v25.4S, v25.4S,v29.s[1] +add v15.4s, v15.4s, v16.4s +mla v14.4S, v18.4S, v31.s[0] +sub v18.4s, v2.4s, v26.4s +mla v25.4S, v13.4S, v31.s[0] +add v2.4s, v2.4s, v26.4s +mul v24.4S, v24.4S,v29.s[1] +sub v26.4s, v12.4s, v19.4s +mul v20.4S, v20.4S,v29.s[1] +add v12.4s, v12.4s, v19.4s +mla v24.4S, v27.4S, v31.s[0] +sub v27.4s, v8.4s, v22.4s +mla v20.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v22.4s +sqrdmulh v28.4S, v23.4S, v11.s[3] +nop +mul v23.4S, v23.4S,v17.s[3] +nop +sqrdmulh v29.4S, v18.4S, v11.s[3] +sub v22.4s, v0.4s, v14.4s +mul v18.4S, v18.4S,v17.s[3] +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v11.s[2] +sub 
v3.4s, v21.4s, v25.4s +mul v15.4S, v15.4S,v17.s[2] +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v2.4S, v11.s[2] +sub v19.4s, v6.4s, v24.4s +mul v2.4S, v2.4S,v17.s[2] +add v6.4s, v6.4s, v24.4s +mla v23.4S, v28.4S, v31.s[0] +sub v28.4s, v10.4s, v20.4s +sqrdmulh v24.4S, v22.4S, v11.s[1] +add v10.4s, v10.4s, v20.4s +mla v18.4S, v29.4S, v31.s[0] +nop +sqrdmulh v29.4S, v3.4S, v11.s[1] +nop +mla v15.4S, v14.4S, v31.s[0] +nop +sqrdmulh v14.4S, v0.4S, v11.s[0] +nop +mla v2.4S, v25.4S, v31.s[0] +nop +sqrdmulh v25.4S, v21.4S, v11.s[0] +nop +mul v22.4S, v22.4S,v17.s[1] +sub v20.4s, v26.4s, v23.4s +mul v3.4S, v3.4S,v17.s[1] +add v26.4s, v26.4s, v23.4s +mla v22.4S, v24.4S, v31.s[0] +sub v24.4s, v27.4s, v18.4s +mla v3.4S, v29.4S, v31.s[0] +add v27.4s, v27.4s, v18.4s +mul v0.4S, v0.4S,v17.s[0] +sub v18.4s, v12.4s, v15.4s +mul v21.4S, v21.4S,v17.s[0] +add v12.4s, v12.4s, v15.4s +mla v0.4S, v14.4S, v31.s[0] +sub v14.4s, v8.4s, v2.4s +mla v21.4S, v25.4S, v31.s[0] +add v8.4s, v8.4s, v2.4s +sqrdmulh v11.4S, v20.4S, v9.s[3] +nop +mul v20.4S, v20.4S,v30.s[3] +nop +sqrdmulh v17.4S, v26.4S, v9.s[2] +sub v2.4s, v19.4s, v22.4s +mul v26.4S, v26.4S,v30.s[2] +add v19.4s, v19.4s, v22.4s +sqrdmulh v22.4S, v18.4S, v9.s[1] +sub v25.4s, v28.4s, v3.4s +mul v18.4S, v18.4S,v30.s[1] +add v28.4s, v28.4s, v3.4s +sqrdmulh v3.4S, v12.4S, v9.s[0] +sub v15.4s, v6.4s, v0.4s +mul v12.4S, v12.4S,v30.s[0] +add v6.4s, v6.4s, v0.4s +mla v20.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v21.4s +sqrdmulh v9.4S, v2.4S, v7.s[3] +add v10.4s, v10.4s, v21.4s +mla v26.4S, v17.4S, v31.s[0] +sub v17.4s, v24.4s, v20.4s +sqrdmulh v21.4S, v19.4S, v7.s[2] +add v24.4s, v24.4s, v20.4s +mla v18.4S, v22.4S, v31.s[0] +sub v22.4s, v27.4s, v26.4s +sqrdmulh v20.4S, v15.4S, v7.s[1] +add v27.4s, v27.4s, v26.4s +mla v12.4S, v3.4S, v31.s[0] +sub v3.4s, v14.4s, v18.4s +sqrdmulh v26.4S, v6.4S, v7.s[0] +add v14.4s, v14.4s, v18.4s +mul v2.4S, v2.4S,v1.s[3] +sub v18.4s, v8.4s, v12.4s +mul v19.4S, v19.4S,v1.s[2] +add v8.4s, v8.4s, v12.4s +mla v2.4S, 
v9.4S, v31.s[0] +str q17, [x0, #976] +mla v19.4S, v21.4S, v31.s[0] +str q24, [x0, #912] +mul v15.4S, v15.4S,v1.s[1] +str q22, [x0, #848] +mul v6.4S, v6.4S,v1.s[0] +str q27, [x0, #784] +mla v15.4S, v20.4S, v31.s[0] +str q3, [x0, #720] +mla v6.4S, v26.4S, v31.s[0] +str q14, [x0, #656] +str q18, [x0, #592] +str q8, [x0, #528] +sub v8.4s, v25.4s, v2.4s +str q8, [x0, #464] +add v25.4s, v25.4s, v2.4s +sub v2.4s, v28.4s, v19.4s +str q25, [x0, #400] +add v28.4s, v28.4s, v19.4s +sub v19.4s, v11.4s, v15.4s +str q2, [x0, #336] +add v11.4s, v11.4s, v15.4s +sub v15.4s, v10.4s, v6.4s +str q28, [x0, #272] +add v10.4s, v10.4s, v6.4s +str q19, [x0, #208] +str q11, [x0, #144] +str q15, [x0, #80] +str q10, [x0, #16] +ldr q4, [x0, #224] +ldr q5, [x0, #160] +ldr q16, [x0, #32] +ldr q13, [x17, #+128] +ldr q23, [x17, #+144] +sqrdmulh v29.4S, v16.4S, v23.s[0] +mul v16.4S, v16.4S,v13.s[0] +ldr q0, [x0, #48] +sqrdmulh v30.4S, v0.4S, v23.s[0] +mul v0.4S, v0.4S,v13.s[0] +ldr q12, [x17, #+160] +ldr q9, [x17, #+176] +ldr q17, [x0, #96] +sqrdmulh v21.4S, v17.4S, v9.s[0] +mul v17.4S, v17.4S,v12.s[0] +ldr q24, [x0, #112] +sqrdmulh v22.4S, v24.4S, v9.s[0] +mul v24.4S, v24.4S,v12.s[0] +ldr q27, [x17, #+192] +ldr q20, [x17, #+208] +mla v16.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v5.4S, v20.s[0] +ldr q3, [x0, #176] +mla v0.4S, v30.4S, v31.s[0] +sqrdmulh v30.4S, v3.4S, v20.s[0] +ldr q26, [x17, #+224] +ldr q14, [x17, #+240] +mla v17.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v4.4S, v14.s[0] +ldr q1, [x0, #240] +mla v24.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v1.4S, v14.s[0] +ldr q7, [x0, #0] +ldr q18, [x0, #128] +mul v5.4S, v5.4S,v27.s[0] +sub v8.4s, v7.4s, v16.4s +ldr q25, [x0, #16] +mul v3.4S, v3.4S,v27.s[0] +add v7.4s, v7.4s, v16.4s +ldr q16, [x0, #144] +mla v5.4S, v29.4S, v31.s[0] +sub v29.4s, v25.4s, v0.4s +ldr q2, [x0, #64] +mla v3.4S, v30.4S, v31.s[0] +add v25.4s, v25.4s, v0.4s +ldr q0, [x0, #192] +mul v4.4S, v4.4S,v26.s[0] +sub v30.4s, v2.4s, v17.4s +ldr q28, [x0, #80] +mul v1.4S, v1.4S,v26.s[0] +add 
v2.4s, v2.4s, v17.4s +ldr q17, [x0, #208] +mla v4.4S, v21.4S, v31.s[0] +mla v1.4S, v22.4S, v31.s[0] +sub v22.4s, v28.4s, v24.4s +sqrdmulh v21.4S, v25.4S, v23.s[1] +add v28.4s, v28.4s, v24.4s +mul v25.4S, v25.4S,v13.s[1] +sqrdmulh v24.4S, v29.4S, v23.s[2] +sub v6.4s, v18.4s, v5.4s +mul v29.4S, v29.4S,v13.s[2] +add v18.4s, v18.4s, v5.4s +sqrdmulh v23.4S, v28.4S, v9.s[1] +sub v13.4s, v16.4s, v3.4s +mul v28.4S, v28.4S,v12.s[1] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v22.4S, v9.s[2] +sub v5.4s, v0.4s, v4.4s +mul v22.4S, v22.4S,v12.s[2] +add v0.4s, v0.4s, v4.4s +mla v25.4S, v21.4S, v31.s[0] +sub v21.4s, v17.4s, v1.4s +ldr q9, [x0, #480] +sqrdmulh v12.4S, v16.4S, v20.s[1] +add v17.4s, v17.4s, v1.4s +mla v29.4S, v24.4S, v31.s[0] +ldr q24, [x0, #416] +sqrdmulh v1.4S, v13.4S, v20.s[2] +sub v4.4s, v7.4s, v25.4s +mla v28.4S, v23.4S, v31.s[0] +ldr q23, [x0, #288] +sqrdmulh v19.4S, v17.4S, v14.s[1] +add v7.4s, v7.4s, v25.4s +str q4, [x0, #16] +mla v22.4S, v3.4S, v31.s[0] +ldr q3, [x17, #+256] +ldr q4, [x17, #+272] +sqrdmulh v25.4S, v21.4S, v14.s[2] +sub v11.4s, v8.4s, v29.4s +str q7, [x0, #0] +mul v16.4S, v16.4S,v27.s[1] +add v8.4s, v8.4s, v29.4s +mul v13.4S, v13.4S,v27.s[2] +str q11, [x0, #48] +mla v16.4S, v12.4S, v31.s[0] +sub v12.4s, v2.4s, v28.4s +mla v13.4S, v1.4S, v31.s[0] +str q8, [x0, #32] +mul v17.4S, v17.4S,v26.s[1] +str q12, [x0, #80] +mul v21.4S, v21.4S,v26.s[2] +add v2.4s, v2.4s, v28.4s +str q2, [x0, #64] +mla v17.4S, v19.4S, v31.s[0] +sub v19.4s, v30.4s, v22.4s +str q19, [x0, #112] +mla v21.4S, v25.4S, v31.s[0] +add v30.4s, v30.4s, v22.4s +str q30, [x0, #96] +sqrdmulh v14.4S, v23.4S, v4.s[0] +sub v26.4s, v18.4s, v16.4s +mul v23.4S, v23.4S,v3.s[0] +str q26, [x0, #144] +ldr q26, [x0, #304] +sqrdmulh v30.4S, v26.4S, v4.s[0] +add v18.4s, v18.4s, v16.4s +mul v26.4S, v26.4S,v3.s[0] +str q18, [x0, #128] +ldr q18, [x17, #+288] +ldr q16, [x17, #+304] +ldr q22, [x0, #352] +sqrdmulh v25.4S, v22.4S, v16.s[0] +sub v19.4s, v6.4s, v13.4s +mul v22.4S, v22.4S,v18.s[0] +str q19, 
[x0, #176] +ldr q19, [x0, #368] +sqrdmulh v2.4S, v19.4S, v16.s[0] +add v6.4s, v6.4s, v13.4s +mul v19.4S, v19.4S,v18.s[0] +str q6, [x0, #160] +ldr q6, [x17, #+320] +ldr q13, [x17, #+336] +mla v23.4S, v14.4S, v31.s[0] +sub v14.4s, v0.4s, v17.4s +sqrdmulh v28.4S, v24.4S, v13.s[0] +str q14, [x0, #208] +ldr q14, [x0, #432] +mla v26.4S, v30.4S, v31.s[0] +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v13.s[0] +str q0, [x0, #192] +ldr q0, [x17, #+352] +ldr q30, [x17, #+368] +mla v22.4S, v25.4S, v31.s[0] +sub v25.4s, v5.4s, v21.4s +sqrdmulh v12.4S, v9.4S, v30.s[0] +str q25, [x0, #240] +ldr q25, [x0, #496] +mla v19.4S, v2.4S, v31.s[0] +add v5.4s, v5.4s, v21.4s +sqrdmulh v21.4S, v25.4S, v30.s[0] +str q5, [x0, #224] +ldr q5, [x0, #256] +ldr q2, [x0, #384] +mul v24.4S, v24.4S,v6.s[0] +sub v20.4s, v5.4s, v23.4s +ldr q27, [x0, #272] +mul v14.4S, v14.4S,v6.s[0] +add v5.4s, v5.4s, v23.4s +ldr q23, [x0, #400] +mla v24.4S, v28.4S, v31.s[0] +sub v28.4s, v27.4s, v26.4s +ldr q8, [x0, #320] +mla v14.4S, v17.4S, v31.s[0] +add v27.4s, v27.4s, v26.4s +ldr q26, [x0, #448] +mul v9.4S, v9.4S,v0.s[0] +sub v17.4s, v8.4s, v22.4s +ldr q1, [x0, #336] +mul v25.4S, v25.4S,v0.s[0] +add v8.4s, v8.4s, v22.4s +ldr q22, [x0, #464] +mla v9.4S, v12.4S, v31.s[0] +mla v25.4S, v21.4S, v31.s[0] +sub v21.4s, v1.4s, v19.4s +sqrdmulh v12.4S, v27.4S, v4.s[1] +add v1.4s, v1.4s, v19.4s +mul v27.4S, v27.4S,v3.s[1] +sqrdmulh v19.4S, v28.4S, v4.s[2] +sub v11.4s, v2.4s, v24.4s +mul v28.4S, v28.4S,v3.s[2] +add v2.4s, v2.4s, v24.4s +sqrdmulh v4.4S, v1.4S, v16.s[1] +sub v3.4s, v23.4s, v14.4s +mul v1.4S, v1.4S,v18.s[1] +add v23.4s, v23.4s, v14.4s +sqrdmulh v14.4S, v21.4S, v16.s[2] +sub v24.4s, v26.4s, v9.4s +mul v21.4S, v21.4S,v18.s[2] +add v26.4s, v26.4s, v9.4s +mla v27.4S, v12.4S, v31.s[0] +sub v12.4s, v22.4s, v25.4s +ldr q16, [x0, #736] +sqrdmulh v18.4S, v23.4S, v13.s[1] +add v22.4s, v22.4s, v25.4s +mla v28.4S, v19.4S, v31.s[0] +ldr q19, [x0, #672] +sqrdmulh v25.4S, v3.4S, v13.s[2] +sub v9.4s, v5.4s, v27.4s +mla 
v1.4S, v4.4S, v31.s[0] +ldr q4, [x0, #544] +sqrdmulh v29.4S, v22.4S, v30.s[1] +add v5.4s, v5.4s, v27.4s +str q9, [x0, #272] +mla v21.4S, v14.4S, v31.s[0] +ldr q14, [x17, #+384] +ldr q9, [x17, #+400] +sqrdmulh v27.4S, v12.4S, v30.s[2] +sub v7.4s, v20.4s, v28.4s +str q5, [x0, #256] +mul v23.4S, v23.4S,v6.s[1] +add v20.4s, v20.4s, v28.4s +mul v3.4S, v3.4S,v6.s[2] +str q7, [x0, #304] +mla v23.4S, v18.4S, v31.s[0] +sub v18.4s, v8.4s, v1.4s +mla v3.4S, v25.4S, v31.s[0] +str q20, [x0, #288] +mul v22.4S, v22.4S,v0.s[1] +str q18, [x0, #336] +mul v12.4S, v12.4S,v0.s[2] +add v8.4s, v8.4s, v1.4s +str q8, [x0, #320] +mla v22.4S, v29.4S, v31.s[0] +sub v29.4s, v17.4s, v21.4s +str q29, [x0, #368] +mla v12.4S, v27.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +str q17, [x0, #352] +sqrdmulh v30.4S, v4.4S, v9.s[0] +sub v0.4s, v2.4s, v23.4s +mul v4.4S, v4.4S,v14.s[0] +str q0, [x0, #400] +ldr q0, [x0, #560] +sqrdmulh v17.4S, v0.4S, v9.s[0] +add v2.4s, v2.4s, v23.4s +mul v0.4S, v0.4S,v14.s[0] +str q2, [x0, #384] +ldr q2, [x17, #+416] +ldr q23, [x17, #+432] +ldr q21, [x0, #608] +sqrdmulh v27.4S, v21.4S, v23.s[0] +sub v29.4s, v11.4s, v3.4s +mul v21.4S, v21.4S,v2.s[0] +str q29, [x0, #432] +ldr q29, [x0, #624] +sqrdmulh v8.4S, v29.4S, v23.s[0] +add v11.4s, v11.4s, v3.4s +mul v29.4S, v29.4S,v2.s[0] +str q11, [x0, #416] +ldr q11, [x17, #+448] +ldr q3, [x17, #+464] +mla v4.4S, v30.4S, v31.s[0] +sub v30.4s, v26.4s, v22.4s +sqrdmulh v1.4S, v19.4S, v3.s[0] +str q30, [x0, #464] +ldr q30, [x0, #688] +mla v0.4S, v17.4S, v31.s[0] +add v26.4s, v26.4s, v22.4s +sqrdmulh v22.4S, v30.4S, v3.s[0] +str q26, [x0, #448] +ldr q26, [x17, #+480] +ldr q17, [x17, #+496] +mla v21.4S, v27.4S, v31.s[0] +sub v27.4s, v24.4s, v12.4s +sqrdmulh v18.4S, v16.4S, v17.s[0] +str q27, [x0, #496] +ldr q27, [x0, #752] +mla v29.4S, v8.4S, v31.s[0] +add v24.4s, v24.4s, v12.4s +sqrdmulh v12.4S, v27.4S, v17.s[0] +str q24, [x0, #480] +ldr q24, [x0, #512] +ldr q8, [x0, #640] +mul v19.4S, v19.4S,v11.s[0] +sub v13.4s, v24.4s, v4.4s +ldr q6, 
[x0, #528] +mul v30.4S, v30.4S,v11.s[0] +add v24.4s, v24.4s, v4.4s +ldr q4, [x0, #656] +mla v19.4S, v1.4S, v31.s[0] +sub v1.4s, v6.4s, v0.4s +ldr q20, [x0, #576] +mla v30.4S, v22.4S, v31.s[0] +add v6.4s, v6.4s, v0.4s +ldr q0, [x0, #704] +mul v16.4S, v16.4S,v26.s[0] +sub v22.4s, v20.4s, v21.4s +ldr q25, [x0, #592] +mul v27.4S, v27.4S,v26.s[0] +add v20.4s, v20.4s, v21.4s +ldr q21, [x0, #720] +mla v16.4S, v18.4S, v31.s[0] +mla v27.4S, v12.4S, v31.s[0] +sub v12.4s, v25.4s, v29.4s +sqrdmulh v18.4S, v6.4S, v9.s[1] +add v25.4s, v25.4s, v29.4s +mul v6.4S, v6.4S,v14.s[1] +sqrdmulh v29.4S, v1.4S, v9.s[2] +sub v7.4s, v8.4s, v19.4s +mul v1.4S, v1.4S,v14.s[2] +add v8.4s, v8.4s, v19.4s +sqrdmulh v9.4S, v25.4S, v23.s[1] +sub v14.4s, v4.4s, v30.4s +mul v25.4S, v25.4S,v2.s[1] +add v4.4s, v4.4s, v30.4s +sqrdmulh v30.4S, v12.4S, v23.s[2] +sub v19.4s, v0.4s, v16.4s +mul v12.4S, v12.4S,v2.s[2] +add v0.4s, v0.4s, v16.4s +mla v6.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v27.4s +ldr q23, [x0, #992] +sqrdmulh v2.4S, v4.4S, v3.s[1] +add v21.4s, v21.4s, v27.4s +mla v1.4S, v29.4S, v31.s[0] +ldr q29, [x0, #928] +sqrdmulh v27.4S, v14.4S, v3.s[2] +sub v16.4s, v24.4s, v6.4s +mla v25.4S, v9.4S, v31.s[0] +ldr q9, [x0, #800] +sqrdmulh v28.4S, v21.4S, v17.s[1] +add v24.4s, v24.4s, v6.4s +str q16, [x0, #528] +mla v12.4S, v30.4S, v31.s[0] +ldr q30, [x17, #+512] +ldr q16, [x17, #+528] +sqrdmulh v6.4S, v18.4S, v17.s[2] +sub v5.4s, v13.4s, v1.4s +str q24, [x0, #512] +mul v4.4S, v4.4S,v11.s[1] +add v13.4s, v13.4s, v1.4s +mul v14.4S, v14.4S,v11.s[2] +str q5, [x0, #560] +mla v4.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v25.4s +mla v14.4S, v27.4S, v31.s[0] +str q13, [x0, #544] +mul v21.4S, v21.4S,v26.s[1] +str q2, [x0, #592] +mul v18.4S, v18.4S,v26.s[2] +add v20.4s, v20.4s, v25.4s +str q20, [x0, #576] +mla v21.4S, v28.4S, v31.s[0] +sub v28.4s, v22.4s, v12.4s +str q28, [x0, #624] +mla v18.4S, v6.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +str q22, [x0, #608] +sqrdmulh v17.4S, v9.4S, v16.s[0] +sub v26.4s, v8.4s, 
v4.4s +mul v9.4S, v9.4S,v30.s[0] +str q26, [x0, #656] +ldr q26, [x0, #816] +sqrdmulh v22.4S, v26.4S, v16.s[0] +add v8.4s, v8.4s, v4.4s +mul v26.4S, v26.4S,v30.s[0] +str q8, [x0, #640] +ldr q8, [x17, #+544] +ldr q4, [x17, #+560] +ldr q12, [x0, #864] +sqrdmulh v6.4S, v12.4S, v4.s[0] +sub v28.4s, v7.4s, v14.4s +mul v12.4S, v12.4S,v8.s[0] +str q28, [x0, #688] +ldr q28, [x0, #880] +sqrdmulh v20.4S, v28.4S, v4.s[0] +add v7.4s, v7.4s, v14.4s +mul v28.4S, v28.4S,v8.s[0] +str q7, [x0, #672] +ldr q7, [x17, #+576] +ldr q14, [x17, #+592] +mla v9.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v21.4s +sqrdmulh v25.4S, v29.4S, v14.s[0] +str q17, [x0, #720] +ldr q17, [x0, #944] +mla v26.4S, v22.4S, v31.s[0] +add v0.4s, v0.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v14.s[0] +str q0, [x0, #704] +ldr q0, [x17, #+608] +ldr q22, [x17, #+624] +mla v12.4S, v6.4S, v31.s[0] +sub v6.4s, v19.4s, v18.4s +sqrdmulh v2.4S, v23.4S, v22.s[0] +str q6, [x0, #752] +ldr q6, [x0, #1008] +mla v28.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v18.4s +sqrdmulh v18.4S, v6.4S, v22.s[0] +str q19, [x0, #736] +ldr q19, [x0, #768] +ldr q20, [x0, #896] +mul v29.4S, v29.4S,v7.s[0] +sub v3.4s, v19.4s, v9.4s +ldr q11, [x0, #784] +mul v17.4S, v17.4S,v7.s[0] +add v19.4s, v19.4s, v9.4s +ldr q9, [x0, #912] +mla v29.4S, v25.4S, v31.s[0] +sub v25.4s, v11.4s, v26.4s +ldr q13, [x0, #832] +mla v17.4S, v21.4S, v31.s[0] +add v11.4s, v11.4s, v26.4s +ldr q26, [x0, #960] +mul v23.4S, v23.4S,v0.s[0] +sub v21.4s, v13.4s, v12.4s +ldr q27, [x0, #848] +mul v6.4S, v6.4S,v0.s[0] +add v13.4s, v13.4s, v12.4s +ldr q12, [x0, #976] +mla v23.4S, v2.4S, v31.s[0] +mla v6.4S, v18.4S, v31.s[0] +sub v18.4s, v27.4s, v28.4s +sqrdmulh v2.4S, v11.4S, v16.s[1] +add v27.4s, v27.4s, v28.4s +mul v11.4S, v11.4S,v30.s[1] +sqrdmulh v28.4S, v25.4S, v16.s[2] +sub v5.4s, v20.4s, v29.4s +mul v25.4S, v25.4S,v30.s[2] +add v20.4s, v20.4s, v29.4s +sqrdmulh v16.4S, v27.4S, v4.s[1] +sub v30.4s, v9.4s, v17.4s +mul v27.4S, v27.4S,v8.s[1] +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, 
v18.4S, v4.s[2] +sub v29.4s, v26.4s, v23.4s +mul v18.4S, v18.4S,v8.s[2] +add v26.4s, v26.4s, v23.4s +mla v11.4S, v2.4S, v31.s[0] +sub v2.4s, v12.4s, v6.4s +sqrdmulh v4.4S, v9.4S, v14.s[1] +add v12.4s, v12.4s, v6.4s +mla v25.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v30.4S, v14.s[2] +sub v6.4s, v19.4s, v11.4s +mla v27.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v12.4S, v22.s[1] +add v19.4s, v19.4s, v11.4s +str q6, [x0, #784] +mla v18.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v2.4S, v22.s[2] +sub v6.4s, v3.4s, v25.4s +str q19, [x0, #768] +mul v9.4S, v9.4S,v7.s[1] +add v3.4s, v3.4s, v25.4s +mul v30.4S, v30.4S,v7.s[2] +str q6, [x0, #816] +mla v9.4S, v4.4S, v31.s[0] +sub v4.4s, v13.4s, v27.4s +mla v30.4S, v28.4S, v31.s[0] +str q3, [x0, #800] +mul v12.4S, v12.4S,v0.s[1] +str q4, [x0, #848] +mul v2.4S, v2.4S,v0.s[2] +add v13.4s, v13.4s, v27.4s +str q13, [x0, #832] +mla v12.4S, v16.4S, v31.s[0] +sub v16.4s, v21.4s, v18.4s +str q16, [x0, #880] +mla v2.4S, v17.4S, v31.s[0] +add v21.4s, v21.4s, v18.4s +str q21, [x0, #864] +sub v22.4s, v20.4s, v9.4s +str q22, [x0, #912] +add v20.4s, v20.4s, v9.4s +str q20, [x0, #896] +sub v20.4s, v5.4s, v30.4s +str q20, [x0, #944] +add v5.4s, v5.4s, v30.4s +str q5, [x0, #928] +sub v5.4s, v26.4s, v12.4s +str q5, [x0, #976] +add v26.4s, v26.4s, v12.4s +str q26, [x0, #960] +sub v26.4s, v29.4s, v2.4s +str q26, [x0, #1008] +add v29.4s, v29.4s, v2.4s +str q29, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1548 +// Instruction count: 1544 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_16_z4_7.s 
b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_16_z4_7.s new file mode 100644 index 0000000..d7db1d0 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_16_z4_7.s @@ -0,0 +1,1578 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global 
ntt_u32_incomplete_neon_asm_var_4_2_16_z4_7 +.global _ntt_u32_incomplete_neon_asm_var_4_2_16_z4_7 +ntt_u32_incomplete_neon_asm_var_4_2_16_z4_7: +_ntt_u32_incomplete_neon_asm_var_4_2_16_z4_7: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x0, #992] +ldr q29, [x17, #+0] +ldr q28, [x17, #+16] +sqrdmulh v27.4S, v30.4S, v28.s[0] +mul v30.4S, v30.4S,v29.s[0] +ldr q26, [x0, #928] +sqrdmulh v25.4S, v26.4S, v28.s[0] +mul v26.4S, v26.4S,v29.s[0] +ldr q24, [x0, #864] +sqrdmulh v23.4S, v24.4S, v28.s[0] +mul v24.4S, v24.4S,v29.s[0] +ldr q22, [x0, #800] +sqrdmulh v21.4S, v22.4S, v28.s[0] +mul v22.4S, v22.4S,v29.s[0] +ldr q20, [x0, #736] +mla v30.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v20.4S, v28.s[0] +ldr q19, [x0, #672] +mla v26.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v19.4S, v28.s[0] +nop +ldr q18, [x0, #608] +mla v24.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v18.4S, v28.s[0] +nop +ldr q17, [x0, #544] +mla v22.4S, v21.4S, v31.s[0] +nop +sqrdmulh v21.4S, v17.4S, v28.s[0] +ldr q16, [x0, #480] +ldr q3, [x0, #416] +mul v20.4S, v20.4S,v29.s[0] +sub v2.4s, v16.4s, v30.4s +mul v19.4S, v19.4S,v29.s[0] +add v16.4s, v16.4s, v30.4s +ldr q30, [x0, #352] +ldr q1, [x0, #288] +mla v20.4S, v27.4S, v31.s[0] +sub v27.4s, v3.4s, v26.4s +mla v19.4S, v25.4S, v31.s[0] +add v3.4s, v3.4s, v26.4s +ldr q26, [x0, #224] +ldr q25, [x0, #160] +mul v18.4S, v18.4S,v29.s[0] +sub v0.4s, v30.4s, v24.4s +mul v17.4S, v17.4S,v29.s[0] +add v30.4s, v30.4s, v24.4s +ldr q24, [x0, #96] +ldr q15, [x0, #32] +mla v18.4S, v23.4S, v31.s[0] +sub v23.4s, v1.4s, v22.4s +mla v17.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v22.4s 
+sqrdmulh v22.4S, v2.4S, v28.s[2] +nop +mul v2.4S, v2.4S,v29.s[2] +nop +sqrdmulh v21.4S, v27.4S, v28.s[2] +sub v14.4s, v26.4s, v20.4s +mul v27.4S, v27.4S,v29.s[2] +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v28.s[1] +sub v13.4s, v25.4s, v19.4s +mul v16.4S, v16.4S,v29.s[1] +add v25.4s, v25.4s, v19.4s +sqrdmulh v19.4S, v3.4S, v28.s[1] +sub v12.4s, v24.4s, v18.4s +mul v3.4S, v3.4S,v29.s[1] +add v24.4s, v24.4s, v18.4s +mla v2.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v17.4s +sqrdmulh v18.4S, v0.4S, v28.s[2] +add v15.4s, v15.4s, v17.4s +mla v27.4S, v21.4S, v31.s[0] +nop +sqrdmulh v21.4S, v23.4S, v28.s[2] +nop +mla v16.4S, v20.4S, v31.s[0] +nop +sqrdmulh v20.4S, v30.4S, v28.s[1] +nop +mla v3.4S, v19.4S, v31.s[0] +nop +sqrdmulh v19.4S, v1.4S, v28.s[1] +nop +ldr q17, [x17, #+32] +ldr q11, [x17, #+48] +mul v0.4S, v0.4S,v29.s[2] +sub v10.4s, v14.4s, v2.4s +mul v23.4S, v23.4S,v29.s[2] +add v14.4s, v14.4s, v2.4s +mla v0.4S, v18.4S, v31.s[0] +sub v18.4s, v13.4s, v27.4s +mla v23.4S, v21.4S, v31.s[0] +add v13.4s, v13.4s, v27.4s +mul v30.4S, v30.4S,v29.s[1] +sub v27.4s, v26.4s, v16.4s +mul v1.4S, v1.4S,v29.s[1] +add v26.4s, v26.4s, v16.4s +mla v30.4S, v20.4S, v31.s[0] +sub v20.4s, v25.4s, v3.4s +mla v1.4S, v19.4S, v31.s[0] +add v25.4s, v25.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v11.s[3] +nop +mul v10.4S, v10.4S,v17.s[3] +nop +sqrdmulh v19.4S, v14.4S, v11.s[2] +sub v16.4s, v12.4s, v0.4s +mul v14.4S, v14.4S,v17.s[2] +add v12.4s, v12.4s, v0.4s +sqrdmulh v0.4S, v27.4S, v11.s[1] +sub v21.4s, v22.4s, v23.4s +mul v27.4S, v27.4S,v17.s[1] +add v22.4s, v22.4s, v23.4s +sqrdmulh v23.4S, v26.4S, v11.s[0] +sub v2.4s, v24.4s, v30.4s +mul v26.4S, v26.4S,v17.s[0] +add v24.4s, v24.4s, v30.4s +ldr q30, [x17, #+96] +ldr q9, [x17, #+112] +mla v10.4S, v3.4S, v31.s[0] +sub v3.4s, v15.4s, v1.4s +sqrdmulh v8.4S, v18.4S, v11.s[3] +add v15.4s, v15.4s, v1.4s +mla v14.4S, v19.4S, v31.s[0] +nop +sqrdmulh v19.4S, v13.4S, v11.s[2] +nop +mla v27.4S, v0.4S, v31.s[0] +nop +sqrdmulh v0.4S, v20.4S, v11.s[1] 
+nop +mla v26.4S, v23.4S, v31.s[0] +nop +sqrdmulh v23.4S, v25.4S, v11.s[0] +nop +ldr q1, [x17, #+64] +ldr q7, [x17, #+80] +mul v18.4S, v18.4S,v17.s[3] +sub v6.4s, v16.4s, v10.4s +mul v13.4S, v13.4S,v17.s[2] +add v16.4s, v16.4s, v10.4s +mla v18.4S, v8.4S, v31.s[0] +sub v8.4s, v12.4s, v14.4s +mla v13.4S, v19.4S, v31.s[0] +add v12.4s, v12.4s, v14.4s +mul v20.4S, v20.4S,v17.s[1] +sub v14.4s, v2.4s, v27.4s +mul v25.4S, v25.4S,v17.s[0] +add v2.4s, v2.4s, v27.4s +mla v20.4S, v0.4S, v31.s[0] +sub v0.4s, v24.4s, v26.4s +mla v25.4S, v23.4S, v31.s[0] +add v24.4s, v24.4s, v26.4s +sqrdmulh v26.4S, v6.4S, v9.s[3] +nop +mul v6.4S, v6.4S,v30.s[3] +nop +sqrdmulh v23.4S, v16.4S, v9.s[2] +sub v27.4s, v21.4s, v18.4s +mul v16.4S, v16.4S,v30.s[2] +add v21.4s, v21.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v9.s[1] +sub v19.4s, v22.4s, v13.4s +mul v8.4S, v8.4S,v30.s[1] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v12.4S, v9.s[0] +sub v10.4s, v3.4s, v20.4s +mul v12.4S, v12.4S,v30.s[0] +add v3.4s, v3.4s, v20.4s +mla v6.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v25.4s +sqrdmulh v20.4S, v14.4S, v7.s[3] +add v15.4s, v15.4s, v25.4s +mla v16.4S, v23.4S, v31.s[0] +sub v23.4s, v27.4s, v6.4s +sqrdmulh v25.4S, v2.4S, v7.s[2] +add v27.4s, v27.4s, v6.4s +mla v8.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v16.4s +sqrdmulh v6.4S, v0.4S, v7.s[1] +add v21.4s, v21.4s, v16.4s +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v19.4s, v8.4s +sqrdmulh v16.4S, v24.4S, v7.s[0] +add v19.4s, v19.4s, v8.4s +mul v14.4S, v14.4S,v1.s[3] +sub v8.4s, v22.4s, v12.4s +mul v2.4S, v2.4S,v1.s[2] +add v22.4s, v22.4s, v12.4s +mla v14.4S, v20.4S, v31.s[0] +str q23, [x0, #992] +mla v2.4S, v25.4S, v31.s[0] +str q27, [x0, #928] +mul v0.4S, v0.4S,v1.s[1] +str q18, [x0, #864] +mul v24.4S, v24.4S,v1.s[0] +str q21, [x0, #800] +mla v0.4S, v6.4S, v31.s[0] +str q13, [x0, #736] +mla v24.4S, v16.4S, v31.s[0] +str q19, [x0, #672] +ldr q19, [x0, #1008] +sqrdmulh v16.4S, v19.4S, v28.s[0] +str q8, [x0, #608] +mul v19.4S, v19.4S,v29.s[0] +str q22, [x0, #544] 
+ldr q22, [x0, #944] +sqrdmulh v8.4S, v22.4S, v28.s[0] +sub v13.4s, v10.4s, v14.4s +str q13, [x0, #480] +mul v22.4S, v22.4S,v29.s[0] +add v10.4s, v10.4s, v14.4s +ldr q14, [x0, #880] +sqrdmulh v13.4S, v14.4S, v28.s[0] +sub v6.4s, v3.4s, v2.4s +str q10, [x0, #416] +mul v14.4S, v14.4S,v29.s[0] +add v3.4s, v3.4s, v2.4s +ldr q2, [x0, #816] +sqrdmulh v10.4S, v2.4S, v28.s[0] +sub v21.4s, v26.4s, v0.4s +str q6, [x0, #352] +mul v2.4S, v2.4S,v29.s[0] +add v26.4s, v26.4s, v0.4s +ldr q0, [x0, #752] +mla v19.4S, v16.4S, v31.s[0] +sub v16.4s, v15.4s, v24.4s +str q3, [x0, #288] +sqrdmulh v3.4S, v0.4S, v28.s[0] +add v15.4s, v15.4s, v24.4s +ldr q24, [x0, #688] +mla v22.4S, v8.4S, v31.s[0] +str q21, [x0, #224] +sqrdmulh v21.4S, v24.4S, v28.s[0] +nop +ldr q8, [x0, #624] +mla v14.4S, v13.4S, v31.s[0] +str q26, [x0, #160] +sqrdmulh v26.4S, v8.4S, v28.s[0] +nop +ldr q13, [x0, #560] +mla v2.4S, v10.4S, v31.s[0] +nop +sqrdmulh v10.4S, v13.4S, v28.s[0] +str q16, [x0, #96] +ldr q16, [x0, #496] +ldr q6, [x0, #432] +mul v0.4S, v0.4S,v29.s[0] +sub v18.4s, v16.4s, v19.4s +str q15, [x0, #32] +mul v24.4S, v24.4S,v29.s[0] +add v16.4s, v16.4s, v19.4s +ldr q19, [x0, #368] +ldr q15, [x0, #304] +mla v0.4S, v3.4S, v31.s[0] +sub v3.4s, v6.4s, v22.4s +mla v24.4S, v21.4S, v31.s[0] +add v6.4s, v6.4s, v22.4s +ldr q22, [x0, #240] +ldr q21, [x0, #176] +mul v8.4S, v8.4S,v29.s[0] +sub v27.4s, v19.4s, v14.4s +mul v13.4S, v13.4S,v29.s[0] +add v19.4s, v19.4s, v14.4s +ldr q14, [x0, #112] +ldr q25, [x0, #48] +mla v8.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v2.4s +mla v13.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v2.4s +sqrdmulh v2.4S, v18.4S, v28.s[2] +nop +mul v18.4S, v18.4S,v29.s[2] +nop +sqrdmulh v10.4S, v3.4S, v28.s[2] +sub v23.4s, v22.4s, v0.4s +mul v3.4S, v3.4S,v29.s[2] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v28.s[1] +sub v20.4s, v21.4s, v24.4s +mul v16.4S, v16.4S,v29.s[1] +add v21.4s, v21.4s, v24.4s +sqrdmulh v24.4S, v6.4S, v28.s[1] +sub v12.4s, v14.4s, v8.4s +mul v6.4S, v6.4S,v29.s[1] +add 
v14.4s, v14.4s, v8.4s +mla v18.4S, v2.4S, v31.s[0] +sub v2.4s, v25.4s, v13.4s +sqrdmulh v8.4S, v27.4S, v28.s[2] +add v25.4s, v25.4s, v13.4s +mla v3.4S, v10.4S, v31.s[0] +nop +sqrdmulh v10.4S, v26.4S, v28.s[2] +nop +mla v16.4S, v0.4S, v31.s[0] +nop +sqrdmulh v0.4S, v19.4S, v28.s[1] +nop +mla v6.4S, v24.4S, v31.s[0] +nop +sqrdmulh v24.4S, v15.4S, v28.s[1] +nop +mul v27.4S, v27.4S,v29.s[2] +sub v13.4s, v23.4s, v18.4s +mul v26.4S, v26.4S,v29.s[2] +add v23.4s, v23.4s, v18.4s +mla v27.4S, v8.4S, v31.s[0] +sub v8.4s, v20.4s, v3.4s +mla v26.4S, v10.4S, v31.s[0] +add v20.4s, v20.4s, v3.4s +mul v19.4S, v19.4S,v29.s[1] +sub v3.4s, v22.4s, v16.4s +mul v15.4S, v15.4S,v29.s[1] +add v22.4s, v22.4s, v16.4s +mla v19.4S, v0.4S, v31.s[0] +sub v0.4s, v21.4s, v6.4s +mla v15.4S, v24.4S, v31.s[0] +add v21.4s, v21.4s, v6.4s +sqrdmulh v6.4S, v13.4S, v11.s[3] +nop +mul v13.4S, v13.4S,v17.s[3] +nop +sqrdmulh v24.4S, v23.4S, v11.s[2] +sub v16.4s, v12.4s, v27.4s +mul v23.4S, v23.4S,v17.s[2] +add v12.4s, v12.4s, v27.4s +sqrdmulh v27.4S, v3.4S, v11.s[1] +sub v10.4s, v2.4s, v26.4s +mul v3.4S, v3.4S,v17.s[1] +add v2.4s, v2.4s, v26.4s +sqrdmulh v26.4S, v22.4S, v11.s[0] +sub v18.4s, v14.4s, v19.4s +mul v22.4S, v22.4S,v17.s[0] +add v14.4s, v14.4s, v19.4s +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v25.4s, v15.4s +sqrdmulh v19.4S, v8.4S, v11.s[3] +add v25.4s, v25.4s, v15.4s +mla v23.4S, v24.4S, v31.s[0] +nop +sqrdmulh v24.4S, v20.4S, v11.s[2] +nop +mla v3.4S, v27.4S, v31.s[0] +nop +sqrdmulh v27.4S, v0.4S, v11.s[1] +nop +mla v22.4S, v26.4S, v31.s[0] +nop +sqrdmulh v26.4S, v21.4S, v11.s[0] +nop +mul v8.4S, v8.4S,v17.s[3] +sub v15.4s, v16.4s, v13.4s +mul v20.4S, v20.4S,v17.s[2] +add v16.4s, v16.4s, v13.4s +mla v8.4S, v19.4S, v31.s[0] +sub v19.4s, v12.4s, v23.4s +mla v20.4S, v24.4S, v31.s[0] +add v12.4s, v12.4s, v23.4s +mul v0.4S, v0.4S,v17.s[1] +sub v23.4s, v18.4s, v3.4s +mul v21.4S, v21.4S,v17.s[0] +add v18.4s, v18.4s, v3.4s +mla v0.4S, v27.4S, v31.s[0] +sub v27.4s, v14.4s, v22.4s +mla v21.4S, v26.4S, 
v31.s[0] +add v14.4s, v14.4s, v22.4s +sqrdmulh v22.4S, v15.4S, v9.s[3] +nop +mul v15.4S, v15.4S,v30.s[3] +nop +sqrdmulh v26.4S, v16.4S, v9.s[2] +sub v3.4s, v10.4s, v8.4s +mul v16.4S, v16.4S,v30.s[2] +add v10.4s, v10.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v9.s[1] +sub v24.4s, v2.4s, v20.4s +mul v19.4S, v19.4S,v30.s[1] +add v2.4s, v2.4s, v20.4s +sqrdmulh v20.4S, v12.4S, v9.s[0] +sub v13.4s, v6.4s, v0.4s +mul v12.4S, v12.4S,v30.s[0] +add v6.4s, v6.4s, v0.4s +mla v15.4S, v22.4S, v31.s[0] +sub v22.4s, v25.4s, v21.4s +sqrdmulh v0.4S, v23.4S, v7.s[3] +add v25.4s, v25.4s, v21.4s +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v3.4s, v15.4s +sqrdmulh v21.4S, v18.4S, v7.s[2] +add v3.4s, v3.4s, v15.4s +mla v19.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v16.4s +sqrdmulh v15.4S, v27.4S, v7.s[1] +add v10.4s, v10.4s, v16.4s +mla v12.4S, v20.4S, v31.s[0] +sub v20.4s, v24.4s, v19.4s +sqrdmulh v16.4S, v14.4S, v7.s[0] +add v24.4s, v24.4s, v19.4s +mul v23.4S, v23.4S,v1.s[3] +sub v19.4s, v2.4s, v12.4s +mul v18.4S, v18.4S,v1.s[2] +add v2.4s, v2.4s, v12.4s +mla v23.4S, v0.4S, v31.s[0] +str q26, [x0, #1008] +mla v18.4S, v21.4S, v31.s[0] +str q3, [x0, #944] +mul v27.4S, v27.4S,v1.s[1] +str q8, [x0, #880] +mul v14.4S, v14.4S,v1.s[0] +str q10, [x0, #816] +mla v27.4S, v15.4S, v31.s[0] +str q20, [x0, #752] +mla v14.4S, v16.4S, v31.s[0] +str q24, [x0, #688] +ldr q24, [x0, #960] +sqrdmulh v16.4S, v24.4S, v28.s[0] +str q19, [x0, #624] +mul v24.4S, v24.4S,v29.s[0] +str q2, [x0, #560] +ldr q2, [x0, #896] +sqrdmulh v19.4S, v2.4S, v28.s[0] +sub v20.4s, v13.4s, v23.4s +str q20, [x0, #496] +mul v2.4S, v2.4S,v29.s[0] +add v13.4s, v13.4s, v23.4s +ldr q23, [x0, #832] +sqrdmulh v20.4S, v23.4S, v28.s[0] +sub v15.4s, v6.4s, v18.4s +str q13, [x0, #432] +mul v23.4S, v23.4S,v29.s[0] +add v6.4s, v6.4s, v18.4s +ldr q18, [x0, #768] +sqrdmulh v13.4S, v18.4S, v28.s[0] +sub v10.4s, v22.4s, v27.4s +str q15, [x0, #368] +mul v18.4S, v18.4S,v29.s[0] +add v22.4s, v22.4s, v27.4s +ldr q27, [x0, #704] +mla v24.4S, v16.4S, v31.s[0] +sub 
v16.4s, v25.4s, v14.4s +str q6, [x0, #304] +sqrdmulh v6.4S, v27.4S, v28.s[0] +add v25.4s, v25.4s, v14.4s +ldr q14, [x0, #640] +mla v2.4S, v19.4S, v31.s[0] +str q10, [x0, #240] +sqrdmulh v10.4S, v14.4S, v28.s[0] +nop +ldr q19, [x0, #576] +mla v23.4S, v20.4S, v31.s[0] +str q22, [x0, #176] +sqrdmulh v22.4S, v19.4S, v28.s[0] +nop +ldr q20, [x0, #512] +mla v18.4S, v13.4S, v31.s[0] +nop +sqrdmulh v13.4S, v20.4S, v28.s[0] +str q16, [x0, #112] +ldr q16, [x0, #448] +ldr q15, [x0, #384] +mul v27.4S, v27.4S,v29.s[0] +sub v8.4s, v16.4s, v24.4s +str q25, [x0, #48] +mul v14.4S, v14.4S,v29.s[0] +add v16.4s, v16.4s, v24.4s +ldr q24, [x0, #320] +ldr q25, [x0, #256] +mla v27.4S, v6.4S, v31.s[0] +sub v6.4s, v15.4s, v2.4s +mla v14.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v2.4s +ldr q2, [x0, #192] +ldr q10, [x0, #128] +mul v19.4S, v19.4S,v29.s[0] +sub v3.4s, v24.4s, v23.4s +mul v20.4S, v20.4S,v29.s[0] +add v24.4s, v24.4s, v23.4s +ldr q23, [x0, #64] +ldr q21, [x0, #0] +mla v19.4S, v22.4S, v31.s[0] +sub v22.4s, v25.4s, v18.4s +mla v20.4S, v13.4S, v31.s[0] +add v25.4s, v25.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v28.s[2] +nop +mul v8.4S, v8.4S,v29.s[2] +nop +sqrdmulh v13.4S, v6.4S, v28.s[2] +sub v26.4s, v2.4s, v27.4s +mul v6.4S, v6.4S,v29.s[2] +add v2.4s, v2.4s, v27.4s +sqrdmulh v27.4S, v16.4S, v28.s[1] +sub v0.4s, v10.4s, v14.4s +mul v16.4S, v16.4S,v29.s[1] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v28.s[1] +sub v12.4s, v23.4s, v19.4s +mul v15.4S, v15.4S,v29.s[1] +add v23.4s, v23.4s, v19.4s +mla v8.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v20.4s +sqrdmulh v19.4S, v3.4S, v28.s[2] +add v21.4s, v21.4s, v20.4s +mla v6.4S, v13.4S, v31.s[0] +nop +sqrdmulh v13.4S, v22.4S, v28.s[2] +nop +mla v16.4S, v27.4S, v31.s[0] +nop +sqrdmulh v27.4S, v24.4S, v28.s[1] +nop +mla v15.4S, v14.4S, v31.s[0] +nop +sqrdmulh v14.4S, v25.4S, v28.s[1] +nop +mul v3.4S, v3.4S,v29.s[2] +sub v20.4s, v26.4s, v8.4s +mul v22.4S, v22.4S,v29.s[2] +add v26.4s, v26.4s, v8.4s +mla v3.4S, v19.4S, v31.s[0] +sub v19.4s, 
v0.4s, v6.4s +mla v22.4S, v13.4S, v31.s[0] +add v0.4s, v0.4s, v6.4s +mul v24.4S, v24.4S,v29.s[1] +sub v6.4s, v2.4s, v16.4s +mul v25.4S, v25.4S,v29.s[1] +add v2.4s, v2.4s, v16.4s +mla v24.4S, v27.4S, v31.s[0] +sub v27.4s, v10.4s, v15.4s +mla v25.4S, v14.4S, v31.s[0] +add v10.4s, v10.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v11.s[3] +nop +mul v20.4S, v20.4S,v17.s[3] +nop +sqrdmulh v14.4S, v26.4S, v11.s[2] +sub v16.4s, v12.4s, v3.4s +mul v26.4S, v26.4S,v17.s[2] +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v6.4S, v11.s[1] +sub v13.4s, v18.4s, v22.4s +mul v6.4S, v6.4S,v17.s[1] +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v2.4S, v11.s[0] +sub v8.4s, v23.4s, v24.4s +mul v2.4S, v2.4S,v17.s[0] +add v23.4s, v23.4s, v24.4s +mla v20.4S, v15.4S, v31.s[0] +sub v15.4s, v21.4s, v25.4s +sqrdmulh v24.4S, v19.4S, v11.s[3] +add v21.4s, v21.4s, v25.4s +mla v26.4S, v14.4S, v31.s[0] +nop +sqrdmulh v14.4S, v0.4S, v11.s[2] +nop +mla v6.4S, v3.4S, v31.s[0] +nop +sqrdmulh v3.4S, v27.4S, v11.s[1] +nop +mla v2.4S, v22.4S, v31.s[0] +nop +sqrdmulh v22.4S, v10.4S, v11.s[0] +nop +mul v19.4S, v19.4S,v17.s[3] +sub v25.4s, v16.4s, v20.4s +mul v0.4S, v0.4S,v17.s[2] +add v16.4s, v16.4s, v20.4s +mla v19.4S, v24.4S, v31.s[0] +sub v24.4s, v12.4s, v26.4s +mla v0.4S, v14.4S, v31.s[0] +add v12.4s, v12.4s, v26.4s +mul v27.4S, v27.4S,v17.s[1] +sub v26.4s, v8.4s, v6.4s +mul v10.4S, v10.4S,v17.s[0] +add v8.4s, v8.4s, v6.4s +mla v27.4S, v3.4S, v31.s[0] +sub v3.4s, v23.4s, v2.4s +mla v10.4S, v22.4S, v31.s[0] +add v23.4s, v23.4s, v2.4s +sqrdmulh v2.4S, v25.4S, v9.s[3] +nop +mul v25.4S, v25.4S,v30.s[3] +nop +sqrdmulh v22.4S, v16.4S, v9.s[2] +sub v6.4s, v13.4s, v19.4s +mul v16.4S, v16.4S,v30.s[2] +add v13.4s, v13.4s, v19.4s +sqrdmulh v19.4S, v24.4S, v9.s[1] +sub v14.4s, v18.4s, v0.4s +mul v24.4S, v24.4S,v30.s[1] +add v18.4s, v18.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v9.s[0] +sub v20.4s, v15.4s, v27.4s +mul v12.4S, v12.4S,v30.s[0] +add v15.4s, v15.4s, v27.4s +mla v25.4S, v2.4S, v31.s[0] +sub v2.4s, v21.4s, v10.4s +sqrdmulh 
v27.4S, v26.4S, v7.s[3] +add v21.4s, v21.4s, v10.4s +mla v16.4S, v22.4S, v31.s[0] +sub v22.4s, v6.4s, v25.4s +sqrdmulh v10.4S, v8.4S, v7.s[2] +add v6.4s, v6.4s, v25.4s +mla v24.4S, v19.4S, v31.s[0] +sub v19.4s, v13.4s, v16.4s +sqrdmulh v25.4S, v3.4S, v7.s[1] +add v13.4s, v13.4s, v16.4s +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v14.4s, v24.4s +sqrdmulh v16.4S, v23.4S, v7.s[0] +add v14.4s, v14.4s, v24.4s +mul v26.4S, v26.4S,v1.s[3] +sub v24.4s, v18.4s, v12.4s +mul v8.4S, v8.4S,v1.s[2] +add v18.4s, v18.4s, v12.4s +mla v26.4S, v27.4S, v31.s[0] +str q22, [x0, #960] +mla v8.4S, v10.4S, v31.s[0] +str q6, [x0, #896] +mul v3.4S, v3.4S,v1.s[1] +str q19, [x0, #832] +mul v23.4S, v23.4S,v1.s[0] +str q13, [x0, #768] +mla v3.4S, v25.4S, v31.s[0] +str q0, [x0, #704] +mla v23.4S, v16.4S, v31.s[0] +str q14, [x0, #640] +ldr q14, [x0, #976] +sqrdmulh v16.4S, v14.4S, v28.s[0] +str q24, [x0, #576] +mul v14.4S, v14.4S,v29.s[0] +str q18, [x0, #512] +ldr q18, [x0, #912] +sqrdmulh v24.4S, v18.4S, v28.s[0] +sub v0.4s, v20.4s, v26.4s +str q0, [x0, #448] +mul v18.4S, v18.4S,v29.s[0] +add v20.4s, v20.4s, v26.4s +ldr q26, [x0, #848] +sqrdmulh v0.4S, v26.4S, v28.s[0] +sub v25.4s, v15.4s, v8.4s +str q20, [x0, #384] +mul v26.4S, v26.4S,v29.s[0] +add v15.4s, v15.4s, v8.4s +ldr q8, [x0, #784] +sqrdmulh v20.4S, v8.4S, v28.s[0] +sub v13.4s, v2.4s, v3.4s +str q25, [x0, #320] +mul v8.4S, v8.4S,v29.s[0] +add v2.4s, v2.4s, v3.4s +ldr q3, [x0, #720] +mla v14.4S, v16.4S, v31.s[0] +sub v16.4s, v21.4s, v23.4s +str q15, [x0, #256] +sqrdmulh v15.4S, v3.4S, v28.s[0] +add v21.4s, v21.4s, v23.4s +ldr q23, [x0, #656] +mla v18.4S, v24.4S, v31.s[0] +str q13, [x0, #192] +sqrdmulh v13.4S, v23.4S, v28.s[0] +nop +ldr q24, [x0, #592] +mla v26.4S, v0.4S, v31.s[0] +str q2, [x0, #128] +sqrdmulh v2.4S, v24.4S, v28.s[0] +nop +ldr q0, [x0, #528] +mla v8.4S, v20.4S, v31.s[0] +nop +sqrdmulh v20.4S, v0.4S, v28.s[0] +str q16, [x0, #64] +ldr q16, [x0, #464] +ldr q25, [x0, #400] +mul v3.4S, v3.4S,v29.s[0] +sub v19.4s, v16.4s, v14.4s 
+str q21, [x0, #0] +mul v23.4S, v23.4S,v29.s[0] +add v16.4s, v16.4s, v14.4s +ldr q14, [x0, #336] +ldr q21, [x0, #272] +mla v3.4S, v15.4S, v31.s[0] +sub v15.4s, v25.4s, v18.4s +mla v23.4S, v13.4S, v31.s[0] +add v25.4s, v25.4s, v18.4s +ldr q18, [x0, #208] +ldr q13, [x0, #144] +mul v24.4S, v24.4S,v29.s[0] +sub v6.4s, v14.4s, v26.4s +mul v0.4S, v0.4S,v29.s[0] +add v14.4s, v14.4s, v26.4s +ldr q26, [x0, #80] +ldr q10, [x0, #16] +mla v24.4S, v2.4S, v31.s[0] +sub v2.4s, v21.4s, v8.4s +mla v0.4S, v20.4S, v31.s[0] +add v21.4s, v21.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v28.s[2] +nop +mul v19.4S, v19.4S,v29.s[2] +nop +sqrdmulh v20.4S, v15.4S, v28.s[2] +sub v22.4s, v18.4s, v3.4s +mul v15.4S, v15.4S,v29.s[2] +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v16.4S, v28.s[1] +sub v27.4s, v13.4s, v23.4s +mul v16.4S, v16.4S,v29.s[1] +add v13.4s, v13.4s, v23.4s +sqrdmulh v23.4S, v25.4S, v28.s[1] +sub v12.4s, v26.4s, v24.4s +mul v25.4S, v25.4S,v29.s[1] +add v26.4s, v26.4s, v24.4s +mla v19.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v0.4s +sqrdmulh v24.4S, v6.4S, v28.s[2] +add v10.4s, v10.4s, v0.4s +mla v15.4S, v20.4S, v31.s[0] +nop +sqrdmulh v20.4S, v2.4S, v28.s[2] +nop +mla v16.4S, v3.4S, v31.s[0] +nop +sqrdmulh v3.4S, v14.4S, v28.s[1] +nop +mla v25.4S, v23.4S, v31.s[0] +nop +sqrdmulh v23.4S, v21.4S, v28.s[1] +nop +mul v6.4S, v6.4S,v29.s[2] +sub v0.4s, v22.4s, v19.4s +mul v2.4S, v2.4S,v29.s[2] +add v22.4s, v22.4s, v19.4s +mla v6.4S, v24.4S, v31.s[0] +sub v24.4s, v27.4s, v15.4s +mla v2.4S, v20.4S, v31.s[0] +add v27.4s, v27.4s, v15.4s +mul v14.4S, v14.4S,v29.s[1] +sub v15.4s, v18.4s, v16.4s +mul v21.4S, v21.4S,v29.s[1] +add v18.4s, v18.4s, v16.4s +mla v14.4S, v3.4S, v31.s[0] +sub v3.4s, v13.4s, v25.4s +mla v21.4S, v23.4S, v31.s[0] +add v13.4s, v13.4s, v25.4s +sqrdmulh v28.4S, v0.4S, v11.s[3] +nop +mul v0.4S, v0.4S,v17.s[3] +nop +sqrdmulh v29.4S, v22.4S, v11.s[2] +sub v25.4s, v12.4s, v6.4s +mul v22.4S, v22.4S,v17.s[2] +add v12.4s, v12.4s, v6.4s +sqrdmulh v6.4S, v15.4S, v11.s[1] +sub v23.4s, v8.4s, 
v2.4s +mul v15.4S, v15.4S,v17.s[1] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v18.4S, v11.s[0] +sub v16.4s, v26.4s, v14.4s +mul v18.4S, v18.4S,v17.s[0] +add v26.4s, v26.4s, v14.4s +mla v0.4S, v28.4S, v31.s[0] +sub v28.4s, v10.4s, v21.4s +sqrdmulh v14.4S, v24.4S, v11.s[3] +add v10.4s, v10.4s, v21.4s +mla v22.4S, v29.4S, v31.s[0] +nop +sqrdmulh v29.4S, v27.4S, v11.s[2] +nop +mla v15.4S, v6.4S, v31.s[0] +nop +sqrdmulh v6.4S, v3.4S, v11.s[1] +nop +mla v18.4S, v2.4S, v31.s[0] +nop +sqrdmulh v2.4S, v13.4S, v11.s[0] +nop +mul v24.4S, v24.4S,v17.s[3] +sub v21.4s, v25.4s, v0.4s +mul v27.4S, v27.4S,v17.s[2] +add v25.4s, v25.4s, v0.4s +mla v24.4S, v14.4S, v31.s[0] +sub v14.4s, v12.4s, v22.4s +mla v27.4S, v29.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s +mul v3.4S, v3.4S,v17.s[1] +sub v22.4s, v16.4s, v15.4s +mul v13.4S, v13.4S,v17.s[0] +add v16.4s, v16.4s, v15.4s +mla v3.4S, v6.4S, v31.s[0] +sub v6.4s, v26.4s, v18.4s +mla v13.4S, v2.4S, v31.s[0] +add v26.4s, v26.4s, v18.4s +sqrdmulh v11.4S, v21.4S, v9.s[3] +nop +mul v21.4S, v21.4S,v30.s[3] +nop +sqrdmulh v17.4S, v25.4S, v9.s[2] +sub v18.4s, v23.4s, v24.4s +mul v25.4S, v25.4S,v30.s[2] +add v23.4s, v23.4s, v24.4s +sqrdmulh v24.4S, v14.4S, v9.s[1] +sub v2.4s, v8.4s, v27.4s +mul v14.4S, v14.4S,v30.s[1] +add v8.4s, v8.4s, v27.4s +sqrdmulh v27.4S, v12.4S, v9.s[0] +sub v15.4s, v28.4s, v3.4s +mul v12.4S, v12.4S,v30.s[0] +add v28.4s, v28.4s, v3.4s +mla v21.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v13.4s +sqrdmulh v9.4S, v22.4S, v7.s[3] +add v10.4s, v10.4s, v13.4s +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v18.4s, v21.4s +sqrdmulh v13.4S, v16.4S, v7.s[2] +add v18.4s, v18.4s, v21.4s +mla v14.4S, v24.4S, v31.s[0] +sub v24.4s, v23.4s, v25.4s +sqrdmulh v21.4S, v6.4S, v7.s[1] +add v23.4s, v23.4s, v25.4s +mla v12.4S, v27.4S, v31.s[0] +sub v27.4s, v2.4s, v14.4s +sqrdmulh v25.4S, v26.4S, v7.s[0] +add v2.4s, v2.4s, v14.4s +mul v22.4S, v22.4S,v1.s[3] +sub v14.4s, v8.4s, v12.4s +mul v16.4S, v16.4S,v1.s[2] +add v8.4s, v8.4s, v12.4s +mla v22.4S, v9.4S, 
v31.s[0] +str q17, [x0, #976] +mla v16.4S, v13.4S, v31.s[0] +str q18, [x0, #912] +mul v6.4S, v6.4S,v1.s[1] +str q24, [x0, #848] +mul v26.4S, v26.4S,v1.s[0] +str q23, [x0, #784] +mla v6.4S, v21.4S, v31.s[0] +str q27, [x0, #720] +mla v26.4S, v25.4S, v31.s[0] +str q2, [x0, #656] +str q14, [x0, #592] +str q8, [x0, #528] +sub v8.4s, v15.4s, v22.4s +str q8, [x0, #464] +add v15.4s, v15.4s, v22.4s +sub v22.4s, v28.4s, v16.4s +str q15, [x0, #400] +add v28.4s, v28.4s, v16.4s +sub v16.4s, v11.4s, v6.4s +str q22, [x0, #336] +add v11.4s, v11.4s, v6.4s +sub v6.4s, v10.4s, v26.4s +str q28, [x0, #272] +add v10.4s, v10.4s, v26.4s +str q16, [x0, #208] +str q11, [x0, #144] +str q6, [x0, #80] +str q10, [x0, #16] +ldr q4, [x0, #224] +ldr q5, [x0, #160] +ldr q19, [x0, #32] +ldr q20, [x17, #+128] +ldr q0, [x17, #+144] +sqrdmulh v29.4S, v19.4S, v0.s[0] +mul v19.4S, v19.4S,v20.s[0] +ldr q3, [x0, #48] +sqrdmulh v30.4S, v3.4S, v0.s[0] +mul v3.4S, v3.4S,v20.s[0] +ldr q12, [x17, #+160] +ldr q9, [x17, #+176] +ldr q17, [x0, #96] +sqrdmulh v13.4S, v17.4S, v9.s[0] +mul v17.4S, v17.4S,v12.s[0] +ldr q18, [x0, #112] +sqrdmulh v24.4S, v18.4S, v9.s[0] +mul v18.4S, v18.4S,v12.s[0] +ldr q23, [x17, #+192] +ldr q21, [x17, #+208] +mla v19.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v5.4S, v21.s[0] +ldr q27, [x0, #176] +mla v3.4S, v30.4S, v31.s[0] +sqrdmulh v30.4S, v27.4S, v21.s[0] +ldr q25, [x17, #+224] +ldr q2, [x17, #+240] +mla v17.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v4.4S, v2.s[0] +ldr q1, [x0, #240] +mla v18.4S, v24.4S, v31.s[0] +sqrdmulh v24.4S, v1.4S, v2.s[0] +ldr q7, [x0, #0] +ldr q14, [x0, #128] +mul v5.4S, v5.4S,v23.s[0] +sub v8.4s, v7.4s, v19.4s +ldr q15, [x0, #16] +mul v27.4S, v27.4S,v23.s[0] +add v7.4s, v7.4s, v19.4s +ldr q19, [x0, #144] +mla v5.4S, v29.4S, v31.s[0] +sub v29.4s, v15.4s, v3.4s +ldr q22, [x0, #64] +mla v27.4S, v30.4S, v31.s[0] +add v15.4s, v15.4s, v3.4s +ldr q3, [x0, #192] +mul v4.4S, v4.4S,v25.s[0] +sub v30.4s, v22.4s, v17.4s +ldr q28, [x0, #80] +mul v1.4S, v1.4S,v25.s[0] +add 
v22.4s, v22.4s, v17.4s +ldr q17, [x0, #208] +mla v4.4S, v13.4S, v31.s[0] +mla v1.4S, v24.4S, v31.s[0] +sub v24.4s, v28.4s, v18.4s +sqrdmulh v13.4S, v15.4S, v0.s[1] +add v28.4s, v28.4s, v18.4s +mul v15.4S, v15.4S,v20.s[1] +sqrdmulh v18.4S, v29.4S, v0.s[2] +sub v26.4s, v14.4s, v5.4s +mul v29.4S, v29.4S,v20.s[2] +add v14.4s, v14.4s, v5.4s +sqrdmulh v0.4S, v28.4S, v9.s[1] +sub v20.4s, v19.4s, v27.4s +mul v28.4S, v28.4S,v12.s[1] +add v19.4s, v19.4s, v27.4s +sqrdmulh v27.4S, v24.4S, v9.s[2] +sub v5.4s, v3.4s, v4.4s +mul v24.4S, v24.4S,v12.s[2] +add v3.4s, v3.4s, v4.4s +mla v15.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v1.4s +ldr q9, [x0, #480] +sqrdmulh v12.4S, v19.4S, v21.s[1] +add v17.4s, v17.4s, v1.4s +mla v29.4S, v18.4S, v31.s[0] +ldr q18, [x0, #416] +sqrdmulh v1.4S, v20.4S, v21.s[2] +sub v4.4s, v7.4s, v15.4s +mla v28.4S, v0.4S, v31.s[0] +ldr q0, [x0, #288] +sqrdmulh v16.4S, v17.4S, v2.s[1] +add v7.4s, v7.4s, v15.4s +str q4, [x0, #16] +mla v24.4S, v27.4S, v31.s[0] +ldr q27, [x17, #+256] +ldr q4, [x17, #+272] +sqrdmulh v15.4S, v13.4S, v2.s[2] +sub v11.4s, v8.4s, v29.4s +str q7, [x0, #0] +mul v19.4S, v19.4S,v23.s[1] +add v8.4s, v8.4s, v29.4s +mul v20.4S, v20.4S,v23.s[2] +str q11, [x0, #48] +mla v19.4S, v12.4S, v31.s[0] +sub v12.4s, v22.4s, v28.4s +mla v20.4S, v1.4S, v31.s[0] +str q8, [x0, #32] +mul v17.4S, v17.4S,v25.s[1] +str q12, [x0, #80] +mul v13.4S, v13.4S,v25.s[2] +add v22.4s, v22.4s, v28.4s +str q22, [x0, #64] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v30.4s, v24.4s +str q16, [x0, #112] +mla v13.4S, v15.4S, v31.s[0] +add v30.4s, v30.4s, v24.4s +str q30, [x0, #96] +sqrdmulh v2.4S, v0.4S, v4.s[0] +sub v25.4s, v14.4s, v19.4s +mul v0.4S, v0.4S,v27.s[0] +str q25, [x0, #144] +ldr q25, [x0, #304] +sqrdmulh v30.4S, v25.4S, v4.s[0] +add v14.4s, v14.4s, v19.4s +mul v25.4S, v25.4S,v27.s[0] +str q14, [x0, #128] +ldr q14, [x17, #+288] +ldr q19, [x17, #+304] +ldr q24, [x0, #352] +sqrdmulh v15.4S, v24.4S, v19.s[0] +sub v16.4s, v26.4s, v20.4s +mul v24.4S, v24.4S,v14.s[0] +str 
q16, [x0, #176] +ldr q16, [x0, #368] +sqrdmulh v22.4S, v16.4S, v19.s[0] +add v26.4s, v26.4s, v20.4s +mul v16.4S, v16.4S,v14.s[0] +str q26, [x0, #160] +ldr q26, [x17, #+320] +ldr q20, [x17, #+336] +mla v0.4S, v2.4S, v31.s[0] +sub v2.4s, v3.4s, v17.4s +sqrdmulh v28.4S, v18.4S, v20.s[0] +str q2, [x0, #208] +ldr q2, [x0, #432] +mla v25.4S, v30.4S, v31.s[0] +add v3.4s, v3.4s, v17.4s +sqrdmulh v17.4S, v2.4S, v20.s[0] +str q3, [x0, #192] +ldr q3, [x17, #+352] +ldr q30, [x17, #+368] +mla v24.4S, v15.4S, v31.s[0] +sub v15.4s, v5.4s, v13.4s +sqrdmulh v12.4S, v9.4S, v30.s[0] +str q15, [x0, #240] +ldr q15, [x0, #496] +mla v16.4S, v22.4S, v31.s[0] +add v5.4s, v5.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v30.s[0] +str q5, [x0, #224] +ldr q5, [x0, #256] +ldr q22, [x0, #384] +mul v18.4S, v18.4S,v26.s[0] +sub v21.4s, v5.4s, v0.4s +ldr q23, [x0, #272] +mul v2.4S, v2.4S,v26.s[0] +add v5.4s, v5.4s, v0.4s +ldr q0, [x0, #400] +mla v18.4S, v28.4S, v31.s[0] +sub v28.4s, v23.4s, v25.4s +ldr q8, [x0, #320] +mla v2.4S, v17.4S, v31.s[0] +add v23.4s, v23.4s, v25.4s +ldr q25, [x0, #448] +mul v9.4S, v9.4S,v3.s[0] +sub v17.4s, v8.4s, v24.4s +ldr q1, [x0, #336] +mul v15.4S, v15.4S,v3.s[0] +add v8.4s, v8.4s, v24.4s +ldr q24, [x0, #464] +mla v9.4S, v12.4S, v31.s[0] +mla v15.4S, v13.4S, v31.s[0] +sub v13.4s, v1.4s, v16.4s +sqrdmulh v12.4S, v23.4S, v4.s[1] +add v1.4s, v1.4s, v16.4s +mul v23.4S, v23.4S,v27.s[1] +sqrdmulh v16.4S, v28.4S, v4.s[2] +sub v11.4s, v22.4s, v18.4s +mul v28.4S, v28.4S,v27.s[2] +add v22.4s, v22.4s, v18.4s +sqrdmulh v4.4S, v1.4S, v19.s[1] +sub v27.4s, v0.4s, v2.4s +mul v1.4S, v1.4S,v14.s[1] +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v13.4S, v19.s[2] +sub v18.4s, v25.4s, v9.4s +mul v13.4S, v13.4S,v14.s[2] +add v25.4s, v25.4s, v9.4s +mla v23.4S, v12.4S, v31.s[0] +sub v12.4s, v24.4s, v15.4s +ldr q19, [x0, #736] +sqrdmulh v14.4S, v0.4S, v20.s[1] +add v24.4s, v24.4s, v15.4s +mla v28.4S, v16.4S, v31.s[0] +ldr q16, [x0, #672] +sqrdmulh v15.4S, v27.4S, v20.s[2] +sub v9.4s, v5.4s, v23.4s +mla 
v1.4S, v4.4S, v31.s[0] +ldr q4, [x0, #544] +sqrdmulh v29.4S, v24.4S, v30.s[1] +add v5.4s, v5.4s, v23.4s +str q9, [x0, #272] +mla v13.4S, v2.4S, v31.s[0] +ldr q2, [x17, #+384] +ldr q9, [x17, #+400] +sqrdmulh v23.4S, v12.4S, v30.s[2] +sub v7.4s, v21.4s, v28.4s +str q5, [x0, #256] +mul v0.4S, v0.4S,v26.s[1] +add v21.4s, v21.4s, v28.4s +mul v27.4S, v27.4S,v26.s[2] +str q7, [x0, #304] +mla v0.4S, v14.4S, v31.s[0] +sub v14.4s, v8.4s, v1.4s +mla v27.4S, v15.4S, v31.s[0] +str q21, [x0, #288] +mul v24.4S, v24.4S,v3.s[1] +str q14, [x0, #336] +mul v12.4S, v12.4S,v3.s[2] +add v8.4s, v8.4s, v1.4s +str q8, [x0, #320] +mla v24.4S, v29.4S, v31.s[0] +sub v29.4s, v17.4s, v13.4s +str q29, [x0, #368] +mla v12.4S, v23.4S, v31.s[0] +add v17.4s, v17.4s, v13.4s +str q17, [x0, #352] +sqrdmulh v30.4S, v4.4S, v9.s[0] +sub v3.4s, v22.4s, v0.4s +mul v4.4S, v4.4S,v2.s[0] +str q3, [x0, #400] +ldr q3, [x0, #560] +sqrdmulh v17.4S, v3.4S, v9.s[0] +add v22.4s, v22.4s, v0.4s +mul v3.4S, v3.4S,v2.s[0] +str q22, [x0, #384] +ldr q22, [x17, #+416] +ldr q0, [x17, #+432] +ldr q13, [x0, #608] +sqrdmulh v23.4S, v13.4S, v0.s[0] +sub v29.4s, v11.4s, v27.4s +mul v13.4S, v13.4S,v22.s[0] +str q29, [x0, #432] +ldr q29, [x0, #624] +sqrdmulh v8.4S, v29.4S, v0.s[0] +add v11.4s, v11.4s, v27.4s +mul v29.4S, v29.4S,v22.s[0] +str q11, [x0, #416] +ldr q11, [x17, #+448] +ldr q27, [x17, #+464] +mla v4.4S, v30.4S, v31.s[0] +sub v30.4s, v25.4s, v24.4s +sqrdmulh v1.4S, v16.4S, v27.s[0] +str q30, [x0, #464] +ldr q30, [x0, #688] +mla v3.4S, v17.4S, v31.s[0] +add v25.4s, v25.4s, v24.4s +sqrdmulh v24.4S, v30.4S, v27.s[0] +str q25, [x0, #448] +ldr q25, [x17, #+480] +ldr q17, [x17, #+496] +mla v13.4S, v23.4S, v31.s[0] +sub v23.4s, v18.4s, v12.4s +sqrdmulh v14.4S, v19.4S, v17.s[0] +str q23, [x0, #496] +ldr q23, [x0, #752] +mla v29.4S, v8.4S, v31.s[0] +add v18.4s, v18.4s, v12.4s +sqrdmulh v12.4S, v23.4S, v17.s[0] +str q18, [x0, #480] +ldr q18, [x0, #512] +ldr q8, [x0, #640] +mul v16.4S, v16.4S,v11.s[0] +sub v20.4s, v18.4s, v4.4s +ldr 
q26, [x0, #528] +mul v30.4S, v30.4S,v11.s[0] +add v18.4s, v18.4s, v4.4s +ldr q4, [x0, #656] +mla v16.4S, v1.4S, v31.s[0] +sub v1.4s, v26.4s, v3.4s +ldr q21, [x0, #576] +mla v30.4S, v24.4S, v31.s[0] +add v26.4s, v26.4s, v3.4s +ldr q3, [x0, #704] +mul v19.4S, v19.4S,v25.s[0] +sub v24.4s, v21.4s, v13.4s +ldr q15, [x0, #592] +mul v23.4S, v23.4S,v25.s[0] +add v21.4s, v21.4s, v13.4s +ldr q13, [x0, #720] +mla v19.4S, v14.4S, v31.s[0] +mla v23.4S, v12.4S, v31.s[0] +sub v12.4s, v15.4s, v29.4s +sqrdmulh v14.4S, v26.4S, v9.s[1] +add v15.4s, v15.4s, v29.4s +mul v26.4S, v26.4S,v2.s[1] +sqrdmulh v29.4S, v1.4S, v9.s[2] +sub v7.4s, v8.4s, v16.4s +mul v1.4S, v1.4S,v2.s[2] +add v8.4s, v8.4s, v16.4s +sqrdmulh v9.4S, v15.4S, v0.s[1] +sub v2.4s, v4.4s, v30.4s +mul v15.4S, v15.4S,v22.s[1] +add v4.4s, v4.4s, v30.4s +sqrdmulh v30.4S, v12.4S, v0.s[2] +sub v16.4s, v3.4s, v19.4s +mul v12.4S, v12.4S,v22.s[2] +add v3.4s, v3.4s, v19.4s +mla v26.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v23.4s +ldr q0, [x0, #992] +sqrdmulh v22.4S, v4.4S, v27.s[1] +add v13.4s, v13.4s, v23.4s +mla v1.4S, v29.4S, v31.s[0] +ldr q29, [x0, #928] +sqrdmulh v23.4S, v2.4S, v27.s[2] +sub v19.4s, v18.4s, v26.4s +mla v15.4S, v9.4S, v31.s[0] +ldr q9, [x0, #800] +sqrdmulh v28.4S, v13.4S, v17.s[1] +add v18.4s, v18.4s, v26.4s +str q19, [x0, #528] +mla v12.4S, v30.4S, v31.s[0] +ldr q30, [x17, #+512] +ldr q19, [x17, #+528] +sqrdmulh v26.4S, v14.4S, v17.s[2] +sub v5.4s, v20.4s, v1.4s +str q18, [x0, #512] +mul v4.4S, v4.4S,v11.s[1] +add v20.4s, v20.4s, v1.4s +mul v2.4S, v2.4S,v11.s[2] +str q5, [x0, #560] +mla v4.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v15.4s +mla v2.4S, v23.4S, v31.s[0] +str q20, [x0, #544] +mul v13.4S, v13.4S,v25.s[1] +str q22, [x0, #592] +mul v14.4S, v14.4S,v25.s[2] +add v21.4s, v21.4s, v15.4s +str q21, [x0, #576] +mla v13.4S, v28.4S, v31.s[0] +sub v28.4s, v24.4s, v12.4s +str q28, [x0, #624] +mla v14.4S, v26.4S, v31.s[0] +add v24.4s, v24.4s, v12.4s +str q24, [x0, #608] +sqrdmulh v17.4S, v9.4S, v19.s[0] +sub 
v25.4s, v8.4s, v4.4s +mul v9.4S, v9.4S,v30.s[0] +str q25, [x0, #656] +ldr q25, [x0, #816] +sqrdmulh v24.4S, v25.4S, v19.s[0] +add v8.4s, v8.4s, v4.4s +mul v25.4S, v25.4S,v30.s[0] +str q8, [x0, #640] +ldr q8, [x17, #+544] +ldr q4, [x17, #+560] +ldr q12, [x0, #864] +sqrdmulh v26.4S, v12.4S, v4.s[0] +sub v28.4s, v7.4s, v2.4s +mul v12.4S, v12.4S,v8.s[0] +str q28, [x0, #688] +ldr q28, [x0, #880] +sqrdmulh v21.4S, v28.4S, v4.s[0] +add v7.4s, v7.4s, v2.4s +mul v28.4S, v28.4S,v8.s[0] +str q7, [x0, #672] +ldr q7, [x17, #+576] +ldr q2, [x17, #+592] +mla v9.4S, v17.4S, v31.s[0] +sub v17.4s, v3.4s, v13.4s +sqrdmulh v15.4S, v29.4S, v2.s[0] +str q17, [x0, #720] +ldr q17, [x0, #944] +mla v25.4S, v24.4S, v31.s[0] +add v3.4s, v3.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v2.s[0] +str q3, [x0, #704] +ldr q3, [x17, #+608] +ldr q24, [x17, #+624] +mla v12.4S, v26.4S, v31.s[0] +sub v26.4s, v16.4s, v14.4s +sqrdmulh v22.4S, v0.4S, v24.s[0] +str q26, [x0, #752] +ldr q26, [x0, #1008] +mla v28.4S, v21.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v26.4S, v24.s[0] +str q16, [x0, #736] +ldr q16, [x0, #768] +ldr q21, [x0, #896] +mul v29.4S, v29.4S,v7.s[0] +sub v27.4s, v16.4s, v9.4s +ldr q11, [x0, #784] +mul v17.4S, v17.4S,v7.s[0] +add v16.4s, v16.4s, v9.4s +ldr q9, [x0, #912] +mla v29.4S, v15.4S, v31.s[0] +sub v15.4s, v11.4s, v25.4s +ldr q20, [x0, #832] +mla v17.4S, v13.4S, v31.s[0] +add v11.4s, v11.4s, v25.4s +ldr q25, [x0, #960] +mul v0.4S, v0.4S,v3.s[0] +sub v13.4s, v20.4s, v12.4s +ldr q23, [x0, #848] +mul v26.4S, v26.4S,v3.s[0] +add v20.4s, v20.4s, v12.4s +ldr q12, [x0, #976] +mla v0.4S, v22.4S, v31.s[0] +mla v26.4S, v14.4S, v31.s[0] +sub v14.4s, v23.4s, v28.4s +sqrdmulh v22.4S, v11.4S, v19.s[1] +add v23.4s, v23.4s, v28.4s +mul v11.4S, v11.4S,v30.s[1] +sqrdmulh v28.4S, v15.4S, v19.s[2] +sub v5.4s, v21.4s, v29.4s +mul v15.4S, v15.4S,v30.s[2] +add v21.4s, v21.4s, v29.4s +sqrdmulh v19.4S, v23.4S, v4.s[1] +sub v30.4s, v9.4s, v17.4s +mul v23.4S, v23.4S,v8.s[1] +add v9.4s, v9.4s, v17.4s 
+sqrdmulh v17.4S, v14.4S, v4.s[2] +sub v29.4s, v25.4s, v0.4s +mul v14.4S, v14.4S,v8.s[2] +add v25.4s, v25.4s, v0.4s +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v12.4s, v26.4s +sqrdmulh v4.4S, v9.4S, v2.s[1] +add v12.4s, v12.4s, v26.4s +mla v15.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v30.4S, v2.s[2] +sub v26.4s, v16.4s, v11.4s +mla v23.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v12.4S, v24.s[1] +add v16.4s, v16.4s, v11.4s +str q26, [x0, #784] +mla v14.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v22.4S, v24.s[2] +sub v26.4s, v27.4s, v15.4s +str q16, [x0, #768] +mul v9.4S, v9.4S,v7.s[1] +add v27.4s, v27.4s, v15.4s +mul v30.4S, v30.4S,v7.s[2] +str q26, [x0, #816] +mla v9.4S, v4.4S, v31.s[0] +sub v4.4s, v20.4s, v23.4s +mla v30.4S, v28.4S, v31.s[0] +str q27, [x0, #800] +mul v12.4S, v12.4S,v3.s[1] +str q4, [x0, #848] +mul v22.4S, v22.4S,v3.s[2] +add v20.4s, v20.4s, v23.4s +str q20, [x0, #832] +mla v12.4S, v19.4S, v31.s[0] +sub v19.4s, v13.4s, v14.4s +str q19, [x0, #880] +mla v22.4S, v17.4S, v31.s[0] +add v13.4s, v13.4s, v14.4s +str q13, [x0, #864] +sub v24.4s, v21.4s, v9.4s +str q24, [x0, #912] +add v21.4s, v21.4s, v9.4s +str q21, [x0, #896] +sub v21.4s, v5.4s, v30.4s +str q21, [x0, #944] +add v5.4s, v5.4s, v30.4s +str q5, [x0, #928] +sub v5.4s, v25.4s, v12.4s +str q5, [x0, #976] +add v25.4s, v25.4s, v12.4s +str q25, [x0, #960] +sub v25.4s, v29.4s, v22.4s +str q25, [x0, #1008] +add v29.4s, v29.4s, v22.4s +str q29, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1548 +// Instruction count: 1544 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_17_z4_7.s 
b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_17_z4_7.s new file mode 100644 index 0000000..83076cb --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_17_z4_7.s @@ -0,0 +1,1558 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// +#include +modulus: +.word -33556993 // negated modulus: loaded via ldr q31 and applied as the v31.s[0] multiplier in every mla step +.word 0 // zero fill: the entry is read as one 128-bit q register +.word 0 +.word 0 +.align 6 +roots_merged: // constants come in pairs of four-word groups with matching Layer/block tags; the code below loads each group into a q register, using lanes of the first group in mul and lanes of the second in sqrdmulh. NOTE(review): second-group values look like first-group value * 2^31 / 33556993 (e.g. 17702291 -> 1132860160) - confirm against the generator +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global 
ntt_u32_incomplete_neon_asm_var_4_2_17_z4_7 +.global _ntt_u32_incomplete_neon_asm_var_4_2_17_z4_7 +ntt_u32_incomplete_neon_asm_var_4_2_17_z4_7: +_ntt_u32_incomplete_neon_asm_var_4_2_17_z4_7: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x0, #992] +sqrdmulh v27.4S, v28.4S, v29.s[0] +mul v28.4S, v28.4S,v30.s[0] +ldr q26, [x0, #928] +sqrdmulh v25.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +ldr q24, [x0, #864] +sqrdmulh v23.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +ldr q22, [x0, #800] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #736] +mla v28.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v20.4S, v29.s[0] +ldr q19, [x0, #672] +mla v26.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v19.4S, v29.s[0] +ldr q18, [x0, #608] +mla v24.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v18.4S, v29.s[0] +ldr q17, [x0, #544] +mla v22.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v17.4S, v29.s[0] +ldr q16, [x0, #480] +ldr q3, [x0, #416] +mul v20.4S, v20.4S,v30.s[0] +sub v2.4s, v16.4s, v28.4s +mul v19.4S, v19.4S,v30.s[0] +add v16.4s, v16.4s, v28.4s +ldr q28, [x0, #352] +ldr q1, [x0, #288] +mla v20.4S, v27.4S, v31.s[0] +sub v27.4s, v3.4s, v26.4s +mla v19.4S, v25.4S, v31.s[0] +add v3.4s, v3.4s, v26.4s +ldr q26, [x0, #224] +ldr q25, [x0, #160] +mul v18.4S, v18.4S,v30.s[0] +sub v0.4s, v28.4s, v24.4s +mul v17.4S, v17.4S,v30.s[0] +add v28.4s, v28.4s, v24.4s +ldr q24, [x0, #96] +ldr q15, [x0, #32] +mla v18.4S, v23.4S, v31.s[0] +sub v23.4s, v1.4s, v22.4s +mla v17.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v22.4s +sqrdmulh 
v22.4S, v2.4S, v29.s[2] +nop +mul v2.4S, v2.4S,v30.s[2] +nop +sqrdmulh v21.4S, v27.4S, v29.s[2] +sub v14.4s, v26.4s, v20.4s +mul v27.4S, v27.4S,v30.s[2] +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v29.s[1] +sub v13.4s, v25.4s, v19.4s +mul v16.4S, v16.4S,v30.s[1] +add v25.4s, v25.4s, v19.4s +sqrdmulh v19.4S, v3.4S, v29.s[1] +sub v12.4s, v24.4s, v18.4s +mul v3.4S, v3.4S,v30.s[1] +add v24.4s, v24.4s, v18.4s +mla v2.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v17.4s +sqrdmulh v18.4S, v0.4S, v29.s[2] +add v15.4s, v15.4s, v17.4s +mla v27.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v23.4S, v29.s[2] +nop +mla v16.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v28.4S, v29.s[1] +nop +mla v3.4S, v19.4S, v31.s[0] +nop +sqrdmulh v19.4S, v1.4S, v29.s[1] +nop +ldr q17, [x17, #+32] +ldr q11, [x17, #+48] +mul v0.4S, v0.4S,v30.s[2] +sub v10.4s, v14.4s, v2.4s +mul v23.4S, v23.4S,v30.s[2] +add v14.4s, v14.4s, v2.4s +mla v0.4S, v18.4S, v31.s[0] +sub v18.4s, v13.4s, v27.4s +mla v23.4S, v21.4S, v31.s[0] +add v13.4s, v13.4s, v27.4s +mul v28.4S, v28.4S,v30.s[1] +sub v27.4s, v26.4s, v16.4s +mul v1.4S, v1.4S,v30.s[1] +add v26.4s, v26.4s, v16.4s +mla v28.4S, v20.4S, v31.s[0] +sub v20.4s, v25.4s, v3.4s +mla v1.4S, v19.4S, v31.s[0] +add v25.4s, v25.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v11.s[3] +nop +mul v10.4S, v10.4S,v17.s[3] +nop +sqrdmulh v19.4S, v14.4S, v11.s[2] +sub v16.4s, v12.4s, v0.4s +mul v14.4S, v14.4S,v17.s[2] +add v12.4s, v12.4s, v0.4s +sqrdmulh v0.4S, v27.4S, v11.s[1] +sub v21.4s, v22.4s, v23.4s +mul v27.4S, v27.4S,v17.s[1] +add v22.4s, v22.4s, v23.4s +sqrdmulh v23.4S, v26.4S, v11.s[0] +sub v2.4s, v24.4s, v28.4s +mul v26.4S, v26.4S,v17.s[0] +add v24.4s, v24.4s, v28.4s +ldr q28, [x17, #+96] +ldr q9, [x17, #+112] +mla v10.4S, v3.4S, v31.s[0] +sub v3.4s, v15.4s, v1.4s +sqrdmulh v8.4S, v18.4S, v11.s[3] +add v15.4s, v15.4s, v1.4s +mla v14.4S, v19.4S, v31.s[0] +nop +sqrdmulh v19.4S, v13.4S, v11.s[2] +nop +mla v27.4S, v0.4S, v31.s[0] +nop +sqrdmulh v0.4S, v20.4S, v11.s[1] +nop +mla v26.4S, 
v23.4S, v31.s[0] +nop +sqrdmulh v23.4S, v25.4S, v11.s[0] +nop +ldr q1, [x17, #+64] +ldr q7, [x17, #+80] +mul v18.4S, v18.4S,v17.s[3] +sub v6.4s, v16.4s, v10.4s +mul v13.4S, v13.4S,v17.s[2] +add v16.4s, v16.4s, v10.4s +mla v18.4S, v8.4S, v31.s[0] +sub v8.4s, v12.4s, v14.4s +mla v13.4S, v19.4S, v31.s[0] +add v12.4s, v12.4s, v14.4s +mul v20.4S, v20.4S,v17.s[1] +sub v14.4s, v2.4s, v27.4s +mul v25.4S, v25.4S,v17.s[0] +add v2.4s, v2.4s, v27.4s +mla v20.4S, v0.4S, v31.s[0] +sub v0.4s, v24.4s, v26.4s +mla v25.4S, v23.4S, v31.s[0] +add v24.4s, v24.4s, v26.4s +sqrdmulh v26.4S, v6.4S, v9.s[3] +nop +mul v6.4S, v6.4S,v28.s[3] +nop +sqrdmulh v23.4S, v16.4S, v9.s[2] +sub v27.4s, v21.4s, v18.4s +mul v16.4S, v16.4S,v28.s[2] +add v21.4s, v21.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v9.s[1] +sub v19.4s, v22.4s, v13.4s +mul v8.4S, v8.4S,v28.s[1] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v12.4S, v9.s[0] +sub v10.4s, v3.4s, v20.4s +mul v12.4S, v12.4S,v28.s[0] +add v3.4s, v3.4s, v20.4s +mla v6.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v25.4s +sqrdmulh v20.4S, v14.4S, v7.s[3] +add v15.4s, v15.4s, v25.4s +mla v16.4S, v23.4S, v31.s[0] +sub v23.4s, v27.4s, v6.4s +sqrdmulh v25.4S, v2.4S, v7.s[2] +add v27.4s, v27.4s, v6.4s +mla v8.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v16.4s +sqrdmulh v6.4S, v0.4S, v7.s[1] +add v21.4s, v21.4s, v16.4s +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v19.4s, v8.4s +sqrdmulh v16.4S, v24.4S, v7.s[0] +add v19.4s, v19.4s, v8.4s +mul v14.4S, v14.4S,v1.s[3] +sub v8.4s, v22.4s, v12.4s +mul v2.4S, v2.4S,v1.s[2] +add v22.4s, v22.4s, v12.4s +mla v14.4S, v20.4S, v31.s[0] +str q23, [x0, #992] +mla v2.4S, v25.4S, v31.s[0] +str q27, [x0, #928] +mul v0.4S, v0.4S,v1.s[1] +str q18, [x0, #864] +mul v24.4S, v24.4S,v1.s[0] +str q21, [x0, #800] +mla v0.4S, v6.4S, v31.s[0] +str q13, [x0, #736] +mla v24.4S, v16.4S, v31.s[0] +str q19, [x0, #672] +ldr q19, [x0, #1008] +sqrdmulh v16.4S, v19.4S, v29.s[0] +str q8, [x0, #608] +mul v19.4S, v19.4S,v30.s[0] +sub v8.4s, v10.4s, v14.4s +ldr q13, 
[x0, #944] +sqrdmulh v6.4S, v13.4S, v29.s[0] +str q22, [x0, #544] +mul v13.4S, v13.4S,v30.s[0] +add v10.4s, v10.4s, v14.4s +ldr q14, [x0, #880] +sqrdmulh v22.4S, v14.4S, v29.s[0] +str q8, [x0, #480] +mul v14.4S, v14.4S,v30.s[0] +sub v8.4s, v3.4s, v2.4s +ldr q21, [x0, #816] +sqrdmulh v18.4S, v21.4S, v29.s[0] +str q10, [x0, #416] +mul v21.4S, v21.4S,v30.s[0] +add v3.4s, v3.4s, v2.4s +ldr q2, [x0, #752] +mla v19.4S, v16.4S, v31.s[0] +str q8, [x0, #352] +sqrdmulh v8.4S, v2.4S, v29.s[0] +sub v16.4s, v26.4s, v0.4s +ldr q10, [x0, #688] +mla v13.4S, v6.4S, v31.s[0] +str q3, [x0, #288] +sqrdmulh v3.4S, v10.4S, v29.s[0] +add v26.4s, v26.4s, v0.4s +ldr q0, [x0, #624] +mla v14.4S, v22.4S, v31.s[0] +str q16, [x0, #224] +sqrdmulh v16.4S, v0.4S, v29.s[0] +sub v22.4s, v15.4s, v24.4s +ldr q6, [x0, #560] +mla v21.4S, v18.4S, v31.s[0] +str q26, [x0, #160] +sqrdmulh v26.4S, v6.4S, v29.s[0] +add v15.4s, v15.4s, v24.4s +ldr q24, [x0, #496] +ldr q18, [x0, #432] +mul v2.4S, v2.4S,v30.s[0] +sub v27.4s, v24.4s, v19.4s +mul v10.4S, v10.4S,v30.s[0] +add v24.4s, v24.4s, v19.4s +ldr q19, [x0, #368] +ldr q25, [x0, #304] +mla v2.4S, v8.4S, v31.s[0] +sub v8.4s, v18.4s, v13.4s +mla v10.4S, v3.4S, v31.s[0] +add v18.4s, v18.4s, v13.4s +ldr q13, [x0, #240] +ldr q3, [x0, #176] +mul v0.4S, v0.4S,v30.s[0] +sub v23.4s, v19.4s, v14.4s +mul v6.4S, v6.4S,v30.s[0] +add v19.4s, v19.4s, v14.4s +ldr q14, [x0, #112] +ldr q20, [x0, #48] +mla v0.4S, v16.4S, v31.s[0] +sub v16.4s, v25.4s, v21.4s +mla v6.4S, v26.4S, v31.s[0] +add v25.4s, v25.4s, v21.4s +sqrdmulh v21.4S, v27.4S, v29.s[2] +nop +mul v27.4S, v27.4S,v30.s[2] +nop +sqrdmulh v26.4S, v8.4S, v29.s[2] +sub v12.4s, v13.4s, v2.4s +mul v8.4S, v8.4S,v30.s[2] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v24.4S, v29.s[1] +sub v5.4s, v3.4s, v10.4s +mul v24.4S, v24.4S,v30.s[1] +add v3.4s, v3.4s, v10.4s +sqrdmulh v10.4S, v18.4S, v29.s[1] +sub v4.4s, v14.4s, v0.4s +mul v18.4S, v18.4S,v30.s[1] +add v14.4s, v14.4s, v0.4s +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, 
v6.4s +sqrdmulh v0.4S, v23.4S, v29.s[2] +add v20.4s, v20.4s, v6.4s +mla v8.4S, v26.4S, v31.s[0] +str q22, [x0, #96] +sqrdmulh v22.4S, v16.4S, v29.s[2] +nop +mla v24.4S, v2.4S, v31.s[0] +str q15, [x0, #32] +sqrdmulh v15.4S, v19.4S, v29.s[1] +nop +mla v18.4S, v10.4S, v31.s[0] +nop +sqrdmulh v10.4S, v25.4S, v29.s[1] +nop +mul v23.4S, v23.4S,v30.s[2] +sub v2.4s, v12.4s, v27.4s +mul v16.4S, v16.4S,v30.s[2] +add v12.4s, v12.4s, v27.4s +mla v23.4S, v0.4S, v31.s[0] +sub v0.4s, v5.4s, v8.4s +mla v16.4S, v22.4S, v31.s[0] +add v5.4s, v5.4s, v8.4s +mul v19.4S, v19.4S,v30.s[1] +sub v8.4s, v13.4s, v24.4s +mul v25.4S, v25.4S,v30.s[1] +add v13.4s, v13.4s, v24.4s +mla v19.4S, v15.4S, v31.s[0] +sub v15.4s, v3.4s, v18.4s +mla v25.4S, v10.4S, v31.s[0] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v11.s[3] +nop +mul v2.4S, v2.4S,v17.s[3] +nop +sqrdmulh v10.4S, v12.4S, v11.s[2] +sub v24.4s, v4.4s, v23.4s +mul v12.4S, v12.4S,v17.s[2] +add v4.4s, v4.4s, v23.4s +sqrdmulh v23.4S, v8.4S, v11.s[1] +sub v22.4s, v21.4s, v16.4s +mul v8.4S, v8.4S,v17.s[1] +add v21.4s, v21.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v11.s[0] +sub v27.4s, v14.4s, v19.4s +mul v13.4S, v13.4S,v17.s[0] +add v14.4s, v14.4s, v19.4s +mla v2.4S, v18.4S, v31.s[0] +sub v18.4s, v20.4s, v25.4s +sqrdmulh v19.4S, v0.4S, v11.s[3] +add v20.4s, v20.4s, v25.4s +mla v12.4S, v10.4S, v31.s[0] +nop +sqrdmulh v10.4S, v5.4S, v11.s[2] +nop +mla v8.4S, v23.4S, v31.s[0] +nop +sqrdmulh v23.4S, v15.4S, v11.s[1] +nop +mla v13.4S, v16.4S, v31.s[0] +nop +sqrdmulh v16.4S, v3.4S, v11.s[0] +nop +mul v0.4S, v0.4S,v17.s[3] +sub v25.4s, v24.4s, v2.4s +mul v5.4S, v5.4S,v17.s[2] +add v24.4s, v24.4s, v2.4s +mla v0.4S, v19.4S, v31.s[0] +sub v19.4s, v4.4s, v12.4s +mla v5.4S, v10.4S, v31.s[0] +add v4.4s, v4.4s, v12.4s +mul v15.4S, v15.4S,v17.s[1] +sub v12.4s, v27.4s, v8.4s +mul v3.4S, v3.4S,v17.s[0] +add v27.4s, v27.4s, v8.4s +mla v15.4S, v23.4S, v31.s[0] +sub v23.4s, v14.4s, v13.4s +mla v3.4S, v16.4S, v31.s[0] +add v14.4s, v14.4s, v13.4s +sqrdmulh v13.4S, 
v25.4S, v9.s[3] +nop +mul v25.4S, v25.4S,v28.s[3] +nop +sqrdmulh v16.4S, v24.4S, v9.s[2] +sub v8.4s, v22.4s, v0.4s +mul v24.4S, v24.4S,v28.s[2] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v19.4S, v9.s[1] +sub v10.4s, v21.4s, v5.4s +mul v19.4S, v19.4S,v28.s[1] +add v21.4s, v21.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v9.s[0] +sub v2.4s, v18.4s, v15.4s +mul v4.4S, v4.4S,v28.s[0] +add v18.4s, v18.4s, v15.4s +mla v25.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v3.4s +sqrdmulh v15.4S, v12.4S, v7.s[3] +add v20.4s, v20.4s, v3.4s +mla v24.4S, v16.4S, v31.s[0] +sub v16.4s, v8.4s, v25.4s +sqrdmulh v3.4S, v27.4S, v7.s[2] +add v8.4s, v8.4s, v25.4s +mla v19.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v24.4s +sqrdmulh v25.4S, v23.4S, v7.s[1] +add v22.4s, v22.4s, v24.4s +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v10.4s, v19.4s +sqrdmulh v24.4S, v14.4S, v7.s[0] +add v10.4s, v10.4s, v19.4s +mul v12.4S, v12.4S,v1.s[3] +sub v19.4s, v21.4s, v4.4s +mul v27.4S, v27.4S,v1.s[2] +add v21.4s, v21.4s, v4.4s +mla v12.4S, v15.4S, v31.s[0] +str q16, [x0, #1008] +mla v27.4S, v3.4S, v31.s[0] +str q8, [x0, #944] +mul v23.4S, v23.4S,v1.s[1] +str q0, [x0, #880] +mul v14.4S, v14.4S,v1.s[0] +str q22, [x0, #816] +mla v23.4S, v25.4S, v31.s[0] +str q5, [x0, #752] +mla v14.4S, v24.4S, v31.s[0] +str q10, [x0, #688] +ldr q10, [x0, #960] +sqrdmulh v24.4S, v10.4S, v29.s[0] +str q19, [x0, #624] +mul v10.4S, v10.4S,v30.s[0] +sub v19.4s, v2.4s, v12.4s +ldr q5, [x0, #896] +sqrdmulh v25.4S, v5.4S, v29.s[0] +str q21, [x0, #560] +mul v5.4S, v5.4S,v30.s[0] +add v2.4s, v2.4s, v12.4s +ldr q12, [x0, #832] +sqrdmulh v21.4S, v12.4S, v29.s[0] +str q19, [x0, #496] +mul v12.4S, v12.4S,v30.s[0] +sub v19.4s, v18.4s, v27.4s +ldr q22, [x0, #768] +sqrdmulh v0.4S, v22.4S, v29.s[0] +str q2, [x0, #432] +mul v22.4S, v22.4S,v30.s[0] +add v18.4s, v18.4s, v27.4s +ldr q27, [x0, #704] +mla v10.4S, v24.4S, v31.s[0] +str q19, [x0, #368] +sqrdmulh v19.4S, v27.4S, v29.s[0] +sub v24.4s, v13.4s, v23.4s +ldr q2, [x0, #640] +mla v5.4S, v25.4S, v31.s[0] +str q18, 
[x0, #304] +sqrdmulh v18.4S, v2.4S, v29.s[0] +add v13.4s, v13.4s, v23.4s +ldr q23, [x0, #576] +mla v12.4S, v21.4S, v31.s[0] +str q24, [x0, #240] +sqrdmulh v24.4S, v23.4S, v29.s[0] +sub v21.4s, v20.4s, v14.4s +ldr q25, [x0, #512] +mla v22.4S, v0.4S, v31.s[0] +str q13, [x0, #176] +sqrdmulh v13.4S, v25.4S, v29.s[0] +add v20.4s, v20.4s, v14.4s +ldr q14, [x0, #448] +ldr q0, [x0, #384] +mul v27.4S, v27.4S,v30.s[0] +sub v8.4s, v14.4s, v10.4s +mul v2.4S, v2.4S,v30.s[0] +add v14.4s, v14.4s, v10.4s +ldr q10, [x0, #320] +ldr q3, [x0, #256] +mla v27.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v5.4s +mla v2.4S, v18.4S, v31.s[0] +add v0.4s, v0.4s, v5.4s +ldr q5, [x0, #192] +ldr q18, [x0, #128] +mul v23.4S, v23.4S,v30.s[0] +sub v16.4s, v10.4s, v12.4s +mul v25.4S, v25.4S,v30.s[0] +add v10.4s, v10.4s, v12.4s +ldr q12, [x0, #64] +ldr q15, [x0, #0] +mla v23.4S, v24.4S, v31.s[0] +sub v24.4s, v3.4s, v22.4s +mla v25.4S, v13.4S, v31.s[0] +add v3.4s, v3.4s, v22.4s +sqrdmulh v22.4S, v8.4S, v29.s[2] +nop +mul v8.4S, v8.4S,v30.s[2] +nop +sqrdmulh v13.4S, v19.4S, v29.s[2] +sub v4.4s, v5.4s, v27.4s +mul v19.4S, v19.4S,v30.s[2] +add v5.4s, v5.4s, v27.4s +sqrdmulh v27.4S, v14.4S, v29.s[1] +sub v26.4s, v18.4s, v2.4s +mul v14.4S, v14.4S,v30.s[1] +add v18.4s, v18.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v29.s[1] +sub v6.4s, v12.4s, v23.4s +mul v0.4S, v0.4S,v30.s[1] +add v12.4s, v12.4s, v23.4s +mla v8.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v25.4s +sqrdmulh v23.4S, v16.4S, v29.s[2] +add v15.4s, v15.4s, v25.4s +mla v19.4S, v13.4S, v31.s[0] +str q21, [x0, #112] +sqrdmulh v21.4S, v24.4S, v29.s[2] +nop +mla v14.4S, v27.4S, v31.s[0] +str q20, [x0, #48] +sqrdmulh v20.4S, v10.4S, v29.s[1] +nop +mla v0.4S, v2.4S, v31.s[0] +nop +sqrdmulh v2.4S, v3.4S, v29.s[1] +nop +mul v16.4S, v16.4S,v30.s[2] +sub v27.4s, v4.4s, v8.4s +mul v24.4S, v24.4S,v30.s[2] +add v4.4s, v4.4s, v8.4s +mla v16.4S, v23.4S, v31.s[0] +sub v23.4s, v26.4s, v19.4s +mla v24.4S, v21.4S, v31.s[0] +add v26.4s, v26.4s, v19.4s +mul v10.4S, v10.4S,v30.s[1] 
+sub v19.4s, v5.4s, v14.4s +mul v3.4S, v3.4S,v30.s[1] +add v5.4s, v5.4s, v14.4s +mla v10.4S, v20.4S, v31.s[0] +sub v20.4s, v18.4s, v0.4s +mla v3.4S, v2.4S, v31.s[0] +add v18.4s, v18.4s, v0.4s +sqrdmulh v0.4S, v27.4S, v11.s[3] +nop +mul v27.4S, v27.4S,v17.s[3] +nop +sqrdmulh v2.4S, v4.4S, v11.s[2] +sub v14.4s, v6.4s, v16.4s +mul v4.4S, v4.4S,v17.s[2] +add v6.4s, v6.4s, v16.4s +sqrdmulh v16.4S, v19.4S, v11.s[1] +sub v21.4s, v22.4s, v24.4s +mul v19.4S, v19.4S,v17.s[1] +add v22.4s, v22.4s, v24.4s +sqrdmulh v24.4S, v5.4S, v11.s[0] +sub v8.4s, v12.4s, v10.4s +mul v5.4S, v5.4S,v17.s[0] +add v12.4s, v12.4s, v10.4s +mla v27.4S, v0.4S, v31.s[0] +sub v0.4s, v15.4s, v3.4s +sqrdmulh v10.4S, v23.4S, v11.s[3] +add v15.4s, v15.4s, v3.4s +mla v4.4S, v2.4S, v31.s[0] +nop +sqrdmulh v2.4S, v26.4S, v11.s[2] +nop +mla v19.4S, v16.4S, v31.s[0] +nop +sqrdmulh v16.4S, v20.4S, v11.s[1] +nop +mla v5.4S, v24.4S, v31.s[0] +nop +sqrdmulh v24.4S, v18.4S, v11.s[0] +nop +mul v23.4S, v23.4S,v17.s[3] +sub v3.4s, v14.4s, v27.4s +mul v26.4S, v26.4S,v17.s[2] +add v14.4s, v14.4s, v27.4s +mla v23.4S, v10.4S, v31.s[0] +sub v10.4s, v6.4s, v4.4s +mla v26.4S, v2.4S, v31.s[0] +add v6.4s, v6.4s, v4.4s +mul v20.4S, v20.4S,v17.s[1] +sub v4.4s, v8.4s, v19.4s +mul v18.4S, v18.4S,v17.s[0] +add v8.4s, v8.4s, v19.4s +mla v20.4S, v16.4S, v31.s[0] +sub v16.4s, v12.4s, v5.4s +mla v18.4S, v24.4S, v31.s[0] +add v12.4s, v12.4s, v5.4s +sqrdmulh v5.4S, v3.4S, v9.s[3] +nop +mul v3.4S, v3.4S,v28.s[3] +nop +sqrdmulh v24.4S, v14.4S, v9.s[2] +sub v19.4s, v21.4s, v23.4s +mul v14.4S, v14.4S,v28.s[2] +add v21.4s, v21.4s, v23.4s +sqrdmulh v23.4S, v10.4S, v9.s[1] +sub v2.4s, v22.4s, v26.4s +mul v10.4S, v10.4S,v28.s[1] +add v22.4s, v22.4s, v26.4s +sqrdmulh v26.4S, v6.4S, v9.s[0] +sub v27.4s, v0.4s, v20.4s +mul v6.4S, v6.4S,v28.s[0] +add v0.4s, v0.4s, v20.4s +mla v3.4S, v5.4S, v31.s[0] +sub v5.4s, v15.4s, v18.4s +sqrdmulh v20.4S, v4.4S, v7.s[3] +add v15.4s, v15.4s, v18.4s +mla v14.4S, v24.4S, v31.s[0] +sub v24.4s, v19.4s, v3.4s 
+sqrdmulh v18.4S, v8.4S, v7.s[2] +add v19.4s, v19.4s, v3.4s +mla v10.4S, v23.4S, v31.s[0] +sub v23.4s, v21.4s, v14.4s +sqrdmulh v3.4S, v16.4S, v7.s[1] +add v21.4s, v21.4s, v14.4s +mla v6.4S, v26.4S, v31.s[0] +sub v26.4s, v2.4s, v10.4s +sqrdmulh v14.4S, v12.4S, v7.s[0] +add v2.4s, v2.4s, v10.4s +mul v4.4S, v4.4S,v1.s[3] +sub v10.4s, v22.4s, v6.4s +mul v8.4S, v8.4S,v1.s[2] +add v22.4s, v22.4s, v6.4s +mla v4.4S, v20.4S, v31.s[0] +str q24, [x0, #960] +mla v8.4S, v18.4S, v31.s[0] +str q19, [x0, #896] +mul v16.4S, v16.4S,v1.s[1] +str q23, [x0, #832] +mul v12.4S, v12.4S,v1.s[0] +str q21, [x0, #768] +mla v16.4S, v3.4S, v31.s[0] +str q26, [x0, #704] +mla v12.4S, v14.4S, v31.s[0] +str q2, [x0, #640] +ldr q2, [x0, #976] +sqrdmulh v14.4S, v2.4S, v29.s[0] +str q10, [x0, #576] +mul v2.4S, v2.4S,v30.s[0] +sub v10.4s, v27.4s, v4.4s +ldr q26, [x0, #912] +sqrdmulh v3.4S, v26.4S, v29.s[0] +str q22, [x0, #512] +mul v26.4S, v26.4S,v30.s[0] +add v27.4s, v27.4s, v4.4s +ldr q4, [x0, #848] +sqrdmulh v22.4S, v4.4S, v29.s[0] +str q10, [x0, #448] +mul v4.4S, v4.4S,v30.s[0] +sub v10.4s, v0.4s, v8.4s +ldr q21, [x0, #784] +sqrdmulh v23.4S, v21.4S, v29.s[0] +str q27, [x0, #384] +mul v21.4S, v21.4S,v30.s[0] +add v0.4s, v0.4s, v8.4s +ldr q8, [x0, #720] +mla v2.4S, v14.4S, v31.s[0] +str q10, [x0, #320] +sqrdmulh v10.4S, v8.4S, v29.s[0] +sub v14.4s, v5.4s, v16.4s +ldr q27, [x0, #656] +mla v26.4S, v3.4S, v31.s[0] +str q0, [x0, #256] +sqrdmulh v0.4S, v27.4S, v29.s[0] +add v5.4s, v5.4s, v16.4s +ldr q16, [x0, #592] +mla v4.4S, v22.4S, v31.s[0] +str q14, [x0, #192] +sqrdmulh v14.4S, v16.4S, v29.s[0] +sub v22.4s, v15.4s, v12.4s +ldr q3, [x0, #528] +mla v21.4S, v23.4S, v31.s[0] +str q5, [x0, #128] +sqrdmulh v5.4S, v3.4S, v29.s[0] +add v15.4s, v15.4s, v12.4s +ldr q12, [x0, #464] +ldr q23, [x0, #400] +mul v8.4S, v8.4S,v30.s[0] +sub v19.4s, v12.4s, v2.4s +mul v27.4S, v27.4S,v30.s[0] +add v12.4s, v12.4s, v2.4s +ldr q2, [x0, #336] +ldr q18, [x0, #272] +mla v8.4S, v10.4S, v31.s[0] +sub v10.4s, v23.4s, v26.4s +mla 
v27.4S, v0.4S, v31.s[0] +add v23.4s, v23.4s, v26.4s +ldr q26, [x0, #208] +ldr q0, [x0, #144] +mul v16.4S, v16.4S,v30.s[0] +sub v24.4s, v2.4s, v4.4s +mul v3.4S, v3.4S,v30.s[0] +add v2.4s, v2.4s, v4.4s +ldr q4, [x0, #80] +ldr q20, [x0, #16] +mla v16.4S, v14.4S, v31.s[0] +sub v14.4s, v18.4s, v21.4s +mla v3.4S, v5.4S, v31.s[0] +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v29.s[2] +nop +mul v19.4S, v19.4S,v30.s[2] +nop +sqrdmulh v5.4S, v10.4S, v29.s[2] +sub v6.4s, v26.4s, v8.4s +mul v10.4S, v10.4S,v30.s[2] +add v26.4s, v26.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v29.s[1] +sub v13.4s, v0.4s, v27.4s +mul v12.4S, v12.4S,v30.s[1] +add v0.4s, v0.4s, v27.4s +sqrdmulh v27.4S, v23.4S, v29.s[1] +sub v25.4s, v4.4s, v16.4s +mul v23.4S, v23.4S,v30.s[1] +add v4.4s, v4.4s, v16.4s +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v3.4s +sqrdmulh v16.4S, v24.4S, v29.s[2] +add v20.4s, v20.4s, v3.4s +mla v10.4S, v5.4S, v31.s[0] +str q22, [x0, #64] +sqrdmulh v22.4S, v14.4S, v29.s[2] +nop +mla v12.4S, v8.4S, v31.s[0] +str q15, [x0, #0] +sqrdmulh v15.4S, v2.4S, v29.s[1] +nop +mla v23.4S, v27.4S, v31.s[0] +nop +sqrdmulh v27.4S, v18.4S, v29.s[1] +nop +mul v24.4S, v24.4S,v30.s[2] +sub v8.4s, v6.4s, v19.4s +mul v14.4S, v14.4S,v30.s[2] +add v6.4s, v6.4s, v19.4s +mla v24.4S, v16.4S, v31.s[0] +sub v16.4s, v13.4s, v10.4s +mla v14.4S, v22.4S, v31.s[0] +add v13.4s, v13.4s, v10.4s +mul v2.4S, v2.4S,v30.s[1] +sub v10.4s, v26.4s, v12.4s +mul v18.4S, v18.4S,v30.s[1] +add v26.4s, v26.4s, v12.4s +mla v2.4S, v15.4S, v31.s[0] +sub v15.4s, v0.4s, v23.4s +mla v18.4S, v27.4S, v31.s[0] +add v0.4s, v0.4s, v23.4s +sqrdmulh v29.4S, v8.4S, v11.s[3] +nop +mul v8.4S, v8.4S,v17.s[3] +nop +sqrdmulh v30.4S, v6.4S, v11.s[2] +sub v23.4s, v25.4s, v24.4s +mul v6.4S, v6.4S,v17.s[2] +add v25.4s, v25.4s, v24.4s +sqrdmulh v24.4S, v10.4S, v11.s[1] +sub v27.4s, v21.4s, v14.4s +mul v10.4S, v10.4S,v17.s[1] +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v26.4S, v11.s[0] +sub v12.4s, v4.4s, v2.4s +mul v26.4S, v26.4S,v17.s[0] +add 
v4.4s, v4.4s, v2.4s +mla v8.4S, v29.4S, v31.s[0] +sub v29.4s, v20.4s, v18.4s +sqrdmulh v2.4S, v16.4S, v11.s[3] +add v20.4s, v20.4s, v18.4s +mla v6.4S, v30.4S, v31.s[0] +nop +sqrdmulh v30.4S, v13.4S, v11.s[2] +nop +mla v10.4S, v24.4S, v31.s[0] +nop +sqrdmulh v24.4S, v15.4S, v11.s[1] +nop +mla v26.4S, v14.4S, v31.s[0] +nop +sqrdmulh v14.4S, v0.4S, v11.s[0] +nop +mul v16.4S, v16.4S,v17.s[3] +sub v18.4s, v23.4s, v8.4s +mul v13.4S, v13.4S,v17.s[2] +add v23.4s, v23.4s, v8.4s +mla v16.4S, v2.4S, v31.s[0] +sub v2.4s, v25.4s, v6.4s +mla v13.4S, v30.4S, v31.s[0] +add v25.4s, v25.4s, v6.4s +mul v15.4S, v15.4S,v17.s[1] +sub v6.4s, v12.4s, v10.4s +mul v0.4S, v0.4S,v17.s[0] +add v12.4s, v12.4s, v10.4s +mla v15.4S, v24.4S, v31.s[0] +sub v24.4s, v4.4s, v26.4s +mla v0.4S, v14.4S, v31.s[0] +add v4.4s, v4.4s, v26.4s +sqrdmulh v11.4S, v18.4S, v9.s[3] +nop +mul v18.4S, v18.4S,v28.s[3] +nop +sqrdmulh v17.4S, v23.4S, v9.s[2] +sub v26.4s, v27.4s, v16.4s +mul v23.4S, v23.4S,v28.s[2] +add v27.4s, v27.4s, v16.4s +sqrdmulh v16.4S, v2.4S, v9.s[1] +sub v14.4s, v21.4s, v13.4s +mul v2.4S, v2.4S,v28.s[1] +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v25.4S, v9.s[0] +sub v10.4s, v29.4s, v15.4s +mul v25.4S, v25.4S,v28.s[0] +add v29.4s, v29.4s, v15.4s +mla v18.4S, v11.4S, v31.s[0] +sub v11.4s, v20.4s, v0.4s +sqrdmulh v9.4S, v6.4S, v7.s[3] +add v20.4s, v20.4s, v0.4s +mla v23.4S, v17.4S, v31.s[0] +sub v17.4s, v26.4s, v18.4s +sqrdmulh v0.4S, v12.4S, v7.s[2] +add v26.4s, v26.4s, v18.4s +mla v2.4S, v16.4S, v31.s[0] +sub v16.4s, v27.4s, v23.4s +sqrdmulh v18.4S, v24.4S, v7.s[1] +add v27.4s, v27.4s, v23.4s +mla v25.4S, v13.4S, v31.s[0] +sub v13.4s, v14.4s, v2.4s +sqrdmulh v23.4S, v4.4S, v7.s[0] +add v14.4s, v14.4s, v2.4s +mul v6.4S, v6.4S,v1.s[3] +sub v2.4s, v21.4s, v25.4s +mul v12.4S, v12.4S,v1.s[2] +add v21.4s, v21.4s, v25.4s +mla v6.4S, v9.4S, v31.s[0] +str q17, [x0, #976] +mla v12.4S, v0.4S, v31.s[0] +str q26, [x0, #912] +mul v24.4S, v24.4S,v1.s[1] +str q16, [x0, #848] +mul v4.4S, v4.4S,v1.s[0] +str q27, 
[x0, #784] +mla v24.4S, v18.4S, v31.s[0] +str q13, [x0, #720] +mla v4.4S, v23.4S, v31.s[0] +str q14, [x0, #656] +str q2, [x0, #592] +sub v2.4s, v10.4s, v6.4s +str q21, [x0, #528] +add v10.4s, v10.4s, v6.4s +str q2, [x0, #464] +sub v2.4s, v29.4s, v12.4s +str q10, [x0, #400] +add v29.4s, v29.4s, v12.4s +str q2, [x0, #336] +sub v2.4s, v11.4s, v24.4s +str q29, [x0, #272] +add v11.4s, v11.4s, v24.4s +str q2, [x0, #208] +sub v2.4s, v20.4s, v4.4s +str q11, [x0, #144] +add v20.4s, v20.4s, v4.4s +str q2, [x0, #80] +str q20, [x0, #16] +ldr q3, [x0, #224] +ldr q5, [x0, #160] +ldr q19, [x0, #32] +ldr q22, [x17, #+128] +ldr q8, [x17, #+144] +sqrdmulh v30.4S, v19.4S, v8.s[0] +mul v19.4S, v19.4S,v22.s[0] +ldr q15, [x0, #48] +sqrdmulh v28.4S, v15.4S, v8.s[0] +mul v15.4S, v15.4S,v22.s[0] +ldr q25, [x17, #+160] +ldr q9, [x17, #+176] +ldr q17, [x0, #96] +sqrdmulh v0.4S, v17.4S, v9.s[0] +mul v17.4S, v17.4S,v25.s[0] +ldr q26, [x0, #112] +sqrdmulh v16.4S, v26.4S, v9.s[0] +mul v26.4S, v26.4S,v25.s[0] +ldr q27, [x17, #+192] +ldr q18, [x17, #+208] +mla v19.4S, v30.4S, v31.s[0] +sqrdmulh v30.4S, v5.4S, v18.s[0] +ldr q13, [x0, #176] +mla v15.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v13.4S, v18.s[0] +ldr q23, [x17, #+224] +ldr q14, [x17, #+240] +mla v17.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v3.4S, v14.s[0] +ldr q1, [x0, #240] +mla v26.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v1.4S, v14.s[0] +ldr q7, [x0, #0] +ldr q21, [x0, #128] +mul v5.4S, v5.4S,v27.s[0] +sub v6.4s, v7.4s, v19.4s +ldr q10, [x0, #16] +mul v13.4S, v13.4S,v27.s[0] +add v7.4s, v7.4s, v19.4s +ldr q19, [x0, #144] +mla v5.4S, v30.4S, v31.s[0] +sub v30.4s, v10.4s, v15.4s +ldr q12, [x0, #64] +mla v13.4S, v28.4S, v31.s[0] +add v10.4s, v10.4s, v15.4s +ldr q15, [x0, #192] +mul v3.4S, v3.4S,v23.s[0] +sub v28.4s, v12.4s, v17.4s +ldr q29, [x0, #80] +mul v1.4S, v1.4S,v23.s[0] +add v12.4s, v12.4s, v17.4s +ldr q17, [x0, #208] +mla v3.4S, v0.4S, v31.s[0] +mla v1.4S, v16.4S, v31.s[0] +sub v16.4s, v29.4s, v26.4s +sqrdmulh v0.4S, v10.4S, v8.s[1] +add 
v29.4s, v29.4s, v26.4s +mul v10.4S, v10.4S,v22.s[1] +sqrdmulh v26.4S, v30.4S, v8.s[2] +sub v24.4s, v21.4s, v5.4s +mul v30.4S, v30.4S,v22.s[2] +add v21.4s, v21.4s, v5.4s +sqrdmulh v8.4S, v29.4S, v9.s[1] +sub v22.4s, v19.4s, v13.4s +mul v29.4S, v29.4S,v25.s[1] +add v19.4s, v19.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v9.s[2] +sub v5.4s, v15.4s, v3.4s +mul v16.4S, v16.4S,v25.s[2] +add v15.4s, v15.4s, v3.4s +mla v10.4S, v0.4S, v31.s[0] +sub v0.4s, v17.4s, v1.4s +ldr q9, [x0, #480] +sqrdmulh v25.4S, v19.4S, v18.s[1] +add v17.4s, v17.4s, v1.4s +mla v30.4S, v26.4S, v31.s[0] +ldr q26, [x0, #416] +sqrdmulh v1.4S, v22.4S, v18.s[2] +sub v3.4s, v7.4s, v10.4s +mla v29.4S, v8.4S, v31.s[0] +ldr q8, [x0, #288] +sqrdmulh v11.4S, v17.4S, v14.s[1] +add v7.4s, v7.4s, v10.4s +str q3, [x0, #16] +mla v16.4S, v13.4S, v31.s[0] +ldr q13, [x17, #+256] +ldr q3, [x17, #+272] +sqrdmulh v10.4S, v0.4S, v14.s[2] +sub v4.4s, v6.4s, v30.4s +str q7, [x0, #0] +mul v19.4S, v19.4S,v27.s[1] +add v6.4s, v6.4s, v30.4s +mul v22.4S, v22.4S,v27.s[2] +str q4, [x0, #48] +mla v19.4S, v25.4S, v31.s[0] +sub v25.4s, v12.4s, v29.4s +mla v22.4S, v1.4S, v31.s[0] +str q6, [x0, #32] +mul v17.4S, v17.4S,v23.s[1] +str q25, [x0, #80] +mul v0.4S, v0.4S,v23.s[2] +add v12.4s, v12.4s, v29.4s +str q12, [x0, #64] +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v28.4s, v16.4s +str q11, [x0, #112] +mla v0.4S, v10.4S, v31.s[0] +add v28.4s, v28.4s, v16.4s +str q28, [x0, #96] +sqrdmulh v14.4S, v8.4S, v3.s[0] +sub v23.4s, v21.4s, v19.4s +mul v8.4S, v8.4S,v13.s[0] +str q23, [x0, #144] +ldr q23, [x0, #304] +sqrdmulh v28.4S, v23.4S, v3.s[0] +add v21.4s, v21.4s, v19.4s +mul v23.4S, v23.4S,v13.s[0] +str q21, [x0, #128] +ldr q21, [x17, #+288] +ldr q19, [x17, #+304] +ldr q16, [x0, #352] +sqrdmulh v10.4S, v16.4S, v19.s[0] +sub v11.4s, v24.4s, v22.4s +mul v16.4S, v16.4S,v21.s[0] +str q11, [x0, #176] +ldr q11, [x0, #368] +sqrdmulh v12.4S, v11.4S, v19.s[0] +add v24.4s, v24.4s, v22.4s +mul v11.4S, v11.4S,v21.s[0] +str q24, [x0, #160] +ldr q24, [x17, 
#+320] +ldr q22, [x17, #+336] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v15.4s, v17.4s +sqrdmulh v29.4S, v26.4S, v22.s[0] +str q14, [x0, #208] +ldr q14, [x0, #432] +mla v23.4S, v28.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v22.s[0] +str q15, [x0, #192] +ldr q15, [x17, #+352] +ldr q28, [x17, #+368] +mla v16.4S, v10.4S, v31.s[0] +sub v10.4s, v5.4s, v0.4s +sqrdmulh v25.4S, v9.4S, v28.s[0] +str q10, [x0, #240] +ldr q10, [x0, #496] +mla v11.4S, v12.4S, v31.s[0] +add v5.4s, v5.4s, v0.4s +sqrdmulh v0.4S, v10.4S, v28.s[0] +str q5, [x0, #224] +ldr q5, [x0, #256] +ldr q12, [x0, #384] +mul v26.4S, v26.4S,v24.s[0] +sub v18.4s, v5.4s, v8.4s +ldr q27, [x0, #272] +mul v14.4S, v14.4S,v24.s[0] +add v5.4s, v5.4s, v8.4s +ldr q8, [x0, #400] +mla v26.4S, v29.4S, v31.s[0] +sub v29.4s, v27.4s, v23.4s +ldr q6, [x0, #320] +mla v14.4S, v17.4S, v31.s[0] +add v27.4s, v27.4s, v23.4s +ldr q23, [x0, #448] +mul v9.4S, v9.4S,v15.s[0] +sub v17.4s, v6.4s, v16.4s +ldr q1, [x0, #336] +mul v10.4S, v10.4S,v15.s[0] +add v6.4s, v6.4s, v16.4s +ldr q16, [x0, #464] +mla v9.4S, v25.4S, v31.s[0] +mla v10.4S, v0.4S, v31.s[0] +sub v0.4s, v1.4s, v11.4s +sqrdmulh v25.4S, v27.4S, v3.s[1] +add v1.4s, v1.4s, v11.4s +mul v27.4S, v27.4S,v13.s[1] +sqrdmulh v11.4S, v29.4S, v3.s[2] +sub v4.4s, v12.4s, v26.4s +mul v29.4S, v29.4S,v13.s[2] +add v12.4s, v12.4s, v26.4s +sqrdmulh v3.4S, v1.4S, v19.s[1] +sub v13.4s, v8.4s, v14.4s +mul v1.4S, v1.4S,v21.s[1] +add v8.4s, v8.4s, v14.4s +sqrdmulh v14.4S, v0.4S, v19.s[2] +sub v26.4s, v23.4s, v9.4s +mul v0.4S, v0.4S,v21.s[2] +add v23.4s, v23.4s, v9.4s +mla v27.4S, v25.4S, v31.s[0] +sub v25.4s, v16.4s, v10.4s +ldr q19, [x0, #736] +sqrdmulh v21.4S, v8.4S, v22.s[1] +add v16.4s, v16.4s, v10.4s +mla v29.4S, v11.4S, v31.s[0] +ldr q11, [x0, #672] +sqrdmulh v10.4S, v13.4S, v22.s[2] +sub v9.4s, v5.4s, v27.4s +mla v1.4S, v3.4S, v31.s[0] +ldr q3, [x0, #544] +sqrdmulh v30.4S, v16.4S, v28.s[1] +add v5.4s, v5.4s, v27.4s +str q9, [x0, #272] +mla v0.4S, v14.4S, v31.s[0] +ldr 
q14, [x17, #+384] +ldr q9, [x17, #+400] +sqrdmulh v27.4S, v25.4S, v28.s[2] +sub v7.4s, v18.4s, v29.4s +str q5, [x0, #256] +mul v8.4S, v8.4S,v24.s[1] +add v18.4s, v18.4s, v29.4s +mul v13.4S, v13.4S,v24.s[2] +str q7, [x0, #304] +mla v8.4S, v21.4S, v31.s[0] +sub v21.4s, v6.4s, v1.4s +mla v13.4S, v10.4S, v31.s[0] +str q18, [x0, #288] +mul v16.4S, v16.4S,v15.s[1] +str q21, [x0, #336] +mul v25.4S, v25.4S,v15.s[2] +add v6.4s, v6.4s, v1.4s +str q6, [x0, #320] +mla v16.4S, v30.4S, v31.s[0] +sub v30.4s, v17.4s, v0.4s +str q30, [x0, #368] +mla v25.4S, v27.4S, v31.s[0] +add v17.4s, v17.4s, v0.4s +str q17, [x0, #352] +sqrdmulh v28.4S, v3.4S, v9.s[0] +sub v15.4s, v12.4s, v8.4s +mul v3.4S, v3.4S,v14.s[0] +str q15, [x0, #400] +ldr q15, [x0, #560] +sqrdmulh v17.4S, v15.4S, v9.s[0] +add v12.4s, v12.4s, v8.4s +mul v15.4S, v15.4S,v14.s[0] +str q12, [x0, #384] +ldr q12, [x17, #+416] +ldr q8, [x17, #+432] +ldr q0, [x0, #608] +sqrdmulh v27.4S, v0.4S, v8.s[0] +sub v30.4s, v4.4s, v13.4s +mul v0.4S, v0.4S,v12.s[0] +str q30, [x0, #432] +ldr q30, [x0, #624] +sqrdmulh v6.4S, v30.4S, v8.s[0] +add v4.4s, v4.4s, v13.4s +mul v30.4S, v30.4S,v12.s[0] +str q4, [x0, #416] +ldr q4, [x17, #+448] +ldr q13, [x17, #+464] +mla v3.4S, v28.4S, v31.s[0] +sub v28.4s, v23.4s, v16.4s +sqrdmulh v1.4S, v11.4S, v13.s[0] +str q28, [x0, #464] +ldr q28, [x0, #688] +mla v15.4S, v17.4S, v31.s[0] +add v23.4s, v23.4s, v16.4s +sqrdmulh v16.4S, v28.4S, v13.s[0] +str q23, [x0, #448] +ldr q23, [x17, #+480] +ldr q17, [x17, #+496] +mla v0.4S, v27.4S, v31.s[0] +sub v27.4s, v26.4s, v25.4s +sqrdmulh v21.4S, v19.4S, v17.s[0] +str q27, [x0, #496] +ldr q27, [x0, #752] +mla v30.4S, v6.4S, v31.s[0] +add v26.4s, v26.4s, v25.4s +sqrdmulh v25.4S, v27.4S, v17.s[0] +str q26, [x0, #480] +ldr q26, [x0, #512] +ldr q6, [x0, #640] +mul v11.4S, v11.4S,v4.s[0] +sub v22.4s, v26.4s, v3.4s +ldr q24, [x0, #528] +mul v28.4S, v28.4S,v4.s[0] +add v26.4s, v26.4s, v3.4s +ldr q3, [x0, #656] +mla v11.4S, v1.4S, v31.s[0] +sub v1.4s, v24.4s, v15.4s +ldr q18, 
[x0, #576] +mla v28.4S, v16.4S, v31.s[0] +add v24.4s, v24.4s, v15.4s +ldr q15, [x0, #704] +mul v19.4S, v19.4S,v23.s[0] +sub v16.4s, v18.4s, v0.4s +ldr q10, [x0, #592] +mul v27.4S, v27.4S,v23.s[0] +add v18.4s, v18.4s, v0.4s +ldr q0, [x0, #720] +mla v19.4S, v21.4S, v31.s[0] +mla v27.4S, v25.4S, v31.s[0] +sub v25.4s, v10.4s, v30.4s +sqrdmulh v21.4S, v24.4S, v9.s[1] +add v10.4s, v10.4s, v30.4s +mul v24.4S, v24.4S,v14.s[1] +sqrdmulh v30.4S, v1.4S, v9.s[2] +sub v7.4s, v6.4s, v11.4s +mul v1.4S, v1.4S,v14.s[2] +add v6.4s, v6.4s, v11.4s +sqrdmulh v9.4S, v10.4S, v8.s[1] +sub v14.4s, v3.4s, v28.4s +mul v10.4S, v10.4S,v12.s[1] +add v3.4s, v3.4s, v28.4s +sqrdmulh v28.4S, v25.4S, v8.s[2] +sub v11.4s, v15.4s, v19.4s +mul v25.4S, v25.4S,v12.s[2] +add v15.4s, v15.4s, v19.4s +mla v24.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v27.4s +ldr q8, [x0, #992] +sqrdmulh v12.4S, v3.4S, v13.s[1] +add v0.4s, v0.4s, v27.4s +mla v1.4S, v30.4S, v31.s[0] +ldr q30, [x0, #928] +sqrdmulh v27.4S, v14.4S, v13.s[2] +sub v19.4s, v26.4s, v24.4s +mla v10.4S, v9.4S, v31.s[0] +ldr q9, [x0, #800] +sqrdmulh v29.4S, v0.4S, v17.s[1] +add v26.4s, v26.4s, v24.4s +str q19, [x0, #528] +mla v25.4S, v28.4S, v31.s[0] +ldr q28, [x17, #+512] +ldr q19, [x17, #+528] +sqrdmulh v24.4S, v21.4S, v17.s[2] +sub v5.4s, v22.4s, v1.4s +str q26, [x0, #512] +mul v3.4S, v3.4S,v4.s[1] +add v22.4s, v22.4s, v1.4s +mul v14.4S, v14.4S,v4.s[2] +str q5, [x0, #560] +mla v3.4S, v12.4S, v31.s[0] +sub v12.4s, v18.4s, v10.4s +mla v14.4S, v27.4S, v31.s[0] +str q22, [x0, #544] +mul v0.4S, v0.4S,v23.s[1] +str q12, [x0, #592] +mul v21.4S, v21.4S,v23.s[2] +add v18.4s, v18.4s, v10.4s +str q18, [x0, #576] +mla v0.4S, v29.4S, v31.s[0] +sub v29.4s, v16.4s, v25.4s +str q29, [x0, #624] +mla v21.4S, v24.4S, v31.s[0] +add v16.4s, v16.4s, v25.4s +str q16, [x0, #608] +sqrdmulh v17.4S, v9.4S, v19.s[0] +sub v23.4s, v6.4s, v3.4s +mul v9.4S, v9.4S,v28.s[0] +str q23, [x0, #656] +ldr q23, [x0, #816] +sqrdmulh v16.4S, v23.4S, v19.s[0] +add v6.4s, v6.4s, v3.4s +mul 
v23.4S, v23.4S,v28.s[0] +str q6, [x0, #640] +ldr q6, [x17, #+544] +ldr q3, [x17, #+560] +ldr q25, [x0, #864] +sqrdmulh v24.4S, v25.4S, v3.s[0] +sub v29.4s, v7.4s, v14.4s +mul v25.4S, v25.4S,v6.s[0] +str q29, [x0, #688] +ldr q29, [x0, #880] +sqrdmulh v18.4S, v29.4S, v3.s[0] +add v7.4s, v7.4s, v14.4s +mul v29.4S, v29.4S,v6.s[0] +str q7, [x0, #672] +ldr q7, [x17, #+576] +ldr q14, [x17, #+592] +mla v9.4S, v17.4S, v31.s[0] +sub v17.4s, v15.4s, v0.4s +sqrdmulh v10.4S, v30.4S, v14.s[0] +str q17, [x0, #720] +ldr q17, [x0, #944] +mla v23.4S, v16.4S, v31.s[0] +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v14.s[0] +str q15, [x0, #704] +ldr q15, [x17, #+608] +ldr q16, [x17, #+624] +mla v25.4S, v24.4S, v31.s[0] +sub v24.4s, v11.4s, v21.4s +sqrdmulh v12.4S, v8.4S, v16.s[0] +str q24, [x0, #752] +ldr q24, [x0, #1008] +mla v29.4S, v18.4S, v31.s[0] +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v24.4S, v16.s[0] +str q11, [x0, #736] +ldr q11, [x0, #768] +ldr q18, [x0, #896] +mul v30.4S, v30.4S,v7.s[0] +sub v13.4s, v11.4s, v9.4s +ldr q4, [x0, #784] +mul v17.4S, v17.4S,v7.s[0] +add v11.4s, v11.4s, v9.4s +ldr q9, [x0, #912] +mla v30.4S, v10.4S, v31.s[0] +sub v10.4s, v4.4s, v23.4s +ldr q22, [x0, #832] +mla v17.4S, v0.4S, v31.s[0] +add v4.4s, v4.4s, v23.4s +ldr q23, [x0, #960] +mul v8.4S, v8.4S,v15.s[0] +sub v0.4s, v22.4s, v25.4s +ldr q27, [x0, #848] +mul v24.4S, v24.4S,v15.s[0] +add v22.4s, v22.4s, v25.4s +ldr q25, [x0, #976] +mla v8.4S, v12.4S, v31.s[0] +mla v24.4S, v21.4S, v31.s[0] +sub v21.4s, v27.4s, v29.4s +sqrdmulh v12.4S, v4.4S, v19.s[1] +add v27.4s, v27.4s, v29.4s +mul v4.4S, v4.4S,v28.s[1] +sqrdmulh v29.4S, v10.4S, v19.s[2] +sub v5.4s, v18.4s, v30.4s +mul v10.4S, v10.4S,v28.s[2] +add v18.4s, v18.4s, v30.4s +sqrdmulh v19.4S, v27.4S, v3.s[1] +sub v28.4s, v9.4s, v17.4s +mul v27.4S, v27.4S,v6.s[1] +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v3.s[2] +sub v30.4s, v23.4s, v8.4s +mul v21.4S, v21.4S,v6.s[2] +add v23.4s, v23.4s, v8.4s +mla v4.4S, v12.4S, v31.s[0] +sub v12.4s, 
v25.4s, v24.4s +sqrdmulh v3.4S, v9.4S, v14.s[1] +add v25.4s, v25.4s, v24.4s +mla v10.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v28.4S, v14.s[2] +sub v24.4s, v11.4s, v4.4s +mla v27.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v25.4S, v16.s[1] +add v11.4s, v11.4s, v4.4s +str q24, [x0, #784] +mla v21.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v12.4S, v16.s[2] +sub v24.4s, v13.4s, v10.4s +str q11, [x0, #768] +mul v9.4S, v9.4S,v7.s[1] +add v13.4s, v13.4s, v10.4s +mul v28.4S, v28.4S,v7.s[2] +str q24, [x0, #816] +mla v9.4S, v3.4S, v31.s[0] +sub v3.4s, v22.4s, v27.4s +mla v28.4S, v29.4S, v31.s[0] +str q13, [x0, #800] +mul v25.4S, v25.4S,v15.s[1] +str q3, [x0, #848] +mul v12.4S, v12.4S,v15.s[2] +add v22.4s, v22.4s, v27.4s +str q22, [x0, #832] +mla v25.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v21.4s +str q19, [x0, #880] +mla v12.4S, v17.4S, v31.s[0] +add v0.4s, v0.4s, v21.4s +str q0, [x0, #864] +sub v16.4s, v18.4s, v9.4s +str q16, [x0, #912] +add v18.4s, v18.4s, v9.4s +str q18, [x0, #896] +sub v18.4s, v5.4s, v28.4s +str q18, [x0, #944] +add v5.4s, v5.4s, v28.4s +str q5, [x0, #928] +sub v5.4s, v23.4s, v25.4s +str q5, [x0, #976] +add v23.4s, v23.4s, v25.4s +str q23, [x0, #960] +sub v23.4s, v30.4s, v12.4s +str q23, [x0, #1008] +add v30.4s, v30.4s, v12.4s +str q30, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1528 +// Instruction count: 1524 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_18_z4_7.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_18_z4_7.s new file mode 100644 index 0000000..42ecff5 --- /dev/null +++ 
b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_18_z4_7.s @@ -0,0 +1,1558 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: // 16-byte constant; the routine further down loads it into q31 and uses lane 0 as the last operand of its mla instructions +.word -33556993 // lane 0: 33556993 stored with a negative sign +.word 0 // lanes 1 to 3 are zero; only lane 0 is referenced in the visible part of the routine +.word 0 +.word 0 +.align 6 // presumably the power-of-two form of .align, i.e. 64-byte alignment for the table below; confirm for the target assembler +roots_merged: // Constant table read through x17 in 16-byte rows. In the visible part of the routine the rows alternate: a row of four values tagged with layer/block is used as the by-element operand of `mul`, and the following row carrying the same tags is used as the by-element operand of `sqrdmulh`. NOTE(review): the second row of each pair looks like round(value * 2^31 / 33556993); confirm against the generator. +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global 
ntt_u32_incomplete_neon_asm_var_4_2_18_z4_7 +.global _ntt_u32_incomplete_neon_asm_var_4_2_18_z4_7 +ntt_u32_incomplete_neon_asm_var_4_2_18_z4_7: +_ntt_u32_incomplete_neon_asm_var_4_2_18_z4_7: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x0, #992] +sqrdmulh v27.4S, v28.4S, v29.s[0] +mul v28.4S, v28.4S,v30.s[0] +ldr q26, [x0, #928] +sqrdmulh v25.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +ldr q24, [x0, #864] +sqrdmulh v23.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +ldr q22, [x0, #800] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #736] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mla v28.4S, v27.4S, v31.s[0] +ldr q27, [x0, #672] +sqrdmulh v18.4S, v27.4S, v29.s[0] +mla v26.4S, v25.4S, v31.s[0] +ldr q25, [x0, #608] +sqrdmulh v17.4S, v25.4S, v29.s[0] +mla v24.4S, v23.4S, v31.s[0] +ldr q23, [x0, #544] +sqrdmulh v16.4S, v23.4S, v29.s[0] +mla v22.4S, v21.4S, v31.s[0] +ldr q21, [x0, #480] +ldr q3, [x0, #416] +mul v27.4S, v27.4S,v30.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v2.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +ldr q28, [x0, #352] +ldr q1, [x0, #288] +mla v27.4S, v18.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +sub v19.4s, v3.4s, v26.4s +add v3.4s, v3.4s, v26.4s +ldr q26, [x0, #224] +ldr q18, [x0, #160] +mul v23.4S, v23.4S,v30.s[0] +mul v25.4S, v25.4S,v30.s[0] +sub v0.4s, v28.4s, v24.4s +add v28.4s, v28.4s, v24.4s +ldr q24, [x0, #96] +ldr q15, [x0, #32] +mla v23.4S, v16.4S, v31.s[0] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +sqrdmulh 
v22.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v16.4s, v26.4s, v20.4s +nop +sqrdmulh v14.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +add v26.4s, v26.4s, v20.4s +nop +sqrdmulh v20.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v13.4s, v18.4s, v27.4s +add v18.4s, v18.4s, v27.4s +sqrdmulh v27.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v12.4s, v24.4s, v25.4s +add v24.4s, v24.4s, v25.4s +sqrdmulh v25.4S, v0.4S, v29.s[2] +mla v2.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v23.4s +sqrdmulh v11.4S, v17.4S, v29.s[2] +mla v19.4S, v14.4S, v31.s[0] +add v15.4s, v15.4s, v23.4s +nop +sqrdmulh v23.4S, v28.4S, v29.s[1] +mla v21.4S, v20.4S, v31.s[0] +nop +sqrdmulh v20.4S, v1.4S, v29.s[1] +mla v3.4S, v27.4S, v31.s[0] +nop +nop +ldr q27, [x17, #+32] +ldr q14, [x17, #+48] +mul v17.4S, v17.4S,v30.s[2] +mul v0.4S, v0.4S,v30.s[2] +sub v10.4s, v16.4s, v2.4s +add v16.4s, v16.4s, v2.4s +mla v17.4S, v11.4S, v31.s[0] +mla v0.4S, v25.4S, v31.s[0] +sub v25.4s, v13.4s, v19.4s +add v13.4s, v13.4s, v19.4s +mul v1.4S, v1.4S,v30.s[1] +mul v28.4S, v28.4S,v30.s[1] +sub v19.4s, v26.4s, v21.4s +add v26.4s, v26.4s, v21.4s +mla v1.4S, v20.4S, v31.s[0] +mla v28.4S, v23.4S, v31.s[0] +sub v23.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v14.s[3] +mul v10.4S, v10.4S,v27.s[3] +nop +nop +sqrdmulh v20.4S, v16.4S, v14.s[2] +mul v16.4S, v16.4S,v27.s[2] +sub v21.4s, v12.4s, v0.4s +add v12.4s, v12.4s, v0.4s +sqrdmulh v0.4S, v19.4S, v14.s[1] +mul v19.4S, v19.4S,v27.s[1] +sub v11.4s, v22.4s, v17.4s +add v22.4s, v22.4s, v17.4s +sqrdmulh v17.4S, v26.4S, v14.s[0] +mul v26.4S, v26.4S,v27.s[0] +sub v2.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +ldr q28, [x17, #+96] +ldr q9, [x17, #+112] +sqrdmulh v8.4S, v25.4S, v14.s[3] +mla v10.4S, v3.4S, v31.s[0] +sub v3.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v14.s[2] +mla v16.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v23.4S, v14.s[1] +mla v19.4S, v0.4S, v31.s[0] +nop +nop +sqrdmulh v0.4S, 
v18.4S, v14.s[0] +mla v26.4S, v17.4S, v31.s[0] +nop +nop +ldr q17, [x17, #+64] +ldr q7, [x17, #+80] +mul v13.4S, v13.4S,v27.s[2] +mul v25.4S, v25.4S,v27.s[3] +sub v6.4s, v21.4s, v10.4s +add v21.4s, v21.4s, v10.4s +mla v13.4S, v1.4S, v31.s[0] +mla v25.4S, v8.4S, v31.s[0] +sub v8.4s, v12.4s, v16.4s +add v12.4s, v12.4s, v16.4s +mul v18.4S, v18.4S,v27.s[0] +mul v23.4S, v23.4S,v27.s[1] +sub v16.4s, v2.4s, v19.4s +add v2.4s, v2.4s, v19.4s +mla v18.4S, v0.4S, v31.s[0] +mla v23.4S, v20.4S, v31.s[0] +sub v20.4s, v24.4s, v26.4s +add v24.4s, v24.4s, v26.4s +sqrdmulh v26.4S, v6.4S, v9.s[3] +mul v6.4S, v6.4S,v28.s[3] +nop +nop +sqrdmulh v0.4S, v21.4S, v9.s[2] +mul v21.4S, v21.4S,v28.s[2] +sub v19.4s, v11.4s, v25.4s +add v11.4s, v11.4s, v25.4s +sqrdmulh v25.4S, v8.4S, v9.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v1.4s, v22.4s, v13.4s +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v12.4S, v9.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v10.4s, v3.4s, v23.4s +add v3.4s, v3.4s, v23.4s +sqrdmulh v23.4S, v16.4S, v7.s[3] +mla v6.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v7.s[2] +mla v21.4S, v0.4S, v31.s[0] +sub v0.4s, v19.4s, v6.4s +str q0, [x0, #992] +sqrdmulh v0.4S, v20.4S, v7.s[1] +mla v8.4S, v25.4S, v31.s[0] +add v19.4s, v19.4s, v6.4s +str q19, [x0, #928] +sqrdmulh v19.4S, v24.4S, v7.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v21.4s +str q13, [x0, #864] +mul v2.4S, v2.4S,v17.s[2] +mul v16.4S, v16.4S,v17.s[3] +add v11.4s, v11.4s, v21.4s +sub v21.4s, v1.4s, v8.4s +mla v2.4S, v18.4S, v31.s[0] +mla v16.4S, v23.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +str q11, [x0, #800] +mul v24.4S, v24.4S,v17.s[0] +mul v20.4S, v20.4S,v17.s[1] +sub v11.4s, v22.4s, v12.4s +str q21, [x0, #736] +mla v24.4S, v19.4S, v31.s[0] +mla v20.4S, v0.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +str q1, [x0, #672] +ldr q1, [x0, #1008] +sqrdmulh v12.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +str q11, [x0, #608] +sub v11.4s, v10.4s, v16.4s +ldr q0, [x0, 
#944] +sqrdmulh v19.4S, v0.4S, v29.s[0] +mul v0.4S, v0.4S,v30.s[0] +str q22, [x0, #544] +add v10.4s, v10.4s, v16.4s +ldr q16, [x0, #880] +sqrdmulh v22.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +str q11, [x0, #480] +sub v11.4s, v3.4s, v2.4s +ldr q21, [x0, #816] +sqrdmulh v8.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +str q10, [x0, #416] +add v3.4s, v3.4s, v2.4s +ldr q2, [x0, #752] +sqrdmulh v10.4S, v2.4S, v29.s[0] +mla v1.4S, v12.4S, v31.s[0] +str q11, [x0, #352] +sub v11.4s, v26.4s, v20.4s +ldr q12, [x0, #688] +sqrdmulh v23.4S, v12.4S, v29.s[0] +mla v0.4S, v19.4S, v31.s[0] +str q3, [x0, #288] +add v26.4s, v26.4s, v20.4s +ldr q20, [x0, #624] +sqrdmulh v3.4S, v20.4S, v29.s[0] +mla v16.4S, v22.4S, v31.s[0] +str q11, [x0, #224] +sub v11.4s, v15.4s, v24.4s +ldr q22, [x0, #560] +sqrdmulh v19.4S, v22.4S, v29.s[0] +mla v21.4S, v8.4S, v31.s[0] +str q26, [x0, #160] +add v15.4s, v15.4s, v24.4s +ldr q24, [x0, #496] +ldr q26, [x0, #432] +mul v12.4S, v12.4S,v30.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v8.4s, v24.4s, v1.4s +add v24.4s, v24.4s, v1.4s +ldr q1, [x0, #368] +ldr q18, [x0, #304] +mla v12.4S, v23.4S, v31.s[0] +mla v2.4S, v10.4S, v31.s[0] +sub v10.4s, v26.4s, v0.4s +add v26.4s, v26.4s, v0.4s +ldr q0, [x0, #240] +ldr q23, [x0, #176] +mul v22.4S, v22.4S,v30.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v13.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +ldr q16, [x0, #112] +ldr q6, [x0, #48] +mla v22.4S, v19.4S, v31.s[0] +mla v20.4S, v3.4S, v31.s[0] +sub v3.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v19.4s, v0.4s, v2.4s +nop +sqrdmulh v25.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +add v0.4s, v0.4s, v2.4s +nop +sqrdmulh v2.4S, v24.4S, v29.s[1] +mul v24.4S, v24.4S,v30.s[1] +sub v5.4s, v23.4s, v12.4s +add v23.4s, v23.4s, v12.4s +sqrdmulh v12.4S, v26.4S, v29.s[1] +mul v26.4S, v26.4S,v30.s[1] +sub v4.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v13.4S, v29.s[2] +mla v8.4S, v21.4S, 
v31.s[0] +sub v21.4s, v6.4s, v22.4s +str q11, [x0, #96] +sqrdmulh v11.4S, v3.4S, v29.s[2] +mla v10.4S, v25.4S, v31.s[0] +add v6.4s, v6.4s, v22.4s +nop +sqrdmulh v22.4S, v1.4S, v29.s[1] +mla v24.4S, v2.4S, v31.s[0] +str q15, [x0, #32] +nop +sqrdmulh v15.4S, v18.4S, v29.s[1] +mla v26.4S, v12.4S, v31.s[0] +nop +nop +mul v3.4S, v3.4S,v30.s[2] +mul v13.4S, v13.4S,v30.s[2] +sub v12.4s, v19.4s, v8.4s +add v19.4s, v19.4s, v8.4s +mla v3.4S, v11.4S, v31.s[0] +mla v13.4S, v20.4S, v31.s[0] +sub v20.4s, v5.4s, v10.4s +add v5.4s, v5.4s, v10.4s +mul v18.4S, v18.4S,v30.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v10.4s, v0.4s, v24.4s +add v0.4s, v0.4s, v24.4s +mla v18.4S, v15.4S, v31.s[0] +mla v1.4S, v22.4S, v31.s[0] +sub v22.4s, v23.4s, v26.4s +add v23.4s, v23.4s, v26.4s +sqrdmulh v26.4S, v12.4S, v14.s[3] +mul v12.4S, v12.4S,v27.s[3] +nop +nop +sqrdmulh v15.4S, v19.4S, v14.s[2] +mul v19.4S, v19.4S,v27.s[2] +sub v24.4s, v4.4s, v13.4s +add v4.4s, v4.4s, v13.4s +sqrdmulh v13.4S, v10.4S, v14.s[1] +mul v10.4S, v10.4S,v27.s[1] +sub v11.4s, v21.4s, v3.4s +add v21.4s, v21.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v14.s[0] +mul v0.4S, v0.4S,v27.s[0] +sub v8.4s, v16.4s, v1.4s +add v16.4s, v16.4s, v1.4s +sqrdmulh v1.4S, v20.4S, v14.s[3] +mla v12.4S, v26.4S, v31.s[0] +sub v26.4s, v6.4s, v18.4s +add v6.4s, v6.4s, v18.4s +sqrdmulh v18.4S, v5.4S, v14.s[2] +mla v19.4S, v15.4S, v31.s[0] +nop +nop +sqrdmulh v15.4S, v22.4S, v14.s[1] +mla v10.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v23.4S, v14.s[0] +mla v0.4S, v3.4S, v31.s[0] +nop +nop +mul v5.4S, v5.4S,v27.s[2] +mul v20.4S, v20.4S,v27.s[3] +sub v3.4s, v24.4s, v12.4s +add v24.4s, v24.4s, v12.4s +mla v5.4S, v18.4S, v31.s[0] +mla v20.4S, v1.4S, v31.s[0] +sub v1.4s, v4.4s, v19.4s +add v4.4s, v4.4s, v19.4s +mul v23.4S, v23.4S,v27.s[0] +mul v22.4S, v22.4S,v27.s[1] +sub v19.4s, v8.4s, v10.4s +add v8.4s, v8.4s, v10.4s +mla v23.4S, v13.4S, v31.s[0] +mla v22.4S, v15.4S, v31.s[0] +sub v15.4s, v16.4s, v0.4s +add v16.4s, v16.4s, v0.4s +sqrdmulh v0.4S, v3.4S, v9.s[3] 
+mul v3.4S, v3.4S,v28.s[3] +nop +nop +sqrdmulh v13.4S, v24.4S, v9.s[2] +mul v24.4S, v24.4S,v28.s[2] +sub v10.4s, v11.4s, v20.4s +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v1.4S, v9.s[1] +mul v1.4S, v1.4S,v28.s[1] +sub v18.4s, v21.4s, v5.4s +add v21.4s, v21.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v9.s[0] +mul v4.4S, v4.4S,v28.s[0] +sub v12.4s, v26.4s, v22.4s +add v26.4s, v26.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v7.s[3] +mla v3.4S, v0.4S, v31.s[0] +sub v0.4s, v6.4s, v23.4s +add v6.4s, v6.4s, v23.4s +sqrdmulh v23.4S, v8.4S, v7.s[2] +mla v24.4S, v13.4S, v31.s[0] +sub v13.4s, v10.4s, v3.4s +str q13, [x0, #1008] +sqrdmulh v13.4S, v15.4S, v7.s[1] +mla v1.4S, v20.4S, v31.s[0] +add v10.4s, v10.4s, v3.4s +str q10, [x0, #944] +sqrdmulh v10.4S, v16.4S, v7.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v24.4s +str q5, [x0, #880] +mul v8.4S, v8.4S,v17.s[2] +mul v19.4S, v19.4S,v17.s[3] +add v11.4s, v11.4s, v24.4s +sub v24.4s, v18.4s, v1.4s +mla v8.4S, v23.4S, v31.s[0] +mla v19.4S, v22.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +str q11, [x0, #816] +mul v16.4S, v16.4S,v17.s[0] +mul v15.4S, v15.4S,v17.s[1] +sub v11.4s, v21.4s, v4.4s +str q24, [x0, #752] +mla v16.4S, v10.4S, v31.s[0] +mla v15.4S, v13.4S, v31.s[0] +add v21.4s, v21.4s, v4.4s +str q18, [x0, #688] +ldr q18, [x0, #960] +sqrdmulh v4.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +str q11, [x0, #624] +sub v11.4s, v12.4s, v19.4s +ldr q13, [x0, #896] +sqrdmulh v10.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +str q21, [x0, #560] +add v12.4s, v12.4s, v19.4s +ldr q19, [x0, #832] +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +str q11, [x0, #496] +sub v11.4s, v26.4s, v8.4s +ldr q24, [x0, #768] +sqrdmulh v1.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +str q12, [x0, #432] +add v26.4s, v26.4s, v8.4s +ldr q8, [x0, #704] +sqrdmulh v12.4S, v8.4S, v29.s[0] +mla v18.4S, v4.4S, v31.s[0] +str q11, [x0, #368] +sub v11.4s, v0.4s, v15.4s +ldr q4, [x0, #640] +sqrdmulh v22.4S, v4.4S, v29.s[0] +mla v13.4S, 
v10.4S, v31.s[0] +str q26, [x0, #304] +add v0.4s, v0.4s, v15.4s +ldr q15, [x0, #576] +sqrdmulh v26.4S, v15.4S, v29.s[0] +mla v19.4S, v21.4S, v31.s[0] +str q11, [x0, #240] +sub v11.4s, v6.4s, v16.4s +ldr q21, [x0, #512] +sqrdmulh v10.4S, v21.4S, v29.s[0] +mla v24.4S, v1.4S, v31.s[0] +str q0, [x0, #176] +add v6.4s, v6.4s, v16.4s +ldr q16, [x0, #448] +ldr q0, [x0, #384] +mul v4.4S, v4.4S,v30.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v1.4s, v16.4s, v18.4s +add v16.4s, v16.4s, v18.4s +ldr q18, [x0, #320] +ldr q23, [x0, #256] +mla v4.4S, v22.4S, v31.s[0] +mla v8.4S, v12.4S, v31.s[0] +sub v12.4s, v0.4s, v13.4s +add v0.4s, v0.4s, v13.4s +ldr q13, [x0, #192] +ldr q22, [x0, #128] +mul v21.4S, v21.4S,v30.s[0] +mul v15.4S, v15.4S,v30.s[0] +sub v5.4s, v18.4s, v19.4s +add v18.4s, v18.4s, v19.4s +ldr q19, [x0, #64] +ldr q3, [x0, #0] +mla v21.4S, v10.4S, v31.s[0] +mla v15.4S, v26.4S, v31.s[0] +sub v26.4s, v23.4s, v24.4s +add v23.4s, v23.4s, v24.4s +sqrdmulh v24.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +sub v10.4s, v13.4s, v8.4s +nop +sqrdmulh v20.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +add v13.4s, v13.4s, v8.4s +nop +sqrdmulh v8.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v2.4s, v22.4s, v4.4s +add v22.4s, v22.4s, v4.4s +sqrdmulh v4.4S, v0.4S, v29.s[1] +mul v0.4S, v0.4S,v30.s[1] +sub v25.4s, v19.4s, v15.4s +add v19.4s, v19.4s, v15.4s +sqrdmulh v15.4S, v5.4S, v29.s[2] +mla v1.4S, v24.4S, v31.s[0] +sub v24.4s, v3.4s, v21.4s +str q11, [x0, #112] +sqrdmulh v11.4S, v26.4S, v29.s[2] +mla v12.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v21.4s +nop +sqrdmulh v21.4S, v18.4S, v29.s[1] +mla v16.4S, v8.4S, v31.s[0] +str q6, [x0, #48] +nop +sqrdmulh v6.4S, v23.4S, v29.s[1] +mla v0.4S, v4.4S, v31.s[0] +nop +nop +mul v26.4S, v26.4S,v30.s[2] +mul v5.4S, v5.4S,v30.s[2] +sub v4.4s, v10.4s, v1.4s +add v10.4s, v10.4s, v1.4s +mla v26.4S, v11.4S, v31.s[0] +mla v5.4S, v15.4S, v31.s[0] +sub v15.4s, v2.4s, v12.4s +add v2.4s, v2.4s, v12.4s +mul v23.4S, v23.4S,v30.s[1] +mul v18.4S, 
v18.4S,v30.s[1] +sub v12.4s, v13.4s, v16.4s +add v13.4s, v13.4s, v16.4s +mla v23.4S, v6.4S, v31.s[0] +mla v18.4S, v21.4S, v31.s[0] +sub v21.4s, v22.4s, v0.4s +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v4.4S, v14.s[3] +mul v4.4S, v4.4S,v27.s[3] +nop +nop +sqrdmulh v6.4S, v10.4S, v14.s[2] +mul v10.4S, v10.4S,v27.s[2] +sub v16.4s, v25.4s, v5.4s +add v25.4s, v25.4s, v5.4s +sqrdmulh v5.4S, v12.4S, v14.s[1] +mul v12.4S, v12.4S,v27.s[1] +sub v11.4s, v24.4s, v26.4s +add v24.4s, v24.4s, v26.4s +sqrdmulh v26.4S, v13.4S, v14.s[0] +mul v13.4S, v13.4S,v27.s[0] +sub v1.4s, v19.4s, v18.4s +add v19.4s, v19.4s, v18.4s +sqrdmulh v18.4S, v15.4S, v14.s[3] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v3.4s, v23.4s +add v3.4s, v3.4s, v23.4s +sqrdmulh v23.4S, v2.4S, v14.s[2] +mla v10.4S, v6.4S, v31.s[0] +nop +nop +sqrdmulh v6.4S, v21.4S, v14.s[1] +mla v12.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v22.4S, v14.s[0] +mla v13.4S, v26.4S, v31.s[0] +nop +nop +mul v2.4S, v2.4S,v27.s[2] +mul v15.4S, v15.4S,v27.s[3] +sub v26.4s, v16.4s, v4.4s +add v16.4s, v16.4s, v4.4s +mla v2.4S, v23.4S, v31.s[0] +mla v15.4S, v18.4S, v31.s[0] +sub v18.4s, v25.4s, v10.4s +add v25.4s, v25.4s, v10.4s +mul v22.4S, v22.4S,v27.s[0] +mul v21.4S, v21.4S,v27.s[1] +sub v10.4s, v1.4s, v12.4s +add v1.4s, v1.4s, v12.4s +mla v22.4S, v5.4S, v31.s[0] +mla v21.4S, v6.4S, v31.s[0] +sub v6.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +sqrdmulh v13.4S, v26.4S, v9.s[3] +mul v26.4S, v26.4S,v28.s[3] +nop +nop +sqrdmulh v5.4S, v16.4S, v9.s[2] +mul v16.4S, v16.4S,v28.s[2] +sub v12.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v18.4S, v9.s[1] +mul v18.4S, v18.4S,v28.s[1] +sub v23.4s, v24.4s, v2.4s +add v24.4s, v24.4s, v2.4s +sqrdmulh v2.4S, v25.4S, v9.s[0] +mul v25.4S, v25.4S,v28.s[0] +sub v4.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +sqrdmulh v21.4S, v10.4S, v7.s[3] +mla v26.4S, v13.4S, v31.s[0] +sub v13.4s, v3.4s, v22.4s +add v3.4s, v3.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v7.s[2] +mla v16.4S, v5.4S, v31.s[0] +sub 
v5.4s, v12.4s, v26.4s +str q5, [x0, #960] +sqrdmulh v5.4S, v6.4S, v7.s[1] +mla v18.4S, v15.4S, v31.s[0] +add v12.4s, v12.4s, v26.4s +str q12, [x0, #896] +sqrdmulh v12.4S, v19.4S, v7.s[0] +mla v25.4S, v2.4S, v31.s[0] +sub v2.4s, v11.4s, v16.4s +str q2, [x0, #832] +mul v1.4S, v1.4S,v17.s[2] +mul v10.4S, v10.4S,v17.s[3] +add v11.4s, v11.4s, v16.4s +sub v16.4s, v23.4s, v18.4s +mla v1.4S, v22.4S, v31.s[0] +mla v10.4S, v21.4S, v31.s[0] +add v23.4s, v23.4s, v18.4s +str q11, [x0, #768] +mul v19.4S, v19.4S,v17.s[0] +mul v6.4S, v6.4S,v17.s[1] +sub v11.4s, v24.4s, v25.4s +str q16, [x0, #704] +mla v19.4S, v12.4S, v31.s[0] +mla v6.4S, v5.4S, v31.s[0] +add v24.4s, v24.4s, v25.4s +str q23, [x0, #640] +ldr q23, [x0, #976] +sqrdmulh v25.4S, v23.4S, v29.s[0] +mul v23.4S, v23.4S,v30.s[0] +str q11, [x0, #576] +sub v11.4s, v4.4s, v10.4s +ldr q5, [x0, #912] +sqrdmulh v12.4S, v5.4S, v29.s[0] +mul v5.4S, v5.4S,v30.s[0] +str q24, [x0, #512] +add v4.4s, v4.4s, v10.4s +ldr q10, [x0, #848] +sqrdmulh v24.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +str q11, [x0, #448] +sub v11.4s, v0.4s, v1.4s +ldr q16, [x0, #784] +sqrdmulh v18.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +str q4, [x0, #384] +add v0.4s, v0.4s, v1.4s +ldr q1, [x0, #720] +sqrdmulh v4.4S, v1.4S, v29.s[0] +mla v23.4S, v25.4S, v31.s[0] +str q11, [x0, #320] +sub v11.4s, v13.4s, v6.4s +ldr q25, [x0, #656] +sqrdmulh v21.4S, v25.4S, v29.s[0] +mla v5.4S, v12.4S, v31.s[0] +str q0, [x0, #256] +add v13.4s, v13.4s, v6.4s +ldr q6, [x0, #592] +sqrdmulh v0.4S, v6.4S, v29.s[0] +mla v10.4S, v24.4S, v31.s[0] +str q11, [x0, #192] +sub v11.4s, v3.4s, v19.4s +ldr q24, [x0, #528] +sqrdmulh v12.4S, v24.4S, v29.s[0] +mla v16.4S, v18.4S, v31.s[0] +str q13, [x0, #128] +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #464] +ldr q13, [x0, #400] +mul v25.4S, v25.4S,v30.s[0] +mul v1.4S, v1.4S,v30.s[0] +sub v18.4s, v19.4s, v23.4s +add v19.4s, v19.4s, v23.4s +ldr q23, [x0, #336] +ldr q22, [x0, #272] +mla v25.4S, v21.4S, v31.s[0] +mla v1.4S, v4.4S, v31.s[0] 
+sub v4.4s, v13.4s, v5.4s +add v13.4s, v13.4s, v5.4s +ldr q5, [x0, #208] +ldr q21, [x0, #144] +mul v24.4S, v24.4S,v30.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v2.4s, v23.4s, v10.4s +add v23.4s, v23.4s, v10.4s +ldr q10, [x0, #80] +ldr q26, [x0, #16] +mla v24.4S, v12.4S, v31.s[0] +mla v6.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v16.4s +add v22.4s, v22.4s, v16.4s +sqrdmulh v16.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v12.4s, v5.4s, v1.4s +nop +sqrdmulh v15.4S, v4.4S, v29.s[2] +mul v4.4S, v4.4S,v30.s[2] +add v5.4s, v5.4s, v1.4s +nop +sqrdmulh v1.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +sub v8.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v20.4s, v10.4s, v6.4s +add v10.4s, v10.4s, v6.4s +sqrdmulh v6.4S, v2.4S, v29.s[2] +mla v18.4S, v16.4S, v31.s[0] +sub v16.4s, v26.4s, v24.4s +str q11, [x0, #64] +sqrdmulh v11.4S, v0.4S, v29.s[2] +mla v4.4S, v15.4S, v31.s[0] +add v26.4s, v26.4s, v24.4s +nop +sqrdmulh v24.4S, v23.4S, v29.s[1] +mla v19.4S, v1.4S, v31.s[0] +str q3, [x0, #0] +nop +sqrdmulh v3.4S, v22.4S, v29.s[1] +mla v13.4S, v25.4S, v31.s[0] +nop +nop +mul v0.4S, v0.4S,v30.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v25.4s, v12.4s, v18.4s +add v12.4s, v12.4s, v18.4s +mla v0.4S, v11.4S, v31.s[0] +mla v2.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v4.4s +add v8.4s, v8.4s, v4.4s +mul v22.4S, v22.4S,v30.s[1] +mul v23.4S, v23.4S,v30.s[1] +sub v4.4s, v5.4s, v19.4s +add v5.4s, v5.4s, v19.4s +mla v22.4S, v3.4S, v31.s[0] +mla v23.4S, v24.4S, v31.s[0] +sub v24.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v29.4S, v25.4S, v14.s[3] +mul v25.4S, v25.4S,v27.s[3] +nop +nop +sqrdmulh v30.4S, v12.4S, v14.s[2] +mul v12.4S, v12.4S,v27.s[2] +sub v13.4s, v20.4s, v2.4s +add v20.4s, v20.4s, v2.4s +sqrdmulh v2.4S, v4.4S, v14.s[1] +mul v4.4S, v4.4S,v27.s[1] +sub v3.4s, v16.4s, v0.4s +add v16.4s, v16.4s, v0.4s +sqrdmulh v0.4S, v5.4S, v14.s[0] +mul v5.4S, v5.4S,v27.s[0] +sub v19.4s, v10.4s, v23.4s +add v10.4s, 
v10.4s, v23.4s +sqrdmulh v23.4S, v6.4S, v14.s[3] +mla v25.4S, v29.4S, v31.s[0] +sub v29.4s, v26.4s, v22.4s +add v26.4s, v26.4s, v22.4s +sqrdmulh v22.4S, v8.4S, v14.s[2] +mla v12.4S, v30.4S, v31.s[0] +nop +nop +sqrdmulh v30.4S, v24.4S, v14.s[1] +mla v4.4S, v2.4S, v31.s[0] +nop +nop +sqrdmulh v2.4S, v21.4S, v14.s[0] +mla v5.4S, v0.4S, v31.s[0] +nop +nop +mul v8.4S, v8.4S,v27.s[2] +mul v6.4S, v6.4S,v27.s[3] +sub v0.4s, v13.4s, v25.4s +add v13.4s, v13.4s, v25.4s +mla v8.4S, v22.4S, v31.s[0] +mla v6.4S, v23.4S, v31.s[0] +sub v23.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +mul v21.4S, v21.4S,v27.s[0] +mul v24.4S, v24.4S,v27.s[1] +sub v12.4s, v19.4s, v4.4s +add v19.4s, v19.4s, v4.4s +mla v21.4S, v2.4S, v31.s[0] +mla v24.4S, v30.4S, v31.s[0] +sub v30.4s, v10.4s, v5.4s +add v10.4s, v10.4s, v5.4s +sqrdmulh v14.4S, v0.4S, v9.s[3] +mul v0.4S, v0.4S,v28.s[3] +nop +nop +sqrdmulh v27.4S, v13.4S, v9.s[2] +mul v13.4S, v13.4S,v28.s[2] +sub v5.4s, v3.4s, v6.4s +add v3.4s, v3.4s, v6.4s +sqrdmulh v6.4S, v23.4S, v9.s[1] +mul v23.4S, v23.4S,v28.s[1] +sub v2.4s, v16.4s, v8.4s +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v20.4S, v9.s[0] +mul v20.4S, v20.4S,v28.s[0] +sub v4.4s, v29.4s, v24.4s +add v29.4s, v29.4s, v24.4s +sqrdmulh v9.4S, v12.4S, v7.s[3] +mla v0.4S, v14.4S, v31.s[0] +sub v14.4s, v26.4s, v21.4s +add v26.4s, v26.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v7.s[2] +mla v13.4S, v27.4S, v31.s[0] +sub v27.4s, v5.4s, v0.4s +str q27, [x0, #976] +sqrdmulh v27.4S, v30.4S, v7.s[1] +mla v23.4S, v6.4S, v31.s[0] +add v5.4s, v5.4s, v0.4s +str q5, [x0, #912] +sqrdmulh v5.4S, v10.4S, v7.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v3.4s, v13.4s +str q8, [x0, #848] +mul v19.4S, v19.4S,v17.s[2] +mul v12.4S, v12.4S,v17.s[3] +add v3.4s, v3.4s, v13.4s +sub v13.4s, v2.4s, v23.4s +mla v19.4S, v21.4S, v31.s[0] +mla v12.4S, v9.4S, v31.s[0] +add v2.4s, v2.4s, v23.4s +str q3, [x0, #784] +mul v10.4S, v10.4S,v17.s[0] +mul v30.4S, v30.4S,v17.s[1] +sub v3.4s, v16.4s, v20.4s +str q13, [x0, #720] +mla v10.4S, 
v5.4S, v31.s[0] +mla v30.4S, v27.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +str q2, [x0, #656] +str q3, [x0, #592] +sub v3.4s, v4.4s, v12.4s +str q16, [x0, #528] +add v4.4s, v4.4s, v12.4s +str q3, [x0, #464] +sub v3.4s, v29.4s, v19.4s +str q4, [x0, #400] +add v29.4s, v29.4s, v19.4s +str q3, [x0, #336] +sub v3.4s, v14.4s, v30.4s +str q29, [x0, #272] +add v14.4s, v14.4s, v30.4s +str q3, [x0, #208] +sub v3.4s, v26.4s, v10.4s +str q14, [x0, #144] +add v26.4s, v26.4s, v10.4s +str q3, [x0, #80] +str q26, [x0, #16] +ldr q15, [x0, #224] +ldr q1, [x0, #160] +ldr q18, [x0, #32] +ldr q11, [x17, #+128] +ldr q25, [x17, #+144] +sqrdmulh v22.4S, v18.4S, v25.s[0] +mul v18.4S, v18.4S,v11.s[0] +ldr q24, [x0, #48] +sqrdmulh v28.4S, v24.4S, v25.s[0] +mul v24.4S, v24.4S,v11.s[0] +ldr q6, [x17, #+160] +ldr q0, [x17, #+176] +ldr q8, [x0, #96] +sqrdmulh v21.4S, v8.4S, v0.s[0] +mul v8.4S, v8.4S,v6.s[0] +ldr q9, [x0, #112] +sqrdmulh v23.4S, v9.4S, v0.s[0] +mul v9.4S, v9.4S,v6.s[0] +ldr q13, [x17, #+192] +ldr q5, [x17, #+208] +mla v18.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v1.4S, v5.s[0] +ldr q27, [x0, #176] +mla v24.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v27.4S, v5.s[0] +ldr q20, [x17, #+224] +ldr q2, [x17, #+240] +mla v8.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v15.4S, v2.s[0] +ldr q17, [x0, #240] +mla v9.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v17.4S, v2.s[0] +ldr q7, [x0, #0] +ldr q16, [x0, #128] +mul v1.4S, v1.4S,v13.s[0] +sub v12.4s, v7.4s, v18.4s +ldr q4, [x0, #16] +mul v27.4S, v27.4S,v13.s[0] +add v7.4s, v7.4s, v18.4s +ldr q18, [x0, #144] +mla v1.4S, v22.4S, v31.s[0] +sub v22.4s, v4.4s, v24.4s +ldr q19, [x0, #64] +mla v27.4S, v28.4S, v31.s[0] +add v4.4s, v4.4s, v24.4s +ldr q24, [x0, #192] +mul v15.4S, v15.4S,v20.s[0] +sub v28.4s, v19.4s, v8.4s +ldr q29, [x0, #80] +mul v17.4S, v17.4S,v20.s[0] +add v19.4s, v19.4s, v8.4s +ldr q8, [x0, #208] +mla v15.4S, v21.4S, v31.s[0] +mla v17.4S, v23.4S, v31.s[0] +sub v23.4s, v29.4s, v9.4s +sqrdmulh v21.4S, v4.4S, v25.s[1] +add v29.4s, v29.4s, v9.4s +mul 
v4.4S, v4.4S,v11.s[1] +sqrdmulh v9.4S, v22.4S, v25.s[2] +sub v30.4s, v16.4s, v1.4s +mul v22.4S, v22.4S,v11.s[2] +add v16.4s, v16.4s, v1.4s +sqrdmulh v25.4S, v29.4S, v0.s[1] +sub v11.4s, v18.4s, v27.4s +mul v29.4S, v29.4S,v6.s[1] +add v18.4s, v18.4s, v27.4s +sqrdmulh v27.4S, v23.4S, v0.s[2] +sub v1.4s, v24.4s, v15.4s +mul v23.4S, v23.4S,v6.s[2] +add v24.4s, v24.4s, v15.4s +mla v4.4S, v21.4S, v31.s[0] +sub v21.4s, v8.4s, v17.4s +ldr q0, [x0, #480] +sqrdmulh v6.4S, v18.4S, v5.s[1] +add v8.4s, v8.4s, v17.4s +mla v22.4S, v9.4S, v31.s[0] +ldr q9, [x0, #416] +sqrdmulh v17.4S, v11.4S, v5.s[2] +sub v15.4s, v7.4s, v4.4s +mla v29.4S, v25.4S, v31.s[0] +ldr q25, [x0, #288] +sqrdmulh v14.4S, v8.4S, v2.s[1] +add v7.4s, v7.4s, v4.4s +str q15, [x0, #16] +mla v23.4S, v27.4S, v31.s[0] +ldr q27, [x17, #+256] +ldr q15, [x17, #+272] +sqrdmulh v4.4S, v21.4S, v2.s[2] +sub v10.4s, v12.4s, v22.4s +str q7, [x0, #0] +mul v18.4S, v18.4S,v13.s[1] +add v12.4s, v12.4s, v22.4s +mul v11.4S, v11.4S,v13.s[2] +str q10, [x0, #48] +mla v18.4S, v6.4S, v31.s[0] +sub v6.4s, v19.4s, v29.4s +mla v11.4S, v17.4S, v31.s[0] +str q12, [x0, #32] +mul v8.4S, v8.4S,v20.s[1] +str q6, [x0, #80] +mul v21.4S, v21.4S,v20.s[2] +add v19.4s, v19.4s, v29.4s +str q19, [x0, #64] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v28.4s, v23.4s +str q14, [x0, #112] +mla v21.4S, v4.4S, v31.s[0] +add v28.4s, v28.4s, v23.4s +str q28, [x0, #96] +sqrdmulh v2.4S, v25.4S, v15.s[0] +sub v20.4s, v16.4s, v18.4s +mul v25.4S, v25.4S,v27.s[0] +str q20, [x0, #144] +ldr q20, [x0, #304] +sqrdmulh v28.4S, v20.4S, v15.s[0] +add v16.4s, v16.4s, v18.4s +mul v20.4S, v20.4S,v27.s[0] +str q16, [x0, #128] +ldr q16, [x17, #+288] +ldr q18, [x17, #+304] +ldr q23, [x0, #352] +sqrdmulh v4.4S, v23.4S, v18.s[0] +sub v14.4s, v30.4s, v11.4s +mul v23.4S, v23.4S,v16.s[0] +str q14, [x0, #176] +ldr q14, [x0, #368] +sqrdmulh v19.4S, v14.4S, v18.s[0] +add v30.4s, v30.4s, v11.4s +mul v14.4S, v14.4S,v16.s[0] +str q30, [x0, #160] +ldr q30, [x17, #+320] +ldr q11, [x17, #+336] 
+mla v25.4S, v2.4S, v31.s[0] +sub v2.4s, v24.4s, v8.4s +sqrdmulh v29.4S, v9.4S, v11.s[0] +str q2, [x0, #208] +ldr q2, [x0, #432] +mla v20.4S, v28.4S, v31.s[0] +add v24.4s, v24.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v11.s[0] +str q24, [x0, #192] +ldr q24, [x17, #+352] +ldr q28, [x17, #+368] +mla v23.4S, v4.4S, v31.s[0] +sub v4.4s, v1.4s, v21.4s +sqrdmulh v6.4S, v0.4S, v28.s[0] +str q4, [x0, #240] +ldr q4, [x0, #496] +mla v14.4S, v19.4S, v31.s[0] +add v1.4s, v1.4s, v21.4s +sqrdmulh v21.4S, v4.4S, v28.s[0] +str q1, [x0, #224] +ldr q1, [x0, #256] +ldr q19, [x0, #384] +mul v9.4S, v9.4S,v30.s[0] +sub v5.4s, v1.4s, v25.4s +ldr q13, [x0, #272] +mul v2.4S, v2.4S,v30.s[0] +add v1.4s, v1.4s, v25.4s +ldr q25, [x0, #400] +mla v9.4S, v29.4S, v31.s[0] +sub v29.4s, v13.4s, v20.4s +ldr q12, [x0, #320] +mla v2.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v20.4s +ldr q20, [x0, #448] +mul v0.4S, v0.4S,v24.s[0] +sub v8.4s, v12.4s, v23.4s +ldr q17, [x0, #336] +mul v4.4S, v4.4S,v24.s[0] +add v12.4s, v12.4s, v23.4s +ldr q23, [x0, #464] +mla v0.4S, v6.4S, v31.s[0] +mla v4.4S, v21.4S, v31.s[0] +sub v21.4s, v17.4s, v14.4s +sqrdmulh v6.4S, v13.4S, v15.s[1] +add v17.4s, v17.4s, v14.4s +mul v13.4S, v13.4S,v27.s[1] +sqrdmulh v14.4S, v29.4S, v15.s[2] +sub v10.4s, v19.4s, v9.4s +mul v29.4S, v29.4S,v27.s[2] +add v19.4s, v19.4s, v9.4s +sqrdmulh v15.4S, v17.4S, v18.s[1] +sub v27.4s, v25.4s, v2.4s +mul v17.4S, v17.4S,v16.s[1] +add v25.4s, v25.4s, v2.4s +sqrdmulh v2.4S, v21.4S, v18.s[2] +sub v9.4s, v20.4s, v0.4s +mul v21.4S, v21.4S,v16.s[2] +add v20.4s, v20.4s, v0.4s +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v23.4s, v4.4s +ldr q18, [x0, #736] +sqrdmulh v16.4S, v25.4S, v11.s[1] +add v23.4s, v23.4s, v4.4s +mla v29.4S, v14.4S, v31.s[0] +ldr q14, [x0, #672] +sqrdmulh v4.4S, v27.4S, v11.s[2] +sub v0.4s, v1.4s, v13.4s +mla v17.4S, v15.4S, v31.s[0] +ldr q15, [x0, #544] +sqrdmulh v22.4S, v23.4S, v28.s[1] +add v1.4s, v1.4s, v13.4s +str q0, [x0, #272] +mla v21.4S, v2.4S, v31.s[0] +ldr q2, [x17, #+384] +ldr q0, [x17, 
#+400] +sqrdmulh v13.4S, v6.4S, v28.s[2] +sub v7.4s, v5.4s, v29.4s +str q1, [x0, #256] +mul v25.4S, v25.4S,v30.s[1] +add v5.4s, v5.4s, v29.4s +mul v27.4S, v27.4S,v30.s[2] +str q7, [x0, #304] +mla v25.4S, v16.4S, v31.s[0] +sub v16.4s, v12.4s, v17.4s +mla v27.4S, v4.4S, v31.s[0] +str q5, [x0, #288] +mul v23.4S, v23.4S,v24.s[1] +str q16, [x0, #336] +mul v6.4S, v6.4S,v24.s[2] +add v12.4s, v12.4s, v17.4s +str q12, [x0, #320] +mla v23.4S, v22.4S, v31.s[0] +sub v22.4s, v8.4s, v21.4s +str q22, [x0, #368] +mla v6.4S, v13.4S, v31.s[0] +add v8.4s, v8.4s, v21.4s +str q8, [x0, #352] +sqrdmulh v28.4S, v15.4S, v0.s[0] +sub v24.4s, v19.4s, v25.4s +mul v15.4S, v15.4S,v2.s[0] +str q24, [x0, #400] +ldr q24, [x0, #560] +sqrdmulh v8.4S, v24.4S, v0.s[0] +add v19.4s, v19.4s, v25.4s +mul v24.4S, v24.4S,v2.s[0] +str q19, [x0, #384] +ldr q19, [x17, #+416] +ldr q25, [x17, #+432] +ldr q21, [x0, #608] +sqrdmulh v13.4S, v21.4S, v25.s[0] +sub v22.4s, v10.4s, v27.4s +mul v21.4S, v21.4S,v19.s[0] +str q22, [x0, #432] +ldr q22, [x0, #624] +sqrdmulh v12.4S, v22.4S, v25.s[0] +add v10.4s, v10.4s, v27.4s +mul v22.4S, v22.4S,v19.s[0] +str q10, [x0, #416] +ldr q10, [x17, #+448] +ldr q27, [x17, #+464] +mla v15.4S, v28.4S, v31.s[0] +sub v28.4s, v20.4s, v23.4s +sqrdmulh v17.4S, v14.4S, v27.s[0] +str q28, [x0, #464] +ldr q28, [x0, #688] +mla v24.4S, v8.4S, v31.s[0] +add v20.4s, v20.4s, v23.4s +sqrdmulh v23.4S, v28.4S, v27.s[0] +str q20, [x0, #448] +ldr q20, [x17, #+480] +ldr q8, [x17, #+496] +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v9.4s, v6.4s +sqrdmulh v16.4S, v18.4S, v8.s[0] +str q13, [x0, #496] +ldr q13, [x0, #752] +mla v22.4S, v12.4S, v31.s[0] +add v9.4s, v9.4s, v6.4s +sqrdmulh v6.4S, v13.4S, v8.s[0] +str q9, [x0, #480] +ldr q9, [x0, #512] +ldr q12, [x0, #640] +mul v14.4S, v14.4S,v10.s[0] +sub v11.4s, v9.4s, v15.4s +ldr q30, [x0, #528] +mul v28.4S, v28.4S,v10.s[0] +add v9.4s, v9.4s, v15.4s +ldr q15, [x0, #656] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v30.4s, v24.4s +ldr q5, [x0, #576] +mla v28.4S, 
v23.4S, v31.s[0] +add v30.4s, v30.4s, v24.4s +ldr q24, [x0, #704] +mul v18.4S, v18.4S,v20.s[0] +sub v23.4s, v5.4s, v21.4s +ldr q4, [x0, #592] +mul v13.4S, v13.4S,v20.s[0] +add v5.4s, v5.4s, v21.4s +ldr q21, [x0, #720] +mla v18.4S, v16.4S, v31.s[0] +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v4.4s, v22.4s +sqrdmulh v16.4S, v30.4S, v0.s[1] +add v4.4s, v4.4s, v22.4s +mul v30.4S, v30.4S,v2.s[1] +sqrdmulh v22.4S, v17.4S, v0.s[2] +sub v7.4s, v12.4s, v14.4s +mul v17.4S, v17.4S,v2.s[2] +add v12.4s, v12.4s, v14.4s +sqrdmulh v0.4S, v4.4S, v25.s[1] +sub v2.4s, v15.4s, v28.4s +mul v4.4S, v4.4S,v19.s[1] +add v15.4s, v15.4s, v28.4s +sqrdmulh v28.4S, v6.4S, v25.s[2] +sub v14.4s, v24.4s, v18.4s +mul v6.4S, v6.4S,v19.s[2] +add v24.4s, v24.4s, v18.4s +mla v30.4S, v16.4S, v31.s[0] +sub v16.4s, v21.4s, v13.4s +ldr q25, [x0, #992] +sqrdmulh v19.4S, v15.4S, v27.s[1] +add v21.4s, v21.4s, v13.4s +mla v17.4S, v22.4S, v31.s[0] +ldr q22, [x0, #928] +sqrdmulh v13.4S, v2.4S, v27.s[2] +sub v18.4s, v9.4s, v30.4s +mla v4.4S, v0.4S, v31.s[0] +ldr q0, [x0, #800] +sqrdmulh v29.4S, v21.4S, v8.s[1] +add v9.4s, v9.4s, v30.4s +str q18, [x0, #528] +mla v6.4S, v28.4S, v31.s[0] +ldr q28, [x17, #+512] +ldr q18, [x17, #+528] +sqrdmulh v30.4S, v16.4S, v8.s[2] +sub v1.4s, v11.4s, v17.4s +str q9, [x0, #512] +mul v15.4S, v15.4S,v10.s[1] +add v11.4s, v11.4s, v17.4s +mul v2.4S, v2.4S,v10.s[2] +str q1, [x0, #560] +mla v15.4S, v19.4S, v31.s[0] +sub v19.4s, v5.4s, v4.4s +mla v2.4S, v13.4S, v31.s[0] +str q11, [x0, #544] +mul v21.4S, v21.4S,v20.s[1] +str q19, [x0, #592] +mul v16.4S, v16.4S,v20.s[2] +add v5.4s, v5.4s, v4.4s +str q5, [x0, #576] +mla v21.4S, v29.4S, v31.s[0] +sub v29.4s, v23.4s, v6.4s +str q29, [x0, #624] +mla v16.4S, v30.4S, v31.s[0] +add v23.4s, v23.4s, v6.4s +str q23, [x0, #608] +sqrdmulh v8.4S, v0.4S, v18.s[0] +sub v20.4s, v12.4s, v15.4s +mul v0.4S, v0.4S,v28.s[0] +str q20, [x0, #656] +ldr q20, [x0, #816] +sqrdmulh v23.4S, v20.4S, v18.s[0] +add v12.4s, v12.4s, v15.4s +mul v20.4S, v20.4S,v28.s[0] +str 
q12, [x0, #640] +ldr q12, [x17, #+544] +ldr q15, [x17, #+560] +ldr q6, [x0, #864] +sqrdmulh v30.4S, v6.4S, v15.s[0] +sub v29.4s, v7.4s, v2.4s +mul v6.4S, v6.4S,v12.s[0] +str q29, [x0, #688] +ldr q29, [x0, #880] +sqrdmulh v5.4S, v29.4S, v15.s[0] +add v7.4s, v7.4s, v2.4s +mul v29.4S, v29.4S,v12.s[0] +str q7, [x0, #672] +ldr q7, [x17, #+576] +ldr q2, [x17, #+592] +mla v0.4S, v8.4S, v31.s[0] +sub v8.4s, v24.4s, v21.4s +sqrdmulh v4.4S, v22.4S, v2.s[0] +str q8, [x0, #720] +ldr q8, [x0, #944] +mla v20.4S, v23.4S, v31.s[0] +add v24.4s, v24.4s, v21.4s +sqrdmulh v21.4S, v8.4S, v2.s[0] +str q24, [x0, #704] +ldr q24, [x17, #+608] +ldr q23, [x17, #+624] +mla v6.4S, v30.4S, v31.s[0] +sub v30.4s, v14.4s, v16.4s +sqrdmulh v19.4S, v25.4S, v23.s[0] +str q30, [x0, #752] +ldr q30, [x0, #1008] +mla v29.4S, v5.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +sqrdmulh v16.4S, v30.4S, v23.s[0] +str q14, [x0, #736] +ldr q14, [x0, #768] +ldr q5, [x0, #896] +mul v22.4S, v22.4S,v7.s[0] +sub v27.4s, v14.4s, v0.4s +ldr q10, [x0, #784] +mul v8.4S, v8.4S,v7.s[0] +add v14.4s, v14.4s, v0.4s +ldr q0, [x0, #912] +mla v22.4S, v4.4S, v31.s[0] +sub v4.4s, v10.4s, v20.4s +ldr q11, [x0, #832] +mla v8.4S, v21.4S, v31.s[0] +add v10.4s, v10.4s, v20.4s +ldr q20, [x0, #960] +mul v25.4S, v25.4S,v24.s[0] +sub v21.4s, v11.4s, v6.4s +ldr q13, [x0, #848] +mul v30.4S, v30.4S,v24.s[0] +add v11.4s, v11.4s, v6.4s +ldr q6, [x0, #976] +mla v25.4S, v19.4S, v31.s[0] +mla v30.4S, v16.4S, v31.s[0] +sub v16.4s, v13.4s, v29.4s +sqrdmulh v19.4S, v10.4S, v18.s[1] +add v13.4s, v13.4s, v29.4s +mul v10.4S, v10.4S,v28.s[1] +sqrdmulh v29.4S, v4.4S, v18.s[2] +sub v1.4s, v5.4s, v22.4s +mul v4.4S, v4.4S,v28.s[2] +add v5.4s, v5.4s, v22.4s +sqrdmulh v18.4S, v13.4S, v15.s[1] +sub v28.4s, v0.4s, v8.4s +mul v13.4S, v13.4S,v12.s[1] +add v0.4s, v0.4s, v8.4s +sqrdmulh v8.4S, v16.4S, v15.s[2] +sub v22.4s, v20.4s, v25.4s +mul v16.4S, v16.4S,v12.s[2] +add v20.4s, v20.4s, v25.4s +mla v10.4S, v19.4S, v31.s[0] +sub v19.4s, v6.4s, v30.4s +sqrdmulh v15.4S, 
v0.4S, v2.s[1] +add v6.4s, v6.4s, v30.4s +mla v4.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v28.4S, v2.s[2] +sub v30.4s, v14.4s, v10.4s +mla v13.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v6.4S, v23.s[1] +add v14.4s, v14.4s, v10.4s +str q30, [x0, #784] +mla v16.4S, v8.4S, v31.s[0] +sqrdmulh v8.4S, v19.4S, v23.s[2] +sub v30.4s, v27.4s, v4.4s +str q14, [x0, #768] +mul v0.4S, v0.4S,v7.s[1] +add v27.4s, v27.4s, v4.4s +mul v28.4S, v28.4S,v7.s[2] +str q30, [x0, #816] +mla v0.4S, v15.4S, v31.s[0] +sub v15.4s, v11.4s, v13.4s +mla v28.4S, v29.4S, v31.s[0] +str q27, [x0, #800] +mul v6.4S, v6.4S,v24.s[1] +str q15, [x0, #848] +mul v19.4S, v19.4S,v24.s[2] +add v11.4s, v11.4s, v13.4s +str q11, [x0, #832] +mla v6.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v16.4s +str q18, [x0, #880] +mla v19.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v16.4s +str q21, [x0, #864] +sub v23.4s, v5.4s, v0.4s +str q23, [x0, #912] +add v5.4s, v5.4s, v0.4s +str q5, [x0, #896] +sub v5.4s, v1.4s, v28.4s +str q5, [x0, #944] +add v1.4s, v1.4s, v28.4s +str q1, [x0, #928] +sub v1.4s, v20.4s, v6.4s +str q1, [x0, #976] +add v20.4s, v20.4s, v6.4s +str q20, [x0, #960] +sub v20.4s, v22.4s, v19.4s +str q20, [x0, #1008] +add v22.4s, v22.4s, v19.4s +str q22, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1528 +// Instruction count: 1524 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_19_z4_7.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_19_z4_7.s new file mode 100644 index 0000000..db8d7f4 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_19_z4_7.s @@ -0,0 +1,1558 @@ + +/// 
+/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: // 16-byte constant; the routine below loads it whole with `ldr q31, [x17]` and uses lane 0 (v31.s[0]) as the multiplier of every visible `mla` +.word -33556993 // lane 0: the value is stored negated (-33556993) +.word 0 // lane 1: zero fill, not referenced in the visible code +.word 0 // lane 2: zero fill, not referenced in the visible code +.word 0 // lane 3: zero fill, not referenced in the visible code +.align 6 // NOTE(review): presumably a power-of-two request, i.e. 64-byte alignment, as in GNU as for AArch64; confirm for the assembler in use +roots_merged: // Constant table read through x17 in 16-byte rows. In the visible code the rows come in pairs: the first row of a pair (byte offsets 0, 32, 64, ...) supplies the lane operands of `mul`, the second row (offsets 16, 48, 80, ...) supplies the lane operands of the matching `sqrdmulh`. Both rows of a pair carry the same Layer/block tags; words tagged `Layer None` are zero fill that keeps every row four words wide. +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global
ntt_u32_incomplete_neon_asm_var_4_2_19_z4_7 +.global _ntt_u32_incomplete_neon_asm_var_4_2_19_z4_7 +ntt_u32_incomplete_neon_asm_var_4_2_19_z4_7: +_ntt_u32_incomplete_neon_asm_var_4_2_19_z4_7: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x0, #992] +sqrdmulh v27.4S, v28.4S, v29.s[0] +mul v28.4S, v28.4S,v30.s[0] +ldr q26, [x0, #928] +sqrdmulh v25.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +ldr q24, [x0, #864] +sqrdmulh v23.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +ldr q22, [x0, #800] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #736] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mla v28.4S, v27.4S, v31.s[0] +ldr q27, [x0, #672] +sqrdmulh v18.4S, v27.4S, v29.s[0] +mla v26.4S, v25.4S, v31.s[0] +ldr q25, [x0, #608] +sqrdmulh v17.4S, v25.4S, v29.s[0] +mla v24.4S, v23.4S, v31.s[0] +ldr q23, [x0, #544] +sqrdmulh v16.4S, v23.4S, v29.s[0] +mla v22.4S, v21.4S, v31.s[0] +ldr q21, [x0, #480] +ldr q3, [x0, #416] +mul v27.4S, v27.4S,v30.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v2.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +ldr q28, [x0, #352] +ldr q1, [x0, #288] +mla v27.4S, v18.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +sub v19.4s, v3.4s, v26.4s +add v3.4s, v3.4s, v26.4s +ldr q26, [x0, #224] +ldr q18, [x0, #160] +mul v23.4S, v23.4S,v30.s[0] +mul v25.4S, v25.4S,v30.s[0] +sub v0.4s, v28.4s, v24.4s +add v28.4s, v28.4s, v24.4s +ldr q24, [x0, #96] +ldr q15, [x0, #32] +mla v23.4S, v16.4S, v31.s[0] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +sqrdmulh 
v22.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v16.4s, v26.4s, v20.4s +nop +sqrdmulh v14.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +add v26.4s, v26.4s, v20.4s +nop +sqrdmulh v20.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v13.4s, v18.4s, v27.4s +add v18.4s, v18.4s, v27.4s +sqrdmulh v27.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v12.4s, v24.4s, v25.4s +add v24.4s, v24.4s, v25.4s +sqrdmulh v25.4S, v0.4S, v29.s[2] +mla v2.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v23.4s +sqrdmulh v11.4S, v17.4S, v29.s[2] +mla v19.4S, v14.4S, v31.s[0] +add v15.4s, v15.4s, v23.4s +nop +sqrdmulh v23.4S, v28.4S, v29.s[1] +mla v21.4S, v20.4S, v31.s[0] +nop +sqrdmulh v20.4S, v1.4S, v29.s[1] +mla v3.4S, v27.4S, v31.s[0] +nop +nop +ldr q27, [x17, #+32] +ldr q14, [x17, #+48] +mul v17.4S, v17.4S,v30.s[2] +mul v0.4S, v0.4S,v30.s[2] +sub v10.4s, v16.4s, v2.4s +add v16.4s, v16.4s, v2.4s +mla v17.4S, v11.4S, v31.s[0] +mla v0.4S, v25.4S, v31.s[0] +sub v25.4s, v13.4s, v19.4s +add v13.4s, v13.4s, v19.4s +mul v1.4S, v1.4S,v30.s[1] +mul v28.4S, v28.4S,v30.s[1] +sub v19.4s, v26.4s, v21.4s +add v26.4s, v26.4s, v21.4s +mla v1.4S, v20.4S, v31.s[0] +mla v28.4S, v23.4S, v31.s[0] +sub v23.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v14.s[3] +mul v10.4S, v10.4S,v27.s[3] +sub v20.4s, v12.4s, v0.4s +add v12.4s, v12.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v14.s[2] +mul v16.4S, v16.4S,v27.s[2] +sub v21.4s, v22.4s, v17.4s +add v22.4s, v22.4s, v17.4s +sqrdmulh v17.4S, v19.4S, v14.s[1] +mul v19.4S, v19.4S,v27.s[1] +sub v11.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +sqrdmulh v28.4S, v26.4S, v14.s[0] +mul v26.4S, v26.4S,v27.s[0] +sub v2.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s +ldr q1, [x17, #+96] +ldr q9, [x17, #+112] +sqrdmulh v8.4S, v25.4S, v14.s[3] +mla v10.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v13.4S, v14.s[2] +mla v16.4S, v0.4S, v31.s[0] +nop +nop +sqrdmulh v0.4S, v23.4S, v14.s[1] +mla v19.4S, v17.4S, v31.s[0] +nop +nop +sqrdmulh v17.4S, 
v18.4S, v14.s[0] +mla v26.4S, v28.4S, v31.s[0] +nop +nop +ldr q28, [x17, #+64] +ldr q7, [x17, #+80] +mul v13.4S, v13.4S,v27.s[2] +mul v25.4S, v25.4S,v27.s[3] +sub v6.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +mla v13.4S, v3.4S, v31.s[0] +mla v25.4S, v8.4S, v31.s[0] +sub v8.4s, v12.4s, v16.4s +add v12.4s, v12.4s, v16.4s +mul v18.4S, v18.4S,v27.s[0] +mul v23.4S, v23.4S,v27.s[1] +sub v16.4s, v11.4s, v19.4s +add v11.4s, v11.4s, v19.4s +mla v18.4S, v17.4S, v31.s[0] +mla v23.4S, v0.4S, v31.s[0] +sub v0.4s, v24.4s, v26.4s +add v24.4s, v24.4s, v26.4s +sqrdmulh v26.4S, v6.4S, v9.s[3] +mul v6.4S, v6.4S,v1.s[3] +sub v17.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v20.4S, v9.s[2] +mul v20.4S, v20.4S,v1.s[2] +sub v19.4s, v22.4s, v13.4s +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v9.s[1] +mul v8.4S, v8.4S,v1.s[1] +sub v3.4s, v2.4s, v23.4s +add v2.4s, v2.4s, v23.4s +sqrdmulh v23.4S, v12.4S, v9.s[0] +mul v12.4S, v12.4S,v1.s[0] +sub v10.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v7.s[3] +mla v6.4S, v26.4S, v31.s[0] +nop +nop +sqrdmulh v26.4S, v11.4S, v7.s[2] +mla v20.4S, v25.4S, v31.s[0] +sub v25.4s, v17.4s, v6.4s +str q25, [x0, #992] +sqrdmulh v25.4S, v0.4S, v7.s[1] +mla v8.4S, v13.4S, v31.s[0] +add v17.4s, v17.4s, v6.4s +str q17, [x0, #928] +sqrdmulh v17.4S, v24.4S, v7.s[0] +mla v12.4S, v23.4S, v31.s[0] +sub v23.4s, v21.4s, v20.4s +str q23, [x0, #864] +mul v11.4S, v11.4S,v28.s[2] +mul v16.4S, v16.4S,v28.s[3] +add v21.4s, v21.4s, v20.4s +sub v20.4s, v19.4s, v8.4s +mla v11.4S, v26.4S, v31.s[0] +mla v16.4S, v18.4S, v31.s[0] +add v19.4s, v19.4s, v8.4s +str q21, [x0, #800] +mul v24.4S, v24.4S,v28.s[0] +mul v0.4S, v0.4S,v28.s[1] +sub v21.4s, v22.4s, v12.4s +str q20, [x0, #736] +mla v24.4S, v17.4S, v31.s[0] +mla v0.4S, v25.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +str q19, [x0, #672] +ldr q19, [x0, #1008] +sqrdmulh v12.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +str q21, [x0, #608] +sub v21.4s, v3.4s, v16.4s +ldr 
q25, [x0, #944] +sqrdmulh v17.4S, v25.4S, v29.s[0] +mul v25.4S, v25.4S,v30.s[0] +str q22, [x0, #544] +add v3.4s, v3.4s, v16.4s +ldr q16, [x0, #880] +sqrdmulh v22.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +str q21, [x0, #480] +sub v21.4s, v2.4s, v11.4s +ldr q20, [x0, #816] +sqrdmulh v8.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +str q3, [x0, #416] +add v2.4s, v2.4s, v11.4s +ldr q11, [x0, #752] +sqrdmulh v3.4S, v11.4S, v29.s[0] +mla v19.4S, v12.4S, v31.s[0] +str q21, [x0, #352] +sub v21.4s, v10.4s, v0.4s +ldr q12, [x0, #688] +sqrdmulh v18.4S, v12.4S, v29.s[0] +mla v25.4S, v17.4S, v31.s[0] +str q2, [x0, #288] +add v10.4s, v10.4s, v0.4s +ldr q0, [x0, #624] +sqrdmulh v2.4S, v0.4S, v29.s[0] +mla v16.4S, v22.4S, v31.s[0] +str q21, [x0, #224] +sub v21.4s, v15.4s, v24.4s +ldr q22, [x0, #560] +sqrdmulh v17.4S, v22.4S, v29.s[0] +mla v20.4S, v8.4S, v31.s[0] +str q10, [x0, #160] +add v15.4s, v15.4s, v24.4s +ldr q24, [x0, #496] +ldr q10, [x0, #432] +mul v12.4S, v12.4S,v30.s[0] +mul v11.4S, v11.4S,v30.s[0] +sub v8.4s, v24.4s, v19.4s +add v24.4s, v24.4s, v19.4s +ldr q19, [x0, #368] +ldr q26, [x0, #304] +mla v12.4S, v18.4S, v31.s[0] +mla v11.4S, v3.4S, v31.s[0] +sub v3.4s, v10.4s, v25.4s +add v10.4s, v10.4s, v25.4s +ldr q25, [x0, #240] +ldr q18, [x0, #176] +mul v22.4S, v22.4S,v30.s[0] +mul v0.4S, v0.4S,v30.s[0] +sub v23.4s, v19.4s, v16.4s +add v19.4s, v19.4s, v16.4s +ldr q16, [x0, #112] +ldr q6, [x0, #48] +mla v22.4S, v17.4S, v31.s[0] +mla v0.4S, v2.4S, v31.s[0] +sub v2.4s, v26.4s, v20.4s +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v17.4s, v25.4s, v11.4s +nop +sqrdmulh v13.4S, v3.4S, v29.s[2] +mul v3.4S, v3.4S,v30.s[2] +add v25.4s, v25.4s, v11.4s +nop +sqrdmulh v11.4S, v24.4S, v29.s[1] +mul v24.4S, v24.4S,v30.s[1] +sub v5.4s, v18.4s, v12.4s +add v18.4s, v18.4s, v12.4s +sqrdmulh v12.4S, v10.4S, v29.s[1] +mul v10.4S, v10.4S,v30.s[1] +sub v4.4s, v16.4s, v0.4s +add v16.4s, v16.4s, v0.4s +sqrdmulh v0.4S, v23.4S, v29.s[2] 
+mla v8.4S, v20.4S, v31.s[0] +sub v20.4s, v6.4s, v22.4s +str q21, [x0, #96] +sqrdmulh v21.4S, v2.4S, v29.s[2] +mla v3.4S, v13.4S, v31.s[0] +add v6.4s, v6.4s, v22.4s +nop +sqrdmulh v22.4S, v19.4S, v29.s[1] +mla v24.4S, v11.4S, v31.s[0] +str q15, [x0, #32] +nop +sqrdmulh v15.4S, v26.4S, v29.s[1] +mla v10.4S, v12.4S, v31.s[0] +nop +nop +mul v2.4S, v2.4S,v30.s[2] +mul v23.4S, v23.4S,v30.s[2] +sub v12.4s, v17.4s, v8.4s +add v17.4s, v17.4s, v8.4s +mla v2.4S, v21.4S, v31.s[0] +mla v23.4S, v0.4S, v31.s[0] +sub v0.4s, v5.4s, v3.4s +add v5.4s, v5.4s, v3.4s +mul v26.4S, v26.4S,v30.s[1] +mul v19.4S, v19.4S,v30.s[1] +sub v3.4s, v25.4s, v24.4s +add v25.4s, v25.4s, v24.4s +mla v26.4S, v15.4S, v31.s[0] +mla v19.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v10.4s +add v18.4s, v18.4s, v10.4s +sqrdmulh v10.4S, v12.4S, v14.s[3] +mul v12.4S, v12.4S,v27.s[3] +sub v15.4s, v4.4s, v23.4s +add v4.4s, v4.4s, v23.4s +sqrdmulh v23.4S, v17.4S, v14.s[2] +mul v17.4S, v17.4S,v27.s[2] +sub v24.4s, v20.4s, v2.4s +add v20.4s, v20.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v14.s[1] +mul v3.4S, v3.4S,v27.s[1] +sub v21.4s, v16.4s, v19.4s +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v25.4S, v14.s[0] +mul v25.4S, v25.4S,v27.s[0] +sub v8.4s, v6.4s, v26.4s +add v6.4s, v6.4s, v26.4s +sqrdmulh v26.4S, v0.4S, v14.s[3] +mla v12.4S, v10.4S, v31.s[0] +nop +nop +sqrdmulh v10.4S, v5.4S, v14.s[2] +mla v17.4S, v23.4S, v31.s[0] +nop +nop +sqrdmulh v23.4S, v22.4S, v14.s[1] +mla v3.4S, v2.4S, v31.s[0] +nop +nop +sqrdmulh v2.4S, v18.4S, v14.s[0] +mla v25.4S, v19.4S, v31.s[0] +nop +nop +mul v5.4S, v5.4S,v27.s[2] +mul v0.4S, v0.4S,v27.s[3] +sub v19.4s, v15.4s, v12.4s +add v15.4s, v15.4s, v12.4s +mla v5.4S, v10.4S, v31.s[0] +mla v0.4S, v26.4S, v31.s[0] +sub v26.4s, v4.4s, v17.4s +add v4.4s, v4.4s, v17.4s +mul v18.4S, v18.4S,v27.s[0] +mul v22.4S, v22.4S,v27.s[1] +sub v17.4s, v21.4s, v3.4s +add v21.4s, v21.4s, v3.4s +mla v18.4S, v2.4S, v31.s[0] +mla v22.4S, v23.4S, v31.s[0] +sub v23.4s, v16.4s, v25.4s +add v16.4s, v16.4s, v25.4s 
+sqrdmulh v25.4S, v19.4S, v9.s[3] +mul v19.4S, v19.4S,v1.s[3] +sub v2.4s, v24.4s, v0.4s +add v24.4s, v24.4s, v0.4s +sqrdmulh v0.4S, v15.4S, v9.s[2] +mul v15.4S, v15.4S,v1.s[2] +sub v3.4s, v20.4s, v5.4s +add v20.4s, v20.4s, v5.4s +sqrdmulh v5.4S, v26.4S, v9.s[1] +mul v26.4S, v26.4S,v1.s[1] +sub v10.4s, v8.4s, v22.4s +add v8.4s, v8.4s, v22.4s +sqrdmulh v22.4S, v4.4S, v9.s[0] +mul v4.4S, v4.4S,v1.s[0] +sub v12.4s, v6.4s, v18.4s +add v6.4s, v6.4s, v18.4s +sqrdmulh v18.4S, v17.4S, v7.s[3] +mla v19.4S, v25.4S, v31.s[0] +nop +nop +sqrdmulh v25.4S, v21.4S, v7.s[2] +mla v15.4S, v0.4S, v31.s[0] +sub v0.4s, v2.4s, v19.4s +str q0, [x0, #1008] +sqrdmulh v0.4S, v23.4S, v7.s[1] +mla v26.4S, v5.4S, v31.s[0] +add v2.4s, v2.4s, v19.4s +str q2, [x0, #944] +sqrdmulh v2.4S, v16.4S, v7.s[0] +mla v4.4S, v22.4S, v31.s[0] +sub v22.4s, v24.4s, v15.4s +str q22, [x0, #880] +mul v21.4S, v21.4S,v28.s[2] +mul v17.4S, v17.4S,v28.s[3] +add v24.4s, v24.4s, v15.4s +sub v15.4s, v3.4s, v26.4s +mla v21.4S, v25.4S, v31.s[0] +mla v17.4S, v18.4S, v31.s[0] +add v3.4s, v3.4s, v26.4s +str q24, [x0, #816] +mul v16.4S, v16.4S,v28.s[0] +mul v23.4S, v23.4S,v28.s[1] +sub v24.4s, v20.4s, v4.4s +str q15, [x0, #752] +mla v16.4S, v2.4S, v31.s[0] +mla v23.4S, v0.4S, v31.s[0] +add v20.4s, v20.4s, v4.4s +str q3, [x0, #688] +ldr q3, [x0, #960] +sqrdmulh v4.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +str q24, [x0, #624] +sub v24.4s, v10.4s, v17.4s +ldr q0, [x0, #896] +sqrdmulh v2.4S, v0.4S, v29.s[0] +mul v0.4S, v0.4S,v30.s[0] +str q20, [x0, #560] +add v10.4s, v10.4s, v17.4s +ldr q17, [x0, #832] +sqrdmulh v20.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] +str q24, [x0, #496] +sub v24.4s, v8.4s, v21.4s +ldr q15, [x0, #768] +sqrdmulh v26.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +str q10, [x0, #432] +add v8.4s, v8.4s, v21.4s +ldr q21, [x0, #704] +sqrdmulh v10.4S, v21.4S, v29.s[0] +mla v3.4S, v4.4S, v31.s[0] +str q24, [x0, #368] +sub v24.4s, v12.4s, v23.4s +ldr q4, [x0, #640] +sqrdmulh v18.4S, v4.4S, v29.s[0] 
+mla v0.4S, v2.4S, v31.s[0] +str q8, [x0, #304] +add v12.4s, v12.4s, v23.4s +ldr q23, [x0, #576] +sqrdmulh v8.4S, v23.4S, v29.s[0] +mla v17.4S, v20.4S, v31.s[0] +str q24, [x0, #240] +sub v24.4s, v6.4s, v16.4s +ldr q20, [x0, #512] +sqrdmulh v2.4S, v20.4S, v29.s[0] +mla v15.4S, v26.4S, v31.s[0] +str q12, [x0, #176] +add v6.4s, v6.4s, v16.4s +ldr q16, [x0, #448] +ldr q12, [x0, #384] +mul v4.4S, v4.4S,v30.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v26.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #320] +ldr q25, [x0, #256] +mla v4.4S, v18.4S, v31.s[0] +mla v21.4S, v10.4S, v31.s[0] +sub v10.4s, v12.4s, v0.4s +add v12.4s, v12.4s, v0.4s +ldr q0, [x0, #192] +ldr q18, [x0, #128] +mul v20.4S, v20.4S,v30.s[0] +mul v23.4S, v23.4S,v30.s[0] +sub v22.4s, v3.4s, v17.4s +add v3.4s, v3.4s, v17.4s +ldr q17, [x0, #64] +ldr q19, [x0, #0] +mla v20.4S, v2.4S, v31.s[0] +mla v23.4S, v8.4S, v31.s[0] +sub v8.4s, v25.4s, v15.4s +add v25.4s, v25.4s, v15.4s +sqrdmulh v15.4S, v26.4S, v29.s[2] +mul v26.4S, v26.4S,v30.s[2] +sub v2.4s, v0.4s, v21.4s +nop +sqrdmulh v5.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +add v0.4s, v0.4s, v21.4s +nop +sqrdmulh v21.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v11.4s, v18.4s, v4.4s +add v18.4s, v18.4s, v4.4s +sqrdmulh v4.4S, v12.4S, v29.s[1] +mul v12.4S, v12.4S,v30.s[1] +sub v13.4s, v17.4s, v23.4s +add v17.4s, v17.4s, v23.4s +sqrdmulh v23.4S, v22.4S, v29.s[2] +mla v26.4S, v15.4S, v31.s[0] +sub v15.4s, v19.4s, v20.4s +str q24, [x0, #112] +sqrdmulh v24.4S, v8.4S, v29.s[2] +mla v10.4S, v5.4S, v31.s[0] +add v19.4s, v19.4s, v20.4s +nop +sqrdmulh v20.4S, v3.4S, v29.s[1] +mla v16.4S, v21.4S, v31.s[0] +str q6, [x0, #48] +nop +sqrdmulh v6.4S, v25.4S, v29.s[1] +mla v12.4S, v4.4S, v31.s[0] +nop +nop +mul v8.4S, v8.4S,v30.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v4.4s, v2.4s, v26.4s +add v2.4s, v2.4s, v26.4s +mla v8.4S, v24.4S, v31.s[0] +mla v22.4S, v23.4S, v31.s[0] +sub v23.4s, v11.4s, v10.4s +add v11.4s, v11.4s, v10.4s +mul v25.4S, v25.4S,v30.s[1] 
+mul v3.4S, v3.4S,v30.s[1] +sub v10.4s, v0.4s, v16.4s +add v0.4s, v0.4s, v16.4s +mla v25.4S, v6.4S, v31.s[0] +mla v3.4S, v20.4S, v31.s[0] +sub v20.4s, v18.4s, v12.4s +add v18.4s, v18.4s, v12.4s +sqrdmulh v12.4S, v4.4S, v14.s[3] +mul v4.4S, v4.4S,v27.s[3] +sub v6.4s, v13.4s, v22.4s +add v13.4s, v13.4s, v22.4s +sqrdmulh v22.4S, v2.4S, v14.s[2] +mul v2.4S, v2.4S,v27.s[2] +sub v16.4s, v15.4s, v8.4s +add v15.4s, v15.4s, v8.4s +sqrdmulh v8.4S, v10.4S, v14.s[1] +mul v10.4S, v10.4S,v27.s[1] +sub v24.4s, v17.4s, v3.4s +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v14.s[0] +mul v0.4S, v0.4S,v27.s[0] +sub v26.4s, v19.4s, v25.4s +add v19.4s, v19.4s, v25.4s +sqrdmulh v25.4S, v23.4S, v14.s[3] +mla v4.4S, v12.4S, v31.s[0] +nop +nop +sqrdmulh v12.4S, v11.4S, v14.s[2] +mla v2.4S, v22.4S, v31.s[0] +nop +nop +sqrdmulh v22.4S, v20.4S, v14.s[1] +mla v10.4S, v8.4S, v31.s[0] +nop +nop +sqrdmulh v8.4S, v18.4S, v14.s[0] +mla v0.4S, v3.4S, v31.s[0] +nop +nop +mul v11.4S, v11.4S,v27.s[2] +mul v23.4S, v23.4S,v27.s[3] +sub v3.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +mla v11.4S, v12.4S, v31.s[0] +mla v23.4S, v25.4S, v31.s[0] +sub v25.4s, v13.4s, v2.4s +add v13.4s, v13.4s, v2.4s +mul v18.4S, v18.4S,v27.s[0] +mul v20.4S, v20.4S,v27.s[1] +sub v2.4s, v24.4s, v10.4s +add v24.4s, v24.4s, v10.4s +mla v18.4S, v8.4S, v31.s[0] +mla v20.4S, v22.4S, v31.s[0] +sub v22.4s, v17.4s, v0.4s +add v17.4s, v17.4s, v0.4s +sqrdmulh v0.4S, v3.4S, v9.s[3] +mul v3.4S, v3.4S,v1.s[3] +sub v8.4s, v16.4s, v23.4s +add v16.4s, v16.4s, v23.4s +sqrdmulh v23.4S, v6.4S, v9.s[2] +mul v6.4S, v6.4S,v1.s[2] +sub v10.4s, v15.4s, v11.4s +add v15.4s, v15.4s, v11.4s +sqrdmulh v11.4S, v25.4S, v9.s[1] +mul v25.4S, v25.4S,v1.s[1] +sub v12.4s, v26.4s, v20.4s +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v13.4S, v9.s[0] +mul v13.4S, v13.4S,v1.s[0] +sub v4.4s, v19.4s, v18.4s +add v19.4s, v19.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v7.s[3] +mla v3.4S, v0.4S, v31.s[0] +nop +nop +sqrdmulh v0.4S, v24.4S, v7.s[2] +mla v6.4S, v23.4S, v31.s[0] 
+sub v23.4s, v8.4s, v3.4s +str q23, [x0, #960] +sqrdmulh v23.4S, v22.4S, v7.s[1] +mla v25.4S, v11.4S, v31.s[0] +add v8.4s, v8.4s, v3.4s +str q8, [x0, #896] +sqrdmulh v8.4S, v17.4S, v7.s[0] +mla v13.4S, v20.4S, v31.s[0] +sub v20.4s, v16.4s, v6.4s +str q20, [x0, #832] +mul v24.4S, v24.4S,v28.s[2] +mul v2.4S, v2.4S,v28.s[3] +add v16.4s, v16.4s, v6.4s +sub v6.4s, v10.4s, v25.4s +mla v24.4S, v0.4S, v31.s[0] +mla v2.4S, v18.4S, v31.s[0] +add v10.4s, v10.4s, v25.4s +str q16, [x0, #768] +mul v17.4S, v17.4S,v28.s[0] +mul v22.4S, v22.4S,v28.s[1] +sub v16.4s, v15.4s, v13.4s +str q6, [x0, #704] +mla v17.4S, v8.4S, v31.s[0] +mla v22.4S, v23.4S, v31.s[0] +add v15.4s, v15.4s, v13.4s +str q10, [x0, #640] +ldr q10, [x0, #976] +sqrdmulh v13.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +str q16, [x0, #576] +sub v16.4s, v12.4s, v2.4s +ldr q23, [x0, #912] +sqrdmulh v8.4S, v23.4S, v29.s[0] +mul v23.4S, v23.4S,v30.s[0] +str q15, [x0, #512] +add v12.4s, v12.4s, v2.4s +ldr q2, [x0, #848] +sqrdmulh v15.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +str q16, [x0, #448] +sub v16.4s, v26.4s, v24.4s +ldr q6, [x0, #784] +sqrdmulh v25.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +str q12, [x0, #384] +add v26.4s, v26.4s, v24.4s +ldr q24, [x0, #720] +sqrdmulh v12.4S, v24.4S, v29.s[0] +mla v10.4S, v13.4S, v31.s[0] +str q16, [x0, #320] +sub v16.4s, v4.4s, v22.4s +ldr q13, [x0, #656] +sqrdmulh v18.4S, v13.4S, v29.s[0] +mla v23.4S, v8.4S, v31.s[0] +str q26, [x0, #256] +add v4.4s, v4.4s, v22.4s +ldr q22, [x0, #592] +sqrdmulh v26.4S, v22.4S, v29.s[0] +mla v2.4S, v15.4S, v31.s[0] +str q16, [x0, #192] +sub v16.4s, v19.4s, v17.4s +ldr q15, [x0, #528] +sqrdmulh v8.4S, v15.4S, v29.s[0] +mla v6.4S, v25.4S, v31.s[0] +str q4, [x0, #128] +add v19.4s, v19.4s, v17.4s +ldr q17, [x0, #464] +ldr q4, [x0, #400] +mul v13.4S, v13.4S,v30.s[0] +mul v24.4S, v24.4S,v30.s[0] +sub v25.4s, v17.4s, v10.4s +add v17.4s, v17.4s, v10.4s +ldr q10, [x0, #336] +ldr q0, [x0, #272] +mla v13.4S, v18.4S, v31.s[0] +mla v24.4S, 
v12.4S, v31.s[0] +sub v12.4s, v4.4s, v23.4s +add v4.4s, v4.4s, v23.4s +ldr q23, [x0, #208] +ldr q18, [x0, #144] +mul v15.4S, v15.4S,v30.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v20.4s, v10.4s, v2.4s +add v10.4s, v10.4s, v2.4s +ldr q2, [x0, #80] +ldr q3, [x0, #16] +mla v15.4S, v8.4S, v31.s[0] +mla v22.4S, v26.4S, v31.s[0] +sub v26.4s, v0.4s, v6.4s +add v0.4s, v0.4s, v6.4s +sqrdmulh v6.4S, v25.4S, v29.s[2] +mul v25.4S, v25.4S,v30.s[2] +sub v8.4s, v23.4s, v24.4s +nop +sqrdmulh v11.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +add v23.4s, v23.4s, v24.4s +nop +sqrdmulh v24.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v21.4s, v18.4s, v13.4s +add v18.4s, v18.4s, v13.4s +sqrdmulh v13.4S, v4.4S, v29.s[1] +mul v4.4S, v4.4S,v30.s[1] +sub v5.4s, v2.4s, v22.4s +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v20.4S, v29.s[2] +mla v25.4S, v6.4S, v31.s[0] +sub v6.4s, v3.4s, v15.4s +str q16, [x0, #64] +sqrdmulh v16.4S, v26.4S, v29.s[2] +mla v12.4S, v11.4S, v31.s[0] +add v3.4s, v3.4s, v15.4s +nop +sqrdmulh v15.4S, v10.4S, v29.s[1] +mla v17.4S, v24.4S, v31.s[0] +str q19, [x0, #0] +nop +sqrdmulh v19.4S, v0.4S, v29.s[1] +mla v4.4S, v13.4S, v31.s[0] +nop +nop +mul v26.4S, v26.4S,v30.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v13.4s, v8.4s, v25.4s +add v8.4s, v8.4s, v25.4s +mla v26.4S, v16.4S, v31.s[0] +mla v20.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +mul v0.4S, v0.4S,v30.s[1] +mul v10.4S, v10.4S,v30.s[1] +sub v12.4s, v23.4s, v17.4s +add v23.4s, v23.4s, v17.4s +mla v0.4S, v19.4S, v31.s[0] +mla v10.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v4.4s +add v18.4s, v18.4s, v4.4s +sqrdmulh v29.4S, v13.4S, v14.s[3] +mul v13.4S, v13.4S,v27.s[3] +sub v30.4s, v5.4s, v20.4s +add v5.4s, v5.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v14.s[2] +mul v8.4S, v8.4S,v27.s[2] +sub v4.4s, v6.4s, v26.4s +add v6.4s, v6.4s, v26.4s +sqrdmulh v26.4S, v12.4S, v14.s[1] +mul v12.4S, v12.4S,v27.s[1] +sub v19.4s, v2.4s, v10.4s +add v2.4s, v2.4s, v10.4s +sqrdmulh v10.4S, v23.4S, v14.s[0] 
+mul v23.4S, v23.4S,v27.s[0] +sub v17.4s, v3.4s, v0.4s +add v3.4s, v3.4s, v0.4s +sqrdmulh v0.4S, v22.4S, v14.s[3] +mla v13.4S, v29.4S, v31.s[0] +nop +nop +sqrdmulh v29.4S, v21.4S, v14.s[2] +mla v8.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v15.4S, v14.s[1] +mla v12.4S, v26.4S, v31.s[0] +nop +nop +sqrdmulh v26.4S, v18.4S, v14.s[0] +mla v23.4S, v10.4S, v31.s[0] +nop +nop +mul v21.4S, v21.4S,v27.s[2] +mul v22.4S, v22.4S,v27.s[3] +sub v10.4s, v30.4s, v13.4s +add v30.4s, v30.4s, v13.4s +mla v21.4S, v29.4S, v31.s[0] +mla v22.4S, v0.4S, v31.s[0] +sub v0.4s, v5.4s, v8.4s +add v5.4s, v5.4s, v8.4s +mul v18.4S, v18.4S,v27.s[0] +mul v15.4S, v15.4S,v27.s[1] +sub v8.4s, v19.4s, v12.4s +add v19.4s, v19.4s, v12.4s +mla v18.4S, v26.4S, v31.s[0] +mla v15.4S, v20.4S, v31.s[0] +sub v20.4s, v2.4s, v23.4s +add v2.4s, v2.4s, v23.4s +sqrdmulh v14.4S, v10.4S, v9.s[3] +mul v10.4S, v10.4S,v1.s[3] +sub v27.4s, v4.4s, v22.4s +add v4.4s, v4.4s, v22.4s +sqrdmulh v22.4S, v30.4S, v9.s[2] +mul v30.4S, v30.4S,v1.s[2] +sub v23.4s, v6.4s, v21.4s +add v6.4s, v6.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v9.s[1] +mul v0.4S, v0.4S,v1.s[1] +sub v26.4s, v17.4s, v15.4s +add v17.4s, v17.4s, v15.4s +sqrdmulh v15.4S, v5.4S, v9.s[0] +mul v5.4S, v5.4S,v1.s[0] +sub v12.4s, v3.4s, v18.4s +add v3.4s, v3.4s, v18.4s +sqrdmulh v9.4S, v8.4S, v7.s[3] +mla v10.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v19.4S, v7.s[2] +mla v30.4S, v22.4S, v31.s[0] +sub v22.4s, v27.4s, v10.4s +str q22, [x0, #976] +sqrdmulh v22.4S, v20.4S, v7.s[1] +mla v0.4S, v21.4S, v31.s[0] +add v27.4s, v27.4s, v10.4s +str q27, [x0, #912] +sqrdmulh v27.4S, v2.4S, v7.s[0] +mla v5.4S, v15.4S, v31.s[0] +sub v15.4s, v4.4s, v30.4s +str q15, [x0, #848] +mul v19.4S, v19.4S,v28.s[2] +mul v8.4S, v8.4S,v28.s[3] +add v4.4s, v4.4s, v30.4s +sub v30.4s, v23.4s, v0.4s +mla v19.4S, v14.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +add v23.4s, v23.4s, v0.4s +str q4, [x0, #784] +mul v2.4S, v2.4S,v28.s[0] +mul v20.4S, v20.4S,v28.s[1] +sub v4.4s, v6.4s, v5.4s +str q30, 
[x0, #720] +mla v2.4S, v27.4S, v31.s[0] +mla v20.4S, v22.4S, v31.s[0] +add v6.4s, v6.4s, v5.4s +str q23, [x0, #656] +str q4, [x0, #592] +sub v4.4s, v26.4s, v8.4s +str q6, [x0, #528] +add v26.4s, v26.4s, v8.4s +str q4, [x0, #464] +sub v4.4s, v17.4s, v19.4s +str q26, [x0, #400] +add v17.4s, v17.4s, v19.4s +str q4, [x0, #336] +sub v4.4s, v12.4s, v20.4s +str q17, [x0, #272] +add v12.4s, v12.4s, v20.4s +str q4, [x0, #208] +sub v4.4s, v3.4s, v2.4s +str q12, [x0, #144] +add v3.4s, v3.4s, v2.4s +str q4, [x0, #80] +str q3, [x0, #16] +ldr q11, [x0, #224] +ldr q24, [x0, #160] +ldr q25, [x0, #32] +ldr q16, [x17, #+128] +ldr q13, [x17, #+144] +sqrdmulh v29.4S, v25.4S, v13.s[0] +mul v25.4S, v25.4S,v16.s[0] +ldr q18, [x0, #48] +sqrdmulh v1.4S, v18.4S, v13.s[0] +mul v18.4S, v18.4S,v16.s[0] +ldr q21, [x17, #+160] +ldr q10, [x17, #+176] +ldr q15, [x0, #96] +sqrdmulh v14.4S, v15.4S, v10.s[0] +mul v15.4S, v15.4S,v21.s[0] +ldr q9, [x0, #112] +sqrdmulh v0.4S, v9.4S, v10.s[0] +mul v9.4S, v9.4S,v21.s[0] +ldr q30, [x17, #+192] +ldr q27, [x17, #+208] +mla v25.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v24.4S, v27.s[0] +ldr q22, [x0, #176] +mla v18.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v22.4S, v27.s[0] +ldr q5, [x17, #+224] +ldr q23, [x17, #+240] +mla v15.4S, v14.4S, v31.s[0] +sqrdmulh v14.4S, v11.4S, v23.s[0] +ldr q28, [x0, #240] +mla v9.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v28.4S, v23.s[0] +ldr q7, [x0, #0] +ldr q6, [x0, #128] +mul v24.4S, v24.4S,v30.s[0] +sub v8.4s, v7.4s, v25.4s +ldr q26, [x0, #16] +mul v22.4S, v22.4S,v30.s[0] +add v7.4s, v7.4s, v25.4s +ldr q25, [x0, #144] +mla v24.4S, v29.4S, v31.s[0] +sub v29.4s, v26.4s, v18.4s +ldr q19, [x0, #64] +mla v22.4S, v1.4S, v31.s[0] +add v26.4s, v26.4s, v18.4s +ldr q18, [x0, #192] +mul v11.4S, v11.4S,v5.s[0] +sub v1.4s, v19.4s, v15.4s +ldr q17, [x0, #80] +mul v28.4S, v28.4S,v5.s[0] +add v19.4s, v19.4s, v15.4s +ldr q15, [x0, #208] +mla v11.4S, v14.4S, v31.s[0] +mla v28.4S, v0.4S, v31.s[0] +sub v0.4s, v17.4s, v9.4s +sqrdmulh v14.4S, v26.4S, v13.s[1] 
+add v17.4s, v17.4s, v9.4s +mul v26.4S, v26.4S,v16.s[1] +sqrdmulh v9.4S, v29.4S, v13.s[2] +sub v20.4s, v6.4s, v24.4s +mul v29.4S, v29.4S,v16.s[2] +add v6.4s, v6.4s, v24.4s +sqrdmulh v13.4S, v17.4S, v10.s[1] +sub v16.4s, v25.4s, v22.4s +mul v17.4S, v17.4S,v21.s[1] +add v25.4s, v25.4s, v22.4s +sqrdmulh v22.4S, v0.4S, v10.s[2] +sub v24.4s, v18.4s, v11.4s +mul v0.4S, v0.4S,v21.s[2] +add v18.4s, v18.4s, v11.4s +mla v26.4S, v14.4S, v31.s[0] +sub v14.4s, v15.4s, v28.4s +ldr q10, [x0, #480] +sqrdmulh v21.4S, v25.4S, v27.s[1] +add v15.4s, v15.4s, v28.4s +mla v29.4S, v9.4S, v31.s[0] +ldr q9, [x0, #416] +sqrdmulh v28.4S, v16.4S, v27.s[2] +sub v11.4s, v7.4s, v26.4s +mla v17.4S, v13.4S, v31.s[0] +ldr q13, [x0, #288] +sqrdmulh v12.4S, v15.4S, v23.s[1] +add v7.4s, v7.4s, v26.4s +str q11, [x0, #16] +mla v0.4S, v22.4S, v31.s[0] +ldr q22, [x17, #+256] +ldr q11, [x17, #+272] +sqrdmulh v26.4S, v14.4S, v23.s[2] +sub v2.4s, v8.4s, v29.4s +str q7, [x0, #0] +mul v25.4S, v25.4S,v30.s[1] +add v8.4s, v8.4s, v29.4s +mul v16.4S, v16.4S,v30.s[2] +str q2, [x0, #48] +mla v25.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v17.4s +mla v16.4S, v28.4S, v31.s[0] +str q8, [x0, #32] +mul v15.4S, v15.4S,v5.s[1] +str q21, [x0, #80] +mul v14.4S, v14.4S,v5.s[2] +add v19.4s, v19.4s, v17.4s +str q19, [x0, #64] +mla v15.4S, v12.4S, v31.s[0] +sub v12.4s, v1.4s, v0.4s +str q12, [x0, #112] +mla v14.4S, v26.4S, v31.s[0] +add v1.4s, v1.4s, v0.4s +str q1, [x0, #96] +sqrdmulh v23.4S, v13.4S, v11.s[0] +sub v5.4s, v6.4s, v25.4s +mul v13.4S, v13.4S,v22.s[0] +str q5, [x0, #144] +ldr q5, [x0, #304] +sqrdmulh v1.4S, v5.4S, v11.s[0] +add v6.4s, v6.4s, v25.4s +mul v5.4S, v5.4S,v22.s[0] +str q6, [x0, #128] +ldr q6, [x17, #+288] +ldr q25, [x17, #+304] +ldr q0, [x0, #352] +sqrdmulh v26.4S, v0.4S, v25.s[0] +sub v12.4s, v20.4s, v16.4s +mul v0.4S, v0.4S,v6.s[0] +str q12, [x0, #176] +ldr q12, [x0, #368] +sqrdmulh v19.4S, v12.4S, v25.s[0] +add v20.4s, v20.4s, v16.4s +mul v12.4S, v12.4S,v6.s[0] +str q20, [x0, #160] +ldr q20, [x17, #+320] 
+ldr q16, [x17, #+336] +mla v13.4S, v23.4S, v31.s[0] +sub v23.4s, v18.4s, v15.4s +sqrdmulh v17.4S, v9.4S, v16.s[0] +str q23, [x0, #208] +ldr q23, [x0, #432] +mla v5.4S, v1.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v23.4S, v16.s[0] +str q18, [x0, #192] +ldr q18, [x17, #+352] +ldr q1, [x17, #+368] +mla v0.4S, v26.4S, v31.s[0] +sub v26.4s, v24.4s, v14.4s +sqrdmulh v21.4S, v10.4S, v1.s[0] +str q26, [x0, #240] +ldr q26, [x0, #496] +mla v12.4S, v19.4S, v31.s[0] +add v24.4s, v24.4s, v14.4s +sqrdmulh v14.4S, v26.4S, v1.s[0] +str q24, [x0, #224] +ldr q24, [x0, #256] +ldr q19, [x0, #384] +mul v9.4S, v9.4S,v20.s[0] +sub v27.4s, v24.4s, v13.4s +ldr q30, [x0, #272] +mul v23.4S, v23.4S,v20.s[0] +add v24.4s, v24.4s, v13.4s +ldr q13, [x0, #400] +mla v9.4S, v17.4S, v31.s[0] +sub v17.4s, v30.4s, v5.4s +ldr q8, [x0, #320] +mla v23.4S, v15.4S, v31.s[0] +add v30.4s, v30.4s, v5.4s +ldr q5, [x0, #448] +mul v10.4S, v10.4S,v18.s[0] +sub v15.4s, v8.4s, v0.4s +ldr q28, [x0, #336] +mul v26.4S, v26.4S,v18.s[0] +add v8.4s, v8.4s, v0.4s +ldr q0, [x0, #464] +mla v10.4S, v21.4S, v31.s[0] +mla v26.4S, v14.4S, v31.s[0] +sub v14.4s, v28.4s, v12.4s +sqrdmulh v21.4S, v30.4S, v11.s[1] +add v28.4s, v28.4s, v12.4s +mul v30.4S, v30.4S,v22.s[1] +sqrdmulh v12.4S, v17.4S, v11.s[2] +sub v2.4s, v19.4s, v9.4s +mul v17.4S, v17.4S,v22.s[2] +add v19.4s, v19.4s, v9.4s +sqrdmulh v11.4S, v28.4S, v25.s[1] +sub v22.4s, v13.4s, v23.4s +mul v28.4S, v28.4S,v6.s[1] +add v13.4s, v13.4s, v23.4s +sqrdmulh v23.4S, v14.4S, v25.s[2] +sub v9.4s, v5.4s, v10.4s +mul v14.4S, v14.4S,v6.s[2] +add v5.4s, v5.4s, v10.4s +mla v30.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v26.4s +ldr q25, [x0, #736] +sqrdmulh v6.4S, v13.4S, v16.s[1] +add v0.4s, v0.4s, v26.4s +mla v17.4S, v12.4S, v31.s[0] +ldr q12, [x0, #672] +sqrdmulh v26.4S, v22.4S, v16.s[2] +sub v10.4s, v24.4s, v30.4s +mla v28.4S, v11.4S, v31.s[0] +ldr q11, [x0, #544] +sqrdmulh v29.4S, v0.4S, v1.s[1] +add v24.4s, v24.4s, v30.4s +str q10, [x0, #272] +mla v14.4S, v23.4S, 
v31.s[0] +ldr q23, [x17, #+384] +ldr q10, [x17, #+400] +sqrdmulh v30.4S, v21.4S, v1.s[2] +sub v7.4s, v27.4s, v17.4s +str q24, [x0, #256] +mul v13.4S, v13.4S,v20.s[1] +add v27.4s, v27.4s, v17.4s +mul v22.4S, v22.4S,v20.s[2] +str q7, [x0, #304] +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v8.4s, v28.4s +mla v22.4S, v26.4S, v31.s[0] +str q27, [x0, #288] +mul v0.4S, v0.4S,v18.s[1] +str q6, [x0, #336] +mul v21.4S, v21.4S,v18.s[2] +add v8.4s, v8.4s, v28.4s +str q8, [x0, #320] +mla v0.4S, v29.4S, v31.s[0] +sub v29.4s, v15.4s, v14.4s +str q29, [x0, #368] +mla v21.4S, v30.4S, v31.s[0] +add v15.4s, v15.4s, v14.4s +str q15, [x0, #352] +sqrdmulh v1.4S, v11.4S, v10.s[0] +sub v18.4s, v19.4s, v13.4s +mul v11.4S, v11.4S,v23.s[0] +str q18, [x0, #400] +ldr q18, [x0, #560] +sqrdmulh v15.4S, v18.4S, v10.s[0] +add v19.4s, v19.4s, v13.4s +mul v18.4S, v18.4S,v23.s[0] +str q19, [x0, #384] +ldr q19, [x17, #+416] +ldr q13, [x17, #+432] +ldr q14, [x0, #608] +sqrdmulh v30.4S, v14.4S, v13.s[0] +sub v29.4s, v2.4s, v22.4s +mul v14.4S, v14.4S,v19.s[0] +str q29, [x0, #432] +ldr q29, [x0, #624] +sqrdmulh v8.4S, v29.4S, v13.s[0] +add v2.4s, v2.4s, v22.4s +mul v29.4S, v29.4S,v19.s[0] +str q2, [x0, #416] +ldr q2, [x17, #+448] +ldr q22, [x17, #+464] +mla v11.4S, v1.4S, v31.s[0] +sub v1.4s, v5.4s, v0.4s +sqrdmulh v28.4S, v12.4S, v22.s[0] +str q1, [x0, #464] +ldr q1, [x0, #688] +mla v18.4S, v15.4S, v31.s[0] +add v5.4s, v5.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v22.s[0] +str q5, [x0, #448] +ldr q5, [x17, #+480] +ldr q15, [x17, #+496] +mla v14.4S, v30.4S, v31.s[0] +sub v30.4s, v9.4s, v21.4s +sqrdmulh v6.4S, v25.4S, v15.s[0] +str q30, [x0, #496] +ldr q30, [x0, #752] +mla v29.4S, v8.4S, v31.s[0] +add v9.4s, v9.4s, v21.4s +sqrdmulh v21.4S, v30.4S, v15.s[0] +str q9, [x0, #480] +ldr q9, [x0, #512] +ldr q8, [x0, #640] +mul v12.4S, v12.4S,v2.s[0] +sub v16.4s, v9.4s, v11.4s +ldr q20, [x0, #528] +mul v1.4S, v1.4S,v2.s[0] +add v9.4s, v9.4s, v11.4s +ldr q11, [x0, #656] +mla v12.4S, v28.4S, v31.s[0] +sub v28.4s, v20.4s, v18.4s 
+ldr q27, [x0, #576] +mla v1.4S, v0.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +ldr q18, [x0, #704] +mul v25.4S, v25.4S,v5.s[0] +sub v0.4s, v27.4s, v14.4s +ldr q26, [x0, #592] +mul v30.4S, v30.4S,v5.s[0] +add v27.4s, v27.4s, v14.4s +ldr q14, [x0, #720] +mla v25.4S, v6.4S, v31.s[0] +mla v30.4S, v21.4S, v31.s[0] +sub v21.4s, v26.4s, v29.4s +sqrdmulh v6.4S, v20.4S, v10.s[1] +add v26.4s, v26.4s, v29.4s +mul v20.4S, v20.4S,v23.s[1] +sqrdmulh v29.4S, v28.4S, v10.s[2] +sub v7.4s, v8.4s, v12.4s +mul v28.4S, v28.4S,v23.s[2] +add v8.4s, v8.4s, v12.4s +sqrdmulh v10.4S, v26.4S, v13.s[1] +sub v23.4s, v11.4s, v1.4s +mul v26.4S, v26.4S,v19.s[1] +add v11.4s, v11.4s, v1.4s +sqrdmulh v1.4S, v21.4S, v13.s[2] +sub v12.4s, v18.4s, v25.4s +mul v21.4S, v21.4S,v19.s[2] +add v18.4s, v18.4s, v25.4s +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v14.4s, v30.4s +ldr q13, [x0, #992] +sqrdmulh v19.4S, v11.4S, v22.s[1] +add v14.4s, v14.4s, v30.4s +mla v28.4S, v29.4S, v31.s[0] +ldr q29, [x0, #928] +sqrdmulh v30.4S, v23.4S, v22.s[2] +sub v25.4s, v9.4s, v20.4s +mla v26.4S, v10.4S, v31.s[0] +ldr q10, [x0, #800] +sqrdmulh v17.4S, v14.4S, v15.s[1] +add v9.4s, v9.4s, v20.4s +str q25, [x0, #528] +mla v21.4S, v1.4S, v31.s[0] +ldr q1, [x17, #+512] +ldr q25, [x17, #+528] +sqrdmulh v20.4S, v6.4S, v15.s[2] +sub v24.4s, v16.4s, v28.4s +str q9, [x0, #512] +mul v11.4S, v11.4S,v2.s[1] +add v16.4s, v16.4s, v28.4s +mul v23.4S, v23.4S,v2.s[2] +str q24, [x0, #560] +mla v11.4S, v19.4S, v31.s[0] +sub v19.4s, v27.4s, v26.4s +mla v23.4S, v30.4S, v31.s[0] +str q16, [x0, #544] +mul v14.4S, v14.4S,v5.s[1] +str q19, [x0, #592] +mul v6.4S, v6.4S,v5.s[2] +add v27.4s, v27.4s, v26.4s +str q27, [x0, #576] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v21.4s +str q17, [x0, #624] +mla v6.4S, v20.4S, v31.s[0] +add v0.4s, v0.4s, v21.4s +str q0, [x0, #608] +sqrdmulh v15.4S, v10.4S, v25.s[0] +sub v5.4s, v8.4s, v11.4s +mul v10.4S, v10.4S,v1.s[0] +str q5, [x0, #656] +ldr q5, [x0, #816] +sqrdmulh v0.4S, v5.4S, v25.s[0] +add v8.4s, v8.4s, 
v11.4s +mul v5.4S, v5.4S,v1.s[0] +str q8, [x0, #640] +ldr q8, [x17, #+544] +ldr q11, [x17, #+560] +ldr q21, [x0, #864] +sqrdmulh v20.4S, v21.4S, v11.s[0] +sub v17.4s, v7.4s, v23.4s +mul v21.4S, v21.4S,v8.s[0] +str q17, [x0, #688] +ldr q17, [x0, #880] +sqrdmulh v27.4S, v17.4S, v11.s[0] +add v7.4s, v7.4s, v23.4s +mul v17.4S, v17.4S,v8.s[0] +str q7, [x0, #672] +ldr q7, [x17, #+576] +ldr q23, [x17, #+592] +mla v10.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v14.4s +sqrdmulh v26.4S, v29.4S, v23.s[0] +str q15, [x0, #720] +ldr q15, [x0, #944] +mla v5.4S, v0.4S, v31.s[0] +add v18.4s, v18.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v23.s[0] +str q18, [x0, #704] +ldr q18, [x17, #+608] +ldr q0, [x17, #+624] +mla v21.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v6.4s +sqrdmulh v19.4S, v13.4S, v0.s[0] +str q20, [x0, #752] +ldr q20, [x0, #1008] +mla v17.4S, v27.4S, v31.s[0] +add v12.4s, v12.4s, v6.4s +sqrdmulh v6.4S, v20.4S, v0.s[0] +str q12, [x0, #736] +ldr q12, [x0, #768] +ldr q27, [x0, #896] +mul v29.4S, v29.4S,v7.s[0] +sub v22.4s, v12.4s, v10.4s +ldr q2, [x0, #784] +mul v15.4S, v15.4S,v7.s[0] +add v12.4s, v12.4s, v10.4s +ldr q10, [x0, #912] +mla v29.4S, v26.4S, v31.s[0] +sub v26.4s, v2.4s, v5.4s +ldr q16, [x0, #832] +mla v15.4S, v14.4S, v31.s[0] +add v2.4s, v2.4s, v5.4s +ldr q5, [x0, #960] +mul v13.4S, v13.4S,v18.s[0] +sub v14.4s, v16.4s, v21.4s +ldr q30, [x0, #848] +mul v20.4S, v20.4S,v18.s[0] +add v16.4s, v16.4s, v21.4s +ldr q21, [x0, #976] +mla v13.4S, v19.4S, v31.s[0] +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v30.4s, v17.4s +sqrdmulh v19.4S, v2.4S, v25.s[1] +add v30.4s, v30.4s, v17.4s +mul v2.4S, v2.4S,v1.s[1] +sqrdmulh v17.4S, v26.4S, v25.s[2] +sub v24.4s, v27.4s, v29.4s +mul v26.4S, v26.4S,v1.s[2] +add v27.4s, v27.4s, v29.4s +sqrdmulh v25.4S, v30.4S, v11.s[1] +sub v1.4s, v10.4s, v15.4s +mul v30.4S, v30.4S,v8.s[1] +add v10.4s, v10.4s, v15.4s +sqrdmulh v15.4S, v6.4S, v11.s[2] +sub v29.4s, v5.4s, v13.4s +mul v6.4S, v6.4S,v8.s[2] +add v5.4s, v5.4s, v13.4s +mla v2.4S, v19.4S, v31.s[0] 
+sub v19.4s, v21.4s, v20.4s +sqrdmulh v11.4S, v10.4S, v23.s[1] +add v21.4s, v21.4s, v20.4s +mla v26.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v1.4S, v23.s[2] +sub v20.4s, v12.4s, v2.4s +mla v30.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v21.4S, v0.s[1] +add v12.4s, v12.4s, v2.4s +str q20, [x0, #784] +mla v6.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v19.4S, v0.s[2] +sub v20.4s, v22.4s, v26.4s +str q12, [x0, #768] +mul v10.4S, v10.4S,v7.s[1] +add v22.4s, v22.4s, v26.4s +mul v1.4S, v1.4S,v7.s[2] +str q20, [x0, #816] +mla v10.4S, v11.4S, v31.s[0] +sub v11.4s, v16.4s, v30.4s +mla v1.4S, v17.4S, v31.s[0] +str q22, [x0, #800] +mul v21.4S, v21.4S,v18.s[1] +str q11, [x0, #848] +mul v19.4S, v19.4S,v18.s[2] +add v16.4s, v16.4s, v30.4s +str q16, [x0, #832] +mla v21.4S, v25.4S, v31.s[0] +sub v25.4s, v14.4s, v6.4s +str q25, [x0, #880] +mla v19.4S, v15.4S, v31.s[0] +add v14.4s, v14.4s, v6.4s +str q14, [x0, #864] +sub v0.4s, v27.4s, v10.4s +str q0, [x0, #912] +add v27.4s, v27.4s, v10.4s +str q27, [x0, #896] +sub v27.4s, v24.4s, v1.4s +str q27, [x0, #944] +add v24.4s, v24.4s, v1.4s +str q24, [x0, #928] +sub v24.4s, v5.4s, v21.4s +str q24, [x0, #976] +add v5.4s, v5.4s, v21.4s +str q5, [x0, #960] +sub v5.4s, v29.4s, v19.4s +str q5, [x0, #1008] +add v29.4s, v29.4s, v19.4s +str q29, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1528 +// Instruction count: 1524 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_20_z4_7.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_20_z4_7.s new file mode 100644 index 0000000..203055c --- /dev/null +++ 
b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_20_z4_7.s @@ -0,0 +1,1558 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global 
ntt_u32_incomplete_neon_asm_var_4_2_20_z4_7 +.global _ntt_u32_incomplete_neon_asm_var_4_2_20_z4_7 +ntt_u32_incomplete_neon_asm_var_4_2_20_z4_7: +_ntt_u32_incomplete_neon_asm_var_4_2_20_z4_7: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x0, #992] +sqrdmulh v27.4S, v28.4S, v29.s[0] +mul v28.4S, v28.4S,v30.s[0] +ldr q26, [x0, #928] +sqrdmulh v25.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +ldr q24, [x0, #864] +sqrdmulh v23.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +ldr q22, [x0, #800] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #736] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mla v28.4S, v27.4S, v31.s[0] +ldr q27, [x0, #672] +sqrdmulh v18.4S, v27.4S, v29.s[0] +mla v26.4S, v25.4S, v31.s[0] +ldr q25, [x0, #608] +sqrdmulh v17.4S, v25.4S, v29.s[0] +mla v24.4S, v23.4S, v31.s[0] +ldr q23, [x0, #544] +sqrdmulh v16.4S, v23.4S, v29.s[0] +mla v22.4S, v21.4S, v31.s[0] +ldr q21, [x0, #480] +ldr q3, [x0, #416] +mul v27.4S, v27.4S,v30.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q2, [x0, #352] +ldr q1, [x0, #288] +mla v27.4S, v18.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +ldr q19, [x0, #224] +ldr q18, [x0, #160] +mul v23.4S, v23.4S,v30.s[0] +mul v25.4S, v25.4S,v30.s[0] +ldr q0, [x0, #96] +ldr q15, [x0, #32] +mla v23.4S, v16.4S, v31.s[0] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +sqrdmulh v28.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v16.4s, v3.4s, v26.4s +add v3.4s, v3.4s, v26.4s +sqrdmulh v26.4S, v16.4S, v29.s[2] +mul v16.4S, 
v16.4S,v30.s[2] +sub v14.4s, v2.4s, v24.4s +add v2.4s, v2.4s, v24.4s +sqrdmulh v24.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v13.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v12.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +sqrdmulh v20.4S, v14.4S, v29.s[2] +mla v17.4S, v28.4S, v31.s[0] +sub v28.4s, v18.4s, v27.4s +add v18.4s, v18.4s, v27.4s +sqrdmulh v27.4S, v13.4S, v29.s[2] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v0.4s, v25.4s +add v0.4s, v0.4s, v25.4s +sqrdmulh v25.4S, v2.4S, v29.s[1] +mla v21.4S, v24.4S, v31.s[0] +sub v24.4s, v15.4s, v23.4s +sqrdmulh v11.4S, v1.4S, v29.s[1] +mla v3.4S, v22.4S, v31.s[0] +add v15.4s, v15.4s, v23.4s +ldr q23, [x17, #+32] +ldr q22, [x17, #+48] +mul v13.4S, v13.4S,v30.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v10.4s, v12.4s, v17.4s +add v12.4s, v12.4s, v17.4s +mla v13.4S, v27.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +sub v20.4s, v28.4s, v16.4s +add v28.4s, v28.4s, v16.4s +mul v1.4S, v1.4S,v30.s[1] +mul v2.4S, v2.4S,v30.s[1] +sub v16.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +mla v1.4S, v11.4S, v31.s[0] +mla v2.4S, v25.4S, v31.s[0] +sub v25.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v22.s[3] +mul v10.4S, v10.4S,v23.s[3] +sub v11.4s, v26.4s, v14.4s +add v26.4s, v26.4s, v14.4s +sqrdmulh v14.4S, v12.4S, v22.s[2] +mul v12.4S, v12.4S,v23.s[2] +sub v21.4s, v24.4s, v13.4s +add v24.4s, v24.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v22.s[1] +mul v16.4S, v16.4S,v23.s[1] +sub v27.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v17.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s +ldr q1, [x17, #+96] +ldr q9, [x17, #+112] +sqrdmulh v8.4S, v20.4S, v22.s[3] +mla v10.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v28.4S, v22.s[2] +mla v12.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v25.4S, v22.s[1] +mla v16.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v18.4S, v22.s[0] +mla 
v19.4S, v2.4S, v31.s[0] +nop +nop +ldr q2, [x17, #+64] +ldr q7, [x17, #+80] +mul v28.4S, v28.4S,v23.s[2] +mul v20.4S, v20.4S,v23.s[3] +sub v6.4s, v11.4s, v10.4s +add v11.4s, v11.4s, v10.4s +mla v28.4S, v3.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v26.4s, v12.4s +add v26.4s, v26.4s, v12.4s +mul v18.4S, v18.4S,v23.s[0] +mul v25.4S, v25.4S,v23.s[1] +sub v12.4s, v27.4s, v16.4s +add v27.4s, v27.4s, v16.4s +mla v18.4S, v13.4S, v31.s[0] +mla v25.4S, v14.4S, v31.s[0] +sub v14.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v9.s[3] +mul v6.4S, v6.4S,v1.s[3] +sub v13.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v11.4S, v9.s[2] +mul v11.4S, v11.4S,v1.s[2] +sub v16.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +sqrdmulh v28.4S, v8.4S, v9.s[1] +mul v8.4S, v8.4S,v1.s[1] +sub v3.4s, v17.4s, v25.4s +add v17.4s, v17.4s, v25.4s +sqrdmulh v25.4S, v26.4S, v9.s[0] +mul v26.4S, v26.4S,v1.s[0] +sub v10.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v7.s[3] +mla v6.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v27.4S, v7.s[2] +mla v11.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v14.4S, v7.s[1] +mla v8.4S, v28.4S, v31.s[0] +nop +nop +sqrdmulh v28.4S, v0.4S, v7.s[0] +mla v26.4S, v25.4S, v31.s[0] +nop +nop +mul v27.4S, v27.4S,v2.s[2] +mul v12.4S, v12.4S,v2.s[3] +sub v25.4s, v13.4s, v6.4s +str q25, [x0, #992] +mla v27.4S, v19.4S, v31.s[0] +mla v12.4S, v18.4S, v31.s[0] +add v13.4s, v13.4s, v6.4s +str q13, [x0, #928] +mul v0.4S, v0.4S,v2.s[0] +mul v14.4S, v14.4S,v2.s[1] +sub v13.4s, v21.4s, v11.4s +str q13, [x0, #864] +mla v0.4S, v28.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sub v11.4s, v16.4s, v8.4s +ldr q20, [x0, #1008] +sqrdmulh v28.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v8.4s +str q21, [x0, #800] +ldr q21, [x0, #944] +sqrdmulh v8.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v13.4s, v24.4s, v26.4s +str q11, [x0, #736] +ldr q11, [x0, 
#880] +sqrdmulh v6.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +add v24.4s, v24.4s, v26.4s +str q16, [x0, #672] +ldr q16, [x0, #816] +sqrdmulh v26.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +str q13, [x0, #608] +sub v13.4s, v3.4s, v12.4s +ldr q18, [x0, #752] +sqrdmulh v19.4S, v18.4S, v29.s[0] +mla v20.4S, v28.4S, v31.s[0] +str q24, [x0, #544] +add v3.4s, v3.4s, v12.4s +ldr q12, [x0, #688] +sqrdmulh v24.4S, v12.4S, v29.s[0] +mla v21.4S, v8.4S, v31.s[0] +str q13, [x0, #480] +sub v13.4s, v17.4s, v27.4s +ldr q8, [x0, #624] +sqrdmulh v28.4S, v8.4S, v29.s[0] +mla v11.4S, v6.4S, v31.s[0] +str q3, [x0, #416] +add v17.4s, v17.4s, v27.4s +ldr q27, [x0, #560] +sqrdmulh v3.4S, v27.4S, v29.s[0] +mla v16.4S, v26.4S, v31.s[0] +str q13, [x0, #352] +sub v13.4s, v10.4s, v14.4s +ldr q26, [x0, #496] +ldr q6, [x0, #432] +mul v12.4S, v12.4S,v30.s[0] +mul v18.4S, v18.4S,v30.s[0] +str q17, [x0, #288] +add v10.4s, v10.4s, v14.4s +ldr q14, [x0, #368] +ldr q17, [x0, #304] +mla v12.4S, v24.4S, v31.s[0] +mla v18.4S, v19.4S, v31.s[0] +str q13, [x0, #224] +sub v13.4s, v15.4s, v0.4s +ldr q19, [x0, #240] +ldr q24, [x0, #176] +mul v27.4S, v27.4S,v30.s[0] +mul v8.4S, v8.4S,v30.s[0] +str q10, [x0, #160] +add v15.4s, v15.4s, v0.4s +ldr q0, [x0, #112] +ldr q10, [x0, #48] +mla v27.4S, v3.4S, v31.s[0] +mla v8.4S, v28.4S, v31.s[0] +sub v28.4s, v26.4s, v20.4s +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v28.4S, v29.s[2] +mul v28.4S, v28.4S,v30.s[2] +sub v3.4s, v6.4s, v21.4s +add v6.4s, v6.4s, v21.4s +sqrdmulh v21.4S, v3.4S, v29.s[2] +mul v3.4S, v3.4S,v30.s[2] +sub v25.4s, v14.4s, v11.4s +add v14.4s, v14.4s, v11.4s +sqrdmulh v11.4S, v26.4S, v29.s[1] +mul v26.4S, v26.4S,v30.s[1] +sub v5.4s, v17.4s, v16.4s +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +sub v4.4s, v19.4s, v18.4s +add v19.4s, v19.4s, v18.4s +sqrdmulh v18.4S, v25.4S, v29.s[2] +mla v28.4S, v20.4S, v31.s[0] +sub v20.4s, v24.4s, v12.4s +add v24.4s, v24.4s, v12.4s +sqrdmulh v12.4S, v5.4S, 
v29.s[2] +mla v3.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v8.4s +add v0.4s, v0.4s, v8.4s +sqrdmulh v8.4S, v14.4S, v29.s[1] +mla v26.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v27.4s +str q13, [x0, #96] +sqrdmulh v13.4S, v17.4S, v29.s[1] +mla v6.4S, v16.4S, v31.s[0] +add v10.4s, v10.4s, v27.4s +str q15, [x0, #32] +mul v5.4S, v5.4S,v30.s[2] +mul v25.4S, v25.4S,v30.s[2] +sub v15.4s, v4.4s, v28.4s +add v4.4s, v4.4s, v28.4s +mla v5.4S, v12.4S, v31.s[0] +mla v25.4S, v18.4S, v31.s[0] +sub v18.4s, v20.4s, v3.4s +add v20.4s, v20.4s, v3.4s +mul v17.4S, v17.4S,v30.s[1] +mul v14.4S, v14.4S,v30.s[1] +sub v3.4s, v19.4s, v26.4s +add v19.4s, v19.4s, v26.4s +mla v17.4S, v13.4S, v31.s[0] +mla v14.4S, v8.4S, v31.s[0] +sub v8.4s, v24.4s, v6.4s +add v24.4s, v24.4s, v6.4s +sqrdmulh v6.4S, v15.4S, v22.s[3] +mul v15.4S, v15.4S,v23.s[3] +sub v13.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v4.4S, v22.s[2] +mul v4.4S, v4.4S,v23.s[2] +sub v26.4s, v11.4s, v5.4s +add v11.4s, v11.4s, v5.4s +sqrdmulh v5.4S, v3.4S, v22.s[1] +mul v3.4S, v3.4S,v23.s[1] +sub v12.4s, v0.4s, v14.4s +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v28.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v18.4S, v22.s[3] +mla v15.4S, v6.4S, v31.s[0] +nop +nop +sqrdmulh v6.4S, v20.4S, v22.s[2] +mla v4.4S, v25.4S, v31.s[0] +nop +nop +sqrdmulh v25.4S, v8.4S, v22.s[1] +mla v3.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v24.4S, v22.s[0] +mla v19.4S, v14.4S, v31.s[0] +nop +nop +mul v20.4S, v20.4S,v23.s[2] +mul v18.4S, v18.4S,v23.s[3] +sub v14.4s, v13.4s, v15.4s +add v13.4s, v13.4s, v15.4s +mla v20.4S, v6.4S, v31.s[0] +mla v18.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v4.4s +add v21.4s, v21.4s, v4.4s +mul v24.4S, v24.4S,v23.s[0] +mul v8.4S, v8.4S,v23.s[1] +sub v4.4s, v12.4s, v3.4s +add v12.4s, v12.4s, v3.4s +mla v24.4S, v5.4S, v31.s[0] +mla v8.4S, v25.4S, v31.s[0] +sub v25.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v14.4S, 
v9.s[3] +mul v14.4S, v14.4S,v1.s[3] +sub v5.4s, v26.4s, v18.4s +add v26.4s, v26.4s, v18.4s +sqrdmulh v18.4S, v13.4S, v9.s[2] +mul v13.4S, v13.4S,v1.s[2] +sub v3.4s, v11.4s, v20.4s +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v9.s[1] +mul v17.4S, v17.4S,v1.s[1] +sub v6.4s, v28.4s, v8.4s +add v28.4s, v28.4s, v8.4s +sqrdmulh v8.4S, v21.4S, v9.s[0] +mul v21.4S, v21.4S,v1.s[0] +sub v15.4s, v10.4s, v24.4s +add v10.4s, v10.4s, v24.4s +sqrdmulh v24.4S, v4.4S, v7.s[3] +mla v14.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v12.4S, v7.s[2] +mla v13.4S, v18.4S, v31.s[0] +nop +nop +sqrdmulh v18.4S, v25.4S, v7.s[1] +mla v17.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v0.4S, v7.s[0] +mla v21.4S, v8.4S, v31.s[0] +nop +nop +mul v12.4S, v12.4S,v2.s[2] +mul v4.4S, v4.4S,v2.s[3] +sub v8.4s, v5.4s, v14.4s +str q8, [x0, #1008] +mla v12.4S, v19.4S, v31.s[0] +mla v4.4S, v24.4S, v31.s[0] +add v5.4s, v5.4s, v14.4s +str q5, [x0, #944] +mul v0.4S, v0.4S,v2.s[0] +mul v25.4S, v25.4S,v2.s[1] +sub v5.4s, v26.4s, v13.4s +str q5, [x0, #880] +mla v0.4S, v20.4S, v31.s[0] +mla v25.4S, v18.4S, v31.s[0] +add v26.4s, v26.4s, v13.4s +sub v13.4s, v3.4s, v17.4s +ldr q18, [x0, #960] +sqrdmulh v20.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +add v3.4s, v3.4s, v17.4s +str q26, [x0, #816] +ldr q26, [x0, #896] +sqrdmulh v17.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +sub v5.4s, v11.4s, v21.4s +str q13, [x0, #752] +ldr q13, [x0, #832] +sqrdmulh v14.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +add v11.4s, v11.4s, v21.4s +str q3, [x0, #688] +ldr q3, [x0, #768] +sqrdmulh v21.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +str q5, [x0, #624] +sub v5.4s, v6.4s, v4.4s +ldr q24, [x0, #704] +sqrdmulh v19.4S, v24.4S, v29.s[0] +mla v18.4S, v20.4S, v31.s[0] +str q11, [x0, #560] +add v6.4s, v6.4s, v4.4s +ldr q4, [x0, #640] +sqrdmulh v11.4S, v4.4S, v29.s[0] +mla v26.4S, v17.4S, v31.s[0] +str q5, [x0, #496] +sub v5.4s, v28.4s, v12.4s +ldr q17, [x0, #576] +sqrdmulh v20.4S, v17.4S, v29.s[0] 
+mla v13.4S, v14.4S, v31.s[0] +str q6, [x0, #432] +add v28.4s, v28.4s, v12.4s +ldr q12, [x0, #512] +sqrdmulh v6.4S, v12.4S, v29.s[0] +mla v3.4S, v21.4S, v31.s[0] +str q5, [x0, #368] +sub v5.4s, v15.4s, v25.4s +ldr q21, [x0, #448] +ldr q14, [x0, #384] +mul v4.4S, v4.4S,v30.s[0] +mul v24.4S, v24.4S,v30.s[0] +str q28, [x0, #304] +add v15.4s, v15.4s, v25.4s +ldr q25, [x0, #320] +ldr q28, [x0, #256] +mla v4.4S, v11.4S, v31.s[0] +mla v24.4S, v19.4S, v31.s[0] +str q5, [x0, #240] +sub v5.4s, v10.4s, v0.4s +ldr q19, [x0, #192] +ldr q11, [x0, #128] +mul v12.4S, v12.4S,v30.s[0] +mul v17.4S, v17.4S,v30.s[0] +str q15, [x0, #176] +add v10.4s, v10.4s, v0.4s +ldr q0, [x0, #64] +ldr q15, [x0, #0] +mla v12.4S, v6.4S, v31.s[0] +mla v17.4S, v20.4S, v31.s[0] +sub v20.4s, v21.4s, v18.4s +add v21.4s, v21.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v6.4s, v14.4s, v26.4s +add v14.4s, v14.4s, v26.4s +sqrdmulh v26.4S, v6.4S, v29.s[2] +mul v6.4S, v6.4S,v30.s[2] +sub v8.4s, v25.4s, v13.4s +add v25.4s, v25.4s, v13.4s +sqrdmulh v13.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v27.4s, v28.4s, v3.4s +add v28.4s, v28.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +sub v16.4s, v19.4s, v24.4s +add v19.4s, v19.4s, v24.4s +sqrdmulh v24.4S, v8.4S, v29.s[2] +mla v20.4S, v18.4S, v31.s[0] +sub v18.4s, v11.4s, v4.4s +add v11.4s, v11.4s, v4.4s +sqrdmulh v4.4S, v27.4S, v29.s[2] +mla v6.4S, v26.4S, v31.s[0] +sub v26.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v25.4S, v29.s[1] +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v15.4s, v12.4s +str q5, [x0, #112] +sqrdmulh v5.4S, v28.4S, v29.s[1] +mla v14.4S, v3.4S, v31.s[0] +add v15.4s, v15.4s, v12.4s +str q10, [x0, #48] +mul v27.4S, v27.4S,v30.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v10.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +mla v27.4S, v4.4S, v31.s[0] +mla v8.4S, v24.4S, v31.s[0] +sub v24.4s, v18.4s, v6.4s +add v18.4s, v18.4s, v6.4s +mul v28.4S, v28.4S,v30.s[1] +mul v25.4S, 
v25.4S,v30.s[1] +sub v6.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +mla v28.4S, v5.4S, v31.s[0] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v10.4S, v22.s[3] +mul v10.4S, v10.4S,v23.s[3] +sub v5.4s, v26.4s, v8.4s +add v26.4s, v26.4s, v8.4s +sqrdmulh v8.4S, v16.4S, v22.s[2] +mul v16.4S, v16.4S,v23.s[2] +sub v21.4s, v13.4s, v27.4s +add v13.4s, v13.4s, v27.4s +sqrdmulh v27.4S, v6.4S, v22.s[1] +mul v6.4S, v6.4S,v23.s[1] +sub v4.4s, v0.4s, v25.4s +add v0.4s, v0.4s, v25.4s +sqrdmulh v25.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v20.4s, v15.4s, v28.4s +add v15.4s, v15.4s, v28.4s +sqrdmulh v28.4S, v24.4S, v22.s[3] +mla v10.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v18.4S, v22.s[2] +mla v16.4S, v8.4S, v31.s[0] +nop +nop +sqrdmulh v8.4S, v17.4S, v22.s[1] +mla v6.4S, v27.4S, v31.s[0] +nop +nop +sqrdmulh v27.4S, v11.4S, v22.s[0] +mla v19.4S, v25.4S, v31.s[0] +nop +nop +mul v18.4S, v18.4S,v23.s[2] +mul v24.4S, v24.4S,v23.s[3] +sub v25.4s, v5.4s, v10.4s +add v5.4s, v5.4s, v10.4s +mla v18.4S, v14.4S, v31.s[0] +mla v24.4S, v28.4S, v31.s[0] +sub v28.4s, v26.4s, v16.4s +add v26.4s, v26.4s, v16.4s +mul v11.4S, v11.4S,v23.s[0] +mul v17.4S, v17.4S,v23.s[1] +sub v16.4s, v4.4s, v6.4s +add v4.4s, v4.4s, v6.4s +mla v11.4S, v27.4S, v31.s[0] +mla v17.4S, v8.4S, v31.s[0] +sub v8.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v25.4S, v9.s[3] +mul v25.4S, v25.4S,v1.s[3] +sub v27.4s, v21.4s, v24.4s +add v21.4s, v21.4s, v24.4s +sqrdmulh v24.4S, v5.4S, v9.s[2] +mul v5.4S, v5.4S,v1.s[2] +sub v6.4s, v13.4s, v18.4s +add v13.4s, v13.4s, v18.4s +sqrdmulh v18.4S, v28.4S, v9.s[1] +mul v28.4S, v28.4S,v1.s[1] +sub v14.4s, v20.4s, v17.4s +add v20.4s, v20.4s, v17.4s +sqrdmulh v17.4S, v26.4S, v9.s[0] +mul v26.4S, v26.4S,v1.s[0] +sub v10.4s, v15.4s, v11.4s +add v15.4s, v15.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v7.s[3] +mla v25.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v4.4S, v7.s[2] +mla v5.4S, v24.4S, 
v31.s[0] +nop +nop +sqrdmulh v24.4S, v8.4S, v7.s[1] +mla v28.4S, v18.4S, v31.s[0] +nop +nop +sqrdmulh v18.4S, v0.4S, v7.s[0] +mla v26.4S, v17.4S, v31.s[0] +nop +nop +mul v4.4S, v4.4S,v2.s[2] +mul v16.4S, v16.4S,v2.s[3] +sub v17.4s, v27.4s, v25.4s +str q17, [x0, #960] +mla v4.4S, v19.4S, v31.s[0] +mla v16.4S, v11.4S, v31.s[0] +add v27.4s, v27.4s, v25.4s +str q27, [x0, #896] +mul v0.4S, v0.4S,v2.s[0] +mul v8.4S, v8.4S,v2.s[1] +sub v27.4s, v21.4s, v5.4s +str q27, [x0, #832] +mla v0.4S, v18.4S, v31.s[0] +mla v8.4S, v24.4S, v31.s[0] +add v21.4s, v21.4s, v5.4s +sub v5.4s, v6.4s, v28.4s +ldr q24, [x0, #976] +sqrdmulh v18.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +add v6.4s, v6.4s, v28.4s +str q21, [x0, #768] +ldr q21, [x0, #912] +sqrdmulh v28.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v27.4s, v13.4s, v26.4s +str q5, [x0, #704] +ldr q5, [x0, #848] +sqrdmulh v25.4S, v5.4S, v29.s[0] +mul v5.4S, v5.4S,v30.s[0] +add v13.4s, v13.4s, v26.4s +str q6, [x0, #640] +ldr q6, [x0, #784] +sqrdmulh v26.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +str q27, [x0, #576] +sub v27.4s, v14.4s, v16.4s +ldr q11, [x0, #720] +sqrdmulh v19.4S, v11.4S, v29.s[0] +mla v24.4S, v18.4S, v31.s[0] +str q13, [x0, #512] +add v14.4s, v14.4s, v16.4s +ldr q16, [x0, #656] +sqrdmulh v13.4S, v16.4S, v29.s[0] +mla v21.4S, v28.4S, v31.s[0] +str q27, [x0, #448] +sub v27.4s, v20.4s, v4.4s +ldr q28, [x0, #592] +sqrdmulh v18.4S, v28.4S, v29.s[0] +mla v5.4S, v25.4S, v31.s[0] +str q14, [x0, #384] +add v20.4s, v20.4s, v4.4s +ldr q4, [x0, #528] +sqrdmulh v14.4S, v4.4S, v29.s[0] +mla v6.4S, v26.4S, v31.s[0] +str q27, [x0, #320] +sub v27.4s, v10.4s, v8.4s +ldr q26, [x0, #464] +ldr q25, [x0, #400] +mul v16.4S, v16.4S,v30.s[0] +mul v11.4S, v11.4S,v30.s[0] +str q20, [x0, #256] +add v10.4s, v10.4s, v8.4s +ldr q8, [x0, #336] +ldr q20, [x0, #272] +mla v16.4S, v13.4S, v31.s[0] +mla v11.4S, v19.4S, v31.s[0] +str q27, [x0, #192] +sub v27.4s, v15.4s, v0.4s +ldr q19, [x0, #208] +ldr q13, [x0, #144] +mul v4.4S, 
v4.4S,v30.s[0] +mul v28.4S, v28.4S,v30.s[0] +str q10, [x0, #128] +add v15.4s, v15.4s, v0.4s +ldr q0, [x0, #80] +ldr q10, [x0, #16] +mla v4.4S, v14.4S, v31.s[0] +mla v28.4S, v18.4S, v31.s[0] +sub v18.4s, v26.4s, v24.4s +add v26.4s, v26.4s, v24.4s +sqrdmulh v24.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v14.4s, v25.4s, v21.4s +add v25.4s, v25.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v17.4s, v8.4s, v5.4s +add v8.4s, v8.4s, v5.4s +sqrdmulh v5.4S, v26.4S, v29.s[1] +mul v26.4S, v26.4S,v30.s[1] +sub v12.4s, v20.4s, v6.4s +add v20.4s, v20.4s, v6.4s +sqrdmulh v6.4S, v25.4S, v29.s[1] +mul v25.4S, v25.4S,v30.s[1] +sub v3.4s, v19.4s, v11.4s +add v19.4s, v19.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v29.s[2] +mla v18.4S, v24.4S, v31.s[0] +sub v24.4s, v13.4s, v16.4s +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v29.s[2] +mla v14.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v28.4s +add v0.4s, v0.4s, v28.4s +sqrdmulh v28.4S, v8.4S, v29.s[1] +mla v26.4S, v5.4S, v31.s[0] +sub v5.4s, v10.4s, v4.4s +str q27, [x0, #64] +sqrdmulh v27.4S, v20.4S, v29.s[1] +mla v25.4S, v6.4S, v31.s[0] +add v10.4s, v10.4s, v4.4s +str q15, [x0, #0] +mul v12.4S, v12.4S,v30.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v15.4s, v3.4s, v18.4s +add v3.4s, v3.4s, v18.4s +mla v12.4S, v16.4S, v31.s[0] +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v24.4s, v14.4s +add v24.4s, v24.4s, v14.4s +mul v20.4S, v20.4S,v30.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v14.4s, v19.4s, v26.4s +add v19.4s, v19.4s, v26.4s +mla v20.4S, v27.4S, v31.s[0] +mla v8.4S, v28.4S, v31.s[0] +sub v28.4s, v13.4s, v25.4s +add v13.4s, v13.4s, v25.4s +sqrdmulh v29.4S, v15.4S, v22.s[3] +mul v15.4S, v15.4S,v23.s[3] +sub v30.4s, v21.4s, v17.4s +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v22.s[2] +mul v3.4S, v3.4S,v23.s[2] +sub v25.4s, v5.4s, v12.4s +add v5.4s, v5.4s, v12.4s +sqrdmulh v12.4S, v14.4S, v22.s[1] +mul v14.4S, v14.4S,v23.s[1] +sub v27.4s, v0.4s, v8.4s +add v0.4s, v0.4s, v8.4s +sqrdmulh v8.4S, 
v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v26.4s, v10.4s, v20.4s +add v10.4s, v10.4s, v20.4s +sqrdmulh v20.4S, v11.4S, v22.s[3] +mla v15.4S, v29.4S, v31.s[0] +nop +nop +sqrdmulh v29.4S, v24.4S, v22.s[2] +mla v3.4S, v17.4S, v31.s[0] +nop +nop +sqrdmulh v17.4S, v28.4S, v22.s[1] +mla v14.4S, v12.4S, v31.s[0] +nop +nop +sqrdmulh v12.4S, v13.4S, v22.s[0] +mla v19.4S, v8.4S, v31.s[0] +nop +nop +mul v24.4S, v24.4S,v23.s[2] +mul v11.4S, v11.4S,v23.s[3] +sub v8.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +mla v24.4S, v29.4S, v31.s[0] +mla v11.4S, v20.4S, v31.s[0] +sub v20.4s, v21.4s, v3.4s +add v21.4s, v21.4s, v3.4s +mul v13.4S, v13.4S,v23.s[0] +mul v28.4S, v28.4S,v23.s[1] +sub v3.4s, v27.4s, v14.4s +add v27.4s, v27.4s, v14.4s +mla v13.4S, v12.4S, v31.s[0] +mla v28.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v22.4S, v8.4S, v9.s[3] +mul v8.4S, v8.4S,v1.s[3] +sub v23.4s, v25.4s, v11.4s +add v25.4s, v25.4s, v11.4s +sqrdmulh v11.4S, v30.4S, v9.s[2] +mul v30.4S, v30.4S,v1.s[2] +sub v19.4s, v5.4s, v24.4s +add v5.4s, v5.4s, v24.4s +sqrdmulh v24.4S, v20.4S, v9.s[1] +mul v20.4S, v20.4S,v1.s[1] +sub v12.4s, v26.4s, v28.4s +add v26.4s, v26.4s, v28.4s +sqrdmulh v28.4S, v21.4S, v9.s[0] +mul v21.4S, v21.4S,v1.s[0] +sub v14.4s, v10.4s, v13.4s +add v10.4s, v10.4s, v13.4s +sqrdmulh v9.4S, v3.4S, v7.s[3] +mla v8.4S, v22.4S, v31.s[0] +nop +nop +sqrdmulh v22.4S, v27.4S, v7.s[2] +mla v30.4S, v11.4S, v31.s[0] +nop +nop +sqrdmulh v11.4S, v17.4S, v7.s[1] +mla v20.4S, v24.4S, v31.s[0] +nop +nop +sqrdmulh v24.4S, v0.4S, v7.s[0] +mla v21.4S, v28.4S, v31.s[0] +nop +nop +mul v27.4S, v27.4S,v2.s[2] +mul v3.4S, v3.4S,v2.s[3] +sub v28.4s, v23.4s, v8.4s +str q28, [x0, #976] +mla v27.4S, v22.4S, v31.s[0] +mla v3.4S, v9.4S, v31.s[0] +add v23.4s, v23.4s, v8.4s +str q23, [x0, #912] +mul v0.4S, v0.4S,v2.s[0] +mul v17.4S, v17.4S,v2.s[1] +sub v23.4s, v25.4s, v30.4s +str q23, [x0, #848] +mla v0.4S, v24.4S, v31.s[0] +mla v17.4S, v11.4S, v31.s[0] +add v25.4s, 
v25.4s, v30.4s +sub v30.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +str q25, [x0, #784] +sub v25.4s, v5.4s, v21.4s +str q30, [x0, #720] +add v5.4s, v5.4s, v21.4s +str q19, [x0, #656] +str q25, [x0, #592] +sub v25.4s, v12.4s, v3.4s +str q5, [x0, #528] +add v12.4s, v12.4s, v3.4s +str q25, [x0, #464] +sub v25.4s, v26.4s, v27.4s +str q12, [x0, #400] +add v26.4s, v26.4s, v27.4s +str q25, [x0, #336] +sub v25.4s, v14.4s, v17.4s +str q26, [x0, #272] +add v14.4s, v14.4s, v17.4s +str q25, [x0, #208] +sub v25.4s, v10.4s, v0.4s +str q14, [x0, #144] +add v10.4s, v10.4s, v0.4s +str q25, [x0, #80] +str q10, [x0, #16] +ldr q6, [x0, #224] +ldr q4, [x0, #160] +ldr q18, [x0, #32] +ldr q16, [x17, #+128] +ldr q15, [x17, #+144] +sqrdmulh v29.4S, v18.4S, v15.s[0] +mul v18.4S, v18.4S,v16.s[0] +ldr q13, [x0, #48] +sqrdmulh v1.4S, v13.4S, v15.s[0] +mul v13.4S, v13.4S,v16.s[0] +ldr q28, [x17, #+160] +ldr q22, [x17, #+176] +ldr q9, [x0, #96] +sqrdmulh v8.4S, v9.4S, v22.s[0] +mul v9.4S, v9.4S,v28.s[0] +ldr q23, [x0, #112] +sqrdmulh v24.4S, v23.4S, v22.s[0] +mul v23.4S, v23.4S,v28.s[0] +ldr q11, [x17, #+192] +ldr q2, [x17, #+208] +mla v18.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v4.4S, v2.s[0] +ldr q7, [x0, #176] +mla v13.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v7.4S, v2.s[0] +ldr q20, [x17, #+224] +ldr q30, [x17, #+240] +mla v9.4S, v8.4S, v31.s[0] +sqrdmulh v8.4S, v6.4S, v30.s[0] +ldr q21, [x0, #240] +mla v23.4S, v24.4S, v31.s[0] +sqrdmulh v24.4S, v21.4S, v30.s[0] +ldr q19, [x0, #0] +ldr q5, [x0, #128] +mul v4.4S, v4.4S,v11.s[0] +sub v3.4s, v19.4s, v18.4s +ldr q12, [x0, #16] +mul v7.4S, v7.4S,v11.s[0] +add v19.4s, v19.4s, v18.4s +ldr q18, [x0, #144] +mla v4.4S, v29.4S, v31.s[0] +sub v29.4s, v12.4s, v13.4s +ldr q27, [x0, #64] +mla v7.4S, v1.4S, v31.s[0] +add v12.4s, v12.4s, v13.4s +ldr q13, [x0, #192] +mul v6.4S, v6.4S,v20.s[0] +sub v1.4s, v27.4s, v9.4s +ldr q26, [x0, #80] +mul v21.4S, v21.4S,v20.s[0] +add v27.4s, v27.4s, v9.4s +ldr q9, [x0, #208] +mla v6.4S, v8.4S, v31.s[0] +mla v21.4S, v24.4S, 
v31.s[0] +sub v24.4s, v26.4s, v23.4s +sqrdmulh v8.4S, v12.4S, v15.s[1] +add v26.4s, v26.4s, v23.4s +mul v12.4S, v12.4S,v16.s[1] +sqrdmulh v23.4S, v29.4S, v15.s[2] +sub v17.4s, v5.4s, v4.4s +mul v29.4S, v29.4S,v16.s[2] +add v5.4s, v5.4s, v4.4s +sqrdmulh v15.4S, v26.4S, v22.s[1] +sub v16.4s, v18.4s, v7.4s +mul v26.4S, v26.4S,v28.s[1] +add v18.4s, v18.4s, v7.4s +sqrdmulh v7.4S, v24.4S, v22.s[2] +sub v4.4s, v13.4s, v6.4s +mul v24.4S, v24.4S,v28.s[2] +add v13.4s, v13.4s, v6.4s +mla v12.4S, v8.4S, v31.s[0] +sub v8.4s, v9.4s, v21.4s +ldr q22, [x0, #480] +sqrdmulh v28.4S, v18.4S, v2.s[1] +add v9.4s, v9.4s, v21.4s +mla v29.4S, v23.4S, v31.s[0] +ldr q23, [x0, #416] +sqrdmulh v21.4S, v16.4S, v2.s[2] +sub v6.4s, v19.4s, v12.4s +mla v26.4S, v15.4S, v31.s[0] +ldr q15, [x0, #288] +sqrdmulh v14.4S, v9.4S, v30.s[1] +add v19.4s, v19.4s, v12.4s +str q6, [x0, #16] +mla v24.4S, v7.4S, v31.s[0] +ldr q7, [x17, #+256] +ldr q6, [x17, #+272] +sqrdmulh v12.4S, v8.4S, v30.s[2] +sub v0.4s, v3.4s, v29.4s +str q19, [x0, #0] +mul v18.4S, v18.4S,v11.s[1] +add v3.4s, v3.4s, v29.4s +mul v16.4S, v16.4S,v11.s[2] +str q0, [x0, #48] +mla v18.4S, v28.4S, v31.s[0] +sub v28.4s, v27.4s, v26.4s +mla v16.4S, v21.4S, v31.s[0] +str q3, [x0, #32] +mul v9.4S, v9.4S,v20.s[1] +str q28, [x0, #80] +mul v8.4S, v8.4S,v20.s[2] +add v27.4s, v27.4s, v26.4s +str q27, [x0, #64] +mla v9.4S, v14.4S, v31.s[0] +sub v14.4s, v1.4s, v24.4s +str q14, [x0, #112] +mla v8.4S, v12.4S, v31.s[0] +add v1.4s, v1.4s, v24.4s +str q1, [x0, #96] +sqrdmulh v30.4S, v15.4S, v6.s[0] +sub v20.4s, v5.4s, v18.4s +mul v15.4S, v15.4S,v7.s[0] +str q20, [x0, #144] +ldr q20, [x0, #304] +sqrdmulh v1.4S, v20.4S, v6.s[0] +add v5.4s, v5.4s, v18.4s +mul v20.4S, v20.4S,v7.s[0] +str q5, [x0, #128] +ldr q5, [x17, #+288] +ldr q18, [x17, #+304] +ldr q24, [x0, #352] +sqrdmulh v12.4S, v24.4S, v18.s[0] +sub v14.4s, v17.4s, v16.4s +mul v24.4S, v24.4S,v5.s[0] +str q14, [x0, #176] +ldr q14, [x0, #368] +sqrdmulh v27.4S, v14.4S, v18.s[0] +add v17.4s, v17.4s, v16.4s +mul 
v14.4S, v14.4S,v5.s[0] +str q17, [x0, #160] +ldr q17, [x17, #+320] +ldr q16, [x17, #+336] +mla v15.4S, v30.4S, v31.s[0] +sub v30.4s, v13.4s, v9.4s +sqrdmulh v26.4S, v23.4S, v16.s[0] +str q30, [x0, #208] +ldr q30, [x0, #432] +mla v20.4S, v1.4S, v31.s[0] +add v13.4s, v13.4s, v9.4s +sqrdmulh v9.4S, v30.4S, v16.s[0] +str q13, [x0, #192] +ldr q13, [x17, #+352] +ldr q1, [x17, #+368] +mla v24.4S, v12.4S, v31.s[0] +sub v12.4s, v4.4s, v8.4s +sqrdmulh v28.4S, v22.4S, v1.s[0] +str q12, [x0, #240] +ldr q12, [x0, #496] +mla v14.4S, v27.4S, v31.s[0] +add v4.4s, v4.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v1.s[0] +str q4, [x0, #224] +ldr q4, [x0, #256] +ldr q27, [x0, #384] +mul v23.4S, v23.4S,v17.s[0] +sub v2.4s, v4.4s, v15.4s +ldr q11, [x0, #272] +mul v30.4S, v30.4S,v17.s[0] +add v4.4s, v4.4s, v15.4s +ldr q15, [x0, #400] +mla v23.4S, v26.4S, v31.s[0] +sub v26.4s, v11.4s, v20.4s +ldr q3, [x0, #320] +mla v30.4S, v9.4S, v31.s[0] +add v11.4s, v11.4s, v20.4s +ldr q20, [x0, #448] +mul v22.4S, v22.4S,v13.s[0] +sub v9.4s, v3.4s, v24.4s +ldr q21, [x0, #336] +mul v12.4S, v12.4S,v13.s[0] +add v3.4s, v3.4s, v24.4s +ldr q24, [x0, #464] +mla v22.4S, v28.4S, v31.s[0] +mla v12.4S, v8.4S, v31.s[0] +sub v8.4s, v21.4s, v14.4s +sqrdmulh v28.4S, v11.4S, v6.s[1] +add v21.4s, v21.4s, v14.4s +mul v11.4S, v11.4S,v7.s[1] +sqrdmulh v14.4S, v26.4S, v6.s[2] +sub v0.4s, v27.4s, v23.4s +mul v26.4S, v26.4S,v7.s[2] +add v27.4s, v27.4s, v23.4s +sqrdmulh v6.4S, v21.4S, v18.s[1] +sub v7.4s, v15.4s, v30.4s +mul v21.4S, v21.4S,v5.s[1] +add v15.4s, v15.4s, v30.4s +sqrdmulh v30.4S, v8.4S, v18.s[2] +sub v23.4s, v20.4s, v22.4s +mul v8.4S, v8.4S,v5.s[2] +add v20.4s, v20.4s, v22.4s +mla v11.4S, v28.4S, v31.s[0] +sub v28.4s, v24.4s, v12.4s +ldr q18, [x0, #736] +sqrdmulh v5.4S, v15.4S, v16.s[1] +add v24.4s, v24.4s, v12.4s +mla v26.4S, v14.4S, v31.s[0] +ldr q14, [x0, #672] +sqrdmulh v12.4S, v7.4S, v16.s[2] +sub v22.4s, v4.4s, v11.4s +mla v21.4S, v6.4S, v31.s[0] +ldr q6, [x0, #544] +sqrdmulh v29.4S, v24.4S, v1.s[1] +add v4.4s, 
v4.4s, v11.4s +str q22, [x0, #272] +mla v8.4S, v30.4S, v31.s[0] +ldr q30, [x17, #+384] +ldr q22, [x17, #+400] +sqrdmulh v11.4S, v28.4S, v1.s[2] +sub v19.4s, v2.4s, v26.4s +str q4, [x0, #256] +mul v15.4S, v15.4S,v17.s[1] +add v2.4s, v2.4s, v26.4s +mul v7.4S, v7.4S,v17.s[2] +str q19, [x0, #304] +mla v15.4S, v5.4S, v31.s[0] +sub v5.4s, v3.4s, v21.4s +mla v7.4S, v12.4S, v31.s[0] +str q2, [x0, #288] +mul v24.4S, v24.4S,v13.s[1] +str q5, [x0, #336] +mul v28.4S, v28.4S,v13.s[2] +add v3.4s, v3.4s, v21.4s +str q3, [x0, #320] +mla v24.4S, v29.4S, v31.s[0] +sub v29.4s, v9.4s, v8.4s +str q29, [x0, #368] +mla v28.4S, v11.4S, v31.s[0] +add v9.4s, v9.4s, v8.4s +str q9, [x0, #352] +sqrdmulh v1.4S, v6.4S, v22.s[0] +sub v13.4s, v27.4s, v15.4s +mul v6.4S, v6.4S,v30.s[0] +str q13, [x0, #400] +ldr q13, [x0, #560] +sqrdmulh v9.4S, v13.4S, v22.s[0] +add v27.4s, v27.4s, v15.4s +mul v13.4S, v13.4S,v30.s[0] +str q27, [x0, #384] +ldr q27, [x17, #+416] +ldr q15, [x17, #+432] +ldr q8, [x0, #608] +sqrdmulh v11.4S, v8.4S, v15.s[0] +sub v29.4s, v0.4s, v7.4s +mul v8.4S, v8.4S,v27.s[0] +str q29, [x0, #432] +ldr q29, [x0, #624] +sqrdmulh v3.4S, v29.4S, v15.s[0] +add v0.4s, v0.4s, v7.4s +mul v29.4S, v29.4S,v27.s[0] +str q0, [x0, #416] +ldr q0, [x17, #+448] +ldr q7, [x17, #+464] +mla v6.4S, v1.4S, v31.s[0] +sub v1.4s, v20.4s, v24.4s +sqrdmulh v21.4S, v14.4S, v7.s[0] +str q1, [x0, #464] +ldr q1, [x0, #688] +mla v13.4S, v9.4S, v31.s[0] +add v20.4s, v20.4s, v24.4s +sqrdmulh v24.4S, v1.4S, v7.s[0] +str q20, [x0, #448] +ldr q20, [x17, #+480] +ldr q9, [x17, #+496] +mla v8.4S, v11.4S, v31.s[0] +sub v11.4s, v23.4s, v28.4s +sqrdmulh v5.4S, v18.4S, v9.s[0] +str q11, [x0, #496] +ldr q11, [x0, #752] +mla v29.4S, v3.4S, v31.s[0] +add v23.4s, v23.4s, v28.4s +sqrdmulh v28.4S, v11.4S, v9.s[0] +str q23, [x0, #480] +ldr q23, [x0, #512] +ldr q3, [x0, #640] +mul v14.4S, v14.4S,v0.s[0] +sub v16.4s, v23.4s, v6.4s +ldr q17, [x0, #528] +mul v1.4S, v1.4S,v0.s[0] +add v23.4s, v23.4s, v6.4s +ldr q6, [x0, #656] +mla v14.4S, 
v21.4S, v31.s[0] +sub v21.4s, v17.4s, v13.4s +ldr q2, [x0, #576] +mla v1.4S, v24.4S, v31.s[0] +add v17.4s, v17.4s, v13.4s +ldr q13, [x0, #704] +mul v18.4S, v18.4S,v20.s[0] +sub v24.4s, v2.4s, v8.4s +ldr q12, [x0, #592] +mul v11.4S, v11.4S,v20.s[0] +add v2.4s, v2.4s, v8.4s +ldr q8, [x0, #720] +mla v18.4S, v5.4S, v31.s[0] +mla v11.4S, v28.4S, v31.s[0] +sub v28.4s, v12.4s, v29.4s +sqrdmulh v5.4S, v17.4S, v22.s[1] +add v12.4s, v12.4s, v29.4s +mul v17.4S, v17.4S,v30.s[1] +sqrdmulh v29.4S, v21.4S, v22.s[2] +sub v19.4s, v3.4s, v14.4s +mul v21.4S, v21.4S,v30.s[2] +add v3.4s, v3.4s, v14.4s +sqrdmulh v22.4S, v12.4S, v15.s[1] +sub v30.4s, v6.4s, v1.4s +mul v12.4S, v12.4S,v27.s[1] +add v6.4s, v6.4s, v1.4s +sqrdmulh v1.4S, v28.4S, v15.s[2] +sub v14.4s, v13.4s, v18.4s +mul v28.4S, v28.4S,v27.s[2] +add v13.4s, v13.4s, v18.4s +mla v17.4S, v5.4S, v31.s[0] +sub v5.4s, v8.4s, v11.4s +ldr q15, [x0, #992] +sqrdmulh v27.4S, v6.4S, v7.s[1] +add v8.4s, v8.4s, v11.4s +mla v21.4S, v29.4S, v31.s[0] +ldr q29, [x0, #928] +sqrdmulh v11.4S, v30.4S, v7.s[2] +sub v18.4s, v23.4s, v17.4s +mla v12.4S, v22.4S, v31.s[0] +ldr q22, [x0, #800] +sqrdmulh v26.4S, v8.4S, v9.s[1] +add v23.4s, v23.4s, v17.4s +str q18, [x0, #528] +mla v28.4S, v1.4S, v31.s[0] +ldr q1, [x17, #+512] +ldr q18, [x17, #+528] +sqrdmulh v17.4S, v5.4S, v9.s[2] +sub v4.4s, v16.4s, v21.4s +str q23, [x0, #512] +mul v6.4S, v6.4S,v0.s[1] +add v16.4s, v16.4s, v21.4s +mul v30.4S, v30.4S,v0.s[2] +str q4, [x0, #560] +mla v6.4S, v27.4S, v31.s[0] +sub v27.4s, v2.4s, v12.4s +mla v30.4S, v11.4S, v31.s[0] +str q16, [x0, #544] +mul v8.4S, v8.4S,v20.s[1] +str q27, [x0, #592] +mul v5.4S, v5.4S,v20.s[2] +add v2.4s, v2.4s, v12.4s +str q2, [x0, #576] +mla v8.4S, v26.4S, v31.s[0] +sub v26.4s, v24.4s, v28.4s +str q26, [x0, #624] +mla v5.4S, v17.4S, v31.s[0] +add v24.4s, v24.4s, v28.4s +str q24, [x0, #608] +sqrdmulh v9.4S, v22.4S, v18.s[0] +sub v20.4s, v3.4s, v6.4s +mul v22.4S, v22.4S,v1.s[0] +str q20, [x0, #656] +ldr q20, [x0, #816] +sqrdmulh v24.4S, v20.4S, 
v18.s[0] +add v3.4s, v3.4s, v6.4s +mul v20.4S, v20.4S,v1.s[0] +str q3, [x0, #640] +ldr q3, [x17, #+544] +ldr q6, [x17, #+560] +ldr q28, [x0, #864] +sqrdmulh v17.4S, v28.4S, v6.s[0] +sub v26.4s, v19.4s, v30.4s +mul v28.4S, v28.4S,v3.s[0] +str q26, [x0, #688] +ldr q26, [x0, #880] +sqrdmulh v2.4S, v26.4S, v6.s[0] +add v19.4s, v19.4s, v30.4s +mul v26.4S, v26.4S,v3.s[0] +str q19, [x0, #672] +ldr q19, [x17, #+576] +ldr q30, [x17, #+592] +mla v22.4S, v9.4S, v31.s[0] +sub v9.4s, v13.4s, v8.4s +sqrdmulh v12.4S, v29.4S, v30.s[0] +str q9, [x0, #720] +ldr q9, [x0, #944] +mla v20.4S, v24.4S, v31.s[0] +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v9.4S, v30.s[0] +str q13, [x0, #704] +ldr q13, [x17, #+608] +ldr q24, [x17, #+624] +mla v28.4S, v17.4S, v31.s[0] +sub v17.4s, v14.4s, v5.4s +sqrdmulh v27.4S, v15.4S, v24.s[0] +str q17, [x0, #752] +ldr q17, [x0, #1008] +mla v26.4S, v2.4S, v31.s[0] +add v14.4s, v14.4s, v5.4s +sqrdmulh v5.4S, v17.4S, v24.s[0] +str q14, [x0, #736] +ldr q14, [x0, #768] +ldr q2, [x0, #896] +mul v29.4S, v29.4S,v19.s[0] +sub v7.4s, v14.4s, v22.4s +ldr q0, [x0, #784] +mul v9.4S, v9.4S,v19.s[0] +add v14.4s, v14.4s, v22.4s +ldr q22, [x0, #912] +mla v29.4S, v12.4S, v31.s[0] +sub v12.4s, v0.4s, v20.4s +ldr q16, [x0, #832] +mla v9.4S, v8.4S, v31.s[0] +add v0.4s, v0.4s, v20.4s +ldr q20, [x0, #960] +mul v15.4S, v15.4S,v13.s[0] +sub v8.4s, v16.4s, v28.4s +ldr q11, [x0, #848] +mul v17.4S, v17.4S,v13.s[0] +add v16.4s, v16.4s, v28.4s +ldr q28, [x0, #976] +mla v15.4S, v27.4S, v31.s[0] +mla v17.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v26.4s +sqrdmulh v27.4S, v0.4S, v18.s[1] +add v11.4s, v11.4s, v26.4s +mul v0.4S, v0.4S,v1.s[1] +sqrdmulh v26.4S, v12.4S, v18.s[2] +sub v4.4s, v2.4s, v29.4s +mul v12.4S, v12.4S,v1.s[2] +add v2.4s, v2.4s, v29.4s +sqrdmulh v18.4S, v11.4S, v6.s[1] +sub v1.4s, v22.4s, v9.4s +mul v11.4S, v11.4S,v3.s[1] +add v22.4s, v22.4s, v9.4s +sqrdmulh v9.4S, v5.4S, v6.s[2] +sub v29.4s, v20.4s, v15.4s +mul v5.4S, v5.4S,v3.s[2] +add v20.4s, v20.4s, v15.4s +mla v0.4S, 
v27.4S, v31.s[0] +sub v27.4s, v28.4s, v17.4s +sqrdmulh v6.4S, v22.4S, v30.s[1] +add v28.4s, v28.4s, v17.4s +mla v12.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v1.4S, v30.s[2] +sub v17.4s, v14.4s, v0.4s +mla v11.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v28.4S, v24.s[1] +add v14.4s, v14.4s, v0.4s +str q17, [x0, #784] +mla v5.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v27.4S, v24.s[2] +sub v17.4s, v7.4s, v12.4s +str q14, [x0, #768] +mul v22.4S, v22.4S,v19.s[1] +add v7.4s, v7.4s, v12.4s +mul v1.4S, v1.4S,v19.s[2] +str q17, [x0, #816] +mla v22.4S, v6.4S, v31.s[0] +sub v6.4s, v16.4s, v11.4s +mla v1.4S, v26.4S, v31.s[0] +str q7, [x0, #800] +mul v28.4S, v28.4S,v13.s[1] +str q6, [x0, #848] +mul v27.4S, v27.4S,v13.s[2] +add v16.4s, v16.4s, v11.4s +str q16, [x0, #832] +mla v28.4S, v18.4S, v31.s[0] +sub v18.4s, v8.4s, v5.4s +str q18, [x0, #880] +mla v27.4S, v9.4S, v31.s[0] +add v8.4s, v8.4s, v5.4s +str q8, [x0, #864] +sub v24.4s, v2.4s, v22.4s +str q24, [x0, #912] +add v2.4s, v2.4s, v22.4s +str q2, [x0, #896] +sub v2.4s, v4.4s, v1.4s +str q2, [x0, #944] +add v4.4s, v4.4s, v1.4s +str q4, [x0, #928] +sub v4.4s, v20.4s, v28.4s +str q4, [x0, #976] +add v20.4s, v20.4s, v28.4s +str q20, [x0, #960] +sub v20.4s, v29.4s, v27.4s +str q20, [x0, #1008] +add v29.4s, v29.4s, v27.4s +str q29, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1528 +// Instruction count: 1524 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_21_z4_7.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_21_z4_7.s new file mode 100644 index 0000000..3e72693 --- /dev/null +++ 
b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_21_z4_7.s @@ -0,0 +1,1558 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include // NOTE(review): no header name follows #include here; the ASM_LOAD macro used further down is presumably provided by it - confirm +modulus: // 16-byte constant block; the routine below loads it into q31 and, in the visible code, only reads lane v31.s[0] as the by-element operand of mla +.word -33556993 // Lane 0: the negated modulus, -33556993 +.word 0 // Lanes 1 to 3 are zero and are not referenced in the visible code +.word 0 +.word 0 +.align 6 // AArch64 .align takes a power-of-two exponent, so this requests 2^6 = 64-byte alignment +roots_merged: // Constants are stored as pairs of 16-byte groups: the first group of a pair is used as the by-element operand of mul, the second as the by-element operand of sqrdmulh; numerically second ~= first * 2^31 / 33556993 (e.g. 17702291 -> 1132860160) +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None (zero padding: lane 3 of this group is not referenced in the visible code) +.word 1132860160 // Layer 0, block 0 (sqrdmulh companion of 17702291) +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None (zero padding: lane 3 of this group is not referenced in the visible code) +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 (from here on each pair of groups holds one Layer 4 constant, two Layer 5 constants and a zero padding word) +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global
ntt_u32_incomplete_neon_asm_var_4_2_21_z4_7 +.global _ntt_u32_incomplete_neon_asm_var_4_2_21_z4_7 +ntt_u32_incomplete_neon_asm_var_4_2_21_z4_7: +_ntt_u32_incomplete_neon_asm_var_4_2_21_z4_7: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x0, #992] +sqrdmulh v27.4S, v28.4S, v29.s[0] +mul v28.4S, v28.4S,v30.s[0] +ldr q26, [x0, #928] +sqrdmulh v25.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +ldr q24, [x0, #864] +sqrdmulh v23.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +ldr q22, [x0, #800] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #736] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mla v28.4S, v27.4S, v31.s[0] +ldr q27, [x0, #672] +sqrdmulh v18.4S, v27.4S, v29.s[0] +mla v26.4S, v25.4S, v31.s[0] +ldr q25, [x0, #608] +sqrdmulh v17.4S, v25.4S, v29.s[0] +mla v24.4S, v23.4S, v31.s[0] +ldr q23, [x0, #544] +sqrdmulh v16.4S, v23.4S, v29.s[0] +mla v22.4S, v21.4S, v31.s[0] +ldr q21, [x0, #480] +ldr q3, [x0, #416] +mul v27.4S, v27.4S,v30.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q2, [x0, #352] +ldr q1, [x0, #288] +mla v27.4S, v18.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +ldr q19, [x0, #224] +ldr q18, [x0, #160] +mul v23.4S, v23.4S,v30.s[0] +mul v25.4S, v25.4S,v30.s[0] +ldr q0, [x0, #96] +ldr q15, [x0, #32] +mla v23.4S, v16.4S, v31.s[0] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +sqrdmulh v28.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v16.4s, v3.4s, v26.4s +add v3.4s, v3.4s, v26.4s +sqrdmulh v26.4S, v16.4S, v29.s[2] +mul v16.4S, 
v16.4S,v30.s[2] +sub v14.4s, v2.4s, v24.4s +add v2.4s, v2.4s, v24.4s +sqrdmulh v24.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v13.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v12.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +sqrdmulh v20.4S, v14.4S, v29.s[2] +mla v17.4S, v28.4S, v31.s[0] +sub v28.4s, v18.4s, v27.4s +add v18.4s, v18.4s, v27.4s +sqrdmulh v27.4S, v13.4S, v29.s[2] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v0.4s, v25.4s +add v0.4s, v0.4s, v25.4s +sqrdmulh v25.4S, v2.4S, v29.s[1] +mla v21.4S, v24.4S, v31.s[0] +sub v24.4s, v15.4s, v23.4s +sqrdmulh v11.4S, v1.4S, v29.s[1] +mla v3.4S, v22.4S, v31.s[0] +add v15.4s, v15.4s, v23.4s +ldr q23, [x17, #+32] +ldr q22, [x17, #+48] +mul v13.4S, v13.4S,v30.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v10.4s, v12.4s, v17.4s +add v12.4s, v12.4s, v17.4s +mla v13.4S, v27.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +sub v20.4s, v28.4s, v16.4s +add v28.4s, v28.4s, v16.4s +mul v1.4S, v1.4S,v30.s[1] +mul v2.4S, v2.4S,v30.s[1] +sub v16.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +mla v1.4S, v11.4S, v31.s[0] +mla v2.4S, v25.4S, v31.s[0] +sub v25.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v22.s[3] +mul v10.4S, v10.4S,v23.s[3] +sub v11.4s, v26.4s, v14.4s +add v26.4s, v26.4s, v14.4s +sqrdmulh v14.4S, v12.4S, v22.s[2] +mul v12.4S, v12.4S,v23.s[2] +sub v21.4s, v24.4s, v13.4s +add v24.4s, v24.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v22.s[1] +mul v16.4S, v16.4S,v23.s[1] +sub v27.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v17.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s +ldr q1, [x17, #+96] +ldr q9, [x17, #+112] +sqrdmulh v8.4S, v20.4S, v22.s[3] +mla v10.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v28.4S, v22.s[2] +mla v12.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v25.4S, v22.s[1] +mla v16.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v18.4S, v22.s[0] +mla 
v19.4S, v2.4S, v31.s[0] +nop +nop +ldr q2, [x17, #+64] +ldr q7, [x17, #+80] +mul v28.4S, v28.4S,v23.s[2] +mul v20.4S, v20.4S,v23.s[3] +sub v6.4s, v11.4s, v10.4s +add v11.4s, v11.4s, v10.4s +mla v28.4S, v3.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v26.4s, v12.4s +add v26.4s, v26.4s, v12.4s +mul v18.4S, v18.4S,v23.s[0] +mul v25.4S, v25.4S,v23.s[1] +sub v12.4s, v27.4s, v16.4s +add v27.4s, v27.4s, v16.4s +mla v18.4S, v13.4S, v31.s[0] +mla v25.4S, v14.4S, v31.s[0] +sub v14.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v9.s[3] +mul v6.4S, v6.4S,v1.s[3] +sub v13.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v11.4S, v9.s[2] +mul v11.4S, v11.4S,v1.s[2] +sub v16.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +sqrdmulh v28.4S, v8.4S, v9.s[1] +mul v8.4S, v8.4S,v1.s[1] +sub v3.4s, v17.4s, v25.4s +add v17.4s, v17.4s, v25.4s +sqrdmulh v25.4S, v26.4S, v9.s[0] +mul v26.4S, v26.4S,v1.s[0] +sub v10.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v7.s[3] +mla v6.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v27.4S, v7.s[2] +mla v11.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v14.4S, v7.s[1] +mla v8.4S, v28.4S, v31.s[0] +nop +nop +sqrdmulh v28.4S, v0.4S, v7.s[0] +mla v26.4S, v25.4S, v31.s[0] +nop +nop +mul v27.4S, v27.4S,v2.s[2] +mul v12.4S, v12.4S,v2.s[3] +sub v25.4s, v13.4s, v6.4s +str q25, [x0, #992] +mla v27.4S, v19.4S, v31.s[0] +mla v12.4S, v18.4S, v31.s[0] +add v13.4s, v13.4s, v6.4s +str q13, [x0, #928] +mul v0.4S, v0.4S,v2.s[0] +mul v14.4S, v14.4S,v2.s[1] +sub v13.4s, v21.4s, v11.4s +str q13, [x0, #864] +mla v0.4S, v28.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sub v11.4s, v16.4s, v8.4s +ldr q20, [x0, #1008] +sqrdmulh v28.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v8.4s +str q21, [x0, #800] +ldr q21, [x0, #944] +sqrdmulh v8.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v13.4s, v24.4s, v26.4s +str q11, [x0, #736] +ldr q11, [x0, 
#880] +sqrdmulh v6.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +add v24.4s, v24.4s, v26.4s +str q16, [x0, #672] +ldr q16, [x0, #816] +sqrdmulh v26.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +str q13, [x0, #608] +sub v13.4s, v3.4s, v12.4s +ldr q18, [x0, #752] +sqrdmulh v19.4S, v18.4S, v29.s[0] +mla v20.4S, v28.4S, v31.s[0] +add v3.4s, v3.4s, v12.4s +str q24, [x0, #544] +ldr q24, [x0, #688] +sqrdmulh v12.4S, v24.4S, v29.s[0] +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v17.4s, v27.4s +str q13, [x0, #480] +ldr q13, [x0, #624] +sqrdmulh v28.4S, v13.4S, v29.s[0] +mla v11.4S, v6.4S, v31.s[0] +add v17.4s, v17.4s, v27.4s +str q3, [x0, #416] +ldr q3, [x0, #560] +sqrdmulh v27.4S, v3.4S, v29.s[0] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v10.4s, v14.4s +str q8, [x0, #352] +ldr q8, [x0, #496] +ldr q6, [x0, #432] +mul v24.4S, v24.4S,v30.s[0] +mul v18.4S, v18.4S,v30.s[0] +add v10.4s, v10.4s, v14.4s +str q17, [x0, #288] +ldr q17, [x0, #368] +ldr q14, [x0, #304] +mla v24.4S, v12.4S, v31.s[0] +mla v18.4S, v19.4S, v31.s[0] +sub v19.4s, v15.4s, v0.4s +str q26, [x0, #224] +ldr q26, [x0, #240] +ldr q12, [x0, #176] +mul v3.4S, v3.4S,v30.s[0] +mul v13.4S, v13.4S,v30.s[0] +add v15.4s, v15.4s, v0.4s +str q10, [x0, #160] +ldr q10, [x0, #112] +ldr q0, [x0, #48] +mla v3.4S, v27.4S, v31.s[0] +mla v13.4S, v28.4S, v31.s[0] +sub v28.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +sqrdmulh v20.4S, v28.4S, v29.s[2] +mul v28.4S, v28.4S,v30.s[2] +sub v27.4s, v6.4s, v21.4s +add v6.4s, v6.4s, v21.4s +sqrdmulh v21.4S, v27.4S, v29.s[2] +mul v27.4S, v27.4S,v30.s[2] +sub v25.4s, v17.4s, v11.4s +add v17.4s, v17.4s, v11.4s +sqrdmulh v11.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v5.4s, v14.4s, v16.4s +add v14.4s, v14.4s, v16.4s +sqrdmulh v16.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +sub v4.4s, v26.4s, v18.4s +add v26.4s, v26.4s, v18.4s +sqrdmulh v18.4S, v25.4S, v29.s[2] +mla v28.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v24.4s +add v12.4s, v12.4s, v24.4s +sqrdmulh v24.4S, v5.4S, 
v29.s[2] +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v13.4s +add v10.4s, v10.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v29.s[1] +mla v8.4S, v11.4S, v31.s[0] +str q19, [x0, #96] +sub v19.4s, v0.4s, v3.4s +sqrdmulh v11.4S, v14.4S, v29.s[1] +mla v6.4S, v16.4S, v31.s[0] +str q15, [x0, #32] +add v0.4s, v0.4s, v3.4s +mul v5.4S, v5.4S,v30.s[2] +mul v25.4S, v25.4S,v30.s[2] +sub v3.4s, v4.4s, v28.4s +add v4.4s, v4.4s, v28.4s +mla v5.4S, v24.4S, v31.s[0] +mla v25.4S, v18.4S, v31.s[0] +sub v18.4s, v20.4s, v27.4s +add v20.4s, v20.4s, v27.4s +mul v14.4S, v14.4S,v30.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v27.4s, v26.4s, v8.4s +add v26.4s, v26.4s, v8.4s +mla v14.4S, v11.4S, v31.s[0] +mla v17.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v6.4s +add v12.4s, v12.4s, v6.4s +sqrdmulh v6.4S, v3.4S, v22.s[3] +mul v3.4S, v3.4S,v23.s[3] +sub v11.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v4.4S, v22.s[2] +mul v4.4S, v4.4S,v23.s[2] +sub v8.4s, v19.4s, v5.4s +add v19.4s, v19.4s, v5.4s +sqrdmulh v5.4S, v27.4S, v22.s[1] +mul v27.4S, v27.4S,v23.s[1] +sub v24.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v26.4S, v22.s[0] +mul v26.4S, v26.4S,v23.s[0] +sub v28.4s, v0.4s, v14.4s +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v18.4S, v22.s[3] +mla v3.4S, v6.4S, v31.s[0] +nop +nop +sqrdmulh v6.4S, v20.4S, v22.s[2] +mla v4.4S, v25.4S, v31.s[0] +nop +nop +sqrdmulh v25.4S, v13.4S, v22.s[1] +mla v27.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v12.4S, v22.s[0] +mla v26.4S, v17.4S, v31.s[0] +nop +nop +mul v20.4S, v20.4S,v23.s[2] +mul v18.4S, v18.4S,v23.s[3] +sub v17.4s, v11.4s, v3.4s +add v11.4s, v11.4s, v3.4s +mla v20.4S, v6.4S, v31.s[0] +mla v18.4S, v14.4S, v31.s[0] +sub v14.4s, v21.4s, v4.4s +add v21.4s, v21.4s, v4.4s +mul v12.4S, v12.4S,v23.s[0] +mul v13.4S, v13.4S,v23.s[1] +sub v4.4s, v24.4s, v27.4s +add v24.4s, v24.4s, v27.4s +mla v12.4S, v5.4S, v31.s[0] +mla v13.4S, v25.4S, v31.s[0] +sub v25.4s, v10.4s, v26.4s +add v10.4s, v10.4s, v26.4s +sqrdmulh v26.4S, 
v17.4S, v9.s[3] +mul v17.4S, v17.4S,v1.s[3] +sub v5.4s, v8.4s, v18.4s +add v8.4s, v8.4s, v18.4s +sqrdmulh v18.4S, v11.4S, v9.s[2] +mul v11.4S, v11.4S,v1.s[2] +sub v27.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +sqrdmulh v20.4S, v14.4S, v9.s[1] +mul v14.4S, v14.4S,v1.s[1] +sub v6.4s, v28.4s, v13.4s +add v28.4s, v28.4s, v13.4s +sqrdmulh v13.4S, v21.4S, v9.s[0] +mul v21.4S, v21.4S,v1.s[0] +sub v3.4s, v0.4s, v12.4s +add v0.4s, v0.4s, v12.4s +sqrdmulh v12.4S, v4.4S, v7.s[3] +mla v17.4S, v26.4S, v31.s[0] +nop +nop +sqrdmulh v26.4S, v24.4S, v7.s[2] +mla v11.4S, v18.4S, v31.s[0] +nop +nop +sqrdmulh v18.4S, v25.4S, v7.s[1] +mla v14.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v10.4S, v7.s[0] +mla v21.4S, v13.4S, v31.s[0] +nop +nop +mul v24.4S, v24.4S,v2.s[2] +mul v4.4S, v4.4S,v2.s[3] +sub v13.4s, v5.4s, v17.4s +str q13, [x0, #1008] +mla v24.4S, v26.4S, v31.s[0] +mla v4.4S, v12.4S, v31.s[0] +add v5.4s, v5.4s, v17.4s +str q5, [x0, #944] +mul v10.4S, v10.4S,v2.s[0] +mul v25.4S, v25.4S,v2.s[1] +sub v5.4s, v8.4s, v11.4s +str q5, [x0, #880] +mla v10.4S, v20.4S, v31.s[0] +mla v25.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v11.4s +sub v11.4s, v27.4s, v14.4s +ldr q18, [x0, #960] +sqrdmulh v20.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +add v27.4s, v27.4s, v14.4s +str q8, [x0, #816] +ldr q8, [x0, #896] +sqrdmulh v14.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v5.4s, v19.4s, v21.4s +str q11, [x0, #752] +ldr q11, [x0, #832] +sqrdmulh v17.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +add v19.4s, v19.4s, v21.4s +str q27, [x0, #688] +ldr q27, [x0, #768] +sqrdmulh v21.4S, v27.4S, v29.s[0] +mul v27.4S, v27.4S,v30.s[0] +str q5, [x0, #624] +sub v5.4s, v6.4s, v4.4s +ldr q12, [x0, #704] +sqrdmulh v26.4S, v12.4S, v29.s[0] +mla v18.4S, v20.4S, v31.s[0] +add v6.4s, v6.4s, v4.4s +str q19, [x0, #560] +ldr q19, [x0, #640] +sqrdmulh v4.4S, v19.4S, v29.s[0] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v28.4s, v24.4s +str q5, [x0, #496] +ldr q5, [x0, #576] +sqrdmulh v20.4S, v5.4S, 
v29.s[0] +mla v11.4S, v17.4S, v31.s[0] +add v28.4s, v28.4s, v24.4s +str q6, [x0, #432] +ldr q6, [x0, #512] +sqrdmulh v24.4S, v6.4S, v29.s[0] +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v3.4s, v25.4s +str q14, [x0, #368] +ldr q14, [x0, #448] +ldr q17, [x0, #384] +mul v19.4S, v19.4S,v30.s[0] +mul v12.4S, v12.4S,v30.s[0] +add v3.4s, v3.4s, v25.4s +str q28, [x0, #304] +ldr q28, [x0, #320] +ldr q25, [x0, #256] +mla v19.4S, v4.4S, v31.s[0] +mla v12.4S, v26.4S, v31.s[0] +sub v26.4s, v0.4s, v10.4s +str q21, [x0, #240] +ldr q21, [x0, #192] +ldr q4, [x0, #128] +mul v6.4S, v6.4S,v30.s[0] +mul v5.4S, v5.4S,v30.s[0] +add v0.4s, v0.4s, v10.4s +str q3, [x0, #176] +ldr q3, [x0, #64] +ldr q10, [x0, #0] +mla v6.4S, v24.4S, v31.s[0] +mla v5.4S, v20.4S, v31.s[0] +sub v20.4s, v14.4s, v18.4s +add v14.4s, v14.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v24.4s, v17.4s, v8.4s +add v17.4s, v17.4s, v8.4s +sqrdmulh v8.4S, v24.4S, v29.s[2] +mul v24.4S, v24.4S,v30.s[2] +sub v13.4s, v28.4s, v11.4s +add v28.4s, v28.4s, v11.4s +sqrdmulh v11.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +sub v15.4s, v25.4s, v27.4s +add v25.4s, v25.4s, v27.4s +sqrdmulh v27.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v16.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v13.4S, v29.s[2] +mla v20.4S, v18.4S, v31.s[0] +sub v18.4s, v4.4s, v19.4s +add v4.4s, v4.4s, v19.4s +sqrdmulh v19.4S, v15.4S, v29.s[2] +mla v24.4S, v8.4S, v31.s[0] +sub v8.4s, v3.4s, v5.4s +add v3.4s, v3.4s, v5.4s +sqrdmulh v5.4S, v28.4S, v29.s[1] +mla v14.4S, v11.4S, v31.s[0] +str q26, [x0, #112] +sub v26.4s, v10.4s, v6.4s +sqrdmulh v11.4S, v25.4S, v29.s[1] +mla v17.4S, v27.4S, v31.s[0] +str q0, [x0, #48] +add v10.4s, v10.4s, v6.4s +mul v15.4S, v15.4S,v30.s[2] +mul v13.4S, v13.4S,v30.s[2] +sub v6.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +mla v15.4S, v19.4S, v31.s[0] +mla v13.4S, v12.4S, v31.s[0] +sub v12.4s, v18.4s, v24.4s +add v18.4s, v18.4s, v24.4s +mul v25.4S, v25.4S,v30.s[1] 
+mul v28.4S, v28.4S,v30.s[1] +sub v24.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +mla v25.4S, v11.4S, v31.s[0] +mla v28.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v17.4s +add v4.4s, v4.4s, v17.4s +sqrdmulh v17.4S, v6.4S, v22.s[3] +mul v6.4S, v6.4S,v23.s[3] +sub v11.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v22.s[2] +mul v16.4S, v16.4S,v23.s[2] +sub v14.4s, v26.4s, v15.4s +add v26.4s, v26.4s, v15.4s +sqrdmulh v15.4S, v24.4S, v22.s[1] +mul v24.4S, v24.4S,v23.s[1] +sub v19.4s, v3.4s, v28.4s +add v3.4s, v3.4s, v28.4s +sqrdmulh v28.4S, v21.4S, v22.s[0] +mul v21.4S, v21.4S,v23.s[0] +sub v20.4s, v10.4s, v25.4s +add v10.4s, v10.4s, v25.4s +sqrdmulh v25.4S, v12.4S, v22.s[3] +mla v6.4S, v17.4S, v31.s[0] +nop +nop +sqrdmulh v17.4S, v18.4S, v22.s[2] +mla v16.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v5.4S, v22.s[1] +mla v24.4S, v15.4S, v31.s[0] +nop +nop +sqrdmulh v15.4S, v4.4S, v22.s[0] +mla v21.4S, v28.4S, v31.s[0] +nop +nop +mul v18.4S, v18.4S,v23.s[2] +mul v12.4S, v12.4S,v23.s[3] +sub v28.4s, v11.4s, v6.4s +add v11.4s, v11.4s, v6.4s +mla v18.4S, v17.4S, v31.s[0] +mla v12.4S, v25.4S, v31.s[0] +sub v25.4s, v8.4s, v16.4s +add v8.4s, v8.4s, v16.4s +mul v4.4S, v4.4S,v23.s[0] +mul v5.4S, v5.4S,v23.s[1] +sub v16.4s, v19.4s, v24.4s +add v19.4s, v19.4s, v24.4s +mla v4.4S, v15.4S, v31.s[0] +mla v5.4S, v13.4S, v31.s[0] +sub v13.4s, v3.4s, v21.4s +add v3.4s, v3.4s, v21.4s +sqrdmulh v21.4S, v28.4S, v9.s[3] +mul v28.4S, v28.4S,v1.s[3] +sub v15.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +sqrdmulh v12.4S, v11.4S, v9.s[2] +mul v11.4S, v11.4S,v1.s[2] +sub v24.4s, v26.4s, v18.4s +add v26.4s, v26.4s, v18.4s +sqrdmulh v18.4S, v25.4S, v9.s[1] +mul v25.4S, v25.4S,v1.s[1] +sub v17.4s, v20.4s, v5.4s +add v20.4s, v20.4s, v5.4s +sqrdmulh v5.4S, v8.4S, v9.s[0] +mul v8.4S, v8.4S,v1.s[0] +sub v6.4s, v10.4s, v4.4s +add v10.4s, v10.4s, v4.4s +sqrdmulh v4.4S, v16.4S, v7.s[3] +mla v28.4S, v21.4S, v31.s[0] +nop +nop +sqrdmulh v21.4S, v19.4S, v7.s[2] +mla v11.4S, 
v12.4S, v31.s[0] +nop +nop +sqrdmulh v12.4S, v13.4S, v7.s[1] +mla v25.4S, v18.4S, v31.s[0] +nop +nop +sqrdmulh v18.4S, v3.4S, v7.s[0] +mla v8.4S, v5.4S, v31.s[0] +nop +nop +mul v19.4S, v19.4S,v2.s[2] +mul v16.4S, v16.4S,v2.s[3] +sub v5.4s, v15.4s, v28.4s +str q5, [x0, #960] +mla v19.4S, v21.4S, v31.s[0] +mla v16.4S, v4.4S, v31.s[0] +add v15.4s, v15.4s, v28.4s +str q15, [x0, #896] +mul v3.4S, v3.4S,v2.s[0] +mul v13.4S, v13.4S,v2.s[1] +sub v15.4s, v14.4s, v11.4s +str q15, [x0, #832] +mla v3.4S, v18.4S, v31.s[0] +mla v13.4S, v12.4S, v31.s[0] +add v14.4s, v14.4s, v11.4s +sub v11.4s, v24.4s, v25.4s +ldr q12, [x0, #976] +sqrdmulh v18.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +add v24.4s, v24.4s, v25.4s +str q14, [x0, #768] +ldr q14, [x0, #912] +sqrdmulh v25.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v15.4s, v26.4s, v8.4s +str q11, [x0, #704] +ldr q11, [x0, #848] +sqrdmulh v28.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +add v26.4s, v26.4s, v8.4s +str q24, [x0, #640] +ldr q24, [x0, #784] +sqrdmulh v8.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +str q15, [x0, #576] +sub v15.4s, v17.4s, v16.4s +ldr q4, [x0, #720] +sqrdmulh v21.4S, v4.4S, v29.s[0] +mla v12.4S, v18.4S, v31.s[0] +add v17.4s, v17.4s, v16.4s +str q26, [x0, #512] +ldr q26, [x0, #656] +sqrdmulh v16.4S, v26.4S, v29.s[0] +mla v14.4S, v25.4S, v31.s[0] +sub v25.4s, v20.4s, v19.4s +str q15, [x0, #448] +ldr q15, [x0, #592] +sqrdmulh v18.4S, v15.4S, v29.s[0] +mla v11.4S, v28.4S, v31.s[0] +add v20.4s, v20.4s, v19.4s +str q17, [x0, #384] +ldr q17, [x0, #528] +sqrdmulh v19.4S, v17.4S, v29.s[0] +mla v24.4S, v8.4S, v31.s[0] +sub v8.4s, v6.4s, v13.4s +str q25, [x0, #320] +ldr q25, [x0, #464] +ldr q28, [x0, #400] +mul v26.4S, v26.4S,v30.s[0] +mul v4.4S, v4.4S,v30.s[0] +add v6.4s, v6.4s, v13.4s +str q20, [x0, #256] +ldr q20, [x0, #336] +ldr q13, [x0, #272] +mla v26.4S, v16.4S, v31.s[0] +mla v4.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v3.4s +str q8, [x0, #192] +ldr q8, [x0, #208] +ldr q16, [x0, 
#144] +mul v17.4S, v17.4S,v30.s[0] +mul v15.4S, v15.4S,v30.s[0] +add v10.4s, v10.4s, v3.4s +str q6, [x0, #128] +ldr q6, [x0, #80] +ldr q3, [x0, #16] +mla v17.4S, v19.4S, v31.s[0] +mla v15.4S, v18.4S, v31.s[0] +sub v18.4s, v25.4s, v12.4s +add v25.4s, v25.4s, v12.4s +sqrdmulh v12.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v19.4s, v28.4s, v14.4s +add v28.4s, v28.4s, v14.4s +sqrdmulh v14.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v5.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +sqrdmulh v11.4S, v25.4S, v29.s[1] +mul v25.4S, v25.4S,v30.s[1] +sub v0.4s, v13.4s, v24.4s +add v13.4s, v13.4s, v24.4s +sqrdmulh v24.4S, v28.4S, v29.s[1] +mul v28.4S, v28.4S,v30.s[1] +sub v27.4s, v8.4s, v4.4s +add v8.4s, v8.4s, v4.4s +sqrdmulh v4.4S, v5.4S, v29.s[2] +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v16.4s, v26.4s +add v16.4s, v16.4s, v26.4s +sqrdmulh v26.4S, v0.4S, v29.s[2] +mla v19.4S, v14.4S, v31.s[0] +sub v14.4s, v6.4s, v15.4s +add v6.4s, v6.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v29.s[1] +mla v25.4S, v11.4S, v31.s[0] +str q21, [x0, #64] +sub v21.4s, v3.4s, v17.4s +sqrdmulh v11.4S, v13.4S, v29.s[1] +mla v28.4S, v24.4S, v31.s[0] +str q10, [x0, #0] +add v3.4s, v3.4s, v17.4s +mul v0.4S, v0.4S,v30.s[2] +mul v5.4S, v5.4S,v30.s[2] +sub v17.4s, v27.4s, v18.4s +add v27.4s, v27.4s, v18.4s +mla v0.4S, v26.4S, v31.s[0] +mla v5.4S, v4.4S, v31.s[0] +sub v4.4s, v12.4s, v19.4s +add v12.4s, v12.4s, v19.4s +mul v13.4S, v13.4S,v30.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v19.4s, v8.4s, v25.4s +add v8.4s, v8.4s, v25.4s +mla v13.4S, v11.4S, v31.s[0] +mla v20.4S, v15.4S, v31.s[0] +sub v15.4s, v16.4s, v28.4s +add v16.4s, v16.4s, v28.4s +sqrdmulh v29.4S, v17.4S, v22.s[3] +mul v17.4S, v17.4S,v23.s[3] +sub v30.4s, v14.4s, v5.4s +add v14.4s, v14.4s, v5.4s +sqrdmulh v5.4S, v27.4S, v22.s[2] +mul v27.4S, v27.4S,v23.s[2] +sub v28.4s, v21.4s, v0.4s +add v21.4s, v21.4s, v0.4s +sqrdmulh v0.4S, v19.4S, v22.s[1] +mul v19.4S, v19.4S,v23.s[1] +sub v11.4s, v6.4s, v20.4s +add v6.4s, v6.4s, v20.4s 
+sqrdmulh v20.4S, v8.4S, v22.s[0] +mul v8.4S, v8.4S,v23.s[0] +sub v25.4s, v3.4s, v13.4s +add v3.4s, v3.4s, v13.4s +sqrdmulh v13.4S, v4.4S, v22.s[3] +mla v17.4S, v29.4S, v31.s[0] +nop +nop +sqrdmulh v29.4S, v12.4S, v22.s[2] +mla v27.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v15.4S, v22.s[1] +mla v19.4S, v0.4S, v31.s[0] +nop +nop +sqrdmulh v0.4S, v16.4S, v22.s[0] +mla v8.4S, v20.4S, v31.s[0] +nop +nop +mul v12.4S, v12.4S,v23.s[2] +mul v4.4S, v4.4S,v23.s[3] +sub v20.4s, v30.4s, v17.4s +add v30.4s, v30.4s, v17.4s +mla v12.4S, v29.4S, v31.s[0] +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v14.4s, v27.4s +add v14.4s, v14.4s, v27.4s +mul v16.4S, v16.4S,v23.s[0] +mul v15.4S, v15.4S,v23.s[1] +sub v27.4s, v11.4s, v19.4s +add v11.4s, v11.4s, v19.4s +mla v16.4S, v0.4S, v31.s[0] +mla v15.4S, v5.4S, v31.s[0] +sub v5.4s, v6.4s, v8.4s +add v6.4s, v6.4s, v8.4s +sqrdmulh v22.4S, v20.4S, v9.s[3] +mul v20.4S, v20.4S,v1.s[3] +sub v23.4s, v28.4s, v4.4s +add v28.4s, v28.4s, v4.4s +sqrdmulh v4.4S, v30.4S, v9.s[2] +mul v30.4S, v30.4S,v1.s[2] +sub v8.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v13.4S, v9.s[1] +mul v13.4S, v13.4S,v1.s[1] +sub v0.4s, v25.4s, v15.4s +add v25.4s, v25.4s, v15.4s +sqrdmulh v15.4S, v14.4S, v9.s[0] +mul v14.4S, v14.4S,v1.s[0] +sub v19.4s, v3.4s, v16.4s +add v3.4s, v3.4s, v16.4s +sqrdmulh v9.4S, v27.4S, v7.s[3] +mla v20.4S, v22.4S, v31.s[0] +nop +nop +sqrdmulh v22.4S, v11.4S, v7.s[2] +mla v30.4S, v4.4S, v31.s[0] +nop +nop +sqrdmulh v4.4S, v5.4S, v7.s[1] +mla v13.4S, v12.4S, v31.s[0] +nop +nop +sqrdmulh v12.4S, v6.4S, v7.s[0] +mla v14.4S, v15.4S, v31.s[0] +nop +nop +mul v11.4S, v11.4S,v2.s[2] +mul v27.4S, v27.4S,v2.s[3] +sub v15.4s, v23.4s, v20.4s +str q15, [x0, #976] +mla v11.4S, v22.4S, v31.s[0] +mla v27.4S, v9.4S, v31.s[0] +add v23.4s, v23.4s, v20.4s +str q23, [x0, #912] +mul v6.4S, v6.4S,v2.s[0] +mul v5.4S, v5.4S,v2.s[1] +sub v23.4s, v28.4s, v30.4s +str q23, [x0, #848] +mla v6.4S, v12.4S, v31.s[0] +mla v5.4S, v4.4S, v31.s[0] +add v28.4s, 
v28.4s, v30.4s +sub v30.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +str q28, [x0, #784] +sub v28.4s, v21.4s, v14.4s +str q30, [x0, #720] +add v21.4s, v21.4s, v14.4s +str q8, [x0, #656] +str q28, [x0, #592] +sub v28.4s, v0.4s, v27.4s +add v0.4s, v0.4s, v27.4s +str q21, [x0, #528] +sub v21.4s, v25.4s, v11.4s +str q28, [x0, #464] +add v25.4s, v25.4s, v11.4s +str q0, [x0, #400] +sub v0.4s, v19.4s, v5.4s +str q21, [x0, #336] +add v19.4s, v19.4s, v5.4s +str q25, [x0, #272] +sub v25.4s, v3.4s, v6.4s +str q0, [x0, #208] +add v3.4s, v3.4s, v6.4s +str q19, [x0, #144] +str q25, [x0, #80] +str q3, [x0, #16] +ldr q24, [x0, #224] +ldr q10, [x0, #160] +ldr q18, [x0, #32] +ldr q26, [x17, #+128] +ldr q17, [x17, #+144] +sqrdmulh v29.4S, v18.4S, v17.s[0] +mul v18.4S, v18.4S,v26.s[0] +ldr q16, [x0, #48] +sqrdmulh v1.4S, v16.4S, v17.s[0] +mul v16.4S, v16.4S,v26.s[0] +ldr q15, [x17, #+160] +ldr q22, [x17, #+176] +ldr q9, [x0, #96] +sqrdmulh v20.4S, v9.4S, v22.s[0] +mul v9.4S, v9.4S,v15.s[0] +ldr q23, [x0, #112] +sqrdmulh v12.4S, v23.4S, v22.s[0] +mul v23.4S, v23.4S,v15.s[0] +ldr q4, [x17, #+192] +ldr q2, [x17, #+208] +mla v18.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v10.4S, v2.s[0] +ldr q7, [x0, #176] +mla v16.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v7.4S, v2.s[0] +ldr q13, [x17, #+224] +ldr q30, [x17, #+240] +mla v9.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v24.4S, v30.s[0] +ldr q14, [x0, #240] +mla v23.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v14.4S, v30.s[0] +ldr q8, [x0, #0] +ldr q27, [x0, #128] +mul v10.4S, v10.4S,v4.s[0] +sub v28.4s, v8.4s, v18.4s +ldr q11, [x0, #16] +mul v7.4S, v7.4S,v4.s[0] +add v8.4s, v8.4s, v18.4s +ldr q18, [x0, #144] +mla v10.4S, v29.4S, v31.s[0] +sub v29.4s, v11.4s, v16.4s +ldr q21, [x0, #64] +mla v7.4S, v1.4S, v31.s[0] +add v11.4s, v11.4s, v16.4s +ldr q16, [x0, #192] +mul v24.4S, v24.4S,v13.s[0] +sub v1.4s, v21.4s, v9.4s +ldr q5, [x0, #80] +mul v14.4S, v14.4S,v13.s[0] +add v21.4s, v21.4s, v9.4s +ldr q9, [x0, #208] +mla v24.4S, v20.4S, v31.s[0] +mla v14.4S, v12.4S, 
v31.s[0] +sub v12.4s, v5.4s, v23.4s +sqrdmulh v20.4S, v11.4S, v17.s[1] +add v5.4s, v5.4s, v23.4s +mul v11.4S, v11.4S,v26.s[1] +sqrdmulh v23.4S, v29.4S, v17.s[2] +sub v0.4s, v27.4s, v10.4s +mul v29.4S, v29.4S,v26.s[2] +add v27.4s, v27.4s, v10.4s +sqrdmulh v17.4S, v5.4S, v22.s[1] +sub v26.4s, v18.4s, v7.4s +mul v5.4S, v5.4S,v15.s[1] +add v18.4s, v18.4s, v7.4s +sqrdmulh v7.4S, v12.4S, v22.s[2] +sub v10.4s, v16.4s, v24.4s +mul v12.4S, v12.4S,v15.s[2] +add v16.4s, v16.4s, v24.4s +mla v11.4S, v20.4S, v31.s[0] +sub v20.4s, v9.4s, v14.4s +ldr q22, [x0, #480] +sqrdmulh v15.4S, v18.4S, v2.s[1] +add v9.4s, v9.4s, v14.4s +mla v29.4S, v23.4S, v31.s[0] +ldr q23, [x0, #416] +sqrdmulh v14.4S, v26.4S, v2.s[2] +sub v24.4s, v8.4s, v11.4s +mla v5.4S, v17.4S, v31.s[0] +ldr q17, [x0, #288] +sqrdmulh v6.4S, v9.4S, v30.s[1] +add v8.4s, v8.4s, v11.4s +str q24, [x0, #16] +mla v12.4S, v7.4S, v31.s[0] +ldr q7, [x17, #+256] +ldr q24, [x17, #+272] +sqrdmulh v11.4S, v20.4S, v30.s[2] +sub v19.4s, v28.4s, v29.4s +str q8, [x0, #0] +mul v18.4S, v18.4S,v4.s[1] +add v28.4s, v28.4s, v29.4s +mul v26.4S, v26.4S,v4.s[2] +str q19, [x0, #48] +mla v18.4S, v15.4S, v31.s[0] +sub v15.4s, v21.4s, v5.4s +mla v26.4S, v14.4S, v31.s[0] +str q28, [x0, #32] +mul v9.4S, v9.4S,v13.s[1] +str q15, [x0, #80] +mul v20.4S, v20.4S,v13.s[2] +add v21.4s, v21.4s, v5.4s +str q21, [x0, #64] +mla v9.4S, v6.4S, v31.s[0] +sub v6.4s, v1.4s, v12.4s +str q6, [x0, #112] +mla v20.4S, v11.4S, v31.s[0] +add v1.4s, v1.4s, v12.4s +str q1, [x0, #96] +sqrdmulh v30.4S, v17.4S, v24.s[0] +sub v13.4s, v27.4s, v18.4s +mul v17.4S, v17.4S,v7.s[0] +str q13, [x0, #144] +ldr q13, [x0, #304] +sqrdmulh v1.4S, v13.4S, v24.s[0] +add v27.4s, v27.4s, v18.4s +mul v13.4S, v13.4S,v7.s[0] +str q27, [x0, #128] +ldr q27, [x17, #+288] +ldr q18, [x17, #+304] +ldr q12, [x0, #352] +sqrdmulh v11.4S, v12.4S, v18.s[0] +sub v6.4s, v0.4s, v26.4s +mul v12.4S, v12.4S,v27.s[0] +str q6, [x0, #176] +ldr q6, [x0, #368] +sqrdmulh v21.4S, v6.4S, v18.s[0] +add v0.4s, v0.4s, v26.4s 
+mul v6.4S, v6.4S,v27.s[0] +str q0, [x0, #160] +ldr q0, [x17, #+320] +ldr q26, [x17, #+336] +mla v17.4S, v30.4S, v31.s[0] +sub v30.4s, v16.4s, v9.4s +sqrdmulh v5.4S, v23.4S, v26.s[0] +str q30, [x0, #208] +ldr q30, [x0, #432] +mla v13.4S, v1.4S, v31.s[0] +add v16.4s, v16.4s, v9.4s +sqrdmulh v9.4S, v30.4S, v26.s[0] +str q16, [x0, #192] +ldr q16, [x17, #+352] +ldr q1, [x17, #+368] +mla v12.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v20.4s +sqrdmulh v15.4S, v22.4S, v1.s[0] +str q11, [x0, #240] +ldr q11, [x0, #496] +mla v6.4S, v21.4S, v31.s[0] +add v10.4s, v10.4s, v20.4s +sqrdmulh v20.4S, v11.4S, v1.s[0] +str q10, [x0, #224] +ldr q10, [x0, #256] +ldr q21, [x0, #384] +mul v23.4S, v23.4S,v0.s[0] +sub v2.4s, v10.4s, v17.4s +ldr q4, [x0, #272] +mul v30.4S, v30.4S,v0.s[0] +add v10.4s, v10.4s, v17.4s +ldr q17, [x0, #400] +mla v23.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v13.4s +ldr q28, [x0, #320] +mla v30.4S, v9.4S, v31.s[0] +add v4.4s, v4.4s, v13.4s +ldr q13, [x0, #448] +mul v22.4S, v22.4S,v16.s[0] +sub v9.4s, v28.4s, v12.4s +ldr q14, [x0, #336] +mul v11.4S, v11.4S,v16.s[0] +add v28.4s, v28.4s, v12.4s +ldr q12, [x0, #464] +mla v22.4S, v15.4S, v31.s[0] +mla v11.4S, v20.4S, v31.s[0] +sub v20.4s, v14.4s, v6.4s +sqrdmulh v15.4S, v4.4S, v24.s[1] +add v14.4s, v14.4s, v6.4s +mul v4.4S, v4.4S,v7.s[1] +sqrdmulh v6.4S, v5.4S, v24.s[2] +sub v19.4s, v21.4s, v23.4s +mul v5.4S, v5.4S,v7.s[2] +add v21.4s, v21.4s, v23.4s +sqrdmulh v24.4S, v14.4S, v18.s[1] +sub v7.4s, v17.4s, v30.4s +mul v14.4S, v14.4S,v27.s[1] +add v17.4s, v17.4s, v30.4s +sqrdmulh v30.4S, v20.4S, v18.s[2] +sub v23.4s, v13.4s, v22.4s +mul v20.4S, v20.4S,v27.s[2] +add v13.4s, v13.4s, v22.4s +mla v4.4S, v15.4S, v31.s[0] +sub v15.4s, v12.4s, v11.4s +ldr q18, [x0, #736] +sqrdmulh v27.4S, v17.4S, v26.s[1] +add v12.4s, v12.4s, v11.4s +mla v5.4S, v6.4S, v31.s[0] +ldr q6, [x0, #672] +sqrdmulh v11.4S, v7.4S, v26.s[2] +sub v22.4s, v10.4s, v4.4s +mla v14.4S, v24.4S, v31.s[0] +ldr q24, [x0, #544] +sqrdmulh v29.4S, v12.4S, v1.s[1] +add 
v10.4s, v10.4s, v4.4s +str q22, [x0, #272] +mla v20.4S, v30.4S, v31.s[0] +ldr q30, [x17, #+384] +ldr q22, [x17, #+400] +sqrdmulh v4.4S, v15.4S, v1.s[2] +sub v8.4s, v2.4s, v5.4s +str q10, [x0, #256] +mul v17.4S, v17.4S,v0.s[1] +add v2.4s, v2.4s, v5.4s +mul v7.4S, v7.4S,v0.s[2] +str q8, [x0, #304] +mla v17.4S, v27.4S, v31.s[0] +sub v27.4s, v28.4s, v14.4s +mla v7.4S, v11.4S, v31.s[0] +str q2, [x0, #288] +mul v12.4S, v12.4S,v16.s[1] +str q27, [x0, #336] +mul v15.4S, v15.4S,v16.s[2] +add v28.4s, v28.4s, v14.4s +str q28, [x0, #320] +mla v12.4S, v29.4S, v31.s[0] +sub v29.4s, v9.4s, v20.4s +str q29, [x0, #368] +mla v15.4S, v4.4S, v31.s[0] +add v9.4s, v9.4s, v20.4s +str q9, [x0, #352] +sqrdmulh v1.4S, v24.4S, v22.s[0] +sub v16.4s, v21.4s, v17.4s +mul v24.4S, v24.4S,v30.s[0] +str q16, [x0, #400] +ldr q16, [x0, #560] +sqrdmulh v9.4S, v16.4S, v22.s[0] +add v21.4s, v21.4s, v17.4s +mul v16.4S, v16.4S,v30.s[0] +str q21, [x0, #384] +ldr q21, [x17, #+416] +ldr q17, [x17, #+432] +ldr q20, [x0, #608] +sqrdmulh v4.4S, v20.4S, v17.s[0] +sub v29.4s, v19.4s, v7.4s +mul v20.4S, v20.4S,v21.s[0] +str q29, [x0, #432] +ldr q29, [x0, #624] +sqrdmulh v28.4S, v29.4S, v17.s[0] +add v19.4s, v19.4s, v7.4s +mul v29.4S, v29.4S,v21.s[0] +str q19, [x0, #416] +ldr q19, [x17, #+448] +ldr q7, [x17, #+464] +mla v24.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v12.4s +sqrdmulh v14.4S, v6.4S, v7.s[0] +str q1, [x0, #464] +ldr q1, [x0, #688] +mla v16.4S, v9.4S, v31.s[0] +add v13.4s, v13.4s, v12.4s +sqrdmulh v12.4S, v1.4S, v7.s[0] +str q13, [x0, #448] +ldr q13, [x17, #+480] +ldr q9, [x17, #+496] +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v23.4s, v15.4s +sqrdmulh v27.4S, v18.4S, v9.s[0] +str q4, [x0, #496] +ldr q4, [x0, #752] +mla v29.4S, v28.4S, v31.s[0] +add v23.4s, v23.4s, v15.4s +sqrdmulh v15.4S, v4.4S, v9.s[0] +str q23, [x0, #480] +ldr q23, [x0, #512] +ldr q28, [x0, #640] +mul v6.4S, v6.4S,v19.s[0] +sub v26.4s, v23.4s, v24.4s +ldr q0, [x0, #528] +mul v1.4S, v1.4S,v19.s[0] +add v23.4s, v23.4s, v24.4s +ldr q24, [x0, 
#656] +mla v6.4S, v14.4S, v31.s[0] +sub v14.4s, v0.4s, v16.4s +ldr q2, [x0, #576] +mla v1.4S, v12.4S, v31.s[0] +add v0.4s, v0.4s, v16.4s +ldr q16, [x0, #704] +mul v18.4S, v18.4S,v13.s[0] +sub v12.4s, v2.4s, v20.4s +ldr q11, [x0, #592] +mul v4.4S, v4.4S,v13.s[0] +add v2.4s, v2.4s, v20.4s +ldr q20, [x0, #720] +mla v18.4S, v27.4S, v31.s[0] +mla v4.4S, v15.4S, v31.s[0] +sub v15.4s, v11.4s, v29.4s +sqrdmulh v27.4S, v0.4S, v22.s[1] +add v11.4s, v11.4s, v29.4s +mul v0.4S, v0.4S,v30.s[1] +sqrdmulh v29.4S, v14.4S, v22.s[2] +sub v8.4s, v28.4s, v6.4s +mul v14.4S, v14.4S,v30.s[2] +add v28.4s, v28.4s, v6.4s +sqrdmulh v22.4S, v11.4S, v17.s[1] +sub v30.4s, v24.4s, v1.4s +mul v11.4S, v11.4S,v21.s[1] +add v24.4s, v24.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v17.s[2] +sub v6.4s, v16.4s, v18.4s +mul v15.4S, v15.4S,v21.s[2] +add v16.4s, v16.4s, v18.4s +mla v0.4S, v27.4S, v31.s[0] +sub v27.4s, v20.4s, v4.4s +ldr q17, [x0, #992] +sqrdmulh v21.4S, v24.4S, v7.s[1] +add v20.4s, v20.4s, v4.4s +mla v14.4S, v29.4S, v31.s[0] +ldr q29, [x0, #928] +sqrdmulh v4.4S, v30.4S, v7.s[2] +sub v18.4s, v23.4s, v0.4s +mla v11.4S, v22.4S, v31.s[0] +ldr q22, [x0, #800] +sqrdmulh v5.4S, v20.4S, v9.s[1] +add v23.4s, v23.4s, v0.4s +str q18, [x0, #528] +mla v15.4S, v1.4S, v31.s[0] +ldr q1, [x17, #+512] +ldr q18, [x17, #+528] +sqrdmulh v0.4S, v27.4S, v9.s[2] +sub v10.4s, v26.4s, v14.4s +str q23, [x0, #512] +mul v24.4S, v24.4S,v19.s[1] +add v26.4s, v26.4s, v14.4s +mul v30.4S, v30.4S,v19.s[2] +str q10, [x0, #560] +mla v24.4S, v21.4S, v31.s[0] +sub v21.4s, v2.4s, v11.4s +mla v30.4S, v4.4S, v31.s[0] +str q26, [x0, #544] +mul v20.4S, v20.4S,v13.s[1] +str q21, [x0, #592] +mul v27.4S, v27.4S,v13.s[2] +add v2.4s, v2.4s, v11.4s +str q2, [x0, #576] +mla v20.4S, v5.4S, v31.s[0] +sub v5.4s, v12.4s, v15.4s +str q5, [x0, #624] +mla v27.4S, v0.4S, v31.s[0] +add v12.4s, v12.4s, v15.4s +str q12, [x0, #608] +sqrdmulh v9.4S, v22.4S, v18.s[0] +sub v13.4s, v28.4s, v24.4s +mul v22.4S, v22.4S,v1.s[0] +str q13, [x0, #656] +ldr q13, [x0, #816] 
+sqrdmulh v12.4S, v13.4S, v18.s[0] +add v28.4s, v28.4s, v24.4s +mul v13.4S, v13.4S,v1.s[0] +str q28, [x0, #640] +ldr q28, [x17, #+544] +ldr q24, [x17, #+560] +ldr q15, [x0, #864] +sqrdmulh v0.4S, v15.4S, v24.s[0] +sub v5.4s, v8.4s, v30.4s +mul v15.4S, v15.4S,v28.s[0] +str q5, [x0, #688] +ldr q5, [x0, #880] +sqrdmulh v2.4S, v5.4S, v24.s[0] +add v8.4s, v8.4s, v30.4s +mul v5.4S, v5.4S,v28.s[0] +str q8, [x0, #672] +ldr q8, [x17, #+576] +ldr q30, [x17, #+592] +mla v22.4S, v9.4S, v31.s[0] +sub v9.4s, v16.4s, v20.4s +sqrdmulh v11.4S, v29.4S, v30.s[0] +str q9, [x0, #720] +ldr q9, [x0, #944] +mla v13.4S, v12.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v9.4S, v30.s[0] +str q16, [x0, #704] +ldr q16, [x17, #+608] +ldr q12, [x17, #+624] +mla v15.4S, v0.4S, v31.s[0] +sub v0.4s, v6.4s, v27.4s +sqrdmulh v21.4S, v17.4S, v12.s[0] +str q0, [x0, #752] +ldr q0, [x0, #1008] +mla v5.4S, v2.4S, v31.s[0] +add v6.4s, v6.4s, v27.4s +sqrdmulh v27.4S, v0.4S, v12.s[0] +str q6, [x0, #736] +ldr q6, [x0, #768] +ldr q2, [x0, #896] +mul v29.4S, v29.4S,v8.s[0] +sub v7.4s, v6.4s, v22.4s +ldr q19, [x0, #784] +mul v9.4S, v9.4S,v8.s[0] +add v6.4s, v6.4s, v22.4s +ldr q22, [x0, #912] +mla v29.4S, v11.4S, v31.s[0] +sub v11.4s, v19.4s, v13.4s +ldr q26, [x0, #832] +mla v9.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v13.4s +ldr q13, [x0, #960] +mul v17.4S, v17.4S,v16.s[0] +sub v20.4s, v26.4s, v15.4s +ldr q4, [x0, #848] +mul v0.4S, v0.4S,v16.s[0] +add v26.4s, v26.4s, v15.4s +ldr q15, [x0, #976] +mla v17.4S, v21.4S, v31.s[0] +mla v0.4S, v27.4S, v31.s[0] +sub v27.4s, v4.4s, v5.4s +sqrdmulh v21.4S, v19.4S, v18.s[1] +add v4.4s, v4.4s, v5.4s +mul v19.4S, v19.4S,v1.s[1] +sqrdmulh v5.4S, v11.4S, v18.s[2] +sub v10.4s, v2.4s, v29.4s +mul v11.4S, v11.4S,v1.s[2] +add v2.4s, v2.4s, v29.4s +sqrdmulh v18.4S, v4.4S, v24.s[1] +sub v1.4s, v22.4s, v9.4s +mul v4.4S, v4.4S,v28.s[1] +add v22.4s, v22.4s, v9.4s +sqrdmulh v9.4S, v27.4S, v24.s[2] +sub v29.4s, v13.4s, v17.4s +mul v27.4S, v27.4S,v28.s[2] +add v13.4s, v13.4s, 
v17.4s +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v0.4s +sqrdmulh v24.4S, v22.4S, v30.s[1] +add v15.4s, v15.4s, v0.4s +mla v11.4S, v5.4S, v31.s[0] +sqrdmulh v5.4S, v1.4S, v30.s[2] +sub v0.4s, v6.4s, v19.4s +mla v4.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v15.4S, v12.s[1] +add v6.4s, v6.4s, v19.4s +str q0, [x0, #784] +mla v27.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v21.4S, v12.s[2] +sub v0.4s, v7.4s, v11.4s +str q6, [x0, #768] +mul v22.4S, v22.4S,v8.s[1] +add v7.4s, v7.4s, v11.4s +mul v1.4S, v1.4S,v8.s[2] +str q0, [x0, #816] +mla v22.4S, v24.4S, v31.s[0] +sub v24.4s, v26.4s, v4.4s +mla v1.4S, v5.4S, v31.s[0] +str q7, [x0, #800] +mul v15.4S, v15.4S,v16.s[1] +str q24, [x0, #848] +mul v21.4S, v21.4S,v16.s[2] +add v26.4s, v26.4s, v4.4s +str q26, [x0, #832] +mla v15.4S, v18.4S, v31.s[0] +sub v18.4s, v20.4s, v27.4s +str q18, [x0, #880] +mla v21.4S, v9.4S, v31.s[0] +add v20.4s, v20.4s, v27.4s +str q20, [x0, #864] +sub v12.4s, v2.4s, v22.4s +str q12, [x0, #912] +add v2.4s, v2.4s, v22.4s +str q2, [x0, #896] +sub v2.4s, v10.4s, v1.4s +str q2, [x0, #944] +add v10.4s, v10.4s, v1.4s +str q10, [x0, #928] +sub v10.4s, v13.4s, v15.4s +str q10, [x0, #976] +add v13.4s, v13.4s, v15.4s +str q13, [x0, #960] +sub v13.4s, v29.4s, v21.4s +str q13, [x0, #1008] +add v29.4s, v29.4s, v21.4s +str q29, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1528 +// Instruction count: 1524 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_10.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_10.s new file mode 100644 index 0000000..9f6d143 --- /dev/null +++ 
b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_10.s @@ -0,0 +1,1550 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// +#include +modulus: // Reduction constant vector: loaded whole into q31; the mla steps visible in this routine reference only lane 0 (v31.s[0]) +.word -33556993 // lane 0: negated modulus, -33556993 +.word 0 // lanes 1-3: zero fill completing the 16-byte vector +.word 0 +.word 0 +.align 6 // NOTE(review): presumably 2^6 = 64-byte alignment under AArch64 gas semantics - confirm for the assembler in use +roots_merged: // Twiddle table read through x17 in 16-byte quads. Quads alternate: four multipliers used as the by-lane operand of mul, then four companion values used as the by-lane operand of sqrdmulh for the same (layer, block) slots. Each companion looks like round(multiplier * 2^31 / 33556993) - TODO confirm against the generator. Entries tagged Layer None, block None are zero fillers. +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global
ntt_u32_incomplete_neon_asm_var_4_2_22_z4_10 +.global _ntt_u32_incomplete_neon_asm_var_4_2_22_z4_10 +ntt_u32_incomplete_neon_asm_var_4_2_22_z4_10: +_ntt_u32_incomplete_neon_asm_var_4_2_22_z4_10: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x0, #992] +sqrdmulh v27.4S, v28.4S, v29.s[0] +mul v28.4S, v28.4S,v30.s[0] +ldr q26, [x0, #928] +sqrdmulh v25.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +ldr q24, [x0, #864] +sqrdmulh v23.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +ldr q22, [x0, #800] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #736] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mla v28.4S, v27.4S, v31.s[0] +ldr q27, [x0, #672] +sqrdmulh v18.4S, v27.4S, v29.s[0] +mla v26.4S, v25.4S, v31.s[0] +ldr q25, [x0, #608] +sqrdmulh v17.4S, v25.4S, v29.s[0] +mla v24.4S, v23.4S, v31.s[0] +ldr q23, [x0, #544] +sqrdmulh v16.4S, v23.4S, v29.s[0] +mla v22.4S, v21.4S, v31.s[0] +ldr q21, [x0, #480] +mul v27.4S, v27.4S,v30.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q3, [x0, #416] +ldr q2, [x0, #352] +ldr q1, [x0, #288] +mla v27.4S, v18.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +ldr q19, [x0, #224] +ldr q18, [x0, #160] +mul v23.4S, v23.4S,v30.s[0] +mul v25.4S, v25.4S,v30.s[0] +ldr q0, [x0, #96] +ldr q15, [x0, #32] +mla v23.4S, v16.4S, v31.s[0] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +sqrdmulh v28.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v16.4s, v3.4s, v26.4s +add v3.4s, v3.4s, v26.4s +sqrdmulh v26.4S, v16.4S, v29.s[2] +mul v16.4S, 
v16.4S,v30.s[2] +sub v14.4s, v2.4s, v24.4s +add v2.4s, v2.4s, v24.4s +sqrdmulh v24.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v13.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v12.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +sqrdmulh v20.4S, v14.4S, v29.s[2] +mla v17.4S, v28.4S, v31.s[0] +sub v28.4s, v18.4s, v27.4s +add v18.4s, v18.4s, v27.4s +sqrdmulh v27.4S, v13.4S, v29.s[2] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v0.4s, v25.4s +add v0.4s, v0.4s, v25.4s +sqrdmulh v25.4S, v2.4S, v29.s[1] +mla v21.4S, v24.4S, v31.s[0] +sub v24.4s, v15.4s, v23.4s +sqrdmulh v11.4S, v1.4S, v29.s[1] +mla v3.4S, v22.4S, v31.s[0] +add v15.4s, v15.4s, v23.4s +ldr q23, [x17, #+32] +ldr q22, [x17, #+48] +mul v13.4S, v13.4S,v30.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v10.4s, v12.4s, v17.4s +add v12.4s, v12.4s, v17.4s +mla v13.4S, v27.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +sub v20.4s, v28.4s, v16.4s +add v28.4s, v28.4s, v16.4s +mul v1.4S, v1.4S,v30.s[1] +mul v2.4S, v2.4S,v30.s[1] +sub v16.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +mla v1.4S, v11.4S, v31.s[0] +mla v2.4S, v25.4S, v31.s[0] +sub v25.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v22.s[3] +mul v10.4S, v10.4S,v23.s[3] +sub v11.4s, v26.4s, v14.4s +add v26.4s, v26.4s, v14.4s +sqrdmulh v14.4S, v12.4S, v22.s[2] +mul v12.4S, v12.4S,v23.s[2] +sub v21.4s, v24.4s, v13.4s +add v24.4s, v24.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v22.s[1] +mul v16.4S, v16.4S,v23.s[1] +sub v27.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v17.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s +ldr q1, [x17, #+96] +ldr q9, [x17, #+112] +sqrdmulh v8.4S, v20.4S, v22.s[3] +mla v10.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v28.4S, v22.s[2] +mla v12.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v25.4S, v22.s[1] +mla v16.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v18.4S, v22.s[0] +mla 
v19.4S, v2.4S, v31.s[0] +nop +nop +ldr q2, [x17, #+64] +ldr q7, [x17, #+80] +mul v28.4S, v28.4S,v23.s[2] +mul v20.4S, v20.4S,v23.s[3] +sub v6.4s, v11.4s, v10.4s +add v11.4s, v11.4s, v10.4s +mla v28.4S, v3.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v26.4s, v12.4s +add v26.4s, v26.4s, v12.4s +mul v18.4S, v18.4S,v23.s[0] +mul v25.4S, v25.4S,v23.s[1] +sub v12.4s, v27.4s, v16.4s +add v27.4s, v27.4s, v16.4s +mla v18.4S, v13.4S, v31.s[0] +mla v25.4S, v14.4S, v31.s[0] +sub v14.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v9.s[3] +mul v6.4S, v6.4S,v1.s[3] +sub v13.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v11.4S, v9.s[2] +mul v11.4S, v11.4S,v1.s[2] +sub v16.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +sqrdmulh v28.4S, v8.4S, v9.s[1] +mul v8.4S, v8.4S,v1.s[1] +sub v3.4s, v17.4s, v25.4s +add v17.4s, v17.4s, v25.4s +sqrdmulh v25.4S, v26.4S, v9.s[0] +mul v26.4S, v26.4S,v1.s[0] +sub v10.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v7.s[3] +mla v6.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v27.4S, v7.s[2] +mla v11.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v14.4S, v7.s[1] +mla v8.4S, v28.4S, v31.s[0] +nop +nop +sqrdmulh v28.4S, v0.4S, v7.s[0] +mla v26.4S, v25.4S, v31.s[0] +nop +nop +mul v27.4S, v27.4S,v2.s[2] +mul v12.4S, v12.4S,v2.s[3] +sub v25.4s, v13.4s, v6.4s +str q25, [x0, #992] +mla v27.4S, v19.4S, v31.s[0] +mla v12.4S, v18.4S, v31.s[0] +add v13.4s, v13.4s, v6.4s +str q13, [x0, #928] +mul v0.4S, v0.4S,v2.s[0] +mul v14.4S, v14.4S,v2.s[1] +sub v13.4s, v21.4s, v11.4s +str q13, [x0, #864] +mla v0.4S, v28.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sub v11.4s, v16.4s, v8.4s +ldr q20, [x0, #1008] +sqrdmulh v28.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v8.4s +str q21, [x0, #800] +ldr q21, [x0, #944] +sqrdmulh v8.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v13.4s, v24.4s, v26.4s +str q11, [x0, #736] +ldr q11, [x0, 
#880] +sqrdmulh v6.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +add v24.4s, v24.4s, v26.4s +str q16, [x0, #672] +ldr q16, [x0, #816] +sqrdmulh v26.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v18.4s, v3.4s, v12.4s +str q13, [x0, #608] +ldr q13, [x0, #752] +sqrdmulh v19.4S, v13.4S, v29.s[0] +mla v20.4S, v28.4S, v31.s[0] +add v3.4s, v3.4s, v12.4s +str q24, [x0, #544] +ldr q24, [x0, #688] +sqrdmulh v12.4S, v24.4S, v29.s[0] +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v17.4s, v27.4s +str q18, [x0, #480] +ldr q18, [x0, #624] +sqrdmulh v28.4S, v18.4S, v29.s[0] +mla v11.4S, v6.4S, v31.s[0] +add v17.4s, v17.4s, v27.4s +str q3, [x0, #416] +ldr q3, [x0, #560] +sqrdmulh v27.4S, v3.4S, v29.s[0] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v10.4s, v14.4s +str q8, [x0, #352] +ldr q8, [x0, #496] +add v10.4s, v10.4s, v14.4s +mul v24.4S, v24.4S,v30.s[0] +mul v13.4S, v13.4S,v30.s[0] +ldr q14, [x0, #432] +str q17, [x0, #288] +ldr q17, [x0, #368] +ldr q6, [x0, #304] +mla v24.4S, v12.4S, v31.s[0] +mla v13.4S, v19.4S, v31.s[0] +str q26, [x0, #224] +sub v26.4s, v15.4s, v0.4s +ldr q19, [x0, #240] +ldr q12, [x0, #176] +mul v3.4S, v3.4S,v30.s[0] +mul v18.4S, v18.4S,v30.s[0] +str q10, [x0, #160] +add v15.4s, v15.4s, v0.4s +ldr q0, [x0, #112] +ldr q10, [x0, #48] +mla v3.4S, v27.4S, v31.s[0] +mla v18.4S, v28.4S, v31.s[0] +sub v28.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +sqrdmulh v20.4S, v28.4S, v29.s[2] +mul v28.4S, v28.4S,v30.s[2] +sub v27.4s, v14.4s, v21.4s +add v14.4s, v14.4s, v21.4s +sqrdmulh v21.4S, v27.4S, v29.s[2] +mul v27.4S, v27.4S,v30.s[2] +sub v25.4s, v17.4s, v11.4s +add v17.4s, v17.4s, v11.4s +sqrdmulh v11.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v5.4s, v6.4s, v16.4s +add v6.4s, v6.4s, v16.4s +sqrdmulh v16.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +sub v4.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +sqrdmulh v13.4S, v25.4S, v29.s[2] +mla v28.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v24.4s +add v12.4s, v12.4s, v24.4s +sqrdmulh v24.4S, v5.4S, 
v29.s[2] +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v18.4s +add v0.4s, v0.4s, v18.4s +sqrdmulh v18.4S, v17.4S, v29.s[1] +mla v8.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v3.4s +str q26, [x0, #96] +sqrdmulh v26.4S, v6.4S, v29.s[1] +mla v14.4S, v16.4S, v31.s[0] +add v10.4s, v10.4s, v3.4s +str q15, [x0, #32] +mul v5.4S, v5.4S,v30.s[2] +mul v25.4S, v25.4S,v30.s[2] +sub v15.4s, v4.4s, v28.4s +add v4.4s, v4.4s, v28.4s +mla v5.4S, v24.4S, v31.s[0] +mla v25.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v27.4s +add v20.4s, v20.4s, v27.4s +mul v6.4S, v6.4S,v30.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v27.4s, v19.4s, v8.4s +add v19.4s, v19.4s, v8.4s +mla v6.4S, v26.4S, v31.s[0] +mla v17.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v14.4s +add v12.4s, v12.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v22.s[3] +mul v15.4S, v15.4S,v23.s[3] +sub v26.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v4.4S, v22.s[2] +mul v4.4S, v4.4S,v23.s[2] +sub v8.4s, v11.4s, v5.4s +add v11.4s, v11.4s, v5.4s +sqrdmulh v5.4S, v27.4S, v22.s[1] +mul v27.4S, v27.4S,v23.s[1] +sub v24.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v28.4s, v10.4s, v6.4s +add v10.4s, v10.4s, v6.4s +sqrdmulh v6.4S, v13.4S, v22.s[3] +mla v15.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v20.4S, v22.s[2] +mla v4.4S, v25.4S, v31.s[0] +nop +nop +sqrdmulh v25.4S, v18.4S, v22.s[1] +mla v27.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v12.4S, v22.s[0] +mla v19.4S, v17.4S, v31.s[0] +nop +nop +mul v20.4S, v20.4S,v23.s[2] +mul v13.4S, v13.4S,v23.s[3] +sub v17.4s, v26.4s, v15.4s +add v26.4s, v26.4s, v15.4s +mla v20.4S, v14.4S, v31.s[0] +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v21.4s, v4.4s +add v21.4s, v21.4s, v4.4s +mul v12.4S, v12.4S,v23.s[0] +mul v18.4S, v18.4S,v23.s[1] +sub v4.4s, v24.4s, v27.4s +add v24.4s, v24.4s, v27.4s +mla v12.4S, v5.4S, v31.s[0] +mla v18.4S, v25.4S, v31.s[0] +sub v25.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, 
v17.4S, v9.s[3] +mul v17.4S, v17.4S,v1.s[3] +sub v5.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +sqrdmulh v13.4S, v26.4S, v9.s[2] +mul v26.4S, v26.4S,v1.s[2] +sub v27.4s, v11.4s, v20.4s +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v6.4S, v9.s[1] +mul v6.4S, v6.4S,v1.s[1] +sub v14.4s, v28.4s, v18.4s +add v28.4s, v28.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v9.s[0] +mul v21.4S, v21.4S,v1.s[0] +sub v15.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v4.4S, v7.s[3] +mla v17.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v24.4S, v7.s[2] +mla v26.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v25.4S, v7.s[1] +mla v6.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v0.4S, v7.s[0] +mla v21.4S, v18.4S, v31.s[0] +nop +nop +mul v24.4S, v24.4S,v2.s[2] +mul v4.4S, v4.4S,v2.s[3] +sub v18.4s, v5.4s, v17.4s +str q18, [x0, #1008] +mla v24.4S, v19.4S, v31.s[0] +mla v4.4S, v12.4S, v31.s[0] +add v5.4s, v5.4s, v17.4s +str q5, [x0, #944] +mul v0.4S, v0.4S,v2.s[0] +mul v25.4S, v25.4S,v2.s[1] +sub v5.4s, v8.4s, v26.4s +str q5, [x0, #880] +mla v0.4S, v20.4S, v31.s[0] +mla v25.4S, v13.4S, v31.s[0] +add v8.4s, v8.4s, v26.4s +sub v26.4s, v27.4s, v6.4s +ldr q13, [x0, #960] +sqrdmulh v20.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +add v27.4s, v27.4s, v6.4s +str q8, [x0, #816] +ldr q8, [x0, #896] +sqrdmulh v6.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v5.4s, v11.4s, v21.4s +str q26, [x0, #752] +ldr q26, [x0, #832] +sqrdmulh v17.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +add v11.4s, v11.4s, v21.4s +str q27, [x0, #688] +ldr q27, [x0, #768] +sqrdmulh v21.4S, v27.4S, v29.s[0] +mul v27.4S, v27.4S,v30.s[0] +sub v12.4s, v14.4s, v4.4s +str q5, [x0, #624] +ldr q5, [x0, #704] +sqrdmulh v19.4S, v5.4S, v29.s[0] +mla v13.4S, v20.4S, v31.s[0] +add v14.4s, v14.4s, v4.4s +str q11, [x0, #560] +ldr q11, [x0, #640] +sqrdmulh v4.4S, v11.4S, v29.s[0] +mla v8.4S, v6.4S, v31.s[0] +sub v6.4s, v28.4s, v24.4s +str q12, [x0, #496] +ldr q12, [x0, #576] +sqrdmulh v20.4S, v12.4S, 
v29.s[0] +mla v26.4S, v17.4S, v31.s[0] +add v28.4s, v28.4s, v24.4s +str q14, [x0, #432] +ldr q14, [x0, #512] +sqrdmulh v24.4S, v14.4S, v29.s[0] +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v25.4s +str q6, [x0, #368] +ldr q6, [x0, #448] +add v15.4s, v15.4s, v25.4s +mul v11.4S, v11.4S,v30.s[0] +mul v5.4S, v5.4S,v30.s[0] +ldr q25, [x0, #384] +str q28, [x0, #304] +ldr q28, [x0, #320] +ldr q17, [x0, #256] +mla v11.4S, v4.4S, v31.s[0] +mla v5.4S, v19.4S, v31.s[0] +str q21, [x0, #240] +sub v21.4s, v10.4s, v0.4s +ldr q19, [x0, #192] +ldr q4, [x0, #128] +mul v14.4S, v14.4S,v30.s[0] +mul v12.4S, v12.4S,v30.s[0] +str q15, [x0, #176] +add v10.4s, v10.4s, v0.4s +ldr q0, [x0, #64] +ldr q15, [x0, #0] +mla v14.4S, v24.4S, v31.s[0] +mla v12.4S, v20.4S, v31.s[0] +sub v20.4s, v6.4s, v13.4s +add v6.4s, v6.4s, v13.4s +sqrdmulh v13.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v24.4s, v25.4s, v8.4s +add v25.4s, v25.4s, v8.4s +sqrdmulh v8.4S, v24.4S, v29.s[2] +mul v24.4S, v24.4S,v30.s[2] +sub v18.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sqrdmulh v26.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +sub v3.4s, v17.4s, v27.4s +add v17.4s, v17.4s, v27.4s +sqrdmulh v27.4S, v25.4S, v29.s[1] +mul v25.4S, v25.4S,v30.s[1] +sub v16.4s, v19.4s, v5.4s +add v19.4s, v19.4s, v5.4s +sqrdmulh v5.4S, v18.4S, v29.s[2] +mla v20.4S, v13.4S, v31.s[0] +sub v13.4s, v4.4s, v11.4s +add v4.4s, v4.4s, v11.4s +sqrdmulh v11.4S, v3.4S, v29.s[2] +mla v24.4S, v8.4S, v31.s[0] +sub v8.4s, v0.4s, v12.4s +add v0.4s, v0.4s, v12.4s +sqrdmulh v12.4S, v28.4S, v29.s[1] +mla v6.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v14.4s +str q21, [x0, #112] +sqrdmulh v21.4S, v17.4S, v29.s[1] +mla v25.4S, v27.4S, v31.s[0] +add v15.4s, v15.4s, v14.4s +str q10, [x0, #48] +mul v3.4S, v3.4S,v30.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v10.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +mla v3.4S, v11.4S, v31.s[0] +mla v18.4S, v5.4S, v31.s[0] +sub v5.4s, v13.4s, v24.4s +add v13.4s, v13.4s, v24.4s +mul v17.4S, v17.4S,v30.s[1] 
+mul v28.4S, v28.4S,v30.s[1] +sub v24.4s, v19.4s, v6.4s +add v19.4s, v19.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v28.4S, v12.4S, v31.s[0] +sub v12.4s, v4.4s, v25.4s +add v4.4s, v4.4s, v25.4s +sqrdmulh v25.4S, v10.4S, v22.s[3] +mul v10.4S, v10.4S,v23.s[3] +sub v21.4s, v8.4s, v18.4s +add v8.4s, v8.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v22.s[2] +mul v16.4S, v16.4S,v23.s[2] +sub v6.4s, v26.4s, v3.4s +add v26.4s, v26.4s, v3.4s +sqrdmulh v3.4S, v24.4S, v22.s[1] +mul v24.4S, v24.4S,v23.s[1] +sub v11.4s, v0.4s, v28.4s +add v0.4s, v0.4s, v28.4s +sqrdmulh v28.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v20.4s, v15.4s, v17.4s +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v22.s[3] +mla v10.4S, v25.4S, v31.s[0] +nop +nop +sqrdmulh v25.4S, v13.4S, v22.s[2] +mla v16.4S, v18.4S, v31.s[0] +nop +nop +sqrdmulh v18.4S, v12.4S, v22.s[1] +mla v24.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v4.4S, v22.s[0] +mla v19.4S, v28.4S, v31.s[0] +nop +nop +mul v13.4S, v13.4S,v23.s[2] +mul v5.4S, v5.4S,v23.s[3] +sub v28.4s, v21.4s, v10.4s +add v21.4s, v21.4s, v10.4s +mla v13.4S, v25.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +sub v17.4s, v8.4s, v16.4s +add v8.4s, v8.4s, v16.4s +mul v4.4S, v4.4S,v23.s[0] +mul v12.4S, v12.4S,v23.s[1] +sub v16.4s, v11.4s, v24.4s +add v11.4s, v11.4s, v24.4s +mla v4.4S, v3.4S, v31.s[0] +mla v12.4S, v18.4S, v31.s[0] +sub v18.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v28.4S, v9.s[3] +mul v28.4S, v28.4S,v1.s[3] +sub v3.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v21.4S, v9.s[2] +mul v21.4S, v21.4S,v1.s[2] +sub v24.4s, v26.4s, v13.4s +add v26.4s, v26.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v9.s[1] +mul v17.4S, v17.4S,v1.s[1] +sub v25.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +sqrdmulh v12.4S, v8.4S, v9.s[0] +mul v8.4S, v8.4S,v1.s[0] +sub v10.4s, v15.4s, v4.4s +add v15.4s, v15.4s, v4.4s +sqrdmulh v4.4S, v16.4S, v7.s[3] +mla v28.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v11.4S, v7.s[2] +mla v21.4S, v5.4S, 
v31.s[0] +nop +nop +sqrdmulh v5.4S, v18.4S, v7.s[1] +mla v17.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v0.4S, v7.s[0] +mla v8.4S, v12.4S, v31.s[0] +nop +nop +mul v11.4S, v11.4S,v2.s[2] +mul v16.4S, v16.4S,v2.s[3] +sub v12.4s, v3.4s, v28.4s +str q12, [x0, #960] +mla v11.4S, v19.4S, v31.s[0] +mla v16.4S, v4.4S, v31.s[0] +add v3.4s, v3.4s, v28.4s +str q3, [x0, #896] +mul v0.4S, v0.4S,v2.s[0] +mul v18.4S, v18.4S,v2.s[1] +sub v3.4s, v6.4s, v21.4s +str q3, [x0, #832] +mla v0.4S, v13.4S, v31.s[0] +mla v18.4S, v5.4S, v31.s[0] +add v6.4s, v6.4s, v21.4s +sub v21.4s, v24.4s, v17.4s +ldr q5, [x0, #976] +sqrdmulh v13.4S, v5.4S, v29.s[0] +mul v5.4S, v5.4S,v30.s[0] +add v24.4s, v24.4s, v17.4s +str q6, [x0, #768] +ldr q6, [x0, #912] +sqrdmulh v17.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v3.4s, v26.4s, v8.4s +str q21, [x0, #704] +ldr q21, [x0, #848] +sqrdmulh v28.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +add v26.4s, v26.4s, v8.4s +str q24, [x0, #640] +ldr q24, [x0, #784] +sqrdmulh v8.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +sub v4.4s, v25.4s, v16.4s +str q3, [x0, #576] +ldr q3, [x0, #720] +sqrdmulh v19.4S, v3.4S, v29.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v25.4s, v25.4s, v16.4s +str q26, [x0, #512] +ldr q26, [x0, #656] +sqrdmulh v16.4S, v26.4S, v29.s[0] +mla v6.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v11.4s +str q4, [x0, #448] +ldr q4, [x0, #592] +sqrdmulh v13.4S, v4.4S, v29.s[0] +mla v21.4S, v28.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q25, [x0, #384] +ldr q25, [x0, #528] +sqrdmulh v11.4S, v25.4S, v29.s[0] +mla v24.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v18.4s +str q17, [x0, #320] +ldr q17, [x0, #464] +add v10.4s, v10.4s, v18.4s +mul v26.4S, v26.4S,v30.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q18, [x0, #400] +str q20, [x0, #256] +ldr q20, [x0, #336] +ldr q28, [x0, #272] +mla v26.4S, v16.4S, v31.s[0] +mla v3.4S, v19.4S, v31.s[0] +str q8, [x0, #192] +sub v8.4s, v15.4s, v0.4s +ldr q19, [x0, #208] +ldr q16, [x0, #144] +mul v25.4S, 
v25.4S,v30.s[0] +mul v4.4S, v4.4S,v30.s[0] +str q10, [x0, #128] +add v15.4s, v15.4s, v0.4s +ldr q0, [x0, #80] +ldr q10, [x0, #16] +mla v25.4S, v11.4S, v31.s[0] +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v5.4s +add v17.4s, v17.4s, v5.4s +sqrdmulh v5.4S, v13.4S, v29.s[2] +mul v13.4S, v13.4S,v30.s[2] +sub v11.4s, v18.4s, v6.4s +add v18.4s, v18.4s, v6.4s +sqrdmulh v6.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v12.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v28.4s, v24.4s +add v28.4s, v28.4s, v24.4s +sqrdmulh v24.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v27.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sqrdmulh v3.4S, v12.4S, v29.s[2] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v16.4s, v26.4s +add v16.4s, v16.4s, v26.4s +sqrdmulh v26.4S, v14.4S, v29.s[2] +mla v11.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v4.4s +add v0.4s, v0.4s, v4.4s +sqrdmulh v4.4S, v20.4S, v29.s[1] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v25.4s +str q8, [x0, #64] +sqrdmulh v8.4S, v28.4S, v29.s[1] +mla v18.4S, v24.4S, v31.s[0] +add v10.4s, v10.4s, v25.4s +str q15, [x0, #0] +mul v14.4S, v14.4S,v30.s[2] +mul v12.4S, v12.4S,v30.s[2] +sub v15.4s, v27.4s, v13.4s +add v27.4s, v27.4s, v13.4s +mla v14.4S, v26.4S, v31.s[0] +mla v12.4S, v3.4S, v31.s[0] +sub v3.4s, v5.4s, v11.4s +add v5.4s, v5.4s, v11.4s +mul v28.4S, v28.4S,v30.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v11.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +mla v28.4S, v8.4S, v31.s[0] +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v16.4s, v18.4s +add v16.4s, v16.4s, v18.4s +sqrdmulh v29.4S, v15.4S, v22.s[3] +mul v15.4S, v15.4S,v23.s[3] +sub v30.4s, v6.4s, v12.4s +add v6.4s, v6.4s, v12.4s +sqrdmulh v12.4S, v27.4S, v22.s[2] +mul v27.4S, v27.4S,v23.s[2] +sub v18.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v22.s[1] +mul v11.4S, v11.4S,v23.s[1] +sub v8.4s, v0.4s, v20.4s +add v0.4s, v0.4s, v20.4s +sqrdmulh v20.4S, 
v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v17.4s, v10.4s, v28.4s +add v10.4s, v10.4s, v28.4s +sqrdmulh v28.4S, v3.4S, v22.s[3] +mla v15.4S, v29.4S, v31.s[0] +nop +nop +sqrdmulh v29.4S, v5.4S, v22.s[2] +mla v27.4S, v12.4S, v31.s[0] +nop +nop +sqrdmulh v12.4S, v4.4S, v22.s[1] +mla v11.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v16.4S, v22.s[0] +mla v19.4S, v20.4S, v31.s[0] +nop +nop +mul v5.4S, v5.4S,v23.s[2] +mul v3.4S, v3.4S,v23.s[3] +sub v20.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +mla v5.4S, v29.4S, v31.s[0] +mla v3.4S, v28.4S, v31.s[0] +sub v28.4s, v6.4s, v27.4s +add v6.4s, v6.4s, v27.4s +mul v16.4S, v16.4S,v23.s[0] +mul v4.4S, v4.4S,v23.s[1] +sub v27.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v16.4S, v14.4S, v31.s[0] +mla v4.4S, v12.4S, v31.s[0] +sub v12.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v22.4S, v20.4S, v9.s[3] +mul v20.4S, v20.4S,v1.s[3] +sub v23.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v30.4S, v9.s[2] +mul v30.4S, v30.4S,v1.s[2] +sub v19.4s, v21.4s, v5.4s +add v21.4s, v21.4s, v5.4s +sqrdmulh v5.4S, v28.4S, v9.s[1] +mul v28.4S, v28.4S,v1.s[1] +sub v14.4s, v17.4s, v4.4s +add v17.4s, v17.4s, v4.4s +sqrdmulh v4.4S, v6.4S, v9.s[0] +mul v6.4S, v6.4S,v1.s[0] +sub v11.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sqrdmulh v9.4S, v27.4S, v7.s[3] +mla v20.4S, v22.4S, v31.s[0] +nop +nop +sqrdmulh v22.4S, v8.4S, v7.s[2] +mla v30.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v12.4S, v7.s[1] +mla v28.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v0.4S, v7.s[0] +mla v6.4S, v4.4S, v31.s[0] +nop +nop +mul v8.4S, v8.4S,v2.s[2] +mul v27.4S, v27.4S,v2.s[3] +sub v4.4s, v23.4s, v20.4s +str q4, [x0, #976] +mla v8.4S, v22.4S, v31.s[0] +mla v27.4S, v9.4S, v31.s[0] +add v23.4s, v23.4s, v20.4s +str q23, [x0, #912] +mul v0.4S, v0.4S,v2.s[0] +mul v12.4S, v12.4S,v2.s[1] +sub v23.4s, v18.4s, v30.4s +str q23, [x0, #848] +mla v0.4S, v5.4S, v31.s[0] +mla v12.4S, v3.4S, v31.s[0] +add v18.4s, v18.4s, v30.4s +sub v30.4s, 
v19.4s, v28.4s +add v19.4s, v19.4s, v28.4s +str q18, [x0, #784] +sub v18.4s, v21.4s, v6.4s +str q30, [x0, #720] +add v21.4s, v21.4s, v6.4s +str q19, [x0, #656] +sub v19.4s, v14.4s, v27.4s +str q18, [x0, #592] +add v14.4s, v14.4s, v27.4s +str q21, [x0, #528] +sub v21.4s, v17.4s, v8.4s +str q19, [x0, #464] +add v17.4s, v17.4s, v8.4s +str q14, [x0, #400] +sub v14.4s, v11.4s, v12.4s +str q21, [x0, #336] +add v11.4s, v11.4s, v12.4s +str q17, [x0, #272] +sub v17.4s, v10.4s, v0.4s +add v10.4s, v10.4s, v0.4s +ldr q24, [x0, #224] +ldr q25, [x0, #160] +ldr q13, [x0, #32] +ldr q26, [x17, #+128] +ldr q15, [x17, #+144] +sqrdmulh v29.4S, v13.4S, v15.s[0] +mul v13.4S, v13.4S,v26.s[0] +ldr q16, [x0, #48] +ldr q1, [x17, #+160] +sqrdmulh v4.4S, v16.4S, v15.s[0] +mul v16.4S, v16.4S,v26.s[0] +ldr q22, [x17, #+176] +ldr q9, [x0, #96] +sqrdmulh v20.4S, v9.4S, v22.s[0] +mul v9.4S, v9.4S,v1.s[0] +ldr q23, [x0, #112] +sqrdmulh v5.4S, v23.4S, v22.s[0] +mul v23.4S, v23.4S,v1.s[0] +ldr q3, [x17, #+192] +ldr q2, [x17, #+208] +mla v13.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v25.4S, v2.s[0] +ldr q7, [x0, #176] +mla v16.4S, v4.4S, v31.s[0] +sqrdmulh v4.4S, v7.4S, v2.s[0] +ldr q28, [x17, #+224] +ldr q30, [x17, #+240] +mla v9.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v24.4S, v30.s[0] +ldr q6, [x0, #240] +mla v23.4S, v5.4S, v31.s[0] +sqrdmulh v5.4S, v6.4S, v30.s[0] +ldr q18, [x0, #0] +ldr q27, [x0, #128] +mul v25.4S, v25.4S,v3.s[0] +mul v7.4S, v7.4S,v3.s[0] +mla v25.4S, v29.4S, v31.s[0] +mla v7.4S, v4.4S, v31.s[0] +sub v4.4s, v18.4s, v13.4s +ldr q29, [x0, #64] +add v18.4s, v18.4s, v13.4s +ldr q13, [x0, #192] +mul v24.4S, v24.4S,v28.s[0] +mul v6.4S, v6.4S,v28.s[0] +sub v19.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +mla v24.4S, v20.4S, v31.s[0] +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v29.4s, v9.4s +add v29.4s, v29.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v15.s[1] +mul v10.4S, v10.4S,v26.s[1] +sub v20.4s, v17.4s, v23.4s +add v17.4s, v17.4s, v23.4s +sqrdmulh v23.4S, v19.4S, v15.s[2] +mul v19.4S, 
v19.4S,v26.s[2] +sub v16.4s, v27.4s, v25.4s +add v27.4s, v27.4s, v25.4s +sqrdmulh v15.4S, v17.4S, v22.s[1] +mul v17.4S, v17.4S,v1.s[1] +sub v25.4s, v11.4s, v7.4s +add v11.4s, v11.4s, v7.4s +sqrdmulh v7.4S, v20.4S, v22.s[2] +mul v20.4S, v20.4S,v1.s[2] +sub v26.4s, v13.4s, v24.4s +add v13.4s, v13.4s, v24.4s +mla v10.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v11.4S, v2.s[1] +sub v22.4s, v14.4s, v6.4s +ldr q24, [x0, #480] +add v14.4s, v14.4s, v6.4s +mla v19.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v25.4S, v2.s[2] +sub v6.4s, v18.4s, v10.4s +ldr q1, [x0, #416] +str q6, [x0, #16] +mla v17.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v14.4S, v30.s[1] +add v18.4s, v18.4s, v10.4s +ldr q10, [x0, #288] +str q18, [x0, #0] +mla v20.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v22.4S, v30.s[2] +sub v18.4s, v4.4s, v19.4s +ldr q6, [x17, #+256] +str q18, [x0, #48] +mul v11.4S, v11.4S,v3.s[1] +mul v25.4S, v25.4S,v3.s[2] +add v4.4s, v4.4s, v19.4s +str q4, [x0, #32] +ldr q4, [x17, #+272] +mla v11.4S, v9.4S, v31.s[0] +mla v25.4S, v23.4S, v31.s[0] +sub v23.4s, v29.4s, v17.4s +str q23, [x0, #80] +mul v14.4S, v14.4S,v28.s[1] +mul v22.4S, v22.4S,v28.s[2] +add v29.4s, v29.4s, v17.4s +str q29, [x0, #64] +mla v14.4S, v15.4S, v31.s[0] +mla v22.4S, v7.4S, v31.s[0] +sub v7.4s, v5.4s, v20.4s +str q7, [x0, #112] +sqrdmulh v30.4S, v10.4S, v4.s[0] +mul v10.4S, v10.4S,v6.s[0] +add v5.4s, v5.4s, v20.4s +ldr q20, [x0, #304] +str q5, [x0, #96] +ldr q5, [x17, #+288] +sqrdmulh v7.4S, v20.4S, v4.s[0] +mul v20.4S, v20.4S,v6.s[0] +sub v28.4s, v27.4s, v11.4s +ldr q15, [x17, #+304] +str q28, [x0, #144] +ldr q28, [x0, #352] +sqrdmulh v29.4S, v28.4S, v15.s[0] +mul v28.4S, v28.4S,v5.s[0] +add v27.4s, v27.4s, v11.4s +str q27, [x0, #128] +ldr q27, [x0, #368] +sqrdmulh v11.4S, v27.4S, v15.s[0] +mul v27.4S, v27.4S,v5.s[0] +sub v17.4s, v16.4s, v25.4s +ldr q2, [x17, #+320] +str q17, [x0, #176] +ldr q17, [x17, #+336] +mla v10.4S, v30.4S, v31.s[0] +sqrdmulh v30.4S, v1.4S, v17.s[0] +add v16.4s, v16.4s, v25.4s +ldr q25, [x0, #432] +str q16, 
[x0, #160] +mla v20.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v25.4S, v17.s[0] +sub v16.4s, v13.4s, v14.4s +ldr q23, [x17, #+352] +str q16, [x0, #208] +ldr q16, [x17, #+368] +mla v28.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v24.4S, v16.s[0] +add v13.4s, v13.4s, v14.4s +str q13, [x0, #192] +ldr q13, [x0, #496] +mla v27.4S, v11.4S, v31.s[0] +sqrdmulh v11.4S, v13.4S, v16.s[0] +sub v14.4s, v26.4s, v22.4s +ldr q3, [x0, #256] +str q14, [x0, #240] +ldr q14, [x0, #384] +mul v1.4S, v1.4S,v2.s[0] +mul v25.4S, v25.4S,v2.s[0] +add v26.4s, v26.4s, v22.4s +ldr q22, [x0, #272] +str q26, [x0, #224] +ldr q26, [x0, #400] +mla v1.4S, v30.4S, v31.s[0] +mla v25.4S, v7.4S, v31.s[0] +sub v7.4s, v3.4s, v10.4s +ldr q30, [x0, #320] +add v3.4s, v3.4s, v10.4s +ldr q10, [x0, #448] +mul v24.4S, v24.4S,v23.s[0] +mul v13.4S, v13.4S,v23.s[0] +sub v9.4s, v22.4s, v20.4s +ldr q19, [x0, #336] +add v22.4s, v22.4s, v20.4s +ldr q20, [x0, #464] +mla v24.4S, v29.4S, v31.s[0] +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v30.4s, v28.4s +add v30.4s, v30.4s, v28.4s +sqrdmulh v28.4S, v22.4S, v4.s[1] +mul v22.4S, v22.4S,v6.s[1] +sub v29.4s, v19.4s, v27.4s +add v19.4s, v19.4s, v27.4s +sqrdmulh v27.4S, v9.4S, v4.s[2] +mul v9.4S, v9.4S,v6.s[2] +sub v18.4s, v14.4s, v1.4s +add v14.4s, v14.4s, v1.4s +sqrdmulh v4.4S, v19.4S, v15.s[1] +mul v19.4S, v19.4S,v5.s[1] +sub v1.4s, v26.4s, v25.4s +add v26.4s, v26.4s, v25.4s +sqrdmulh v25.4S, v29.4S, v15.s[2] +mul v29.4S, v29.4S,v5.s[2] +sub v6.4s, v10.4s, v24.4s +add v10.4s, v10.4s, v24.4s +mla v22.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v26.4S, v17.s[1] +sub v15.4s, v20.4s, v13.4s +ldr q24, [x0, #736] +add v20.4s, v20.4s, v13.4s +mla v9.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v1.4S, v17.s[2] +sub v13.4s, v3.4s, v22.4s +ldr q5, [x0, #672] +str q13, [x0, #272] +mla v19.4S, v4.4S, v31.s[0] +sqrdmulh v4.4S, v20.4S, v16.s[1] +add v3.4s, v3.4s, v22.4s +ldr q22, [x0, #544] +str q3, [x0, #256] +mla v29.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v15.4S, v16.s[2] +sub v3.4s, v7.4s, v9.4s +ldr q13, 
[x17, #+384] +str q3, [x0, #304] +mul v26.4S, v26.4S,v2.s[1] +mul v1.4S, v1.4S,v2.s[2] +add v7.4s, v7.4s, v9.4s +str q7, [x0, #288] +ldr q7, [x17, #+400] +mla v26.4S, v28.4S, v31.s[0] +mla v1.4S, v27.4S, v31.s[0] +sub v27.4s, v30.4s, v19.4s +str q27, [x0, #336] +mul v20.4S, v20.4S,v23.s[1] +mul v15.4S, v15.4S,v23.s[2] +add v30.4s, v30.4s, v19.4s +str q30, [x0, #320] +mla v20.4S, v4.4S, v31.s[0] +mla v15.4S, v25.4S, v31.s[0] +sub v25.4s, v11.4s, v29.4s +str q25, [x0, #368] +sqrdmulh v16.4S, v22.4S, v7.s[0] +mul v22.4S, v22.4S,v13.s[0] +add v11.4s, v11.4s, v29.4s +ldr q29, [x0, #560] +str q11, [x0, #352] +ldr q11, [x17, #+416] +sqrdmulh v25.4S, v29.4S, v7.s[0] +mul v29.4S, v29.4S,v13.s[0] +sub v23.4s, v14.4s, v26.4s +ldr q4, [x17, #+432] +str q23, [x0, #400] +ldr q23, [x0, #608] +sqrdmulh v30.4S, v23.4S, v4.s[0] +mul v23.4S, v23.4S,v11.s[0] +add v14.4s, v14.4s, v26.4s +str q14, [x0, #384] +ldr q14, [x0, #624] +sqrdmulh v26.4S, v14.4S, v4.s[0] +mul v14.4S, v14.4S,v11.s[0] +sub v19.4s, v18.4s, v1.4s +ldr q17, [x17, #+448] +str q19, [x0, #432] +ldr q19, [x17, #+464] +mla v22.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v5.4S, v19.s[0] +add v18.4s, v18.4s, v1.4s +ldr q1, [x0, #688] +str q18, [x0, #416] +mla v29.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v1.4S, v19.s[0] +sub v18.4s, v10.4s, v20.4s +ldr q27, [x17, #+480] +str q18, [x0, #464] +ldr q18, [x17, #+496] +mla v23.4S, v30.4S, v31.s[0] +sqrdmulh v30.4S, v24.4S, v18.s[0] +add v10.4s, v10.4s, v20.4s +str q10, [x0, #448] +ldr q10, [x0, #752] +mla v14.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v10.4S, v18.s[0] +sub v20.4s, v6.4s, v15.4s +ldr q2, [x0, #512] +str q20, [x0, #496] +ldr q20, [x0, #640] +mul v5.4S, v5.4S,v17.s[0] +mul v1.4S, v1.4S,v17.s[0] +add v6.4s, v6.4s, v15.4s +ldr q15, [x0, #528] +str q6, [x0, #480] +ldr q6, [x0, #656] +mla v5.4S, v16.4S, v31.s[0] +mla v1.4S, v25.4S, v31.s[0] +sub v25.4s, v2.4s, v22.4s +ldr q16, [x0, #576] +add v2.4s, v2.4s, v22.4s +ldr q22, [x0, #704] +mul v24.4S, v24.4S,v27.s[0] +mul v10.4S, 
v10.4S,v27.s[0] +sub v28.4s, v15.4s, v29.4s +ldr q9, [x0, #592] +add v15.4s, v15.4s, v29.4s +ldr q29, [x0, #720] +mla v24.4S, v30.4S, v31.s[0] +mla v10.4S, v26.4S, v31.s[0] +sub v26.4s, v16.4s, v23.4s +add v16.4s, v16.4s, v23.4s +sqrdmulh v23.4S, v15.4S, v7.s[1] +mul v15.4S, v15.4S,v13.s[1] +sub v30.4s, v9.4s, v14.4s +add v9.4s, v9.4s, v14.4s +sqrdmulh v14.4S, v28.4S, v7.s[2] +mul v28.4S, v28.4S,v13.s[2] +sub v3.4s, v20.4s, v5.4s +add v20.4s, v20.4s, v5.4s +sqrdmulh v7.4S, v9.4S, v4.s[1] +mul v9.4S, v9.4S,v11.s[1] +sub v5.4s, v6.4s, v1.4s +add v6.4s, v6.4s, v1.4s +sqrdmulh v1.4S, v30.4S, v4.s[2] +mul v30.4S, v30.4S,v11.s[2] +sub v13.4s, v22.4s, v24.4s +add v22.4s, v22.4s, v24.4s +mla v15.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v6.4S, v19.s[1] +sub v4.4s, v29.4s, v10.4s +ldr q24, [x0, #992] +add v29.4s, v29.4s, v10.4s +mla v28.4S, v14.4S, v31.s[0] +sqrdmulh v14.4S, v5.4S, v19.s[2] +sub v10.4s, v2.4s, v15.4s +ldr q11, [x0, #928] +str q10, [x0, #528] +mla v9.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v29.4S, v18.s[1] +add v2.4s, v2.4s, v15.4s +ldr q15, [x0, #800] +str q2, [x0, #512] +mla v30.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v4.4S, v18.s[2] +sub v2.4s, v25.4s, v28.4s +ldr q10, [x17, #+512] +str q2, [x0, #560] +mul v6.4S, v6.4S,v17.s[1] +mul v5.4S, v5.4S,v17.s[2] +add v25.4s, v25.4s, v28.4s +str q25, [x0, #544] +ldr q25, [x17, #+528] +mla v6.4S, v23.4S, v31.s[0] +mla v5.4S, v14.4S, v31.s[0] +sub v14.4s, v16.4s, v9.4s +str q14, [x0, #592] +mul v29.4S, v29.4S,v27.s[1] +mul v4.4S, v4.4S,v27.s[2] +add v16.4s, v16.4s, v9.4s +str q16, [x0, #576] +mla v29.4S, v7.4S, v31.s[0] +mla v4.4S, v1.4S, v31.s[0] +sub v1.4s, v26.4s, v30.4s +str q1, [x0, #624] +sqrdmulh v18.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v10.s[0] +add v26.4s, v26.4s, v30.4s +ldr q30, [x0, #816] +str q26, [x0, #608] +ldr q26, [x17, #+544] +sqrdmulh v1.4S, v30.4S, v25.s[0] +mul v30.4S, v30.4S,v10.s[0] +sub v27.4s, v20.4s, v6.4s +ldr q7, [x17, #+560] +str q27, [x0, #656] +ldr q27, [x0, #864] +sqrdmulh v16.4S, v27.4S, 
v7.s[0] +mul v27.4S, v27.4S,v26.s[0] +add v20.4s, v20.4s, v6.4s +str q20, [x0, #640] +ldr q20, [x0, #880] +sqrdmulh v6.4S, v20.4S, v7.s[0] +mul v20.4S, v20.4S,v26.s[0] +sub v9.4s, v3.4s, v5.4s +ldr q19, [x17, #+576] +str q9, [x0, #688] +ldr q9, [x17, #+592] +mla v15.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v11.4S, v9.s[0] +add v3.4s, v3.4s, v5.4s +ldr q5, [x0, #944] +str q3, [x0, #672] +mla v30.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v5.4S, v9.s[0] +sub v3.4s, v22.4s, v29.4s +ldr q14, [x17, #+608] +str q3, [x0, #720] +ldr q3, [x17, #+624] +mla v27.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v24.4S, v3.s[0] +add v22.4s, v22.4s, v29.4s +str q22, [x0, #704] +ldr q22, [x0, #1008] +mla v20.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v22.4S, v3.s[0] +sub v29.4s, v13.4s, v4.4s +ldr q17, [x0, #768] +str q29, [x0, #752] +ldr q29, [x0, #896] +mul v11.4S, v11.4S,v19.s[0] +mul v5.4S, v5.4S,v19.s[0] +add v13.4s, v13.4s, v4.4s +ldr q4, [x0, #784] +str q13, [x0, #736] +ldr q13, [x0, #912] +mla v11.4S, v18.4S, v31.s[0] +mla v5.4S, v1.4S, v31.s[0] +sub v1.4s, v17.4s, v15.4s +ldr q18, [x0, #832] +add v17.4s, v17.4s, v15.4s +ldr q15, [x0, #960] +mul v24.4S, v24.4S,v14.s[0] +mul v22.4S, v22.4S,v14.s[0] +sub v23.4s, v4.4s, v30.4s +ldr q28, [x0, #848] +add v4.4s, v4.4s, v30.4s +ldr q30, [x0, #976] +mla v24.4S, v16.4S, v31.s[0] +mla v22.4S, v6.4S, v31.s[0] +sub v6.4s, v18.4s, v27.4s +add v18.4s, v18.4s, v27.4s +sqrdmulh v27.4S, v4.4S, v25.s[1] +mul v4.4S, v4.4S,v10.s[1] +sub v16.4s, v28.4s, v20.4s +add v28.4s, v28.4s, v20.4s +sqrdmulh v20.4S, v23.4S, v25.s[2] +mul v23.4S, v23.4S,v10.s[2] +sub v2.4s, v29.4s, v11.4s +add v29.4s, v29.4s, v11.4s +sqrdmulh v25.4S, v28.4S, v7.s[1] +mul v28.4S, v28.4S,v26.s[1] +sub v11.4s, v13.4s, v5.4s +add v13.4s, v13.4s, v5.4s +sqrdmulh v5.4S, v16.4S, v7.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v10.4s, v15.4s, v24.4s +add v15.4s, v15.4s, v24.4s +mla v4.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v13.4S, v9.s[1] +sub v7.4s, v30.4s, v22.4s +add v30.4s, v30.4s, v22.4s +mla v23.4S, 
v20.4S, v31.s[0] +sqrdmulh v20.4S, v11.4S, v9.s[2] +sub v22.4s, v17.4s, v4.4s +str q22, [x0, #784] +mla v28.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v30.4S, v3.s[1] +add v17.4s, v17.4s, v4.4s +str q17, [x0, #768] +mla v16.4S, v5.4S, v31.s[0] +sqrdmulh v5.4S, v7.4S, v3.s[2] +sub v17.4s, v1.4s, v23.4s +str q17, [x0, #816] +mul v13.4S, v13.4S,v19.s[1] +mul v11.4S, v11.4S,v19.s[2] +add v1.4s, v1.4s, v23.4s +str q1, [x0, #800] +mla v13.4S, v27.4S, v31.s[0] +mla v11.4S, v20.4S, v31.s[0] +sub v20.4s, v18.4s, v28.4s +str q20, [x0, #848] +mul v30.4S, v30.4S,v14.s[1] +mul v7.4S, v7.4S,v14.s[2] +add v18.4s, v18.4s, v28.4s +str q18, [x0, #832] +mla v30.4S, v25.4S, v31.s[0] +mla v7.4S, v5.4S, v31.s[0] +sub v5.4s, v6.4s, v16.4s +str q5, [x0, #880] +add v6.4s, v6.4s, v16.4s +str q6, [x0, #864] +sub v6.4s, v29.4s, v13.4s +str q6, [x0, #912] +add v29.4s, v29.4s, v13.4s +str q29, [x0, #896] +sub v29.4s, v2.4s, v11.4s +str q29, [x0, #944] +add v2.4s, v2.4s, v11.4s +str q2, [x0, #928] +sub v2.4s, v15.4s, v30.4s +str q2, [x0, #976] +add v15.4s, v15.4s, v30.4s +str q15, [x0, #960] +sub v15.4s, v10.4s, v7.4s +str q15, [x0, #1008] +add v10.4s, v10.4s, v7.4s +str q10, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1520 +// Instruction count: 1516 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_11.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_11.s new file mode 100644 index 0000000..ba9add4 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_11.s @@ -0,0 +1,1550 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// 
SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global 
ntt_u32_incomplete_neon_asm_var_4_2_22_z4_11 +.global _ntt_u32_incomplete_neon_asm_var_4_2_22_z4_11 +ntt_u32_incomplete_neon_asm_var_4_2_22_z4_11: +_ntt_u32_incomplete_neon_asm_var_4_2_22_z4_11: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x0, #992] +sqrdmulh v27.4S, v28.4S, v29.s[0] +mul v28.4S, v28.4S,v30.s[0] +ldr q26, [x0, #928] +sqrdmulh v25.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +ldr q24, [x0, #864] +sqrdmulh v23.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +ldr q22, [x0, #800] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #736] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mla v28.4S, v27.4S, v31.s[0] +ldr q27, [x0, #672] +sqrdmulh v18.4S, v27.4S, v29.s[0] +mla v26.4S, v25.4S, v31.s[0] +ldr q25, [x0, #608] +sqrdmulh v17.4S, v25.4S, v29.s[0] +mla v24.4S, v23.4S, v31.s[0] +ldr q23, [x0, #544] +sqrdmulh v16.4S, v23.4S, v29.s[0] +mla v22.4S, v21.4S, v31.s[0] +ldr q21, [x0, #480] +mul v27.4S, v27.4S,v30.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q3, [x0, #416] +ldr q2, [x0, #352] +ldr q1, [x0, #288] +mla v27.4S, v18.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +ldr q19, [x0, #224] +ldr q18, [x0, #160] +mul v23.4S, v23.4S,v30.s[0] +mul v25.4S, v25.4S,v30.s[0] +ldr q0, [x0, #96] +ldr q15, [x0, #32] +mla v23.4S, v16.4S, v31.s[0] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +sqrdmulh v28.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v16.4s, v3.4s, v26.4s +add v3.4s, v3.4s, v26.4s +sqrdmulh v26.4S, v16.4S, v29.s[2] +mul v16.4S, 
v16.4S,v30.s[2] +sub v14.4s, v2.4s, v24.4s +add v2.4s, v2.4s, v24.4s +sqrdmulh v24.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v13.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v12.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +sqrdmulh v20.4S, v14.4S, v29.s[2] +mla v17.4S, v28.4S, v31.s[0] +sub v28.4s, v18.4s, v27.4s +add v18.4s, v18.4s, v27.4s +sqrdmulh v27.4S, v13.4S, v29.s[2] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v0.4s, v25.4s +add v0.4s, v0.4s, v25.4s +sqrdmulh v25.4S, v2.4S, v29.s[1] +mla v21.4S, v24.4S, v31.s[0] +sub v24.4s, v15.4s, v23.4s +sqrdmulh v11.4S, v1.4S, v29.s[1] +mla v3.4S, v22.4S, v31.s[0] +add v15.4s, v15.4s, v23.4s +ldr q23, [x17, #+32] +ldr q22, [x17, #+48] +mul v13.4S, v13.4S,v30.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v10.4s, v12.4s, v17.4s +add v12.4s, v12.4s, v17.4s +mla v13.4S, v27.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +sub v20.4s, v28.4s, v16.4s +add v28.4s, v28.4s, v16.4s +mul v1.4S, v1.4S,v30.s[1] +mul v2.4S, v2.4S,v30.s[1] +sub v16.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +mla v1.4S, v11.4S, v31.s[0] +mla v2.4S, v25.4S, v31.s[0] +sub v25.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v22.s[3] +mul v10.4S, v10.4S,v23.s[3] +sub v11.4s, v26.4s, v14.4s +add v26.4s, v26.4s, v14.4s +sqrdmulh v14.4S, v12.4S, v22.s[2] +mul v12.4S, v12.4S,v23.s[2] +sub v21.4s, v24.4s, v13.4s +add v24.4s, v24.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v22.s[1] +mul v16.4S, v16.4S,v23.s[1] +sub v27.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v17.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s +ldr q1, [x17, #+96] +ldr q9, [x17, #+112] +sqrdmulh v8.4S, v20.4S, v22.s[3] +mla v10.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v28.4S, v22.s[2] +mla v12.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v25.4S, v22.s[1] +mla v16.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v18.4S, v22.s[0] +mla 
v19.4S, v2.4S, v31.s[0] +nop +nop +ldr q2, [x17, #+64] +ldr q7, [x17, #+80] +mul v28.4S, v28.4S,v23.s[2] +mul v20.4S, v20.4S,v23.s[3] +sub v6.4s, v11.4s, v10.4s +add v11.4s, v11.4s, v10.4s +mla v28.4S, v3.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v26.4s, v12.4s +add v26.4s, v26.4s, v12.4s +mul v18.4S, v18.4S,v23.s[0] +mul v25.4S, v25.4S,v23.s[1] +sub v12.4s, v27.4s, v16.4s +add v27.4s, v27.4s, v16.4s +mla v18.4S, v13.4S, v31.s[0] +mla v25.4S, v14.4S, v31.s[0] +sub v14.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v9.s[3] +mul v6.4S, v6.4S,v1.s[3] +sub v13.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v11.4S, v9.s[2] +mul v11.4S, v11.4S,v1.s[2] +sub v16.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +sqrdmulh v28.4S, v8.4S, v9.s[1] +mul v8.4S, v8.4S,v1.s[1] +sub v3.4s, v17.4s, v25.4s +add v17.4s, v17.4s, v25.4s +sqrdmulh v25.4S, v26.4S, v9.s[0] +mul v26.4S, v26.4S,v1.s[0] +sub v10.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v7.s[3] +mla v6.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v27.4S, v7.s[2] +mla v11.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v14.4S, v7.s[1] +mla v8.4S, v28.4S, v31.s[0] +nop +nop +sqrdmulh v28.4S, v0.4S, v7.s[0] +mla v26.4S, v25.4S, v31.s[0] +nop +nop +mul v27.4S, v27.4S,v2.s[2] +mul v12.4S, v12.4S,v2.s[3] +sub v25.4s, v13.4s, v6.4s +str q25, [x0, #992] +mla v27.4S, v19.4S, v31.s[0] +mla v12.4S, v18.4S, v31.s[0] +add v13.4s, v13.4s, v6.4s +str q13, [x0, #928] +mul v0.4S, v0.4S,v2.s[0] +mul v14.4S, v14.4S,v2.s[1] +sub v13.4s, v21.4s, v11.4s +str q13, [x0, #864] +mla v0.4S, v28.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sub v11.4s, v16.4s, v8.4s +ldr q20, [x0, #1008] +sqrdmulh v28.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v8.4s +str q21, [x0, #800] +ldr q21, [x0, #944] +sqrdmulh v8.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v13.4s, v24.4s, v26.4s +str q11, [x0, #736] +ldr q11, [x0, 
#880] +sqrdmulh v6.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +add v24.4s, v24.4s, v26.4s +str q16, [x0, #672] +ldr q16, [x0, #816] +sqrdmulh v26.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v18.4s, v3.4s, v12.4s +str q13, [x0, #608] +ldr q13, [x0, #752] +sqrdmulh v19.4S, v13.4S, v29.s[0] +mla v20.4S, v28.4S, v31.s[0] +add v3.4s, v3.4s, v12.4s +str q24, [x0, #544] +ldr q24, [x0, #688] +sqrdmulh v12.4S, v24.4S, v29.s[0] +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v17.4s, v27.4s +str q18, [x0, #480] +ldr q18, [x0, #624] +sqrdmulh v28.4S, v18.4S, v29.s[0] +mla v11.4S, v6.4S, v31.s[0] +add v17.4s, v17.4s, v27.4s +str q3, [x0, #416] +ldr q3, [x0, #560] +sqrdmulh v27.4S, v3.4S, v29.s[0] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v10.4s, v14.4s +str q8, [x0, #352] +ldr q8, [x0, #496] +add v10.4s, v10.4s, v14.4s +mul v24.4S, v24.4S,v30.s[0] +mul v13.4S, v13.4S,v30.s[0] +ldr q14, [x0, #432] +str q17, [x0, #288] +ldr q17, [x0, #368] +ldr q6, [x0, #304] +mla v24.4S, v12.4S, v31.s[0] +mla v13.4S, v19.4S, v31.s[0] +str q26, [x0, #224] +sub v26.4s, v15.4s, v0.4s +ldr q19, [x0, #240] +ldr q12, [x0, #176] +mul v3.4S, v3.4S,v30.s[0] +mul v18.4S, v18.4S,v30.s[0] +str q10, [x0, #160] +add v15.4s, v15.4s, v0.4s +ldr q0, [x0, #112] +ldr q10, [x0, #48] +mla v3.4S, v27.4S, v31.s[0] +mla v18.4S, v28.4S, v31.s[0] +sub v28.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +sqrdmulh v20.4S, v28.4S, v29.s[2] +mul v28.4S, v28.4S,v30.s[2] +sub v27.4s, v14.4s, v21.4s +add v14.4s, v14.4s, v21.4s +sqrdmulh v21.4S, v27.4S, v29.s[2] +mul v27.4S, v27.4S,v30.s[2] +sub v25.4s, v17.4s, v11.4s +add v17.4s, v17.4s, v11.4s +sqrdmulh v11.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v5.4s, v6.4s, v16.4s +add v6.4s, v6.4s, v16.4s +sqrdmulh v16.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +sub v4.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +sqrdmulh v13.4S, v25.4S, v29.s[2] +mla v28.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v24.4s +add v12.4s, v12.4s, v24.4s +sqrdmulh v24.4S, v5.4S, 
v29.s[2] +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v18.4s +add v0.4s, v0.4s, v18.4s +sqrdmulh v18.4S, v17.4S, v29.s[1] +mla v8.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v3.4s +str q26, [x0, #96] +sqrdmulh v26.4S, v6.4S, v29.s[1] +mla v14.4S, v16.4S, v31.s[0] +add v10.4s, v10.4s, v3.4s +str q15, [x0, #32] +mul v5.4S, v5.4S,v30.s[2] +mul v25.4S, v25.4S,v30.s[2] +sub v15.4s, v4.4s, v28.4s +add v4.4s, v4.4s, v28.4s +mla v5.4S, v24.4S, v31.s[0] +mla v25.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v27.4s +add v20.4s, v20.4s, v27.4s +mul v6.4S, v6.4S,v30.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v27.4s, v19.4s, v8.4s +add v19.4s, v19.4s, v8.4s +mla v6.4S, v26.4S, v31.s[0] +mla v17.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v14.4s +add v12.4s, v12.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v22.s[3] +mul v15.4S, v15.4S,v23.s[3] +sub v26.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v4.4S, v22.s[2] +mul v4.4S, v4.4S,v23.s[2] +sub v8.4s, v11.4s, v5.4s +add v11.4s, v11.4s, v5.4s +sqrdmulh v5.4S, v27.4S, v22.s[1] +mul v27.4S, v27.4S,v23.s[1] +sub v24.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v28.4s, v10.4s, v6.4s +add v10.4s, v10.4s, v6.4s +sqrdmulh v6.4S, v13.4S, v22.s[3] +mla v15.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v20.4S, v22.s[2] +mla v4.4S, v25.4S, v31.s[0] +nop +nop +sqrdmulh v25.4S, v18.4S, v22.s[1] +mla v27.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v12.4S, v22.s[0] +mla v19.4S, v17.4S, v31.s[0] +nop +nop +mul v20.4S, v20.4S,v23.s[2] +mul v13.4S, v13.4S,v23.s[3] +sub v17.4s, v26.4s, v15.4s +add v26.4s, v26.4s, v15.4s +mla v20.4S, v14.4S, v31.s[0] +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v21.4s, v4.4s +add v21.4s, v21.4s, v4.4s +mul v12.4S, v12.4S,v23.s[0] +mul v18.4S, v18.4S,v23.s[1] +sub v4.4s, v24.4s, v27.4s +add v24.4s, v24.4s, v27.4s +mla v12.4S, v5.4S, v31.s[0] +mla v18.4S, v25.4S, v31.s[0] +sub v25.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, 
v17.4S, v9.s[3] +mul v17.4S, v17.4S,v1.s[3] +sub v5.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +sqrdmulh v13.4S, v26.4S, v9.s[2] +mul v26.4S, v26.4S,v1.s[2] +sub v27.4s, v11.4s, v20.4s +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v6.4S, v9.s[1] +mul v6.4S, v6.4S,v1.s[1] +sub v14.4s, v28.4s, v18.4s +add v28.4s, v28.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v9.s[0] +mul v21.4S, v21.4S,v1.s[0] +sub v15.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v4.4S, v7.s[3] +mla v17.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v24.4S, v7.s[2] +mla v26.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v25.4S, v7.s[1] +mla v6.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v0.4S, v7.s[0] +mla v21.4S, v18.4S, v31.s[0] +nop +nop +mul v24.4S, v24.4S,v2.s[2] +mul v4.4S, v4.4S,v2.s[3] +sub v18.4s, v5.4s, v17.4s +str q18, [x0, #1008] +mla v24.4S, v19.4S, v31.s[0] +mla v4.4S, v12.4S, v31.s[0] +add v5.4s, v5.4s, v17.4s +str q5, [x0, #944] +mul v0.4S, v0.4S,v2.s[0] +mul v25.4S, v25.4S,v2.s[1] +sub v5.4s, v8.4s, v26.4s +str q5, [x0, #880] +mla v0.4S, v20.4S, v31.s[0] +mla v25.4S, v13.4S, v31.s[0] +add v8.4s, v8.4s, v26.4s +sub v26.4s, v27.4s, v6.4s +ldr q13, [x0, #960] +sqrdmulh v20.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +add v27.4s, v27.4s, v6.4s +str q8, [x0, #816] +ldr q8, [x0, #896] +sqrdmulh v6.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v5.4s, v11.4s, v21.4s +str q26, [x0, #752] +ldr q26, [x0, #832] +sqrdmulh v17.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +add v11.4s, v11.4s, v21.4s +str q27, [x0, #688] +ldr q27, [x0, #768] +sqrdmulh v21.4S, v27.4S, v29.s[0] +mul v27.4S, v27.4S,v30.s[0] +sub v12.4s, v14.4s, v4.4s +str q5, [x0, #624] +ldr q5, [x0, #704] +sqrdmulh v19.4S, v5.4S, v29.s[0] +mla v13.4S, v20.4S, v31.s[0] +add v14.4s, v14.4s, v4.4s +str q11, [x0, #560] +ldr q11, [x0, #640] +sqrdmulh v4.4S, v11.4S, v29.s[0] +mla v8.4S, v6.4S, v31.s[0] +sub v6.4s, v28.4s, v24.4s +str q12, [x0, #496] +ldr q12, [x0, #576] +sqrdmulh v20.4S, v12.4S, 
v29.s[0] +mla v26.4S, v17.4S, v31.s[0] +add v28.4s, v28.4s, v24.4s +str q14, [x0, #432] +ldr q14, [x0, #512] +sqrdmulh v24.4S, v14.4S, v29.s[0] +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v25.4s +str q6, [x0, #368] +ldr q6, [x0, #448] +add v15.4s, v15.4s, v25.4s +mul v11.4S, v11.4S,v30.s[0] +mul v5.4S, v5.4S,v30.s[0] +ldr q25, [x0, #384] +str q28, [x0, #304] +ldr q28, [x0, #320] +ldr q17, [x0, #256] +mla v11.4S, v4.4S, v31.s[0] +mla v5.4S, v19.4S, v31.s[0] +str q21, [x0, #240] +sub v21.4s, v10.4s, v0.4s +ldr q19, [x0, #192] +ldr q4, [x0, #128] +mul v14.4S, v14.4S,v30.s[0] +mul v12.4S, v12.4S,v30.s[0] +str q15, [x0, #176] +add v10.4s, v10.4s, v0.4s +ldr q0, [x0, #64] +ldr q15, [x0, #0] +mla v14.4S, v24.4S, v31.s[0] +mla v12.4S, v20.4S, v31.s[0] +sub v20.4s, v6.4s, v13.4s +add v6.4s, v6.4s, v13.4s +sqrdmulh v13.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v24.4s, v25.4s, v8.4s +add v25.4s, v25.4s, v8.4s +sqrdmulh v8.4S, v24.4S, v29.s[2] +mul v24.4S, v24.4S,v30.s[2] +sub v18.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sqrdmulh v26.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +sub v3.4s, v17.4s, v27.4s +add v17.4s, v17.4s, v27.4s +sqrdmulh v27.4S, v25.4S, v29.s[1] +mul v25.4S, v25.4S,v30.s[1] +sub v16.4s, v19.4s, v5.4s +add v19.4s, v19.4s, v5.4s +sqrdmulh v5.4S, v18.4S, v29.s[2] +mla v20.4S, v13.4S, v31.s[0] +sub v13.4s, v4.4s, v11.4s +add v4.4s, v4.4s, v11.4s +sqrdmulh v11.4S, v3.4S, v29.s[2] +mla v24.4S, v8.4S, v31.s[0] +sub v8.4s, v0.4s, v12.4s +add v0.4s, v0.4s, v12.4s +sqrdmulh v12.4S, v28.4S, v29.s[1] +mla v6.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v14.4s +str q21, [x0, #112] +sqrdmulh v21.4S, v17.4S, v29.s[1] +mla v25.4S, v27.4S, v31.s[0] +add v15.4s, v15.4s, v14.4s +str q10, [x0, #48] +mul v3.4S, v3.4S,v30.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v10.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +mla v3.4S, v11.4S, v31.s[0] +mla v18.4S, v5.4S, v31.s[0] +sub v5.4s, v13.4s, v24.4s +add v13.4s, v13.4s, v24.4s +mul v17.4S, v17.4S,v30.s[1] 
+mul v28.4S, v28.4S,v30.s[1] +sub v24.4s, v19.4s, v6.4s +add v19.4s, v19.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v28.4S, v12.4S, v31.s[0] +sub v12.4s, v4.4s, v25.4s +add v4.4s, v4.4s, v25.4s +sqrdmulh v25.4S, v10.4S, v22.s[3] +mul v10.4S, v10.4S,v23.s[3] +sub v21.4s, v8.4s, v18.4s +add v8.4s, v8.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v22.s[2] +mul v16.4S, v16.4S,v23.s[2] +sub v6.4s, v26.4s, v3.4s +add v26.4s, v26.4s, v3.4s +sqrdmulh v3.4S, v24.4S, v22.s[1] +mul v24.4S, v24.4S,v23.s[1] +sub v11.4s, v0.4s, v28.4s +add v0.4s, v0.4s, v28.4s +sqrdmulh v28.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v20.4s, v15.4s, v17.4s +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v22.s[3] +mla v10.4S, v25.4S, v31.s[0] +nop +nop +sqrdmulh v25.4S, v13.4S, v22.s[2] +mla v16.4S, v18.4S, v31.s[0] +nop +nop +sqrdmulh v18.4S, v12.4S, v22.s[1] +mla v24.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v4.4S, v22.s[0] +mla v19.4S, v28.4S, v31.s[0] +nop +nop +mul v13.4S, v13.4S,v23.s[2] +mul v5.4S, v5.4S,v23.s[3] +sub v28.4s, v21.4s, v10.4s +add v21.4s, v21.4s, v10.4s +mla v13.4S, v25.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +sub v17.4s, v8.4s, v16.4s +add v8.4s, v8.4s, v16.4s +mul v4.4S, v4.4S,v23.s[0] +mul v12.4S, v12.4S,v23.s[1] +sub v16.4s, v11.4s, v24.4s +add v11.4s, v11.4s, v24.4s +mla v4.4S, v3.4S, v31.s[0] +mla v12.4S, v18.4S, v31.s[0] +sub v18.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v28.4S, v9.s[3] +mul v28.4S, v28.4S,v1.s[3] +sub v3.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v21.4S, v9.s[2] +mul v21.4S, v21.4S,v1.s[2] +sub v24.4s, v26.4s, v13.4s +add v26.4s, v26.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v9.s[1] +mul v17.4S, v17.4S,v1.s[1] +sub v25.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +sqrdmulh v12.4S, v8.4S, v9.s[0] +mul v8.4S, v8.4S,v1.s[0] +sub v10.4s, v15.4s, v4.4s +add v15.4s, v15.4s, v4.4s +sqrdmulh v4.4S, v16.4S, v7.s[3] +mla v28.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v11.4S, v7.s[2] +mla v21.4S, v5.4S, 
v31.s[0] +nop +nop +sqrdmulh v5.4S, v18.4S, v7.s[1] +mla v17.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v0.4S, v7.s[0] +mla v8.4S, v12.4S, v31.s[0] +nop +nop +mul v11.4S, v11.4S,v2.s[2] +mul v16.4S, v16.4S,v2.s[3] +sub v12.4s, v3.4s, v28.4s +str q12, [x0, #960] +mla v11.4S, v19.4S, v31.s[0] +mla v16.4S, v4.4S, v31.s[0] +add v3.4s, v3.4s, v28.4s +str q3, [x0, #896] +mul v0.4S, v0.4S,v2.s[0] +mul v18.4S, v18.4S,v2.s[1] +sub v3.4s, v6.4s, v21.4s +str q3, [x0, #832] +mla v0.4S, v13.4S, v31.s[0] +mla v18.4S, v5.4S, v31.s[0] +add v6.4s, v6.4s, v21.4s +sub v21.4s, v24.4s, v17.4s +ldr q5, [x0, #976] +sqrdmulh v13.4S, v5.4S, v29.s[0] +mul v5.4S, v5.4S,v30.s[0] +add v24.4s, v24.4s, v17.4s +str q6, [x0, #768] +ldr q6, [x0, #912] +sqrdmulh v17.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v3.4s, v26.4s, v8.4s +str q21, [x0, #704] +ldr q21, [x0, #848] +sqrdmulh v28.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +add v26.4s, v26.4s, v8.4s +str q24, [x0, #640] +ldr q24, [x0, #784] +sqrdmulh v8.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +sub v4.4s, v25.4s, v16.4s +str q3, [x0, #576] +ldr q3, [x0, #720] +sqrdmulh v19.4S, v3.4S, v29.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v25.4s, v25.4s, v16.4s +str q26, [x0, #512] +ldr q26, [x0, #656] +sqrdmulh v16.4S, v26.4S, v29.s[0] +mla v6.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v11.4s +str q4, [x0, #448] +ldr q4, [x0, #592] +sqrdmulh v13.4S, v4.4S, v29.s[0] +mla v21.4S, v28.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q25, [x0, #384] +ldr q25, [x0, #528] +sqrdmulh v11.4S, v25.4S, v29.s[0] +mla v24.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v18.4s +str q17, [x0, #320] +ldr q17, [x0, #464] +add v10.4s, v10.4s, v18.4s +mul v26.4S, v26.4S,v30.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q18, [x0, #400] +str q20, [x0, #256] +ldr q20, [x0, #336] +ldr q28, [x0, #272] +mla v26.4S, v16.4S, v31.s[0] +mla v3.4S, v19.4S, v31.s[0] +str q8, [x0, #192] +sub v8.4s, v15.4s, v0.4s +ldr q19, [x0, #208] +ldr q16, [x0, #144] +mul v25.4S, 
v25.4S,v30.s[0] +mul v4.4S, v4.4S,v30.s[0] +str q10, [x0, #128] +add v15.4s, v15.4s, v0.4s +ldr q0, [x0, #80] +ldr q10, [x0, #16] +mla v25.4S, v11.4S, v31.s[0] +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v5.4s +add v17.4s, v17.4s, v5.4s +sqrdmulh v5.4S, v13.4S, v29.s[2] +mul v13.4S, v13.4S,v30.s[2] +sub v11.4s, v18.4s, v6.4s +add v18.4s, v18.4s, v6.4s +sqrdmulh v6.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v12.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v28.4s, v24.4s +add v28.4s, v28.4s, v24.4s +sqrdmulh v24.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v27.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sqrdmulh v3.4S, v12.4S, v29.s[2] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v16.4s, v26.4s +add v16.4s, v16.4s, v26.4s +sqrdmulh v26.4S, v14.4S, v29.s[2] +mla v11.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v4.4s +add v0.4s, v0.4s, v4.4s +sqrdmulh v4.4S, v20.4S, v29.s[1] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v25.4s +str q8, [x0, #64] +sqrdmulh v8.4S, v28.4S, v29.s[1] +mla v18.4S, v24.4S, v31.s[0] +add v10.4s, v10.4s, v25.4s +str q15, [x0, #0] +mul v14.4S, v14.4S,v30.s[2] +mul v12.4S, v12.4S,v30.s[2] +sub v15.4s, v27.4s, v13.4s +add v27.4s, v27.4s, v13.4s +mla v14.4S, v26.4S, v31.s[0] +mla v12.4S, v3.4S, v31.s[0] +sub v3.4s, v5.4s, v11.4s +add v5.4s, v5.4s, v11.4s +mul v28.4S, v28.4S,v30.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v11.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +mla v28.4S, v8.4S, v31.s[0] +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v16.4s, v18.4s +add v16.4s, v16.4s, v18.4s +sqrdmulh v29.4S, v15.4S, v22.s[3] +mul v15.4S, v15.4S,v23.s[3] +sub v30.4s, v6.4s, v12.4s +add v6.4s, v6.4s, v12.4s +sqrdmulh v12.4S, v27.4S, v22.s[2] +mul v27.4S, v27.4S,v23.s[2] +sub v18.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v22.s[1] +mul v11.4S, v11.4S,v23.s[1] +sub v8.4s, v0.4s, v20.4s +add v0.4s, v0.4s, v20.4s +sqrdmulh v20.4S, 
v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v17.4s, v10.4s, v28.4s +add v10.4s, v10.4s, v28.4s +sqrdmulh v28.4S, v3.4S, v22.s[3] +mla v15.4S, v29.4S, v31.s[0] +nop +nop +sqrdmulh v29.4S, v5.4S, v22.s[2] +mla v27.4S, v12.4S, v31.s[0] +nop +nop +sqrdmulh v12.4S, v4.4S, v22.s[1] +mla v11.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v16.4S, v22.s[0] +mla v19.4S, v20.4S, v31.s[0] +nop +nop +mul v5.4S, v5.4S,v23.s[2] +mul v3.4S, v3.4S,v23.s[3] +sub v20.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +mla v5.4S, v29.4S, v31.s[0] +mla v3.4S, v28.4S, v31.s[0] +sub v28.4s, v6.4s, v27.4s +add v6.4s, v6.4s, v27.4s +mul v16.4S, v16.4S,v23.s[0] +mul v4.4S, v4.4S,v23.s[1] +sub v27.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v16.4S, v14.4S, v31.s[0] +mla v4.4S, v12.4S, v31.s[0] +sub v12.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v22.4S, v20.4S, v9.s[3] +mul v20.4S, v20.4S,v1.s[3] +sub v23.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v30.4S, v9.s[2] +mul v30.4S, v30.4S,v1.s[2] +sub v19.4s, v21.4s, v5.4s +add v21.4s, v21.4s, v5.4s +sqrdmulh v5.4S, v28.4S, v9.s[1] +mul v28.4S, v28.4S,v1.s[1] +sub v14.4s, v17.4s, v4.4s +add v17.4s, v17.4s, v4.4s +sqrdmulh v4.4S, v6.4S, v9.s[0] +mul v6.4S, v6.4S,v1.s[0] +sub v11.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sqrdmulh v9.4S, v27.4S, v7.s[3] +mla v20.4S, v22.4S, v31.s[0] +nop +nop +sqrdmulh v22.4S, v8.4S, v7.s[2] +mla v30.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v12.4S, v7.s[1] +mla v28.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v0.4S, v7.s[0] +mla v6.4S, v4.4S, v31.s[0] +nop +nop +mul v8.4S, v8.4S,v2.s[2] +mul v27.4S, v27.4S,v2.s[3] +sub v4.4s, v23.4s, v20.4s +str q4, [x0, #976] +mla v8.4S, v22.4S, v31.s[0] +mla v27.4S, v9.4S, v31.s[0] +add v23.4s, v23.4s, v20.4s +str q23, [x0, #912] +mul v0.4S, v0.4S,v2.s[0] +mul v12.4S, v12.4S,v2.s[1] +sub v23.4s, v18.4s, v30.4s +str q23, [x0, #848] +mla v0.4S, v5.4S, v31.s[0] +mla v12.4S, v3.4S, v31.s[0] +add v18.4s, v18.4s, v30.4s +sub v30.4s, 
v19.4s, v28.4s +add v19.4s, v19.4s, v28.4s +str q18, [x0, #784] +sub v18.4s, v21.4s, v6.4s +str q30, [x0, #720] +add v21.4s, v21.4s, v6.4s +str q19, [x0, #656] +sub v19.4s, v14.4s, v27.4s +str q18, [x0, #592] +add v14.4s, v14.4s, v27.4s +str q21, [x0, #528] +sub v21.4s, v17.4s, v8.4s +str q19, [x0, #464] +add v17.4s, v17.4s, v8.4s +str q14, [x0, #400] +sub v14.4s, v11.4s, v12.4s +str q21, [x0, #336] +add v11.4s, v11.4s, v12.4s +str q17, [x0, #272] +sub v17.4s, v10.4s, v0.4s +add v10.4s, v10.4s, v0.4s +ldr q24, [x0, #32] +ldr q25, [x0, #48] +ldr q13, [x0, #96] +ldr q26, [x0, #112] +ldr q15, [x17, #+128] +ldr q29, [x17, #+144] +ldr q16, [x17, #+160] +ldr q1, [x17, #+176] +ldr q4, [x0, #160] +ldr q22, [x0, #176] +sqrdmulh v9.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v15.s[0] +ldr q20, [x0, #224] +sqrdmulh v23.4S, v25.4S, v29.s[0] +mul v25.4S, v25.4S,v15.s[0] +ldr q5, [x0, #240] +sqrdmulh v3.4S, v13.4S, v1.s[0] +mul v13.4S, v13.4S,v16.s[0] +ldr q2, [x17, #+192] +sqrdmulh v7.4S, v26.4S, v1.s[0] +mul v26.4S, v26.4S,v16.s[0] +ldr q28, [x17, #+208] +mla v24.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v4.4S, v28.s[0] +ldr q30, [x17, #+224] +mla v25.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v22.4S, v28.s[0] +ldr q6, [x17, #+240] +mla v13.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v20.4S, v6.s[0] +ldr q18, [x0, #0] +mla v26.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v5.4S, v6.s[0] +mul v4.4S, v4.4S,v2.s[0] +mul v22.4S, v22.4S,v2.s[0] +ldr q27, [x0, #64] +mla v4.4S, v9.4S, v31.s[0] +mla v22.4S, v23.4S, v31.s[0] +sub v23.4s, v18.4s, v24.4s +add v18.4s, v18.4s, v24.4s +mul v20.4S, v20.4S,v30.s[0] +mul v5.4S, v5.4S,v30.s[0] +sub v24.4s, v10.4s, v25.4s +ldr q9, [x0, #128] +add v10.4s, v10.4s, v25.4s +mla v20.4S, v3.4S, v31.s[0] +mla v5.4S, v7.4S, v31.s[0] +sub v7.4s, v27.4s, v13.4s +add v27.4s, v27.4s, v13.4s +sqrdmulh v13.4S, v10.4S, v29.s[1] +mul v10.4S, v10.4S,v15.s[1] +sub v3.4s, v17.4s, v26.4s +ldr q25, [x0, #192] +add v17.4s, v17.4s, v26.4s +sqrdmulh v26.4S, v24.4S, v29.s[2] +mul v24.4S, 
v24.4S,v15.s[2] +sub v19.4s, v9.4s, v4.4s +add v9.4s, v9.4s, v4.4s +sqrdmulh v29.4S, v17.4S, v1.s[1] +mul v17.4S, v17.4S,v16.s[1] +sub v4.4s, v11.4s, v22.4s +ldr q15, [x0, #288] +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v3.4S, v1.s[2] +mul v3.4S, v3.4S,v16.s[2] +sub v8.4s, v25.4s, v20.4s +ldr q21, [x0, #304] +add v25.4s, v25.4s, v20.4s +mla v10.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v11.4S, v28.s[1] +sub v1.4s, v14.4s, v5.4s +ldr q20, [x0, #352] +add v14.4s, v14.4s, v5.4s +mla v24.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v4.4S, v28.s[2] +sub v5.4s, v18.4s, v10.4s +ldr q16, [x0, #368] +str q5, [x0, #16] +mla v17.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v14.4S, v6.s[1] +add v18.4s, v18.4s, v10.4s +ldr q10, [x17, #+256] +str q18, [x0, #0] +mla v3.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v1.4S, v6.s[2] +sub v18.4s, v23.4s, v24.4s +ldr q5, [x17, #+272] +str q18, [x0, #48] +mul v11.4S, v11.4S,v2.s[1] +mul v4.4S, v4.4S,v2.s[2] +add v23.4s, v23.4s, v24.4s +ldr q24, [x17, #+288] +str q23, [x0, #32] +mla v11.4S, v13.4S, v31.s[0] +mla v4.4S, v26.4S, v31.s[0] +sub v26.4s, v27.4s, v17.4s +ldr q13, [x17, #+304] +str q26, [x0, #80] +mul v14.4S, v14.4S,v30.s[1] +mul v1.4S, v1.4S,v30.s[2] +add v27.4s, v27.4s, v17.4s +ldr q17, [x0, #416] +str q27, [x0, #64] +mla v14.4S, v29.4S, v31.s[0] +mla v1.4S, v22.4S, v31.s[0] +sub v22.4s, v7.4s, v3.4s +ldr q29, [x0, #432] +str q22, [x0, #112] +sqrdmulh v6.4S, v15.4S, v5.s[0] +mul v15.4S, v15.4S,v10.s[0] +add v7.4s, v7.4s, v3.4s +ldr q3, [x0, #480] +str q7, [x0, #96] +sqrdmulh v7.4S, v21.4S, v5.s[0] +mul v21.4S, v21.4S,v10.s[0] +sub v22.4s, v9.4s, v11.4s +ldr q30, [x0, #496] +str q22, [x0, #144] +sqrdmulh v22.4S, v20.4S, v13.s[0] +mul v20.4S, v20.4S,v24.s[0] +add v9.4s, v9.4s, v11.4s +ldr q11, [x17, #+320] +str q9, [x0, #128] +sqrdmulh v9.4S, v16.4S, v13.s[0] +mul v16.4S, v16.4S,v24.s[0] +sub v27.4s, v19.4s, v4.4s +ldr q28, [x17, #+336] +str q27, [x0, #176] +mla v15.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v17.4S, v28.s[0] +add v19.4s, v19.4s, v4.4s 
+ldr q4, [x17, #+352] +str q19, [x0, #160] +mla v21.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v29.4S, v28.s[0] +sub v19.4s, v25.4s, v14.4s +ldr q27, [x17, #+368] +str q19, [x0, #208] +mla v20.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v3.4S, v27.s[0] +add v25.4s, v25.4s, v14.4s +ldr q14, [x0, #256] +str q25, [x0, #192] +mla v16.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v30.4S, v27.s[0] +sub v25.4s, v8.4s, v1.4s +ldr q19, [x0, #272] +str q25, [x0, #240] +mul v17.4S, v17.4S,v11.s[0] +mul v29.4S, v29.4S,v11.s[0] +add v8.4s, v8.4s, v1.4s +ldr q1, [x0, #320] +str q8, [x0, #224] +mla v17.4S, v6.4S, v31.s[0] +mla v29.4S, v7.4S, v31.s[0] +sub v7.4s, v14.4s, v15.4s +ldr q6, [x0, #336] +add v14.4s, v14.4s, v15.4s +mul v3.4S, v3.4S,v4.s[0] +mul v30.4S, v30.4S,v4.s[0] +sub v15.4s, v19.4s, v21.4s +ldr q8, [x0, #384] +add v19.4s, v19.4s, v21.4s +mla v3.4S, v22.4S, v31.s[0] +mla v30.4S, v9.4S, v31.s[0] +sub v9.4s, v1.4s, v20.4s +ldr q22, [x0, #400] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v5.s[1] +mul v19.4S, v19.4S,v10.s[1] +sub v21.4s, v6.4s, v16.4s +ldr q25, [x0, #448] +add v6.4s, v6.4s, v16.4s +sqrdmulh v16.4S, v15.4S, v5.s[2] +mul v15.4S, v15.4S,v10.s[2] +sub v26.4s, v8.4s, v17.4s +ldr q2, [x0, #464] +add v8.4s, v8.4s, v17.4s +sqrdmulh v5.4S, v6.4S, v13.s[1] +mul v6.4S, v6.4S,v24.s[1] +sub v17.4s, v22.4s, v29.4s +ldr q10, [x0, #544] +add v22.4s, v22.4s, v29.4s +sqrdmulh v29.4S, v21.4S, v13.s[2] +mul v21.4S, v21.4S,v24.s[2] +sub v23.4s, v25.4s, v3.4s +ldr q18, [x0, #560] +add v25.4s, v25.4s, v3.4s +mla v19.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v22.4S, v28.s[1] +sub v13.4s, v2.4s, v30.4s +ldr q3, [x0, #608] +add v2.4s, v2.4s, v30.4s +mla v15.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v17.4S, v28.s[2] +sub v30.4s, v14.4s, v19.4s +ldr q24, [x0, #624] +str q30, [x0, #272] +mla v6.4S, v5.4S, v31.s[0] +sqrdmulh v5.4S, v2.4S, v27.s[1] +add v14.4s, v14.4s, v19.4s +ldr q19, [x17, #+384] +str q14, [x0, #256] +mla v21.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v13.4S, v27.s[2] +sub v14.4s, v7.4s, 
v15.4s +ldr q30, [x17, #+400] +str q14, [x0, #304] +mul v22.4S, v22.4S,v11.s[1] +mul v17.4S, v17.4S,v11.s[2] +add v7.4s, v7.4s, v15.4s +ldr q15, [x17, #+416] +str q7, [x0, #288] +mla v22.4S, v20.4S, v31.s[0] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v1.4s, v6.4s +ldr q20, [x17, #+432] +str q16, [x0, #336] +mul v2.4S, v2.4S,v4.s[1] +mul v13.4S, v13.4S,v4.s[2] +add v1.4s, v1.4s, v6.4s +ldr q6, [x0, #672] +str q1, [x0, #320] +mla v2.4S, v5.4S, v31.s[0] +mla v13.4S, v29.4S, v31.s[0] +sub v29.4s, v9.4s, v21.4s +ldr q5, [x0, #688] +str q29, [x0, #368] +sqrdmulh v27.4S, v10.4S, v30.s[0] +mul v10.4S, v10.4S,v19.s[0] +add v9.4s, v9.4s, v21.4s +ldr q21, [x0, #736] +str q9, [x0, #352] +sqrdmulh v9.4S, v18.4S, v30.s[0] +mul v18.4S, v18.4S,v19.s[0] +sub v29.4s, v8.4s, v22.4s +ldr q4, [x0, #752] +str q29, [x0, #400] +sqrdmulh v29.4S, v3.4S, v20.s[0] +mul v3.4S, v3.4S,v15.s[0] +add v8.4s, v8.4s, v22.4s +ldr q22, [x17, #+448] +str q8, [x0, #384] +sqrdmulh v8.4S, v24.4S, v20.s[0] +mul v24.4S, v24.4S,v15.s[0] +sub v1.4s, v26.4s, v17.4s +ldr q28, [x17, #+464] +str q1, [x0, #432] +mla v10.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v6.4S, v28.s[0] +add v26.4s, v26.4s, v17.4s +ldr q17, [x17, #+480] +str q26, [x0, #416] +mla v18.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v5.4S, v28.s[0] +sub v26.4s, v25.4s, v2.4s +ldr q1, [x17, #+496] +str q26, [x0, #464] +mla v3.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v21.4S, v1.s[0] +add v25.4s, v25.4s, v2.4s +ldr q2, [x0, #512] +str q25, [x0, #448] +mla v24.4S, v8.4S, v31.s[0] +sqrdmulh v8.4S, v4.4S, v1.s[0] +sub v25.4s, v23.4s, v13.4s +ldr q26, [x0, #528] +str q25, [x0, #496] +mul v6.4S, v6.4S,v22.s[0] +mul v5.4S, v5.4S,v22.s[0] +add v23.4s, v23.4s, v13.4s +ldr q13, [x0, #576] +str q23, [x0, #480] +mla v6.4S, v27.4S, v31.s[0] +mla v5.4S, v9.4S, v31.s[0] +sub v9.4s, v2.4s, v10.4s +ldr q27, [x0, #592] +add v2.4s, v2.4s, v10.4s +mul v21.4S, v21.4S,v17.s[0] +mul v4.4S, v4.4S,v17.s[0] +sub v10.4s, v26.4s, v18.4s +ldr q23, [x0, #640] +add v26.4s, v26.4s, v18.4s +mla 
v21.4S, v29.4S, v31.s[0] +mla v4.4S, v8.4S, v31.s[0] +sub v8.4s, v13.4s, v3.4s +ldr q29, [x0, #656] +add v13.4s, v13.4s, v3.4s +sqrdmulh v3.4S, v26.4S, v30.s[1] +mul v26.4S, v26.4S,v19.s[1] +sub v18.4s, v27.4s, v24.4s +ldr q25, [x0, #704] +add v27.4s, v27.4s, v24.4s +sqrdmulh v24.4S, v10.4S, v30.s[2] +mul v10.4S, v10.4S,v19.s[2] +sub v16.4s, v23.4s, v6.4s +ldr q11, [x0, #720] +add v23.4s, v23.4s, v6.4s +sqrdmulh v30.4S, v27.4S, v20.s[1] +mul v27.4S, v27.4S,v15.s[1] +sub v6.4s, v29.4s, v5.4s +ldr q19, [x0, #800] +add v29.4s, v29.4s, v5.4s +sqrdmulh v5.4S, v18.4S, v20.s[2] +mul v18.4S, v18.4S,v15.s[2] +sub v7.4s, v25.4s, v21.4s +ldr q14, [x0, #816] +add v25.4s, v25.4s, v21.4s +mla v26.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v29.4S, v28.s[1] +sub v20.4s, v11.4s, v4.4s +ldr q21, [x0, #864] +add v11.4s, v11.4s, v4.4s +mla v10.4S, v24.4S, v31.s[0] +sqrdmulh v24.4S, v6.4S, v28.s[2] +sub v4.4s, v2.4s, v26.4s +ldr q15, [x0, #880] +str q4, [x0, #528] +mla v27.4S, v30.4S, v31.s[0] +sqrdmulh v30.4S, v11.4S, v1.s[1] +add v2.4s, v2.4s, v26.4s +ldr q26, [x17, #+512] +str q2, [x0, #512] +mla v18.4S, v5.4S, v31.s[0] +sqrdmulh v5.4S, v20.4S, v1.s[2] +sub v2.4s, v9.4s, v10.4s +ldr q4, [x17, #+528] +str q2, [x0, #560] +mul v29.4S, v29.4S,v22.s[1] +mul v6.4S, v6.4S,v22.s[2] +add v9.4s, v9.4s, v10.4s +ldr q10, [x17, #+544] +str q9, [x0, #544] +mla v29.4S, v3.4S, v31.s[0] +mla v6.4S, v24.4S, v31.s[0] +sub v24.4s, v13.4s, v27.4s +ldr q3, [x17, #+560] +str q24, [x0, #592] +mul v11.4S, v11.4S,v17.s[1] +mul v20.4S, v20.4S,v17.s[2] +add v13.4s, v13.4s, v27.4s +ldr q27, [x0, #928] +str q13, [x0, #576] +mla v11.4S, v30.4S, v31.s[0] +mla v20.4S, v5.4S, v31.s[0] +sub v5.4s, v8.4s, v18.4s +ldr q30, [x0, #944] +str q5, [x0, #624] +sqrdmulh v1.4S, v19.4S, v4.s[0] +mul v19.4S, v19.4S,v26.s[0] +add v8.4s, v8.4s, v18.4s +ldr q18, [x0, #992] +str q8, [x0, #608] +sqrdmulh v8.4S, v14.4S, v4.s[0] +mul v14.4S, v14.4S,v26.s[0] +sub v5.4s, v23.4s, v29.4s +ldr q17, [x0, #1008] +str q5, [x0, #656] +sqrdmulh v5.4S, 
v21.4S, v3.s[0] +mul v21.4S, v21.4S,v10.s[0] +add v23.4s, v23.4s, v29.4s +ldr q29, [x17, #+576] +str q23, [x0, #640] +sqrdmulh v23.4S, v15.4S, v3.s[0] +mul v15.4S, v15.4S,v10.s[0] +sub v13.4s, v16.4s, v6.4s +ldr q28, [x17, #+592] +str q13, [x0, #688] +mla v19.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v27.4S, v28.s[0] +add v16.4s, v16.4s, v6.4s +ldr q6, [x17, #+608] +str q16, [x0, #672] +mla v14.4S, v8.4S, v31.s[0] +sqrdmulh v8.4S, v30.4S, v28.s[0] +sub v16.4s, v25.4s, v11.4s +ldr q13, [x17, #+624] +str q16, [x0, #720] +mla v21.4S, v5.4S, v31.s[0] +sqrdmulh v5.4S, v18.4S, v13.s[0] +add v25.4s, v25.4s, v11.4s +ldr q11, [x0, #768] +str q25, [x0, #704] +mla v15.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v17.4S, v13.s[0] +sub v25.4s, v7.4s, v20.4s +ldr q16, [x0, #784] +str q25, [x0, #752] +mul v27.4S, v27.4S,v29.s[0] +mul v30.4S, v30.4S,v29.s[0] +add v7.4s, v7.4s, v20.4s +ldr q20, [x0, #832] +str q7, [x0, #736] +mla v27.4S, v1.4S, v31.s[0] +mla v30.4S, v8.4S, v31.s[0] +sub v8.4s, v11.4s, v19.4s +ldr q1, [x0, #848] +add v11.4s, v11.4s, v19.4s +mul v18.4S, v18.4S,v6.s[0] +mul v17.4S, v17.4S,v6.s[0] +sub v19.4s, v16.4s, v14.4s +ldr q7, [x0, #896] +add v16.4s, v16.4s, v14.4s +mla v18.4S, v5.4S, v31.s[0] +mla v17.4S, v23.4S, v31.s[0] +sub v23.4s, v20.4s, v21.4s +ldr q5, [x0, #912] +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v16.4S, v4.s[1] +mul v16.4S, v16.4S,v26.s[1] +sub v14.4s, v1.4s, v15.4s +ldr q25, [x0, #960] +add v1.4s, v1.4s, v15.4s +sqrdmulh v15.4S, v19.4S, v4.s[2] +mul v19.4S, v19.4S,v26.s[2] +sub v24.4s, v7.4s, v27.4s +ldr q22, [x0, #976] +add v7.4s, v7.4s, v27.4s +sqrdmulh v4.4S, v1.4S, v3.s[1] +mul v1.4S, v1.4S,v10.s[1] +sub v27.4s, v5.4s, v30.4s +add v5.4s, v5.4s, v30.4s +sqrdmulh v30.4S, v14.4S, v3.s[2] +mul v14.4S, v14.4S,v10.s[2] +sub v26.4s, v25.4s, v18.4s +add v25.4s, v25.4s, v18.4s +mla v16.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v5.4S, v28.s[1] +sub v3.4s, v22.4s, v17.4s +add v22.4s, v22.4s, v17.4s +mla v19.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v27.4S, v28.s[2] 
+sub v17.4s, v11.4s, v16.4s +str q17, [x0, #784] +mla v1.4S, v4.4S, v31.s[0] +sqrdmulh v4.4S, v22.4S, v13.s[1] +add v11.4s, v11.4s, v16.4s +str q11, [x0, #768] +mla v14.4S, v30.4S, v31.s[0] +sqrdmulh v30.4S, v3.4S, v13.s[2] +sub v11.4s, v8.4s, v19.4s +str q11, [x0, #816] +mul v5.4S, v5.4S,v29.s[1] +mul v27.4S, v27.4S,v29.s[2] +add v8.4s, v8.4s, v19.4s +str q8, [x0, #800] +mla v5.4S, v21.4S, v31.s[0] +mla v27.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v1.4s +str q15, [x0, #848] +mul v22.4S, v22.4S,v6.s[1] +mul v3.4S, v3.4S,v6.s[2] +add v20.4s, v20.4s, v1.4s +str q20, [x0, #832] +mla v22.4S, v4.4S, v31.s[0] +mla v3.4S, v30.4S, v31.s[0] +sub v30.4s, v23.4s, v14.4s +str q30, [x0, #880] +add v23.4s, v23.4s, v14.4s +str q23, [x0, #864] +sub v23.4s, v7.4s, v5.4s +str q23, [x0, #912] +add v7.4s, v7.4s, v5.4s +str q7, [x0, #896] +sub v7.4s, v24.4s, v27.4s +str q7, [x0, #944] +add v24.4s, v24.4s, v27.4s +str q24, [x0, #928] +sub v24.4s, v25.4s, v22.4s +str q24, [x0, #976] +add v25.4s, v25.4s, v22.4s +str q25, [x0, #960] +sub v25.4s, v26.4s, v3.4s +str q25, [x0, #1008] +add v26.4s, v26.4s, v3.4s +str q26, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1520 +// Instruction count: 1516 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_12.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_12.s new file mode 100644 index 0000000..68e1c14 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_12.s @@ -0,0 +1,1550 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby 
granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: // 16-byte constant: word 0 holds -33556993 (the modulus named in this file's name, negated), words 1..3 are zero padding; presumably loaded as one vector whose lane 0 is the multiplier in the mla reduction steps - confirm at the load site +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global 
ntt_u32_incomplete_neon_asm_var_4_2_22_z4_12 +.global _ntt_u32_incomplete_neon_asm_var_4_2_22_z4_12 +ntt_u32_incomplete_neon_asm_var_4_2_22_z4_12: +_ntt_u32_incomplete_neon_asm_var_4_2_22_z4_12: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x0, #992] +sqrdmulh v27.4S, v28.4S, v29.s[0] +mul v28.4S, v28.4S,v30.s[0] +ldr q26, [x0, #928] +sqrdmulh v25.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +ldr q24, [x0, #864] +sqrdmulh v23.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +ldr q22, [x0, #800] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #736] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mla v28.4S, v27.4S, v31.s[0] +ldr q27, [x0, #672] +sqrdmulh v18.4S, v27.4S, v29.s[0] +mla v26.4S, v25.4S, v31.s[0] +ldr q25, [x0, #608] +sqrdmulh v17.4S, v25.4S, v29.s[0] +mla v24.4S, v23.4S, v31.s[0] +ldr q23, [x0, #544] +sqrdmulh v16.4S, v23.4S, v29.s[0] +mla v22.4S, v21.4S, v31.s[0] +ldr q21, [x0, #480] +mul v27.4S, v27.4S,v30.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q3, [x0, #416] +ldr q2, [x0, #352] +ldr q1, [x0, #288] +mla v27.4S, v18.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +ldr q19, [x0, #224] +ldr q18, [x0, #160] +mul v23.4S, v23.4S,v30.s[0] +mul v25.4S, v25.4S,v30.s[0] +ldr q0, [x0, #96] +ldr q15, [x0, #32] +mla v23.4S, v16.4S, v31.s[0] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +sqrdmulh v28.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v16.4s, v3.4s, v26.4s +add v3.4s, v3.4s, v26.4s +sqrdmulh v26.4S, v16.4S, v29.s[2] +mul v16.4S, 
v16.4S,v30.s[2] +sub v14.4s, v2.4s, v24.4s +add v2.4s, v2.4s, v24.4s +sqrdmulh v24.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v13.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v12.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +sqrdmulh v20.4S, v14.4S, v29.s[2] +mla v17.4S, v28.4S, v31.s[0] +sub v28.4s, v18.4s, v27.4s +add v18.4s, v18.4s, v27.4s +sqrdmulh v27.4S, v13.4S, v29.s[2] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v0.4s, v25.4s +add v0.4s, v0.4s, v25.4s +sqrdmulh v25.4S, v2.4S, v29.s[1] +mla v21.4S, v24.4S, v31.s[0] +sub v24.4s, v15.4s, v23.4s +sqrdmulh v11.4S, v1.4S, v29.s[1] +mla v3.4S, v22.4S, v31.s[0] +add v15.4s, v15.4s, v23.4s +ldr q23, [x17, #+32] +ldr q22, [x17, #+48] +mul v13.4S, v13.4S,v30.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v10.4s, v12.4s, v17.4s +add v12.4s, v12.4s, v17.4s +mla v13.4S, v27.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +sub v20.4s, v28.4s, v16.4s +add v28.4s, v28.4s, v16.4s +mul v1.4S, v1.4S,v30.s[1] +mul v2.4S, v2.4S,v30.s[1] +sub v16.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +mla v1.4S, v11.4S, v31.s[0] +mla v2.4S, v25.4S, v31.s[0] +sub v25.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v22.s[3] +mul v10.4S, v10.4S,v23.s[3] +sub v11.4s, v26.4s, v14.4s +add v26.4s, v26.4s, v14.4s +sqrdmulh v14.4S, v12.4S, v22.s[2] +mul v12.4S, v12.4S,v23.s[2] +sub v21.4s, v24.4s, v13.4s +add v24.4s, v24.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v22.s[1] +mul v16.4S, v16.4S,v23.s[1] +sub v27.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v17.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s +ldr q1, [x17, #+96] +ldr q9, [x17, #+112] +sqrdmulh v8.4S, v20.4S, v22.s[3] +mla v10.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v28.4S, v22.s[2] +mla v12.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v25.4S, v22.s[1] +mla v16.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v18.4S, v22.s[0] +mla 
v19.4S, v2.4S, v31.s[0] +nop +nop +ldr q2, [x17, #+64] +ldr q7, [x17, #+80] +mul v28.4S, v28.4S,v23.s[2] +mul v20.4S, v20.4S,v23.s[3] +sub v6.4s, v11.4s, v10.4s +add v11.4s, v11.4s, v10.4s +mla v28.4S, v3.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v26.4s, v12.4s +add v26.4s, v26.4s, v12.4s +mul v18.4S, v18.4S,v23.s[0] +mul v25.4S, v25.4S,v23.s[1] +sub v12.4s, v27.4s, v16.4s +add v27.4s, v27.4s, v16.4s +mla v18.4S, v13.4S, v31.s[0] +mla v25.4S, v14.4S, v31.s[0] +sub v14.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v9.s[3] +mul v6.4S, v6.4S,v1.s[3] +sub v13.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v11.4S, v9.s[2] +mul v11.4S, v11.4S,v1.s[2] +sub v16.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +sqrdmulh v28.4S, v8.4S, v9.s[1] +mul v8.4S, v8.4S,v1.s[1] +sub v3.4s, v17.4s, v25.4s +add v17.4s, v17.4s, v25.4s +sqrdmulh v25.4S, v26.4S, v9.s[0] +mul v26.4S, v26.4S,v1.s[0] +sub v10.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v7.s[3] +mla v6.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v27.4S, v7.s[2] +mla v11.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v14.4S, v7.s[1] +mla v8.4S, v28.4S, v31.s[0] +nop +nop +sqrdmulh v28.4S, v0.4S, v7.s[0] +mla v26.4S, v25.4S, v31.s[0] +nop +nop +mul v27.4S, v27.4S,v2.s[2] +mul v12.4S, v12.4S,v2.s[3] +sub v25.4s, v13.4s, v6.4s +str q25, [x0, #992] +mla v27.4S, v19.4S, v31.s[0] +mla v12.4S, v18.4S, v31.s[0] +add v13.4s, v13.4s, v6.4s +str q13, [x0, #928] +mul v0.4S, v0.4S,v2.s[0] +mul v14.4S, v14.4S,v2.s[1] +sub v13.4s, v21.4s, v11.4s +str q13, [x0, #864] +mla v0.4S, v28.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sub v11.4s, v16.4s, v8.4s +ldr q20, [x0, #1008] +sqrdmulh v28.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v8.4s +str q21, [x0, #800] +ldr q21, [x0, #944] +sqrdmulh v8.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v13.4s, v24.4s, v26.4s +str q11, [x0, #736] +ldr q11, [x0, 
#880] +sqrdmulh v6.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +add v24.4s, v24.4s, v26.4s +str q16, [x0, #672] +ldr q16, [x0, #816] +sqrdmulh v26.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v18.4s, v3.4s, v12.4s +str q13, [x0, #608] +ldr q13, [x0, #752] +sqrdmulh v19.4S, v13.4S, v29.s[0] +mla v20.4S, v28.4S, v31.s[0] +add v3.4s, v3.4s, v12.4s +str q24, [x0, #544] +ldr q24, [x0, #688] +sqrdmulh v12.4S, v24.4S, v29.s[0] +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v17.4s, v27.4s +str q18, [x0, #480] +ldr q18, [x0, #624] +sqrdmulh v28.4S, v18.4S, v29.s[0] +mla v11.4S, v6.4S, v31.s[0] +add v17.4s, v17.4s, v27.4s +str q3, [x0, #416] +ldr q3, [x0, #560] +sqrdmulh v27.4S, v3.4S, v29.s[0] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v10.4s, v14.4s +str q8, [x0, #352] +ldr q8, [x0, #496] +add v10.4s, v10.4s, v14.4s +mul v24.4S, v24.4S,v30.s[0] +mul v13.4S, v13.4S,v30.s[0] +ldr q14, [x0, #432] +str q17, [x0, #288] +ldr q17, [x0, #368] +ldr q6, [x0, #304] +mla v24.4S, v12.4S, v31.s[0] +mla v13.4S, v19.4S, v31.s[0] +str q26, [x0, #224] +sub v26.4s, v15.4s, v0.4s +ldr q19, [x0, #240] +ldr q12, [x0, #176] +mul v3.4S, v3.4S,v30.s[0] +mul v18.4S, v18.4S,v30.s[0] +str q10, [x0, #160] +add v15.4s, v15.4s, v0.4s +ldr q0, [x0, #112] +ldr q10, [x0, #48] +mla v3.4S, v27.4S, v31.s[0] +mla v18.4S, v28.4S, v31.s[0] +sub v28.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +sqrdmulh v20.4S, v28.4S, v29.s[2] +mul v28.4S, v28.4S,v30.s[2] +sub v27.4s, v14.4s, v21.4s +add v14.4s, v14.4s, v21.4s +sqrdmulh v21.4S, v27.4S, v29.s[2] +mul v27.4S, v27.4S,v30.s[2] +sub v25.4s, v17.4s, v11.4s +add v17.4s, v17.4s, v11.4s +sqrdmulh v11.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v5.4s, v6.4s, v16.4s +add v6.4s, v6.4s, v16.4s +sqrdmulh v16.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +sub v4.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +sqrdmulh v13.4S, v25.4S, v29.s[2] +mla v28.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v24.4s +add v12.4s, v12.4s, v24.4s +sqrdmulh v24.4S, v5.4S, 
v29.s[2] +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v18.4s +add v0.4s, v0.4s, v18.4s +sqrdmulh v18.4S, v17.4S, v29.s[1] +mla v8.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v3.4s +str q26, [x0, #96] +sqrdmulh v26.4S, v6.4S, v29.s[1] +mla v14.4S, v16.4S, v31.s[0] +add v10.4s, v10.4s, v3.4s +str q15, [x0, #32] +mul v5.4S, v5.4S,v30.s[2] +mul v25.4S, v25.4S,v30.s[2] +sub v15.4s, v4.4s, v28.4s +add v4.4s, v4.4s, v28.4s +mla v5.4S, v24.4S, v31.s[0] +mla v25.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v27.4s +add v20.4s, v20.4s, v27.4s +mul v6.4S, v6.4S,v30.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v27.4s, v19.4s, v8.4s +add v19.4s, v19.4s, v8.4s +mla v6.4S, v26.4S, v31.s[0] +mla v17.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v14.4s +add v12.4s, v12.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v22.s[3] +mul v15.4S, v15.4S,v23.s[3] +sub v26.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v4.4S, v22.s[2] +mul v4.4S, v4.4S,v23.s[2] +sub v8.4s, v11.4s, v5.4s +add v11.4s, v11.4s, v5.4s +sqrdmulh v5.4S, v27.4S, v22.s[1] +mul v27.4S, v27.4S,v23.s[1] +sub v24.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v28.4s, v10.4s, v6.4s +add v10.4s, v10.4s, v6.4s +sqrdmulh v6.4S, v13.4S, v22.s[3] +mla v15.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v20.4S, v22.s[2] +mla v4.4S, v25.4S, v31.s[0] +nop +nop +sqrdmulh v25.4S, v18.4S, v22.s[1] +mla v27.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v12.4S, v22.s[0] +mla v19.4S, v17.4S, v31.s[0] +nop +nop +mul v20.4S, v20.4S,v23.s[2] +mul v13.4S, v13.4S,v23.s[3] +sub v17.4s, v26.4s, v15.4s +add v26.4s, v26.4s, v15.4s +mla v20.4S, v14.4S, v31.s[0] +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v21.4s, v4.4s +add v21.4s, v21.4s, v4.4s +mul v12.4S, v12.4S,v23.s[0] +mul v18.4S, v18.4S,v23.s[1] +sub v4.4s, v24.4s, v27.4s +add v24.4s, v24.4s, v27.4s +mla v12.4S, v5.4S, v31.s[0] +mla v18.4S, v25.4S, v31.s[0] +sub v25.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, 
v17.4S, v9.s[3] +mul v17.4S, v17.4S,v1.s[3] +sub v5.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +sqrdmulh v13.4S, v26.4S, v9.s[2] +mul v26.4S, v26.4S,v1.s[2] +sub v27.4s, v11.4s, v20.4s +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v6.4S, v9.s[1] +mul v6.4S, v6.4S,v1.s[1] +sub v14.4s, v28.4s, v18.4s +add v28.4s, v28.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v9.s[0] +mul v21.4S, v21.4S,v1.s[0] +sub v15.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v4.4S, v7.s[3] +mla v17.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v24.4S, v7.s[2] +mla v26.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v25.4S, v7.s[1] +mla v6.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v0.4S, v7.s[0] +mla v21.4S, v18.4S, v31.s[0] +nop +nop +mul v24.4S, v24.4S,v2.s[2] +mul v4.4S, v4.4S,v2.s[3] +sub v18.4s, v5.4s, v17.4s +str q18, [x0, #1008] +mla v24.4S, v19.4S, v31.s[0] +mla v4.4S, v12.4S, v31.s[0] +add v5.4s, v5.4s, v17.4s +str q5, [x0, #944] +mul v0.4S, v0.4S,v2.s[0] +mul v25.4S, v25.4S,v2.s[1] +sub v5.4s, v8.4s, v26.4s +str q5, [x0, #880] +mla v0.4S, v20.4S, v31.s[0] +mla v25.4S, v13.4S, v31.s[0] +add v8.4s, v8.4s, v26.4s +sub v26.4s, v27.4s, v6.4s +ldr q13, [x0, #960] +sqrdmulh v20.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +add v27.4s, v27.4s, v6.4s +str q8, [x0, #816] +ldr q8, [x0, #896] +sqrdmulh v6.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v5.4s, v11.4s, v21.4s +str q26, [x0, #752] +ldr q26, [x0, #832] +sqrdmulh v17.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +add v11.4s, v11.4s, v21.4s +str q27, [x0, #688] +ldr q27, [x0, #768] +sqrdmulh v21.4S, v27.4S, v29.s[0] +mul v27.4S, v27.4S,v30.s[0] +sub v12.4s, v14.4s, v4.4s +str q5, [x0, #624] +ldr q5, [x0, #704] +sqrdmulh v19.4S, v5.4S, v29.s[0] +mla v13.4S, v20.4S, v31.s[0] +add v14.4s, v14.4s, v4.4s +str q11, [x0, #560] +ldr q11, [x0, #640] +sqrdmulh v4.4S, v11.4S, v29.s[0] +mla v8.4S, v6.4S, v31.s[0] +sub v6.4s, v28.4s, v24.4s +str q12, [x0, #496] +ldr q12, [x0, #576] +sqrdmulh v20.4S, v12.4S, 
v29.s[0] +mla v26.4S, v17.4S, v31.s[0] +add v28.4s, v28.4s, v24.4s +str q14, [x0, #432] +ldr q14, [x0, #512] +sqrdmulh v24.4S, v14.4S, v29.s[0] +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v25.4s +str q6, [x0, #368] +ldr q6, [x0, #448] +add v15.4s, v15.4s, v25.4s +mul v11.4S, v11.4S,v30.s[0] +mul v5.4S, v5.4S,v30.s[0] +ldr q25, [x0, #384] +str q28, [x0, #304] +ldr q28, [x0, #320] +ldr q17, [x0, #256] +mla v11.4S, v4.4S, v31.s[0] +mla v5.4S, v19.4S, v31.s[0] +str q21, [x0, #240] +sub v21.4s, v10.4s, v0.4s +ldr q19, [x0, #192] +ldr q4, [x0, #128] +mul v14.4S, v14.4S,v30.s[0] +mul v12.4S, v12.4S,v30.s[0] +str q15, [x0, #176] +add v10.4s, v10.4s, v0.4s +ldr q0, [x0, #64] +ldr q15, [x0, #0] +mla v14.4S, v24.4S, v31.s[0] +mla v12.4S, v20.4S, v31.s[0] +sub v20.4s, v6.4s, v13.4s +add v6.4s, v6.4s, v13.4s +sqrdmulh v13.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v24.4s, v25.4s, v8.4s +add v25.4s, v25.4s, v8.4s +sqrdmulh v8.4S, v24.4S, v29.s[2] +mul v24.4S, v24.4S,v30.s[2] +sub v18.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sqrdmulh v26.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +sub v3.4s, v17.4s, v27.4s +add v17.4s, v17.4s, v27.4s +sqrdmulh v27.4S, v25.4S, v29.s[1] +mul v25.4S, v25.4S,v30.s[1] +sub v16.4s, v19.4s, v5.4s +add v19.4s, v19.4s, v5.4s +sqrdmulh v5.4S, v18.4S, v29.s[2] +mla v20.4S, v13.4S, v31.s[0] +sub v13.4s, v4.4s, v11.4s +add v4.4s, v4.4s, v11.4s +sqrdmulh v11.4S, v3.4S, v29.s[2] +mla v24.4S, v8.4S, v31.s[0] +sub v8.4s, v0.4s, v12.4s +add v0.4s, v0.4s, v12.4s +sqrdmulh v12.4S, v28.4S, v29.s[1] +mla v6.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v14.4s +str q21, [x0, #112] +sqrdmulh v21.4S, v17.4S, v29.s[1] +mla v25.4S, v27.4S, v31.s[0] +add v15.4s, v15.4s, v14.4s +str q10, [x0, #48] +mul v3.4S, v3.4S,v30.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v10.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +mla v3.4S, v11.4S, v31.s[0] +mla v18.4S, v5.4S, v31.s[0] +sub v5.4s, v13.4s, v24.4s +add v13.4s, v13.4s, v24.4s +mul v17.4S, v17.4S,v30.s[1] 
+mul v28.4S, v28.4S,v30.s[1] +sub v24.4s, v19.4s, v6.4s +add v19.4s, v19.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v28.4S, v12.4S, v31.s[0] +sub v12.4s, v4.4s, v25.4s +add v4.4s, v4.4s, v25.4s +sqrdmulh v25.4S, v10.4S, v22.s[3] +mul v10.4S, v10.4S,v23.s[3] +sub v21.4s, v8.4s, v18.4s +add v8.4s, v8.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v22.s[2] +mul v16.4S, v16.4S,v23.s[2] +sub v6.4s, v26.4s, v3.4s +add v26.4s, v26.4s, v3.4s +sqrdmulh v3.4S, v24.4S, v22.s[1] +mul v24.4S, v24.4S,v23.s[1] +sub v11.4s, v0.4s, v28.4s +add v0.4s, v0.4s, v28.4s +sqrdmulh v28.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v20.4s, v15.4s, v17.4s +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v22.s[3] +mla v10.4S, v25.4S, v31.s[0] +nop +nop +sqrdmulh v25.4S, v13.4S, v22.s[2] +mla v16.4S, v18.4S, v31.s[0] +nop +nop +sqrdmulh v18.4S, v12.4S, v22.s[1] +mla v24.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v4.4S, v22.s[0] +mla v19.4S, v28.4S, v31.s[0] +nop +nop +mul v13.4S, v13.4S,v23.s[2] +mul v5.4S, v5.4S,v23.s[3] +sub v28.4s, v21.4s, v10.4s +add v21.4s, v21.4s, v10.4s +mla v13.4S, v25.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +sub v17.4s, v8.4s, v16.4s +add v8.4s, v8.4s, v16.4s +mul v4.4S, v4.4S,v23.s[0] +mul v12.4S, v12.4S,v23.s[1] +sub v16.4s, v11.4s, v24.4s +add v11.4s, v11.4s, v24.4s +mla v4.4S, v3.4S, v31.s[0] +mla v12.4S, v18.4S, v31.s[0] +sub v18.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v28.4S, v9.s[3] +mul v28.4S, v28.4S,v1.s[3] +sub v3.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v21.4S, v9.s[2] +mul v21.4S, v21.4S,v1.s[2] +sub v24.4s, v26.4s, v13.4s +add v26.4s, v26.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v9.s[1] +mul v17.4S, v17.4S,v1.s[1] +sub v25.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +sqrdmulh v12.4S, v8.4S, v9.s[0] +mul v8.4S, v8.4S,v1.s[0] +sub v10.4s, v15.4s, v4.4s +add v15.4s, v15.4s, v4.4s +sqrdmulh v4.4S, v16.4S, v7.s[3] +mla v28.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v11.4S, v7.s[2] +mla v21.4S, v5.4S, 
v31.s[0] +nop +nop +sqrdmulh v5.4S, v18.4S, v7.s[1] +mla v17.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v0.4S, v7.s[0] +mla v8.4S, v12.4S, v31.s[0] +nop +nop +mul v11.4S, v11.4S,v2.s[2] +mul v16.4S, v16.4S,v2.s[3] +sub v12.4s, v3.4s, v28.4s +str q12, [x0, #960] +mla v11.4S, v19.4S, v31.s[0] +mla v16.4S, v4.4S, v31.s[0] +add v3.4s, v3.4s, v28.4s +str q3, [x0, #896] +mul v0.4S, v0.4S,v2.s[0] +mul v18.4S, v18.4S,v2.s[1] +sub v3.4s, v6.4s, v21.4s +str q3, [x0, #832] +mla v0.4S, v13.4S, v31.s[0] +mla v18.4S, v5.4S, v31.s[0] +add v6.4s, v6.4s, v21.4s +sub v21.4s, v24.4s, v17.4s +ldr q5, [x0, #976] +sqrdmulh v13.4S, v5.4S, v29.s[0] +mul v5.4S, v5.4S,v30.s[0] +add v24.4s, v24.4s, v17.4s +str q6, [x0, #768] +ldr q6, [x0, #912] +sqrdmulh v17.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v3.4s, v26.4s, v8.4s +str q21, [x0, #704] +ldr q21, [x0, #848] +sqrdmulh v28.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +add v26.4s, v26.4s, v8.4s +str q24, [x0, #640] +ldr q24, [x0, #784] +sqrdmulh v8.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +sub v4.4s, v25.4s, v16.4s +str q3, [x0, #576] +ldr q3, [x0, #720] +sqrdmulh v19.4S, v3.4S, v29.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v25.4s, v25.4s, v16.4s +str q26, [x0, #512] +ldr q26, [x0, #656] +sqrdmulh v16.4S, v26.4S, v29.s[0] +mla v6.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v11.4s +str q4, [x0, #448] +ldr q4, [x0, #592] +sqrdmulh v13.4S, v4.4S, v29.s[0] +mla v21.4S, v28.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q25, [x0, #384] +ldr q25, [x0, #528] +sqrdmulh v11.4S, v25.4S, v29.s[0] +mla v24.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v18.4s +str q17, [x0, #320] +ldr q17, [x0, #464] +add v10.4s, v10.4s, v18.4s +mul v26.4S, v26.4S,v30.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q18, [x0, #400] +str q20, [x0, #256] +ldr q20, [x0, #336] +ldr q28, [x0, #272] +mla v26.4S, v16.4S, v31.s[0] +mla v3.4S, v19.4S, v31.s[0] +str q8, [x0, #192] +sub v8.4s, v15.4s, v0.4s +ldr q19, [x0, #208] +ldr q16, [x0, #144] +mul v25.4S, 
v25.4S,v30.s[0] +mul v4.4S, v4.4S,v30.s[0] +str q10, [x0, #128] +add v15.4s, v15.4s, v0.4s +ldr q0, [x0, #80] +ldr q10, [x0, #16] +mla v25.4S, v11.4S, v31.s[0] +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v5.4s +add v17.4s, v17.4s, v5.4s +sqrdmulh v5.4S, v13.4S, v29.s[2] +mul v13.4S, v13.4S,v30.s[2] +sub v11.4s, v18.4s, v6.4s +add v18.4s, v18.4s, v6.4s +sqrdmulh v6.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v12.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v28.4s, v24.4s +add v28.4s, v28.4s, v24.4s +sqrdmulh v24.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v27.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sqrdmulh v3.4S, v12.4S, v29.s[2] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v16.4s, v26.4s +add v16.4s, v16.4s, v26.4s +sqrdmulh v26.4S, v14.4S, v29.s[2] +mla v11.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v4.4s +add v0.4s, v0.4s, v4.4s +sqrdmulh v4.4S, v20.4S, v29.s[1] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v25.4s +str q8, [x0, #64] +sqrdmulh v8.4S, v28.4S, v29.s[1] +mla v18.4S, v24.4S, v31.s[0] +add v10.4s, v10.4s, v25.4s +str q15, [x0, #0] +mul v14.4S, v14.4S,v30.s[2] +mul v12.4S, v12.4S,v30.s[2] +sub v15.4s, v27.4s, v13.4s +add v27.4s, v27.4s, v13.4s +mla v14.4S, v26.4S, v31.s[0] +mla v12.4S, v3.4S, v31.s[0] +sub v3.4s, v5.4s, v11.4s +add v5.4s, v5.4s, v11.4s +mul v28.4S, v28.4S,v30.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v11.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +mla v28.4S, v8.4S, v31.s[0] +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v16.4s, v18.4s +add v16.4s, v16.4s, v18.4s +sqrdmulh v29.4S, v15.4S, v22.s[3] +mul v15.4S, v15.4S,v23.s[3] +sub v30.4s, v6.4s, v12.4s +add v6.4s, v6.4s, v12.4s +sqrdmulh v12.4S, v27.4S, v22.s[2] +mul v27.4S, v27.4S,v23.s[2] +sub v18.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v22.s[1] +mul v11.4S, v11.4S,v23.s[1] +sub v8.4s, v0.4s, v20.4s +add v0.4s, v0.4s, v20.4s +sqrdmulh v20.4S, 
v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v17.4s, v10.4s, v28.4s +add v10.4s, v10.4s, v28.4s +sqrdmulh v28.4S, v3.4S, v22.s[3] +mla v15.4S, v29.4S, v31.s[0] +nop +nop +sqrdmulh v29.4S, v5.4S, v22.s[2] +mla v27.4S, v12.4S, v31.s[0] +nop +nop +sqrdmulh v12.4S, v4.4S, v22.s[1] +mla v11.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v16.4S, v22.s[0] +mla v19.4S, v20.4S, v31.s[0] +nop +nop +mul v5.4S, v5.4S,v23.s[2] +mul v3.4S, v3.4S,v23.s[3] +sub v20.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +mla v5.4S, v29.4S, v31.s[0] +mla v3.4S, v28.4S, v31.s[0] +sub v28.4s, v6.4s, v27.4s +add v6.4s, v6.4s, v27.4s +mul v16.4S, v16.4S,v23.s[0] +mul v4.4S, v4.4S,v23.s[1] +sub v27.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v16.4S, v14.4S, v31.s[0] +mla v4.4S, v12.4S, v31.s[0] +sub v12.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v22.4S, v20.4S, v9.s[3] +mul v20.4S, v20.4S,v1.s[3] +sub v23.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v30.4S, v9.s[2] +mul v30.4S, v30.4S,v1.s[2] +sub v19.4s, v21.4s, v5.4s +add v21.4s, v21.4s, v5.4s +sqrdmulh v5.4S, v28.4S, v9.s[1] +mul v28.4S, v28.4S,v1.s[1] +sub v14.4s, v17.4s, v4.4s +add v17.4s, v17.4s, v4.4s +sqrdmulh v4.4S, v6.4S, v9.s[0] +mul v6.4S, v6.4S,v1.s[0] +sub v11.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sqrdmulh v9.4S, v27.4S, v7.s[3] +mla v20.4S, v22.4S, v31.s[0] +nop +nop +sqrdmulh v22.4S, v8.4S, v7.s[2] +mla v30.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v12.4S, v7.s[1] +mla v28.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v0.4S, v7.s[0] +mla v6.4S, v4.4S, v31.s[0] +nop +nop +mul v8.4S, v8.4S,v2.s[2] +mul v27.4S, v27.4S,v2.s[3] +sub v4.4s, v23.4s, v20.4s +str q4, [x0, #976] +mla v8.4S, v22.4S, v31.s[0] +mla v27.4S, v9.4S, v31.s[0] +add v23.4s, v23.4s, v20.4s +str q23, [x0, #912] +mul v0.4S, v0.4S,v2.s[0] +mul v12.4S, v12.4S,v2.s[1] +sub v23.4s, v18.4s, v30.4s +str q23, [x0, #848] +mla v0.4S, v5.4S, v31.s[0] +mla v12.4S, v3.4S, v31.s[0] +add v18.4s, v18.4s, v30.4s +sub v30.4s, 
v19.4s, v28.4s +add v19.4s, v19.4s, v28.4s +str q18, [x0, #784] +sub v18.4s, v21.4s, v6.4s +str q30, [x0, #720] +add v21.4s, v21.4s, v6.4s +str q19, [x0, #656] +sub v19.4s, v14.4s, v27.4s +str q18, [x0, #592] +add v14.4s, v14.4s, v27.4s +str q21, [x0, #528] +sub v21.4s, v17.4s, v8.4s +str q19, [x0, #464] +add v17.4s, v17.4s, v8.4s +str q14, [x0, #400] +sub v14.4s, v11.4s, v12.4s +str q21, [x0, #336] +add v11.4s, v11.4s, v12.4s +str q17, [x0, #272] +sub v17.4s, v10.4s, v0.4s +add v10.4s, v10.4s, v0.4s +ldr q24, [x0, #48] +ldr q25, [x0, #32] +ldr q13, [x0, #112] +ldr q26, [x0, #96] +ldr q15, [x17, #+128] +ldr q29, [x17, #+144] +ldr q16, [x17, #+160] +ldr q1, [x17, #+176] +ldr q4, [x0, #176] +ldr q22, [x0, #160] +sqrdmulh v9.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v15.s[0] +ldr q20, [x0, #240] +sqrdmulh v23.4S, v25.4S, v29.s[0] +mul v25.4S, v25.4S,v15.s[0] +ldr q5, [x0, #224] +sqrdmulh v3.4S, v13.4S, v1.s[0] +mul v13.4S, v13.4S,v16.s[0] +ldr q2, [x17, #+192] +sqrdmulh v7.4S, v26.4S, v1.s[0] +mul v26.4S, v26.4S,v16.s[0] +ldr q28, [x17, #+208] +mla v24.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v4.4S, v28.s[0] +ldr q30, [x17, #+224] +mla v25.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v22.4S, v28.s[0] +ldr q6, [x17, #+240] +mla v13.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v20.4S, v6.s[0] +mla v26.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v5.4S, v6.s[0] +ldr q18, [x0, #0] +mul v4.4S, v4.4S,v2.s[0] +mul v22.4S, v22.4S,v2.s[0] +mla v4.4S, v9.4S, v31.s[0] +mla v22.4S, v23.4S, v31.s[0] +sub v23.4s, v10.4s, v24.4s +ldr q9, [x0, #64] +add v10.4s, v10.4s, v24.4s +mul v20.4S, v20.4S,v30.s[0] +mul v5.4S, v5.4S,v30.s[0] +sub v24.4s, v18.4s, v25.4s +add v18.4s, v18.4s, v25.4s +mla v20.4S, v3.4S, v31.s[0] +mla v5.4S, v7.4S, v31.s[0] +sub v7.4s, v17.4s, v13.4s +ldr q3, [x0, #128] +add v17.4s, v17.4s, v13.4s +sqrdmulh v13.4S, v23.4S, v29.s[2] +mul v23.4S, v23.4S,v15.s[2] +sub v25.4s, v9.4s, v26.4s +add v9.4s, v9.4s, v26.4s +sqrdmulh v26.4S, v10.4S, v29.s[1] +mul v10.4S, v10.4S,v15.s[1] +sub v27.4s, 
v11.4s, v4.4s +ldr q19, [x0, #192] +add v11.4s, v11.4s, v4.4s +sqrdmulh v29.4S, v7.4S, v1.s[2] +mul v7.4S, v7.4S,v16.s[2] +sub v4.4s, v3.4s, v22.4s +ldr q15, [x0, #304] +add v3.4s, v3.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v1.s[1] +mul v17.4S, v17.4S,v16.s[1] +sub v8.4s, v14.4s, v20.4s +ldr q21, [x0, #288] +add v14.4s, v14.4s, v20.4s +mla v23.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v27.4S, v28.s[2] +sub v1.4s, v19.4s, v5.4s +ldr q20, [x0, #368] +add v19.4s, v19.4s, v5.4s +mla v10.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v11.4S, v28.s[1] +sub v5.4s, v24.4s, v23.4s +ldr q16, [x0, #352] +str q5, [x0, #48] +mla v7.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v8.4S, v6.s[2] +add v24.4s, v24.4s, v23.4s +ldr q23, [x17, #+256] +str q24, [x0, #32] +mla v17.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v14.4S, v6.s[1] +sub v24.4s, v18.4s, v10.4s +ldr q5, [x17, #+272] +str q24, [x0, #16] +mul v27.4S, v27.4S,v2.s[2] +mul v11.4S, v11.4S,v2.s[1] +add v18.4s, v18.4s, v10.4s +ldr q10, [x17, #+288] +str q18, [x0, #0] +mla v27.4S, v13.4S, v31.s[0] +mla v11.4S, v26.4S, v31.s[0] +sub v26.4s, v25.4s, v7.4s +ldr q13, [x17, #+304] +str q26, [x0, #112] +mul v8.4S, v8.4S,v30.s[2] +mul v14.4S, v14.4S,v30.s[1] +add v25.4s, v25.4s, v7.4s +ldr q7, [x0, #432] +str q25, [x0, #96] +mla v8.4S, v29.4S, v31.s[0] +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v9.4s, v17.4s +ldr q29, [x0, #416] +str q22, [x0, #80] +sqrdmulh v6.4S, v15.4S, v5.s[0] +mul v15.4S, v15.4S,v23.s[0] +add v9.4s, v9.4s, v17.4s +ldr q17, [x0, #496] +str q9, [x0, #64] +sqrdmulh v9.4S, v21.4S, v5.s[0] +mul v21.4S, v21.4S,v23.s[0] +sub v22.4s, v4.4s, v27.4s +ldr q30, [x0, #480] +str q22, [x0, #176] +sqrdmulh v22.4S, v20.4S, v13.s[0] +mul v20.4S, v20.4S,v10.s[0] +add v4.4s, v4.4s, v27.4s +ldr q27, [x17, #+320] +str q4, [x0, #160] +sqrdmulh v4.4S, v16.4S, v13.s[0] +mul v16.4S, v16.4S,v10.s[0] +sub v25.4s, v3.4s, v11.4s +ldr q28, [x17, #+336] +str q25, [x0, #144] +mla v15.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v7.4S, v28.s[0] +add v3.4s, v3.4s, v11.4s +ldr 
q11, [x17, #+352] +str q3, [x0, #128] +mla v21.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v29.4S, v28.s[0] +sub v3.4s, v1.4s, v8.4s +ldr q25, [x17, #+368] +str q3, [x0, #240] +mla v20.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v17.4S, v25.s[0] +add v1.4s, v1.4s, v8.4s +ldr q8, [x0, #272] +str q1, [x0, #224] +mla v16.4S, v4.4S, v31.s[0] +sqrdmulh v4.4S, v30.4S, v25.s[0] +sub v1.4s, v19.4s, v14.4s +ldr q3, [x0, #256] +str q1, [x0, #208] +mul v7.4S, v7.4S,v27.s[0] +mul v29.4S, v29.4S,v27.s[0] +add v19.4s, v19.4s, v14.4s +ldr q14, [x0, #336] +str q19, [x0, #192] +mla v7.4S, v6.4S, v31.s[0] +mla v29.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v15.4s +ldr q6, [x0, #320] +add v8.4s, v8.4s, v15.4s +mul v17.4S, v17.4S,v11.s[0] +mul v30.4S, v30.4S,v11.s[0] +sub v15.4s, v3.4s, v21.4s +ldr q19, [x0, #400] +add v3.4s, v3.4s, v21.4s +mla v17.4S, v22.4S, v31.s[0] +mla v30.4S, v4.4S, v31.s[0] +sub v4.4s, v14.4s, v20.4s +ldr q22, [x0, #384] +add v14.4s, v14.4s, v20.4s +sqrdmulh v20.4S, v9.4S, v5.s[2] +mul v9.4S, v9.4S,v23.s[2] +sub v21.4s, v6.4s, v16.4s +ldr q1, [x0, #464] +add v6.4s, v6.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v5.s[1] +mul v8.4S, v8.4S,v23.s[1] +sub v26.4s, v19.4s, v7.4s +ldr q2, [x0, #448] +add v19.4s, v19.4s, v7.4s +sqrdmulh v5.4S, v4.4S, v13.s[2] +mul v4.4S, v4.4S,v10.s[2] +sub v7.4s, v22.4s, v29.4s +ldr q23, [x0, #560] +add v22.4s, v22.4s, v29.4s +sqrdmulh v29.4S, v14.4S, v13.s[1] +mul v14.4S, v14.4S,v10.s[1] +sub v18.4s, v1.4s, v17.4s +ldr q24, [x0, #544] +add v1.4s, v1.4s, v17.4s +mla v9.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v26.4S, v28.s[2] +sub v13.4s, v2.4s, v30.4s +ldr q17, [x0, #624] +add v2.4s, v2.4s, v30.4s +mla v8.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v19.4S, v28.s[1] +sub v30.4s, v15.4s, v9.4s +ldr q10, [x0, #608] +str q30, [x0, #304] +mla v4.4S, v5.4S, v31.s[0] +sqrdmulh v5.4S, v18.4S, v25.s[2] +add v15.4s, v15.4s, v9.4s +ldr q9, [x17, #+384] +str q15, [x0, #288] +mla v14.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v1.4S, v25.s[1] +sub v15.4s, v3.4s, v8.4s +ldr q30, 
[x17, #+400] +str q15, [x0, #272] +mul v26.4S, v26.4S,v27.s[2] +mul v19.4S, v19.4S,v27.s[1] +add v3.4s, v3.4s, v8.4s +ldr q8, [x17, #+416] +str q3, [x0, #256] +mla v26.4S, v20.4S, v31.s[0] +mla v19.4S, v16.4S, v31.s[0] +sub v16.4s, v21.4s, v4.4s +ldr q20, [x17, #+432] +str q16, [x0, #368] +mul v18.4S, v18.4S,v11.s[2] +mul v1.4S, v1.4S,v11.s[1] +add v21.4s, v21.4s, v4.4s +ldr q4, [x0, #688] +str q21, [x0, #352] +mla v18.4S, v5.4S, v31.s[0] +mla v1.4S, v29.4S, v31.s[0] +sub v29.4s, v6.4s, v14.4s +ldr q5, [x0, #672] +str q29, [x0, #336] +sqrdmulh v25.4S, v23.4S, v30.s[0] +mul v23.4S, v23.4S,v9.s[0] +add v6.4s, v6.4s, v14.4s +ldr q14, [x0, #752] +str q6, [x0, #320] +sqrdmulh v6.4S, v24.4S, v30.s[0] +mul v24.4S, v24.4S,v9.s[0] +sub v29.4s, v7.4s, v26.4s +ldr q11, [x0, #736] +str q29, [x0, #432] +sqrdmulh v29.4S, v17.4S, v20.s[0] +mul v17.4S, v17.4S,v8.s[0] +add v7.4s, v7.4s, v26.4s +ldr q26, [x17, #+448] +str q7, [x0, #416] +sqrdmulh v7.4S, v10.4S, v20.s[0] +mul v10.4S, v10.4S,v8.s[0] +sub v21.4s, v22.4s, v19.4s +ldr q28, [x17, #+464] +str q21, [x0, #400] +mla v23.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v4.4S, v28.s[0] +add v22.4s, v22.4s, v19.4s +ldr q19, [x17, #+480] +str q22, [x0, #384] +mla v24.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v5.4S, v28.s[0] +sub v22.4s, v13.4s, v18.4s +ldr q21, [x17, #+496] +str q22, [x0, #496] +mla v17.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v14.4S, v21.s[0] +add v13.4s, v13.4s, v18.4s +ldr q18, [x0, #528] +str q13, [x0, #480] +mla v10.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v11.4S, v21.s[0] +sub v13.4s, v2.4s, v1.4s +ldr q22, [x0, #512] +str q13, [x0, #464] +mul v4.4S, v4.4S,v26.s[0] +mul v5.4S, v5.4S,v26.s[0] +add v2.4s, v2.4s, v1.4s +ldr q1, [x0, #592] +str q2, [x0, #448] +mla v4.4S, v25.4S, v31.s[0] +mla v5.4S, v6.4S, v31.s[0] +sub v6.4s, v18.4s, v23.4s +ldr q25, [x0, #576] +add v18.4s, v18.4s, v23.4s +mul v14.4S, v14.4S,v19.s[0] +mul v11.4S, v11.4S,v19.s[0] +sub v23.4s, v22.4s, v24.4s +ldr q2, [x0, #656] +add v22.4s, v22.4s, v24.4s +mla 
v14.4S, v29.4S, v31.s[0] +mla v11.4S, v7.4S, v31.s[0] +sub v7.4s, v1.4s, v17.4s +ldr q29, [x0, #640] +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v6.4S, v30.s[2] +mul v6.4S, v6.4S,v9.s[2] +sub v24.4s, v25.4s, v10.4s +ldr q13, [x0, #720] +add v25.4s, v25.4s, v10.4s +sqrdmulh v10.4S, v18.4S, v30.s[1] +mul v18.4S, v18.4S,v9.s[1] +sub v16.4s, v2.4s, v4.4s +ldr q27, [x0, #704] +add v2.4s, v2.4s, v4.4s +sqrdmulh v30.4S, v7.4S, v20.s[2] +mul v7.4S, v7.4S,v8.s[2] +sub v4.4s, v29.4s, v5.4s +ldr q9, [x0, #816] +add v29.4s, v29.4s, v5.4s +sqrdmulh v5.4S, v1.4S, v20.s[1] +mul v1.4S, v1.4S,v8.s[1] +sub v3.4s, v13.4s, v14.4s +ldr q15, [x0, #800] +add v13.4s, v13.4s, v14.4s +mla v6.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v16.4S, v28.s[2] +sub v20.4s, v27.4s, v11.4s +ldr q14, [x0, #880] +add v27.4s, v27.4s, v11.4s +mla v18.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v2.4S, v28.s[1] +sub v11.4s, v23.4s, v6.4s +ldr q8, [x0, #864] +str q11, [x0, #560] +mla v7.4S, v30.4S, v31.s[0] +sqrdmulh v30.4S, v3.4S, v21.s[2] +add v23.4s, v23.4s, v6.4s +ldr q6, [x17, #+512] +str q23, [x0, #544] +mla v1.4S, v5.4S, v31.s[0] +sqrdmulh v5.4S, v13.4S, v21.s[1] +sub v23.4s, v22.4s, v18.4s +ldr q11, [x17, #+528] +str q23, [x0, #528] +mul v16.4S, v16.4S,v26.s[2] +mul v2.4S, v2.4S,v26.s[1] +add v22.4s, v22.4s, v18.4s +ldr q18, [x17, #+544] +str q22, [x0, #512] +mla v16.4S, v17.4S, v31.s[0] +mla v2.4S, v10.4S, v31.s[0] +sub v10.4s, v24.4s, v7.4s +ldr q17, [x17, #+560] +str q10, [x0, #624] +mul v3.4S, v3.4S,v19.s[2] +mul v13.4S, v13.4S,v19.s[1] +add v24.4s, v24.4s, v7.4s +ldr q7, [x0, #944] +str q24, [x0, #608] +mla v3.4S, v30.4S, v31.s[0] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v25.4s, v1.4s +ldr q30, [x0, #928] +str q5, [x0, #592] +sqrdmulh v21.4S, v9.4S, v11.s[0] +mul v9.4S, v9.4S,v6.s[0] +add v25.4s, v25.4s, v1.4s +ldr q1, [x0, #1008] +str q25, [x0, #576] +sqrdmulh v25.4S, v15.4S, v11.s[0] +mul v15.4S, v15.4S,v6.s[0] +sub v5.4s, v4.4s, v16.4s +ldr q19, [x0, #992] +str q5, [x0, #688] +sqrdmulh v5.4S, v14.4S, 
v17.s[0] +mul v14.4S, v14.4S,v18.s[0] +add v4.4s, v4.4s, v16.4s +ldr q16, [x17, #+576] +str q4, [x0, #672] +sqrdmulh v4.4S, v8.4S, v17.s[0] +mul v8.4S, v8.4S,v18.s[0] +sub v24.4s, v29.4s, v2.4s +ldr q28, [x17, #+592] +str q24, [x0, #656] +mla v9.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v7.4S, v28.s[0] +add v29.4s, v29.4s, v2.4s +ldr q2, [x17, #+608] +str q29, [x0, #640] +mla v15.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v30.4S, v28.s[0] +sub v29.4s, v20.4s, v3.4s +ldr q24, [x17, #+624] +str q29, [x0, #752] +mla v14.4S, v5.4S, v31.s[0] +sqrdmulh v5.4S, v1.4S, v24.s[0] +add v20.4s, v20.4s, v3.4s +ldr q3, [x0, #784] +str q20, [x0, #736] +mla v8.4S, v4.4S, v31.s[0] +sqrdmulh v4.4S, v19.4S, v24.s[0] +sub v20.4s, v27.4s, v13.4s +ldr q29, [x0, #768] +str q20, [x0, #720] +mul v7.4S, v7.4S,v16.s[0] +mul v30.4S, v30.4S,v16.s[0] +add v27.4s, v27.4s, v13.4s +ldr q13, [x0, #848] +str q27, [x0, #704] +mla v7.4S, v21.4S, v31.s[0] +mla v30.4S, v25.4S, v31.s[0] +sub v25.4s, v3.4s, v9.4s +ldr q21, [x0, #832] +add v3.4s, v3.4s, v9.4s +mul v1.4S, v1.4S,v2.s[0] +mul v19.4S, v19.4S,v2.s[0] +sub v9.4s, v29.4s, v15.4s +ldr q27, [x0, #912] +add v29.4s, v29.4s, v15.4s +mla v1.4S, v5.4S, v31.s[0] +mla v19.4S, v4.4S, v31.s[0] +sub v4.4s, v13.4s, v14.4s +ldr q5, [x0, #896] +add v13.4s, v13.4s, v14.4s +sqrdmulh v14.4S, v25.4S, v11.s[2] +mul v25.4S, v25.4S,v6.s[2] +sub v15.4s, v21.4s, v8.4s +ldr q20, [x0, #976] +add v21.4s, v21.4s, v8.4s +sqrdmulh v8.4S, v3.4S, v11.s[1] +mul v3.4S, v3.4S,v6.s[1] +sub v10.4s, v27.4s, v7.4s +ldr q26, [x0, #960] +add v27.4s, v27.4s, v7.4s +sqrdmulh v11.4S, v4.4S, v17.s[2] +mul v4.4S, v4.4S,v18.s[2] +sub v7.4s, v5.4s, v30.4s +add v5.4s, v5.4s, v30.4s +sqrdmulh v30.4S, v13.4S, v17.s[1] +mul v13.4S, v13.4S,v18.s[1] +sub v6.4s, v20.4s, v1.4s +add v20.4s, v20.4s, v1.4s +mla v25.4S, v14.4S, v31.s[0] +sqrdmulh v14.4S, v10.4S, v28.s[2] +sub v17.4s, v26.4s, v19.4s +add v26.4s, v26.4s, v19.4s +mla v3.4S, v8.4S, v31.s[0] +sqrdmulh v8.4S, v27.4S, v28.s[1] +sub v19.4s, v9.4s, v25.4s 
+str q19, [x0, #816] +mla v4.4S, v11.4S, v31.s[0] +sqrdmulh v11.4S, v6.4S, v24.s[2] +add v9.4s, v9.4s, v25.4s +str q9, [x0, #800] +mla v13.4S, v30.4S, v31.s[0] +sqrdmulh v30.4S, v20.4S, v24.s[1] +sub v9.4s, v29.4s, v3.4s +str q9, [x0, #784] +mul v10.4S, v10.4S,v16.s[2] +mul v27.4S, v27.4S,v16.s[1] +add v29.4s, v29.4s, v3.4s +str q29, [x0, #768] +mla v10.4S, v14.4S, v31.s[0] +mla v27.4S, v8.4S, v31.s[0] +sub v8.4s, v15.4s, v4.4s +str q8, [x0, #880] +mul v6.4S, v6.4S,v2.s[2] +mul v20.4S, v20.4S,v2.s[1] +add v15.4s, v15.4s, v4.4s +str q15, [x0, #864] +mla v6.4S, v11.4S, v31.s[0] +mla v20.4S, v30.4S, v31.s[0] +sub v30.4s, v21.4s, v13.4s +str q30, [x0, #848] +add v21.4s, v21.4s, v13.4s +str q21, [x0, #832] +sub v21.4s, v7.4s, v10.4s +str q21, [x0, #944] +add v7.4s, v7.4s, v10.4s +str q7, [x0, #928] +sub v7.4s, v5.4s, v27.4s +str q7, [x0, #912] +add v5.4s, v5.4s, v27.4s +str q5, [x0, #896] +sub v5.4s, v17.4s, v6.4s +str q5, [x0, #1008] +add v17.4s, v17.4s, v6.4s +str q17, [x0, #992] +sub v17.4s, v26.4s, v20.4s +str q17, [x0, #976] +add v26.4s, v26.4s, v20.4s +str q26, [x0, #960] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1520 +// Instruction count: 1516 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_13.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_13.s new file mode 100644 index 0000000..e12e06a --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_13.s @@ -0,0 +1,1550 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any 
person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// +#include +modulus: // Lane 0 holds the negated modulus -33556993; the three zero words fill out one 16-byte vector. +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: // Constants are laid out in groups of four 32-bit words. NOTE(review): each group of root values appears to be followed by a group of matching precomputed multipliers, approximately root * 2^31 / 33556993 - confirm against the generator. Zero words tagged "Layer None" fill the unused fourth lane of a group. +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global 
ntt_u32_incomplete_neon_asm_var_4_2_22_z4_13 +.global _ntt_u32_incomplete_neon_asm_var_4_2_22_z4_13 +ntt_u32_incomplete_neon_asm_var_4_2_22_z4_13: +_ntt_u32_incomplete_neon_asm_var_4_2_22_z4_13: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x0, #992] +sqrdmulh v27.4S, v28.4S, v29.s[0] +mul v28.4S, v28.4S,v30.s[0] +ldr q26, [x0, #928] +sqrdmulh v25.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +ldr q24, [x0, #864] +sqrdmulh v23.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +ldr q22, [x0, #800] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #736] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mla v28.4S, v27.4S, v31.s[0] +ldr q27, [x0, #672] +sqrdmulh v18.4S, v27.4S, v29.s[0] +mla v26.4S, v25.4S, v31.s[0] +ldr q25, [x0, #608] +sqrdmulh v17.4S, v25.4S, v29.s[0] +mla v24.4S, v23.4S, v31.s[0] +ldr q23, [x0, #544] +sqrdmulh v16.4S, v23.4S, v29.s[0] +mla v22.4S, v21.4S, v31.s[0] +ldr q21, [x0, #480] +mul v27.4S, v27.4S,v30.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q3, [x0, #416] +ldr q2, [x0, #352] +ldr q1, [x0, #288] +mla v27.4S, v18.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +ldr q19, [x0, #224] +ldr q18, [x0, #160] +mul v23.4S, v23.4S,v30.s[0] +mul v25.4S, v25.4S,v30.s[0] +ldr q0, [x0, #96] +ldr q15, [x0, #32] +mla v23.4S, v16.4S, v31.s[0] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +sqrdmulh v28.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v16.4s, v3.4s, v26.4s +add v3.4s, v3.4s, v26.4s +sqrdmulh v26.4S, v16.4S, v29.s[2] +mul v16.4S, 
v16.4S,v30.s[2] +sub v14.4s, v2.4s, v24.4s +add v2.4s, v2.4s, v24.4s +sqrdmulh v24.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v13.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v12.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +sqrdmulh v20.4S, v14.4S, v29.s[2] +mla v17.4S, v28.4S, v31.s[0] +sub v28.4s, v18.4s, v27.4s +add v18.4s, v18.4s, v27.4s +sqrdmulh v27.4S, v13.4S, v29.s[2] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v0.4s, v25.4s +add v0.4s, v0.4s, v25.4s +sqrdmulh v25.4S, v2.4S, v29.s[1] +mla v21.4S, v24.4S, v31.s[0] +sub v24.4s, v15.4s, v23.4s +sqrdmulh v11.4S, v1.4S, v29.s[1] +mla v3.4S, v22.4S, v31.s[0] +add v15.4s, v15.4s, v23.4s +ldr q23, [x17, #+32] +ldr q22, [x17, #+48] +mul v13.4S, v13.4S,v30.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v10.4s, v12.4s, v17.4s +add v12.4s, v12.4s, v17.4s +mla v13.4S, v27.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +sub v20.4s, v28.4s, v16.4s +add v28.4s, v28.4s, v16.4s +mul v1.4S, v1.4S,v30.s[1] +mul v2.4S, v2.4S,v30.s[1] +sub v16.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +mla v1.4S, v11.4S, v31.s[0] +mla v2.4S, v25.4S, v31.s[0] +sub v25.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v22.s[3] +mul v10.4S, v10.4S,v23.s[3] +sub v11.4s, v26.4s, v14.4s +add v26.4s, v26.4s, v14.4s +sqrdmulh v14.4S, v12.4S, v22.s[2] +mul v12.4S, v12.4S,v23.s[2] +sub v21.4s, v24.4s, v13.4s +add v24.4s, v24.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v22.s[1] +mul v16.4S, v16.4S,v23.s[1] +sub v27.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v17.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s +ldr q1, [x17, #+96] +ldr q9, [x17, #+112] +sqrdmulh v8.4S, v20.4S, v22.s[3] +mla v10.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v28.4S, v22.s[2] +mla v12.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v25.4S, v22.s[1] +mla v16.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v18.4S, v22.s[0] +mla 
v19.4S, v2.4S, v31.s[0] +nop +nop +ldr q2, [x17, #+64] +ldr q7, [x17, #+80] +mul v28.4S, v28.4S,v23.s[2] +mul v20.4S, v20.4S,v23.s[3] +sub v6.4s, v11.4s, v10.4s +add v11.4s, v11.4s, v10.4s +mla v28.4S, v3.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v26.4s, v12.4s +add v26.4s, v26.4s, v12.4s +mul v18.4S, v18.4S,v23.s[0] +mul v25.4S, v25.4S,v23.s[1] +sub v12.4s, v27.4s, v16.4s +add v27.4s, v27.4s, v16.4s +mla v18.4S, v13.4S, v31.s[0] +mla v25.4S, v14.4S, v31.s[0] +sub v14.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v9.s[3] +mul v6.4S, v6.4S,v1.s[3] +sub v13.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v11.4S, v9.s[2] +mul v11.4S, v11.4S,v1.s[2] +sub v16.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +sqrdmulh v28.4S, v8.4S, v9.s[1] +mul v8.4S, v8.4S,v1.s[1] +sub v3.4s, v17.4s, v25.4s +add v17.4s, v17.4s, v25.4s +sqrdmulh v25.4S, v26.4S, v9.s[0] +mul v26.4S, v26.4S,v1.s[0] +sub v10.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v7.s[3] +mla v6.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v27.4S, v7.s[2] +mla v11.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v14.4S, v7.s[1] +mla v8.4S, v28.4S, v31.s[0] +nop +nop +sqrdmulh v28.4S, v0.4S, v7.s[0] +mla v26.4S, v25.4S, v31.s[0] +nop +nop +mul v27.4S, v27.4S,v2.s[2] +mul v12.4S, v12.4S,v2.s[3] +sub v25.4s, v13.4s, v6.4s +str q25, [x0, #992] +mla v27.4S, v19.4S, v31.s[0] +mla v12.4S, v18.4S, v31.s[0] +add v13.4s, v13.4s, v6.4s +str q13, [x0, #928] +mul v0.4S, v0.4S,v2.s[0] +mul v14.4S, v14.4S,v2.s[1] +sub v13.4s, v21.4s, v11.4s +str q13, [x0, #864] +mla v0.4S, v28.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sub v11.4s, v16.4s, v8.4s +ldr q20, [x0, #1008] +sqrdmulh v28.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v8.4s +str q21, [x0, #800] +ldr q21, [x0, #944] +sqrdmulh v8.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v13.4s, v24.4s, v26.4s +str q11, [x0, #736] +ldr q11, [x0, 
#880] +sqrdmulh v6.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +add v24.4s, v24.4s, v26.4s +str q16, [x0, #672] +ldr q16, [x0, #816] +sqrdmulh v26.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v18.4s, v3.4s, v12.4s +str q13, [x0, #608] +ldr q13, [x0, #752] +sqrdmulh v19.4S, v13.4S, v29.s[0] +mla v20.4S, v28.4S, v31.s[0] +add v3.4s, v3.4s, v12.4s +str q24, [x0, #544] +ldr q24, [x0, #688] +sqrdmulh v12.4S, v24.4S, v29.s[0] +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v17.4s, v27.4s +str q18, [x0, #480] +ldr q18, [x0, #624] +sqrdmulh v28.4S, v18.4S, v29.s[0] +mla v11.4S, v6.4S, v31.s[0] +add v17.4s, v17.4s, v27.4s +str q3, [x0, #416] +ldr q3, [x0, #560] +sqrdmulh v27.4S, v3.4S, v29.s[0] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v10.4s, v14.4s +str q8, [x0, #352] +ldr q8, [x0, #496] +add v10.4s, v10.4s, v14.4s +mul v24.4S, v24.4S,v30.s[0] +mul v13.4S, v13.4S,v30.s[0] +ldr q14, [x0, #432] +str q17, [x0, #288] +ldr q17, [x0, #368] +ldr q6, [x0, #304] +mla v24.4S, v12.4S, v31.s[0] +mla v13.4S, v19.4S, v31.s[0] +str q26, [x0, #224] +sub v26.4s, v15.4s, v0.4s +ldr q19, [x0, #240] +ldr q12, [x0, #176] +mul v3.4S, v3.4S,v30.s[0] +mul v18.4S, v18.4S,v30.s[0] +str q10, [x0, #160] +add v15.4s, v15.4s, v0.4s +ldr q0, [x0, #112] +ldr q10, [x0, #48] +mla v3.4S, v27.4S, v31.s[0] +mla v18.4S, v28.4S, v31.s[0] +sub v28.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +sqrdmulh v20.4S, v28.4S, v29.s[2] +mul v28.4S, v28.4S,v30.s[2] +sub v27.4s, v14.4s, v21.4s +add v14.4s, v14.4s, v21.4s +sqrdmulh v21.4S, v27.4S, v29.s[2] +mul v27.4S, v27.4S,v30.s[2] +sub v25.4s, v17.4s, v11.4s +add v17.4s, v17.4s, v11.4s +sqrdmulh v11.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v5.4s, v6.4s, v16.4s +add v6.4s, v6.4s, v16.4s +sqrdmulh v16.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +sub v4.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +sqrdmulh v13.4S, v25.4S, v29.s[2] +mla v28.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v24.4s +add v12.4s, v12.4s, v24.4s +sqrdmulh v24.4S, v5.4S, 
v29.s[2] +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v18.4s +add v0.4s, v0.4s, v18.4s +sqrdmulh v18.4S, v17.4S, v29.s[1] +mla v8.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v3.4s +str q26, [x0, #96] +sqrdmulh v26.4S, v6.4S, v29.s[1] +mla v14.4S, v16.4S, v31.s[0] +add v10.4s, v10.4s, v3.4s +str q15, [x0, #32] +mul v5.4S, v5.4S,v30.s[2] +mul v25.4S, v25.4S,v30.s[2] +sub v15.4s, v4.4s, v28.4s +add v4.4s, v4.4s, v28.4s +mla v5.4S, v24.4S, v31.s[0] +mla v25.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v27.4s +add v20.4s, v20.4s, v27.4s +mul v6.4S, v6.4S,v30.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v27.4s, v19.4s, v8.4s +add v19.4s, v19.4s, v8.4s +mla v6.4S, v26.4S, v31.s[0] +mla v17.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v14.4s +add v12.4s, v12.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v22.s[3] +mul v15.4S, v15.4S,v23.s[3] +sub v26.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v4.4S, v22.s[2] +mul v4.4S, v4.4S,v23.s[2] +sub v8.4s, v11.4s, v5.4s +add v11.4s, v11.4s, v5.4s +sqrdmulh v5.4S, v27.4S, v22.s[1] +mul v27.4S, v27.4S,v23.s[1] +sub v24.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v28.4s, v10.4s, v6.4s +add v10.4s, v10.4s, v6.4s +sqrdmulh v6.4S, v13.4S, v22.s[3] +mla v15.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v20.4S, v22.s[2] +mla v4.4S, v25.4S, v31.s[0] +nop +nop +sqrdmulh v25.4S, v18.4S, v22.s[1] +mla v27.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v12.4S, v22.s[0] +mla v19.4S, v17.4S, v31.s[0] +nop +nop +mul v20.4S, v20.4S,v23.s[2] +mul v13.4S, v13.4S,v23.s[3] +sub v17.4s, v26.4s, v15.4s +add v26.4s, v26.4s, v15.4s +mla v20.4S, v14.4S, v31.s[0] +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v21.4s, v4.4s +add v21.4s, v21.4s, v4.4s +mul v12.4S, v12.4S,v23.s[0] +mul v18.4S, v18.4S,v23.s[1] +sub v4.4s, v24.4s, v27.4s +add v24.4s, v24.4s, v27.4s +mla v12.4S, v5.4S, v31.s[0] +mla v18.4S, v25.4S, v31.s[0] +sub v25.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, 
v17.4S, v9.s[3] +mul v17.4S, v17.4S,v1.s[3] +sub v5.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +sqrdmulh v13.4S, v26.4S, v9.s[2] +mul v26.4S, v26.4S,v1.s[2] +sub v27.4s, v11.4s, v20.4s +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v6.4S, v9.s[1] +mul v6.4S, v6.4S,v1.s[1] +sub v14.4s, v28.4s, v18.4s +add v28.4s, v28.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v9.s[0] +mul v21.4S, v21.4S,v1.s[0] +sub v15.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v4.4S, v7.s[3] +mla v17.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v24.4S, v7.s[2] +mla v26.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v25.4S, v7.s[1] +mla v6.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v0.4S, v7.s[0] +mla v21.4S, v18.4S, v31.s[0] +nop +nop +mul v24.4S, v24.4S,v2.s[2] +mul v4.4S, v4.4S,v2.s[3] +sub v18.4s, v5.4s, v17.4s +str q18, [x0, #1008] +mla v24.4S, v19.4S, v31.s[0] +mla v4.4S, v12.4S, v31.s[0] +add v5.4s, v5.4s, v17.4s +str q5, [x0, #944] +mul v0.4S, v0.4S,v2.s[0] +mul v25.4S, v25.4S,v2.s[1] +sub v5.4s, v8.4s, v26.4s +str q5, [x0, #880] +mla v0.4S, v20.4S, v31.s[0] +mla v25.4S, v13.4S, v31.s[0] +add v8.4s, v8.4s, v26.4s +sub v26.4s, v27.4s, v6.4s +ldr q13, [x0, #960] +sqrdmulh v20.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +add v27.4s, v27.4s, v6.4s +str q8, [x0, #816] +ldr q8, [x0, #896] +sqrdmulh v6.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v5.4s, v11.4s, v21.4s +str q26, [x0, #752] +ldr q26, [x0, #832] +sqrdmulh v17.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +add v11.4s, v11.4s, v21.4s +str q27, [x0, #688] +ldr q27, [x0, #768] +sqrdmulh v21.4S, v27.4S, v29.s[0] +mul v27.4S, v27.4S,v30.s[0] +sub v12.4s, v14.4s, v4.4s +str q5, [x0, #624] +ldr q5, [x0, #704] +sqrdmulh v19.4S, v5.4S, v29.s[0] +mla v13.4S, v20.4S, v31.s[0] +add v14.4s, v14.4s, v4.4s +str q11, [x0, #560] +ldr q11, [x0, #640] +sqrdmulh v4.4S, v11.4S, v29.s[0] +mla v8.4S, v6.4S, v31.s[0] +sub v6.4s, v28.4s, v24.4s +str q12, [x0, #496] +ldr q12, [x0, #576] +sqrdmulh v20.4S, v12.4S, 
v29.s[0] +mla v26.4S, v17.4S, v31.s[0] +add v28.4s, v28.4s, v24.4s +str q14, [x0, #432] +ldr q14, [x0, #512] +sqrdmulh v24.4S, v14.4S, v29.s[0] +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v25.4s +str q6, [x0, #368] +ldr q6, [x0, #448] +add v15.4s, v15.4s, v25.4s +mul v11.4S, v11.4S,v30.s[0] +mul v5.4S, v5.4S,v30.s[0] +ldr q25, [x0, #384] +str q28, [x0, #304] +ldr q28, [x0, #320] +ldr q17, [x0, #256] +mla v11.4S, v4.4S, v31.s[0] +mla v5.4S, v19.4S, v31.s[0] +str q21, [x0, #240] +sub v21.4s, v10.4s, v0.4s +ldr q19, [x0, #192] +ldr q4, [x0, #128] +mul v14.4S, v14.4S,v30.s[0] +mul v12.4S, v12.4S,v30.s[0] +str q15, [x0, #176] +add v10.4s, v10.4s, v0.4s +ldr q0, [x0, #64] +ldr q15, [x0, #0] +mla v14.4S, v24.4S, v31.s[0] +mla v12.4S, v20.4S, v31.s[0] +sub v20.4s, v6.4s, v13.4s +add v6.4s, v6.4s, v13.4s +sqrdmulh v13.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v24.4s, v25.4s, v8.4s +add v25.4s, v25.4s, v8.4s +sqrdmulh v8.4S, v24.4S, v29.s[2] +mul v24.4S, v24.4S,v30.s[2] +sub v18.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sqrdmulh v26.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +sub v3.4s, v17.4s, v27.4s +add v17.4s, v17.4s, v27.4s +sqrdmulh v27.4S, v25.4S, v29.s[1] +mul v25.4S, v25.4S,v30.s[1] +sub v16.4s, v19.4s, v5.4s +add v19.4s, v19.4s, v5.4s +sqrdmulh v5.4S, v18.4S, v29.s[2] +mla v20.4S, v13.4S, v31.s[0] +sub v13.4s, v4.4s, v11.4s +add v4.4s, v4.4s, v11.4s +sqrdmulh v11.4S, v3.4S, v29.s[2] +mla v24.4S, v8.4S, v31.s[0] +sub v8.4s, v0.4s, v12.4s +add v0.4s, v0.4s, v12.4s +sqrdmulh v12.4S, v28.4S, v29.s[1] +mla v6.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v14.4s +str q21, [x0, #112] +sqrdmulh v21.4S, v17.4S, v29.s[1] +mla v25.4S, v27.4S, v31.s[0] +add v15.4s, v15.4s, v14.4s +str q10, [x0, #48] +mul v3.4S, v3.4S,v30.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v10.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +mla v3.4S, v11.4S, v31.s[0] +mla v18.4S, v5.4S, v31.s[0] +sub v5.4s, v13.4s, v24.4s +add v13.4s, v13.4s, v24.4s +mul v17.4S, v17.4S,v30.s[1] 
+mul v28.4S, v28.4S,v30.s[1] +sub v24.4s, v19.4s, v6.4s +add v19.4s, v19.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v28.4S, v12.4S, v31.s[0] +sub v12.4s, v4.4s, v25.4s +add v4.4s, v4.4s, v25.4s +sqrdmulh v25.4S, v10.4S, v22.s[3] +mul v10.4S, v10.4S,v23.s[3] +sub v21.4s, v8.4s, v18.4s +add v8.4s, v8.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v22.s[2] +mul v16.4S, v16.4S,v23.s[2] +sub v6.4s, v26.4s, v3.4s +add v26.4s, v26.4s, v3.4s +sqrdmulh v3.4S, v24.4S, v22.s[1] +mul v24.4S, v24.4S,v23.s[1] +sub v11.4s, v0.4s, v28.4s +add v0.4s, v0.4s, v28.4s +sqrdmulh v28.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v20.4s, v15.4s, v17.4s +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v22.s[3] +mla v10.4S, v25.4S, v31.s[0] +nop +nop +sqrdmulh v25.4S, v13.4S, v22.s[2] +mla v16.4S, v18.4S, v31.s[0] +nop +nop +sqrdmulh v18.4S, v12.4S, v22.s[1] +mla v24.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v4.4S, v22.s[0] +mla v19.4S, v28.4S, v31.s[0] +nop +nop +mul v13.4S, v13.4S,v23.s[2] +mul v5.4S, v5.4S,v23.s[3] +sub v28.4s, v21.4s, v10.4s +add v21.4s, v21.4s, v10.4s +mla v13.4S, v25.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +sub v17.4s, v8.4s, v16.4s +add v8.4s, v8.4s, v16.4s +mul v4.4S, v4.4S,v23.s[0] +mul v12.4S, v12.4S,v23.s[1] +sub v16.4s, v11.4s, v24.4s +add v11.4s, v11.4s, v24.4s +mla v4.4S, v3.4S, v31.s[0] +mla v12.4S, v18.4S, v31.s[0] +sub v18.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v28.4S, v9.s[3] +mul v28.4S, v28.4S,v1.s[3] +sub v3.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v21.4S, v9.s[2] +mul v21.4S, v21.4S,v1.s[2] +sub v24.4s, v26.4s, v13.4s +add v26.4s, v26.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v9.s[1] +mul v17.4S, v17.4S,v1.s[1] +sub v25.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +sqrdmulh v12.4S, v8.4S, v9.s[0] +mul v8.4S, v8.4S,v1.s[0] +sub v10.4s, v15.4s, v4.4s +add v15.4s, v15.4s, v4.4s +sqrdmulh v4.4S, v16.4S, v7.s[3] +mla v28.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v11.4S, v7.s[2] +mla v21.4S, v5.4S, 
v31.s[0] +nop +nop +sqrdmulh v5.4S, v18.4S, v7.s[1] +mla v17.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v0.4S, v7.s[0] +mla v8.4S, v12.4S, v31.s[0] +nop +nop +mul v11.4S, v11.4S,v2.s[2] +mul v16.4S, v16.4S,v2.s[3] +sub v12.4s, v3.4s, v28.4s +str q12, [x0, #960] +mla v11.4S, v19.4S, v31.s[0] +mla v16.4S, v4.4S, v31.s[0] +add v3.4s, v3.4s, v28.4s +str q3, [x0, #896] +mul v0.4S, v0.4S,v2.s[0] +mul v18.4S, v18.4S,v2.s[1] +sub v3.4s, v6.4s, v21.4s +str q3, [x0, #832] +mla v0.4S, v13.4S, v31.s[0] +mla v18.4S, v5.4S, v31.s[0] +add v6.4s, v6.4s, v21.4s +sub v21.4s, v24.4s, v17.4s +ldr q5, [x0, #976] +sqrdmulh v13.4S, v5.4S, v29.s[0] +mul v5.4S, v5.4S,v30.s[0] +add v24.4s, v24.4s, v17.4s +str q6, [x0, #768] +ldr q6, [x0, #912] +sqrdmulh v17.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v3.4s, v26.4s, v8.4s +str q21, [x0, #704] +ldr q21, [x0, #848] +sqrdmulh v28.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +add v26.4s, v26.4s, v8.4s +str q24, [x0, #640] +ldr q24, [x0, #784] +sqrdmulh v8.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +sub v4.4s, v25.4s, v16.4s +str q3, [x0, #576] +ldr q3, [x0, #720] +sqrdmulh v19.4S, v3.4S, v29.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v25.4s, v25.4s, v16.4s +str q26, [x0, #512] +ldr q26, [x0, #656] +sqrdmulh v16.4S, v26.4S, v29.s[0] +mla v6.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v11.4s +str q4, [x0, #448] +ldr q4, [x0, #592] +sqrdmulh v13.4S, v4.4S, v29.s[0] +mla v21.4S, v28.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q25, [x0, #384] +ldr q25, [x0, #528] +sqrdmulh v11.4S, v25.4S, v29.s[0] +mla v24.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v18.4s +str q17, [x0, #320] +ldr q17, [x0, #464] +add v10.4s, v10.4s, v18.4s +mul v26.4S, v26.4S,v30.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q18, [x0, #400] +str q20, [x0, #256] +ldr q20, [x0, #336] +ldr q28, [x0, #272] +mla v26.4S, v16.4S, v31.s[0] +mla v3.4S, v19.4S, v31.s[0] +str q8, [x0, #192] +sub v8.4s, v15.4s, v0.4s +ldr q19, [x0, #208] +ldr q16, [x0, #144] +mul v25.4S, 
v25.4S,v30.s[0] +mul v4.4S, v4.4S,v30.s[0] +str q10, [x0, #128] +add v15.4s, v15.4s, v0.4s +ldr q0, [x0, #80] +ldr q10, [x0, #16] +mla v25.4S, v11.4S, v31.s[0] +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v5.4s +add v17.4s, v17.4s, v5.4s +sqrdmulh v5.4S, v13.4S, v29.s[2] +mul v13.4S, v13.4S,v30.s[2] +sub v11.4s, v18.4s, v6.4s +add v18.4s, v18.4s, v6.4s +sqrdmulh v6.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v12.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v28.4s, v24.4s +add v28.4s, v28.4s, v24.4s +sqrdmulh v24.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v27.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sqrdmulh v3.4S, v12.4S, v29.s[2] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v16.4s, v26.4s +add v16.4s, v16.4s, v26.4s +sqrdmulh v26.4S, v14.4S, v29.s[2] +mla v11.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v4.4s +add v0.4s, v0.4s, v4.4s +sqrdmulh v4.4S, v20.4S, v29.s[1] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v25.4s +str q8, [x0, #64] +sqrdmulh v8.4S, v28.4S, v29.s[1] +mla v18.4S, v24.4S, v31.s[0] +add v10.4s, v10.4s, v25.4s +str q15, [x0, #0] +mul v14.4S, v14.4S,v30.s[2] +mul v12.4S, v12.4S,v30.s[2] +sub v15.4s, v27.4s, v13.4s +add v27.4s, v27.4s, v13.4s +mla v14.4S, v26.4S, v31.s[0] +mla v12.4S, v3.4S, v31.s[0] +sub v3.4s, v5.4s, v11.4s +add v5.4s, v5.4s, v11.4s +mul v28.4S, v28.4S,v30.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v11.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +mla v28.4S, v8.4S, v31.s[0] +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v16.4s, v18.4s +add v16.4s, v16.4s, v18.4s +sqrdmulh v29.4S, v15.4S, v22.s[3] +mul v15.4S, v15.4S,v23.s[3] +sub v30.4s, v6.4s, v12.4s +add v6.4s, v6.4s, v12.4s +sqrdmulh v12.4S, v27.4S, v22.s[2] +mul v27.4S, v27.4S,v23.s[2] +sub v18.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v22.s[1] +mul v11.4S, v11.4S,v23.s[1] +sub v8.4s, v0.4s, v20.4s +add v0.4s, v0.4s, v20.4s +sqrdmulh v20.4S, 
v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v17.4s, v10.4s, v28.4s +add v10.4s, v10.4s, v28.4s +sqrdmulh v28.4S, v3.4S, v22.s[3] +mla v15.4S, v29.4S, v31.s[0] +nop +nop +sqrdmulh v29.4S, v5.4S, v22.s[2] +mla v27.4S, v12.4S, v31.s[0] +nop +nop +sqrdmulh v12.4S, v4.4S, v22.s[1] +mla v11.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v16.4S, v22.s[0] +mla v19.4S, v20.4S, v31.s[0] +nop +nop +mul v5.4S, v5.4S,v23.s[2] +mul v3.4S, v3.4S,v23.s[3] +sub v20.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +mla v5.4S, v29.4S, v31.s[0] +mla v3.4S, v28.4S, v31.s[0] +sub v28.4s, v6.4s, v27.4s +add v6.4s, v6.4s, v27.4s +mul v16.4S, v16.4S,v23.s[0] +mul v4.4S, v4.4S,v23.s[1] +sub v27.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v16.4S, v14.4S, v31.s[0] +mla v4.4S, v12.4S, v31.s[0] +sub v12.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v22.4S, v20.4S, v9.s[3] +mul v20.4S, v20.4S,v1.s[3] +sub v23.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v30.4S, v9.s[2] +mul v30.4S, v30.4S,v1.s[2] +sub v19.4s, v21.4s, v5.4s +add v21.4s, v21.4s, v5.4s +sqrdmulh v5.4S, v28.4S, v9.s[1] +mul v28.4S, v28.4S,v1.s[1] +sub v14.4s, v17.4s, v4.4s +add v17.4s, v17.4s, v4.4s +sqrdmulh v4.4S, v6.4S, v9.s[0] +mul v6.4S, v6.4S,v1.s[0] +sub v11.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sqrdmulh v9.4S, v27.4S, v7.s[3] +mla v20.4S, v22.4S, v31.s[0] +nop +nop +sqrdmulh v22.4S, v8.4S, v7.s[2] +mla v30.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v12.4S, v7.s[1] +mla v28.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v0.4S, v7.s[0] +mla v6.4S, v4.4S, v31.s[0] +nop +nop +mul v8.4S, v8.4S,v2.s[2] +mul v27.4S, v27.4S,v2.s[3] +sub v4.4s, v23.4s, v20.4s +str q4, [x0, #976] +mla v8.4S, v22.4S, v31.s[0] +mla v27.4S, v9.4S, v31.s[0] +add v23.4s, v23.4s, v20.4s +str q23, [x0, #912] +mul v0.4S, v0.4S,v2.s[0] +mul v12.4S, v12.4S,v2.s[1] +sub v23.4s, v18.4s, v30.4s +str q23, [x0, #848] +mla v0.4S, v5.4S, v31.s[0] +mla v12.4S, v3.4S, v31.s[0] +add v18.4s, v18.4s, v30.4s +sub v30.4s, 
v19.4s, v28.4s +add v19.4s, v19.4s, v28.4s +str q18, [x0, #784] +sub v18.4s, v21.4s, v6.4s +str q30, [x0, #720] +add v21.4s, v21.4s, v6.4s +str q19, [x0, #656] +sub v19.4s, v14.4s, v27.4s +str q18, [x0, #592] +add v14.4s, v14.4s, v27.4s +str q21, [x0, #528] +sub v21.4s, v17.4s, v8.4s +str q19, [x0, #464] +add v17.4s, v17.4s, v8.4s +str q14, [x0, #400] +sub v14.4s, v11.4s, v12.4s +str q21, [x0, #336] +add v11.4s, v11.4s, v12.4s +str q17, [x0, #272] +sub v17.4s, v10.4s, v0.4s +add v10.4s, v10.4s, v0.4s +ldr q24, [x0, #32] +ldr q25, [x0, #48] +ldr q13, [x0, #96] +ldr q26, [x0, #112] +ldr q15, [x17, #+128] +ldr q29, [x17, #+144] +ldr q16, [x17, #+160] +ldr q1, [x17, #+176] +ldr q4, [x0, #160] +ldr q22, [x0, #176] +sqrdmulh v9.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v15.s[0] +ldr q20, [x0, #224] +sqrdmulh v23.4S, v25.4S, v29.s[0] +mul v25.4S, v25.4S,v15.s[0] +ldr q5, [x0, #240] +sqrdmulh v3.4S, v13.4S, v1.s[0] +mul v13.4S, v13.4S,v16.s[0] +ldr q2, [x17, #+192] +sqrdmulh v7.4S, v26.4S, v1.s[0] +mul v26.4S, v26.4S,v16.s[0] +ldr q28, [x17, #+208] +mla v24.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v4.4S, v28.s[0] +ldr q30, [x17, #+224] +mla v25.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v22.4S, v28.s[0] +ldr q6, [x17, #+240] +mla v13.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v20.4S, v6.s[0] +ldr q18, [x0, #0] +mla v26.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v5.4S, v6.s[0] +mul v4.4S, v4.4S,v2.s[0] +mul v22.4S, v22.4S,v2.s[0] +sub v27.4s, v18.4s, v24.4s +ldr q19, [x0, #64] +add v18.4s, v18.4s, v24.4s +mla v4.4S, v9.4S, v31.s[0] +mla v22.4S, v23.4S, v31.s[0] +sub v23.4s, v10.4s, v25.4s +add v10.4s, v10.4s, v25.4s +mul v20.4S, v20.4S,v30.s[0] +mul v5.4S, v5.4S,v30.s[0] +sub v25.4s, v19.4s, v13.4s +ldr q9, [x0, #128] +add v19.4s, v19.4s, v13.4s +mla v20.4S, v3.4S, v31.s[0] +mla v5.4S, v7.4S, v31.s[0] +sub v7.4s, v17.4s, v26.4s +add v17.4s, v17.4s, v26.4s +sqrdmulh v26.4S, v10.4S, v29.s[1] +mul v10.4S, v10.4S,v15.s[1] +sub v3.4s, v9.4s, v4.4s +ldr q13, [x0, #192] +add v9.4s, v9.4s, v4.4s 
+sqrdmulh v4.4S, v23.4S, v29.s[2] +mul v23.4S, v23.4S,v15.s[2] +sub v24.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v29.4S, v17.4S, v1.s[1] +mul v17.4S, v17.4S,v16.s[1] +sub v22.4s, v13.4s, v20.4s +ldr q15, [x0, #288] +add v13.4s, v13.4s, v20.4s +sqrdmulh v20.4S, v7.4S, v1.s[2] +mul v7.4S, v7.4S,v16.s[2] +sub v8.4s, v14.4s, v5.4s +ldr q21, [x0, #304] +add v14.4s, v14.4s, v5.4s +mla v10.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v11.4S, v28.s[1] +sub v1.4s, v18.4s, v10.4s +ldr q5, [x0, #352] +str q1, [x0, #16] +mla v23.4S, v4.4S, v31.s[0] +sqrdmulh v4.4S, v24.4S, v28.s[2] +add v18.4s, v18.4s, v10.4s +ldr q10, [x0, #368] +str q18, [x0, #0] +mla v17.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v14.4S, v6.s[1] +sub v18.4s, v27.4s, v23.4s +ldr q1, [x17, #+256] +str q18, [x0, #48] +mla v7.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v8.4S, v6.s[2] +add v27.4s, v27.4s, v23.4s +ldr q23, [x17, #+272] +str q27, [x0, #32] +mul v11.4S, v11.4S,v2.s[1] +mul v24.4S, v24.4S,v2.s[2] +sub v27.4s, v19.4s, v17.4s +ldr q18, [x17, #+288] +str q27, [x0, #80] +mla v11.4S, v26.4S, v31.s[0] +mla v24.4S, v4.4S, v31.s[0] +add v19.4s, v19.4s, v17.4s +ldr q17, [x17, #+304] +str q19, [x0, #64] +mul v14.4S, v14.4S,v30.s[1] +mul v8.4S, v8.4S,v30.s[2] +sub v28.4s, v25.4s, v7.4s +ldr q19, [x0, #416] +str q28, [x0, #112] +mla v14.4S, v29.4S, v31.s[0] +mla v8.4S, v20.4S, v31.s[0] +add v25.4s, v25.4s, v7.4s +ldr q7, [x0, #432] +str q25, [x0, #96] +sqrdmulh v6.4S, v15.4S, v23.s[0] +mul v15.4S, v15.4S,v1.s[0] +sub v25.4s, v9.4s, v11.4s +ldr q30, [x0, #480] +str q25, [x0, #144] +sqrdmulh v25.4S, v21.4S, v23.s[0] +mul v21.4S, v21.4S,v1.s[0] +add v9.4s, v9.4s, v11.4s +ldr q11, [x0, #496] +str q9, [x0, #128] +sqrdmulh v9.4S, v5.4S, v17.s[0] +mul v5.4S, v5.4S,v18.s[0] +sub v20.4s, v3.4s, v24.4s +ldr q29, [x17, #+320] +str q20, [x0, #176] +sqrdmulh v20.4S, v10.4S, v17.s[0] +mul v10.4S, v10.4S,v18.s[0] +add v3.4s, v3.4s, v24.4s +ldr q24, [x17, #+336] +str q3, [x0, #160] +mla v15.4S, v6.4S, v31.s[0] +sqrdmulh 
v6.4S, v19.4S, v24.s[0] +sub v3.4s, v13.4s, v14.4s +ldr q28, [x17, #+352] +str q3, [x0, #208] +mla v21.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v7.4S, v24.s[0] +add v13.4s, v13.4s, v14.4s +ldr q14, [x17, #+368] +str q13, [x0, #192] +mla v5.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v30.4S, v14.s[0] +sub v13.4s, v22.4s, v8.4s +ldr q3, [x0, #256] +str q13, [x0, #240] +mla v10.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v11.4S, v14.s[0] +add v22.4s, v22.4s, v8.4s +ldr q8, [x0, #272] +str q22, [x0, #224] +mul v19.4S, v19.4S,v29.s[0] +mul v7.4S, v7.4S,v29.s[0] +sub v22.4s, v3.4s, v15.4s +ldr q13, [x0, #320] +add v3.4s, v3.4s, v15.4s +mla v19.4S, v6.4S, v31.s[0] +mla v7.4S, v25.4S, v31.s[0] +sub v25.4s, v8.4s, v21.4s +ldr q6, [x0, #336] +add v8.4s, v8.4s, v21.4s +mul v30.4S, v30.4S,v28.s[0] +mul v11.4S, v11.4S,v28.s[0] +sub v21.4s, v13.4s, v5.4s +ldr q15, [x0, #384] +add v13.4s, v13.4s, v5.4s +mla v30.4S, v9.4S, v31.s[0] +mla v11.4S, v20.4S, v31.s[0] +sub v20.4s, v6.4s, v10.4s +ldr q9, [x0, #400] +add v6.4s, v6.4s, v10.4s +sqrdmulh v10.4S, v8.4S, v23.s[1] +mul v8.4S, v8.4S,v1.s[1] +sub v5.4s, v15.4s, v19.4s +ldr q2, [x0, #448] +add v15.4s, v15.4s, v19.4s +sqrdmulh v19.4S, v25.4S, v23.s[2] +mul v25.4S, v25.4S,v1.s[2] +sub v4.4s, v9.4s, v7.4s +ldr q26, [x0, #464] +add v9.4s, v9.4s, v7.4s +sqrdmulh v23.4S, v6.4S, v17.s[1] +mul v6.4S, v6.4S,v18.s[1] +sub v7.4s, v2.4s, v30.4s +ldr q1, [x0, #544] +add v2.4s, v2.4s, v30.4s +sqrdmulh v30.4S, v20.4S, v17.s[2] +mul v20.4S, v20.4S,v18.s[2] +sub v27.4s, v26.4s, v11.4s +ldr q16, [x0, #560] +add v26.4s, v26.4s, v11.4s +mla v8.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v9.4S, v24.s[1] +sub v17.4s, v3.4s, v8.4s +ldr q11, [x0, #608] +str q17, [x0, #272] +mla v25.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v4.4S, v24.s[2] +add v3.4s, v3.4s, v8.4s +ldr q8, [x0, #624] +str q3, [x0, #256] +mla v6.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v26.4S, v14.s[1] +sub v3.4s, v22.4s, v25.4s +ldr q17, [x17, #+384] +str q3, [x0, #304] +mla v20.4S, v30.4S, v31.s[0] +sqrdmulh 
v30.4S, v27.4S, v14.s[2] +add v22.4s, v22.4s, v25.4s +ldr q25, [x17, #+400] +str q22, [x0, #288] +mul v9.4S, v9.4S,v29.s[1] +mul v4.4S, v4.4S,v29.s[2] +sub v22.4s, v13.4s, v6.4s +ldr q3, [x17, #+416] +str q22, [x0, #336] +mla v9.4S, v10.4S, v31.s[0] +mla v4.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v6.4s +ldr q6, [x17, #+432] +str q13, [x0, #320] +mul v26.4S, v26.4S,v28.s[1] +mul v27.4S, v27.4S,v28.s[2] +sub v24.4s, v21.4s, v20.4s +ldr q13, [x0, #672] +str q24, [x0, #368] +mla v26.4S, v23.4S, v31.s[0] +mla v27.4S, v30.4S, v31.s[0] +add v21.4s, v21.4s, v20.4s +ldr q20, [x0, #688] +str q21, [x0, #352] +sqrdmulh v14.4S, v1.4S, v25.s[0] +mul v1.4S, v1.4S,v17.s[0] +sub v21.4s, v15.4s, v9.4s +ldr q28, [x0, #736] +str q21, [x0, #400] +sqrdmulh v21.4S, v16.4S, v25.s[0] +mul v16.4S, v16.4S,v17.s[0] +add v15.4s, v15.4s, v9.4s +ldr q9, [x0, #752] +str q15, [x0, #384] +sqrdmulh v15.4S, v11.4S, v6.s[0] +mul v11.4S, v11.4S,v3.s[0] +sub v30.4s, v5.4s, v4.4s +ldr q23, [x17, #+448] +str q30, [x0, #432] +sqrdmulh v30.4S, v8.4S, v6.s[0] +mul v8.4S, v8.4S,v3.s[0] +add v5.4s, v5.4s, v4.4s +ldr q4, [x17, #+464] +str q5, [x0, #416] +mla v1.4S, v14.4S, v31.s[0] +sqrdmulh v14.4S, v13.4S, v4.s[0] +sub v5.4s, v2.4s, v26.4s +ldr q24, [x17, #+480] +str q5, [x0, #464] +mla v16.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v20.4S, v4.s[0] +add v2.4s, v2.4s, v26.4s +ldr q26, [x17, #+496] +str q2, [x0, #448] +mla v11.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v28.4S, v26.s[0] +sub v2.4s, v7.4s, v27.4s +ldr q5, [x0, #512] +str q2, [x0, #496] +mla v8.4S, v30.4S, v31.s[0] +sqrdmulh v30.4S, v9.4S, v26.s[0] +add v7.4s, v7.4s, v27.4s +ldr q27, [x0, #528] +str q7, [x0, #480] +mul v13.4S, v13.4S,v23.s[0] +mul v20.4S, v20.4S,v23.s[0] +sub v7.4s, v5.4s, v1.4s +ldr q2, [x0, #576] +add v5.4s, v5.4s, v1.4s +mla v13.4S, v14.4S, v31.s[0] +mla v20.4S, v21.4S, v31.s[0] +sub v21.4s, v27.4s, v16.4s +ldr q14, [x0, #592] +add v27.4s, v27.4s, v16.4s +mul v28.4S, v28.4S,v24.s[0] +mul v9.4S, v9.4S,v24.s[0] +sub v16.4s, v2.4s, 
v11.4s +ldr q1, [x0, #640] +add v2.4s, v2.4s, v11.4s +mla v28.4S, v15.4S, v31.s[0] +mla v9.4S, v30.4S, v31.s[0] +sub v30.4s, v14.4s, v8.4s +ldr q15, [x0, #656] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v27.4S, v25.s[1] +mul v27.4S, v27.4S,v17.s[1] +sub v11.4s, v1.4s, v13.4s +ldr q29, [x0, #704] +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v21.4S, v25.s[2] +mul v21.4S, v21.4S,v17.s[2] +sub v19.4s, v15.4s, v20.4s +ldr q10, [x0, #720] +add v15.4s, v15.4s, v20.4s +sqrdmulh v25.4S, v14.4S, v6.s[1] +mul v14.4S, v14.4S,v3.s[1] +sub v20.4s, v29.4s, v28.4s +ldr q17, [x0, #800] +add v29.4s, v29.4s, v28.4s +sqrdmulh v28.4S, v30.4S, v6.s[2] +mul v30.4S, v30.4S,v3.s[2] +sub v22.4s, v10.4s, v9.4s +ldr q18, [x0, #816] +add v10.4s, v10.4s, v9.4s +mla v27.4S, v8.4S, v31.s[0] +sqrdmulh v8.4S, v15.4S, v4.s[1] +sub v6.4s, v5.4s, v27.4s +ldr q9, [x0, #864] +str q6, [x0, #528] +mla v21.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v19.4S, v4.s[2] +add v5.4s, v5.4s, v27.4s +ldr q27, [x0, #880] +str q5, [x0, #512] +mla v14.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v10.4S, v26.s[1] +sub v5.4s, v7.4s, v21.4s +ldr q6, [x17, #+512] +str q5, [x0, #560] +mla v30.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v22.4S, v26.s[2] +add v7.4s, v7.4s, v21.4s +ldr q21, [x17, #+528] +str q7, [x0, #544] +mul v15.4S, v15.4S,v23.s[1] +mul v19.4S, v19.4S,v23.s[2] +sub v7.4s, v2.4s, v14.4s +ldr q5, [x17, #+544] +str q7, [x0, #592] +mla v15.4S, v8.4S, v31.s[0] +mla v19.4S, v13.4S, v31.s[0] +add v2.4s, v2.4s, v14.4s +ldr q14, [x17, #+560] +str q2, [x0, #576] +mul v10.4S, v10.4S,v24.s[1] +mul v22.4S, v22.4S,v24.s[2] +sub v4.4s, v16.4s, v30.4s +ldr q2, [x0, #928] +str q4, [x0, #624] +mla v10.4S, v25.4S, v31.s[0] +mla v22.4S, v28.4S, v31.s[0] +add v16.4s, v16.4s, v30.4s +ldr q30, [x0, #944] +str q16, [x0, #608] +sqrdmulh v26.4S, v17.4S, v21.s[0] +mul v17.4S, v17.4S,v6.s[0] +sub v16.4s, v1.4s, v15.4s +ldr q24, [x0, #992] +str q16, [x0, #656] +sqrdmulh v16.4S, v18.4S, v21.s[0] +mul v18.4S, v18.4S,v6.s[0] +add v1.4s, v1.4s, v15.4s 
+ldr q15, [x0, #1008] +str q1, [x0, #640] +sqrdmulh v1.4S, v9.4S, v14.s[0] +mul v9.4S, v9.4S,v5.s[0] +sub v28.4s, v11.4s, v19.4s +ldr q25, [x17, #+576] +str q28, [x0, #688] +sqrdmulh v28.4S, v27.4S, v14.s[0] +mul v27.4S, v27.4S,v5.s[0] +add v11.4s, v11.4s, v19.4s +ldr q19, [x17, #+592] +str q11, [x0, #672] +mla v17.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v2.4S, v19.s[0] +sub v11.4s, v29.4s, v10.4s +ldr q4, [x17, #+608] +str q11, [x0, #720] +mla v18.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v30.4S, v19.s[0] +add v29.4s, v29.4s, v10.4s +ldr q10, [x17, #+624] +str q29, [x0, #704] +mla v9.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v24.4S, v10.s[0] +sub v29.4s, v20.4s, v22.4s +ldr q11, [x0, #768] +str q29, [x0, #752] +mla v27.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v15.4S, v10.s[0] +add v20.4s, v20.4s, v22.4s +ldr q22, [x0, #784] +str q20, [x0, #736] +mul v2.4S, v2.4S,v25.s[0] +mul v30.4S, v30.4S,v25.s[0] +sub v20.4s, v11.4s, v17.4s +ldr q29, [x0, #832] +add v11.4s, v11.4s, v17.4s +mla v2.4S, v26.4S, v31.s[0] +mla v30.4S, v16.4S, v31.s[0] +sub v16.4s, v22.4s, v18.4s +ldr q26, [x0, #848] +add v22.4s, v22.4s, v18.4s +mul v24.4S, v24.4S,v4.s[0] +mul v15.4S, v15.4S,v4.s[0] +sub v18.4s, v29.4s, v9.4s +ldr q17, [x0, #896] +add v29.4s, v29.4s, v9.4s +mla v24.4S, v1.4S, v31.s[0] +mla v15.4S, v28.4S, v31.s[0] +sub v28.4s, v26.4s, v27.4s +ldr q1, [x0, #912] +add v26.4s, v26.4s, v27.4s +sqrdmulh v27.4S, v22.4S, v21.s[1] +mul v22.4S, v22.4S,v6.s[1] +sub v9.4s, v17.4s, v2.4s +ldr q23, [x0, #960] +add v17.4s, v17.4s, v2.4s +sqrdmulh v2.4S, v16.4S, v21.s[2] +mul v16.4S, v16.4S,v6.s[2] +sub v13.4s, v1.4s, v30.4s +ldr q8, [x0, #976] +add v1.4s, v1.4s, v30.4s +sqrdmulh v21.4S, v26.4S, v14.s[1] +mul v26.4S, v26.4S,v5.s[1] +sub v30.4s, v23.4s, v24.4s +add v23.4s, v23.4s, v24.4s +sqrdmulh v24.4S, v28.4S, v14.s[2] +mul v28.4S, v28.4S,v5.s[2] +sub v6.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +mla v22.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v1.4S, v19.s[1] +sub v14.4s, v11.4s, v22.4s +str q14, [x0, 
#784] +mla v16.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v13.4S, v19.s[2] +add v11.4s, v11.4s, v22.4s +str q11, [x0, #768] +mla v26.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v8.4S, v10.s[1] +sub v11.4s, v20.4s, v16.4s +str q11, [x0, #816] +mla v28.4S, v24.4S, v31.s[0] +sqrdmulh v24.4S, v6.4S, v10.s[2] +add v20.4s, v20.4s, v16.4s +str q20, [x0, #800] +mul v1.4S, v1.4S,v25.s[1] +mul v13.4S, v13.4S,v25.s[2] +sub v20.4s, v29.4s, v26.4s +str q20, [x0, #848] +mla v1.4S, v27.4S, v31.s[0] +mla v13.4S, v2.4S, v31.s[0] +add v29.4s, v29.4s, v26.4s +str q29, [x0, #832] +mul v8.4S, v8.4S,v4.s[1] +mul v6.4S, v6.4S,v4.s[2] +sub v19.4s, v18.4s, v28.4s +str q19, [x0, #880] +mla v8.4S, v21.4S, v31.s[0] +mla v6.4S, v24.4S, v31.s[0] +add v18.4s, v18.4s, v28.4s +str q18, [x0, #864] +sub v10.4s, v17.4s, v1.4s +str q10, [x0, #912] +add v17.4s, v17.4s, v1.4s +str q17, [x0, #896] +sub v17.4s, v9.4s, v13.4s +str q17, [x0, #944] +add v9.4s, v9.4s, v13.4s +str q9, [x0, #928] +sub v9.4s, v23.4s, v8.4s +str q9, [x0, #976] +add v23.4s, v23.4s, v8.4s +str q23, [x0, #960] +sub v23.4s, v30.4s, v6.4s +str q23, [x0, #1008] +add v30.4s, v30.4s, v6.4s +str q30, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1520 +// Instruction count: 1516 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_14.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_14.s new file mode 100644 index 0000000..9604c58 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_14.s @@ -0,0 +1,1550 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// 
+/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include // NOTE(review): the header operand is missing in this copy (possibly lost in extraction); ASM_LOAD used below is not defined in this file and presumably comes from that header +modulus: // one 16-byte row: lane 0 is -33556993 (the negated modulus), lanes 1-3 are zero; the routine below loads it into q31 and uses v31.s[0] in its mla steps +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 // NOTE(review): with GNU as on AArch64 the operand is a power of two, so this should give 64-byte alignment; confirm for the toolchain in use +roots_merged: // constant table consumed 16 bytes (four .word lanes) at a time through x17; rows come in pairs with identical Layer/block tags: the first row is the by-element operand of mul, the second the by-element operand of sqrdmulh, and the two products are combined by mla with v31.s[0]; second-row values look like round(first * 2^31 / 33556993), TODO confirm against the generator; zero words tagged Layer None presumably pad a row to four lanes +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global
ntt_u32_incomplete_neon_asm_var_4_2_22_z4_14 +.global _ntt_u32_incomplete_neon_asm_var_4_2_22_z4_14 +ntt_u32_incomplete_neon_asm_var_4_2_22_z4_14: +_ntt_u32_incomplete_neon_asm_var_4_2_22_z4_14: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] // NOTE(review): exact repeat of the previous store (same registers, same slot); redundant +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) // presumably leaves the address of modulus in x17 (macro is defined outside this file) - confirm +ldr q31, [x17] // q31 = 16 bytes at x17; only v31.s[0] is referenced by the mla steps below +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] // row at offset 0: by-element operand of the mul instructions below +ldr q29, [x17, #+16] // row at offset 16: by-element operand of the sqrdmulh instructions below +ldr q28, [x0, #992] +sqrdmulh v27.4S, v28.4S, v29.s[0] +mul v28.4S, v28.4S,v30.s[0] +ldr q26, [x0, #928] +sqrdmulh v25.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +ldr q24, [x0, #864] +sqrdmulh v23.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +ldr q22, [x0, #800] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #736] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mla v28.4S, v27.4S, v31.s[0] +ldr q27, [x0, #672] +sqrdmulh v18.4S, v27.4S, v29.s[0] +mla v26.4S, v25.4S, v31.s[0] +ldr q25, [x0, #608] +sqrdmulh v17.4S, v25.4S, v29.s[0] +mla v24.4S, v23.4S, v31.s[0] +ldr q23, [x0, #544] +sqrdmulh v16.4S, v23.4S, v29.s[0] +mla v22.4S, v21.4S, v31.s[0] +ldr q21, [x0, #480] +mul v27.4S, v27.4S,v30.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q3, [x0, #416] +ldr q2, [x0, #352] +ldr q1, [x0, #288] +mla v27.4S, v18.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +ldr q19, [x0, #224] +ldr q18, [x0, #160] +mul v23.4S, v23.4S,v30.s[0] +mul v25.4S, v25.4S,v30.s[0] +ldr q0, [x0, #96] +ldr q15, [x0, #32] +mla v23.4S, v16.4S, v31.s[0] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +sqrdmulh v28.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v16.4s, v3.4s, v26.4s +add v3.4s, v3.4s, v26.4s +sqrdmulh v26.4S, v16.4S, v29.s[2] +mul v16.4S,
v16.4S,v30.s[2] +sub v14.4s, v2.4s, v24.4s +add v2.4s, v2.4s, v24.4s +sqrdmulh v24.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v13.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v12.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +sqrdmulh v20.4S, v14.4S, v29.s[2] +mla v17.4S, v28.4S, v31.s[0] +sub v28.4s, v18.4s, v27.4s +add v18.4s, v18.4s, v27.4s +sqrdmulh v27.4S, v13.4S, v29.s[2] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v0.4s, v25.4s +add v0.4s, v0.4s, v25.4s +sqrdmulh v25.4S, v2.4S, v29.s[1] +mla v21.4S, v24.4S, v31.s[0] +sub v24.4s, v15.4s, v23.4s +sqrdmulh v11.4S, v1.4S, v29.s[1] +mla v3.4S, v22.4S, v31.s[0] +add v15.4s, v15.4s, v23.4s +ldr q23, [x17, #+32] +ldr q22, [x17, #+48] +mul v13.4S, v13.4S,v30.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v10.4s, v12.4s, v17.4s +add v12.4s, v12.4s, v17.4s +mla v13.4S, v27.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +sub v20.4s, v28.4s, v16.4s +add v28.4s, v28.4s, v16.4s +mul v1.4S, v1.4S,v30.s[1] +mul v2.4S, v2.4S,v30.s[1] +sub v16.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +mla v1.4S, v11.4S, v31.s[0] +mla v2.4S, v25.4S, v31.s[0] +sub v25.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v22.s[3] +mul v10.4S, v10.4S,v23.s[3] +sub v11.4s, v26.4s, v14.4s +add v26.4s, v26.4s, v14.4s +sqrdmulh v14.4S, v12.4S, v22.s[2] +mul v12.4S, v12.4S,v23.s[2] +sub v21.4s, v24.4s, v13.4s +add v24.4s, v24.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v22.s[1] +mul v16.4S, v16.4S,v23.s[1] +sub v27.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v17.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s +ldr q1, [x17, #+96] +ldr q9, [x17, #+112] +sqrdmulh v8.4S, v20.4S, v22.s[3] +mla v10.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v28.4S, v22.s[2] +mla v12.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v25.4S, v22.s[1] +mla v16.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v18.4S, v22.s[0] +mla 
v19.4S, v2.4S, v31.s[0] +nop +nop +ldr q2, [x17, #+64] +ldr q7, [x17, #+80] +mul v28.4S, v28.4S,v23.s[2] +mul v20.4S, v20.4S,v23.s[3] +sub v6.4s, v11.4s, v10.4s +add v11.4s, v11.4s, v10.4s +mla v28.4S, v3.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v26.4s, v12.4s +add v26.4s, v26.4s, v12.4s +mul v18.4S, v18.4S,v23.s[0] +mul v25.4S, v25.4S,v23.s[1] +sub v12.4s, v27.4s, v16.4s +add v27.4s, v27.4s, v16.4s +mla v18.4S, v13.4S, v31.s[0] +mla v25.4S, v14.4S, v31.s[0] +sub v14.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v9.s[3] +mul v6.4S, v6.4S,v1.s[3] +sub v13.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v11.4S, v9.s[2] +mul v11.4S, v11.4S,v1.s[2] +sub v16.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +sqrdmulh v28.4S, v8.4S, v9.s[1] +mul v8.4S, v8.4S,v1.s[1] +sub v3.4s, v17.4s, v25.4s +add v17.4s, v17.4s, v25.4s +sqrdmulh v25.4S, v26.4S, v9.s[0] +mul v26.4S, v26.4S,v1.s[0] +sub v10.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v7.s[3] +mla v6.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v27.4S, v7.s[2] +mla v11.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v14.4S, v7.s[1] +mla v8.4S, v28.4S, v31.s[0] +nop +nop +sqrdmulh v28.4S, v0.4S, v7.s[0] +mla v26.4S, v25.4S, v31.s[0] +nop +nop +mul v27.4S, v27.4S,v2.s[2] +mul v12.4S, v12.4S,v2.s[3] +sub v25.4s, v13.4s, v6.4s +str q25, [x0, #992] +mla v27.4S, v19.4S, v31.s[0] +mla v12.4S, v18.4S, v31.s[0] +add v13.4s, v13.4s, v6.4s +str q13, [x0, #928] +mul v0.4S, v0.4S,v2.s[0] +mul v14.4S, v14.4S,v2.s[1] +sub v13.4s, v21.4s, v11.4s +str q13, [x0, #864] +mla v0.4S, v28.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sub v11.4s, v16.4s, v8.4s +ldr q20, [x0, #1008] +sqrdmulh v28.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v8.4s +str q21, [x0, #800] +ldr q21, [x0, #944] +sqrdmulh v8.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v13.4s, v24.4s, v26.4s +str q11, [x0, #736] +ldr q11, [x0, 
#880] +sqrdmulh v6.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +add v24.4s, v24.4s, v26.4s +str q16, [x0, #672] +ldr q16, [x0, #816] +sqrdmulh v26.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v18.4s, v3.4s, v12.4s +str q13, [x0, #608] +ldr q13, [x0, #752] +sqrdmulh v19.4S, v13.4S, v29.s[0] +mla v20.4S, v28.4S, v31.s[0] +add v3.4s, v3.4s, v12.4s +str q24, [x0, #544] +ldr q24, [x0, #688] +sqrdmulh v12.4S, v24.4S, v29.s[0] +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v17.4s, v27.4s +str q18, [x0, #480] +ldr q18, [x0, #624] +sqrdmulh v28.4S, v18.4S, v29.s[0] +mla v11.4S, v6.4S, v31.s[0] +add v17.4s, v17.4s, v27.4s +str q3, [x0, #416] +ldr q3, [x0, #560] +sqrdmulh v27.4S, v3.4S, v29.s[0] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v10.4s, v14.4s +str q8, [x0, #352] +ldr q8, [x0, #496] +add v10.4s, v10.4s, v14.4s +mul v24.4S, v24.4S,v30.s[0] +mul v13.4S, v13.4S,v30.s[0] +ldr q14, [x0, #432] +str q17, [x0, #288] +ldr q17, [x0, #368] +ldr q6, [x0, #304] +mla v24.4S, v12.4S, v31.s[0] +mla v13.4S, v19.4S, v31.s[0] +str q26, [x0, #224] +sub v26.4s, v15.4s, v0.4s +ldr q19, [x0, #240] +ldr q12, [x0, #176] +mul v3.4S, v3.4S,v30.s[0] +mul v18.4S, v18.4S,v30.s[0] +str q10, [x0, #160] +add v15.4s, v15.4s, v0.4s +ldr q0, [x0, #112] +ldr q10, [x0, #48] +mla v3.4S, v27.4S, v31.s[0] +mla v18.4S, v28.4S, v31.s[0] +sub v28.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +sqrdmulh v20.4S, v28.4S, v29.s[2] +mul v28.4S, v28.4S,v30.s[2] +sub v27.4s, v14.4s, v21.4s +add v14.4s, v14.4s, v21.4s +sqrdmulh v21.4S, v27.4S, v29.s[2] +mul v27.4S, v27.4S,v30.s[2] +sub v25.4s, v17.4s, v11.4s +add v17.4s, v17.4s, v11.4s +sqrdmulh v11.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v5.4s, v6.4s, v16.4s +add v6.4s, v6.4s, v16.4s +sqrdmulh v16.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +sub v4.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +sqrdmulh v13.4S, v25.4S, v29.s[2] +mla v28.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v24.4s +add v12.4s, v12.4s, v24.4s +sqrdmulh v24.4S, v5.4S, 
v29.s[2] +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v18.4s +add v0.4s, v0.4s, v18.4s +sqrdmulh v18.4S, v17.4S, v29.s[1] +mla v8.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v3.4s +str q26, [x0, #96] +sqrdmulh v26.4S, v6.4S, v29.s[1] +mla v14.4S, v16.4S, v31.s[0] +add v10.4s, v10.4s, v3.4s +str q15, [x0, #32] +mul v5.4S, v5.4S,v30.s[2] +mul v25.4S, v25.4S,v30.s[2] +sub v15.4s, v4.4s, v28.4s +add v4.4s, v4.4s, v28.4s +mla v5.4S, v24.4S, v31.s[0] +mla v25.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v27.4s +add v20.4s, v20.4s, v27.4s +mul v6.4S, v6.4S,v30.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v27.4s, v19.4s, v8.4s +add v19.4s, v19.4s, v8.4s +mla v6.4S, v26.4S, v31.s[0] +mla v17.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v14.4s +add v12.4s, v12.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v22.s[3] +mul v15.4S, v15.4S,v23.s[3] +sub v26.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v4.4S, v22.s[2] +mul v4.4S, v4.4S,v23.s[2] +sub v8.4s, v11.4s, v5.4s +add v11.4s, v11.4s, v5.4s +sqrdmulh v5.4S, v27.4S, v22.s[1] +mul v27.4S, v27.4S,v23.s[1] +sub v24.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v28.4s, v10.4s, v6.4s +add v10.4s, v10.4s, v6.4s +sqrdmulh v6.4S, v13.4S, v22.s[3] +mla v15.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v20.4S, v22.s[2] +mla v4.4S, v25.4S, v31.s[0] +nop +nop +sqrdmulh v25.4S, v18.4S, v22.s[1] +mla v27.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v12.4S, v22.s[0] +mla v19.4S, v17.4S, v31.s[0] +nop +nop +mul v20.4S, v20.4S,v23.s[2] +mul v13.4S, v13.4S,v23.s[3] +sub v17.4s, v26.4s, v15.4s +add v26.4s, v26.4s, v15.4s +mla v20.4S, v14.4S, v31.s[0] +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v21.4s, v4.4s +add v21.4s, v21.4s, v4.4s +mul v12.4S, v12.4S,v23.s[0] +mul v18.4S, v18.4S,v23.s[1] +sub v4.4s, v24.4s, v27.4s +add v24.4s, v24.4s, v27.4s +mla v12.4S, v5.4S, v31.s[0] +mla v18.4S, v25.4S, v31.s[0] +sub v25.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, 
v17.4S, v9.s[3] +mul v17.4S, v17.4S,v1.s[3] +sub v5.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +sqrdmulh v13.4S, v26.4S, v9.s[2] +mul v26.4S, v26.4S,v1.s[2] +sub v27.4s, v11.4s, v20.4s +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v6.4S, v9.s[1] +mul v6.4S, v6.4S,v1.s[1] +sub v14.4s, v28.4s, v18.4s +add v28.4s, v28.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v9.s[0] +mul v21.4S, v21.4S,v1.s[0] +sub v15.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v4.4S, v7.s[3] +mla v17.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v24.4S, v7.s[2] +mla v26.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v25.4S, v7.s[1] +mla v6.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v0.4S, v7.s[0] +mla v21.4S, v18.4S, v31.s[0] +nop +nop +mul v24.4S, v24.4S,v2.s[2] +mul v4.4S, v4.4S,v2.s[3] +sub v18.4s, v5.4s, v17.4s +str q18, [x0, #1008] +mla v24.4S, v19.4S, v31.s[0] +mla v4.4S, v12.4S, v31.s[0] +add v5.4s, v5.4s, v17.4s +str q5, [x0, #944] +mul v0.4S, v0.4S,v2.s[0] +mul v25.4S, v25.4S,v2.s[1] +sub v5.4s, v8.4s, v26.4s +str q5, [x0, #880] +mla v0.4S, v20.4S, v31.s[0] +mla v25.4S, v13.4S, v31.s[0] +add v8.4s, v8.4s, v26.4s +sub v26.4s, v27.4s, v6.4s +ldr q13, [x0, #960] +sqrdmulh v20.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +add v27.4s, v27.4s, v6.4s +str q8, [x0, #816] +ldr q8, [x0, #896] +sqrdmulh v6.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v5.4s, v11.4s, v21.4s +str q26, [x0, #752] +ldr q26, [x0, #832] +sqrdmulh v17.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +add v11.4s, v11.4s, v21.4s +str q27, [x0, #688] +ldr q27, [x0, #768] +sqrdmulh v21.4S, v27.4S, v29.s[0] +mul v27.4S, v27.4S,v30.s[0] +sub v12.4s, v14.4s, v4.4s +str q5, [x0, #624] +ldr q5, [x0, #704] +sqrdmulh v19.4S, v5.4S, v29.s[0] +mla v13.4S, v20.4S, v31.s[0] +add v14.4s, v14.4s, v4.4s +str q11, [x0, #560] +ldr q11, [x0, #640] +sqrdmulh v4.4S, v11.4S, v29.s[0] +mla v8.4S, v6.4S, v31.s[0] +sub v6.4s, v28.4s, v24.4s +str q12, [x0, #496] +ldr q12, [x0, #576] +sqrdmulh v20.4S, v12.4S, 
v29.s[0] +mla v26.4S, v17.4S, v31.s[0] +add v28.4s, v28.4s, v24.4s +str q14, [x0, #432] +ldr q14, [x0, #512] +sqrdmulh v24.4S, v14.4S, v29.s[0] +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v25.4s +str q6, [x0, #368] +ldr q6, [x0, #448] +add v15.4s, v15.4s, v25.4s +mul v11.4S, v11.4S,v30.s[0] +mul v5.4S, v5.4S,v30.s[0] +ldr q25, [x0, #384] +str q28, [x0, #304] +ldr q28, [x0, #320] +ldr q17, [x0, #256] +mla v11.4S, v4.4S, v31.s[0] +mla v5.4S, v19.4S, v31.s[0] +str q21, [x0, #240] +sub v21.4s, v10.4s, v0.4s +ldr q19, [x0, #192] +ldr q4, [x0, #128] +mul v14.4S, v14.4S,v30.s[0] +mul v12.4S, v12.4S,v30.s[0] +str q15, [x0, #176] +add v10.4s, v10.4s, v0.4s +ldr q0, [x0, #64] +ldr q15, [x0, #0] +mla v14.4S, v24.4S, v31.s[0] +mla v12.4S, v20.4S, v31.s[0] +sub v20.4s, v6.4s, v13.4s +add v6.4s, v6.4s, v13.4s +sqrdmulh v13.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v24.4s, v25.4s, v8.4s +add v25.4s, v25.4s, v8.4s +sqrdmulh v8.4S, v24.4S, v29.s[2] +mul v24.4S, v24.4S,v30.s[2] +sub v18.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sqrdmulh v26.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +sub v3.4s, v17.4s, v27.4s +add v17.4s, v17.4s, v27.4s +sqrdmulh v27.4S, v25.4S, v29.s[1] +mul v25.4S, v25.4S,v30.s[1] +sub v16.4s, v19.4s, v5.4s +add v19.4s, v19.4s, v5.4s +sqrdmulh v5.4S, v18.4S, v29.s[2] +mla v20.4S, v13.4S, v31.s[0] +sub v13.4s, v4.4s, v11.4s +add v4.4s, v4.4s, v11.4s +sqrdmulh v11.4S, v3.4S, v29.s[2] +mla v24.4S, v8.4S, v31.s[0] +sub v8.4s, v0.4s, v12.4s +add v0.4s, v0.4s, v12.4s +sqrdmulh v12.4S, v28.4S, v29.s[1] +mla v6.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v14.4s +str q21, [x0, #112] +sqrdmulh v21.4S, v17.4S, v29.s[1] +mla v25.4S, v27.4S, v31.s[0] +add v15.4s, v15.4s, v14.4s +str q10, [x0, #48] +mul v3.4S, v3.4S,v30.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v10.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +mla v3.4S, v11.4S, v31.s[0] +mla v18.4S, v5.4S, v31.s[0] +sub v5.4s, v13.4s, v24.4s +add v13.4s, v13.4s, v24.4s +mul v17.4S, v17.4S,v30.s[1] 
+mul v28.4S, v28.4S,v30.s[1] +sub v24.4s, v19.4s, v6.4s +add v19.4s, v19.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v28.4S, v12.4S, v31.s[0] +sub v12.4s, v4.4s, v25.4s +add v4.4s, v4.4s, v25.4s +sqrdmulh v25.4S, v10.4S, v22.s[3] +mul v10.4S, v10.4S,v23.s[3] +sub v21.4s, v8.4s, v18.4s +add v8.4s, v8.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v22.s[2] +mul v16.4S, v16.4S,v23.s[2] +sub v6.4s, v26.4s, v3.4s +add v26.4s, v26.4s, v3.4s +sqrdmulh v3.4S, v24.4S, v22.s[1] +mul v24.4S, v24.4S,v23.s[1] +sub v11.4s, v0.4s, v28.4s +add v0.4s, v0.4s, v28.4s +sqrdmulh v28.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v20.4s, v15.4s, v17.4s +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v22.s[3] +mla v10.4S, v25.4S, v31.s[0] +nop +nop +sqrdmulh v25.4S, v13.4S, v22.s[2] +mla v16.4S, v18.4S, v31.s[0] +nop +nop +sqrdmulh v18.4S, v12.4S, v22.s[1] +mla v24.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v4.4S, v22.s[0] +mla v19.4S, v28.4S, v31.s[0] +nop +nop +mul v13.4S, v13.4S,v23.s[2] +mul v5.4S, v5.4S,v23.s[3] +sub v28.4s, v21.4s, v10.4s +add v21.4s, v21.4s, v10.4s +mla v13.4S, v25.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +sub v17.4s, v8.4s, v16.4s +add v8.4s, v8.4s, v16.4s +mul v4.4S, v4.4S,v23.s[0] +mul v12.4S, v12.4S,v23.s[1] +sub v16.4s, v11.4s, v24.4s +add v11.4s, v11.4s, v24.4s +mla v4.4S, v3.4S, v31.s[0] +mla v12.4S, v18.4S, v31.s[0] +sub v18.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v28.4S, v9.s[3] +mul v28.4S, v28.4S,v1.s[3] +sub v3.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v21.4S, v9.s[2] +mul v21.4S, v21.4S,v1.s[2] +sub v24.4s, v26.4s, v13.4s +add v26.4s, v26.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v9.s[1] +mul v17.4S, v17.4S,v1.s[1] +sub v25.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +sqrdmulh v12.4S, v8.4S, v9.s[0] +mul v8.4S, v8.4S,v1.s[0] +sub v10.4s, v15.4s, v4.4s +add v15.4s, v15.4s, v4.4s +sqrdmulh v4.4S, v16.4S, v7.s[3] +mla v28.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v11.4S, v7.s[2] +mla v21.4S, v5.4S, 
v31.s[0] +nop +nop +sqrdmulh v5.4S, v18.4S, v7.s[1] +mla v17.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v0.4S, v7.s[0] +mla v8.4S, v12.4S, v31.s[0] +nop +nop +mul v11.4S, v11.4S,v2.s[2] +mul v16.4S, v16.4S,v2.s[3] +sub v12.4s, v3.4s, v28.4s +str q12, [x0, #960] +mla v11.4S, v19.4S, v31.s[0] +mla v16.4S, v4.4S, v31.s[0] +add v3.4s, v3.4s, v28.4s +str q3, [x0, #896] +mul v0.4S, v0.4S,v2.s[0] +mul v18.4S, v18.4S,v2.s[1] +sub v3.4s, v6.4s, v21.4s +str q3, [x0, #832] +mla v0.4S, v13.4S, v31.s[0] +mla v18.4S, v5.4S, v31.s[0] +add v6.4s, v6.4s, v21.4s +sub v21.4s, v24.4s, v17.4s +ldr q5, [x0, #976] +sqrdmulh v13.4S, v5.4S, v29.s[0] +mul v5.4S, v5.4S,v30.s[0] +add v24.4s, v24.4s, v17.4s +str q6, [x0, #768] +ldr q6, [x0, #912] +sqrdmulh v17.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v3.4s, v26.4s, v8.4s +str q21, [x0, #704] +ldr q21, [x0, #848] +sqrdmulh v28.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +add v26.4s, v26.4s, v8.4s +str q24, [x0, #640] +ldr q24, [x0, #784] +sqrdmulh v8.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +sub v4.4s, v25.4s, v16.4s +str q3, [x0, #576] +ldr q3, [x0, #720] +sqrdmulh v19.4S, v3.4S, v29.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v25.4s, v25.4s, v16.4s +str q26, [x0, #512] +ldr q26, [x0, #656] +sqrdmulh v16.4S, v26.4S, v29.s[0] +mla v6.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v11.4s +str q4, [x0, #448] +ldr q4, [x0, #592] +sqrdmulh v13.4S, v4.4S, v29.s[0] +mla v21.4S, v28.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q25, [x0, #384] +ldr q25, [x0, #528] +sqrdmulh v11.4S, v25.4S, v29.s[0] +mla v24.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v18.4s +str q17, [x0, #320] +ldr q17, [x0, #464] +add v10.4s, v10.4s, v18.4s +mul v26.4S, v26.4S,v30.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q18, [x0, #400] +str q20, [x0, #256] +ldr q20, [x0, #336] +ldr q28, [x0, #272] +mla v26.4S, v16.4S, v31.s[0] +mla v3.4S, v19.4S, v31.s[0] +str q8, [x0, #192] +sub v8.4s, v15.4s, v0.4s +ldr q19, [x0, #208] +ldr q16, [x0, #144] +mul v25.4S, 
v25.4S,v30.s[0] +mul v4.4S, v4.4S,v30.s[0] +str q10, [x0, #128] +add v15.4s, v15.4s, v0.4s +ldr q0, [x0, #80] +ldr q10, [x0, #16] +mla v25.4S, v11.4S, v31.s[0] +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v5.4s +add v17.4s, v17.4s, v5.4s +sqrdmulh v5.4S, v13.4S, v29.s[2] +mul v13.4S, v13.4S,v30.s[2] +sub v11.4s, v18.4s, v6.4s +add v18.4s, v18.4s, v6.4s +sqrdmulh v6.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v12.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v28.4s, v24.4s +add v28.4s, v28.4s, v24.4s +sqrdmulh v24.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v27.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sqrdmulh v3.4S, v12.4S, v29.s[2] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v16.4s, v26.4s +add v16.4s, v16.4s, v26.4s +sqrdmulh v26.4S, v14.4S, v29.s[2] +mla v11.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v4.4s +add v0.4s, v0.4s, v4.4s +sqrdmulh v4.4S, v20.4S, v29.s[1] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v25.4s +str q8, [x0, #64] +sqrdmulh v8.4S, v28.4S, v29.s[1] +mla v18.4S, v24.4S, v31.s[0] +add v10.4s, v10.4s, v25.4s +str q15, [x0, #0] +mul v14.4S, v14.4S,v30.s[2] +mul v12.4S, v12.4S,v30.s[2] +sub v15.4s, v27.4s, v13.4s +add v27.4s, v27.4s, v13.4s +mla v14.4S, v26.4S, v31.s[0] +mla v12.4S, v3.4S, v31.s[0] +sub v3.4s, v5.4s, v11.4s +add v5.4s, v5.4s, v11.4s +mul v28.4S, v28.4S,v30.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v11.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +mla v28.4S, v8.4S, v31.s[0] +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v16.4s, v18.4s +add v16.4s, v16.4s, v18.4s +sqrdmulh v29.4S, v15.4S, v22.s[3] +mul v15.4S, v15.4S,v23.s[3] +sub v30.4s, v6.4s, v12.4s +add v6.4s, v6.4s, v12.4s +sqrdmulh v12.4S, v27.4S, v22.s[2] +mul v27.4S, v27.4S,v23.s[2] +sub v18.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v22.s[1] +mul v11.4S, v11.4S,v23.s[1] +sub v8.4s, v0.4s, v20.4s +add v0.4s, v0.4s, v20.4s +sqrdmulh v20.4S, 
v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v17.4s, v10.4s, v28.4s +add v10.4s, v10.4s, v28.4s +sqrdmulh v28.4S, v3.4S, v22.s[3] +mla v15.4S, v29.4S, v31.s[0] +nop +nop +sqrdmulh v29.4S, v5.4S, v22.s[2] +mla v27.4S, v12.4S, v31.s[0] +nop +nop +sqrdmulh v12.4S, v4.4S, v22.s[1] +mla v11.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v16.4S, v22.s[0] +mla v19.4S, v20.4S, v31.s[0] +nop +nop +mul v5.4S, v5.4S,v23.s[2] +mul v3.4S, v3.4S,v23.s[3] +sub v20.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +mla v5.4S, v29.4S, v31.s[0] +mla v3.4S, v28.4S, v31.s[0] +sub v28.4s, v6.4s, v27.4s +add v6.4s, v6.4s, v27.4s +mul v16.4S, v16.4S,v23.s[0] +mul v4.4S, v4.4S,v23.s[1] +sub v27.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v16.4S, v14.4S, v31.s[0] +mla v4.4S, v12.4S, v31.s[0] +sub v12.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v22.4S, v20.4S, v9.s[3] +mul v20.4S, v20.4S,v1.s[3] +sub v23.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v30.4S, v9.s[2] +mul v30.4S, v30.4S,v1.s[2] +sub v19.4s, v21.4s, v5.4s +add v21.4s, v21.4s, v5.4s +sqrdmulh v5.4S, v28.4S, v9.s[1] +mul v28.4S, v28.4S,v1.s[1] +sub v14.4s, v17.4s, v4.4s +add v17.4s, v17.4s, v4.4s +sqrdmulh v4.4S, v6.4S, v9.s[0] +mul v6.4S, v6.4S,v1.s[0] +sub v11.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sqrdmulh v9.4S, v27.4S, v7.s[3] +mla v20.4S, v22.4S, v31.s[0] +nop +nop +sqrdmulh v22.4S, v8.4S, v7.s[2] +mla v30.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v12.4S, v7.s[1] +mla v28.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v0.4S, v7.s[0] +mla v6.4S, v4.4S, v31.s[0] +nop +nop +mul v8.4S, v8.4S,v2.s[2] +mul v27.4S, v27.4S,v2.s[3] +sub v4.4s, v23.4s, v20.4s +str q4, [x0, #976] +mla v8.4S, v22.4S, v31.s[0] +mla v27.4S, v9.4S, v31.s[0] +add v23.4s, v23.4s, v20.4s +str q23, [x0, #912] +mul v0.4S, v0.4S,v2.s[0] +mul v12.4S, v12.4S,v2.s[1] +sub v23.4s, v18.4s, v30.4s +str q23, [x0, #848] +mla v0.4S, v5.4S, v31.s[0] +mla v12.4S, v3.4S, v31.s[0] +add v18.4s, v18.4s, v30.4s +sub v30.4s, 
v19.4s, v28.4s +add v19.4s, v19.4s, v28.4s +str q18, [x0, #784] +sub v18.4s, v21.4s, v6.4s +str q30, [x0, #720] +add v21.4s, v21.4s, v6.4s +str q19, [x0, #656] +sub v19.4s, v14.4s, v27.4s +str q18, [x0, #592] +add v14.4s, v14.4s, v27.4s +str q21, [x0, #528] +sub v21.4s, v17.4s, v8.4s +str q19, [x0, #464] +add v17.4s, v17.4s, v8.4s +str q14, [x0, #400] +sub v14.4s, v11.4s, v12.4s +str q21, [x0, #336] +add v11.4s, v11.4s, v12.4s +str q17, [x0, #272] +sub v17.4s, v10.4s, v0.4s +add v10.4s, v10.4s, v0.4s +ldr q24, [x0, #48] +ldr q25, [x0, #32] +ldr q13, [x0, #112] +ldr q26, [x0, #96] +ldr q15, [x17, #+128] +ldr q29, [x17, #+144] +ldr q16, [x17, #+160] +ldr q1, [x17, #+176] +ldr q4, [x0, #176] +ldr q22, [x0, #160] +sqrdmulh v9.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v15.s[0] +ldr q20, [x0, #240] +sqrdmulh v23.4S, v25.4S, v29.s[0] +mul v25.4S, v25.4S,v15.s[0] +ldr q5, [x0, #224] +sqrdmulh v3.4S, v13.4S, v1.s[0] +mul v13.4S, v13.4S,v16.s[0] +ldr q2, [x17, #+192] +sqrdmulh v7.4S, v26.4S, v1.s[0] +mul v26.4S, v26.4S,v16.s[0] +ldr q28, [x17, #+208] +mla v24.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v4.4S, v28.s[0] +ldr q30, [x17, #+224] +mla v25.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v22.4S, v28.s[0] +ldr q6, [x17, #+240] +mla v13.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v20.4S, v6.s[0] +mla v26.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v5.4S, v6.s[0] +ldr q18, [x0, #0] +mul v4.4S, v4.4S,v2.s[0] +mul v22.4S, v22.4S,v2.s[0] +sub v27.4s, v10.4s, v24.4s +add v10.4s, v10.4s, v24.4s +mla v4.4S, v9.4S, v31.s[0] +mla v22.4S, v23.4S, v31.s[0] +sub v23.4s, v18.4s, v25.4s +ldr q9, [x0, #64] +add v18.4s, v18.4s, v25.4s +mul v20.4S, v20.4S,v30.s[0] +mul v5.4S, v5.4S,v30.s[0] +sub v25.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +mla v20.4S, v3.4S, v31.s[0] +mla v5.4S, v7.4S, v31.s[0] +sub v7.4s, v9.4s, v26.4s +ldr q3, [x0, #128] +add v9.4s, v9.4s, v26.4s +sqrdmulh v26.4S, v27.4S, v29.s[2] +mul v27.4S, v27.4S,v15.s[2] +sub v13.4s, v11.4s, v4.4s +add v11.4s, v11.4s, v4.4s +sqrdmulh v4.4S, v10.4S, 
v29.s[1] +mul v10.4S, v10.4S,v15.s[1] +sub v24.4s, v3.4s, v22.4s +ldr q19, [x0, #192] +add v3.4s, v3.4s, v22.4s +sqrdmulh v29.4S, v25.4S, v1.s[2] +mul v25.4S, v25.4S,v16.s[2] +sub v22.4s, v14.4s, v20.4s +ldr q15, [x0, #304] +add v14.4s, v14.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v1.s[1] +mul v17.4S, v17.4S,v16.s[1] +sub v8.4s, v19.4s, v5.4s +ldr q21, [x0, #288] +add v19.4s, v19.4s, v5.4s +mla v27.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v13.4S, v28.s[2] +sub v1.4s, v23.4s, v27.4s +ldr q5, [x0, #368] +str q1, [x0, #48] +mla v10.4S, v4.4S, v31.s[0] +sqrdmulh v4.4S, v11.4S, v28.s[1] +add v23.4s, v23.4s, v27.4s +ldr q27, [x0, #352] +str q23, [x0, #32] +mla v25.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v22.4S, v6.s[2] +sub v23.4s, v18.4s, v10.4s +ldr q1, [x17, #+256] +str q23, [x0, #16] +mla v17.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v14.4S, v6.s[1] +add v18.4s, v18.4s, v10.4s +ldr q10, [x17, #+272] +str q18, [x0, #0] +mul v13.4S, v13.4S,v2.s[2] +mul v11.4S, v11.4S,v2.s[1] +sub v18.4s, v7.4s, v25.4s +ldr q23, [x17, #+288] +str q18, [x0, #112] +mla v13.4S, v26.4S, v31.s[0] +mla v11.4S, v4.4S, v31.s[0] +add v7.4s, v7.4s, v25.4s +ldr q25, [x17, #+304] +str q7, [x0, #96] +mul v22.4S, v22.4S,v30.s[2] +mul v14.4S, v14.4S,v30.s[1] +sub v28.4s, v9.4s, v17.4s +ldr q7, [x0, #432] +str q28, [x0, #80] +mla v22.4S, v29.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +add v9.4s, v9.4s, v17.4s +ldr q17, [x0, #416] +str q9, [x0, #64] +sqrdmulh v6.4S, v15.4S, v10.s[0] +mul v15.4S, v15.4S,v1.s[0] +sub v9.4s, v24.4s, v13.4s +ldr q30, [x0, #496] +str q9, [x0, #176] +sqrdmulh v9.4S, v21.4S, v10.s[0] +mul v21.4S, v21.4S,v1.s[0] +add v24.4s, v24.4s, v13.4s +ldr q13, [x0, #480] +str q24, [x0, #160] +sqrdmulh v24.4S, v5.4S, v25.s[0] +mul v5.4S, v5.4S,v23.s[0] +sub v20.4s, v3.4s, v11.4s +ldr q29, [x17, #+320] +str q20, [x0, #144] +sqrdmulh v20.4S, v27.4S, v25.s[0] +mul v27.4S, v27.4S,v23.s[0] +add v3.4s, v3.4s, v11.4s +ldr q11, [x17, #+336] +str q3, [x0, #128] +mla v15.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, 
v7.4S, v11.s[0] +sub v3.4s, v8.4s, v22.4s +ldr q28, [x17, #+352] +str q3, [x0, #240] +mla v21.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v17.4S, v11.s[0] +add v8.4s, v8.4s, v22.4s +ldr q22, [x17, #+368] +str q8, [x0, #224] +mla v5.4S, v24.4S, v31.s[0] +sqrdmulh v24.4S, v30.4S, v22.s[0] +sub v8.4s, v19.4s, v14.4s +ldr q3, [x0, #272] +str q8, [x0, #208] +mla v27.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v13.4S, v22.s[0] +add v19.4s, v19.4s, v14.4s +ldr q14, [x0, #256] +str q19, [x0, #192] +mul v7.4S, v7.4S,v29.s[0] +mul v17.4S, v17.4S,v29.s[0] +sub v19.4s, v3.4s, v15.4s +ldr q8, [x0, #336] +add v3.4s, v3.4s, v15.4s +mla v7.4S, v6.4S, v31.s[0] +mla v17.4S, v9.4S, v31.s[0] +sub v9.4s, v14.4s, v21.4s +ldr q6, [x0, #320] +add v14.4s, v14.4s, v21.4s +mul v30.4S, v30.4S,v28.s[0] +mul v13.4S, v13.4S,v28.s[0] +sub v21.4s, v8.4s, v5.4s +ldr q15, [x0, #400] +add v8.4s, v8.4s, v5.4s +mla v30.4S, v24.4S, v31.s[0] +mla v13.4S, v20.4S, v31.s[0] +sub v20.4s, v6.4s, v27.4s +ldr q24, [x0, #384] +add v6.4s, v6.4s, v27.4s +sqrdmulh v27.4S, v19.4S, v10.s[2] +mul v19.4S, v19.4S,v1.s[2] +sub v5.4s, v15.4s, v7.4s +ldr q2, [x0, #464] +add v15.4s, v15.4s, v7.4s +sqrdmulh v7.4S, v3.4S, v10.s[1] +mul v3.4S, v3.4S,v1.s[1] +sub v4.4s, v24.4s, v17.4s +ldr q26, [x0, #448] +add v24.4s, v24.4s, v17.4s +sqrdmulh v10.4S, v21.4S, v25.s[2] +mul v21.4S, v21.4S,v23.s[2] +sub v17.4s, v2.4s, v30.4s +ldr q1, [x0, #560] +add v2.4s, v2.4s, v30.4s +sqrdmulh v30.4S, v8.4S, v25.s[1] +mul v8.4S, v8.4S,v23.s[1] +sub v18.4s, v26.4s, v13.4s +ldr q16, [x0, #544] +add v26.4s, v26.4s, v13.4s +mla v19.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v5.4S, v11.s[2] +sub v25.4s, v9.4s, v19.4s +ldr q13, [x0, #624] +str q25, [x0, #304] +mla v3.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v15.4S, v11.s[1] +add v9.4s, v9.4s, v19.4s +ldr q19, [x0, #608] +str q9, [x0, #288] +mla v21.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v17.4S, v22.s[2] +sub v9.4s, v14.4s, v3.4s +ldr q25, [x17, #+384] +str q9, [x0, #272] +mla v8.4S, v30.4S, v31.s[0] +sqrdmulh v30.4S, 
v2.4S, v22.s[1] +add v14.4s, v14.4s, v3.4s +ldr q3, [x17, #+400] +str q14, [x0, #256] +mul v5.4S, v5.4S,v29.s[2] +mul v15.4S, v15.4S,v29.s[1] +sub v14.4s, v20.4s, v21.4s +ldr q9, [x17, #+416] +str q14, [x0, #368] +mla v5.4S, v27.4S, v31.s[0] +mla v15.4S, v7.4S, v31.s[0] +add v20.4s, v20.4s, v21.4s +ldr q21, [x17, #+432] +str q20, [x0, #352] +mul v17.4S, v17.4S,v28.s[2] +mul v2.4S, v2.4S,v28.s[1] +sub v11.4s, v6.4s, v8.4s +ldr q20, [x0, #688] +str q11, [x0, #336] +mla v17.4S, v10.4S, v31.s[0] +mla v2.4S, v30.4S, v31.s[0] +add v6.4s, v6.4s, v8.4s +ldr q8, [x0, #672] +str q6, [x0, #320] +sqrdmulh v22.4S, v1.4S, v3.s[0] +mul v1.4S, v1.4S,v25.s[0] +sub v6.4s, v4.4s, v5.4s +ldr q28, [x0, #752] +str q6, [x0, #432] +sqrdmulh v6.4S, v16.4S, v3.s[0] +mul v16.4S, v16.4S,v25.s[0] +add v4.4s, v4.4s, v5.4s +ldr q5, [x0, #736] +str q4, [x0, #416] +sqrdmulh v4.4S, v13.4S, v21.s[0] +mul v13.4S, v13.4S,v9.s[0] +sub v30.4s, v24.4s, v15.4s +ldr q10, [x17, #+448] +str q30, [x0, #400] +sqrdmulh v30.4S, v19.4S, v21.s[0] +mul v19.4S, v19.4S,v9.s[0] +add v24.4s, v24.4s, v15.4s +ldr q15, [x17, #+464] +str q24, [x0, #384] +mla v1.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v20.4S, v15.s[0] +sub v24.4s, v18.4s, v17.4s +ldr q11, [x17, #+480] +str q24, [x0, #496] +mla v16.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v8.4S, v15.s[0] +add v18.4s, v18.4s, v17.4s +ldr q17, [x17, #+496] +str q18, [x0, #480] +mla v13.4S, v4.4S, v31.s[0] +sqrdmulh v4.4S, v28.4S, v17.s[0] +sub v18.4s, v26.4s, v2.4s +ldr q24, [x0, #528] +str q18, [x0, #464] +mla v19.4S, v30.4S, v31.s[0] +sqrdmulh v30.4S, v5.4S, v17.s[0] +add v26.4s, v26.4s, v2.4s +ldr q2, [x0, #512] +str q26, [x0, #448] +mul v20.4S, v20.4S,v10.s[0] +mul v8.4S, v8.4S,v10.s[0] +sub v26.4s, v24.4s, v1.4s +ldr q18, [x0, #592] +add v24.4s, v24.4s, v1.4s +mla v20.4S, v22.4S, v31.s[0] +mla v8.4S, v6.4S, v31.s[0] +sub v6.4s, v2.4s, v16.4s +ldr q22, [x0, #576] +add v2.4s, v2.4s, v16.4s +mul v28.4S, v28.4S,v11.s[0] +mul v5.4S, v5.4S,v11.s[0] +sub v16.4s, v18.4s, v13.4s +ldr 
q1, [x0, #656] +add v18.4s, v18.4s, v13.4s +mla v28.4S, v4.4S, v31.s[0] +mla v5.4S, v30.4S, v31.s[0] +sub v30.4s, v22.4s, v19.4s +ldr q4, [x0, #640] +add v22.4s, v22.4s, v19.4s +sqrdmulh v19.4S, v26.4S, v3.s[2] +mul v26.4S, v26.4S,v25.s[2] +sub v13.4s, v1.4s, v20.4s +ldr q29, [x0, #720] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v24.4S, v3.s[1] +mul v24.4S, v24.4S,v25.s[1] +sub v7.4s, v4.4s, v8.4s +ldr q27, [x0, #704] +add v4.4s, v4.4s, v8.4s +sqrdmulh v3.4S, v16.4S, v21.s[2] +mul v16.4S, v16.4S,v9.s[2] +sub v8.4s, v29.4s, v28.4s +ldr q25, [x0, #816] +add v29.4s, v29.4s, v28.4s +sqrdmulh v28.4S, v18.4S, v21.s[1] +mul v18.4S, v18.4S,v9.s[1] +sub v14.4s, v27.4s, v5.4s +ldr q23, [x0, #800] +add v27.4s, v27.4s, v5.4s +mla v26.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v13.4S, v15.s[2] +sub v21.4s, v6.4s, v26.4s +ldr q5, [x0, #880] +str q21, [x0, #560] +mla v24.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v1.4S, v15.s[1] +add v6.4s, v6.4s, v26.4s +ldr q26, [x0, #864] +str q6, [x0, #544] +mla v16.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v8.4S, v17.s[2] +sub v6.4s, v2.4s, v24.4s +ldr q21, [x17, #+512] +str q6, [x0, #528] +mla v18.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v29.4S, v17.s[1] +add v2.4s, v2.4s, v24.4s +ldr q24, [x17, #+528] +str q2, [x0, #512] +mul v13.4S, v13.4S,v10.s[2] +mul v1.4S, v1.4S,v10.s[1] +sub v2.4s, v30.4s, v16.4s +ldr q6, [x17, #+544] +str q2, [x0, #624] +mla v13.4S, v19.4S, v31.s[0] +mla v1.4S, v20.4S, v31.s[0] +add v30.4s, v30.4s, v16.4s +ldr q16, [x17, #+560] +str q30, [x0, #608] +mul v8.4S, v8.4S,v11.s[2] +mul v29.4S, v29.4S,v11.s[1] +sub v15.4s, v22.4s, v18.4s +ldr q30, [x0, #944] +str q15, [x0, #592] +mla v8.4S, v3.4S, v31.s[0] +mla v29.4S, v28.4S, v31.s[0] +add v22.4s, v22.4s, v18.4s +ldr q18, [x0, #928] +str q22, [x0, #576] +sqrdmulh v17.4S, v25.4S, v24.s[0] +mul v25.4S, v25.4S,v21.s[0] +sub v22.4s, v7.4s, v13.4s +ldr q11, [x0, #1008] +str q22, [x0, #688] +sqrdmulh v22.4S, v23.4S, v24.s[0] +mul v23.4S, v23.4S,v21.s[0] +add v7.4s, v7.4s, v13.4s +ldr q13, 
[x0, #992] +str q7, [x0, #672] +sqrdmulh v7.4S, v5.4S, v16.s[0] +mul v5.4S, v5.4S,v6.s[0] +sub v28.4s, v4.4s, v1.4s +ldr q3, [x17, #+576] +str q28, [x0, #656] +sqrdmulh v28.4S, v26.4S, v16.s[0] +mul v26.4S, v26.4S,v6.s[0] +add v4.4s, v4.4s, v1.4s +ldr q1, [x17, #+592] +str q4, [x0, #640] +mla v25.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v30.4S, v1.s[0] +sub v4.4s, v14.4s, v8.4s +ldr q15, [x17, #+608] +str q4, [x0, #752] +mla v23.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v18.4S, v1.s[0] +add v14.4s, v14.4s, v8.4s +ldr q8, [x17, #+624] +str q14, [x0, #736] +mla v5.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v11.4S, v8.s[0] +sub v14.4s, v27.4s, v29.4s +ldr q4, [x0, #784] +str q14, [x0, #720] +mla v26.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v13.4S, v8.s[0] +add v27.4s, v27.4s, v29.4s +ldr q29, [x0, #768] +str q27, [x0, #704] +mul v30.4S, v30.4S,v3.s[0] +mul v18.4S, v18.4S,v3.s[0] +sub v27.4s, v4.4s, v25.4s +ldr q14, [x0, #848] +add v4.4s, v4.4s, v25.4s +mla v30.4S, v17.4S, v31.s[0] +mla v18.4S, v22.4S, v31.s[0] +sub v22.4s, v29.4s, v23.4s +ldr q17, [x0, #832] +add v29.4s, v29.4s, v23.4s +mul v11.4S, v11.4S,v15.s[0] +mul v13.4S, v13.4S,v15.s[0] +sub v23.4s, v14.4s, v5.4s +ldr q25, [x0, #912] +add v14.4s, v14.4s, v5.4s +mla v11.4S, v7.4S, v31.s[0] +mla v13.4S, v28.4S, v31.s[0] +sub v28.4s, v17.4s, v26.4s +ldr q7, [x0, #896] +add v17.4s, v17.4s, v26.4s +sqrdmulh v26.4S, v27.4S, v24.s[2] +mul v27.4S, v27.4S,v21.s[2] +sub v5.4s, v25.4s, v30.4s +ldr q10, [x0, #976] +add v25.4s, v25.4s, v30.4s +sqrdmulh v30.4S, v4.4S, v24.s[1] +mul v4.4S, v4.4S,v21.s[1] +sub v20.4s, v7.4s, v18.4s +ldr q19, [x0, #960] +add v7.4s, v7.4s, v18.4s +sqrdmulh v24.4S, v23.4S, v16.s[2] +mul v23.4S, v23.4S,v6.s[2] +sub v18.4s, v10.4s, v11.4s +add v10.4s, v10.4s, v11.4s +sqrdmulh v11.4S, v14.4S, v16.s[1] +mul v14.4S, v14.4S,v6.s[1] +sub v21.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +mla v27.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v5.4S, v1.s[2] +sub v16.4s, v22.4s, v27.4s +str q16, [x0, #816] +mla v4.4S, v30.4S, 
v31.s[0] +sqrdmulh v30.4S, v25.4S, v1.s[1] +add v22.4s, v22.4s, v27.4s +str q22, [x0, #800] +mla v23.4S, v24.4S, v31.s[0] +sqrdmulh v24.4S, v18.4S, v8.s[2] +sub v22.4s, v29.4s, v4.4s +str q22, [x0, #784] +mla v14.4S, v11.4S, v31.s[0] +sqrdmulh v11.4S, v10.4S, v8.s[1] +add v29.4s, v29.4s, v4.4s +str q29, [x0, #768] +mul v5.4S, v5.4S,v3.s[2] +mul v25.4S, v25.4S,v3.s[1] +sub v29.4s, v28.4s, v23.4s +str q29, [x0, #880] +mla v5.4S, v26.4S, v31.s[0] +mla v25.4S, v30.4S, v31.s[0] +add v28.4s, v28.4s, v23.4s +str q28, [x0, #864] +mul v18.4S, v18.4S,v15.s[2] +mul v10.4S, v10.4S,v15.s[1] +sub v1.4s, v17.4s, v14.4s +str q1, [x0, #848] +mla v18.4S, v24.4S, v31.s[0] +mla v10.4S, v11.4S, v31.s[0] +add v17.4s, v17.4s, v14.4s +str q17, [x0, #832] +sub v8.4s, v20.4s, v5.4s +str q8, [x0, #944] +add v20.4s, v20.4s, v5.4s +str q20, [x0, #928] +sub v20.4s, v7.4s, v25.4s +str q20, [x0, #912] +add v7.4s, v7.4s, v25.4s +str q7, [x0, #896] +sub v7.4s, v21.4s, v18.4s +str q7, [x0, #1008] +add v21.4s, v21.4s, v18.4s +str q21, [x0, #992] +sub v21.4s, v19.4s, v10.4s +str q21, [x0, #976] +add v19.4s, v19.4s, v10.4s +str q19, [x0, #960] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1520 +// Instruction count: 1516 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_15.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_15.s new file mode 100644 index 0000000..afe097b --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_15.s @@ -0,0 +1,1550 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is 
hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: // Modulus constant, stored negated (-33556993). The routine below loads this quadword into q31 and the visible code multiplies by lane v31.s[0] in its mla steps; the three zero words fill the rest of the 128-bit load. +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 // NOTE(review): on AArch64 GNU as this should mean 2^6 = 64-byte alignment; confirm against the toolchain in use. +roots_merged: // Root constants, addressed through x17 in 16-byte quadwords. Quadwords come in pairs carrying the same Layer/block tags: in the visible code the first of a pair supplies the mul lane operand and the second the sqrdmulh lane operand (offsets 0/16 -> q30/q29, 32/48 -> q23/q22, 64/80 -> q2/q7, 96/112 -> q1/q9). Words tagged Layer None are zeros that keep every group four words wide. +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global
ntt_u32_incomplete_neon_asm_var_4_2_22_z4_15 +.global _ntt_u32_incomplete_neon_asm_var_4_2_22_z4_15 +ntt_u32_incomplete_neon_asm_var_4_2_22_z4_15: +_ntt_u32_incomplete_neon_asm_var_4_2_22_z4_15: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x0, #992] +sqrdmulh v27.4S, v28.4S, v29.s[0] +mul v28.4S, v28.4S,v30.s[0] +ldr q26, [x0, #928] +sqrdmulh v25.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +ldr q24, [x0, #864] +sqrdmulh v23.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +ldr q22, [x0, #800] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #736] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mla v28.4S, v27.4S, v31.s[0] +ldr q27, [x0, #672] +sqrdmulh v18.4S, v27.4S, v29.s[0] +mla v26.4S, v25.4S, v31.s[0] +ldr q25, [x0, #608] +sqrdmulh v17.4S, v25.4S, v29.s[0] +mla v24.4S, v23.4S, v31.s[0] +ldr q23, [x0, #544] +sqrdmulh v16.4S, v23.4S, v29.s[0] +mla v22.4S, v21.4S, v31.s[0] +ldr q21, [x0, #480] +mul v27.4S, v27.4S,v30.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q3, [x0, #416] +ldr q2, [x0, #352] +ldr q1, [x0, #288] +mla v27.4S, v18.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +ldr q19, [x0, #224] +ldr q18, [x0, #160] +mul v23.4S, v23.4S,v30.s[0] +mul v25.4S, v25.4S,v30.s[0] +ldr q0, [x0, #96] +ldr q15, [x0, #32] +mla v23.4S, v16.4S, v31.s[0] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +sqrdmulh v28.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v16.4s, v3.4s, v26.4s +add v3.4s, v3.4s, v26.4s +sqrdmulh v26.4S, v16.4S, v29.s[2] +mul v16.4S, 
v16.4S,v30.s[2] +sub v14.4s, v2.4s, v24.4s +add v2.4s, v2.4s, v24.4s +sqrdmulh v24.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v13.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v12.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +sqrdmulh v20.4S, v14.4S, v29.s[2] +mla v17.4S, v28.4S, v31.s[0] +sub v28.4s, v18.4s, v27.4s +add v18.4s, v18.4s, v27.4s +sqrdmulh v27.4S, v13.4S, v29.s[2] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v0.4s, v25.4s +add v0.4s, v0.4s, v25.4s +sqrdmulh v25.4S, v2.4S, v29.s[1] +mla v21.4S, v24.4S, v31.s[0] +sub v24.4s, v15.4s, v23.4s +sqrdmulh v11.4S, v1.4S, v29.s[1] +mla v3.4S, v22.4S, v31.s[0] +add v15.4s, v15.4s, v23.4s +ldr q23, [x17, #+32] +ldr q22, [x17, #+48] +mul v13.4S, v13.4S,v30.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v10.4s, v12.4s, v17.4s +add v12.4s, v12.4s, v17.4s +mla v13.4S, v27.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +sub v20.4s, v28.4s, v16.4s +add v28.4s, v28.4s, v16.4s +mul v1.4S, v1.4S,v30.s[1] +mul v2.4S, v2.4S,v30.s[1] +sub v16.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +mla v1.4S, v11.4S, v31.s[0] +mla v2.4S, v25.4S, v31.s[0] +sub v25.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v22.s[3] +mul v10.4S, v10.4S,v23.s[3] +sub v11.4s, v26.4s, v14.4s +add v26.4s, v26.4s, v14.4s +sqrdmulh v14.4S, v12.4S, v22.s[2] +mul v12.4S, v12.4S,v23.s[2] +sub v21.4s, v24.4s, v13.4s +add v24.4s, v24.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v22.s[1] +mul v16.4S, v16.4S,v23.s[1] +sub v27.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v17.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s +ldr q1, [x17, #+96] +ldr q9, [x17, #+112] +sqrdmulh v8.4S, v20.4S, v22.s[3] +mla v10.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v28.4S, v22.s[2] +mla v12.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v25.4S, v22.s[1] +mla v16.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v18.4S, v22.s[0] +mla 
v19.4S, v2.4S, v31.s[0] +nop +nop +ldr q2, [x17, #+64] +ldr q7, [x17, #+80] +mul v28.4S, v28.4S,v23.s[2] +mul v20.4S, v20.4S,v23.s[3] +sub v6.4s, v11.4s, v10.4s +add v11.4s, v11.4s, v10.4s +mla v28.4S, v3.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v26.4s, v12.4s +add v26.4s, v26.4s, v12.4s +mul v18.4S, v18.4S,v23.s[0] +mul v25.4S, v25.4S,v23.s[1] +sub v12.4s, v27.4s, v16.4s +add v27.4s, v27.4s, v16.4s +mla v18.4S, v13.4S, v31.s[0] +mla v25.4S, v14.4S, v31.s[0] +sub v14.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v9.s[3] +mul v6.4S, v6.4S,v1.s[3] +sub v13.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v11.4S, v9.s[2] +mul v11.4S, v11.4S,v1.s[2] +sub v16.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +sqrdmulh v28.4S, v8.4S, v9.s[1] +mul v8.4S, v8.4S,v1.s[1] +sub v3.4s, v17.4s, v25.4s +add v17.4s, v17.4s, v25.4s +sqrdmulh v25.4S, v26.4S, v9.s[0] +mul v26.4S, v26.4S,v1.s[0] +sub v10.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v7.s[3] +mla v6.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v27.4S, v7.s[2] +mla v11.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v14.4S, v7.s[1] +mla v8.4S, v28.4S, v31.s[0] +nop +nop +sqrdmulh v28.4S, v0.4S, v7.s[0] +mla v26.4S, v25.4S, v31.s[0] +nop +nop +mul v27.4S, v27.4S,v2.s[2] +mul v12.4S, v12.4S,v2.s[3] +sub v25.4s, v13.4s, v6.4s +str q25, [x0, #992] +mla v27.4S, v19.4S, v31.s[0] +mla v12.4S, v18.4S, v31.s[0] +add v13.4s, v13.4s, v6.4s +str q13, [x0, #928] +mul v0.4S, v0.4S,v2.s[0] +mul v14.4S, v14.4S,v2.s[1] +sub v13.4s, v21.4s, v11.4s +str q13, [x0, #864] +mla v0.4S, v28.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sub v11.4s, v16.4s, v8.4s +ldr q20, [x0, #1008] +sqrdmulh v28.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v8.4s +str q21, [x0, #800] +ldr q21, [x0, #944] +sqrdmulh v8.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v13.4s, v24.4s, v26.4s +str q11, [x0, #736] +ldr q11, [x0, 
#880] +sqrdmulh v6.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +add v24.4s, v24.4s, v26.4s +str q16, [x0, #672] +ldr q16, [x0, #816] +sqrdmulh v26.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v18.4s, v3.4s, v12.4s +str q13, [x0, #608] +ldr q13, [x0, #752] +sqrdmulh v19.4S, v13.4S, v29.s[0] +mla v20.4S, v28.4S, v31.s[0] +add v3.4s, v3.4s, v12.4s +str q24, [x0, #544] +ldr q24, [x0, #688] +sqrdmulh v12.4S, v24.4S, v29.s[0] +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v17.4s, v27.4s +str q18, [x0, #480] +ldr q18, [x0, #624] +sqrdmulh v28.4S, v18.4S, v29.s[0] +mla v11.4S, v6.4S, v31.s[0] +add v17.4s, v17.4s, v27.4s +str q3, [x0, #416] +ldr q3, [x0, #560] +sqrdmulh v27.4S, v3.4S, v29.s[0] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v10.4s, v14.4s +str q8, [x0, #352] +ldr q8, [x0, #496] +add v10.4s, v10.4s, v14.4s +mul v24.4S, v24.4S,v30.s[0] +mul v13.4S, v13.4S,v30.s[0] +ldr q14, [x0, #432] +str q17, [x0, #288] +ldr q17, [x0, #368] +ldr q6, [x0, #304] +mla v24.4S, v12.4S, v31.s[0] +mla v13.4S, v19.4S, v31.s[0] +str q26, [x0, #224] +sub v26.4s, v15.4s, v0.4s +ldr q19, [x0, #240] +ldr q12, [x0, #176] +mul v3.4S, v3.4S,v30.s[0] +mul v18.4S, v18.4S,v30.s[0] +str q10, [x0, #160] +add v15.4s, v15.4s, v0.4s +ldr q0, [x0, #112] +ldr q10, [x0, #48] +mla v3.4S, v27.4S, v31.s[0] +mla v18.4S, v28.4S, v31.s[0] +sub v28.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +sqrdmulh v20.4S, v28.4S, v29.s[2] +mul v28.4S, v28.4S,v30.s[2] +sub v27.4s, v14.4s, v21.4s +add v14.4s, v14.4s, v21.4s +sqrdmulh v21.4S, v27.4S, v29.s[2] +mul v27.4S, v27.4S,v30.s[2] +sub v25.4s, v17.4s, v11.4s +add v17.4s, v17.4s, v11.4s +sqrdmulh v11.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v5.4s, v6.4s, v16.4s +add v6.4s, v6.4s, v16.4s +sqrdmulh v16.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +sub v4.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +sqrdmulh v13.4S, v25.4S, v29.s[2] +mla v28.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v24.4s +add v12.4s, v12.4s, v24.4s +sqrdmulh v24.4S, v5.4S, 
v29.s[2] +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v18.4s +add v0.4s, v0.4s, v18.4s +sqrdmulh v18.4S, v17.4S, v29.s[1] +mla v8.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v3.4s +str q26, [x0, #96] +sqrdmulh v26.4S, v6.4S, v29.s[1] +mla v14.4S, v16.4S, v31.s[0] +add v10.4s, v10.4s, v3.4s +str q15, [x0, #32] +mul v5.4S, v5.4S,v30.s[2] +mul v25.4S, v25.4S,v30.s[2] +sub v15.4s, v4.4s, v28.4s +add v4.4s, v4.4s, v28.4s +mla v5.4S, v24.4S, v31.s[0] +mla v25.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v27.4s +add v20.4s, v20.4s, v27.4s +mul v6.4S, v6.4S,v30.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v27.4s, v19.4s, v8.4s +add v19.4s, v19.4s, v8.4s +mla v6.4S, v26.4S, v31.s[0] +mla v17.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v14.4s +add v12.4s, v12.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v22.s[3] +mul v15.4S, v15.4S,v23.s[3] +sub v26.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v4.4S, v22.s[2] +mul v4.4S, v4.4S,v23.s[2] +sub v8.4s, v11.4s, v5.4s +add v11.4s, v11.4s, v5.4s +sqrdmulh v5.4S, v27.4S, v22.s[1] +mul v27.4S, v27.4S,v23.s[1] +sub v24.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v28.4s, v10.4s, v6.4s +add v10.4s, v10.4s, v6.4s +sqrdmulh v6.4S, v13.4S, v22.s[3] +mla v15.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v20.4S, v22.s[2] +mla v4.4S, v25.4S, v31.s[0] +nop +nop +sqrdmulh v25.4S, v18.4S, v22.s[1] +mla v27.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v12.4S, v22.s[0] +mla v19.4S, v17.4S, v31.s[0] +nop +nop +mul v20.4S, v20.4S,v23.s[2] +mul v13.4S, v13.4S,v23.s[3] +sub v17.4s, v26.4s, v15.4s +add v26.4s, v26.4s, v15.4s +mla v20.4S, v14.4S, v31.s[0] +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v21.4s, v4.4s +add v21.4s, v21.4s, v4.4s +mul v12.4S, v12.4S,v23.s[0] +mul v18.4S, v18.4S,v23.s[1] +sub v4.4s, v24.4s, v27.4s +add v24.4s, v24.4s, v27.4s +mla v12.4S, v5.4S, v31.s[0] +mla v18.4S, v25.4S, v31.s[0] +sub v25.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, 
v17.4S, v9.s[3] +mul v17.4S, v17.4S,v1.s[3] +sub v5.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +sqrdmulh v13.4S, v26.4S, v9.s[2] +mul v26.4S, v26.4S,v1.s[2] +sub v27.4s, v11.4s, v20.4s +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v6.4S, v9.s[1] +mul v6.4S, v6.4S,v1.s[1] +sub v14.4s, v28.4s, v18.4s +add v28.4s, v28.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v9.s[0] +mul v21.4S, v21.4S,v1.s[0] +sub v15.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v4.4S, v7.s[3] +mla v17.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v24.4S, v7.s[2] +mla v26.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v25.4S, v7.s[1] +mla v6.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v0.4S, v7.s[0] +mla v21.4S, v18.4S, v31.s[0] +nop +nop +mul v24.4S, v24.4S,v2.s[2] +mul v4.4S, v4.4S,v2.s[3] +sub v18.4s, v5.4s, v17.4s +str q18, [x0, #1008] +mla v24.4S, v19.4S, v31.s[0] +mla v4.4S, v12.4S, v31.s[0] +add v5.4s, v5.4s, v17.4s +str q5, [x0, #944] +mul v0.4S, v0.4S,v2.s[0] +mul v25.4S, v25.4S,v2.s[1] +sub v5.4s, v8.4s, v26.4s +str q5, [x0, #880] +mla v0.4S, v20.4S, v31.s[0] +mla v25.4S, v13.4S, v31.s[0] +add v8.4s, v8.4s, v26.4s +sub v26.4s, v27.4s, v6.4s +ldr q13, [x0, #960] +sqrdmulh v20.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +add v27.4s, v27.4s, v6.4s +str q8, [x0, #816] +ldr q8, [x0, #896] +sqrdmulh v6.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v5.4s, v11.4s, v21.4s +str q26, [x0, #752] +ldr q26, [x0, #832] +sqrdmulh v17.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +add v11.4s, v11.4s, v21.4s +str q27, [x0, #688] +ldr q27, [x0, #768] +sqrdmulh v21.4S, v27.4S, v29.s[0] +mul v27.4S, v27.4S,v30.s[0] +sub v12.4s, v14.4s, v4.4s +str q5, [x0, #624] +ldr q5, [x0, #704] +sqrdmulh v19.4S, v5.4S, v29.s[0] +mla v13.4S, v20.4S, v31.s[0] +add v14.4s, v14.4s, v4.4s +str q11, [x0, #560] +ldr q11, [x0, #640] +sqrdmulh v4.4S, v11.4S, v29.s[0] +mla v8.4S, v6.4S, v31.s[0] +sub v6.4s, v28.4s, v24.4s +str q12, [x0, #496] +ldr q12, [x0, #576] +sqrdmulh v20.4S, v12.4S, 
v29.s[0] +mla v26.4S, v17.4S, v31.s[0] +add v28.4s, v28.4s, v24.4s +str q14, [x0, #432] +ldr q14, [x0, #512] +sqrdmulh v24.4S, v14.4S, v29.s[0] +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v25.4s +str q6, [x0, #368] +ldr q6, [x0, #448] +add v15.4s, v15.4s, v25.4s +mul v11.4S, v11.4S,v30.s[0] +mul v5.4S, v5.4S,v30.s[0] +ldr q25, [x0, #384] +str q28, [x0, #304] +ldr q28, [x0, #320] +ldr q17, [x0, #256] +mla v11.4S, v4.4S, v31.s[0] +mla v5.4S, v19.4S, v31.s[0] +str q21, [x0, #240] +sub v21.4s, v10.4s, v0.4s +ldr q19, [x0, #192] +ldr q4, [x0, #128] +mul v14.4S, v14.4S,v30.s[0] +mul v12.4S, v12.4S,v30.s[0] +str q15, [x0, #176] +add v10.4s, v10.4s, v0.4s +ldr q0, [x0, #64] +ldr q15, [x0, #0] +mla v14.4S, v24.4S, v31.s[0] +mla v12.4S, v20.4S, v31.s[0] +sub v20.4s, v6.4s, v13.4s +add v6.4s, v6.4s, v13.4s +sqrdmulh v13.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v24.4s, v25.4s, v8.4s +add v25.4s, v25.4s, v8.4s +sqrdmulh v8.4S, v24.4S, v29.s[2] +mul v24.4S, v24.4S,v30.s[2] +sub v18.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sqrdmulh v26.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +sub v3.4s, v17.4s, v27.4s +add v17.4s, v17.4s, v27.4s +sqrdmulh v27.4S, v25.4S, v29.s[1] +mul v25.4S, v25.4S,v30.s[1] +sub v16.4s, v19.4s, v5.4s +add v19.4s, v19.4s, v5.4s +sqrdmulh v5.4S, v18.4S, v29.s[2] +mla v20.4S, v13.4S, v31.s[0] +sub v13.4s, v4.4s, v11.4s +add v4.4s, v4.4s, v11.4s +sqrdmulh v11.4S, v3.4S, v29.s[2] +mla v24.4S, v8.4S, v31.s[0] +sub v8.4s, v0.4s, v12.4s +add v0.4s, v0.4s, v12.4s +sqrdmulh v12.4S, v28.4S, v29.s[1] +mla v6.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v14.4s +str q21, [x0, #112] +sqrdmulh v21.4S, v17.4S, v29.s[1] +mla v25.4S, v27.4S, v31.s[0] +add v15.4s, v15.4s, v14.4s +str q10, [x0, #48] +mul v3.4S, v3.4S,v30.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v10.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +mla v3.4S, v11.4S, v31.s[0] +mla v18.4S, v5.4S, v31.s[0] +sub v5.4s, v13.4s, v24.4s +add v13.4s, v13.4s, v24.4s +mul v17.4S, v17.4S,v30.s[1] 
+mul v28.4S, v28.4S,v30.s[1] +sub v24.4s, v19.4s, v6.4s +add v19.4s, v19.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v28.4S, v12.4S, v31.s[0] +sub v12.4s, v4.4s, v25.4s +add v4.4s, v4.4s, v25.4s +sqrdmulh v25.4S, v10.4S, v22.s[3] +mul v10.4S, v10.4S,v23.s[3] +sub v21.4s, v8.4s, v18.4s +add v8.4s, v8.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v22.s[2] +mul v16.4S, v16.4S,v23.s[2] +sub v6.4s, v26.4s, v3.4s +add v26.4s, v26.4s, v3.4s +sqrdmulh v3.4S, v24.4S, v22.s[1] +mul v24.4S, v24.4S,v23.s[1] +sub v11.4s, v0.4s, v28.4s +add v0.4s, v0.4s, v28.4s +sqrdmulh v28.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v20.4s, v15.4s, v17.4s +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v22.s[3] +mla v10.4S, v25.4S, v31.s[0] +nop +nop +sqrdmulh v25.4S, v13.4S, v22.s[2] +mla v16.4S, v18.4S, v31.s[0] +nop +nop +sqrdmulh v18.4S, v12.4S, v22.s[1] +mla v24.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v4.4S, v22.s[0] +mla v19.4S, v28.4S, v31.s[0] +nop +nop +mul v13.4S, v13.4S,v23.s[2] +mul v5.4S, v5.4S,v23.s[3] +sub v28.4s, v21.4s, v10.4s +add v21.4s, v21.4s, v10.4s +mla v13.4S, v25.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +sub v17.4s, v8.4s, v16.4s +add v8.4s, v8.4s, v16.4s +mul v4.4S, v4.4S,v23.s[0] +mul v12.4S, v12.4S,v23.s[1] +sub v16.4s, v11.4s, v24.4s +add v11.4s, v11.4s, v24.4s +mla v4.4S, v3.4S, v31.s[0] +mla v12.4S, v18.4S, v31.s[0] +sub v18.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v28.4S, v9.s[3] +mul v28.4S, v28.4S,v1.s[3] +sub v3.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v21.4S, v9.s[2] +mul v21.4S, v21.4S,v1.s[2] +sub v24.4s, v26.4s, v13.4s +add v26.4s, v26.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v9.s[1] +mul v17.4S, v17.4S,v1.s[1] +sub v25.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +sqrdmulh v12.4S, v8.4S, v9.s[0] +mul v8.4S, v8.4S,v1.s[0] +sub v10.4s, v15.4s, v4.4s +add v15.4s, v15.4s, v4.4s +sqrdmulh v4.4S, v16.4S, v7.s[3] +mla v28.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v11.4S, v7.s[2] +mla v21.4S, v5.4S, 
v31.s[0] +nop +nop +sqrdmulh v5.4S, v18.4S, v7.s[1] +mla v17.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v0.4S, v7.s[0] +mla v8.4S, v12.4S, v31.s[0] +nop +nop +mul v11.4S, v11.4S,v2.s[2] +mul v16.4S, v16.4S,v2.s[3] +sub v12.4s, v3.4s, v28.4s +str q12, [x0, #960] +mla v11.4S, v19.4S, v31.s[0] +mla v16.4S, v4.4S, v31.s[0] +add v3.4s, v3.4s, v28.4s +str q3, [x0, #896] +mul v0.4S, v0.4S,v2.s[0] +mul v18.4S, v18.4S,v2.s[1] +sub v3.4s, v6.4s, v21.4s +str q3, [x0, #832] +mla v0.4S, v13.4S, v31.s[0] +mla v18.4S, v5.4S, v31.s[0] +add v6.4s, v6.4s, v21.4s +sub v21.4s, v24.4s, v17.4s +ldr q5, [x0, #976] +sqrdmulh v13.4S, v5.4S, v29.s[0] +mul v5.4S, v5.4S,v30.s[0] +add v24.4s, v24.4s, v17.4s +str q6, [x0, #768] +ldr q6, [x0, #912] +sqrdmulh v17.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v3.4s, v26.4s, v8.4s +str q21, [x0, #704] +ldr q21, [x0, #848] +sqrdmulh v28.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +add v26.4s, v26.4s, v8.4s +str q24, [x0, #640] +ldr q24, [x0, #784] +sqrdmulh v8.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +sub v4.4s, v25.4s, v16.4s +str q3, [x0, #576] +ldr q3, [x0, #720] +sqrdmulh v19.4S, v3.4S, v29.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v25.4s, v25.4s, v16.4s +str q26, [x0, #512] +ldr q26, [x0, #656] +sqrdmulh v16.4S, v26.4S, v29.s[0] +mla v6.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v11.4s +str q4, [x0, #448] +ldr q4, [x0, #592] +sqrdmulh v13.4S, v4.4S, v29.s[0] +mla v21.4S, v28.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q25, [x0, #384] +ldr q25, [x0, #528] +sqrdmulh v11.4S, v25.4S, v29.s[0] +mla v24.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v18.4s +str q17, [x0, #320] +ldr q17, [x0, #464] +add v10.4s, v10.4s, v18.4s +mul v26.4S, v26.4S,v30.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q18, [x0, #400] +str q20, [x0, #256] +ldr q20, [x0, #336] +ldr q28, [x0, #272] +mla v26.4S, v16.4S, v31.s[0] +mla v3.4S, v19.4S, v31.s[0] +str q8, [x0, #192] +sub v8.4s, v15.4s, v0.4s +ldr q19, [x0, #208] +ldr q16, [x0, #144] +mul v25.4S, 
v25.4S,v30.s[0] +mul v4.4S, v4.4S,v30.s[0] +str q10, [x0, #128] +add v15.4s, v15.4s, v0.4s +ldr q0, [x0, #80] +ldr q10, [x0, #16] +mla v25.4S, v11.4S, v31.s[0] +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v5.4s +add v17.4s, v17.4s, v5.4s +sqrdmulh v5.4S, v13.4S, v29.s[2] +mul v13.4S, v13.4S,v30.s[2] +sub v11.4s, v18.4s, v6.4s +add v18.4s, v18.4s, v6.4s +sqrdmulh v6.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v12.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v28.4s, v24.4s +add v28.4s, v28.4s, v24.4s +sqrdmulh v24.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v27.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sqrdmulh v3.4S, v12.4S, v29.s[2] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v16.4s, v26.4s +add v16.4s, v16.4s, v26.4s +sqrdmulh v26.4S, v14.4S, v29.s[2] +mla v11.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v4.4s +add v0.4s, v0.4s, v4.4s +sqrdmulh v4.4S, v20.4S, v29.s[1] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v25.4s +str q8, [x0, #64] +sqrdmulh v8.4S, v28.4S, v29.s[1] +mla v18.4S, v24.4S, v31.s[0] +add v10.4s, v10.4s, v25.4s +str q15, [x0, #0] +mul v14.4S, v14.4S,v30.s[2] +mul v12.4S, v12.4S,v30.s[2] +sub v15.4s, v27.4s, v13.4s +add v27.4s, v27.4s, v13.4s +mla v14.4S, v26.4S, v31.s[0] +mla v12.4S, v3.4S, v31.s[0] +sub v3.4s, v5.4s, v11.4s +add v5.4s, v5.4s, v11.4s +mul v28.4S, v28.4S,v30.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v11.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +mla v28.4S, v8.4S, v31.s[0] +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v16.4s, v18.4s +add v16.4s, v16.4s, v18.4s +sqrdmulh v29.4S, v15.4S, v22.s[3] +mul v15.4S, v15.4S,v23.s[3] +sub v30.4s, v6.4s, v12.4s +add v6.4s, v6.4s, v12.4s +sqrdmulh v12.4S, v27.4S, v22.s[2] +mul v27.4S, v27.4S,v23.s[2] +sub v18.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v22.s[1] +mul v11.4S, v11.4S,v23.s[1] +sub v8.4s, v0.4s, v20.4s +add v0.4s, v0.4s, v20.4s +sqrdmulh v20.4S, 
v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v17.4s, v10.4s, v28.4s +add v10.4s, v10.4s, v28.4s +sqrdmulh v28.4S, v3.4S, v22.s[3] +mla v15.4S, v29.4S, v31.s[0] +nop +nop +sqrdmulh v29.4S, v5.4S, v22.s[2] +mla v27.4S, v12.4S, v31.s[0] +nop +nop +sqrdmulh v12.4S, v4.4S, v22.s[1] +mla v11.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v16.4S, v22.s[0] +mla v19.4S, v20.4S, v31.s[0] +nop +nop +mul v5.4S, v5.4S,v23.s[2] +mul v3.4S, v3.4S,v23.s[3] +sub v20.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +mla v5.4S, v29.4S, v31.s[0] +mla v3.4S, v28.4S, v31.s[0] +sub v28.4s, v6.4s, v27.4s +add v6.4s, v6.4s, v27.4s +mul v16.4S, v16.4S,v23.s[0] +mul v4.4S, v4.4S,v23.s[1] +sub v27.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v16.4S, v14.4S, v31.s[0] +mla v4.4S, v12.4S, v31.s[0] +sub v12.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v22.4S, v20.4S, v9.s[3] +mul v20.4S, v20.4S,v1.s[3] +sub v23.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v30.4S, v9.s[2] +mul v30.4S, v30.4S,v1.s[2] +sub v19.4s, v21.4s, v5.4s +add v21.4s, v21.4s, v5.4s +sqrdmulh v5.4S, v28.4S, v9.s[1] +mul v28.4S, v28.4S,v1.s[1] +sub v14.4s, v17.4s, v4.4s +add v17.4s, v17.4s, v4.4s +sqrdmulh v4.4S, v6.4S, v9.s[0] +mul v6.4S, v6.4S,v1.s[0] +sub v11.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sqrdmulh v9.4S, v27.4S, v7.s[3] +mla v20.4S, v22.4S, v31.s[0] +nop +nop +sqrdmulh v22.4S, v8.4S, v7.s[2] +mla v30.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v12.4S, v7.s[1] +mla v28.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v0.4S, v7.s[0] +mla v6.4S, v4.4S, v31.s[0] +nop +nop +mul v8.4S, v8.4S,v2.s[2] +mul v27.4S, v27.4S,v2.s[3] +sub v4.4s, v23.4s, v20.4s +str q4, [x0, #976] +mla v8.4S, v22.4S, v31.s[0] +mla v27.4S, v9.4S, v31.s[0] +add v23.4s, v23.4s, v20.4s +str q23, [x0, #912] +mul v0.4S, v0.4S,v2.s[0] +mul v12.4S, v12.4S,v2.s[1] +sub v23.4s, v18.4s, v30.4s +str q23, [x0, #848] +mla v0.4S, v5.4S, v31.s[0] +mla v12.4S, v3.4S, v31.s[0] +add v18.4s, v18.4s, v30.4s +sub v30.4s, 
v19.4s, v28.4s +add v19.4s, v19.4s, v28.4s +str q18, [x0, #784] +sub v18.4s, v21.4s, v6.4s +str q30, [x0, #720] +add v21.4s, v21.4s, v6.4s +str q19, [x0, #656] +sub v19.4s, v14.4s, v27.4s +str q18, [x0, #592] +add v14.4s, v14.4s, v27.4s +str q21, [x0, #528] +sub v21.4s, v17.4s, v8.4s +str q19, [x0, #464] +add v17.4s, v17.4s, v8.4s +str q14, [x0, #400] +sub v14.4s, v11.4s, v12.4s +str q21, [x0, #336] +add v11.4s, v11.4s, v12.4s +str q17, [x0, #272] +sub v17.4s, v10.4s, v0.4s +add v10.4s, v10.4s, v0.4s +ldr q24, [x0, #48] +ldr q25, [x0, #32] +ldr q13, [x0, #112] +ldr q26, [x0, #96] +ldr q15, [x17, #+128] +ldr q29, [x17, #+144] +ldr q16, [x17, #+160] +ldr q1, [x17, #+176] +ldr q4, [x0, #176] +ldr q22, [x0, #160] +sqrdmulh v9.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v15.s[0] +ldr q20, [x0, #240] +sqrdmulh v23.4S, v25.4S, v29.s[0] +mul v25.4S, v25.4S,v15.s[0] +ldr q5, [x0, #224] +sqrdmulh v3.4S, v13.4S, v1.s[0] +mul v13.4S, v13.4S,v16.s[0] +ldr q2, [x17, #+192] +sqrdmulh v7.4S, v26.4S, v1.s[0] +mul v26.4S, v26.4S,v16.s[0] +ldr q28, [x17, #+208] +mla v24.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v4.4S, v28.s[0] +ldr q30, [x17, #+224] +mla v25.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v22.4S, v28.s[0] +ldr q6, [x17, #+240] +mla v13.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v20.4S, v6.s[0] +mla v26.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v5.4S, v6.s[0] +ldr q18, [x0, #0] +mul v4.4S, v4.4S,v2.s[0] +mul v22.4S, v22.4S,v2.s[0] +mla v4.4S, v9.4S, v31.s[0] +mla v22.4S, v23.4S, v31.s[0] +ldr q23, [x0, #64] +mul v20.4S, v20.4S,v30.s[0] +mul v5.4S, v5.4S,v30.s[0] +sub v9.4s, v10.4s, v24.4s +add v10.4s, v10.4s, v24.4s +mla v20.4S, v3.4S, v31.s[0] +mla v5.4S, v7.4S, v31.s[0] +sub v7.4s, v18.4s, v25.4s +ldr q3, [x0, #128] +add v18.4s, v18.4s, v25.4s +sqrdmulh v25.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v15.s[2] +sub v24.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +sqrdmulh v13.4S, v10.4S, v29.s[1] +mul v10.4S, v10.4S,v15.s[1] +sub v27.4s, v23.4s, v26.4s +ldr q19, [x0, #192] +add v23.4s, v23.4s, 
v26.4s +sqrdmulh v29.4S, v24.4S, v1.s[2] +mul v24.4S, v24.4S,v16.s[2] +sub v26.4s, v11.4s, v4.4s +ldr q15, [x0, #304] +add v11.4s, v11.4s, v4.4s +sqrdmulh v4.4S, v17.4S, v1.s[1] +mul v17.4S, v17.4S,v16.s[1] +sub v8.4s, v3.4s, v22.4s +ldr q21, [x0, #288] +add v3.4s, v3.4s, v22.4s +mla v9.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v26.4S, v28.s[2] +sub v1.4s, v14.4s, v20.4s +ldr q22, [x0, #368] +add v14.4s, v14.4s, v20.4s +mla v10.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v11.4S, v28.s[1] +sub v20.4s, v19.4s, v5.4s +ldr q16, [x0, #352] +add v19.4s, v19.4s, v5.4s +mla v24.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v1.4S, v6.s[2] +sub v5.4s, v7.4s, v9.4s +ldr q12, [x17, #+256] +str q5, [x0, #48] +mla v17.4S, v4.4S, v31.s[0] +sqrdmulh v4.4S, v14.4S, v6.s[1] +add v7.4s, v7.4s, v9.4s +ldr q9, [x17, #+272] +str q7, [x0, #32] +mul v26.4S, v26.4S,v2.s[2] +mul v11.4S, v11.4S,v2.s[1] +sub v7.4s, v18.4s, v10.4s +ldr q5, [x17, #+288] +str q7, [x0, #16] +mla v26.4S, v25.4S, v31.s[0] +mla v11.4S, v13.4S, v31.s[0] +add v18.4s, v18.4s, v10.4s +ldr q10, [x17, #+304] +str q18, [x0, #0] +mul v1.4S, v1.4S,v30.s[2] +mul v14.4S, v14.4S,v30.s[1] +sub v28.4s, v27.4s, v24.4s +ldr q18, [x0, #432] +str q28, [x0, #112] +mla v1.4S, v29.4S, v31.s[0] +mla v14.4S, v4.4S, v31.s[0] +add v27.4s, v27.4s, v24.4s +ldr q24, [x0, #416] +str q27, [x0, #96] +sqrdmulh v6.4S, v15.4S, v9.s[0] +mul v15.4S, v15.4S,v12.s[0] +sub v27.4s, v23.4s, v17.4s +ldr q30, [x0, #496] +str q27, [x0, #80] +sqrdmulh v27.4S, v21.4S, v9.s[0] +mul v21.4S, v21.4S,v12.s[0] +add v23.4s, v23.4s, v17.4s +ldr q17, [x0, #480] +str q23, [x0, #64] +sqrdmulh v23.4S, v22.4S, v10.s[0] +mul v22.4S, v22.4S,v5.s[0] +sub v4.4s, v8.4s, v26.4s +ldr q29, [x17, #+320] +str q4, [x0, #176] +sqrdmulh v4.4S, v16.4S, v10.s[0] +mul v16.4S, v16.4S,v5.s[0] +add v8.4s, v8.4s, v26.4s +ldr q26, [x17, #+336] +str q8, [x0, #160] +mla v15.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v18.4S, v26.s[0] +sub v8.4s, v3.4s, v11.4s +ldr q28, [x17, #+352] +str q8, [x0, #144] +mla v21.4S, 
v27.4S, v31.s[0] +sqrdmulh v27.4S, v24.4S, v26.s[0] +add v3.4s, v3.4s, v11.4s +ldr q11, [x17, #+368] +str q3, [x0, #128] +mla v22.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v30.4S, v11.s[0] +sub v3.4s, v20.4s, v1.4s +ldr q8, [x0, #272] +str q3, [x0, #240] +mla v16.4S, v4.4S, v31.s[0] +sqrdmulh v4.4S, v17.4S, v11.s[0] +add v20.4s, v20.4s, v1.4s +ldr q1, [x0, #256] +str q20, [x0, #224] +mul v18.4S, v18.4S,v29.s[0] +mul v24.4S, v24.4S,v29.s[0] +sub v20.4s, v19.4s, v14.4s +ldr q3, [x0, #336] +str q20, [x0, #208] +mla v18.4S, v6.4S, v31.s[0] +mla v24.4S, v27.4S, v31.s[0] +add v19.4s, v19.4s, v14.4s +ldr q14, [x0, #320] +str q19, [x0, #192] +mul v30.4S, v30.4S,v28.s[0] +mul v17.4S, v17.4S,v28.s[0] +sub v19.4s, v8.4s, v15.4s +ldr q27, [x0, #400] +add v8.4s, v8.4s, v15.4s +mla v30.4S, v23.4S, v31.s[0] +mla v17.4S, v4.4S, v31.s[0] +sub v4.4s, v1.4s, v21.4s +ldr q23, [x0, #384] +add v1.4s, v1.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v9.s[2] +mul v19.4S, v19.4S,v12.s[2] +sub v15.4s, v3.4s, v22.4s +ldr q6, [x0, #464] +add v3.4s, v3.4s, v22.4s +sqrdmulh v22.4S, v8.4S, v9.s[1] +mul v8.4S, v8.4S,v12.s[1] +sub v20.4s, v14.4s, v16.4s +ldr q2, [x0, #448] +add v14.4s, v14.4s, v16.4s +sqrdmulh v9.4S, v15.4S, v10.s[2] +mul v15.4S, v15.4S,v5.s[2] +sub v16.4s, v27.4s, v18.4s +ldr q12, [x0, #560] +add v27.4s, v27.4s, v18.4s +sqrdmulh v18.4S, v3.4S, v10.s[1] +mul v3.4S, v3.4S,v5.s[1] +sub v13.4s, v23.4s, v24.4s +ldr q25, [x0, #544] +add v23.4s, v23.4s, v24.4s +mla v19.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v16.4S, v26.s[2] +sub v10.4s, v6.4s, v30.4s +ldr q24, [x0, #624] +add v6.4s, v6.4s, v30.4s +mla v8.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v27.4S, v26.s[1] +sub v30.4s, v2.4s, v17.4s +ldr q5, [x0, #608] +add v2.4s, v2.4s, v17.4s +mla v15.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v10.4S, v11.s[2] +sub v17.4s, v4.4s, v19.4s +ldr q7, [x17, #+384] +str q17, [x0, #304] +mla v3.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v6.4S, v11.s[1] +add v4.4s, v4.4s, v19.4s +ldr q19, [x17, #+400] +str q4, [x0, #288] +mul 
v16.4S, v16.4S,v29.s[2] +mul v27.4S, v27.4S,v29.s[1] +sub v4.4s, v1.4s, v8.4s +ldr q17, [x17, #+416] +str q4, [x0, #272] +mla v16.4S, v21.4S, v31.s[0] +mla v27.4S, v22.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +ldr q8, [x17, #+432] +str q1, [x0, #256] +mul v10.4S, v10.4S,v28.s[2] +mul v6.4S, v6.4S,v28.s[1] +sub v26.4s, v20.4s, v15.4s +ldr q1, [x0, #688] +str q26, [x0, #368] +mla v10.4S, v9.4S, v31.s[0] +mla v6.4S, v18.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +ldr q15, [x0, #672] +str q20, [x0, #352] +sqrdmulh v11.4S, v12.4S, v19.s[0] +mul v12.4S, v12.4S,v7.s[0] +sub v20.4s, v14.4s, v3.4s +ldr q28, [x0, #752] +str q20, [x0, #336] +sqrdmulh v20.4S, v25.4S, v19.s[0] +mul v25.4S, v25.4S,v7.s[0] +add v14.4s, v14.4s, v3.4s +ldr q3, [x0, #736] +str q14, [x0, #320] +sqrdmulh v14.4S, v24.4S, v8.s[0] +mul v24.4S, v24.4S,v17.s[0] +sub v18.4s, v13.4s, v16.4s +ldr q9, [x17, #+448] +str q18, [x0, #432] +sqrdmulh v18.4S, v5.4S, v8.s[0] +mul v5.4S, v5.4S,v17.s[0] +add v13.4s, v13.4s, v16.4s +ldr q16, [x17, #+464] +str q13, [x0, #416] +mla v12.4S, v11.4S, v31.s[0] +sqrdmulh v11.4S, v1.4S, v16.s[0] +sub v13.4s, v23.4s, v27.4s +ldr q26, [x17, #+480] +str q13, [x0, #400] +mla v25.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v15.4S, v16.s[0] +add v23.4s, v23.4s, v27.4s +ldr q27, [x17, #+496] +str q23, [x0, #384] +mla v24.4S, v14.4S, v31.s[0] +sqrdmulh v14.4S, v28.4S, v27.s[0] +sub v23.4s, v30.4s, v10.4s +ldr q13, [x0, #528] +str q23, [x0, #496] +mla v5.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v3.4S, v27.s[0] +add v30.4s, v30.4s, v10.4s +ldr q10, [x0, #512] +str q30, [x0, #480] +mul v1.4S, v1.4S,v9.s[0] +mul v15.4S, v15.4S,v9.s[0] +sub v30.4s, v2.4s, v6.4s +ldr q23, [x0, #592] +str q30, [x0, #464] +mla v1.4S, v11.4S, v31.s[0] +mla v15.4S, v20.4S, v31.s[0] +add v2.4s, v2.4s, v6.4s +ldr q6, [x0, #576] +str q2, [x0, #448] +mul v28.4S, v28.4S,v26.s[0] +mul v3.4S, v3.4S,v26.s[0] +sub v2.4s, v13.4s, v12.4s +ldr q20, [x0, #656] +add v13.4s, v13.4s, v12.4s +mla v28.4S, v14.4S, v31.s[0] +mla v3.4S, v18.4S, 
v31.s[0] +sub v18.4s, v10.4s, v25.4s +ldr q14, [x0, #640] +add v10.4s, v10.4s, v25.4s +sqrdmulh v25.4S, v2.4S, v19.s[2] +mul v2.4S, v2.4S,v7.s[2] +sub v12.4s, v23.4s, v24.4s +ldr q11, [x0, #720] +add v23.4s, v23.4s, v24.4s +sqrdmulh v24.4S, v13.4S, v19.s[1] +mul v13.4S, v13.4S,v7.s[1] +sub v30.4s, v6.4s, v5.4s +ldr q29, [x0, #704] +add v6.4s, v6.4s, v5.4s +sqrdmulh v19.4S, v12.4S, v8.s[2] +mul v12.4S, v12.4S,v17.s[2] +sub v5.4s, v20.4s, v1.4s +ldr q7, [x0, #816] +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v23.4S, v8.s[1] +mul v23.4S, v23.4S,v17.s[1] +sub v22.4s, v14.4s, v15.4s +ldr q21, [x0, #800] +add v14.4s, v14.4s, v15.4s +mla v2.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v5.4S, v16.s[2] +sub v8.4s, v11.4s, v28.4s +ldr q15, [x0, #880] +add v11.4s, v11.4s, v28.4s +mla v13.4S, v24.4S, v31.s[0] +sqrdmulh v24.4S, v20.4S, v16.s[1] +sub v28.4s, v29.4s, v3.4s +ldr q17, [x0, #864] +add v29.4s, v29.4s, v3.4s +mla v12.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v8.4S, v27.s[2] +sub v3.4s, v18.4s, v2.4s +ldr q4, [x17, #+512] +str q3, [x0, #560] +mla v23.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v11.4S, v27.s[1] +add v18.4s, v18.4s, v2.4s +ldr q2, [x17, #+528] +str q18, [x0, #544] +mul v5.4S, v5.4S,v9.s[2] +mul v20.4S, v20.4S,v9.s[1] +sub v18.4s, v10.4s, v13.4s +ldr q3, [x17, #+544] +str q18, [x0, #528] +mla v5.4S, v25.4S, v31.s[0] +mla v20.4S, v24.4S, v31.s[0] +add v10.4s, v10.4s, v13.4s +ldr q13, [x17, #+560] +str q10, [x0, #512] +mul v8.4S, v8.4S,v26.s[2] +mul v11.4S, v11.4S,v26.s[1] +sub v16.4s, v30.4s, v12.4s +ldr q10, [x0, #944] +str q16, [x0, #624] +mla v8.4S, v19.4S, v31.s[0] +mla v11.4S, v1.4S, v31.s[0] +add v30.4s, v30.4s, v12.4s +ldr q12, [x0, #928] +str q30, [x0, #608] +sqrdmulh v27.4S, v7.4S, v2.s[0] +mul v7.4S, v7.4S,v4.s[0] +sub v30.4s, v6.4s, v23.4s +ldr q26, [x0, #1008] +str q30, [x0, #592] +sqrdmulh v30.4S, v21.4S, v2.s[0] +mul v21.4S, v21.4S,v4.s[0] +add v6.4s, v6.4s, v23.4s +ldr q23, [x0, #992] +str q6, [x0, #576] +sqrdmulh v6.4S, v15.4S, v13.s[0] +mul v15.4S, 
v15.4S,v3.s[0] +sub v1.4s, v22.4s, v5.4s +ldr q19, [x17, #+576] +str q1, [x0, #688] +sqrdmulh v1.4S, v17.4S, v13.s[0] +mul v17.4S, v17.4S,v3.s[0] +add v22.4s, v22.4s, v5.4s +ldr q5, [x17, #+592] +str q22, [x0, #672] +mla v7.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v10.4S, v5.s[0] +sub v22.4s, v14.4s, v20.4s +ldr q16, [x17, #+608] +str q22, [x0, #656] +mla v21.4S, v30.4S, v31.s[0] +sqrdmulh v30.4S, v12.4S, v5.s[0] +add v14.4s, v14.4s, v20.4s +ldr q20, [x17, #+624] +str q14, [x0, #640] +mla v15.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v26.4S, v20.s[0] +sub v14.4s, v28.4s, v8.4s +ldr q22, [x0, #784] +str q14, [x0, #752] +mla v17.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v23.4S, v20.s[0] +add v28.4s, v28.4s, v8.4s +ldr q8, [x0, #768] +str q28, [x0, #736] +mul v10.4S, v10.4S,v19.s[0] +mul v12.4S, v12.4S,v19.s[0] +sub v28.4s, v29.4s, v11.4s +ldr q14, [x0, #848] +str q28, [x0, #720] +mla v10.4S, v27.4S, v31.s[0] +mla v12.4S, v30.4S, v31.s[0] +add v29.4s, v29.4s, v11.4s +ldr q11, [x0, #832] +str q29, [x0, #704] +mul v26.4S, v26.4S,v16.s[0] +mul v23.4S, v23.4S,v16.s[0] +sub v29.4s, v22.4s, v7.4s +ldr q30, [x0, #912] +add v22.4s, v22.4s, v7.4s +mla v26.4S, v6.4S, v31.s[0] +mla v23.4S, v1.4S, v31.s[0] +sub v1.4s, v8.4s, v21.4s +ldr q6, [x0, #896] +add v8.4s, v8.4s, v21.4s +sqrdmulh v21.4S, v29.4S, v2.s[2] +mul v29.4S, v29.4S,v4.s[2] +sub v7.4s, v14.4s, v15.4s +ldr q27, [x0, #976] +add v14.4s, v14.4s, v15.4s +sqrdmulh v15.4S, v22.4S, v2.s[1] +mul v22.4S, v22.4S,v4.s[1] +sub v28.4s, v11.4s, v17.4s +ldr q9, [x0, #960] +add v11.4s, v11.4s, v17.4s +sqrdmulh v2.4S, v7.4S, v13.s[2] +mul v7.4S, v7.4S,v3.s[2] +sub v17.4s, v30.4s, v10.4s +add v30.4s, v30.4s, v10.4s +sqrdmulh v10.4S, v14.4S, v13.s[1] +mul v14.4S, v14.4S,v3.s[1] +sub v4.4s, v6.4s, v12.4s +add v6.4s, v6.4s, v12.4s +mla v29.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v17.4S, v5.s[2] +sub v13.4s, v27.4s, v26.4s +add v27.4s, v27.4s, v26.4s +mla v22.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v30.4S, v5.s[1] +sub v26.4s, v9.4s, v23.4s +add v9.4s, 
v9.4s, v23.4s +mla v7.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v13.4S, v20.s[2] +sub v23.4s, v1.4s, v29.4s +str q23, [x0, #816] +mla v14.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v27.4S, v20.s[1] +add v1.4s, v1.4s, v29.4s +str q1, [x0, #800] +mul v17.4S, v17.4S,v19.s[2] +mul v30.4S, v30.4S,v19.s[1] +sub v1.4s, v8.4s, v22.4s +str q1, [x0, #784] +mla v17.4S, v21.4S, v31.s[0] +mla v30.4S, v15.4S, v31.s[0] +add v8.4s, v8.4s, v22.4s +str q8, [x0, #768] +mul v13.4S, v13.4S,v16.s[2] +mul v27.4S, v27.4S,v16.s[1] +sub v5.4s, v28.4s, v7.4s +str q5, [x0, #880] +mla v13.4S, v2.4S, v31.s[0] +mla v27.4S, v10.4S, v31.s[0] +add v28.4s, v28.4s, v7.4s +str q28, [x0, #864] +sub v20.4s, v11.4s, v14.4s +str q20, [x0, #848] +add v11.4s, v11.4s, v14.4s +str q11, [x0, #832] +sub v11.4s, v4.4s, v17.4s +str q11, [x0, #944] +add v4.4s, v4.4s, v17.4s +str q4, [x0, #928] +sub v4.4s, v6.4s, v30.4s +str q4, [x0, #912] +add v6.4s, v6.4s, v30.4s +str q6, [x0, #896] +sub v6.4s, v26.4s, v13.4s +str q6, [x0, #1008] +add v26.4s, v26.4s, v13.4s +str q26, [x0, #992] +sub v26.4s, v9.4s, v27.4s +str q26, [x0, #976] +add v9.4s, v9.4s, v27.4s +str q9, [x0, #960] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1520 +// Instruction count: 1516 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_7.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_7.s new file mode 100644 index 0000000..e7dd93d --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_7.s @@ -0,0 +1,1550 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is 
hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+///
+
+#include // NOTE(review): no header name is visible after #include here (possibly lost in extraction); ASM_LOAD, used by the routine below, is not defined in this file - confirm which header supplies it
+modulus: // 4-word constant; the routine below loads it into q31 and uses lane 0 (v31.s[0]) as the mla multiplier
+.word -33556993 // negated modulus: -q with q = 33556993
+.word 0 // lanes 1-3 are zero; the visible code never references them
+.word 0
+.word 0
+.align 6 // 2^6 = 64-byte alignment for the table below (AArch64 assemblers take a power-of-two exponent here)
+roots_merged: // twiddle table, read through x17 in 16-byte rows: a row of plain roots (used with mul) is followed by a row of matching multipliers (used with sqrdmulh)
+.word 17702291 // Layer 0, block 0
+.word 3260327 // Layer 1, block 0
+.word 14579576 // Layer 1, block 1
+.word 0 // Layer None, block None (padding lane)
+.word 1132860160 // Layer 0, block 0 (sqrdmulh multiplier; spot checks match round(root * 2^31 / 33556993))
+.word 208645003 // Layer 1, block 0
+.word 933021652 // Layer 1, block 1
+.word 0 // Layer None, block None
+.word 6733847 // Layer 2, block 0
+.word 12909577 // Layer 2, block 1
+.word 14745691 // Layer 2, block 2
+.word 13512548 // Layer 2, block 3
+.word 430933318 // Layer 2, block 0
+.word 826149873 // Layer 2, block 1
+.word 943652201 // Layer 2, block 2
+.word 864737072 // Layer 2, block 3
+.word 20428075 // Layer 3, block 0
+.word 14626653 // Layer 3, block 1
+.word 29737761 // Layer 3, block 2
+.word 30285189 // Layer 3, block 3
+.word 1307297022 // Layer 3, block 0
+.word 936034350 // Layer 3, block 1
+.word 1903071454 // Layer 3, block 2
+.word 1938104173 // Layer 3, block 3
+.word 21289485 // Layer 3, block 4
+.word 9914896 // Layer 3, block 5
+.word 22603682 // Layer 3, block 6
+.word 16204162 // Layer 3, block 7
+.word 1362423055 // Layer 3, block 4
+.word 634504916 // Layer 3, block 5
+.word 1446525244 // Layer 3, block 6
+.word 1036987221 // Layer 3, block 7
+.word 23825509 // Layer 4, block 0
+.word 9010590 // Layer 5, block 0
+.word 20699126 // Layer 5, block 1
+.word 0 // Layer None, block None
+.word 1524716204 // Layer 4, block 0
+.word 576633749 // Layer 5, block 0
+.word 1324642962 // Layer 5, block 1
+.word 0 // Layer None, block None
+.word 27028662 // Layer 4, block 1
+.word 341080 // Layer 5, block 2
+.word 21220783 // Layer 5, block 3
+.word 0 // Layer None, block None
+.word 1729702351 // Layer 4, block 1
+.word 21827454 // Layer 5, block 2
+.word 1358026462 // Layer 5, block 3
+.word 0 // Layer None, block None
+.word 14833295 // Layer 4, block 2
+.word 25331745 // Layer 5, block 4
+.word 5289426 // Layer 5, block 5
+.word 0 // Layer None, block None
+.word 949258429 // Layer 4, block 2
+.word 1621107951 // Layer 5, block 4
+.word 338497429 // Layer 5, block 5
+.word 0 // Layer None, block None
+.word 2138810 // Layer 4, block 3
+.word 5705868 // Layer 5, block 6
+.word 17686665 // Layer 5, block 7
+.word 0 // Layer None, block None
+.word 136873393 // Layer 4, block 3
+.word 365147683 // Layer 5, block 6
+.word 1131860172 // Layer 5, block 7
+.word 0 // Layer None, block None
+.word 6490403 // Layer 4, block 4
+.word 9106105 // Layer 5, block 8
+.word 18817700 // Layer 5, block 9
+.word 0 // Layer None, block None
+.word 415354091 // Layer 4, block 4
+.word 582746243 // Layer 5, block 8
+.word 1204240888 // Layer 5, block 9
+.word 0 // Layer None, block None
+.word 19648405 // Layer 4, block 5
+.word 1579445 // Layer 5, block 10
+.word 7769916 // Layer 5, block 11
+.word 0 // Layer None, block None
+.word 1257401950 // Layer 4, block 5
+.word 101076765 // Layer 5, block 10
+.word 497236673 // Layer 5, block 11
+.word 0 // Layer None, block None
+.word 31254932 // Layer 4, block 6
+.word 21843119 // Layer 5, block 12
+.word 11828796 // Layer 5, block 13
+.word 0 // Layer None, block None
+.word 2000162988 // Layer 4, block 6
+.word 1397852927 // Layer 5, block 12
+.word 756985168 // Layer 5, block 13
+.word 0 // Layer None, block None
+.word 26362414 // Layer 4, block 7
+.word 19828530 // Layer 5, block 14
+.word 33201112 // Layer 5, block 15
+.word 0 // Layer None, block None
+.word 1687065733 // Layer 4, block 7
+.word 1268929071 // Layer 5, block 14
+.word 2124709002 // Layer 5, block 15
+.word 0 // Layer None, block None
+.word 572895 // Layer 4, block 8
+.word 23713020 // Layer 5, block 16
+.word 19537976 // Layer 5, block 17
+.word 0 // Layer None, block None
+.word 36662482 // Layer 4, block 8
+.word 1517517457 // Layer 5, block 16
+.word 1250335034 // Layer 5, block 17
+.word 0 // Layer None, block None
+.word 26691971 // Layer 4, block 9
+.word 8285889 // Layer 5, block 18
+.word 24690028 // Layer 5, block 19
+.word 0 // Layer None, block None
+.word 1708155771 // Layer 4, block 9
+.word 530256425 // Layer 5, block 18
+.word 1580041197 // Layer 5, block 19
+.word 0 // Layer None, block None
+.word 9249292 // Layer 4, block 10
+.word 4778209 // Layer 5, block 20
+.word 13113327 // Layer 5, block 21
+.word 0 // Layer None, block None
+.word 591909511 // Layer 4, block 10
+.word 305782038 // Layer 5, block 20
+.word 839188878 // Layer 5, block 21
+.word 0 // Layer None, block None
+.word 29292862 // Layer 4, block 11
+.word 25384023 // Layer 5, block 22
+.word 10905370 // Layer 5, block 23
+.word 0 // Layer None, block None
+.word 1874600091 // Layer 4, block 11
+.word 1624453488 // Layer 5, block 22
+.word 697890414 // Layer 5, block 23
+.word 0 // Layer None, block None
+.word 8247799 // Layer 4, block 12
+.word 16167867 // Layer 5, block 24
+.word 22046437 // Layer 5, block 25
+.word 0 // Layer None, block None
+.word 527818851 // Layer 4, block 12
+.word 1034664519 // Layer 5, block 24
+.word 1410864286 // Layer 5, block 25
+.word 0 // Layer None, block None
+.word 5086187 // Layer 4, block 13
+.word 656361 // Layer 5, block 26
+.word 18153794 // Layer 5, block 27
+.word 0 // Layer None, block None
+.word 325491125 // Layer 4, block 13
+.word 42003898 // Layer 5, block 26
+.word 1161754147 // Layer 5, block 27
+.word 0 // Layer None, block None
+.word 28113639 // Layer 4, block 14
+.word 3732072 // Layer 5, block 28
+.word 22126384 // Layer 5, block 29
+.word 0 // Layer None, block None
+.word 1799135579 // Layer 4, block 14
+.word 238834379 // Layer 5, block 28
+.word 1415980503 // Layer 5, block 29
+.word 0 // Layer None, block None
+.word 8471290 // Layer 4, block 15
+.word 9445744 // Layer 5, block 30
+.word 794839 // Layer 5, block 31
+.word 0 // Layer None, block None
+.word 542121183 // Layer 4, block 15
+.word 604481480 // Layer 5, block 30
+.word 50865814 // Layer 5, block 31
+.word 0 // Layer None, block None
+.text
+.global
ntt_u32_incomplete_neon_asm_var_4_2_22_z4_7 +.global _ntt_u32_incomplete_neon_asm_var_4_2_22_z4_7 +ntt_u32_incomplete_neon_asm_var_4_2_22_z4_7: +_ntt_u32_incomplete_neon_asm_var_4_2_22_z4_7: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x0, #992] +sqrdmulh v27.4S, v28.4S, v29.s[0] +mul v28.4S, v28.4S,v30.s[0] +ldr q26, [x0, #928] +sqrdmulh v25.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +ldr q24, [x0, #864] +sqrdmulh v23.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +ldr q22, [x0, #800] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #736] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mla v28.4S, v27.4S, v31.s[0] +ldr q27, [x0, #672] +sqrdmulh v18.4S, v27.4S, v29.s[0] +mla v26.4S, v25.4S, v31.s[0] +ldr q25, [x0, #608] +sqrdmulh v17.4S, v25.4S, v29.s[0] +mla v24.4S, v23.4S, v31.s[0] +ldr q23, [x0, #544] +sqrdmulh v16.4S, v23.4S, v29.s[0] +mla v22.4S, v21.4S, v31.s[0] +ldr q21, [x0, #480] +mul v27.4S, v27.4S,v30.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q3, [x0, #416] +ldr q2, [x0, #352] +ldr q1, [x0, #288] +mla v27.4S, v18.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +ldr q19, [x0, #224] +ldr q18, [x0, #160] +mul v23.4S, v23.4S,v30.s[0] +mul v25.4S, v25.4S,v30.s[0] +ldr q0, [x0, #96] +ldr q15, [x0, #32] +mla v23.4S, v16.4S, v31.s[0] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +sqrdmulh v28.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v16.4s, v3.4s, v26.4s +add v3.4s, v3.4s, v26.4s +sqrdmulh v26.4S, v16.4S, v29.s[2] +mul v16.4S, 
v16.4S,v30.s[2] +sub v14.4s, v2.4s, v24.4s +add v2.4s, v2.4s, v24.4s +sqrdmulh v24.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v13.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v12.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +sqrdmulh v20.4S, v14.4S, v29.s[2] +mla v17.4S, v28.4S, v31.s[0] +sub v28.4s, v18.4s, v27.4s +add v18.4s, v18.4s, v27.4s +sqrdmulh v27.4S, v13.4S, v29.s[2] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v0.4s, v25.4s +add v0.4s, v0.4s, v25.4s +sqrdmulh v25.4S, v2.4S, v29.s[1] +mla v21.4S, v24.4S, v31.s[0] +sub v24.4s, v15.4s, v23.4s +sqrdmulh v11.4S, v1.4S, v29.s[1] +mla v3.4S, v22.4S, v31.s[0] +add v15.4s, v15.4s, v23.4s +ldr q23, [x17, #+32] +ldr q22, [x17, #+48] +mul v13.4S, v13.4S,v30.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v10.4s, v12.4s, v17.4s +add v12.4s, v12.4s, v17.4s +mla v13.4S, v27.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +sub v20.4s, v28.4s, v16.4s +add v28.4s, v28.4s, v16.4s +mul v1.4S, v1.4S,v30.s[1] +mul v2.4S, v2.4S,v30.s[1] +sub v16.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +mla v1.4S, v11.4S, v31.s[0] +mla v2.4S, v25.4S, v31.s[0] +sub v25.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v22.s[3] +mul v10.4S, v10.4S,v23.s[3] +sub v11.4s, v26.4s, v14.4s +add v26.4s, v26.4s, v14.4s +sqrdmulh v14.4S, v12.4S, v22.s[2] +mul v12.4S, v12.4S,v23.s[2] +sub v21.4s, v24.4s, v13.4s +add v24.4s, v24.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v22.s[1] +mul v16.4S, v16.4S,v23.s[1] +sub v27.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v17.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s +ldr q1, [x17, #+96] +ldr q9, [x17, #+112] +sqrdmulh v8.4S, v20.4S, v22.s[3] +mla v10.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v28.4S, v22.s[2] +mla v12.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v25.4S, v22.s[1] +mla v16.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v18.4S, v22.s[0] +mla 
v19.4S, v2.4S, v31.s[0] +nop +nop +ldr q2, [x17, #+64] +ldr q7, [x17, #+80] +mul v28.4S, v28.4S,v23.s[2] +mul v20.4S, v20.4S,v23.s[3] +sub v6.4s, v11.4s, v10.4s +add v11.4s, v11.4s, v10.4s +mla v28.4S, v3.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v26.4s, v12.4s +add v26.4s, v26.4s, v12.4s +mul v18.4S, v18.4S,v23.s[0] +mul v25.4S, v25.4S,v23.s[1] +sub v12.4s, v27.4s, v16.4s +add v27.4s, v27.4s, v16.4s +mla v18.4S, v13.4S, v31.s[0] +mla v25.4S, v14.4S, v31.s[0] +sub v14.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v9.s[3] +mul v6.4S, v6.4S,v1.s[3] +sub v13.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v11.4S, v9.s[2] +mul v11.4S, v11.4S,v1.s[2] +sub v16.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +sqrdmulh v28.4S, v8.4S, v9.s[1] +mul v8.4S, v8.4S,v1.s[1] +sub v3.4s, v17.4s, v25.4s +add v17.4s, v17.4s, v25.4s +sqrdmulh v25.4S, v26.4S, v9.s[0] +mul v26.4S, v26.4S,v1.s[0] +sub v10.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v7.s[3] +mla v6.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v27.4S, v7.s[2] +mla v11.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v14.4S, v7.s[1] +mla v8.4S, v28.4S, v31.s[0] +nop +nop +sqrdmulh v28.4S, v0.4S, v7.s[0] +mla v26.4S, v25.4S, v31.s[0] +nop +nop +mul v27.4S, v27.4S,v2.s[2] +mul v12.4S, v12.4S,v2.s[3] +sub v25.4s, v13.4s, v6.4s +str q25, [x0, #992] +mla v27.4S, v19.4S, v31.s[0] +mla v12.4S, v18.4S, v31.s[0] +add v13.4s, v13.4s, v6.4s +str q13, [x0, #928] +mul v0.4S, v0.4S,v2.s[0] +mul v14.4S, v14.4S,v2.s[1] +sub v13.4s, v21.4s, v11.4s +str q13, [x0, #864] +mla v0.4S, v28.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sub v11.4s, v16.4s, v8.4s +ldr q20, [x0, #1008] +sqrdmulh v28.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v8.4s +str q21, [x0, #800] +ldr q21, [x0, #944] +sqrdmulh v8.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v13.4s, v24.4s, v26.4s +str q11, [x0, #736] +ldr q11, [x0, 
#880] +sqrdmulh v6.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +add v24.4s, v24.4s, v26.4s +str q16, [x0, #672] +ldr q16, [x0, #816] +sqrdmulh v26.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v18.4s, v3.4s, v12.4s +str q13, [x0, #608] +ldr q13, [x0, #752] +sqrdmulh v19.4S, v13.4S, v29.s[0] +mla v20.4S, v28.4S, v31.s[0] +add v3.4s, v3.4s, v12.4s +str q24, [x0, #544] +ldr q24, [x0, #688] +sqrdmulh v12.4S, v24.4S, v29.s[0] +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v17.4s, v27.4s +str q18, [x0, #480] +ldr q18, [x0, #624] +sqrdmulh v28.4S, v18.4S, v29.s[0] +mla v11.4S, v6.4S, v31.s[0] +add v17.4s, v17.4s, v27.4s +str q3, [x0, #416] +ldr q3, [x0, #560] +sqrdmulh v27.4S, v3.4S, v29.s[0] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v10.4s, v14.4s +str q8, [x0, #352] +ldr q8, [x0, #496] +add v10.4s, v10.4s, v14.4s +mul v24.4S, v24.4S,v30.s[0] +mul v13.4S, v13.4S,v30.s[0] +ldr q14, [x0, #432] +str q17, [x0, #288] +ldr q17, [x0, #368] +ldr q6, [x0, #304] +mla v24.4S, v12.4S, v31.s[0] +mla v13.4S, v19.4S, v31.s[0] +str q26, [x0, #224] +sub v26.4s, v15.4s, v0.4s +ldr q19, [x0, #240] +ldr q12, [x0, #176] +mul v3.4S, v3.4S,v30.s[0] +mul v18.4S, v18.4S,v30.s[0] +str q10, [x0, #160] +add v15.4s, v15.4s, v0.4s +ldr q0, [x0, #112] +ldr q10, [x0, #48] +mla v3.4S, v27.4S, v31.s[0] +mla v18.4S, v28.4S, v31.s[0] +sub v28.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +sqrdmulh v20.4S, v28.4S, v29.s[2] +mul v28.4S, v28.4S,v30.s[2] +sub v27.4s, v14.4s, v21.4s +add v14.4s, v14.4s, v21.4s +sqrdmulh v21.4S, v27.4S, v29.s[2] +mul v27.4S, v27.4S,v30.s[2] +sub v25.4s, v17.4s, v11.4s +add v17.4s, v17.4s, v11.4s +sqrdmulh v11.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v5.4s, v6.4s, v16.4s +add v6.4s, v6.4s, v16.4s +sqrdmulh v16.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +sub v4.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +sqrdmulh v13.4S, v25.4S, v29.s[2] +mla v28.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v24.4s +add v12.4s, v12.4s, v24.4s +sqrdmulh v24.4S, v5.4S, 
v29.s[2] +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v18.4s +add v0.4s, v0.4s, v18.4s +sqrdmulh v18.4S, v17.4S, v29.s[1] +mla v8.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v3.4s +str q26, [x0, #96] +sqrdmulh v26.4S, v6.4S, v29.s[1] +mla v14.4S, v16.4S, v31.s[0] +add v10.4s, v10.4s, v3.4s +str q15, [x0, #32] +mul v5.4S, v5.4S,v30.s[2] +mul v25.4S, v25.4S,v30.s[2] +sub v15.4s, v4.4s, v28.4s +add v4.4s, v4.4s, v28.4s +mla v5.4S, v24.4S, v31.s[0] +mla v25.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v27.4s +add v20.4s, v20.4s, v27.4s +mul v6.4S, v6.4S,v30.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v27.4s, v19.4s, v8.4s +add v19.4s, v19.4s, v8.4s +mla v6.4S, v26.4S, v31.s[0] +mla v17.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v14.4s +add v12.4s, v12.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v22.s[3] +mul v15.4S, v15.4S,v23.s[3] +sub v26.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v4.4S, v22.s[2] +mul v4.4S, v4.4S,v23.s[2] +sub v8.4s, v11.4s, v5.4s +add v11.4s, v11.4s, v5.4s +sqrdmulh v5.4S, v27.4S, v22.s[1] +mul v27.4S, v27.4S,v23.s[1] +sub v24.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v28.4s, v10.4s, v6.4s +add v10.4s, v10.4s, v6.4s +sqrdmulh v6.4S, v13.4S, v22.s[3] +mla v15.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v20.4S, v22.s[2] +mla v4.4S, v25.4S, v31.s[0] +nop +nop +sqrdmulh v25.4S, v18.4S, v22.s[1] +mla v27.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v12.4S, v22.s[0] +mla v19.4S, v17.4S, v31.s[0] +nop +nop +mul v20.4S, v20.4S,v23.s[2] +mul v13.4S, v13.4S,v23.s[3] +sub v17.4s, v26.4s, v15.4s +add v26.4s, v26.4s, v15.4s +mla v20.4S, v14.4S, v31.s[0] +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v21.4s, v4.4s +add v21.4s, v21.4s, v4.4s +mul v12.4S, v12.4S,v23.s[0] +mul v18.4S, v18.4S,v23.s[1] +sub v4.4s, v24.4s, v27.4s +add v24.4s, v24.4s, v27.4s +mla v12.4S, v5.4S, v31.s[0] +mla v18.4S, v25.4S, v31.s[0] +sub v25.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, 
v17.4S, v9.s[3] +mul v17.4S, v17.4S,v1.s[3] +sub v5.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +sqrdmulh v13.4S, v26.4S, v9.s[2] +mul v26.4S, v26.4S,v1.s[2] +sub v27.4s, v11.4s, v20.4s +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v6.4S, v9.s[1] +mul v6.4S, v6.4S,v1.s[1] +sub v14.4s, v28.4s, v18.4s +add v28.4s, v28.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v9.s[0] +mul v21.4S, v21.4S,v1.s[0] +sub v15.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v4.4S, v7.s[3] +mla v17.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v24.4S, v7.s[2] +mla v26.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v25.4S, v7.s[1] +mla v6.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v0.4S, v7.s[0] +mla v21.4S, v18.4S, v31.s[0] +nop +nop +mul v24.4S, v24.4S,v2.s[2] +mul v4.4S, v4.4S,v2.s[3] +sub v18.4s, v5.4s, v17.4s +str q18, [x0, #1008] +mla v24.4S, v19.4S, v31.s[0] +mla v4.4S, v12.4S, v31.s[0] +add v5.4s, v5.4s, v17.4s +str q5, [x0, #944] +mul v0.4S, v0.4S,v2.s[0] +mul v25.4S, v25.4S,v2.s[1] +sub v5.4s, v8.4s, v26.4s +str q5, [x0, #880] +mla v0.4S, v20.4S, v31.s[0] +mla v25.4S, v13.4S, v31.s[0] +add v8.4s, v8.4s, v26.4s +sub v26.4s, v27.4s, v6.4s +ldr q13, [x0, #960] +sqrdmulh v20.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +add v27.4s, v27.4s, v6.4s +str q8, [x0, #816] +ldr q8, [x0, #896] +sqrdmulh v6.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v5.4s, v11.4s, v21.4s +str q26, [x0, #752] +ldr q26, [x0, #832] +sqrdmulh v17.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +add v11.4s, v11.4s, v21.4s +str q27, [x0, #688] +ldr q27, [x0, #768] +sqrdmulh v21.4S, v27.4S, v29.s[0] +mul v27.4S, v27.4S,v30.s[0] +sub v12.4s, v14.4s, v4.4s +str q5, [x0, #624] +ldr q5, [x0, #704] +sqrdmulh v19.4S, v5.4S, v29.s[0] +mla v13.4S, v20.4S, v31.s[0] +add v14.4s, v14.4s, v4.4s +str q11, [x0, #560] +ldr q11, [x0, #640] +sqrdmulh v4.4S, v11.4S, v29.s[0] +mla v8.4S, v6.4S, v31.s[0] +sub v6.4s, v28.4s, v24.4s +str q12, [x0, #496] +ldr q12, [x0, #576] +sqrdmulh v20.4S, v12.4S, 
v29.s[0] +mla v26.4S, v17.4S, v31.s[0] +add v28.4s, v28.4s, v24.4s +str q14, [x0, #432] +ldr q14, [x0, #512] +sqrdmulh v24.4S, v14.4S, v29.s[0] +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v25.4s +str q6, [x0, #368] +ldr q6, [x0, #448] +add v15.4s, v15.4s, v25.4s +mul v11.4S, v11.4S,v30.s[0] +mul v5.4S, v5.4S,v30.s[0] +ldr q25, [x0, #384] +str q28, [x0, #304] +ldr q28, [x0, #320] +ldr q17, [x0, #256] +mla v11.4S, v4.4S, v31.s[0] +mla v5.4S, v19.4S, v31.s[0] +str q21, [x0, #240] +sub v21.4s, v10.4s, v0.4s +ldr q19, [x0, #192] +ldr q4, [x0, #128] +mul v14.4S, v14.4S,v30.s[0] +mul v12.4S, v12.4S,v30.s[0] +str q15, [x0, #176] +add v10.4s, v10.4s, v0.4s +ldr q0, [x0, #64] +ldr q15, [x0, #0] +mla v14.4S, v24.4S, v31.s[0] +mla v12.4S, v20.4S, v31.s[0] +sub v20.4s, v6.4s, v13.4s +add v6.4s, v6.4s, v13.4s +sqrdmulh v13.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v24.4s, v25.4s, v8.4s +add v25.4s, v25.4s, v8.4s +sqrdmulh v8.4S, v24.4S, v29.s[2] +mul v24.4S, v24.4S,v30.s[2] +sub v18.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sqrdmulh v26.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +sub v3.4s, v17.4s, v27.4s +add v17.4s, v17.4s, v27.4s +sqrdmulh v27.4S, v25.4S, v29.s[1] +mul v25.4S, v25.4S,v30.s[1] +sub v16.4s, v19.4s, v5.4s +add v19.4s, v19.4s, v5.4s +sqrdmulh v5.4S, v18.4S, v29.s[2] +mla v20.4S, v13.4S, v31.s[0] +sub v13.4s, v4.4s, v11.4s +add v4.4s, v4.4s, v11.4s +sqrdmulh v11.4S, v3.4S, v29.s[2] +mla v24.4S, v8.4S, v31.s[0] +sub v8.4s, v0.4s, v12.4s +add v0.4s, v0.4s, v12.4s +sqrdmulh v12.4S, v28.4S, v29.s[1] +mla v6.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v14.4s +str q21, [x0, #112] +sqrdmulh v21.4S, v17.4S, v29.s[1] +mla v25.4S, v27.4S, v31.s[0] +add v15.4s, v15.4s, v14.4s +str q10, [x0, #48] +mul v3.4S, v3.4S,v30.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v10.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +mla v3.4S, v11.4S, v31.s[0] +mla v18.4S, v5.4S, v31.s[0] +sub v5.4s, v13.4s, v24.4s +add v13.4s, v13.4s, v24.4s +mul v17.4S, v17.4S,v30.s[1] 
+mul v28.4S, v28.4S,v30.s[1] +sub v24.4s, v19.4s, v6.4s +add v19.4s, v19.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v28.4S, v12.4S, v31.s[0] +sub v12.4s, v4.4s, v25.4s +add v4.4s, v4.4s, v25.4s +sqrdmulh v25.4S, v10.4S, v22.s[3] +mul v10.4S, v10.4S,v23.s[3] +sub v21.4s, v8.4s, v18.4s +add v8.4s, v8.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v22.s[2] +mul v16.4S, v16.4S,v23.s[2] +sub v6.4s, v26.4s, v3.4s +add v26.4s, v26.4s, v3.4s +sqrdmulh v3.4S, v24.4S, v22.s[1] +mul v24.4S, v24.4S,v23.s[1] +sub v11.4s, v0.4s, v28.4s +add v0.4s, v0.4s, v28.4s +sqrdmulh v28.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v20.4s, v15.4s, v17.4s +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v22.s[3] +mla v10.4S, v25.4S, v31.s[0] +nop +nop +sqrdmulh v25.4S, v13.4S, v22.s[2] +mla v16.4S, v18.4S, v31.s[0] +nop +nop +sqrdmulh v18.4S, v12.4S, v22.s[1] +mla v24.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v4.4S, v22.s[0] +mla v19.4S, v28.4S, v31.s[0] +nop +nop +mul v13.4S, v13.4S,v23.s[2] +mul v5.4S, v5.4S,v23.s[3] +sub v28.4s, v21.4s, v10.4s +add v21.4s, v21.4s, v10.4s +mla v13.4S, v25.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +sub v17.4s, v8.4s, v16.4s +add v8.4s, v8.4s, v16.4s +mul v4.4S, v4.4S,v23.s[0] +mul v12.4S, v12.4S,v23.s[1] +sub v16.4s, v11.4s, v24.4s +add v11.4s, v11.4s, v24.4s +mla v4.4S, v3.4S, v31.s[0] +mla v12.4S, v18.4S, v31.s[0] +sub v18.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v28.4S, v9.s[3] +mul v28.4S, v28.4S,v1.s[3] +sub v3.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v21.4S, v9.s[2] +mul v21.4S, v21.4S,v1.s[2] +sub v24.4s, v26.4s, v13.4s +add v26.4s, v26.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v9.s[1] +mul v17.4S, v17.4S,v1.s[1] +sub v25.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +sqrdmulh v12.4S, v8.4S, v9.s[0] +mul v8.4S, v8.4S,v1.s[0] +sub v10.4s, v15.4s, v4.4s +add v15.4s, v15.4s, v4.4s +sqrdmulh v4.4S, v16.4S, v7.s[3] +mla v28.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v11.4S, v7.s[2] +mla v21.4S, v5.4S, 
v31.s[0] +nop +nop +sqrdmulh v5.4S, v18.4S, v7.s[1] +mla v17.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v0.4S, v7.s[0] +mla v8.4S, v12.4S, v31.s[0] +nop +nop +mul v11.4S, v11.4S,v2.s[2] +mul v16.4S, v16.4S,v2.s[3] +sub v12.4s, v3.4s, v28.4s +str q12, [x0, #960] +mla v11.4S, v19.4S, v31.s[0] +mla v16.4S, v4.4S, v31.s[0] +add v3.4s, v3.4s, v28.4s +str q3, [x0, #896] +mul v0.4S, v0.4S,v2.s[0] +mul v18.4S, v18.4S,v2.s[1] +sub v3.4s, v6.4s, v21.4s +str q3, [x0, #832] +mla v0.4S, v13.4S, v31.s[0] +mla v18.4S, v5.4S, v31.s[0] +add v6.4s, v6.4s, v21.4s +sub v21.4s, v24.4s, v17.4s +ldr q5, [x0, #976] +sqrdmulh v13.4S, v5.4S, v29.s[0] +mul v5.4S, v5.4S,v30.s[0] +add v24.4s, v24.4s, v17.4s +str q6, [x0, #768] +ldr q6, [x0, #912] +sqrdmulh v17.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v3.4s, v26.4s, v8.4s +str q21, [x0, #704] +ldr q21, [x0, #848] +sqrdmulh v28.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +add v26.4s, v26.4s, v8.4s +str q24, [x0, #640] +ldr q24, [x0, #784] +sqrdmulh v8.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +sub v4.4s, v25.4s, v16.4s +str q3, [x0, #576] +ldr q3, [x0, #720] +sqrdmulh v19.4S, v3.4S, v29.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v25.4s, v25.4s, v16.4s +str q26, [x0, #512] +ldr q26, [x0, #656] +sqrdmulh v16.4S, v26.4S, v29.s[0] +mla v6.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v11.4s +str q4, [x0, #448] +ldr q4, [x0, #592] +sqrdmulh v13.4S, v4.4S, v29.s[0] +mla v21.4S, v28.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q25, [x0, #384] +ldr q25, [x0, #528] +sqrdmulh v11.4S, v25.4S, v29.s[0] +mla v24.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v18.4s +str q17, [x0, #320] +ldr q17, [x0, #464] +add v10.4s, v10.4s, v18.4s +mul v26.4S, v26.4S,v30.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q18, [x0, #400] +str q20, [x0, #256] +ldr q20, [x0, #336] +ldr q28, [x0, #272] +mla v26.4S, v16.4S, v31.s[0] +mla v3.4S, v19.4S, v31.s[0] +str q8, [x0, #192] +sub v8.4s, v15.4s, v0.4s +ldr q19, [x0, #208] +ldr q16, [x0, #144] +mul v25.4S, 
v25.4S,v30.s[0] +mul v4.4S, v4.4S,v30.s[0] +str q10, [x0, #128] +add v15.4s, v15.4s, v0.4s +ldr q0, [x0, #80] +ldr q10, [x0, #16] +mla v25.4S, v11.4S, v31.s[0] +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v5.4s +add v17.4s, v17.4s, v5.4s +sqrdmulh v5.4S, v13.4S, v29.s[2] +mul v13.4S, v13.4S,v30.s[2] +sub v11.4s, v18.4s, v6.4s +add v18.4s, v18.4s, v6.4s +sqrdmulh v6.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v12.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v28.4s, v24.4s +add v28.4s, v28.4s, v24.4s +sqrdmulh v24.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v27.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sqrdmulh v3.4S, v12.4S, v29.s[2] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v16.4s, v26.4s +add v16.4s, v16.4s, v26.4s +sqrdmulh v26.4S, v14.4S, v29.s[2] +mla v11.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v4.4s +add v0.4s, v0.4s, v4.4s +sqrdmulh v4.4S, v20.4S, v29.s[1] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v25.4s +str q8, [x0, #64] +sqrdmulh v8.4S, v28.4S, v29.s[1] +mla v18.4S, v24.4S, v31.s[0] +add v10.4s, v10.4s, v25.4s +str q15, [x0, #0] +mul v14.4S, v14.4S,v30.s[2] +mul v12.4S, v12.4S,v30.s[2] +sub v15.4s, v27.4s, v13.4s +add v27.4s, v27.4s, v13.4s +mla v14.4S, v26.4S, v31.s[0] +mla v12.4S, v3.4S, v31.s[0] +sub v3.4s, v5.4s, v11.4s +add v5.4s, v5.4s, v11.4s +mul v28.4S, v28.4S,v30.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v11.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +mla v28.4S, v8.4S, v31.s[0] +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v16.4s, v18.4s +add v16.4s, v16.4s, v18.4s +sqrdmulh v29.4S, v15.4S, v22.s[3] +mul v15.4S, v15.4S,v23.s[3] +sub v30.4s, v6.4s, v12.4s +add v6.4s, v6.4s, v12.4s +sqrdmulh v12.4S, v27.4S, v22.s[2] +mul v27.4S, v27.4S,v23.s[2] +sub v18.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v22.s[1] +mul v11.4S, v11.4S,v23.s[1] +sub v8.4s, v0.4s, v20.4s +add v0.4s, v0.4s, v20.4s +sqrdmulh v20.4S, 
v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v17.4s, v10.4s, v28.4s +add v10.4s, v10.4s, v28.4s +sqrdmulh v28.4S, v3.4S, v22.s[3] +mla v15.4S, v29.4S, v31.s[0] +nop +nop +sqrdmulh v29.4S, v5.4S, v22.s[2] +mla v27.4S, v12.4S, v31.s[0] +nop +nop +sqrdmulh v12.4S, v4.4S, v22.s[1] +mla v11.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v16.4S, v22.s[0] +mla v19.4S, v20.4S, v31.s[0] +nop +nop +mul v5.4S, v5.4S,v23.s[2] +mul v3.4S, v3.4S,v23.s[3] +sub v20.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +mla v5.4S, v29.4S, v31.s[0] +mla v3.4S, v28.4S, v31.s[0] +sub v28.4s, v6.4s, v27.4s +add v6.4s, v6.4s, v27.4s +mul v16.4S, v16.4S,v23.s[0] +mul v4.4S, v4.4S,v23.s[1] +sub v27.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v16.4S, v14.4S, v31.s[0] +mla v4.4S, v12.4S, v31.s[0] +sub v12.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v22.4S, v20.4S, v9.s[3] +mul v20.4S, v20.4S,v1.s[3] +sub v23.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v30.4S, v9.s[2] +mul v30.4S, v30.4S,v1.s[2] +sub v19.4s, v21.4s, v5.4s +add v21.4s, v21.4s, v5.4s +sqrdmulh v5.4S, v28.4S, v9.s[1] +mul v28.4S, v28.4S,v1.s[1] +sub v14.4s, v17.4s, v4.4s +add v17.4s, v17.4s, v4.4s +sqrdmulh v4.4S, v6.4S, v9.s[0] +mul v6.4S, v6.4S,v1.s[0] +sub v11.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sqrdmulh v9.4S, v27.4S, v7.s[3] +mla v20.4S, v22.4S, v31.s[0] +nop +nop +sqrdmulh v22.4S, v8.4S, v7.s[2] +mla v30.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v12.4S, v7.s[1] +mla v28.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v0.4S, v7.s[0] +mla v6.4S, v4.4S, v31.s[0] +nop +nop +mul v8.4S, v8.4S,v2.s[2] +mul v27.4S, v27.4S,v2.s[3] +sub v4.4s, v23.4s, v20.4s +str q4, [x0, #976] +mla v8.4S, v22.4S, v31.s[0] +mla v27.4S, v9.4S, v31.s[0] +add v23.4s, v23.4s, v20.4s +str q23, [x0, #912] +mul v0.4S, v0.4S,v2.s[0] +mul v12.4S, v12.4S,v2.s[1] +sub v23.4s, v18.4s, v30.4s +str q23, [x0, #848] +mla v0.4S, v5.4S, v31.s[0] +mla v12.4S, v3.4S, v31.s[0] +add v18.4s, v18.4s, v30.4s +sub v30.4s, 
v19.4s, v28.4s +add v19.4s, v19.4s, v28.4s +str q18, [x0, #784] +sub v18.4s, v21.4s, v6.4s +str q30, [x0, #720] +add v21.4s, v21.4s, v6.4s +str q19, [x0, #656] +sub v19.4s, v14.4s, v27.4s +str q18, [x0, #592] +add v14.4s, v14.4s, v27.4s +str q21, [x0, #528] +sub v21.4s, v17.4s, v8.4s +str q19, [x0, #464] +add v17.4s, v17.4s, v8.4s +str q14, [x0, #400] +sub v14.4s, v11.4s, v12.4s +str q21, [x0, #336] +add v11.4s, v11.4s, v12.4s +str q17, [x0, #272] +sub v17.4s, v10.4s, v0.4s +add v10.4s, v10.4s, v0.4s +ldr q24, [x0, #224] +ldr q25, [x0, #160] +ldr q13, [x0, #32] +ldr q26, [x17, #+128] +ldr q15, [x17, #+144] +sqrdmulh v29.4S, v13.4S, v15.s[0] +mul v13.4S, v13.4S,v26.s[0] +ldr q16, [x0, #48] +sqrdmulh v1.4S, v16.4S, v15.s[0] +mul v16.4S, v16.4S,v26.s[0] +ldr q4, [x17, #+160] +ldr q22, [x17, #+176] +ldr q9, [x0, #96] +sqrdmulh v20.4S, v9.4S, v22.s[0] +mul v9.4S, v9.4S,v4.s[0] +ldr q23, [x0, #112] +sqrdmulh v5.4S, v23.4S, v22.s[0] +mul v23.4S, v23.4S,v4.s[0] +ldr q3, [x17, #+192] +ldr q2, [x17, #+208] +mla v13.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v25.4S, v2.s[0] +ldr q7, [x0, #176] +mla v16.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v7.4S, v2.s[0] +ldr q28, [x17, #+224] +ldr q30, [x17, #+240] +mla v9.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v24.4S, v30.s[0] +ldr q6, [x0, #240] +mla v23.4S, v5.4S, v31.s[0] +sqrdmulh v5.4S, v6.4S, v30.s[0] +ldr q18, [x0, #0] +ldr q27, [x0, #128] +mul v25.4S, v25.4S,v3.s[0] +sub v19.4s, v18.4s, v13.4s +mul v7.4S, v7.4S,v3.s[0] +add v18.4s, v18.4s, v13.4s +mla v25.4S, v29.4S, v31.s[0] +sub v29.4s, v10.4s, v16.4s +ldr q13, [x0, #64] +mla v7.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v16.4s +ldr q16, [x0, #192] +mul v24.4S, v24.4S,v28.s[0] +sub v1.4s, v13.4s, v9.4s +mul v6.4S, v6.4S,v28.s[0] +add v13.4s, v13.4s, v9.4s +mla v24.4S, v20.4S, v31.s[0] +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v17.4s, v23.4s +sqrdmulh v20.4S, v10.4S, v15.s[1] +add v17.4s, v17.4s, v23.4s +mul v10.4S, v10.4S,v26.s[1] +sqrdmulh v23.4S, v29.4S, v15.s[2] +sub v9.4s, v27.4s, 
v25.4s +mul v29.4S, v29.4S,v26.s[2] +add v27.4s, v27.4s, v25.4s +sqrdmulh v15.4S, v17.4S, v22.s[1] +sub v26.4s, v11.4s, v7.4s +mul v17.4S, v17.4S,v4.s[1] +add v11.4s, v11.4s, v7.4s +sqrdmulh v7.4S, v5.4S, v22.s[2] +sub v25.4s, v16.4s, v24.4s +mul v5.4S, v5.4S,v4.s[2] +add v16.4s, v16.4s, v24.4s +mla v10.4S, v20.4S, v31.s[0] +sub v20.4s, v14.4s, v6.4s +ldr q22, [x0, #480] +sqrdmulh v4.4S, v11.4S, v2.s[1] +add v14.4s, v14.4s, v6.4s +mla v29.4S, v23.4S, v31.s[0] +ldr q23, [x0, #416] +sqrdmulh v6.4S, v26.4S, v2.s[2] +sub v24.4s, v18.4s, v10.4s +mla v17.4S, v15.4S, v31.s[0] +ldr q15, [x0, #288] +sqrdmulh v8.4S, v14.4S, v30.s[1] +add v18.4s, v18.4s, v10.4s +str q24, [x0, #16] +mla v5.4S, v7.4S, v31.s[0] +ldr q7, [x17, #+256] +ldr q24, [x17, #+272] +sqrdmulh v10.4S, v20.4S, v30.s[2] +sub v21.4s, v19.4s, v29.4s +str q18, [x0, #0] +mul v11.4S, v11.4S,v3.s[1] +add v19.4s, v19.4s, v29.4s +mul v26.4S, v26.4S,v3.s[2] +str q21, [x0, #48] +mla v11.4S, v4.4S, v31.s[0] +sub v4.4s, v13.4s, v17.4s +mla v26.4S, v6.4S, v31.s[0] +str q19, [x0, #32] +mul v14.4S, v14.4S,v28.s[1] +str q4, [x0, #80] +mul v20.4S, v20.4S,v28.s[2] +add v13.4s, v13.4s, v17.4s +str q13, [x0, #64] +mla v14.4S, v8.4S, v31.s[0] +sub v8.4s, v1.4s, v5.4s +str q8, [x0, #112] +mla v20.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v5.4s +str q1, [x0, #96] +sqrdmulh v30.4S, v15.4S, v24.s[0] +sub v28.4s, v27.4s, v11.4s +mul v15.4S, v15.4S,v7.s[0] +str q28, [x0, #144] +ldr q28, [x0, #304] +sqrdmulh v1.4S, v28.4S, v24.s[0] +add v27.4s, v27.4s, v11.4s +mul v28.4S, v28.4S,v7.s[0] +str q27, [x0, #128] +ldr q27, [x17, #+288] +ldr q11, [x17, #+304] +ldr q5, [x0, #352] +sqrdmulh v10.4S, v5.4S, v11.s[0] +sub v8.4s, v9.4s, v26.4s +mul v5.4S, v5.4S,v27.s[0] +str q8, [x0, #176] +ldr q8, [x0, #368] +sqrdmulh v13.4S, v8.4S, v11.s[0] +add v9.4s, v9.4s, v26.4s +mul v8.4S, v8.4S,v27.s[0] +str q9, [x0, #160] +ldr q9, [x17, #+320] +ldr q26, [x17, #+336] +mla v15.4S, v30.4S, v31.s[0] +sub v30.4s, v16.4s, v14.4s +sqrdmulh v17.4S, v23.4S, v26.s[0] 
+str q30, [x0, #208] +ldr q30, [x0, #432] +mla v28.4S, v1.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v30.4S, v26.s[0] +str q16, [x0, #192] +ldr q16, [x17, #+352] +ldr q1, [x17, #+368] +mla v5.4S, v10.4S, v31.s[0] +sub v10.4s, v25.4s, v20.4s +sqrdmulh v4.4S, v22.4S, v1.s[0] +str q10, [x0, #240] +ldr q10, [x0, #496] +mla v8.4S, v13.4S, v31.s[0] +add v25.4s, v25.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v1.s[0] +str q25, [x0, #224] +ldr q25, [x0, #256] +ldr q13, [x0, #384] +mul v23.4S, v23.4S,v9.s[0] +sub v2.4s, v25.4s, v15.4s +ldr q3, [x0, #272] +mul v30.4S, v30.4S,v9.s[0] +add v25.4s, v25.4s, v15.4s +ldr q15, [x0, #400] +mla v23.4S, v17.4S, v31.s[0] +sub v17.4s, v3.4s, v28.4s +ldr q19, [x0, #320] +mla v30.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v28.4s +ldr q28, [x0, #448] +mul v22.4S, v22.4S,v16.s[0] +sub v14.4s, v19.4s, v5.4s +ldr q6, [x0, #336] +mul v10.4S, v10.4S,v16.s[0] +add v19.4s, v19.4s, v5.4s +ldr q5, [x0, #464] +mla v22.4S, v4.4S, v31.s[0] +mla v10.4S, v20.4S, v31.s[0] +sub v20.4s, v6.4s, v8.4s +sqrdmulh v4.4S, v3.4S, v24.s[1] +add v6.4s, v6.4s, v8.4s +mul v3.4S, v3.4S,v7.s[1] +sqrdmulh v8.4S, v17.4S, v24.s[2] +sub v21.4s, v13.4s, v23.4s +mul v17.4S, v17.4S,v7.s[2] +add v13.4s, v13.4s, v23.4s +sqrdmulh v24.4S, v6.4S, v11.s[1] +sub v7.4s, v15.4s, v30.4s +mul v6.4S, v6.4S,v27.s[1] +add v15.4s, v15.4s, v30.4s +sqrdmulh v30.4S, v20.4S, v11.s[2] +sub v23.4s, v28.4s, v22.4s +mul v20.4S, v20.4S,v27.s[2] +add v28.4s, v28.4s, v22.4s +mla v3.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v10.4s +ldr q11, [x0, #736] +sqrdmulh v27.4S, v15.4S, v26.s[1] +add v5.4s, v5.4s, v10.4s +mla v17.4S, v8.4S, v31.s[0] +ldr q8, [x0, #672] +sqrdmulh v10.4S, v7.4S, v26.s[2] +sub v22.4s, v25.4s, v3.4s +mla v6.4S, v24.4S, v31.s[0] +ldr q24, [x0, #544] +sqrdmulh v29.4S, v5.4S, v1.s[1] +add v25.4s, v25.4s, v3.4s +str q22, [x0, #272] +mla v20.4S, v30.4S, v31.s[0] +ldr q30, [x17, #+384] +ldr q22, [x17, #+400] +sqrdmulh v3.4S, v4.4S, v1.s[2] +sub v18.4s, v2.4s, v17.4s +str q25, [x0, 
#256] +mul v15.4S, v15.4S,v9.s[1] +add v2.4s, v2.4s, v17.4s +mul v7.4S, v7.4S,v9.s[2] +str q18, [x0, #304] +mla v15.4S, v27.4S, v31.s[0] +sub v27.4s, v19.4s, v6.4s +mla v7.4S, v10.4S, v31.s[0] +str q2, [x0, #288] +mul v5.4S, v5.4S,v16.s[1] +str q27, [x0, #336] +mul v4.4S, v4.4S,v16.s[2] +add v19.4s, v19.4s, v6.4s +str q19, [x0, #320] +mla v5.4S, v29.4S, v31.s[0] +sub v29.4s, v14.4s, v20.4s +str q29, [x0, #368] +mla v4.4S, v3.4S, v31.s[0] +add v14.4s, v14.4s, v20.4s +str q14, [x0, #352] +sqrdmulh v1.4S, v24.4S, v22.s[0] +sub v16.4s, v13.4s, v15.4s +mul v24.4S, v24.4S,v30.s[0] +str q16, [x0, #400] +ldr q16, [x0, #560] +sqrdmulh v14.4S, v16.4S, v22.s[0] +add v13.4s, v13.4s, v15.4s +mul v16.4S, v16.4S,v30.s[0] +str q13, [x0, #384] +ldr q13, [x17, #+416] +ldr q15, [x17, #+432] +ldr q20, [x0, #608] +sqrdmulh v3.4S, v20.4S, v15.s[0] +sub v29.4s, v21.4s, v7.4s +mul v20.4S, v20.4S,v13.s[0] +str q29, [x0, #432] +ldr q29, [x0, #624] +sqrdmulh v19.4S, v29.4S, v15.s[0] +add v21.4s, v21.4s, v7.4s +mul v29.4S, v29.4S,v13.s[0] +str q21, [x0, #416] +ldr q21, [x17, #+448] +ldr q7, [x17, #+464] +mla v24.4S, v1.4S, v31.s[0] +sub v1.4s, v28.4s, v5.4s +sqrdmulh v6.4S, v8.4S, v7.s[0] +str q1, [x0, #464] +ldr q1, [x0, #688] +mla v16.4S, v14.4S, v31.s[0] +add v28.4s, v28.4s, v5.4s +sqrdmulh v5.4S, v1.4S, v7.s[0] +str q28, [x0, #448] +ldr q28, [x17, #+480] +ldr q14, [x17, #+496] +mla v20.4S, v3.4S, v31.s[0] +sub v3.4s, v23.4s, v4.4s +sqrdmulh v27.4S, v11.4S, v14.s[0] +str q3, [x0, #496] +ldr q3, [x0, #752] +mla v29.4S, v19.4S, v31.s[0] +add v23.4s, v23.4s, v4.4s +sqrdmulh v4.4S, v3.4S, v14.s[0] +str q23, [x0, #480] +ldr q23, [x0, #512] +ldr q19, [x0, #640] +mul v8.4S, v8.4S,v21.s[0] +sub v26.4s, v23.4s, v24.4s +ldr q9, [x0, #528] +mul v1.4S, v1.4S,v21.s[0] +add v23.4s, v23.4s, v24.4s +ldr q24, [x0, #656] +mla v8.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v16.4s +ldr q2, [x0, #576] +mla v1.4S, v5.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +ldr q16, [x0, #704] +mul v11.4S, v11.4S,v28.s[0] +sub v5.4s, 
v2.4s, v20.4s +ldr q10, [x0, #592] +mul v3.4S, v3.4S,v28.s[0] +add v2.4s, v2.4s, v20.4s +ldr q20, [x0, #720] +mla v11.4S, v27.4S, v31.s[0] +mla v3.4S, v4.4S, v31.s[0] +sub v4.4s, v10.4s, v29.4s +sqrdmulh v27.4S, v9.4S, v22.s[1] +add v10.4s, v10.4s, v29.4s +mul v9.4S, v9.4S,v30.s[1] +sqrdmulh v29.4S, v6.4S, v22.s[2] +sub v18.4s, v19.4s, v8.4s +mul v6.4S, v6.4S,v30.s[2] +add v19.4s, v19.4s, v8.4s +sqrdmulh v22.4S, v10.4S, v15.s[1] +sub v30.4s, v24.4s, v1.4s +mul v10.4S, v10.4S,v13.s[1] +add v24.4s, v24.4s, v1.4s +sqrdmulh v1.4S, v4.4S, v15.s[2] +sub v8.4s, v16.4s, v11.4s +mul v4.4S, v4.4S,v13.s[2] +add v16.4s, v16.4s, v11.4s +mla v9.4S, v27.4S, v31.s[0] +sub v27.4s, v20.4s, v3.4s +ldr q15, [x0, #992] +sqrdmulh v13.4S, v24.4S, v7.s[1] +add v20.4s, v20.4s, v3.4s +mla v6.4S, v29.4S, v31.s[0] +ldr q29, [x0, #928] +sqrdmulh v3.4S, v30.4S, v7.s[2] +sub v11.4s, v23.4s, v9.4s +mla v10.4S, v22.4S, v31.s[0] +ldr q22, [x0, #800] +sqrdmulh v17.4S, v20.4S, v14.s[1] +add v23.4s, v23.4s, v9.4s +str q11, [x0, #528] +mla v4.4S, v1.4S, v31.s[0] +ldr q1, [x17, #+512] +ldr q11, [x17, #+528] +sqrdmulh v9.4S, v27.4S, v14.s[2] +sub v25.4s, v26.4s, v6.4s +str q23, [x0, #512] +mul v24.4S, v24.4S,v21.s[1] +add v26.4s, v26.4s, v6.4s +mul v30.4S, v30.4S,v21.s[2] +str q25, [x0, #560] +mla v24.4S, v13.4S, v31.s[0] +sub v13.4s, v2.4s, v10.4s +mla v30.4S, v3.4S, v31.s[0] +str q26, [x0, #544] +mul v20.4S, v20.4S,v28.s[1] +str q13, [x0, #592] +mul v27.4S, v27.4S,v28.s[2] +add v2.4s, v2.4s, v10.4s +str q2, [x0, #576] +mla v20.4S, v17.4S, v31.s[0] +sub v17.4s, v5.4s, v4.4s +str q17, [x0, #624] +mla v27.4S, v9.4S, v31.s[0] +add v5.4s, v5.4s, v4.4s +str q5, [x0, #608] +sqrdmulh v14.4S, v22.4S, v11.s[0] +sub v28.4s, v19.4s, v24.4s +mul v22.4S, v22.4S,v1.s[0] +str q28, [x0, #656] +ldr q28, [x0, #816] +sqrdmulh v5.4S, v28.4S, v11.s[0] +add v19.4s, v19.4s, v24.4s +mul v28.4S, v28.4S,v1.s[0] +str q19, [x0, #640] +ldr q19, [x17, #+544] +ldr q24, [x17, #+560] +ldr q4, [x0, #864] +sqrdmulh v9.4S, v4.4S, v24.s[0] 
+sub v17.4s, v18.4s, v30.4s +mul v4.4S, v4.4S,v19.s[0] +str q17, [x0, #688] +ldr q17, [x0, #880] +sqrdmulh v2.4S, v17.4S, v24.s[0] +add v18.4s, v18.4s, v30.4s +mul v17.4S, v17.4S,v19.s[0] +str q18, [x0, #672] +ldr q18, [x17, #+576] +ldr q30, [x17, #+592] +mla v22.4S, v14.4S, v31.s[0] +sub v14.4s, v16.4s, v20.4s +sqrdmulh v10.4S, v29.4S, v30.s[0] +str q14, [x0, #720] +ldr q14, [x0, #944] +mla v28.4S, v5.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v14.4S, v30.s[0] +str q16, [x0, #704] +ldr q16, [x17, #+608] +ldr q5, [x17, #+624] +mla v4.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v27.4s +sqrdmulh v13.4S, v15.4S, v5.s[0] +str q9, [x0, #752] +ldr q9, [x0, #1008] +mla v17.4S, v2.4S, v31.s[0] +add v8.4s, v8.4s, v27.4s +sqrdmulh v27.4S, v9.4S, v5.s[0] +str q8, [x0, #736] +ldr q8, [x0, #768] +ldr q2, [x0, #896] +mul v29.4S, v29.4S,v18.s[0] +sub v7.4s, v8.4s, v22.4s +ldr q21, [x0, #784] +mul v14.4S, v14.4S,v18.s[0] +add v8.4s, v8.4s, v22.4s +ldr q22, [x0, #912] +mla v29.4S, v10.4S, v31.s[0] +sub v10.4s, v21.4s, v28.4s +ldr q26, [x0, #832] +mla v14.4S, v20.4S, v31.s[0] +add v21.4s, v21.4s, v28.4s +ldr q28, [x0, #960] +mul v15.4S, v15.4S,v16.s[0] +sub v20.4s, v26.4s, v4.4s +ldr q3, [x0, #848] +mul v9.4S, v9.4S,v16.s[0] +add v26.4s, v26.4s, v4.4s +ldr q4, [x0, #976] +mla v15.4S, v13.4S, v31.s[0] +mla v9.4S, v27.4S, v31.s[0] +sub v27.4s, v3.4s, v17.4s +sqrdmulh v13.4S, v21.4S, v11.s[1] +add v3.4s, v3.4s, v17.4s +mul v21.4S, v21.4S,v1.s[1] +sqrdmulh v17.4S, v10.4S, v11.s[2] +sub v25.4s, v2.4s, v29.4s +mul v10.4S, v10.4S,v1.s[2] +add v2.4s, v2.4s, v29.4s +sqrdmulh v11.4S, v3.4S, v24.s[1] +sub v1.4s, v22.4s, v14.4s +mul v3.4S, v3.4S,v19.s[1] +add v22.4s, v22.4s, v14.4s +sqrdmulh v14.4S, v27.4S, v24.s[2] +sub v29.4s, v28.4s, v15.4s +mul v27.4S, v27.4S,v19.s[2] +add v28.4s, v28.4s, v15.4s +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v4.4s, v9.4s +sqrdmulh v24.4S, v22.4S, v30.s[1] +add v4.4s, v4.4s, v9.4s +mla v10.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v1.4S, v30.s[2] +sub 
v9.4s, v8.4s, v21.4s +mla v3.4S, v11.4S, v31.s[0] +sqrdmulh v11.4S, v4.4S, v5.s[1] +add v8.4s, v8.4s, v21.4s +str q9, [x0, #784] +mla v27.4S, v14.4S, v31.s[0] +sqrdmulh v14.4S, v13.4S, v5.s[2] +sub v9.4s, v7.4s, v10.4s +str q8, [x0, #768] +mul v22.4S, v22.4S,v18.s[1] +add v7.4s, v7.4s, v10.4s +mul v1.4S, v1.4S,v18.s[2] +str q9, [x0, #816] +mla v22.4S, v24.4S, v31.s[0] +sub v24.4s, v26.4s, v3.4s +mla v1.4S, v17.4S, v31.s[0] +str q7, [x0, #800] +mul v4.4S, v4.4S,v16.s[1] +str q24, [x0, #848] +mul v13.4S, v13.4S,v16.s[2] +add v26.4s, v26.4s, v3.4s +str q26, [x0, #832] +mla v4.4S, v11.4S, v31.s[0] +sub v11.4s, v20.4s, v27.4s +str q11, [x0, #880] +mla v13.4S, v14.4S, v31.s[0] +add v20.4s, v20.4s, v27.4s +str q20, [x0, #864] +sub v5.4s, v2.4s, v22.4s +str q5, [x0, #912] +add v2.4s, v2.4s, v22.4s +str q2, [x0, #896] +sub v2.4s, v25.4s, v1.4s +str q2, [x0, #944] +add v25.4s, v25.4s, v1.4s +str q25, [x0, #928] +sub v25.4s, v28.4s, v4.4s +str q25, [x0, #976] +add v28.4s, v28.4s, v4.4s +str q28, [x0, #960] +sub v28.4s, v29.4s, v13.4s +str q28, [x0, #1008] +add v29.4s, v29.4s, v13.4s +str q29, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1520 +// Instruction count: 1516 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_8.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_8.s new file mode 100644 index 0000000..a3ac527 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_8.s @@ -0,0 +1,1550 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free 
of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global 
ntt_u32_incomplete_neon_asm_var_4_2_22_z4_8 +.global _ntt_u32_incomplete_neon_asm_var_4_2_22_z4_8 +ntt_u32_incomplete_neon_asm_var_4_2_22_z4_8: +_ntt_u32_incomplete_neon_asm_var_4_2_22_z4_8: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x0, #992] +sqrdmulh v27.4S, v28.4S, v29.s[0] +mul v28.4S, v28.4S,v30.s[0] +ldr q26, [x0, #928] +sqrdmulh v25.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +ldr q24, [x0, #864] +sqrdmulh v23.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +ldr q22, [x0, #800] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #736] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mla v28.4S, v27.4S, v31.s[0] +ldr q27, [x0, #672] +sqrdmulh v18.4S, v27.4S, v29.s[0] +mla v26.4S, v25.4S, v31.s[0] +ldr q25, [x0, #608] +sqrdmulh v17.4S, v25.4S, v29.s[0] +mla v24.4S, v23.4S, v31.s[0] +ldr q23, [x0, #544] +sqrdmulh v16.4S, v23.4S, v29.s[0] +mla v22.4S, v21.4S, v31.s[0] +ldr q21, [x0, #480] +mul v27.4S, v27.4S,v30.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q3, [x0, #416] +ldr q2, [x0, #352] +ldr q1, [x0, #288] +mla v27.4S, v18.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +ldr q19, [x0, #224] +ldr q18, [x0, #160] +mul v23.4S, v23.4S,v30.s[0] +mul v25.4S, v25.4S,v30.s[0] +ldr q0, [x0, #96] +ldr q15, [x0, #32] +mla v23.4S, v16.4S, v31.s[0] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +sqrdmulh v28.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v16.4s, v3.4s, v26.4s +add v3.4s, v3.4s, v26.4s +sqrdmulh v26.4S, v16.4S, v29.s[2] +mul v16.4S, 
v16.4S,v30.s[2] +sub v14.4s, v2.4s, v24.4s +add v2.4s, v2.4s, v24.4s +sqrdmulh v24.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v13.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v12.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +sqrdmulh v20.4S, v14.4S, v29.s[2] +mla v17.4S, v28.4S, v31.s[0] +sub v28.4s, v18.4s, v27.4s +add v18.4s, v18.4s, v27.4s +sqrdmulh v27.4S, v13.4S, v29.s[2] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v0.4s, v25.4s +add v0.4s, v0.4s, v25.4s +sqrdmulh v25.4S, v2.4S, v29.s[1] +mla v21.4S, v24.4S, v31.s[0] +sub v24.4s, v15.4s, v23.4s +sqrdmulh v11.4S, v1.4S, v29.s[1] +mla v3.4S, v22.4S, v31.s[0] +add v15.4s, v15.4s, v23.4s +ldr q23, [x17, #+32] +ldr q22, [x17, #+48] +mul v13.4S, v13.4S,v30.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v10.4s, v12.4s, v17.4s +add v12.4s, v12.4s, v17.4s +mla v13.4S, v27.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +sub v20.4s, v28.4s, v16.4s +add v28.4s, v28.4s, v16.4s +mul v1.4S, v1.4S,v30.s[1] +mul v2.4S, v2.4S,v30.s[1] +sub v16.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +mla v1.4S, v11.4S, v31.s[0] +mla v2.4S, v25.4S, v31.s[0] +sub v25.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v22.s[3] +mul v10.4S, v10.4S,v23.s[3] +sub v11.4s, v26.4s, v14.4s +add v26.4s, v26.4s, v14.4s +sqrdmulh v14.4S, v12.4S, v22.s[2] +mul v12.4S, v12.4S,v23.s[2] +sub v21.4s, v24.4s, v13.4s +add v24.4s, v24.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v22.s[1] +mul v16.4S, v16.4S,v23.s[1] +sub v27.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v17.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s +ldr q1, [x17, #+96] +ldr q9, [x17, #+112] +sqrdmulh v8.4S, v20.4S, v22.s[3] +mla v10.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v28.4S, v22.s[2] +mla v12.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v25.4S, v22.s[1] +mla v16.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v18.4S, v22.s[0] +mla 
v19.4S, v2.4S, v31.s[0] +nop +nop +ldr q2, [x17, #+64] +ldr q7, [x17, #+80] +mul v28.4S, v28.4S,v23.s[2] +mul v20.4S, v20.4S,v23.s[3] +sub v6.4s, v11.4s, v10.4s +add v11.4s, v11.4s, v10.4s +mla v28.4S, v3.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v26.4s, v12.4s +add v26.4s, v26.4s, v12.4s +mul v18.4S, v18.4S,v23.s[0] +mul v25.4S, v25.4S,v23.s[1] +sub v12.4s, v27.4s, v16.4s +add v27.4s, v27.4s, v16.4s +mla v18.4S, v13.4S, v31.s[0] +mla v25.4S, v14.4S, v31.s[0] +sub v14.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v9.s[3] +mul v6.4S, v6.4S,v1.s[3] +sub v13.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v11.4S, v9.s[2] +mul v11.4S, v11.4S,v1.s[2] +sub v16.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +sqrdmulh v28.4S, v8.4S, v9.s[1] +mul v8.4S, v8.4S,v1.s[1] +sub v3.4s, v17.4s, v25.4s +add v17.4s, v17.4s, v25.4s +sqrdmulh v25.4S, v26.4S, v9.s[0] +mul v26.4S, v26.4S,v1.s[0] +sub v10.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v7.s[3] +mla v6.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v27.4S, v7.s[2] +mla v11.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v14.4S, v7.s[1] +mla v8.4S, v28.4S, v31.s[0] +nop +nop +sqrdmulh v28.4S, v0.4S, v7.s[0] +mla v26.4S, v25.4S, v31.s[0] +nop +nop +mul v27.4S, v27.4S,v2.s[2] +mul v12.4S, v12.4S,v2.s[3] +sub v25.4s, v13.4s, v6.4s +str q25, [x0, #992] +mla v27.4S, v19.4S, v31.s[0] +mla v12.4S, v18.4S, v31.s[0] +add v13.4s, v13.4s, v6.4s +str q13, [x0, #928] +mul v0.4S, v0.4S,v2.s[0] +mul v14.4S, v14.4S,v2.s[1] +sub v13.4s, v21.4s, v11.4s +str q13, [x0, #864] +mla v0.4S, v28.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sub v11.4s, v16.4s, v8.4s +ldr q20, [x0, #1008] +sqrdmulh v28.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v8.4s +str q21, [x0, #800] +ldr q21, [x0, #944] +sqrdmulh v8.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v13.4s, v24.4s, v26.4s +str q11, [x0, #736] +ldr q11, [x0, 
#880] +sqrdmulh v6.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +add v24.4s, v24.4s, v26.4s +str q16, [x0, #672] +ldr q16, [x0, #816] +sqrdmulh v26.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v18.4s, v3.4s, v12.4s +str q13, [x0, #608] +ldr q13, [x0, #752] +sqrdmulh v19.4S, v13.4S, v29.s[0] +mla v20.4S, v28.4S, v31.s[0] +add v3.4s, v3.4s, v12.4s +str q24, [x0, #544] +ldr q24, [x0, #688] +sqrdmulh v12.4S, v24.4S, v29.s[0] +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v17.4s, v27.4s +str q18, [x0, #480] +ldr q18, [x0, #624] +sqrdmulh v28.4S, v18.4S, v29.s[0] +mla v11.4S, v6.4S, v31.s[0] +add v17.4s, v17.4s, v27.4s +str q3, [x0, #416] +ldr q3, [x0, #560] +sqrdmulh v27.4S, v3.4S, v29.s[0] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v10.4s, v14.4s +str q8, [x0, #352] +ldr q8, [x0, #496] +add v10.4s, v10.4s, v14.4s +mul v24.4S, v24.4S,v30.s[0] +mul v13.4S, v13.4S,v30.s[0] +ldr q14, [x0, #432] +str q17, [x0, #288] +ldr q17, [x0, #368] +ldr q6, [x0, #304] +mla v24.4S, v12.4S, v31.s[0] +mla v13.4S, v19.4S, v31.s[0] +str q26, [x0, #224] +sub v26.4s, v15.4s, v0.4s +ldr q19, [x0, #240] +ldr q12, [x0, #176] +mul v3.4S, v3.4S,v30.s[0] +mul v18.4S, v18.4S,v30.s[0] +str q10, [x0, #160] +add v15.4s, v15.4s, v0.4s +ldr q0, [x0, #112] +ldr q10, [x0, #48] +mla v3.4S, v27.4S, v31.s[0] +mla v18.4S, v28.4S, v31.s[0] +sub v28.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +sqrdmulh v20.4S, v28.4S, v29.s[2] +mul v28.4S, v28.4S,v30.s[2] +sub v27.4s, v14.4s, v21.4s +add v14.4s, v14.4s, v21.4s +sqrdmulh v21.4S, v27.4S, v29.s[2] +mul v27.4S, v27.4S,v30.s[2] +sub v25.4s, v17.4s, v11.4s +add v17.4s, v17.4s, v11.4s +sqrdmulh v11.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v5.4s, v6.4s, v16.4s +add v6.4s, v6.4s, v16.4s +sqrdmulh v16.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +sub v4.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +sqrdmulh v13.4S, v25.4S, v29.s[2] +mla v28.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v24.4s +add v12.4s, v12.4s, v24.4s +sqrdmulh v24.4S, v5.4S, 
v29.s[2] +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v18.4s +add v0.4s, v0.4s, v18.4s +sqrdmulh v18.4S, v17.4S, v29.s[1] +mla v8.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v3.4s +str q26, [x0, #96] +sqrdmulh v26.4S, v6.4S, v29.s[1] +mla v14.4S, v16.4S, v31.s[0] +add v10.4s, v10.4s, v3.4s +str q15, [x0, #32] +mul v5.4S, v5.4S,v30.s[2] +mul v25.4S, v25.4S,v30.s[2] +sub v15.4s, v4.4s, v28.4s +add v4.4s, v4.4s, v28.4s +mla v5.4S, v24.4S, v31.s[0] +mla v25.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v27.4s +add v20.4s, v20.4s, v27.4s +mul v6.4S, v6.4S,v30.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v27.4s, v19.4s, v8.4s +add v19.4s, v19.4s, v8.4s +mla v6.4S, v26.4S, v31.s[0] +mla v17.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v14.4s +add v12.4s, v12.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v22.s[3] +mul v15.4S, v15.4S,v23.s[3] +sub v26.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v4.4S, v22.s[2] +mul v4.4S, v4.4S,v23.s[2] +sub v8.4s, v11.4s, v5.4s +add v11.4s, v11.4s, v5.4s +sqrdmulh v5.4S, v27.4S, v22.s[1] +mul v27.4S, v27.4S,v23.s[1] +sub v24.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v28.4s, v10.4s, v6.4s +add v10.4s, v10.4s, v6.4s +sqrdmulh v6.4S, v13.4S, v22.s[3] +mla v15.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v20.4S, v22.s[2] +mla v4.4S, v25.4S, v31.s[0] +nop +nop +sqrdmulh v25.4S, v18.4S, v22.s[1] +mla v27.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v12.4S, v22.s[0] +mla v19.4S, v17.4S, v31.s[0] +nop +nop +mul v20.4S, v20.4S,v23.s[2] +mul v13.4S, v13.4S,v23.s[3] +sub v17.4s, v26.4s, v15.4s +add v26.4s, v26.4s, v15.4s +mla v20.4S, v14.4S, v31.s[0] +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v21.4s, v4.4s +add v21.4s, v21.4s, v4.4s +mul v12.4S, v12.4S,v23.s[0] +mul v18.4S, v18.4S,v23.s[1] +sub v4.4s, v24.4s, v27.4s +add v24.4s, v24.4s, v27.4s +mla v12.4S, v5.4S, v31.s[0] +mla v18.4S, v25.4S, v31.s[0] +sub v25.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, 
v17.4S, v9.s[3] +mul v17.4S, v17.4S,v1.s[3] +sub v5.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +sqrdmulh v13.4S, v26.4S, v9.s[2] +mul v26.4S, v26.4S,v1.s[2] +sub v27.4s, v11.4s, v20.4s +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v6.4S, v9.s[1] +mul v6.4S, v6.4S,v1.s[1] +sub v14.4s, v28.4s, v18.4s +add v28.4s, v28.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v9.s[0] +mul v21.4S, v21.4S,v1.s[0] +sub v15.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v4.4S, v7.s[3] +mla v17.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v24.4S, v7.s[2] +mla v26.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v25.4S, v7.s[1] +mla v6.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v0.4S, v7.s[0] +mla v21.4S, v18.4S, v31.s[0] +nop +nop +mul v24.4S, v24.4S,v2.s[2] +mul v4.4S, v4.4S,v2.s[3] +sub v18.4s, v5.4s, v17.4s +str q18, [x0, #1008] +mla v24.4S, v19.4S, v31.s[0] +mla v4.4S, v12.4S, v31.s[0] +add v5.4s, v5.4s, v17.4s +str q5, [x0, #944] +mul v0.4S, v0.4S,v2.s[0] +mul v25.4S, v25.4S,v2.s[1] +sub v5.4s, v8.4s, v26.4s +str q5, [x0, #880] +mla v0.4S, v20.4S, v31.s[0] +mla v25.4S, v13.4S, v31.s[0] +add v8.4s, v8.4s, v26.4s +sub v26.4s, v27.4s, v6.4s +ldr q13, [x0, #960] +sqrdmulh v20.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +add v27.4s, v27.4s, v6.4s +str q8, [x0, #816] +ldr q8, [x0, #896] +sqrdmulh v6.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v5.4s, v11.4s, v21.4s +str q26, [x0, #752] +ldr q26, [x0, #832] +sqrdmulh v17.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +add v11.4s, v11.4s, v21.4s +str q27, [x0, #688] +ldr q27, [x0, #768] +sqrdmulh v21.4S, v27.4S, v29.s[0] +mul v27.4S, v27.4S,v30.s[0] +sub v12.4s, v14.4s, v4.4s +str q5, [x0, #624] +ldr q5, [x0, #704] +sqrdmulh v19.4S, v5.4S, v29.s[0] +mla v13.4S, v20.4S, v31.s[0] +add v14.4s, v14.4s, v4.4s +str q11, [x0, #560] +ldr q11, [x0, #640] +sqrdmulh v4.4S, v11.4S, v29.s[0] +mla v8.4S, v6.4S, v31.s[0] +sub v6.4s, v28.4s, v24.4s +str q12, [x0, #496] +ldr q12, [x0, #576] +sqrdmulh v20.4S, v12.4S, 
v29.s[0] +mla v26.4S, v17.4S, v31.s[0] +add v28.4s, v28.4s, v24.4s +str q14, [x0, #432] +ldr q14, [x0, #512] +sqrdmulh v24.4S, v14.4S, v29.s[0] +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v25.4s +str q6, [x0, #368] +ldr q6, [x0, #448] +add v15.4s, v15.4s, v25.4s +mul v11.4S, v11.4S,v30.s[0] +mul v5.4S, v5.4S,v30.s[0] +ldr q25, [x0, #384] +str q28, [x0, #304] +ldr q28, [x0, #320] +ldr q17, [x0, #256] +mla v11.4S, v4.4S, v31.s[0] +mla v5.4S, v19.4S, v31.s[0] +str q21, [x0, #240] +sub v21.4s, v10.4s, v0.4s +ldr q19, [x0, #192] +ldr q4, [x0, #128] +mul v14.4S, v14.4S,v30.s[0] +mul v12.4S, v12.4S,v30.s[0] +str q15, [x0, #176] +add v10.4s, v10.4s, v0.4s +ldr q0, [x0, #64] +ldr q15, [x0, #0] +mla v14.4S, v24.4S, v31.s[0] +mla v12.4S, v20.4S, v31.s[0] +sub v20.4s, v6.4s, v13.4s +add v6.4s, v6.4s, v13.4s +sqrdmulh v13.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v24.4s, v25.4s, v8.4s +add v25.4s, v25.4s, v8.4s +sqrdmulh v8.4S, v24.4S, v29.s[2] +mul v24.4S, v24.4S,v30.s[2] +sub v18.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sqrdmulh v26.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +sub v3.4s, v17.4s, v27.4s +add v17.4s, v17.4s, v27.4s +sqrdmulh v27.4S, v25.4S, v29.s[1] +mul v25.4S, v25.4S,v30.s[1] +sub v16.4s, v19.4s, v5.4s +add v19.4s, v19.4s, v5.4s +sqrdmulh v5.4S, v18.4S, v29.s[2] +mla v20.4S, v13.4S, v31.s[0] +sub v13.4s, v4.4s, v11.4s +add v4.4s, v4.4s, v11.4s +sqrdmulh v11.4S, v3.4S, v29.s[2] +mla v24.4S, v8.4S, v31.s[0] +sub v8.4s, v0.4s, v12.4s +add v0.4s, v0.4s, v12.4s +sqrdmulh v12.4S, v28.4S, v29.s[1] +mla v6.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v14.4s +str q21, [x0, #112] +sqrdmulh v21.4S, v17.4S, v29.s[1] +mla v25.4S, v27.4S, v31.s[0] +add v15.4s, v15.4s, v14.4s +str q10, [x0, #48] +mul v3.4S, v3.4S,v30.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v10.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +mla v3.4S, v11.4S, v31.s[0] +mla v18.4S, v5.4S, v31.s[0] +sub v5.4s, v13.4s, v24.4s +add v13.4s, v13.4s, v24.4s +mul v17.4S, v17.4S,v30.s[1] 
+mul v28.4S, v28.4S,v30.s[1] +sub v24.4s, v19.4s, v6.4s +add v19.4s, v19.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v28.4S, v12.4S, v31.s[0] +sub v12.4s, v4.4s, v25.4s +add v4.4s, v4.4s, v25.4s +sqrdmulh v25.4S, v10.4S, v22.s[3] +mul v10.4S, v10.4S,v23.s[3] +sub v21.4s, v8.4s, v18.4s +add v8.4s, v8.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v22.s[2] +mul v16.4S, v16.4S,v23.s[2] +sub v6.4s, v26.4s, v3.4s +add v26.4s, v26.4s, v3.4s +sqrdmulh v3.4S, v24.4S, v22.s[1] +mul v24.4S, v24.4S,v23.s[1] +sub v11.4s, v0.4s, v28.4s +add v0.4s, v0.4s, v28.4s +sqrdmulh v28.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v20.4s, v15.4s, v17.4s +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v22.s[3] +mla v10.4S, v25.4S, v31.s[0] +nop +nop +sqrdmulh v25.4S, v13.4S, v22.s[2] +mla v16.4S, v18.4S, v31.s[0] +nop +nop +sqrdmulh v18.4S, v12.4S, v22.s[1] +mla v24.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v4.4S, v22.s[0] +mla v19.4S, v28.4S, v31.s[0] +nop +nop +mul v13.4S, v13.4S,v23.s[2] +mul v5.4S, v5.4S,v23.s[3] +sub v28.4s, v21.4s, v10.4s +add v21.4s, v21.4s, v10.4s +mla v13.4S, v25.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +sub v17.4s, v8.4s, v16.4s +add v8.4s, v8.4s, v16.4s +mul v4.4S, v4.4S,v23.s[0] +mul v12.4S, v12.4S,v23.s[1] +sub v16.4s, v11.4s, v24.4s +add v11.4s, v11.4s, v24.4s +mla v4.4S, v3.4S, v31.s[0] +mla v12.4S, v18.4S, v31.s[0] +sub v18.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v28.4S, v9.s[3] +mul v28.4S, v28.4S,v1.s[3] +sub v3.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v21.4S, v9.s[2] +mul v21.4S, v21.4S,v1.s[2] +sub v24.4s, v26.4s, v13.4s +add v26.4s, v26.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v9.s[1] +mul v17.4S, v17.4S,v1.s[1] +sub v25.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +sqrdmulh v12.4S, v8.4S, v9.s[0] +mul v8.4S, v8.4S,v1.s[0] +sub v10.4s, v15.4s, v4.4s +add v15.4s, v15.4s, v4.4s +sqrdmulh v4.4S, v16.4S, v7.s[3] +mla v28.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v11.4S, v7.s[2] +mla v21.4S, v5.4S, 
v31.s[0] +nop +nop +sqrdmulh v5.4S, v18.4S, v7.s[1] +mla v17.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v0.4S, v7.s[0] +mla v8.4S, v12.4S, v31.s[0] +nop +nop +mul v11.4S, v11.4S,v2.s[2] +mul v16.4S, v16.4S,v2.s[3] +sub v12.4s, v3.4s, v28.4s +str q12, [x0, #960] +mla v11.4S, v19.4S, v31.s[0] +mla v16.4S, v4.4S, v31.s[0] +add v3.4s, v3.4s, v28.4s +str q3, [x0, #896] +mul v0.4S, v0.4S,v2.s[0] +mul v18.4S, v18.4S,v2.s[1] +sub v3.4s, v6.4s, v21.4s +str q3, [x0, #832] +mla v0.4S, v13.4S, v31.s[0] +mla v18.4S, v5.4S, v31.s[0] +add v6.4s, v6.4s, v21.4s +sub v21.4s, v24.4s, v17.4s +ldr q5, [x0, #976] +sqrdmulh v13.4S, v5.4S, v29.s[0] +mul v5.4S, v5.4S,v30.s[0] +add v24.4s, v24.4s, v17.4s +str q6, [x0, #768] +ldr q6, [x0, #912] +sqrdmulh v17.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v3.4s, v26.4s, v8.4s +str q21, [x0, #704] +ldr q21, [x0, #848] +sqrdmulh v28.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +add v26.4s, v26.4s, v8.4s +str q24, [x0, #640] +ldr q24, [x0, #784] +sqrdmulh v8.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +sub v4.4s, v25.4s, v16.4s +str q3, [x0, #576] +ldr q3, [x0, #720] +sqrdmulh v19.4S, v3.4S, v29.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v25.4s, v25.4s, v16.4s +str q26, [x0, #512] +ldr q26, [x0, #656] +sqrdmulh v16.4S, v26.4S, v29.s[0] +mla v6.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v11.4s +str q4, [x0, #448] +ldr q4, [x0, #592] +sqrdmulh v13.4S, v4.4S, v29.s[0] +mla v21.4S, v28.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q25, [x0, #384] +ldr q25, [x0, #528] +sqrdmulh v11.4S, v25.4S, v29.s[0] +mla v24.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v18.4s +str q17, [x0, #320] +ldr q17, [x0, #464] +add v10.4s, v10.4s, v18.4s +mul v26.4S, v26.4S,v30.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q18, [x0, #400] +str q20, [x0, #256] +ldr q20, [x0, #336] +ldr q28, [x0, #272] +mla v26.4S, v16.4S, v31.s[0] +mla v3.4S, v19.4S, v31.s[0] +str q8, [x0, #192] +sub v8.4s, v15.4s, v0.4s +ldr q19, [x0, #208] +ldr q16, [x0, #144] +mul v25.4S, 
v25.4S,v30.s[0] +mul v4.4S, v4.4S,v30.s[0] +str q10, [x0, #128] +add v15.4s, v15.4s, v0.4s +ldr q0, [x0, #80] +ldr q10, [x0, #16] +mla v25.4S, v11.4S, v31.s[0] +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v5.4s +add v17.4s, v17.4s, v5.4s +sqrdmulh v5.4S, v13.4S, v29.s[2] +mul v13.4S, v13.4S,v30.s[2] +sub v11.4s, v18.4s, v6.4s +add v18.4s, v18.4s, v6.4s +sqrdmulh v6.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v12.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v28.4s, v24.4s +add v28.4s, v28.4s, v24.4s +sqrdmulh v24.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v27.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sqrdmulh v3.4S, v12.4S, v29.s[2] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v16.4s, v26.4s +add v16.4s, v16.4s, v26.4s +sqrdmulh v26.4S, v14.4S, v29.s[2] +mla v11.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v4.4s +add v0.4s, v0.4s, v4.4s +sqrdmulh v4.4S, v20.4S, v29.s[1] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v25.4s +str q8, [x0, #64] +sqrdmulh v8.4S, v28.4S, v29.s[1] +mla v18.4S, v24.4S, v31.s[0] +add v10.4s, v10.4s, v25.4s +str q15, [x0, #0] +mul v14.4S, v14.4S,v30.s[2] +mul v12.4S, v12.4S,v30.s[2] +sub v15.4s, v27.4s, v13.4s +add v27.4s, v27.4s, v13.4s +mla v14.4S, v26.4S, v31.s[0] +mla v12.4S, v3.4S, v31.s[0] +sub v3.4s, v5.4s, v11.4s +add v5.4s, v5.4s, v11.4s +mul v28.4S, v28.4S,v30.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v11.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +mla v28.4S, v8.4S, v31.s[0] +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v16.4s, v18.4s +add v16.4s, v16.4s, v18.4s +sqrdmulh v29.4S, v15.4S, v22.s[3] +mul v15.4S, v15.4S,v23.s[3] +sub v30.4s, v6.4s, v12.4s +add v6.4s, v6.4s, v12.4s +sqrdmulh v12.4S, v27.4S, v22.s[2] +mul v27.4S, v27.4S,v23.s[2] +sub v18.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v22.s[1] +mul v11.4S, v11.4S,v23.s[1] +sub v8.4s, v0.4s, v20.4s +add v0.4s, v0.4s, v20.4s +sqrdmulh v20.4S, 
v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v17.4s, v10.4s, v28.4s +add v10.4s, v10.4s, v28.4s +sqrdmulh v28.4S, v3.4S, v22.s[3] +mla v15.4S, v29.4S, v31.s[0] +nop +nop +sqrdmulh v29.4S, v5.4S, v22.s[2] +mla v27.4S, v12.4S, v31.s[0] +nop +nop +sqrdmulh v12.4S, v4.4S, v22.s[1] +mla v11.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v16.4S, v22.s[0] +mla v19.4S, v20.4S, v31.s[0] +nop +nop +mul v5.4S, v5.4S,v23.s[2] +mul v3.4S, v3.4S,v23.s[3] +sub v20.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +mla v5.4S, v29.4S, v31.s[0] +mla v3.4S, v28.4S, v31.s[0] +sub v28.4s, v6.4s, v27.4s +add v6.4s, v6.4s, v27.4s +mul v16.4S, v16.4S,v23.s[0] +mul v4.4S, v4.4S,v23.s[1] +sub v27.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v16.4S, v14.4S, v31.s[0] +mla v4.4S, v12.4S, v31.s[0] +sub v12.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v22.4S, v20.4S, v9.s[3] +mul v20.4S, v20.4S,v1.s[3] +sub v23.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v30.4S, v9.s[2] +mul v30.4S, v30.4S,v1.s[2] +sub v19.4s, v21.4s, v5.4s +add v21.4s, v21.4s, v5.4s +sqrdmulh v5.4S, v28.4S, v9.s[1] +mul v28.4S, v28.4S,v1.s[1] +sub v14.4s, v17.4s, v4.4s +add v17.4s, v17.4s, v4.4s +sqrdmulh v4.4S, v6.4S, v9.s[0] +mul v6.4S, v6.4S,v1.s[0] +sub v11.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sqrdmulh v9.4S, v27.4S, v7.s[3] +mla v20.4S, v22.4S, v31.s[0] +nop +nop +sqrdmulh v22.4S, v8.4S, v7.s[2] +mla v30.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v12.4S, v7.s[1] +mla v28.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v0.4S, v7.s[0] +mla v6.4S, v4.4S, v31.s[0] +nop +nop +mul v8.4S, v8.4S,v2.s[2] +mul v27.4S, v27.4S,v2.s[3] +sub v4.4s, v23.4s, v20.4s +str q4, [x0, #976] +mla v8.4S, v22.4S, v31.s[0] +mla v27.4S, v9.4S, v31.s[0] +add v23.4s, v23.4s, v20.4s +str q23, [x0, #912] +mul v0.4S, v0.4S,v2.s[0] +mul v12.4S, v12.4S,v2.s[1] +sub v23.4s, v18.4s, v30.4s +str q23, [x0, #848] +mla v0.4S, v5.4S, v31.s[0] +mla v12.4S, v3.4S, v31.s[0] +add v18.4s, v18.4s, v30.4s +sub v30.4s, 
v19.4s, v28.4s +add v19.4s, v19.4s, v28.4s +str q18, [x0, #784] +sub v18.4s, v21.4s, v6.4s +str q30, [x0, #720] +add v21.4s, v21.4s, v6.4s +str q19, [x0, #656] +sub v19.4s, v14.4s, v27.4s +str q18, [x0, #592] +add v14.4s, v14.4s, v27.4s +str q21, [x0, #528] +sub v21.4s, v17.4s, v8.4s +str q19, [x0, #464] +add v17.4s, v17.4s, v8.4s +str q14, [x0, #400] +sub v14.4s, v11.4s, v12.4s +str q21, [x0, #336] +add v11.4s, v11.4s, v12.4s +str q17, [x0, #272] +sub v17.4s, v10.4s, v0.4s +add v10.4s, v10.4s, v0.4s +ldr q24, [x0, #224] +ldr q25, [x0, #160] +ldr q13, [x0, #32] +ldr q26, [x17, #+128] +ldr q15, [x17, #+144] +sqrdmulh v29.4S, v13.4S, v15.s[0] +mul v13.4S, v13.4S,v26.s[0] +ldr q16, [x0, #48] +sqrdmulh v1.4S, v16.4S, v15.s[0] +ldr q4, [x17, #+160] +mul v16.4S, v16.4S,v26.s[0] +ldr q22, [x17, #+176] +ldr q9, [x0, #96] +sqrdmulh v20.4S, v9.4S, v22.s[0] +mul v9.4S, v9.4S,v4.s[0] +ldr q23, [x0, #112] +sqrdmulh v5.4S, v23.4S, v22.s[0] +mul v23.4S, v23.4S,v4.s[0] +ldr q3, [x17, #+192] +mla v13.4S, v29.4S, v31.s[0] +ldr q29, [x17, #+208] +sqrdmulh v2.4S, v25.4S, v29.s[0] +ldr q7, [x0, #176] +mla v16.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v7.4S, v29.s[0] +ldr q28, [x17, #+224] +mla v9.4S, v20.4S, v31.s[0] +ldr q20, [x17, #+240] +sqrdmulh v30.4S, v24.4S, v20.s[0] +ldr q6, [x0, #240] +mla v23.4S, v5.4S, v31.s[0] +sqrdmulh v5.4S, v6.4S, v20.s[0] +ldr q18, [x0, #0] +ldr q27, [x0, #128] +mul v25.4S, v25.4S,v3.s[0] +sub v19.4s, v18.4s, v13.4s +mul v7.4S, v7.4S,v3.s[0] +add v18.4s, v18.4s, v13.4s +mla v25.4S, v2.4S, v31.s[0] +sub v2.4s, v10.4s, v16.4s +ldr q13, [x0, #64] +mla v7.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v16.4s +ldr q16, [x0, #192] +mul v24.4S, v24.4S,v28.s[0] +sub v1.4s, v13.4s, v9.4s +mul v6.4S, v6.4S,v28.s[0] +add v13.4s, v13.4s, v9.4s +mla v24.4S, v30.4S, v31.s[0] +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v17.4s, v23.4s +sqrdmulh v30.4S, v10.4S, v15.s[1] +add v17.4s, v17.4s, v23.4s +mul v10.4S, v10.4S,v26.s[1] +sqrdmulh v23.4S, v2.4S, v15.s[2] +sub v9.4s, v27.4s, 
v25.4s +mul v2.4S, v2.4S,v26.s[2] +add v27.4s, v27.4s, v25.4s +sqrdmulh v15.4S, v17.4S, v22.s[1] +sub v26.4s, v11.4s, v7.4s +mul v17.4S, v17.4S,v4.s[1] +add v11.4s, v11.4s, v7.4s +sqrdmulh v7.4S, v5.4S, v22.s[2] +sub v25.4s, v16.4s, v24.4s +mul v5.4S, v5.4S,v4.s[2] +add v16.4s, v16.4s, v24.4s +mla v10.4S, v30.4S, v31.s[0] +sub v30.4s, v14.4s, v6.4s +ldr q22, [x0, #480] +sqrdmulh v4.4S, v11.4S, v29.s[1] +add v14.4s, v14.4s, v6.4s +mla v2.4S, v23.4S, v31.s[0] +ldr q23, [x0, #416] +sqrdmulh v6.4S, v26.4S, v29.s[2] +sub v24.4s, v18.4s, v10.4s +mla v17.4S, v15.4S, v31.s[0] +ldr q15, [x0, #288] +sqrdmulh v8.4S, v14.4S, v20.s[1] +add v18.4s, v18.4s, v10.4s +str q24, [x0, #16] +mla v5.4S, v7.4S, v31.s[0] +ldr q7, [x17, #+256] +sqrdmulh v24.4S, v30.4S, v20.s[2] +sub v10.4s, v19.4s, v2.4s +str q18, [x0, #0] +mul v11.4S, v11.4S,v3.s[1] +add v19.4s, v19.4s, v2.4s +ldr q2, [x17, #+272] +mul v26.4S, v26.4S,v3.s[2] +str q10, [x0, #48] +mla v11.4S, v4.4S, v31.s[0] +sub v4.4s, v13.4s, v17.4s +mla v26.4S, v6.4S, v31.s[0] +str q19, [x0, #32] +mul v14.4S, v14.4S,v28.s[1] +str q4, [x0, #80] +mul v30.4S, v30.4S,v28.s[2] +add v13.4s, v13.4s, v17.4s +str q13, [x0, #64] +mla v14.4S, v8.4S, v31.s[0] +sub v8.4s, v1.4s, v5.4s +str q8, [x0, #112] +mla v30.4S, v24.4S, v31.s[0] +add v1.4s, v1.4s, v5.4s +str q1, [x0, #96] +sqrdmulh v20.4S, v15.4S, v2.s[0] +sub v28.4s, v27.4s, v11.4s +mul v15.4S, v15.4S,v7.s[0] +str q28, [x0, #144] +ldr q28, [x0, #304] +sqrdmulh v1.4S, v28.4S, v2.s[0] +add v27.4s, v27.4s, v11.4s +ldr q11, [x17, #+288] +mul v28.4S, v28.4S,v7.s[0] +str q27, [x0, #128] +ldr q27, [x17, #+304] +ldr q5, [x0, #352] +sqrdmulh v24.4S, v5.4S, v27.s[0] +sub v8.4s, v9.4s, v26.4s +mul v5.4S, v5.4S,v11.s[0] +str q8, [x0, #176] +ldr q8, [x0, #368] +sqrdmulh v13.4S, v8.4S, v27.s[0] +add v9.4s, v9.4s, v26.4s +mul v8.4S, v8.4S,v11.s[0] +str q9, [x0, #160] +ldr q9, [x17, #+320] +mla v15.4S, v20.4S, v31.s[0] +sub v20.4s, v16.4s, v14.4s +ldr q26, [x17, #+336] +sqrdmulh v17.4S, v23.4S, v26.s[0] +str 
q20, [x0, #208] +ldr q20, [x0, #432] +mla v28.4S, v1.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v20.4S, v26.s[0] +str q16, [x0, #192] +ldr q16, [x17, #+352] +mla v5.4S, v24.4S, v31.s[0] +sub v24.4s, v25.4s, v30.4s +ldr q1, [x17, #+368] +sqrdmulh v4.4S, v22.4S, v1.s[0] +str q24, [x0, #240] +ldr q24, [x0, #496] +mla v8.4S, v13.4S, v31.s[0] +add v25.4s, v25.4s, v30.4s +sqrdmulh v30.4S, v24.4S, v1.s[0] +str q25, [x0, #224] +ldr q25, [x0, #256] +ldr q13, [x0, #384] +mul v23.4S, v23.4S,v9.s[0] +sub v29.4s, v25.4s, v15.4s +ldr q3, [x0, #272] +mul v20.4S, v20.4S,v9.s[0] +add v25.4s, v25.4s, v15.4s +ldr q15, [x0, #400] +mla v23.4S, v17.4S, v31.s[0] +sub v17.4s, v3.4s, v28.4s +ldr q19, [x0, #320] +mla v20.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v28.4s +ldr q28, [x0, #448] +mul v22.4S, v22.4S,v16.s[0] +sub v14.4s, v19.4s, v5.4s +ldr q6, [x0, #336] +mul v24.4S, v24.4S,v16.s[0] +add v19.4s, v19.4s, v5.4s +ldr q5, [x0, #464] +mla v22.4S, v4.4S, v31.s[0] +mla v24.4S, v30.4S, v31.s[0] +sub v30.4s, v6.4s, v8.4s +sqrdmulh v4.4S, v3.4S, v2.s[1] +add v6.4s, v6.4s, v8.4s +mul v3.4S, v3.4S,v7.s[1] +sqrdmulh v8.4S, v17.4S, v2.s[2] +sub v10.4s, v13.4s, v23.4s +mul v17.4S, v17.4S,v7.s[2] +add v13.4s, v13.4s, v23.4s +sqrdmulh v2.4S, v6.4S, v27.s[1] +sub v7.4s, v15.4s, v20.4s +mul v6.4S, v6.4S,v11.s[1] +add v15.4s, v15.4s, v20.4s +sqrdmulh v20.4S, v30.4S, v27.s[2] +sub v23.4s, v28.4s, v22.4s +mul v30.4S, v30.4S,v11.s[2] +add v28.4s, v28.4s, v22.4s +mla v3.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v24.4s +ldr q27, [x0, #736] +sqrdmulh v11.4S, v15.4S, v26.s[1] +add v5.4s, v5.4s, v24.4s +mla v17.4S, v8.4S, v31.s[0] +ldr q8, [x0, #672] +sqrdmulh v24.4S, v7.4S, v26.s[2] +sub v22.4s, v25.4s, v3.4s +mla v6.4S, v2.4S, v31.s[0] +ldr q2, [x0, #544] +sqrdmulh v18.4S, v5.4S, v1.s[1] +add v25.4s, v25.4s, v3.4s +str q22, [x0, #272] +mla v30.4S, v20.4S, v31.s[0] +ldr q20, [x17, #+384] +sqrdmulh v22.4S, v4.4S, v1.s[2] +sub v3.4s, v29.4s, v17.4s +str q25, [x0, #256] +mul v15.4S, v15.4S,v9.s[1] 
+add v29.4s, v29.4s, v17.4s +ldr q17, [x17, #+400] +mul v7.4S, v7.4S,v9.s[2] +str q3, [x0, #304] +mla v15.4S, v11.4S, v31.s[0] +sub v11.4s, v19.4s, v6.4s +mla v7.4S, v24.4S, v31.s[0] +str q29, [x0, #288] +mul v5.4S, v5.4S,v16.s[1] +str q11, [x0, #336] +mul v4.4S, v4.4S,v16.s[2] +add v19.4s, v19.4s, v6.4s +str q19, [x0, #320] +mla v5.4S, v18.4S, v31.s[0] +sub v18.4s, v14.4s, v30.4s +str q18, [x0, #368] +mla v4.4S, v22.4S, v31.s[0] +add v14.4s, v14.4s, v30.4s +str q14, [x0, #352] +sqrdmulh v1.4S, v2.4S, v17.s[0] +sub v16.4s, v13.4s, v15.4s +mul v2.4S, v2.4S,v20.s[0] +str q16, [x0, #400] +ldr q16, [x0, #560] +sqrdmulh v14.4S, v16.4S, v17.s[0] +add v13.4s, v13.4s, v15.4s +ldr q15, [x17, #+416] +mul v16.4S, v16.4S,v20.s[0] +str q13, [x0, #384] +ldr q13, [x17, #+432] +ldr q30, [x0, #608] +sqrdmulh v22.4S, v30.4S, v13.s[0] +sub v18.4s, v10.4s, v7.4s +mul v30.4S, v30.4S,v15.s[0] +str q18, [x0, #432] +ldr q18, [x0, #624] +sqrdmulh v19.4S, v18.4S, v13.s[0] +add v10.4s, v10.4s, v7.4s +mul v18.4S, v18.4S,v15.s[0] +str q10, [x0, #416] +ldr q10, [x17, #+448] +mla v2.4S, v1.4S, v31.s[0] +sub v1.4s, v28.4s, v5.4s +ldr q7, [x17, #+464] +sqrdmulh v6.4S, v8.4S, v7.s[0] +str q1, [x0, #464] +ldr q1, [x0, #688] +mla v16.4S, v14.4S, v31.s[0] +add v28.4s, v28.4s, v5.4s +sqrdmulh v5.4S, v1.4S, v7.s[0] +str q28, [x0, #448] +ldr q28, [x17, #+480] +mla v30.4S, v22.4S, v31.s[0] +sub v22.4s, v23.4s, v4.4s +ldr q14, [x17, #+496] +sqrdmulh v11.4S, v27.4S, v14.s[0] +str q22, [x0, #496] +ldr q22, [x0, #752] +mla v18.4S, v19.4S, v31.s[0] +add v23.4s, v23.4s, v4.4s +sqrdmulh v4.4S, v22.4S, v14.s[0] +str q23, [x0, #480] +ldr q23, [x0, #512] +ldr q19, [x0, #640] +mul v8.4S, v8.4S,v10.s[0] +sub v26.4s, v23.4s, v2.4s +ldr q9, [x0, #528] +mul v1.4S, v1.4S,v10.s[0] +add v23.4s, v23.4s, v2.4s +ldr q2, [x0, #656] +mla v8.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v16.4s +ldr q29, [x0, #576] +mla v1.4S, v5.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +ldr q16, [x0, #704] +mul v27.4S, v27.4S,v28.s[0] +sub v5.4s, v29.4s, 
v30.4s +ldr q24, [x0, #592] +mul v22.4S, v22.4S,v28.s[0] +add v29.4s, v29.4s, v30.4s +ldr q30, [x0, #720] +mla v27.4S, v11.4S, v31.s[0] +mla v22.4S, v4.4S, v31.s[0] +sub v4.4s, v24.4s, v18.4s +sqrdmulh v11.4S, v9.4S, v17.s[1] +add v24.4s, v24.4s, v18.4s +mul v9.4S, v9.4S,v20.s[1] +sqrdmulh v18.4S, v6.4S, v17.s[2] +sub v3.4s, v19.4s, v8.4s +mul v6.4S, v6.4S,v20.s[2] +add v19.4s, v19.4s, v8.4s +sqrdmulh v17.4S, v24.4S, v13.s[1] +sub v20.4s, v2.4s, v1.4s +mul v24.4S, v24.4S,v15.s[1] +add v2.4s, v2.4s, v1.4s +sqrdmulh v1.4S, v4.4S, v13.s[2] +sub v8.4s, v16.4s, v27.4s +mul v4.4S, v4.4S,v15.s[2] +add v16.4s, v16.4s, v27.4s +mla v9.4S, v11.4S, v31.s[0] +sub v11.4s, v30.4s, v22.4s +ldr q13, [x0, #992] +sqrdmulh v15.4S, v2.4S, v7.s[1] +add v30.4s, v30.4s, v22.4s +mla v6.4S, v18.4S, v31.s[0] +ldr q18, [x0, #928] +sqrdmulh v22.4S, v20.4S, v7.s[2] +sub v27.4s, v23.4s, v9.4s +mla v24.4S, v17.4S, v31.s[0] +ldr q17, [x0, #800] +sqrdmulh v25.4S, v30.4S, v14.s[1] +add v23.4s, v23.4s, v9.4s +str q27, [x0, #528] +mla v4.4S, v1.4S, v31.s[0] +ldr q1, [x17, #+512] +sqrdmulh v27.4S, v11.4S, v14.s[2] +sub v9.4s, v26.4s, v6.4s +str q23, [x0, #512] +mul v2.4S, v2.4S,v10.s[1] +add v26.4s, v26.4s, v6.4s +ldr q6, [x17, #+528] +mul v20.4S, v20.4S,v10.s[2] +str q9, [x0, #560] +mla v2.4S, v15.4S, v31.s[0] +sub v15.4s, v29.4s, v24.4s +mla v20.4S, v22.4S, v31.s[0] +str q26, [x0, #544] +mul v30.4S, v30.4S,v28.s[1] +str q15, [x0, #592] +mul v11.4S, v11.4S,v28.s[2] +add v29.4s, v29.4s, v24.4s +str q29, [x0, #576] +mla v30.4S, v25.4S, v31.s[0] +sub v25.4s, v5.4s, v4.4s +str q25, [x0, #624] +mla v11.4S, v27.4S, v31.s[0] +add v5.4s, v5.4s, v4.4s +str q5, [x0, #608] +sqrdmulh v14.4S, v17.4S, v6.s[0] +sub v28.4s, v19.4s, v2.4s +mul v17.4S, v17.4S,v1.s[0] +str q28, [x0, #656] +ldr q28, [x0, #816] +sqrdmulh v5.4S, v28.4S, v6.s[0] +add v19.4s, v19.4s, v2.4s +ldr q2, [x17, #+544] +mul v28.4S, v28.4S,v1.s[0] +str q19, [x0, #640] +ldr q19, [x17, #+560] +ldr q4, [x0, #864] +sqrdmulh v27.4S, v4.4S, v19.s[0] +sub 
v25.4s, v3.4s, v20.4s +mul v4.4S, v4.4S,v2.s[0] +str q25, [x0, #688] +ldr q25, [x0, #880] +sqrdmulh v29.4S, v25.4S, v19.s[0] +add v3.4s, v3.4s, v20.4s +mul v25.4S, v25.4S,v2.s[0] +str q3, [x0, #672] +ldr q3, [x17, #+576] +mla v17.4S, v14.4S, v31.s[0] +sub v14.4s, v16.4s, v30.4s +ldr q20, [x17, #+592] +sqrdmulh v24.4S, v18.4S, v20.s[0] +str q14, [x0, #720] +ldr q14, [x0, #944] +mla v28.4S, v5.4S, v31.s[0] +add v16.4s, v16.4s, v30.4s +sqrdmulh v30.4S, v14.4S, v20.s[0] +str q16, [x0, #704] +ldr q16, [x17, #+608] +mla v4.4S, v27.4S, v31.s[0] +sub v27.4s, v8.4s, v11.4s +ldr q5, [x17, #+624] +sqrdmulh v15.4S, v13.4S, v5.s[0] +str q27, [x0, #752] +ldr q27, [x0, #1008] +mla v25.4S, v29.4S, v31.s[0] +add v8.4s, v8.4s, v11.4s +sqrdmulh v11.4S, v27.4S, v5.s[0] +str q8, [x0, #736] +ldr q8, [x0, #768] +ldr q29, [x0, #896] +mul v18.4S, v18.4S,v3.s[0] +sub v7.4s, v8.4s, v17.4s +ldr q10, [x0, #784] +mul v14.4S, v14.4S,v3.s[0] +add v8.4s, v8.4s, v17.4s +ldr q17, [x0, #912] +mla v18.4S, v24.4S, v31.s[0] +sub v24.4s, v10.4s, v28.4s +ldr q26, [x0, #832] +mla v14.4S, v30.4S, v31.s[0] +add v10.4s, v10.4s, v28.4s +ldr q28, [x0, #960] +mul v13.4S, v13.4S,v16.s[0] +sub v30.4s, v26.4s, v4.4s +ldr q22, [x0, #848] +mul v27.4S, v27.4S,v16.s[0] +add v26.4s, v26.4s, v4.4s +ldr q4, [x0, #976] +mla v13.4S, v15.4S, v31.s[0] +mla v27.4S, v11.4S, v31.s[0] +sub v11.4s, v22.4s, v25.4s +sqrdmulh v15.4S, v10.4S, v6.s[1] +add v22.4s, v22.4s, v25.4s +mul v10.4S, v10.4S,v1.s[1] +sqrdmulh v25.4S, v24.4S, v6.s[2] +sub v9.4s, v29.4s, v18.4s +mul v24.4S, v24.4S,v1.s[2] +add v29.4s, v29.4s, v18.4s +sqrdmulh v6.4S, v22.4S, v19.s[1] +sub v1.4s, v17.4s, v14.4s +mul v22.4S, v22.4S,v2.s[1] +add v17.4s, v17.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v19.s[2] +sub v18.4s, v28.4s, v13.4s +mul v11.4S, v11.4S,v2.s[2] +add v28.4s, v28.4s, v13.4s +mla v10.4S, v15.4S, v31.s[0] +sub v15.4s, v4.4s, v27.4s +sqrdmulh v19.4S, v17.4S, v20.s[1] +add v4.4s, v4.4s, v27.4s +mla v24.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v1.4S, v20.s[2] +sub 
v27.4s, v8.4s, v10.4s +mla v22.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v4.4S, v5.s[1] +add v8.4s, v8.4s, v10.4s +str q27, [x0, #784] +mla v11.4S, v14.4S, v31.s[0] +sqrdmulh v14.4S, v15.4S, v5.s[2] +sub v27.4s, v7.4s, v24.4s +str q8, [x0, #768] +mul v17.4S, v17.4S,v3.s[1] +add v7.4s, v7.4s, v24.4s +mul v1.4S, v1.4S,v3.s[2] +str q27, [x0, #816] +mla v17.4S, v19.4S, v31.s[0] +sub v19.4s, v26.4s, v22.4s +mla v1.4S, v25.4S, v31.s[0] +str q7, [x0, #800] +mul v4.4S, v4.4S,v16.s[1] +str q19, [x0, #848] +mul v15.4S, v15.4S,v16.s[2] +add v26.4s, v26.4s, v22.4s +str q26, [x0, #832] +mla v4.4S, v6.4S, v31.s[0] +sub v6.4s, v30.4s, v11.4s +str q6, [x0, #880] +mla v15.4S, v14.4S, v31.s[0] +add v30.4s, v30.4s, v11.4s +str q30, [x0, #864] +sub v5.4s, v29.4s, v17.4s +str q5, [x0, #912] +add v29.4s, v29.4s, v17.4s +str q29, [x0, #896] +sub v29.4s, v9.4s, v1.4s +str q29, [x0, #944] +add v9.4s, v9.4s, v1.4s +str q9, [x0, #928] +sub v9.4s, v28.4s, v4.4s +str q9, [x0, #976] +add v28.4s, v28.4s, v4.4s +str q28, [x0, #960] +sub v28.4s, v18.4s, v15.4s +str q28, [x0, #1008] +add v18.4s, v18.4s, v15.4s +str q18, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1520 +// Instruction count: 1516 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_9.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_9.s new file mode 100644 index 0000000..6c246f1 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_9.s @@ -0,0 +1,1558 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free 
of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global 
ntt_u32_incomplete_neon_asm_var_4_2_22_z4_9 +.global _ntt_u32_incomplete_neon_asm_var_4_2_22_z4_9 +ntt_u32_incomplete_neon_asm_var_4_2_22_z4_9: +_ntt_u32_incomplete_neon_asm_var_4_2_22_z4_9: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x0, #992] +sqrdmulh v27.4S, v28.4S, v29.s[0] +mul v28.4S, v28.4S,v30.s[0] +ldr q26, [x0, #928] +sqrdmulh v25.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +ldr q24, [x0, #864] +sqrdmulh v23.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +ldr q22, [x0, #800] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #736] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mla v28.4S, v27.4S, v31.s[0] +ldr q27, [x0, #672] +sqrdmulh v18.4S, v27.4S, v29.s[0] +mla v26.4S, v25.4S, v31.s[0] +ldr q25, [x0, #608] +sqrdmulh v17.4S, v25.4S, v29.s[0] +mla v24.4S, v23.4S, v31.s[0] +ldr q23, [x0, #544] +sqrdmulh v16.4S, v23.4S, v29.s[0] +mla v22.4S, v21.4S, v31.s[0] +ldr q21, [x0, #480] +mul v27.4S, v27.4S,v30.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q3, [x0, #416] +ldr q2, [x0, #352] +ldr q1, [x0, #288] +mla v27.4S, v18.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +ldr q19, [x0, #224] +ldr q18, [x0, #160] +mul v23.4S, v23.4S,v30.s[0] +mul v25.4S, v25.4S,v30.s[0] +ldr q0, [x0, #96] +ldr q15, [x0, #32] +mla v23.4S, v16.4S, v31.s[0] +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v28.4s +add v21.4s, v21.4s, v28.4s +sqrdmulh v28.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v16.4s, v3.4s, v26.4s +add v3.4s, v3.4s, v26.4s +sqrdmulh v26.4S, v16.4S, v29.s[2] +mul v16.4S, 
v16.4S,v30.s[2] +sub v14.4s, v2.4s, v24.4s +add v2.4s, v2.4s, v24.4s +sqrdmulh v24.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v13.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v12.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +sqrdmulh v20.4S, v14.4S, v29.s[2] +mla v17.4S, v28.4S, v31.s[0] +sub v28.4s, v18.4s, v27.4s +add v18.4s, v18.4s, v27.4s +sqrdmulh v27.4S, v13.4S, v29.s[2] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v0.4s, v25.4s +add v0.4s, v0.4s, v25.4s +sqrdmulh v25.4S, v2.4S, v29.s[1] +mla v21.4S, v24.4S, v31.s[0] +sub v24.4s, v15.4s, v23.4s +sqrdmulh v11.4S, v1.4S, v29.s[1] +mla v3.4S, v22.4S, v31.s[0] +add v15.4s, v15.4s, v23.4s +ldr q23, [x17, #+32] +ldr q22, [x17, #+48] +mul v13.4S, v13.4S,v30.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v10.4s, v12.4s, v17.4s +add v12.4s, v12.4s, v17.4s +mla v13.4S, v27.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +sub v20.4s, v28.4s, v16.4s +add v28.4s, v28.4s, v16.4s +mul v1.4S, v1.4S,v30.s[1] +mul v2.4S, v2.4S,v30.s[1] +sub v16.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +mla v1.4S, v11.4S, v31.s[0] +mla v2.4S, v25.4S, v31.s[0] +sub v25.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v10.4S, v22.s[3] +mul v10.4S, v10.4S,v23.s[3] +sub v11.4s, v26.4s, v14.4s +add v26.4s, v26.4s, v14.4s +sqrdmulh v14.4S, v12.4S, v22.s[2] +mul v12.4S, v12.4S,v23.s[2] +sub v21.4s, v24.4s, v13.4s +add v24.4s, v24.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v22.s[1] +mul v16.4S, v16.4S,v23.s[1] +sub v27.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v17.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s +ldr q1, [x17, #+96] +ldr q9, [x17, #+112] +sqrdmulh v8.4S, v20.4S, v22.s[3] +mla v10.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v28.4S, v22.s[2] +mla v12.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v25.4S, v22.s[1] +mla v16.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v18.4S, v22.s[0] +mla 
v19.4S, v2.4S, v31.s[0] +nop +nop +ldr q2, [x17, #+64] +ldr q7, [x17, #+80] +mul v28.4S, v28.4S,v23.s[2] +mul v20.4S, v20.4S,v23.s[3] +sub v6.4s, v11.4s, v10.4s +add v11.4s, v11.4s, v10.4s +mla v28.4S, v3.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v26.4s, v12.4s +add v26.4s, v26.4s, v12.4s +mul v18.4S, v18.4S,v23.s[0] +mul v25.4S, v25.4S,v23.s[1] +sub v12.4s, v27.4s, v16.4s +add v27.4s, v27.4s, v16.4s +mla v18.4S, v13.4S, v31.s[0] +mla v25.4S, v14.4S, v31.s[0] +sub v14.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v9.s[3] +mul v6.4S, v6.4S,v1.s[3] +sub v13.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v11.4S, v9.s[2] +mul v11.4S, v11.4S,v1.s[2] +sub v16.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +sqrdmulh v28.4S, v8.4S, v9.s[1] +mul v8.4S, v8.4S,v1.s[1] +sub v3.4s, v17.4s, v25.4s +add v17.4s, v17.4s, v25.4s +sqrdmulh v25.4S, v26.4S, v9.s[0] +mul v26.4S, v26.4S,v1.s[0] +sub v10.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v7.s[3] +mla v6.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v27.4S, v7.s[2] +mla v11.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v14.4S, v7.s[1] +mla v8.4S, v28.4S, v31.s[0] +nop +nop +sqrdmulh v28.4S, v0.4S, v7.s[0] +mla v26.4S, v25.4S, v31.s[0] +nop +nop +mul v27.4S, v27.4S,v2.s[2] +mul v12.4S, v12.4S,v2.s[3] +sub v25.4s, v13.4s, v6.4s +str q25, [x0, #992] +mla v27.4S, v19.4S, v31.s[0] +mla v12.4S, v18.4S, v31.s[0] +add v13.4s, v13.4s, v6.4s +str q13, [x0, #928] +mul v0.4S, v0.4S,v2.s[0] +mul v14.4S, v14.4S,v2.s[1] +sub v13.4s, v21.4s, v11.4s +str q13, [x0, #864] +mla v0.4S, v28.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sub v11.4s, v16.4s, v8.4s +ldr q20, [x0, #1008] +sqrdmulh v28.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v8.4s +str q21, [x0, #800] +ldr q21, [x0, #944] +sqrdmulh v8.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v13.4s, v24.4s, v26.4s +str q11, [x0, #736] +ldr q11, [x0, 
#880] +sqrdmulh v6.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +add v24.4s, v24.4s, v26.4s +str q16, [x0, #672] +ldr q16, [x0, #816] +sqrdmulh v26.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v18.4s, v3.4s, v12.4s +str q13, [x0, #608] +ldr q13, [x0, #752] +sqrdmulh v19.4S, v13.4S, v29.s[0] +mla v20.4S, v28.4S, v31.s[0] +add v3.4s, v3.4s, v12.4s +str q24, [x0, #544] +ldr q24, [x0, #688] +sqrdmulh v12.4S, v24.4S, v29.s[0] +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v17.4s, v27.4s +str q18, [x0, #480] +ldr q18, [x0, #624] +sqrdmulh v28.4S, v18.4S, v29.s[0] +mla v11.4S, v6.4S, v31.s[0] +add v17.4s, v17.4s, v27.4s +str q3, [x0, #416] +ldr q3, [x0, #560] +sqrdmulh v27.4S, v3.4S, v29.s[0] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v10.4s, v14.4s +str q8, [x0, #352] +ldr q8, [x0, #496] +add v10.4s, v10.4s, v14.4s +mul v24.4S, v24.4S,v30.s[0] +mul v13.4S, v13.4S,v30.s[0] +ldr q14, [x0, #432] +str q17, [x0, #288] +ldr q17, [x0, #368] +ldr q6, [x0, #304] +mla v24.4S, v12.4S, v31.s[0] +mla v13.4S, v19.4S, v31.s[0] +str q26, [x0, #224] +sub v26.4s, v15.4s, v0.4s +ldr q19, [x0, #240] +ldr q12, [x0, #176] +mul v3.4S, v3.4S,v30.s[0] +mul v18.4S, v18.4S,v30.s[0] +str q10, [x0, #160] +add v15.4s, v15.4s, v0.4s +ldr q0, [x0, #112] +ldr q10, [x0, #48] +mla v3.4S, v27.4S, v31.s[0] +mla v18.4S, v28.4S, v31.s[0] +sub v28.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +sqrdmulh v20.4S, v28.4S, v29.s[2] +mul v28.4S, v28.4S,v30.s[2] +sub v27.4s, v14.4s, v21.4s +add v14.4s, v14.4s, v21.4s +sqrdmulh v21.4S, v27.4S, v29.s[2] +mul v27.4S, v27.4S,v30.s[2] +sub v25.4s, v17.4s, v11.4s +add v17.4s, v17.4s, v11.4s +sqrdmulh v11.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v5.4s, v6.4s, v16.4s +add v6.4s, v6.4s, v16.4s +sqrdmulh v16.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +sub v4.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +sqrdmulh v13.4S, v25.4S, v29.s[2] +mla v28.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v24.4s +add v12.4s, v12.4s, v24.4s +sqrdmulh v24.4S, v5.4S, 
v29.s[2] +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v18.4s +add v0.4s, v0.4s, v18.4s +sqrdmulh v18.4S, v17.4S, v29.s[1] +mla v8.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v3.4s +str q26, [x0, #96] +sqrdmulh v26.4S, v6.4S, v29.s[1] +mla v14.4S, v16.4S, v31.s[0] +add v10.4s, v10.4s, v3.4s +str q15, [x0, #32] +mul v5.4S, v5.4S,v30.s[2] +mul v25.4S, v25.4S,v30.s[2] +sub v15.4s, v4.4s, v28.4s +add v4.4s, v4.4s, v28.4s +mla v5.4S, v24.4S, v31.s[0] +mla v25.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v27.4s +add v20.4s, v20.4s, v27.4s +mul v6.4S, v6.4S,v30.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v27.4s, v19.4s, v8.4s +add v19.4s, v19.4s, v8.4s +mla v6.4S, v26.4S, v31.s[0] +mla v17.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v14.4s +add v12.4s, v12.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v22.s[3] +mul v15.4S, v15.4S,v23.s[3] +sub v26.4s, v21.4s, v25.4s +add v21.4s, v21.4s, v25.4s +sqrdmulh v25.4S, v4.4S, v22.s[2] +mul v4.4S, v4.4S,v23.s[2] +sub v8.4s, v11.4s, v5.4s +add v11.4s, v11.4s, v5.4s +sqrdmulh v5.4S, v27.4S, v22.s[1] +mul v27.4S, v27.4S,v23.s[1] +sub v24.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v28.4s, v10.4s, v6.4s +add v10.4s, v10.4s, v6.4s +sqrdmulh v6.4S, v13.4S, v22.s[3] +mla v15.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v20.4S, v22.s[2] +mla v4.4S, v25.4S, v31.s[0] +nop +nop +sqrdmulh v25.4S, v18.4S, v22.s[1] +mla v27.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v12.4S, v22.s[0] +mla v19.4S, v17.4S, v31.s[0] +nop +nop +mul v20.4S, v20.4S,v23.s[2] +mul v13.4S, v13.4S,v23.s[3] +sub v17.4s, v26.4s, v15.4s +add v26.4s, v26.4s, v15.4s +mla v20.4S, v14.4S, v31.s[0] +mla v13.4S, v6.4S, v31.s[0] +sub v6.4s, v21.4s, v4.4s +add v21.4s, v21.4s, v4.4s +mul v12.4S, v12.4S,v23.s[0] +mul v18.4S, v18.4S,v23.s[1] +sub v4.4s, v24.4s, v27.4s +add v24.4s, v24.4s, v27.4s +mla v12.4S, v5.4S, v31.s[0] +mla v18.4S, v25.4S, v31.s[0] +sub v25.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, 
v17.4S, v9.s[3] +mul v17.4S, v17.4S,v1.s[3] +sub v5.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +sqrdmulh v13.4S, v26.4S, v9.s[2] +mul v26.4S, v26.4S,v1.s[2] +sub v27.4s, v11.4s, v20.4s +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v6.4S, v9.s[1] +mul v6.4S, v6.4S,v1.s[1] +sub v14.4s, v28.4s, v18.4s +add v28.4s, v28.4s, v18.4s +sqrdmulh v18.4S, v21.4S, v9.s[0] +mul v21.4S, v21.4S,v1.s[0] +sub v15.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v4.4S, v7.s[3] +mla v17.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v24.4S, v7.s[2] +mla v26.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v25.4S, v7.s[1] +mla v6.4S, v20.4S, v31.s[0] +nop +nop +sqrdmulh v20.4S, v0.4S, v7.s[0] +mla v21.4S, v18.4S, v31.s[0] +nop +nop +mul v24.4S, v24.4S,v2.s[2] +mul v4.4S, v4.4S,v2.s[3] +sub v18.4s, v5.4s, v17.4s +str q18, [x0, #1008] +mla v24.4S, v19.4S, v31.s[0] +mla v4.4S, v12.4S, v31.s[0] +add v5.4s, v5.4s, v17.4s +str q5, [x0, #944] +mul v0.4S, v0.4S,v2.s[0] +mul v25.4S, v25.4S,v2.s[1] +sub v5.4s, v8.4s, v26.4s +str q5, [x0, #880] +mla v0.4S, v20.4S, v31.s[0] +mla v25.4S, v13.4S, v31.s[0] +add v8.4s, v8.4s, v26.4s +sub v26.4s, v27.4s, v6.4s +ldr q13, [x0, #960] +sqrdmulh v20.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +add v27.4s, v27.4s, v6.4s +str q8, [x0, #816] +ldr q8, [x0, #896] +sqrdmulh v6.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v5.4s, v11.4s, v21.4s +str q26, [x0, #752] +ldr q26, [x0, #832] +sqrdmulh v17.4S, v26.4S, v29.s[0] +mul v26.4S, v26.4S,v30.s[0] +add v11.4s, v11.4s, v21.4s +str q27, [x0, #688] +ldr q27, [x0, #768] +sqrdmulh v21.4S, v27.4S, v29.s[0] +mul v27.4S, v27.4S,v30.s[0] +sub v12.4s, v14.4s, v4.4s +str q5, [x0, #624] +ldr q5, [x0, #704] +sqrdmulh v19.4S, v5.4S, v29.s[0] +mla v13.4S, v20.4S, v31.s[0] +add v14.4s, v14.4s, v4.4s +str q11, [x0, #560] +ldr q11, [x0, #640] +sqrdmulh v4.4S, v11.4S, v29.s[0] +mla v8.4S, v6.4S, v31.s[0] +sub v6.4s, v28.4s, v24.4s +str q12, [x0, #496] +ldr q12, [x0, #576] +sqrdmulh v20.4S, v12.4S, 
v29.s[0] +mla v26.4S, v17.4S, v31.s[0] +add v28.4s, v28.4s, v24.4s +str q14, [x0, #432] +ldr q14, [x0, #512] +sqrdmulh v24.4S, v14.4S, v29.s[0] +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v25.4s +str q6, [x0, #368] +ldr q6, [x0, #448] +add v15.4s, v15.4s, v25.4s +mul v11.4S, v11.4S,v30.s[0] +mul v5.4S, v5.4S,v30.s[0] +ldr q25, [x0, #384] +str q28, [x0, #304] +ldr q28, [x0, #320] +ldr q17, [x0, #256] +mla v11.4S, v4.4S, v31.s[0] +mla v5.4S, v19.4S, v31.s[0] +str q21, [x0, #240] +sub v21.4s, v10.4s, v0.4s +ldr q19, [x0, #192] +ldr q4, [x0, #128] +mul v14.4S, v14.4S,v30.s[0] +mul v12.4S, v12.4S,v30.s[0] +str q15, [x0, #176] +add v10.4s, v10.4s, v0.4s +ldr q0, [x0, #64] +ldr q15, [x0, #0] +mla v14.4S, v24.4S, v31.s[0] +mla v12.4S, v20.4S, v31.s[0] +sub v20.4s, v6.4s, v13.4s +add v6.4s, v6.4s, v13.4s +sqrdmulh v13.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v24.4s, v25.4s, v8.4s +add v25.4s, v25.4s, v8.4s +sqrdmulh v8.4S, v24.4S, v29.s[2] +mul v24.4S, v24.4S,v30.s[2] +sub v18.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sqrdmulh v26.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +sub v3.4s, v17.4s, v27.4s +add v17.4s, v17.4s, v27.4s +sqrdmulh v27.4S, v25.4S, v29.s[1] +mul v25.4S, v25.4S,v30.s[1] +sub v16.4s, v19.4s, v5.4s +add v19.4s, v19.4s, v5.4s +sqrdmulh v5.4S, v18.4S, v29.s[2] +mla v20.4S, v13.4S, v31.s[0] +sub v13.4s, v4.4s, v11.4s +add v4.4s, v4.4s, v11.4s +sqrdmulh v11.4S, v3.4S, v29.s[2] +mla v24.4S, v8.4S, v31.s[0] +sub v8.4s, v0.4s, v12.4s +add v0.4s, v0.4s, v12.4s +sqrdmulh v12.4S, v28.4S, v29.s[1] +mla v6.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v14.4s +str q21, [x0, #112] +sqrdmulh v21.4S, v17.4S, v29.s[1] +mla v25.4S, v27.4S, v31.s[0] +add v15.4s, v15.4s, v14.4s +str q10, [x0, #48] +mul v3.4S, v3.4S,v30.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v10.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +mla v3.4S, v11.4S, v31.s[0] +mla v18.4S, v5.4S, v31.s[0] +sub v5.4s, v13.4s, v24.4s +add v13.4s, v13.4s, v24.4s +mul v17.4S, v17.4S,v30.s[1] 
+mul v28.4S, v28.4S,v30.s[1] +sub v24.4s, v19.4s, v6.4s +add v19.4s, v19.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v28.4S, v12.4S, v31.s[0] +sub v12.4s, v4.4s, v25.4s +add v4.4s, v4.4s, v25.4s +sqrdmulh v25.4S, v10.4S, v22.s[3] +mul v10.4S, v10.4S,v23.s[3] +sub v21.4s, v8.4s, v18.4s +add v8.4s, v8.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v22.s[2] +mul v16.4S, v16.4S,v23.s[2] +sub v6.4s, v26.4s, v3.4s +add v26.4s, v26.4s, v3.4s +sqrdmulh v3.4S, v24.4S, v22.s[1] +mul v24.4S, v24.4S,v23.s[1] +sub v11.4s, v0.4s, v28.4s +add v0.4s, v0.4s, v28.4s +sqrdmulh v28.4S, v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v20.4s, v15.4s, v17.4s +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v22.s[3] +mla v10.4S, v25.4S, v31.s[0] +nop +nop +sqrdmulh v25.4S, v13.4S, v22.s[2] +mla v16.4S, v18.4S, v31.s[0] +nop +nop +sqrdmulh v18.4S, v12.4S, v22.s[1] +mla v24.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v4.4S, v22.s[0] +mla v19.4S, v28.4S, v31.s[0] +nop +nop +mul v13.4S, v13.4S,v23.s[2] +mul v5.4S, v5.4S,v23.s[3] +sub v28.4s, v21.4s, v10.4s +add v21.4s, v21.4s, v10.4s +mla v13.4S, v25.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +sub v17.4s, v8.4s, v16.4s +add v8.4s, v8.4s, v16.4s +mul v4.4S, v4.4S,v23.s[0] +mul v12.4S, v12.4S,v23.s[1] +sub v16.4s, v11.4s, v24.4s +add v11.4s, v11.4s, v24.4s +mla v4.4S, v3.4S, v31.s[0] +mla v12.4S, v18.4S, v31.s[0] +sub v18.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v28.4S, v9.s[3] +mul v28.4S, v28.4S,v1.s[3] +sub v3.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v21.4S, v9.s[2] +mul v21.4S, v21.4S,v1.s[2] +sub v24.4s, v26.4s, v13.4s +add v26.4s, v26.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v9.s[1] +mul v17.4S, v17.4S,v1.s[1] +sub v25.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +sqrdmulh v12.4S, v8.4S, v9.s[0] +mul v8.4S, v8.4S,v1.s[0] +sub v10.4s, v15.4s, v4.4s +add v15.4s, v15.4s, v4.4s +sqrdmulh v4.4S, v16.4S, v7.s[3] +mla v28.4S, v19.4S, v31.s[0] +nop +nop +sqrdmulh v19.4S, v11.4S, v7.s[2] +mla v21.4S, v5.4S, 
v31.s[0] +nop +nop +sqrdmulh v5.4S, v18.4S, v7.s[1] +mla v17.4S, v13.4S, v31.s[0] +nop +nop +sqrdmulh v13.4S, v0.4S, v7.s[0] +mla v8.4S, v12.4S, v31.s[0] +nop +nop +mul v11.4S, v11.4S,v2.s[2] +mul v16.4S, v16.4S,v2.s[3] +sub v12.4s, v3.4s, v28.4s +str q12, [x0, #960] +mla v11.4S, v19.4S, v31.s[0] +mla v16.4S, v4.4S, v31.s[0] +add v3.4s, v3.4s, v28.4s +str q3, [x0, #896] +mul v0.4S, v0.4S,v2.s[0] +mul v18.4S, v18.4S,v2.s[1] +sub v3.4s, v6.4s, v21.4s +str q3, [x0, #832] +mla v0.4S, v13.4S, v31.s[0] +mla v18.4S, v5.4S, v31.s[0] +add v6.4s, v6.4s, v21.4s +sub v21.4s, v24.4s, v17.4s +ldr q5, [x0, #976] +sqrdmulh v13.4S, v5.4S, v29.s[0] +mul v5.4S, v5.4S,v30.s[0] +add v24.4s, v24.4s, v17.4s +str q6, [x0, #768] +ldr q6, [x0, #912] +sqrdmulh v17.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v3.4s, v26.4s, v8.4s +str q21, [x0, #704] +ldr q21, [x0, #848] +sqrdmulh v28.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +add v26.4s, v26.4s, v8.4s +str q24, [x0, #640] +ldr q24, [x0, #784] +sqrdmulh v8.4S, v24.4S, v29.s[0] +mul v24.4S, v24.4S,v30.s[0] +sub v4.4s, v25.4s, v16.4s +str q3, [x0, #576] +ldr q3, [x0, #720] +sqrdmulh v19.4S, v3.4S, v29.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v25.4s, v25.4s, v16.4s +str q26, [x0, #512] +ldr q26, [x0, #656] +sqrdmulh v16.4S, v26.4S, v29.s[0] +mla v6.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v11.4s +str q4, [x0, #448] +ldr q4, [x0, #592] +sqrdmulh v13.4S, v4.4S, v29.s[0] +mla v21.4S, v28.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q25, [x0, #384] +ldr q25, [x0, #528] +sqrdmulh v11.4S, v25.4S, v29.s[0] +mla v24.4S, v8.4S, v31.s[0] +sub v8.4s, v10.4s, v18.4s +str q17, [x0, #320] +ldr q17, [x0, #464] +add v10.4s, v10.4s, v18.4s +mul v26.4S, v26.4S,v30.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q18, [x0, #400] +str q20, [x0, #256] +ldr q20, [x0, #336] +ldr q28, [x0, #272] +mla v26.4S, v16.4S, v31.s[0] +mla v3.4S, v19.4S, v31.s[0] +str q8, [x0, #192] +sub v8.4s, v15.4s, v0.4s +ldr q19, [x0, #208] +ldr q16, [x0, #144] +mul v25.4S, 
v25.4S,v30.s[0] +mul v4.4S, v4.4S,v30.s[0] +str q10, [x0, #128] +add v15.4s, v15.4s, v0.4s +ldr q0, [x0, #80] +ldr q10, [x0, #16] +mla v25.4S, v11.4S, v31.s[0] +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v5.4s +add v17.4s, v17.4s, v5.4s +sqrdmulh v5.4S, v13.4S, v29.s[2] +mul v13.4S, v13.4S,v30.s[2] +sub v11.4s, v18.4s, v6.4s +add v18.4s, v18.4s, v6.4s +sqrdmulh v6.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v12.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v28.4s, v24.4s +add v28.4s, v28.4s, v24.4s +sqrdmulh v24.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v27.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sqrdmulh v3.4S, v12.4S, v29.s[2] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v16.4s, v26.4s +add v16.4s, v16.4s, v26.4s +sqrdmulh v26.4S, v14.4S, v29.s[2] +mla v11.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v4.4s +add v0.4s, v0.4s, v4.4s +sqrdmulh v4.4S, v20.4S, v29.s[1] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v25.4s +str q8, [x0, #64] +sqrdmulh v8.4S, v28.4S, v29.s[1] +mla v18.4S, v24.4S, v31.s[0] +add v10.4s, v10.4s, v25.4s +str q15, [x0, #0] +mul v14.4S, v14.4S,v30.s[2] +mul v12.4S, v12.4S,v30.s[2] +sub v15.4s, v27.4s, v13.4s +add v27.4s, v27.4s, v13.4s +mla v14.4S, v26.4S, v31.s[0] +mla v12.4S, v3.4S, v31.s[0] +sub v3.4s, v5.4s, v11.4s +add v5.4s, v5.4s, v11.4s +mul v28.4S, v28.4S,v30.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v11.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +mla v28.4S, v8.4S, v31.s[0] +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v16.4s, v18.4s +add v16.4s, v16.4s, v18.4s +sqrdmulh v29.4S, v15.4S, v22.s[3] +mul v15.4S, v15.4S,v23.s[3] +sub v30.4s, v6.4s, v12.4s +add v6.4s, v6.4s, v12.4s +sqrdmulh v12.4S, v27.4S, v22.s[2] +mul v27.4S, v27.4S,v23.s[2] +sub v18.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v11.4S, v22.s[1] +mul v11.4S, v11.4S,v23.s[1] +sub v8.4s, v0.4s, v20.4s +add v0.4s, v0.4s, v20.4s +sqrdmulh v20.4S, 
v19.4S, v22.s[0] +mul v19.4S, v19.4S,v23.s[0] +sub v17.4s, v10.4s, v28.4s +add v10.4s, v10.4s, v28.4s +sqrdmulh v28.4S, v3.4S, v22.s[3] +mla v15.4S, v29.4S, v31.s[0] +nop +nop +sqrdmulh v29.4S, v5.4S, v22.s[2] +mla v27.4S, v12.4S, v31.s[0] +nop +nop +sqrdmulh v12.4S, v4.4S, v22.s[1] +mla v11.4S, v14.4S, v31.s[0] +nop +nop +sqrdmulh v14.4S, v16.4S, v22.s[0] +mla v19.4S, v20.4S, v31.s[0] +nop +nop +mul v5.4S, v5.4S,v23.s[2] +mul v3.4S, v3.4S,v23.s[3] +sub v20.4s, v30.4s, v15.4s +add v30.4s, v30.4s, v15.4s +mla v5.4S, v29.4S, v31.s[0] +mla v3.4S, v28.4S, v31.s[0] +sub v28.4s, v6.4s, v27.4s +add v6.4s, v6.4s, v27.4s +mul v16.4S, v16.4S,v23.s[0] +mul v4.4S, v4.4S,v23.s[1] +sub v27.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v16.4S, v14.4S, v31.s[0] +mla v4.4S, v12.4S, v31.s[0] +sub v12.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v22.4S, v20.4S, v9.s[3] +mul v20.4S, v20.4S,v1.s[3] +sub v23.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v30.4S, v9.s[2] +mul v30.4S, v30.4S,v1.s[2] +sub v19.4s, v21.4s, v5.4s +add v21.4s, v21.4s, v5.4s +sqrdmulh v5.4S, v28.4S, v9.s[1] +mul v28.4S, v28.4S,v1.s[1] +sub v14.4s, v17.4s, v4.4s +add v17.4s, v17.4s, v4.4s +sqrdmulh v4.4S, v6.4S, v9.s[0] +mul v6.4S, v6.4S,v1.s[0] +sub v11.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sqrdmulh v9.4S, v27.4S, v7.s[3] +mla v20.4S, v22.4S, v31.s[0] +nop +nop +sqrdmulh v22.4S, v8.4S, v7.s[2] +mla v30.4S, v3.4S, v31.s[0] +nop +nop +sqrdmulh v3.4S, v12.4S, v7.s[1] +mla v28.4S, v5.4S, v31.s[0] +nop +nop +sqrdmulh v5.4S, v0.4S, v7.s[0] +mla v6.4S, v4.4S, v31.s[0] +nop +nop +mul v8.4S, v8.4S,v2.s[2] +mul v27.4S, v27.4S,v2.s[3] +sub v4.4s, v23.4s, v20.4s +str q4, [x0, #976] +mla v8.4S, v22.4S, v31.s[0] +mla v27.4S, v9.4S, v31.s[0] +add v23.4s, v23.4s, v20.4s +str q23, [x0, #912] +mul v0.4S, v0.4S,v2.s[0] +mul v12.4S, v12.4S,v2.s[1] +sub v23.4s, v18.4s, v30.4s +str q23, [x0, #848] +mla v0.4S, v5.4S, v31.s[0] +mla v12.4S, v3.4S, v31.s[0] +add v18.4s, v18.4s, v30.4s +sub v30.4s, 
v19.4s, v28.4s +add v19.4s, v19.4s, v28.4s +str q18, [x0, #784] +sub v18.4s, v21.4s, v6.4s +str q30, [x0, #720] +add v21.4s, v21.4s, v6.4s +str q19, [x0, #656] +sub v19.4s, v14.4s, v27.4s +str q18, [x0, #592] +add v14.4s, v14.4s, v27.4s +str q21, [x0, #528] +sub v21.4s, v17.4s, v8.4s +str q19, [x0, #464] +add v17.4s, v17.4s, v8.4s +str q14, [x0, #400] +sub v14.4s, v11.4s, v12.4s +str q21, [x0, #336] +add v11.4s, v11.4s, v12.4s +str q17, [x0, #272] +sub v17.4s, v10.4s, v0.4s +add v10.4s, v10.4s, v0.4s +ldr q24, [x0, #224] +ldr q25, [x0, #160] +ldr q13, [x0, #32] +ldr q26, [x17, #+128] +ldr q15, [x17, #+144] +sqrdmulh v29.4S, v13.4S, v15.s[0] +mul v13.4S, v13.4S,v26.s[0] +ldr q16, [x0, #48] +sqrdmulh v1.4S, v16.4S, v15.s[0] +mul v16.4S, v16.4S,v26.s[0] +ldr q4, [x17, #+160] +ldr q22, [x17, #+176] +ldr q9, [x0, #96] +sqrdmulh v20.4S, v9.4S, v22.s[0] +mul v9.4S, v9.4S,v4.s[0] +ldr q23, [x0, #112] +sqrdmulh v5.4S, v23.4S, v22.s[0] +mul v23.4S, v23.4S,v4.s[0] +ldr q3, [x17, #+192] +ldr q2, [x17, #+208] +mla v13.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v25.4S, v2.s[0] +ldr q7, [x0, #176] +mla v16.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v7.4S, v2.s[0] +ldr q28, [x17, #+224] +ldr q30, [x17, #+240] +mla v9.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v24.4S, v30.s[0] +ldr q6, [x0, #240] +mla v23.4S, v5.4S, v31.s[0] +sqrdmulh v5.4S, v6.4S, v30.s[0] +ldr q18, [x0, #0] +ldr q27, [x0, #128] +mul v25.4S, v25.4S,v3.s[0] +sub v19.4s, v18.4s, v13.4s +mul v7.4S, v7.4S,v3.s[0] +add v18.4s, v18.4s, v13.4s +mla v25.4S, v29.4S, v31.s[0] +sub v29.4s, v10.4s, v16.4s +ldr q13, [x0, #64] +mla v7.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v16.4s +ldr q16, [x0, #192] +mul v24.4S, v24.4S,v28.s[0] +sub v1.4s, v13.4s, v9.4s +mul v6.4S, v6.4S,v28.s[0] +add v13.4s, v13.4s, v9.4s +mla v24.4S, v20.4S, v31.s[0] +nop +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v17.4s, v23.4s +sqrdmulh v20.4S, v10.4S, v15.s[1] +add v17.4s, v17.4s, v23.4s +mul v10.4S, v10.4S,v26.s[1] +nop +sqrdmulh v23.4S, v29.4S, v15.s[2] +sub v9.4s, 
v27.4s, v25.4s +mul v29.4S, v29.4S,v26.s[2] +add v27.4s, v27.4s, v25.4s +sqrdmulh v15.4S, v17.4S, v22.s[1] +sub v26.4s, v11.4s, v7.4s +mul v17.4S, v17.4S,v4.s[1] +add v11.4s, v11.4s, v7.4s +sqrdmulh v7.4S, v5.4S, v22.s[2] +sub v25.4s, v16.4s, v24.4s +mul v5.4S, v5.4S,v4.s[2] +add v16.4s, v16.4s, v24.4s +mla v10.4S, v20.4S, v31.s[0] +sub v20.4s, v14.4s, v6.4s +ldr q22, [x0, #480] +sqrdmulh v4.4S, v11.4S, v2.s[1] +add v14.4s, v14.4s, v6.4s +mla v29.4S, v23.4S, v31.s[0] +ldr q23, [x0, #416] +sqrdmulh v6.4S, v26.4S, v2.s[2] +sub v24.4s, v18.4s, v10.4s +mla v17.4S, v15.4S, v31.s[0] +ldr q15, [x0, #288] +sqrdmulh v8.4S, v14.4S, v30.s[1] +add v18.4s, v18.4s, v10.4s +str q24, [x0, #16] +mla v5.4S, v7.4S, v31.s[0] +ldr q7, [x17, #+256] +ldr q24, [x17, #+272] +sqrdmulh v10.4S, v20.4S, v30.s[2] +sub v21.4s, v19.4s, v29.4s +str q18, [x0, #0] +mul v11.4S, v11.4S,v3.s[1] +add v19.4s, v19.4s, v29.4s +mul v26.4S, v26.4S,v3.s[2] +str q21, [x0, #48] +mla v11.4S, v4.4S, v31.s[0] +sub v4.4s, v13.4s, v17.4s +mla v26.4S, v6.4S, v31.s[0] +str q19, [x0, #32] +mul v14.4S, v14.4S,v28.s[1] +str q4, [x0, #80] +mul v20.4S, v20.4S,v28.s[2] +add v13.4s, v13.4s, v17.4s +str q13, [x0, #64] +mla v14.4S, v8.4S, v31.s[0] +sub v8.4s, v1.4s, v5.4s +str q8, [x0, #112] +mla v20.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v5.4s +str q1, [x0, #96] +sqrdmulh v30.4S, v15.4S, v24.s[0] +sub v28.4s, v27.4s, v11.4s +mul v15.4S, v15.4S,v7.s[0] +str q28, [x0, #144] +ldr q28, [x0, #304] +sqrdmulh v1.4S, v28.4S, v24.s[0] +add v27.4s, v27.4s, v11.4s +mul v28.4S, v28.4S,v7.s[0] +str q27, [x0, #128] +ldr q27, [x17, #+288] +ldr q11, [x17, #+304] +ldr q5, [x0, #352] +sqrdmulh v10.4S, v5.4S, v11.s[0] +sub v8.4s, v9.4s, v26.4s +mul v5.4S, v5.4S,v27.s[0] +str q8, [x0, #176] +ldr q8, [x0, #368] +sqrdmulh v13.4S, v8.4S, v11.s[0] +add v9.4s, v9.4s, v26.4s +mul v8.4S, v8.4S,v27.s[0] +str q9, [x0, #160] +ldr q9, [x17, #+320] +ldr q26, [x17, #+336] +mla v15.4S, v30.4S, v31.s[0] +sub v30.4s, v16.4s, v14.4s +sqrdmulh v17.4S, v23.4S, 
v26.s[0] +str q30, [x0, #208] +ldr q30, [x0, #432] +mla v28.4S, v1.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v30.4S, v26.s[0] +str q16, [x0, #192] +ldr q16, [x17, #+352] +ldr q1, [x17, #+368] +mla v5.4S, v10.4S, v31.s[0] +sub v10.4s, v25.4s, v20.4s +sqrdmulh v4.4S, v22.4S, v1.s[0] +str q10, [x0, #240] +ldr q10, [x0, #496] +mla v8.4S, v13.4S, v31.s[0] +add v25.4s, v25.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v1.s[0] +str q25, [x0, #224] +ldr q25, [x0, #256] +ldr q13, [x0, #384] +mul v23.4S, v23.4S,v9.s[0] +sub v2.4s, v25.4s, v15.4s +ldr q3, [x0, #272] +mul v30.4S, v30.4S,v9.s[0] +add v25.4s, v25.4s, v15.4s +ldr q15, [x0, #400] +mla v23.4S, v17.4S, v31.s[0] +sub v17.4s, v3.4s, v28.4s +ldr q19, [x0, #320] +mla v30.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v28.4s +ldr q28, [x0, #448] +mul v22.4S, v22.4S,v16.s[0] +sub v14.4s, v19.4s, v5.4s +ldr q6, [x0, #336] +mul v10.4S, v10.4S,v16.s[0] +add v19.4s, v19.4s, v5.4s +ldr q5, [x0, #464] +mla v22.4S, v4.4S, v31.s[0] +nop +mla v10.4S, v20.4S, v31.s[0] +sub v20.4s, v6.4s, v8.4s +sqrdmulh v4.4S, v3.4S, v24.s[1] +add v6.4s, v6.4s, v8.4s +mul v3.4S, v3.4S,v7.s[1] +nop +sqrdmulh v8.4S, v17.4S, v24.s[2] +sub v21.4s, v13.4s, v23.4s +mul v17.4S, v17.4S,v7.s[2] +add v13.4s, v13.4s, v23.4s +sqrdmulh v24.4S, v6.4S, v11.s[1] +sub v7.4s, v15.4s, v30.4s +mul v6.4S, v6.4S,v27.s[1] +add v15.4s, v15.4s, v30.4s +sqrdmulh v30.4S, v20.4S, v11.s[2] +sub v23.4s, v28.4s, v22.4s +mul v20.4S, v20.4S,v27.s[2] +add v28.4s, v28.4s, v22.4s +mla v3.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v10.4s +ldr q11, [x0, #736] +sqrdmulh v27.4S, v15.4S, v26.s[1] +add v5.4s, v5.4s, v10.4s +mla v17.4S, v8.4S, v31.s[0] +ldr q8, [x0, #672] +sqrdmulh v10.4S, v7.4S, v26.s[2] +sub v22.4s, v25.4s, v3.4s +mla v6.4S, v24.4S, v31.s[0] +ldr q24, [x0, #544] +sqrdmulh v29.4S, v5.4S, v1.s[1] +add v25.4s, v25.4s, v3.4s +str q22, [x0, #272] +mla v20.4S, v30.4S, v31.s[0] +ldr q30, [x17, #+384] +ldr q22, [x17, #+400] +sqrdmulh v3.4S, v4.4S, v1.s[2] +sub v18.4s, v2.4s, v17.4s 
+str q25, [x0, #256] +mul v15.4S, v15.4S,v9.s[1] +add v2.4s, v2.4s, v17.4s +mul v7.4S, v7.4S,v9.s[2] +str q18, [x0, #304] +mla v15.4S, v27.4S, v31.s[0] +sub v27.4s, v19.4s, v6.4s +mla v7.4S, v10.4S, v31.s[0] +str q2, [x0, #288] +mul v5.4S, v5.4S,v16.s[1] +str q27, [x0, #336] +mul v4.4S, v4.4S,v16.s[2] +add v19.4s, v19.4s, v6.4s +str q19, [x0, #320] +mla v5.4S, v29.4S, v31.s[0] +sub v29.4s, v14.4s, v20.4s +str q29, [x0, #368] +mla v4.4S, v3.4S, v31.s[0] +add v14.4s, v14.4s, v20.4s +str q14, [x0, #352] +sqrdmulh v1.4S, v24.4S, v22.s[0] +sub v16.4s, v13.4s, v15.4s +mul v24.4S, v24.4S,v30.s[0] +str q16, [x0, #400] +ldr q16, [x0, #560] +sqrdmulh v14.4S, v16.4S, v22.s[0] +add v13.4s, v13.4s, v15.4s +mul v16.4S, v16.4S,v30.s[0] +str q13, [x0, #384] +ldr q13, [x17, #+416] +ldr q15, [x17, #+432] +ldr q20, [x0, #608] +sqrdmulh v3.4S, v20.4S, v15.s[0] +sub v29.4s, v21.4s, v7.4s +mul v20.4S, v20.4S,v13.s[0] +str q29, [x0, #432] +ldr q29, [x0, #624] +sqrdmulh v19.4S, v29.4S, v15.s[0] +add v21.4s, v21.4s, v7.4s +mul v29.4S, v29.4S,v13.s[0] +str q21, [x0, #416] +ldr q21, [x17, #+448] +ldr q7, [x17, #+464] +mla v24.4S, v1.4S, v31.s[0] +sub v1.4s, v28.4s, v5.4s +sqrdmulh v6.4S, v8.4S, v7.s[0] +str q1, [x0, #464] +ldr q1, [x0, #688] +mla v16.4S, v14.4S, v31.s[0] +add v28.4s, v28.4s, v5.4s +sqrdmulh v5.4S, v1.4S, v7.s[0] +str q28, [x0, #448] +ldr q28, [x17, #+480] +ldr q14, [x17, #+496] +mla v20.4S, v3.4S, v31.s[0] +sub v3.4s, v23.4s, v4.4s +sqrdmulh v27.4S, v11.4S, v14.s[0] +str q3, [x0, #496] +ldr q3, [x0, #752] +mla v29.4S, v19.4S, v31.s[0] +add v23.4s, v23.4s, v4.4s +sqrdmulh v4.4S, v3.4S, v14.s[0] +str q23, [x0, #480] +ldr q23, [x0, #512] +ldr q19, [x0, #640] +mul v8.4S, v8.4S,v21.s[0] +sub v26.4s, v23.4s, v24.4s +ldr q9, [x0, #528] +mul v1.4S, v1.4S,v21.s[0] +add v23.4s, v23.4s, v24.4s +ldr q24, [x0, #656] +mla v8.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v16.4s +ldr q2, [x0, #576] +mla v1.4S, v5.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +ldr q16, [x0, #704] +mul v11.4S, 
v11.4S,v28.s[0] +sub v5.4s, v2.4s, v20.4s +ldr q10, [x0, #592] +mul v3.4S, v3.4S,v28.s[0] +add v2.4s, v2.4s, v20.4s +ldr q20, [x0, #720] +mla v11.4S, v27.4S, v31.s[0] +nop +mla v3.4S, v4.4S, v31.s[0] +sub v4.4s, v10.4s, v29.4s +sqrdmulh v27.4S, v9.4S, v22.s[1] +add v10.4s, v10.4s, v29.4s +mul v9.4S, v9.4S,v30.s[1] +nop +sqrdmulh v29.4S, v6.4S, v22.s[2] +sub v18.4s, v19.4s, v8.4s +mul v6.4S, v6.4S,v30.s[2] +add v19.4s, v19.4s, v8.4s +sqrdmulh v22.4S, v10.4S, v15.s[1] +sub v30.4s, v24.4s, v1.4s +mul v10.4S, v10.4S,v13.s[1] +add v24.4s, v24.4s, v1.4s +sqrdmulh v1.4S, v4.4S, v15.s[2] +sub v8.4s, v16.4s, v11.4s +mul v4.4S, v4.4S,v13.s[2] +add v16.4s, v16.4s, v11.4s +mla v9.4S, v27.4S, v31.s[0] +sub v27.4s, v20.4s, v3.4s +ldr q15, [x0, #992] +sqrdmulh v13.4S, v24.4S, v7.s[1] +add v20.4s, v20.4s, v3.4s +mla v6.4S, v29.4S, v31.s[0] +ldr q29, [x0, #928] +sqrdmulh v3.4S, v30.4S, v7.s[2] +sub v11.4s, v23.4s, v9.4s +mla v10.4S, v22.4S, v31.s[0] +ldr q22, [x0, #800] +sqrdmulh v17.4S, v20.4S, v14.s[1] +add v23.4s, v23.4s, v9.4s +str q11, [x0, #528] +mla v4.4S, v1.4S, v31.s[0] +ldr q1, [x17, #+512] +ldr q11, [x17, #+528] +sqrdmulh v9.4S, v27.4S, v14.s[2] +sub v25.4s, v26.4s, v6.4s +str q23, [x0, #512] +mul v24.4S, v24.4S,v21.s[1] +add v26.4s, v26.4s, v6.4s +mul v30.4S, v30.4S,v21.s[2] +str q25, [x0, #560] +mla v24.4S, v13.4S, v31.s[0] +sub v13.4s, v2.4s, v10.4s +mla v30.4S, v3.4S, v31.s[0] +str q26, [x0, #544] +mul v20.4S, v20.4S,v28.s[1] +str q13, [x0, #592] +mul v27.4S, v27.4S,v28.s[2] +add v2.4s, v2.4s, v10.4s +str q2, [x0, #576] +mla v20.4S, v17.4S, v31.s[0] +sub v17.4s, v5.4s, v4.4s +str q17, [x0, #624] +mla v27.4S, v9.4S, v31.s[0] +add v5.4s, v5.4s, v4.4s +str q5, [x0, #608] +sqrdmulh v14.4S, v22.4S, v11.s[0] +sub v28.4s, v19.4s, v24.4s +mul v22.4S, v22.4S,v1.s[0] +str q28, [x0, #656] +ldr q28, [x0, #816] +sqrdmulh v5.4S, v28.4S, v11.s[0] +add v19.4s, v19.4s, v24.4s +mul v28.4S, v28.4S,v1.s[0] +str q19, [x0, #640] +ldr q19, [x17, #+544] +ldr q24, [x17, #+560] +ldr q4, [x0, 
#864] +sqrdmulh v9.4S, v4.4S, v24.s[0] +sub v17.4s, v18.4s, v30.4s +mul v4.4S, v4.4S,v19.s[0] +str q17, [x0, #688] +ldr q17, [x0, #880] +sqrdmulh v2.4S, v17.4S, v24.s[0] +add v18.4s, v18.4s, v30.4s +mul v17.4S, v17.4S,v19.s[0] +str q18, [x0, #672] +ldr q18, [x17, #+576] +ldr q30, [x17, #+592] +mla v22.4S, v14.4S, v31.s[0] +sub v14.4s, v16.4s, v20.4s +sqrdmulh v10.4S, v29.4S, v30.s[0] +str q14, [x0, #720] +ldr q14, [x0, #944] +mla v28.4S, v5.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v14.4S, v30.s[0] +str q16, [x0, #704] +ldr q16, [x17, #+608] +ldr q5, [x17, #+624] +mla v4.4S, v9.4S, v31.s[0] +sub v9.4s, v8.4s, v27.4s +sqrdmulh v13.4S, v15.4S, v5.s[0] +str q9, [x0, #752] +ldr q9, [x0, #1008] +mla v17.4S, v2.4S, v31.s[0] +add v8.4s, v8.4s, v27.4s +sqrdmulh v27.4S, v9.4S, v5.s[0] +str q8, [x0, #736] +ldr q8, [x0, #768] +ldr q2, [x0, #896] +mul v29.4S, v29.4S,v18.s[0] +sub v7.4s, v8.4s, v22.4s +ldr q21, [x0, #784] +mul v14.4S, v14.4S,v18.s[0] +add v8.4s, v8.4s, v22.4s +ldr q22, [x0, #912] +mla v29.4S, v10.4S, v31.s[0] +sub v10.4s, v21.4s, v28.4s +ldr q26, [x0, #832] +mla v14.4S, v20.4S, v31.s[0] +add v21.4s, v21.4s, v28.4s +ldr q28, [x0, #960] +mul v15.4S, v15.4S,v16.s[0] +sub v20.4s, v26.4s, v4.4s +ldr q3, [x0, #848] +mul v9.4S, v9.4S,v16.s[0] +add v26.4s, v26.4s, v4.4s +ldr q4, [x0, #976] +mla v15.4S, v13.4S, v31.s[0] +nop +mla v9.4S, v27.4S, v31.s[0] +sub v27.4s, v3.4s, v17.4s +sqrdmulh v13.4S, v21.4S, v11.s[1] +add v3.4s, v3.4s, v17.4s +mul v21.4S, v21.4S,v1.s[1] +nop +sqrdmulh v17.4S, v10.4S, v11.s[2] +sub v25.4s, v2.4s, v29.4s +mul v10.4S, v10.4S,v1.s[2] +add v2.4s, v2.4s, v29.4s +sqrdmulh v11.4S, v3.4S, v24.s[1] +sub v1.4s, v22.4s, v14.4s +mul v3.4S, v3.4S,v19.s[1] +add v22.4s, v22.4s, v14.4s +sqrdmulh v14.4S, v27.4S, v24.s[2] +sub v29.4s, v28.4s, v15.4s +mul v27.4S, v27.4S,v19.s[2] +add v28.4s, v28.4s, v15.4s +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v4.4s, v9.4s +sqrdmulh v24.4S, v22.4S, v30.s[1] +add v4.4s, v4.4s, v9.4s +mla v10.4S, v17.4S, 
v31.s[0] +sqrdmulh v17.4S, v1.4S, v30.s[2] +sub v9.4s, v8.4s, v21.4s +mla v3.4S, v11.4S, v31.s[0] +sqrdmulh v11.4S, v4.4S, v5.s[1] +add v8.4s, v8.4s, v21.4s +str q9, [x0, #784] +mla v27.4S, v14.4S, v31.s[0] +sqrdmulh v14.4S, v13.4S, v5.s[2] +sub v9.4s, v7.4s, v10.4s +str q8, [x0, #768] +mul v22.4S, v22.4S,v18.s[1] +add v7.4s, v7.4s, v10.4s +mul v1.4S, v1.4S,v18.s[2] +str q9, [x0, #816] +mla v22.4S, v24.4S, v31.s[0] +sub v24.4s, v26.4s, v3.4s +mla v1.4S, v17.4S, v31.s[0] +str q7, [x0, #800] +mul v4.4S, v4.4S,v16.s[1] +str q24, [x0, #848] +mul v13.4S, v13.4S,v16.s[2] +add v26.4s, v26.4s, v3.4s +str q26, [x0, #832] +mla v4.4S, v11.4S, v31.s[0] +sub v11.4s, v20.4s, v27.4s +str q11, [x0, #880] +mla v13.4S, v14.4S, v31.s[0] +add v20.4s, v20.4s, v27.4s +str q20, [x0, #864] +sub v5.4s, v2.4s, v22.4s +str q5, [x0, #912] +add v2.4s, v2.4s, v22.4s +str q2, [x0, #896] +sub v2.4s, v25.4s, v1.4s +str q2, [x0, #944] +add v25.4s, v25.4s, v1.4s +str q25, [x0, #928] +sub v25.4s, v28.4s, v4.4s +str q25, [x0, #976] +add v28.4s, v28.4s, v4.4s +str q28, [x0, #960] +sub v28.4s, v29.4s, v13.4s +str q28, [x0, #1008] +add v29.4s, v29.4s, v13.4s +str q29, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1528 +// Instruction count: 1524 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_24_z4_0.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_24_z4_0.s new file mode 100644 index 0000000..cf16f9a --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_24_z4_0.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: 
MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// +#include +modulus: // Lane 0 holds -33556993 and lanes 1 to 3 are zero; the routine in this file loads this vector into v31 and uses v31.s[0] as the multiplier of every mla +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 // 2^6 = 64-byte alignment for the table that follows +roots_merged: // Laid out as alternating 4-word groups: first the constants the routine passes to mul, then the matching constants it passes to sqrdmulh (these look like the same roots scaled by roughly 2^31/33556993; TODO confirm against the generator). The zero words tagged Layer None are never indexed in the visible code and appear to be padding +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global
ntt_u32_incomplete_neon_asm_var_4_2_24_z4_0 +.global _ntt_u32_incomplete_neon_asm_var_4_2_24_z4_0 +ntt_u32_incomplete_neon_asm_var_4_2_24_z4_0: +_ntt_u32_incomplete_neon_asm_var_4_2_24_z4_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +ldr q2, [x0, #544] +ldr q1, [x0, #608] +ldr q0, [x0, #672] +ldr q15, [x0, #736] +ldr q14, [x0, #32] +ldr q13, [x0, #96] +ldr q12, [x0, #160] +ldr q11, [x0, #224] +sqrdmulh v10.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +mla v22.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +mla v21.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v20.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +mla v19.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +mla v2.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +mla v1.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v0.4S, v29.s[0] +mul v0.4S, v0.4S,v30.s[0] +mla v0.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +mla v15.4S, v10.4S, v31.s[0] +sub v10.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +sub v22.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s 
+sub v21.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +sub v20.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sub v19.4s, v14.4s, v2.4s +add v14.4s, v14.4s, v2.4s +sub v2.4s, v13.4s, v1.4s +add v13.4s, v13.4s, v1.4s +sub v1.4s, v12.4s, v0.4s +add v12.4s, v12.4s, v0.4s +sub v0.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +mla v16.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v3.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +mla v18.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +mla v17.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +mla v21.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +mla v20.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +mla v10.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +mla v22.4S, v15.4S, v31.s[0] +sub v15.4s, v12.4s, v16.4s +add v12.4s, v12.4s, v16.4s +sub v16.4s, v11.4s, v3.4s +add v11.4s, v11.4s, v3.4s +sub v3.4s, v14.4s, v18.4s +add v14.4s, v14.4s, v18.4s +sub v18.4s, v13.4s, v17.4s +add v13.4s, v13.4s, v17.4s +sub v17.4s, v1.4s, v21.4s +add v1.4s, v1.4s, v21.4s +sub v21.4s, v0.4s, v20.4s +add v0.4s, v0.4s, v20.4s +sub v20.4s, v19.4s, v10.4s +add v19.4s, v19.4s, v10.4s +sub v10.4s, v2.4s, v22.4s +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +mla v12.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v11.4S, v27.s[0] +mul v11.4S, v11.4S,v28.s[0] +mla v11.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +mla v15.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +mla v16.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +mla v1.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v0.4S, 
v27.s[2] +mul v0.4S, v0.4S,v28.s[2] +mla v0.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +mla v17.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +sub v12.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sub v11.4s, v3.4s, v15.4s +add v3.4s, v3.4s, v15.4s +sub v15.4s, v18.4s, v16.4s +add v18.4s, v18.4s, v16.4s +sub v16.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sub v1.4s, v2.4s, v0.4s +add v2.4s, v2.4s, v0.4s +sub v0.4s, v20.4s, v17.4s +add v20.4s, v20.4s, v17.4s +sub v17.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v25.s[0] +mul v13.4S, v13.4S,v26.s[0] +mla v13.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +mla v12.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v18.4S, v25.s[2] +mul v18.4S, v18.4S,v26.s[2] +mla v18.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v15.4S, v25.s[3] +mul v15.4S, v15.4S,v26.s[3] +mla v15.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v2.4S, v23.s[0] +mul v2.4S, v2.4S,v24.s[0] +mla v2.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v1.4S, v23.s[1] +mul v1.4S, v1.4S,v24.s[1] +mla v1.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v10.4S, v23.s[2] +mul v10.4S, v10.4S,v24.s[2] +mla v10.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +sub v13.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sub v12.4s, v3.4s, v18.4s +add v3.4s, v3.4s, v18.4s +sub v18.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sub v15.4s, v19.4s, v2.4s +add v19.4s, v19.4s, v2.4s +sub v2.4s, v16.4s, v1.4s +add v16.4s, v16.4s, v1.4s +sub v1.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +sub v10.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +str q14, [x0, #32] +str q21, [x0, #96] +str q22, [x0, #160] +str q13, [x0, #224] +str q3, [x0, #288] +str q12, [x0, #352] +str 
q11, [x0, #416] +str q18, [x0, #480] +str q19, [x0, #544] +str q15, [x0, #608] +str q16, [x0, #672] +str q2, [x0, #736] +str q20, [x0, #800] +str q1, [x0, #864] +str q0, [x0, #928] +str q10, [x0, #992] +ldr q10, [x0, #816] +ldr q0, [x0, #880] +ldr q1, [x0, #944] +ldr q20, [x0, #1008] +ldr q2, [x0, #304] +ldr q16, [x0, #368] +ldr q15, [x0, #432] +ldr q19, [x0, #496] +ldr q18, [x0, #560] +ldr q11, [x0, #624] +ldr q12, [x0, #688] +ldr q3, [x0, #752] +ldr q13, [x0, #48] +ldr q22, [x0, #112] +ldr q21, [x0, #176] +ldr q14, [x0, #240] +sqrdmulh v17.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +mla v10.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v0.4S, v29.s[0] +mul v0.4S, v0.4S,v30.s[0] +mla v0.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +mla v1.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v20.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +mla v18.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +mla v11.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +mla v12.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +mla v3.4S, v17.4S, v31.s[0] +sub v17.4s, v2.4s, v10.4s +add v2.4s, v2.4s, v10.4s +sub v10.4s, v16.4s, v0.4s +add v16.4s, v16.4s, v0.4s +sub v0.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s +sub v1.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +sub v20.4s, v13.4s, v18.4s +add v13.4s, v13.4s, v18.4s +sub v18.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +sub v11.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sub v12.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sqrdmulh v3.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +mla v15.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +mla v19.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v2.4S, v29.s[1] +mul v2.4S, v2.4S,v30.s[1] +mla v2.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, 
v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +mla v16.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +mla v0.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +mla v1.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +mla v17.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +mla v10.4S, v3.4S, v31.s[0] +sub v3.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +sub v15.4s, v14.4s, v19.4s +add v14.4s, v14.4s, v19.4s +sub v19.4s, v13.4s, v2.4s +add v13.4s, v13.4s, v2.4s +sub v2.4s, v22.4s, v16.4s +add v22.4s, v22.4s, v16.4s +sub v16.4s, v11.4s, v0.4s +add v11.4s, v11.4s, v0.4s +sub v0.4s, v12.4s, v1.4s +add v12.4s, v12.4s, v1.4s +sub v1.4s, v20.4s, v17.4s +add v20.4s, v20.4s, v17.4s +sub v17.4s, v18.4s, v10.4s +add v18.4s, v18.4s, v10.4s +sqrdmulh v10.4S, v21.4S, v27.s[0] +mul v21.4S, v21.4S,v28.s[0] +mla v21.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +mla v14.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +mla v3.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +mla v15.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +mla v11.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +mla v12.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v16.4S, v27.s[3] +mul v16.4S, v16.4S,v28.s[3] +mla v16.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v0.4S, v27.s[3] +mul v0.4S, v0.4S,v28.s[3] +mla v0.4S, v10.4S, v31.s[0] +sub v10.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sub v21.4s, v22.4s, v14.4s +add v22.4s, v22.4s, v14.4s +sub v14.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sub v3.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +sub v15.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +sub v11.4s, v18.4s, v12.4s +add v18.4s, v18.4s, v12.4s +sub v12.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s 
+sub v16.4s, v17.4s, v0.4s +add v17.4s, v17.4s, v0.4s +sqrdmulh v0.4S, v22.4S, v25.s[0] +mul v22.4S, v22.4S,v26.s[0] +mla v22.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v21.4S, v25.s[1] +mul v21.4S, v21.4S,v26.s[1] +mla v21.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v2.4S, v25.s[2] +mul v2.4S, v2.4S,v26.s[2] +mla v2.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v3.4S, v25.s[3] +mul v3.4S, v3.4S,v26.s[3] +mla v3.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v18.4S, v23.s[0] +mul v18.4S, v18.4S,v24.s[0] +mla v18.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v11.4S, v23.s[1] +mul v11.4S, v11.4S,v24.s[1] +mla v11.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v17.4S, v23.s[2] +mul v17.4S, v17.4S,v24.s[2] +mla v17.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v16.4S, v23.s[3] +mul v16.4S, v16.4S,v24.s[3] +mla v16.4S, v0.4S, v31.s[0] +sub v0.4s, v13.4s, v22.4s +add v13.4s, v13.4s, v22.4s +sub v22.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sub v21.4s, v19.4s, v2.4s +add v19.4s, v19.4s, v2.4s +sub v2.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sub v3.4s, v20.4s, v18.4s +add v20.4s, v20.4s, v18.4s +sub v18.4s, v15.4s, v11.4s +add v15.4s, v15.4s, v11.4s +sub v11.4s, v1.4s, v17.4s +add v1.4s, v1.4s, v17.4s +sub v17.4s, v12.4s, v16.4s +add v12.4s, v12.4s, v16.4s +str q13, [x0, #48] +str q0, [x0, #112] +str q10, [x0, #176] +str q22, [x0, #240] +str q19, [x0, #304] +str q21, [x0, #368] +str q14, [x0, #432] +str q2, [x0, #496] +str q20, [x0, #560] +str q3, [x0, #624] +str q15, [x0, #688] +str q18, [x0, #752] +str q1, [x0, #816] +str q11, [x0, #880] +str q12, [x0, #944] +str q17, [x0, #1008] +ldr q17, [x0, #768] +ldr q12, [x0, #832] +ldr q11, [x0, #896] +ldr q1, [x0, #960] +ldr q18, [x0, #256] +ldr q15, [x0, #320] +ldr q3, [x0, #384] +ldr q20, [x0, #448] +ldr q2, [x0, #512] +ldr q14, [x0, #576] +ldr q21, [x0, #640] +ldr q19, [x0, #704] +ldr q22, [x0, #0] +ldr q10, [x0, #64] +ldr q0, [x0, #128] +ldr q13, [x0, #192] +sqrdmulh v16.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] +mla v17.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, 
v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +mla v12.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +mla v11.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +mla v1.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +mla v2.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +mla v14.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +mla v21.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +mla v19.4S, v16.4S, v31.s[0] +sub v16.4s, v18.4s, v17.4s +add v18.4s, v18.4s, v17.4s +sub v17.4s, v15.4s, v12.4s +add v15.4s, v15.4s, v12.4s +sub v12.4s, v3.4s, v11.4s +add v3.4s, v3.4s, v11.4s +sub v11.4s, v20.4s, v1.4s +add v20.4s, v20.4s, v1.4s +sub v1.4s, v22.4s, v2.4s +add v22.4s, v22.4s, v2.4s +sub v2.4s, v10.4s, v14.4s +add v10.4s, v10.4s, v14.4s +sub v14.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +sub v21.4s, v13.4s, v19.4s +add v13.4s, v13.4s, v19.4s +sqrdmulh v19.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v3.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +mla v20.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +mla v18.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +mla v15.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +mla v12.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +mla v11.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +mla v16.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +mla v17.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v3.4s +add v0.4s, v0.4s, v3.4s +sub v3.4s, v13.4s, v20.4s +add v13.4s, v13.4s, v20.4s +sub v20.4s, v22.4s, v18.4s +add v22.4s, v22.4s, v18.4s +sub v18.4s, v10.4s, 
v15.4s +add v10.4s, v10.4s, v15.4s +sub v15.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +sub v12.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sub v11.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +sub v16.4s, v2.4s, v17.4s +add v2.4s, v2.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[0] +mul v0.4S, v0.4S,v28.s[0] +mla v0.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +mla v13.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +mla v19.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +mla v3.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v14.4S, v27.s[2] +mul v14.4S, v14.4S,v28.s[2] +mla v14.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +mla v21.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +mla v15.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v12.4S, v27.s[3] +mul v12.4S, v12.4S,v28.s[3] +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v0.4s +add v22.4s, v22.4s, v0.4s +sub v0.4s, v10.4s, v13.4s +add v10.4s, v10.4s, v13.4s +sub v13.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +sub v19.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sub v3.4s, v1.4s, v14.4s +add v1.4s, v1.4s, v14.4s +sub v14.4s, v2.4s, v21.4s +add v2.4s, v2.4s, v21.4s +sub v21.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sub v15.4s, v16.4s, v12.4s +add v16.4s, v16.4s, v12.4s +sqrdmulh v12.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +mla v10.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v0.4S, v25.s[1] +mul v0.4S, v0.4S,v26.s[1] +mla v0.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v18.4S, v25.s[2] +mul v18.4S, v18.4S,v26.s[2] +mla v18.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v19.4S, v25.s[3] +mul v19.4S, v19.4S,v26.s[3] +mla v19.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v2.4S, v23.s[0] +mul v2.4S, v2.4S,v24.s[0] +mla v2.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v14.4S, v23.s[1] +mul v14.4S, v14.4S,v24.s[1] +mla v14.4S, v12.4S, v31.s[0] +sqrdmulh 
v12.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +mla v16.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v15.4S, v23.s[3] +mul v15.4S, v15.4S,v24.s[3] +mla v15.4S, v12.4S, v31.s[0] +sub v12.4s, v22.4s, v10.4s +add v22.4s, v22.4s, v10.4s +sub v10.4s, v17.4s, v0.4s +add v17.4s, v17.4s, v0.4s +sub v0.4s, v20.4s, v18.4s +add v20.4s, v20.4s, v18.4s +sub v18.4s, v13.4s, v19.4s +add v13.4s, v13.4s, v19.4s +sub v19.4s, v1.4s, v2.4s +add v1.4s, v1.4s, v2.4s +sub v2.4s, v3.4s, v14.4s +add v3.4s, v3.4s, v14.4s +sub v14.4s, v11.4s, v16.4s +add v11.4s, v11.4s, v16.4s +sub v16.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +str q22, [x0, #0] +str q12, [x0, #64] +str q17, [x0, #128] +str q10, [x0, #192] +str q20, [x0, #256] +str q0, [x0, #320] +str q13, [x0, #384] +str q18, [x0, #448] +str q1, [x0, #512] +str q19, [x0, #576] +str q3, [x0, #640] +str q2, [x0, #704] +str q11, [x0, #768] +str q14, [x0, #832] +str q21, [x0, #896] +str q16, [x0, #960] +ldr q16, [x0, #784] +ldr q21, [x0, #848] +ldr q14, [x0, #912] +ldr q11, [x0, #976] +ldr q2, [x0, #272] +ldr q3, [x0, #336] +ldr q19, [x0, #400] +ldr q1, [x0, #464] +ldr q18, [x0, #528] +ldr q13, [x0, #592] +ldr q0, [x0, #656] +ldr q20, [x0, #720] +ldr q10, [x0, #16] +ldr q17, [x0, #80] +ldr q12, [x0, #144] +ldr q22, [x0, #208] +sqrdmulh v15.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v16.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +mla v21.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +mla v14.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +mla v11.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +mla v18.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +mla v13.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v0.4S, v29.s[0] +mul v0.4S, v0.4S,v30.s[0] +mla v0.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla 
v20.4S, v15.4S, v31.s[0] +sub v15.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sub v16.4s, v3.4s, v21.4s +add v3.4s, v3.4s, v21.4s +sub v21.4s, v19.4s, v14.4s +add v19.4s, v19.4s, v14.4s +sub v14.4s, v1.4s, v11.4s +add v1.4s, v1.4s, v11.4s +sub v11.4s, v10.4s, v18.4s +add v10.4s, v10.4s, v18.4s +sub v18.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +sub v13.4s, v12.4s, v0.4s +add v12.4s, v12.4s, v0.4s +sub v0.4s, v22.4s, v20.4s +add v22.4s, v22.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +mla v19.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v1.4S, v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +mla v1.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v2.4S, v29.s[1] +mul v2.4S, v2.4S,v30.s[1] +mla v2.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v3.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +mla v21.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +mla v14.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +mla v15.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +mla v16.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v19.4s +add v12.4s, v12.4s, v19.4s +sub v19.4s, v22.4s, v1.4s +add v22.4s, v22.4s, v1.4s +sub v1.4s, v10.4s, v2.4s +add v10.4s, v10.4s, v2.4s +sub v2.4s, v17.4s, v3.4s +add v17.4s, v17.4s, v3.4s +sub v3.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sub v21.4s, v0.4s, v14.4s +add v0.4s, v0.4s, v14.4s +sub v14.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sub v15.4s, v18.4s, v16.4s +add v18.4s, v18.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +mla v12.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +mla v22.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +mla v20.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +mla v19.4S, 
v16.4S, v31.s[0] +sqrdmulh v16.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +mla v13.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v0.4S, v27.s[2] +mul v0.4S, v0.4S,v28.s[2] +mla v0.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +mla v3.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +mla v21.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +sub v12.4s, v17.4s, v22.4s +add v17.4s, v17.4s, v22.4s +sub v22.4s, v1.4s, v20.4s +add v1.4s, v1.4s, v20.4s +sub v20.4s, v2.4s, v19.4s +add v2.4s, v2.4s, v19.4s +sub v19.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sub v13.4s, v18.4s, v0.4s +add v18.4s, v18.4s, v0.4s +sub v0.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sub v3.4s, v15.4s, v21.4s +add v15.4s, v15.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v25.s[0] +mul v17.4S, v17.4S,v26.s[0] +mla v17.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +mla v12.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v2.4S, v25.s[2] +mul v2.4S, v2.4S,v26.s[2] +mla v2.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v20.4S, v25.s[3] +mul v20.4S, v20.4S,v26.s[3] +mla v20.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v18.4S, v23.s[0] +mul v18.4S, v18.4S,v24.s[0] +mla v18.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v13.4S, v23.s[1] +mul v13.4S, v13.4S,v24.s[1] +mla v13.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v15.4S, v23.s[2] +mul v15.4S, v15.4S,v24.s[2] +mla v15.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v3.4S, v23.s[3] +mul v3.4S, v3.4S,v24.s[3] +mla v3.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sub v17.4s, v16.4s, v12.4s +add v16.4s, v16.4s, v12.4s +sub v12.4s, v1.4s, v2.4s +add v1.4s, v1.4s, v2.4s +sub v2.4s, v22.4s, v20.4s +add v22.4s, v22.4s, v20.4s +sub v20.4s, v11.4s, v18.4s +add v11.4s, v11.4s, v18.4s +sub v18.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +sub v13.4s, v14.4s, v15.4s +add v14.4s, v14.4s, v15.4s +sub v15.4s, v0.4s, v3.4s +add v0.4s, v0.4s, 
v3.4s +str q10, [x0, #16] +str q21, [x0, #80] +str q16, [x0, #144] +str q17, [x0, #208] +str q1, [x0, #272] +str q12, [x0, #336] +str q22, [x0, #400] +str q2, [x0, #464] +str q11, [x0, #528] +str q20, [x0, #592] +str q19, [x0, #656] +str q18, [x0, #720] +str q14, [x0, #784] +str q13, [x0, #848] +str q0, [x0, #912] +str q15, [x0, #976] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q6, [x17, #+160] +ldr q7, [x17, #+176] +ldr q8, [x17, #+192] +ldr q9, [x17, #+208] +ldr q3, [x17, #+224] +ldr q10, [x17, #+240] +ldr q21, [x0, #32] +ldr q16, [x0, #48] +ldr q17, [x0, #0] +ldr q1, [x0, #16] +sqrdmulh v12.4S, v21.4S, v5.s[0] +mul v21.4S, v21.4S,v4.s[0] +mla v21.4S, v12.4S, v31.s[0] +sub v12.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v16.4S, v5.s[0] +mul v16.4S, v16.4S,v4.s[0] +mla v16.4S, v21.4S, v31.s[0] +sub v21.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +ldr q16, [x17, #+256] +ldr q22, [x17, #+272] +sqrdmulh v2.4S, v1.4S, v5.s[1] +mul v1.4S, v1.4S,v4.s[1] +mla v1.4S, v2.4S, v31.s[0] +sub v2.4s, v17.4s, v1.4s +add v17.4s, v17.4s, v1.4s +sqrdmulh v1.4S, v21.4S, v5.s[2] +mul v21.4S, v21.4S,v4.s[2] +mla v21.4S, v1.4S, v31.s[0] +sub v1.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +str q17, [x0, #0] +str q2, [x0, #16] +str q12, [x0, #32] +str q1, [x0, #48] +ldr q1, [x0, #96] +ldr q12, [x0, #112] +ldr q2, [x0, #64] +ldr q17, [x0, #80] +sqrdmulh v21.4S, v1.4S, v7.s[0] +mul v1.4S, v1.4S,v6.s[0] +mla v1.4S, v21.4S, v31.s[0] +sub v21.4s, v2.4s, v1.4s +add v2.4s, v2.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v7.s[0] +mul v12.4S, v12.4S,v6.s[0] +mla v12.4S, v1.4S, v31.s[0] +sub v1.4s, v17.4s, v12.4s +add v17.4s, v17.4s, v12.4s +ldr q12, [x17, #+288] +ldr q11, [x17, #+304] +sqrdmulh v20.4S, v17.4S, v7.s[1] +mul v17.4S, v17.4S,v6.s[1] +mla v17.4S, v20.4S, v31.s[0] +sub v20.4s, v2.4s, v17.4s +add v2.4s, v2.4s, v17.4s +sqrdmulh v17.4S, v1.4S, v7.s[2] +mul v1.4S, v1.4S,v6.s[2] +mla v1.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v1.4s +add v21.4s, v21.4s, v1.4s +str q2, 
[x0, #64] +str q20, [x0, #80] +str q21, [x0, #96] +str q17, [x0, #112] +ldr q17, [x0, #160] +ldr q21, [x0, #176] +ldr q20, [x0, #128] +ldr q2, [x0, #144] +sqrdmulh v1.4S, v17.4S, v9.s[0] +mul v17.4S, v17.4S,v8.s[0] +mla v17.4S, v1.4S, v31.s[0] +sub v1.4s, v20.4s, v17.4s +add v20.4s, v20.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v9.s[0] +mul v21.4S, v21.4S,v8.s[0] +mla v21.4S, v17.4S, v31.s[0] +sub v17.4s, v2.4s, v21.4s +add v2.4s, v2.4s, v21.4s +ldr q21, [x17, #+320] +ldr q19, [x17, #+336] +sqrdmulh v18.4S, v2.4S, v9.s[1] +mul v2.4S, v2.4S,v8.s[1] +mla v2.4S, v18.4S, v31.s[0] +sub v18.4s, v20.4s, v2.4s +add v20.4s, v20.4s, v2.4s +sqrdmulh v2.4S, v17.4S, v9.s[2] +mul v17.4S, v17.4S,v8.s[2] +mla v17.4S, v2.4S, v31.s[0] +sub v2.4s, v1.4s, v17.4s +add v1.4s, v1.4s, v17.4s +str q20, [x0, #128] +str q18, [x0, #144] +str q1, [x0, #160] +str q2, [x0, #176] +ldr q2, [x0, #224] +ldr q1, [x0, #240] +ldr q18, [x0, #192] +ldr q20, [x0, #208] +sqrdmulh v17.4S, v2.4S, v10.s[0] +mul v2.4S, v2.4S,v3.s[0] +mla v2.4S, v17.4S, v31.s[0] +sub v17.4s, v18.4s, v2.4s +add v18.4s, v18.4s, v2.4s +sqrdmulh v2.4S, v1.4S, v10.s[0] +mul v1.4S, v1.4S,v3.s[0] +mla v1.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v1.4s +add v20.4s, v20.4s, v1.4s +ldr q1, [x17, #+352] +ldr q14, [x17, #+368] +sqrdmulh v13.4S, v20.4S, v10.s[1] +mul v20.4S, v20.4S,v3.s[1] +mla v20.4S, v13.4S, v31.s[0] +sub v13.4s, v18.4s, v20.4s +add v18.4s, v18.4s, v20.4s +sqrdmulh v20.4S, v2.4S, v10.s[2] +mul v2.4S, v2.4S,v3.s[2] +mla v2.4S, v20.4S, v31.s[0] +sub v20.4s, v17.4s, v2.4s +add v17.4s, v17.4s, v2.4s +str q18, [x0, #192] +str q13, [x0, #208] +str q17, [x0, #224] +str q20, [x0, #240] +ldr q20, [x0, #288] +ldr q17, [x0, #304] +ldr q13, [x0, #256] +ldr q18, [x0, #272] +sqrdmulh v2.4S, v20.4S, v22.s[0] +mul v20.4S, v20.4S,v16.s[0] +mla v20.4S, v2.4S, v31.s[0] +sub v2.4s, v13.4s, v20.4s +add v13.4s, v13.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v22.s[0] +mul v17.4S, v17.4S,v16.s[0] +mla v17.4S, v20.4S, v31.s[0] +sub v20.4s, v18.4s, v17.4s +add 
v18.4s, v18.4s, v17.4s +ldr q17, [x17, #+384] +ldr q0, [x17, #+400] +sqrdmulh v15.4S, v18.4S, v22.s[1] +mul v18.4S, v18.4S,v16.s[1] +mla v18.4S, v15.4S, v31.s[0] +sub v15.4s, v13.4s, v18.4s +add v13.4s, v13.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v22.s[2] +mul v20.4S, v20.4S,v16.s[2] +mla v20.4S, v18.4S, v31.s[0] +sub v18.4s, v2.4s, v20.4s +add v2.4s, v2.4s, v20.4s +str q13, [x0, #256] +str q15, [x0, #272] +str q2, [x0, #288] +str q18, [x0, #304] +ldr q5, [x0, #352] +ldr q4, [x0, #368] +ldr q18, [x0, #320] +ldr q2, [x0, #336] +sqrdmulh v15.4S, v5.4S, v11.s[0] +mul v5.4S, v5.4S,v12.s[0] +mla v5.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v5.4s +add v18.4s, v18.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v11.s[0] +mul v4.4S, v4.4S,v12.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v2.4s, v4.4s +add v2.4s, v2.4s, v4.4s +ldr q4, [x17, #+416] +ldr q13, [x17, #+432] +sqrdmulh v20.4S, v2.4S, v11.s[1] +mul v2.4S, v2.4S,v12.s[1] +mla v2.4S, v20.4S, v31.s[0] +sub v20.4s, v18.4s, v2.4s +add v18.4s, v18.4s, v2.4s +sqrdmulh v2.4S, v5.4S, v11.s[2] +mul v5.4S, v5.4S,v12.s[2] +mla v5.4S, v2.4S, v31.s[0] +sub v2.4s, v15.4s, v5.4s +add v15.4s, v15.4s, v5.4s +str q18, [x0, #320] +str q20, [x0, #336] +str q15, [x0, #352] +str q2, [x0, #368] +ldr q7, [x0, #416] +ldr q6, [x0, #432] +ldr q2, [x0, #384] +ldr q15, [x0, #400] +sqrdmulh v20.4S, v7.4S, v19.s[0] +mul v7.4S, v7.4S,v21.s[0] +mla v7.4S, v20.4S, v31.s[0] +sub v20.4s, v2.4s, v7.4s +add v2.4s, v2.4s, v7.4s +sqrdmulh v7.4S, v6.4S, v19.s[0] +mul v6.4S, v6.4S,v21.s[0] +mla v6.4S, v7.4S, v31.s[0] +sub v7.4s, v15.4s, v6.4s +add v15.4s, v15.4s, v6.4s +ldr q6, [x17, #+448] +ldr q18, [x17, #+464] +sqrdmulh v5.4S, v15.4S, v19.s[1] +mul v15.4S, v15.4S,v21.s[1] +mla v15.4S, v5.4S, v31.s[0] +sub v5.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +sqrdmulh v15.4S, v7.4S, v19.s[2] +mul v7.4S, v7.4S,v21.s[2] +mla v7.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v7.4s +add v20.4s, v20.4s, v7.4s +str q2, [x0, #384] +str q5, [x0, #400] +str q20, [x0, #416] +str q15, [x0, 
#432] +ldr q9, [x0, #480] +ldr q8, [x0, #496] +ldr q15, [x0, #448] +ldr q20, [x0, #464] +sqrdmulh v5.4S, v9.4S, v14.s[0] +mul v9.4S, v9.4S,v1.s[0] +mla v9.4S, v5.4S, v31.s[0] +sub v5.4s, v15.4s, v9.4s +add v15.4s, v15.4s, v9.4s +sqrdmulh v9.4S, v8.4S, v14.s[0] +mul v8.4S, v8.4S,v1.s[0] +mla v8.4S, v9.4S, v31.s[0] +sub v9.4s, v20.4s, v8.4s +add v20.4s, v20.4s, v8.4s +ldr q8, [x17, #+480] +ldr q2, [x17, #+496] +sqrdmulh v7.4S, v20.4S, v14.s[1] +mul v20.4S, v20.4S,v1.s[1] +mla v20.4S, v7.4S, v31.s[0] +sub v7.4s, v15.4s, v20.4s +add v15.4s, v15.4s, v20.4s +sqrdmulh v20.4S, v9.4S, v14.s[2] +mul v9.4S, v9.4S,v1.s[2] +mla v9.4S, v20.4S, v31.s[0] +sub v20.4s, v5.4s, v9.4s +add v5.4s, v5.4s, v9.4s +str q15, [x0, #448] +str q7, [x0, #464] +str q5, [x0, #480] +str q20, [x0, #496] +ldr q10, [x0, #544] +ldr q3, [x0, #560] +ldr q20, [x0, #512] +ldr q5, [x0, #528] +sqrdmulh v7.4S, v10.4S, v0.s[0] +mul v10.4S, v10.4S,v17.s[0] +mla v10.4S, v7.4S, v31.s[0] +sub v7.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v0.s[0] +mul v3.4S, v3.4S,v17.s[0] +mla v3.4S, v10.4S, v31.s[0] +sub v10.4s, v5.4s, v3.4s +add v5.4s, v5.4s, v3.4s +ldr q3, [x17, #+512] +ldr q15, [x17, #+528] +sqrdmulh v9.4S, v5.4S, v0.s[1] +mul v5.4S, v5.4S,v17.s[1] +mla v5.4S, v9.4S, v31.s[0] +sub v9.4s, v20.4s, v5.4s +add v20.4s, v20.4s, v5.4s +sqrdmulh v5.4S, v10.4S, v0.s[2] +mul v10.4S, v10.4S,v17.s[2] +mla v10.4S, v5.4S, v31.s[0] +sub v5.4s, v7.4s, v10.4s +add v7.4s, v7.4s, v10.4s +str q20, [x0, #512] +str q9, [x0, #528] +str q7, [x0, #544] +str q5, [x0, #560] +ldr q22, [x0, #608] +ldr q16, [x0, #624] +ldr q5, [x0, #576] +ldr q7, [x0, #592] +sqrdmulh v9.4S, v22.4S, v13.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v9.4S, v31.s[0] +sub v9.4s, v5.4s, v22.4s +add v5.4s, v5.4s, v22.4s +sqrdmulh v22.4S, v16.4S, v13.s[0] +mul v16.4S, v16.4S,v4.s[0] +mla v16.4S, v22.4S, v31.s[0] +sub v22.4s, v7.4s, v16.4s +add v7.4s, v7.4s, v16.4s +ldr q16, [x17, #+544] +ldr q20, [x17, #+560] +sqrdmulh v10.4S, v7.4S, 
v13.s[1] +mul v7.4S, v7.4S,v4.s[1] +mla v7.4S, v10.4S, v31.s[0] +sub v10.4s, v5.4s, v7.4s +add v5.4s, v5.4s, v7.4s +sqrdmulh v7.4S, v22.4S, v13.s[2] +mul v22.4S, v22.4S,v4.s[2] +mla v22.4S, v7.4S, v31.s[0] +sub v7.4s, v9.4s, v22.4s +add v9.4s, v9.4s, v22.4s +str q5, [x0, #576] +str q10, [x0, #592] +str q9, [x0, #608] +str q7, [x0, #624] +ldr q11, [x0, #672] +ldr q12, [x0, #688] +ldr q7, [x0, #640] +ldr q9, [x0, #656] +sqrdmulh v10.4S, v11.4S, v18.s[0] +mul v11.4S, v11.4S,v6.s[0] +mla v11.4S, v10.4S, v31.s[0] +sub v10.4s, v7.4s, v11.4s +add v7.4s, v7.4s, v11.4s +sqrdmulh v11.4S, v12.4S, v18.s[0] +mul v12.4S, v12.4S,v6.s[0] +mla v12.4S, v11.4S, v31.s[0] +sub v11.4s, v9.4s, v12.4s +add v9.4s, v9.4s, v12.4s +ldr q12, [x17, #+576] +ldr q5, [x17, #+592] +sqrdmulh v22.4S, v9.4S, v18.s[1] +mul v9.4S, v9.4S,v6.s[1] +mla v9.4S, v22.4S, v31.s[0] +sub v22.4s, v7.4s, v9.4s +add v7.4s, v7.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v18.s[2] +mul v11.4S, v11.4S,v6.s[2] +mla v11.4S, v9.4S, v31.s[0] +sub v9.4s, v10.4s, v11.4s +add v10.4s, v10.4s, v11.4s +str q7, [x0, #640] +str q22, [x0, #656] +str q10, [x0, #672] +str q9, [x0, #688] +ldr q19, [x0, #736] +ldr q21, [x0, #752] +ldr q9, [x0, #704] +ldr q10, [x0, #720] +sqrdmulh v22.4S, v19.4S, v2.s[0] +mul v19.4S, v19.4S,v8.s[0] +mla v19.4S, v22.4S, v31.s[0] +sub v22.4s, v9.4s, v19.4s +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v21.4S, v2.s[0] +mul v21.4S, v21.4S,v8.s[0] +mla v21.4S, v19.4S, v31.s[0] +sub v19.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +ldr q21, [x17, #+608] +ldr q7, [x17, #+624] +sqrdmulh v11.4S, v10.4S, v2.s[1] +mul v10.4S, v10.4S,v8.s[1] +mla v10.4S, v11.4S, v31.s[0] +sub v11.4s, v9.4s, v10.4s +add v9.4s, v9.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v2.s[2] +mul v19.4S, v19.4S,v8.s[2] +mla v19.4S, v10.4S, v31.s[0] +sub v10.4s, v22.4s, v19.4s +add v22.4s, v22.4s, v19.4s +str q9, [x0, #704] +str q11, [x0, #720] +str q22, [x0, #736] +str q10, [x0, #752] +ldr q14, [x0, #800] +ldr q1, [x0, #816] +ldr q10, [x0, #768] +ldr q22, [x0, 
#784] +sqrdmulh v11.4S, v14.4S, v15.s[0] +mul v14.4S, v14.4S,v3.s[0] +mla v14.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v14.4s +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v1.4S, v15.s[0] +mul v1.4S, v1.4S,v3.s[0] +mla v1.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v1.4s +add v22.4s, v22.4s, v1.4s +sqrdmulh v1.4S, v22.4S, v15.s[1] +mul v22.4S, v22.4S,v3.s[1] +mla v22.4S, v1.4S, v31.s[0] +sub v1.4s, v10.4s, v22.4s +add v10.4s, v10.4s, v22.4s +sqrdmulh v22.4S, v14.4S, v15.s[2] +mul v14.4S, v14.4S,v3.s[2] +mla v14.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +str q10, [x0, #768] +str q1, [x0, #784] +str q11, [x0, #800] +str q22, [x0, #816] +ldr q0, [x0, #864] +ldr q17, [x0, #880] +ldr q22, [x0, #832] +ldr q11, [x0, #848] +sqrdmulh v1.4S, v0.4S, v20.s[0] +mul v0.4S, v0.4S,v16.s[0] +mla v0.4S, v1.4S, v31.s[0] +sub v1.4s, v22.4s, v0.4s +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v20.s[0] +mul v17.4S, v17.4S,v16.s[0] +mla v17.4S, v0.4S, v31.s[0] +sub v0.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v20.s[1] +mul v11.4S, v11.4S,v16.s[1] +mla v11.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v0.4S, v20.s[2] +mul v0.4S, v0.4S,v16.s[2] +mla v0.4S, v11.4S, v31.s[0] +sub v11.4s, v1.4s, v0.4s +add v1.4s, v1.4s, v0.4s +str q22, [x0, #832] +str q17, [x0, #848] +str q1, [x0, #864] +str q11, [x0, #880] +ldr q13, [x0, #928] +ldr q4, [x0, #944] +ldr q11, [x0, #896] +ldr q1, [x0, #912] +sqrdmulh v17.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v12.s[0] +mla v13.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sqrdmulh v13.4S, v4.4S, v5.s[0] +mul v4.4S, v4.4S,v12.s[0] +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v1.4s, v4.4s +add v1.4s, v1.4s, v4.4s +sqrdmulh v4.4S, v1.4S, v5.s[1] +mul v1.4S, v1.4S,v12.s[1] +mla v1.4S, v4.4S, v31.s[0] +sub v4.4s, v11.4s, v1.4s +add v11.4s, v11.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v5.s[2] +mul v13.4S, v13.4S,v12.s[2] +mla 
v13.4S, v1.4S, v31.s[0] +sub v1.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +str q11, [x0, #896] +str q4, [x0, #912] +str q17, [x0, #928] +str q1, [x0, #944] +ldr q18, [x0, #992] +ldr q6, [x0, #1008] +ldr q1, [x0, #960] +ldr q17, [x0, #976] +sqrdmulh v4.4S, v18.4S, v7.s[0] +mul v18.4S, v18.4S,v21.s[0] +mla v18.4S, v4.4S, v31.s[0] +sub v4.4s, v1.4s, v18.4s +add v1.4s, v1.4s, v18.4s +sqrdmulh v18.4S, v6.4S, v7.s[0] +mul v6.4S, v6.4S,v21.s[0] +mla v6.4S, v18.4S, v31.s[0] +sub v18.4s, v17.4s, v6.4s +add v17.4s, v17.4s, v6.4s +sqrdmulh v6.4S, v17.4S, v7.s[1] +mul v17.4S, v17.4S,v21.s[1] +mla v17.4S, v6.4S, v31.s[0] +sub v6.4s, v1.4s, v17.4s +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v18.4S, v7.s[2] +mul v18.4S, v18.4S,v21.s[2] +mla v18.4S, v17.4S, v31.s[0] +sub v17.4s, v4.4s, v18.4s +add v4.4s, v4.4s, v18.4s +str q1, [x0, #960] +str q6, [x0, #976] +str q4, [x0, #992] +str q17, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_24_z4_16.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_24_z4_16.s new file mode 100644 index 0000000..70b872e --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_24_z4_16.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights 
+/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 
22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 
21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // 
Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global ntt_u32_incomplete_neon_asm_var_4_2_24_z4_16 +.global _ntt_u32_incomplete_neon_asm_var_4_2_24_z4_16 +ntt_u32_incomplete_neon_asm_var_4_2_24_z4_16: +_ntt_u32_incomplete_neon_asm_var_4_2_24_z4_16: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +ldr q2, [x0, #544] +ldr q1, [x0, #608] +ldr q0, [x0, #672] +ldr q15, [x0, #736] +ldr q14, 
[x0, #32] +ldr q13, [x0, #96] +ldr q12, [x0, #160] +ldr q11, [x0, #224] +sqrdmulh v10.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +mla v22.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +mla v21.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v20.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +mla v19.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +mla v2.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +mla v1.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v0.4S, v29.s[0] +mul v0.4S, v0.4S,v30.s[0] +mla v0.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +mla v15.4S, v10.4S, v31.s[0] +sub v10.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +sub v22.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sub v21.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +sub v20.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sub v19.4s, v14.4s, v2.4s +add v14.4s, v14.4s, v2.4s +sub v2.4s, v13.4s, v1.4s +add v13.4s, v13.4s, v1.4s +sub v1.4s, v12.4s, v0.4s +add v12.4s, v12.4s, v0.4s +sub v0.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +mla v16.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v3.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +mla v18.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +mla v17.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +mla v21.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +mla v20.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +mla v10.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +mla v22.4S, v15.4S, v31.s[0] 
+sub v15.4s, v12.4s, v16.4s +add v12.4s, v12.4s, v16.4s +sub v16.4s, v11.4s, v3.4s +add v11.4s, v11.4s, v3.4s +sub v3.4s, v14.4s, v18.4s +add v14.4s, v14.4s, v18.4s +sub v18.4s, v13.4s, v17.4s +add v13.4s, v13.4s, v17.4s +sub v17.4s, v1.4s, v21.4s +add v1.4s, v1.4s, v21.4s +sub v21.4s, v0.4s, v20.4s +add v0.4s, v0.4s, v20.4s +sub v20.4s, v19.4s, v10.4s +add v19.4s, v19.4s, v10.4s +sub v10.4s, v2.4s, v22.4s +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +mla v12.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v11.4S, v27.s[0] +mul v11.4S, v11.4S,v28.s[0] +mla v11.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +mla v15.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +mla v16.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +mla v1.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v0.4S, v27.s[2] +mul v0.4S, v0.4S,v28.s[2] +mla v0.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +mla v17.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +sub v12.4s, v13.4s, v11.4s +add v13.4s, v13.4s, v11.4s +sub v11.4s, v3.4s, v15.4s +add v3.4s, v3.4s, v15.4s +sub v15.4s, v18.4s, v16.4s +add v18.4s, v18.4s, v16.4s +sub v16.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sub v1.4s, v2.4s, v0.4s +add v2.4s, v2.4s, v0.4s +sub v0.4s, v20.4s, v17.4s +add v20.4s, v20.4s, v17.4s +sub v17.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v13.4S, v25.s[0] +mul v13.4S, v13.4S,v26.s[0] +mla v13.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +mla v12.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v18.4S, v25.s[2] +mul v18.4S, v18.4S,v26.s[2] +mla v18.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v15.4S, v25.s[3] +mul v15.4S, v15.4S,v26.s[3] +mla v15.4S, v21.4S, v31.s[0] 
+sqrdmulh v21.4S, v2.4S, v23.s[0] +mul v2.4S, v2.4S,v24.s[0] +mla v2.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v1.4S, v23.s[1] +mul v1.4S, v1.4S,v24.s[1] +mla v1.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v10.4S, v23.s[2] +mul v10.4S, v10.4S,v24.s[2] +mla v10.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +mla v17.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +sub v13.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sub v12.4s, v3.4s, v18.4s +add v3.4s, v3.4s, v18.4s +sub v18.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sub v15.4s, v19.4s, v2.4s +add v19.4s, v19.4s, v2.4s +sub v2.4s, v16.4s, v1.4s +add v16.4s, v16.4s, v1.4s +sub v1.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +sub v10.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +str q14, [x0, #32] +str q21, [x0, #96] +str q22, [x0, #160] +str q13, [x0, #224] +str q3, [x0, #288] +str q12, [x0, #352] +str q11, [x0, #416] +str q18, [x0, #480] +str q19, [x0, #544] +str q15, [x0, #608] +str q16, [x0, #672] +str q2, [x0, #736] +str q20, [x0, #800] +str q1, [x0, #864] +str q0, [x0, #928] +str q10, [x0, #992] +ldr q10, [x0, #816] +ldr q0, [x0, #880] +ldr q1, [x0, #944] +ldr q20, [x0, #1008] +ldr q2, [x0, #304] +ldr q16, [x0, #368] +ldr q15, [x0, #432] +ldr q19, [x0, #496] +ldr q18, [x0, #560] +ldr q11, [x0, #624] +ldr q12, [x0, #688] +ldr q3, [x0, #752] +ldr q13, [x0, #48] +ldr q22, [x0, #112] +ldr q21, [x0, #176] +ldr q14, [x0, #240] +sqrdmulh v17.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +mla v10.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v0.4S, v29.s[0] +mul v0.4S, v0.4S,v30.s[0] +mla v0.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +mla v1.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v20.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +mla v18.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +mla 
v11.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +mla v12.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +mla v3.4S, v17.4S, v31.s[0] +sub v17.4s, v2.4s, v10.4s +add v2.4s, v2.4s, v10.4s +sub v10.4s, v16.4s, v0.4s +add v16.4s, v16.4s, v0.4s +sub v0.4s, v15.4s, v1.4s +add v15.4s, v15.4s, v1.4s +sub v1.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +sub v20.4s, v13.4s, v18.4s +add v13.4s, v13.4s, v18.4s +sub v18.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +sub v11.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +sub v12.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sqrdmulh v3.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +mla v15.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +mla v19.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v2.4S, v29.s[1] +mul v2.4S, v2.4S,v30.s[1] +mla v2.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +mla v16.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +mla v0.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +mla v1.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +mla v17.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +mla v10.4S, v3.4S, v31.s[0] +sub v3.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +sub v15.4s, v14.4s, v19.4s +add v14.4s, v14.4s, v19.4s +sub v19.4s, v13.4s, v2.4s +add v13.4s, v13.4s, v2.4s +sub v2.4s, v22.4s, v16.4s +add v22.4s, v22.4s, v16.4s +sub v16.4s, v11.4s, v0.4s +add v11.4s, v11.4s, v0.4s +sub v0.4s, v12.4s, v1.4s +add v12.4s, v12.4s, v1.4s +sub v1.4s, v20.4s, v17.4s +add v20.4s, v20.4s, v17.4s +sub v17.4s, v18.4s, v10.4s +add v18.4s, v18.4s, v10.4s +sqrdmulh v10.4S, v21.4S, v27.s[0] +mul v21.4S, v21.4S,v28.s[0] +mla v21.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +mla v14.4S, v10.4S, v31.s[0] 
+sqrdmulh v10.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +mla v3.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +mla v15.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +mla v11.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +mla v12.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v16.4S, v27.s[3] +mul v16.4S, v16.4S,v28.s[3] +mla v16.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v0.4S, v27.s[3] +mul v0.4S, v0.4S,v28.s[3] +mla v0.4S, v10.4S, v31.s[0] +sub v10.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sub v21.4s, v22.4s, v14.4s +add v22.4s, v22.4s, v14.4s +sub v14.4s, v19.4s, v3.4s +add v19.4s, v19.4s, v3.4s +sub v3.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +sub v15.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +sub v11.4s, v18.4s, v12.4s +add v18.4s, v18.4s, v12.4s +sub v12.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +sub v16.4s, v17.4s, v0.4s +add v17.4s, v17.4s, v0.4s +sqrdmulh v0.4S, v22.4S, v25.s[0] +mul v22.4S, v22.4S,v26.s[0] +mla v22.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v21.4S, v25.s[1] +mul v21.4S, v21.4S,v26.s[1] +mla v21.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v2.4S, v25.s[2] +mul v2.4S, v2.4S,v26.s[2] +mla v2.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v3.4S, v25.s[3] +mul v3.4S, v3.4S,v26.s[3] +mla v3.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v18.4S, v23.s[0] +mul v18.4S, v18.4S,v24.s[0] +mla v18.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v11.4S, v23.s[1] +mul v11.4S, v11.4S,v24.s[1] +mla v11.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v17.4S, v23.s[2] +mul v17.4S, v17.4S,v24.s[2] +mla v17.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v16.4S, v23.s[3] +mul v16.4S, v16.4S,v24.s[3] +mla v16.4S, v0.4S, v31.s[0] +sub v0.4s, v13.4s, v22.4s +add v13.4s, v13.4s, v22.4s +sub v22.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sub v21.4s, v19.4s, v2.4s +add v19.4s, v19.4s, v2.4s +sub v2.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sub v3.4s, v20.4s, v18.4s +add v20.4s, v20.4s, v18.4s +sub 
v18.4s, v15.4s, v11.4s +add v15.4s, v15.4s, v11.4s +sub v11.4s, v1.4s, v17.4s +add v1.4s, v1.4s, v17.4s +sub v17.4s, v12.4s, v16.4s +add v12.4s, v12.4s, v16.4s +str q13, [x0, #48] +str q0, [x0, #112] +str q10, [x0, #176] +str q22, [x0, #240] +str q19, [x0, #304] +str q21, [x0, #368] +str q14, [x0, #432] +str q2, [x0, #496] +str q20, [x0, #560] +str q3, [x0, #624] +str q15, [x0, #688] +str q18, [x0, #752] +str q1, [x0, #816] +str q11, [x0, #880] +str q12, [x0, #944] +str q17, [x0, #1008] +ldr q17, [x0, #768] +ldr q12, [x0, #832] +ldr q11, [x0, #896] +ldr q1, [x0, #960] +ldr q18, [x0, #256] +ldr q15, [x0, #320] +ldr q3, [x0, #384] +ldr q20, [x0, #448] +ldr q2, [x0, #512] +ldr q14, [x0, #576] +ldr q21, [x0, #640] +ldr q19, [x0, #704] +ldr q22, [x0, #0] +ldr q10, [x0, #64] +ldr q0, [x0, #128] +ldr q13, [x0, #192] +sqrdmulh v16.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] +mla v17.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +mla v12.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +mla v11.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +mla v1.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +mla v2.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +mla v14.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +mla v21.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +mla v19.4S, v16.4S, v31.s[0] +sub v16.4s, v18.4s, v17.4s +add v18.4s, v18.4s, v17.4s +sub v17.4s, v15.4s, v12.4s +add v15.4s, v15.4s, v12.4s +sub v12.4s, v3.4s, v11.4s +add v3.4s, v3.4s, v11.4s +sub v11.4s, v20.4s, v1.4s +add v20.4s, v20.4s, v1.4s +sub v1.4s, v22.4s, v2.4s +add v22.4s, v22.4s, v2.4s +sub v2.4s, v10.4s, v14.4s +add v10.4s, v10.4s, v14.4s +sub v14.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +sub v21.4s, v13.4s, v19.4s +add v13.4s, v13.4s, 
v19.4s +sqrdmulh v19.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v3.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +mla v20.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +mla v18.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +mla v15.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +mla v12.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +mla v11.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +mla v16.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +mla v17.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v3.4s +add v0.4s, v0.4s, v3.4s +sub v3.4s, v13.4s, v20.4s +add v13.4s, v13.4s, v20.4s +sub v20.4s, v22.4s, v18.4s +add v22.4s, v22.4s, v18.4s +sub v18.4s, v10.4s, v15.4s +add v10.4s, v10.4s, v15.4s +sub v15.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +sub v12.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sub v11.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +sub v16.4s, v2.4s, v17.4s +add v2.4s, v2.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[0] +mul v0.4S, v0.4S,v28.s[0] +mla v0.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +mla v13.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +mla v19.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +mla v3.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v14.4S, v27.s[2] +mul v14.4S, v14.4S,v28.s[2] +mla v14.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +mla v21.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +mla v15.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v12.4S, v27.s[3] +mul v12.4S, v12.4S,v28.s[3] +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v0.4s +add v22.4s, v22.4s, v0.4s +sub 
v0.4s, v10.4s, v13.4s +add v10.4s, v10.4s, v13.4s +sub v13.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +sub v19.4s, v18.4s, v3.4s +add v18.4s, v18.4s, v3.4s +sub v3.4s, v1.4s, v14.4s +add v1.4s, v1.4s, v14.4s +sub v14.4s, v2.4s, v21.4s +add v2.4s, v2.4s, v21.4s +sub v21.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sub v15.4s, v16.4s, v12.4s +add v16.4s, v16.4s, v12.4s +sqrdmulh v12.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +mla v10.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v0.4S, v25.s[1] +mul v0.4S, v0.4S,v26.s[1] +mla v0.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v18.4S, v25.s[2] +mul v18.4S, v18.4S,v26.s[2] +mla v18.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v19.4S, v25.s[3] +mul v19.4S, v19.4S,v26.s[3] +mla v19.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v2.4S, v23.s[0] +mul v2.4S, v2.4S,v24.s[0] +mla v2.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v14.4S, v23.s[1] +mul v14.4S, v14.4S,v24.s[1] +mla v14.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +mla v16.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v15.4S, v23.s[3] +mul v15.4S, v15.4S,v24.s[3] +mla v15.4S, v12.4S, v31.s[0] +sub v12.4s, v22.4s, v10.4s +add v22.4s, v22.4s, v10.4s +sub v10.4s, v17.4s, v0.4s +add v17.4s, v17.4s, v0.4s +sub v0.4s, v20.4s, v18.4s +add v20.4s, v20.4s, v18.4s +sub v18.4s, v13.4s, v19.4s +add v13.4s, v13.4s, v19.4s +sub v19.4s, v1.4s, v2.4s +add v1.4s, v1.4s, v2.4s +sub v2.4s, v3.4s, v14.4s +add v3.4s, v3.4s, v14.4s +sub v14.4s, v11.4s, v16.4s +add v11.4s, v11.4s, v16.4s +sub v16.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +str q22, [x0, #0] +str q12, [x0, #64] +str q17, [x0, #128] +str q10, [x0, #192] +str q20, [x0, #256] +str q0, [x0, #320] +str q13, [x0, #384] +str q18, [x0, #448] +str q1, [x0, #512] +str q19, [x0, #576] +str q3, [x0, #640] +str q2, [x0, #704] +str q11, [x0, #768] +str q14, [x0, #832] +str q21, [x0, #896] +str q16, [x0, #960] +ldr q16, [x0, #784] +ldr q21, [x0, #848] +ldr q14, [x0, #912] +ldr q11, [x0, #976] +ldr q2, [x0, #272] +ldr q3, 
[x0, #336] +ldr q19, [x0, #400] +ldr q1, [x0, #464] +ldr q18, [x0, #528] +ldr q13, [x0, #592] +ldr q0, [x0, #656] +ldr q20, [x0, #720] +ldr q10, [x0, #16] +ldr q17, [x0, #80] +ldr q12, [x0, #144] +ldr q22, [x0, #208] +sqrdmulh v15.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v16.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +mla v21.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +mla v14.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +mla v11.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +mla v18.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +mla v13.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v0.4S, v29.s[0] +mul v0.4S, v0.4S,v30.s[0] +mla v0.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v20.4S, v15.4S, v31.s[0] +sub v15.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sub v16.4s, v3.4s, v21.4s +add v3.4s, v3.4s, v21.4s +sub v21.4s, v19.4s, v14.4s +add v19.4s, v19.4s, v14.4s +sub v14.4s, v1.4s, v11.4s +add v1.4s, v1.4s, v11.4s +sub v11.4s, v10.4s, v18.4s +add v10.4s, v10.4s, v18.4s +sub v18.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +sub v13.4s, v12.4s, v0.4s +add v12.4s, v12.4s, v0.4s +sub v0.4s, v22.4s, v20.4s +add v22.4s, v22.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +mla v19.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v1.4S, v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +mla v1.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v2.4S, v29.s[1] +mul v2.4S, v2.4S,v30.s[1] +mla v2.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +mla v3.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +mla v21.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +mla v14.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v15.4S, v29.s[2] +mul 
v15.4S, v15.4S,v30.s[2] +mla v15.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +mla v16.4S, v20.4S, v31.s[0] +sub v20.4s, v12.4s, v19.4s +add v12.4s, v12.4s, v19.4s +sub v19.4s, v22.4s, v1.4s +add v22.4s, v22.4s, v1.4s +sub v1.4s, v10.4s, v2.4s +add v10.4s, v10.4s, v2.4s +sub v2.4s, v17.4s, v3.4s +add v17.4s, v17.4s, v3.4s +sub v3.4s, v13.4s, v21.4s +add v13.4s, v13.4s, v21.4s +sub v21.4s, v0.4s, v14.4s +add v0.4s, v0.4s, v14.4s +sub v14.4s, v11.4s, v15.4s +add v11.4s, v11.4s, v15.4s +sub v15.4s, v18.4s, v16.4s +add v18.4s, v18.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +mla v12.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +mla v22.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +mla v20.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +mla v19.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +mla v13.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v0.4S, v27.s[2] +mul v0.4S, v0.4S,v28.s[2] +mla v0.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +mla v3.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +mla v21.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +sub v12.4s, v17.4s, v22.4s +add v17.4s, v17.4s, v22.4s +sub v22.4s, v1.4s, v20.4s +add v1.4s, v1.4s, v20.4s +sub v20.4s, v2.4s, v19.4s +add v2.4s, v2.4s, v19.4s +sub v19.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +sub v13.4s, v18.4s, v0.4s +add v18.4s, v18.4s, v0.4s +sub v0.4s, v14.4s, v3.4s +add v14.4s, v14.4s, v3.4s +sub v3.4s, v15.4s, v21.4s +add v15.4s, v15.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v25.s[0] +mul v17.4S, v17.4S,v26.s[0] +mla v17.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +mla v12.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v2.4S, v25.s[2] +mul v2.4S, 
v2.4S,v26.s[2] +mla v2.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v20.4S, v25.s[3] +mul v20.4S, v20.4S,v26.s[3] +mla v20.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v18.4S, v23.s[0] +mul v18.4S, v18.4S,v24.s[0] +mla v18.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v13.4S, v23.s[1] +mul v13.4S, v13.4S,v24.s[1] +mla v13.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v15.4S, v23.s[2] +mul v15.4S, v15.4S,v24.s[2] +mla v15.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v3.4S, v23.s[3] +mul v3.4S, v3.4S,v24.s[3] +mla v3.4S, v21.4S, v31.s[0] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sub v17.4s, v16.4s, v12.4s +add v16.4s, v16.4s, v12.4s +sub v12.4s, v1.4s, v2.4s +add v1.4s, v1.4s, v2.4s +sub v2.4s, v22.4s, v20.4s +add v22.4s, v22.4s, v20.4s +sub v20.4s, v11.4s, v18.4s +add v11.4s, v11.4s, v18.4s +sub v18.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +sub v13.4s, v14.4s, v15.4s +add v14.4s, v14.4s, v15.4s +sub v15.4s, v0.4s, v3.4s +add v0.4s, v0.4s, v3.4s +str q10, [x0, #16] +str q21, [x0, #80] +str q16, [x0, #144] +str q17, [x0, #208] +str q1, [x0, #272] +str q12, [x0, #336] +str q22, [x0, #400] +str q2, [x0, #464] +str q11, [x0, #528] +str q20, [x0, #592] +str q19, [x0, #656] +str q18, [x0, #720] +str q14, [x0, #784] +str q13, [x0, #848] +str q0, [x0, #912] +str q15, [x0, #976] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q6, [x17, #+160] +ldr q7, [x17, #+176] +ldr q8, [x17, #+192] +ldr q9, [x17, #+208] +ldr q3, [x17, #+224] +ldr q10, [x17, #+240] +ldr q21, [x0, #32] +ldr q16, [x0, #48] +ldr q17, [x0, #0] +ldr q1, [x0, #16] +ldr q12, [x17, #+256] +ldr q22, [x17, #+272] +sqrdmulh v2.4S, v21.4S, v5.s[0] +mul v21.4S, v21.4S,v4.s[0] +mla v21.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v16.4S, v5.s[0] +mul v16.4S, v16.4S,v4.s[0] +mla v16.4S, v2.4S, v31.s[0] +sub v2.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sub v21.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +sqrdmulh v16.4S, v1.4S, v5.s[1] +mul v1.4S, v1.4S,v4.s[1] +mla v1.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v21.4S, v5.s[2] +mul 
v21.4S, v21.4S,v4.s[2] +mla v21.4S, v16.4S, v31.s[0] +sub v16.4s, v17.4s, v1.4s +add v17.4s, v17.4s, v1.4s +sub v1.4s, v2.4s, v21.4s +add v2.4s, v2.4s, v21.4s +str q17, [x0, #0] +str q16, [x0, #16] +str q2, [x0, #32] +str q1, [x0, #48] +ldr q1, [x0, #96] +ldr q2, [x0, #112] +ldr q16, [x0, #64] +ldr q17, [x0, #80] +ldr q21, [x17, #+288] +ldr q11, [x17, #+304] +sqrdmulh v20.4S, v1.4S, v7.s[0] +mul v1.4S, v1.4S,v6.s[0] +mla v1.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v2.4S, v7.s[0] +mul v2.4S, v2.4S,v6.s[0] +mla v2.4S, v20.4S, v31.s[0] +sub v20.4s, v16.4s, v1.4s +add v16.4s, v16.4s, v1.4s +sub v1.4s, v17.4s, v2.4s +add v17.4s, v17.4s, v2.4s +sqrdmulh v2.4S, v17.4S, v7.s[1] +mul v17.4S, v17.4S,v6.s[1] +mla v17.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v1.4S, v7.s[2] +mul v1.4S, v1.4S,v6.s[2] +mla v1.4S, v2.4S, v31.s[0] +sub v2.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sub v17.4s, v20.4s, v1.4s +add v20.4s, v20.4s, v1.4s +str q16, [x0, #64] +str q2, [x0, #80] +str q20, [x0, #96] +str q17, [x0, #112] +ldr q17, [x0, #160] +ldr q20, [x0, #176] +ldr q2, [x0, #128] +ldr q16, [x0, #144] +ldr q1, [x17, #+320] +ldr q19, [x17, #+336] +sqrdmulh v18.4S, v17.4S, v9.s[0] +mul v17.4S, v17.4S,v8.s[0] +mla v17.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v20.4S, v9.s[0] +mul v20.4S, v20.4S,v8.s[0] +mla v20.4S, v18.4S, v31.s[0] +sub v18.4s, v2.4s, v17.4s +add v2.4s, v2.4s, v17.4s +sub v17.4s, v16.4s, v20.4s +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v9.s[1] +mul v16.4S, v16.4S,v8.s[1] +mla v16.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v17.4S, v9.s[2] +mul v17.4S, v17.4S,v8.s[2] +mla v17.4S, v20.4S, v31.s[0] +sub v20.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sub v16.4s, v18.4s, v17.4s +add v18.4s, v18.4s, v17.4s +str q2, [x0, #128] +str q20, [x0, #144] +str q18, [x0, #160] +str q16, [x0, #176] +ldr q16, [x0, #224] +ldr q18, [x0, #240] +ldr q20, [x0, #192] +ldr q2, [x0, #208] +ldr q17, [x17, #+352] +ldr q14, [x17, #+368] +sqrdmulh v13.4S, v16.4S, v10.s[0] +mul v16.4S, v16.4S,v3.s[0] 
+mla v16.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v18.4S, v10.s[0] +mul v18.4S, v18.4S,v3.s[0] +mla v18.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v16.4s +add v20.4s, v20.4s, v16.4s +sub v16.4s, v2.4s, v18.4s +add v2.4s, v2.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v10.s[1] +mul v2.4S, v2.4S,v3.s[1] +mla v2.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v16.4S, v10.s[2] +mul v16.4S, v16.4S,v3.s[2] +mla v16.4S, v18.4S, v31.s[0] +sub v18.4s, v20.4s, v2.4s +add v20.4s, v20.4s, v2.4s +sub v2.4s, v13.4s, v16.4s +add v13.4s, v13.4s, v16.4s +str q20, [x0, #192] +str q18, [x0, #208] +str q13, [x0, #224] +str q2, [x0, #240] +ldr q2, [x0, #288] +ldr q13, [x0, #304] +ldr q18, [x0, #256] +ldr q20, [x0, #272] +ldr q16, [x17, #+384] +ldr q0, [x17, #+400] +sqrdmulh v15.4S, v2.4S, v22.s[0] +mul v2.4S, v2.4S,v12.s[0] +mla v2.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v13.4S, v22.s[0] +mul v13.4S, v13.4S,v12.s[0] +mla v13.4S, v15.4S, v31.s[0] +sub v15.4s, v18.4s, v2.4s +add v18.4s, v18.4s, v2.4s +sub v2.4s, v20.4s, v13.4s +add v20.4s, v20.4s, v13.4s +sqrdmulh v13.4S, v20.4S, v22.s[1] +mul v20.4S, v20.4S,v12.s[1] +mla v20.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v2.4S, v22.s[2] +mul v2.4S, v2.4S,v12.s[2] +mla v2.4S, v13.4S, v31.s[0] +sub v13.4s, v18.4s, v20.4s +add v18.4s, v18.4s, v20.4s +sub v20.4s, v15.4s, v2.4s +add v15.4s, v15.4s, v2.4s +str q18, [x0, #256] +str q13, [x0, #272] +str q15, [x0, #288] +str q20, [x0, #304] +ldr q5, [x0, #352] +ldr q4, [x0, #368] +ldr q20, [x0, #320] +ldr q15, [x0, #336] +ldr q13, [x17, #+416] +ldr q18, [x17, #+432] +sqrdmulh v2.4S, v5.4S, v11.s[0] +mul v5.4S, v5.4S,v21.s[0] +mla v5.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v4.4S, v11.s[0] +mul v4.4S, v4.4S,v21.s[0] +mla v4.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v5.4s +add v20.4s, v20.4s, v5.4s +sub v5.4s, v15.4s, v4.4s +add v15.4s, v15.4s, v4.4s +sqrdmulh v4.4S, v15.4S, v11.s[1] +mul v15.4S, v15.4S,v21.s[1] +mla v15.4S, v4.4S, v31.s[0] +sqrdmulh v4.4S, v5.4S, v11.s[2] +mul v5.4S, v5.4S,v21.s[2] +mla v5.4S, v4.4S, v31.s[0] +sub 
v4.4s, v20.4s, v15.4s +add v20.4s, v20.4s, v15.4s +sub v15.4s, v2.4s, v5.4s +add v2.4s, v2.4s, v5.4s +str q20, [x0, #320] +str q4, [x0, #336] +str q2, [x0, #352] +str q15, [x0, #368] +ldr q7, [x0, #416] +ldr q6, [x0, #432] +ldr q15, [x0, #384] +ldr q2, [x0, #400] +ldr q4, [x17, #+448] +ldr q20, [x17, #+464] +sqrdmulh v5.4S, v7.4S, v19.s[0] +mul v7.4S, v7.4S,v1.s[0] +mla v7.4S, v5.4S, v31.s[0] +sqrdmulh v5.4S, v6.4S, v19.s[0] +mul v6.4S, v6.4S,v1.s[0] +mla v6.4S, v5.4S, v31.s[0] +sub v5.4s, v15.4s, v7.4s +add v15.4s, v15.4s, v7.4s +sub v7.4s, v2.4s, v6.4s +add v2.4s, v2.4s, v6.4s +sqrdmulh v6.4S, v2.4S, v19.s[1] +mul v2.4S, v2.4S,v1.s[1] +mla v2.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v7.4S, v19.s[2] +mul v7.4S, v7.4S,v1.s[2] +mla v7.4S, v6.4S, v31.s[0] +sub v6.4s, v15.4s, v2.4s +add v15.4s, v15.4s, v2.4s +sub v2.4s, v5.4s, v7.4s +add v5.4s, v5.4s, v7.4s +str q15, [x0, #384] +str q6, [x0, #400] +str q5, [x0, #416] +str q2, [x0, #432] +ldr q9, [x0, #480] +ldr q8, [x0, #496] +ldr q2, [x0, #448] +ldr q5, [x0, #464] +ldr q6, [x17, #+480] +ldr q15, [x17, #+496] +sqrdmulh v7.4S, v9.4S, v14.s[0] +mul v9.4S, v9.4S,v17.s[0] +mla v9.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v8.4S, v14.s[0] +mul v8.4S, v8.4S,v17.s[0] +mla v8.4S, v7.4S, v31.s[0] +sub v7.4s, v2.4s, v9.4s +add v2.4s, v2.4s, v9.4s +sub v9.4s, v5.4s, v8.4s +add v5.4s, v5.4s, v8.4s +sqrdmulh v8.4S, v5.4S, v14.s[1] +mul v5.4S, v5.4S,v17.s[1] +mla v5.4S, v8.4S, v31.s[0] +sqrdmulh v8.4S, v9.4S, v14.s[2] +mul v9.4S, v9.4S,v17.s[2] +mla v9.4S, v8.4S, v31.s[0] +sub v8.4s, v2.4s, v5.4s +add v2.4s, v2.4s, v5.4s +sub v5.4s, v7.4s, v9.4s +add v7.4s, v7.4s, v9.4s +str q2, [x0, #448] +str q8, [x0, #464] +str q7, [x0, #480] +str q5, [x0, #496] +ldr q10, [x0, #544] +ldr q3, [x0, #560] +ldr q5, [x0, #512] +ldr q7, [x0, #528] +ldr q8, [x17, #+512] +ldr q2, [x17, #+528] +sqrdmulh v9.4S, v10.4S, v0.s[0] +mul v10.4S, v10.4S,v16.s[0] +mla v10.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v3.4S, v0.s[0] +mul v3.4S, v3.4S,v16.s[0] +mla v3.4S, v9.4S, 
v31.s[0] +sub v9.4s, v5.4s, v10.4s +add v5.4s, v5.4s, v10.4s +sub v10.4s, v7.4s, v3.4s +add v7.4s, v7.4s, v3.4s +sqrdmulh v3.4S, v7.4S, v0.s[1] +mul v7.4S, v7.4S,v16.s[1] +mla v7.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v10.4S, v0.s[2] +mul v10.4S, v10.4S,v16.s[2] +mla v10.4S, v3.4S, v31.s[0] +sub v3.4s, v5.4s, v7.4s +add v5.4s, v5.4s, v7.4s +sub v7.4s, v9.4s, v10.4s +add v9.4s, v9.4s, v10.4s +str q5, [x0, #512] +str q3, [x0, #528] +str q9, [x0, #544] +str q7, [x0, #560] +ldr q22, [x0, #608] +ldr q12, [x0, #624] +ldr q7, [x0, #576] +ldr q9, [x0, #592] +ldr q3, [x17, #+544] +ldr q5, [x17, #+560] +sqrdmulh v10.4S, v22.4S, v18.s[0] +mul v22.4S, v22.4S,v13.s[0] +mla v22.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v12.4S, v18.s[0] +mul v12.4S, v12.4S,v13.s[0] +mla v12.4S, v10.4S, v31.s[0] +sub v10.4s, v7.4s, v22.4s +add v7.4s, v7.4s, v22.4s +sub v22.4s, v9.4s, v12.4s +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v9.4S, v18.s[1] +mul v9.4S, v9.4S,v13.s[1] +mla v9.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v22.4S, v18.s[2] +mul v22.4S, v22.4S,v13.s[2] +mla v22.4S, v12.4S, v31.s[0] +sub v12.4s, v7.4s, v9.4s +add v7.4s, v7.4s, v9.4s +sub v9.4s, v10.4s, v22.4s +add v10.4s, v10.4s, v22.4s +str q7, [x0, #576] +str q12, [x0, #592] +str q10, [x0, #608] +str q9, [x0, #624] +ldr q11, [x0, #672] +ldr q21, [x0, #688] +ldr q9, [x0, #640] +ldr q10, [x0, #656] +ldr q12, [x17, #+576] +ldr q7, [x17, #+592] +sqrdmulh v22.4S, v11.4S, v20.s[0] +mul v11.4S, v11.4S,v4.s[0] +mla v11.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v21.4S, v20.s[0] +mul v21.4S, v21.4S,v4.s[0] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v9.4s, v11.4s +add v9.4s, v9.4s, v11.4s +sub v11.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v10.4S, v20.s[1] +mul v10.4S, v10.4S,v4.s[1] +mla v10.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v11.4S, v20.s[2] +mul v11.4S, v11.4S,v4.s[2] +mla v11.4S, v21.4S, v31.s[0] +sub v21.4s, v9.4s, v10.4s +add v9.4s, v9.4s, v10.4s +sub v10.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +str q9, [x0, 
#640] +str q21, [x0, #656] +str q22, [x0, #672] +str q10, [x0, #688] +ldr q19, [x0, #736] +ldr q1, [x0, #752] +ldr q10, [x0, #704] +ldr q22, [x0, #720] +ldr q21, [x17, #+608] +ldr q9, [x17, #+624] +sqrdmulh v11.4S, v19.4S, v15.s[0] +mul v19.4S, v19.4S,v6.s[0] +mla v19.4S, v11.4S, v31.s[0] +sqrdmulh v11.4S, v1.4S, v15.s[0] +mul v1.4S, v1.4S,v6.s[0] +mla v1.4S, v11.4S, v31.s[0] +sub v11.4s, v10.4s, v19.4s +add v10.4s, v10.4s, v19.4s +sub v19.4s, v22.4s, v1.4s +add v22.4s, v22.4s, v1.4s +sqrdmulh v1.4S, v22.4S, v15.s[1] +mul v22.4S, v22.4S,v6.s[1] +mla v22.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v19.4S, v15.s[2] +mul v19.4S, v19.4S,v6.s[2] +mla v19.4S, v1.4S, v31.s[0] +sub v1.4s, v10.4s, v22.4s +add v10.4s, v10.4s, v22.4s +sub v22.4s, v11.4s, v19.4s +add v11.4s, v11.4s, v19.4s +str q10, [x0, #704] +str q1, [x0, #720] +str q11, [x0, #736] +str q22, [x0, #752] +ldr q14, [x0, #800] +ldr q17, [x0, #816] +ldr q22, [x0, #768] +ldr q11, [x0, #784] +sqrdmulh v1.4S, v14.4S, v2.s[0] +mul v14.4S, v14.4S,v8.s[0] +mla v14.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v17.4S, v2.s[0] +mul v17.4S, v17.4S,v8.s[0] +mla v17.4S, v1.4S, v31.s[0] +sub v1.4s, v22.4s, v14.4s +add v22.4s, v22.4s, v14.4s +sub v14.4s, v11.4s, v17.4s +add v11.4s, v11.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v2.s[1] +mul v11.4S, v11.4S,v8.s[1] +mla v11.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v14.4S, v2.s[2] +mul v14.4S, v14.4S,v8.s[2] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +sub v11.4s, v1.4s, v14.4s +add v1.4s, v1.4s, v14.4s +str q22, [x0, #768] +str q17, [x0, #784] +str q1, [x0, #800] +str q11, [x0, #816] +ldr q0, [x0, #864] +ldr q16, [x0, #880] +ldr q11, [x0, #832] +ldr q1, [x0, #848] +sqrdmulh v17.4S, v0.4S, v5.s[0] +mul v0.4S, v0.4S,v3.s[0] +mla v0.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v16.4S, v5.s[0] +mul v16.4S, v16.4S,v3.s[0] +mla v16.4S, v17.4S, v31.s[0] +sub v17.4s, v11.4s, v0.4s +add v11.4s, v11.4s, v0.4s +sub v0.4s, v1.4s, v16.4s +add v1.4s, v1.4s, v16.4s +sqrdmulh 
v16.4S, v1.4S, v5.s[1] +mul v1.4S, v1.4S,v3.s[1] +mla v1.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v0.4S, v5.s[2] +mul v0.4S, v0.4S,v3.s[2] +mla v0.4S, v16.4S, v31.s[0] +sub v16.4s, v11.4s, v1.4s +add v11.4s, v11.4s, v1.4s +sub v1.4s, v17.4s, v0.4s +add v17.4s, v17.4s, v0.4s +str q11, [x0, #832] +str q16, [x0, #848] +str q17, [x0, #864] +str q1, [x0, #880] +ldr q18, [x0, #928] +ldr q13, [x0, #944] +ldr q1, [x0, #896] +ldr q17, [x0, #912] +sqrdmulh v16.4S, v18.4S, v7.s[0] +mul v18.4S, v18.4S,v12.s[0] +mla v18.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v13.4S, v7.s[0] +mul v13.4S, v13.4S,v12.s[0] +mla v13.4S, v16.4S, v31.s[0] +sub v16.4s, v1.4s, v18.4s +add v1.4s, v1.4s, v18.4s +sub v18.4s, v17.4s, v13.4s +add v17.4s, v17.4s, v13.4s +sqrdmulh v13.4S, v17.4S, v7.s[1] +mul v17.4S, v17.4S,v12.s[1] +mla v17.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v18.4S, v7.s[2] +mul v18.4S, v18.4S,v12.s[2] +mla v18.4S, v13.4S, v31.s[0] +sub v13.4s, v1.4s, v17.4s +add v1.4s, v1.4s, v17.4s +sub v17.4s, v16.4s, v18.4s +add v16.4s, v16.4s, v18.4s +str q1, [x0, #896] +str q13, [x0, #912] +str q16, [x0, #928] +str q17, [x0, #944] +ldr q20, [x0, #992] +ldr q4, [x0, #1008] +ldr q17, [x0, #960] +ldr q16, [x0, #976] +sqrdmulh v13.4S, v20.4S, v9.s[0] +mul v20.4S, v20.4S,v21.s[0] +mla v20.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v4.4S, v9.s[0] +mul v4.4S, v4.4S,v21.s[0] +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v20.4s +add v17.4s, v17.4s, v20.4s +sub v20.4s, v16.4s, v4.4s +add v16.4s, v16.4s, v4.4s +sqrdmulh v4.4S, v16.4S, v9.s[1] +mul v16.4S, v16.4S,v21.s[1] +mla v16.4S, v4.4S, v31.s[0] +sqrdmulh v4.4S, v20.4S, v9.s[2] +mul v20.4S, v20.4S,v21.s[2] +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v17.4s, v16.4s +add v17.4s, v17.4s, v16.4s +sub v16.4s, v13.4s, v20.4s +add v13.4s, v13.4s, v20.4s +str q17, [x0, #960] +str q4, [x0, #976] +str q13, [x0, #992] +str q16, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, 
#16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_0.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_0.s new file mode 100644 index 0000000..b9ef60e --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_0.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include // NOTE(review): the header name is missing here (looks lost in extraction); the routine below uses ASM_LOAD, which presumably comes from that header. Confirm against the generator output. +modulus: // Four 32-bit words: the negated modulus -33556993 followed by three zero words. The routine below loads them into q31, and the visible code uses lane v31.s[0] as the multiplier of each mla. +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 // NOTE(review): on AArch64 GAS .align takes a power of two, so this should give 64-byte alignment for the table below; confirm for the target assembler. +roots_merged: // Root table laid out as 16-byte vectors in pairs: four words tagged by layer/block, then four companion words carrying the same tags. The routine below feeds the first vector of a pair to mul and the second to sqrdmulh (q30/q29, q28/q27, q26/q25, q24/q23 from byte offsets 0 to 112). The companions look like round(w * 2^31 / 33556993), checked on the first entries only; confirm against the generator. Zero words tagged Layer None appear to be filler for unused lanes. +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global 
ntt_u32_incomplete_neon_asm_var_4_2_3_z4_0 +.global _ntt_u32_incomplete_neon_asm_var_4_2_3_z4_0 +ntt_u32_incomplete_neon_asm_var_4_2_3_z4_0: +_ntt_u32_incomplete_neon_asm_var_4_2_3_z4_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +sqrdmulh v2.4S, v22.4S, v29.s[0] +ldr q1, [x0, #544] +mul v22.4S, v22.4S,v30.s[0] +ldr q0, [x0, #608] +sqrdmulh v15.4S, v21.4S, v29.s[0] +ldr q14, [x0, #672] +mul v21.4S, v21.4S,v30.s[0] +ldr q13, [x0, #736] +mla v22.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q12, [x0, #32] +sub v11.4s, v18.4s, v22.4s +mla v21.4S, v15.4S, v31.s[0] +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q15, [x0, #96] +sub v10.4s, v17.4s, v21.4s +mla v20.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v29.s[0] +ldr q2, [x0, #160] +mul v1.4S, v1.4S,v30.s[0] +sub v9.4s, v16.4s, v20.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v29.s[0] +ldr q22, [x0, #224] +mul v0.4S, v0.4S,v30.s[0] +sub v8.4s, v3.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v12.4s, v1.4s +mla v0.4S, v20.4S, 
v31.s[0] +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +sub v20.4s, v15.4s, v0.4s +mla v14.4S, v19.4S, v31.s[0] +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v19.4s, v2.4s, v14.4s +mla v13.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v1.4s, v22.4s, v13.4s +mla v16.4S, v0.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v0.4s, v2.4s, v16.4s +mla v3.4S, v14.4S, v31.s[0] +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v22.4s, v3.4s +mla v18.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v13.4s, v12.4s, v18.4s +mla v17.4S, v16.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v16.4s, v15.4s, v17.4s +mla v9.4S, v3.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v3.4s, v19.4s, v9.4s +mla v8.4S, v18.4S, v31.s[0] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v8.4s +mla v11.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v17.4s, v21.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +sub v9.4s, v20.4s, v10.4s +mla v2.4S, v8.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v8.4s, v12.4s, v2.4s +mla v22.4S, v11.4S, v31.s[0] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v27.s[1] +mul v14.4S, v14.4S,v28.s[1] +sub v11.4s, v15.4s, v22.4s +mla v0.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v28.s[2] +sub 
v10.4s, v13.4s, v0.4s +mla v14.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v2.4s, v16.4s, v14.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +sub v22.4s, v21.4s, v19.4s +mla v1.4S, v0.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v0.4s, v20.4s, v1.4s +mla v3.4S, v14.4S, v31.s[0] +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +sub v14.4s, v17.4s, v3.4s +mla v18.4S, v19.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v25.s[1] +mul v11.4S, v11.4S,v26.s[1] +sub v19.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v1.4s, v12.4s, v15.4s +mla v11.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v25.s[3] +mul v2.4S, v2.4S,v26.s[3] +sub v3.4s, v8.4s, v11.4s +mla v16.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v11.4s +str q12, [x0, #32] +sqrdmulh v12.4S, v20.4S, v23.s[0] +str q1, [x0, #96] +mul v20.4S, v20.4S,v24.s[0] +ldr q1, [x0, #816] +sub v11.4s, v13.4s, v16.4s +ldr q18, [x0, #880] +mla v2.4S, v15.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +str q8, [x0, #160] +sqrdmulh v8.4S, v0.4S, v23.s[1] +str q3, [x0, #224] +mul v0.4S, v0.4S,v24.s[1] +ldr q3, [x0, #944] +sub v16.4s, v10.4s, v2.4s +ldr q15, [x0, #1008] +mla v20.4S, v12.4S, v31.s[0] +add v10.4s, v10.4s, v2.4s +str q13, [x0, #288] +sqrdmulh v13.4S, v9.4S, v23.s[2] +str q11, [x0, #352] +mul v9.4S, v9.4S,v24.s[2] +ldr q11, [x0, #304] +sub v2.4s, v21.4s, v20.4s +ldr q12, [x0, #368] +mla v0.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v20.4s +str q10, [x0, #416] +sqrdmulh v10.4S, v19.4S, v23.s[3] +str q16, [x0, #480] +mul v19.4S, v19.4S,v24.s[3] +ldr q16, [x0, #432] +sub v20.4s, v22.4s, v0.4s +ldr q8, [x0, #496] +mla v9.4S, v13.4S, v31.s[0] +add v22.4s, 
v22.4s, v0.4s +str q21, [x0, #544] +sqrdmulh v21.4S, v1.4S, v29.s[0] +str q2, [x0, #608] +ldr q2, [x0, #560] +mul v1.4S, v1.4S,v30.s[0] +ldr q0, [x0, #624] +sub v13.4s, v17.4s, v9.4s +mla v19.4S, v10.4S, v31.s[0] +add v17.4s, v17.4s, v9.4s +str q22, [x0, #672] +sqrdmulh v22.4S, v18.4S, v29.s[0] +str q20, [x0, #736] +ldr q20, [x0, #688] +mul v18.4S, v18.4S,v30.s[0] +ldr q9, [x0, #752] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +str q17, [x0, #800] +sqrdmulh v17.4S, v3.4S, v29.s[0] +str q13, [x0, #864] +mul v3.4S, v3.4S,v30.s[0] +ldr q13, [x0, #48] +sub v19.4s, v11.4s, v1.4s +mla v18.4S, v22.4S, v31.s[0] +add v11.4s, v11.4s, v1.4s +str q14, [x0, #928] +sqrdmulh v14.4S, v15.4S, v29.s[0] +str q10, [x0, #992] +mul v15.4S, v15.4S,v30.s[0] +ldr q10, [x0, #112] +sub v1.4s, v12.4s, v18.4s +mla v3.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v29.s[0] +ldr q17, [x0, #176] +mul v2.4S, v2.4S,v30.s[0] +sub v22.4s, v16.4s, v3.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v29.s[0] +ldr q14, [x0, #240] +mul v0.4S, v0.4S,v30.s[0] +sub v21.4s, v8.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v18.4s, v13.4s, v2.4s +mla v0.4S, v3.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +sub v3.4s, v10.4s, v0.4s +mla v20.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v15.4s, v17.4s, v20.4s +mla v9.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v2.4s, v14.4s, v9.4s +mla v16.4S, v0.4S, v31.s[0] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v29.s[1] +mul v11.4S, v11.4S,v30.s[1] +sub v0.4s, v17.4s, v16.4s +mla v8.4S, v20.4S, v31.s[0] +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v29.s[1] +mul v12.4S, 
v12.4S,v30.s[1] +sub v20.4s, v14.4s, v8.4s +mla v11.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v9.4s, v13.4s, v11.4s +mla v12.4S, v16.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v16.4s, v10.4s, v12.4s +mla v22.4S, v8.4S, v31.s[0] +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v8.4s, v15.4s, v22.4s +mla v21.4S, v11.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +sub v11.4s, v2.4s, v21.4s +mla v19.4S, v12.4S, v31.s[0] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +sub v12.4s, v18.4s, v19.4s +mla v1.4S, v22.4S, v31.s[0] +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v22.4s, v3.4s, v1.4s +mla v17.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v21.4s, v13.4s, v17.4s +mla v14.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +sub v19.4s, v10.4s, v14.4s +mla v0.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v27.s[2] +mul v15.4S, v15.4S,v28.s[2] +sub v1.4s, v9.4s, v0.4s +mla v20.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v17.4s, v16.4s, v20.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v14.4s, v18.4s, v15.4s +mla v2.4S, v0.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[3] +mul v11.4S, v11.4S,v28.s[3] +sub v0.4s, v3.4s, v2.4s +mla v8.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +sub v20.4s, v12.4s, v8.4s +mla v11.4S, v15.4S, v31.s[0] +add v12.4s, v12.4s, 
v8.4s +sqrdmulh v8.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v15.4s, v22.4s, v11.4s +mla v10.4S, v2.4S, v31.s[0] +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v2.4s, v13.4s, v10.4s +mla v19.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v25.s[3] +mul v17.4S, v17.4S,v26.s[3] +sub v8.4s, v21.4s, v19.4s +mla v16.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +str q13, [x0, #48] +sqrdmulh v13.4S, v3.4S, v23.s[0] +str q2, [x0, #112] +mul v3.4S, v3.4S,v24.s[0] +ldr q2, [x0, #768] +sub v19.4s, v9.4s, v16.4s +ldr q11, [x0, #832] +mla v17.4S, v10.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +str q21, [x0, #176] +sqrdmulh v21.4S, v0.4S, v23.s[1] +str q8, [x0, #240] +mul v0.4S, v0.4S,v24.s[1] +ldr q8, [x0, #896] +sub v16.4s, v1.4s, v17.4s +ldr q10, [x0, #960] +mla v3.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +str q9, [x0, #304] +sqrdmulh v9.4S, v22.4S, v23.s[2] +str q19, [x0, #368] +mul v22.4S, v22.4S,v24.s[2] +ldr q19, [x0, #256] +sub v17.4s, v18.4s, v3.4s +ldr q13, [x0, #320] +mla v0.4S, v21.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +str q1, [x0, #432] +sqrdmulh v1.4S, v15.4S, v23.s[3] +str q16, [x0, #496] +mul v15.4S, v15.4S,v24.s[3] +ldr q16, [x0, #384] +sub v3.4s, v14.4s, v0.4s +ldr q21, [x0, #448] +mla v22.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +str q18, [x0, #560] +sqrdmulh v18.4S, v2.4S, v29.s[0] +str q17, [x0, #624] +ldr q17, [x0, #512] +mul v2.4S, v2.4S,v30.s[0] +ldr q0, [x0, #576] +sub v9.4s, v12.4s, v22.4s +mla v15.4S, v1.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s +str q14, [x0, #688] +sqrdmulh v14.4S, v11.4S, v29.s[0] +str q3, [x0, #752] +ldr q3, [x0, #640] +mul v11.4S, v11.4S,v30.s[0] +ldr q22, [x0, #704] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +str q12, [x0, #816] +sqrdmulh v12.4S, v8.4S, v29.s[0] +str q9, [x0, #880] +mul v8.4S, v8.4S,v30.s[0] +ldr q9, [x0, #0] +sub v15.4s, v19.4s, v2.4s +mla v11.4S, v14.4S, v31.s[0] +add 
v19.4s, v19.4s, v2.4s +str q20, [x0, #944] +sqrdmulh v20.4S, v10.4S, v29.s[0] +str q1, [x0, #1008] +mul v10.4S, v10.4S,v30.s[0] +ldr q1, [x0, #64] +sub v2.4s, v13.4s, v11.4s +mla v8.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v29.s[0] +ldr q12, [x0, #128] +mul v17.4S, v17.4S,v30.s[0] +sub v14.4s, v16.4s, v8.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v29.s[0] +ldr q20, [x0, #192] +mul v0.4S, v0.4S,v30.s[0] +sub v18.4s, v21.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +sub v11.4s, v9.4s, v17.4s +mla v0.4S, v8.4S, v31.s[0] +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v8.4s, v1.4s, v0.4s +mla v3.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v10.4s, v12.4s, v3.4s +mla v22.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v17.4s, v20.4s, v22.4s +mla v16.4S, v0.4S, v31.s[0] +add v20.4s, v20.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +sub v0.4s, v12.4s, v16.4s +mla v21.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v3.4s, v20.4s, v21.4s +mla v19.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v22.4s, v9.4s, v19.4s +mla v13.4S, v16.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v16.4s, v1.4s, v13.4s +mla v14.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v21.4s, v10.4s, v14.4s +mla v18.4S, v19.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v19.4s, v17.4s, v18.4s +mla v15.4S, v13.4S, 
v31.s[0] +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v13.4s, v11.4s, v15.4s +mla v2.4S, v14.4S, v31.s[0] +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v27.s[0] +mul v20.4S, v20.4S,v28.s[0] +sub v14.4s, v8.4s, v2.4s +mla v12.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v18.4s, v9.4s, v12.4s +mla v20.4S, v15.4S, v31.s[0] +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +sub v15.4s, v1.4s, v20.4s +mla v0.4S, v2.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +sub v2.4s, v22.4s, v0.4s +mla v3.4S, v12.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v12.4s, v16.4s, v3.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v11.4s, v10.4s +mla v17.4S, v0.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v27.s[3] +mul v19.4S, v19.4S,v28.s[3] +sub v0.4s, v8.4s, v17.4s +mla v21.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v17.4s +sqrdmulh v17.4S, v1.4S, v25.s[0] +mul v1.4S, v1.4S,v26.s[0] +sub v3.4s, v13.4s, v21.4s +mla v19.4S, v10.4S, v31.s[0] +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v25.s[1] +mul v15.4S, v15.4S,v26.s[1] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v17.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v17.4s, v9.4s, v1.4s +mla v15.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +sub v21.4s, v18.4s, v15.4s +mla v16.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +str q9, [x0, #0] +sqrdmulh v9.4S, v8.4S, v23.s[0] +str q17, [x0, #64] +mul v8.4S, v8.4S,v24.s[0] +ldr q17, [x0, #784] +sub v15.4s, v22.4s, v16.4s +ldr q19, [x0, #848] +mla v12.4S, v1.4S, v31.s[0] +add 
v22.4s, v22.4s, v16.4s +str q18, [x0, #128] +sqrdmulh v18.4S, v0.4S, v23.s[1] +str q21, [x0, #192] +mul v0.4S, v0.4S,v24.s[1] +ldr q21, [x0, #912] +sub v16.4s, v2.4s, v12.4s +ldr q1, [x0, #976] +mla v8.4S, v9.4S, v31.s[0] +add v2.4s, v2.4s, v12.4s +str q22, [x0, #256] +sqrdmulh v22.4S, v14.4S, v23.s[2] +str q15, [x0, #320] +mul v14.4S, v14.4S,v24.s[2] +ldr q15, [x0, #272] +sub v12.4s, v11.4s, v8.4s +ldr q9, [x0, #336] +mla v0.4S, v18.4S, v31.s[0] +add v11.4s, v11.4s, v8.4s +str q2, [x0, #384] +sqrdmulh v2.4S, v10.4S, v23.s[3] +str q16, [x0, #448] +mul v10.4S, v10.4S,v24.s[3] +ldr q16, [x0, #400] +sub v8.4s, v20.4s, v0.4s +ldr q18, [x0, #464] +mla v14.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v0.4s +str q11, [x0, #512] +sqrdmulh v11.4S, v17.4S, v29.s[0] +str q12, [x0, #576] +ldr q12, [x0, #528] +mul v17.4S, v17.4S,v30.s[0] +ldr q0, [x0, #592] +sub v22.4s, v13.4s, v14.4s +mla v10.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v14.4s +str q20, [x0, #640] +sqrdmulh v20.4S, v19.4S, v29.s[0] +str q8, [x0, #704] +ldr q8, [x0, #656] +mul v19.4S, v19.4S,v30.s[0] +ldr q14, [x0, #720] +sub v2.4s, v3.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v3.4s, v3.4s, v10.4s +str q13, [x0, #768] +sqrdmulh v13.4S, v21.4S, v29.s[0] +str q22, [x0, #832] +mul v21.4S, v21.4S,v30.s[0] +ldr q22, [x0, #16] +sub v10.4s, v15.4s, v17.4s +mla v19.4S, v20.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +str q3, [x0, #896] +sqrdmulh v3.4S, v1.4S, v29.s[0] +str q2, [x0, #960] +mul v1.4S, v1.4S,v30.s[0] +ldr q2, [x0, #80] +sub v17.4s, v9.4s, v19.4s +mla v21.4S, v13.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v12.4S, v29.s[0] +ldr q13, [x0, #144] +mul v12.4S, v12.4S,v30.s[0] +sub v20.4s, v16.4s, v21.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v29.s[0] +ldr q3, [x0, #208] +mul v0.4S, v0.4S,v30.s[0] +sub v11.4s, v18.4s, v1.4s +mla v12.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v19.4s, v22.4s, 
v12.4s +mla v0.4S, v21.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v2.4s, v0.4s +mla v8.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v1.4s, v13.4s, v8.4s +mla v14.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v12.4s, v3.4s, v14.4s +mla v16.4S, v0.4S, v31.s[0] +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +sub v0.4s, v13.4s, v16.4s +mla v18.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v9.4S, v29.s[1] +mul v9.4S, v9.4S,v30.s[1] +sub v8.4s, v3.4s, v18.4s +mla v15.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v14.4s, v22.4s, v15.4s +mla v9.4S, v16.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v16.4s, v2.4s, v9.4s +mla v20.4S, v18.4S, v31.s[0] +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v20.4s +mla v11.4S, v15.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v15.4s, v12.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +sub v9.4s, v19.4s, v10.4s +mla v17.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v27.s[0] +mul v3.4S, v3.4S,v28.s[0] +sub v20.4s, v21.4s, v17.4s +mla v13.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v11.4s, v22.4s, v13.4s +mla v3.4S, v10.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v10.4s, v2.4s, v3.4s +mla v0.4S, v17.4S, v31.s[0] +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v1.4S, v27.s[2] +mul 
v1.4S, v1.4S,v28.s[2] +sub v17.4s, v14.4s, v0.4s +mla v8.4S, v13.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +sub v13.4s, v16.4s, v8.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v3.4s, v19.4s, v1.4s +mla v12.4S, v0.4S, v31.s[0] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v0.4s, v21.4s, v12.4s +mla v18.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v25.s[1] +mul v10.4S, v10.4S,v26.s[1] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v12.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v12.4s, v22.4s, v2.4s +mla v10.4S, v18.4S, v31.s[0] +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v13.4S, v25.s[3] +mul v13.4S, v13.4S,v26.s[3] +sub v18.4s, v11.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +str q22, [x0, #16] +sqrdmulh v22.4S, v21.4S, v23.s[0] +str q12, [x0, #80] +mul v21.4S, v21.4S,v24.s[0] +sub v12.4s, v14.4s, v16.4s +mla v13.4S, v2.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +str q11, [x0, #144] +sqrdmulh v11.4S, v0.4S, v23.s[1] +str q18, [x0, #208] +mul v0.4S, v0.4S,v24.s[1] +sub v18.4s, v17.4s, v13.4s +mla v21.4S, v22.4S, v31.s[0] +add v17.4s, v17.4s, v13.4s +str q14, [x0, #272] +sqrdmulh v14.4S, v20.4S, v23.s[2] +str q12, [x0, #336] +mul v20.4S, v20.4S,v24.s[2] +sub v12.4s, v19.4s, v21.4s +mla v0.4S, v11.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +str q17, [x0, #400] +sqrdmulh v17.4S, v1.4S, v23.s[3] +str q18, [x0, #464] +mul v1.4S, v1.4S,v24.s[3] +sub v18.4s, v3.4s, v0.4s +mla v20.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v0.4s +str q19, [x0, #528] +str q12, [x0, #592] +sub v12.4s, v9.4s, v20.4s +mla v1.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, 
v20.4s +str q3, [x0, #656] +str q18, [x0, #720] +sub v18.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +str q9, [x0, #784] +str q12, [x0, #848] +str q8, [x0, #912] +str q18, [x0, #976] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q6, [x17, #+160] +ldr q7, [x17, #+176] +ldr q15, [x17, #+192] +ldr q10, [x17, #+208] +ldr q2, [x17, #+224] +ldr q16, [x17, #+240] +ldr q22, [x0, #32] +ldr q13, [x0, #48] +ldr q11, [x0, #0] +ldr q21, [x0, #16] +sqrdmulh v14.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +ldr q13, [x17, #+256] +ldr q0, [x17, #+272] +sqrdmulh v19.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v4.s[1] +mla v21.4S, v19.4S, v31.s[0] +sub v19.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v5.s[2] +mul v22.4S, v22.4S,v4.s[2] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +str q11, [x0, #0] +str q19, [x0, #16] +str q14, [x0, #32] +str q21, [x0, #48] +ldr q21, [x0, #96] +ldr q14, [x0, #112] +ldr q19, [x0, #64] +ldr q11, [x0, #80] +sqrdmulh v22.4S, v21.4S, v7.s[0] +mul v21.4S, v21.4S,v6.s[0] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v19.4s, v21.4s +add v19.4s, v19.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v7.s[0] +mul v14.4S, v14.4S,v6.s[0] +mla v14.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +ldr q14, [x17, #+288] +ldr q17, [x17, #+304] +sqrdmulh v20.4S, v11.4S, v7.s[1] +mul v11.4S, v11.4S,v6.s[1] +mla v11.4S, v20.4S, v31.s[0] +sub v20.4s, v19.4s, v11.4s +add v19.4s, v19.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v7.s[2] +mul v21.4S, v21.4S,v6.s[2] +mla v21.4S, v11.4S, v31.s[0] +sub v11.4s, v22.4s, v21.4s +add v22.4s, v22.4s, v21.4s +str q19, [x0, #64] +str q20, [x0, #80] +str q22, [x0, #96] +str q11, [x0, #112] +ldr q11, [x0, #160] +ldr q22, [x0, #176] 
+ldr q20, [x0, #128] +ldr q19, [x0, #144] +sqrdmulh v21.4S, v11.4S, v10.s[0] +mul v11.4S, v11.4S,v15.s[0] +mla v11.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v10.s[0] +mul v22.4S, v22.4S,v15.s[0] +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v19.4s, v22.4s +add v19.4s, v19.4s, v22.4s +ldr q22, [x17, #+320] +ldr q3, [x17, #+336] +sqrdmulh v1.4S, v19.4S, v10.s[1] +mul v19.4S, v19.4S,v15.s[1] +mla v19.4S, v1.4S, v31.s[0] +sub v1.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +sqrdmulh v19.4S, v11.4S, v10.s[2] +mul v11.4S, v11.4S,v15.s[2] +mla v11.4S, v19.4S, v31.s[0] +sub v19.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +str q20, [x0, #128] +str q1, [x0, #144] +str q21, [x0, #160] +str q19, [x0, #176] +ldr q19, [x0, #224] +ldr q21, [x0, #240] +ldr q1, [x0, #192] +ldr q20, [x0, #208] +sqrdmulh v11.4S, v19.4S, v16.s[0] +mul v19.4S, v19.4S,v2.s[0] +mla v19.4S, v11.4S, v31.s[0] +sub v11.4s, v1.4s, v19.4s +add v1.4s, v1.4s, v19.4s +sqrdmulh v19.4S, v21.4S, v16.s[0] +mul v21.4S, v21.4S,v2.s[0] +mla v21.4S, v19.4S, v31.s[0] +sub v19.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +ldr q21, [x17, #+352] +ldr q9, [x17, #+368] +sqrdmulh v12.4S, v20.4S, v16.s[1] +mul v20.4S, v20.4S,v2.s[1] +mla v20.4S, v12.4S, v31.s[0] +sub v12.4s, v1.4s, v20.4s +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v16.s[2] +mul v19.4S, v19.4S,v2.s[2] +mla v19.4S, v20.4S, v31.s[0] +sub v20.4s, v11.4s, v19.4s +add v11.4s, v11.4s, v19.4s +str q1, [x0, #192] +str q12, [x0, #208] +str q11, [x0, #224] +str q20, [x0, #240] +ldr q20, [x0, #288] +ldr q11, [x0, #304] +ldr q12, [x0, #256] +ldr q1, [x0, #272] +sqrdmulh v19.4S, v20.4S, v0.s[0] +mul v20.4S, v20.4S,v13.s[0] +mla v20.4S, v19.4S, v31.s[0] +sub v19.4s, v12.4s, v20.4s +add v12.4s, v12.4s, v20.4s +sqrdmulh v20.4S, v11.4S, v0.s[0] +mul v11.4S, v11.4S,v13.s[0] +mla v11.4S, v20.4S, v31.s[0] +sub v20.4s, v1.4s, v11.4s +add v1.4s, v1.4s, v11.4s +ldr q11, [x17, #+384] +ldr q8, [x17, #+400] 
+sqrdmulh v18.4S, v1.4S, v0.s[1] +mul v1.4S, v1.4S,v13.s[1] +mla v1.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v1.4s +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v20.4S, v0.s[2] +mul v20.4S, v20.4S,v13.s[2] +mla v20.4S, v1.4S, v31.s[0] +sub v1.4s, v19.4s, v20.4s +add v19.4s, v19.4s, v20.4s +str q12, [x0, #256] +str q18, [x0, #272] +str q19, [x0, #288] +str q1, [x0, #304] +ldr q5, [x0, #352] +ldr q4, [x0, #368] +ldr q1, [x0, #320] +ldr q19, [x0, #336] +sqrdmulh v18.4S, v5.4S, v17.s[0] +mul v5.4S, v5.4S,v14.s[0] +mla v5.4S, v18.4S, v31.s[0] +sub v18.4s, v1.4s, v5.4s +add v1.4s, v1.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v17.s[0] +mul v4.4S, v4.4S,v14.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v19.4s, v4.4s +add v19.4s, v19.4s, v4.4s +ldr q4, [x17, #+416] +ldr q12, [x17, #+432] +sqrdmulh v20.4S, v19.4S, v17.s[1] +mul v19.4S, v19.4S,v14.s[1] +mla v19.4S, v20.4S, v31.s[0] +sub v20.4s, v1.4s, v19.4s +add v1.4s, v1.4s, v19.4s +sqrdmulh v19.4S, v5.4S, v17.s[2] +mul v5.4S, v5.4S,v14.s[2] +mla v5.4S, v19.4S, v31.s[0] +sub v19.4s, v18.4s, v5.4s +add v18.4s, v18.4s, v5.4s +str q1, [x0, #320] +str q20, [x0, #336] +str q18, [x0, #352] +str q19, [x0, #368] +ldr q7, [x0, #416] +ldr q6, [x0, #432] +ldr q19, [x0, #384] +ldr q18, [x0, #400] +sqrdmulh v20.4S, v7.4S, v3.s[0] +mul v7.4S, v7.4S,v22.s[0] +mla v7.4S, v20.4S, v31.s[0] +sub v20.4s, v19.4s, v7.4s +add v19.4s, v19.4s, v7.4s +sqrdmulh v7.4S, v6.4S, v3.s[0] +mul v6.4S, v6.4S,v22.s[0] +mla v6.4S, v7.4S, v31.s[0] +sub v7.4s, v18.4s, v6.4s +add v18.4s, v18.4s, v6.4s +ldr q6, [x17, #+448] +ldr q1, [x17, #+464] +sqrdmulh v5.4S, v18.4S, v3.s[1] +mul v18.4S, v18.4S,v22.s[1] +mla v18.4S, v5.4S, v31.s[0] +sub v5.4s, v19.4s, v18.4s +add v19.4s, v19.4s, v18.4s +sqrdmulh v18.4S, v7.4S, v3.s[2] +mul v7.4S, v7.4S,v22.s[2] +mla v7.4S, v18.4S, v31.s[0] +sub v18.4s, v20.4s, v7.4s +add v20.4s, v20.4s, v7.4s +str q19, [x0, #384] +str q5, [x0, #400] +str q20, [x0, #416] +str q18, [x0, #432] +ldr q10, [x0, #480] +ldr q15, [x0, #496] +ldr q18, [x0, #448] 
+ldr q20, [x0, #464] +sqrdmulh v5.4S, v10.4S, v9.s[0] +mul v10.4S, v10.4S,v21.s[0] +mla v10.4S, v5.4S, v31.s[0] +sub v5.4s, v18.4s, v10.4s +add v18.4s, v18.4s, v10.4s +sqrdmulh v10.4S, v15.4S, v9.s[0] +mul v15.4S, v15.4S,v21.s[0] +mla v15.4S, v10.4S, v31.s[0] +sub v10.4s, v20.4s, v15.4s +add v20.4s, v20.4s, v15.4s +ldr q15, [x17, #+480] +ldr q19, [x17, #+496] +sqrdmulh v7.4S, v20.4S, v9.s[1] +mul v20.4S, v20.4S,v21.s[1] +mla v20.4S, v7.4S, v31.s[0] +sub v7.4s, v18.4s, v20.4s +add v18.4s, v18.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v9.s[2] +mul v10.4S, v10.4S,v21.s[2] +mla v10.4S, v20.4S, v31.s[0] +sub v20.4s, v5.4s, v10.4s +add v5.4s, v5.4s, v10.4s +str q18, [x0, #448] +str q7, [x0, #464] +str q5, [x0, #480] +str q20, [x0, #496] +ldr q16, [x0, #544] +ldr q2, [x0, #560] +ldr q20, [x0, #512] +ldr q5, [x0, #528] +sqrdmulh v7.4S, v16.4S, v8.s[0] +mul v16.4S, v16.4S,v11.s[0] +mla v16.4S, v7.4S, v31.s[0] +sub v7.4s, v20.4s, v16.4s +add v20.4s, v20.4s, v16.4s +sqrdmulh v16.4S, v2.4S, v8.s[0] +mul v2.4S, v2.4S,v11.s[0] +mla v2.4S, v16.4S, v31.s[0] +sub v16.4s, v5.4s, v2.4s +add v5.4s, v5.4s, v2.4s +ldr q2, [x17, #+512] +ldr q18, [x17, #+528] +sqrdmulh v10.4S, v5.4S, v8.s[1] +mul v5.4S, v5.4S,v11.s[1] +mla v5.4S, v10.4S, v31.s[0] +sub v10.4s, v20.4s, v5.4s +add v20.4s, v20.4s, v5.4s +sqrdmulh v5.4S, v16.4S, v8.s[2] +mul v16.4S, v16.4S,v11.s[2] +mla v16.4S, v5.4S, v31.s[0] +sub v5.4s, v7.4s, v16.4s +add v7.4s, v7.4s, v16.4s +str q20, [x0, #512] +str q10, [x0, #528] +str q7, [x0, #544] +str q5, [x0, #560] +ldr q0, [x0, #608] +ldr q13, [x0, #624] +ldr q5, [x0, #576] +ldr q7, [x0, #592] +sqrdmulh v10.4S, v0.4S, v12.s[0] +mul v0.4S, v0.4S,v4.s[0] +mla v0.4S, v10.4S, v31.s[0] +sub v10.4s, v5.4s, v0.4s +add v5.4s, v5.4s, v0.4s +sqrdmulh v0.4S, v13.4S, v12.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v0.4S, v31.s[0] +sub v0.4s, v7.4s, v13.4s +add v7.4s, v7.4s, v13.4s +ldr q13, [x17, #+544] +ldr q20, [x17, #+560] +sqrdmulh v16.4S, v7.4S, v12.s[1] +mul v7.4S, v7.4S,v4.s[1] +mla v7.4S, 
v16.4S, v31.s[0] +sub v16.4s, v5.4s, v7.4s +add v5.4s, v5.4s, v7.4s +sqrdmulh v7.4S, v0.4S, v12.s[2] +mul v0.4S, v0.4S,v4.s[2] +mla v0.4S, v7.4S, v31.s[0] +sub v7.4s, v10.4s, v0.4s +add v10.4s, v10.4s, v0.4s +str q5, [x0, #576] +str q16, [x0, #592] +str q10, [x0, #608] +str q7, [x0, #624] +ldr q17, [x0, #672] +ldr q14, [x0, #688] +ldr q7, [x0, #640] +ldr q10, [x0, #656] +sqrdmulh v16.4S, v17.4S, v1.s[0] +mul v17.4S, v17.4S,v6.s[0] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v7.4s, v17.4s +add v7.4s, v7.4s, v17.4s +sqrdmulh v17.4S, v14.4S, v1.s[0] +mul v14.4S, v14.4S,v6.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v10.4s, v14.4s +add v10.4s, v10.4s, v14.4s +ldr q14, [x17, #+576] +ldr q5, [x17, #+592] +sqrdmulh v0.4S, v10.4S, v1.s[1] +mul v10.4S, v10.4S,v6.s[1] +mla v10.4S, v0.4S, v31.s[0] +sub v0.4s, v7.4s, v10.4s +add v7.4s, v7.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v1.s[2] +mul v17.4S, v17.4S,v6.s[2] +mla v17.4S, v10.4S, v31.s[0] +sub v10.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +str q7, [x0, #640] +str q0, [x0, #656] +str q16, [x0, #672] +str q10, [x0, #688] +ldr q3, [x0, #736] +ldr q22, [x0, #752] +ldr q10, [x0, #704] +ldr q16, [x0, #720] +sqrdmulh v0.4S, v3.4S, v19.s[0] +mul v3.4S, v3.4S,v15.s[0] +mla v3.4S, v0.4S, v31.s[0] +sub v0.4s, v10.4s, v3.4s +add v10.4s, v10.4s, v3.4s +sqrdmulh v3.4S, v22.4S, v19.s[0] +mul v22.4S, v22.4S,v15.s[0] +mla v22.4S, v3.4S, v31.s[0] +sub v3.4s, v16.4s, v22.4s +add v16.4s, v16.4s, v22.4s +ldr q22, [x17, #+608] +ldr q7, [x17, #+624] +sqrdmulh v17.4S, v16.4S, v19.s[1] +mul v16.4S, v16.4S,v15.s[1] +mla v16.4S, v17.4S, v31.s[0] +sub v17.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sqrdmulh v16.4S, v3.4S, v19.s[2] +mul v3.4S, v3.4S,v15.s[2] +mla v3.4S, v16.4S, v31.s[0] +sub v16.4s, v0.4s, v3.4s +add v0.4s, v0.4s, v3.4s +str q10, [x0, #704] +str q17, [x0, #720] +str q0, [x0, #736] +str q16, [x0, #752] +ldr q9, [x0, #800] +ldr q21, [x0, #816] +ldr q16, [x0, #768] +ldr q0, [x0, #784] +sqrdmulh v17.4S, v9.4S, v18.s[0] +mul 
v9.4S, v9.4S,v2.s[0] +mla v9.4S, v17.4S, v31.s[0] +sub v17.4s, v16.4s, v9.4s +add v16.4s, v16.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v18.s[0] +mul v21.4S, v21.4S,v2.s[0] +mla v21.4S, v9.4S, v31.s[0] +sub v9.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v18.s[1] +mul v0.4S, v0.4S,v2.s[1] +mla v0.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v0.4s +add v16.4s, v16.4s, v0.4s +sqrdmulh v0.4S, v9.4S, v18.s[2] +mul v9.4S, v9.4S,v2.s[2] +mla v9.4S, v0.4S, v31.s[0] +sub v0.4s, v17.4s, v9.4s +add v17.4s, v17.4s, v9.4s +str q16, [x0, #768] +str q21, [x0, #784] +str q17, [x0, #800] +str q0, [x0, #816] +ldr q8, [x0, #864] +ldr q11, [x0, #880] +ldr q0, [x0, #832] +ldr q17, [x0, #848] +sqrdmulh v21.4S, v8.4S, v20.s[0] +mul v8.4S, v8.4S,v13.s[0] +mla v8.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v8.4s +add v0.4s, v0.4s, v8.4s +sqrdmulh v8.4S, v11.4S, v20.s[0] +mul v11.4S, v11.4S,v13.s[0] +mla v11.4S, v8.4S, v31.s[0] +sub v8.4s, v17.4s, v11.4s +add v17.4s, v17.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v20.s[1] +mul v17.4S, v17.4S,v13.s[1] +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v8.4S, v20.s[2] +mul v8.4S, v8.4S,v13.s[2] +mla v8.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v8.4s +add v21.4s, v21.4s, v8.4s +str q0, [x0, #832] +str q11, [x0, #848] +str q21, [x0, #864] +str q17, [x0, #880] +ldr q12, [x0, #928] +ldr q4, [x0, #944] +ldr q17, [x0, #896] +ldr q21, [x0, #912] +sqrdmulh v11.4S, v12.4S, v5.s[0] +mul v12.4S, v12.4S,v14.s[0] +mla v12.4S, v11.4S, v31.s[0] +sub v11.4s, v17.4s, v12.4s +add v17.4s, v17.4s, v12.4s +sqrdmulh v12.4S, v4.4S, v5.s[0] +mul v4.4S, v4.4S,v14.s[0] +mla v4.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v4.4s +add v21.4s, v21.4s, v4.4s +sqrdmulh v4.4S, v21.4S, v5.s[1] +mul v21.4S, v21.4S,v14.s[1] +mla v21.4S, v4.4S, v31.s[0] +sub v4.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v12.4S, v5.s[2] +mul v12.4S, v12.4S,v14.s[2] +mla v12.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v12.4s 
+add v11.4s, v11.4s, v12.4s +str q17, [x0, #896] +str q4, [x0, #912] +str q11, [x0, #928] +str q21, [x0, #944] +ldr q1, [x0, #992] +ldr q6, [x0, #1008] +ldr q21, [x0, #960] +ldr q11, [x0, #976] +sqrdmulh v4.4S, v1.4S, v7.s[0] +mul v1.4S, v1.4S,v22.s[0] +mla v1.4S, v4.4S, v31.s[0] +sub v4.4s, v21.4s, v1.4s +add v21.4s, v21.4s, v1.4s +sqrdmulh v1.4S, v6.4S, v7.s[0] +mul v6.4S, v6.4S,v22.s[0] +mla v6.4S, v1.4S, v31.s[0] +sub v1.4s, v11.4s, v6.4s +add v11.4s, v11.4s, v6.4s +sqrdmulh v6.4S, v11.4S, v7.s[1] +mul v11.4S, v11.4S,v22.s[1] +mla v11.4S, v6.4S, v31.s[0] +sub v6.4s, v21.4s, v11.4s +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v1.4S, v7.s[2] +mul v1.4S, v1.4S,v22.s[2] +mla v1.4S, v11.4S, v31.s[0] +sub v11.4s, v4.4s, v1.4s +add v4.4s, v4.4s, v1.4s +str q21, [x0, #960] +str q6, [x0, #976] +str q4, [x0, #992] +str q11, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_1.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_1.s new file mode 100644 index 0000000..6da50a5 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_1.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, 
sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, 
block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, 
block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 
+.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global ntt_u32_incomplete_neon_asm_var_4_2_3_z4_1 +.global _ntt_u32_incomplete_neon_asm_var_4_2_3_z4_1 +ntt_u32_incomplete_neon_asm_var_4_2_3_z4_1: +_ntt_u32_incomplete_neon_asm_var_4_2_3_z4_1: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +sqrdmulh v2.4S, v22.4S, v29.s[0] +ldr q1, [x0, #544] +mul v22.4S, v22.4S,v30.s[0] +ldr q0, [x0, #608] +sqrdmulh v15.4S, v21.4S, v29.s[0] +ldr q14, [x0, #672] 
+mul v21.4S, v21.4S,v30.s[0] +ldr q13, [x0, #736] +mla v22.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q12, [x0, #32] +sub v11.4s, v18.4s, v22.4s +mla v21.4S, v15.4S, v31.s[0] +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q15, [x0, #96] +sub v10.4s, v17.4s, v21.4s +mla v20.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v29.s[0] +ldr q2, [x0, #160] +mul v1.4S, v1.4S,v30.s[0] +sub v9.4s, v16.4s, v20.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v29.s[0] +ldr q22, [x0, #224] +mul v0.4S, v0.4S,v30.s[0] +sub v8.4s, v3.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v12.4s, v1.4s +mla v0.4S, v20.4S, v31.s[0] +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +sub v20.4s, v15.4s, v0.4s +mla v14.4S, v19.4S, v31.s[0] +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v19.4s, v2.4s, v14.4s +mla v13.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v1.4s, v22.4s, v13.4s +mla v16.4S, v0.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v0.4s, v2.4s, v16.4s +mla v3.4S, v14.4S, v31.s[0] +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v22.4s, v3.4s +mla v18.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v13.4s, v12.4s, v18.4s +mla v17.4S, v16.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v16.4s, v15.4s, v17.4s +mla v9.4S, v3.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v3.4s, v19.4s, 
v9.4s +mla v8.4S, v18.4S, v31.s[0] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v8.4s +mla v11.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v17.4s, v21.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +sub v9.4s, v20.4s, v10.4s +mla v2.4S, v8.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v8.4s, v12.4s, v2.4s +mla v22.4S, v11.4S, v31.s[0] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v27.s[1] +mul v14.4S, v14.4S,v28.s[1] +sub v11.4s, v15.4s, v22.4s +mla v0.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v28.s[2] +sub v10.4s, v13.4s, v0.4s +mla v14.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v2.4s, v16.4s, v14.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +sub v22.4s, v21.4s, v19.4s +mla v1.4S, v0.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v0.4s, v20.4s, v1.4s +mla v3.4S, v14.4S, v31.s[0] +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +sub v14.4s, v17.4s, v3.4s +mla v18.4S, v19.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v25.s[1] +mul v11.4S, v11.4S,v26.s[1] +sub v19.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v1.4s, v12.4s, v15.4s +mla v11.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v25.s[3] +mul v2.4S, v2.4S,v26.s[3] +sub v3.4s, v8.4s, v11.4s +mla v16.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v11.4s +str q12, [x0, #32] +sqrdmulh v12.4S, 
v20.4S, v23.s[0] +str q1, [x0, #96] +mul v20.4S, v20.4S,v24.s[0] +ldr q1, [x0, #816] +sub v11.4s, v13.4s, v16.4s +ldr q18, [x0, #880] +mla v2.4S, v15.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +str q8, [x0, #160] +sqrdmulh v8.4S, v0.4S, v23.s[1] +str q3, [x0, #224] +mul v0.4S, v0.4S,v24.s[1] +ldr q3, [x0, #944] +sub v16.4s, v10.4s, v2.4s +ldr q15, [x0, #1008] +mla v20.4S, v12.4S, v31.s[0] +add v10.4s, v10.4s, v2.4s +str q13, [x0, #288] +sqrdmulh v13.4S, v9.4S, v23.s[2] +str q11, [x0, #352] +mul v9.4S, v9.4S,v24.s[2] +ldr q11, [x0, #304] +sub v2.4s, v21.4s, v20.4s +ldr q12, [x0, #368] +mla v0.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v20.4s +str q10, [x0, #416] +sqrdmulh v10.4S, v19.4S, v23.s[3] +str q16, [x0, #480] +mul v19.4S, v19.4S,v24.s[3] +ldr q16, [x0, #432] +sub v20.4s, v22.4s, v0.4s +ldr q8, [x0, #496] +mla v9.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +str q21, [x0, #544] +sqrdmulh v21.4S, v1.4S, v29.s[0] +str q2, [x0, #608] +ldr q2, [x0, #560] +mul v1.4S, v1.4S,v30.s[0] +ldr q0, [x0, #624] +sub v13.4s, v17.4s, v9.4s +mla v19.4S, v10.4S, v31.s[0] +add v17.4s, v17.4s, v9.4s +str q22, [x0, #672] +sqrdmulh v22.4S, v18.4S, v29.s[0] +str q20, [x0, #736] +ldr q20, [x0, #688] +mul v18.4S, v18.4S,v30.s[0] +ldr q9, [x0, #752] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +str q17, [x0, #800] +sqrdmulh v17.4S, v3.4S, v29.s[0] +str q13, [x0, #864] +mul v3.4S, v3.4S,v30.s[0] +ldr q13, [x0, #48] +sub v19.4s, v11.4s, v1.4s +mla v18.4S, v22.4S, v31.s[0] +add v11.4s, v11.4s, v1.4s +str q14, [x0, #928] +sqrdmulh v14.4S, v15.4S, v29.s[0] +str q10, [x0, #992] +mul v15.4S, v15.4S,v30.s[0] +ldr q10, [x0, #112] +sub v1.4s, v12.4s, v18.4s +mla v3.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v29.s[0] +ldr q17, [x0, #176] +mul v2.4S, v2.4S,v30.s[0] +sub v22.4s, v16.4s, v3.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v29.s[0] +ldr q14, [x0, #240] +mul v0.4S, v0.4S,v30.s[0] 
+sub v21.4s, v8.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v18.4s, v13.4s, v2.4s +mla v0.4S, v3.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +sub v3.4s, v10.4s, v0.4s +mla v20.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v15.4s, v17.4s, v20.4s +mla v9.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v2.4s, v14.4s, v9.4s +mla v16.4S, v0.4S, v31.s[0] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v29.s[1] +mul v11.4S, v11.4S,v30.s[1] +sub v0.4s, v17.4s, v16.4s +mla v8.4S, v20.4S, v31.s[0] +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v29.s[1] +mul v12.4S, v12.4S,v30.s[1] +sub v20.4s, v14.4s, v8.4s +mla v11.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v9.4s, v13.4s, v11.4s +mla v12.4S, v16.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v16.4s, v10.4s, v12.4s +mla v22.4S, v8.4S, v31.s[0] +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v8.4s, v15.4s, v22.4s +mla v21.4S, v11.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +sub v11.4s, v2.4s, v21.4s +mla v19.4S, v12.4S, v31.s[0] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +sub v12.4s, v18.4s, v19.4s +mla v1.4S, v22.4S, v31.s[0] +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v22.4s, v3.4s, v1.4s +mla v17.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v21.4s, v13.4s, v17.4s +mla v14.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v17.4s +sqrdmulh 
v17.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +sub v19.4s, v10.4s, v14.4s +mla v0.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v27.s[2] +mul v15.4S, v15.4S,v28.s[2] +sub v1.4s, v9.4s, v0.4s +mla v20.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v17.4s, v16.4s, v20.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v14.4s, v18.4s, v15.4s +mla v2.4S, v0.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[3] +mul v11.4S, v11.4S,v28.s[3] +sub v0.4s, v3.4s, v2.4s +mla v8.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +sub v20.4s, v12.4s, v8.4s +mla v11.4S, v15.4S, v31.s[0] +add v12.4s, v12.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v15.4s, v22.4s, v11.4s +mla v10.4S, v2.4S, v31.s[0] +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v2.4s, v13.4s, v10.4s +mla v19.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v25.s[3] +mul v17.4S, v17.4S,v26.s[3] +sub v8.4s, v21.4s, v19.4s +mla v16.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +str q13, [x0, #48] +sqrdmulh v13.4S, v3.4S, v23.s[0] +str q2, [x0, #112] +mul v3.4S, v3.4S,v24.s[0] +ldr q2, [x0, #768] +sub v19.4s, v9.4s, v16.4s +ldr q11, [x0, #832] +mla v17.4S, v10.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +str q21, [x0, #176] +sqrdmulh v21.4S, v0.4S, v23.s[1] +str q8, [x0, #240] +mul v0.4S, v0.4S,v24.s[1] +ldr q8, [x0, #896] +sub v16.4s, v1.4s, v17.4s +ldr q10, [x0, #960] +mla v3.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +str q9, [x0, #304] +sqrdmulh v9.4S, v22.4S, v23.s[2] +str q19, [x0, #368] +mul v22.4S, v22.4S,v24.s[2] +ldr q19, [x0, #256] +sub v17.4s, v18.4s, v3.4s +ldr q13, [x0, #320] +mla v0.4S, v21.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +str q1, 
[x0, #432] +sqrdmulh v1.4S, v15.4S, v23.s[3] +str q16, [x0, #496] +mul v15.4S, v15.4S,v24.s[3] +ldr q16, [x0, #384] +sub v3.4s, v14.4s, v0.4s +ldr q21, [x0, #448] +mla v22.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +str q18, [x0, #560] +sqrdmulh v18.4S, v2.4S, v29.s[0] +str q17, [x0, #624] +ldr q17, [x0, #512] +mul v2.4S, v2.4S,v30.s[0] +ldr q0, [x0, #576] +sub v9.4s, v12.4s, v22.4s +mla v15.4S, v1.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s +str q14, [x0, #688] +sqrdmulh v14.4S, v11.4S, v29.s[0] +str q3, [x0, #752] +ldr q3, [x0, #640] +mul v11.4S, v11.4S,v30.s[0] +ldr q22, [x0, #704] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +str q12, [x0, #816] +sqrdmulh v12.4S, v8.4S, v29.s[0] +str q9, [x0, #880] +mul v8.4S, v8.4S,v30.s[0] +ldr q9, [x0, #0] +sub v15.4s, v19.4s, v2.4s +mla v11.4S, v14.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +str q20, [x0, #944] +sqrdmulh v20.4S, v10.4S, v29.s[0] +str q1, [x0, #1008] +mul v10.4S, v10.4S,v30.s[0] +ldr q1, [x0, #64] +sub v2.4s, v13.4s, v11.4s +mla v8.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v29.s[0] +ldr q12, [x0, #128] +mul v17.4S, v17.4S,v30.s[0] +sub v14.4s, v16.4s, v8.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v29.s[0] +ldr q20, [x0, #192] +mul v0.4S, v0.4S,v30.s[0] +sub v18.4s, v21.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +sub v11.4s, v9.4s, v17.4s +mla v0.4S, v8.4S, v31.s[0] +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v8.4s, v1.4s, v0.4s +mla v3.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v10.4s, v12.4s, v3.4s +mla v22.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v17.4s, v20.4s, v22.4s +mla v16.4S, v0.4S, v31.s[0] +add v20.4s, v20.4s, 
v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +sub v0.4s, v12.4s, v16.4s +mla v21.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v3.4s, v20.4s, v21.4s +mla v19.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v22.4s, v9.4s, v19.4s +mla v13.4S, v16.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v16.4s, v1.4s, v13.4s +mla v14.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v21.4s, v10.4s, v14.4s +mla v18.4S, v19.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v19.4s, v17.4s, v18.4s +mla v15.4S, v13.4S, v31.s[0] +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v13.4s, v11.4s, v15.4s +mla v2.4S, v14.4S, v31.s[0] +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v27.s[0] +mul v20.4S, v20.4S,v28.s[0] +sub v14.4s, v8.4s, v2.4s +mla v12.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v18.4s, v9.4s, v12.4s +mla v20.4S, v15.4S, v31.s[0] +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +sub v15.4s, v1.4s, v20.4s +mla v0.4S, v2.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +sub v2.4s, v22.4s, v0.4s +mla v3.4S, v12.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v12.4s, v16.4s, v3.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v11.4s, v10.4s +mla v17.4S, v0.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v27.s[3] +mul v19.4S, v19.4S,v28.s[3] +sub v0.4s, v8.4s, 
v17.4s +mla v21.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v17.4s +sqrdmulh v17.4S, v1.4S, v25.s[0] +mul v1.4S, v1.4S,v26.s[0] +sub v3.4s, v13.4s, v21.4s +mla v19.4S, v10.4S, v31.s[0] +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v25.s[1] +mul v15.4S, v15.4S,v26.s[1] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v17.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v17.4s, v9.4s, v1.4s +mla v15.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +sub v21.4s, v18.4s, v15.4s +mla v16.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +str q9, [x0, #0] +sqrdmulh v9.4S, v8.4S, v23.s[0] +str q17, [x0, #64] +mul v8.4S, v8.4S,v24.s[0] +ldr q17, [x0, #784] +sub v15.4s, v22.4s, v16.4s +ldr q19, [x0, #848] +mla v12.4S, v1.4S, v31.s[0] +add v22.4s, v22.4s, v16.4s +str q18, [x0, #128] +sqrdmulh v18.4S, v0.4S, v23.s[1] +str q21, [x0, #192] +mul v0.4S, v0.4S,v24.s[1] +ldr q21, [x0, #912] +sub v16.4s, v2.4s, v12.4s +ldr q1, [x0, #976] +mla v8.4S, v9.4S, v31.s[0] +add v2.4s, v2.4s, v12.4s +str q22, [x0, #256] +sqrdmulh v22.4S, v14.4S, v23.s[2] +str q15, [x0, #320] +mul v14.4S, v14.4S,v24.s[2] +ldr q15, [x0, #272] +sub v12.4s, v11.4s, v8.4s +ldr q9, [x0, #336] +mla v0.4S, v18.4S, v31.s[0] +add v11.4s, v11.4s, v8.4s +str q2, [x0, #384] +sqrdmulh v2.4S, v10.4S, v23.s[3] +str q16, [x0, #448] +mul v10.4S, v10.4S,v24.s[3] +ldr q16, [x0, #400] +sub v8.4s, v20.4s, v0.4s +ldr q18, [x0, #464] +mla v14.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v0.4s +str q11, [x0, #512] +sqrdmulh v11.4S, v17.4S, v29.s[0] +str q12, [x0, #576] +ldr q12, [x0, #528] +mul v17.4S, v17.4S,v30.s[0] +ldr q0, [x0, #592] +sub v22.4s, v13.4s, v14.4s +mla v10.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v14.4s +str q20, [x0, #640] +sqrdmulh v20.4S, v19.4S, v29.s[0] +str q8, [x0, #704] +ldr q8, [x0, #656] +mul v19.4S, v19.4S,v30.s[0] +ldr q14, [x0, #720] +sub v2.4s, v3.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add 
v3.4s, v3.4s, v10.4s +str q13, [x0, #768] +sqrdmulh v13.4S, v21.4S, v29.s[0] +str q22, [x0, #832] +mul v21.4S, v21.4S,v30.s[0] +ldr q22, [x0, #16] +sub v10.4s, v15.4s, v17.4s +mla v19.4S, v20.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +str q3, [x0, #896] +sqrdmulh v3.4S, v1.4S, v29.s[0] +str q2, [x0, #960] +mul v1.4S, v1.4S,v30.s[0] +ldr q2, [x0, #80] +sub v17.4s, v9.4s, v19.4s +mla v21.4S, v13.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v12.4S, v29.s[0] +ldr q13, [x0, #144] +mul v12.4S, v12.4S,v30.s[0] +sub v20.4s, v16.4s, v21.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v29.s[0] +ldr q3, [x0, #208] +mul v0.4S, v0.4S,v30.s[0] +sub v11.4s, v18.4s, v1.4s +mla v12.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v19.4s, v22.4s, v12.4s +mla v0.4S, v21.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v2.4s, v0.4s +mla v8.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v1.4s, v13.4s, v8.4s +mla v14.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v12.4s, v3.4s, v14.4s +mla v16.4S, v0.4S, v31.s[0] +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +sub v0.4s, v13.4s, v16.4s +mla v18.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v9.4S, v29.s[1] +mul v9.4S, v9.4S,v30.s[1] +sub v8.4s, v3.4s, v18.4s +mla v15.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v14.4s, v22.4s, v15.4s +mla v9.4S, v16.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v16.4s, v2.4s, v9.4s +mla v20.4S, v18.4S, v31.s[0] +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] 
+sub v18.4s, v1.4s, v20.4s +mla v11.4S, v15.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v15.4s, v12.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +sub v9.4s, v19.4s, v10.4s +mla v17.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v27.s[0] +mul v3.4S, v3.4S,v28.s[0] +sub v20.4s, v21.4s, v17.4s +mla v13.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v11.4s, v22.4s, v13.4s +mla v3.4S, v10.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v10.4s, v2.4s, v3.4s +mla v0.4S, v17.4S, v31.s[0] +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v17.4s, v14.4s, v0.4s +mla v8.4S, v13.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +sub v13.4s, v16.4s, v8.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v3.4s, v19.4s, v1.4s +mla v12.4S, v0.4S, v31.s[0] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v0.4s, v21.4s, v12.4s +mla v18.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v25.s[1] +mul v10.4S, v10.4S,v26.s[1] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v12.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v12.4s, v22.4s, v2.4s +mla v10.4S, v18.4S, v31.s[0] +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v13.4S, v25.s[3] +mul v13.4S, v13.4S,v26.s[3] +sub v18.4s, v11.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +str q22, [x0, 
#16] +sqrdmulh v22.4S, v21.4S, v23.s[0] +str q12, [x0, #80] +mul v21.4S, v21.4S,v24.s[0] +sub v12.4s, v14.4s, v16.4s +mla v13.4S, v2.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +str q11, [x0, #144] +sqrdmulh v11.4S, v0.4S, v23.s[1] +str q18, [x0, #208] +mul v0.4S, v0.4S,v24.s[1] +sub v18.4s, v17.4s, v13.4s +mla v21.4S, v22.4S, v31.s[0] +add v17.4s, v17.4s, v13.4s +str q14, [x0, #272] +sqrdmulh v14.4S, v20.4S, v23.s[2] +str q12, [x0, #336] +mul v20.4S, v20.4S,v24.s[2] +sub v12.4s, v19.4s, v21.4s +mla v0.4S, v11.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +str q17, [x0, #400] +sqrdmulh v17.4S, v1.4S, v23.s[3] +str q18, [x0, #464] +mul v1.4S, v1.4S,v24.s[3] +sub v18.4s, v3.4s, v0.4s +mla v20.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v0.4s +str q19, [x0, #528] +str q12, [x0, #592] +sub v12.4s, v9.4s, v20.4s +mla v1.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v20.4s +str q3, [x0, #656] +str q18, [x0, #720] +sub v18.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +str q9, [x0, #784] +str q12, [x0, #848] +str q8, [x0, #912] +str q18, [x0, #976] +ldr q4, [x0, #32] +ldr q5, [x0, #48] +ldr q6, [x0, #0] +ldr q7, [x0, #16] +ldr q15, [x0, #96] +ldr q10, [x0, #112] +ldr q2, [x0, #64] +ldr q16, [x0, #80] +ldr q22, [x0, #160] +ldr q13, [x0, #176] +ldr q11, [x0, #128] +ldr q21, [x0, #144] +ldr q14, [x0, #224] +ldr q0, [x0, #240] +ldr q19, [x0, #192] +ldr q17, [x0, #208] +ldr q20, [x17, #+128] +ldr q3, [x17, #+144] +ldr q1, [x17, #+160] +ldr q9, [x17, #+176] +ldr q12, [x17, #+192] +ldr q8, [x17, #+208] +ldr q18, [x17, #+224] +ldr q30, [x17, #+240] +sqrdmulh v29.4S, v4.4S, v3.s[0] +mul v4.4S, v4.4S,v20.s[0] +sqrdmulh v28.4S, v5.4S, v3.s[0] +mul v5.4S, v5.4S,v20.s[0] +mla v4.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v15.4S, v9.s[0] +mul v15.4S, v15.4S,v1.s[0] +mla v5.4S, v28.4S, v31.s[0] +sub v28.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +sqrdmulh v4.4S, v10.4S, v9.s[0] +mul v10.4S, v10.4S,v1.s[0] +mla v15.4S, v29.4S, v31.s[0] +sub v29.4s, v7.4s, v5.4s +add v7.4s, v7.4s, v5.4s +sqrdmulh v5.4S, v7.4S, 
v3.s[1] +mul v7.4S, v7.4S,v20.s[1] +mla v10.4S, v4.4S, v31.s[0] +sub v4.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +sqrdmulh v15.4S, v29.4S, v3.s[2] +mul v29.4S, v29.4S,v20.s[2] +mla v7.4S, v5.4S, v31.s[0] +sub v5.4s, v16.4s, v10.4s +add v16.4s, v16.4s, v10.4s +sqrdmulh v10.4S, v16.4S, v9.s[1] +mul v16.4S, v16.4S,v1.s[1] +mla v29.4S, v15.4S, v31.s[0] +sub v15.4s, v6.4s, v7.4s +add v6.4s, v6.4s, v7.4s +sqrdmulh v3.4S, v5.4S, v9.s[2] +mul v5.4S, v5.4S,v1.s[2] +mla v16.4S, v10.4S, v31.s[0] +sub v10.4s, v28.4s, v29.4s +add v28.4s, v28.4s, v29.4s +sqrdmulh v29.4S, v22.4S, v8.s[0] +mul v22.4S, v22.4S,v12.s[0] +mla v5.4S, v3.4S, v31.s[0] +sub v3.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v9.4S, v13.4S, v8.s[0] +mul v13.4S, v13.4S,v12.s[0] +mla v22.4S, v29.4S, v31.s[0] +sub v29.4s, v4.4s, v5.4s +add v4.4s, v4.4s, v5.4s +sqrdmulh v5.4S, v14.4S, v30.s[0] +mul v14.4S, v14.4S,v18.s[0] +mla v13.4S, v9.4S, v31.s[0] +sub v9.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +sqrdmulh v22.4S, v0.4S, v30.s[0] +mul v0.4S, v0.4S,v18.s[0] +mla v14.4S, v5.4S, v31.s[0] +sub v5.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v21.4S, v8.s[1] +mul v21.4S, v21.4S,v12.s[1] +mla v0.4S, v22.4S, v31.s[0] +sub v22.4s, v19.4s, v14.4s +add v19.4s, v19.4s, v14.4s +sqrdmulh v14.4S, v5.4S, v8.s[2] +mul v5.4S, v5.4S,v12.s[2] +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v0.4s +add v17.4s, v17.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v30.s[1] +mul v17.4S, v17.4S,v18.s[1] +mla v5.4S, v14.4S, v31.s[0] +sub v14.4s, v11.4s, v21.4s +add v11.4s, v11.4s, v21.4s +sqrdmulh v8.4S, v13.4S, v30.s[2] +mul v13.4S, v13.4S,v18.s[2] +mla v17.4S, v0.4S, v31.s[0] +sub v0.4s, v9.4s, v5.4s +add v9.4s, v9.4s, v5.4s +mla v13.4S, v8.4S, v31.s[0] +sub v8.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +sub v30.4s, v22.4s, v13.4s +add v22.4s, v22.4s, v13.4s +str q6, [x0, #0] +str q15, [x0, #16] +str q28, [x0, #32] +str q10, [x0, #48] +str q2, [x0, #64] +str q3, [x0, #80] +str q4, [x0, #96] +str q29, [x0, 
#112] +str q11, [x0, #128] +str q14, [x0, #144] +str q9, [x0, #160] +str q0, [x0, #176] +str q19, [x0, #192] +str q8, [x0, #208] +str q22, [x0, #224] +str q30, [x0, #240] +ldr q30, [x0, #288] +ldr q22, [x0, #304] +ldr q8, [x0, #256] +ldr q19, [x0, #272] +ldr q0, [x0, #352] +ldr q9, [x0, #368] +ldr q14, [x0, #320] +ldr q11, [x0, #336] +ldr q29, [x0, #416] +ldr q4, [x0, #432] +ldr q3, [x0, #384] +ldr q2, [x0, #400] +ldr q10, [x0, #480] +ldr q28, [x0, #496] +ldr q15, [x0, #448] +ldr q6, [x0, #464] +ldr q13, [x17, #+256] +ldr q18, [x17, #+272] +ldr q17, [x17, #+288] +ldr q5, [x17, #+304] +ldr q12, [x17, #+320] +ldr q21, [x17, #+336] +ldr q1, [x17, #+352] +ldr q16, [x17, #+368] +sqrdmulh v20.4S, v30.4S, v18.s[0] +mul v30.4S, v30.4S,v13.s[0] +sqrdmulh v7.4S, v22.4S, v18.s[0] +mul v22.4S, v22.4S,v13.s[0] +mla v30.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v0.4S, v5.s[0] +mul v0.4S, v0.4S,v17.s[0] +mla v22.4S, v7.4S, v31.s[0] +sub v7.4s, v8.4s, v30.4s +add v8.4s, v8.4s, v30.4s +sqrdmulh v30.4S, v9.4S, v5.s[0] +mul v9.4S, v9.4S,v17.s[0] +mla v0.4S, v20.4S, v31.s[0] +sub v20.4s, v19.4s, v22.4s +add v19.4s, v19.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v18.s[1] +mul v19.4S, v19.4S,v13.s[1] +mla v9.4S, v30.4S, v31.s[0] +sub v30.4s, v14.4s, v0.4s +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v20.4S, v18.s[2] +mul v20.4S, v20.4S,v13.s[2] +mla v19.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v9.4s +add v11.4s, v11.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v5.s[1] +mul v11.4S, v11.4S,v17.s[1] +mla v20.4S, v0.4S, v31.s[0] +sub v0.4s, v8.4s, v19.4s +add v8.4s, v8.4s, v19.4s +sqrdmulh v18.4S, v22.4S, v5.s[2] +mul v22.4S, v22.4S,v17.s[2] +mla v11.4S, v9.4S, v31.s[0] +sub v9.4s, v7.4s, v20.4s +add v7.4s, v7.4s, v20.4s +sqrdmulh v20.4S, v29.4S, v21.s[0] +mul v29.4S, v29.4S,v12.s[0] +mla v22.4S, v18.4S, v31.s[0] +sub v18.4s, v14.4s, v11.4s +add v14.4s, v14.4s, v11.4s +sqrdmulh v5.4S, v4.4S, v21.s[0] +mul v4.4S, v4.4S,v12.s[0] +mla v29.4S, v20.4S, v31.s[0] +sub v20.4s, v30.4s, v22.4s +add v30.4s, v30.4s, 
v22.4s +sqrdmulh v22.4S, v10.4S, v16.s[0] +mul v10.4S, v10.4S,v1.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v3.4s, v29.4s +add v3.4s, v3.4s, v29.4s +sqrdmulh v29.4S, v28.4S, v16.s[0] +mul v28.4S, v28.4S,v1.s[0] +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v2.4s, v4.4s +add v2.4s, v2.4s, v4.4s +sqrdmulh v4.4S, v2.4S, v21.s[1] +mul v2.4S, v2.4S,v12.s[1] +mla v28.4S, v29.4S, v31.s[0] +sub v29.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +sqrdmulh v10.4S, v22.4S, v21.s[2] +mul v22.4S, v22.4S,v12.s[2] +mla v2.4S, v4.4S, v31.s[0] +sub v4.4s, v6.4s, v28.4s +add v6.4s, v6.4s, v28.4s +sqrdmulh v28.4S, v6.4S, v16.s[1] +mul v6.4S, v6.4S,v1.s[1] +mla v22.4S, v10.4S, v31.s[0] +sub v10.4s, v3.4s, v2.4s +add v3.4s, v3.4s, v2.4s +sqrdmulh v21.4S, v4.4S, v16.s[2] +mul v4.4S, v4.4S,v1.s[2] +mla v6.4S, v28.4S, v31.s[0] +sub v28.4s, v5.4s, v22.4s +add v5.4s, v5.4s, v22.4s +mla v4.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v6.4s +add v15.4s, v15.4s, v6.4s +sub v16.4s, v29.4s, v4.4s +add v29.4s, v29.4s, v4.4s +str q8, [x0, #256] +str q0, [x0, #272] +str q7, [x0, #288] +str q9, [x0, #304] +str q14, [x0, #320] +str q18, [x0, #336] +str q30, [x0, #352] +str q20, [x0, #368] +str q3, [x0, #384] +str q10, [x0, #400] +str q5, [x0, #416] +str q28, [x0, #432] +str q15, [x0, #448] +str q21, [x0, #464] +str q29, [x0, #480] +str q16, [x0, #496] +ldr q16, [x0, #544] +ldr q29, [x0, #560] +ldr q21, [x0, #512] +ldr q15, [x0, #528] +ldr q28, [x0, #608] +ldr q5, [x0, #624] +ldr q10, [x0, #576] +ldr q3, [x0, #592] +ldr q20, [x0, #672] +ldr q30, [x0, #688] +ldr q18, [x0, #640] +ldr q14, [x0, #656] +ldr q9, [x0, #736] +ldr q7, [x0, #752] +ldr q0, [x0, #704] +ldr q8, [x0, #720] +ldr q4, [x17, #+384] +ldr q1, [x17, #+400] +ldr q6, [x17, #+416] +ldr q22, [x17, #+432] +ldr q12, [x17, #+448] +ldr q2, [x17, #+464] +ldr q17, [x17, #+480] +ldr q11, [x17, #+496] +sqrdmulh v13.4S, v16.4S, v1.s[0] +mul v16.4S, v16.4S,v4.s[0] +sqrdmulh v19.4S, v29.4S, v1.s[0] +mul v29.4S, v29.4S,v4.s[0] +mla v16.4S, v13.4S, v31.s[0] 
+sqrdmulh v13.4S, v28.4S, v22.s[0] +mul v28.4S, v28.4S,v6.s[0] +mla v29.4S, v19.4S, v31.s[0] +sub v19.4s, v21.4s, v16.4s +add v21.4s, v21.4s, v16.4s +sqrdmulh v16.4S, v5.4S, v22.s[0] +mul v5.4S, v5.4S,v6.s[0] +mla v28.4S, v13.4S, v31.s[0] +sub v13.4s, v15.4s, v29.4s +add v15.4s, v15.4s, v29.4s +sqrdmulh v29.4S, v15.4S, v1.s[1] +mul v15.4S, v15.4S,v4.s[1] +mla v5.4S, v16.4S, v31.s[0] +sub v16.4s, v10.4s, v28.4s +add v10.4s, v10.4s, v28.4s +sqrdmulh v28.4S, v13.4S, v1.s[2] +mul v13.4S, v13.4S,v4.s[2] +mla v15.4S, v29.4S, v31.s[0] +sub v29.4s, v3.4s, v5.4s +add v3.4s, v3.4s, v5.4s +sqrdmulh v5.4S, v3.4S, v22.s[1] +mul v3.4S, v3.4S,v6.s[1] +mla v13.4S, v28.4S, v31.s[0] +sub v28.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +sqrdmulh v1.4S, v29.4S, v22.s[2] +mul v29.4S, v29.4S,v6.s[2] +mla v3.4S, v5.4S, v31.s[0] +sub v5.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +sqrdmulh v13.4S, v20.4S, v2.s[0] +mul v20.4S, v20.4S,v12.s[0] +mla v29.4S, v1.4S, v31.s[0] +sub v1.4s, v10.4s, v3.4s +add v10.4s, v10.4s, v3.4s +sqrdmulh v22.4S, v30.4S, v2.s[0] +mul v30.4S, v30.4S,v12.s[0] +mla v20.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v29.4s +add v16.4s, v16.4s, v29.4s +sqrdmulh v29.4S, v9.4S, v11.s[0] +mul v9.4S, v9.4S,v17.s[0] +mla v30.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v20.4s +add v18.4s, v18.4s, v20.4s +sqrdmulh v20.4S, v7.4S, v11.s[0] +mul v7.4S, v7.4S,v17.s[0] +mla v9.4S, v29.4S, v31.s[0] +sub v29.4s, v14.4s, v30.4s +add v14.4s, v14.4s, v30.4s +sqrdmulh v30.4S, v14.4S, v2.s[1] +mul v14.4S, v14.4S,v12.s[1] +mla v7.4S, v20.4S, v31.s[0] +sub v20.4s, v0.4s, v9.4s +add v0.4s, v0.4s, v9.4s +sqrdmulh v9.4S, v29.4S, v2.s[2] +mul v29.4S, v29.4S,v12.s[2] +mla v14.4S, v30.4S, v31.s[0] +sub v30.4s, v8.4s, v7.4s +add v8.4s, v8.4s, v7.4s +sqrdmulh v7.4S, v8.4S, v11.s[1] +mul v8.4S, v8.4S,v17.s[1] +mla v29.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v14.4s +add v18.4s, v18.4s, v14.4s +sqrdmulh v2.4S, v30.4S, v11.s[2] +mul v30.4S, v30.4S,v17.s[2] +mla v8.4S, v7.4S, v31.s[0] +sub v7.4s, 
v22.4s, v29.4s +add v22.4s, v22.4s, v29.4s +mla v30.4S, v2.4S, v31.s[0] +sub v2.4s, v0.4s, v8.4s +add v0.4s, v0.4s, v8.4s +sub v11.4s, v20.4s, v30.4s +add v20.4s, v20.4s, v30.4s +str q21, [x0, #512] +str q28, [x0, #528] +str q19, [x0, #544] +str q5, [x0, #560] +str q10, [x0, #576] +str q1, [x0, #592] +str q16, [x0, #608] +str q13, [x0, #624] +str q18, [x0, #640] +str q9, [x0, #656] +str q22, [x0, #672] +str q7, [x0, #688] +str q0, [x0, #704] +str q2, [x0, #720] +str q20, [x0, #736] +str q11, [x0, #752] +ldr q11, [x0, #800] +ldr q20, [x0, #816] +ldr q2, [x0, #768] +ldr q0, [x0, #784] +ldr q7, [x0, #864] +ldr q22, [x0, #880] +ldr q9, [x0, #832] +ldr q18, [x0, #848] +ldr q13, [x0, #928] +ldr q16, [x0, #944] +ldr q1, [x0, #896] +ldr q10, [x0, #912] +ldr q5, [x0, #992] +ldr q19, [x0, #1008] +ldr q28, [x0, #960] +ldr q21, [x0, #976] +ldr q30, [x17, #+512] +ldr q17, [x17, #+528] +ldr q8, [x17, #+544] +ldr q29, [x17, #+560] +ldr q12, [x17, #+576] +ldr q14, [x17, #+592] +ldr q6, [x17, #+608] +ldr q3, [x17, #+624] +sqrdmulh v4.4S, v11.4S, v17.s[0] +mul v11.4S, v11.4S,v30.s[0] +sqrdmulh v15.4S, v20.4S, v17.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v11.4S, v4.4S, v31.s[0] +sqrdmulh v4.4S, v7.4S, v29.s[0] +mul v7.4S, v7.4S,v8.s[0] +mla v20.4S, v15.4S, v31.s[0] +sub v15.4s, v2.4s, v11.4s +add v2.4s, v2.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v8.s[0] +mla v7.4S, v4.4S, v31.s[0] +sub v4.4s, v0.4s, v20.4s +add v0.4s, v0.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v17.s[1] +mul v0.4S, v0.4S,v30.s[1] +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v9.4s, v7.4s +add v9.4s, v9.4s, v7.4s +sqrdmulh v7.4S, v4.4S, v17.s[2] +mul v4.4S, v4.4S,v30.s[2] +mla v0.4S, v20.4S, v31.s[0] +sub v20.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v8.s[1] +mla v4.4S, v7.4S, v31.s[0] +sub v7.4s, v2.4s, v0.4s +add v2.4s, v2.4s, v0.4s +sqrdmulh v17.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v8.s[2] +mla v18.4S, v22.4S, v31.s[0] +sub v22.4s, 
v15.4s, v4.4s +add v15.4s, v15.4s, v4.4s +sqrdmulh v4.4S, v13.4S, v14.s[0] +mul v13.4S, v13.4S,v12.s[0] +mla v20.4S, v17.4S, v31.s[0] +sub v17.4s, v9.4s, v18.4s +add v9.4s, v9.4s, v18.4s +sqrdmulh v29.4S, v16.4S, v14.s[0] +mul v16.4S, v16.4S,v12.s[0] +mla v13.4S, v4.4S, v31.s[0] +sub v4.4s, v11.4s, v20.4s +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v5.4S, v3.s[0] +mul v5.4S, v5.4S,v6.s[0] +mla v16.4S, v29.4S, v31.s[0] +sub v29.4s, v1.4s, v13.4s +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v19.4S, v3.s[0] +mul v19.4S, v19.4S,v6.s[0] +mla v5.4S, v20.4S, v31.s[0] +sub v20.4s, v10.4s, v16.4s +add v10.4s, v10.4s, v16.4s +sqrdmulh v16.4S, v10.4S, v14.s[1] +mul v10.4S, v10.4S,v12.s[1] +mla v19.4S, v13.4S, v31.s[0] +sub v13.4s, v28.4s, v5.4s +add v28.4s, v28.4s, v5.4s +sqrdmulh v5.4S, v20.4S, v14.s[2] +mul v20.4S, v20.4S,v12.s[2] +mla v10.4S, v16.4S, v31.s[0] +sub v16.4s, v21.4s, v19.4s +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v21.4S, v3.s[1] +mul v21.4S, v21.4S,v6.s[1] +mla v20.4S, v5.4S, v31.s[0] +sub v5.4s, v1.4s, v10.4s +add v1.4s, v1.4s, v10.4s +sqrdmulh v14.4S, v16.4S, v3.s[2] +mul v16.4S, v16.4S,v6.s[2] +mla v21.4S, v19.4S, v31.s[0] +sub v19.4s, v29.4s, v20.4s +add v29.4s, v29.4s, v20.4s +mla v16.4S, v14.4S, v31.s[0] +sub v14.4s, v28.4s, v21.4s +add v28.4s, v28.4s, v21.4s +sub v3.4s, v13.4s, v16.4s +add v13.4s, v13.4s, v16.4s +str q2, [x0, #768] +str q7, [x0, #784] +str q15, [x0, #800] +str q22, [x0, #816] +str q9, [x0, #832] +str q17, [x0, #848] +str q11, [x0, #864] +str q4, [x0, #880] +str q1, [x0, #896] +str q5, [x0, #912] +str q29, [x0, #928] +str q19, [x0, #944] +str q28, [x0, #960] +str q14, [x0, #976] +str q13, [x0, #992] +str q3, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] 
+ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_2.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_2.s new file mode 100644 index 0000000..a72c6cd --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_2.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global 
ntt_u32_incomplete_neon_asm_var_4_2_3_z4_2 +.global _ntt_u32_incomplete_neon_asm_var_4_2_3_z4_2 +ntt_u32_incomplete_neon_asm_var_4_2_3_z4_2: +_ntt_u32_incomplete_neon_asm_var_4_2_3_z4_2: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +sqrdmulh v2.4S, v22.4S, v29.s[0] +ldr q1, [x0, #544] +mul v22.4S, v22.4S,v30.s[0] +ldr q0, [x0, #608] +sqrdmulh v15.4S, v21.4S, v29.s[0] +ldr q14, [x0, #672] +mul v21.4S, v21.4S,v30.s[0] +ldr q13, [x0, #736] +mla v22.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q12, [x0, #32] +sub v11.4s, v18.4s, v22.4s +mla v21.4S, v15.4S, v31.s[0] +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q15, [x0, #96] +sub v10.4s, v17.4s, v21.4s +mla v20.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v29.s[0] +ldr q2, [x0, #160] +mul v1.4S, v1.4S,v30.s[0] +sub v9.4s, v16.4s, v20.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v29.s[0] +ldr q22, [x0, #224] +mul v0.4S, v0.4S,v30.s[0] +sub v8.4s, v3.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v12.4s, v1.4s +mla v0.4S, v20.4S, 
v31.s[0] +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +sub v20.4s, v15.4s, v0.4s +mla v14.4S, v19.4S, v31.s[0] +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v19.4s, v2.4s, v14.4s +mla v13.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v1.4s, v22.4s, v13.4s +mla v16.4S, v0.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v0.4s, v2.4s, v16.4s +mla v3.4S, v14.4S, v31.s[0] +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v22.4s, v3.4s +mla v18.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v13.4s, v12.4s, v18.4s +mla v17.4S, v16.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v16.4s, v15.4s, v17.4s +mla v9.4S, v3.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v3.4s, v19.4s, v9.4s +mla v8.4S, v18.4S, v31.s[0] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v8.4s +mla v11.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v17.4s, v21.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +sub v9.4s, v20.4s, v10.4s +mla v2.4S, v8.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v8.4s, v12.4s, v2.4s +mla v22.4S, v11.4S, v31.s[0] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v27.s[1] +mul v14.4S, v14.4S,v28.s[1] +sub v11.4s, v15.4s, v22.4s +mla v0.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v28.s[2] +sub 
v10.4s, v13.4s, v0.4s +mla v14.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v2.4s, v16.4s, v14.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +sub v22.4s, v21.4s, v19.4s +mla v1.4S, v0.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v0.4s, v20.4s, v1.4s +mla v3.4S, v14.4S, v31.s[0] +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +sub v14.4s, v17.4s, v3.4s +mla v18.4S, v19.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v25.s[1] +mul v11.4S, v11.4S,v26.s[1] +sub v19.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v1.4s, v12.4s, v15.4s +mla v11.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v25.s[3] +mul v2.4S, v2.4S,v26.s[3] +sub v3.4s, v8.4s, v11.4s +mla v16.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v11.4s +str q12, [x0, #32] +sqrdmulh v12.4S, v20.4S, v23.s[0] +str q1, [x0, #96] +mul v20.4S, v20.4S,v24.s[0] +ldr q1, [x0, #816] +sub v11.4s, v13.4s, v16.4s +ldr q18, [x0, #880] +mla v2.4S, v15.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +str q8, [x0, #160] +sqrdmulh v8.4S, v0.4S, v23.s[1] +str q3, [x0, #224] +mul v0.4S, v0.4S,v24.s[1] +ldr q3, [x0, #944] +sub v16.4s, v10.4s, v2.4s +ldr q15, [x0, #1008] +mla v20.4S, v12.4S, v31.s[0] +add v10.4s, v10.4s, v2.4s +str q13, [x0, #288] +sqrdmulh v13.4S, v9.4S, v23.s[2] +str q11, [x0, #352] +mul v9.4S, v9.4S,v24.s[2] +ldr q11, [x0, #304] +sub v2.4s, v21.4s, v20.4s +ldr q12, [x0, #368] +mla v0.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v20.4s +str q10, [x0, #416] +sqrdmulh v10.4S, v19.4S, v23.s[3] +str q16, [x0, #480] +mul v19.4S, v19.4S,v24.s[3] +ldr q16, [x0, #432] +sub v20.4s, v22.4s, v0.4s +ldr q8, [x0, #496] +mla v9.4S, v13.4S, v31.s[0] +add v22.4s, 
v22.4s, v0.4s +str q21, [x0, #544] +sqrdmulh v21.4S, v1.4S, v29.s[0] +str q2, [x0, #608] +ldr q2, [x0, #560] +mul v1.4S, v1.4S,v30.s[0] +ldr q0, [x0, #624] +sub v13.4s, v17.4s, v9.4s +mla v19.4S, v10.4S, v31.s[0] +add v17.4s, v17.4s, v9.4s +str q22, [x0, #672] +sqrdmulh v22.4S, v18.4S, v29.s[0] +str q20, [x0, #736] +ldr q20, [x0, #688] +mul v18.4S, v18.4S,v30.s[0] +ldr q9, [x0, #752] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +str q17, [x0, #800] +sqrdmulh v17.4S, v3.4S, v29.s[0] +str q13, [x0, #864] +mul v3.4S, v3.4S,v30.s[0] +ldr q13, [x0, #48] +sub v19.4s, v11.4s, v1.4s +mla v18.4S, v22.4S, v31.s[0] +add v11.4s, v11.4s, v1.4s +str q14, [x0, #928] +sqrdmulh v14.4S, v15.4S, v29.s[0] +str q10, [x0, #992] +mul v15.4S, v15.4S,v30.s[0] +ldr q10, [x0, #112] +sub v1.4s, v12.4s, v18.4s +mla v3.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v29.s[0] +ldr q17, [x0, #176] +mul v2.4S, v2.4S,v30.s[0] +sub v22.4s, v16.4s, v3.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v29.s[0] +ldr q14, [x0, #240] +mul v0.4S, v0.4S,v30.s[0] +sub v21.4s, v8.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v18.4s, v13.4s, v2.4s +mla v0.4S, v3.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +sub v3.4s, v10.4s, v0.4s +mla v20.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v15.4s, v17.4s, v20.4s +mla v9.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v2.4s, v14.4s, v9.4s +mla v16.4S, v0.4S, v31.s[0] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v29.s[1] +mul v11.4S, v11.4S,v30.s[1] +sub v0.4s, v17.4s, v16.4s +mla v8.4S, v20.4S, v31.s[0] +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v29.s[1] +mul v12.4S, 
v12.4S,v30.s[1] +sub v20.4s, v14.4s, v8.4s +mla v11.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v9.4s, v13.4s, v11.4s +mla v12.4S, v16.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v16.4s, v10.4s, v12.4s +mla v22.4S, v8.4S, v31.s[0] +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v8.4s, v15.4s, v22.4s +mla v21.4S, v11.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +sub v11.4s, v2.4s, v21.4s +mla v19.4S, v12.4S, v31.s[0] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +sub v12.4s, v18.4s, v19.4s +mla v1.4S, v22.4S, v31.s[0] +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v22.4s, v3.4s, v1.4s +mla v17.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v21.4s, v13.4s, v17.4s +mla v14.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +sub v19.4s, v10.4s, v14.4s +mla v0.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v27.s[2] +mul v15.4S, v15.4S,v28.s[2] +sub v1.4s, v9.4s, v0.4s +mla v20.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v17.4s, v16.4s, v20.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v14.4s, v18.4s, v15.4s +mla v2.4S, v0.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[3] +mul v11.4S, v11.4S,v28.s[3] +sub v0.4s, v3.4s, v2.4s +mla v8.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +sub v20.4s, v12.4s, v8.4s +mla v11.4S, v15.4S, v31.s[0] +add v12.4s, v12.4s, 
v8.4s +sqrdmulh v8.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v15.4s, v22.4s, v11.4s +mla v10.4S, v2.4S, v31.s[0] +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v2.4s, v13.4s, v10.4s +mla v19.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v25.s[3] +mul v17.4S, v17.4S,v26.s[3] +sub v8.4s, v21.4s, v19.4s +mla v16.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +str q13, [x0, #48] +sqrdmulh v13.4S, v3.4S, v23.s[0] +str q2, [x0, #112] +mul v3.4S, v3.4S,v24.s[0] +ldr q2, [x0, #768] +sub v19.4s, v9.4s, v16.4s +ldr q11, [x0, #832] +mla v17.4S, v10.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +str q21, [x0, #176] +sqrdmulh v21.4S, v0.4S, v23.s[1] +str q8, [x0, #240] +mul v0.4S, v0.4S,v24.s[1] +ldr q8, [x0, #896] +sub v16.4s, v1.4s, v17.4s +ldr q10, [x0, #960] +mla v3.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +str q9, [x0, #304] +sqrdmulh v9.4S, v22.4S, v23.s[2] +str q19, [x0, #368] +mul v22.4S, v22.4S,v24.s[2] +ldr q19, [x0, #256] +sub v17.4s, v18.4s, v3.4s +ldr q13, [x0, #320] +mla v0.4S, v21.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +str q1, [x0, #432] +sqrdmulh v1.4S, v15.4S, v23.s[3] +str q16, [x0, #496] +mul v15.4S, v15.4S,v24.s[3] +ldr q16, [x0, #384] +sub v3.4s, v14.4s, v0.4s +ldr q21, [x0, #448] +mla v22.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +str q18, [x0, #560] +sqrdmulh v18.4S, v2.4S, v29.s[0] +str q17, [x0, #624] +ldr q17, [x0, #512] +mul v2.4S, v2.4S,v30.s[0] +ldr q0, [x0, #576] +sub v9.4s, v12.4s, v22.4s +mla v15.4S, v1.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s +str q14, [x0, #688] +sqrdmulh v14.4S, v11.4S, v29.s[0] +str q3, [x0, #752] +ldr q3, [x0, #640] +mul v11.4S, v11.4S,v30.s[0] +ldr q22, [x0, #704] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +str q12, [x0, #816] +sqrdmulh v12.4S, v8.4S, v29.s[0] +str q9, [x0, #880] +mul v8.4S, v8.4S,v30.s[0] +ldr q9, [x0, #0] +sub v15.4s, v19.4s, v2.4s +mla v11.4S, v14.4S, v31.s[0] +add 
v19.4s, v19.4s, v2.4s +str q20, [x0, #944] +sqrdmulh v20.4S, v10.4S, v29.s[0] +str q1, [x0, #1008] +mul v10.4S, v10.4S,v30.s[0] +ldr q1, [x0, #64] +sub v2.4s, v13.4s, v11.4s +mla v8.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v29.s[0] +ldr q12, [x0, #128] +mul v17.4S, v17.4S,v30.s[0] +sub v14.4s, v16.4s, v8.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v29.s[0] +ldr q20, [x0, #192] +mul v0.4S, v0.4S,v30.s[0] +sub v18.4s, v21.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +sub v11.4s, v9.4s, v17.4s +mla v0.4S, v8.4S, v31.s[0] +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v8.4s, v1.4s, v0.4s +mla v3.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v10.4s, v12.4s, v3.4s +mla v22.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v17.4s, v20.4s, v22.4s +mla v16.4S, v0.4S, v31.s[0] +add v20.4s, v20.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +sub v0.4s, v12.4s, v16.4s +mla v21.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v3.4s, v20.4s, v21.4s +mla v19.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v22.4s, v9.4s, v19.4s +mla v13.4S, v16.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v16.4s, v1.4s, v13.4s +mla v14.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v21.4s, v10.4s, v14.4s +mla v18.4S, v19.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v19.4s, v17.4s, v18.4s +mla v15.4S, v13.4S, 
v31.s[0] +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v13.4s, v11.4s, v15.4s +mla v2.4S, v14.4S, v31.s[0] +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v27.s[0] +mul v20.4S, v20.4S,v28.s[0] +sub v14.4s, v8.4s, v2.4s +mla v12.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v18.4s, v9.4s, v12.4s +mla v20.4S, v15.4S, v31.s[0] +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +sub v15.4s, v1.4s, v20.4s +mla v0.4S, v2.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +sub v2.4s, v22.4s, v0.4s +mla v3.4S, v12.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v12.4s, v16.4s, v3.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v11.4s, v10.4s +mla v17.4S, v0.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v27.s[3] +mul v19.4S, v19.4S,v28.s[3] +sub v0.4s, v8.4s, v17.4s +mla v21.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v17.4s +sqrdmulh v17.4S, v1.4S, v25.s[0] +mul v1.4S, v1.4S,v26.s[0] +sub v3.4s, v13.4s, v21.4s +mla v19.4S, v10.4S, v31.s[0] +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v25.s[1] +mul v15.4S, v15.4S,v26.s[1] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v17.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v17.4s, v9.4s, v1.4s +mla v15.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +sub v21.4s, v18.4s, v15.4s +mla v16.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +str q9, [x0, #0] +sqrdmulh v9.4S, v8.4S, v23.s[0] +str q17, [x0, #64] +mul v8.4S, v8.4S,v24.s[0] +ldr q17, [x0, #784] +sub v15.4s, v22.4s, v16.4s +ldr q19, [x0, #848] +mla v12.4S, v1.4S, v31.s[0] +add 
v22.4s, v22.4s, v16.4s +str q18, [x0, #128] +sqrdmulh v18.4S, v0.4S, v23.s[1] +str q21, [x0, #192] +mul v0.4S, v0.4S,v24.s[1] +ldr q21, [x0, #912] +sub v16.4s, v2.4s, v12.4s +ldr q1, [x0, #976] +mla v8.4S, v9.4S, v31.s[0] +add v2.4s, v2.4s, v12.4s +str q22, [x0, #256] +sqrdmulh v22.4S, v14.4S, v23.s[2] +str q15, [x0, #320] +mul v14.4S, v14.4S,v24.s[2] +ldr q15, [x0, #272] +sub v12.4s, v11.4s, v8.4s +ldr q9, [x0, #336] +mla v0.4S, v18.4S, v31.s[0] +add v11.4s, v11.4s, v8.4s +str q2, [x0, #384] +sqrdmulh v2.4S, v10.4S, v23.s[3] +str q16, [x0, #448] +mul v10.4S, v10.4S,v24.s[3] +ldr q16, [x0, #400] +sub v8.4s, v20.4s, v0.4s +ldr q18, [x0, #464] +mla v14.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v0.4s +str q11, [x0, #512] +sqrdmulh v11.4S, v17.4S, v29.s[0] +str q12, [x0, #576] +ldr q12, [x0, #528] +mul v17.4S, v17.4S,v30.s[0] +ldr q0, [x0, #592] +sub v22.4s, v13.4s, v14.4s +mla v10.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v14.4s +str q20, [x0, #640] +sqrdmulh v20.4S, v19.4S, v29.s[0] +str q8, [x0, #704] +ldr q8, [x0, #656] +mul v19.4S, v19.4S,v30.s[0] +ldr q14, [x0, #720] +sub v2.4s, v3.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v3.4s, v3.4s, v10.4s +str q13, [x0, #768] +sqrdmulh v13.4S, v21.4S, v29.s[0] +str q22, [x0, #832] +mul v21.4S, v21.4S,v30.s[0] +ldr q22, [x0, #16] +sub v10.4s, v15.4s, v17.4s +mla v19.4S, v20.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +str q3, [x0, #896] +sqrdmulh v3.4S, v1.4S, v29.s[0] +str q2, [x0, #960] +mul v1.4S, v1.4S,v30.s[0] +ldr q2, [x0, #80] +sub v17.4s, v9.4s, v19.4s +mla v21.4S, v13.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v12.4S, v29.s[0] +ldr q13, [x0, #144] +mul v12.4S, v12.4S,v30.s[0] +sub v20.4s, v16.4s, v21.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v29.s[0] +ldr q3, [x0, #208] +mul v0.4S, v0.4S,v30.s[0] +sub v11.4s, v18.4s, v1.4s +mla v12.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v19.4s, v22.4s, 
v12.4s +mla v0.4S, v21.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v2.4s, v0.4s +mla v8.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v1.4s, v13.4s, v8.4s +mla v14.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v12.4s, v3.4s, v14.4s +mla v16.4S, v0.4S, v31.s[0] +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +sub v0.4s, v13.4s, v16.4s +mla v18.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v9.4S, v29.s[1] +mul v9.4S, v9.4S,v30.s[1] +sub v8.4s, v3.4s, v18.4s +mla v15.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v14.4s, v22.4s, v15.4s +mla v9.4S, v16.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v16.4s, v2.4s, v9.4s +mla v20.4S, v18.4S, v31.s[0] +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v20.4s +mla v11.4S, v15.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v15.4s, v12.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +sub v9.4s, v19.4s, v10.4s +mla v17.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v27.s[0] +mul v3.4S, v3.4S,v28.s[0] +sub v20.4s, v21.4s, v17.4s +mla v13.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v11.4s, v22.4s, v13.4s +mla v3.4S, v10.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v10.4s, v2.4s, v3.4s +mla v0.4S, v17.4S, v31.s[0] +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v1.4S, v27.s[2] +mul 
v1.4S, v1.4S,v28.s[2] +sub v17.4s, v14.4s, v0.4s +mla v8.4S, v13.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +sub v13.4s, v16.4s, v8.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v3.4s, v19.4s, v1.4s +mla v12.4S, v0.4S, v31.s[0] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v0.4s, v21.4s, v12.4s +mla v18.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v25.s[1] +mul v10.4S, v10.4S,v26.s[1] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v12.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v12.4s, v22.4s, v2.4s +mla v10.4S, v18.4S, v31.s[0] +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v13.4S, v25.s[3] +mul v13.4S, v13.4S,v26.s[3] +sub v18.4s, v11.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +str q22, [x0, #16] +sqrdmulh v22.4S, v21.4S, v23.s[0] +str q12, [x0, #80] +mul v21.4S, v21.4S,v24.s[0] +sub v12.4s, v14.4s, v16.4s +mla v13.4S, v2.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +str q11, [x0, #144] +sqrdmulh v11.4S, v0.4S, v23.s[1] +str q18, [x0, #208] +mul v0.4S, v0.4S,v24.s[1] +sub v18.4s, v17.4s, v13.4s +mla v21.4S, v22.4S, v31.s[0] +add v17.4s, v17.4s, v13.4s +str q14, [x0, #272] +sqrdmulh v14.4S, v20.4S, v23.s[2] +str q12, [x0, #336] +mul v20.4S, v20.4S,v24.s[2] +sub v12.4s, v19.4s, v21.4s +mla v0.4S, v11.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +str q17, [x0, #400] +sqrdmulh v17.4S, v1.4S, v23.s[3] +str q18, [x0, #464] +mul v1.4S, v1.4S,v24.s[3] +sub v18.4s, v3.4s, v0.4s +mla v20.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v0.4s +str q19, [x0, #528] +str q12, [x0, #592] +sub v12.4s, v9.4s, v20.4s +mla v1.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, 
v20.4s +str q3, [x0, #656] +str q18, [x0, #720] +sub v18.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +str q9, [x0, #784] +str q12, [x0, #848] +str q8, [x0, #912] +str q18, [x0, #976] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q6, [x17, #+160] +ldr q7, [x17, #+176] +ldr q15, [x17, #+192] +ldr q10, [x17, #+208] +ldr q2, [x17, #+224] +ldr q16, [x17, #+240] +ldr q22, [x0, #32] +ldr q13, [x0, #48] +ldr q11, [x0, #0] +ldr q21, [x0, #96] +ldr q14, [x0, #112] +ldr q0, [x0, #64] +ldr q19, [x0, #160] +ldr q17, [x0, #176] +ldr q20, [x0, #128] +ldr q3, [x0, #224] +ldr q1, [x0, #240] +ldr q9, [x0, #192] +sqrdmulh v12.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v12.4S, v31.s[0] +sub v12.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s +ldr q22, [x0, #16] +sqrdmulh v8.4S, v21.4S, v7.s[0] +mul v21.4S, v21.4S,v6.s[0] +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +ldr q21, [x0, #80] +sqrdmulh v18.4S, v19.4S, v10.s[0] +mul v19.4S, v19.4S,v15.s[0] +mla v19.4S, v18.4S, v31.s[0] +sub v18.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +ldr q19, [x0, #144] +sqrdmulh v30.4S, v3.4S, v16.s[0] +mul v3.4S, v3.4S,v2.s[0] +mla v3.4S, v30.4S, v31.s[0] +sub v30.4s, v9.4s, v3.4s +add v9.4s, v9.4s, v3.4s +ldr q3, [x0, #208] +sqrdmulh v29.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v4.s[0] +mla v13.4S, v29.4S, v31.s[0] +sub v29.4s, v22.4s, v13.4s +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v14.4S, v7.s[0] +mul v14.4S, v14.4S,v6.s[0] +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v14.4s +add v21.4s, v21.4s, v14.4s +sqrdmulh v14.4S, v17.4S, v10.s[0] +mul v17.4S, v17.4S,v15.s[0] +mla v17.4S, v14.4S, v31.s[0] +sub v14.4s, v19.4s, v17.4s +add v19.4s, v19.4s, v17.4s +sqrdmulh v17.4S, v1.4S, v16.s[0] +mul v1.4S, v1.4S,v2.s[0] +mla v1.4S, v17.4S, v31.s[0] +sub v17.4s, v3.4s, v1.4s +add v3.4s, v3.4s, v1.4s +sqrdmulh v1.4S, v22.4S, v5.s[1] +mul v22.4S, v22.4S,v4.s[1] +mla v22.4S, v1.4S, v31.s[0] +sub v1.4s, v11.4s, v22.4s +add v11.4s, v11.4s, v22.4s 
+sqrdmulh v22.4S, v21.4S, v7.s[1] +mul v21.4S, v21.4S,v6.s[1] +mla v21.4S, v22.4S, v31.s[0] +sub v22.4s, v0.4s, v21.4s +add v0.4s, v0.4s, v21.4s +str q11, [x0, #0] +str q1, [x0, #16] +sqrdmulh v1.4S, v19.4S, v10.s[1] +mul v19.4S, v19.4S,v15.s[1] +mla v19.4S, v1.4S, v31.s[0] +sub v1.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +str q0, [x0, #64] +str q22, [x0, #80] +sqrdmulh v22.4S, v3.4S, v16.s[1] +mul v3.4S, v3.4S,v2.s[1] +mla v3.4S, v22.4S, v31.s[0] +sub v22.4s, v9.4s, v3.4s +add v9.4s, v9.4s, v3.4s +str q20, [x0, #128] +str q1, [x0, #144] +sqrdmulh v1.4S, v29.4S, v5.s[2] +mul v29.4S, v29.4S,v4.s[2] +mla v29.4S, v1.4S, v31.s[0] +sub v1.4s, v12.4s, v29.4s +add v12.4s, v12.4s, v29.4s +str q9, [x0, #192] +str q22, [x0, #208] +ldr q5, [x17, #+256] +ldr q4, [x17, #+272] +sqrdmulh v22.4S, v13.4S, v7.s[2] +mul v13.4S, v13.4S,v6.s[2] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v8.4s, v13.4s +add v8.4s, v8.4s, v13.4s +ldr q7, [x17, #+288] +ldr q6, [x17, #+304] +sqrdmulh v13.4S, v14.4S, v10.s[2] +mul v14.4S, v14.4S,v15.s[2] +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v18.4s, v14.4s +add v18.4s, v18.4s, v14.4s +ldr q10, [x17, #+320] +ldr q15, [x17, #+336] +sqrdmulh v14.4S, v17.4S, v16.s[2] +mul v17.4S, v17.4S,v2.s[2] +mla v17.4S, v14.4S, v31.s[0] +sub v14.4s, v30.4s, v17.4s +add v30.4s, v30.4s, v17.4s +ldr q16, [x17, #+352] +ldr q2, [x17, #+368] +str q12, [x0, #32] +str q1, [x0, #48] +str q8, [x0, #96] +str q22, [x0, #112] +str q18, [x0, #160] +str q13, [x0, #176] +str q30, [x0, #224] +str q14, [x0, #240] +ldr q14, [x0, #288] +ldr q30, [x0, #304] +ldr q13, [x0, #256] +ldr q18, [x0, #352] +ldr q22, [x0, #368] +ldr q8, [x0, #320] +ldr q1, [x0, #416] +ldr q12, [x0, #432] +ldr q17, [x0, #384] +ldr q9, [x0, #480] +ldr q29, [x0, #496] +ldr q20, [x0, #448] +sqrdmulh v3.4S, v14.4S, v4.s[0] +mul v14.4S, v14.4S,v5.s[0] +mla v14.4S, v3.4S, v31.s[0] +sub v3.4s, v13.4s, v14.4s +add v13.4s, v13.4s, v14.4s +ldr q14, [x0, #272] +sqrdmulh v0.4S, v18.4S, v6.s[0] +mul v18.4S, v18.4S,v7.s[0] 
+mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v8.4s, v18.4s +add v8.4s, v8.4s, v18.4s +ldr q18, [x0, #336] +sqrdmulh v19.4S, v1.4S, v15.s[0] +mul v1.4S, v1.4S,v10.s[0] +mla v1.4S, v19.4S, v31.s[0] +sub v19.4s, v17.4s, v1.4s +add v17.4s, v17.4s, v1.4s +ldr q1, [x0, #400] +sqrdmulh v11.4S, v9.4S, v2.s[0] +mul v9.4S, v9.4S,v16.s[0] +mla v9.4S, v11.4S, v31.s[0] +sub v11.4s, v20.4s, v9.4s +add v20.4s, v20.4s, v9.4s +ldr q9, [x0, #464] +sqrdmulh v21.4S, v30.4S, v4.s[0] +mul v30.4S, v30.4S,v5.s[0] +mla v30.4S, v21.4S, v31.s[0] +sub v21.4s, v14.4s, v30.4s +add v14.4s, v14.4s, v30.4s +sqrdmulh v30.4S, v22.4S, v6.s[0] +mul v22.4S, v22.4S,v7.s[0] +mla v22.4S, v30.4S, v31.s[0] +sub v30.4s, v18.4s, v22.4s +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v12.4S, v15.s[0] +mul v12.4S, v12.4S,v10.s[0] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v12.4s +add v1.4s, v1.4s, v12.4s +sqrdmulh v12.4S, v29.4S, v2.s[0] +mul v29.4S, v29.4S,v16.s[0] +mla v29.4S, v12.4S, v31.s[0] +sub v12.4s, v9.4s, v29.4s +add v9.4s, v9.4s, v29.4s +sqrdmulh v29.4S, v14.4S, v4.s[1] +mul v14.4S, v14.4S,v5.s[1] +mla v14.4S, v29.4S, v31.s[0] +sub v29.4s, v13.4s, v14.4s +add v13.4s, v13.4s, v14.4s +sqrdmulh v14.4S, v18.4S, v6.s[1] +mul v18.4S, v18.4S,v7.s[1] +mla v18.4S, v14.4S, v31.s[0] +sub v14.4s, v8.4s, v18.4s +add v8.4s, v8.4s, v18.4s +str q13, [x0, #256] +str q29, [x0, #272] +sqrdmulh v29.4S, v1.4S, v15.s[1] +mul v1.4S, v1.4S,v10.s[1] +mla v1.4S, v29.4S, v31.s[0] +sub v29.4s, v17.4s, v1.4s +add v17.4s, v17.4s, v1.4s +str q8, [x0, #320] +str q14, [x0, #336] +sqrdmulh v14.4S, v9.4S, v2.s[1] +mul v9.4S, v9.4S,v16.s[1] +mla v9.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v9.4s +add v20.4s, v20.4s, v9.4s +str q17, [x0, #384] +str q29, [x0, #400] +sqrdmulh v29.4S, v21.4S, v4.s[2] +mul v21.4S, v21.4S,v5.s[2] +mla v21.4S, v29.4S, v31.s[0] +sub v29.4s, v3.4s, v21.4s +add v3.4s, v3.4s, v21.4s +str q20, [x0, #448] +str q14, [x0, #464] +ldr q4, [x17, #+384] +ldr q5, [x17, #+400] +sqrdmulh v14.4S, v30.4S, v6.s[2] +mul 
v30.4S, v30.4S,v7.s[2] +mla v30.4S, v14.4S, v31.s[0] +sub v14.4s, v0.4s, v30.4s +add v0.4s, v0.4s, v30.4s +ldr q6, [x17, #+416] +ldr q7, [x17, #+432] +sqrdmulh v30.4S, v22.4S, v15.s[2] +mul v22.4S, v22.4S,v10.s[2] +mla v22.4S, v30.4S, v31.s[0] +sub v30.4s, v19.4s, v22.4s +add v19.4s, v19.4s, v22.4s +ldr q15, [x17, #+448] +ldr q10, [x17, #+464] +sqrdmulh v22.4S, v12.4S, v2.s[2] +mul v12.4S, v12.4S,v16.s[2] +mla v12.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +ldr q2, [x17, #+480] +ldr q16, [x17, #+496] +str q3, [x0, #288] +str q29, [x0, #304] +str q0, [x0, #352] +str q14, [x0, #368] +str q19, [x0, #416] +str q30, [x0, #432] +str q11, [x0, #480] +str q22, [x0, #496] +ldr q22, [x0, #544] +ldr q11, [x0, #560] +ldr q30, [x0, #512] +ldr q19, [x0, #608] +ldr q14, [x0, #624] +ldr q0, [x0, #576] +ldr q29, [x0, #672] +ldr q3, [x0, #688] +ldr q12, [x0, #640] +ldr q20, [x0, #736] +ldr q21, [x0, #752] +ldr q17, [x0, #704] +sqrdmulh v9.4S, v22.4S, v5.s[0] +mul v22.4S, v22.4S,v4.s[0] +mla v22.4S, v9.4S, v31.s[0] +sub v9.4s, v30.4s, v22.4s +add v30.4s, v30.4s, v22.4s +ldr q22, [x0, #528] +sqrdmulh v8.4S, v19.4S, v7.s[0] +mul v19.4S, v19.4S,v6.s[0] +mla v19.4S, v8.4S, v31.s[0] +sub v8.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +ldr q19, [x0, #592] +sqrdmulh v1.4S, v29.4S, v10.s[0] +mul v29.4S, v29.4S,v15.s[0] +mla v29.4S, v1.4S, v31.s[0] +sub v1.4s, v12.4s, v29.4s +add v12.4s, v12.4s, v29.4s +ldr q29, [x0, #656] +sqrdmulh v13.4S, v20.4S, v16.s[0] +mul v20.4S, v20.4S,v2.s[0] +mla v20.4S, v13.4S, v31.s[0] +sub v13.4s, v17.4s, v20.4s +add v17.4s, v17.4s, v20.4s +ldr q20, [x0, #720] +sqrdmulh v18.4S, v11.4S, v5.s[0] +mul v11.4S, v11.4S,v4.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v22.4s, v11.4s +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v14.4S, v7.s[0] +mul v14.4S, v14.4S,v6.s[0] +mla v14.4S, v11.4S, v31.s[0] +sub v11.4s, v19.4s, v14.4s +add v19.4s, v19.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v10.s[0] +mul v3.4S, v3.4S,v15.s[0] +mla v3.4S, v14.4S, 
v31.s[0] +sub v14.4s, v29.4s, v3.4s +add v29.4s, v29.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v16.s[0] +mul v21.4S, v21.4S,v2.s[0] +mla v21.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v5.s[1] +mul v22.4S, v22.4S,v4.s[1] +mla v22.4S, v21.4S, v31.s[0] +sub v21.4s, v30.4s, v22.4s +add v30.4s, v30.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v7.s[1] +mul v19.4S, v19.4S,v6.s[1] +mla v19.4S, v22.4S, v31.s[0] +sub v22.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +str q30, [x0, #512] +str q21, [x0, #528] +sqrdmulh v21.4S, v29.4S, v10.s[1] +mul v29.4S, v29.4S,v15.s[1] +mla v29.4S, v21.4S, v31.s[0] +sub v21.4s, v12.4s, v29.4s +add v12.4s, v12.4s, v29.4s +str q0, [x0, #576] +str q22, [x0, #592] +sqrdmulh v22.4S, v20.4S, v16.s[1] +mul v20.4S, v20.4S,v2.s[1] +mla v20.4S, v22.4S, v31.s[0] +sub v22.4s, v17.4s, v20.4s +add v17.4s, v17.4s, v20.4s +str q12, [x0, #640] +str q21, [x0, #656] +sqrdmulh v21.4S, v18.4S, v5.s[2] +mul v18.4S, v18.4S,v4.s[2] +mla v18.4S, v21.4S, v31.s[0] +sub v21.4s, v9.4s, v18.4s +add v9.4s, v9.4s, v18.4s +str q17, [x0, #704] +str q22, [x0, #720] +ldr q5, [x17, #+512] +ldr q4, [x17, #+528] +sqrdmulh v22.4S, v11.4S, v7.s[2] +mul v11.4S, v11.4S,v6.s[2] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +ldr q7, [x17, #+544] +ldr q6, [x17, #+560] +sqrdmulh v11.4S, v14.4S, v10.s[2] +mul v14.4S, v14.4S,v15.s[2] +mla v14.4S, v11.4S, v31.s[0] +sub v11.4s, v1.4s, v14.4s +add v1.4s, v1.4s, v14.4s +ldr q10, [x17, #+576] +ldr q15, [x17, #+592] +sqrdmulh v14.4S, v3.4S, v16.s[2] +mul v3.4S, v3.4S,v2.s[2] +mla v3.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v3.4s +add v13.4s, v13.4s, v3.4s +ldr q16, [x17, #+608] +ldr q2, [x17, #+624] +str q9, [x0, #544] +str q21, [x0, #560] +str q8, [x0, #608] +str q22, [x0, #624] +str q1, [x0, #672] +str q11, [x0, #688] +str q13, [x0, #736] +str q14, [x0, #752] +ldr q14, [x0, #800] +ldr q13, [x0, #816] +ldr q11, [x0, #768] +ldr q1, [x0, #864] +ldr q22, [x0, #880] 
+ldr q8, [x0, #832] +ldr q21, [x0, #928] +ldr q9, [x0, #944] +ldr q3, [x0, #896] +ldr q17, [x0, #992] +ldr q18, [x0, #1008] +ldr q12, [x0, #960] +sqrdmulh v20.4S, v14.4S, v4.s[0] +mul v14.4S, v14.4S,v5.s[0] +mla v14.4S, v20.4S, v31.s[0] +sub v20.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +ldr q14, [x0, #784] +sqrdmulh v0.4S, v1.4S, v6.s[0] +mul v1.4S, v1.4S,v7.s[0] +mla v1.4S, v0.4S, v31.s[0] +sub v0.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +ldr q1, [x0, #848] +sqrdmulh v29.4S, v21.4S, v15.s[0] +mul v21.4S, v21.4S,v10.s[0] +mla v21.4S, v29.4S, v31.s[0] +sub v29.4s, v3.4s, v21.4s +add v3.4s, v3.4s, v21.4s +ldr q21, [x0, #912] +sqrdmulh v30.4S, v17.4S, v2.s[0] +mul v17.4S, v17.4S,v16.s[0] +mla v17.4S, v30.4S, v31.s[0] +sub v30.4s, v12.4s, v17.4s +add v12.4s, v12.4s, v17.4s +ldr q17, [x0, #976] +sqrdmulh v19.4S, v13.4S, v4.s[0] +mul v13.4S, v13.4S,v5.s[0] +mla v13.4S, v19.4S, v31.s[0] +sub v19.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +sqrdmulh v13.4S, v22.4S, v6.s[0] +mul v22.4S, v22.4S,v7.s[0] +mla v22.4S, v13.4S, v31.s[0] +sub v13.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +sqrdmulh v22.4S, v9.4S, v15.s[0] +mul v9.4S, v9.4S,v10.s[0] +mla v9.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v9.4s +add v21.4s, v21.4s, v9.4s +sqrdmulh v9.4S, v18.4S, v2.s[0] +mul v18.4S, v18.4S,v16.s[0] +mla v18.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v18.4s +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v14.4S, v4.s[1] +mul v14.4S, v14.4S,v5.s[1] +mla v14.4S, v18.4S, v31.s[0] +sub v18.4s, v11.4s, v14.4s +add v11.4s, v11.4s, v14.4s +sqrdmulh v14.4S, v1.4S, v6.s[1] +mul v1.4S, v1.4S,v7.s[1] +mla v1.4S, v14.4S, v31.s[0] +sub v14.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +str q11, [x0, #768] +str q18, [x0, #784] +sqrdmulh v18.4S, v21.4S, v15.s[1] +mul v21.4S, v21.4S,v10.s[1] +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v3.4s, v21.4s +add v3.4s, v3.4s, v21.4s +str q8, [x0, #832] +str q14, [x0, #848] +sqrdmulh v14.4S, v17.4S, v2.s[1] +mul v17.4S, v17.4S,v16.s[1] +mla v17.4S, v14.4S, 
v31.s[0] +sub v14.4s, v12.4s, v17.4s +add v12.4s, v12.4s, v17.4s +str q3, [x0, #896] +str q18, [x0, #912] +sqrdmulh v18.4S, v19.4S, v4.s[2] +mul v19.4S, v19.4S,v5.s[2] +mla v19.4S, v18.4S, v31.s[0] +sub v18.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +str q12, [x0, #960] +str q14, [x0, #976] +sqrdmulh v4.4S, v13.4S, v6.s[2] +mul v13.4S, v13.4S,v7.s[2] +mla v13.4S, v4.4S, v31.s[0] +sub v4.4s, v0.4s, v13.4s +add v0.4s, v0.4s, v13.4s +sqrdmulh v6.4S, v22.4S, v15.s[2] +mul v22.4S, v22.4S,v10.s[2] +mla v22.4S, v6.4S, v31.s[0] +sub v6.4s, v29.4s, v22.4s +add v29.4s, v29.4s, v22.4s +sqrdmulh v15.4S, v9.4S, v2.s[2] +mul v9.4S, v9.4S,v16.s[2] +mla v9.4S, v15.4S, v31.s[0] +sub v15.4s, v30.4s, v9.4s +add v30.4s, v30.4s, v9.4s +str q20, [x0, #800] +str q18, [x0, #816] +str q0, [x0, #864] +str q4, [x0, #880] +str q29, [x0, #928] +str q6, [x0, #944] +str q30, [x0, #992] +str q15, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_3.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_3.s new file mode 100644 index 0000000..ed2fb5d --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_3.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, 
copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: // 16-byte constant vector; the routine below loads it with ASM_LOAD(x17, modulus) / ldr q31, [x17] and uses lane 0 (v31.s[0]) as the multiplier of every mla. +.word -33556993 // lane 0: the modulus 33556993, stored negated +.word 0 // lanes 1-3: zero fill for the rest of the 16 bytes read by ldr q31; the code visible in this chunk only references lane 0 +.word 0 +.word 0 +.align 6 // 2^6 = 64-byte alignment for the table that follows (AArch64 assemblers take the exponent here) +roots_merged: // Twiddle table addressed by fixed byte offsets from x17. Layout: groups of 8 words = 4 constants (operand of mul, e.g. v30.s[i] loaded from offset 0) followed by 4 companion values for the same layer/block (operand of sqrdmulh, e.g. v29.s[i] loaded from offset 16). Entries tagged "Layer None" are zero padding that fills a 4-lane vector. NOTE(review): companions look like approximately c * 2^31 / 33556993 (checked for the first entry) - TODO confirm against the generator. +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 //
Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 
5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block 
None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global ntt_u32_incomplete_neon_asm_var_4_2_3_z4_3 +.global _ntt_u32_incomplete_neon_asm_var_4_2_3_z4_3 +ntt_u32_incomplete_neon_asm_var_4_2_3_z4_3: +_ntt_u32_incomplete_neon_asm_var_4_2_3_z4_3: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +sqrdmulh v2.4S, v22.4S, v29.s[0] +ldr q1, [x0, #544] +mul v22.4S, v22.4S,v30.s[0] +ldr q0, [x0, #608] +sqrdmulh v15.4S, 
v21.4S, v29.s[0] +ldr q14, [x0, #672] +mul v21.4S, v21.4S,v30.s[0] +ldr q13, [x0, #736] +mla v22.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q12, [x0, #32] +sub v11.4s, v18.4s, v22.4s +mla v21.4S, v15.4S, v31.s[0] +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q15, [x0, #96] +sub v10.4s, v17.4s, v21.4s +mla v20.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v29.s[0] +ldr q2, [x0, #160] +mul v1.4S, v1.4S,v30.s[0] +sub v9.4s, v16.4s, v20.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v29.s[0] +ldr q22, [x0, #224] +mul v0.4S, v0.4S,v30.s[0] +sub v8.4s, v3.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v12.4s, v1.4s +mla v0.4S, v20.4S, v31.s[0] +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +sub v20.4s, v15.4s, v0.4s +mla v14.4S, v19.4S, v31.s[0] +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v19.4s, v2.4s, v14.4s +mla v13.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v1.4s, v22.4s, v13.4s +mla v16.4S, v0.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v0.4s, v2.4s, v16.4s +mla v3.4S, v14.4S, v31.s[0] +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v22.4s, v3.4s +mla v18.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v13.4s, v12.4s, v18.4s +mla v17.4S, v16.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v16.4s, v15.4s, v17.4s +mla v9.4S, v3.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v29.s[2] +mul v11.4S, 
v11.4S,v30.s[2] +sub v3.4s, v19.4s, v9.4s +mla v8.4S, v18.4S, v31.s[0] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v8.4s +mla v11.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v17.4s, v21.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +sub v9.4s, v20.4s, v10.4s +mla v2.4S, v8.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v8.4s, v12.4s, v2.4s +mla v22.4S, v11.4S, v31.s[0] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v27.s[1] +mul v14.4S, v14.4S,v28.s[1] +sub v11.4s, v15.4s, v22.4s +mla v0.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v28.s[2] +sub v10.4s, v13.4s, v0.4s +mla v14.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v2.4s, v16.4s, v14.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +sub v22.4s, v21.4s, v19.4s +mla v1.4S, v0.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v0.4s, v20.4s, v1.4s +mla v3.4S, v14.4S, v31.s[0] +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +sub v14.4s, v17.4s, v3.4s +mla v18.4S, v19.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v25.s[1] +mul v11.4S, v11.4S,v26.s[1] +sub v19.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v1.4s, v12.4s, v15.4s +mla v11.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v25.s[3] +mul v2.4S, v2.4S,v26.s[3] +sub v3.4s, v8.4s, v11.4s +mla v16.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v11.4s +str 
q12, [x0, #32] +sqrdmulh v12.4S, v20.4S, v23.s[0] +str q1, [x0, #96] +mul v20.4S, v20.4S,v24.s[0] +ldr q1, [x0, #816] +sub v11.4s, v13.4s, v16.4s +ldr q18, [x0, #880] +mla v2.4S, v15.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +str q8, [x0, #160] +sqrdmulh v8.4S, v0.4S, v23.s[1] +str q3, [x0, #224] +mul v0.4S, v0.4S,v24.s[1] +ldr q3, [x0, #944] +sub v16.4s, v10.4s, v2.4s +ldr q15, [x0, #1008] +mla v20.4S, v12.4S, v31.s[0] +add v10.4s, v10.4s, v2.4s +str q13, [x0, #288] +sqrdmulh v13.4S, v9.4S, v23.s[2] +str q11, [x0, #352] +mul v9.4S, v9.4S,v24.s[2] +ldr q11, [x0, #304] +sub v2.4s, v21.4s, v20.4s +ldr q12, [x0, #368] +mla v0.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v20.4s +str q10, [x0, #416] +sqrdmulh v10.4S, v19.4S, v23.s[3] +str q16, [x0, #480] +mul v19.4S, v19.4S,v24.s[3] +ldr q16, [x0, #432] +sub v20.4s, v22.4s, v0.4s +ldr q8, [x0, #496] +mla v9.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +str q21, [x0, #544] +sqrdmulh v21.4S, v1.4S, v29.s[0] +str q2, [x0, #608] +ldr q2, [x0, #560] +mul v1.4S, v1.4S,v30.s[0] +ldr q0, [x0, #624] +sub v13.4s, v17.4s, v9.4s +mla v19.4S, v10.4S, v31.s[0] +add v17.4s, v17.4s, v9.4s +str q22, [x0, #672] +sqrdmulh v22.4S, v18.4S, v29.s[0] +str q20, [x0, #736] +ldr q20, [x0, #688] +mul v18.4S, v18.4S,v30.s[0] +ldr q9, [x0, #752] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +str q17, [x0, #800] +sqrdmulh v17.4S, v3.4S, v29.s[0] +str q13, [x0, #864] +mul v3.4S, v3.4S,v30.s[0] +ldr q13, [x0, #48] +sub v19.4s, v11.4s, v1.4s +mla v18.4S, v22.4S, v31.s[0] +add v11.4s, v11.4s, v1.4s +str q14, [x0, #928] +sqrdmulh v14.4S, v15.4S, v29.s[0] +str q10, [x0, #992] +mul v15.4S, v15.4S,v30.s[0] +ldr q10, [x0, #112] +sub v1.4s, v12.4s, v18.4s +mla v3.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v29.s[0] +ldr q17, [x0, #176] +mul v2.4S, v2.4S,v30.s[0] +sub v22.4s, v16.4s, v3.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v29.s[0] +ldr q14, [x0, 
#240] +mul v0.4S, v0.4S,v30.s[0] +sub v21.4s, v8.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v18.4s, v13.4s, v2.4s +mla v0.4S, v3.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +sub v3.4s, v10.4s, v0.4s +mla v20.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v15.4s, v17.4s, v20.4s +mla v9.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v2.4s, v14.4s, v9.4s +mla v16.4S, v0.4S, v31.s[0] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v29.s[1] +mul v11.4S, v11.4S,v30.s[1] +sub v0.4s, v17.4s, v16.4s +mla v8.4S, v20.4S, v31.s[0] +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v29.s[1] +mul v12.4S, v12.4S,v30.s[1] +sub v20.4s, v14.4s, v8.4s +mla v11.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v9.4s, v13.4s, v11.4s +mla v12.4S, v16.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v16.4s, v10.4s, v12.4s +mla v22.4S, v8.4S, v31.s[0] +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v8.4s, v15.4s, v22.4s +mla v21.4S, v11.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +sub v11.4s, v2.4s, v21.4s +mla v19.4S, v12.4S, v31.s[0] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +sub v12.4s, v18.4s, v19.4s +mla v1.4S, v22.4S, v31.s[0] +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v22.4s, v3.4s, v1.4s +mla v17.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v21.4s, v13.4s, v17.4s +mla v14.4S, v19.4S, v31.s[0] +add 
v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +sub v19.4s, v10.4s, v14.4s +mla v0.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v27.s[2] +mul v15.4S, v15.4S,v28.s[2] +sub v1.4s, v9.4s, v0.4s +mla v20.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v17.4s, v16.4s, v20.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v14.4s, v18.4s, v15.4s +mla v2.4S, v0.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[3] +mul v11.4S, v11.4S,v28.s[3] +sub v0.4s, v3.4s, v2.4s +mla v8.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +sub v20.4s, v12.4s, v8.4s +mla v11.4S, v15.4S, v31.s[0] +add v12.4s, v12.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v15.4s, v22.4s, v11.4s +mla v10.4S, v2.4S, v31.s[0] +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v2.4s, v13.4s, v10.4s +mla v19.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v25.s[3] +mul v17.4S, v17.4S,v26.s[3] +sub v8.4s, v21.4s, v19.4s +mla v16.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +str q13, [x0, #48] +sqrdmulh v13.4S, v3.4S, v23.s[0] +str q2, [x0, #112] +mul v3.4S, v3.4S,v24.s[0] +ldr q2, [x0, #768] +sub v19.4s, v9.4s, v16.4s +ldr q11, [x0, #832] +mla v17.4S, v10.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +str q21, [x0, #176] +sqrdmulh v21.4S, v0.4S, v23.s[1] +str q8, [x0, #240] +mul v0.4S, v0.4S,v24.s[1] +ldr q8, [x0, #896] +sub v16.4s, v1.4s, v17.4s +ldr q10, [x0, #960] +mla v3.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +str q9, [x0, #304] +sqrdmulh v9.4S, v22.4S, v23.s[2] +str q19, [x0, #368] +mul v22.4S, v22.4S,v24.s[2] +ldr q19, [x0, #256] +sub v17.4s, v18.4s, v3.4s +ldr q13, [x0, #320] +mla v0.4S, v21.4S, v31.s[0] +add 
v18.4s, v18.4s, v3.4s +str q1, [x0, #432] +sqrdmulh v1.4S, v15.4S, v23.s[3] +str q16, [x0, #496] +mul v15.4S, v15.4S,v24.s[3] +ldr q16, [x0, #384] +sub v3.4s, v14.4s, v0.4s +ldr q21, [x0, #448] +mla v22.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +str q18, [x0, #560] +sqrdmulh v18.4S, v2.4S, v29.s[0] +str q17, [x0, #624] +ldr q17, [x0, #512] +mul v2.4S, v2.4S,v30.s[0] +ldr q0, [x0, #576] +sub v9.4s, v12.4s, v22.4s +mla v15.4S, v1.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s +str q14, [x0, #688] +sqrdmulh v14.4S, v11.4S, v29.s[0] +str q3, [x0, #752] +ldr q3, [x0, #640] +mul v11.4S, v11.4S,v30.s[0] +ldr q22, [x0, #704] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +str q12, [x0, #816] +sqrdmulh v12.4S, v8.4S, v29.s[0] +str q9, [x0, #880] +mul v8.4S, v8.4S,v30.s[0] +ldr q9, [x0, #0] +sub v15.4s, v19.4s, v2.4s +mla v11.4S, v14.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +str q20, [x0, #944] +sqrdmulh v20.4S, v10.4S, v29.s[0] +str q1, [x0, #1008] +mul v10.4S, v10.4S,v30.s[0] +ldr q1, [x0, #64] +sub v2.4s, v13.4s, v11.4s +mla v8.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v29.s[0] +ldr q12, [x0, #128] +mul v17.4S, v17.4S,v30.s[0] +sub v14.4s, v16.4s, v8.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v29.s[0] +ldr q20, [x0, #192] +mul v0.4S, v0.4S,v30.s[0] +sub v18.4s, v21.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +sub v11.4s, v9.4s, v17.4s +mla v0.4S, v8.4S, v31.s[0] +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v8.4s, v1.4s, v0.4s +mla v3.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v10.4s, v12.4s, v3.4s +mla v22.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v17.4s, v20.4s, v22.4s +mla v16.4S, v0.4S, 
v31.s[0] +add v20.4s, v20.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +sub v0.4s, v12.4s, v16.4s +mla v21.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v3.4s, v20.4s, v21.4s +mla v19.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v22.4s, v9.4s, v19.4s +mla v13.4S, v16.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v16.4s, v1.4s, v13.4s +mla v14.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v21.4s, v10.4s, v14.4s +mla v18.4S, v19.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v19.4s, v17.4s, v18.4s +mla v15.4S, v13.4S, v31.s[0] +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v13.4s, v11.4s, v15.4s +mla v2.4S, v14.4S, v31.s[0] +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v27.s[0] +mul v20.4S, v20.4S,v28.s[0] +sub v14.4s, v8.4s, v2.4s +mla v12.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v18.4s, v9.4s, v12.4s +mla v20.4S, v15.4S, v31.s[0] +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +sub v15.4s, v1.4s, v20.4s +mla v0.4S, v2.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +sub v2.4s, v22.4s, v0.4s +mla v3.4S, v12.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v12.4s, v16.4s, v3.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v11.4s, v10.4s +mla v17.4S, v0.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v27.s[3] +mul v19.4S, 
v19.4S,v28.s[3] +sub v0.4s, v8.4s, v17.4s +mla v21.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v17.4s +sqrdmulh v17.4S, v1.4S, v25.s[0] +mul v1.4S, v1.4S,v26.s[0] +sub v3.4s, v13.4s, v21.4s +mla v19.4S, v10.4S, v31.s[0] +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v25.s[1] +mul v15.4S, v15.4S,v26.s[1] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v17.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v17.4s, v9.4s, v1.4s +mla v15.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +sub v21.4s, v18.4s, v15.4s +mla v16.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +str q9, [x0, #0] +sqrdmulh v9.4S, v8.4S, v23.s[0] +str q17, [x0, #64] +mul v8.4S, v8.4S,v24.s[0] +ldr q17, [x0, #784] +sub v15.4s, v22.4s, v16.4s +ldr q19, [x0, #848] +mla v12.4S, v1.4S, v31.s[0] +add v22.4s, v22.4s, v16.4s +str q18, [x0, #128] +sqrdmulh v18.4S, v0.4S, v23.s[1] +str q21, [x0, #192] +mul v0.4S, v0.4S,v24.s[1] +ldr q21, [x0, #912] +sub v16.4s, v2.4s, v12.4s +ldr q1, [x0, #976] +mla v8.4S, v9.4S, v31.s[0] +add v2.4s, v2.4s, v12.4s +str q22, [x0, #256] +sqrdmulh v22.4S, v14.4S, v23.s[2] +str q15, [x0, #320] +mul v14.4S, v14.4S,v24.s[2] +ldr q15, [x0, #272] +sub v12.4s, v11.4s, v8.4s +ldr q9, [x0, #336] +mla v0.4S, v18.4S, v31.s[0] +add v11.4s, v11.4s, v8.4s +str q2, [x0, #384] +sqrdmulh v2.4S, v10.4S, v23.s[3] +str q16, [x0, #448] +mul v10.4S, v10.4S,v24.s[3] +ldr q16, [x0, #400] +sub v8.4s, v20.4s, v0.4s +ldr q18, [x0, #464] +mla v14.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v0.4s +str q11, [x0, #512] +sqrdmulh v11.4S, v17.4S, v29.s[0] +str q12, [x0, #576] +ldr q12, [x0, #528] +mul v17.4S, v17.4S,v30.s[0] +ldr q0, [x0, #592] +sub v22.4s, v13.4s, v14.4s +mla v10.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v14.4s +str q20, [x0, #640] +sqrdmulh v20.4S, v19.4S, v29.s[0] +str q8, [x0, #704] +ldr q8, [x0, #656] +mul v19.4S, v19.4S,v30.s[0] +ldr q14, [x0, #720] +sub v2.4s, v3.4s, v10.4s 
+mla v17.4S, v11.4S, v31.s[0] +add v3.4s, v3.4s, v10.4s +str q13, [x0, #768] +sqrdmulh v13.4S, v21.4S, v29.s[0] +str q22, [x0, #832] +mul v21.4S, v21.4S,v30.s[0] +ldr q22, [x0, #16] +sub v10.4s, v15.4s, v17.4s +mla v19.4S, v20.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +str q3, [x0, #896] +sqrdmulh v3.4S, v1.4S, v29.s[0] +str q2, [x0, #960] +mul v1.4S, v1.4S,v30.s[0] +ldr q2, [x0, #80] +sub v17.4s, v9.4s, v19.4s +mla v21.4S, v13.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v12.4S, v29.s[0] +ldr q13, [x0, #144] +mul v12.4S, v12.4S,v30.s[0] +sub v20.4s, v16.4s, v21.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v29.s[0] +ldr q3, [x0, #208] +mul v0.4S, v0.4S,v30.s[0] +sub v11.4s, v18.4s, v1.4s +mla v12.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v19.4s, v22.4s, v12.4s +mla v0.4S, v21.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v2.4s, v0.4s +mla v8.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v1.4s, v13.4s, v8.4s +mla v14.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v12.4s, v3.4s, v14.4s +mla v16.4S, v0.4S, v31.s[0] +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +sub v0.4s, v13.4s, v16.4s +mla v18.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v9.4S, v29.s[1] +mul v9.4S, v9.4S,v30.s[1] +sub v8.4s, v3.4s, v18.4s +mla v15.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v14.4s, v22.4s, v15.4s +mla v9.4S, v16.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v16.4s, v2.4s, v9.4s +mla v20.4S, v18.4S, v31.s[0] +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v10.4S, 
v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v20.4s +mla v11.4S, v15.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v15.4s, v12.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +sub v9.4s, v19.4s, v10.4s +mla v17.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v27.s[0] +mul v3.4S, v3.4S,v28.s[0] +sub v20.4s, v21.4s, v17.4s +mla v13.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v11.4s, v22.4s, v13.4s +mla v3.4S, v10.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v10.4s, v2.4s, v3.4s +mla v0.4S, v17.4S, v31.s[0] +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v17.4s, v14.4s, v0.4s +mla v8.4S, v13.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +sub v13.4s, v16.4s, v8.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v3.4s, v19.4s, v1.4s +mla v12.4S, v0.4S, v31.s[0] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v0.4s, v21.4s, v12.4s +mla v18.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v25.s[1] +mul v10.4S, v10.4S,v26.s[1] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v12.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v12.4s, v22.4s, v2.4s +mla v10.4S, v18.4S, v31.s[0] +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v13.4S, v25.s[3] +mul v13.4S, v13.4S,v26.s[3] +sub v18.4s, v11.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add 
v11.4s, v11.4s, v10.4s +str q22, [x0, #16] +sqrdmulh v22.4S, v21.4S, v23.s[0] +str q12, [x0, #80] +mul v21.4S, v21.4S,v24.s[0] +sub v12.4s, v14.4s, v16.4s +mla v13.4S, v2.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +str q11, [x0, #144] +sqrdmulh v11.4S, v0.4S, v23.s[1] +str q18, [x0, #208] +mul v0.4S, v0.4S,v24.s[1] +sub v18.4s, v17.4s, v13.4s +mla v21.4S, v22.4S, v31.s[0] +add v17.4s, v17.4s, v13.4s +str q14, [x0, #272] +sqrdmulh v14.4S, v20.4S, v23.s[2] +str q12, [x0, #336] +mul v20.4S, v20.4S,v24.s[2] +sub v12.4s, v19.4s, v21.4s +mla v0.4S, v11.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +str q17, [x0, #400] +sqrdmulh v17.4S, v1.4S, v23.s[3] +str q18, [x0, #464] +mul v1.4S, v1.4S,v24.s[3] +sub v18.4s, v3.4s, v0.4s +mla v20.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v0.4s +str q19, [x0, #528] +str q12, [x0, #592] +sub v12.4s, v9.4s, v20.4s +mla v1.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v20.4s +str q3, [x0, #656] +str q18, [x0, #720] +sub v18.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +str q9, [x0, #784] +str q12, [x0, #848] +str q8, [x0, #912] +str q18, [x0, #976] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q6, [x17, #+160] +ldr q7, [x17, #+176] +ldr q15, [x17, #+192] +ldr q10, [x17, #+208] +ldr q2, [x17, #+224] +ldr q16, [x17, #+240] +ldr q22, [x0, #32] +ldr q13, [x0, #48] +ldr q11, [x0, #0] +ldr q21, [x0, #96] +ldr q14, [x0, #112] +ldr q0, [x0, #64] +ldr q19, [x0, #160] +ldr q17, [x0, #176] +ldr q20, [x0, #128] +ldr q3, [x0, #224] +ldr q1, [x0, #240] +ldr q9, [x0, #192] +sqrdmulh v12.4S, v22.4S, v5.s[0] +sqrdmulh v8.4S, v21.4S, v7.s[0] +sqrdmulh v18.4S, v19.4S, v10.s[0] +sqrdmulh v30.4S, v3.4S, v16.s[0] +mul v22.4S, v22.4S,v4.s[0] +mul v21.4S, v21.4S,v6.s[0] +mul v19.4S, v19.4S,v15.s[0] +mul v3.4S, v3.4S,v2.s[0] +mla v22.4S, v12.4S, v31.s[0] +mla v21.4S, v8.4S, v31.s[0] +mla v19.4S, v18.4S, v31.s[0] +mla v3.4S, v30.4S, v31.s[0] +sub v30.4s, v11.4s, v22.4s +sub v18.4s, v0.4s, v21.4s +sub v8.4s, v20.4s, v19.4s +sub v12.4s, v9.4s, v3.4s +add v11.4s, v11.4s, v22.4s 
+add v0.4s, v0.4s, v21.4s +add v20.4s, v20.4s, v19.4s +add v9.4s, v9.4s, v3.4s +ldr q3, [x0, #16] +ldr q19, [x0, #80] +ldr q21, [x0, #144] +ldr q22, [x0, #208] +sqrdmulh v29.4S, v13.4S, v5.s[0] +sqrdmulh v28.4S, v14.4S, v7.s[0] +sqrdmulh v27.4S, v17.4S, v10.s[0] +sqrdmulh v26.4S, v1.4S, v16.s[0] +mul v13.4S, v13.4S,v4.s[0] +mul v14.4S, v14.4S,v6.s[0] +mul v17.4S, v17.4S,v15.s[0] +mul v1.4S, v1.4S,v2.s[0] +mla v13.4S, v29.4S, v31.s[0] +mla v14.4S, v28.4S, v31.s[0] +mla v17.4S, v27.4S, v31.s[0] +mla v1.4S, v26.4S, v31.s[0] +sub v26.4s, v3.4s, v13.4s +sub v27.4s, v19.4s, v14.4s +sub v28.4s, v21.4s, v17.4s +sub v29.4s, v22.4s, v1.4s +add v3.4s, v3.4s, v13.4s +add v19.4s, v19.4s, v14.4s +add v21.4s, v21.4s, v17.4s +add v22.4s, v22.4s, v1.4s +sqrdmulh v1.4S, v3.4S, v5.s[1] +sqrdmulh v17.4S, v19.4S, v7.s[1] +sqrdmulh v14.4S, v21.4S, v10.s[1] +sqrdmulh v13.4S, v22.4S, v16.s[1] +mul v3.4S, v3.4S,v4.s[1] +mul v19.4S, v19.4S,v6.s[1] +mul v21.4S, v21.4S,v15.s[1] +mul v22.4S, v22.4S,v2.s[1] +mla v3.4S, v1.4S, v31.s[0] +mla v19.4S, v17.4S, v31.s[0] +mla v21.4S, v14.4S, v31.s[0] +mla v22.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v3.4s +sub v14.4s, v0.4s, v19.4s +sub v17.4s, v20.4s, v21.4s +sub v1.4s, v9.4s, v22.4s +add v11.4s, v11.4s, v3.4s +add v0.4s, v0.4s, v19.4s +add v20.4s, v20.4s, v21.4s +add v9.4s, v9.4s, v22.4s +sqrdmulh v22.4S, v26.4S, v5.s[2] +sqrdmulh v21.4S, v27.4S, v7.s[2] +sqrdmulh v19.4S, v28.4S, v10.s[2] +sqrdmulh v3.4S, v29.4S, v16.s[2] +str q11, [x0, #0] +str q13, [x0, #16] +mul v26.4S, v26.4S,v4.s[2] +mul v27.4S, v27.4S,v6.s[2] +mul v28.4S, v28.4S,v15.s[2] +mul v29.4S, v29.4S,v2.s[2] +str q0, [x0, #64] +str q14, [x0, #80] +ldr q16, [x17, #+256] +ldr q2, [x17, #+272] +ldr q10, [x17, #+288] +ldr q15, [x17, #+304] +mla v26.4S, v22.4S, v31.s[0] +mla v27.4S, v21.4S, v31.s[0] +mla v28.4S, v19.4S, v31.s[0] +mla v29.4S, v3.4S, v31.s[0] +str q20, [x0, #128] +str q17, [x0, #144] +ldr q17, [x17, #+320] +ldr q20, [x17, #+336] +sub v3.4s, v30.4s, v26.4s +sub v19.4s, v18.4s, 
v27.4s +sub v21.4s, v8.4s, v28.4s +sub v22.4s, v12.4s, v29.4s +str q9, [x0, #192] +str q1, [x0, #208] +ldr q1, [x17, #+352] +ldr q9, [x17, #+368] +add v30.4s, v30.4s, v26.4s +add v18.4s, v18.4s, v27.4s +add v8.4s, v8.4s, v28.4s +add v12.4s, v12.4s, v29.4s +str q30, [x0, #32] +str q18, [x0, #96] +str q8, [x0, #160] +str q12, [x0, #224] +ldr q12, [x0, #288] +ldr q8, [x0, #304] +ldr q18, [x0, #256] +ldr q30, [x0, #352] +ldr q29, [x0, #368] +ldr q28, [x0, #320] +ldr q27, [x0, #416] +ldr q26, [x0, #432] +ldr q7, [x0, #384] +ldr q6, [x0, #480] +ldr q5, [x0, #496] +ldr q4, [x0, #448] +sqrdmulh v14.4S, v12.4S, v2.s[0] +sqrdmulh v0.4S, v30.4S, v15.s[0] +sqrdmulh v13.4S, v27.4S, v20.s[0] +sqrdmulh v11.4S, v6.4S, v9.s[0] +str q3, [x0, #48] +mul v12.4S, v12.4S,v16.s[0] +mul v30.4S, v30.4S,v10.s[0] +mul v27.4S, v27.4S,v17.s[0] +mul v6.4S, v6.4S,v1.s[0] +str q19, [x0, #112] +mla v12.4S, v14.4S, v31.s[0] +mla v30.4S, v0.4S, v31.s[0] +mla v27.4S, v13.4S, v31.s[0] +mla v6.4S, v11.4S, v31.s[0] +str q21, [x0, #176] +sub v21.4s, v18.4s, v12.4s +sub v11.4s, v28.4s, v30.4s +sub v13.4s, v7.4s, v27.4s +sub v0.4s, v4.4s, v6.4s +str q22, [x0, #240] +add v18.4s, v18.4s, v12.4s +add v28.4s, v28.4s, v30.4s +add v7.4s, v7.4s, v27.4s +add v4.4s, v4.4s, v6.4s +ldr q6, [x0, #272] +ldr q27, [x0, #336] +ldr q30, [x0, #400] +ldr q12, [x0, #464] +sqrdmulh v22.4S, v8.4S, v2.s[0] +sqrdmulh v14.4S, v29.4S, v15.s[0] +sqrdmulh v19.4S, v26.4S, v20.s[0] +sqrdmulh v3.4S, v5.4S, v9.s[0] +mul v8.4S, v8.4S,v16.s[0] +mul v29.4S, v29.4S,v10.s[0] +mul v26.4S, v26.4S,v17.s[0] +mul v5.4S, v5.4S,v1.s[0] +mla v8.4S, v22.4S, v31.s[0] +mla v29.4S, v14.4S, v31.s[0] +mla v26.4S, v19.4S, v31.s[0] +mla v5.4S, v3.4S, v31.s[0] +sub v3.4s, v6.4s, v8.4s +sub v19.4s, v27.4s, v29.4s +sub v14.4s, v30.4s, v26.4s +sub v22.4s, v12.4s, v5.4s +add v6.4s, v6.4s, v8.4s +add v27.4s, v27.4s, v29.4s +add v30.4s, v30.4s, v26.4s +add v12.4s, v12.4s, v5.4s +sqrdmulh v5.4S, v6.4S, v2.s[1] +sqrdmulh v26.4S, v27.4S, v15.s[1] +sqrdmulh v29.4S, 
v30.4S, v20.s[1] +sqrdmulh v8.4S, v12.4S, v9.s[1] +mul v6.4S, v6.4S,v16.s[1] +mul v27.4S, v27.4S,v10.s[1] +mul v30.4S, v30.4S,v17.s[1] +mul v12.4S, v12.4S,v1.s[1] +mla v6.4S, v5.4S, v31.s[0] +mla v27.4S, v26.4S, v31.s[0] +mla v30.4S, v29.4S, v31.s[0] +mla v12.4S, v8.4S, v31.s[0] +sub v8.4s, v18.4s, v6.4s +sub v29.4s, v28.4s, v27.4s +sub v26.4s, v7.4s, v30.4s +sub v5.4s, v4.4s, v12.4s +add v18.4s, v18.4s, v6.4s +add v28.4s, v28.4s, v27.4s +add v7.4s, v7.4s, v30.4s +add v4.4s, v4.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v2.s[2] +sqrdmulh v30.4S, v19.4S, v15.s[2] +sqrdmulh v27.4S, v14.4S, v20.s[2] +sqrdmulh v6.4S, v22.4S, v9.s[2] +str q18, [x0, #256] +str q8, [x0, #272] +mul v3.4S, v3.4S,v16.s[2] +mul v19.4S, v19.4S,v10.s[2] +mul v14.4S, v14.4S,v17.s[2] +mul v22.4S, v22.4S,v1.s[2] +str q28, [x0, #320] +str q29, [x0, #336] +ldr q9, [x17, #+384] +ldr q1, [x17, #+400] +ldr q20, [x17, #+416] +ldr q17, [x17, #+432] +mla v3.4S, v12.4S, v31.s[0] +mla v19.4S, v30.4S, v31.s[0] +mla v14.4S, v27.4S, v31.s[0] +mla v22.4S, v6.4S, v31.s[0] +str q7, [x0, #384] +str q26, [x0, #400] +ldr q26, [x17, #+448] +ldr q7, [x17, #+464] +sub v6.4s, v21.4s, v3.4s +sub v27.4s, v11.4s, v19.4s +sub v30.4s, v13.4s, v14.4s +sub v12.4s, v0.4s, v22.4s +str q4, [x0, #448] +str q5, [x0, #464] +ldr q5, [x17, #+480] +ldr q4, [x17, #+496] +add v21.4s, v21.4s, v3.4s +add v11.4s, v11.4s, v19.4s +add v13.4s, v13.4s, v14.4s +add v0.4s, v0.4s, v22.4s +str q21, [x0, #288] +str q11, [x0, #352] +str q13, [x0, #416] +str q0, [x0, #480] +ldr q0, [x0, #544] +ldr q13, [x0, #560] +ldr q11, [x0, #512] +ldr q21, [x0, #608] +ldr q22, [x0, #624] +ldr q14, [x0, #576] +ldr q19, [x0, #672] +ldr q3, [x0, #688] +ldr q15, [x0, #640] +ldr q10, [x0, #736] +ldr q2, [x0, #752] +ldr q16, [x0, #704] +sqrdmulh v29.4S, v0.4S, v1.s[0] +sqrdmulh v28.4S, v21.4S, v17.s[0] +sqrdmulh v8.4S, v19.4S, v7.s[0] +sqrdmulh v18.4S, v10.4S, v4.s[0] +str q6, [x0, #304] +mul v0.4S, v0.4S,v9.s[0] +mul v21.4S, v21.4S,v20.s[0] +mul v19.4S, v19.4S,v26.s[0] +mul 
v10.4S, v10.4S,v5.s[0] +str q27, [x0, #368] +mla v0.4S, v29.4S, v31.s[0] +mla v21.4S, v28.4S, v31.s[0] +mla v19.4S, v8.4S, v31.s[0] +mla v10.4S, v18.4S, v31.s[0] +str q30, [x0, #432] +sub v30.4s, v11.4s, v0.4s +sub v18.4s, v14.4s, v21.4s +sub v8.4s, v15.4s, v19.4s +sub v28.4s, v16.4s, v10.4s +str q12, [x0, #496] +add v11.4s, v11.4s, v0.4s +add v14.4s, v14.4s, v21.4s +add v15.4s, v15.4s, v19.4s +add v16.4s, v16.4s, v10.4s +ldr q10, [x0, #528] +ldr q19, [x0, #592] +ldr q21, [x0, #656] +ldr q0, [x0, #720] +sqrdmulh v12.4S, v13.4S, v1.s[0] +sqrdmulh v29.4S, v22.4S, v17.s[0] +sqrdmulh v27.4S, v3.4S, v7.s[0] +sqrdmulh v6.4S, v2.4S, v4.s[0] +mul v13.4S, v13.4S,v9.s[0] +mul v22.4S, v22.4S,v20.s[0] +mul v3.4S, v3.4S,v26.s[0] +mul v2.4S, v2.4S,v5.s[0] +mla v13.4S, v12.4S, v31.s[0] +mla v22.4S, v29.4S, v31.s[0] +mla v3.4S, v27.4S, v31.s[0] +mla v2.4S, v6.4S, v31.s[0] +sub v6.4s, v10.4s, v13.4s +sub v27.4s, v19.4s, v22.4s +sub v29.4s, v21.4s, v3.4s +sub v12.4s, v0.4s, v2.4s +add v10.4s, v10.4s, v13.4s +add v19.4s, v19.4s, v22.4s +add v21.4s, v21.4s, v3.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v1.s[1] +sqrdmulh v3.4S, v19.4S, v17.s[1] +sqrdmulh v22.4S, v21.4S, v7.s[1] +sqrdmulh v13.4S, v0.4S, v4.s[1] +mul v10.4S, v10.4S,v9.s[1] +mul v19.4S, v19.4S,v20.s[1] +mul v21.4S, v21.4S,v26.s[1] +mul v0.4S, v0.4S,v5.s[1] +mla v10.4S, v2.4S, v31.s[0] +mla v19.4S, v3.4S, v31.s[0] +mla v21.4S, v22.4S, v31.s[0] +mla v0.4S, v13.4S, v31.s[0] +sub v13.4s, v11.4s, v10.4s +sub v22.4s, v14.4s, v19.4s +sub v3.4s, v15.4s, v21.4s +sub v2.4s, v16.4s, v0.4s +add v11.4s, v11.4s, v10.4s +add v14.4s, v14.4s, v19.4s +add v15.4s, v15.4s, v21.4s +add v16.4s, v16.4s, v0.4s +sqrdmulh v0.4S, v6.4S, v1.s[2] +sqrdmulh v21.4S, v27.4S, v17.s[2] +sqrdmulh v19.4S, v29.4S, v7.s[2] +sqrdmulh v10.4S, v12.4S, v4.s[2] +str q11, [x0, #512] +str q13, [x0, #528] +mul v6.4S, v6.4S,v9.s[2] +mul v27.4S, v27.4S,v20.s[2] +mul v29.4S, v29.4S,v26.s[2] +mul v12.4S, v12.4S,v5.s[2] +str q14, [x0, #576] +str q22, [x0, #592] 
+ldr q4, [x17, #+512] +ldr q5, [x17, #+528] +ldr q7, [x17, #+544] +ldr q26, [x17, #+560] +mla v6.4S, v0.4S, v31.s[0] +mla v27.4S, v21.4S, v31.s[0] +mla v29.4S, v19.4S, v31.s[0] +mla v12.4S, v10.4S, v31.s[0] +str q15, [x0, #640] +str q3, [x0, #656] +ldr q3, [x17, #+576] +ldr q15, [x17, #+592] +sub v10.4s, v30.4s, v6.4s +sub v19.4s, v18.4s, v27.4s +sub v21.4s, v8.4s, v29.4s +sub v0.4s, v28.4s, v12.4s +str q16, [x0, #704] +str q2, [x0, #720] +ldr q2, [x17, #+608] +ldr q16, [x17, #+624] +add v30.4s, v30.4s, v6.4s +add v18.4s, v18.4s, v27.4s +add v8.4s, v8.4s, v29.4s +add v28.4s, v28.4s, v12.4s +str q30, [x0, #544] +str q18, [x0, #608] +str q8, [x0, #672] +str q28, [x0, #736] +ldr q28, [x0, #800] +ldr q8, [x0, #816] +ldr q18, [x0, #768] +ldr q30, [x0, #864] +ldr q12, [x0, #880] +ldr q29, [x0, #832] +ldr q27, [x0, #928] +ldr q6, [x0, #944] +ldr q17, [x0, #896] +ldr q20, [x0, #992] +ldr q1, [x0, #1008] +ldr q9, [x0, #960] +sqrdmulh v22.4S, v28.4S, v5.s[0] +sqrdmulh v14.4S, v30.4S, v26.s[0] +sqrdmulh v13.4S, v27.4S, v15.s[0] +sqrdmulh v11.4S, v20.4S, v16.s[0] +str q10, [x0, #560] +mul v28.4S, v28.4S,v4.s[0] +mul v30.4S, v30.4S,v7.s[0] +mul v27.4S, v27.4S,v3.s[0] +mul v20.4S, v20.4S,v2.s[0] +str q19, [x0, #624] +mla v28.4S, v22.4S, v31.s[0] +mla v30.4S, v14.4S, v31.s[0] +mla v27.4S, v13.4S, v31.s[0] +mla v20.4S, v11.4S, v31.s[0] +str q21, [x0, #688] +sub v21.4s, v18.4s, v28.4s +sub v11.4s, v29.4s, v30.4s +sub v13.4s, v17.4s, v27.4s +sub v14.4s, v9.4s, v20.4s +str q0, [x0, #752] +add v18.4s, v18.4s, v28.4s +add v29.4s, v29.4s, v30.4s +add v17.4s, v17.4s, v27.4s +add v9.4s, v9.4s, v20.4s +ldr q20, [x0, #784] +ldr q27, [x0, #848] +ldr q30, [x0, #912] +ldr q28, [x0, #976] +sqrdmulh v0.4S, v8.4S, v5.s[0] +sqrdmulh v22.4S, v12.4S, v26.s[0] +sqrdmulh v19.4S, v6.4S, v15.s[0] +sqrdmulh v10.4S, v1.4S, v16.s[0] +mul v8.4S, v8.4S,v4.s[0] +mul v12.4S, v12.4S,v7.s[0] +mul v6.4S, v6.4S,v3.s[0] +mul v1.4S, v1.4S,v2.s[0] +mla v8.4S, v0.4S, v31.s[0] +mla v12.4S, v22.4S, v31.s[0] +mla v6.4S, 
v19.4S, v31.s[0] +mla v1.4S, v10.4S, v31.s[0] +sub v10.4s, v20.4s, v8.4s +sub v19.4s, v27.4s, v12.4s +sub v22.4s, v30.4s, v6.4s +sub v0.4s, v28.4s, v1.4s +add v20.4s, v20.4s, v8.4s +add v27.4s, v27.4s, v12.4s +add v30.4s, v30.4s, v6.4s +add v28.4s, v28.4s, v1.4s +sqrdmulh v1.4S, v20.4S, v5.s[1] +sqrdmulh v6.4S, v27.4S, v26.s[1] +sqrdmulh v12.4S, v30.4S, v15.s[1] +sqrdmulh v8.4S, v28.4S, v16.s[1] +mul v20.4S, v20.4S,v4.s[1] +mul v27.4S, v27.4S,v7.s[1] +mul v30.4S, v30.4S,v3.s[1] +mul v28.4S, v28.4S,v2.s[1] +mla v20.4S, v1.4S, v31.s[0] +mla v27.4S, v6.4S, v31.s[0] +mla v30.4S, v12.4S, v31.s[0] +mla v28.4S, v8.4S, v31.s[0] +sub v8.4s, v18.4s, v20.4s +sub v12.4s, v29.4s, v27.4s +sub v6.4s, v17.4s, v30.4s +sub v1.4s, v9.4s, v28.4s +add v18.4s, v18.4s, v20.4s +add v29.4s, v29.4s, v27.4s +add v17.4s, v17.4s, v30.4s +add v9.4s, v9.4s, v28.4s +sqrdmulh v28.4S, v10.4S, v5.s[2] +sqrdmulh v30.4S, v19.4S, v26.s[2] +sqrdmulh v27.4S, v22.4S, v15.s[2] +sqrdmulh v20.4S, v0.4S, v16.s[2] +str q18, [x0, #768] +str q8, [x0, #784] +mul v10.4S, v10.4S,v4.s[2] +mul v19.4S, v19.4S,v7.s[2] +mul v22.4S, v22.4S,v3.s[2] +mul v0.4S, v0.4S,v2.s[2] +str q29, [x0, #832] +str q12, [x0, #848] +mla v10.4S, v28.4S, v31.s[0] +mla v19.4S, v30.4S, v31.s[0] +mla v22.4S, v27.4S, v31.s[0] +mla v0.4S, v20.4S, v31.s[0] +str q17, [x0, #896] +str q6, [x0, #912] +sub v6.4s, v21.4s, v10.4s +sub v17.4s, v11.4s, v19.4s +sub v20.4s, v13.4s, v22.4s +sub v27.4s, v14.4s, v0.4s +str q9, [x0, #960] +str q1, [x0, #976] +add v21.4s, v21.4s, v10.4s +add v11.4s, v11.4s, v19.4s +add v13.4s, v13.4s, v22.4s +add v14.4s, v14.4s, v0.4s +str q21, [x0, #800] +str q11, [x0, #864] +str q13, [x0, #928] +str q14, [x0, #992] +str q6, [x0, #816] +str q17, [x0, #880] +str q20, [x0, #944] +str q27, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] 
+ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_4.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_4.s new file mode 100644 index 0000000..1381d5b --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_4.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+///
+
+#include // NOTE(review): the header name is missing in this copy; presumably it provides the ASM_LOAD macro used by the routine below - confirm
+modulus: // 16-byte constant; the routine below loads it into q31 and multiplies by lane v31.s[0]
+.word -33556993 // negated modulus (33556993, as in the file name), so "mla ..., v31.s[0]" subtracts a multiple of the modulus
+.word 0 // lanes 1-3 are zero; the visible code only reads lane 0
+.word 0
+.word 0
+.align 6 // NOTE(review): 2^6 = 64-byte alignment under GNU as for AArch64 - confirm for other assemblers
+roots_merged: // twiddle table in rows of four words (16 bytes); the routine below loads whole rows into q registers; rows come in pairs covering the same layer/block slots
+.word 17702291 // Layer 0, block 0 (root row: the routine below loads it into q30 and uses it as the "mul" operand)
+.word 3260327 // Layer 1, block 0
+.word 14579576 // Layer 1, block 1
+.word 0 // Layer None, block None (zero filler completing the four-word row)
+.word 1132860160 // Layer 0, block 0 (companion row: loaded into q29 and used as the "sqrdmulh" operand; values look like round(root * 2^31 / 33556993) - TODO confirm)
+.word 208645003 // Layer 1, block 0
+.word 933021652 // Layer 1, block 1
+.word 0 // Layer None, block None
+.word 6733847 // Layer 2, block 0
+.word 12909577 // Layer 2, block 1
+.word 14745691 // Layer 2, block 2
+.word 13512548 // Layer 2, block 3
+.word 430933318 // Layer 2, block 0
+.word 826149873 // Layer 2, block 1
+.word 943652201 // Layer 2, block 2
+.word 864737072 // Layer 2, block 3
+.word 20428075 // Layer 3, block 0
+.word 14626653 // Layer 3, block 1
+.word 29737761 // Layer 3, block 2
+.word 30285189 // Layer 3, block 3
+.word 1307297022 // Layer 3, block 0
+.word 936034350 // Layer 3, block 1
+.word 1903071454 // Layer 3, block 2
+.word 1938104173 // Layer 3, block 3
+.word 21289485 // Layer 3, block 4
+.word 9914896 // Layer 3, block 5
+.word 22603682 // Layer 3, block 6
+.word 16204162 // Layer 3, block 7
+.word 1362423055 // Layer 3, block 4
+.word 634504916 // Layer 3, block 5
+.word 1446525244 // Layer 3, block 6
+.word 1036987221 // Layer 3, block 7
+.word 23825509 // Layer 4, block 0
+.word 9010590 // Layer 5, block 0
+.word 20699126 // Layer 5, block 1
+.word 0 // Layer None, block None
+.word 1524716204 // Layer 4, block 0
+.word 576633749 // Layer 5, block 0
+.word 1324642962 // Layer 5, block 1
+.word 0 // Layer None, block None
+.word 27028662 // Layer 4, block 1
+.word 341080 // Layer 5, block 2
+.word 21220783 // Layer 5, block 3
+.word 0 // Layer None, block None
+.word 1729702351 // Layer 4, block 1
+.word 21827454 // Layer 5, block 2
+.word 1358026462 // Layer 5, block 3
+.word 0 // Layer None, block None
+.word 14833295 // Layer 4, block 2
+.word 25331745 // Layer 5, block 4
+.word 5289426 // Layer 5, block 5
+.word 0 // Layer None, block None
+.word 949258429 // Layer 4, block 2
+.word 1621107951 // Layer 5, block 4
+.word 338497429 // Layer 5, block 5
+.word 0 // Layer None, block None
+.word 2138810 // Layer 4, block 3
+.word 5705868 // Layer 5, block 6
+.word 17686665 // Layer 5, block 7
+.word 0 // Layer None, block None
+.word 136873393 // Layer 4, block 3
+.word 365147683 // Layer 5, block 6
+.word 1131860172 // Layer 5, block 7
+.word 0 // Layer None, block None
+.word 6490403 // Layer 4, block 4
+.word 9106105 // Layer 5, block 8
+.word 18817700 // Layer 5, block 9
+.word 0 // Layer None, block None
+.word 415354091 // Layer 4, block 4
+.word 582746243 // Layer 5, block 8
+.word 1204240888 // Layer 5, block 9
+.word 0 // Layer None, block None
+.word 19648405 // Layer 4, block 5
+.word 1579445 // Layer 5, block 10
+.word 7769916 // Layer 5, block 11
+.word 0 // Layer None, block None
+.word 1257401950 // Layer 4, block 5
+.word 101076765 // Layer 5, block 10
+.word 497236673 // Layer 5, block 11
+.word 0 // Layer None, block None
+.word 31254932 // Layer 4, block 6
+.word 21843119 // Layer 5, block 12
+.word 11828796 // Layer 5, block 13
+.word 0 // Layer None, block None
+.word 2000162988 // Layer 4, block 6
+.word 1397852927 // Layer 5, block 12
+.word 756985168 // Layer 5, block 13
+.word 0 // Layer None, block None
+.word 26362414 // Layer 4, block 7
+.word 19828530 // Layer 5, block 14
+.word 33201112 // Layer 5, block 15
+.word 0 // Layer None, block None
+.word 1687065733 // Layer 4, block 7
+.word 1268929071 // Layer 5, block 14
+.word 2124709002 // Layer 5, block 15
+.word 0 // Layer None, block None
+.word 572895 // Layer 4, block 8
+.word 23713020 // Layer 5, block 16
+.word 19537976 // Layer 5, block 17
+.word 0 // Layer None, block None
+.word 36662482 // Layer 4, block 8
+.word 1517517457 // Layer 5, block 16
+.word 1250335034 // Layer 5, block 17
+.word 0 // Layer None, block None
+.word 26691971 // Layer 4, block 9
+.word 8285889 // Layer 5, block 18
+.word 24690028 // Layer 5, block 19
+.word 0 // Layer None, block None
+.word 1708155771 // Layer 4, block 9
+.word 530256425 // Layer 5, block 18
+.word 1580041197 // Layer 5, block 19
+.word 0 // Layer None, block None
+.word 9249292 // Layer 4, block 10
+.word 4778209 // Layer 5, block 20
+.word 13113327 // Layer 5, block 21
+.word 0 // Layer None, block None
+.word 591909511 // Layer 4, block 10
+.word 305782038 // Layer 5, block 20
+.word 839188878 // Layer 5, block 21
+.word 0 // Layer None, block None
+.word 29292862 // Layer 4, block 11
+.word 25384023 // Layer 5, block 22
+.word 10905370 // Layer 5, block 23
+.word 0 // Layer None, block None
+.word 1874600091 // Layer 4, block 11
+.word 1624453488 // Layer 5, block 22
+.word 697890414 // Layer 5, block 23
+.word 0 // Layer None, block None
+.word 8247799 // Layer 4, block 12
+.word 16167867 // Layer 5, block 24
+.word 22046437 // Layer 5, block 25
+.word 0 // Layer None, block None
+.word 527818851 // Layer 4, block 12
+.word 1034664519 // Layer 5, block 24
+.word 1410864286 // Layer 5, block 25
+.word 0 // Layer None, block None
+.word 5086187 // Layer 4, block 13
+.word 656361 // Layer 5, block 26
+.word 18153794 // Layer 5, block 27
+.word 0 // Layer None, block None
+.word 325491125 // Layer 4, block 13
+.word 42003898 // Layer 5, block 26
+.word 1161754147 // Layer 5, block 27
+.word 0 // Layer None, block None
+.word 28113639 // Layer 4, block 14
+.word 3732072 // Layer 5, block 28
+.word 22126384 // Layer 5, block 29
+.word 0 // Layer None, block None
+.word 1799135579 // Layer 4, block 14
+.word 238834379 // Layer 5, block 28
+.word 1415980503 // Layer 5, block 29
+.word 0 // Layer None, block None
+.word 8471290 // Layer 4, block 15
+.word 9445744 // Layer 5, block 30
+.word 794839 // Layer 5, block 31
+.word 0 // Layer None, block None
+.word 542121183 // Layer 4, block 15
+.word 604481480 // Layer 5, block 30
+.word 50865814 // Layer 5, block 31
+.word 0 // Layer None, block None
+.text
+.global
ntt_u32_incomplete_neon_asm_var_4_2_3_z4_4 +.global _ntt_u32_incomplete_neon_asm_var_4_2_3_z4_4 +ntt_u32_incomplete_neon_asm_var_4_2_3_z4_4: +_ntt_u32_incomplete_neon_asm_var_4_2_3_z4_4: +// Save GPRs. Entry notes: x0 is the base of a 1024-byte buffer (byte offsets 0 through 1008, handled as .4S vectors) that is loaded, transformed and stored back in place; x17 is scratch for table addresses. +sub sp, sp, #(16*5+16) // reserve 96 bytes: five x-register pairs plus a 16-byte slot for x29 +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] // NOTE(review): exact repeat of the previous store, so redundant but harmless; left in place because this text is a patch hunk with fixed line counts +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] // NOTE(review): none of x19-x29 is referenced again in the visible body except by the restore sequence; the spill looks like generator boilerplate (confirm ASM_LOAD does not clobber them) +// Save NEON vector registers: d8-d15 are the low 64 bits of v8-v15, which the body below overwrites as temporaries +sub sp, sp, #(16*4) // 64 more bytes for the four d-register pairs +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] // x17 was set by ASM_LOAD, a macro defined outside this chunk that presumably loads the address of modulus (TODO confirm); lane 0 of v31 is the multiplier in every mla of this routine +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] // v30 down to v23 receive the first 128 bytes of roots_merged; x17 is not written again in the visible code +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] // first loads from the x0 buffer; from here on loads, multiplies, butterflies and stores of different vectors are interleaved +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +sqrdmulh v2.4S, v22.4S, v29.s[0] // recurring triple: sqrdmulh by a lane of one table vector, mul by the same lane of its partner vector, then mla with v31.s[0]; presumably a modular multiplication by a precomputed twiddle pair (confirm against the generator) +ldr q1, [x0, #544] +mul v22.4S, v22.4S,v30.s[0] +ldr q0, [x0, #608] +sqrdmulh v15.4S, v21.4S, v29.s[0] +ldr q14, [x0, #672] +mul v21.4S, v21.4S,v30.s[0] +ldr q13, [x0, #736] +mla v22.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q12, [x0, #32] +sub v11.4s, v18.4s, v22.4s // butterfly: the difference goes to a fresh register here, the matching sum (add v18 two instructions later) overwrites the first operand +mla v21.4S, v15.4S, v31.s[0] +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q15, [x0, #96] +sub v10.4s, v17.4s, v21.4s +mla v20.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v29.s[0] +ldr q2, [x0, #160] +mul v1.4S, v1.4S,v30.s[0] +sub v9.4s, v16.4s, v20.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v29.s[0] +ldr q22, [x0, #224] +mul v0.4S, v0.4S,v30.s[0] +sub v8.4s, v3.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v12.4s, v1.4s +mla v0.4S, v20.4S, 
v31.s[0] +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +sub v20.4s, v15.4s, v0.4s +mla v14.4S, v19.4S, v31.s[0] +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v19.4s, v2.4s, v14.4s +mla v13.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v1.4s, v22.4s, v13.4s +mla v16.4S, v0.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v0.4s, v2.4s, v16.4s +mla v3.4S, v14.4S, v31.s[0] +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v22.4s, v3.4s +mla v18.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v13.4s, v12.4s, v18.4s +mla v17.4S, v16.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v16.4s, v15.4s, v17.4s +mla v9.4S, v3.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v3.4s, v19.4s, v9.4s +mla v8.4S, v18.4S, v31.s[0] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v8.4s +mla v11.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v17.4s, v21.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +sub v9.4s, v20.4s, v10.4s +mla v2.4S, v8.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v8.4s, v12.4s, v2.4s +mla v22.4S, v11.4S, v31.s[0] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v27.s[1] +mul v14.4S, v14.4S,v28.s[1] +sub v11.4s, v15.4s, v22.4s +mla v0.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v28.s[2] +sub 
v10.4s, v13.4s, v0.4s +mla v14.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v2.4s, v16.4s, v14.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +sub v22.4s, v21.4s, v19.4s +mla v1.4S, v0.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v0.4s, v20.4s, v1.4s +mla v3.4S, v14.4S, v31.s[0] +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +sub v14.4s, v17.4s, v3.4s +mla v18.4S, v19.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v25.s[1] +mul v11.4S, v11.4S,v26.s[1] +sub v19.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v1.4s, v12.4s, v15.4s +mla v11.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v25.s[3] +mul v2.4S, v2.4S,v26.s[3] +sub v3.4s, v8.4s, v11.4s +mla v16.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v11.4s +str q12, [x0, #32] +sqrdmulh v12.4S, v20.4S, v23.s[0] +str q1, [x0, #96] +mul v20.4S, v20.4S,v24.s[0] +ldr q1, [x0, #816] +sub v11.4s, v13.4s, v16.4s +ldr q18, [x0, #880] +mla v2.4S, v15.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +str q8, [x0, #160] +sqrdmulh v8.4S, v0.4S, v23.s[1] +str q3, [x0, #224] +mul v0.4S, v0.4S,v24.s[1] +ldr q3, [x0, #944] +sub v16.4s, v10.4s, v2.4s +ldr q15, [x0, #1008] +mla v20.4S, v12.4S, v31.s[0] +add v10.4s, v10.4s, v2.4s +str q13, [x0, #288] +sqrdmulh v13.4S, v9.4S, v23.s[2] +str q11, [x0, #352] +mul v9.4S, v9.4S,v24.s[2] +ldr q11, [x0, #304] +sub v2.4s, v21.4s, v20.4s +ldr q12, [x0, #368] +mla v0.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v20.4s +str q10, [x0, #416] +sqrdmulh v10.4S, v19.4S, v23.s[3] +str q16, [x0, #480] +mul v19.4S, v19.4S,v24.s[3] +ldr q16, [x0, #432] +sub v20.4s, v22.4s, v0.4s +ldr q8, [x0, #496] +mla v9.4S, v13.4S, v31.s[0] +add v22.4s, 
v22.4s, v0.4s +str q21, [x0, #544] +sqrdmulh v21.4S, v1.4S, v29.s[0] +str q2, [x0, #608] +ldr q2, [x0, #560] +mul v1.4S, v1.4S,v30.s[0] +ldr q0, [x0, #624] +sub v13.4s, v17.4s, v9.4s +mla v19.4S, v10.4S, v31.s[0] +add v17.4s, v17.4s, v9.4s +str q22, [x0, #672] +sqrdmulh v22.4S, v18.4S, v29.s[0] +str q20, [x0, #736] +ldr q20, [x0, #688] +mul v18.4S, v18.4S,v30.s[0] +ldr q9, [x0, #752] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +str q17, [x0, #800] +sqrdmulh v17.4S, v3.4S, v29.s[0] +str q13, [x0, #864] +mul v3.4S, v3.4S,v30.s[0] +ldr q13, [x0, #48] +sub v19.4s, v11.4s, v1.4s +mla v18.4S, v22.4S, v31.s[0] +add v11.4s, v11.4s, v1.4s +str q14, [x0, #928] +sqrdmulh v14.4S, v15.4S, v29.s[0] +str q10, [x0, #992] +mul v15.4S, v15.4S,v30.s[0] +ldr q10, [x0, #112] +sub v1.4s, v12.4s, v18.4s +mla v3.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v29.s[0] +ldr q17, [x0, #176] +mul v2.4S, v2.4S,v30.s[0] +sub v22.4s, v16.4s, v3.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v29.s[0] +ldr q14, [x0, #240] +mul v0.4S, v0.4S,v30.s[0] +sub v21.4s, v8.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v18.4s, v13.4s, v2.4s +mla v0.4S, v3.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +sub v3.4s, v10.4s, v0.4s +mla v20.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v15.4s, v17.4s, v20.4s +mla v9.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v2.4s, v14.4s, v9.4s +mla v16.4S, v0.4S, v31.s[0] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v29.s[1] +mul v11.4S, v11.4S,v30.s[1] +sub v0.4s, v17.4s, v16.4s +mla v8.4S, v20.4S, v31.s[0] +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v29.s[1] +mul v12.4S, 
v12.4S,v30.s[1] +sub v20.4s, v14.4s, v8.4s +mla v11.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v9.4s, v13.4s, v11.4s +mla v12.4S, v16.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v16.4s, v10.4s, v12.4s +mla v22.4S, v8.4S, v31.s[0] +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v8.4s, v15.4s, v22.4s +mla v21.4S, v11.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +sub v11.4s, v2.4s, v21.4s +mla v19.4S, v12.4S, v31.s[0] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +sub v12.4s, v18.4s, v19.4s +mla v1.4S, v22.4S, v31.s[0] +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v22.4s, v3.4s, v1.4s +mla v17.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v21.4s, v13.4s, v17.4s +mla v14.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v20.4S, v27.s[1] +mul v20.4S, v20.4S,v28.s[1] +sub v19.4s, v10.4s, v14.4s +mla v0.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v27.s[2] +mul v15.4S, v15.4S,v28.s[2] +sub v1.4s, v9.4s, v0.4s +mla v20.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v17.4s, v16.4s, v20.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v14.4s, v18.4s, v15.4s +mla v2.4S, v0.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[3] +mul v11.4S, v11.4S,v28.s[3] +sub v0.4s, v3.4s, v2.4s +mla v8.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +sub v20.4s, v12.4s, v8.4s +mla v11.4S, v15.4S, v31.s[0] +add v12.4s, v12.4s, 
v8.4s +sqrdmulh v8.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v15.4s, v22.4s, v11.4s +mla v10.4S, v2.4S, v31.s[0] +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v2.4s, v13.4s, v10.4s +mla v19.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v25.s[3] +mul v17.4S, v17.4S,v26.s[3] +sub v8.4s, v21.4s, v19.4s +mla v16.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +str q13, [x0, #48] +sqrdmulh v13.4S, v3.4S, v23.s[0] +str q2, [x0, #112] +mul v3.4S, v3.4S,v24.s[0] +ldr q2, [x0, #768] +sub v19.4s, v9.4s, v16.4s +ldr q11, [x0, #832] +mla v17.4S, v10.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +str q21, [x0, #176] +sqrdmulh v21.4S, v0.4S, v23.s[1] +str q8, [x0, #240] +mul v0.4S, v0.4S,v24.s[1] +ldr q8, [x0, #896] +sub v16.4s, v1.4s, v17.4s +ldr q10, [x0, #960] +mla v3.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +str q9, [x0, #304] +sqrdmulh v9.4S, v22.4S, v23.s[2] +str q19, [x0, #368] +mul v22.4S, v22.4S,v24.s[2] +ldr q19, [x0, #256] +sub v17.4s, v18.4s, v3.4s +ldr q13, [x0, #320] +mla v0.4S, v21.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +str q1, [x0, #432] +sqrdmulh v1.4S, v15.4S, v23.s[3] +str q16, [x0, #496] +mul v15.4S, v15.4S,v24.s[3] +ldr q16, [x0, #384] +sub v3.4s, v14.4s, v0.4s +ldr q21, [x0, #448] +mla v22.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +str q18, [x0, #560] +sqrdmulh v18.4S, v2.4S, v29.s[0] +str q17, [x0, #624] +ldr q17, [x0, #512] +mul v2.4S, v2.4S,v30.s[0] +ldr q0, [x0, #576] +sub v9.4s, v12.4s, v22.4s +mla v15.4S, v1.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s +str q14, [x0, #688] +sqrdmulh v14.4S, v11.4S, v29.s[0] +str q3, [x0, #752] +ldr q3, [x0, #640] +mul v11.4S, v11.4S,v30.s[0] +ldr q22, [x0, #704] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +str q12, [x0, #816] +sqrdmulh v12.4S, v8.4S, v29.s[0] +str q9, [x0, #880] +mul v8.4S, v8.4S,v30.s[0] +ldr q9, [x0, #0] +sub v15.4s, v19.4s, v2.4s +mla v11.4S, v14.4S, v31.s[0] +add 
v19.4s, v19.4s, v2.4s +str q20, [x0, #944] +sqrdmulh v20.4S, v10.4S, v29.s[0] +str q1, [x0, #1008] +mul v10.4S, v10.4S,v30.s[0] +ldr q1, [x0, #64] +sub v2.4s, v13.4s, v11.4s +mla v8.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v29.s[0] +ldr q12, [x0, #128] +mul v17.4S, v17.4S,v30.s[0] +sub v14.4s, v16.4s, v8.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v29.s[0] +ldr q20, [x0, #192] +mul v0.4S, v0.4S,v30.s[0] +sub v18.4s, v21.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +sub v11.4s, v9.4s, v17.4s +mla v0.4S, v8.4S, v31.s[0] +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v8.4s, v1.4s, v0.4s +mla v3.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v10.4s, v12.4s, v3.4s +mla v22.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v17.4s, v20.4s, v22.4s +mla v16.4S, v0.4S, v31.s[0] +add v20.4s, v20.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +sub v0.4s, v12.4s, v16.4s +mla v21.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v3.4s, v20.4s, v21.4s +mla v19.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v22.4s, v9.4s, v19.4s +mla v13.4S, v16.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v16.4s, v1.4s, v13.4s +mla v14.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v21.4s, v10.4s, v14.4s +mla v18.4S, v19.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v19.4s, v17.4s, v18.4s +mla v15.4S, v13.4S, 
v31.s[0] +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v13.4s, v11.4s, v15.4s +mla v2.4S, v14.4S, v31.s[0] +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v27.s[0] +mul v20.4S, v20.4S,v28.s[0] +sub v14.4s, v8.4s, v2.4s +mla v12.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v18.4s, v9.4s, v12.4s +mla v20.4S, v15.4S, v31.s[0] +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +sub v15.4s, v1.4s, v20.4s +mla v0.4S, v2.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +sub v2.4s, v22.4s, v0.4s +mla v3.4S, v12.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v12.4s, v16.4s, v3.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v11.4s, v10.4s +mla v17.4S, v0.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v27.s[3] +mul v19.4S, v19.4S,v28.s[3] +sub v0.4s, v8.4s, v17.4s +mla v21.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v17.4s +sqrdmulh v17.4S, v1.4S, v25.s[0] +mul v1.4S, v1.4S,v26.s[0] +sub v3.4s, v13.4s, v21.4s +mla v19.4S, v10.4S, v31.s[0] +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v25.s[1] +mul v15.4S, v15.4S,v26.s[1] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v17.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v17.4s, v9.4s, v1.4s +mla v15.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +sub v21.4s, v18.4s, v15.4s +mla v16.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +str q9, [x0, #0] +sqrdmulh v9.4S, v8.4S, v23.s[0] +str q17, [x0, #64] +mul v8.4S, v8.4S,v24.s[0] +ldr q17, [x0, #784] +sub v15.4s, v22.4s, v16.4s +ldr q19, [x0, #848] +mla v12.4S, v1.4S, v31.s[0] +add 
v22.4s, v22.4s, v16.4s +str q18, [x0, #128] +sqrdmulh v18.4S, v0.4S, v23.s[1] +str q21, [x0, #192] +mul v0.4S, v0.4S,v24.s[1] +ldr q21, [x0, #912] +sub v16.4s, v2.4s, v12.4s +ldr q1, [x0, #976] +mla v8.4S, v9.4S, v31.s[0] +add v2.4s, v2.4s, v12.4s +str q22, [x0, #256] +sqrdmulh v22.4S, v14.4S, v23.s[2] +str q15, [x0, #320] +mul v14.4S, v14.4S,v24.s[2] +ldr q15, [x0, #272] +sub v12.4s, v11.4s, v8.4s +ldr q9, [x0, #336] +mla v0.4S, v18.4S, v31.s[0] +add v11.4s, v11.4s, v8.4s +str q2, [x0, #384] +sqrdmulh v2.4S, v10.4S, v23.s[3] +str q16, [x0, #448] +mul v10.4S, v10.4S,v24.s[3] +ldr q16, [x0, #400] +sub v8.4s, v20.4s, v0.4s +ldr q18, [x0, #464] +mla v14.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v0.4s +str q11, [x0, #512] +sqrdmulh v11.4S, v17.4S, v29.s[0] +str q12, [x0, #576] +ldr q12, [x0, #528] +mul v17.4S, v17.4S,v30.s[0] +ldr q0, [x0, #592] +sub v22.4s, v13.4s, v14.4s +mla v10.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v14.4s +str q20, [x0, #640] +sqrdmulh v20.4S, v19.4S, v29.s[0] +str q8, [x0, #704] +ldr q8, [x0, #656] +mul v19.4S, v19.4S,v30.s[0] +ldr q14, [x0, #720] +sub v2.4s, v3.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v3.4s, v3.4s, v10.4s +str q13, [x0, #768] +sqrdmulh v13.4S, v21.4S, v29.s[0] +str q22, [x0, #832] +mul v21.4S, v21.4S,v30.s[0] +ldr q22, [x0, #16] +sub v10.4s, v15.4s, v17.4s +mla v19.4S, v20.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +str q3, [x0, #896] +sqrdmulh v3.4S, v1.4S, v29.s[0] +str q2, [x0, #960] +mul v1.4S, v1.4S,v30.s[0] +ldr q2, [x0, #80] +sub v17.4s, v9.4s, v19.4s +mla v21.4S, v13.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v12.4S, v29.s[0] +ldr q13, [x0, #144] +mul v12.4S, v12.4S,v30.s[0] +sub v20.4s, v16.4s, v21.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v29.s[0] +ldr q3, [x0, #208] +mul v0.4S, v0.4S,v30.s[0] +sub v11.4s, v18.4s, v1.4s +mla v12.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v19.4s, v22.4s, 
v12.4s +mla v0.4S, v21.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v2.4s, v0.4s +mla v8.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v1.4s, v13.4s, v8.4s +mla v14.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v12.4s, v3.4s, v14.4s +mla v16.4S, v0.4S, v31.s[0] +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +sub v0.4s, v13.4s, v16.4s +mla v18.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v9.4S, v29.s[1] +mul v9.4S, v9.4S,v30.s[1] +sub v8.4s, v3.4s, v18.4s +mla v15.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v14.4s, v22.4s, v15.4s +mla v9.4S, v16.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v16.4s, v2.4s, v9.4s +mla v20.4S, v18.4S, v31.s[0] +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v20.4s +mla v11.4S, v15.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v15.4s, v12.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +sub v9.4s, v19.4s, v10.4s +mla v17.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v27.s[0] +mul v3.4S, v3.4S,v28.s[0] +sub v20.4s, v21.4s, v17.4s +mla v13.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v11.4s, v22.4s, v13.4s +mla v3.4S, v10.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v10.4s, v2.4s, v3.4s +mla v0.4S, v17.4S, v31.s[0] +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v1.4S, v27.s[2] +mul 
v1.4S, v1.4S,v28.s[2] +sub v17.4s, v14.4s, v0.4s +mla v8.4S, v13.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +sub v13.4s, v16.4s, v8.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v3.4s, v19.4s, v1.4s +mla v12.4S, v0.4S, v31.s[0] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v0.4s, v21.4s, v12.4s +mla v18.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v25.s[1] +mul v10.4S, v10.4S,v26.s[1] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v12.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v12.4s, v22.4s, v2.4s +mla v10.4S, v18.4S, v31.s[0] +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v13.4S, v25.s[3] +mul v13.4S, v13.4S,v26.s[3] +sub v18.4s, v11.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +str q22, [x0, #16] +sqrdmulh v22.4S, v21.4S, v23.s[0] +str q12, [x0, #80] +mul v21.4S, v21.4S,v24.s[0] +sub v12.4s, v14.4s, v16.4s +mla v13.4S, v2.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +str q11, [x0, #144] +sqrdmulh v11.4S, v0.4S, v23.s[1] +str q18, [x0, #208] +mul v0.4S, v0.4S,v24.s[1] +sub v18.4s, v17.4s, v13.4s +mla v21.4S, v22.4S, v31.s[0] +add v17.4s, v17.4s, v13.4s +str q14, [x0, #272] +sqrdmulh v14.4S, v20.4S, v23.s[2] +str q12, [x0, #336] +mul v20.4S, v20.4S,v24.s[2] +sub v12.4s, v19.4s, v21.4s +mla v0.4S, v11.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +str q17, [x0, #400] +sqrdmulh v17.4S, v1.4S, v23.s[3] +str q18, [x0, #464] +mul v1.4S, v1.4S,v24.s[3] +sub v18.4s, v3.4s, v0.4s +mla v20.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v0.4s +str q19, [x0, #528] +str q12, [x0, #592] +sub v12.4s, v9.4s, v20.4s +mla v1.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, 
v20.4s +str q3, [x0, #656] +str q18, [x0, #720] +sub v18.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +str q9, [x0, #784] +str q12, [x0, #848] +str q8, [x0, #912] +str q18, [x0, #976] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q6, [x0, #32] +sqrdmulh v7.4S, v6.4S, v5.s[0] +mul v6.4S, v6.4S,v4.s[0] +ldr q15, [x0, #48] +sqrdmulh v10.4S, v15.4S, v5.s[0] +mul v15.4S, v15.4S,v4.s[0] +ldr q2, [x17, #+160] +ldr q16, [x17, #+176] +ldr q22, [x0, #96] +sqrdmulh v13.4S, v22.4S, v16.s[0] +mul v22.4S, v22.4S,v2.s[0] +ldr q11, [x0, #112] +sqrdmulh v21.4S, v11.4S, v16.s[0] +mul v11.4S, v11.4S,v2.s[0] +ldr q14, [x0, #160] +ldr q0, [x17, #+192] +ldr q19, [x17, #+208] +mla v6.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v14.4S, v19.s[0] +ldr q17, [x0, #176] +mla v15.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v17.4S, v19.s[0] +ldr q20, [x0, #224] +ldr q3, [x17, #+224] +ldr q1, [x17, #+240] +mla v22.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v20.4S, v1.s[0] +ldr q9, [x0, #240] +mla v11.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v9.4S, v1.s[0] +ldr q12, [x0, #128] +ldr q8, [x0, #0] +mul v14.4S, v14.4S,v0.s[0] +sub v18.4s, v8.4s, v6.4s +mul v17.4S, v17.4S,v0.s[0] +add v8.4s, v8.4s, v6.4s +ldr q6, [x0, #144] +ldr q30, [x0, #16] +mla v14.4S, v7.4S, v31.s[0] +sub v7.4s, v30.4s, v15.4s +mla v17.4S, v10.4S, v31.s[0] +add v30.4s, v30.4s, v15.4s +ldr q15, [x0, #192] +ldr q10, [x0, #64] +mul v20.4S, v20.4S,v3.s[0] +sub v29.4s, v10.4s, v22.4s +mul v9.4S, v9.4S,v3.s[0] +add v10.4s, v10.4s, v22.4s +ldr q22, [x0, #208] +ldr q28, [x0, #80] +mla v20.4S, v13.4S, v31.s[0] +sub v13.4s, v28.4s, v11.4s +mla v9.4S, v21.4S, v31.s[0] +add v28.4s, v28.4s, v11.4s +sqrdmulh v11.4S, v30.4S, v5.s[1] +mul v30.4S, v30.4S,v4.s[1] +sqrdmulh v21.4S, v7.4S, v5.s[2] +sub v27.4s, v12.4s, v14.4s +mul v7.4S, v7.4S,v4.s[2] +add v12.4s, v12.4s, v14.4s +sqrdmulh v5.4S, v28.4S, v16.s[1] +sub v4.4s, v6.4s, v17.4s +mul v28.4S, v28.4S,v2.s[1] +add v6.4s, v6.4s, v17.4s +sqrdmulh v17.4S, v13.4S, v16.s[2] +sub v14.4s, v15.4s, v20.4s +mul v13.4S, 
v13.4S,v2.s[2] +add v15.4s, v15.4s, v20.4s +mla v30.4S, v11.4S, v31.s[0] +sub v11.4s, v22.4s, v9.4s +sqrdmulh v16.4S, v6.4S, v19.s[1] +add v22.4s, v22.4s, v9.4s +mla v7.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v4.4S, v19.s[2] +mla v28.4S, v5.4S, v31.s[0] +sqrdmulh v5.4S, v22.4S, v1.s[1] +mla v13.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v11.4S, v1.s[2] +mul v6.4S, v6.4S,v0.s[1] +sub v9.4s, v8.4s, v30.4s +mul v4.4S, v4.4S,v0.s[2] +add v8.4s, v8.4s, v30.4s +str q9, [x0, #16] +str q8, [x0, #0] +mla v6.4S, v16.4S, v31.s[0] +sub v16.4s, v18.4s, v7.4s +mla v4.4S, v21.4S, v31.s[0] +add v18.4s, v18.4s, v7.4s +str q16, [x0, #48] +str q18, [x0, #32] +mul v22.4S, v22.4S,v3.s[1] +sub v19.4s, v10.4s, v28.4s +mul v11.4S, v11.4S,v3.s[2] +add v10.4s, v10.4s, v28.4s +str q19, [x0, #80] +str q10, [x0, #64] +mla v22.4S, v5.4S, v31.s[0] +sub v5.4s, v29.4s, v13.4s +mla v11.4S, v17.4S, v31.s[0] +add v29.4s, v29.4s, v13.4s +str q5, [x0, #112] +str q29, [x0, #96] +ldr q1, [x17, #+256] +ldr q3, [x17, #+272] +ldr q29, [x0, #288] +sqrdmulh v5.4S, v29.4S, v3.s[0] +sub v13.4s, v12.4s, v6.4s +str q13, [x0, #144] +mul v29.4S, v29.4S,v1.s[0] +add v12.4s, v12.4s, v6.4s +str q12, [x0, #128] +ldr q12, [x0, #304] +sqrdmulh v6.4S, v12.4S, v3.s[0] +sub v13.4s, v27.4s, v4.4s +mul v12.4S, v12.4S,v1.s[0] +add v27.4s, v27.4s, v4.4s +str q13, [x0, #176] +str q27, [x0, #160] +ldr q27, [x17, #+288] +ldr q13, [x17, #+304] +ldr q4, [x0, #352] +sqrdmulh v17.4S, v4.4S, v13.s[0] +sub v10.4s, v15.4s, v22.4s +mul v4.4S, v4.4S,v27.s[0] +add v15.4s, v15.4s, v22.4s +str q10, [x0, #208] +str q15, [x0, #192] +ldr q15, [x0, #368] +sqrdmulh v10.4S, v15.4S, v13.s[0] +sub v22.4s, v14.4s, v11.4s +mul v15.4S, v15.4S,v27.s[0] +add v14.4s, v14.4s, v11.4s +str q22, [x0, #240] +str q14, [x0, #224] +ldr q14, [x0, #416] +ldr q22, [x17, #+320] +ldr q11, [x17, #+336] +mla v29.4S, v5.4S, v31.s[0] +sqrdmulh v5.4S, v14.4S, v11.s[0] +ldr q19, [x0, #432] +mla v12.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v19.4S, v11.s[0] +ldr q28, [x0, #480] +ldr 
q0, [x17, #+352] +ldr q18, [x17, #+368] +mla v4.4S, v17.4S, v31.s[0] +sqrdmulh v17.4S, v28.4S, v18.s[0] +ldr q16, [x0, #496] +mla v15.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v16.4S, v18.s[0] +ldr q7, [x0, #384] +ldr q21, [x0, #256] +mul v14.4S, v14.4S,v22.s[0] +sub v8.4s, v21.4s, v29.4s +mul v19.4S, v19.4S,v22.s[0] +add v21.4s, v21.4s, v29.4s +ldr q29, [x0, #400] +ldr q9, [x0, #272] +mla v14.4S, v5.4S, v31.s[0] +sub v5.4s, v9.4s, v12.4s +mla v19.4S, v6.4S, v31.s[0] +add v9.4s, v9.4s, v12.4s +ldr q12, [x0, #448] +ldr q6, [x0, #320] +mul v28.4S, v28.4S,v0.s[0] +sub v30.4s, v6.4s, v4.4s +mul v16.4S, v16.4S,v0.s[0] +add v6.4s, v6.4s, v4.4s +ldr q4, [x0, #464] +ldr q2, [x0, #336] +mla v28.4S, v17.4S, v31.s[0] +sub v17.4s, v2.4s, v15.4s +mla v16.4S, v10.4S, v31.s[0] +add v2.4s, v2.4s, v15.4s +sqrdmulh v15.4S, v9.4S, v3.s[1] +mul v9.4S, v9.4S,v1.s[1] +sqrdmulh v10.4S, v5.4S, v3.s[2] +sub v20.4s, v7.4s, v14.4s +mul v5.4S, v5.4S,v1.s[2] +add v7.4s, v7.4s, v14.4s +sqrdmulh v3.4S, v2.4S, v13.s[1] +sub v1.4s, v29.4s, v19.4s +mul v2.4S, v2.4S,v27.s[1] +add v29.4s, v29.4s, v19.4s +sqrdmulh v19.4S, v17.4S, v13.s[2] +sub v14.4s, v12.4s, v28.4s +mul v17.4S, v17.4S,v27.s[2] +add v12.4s, v12.4s, v28.4s +mla v9.4S, v15.4S, v31.s[0] +sub v15.4s, v4.4s, v16.4s +sqrdmulh v13.4S, v29.4S, v11.s[1] +add v4.4s, v4.4s, v16.4s +mla v5.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v1.4S, v11.s[2] +mla v2.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v4.4S, v18.s[1] +mla v17.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v15.4S, v18.s[2] +mul v29.4S, v29.4S,v22.s[1] +sub v16.4s, v21.4s, v9.4s +mul v1.4S, v1.4S,v22.s[2] +add v21.4s, v21.4s, v9.4s +str q16, [x0, #272] +str q21, [x0, #256] +mla v29.4S, v13.4S, v31.s[0] +sub v13.4s, v8.4s, v5.4s +mla v1.4S, v10.4S, v31.s[0] +add v8.4s, v8.4s, v5.4s +str q13, [x0, #304] +str q8, [x0, #288] +mul v4.4S, v4.4S,v0.s[1] +sub v11.4s, v6.4s, v2.4s +mul v15.4S, v15.4S,v0.s[2] +add v6.4s, v6.4s, v2.4s +str q11, [x0, #336] +str q6, [x0, #320] +mla v4.4S, v3.4S, v31.s[0] +sub v3.4s, 
v30.4s, v17.4s +mla v15.4S, v19.4S, v31.s[0] +add v30.4s, v30.4s, v17.4s +str q3, [x0, #368] +str q30, [x0, #352] +ldr q18, [x17, #+384] +ldr q0, [x17, #+400] +ldr q30, [x0, #544] +sqrdmulh v3.4S, v30.4S, v0.s[0] +sub v17.4s, v7.4s, v29.4s +str q17, [x0, #400] +mul v30.4S, v30.4S,v18.s[0] +add v7.4s, v7.4s, v29.4s +str q7, [x0, #384] +ldr q7, [x0, #560] +sqrdmulh v29.4S, v7.4S, v0.s[0] +sub v17.4s, v20.4s, v1.4s +mul v7.4S, v7.4S,v18.s[0] +add v20.4s, v20.4s, v1.4s +str q17, [x0, #432] +str q20, [x0, #416] +ldr q20, [x17, #+416] +ldr q17, [x17, #+432] +ldr q1, [x0, #608] +sqrdmulh v19.4S, v1.4S, v17.s[0] +sub v6.4s, v12.4s, v4.4s +mul v1.4S, v1.4S,v20.s[0] +add v12.4s, v12.4s, v4.4s +str q6, [x0, #464] +str q12, [x0, #448] +ldr q12, [x0, #624] +sqrdmulh v6.4S, v12.4S, v17.s[0] +sub v4.4s, v14.4s, v15.4s +mul v12.4S, v12.4S,v20.s[0] +add v14.4s, v14.4s, v15.4s +str q4, [x0, #496] +str q14, [x0, #480] +ldr q14, [x0, #672] +ldr q4, [x17, #+448] +ldr q15, [x17, #+464] +mla v30.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v14.4S, v15.s[0] +ldr q11, [x0, #688] +mla v7.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v11.4S, v15.s[0] +ldr q2, [x0, #736] +ldr q22, [x17, #+480] +ldr q8, [x17, #+496] +mla v1.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v2.4S, v8.s[0] +ldr q13, [x0, #752] +mla v12.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v13.4S, v8.s[0] +ldr q5, [x0, #640] +ldr q10, [x0, #512] +mul v14.4S, v14.4S,v4.s[0] +sub v21.4s, v10.4s, v30.4s +mul v11.4S, v11.4S,v4.s[0] +add v10.4s, v10.4s, v30.4s +ldr q30, [x0, #656] +ldr q16, [x0, #528] +mla v14.4S, v3.4S, v31.s[0] +sub v3.4s, v16.4s, v7.4s +mla v11.4S, v29.4S, v31.s[0] +add v16.4s, v16.4s, v7.4s +ldr q7, [x0, #704] +ldr q29, [x0, #576] +mul v2.4S, v2.4S,v22.s[0] +sub v9.4s, v29.4s, v1.4s +mul v13.4S, v13.4S,v22.s[0] +add v29.4s, v29.4s, v1.4s +ldr q1, [x0, #720] +ldr q27, [x0, #592] +mla v2.4S, v19.4S, v31.s[0] +sub v19.4s, v27.4s, v12.4s +mla v13.4S, v6.4S, v31.s[0] +add v27.4s, v27.4s, v12.4s +sqrdmulh v12.4S, v16.4S, v0.s[1] +mul v16.4S, 
v16.4S,v18.s[1] +sqrdmulh v6.4S, v3.4S, v0.s[2] +sub v28.4s, v5.4s, v14.4s +mul v3.4S, v3.4S,v18.s[2] +add v5.4s, v5.4s, v14.4s +sqrdmulh v0.4S, v27.4S, v17.s[1] +sub v18.4s, v30.4s, v11.4s +mul v27.4S, v27.4S,v20.s[1] +add v30.4s, v30.4s, v11.4s +sqrdmulh v11.4S, v19.4S, v17.s[2] +sub v14.4s, v7.4s, v2.4s +mul v19.4S, v19.4S,v20.s[2] +add v7.4s, v7.4s, v2.4s +mla v16.4S, v12.4S, v31.s[0] +sub v12.4s, v1.4s, v13.4s +sqrdmulh v17.4S, v30.4S, v15.s[1] +add v1.4s, v1.4s, v13.4s +mla v3.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v18.4S, v15.s[2] +mla v27.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v1.4S, v8.s[1] +mla v19.4S, v11.4S, v31.s[0] +sqrdmulh v11.4S, v12.4S, v8.s[2] +mul v30.4S, v30.4S,v4.s[1] +sub v13.4s, v10.4s, v16.4s +mul v18.4S, v18.4S,v4.s[2] +add v10.4s, v10.4s, v16.4s +str q13, [x0, #528] +str q10, [x0, #512] +mla v30.4S, v17.4S, v31.s[0] +sub v17.4s, v21.4s, v3.4s +mla v18.4S, v6.4S, v31.s[0] +add v21.4s, v21.4s, v3.4s +str q17, [x0, #560] +str q21, [x0, #544] +mul v1.4S, v1.4S,v22.s[1] +sub v15.4s, v29.4s, v27.4s +mul v12.4S, v12.4S,v22.s[2] +add v29.4s, v29.4s, v27.4s +str q15, [x0, #592] +str q29, [x0, #576] +mla v1.4S, v0.4S, v31.s[0] +sub v0.4s, v9.4s, v19.4s +mla v12.4S, v11.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +str q0, [x0, #624] +str q9, [x0, #608] +ldr q8, [x17, #+512] +ldr q22, [x17, #+528] +ldr q9, [x0, #800] +sqrdmulh v0.4S, v9.4S, v22.s[0] +sub v19.4s, v5.4s, v30.4s +str q19, [x0, #656] +mul v9.4S, v9.4S,v8.s[0] +add v5.4s, v5.4s, v30.4s +str q5, [x0, #640] +ldr q5, [x0, #816] +sqrdmulh v30.4S, v5.4S, v22.s[0] +sub v19.4s, v28.4s, v18.4s +mul v5.4S, v5.4S,v8.s[0] +add v28.4s, v28.4s, v18.4s +str q19, [x0, #688] +str q28, [x0, #672] +ldr q28, [x17, #+544] +ldr q19, [x17, #+560] +ldr q18, [x0, #864] +sqrdmulh v11.4S, v18.4S, v19.s[0] +sub v29.4s, v7.4s, v1.4s +mul v18.4S, v18.4S,v28.s[0] +add v7.4s, v7.4s, v1.4s +str q29, [x0, #720] +str q7, [x0, #704] +ldr q7, [x0, #880] +sqrdmulh v29.4S, v7.4S, v19.s[0] +sub v1.4s, v14.4s, v12.4s +mul v7.4S, 
v7.4S,v28.s[0] +add v14.4s, v14.4s, v12.4s +str q1, [x0, #752] +str q14, [x0, #736] +ldr q14, [x0, #928] +ldr q1, [x17, #+576] +ldr q12, [x17, #+592] +mla v9.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v14.4S, v12.s[0] +ldr q15, [x0, #944] +mla v5.4S, v30.4S, v31.s[0] +sqrdmulh v30.4S, v15.4S, v12.s[0] +ldr q27, [x0, #992] +ldr q4, [x17, #+608] +ldr q21, [x17, #+624] +mla v18.4S, v11.4S, v31.s[0] +sqrdmulh v11.4S, v27.4S, v21.s[0] +ldr q17, [x0, #1008] +mla v7.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v17.4S, v21.s[0] +ldr q3, [x0, #896] +ldr q6, [x0, #768] +mul v14.4S, v14.4S,v1.s[0] +sub v10.4s, v6.4s, v9.4s +mul v15.4S, v15.4S,v1.s[0] +add v6.4s, v6.4s, v9.4s +ldr q9, [x0, #912] +ldr q13, [x0, #784] +mla v14.4S, v0.4S, v31.s[0] +sub v0.4s, v13.4s, v5.4s +mla v15.4S, v30.4S, v31.s[0] +add v13.4s, v13.4s, v5.4s +ldr q5, [x0, #960] +ldr q30, [x0, #832] +mul v27.4S, v27.4S,v4.s[0] +sub v16.4s, v30.4s, v18.4s +mul v17.4S, v17.4S,v4.s[0] +add v30.4s, v30.4s, v18.4s +ldr q18, [x0, #976] +ldr q20, [x0, #848] +mla v27.4S, v11.4S, v31.s[0] +sub v11.4s, v20.4s, v7.4s +mla v17.4S, v29.4S, v31.s[0] +add v20.4s, v20.4s, v7.4s +sqrdmulh v7.4S, v13.4S, v22.s[1] +mul v13.4S, v13.4S,v8.s[1] +sqrdmulh v29.4S, v0.4S, v22.s[2] +sub v2.4s, v3.4s, v14.4s +mul v0.4S, v0.4S,v8.s[2] +add v3.4s, v3.4s, v14.4s +sqrdmulh v22.4S, v20.4S, v19.s[1] +sub v8.4s, v9.4s, v15.4s +mul v20.4S, v20.4S,v28.s[1] +add v9.4s, v9.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v19.s[2] +sub v14.4s, v5.4s, v27.4s +mul v11.4S, v11.4S,v28.s[2] +add v5.4s, v5.4s, v27.4s +mla v13.4S, v7.4S, v31.s[0] +sub v7.4s, v18.4s, v17.4s +sqrdmulh v19.4S, v9.4S, v12.s[1] +add v18.4s, v18.4s, v17.4s +mla v0.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v8.4S, v12.s[2] +mla v20.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v18.4S, v21.s[1] +mla v11.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v7.4S, v21.s[2] +mul v9.4S, v9.4S,v1.s[1] +sub v17.4s, v6.4s, v13.4s +mul v8.4S, v8.4S,v1.s[2] +add v6.4s, v6.4s, v13.4s +str q17, [x0, #784] +str q6, [x0, #768] +mla v9.4S, 
v19.4S, v31.s[0] +sub v19.4s, v10.4s, v0.4s +mla v8.4S, v29.4S, v31.s[0] +add v10.4s, v10.4s, v0.4s +str q19, [x0, #816] +str q10, [x0, #800] +mul v18.4S, v18.4S,v4.s[1] +sub v12.4s, v30.4s, v20.4s +mul v7.4S, v7.4S,v4.s[2] +add v30.4s, v30.4s, v20.4s +str q12, [x0, #848] +str q30, [x0, #832] +mla v18.4S, v22.4S, v31.s[0] +sub v22.4s, v16.4s, v11.4s +mla v7.4S, v15.4S, v31.s[0] +add v16.4s, v16.4s, v11.4s +str q22, [x0, #880] +str q16, [x0, #864] +sub v21.4s, v3.4s, v9.4s +str q21, [x0, #912] +add v3.4s, v3.4s, v9.4s +str q3, [x0, #896] +sub v3.4s, v2.4s, v8.4s +add v2.4s, v2.4s, v8.4s +str q3, [x0, #944] +str q2, [x0, #928] +sub v2.4s, v5.4s, v18.4s +add v5.4s, v5.4s, v18.4s +str q2, [x0, #976] +str q5, [x0, #960] +sub v5.4s, v14.4s, v7.4s +add v14.4s, v14.4s, v7.4s +str q5, [x0, #1008] +str q14, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_5.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_5.s new file mode 100644 index 0000000..d9c302a --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_5.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// 
copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // 
Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, 
block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 
+.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global ntt_u32_incomplete_neon_asm_var_4_2_3_z4_5 +.global _ntt_u32_incomplete_neon_asm_var_4_2_3_z4_5 +ntt_u32_incomplete_neon_asm_var_4_2_3_z4_5: +_ntt_u32_incomplete_neon_asm_var_4_2_3_z4_5: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #800] +ldr q21, [x0, #864] +ldr q20, [x0, #928] +ldr q19, [x0, #992] +ldr q18, [x0, #288] +ldr q17, [x0, #352] +ldr q16, [x0, #416] +ldr q3, [x0, #480] +sqrdmulh v2.4S, v22.4S, v29.s[0] +ldr q1, [x0, #544] +mul v22.4S, v22.4S,v30.s[0] +ldr q0, [x0, #608] +sqrdmulh v15.4S, v21.4S, v29.s[0] +ldr q14, [x0, #672] +mul v21.4S, v21.4S,v30.s[0] +ldr 
q13, [x0, #736] +mla v22.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q12, [x0, #32] +sub v11.4s, v18.4s, v22.4s +mla v21.4S, v15.4S, v31.s[0] +add v18.4s, v18.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q15, [x0, #96] +sub v10.4s, v17.4s, v21.4s +mla v20.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v29.s[0] +ldr q2, [x0, #160] +mul v1.4S, v1.4S,v30.s[0] +sub v9.4s, v16.4s, v20.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v29.s[0] +ldr q22, [x0, #224] +mul v0.4S, v0.4S,v30.s[0] +sub v8.4s, v3.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v12.4s, v1.4s +mla v0.4S, v20.4S, v31.s[0] +add v12.4s, v12.4s, v1.4s +sqrdmulh v1.4S, v13.4S, v29.s[0] +mul v13.4S, v13.4S,v30.s[0] +sub v20.4s, v15.4s, v0.4s +mla v14.4S, v19.4S, v31.s[0] +add v15.4s, v15.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v19.4s, v2.4s, v14.4s +mla v13.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v1.4s, v22.4s, v13.4s +mla v16.4S, v0.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v0.4s, v2.4s, v16.4s +mla v3.4S, v14.4S, v31.s[0] +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v14.4s, v22.4s, v3.4s +mla v18.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v13.4s, v12.4s, v18.4s +mla v17.4S, v16.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +sub v16.4s, v15.4s, v17.4s +mla v9.4S, v3.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v3.4s, v19.4s, v9.4s +mla v8.4S, v18.4S, v31.s[0] 
+add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v8.4s +mla v11.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v8.4s +sqrdmulh v8.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v17.4s, v21.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v21.4s, v21.4s, v11.4s +sqrdmulh v11.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +sub v9.4s, v20.4s, v10.4s +mla v2.4S, v8.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +sqrdmulh v10.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v8.4s, v12.4s, v2.4s +mla v22.4S, v11.4S, v31.s[0] +add v12.4s, v12.4s, v2.4s +sqrdmulh v2.4S, v14.4S, v27.s[1] +mul v14.4S, v14.4S,v28.s[1] +sub v11.4s, v15.4s, v22.4s +mla v0.4S, v10.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v28.s[2] +sub v10.4s, v13.4s, v0.4s +mla v14.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v0.4s +sqrdmulh v0.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v2.4s, v16.4s, v14.4s +mla v19.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +sub v22.4s, v21.4s, v19.4s +mla v1.4S, v0.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v0.4s, v20.4s, v1.4s +mla v3.4S, v14.4S, v31.s[0] +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +sub v14.4s, v17.4s, v3.4s +mla v18.4S, v19.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v11.4S, v25.s[1] +mul v11.4S, v11.4S,v26.s[1] +sub v19.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v1.4s, v12.4s, v15.4s +mla v11.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v25.s[3] +mul v2.4S, v2.4S,v26.s[3] +sub v3.4s, v8.4s, v11.4s +mla v16.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v11.4s +str q12, [x0, #32] +sqrdmulh v12.4S, v20.4S, v23.s[0] +str q1, [x0, #96] +mul 
v20.4S, v20.4S,v24.s[0] +ldr q1, [x0, #816] +sub v11.4s, v13.4s, v16.4s +ldr q18, [x0, #880] +mla v2.4S, v15.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +str q8, [x0, #160] +sqrdmulh v8.4S, v0.4S, v23.s[1] +str q3, [x0, #224] +mul v0.4S, v0.4S,v24.s[1] +ldr q3, [x0, #944] +sub v16.4s, v10.4s, v2.4s +ldr q15, [x0, #1008] +mla v20.4S, v12.4S, v31.s[0] +add v10.4s, v10.4s, v2.4s +str q13, [x0, #288] +sqrdmulh v13.4S, v9.4S, v23.s[2] +str q11, [x0, #352] +mul v9.4S, v9.4S,v24.s[2] +ldr q11, [x0, #304] +sub v2.4s, v21.4s, v20.4s +ldr q12, [x0, #368] +mla v0.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v20.4s +str q10, [x0, #416] +sqrdmulh v10.4S, v19.4S, v23.s[3] +str q16, [x0, #480] +mul v19.4S, v19.4S,v24.s[3] +ldr q16, [x0, #432] +sub v20.4s, v22.4s, v0.4s +ldr q8, [x0, #496] +mla v9.4S, v13.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +str q21, [x0, #544] +sqrdmulh v21.4S, v1.4S, v29.s[0] +str q2, [x0, #608] +ldr q2, [x0, #560] +mul v1.4S, v1.4S,v30.s[0] +ldr q0, [x0, #624] +sub v13.4s, v17.4s, v9.4s +mla v19.4S, v10.4S, v31.s[0] +add v17.4s, v17.4s, v9.4s +str q22, [x0, #672] +sqrdmulh v22.4S, v18.4S, v29.s[0] +str q20, [x0, #736] +ldr q20, [x0, #688] +mul v18.4S, v18.4S,v30.s[0] +ldr q9, [x0, #752] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v21.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +str q17, [x0, #800] +sqrdmulh v17.4S, v3.4S, v29.s[0] +str q13, [x0, #864] +mul v3.4S, v3.4S,v30.s[0] +ldr q13, [x0, #48] +sub v19.4s, v11.4s, v1.4s +mla v18.4S, v22.4S, v31.s[0] +add v11.4s, v11.4s, v1.4s +str q14, [x0, #928] +sqrdmulh v14.4S, v15.4S, v29.s[0] +str q10, [x0, #992] +mul v15.4S, v15.4S,v30.s[0] +ldr q10, [x0, #112] +sub v1.4s, v12.4s, v18.4s +mla v3.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v29.s[0] +ldr q17, [x0, #176] +mul v2.4S, v2.4S,v30.s[0] +sub v22.4s, v16.4s, v3.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v0.4S, v29.s[0] +ldr q14, [x0, #240] +mul v0.4S, v0.4S,v30.s[0] +sub v21.4s, v8.4s, v15.4s +mla v2.4S, 
v18.4S, v31.s[0] +add v8.4s, v8.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +sub v18.4s, v13.4s, v2.4s +mla v0.4S, v3.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +sub v3.4s, v10.4s, v0.4s +mla v20.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v15.4s, v17.4s, v20.4s +mla v9.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +sub v2.4s, v14.4s, v9.4s +mla v16.4S, v0.4S, v31.s[0] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v11.4S, v29.s[1] +mul v11.4S, v11.4S,v30.s[1] +sub v0.4s, v17.4s, v16.4s +mla v8.4S, v20.4S, v31.s[0] +add v17.4s, v17.4s, v16.4s +sqrdmulh v16.4S, v12.4S, v29.s[1] +mul v12.4S, v12.4S,v30.s[1] +sub v20.4s, v14.4s, v8.4s +mla v11.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v9.4s, v13.4s, v11.4s +mla v12.4S, v16.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v16.4s, v10.4s, v12.4s +mla v22.4S, v8.4S, v31.s[0] +add v10.4s, v10.4s, v12.4s +sqrdmulh v12.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v8.4s, v15.4s, v22.4s +mla v21.4S, v11.4S, v31.s[0] +add v15.4s, v15.4s, v22.4s +sqrdmulh v22.4S, v1.4S, v29.s[2] +mul v1.4S, v1.4S,v30.s[2] +sub v11.4s, v2.4s, v21.4s +mla v19.4S, v12.4S, v31.s[0] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +sub v12.4s, v18.4s, v19.4s +mla v1.4S, v22.4S, v31.s[0] +add v18.4s, v18.4s, v19.4s +sqrdmulh v19.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v22.4s, v3.4s, v1.4s +mla v17.4S, v21.4S, v31.s[0] +add v3.4s, v3.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v21.4s, v13.4s, v17.4s +mla v14.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v17.4s +sqrdmulh v17.4S, v20.4S, v27.s[1] +mul v20.4S, 
v20.4S,v28.s[1] +sub v19.4s, v10.4s, v14.4s +mla v0.4S, v1.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v27.s[2] +mul v15.4S, v15.4S,v28.s[2] +sub v1.4s, v9.4s, v0.4s +mla v20.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v0.4s +sqrdmulh v0.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v17.4s, v16.4s, v20.4s +mla v15.4S, v14.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v14.4s, v18.4s, v15.4s +mla v2.4S, v0.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v27.s[3] +mul v11.4S, v11.4S,v28.s[3] +sub v0.4s, v3.4s, v2.4s +mla v8.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v10.4S, v25.s[0] +mul v10.4S, v10.4S,v26.s[0] +sub v20.4s, v12.4s, v8.4s +mla v11.4S, v15.4S, v31.s[0] +add v12.4s, v12.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v15.4s, v22.4s, v11.4s +mla v10.4S, v2.4S, v31.s[0] +add v22.4s, v22.4s, v11.4s +sqrdmulh v11.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v2.4s, v13.4s, v10.4s +mla v19.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v25.s[3] +mul v17.4S, v17.4S,v26.s[3] +sub v8.4s, v21.4s, v19.4s +mla v16.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v19.4s +str q13, [x0, #48] +sqrdmulh v13.4S, v3.4S, v23.s[0] +str q2, [x0, #112] +mul v3.4S, v3.4S,v24.s[0] +ldr q2, [x0, #768] +sub v19.4s, v9.4s, v16.4s +ldr q11, [x0, #832] +mla v17.4S, v10.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +str q21, [x0, #176] +sqrdmulh v21.4S, v0.4S, v23.s[1] +str q8, [x0, #240] +mul v0.4S, v0.4S,v24.s[1] +ldr q8, [x0, #896] +sub v16.4s, v1.4s, v17.4s +ldr q10, [x0, #960] +mla v3.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +str q9, [x0, #304] +sqrdmulh v9.4S, v22.4S, v23.s[2] +str q19, [x0, #368] +mul v22.4S, v22.4S,v24.s[2] +ldr q19, [x0, #256] +sub v17.4s, v18.4s, v3.4s +ldr q13, [x0, #320] +mla v0.4S, v21.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +str q1, [x0, #432] +sqrdmulh v1.4S, v15.4S, 
v23.s[3] +str q16, [x0, #496] +mul v15.4S, v15.4S,v24.s[3] +ldr q16, [x0, #384] +sub v3.4s, v14.4s, v0.4s +ldr q21, [x0, #448] +mla v22.4S, v9.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +str q18, [x0, #560] +sqrdmulh v18.4S, v2.4S, v29.s[0] +str q17, [x0, #624] +ldr q17, [x0, #512] +mul v2.4S, v2.4S,v30.s[0] +ldr q0, [x0, #576] +sub v9.4s, v12.4s, v22.4s +mla v15.4S, v1.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s +str q14, [x0, #688] +sqrdmulh v14.4S, v11.4S, v29.s[0] +str q3, [x0, #752] +ldr q3, [x0, #640] +mul v11.4S, v11.4S,v30.s[0] +ldr q22, [x0, #704] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v18.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +str q12, [x0, #816] +sqrdmulh v12.4S, v8.4S, v29.s[0] +str q9, [x0, #880] +mul v8.4S, v8.4S,v30.s[0] +ldr q9, [x0, #0] +sub v15.4s, v19.4s, v2.4s +mla v11.4S, v14.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +str q20, [x0, #944] +sqrdmulh v20.4S, v10.4S, v29.s[0] +str q1, [x0, #1008] +mul v10.4S, v10.4S,v30.s[0] +ldr q1, [x0, #64] +sub v2.4s, v13.4s, v11.4s +mla v8.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v11.4s +sqrdmulh v11.4S, v17.4S, v29.s[0] +ldr q12, [x0, #128] +mul v17.4S, v17.4S,v30.s[0] +sub v14.4s, v16.4s, v8.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v0.4S, v29.s[0] +ldr q20, [x0, #192] +mul v0.4S, v0.4S,v30.s[0] +sub v18.4s, v21.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +sub v11.4s, v9.4s, v17.4s +mla v0.4S, v8.4S, v31.s[0] +add v9.4s, v9.4s, v17.4s +sqrdmulh v17.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v8.4s, v1.4s, v0.4s +mla v3.4S, v10.4S, v31.s[0] +add v1.4s, v1.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v10.4s, v12.4s, v3.4s +mla v22.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v17.4s, v20.4s, v22.4s +mla v16.4S, v0.4S, v31.s[0] +add v20.4s, v20.4s, v22.4s +sqrdmulh v22.4S, v19.4S, 
v29.s[1] +mul v19.4S, v19.4S,v30.s[1] +sub v0.4s, v12.4s, v16.4s +mla v21.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v3.4s, v20.4s, v21.4s +mla v19.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v22.4s, v9.4s, v19.4s +mla v13.4S, v16.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v18.4S, v29.s[2] +mul v18.4S, v18.4S,v30.s[2] +sub v16.4s, v1.4s, v13.4s +mla v14.4S, v21.4S, v31.s[0] +add v1.4s, v1.4s, v13.4s +sqrdmulh v13.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v21.4s, v10.4s, v14.4s +mla v18.4S, v19.4S, v31.s[0] +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v19.4s, v17.4s, v18.4s +mla v15.4S, v13.4S, v31.s[0] +add v17.4s, v17.4s, v18.4s +sqrdmulh v18.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v13.4s, v11.4s, v15.4s +mla v2.4S, v14.4S, v31.s[0] +add v11.4s, v11.4s, v15.4s +sqrdmulh v15.4S, v20.4S, v27.s[0] +mul v20.4S, v20.4S,v28.s[0] +sub v14.4s, v8.4s, v2.4s +mla v12.4S, v18.4S, v31.s[0] +add v8.4s, v8.4s, v2.4s +sqrdmulh v2.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v18.4s, v9.4s, v12.4s +mla v20.4S, v15.4S, v31.s[0] +add v9.4s, v9.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v27.s[1] +mul v3.4S, v3.4S,v28.s[1] +sub v15.4s, v1.4s, v20.4s +mla v0.4S, v2.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +sub v2.4s, v22.4s, v0.4s +mla v3.4S, v12.4S, v31.s[0] +add v22.4s, v22.4s, v0.4s +sqrdmulh v0.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v12.4s, v16.4s, v3.4s +mla v10.4S, v20.4S, v31.s[0] +add v16.4s, v16.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v11.4s, v10.4s +mla v17.4S, v0.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +sqrdmulh v10.4S, v19.4S, v27.s[3] +mul v19.4S, v19.4S,v28.s[3] +sub v0.4s, v8.4s, v17.4s +mla v21.4S, v3.4S, 
v31.s[0] +add v8.4s, v8.4s, v17.4s +sqrdmulh v17.4S, v1.4S, v25.s[0] +mul v1.4S, v1.4S,v26.s[0] +sub v3.4s, v13.4s, v21.4s +mla v19.4S, v10.4S, v31.s[0] +add v13.4s, v13.4s, v21.4s +sqrdmulh v21.4S, v15.4S, v25.s[1] +mul v15.4S, v15.4S,v26.s[1] +sub v10.4s, v14.4s, v19.4s +mla v1.4S, v17.4S, v31.s[0] +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v17.4s, v9.4s, v1.4s +mla v15.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +sub v21.4s, v18.4s, v15.4s +mla v16.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +str q9, [x0, #0] +sqrdmulh v9.4S, v8.4S, v23.s[0] +str q17, [x0, #64] +mul v8.4S, v8.4S,v24.s[0] +ldr q17, [x0, #784] +sub v15.4s, v22.4s, v16.4s +ldr q19, [x0, #848] +mla v12.4S, v1.4S, v31.s[0] +add v22.4s, v22.4s, v16.4s +str q18, [x0, #128] +sqrdmulh v18.4S, v0.4S, v23.s[1] +str q21, [x0, #192] +mul v0.4S, v0.4S,v24.s[1] +ldr q21, [x0, #912] +sub v16.4s, v2.4s, v12.4s +ldr q1, [x0, #976] +mla v8.4S, v9.4S, v31.s[0] +add v2.4s, v2.4s, v12.4s +str q22, [x0, #256] +sqrdmulh v22.4S, v14.4S, v23.s[2] +str q15, [x0, #320] +mul v14.4S, v14.4S,v24.s[2] +ldr q15, [x0, #272] +sub v12.4s, v11.4s, v8.4s +ldr q9, [x0, #336] +mla v0.4S, v18.4S, v31.s[0] +add v11.4s, v11.4s, v8.4s +str q2, [x0, #384] +sqrdmulh v2.4S, v10.4S, v23.s[3] +str q16, [x0, #448] +mul v10.4S, v10.4S,v24.s[3] +ldr q16, [x0, #400] +sub v8.4s, v20.4s, v0.4s +ldr q18, [x0, #464] +mla v14.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v0.4s +str q11, [x0, #512] +sqrdmulh v11.4S, v17.4S, v29.s[0] +str q12, [x0, #576] +ldr q12, [x0, #528] +mul v17.4S, v17.4S,v30.s[0] +ldr q0, [x0, #592] +sub v22.4s, v13.4s, v14.4s +mla v10.4S, v2.4S, v31.s[0] +add v13.4s, v13.4s, v14.4s +str q20, [x0, #640] +sqrdmulh v20.4S, v19.4S, v29.s[0] +str q8, [x0, #704] +ldr q8, [x0, #656] +mul v19.4S, v19.4S,v30.s[0] +ldr q14, [x0, #720] +sub v2.4s, v3.4s, v10.4s +mla v17.4S, v11.4S, v31.s[0] +add v3.4s, v3.4s, v10.4s +str 
q13, [x0, #768] +sqrdmulh v13.4S, v21.4S, v29.s[0] +str q22, [x0, #832] +mul v21.4S, v21.4S,v30.s[0] +ldr q22, [x0, #16] +sub v10.4s, v15.4s, v17.4s +mla v19.4S, v20.4S, v31.s[0] +add v15.4s, v15.4s, v17.4s +str q3, [x0, #896] +sqrdmulh v3.4S, v1.4S, v29.s[0] +str q2, [x0, #960] +mul v1.4S, v1.4S,v30.s[0] +ldr q2, [x0, #80] +sub v17.4s, v9.4s, v19.4s +mla v21.4S, v13.4S, v31.s[0] +add v9.4s, v9.4s, v19.4s +sqrdmulh v19.4S, v12.4S, v29.s[0] +ldr q13, [x0, #144] +mul v12.4S, v12.4S,v30.s[0] +sub v20.4s, v16.4s, v21.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v0.4S, v29.s[0] +ldr q3, [x0, #208] +mul v0.4S, v0.4S,v30.s[0] +sub v11.4s, v18.4s, v1.4s +mla v12.4S, v19.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v19.4s, v22.4s, v12.4s +mla v0.4S, v21.4S, v31.s[0] +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v21.4s, v2.4s, v0.4s +mla v8.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[1] +mul v16.4S, v16.4S,v30.s[1] +sub v1.4s, v13.4s, v8.4s +mla v14.4S, v12.4S, v31.s[0] +add v13.4s, v13.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v12.4s, v3.4s, v14.4s +mla v16.4S, v0.4S, v31.s[0] +add v3.4s, v3.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +sub v0.4s, v13.4s, v16.4s +mla v18.4S, v8.4S, v31.s[0] +add v13.4s, v13.4s, v16.4s +sqrdmulh v16.4S, v9.4S, v29.s[1] +mul v9.4S, v9.4S,v30.s[1] +sub v8.4s, v3.4s, v18.4s +mla v15.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v20.4S, v29.s[2] +mul v20.4S, v20.4S,v30.s[2] +sub v14.4s, v22.4s, v15.4s +mla v9.4S, v16.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +sub v16.4s, v2.4s, v9.4s +mla v20.4S, v18.4S, v31.s[0] +add v2.4s, v2.4s, v9.4s +sqrdmulh v9.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v18.4s, v1.4s, v20.4s 
+mla v11.4S, v15.4S, v31.s[0] +add v1.4s, v1.4s, v20.4s +sqrdmulh v20.4S, v17.4S, v29.s[2] +mul v17.4S, v17.4S,v30.s[2] +sub v15.4s, v12.4s, v11.4s +mla v10.4S, v9.4S, v31.s[0] +add v12.4s, v12.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v27.s[0] +mul v13.4S, v13.4S,v28.s[0] +sub v9.4s, v19.4s, v10.4s +mla v17.4S, v20.4S, v31.s[0] +add v19.4s, v19.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v27.s[0] +mul v3.4S, v3.4S,v28.s[0] +sub v20.4s, v21.4s, v17.4s +mla v13.4S, v11.4S, v31.s[0] +add v21.4s, v21.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v27.s[1] +mul v0.4S, v0.4S,v28.s[1] +sub v11.4s, v22.4s, v13.4s +mla v3.4S, v10.4S, v31.s[0] +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v10.4s, v2.4s, v3.4s +mla v0.4S, v17.4S, v31.s[0] +add v2.4s, v2.4s, v3.4s +sqrdmulh v3.4S, v1.4S, v27.s[2] +mul v1.4S, v1.4S,v28.s[2] +sub v17.4s, v14.4s, v0.4s +mla v8.4S, v13.4S, v31.s[0] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[2] +mul v12.4S, v12.4S,v28.s[2] +sub v13.4s, v16.4s, v8.4s +mla v1.4S, v3.4S, v31.s[0] +add v16.4s, v16.4s, v8.4s +sqrdmulh v8.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v3.4s, v19.4s, v1.4s +mla v12.4S, v0.4S, v31.s[0] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v0.4s, v21.4s, v12.4s +mla v18.4S, v8.4S, v31.s[0] +add v21.4s, v21.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v9.4s, v18.4s +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v18.4s +sqrdmulh v18.4S, v10.4S, v25.s[1] +mul v10.4S, v10.4S,v26.s[1] +sub v1.4s, v20.4s, v15.4s +mla v2.4S, v12.4S, v31.s[0] +add v20.4s, v20.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v25.s[2] +mul v16.4S, v16.4S,v26.s[2] +sub v12.4s, v22.4s, v2.4s +mla v10.4S, v18.4S, v31.s[0] +add v22.4s, v22.4s, v2.4s +sqrdmulh v2.4S, v13.4S, v25.s[3] +mul v13.4S, v13.4S,v26.s[3] +sub v18.4s, v11.4s, v10.4s +mla v16.4S, v15.4S, v31.s[0] +add v11.4s, v11.4s, v10.4s +str q22, [x0, #16] +sqrdmulh v22.4S, 
v21.4S, v23.s[0] +str q12, [x0, #80] +mul v21.4S, v21.4S,v24.s[0] +sub v12.4s, v14.4s, v16.4s +mla v13.4S, v2.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +str q11, [x0, #144] +sqrdmulh v11.4S, v0.4S, v23.s[1] +str q18, [x0, #208] +mul v0.4S, v0.4S,v24.s[1] +sub v18.4s, v17.4s, v13.4s +mla v21.4S, v22.4S, v31.s[0] +add v17.4s, v17.4s, v13.4s +str q14, [x0, #272] +sqrdmulh v14.4S, v20.4S, v23.s[2] +str q12, [x0, #336] +mul v20.4S, v20.4S,v24.s[2] +sub v12.4s, v19.4s, v21.4s +mla v0.4S, v11.4S, v31.s[0] +add v19.4s, v19.4s, v21.4s +str q17, [x0, #400] +sqrdmulh v17.4S, v1.4S, v23.s[3] +str q18, [x0, #464] +mul v1.4S, v1.4S,v24.s[3] +sub v18.4s, v3.4s, v0.4s +mla v20.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v0.4s +str q19, [x0, #528] +str q12, [x0, #592] +sub v12.4s, v9.4s, v20.4s +mla v1.4S, v17.4S, v31.s[0] +add v9.4s, v9.4s, v20.4s +str q3, [x0, #656] +str q18, [x0, #720] +sub v18.4s, v8.4s, v1.4s +add v8.4s, v8.4s, v1.4s +str q9, [x0, #784] +str q12, [x0, #848] +str q8, [x0, #912] +str q18, [x0, #976] +ldr q4, [x0, #224] +ldr q5, [x0, #160] +ldr q6, [x0, #32] +ldr q7, [x17, #+128] +ldr q15, [x17, #+144] +sqrdmulh v10.4S, v6.4S, v15.s[0] +mul v6.4S, v6.4S,v7.s[0] +ldr q2, [x0, #48] +sqrdmulh v16.4S, v2.4S, v15.s[0] +mul v2.4S, v2.4S,v7.s[0] +ldr q22, [x17, #+160] +ldr q13, [x17, #+176] +ldr q11, [x0, #96] +sqrdmulh v21.4S, v11.4S, v13.s[0] +mul v11.4S, v11.4S,v22.s[0] +ldr q14, [x0, #112] +sqrdmulh v0.4S, v14.4S, v13.s[0] +mul v14.4S, v14.4S,v22.s[0] +ldr q19, [x17, #+192] +ldr q17, [x17, #+208] +mla v6.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v5.4S, v17.s[0] +ldr q20, [x0, #176] +mla v2.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v20.4S, v17.s[0] +ldr q3, [x17, #+224] +ldr q1, [x17, #+240] +mla v11.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v4.4S, v1.s[0] +ldr q9, [x0, #240] +mla v14.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v9.4S, v1.s[0] +ldr q12, [x0, #128] +ldr q8, [x0, #0] +mul v5.4S, v5.4S,v19.s[0] +sub v18.4s, v8.4s, v6.4s +mul v20.4S, v20.4S,v19.s[0] +add v8.4s, v8.4s, v6.4s 
+ldr q6, [x0, #144] +ldr q30, [x0, #16] +mla v5.4S, v10.4S, v31.s[0] +sub v10.4s, v30.4s, v2.4s +mla v20.4S, v16.4S, v31.s[0] +add v30.4s, v30.4s, v2.4s +ldr q2, [x0, #192] +ldr q16, [x0, #64] +mul v4.4S, v4.4S,v3.s[0] +sub v29.4s, v16.4s, v11.4s +mul v9.4S, v9.4S,v3.s[0] +add v16.4s, v16.4s, v11.4s +ldr q11, [x0, #208] +ldr q28, [x0, #80] +mla v4.4S, v21.4S, v31.s[0] +mla v9.4S, v0.4S, v31.s[0] +sub v0.4s, v28.4s, v14.4s +sqrdmulh v21.4S, v30.4S, v15.s[1] +mul v30.4S, v30.4S,v7.s[1] +add v28.4s, v28.4s, v14.4s +sqrdmulh v14.4S, v10.4S, v15.s[2] +sub v27.4s, v12.4s, v5.4s +mul v10.4S, v10.4S,v7.s[2] +add v12.4s, v12.4s, v5.4s +sqrdmulh v15.4S, v28.4S, v13.s[1] +sub v7.4s, v6.4s, v20.4s +mul v28.4S, v28.4S,v22.s[1] +add v6.4s, v6.4s, v20.4s +sqrdmulh v20.4S, v0.4S, v13.s[2] +sub v5.4s, v2.4s, v4.4s +mul v0.4S, v0.4S,v22.s[2] +add v2.4s, v2.4s, v4.4s +mla v30.4S, v21.4S, v31.s[0] +sub v21.4s, v11.4s, v9.4s +ldr q13, [x0, #480] +sqrdmulh v22.4S, v6.4S, v17.s[1] +add v11.4s, v11.4s, v9.4s +mla v10.4S, v14.4S, v31.s[0] +ldr q14, [x0, #416] +sqrdmulh v9.4S, v7.4S, v17.s[2] +mla v28.4S, v15.4S, v31.s[0] +ldr q15, [x0, #288] +sqrdmulh v4.4S, v11.4S, v1.s[1] +mla v0.4S, v20.4S, v31.s[0] +ldr q20, [x17, #+256] +sqrdmulh v26.4S, v21.4S, v1.s[2] +ldr q25, [x17, #+272] +mul v6.4S, v6.4S,v19.s[1] +sub v24.4s, v8.4s, v30.4s +str q24, [x0, #16] +mul v7.4S, v7.4S,v19.s[2] +add v8.4s, v8.4s, v30.4s +str q8, [x0, #0] +mla v6.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v10.4s +str q22, [x0, #48] +mla v7.4S, v9.4S, v31.s[0] +add v18.4s, v18.4s, v10.4s +str q18, [x0, #32] +mul v11.4S, v11.4S,v3.s[1] +sub v17.4s, v16.4s, v28.4s +str q17, [x0, #80] +mul v21.4S, v21.4S,v3.s[2] +add v16.4s, v16.4s, v28.4s +str q16, [x0, #64] +mla v11.4S, v4.4S, v31.s[0] +sub v4.4s, v29.4s, v0.4s +str q4, [x0, #112] +mla v21.4S, v26.4S, v31.s[0] +add v29.4s, v29.4s, v0.4s +str q29, [x0, #96] +sqrdmulh v1.4S, v15.4S, v25.s[0] +sub v3.4s, v12.4s, v6.4s +mul v15.4S, v15.4S,v20.s[0] +str q3, [x0, #144] +ldr q3, 
[x0, #304] +sqrdmulh v29.4S, v3.4S, v25.s[0] +add v12.4s, v12.4s, v6.4s +mul v3.4S, v3.4S,v20.s[0] +str q12, [x0, #128] +ldr q12, [x17, #+288] +ldr q6, [x17, #+304] +ldr q0, [x0, #352] +sqrdmulh v26.4S, v0.4S, v6.s[0] +sub v4.4s, v27.4s, v7.4s +mul v0.4S, v0.4S,v12.s[0] +str q4, [x0, #176] +ldr q4, [x0, #368] +sqrdmulh v16.4S, v4.4S, v6.s[0] +add v27.4s, v27.4s, v7.4s +mul v4.4S, v4.4S,v12.s[0] +str q27, [x0, #160] +ldr q27, [x17, #+320] +ldr q7, [x17, #+336] +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v2.4s, v11.4s +sqrdmulh v28.4S, v14.4S, v7.s[0] +str q1, [x0, #208] +ldr q1, [x0, #432] +mla v3.4S, v29.4S, v31.s[0] +add v2.4s, v2.4s, v11.4s +sqrdmulh v11.4S, v1.4S, v7.s[0] +str q2, [x0, #192] +ldr q2, [x17, #+352] +ldr q29, [x17, #+368] +mla v0.4S, v26.4S, v31.s[0] +sub v26.4s, v5.4s, v21.4s +sqrdmulh v17.4S, v13.4S, v29.s[0] +str q26, [x0, #240] +ldr q26, [x0, #496] +mla v4.4S, v16.4S, v31.s[0] +add v5.4s, v5.4s, v21.4s +sqrdmulh v21.4S, v26.4S, v29.s[0] +str q5, [x0, #224] +ldr q5, [x0, #384] +ldr q16, [x0, #256] +mul v14.4S, v14.4S,v27.s[0] +sub v19.4s, v16.4s, v15.4s +mul v1.4S, v1.4S,v27.s[0] +add v16.4s, v16.4s, v15.4s +ldr q15, [x0, #400] +ldr q18, [x0, #272] +mla v14.4S, v28.4S, v31.s[0] +sub v28.4s, v18.4s, v3.4s +mla v1.4S, v11.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +ldr q3, [x0, #448] +ldr q11, [x0, #320] +mul v13.4S, v13.4S,v2.s[0] +sub v10.4s, v11.4s, v0.4s +mul v26.4S, v26.4S,v2.s[0] +add v11.4s, v11.4s, v0.4s +ldr q0, [x0, #464] +ldr q9, [x0, #336] +mla v13.4S, v17.4S, v31.s[0] +mla v26.4S, v21.4S, v31.s[0] +sub v21.4s, v9.4s, v4.4s +sqrdmulh v17.4S, v18.4S, v25.s[1] +mul v18.4S, v18.4S,v20.s[1] +add v9.4s, v9.4s, v4.4s +sqrdmulh v4.4S, v28.4S, v25.s[2] +sub v22.4s, v5.4s, v14.4s +mul v28.4S, v28.4S,v20.s[2] +add v5.4s, v5.4s, v14.4s +sqrdmulh v25.4S, v9.4S, v6.s[1] +sub v20.4s, v15.4s, v1.4s +mul v9.4S, v9.4S,v12.s[1] +add v15.4s, v15.4s, v1.4s +sqrdmulh v1.4S, v21.4S, v6.s[2] +sub v14.4s, v3.4s, v13.4s +mul v21.4S, v21.4S,v12.s[2] +add v3.4s, 
v3.4s, v13.4s +mla v18.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v26.4s +ldr q6, [x0, #736] +sqrdmulh v12.4S, v15.4S, v7.s[1] +add v0.4s, v0.4s, v26.4s +mla v28.4S, v4.4S, v31.s[0] +ldr q4, [x0, #672] +sqrdmulh v26.4S, v20.4S, v7.s[2] +mla v9.4S, v25.4S, v31.s[0] +ldr q25, [x0, #544] +sqrdmulh v13.4S, v0.4S, v29.s[1] +mla v21.4S, v1.4S, v31.s[0] +ldr q1, [x17, #+384] +sqrdmulh v8.4S, v17.4S, v29.s[2] +ldr q30, [x17, #+400] +mul v15.4S, v15.4S,v27.s[1] +sub v24.4s, v16.4s, v18.4s +str q24, [x0, #272] +mul v20.4S, v20.4S,v27.s[2] +add v16.4s, v16.4s, v18.4s +str q16, [x0, #256] +mla v15.4S, v12.4S, v31.s[0] +sub v12.4s, v19.4s, v28.4s +str q12, [x0, #304] +mla v20.4S, v26.4S, v31.s[0] +add v19.4s, v19.4s, v28.4s +str q19, [x0, #288] +mul v0.4S, v0.4S,v2.s[1] +sub v7.4s, v11.4s, v9.4s +str q7, [x0, #336] +mul v17.4S, v17.4S,v2.s[2] +add v11.4s, v11.4s, v9.4s +str q11, [x0, #320] +mla v0.4S, v13.4S, v31.s[0] +sub v13.4s, v10.4s, v21.4s +str q13, [x0, #368] +mla v17.4S, v8.4S, v31.s[0] +add v10.4s, v10.4s, v21.4s +str q10, [x0, #352] +sqrdmulh v29.4S, v25.4S, v30.s[0] +sub v2.4s, v5.4s, v15.4s +mul v25.4S, v25.4S,v1.s[0] +str q2, [x0, #400] +ldr q2, [x0, #560] +sqrdmulh v10.4S, v2.4S, v30.s[0] +add v5.4s, v5.4s, v15.4s +mul v2.4S, v2.4S,v1.s[0] +str q5, [x0, #384] +ldr q5, [x17, #+416] +ldr q15, [x17, #+432] +ldr q21, [x0, #608] +sqrdmulh v8.4S, v21.4S, v15.s[0] +sub v13.4s, v22.4s, v20.4s +mul v21.4S, v21.4S,v5.s[0] +str q13, [x0, #432] +ldr q13, [x0, #624] +sqrdmulh v11.4S, v13.4S, v15.s[0] +add v22.4s, v22.4s, v20.4s +mul v13.4S, v13.4S,v5.s[0] +str q22, [x0, #416] +ldr q22, [x17, #+448] +ldr q20, [x17, #+464] +mla v25.4S, v29.4S, v31.s[0] +sub v29.4s, v3.4s, v0.4s +sqrdmulh v9.4S, v4.4S, v20.s[0] +str q29, [x0, #464] +ldr q29, [x0, #688] +mla v2.4S, v10.4S, v31.s[0] +add v3.4s, v3.4s, v0.4s +sqrdmulh v0.4S, v29.4S, v20.s[0] +str q3, [x0, #448] +ldr q3, [x17, #+480] +ldr q10, [x17, #+496] +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v14.4s, v17.4s +sqrdmulh v7.4S, v6.4S, 
v10.s[0] +str q8, [x0, #496] +ldr q8, [x0, #752] +mla v13.4S, v11.4S, v31.s[0] +add v14.4s, v14.4s, v17.4s +sqrdmulh v17.4S, v8.4S, v10.s[0] +str q14, [x0, #480] +ldr q14, [x0, #640] +ldr q11, [x0, #512] +mul v4.4S, v4.4S,v22.s[0] +sub v27.4s, v11.4s, v25.4s +mul v29.4S, v29.4S,v22.s[0] +add v11.4s, v11.4s, v25.4s +ldr q25, [x0, #656] +ldr q19, [x0, #528] +mla v4.4S, v9.4S, v31.s[0] +sub v9.4s, v19.4s, v2.4s +mla v29.4S, v0.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +ldr q2, [x0, #704] +ldr q0, [x0, #576] +mul v6.4S, v6.4S,v3.s[0] +sub v28.4s, v0.4s, v21.4s +mul v8.4S, v8.4S,v3.s[0] +add v0.4s, v0.4s, v21.4s +ldr q21, [x0, #720] +ldr q26, [x0, #592] +mla v6.4S, v7.4S, v31.s[0] +mla v8.4S, v17.4S, v31.s[0] +sub v17.4s, v26.4s, v13.4s +sqrdmulh v7.4S, v19.4S, v30.s[1] +mul v19.4S, v19.4S,v1.s[1] +add v26.4s, v26.4s, v13.4s +sqrdmulh v13.4S, v9.4S, v30.s[2] +sub v12.4s, v14.4s, v4.4s +mul v9.4S, v9.4S,v1.s[2] +add v14.4s, v14.4s, v4.4s +sqrdmulh v30.4S, v26.4S, v15.s[1] +sub v1.4s, v25.4s, v29.4s +mul v26.4S, v26.4S,v5.s[1] +add v25.4s, v25.4s, v29.4s +sqrdmulh v29.4S, v17.4S, v15.s[2] +sub v4.4s, v2.4s, v6.4s +mul v17.4S, v17.4S,v5.s[2] +add v2.4s, v2.4s, v6.4s +mla v19.4S, v7.4S, v31.s[0] +sub v7.4s, v21.4s, v8.4s +ldr q15, [x0, #992] +sqrdmulh v5.4S, v25.4S, v20.s[1] +add v21.4s, v21.4s, v8.4s +mla v9.4S, v13.4S, v31.s[0] +ldr q13, [x0, #928] +sqrdmulh v8.4S, v1.4S, v20.s[2] +mla v26.4S, v30.4S, v31.s[0] +ldr q30, [x0, #800] +sqrdmulh v6.4S, v21.4S, v10.s[1] +mla v17.4S, v29.4S, v31.s[0] +ldr q29, [x17, #+512] +sqrdmulh v16.4S, v7.4S, v10.s[2] +ldr q18, [x17, #+528] +mul v25.4S, v25.4S,v22.s[1] +sub v24.4s, v11.4s, v19.4s +str q24, [x0, #528] +mul v1.4S, v1.4S,v22.s[2] +add v11.4s, v11.4s, v19.4s +str q11, [x0, #512] +mla v25.4S, v5.4S, v31.s[0] +sub v5.4s, v27.4s, v9.4s +str q5, [x0, #560] +mla v1.4S, v8.4S, v31.s[0] +add v27.4s, v27.4s, v9.4s +str q27, [x0, #544] +mul v21.4S, v21.4S,v3.s[1] +sub v20.4s, v0.4s, v26.4s +str q20, [x0, #592] +mul v7.4S, v7.4S,v3.s[2] 
+add v0.4s, v0.4s, v26.4s +str q0, [x0, #576] +mla v21.4S, v6.4S, v31.s[0] +sub v6.4s, v28.4s, v17.4s +str q6, [x0, #624] +mla v7.4S, v16.4S, v31.s[0] +add v28.4s, v28.4s, v17.4s +str q28, [x0, #608] +sqrdmulh v10.4S, v30.4S, v18.s[0] +sub v3.4s, v14.4s, v25.4s +mul v30.4S, v30.4S,v29.s[0] +str q3, [x0, #656] +ldr q3, [x0, #816] +sqrdmulh v28.4S, v3.4S, v18.s[0] +add v14.4s, v14.4s, v25.4s +mul v3.4S, v3.4S,v29.s[0] +str q14, [x0, #640] +ldr q14, [x17, #+544] +ldr q25, [x17, #+560] +ldr q17, [x0, #864] +sqrdmulh v16.4S, v17.4S, v25.s[0] +sub v6.4s, v12.4s, v1.4s +mul v17.4S, v17.4S,v14.s[0] +str q6, [x0, #688] +ldr q6, [x0, #880] +sqrdmulh v0.4S, v6.4S, v25.s[0] +add v12.4s, v12.4s, v1.4s +mul v6.4S, v6.4S,v14.s[0] +str q12, [x0, #672] +ldr q12, [x17, #+576] +ldr q1, [x17, #+592] +mla v30.4S, v10.4S, v31.s[0] +sub v10.4s, v2.4s, v21.4s +sqrdmulh v26.4S, v13.4S, v1.s[0] +str q10, [x0, #720] +ldr q10, [x0, #944] +mla v3.4S, v28.4S, v31.s[0] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v10.4S, v1.s[0] +str q2, [x0, #704] +ldr q2, [x17, #+608] +ldr q28, [x17, #+624] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v4.4s, v7.4s +sqrdmulh v20.4S, v15.4S, v28.s[0] +str q16, [x0, #752] +ldr q16, [x0, #1008] +mla v6.4S, v0.4S, v31.s[0] +add v4.4s, v4.4s, v7.4s +sqrdmulh v7.4S, v16.4S, v28.s[0] +str q4, [x0, #736] +ldr q4, [x0, #896] +ldr q0, [x0, #768] +mul v13.4S, v13.4S,v12.s[0] +sub v22.4s, v0.4s, v30.4s +mul v10.4S, v10.4S,v12.s[0] +add v0.4s, v0.4s, v30.4s +ldr q30, [x0, #912] +ldr q27, [x0, #784] +mla v13.4S, v26.4S, v31.s[0] +sub v26.4s, v27.4s, v3.4s +mla v10.4S, v21.4S, v31.s[0] +add v27.4s, v27.4s, v3.4s +ldr q3, [x0, #960] +ldr q21, [x0, #832] +mul v15.4S, v15.4S,v2.s[0] +sub v9.4s, v21.4s, v17.4s +mul v16.4S, v16.4S,v2.s[0] +add v21.4s, v21.4s, v17.4s +ldr q17, [x0, #976] +ldr q8, [x0, #848] +mla v15.4S, v20.4S, v31.s[0] +mla v16.4S, v7.4S, v31.s[0] +sub v7.4s, v8.4s, v6.4s +sqrdmulh v20.4S, v27.4S, v18.s[1] +mul v27.4S, v27.4S,v29.s[1] +add v8.4s, v8.4s, v6.4s 
+sqrdmulh v6.4S, v26.4S, v18.s[2] +sub v5.4s, v4.4s, v13.4s +mul v26.4S, v26.4S,v29.s[2] +add v4.4s, v4.4s, v13.4s +sqrdmulh v18.4S, v8.4S, v25.s[1] +sub v29.4s, v30.4s, v10.4s +mul v8.4S, v8.4S,v14.s[1] +add v30.4s, v30.4s, v10.4s +sqrdmulh v10.4S, v7.4S, v25.s[2] +sub v13.4s, v3.4s, v15.4s +mul v7.4S, v7.4S,v14.s[2] +add v3.4s, v3.4s, v15.4s +mla v27.4S, v20.4S, v31.s[0] +sub v20.4s, v17.4s, v16.4s +sqrdmulh v25.4S, v30.4S, v1.s[1] +add v17.4s, v17.4s, v16.4s +mla v26.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v29.4S, v1.s[2] +mla v8.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v17.4S, v28.s[1] +mla v7.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v20.4S, v28.s[2] +mul v30.4S, v30.4S,v12.s[1] +sub v16.4s, v0.4s, v27.4s +str q16, [x0, #784] +mul v29.4S, v29.4S,v12.s[2] +add v0.4s, v0.4s, v27.4s +str q0, [x0, #768] +mla v30.4S, v25.4S, v31.s[0] +sub v25.4s, v22.4s, v26.4s +str q25, [x0, #816] +mla v29.4S, v6.4S, v31.s[0] +add v22.4s, v22.4s, v26.4s +str q22, [x0, #800] +mul v17.4S, v17.4S,v2.s[1] +sub v1.4s, v21.4s, v8.4s +str q1, [x0, #848] +mul v20.4S, v20.4S,v2.s[2] +add v21.4s, v21.4s, v8.4s +str q21, [x0, #832] +mla v17.4S, v18.4S, v31.s[0] +sub v18.4s, v9.4s, v7.4s +str q18, [x0, #880] +mla v20.4S, v10.4S, v31.s[0] +add v9.4s, v9.4s, v7.4s +str q9, [x0, #864] +sub v28.4s, v4.4s, v30.4s +str q28, [x0, #912] +add v4.4s, v4.4s, v30.4s +str q4, [x0, #896] +sub v4.4s, v5.4s, v29.4s +str q4, [x0, #944] +add v5.4s, v5.4s, v29.4s +str q5, [x0, #928] +sub v5.4s, v3.4s, v17.4s +str q5, [x0, #976] +add v3.4s, v3.4s, v17.4s +str q3, [x0, #960] +sub v3.4s, v13.4s, v20.4s +str q3, [x0, #1008] +add v13.4s, v13.4s, v20.4s +str q13, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, 
#(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_0.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_0.s new file mode 100644 index 0000000..982af55 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_0.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// +#include +modulus: // 4-word vector loaded into v31; the visible code only references lane 0 (v31.s[0], as the mla multiplier) +.word -33556993 // -q, the negated modulus (q = 33556993) +.word 0 +.word 0 +.word 0 +.align 6 // NOTE(review): presumably 2^6 = 64-byte alignment for the table below; confirm for the target assembler +roots_merged: // Twiddle table read as 16-byte vectors: a vector of multipliers (mul operand) is followed by a vector of companions (sqrdmulh operand), each companion approximately multiplier * 2^31 / q; "Layer None" words are zero filler, presumably for lanes that are never referenced +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global
ntt_u32_incomplete_neon_asm_var_4_2_7_z4_0 +.global _ntt_u32_incomplete_neon_asm_var_4_2_7_z4_0 +ntt_u32_incomplete_neon_asm_var_4_2_7_z4_0: +_ntt_u32_incomplete_neon_asm_var_4_2_7_z4_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #928] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #992] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q18, [x0, #800] +sqrdmulh v17.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +ldr q16, [x0, #864] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v22.4S, v21.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +mla v18.4S, v17.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +ldr q3, [x0, #544] +sqrdmulh v17.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q19, [x0, #608] +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q2, [x0, #672] +ldr q1, [x0, #416] +sqrdmulh v0.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v15.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #736] +ldr q14, [x0, #480] +sqrdmulh v13.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v12.4s, v14.4s, v20.4s +add v14.4s, v14.4s, v20.4s +ldr q20, [x0, #288] +mla v3.4S, v17.4S, v31.s[0] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v18.4s +mla v2.4S, v0.4S, v31.s[0] +mla v22.4S, v13.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +ldr q18, [x0, #352] +sqrdmulh v13.4S, v1.4S, 
v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v0.4s, v18.4s, v16.4s +sqrdmulh v17.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +add v18.4s, v18.4s, v16.4s +ldr q16, [x0, #32] +sqrdmulh v11.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v10.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #96] +sqrdmulh v9.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v8.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #160] +mla v1.4S, v13.4S, v31.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v2.4s +mla v20.4S, v11.4S, v31.s[0] +mla v18.4S, v9.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +ldr q2, [x0, #224] +sqrdmulh v9.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v11.4s, v2.4s, v22.4s +sqrdmulh v13.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v7.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +sub v6.4s, v2.4s, v14.4s +add v2.4s, v2.4s, v14.4s +mla v15.4S, v9.4S, v31.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v20.4s +mla v21.4S, v22.4S, v31.s[0] +mla v0.4S, v1.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +sub v1.4s, v3.4s, v18.4s +sqrdmulh v22.4S, v6.4S, v27.s[1] +mul v6.4S, v6.4S,v28.s[1] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v27.s[0] +mul v19.4S, v19.4S,v28.s[0] +sub v9.4s, v17.4s, v15.4s +add v17.4s, v17.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v14.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +mla v7.4S, v20.4S, v31.s[0] +mla v6.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v21.4s +mla v19.4S, v18.4S, v31.s[0] +mla v2.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v15.4s, v8.4s, v0.4s +sqrdmulh v18.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +add v8.4s, v8.4s, v0.4s +sqrdmulh 
v0.4S, v9.4S, v27.s[3] +mul v9.4S, v9.4S,v28.s[3] +sub v20.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[3] +mul v14.4S, v14.4S,v28.s[3] +sub v12.4s, v1.4s, v6.4s +add v1.4s, v1.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v16.4s, v19.4s +mla v9.4S, v0.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v25.s[2] +mul v1.4S, v1.4S,v26.s[2] +sub v7.4s, v3.4s, v2.4s +sqrdmulh v0.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v7.4S, v25.s[1] +mul v7.4S, v7.4S,v26.s[1] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v25.s[0] +mul v3.4S, v3.4S,v26.s[0] +sub v6.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v1.4S, v19.4S, v31.s[0] +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v9.4s +mla v7.4S, v2.4S, v31.s[0] +mla v3.4S, v17.4S, v31.s[0] +add v22.4s, v22.4s, v9.4s +sqrdmulh v9.4S, v8.4S, v23.s[0] +mul v8.4S, v8.4S,v24.s[0] +sub v17.4s, v15.4s, v14.4s +sqrdmulh v2.4S, v6.4S, v23.s[1] +mul v6.4S, v6.4S,v24.s[1] +add v15.4s, v15.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v23.s[2] +mul v15.4S, v15.4S,v24.s[2] +sub v19.4s, v13.4s, v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +sub v11.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +mla v8.4S, v9.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v18.4s, v7.4s +str q13, [x0, #288] +mla v15.4S, v14.4S, v31.s[0] +mla v17.4S, v1.4S, v31.s[0] +add v18.4s, v18.4s, v7.4s +str q19, [x0, #352] +ldr q19, [x0, #944] +sqrdmulh v7.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v1.4s, v16.4s, v3.4s +str q20, [x0, #416] +ldr q20, [x0, #1008] +sqrdmulh v14.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v3.4s +str q11, [x0, #480] +ldr q11, [x0, #816] +sqrdmulh v3.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +sub v13.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +ldr q8, 
[x0, #880] +sqrdmulh v9.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v12.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +mla v19.4S, v7.4S, v31.s[0] +mla v20.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v15.4s +str q18, [x0, #160] +mla v11.4S, v3.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +str q2, [x0, #224] +ldr q2, [x0, #560] +sqrdmulh v15.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v9.4s, v0.4s, v17.4s +str q16, [x0, #32] +ldr q16, [x0, #624] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +add v0.4s, v0.4s, v17.4s +str q1, [x0, #96] +ldr q1, [x0, #688] +ldr q17, [x0, #432] +sqrdmulh v18.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +sub v7.4s, v17.4s, v19.4s +add v17.4s, v17.4s, v19.4s +ldr q19, [x0, #752] +ldr q6, [x0, #496] +sqrdmulh v5.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v4.4s, v6.4s, v20.4s +add v6.4s, v6.4s, v20.4s +ldr q20, [x0, #304] +mla v2.4S, v15.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v11.4s +str q10, [x0, #544] +mla v1.4S, v18.4S, v31.s[0] +mla v19.4S, v5.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q13, [x0, #608] +ldr q13, [x0, #368] +sqrdmulh v11.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v5.4s, v13.4s, v8.4s +str q21, [x0, #672] +sqrdmulh v21.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +add v13.4s, v13.4s, v8.4s +str q12, [x0, #736] +ldr q12, [x0, #48] +sqrdmulh v8.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v18.4s, v12.4s, v2.4s +add v12.4s, v12.4s, v2.4s +ldr q2, [x0, #112] +sqrdmulh v10.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v15.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +ldr q16, [x0, #176] +mla v17.4S, v11.4S, v31.s[0] +mla v6.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v1.4s +str q22, [x0, #800] +mla v20.4S, v8.4S, v31.s[0] +mla v13.4S, v10.4S, v31.s[0] +add v16.4s, v16.4s, v1.4s +str q14, [x0, #864] +ldr q14, [x0, #240] +sqrdmulh v1.4S, v7.4S, v29.s[2] +mul v7.4S, v7.4S,v30.s[2] +sub v10.4s, v14.4s, v19.4s 
+str q0, [x0, #928] +sqrdmulh v0.4S, v4.4S, v29.s[2] +mul v4.4S, v4.4S,v30.s[2] +add v14.4s, v14.4s, v19.4s +str q9, [x0, #992] +sqrdmulh v9.4S, v3.4S, v29.s[2] +mul v3.4S, v3.4S,v30.s[2] +sub v19.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v29.s[2] +mul v5.4S, v5.4S,v30.s[2] +sub v8.4s, v14.4s, v6.4s +add v14.4s, v14.4s, v6.4s +mla v7.4S, v1.4S, v31.s[0] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v20.4s +mla v3.4S, v9.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +sub v17.4s, v2.4s, v13.4s +sqrdmulh v9.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +add v2.4s, v2.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v27.s[0] +mul v16.4S, v16.4S,v28.s[0] +sub v1.4s, v21.4s, v7.4s +add v21.4s, v21.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v6.4s, v10.4s, v4.4s +add v10.4s, v10.4s, v4.4s +mla v19.4S, v20.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v3.4s +mla v16.4S, v13.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +sub v7.4s, v15.4s, v5.4s +sqrdmulh v13.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +add v15.4s, v15.4s, v5.4s +sqrdmulh v5.4S, v1.4S, v27.s[3] +mul v1.4S, v1.4S,v28.s[3] +sub v20.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[3] +mul v6.4S, v6.4S,v28.s[3] +sub v4.4s, v17.4s, v8.4s +add v17.4s, v17.4s, v8.4s +mla v21.4S, v3.4S, v31.s[0] +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v16.4s +mla v1.4S, v5.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v25.s[2] +mul v17.4S, v17.4S,v26.s[2] +sub v19.4s, v2.4s, v14.4s +sqrdmulh v5.4S, v4.4S, v25.s[3] +mul v4.4S, v4.4S,v26.s[3] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v3.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, 
v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +mla v17.4S, v16.4S, v31.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v9.4s, v1.4s +mla v19.4S, v14.4S, v31.s[0] +mla v2.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v23.s[0] +mul v15.4S, v15.4S,v24.s[0] +sub v21.4s, v7.4s, v6.4s +sqrdmulh v14.4S, v8.4S, v23.s[1] +mul v8.4S, v8.4S,v24.s[1] +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v7.4S, v23.s[2] +mul v7.4S, v7.4S,v24.s[2] +sub v16.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v23.s[3] +mul v21.4S, v21.4S,v24.s[3] +sub v10.4s, v20.4s, v4.4s +add v20.4s, v20.4s, v4.4s +mla v15.4S, v1.4S, v31.s[0] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v19.4s +str q0, [x0, #304] +mla v7.4S, v6.4S, v31.s[0] +mla v21.4S, v17.4S, v31.s[0] +add v13.4s, v13.4s, v19.4s +str q16, [x0, #368] +ldr q16, [x0, #896] +sqrdmulh v19.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v17.4s, v12.4s, v2.4s +str q20, [x0, #432] +ldr q20, [x0, #960] +sqrdmulh v6.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v12.4s, v12.4s, v2.4s +str q10, [x0, #496] +ldr q10, [x0, #768] +sqrdmulh v2.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +sub v0.4s, v18.4s, v15.4s +add v18.4s, v18.4s, v15.4s +ldr q15, [x0, #832] +sqrdmulh v1.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +sub v4.4s, v3.4s, v8.4s +add v3.4s, v3.4s, v8.4s +mla v16.4S, v19.4S, v31.s[0] +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v7.4s +str q13, [x0, #176] +mla v10.4S, v2.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v7.4s +str q14, [x0, #240] +ldr q14, [x0, #512] +sqrdmulh v7.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v1.4s, v5.4s, v21.4s +str q12, [x0, #48] +ldr q12, [x0, #576] +sqrdmulh v2.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +add v5.4s, v5.4s, v21.4s +str q17, [x0, #112] +ldr q17, [x0, #640] +ldr q21, [x0, #384] +sqrdmulh v13.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] 
+sub v19.4s, v21.4s, v16.4s +add v21.4s, v21.4s, v16.4s +ldr q16, [x0, #704] +ldr q8, [x0, #448] +sqrdmulh v22.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v11.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +ldr q20, [x0, #256] +mla v14.4S, v7.4S, v31.s[0] +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v10.4s +str q18, [x0, #560] +mla v17.4S, v13.4S, v31.s[0] +mla v16.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +str q0, [x0, #624] +ldr q0, [x0, #320] +sqrdmulh v10.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v22.4s, v0.4s, v15.4s +str q3, [x0, #688] +sqrdmulh v3.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +add v0.4s, v0.4s, v15.4s +str q4, [x0, #752] +ldr q4, [x0, #0] +sqrdmulh v15.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v13.4s, v4.4s, v14.4s +add v4.4s, v4.4s, v14.4s +ldr q14, [x0, #64] +sqrdmulh v18.4S, v0.4S, v29.s[1] +mul v0.4S, v0.4S,v30.s[1] +sub v7.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +ldr q12, [x0, #128] +mla v21.4S, v10.4S, v31.s[0] +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v12.4s, v17.4s +str q9, [x0, #816] +mla v20.4S, v15.4S, v31.s[0] +mla v0.4S, v18.4S, v31.s[0] +add v12.4s, v12.4s, v17.4s +str q6, [x0, #880] +ldr q6, [x0, #192] +sqrdmulh v17.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v18.4s, v6.4s, v16.4s +str q5, [x0, #944] +sqrdmulh v5.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +add v6.4s, v6.4s, v16.4s +str q1, [x0, #1008] +sqrdmulh v1.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v16.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v15.4s, v6.4s, v8.4s +add v6.4s, v6.4s, v8.4s +mla v19.4S, v17.4S, v31.s[0] +mla v11.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v20.4s +mla v2.4S, v1.4S, v31.s[0] +mla v22.4S, v21.4S, v31.s[0] +add v4.4s, v4.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +sub v21.4s, v14.4s, v0.4s +sqrdmulh v1.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +add v14.4s, 
v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v17.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[0] +mul v6.4S, v6.4S,v28.s[0] +sub v8.4s, v18.4s, v11.4s +add v18.4s, v18.4s, v11.4s +mla v16.4S, v20.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v2.4s +mla v12.4S, v0.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v27.s[2] +mul v3.4S, v3.4S,v28.s[2] +sub v19.4s, v7.4s, v22.4s +sqrdmulh v0.4S, v18.4S, v27.s[2] +mul v18.4S, v18.4S,v28.s[2] +add v7.4s, v7.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +sub v20.4s, v5.4s, v16.4s +add v5.4s, v5.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v11.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +mla v3.4S, v2.4S, v31.s[0] +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v4.4s, v12.4s +mla v17.4S, v22.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v4.4s, v4.4s, v12.4s +sqrdmulh v12.4S, v21.4S, v25.s[2] +mul v21.4S, v21.4S,v26.s[2] +sub v16.4s, v14.4s, v6.4s +sqrdmulh v22.4S, v11.4S, v25.s[3] +mul v11.4S, v11.4S,v26.s[3] +add v14.4s, v14.4s, v6.4s +sqrdmulh v6.4S, v16.4S, v25.s[1] +mul v16.4S, v16.4S,v26.s[1] +sub v2.4s, v13.4s, v3.4s +add v13.4s, v13.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v25.s[0] +mul v14.4S, v14.4S,v26.s[0] +sub v15.4s, v7.4s, v18.4s +add v7.4s, v7.4s, v18.4s +mla v21.4S, v12.4S, v31.s[0] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v17.4s +mla v16.4S, v6.4S, v31.s[0] +mla v14.4S, v3.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v7.4S, v23.s[0] +mul v7.4S, v7.4S,v24.s[0] +sub v3.4s, v19.4s, v8.4s +sqrdmulh v6.4S, v15.4S, v23.s[1] +mul v15.4S, v15.4S,v24.s[1] +add v19.4s, v19.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v23.s[2] +mul v19.4S, v19.4S,v24.s[2] +sub v12.4s, v5.4s, v21.4s +add v5.4s, v5.4s, v21.4s +sqrdmulh v21.4S, v3.4S, v23.s[3] +mul v3.4S, v3.4S,v24.s[3] +sub v18.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +mla 
v7.4S, v17.4S, v31.s[0] +mla v15.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v16.4s +str q5, [x0, #256] +mla v19.4S, v8.4S, v31.s[0] +mla v3.4S, v21.4S, v31.s[0] +add v0.4s, v0.4s, v16.4s +str q12, [x0, #320] +ldr q12, [x0, #912] +sqrdmulh v16.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v21.4s, v4.4s, v14.4s +str q20, [x0, #384] +ldr q20, [x0, #976] +sqrdmulh v8.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v4.4s, v4.4s, v14.4s +str q18, [x0, #448] +ldr q18, [x0, #784] +sqrdmulh v14.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +sub v5.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +ldr q7, [x0, #848] +sqrdmulh v17.4S, v7.4S, v29.s[0] +mul v7.4S, v7.4S,v30.s[0] +sub v11.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +mla v12.4S, v16.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v1.4s, v19.4s +str q0, [x0, #128] +mla v18.4S, v14.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v19.4s +str q6, [x0, #192] +ldr q6, [x0, #528] +sqrdmulh v19.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v17.4s, v22.4s, v3.4s +str q4, [x0, #0] +ldr q4, [x0, #592] +sqrdmulh v14.4S, v4.4S, v29.s[0] +mul v4.4S, v4.4S,v30.s[0] +add v22.4s, v22.4s, v3.4s +str q21, [x0, #64] +ldr q21, [x0, #656] +ldr q3, [x0, #400] +sqrdmulh v0.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v16.4s, v3.4s, v12.4s +add v3.4s, v3.4s, v12.4s +ldr q12, [x0, #720] +ldr q15, [x0, #464] +sqrdmulh v9.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v10.4s, v15.4s, v20.4s +add v15.4s, v15.4s, v20.4s +ldr q20, [x0, #272] +mla v6.4S, v19.4S, v31.s[0] +mla v4.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v18.4s +str q13, [x0, #512] +mla v21.4S, v0.4S, v31.s[0] +mla v12.4S, v9.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +str q5, [x0, #576] +ldr q5, [x0, #336] +sqrdmulh v18.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v9.4s, v5.4s, v7.4s +str q2, [x0, #640] +sqrdmulh v2.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +add v5.4s, v5.4s, v7.4s +str q11, [x0, #704] +ldr 
q11, [x0, #16] +sqrdmulh v7.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v0.4s, v11.4s, v6.4s +add v11.4s, v11.4s, v6.4s +ldr q6, [x0, #80] +sqrdmulh v13.4S, v5.4S, v29.s[1] +mul v5.4S, v5.4S,v30.s[1] +sub v19.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +ldr q4, [x0, #144] +mla v3.4S, v18.4S, v31.s[0] +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v4.4s, v21.4s +str q1, [x0, #768] +mla v20.4S, v7.4S, v31.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v4.4s, v4.4s, v21.4s +str q8, [x0, #832] +ldr q8, [x0, #208] +sqrdmulh v21.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +sub v13.4s, v8.4s, v12.4s +str q22, [x0, #896] +sqrdmulh v22.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +add v8.4s, v8.4s, v12.4s +str q17, [x0, #960] +sqrdmulh v17.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v12.4s, v4.4s, v3.4s +add v4.4s, v4.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v7.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +mla v16.4S, v21.4S, v31.s[0] +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v20.4s +mla v14.4S, v17.4S, v31.s[0] +mla v9.4S, v3.4S, v31.s[0] +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v12.4S, v27.s[1] +mul v12.4S, v12.4S,v28.s[1] +sub v3.4s, v6.4s, v5.4s +sqrdmulh v17.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v27.s[0] +mul v4.4S, v4.4S,v28.s[0] +sub v21.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[0] +mul v8.4S, v8.4S,v28.s[0] +sub v15.4s, v13.4s, v10.4s +add v13.4s, v13.4s, v10.4s +mla v12.4S, v20.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v14.4s +mla v4.4S, v5.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v16.4s, v19.4s, v9.4s +sqrdmulh v5.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v22.4s, v12.4s +add v22.4s, v22.4s, 
v12.4s +sqrdmulh v12.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v10.4s, v3.4s, v7.4s +add v3.4s, v3.4s, v7.4s +mla v2.4S, v14.4S, v31.s[0] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v4.4s +mla v21.4S, v9.4S, v31.s[0] +mla v15.4S, v12.4S, v31.s[0] +add v11.4s, v11.4s, v4.4s +sqrdmulh v4.4S, v3.4S, v25.s[2] +mul v3.4S, v3.4S,v26.s[2] +sub v12.4s, v6.4s, v8.4s +sqrdmulh v9.4S, v10.4S, v25.s[3] +mul v10.4S, v10.4S,v26.s[3] +add v6.4s, v6.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +sub v14.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v6.4S, v25.s[0] +mul v6.4S, v6.4S,v26.s[0] +sub v7.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +mla v3.4S, v4.4S, v31.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v21.4s +mla v12.4S, v8.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v23.s[0] +mul v19.4S, v19.4S,v24.s[0] +sub v2.4s, v16.4s, v15.4s +sqrdmulh v8.4S, v7.4S, v23.s[1] +mul v7.4S, v7.4S,v24.s[1] +add v16.4s, v16.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +sub v4.4s, v22.4s, v3.4s +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v2.4S, v23.s[3] +mul v2.4S, v2.4S,v24.s[3] +sub v13.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +mla v19.4S, v21.4S, v31.s[0] +mla v7.4S, v8.4S, v31.s[0] +sub v8.4s, v5.4s, v12.4s +str q22, [x0, #272] +mla v16.4S, v15.4S, v31.s[0] +mla v2.4S, v3.4S, v31.s[0] +add v5.4s, v5.4s, v12.4s +str q4, [x0, #336] +sub v23.4s, v11.4s, v6.4s +str q20, [x0, #400] +add v11.4s, v11.4s, v6.4s +str q13, [x0, #464] +sub v13.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sub v19.4s, v14.4s, v7.4s +add v14.4s, v14.4s, v7.4s +sub v7.4s, v17.4s, v16.4s +str q5, [x0, #144] +add v17.4s, v17.4s, v16.4s +str q8, [x0, #208] +sub v8.4s, v9.4s, v2.4s +str q11, [x0, #16] +add v9.4s, v9.4s, v2.4s +str q23, [x0, #80] +str q0, [x0, #528] +str q13, [x0, #592] +str q14, [x0, #656] +str q19, [x0, #720] +str q17, [x0, #784] +str q7, [x0, #848] 
+str q9, [x0, #912] +str q8, [x0, #976] +ldr q18, [x17, #+128] +ldr q1, [x17, #+144] +ldr q10, [x17, #+160] +ldr q21, [x17, #+176] +ldr q22, [x17, #+192] +ldr q15, [x17, #+208] +ldr q3, [x17, #+224] +ldr q12, [x17, #+240] +ldr q4, [x0, #32] +ldr q30, [x0, #48] +ldr q29, [x0, #0] +ldr q28, [x0, #16] +sqrdmulh v27.4S, v4.4S, v1.s[0] +mul v4.4S, v4.4S,v18.s[0] +mla v4.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v4.4s +add v29.4s, v29.4s, v4.4s +sqrdmulh v4.4S, v30.4S, v1.s[0] +mul v30.4S, v30.4S,v18.s[0] +mla v30.4S, v4.4S, v31.s[0] +sub v4.4s, v28.4s, v30.4s +add v28.4s, v28.4s, v30.4s +ldr q30, [x17, #+256] +ldr q26, [x17, #+272] +sqrdmulh v25.4S, v28.4S, v1.s[1] +mul v28.4S, v28.4S,v18.s[1] +mla v28.4S, v25.4S, v31.s[0] +sub v25.4s, v29.4s, v28.4s +add v29.4s, v29.4s, v28.4s +sqrdmulh v28.4S, v4.4S, v1.s[2] +mul v4.4S, v4.4S,v18.s[2] +mla v4.4S, v28.4S, v31.s[0] +sub v28.4s, v27.4s, v4.4s +add v27.4s, v27.4s, v4.4s +str q29, [x0, #0] +str q25, [x0, #16] +str q27, [x0, #32] +str q28, [x0, #48] +ldr q28, [x0, #96] +ldr q27, [x0, #112] +ldr q25, [x0, #64] +ldr q29, [x0, #80] +sqrdmulh v4.4S, v28.4S, v21.s[0] +mul v28.4S, v28.4S,v10.s[0] +mla v28.4S, v4.4S, v31.s[0] +sub v4.4s, v25.4s, v28.4s +add v25.4s, v25.4s, v28.4s +sqrdmulh v28.4S, v27.4S, v21.s[0] +mul v27.4S, v27.4S,v10.s[0] +mla v27.4S, v28.4S, v31.s[0] +sub v28.4s, v29.4s, v27.4s +add v29.4s, v29.4s, v27.4s +ldr q27, [x17, #+288] +ldr q24, [x17, #+304] +sqrdmulh v20.4S, v29.4S, v21.s[1] +mul v29.4S, v29.4S,v10.s[1] +mla v29.4S, v20.4S, v31.s[0] +sub v20.4s, v25.4s, v29.4s +add v25.4s, v25.4s, v29.4s +sqrdmulh v29.4S, v28.4S, v21.s[2] +mul v28.4S, v28.4S,v10.s[2] +mla v28.4S, v29.4S, v31.s[0] +sub v29.4s, v4.4s, v28.4s +add v4.4s, v4.4s, v28.4s +str q25, [x0, #64] +str q20, [x0, #80] +str q4, [x0, #96] +str q29, [x0, #112] +ldr q29, [x0, #160] +ldr q4, [x0, #176] +ldr q20, [x0, #128] +ldr q25, [x0, #144] +sqrdmulh v28.4S, v29.4S, v15.s[0] +mul v29.4S, v29.4S,v22.s[0] +mla v29.4S, v28.4S, v31.s[0] +sub v28.4s, 
v20.4s, v29.4s +add v20.4s, v20.4s, v29.4s +sqrdmulh v29.4S, v4.4S, v15.s[0] +mul v4.4S, v4.4S,v22.s[0] +mla v4.4S, v29.4S, v31.s[0] +sub v29.4s, v25.4s, v4.4s +add v25.4s, v25.4s, v4.4s +ldr q4, [x17, #+320] +ldr q6, [x17, #+336] +sqrdmulh v5.4S, v25.4S, v15.s[1] +mul v25.4S, v25.4S,v22.s[1] +mla v25.4S, v5.4S, v31.s[0] +sub v5.4s, v20.4s, v25.4s +add v20.4s, v20.4s, v25.4s +sqrdmulh v25.4S, v29.4S, v15.s[2] +mul v29.4S, v29.4S,v22.s[2] +mla v29.4S, v25.4S, v31.s[0] +sub v25.4s, v28.4s, v29.4s +add v28.4s, v28.4s, v29.4s +str q20, [x0, #128] +str q5, [x0, #144] +str q28, [x0, #160] +str q25, [x0, #176] +ldr q25, [x0, #224] +ldr q28, [x0, #240] +ldr q5, [x0, #192] +ldr q20, [x0, #208] +sqrdmulh v29.4S, v25.4S, v12.s[0] +mul v25.4S, v25.4S,v3.s[0] +mla v25.4S, v29.4S, v31.s[0] +sub v29.4s, v5.4s, v25.4s +add v5.4s, v5.4s, v25.4s +sqrdmulh v25.4S, v28.4S, v12.s[0] +mul v28.4S, v28.4S,v3.s[0] +mla v28.4S, v25.4S, v31.s[0] +sub v25.4s, v20.4s, v28.4s +add v20.4s, v20.4s, v28.4s +ldr q28, [x17, #+352] +ldr q16, [x17, #+368] +sqrdmulh v11.4S, v20.4S, v12.s[1] +mul v20.4S, v20.4S,v3.s[1] +mla v20.4S, v11.4S, v31.s[0] +sub v11.4s, v5.4s, v20.4s +add v5.4s, v5.4s, v20.4s +sqrdmulh v20.4S, v25.4S, v12.s[2] +mul v25.4S, v25.4S,v3.s[2] +mla v25.4S, v20.4S, v31.s[0] +sub v20.4s, v29.4s, v25.4s +add v29.4s, v29.4s, v25.4s +str q5, [x0, #192] +str q11, [x0, #208] +str q29, [x0, #224] +str q20, [x0, #240] +ldr q20, [x0, #288] +ldr q29, [x0, #304] +ldr q11, [x0, #256] +ldr q5, [x0, #272] +sqrdmulh v25.4S, v20.4S, v26.s[0] +mul v20.4S, v20.4S,v30.s[0] +mla v20.4S, v25.4S, v31.s[0] +sub v25.4s, v11.4s, v20.4s +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v29.4S, v26.s[0] +mul v29.4S, v29.4S,v30.s[0] +mla v29.4S, v20.4S, v31.s[0] +sub v20.4s, v5.4s, v29.4s +add v5.4s, v5.4s, v29.4s +ldr q29, [x17, #+384] +ldr q2, [x17, #+400] +sqrdmulh v23.4S, v5.4S, v26.s[1] +mul v5.4S, v5.4S,v30.s[1] +mla v5.4S, v23.4S, v31.s[0] +sub v23.4s, v11.4s, v5.4s +add v11.4s, v11.4s, v5.4s +sqrdmulh v5.4S, 
v20.4S, v26.s[2] +mul v20.4S, v20.4S,v30.s[2] +mla v20.4S, v5.4S, v31.s[0] +sub v5.4s, v25.4s, v20.4s +add v25.4s, v25.4s, v20.4s +str q11, [x0, #256] +str q23, [x0, #272] +str q25, [x0, #288] +str q5, [x0, #304] +ldr q1, [x0, #352] +ldr q18, [x0, #368] +ldr q5, [x0, #320] +ldr q25, [x0, #336] +sqrdmulh v23.4S, v1.4S, v24.s[0] +mul v1.4S, v1.4S,v27.s[0] +mla v1.4S, v23.4S, v31.s[0] +sub v23.4s, v5.4s, v1.4s +add v5.4s, v5.4s, v1.4s +sqrdmulh v1.4S, v18.4S, v24.s[0] +mul v18.4S, v18.4S,v27.s[0] +mla v18.4S, v1.4S, v31.s[0] +sub v1.4s, v25.4s, v18.4s +add v25.4s, v25.4s, v18.4s +ldr q18, [x17, #+416] +ldr q11, [x17, #+432] +sqrdmulh v20.4S, v25.4S, v24.s[1] +mul v25.4S, v25.4S,v27.s[1] +mla v25.4S, v20.4S, v31.s[0] +sub v20.4s, v5.4s, v25.4s +add v5.4s, v5.4s, v25.4s +sqrdmulh v25.4S, v1.4S, v24.s[2] +mul v1.4S, v1.4S,v27.s[2] +mla v1.4S, v25.4S, v31.s[0] +sub v25.4s, v23.4s, v1.4s +add v23.4s, v23.4s, v1.4s +str q5, [x0, #320] +str q20, [x0, #336] +str q23, [x0, #352] +str q25, [x0, #368] +ldr q21, [x0, #416] +ldr q10, [x0, #432] +ldr q25, [x0, #384] +ldr q23, [x0, #400] +sqrdmulh v20.4S, v21.4S, v6.s[0] +mul v21.4S, v21.4S,v4.s[0] +mla v21.4S, v20.4S, v31.s[0] +sub v20.4s, v25.4s, v21.4s +add v25.4s, v25.4s, v21.4s +sqrdmulh v21.4S, v10.4S, v6.s[0] +mul v10.4S, v10.4S,v4.s[0] +mla v10.4S, v21.4S, v31.s[0] +sub v21.4s, v23.4s, v10.4s +add v23.4s, v23.4s, v10.4s +ldr q10, [x17, #+448] +ldr q5, [x17, #+464] +sqrdmulh v1.4S, v23.4S, v6.s[1] +mul v23.4S, v23.4S,v4.s[1] +mla v23.4S, v1.4S, v31.s[0] +sub v1.4s, v25.4s, v23.4s +add v25.4s, v25.4s, v23.4s +sqrdmulh v23.4S, v21.4S, v6.s[2] +mul v21.4S, v21.4S,v4.s[2] +mla v21.4S, v23.4S, v31.s[0] +sub v23.4s, v20.4s, v21.4s +add v20.4s, v20.4s, v21.4s +str q25, [x0, #384] +str q1, [x0, #400] +str q20, [x0, #416] +str q23, [x0, #432] +ldr q15, [x0, #480] +ldr q22, [x0, #496] +ldr q23, [x0, #448] +ldr q20, [x0, #464] +sqrdmulh v1.4S, v15.4S, v16.s[0] +mul v15.4S, v15.4S,v28.s[0] +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v23.4s, 
v15.4s +add v23.4s, v23.4s, v15.4s +sqrdmulh v15.4S, v22.4S, v16.s[0] +mul v22.4S, v22.4S,v28.s[0] +mla v22.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v22.4s +add v20.4s, v20.4s, v22.4s +ldr q22, [x17, #+480] +ldr q25, [x17, #+496] +sqrdmulh v21.4S, v20.4S, v16.s[1] +mul v20.4S, v20.4S,v28.s[1] +mla v20.4S, v21.4S, v31.s[0] +sub v21.4s, v23.4s, v20.4s +add v23.4s, v23.4s, v20.4s +sqrdmulh v20.4S, v15.4S, v16.s[2] +mul v15.4S, v15.4S,v28.s[2] +mla v15.4S, v20.4S, v31.s[0] +sub v20.4s, v1.4s, v15.4s +add v1.4s, v1.4s, v15.4s +str q23, [x0, #448] +str q21, [x0, #464] +str q1, [x0, #480] +str q20, [x0, #496] +ldr q12, [x0, #544] +ldr q3, [x0, #560] +ldr q20, [x0, #512] +ldr q1, [x0, #528] +sqrdmulh v21.4S, v12.4S, v2.s[0] +mul v12.4S, v12.4S,v29.s[0] +mla v12.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +sqrdmulh v12.4S, v3.4S, v2.s[0] +mul v3.4S, v3.4S,v29.s[0] +mla v3.4S, v12.4S, v31.s[0] +sub v12.4s, v1.4s, v3.4s +add v1.4s, v1.4s, v3.4s +ldr q3, [x17, #+512] +ldr q23, [x17, #+528] +sqrdmulh v15.4S, v1.4S, v2.s[1] +mul v1.4S, v1.4S,v29.s[1] +mla v1.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v1.4s +add v20.4s, v20.4s, v1.4s +sqrdmulh v1.4S, v12.4S, v2.s[2] +mul v12.4S, v12.4S,v29.s[2] +mla v12.4S, v1.4S, v31.s[0] +sub v1.4s, v21.4s, v12.4s +add v21.4s, v21.4s, v12.4s +str q20, [x0, #512] +str q15, [x0, #528] +str q21, [x0, #544] +str q1, [x0, #560] +ldr q26, [x0, #608] +ldr q30, [x0, #624] +ldr q1, [x0, #576] +ldr q21, [x0, #592] +sqrdmulh v15.4S, v26.4S, v11.s[0] +mul v26.4S, v26.4S,v18.s[0] +mla v26.4S, v15.4S, v31.s[0] +sub v15.4s, v1.4s, v26.4s +add v1.4s, v1.4s, v26.4s +sqrdmulh v26.4S, v30.4S, v11.s[0] +mul v30.4S, v30.4S,v18.s[0] +mla v30.4S, v26.4S, v31.s[0] +sub v26.4s, v21.4s, v30.4s +add v21.4s, v21.4s, v30.4s +ldr q30, [x17, #+544] +ldr q20, [x17, #+560] +sqrdmulh v12.4S, v21.4S, v11.s[1] +mul v21.4S, v21.4S,v18.s[1] +mla v21.4S, v12.4S, v31.s[0] +sub v12.4s, v1.4s, v21.4s +add v1.4s, v1.4s, v21.4s +sqrdmulh v21.4S, v26.4S, 
v11.s[2] +mul v26.4S, v26.4S,v18.s[2] +mla v26.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v26.4s +add v15.4s, v15.4s, v26.4s +str q1, [x0, #576] +str q12, [x0, #592] +str q15, [x0, #608] +str q21, [x0, #624] +ldr q24, [x0, #672] +ldr q27, [x0, #688] +ldr q21, [x0, #640] +ldr q15, [x0, #656] +sqrdmulh v12.4S, v24.4S, v5.s[0] +mul v24.4S, v24.4S,v10.s[0] +mla v24.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v24.4s +add v21.4s, v21.4s, v24.4s +sqrdmulh v24.4S, v27.4S, v5.s[0] +mul v27.4S, v27.4S,v10.s[0] +mla v27.4S, v24.4S, v31.s[0] +sub v24.4s, v15.4s, v27.4s +add v15.4s, v15.4s, v27.4s +ldr q27, [x17, #+576] +ldr q1, [x17, #+592] +sqrdmulh v26.4S, v15.4S, v5.s[1] +mul v15.4S, v15.4S,v10.s[1] +mla v15.4S, v26.4S, v31.s[0] +sub v26.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +sqrdmulh v15.4S, v24.4S, v5.s[2] +mul v24.4S, v24.4S,v10.s[2] +mla v24.4S, v15.4S, v31.s[0] +sub v15.4s, v12.4s, v24.4s +add v12.4s, v12.4s, v24.4s +str q21, [x0, #640] +str q26, [x0, #656] +str q12, [x0, #672] +str q15, [x0, #688] +ldr q6, [x0, #736] +ldr q4, [x0, #752] +ldr q15, [x0, #704] +ldr q12, [x0, #720] +sqrdmulh v26.4S, v6.4S, v25.s[0] +mul v6.4S, v6.4S,v22.s[0] +mla v6.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v6.4s +add v15.4s, v15.4s, v6.4s +sqrdmulh v6.4S, v4.4S, v25.s[0] +mul v4.4S, v4.4S,v22.s[0] +mla v4.4S, v6.4S, v31.s[0] +sub v6.4s, v12.4s, v4.4s +add v12.4s, v12.4s, v4.4s +ldr q4, [x17, #+608] +ldr q21, [x17, #+624] +sqrdmulh v24.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v22.s[1] +mla v12.4S, v24.4S, v31.s[0] +sub v24.4s, v15.4s, v12.4s +add v15.4s, v15.4s, v12.4s +sqrdmulh v12.4S, v6.4S, v25.s[2] +mul v6.4S, v6.4S,v22.s[2] +mla v6.4S, v12.4S, v31.s[0] +sub v12.4s, v26.4s, v6.4s +add v26.4s, v26.4s, v6.4s +str q15, [x0, #704] +str q24, [x0, #720] +str q26, [x0, #736] +str q12, [x0, #752] +ldr q16, [x0, #800] +ldr q28, [x0, #816] +ldr q12, [x0, #768] +ldr q26, [x0, #784] +sqrdmulh v24.4S, v16.4S, v23.s[0] +mul v16.4S, v16.4S,v3.s[0] +mla v16.4S, v24.4S, v31.s[0] +sub v24.4s, 
v12.4s, v16.4s +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v28.4S, v23.s[0] +mul v28.4S, v28.4S,v3.s[0] +mla v28.4S, v16.4S, v31.s[0] +sub v16.4s, v26.4s, v28.4s +add v26.4s, v26.4s, v28.4s +sqrdmulh v28.4S, v26.4S, v23.s[1] +mul v26.4S, v26.4S,v3.s[1] +mla v26.4S, v28.4S, v31.s[0] +sub v28.4s, v12.4s, v26.4s +add v12.4s, v12.4s, v26.4s +sqrdmulh v26.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v3.s[2] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v24.4s, v16.4s +add v24.4s, v24.4s, v16.4s +str q12, [x0, #768] +str q28, [x0, #784] +str q24, [x0, #800] +str q26, [x0, #816] +ldr q2, [x0, #864] +ldr q29, [x0, #880] +ldr q26, [x0, #832] +ldr q24, [x0, #848] +sqrdmulh v28.4S, v2.4S, v20.s[0] +mul v2.4S, v2.4S,v30.s[0] +mla v2.4S, v28.4S, v31.s[0] +sub v28.4s, v26.4s, v2.4s +add v26.4s, v26.4s, v2.4s +sqrdmulh v2.4S, v29.4S, v20.s[0] +mul v29.4S, v29.4S,v30.s[0] +mla v29.4S, v2.4S, v31.s[0] +sub v2.4s, v24.4s, v29.4s +add v24.4s, v24.4s, v29.4s +sqrdmulh v29.4S, v24.4S, v20.s[1] +mul v24.4S, v24.4S,v30.s[1] +mla v24.4S, v29.4S, v31.s[0] +sub v29.4s, v26.4s, v24.4s +add v26.4s, v26.4s, v24.4s +sqrdmulh v24.4S, v2.4S, v20.s[2] +mul v2.4S, v2.4S,v30.s[2] +mla v2.4S, v24.4S, v31.s[0] +sub v24.4s, v28.4s, v2.4s +add v28.4s, v28.4s, v2.4s +str q26, [x0, #832] +str q29, [x0, #848] +str q28, [x0, #864] +str q24, [x0, #880] +ldr q11, [x0, #928] +ldr q18, [x0, #944] +ldr q24, [x0, #896] +ldr q28, [x0, #912] +sqrdmulh v29.4S, v11.4S, v1.s[0] +mul v11.4S, v11.4S,v27.s[0] +mla v11.4S, v29.4S, v31.s[0] +sub v29.4s, v24.4s, v11.4s +add v24.4s, v24.4s, v11.4s +sqrdmulh v11.4S, v18.4S, v1.s[0] +mul v18.4S, v18.4S,v27.s[0] +mla v18.4S, v11.4S, v31.s[0] +sub v11.4s, v28.4s, v18.4s +add v28.4s, v28.4s, v18.4s +sqrdmulh v18.4S, v28.4S, v1.s[1] +mul v28.4S, v28.4S,v27.s[1] +mla v28.4S, v18.4S, v31.s[0] +sub v18.4s, v24.4s, v28.4s +add v24.4s, v24.4s, v28.4s +sqrdmulh v28.4S, v11.4S, v1.s[2] +mul v11.4S, v11.4S,v27.s[2] +mla v11.4S, v28.4S, v31.s[0] +sub v28.4s, v29.4s, v11.4s +add v29.4s, v29.4s, 
v11.4s +str q24, [x0, #896] +str q18, [x0, #912] +str q29, [x0, #928] +str q28, [x0, #944] +ldr q5, [x0, #992] +ldr q10, [x0, #1008] +ldr q28, [x0, #960] +ldr q29, [x0, #976] +sqrdmulh v18.4S, v5.4S, v21.s[0] +mul v5.4S, v5.4S,v4.s[0] +mla v5.4S, v18.4S, v31.s[0] +sub v18.4s, v28.4s, v5.4s +add v28.4s, v28.4s, v5.4s +sqrdmulh v5.4S, v10.4S, v21.s[0] +mul v10.4S, v10.4S,v4.s[0] +mla v10.4S, v5.4S, v31.s[0] +sub v5.4s, v29.4s, v10.4s +add v29.4s, v29.4s, v10.4s +sqrdmulh v10.4S, v29.4S, v21.s[1] +mul v29.4S, v29.4S,v4.s[1] +mla v29.4S, v10.4S, v31.s[0] +sub v10.4s, v28.4s, v29.4s +add v28.4s, v28.4s, v29.4s +sqrdmulh v29.4S, v5.4S, v21.s[2] +mul v5.4S, v5.4S,v4.s[2] +mla v5.4S, v29.4S, v31.s[0] +sub v29.4s, v18.4s, v5.4s +add v18.4s, v18.4s, v5.4s +str q28, [x0, #960] +str q10, [x0, #976] +str q18, [x0, #992] +str q29, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_1.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_1.s new file mode 100644 index 0000000..ab592c3 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_1.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, 
and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include <hal_env.h> +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7
+.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 
+.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 
656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global ntt_u32_incomplete_neon_asm_var_4_2_7_z4_1 +.global _ntt_u32_incomplete_neon_asm_var_4_2_7_z4_1 +ntt_u32_incomplete_neon_asm_var_4_2_7_z4_1: +_ntt_u32_incomplete_neon_asm_var_4_2_7_z4_1: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #928] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #992] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q18, [x0, #800] +sqrdmulh v17.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +ldr q16, [x0, #864] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, 
v16.4S,v30.s[0] +mla v22.4S, v21.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +mla v18.4S, v17.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +ldr q3, [x0, #544] +sqrdmulh v17.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q19, [x0, #608] +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q2, [x0, #672] +ldr q1, [x0, #416] +sqrdmulh v0.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v15.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #736] +ldr q14, [x0, #480] +sqrdmulh v13.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v12.4s, v14.4s, v20.4s +add v14.4s, v14.4s, v20.4s +ldr q20, [x0, #288] +mla v3.4S, v17.4S, v31.s[0] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v18.4s +mla v2.4S, v0.4S, v31.s[0] +mla v22.4S, v13.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +ldr q18, [x0, #352] +sqrdmulh v13.4S, v1.4S, v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v0.4s, v18.4s, v16.4s +sqrdmulh v17.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +add v18.4s, v18.4s, v16.4s +ldr q16, [x0, #32] +sqrdmulh v11.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v10.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #96] +sqrdmulh v9.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v8.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #160] +mla v1.4S, v13.4S, v31.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v2.4s +mla v20.4S, v11.4S, v31.s[0] +mla v18.4S, v9.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +ldr q2, [x0, #224] +sqrdmulh v9.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v11.4s, v2.4s, v22.4s +sqrdmulh v13.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v7.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +sub v6.4s, v2.4s, v14.4s +add v2.4s, v2.4s, v14.4s +mla v15.4S, v9.4S, v31.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v20.4s +mla 
v21.4S, v22.4S, v31.s[0] +mla v0.4S, v1.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +sub v1.4s, v3.4s, v18.4s +sqrdmulh v22.4S, v6.4S, v27.s[1] +mul v6.4S, v6.4S,v28.s[1] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v27.s[0] +mul v19.4S, v19.4S,v28.s[0] +sub v9.4s, v17.4s, v15.4s +add v17.4s, v17.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v14.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +mla v7.4S, v20.4S, v31.s[0] +mla v6.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v21.4s +mla v19.4S, v18.4S, v31.s[0] +mla v2.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v15.4s, v8.4s, v0.4s +sqrdmulh v18.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +add v8.4s, v8.4s, v0.4s +sqrdmulh v0.4S, v9.4S, v27.s[3] +mul v9.4S, v9.4S,v28.s[3] +sub v20.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[3] +mul v14.4S, v14.4S,v28.s[3] +sub v12.4s, v1.4s, v6.4s +add v1.4s, v1.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v16.4s, v19.4s +mla v9.4S, v0.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v25.s[2] +mul v1.4S, v1.4S,v26.s[2] +sub v7.4s, v3.4s, v2.4s +sqrdmulh v0.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v7.4S, v25.s[1] +mul v7.4S, v7.4S,v26.s[1] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v25.s[0] +mul v3.4S, v3.4S,v26.s[0] +sub v6.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v1.4S, v19.4S, v31.s[0] +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v9.4s +mla v7.4S, v2.4S, v31.s[0] +mla v3.4S, v17.4S, v31.s[0] +add v22.4s, v22.4s, v9.4s +sqrdmulh v9.4S, v8.4S, v23.s[0] +mul v8.4S, v8.4S,v24.s[0] +sub v17.4s, v15.4s, v14.4s +sqrdmulh v2.4S, v6.4S, v23.s[1] +mul v6.4S, v6.4S,v24.s[1] +add v15.4s, v15.4s, v14.4s 
+sqrdmulh v14.4S, v15.4S, v23.s[2] +mul v15.4S, v15.4S,v24.s[2] +sub v19.4s, v13.4s, v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +sub v11.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +mla v8.4S, v9.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v18.4s, v7.4s +str q13, [x0, #288] +mla v15.4S, v14.4S, v31.s[0] +mla v17.4S, v1.4S, v31.s[0] +add v18.4s, v18.4s, v7.4s +str q19, [x0, #352] +ldr q19, [x0, #944] +sqrdmulh v7.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v1.4s, v16.4s, v3.4s +str q20, [x0, #416] +ldr q20, [x0, #1008] +sqrdmulh v14.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v3.4s +str q11, [x0, #480] +ldr q11, [x0, #816] +sqrdmulh v3.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +sub v13.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +ldr q8, [x0, #880] +sqrdmulh v9.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v12.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +mla v19.4S, v7.4S, v31.s[0] +mla v20.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v15.4s +str q18, [x0, #160] +mla v11.4S, v3.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +str q2, [x0, #224] +ldr q2, [x0, #560] +sqrdmulh v15.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v9.4s, v0.4s, v17.4s +str q16, [x0, #32] +ldr q16, [x0, #624] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +add v0.4s, v0.4s, v17.4s +str q1, [x0, #96] +ldr q1, [x0, #688] +ldr q17, [x0, #432] +sqrdmulh v18.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +sub v7.4s, v17.4s, v19.4s +add v17.4s, v17.4s, v19.4s +ldr q19, [x0, #752] +ldr q6, [x0, #496] +sqrdmulh v5.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v4.4s, v6.4s, v20.4s +add v6.4s, v6.4s, v20.4s +ldr q20, [x0, #304] +mla v2.4S, v15.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v11.4s +str q10, [x0, #544] +mla v1.4S, v18.4S, v31.s[0] +mla v19.4S, v5.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q13, [x0, #608] 
+ldr q13, [x0, #368] +sqrdmulh v11.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v5.4s, v13.4s, v8.4s +str q21, [x0, #672] +sqrdmulh v21.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +add v13.4s, v13.4s, v8.4s +str q12, [x0, #736] +ldr q12, [x0, #48] +sqrdmulh v8.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v18.4s, v12.4s, v2.4s +add v12.4s, v12.4s, v2.4s +ldr q2, [x0, #112] +sqrdmulh v10.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v15.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +ldr q16, [x0, #176] +mla v17.4S, v11.4S, v31.s[0] +mla v6.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v1.4s +str q22, [x0, #800] +mla v20.4S, v8.4S, v31.s[0] +mla v13.4S, v10.4S, v31.s[0] +add v16.4s, v16.4s, v1.4s +str q14, [x0, #864] +ldr q14, [x0, #240] +sqrdmulh v1.4S, v7.4S, v29.s[2] +mul v7.4S, v7.4S,v30.s[2] +sub v10.4s, v14.4s, v19.4s +str q0, [x0, #928] +sqrdmulh v0.4S, v4.4S, v29.s[2] +mul v4.4S, v4.4S,v30.s[2] +add v14.4s, v14.4s, v19.4s +str q9, [x0, #992] +sqrdmulh v9.4S, v3.4S, v29.s[2] +mul v3.4S, v3.4S,v30.s[2] +sub v19.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v29.s[2] +mul v5.4S, v5.4S,v30.s[2] +sub v8.4s, v14.4s, v6.4s +add v14.4s, v14.4s, v6.4s +mla v7.4S, v1.4S, v31.s[0] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v20.4s +mla v3.4S, v9.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +sub v17.4s, v2.4s, v13.4s +sqrdmulh v9.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +add v2.4s, v2.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v27.s[0] +mul v16.4S, v16.4S,v28.s[0] +sub v1.4s, v21.4s, v7.4s +add v21.4s, v21.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v6.4s, v10.4s, v4.4s +add v10.4s, v10.4s, v4.4s +mla v19.4S, v20.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v3.4s +mla v16.4S, v13.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[2] 
+mul v21.4S, v21.4S,v28.s[2] +sub v7.4s, v15.4s, v5.4s +sqrdmulh v13.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +add v15.4s, v15.4s, v5.4s +sqrdmulh v5.4S, v1.4S, v27.s[3] +mul v1.4S, v1.4S,v28.s[3] +sub v20.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[3] +mul v6.4S, v6.4S,v28.s[3] +sub v4.4s, v17.4s, v8.4s +add v17.4s, v17.4s, v8.4s +mla v21.4S, v3.4S, v31.s[0] +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v16.4s +mla v1.4S, v5.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v25.s[2] +mul v17.4S, v17.4S,v26.s[2] +sub v19.4s, v2.4s, v14.4s +sqrdmulh v5.4S, v4.4S, v25.s[3] +mul v4.4S, v4.4S,v26.s[3] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v3.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +mla v17.4S, v16.4S, v31.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v9.4s, v1.4s +mla v19.4S, v14.4S, v31.s[0] +mla v2.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v23.s[0] +mul v15.4S, v15.4S,v24.s[0] +sub v21.4s, v7.4s, v6.4s +sqrdmulh v14.4S, v8.4S, v23.s[1] +mul v8.4S, v8.4S,v24.s[1] +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v7.4S, v23.s[2] +mul v7.4S, v7.4S,v24.s[2] +sub v16.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v23.s[3] +mul v21.4S, v21.4S,v24.s[3] +sub v10.4s, v20.4s, v4.4s +add v20.4s, v20.4s, v4.4s +mla v15.4S, v1.4S, v31.s[0] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v19.4s +str q0, [x0, #304] +mla v7.4S, v6.4S, v31.s[0] +mla v21.4S, v17.4S, v31.s[0] +add v13.4s, v13.4s, v19.4s +str q16, [x0, #368] +ldr q16, [x0, #896] +sqrdmulh v19.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v17.4s, v12.4s, v2.4s +str q20, [x0, #432] +ldr q20, [x0, #960] +sqrdmulh v6.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v12.4s, v12.4s, v2.4s +str q10, 
[x0, #496] +ldr q10, [x0, #768] +sqrdmulh v2.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +sub v0.4s, v18.4s, v15.4s +add v18.4s, v18.4s, v15.4s +ldr q15, [x0, #832] +sqrdmulh v1.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +sub v4.4s, v3.4s, v8.4s +add v3.4s, v3.4s, v8.4s +mla v16.4S, v19.4S, v31.s[0] +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v7.4s +str q13, [x0, #176] +mla v10.4S, v2.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v7.4s +str q14, [x0, #240] +ldr q14, [x0, #512] +sqrdmulh v7.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v1.4s, v5.4s, v21.4s +str q12, [x0, #48] +ldr q12, [x0, #576] +sqrdmulh v2.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +add v5.4s, v5.4s, v21.4s +str q17, [x0, #112] +ldr q17, [x0, #640] +ldr q21, [x0, #384] +sqrdmulh v13.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] +sub v19.4s, v21.4s, v16.4s +add v21.4s, v21.4s, v16.4s +ldr q16, [x0, #704] +ldr q8, [x0, #448] +sqrdmulh v22.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v11.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +ldr q20, [x0, #256] +mla v14.4S, v7.4S, v31.s[0] +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v10.4s +str q18, [x0, #560] +mla v17.4S, v13.4S, v31.s[0] +mla v16.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +str q0, [x0, #624] +ldr q0, [x0, #320] +sqrdmulh v10.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v22.4s, v0.4s, v15.4s +str q3, [x0, #688] +sqrdmulh v3.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +add v0.4s, v0.4s, v15.4s +str q4, [x0, #752] +ldr q4, [x0, #0] +sqrdmulh v15.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v13.4s, v4.4s, v14.4s +add v4.4s, v4.4s, v14.4s +ldr q14, [x0, #64] +sqrdmulh v18.4S, v0.4S, v29.s[1] +mul v0.4S, v0.4S,v30.s[1] +sub v7.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +ldr q12, [x0, #128] +mla v21.4S, v10.4S, v31.s[0] +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v12.4s, v17.4s +str q9, [x0, #816] +mla v20.4S, v15.4S, v31.s[0] +mla v0.4S, v18.4S, 
v31.s[0] +add v12.4s, v12.4s, v17.4s +str q6, [x0, #880] +ldr q6, [x0, #192] +sqrdmulh v17.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v18.4s, v6.4s, v16.4s +str q5, [x0, #944] +sqrdmulh v5.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +add v6.4s, v6.4s, v16.4s +str q1, [x0, #1008] +sqrdmulh v1.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v16.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v15.4s, v6.4s, v8.4s +add v6.4s, v6.4s, v8.4s +mla v19.4S, v17.4S, v31.s[0] +mla v11.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v20.4s +mla v2.4S, v1.4S, v31.s[0] +mla v22.4S, v21.4S, v31.s[0] +add v4.4s, v4.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +sub v21.4s, v14.4s, v0.4s +sqrdmulh v1.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v17.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[0] +mul v6.4S, v6.4S,v28.s[0] +sub v8.4s, v18.4s, v11.4s +add v18.4s, v18.4s, v11.4s +mla v16.4S, v20.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v2.4s +mla v12.4S, v0.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v27.s[2] +mul v3.4S, v3.4S,v28.s[2] +sub v19.4s, v7.4s, v22.4s +sqrdmulh v0.4S, v18.4S, v27.s[2] +mul v18.4S, v18.4S,v28.s[2] +add v7.4s, v7.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +sub v20.4s, v5.4s, v16.4s +add v5.4s, v5.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v11.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +mla v3.4S, v2.4S, v31.s[0] +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v4.4s, v12.4s +mla v17.4S, v22.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v4.4s, v4.4s, v12.4s +sqrdmulh v12.4S, v21.4S, v25.s[2] +mul v21.4S, v21.4S,v26.s[2] +sub v16.4s, v14.4s, v6.4s +sqrdmulh v22.4S, v11.4S, v25.s[3] +mul v11.4S, 
v11.4S,v26.s[3] +add v14.4s, v14.4s, v6.4s +sqrdmulh v6.4S, v16.4S, v25.s[1] +mul v16.4S, v16.4S,v26.s[1] +sub v2.4s, v13.4s, v3.4s +add v13.4s, v13.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v25.s[0] +mul v14.4S, v14.4S,v26.s[0] +sub v15.4s, v7.4s, v18.4s +add v7.4s, v7.4s, v18.4s +mla v21.4S, v12.4S, v31.s[0] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v17.4s +mla v16.4S, v6.4S, v31.s[0] +mla v14.4S, v3.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v7.4S, v23.s[0] +mul v7.4S, v7.4S,v24.s[0] +sub v3.4s, v19.4s, v8.4s +sqrdmulh v6.4S, v15.4S, v23.s[1] +mul v15.4S, v15.4S,v24.s[1] +add v19.4s, v19.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v23.s[2] +mul v19.4S, v19.4S,v24.s[2] +sub v12.4s, v5.4s, v21.4s +add v5.4s, v5.4s, v21.4s +sqrdmulh v21.4S, v3.4S, v23.s[3] +mul v3.4S, v3.4S,v24.s[3] +sub v18.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +mla v7.4S, v17.4S, v31.s[0] +mla v15.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v16.4s +str q5, [x0, #256] +mla v19.4S, v8.4S, v31.s[0] +mla v3.4S, v21.4S, v31.s[0] +add v0.4s, v0.4s, v16.4s +str q12, [x0, #320] +ldr q12, [x0, #912] +sqrdmulh v16.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v21.4s, v4.4s, v14.4s +str q20, [x0, #384] +ldr q20, [x0, #976] +sqrdmulh v8.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v4.4s, v4.4s, v14.4s +str q18, [x0, #448] +ldr q18, [x0, #784] +sqrdmulh v14.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +sub v5.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +ldr q7, [x0, #848] +sqrdmulh v17.4S, v7.4S, v29.s[0] +mul v7.4S, v7.4S,v30.s[0] +sub v11.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +mla v12.4S, v16.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v1.4s, v19.4s +str q0, [x0, #128] +mla v18.4S, v14.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v19.4s +str q6, [x0, #192] +ldr q6, [x0, #528] +sqrdmulh v19.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v17.4s, v22.4s, v3.4s +str q4, [x0, #0] +ldr q4, [x0, #592] +sqrdmulh v14.4S, v4.4S, v29.s[0] +mul 
v4.4S, v4.4S,v30.s[0] +add v22.4s, v22.4s, v3.4s +str q21, [x0, #64] +ldr q21, [x0, #656] +ldr q3, [x0, #400] +sqrdmulh v0.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v16.4s, v3.4s, v12.4s +add v3.4s, v3.4s, v12.4s +ldr q12, [x0, #720] +ldr q15, [x0, #464] +sqrdmulh v9.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v10.4s, v15.4s, v20.4s +add v15.4s, v15.4s, v20.4s +ldr q20, [x0, #272] +mla v6.4S, v19.4S, v31.s[0] +mla v4.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v18.4s +str q13, [x0, #512] +mla v21.4S, v0.4S, v31.s[0] +mla v12.4S, v9.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +str q5, [x0, #576] +ldr q5, [x0, #336] +sqrdmulh v18.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v9.4s, v5.4s, v7.4s +str q2, [x0, #640] +sqrdmulh v2.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +add v5.4s, v5.4s, v7.4s +str q11, [x0, #704] +ldr q11, [x0, #16] +sqrdmulh v7.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v0.4s, v11.4s, v6.4s +add v11.4s, v11.4s, v6.4s +ldr q6, [x0, #80] +sqrdmulh v13.4S, v5.4S, v29.s[1] +mul v5.4S, v5.4S,v30.s[1] +sub v19.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +ldr q4, [x0, #144] +mla v3.4S, v18.4S, v31.s[0] +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v4.4s, v21.4s +str q1, [x0, #768] +mla v20.4S, v7.4S, v31.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v4.4s, v4.4s, v21.4s +str q8, [x0, #832] +ldr q8, [x0, #208] +sqrdmulh v21.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +sub v13.4s, v8.4s, v12.4s +str q22, [x0, #896] +sqrdmulh v22.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +add v8.4s, v8.4s, v12.4s +str q17, [x0, #960] +sqrdmulh v17.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v12.4s, v4.4s, v3.4s +add v4.4s, v4.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v7.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +mla v16.4S, v21.4S, v31.s[0] +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v20.4s +mla v14.4S, v17.4S, v31.s[0] +mla v9.4S, v3.4S, v31.s[0] +add v11.4s, v11.4s, v20.4s 
+sqrdmulh v20.4S, v12.4S, v27.s[1] +mul v12.4S, v12.4S,v28.s[1] +sub v3.4s, v6.4s, v5.4s +sqrdmulh v17.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v27.s[0] +mul v4.4S, v4.4S,v28.s[0] +sub v21.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[0] +mul v8.4S, v8.4S,v28.s[0] +sub v15.4s, v13.4s, v10.4s +add v13.4s, v13.4s, v10.4s +mla v12.4S, v20.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v14.4s +mla v4.4S, v5.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v16.4s, v19.4s, v9.4s +sqrdmulh v5.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v10.4s, v3.4s, v7.4s +add v3.4s, v3.4s, v7.4s +mla v2.4S, v14.4S, v31.s[0] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v4.4s +mla v21.4S, v9.4S, v31.s[0] +mla v15.4S, v12.4S, v31.s[0] +add v11.4s, v11.4s, v4.4s +sqrdmulh v4.4S, v3.4S, v25.s[2] +mul v3.4S, v3.4S,v26.s[2] +sub v12.4s, v6.4s, v8.4s +sqrdmulh v9.4S, v10.4S, v25.s[3] +mul v10.4S, v10.4S,v26.s[3] +add v6.4s, v6.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +sub v14.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v6.4S, v25.s[0] +mul v6.4S, v6.4S,v26.s[0] +sub v7.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +mla v3.4S, v4.4S, v31.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v21.4s +mla v12.4S, v8.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v23.s[0] +mul v19.4S, v19.4S,v24.s[0] +sub v2.4s, v16.4s, v15.4s +sqrdmulh v8.4S, v7.4S, v23.s[1] +mul v7.4S, v7.4S,v24.s[1] +add v16.4s, v16.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +sub v4.4s, v22.4s, v3.4s 
+add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v2.4S, v23.s[3] +mul v2.4S, v2.4S,v24.s[3] +sub v13.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +mla v19.4S, v21.4S, v31.s[0] +mla v7.4S, v8.4S, v31.s[0] +sub v8.4s, v5.4s, v12.4s +str q22, [x0, #272] +mla v16.4S, v15.4S, v31.s[0] +mla v2.4S, v3.4S, v31.s[0] +add v5.4s, v5.4s, v12.4s +str q4, [x0, #336] +sub v23.4s, v11.4s, v6.4s +str q20, [x0, #400] +add v11.4s, v11.4s, v6.4s +str q13, [x0, #464] +sub v13.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sub v19.4s, v14.4s, v7.4s +add v14.4s, v14.4s, v7.4s +sub v7.4s, v17.4s, v16.4s +str q5, [x0, #144] +add v17.4s, v17.4s, v16.4s +str q8, [x0, #208] +sub v8.4s, v9.4s, v2.4s +str q11, [x0, #16] +add v9.4s, v9.4s, v2.4s +str q23, [x0, #80] +str q0, [x0, #528] +str q13, [x0, #592] +str q14, [x0, #656] +str q19, [x0, #720] +str q17, [x0, #784] +str q7, [x0, #848] +str q9, [x0, #912] +str q8, [x0, #976] +ldr q18, [x0, #32] +ldr q1, [x0, #48] +ldr q10, [x0, #0] +ldr q21, [x0, #16] +ldr q22, [x0, #96] +ldr q15, [x0, #112] +ldr q3, [x0, #64] +ldr q12, [x0, #80] +ldr q4, [x0, #160] +ldr q30, [x0, #176] +ldr q29, [x0, #128] +ldr q28, [x0, #144] +ldr q27, [x0, #224] +ldr q26, [x0, #240] +ldr q25, [x0, #192] +ldr q24, [x0, #208] +ldr q20, [x17, #+128] +ldr q6, [x17, #+144] +ldr q5, [x17, #+160] +ldr q16, [x17, #+176] +ldr q11, [x17, #+192] +ldr q2, [x17, #+208] +ldr q23, [x17, #+224] +ldr q0, [x17, #+240] +sqrdmulh v13.4S, v18.4S, v6.s[0] +mul v18.4S, v18.4S,v20.s[0] +sqrdmulh v14.4S, v1.4S, v6.s[0] +mul v1.4S, v1.4S,v20.s[0] +mla v18.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v22.4S, v16.s[0] +mul v22.4S, v22.4S,v5.s[0] +mla v1.4S, v14.4S, v31.s[0] +sub v14.4s, v10.4s, v18.4s +add v10.4s, v10.4s, v18.4s +sqrdmulh v18.4S, v15.4S, v16.s[0] +mul v15.4S, v15.4S,v5.s[0] +mla v22.4S, v13.4S, v31.s[0] +sub v13.4s, v21.4s, v1.4s +add v21.4s, v21.4s, v1.4s +sqrdmulh v1.4S, v21.4S, v6.s[1] +mul v21.4S, v21.4S,v20.s[1] +mla v15.4S, v18.4S, v31.s[0] +sub v18.4s, v3.4s, v22.4s +add v3.4s, v3.4s, 
v22.4s +sqrdmulh v22.4S, v13.4S, v6.s[2] +mul v13.4S, v13.4S,v20.s[2] +mla v21.4S, v1.4S, v31.s[0] +sub v1.4s, v12.4s, v15.4s +add v12.4s, v12.4s, v15.4s +sqrdmulh v15.4S, v12.4S, v16.s[1] +mul v12.4S, v12.4S,v5.s[1] +mla v13.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v21.4s +add v10.4s, v10.4s, v21.4s +sqrdmulh v6.4S, v1.4S, v16.s[2] +mul v1.4S, v1.4S,v5.s[2] +mla v12.4S, v15.4S, v31.s[0] +sub v15.4s, v14.4s, v13.4s +add v14.4s, v14.4s, v13.4s +sqrdmulh v13.4S, v4.4S, v2.s[0] +mul v4.4S, v4.4S,v11.s[0] +mla v1.4S, v6.4S, v31.s[0] +sub v6.4s, v3.4s, v12.4s +add v3.4s, v3.4s, v12.4s +sqrdmulh v16.4S, v30.4S, v2.s[0] +mul v30.4S, v30.4S,v11.s[0] +mla v4.4S, v13.4S, v31.s[0] +sub v13.4s, v18.4s, v1.4s +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v27.4S, v0.s[0] +mul v27.4S, v27.4S,v23.s[0] +mla v30.4S, v16.4S, v31.s[0] +sub v16.4s, v29.4s, v4.4s +add v29.4s, v29.4s, v4.4s +sqrdmulh v4.4S, v26.4S, v0.s[0] +mul v26.4S, v26.4S,v23.s[0] +mla v27.4S, v1.4S, v31.s[0] +sub v1.4s, v28.4s, v30.4s +add v28.4s, v28.4s, v30.4s +sqrdmulh v30.4S, v28.4S, v2.s[1] +mul v28.4S, v28.4S,v11.s[1] +mla v26.4S, v4.4S, v31.s[0] +sub v4.4s, v25.4s, v27.4s +add v25.4s, v25.4s, v27.4s +sqrdmulh v27.4S, v1.4S, v2.s[2] +mul v1.4S, v1.4S,v11.s[2] +mla v28.4S, v30.4S, v31.s[0] +sub v30.4s, v24.4s, v26.4s +add v24.4s, v24.4s, v26.4s +sqrdmulh v26.4S, v24.4S, v0.s[1] +mul v24.4S, v24.4S,v23.s[1] +mla v1.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v28.4s +add v29.4s, v29.4s, v28.4s +sqrdmulh v2.4S, v30.4S, v0.s[2] +mul v30.4S, v30.4S,v23.s[2] +mla v24.4S, v26.4S, v31.s[0] +sub v26.4s, v16.4s, v1.4s +add v16.4s, v16.4s, v1.4s +mla v30.4S, v2.4S, v31.s[0] +sub v2.4s, v25.4s, v24.4s +add v25.4s, v25.4s, v24.4s +sub v0.4s, v4.4s, v30.4s +add v4.4s, v4.4s, v30.4s +str q10, [x0, #0] +str q22, [x0, #16] +str q14, [x0, #32] +str q15, [x0, #48] +str q3, [x0, #64] +str q6, [x0, #80] +str q18, [x0, #96] +str q13, [x0, #112] +str q29, [x0, #128] +str q27, [x0, #144] +str q16, [x0, #160] +str q26, [x0, #176] +str 
q25, [x0, #192] +str q2, [x0, #208] +str q4, [x0, #224] +str q0, [x0, #240] +ldr q0, [x0, #288] +ldr q4, [x0, #304] +ldr q2, [x0, #256] +ldr q25, [x0, #272] +ldr q26, [x0, #352] +ldr q16, [x0, #368] +ldr q27, [x0, #320] +ldr q29, [x0, #336] +ldr q13, [x0, #416] +ldr q18, [x0, #432] +ldr q6, [x0, #384] +ldr q3, [x0, #400] +ldr q15, [x0, #480] +ldr q14, [x0, #496] +ldr q22, [x0, #448] +ldr q10, [x0, #464] +ldr q30, [x17, #+256] +ldr q23, [x17, #+272] +ldr q24, [x17, #+288] +ldr q1, [x17, #+304] +ldr q11, [x17, #+320] +ldr q28, [x17, #+336] +ldr q5, [x17, #+352] +ldr q12, [x17, #+368] +sqrdmulh v20.4S, v0.4S, v23.s[0] +mul v0.4S, v0.4S,v30.s[0] +sqrdmulh v21.4S, v4.4S, v23.s[0] +mul v4.4S, v4.4S,v30.s[0] +mla v0.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v26.4S, v1.s[0] +mul v26.4S, v26.4S,v24.s[0] +mla v4.4S, v21.4S, v31.s[0] +sub v21.4s, v2.4s, v0.4s +add v2.4s, v2.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v1.s[0] +mul v16.4S, v16.4S,v24.s[0] +mla v26.4S, v20.4S, v31.s[0] +sub v20.4s, v25.4s, v4.4s +add v25.4s, v25.4s, v4.4s +sqrdmulh v4.4S, v25.4S, v23.s[1] +mul v25.4S, v25.4S,v30.s[1] +mla v16.4S, v0.4S, v31.s[0] +sub v0.4s, v27.4s, v26.4s +add v27.4s, v27.4s, v26.4s +sqrdmulh v26.4S, v20.4S, v23.s[2] +mul v20.4S, v20.4S,v30.s[2] +mla v25.4S, v4.4S, v31.s[0] +sub v4.4s, v29.4s, v16.4s +add v29.4s, v29.4s, v16.4s +sqrdmulh v16.4S, v29.4S, v1.s[1] +mul v29.4S, v29.4S,v24.s[1] +mla v20.4S, v26.4S, v31.s[0] +sub v26.4s, v2.4s, v25.4s +add v2.4s, v2.4s, v25.4s +sqrdmulh v23.4S, v4.4S, v1.s[2] +mul v4.4S, v4.4S,v24.s[2] +mla v29.4S, v16.4S, v31.s[0] +sub v16.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v13.4S, v28.s[0] +mul v13.4S, v13.4S,v11.s[0] +mla v4.4S, v23.4S, v31.s[0] +sub v23.4s, v27.4s, v29.4s +add v27.4s, v27.4s, v29.4s +sqrdmulh v1.4S, v18.4S, v28.s[0] +mul v18.4S, v18.4S,v11.s[0] +mla v13.4S, v20.4S, v31.s[0] +sub v20.4s, v0.4s, v4.4s +add v0.4s, v0.4s, v4.4s +sqrdmulh v4.4S, v15.4S, v12.s[0] +mul v15.4S, v15.4S,v5.s[0] +mla v18.4S, v1.4S, v31.s[0] 
+sub v1.4s, v6.4s, v13.4s +add v6.4s, v6.4s, v13.4s +sqrdmulh v13.4S, v14.4S, v12.s[0] +mul v14.4S, v14.4S,v5.s[0] +mla v15.4S, v4.4S, v31.s[0] +sub v4.4s, v3.4s, v18.4s +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v3.4S, v28.s[1] +mul v3.4S, v3.4S,v11.s[1] +mla v14.4S, v13.4S, v31.s[0] +sub v13.4s, v22.4s, v15.4s +add v22.4s, v22.4s, v15.4s +sqrdmulh v15.4S, v4.4S, v28.s[2] +mul v4.4S, v4.4S,v11.s[2] +mla v3.4S, v18.4S, v31.s[0] +sub v18.4s, v10.4s, v14.4s +add v10.4s, v10.4s, v14.4s +sqrdmulh v14.4S, v10.4S, v12.s[1] +mul v10.4S, v10.4S,v5.s[1] +mla v4.4S, v15.4S, v31.s[0] +sub v15.4s, v6.4s, v3.4s +add v6.4s, v6.4s, v3.4s +sqrdmulh v28.4S, v18.4S, v12.s[2] +mul v18.4S, v18.4S,v5.s[2] +mla v10.4S, v14.4S, v31.s[0] +sub v14.4s, v1.4s, v4.4s +add v1.4s, v1.4s, v4.4s +mla v18.4S, v28.4S, v31.s[0] +sub v28.4s, v22.4s, v10.4s +add v22.4s, v22.4s, v10.4s +sub v12.4s, v13.4s, v18.4s +add v13.4s, v13.4s, v18.4s +str q2, [x0, #256] +str q26, [x0, #272] +str q21, [x0, #288] +str q16, [x0, #304] +str q27, [x0, #320] +str q23, [x0, #336] +str q0, [x0, #352] +str q20, [x0, #368] +str q6, [x0, #384] +str q15, [x0, #400] +str q1, [x0, #416] +str q14, [x0, #432] +str q22, [x0, #448] +str q28, [x0, #464] +str q13, [x0, #480] +str q12, [x0, #496] +ldr q12, [x0, #544] +ldr q13, [x0, #560] +ldr q28, [x0, #512] +ldr q22, [x0, #528] +ldr q14, [x0, #608] +ldr q1, [x0, #624] +ldr q15, [x0, #576] +ldr q6, [x0, #592] +ldr q20, [x0, #672] +ldr q0, [x0, #688] +ldr q23, [x0, #640] +ldr q27, [x0, #656] +ldr q16, [x0, #736] +ldr q21, [x0, #752] +ldr q26, [x0, #704] +ldr q2, [x0, #720] +ldr q18, [x17, #+384] +ldr q5, [x17, #+400] +ldr q10, [x17, #+416] +ldr q4, [x17, #+432] +ldr q11, [x17, #+448] +ldr q3, [x17, #+464] +ldr q24, [x17, #+480] +ldr q29, [x17, #+496] +sqrdmulh v30.4S, v12.4S, v5.s[0] +mul v12.4S, v12.4S,v18.s[0] +sqrdmulh v25.4S, v13.4S, v5.s[0] +mul v13.4S, v13.4S,v18.s[0] +mla v12.4S, v30.4S, v31.s[0] +sqrdmulh v30.4S, v14.4S, v4.s[0] +mul v14.4S, v14.4S,v10.s[0] +mla v13.4S, 
v25.4S, v31.s[0] +sub v25.4s, v28.4s, v12.4s +add v28.4s, v28.4s, v12.4s +sqrdmulh v12.4S, v1.4S, v4.s[0] +mul v1.4S, v1.4S,v10.s[0] +mla v14.4S, v30.4S, v31.s[0] +sub v30.4s, v22.4s, v13.4s +add v22.4s, v22.4s, v13.4s +sqrdmulh v13.4S, v22.4S, v5.s[1] +mul v22.4S, v22.4S,v18.s[1] +mla v1.4S, v12.4S, v31.s[0] +sub v12.4s, v15.4s, v14.4s +add v15.4s, v15.4s, v14.4s +sqrdmulh v14.4S, v30.4S, v5.s[2] +mul v30.4S, v30.4S,v18.s[2] +mla v22.4S, v13.4S, v31.s[0] +sub v13.4s, v6.4s, v1.4s +add v6.4s, v6.4s, v1.4s +sqrdmulh v1.4S, v6.4S, v4.s[1] +mul v6.4S, v6.4S,v10.s[1] +mla v30.4S, v14.4S, v31.s[0] +sub v14.4s, v28.4s, v22.4s +add v28.4s, v28.4s, v22.4s +sqrdmulh v5.4S, v13.4S, v4.s[2] +mul v13.4S, v13.4S,v10.s[2] +mla v6.4S, v1.4S, v31.s[0] +sub v1.4s, v25.4s, v30.4s +add v25.4s, v25.4s, v30.4s +sqrdmulh v30.4S, v20.4S, v3.s[0] +mul v20.4S, v20.4S,v11.s[0] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v15.4s, v6.4s +add v15.4s, v15.4s, v6.4s +sqrdmulh v4.4S, v0.4S, v3.s[0] +mul v0.4S, v0.4S,v11.s[0] +mla v20.4S, v30.4S, v31.s[0] +sub v30.4s, v12.4s, v13.4s +add v12.4s, v12.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v24.s[0] +mla v0.4S, v4.4S, v31.s[0] +sub v4.4s, v23.4s, v20.4s +add v23.4s, v23.4s, v20.4s +sqrdmulh v20.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v24.s[0] +mla v16.4S, v13.4S, v31.s[0] +sub v13.4s, v27.4s, v0.4s +add v27.4s, v27.4s, v0.4s +sqrdmulh v0.4S, v27.4S, v3.s[1] +mul v27.4S, v27.4S,v11.s[1] +mla v21.4S, v20.4S, v31.s[0] +sub v20.4s, v26.4s, v16.4s +add v26.4s, v26.4s, v16.4s +sqrdmulh v16.4S, v13.4S, v3.s[2] +mul v13.4S, v13.4S,v11.s[2] +mla v27.4S, v0.4S, v31.s[0] +sub v0.4s, v2.4s, v21.4s +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v2.4S, v29.s[1] +mul v2.4S, v2.4S,v24.s[1] +mla v13.4S, v16.4S, v31.s[0] +sub v16.4s, v23.4s, v27.4s +add v23.4s, v23.4s, v27.4s +sqrdmulh v3.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v24.s[2] +mla v2.4S, v21.4S, v31.s[0] +sub v21.4s, v4.4s, v13.4s +add v4.4s, v4.4s, v13.4s +mla v0.4S, v3.4S, v31.s[0] +sub 
v3.4s, v26.4s, v2.4s +add v26.4s, v26.4s, v2.4s +sub v29.4s, v20.4s, v0.4s +add v20.4s, v20.4s, v0.4s +str q28, [x0, #512] +str q14, [x0, #528] +str q25, [x0, #544] +str q1, [x0, #560] +str q15, [x0, #576] +str q5, [x0, #592] +str q12, [x0, #608] +str q30, [x0, #624] +str q23, [x0, #640] +str q16, [x0, #656] +str q4, [x0, #672] +str q21, [x0, #688] +str q26, [x0, #704] +str q3, [x0, #720] +str q20, [x0, #736] +str q29, [x0, #752] +ldr q29, [x0, #800] +ldr q20, [x0, #816] +ldr q3, [x0, #768] +ldr q26, [x0, #784] +ldr q21, [x0, #864] +ldr q4, [x0, #880] +ldr q16, [x0, #832] +ldr q23, [x0, #848] +ldr q30, [x0, #928] +ldr q12, [x0, #944] +ldr q5, [x0, #896] +ldr q15, [x0, #912] +ldr q1, [x0, #992] +ldr q25, [x0, #1008] +ldr q14, [x0, #960] +ldr q28, [x0, #976] +ldr q0, [x17, #+512] +ldr q24, [x17, #+528] +ldr q2, [x17, #+544] +ldr q13, [x17, #+560] +ldr q11, [x17, #+576] +ldr q27, [x17, #+592] +ldr q10, [x17, #+608] +ldr q6, [x17, #+624] +sqrdmulh v18.4S, v29.4S, v24.s[0] +mul v29.4S, v29.4S,v0.s[0] +sqrdmulh v22.4S, v20.4S, v24.s[0] +mul v20.4S, v20.4S,v0.s[0] +mla v29.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v21.4S, v13.s[0] +mul v21.4S, v21.4S,v2.s[0] +mla v20.4S, v22.4S, v31.s[0] +sub v22.4s, v3.4s, v29.4s +add v3.4s, v3.4s, v29.4s +sqrdmulh v29.4S, v4.4S, v13.s[0] +mul v4.4S, v4.4S,v2.s[0] +mla v21.4S, v18.4S, v31.s[0] +sub v18.4s, v26.4s, v20.4s +add v26.4s, v26.4s, v20.4s +sqrdmulh v20.4S, v26.4S, v24.s[1] +mul v26.4S, v26.4S,v0.s[1] +mla v4.4S, v29.4S, v31.s[0] +sub v29.4s, v16.4s, v21.4s +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v18.4S, v24.s[2] +mul v18.4S, v18.4S,v0.s[2] +mla v26.4S, v20.4S, v31.s[0] +sub v20.4s, v23.4s, v4.4s +add v23.4s, v23.4s, v4.4s +sqrdmulh v4.4S, v23.4S, v13.s[1] +mul v23.4S, v23.4S,v2.s[1] +mla v18.4S, v21.4S, v31.s[0] +sub v21.4s, v3.4s, v26.4s +add v3.4s, v3.4s, v26.4s +sqrdmulh v24.4S, v20.4S, v13.s[2] +mul v20.4S, v20.4S,v2.s[2] +mla v23.4S, v4.4S, v31.s[0] +sub v4.4s, v22.4s, v18.4s +add v22.4s, v22.4s, v18.4s +sqrdmulh 
v18.4S, v30.4S, v27.s[0] +mul v30.4S, v30.4S,v11.s[0] +mla v20.4S, v24.4S, v31.s[0] +sub v24.4s, v16.4s, v23.4s +add v16.4s, v16.4s, v23.4s +sqrdmulh v13.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v11.s[0] +mla v30.4S, v18.4S, v31.s[0] +sub v18.4s, v29.4s, v20.4s +add v29.4s, v29.4s, v20.4s +sqrdmulh v20.4S, v1.4S, v6.s[0] +mul v1.4S, v1.4S,v10.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v5.4s, v30.4s +add v5.4s, v5.4s, v30.4s +sqrdmulh v30.4S, v25.4S, v6.s[0] +mul v25.4S, v25.4S,v10.s[0] +mla v1.4S, v20.4S, v31.s[0] +sub v20.4s, v15.4s, v12.4s +add v15.4s, v15.4s, v12.4s +sqrdmulh v12.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v11.s[1] +mla v25.4S, v30.4S, v31.s[0] +sub v30.4s, v14.4s, v1.4s +add v14.4s, v14.4s, v1.4s +sqrdmulh v1.4S, v20.4S, v27.s[2] +mul v20.4S, v20.4S,v11.s[2] +mla v15.4S, v12.4S, v31.s[0] +sub v12.4s, v28.4s, v25.4s +add v28.4s, v28.4s, v25.4s +sqrdmulh v25.4S, v28.4S, v6.s[1] +mul v28.4S, v28.4S,v10.s[1] +mla v20.4S, v1.4S, v31.s[0] +sub v1.4s, v5.4s, v15.4s +add v5.4s, v5.4s, v15.4s +sqrdmulh v27.4S, v12.4S, v6.s[2] +mul v12.4S, v12.4S,v10.s[2] +mla v28.4S, v25.4S, v31.s[0] +sub v25.4s, v13.4s, v20.4s +add v13.4s, v13.4s, v20.4s +mla v12.4S, v27.4S, v31.s[0] +sub v27.4s, v14.4s, v28.4s +add v14.4s, v14.4s, v28.4s +sub v6.4s, v30.4s, v12.4s +add v30.4s, v30.4s, v12.4s +str q3, [x0, #768] +str q21, [x0, #784] +str q22, [x0, #800] +str q4, [x0, #816] +str q16, [x0, #832] +str q24, [x0, #848] +str q29, [x0, #864] +str q18, [x0, #880] +str q5, [x0, #896] +str q1, [x0, #912] +str q13, [x0, #928] +str q25, [x0, #944] +str q14, [x0, #960] +str q27, [x0, #976] +str q30, [x0, #992] +str q6, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, 
#(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_10.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_10.s new file mode 100644 index 0000000..807b044 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_10.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: // 16-byte constant: lane 0 is -33556993, lanes 1-3 are zero; the routine below loads it into q31 and passes v31.s[0] to its mla instructions +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: // multiplier table read in 16-byte groups: four constants (fed to mul) followed by their four companions in the same order (fed to sqrdmulh); spot-checked companions agree with round(c * 2^31 / 33556993); "Layer None" words are zero filler completing a group of four (NOTE(review): filler lanes look unused in the visible part of the routine, confirm against the generator) +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global 
ntt_u32_incomplete_neon_asm_var_4_2_7_z4_10 +.global _ntt_u32_incomplete_neon_asm_var_4_2_7_z4_10 +ntt_u32_incomplete_neon_asm_var_4_2_7_z4_10: +_ntt_u32_incomplete_neon_asm_var_4_2_7_z4_10: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #928] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #992] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q18, [x0, #800] +sqrdmulh v17.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +ldr q16, [x0, #864] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v22.4S, v21.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +mla v18.4S, v17.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +ldr q3, [x0, #544] +sqrdmulh v17.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q19, [x0, #608] +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q2, [x0, #672] +ldr q1, [x0, #416] +sqrdmulh v0.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v15.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #736] +ldr q14, [x0, #480] +sqrdmulh v13.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v12.4s, v14.4s, v20.4s +add v14.4s, v14.4s, v20.4s +ldr q20, [x0, #288] +mla v3.4S, v17.4S, v31.s[0] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v18.4s +mla v2.4S, v0.4S, v31.s[0] +mla v22.4S, v13.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +ldr q18, [x0, #352] +sqrdmulh v13.4S, v1.4S, 
v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v0.4s, v18.4s, v16.4s +sqrdmulh v17.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +add v18.4s, v18.4s, v16.4s +ldr q16, [x0, #32] +sqrdmulh v11.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v10.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #96] +sqrdmulh v9.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v8.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #160] +mla v1.4S, v13.4S, v31.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v2.4s +mla v20.4S, v11.4S, v31.s[0] +mla v18.4S, v9.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +ldr q2, [x0, #224] +sqrdmulh v9.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v11.4s, v2.4s, v22.4s +sqrdmulh v13.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v7.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +sub v6.4s, v2.4s, v14.4s +add v2.4s, v2.4s, v14.4s +mla v15.4S, v9.4S, v31.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v20.4s +mla v21.4S, v22.4S, v31.s[0] +mla v0.4S, v1.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +sub v1.4s, v3.4s, v18.4s +sqrdmulh v22.4S, v6.4S, v27.s[1] +mul v6.4S, v6.4S,v28.s[1] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v27.s[0] +mul v19.4S, v19.4S,v28.s[0] +sub v9.4s, v17.4s, v15.4s +add v17.4s, v17.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v14.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +mla v7.4S, v20.4S, v31.s[0] +mla v6.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v21.4s +mla v19.4S, v18.4S, v31.s[0] +mla v2.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v15.4s, v8.4s, v0.4s +sqrdmulh v18.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +add v8.4s, v8.4s, v0.4s +sqrdmulh 
v0.4S, v9.4S, v27.s[3] +mul v9.4S, v9.4S,v28.s[3] +sub v20.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[3] +mul v14.4S, v14.4S,v28.s[3] +sub v12.4s, v1.4s, v6.4s +add v1.4s, v1.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v16.4s, v19.4s +mla v9.4S, v0.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v25.s[2] +mul v1.4S, v1.4S,v26.s[2] +sub v7.4s, v3.4s, v2.4s +sqrdmulh v0.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v7.4S, v25.s[1] +mul v7.4S, v7.4S,v26.s[1] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v25.s[0] +mul v3.4S, v3.4S,v26.s[0] +sub v6.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v1.4S, v19.4S, v31.s[0] +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v9.4s +mla v7.4S, v2.4S, v31.s[0] +mla v3.4S, v17.4S, v31.s[0] +add v22.4s, v22.4s, v9.4s +sqrdmulh v9.4S, v8.4S, v23.s[0] +mul v8.4S, v8.4S,v24.s[0] +sub v17.4s, v15.4s, v14.4s +sqrdmulh v2.4S, v6.4S, v23.s[1] +mul v6.4S, v6.4S,v24.s[1] +add v15.4s, v15.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v23.s[2] +mul v15.4S, v15.4S,v24.s[2] +sub v19.4s, v13.4s, v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +sub v11.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +mla v8.4S, v9.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v18.4s, v7.4s +str q13, [x0, #288] +mla v15.4S, v14.4S, v31.s[0] +mla v17.4S, v1.4S, v31.s[0] +add v18.4s, v18.4s, v7.4s +str q19, [x0, #352] +ldr q19, [x0, #944] +sqrdmulh v7.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v1.4s, v16.4s, v3.4s +str q20, [x0, #416] +ldr q20, [x0, #1008] +sqrdmulh v14.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v3.4s +str q11, [x0, #480] +ldr q11, [x0, #816] +sqrdmulh v3.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +sub v13.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +ldr q8, 
[x0, #880] +sqrdmulh v9.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v12.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +mla v19.4S, v7.4S, v31.s[0] +mla v20.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v15.4s +str q18, [x0, #160] +mla v11.4S, v3.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +str q2, [x0, #224] +ldr q2, [x0, #560] +sqrdmulh v15.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v9.4s, v0.4s, v17.4s +str q16, [x0, #32] +ldr q16, [x0, #624] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +add v0.4s, v0.4s, v17.4s +str q1, [x0, #96] +ldr q1, [x0, #688] +ldr q17, [x0, #432] +sqrdmulh v18.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +sub v7.4s, v17.4s, v19.4s +add v17.4s, v17.4s, v19.4s +ldr q19, [x0, #752] +ldr q6, [x0, #496] +sqrdmulh v5.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v4.4s, v6.4s, v20.4s +add v6.4s, v6.4s, v20.4s +ldr q20, [x0, #304] +mla v2.4S, v15.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v11.4s +str q10, [x0, #544] +mla v1.4S, v18.4S, v31.s[0] +mla v19.4S, v5.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q13, [x0, #608] +ldr q13, [x0, #368] +sqrdmulh v11.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v5.4s, v13.4s, v8.4s +str q21, [x0, #672] +sqrdmulh v21.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +add v13.4s, v13.4s, v8.4s +str q12, [x0, #736] +ldr q12, [x0, #48] +sqrdmulh v8.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v18.4s, v12.4s, v2.4s +add v12.4s, v12.4s, v2.4s +ldr q2, [x0, #112] +sqrdmulh v10.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v15.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +ldr q16, [x0, #176] +mla v17.4S, v11.4S, v31.s[0] +mla v6.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v1.4s +str q22, [x0, #800] +mla v20.4S, v8.4S, v31.s[0] +mla v13.4S, v10.4S, v31.s[0] +add v16.4s, v16.4s, v1.4s +str q14, [x0, #864] +ldr q14, [x0, #240] +sqrdmulh v1.4S, v7.4S, v29.s[2] +mul v7.4S, v7.4S,v30.s[2] +sub v10.4s, v14.4s, v19.4s 
+str q0, [x0, #928] +sqrdmulh v0.4S, v4.4S, v29.s[2] +mul v4.4S, v4.4S,v30.s[2] +add v14.4s, v14.4s, v19.4s +str q9, [x0, #992] +sqrdmulh v9.4S, v3.4S, v29.s[2] +mul v3.4S, v3.4S,v30.s[2] +sub v19.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v29.s[2] +mul v5.4S, v5.4S,v30.s[2] +sub v8.4s, v14.4s, v6.4s +add v14.4s, v14.4s, v6.4s +mla v7.4S, v1.4S, v31.s[0] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v20.4s +mla v3.4S, v9.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +sub v17.4s, v2.4s, v13.4s +sqrdmulh v9.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +add v2.4s, v2.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v27.s[0] +mul v16.4S, v16.4S,v28.s[0] +sub v1.4s, v21.4s, v7.4s +add v21.4s, v21.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v6.4s, v10.4s, v4.4s +add v10.4s, v10.4s, v4.4s +mla v19.4S, v20.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v3.4s +mla v16.4S, v13.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +sub v7.4s, v15.4s, v5.4s +sqrdmulh v13.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +add v15.4s, v15.4s, v5.4s +sqrdmulh v5.4S, v1.4S, v27.s[3] +mul v1.4S, v1.4S,v28.s[3] +sub v20.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[3] +mul v6.4S, v6.4S,v28.s[3] +sub v4.4s, v17.4s, v8.4s +add v17.4s, v17.4s, v8.4s +mla v21.4S, v3.4S, v31.s[0] +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v16.4s +mla v1.4S, v5.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v25.s[2] +mul v17.4S, v17.4S,v26.s[2] +sub v19.4s, v2.4s, v14.4s +sqrdmulh v5.4S, v4.4S, v25.s[3] +mul v4.4S, v4.4S,v26.s[3] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v3.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, 
v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +mla v17.4S, v16.4S, v31.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v9.4s, v1.4s +mla v19.4S, v14.4S, v31.s[0] +mla v2.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v23.s[0] +mul v15.4S, v15.4S,v24.s[0] +sub v21.4s, v7.4s, v6.4s +sqrdmulh v14.4S, v8.4S, v23.s[1] +mul v8.4S, v8.4S,v24.s[1] +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v7.4S, v23.s[2] +mul v7.4S, v7.4S,v24.s[2] +sub v16.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v23.s[3] +mul v21.4S, v21.4S,v24.s[3] +sub v10.4s, v20.4s, v4.4s +add v20.4s, v20.4s, v4.4s +mla v15.4S, v1.4S, v31.s[0] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v19.4s +str q0, [x0, #304] +mla v7.4S, v6.4S, v31.s[0] +mla v21.4S, v17.4S, v31.s[0] +add v13.4s, v13.4s, v19.4s +str q16, [x0, #368] +ldr q16, [x0, #896] +sqrdmulh v19.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v17.4s, v12.4s, v2.4s +str q20, [x0, #432] +ldr q20, [x0, #960] +sqrdmulh v6.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v12.4s, v12.4s, v2.4s +str q10, [x0, #496] +ldr q10, [x0, #768] +sqrdmulh v2.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +sub v0.4s, v18.4s, v15.4s +add v18.4s, v18.4s, v15.4s +ldr q15, [x0, #832] +sqrdmulh v1.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +sub v4.4s, v3.4s, v8.4s +add v3.4s, v3.4s, v8.4s +mla v16.4S, v19.4S, v31.s[0] +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v7.4s +str q13, [x0, #176] +mla v10.4S, v2.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v7.4s +str q14, [x0, #240] +ldr q14, [x0, #512] +sqrdmulh v7.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v1.4s, v5.4s, v21.4s +str q12, [x0, #48] +ldr q12, [x0, #576] +sqrdmulh v2.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +add v5.4s, v5.4s, v21.4s +str q17, [x0, #112] +ldr q17, [x0, #640] +ldr q21, [x0, #384] +sqrdmulh v13.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] 
+sub v19.4s, v21.4s, v16.4s +add v21.4s, v21.4s, v16.4s +ldr q16, [x0, #704] +ldr q8, [x0, #448] +sqrdmulh v22.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v11.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +ldr q20, [x0, #256] +mla v14.4S, v7.4S, v31.s[0] +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v10.4s +str q18, [x0, #560] +mla v17.4S, v13.4S, v31.s[0] +mla v16.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +str q0, [x0, #624] +ldr q0, [x0, #320] +sqrdmulh v10.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v22.4s, v0.4s, v15.4s +str q3, [x0, #688] +sqrdmulh v3.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +add v0.4s, v0.4s, v15.4s +str q4, [x0, #752] +ldr q4, [x0, #0] +sqrdmulh v15.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v13.4s, v4.4s, v14.4s +add v4.4s, v4.4s, v14.4s +ldr q14, [x0, #64] +sqrdmulh v18.4S, v0.4S, v29.s[1] +mul v0.4S, v0.4S,v30.s[1] +sub v7.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +ldr q12, [x0, #128] +mla v21.4S, v10.4S, v31.s[0] +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v12.4s, v17.4s +str q9, [x0, #816] +mla v20.4S, v15.4S, v31.s[0] +mla v0.4S, v18.4S, v31.s[0] +add v12.4s, v12.4s, v17.4s +str q6, [x0, #880] +ldr q6, [x0, #192] +sqrdmulh v17.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v18.4s, v6.4s, v16.4s +str q5, [x0, #944] +sqrdmulh v5.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +add v6.4s, v6.4s, v16.4s +str q1, [x0, #1008] +sqrdmulh v1.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v16.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v15.4s, v6.4s, v8.4s +add v6.4s, v6.4s, v8.4s +mla v19.4S, v17.4S, v31.s[0] +mla v11.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v20.4s +mla v2.4S, v1.4S, v31.s[0] +mla v22.4S, v21.4S, v31.s[0] +add v4.4s, v4.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +sub v21.4s, v14.4s, v0.4s +sqrdmulh v1.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +add v14.4s, 
v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v17.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[0] +mul v6.4S, v6.4S,v28.s[0] +sub v8.4s, v18.4s, v11.4s +add v18.4s, v18.4s, v11.4s +mla v16.4S, v20.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v2.4s +mla v12.4S, v0.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v27.s[2] +mul v3.4S, v3.4S,v28.s[2] +sub v19.4s, v7.4s, v22.4s +sqrdmulh v0.4S, v18.4S, v27.s[2] +mul v18.4S, v18.4S,v28.s[2] +add v7.4s, v7.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +sub v20.4s, v5.4s, v16.4s +add v5.4s, v5.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v11.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +mla v3.4S, v2.4S, v31.s[0] +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v4.4s, v12.4s +mla v17.4S, v22.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v4.4s, v4.4s, v12.4s +sqrdmulh v12.4S, v21.4S, v25.s[2] +mul v21.4S, v21.4S,v26.s[2] +sub v16.4s, v14.4s, v6.4s +sqrdmulh v22.4S, v11.4S, v25.s[3] +mul v11.4S, v11.4S,v26.s[3] +add v14.4s, v14.4s, v6.4s +sqrdmulh v6.4S, v16.4S, v25.s[1] +mul v16.4S, v16.4S,v26.s[1] +sub v2.4s, v13.4s, v3.4s +add v13.4s, v13.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v25.s[0] +mul v14.4S, v14.4S,v26.s[0] +sub v15.4s, v7.4s, v18.4s +add v7.4s, v7.4s, v18.4s +mla v21.4S, v12.4S, v31.s[0] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v17.4s +mla v16.4S, v6.4S, v31.s[0] +mla v14.4S, v3.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v7.4S, v23.s[0] +mul v7.4S, v7.4S,v24.s[0] +sub v3.4s, v19.4s, v8.4s +sqrdmulh v6.4S, v15.4S, v23.s[1] +mul v15.4S, v15.4S,v24.s[1] +add v19.4s, v19.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v23.s[2] +mul v19.4S, v19.4S,v24.s[2] +sub v12.4s, v5.4s, v21.4s +add v5.4s, v5.4s, v21.4s +sqrdmulh v21.4S, v3.4S, v23.s[3] +mul v3.4S, v3.4S,v24.s[3] +sub v18.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +mla 
v7.4S, v17.4S, v31.s[0] +mla v15.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v16.4s +str q5, [x0, #256] +mla v19.4S, v8.4S, v31.s[0] +mla v3.4S, v21.4S, v31.s[0] +add v0.4s, v0.4s, v16.4s +str q12, [x0, #320] +ldr q12, [x0, #912] +sqrdmulh v16.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v21.4s, v4.4s, v14.4s +str q20, [x0, #384] +ldr q20, [x0, #976] +sqrdmulh v8.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v4.4s, v4.4s, v14.4s +str q18, [x0, #448] +ldr q18, [x0, #784] +sqrdmulh v14.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +sub v5.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +ldr q7, [x0, #848] +sqrdmulh v17.4S, v7.4S, v29.s[0] +mul v7.4S, v7.4S,v30.s[0] +sub v11.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +mla v12.4S, v16.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v1.4s, v19.4s +str q0, [x0, #128] +mla v18.4S, v14.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v19.4s +str q6, [x0, #192] +ldr q6, [x0, #528] +sqrdmulh v19.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v17.4s, v22.4s, v3.4s +str q4, [x0, #0] +ldr q4, [x0, #592] +sqrdmulh v14.4S, v4.4S, v29.s[0] +mul v4.4S, v4.4S,v30.s[0] +add v22.4s, v22.4s, v3.4s +str q21, [x0, #64] +ldr q21, [x0, #656] +ldr q3, [x0, #400] +sqrdmulh v0.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v16.4s, v3.4s, v12.4s +add v3.4s, v3.4s, v12.4s +ldr q12, [x0, #720] +ldr q15, [x0, #464] +sqrdmulh v9.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v10.4s, v15.4s, v20.4s +add v15.4s, v15.4s, v20.4s +ldr q20, [x0, #272] +mla v6.4S, v19.4S, v31.s[0] +mla v4.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v18.4s +str q13, [x0, #512] +mla v21.4S, v0.4S, v31.s[0] +mla v12.4S, v9.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +str q5, [x0, #576] +ldr q5, [x0, #336] +sqrdmulh v18.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v9.4s, v5.4s, v7.4s +str q2, [x0, #640] +sqrdmulh v2.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +add v5.4s, v5.4s, v7.4s +str q11, [x0, #704] +ldr 
q11, [x0, #16] +sqrdmulh v7.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v0.4s, v11.4s, v6.4s +add v11.4s, v11.4s, v6.4s +ldr q6, [x0, #80] +sqrdmulh v13.4S, v5.4S, v29.s[1] +mul v5.4S, v5.4S,v30.s[1] +sub v19.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +ldr q4, [x0, #144] +mla v3.4S, v18.4S, v31.s[0] +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v4.4s, v21.4s +str q1, [x0, #768] +mla v20.4S, v7.4S, v31.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v4.4s, v4.4s, v21.4s +str q8, [x0, #832] +ldr q8, [x0, #208] +sqrdmulh v21.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +sub v13.4s, v8.4s, v12.4s +str q22, [x0, #896] +sqrdmulh v22.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +add v8.4s, v8.4s, v12.4s +str q17, [x0, #960] +sqrdmulh v17.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v12.4s, v4.4s, v3.4s +add v4.4s, v4.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v7.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +mla v16.4S, v21.4S, v31.s[0] +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v20.4s +mla v14.4S, v17.4S, v31.s[0] +mla v9.4S, v3.4S, v31.s[0] +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v12.4S, v27.s[1] +mul v12.4S, v12.4S,v28.s[1] +sub v3.4s, v6.4s, v5.4s +sqrdmulh v17.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v27.s[0] +mul v4.4S, v4.4S,v28.s[0] +sub v21.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[0] +mul v8.4S, v8.4S,v28.s[0] +sub v15.4s, v13.4s, v10.4s +add v13.4s, v13.4s, v10.4s +mla v12.4S, v20.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v14.4s +mla v4.4S, v5.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v16.4s, v19.4s, v9.4s +sqrdmulh v5.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v22.4s, v12.4s +add v22.4s, v22.4s, 
v12.4s +sqrdmulh v12.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v10.4s, v3.4s, v7.4s +add v3.4s, v3.4s, v7.4s +mla v2.4S, v14.4S, v31.s[0] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v4.4s +mla v21.4S, v9.4S, v31.s[0] +mla v15.4S, v12.4S, v31.s[0] +add v11.4s, v11.4s, v4.4s +sqrdmulh v4.4S, v3.4S, v25.s[2] +mul v3.4S, v3.4S,v26.s[2] +sub v12.4s, v6.4s, v8.4s +sqrdmulh v9.4S, v10.4S, v25.s[3] +mul v10.4S, v10.4S,v26.s[3] +add v6.4s, v6.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +sub v14.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v6.4S, v25.s[0] +mul v6.4S, v6.4S,v26.s[0] +sub v7.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +mla v3.4S, v4.4S, v31.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v21.4s +mla v12.4S, v8.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v23.s[0] +mul v19.4S, v19.4S,v24.s[0] +sub v2.4s, v16.4s, v15.4s +sqrdmulh v8.4S, v7.4S, v23.s[1] +mul v7.4S, v7.4S,v24.s[1] +add v16.4s, v16.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +sub v4.4s, v22.4s, v3.4s +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v2.4S, v23.s[3] +mul v2.4S, v2.4S,v24.s[3] +sub v13.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +mla v19.4S, v21.4S, v31.s[0] +mla v7.4S, v8.4S, v31.s[0] +sub v8.4s, v5.4s, v12.4s +str q22, [x0, #272] +mla v16.4S, v15.4S, v31.s[0] +mla v2.4S, v3.4S, v31.s[0] +add v5.4s, v5.4s, v12.4s +str q4, [x0, #336] +sub v23.4s, v11.4s, v6.4s +str q20, [x0, #400] +add v11.4s, v11.4s, v6.4s +str q13, [x0, #464] +sub v13.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sub v19.4s, v14.4s, v7.4s +add v14.4s, v14.4s, v7.4s +sub v7.4s, v17.4s, v16.4s +str q5, [x0, #144] +add v17.4s, v17.4s, v16.4s +str q8, [x0, #208] +sub v8.4s, v9.4s, v2.4s +str q11, [x0, #16] +add v9.4s, v9.4s, v2.4s +str q23, [x0, #80] +str q0, [x0, #528] +str q13, [x0, #592] +str q14, [x0, #656] +str q19, [x0, #720] +str q17, [x0, #784] +str q7, [x0, #848] 
+str q9, [x0, #912] +str q8, [x0, #976] +ldr q18, [x0, #224] +ldr q1, [x0, #160] +ldr q10, [x0, #32] +ldr q21, [x17, #+128] +ldr q22, [x17, #+144] +sqrdmulh v15.4S, v10.4S, v22.s[0] +mul v10.4S, v10.4S,v21.s[0] +ldr q3, [x0, #48] +ldr q12, [x17, #+160] +sqrdmulh v4.4S, v3.4S, v22.s[0] +mul v3.4S, v3.4S,v21.s[0] +ldr q30, [x17, #+176] +ldr q29, [x0, #96] +sqrdmulh v28.4S, v29.4S, v30.s[0] +mul v29.4S, v29.4S,v12.s[0] +ldr q27, [x0, #112] +sqrdmulh v26.4S, v27.4S, v30.s[0] +mul v27.4S, v27.4S,v12.s[0] +ldr q25, [x17, #+192] +ldr q24, [x17, #+208] +mla v10.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v1.4S, v24.s[0] +ldr q20, [x0, #176] +mla v3.4S, v4.4S, v31.s[0] +sqrdmulh v4.4S, v20.4S, v24.s[0] +ldr q6, [x17, #+224] +ldr q5, [x17, #+240] +mla v29.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v18.4S, v5.s[0] +ldr q16, [x0, #240] +mla v27.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v16.4S, v5.s[0] +ldr q11, [x0, #0] +ldr q2, [x0, #128] +mul v1.4S, v1.4S,v25.s[0] +mul v20.4S, v20.4S,v25.s[0] +ldr q23, [x0, #16] +ldr q0, [x0, #144] +mla v1.4S, v15.4S, v31.s[0] +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v11.4s, v10.4s +ldr q15, [x0, #64] +add v11.4s, v11.4s, v10.4s +ldr q10, [x0, #192] +mul v18.4S, v18.4S,v6.s[0] +mul v16.4S, v16.4S,v6.s[0] +sub v13.4s, v23.4s, v3.4s +ldr q14, [x0, #80] +add v23.4s, v23.4s, v3.4s +ldr q3, [x0, #208] +mla v18.4S, v28.4S, v31.s[0] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v29.4s +add v15.4s, v15.4s, v29.4s +sqrdmulh v29.4S, v23.4S, v22.s[1] +mul v23.4S, v23.4S,v21.s[1] +sub v28.4s, v14.4s, v27.4s +add v14.4s, v14.4s, v27.4s +sqrdmulh v27.4S, v13.4S, v22.s[2] +mul v13.4S, v13.4S,v21.s[2] +sub v19.4s, v2.4s, v1.4s +add v2.4s, v2.4s, v1.4s +sqrdmulh v22.4S, v14.4S, v30.s[1] +mul v14.4S, v14.4S,v12.s[1] +sub v1.4s, v0.4s, v20.4s +add v0.4s, v0.4s, v20.4s +sqrdmulh v20.4S, v28.4S, v30.s[2] +mul v28.4S, v28.4S,v12.s[2] +sub v21.4s, v10.4s, v18.4s +add v10.4s, v10.4s, v18.4s +mla v23.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v0.4S, v24.s[1] +sub v30.4s, 
v3.4s, v16.4s +ldr q18, [x0, #480] +add v3.4s, v3.4s, v16.4s +mla v13.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v1.4S, v24.s[2] +sub v16.4s, v11.4s, v23.4s +ldr q12, [x0, #416] +str q16, [x0, #16] +mla v14.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v3.4S, v5.s[1] +add v11.4s, v11.4s, v23.4s +ldr q23, [x0, #288] +str q11, [x0, #0] +mla v28.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v30.4S, v5.s[2] +sub v11.4s, v4.4s, v13.4s +ldr q16, [x17, #+256] +str q11, [x0, #48] +mul v0.4S, v0.4S,v25.s[1] +mul v1.4S, v1.4S,v25.s[2] +add v4.4s, v4.4s, v13.4s +str q4, [x0, #32] +ldr q4, [x17, #+272] +mla v0.4S, v29.4S, v31.s[0] +mla v1.4S, v27.4S, v31.s[0] +sub v27.4s, v15.4s, v14.4s +str q27, [x0, #80] +mul v3.4S, v3.4S,v6.s[1] +mul v30.4S, v30.4S,v6.s[2] +add v15.4s, v15.4s, v14.4s +str q15, [x0, #64] +mla v3.4S, v22.4S, v31.s[0] +mla v30.4S, v20.4S, v31.s[0] +sub v20.4s, v26.4s, v28.4s +str q20, [x0, #112] +sqrdmulh v5.4S, v23.4S, v4.s[0] +mul v23.4S, v23.4S,v16.s[0] +add v26.4s, v26.4s, v28.4s +ldr q28, [x0, #304] +str q26, [x0, #96] +ldr q26, [x17, #+288] +sqrdmulh v20.4S, v28.4S, v4.s[0] +mul v28.4S, v28.4S,v16.s[0] +sub v6.4s, v2.4s, v0.4s +ldr q22, [x17, #+304] +str q6, [x0, #144] +ldr q6, [x0, #352] +sqrdmulh v15.4S, v6.4S, v22.s[0] +mul v6.4S, v6.4S,v26.s[0] +add v2.4s, v2.4s, v0.4s +str q2, [x0, #128] +ldr q2, [x0, #368] +sqrdmulh v0.4S, v2.4S, v22.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v14.4s, v19.4s, v1.4s +ldr q24, [x17, #+320] +str q14, [x0, #176] +ldr q14, [x17, #+336] +mla v23.4S, v5.4S, v31.s[0] +sqrdmulh v5.4S, v12.4S, v14.s[0] +add v19.4s, v19.4s, v1.4s +ldr q1, [x0, #432] +str q19, [x0, #160] +mla v28.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v1.4S, v14.s[0] +sub v19.4s, v10.4s, v3.4s +ldr q27, [x17, #+352] +str q19, [x0, #208] +ldr q19, [x17, #+368] +mla v6.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v18.4S, v19.s[0] +add v10.4s, v10.4s, v3.4s +str q10, [x0, #192] +ldr q10, [x0, #496] +mla v2.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v10.4S, v19.s[0] +sub v3.4s, v21.4s, v30.4s +ldr 
q25, [x0, #256] +str q3, [x0, #240] +ldr q3, [x0, #384] +mul v12.4S, v12.4S,v24.s[0] +mul v1.4S, v1.4S,v24.s[0] +add v21.4s, v21.4s, v30.4s +ldr q30, [x0, #272] +str q21, [x0, #224] +ldr q21, [x0, #400] +mla v12.4S, v5.4S, v31.s[0] +mla v1.4S, v20.4S, v31.s[0] +sub v20.4s, v25.4s, v23.4s +ldr q5, [x0, #320] +add v25.4s, v25.4s, v23.4s +ldr q23, [x0, #448] +mul v18.4S, v18.4S,v27.s[0] +mul v10.4S, v10.4S,v27.s[0] +sub v29.4s, v30.4s, v28.4s +ldr q13, [x0, #336] +add v30.4s, v30.4s, v28.4s +ldr q28, [x0, #464] +mla v18.4S, v15.4S, v31.s[0] +mla v10.4S, v0.4S, v31.s[0] +sub v0.4s, v5.4s, v6.4s +add v5.4s, v5.4s, v6.4s +sqrdmulh v6.4S, v30.4S, v4.s[1] +mul v30.4S, v30.4S,v16.s[1] +sub v15.4s, v13.4s, v2.4s +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v29.4S, v4.s[2] +mul v29.4S, v29.4S,v16.s[2] +sub v11.4s, v3.4s, v12.4s +add v3.4s, v3.4s, v12.4s +sqrdmulh v4.4S, v13.4S, v22.s[1] +mul v13.4S, v13.4S,v26.s[1] +sub v12.4s, v21.4s, v1.4s +add v21.4s, v21.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v22.s[2] +mul v15.4S, v15.4S,v26.s[2] +sub v16.4s, v23.4s, v18.4s +add v23.4s, v23.4s, v18.4s +mla v30.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v21.4S, v14.s[1] +sub v22.4s, v28.4s, v10.4s +ldr q18, [x0, #736] +add v28.4s, v28.4s, v10.4s +mla v29.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v12.4S, v14.s[2] +sub v10.4s, v25.4s, v30.4s +ldr q26, [x0, #672] +str q10, [x0, #272] +mla v13.4S, v4.4S, v31.s[0] +sqrdmulh v4.4S, v28.4S, v19.s[1] +add v25.4s, v25.4s, v30.4s +ldr q30, [x0, #544] +str q25, [x0, #256] +mla v15.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v22.4S, v19.s[2] +sub v25.4s, v20.4s, v29.4s +ldr q10, [x17, #+384] +str q25, [x0, #304] +mul v21.4S, v21.4S,v24.s[1] +mul v12.4S, v12.4S,v24.s[2] +add v20.4s, v20.4s, v29.4s +str q20, [x0, #288] +ldr q20, [x17, #+400] +mla v21.4S, v6.4S, v31.s[0] +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v5.4s, v13.4s +str q2, [x0, #336] +mul v28.4S, v28.4S,v27.s[1] +mul v22.4S, v22.4S,v27.s[2] +add v5.4s, v5.4s, v13.4s +str q5, [x0, #320] +mla v28.4S, v4.4S, v31.s[0] 
+mla v22.4S, v1.4S, v31.s[0] +sub v1.4s, v0.4s, v15.4s +str q1, [x0, #368] +sqrdmulh v19.4S, v30.4S, v20.s[0] +mul v30.4S, v30.4S,v10.s[0] +add v0.4s, v0.4s, v15.4s +ldr q15, [x0, #560] +str q0, [x0, #352] +ldr q0, [x17, #+416] +sqrdmulh v1.4S, v15.4S, v20.s[0] +mul v15.4S, v15.4S,v10.s[0] +sub v27.4s, v3.4s, v21.4s +ldr q4, [x17, #+432] +str q27, [x0, #400] +ldr q27, [x0, #608] +sqrdmulh v5.4S, v27.4S, v4.s[0] +mul v27.4S, v27.4S,v0.s[0] +add v3.4s, v3.4s, v21.4s +str q3, [x0, #384] +ldr q3, [x0, #624] +sqrdmulh v21.4S, v3.4S, v4.s[0] +mul v3.4S, v3.4S,v0.s[0] +sub v13.4s, v11.4s, v12.4s +ldr q14, [x17, #+448] +str q13, [x0, #432] +ldr q13, [x17, #+464] +mla v30.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v26.4S, v13.s[0] +add v11.4s, v11.4s, v12.4s +ldr q12, [x0, #688] +str q11, [x0, #416] +mla v15.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v12.4S, v13.s[0] +sub v11.4s, v23.4s, v28.4s +ldr q2, [x17, #+480] +str q11, [x0, #464] +ldr q11, [x17, #+496] +mla v27.4S, v5.4S, v31.s[0] +sqrdmulh v5.4S, v18.4S, v11.s[0] +add v23.4s, v23.4s, v28.4s +str q23, [x0, #448] +ldr q23, [x0, #752] +mla v3.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v23.4S, v11.s[0] +sub v28.4s, v16.4s, v22.4s +ldr q24, [x0, #512] +str q28, [x0, #496] +ldr q28, [x0, #640] +mul v26.4S, v26.4S,v14.s[0] +mul v12.4S, v12.4S,v14.s[0] +add v16.4s, v16.4s, v22.4s +ldr q22, [x0, #528] +str q16, [x0, #480] +ldr q16, [x0, #656] +mla v26.4S, v19.4S, v31.s[0] +mla v12.4S, v1.4S, v31.s[0] +sub v1.4s, v24.4s, v30.4s +ldr q19, [x0, #576] +add v24.4s, v24.4s, v30.4s +ldr q30, [x0, #704] +mul v18.4S, v18.4S,v2.s[0] +mul v23.4S, v23.4S,v2.s[0] +sub v6.4s, v22.4s, v15.4s +ldr q29, [x0, #592] +add v22.4s, v22.4s, v15.4s +ldr q15, [x0, #720] +mla v18.4S, v5.4S, v31.s[0] +mla v23.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v27.4s +add v19.4s, v19.4s, v27.4s +sqrdmulh v27.4S, v22.4S, v20.s[1] +mul v22.4S, v22.4S,v10.s[1] +sub v5.4s, v29.4s, v3.4s +add v29.4s, v29.4s, v3.4s +sqrdmulh v3.4S, v6.4S, v20.s[2] +mul v6.4S, v6.4S,v10.s[2] +sub 
v25.4s, v28.4s, v26.4s +add v28.4s, v28.4s, v26.4s +sqrdmulh v20.4S, v29.4S, v4.s[1] +mul v29.4S, v29.4S,v0.s[1] +sub v26.4s, v16.4s, v12.4s +add v16.4s, v16.4s, v12.4s +sqrdmulh v12.4S, v5.4S, v4.s[2] +mul v5.4S, v5.4S,v0.s[2] +sub v10.4s, v30.4s, v18.4s +add v30.4s, v30.4s, v18.4s +mla v22.4S, v27.4S, v31.s[0] +sqrdmulh v27.4S, v16.4S, v13.s[1] +sub v4.4s, v15.4s, v23.4s +ldr q18, [x0, #992] +add v15.4s, v15.4s, v23.4s +mla v6.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v26.4S, v13.s[2] +sub v23.4s, v24.4s, v22.4s +ldr q0, [x0, #928] +str q23, [x0, #528] +mla v29.4S, v20.4S, v31.s[0] +sqrdmulh v20.4S, v15.4S, v11.s[1] +add v24.4s, v24.4s, v22.4s +ldr q22, [x0, #800] +str q24, [x0, #512] +mla v5.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v4.4S, v11.s[2] +sub v24.4s, v1.4s, v6.4s +ldr q23, [x17, #+512] +str q24, [x0, #560] +mul v16.4S, v16.4S,v14.s[1] +mul v26.4S, v26.4S,v14.s[2] +add v1.4s, v1.4s, v6.4s +str q1, [x0, #544] +ldr q1, [x17, #+528] +mla v16.4S, v27.4S, v31.s[0] +mla v26.4S, v3.4S, v31.s[0] +sub v3.4s, v19.4s, v29.4s +str q3, [x0, #592] +mul v15.4S, v15.4S,v2.s[1] +mul v4.4S, v4.4S,v2.s[2] +add v19.4s, v19.4s, v29.4s +str q19, [x0, #576] +mla v15.4S, v20.4S, v31.s[0] +mla v4.4S, v12.4S, v31.s[0] +sub v12.4s, v21.4s, v5.4s +str q12, [x0, #624] +sqrdmulh v11.4S, v22.4S, v1.s[0] +mul v22.4S, v22.4S,v23.s[0] +add v21.4s, v21.4s, v5.4s +ldr q5, [x0, #816] +str q21, [x0, #608] +ldr q21, [x17, #+544] +sqrdmulh v12.4S, v5.4S, v1.s[0] +mul v5.4S, v5.4S,v23.s[0] +sub v2.4s, v28.4s, v16.4s +ldr q20, [x17, #+560] +str q2, [x0, #656] +ldr q2, [x0, #864] +sqrdmulh v19.4S, v2.4S, v20.s[0] +mul v2.4S, v2.4S,v21.s[0] +add v28.4s, v28.4s, v16.4s +str q28, [x0, #640] +ldr q28, [x0, #880] +sqrdmulh v16.4S, v28.4S, v20.s[0] +mul v28.4S, v28.4S,v21.s[0] +sub v29.4s, v25.4s, v26.4s +ldr q13, [x17, #+576] +str q29, [x0, #688] +ldr q29, [x17, #+592] +mla v22.4S, v11.4S, v31.s[0] +sqrdmulh v11.4S, v0.4S, v29.s[0] +add v25.4s, v25.4s, v26.4s +ldr q26, [x0, #944] +str q25, [x0, #672] +mla 
v5.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v26.4S, v29.s[0] +sub v25.4s, v30.4s, v15.4s +ldr q3, [x17, #+608] +str q25, [x0, #720] +ldr q25, [x17, #+624] +mla v2.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v18.4S, v25.s[0] +add v30.4s, v30.4s, v15.4s +str q30, [x0, #704] +ldr q30, [x0, #1008] +mla v28.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v30.4S, v25.s[0] +sub v15.4s, v10.4s, v4.4s +ldr q14, [x0, #768] +str q15, [x0, #752] +ldr q15, [x0, #896] +mul v0.4S, v0.4S,v13.s[0] +mul v26.4S, v26.4S,v13.s[0] +add v10.4s, v10.4s, v4.4s +ldr q4, [x0, #784] +str q10, [x0, #736] +ldr q10, [x0, #912] +mla v0.4S, v11.4S, v31.s[0] +mla v26.4S, v12.4S, v31.s[0] +sub v12.4s, v14.4s, v22.4s +ldr q11, [x0, #832] +add v14.4s, v14.4s, v22.4s +ldr q22, [x0, #960] +mul v18.4S, v18.4S,v3.s[0] +mul v30.4S, v30.4S,v3.s[0] +sub v27.4s, v4.4s, v5.4s +ldr q6, [x0, #848] +add v4.4s, v4.4s, v5.4s +ldr q5, [x0, #976] +mla v18.4S, v19.4S, v31.s[0] +mla v30.4S, v16.4S, v31.s[0] +sub v16.4s, v11.4s, v2.4s +add v11.4s, v11.4s, v2.4s +sqrdmulh v2.4S, v4.4S, v1.s[1] +mul v4.4S, v4.4S,v23.s[1] +sub v19.4s, v6.4s, v28.4s +add v6.4s, v6.4s, v28.4s +sqrdmulh v28.4S, v27.4S, v1.s[2] +mul v27.4S, v27.4S,v23.s[2] +sub v24.4s, v15.4s, v0.4s +add v15.4s, v15.4s, v0.4s +sqrdmulh v1.4S, v6.4S, v20.s[1] +mul v6.4S, v6.4S,v21.s[1] +sub v0.4s, v10.4s, v26.4s +add v10.4s, v10.4s, v26.4s +sqrdmulh v26.4S, v19.4S, v20.s[2] +mul v19.4S, v19.4S,v21.s[2] +sub v23.4s, v22.4s, v18.4s +add v22.4s, v22.4s, v18.4s +mla v4.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v10.4S, v29.s[1] +sub v20.4s, v5.4s, v30.4s +add v5.4s, v5.4s, v30.4s +mla v27.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v0.4S, v29.s[2] +sub v30.4s, v14.4s, v4.4s +str q30, [x0, #784] +mla v6.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v5.4S, v25.s[1] +add v14.4s, v14.4s, v4.4s +str q14, [x0, #768] +mla v19.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v20.4S, v25.s[2] +sub v14.4s, v12.4s, v27.4s +str q14, [x0, #816] +mul v10.4S, v10.4S,v13.s[1] +mul v0.4S, v0.4S,v13.s[2] +add v12.4s, v12.4s, 
v27.4s +str q12, [x0, #800] +mla v10.4S, v2.4S, v31.s[0] +mla v0.4S, v28.4S, v31.s[0] +sub v28.4s, v11.4s, v6.4s +str q28, [x0, #848] +mul v5.4S, v5.4S,v3.s[1] +mul v20.4S, v20.4S,v3.s[2] +add v11.4s, v11.4s, v6.4s +str q11, [x0, #832] +mla v5.4S, v1.4S, v31.s[0] +mla v20.4S, v26.4S, v31.s[0] +sub v26.4s, v16.4s, v19.4s +str q26, [x0, #880] +add v16.4s, v16.4s, v19.4s +str q16, [x0, #864] +sub v16.4s, v15.4s, v10.4s +str q16, [x0, #912] +add v15.4s, v15.4s, v10.4s +str q15, [x0, #896] +sub v15.4s, v24.4s, v0.4s +str q15, [x0, #944] +add v24.4s, v24.4s, v0.4s +str q24, [x0, #928] +sub v24.4s, v22.4s, v5.4s +str q24, [x0, #976] +add v22.4s, v22.4s, v5.4s +str q22, [x0, #960] +sub v22.4s, v23.4s, v20.4s +str q22, [x0, #1008] +add v23.4s, v23.4s, v20.4s +str q23, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_2.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_2.s new file mode 100644 index 0000000..b48ef69 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_2.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit 
persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // 
Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 
4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 
+.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global ntt_u32_incomplete_neon_asm_var_4_2_7_z4_2 +.global _ntt_u32_incomplete_neon_asm_var_4_2_7_z4_2 +ntt_u32_incomplete_neon_asm_var_4_2_7_z4_2: +_ntt_u32_incomplete_neon_asm_var_4_2_7_z4_2: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #928] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #992] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q18, [x0, #800] +sqrdmulh v17.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +ldr q16, [x0, #864] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v22.4S, v21.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] 
+mla v18.4S, v17.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +ldr q3, [x0, #544] +sqrdmulh v17.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q19, [x0, #608] +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q2, [x0, #672] +ldr q1, [x0, #416] +sqrdmulh v0.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v15.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #736] +ldr q14, [x0, #480] +sqrdmulh v13.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v12.4s, v14.4s, v20.4s +add v14.4s, v14.4s, v20.4s +ldr q20, [x0, #288] +mla v3.4S, v17.4S, v31.s[0] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v18.4s +mla v2.4S, v0.4S, v31.s[0] +mla v22.4S, v13.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +ldr q18, [x0, #352] +sqrdmulh v13.4S, v1.4S, v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v0.4s, v18.4s, v16.4s +sqrdmulh v17.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +add v18.4s, v18.4s, v16.4s +ldr q16, [x0, #32] +sqrdmulh v11.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v10.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #96] +sqrdmulh v9.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v8.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #160] +mla v1.4S, v13.4S, v31.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v2.4s +mla v20.4S, v11.4S, v31.s[0] +mla v18.4S, v9.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +ldr q2, [x0, #224] +sqrdmulh v9.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v11.4s, v2.4s, v22.4s +sqrdmulh v13.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v7.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +sub v6.4s, v2.4s, v14.4s +add v2.4s, v2.4s, v14.4s +mla v15.4S, v9.4S, v31.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v20.4s +mla v21.4S, v22.4S, v31.s[0] +mla v0.4S, v1.4S, v31.s[0] +add v16.4s, v16.4s, 
v20.4s +sqrdmulh v20.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +sub v1.4s, v3.4s, v18.4s +sqrdmulh v22.4S, v6.4S, v27.s[1] +mul v6.4S, v6.4S,v28.s[1] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v27.s[0] +mul v19.4S, v19.4S,v28.s[0] +sub v9.4s, v17.4s, v15.4s +add v17.4s, v17.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v14.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +mla v7.4S, v20.4S, v31.s[0] +mla v6.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v21.4s +mla v19.4S, v18.4S, v31.s[0] +mla v2.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v15.4s, v8.4s, v0.4s +sqrdmulh v18.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +add v8.4s, v8.4s, v0.4s +sqrdmulh v0.4S, v9.4S, v27.s[3] +mul v9.4S, v9.4S,v28.s[3] +sub v20.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[3] +mul v14.4S, v14.4S,v28.s[3] +sub v12.4s, v1.4s, v6.4s +add v1.4s, v1.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v16.4s, v19.4s +mla v9.4S, v0.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v25.s[2] +mul v1.4S, v1.4S,v26.s[2] +sub v7.4s, v3.4s, v2.4s +sqrdmulh v0.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v7.4S, v25.s[1] +mul v7.4S, v7.4S,v26.s[1] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v25.s[0] +mul v3.4S, v3.4S,v26.s[0] +sub v6.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v1.4S, v19.4S, v31.s[0] +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v9.4s +mla v7.4S, v2.4S, v31.s[0] +mla v3.4S, v17.4S, v31.s[0] +add v22.4s, v22.4s, v9.4s +sqrdmulh v9.4S, v8.4S, v23.s[0] +mul v8.4S, v8.4S,v24.s[0] +sub v17.4s, v15.4s, v14.4s +sqrdmulh v2.4S, v6.4S, v23.s[1] +mul v6.4S, v6.4S,v24.s[1] +add v15.4s, v15.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v23.s[2] +mul v15.4S, v15.4S,v24.s[2] +sub v19.4s, 
v13.4s, v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +sub v11.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +mla v8.4S, v9.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v18.4s, v7.4s +str q13, [x0, #288] +mla v15.4S, v14.4S, v31.s[0] +mla v17.4S, v1.4S, v31.s[0] +add v18.4s, v18.4s, v7.4s +str q19, [x0, #352] +ldr q19, [x0, #944] +sqrdmulh v7.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v1.4s, v16.4s, v3.4s +str q20, [x0, #416] +ldr q20, [x0, #1008] +sqrdmulh v14.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v3.4s +str q11, [x0, #480] +ldr q11, [x0, #816] +sqrdmulh v3.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +sub v13.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +ldr q8, [x0, #880] +sqrdmulh v9.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v12.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +mla v19.4S, v7.4S, v31.s[0] +mla v20.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v15.4s +str q18, [x0, #160] +mla v11.4S, v3.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +str q2, [x0, #224] +ldr q2, [x0, #560] +sqrdmulh v15.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v9.4s, v0.4s, v17.4s +str q16, [x0, #32] +ldr q16, [x0, #624] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +add v0.4s, v0.4s, v17.4s +str q1, [x0, #96] +ldr q1, [x0, #688] +ldr q17, [x0, #432] +sqrdmulh v18.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +sub v7.4s, v17.4s, v19.4s +add v17.4s, v17.4s, v19.4s +ldr q19, [x0, #752] +ldr q6, [x0, #496] +sqrdmulh v5.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v4.4s, v6.4s, v20.4s +add v6.4s, v6.4s, v20.4s +ldr q20, [x0, #304] +mla v2.4S, v15.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v11.4s +str q10, [x0, #544] +mla v1.4S, v18.4S, v31.s[0] +mla v19.4S, v5.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q13, [x0, #608] +ldr q13, [x0, #368] +sqrdmulh v11.4S, v17.4S, v29.s[1] +mul v17.4S, 
v17.4S,v30.s[1] +sub v5.4s, v13.4s, v8.4s +str q21, [x0, #672] +sqrdmulh v21.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +add v13.4s, v13.4s, v8.4s +str q12, [x0, #736] +ldr q12, [x0, #48] +sqrdmulh v8.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v18.4s, v12.4s, v2.4s +add v12.4s, v12.4s, v2.4s +ldr q2, [x0, #112] +sqrdmulh v10.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v15.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +ldr q16, [x0, #176] +mla v17.4S, v11.4S, v31.s[0] +mla v6.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v1.4s +str q22, [x0, #800] +mla v20.4S, v8.4S, v31.s[0] +mla v13.4S, v10.4S, v31.s[0] +add v16.4s, v16.4s, v1.4s +str q14, [x0, #864] +ldr q14, [x0, #240] +sqrdmulh v1.4S, v7.4S, v29.s[2] +mul v7.4S, v7.4S,v30.s[2] +sub v10.4s, v14.4s, v19.4s +str q0, [x0, #928] +sqrdmulh v0.4S, v4.4S, v29.s[2] +mul v4.4S, v4.4S,v30.s[2] +add v14.4s, v14.4s, v19.4s +str q9, [x0, #992] +sqrdmulh v9.4S, v3.4S, v29.s[2] +mul v3.4S, v3.4S,v30.s[2] +sub v19.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v29.s[2] +mul v5.4S, v5.4S,v30.s[2] +sub v8.4s, v14.4s, v6.4s +add v14.4s, v14.4s, v6.4s +mla v7.4S, v1.4S, v31.s[0] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v20.4s +mla v3.4S, v9.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +sub v17.4s, v2.4s, v13.4s +sqrdmulh v9.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +add v2.4s, v2.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v27.s[0] +mul v16.4S, v16.4S,v28.s[0] +sub v1.4s, v21.4s, v7.4s +add v21.4s, v21.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v6.4s, v10.4s, v4.4s +add v10.4s, v10.4s, v4.4s +mla v19.4S, v20.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v3.4s +mla v16.4S, v13.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +sub v7.4s, v15.4s, v5.4s +sqrdmulh 
v13.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +add v15.4s, v15.4s, v5.4s +sqrdmulh v5.4S, v1.4S, v27.s[3] +mul v1.4S, v1.4S,v28.s[3] +sub v20.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[3] +mul v6.4S, v6.4S,v28.s[3] +sub v4.4s, v17.4s, v8.4s +add v17.4s, v17.4s, v8.4s +mla v21.4S, v3.4S, v31.s[0] +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v16.4s +mla v1.4S, v5.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v25.s[2] +mul v17.4S, v17.4S,v26.s[2] +sub v19.4s, v2.4s, v14.4s +sqrdmulh v5.4S, v4.4S, v25.s[3] +mul v4.4S, v4.4S,v26.s[3] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v3.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +mla v17.4S, v16.4S, v31.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v9.4s, v1.4s +mla v19.4S, v14.4S, v31.s[0] +mla v2.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v23.s[0] +mul v15.4S, v15.4S,v24.s[0] +sub v21.4s, v7.4s, v6.4s +sqrdmulh v14.4S, v8.4S, v23.s[1] +mul v8.4S, v8.4S,v24.s[1] +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v7.4S, v23.s[2] +mul v7.4S, v7.4S,v24.s[2] +sub v16.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v23.s[3] +mul v21.4S, v21.4S,v24.s[3] +sub v10.4s, v20.4s, v4.4s +add v20.4s, v20.4s, v4.4s +mla v15.4S, v1.4S, v31.s[0] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v19.4s +str q0, [x0, #304] +mla v7.4S, v6.4S, v31.s[0] +mla v21.4S, v17.4S, v31.s[0] +add v13.4s, v13.4s, v19.4s +str q16, [x0, #368] +ldr q16, [x0, #896] +sqrdmulh v19.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v17.4s, v12.4s, v2.4s +str q20, [x0, #432] +ldr q20, [x0, #960] +sqrdmulh v6.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v12.4s, v12.4s, v2.4s +str q10, [x0, #496] +ldr q10, [x0, #768] +sqrdmulh v2.4S, v10.4S, v29.s[0] 
+mul v10.4S, v10.4S,v30.s[0] +sub v0.4s, v18.4s, v15.4s +add v18.4s, v18.4s, v15.4s +ldr q15, [x0, #832] +sqrdmulh v1.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +sub v4.4s, v3.4s, v8.4s +add v3.4s, v3.4s, v8.4s +mla v16.4S, v19.4S, v31.s[0] +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v7.4s +str q13, [x0, #176] +mla v10.4S, v2.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v7.4s +str q14, [x0, #240] +ldr q14, [x0, #512] +sqrdmulh v7.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v1.4s, v5.4s, v21.4s +str q12, [x0, #48] +ldr q12, [x0, #576] +sqrdmulh v2.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +add v5.4s, v5.4s, v21.4s +str q17, [x0, #112] +ldr q17, [x0, #640] +ldr q21, [x0, #384] +sqrdmulh v13.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] +sub v19.4s, v21.4s, v16.4s +add v21.4s, v21.4s, v16.4s +ldr q16, [x0, #704] +ldr q8, [x0, #448] +sqrdmulh v22.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v11.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +ldr q20, [x0, #256] +mla v14.4S, v7.4S, v31.s[0] +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v10.4s +str q18, [x0, #560] +mla v17.4S, v13.4S, v31.s[0] +mla v16.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +str q0, [x0, #624] +ldr q0, [x0, #320] +sqrdmulh v10.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v22.4s, v0.4s, v15.4s +str q3, [x0, #688] +sqrdmulh v3.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +add v0.4s, v0.4s, v15.4s +str q4, [x0, #752] +ldr q4, [x0, #0] +sqrdmulh v15.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v13.4s, v4.4s, v14.4s +add v4.4s, v4.4s, v14.4s +ldr q14, [x0, #64] +sqrdmulh v18.4S, v0.4S, v29.s[1] +mul v0.4S, v0.4S,v30.s[1] +sub v7.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +ldr q12, [x0, #128] +mla v21.4S, v10.4S, v31.s[0] +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v12.4s, v17.4s +str q9, [x0, #816] +mla v20.4S, v15.4S, v31.s[0] +mla v0.4S, v18.4S, v31.s[0] +add v12.4s, v12.4s, v17.4s +str q6, [x0, #880] +ldr q6, [x0, 
#192] +sqrdmulh v17.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v18.4s, v6.4s, v16.4s +str q5, [x0, #944] +sqrdmulh v5.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +add v6.4s, v6.4s, v16.4s +str q1, [x0, #1008] +sqrdmulh v1.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v16.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v15.4s, v6.4s, v8.4s +add v6.4s, v6.4s, v8.4s +mla v19.4S, v17.4S, v31.s[0] +mla v11.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v20.4s +mla v2.4S, v1.4S, v31.s[0] +mla v22.4S, v21.4S, v31.s[0] +add v4.4s, v4.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +sub v21.4s, v14.4s, v0.4s +sqrdmulh v1.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v17.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[0] +mul v6.4S, v6.4S,v28.s[0] +sub v8.4s, v18.4s, v11.4s +add v18.4s, v18.4s, v11.4s +mla v16.4S, v20.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v2.4s +mla v12.4S, v0.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v27.s[2] +mul v3.4S, v3.4S,v28.s[2] +sub v19.4s, v7.4s, v22.4s +sqrdmulh v0.4S, v18.4S, v27.s[2] +mul v18.4S, v18.4S,v28.s[2] +add v7.4s, v7.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +sub v20.4s, v5.4s, v16.4s +add v5.4s, v5.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v11.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +mla v3.4S, v2.4S, v31.s[0] +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v4.4s, v12.4s +mla v17.4S, v22.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v4.4s, v4.4s, v12.4s +sqrdmulh v12.4S, v21.4S, v25.s[2] +mul v21.4S, v21.4S,v26.s[2] +sub v16.4s, v14.4s, v6.4s +sqrdmulh v22.4S, v11.4S, v25.s[3] +mul v11.4S, v11.4S,v26.s[3] +add v14.4s, v14.4s, v6.4s +sqrdmulh v6.4S, v16.4S, 
v25.s[1] +mul v16.4S, v16.4S,v26.s[1] +sub v2.4s, v13.4s, v3.4s +add v13.4s, v13.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v25.s[0] +mul v14.4S, v14.4S,v26.s[0] +sub v15.4s, v7.4s, v18.4s +add v7.4s, v7.4s, v18.4s +mla v21.4S, v12.4S, v31.s[0] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v17.4s +mla v16.4S, v6.4S, v31.s[0] +mla v14.4S, v3.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v7.4S, v23.s[0] +mul v7.4S, v7.4S,v24.s[0] +sub v3.4s, v19.4s, v8.4s +sqrdmulh v6.4S, v15.4S, v23.s[1] +mul v15.4S, v15.4S,v24.s[1] +add v19.4s, v19.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v23.s[2] +mul v19.4S, v19.4S,v24.s[2] +sub v12.4s, v5.4s, v21.4s +add v5.4s, v5.4s, v21.4s +sqrdmulh v21.4S, v3.4S, v23.s[3] +mul v3.4S, v3.4S,v24.s[3] +sub v18.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +mla v7.4S, v17.4S, v31.s[0] +mla v15.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v16.4s +str q5, [x0, #256] +mla v19.4S, v8.4S, v31.s[0] +mla v3.4S, v21.4S, v31.s[0] +add v0.4s, v0.4s, v16.4s +str q12, [x0, #320] +ldr q12, [x0, #912] +sqrdmulh v16.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v21.4s, v4.4s, v14.4s +str q20, [x0, #384] +ldr q20, [x0, #976] +sqrdmulh v8.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v4.4s, v4.4s, v14.4s +str q18, [x0, #448] +ldr q18, [x0, #784] +sqrdmulh v14.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +sub v5.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +ldr q7, [x0, #848] +sqrdmulh v17.4S, v7.4S, v29.s[0] +mul v7.4S, v7.4S,v30.s[0] +sub v11.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +mla v12.4S, v16.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v1.4s, v19.4s +str q0, [x0, #128] +mla v18.4S, v14.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v19.4s +str q6, [x0, #192] +ldr q6, [x0, #528] +sqrdmulh v19.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v17.4s, v22.4s, v3.4s +str q4, [x0, #0] +ldr q4, [x0, #592] +sqrdmulh v14.4S, v4.4S, v29.s[0] +mul v4.4S, v4.4S,v30.s[0] +add v22.4s, v22.4s, v3.4s +str q21, [x0, #64] 
+ldr q21, [x0, #656] +ldr q3, [x0, #400] +sqrdmulh v0.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v16.4s, v3.4s, v12.4s +add v3.4s, v3.4s, v12.4s +ldr q12, [x0, #720] +ldr q15, [x0, #464] +sqrdmulh v9.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v10.4s, v15.4s, v20.4s +add v15.4s, v15.4s, v20.4s +ldr q20, [x0, #272] +mla v6.4S, v19.4S, v31.s[0] +mla v4.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v18.4s +str q13, [x0, #512] +mla v21.4S, v0.4S, v31.s[0] +mla v12.4S, v9.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +str q5, [x0, #576] +ldr q5, [x0, #336] +sqrdmulh v18.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v9.4s, v5.4s, v7.4s +str q2, [x0, #640] +sqrdmulh v2.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +add v5.4s, v5.4s, v7.4s +str q11, [x0, #704] +ldr q11, [x0, #16] +sqrdmulh v7.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v0.4s, v11.4s, v6.4s +add v11.4s, v11.4s, v6.4s +ldr q6, [x0, #80] +sqrdmulh v13.4S, v5.4S, v29.s[1] +mul v5.4S, v5.4S,v30.s[1] +sub v19.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +ldr q4, [x0, #144] +mla v3.4S, v18.4S, v31.s[0] +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v4.4s, v21.4s +str q1, [x0, #768] +mla v20.4S, v7.4S, v31.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v4.4s, v4.4s, v21.4s +str q8, [x0, #832] +ldr q8, [x0, #208] +sqrdmulh v21.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +sub v13.4s, v8.4s, v12.4s +str q22, [x0, #896] +sqrdmulh v22.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +add v8.4s, v8.4s, v12.4s +str q17, [x0, #960] +sqrdmulh v17.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v12.4s, v4.4s, v3.4s +add v4.4s, v4.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v7.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +mla v16.4S, v21.4S, v31.s[0] +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v20.4s +mla v14.4S, v17.4S, v31.s[0] +mla v9.4S, v3.4S, v31.s[0] +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v12.4S, v27.s[1] +mul v12.4S, v12.4S,v28.s[1] +sub 
v3.4s, v6.4s, v5.4s +sqrdmulh v17.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v27.s[0] +mul v4.4S, v4.4S,v28.s[0] +sub v21.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[0] +mul v8.4S, v8.4S,v28.s[0] +sub v15.4s, v13.4s, v10.4s +add v13.4s, v13.4s, v10.4s +mla v12.4S, v20.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v14.4s +mla v4.4S, v5.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v16.4s, v19.4s, v9.4s +sqrdmulh v5.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v10.4s, v3.4s, v7.4s +add v3.4s, v3.4s, v7.4s +mla v2.4S, v14.4S, v31.s[0] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v4.4s +mla v21.4S, v9.4S, v31.s[0] +mla v15.4S, v12.4S, v31.s[0] +add v11.4s, v11.4s, v4.4s +sqrdmulh v4.4S, v3.4S, v25.s[2] +mul v3.4S, v3.4S,v26.s[2] +sub v12.4s, v6.4s, v8.4s +sqrdmulh v9.4S, v10.4S, v25.s[3] +mul v10.4S, v10.4S,v26.s[3] +add v6.4s, v6.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +sub v14.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v6.4S, v25.s[0] +mul v6.4S, v6.4S,v26.s[0] +sub v7.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +mla v3.4S, v4.4S, v31.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v21.4s +mla v12.4S, v8.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v23.s[0] +mul v19.4S, v19.4S,v24.s[0] +sub v2.4s, v16.4s, v15.4s +sqrdmulh v8.4S, v7.4S, v23.s[1] +mul v7.4S, v7.4S,v24.s[1] +add v16.4s, v16.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +sub v4.4s, v22.4s, v3.4s +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v2.4S, v23.s[3] +mul v2.4S, 
v2.4S,v24.s[3] +sub v13.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +mla v19.4S, v21.4S, v31.s[0] +mla v7.4S, v8.4S, v31.s[0] +sub v8.4s, v5.4s, v12.4s +str q22, [x0, #272] +mla v16.4S, v15.4S, v31.s[0] +mla v2.4S, v3.4S, v31.s[0] +add v5.4s, v5.4s, v12.4s +str q4, [x0, #336] +sub v23.4s, v11.4s, v6.4s +str q20, [x0, #400] +add v11.4s, v11.4s, v6.4s +str q13, [x0, #464] +sub v13.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sub v19.4s, v14.4s, v7.4s +add v14.4s, v14.4s, v7.4s +sub v7.4s, v17.4s, v16.4s +str q5, [x0, #144] +add v17.4s, v17.4s, v16.4s +str q8, [x0, #208] +sub v8.4s, v9.4s, v2.4s +str q11, [x0, #16] +add v9.4s, v9.4s, v2.4s +str q23, [x0, #80] +str q0, [x0, #528] +str q13, [x0, #592] +str q14, [x0, #656] +str q19, [x0, #720] +str q17, [x0, #784] +str q7, [x0, #848] +str q9, [x0, #912] +str q8, [x0, #976] +ldr q18, [x17, #+128] +ldr q1, [x17, #+144] +ldr q10, [x17, #+160] +ldr q21, [x17, #+176] +ldr q22, [x17, #+192] +ldr q15, [x17, #+208] +ldr q3, [x17, #+224] +ldr q12, [x17, #+240] +ldr q4, [x0, #32] +ldr q30, [x0, #48] +ldr q29, [x0, #0] +ldr q28, [x0, #96] +ldr q27, [x0, #112] +ldr q26, [x0, #64] +ldr q25, [x0, #160] +ldr q24, [x0, #176] +ldr q20, [x0, #128] +ldr q6, [x0, #224] +ldr q5, [x0, #240] +ldr q16, [x0, #192] +sqrdmulh v11.4S, v4.4S, v1.s[0] +mul v4.4S, v4.4S,v18.s[0] +mla v4.4S, v11.4S, v31.s[0] +sub v11.4s, v29.4s, v4.4s +add v29.4s, v29.4s, v4.4s +ldr q4, [x0, #16] +sqrdmulh v2.4S, v28.4S, v21.s[0] +mul v28.4S, v28.4S,v10.s[0] +mla v28.4S, v2.4S, v31.s[0] +sub v2.4s, v26.4s, v28.4s +add v26.4s, v26.4s, v28.4s +ldr q28, [x0, #80] +sqrdmulh v23.4S, v25.4S, v15.s[0] +mul v25.4S, v25.4S,v22.s[0] +mla v25.4S, v23.4S, v31.s[0] +sub v23.4s, v20.4s, v25.4s +add v20.4s, v20.4s, v25.4s +ldr q25, [x0, #144] +sqrdmulh v0.4S, v6.4S, v12.s[0] +mul v6.4S, v6.4S,v3.s[0] +mla v6.4S, v0.4S, v31.s[0] +sub v0.4s, v16.4s, v6.4s +add v16.4s, v16.4s, v6.4s +ldr q6, [x0, #208] +sqrdmulh v13.4S, v30.4S, v1.s[0] +mul v30.4S, v30.4S,v18.s[0] +mla v30.4S, 
v13.4S, v31.s[0] +sub v13.4s, v4.4s, v30.4s +add v4.4s, v4.4s, v30.4s +sqrdmulh v30.4S, v27.4S, v21.s[0] +mul v27.4S, v27.4S,v10.s[0] +mla v27.4S, v30.4S, v31.s[0] +sub v30.4s, v28.4s, v27.4s +add v28.4s, v28.4s, v27.4s +sqrdmulh v27.4S, v24.4S, v15.s[0] +mul v24.4S, v24.4S,v22.s[0] +mla v24.4S, v27.4S, v31.s[0] +sub v27.4s, v25.4s, v24.4s +add v25.4s, v25.4s, v24.4s +sqrdmulh v24.4S, v5.4S, v12.s[0] +mul v5.4S, v5.4S,v3.s[0] +mla v5.4S, v24.4S, v31.s[0] +sub v24.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v1.s[1] +mul v4.4S, v4.4S,v18.s[1] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v29.4s, v4.4s +add v29.4s, v29.4s, v4.4s +sqrdmulh v4.4S, v28.4S, v21.s[1] +mul v28.4S, v28.4S,v10.s[1] +mla v28.4S, v4.4S, v31.s[0] +sub v4.4s, v26.4s, v28.4s +add v26.4s, v26.4s, v28.4s +str q29, [x0, #0] +str q5, [x0, #16] +sqrdmulh v5.4S, v25.4S, v15.s[1] +mul v25.4S, v25.4S,v22.s[1] +mla v25.4S, v5.4S, v31.s[0] +sub v5.4s, v20.4s, v25.4s +add v20.4s, v20.4s, v25.4s +str q26, [x0, #64] +str q4, [x0, #80] +sqrdmulh v4.4S, v6.4S, v12.s[1] +mul v6.4S, v6.4S,v3.s[1] +mla v6.4S, v4.4S, v31.s[0] +sub v4.4s, v16.4s, v6.4s +add v16.4s, v16.4s, v6.4s +str q20, [x0, #128] +str q5, [x0, #144] +sqrdmulh v5.4S, v13.4S, v1.s[2] +mul v13.4S, v13.4S,v18.s[2] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +str q16, [x0, #192] +str q4, [x0, #208] +ldr q1, [x17, #+256] +ldr q18, [x17, #+272] +sqrdmulh v4.4S, v30.4S, v21.s[2] +mul v30.4S, v30.4S,v10.s[2] +mla v30.4S, v4.4S, v31.s[0] +sub v4.4s, v2.4s, v30.4s +add v2.4s, v2.4s, v30.4s +ldr q21, [x17, #+288] +ldr q10, [x17, #+304] +sqrdmulh v30.4S, v27.4S, v15.s[2] +mul v27.4S, v27.4S,v22.s[2] +mla v27.4S, v30.4S, v31.s[0] +sub v30.4s, v23.4s, v27.4s +add v23.4s, v23.4s, v27.4s +ldr q15, [x17, #+320] +ldr q22, [x17, #+336] +sqrdmulh v27.4S, v24.4S, v12.s[2] +mul v24.4S, v24.4S,v3.s[2] +mla v24.4S, v27.4S, v31.s[0] +sub v27.4s, v0.4s, v24.4s +add v0.4s, v0.4s, v24.4s +ldr q12, [x17, #+352] +ldr q3, 
[x17, #+368] +str q11, [x0, #32] +str q5, [x0, #48] +str q2, [x0, #96] +str q4, [x0, #112] +str q23, [x0, #160] +str q30, [x0, #176] +str q0, [x0, #224] +str q27, [x0, #240] +ldr q27, [x0, #288] +ldr q0, [x0, #304] +ldr q30, [x0, #256] +ldr q23, [x0, #352] +ldr q4, [x0, #368] +ldr q2, [x0, #320] +ldr q5, [x0, #416] +ldr q11, [x0, #432] +ldr q24, [x0, #384] +ldr q16, [x0, #480] +ldr q13, [x0, #496] +ldr q20, [x0, #448] +sqrdmulh v6.4S, v27.4S, v18.s[0] +mul v27.4S, v27.4S,v1.s[0] +mla v27.4S, v6.4S, v31.s[0] +sub v6.4s, v30.4s, v27.4s +add v30.4s, v30.4s, v27.4s +ldr q27, [x0, #272] +sqrdmulh v26.4S, v23.4S, v10.s[0] +mul v23.4S, v23.4S,v21.s[0] +mla v23.4S, v26.4S, v31.s[0] +sub v26.4s, v2.4s, v23.4s +add v2.4s, v2.4s, v23.4s +ldr q23, [x0, #336] +sqrdmulh v25.4S, v5.4S, v22.s[0] +mul v5.4S, v5.4S,v15.s[0] +mla v5.4S, v25.4S, v31.s[0] +sub v25.4s, v24.4s, v5.4s +add v24.4s, v24.4s, v5.4s +ldr q5, [x0, #400] +sqrdmulh v29.4S, v16.4S, v3.s[0] +mul v16.4S, v16.4S,v12.s[0] +mla v16.4S, v29.4S, v31.s[0] +sub v29.4s, v20.4s, v16.4s +add v20.4s, v20.4s, v16.4s +ldr q16, [x0, #464] +sqrdmulh v28.4S, v0.4S, v18.s[0] +mul v0.4S, v0.4S,v1.s[0] +mla v0.4S, v28.4S, v31.s[0] +sub v28.4s, v27.4s, v0.4s +add v27.4s, v27.4s, v0.4s +sqrdmulh v0.4S, v4.4S, v10.s[0] +mul v4.4S, v4.4S,v21.s[0] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v23.4s, v4.4s +add v23.4s, v23.4s, v4.4s +sqrdmulh v4.4S, v11.4S, v22.s[0] +mul v11.4S, v11.4S,v15.s[0] +mla v11.4S, v4.4S, v31.s[0] +sub v4.4s, v5.4s, v11.4s +add v5.4s, v5.4s, v11.4s +sqrdmulh v11.4S, v13.4S, v3.s[0] +mul v13.4S, v13.4S,v12.s[0] +mla v13.4S, v11.4S, v31.s[0] +sub v11.4s, v16.4s, v13.4s +add v16.4s, v16.4s, v13.4s +sqrdmulh v13.4S, v27.4S, v18.s[1] +mul v27.4S, v27.4S,v1.s[1] +mla v27.4S, v13.4S, v31.s[0] +sub v13.4s, v30.4s, v27.4s +add v30.4s, v30.4s, v27.4s +sqrdmulh v27.4S, v23.4S, v10.s[1] +mul v23.4S, v23.4S,v21.s[1] +mla v23.4S, v27.4S, v31.s[0] +sub v27.4s, v2.4s, v23.4s +add v2.4s, v2.4s, v23.4s +str q30, [x0, #256] +str q13, [x0, 
#272] +sqrdmulh v13.4S, v5.4S, v22.s[1] +mul v5.4S, v5.4S,v15.s[1] +mla v5.4S, v13.4S, v31.s[0] +sub v13.4s, v24.4s, v5.4s +add v24.4s, v24.4s, v5.4s +str q2, [x0, #320] +str q27, [x0, #336] +sqrdmulh v27.4S, v16.4S, v3.s[1] +mul v16.4S, v16.4S,v12.s[1] +mla v16.4S, v27.4S, v31.s[0] +sub v27.4s, v20.4s, v16.4s +add v20.4s, v20.4s, v16.4s +str q24, [x0, #384] +str q13, [x0, #400] +sqrdmulh v13.4S, v28.4S, v18.s[2] +mul v28.4S, v28.4S,v1.s[2] +mla v28.4S, v13.4S, v31.s[0] +sub v13.4s, v6.4s, v28.4s +add v6.4s, v6.4s, v28.4s +str q20, [x0, #448] +str q27, [x0, #464] +ldr q18, [x17, #+384] +ldr q1, [x17, #+400] +sqrdmulh v27.4S, v0.4S, v10.s[2] +mul v0.4S, v0.4S,v21.s[2] +mla v0.4S, v27.4S, v31.s[0] +sub v27.4s, v26.4s, v0.4s +add v26.4s, v26.4s, v0.4s +ldr q10, [x17, #+416] +ldr q21, [x17, #+432] +sqrdmulh v0.4S, v4.4S, v22.s[2] +mul v4.4S, v4.4S,v15.s[2] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v25.4s, v4.4s +add v25.4s, v25.4s, v4.4s +ldr q22, [x17, #+448] +ldr q15, [x17, #+464] +sqrdmulh v4.4S, v11.4S, v3.s[2] +mul v11.4S, v11.4S,v12.s[2] +mla v11.4S, v4.4S, v31.s[0] +sub v4.4s, v29.4s, v11.4s +add v29.4s, v29.4s, v11.4s +ldr q3, [x17, #+480] +ldr q12, [x17, #+496] +str q6, [x0, #288] +str q13, [x0, #304] +str q26, [x0, #352] +str q27, [x0, #368] +str q25, [x0, #416] +str q0, [x0, #432] +str q29, [x0, #480] +str q4, [x0, #496] +ldr q4, [x0, #544] +ldr q29, [x0, #560] +ldr q0, [x0, #512] +ldr q25, [x0, #608] +ldr q27, [x0, #624] +ldr q26, [x0, #576] +ldr q13, [x0, #672] +ldr q6, [x0, #688] +ldr q11, [x0, #640] +ldr q20, [x0, #736] +ldr q28, [x0, #752] +ldr q24, [x0, #704] +sqrdmulh v16.4S, v4.4S, v1.s[0] +mul v4.4S, v4.4S,v18.s[0] +mla v4.4S, v16.4S, v31.s[0] +sub v16.4s, v0.4s, v4.4s +add v0.4s, v0.4s, v4.4s +ldr q4, [x0, #528] +sqrdmulh v2.4S, v25.4S, v21.s[0] +mul v25.4S, v25.4S,v10.s[0] +mla v25.4S, v2.4S, v31.s[0] +sub v2.4s, v26.4s, v25.4s +add v26.4s, v26.4s, v25.4s +ldr q25, [x0, #592] +sqrdmulh v5.4S, v13.4S, v15.s[0] +mul v13.4S, v13.4S,v22.s[0] +mla 
v13.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +ldr q13, [x0, #656] +sqrdmulh v30.4S, v20.4S, v12.s[0] +mul v20.4S, v20.4S,v3.s[0] +mla v20.4S, v30.4S, v31.s[0] +sub v30.4s, v24.4s, v20.4s +add v24.4s, v24.4s, v20.4s +ldr q20, [x0, #720] +sqrdmulh v23.4S, v29.4S, v1.s[0] +mul v29.4S, v29.4S,v18.s[0] +mla v29.4S, v23.4S, v31.s[0] +sub v23.4s, v4.4s, v29.4s +add v4.4s, v4.4s, v29.4s +sqrdmulh v29.4S, v27.4S, v21.s[0] +mul v27.4S, v27.4S,v10.s[0] +mla v27.4S, v29.4S, v31.s[0] +sub v29.4s, v25.4s, v27.4s +add v25.4s, v25.4s, v27.4s +sqrdmulh v27.4S, v6.4S, v15.s[0] +mul v6.4S, v6.4S,v22.s[0] +mla v6.4S, v27.4S, v31.s[0] +sub v27.4s, v13.4s, v6.4s +add v13.4s, v13.4s, v6.4s +sqrdmulh v6.4S, v28.4S, v12.s[0] +mul v28.4S, v28.4S,v3.s[0] +mla v28.4S, v6.4S, v31.s[0] +sub v6.4s, v20.4s, v28.4s +add v20.4s, v20.4s, v28.4s +sqrdmulh v28.4S, v4.4S, v1.s[1] +mul v4.4S, v4.4S,v18.s[1] +mla v4.4S, v28.4S, v31.s[0] +sub v28.4s, v0.4s, v4.4s +add v0.4s, v0.4s, v4.4s +sqrdmulh v4.4S, v25.4S, v21.s[1] +mul v25.4S, v25.4S,v10.s[1] +mla v25.4S, v4.4S, v31.s[0] +sub v4.4s, v26.4s, v25.4s +add v26.4s, v26.4s, v25.4s +str q0, [x0, #512] +str q28, [x0, #528] +sqrdmulh v28.4S, v13.4S, v15.s[1] +mul v13.4S, v13.4S,v22.s[1] +mla v13.4S, v28.4S, v31.s[0] +sub v28.4s, v11.4s, v13.4s +add v11.4s, v11.4s, v13.4s +str q26, [x0, #576] +str q4, [x0, #592] +sqrdmulh v4.4S, v20.4S, v12.s[1] +mul v20.4S, v20.4S,v3.s[1] +mla v20.4S, v4.4S, v31.s[0] +sub v4.4s, v24.4s, v20.4s +add v24.4s, v24.4s, v20.4s +str q11, [x0, #640] +str q28, [x0, #656] +sqrdmulh v28.4S, v23.4S, v1.s[2] +mul v23.4S, v23.4S,v18.s[2] +mla v23.4S, v28.4S, v31.s[0] +sub v28.4s, v16.4s, v23.4s +add v16.4s, v16.4s, v23.4s +str q24, [x0, #704] +str q4, [x0, #720] +ldr q1, [x17, #+512] +ldr q18, [x17, #+528] +sqrdmulh v4.4S, v29.4S, v21.s[2] +mul v29.4S, v29.4S,v10.s[2] +mla v29.4S, v4.4S, v31.s[0] +sub v4.4s, v2.4s, v29.4s +add v2.4s, v2.4s, v29.4s +ldr q21, [x17, #+544] +ldr q10, [x17, #+560] +sqrdmulh 
v29.4S, v27.4S, v15.s[2] +mul v27.4S, v27.4S,v22.s[2] +mla v27.4S, v29.4S, v31.s[0] +sub v29.4s, v5.4s, v27.4s +add v5.4s, v5.4s, v27.4s +ldr q15, [x17, #+576] +ldr q22, [x17, #+592] +sqrdmulh v27.4S, v6.4S, v12.s[2] +mul v6.4S, v6.4S,v3.s[2] +mla v6.4S, v27.4S, v31.s[0] +sub v27.4s, v30.4s, v6.4s +add v30.4s, v30.4s, v6.4s +ldr q12, [x17, #+608] +ldr q3, [x17, #+624] +str q16, [x0, #544] +str q28, [x0, #560] +str q2, [x0, #608] +str q4, [x0, #624] +str q5, [x0, #672] +str q29, [x0, #688] +str q30, [x0, #736] +str q27, [x0, #752] +ldr q27, [x0, #800] +ldr q30, [x0, #816] +ldr q29, [x0, #768] +ldr q5, [x0, #864] +ldr q4, [x0, #880] +ldr q2, [x0, #832] +ldr q28, [x0, #928] +ldr q16, [x0, #944] +ldr q6, [x0, #896] +ldr q24, [x0, #992] +ldr q23, [x0, #1008] +ldr q11, [x0, #960] +sqrdmulh v20.4S, v27.4S, v18.s[0] +mul v27.4S, v27.4S,v1.s[0] +mla v27.4S, v20.4S, v31.s[0] +sub v20.4s, v29.4s, v27.4s +add v29.4s, v29.4s, v27.4s +ldr q27, [x0, #784] +sqrdmulh v26.4S, v5.4S, v10.s[0] +mul v5.4S, v5.4S,v21.s[0] +mla v5.4S, v26.4S, v31.s[0] +sub v26.4s, v2.4s, v5.4s +add v2.4s, v2.4s, v5.4s +ldr q5, [x0, #848] +sqrdmulh v13.4S, v28.4S, v22.s[0] +mul v28.4S, v28.4S,v15.s[0] +mla v28.4S, v13.4S, v31.s[0] +sub v13.4s, v6.4s, v28.4s +add v6.4s, v6.4s, v28.4s +ldr q28, [x0, #912] +sqrdmulh v0.4S, v24.4S, v3.s[0] +mul v24.4S, v24.4S,v12.s[0] +mla v24.4S, v0.4S, v31.s[0] +sub v0.4s, v11.4s, v24.4s +add v11.4s, v11.4s, v24.4s +ldr q24, [x0, #976] +sqrdmulh v25.4S, v30.4S, v18.s[0] +mul v30.4S, v30.4S,v1.s[0] +mla v30.4S, v25.4S, v31.s[0] +sub v25.4s, v27.4s, v30.4s +add v27.4s, v27.4s, v30.4s +sqrdmulh v30.4S, v4.4S, v10.s[0] +mul v4.4S, v4.4S,v21.s[0] +mla v4.4S, v30.4S, v31.s[0] +sub v30.4s, v5.4s, v4.4s +add v5.4s, v5.4s, v4.4s +sqrdmulh v4.4S, v16.4S, v22.s[0] +mul v16.4S, v16.4S,v15.s[0] +mla v16.4S, v4.4S, v31.s[0] +sub v4.4s, v28.4s, v16.4s +add v28.4s, v28.4s, v16.4s +sqrdmulh v16.4S, v23.4S, v3.s[0] +mul v23.4S, v23.4S,v12.s[0] +mla v23.4S, v16.4S, v31.s[0] +sub v16.4s, 
v24.4s, v23.4s +add v24.4s, v24.4s, v23.4s +sqrdmulh v23.4S, v27.4S, v18.s[1] +mul v27.4S, v27.4S,v1.s[1] +mla v27.4S, v23.4S, v31.s[0] +sub v23.4s, v29.4s, v27.4s +add v29.4s, v29.4s, v27.4s +sqrdmulh v27.4S, v5.4S, v10.s[1] +mul v5.4S, v5.4S,v21.s[1] +mla v5.4S, v27.4S, v31.s[0] +sub v27.4s, v2.4s, v5.4s +add v2.4s, v2.4s, v5.4s +str q29, [x0, #768] +str q23, [x0, #784] +sqrdmulh v23.4S, v28.4S, v22.s[1] +mul v28.4S, v28.4S,v15.s[1] +mla v28.4S, v23.4S, v31.s[0] +sub v23.4s, v6.4s, v28.4s +add v6.4s, v6.4s, v28.4s +str q2, [x0, #832] +str q27, [x0, #848] +sqrdmulh v27.4S, v24.4S, v3.s[1] +mul v24.4S, v24.4S,v12.s[1] +mla v24.4S, v27.4S, v31.s[0] +sub v27.4s, v11.4s, v24.4s +add v11.4s, v11.4s, v24.4s +str q6, [x0, #896] +str q23, [x0, #912] +sqrdmulh v23.4S, v25.4S, v18.s[2] +mul v25.4S, v25.4S,v1.s[2] +mla v25.4S, v23.4S, v31.s[0] +sub v23.4s, v20.4s, v25.4s +add v20.4s, v20.4s, v25.4s +str q11, [x0, #960] +str q27, [x0, #976] +sqrdmulh v18.4S, v30.4S, v10.s[2] +mul v30.4S, v30.4S,v21.s[2] +mla v30.4S, v18.4S, v31.s[0] +sub v18.4s, v26.4s, v30.4s +add v26.4s, v26.4s, v30.4s +sqrdmulh v10.4S, v4.4S, v22.s[2] +mul v4.4S, v4.4S,v15.s[2] +mla v4.4S, v10.4S, v31.s[0] +sub v10.4s, v13.4s, v4.4s +add v13.4s, v13.4s, v4.4s +sqrdmulh v22.4S, v16.4S, v3.s[2] +mul v16.4S, v16.4S,v12.s[2] +mla v16.4S, v22.4S, v31.s[0] +sub v22.4s, v0.4s, v16.4s +add v0.4s, v0.4s, v16.4s +str q20, [x0, #800] +str q23, [x0, #816] +str q26, [x0, #864] +str q18, [x0, #880] +str q13, [x0, #928] +str q10, [x0, #944] +str q0, [x0, #992] +str q22, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff 
--git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_3.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_3.s new file mode 100644 index 0000000..11b1cd1 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_3.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// +#include <hal_env.h> +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global 
ntt_u32_incomplete_neon_asm_var_4_2_7_z4_3 +.global _ntt_u32_incomplete_neon_asm_var_4_2_7_z4_3 +ntt_u32_incomplete_neon_asm_var_4_2_7_z4_3: +_ntt_u32_incomplete_neon_asm_var_4_2_7_z4_3: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #928] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #992] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q18, [x0, #800] +sqrdmulh v17.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +ldr q16, [x0, #864] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v22.4S, v21.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +mla v18.4S, v17.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +ldr q3, [x0, #544] +sqrdmulh v17.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q19, [x0, #608] +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q2, [x0, #672] +ldr q1, [x0, #416] +sqrdmulh v0.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v15.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #736] +ldr q14, [x0, #480] +sqrdmulh v13.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v12.4s, v14.4s, v20.4s +add v14.4s, v14.4s, v20.4s +ldr q20, [x0, #288] +mla v3.4S, v17.4S, v31.s[0] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v18.4s +mla v2.4S, v0.4S, v31.s[0] +mla v22.4S, v13.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +ldr q18, [x0, #352] +sqrdmulh v13.4S, v1.4S, 
v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v0.4s, v18.4s, v16.4s +sqrdmulh v17.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +add v18.4s, v18.4s, v16.4s +ldr q16, [x0, #32] +sqrdmulh v11.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v10.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #96] +sqrdmulh v9.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v8.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #160] +mla v1.4S, v13.4S, v31.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v2.4s +mla v20.4S, v11.4S, v31.s[0] +mla v18.4S, v9.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +ldr q2, [x0, #224] +sqrdmulh v9.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v11.4s, v2.4s, v22.4s +sqrdmulh v13.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v7.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +sub v6.4s, v2.4s, v14.4s +add v2.4s, v2.4s, v14.4s +mla v15.4S, v9.4S, v31.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v20.4s +mla v21.4S, v22.4S, v31.s[0] +mla v0.4S, v1.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +sub v1.4s, v3.4s, v18.4s +sqrdmulh v22.4S, v6.4S, v27.s[1] +mul v6.4S, v6.4S,v28.s[1] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v27.s[0] +mul v19.4S, v19.4S,v28.s[0] +sub v9.4s, v17.4s, v15.4s +add v17.4s, v17.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v14.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +mla v7.4S, v20.4S, v31.s[0] +mla v6.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v21.4s +mla v19.4S, v18.4S, v31.s[0] +mla v2.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v15.4s, v8.4s, v0.4s +sqrdmulh v18.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +add v8.4s, v8.4s, v0.4s +sqrdmulh 
v0.4S, v9.4S, v27.s[3] +mul v9.4S, v9.4S,v28.s[3] +sub v20.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[3] +mul v14.4S, v14.4S,v28.s[3] +sub v12.4s, v1.4s, v6.4s +add v1.4s, v1.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v16.4s, v19.4s +mla v9.4S, v0.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v25.s[2] +mul v1.4S, v1.4S,v26.s[2] +sub v7.4s, v3.4s, v2.4s +sqrdmulh v0.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v7.4S, v25.s[1] +mul v7.4S, v7.4S,v26.s[1] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v25.s[0] +mul v3.4S, v3.4S,v26.s[0] +sub v6.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v1.4S, v19.4S, v31.s[0] +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v9.4s +mla v7.4S, v2.4S, v31.s[0] +mla v3.4S, v17.4S, v31.s[0] +add v22.4s, v22.4s, v9.4s +sqrdmulh v9.4S, v8.4S, v23.s[0] +mul v8.4S, v8.4S,v24.s[0] +sub v17.4s, v15.4s, v14.4s +sqrdmulh v2.4S, v6.4S, v23.s[1] +mul v6.4S, v6.4S,v24.s[1] +add v15.4s, v15.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v23.s[2] +mul v15.4S, v15.4S,v24.s[2] +sub v19.4s, v13.4s, v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +sub v11.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +mla v8.4S, v9.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v18.4s, v7.4s +str q13, [x0, #288] +mla v15.4S, v14.4S, v31.s[0] +mla v17.4S, v1.4S, v31.s[0] +add v18.4s, v18.4s, v7.4s +str q19, [x0, #352] +ldr q19, [x0, #944] +sqrdmulh v7.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v1.4s, v16.4s, v3.4s +str q20, [x0, #416] +ldr q20, [x0, #1008] +sqrdmulh v14.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v3.4s +str q11, [x0, #480] +ldr q11, [x0, #816] +sqrdmulh v3.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +sub v13.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +ldr q8, 
[x0, #880] +sqrdmulh v9.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v12.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +mla v19.4S, v7.4S, v31.s[0] +mla v20.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v15.4s +str q18, [x0, #160] +mla v11.4S, v3.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +str q2, [x0, #224] +ldr q2, [x0, #560] +sqrdmulh v15.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v9.4s, v0.4s, v17.4s +str q16, [x0, #32] +ldr q16, [x0, #624] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +add v0.4s, v0.4s, v17.4s +str q1, [x0, #96] +ldr q1, [x0, #688] +ldr q17, [x0, #432] +sqrdmulh v18.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +sub v7.4s, v17.4s, v19.4s +add v17.4s, v17.4s, v19.4s +ldr q19, [x0, #752] +ldr q6, [x0, #496] +sqrdmulh v5.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v4.4s, v6.4s, v20.4s +add v6.4s, v6.4s, v20.4s +ldr q20, [x0, #304] +mla v2.4S, v15.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v11.4s +str q10, [x0, #544] +mla v1.4S, v18.4S, v31.s[0] +mla v19.4S, v5.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q13, [x0, #608] +ldr q13, [x0, #368] +sqrdmulh v11.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v5.4s, v13.4s, v8.4s +str q21, [x0, #672] +sqrdmulh v21.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +add v13.4s, v13.4s, v8.4s +str q12, [x0, #736] +ldr q12, [x0, #48] +sqrdmulh v8.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v18.4s, v12.4s, v2.4s +add v12.4s, v12.4s, v2.4s +ldr q2, [x0, #112] +sqrdmulh v10.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v15.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +ldr q16, [x0, #176] +mla v17.4S, v11.4S, v31.s[0] +mla v6.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v1.4s +str q22, [x0, #800] +mla v20.4S, v8.4S, v31.s[0] +mla v13.4S, v10.4S, v31.s[0] +add v16.4s, v16.4s, v1.4s +str q14, [x0, #864] +ldr q14, [x0, #240] +sqrdmulh v1.4S, v7.4S, v29.s[2] +mul v7.4S, v7.4S,v30.s[2] +sub v10.4s, v14.4s, v19.4s 
+str q0, [x0, #928] +sqrdmulh v0.4S, v4.4S, v29.s[2] +mul v4.4S, v4.4S,v30.s[2] +add v14.4s, v14.4s, v19.4s +str q9, [x0, #992] +sqrdmulh v9.4S, v3.4S, v29.s[2] +mul v3.4S, v3.4S,v30.s[2] +sub v19.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v29.s[2] +mul v5.4S, v5.4S,v30.s[2] +sub v8.4s, v14.4s, v6.4s +add v14.4s, v14.4s, v6.4s +mla v7.4S, v1.4S, v31.s[0] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v20.4s +mla v3.4S, v9.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +sub v17.4s, v2.4s, v13.4s +sqrdmulh v9.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +add v2.4s, v2.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v27.s[0] +mul v16.4S, v16.4S,v28.s[0] +sub v1.4s, v21.4s, v7.4s +add v21.4s, v21.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v6.4s, v10.4s, v4.4s +add v10.4s, v10.4s, v4.4s +mla v19.4S, v20.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v3.4s +mla v16.4S, v13.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +sub v7.4s, v15.4s, v5.4s +sqrdmulh v13.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +add v15.4s, v15.4s, v5.4s +sqrdmulh v5.4S, v1.4S, v27.s[3] +mul v1.4S, v1.4S,v28.s[3] +sub v20.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[3] +mul v6.4S, v6.4S,v28.s[3] +sub v4.4s, v17.4s, v8.4s +add v17.4s, v17.4s, v8.4s +mla v21.4S, v3.4S, v31.s[0] +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v16.4s +mla v1.4S, v5.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v25.s[2] +mul v17.4S, v17.4S,v26.s[2] +sub v19.4s, v2.4s, v14.4s +sqrdmulh v5.4S, v4.4S, v25.s[3] +mul v4.4S, v4.4S,v26.s[3] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v3.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, 
v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +mla v17.4S, v16.4S, v31.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v9.4s, v1.4s +mla v19.4S, v14.4S, v31.s[0] +mla v2.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v23.s[0] +mul v15.4S, v15.4S,v24.s[0] +sub v21.4s, v7.4s, v6.4s +sqrdmulh v14.4S, v8.4S, v23.s[1] +mul v8.4S, v8.4S,v24.s[1] +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v7.4S, v23.s[2] +mul v7.4S, v7.4S,v24.s[2] +sub v16.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v23.s[3] +mul v21.4S, v21.4S,v24.s[3] +sub v10.4s, v20.4s, v4.4s +add v20.4s, v20.4s, v4.4s +mla v15.4S, v1.4S, v31.s[0] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v19.4s +str q0, [x0, #304] +mla v7.4S, v6.4S, v31.s[0] +mla v21.4S, v17.4S, v31.s[0] +add v13.4s, v13.4s, v19.4s +str q16, [x0, #368] +ldr q16, [x0, #896] +sqrdmulh v19.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v17.4s, v12.4s, v2.4s +str q20, [x0, #432] +ldr q20, [x0, #960] +sqrdmulh v6.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v12.4s, v12.4s, v2.4s +str q10, [x0, #496] +ldr q10, [x0, #768] +sqrdmulh v2.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +sub v0.4s, v18.4s, v15.4s +add v18.4s, v18.4s, v15.4s +ldr q15, [x0, #832] +sqrdmulh v1.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +sub v4.4s, v3.4s, v8.4s +add v3.4s, v3.4s, v8.4s +mla v16.4S, v19.4S, v31.s[0] +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v7.4s +str q13, [x0, #176] +mla v10.4S, v2.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v7.4s +str q14, [x0, #240] +ldr q14, [x0, #512] +sqrdmulh v7.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v1.4s, v5.4s, v21.4s +str q12, [x0, #48] +ldr q12, [x0, #576] +sqrdmulh v2.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +add v5.4s, v5.4s, v21.4s +str q17, [x0, #112] +ldr q17, [x0, #640] +ldr q21, [x0, #384] +sqrdmulh v13.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] 
+sub v19.4s, v21.4s, v16.4s +add v21.4s, v21.4s, v16.4s +ldr q16, [x0, #704] +ldr q8, [x0, #448] +sqrdmulh v22.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v11.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +ldr q20, [x0, #256] +mla v14.4S, v7.4S, v31.s[0] +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v10.4s +str q18, [x0, #560] +mla v17.4S, v13.4S, v31.s[0] +mla v16.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +str q0, [x0, #624] +ldr q0, [x0, #320] +sqrdmulh v10.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v22.4s, v0.4s, v15.4s +str q3, [x0, #688] +sqrdmulh v3.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +add v0.4s, v0.4s, v15.4s +str q4, [x0, #752] +ldr q4, [x0, #0] +sqrdmulh v15.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v13.4s, v4.4s, v14.4s +add v4.4s, v4.4s, v14.4s +ldr q14, [x0, #64] +sqrdmulh v18.4S, v0.4S, v29.s[1] +mul v0.4S, v0.4S,v30.s[1] +sub v7.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +ldr q12, [x0, #128] +mla v21.4S, v10.4S, v31.s[0] +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v12.4s, v17.4s +str q9, [x0, #816] +mla v20.4S, v15.4S, v31.s[0] +mla v0.4S, v18.4S, v31.s[0] +add v12.4s, v12.4s, v17.4s +str q6, [x0, #880] +ldr q6, [x0, #192] +sqrdmulh v17.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v18.4s, v6.4s, v16.4s +str q5, [x0, #944] +sqrdmulh v5.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +add v6.4s, v6.4s, v16.4s +str q1, [x0, #1008] +sqrdmulh v1.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v16.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v15.4s, v6.4s, v8.4s +add v6.4s, v6.4s, v8.4s +mla v19.4S, v17.4S, v31.s[0] +mla v11.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v20.4s +mla v2.4S, v1.4S, v31.s[0] +mla v22.4S, v21.4S, v31.s[0] +add v4.4s, v4.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +sub v21.4s, v14.4s, v0.4s +sqrdmulh v1.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +add v14.4s, 
v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v17.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[0] +mul v6.4S, v6.4S,v28.s[0] +sub v8.4s, v18.4s, v11.4s +add v18.4s, v18.4s, v11.4s +mla v16.4S, v20.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v2.4s +mla v12.4S, v0.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v27.s[2] +mul v3.4S, v3.4S,v28.s[2] +sub v19.4s, v7.4s, v22.4s +sqrdmulh v0.4S, v18.4S, v27.s[2] +mul v18.4S, v18.4S,v28.s[2] +add v7.4s, v7.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +sub v20.4s, v5.4s, v16.4s +add v5.4s, v5.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v11.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +mla v3.4S, v2.4S, v31.s[0] +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v4.4s, v12.4s +mla v17.4S, v22.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v4.4s, v4.4s, v12.4s +sqrdmulh v12.4S, v21.4S, v25.s[2] +mul v21.4S, v21.4S,v26.s[2] +sub v16.4s, v14.4s, v6.4s +sqrdmulh v22.4S, v11.4S, v25.s[3] +mul v11.4S, v11.4S,v26.s[3] +add v14.4s, v14.4s, v6.4s +sqrdmulh v6.4S, v16.4S, v25.s[1] +mul v16.4S, v16.4S,v26.s[1] +sub v2.4s, v13.4s, v3.4s +add v13.4s, v13.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v25.s[0] +mul v14.4S, v14.4S,v26.s[0] +sub v15.4s, v7.4s, v18.4s +add v7.4s, v7.4s, v18.4s +mla v21.4S, v12.4S, v31.s[0] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v17.4s +mla v16.4S, v6.4S, v31.s[0] +mla v14.4S, v3.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v7.4S, v23.s[0] +mul v7.4S, v7.4S,v24.s[0] +sub v3.4s, v19.4s, v8.4s +sqrdmulh v6.4S, v15.4S, v23.s[1] +mul v15.4S, v15.4S,v24.s[1] +add v19.4s, v19.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v23.s[2] +mul v19.4S, v19.4S,v24.s[2] +sub v12.4s, v5.4s, v21.4s +add v5.4s, v5.4s, v21.4s +sqrdmulh v21.4S, v3.4S, v23.s[3] +mul v3.4S, v3.4S,v24.s[3] +sub v18.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +mla 
v7.4S, v17.4S, v31.s[0] +mla v15.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v16.4s +str q5, [x0, #256] +mla v19.4S, v8.4S, v31.s[0] +mla v3.4S, v21.4S, v31.s[0] +add v0.4s, v0.4s, v16.4s +str q12, [x0, #320] +ldr q12, [x0, #912] +sqrdmulh v16.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v21.4s, v4.4s, v14.4s +str q20, [x0, #384] +ldr q20, [x0, #976] +sqrdmulh v8.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v4.4s, v4.4s, v14.4s +str q18, [x0, #448] +ldr q18, [x0, #784] +sqrdmulh v14.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +sub v5.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +ldr q7, [x0, #848] +sqrdmulh v17.4S, v7.4S, v29.s[0] +mul v7.4S, v7.4S,v30.s[0] +sub v11.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +mla v12.4S, v16.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v1.4s, v19.4s +str q0, [x0, #128] +mla v18.4S, v14.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v19.4s +str q6, [x0, #192] +ldr q6, [x0, #528] +sqrdmulh v19.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v17.4s, v22.4s, v3.4s +str q4, [x0, #0] +ldr q4, [x0, #592] +sqrdmulh v14.4S, v4.4S, v29.s[0] +mul v4.4S, v4.4S,v30.s[0] +add v22.4s, v22.4s, v3.4s +str q21, [x0, #64] +ldr q21, [x0, #656] +ldr q3, [x0, #400] +sqrdmulh v0.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v16.4s, v3.4s, v12.4s +add v3.4s, v3.4s, v12.4s +ldr q12, [x0, #720] +ldr q15, [x0, #464] +sqrdmulh v9.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v10.4s, v15.4s, v20.4s +add v15.4s, v15.4s, v20.4s +ldr q20, [x0, #272] +mla v6.4S, v19.4S, v31.s[0] +mla v4.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v18.4s +str q13, [x0, #512] +mla v21.4S, v0.4S, v31.s[0] +mla v12.4S, v9.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +str q5, [x0, #576] +ldr q5, [x0, #336] +sqrdmulh v18.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v9.4s, v5.4s, v7.4s +str q2, [x0, #640] +sqrdmulh v2.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +add v5.4s, v5.4s, v7.4s +str q11, [x0, #704] +ldr 
q11, [x0, #16] +sqrdmulh v7.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v0.4s, v11.4s, v6.4s +add v11.4s, v11.4s, v6.4s +ldr q6, [x0, #80] +sqrdmulh v13.4S, v5.4S, v29.s[1] +mul v5.4S, v5.4S,v30.s[1] +sub v19.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +ldr q4, [x0, #144] +mla v3.4S, v18.4S, v31.s[0] +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v4.4s, v21.4s +str q1, [x0, #768] +mla v20.4S, v7.4S, v31.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v4.4s, v4.4s, v21.4s +str q8, [x0, #832] +ldr q8, [x0, #208] +sqrdmulh v21.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +sub v13.4s, v8.4s, v12.4s +str q22, [x0, #896] +sqrdmulh v22.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +add v8.4s, v8.4s, v12.4s +str q17, [x0, #960] +sqrdmulh v17.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v12.4s, v4.4s, v3.4s +add v4.4s, v4.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v7.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +mla v16.4S, v21.4S, v31.s[0] +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v20.4s +mla v14.4S, v17.4S, v31.s[0] +mla v9.4S, v3.4S, v31.s[0] +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v12.4S, v27.s[1] +mul v12.4S, v12.4S,v28.s[1] +sub v3.4s, v6.4s, v5.4s +sqrdmulh v17.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v27.s[0] +mul v4.4S, v4.4S,v28.s[0] +sub v21.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[0] +mul v8.4S, v8.4S,v28.s[0] +sub v15.4s, v13.4s, v10.4s +add v13.4s, v13.4s, v10.4s +mla v12.4S, v20.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v14.4s +mla v4.4S, v5.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v16.4s, v19.4s, v9.4s +sqrdmulh v5.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v22.4s, v12.4s +add v22.4s, v22.4s, 
v12.4s +sqrdmulh v12.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v10.4s, v3.4s, v7.4s +add v3.4s, v3.4s, v7.4s +mla v2.4S, v14.4S, v31.s[0] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v4.4s +mla v21.4S, v9.4S, v31.s[0] +mla v15.4S, v12.4S, v31.s[0] +add v11.4s, v11.4s, v4.4s +sqrdmulh v4.4S, v3.4S, v25.s[2] +mul v3.4S, v3.4S,v26.s[2] +sub v12.4s, v6.4s, v8.4s +sqrdmulh v9.4S, v10.4S, v25.s[3] +mul v10.4S, v10.4S,v26.s[3] +add v6.4s, v6.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +sub v14.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v6.4S, v25.s[0] +mul v6.4S, v6.4S,v26.s[0] +sub v7.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +mla v3.4S, v4.4S, v31.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v21.4s +mla v12.4S, v8.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v23.s[0] +mul v19.4S, v19.4S,v24.s[0] +sub v2.4s, v16.4s, v15.4s +sqrdmulh v8.4S, v7.4S, v23.s[1] +mul v7.4S, v7.4S,v24.s[1] +add v16.4s, v16.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +sub v4.4s, v22.4s, v3.4s +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v2.4S, v23.s[3] +mul v2.4S, v2.4S,v24.s[3] +sub v13.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +mla v19.4S, v21.4S, v31.s[0] +mla v7.4S, v8.4S, v31.s[0] +sub v8.4s, v5.4s, v12.4s +str q22, [x0, #272] +mla v16.4S, v15.4S, v31.s[0] +mla v2.4S, v3.4S, v31.s[0] +add v5.4s, v5.4s, v12.4s +str q4, [x0, #336] +sub v23.4s, v11.4s, v6.4s +str q20, [x0, #400] +add v11.4s, v11.4s, v6.4s +str q13, [x0, #464] +sub v13.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sub v19.4s, v14.4s, v7.4s +add v14.4s, v14.4s, v7.4s +sub v7.4s, v17.4s, v16.4s +str q5, [x0, #144] +add v17.4s, v17.4s, v16.4s +str q8, [x0, #208] +sub v8.4s, v9.4s, v2.4s +str q11, [x0, #16] +add v9.4s, v9.4s, v2.4s +str q23, [x0, #80] +str q0, [x0, #528] +str q13, [x0, #592] +str q14, [x0, #656] +str q19, [x0, #720] +str q17, [x0, #784] +str q7, [x0, #848] 
+str q9, [x0, #912] +str q8, [x0, #976] +ldr q18, [x17, #+128] +ldr q1, [x17, #+144] +ldr q10, [x17, #+160] +ldr q21, [x17, #+176] +ldr q22, [x17, #+192] +ldr q15, [x17, #+208] +ldr q3, [x17, #+224] +ldr q12, [x17, #+240] +ldr q4, [x0, #32] +ldr q30, [x0, #48] +ldr q29, [x0, #0] +ldr q28, [x0, #96] +ldr q27, [x0, #112] +ldr q26, [x0, #64] +ldr q25, [x0, #160] +ldr q24, [x0, #176] +ldr q20, [x0, #128] +ldr q6, [x0, #224] +ldr q5, [x0, #240] +ldr q16, [x0, #192] +sqrdmulh v11.4S, v4.4S, v1.s[0] +sqrdmulh v2.4S, v28.4S, v21.s[0] +sqrdmulh v23.4S, v25.4S, v15.s[0] +sqrdmulh v0.4S, v6.4S, v12.s[0] +mul v4.4S, v4.4S,v18.s[0] +mul v28.4S, v28.4S,v10.s[0] +mul v25.4S, v25.4S,v22.s[0] +mul v6.4S, v6.4S,v3.s[0] +mla v4.4S, v11.4S, v31.s[0] +mla v28.4S, v2.4S, v31.s[0] +mla v25.4S, v23.4S, v31.s[0] +mla v6.4S, v0.4S, v31.s[0] +sub v0.4s, v29.4s, v4.4s +sub v23.4s, v26.4s, v28.4s +sub v2.4s, v20.4s, v25.4s +sub v11.4s, v16.4s, v6.4s +add v29.4s, v29.4s, v4.4s +add v26.4s, v26.4s, v28.4s +add v20.4s, v20.4s, v25.4s +add v16.4s, v16.4s, v6.4s +ldr q6, [x0, #16] +ldr q25, [x0, #80] +ldr q28, [x0, #144] +ldr q4, [x0, #208] +sqrdmulh v13.4S, v30.4S, v1.s[0] +sqrdmulh v14.4S, v27.4S, v21.s[0] +sqrdmulh v19.4S, v24.4S, v15.s[0] +sqrdmulh v17.4S, v5.4S, v12.s[0] +mul v30.4S, v30.4S,v18.s[0] +mul v27.4S, v27.4S,v10.s[0] +mul v24.4S, v24.4S,v22.s[0] +mul v5.4S, v5.4S,v3.s[0] +mla v30.4S, v13.4S, v31.s[0] +mla v27.4S, v14.4S, v31.s[0] +mla v24.4S, v19.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +sub v17.4s, v6.4s, v30.4s +sub v19.4s, v25.4s, v27.4s +sub v14.4s, v28.4s, v24.4s +sub v13.4s, v4.4s, v5.4s +add v6.4s, v6.4s, v30.4s +add v25.4s, v25.4s, v27.4s +add v28.4s, v28.4s, v24.4s +add v4.4s, v4.4s, v5.4s +sqrdmulh v5.4S, v6.4S, v1.s[1] +sqrdmulh v24.4S, v25.4S, v21.s[1] +sqrdmulh v27.4S, v28.4S, v15.s[1] +sqrdmulh v30.4S, v4.4S, v12.s[1] +mul v6.4S, v6.4S,v18.s[1] +mul v25.4S, v25.4S,v10.s[1] +mul v28.4S, v28.4S,v22.s[1] +mul v4.4S, v4.4S,v3.s[1] +mla v6.4S, v5.4S, v31.s[0] +mla v25.4S, 
v24.4S, v31.s[0] +mla v28.4S, v27.4S, v31.s[0] +mla v4.4S, v30.4S, v31.s[0] +sub v30.4s, v29.4s, v6.4s +sub v27.4s, v26.4s, v25.4s +sub v24.4s, v20.4s, v28.4s +sub v5.4s, v16.4s, v4.4s +add v29.4s, v29.4s, v6.4s +add v26.4s, v26.4s, v25.4s +add v20.4s, v20.4s, v28.4s +add v16.4s, v16.4s, v4.4s +sqrdmulh v4.4S, v17.4S, v1.s[2] +sqrdmulh v28.4S, v19.4S, v21.s[2] +sqrdmulh v25.4S, v14.4S, v15.s[2] +sqrdmulh v6.4S, v13.4S, v12.s[2] +str q29, [x0, #0] +str q30, [x0, #16] +mul v17.4S, v17.4S,v18.s[2] +mul v19.4S, v19.4S,v10.s[2] +mul v14.4S, v14.4S,v22.s[2] +mul v13.4S, v13.4S,v3.s[2] +str q26, [x0, #64] +str q27, [x0, #80] +ldr q12, [x17, #+256] +ldr q3, [x17, #+272] +ldr q15, [x17, #+288] +ldr q22, [x17, #+304] +mla v17.4S, v4.4S, v31.s[0] +mla v19.4S, v28.4S, v31.s[0] +mla v14.4S, v25.4S, v31.s[0] +mla v13.4S, v6.4S, v31.s[0] +str q20, [x0, #128] +str q24, [x0, #144] +ldr q24, [x17, #+320] +ldr q20, [x17, #+336] +sub v6.4s, v0.4s, v17.4s +sub v25.4s, v23.4s, v19.4s +sub v28.4s, v2.4s, v14.4s +sub v4.4s, v11.4s, v13.4s +str q16, [x0, #192] +str q5, [x0, #208] +ldr q5, [x17, #+352] +ldr q16, [x17, #+368] +add v0.4s, v0.4s, v17.4s +add v23.4s, v23.4s, v19.4s +add v2.4s, v2.4s, v14.4s +add v11.4s, v11.4s, v13.4s +str q0, [x0, #32] +str q23, [x0, #96] +str q2, [x0, #160] +str q11, [x0, #224] +ldr q11, [x0, #288] +ldr q2, [x0, #304] +ldr q23, [x0, #256] +ldr q0, [x0, #352] +ldr q13, [x0, #368] +ldr q14, [x0, #320] +ldr q19, [x0, #416] +ldr q17, [x0, #432] +ldr q21, [x0, #384] +ldr q10, [x0, #480] +ldr q1, [x0, #496] +ldr q18, [x0, #448] +sqrdmulh v27.4S, v11.4S, v3.s[0] +sqrdmulh v26.4S, v0.4S, v22.s[0] +sqrdmulh v30.4S, v19.4S, v20.s[0] +sqrdmulh v29.4S, v10.4S, v16.s[0] +str q6, [x0, #48] +mul v11.4S, v11.4S,v12.s[0] +mul v0.4S, v0.4S,v15.s[0] +mul v19.4S, v19.4S,v24.s[0] +mul v10.4S, v10.4S,v5.s[0] +str q25, [x0, #112] +mla v11.4S, v27.4S, v31.s[0] +mla v0.4S, v26.4S, v31.s[0] +mla v19.4S, v30.4S, v31.s[0] +mla v10.4S, v29.4S, v31.s[0] +str q28, [x0, #176] +sub v28.4s, 
v23.4s, v11.4s +sub v29.4s, v14.4s, v0.4s +sub v30.4s, v21.4s, v19.4s +sub v26.4s, v18.4s, v10.4s +str q4, [x0, #240] +add v23.4s, v23.4s, v11.4s +add v14.4s, v14.4s, v0.4s +add v21.4s, v21.4s, v19.4s +add v18.4s, v18.4s, v10.4s +ldr q10, [x0, #272] +ldr q19, [x0, #336] +ldr q0, [x0, #400] +ldr q11, [x0, #464] +sqrdmulh v4.4S, v2.4S, v3.s[0] +sqrdmulh v27.4S, v13.4S, v22.s[0] +sqrdmulh v25.4S, v17.4S, v20.s[0] +sqrdmulh v6.4S, v1.4S, v16.s[0] +mul v2.4S, v2.4S,v12.s[0] +mul v13.4S, v13.4S,v15.s[0] +mul v17.4S, v17.4S,v24.s[0] +mul v1.4S, v1.4S,v5.s[0] +mla v2.4S, v4.4S, v31.s[0] +mla v13.4S, v27.4S, v31.s[0] +mla v17.4S, v25.4S, v31.s[0] +mla v1.4S, v6.4S, v31.s[0] +sub v6.4s, v10.4s, v2.4s +sub v25.4s, v19.4s, v13.4s +sub v27.4s, v0.4s, v17.4s +sub v4.4s, v11.4s, v1.4s +add v10.4s, v10.4s, v2.4s +add v19.4s, v19.4s, v13.4s +add v0.4s, v0.4s, v17.4s +add v11.4s, v11.4s, v1.4s +sqrdmulh v1.4S, v10.4S, v3.s[1] +sqrdmulh v17.4S, v19.4S, v22.s[1] +sqrdmulh v13.4S, v0.4S, v20.s[1] +sqrdmulh v2.4S, v11.4S, v16.s[1] +mul v10.4S, v10.4S,v12.s[1] +mul v19.4S, v19.4S,v15.s[1] +mul v0.4S, v0.4S,v24.s[1] +mul v11.4S, v11.4S,v5.s[1] +mla v10.4S, v1.4S, v31.s[0] +mla v19.4S, v17.4S, v31.s[0] +mla v0.4S, v13.4S, v31.s[0] +mla v11.4S, v2.4S, v31.s[0] +sub v2.4s, v23.4s, v10.4s +sub v13.4s, v14.4s, v19.4s +sub v17.4s, v21.4s, v0.4s +sub v1.4s, v18.4s, v11.4s +add v23.4s, v23.4s, v10.4s +add v14.4s, v14.4s, v19.4s +add v21.4s, v21.4s, v0.4s +add v18.4s, v18.4s, v11.4s +sqrdmulh v11.4S, v6.4S, v3.s[2] +sqrdmulh v0.4S, v25.4S, v22.s[2] +sqrdmulh v19.4S, v27.4S, v20.s[2] +sqrdmulh v10.4S, v4.4S, v16.s[2] +str q23, [x0, #256] +str q2, [x0, #272] +mul v6.4S, v6.4S,v12.s[2] +mul v25.4S, v25.4S,v15.s[2] +mul v27.4S, v27.4S,v24.s[2] +mul v4.4S, v4.4S,v5.s[2] +str q14, [x0, #320] +str q13, [x0, #336] +ldr q16, [x17, #+384] +ldr q5, [x17, #+400] +ldr q20, [x17, #+416] +ldr q24, [x17, #+432] +mla v6.4S, v11.4S, v31.s[0] +mla v25.4S, v0.4S, v31.s[0] +mla v27.4S, v19.4S, v31.s[0] +mla v4.4S, 
v10.4S, v31.s[0] +str q21, [x0, #384] +str q17, [x0, #400] +ldr q17, [x17, #+448] +ldr q21, [x17, #+464] +sub v10.4s, v28.4s, v6.4s +sub v19.4s, v29.4s, v25.4s +sub v0.4s, v30.4s, v27.4s +sub v11.4s, v26.4s, v4.4s +str q18, [x0, #448] +str q1, [x0, #464] +ldr q1, [x17, #+480] +ldr q18, [x17, #+496] +add v28.4s, v28.4s, v6.4s +add v29.4s, v29.4s, v25.4s +add v30.4s, v30.4s, v27.4s +add v26.4s, v26.4s, v4.4s +str q28, [x0, #288] +str q29, [x0, #352] +str q30, [x0, #416] +str q26, [x0, #480] +ldr q26, [x0, #544] +ldr q30, [x0, #560] +ldr q29, [x0, #512] +ldr q28, [x0, #608] +ldr q4, [x0, #624] +ldr q27, [x0, #576] +ldr q25, [x0, #672] +ldr q6, [x0, #688] +ldr q22, [x0, #640] +ldr q15, [x0, #736] +ldr q3, [x0, #752] +ldr q12, [x0, #704] +sqrdmulh v13.4S, v26.4S, v5.s[0] +sqrdmulh v14.4S, v28.4S, v24.s[0] +sqrdmulh v2.4S, v25.4S, v21.s[0] +sqrdmulh v23.4S, v15.4S, v18.s[0] +str q10, [x0, #304] +mul v26.4S, v26.4S,v16.s[0] +mul v28.4S, v28.4S,v20.s[0] +mul v25.4S, v25.4S,v17.s[0] +mul v15.4S, v15.4S,v1.s[0] +str q19, [x0, #368] +mla v26.4S, v13.4S, v31.s[0] +mla v28.4S, v14.4S, v31.s[0] +mla v25.4S, v2.4S, v31.s[0] +mla v15.4S, v23.4S, v31.s[0] +str q0, [x0, #432] +sub v0.4s, v29.4s, v26.4s +sub v23.4s, v27.4s, v28.4s +sub v2.4s, v22.4s, v25.4s +sub v14.4s, v12.4s, v15.4s +str q11, [x0, #496] +add v29.4s, v29.4s, v26.4s +add v27.4s, v27.4s, v28.4s +add v22.4s, v22.4s, v25.4s +add v12.4s, v12.4s, v15.4s +ldr q15, [x0, #528] +ldr q25, [x0, #592] +ldr q28, [x0, #656] +ldr q26, [x0, #720] +sqrdmulh v11.4S, v30.4S, v5.s[0] +sqrdmulh v13.4S, v4.4S, v24.s[0] +sqrdmulh v19.4S, v6.4S, v21.s[0] +sqrdmulh v10.4S, v3.4S, v18.s[0] +mul v30.4S, v30.4S,v16.s[0] +mul v4.4S, v4.4S,v20.s[0] +mul v6.4S, v6.4S,v17.s[0] +mul v3.4S, v3.4S,v1.s[0] +mla v30.4S, v11.4S, v31.s[0] +mla v4.4S, v13.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +mla v3.4S, v10.4S, v31.s[0] +sub v10.4s, v15.4s, v30.4s +sub v19.4s, v25.4s, v4.4s +sub v13.4s, v28.4s, v6.4s +sub v11.4s, v26.4s, v3.4s +add v15.4s, v15.4s, 
v30.4s +add v25.4s, v25.4s, v4.4s +add v28.4s, v28.4s, v6.4s +add v26.4s, v26.4s, v3.4s +sqrdmulh v3.4S, v15.4S, v5.s[1] +sqrdmulh v6.4S, v25.4S, v24.s[1] +sqrdmulh v4.4S, v28.4S, v21.s[1] +sqrdmulh v30.4S, v26.4S, v18.s[1] +mul v15.4S, v15.4S,v16.s[1] +mul v25.4S, v25.4S,v20.s[1] +mul v28.4S, v28.4S,v17.s[1] +mul v26.4S, v26.4S,v1.s[1] +mla v15.4S, v3.4S, v31.s[0] +mla v25.4S, v6.4S, v31.s[0] +mla v28.4S, v4.4S, v31.s[0] +mla v26.4S, v30.4S, v31.s[0] +sub v30.4s, v29.4s, v15.4s +sub v4.4s, v27.4s, v25.4s +sub v6.4s, v22.4s, v28.4s +sub v3.4s, v12.4s, v26.4s +add v29.4s, v29.4s, v15.4s +add v27.4s, v27.4s, v25.4s +add v22.4s, v22.4s, v28.4s +add v12.4s, v12.4s, v26.4s +sqrdmulh v26.4S, v10.4S, v5.s[2] +sqrdmulh v28.4S, v19.4S, v24.s[2] +sqrdmulh v25.4S, v13.4S, v21.s[2] +sqrdmulh v15.4S, v11.4S, v18.s[2] +str q29, [x0, #512] +str q30, [x0, #528] +mul v10.4S, v10.4S,v16.s[2] +mul v19.4S, v19.4S,v20.s[2] +mul v13.4S, v13.4S,v17.s[2] +mul v11.4S, v11.4S,v1.s[2] +str q27, [x0, #576] +str q4, [x0, #592] +ldr q18, [x17, #+512] +ldr q1, [x17, #+528] +ldr q21, [x17, #+544] +ldr q17, [x17, #+560] +mla v10.4S, v26.4S, v31.s[0] +mla v19.4S, v28.4S, v31.s[0] +mla v13.4S, v25.4S, v31.s[0] +mla v11.4S, v15.4S, v31.s[0] +str q22, [x0, #640] +str q6, [x0, #656] +ldr q6, [x17, #+576] +ldr q22, [x17, #+592] +sub v15.4s, v0.4s, v10.4s +sub v25.4s, v23.4s, v19.4s +sub v28.4s, v2.4s, v13.4s +sub v26.4s, v14.4s, v11.4s +str q12, [x0, #704] +str q3, [x0, #720] +ldr q3, [x17, #+608] +ldr q12, [x17, #+624] +add v0.4s, v0.4s, v10.4s +add v23.4s, v23.4s, v19.4s +add v2.4s, v2.4s, v13.4s +add v14.4s, v14.4s, v11.4s +str q0, [x0, #544] +str q23, [x0, #608] +str q2, [x0, #672] +str q14, [x0, #736] +ldr q14, [x0, #800] +ldr q2, [x0, #816] +ldr q23, [x0, #768] +ldr q0, [x0, #864] +ldr q11, [x0, #880] +ldr q13, [x0, #832] +ldr q19, [x0, #928] +ldr q10, [x0, #944] +ldr q24, [x0, #896] +ldr q20, [x0, #992] +ldr q5, [x0, #1008] +ldr q16, [x0, #960] +sqrdmulh v4.4S, v14.4S, v1.s[0] +sqrdmulh v27.4S, 
v0.4S, v17.s[0] +sqrdmulh v30.4S, v19.4S, v22.s[0] +sqrdmulh v29.4S, v20.4S, v12.s[0] +str q15, [x0, #560] +mul v14.4S, v14.4S,v18.s[0] +mul v0.4S, v0.4S,v21.s[0] +mul v19.4S, v19.4S,v6.s[0] +mul v20.4S, v20.4S,v3.s[0] +str q25, [x0, #624] +mla v14.4S, v4.4S, v31.s[0] +mla v0.4S, v27.4S, v31.s[0] +mla v19.4S, v30.4S, v31.s[0] +mla v20.4S, v29.4S, v31.s[0] +str q28, [x0, #688] +sub v28.4s, v23.4s, v14.4s +sub v29.4s, v13.4s, v0.4s +sub v30.4s, v24.4s, v19.4s +sub v27.4s, v16.4s, v20.4s +str q26, [x0, #752] +add v23.4s, v23.4s, v14.4s +add v13.4s, v13.4s, v0.4s +add v24.4s, v24.4s, v19.4s +add v16.4s, v16.4s, v20.4s +ldr q20, [x0, #784] +ldr q19, [x0, #848] +ldr q0, [x0, #912] +ldr q14, [x0, #976] +sqrdmulh v26.4S, v2.4S, v1.s[0] +sqrdmulh v4.4S, v11.4S, v17.s[0] +sqrdmulh v25.4S, v10.4S, v22.s[0] +sqrdmulh v15.4S, v5.4S, v12.s[0] +mul v2.4S, v2.4S,v18.s[0] +mul v11.4S, v11.4S,v21.s[0] +mul v10.4S, v10.4S,v6.s[0] +mul v5.4S, v5.4S,v3.s[0] +mla v2.4S, v26.4S, v31.s[0] +mla v11.4S, v4.4S, v31.s[0] +mla v10.4S, v25.4S, v31.s[0] +mla v5.4S, v15.4S, v31.s[0] +sub v15.4s, v20.4s, v2.4s +sub v25.4s, v19.4s, v11.4s +sub v4.4s, v0.4s, v10.4s +sub v26.4s, v14.4s, v5.4s +add v20.4s, v20.4s, v2.4s +add v19.4s, v19.4s, v11.4s +add v0.4s, v0.4s, v10.4s +add v14.4s, v14.4s, v5.4s +sqrdmulh v5.4S, v20.4S, v1.s[1] +sqrdmulh v10.4S, v19.4S, v17.s[1] +sqrdmulh v11.4S, v0.4S, v22.s[1] +sqrdmulh v2.4S, v14.4S, v12.s[1] +mul v20.4S, v20.4S,v18.s[1] +mul v19.4S, v19.4S,v21.s[1] +mul v0.4S, v0.4S,v6.s[1] +mul v14.4S, v14.4S,v3.s[1] +mla v20.4S, v5.4S, v31.s[0] +mla v19.4S, v10.4S, v31.s[0] +mla v0.4S, v11.4S, v31.s[0] +mla v14.4S, v2.4S, v31.s[0] +sub v2.4s, v23.4s, v20.4s +sub v11.4s, v13.4s, v19.4s +sub v10.4s, v24.4s, v0.4s +sub v5.4s, v16.4s, v14.4s +add v23.4s, v23.4s, v20.4s +add v13.4s, v13.4s, v19.4s +add v24.4s, v24.4s, v0.4s +add v16.4s, v16.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v1.s[2] +sqrdmulh v0.4S, v25.4S, v17.s[2] +sqrdmulh v19.4S, v4.4S, v22.s[2] +sqrdmulh v20.4S, v26.4S, 
v12.s[2] +str q23, [x0, #768] +str q2, [x0, #784] +mul v15.4S, v15.4S,v18.s[2] +mul v25.4S, v25.4S,v21.s[2] +mul v4.4S, v4.4S,v6.s[2] +mul v26.4S, v26.4S,v3.s[2] +str q13, [x0, #832] +str q11, [x0, #848] +mla v15.4S, v14.4S, v31.s[0] +mla v25.4S, v0.4S, v31.s[0] +mla v4.4S, v19.4S, v31.s[0] +mla v26.4S, v20.4S, v31.s[0] +str q24, [x0, #896] +str q10, [x0, #912] +sub v10.4s, v28.4s, v15.4s +sub v24.4s, v29.4s, v25.4s +sub v20.4s, v30.4s, v4.4s +sub v19.4s, v27.4s, v26.4s +str q16, [x0, #960] +str q5, [x0, #976] +add v28.4s, v28.4s, v15.4s +add v29.4s, v29.4s, v25.4s +add v30.4s, v30.4s, v4.4s +add v27.4s, v27.4s, v26.4s +str q28, [x0, #800] +str q29, [x0, #864] +str q30, [x0, #928] +str q27, [x0, #992] +str q10, [x0, #816] +str q24, [x0, #880] +str q20, [x0, #944] +str q19, [x0, #1008] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_4.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_4.s new file mode 100644 index 0000000..dbd61ed --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_4.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and 
to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include // NOTE(review): the include target is missing here (possibly stripped during extraction); presumably it supplies the ASM_LOAD macro used below - confirm against the upstream file +modulus: // One 16-byte row: word 0 is -33556993 (the negative of the 33556993 in the file name), words 1-3 are zero. The routine below reads the row into q31 and uses lane 0 as the v31.s[0] operand of its mla instructions. +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: // Constants are laid out as 16-byte rows that the routine reads as whole q registers. Rows come in pairs carrying the same Layer/block labels: the first row (small values) feeds the by-lane `mul`, the second row (large values) feeds the matching `sqrdmulh` (e.g. q30 <- row 0, q29 <- row 1). Zero words labelled "Layer None" only pad a row to four lanes. +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word
634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word
2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794
// Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global ntt_u32_incomplete_neon_asm_var_4_2_7_z4_4 +.global _ntt_u32_incomplete_neon_asm_var_4_2_7_z4_4 +ntt_u32_incomplete_neon_asm_var_4_2_7_z4_4: +_ntt_u32_incomplete_neon_asm_var_4_2_7_z4_4: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] // NOTE(review): repeats the previous store exactly (same registers, same stack slot); redundant but harmless +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #928] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #992] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q18, [x0, #800] +sqrdmulh v17.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +ldr q16, [x0, #864] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v22.4S, v21.4S, v31.s[0] +mla
v20.4S, v19.4S, v31.s[0] +mla v18.4S, v17.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +ldr q3, [x0, #544] +sqrdmulh v17.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q19, [x0, #608] +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q2, [x0, #672] +ldr q1, [x0, #416] +sqrdmulh v0.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v15.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #736] +ldr q14, [x0, #480] +sqrdmulh v13.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v12.4s, v14.4s, v20.4s +add v14.4s, v14.4s, v20.4s +ldr q20, [x0, #288] +mla v3.4S, v17.4S, v31.s[0] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v18.4s +mla v2.4S, v0.4S, v31.s[0] +mla v22.4S, v13.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +ldr q18, [x0, #352] +sqrdmulh v13.4S, v1.4S, v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v0.4s, v18.4s, v16.4s +sqrdmulh v17.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +add v18.4s, v18.4s, v16.4s +ldr q16, [x0, #32] +sqrdmulh v11.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v10.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #96] +sqrdmulh v9.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v8.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #160] +mla v1.4S, v13.4S, v31.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v2.4s +mla v20.4S, v11.4S, v31.s[0] +mla v18.4S, v9.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +ldr q2, [x0, #224] +sqrdmulh v9.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v11.4s, v2.4s, v22.4s +sqrdmulh v13.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v7.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +sub v6.4s, v2.4s, v14.4s +add v2.4s, v2.4s, v14.4s +mla v15.4S, v9.4S, v31.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v20.4s +mla v21.4S, v22.4S, v31.s[0] +mla v0.4S, v1.4S, v31.s[0] 
+add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +sub v1.4s, v3.4s, v18.4s +sqrdmulh v22.4S, v6.4S, v27.s[1] +mul v6.4S, v6.4S,v28.s[1] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v27.s[0] +mul v19.4S, v19.4S,v28.s[0] +sub v9.4s, v17.4s, v15.4s +add v17.4s, v17.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v14.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +mla v7.4S, v20.4S, v31.s[0] +mla v6.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v21.4s +mla v19.4S, v18.4S, v31.s[0] +mla v2.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v15.4s, v8.4s, v0.4s +sqrdmulh v18.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +add v8.4s, v8.4s, v0.4s +sqrdmulh v0.4S, v9.4S, v27.s[3] +mul v9.4S, v9.4S,v28.s[3] +sub v20.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[3] +mul v14.4S, v14.4S,v28.s[3] +sub v12.4s, v1.4s, v6.4s +add v1.4s, v1.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v16.4s, v19.4s +mla v9.4S, v0.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v25.s[2] +mul v1.4S, v1.4S,v26.s[2] +sub v7.4s, v3.4s, v2.4s +sqrdmulh v0.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v7.4S, v25.s[1] +mul v7.4S, v7.4S,v26.s[1] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v25.s[0] +mul v3.4S, v3.4S,v26.s[0] +sub v6.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v1.4S, v19.4S, v31.s[0] +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v9.4s +mla v7.4S, v2.4S, v31.s[0] +mla v3.4S, v17.4S, v31.s[0] +add v22.4s, v22.4s, v9.4s +sqrdmulh v9.4S, v8.4S, v23.s[0] +mul v8.4S, v8.4S,v24.s[0] +sub v17.4s, v15.4s, v14.4s +sqrdmulh v2.4S, v6.4S, v23.s[1] +mul v6.4S, v6.4S,v24.s[1] +add v15.4s, v15.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v23.s[2] +mul v15.4S, 
v15.4S,v24.s[2] +sub v19.4s, v13.4s, v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +sub v11.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +mla v8.4S, v9.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v18.4s, v7.4s +str q13, [x0, #288] +mla v15.4S, v14.4S, v31.s[0] +mla v17.4S, v1.4S, v31.s[0] +add v18.4s, v18.4s, v7.4s +str q19, [x0, #352] +ldr q19, [x0, #944] +sqrdmulh v7.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v1.4s, v16.4s, v3.4s +str q20, [x0, #416] +ldr q20, [x0, #1008] +sqrdmulh v14.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v3.4s +str q11, [x0, #480] +ldr q11, [x0, #816] +sqrdmulh v3.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +sub v13.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +ldr q8, [x0, #880] +sqrdmulh v9.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v12.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +mla v19.4S, v7.4S, v31.s[0] +mla v20.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v15.4s +str q18, [x0, #160] +mla v11.4S, v3.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +str q2, [x0, #224] +ldr q2, [x0, #560] +sqrdmulh v15.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v9.4s, v0.4s, v17.4s +str q16, [x0, #32] +ldr q16, [x0, #624] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +add v0.4s, v0.4s, v17.4s +str q1, [x0, #96] +ldr q1, [x0, #688] +ldr q17, [x0, #432] +sqrdmulh v18.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +sub v7.4s, v17.4s, v19.4s +add v17.4s, v17.4s, v19.4s +ldr q19, [x0, #752] +ldr q6, [x0, #496] +sqrdmulh v5.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v4.4s, v6.4s, v20.4s +add v6.4s, v6.4s, v20.4s +ldr q20, [x0, #304] +mla v2.4S, v15.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v11.4s +str q10, [x0, #544] +mla v1.4S, v18.4S, v31.s[0] +mla v19.4S, v5.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q13, [x0, #608] +ldr q13, [x0, #368] +sqrdmulh v11.4S, v17.4S, 
v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v5.4s, v13.4s, v8.4s +str q21, [x0, #672] +sqrdmulh v21.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +add v13.4s, v13.4s, v8.4s +str q12, [x0, #736] +ldr q12, [x0, #48] +sqrdmulh v8.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v18.4s, v12.4s, v2.4s +add v12.4s, v12.4s, v2.4s +ldr q2, [x0, #112] +sqrdmulh v10.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v15.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +ldr q16, [x0, #176] +mla v17.4S, v11.4S, v31.s[0] +mla v6.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v1.4s +str q22, [x0, #800] +mla v20.4S, v8.4S, v31.s[0] +mla v13.4S, v10.4S, v31.s[0] +add v16.4s, v16.4s, v1.4s +str q14, [x0, #864] +ldr q14, [x0, #240] +sqrdmulh v1.4S, v7.4S, v29.s[2] +mul v7.4S, v7.4S,v30.s[2] +sub v10.4s, v14.4s, v19.4s +str q0, [x0, #928] +sqrdmulh v0.4S, v4.4S, v29.s[2] +mul v4.4S, v4.4S,v30.s[2] +add v14.4s, v14.4s, v19.4s +str q9, [x0, #992] +sqrdmulh v9.4S, v3.4S, v29.s[2] +mul v3.4S, v3.4S,v30.s[2] +sub v19.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v29.s[2] +mul v5.4S, v5.4S,v30.s[2] +sub v8.4s, v14.4s, v6.4s +add v14.4s, v14.4s, v6.4s +mla v7.4S, v1.4S, v31.s[0] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v20.4s +mla v3.4S, v9.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +sub v17.4s, v2.4s, v13.4s +sqrdmulh v9.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +add v2.4s, v2.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v27.s[0] +mul v16.4S, v16.4S,v28.s[0] +sub v1.4s, v21.4s, v7.4s +add v21.4s, v21.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v6.4s, v10.4s, v4.4s +add v10.4s, v10.4s, v4.4s +mla v19.4S, v20.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v3.4s +mla v16.4S, v13.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +sub v7.4s, 
v15.4s, v5.4s +sqrdmulh v13.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +add v15.4s, v15.4s, v5.4s +sqrdmulh v5.4S, v1.4S, v27.s[3] +mul v1.4S, v1.4S,v28.s[3] +sub v20.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[3] +mul v6.4S, v6.4S,v28.s[3] +sub v4.4s, v17.4s, v8.4s +add v17.4s, v17.4s, v8.4s +mla v21.4S, v3.4S, v31.s[0] +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v16.4s +mla v1.4S, v5.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v25.s[2] +mul v17.4S, v17.4S,v26.s[2] +sub v19.4s, v2.4s, v14.4s +sqrdmulh v5.4S, v4.4S, v25.s[3] +mul v4.4S, v4.4S,v26.s[3] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v3.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +mla v17.4S, v16.4S, v31.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v9.4s, v1.4s +mla v19.4S, v14.4S, v31.s[0] +mla v2.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v23.s[0] +mul v15.4S, v15.4S,v24.s[0] +sub v21.4s, v7.4s, v6.4s +sqrdmulh v14.4S, v8.4S, v23.s[1] +mul v8.4S, v8.4S,v24.s[1] +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v7.4S, v23.s[2] +mul v7.4S, v7.4S,v24.s[2] +sub v16.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v23.s[3] +mul v21.4S, v21.4S,v24.s[3] +sub v10.4s, v20.4s, v4.4s +add v20.4s, v20.4s, v4.4s +mla v15.4S, v1.4S, v31.s[0] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v19.4s +str q0, [x0, #304] +mla v7.4S, v6.4S, v31.s[0] +mla v21.4S, v17.4S, v31.s[0] +add v13.4s, v13.4s, v19.4s +str q16, [x0, #368] +ldr q16, [x0, #896] +sqrdmulh v19.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v17.4s, v12.4s, v2.4s +str q20, [x0, #432] +ldr q20, [x0, #960] +sqrdmulh v6.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v12.4s, v12.4s, v2.4s +str q10, [x0, #496] +ldr q10, [x0, #768] +sqrdmulh 
v2.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +sub v0.4s, v18.4s, v15.4s +add v18.4s, v18.4s, v15.4s +ldr q15, [x0, #832] +sqrdmulh v1.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +sub v4.4s, v3.4s, v8.4s +add v3.4s, v3.4s, v8.4s +mla v16.4S, v19.4S, v31.s[0] +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v7.4s +str q13, [x0, #176] +mla v10.4S, v2.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v7.4s +str q14, [x0, #240] +ldr q14, [x0, #512] +sqrdmulh v7.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v1.4s, v5.4s, v21.4s +str q12, [x0, #48] +ldr q12, [x0, #576] +sqrdmulh v2.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +add v5.4s, v5.4s, v21.4s +str q17, [x0, #112] +ldr q17, [x0, #640] +ldr q21, [x0, #384] +sqrdmulh v13.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] +sub v19.4s, v21.4s, v16.4s +add v21.4s, v21.4s, v16.4s +ldr q16, [x0, #704] +ldr q8, [x0, #448] +sqrdmulh v22.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v11.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +ldr q20, [x0, #256] +mla v14.4S, v7.4S, v31.s[0] +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v10.4s +str q18, [x0, #560] +mla v17.4S, v13.4S, v31.s[0] +mla v16.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +str q0, [x0, #624] +ldr q0, [x0, #320] +sqrdmulh v10.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v22.4s, v0.4s, v15.4s +str q3, [x0, #688] +sqrdmulh v3.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +add v0.4s, v0.4s, v15.4s +str q4, [x0, #752] +ldr q4, [x0, #0] +sqrdmulh v15.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v13.4s, v4.4s, v14.4s +add v4.4s, v4.4s, v14.4s +ldr q14, [x0, #64] +sqrdmulh v18.4S, v0.4S, v29.s[1] +mul v0.4S, v0.4S,v30.s[1] +sub v7.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +ldr q12, [x0, #128] +mla v21.4S, v10.4S, v31.s[0] +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v12.4s, v17.4s +str q9, [x0, #816] +mla v20.4S, v15.4S, v31.s[0] +mla v0.4S, v18.4S, v31.s[0] +add v12.4s, v12.4s, v17.4s +str q6, 
[x0, #880] +ldr q6, [x0, #192] +sqrdmulh v17.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v18.4s, v6.4s, v16.4s +str q5, [x0, #944] +sqrdmulh v5.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +add v6.4s, v6.4s, v16.4s +str q1, [x0, #1008] +sqrdmulh v1.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v16.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v15.4s, v6.4s, v8.4s +add v6.4s, v6.4s, v8.4s +mla v19.4S, v17.4S, v31.s[0] +mla v11.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v20.4s +mla v2.4S, v1.4S, v31.s[0] +mla v22.4S, v21.4S, v31.s[0] +add v4.4s, v4.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +sub v21.4s, v14.4s, v0.4s +sqrdmulh v1.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v17.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[0] +mul v6.4S, v6.4S,v28.s[0] +sub v8.4s, v18.4s, v11.4s +add v18.4s, v18.4s, v11.4s +mla v16.4S, v20.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v2.4s +mla v12.4S, v0.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v27.s[2] +mul v3.4S, v3.4S,v28.s[2] +sub v19.4s, v7.4s, v22.4s +sqrdmulh v0.4S, v18.4S, v27.s[2] +mul v18.4S, v18.4S,v28.s[2] +add v7.4s, v7.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +sub v20.4s, v5.4s, v16.4s +add v5.4s, v5.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v11.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +mla v3.4S, v2.4S, v31.s[0] +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v4.4s, v12.4s +mla v17.4S, v22.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v4.4s, v4.4s, v12.4s +sqrdmulh v12.4S, v21.4S, v25.s[2] +mul v21.4S, v21.4S,v26.s[2] +sub v16.4s, v14.4s, v6.4s +sqrdmulh v22.4S, v11.4S, v25.s[3] +mul v11.4S, v11.4S,v26.s[3] +add v14.4s, v14.4s, v6.4s 
+sqrdmulh v6.4S, v16.4S, v25.s[1] +mul v16.4S, v16.4S,v26.s[1] +sub v2.4s, v13.4s, v3.4s +add v13.4s, v13.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v25.s[0] +mul v14.4S, v14.4S,v26.s[0] +sub v15.4s, v7.4s, v18.4s +add v7.4s, v7.4s, v18.4s +mla v21.4S, v12.4S, v31.s[0] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v17.4s +mla v16.4S, v6.4S, v31.s[0] +mla v14.4S, v3.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v7.4S, v23.s[0] +mul v7.4S, v7.4S,v24.s[0] +sub v3.4s, v19.4s, v8.4s +sqrdmulh v6.4S, v15.4S, v23.s[1] +mul v15.4S, v15.4S,v24.s[1] +add v19.4s, v19.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v23.s[2] +mul v19.4S, v19.4S,v24.s[2] +sub v12.4s, v5.4s, v21.4s +add v5.4s, v5.4s, v21.4s +sqrdmulh v21.4S, v3.4S, v23.s[3] +mul v3.4S, v3.4S,v24.s[3] +sub v18.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +mla v7.4S, v17.4S, v31.s[0] +mla v15.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v16.4s +str q5, [x0, #256] +mla v19.4S, v8.4S, v31.s[0] +mla v3.4S, v21.4S, v31.s[0] +add v0.4s, v0.4s, v16.4s +str q12, [x0, #320] +ldr q12, [x0, #912] +sqrdmulh v16.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v21.4s, v4.4s, v14.4s +str q20, [x0, #384] +ldr q20, [x0, #976] +sqrdmulh v8.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v4.4s, v4.4s, v14.4s +str q18, [x0, #448] +ldr q18, [x0, #784] +sqrdmulh v14.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +sub v5.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +ldr q7, [x0, #848] +sqrdmulh v17.4S, v7.4S, v29.s[0] +mul v7.4S, v7.4S,v30.s[0] +sub v11.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +mla v12.4S, v16.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v1.4s, v19.4s +str q0, [x0, #128] +mla v18.4S, v14.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v19.4s +str q6, [x0, #192] +ldr q6, [x0, #528] +sqrdmulh v19.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v17.4s, v22.4s, v3.4s +str q4, [x0, #0] +ldr q4, [x0, #592] +sqrdmulh v14.4S, v4.4S, v29.s[0] +mul v4.4S, v4.4S,v30.s[0] +add v22.4s, v22.4s, 
v3.4s +str q21, [x0, #64] +ldr q21, [x0, #656] +ldr q3, [x0, #400] +sqrdmulh v0.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v16.4s, v3.4s, v12.4s +add v3.4s, v3.4s, v12.4s +ldr q12, [x0, #720] +ldr q15, [x0, #464] +sqrdmulh v9.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v10.4s, v15.4s, v20.4s +add v15.4s, v15.4s, v20.4s +ldr q20, [x0, #272] +mla v6.4S, v19.4S, v31.s[0] +mla v4.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v18.4s +str q13, [x0, #512] +mla v21.4S, v0.4S, v31.s[0] +mla v12.4S, v9.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +str q5, [x0, #576] +ldr q5, [x0, #336] +sqrdmulh v18.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v9.4s, v5.4s, v7.4s +str q2, [x0, #640] +sqrdmulh v2.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +add v5.4s, v5.4s, v7.4s +str q11, [x0, #704] +ldr q11, [x0, #16] +sqrdmulh v7.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v0.4s, v11.4s, v6.4s +add v11.4s, v11.4s, v6.4s +ldr q6, [x0, #80] +sqrdmulh v13.4S, v5.4S, v29.s[1] +mul v5.4S, v5.4S,v30.s[1] +sub v19.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +ldr q4, [x0, #144] +mla v3.4S, v18.4S, v31.s[0] +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v4.4s, v21.4s +str q1, [x0, #768] +mla v20.4S, v7.4S, v31.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v4.4s, v4.4s, v21.4s +str q8, [x0, #832] +ldr q8, [x0, #208] +sqrdmulh v21.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +sub v13.4s, v8.4s, v12.4s +str q22, [x0, #896] +sqrdmulh v22.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +add v8.4s, v8.4s, v12.4s +str q17, [x0, #960] +sqrdmulh v17.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v12.4s, v4.4s, v3.4s +add v4.4s, v4.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v7.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +mla v16.4S, v21.4S, v31.s[0] +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v20.4s +mla v14.4S, v17.4S, v31.s[0] +mla v9.4S, v3.4S, v31.s[0] +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v12.4S, v27.s[1] +mul v12.4S, 
v12.4S,v28.s[1] +sub v3.4s, v6.4s, v5.4s +sqrdmulh v17.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v27.s[0] +mul v4.4S, v4.4S,v28.s[0] +sub v21.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[0] +mul v8.4S, v8.4S,v28.s[0] +sub v15.4s, v13.4s, v10.4s +add v13.4s, v13.4s, v10.4s +mla v12.4S, v20.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v14.4s +mla v4.4S, v5.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v16.4s, v19.4s, v9.4s +sqrdmulh v5.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v10.4s, v3.4s, v7.4s +add v3.4s, v3.4s, v7.4s +mla v2.4S, v14.4S, v31.s[0] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v4.4s +mla v21.4S, v9.4S, v31.s[0] +mla v15.4S, v12.4S, v31.s[0] +add v11.4s, v11.4s, v4.4s +sqrdmulh v4.4S, v3.4S, v25.s[2] +mul v3.4S, v3.4S,v26.s[2] +sub v12.4s, v6.4s, v8.4s +sqrdmulh v9.4S, v10.4S, v25.s[3] +mul v10.4S, v10.4S,v26.s[3] +add v6.4s, v6.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +sub v14.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v6.4S, v25.s[0] +mul v6.4S, v6.4S,v26.s[0] +sub v7.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +mla v3.4S, v4.4S, v31.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v21.4s +mla v12.4S, v8.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v23.s[0] +mul v19.4S, v19.4S,v24.s[0] +sub v2.4s, v16.4s, v15.4s +sqrdmulh v8.4S, v7.4S, v23.s[1] +mul v7.4S, v7.4S,v24.s[1] +add v16.4s, v16.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +sub v4.4s, v22.4s, v3.4s +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v2.4S, 
v23.s[3] +mul v2.4S, v2.4S,v24.s[3] +sub v13.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +mla v19.4S, v21.4S, v31.s[0] +mla v7.4S, v8.4S, v31.s[0] +sub v8.4s, v5.4s, v12.4s +str q22, [x0, #272] +mla v16.4S, v15.4S, v31.s[0] +mla v2.4S, v3.4S, v31.s[0] +add v5.4s, v5.4s, v12.4s +str q4, [x0, #336] +sub v23.4s, v11.4s, v6.4s +str q20, [x0, #400] +add v11.4s, v11.4s, v6.4s +str q13, [x0, #464] +sub v13.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sub v19.4s, v14.4s, v7.4s +add v14.4s, v14.4s, v7.4s +sub v7.4s, v17.4s, v16.4s +str q5, [x0, #144] +add v17.4s, v17.4s, v16.4s +str q8, [x0, #208] +sub v8.4s, v9.4s, v2.4s +str q11, [x0, #16] +add v9.4s, v9.4s, v2.4s +str q23, [x0, #80] +str q0, [x0, #528] +str q13, [x0, #592] +str q14, [x0, #656] +str q19, [x0, #720] +str q17, [x0, #784] +str q7, [x0, #848] +str q9, [x0, #912] +str q8, [x0, #976] +ldr q18, [x17, #+128] +ldr q1, [x17, #+144] +ldr q10, [x0, #32] +sqrdmulh v21.4S, v10.4S, v1.s[0] +mul v10.4S, v10.4S,v18.s[0] +ldr q22, [x0, #48] +sqrdmulh v15.4S, v22.4S, v1.s[0] +mul v22.4S, v22.4S,v18.s[0] +ldr q3, [x17, #+160] +ldr q12, [x17, #+176] +ldr q4, [x0, #96] +sqrdmulh v30.4S, v4.4S, v12.s[0] +mul v4.4S, v4.4S,v3.s[0] +ldr q29, [x0, #112] +sqrdmulh v28.4S, v29.4S, v12.s[0] +mul v29.4S, v29.4S,v3.s[0] +ldr q27, [x0, #160] +ldr q26, [x17, #+192] +ldr q25, [x17, #+208] +mla v10.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v27.4S, v25.s[0] +ldr q24, [x0, #176] +mla v22.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v24.4S, v25.s[0] +ldr q20, [x0, #224] +ldr q6, [x17, #+224] +ldr q5, [x17, #+240] +mla v4.4S, v30.4S, v31.s[0] +sqrdmulh v30.4S, v20.4S, v5.s[0] +ldr q16, [x0, #240] +mla v29.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v16.4S, v5.s[0] +ldr q11, [x0, #128] +ldr q2, [x0, #0] +mul v27.4S, v27.4S,v26.s[0] +sub v23.4s, v2.4s, v10.4s +mul v24.4S, v24.4S,v26.s[0] +add v2.4s, v2.4s, v10.4s +ldr q10, [x0, #144] +ldr q0, [x0, #16] +mla v27.4S, v21.4S, v31.s[0] +sub v21.4s, v0.4s, v22.4s +mla v24.4S, v15.4S, v31.s[0] +add v0.4s, v0.4s, 
v22.4s +ldr q22, [x0, #192] +ldr q15, [x0, #64] +mul v20.4S, v20.4S,v6.s[0] +sub v13.4s, v15.4s, v4.4s +mul v16.4S, v16.4S,v6.s[0] +add v15.4s, v15.4s, v4.4s +ldr q4, [x0, #208] +ldr q14, [x0, #80] +mla v20.4S, v30.4S, v31.s[0] +sub v30.4s, v14.4s, v29.4s +mla v16.4S, v28.4S, v31.s[0] +add v14.4s, v14.4s, v29.4s +sqrdmulh v29.4S, v0.4S, v1.s[1] +mul v0.4S, v0.4S,v18.s[1] +sqrdmulh v28.4S, v21.4S, v1.s[2] +sub v19.4s, v11.4s, v27.4s +mul v21.4S, v21.4S,v18.s[2] +add v11.4s, v11.4s, v27.4s +sqrdmulh v1.4S, v14.4S, v12.s[1] +sub v18.4s, v10.4s, v24.4s +mul v14.4S, v14.4S,v3.s[1] +add v10.4s, v10.4s, v24.4s +sqrdmulh v24.4S, v30.4S, v12.s[2] +sub v27.4s, v22.4s, v20.4s +mul v30.4S, v30.4S,v3.s[2] +add v22.4s, v22.4s, v20.4s +mla v0.4S, v29.4S, v31.s[0] +sub v29.4s, v4.4s, v16.4s +sqrdmulh v12.4S, v10.4S, v25.s[1] +add v4.4s, v4.4s, v16.4s +mla v21.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v18.4S, v25.s[2] +mla v14.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v4.4S, v5.s[1] +mla v30.4S, v24.4S, v31.s[0] +sqrdmulh v24.4S, v29.4S, v5.s[2] +mul v10.4S, v10.4S,v26.s[1] +sub v16.4s, v2.4s, v0.4s +mul v18.4S, v18.4S,v26.s[2] +add v2.4s, v2.4s, v0.4s +str q16, [x0, #16] +str q2, [x0, #0] +mla v10.4S, v12.4S, v31.s[0] +sub v12.4s, v23.4s, v21.4s +mla v18.4S, v28.4S, v31.s[0] +add v23.4s, v23.4s, v21.4s +str q12, [x0, #48] +str q23, [x0, #32] +mul v4.4S, v4.4S,v6.s[1] +sub v25.4s, v15.4s, v14.4s +mul v29.4S, v29.4S,v6.s[2] +add v15.4s, v15.4s, v14.4s +str q25, [x0, #80] +str q15, [x0, #64] +mla v4.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v30.4s +mla v29.4S, v24.4S, v31.s[0] +add v13.4s, v13.4s, v30.4s +str q1, [x0, #112] +str q13, [x0, #96] +ldr q5, [x17, #+256] +ldr q6, [x17, #+272] +ldr q13, [x0, #288] +sqrdmulh v1.4S, v13.4S, v6.s[0] +sub v30.4s, v11.4s, v10.4s +str q30, [x0, #144] +mul v13.4S, v13.4S,v5.s[0] +add v11.4s, v11.4s, v10.4s +str q11, [x0, #128] +ldr q11, [x0, #304] +sqrdmulh v10.4S, v11.4S, v6.s[0] +sub v30.4s, v19.4s, v18.4s +mul v11.4S, v11.4S,v5.s[0] +add v19.4s, v19.4s, 
v18.4s +str q30, [x0, #176] +str q19, [x0, #160] +ldr q19, [x17, #+288] +ldr q30, [x17, #+304] +ldr q18, [x0, #352] +sqrdmulh v24.4S, v18.4S, v30.s[0] +sub v15.4s, v22.4s, v4.4s +mul v18.4S, v18.4S,v19.s[0] +add v22.4s, v22.4s, v4.4s +str q15, [x0, #208] +str q22, [x0, #192] +ldr q22, [x0, #368] +sqrdmulh v15.4S, v22.4S, v30.s[0] +sub v4.4s, v27.4s, v29.4s +mul v22.4S, v22.4S,v19.s[0] +add v27.4s, v27.4s, v29.4s +str q4, [x0, #240] +str q27, [x0, #224] +ldr q27, [x0, #416] +ldr q4, [x17, #+320] +ldr q29, [x17, #+336] +mla v13.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v27.4S, v29.s[0] +ldr q25, [x0, #432] +mla v11.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v25.4S, v29.s[0] +ldr q14, [x0, #480] +ldr q26, [x17, #+352] +ldr q23, [x17, #+368] +mla v18.4S, v24.4S, v31.s[0] +sqrdmulh v24.4S, v14.4S, v23.s[0] +ldr q12, [x0, #496] +mla v22.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v12.4S, v23.s[0] +ldr q21, [x0, #384] +ldr q28, [x0, #256] +mul v27.4S, v27.4S,v4.s[0] +sub v2.4s, v28.4s, v13.4s +mul v25.4S, v25.4S,v4.s[0] +add v28.4s, v28.4s, v13.4s +ldr q13, [x0, #400] +ldr q16, [x0, #272] +mla v27.4S, v1.4S, v31.s[0] +sub v1.4s, v16.4s, v11.4s +mla v25.4S, v10.4S, v31.s[0] +add v16.4s, v16.4s, v11.4s +ldr q11, [x0, #448] +ldr q10, [x0, #320] +mul v14.4S, v14.4S,v26.s[0] +sub v0.4s, v10.4s, v18.4s +mul v12.4S, v12.4S,v26.s[0] +add v10.4s, v10.4s, v18.4s +ldr q18, [x0, #464] +ldr q3, [x0, #336] +mla v14.4S, v24.4S, v31.s[0] +sub v24.4s, v3.4s, v22.4s +mla v12.4S, v15.4S, v31.s[0] +add v3.4s, v3.4s, v22.4s +sqrdmulh v22.4S, v16.4S, v6.s[1] +mul v16.4S, v16.4S,v5.s[1] +sqrdmulh v15.4S, v1.4S, v6.s[2] +sub v20.4s, v21.4s, v27.4s +mul v1.4S, v1.4S,v5.s[2] +add v21.4s, v21.4s, v27.4s +sqrdmulh v6.4S, v3.4S, v30.s[1] +sub v5.4s, v13.4s, v25.4s +mul v3.4S, v3.4S,v19.s[1] +add v13.4s, v13.4s, v25.4s +sqrdmulh v25.4S, v24.4S, v30.s[2] +sub v27.4s, v11.4s, v14.4s +mul v24.4S, v24.4S,v19.s[2] +add v11.4s, v11.4s, v14.4s +mla v16.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v12.4s +sqrdmulh v30.4S, 
v13.4S, v29.s[1] +add v18.4s, v18.4s, v12.4s +mla v1.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v5.4S, v29.s[2] +mla v3.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v18.4S, v23.s[1] +mla v24.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v22.4S, v23.s[2] +mul v13.4S, v13.4S,v4.s[1] +sub v12.4s, v28.4s, v16.4s +mul v5.4S, v5.4S,v4.s[2] +add v28.4s, v28.4s, v16.4s +str q12, [x0, #272] +str q28, [x0, #256] +mla v13.4S, v30.4S, v31.s[0] +sub v30.4s, v2.4s, v1.4s +mla v5.4S, v15.4S, v31.s[0] +add v2.4s, v2.4s, v1.4s +str q30, [x0, #304] +str q2, [x0, #288] +mul v18.4S, v18.4S,v26.s[1] +sub v29.4s, v10.4s, v3.4s +mul v22.4S, v22.4S,v26.s[2] +add v10.4s, v10.4s, v3.4s +str q29, [x0, #336] +str q10, [x0, #320] +mla v18.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v24.4s +mla v22.4S, v25.4S, v31.s[0] +add v0.4s, v0.4s, v24.4s +str q6, [x0, #368] +str q0, [x0, #352] +ldr q23, [x17, #+384] +ldr q26, [x17, #+400] +ldr q0, [x0, #544] +sqrdmulh v6.4S, v0.4S, v26.s[0] +sub v24.4s, v21.4s, v13.4s +str q24, [x0, #400] +mul v0.4S, v0.4S,v23.s[0] +add v21.4s, v21.4s, v13.4s +str q21, [x0, #384] +ldr q21, [x0, #560] +sqrdmulh v13.4S, v21.4S, v26.s[0] +sub v24.4s, v20.4s, v5.4s +mul v21.4S, v21.4S,v23.s[0] +add v20.4s, v20.4s, v5.4s +str q24, [x0, #432] +str q20, [x0, #416] +ldr q20, [x17, #+416] +ldr q24, [x17, #+432] +ldr q5, [x0, #608] +sqrdmulh v25.4S, v5.4S, v24.s[0] +sub v10.4s, v11.4s, v18.4s +mul v5.4S, v5.4S,v20.s[0] +add v11.4s, v11.4s, v18.4s +str q10, [x0, #464] +str q11, [x0, #448] +ldr q11, [x0, #624] +sqrdmulh v10.4S, v11.4S, v24.s[0] +sub v18.4s, v27.4s, v22.4s +mul v11.4S, v11.4S,v20.s[0] +add v27.4s, v27.4s, v22.4s +str q18, [x0, #496] +str q27, [x0, #480] +ldr q27, [x0, #672] +ldr q18, [x17, #+448] +ldr q22, [x17, #+464] +mla v0.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v27.4S, v22.s[0] +ldr q29, [x0, #688] +mla v21.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v29.4S, v22.s[0] +ldr q3, [x0, #736] +ldr q4, [x17, #+480] +ldr q2, [x17, #+496] +mla v5.4S, v25.4S, v31.s[0] +sqrdmulh v25.4S, v3.4S, v2.s[0] 
+ldr q30, [x0, #752] +mla v11.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v30.4S, v2.s[0] +ldr q1, [x0, #640] +ldr q15, [x0, #512] +mul v27.4S, v27.4S,v18.s[0] +sub v28.4s, v15.4s, v0.4s +mul v29.4S, v29.4S,v18.s[0] +add v15.4s, v15.4s, v0.4s +ldr q0, [x0, #656] +ldr q12, [x0, #528] +mla v27.4S, v6.4S, v31.s[0] +sub v6.4s, v12.4s, v21.4s +mla v29.4S, v13.4S, v31.s[0] +add v12.4s, v12.4s, v21.4s +ldr q21, [x0, #704] +ldr q13, [x0, #576] +mul v3.4S, v3.4S,v4.s[0] +sub v16.4s, v13.4s, v5.4s +mul v30.4S, v30.4S,v4.s[0] +add v13.4s, v13.4s, v5.4s +ldr q5, [x0, #720] +ldr q19, [x0, #592] +mla v3.4S, v25.4S, v31.s[0] +sub v25.4s, v19.4s, v11.4s +mla v30.4S, v10.4S, v31.s[0] +add v19.4s, v19.4s, v11.4s +sqrdmulh v11.4S, v12.4S, v26.s[1] +mul v12.4S, v12.4S,v23.s[1] +sqrdmulh v10.4S, v6.4S, v26.s[2] +sub v14.4s, v1.4s, v27.4s +mul v6.4S, v6.4S,v23.s[2] +add v1.4s, v1.4s, v27.4s +sqrdmulh v26.4S, v19.4S, v24.s[1] +sub v23.4s, v0.4s, v29.4s +mul v19.4S, v19.4S,v20.s[1] +add v0.4s, v0.4s, v29.4s +sqrdmulh v29.4S, v25.4S, v24.s[2] +sub v27.4s, v21.4s, v3.4s +mul v25.4S, v25.4S,v20.s[2] +add v21.4s, v21.4s, v3.4s +mla v12.4S, v11.4S, v31.s[0] +sub v11.4s, v5.4s, v30.4s +sqrdmulh v24.4S, v0.4S, v22.s[1] +add v5.4s, v5.4s, v30.4s +mla v6.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v23.4S, v22.s[2] +mla v19.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v5.4S, v2.s[1] +mla v25.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v11.4S, v2.s[2] +mul v0.4S, v0.4S,v18.s[1] +sub v30.4s, v15.4s, v12.4s +mul v23.4S, v23.4S,v18.s[2] +add v15.4s, v15.4s, v12.4s +str q30, [x0, #528] +str q15, [x0, #512] +mla v0.4S, v24.4S, v31.s[0] +sub v24.4s, v28.4s, v6.4s +mla v23.4S, v10.4S, v31.s[0] +add v28.4s, v28.4s, v6.4s +str q24, [x0, #560] +str q28, [x0, #544] +mul v5.4S, v5.4S,v4.s[1] +sub v22.4s, v13.4s, v19.4s +mul v11.4S, v11.4S,v4.s[2] +add v13.4s, v13.4s, v19.4s +str q22, [x0, #592] +str q13, [x0, #576] +mla v5.4S, v26.4S, v31.s[0] +sub v26.4s, v16.4s, v25.4s +mla v11.4S, v29.4S, v31.s[0] +add v16.4s, v16.4s, v25.4s 
+str q26, [x0, #624] +str q16, [x0, #608] +ldr q2, [x17, #+512] +ldr q4, [x17, #+528] +ldr q16, [x0, #800] +sqrdmulh v26.4S, v16.4S, v4.s[0] +sub v25.4s, v1.4s, v0.4s +str q25, [x0, #656] +mul v16.4S, v16.4S,v2.s[0] +add v1.4s, v1.4s, v0.4s +str q1, [x0, #640] +ldr q1, [x0, #816] +sqrdmulh v0.4S, v1.4S, v4.s[0] +sub v25.4s, v14.4s, v23.4s +mul v1.4S, v1.4S,v2.s[0] +add v14.4s, v14.4s, v23.4s +str q25, [x0, #688] +str q14, [x0, #672] +ldr q14, [x17, #+544] +ldr q25, [x17, #+560] +ldr q23, [x0, #864] +sqrdmulh v29.4S, v23.4S, v25.s[0] +sub v13.4s, v21.4s, v5.4s +mul v23.4S, v23.4S,v14.s[0] +add v21.4s, v21.4s, v5.4s +str q13, [x0, #720] +str q21, [x0, #704] +ldr q21, [x0, #880] +sqrdmulh v13.4S, v21.4S, v25.s[0] +sub v5.4s, v27.4s, v11.4s +mul v21.4S, v21.4S,v14.s[0] +add v27.4s, v27.4s, v11.4s +str q5, [x0, #752] +str q27, [x0, #736] +ldr q27, [x0, #928] +ldr q5, [x17, #+576] +ldr q11, [x17, #+592] +mla v16.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v27.4S, v11.s[0] +ldr q22, [x0, #944] +mla v1.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v22.4S, v11.s[0] +ldr q19, [x0, #992] +ldr q18, [x17, #+608] +ldr q28, [x17, #+624] +mla v23.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v19.4S, v28.s[0] +ldr q24, [x0, #1008] +mla v21.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v24.4S, v28.s[0] +ldr q6, [x0, #896] +ldr q10, [x0, #768] +mul v27.4S, v27.4S,v5.s[0] +sub v15.4s, v10.4s, v16.4s +mul v22.4S, v22.4S,v5.s[0] +add v10.4s, v10.4s, v16.4s +ldr q16, [x0, #912] +ldr q30, [x0, #784] +mla v27.4S, v26.4S, v31.s[0] +sub v26.4s, v30.4s, v1.4s +mla v22.4S, v0.4S, v31.s[0] +add v30.4s, v30.4s, v1.4s +ldr q1, [x0, #960] +ldr q0, [x0, #832] +mul v19.4S, v19.4S,v18.s[0] +sub v12.4s, v0.4s, v23.4s +mul v24.4S, v24.4S,v18.s[0] +add v0.4s, v0.4s, v23.4s +ldr q23, [x0, #976] +ldr q20, [x0, #848] +mla v19.4S, v29.4S, v31.s[0] +sub v29.4s, v20.4s, v21.4s +mla v24.4S, v13.4S, v31.s[0] +add v20.4s, v20.4s, v21.4s +sqrdmulh v21.4S, v30.4S, v4.s[1] +mul v30.4S, v30.4S,v2.s[1] +sqrdmulh v13.4S, v26.4S, v4.s[2] +sub 
v3.4s, v6.4s, v27.4s +mul v26.4S, v26.4S,v2.s[2] +add v6.4s, v6.4s, v27.4s +sqrdmulh v4.4S, v20.4S, v25.s[1] +sub v2.4s, v16.4s, v22.4s +mul v20.4S, v20.4S,v14.s[1] +add v16.4s, v16.4s, v22.4s +sqrdmulh v22.4S, v29.4S, v25.s[2] +sub v27.4s, v1.4s, v19.4s +mul v29.4S, v29.4S,v14.s[2] +add v1.4s, v1.4s, v19.4s +mla v30.4S, v21.4S, v31.s[0] +sub v21.4s, v23.4s, v24.4s +sqrdmulh v25.4S, v16.4S, v11.s[1] +add v23.4s, v23.4s, v24.4s +mla v26.4S, v13.4S, v31.s[0] +sqrdmulh v13.4S, v2.4S, v11.s[2] +mla v20.4S, v4.4S, v31.s[0] +sqrdmulh v4.4S, v23.4S, v28.s[1] +mla v29.4S, v22.4S, v31.s[0] +sqrdmulh v22.4S, v21.4S, v28.s[2] +mul v16.4S, v16.4S,v5.s[1] +sub v24.4s, v10.4s, v30.4s +mul v2.4S, v2.4S,v5.s[2] +add v10.4s, v10.4s, v30.4s +str q24, [x0, #784] +str q10, [x0, #768] +mla v16.4S, v25.4S, v31.s[0] +sub v25.4s, v15.4s, v26.4s +mla v2.4S, v13.4S, v31.s[0] +add v15.4s, v15.4s, v26.4s +str q25, [x0, #816] +str q15, [x0, #800] +mul v23.4S, v23.4S,v18.s[1] +sub v11.4s, v0.4s, v20.4s +mul v21.4S, v21.4S,v18.s[2] +add v0.4s, v0.4s, v20.4s +str q11, [x0, #848] +str q0, [x0, #832] +mla v23.4S, v4.4S, v31.s[0] +sub v4.4s, v12.4s, v29.4s +mla v21.4S, v22.4S, v31.s[0] +add v12.4s, v12.4s, v29.4s +str q4, [x0, #880] +str q12, [x0, #864] +sub v28.4s, v6.4s, v16.4s +str q28, [x0, #912] +add v6.4s, v6.4s, v16.4s +str q6, [x0, #896] +sub v6.4s, v3.4s, v2.4s +add v3.4s, v3.4s, v2.4s +str q6, [x0, #944] +str q3, [x0, #928] +sub v3.4s, v1.4s, v23.4s +add v1.4s, v1.4s, v23.4s +str q3, [x0, #976] +str q1, [x0, #960] +sub v1.4s, v27.4s, v21.4s +add v27.4s, v27.4s, v21.4s +str q1, [x0, #1008] +str q27, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 
+// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_5.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_5.s new file mode 100644 index 0000000..2182562 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_5.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global 
ntt_u32_incomplete_neon_asm_var_4_2_7_z4_5 +.global _ntt_u32_incomplete_neon_asm_var_4_2_7_z4_5 +ntt_u32_incomplete_neon_asm_var_4_2_7_z4_5: +_ntt_u32_incomplete_neon_asm_var_4_2_7_z4_5: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #928] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #992] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q18, [x0, #800] +sqrdmulh v17.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +ldr q16, [x0, #864] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v22.4S, v21.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +mla v18.4S, v17.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +ldr q3, [x0, #544] +sqrdmulh v17.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q19, [x0, #608] +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q2, [x0, #672] +ldr q1, [x0, #416] +sqrdmulh v0.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v15.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #736] +ldr q14, [x0, #480] +sqrdmulh v13.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v12.4s, v14.4s, v20.4s +add v14.4s, v14.4s, v20.4s +ldr q20, [x0, #288] +mla v3.4S, v17.4S, v31.s[0] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v18.4s +mla v2.4S, v0.4S, v31.s[0] +mla v22.4S, v13.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +ldr q18, [x0, #352] +sqrdmulh v13.4S, v1.4S, 
v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v0.4s, v18.4s, v16.4s +sqrdmulh v17.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +add v18.4s, v18.4s, v16.4s +ldr q16, [x0, #32] +sqrdmulh v11.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v10.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #96] +sqrdmulh v9.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v8.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #160] +mla v1.4S, v13.4S, v31.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v2.4s +mla v20.4S, v11.4S, v31.s[0] +mla v18.4S, v9.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +ldr q2, [x0, #224] +sqrdmulh v9.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v11.4s, v2.4s, v22.4s +sqrdmulh v13.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v7.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +sub v6.4s, v2.4s, v14.4s +add v2.4s, v2.4s, v14.4s +mla v15.4S, v9.4S, v31.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v20.4s +mla v21.4S, v22.4S, v31.s[0] +mla v0.4S, v1.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +sub v1.4s, v3.4s, v18.4s +sqrdmulh v22.4S, v6.4S, v27.s[1] +mul v6.4S, v6.4S,v28.s[1] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v27.s[0] +mul v19.4S, v19.4S,v28.s[0] +sub v9.4s, v17.4s, v15.4s +add v17.4s, v17.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v14.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +mla v7.4S, v20.4S, v31.s[0] +mla v6.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v21.4s +mla v19.4S, v18.4S, v31.s[0] +mla v2.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v15.4s, v8.4s, v0.4s +sqrdmulh v18.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +add v8.4s, v8.4s, v0.4s +sqrdmulh 
v0.4S, v9.4S, v27.s[3] +mul v9.4S, v9.4S,v28.s[3] +sub v20.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[3] +mul v14.4S, v14.4S,v28.s[3] +sub v12.4s, v1.4s, v6.4s +add v1.4s, v1.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v16.4s, v19.4s +mla v9.4S, v0.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v25.s[2] +mul v1.4S, v1.4S,v26.s[2] +sub v7.4s, v3.4s, v2.4s +sqrdmulh v0.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v7.4S, v25.s[1] +mul v7.4S, v7.4S,v26.s[1] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v25.s[0] +mul v3.4S, v3.4S,v26.s[0] +sub v6.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v1.4S, v19.4S, v31.s[0] +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v9.4s +mla v7.4S, v2.4S, v31.s[0] +mla v3.4S, v17.4S, v31.s[0] +add v22.4s, v22.4s, v9.4s +sqrdmulh v9.4S, v8.4S, v23.s[0] +mul v8.4S, v8.4S,v24.s[0] +sub v17.4s, v15.4s, v14.4s +sqrdmulh v2.4S, v6.4S, v23.s[1] +mul v6.4S, v6.4S,v24.s[1] +add v15.4s, v15.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v23.s[2] +mul v15.4S, v15.4S,v24.s[2] +sub v19.4s, v13.4s, v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +sub v11.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +mla v8.4S, v9.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v18.4s, v7.4s +str q13, [x0, #288] +mla v15.4S, v14.4S, v31.s[0] +mla v17.4S, v1.4S, v31.s[0] +add v18.4s, v18.4s, v7.4s +str q19, [x0, #352] +ldr q19, [x0, #944] +sqrdmulh v7.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v1.4s, v16.4s, v3.4s +str q20, [x0, #416] +ldr q20, [x0, #1008] +sqrdmulh v14.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v3.4s +str q11, [x0, #480] +ldr q11, [x0, #816] +sqrdmulh v3.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +sub v13.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +ldr q8, 
[x0, #880] +sqrdmulh v9.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v12.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +mla v19.4S, v7.4S, v31.s[0] +mla v20.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v15.4s +str q18, [x0, #160] +mla v11.4S, v3.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +str q2, [x0, #224] +ldr q2, [x0, #560] +sqrdmulh v15.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v9.4s, v0.4s, v17.4s +str q16, [x0, #32] +ldr q16, [x0, #624] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +add v0.4s, v0.4s, v17.4s +str q1, [x0, #96] +ldr q1, [x0, #688] +ldr q17, [x0, #432] +sqrdmulh v18.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +sub v7.4s, v17.4s, v19.4s +add v17.4s, v17.4s, v19.4s +ldr q19, [x0, #752] +ldr q6, [x0, #496] +sqrdmulh v5.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v4.4s, v6.4s, v20.4s +add v6.4s, v6.4s, v20.4s +ldr q20, [x0, #304] +mla v2.4S, v15.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v11.4s +str q10, [x0, #544] +mla v1.4S, v18.4S, v31.s[0] +mla v19.4S, v5.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q13, [x0, #608] +ldr q13, [x0, #368] +sqrdmulh v11.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v5.4s, v13.4s, v8.4s +str q21, [x0, #672] +sqrdmulh v21.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +add v13.4s, v13.4s, v8.4s +str q12, [x0, #736] +ldr q12, [x0, #48] +sqrdmulh v8.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v18.4s, v12.4s, v2.4s +add v12.4s, v12.4s, v2.4s +ldr q2, [x0, #112] +sqrdmulh v10.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v15.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +ldr q16, [x0, #176] +mla v17.4S, v11.4S, v31.s[0] +mla v6.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v1.4s +str q22, [x0, #800] +mla v20.4S, v8.4S, v31.s[0] +mla v13.4S, v10.4S, v31.s[0] +add v16.4s, v16.4s, v1.4s +str q14, [x0, #864] +ldr q14, [x0, #240] +sqrdmulh v1.4S, v7.4S, v29.s[2] +mul v7.4S, v7.4S,v30.s[2] +sub v10.4s, v14.4s, v19.4s 
+str q0, [x0, #928] +sqrdmulh v0.4S, v4.4S, v29.s[2] +mul v4.4S, v4.4S,v30.s[2] +add v14.4s, v14.4s, v19.4s +str q9, [x0, #992] +sqrdmulh v9.4S, v3.4S, v29.s[2] +mul v3.4S, v3.4S,v30.s[2] +sub v19.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v29.s[2] +mul v5.4S, v5.4S,v30.s[2] +sub v8.4s, v14.4s, v6.4s +add v14.4s, v14.4s, v6.4s +mla v7.4S, v1.4S, v31.s[0] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v20.4s +mla v3.4S, v9.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +sub v17.4s, v2.4s, v13.4s +sqrdmulh v9.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +add v2.4s, v2.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v27.s[0] +mul v16.4S, v16.4S,v28.s[0] +sub v1.4s, v21.4s, v7.4s +add v21.4s, v21.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v6.4s, v10.4s, v4.4s +add v10.4s, v10.4s, v4.4s +mla v19.4S, v20.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v3.4s +mla v16.4S, v13.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +sub v7.4s, v15.4s, v5.4s +sqrdmulh v13.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +add v15.4s, v15.4s, v5.4s +sqrdmulh v5.4S, v1.4S, v27.s[3] +mul v1.4S, v1.4S,v28.s[3] +sub v20.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[3] +mul v6.4S, v6.4S,v28.s[3] +sub v4.4s, v17.4s, v8.4s +add v17.4s, v17.4s, v8.4s +mla v21.4S, v3.4S, v31.s[0] +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v16.4s +mla v1.4S, v5.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v25.s[2] +mul v17.4S, v17.4S,v26.s[2] +sub v19.4s, v2.4s, v14.4s +sqrdmulh v5.4S, v4.4S, v25.s[3] +mul v4.4S, v4.4S,v26.s[3] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v3.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, 
v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +mla v17.4S, v16.4S, v31.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v9.4s, v1.4s +mla v19.4S, v14.4S, v31.s[0] +mla v2.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v23.s[0] +mul v15.4S, v15.4S,v24.s[0] +sub v21.4s, v7.4s, v6.4s +sqrdmulh v14.4S, v8.4S, v23.s[1] +mul v8.4S, v8.4S,v24.s[1] +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v7.4S, v23.s[2] +mul v7.4S, v7.4S,v24.s[2] +sub v16.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v23.s[3] +mul v21.4S, v21.4S,v24.s[3] +sub v10.4s, v20.4s, v4.4s +add v20.4s, v20.4s, v4.4s +mla v15.4S, v1.4S, v31.s[0] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v19.4s +str q0, [x0, #304] +mla v7.4S, v6.4S, v31.s[0] +mla v21.4S, v17.4S, v31.s[0] +add v13.4s, v13.4s, v19.4s +str q16, [x0, #368] +ldr q16, [x0, #896] +sqrdmulh v19.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v17.4s, v12.4s, v2.4s +str q20, [x0, #432] +ldr q20, [x0, #960] +sqrdmulh v6.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v12.4s, v12.4s, v2.4s +str q10, [x0, #496] +ldr q10, [x0, #768] +sqrdmulh v2.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +sub v0.4s, v18.4s, v15.4s +add v18.4s, v18.4s, v15.4s +ldr q15, [x0, #832] +sqrdmulh v1.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +sub v4.4s, v3.4s, v8.4s +add v3.4s, v3.4s, v8.4s +mla v16.4S, v19.4S, v31.s[0] +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v7.4s +str q13, [x0, #176] +mla v10.4S, v2.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v7.4s +str q14, [x0, #240] +ldr q14, [x0, #512] +sqrdmulh v7.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v1.4s, v5.4s, v21.4s +str q12, [x0, #48] +ldr q12, [x0, #576] +sqrdmulh v2.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +add v5.4s, v5.4s, v21.4s +str q17, [x0, #112] +ldr q17, [x0, #640] +ldr q21, [x0, #384] +sqrdmulh v13.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] 
+sub v19.4s, v21.4s, v16.4s +add v21.4s, v21.4s, v16.4s +ldr q16, [x0, #704] +ldr q8, [x0, #448] +sqrdmulh v22.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v11.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +ldr q20, [x0, #256] +mla v14.4S, v7.4S, v31.s[0] +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v10.4s +str q18, [x0, #560] +mla v17.4S, v13.4S, v31.s[0] +mla v16.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +str q0, [x0, #624] +ldr q0, [x0, #320] +sqrdmulh v10.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v22.4s, v0.4s, v15.4s +str q3, [x0, #688] +sqrdmulh v3.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +add v0.4s, v0.4s, v15.4s +str q4, [x0, #752] +ldr q4, [x0, #0] +sqrdmulh v15.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v13.4s, v4.4s, v14.4s +add v4.4s, v4.4s, v14.4s +ldr q14, [x0, #64] +sqrdmulh v18.4S, v0.4S, v29.s[1] +mul v0.4S, v0.4S,v30.s[1] +sub v7.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +ldr q12, [x0, #128] +mla v21.4S, v10.4S, v31.s[0] +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v12.4s, v17.4s +str q9, [x0, #816] +mla v20.4S, v15.4S, v31.s[0] +mla v0.4S, v18.4S, v31.s[0] +add v12.4s, v12.4s, v17.4s +str q6, [x0, #880] +ldr q6, [x0, #192] +sqrdmulh v17.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v18.4s, v6.4s, v16.4s +str q5, [x0, #944] +sqrdmulh v5.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +add v6.4s, v6.4s, v16.4s +str q1, [x0, #1008] +sqrdmulh v1.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v16.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v15.4s, v6.4s, v8.4s +add v6.4s, v6.4s, v8.4s +mla v19.4S, v17.4S, v31.s[0] +mla v11.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v20.4s +mla v2.4S, v1.4S, v31.s[0] +mla v22.4S, v21.4S, v31.s[0] +add v4.4s, v4.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +sub v21.4s, v14.4s, v0.4s +sqrdmulh v1.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +add v14.4s, 
v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v17.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[0] +mul v6.4S, v6.4S,v28.s[0] +sub v8.4s, v18.4s, v11.4s +add v18.4s, v18.4s, v11.4s +mla v16.4S, v20.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v2.4s +mla v12.4S, v0.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v27.s[2] +mul v3.4S, v3.4S,v28.s[2] +sub v19.4s, v7.4s, v22.4s +sqrdmulh v0.4S, v18.4S, v27.s[2] +mul v18.4S, v18.4S,v28.s[2] +add v7.4s, v7.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +sub v20.4s, v5.4s, v16.4s +add v5.4s, v5.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v11.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +mla v3.4S, v2.4S, v31.s[0] +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v4.4s, v12.4s +mla v17.4S, v22.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v4.4s, v4.4s, v12.4s +sqrdmulh v12.4S, v21.4S, v25.s[2] +mul v21.4S, v21.4S,v26.s[2] +sub v16.4s, v14.4s, v6.4s +sqrdmulh v22.4S, v11.4S, v25.s[3] +mul v11.4S, v11.4S,v26.s[3] +add v14.4s, v14.4s, v6.4s +sqrdmulh v6.4S, v16.4S, v25.s[1] +mul v16.4S, v16.4S,v26.s[1] +sub v2.4s, v13.4s, v3.4s +add v13.4s, v13.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v25.s[0] +mul v14.4S, v14.4S,v26.s[0] +sub v15.4s, v7.4s, v18.4s +add v7.4s, v7.4s, v18.4s +mla v21.4S, v12.4S, v31.s[0] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v17.4s +mla v16.4S, v6.4S, v31.s[0] +mla v14.4S, v3.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v7.4S, v23.s[0] +mul v7.4S, v7.4S,v24.s[0] +sub v3.4s, v19.4s, v8.4s +sqrdmulh v6.4S, v15.4S, v23.s[1] +mul v15.4S, v15.4S,v24.s[1] +add v19.4s, v19.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v23.s[2] +mul v19.4S, v19.4S,v24.s[2] +sub v12.4s, v5.4s, v21.4s +add v5.4s, v5.4s, v21.4s +sqrdmulh v21.4S, v3.4S, v23.s[3] +mul v3.4S, v3.4S,v24.s[3] +sub v18.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +mla 
v7.4S, v17.4S, v31.s[0] +mla v15.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v16.4s +str q5, [x0, #256] +mla v19.4S, v8.4S, v31.s[0] +mla v3.4S, v21.4S, v31.s[0] +add v0.4s, v0.4s, v16.4s +str q12, [x0, #320] +ldr q12, [x0, #912] +sqrdmulh v16.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v21.4s, v4.4s, v14.4s +str q20, [x0, #384] +ldr q20, [x0, #976] +sqrdmulh v8.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v4.4s, v4.4s, v14.4s +str q18, [x0, #448] +ldr q18, [x0, #784] +sqrdmulh v14.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +sub v5.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +ldr q7, [x0, #848] +sqrdmulh v17.4S, v7.4S, v29.s[0] +mul v7.4S, v7.4S,v30.s[0] +sub v11.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +mla v12.4S, v16.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v1.4s, v19.4s +str q0, [x0, #128] +mla v18.4S, v14.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v19.4s +str q6, [x0, #192] +ldr q6, [x0, #528] +sqrdmulh v19.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v17.4s, v22.4s, v3.4s +str q4, [x0, #0] +ldr q4, [x0, #592] +sqrdmulh v14.4S, v4.4S, v29.s[0] +mul v4.4S, v4.4S,v30.s[0] +add v22.4s, v22.4s, v3.4s +str q21, [x0, #64] +ldr q21, [x0, #656] +ldr q3, [x0, #400] +sqrdmulh v0.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v16.4s, v3.4s, v12.4s +add v3.4s, v3.4s, v12.4s +ldr q12, [x0, #720] +ldr q15, [x0, #464] +sqrdmulh v9.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v10.4s, v15.4s, v20.4s +add v15.4s, v15.4s, v20.4s +ldr q20, [x0, #272] +mla v6.4S, v19.4S, v31.s[0] +mla v4.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v18.4s +str q13, [x0, #512] +mla v21.4S, v0.4S, v31.s[0] +mla v12.4S, v9.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +str q5, [x0, #576] +ldr q5, [x0, #336] +sqrdmulh v18.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v9.4s, v5.4s, v7.4s +str q2, [x0, #640] +sqrdmulh v2.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +add v5.4s, v5.4s, v7.4s +str q11, [x0, #704] +ldr 
q11, [x0, #16] +sqrdmulh v7.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v0.4s, v11.4s, v6.4s +add v11.4s, v11.4s, v6.4s +ldr q6, [x0, #80] +sqrdmulh v13.4S, v5.4S, v29.s[1] +mul v5.4S, v5.4S,v30.s[1] +sub v19.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +ldr q4, [x0, #144] +mla v3.4S, v18.4S, v31.s[0] +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v4.4s, v21.4s +str q1, [x0, #768] +mla v20.4S, v7.4S, v31.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v4.4s, v4.4s, v21.4s +str q8, [x0, #832] +ldr q8, [x0, #208] +sqrdmulh v21.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +sub v13.4s, v8.4s, v12.4s +str q22, [x0, #896] +sqrdmulh v22.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +add v8.4s, v8.4s, v12.4s +str q17, [x0, #960] +sqrdmulh v17.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v12.4s, v4.4s, v3.4s +add v4.4s, v4.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v7.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +mla v16.4S, v21.4S, v31.s[0] +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v20.4s +mla v14.4S, v17.4S, v31.s[0] +mla v9.4S, v3.4S, v31.s[0] +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v12.4S, v27.s[1] +mul v12.4S, v12.4S,v28.s[1] +sub v3.4s, v6.4s, v5.4s +sqrdmulh v17.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v27.s[0] +mul v4.4S, v4.4S,v28.s[0] +sub v21.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[0] +mul v8.4S, v8.4S,v28.s[0] +sub v15.4s, v13.4s, v10.4s +add v13.4s, v13.4s, v10.4s +mla v12.4S, v20.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v14.4s +mla v4.4S, v5.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v16.4s, v19.4s, v9.4s +sqrdmulh v5.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v22.4s, v12.4s +add v22.4s, v22.4s, 
v12.4s +sqrdmulh v12.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v10.4s, v3.4s, v7.4s +add v3.4s, v3.4s, v7.4s +mla v2.4S, v14.4S, v31.s[0] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v4.4s +mla v21.4S, v9.4S, v31.s[0] +mla v15.4S, v12.4S, v31.s[0] +add v11.4s, v11.4s, v4.4s +sqrdmulh v4.4S, v3.4S, v25.s[2] +mul v3.4S, v3.4S,v26.s[2] +sub v12.4s, v6.4s, v8.4s +sqrdmulh v9.4S, v10.4S, v25.s[3] +mul v10.4S, v10.4S,v26.s[3] +add v6.4s, v6.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +sub v14.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v6.4S, v25.s[0] +mul v6.4S, v6.4S,v26.s[0] +sub v7.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +mla v3.4S, v4.4S, v31.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v21.4s +mla v12.4S, v8.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v23.s[0] +mul v19.4S, v19.4S,v24.s[0] +sub v2.4s, v16.4s, v15.4s +sqrdmulh v8.4S, v7.4S, v23.s[1] +mul v7.4S, v7.4S,v24.s[1] +add v16.4s, v16.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +sub v4.4s, v22.4s, v3.4s +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v2.4S, v23.s[3] +mul v2.4S, v2.4S,v24.s[3] +sub v13.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +mla v19.4S, v21.4S, v31.s[0] +mla v7.4S, v8.4S, v31.s[0] +sub v8.4s, v5.4s, v12.4s +str q22, [x0, #272] +mla v16.4S, v15.4S, v31.s[0] +mla v2.4S, v3.4S, v31.s[0] +add v5.4s, v5.4s, v12.4s +str q4, [x0, #336] +sub v23.4s, v11.4s, v6.4s +str q20, [x0, #400] +add v11.4s, v11.4s, v6.4s +str q13, [x0, #464] +sub v13.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sub v19.4s, v14.4s, v7.4s +add v14.4s, v14.4s, v7.4s +sub v7.4s, v17.4s, v16.4s +str q5, [x0, #144] +add v17.4s, v17.4s, v16.4s +str q8, [x0, #208] +sub v8.4s, v9.4s, v2.4s +str q11, [x0, #16] +add v9.4s, v9.4s, v2.4s +str q23, [x0, #80] +str q0, [x0, #528] +str q13, [x0, #592] +str q14, [x0, #656] +str q19, [x0, #720] +str q17, [x0, #784] +str q7, [x0, #848] 
+str q9, [x0, #912] +str q8, [x0, #976] +ldr q18, [x0, #224] +ldr q1, [x0, #160] +ldr q10, [x0, #32] +ldr q21, [x17, #+128] +ldr q22, [x17, #+144] +sqrdmulh v15.4S, v10.4S, v22.s[0] +mul v10.4S, v10.4S,v21.s[0] +ldr q3, [x0, #48] +sqrdmulh v12.4S, v3.4S, v22.s[0] +mul v3.4S, v3.4S,v21.s[0] +ldr q4, [x17, #+160] +ldr q30, [x17, #+176] +ldr q29, [x0, #96] +sqrdmulh v28.4S, v29.4S, v30.s[0] +mul v29.4S, v29.4S,v4.s[0] +ldr q27, [x0, #112] +sqrdmulh v26.4S, v27.4S, v30.s[0] +mul v27.4S, v27.4S,v4.s[0] +ldr q25, [x17, #+192] +ldr q24, [x17, #+208] +mla v10.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v1.4S, v24.s[0] +ldr q20, [x0, #176] +mla v3.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v20.4S, v24.s[0] +ldr q6, [x17, #+224] +ldr q5, [x17, #+240] +mla v29.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v18.4S, v5.s[0] +ldr q16, [x0, #240] +mla v27.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v16.4S, v5.s[0] +ldr q11, [x0, #128] +ldr q2, [x0, #0] +mul v1.4S, v1.4S,v25.s[0] +sub v23.4s, v2.4s, v10.4s +mul v20.4S, v20.4S,v25.s[0] +add v2.4s, v2.4s, v10.4s +ldr q10, [x0, #144] +ldr q0, [x0, #16] +mla v1.4S, v15.4S, v31.s[0] +sub v15.4s, v0.4s, v3.4s +mla v20.4S, v12.4S, v31.s[0] +add v0.4s, v0.4s, v3.4s +ldr q3, [x0, #192] +ldr q12, [x0, #64] +mul v18.4S, v18.4S,v6.s[0] +sub v13.4s, v12.4s, v29.4s +mul v16.4S, v16.4S,v6.s[0] +add v12.4s, v12.4s, v29.4s +ldr q29, [x0, #208] +ldr q14, [x0, #80] +mla v18.4S, v28.4S, v31.s[0] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v14.4s, v27.4s +sqrdmulh v28.4S, v0.4S, v22.s[1] +mul v0.4S, v0.4S,v21.s[1] +add v14.4s, v14.4s, v27.4s +sqrdmulh v27.4S, v15.4S, v22.s[2] +sub v19.4s, v11.4s, v1.4s +mul v15.4S, v15.4S,v21.s[2] +add v11.4s, v11.4s, v1.4s +sqrdmulh v22.4S, v14.4S, v30.s[1] +sub v21.4s, v10.4s, v20.4s +mul v14.4S, v14.4S,v4.s[1] +add v10.4s, v10.4s, v20.4s +sqrdmulh v20.4S, v26.4S, v30.s[2] +sub v1.4s, v3.4s, v18.4s +mul v26.4S, v26.4S,v4.s[2] +add v3.4s, v3.4s, v18.4s +mla v0.4S, v28.4S, v31.s[0] +sub v28.4s, v29.4s, v16.4s +ldr q30, [x0, #480] +sqrdmulh 
v4.4S, v10.4S, v24.s[1] +add v29.4s, v29.4s, v16.4s +mla v15.4S, v27.4S, v31.s[0] +ldr q27, [x0, #416] +sqrdmulh v16.4S, v21.4S, v24.s[2] +mla v14.4S, v22.4S, v31.s[0] +ldr q22, [x0, #288] +sqrdmulh v18.4S, v29.4S, v5.s[1] +mla v26.4S, v20.4S, v31.s[0] +ldr q20, [x17, #+256] +sqrdmulh v17.4S, v28.4S, v5.s[2] +ldr q7, [x17, #+272] +mul v10.4S, v10.4S,v25.s[1] +sub v9.4s, v2.4s, v0.4s +str q9, [x0, #16] +mul v21.4S, v21.4S,v25.s[2] +add v2.4s, v2.4s, v0.4s +str q2, [x0, #0] +mla v10.4S, v4.4S, v31.s[0] +sub v4.4s, v23.4s, v15.4s +str q4, [x0, #48] +mla v21.4S, v16.4S, v31.s[0] +add v23.4s, v23.4s, v15.4s +str q23, [x0, #32] +mul v29.4S, v29.4S,v6.s[1] +sub v24.4s, v12.4s, v14.4s +str q24, [x0, #80] +mul v28.4S, v28.4S,v6.s[2] +add v12.4s, v12.4s, v14.4s +str q12, [x0, #64] +mla v29.4S, v18.4S, v31.s[0] +sub v18.4s, v13.4s, v26.4s +str q18, [x0, #112] +mla v28.4S, v17.4S, v31.s[0] +add v13.4s, v13.4s, v26.4s +str q13, [x0, #96] +sqrdmulh v5.4S, v22.4S, v7.s[0] +sub v6.4s, v11.4s, v10.4s +mul v22.4S, v22.4S,v20.s[0] +str q6, [x0, #144] +ldr q6, [x0, #304] +sqrdmulh v13.4S, v6.4S, v7.s[0] +add v11.4s, v11.4s, v10.4s +mul v6.4S, v6.4S,v20.s[0] +str q11, [x0, #128] +ldr q11, [x17, #+288] +ldr q10, [x17, #+304] +ldr q26, [x0, #352] +sqrdmulh v17.4S, v26.4S, v10.s[0] +sub v18.4s, v19.4s, v21.4s +mul v26.4S, v26.4S,v11.s[0] +str q18, [x0, #176] +ldr q18, [x0, #368] +sqrdmulh v12.4S, v18.4S, v10.s[0] +add v19.4s, v19.4s, v21.4s +mul v18.4S, v18.4S,v11.s[0] +str q19, [x0, #160] +ldr q19, [x17, #+320] +ldr q21, [x17, #+336] +mla v22.4S, v5.4S, v31.s[0] +sub v5.4s, v3.4s, v29.4s +sqrdmulh v14.4S, v27.4S, v21.s[0] +str q5, [x0, #208] +ldr q5, [x0, #432] +mla v6.4S, v13.4S, v31.s[0] +add v3.4s, v3.4s, v29.4s +sqrdmulh v29.4S, v5.4S, v21.s[0] +str q3, [x0, #192] +ldr q3, [x17, #+352] +ldr q13, [x17, #+368] +mla v26.4S, v17.4S, v31.s[0] +sub v17.4s, v1.4s, v28.4s +sqrdmulh v24.4S, v30.4S, v13.s[0] +str q17, [x0, #240] +ldr q17, [x0, #496] +mla v18.4S, v12.4S, v31.s[0] +add v1.4s, 
v1.4s, v28.4s +sqrdmulh v28.4S, v17.4S, v13.s[0] +str q1, [x0, #224] +ldr q1, [x0, #384] +ldr q12, [x0, #256] +mul v27.4S, v27.4S,v19.s[0] +sub v25.4s, v12.4s, v22.4s +mul v5.4S, v5.4S,v19.s[0] +add v12.4s, v12.4s, v22.4s +ldr q22, [x0, #400] +ldr q23, [x0, #272] +mla v27.4S, v14.4S, v31.s[0] +sub v14.4s, v23.4s, v6.4s +mla v5.4S, v29.4S, v31.s[0] +add v23.4s, v23.4s, v6.4s +ldr q6, [x0, #448] +ldr q29, [x0, #320] +mul v30.4S, v30.4S,v3.s[0] +sub v15.4s, v29.4s, v26.4s +mul v17.4S, v17.4S,v3.s[0] +add v29.4s, v29.4s, v26.4s +ldr q26, [x0, #464] +ldr q16, [x0, #336] +mla v30.4S, v24.4S, v31.s[0] +mla v17.4S, v28.4S, v31.s[0] +sub v28.4s, v16.4s, v18.4s +sqrdmulh v24.4S, v23.4S, v7.s[1] +mul v23.4S, v23.4S,v20.s[1] +add v16.4s, v16.4s, v18.4s +sqrdmulh v18.4S, v14.4S, v7.s[2] +sub v4.4s, v1.4s, v27.4s +mul v14.4S, v14.4S,v20.s[2] +add v1.4s, v1.4s, v27.4s +sqrdmulh v7.4S, v16.4S, v10.s[1] +sub v20.4s, v22.4s, v5.4s +mul v16.4S, v16.4S,v11.s[1] +add v22.4s, v22.4s, v5.4s +sqrdmulh v5.4S, v28.4S, v10.s[2] +sub v27.4s, v6.4s, v30.4s +mul v28.4S, v28.4S,v11.s[2] +add v6.4s, v6.4s, v30.4s +mla v23.4S, v24.4S, v31.s[0] +sub v24.4s, v26.4s, v17.4s +ldr q10, [x0, #736] +sqrdmulh v11.4S, v22.4S, v21.s[1] +add v26.4s, v26.4s, v17.4s +mla v14.4S, v18.4S, v31.s[0] +ldr q18, [x0, #672] +sqrdmulh v17.4S, v20.4S, v21.s[2] +mla v16.4S, v7.4S, v31.s[0] +ldr q7, [x0, #544] +sqrdmulh v30.4S, v26.4S, v13.s[1] +mla v28.4S, v5.4S, v31.s[0] +ldr q5, [x17, #+384] +sqrdmulh v2.4S, v24.4S, v13.s[2] +ldr q0, [x17, #+400] +mul v22.4S, v22.4S,v19.s[1] +sub v9.4s, v12.4s, v23.4s +str q9, [x0, #272] +mul v20.4S, v20.4S,v19.s[2] +add v12.4s, v12.4s, v23.4s +str q12, [x0, #256] +mla v22.4S, v11.4S, v31.s[0] +sub v11.4s, v25.4s, v14.4s +str q11, [x0, #304] +mla v20.4S, v17.4S, v31.s[0] +add v25.4s, v25.4s, v14.4s +str q25, [x0, #288] +mul v26.4S, v26.4S,v3.s[1] +sub v21.4s, v29.4s, v16.4s +str q21, [x0, #336] +mul v24.4S, v24.4S,v3.s[2] +add v29.4s, v29.4s, v16.4s +str q29, [x0, #320] +mla v26.4S, 
v30.4S, v31.s[0] +sub v30.4s, v15.4s, v28.4s +str q30, [x0, #368] +mla v24.4S, v2.4S, v31.s[0] +add v15.4s, v15.4s, v28.4s +str q15, [x0, #352] +sqrdmulh v13.4S, v7.4S, v0.s[0] +sub v3.4s, v1.4s, v22.4s +mul v7.4S, v7.4S,v5.s[0] +str q3, [x0, #400] +ldr q3, [x0, #560] +sqrdmulh v15.4S, v3.4S, v0.s[0] +add v1.4s, v1.4s, v22.4s +mul v3.4S, v3.4S,v5.s[0] +str q1, [x0, #384] +ldr q1, [x17, #+416] +ldr q22, [x17, #+432] +ldr q28, [x0, #608] +sqrdmulh v2.4S, v28.4S, v22.s[0] +sub v30.4s, v4.4s, v20.4s +mul v28.4S, v28.4S,v1.s[0] +str q30, [x0, #432] +ldr q30, [x0, #624] +sqrdmulh v29.4S, v30.4S, v22.s[0] +add v4.4s, v4.4s, v20.4s +mul v30.4S, v30.4S,v1.s[0] +str q4, [x0, #416] +ldr q4, [x17, #+448] +ldr q20, [x17, #+464] +mla v7.4S, v13.4S, v31.s[0] +sub v13.4s, v6.4s, v26.4s +sqrdmulh v16.4S, v18.4S, v20.s[0] +str q13, [x0, #464] +ldr q13, [x0, #688] +mla v3.4S, v15.4S, v31.s[0] +add v6.4s, v6.4s, v26.4s +sqrdmulh v26.4S, v13.4S, v20.s[0] +str q6, [x0, #448] +ldr q6, [x17, #+480] +ldr q15, [x17, #+496] +mla v28.4S, v2.4S, v31.s[0] +sub v2.4s, v27.4s, v24.4s +sqrdmulh v21.4S, v10.4S, v15.s[0] +str q2, [x0, #496] +ldr q2, [x0, #752] +mla v30.4S, v29.4S, v31.s[0] +add v27.4s, v27.4s, v24.4s +sqrdmulh v24.4S, v2.4S, v15.s[0] +str q27, [x0, #480] +ldr q27, [x0, #640] +ldr q29, [x0, #512] +mul v18.4S, v18.4S,v4.s[0] +sub v19.4s, v29.4s, v7.4s +mul v13.4S, v13.4S,v4.s[0] +add v29.4s, v29.4s, v7.4s +ldr q7, [x0, #656] +ldr q25, [x0, #528] +mla v18.4S, v16.4S, v31.s[0] +sub v16.4s, v25.4s, v3.4s +mla v13.4S, v26.4S, v31.s[0] +add v25.4s, v25.4s, v3.4s +ldr q3, [x0, #704] +ldr q26, [x0, #576] +mul v10.4S, v10.4S,v6.s[0] +sub v14.4s, v26.4s, v28.4s +mul v2.4S, v2.4S,v6.s[0] +add v26.4s, v26.4s, v28.4s +ldr q28, [x0, #720] +ldr q17, [x0, #592] +mla v10.4S, v21.4S, v31.s[0] +mla v2.4S, v24.4S, v31.s[0] +sub v24.4s, v17.4s, v30.4s +sqrdmulh v21.4S, v25.4S, v0.s[1] +mul v25.4S, v25.4S,v5.s[1] +add v17.4s, v17.4s, v30.4s +sqrdmulh v30.4S, v16.4S, v0.s[2] +sub v11.4s, v27.4s, v18.4s 
+mul v16.4S, v16.4S,v5.s[2] +add v27.4s, v27.4s, v18.4s +sqrdmulh v0.4S, v17.4S, v22.s[1] +sub v5.4s, v7.4s, v13.4s +mul v17.4S, v17.4S,v1.s[1] +add v7.4s, v7.4s, v13.4s +sqrdmulh v13.4S, v24.4S, v22.s[2] +sub v18.4s, v3.4s, v10.4s +mul v24.4S, v24.4S,v1.s[2] +add v3.4s, v3.4s, v10.4s +mla v25.4S, v21.4S, v31.s[0] +sub v21.4s, v28.4s, v2.4s +ldr q22, [x0, #992] +sqrdmulh v1.4S, v7.4S, v20.s[1] +add v28.4s, v28.4s, v2.4s +mla v16.4S, v30.4S, v31.s[0] +ldr q30, [x0, #928] +sqrdmulh v2.4S, v5.4S, v20.s[2] +mla v17.4S, v0.4S, v31.s[0] +ldr q0, [x0, #800] +sqrdmulh v10.4S, v28.4S, v15.s[1] +mla v24.4S, v13.4S, v31.s[0] +ldr q13, [x17, #+512] +sqrdmulh v12.4S, v21.4S, v15.s[2] +ldr q23, [x17, #+528] +mul v7.4S, v7.4S,v4.s[1] +sub v9.4s, v29.4s, v25.4s +str q9, [x0, #528] +mul v5.4S, v5.4S,v4.s[2] +add v29.4s, v29.4s, v25.4s +str q29, [x0, #512] +mla v7.4S, v1.4S, v31.s[0] +sub v1.4s, v19.4s, v16.4s +str q1, [x0, #560] +mla v5.4S, v2.4S, v31.s[0] +add v19.4s, v19.4s, v16.4s +str q19, [x0, #544] +mul v28.4S, v28.4S,v6.s[1] +sub v20.4s, v26.4s, v17.4s +str q20, [x0, #592] +mul v21.4S, v21.4S,v6.s[2] +add v26.4s, v26.4s, v17.4s +str q26, [x0, #576] +mla v28.4S, v10.4S, v31.s[0] +sub v10.4s, v14.4s, v24.4s +str q10, [x0, #624] +mla v21.4S, v12.4S, v31.s[0] +add v14.4s, v14.4s, v24.4s +str q14, [x0, #608] +sqrdmulh v15.4S, v0.4S, v23.s[0] +sub v6.4s, v27.4s, v7.4s +mul v0.4S, v0.4S,v13.s[0] +str q6, [x0, #656] +ldr q6, [x0, #816] +sqrdmulh v14.4S, v6.4S, v23.s[0] +add v27.4s, v27.4s, v7.4s +mul v6.4S, v6.4S,v13.s[0] +str q27, [x0, #640] +ldr q27, [x17, #+544] +ldr q7, [x17, #+560] +ldr q24, [x0, #864] +sqrdmulh v12.4S, v24.4S, v7.s[0] +sub v10.4s, v11.4s, v5.4s +mul v24.4S, v24.4S,v27.s[0] +str q10, [x0, #688] +ldr q10, [x0, #880] +sqrdmulh v26.4S, v10.4S, v7.s[0] +add v11.4s, v11.4s, v5.4s +mul v10.4S, v10.4S,v27.s[0] +str q11, [x0, #672] +ldr q11, [x17, #+576] +ldr q5, [x17, #+592] +mla v0.4S, v15.4S, v31.s[0] +sub v15.4s, v3.4s, v28.4s +sqrdmulh v17.4S, v30.4S, v5.s[0] +str 
q15, [x0, #720] +ldr q15, [x0, #944] +mla v6.4S, v14.4S, v31.s[0] +add v3.4s, v3.4s, v28.4s +sqrdmulh v28.4S, v15.4S, v5.s[0] +str q3, [x0, #704] +ldr q3, [x17, #+608] +ldr q14, [x17, #+624] +mla v24.4S, v12.4S, v31.s[0] +sub v12.4s, v18.4s, v21.4s +sqrdmulh v20.4S, v22.4S, v14.s[0] +str q12, [x0, #752] +ldr q12, [x0, #1008] +mla v10.4S, v26.4S, v31.s[0] +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v12.4S, v14.s[0] +str q18, [x0, #736] +ldr q18, [x0, #896] +ldr q26, [x0, #768] +mul v30.4S, v30.4S,v11.s[0] +sub v4.4s, v26.4s, v0.4s +mul v15.4S, v15.4S,v11.s[0] +add v26.4s, v26.4s, v0.4s +ldr q0, [x0, #912] +ldr q19, [x0, #784] +mla v30.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v6.4s +mla v15.4S, v28.4S, v31.s[0] +add v19.4s, v19.4s, v6.4s +ldr q6, [x0, #960] +ldr q28, [x0, #832] +mul v22.4S, v22.4S,v3.s[0] +sub v16.4s, v28.4s, v24.4s +mul v12.4S, v12.4S,v3.s[0] +add v28.4s, v28.4s, v24.4s +ldr q24, [x0, #976] +ldr q2, [x0, #848] +mla v22.4S, v20.4S, v31.s[0] +mla v12.4S, v21.4S, v31.s[0] +sub v21.4s, v2.4s, v10.4s +sqrdmulh v20.4S, v19.4S, v23.s[1] +mul v19.4S, v19.4S,v13.s[1] +add v2.4s, v2.4s, v10.4s +sqrdmulh v10.4S, v17.4S, v23.s[2] +sub v1.4s, v18.4s, v30.4s +mul v17.4S, v17.4S,v13.s[2] +add v18.4s, v18.4s, v30.4s +sqrdmulh v23.4S, v2.4S, v7.s[1] +sub v13.4s, v0.4s, v15.4s +mul v2.4S, v2.4S,v27.s[1] +add v0.4s, v0.4s, v15.4s +sqrdmulh v15.4S, v21.4S, v7.s[2] +sub v30.4s, v6.4s, v22.4s +mul v21.4S, v21.4S,v27.s[2] +add v6.4s, v6.4s, v22.4s +mla v19.4S, v20.4S, v31.s[0] +sub v20.4s, v24.4s, v12.4s +sqrdmulh v7.4S, v0.4S, v5.s[1] +add v24.4s, v24.4s, v12.4s +mla v17.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v13.4S, v5.s[2] +mla v2.4S, v23.4S, v31.s[0] +sqrdmulh v23.4S, v24.4S, v14.s[1] +mla v21.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v20.4S, v14.s[2] +mul v0.4S, v0.4S,v11.s[1] +sub v12.4s, v26.4s, v19.4s +str q12, [x0, #784] +mul v13.4S, v13.4S,v11.s[2] +add v26.4s, v26.4s, v19.4s +str q26, [x0, #768] +mla v0.4S, v7.4S, v31.s[0] +sub v7.4s, v4.4s, v17.4s +str q7, 
[x0, #816] +mla v13.4S, v10.4S, v31.s[0] +add v4.4s, v4.4s, v17.4s +str q4, [x0, #800] +mul v24.4S, v24.4S,v3.s[1] +sub v5.4s, v28.4s, v2.4s +str q5, [x0, #848] +mul v20.4S, v20.4S,v3.s[2] +add v28.4s, v28.4s, v2.4s +str q28, [x0, #832] +mla v24.4S, v23.4S, v31.s[0] +sub v23.4s, v16.4s, v21.4s +str q23, [x0, #880] +mla v20.4S, v15.4S, v31.s[0] +add v16.4s, v16.4s, v21.4s +str q16, [x0, #864] +sub v14.4s, v18.4s, v0.4s +str q14, [x0, #912] +add v18.4s, v18.4s, v0.4s +str q18, [x0, #896] +sub v18.4s, v1.4s, v13.4s +str q18, [x0, #944] +add v1.4s, v1.4s, v13.4s +str q1, [x0, #928] +sub v1.4s, v6.4s, v24.4s +str q1, [x0, #976] +add v6.4s, v6.4s, v24.4s +str q6, [x0, #960] +sub v6.4s, v30.4s, v20.4s +str q6, [x0, #1008] +add v30.4s, v30.4s, v20.4s +str q30, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_6.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_6.s new file mode 100644 index 0000000..d7c7c98 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_6.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to 
whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, 
block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 
6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 
// Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global ntt_u32_incomplete_neon_asm_var_4_2_7_z4_6 +.global _ntt_u32_incomplete_neon_asm_var_4_2_7_z4_6 +ntt_u32_incomplete_neon_asm_var_4_2_7_z4_6: +_ntt_u32_incomplete_neon_asm_var_4_2_7_z4_6: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #928] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #992] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q18, [x0, #800] +sqrdmulh v17.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +ldr q16, [x0, #864] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v22.4S, v21.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +mla 
v18.4S, v17.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +ldr q3, [x0, #544] +sqrdmulh v17.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q19, [x0, #608] +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q2, [x0, #672] +ldr q1, [x0, #416] +sqrdmulh v0.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v15.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #736] +ldr q14, [x0, #480] +sqrdmulh v13.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v12.4s, v14.4s, v20.4s +add v14.4s, v14.4s, v20.4s +ldr q20, [x0, #288] +mla v3.4S, v17.4S, v31.s[0] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v18.4s +mla v2.4S, v0.4S, v31.s[0] +mla v22.4S, v13.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +ldr q18, [x0, #352] +sqrdmulh v13.4S, v1.4S, v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v0.4s, v18.4s, v16.4s +sqrdmulh v17.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +add v18.4s, v18.4s, v16.4s +ldr q16, [x0, #32] +sqrdmulh v11.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v10.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #96] +sqrdmulh v9.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v8.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #160] +mla v1.4S, v13.4S, v31.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v2.4s +mla v20.4S, v11.4S, v31.s[0] +mla v18.4S, v9.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +ldr q2, [x0, #224] +sqrdmulh v9.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v11.4s, v2.4s, v22.4s +sqrdmulh v13.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v7.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +sub v6.4s, v2.4s, v14.4s +add v2.4s, v2.4s, v14.4s +mla v15.4S, v9.4S, v31.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v20.4s +mla v21.4S, v22.4S, v31.s[0] +mla v0.4S, v1.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s 
+sqrdmulh v20.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +sub v1.4s, v3.4s, v18.4s +sqrdmulh v22.4S, v6.4S, v27.s[1] +mul v6.4S, v6.4S,v28.s[1] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v27.s[0] +mul v19.4S, v19.4S,v28.s[0] +sub v9.4s, v17.4s, v15.4s +add v17.4s, v17.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v14.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +mla v7.4S, v20.4S, v31.s[0] +mla v6.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v21.4s +mla v19.4S, v18.4S, v31.s[0] +mla v2.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v15.4s, v8.4s, v0.4s +sqrdmulh v18.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +add v8.4s, v8.4s, v0.4s +sqrdmulh v0.4S, v9.4S, v27.s[3] +mul v9.4S, v9.4S,v28.s[3] +sub v20.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[3] +mul v14.4S, v14.4S,v28.s[3] +sub v12.4s, v1.4s, v6.4s +add v1.4s, v1.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v16.4s, v19.4s +mla v9.4S, v0.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v25.s[2] +mul v1.4S, v1.4S,v26.s[2] +sub v7.4s, v3.4s, v2.4s +sqrdmulh v0.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v7.4S, v25.s[1] +mul v7.4S, v7.4S,v26.s[1] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v25.s[0] +mul v3.4S, v3.4S,v26.s[0] +sub v6.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v1.4S, v19.4S, v31.s[0] +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v9.4s +mla v7.4S, v2.4S, v31.s[0] +mla v3.4S, v17.4S, v31.s[0] +add v22.4s, v22.4s, v9.4s +sqrdmulh v9.4S, v8.4S, v23.s[0] +mul v8.4S, v8.4S,v24.s[0] +sub v17.4s, v15.4s, v14.4s +sqrdmulh v2.4S, v6.4S, v23.s[1] +mul v6.4S, v6.4S,v24.s[1] +add v15.4s, v15.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v23.s[2] +mul v15.4S, v15.4S,v24.s[2] +sub v19.4s, v13.4s, 
v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +sub v11.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +mla v8.4S, v9.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v18.4s, v7.4s +str q13, [x0, #288] +mla v15.4S, v14.4S, v31.s[0] +mla v17.4S, v1.4S, v31.s[0] +add v18.4s, v18.4s, v7.4s +str q19, [x0, #352] +ldr q19, [x0, #944] +sqrdmulh v7.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v1.4s, v16.4s, v3.4s +str q20, [x0, #416] +ldr q20, [x0, #1008] +sqrdmulh v14.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v3.4s +str q11, [x0, #480] +ldr q11, [x0, #816] +sqrdmulh v3.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +sub v13.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +ldr q8, [x0, #880] +sqrdmulh v9.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v12.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +mla v19.4S, v7.4S, v31.s[0] +mla v20.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v15.4s +str q18, [x0, #160] +mla v11.4S, v3.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +str q2, [x0, #224] +ldr q2, [x0, #560] +sqrdmulh v15.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v9.4s, v0.4s, v17.4s +str q16, [x0, #32] +ldr q16, [x0, #624] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +add v0.4s, v0.4s, v17.4s +str q1, [x0, #96] +ldr q1, [x0, #688] +ldr q17, [x0, #432] +sqrdmulh v18.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +sub v7.4s, v17.4s, v19.4s +add v17.4s, v17.4s, v19.4s +ldr q19, [x0, #752] +ldr q6, [x0, #496] +sqrdmulh v5.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v4.4s, v6.4s, v20.4s +add v6.4s, v6.4s, v20.4s +ldr q20, [x0, #304] +mla v2.4S, v15.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v11.4s +str q10, [x0, #544] +mla v1.4S, v18.4S, v31.s[0] +mla v19.4S, v5.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q13, [x0, #608] +ldr q13, [x0, #368] +sqrdmulh v11.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] 
+sub v5.4s, v13.4s, v8.4s +str q21, [x0, #672] +sqrdmulh v21.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +add v13.4s, v13.4s, v8.4s +str q12, [x0, #736] +ldr q12, [x0, #48] +sqrdmulh v8.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v18.4s, v12.4s, v2.4s +add v12.4s, v12.4s, v2.4s +ldr q2, [x0, #112] +sqrdmulh v10.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v15.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +ldr q16, [x0, #176] +mla v17.4S, v11.4S, v31.s[0] +mla v6.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v1.4s +str q22, [x0, #800] +mla v20.4S, v8.4S, v31.s[0] +mla v13.4S, v10.4S, v31.s[0] +add v16.4s, v16.4s, v1.4s +str q14, [x0, #864] +ldr q14, [x0, #240] +sqrdmulh v1.4S, v7.4S, v29.s[2] +mul v7.4S, v7.4S,v30.s[2] +sub v10.4s, v14.4s, v19.4s +str q0, [x0, #928] +sqrdmulh v0.4S, v4.4S, v29.s[2] +mul v4.4S, v4.4S,v30.s[2] +add v14.4s, v14.4s, v19.4s +str q9, [x0, #992] +sqrdmulh v9.4S, v3.4S, v29.s[2] +mul v3.4S, v3.4S,v30.s[2] +sub v19.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v29.s[2] +mul v5.4S, v5.4S,v30.s[2] +sub v8.4s, v14.4s, v6.4s +add v14.4s, v14.4s, v6.4s +mla v7.4S, v1.4S, v31.s[0] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v20.4s +mla v3.4S, v9.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +sub v17.4s, v2.4s, v13.4s +sqrdmulh v9.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +add v2.4s, v2.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v27.s[0] +mul v16.4S, v16.4S,v28.s[0] +sub v1.4s, v21.4s, v7.4s +add v21.4s, v21.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v6.4s, v10.4s, v4.4s +add v10.4s, v10.4s, v4.4s +mla v19.4S, v20.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v3.4s +mla v16.4S, v13.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +sub v7.4s, v15.4s, v5.4s +sqrdmulh v13.4S, v10.4S, 
v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +add v15.4s, v15.4s, v5.4s +sqrdmulh v5.4S, v1.4S, v27.s[3] +mul v1.4S, v1.4S,v28.s[3] +sub v20.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[3] +mul v6.4S, v6.4S,v28.s[3] +sub v4.4s, v17.4s, v8.4s +add v17.4s, v17.4s, v8.4s +mla v21.4S, v3.4S, v31.s[0] +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v16.4s +mla v1.4S, v5.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v25.s[2] +mul v17.4S, v17.4S,v26.s[2] +sub v19.4s, v2.4s, v14.4s +sqrdmulh v5.4S, v4.4S, v25.s[3] +mul v4.4S, v4.4S,v26.s[3] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v3.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +mla v17.4S, v16.4S, v31.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v9.4s, v1.4s +mla v19.4S, v14.4S, v31.s[0] +mla v2.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v23.s[0] +mul v15.4S, v15.4S,v24.s[0] +sub v21.4s, v7.4s, v6.4s +sqrdmulh v14.4S, v8.4S, v23.s[1] +mul v8.4S, v8.4S,v24.s[1] +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v7.4S, v23.s[2] +mul v7.4S, v7.4S,v24.s[2] +sub v16.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v23.s[3] +mul v21.4S, v21.4S,v24.s[3] +sub v10.4s, v20.4s, v4.4s +add v20.4s, v20.4s, v4.4s +mla v15.4S, v1.4S, v31.s[0] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v19.4s +str q0, [x0, #304] +mla v7.4S, v6.4S, v31.s[0] +mla v21.4S, v17.4S, v31.s[0] +add v13.4s, v13.4s, v19.4s +str q16, [x0, #368] +ldr q16, [x0, #896] +sqrdmulh v19.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v17.4s, v12.4s, v2.4s +str q20, [x0, #432] +ldr q20, [x0, #960] +sqrdmulh v6.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v12.4s, v12.4s, v2.4s +str q10, [x0, #496] +ldr q10, [x0, #768] +sqrdmulh v2.4S, v10.4S, v29.s[0] +mul v10.4S, 
v10.4S,v30.s[0] +sub v0.4s, v18.4s, v15.4s +add v18.4s, v18.4s, v15.4s +ldr q15, [x0, #832] +sqrdmulh v1.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +sub v4.4s, v3.4s, v8.4s +add v3.4s, v3.4s, v8.4s +mla v16.4S, v19.4S, v31.s[0] +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v7.4s +str q13, [x0, #176] +mla v10.4S, v2.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v7.4s +str q14, [x0, #240] +ldr q14, [x0, #512] +sqrdmulh v7.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v1.4s, v5.4s, v21.4s +str q12, [x0, #48] +ldr q12, [x0, #576] +sqrdmulh v2.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +add v5.4s, v5.4s, v21.4s +str q17, [x0, #112] +ldr q17, [x0, #640] +ldr q21, [x0, #384] +sqrdmulh v13.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] +sub v19.4s, v21.4s, v16.4s +add v21.4s, v21.4s, v16.4s +ldr q16, [x0, #704] +ldr q8, [x0, #448] +sqrdmulh v22.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v11.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +ldr q20, [x0, #256] +mla v14.4S, v7.4S, v31.s[0] +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v10.4s +str q18, [x0, #560] +mla v17.4S, v13.4S, v31.s[0] +mla v16.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +str q0, [x0, #624] +ldr q0, [x0, #320] +sqrdmulh v10.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v22.4s, v0.4s, v15.4s +str q3, [x0, #688] +sqrdmulh v3.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +add v0.4s, v0.4s, v15.4s +str q4, [x0, #752] +ldr q4, [x0, #0] +sqrdmulh v15.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v13.4s, v4.4s, v14.4s +add v4.4s, v4.4s, v14.4s +ldr q14, [x0, #64] +sqrdmulh v18.4S, v0.4S, v29.s[1] +mul v0.4S, v0.4S,v30.s[1] +sub v7.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +ldr q12, [x0, #128] +mla v21.4S, v10.4S, v31.s[0] +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v12.4s, v17.4s +str q9, [x0, #816] +mla v20.4S, v15.4S, v31.s[0] +mla v0.4S, v18.4S, v31.s[0] +add v12.4s, v12.4s, v17.4s +str q6, [x0, #880] +ldr q6, [x0, #192] 
+sqrdmulh v17.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v18.4s, v6.4s, v16.4s +str q5, [x0, #944] +sqrdmulh v5.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +add v6.4s, v6.4s, v16.4s +str q1, [x0, #1008] +sqrdmulh v1.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v16.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v15.4s, v6.4s, v8.4s +add v6.4s, v6.4s, v8.4s +mla v19.4S, v17.4S, v31.s[0] +mla v11.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v20.4s +mla v2.4S, v1.4S, v31.s[0] +mla v22.4S, v21.4S, v31.s[0] +add v4.4s, v4.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +sub v21.4s, v14.4s, v0.4s +sqrdmulh v1.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v17.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[0] +mul v6.4S, v6.4S,v28.s[0] +sub v8.4s, v18.4s, v11.4s +add v18.4s, v18.4s, v11.4s +mla v16.4S, v20.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v2.4s +mla v12.4S, v0.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v27.s[2] +mul v3.4S, v3.4S,v28.s[2] +sub v19.4s, v7.4s, v22.4s +sqrdmulh v0.4S, v18.4S, v27.s[2] +mul v18.4S, v18.4S,v28.s[2] +add v7.4s, v7.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +sub v20.4s, v5.4s, v16.4s +add v5.4s, v5.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v11.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +mla v3.4S, v2.4S, v31.s[0] +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v4.4s, v12.4s +mla v17.4S, v22.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v4.4s, v4.4s, v12.4s +sqrdmulh v12.4S, v21.4S, v25.s[2] +mul v21.4S, v21.4S,v26.s[2] +sub v16.4s, v14.4s, v6.4s +sqrdmulh v22.4S, v11.4S, v25.s[3] +mul v11.4S, v11.4S,v26.s[3] +add v14.4s, v14.4s, v6.4s +sqrdmulh v6.4S, v16.4S, v25.s[1] 
+mul v16.4S, v16.4S,v26.s[1] +sub v2.4s, v13.4s, v3.4s +add v13.4s, v13.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v25.s[0] +mul v14.4S, v14.4S,v26.s[0] +sub v15.4s, v7.4s, v18.4s +add v7.4s, v7.4s, v18.4s +mla v21.4S, v12.4S, v31.s[0] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v17.4s +mla v16.4S, v6.4S, v31.s[0] +mla v14.4S, v3.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v7.4S, v23.s[0] +mul v7.4S, v7.4S,v24.s[0] +sub v3.4s, v19.4s, v8.4s +sqrdmulh v6.4S, v15.4S, v23.s[1] +mul v15.4S, v15.4S,v24.s[1] +add v19.4s, v19.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v23.s[2] +mul v19.4S, v19.4S,v24.s[2] +sub v12.4s, v5.4s, v21.4s +add v5.4s, v5.4s, v21.4s +sqrdmulh v21.4S, v3.4S, v23.s[3] +mul v3.4S, v3.4S,v24.s[3] +sub v18.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +mla v7.4S, v17.4S, v31.s[0] +mla v15.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v16.4s +str q5, [x0, #256] +mla v19.4S, v8.4S, v31.s[0] +mla v3.4S, v21.4S, v31.s[0] +add v0.4s, v0.4s, v16.4s +str q12, [x0, #320] +ldr q12, [x0, #912] +sqrdmulh v16.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v21.4s, v4.4s, v14.4s +str q20, [x0, #384] +ldr q20, [x0, #976] +sqrdmulh v8.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v4.4s, v4.4s, v14.4s +str q18, [x0, #448] +ldr q18, [x0, #784] +sqrdmulh v14.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +sub v5.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +ldr q7, [x0, #848] +sqrdmulh v17.4S, v7.4S, v29.s[0] +mul v7.4S, v7.4S,v30.s[0] +sub v11.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +mla v12.4S, v16.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v1.4s, v19.4s +str q0, [x0, #128] +mla v18.4S, v14.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v19.4s +str q6, [x0, #192] +ldr q6, [x0, #528] +sqrdmulh v19.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v17.4s, v22.4s, v3.4s +str q4, [x0, #0] +ldr q4, [x0, #592] +sqrdmulh v14.4S, v4.4S, v29.s[0] +mul v4.4S, v4.4S,v30.s[0] +add v22.4s, v22.4s, v3.4s +str q21, [x0, #64] +ldr q21, 
[x0, #656] +ldr q3, [x0, #400] +sqrdmulh v0.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v16.4s, v3.4s, v12.4s +add v3.4s, v3.4s, v12.4s +ldr q12, [x0, #720] +ldr q15, [x0, #464] +sqrdmulh v9.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v10.4s, v15.4s, v20.4s +add v15.4s, v15.4s, v20.4s +ldr q20, [x0, #272] +mla v6.4S, v19.4S, v31.s[0] +mla v4.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v18.4s +str q13, [x0, #512] +mla v21.4S, v0.4S, v31.s[0] +mla v12.4S, v9.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +str q5, [x0, #576] +ldr q5, [x0, #336] +sqrdmulh v18.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v9.4s, v5.4s, v7.4s +str q2, [x0, #640] +sqrdmulh v2.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +add v5.4s, v5.4s, v7.4s +str q11, [x0, #704] +ldr q11, [x0, #16] +sqrdmulh v7.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v0.4s, v11.4s, v6.4s +add v11.4s, v11.4s, v6.4s +ldr q6, [x0, #80] +sqrdmulh v13.4S, v5.4S, v29.s[1] +mul v5.4S, v5.4S,v30.s[1] +sub v19.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +ldr q4, [x0, #144] +mla v3.4S, v18.4S, v31.s[0] +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v4.4s, v21.4s +str q1, [x0, #768] +mla v20.4S, v7.4S, v31.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v4.4s, v4.4s, v21.4s +str q8, [x0, #832] +ldr q8, [x0, #208] +sqrdmulh v21.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +sub v13.4s, v8.4s, v12.4s +str q22, [x0, #896] +sqrdmulh v22.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +add v8.4s, v8.4s, v12.4s +str q17, [x0, #960] +sqrdmulh v17.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v12.4s, v4.4s, v3.4s +add v4.4s, v4.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v7.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +mla v16.4S, v21.4S, v31.s[0] +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v20.4s +mla v14.4S, v17.4S, v31.s[0] +mla v9.4S, v3.4S, v31.s[0] +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v12.4S, v27.s[1] +mul v12.4S, v12.4S,v28.s[1] +sub v3.4s, v6.4s, 
v5.4s +sqrdmulh v17.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v27.s[0] +mul v4.4S, v4.4S,v28.s[0] +sub v21.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[0] +mul v8.4S, v8.4S,v28.s[0] +sub v15.4s, v13.4s, v10.4s +add v13.4s, v13.4s, v10.4s +mla v12.4S, v20.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v14.4s +mla v4.4S, v5.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v16.4s, v19.4s, v9.4s +sqrdmulh v5.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v10.4s, v3.4s, v7.4s +add v3.4s, v3.4s, v7.4s +mla v2.4S, v14.4S, v31.s[0] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v4.4s +mla v21.4S, v9.4S, v31.s[0] +mla v15.4S, v12.4S, v31.s[0] +add v11.4s, v11.4s, v4.4s +sqrdmulh v4.4S, v3.4S, v25.s[2] +mul v3.4S, v3.4S,v26.s[2] +sub v12.4s, v6.4s, v8.4s +sqrdmulh v9.4S, v10.4S, v25.s[3] +mul v10.4S, v10.4S,v26.s[3] +add v6.4s, v6.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +sub v14.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v6.4S, v25.s[0] +mul v6.4S, v6.4S,v26.s[0] +sub v7.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +mla v3.4S, v4.4S, v31.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v21.4s +mla v12.4S, v8.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v23.s[0] +mul v19.4S, v19.4S,v24.s[0] +sub v2.4s, v16.4s, v15.4s +sqrdmulh v8.4S, v7.4S, v23.s[1] +mul v7.4S, v7.4S,v24.s[1] +add v16.4s, v16.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +sub v4.4s, v22.4s, v3.4s +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v2.4S, v23.s[3] +mul v2.4S, 
v2.4S,v24.s[3] +sub v13.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +mla v19.4S, v21.4S, v31.s[0] +mla v7.4S, v8.4S, v31.s[0] +sub v8.4s, v5.4s, v12.4s +str q22, [x0, #272] +mla v16.4S, v15.4S, v31.s[0] +mla v2.4S, v3.4S, v31.s[0] +add v5.4s, v5.4s, v12.4s +str q4, [x0, #336] +sub v23.4s, v11.4s, v6.4s +str q20, [x0, #400] +add v11.4s, v11.4s, v6.4s +str q13, [x0, #464] +sub v13.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sub v19.4s, v14.4s, v7.4s +add v14.4s, v14.4s, v7.4s +sub v7.4s, v17.4s, v16.4s +str q5, [x0, #144] +add v17.4s, v17.4s, v16.4s +str q8, [x0, #208] +sub v8.4s, v9.4s, v2.4s +str q11, [x0, #16] +add v9.4s, v9.4s, v2.4s +str q23, [x0, #80] +str q0, [x0, #528] +str q13, [x0, #592] +str q14, [x0, #656] +str q19, [x0, #720] +str q17, [x0, #784] +str q7, [x0, #848] +str q9, [x0, #912] +str q8, [x0, #976] +ldr q18, [x0, #224] +ldr q1, [x0, #160] +ldr q10, [x0, #32] +ldr q21, [x17, #+128] +ldr q22, [x17, #+144] +sqrdmulh v15.4S, v10.4S, v22.s[0] +mul v10.4S, v10.4S,v21.s[0] +ldr q3, [x0, #48] +sqrdmulh v12.4S, v3.4S, v22.s[0] +mul v3.4S, v3.4S,v21.s[0] +ldr q4, [x17, #+160] +ldr q30, [x17, #+176] +ldr q29, [x0, #96] +sqrdmulh v28.4S, v29.4S, v30.s[0] +mul v29.4S, v29.4S,v4.s[0] +ldr q27, [x0, #112] +sqrdmulh v26.4S, v27.4S, v30.s[0] +mul v27.4S, v27.4S,v4.s[0] +ldr q25, [x17, #+192] +ldr q24, [x17, #+208] +mla v10.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v1.4S, v24.s[0] +ldr q20, [x0, #176] +mla v3.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v20.4S, v24.s[0] +ldr q6, [x17, #+224] +ldr q5, [x17, #+240] +mla v29.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v18.4S, v5.s[0] +ldr q16, [x0, #240] +mla v27.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v16.4S, v5.s[0] +ldr q11, [x0, #0] +ldr q2, [x0, #128] +mul v1.4S, v1.4S,v25.s[0] +sub v23.4s, v11.4s, v10.4s +ldr q0, [x0, #16] +mul v20.4S, v20.4S,v25.s[0] +add v11.4s, v11.4s, v10.4s +ldr q10, [x0, #144] +mla v1.4S, v15.4S, v31.s[0] +sub v15.4s, v0.4s, v3.4s +ldr q13, [x0, #64] +mla v20.4S, v12.4S, v31.s[0] +add v0.4s, v0.4s, 
v3.4s +ldr q3, [x0, #192] +mul v18.4S, v18.4S,v6.s[0] +sub v12.4s, v13.4s, v29.4s +ldr q14, [x0, #80] +mul v16.4S, v16.4S,v6.s[0] +add v13.4s, v13.4s, v29.4s +ldr q29, [x0, #208] +mla v18.4S, v28.4S, v31.s[0] +sub v28.4s, v14.4s, v27.4s +mla v16.4S, v26.4S, v31.s[0] +add v14.4s, v14.4s, v27.4s +sqrdmulh v27.4S, v0.4S, v22.s[1] +mul v0.4S, v0.4S,v21.s[1] +sqrdmulh v26.4S, v15.4S, v22.s[2] +sub v19.4s, v2.4s, v1.4s +mul v15.4S, v15.4S,v21.s[2] +add v2.4s, v2.4s, v1.4s +sqrdmulh v22.4S, v14.4S, v30.s[1] +sub v21.4s, v10.4s, v20.4s +mul v14.4S, v14.4S,v4.s[1] +add v10.4s, v10.4s, v20.4s +sqrdmulh v20.4S, v28.4S, v30.s[2] +sub v1.4s, v3.4s, v18.4s +mul v28.4S, v28.4S,v4.s[2] +add v3.4s, v3.4s, v18.4s +mla v0.4S, v27.4S, v31.s[0] +sub v27.4s, v29.4s, v16.4s +ldr q30, [x0, #480] +sqrdmulh v4.4S, v10.4S, v24.s[1] +add v29.4s, v29.4s, v16.4s +mla v15.4S, v26.4S, v31.s[0] +ldr q26, [x0, #416] +sqrdmulh v16.4S, v21.4S, v24.s[2] +sub v18.4s, v11.4s, v0.4s +mla v14.4S, v22.4S, v31.s[0] +ldr q22, [x0, #288] +sqrdmulh v17.4S, v29.4S, v5.s[1] +add v11.4s, v11.4s, v0.4s +str q18, [x0, #16] +mla v28.4S, v20.4S, v31.s[0] +ldr q20, [x17, #+256] +ldr q18, [x17, #+272] +sqrdmulh v0.4S, v27.4S, v5.s[2] +sub v7.4s, v23.4s, v15.4s +str q11, [x0, #0] +mul v10.4S, v10.4S,v25.s[1] +add v23.4s, v23.4s, v15.4s +mul v21.4S, v21.4S,v25.s[2] +str q7, [x0, #48] +mla v10.4S, v4.4S, v31.s[0] +sub v4.4s, v13.4s, v14.4s +mla v21.4S, v16.4S, v31.s[0] +str q23, [x0, #32] +mul v29.4S, v29.4S,v6.s[1] +str q4, [x0, #80] +mul v27.4S, v27.4S,v6.s[2] +add v13.4s, v13.4s, v14.4s +str q13, [x0, #64] +mla v29.4S, v17.4S, v31.s[0] +sub v17.4s, v12.4s, v28.4s +str q17, [x0, #112] +mla v27.4S, v0.4S, v31.s[0] +add v12.4s, v12.4s, v28.4s +str q12, [x0, #96] +sqrdmulh v5.4S, v22.4S, v18.s[0] +sub v6.4s, v2.4s, v10.4s +mul v22.4S, v22.4S,v20.s[0] +str q6, [x0, #144] +ldr q6, [x0, #304] +sqrdmulh v12.4S, v6.4S, v18.s[0] +add v2.4s, v2.4s, v10.4s +mul v6.4S, v6.4S,v20.s[0] +str q2, [x0, #128] +ldr q2, [x17, #+288] +ldr 
q10, [x17, #+304] +ldr q28, [x0, #352] +sqrdmulh v0.4S, v28.4S, v10.s[0] +sub v17.4s, v19.4s, v21.4s +mul v28.4S, v28.4S,v2.s[0] +str q17, [x0, #176] +ldr q17, [x0, #368] +sqrdmulh v13.4S, v17.4S, v10.s[0] +add v19.4s, v19.4s, v21.4s +mul v17.4S, v17.4S,v2.s[0] +str q19, [x0, #160] +ldr q19, [x17, #+320] +ldr q21, [x17, #+336] +mla v22.4S, v5.4S, v31.s[0] +sub v5.4s, v3.4s, v29.4s +sqrdmulh v14.4S, v26.4S, v21.s[0] +str q5, [x0, #208] +ldr q5, [x0, #432] +mla v6.4S, v12.4S, v31.s[0] +add v3.4s, v3.4s, v29.4s +sqrdmulh v29.4S, v5.4S, v21.s[0] +str q3, [x0, #192] +ldr q3, [x17, #+352] +ldr q12, [x17, #+368] +mla v28.4S, v0.4S, v31.s[0] +sub v0.4s, v1.4s, v27.4s +sqrdmulh v4.4S, v30.4S, v12.s[0] +str q0, [x0, #240] +ldr q0, [x0, #496] +mla v17.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, v27.4s +sqrdmulh v27.4S, v0.4S, v12.s[0] +str q1, [x0, #224] +ldr q1, [x0, #256] +ldr q13, [x0, #384] +mul v26.4S, v26.4S,v19.s[0] +sub v24.4s, v1.4s, v22.4s +ldr q25, [x0, #272] +mul v5.4S, v5.4S,v19.s[0] +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #400] +mla v26.4S, v14.4S, v31.s[0] +sub v14.4s, v25.4s, v6.4s +ldr q23, [x0, #320] +mla v5.4S, v29.4S, v31.s[0] +add v25.4s, v25.4s, v6.4s +ldr q6, [x0, #448] +mul v30.4S, v30.4S,v3.s[0] +sub v29.4s, v23.4s, v28.4s +ldr q16, [x0, #336] +mul v0.4S, v0.4S,v3.s[0] +add v23.4s, v23.4s, v28.4s +ldr q28, [x0, #464] +mla v30.4S, v4.4S, v31.s[0] +sub v4.4s, v16.4s, v17.4s +mla v0.4S, v27.4S, v31.s[0] +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v25.4S, v18.s[1] +mul v25.4S, v25.4S,v20.s[1] +sqrdmulh v27.4S, v14.4S, v18.s[2] +sub v7.4s, v13.4s, v26.4s +mul v14.4S, v14.4S,v20.s[2] +add v13.4s, v13.4s, v26.4s +sqrdmulh v18.4S, v16.4S, v10.s[1] +sub v20.4s, v22.4s, v5.4s +mul v16.4S, v16.4S,v2.s[1] +add v22.4s, v22.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v10.s[2] +sub v26.4s, v6.4s, v30.4s +mul v4.4S, v4.4S,v2.s[2] +add v6.4s, v6.4s, v30.4s +mla v25.4S, v17.4S, v31.s[0] +sub v17.4s, v28.4s, v0.4s +ldr q10, [x0, #736] +sqrdmulh v2.4S, v22.4S, v21.s[1] +add v28.4s, 
v28.4s, v0.4s +mla v14.4S, v27.4S, v31.s[0] +ldr q27, [x0, #672] +sqrdmulh v0.4S, v20.4S, v21.s[2] +sub v30.4s, v1.4s, v25.4s +mla v16.4S, v18.4S, v31.s[0] +ldr q18, [x0, #544] +sqrdmulh v15.4S, v28.4S, v12.s[1] +add v1.4s, v1.4s, v25.4s +str q30, [x0, #272] +mla v4.4S, v5.4S, v31.s[0] +ldr q5, [x17, #+384] +ldr q30, [x17, #+400] +sqrdmulh v25.4S, v17.4S, v12.s[2] +sub v11.4s, v24.4s, v14.4s +str q1, [x0, #256] +mul v22.4S, v22.4S,v19.s[1] +add v24.4s, v24.4s, v14.4s +mul v20.4S, v20.4S,v19.s[2] +str q11, [x0, #304] +mla v22.4S, v2.4S, v31.s[0] +sub v2.4s, v23.4s, v16.4s +mla v20.4S, v0.4S, v31.s[0] +str q24, [x0, #288] +mul v28.4S, v28.4S,v3.s[1] +str q2, [x0, #336] +mul v17.4S, v17.4S,v3.s[2] +add v23.4s, v23.4s, v16.4s +str q23, [x0, #320] +mla v28.4S, v15.4S, v31.s[0] +sub v15.4s, v29.4s, v4.4s +str q15, [x0, #368] +mla v17.4S, v25.4S, v31.s[0] +add v29.4s, v29.4s, v4.4s +str q29, [x0, #352] +sqrdmulh v12.4S, v18.4S, v30.s[0] +sub v3.4s, v13.4s, v22.4s +mul v18.4S, v18.4S,v5.s[0] +str q3, [x0, #400] +ldr q3, [x0, #560] +sqrdmulh v29.4S, v3.4S, v30.s[0] +add v13.4s, v13.4s, v22.4s +mul v3.4S, v3.4S,v5.s[0] +str q13, [x0, #384] +ldr q13, [x17, #+416] +ldr q22, [x17, #+432] +ldr q4, [x0, #608] +sqrdmulh v25.4S, v4.4S, v22.s[0] +sub v15.4s, v7.4s, v20.4s +mul v4.4S, v4.4S,v13.s[0] +str q15, [x0, #432] +ldr q15, [x0, #624] +sqrdmulh v23.4S, v15.4S, v22.s[0] +add v7.4s, v7.4s, v20.4s +mul v15.4S, v15.4S,v13.s[0] +str q7, [x0, #416] +ldr q7, [x17, #+448] +ldr q20, [x17, #+464] +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v6.4s, v28.4s +sqrdmulh v16.4S, v27.4S, v20.s[0] +str q12, [x0, #464] +ldr q12, [x0, #688] +mla v3.4S, v29.4S, v31.s[0] +add v6.4s, v6.4s, v28.4s +sqrdmulh v28.4S, v12.4S, v20.s[0] +str q6, [x0, #448] +ldr q6, [x17, #+480] +ldr q29, [x17, #+496] +mla v4.4S, v25.4S, v31.s[0] +sub v25.4s, v26.4s, v17.4s +sqrdmulh v2.4S, v10.4S, v29.s[0] +str q25, [x0, #496] +ldr q25, [x0, #752] +mla v15.4S, v23.4S, v31.s[0] +add v26.4s, v26.4s, v17.4s +sqrdmulh v17.4S, 
v25.4S, v29.s[0] +str q26, [x0, #480] +ldr q26, [x0, #512] +ldr q23, [x0, #640] +mul v27.4S, v27.4S,v7.s[0] +sub v21.4s, v26.4s, v18.4s +ldr q19, [x0, #528] +mul v12.4S, v12.4S,v7.s[0] +add v26.4s, v26.4s, v18.4s +ldr q18, [x0, #656] +mla v27.4S, v16.4S, v31.s[0] +sub v16.4s, v19.4s, v3.4s +ldr q24, [x0, #576] +mla v12.4S, v28.4S, v31.s[0] +add v19.4s, v19.4s, v3.4s +ldr q3, [x0, #704] +mul v10.4S, v10.4S,v6.s[0] +sub v28.4s, v24.4s, v4.4s +ldr q0, [x0, #592] +mul v25.4S, v25.4S,v6.s[0] +add v24.4s, v24.4s, v4.4s +ldr q4, [x0, #720] +mla v10.4S, v2.4S, v31.s[0] +sub v2.4s, v0.4s, v15.4s +mla v25.4S, v17.4S, v31.s[0] +add v0.4s, v0.4s, v15.4s +sqrdmulh v15.4S, v19.4S, v30.s[1] +mul v19.4S, v19.4S,v5.s[1] +sqrdmulh v17.4S, v16.4S, v30.s[2] +sub v11.4s, v23.4s, v27.4s +mul v16.4S, v16.4S,v5.s[2] +add v23.4s, v23.4s, v27.4s +sqrdmulh v30.4S, v0.4S, v22.s[1] +sub v5.4s, v18.4s, v12.4s +mul v0.4S, v0.4S,v13.s[1] +add v18.4s, v18.4s, v12.4s +sqrdmulh v12.4S, v2.4S, v22.s[2] +sub v27.4s, v3.4s, v10.4s +mul v2.4S, v2.4S,v13.s[2] +add v3.4s, v3.4s, v10.4s +mla v19.4S, v15.4S, v31.s[0] +sub v15.4s, v4.4s, v25.4s +ldr q22, [x0, #992] +sqrdmulh v13.4S, v18.4S, v20.s[1] +add v4.4s, v4.4s, v25.4s +mla v16.4S, v17.4S, v31.s[0] +ldr q17, [x0, #928] +sqrdmulh v25.4S, v5.4S, v20.s[2] +sub v10.4s, v26.4s, v19.4s +mla v0.4S, v30.4S, v31.s[0] +ldr q30, [x0, #800] +sqrdmulh v14.4S, v4.4S, v29.s[1] +add v26.4s, v26.4s, v19.4s +str q10, [x0, #528] +mla v2.4S, v12.4S, v31.s[0] +ldr q12, [x17, #+512] +ldr q10, [x17, #+528] +sqrdmulh v19.4S, v15.4S, v29.s[2] +sub v1.4s, v21.4s, v16.4s +str q26, [x0, #512] +mul v18.4S, v18.4S,v7.s[1] +add v21.4s, v21.4s, v16.4s +mul v5.4S, v5.4S,v7.s[2] +str q1, [x0, #560] +mla v18.4S, v13.4S, v31.s[0] +sub v13.4s, v24.4s, v0.4s +mla v5.4S, v25.4S, v31.s[0] +str q21, [x0, #544] +mul v4.4S, v4.4S,v6.s[1] +str q13, [x0, #592] +mul v15.4S, v15.4S,v6.s[2] +add v24.4s, v24.4s, v0.4s +str q24, [x0, #576] +mla v4.4S, v14.4S, v31.s[0] +sub v14.4s, v28.4s, v2.4s +str 
q14, [x0, #624] +mla v15.4S, v19.4S, v31.s[0] +add v28.4s, v28.4s, v2.4s +str q28, [x0, #608] +sqrdmulh v29.4S, v30.4S, v10.s[0] +sub v6.4s, v23.4s, v18.4s +mul v30.4S, v30.4S,v12.s[0] +str q6, [x0, #656] +ldr q6, [x0, #816] +sqrdmulh v28.4S, v6.4S, v10.s[0] +add v23.4s, v23.4s, v18.4s +mul v6.4S, v6.4S,v12.s[0] +str q23, [x0, #640] +ldr q23, [x17, #+544] +ldr q18, [x17, #+560] +ldr q2, [x0, #864] +sqrdmulh v19.4S, v2.4S, v18.s[0] +sub v14.4s, v11.4s, v5.4s +mul v2.4S, v2.4S,v23.s[0] +str q14, [x0, #688] +ldr q14, [x0, #880] +sqrdmulh v24.4S, v14.4S, v18.s[0] +add v11.4s, v11.4s, v5.4s +mul v14.4S, v14.4S,v23.s[0] +str q11, [x0, #672] +ldr q11, [x17, #+576] +ldr q5, [x17, #+592] +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v3.4s, v4.4s +sqrdmulh v0.4S, v17.4S, v5.s[0] +str q29, [x0, #720] +ldr q29, [x0, #944] +mla v6.4S, v28.4S, v31.s[0] +add v3.4s, v3.4s, v4.4s +sqrdmulh v4.4S, v29.4S, v5.s[0] +str q3, [x0, #704] +ldr q3, [x17, #+608] +ldr q28, [x17, #+624] +mla v2.4S, v19.4S, v31.s[0] +sub v19.4s, v27.4s, v15.4s +sqrdmulh v13.4S, v22.4S, v28.s[0] +str q19, [x0, #752] +ldr q19, [x0, #1008] +mla v14.4S, v24.4S, v31.s[0] +add v27.4s, v27.4s, v15.4s +sqrdmulh v15.4S, v19.4S, v28.s[0] +str q27, [x0, #736] +ldr q27, [x0, #768] +ldr q24, [x0, #896] +mul v17.4S, v17.4S,v11.s[0] +sub v20.4s, v27.4s, v30.4s +ldr q7, [x0, #784] +mul v29.4S, v29.4S,v11.s[0] +add v27.4s, v27.4s, v30.4s +ldr q30, [x0, #912] +mla v17.4S, v0.4S, v31.s[0] +sub v0.4s, v7.4s, v6.4s +ldr q21, [x0, #832] +mla v29.4S, v4.4S, v31.s[0] +add v7.4s, v7.4s, v6.4s +ldr q6, [x0, #960] +mul v22.4S, v22.4S,v3.s[0] +sub v4.4s, v21.4s, v2.4s +ldr q25, [x0, #848] +mul v19.4S, v19.4S,v3.s[0] +add v21.4s, v21.4s, v2.4s +ldr q2, [x0, #976] +mla v22.4S, v13.4S, v31.s[0] +sub v13.4s, v25.4s, v14.4s +mla v19.4S, v15.4S, v31.s[0] +add v25.4s, v25.4s, v14.4s +sqrdmulh v14.4S, v7.4S, v10.s[1] +mul v7.4S, v7.4S,v12.s[1] +sqrdmulh v15.4S, v0.4S, v10.s[2] +sub v1.4s, v24.4s, v17.4s +mul v0.4S, v0.4S,v12.s[2] +add v24.4s, 
v24.4s, v17.4s +sqrdmulh v10.4S, v25.4S, v18.s[1] +sub v12.4s, v30.4s, v29.4s +mul v25.4S, v25.4S,v23.s[1] +add v30.4s, v30.4s, v29.4s +sqrdmulh v29.4S, v13.4S, v18.s[2] +sub v17.4s, v6.4s, v22.4s +mul v13.4S, v13.4S,v23.s[2] +add v6.4s, v6.4s, v22.4s +mla v7.4S, v14.4S, v31.s[0] +sub v14.4s, v2.4s, v19.4s +sqrdmulh v18.4S, v30.4S, v5.s[1] +add v2.4s, v2.4s, v19.4s +mla v0.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v12.4S, v5.s[2] +sub v19.4s, v27.4s, v7.4s +mla v25.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v2.4S, v28.s[1] +add v27.4s, v27.4s, v7.4s +str q19, [x0, #784] +mla v13.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v14.4S, v28.s[2] +sub v19.4s, v20.4s, v0.4s +str q27, [x0, #768] +mul v30.4S, v30.4S,v11.s[1] +add v20.4s, v20.4s, v0.4s +mul v12.4S, v12.4S,v11.s[2] +str q19, [x0, #816] +mla v30.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v25.4s +mla v12.4S, v15.4S, v31.s[0] +str q20, [x0, #800] +mul v2.4S, v2.4S,v3.s[1] +str q18, [x0, #848] +mul v14.4S, v14.4S,v3.s[2] +add v21.4s, v21.4s, v25.4s +str q21, [x0, #832] +mla v2.4S, v10.4S, v31.s[0] +sub v10.4s, v4.4s, v13.4s +str q10, [x0, #880] +mla v14.4S, v29.4S, v31.s[0] +add v4.4s, v4.4s, v13.4s +str q4, [x0, #864] +sub v28.4s, v24.4s, v30.4s +str q28, [x0, #912] +add v24.4s, v24.4s, v30.4s +str q24, [x0, #896] +sub v24.4s, v1.4s, v12.4s +str q24, [x0, #944] +add v1.4s, v1.4s, v12.4s +str q1, [x0, #928] +sub v1.4s, v6.4s, v2.4s +str q1, [x0, #976] +add v6.4s, v6.4s, v2.4s +str q6, [x0, #960] +sub v6.4s, v17.4s, v14.4s +str q6, [x0, #1008] +add v17.4s, v17.4s, v14.4s +str q17, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file 
diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_7.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_7.s new file mode 100644 index 0000000..a96a052 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_7.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +#include +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global 
ntt_u32_incomplete_neon_asm_var_4_2_7_z4_7 +.global _ntt_u32_incomplete_neon_asm_var_4_2_7_z4_7 +ntt_u32_incomplete_neon_asm_var_4_2_7_z4_7: +_ntt_u32_incomplete_neon_asm_var_4_2_7_z4_7: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #928] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #992] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q18, [x0, #800] +sqrdmulh v17.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +ldr q16, [x0, #864] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v22.4S, v21.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +mla v18.4S, v17.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +ldr q3, [x0, #544] +sqrdmulh v17.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q19, [x0, #608] +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q2, [x0, #672] +ldr q1, [x0, #416] +sqrdmulh v0.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v15.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #736] +ldr q14, [x0, #480] +sqrdmulh v13.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v12.4s, v14.4s, v20.4s +add v14.4s, v14.4s, v20.4s +ldr q20, [x0, #288] +mla v3.4S, v17.4S, v31.s[0] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v18.4s +mla v2.4S, v0.4S, v31.s[0] +mla v22.4S, v13.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +ldr q18, [x0, #352] +sqrdmulh v13.4S, v1.4S, 
v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v0.4s, v18.4s, v16.4s +sqrdmulh v17.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +add v18.4s, v18.4s, v16.4s +ldr q16, [x0, #32] +sqrdmulh v11.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v10.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #96] +sqrdmulh v9.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v8.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #160] +mla v1.4S, v13.4S, v31.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v2.4s +mla v20.4S, v11.4S, v31.s[0] +mla v18.4S, v9.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +ldr q2, [x0, #224] +sqrdmulh v9.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v11.4s, v2.4s, v22.4s +sqrdmulh v13.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v7.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +sub v6.4s, v2.4s, v14.4s +add v2.4s, v2.4s, v14.4s +mla v15.4S, v9.4S, v31.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v20.4s +mla v21.4S, v22.4S, v31.s[0] +mla v0.4S, v1.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +sub v1.4s, v3.4s, v18.4s +sqrdmulh v22.4S, v6.4S, v27.s[1] +mul v6.4S, v6.4S,v28.s[1] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v27.s[0] +mul v19.4S, v19.4S,v28.s[0] +sub v9.4s, v17.4s, v15.4s +add v17.4s, v17.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v14.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +mla v7.4S, v20.4S, v31.s[0] +mla v6.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v21.4s +mla v19.4S, v18.4S, v31.s[0] +mla v2.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v15.4s, v8.4s, v0.4s +sqrdmulh v18.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +add v8.4s, v8.4s, v0.4s +sqrdmulh 
v0.4S, v9.4S, v27.s[3] +mul v9.4S, v9.4S,v28.s[3] +sub v20.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[3] +mul v14.4S, v14.4S,v28.s[3] +sub v12.4s, v1.4s, v6.4s +add v1.4s, v1.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v16.4s, v19.4s +mla v9.4S, v0.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v25.s[2] +mul v1.4S, v1.4S,v26.s[2] +sub v7.4s, v3.4s, v2.4s +sqrdmulh v0.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v7.4S, v25.s[1] +mul v7.4S, v7.4S,v26.s[1] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v25.s[0] +mul v3.4S, v3.4S,v26.s[0] +sub v6.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v1.4S, v19.4S, v31.s[0] +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v9.4s +mla v7.4S, v2.4S, v31.s[0] +mla v3.4S, v17.4S, v31.s[0] +add v22.4s, v22.4s, v9.4s +sqrdmulh v9.4S, v8.4S, v23.s[0] +mul v8.4S, v8.4S,v24.s[0] +sub v17.4s, v15.4s, v14.4s +sqrdmulh v2.4S, v6.4S, v23.s[1] +mul v6.4S, v6.4S,v24.s[1] +add v15.4s, v15.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v23.s[2] +mul v15.4S, v15.4S,v24.s[2] +sub v19.4s, v13.4s, v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +sub v11.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +mla v8.4S, v9.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v18.4s, v7.4s +str q13, [x0, #288] +mla v15.4S, v14.4S, v31.s[0] +mla v17.4S, v1.4S, v31.s[0] +add v18.4s, v18.4s, v7.4s +str q19, [x0, #352] +ldr q19, [x0, #944] +sqrdmulh v7.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v1.4s, v16.4s, v3.4s +str q20, [x0, #416] +ldr q20, [x0, #1008] +sqrdmulh v14.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v3.4s +str q11, [x0, #480] +ldr q11, [x0, #816] +sqrdmulh v3.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +sub v13.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +ldr q8, 
[x0, #880] +sqrdmulh v9.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v12.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +mla v19.4S, v7.4S, v31.s[0] +mla v20.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v15.4s +str q18, [x0, #160] +mla v11.4S, v3.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +str q2, [x0, #224] +ldr q2, [x0, #560] +sqrdmulh v15.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v9.4s, v0.4s, v17.4s +str q16, [x0, #32] +ldr q16, [x0, #624] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +add v0.4s, v0.4s, v17.4s +str q1, [x0, #96] +ldr q1, [x0, #688] +ldr q17, [x0, #432] +sqrdmulh v18.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +sub v7.4s, v17.4s, v19.4s +add v17.4s, v17.4s, v19.4s +ldr q19, [x0, #752] +ldr q6, [x0, #496] +sqrdmulh v5.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v4.4s, v6.4s, v20.4s +add v6.4s, v6.4s, v20.4s +ldr q20, [x0, #304] +mla v2.4S, v15.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v11.4s +str q10, [x0, #544] +mla v1.4S, v18.4S, v31.s[0] +mla v19.4S, v5.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q13, [x0, #608] +ldr q13, [x0, #368] +sqrdmulh v11.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v5.4s, v13.4s, v8.4s +str q21, [x0, #672] +sqrdmulh v21.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +add v13.4s, v13.4s, v8.4s +str q12, [x0, #736] +ldr q12, [x0, #48] +sqrdmulh v8.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v18.4s, v12.4s, v2.4s +add v12.4s, v12.4s, v2.4s +ldr q2, [x0, #112] +sqrdmulh v10.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v15.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +ldr q16, [x0, #176] +mla v17.4S, v11.4S, v31.s[0] +mla v6.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v1.4s +str q22, [x0, #800] +mla v20.4S, v8.4S, v31.s[0] +mla v13.4S, v10.4S, v31.s[0] +add v16.4s, v16.4s, v1.4s +str q14, [x0, #864] +ldr q14, [x0, #240] +sqrdmulh v1.4S, v7.4S, v29.s[2] +mul v7.4S, v7.4S,v30.s[2] +sub v10.4s, v14.4s, v19.4s 
+str q0, [x0, #928] +sqrdmulh v0.4S, v4.4S, v29.s[2] +mul v4.4S, v4.4S,v30.s[2] +add v14.4s, v14.4s, v19.4s +str q9, [x0, #992] +sqrdmulh v9.4S, v3.4S, v29.s[2] +mul v3.4S, v3.4S,v30.s[2] +sub v19.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v29.s[2] +mul v5.4S, v5.4S,v30.s[2] +sub v8.4s, v14.4s, v6.4s +add v14.4s, v14.4s, v6.4s +mla v7.4S, v1.4S, v31.s[0] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v20.4s +mla v3.4S, v9.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +sub v17.4s, v2.4s, v13.4s +sqrdmulh v9.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +add v2.4s, v2.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v27.s[0] +mul v16.4S, v16.4S,v28.s[0] +sub v1.4s, v21.4s, v7.4s +add v21.4s, v21.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v6.4s, v10.4s, v4.4s +add v10.4s, v10.4s, v4.4s +mla v19.4S, v20.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v3.4s +mla v16.4S, v13.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +sub v7.4s, v15.4s, v5.4s +sqrdmulh v13.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +add v15.4s, v15.4s, v5.4s +sqrdmulh v5.4S, v1.4S, v27.s[3] +mul v1.4S, v1.4S,v28.s[3] +sub v20.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[3] +mul v6.4S, v6.4S,v28.s[3] +sub v4.4s, v17.4s, v8.4s +add v17.4s, v17.4s, v8.4s +mla v21.4S, v3.4S, v31.s[0] +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v16.4s +mla v1.4S, v5.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v25.s[2] +mul v17.4S, v17.4S,v26.s[2] +sub v19.4s, v2.4s, v14.4s +sqrdmulh v5.4S, v4.4S, v25.s[3] +mul v4.4S, v4.4S,v26.s[3] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v3.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, 
v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +mla v17.4S, v16.4S, v31.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v9.4s, v1.4s +mla v19.4S, v14.4S, v31.s[0] +mla v2.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v23.s[0] +mul v15.4S, v15.4S,v24.s[0] +sub v21.4s, v7.4s, v6.4s +sqrdmulh v14.4S, v8.4S, v23.s[1] +mul v8.4S, v8.4S,v24.s[1] +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v7.4S, v23.s[2] +mul v7.4S, v7.4S,v24.s[2] +sub v16.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v23.s[3] +mul v21.4S, v21.4S,v24.s[3] +sub v10.4s, v20.4s, v4.4s +add v20.4s, v20.4s, v4.4s +mla v15.4S, v1.4S, v31.s[0] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v19.4s +str q0, [x0, #304] +mla v7.4S, v6.4S, v31.s[0] +mla v21.4S, v17.4S, v31.s[0] +add v13.4s, v13.4s, v19.4s +str q16, [x0, #368] +ldr q16, [x0, #896] +sqrdmulh v19.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v17.4s, v12.4s, v2.4s +str q20, [x0, #432] +ldr q20, [x0, #960] +sqrdmulh v6.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v12.4s, v12.4s, v2.4s +str q10, [x0, #496] +ldr q10, [x0, #768] +sqrdmulh v2.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +sub v0.4s, v18.4s, v15.4s +add v18.4s, v18.4s, v15.4s +ldr q15, [x0, #832] +sqrdmulh v1.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +sub v4.4s, v3.4s, v8.4s +add v3.4s, v3.4s, v8.4s +mla v16.4S, v19.4S, v31.s[0] +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v7.4s +str q13, [x0, #176] +mla v10.4S, v2.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v7.4s +str q14, [x0, #240] +ldr q14, [x0, #512] +sqrdmulh v7.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v1.4s, v5.4s, v21.4s +str q12, [x0, #48] +ldr q12, [x0, #576] +sqrdmulh v2.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +add v5.4s, v5.4s, v21.4s +str q17, [x0, #112] +ldr q17, [x0, #640] +ldr q21, [x0, #384] +sqrdmulh v13.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] 
+sub v19.4s, v21.4s, v16.4s +add v21.4s, v21.4s, v16.4s +ldr q16, [x0, #704] +ldr q8, [x0, #448] +sqrdmulh v22.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v11.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +ldr q20, [x0, #256] +mla v14.4S, v7.4S, v31.s[0] +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v10.4s +str q18, [x0, #560] +mla v17.4S, v13.4S, v31.s[0] +mla v16.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +str q0, [x0, #624] +ldr q0, [x0, #320] +sqrdmulh v10.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v22.4s, v0.4s, v15.4s +str q3, [x0, #688] +sqrdmulh v3.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +add v0.4s, v0.4s, v15.4s +str q4, [x0, #752] +ldr q4, [x0, #0] +sqrdmulh v15.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v13.4s, v4.4s, v14.4s +add v4.4s, v4.4s, v14.4s +ldr q14, [x0, #64] +sqrdmulh v18.4S, v0.4S, v29.s[1] +mul v0.4S, v0.4S,v30.s[1] +sub v7.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +ldr q12, [x0, #128] +mla v21.4S, v10.4S, v31.s[0] +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v12.4s, v17.4s +str q9, [x0, #816] +mla v20.4S, v15.4S, v31.s[0] +mla v0.4S, v18.4S, v31.s[0] +add v12.4s, v12.4s, v17.4s +str q6, [x0, #880] +ldr q6, [x0, #192] +sqrdmulh v17.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v18.4s, v6.4s, v16.4s +str q5, [x0, #944] +sqrdmulh v5.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +add v6.4s, v6.4s, v16.4s +str q1, [x0, #1008] +sqrdmulh v1.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v16.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v15.4s, v6.4s, v8.4s +add v6.4s, v6.4s, v8.4s +mla v19.4S, v17.4S, v31.s[0] +mla v11.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v20.4s +mla v2.4S, v1.4S, v31.s[0] +mla v22.4S, v21.4S, v31.s[0] +add v4.4s, v4.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +sub v21.4s, v14.4s, v0.4s +sqrdmulh v1.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +add v14.4s, 
v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v17.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[0] +mul v6.4S, v6.4S,v28.s[0] +sub v8.4s, v18.4s, v11.4s +add v18.4s, v18.4s, v11.4s +mla v16.4S, v20.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v2.4s +mla v12.4S, v0.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v27.s[2] +mul v3.4S, v3.4S,v28.s[2] +sub v19.4s, v7.4s, v22.4s +sqrdmulh v0.4S, v18.4S, v27.s[2] +mul v18.4S, v18.4S,v28.s[2] +add v7.4s, v7.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +sub v20.4s, v5.4s, v16.4s +add v5.4s, v5.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v11.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +mla v3.4S, v2.4S, v31.s[0] +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v4.4s, v12.4s +mla v17.4S, v22.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v4.4s, v4.4s, v12.4s +sqrdmulh v12.4S, v21.4S, v25.s[2] +mul v21.4S, v21.4S,v26.s[2] +sub v16.4s, v14.4s, v6.4s +sqrdmulh v22.4S, v11.4S, v25.s[3] +mul v11.4S, v11.4S,v26.s[3] +add v14.4s, v14.4s, v6.4s +sqrdmulh v6.4S, v16.4S, v25.s[1] +mul v16.4S, v16.4S,v26.s[1] +sub v2.4s, v13.4s, v3.4s +add v13.4s, v13.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v25.s[0] +mul v14.4S, v14.4S,v26.s[0] +sub v15.4s, v7.4s, v18.4s +add v7.4s, v7.4s, v18.4s +mla v21.4S, v12.4S, v31.s[0] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v17.4s +mla v16.4S, v6.4S, v31.s[0] +mla v14.4S, v3.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v7.4S, v23.s[0] +mul v7.4S, v7.4S,v24.s[0] +sub v3.4s, v19.4s, v8.4s +sqrdmulh v6.4S, v15.4S, v23.s[1] +mul v15.4S, v15.4S,v24.s[1] +add v19.4s, v19.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v23.s[2] +mul v19.4S, v19.4S,v24.s[2] +sub v12.4s, v5.4s, v21.4s +add v5.4s, v5.4s, v21.4s +sqrdmulh v21.4S, v3.4S, v23.s[3] +mul v3.4S, v3.4S,v24.s[3] +sub v18.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +mla 
v7.4S, v17.4S, v31.s[0] +mla v15.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v16.4s +str q5, [x0, #256] +mla v19.4S, v8.4S, v31.s[0] +mla v3.4S, v21.4S, v31.s[0] +add v0.4s, v0.4s, v16.4s +str q12, [x0, #320] +ldr q12, [x0, #912] +sqrdmulh v16.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v21.4s, v4.4s, v14.4s +str q20, [x0, #384] +ldr q20, [x0, #976] +sqrdmulh v8.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v4.4s, v4.4s, v14.4s +str q18, [x0, #448] +ldr q18, [x0, #784] +sqrdmulh v14.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +sub v5.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +ldr q7, [x0, #848] +sqrdmulh v17.4S, v7.4S, v29.s[0] +mul v7.4S, v7.4S,v30.s[0] +sub v11.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +mla v12.4S, v16.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v1.4s, v19.4s +str q0, [x0, #128] +mla v18.4S, v14.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v19.4s +str q6, [x0, #192] +ldr q6, [x0, #528] +sqrdmulh v19.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v17.4s, v22.4s, v3.4s +str q4, [x0, #0] +ldr q4, [x0, #592] +sqrdmulh v14.4S, v4.4S, v29.s[0] +mul v4.4S, v4.4S,v30.s[0] +add v22.4s, v22.4s, v3.4s +str q21, [x0, #64] +ldr q21, [x0, #656] +ldr q3, [x0, #400] +sqrdmulh v0.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v16.4s, v3.4s, v12.4s +add v3.4s, v3.4s, v12.4s +ldr q12, [x0, #720] +ldr q15, [x0, #464] +sqrdmulh v9.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v10.4s, v15.4s, v20.4s +add v15.4s, v15.4s, v20.4s +ldr q20, [x0, #272] +mla v6.4S, v19.4S, v31.s[0] +mla v4.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v18.4s +str q13, [x0, #512] +mla v21.4S, v0.4S, v31.s[0] +mla v12.4S, v9.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +str q5, [x0, #576] +ldr q5, [x0, #336] +sqrdmulh v18.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v9.4s, v5.4s, v7.4s +str q2, [x0, #640] +sqrdmulh v2.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +add v5.4s, v5.4s, v7.4s +str q11, [x0, #704] +ldr 
q11, [x0, #16] +sqrdmulh v7.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v0.4s, v11.4s, v6.4s +add v11.4s, v11.4s, v6.4s +ldr q6, [x0, #80] +sqrdmulh v13.4S, v5.4S, v29.s[1] +mul v5.4S, v5.4S,v30.s[1] +sub v19.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +ldr q4, [x0, #144] +mla v3.4S, v18.4S, v31.s[0] +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v4.4s, v21.4s +str q1, [x0, #768] +mla v20.4S, v7.4S, v31.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v4.4s, v4.4s, v21.4s +str q8, [x0, #832] +ldr q8, [x0, #208] +sqrdmulh v21.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +sub v13.4s, v8.4s, v12.4s +str q22, [x0, #896] +sqrdmulh v22.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +add v8.4s, v8.4s, v12.4s +str q17, [x0, #960] +sqrdmulh v17.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v12.4s, v4.4s, v3.4s +add v4.4s, v4.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v7.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +mla v16.4S, v21.4S, v31.s[0] +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v20.4s +mla v14.4S, v17.4S, v31.s[0] +mla v9.4S, v3.4S, v31.s[0] +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v12.4S, v27.s[1] +mul v12.4S, v12.4S,v28.s[1] +sub v3.4s, v6.4s, v5.4s +sqrdmulh v17.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v27.s[0] +mul v4.4S, v4.4S,v28.s[0] +sub v21.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[0] +mul v8.4S, v8.4S,v28.s[0] +sub v15.4s, v13.4s, v10.4s +add v13.4s, v13.4s, v10.4s +mla v12.4S, v20.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v14.4s +mla v4.4S, v5.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v16.4s, v19.4s, v9.4s +sqrdmulh v5.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v22.4s, v12.4s +add v22.4s, v22.4s, 
v12.4s +sqrdmulh v12.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v10.4s, v3.4s, v7.4s +add v3.4s, v3.4s, v7.4s +mla v2.4S, v14.4S, v31.s[0] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v4.4s +mla v21.4S, v9.4S, v31.s[0] +mla v15.4S, v12.4S, v31.s[0] +add v11.4s, v11.4s, v4.4s +sqrdmulh v4.4S, v3.4S, v25.s[2] +mul v3.4S, v3.4S,v26.s[2] +sub v12.4s, v6.4s, v8.4s +sqrdmulh v9.4S, v10.4S, v25.s[3] +mul v10.4S, v10.4S,v26.s[3] +add v6.4s, v6.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +sub v14.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v6.4S, v25.s[0] +mul v6.4S, v6.4S,v26.s[0] +sub v7.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +mla v3.4S, v4.4S, v31.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v21.4s +mla v12.4S, v8.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v23.s[0] +mul v19.4S, v19.4S,v24.s[0] +sub v2.4s, v16.4s, v15.4s +sqrdmulh v8.4S, v7.4S, v23.s[1] +mul v7.4S, v7.4S,v24.s[1] +add v16.4s, v16.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +sub v4.4s, v22.4s, v3.4s +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v2.4S, v23.s[3] +mul v2.4S, v2.4S,v24.s[3] +sub v13.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +mla v19.4S, v21.4S, v31.s[0] +mla v7.4S, v8.4S, v31.s[0] +sub v8.4s, v5.4s, v12.4s +str q22, [x0, #272] +mla v16.4S, v15.4S, v31.s[0] +mla v2.4S, v3.4S, v31.s[0] +add v5.4s, v5.4s, v12.4s +str q4, [x0, #336] +sub v23.4s, v11.4s, v6.4s +str q20, [x0, #400] +add v11.4s, v11.4s, v6.4s +str q13, [x0, #464] +sub v13.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sub v19.4s, v14.4s, v7.4s +add v14.4s, v14.4s, v7.4s +sub v7.4s, v17.4s, v16.4s +str q5, [x0, #144] +add v17.4s, v17.4s, v16.4s +str q8, [x0, #208] +sub v8.4s, v9.4s, v2.4s +str q11, [x0, #16] +add v9.4s, v9.4s, v2.4s +str q23, [x0, #80] +str q0, [x0, #528] +str q13, [x0, #592] +str q14, [x0, #656] +str q19, [x0, #720] +str q17, [x0, #784] +str q7, [x0, #848] 
+str q9, [x0, #912] +str q8, [x0, #976] +ldr q18, [x0, #224] +ldr q1, [x0, #160] +ldr q10, [x0, #32] +ldr q21, [x17, #+128] +ldr q22, [x17, #+144] +sqrdmulh v15.4S, v10.4S, v22.s[0] +mul v10.4S, v10.4S,v21.s[0] +ldr q3, [x0, #48] +sqrdmulh v12.4S, v3.4S, v22.s[0] +mul v3.4S, v3.4S,v21.s[0] +ldr q4, [x17, #+160] +ldr q30, [x17, #+176] +ldr q29, [x0, #96] +sqrdmulh v28.4S, v29.4S, v30.s[0] +mul v29.4S, v29.4S,v4.s[0] +ldr q27, [x0, #112] +sqrdmulh v26.4S, v27.4S, v30.s[0] +mul v27.4S, v27.4S,v4.s[0] +ldr q25, [x17, #+192] +ldr q24, [x17, #+208] +mla v10.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v1.4S, v24.s[0] +ldr q20, [x0, #176] +mla v3.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v20.4S, v24.s[0] +ldr q6, [x17, #+224] +ldr q5, [x17, #+240] +mla v29.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v18.4S, v5.s[0] +ldr q16, [x0, #240] +mla v27.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v16.4S, v5.s[0] +ldr q11, [x0, #0] +ldr q2, [x0, #128] +mul v1.4S, v1.4S,v25.s[0] +sub v23.4s, v11.4s, v10.4s +ldr q0, [x0, #16] +mul v20.4S, v20.4S,v25.s[0] +add v11.4s, v11.4s, v10.4s +ldr q10, [x0, #144] +mla v1.4S, v15.4S, v31.s[0] +sub v15.4s, v0.4s, v3.4s +ldr q13, [x0, #64] +mla v20.4S, v12.4S, v31.s[0] +add v0.4s, v0.4s, v3.4s +ldr q3, [x0, #192] +mul v18.4S, v18.4S,v6.s[0] +sub v12.4s, v13.4s, v29.4s +ldr q14, [x0, #80] +mul v16.4S, v16.4S,v6.s[0] +add v13.4s, v13.4s, v29.4s +ldr q29, [x0, #208] +mla v18.4S, v28.4S, v31.s[0] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v14.4s, v27.4s +sqrdmulh v28.4S, v0.4S, v22.s[1] +add v14.4s, v14.4s, v27.4s +mul v0.4S, v0.4S,v21.s[1] +sqrdmulh v27.4S, v15.4S, v22.s[2] +sub v19.4s, v2.4s, v1.4s +mul v15.4S, v15.4S,v21.s[2] +add v2.4s, v2.4s, v1.4s +sqrdmulh v22.4S, v14.4S, v30.s[1] +sub v21.4s, v10.4s, v20.4s +mul v14.4S, v14.4S,v4.s[1] +add v10.4s, v10.4s, v20.4s +sqrdmulh v20.4S, v26.4S, v30.s[2] +sub v1.4s, v3.4s, v18.4s +mul v26.4S, v26.4S,v4.s[2] +add v3.4s, v3.4s, v18.4s +mla v0.4S, v28.4S, v31.s[0] +sub v28.4s, v29.4s, v16.4s +ldr q30, [x0, #480] +sqrdmulh 
v4.4S, v10.4S, v24.s[1] +add v29.4s, v29.4s, v16.4s +mla v15.4S, v27.4S, v31.s[0] +ldr q27, [x0, #416] +sqrdmulh v16.4S, v21.4S, v24.s[2] +sub v18.4s, v11.4s, v0.4s +mla v14.4S, v22.4S, v31.s[0] +ldr q22, [x0, #288] +sqrdmulh v17.4S, v29.4S, v5.s[1] +add v11.4s, v11.4s, v0.4s +str q18, [x0, #16] +mla v26.4S, v20.4S, v31.s[0] +ldr q20, [x17, #+256] +ldr q18, [x17, #+272] +sqrdmulh v0.4S, v28.4S, v5.s[2] +sub v7.4s, v23.4s, v15.4s +str q11, [x0, #0] +mul v10.4S, v10.4S,v25.s[1] +add v23.4s, v23.4s, v15.4s +mul v21.4S, v21.4S,v25.s[2] +str q7, [x0, #48] +mla v10.4S, v4.4S, v31.s[0] +sub v4.4s, v13.4s, v14.4s +mla v21.4S, v16.4S, v31.s[0] +str q23, [x0, #32] +mul v29.4S, v29.4S,v6.s[1] +str q4, [x0, #80] +mul v28.4S, v28.4S,v6.s[2] +add v13.4s, v13.4s, v14.4s +str q13, [x0, #64] +mla v29.4S, v17.4S, v31.s[0] +sub v17.4s, v12.4s, v26.4s +str q17, [x0, #112] +mla v28.4S, v0.4S, v31.s[0] +add v12.4s, v12.4s, v26.4s +str q12, [x0, #96] +sqrdmulh v5.4S, v22.4S, v18.s[0] +sub v6.4s, v2.4s, v10.4s +mul v22.4S, v22.4S,v20.s[0] +str q6, [x0, #144] +ldr q6, [x0, #304] +sqrdmulh v12.4S, v6.4S, v18.s[0] +add v2.4s, v2.4s, v10.4s +mul v6.4S, v6.4S,v20.s[0] +str q2, [x0, #128] +ldr q2, [x17, #+288] +ldr q10, [x17, #+304] +ldr q26, [x0, #352] +sqrdmulh v0.4S, v26.4S, v10.s[0] +sub v17.4s, v19.4s, v21.4s +mul v26.4S, v26.4S,v2.s[0] +str q17, [x0, #176] +ldr q17, [x0, #368] +sqrdmulh v13.4S, v17.4S, v10.s[0] +add v19.4s, v19.4s, v21.4s +mul v17.4S, v17.4S,v2.s[0] +str q19, [x0, #160] +ldr q19, [x17, #+320] +ldr q21, [x17, #+336] +mla v22.4S, v5.4S, v31.s[0] +sub v5.4s, v3.4s, v29.4s +sqrdmulh v14.4S, v27.4S, v21.s[0] +str q5, [x0, #208] +ldr q5, [x0, #432] +mla v6.4S, v12.4S, v31.s[0] +add v3.4s, v3.4s, v29.4s +sqrdmulh v29.4S, v5.4S, v21.s[0] +str q3, [x0, #192] +ldr q3, [x17, #+352] +ldr q12, [x17, #+368] +mla v26.4S, v0.4S, v31.s[0] +sub v0.4s, v1.4s, v28.4s +sqrdmulh v4.4S, v30.4S, v12.s[0] +str q0, [x0, #240] +ldr q0, [x0, #496] +mla v17.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, 
v28.4s +sqrdmulh v28.4S, v0.4S, v12.s[0] +str q1, [x0, #224] +ldr q1, [x0, #256] +ldr q13, [x0, #384] +mul v27.4S, v27.4S,v19.s[0] +sub v24.4s, v1.4s, v22.4s +ldr q25, [x0, #272] +mul v5.4S, v5.4S,v19.s[0] +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #400] +mla v27.4S, v14.4S, v31.s[0] +sub v14.4s, v25.4s, v6.4s +ldr q23, [x0, #320] +mla v5.4S, v29.4S, v31.s[0] +add v25.4s, v25.4s, v6.4s +ldr q6, [x0, #448] +mul v30.4S, v30.4S,v3.s[0] +sub v29.4s, v23.4s, v26.4s +ldr q16, [x0, #336] +mul v0.4S, v0.4S,v3.s[0] +add v23.4s, v23.4s, v26.4s +ldr q26, [x0, #464] +mla v30.4S, v4.4S, v31.s[0] +mla v0.4S, v28.4S, v31.s[0] +sub v28.4s, v16.4s, v17.4s +sqrdmulh v4.4S, v25.4S, v18.s[1] +add v16.4s, v16.4s, v17.4s +mul v25.4S, v25.4S,v20.s[1] +sqrdmulh v17.4S, v14.4S, v18.s[2] +sub v7.4s, v13.4s, v27.4s +mul v14.4S, v14.4S,v20.s[2] +add v13.4s, v13.4s, v27.4s +sqrdmulh v18.4S, v16.4S, v10.s[1] +sub v20.4s, v22.4s, v5.4s +mul v16.4S, v16.4S,v2.s[1] +add v22.4s, v22.4s, v5.4s +sqrdmulh v5.4S, v28.4S, v10.s[2] +sub v27.4s, v6.4s, v30.4s +mul v28.4S, v28.4S,v2.s[2] +add v6.4s, v6.4s, v30.4s +mla v25.4S, v4.4S, v31.s[0] +sub v4.4s, v26.4s, v0.4s +ldr q10, [x0, #736] +sqrdmulh v2.4S, v22.4S, v21.s[1] +add v26.4s, v26.4s, v0.4s +mla v14.4S, v17.4S, v31.s[0] +ldr q17, [x0, #672] +sqrdmulh v0.4S, v20.4S, v21.s[2] +sub v30.4s, v1.4s, v25.4s +mla v16.4S, v18.4S, v31.s[0] +ldr q18, [x0, #544] +sqrdmulh v15.4S, v26.4S, v12.s[1] +add v1.4s, v1.4s, v25.4s +str q30, [x0, #272] +mla v28.4S, v5.4S, v31.s[0] +ldr q5, [x17, #+384] +ldr q30, [x17, #+400] +sqrdmulh v25.4S, v4.4S, v12.s[2] +sub v11.4s, v24.4s, v14.4s +str q1, [x0, #256] +mul v22.4S, v22.4S,v19.s[1] +add v24.4s, v24.4s, v14.4s +mul v20.4S, v20.4S,v19.s[2] +str q11, [x0, #304] +mla v22.4S, v2.4S, v31.s[0] +sub v2.4s, v23.4s, v16.4s +mla v20.4S, v0.4S, v31.s[0] +str q24, [x0, #288] +mul v26.4S, v26.4S,v3.s[1] +str q2, [x0, #336] +mul v4.4S, v4.4S,v3.s[2] +add v23.4s, v23.4s, v16.4s +str q23, [x0, #320] +mla v26.4S, v15.4S, v31.s[0] +sub 
v15.4s, v29.4s, v28.4s +str q15, [x0, #368] +mla v4.4S, v25.4S, v31.s[0] +add v29.4s, v29.4s, v28.4s +str q29, [x0, #352] +sqrdmulh v12.4S, v18.4S, v30.s[0] +sub v3.4s, v13.4s, v22.4s +mul v18.4S, v18.4S,v5.s[0] +str q3, [x0, #400] +ldr q3, [x0, #560] +sqrdmulh v29.4S, v3.4S, v30.s[0] +add v13.4s, v13.4s, v22.4s +mul v3.4S, v3.4S,v5.s[0] +str q13, [x0, #384] +ldr q13, [x17, #+416] +ldr q22, [x17, #+432] +ldr q28, [x0, #608] +sqrdmulh v25.4S, v28.4S, v22.s[0] +sub v15.4s, v7.4s, v20.4s +mul v28.4S, v28.4S,v13.s[0] +str q15, [x0, #432] +ldr q15, [x0, #624] +sqrdmulh v23.4S, v15.4S, v22.s[0] +add v7.4s, v7.4s, v20.4s +mul v15.4S, v15.4S,v13.s[0] +str q7, [x0, #416] +ldr q7, [x17, #+448] +ldr q20, [x17, #+464] +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v6.4s, v26.4s +sqrdmulh v16.4S, v17.4S, v20.s[0] +str q12, [x0, #464] +ldr q12, [x0, #688] +mla v3.4S, v29.4S, v31.s[0] +add v6.4s, v6.4s, v26.4s +sqrdmulh v26.4S, v12.4S, v20.s[0] +str q6, [x0, #448] +ldr q6, [x17, #+480] +ldr q29, [x17, #+496] +mla v28.4S, v25.4S, v31.s[0] +sub v25.4s, v27.4s, v4.4s +sqrdmulh v2.4S, v10.4S, v29.s[0] +str q25, [x0, #496] +ldr q25, [x0, #752] +mla v15.4S, v23.4S, v31.s[0] +add v27.4s, v27.4s, v4.4s +sqrdmulh v4.4S, v25.4S, v29.s[0] +str q27, [x0, #480] +ldr q27, [x0, #512] +ldr q23, [x0, #640] +mul v17.4S, v17.4S,v7.s[0] +sub v21.4s, v27.4s, v18.4s +ldr q19, [x0, #528] +mul v12.4S, v12.4S,v7.s[0] +add v27.4s, v27.4s, v18.4s +ldr q18, [x0, #656] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v19.4s, v3.4s +ldr q24, [x0, #576] +mla v12.4S, v26.4S, v31.s[0] +add v19.4s, v19.4s, v3.4s +ldr q3, [x0, #704] +mul v10.4S, v10.4S,v6.s[0] +sub v26.4s, v24.4s, v28.4s +ldr q0, [x0, #592] +mul v25.4S, v25.4S,v6.s[0] +add v24.4s, v24.4s, v28.4s +ldr q28, [x0, #720] +mla v10.4S, v2.4S, v31.s[0] +mla v25.4S, v4.4S, v31.s[0] +sub v4.4s, v0.4s, v15.4s +sqrdmulh v2.4S, v19.4S, v30.s[1] +add v0.4s, v0.4s, v15.4s +mul v19.4S, v19.4S,v5.s[1] +sqrdmulh v15.4S, v16.4S, v30.s[2] +sub v11.4s, v23.4s, v17.4s +mul 
v16.4S, v16.4S,v5.s[2] +add v23.4s, v23.4s, v17.4s +sqrdmulh v30.4S, v0.4S, v22.s[1] +sub v5.4s, v18.4s, v12.4s +mul v0.4S, v0.4S,v13.s[1] +add v18.4s, v18.4s, v12.4s +sqrdmulh v12.4S, v4.4S, v22.s[2] +sub v17.4s, v3.4s, v10.4s +mul v4.4S, v4.4S,v13.s[2] +add v3.4s, v3.4s, v10.4s +mla v19.4S, v2.4S, v31.s[0] +sub v2.4s, v28.4s, v25.4s +ldr q22, [x0, #992] +sqrdmulh v13.4S, v18.4S, v20.s[1] +add v28.4s, v28.4s, v25.4s +mla v16.4S, v15.4S, v31.s[0] +ldr q15, [x0, #928] +sqrdmulh v25.4S, v5.4S, v20.s[2] +sub v10.4s, v27.4s, v19.4s +mla v0.4S, v30.4S, v31.s[0] +ldr q30, [x0, #800] +sqrdmulh v14.4S, v28.4S, v29.s[1] +add v27.4s, v27.4s, v19.4s +str q10, [x0, #528] +mla v4.4S, v12.4S, v31.s[0] +ldr q12, [x17, #+512] +ldr q10, [x17, #+528] +sqrdmulh v19.4S, v2.4S, v29.s[2] +sub v1.4s, v21.4s, v16.4s +str q27, [x0, #512] +mul v18.4S, v18.4S,v7.s[1] +add v21.4s, v21.4s, v16.4s +mul v5.4S, v5.4S,v7.s[2] +str q1, [x0, #560] +mla v18.4S, v13.4S, v31.s[0] +sub v13.4s, v24.4s, v0.4s +mla v5.4S, v25.4S, v31.s[0] +str q21, [x0, #544] +mul v28.4S, v28.4S,v6.s[1] +str q13, [x0, #592] +mul v2.4S, v2.4S,v6.s[2] +add v24.4s, v24.4s, v0.4s +str q24, [x0, #576] +mla v28.4S, v14.4S, v31.s[0] +sub v14.4s, v26.4s, v4.4s +str q14, [x0, #624] +mla v2.4S, v19.4S, v31.s[0] +add v26.4s, v26.4s, v4.4s +str q26, [x0, #608] +sqrdmulh v29.4S, v30.4S, v10.s[0] +sub v6.4s, v23.4s, v18.4s +mul v30.4S, v30.4S,v12.s[0] +str q6, [x0, #656] +ldr q6, [x0, #816] +sqrdmulh v26.4S, v6.4S, v10.s[0] +add v23.4s, v23.4s, v18.4s +mul v6.4S, v6.4S,v12.s[0] +str q23, [x0, #640] +ldr q23, [x17, #+544] +ldr q18, [x17, #+560] +ldr q4, [x0, #864] +sqrdmulh v19.4S, v4.4S, v18.s[0] +sub v14.4s, v11.4s, v5.4s +mul v4.4S, v4.4S,v23.s[0] +str q14, [x0, #688] +ldr q14, [x0, #880] +sqrdmulh v24.4S, v14.4S, v18.s[0] +add v11.4s, v11.4s, v5.4s +mul v14.4S, v14.4S,v23.s[0] +str q11, [x0, #672] +ldr q11, [x17, #+576] +ldr q5, [x17, #+592] +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v3.4s, v28.4s +sqrdmulh v0.4S, v15.4S, v5.s[0] 
+str q29, [x0, #720] +ldr q29, [x0, #944] +mla v6.4S, v26.4S, v31.s[0] +add v3.4s, v3.4s, v28.4s +sqrdmulh v28.4S, v29.4S, v5.s[0] +str q3, [x0, #704] +ldr q3, [x17, #+608] +ldr q26, [x17, #+624] +mla v4.4S, v19.4S, v31.s[0] +sub v19.4s, v17.4s, v2.4s +sqrdmulh v13.4S, v22.4S, v26.s[0] +str q19, [x0, #752] +ldr q19, [x0, #1008] +mla v14.4S, v24.4S, v31.s[0] +add v17.4s, v17.4s, v2.4s +sqrdmulh v2.4S, v19.4S, v26.s[0] +str q17, [x0, #736] +ldr q17, [x0, #768] +ldr q24, [x0, #896] +mul v15.4S, v15.4S,v11.s[0] +sub v20.4s, v17.4s, v30.4s +ldr q7, [x0, #784] +mul v29.4S, v29.4S,v11.s[0] +add v17.4s, v17.4s, v30.4s +ldr q30, [x0, #912] +mla v15.4S, v0.4S, v31.s[0] +sub v0.4s, v7.4s, v6.4s +ldr q21, [x0, #832] +mla v29.4S, v28.4S, v31.s[0] +add v7.4s, v7.4s, v6.4s +ldr q6, [x0, #960] +mul v22.4S, v22.4S,v3.s[0] +sub v28.4s, v21.4s, v4.4s +ldr q25, [x0, #848] +mul v19.4S, v19.4S,v3.s[0] +add v21.4s, v21.4s, v4.4s +ldr q4, [x0, #976] +mla v22.4S, v13.4S, v31.s[0] +mla v19.4S, v2.4S, v31.s[0] +sub v2.4s, v25.4s, v14.4s +sqrdmulh v13.4S, v7.4S, v10.s[1] +add v25.4s, v25.4s, v14.4s +mul v7.4S, v7.4S,v12.s[1] +sqrdmulh v14.4S, v0.4S, v10.s[2] +sub v1.4s, v24.4s, v15.4s +mul v0.4S, v0.4S,v12.s[2] +add v24.4s, v24.4s, v15.4s +sqrdmulh v10.4S, v25.4S, v18.s[1] +sub v12.4s, v30.4s, v29.4s +mul v25.4S, v25.4S,v23.s[1] +add v30.4s, v30.4s, v29.4s +sqrdmulh v29.4S, v2.4S, v18.s[2] +sub v15.4s, v6.4s, v22.4s +mul v2.4S, v2.4S,v23.s[2] +add v6.4s, v6.4s, v22.4s +mla v7.4S, v13.4S, v31.s[0] +sub v13.4s, v4.4s, v19.4s +sqrdmulh v18.4S, v30.4S, v5.s[1] +add v4.4s, v4.4s, v19.4s +mla v0.4S, v14.4S, v31.s[0] +sqrdmulh v14.4S, v12.4S, v5.s[2] +sub v19.4s, v17.4s, v7.4s +mla v25.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v4.4S, v26.s[1] +add v17.4s, v17.4s, v7.4s +str q19, [x0, #784] +mla v2.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v13.4S, v26.s[2] +sub v19.4s, v20.4s, v0.4s +str q17, [x0, #768] +mul v30.4S, v30.4S,v11.s[1] +add v20.4s, v20.4s, v0.4s +mul v12.4S, v12.4S,v11.s[2] +str q19, [x0, 
#816] +mla v30.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v25.4s +mla v12.4S, v14.4S, v31.s[0] +str q20, [x0, #800] +mul v4.4S, v4.4S,v3.s[1] +str q18, [x0, #848] +mul v13.4S, v13.4S,v3.s[2] +add v21.4s, v21.4s, v25.4s +str q21, [x0, #832] +mla v4.4S, v10.4S, v31.s[0] +sub v10.4s, v28.4s, v2.4s +str q10, [x0, #880] +mla v13.4S, v29.4S, v31.s[0] +add v28.4s, v28.4s, v2.4s +str q28, [x0, #864] +sub v26.4s, v24.4s, v30.4s +str q26, [x0, #912] +add v24.4s, v24.4s, v30.4s +str q24, [x0, #896] +sub v24.4s, v1.4s, v12.4s +str q24, [x0, #944] +add v1.4s, v1.4s, v12.4s +str q1, [x0, #928] +sub v1.4s, v6.4s, v4.4s +str q1, [x0, #976] +add v6.4s, v6.4s, v4.4s +str q6, [x0, #960] +sub v6.4s, v15.4s, v13.4s +str q6, [x0, #1008] +add v15.4s, v15.4s, v13.4s +str q15, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_8.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_8.s new file mode 100644 index 0000000..70d520f --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_8.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to 
whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include <hal_env.h> +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, 
block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 
6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 
// Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global ntt_u32_incomplete_neon_asm_var_4_2_7_z4_8 +.global _ntt_u32_incomplete_neon_asm_var_4_2_7_z4_8 +ntt_u32_incomplete_neon_asm_var_4_2_7_z4_8: +_ntt_u32_incomplete_neon_asm_var_4_2_7_z4_8: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #928] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #992] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q18, [x0, #800] +sqrdmulh v17.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +ldr q16, [x0, #864] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v22.4S, v21.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +mla 
v18.4S, v17.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +ldr q3, [x0, #544] +sqrdmulh v17.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q19, [x0, #608] +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q2, [x0, #672] +ldr q1, [x0, #416] +sqrdmulh v0.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v15.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #736] +ldr q14, [x0, #480] +sqrdmulh v13.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v12.4s, v14.4s, v20.4s +add v14.4s, v14.4s, v20.4s +ldr q20, [x0, #288] +mla v3.4S, v17.4S, v31.s[0] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v18.4s +mla v2.4S, v0.4S, v31.s[0] +mla v22.4S, v13.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +ldr q18, [x0, #352] +sqrdmulh v13.4S, v1.4S, v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v0.4s, v18.4s, v16.4s +sqrdmulh v17.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +add v18.4s, v18.4s, v16.4s +ldr q16, [x0, #32] +sqrdmulh v11.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v10.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #96] +sqrdmulh v9.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v8.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #160] +mla v1.4S, v13.4S, v31.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v2.4s +mla v20.4S, v11.4S, v31.s[0] +mla v18.4S, v9.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +ldr q2, [x0, #224] +sqrdmulh v9.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v11.4s, v2.4s, v22.4s +sqrdmulh v13.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v7.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +sub v6.4s, v2.4s, v14.4s +add v2.4s, v2.4s, v14.4s +mla v15.4S, v9.4S, v31.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v20.4s +mla v21.4S, v22.4S, v31.s[0] +mla v0.4S, v1.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s 
+sqrdmulh v20.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +sub v1.4s, v3.4s, v18.4s +sqrdmulh v22.4S, v6.4S, v27.s[1] +mul v6.4S, v6.4S,v28.s[1] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v27.s[0] +mul v19.4S, v19.4S,v28.s[0] +sub v9.4s, v17.4s, v15.4s +add v17.4s, v17.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v14.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +mla v7.4S, v20.4S, v31.s[0] +mla v6.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v21.4s +mla v19.4S, v18.4S, v31.s[0] +mla v2.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v15.4s, v8.4s, v0.4s +sqrdmulh v18.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +add v8.4s, v8.4s, v0.4s +sqrdmulh v0.4S, v9.4S, v27.s[3] +mul v9.4S, v9.4S,v28.s[3] +sub v20.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[3] +mul v14.4S, v14.4S,v28.s[3] +sub v12.4s, v1.4s, v6.4s +add v1.4s, v1.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v16.4s, v19.4s +mla v9.4S, v0.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v25.s[2] +mul v1.4S, v1.4S,v26.s[2] +sub v7.4s, v3.4s, v2.4s +sqrdmulh v0.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v7.4S, v25.s[1] +mul v7.4S, v7.4S,v26.s[1] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v25.s[0] +mul v3.4S, v3.4S,v26.s[0] +sub v6.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v1.4S, v19.4S, v31.s[0] +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v9.4s +mla v7.4S, v2.4S, v31.s[0] +mla v3.4S, v17.4S, v31.s[0] +add v22.4s, v22.4s, v9.4s +sqrdmulh v9.4S, v8.4S, v23.s[0] +mul v8.4S, v8.4S,v24.s[0] +sub v17.4s, v15.4s, v14.4s +sqrdmulh v2.4S, v6.4S, v23.s[1] +mul v6.4S, v6.4S,v24.s[1] +add v15.4s, v15.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v23.s[2] +mul v15.4S, v15.4S,v24.s[2] +sub v19.4s, v13.4s, 
v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +sub v11.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +mla v8.4S, v9.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v18.4s, v7.4s +str q13, [x0, #288] +mla v15.4S, v14.4S, v31.s[0] +mla v17.4S, v1.4S, v31.s[0] +add v18.4s, v18.4s, v7.4s +str q19, [x0, #352] +ldr q19, [x0, #944] +sqrdmulh v7.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v1.4s, v16.4s, v3.4s +str q20, [x0, #416] +ldr q20, [x0, #1008] +sqrdmulh v14.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v3.4s +str q11, [x0, #480] +ldr q11, [x0, #816] +sqrdmulh v3.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +sub v13.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +ldr q8, [x0, #880] +sqrdmulh v9.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v12.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +mla v19.4S, v7.4S, v31.s[0] +mla v20.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v15.4s +str q18, [x0, #160] +mla v11.4S, v3.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +str q2, [x0, #224] +ldr q2, [x0, #560] +sqrdmulh v15.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v9.4s, v0.4s, v17.4s +str q16, [x0, #32] +ldr q16, [x0, #624] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +add v0.4s, v0.4s, v17.4s +str q1, [x0, #96] +ldr q1, [x0, #688] +ldr q17, [x0, #432] +sqrdmulh v18.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +sub v7.4s, v17.4s, v19.4s +add v17.4s, v17.4s, v19.4s +ldr q19, [x0, #752] +ldr q6, [x0, #496] +sqrdmulh v5.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v4.4s, v6.4s, v20.4s +add v6.4s, v6.4s, v20.4s +ldr q20, [x0, #304] +mla v2.4S, v15.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v11.4s +str q10, [x0, #544] +mla v1.4S, v18.4S, v31.s[0] +mla v19.4S, v5.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q13, [x0, #608] +ldr q13, [x0, #368] +sqrdmulh v11.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] 
+sub v5.4s, v13.4s, v8.4s +str q21, [x0, #672] +sqrdmulh v21.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +add v13.4s, v13.4s, v8.4s +str q12, [x0, #736] +ldr q12, [x0, #48] +sqrdmulh v8.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v18.4s, v12.4s, v2.4s +add v12.4s, v12.4s, v2.4s +ldr q2, [x0, #112] +sqrdmulh v10.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v15.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +ldr q16, [x0, #176] +mla v17.4S, v11.4S, v31.s[0] +mla v6.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v1.4s +str q22, [x0, #800] +mla v20.4S, v8.4S, v31.s[0] +mla v13.4S, v10.4S, v31.s[0] +add v16.4s, v16.4s, v1.4s +str q14, [x0, #864] +ldr q14, [x0, #240] +sqrdmulh v1.4S, v7.4S, v29.s[2] +mul v7.4S, v7.4S,v30.s[2] +sub v10.4s, v14.4s, v19.4s +str q0, [x0, #928] +sqrdmulh v0.4S, v4.4S, v29.s[2] +mul v4.4S, v4.4S,v30.s[2] +add v14.4s, v14.4s, v19.4s +str q9, [x0, #992] +sqrdmulh v9.4S, v3.4S, v29.s[2] +mul v3.4S, v3.4S,v30.s[2] +sub v19.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v29.s[2] +mul v5.4S, v5.4S,v30.s[2] +sub v8.4s, v14.4s, v6.4s +add v14.4s, v14.4s, v6.4s +mla v7.4S, v1.4S, v31.s[0] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v20.4s +mla v3.4S, v9.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +sub v17.4s, v2.4s, v13.4s +sqrdmulh v9.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +add v2.4s, v2.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v27.s[0] +mul v16.4S, v16.4S,v28.s[0] +sub v1.4s, v21.4s, v7.4s +add v21.4s, v21.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v6.4s, v10.4s, v4.4s +add v10.4s, v10.4s, v4.4s +mla v19.4S, v20.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v3.4s +mla v16.4S, v13.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +sub v7.4s, v15.4s, v5.4s +sqrdmulh v13.4S, v10.4S, 
v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +add v15.4s, v15.4s, v5.4s +sqrdmulh v5.4S, v1.4S, v27.s[3] +mul v1.4S, v1.4S,v28.s[3] +sub v20.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[3] +mul v6.4S, v6.4S,v28.s[3] +sub v4.4s, v17.4s, v8.4s +add v17.4s, v17.4s, v8.4s +mla v21.4S, v3.4S, v31.s[0] +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v16.4s +mla v1.4S, v5.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v25.s[2] +mul v17.4S, v17.4S,v26.s[2] +sub v19.4s, v2.4s, v14.4s +sqrdmulh v5.4S, v4.4S, v25.s[3] +mul v4.4S, v4.4S,v26.s[3] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v3.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +mla v17.4S, v16.4S, v31.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v9.4s, v1.4s +mla v19.4S, v14.4S, v31.s[0] +mla v2.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v23.s[0] +mul v15.4S, v15.4S,v24.s[0] +sub v21.4s, v7.4s, v6.4s +sqrdmulh v14.4S, v8.4S, v23.s[1] +mul v8.4S, v8.4S,v24.s[1] +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v7.4S, v23.s[2] +mul v7.4S, v7.4S,v24.s[2] +sub v16.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v23.s[3] +mul v21.4S, v21.4S,v24.s[3] +sub v10.4s, v20.4s, v4.4s +add v20.4s, v20.4s, v4.4s +mla v15.4S, v1.4S, v31.s[0] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v19.4s +str q0, [x0, #304] +mla v7.4S, v6.4S, v31.s[0] +mla v21.4S, v17.4S, v31.s[0] +add v13.4s, v13.4s, v19.4s +str q16, [x0, #368] +ldr q16, [x0, #896] +sqrdmulh v19.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v17.4s, v12.4s, v2.4s +str q20, [x0, #432] +ldr q20, [x0, #960] +sqrdmulh v6.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v12.4s, v12.4s, v2.4s +str q10, [x0, #496] +ldr q10, [x0, #768] +sqrdmulh v2.4S, v10.4S, v29.s[0] +mul v10.4S, 
v10.4S,v30.s[0] +sub v0.4s, v18.4s, v15.4s +add v18.4s, v18.4s, v15.4s +ldr q15, [x0, #832] +sqrdmulh v1.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +sub v4.4s, v3.4s, v8.4s +add v3.4s, v3.4s, v8.4s +mla v16.4S, v19.4S, v31.s[0] +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v7.4s +str q13, [x0, #176] +mla v10.4S, v2.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v7.4s +str q14, [x0, #240] +ldr q14, [x0, #512] +sqrdmulh v7.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v1.4s, v5.4s, v21.4s +str q12, [x0, #48] +ldr q12, [x0, #576] +sqrdmulh v2.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +add v5.4s, v5.4s, v21.4s +str q17, [x0, #112] +ldr q17, [x0, #640] +ldr q21, [x0, #384] +sqrdmulh v13.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] +sub v19.4s, v21.4s, v16.4s +add v21.4s, v21.4s, v16.4s +ldr q16, [x0, #704] +ldr q8, [x0, #448] +sqrdmulh v22.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v11.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +ldr q20, [x0, #256] +mla v14.4S, v7.4S, v31.s[0] +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v10.4s +str q18, [x0, #560] +mla v17.4S, v13.4S, v31.s[0] +mla v16.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +str q0, [x0, #624] +ldr q0, [x0, #320] +sqrdmulh v10.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v22.4s, v0.4s, v15.4s +str q3, [x0, #688] +sqrdmulh v3.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +add v0.4s, v0.4s, v15.4s +str q4, [x0, #752] +ldr q4, [x0, #0] +sqrdmulh v15.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v13.4s, v4.4s, v14.4s +add v4.4s, v4.4s, v14.4s +ldr q14, [x0, #64] +sqrdmulh v18.4S, v0.4S, v29.s[1] +mul v0.4S, v0.4S,v30.s[1] +sub v7.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +ldr q12, [x0, #128] +mla v21.4S, v10.4S, v31.s[0] +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v12.4s, v17.4s +str q9, [x0, #816] +mla v20.4S, v15.4S, v31.s[0] +mla v0.4S, v18.4S, v31.s[0] +add v12.4s, v12.4s, v17.4s +str q6, [x0, #880] +ldr q6, [x0, #192] 
+sqrdmulh v17.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v18.4s, v6.4s, v16.4s +str q5, [x0, #944] +sqrdmulh v5.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +add v6.4s, v6.4s, v16.4s +str q1, [x0, #1008] +sqrdmulh v1.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v16.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v15.4s, v6.4s, v8.4s +add v6.4s, v6.4s, v8.4s +mla v19.4S, v17.4S, v31.s[0] +mla v11.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v20.4s +mla v2.4S, v1.4S, v31.s[0] +mla v22.4S, v21.4S, v31.s[0] +add v4.4s, v4.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +sub v21.4s, v14.4s, v0.4s +sqrdmulh v1.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +add v14.4s, v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v17.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[0] +mul v6.4S, v6.4S,v28.s[0] +sub v8.4s, v18.4s, v11.4s +add v18.4s, v18.4s, v11.4s +mla v16.4S, v20.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v2.4s +mla v12.4S, v0.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v27.s[2] +mul v3.4S, v3.4S,v28.s[2] +sub v19.4s, v7.4s, v22.4s +sqrdmulh v0.4S, v18.4S, v27.s[2] +mul v18.4S, v18.4S,v28.s[2] +add v7.4s, v7.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +sub v20.4s, v5.4s, v16.4s +add v5.4s, v5.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v11.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +mla v3.4S, v2.4S, v31.s[0] +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v4.4s, v12.4s +mla v17.4S, v22.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v4.4s, v4.4s, v12.4s +sqrdmulh v12.4S, v21.4S, v25.s[2] +mul v21.4S, v21.4S,v26.s[2] +sub v16.4s, v14.4s, v6.4s +sqrdmulh v22.4S, v11.4S, v25.s[3] +mul v11.4S, v11.4S,v26.s[3] +add v14.4s, v14.4s, v6.4s +sqrdmulh v6.4S, v16.4S, v25.s[1] 
+mul v16.4S, v16.4S,v26.s[1] +sub v2.4s, v13.4s, v3.4s +add v13.4s, v13.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v25.s[0] +mul v14.4S, v14.4S,v26.s[0] +sub v15.4s, v7.4s, v18.4s +add v7.4s, v7.4s, v18.4s +mla v21.4S, v12.4S, v31.s[0] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v17.4s +mla v16.4S, v6.4S, v31.s[0] +mla v14.4S, v3.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v7.4S, v23.s[0] +mul v7.4S, v7.4S,v24.s[0] +sub v3.4s, v19.4s, v8.4s +sqrdmulh v6.4S, v15.4S, v23.s[1] +mul v15.4S, v15.4S,v24.s[1] +add v19.4s, v19.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v23.s[2] +mul v19.4S, v19.4S,v24.s[2] +sub v12.4s, v5.4s, v21.4s +add v5.4s, v5.4s, v21.4s +sqrdmulh v21.4S, v3.4S, v23.s[3] +mul v3.4S, v3.4S,v24.s[3] +sub v18.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +mla v7.4S, v17.4S, v31.s[0] +mla v15.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v16.4s +str q5, [x0, #256] +mla v19.4S, v8.4S, v31.s[0] +mla v3.4S, v21.4S, v31.s[0] +add v0.4s, v0.4s, v16.4s +str q12, [x0, #320] +ldr q12, [x0, #912] +sqrdmulh v16.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v21.4s, v4.4s, v14.4s +str q20, [x0, #384] +ldr q20, [x0, #976] +sqrdmulh v8.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v4.4s, v4.4s, v14.4s +str q18, [x0, #448] +ldr q18, [x0, #784] +sqrdmulh v14.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +sub v5.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +ldr q7, [x0, #848] +sqrdmulh v17.4S, v7.4S, v29.s[0] +mul v7.4S, v7.4S,v30.s[0] +sub v11.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +mla v12.4S, v16.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v1.4s, v19.4s +str q0, [x0, #128] +mla v18.4S, v14.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v19.4s +str q6, [x0, #192] +ldr q6, [x0, #528] +sqrdmulh v19.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v17.4s, v22.4s, v3.4s +str q4, [x0, #0] +ldr q4, [x0, #592] +sqrdmulh v14.4S, v4.4S, v29.s[0] +mul v4.4S, v4.4S,v30.s[0] +add v22.4s, v22.4s, v3.4s +str q21, [x0, #64] +ldr q21, 
[x0, #656] +ldr q3, [x0, #400] +sqrdmulh v0.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v16.4s, v3.4s, v12.4s +add v3.4s, v3.4s, v12.4s +ldr q12, [x0, #720] +ldr q15, [x0, #464] +sqrdmulh v9.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v10.4s, v15.4s, v20.4s +add v15.4s, v15.4s, v20.4s +ldr q20, [x0, #272] +mla v6.4S, v19.4S, v31.s[0] +mla v4.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v18.4s +str q13, [x0, #512] +mla v21.4S, v0.4S, v31.s[0] +mla v12.4S, v9.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +str q5, [x0, #576] +ldr q5, [x0, #336] +sqrdmulh v18.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v9.4s, v5.4s, v7.4s +str q2, [x0, #640] +sqrdmulh v2.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +add v5.4s, v5.4s, v7.4s +str q11, [x0, #704] +ldr q11, [x0, #16] +sqrdmulh v7.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v0.4s, v11.4s, v6.4s +add v11.4s, v11.4s, v6.4s +ldr q6, [x0, #80] +sqrdmulh v13.4S, v5.4S, v29.s[1] +mul v5.4S, v5.4S,v30.s[1] +sub v19.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +ldr q4, [x0, #144] +mla v3.4S, v18.4S, v31.s[0] +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v4.4s, v21.4s +str q1, [x0, #768] +mla v20.4S, v7.4S, v31.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v4.4s, v4.4s, v21.4s +str q8, [x0, #832] +ldr q8, [x0, #208] +sqrdmulh v21.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +sub v13.4s, v8.4s, v12.4s +str q22, [x0, #896] +sqrdmulh v22.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +add v8.4s, v8.4s, v12.4s +str q17, [x0, #960] +sqrdmulh v17.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v12.4s, v4.4s, v3.4s +add v4.4s, v4.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v7.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +mla v16.4S, v21.4S, v31.s[0] +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v20.4s +mla v14.4S, v17.4S, v31.s[0] +mla v9.4S, v3.4S, v31.s[0] +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v12.4S, v27.s[1] +mul v12.4S, v12.4S,v28.s[1] +sub v3.4s, v6.4s, 
v5.4s +sqrdmulh v17.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v27.s[0] +mul v4.4S, v4.4S,v28.s[0] +sub v21.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[0] +mul v8.4S, v8.4S,v28.s[0] +sub v15.4s, v13.4s, v10.4s +add v13.4s, v13.4s, v10.4s +mla v12.4S, v20.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v14.4s +mla v4.4S, v5.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v16.4s, v19.4s, v9.4s +sqrdmulh v5.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +sqrdmulh v12.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v10.4s, v3.4s, v7.4s +add v3.4s, v3.4s, v7.4s +mla v2.4S, v14.4S, v31.s[0] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v4.4s +mla v21.4S, v9.4S, v31.s[0] +mla v15.4S, v12.4S, v31.s[0] +add v11.4s, v11.4s, v4.4s +sqrdmulh v4.4S, v3.4S, v25.s[2] +mul v3.4S, v3.4S,v26.s[2] +sub v12.4s, v6.4s, v8.4s +sqrdmulh v9.4S, v10.4S, v25.s[3] +mul v10.4S, v10.4S,v26.s[3] +add v6.4s, v6.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +sub v14.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v6.4S, v25.s[0] +mul v6.4S, v6.4S,v26.s[0] +sub v7.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +mla v3.4S, v4.4S, v31.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v21.4s +mla v12.4S, v8.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v23.s[0] +mul v19.4S, v19.4S,v24.s[0] +sub v2.4s, v16.4s, v15.4s +sqrdmulh v8.4S, v7.4S, v23.s[1] +mul v7.4S, v7.4S,v24.s[1] +add v16.4s, v16.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +sub v4.4s, v22.4s, v3.4s +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v2.4S, v23.s[3] +mul v2.4S, 
v2.4S,v24.s[3] +sub v13.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +mla v19.4S, v21.4S, v31.s[0] +mla v7.4S, v8.4S, v31.s[0] +sub v8.4s, v5.4s, v12.4s +str q22, [x0, #272] +mla v16.4S, v15.4S, v31.s[0] +mla v2.4S, v3.4S, v31.s[0] +add v5.4s, v5.4s, v12.4s +str q4, [x0, #336] +sub v23.4s, v11.4s, v6.4s +str q20, [x0, #400] +add v11.4s, v11.4s, v6.4s +str q13, [x0, #464] +sub v13.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sub v19.4s, v14.4s, v7.4s +add v14.4s, v14.4s, v7.4s +sub v7.4s, v17.4s, v16.4s +str q5, [x0, #144] +add v17.4s, v17.4s, v16.4s +str q8, [x0, #208] +sub v8.4s, v9.4s, v2.4s +str q11, [x0, #16] +add v9.4s, v9.4s, v2.4s +str q23, [x0, #80] +str q0, [x0, #528] +str q13, [x0, #592] +str q14, [x0, #656] +str q19, [x0, #720] +str q17, [x0, #784] +str q7, [x0, #848] +str q9, [x0, #912] +str q8, [x0, #976] +ldr q18, [x0, #224] +ldr q1, [x0, #160] +ldr q10, [x0, #32] +ldr q21, [x17, #+128] +ldr q22, [x17, #+144] +sqrdmulh v15.4S, v10.4S, v22.s[0] +mul v10.4S, v10.4S,v21.s[0] +ldr q3, [x0, #48] +sqrdmulh v12.4S, v3.4S, v22.s[0] +ldr q4, [x17, #+160] +mul v3.4S, v3.4S,v21.s[0] +ldr q30, [x17, #+176] +ldr q29, [x0, #96] +sqrdmulh v28.4S, v29.4S, v30.s[0] +mul v29.4S, v29.4S,v4.s[0] +ldr q27, [x0, #112] +sqrdmulh v26.4S, v27.4S, v30.s[0] +mul v27.4S, v27.4S,v4.s[0] +ldr q25, [x17, #+192] +mla v10.4S, v15.4S, v31.s[0] +ldr q15, [x17, #+208] +sqrdmulh v24.4S, v1.4S, v15.s[0] +ldr q20, [x0, #176] +mla v3.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v20.4S, v15.s[0] +ldr q6, [x17, #+224] +mla v29.4S, v28.4S, v31.s[0] +ldr q28, [x17, #+240] +sqrdmulh v5.4S, v18.4S, v28.s[0] +ldr q16, [x0, #240] +mla v27.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v16.4S, v28.s[0] +ldr q11, [x0, #0] +ldr q2, [x0, #128] +mul v1.4S, v1.4S,v25.s[0] +sub v23.4s, v11.4s, v10.4s +ldr q0, [x0, #16] +mul v20.4S, v20.4S,v25.s[0] +add v11.4s, v11.4s, v10.4s +ldr q10, [x0, #144] +mla v1.4S, v24.4S, v31.s[0] +sub v24.4s, v0.4s, v3.4s +ldr q13, [x0, #64] +mla v20.4S, v12.4S, v31.s[0] +add v0.4s, v0.4s, 
v3.4s +ldr q3, [x0, #192] +mul v18.4S, v18.4S,v6.s[0] +sub v12.4s, v13.4s, v29.4s +ldr q14, [x0, #80] +mul v16.4S, v16.4S,v6.s[0] +add v13.4s, v13.4s, v29.4s +ldr q29, [x0, #208] +mla v18.4S, v5.4S, v31.s[0] +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v14.4s, v27.4s +sqrdmulh v5.4S, v0.4S, v22.s[1] +add v14.4s, v14.4s, v27.4s +mul v0.4S, v0.4S,v21.s[1] +sqrdmulh v27.4S, v24.4S, v22.s[2] +sub v19.4s, v2.4s, v1.4s +mul v24.4S, v24.4S,v21.s[2] +add v2.4s, v2.4s, v1.4s +sqrdmulh v22.4S, v14.4S, v30.s[1] +sub v21.4s, v10.4s, v20.4s +mul v14.4S, v14.4S,v4.s[1] +add v10.4s, v10.4s, v20.4s +sqrdmulh v20.4S, v26.4S, v30.s[2] +sub v1.4s, v3.4s, v18.4s +mul v26.4S, v26.4S,v4.s[2] +add v3.4s, v3.4s, v18.4s +mla v0.4S, v5.4S, v31.s[0] +sub v5.4s, v29.4s, v16.4s +ldr q30, [x0, #480] +sqrdmulh v4.4S, v10.4S, v15.s[1] +add v29.4s, v29.4s, v16.4s +mla v24.4S, v27.4S, v31.s[0] +ldr q27, [x0, #416] +sqrdmulh v16.4S, v21.4S, v15.s[2] +sub v18.4s, v11.4s, v0.4s +mla v14.4S, v22.4S, v31.s[0] +ldr q22, [x0, #288] +sqrdmulh v17.4S, v29.4S, v28.s[1] +add v11.4s, v11.4s, v0.4s +str q18, [x0, #16] +mla v26.4S, v20.4S, v31.s[0] +ldr q20, [x17, #+256] +sqrdmulh v18.4S, v5.4S, v28.s[2] +sub v0.4s, v23.4s, v24.4s +str q11, [x0, #0] +mul v10.4S, v10.4S,v25.s[1] +add v23.4s, v23.4s, v24.4s +ldr q24, [x17, #+272] +mul v21.4S, v21.4S,v25.s[2] +str q0, [x0, #48] +mla v10.4S, v4.4S, v31.s[0] +sub v4.4s, v13.4s, v14.4s +mla v21.4S, v16.4S, v31.s[0] +str q23, [x0, #32] +mul v29.4S, v29.4S,v6.s[1] +str q4, [x0, #80] +mul v5.4S, v5.4S,v6.s[2] +add v13.4s, v13.4s, v14.4s +str q13, [x0, #64] +mla v29.4S, v17.4S, v31.s[0] +sub v17.4s, v12.4s, v26.4s +str q17, [x0, #112] +mla v5.4S, v18.4S, v31.s[0] +add v12.4s, v12.4s, v26.4s +str q12, [x0, #96] +sqrdmulh v28.4S, v22.4S, v24.s[0] +sub v6.4s, v2.4s, v10.4s +mul v22.4S, v22.4S,v20.s[0] +str q6, [x0, #144] +ldr q6, [x0, #304] +sqrdmulh v12.4S, v6.4S, v24.s[0] +add v2.4s, v2.4s, v10.4s +ldr q10, [x17, #+288] +mul v6.4S, v6.4S,v20.s[0] +str q2, [x0, #128] +ldr q2, 
[x17, #+304] +ldr q26, [x0, #352] +sqrdmulh v18.4S, v26.4S, v2.s[0] +sub v17.4s, v19.4s, v21.4s +mul v26.4S, v26.4S,v10.s[0] +str q17, [x0, #176] +ldr q17, [x0, #368] +sqrdmulh v13.4S, v17.4S, v2.s[0] +add v19.4s, v19.4s, v21.4s +mul v17.4S, v17.4S,v10.s[0] +str q19, [x0, #160] +ldr q19, [x17, #+320] +mla v22.4S, v28.4S, v31.s[0] +sub v28.4s, v3.4s, v29.4s +ldr q21, [x17, #+336] +sqrdmulh v14.4S, v27.4S, v21.s[0] +str q28, [x0, #208] +ldr q28, [x0, #432] +mla v6.4S, v12.4S, v31.s[0] +add v3.4s, v3.4s, v29.4s +sqrdmulh v29.4S, v28.4S, v21.s[0] +str q3, [x0, #192] +ldr q3, [x17, #+352] +mla v26.4S, v18.4S, v31.s[0] +sub v18.4s, v1.4s, v5.4s +ldr q12, [x17, #+368] +sqrdmulh v4.4S, v30.4S, v12.s[0] +str q18, [x0, #240] +ldr q18, [x0, #496] +mla v17.4S, v13.4S, v31.s[0] +add v1.4s, v1.4s, v5.4s +sqrdmulh v5.4S, v18.4S, v12.s[0] +str q1, [x0, #224] +ldr q1, [x0, #256] +ldr q13, [x0, #384] +mul v27.4S, v27.4S,v19.s[0] +sub v15.4s, v1.4s, v22.4s +ldr q25, [x0, #272] +mul v28.4S, v28.4S,v19.s[0] +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #400] +mla v27.4S, v14.4S, v31.s[0] +sub v14.4s, v25.4s, v6.4s +ldr q23, [x0, #320] +mla v28.4S, v29.4S, v31.s[0] +add v25.4s, v25.4s, v6.4s +ldr q6, [x0, #448] +mul v30.4S, v30.4S,v3.s[0] +sub v29.4s, v23.4s, v26.4s +ldr q16, [x0, #336] +mul v18.4S, v18.4S,v3.s[0] +add v23.4s, v23.4s, v26.4s +ldr q26, [x0, #464] +mla v30.4S, v4.4S, v31.s[0] +mla v18.4S, v5.4S, v31.s[0] +sub v5.4s, v16.4s, v17.4s +sqrdmulh v4.4S, v25.4S, v24.s[1] +add v16.4s, v16.4s, v17.4s +mul v25.4S, v25.4S,v20.s[1] +sqrdmulh v17.4S, v14.4S, v24.s[2] +sub v0.4s, v13.4s, v27.4s +mul v14.4S, v14.4S,v20.s[2] +add v13.4s, v13.4s, v27.4s +sqrdmulh v24.4S, v16.4S, v2.s[1] +sub v20.4s, v22.4s, v28.4s +mul v16.4S, v16.4S,v10.s[1] +add v22.4s, v22.4s, v28.4s +sqrdmulh v28.4S, v5.4S, v2.s[2] +sub v27.4s, v6.4s, v30.4s +mul v5.4S, v5.4S,v10.s[2] +add v6.4s, v6.4s, v30.4s +mla v25.4S, v4.4S, v31.s[0] +sub v4.4s, v26.4s, v18.4s +ldr q2, [x0, #736] +sqrdmulh v10.4S, v22.4S, v21.s[1] 
+add v26.4s, v26.4s, v18.4s +mla v14.4S, v17.4S, v31.s[0] +ldr q17, [x0, #672] +sqrdmulh v18.4S, v20.4S, v21.s[2] +sub v30.4s, v1.4s, v25.4s +mla v16.4S, v24.4S, v31.s[0] +ldr q24, [x0, #544] +sqrdmulh v11.4S, v26.4S, v12.s[1] +add v1.4s, v1.4s, v25.4s +str q30, [x0, #272] +mla v5.4S, v28.4S, v31.s[0] +ldr q28, [x17, #+384] +sqrdmulh v30.4S, v4.4S, v12.s[2] +sub v25.4s, v15.4s, v14.4s +str q1, [x0, #256] +mul v22.4S, v22.4S,v19.s[1] +add v15.4s, v15.4s, v14.4s +ldr q14, [x17, #+400] +mul v20.4S, v20.4S,v19.s[2] +str q25, [x0, #304] +mla v22.4S, v10.4S, v31.s[0] +sub v10.4s, v23.4s, v16.4s +mla v20.4S, v18.4S, v31.s[0] +str q15, [x0, #288] +mul v26.4S, v26.4S,v3.s[1] +str q10, [x0, #336] +mul v4.4S, v4.4S,v3.s[2] +add v23.4s, v23.4s, v16.4s +str q23, [x0, #320] +mla v26.4S, v11.4S, v31.s[0] +sub v11.4s, v29.4s, v5.4s +str q11, [x0, #368] +mla v4.4S, v30.4S, v31.s[0] +add v29.4s, v29.4s, v5.4s +str q29, [x0, #352] +sqrdmulh v12.4S, v24.4S, v14.s[0] +sub v3.4s, v13.4s, v22.4s +mul v24.4S, v24.4S,v28.s[0] +str q3, [x0, #400] +ldr q3, [x0, #560] +sqrdmulh v29.4S, v3.4S, v14.s[0] +add v13.4s, v13.4s, v22.4s +ldr q22, [x17, #+416] +mul v3.4S, v3.4S,v28.s[0] +str q13, [x0, #384] +ldr q13, [x17, #+432] +ldr q5, [x0, #608] +sqrdmulh v30.4S, v5.4S, v13.s[0] +sub v11.4s, v0.4s, v20.4s +mul v5.4S, v5.4S,v22.s[0] +str q11, [x0, #432] +ldr q11, [x0, #624] +sqrdmulh v23.4S, v11.4S, v13.s[0] +add v0.4s, v0.4s, v20.4s +mul v11.4S, v11.4S,v22.s[0] +str q0, [x0, #416] +ldr q0, [x17, #+448] +mla v24.4S, v12.4S, v31.s[0] +sub v12.4s, v6.4s, v26.4s +ldr q20, [x17, #+464] +sqrdmulh v16.4S, v17.4S, v20.s[0] +str q12, [x0, #464] +ldr q12, [x0, #688] +mla v3.4S, v29.4S, v31.s[0] +add v6.4s, v6.4s, v26.4s +sqrdmulh v26.4S, v12.4S, v20.s[0] +str q6, [x0, #448] +ldr q6, [x17, #+480] +mla v5.4S, v30.4S, v31.s[0] +sub v30.4s, v27.4s, v4.4s +ldr q29, [x17, #+496] +sqrdmulh v10.4S, v2.4S, v29.s[0] +str q30, [x0, #496] +ldr q30, [x0, #752] +mla v11.4S, v23.4S, v31.s[0] +add v27.4s, v27.4s, v4.4s 
+sqrdmulh v4.4S, v30.4S, v29.s[0] +str q27, [x0, #480] +ldr q27, [x0, #512] +ldr q23, [x0, #640] +mul v17.4S, v17.4S,v0.s[0] +sub v21.4s, v27.4s, v24.4s +ldr q19, [x0, #528] +mul v12.4S, v12.4S,v0.s[0] +add v27.4s, v27.4s, v24.4s +ldr q24, [x0, #656] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v19.4s, v3.4s +ldr q15, [x0, #576] +mla v12.4S, v26.4S, v31.s[0] +add v19.4s, v19.4s, v3.4s +ldr q3, [x0, #704] +mul v2.4S, v2.4S,v6.s[0] +sub v26.4s, v15.4s, v5.4s +ldr q18, [x0, #592] +mul v30.4S, v30.4S,v6.s[0] +add v15.4s, v15.4s, v5.4s +ldr q5, [x0, #720] +mla v2.4S, v10.4S, v31.s[0] +mla v30.4S, v4.4S, v31.s[0] +sub v4.4s, v18.4s, v11.4s +sqrdmulh v10.4S, v19.4S, v14.s[1] +add v18.4s, v18.4s, v11.4s +mul v19.4S, v19.4S,v28.s[1] +sqrdmulh v11.4S, v16.4S, v14.s[2] +sub v25.4s, v23.4s, v17.4s +mul v16.4S, v16.4S,v28.s[2] +add v23.4s, v23.4s, v17.4s +sqrdmulh v14.4S, v18.4S, v13.s[1] +sub v28.4s, v24.4s, v12.4s +mul v18.4S, v18.4S,v22.s[1] +add v24.4s, v24.4s, v12.4s +sqrdmulh v12.4S, v4.4S, v13.s[2] +sub v17.4s, v3.4s, v2.4s +mul v4.4S, v4.4S,v22.s[2] +add v3.4s, v3.4s, v2.4s +mla v19.4S, v10.4S, v31.s[0] +sub v10.4s, v5.4s, v30.4s +ldr q13, [x0, #992] +sqrdmulh v22.4S, v24.4S, v20.s[1] +add v5.4s, v5.4s, v30.4s +mla v16.4S, v11.4S, v31.s[0] +ldr q11, [x0, #928] +sqrdmulh v30.4S, v28.4S, v20.s[2] +sub v2.4s, v27.4s, v19.4s +mla v18.4S, v14.4S, v31.s[0] +ldr q14, [x0, #800] +sqrdmulh v1.4S, v5.4S, v29.s[1] +add v27.4s, v27.4s, v19.4s +str q2, [x0, #528] +mla v4.4S, v12.4S, v31.s[0] +ldr q12, [x17, #+512] +sqrdmulh v2.4S, v10.4S, v29.s[2] +sub v19.4s, v21.4s, v16.4s +str q27, [x0, #512] +mul v24.4S, v24.4S,v0.s[1] +add v21.4s, v21.4s, v16.4s +ldr q16, [x17, #+528] +mul v28.4S, v28.4S,v0.s[2] +str q19, [x0, #560] +mla v24.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v18.4s +mla v28.4S, v30.4S, v31.s[0] +str q21, [x0, #544] +mul v5.4S, v5.4S,v6.s[1] +str q22, [x0, #592] +mul v10.4S, v10.4S,v6.s[2] +add v15.4s, v15.4s, v18.4s +str q15, [x0, #576] +mla v5.4S, v1.4S, v31.s[0] +sub 
v1.4s, v26.4s, v4.4s +str q1, [x0, #624] +mla v10.4S, v2.4S, v31.s[0] +add v26.4s, v26.4s, v4.4s +str q26, [x0, #608] +sqrdmulh v29.4S, v14.4S, v16.s[0] +sub v6.4s, v23.4s, v24.4s +mul v14.4S, v14.4S,v12.s[0] +str q6, [x0, #656] +ldr q6, [x0, #816] +sqrdmulh v26.4S, v6.4S, v16.s[0] +add v23.4s, v23.4s, v24.4s +ldr q24, [x17, #+544] +mul v6.4S, v6.4S,v12.s[0] +str q23, [x0, #640] +ldr q23, [x17, #+560] +ldr q4, [x0, #864] +sqrdmulh v2.4S, v4.4S, v23.s[0] +sub v1.4s, v25.4s, v28.4s +mul v4.4S, v4.4S,v24.s[0] +str q1, [x0, #688] +ldr q1, [x0, #880] +sqrdmulh v15.4S, v1.4S, v23.s[0] +add v25.4s, v25.4s, v28.4s +mul v1.4S, v1.4S,v24.s[0] +str q25, [x0, #672] +ldr q25, [x17, #+576] +mla v14.4S, v29.4S, v31.s[0] +sub v29.4s, v3.4s, v5.4s +ldr q28, [x17, #+592] +sqrdmulh v18.4S, v11.4S, v28.s[0] +str q29, [x0, #720] +ldr q29, [x0, #944] +mla v6.4S, v26.4S, v31.s[0] +add v3.4s, v3.4s, v5.4s +sqrdmulh v5.4S, v29.4S, v28.s[0] +str q3, [x0, #704] +ldr q3, [x17, #+608] +mla v4.4S, v2.4S, v31.s[0] +sub v2.4s, v17.4s, v10.4s +ldr q26, [x17, #+624] +sqrdmulh v22.4S, v13.4S, v26.s[0] +str q2, [x0, #752] +ldr q2, [x0, #1008] +mla v1.4S, v15.4S, v31.s[0] +add v17.4s, v17.4s, v10.4s +sqrdmulh v10.4S, v2.4S, v26.s[0] +str q17, [x0, #736] +ldr q17, [x0, #768] +ldr q15, [x0, #896] +mul v11.4S, v11.4S,v25.s[0] +sub v20.4s, v17.4s, v14.4s +ldr q0, [x0, #784] +mul v29.4S, v29.4S,v25.s[0] +add v17.4s, v17.4s, v14.4s +ldr q14, [x0, #912] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v0.4s, v6.4s +ldr q21, [x0, #832] +mla v29.4S, v5.4S, v31.s[0] +add v0.4s, v0.4s, v6.4s +ldr q6, [x0, #960] +mul v13.4S, v13.4S,v3.s[0] +sub v5.4s, v21.4s, v4.4s +ldr q30, [x0, #848] +mul v2.4S, v2.4S,v3.s[0] +add v21.4s, v21.4s, v4.4s +ldr q4, [x0, #976] +mla v13.4S, v22.4S, v31.s[0] +mla v2.4S, v10.4S, v31.s[0] +sub v10.4s, v30.4s, v1.4s +sqrdmulh v22.4S, v0.4S, v16.s[1] +add v30.4s, v30.4s, v1.4s +mul v0.4S, v0.4S,v12.s[1] +sqrdmulh v1.4S, v18.4S, v16.s[2] +sub v19.4s, v15.4s, v11.4s +mul v18.4S, v18.4S,v12.s[2] 
+add v15.4s, v15.4s, v11.4s +sqrdmulh v16.4S, v30.4S, v23.s[1] +sub v12.4s, v14.4s, v29.4s +mul v30.4S, v30.4S,v24.s[1] +add v14.4s, v14.4s, v29.4s +sqrdmulh v29.4S, v10.4S, v23.s[2] +sub v11.4s, v6.4s, v13.4s +mul v10.4S, v10.4S,v24.s[2] +add v6.4s, v6.4s, v13.4s +mla v0.4S, v22.4S, v31.s[0] +sub v22.4s, v4.4s, v2.4s +sqrdmulh v23.4S, v14.4S, v28.s[1] +add v4.4s, v4.4s, v2.4s +mla v18.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v12.4S, v28.s[2] +sub v2.4s, v17.4s, v0.4s +mla v30.4S, v16.4S, v31.s[0] +sqrdmulh v16.4S, v4.4S, v26.s[1] +add v17.4s, v17.4s, v0.4s +str q2, [x0, #784] +mla v10.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v22.4S, v26.s[2] +sub v2.4s, v20.4s, v18.4s +str q17, [x0, #768] +mul v14.4S, v14.4S,v25.s[1] +add v20.4s, v20.4s, v18.4s +mul v12.4S, v12.4S,v25.s[2] +str q2, [x0, #816] +mla v14.4S, v23.4S, v31.s[0] +sub v23.4s, v21.4s, v30.4s +mla v12.4S, v1.4S, v31.s[0] +str q20, [x0, #800] +mul v4.4S, v4.4S,v3.s[1] +str q23, [x0, #848] +mul v22.4S, v22.4S,v3.s[2] +add v21.4s, v21.4s, v30.4s +str q21, [x0, #832] +mla v4.4S, v16.4S, v31.s[0] +sub v16.4s, v5.4s, v10.4s +str q16, [x0, #880] +mla v22.4S, v29.4S, v31.s[0] +add v5.4s, v5.4s, v10.4s +str q5, [x0, #864] +sub v26.4s, v15.4s, v14.4s +str q26, [x0, #912] +add v15.4s, v15.4s, v14.4s +str q15, [x0, #896] +sub v15.4s, v19.4s, v12.4s +str q15, [x0, #944] +add v19.4s, v19.4s, v12.4s +str q19, [x0, #928] +sub v19.4s, v6.4s, v4.4s +str q19, [x0, #976] +add v6.4s, v6.4s, v4.4s +str q6, [x0, #960] +sub v6.4s, v11.4s, v22.4s +str q6, [x0, #1008] +add v11.4s, v11.4s, v22.4s +str q11, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at 
end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_9.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_9.s new file mode 100644 index 0000000..24f5a8a --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_9.s @@ -0,0 +1,1502 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +// NOTE(review): the operand of the #include below is missing in this copy; it presumably named the header that defines ASM_LOAD -- restore it from the generator output. +#include +// modulus: one 16-byte vector whose first word is -33556993 and whose remaining three words are 0. +// The routine below loads it with a single ldr into q31 and uses v31.s[0] as the mla multiplier. +// The file name suggests the modulus itself is 33556993, i.e. it is stored negated here -- confirm against the generator. +modulus: +.word -33556993 +.word 0 +.word 0 +.word 0 +// roots_merged: table of roots that the routine below reads through x17 in 16-byte (4-word) groups. +// Groups come in pairs carrying the same Layer/block labels: in the visible code the first group of a pair +// is the by-element operand of mul and the second group is the by-element operand of sqrdmulh. +// Presumably the second group holds each root pre-scaled for the rounding doubling high multiply -- TODO confirm. +// Words labelled 'Layer None, block None' are zeros that pad a group to four words. +// NOTE(review): .align 6 presumably requests 64-byte alignment (power-of-two form) -- confirm for the target assembler. +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global
ntt_u32_incomplete_neon_asm_var_4_2_7_z4_9 +.global _ntt_u32_incomplete_neon_asm_var_4_2_7_z4_9 +ntt_u32_incomplete_neon_asm_var_4_2_7_z4_9: +_ntt_u32_incomplete_neon_asm_var_4_2_7_z4_9: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #928] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #992] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q18, [x0, #800] +sqrdmulh v17.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +ldr q16, [x0, #864] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v22.4S, v21.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +mla v18.4S, v17.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +ldr q3, [x0, #544] +sqrdmulh v17.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q19, [x0, #608] +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q2, [x0, #672] +ldr q1, [x0, #416] +sqrdmulh v0.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v15.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #736] +ldr q14, [x0, #480] +sqrdmulh v13.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v12.4s, v14.4s, v20.4s +add v14.4s, v14.4s, v20.4s +ldr q20, [x0, #288] +mla v3.4S, v17.4S, v31.s[0] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v18.4s +mla v2.4S, v0.4S, v31.s[0] +mla v22.4S, v13.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +ldr q18, [x0, #352] +sqrdmulh v13.4S, v1.4S, 
v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v0.4s, v18.4s, v16.4s +sqrdmulh v17.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +add v18.4s, v18.4s, v16.4s +ldr q16, [x0, #32] +sqrdmulh v11.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v10.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #96] +sqrdmulh v9.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v8.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #160] +mla v1.4S, v13.4S, v31.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v2.4s +mla v20.4S, v11.4S, v31.s[0] +mla v18.4S, v9.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +ldr q2, [x0, #224] +sqrdmulh v9.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v11.4s, v2.4s, v22.4s +sqrdmulh v13.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v7.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +sub v6.4s, v2.4s, v14.4s +add v2.4s, v2.4s, v14.4s +mla v15.4S, v9.4S, v31.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v20.4s +mla v21.4S, v22.4S, v31.s[0] +mla v0.4S, v1.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +sub v1.4s, v3.4s, v18.4s +sqrdmulh v22.4S, v6.4S, v27.s[1] +mul v6.4S, v6.4S,v28.s[1] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v27.s[0] +mul v19.4S, v19.4S,v28.s[0] +sub v9.4s, v17.4s, v15.4s +add v17.4s, v17.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v14.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +mla v7.4S, v20.4S, v31.s[0] +mla v6.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v21.4s +mla v19.4S, v18.4S, v31.s[0] +mla v2.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v15.4s, v8.4s, v0.4s +sqrdmulh v18.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +add v8.4s, v8.4s, v0.4s +sqrdmulh 
v0.4S, v9.4S, v27.s[3] +mul v9.4S, v9.4S,v28.s[3] +sub v20.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[3] +mul v14.4S, v14.4S,v28.s[3] +sub v12.4s, v1.4s, v6.4s +add v1.4s, v1.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v16.4s, v19.4s +mla v9.4S, v0.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v25.s[2] +mul v1.4S, v1.4S,v26.s[2] +sub v7.4s, v3.4s, v2.4s +sqrdmulh v0.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v7.4S, v25.s[1] +mul v7.4S, v7.4S,v26.s[1] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v25.s[0] +mul v3.4S, v3.4S,v26.s[0] +sub v6.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v1.4S, v19.4S, v31.s[0] +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v9.4s +mla v7.4S, v2.4S, v31.s[0] +mla v3.4S, v17.4S, v31.s[0] +add v22.4s, v22.4s, v9.4s +sqrdmulh v9.4S, v8.4S, v23.s[0] +mul v8.4S, v8.4S,v24.s[0] +sub v17.4s, v15.4s, v14.4s +sqrdmulh v2.4S, v6.4S, v23.s[1] +mul v6.4S, v6.4S,v24.s[1] +add v15.4s, v15.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v23.s[2] +mul v15.4S, v15.4S,v24.s[2] +sub v19.4s, v13.4s, v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +sub v11.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +mla v8.4S, v9.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v18.4s, v7.4s +str q13, [x0, #288] +mla v15.4S, v14.4S, v31.s[0] +mla v17.4S, v1.4S, v31.s[0] +add v18.4s, v18.4s, v7.4s +str q19, [x0, #352] +ldr q19, [x0, #944] +sqrdmulh v7.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v1.4s, v16.4s, v3.4s +str q20, [x0, #416] +ldr q20, [x0, #1008] +sqrdmulh v14.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v16.4s, v16.4s, v3.4s +str q11, [x0, #480] +ldr q11, [x0, #816] +sqrdmulh v3.4S, v11.4S, v29.s[0] +mul v11.4S, v11.4S,v30.s[0] +sub v13.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +ldr q8, 
[x0, #880] +sqrdmulh v9.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v12.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +mla v19.4S, v7.4S, v31.s[0] +mla v20.4S, v14.4S, v31.s[0] +sub v14.4s, v22.4s, v15.4s +str q18, [x0, #160] +mla v11.4S, v3.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +add v22.4s, v22.4s, v15.4s +str q2, [x0, #224] +ldr q2, [x0, #560] +sqrdmulh v15.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v9.4s, v0.4s, v17.4s +str q16, [x0, #32] +ldr q16, [x0, #624] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +add v0.4s, v0.4s, v17.4s +str q1, [x0, #96] +ldr q1, [x0, #688] +ldr q17, [x0, #432] +sqrdmulh v18.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +sub v7.4s, v17.4s, v19.4s +add v17.4s, v17.4s, v19.4s +ldr q19, [x0, #752] +ldr q6, [x0, #496] +sqrdmulh v5.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v4.4s, v6.4s, v20.4s +add v6.4s, v6.4s, v20.4s +ldr q20, [x0, #304] +mla v2.4S, v15.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +sub v3.4s, v20.4s, v11.4s +str q10, [x0, #544] +mla v1.4S, v18.4S, v31.s[0] +mla v19.4S, v5.4S, v31.s[0] +add v20.4s, v20.4s, v11.4s +str q13, [x0, #608] +ldr q13, [x0, #368] +sqrdmulh v11.4S, v17.4S, v29.s[1] +mul v17.4S, v17.4S,v30.s[1] +sub v5.4s, v13.4s, v8.4s +str q21, [x0, #672] +sqrdmulh v21.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +add v13.4s, v13.4s, v8.4s +str q12, [x0, #736] +ldr q12, [x0, #48] +sqrdmulh v8.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v18.4s, v12.4s, v2.4s +add v12.4s, v12.4s, v2.4s +ldr q2, [x0, #112] +sqrdmulh v10.4S, v13.4S, v29.s[1] +mul v13.4S, v13.4S,v30.s[1] +sub v15.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +ldr q16, [x0, #176] +mla v17.4S, v11.4S, v31.s[0] +mla v6.4S, v21.4S, v31.s[0] +sub v21.4s, v16.4s, v1.4s +str q22, [x0, #800] +mla v20.4S, v8.4S, v31.s[0] +mla v13.4S, v10.4S, v31.s[0] +add v16.4s, v16.4s, v1.4s +str q14, [x0, #864] +ldr q14, [x0, #240] +sqrdmulh v1.4S, v7.4S, v29.s[2] +mul v7.4S, v7.4S,v30.s[2] +sub v10.4s, v14.4s, v19.4s 
+str q0, [x0, #928] +sqrdmulh v0.4S, v4.4S, v29.s[2] +mul v4.4S, v4.4S,v30.s[2] +add v14.4s, v14.4s, v19.4s +str q9, [x0, #992] +sqrdmulh v9.4S, v3.4S, v29.s[2] +mul v3.4S, v3.4S,v30.s[2] +sub v19.4s, v16.4s, v17.4s +add v16.4s, v16.4s, v17.4s +sqrdmulh v17.4S, v5.4S, v29.s[2] +mul v5.4S, v5.4S,v30.s[2] +sub v8.4s, v14.4s, v6.4s +add v14.4s, v14.4s, v6.4s +mla v7.4S, v1.4S, v31.s[0] +mla v4.4S, v0.4S, v31.s[0] +sub v0.4s, v12.4s, v20.4s +mla v3.4S, v9.4S, v31.s[0] +mla v5.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v20.4s +sqrdmulh v20.4S, v19.4S, v27.s[1] +mul v19.4S, v19.4S,v28.s[1] +sub v17.4s, v2.4s, v13.4s +sqrdmulh v9.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +add v2.4s, v2.4s, v13.4s +sqrdmulh v13.4S, v16.4S, v27.s[0] +mul v16.4S, v16.4S,v28.s[0] +sub v1.4s, v21.4s, v7.4s +add v21.4s, v21.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[0] +mul v14.4S, v14.4S,v28.s[0] +sub v6.4s, v10.4s, v4.4s +add v10.4s, v10.4s, v4.4s +mla v19.4S, v20.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +sub v9.4s, v18.4s, v3.4s +mla v16.4S, v13.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v18.4s, v18.4s, v3.4s +sqrdmulh v3.4S, v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +sub v7.4s, v15.4s, v5.4s +sqrdmulh v13.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +add v15.4s, v15.4s, v5.4s +sqrdmulh v5.4S, v1.4S, v27.s[3] +mul v1.4S, v1.4S,v28.s[3] +sub v20.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[3] +mul v6.4S, v6.4S,v28.s[3] +sub v4.4s, v17.4s, v8.4s +add v17.4s, v17.4s, v8.4s +mla v21.4S, v3.4S, v31.s[0] +mla v10.4S, v13.4S, v31.s[0] +sub v13.4s, v12.4s, v16.4s +mla v1.4S, v5.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +sqrdmulh v16.4S, v17.4S, v25.s[2] +mul v17.4S, v17.4S,v26.s[2] +sub v19.4s, v2.4s, v14.4s +sqrdmulh v5.4S, v4.4S, v25.s[3] +mul v4.4S, v4.4S,v26.s[3] +add v2.4s, v2.4s, v14.4s +sqrdmulh v14.4S, v19.4S, v25.s[1] +mul v19.4S, v19.4S,v26.s[1] +sub v3.4s, v18.4s, v21.4s +add v18.4s, v18.4s, v21.4s +sqrdmulh v21.4S, 
v2.4S, v25.s[0] +mul v2.4S, v2.4S,v26.s[0] +sub v8.4s, v15.4s, v10.4s +add v15.4s, v15.4s, v10.4s +mla v17.4S, v16.4S, v31.s[0] +mla v4.4S, v5.4S, v31.s[0] +sub v5.4s, v9.4s, v1.4s +mla v19.4S, v14.4S, v31.s[0] +mla v2.4S, v21.4S, v31.s[0] +add v9.4s, v9.4s, v1.4s +sqrdmulh v1.4S, v15.4S, v23.s[0] +mul v15.4S, v15.4S,v24.s[0] +sub v21.4s, v7.4s, v6.4s +sqrdmulh v14.4S, v8.4S, v23.s[1] +mul v8.4S, v8.4S,v24.s[1] +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v7.4S, v23.s[2] +mul v7.4S, v7.4S,v24.s[2] +sub v16.4s, v0.4s, v17.4s +add v0.4s, v0.4s, v17.4s +sqrdmulh v17.4S, v21.4S, v23.s[3] +mul v21.4S, v21.4S,v24.s[3] +sub v10.4s, v20.4s, v4.4s +add v20.4s, v20.4s, v4.4s +mla v15.4S, v1.4S, v31.s[0] +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v19.4s +str q0, [x0, #304] +mla v7.4S, v6.4S, v31.s[0] +mla v21.4S, v17.4S, v31.s[0] +add v13.4s, v13.4s, v19.4s +str q16, [x0, #368] +ldr q16, [x0, #896] +sqrdmulh v19.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v17.4s, v12.4s, v2.4s +str q20, [x0, #432] +ldr q20, [x0, #960] +sqrdmulh v6.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v12.4s, v12.4s, v2.4s +str q10, [x0, #496] +ldr q10, [x0, #768] +sqrdmulh v2.4S, v10.4S, v29.s[0] +mul v10.4S, v10.4S,v30.s[0] +sub v0.4s, v18.4s, v15.4s +add v18.4s, v18.4s, v15.4s +ldr q15, [x0, #832] +sqrdmulh v1.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +sub v4.4s, v3.4s, v8.4s +add v3.4s, v3.4s, v8.4s +mla v16.4S, v19.4S, v31.s[0] +mla v20.4S, v6.4S, v31.s[0] +sub v6.4s, v9.4s, v7.4s +str q13, [x0, #176] +mla v10.4S, v2.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +add v9.4s, v9.4s, v7.4s +str q14, [x0, #240] +ldr q14, [x0, #512] +sqrdmulh v7.4S, v14.4S, v29.s[0] +mul v14.4S, v14.4S,v30.s[0] +sub v1.4s, v5.4s, v21.4s +str q12, [x0, #48] +ldr q12, [x0, #576] +sqrdmulh v2.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +add v5.4s, v5.4s, v21.4s +str q17, [x0, #112] +ldr q17, [x0, #640] +ldr q21, [x0, #384] +sqrdmulh v13.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] 
+sub v19.4s, v21.4s, v16.4s +add v21.4s, v21.4s, v16.4s +ldr q16, [x0, #704] +ldr q8, [x0, #448] +sqrdmulh v22.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +sub v11.4s, v8.4s, v20.4s +add v8.4s, v8.4s, v20.4s +ldr q20, [x0, #256] +mla v14.4S, v7.4S, v31.s[0] +mla v12.4S, v2.4S, v31.s[0] +sub v2.4s, v20.4s, v10.4s +str q18, [x0, #560] +mla v17.4S, v13.4S, v31.s[0] +mla v16.4S, v22.4S, v31.s[0] +add v20.4s, v20.4s, v10.4s +str q0, [x0, #624] +ldr q0, [x0, #320] +sqrdmulh v10.4S, v21.4S, v29.s[1] +mul v21.4S, v21.4S,v30.s[1] +sub v22.4s, v0.4s, v15.4s +str q3, [x0, #688] +sqrdmulh v3.4S, v8.4S, v29.s[1] +mul v8.4S, v8.4S,v30.s[1] +add v0.4s, v0.4s, v15.4s +str q4, [x0, #752] +ldr q4, [x0, #0] +sqrdmulh v15.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v13.4s, v4.4s, v14.4s +add v4.4s, v4.4s, v14.4s +ldr q14, [x0, #64] +sqrdmulh v18.4S, v0.4S, v29.s[1] +mul v0.4S, v0.4S,v30.s[1] +sub v7.4s, v14.4s, v12.4s +add v14.4s, v14.4s, v12.4s +ldr q12, [x0, #128] +mla v21.4S, v10.4S, v31.s[0] +mla v8.4S, v3.4S, v31.s[0] +sub v3.4s, v12.4s, v17.4s +str q9, [x0, #816] +mla v20.4S, v15.4S, v31.s[0] +mla v0.4S, v18.4S, v31.s[0] +add v12.4s, v12.4s, v17.4s +str q6, [x0, #880] +ldr q6, [x0, #192] +sqrdmulh v17.4S, v19.4S, v29.s[2] +mul v19.4S, v19.4S,v30.s[2] +sub v18.4s, v6.4s, v16.4s +str q5, [x0, #944] +sqrdmulh v5.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +add v6.4s, v6.4s, v16.4s +str q1, [x0, #1008] +sqrdmulh v1.4S, v2.4S, v29.s[2] +mul v2.4S, v2.4S,v30.s[2] +sub v16.4s, v12.4s, v21.4s +add v12.4s, v12.4s, v21.4s +sqrdmulh v21.4S, v22.4S, v29.s[2] +mul v22.4S, v22.4S,v30.s[2] +sub v15.4s, v6.4s, v8.4s +add v6.4s, v6.4s, v8.4s +mla v19.4S, v17.4S, v31.s[0] +mla v11.4S, v5.4S, v31.s[0] +sub v5.4s, v4.4s, v20.4s +mla v2.4S, v1.4S, v31.s[0] +mla v22.4S, v21.4S, v31.s[0] +add v4.4s, v4.4s, v20.4s +sqrdmulh v20.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +sub v21.4s, v14.4s, v0.4s +sqrdmulh v1.4S, v15.4S, v27.s[1] +mul v15.4S, v15.4S,v28.s[1] +add v14.4s, 
v14.4s, v0.4s +sqrdmulh v0.4S, v12.4S, v27.s[0] +mul v12.4S, v12.4S,v28.s[0] +sub v17.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +sqrdmulh v19.4S, v6.4S, v27.s[0] +mul v6.4S, v6.4S,v28.s[0] +sub v8.4s, v18.4s, v11.4s +add v18.4s, v18.4s, v11.4s +mla v16.4S, v20.4S, v31.s[0] +mla v15.4S, v1.4S, v31.s[0] +sub v1.4s, v13.4s, v2.4s +mla v12.4S, v0.4S, v31.s[0] +mla v6.4S, v19.4S, v31.s[0] +add v13.4s, v13.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v27.s[2] +mul v3.4S, v3.4S,v28.s[2] +sub v19.4s, v7.4s, v22.4s +sqrdmulh v0.4S, v18.4S, v27.s[2] +mul v18.4S, v18.4S,v28.s[2] +add v7.4s, v7.4s, v22.4s +sqrdmulh v22.4S, v17.4S, v27.s[3] +mul v17.4S, v17.4S,v28.s[3] +sub v20.4s, v5.4s, v16.4s +add v5.4s, v5.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[3] +mul v8.4S, v8.4S,v28.s[3] +sub v11.4s, v21.4s, v15.4s +add v21.4s, v21.4s, v15.4s +mla v3.4S, v2.4S, v31.s[0] +mla v18.4S, v0.4S, v31.s[0] +sub v0.4s, v4.4s, v12.4s +mla v17.4S, v22.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v4.4s, v4.4s, v12.4s +sqrdmulh v12.4S, v21.4S, v25.s[2] +mul v21.4S, v21.4S,v26.s[2] +sub v16.4s, v14.4s, v6.4s +sqrdmulh v22.4S, v11.4S, v25.s[3] +mul v11.4S, v11.4S,v26.s[3] +add v14.4s, v14.4s, v6.4s +sqrdmulh v6.4S, v16.4S, v25.s[1] +mul v16.4S, v16.4S,v26.s[1] +sub v2.4s, v13.4s, v3.4s +add v13.4s, v13.4s, v3.4s +sqrdmulh v3.4S, v14.4S, v25.s[0] +mul v14.4S, v14.4S,v26.s[0] +sub v15.4s, v7.4s, v18.4s +add v7.4s, v7.4s, v18.4s +mla v21.4S, v12.4S, v31.s[0] +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v1.4s, v17.4s +mla v16.4S, v6.4S, v31.s[0] +mla v14.4S, v3.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +sqrdmulh v17.4S, v7.4S, v23.s[0] +mul v7.4S, v7.4S,v24.s[0] +sub v3.4s, v19.4s, v8.4s +sqrdmulh v6.4S, v15.4S, v23.s[1] +mul v15.4S, v15.4S,v24.s[1] +add v19.4s, v19.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v23.s[2] +mul v19.4S, v19.4S,v24.s[2] +sub v12.4s, v5.4s, v21.4s +add v5.4s, v5.4s, v21.4s +sqrdmulh v21.4S, v3.4S, v23.s[3] +mul v3.4S, v3.4S,v24.s[3] +sub v18.4s, v20.4s, v11.4s +add v20.4s, v20.4s, v11.4s +mla 
v7.4S, v17.4S, v31.s[0] +mla v15.4S, v6.4S, v31.s[0] +sub v6.4s, v0.4s, v16.4s +str q5, [x0, #256] +mla v19.4S, v8.4S, v31.s[0] +mla v3.4S, v21.4S, v31.s[0] +add v0.4s, v0.4s, v16.4s +str q12, [x0, #320] +ldr q12, [x0, #912] +sqrdmulh v16.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v21.4s, v4.4s, v14.4s +str q20, [x0, #384] +ldr q20, [x0, #976] +sqrdmulh v8.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +add v4.4s, v4.4s, v14.4s +str q18, [x0, #448] +ldr q18, [x0, #784] +sqrdmulh v14.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +sub v5.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +ldr q7, [x0, #848] +sqrdmulh v17.4S, v7.4S, v29.s[0] +mul v7.4S, v7.4S,v30.s[0] +sub v11.4s, v2.4s, v15.4s +add v2.4s, v2.4s, v15.4s +mla v12.4S, v16.4S, v31.s[0] +mla v20.4S, v8.4S, v31.s[0] +sub v8.4s, v1.4s, v19.4s +str q0, [x0, #128] +mla v18.4S, v14.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +add v1.4s, v1.4s, v19.4s +str q6, [x0, #192] +ldr q6, [x0, #528] +sqrdmulh v19.4S, v6.4S, v29.s[0] +mul v6.4S, v6.4S,v30.s[0] +sub v17.4s, v22.4s, v3.4s +str q4, [x0, #0] +ldr q4, [x0, #592] +sqrdmulh v14.4S, v4.4S, v29.s[0] +mul v4.4S, v4.4S,v30.s[0] +add v22.4s, v22.4s, v3.4s +str q21, [x0, #64] +ldr q21, [x0, #656] +ldr q3, [x0, #400] +sqrdmulh v0.4S, v21.4S, v29.s[0] +mul v21.4S, v21.4S,v30.s[0] +sub v16.4s, v3.4s, v12.4s +add v3.4s, v3.4s, v12.4s +ldr q12, [x0, #720] +ldr q15, [x0, #464] +sqrdmulh v9.4S, v12.4S, v29.s[0] +mul v12.4S, v12.4S,v30.s[0] +sub v10.4s, v15.4s, v20.4s +add v15.4s, v15.4s, v20.4s +ldr q20, [x0, #272] +mla v6.4S, v19.4S, v31.s[0] +mla v4.4S, v14.4S, v31.s[0] +sub v14.4s, v20.4s, v18.4s +str q13, [x0, #512] +mla v21.4S, v0.4S, v31.s[0] +mla v12.4S, v9.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +str q5, [x0, #576] +ldr q5, [x0, #336] +sqrdmulh v18.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v9.4s, v5.4s, v7.4s +str q2, [x0, #640] +sqrdmulh v2.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +add v5.4s, v5.4s, v7.4s +str q11, [x0, #704] +ldr 
q11, [x0, #16] +sqrdmulh v7.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v0.4s, v11.4s, v6.4s +add v11.4s, v11.4s, v6.4s +ldr q6, [x0, #80] +sqrdmulh v13.4S, v5.4S, v29.s[1] +mul v5.4S, v5.4S,v30.s[1] +sub v19.4s, v6.4s, v4.4s +add v6.4s, v6.4s, v4.4s +ldr q4, [x0, #144] +mla v3.4S, v18.4S, v31.s[0] +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v4.4s, v21.4s +str q1, [x0, #768] +mla v20.4S, v7.4S, v31.s[0] +mla v5.4S, v13.4S, v31.s[0] +add v4.4s, v4.4s, v21.4s +str q8, [x0, #832] +ldr q8, [x0, #208] +sqrdmulh v21.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +sub v13.4s, v8.4s, v12.4s +str q22, [x0, #896] +sqrdmulh v22.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +add v8.4s, v8.4s, v12.4s +str q17, [x0, #960] +sqrdmulh v17.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v12.4s, v4.4s, v3.4s +add v4.4s, v4.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v29.s[2] +mul v9.4S, v9.4S,v30.s[2] +sub v7.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +mla v16.4S, v21.4S, v31.s[0] +mla v10.4S, v22.4S, v31.s[0] +sub v22.4s, v11.4s, v20.4s +mla v14.4S, v17.4S, v31.s[0] +mla v9.4S, v3.4S, v31.s[0] +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v12.4S, v27.s[1] +mul v12.4S, v12.4S,v28.s[1] +sub v3.4s, v6.4s, v5.4s +sqrdmulh v17.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +add v6.4s, v6.4s, v5.4s +sqrdmulh v5.4S, v4.4S, v27.s[0] +mul v4.4S, v4.4S,v28.s[0] +sub v21.4s, v2.4s, v16.4s +add v2.4s, v2.4s, v16.4s +sqrdmulh v16.4S, v8.4S, v27.s[0] +mul v8.4S, v8.4S,v28.s[0] +sub v15.4s, v13.4s, v10.4s +add v13.4s, v13.4s, v10.4s +mla v12.4S, v20.4S, v31.s[0] +mla v7.4S, v17.4S, v31.s[0] +sub v17.4s, v0.4s, v14.4s +mla v4.4S, v5.4S, v31.s[0] +mla v8.4S, v16.4S, v31.s[0] +add v0.4s, v0.4s, v14.4s +sqrdmulh v14.4S, v2.4S, v27.s[2] +mul v2.4S, v2.4S,v28.s[2] +sub v16.4s, v19.4s, v9.4s +sqrdmulh v5.4S, v13.4S, v27.s[2] +mul v13.4S, v13.4S,v28.s[2] +add v19.4s, v19.4s, v9.4s +sqrdmulh v9.4S, v21.4S, v27.s[3] +mul v21.4S, v21.4S,v28.s[3] +sub v20.4s, v22.4s, v12.4s +add v22.4s, v22.4s, 
v12.4s +sqrdmulh v12.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v10.4s, v3.4s, v7.4s +add v3.4s, v3.4s, v7.4s +mla v2.4S, v14.4S, v31.s[0] +mla v13.4S, v5.4S, v31.s[0] +sub v5.4s, v11.4s, v4.4s +mla v21.4S, v9.4S, v31.s[0] +mla v15.4S, v12.4S, v31.s[0] +add v11.4s, v11.4s, v4.4s +sqrdmulh v4.4S, v3.4S, v25.s[2] +mul v3.4S, v3.4S,v26.s[2] +sub v12.4s, v6.4s, v8.4s +sqrdmulh v9.4S, v10.4S, v25.s[3] +mul v10.4S, v10.4S,v26.s[3] +add v6.4s, v6.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v25.s[1] +mul v12.4S, v12.4S,v26.s[1] +sub v14.4s, v0.4s, v2.4s +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v6.4S, v25.s[0] +mul v6.4S, v6.4S,v26.s[0] +sub v7.4s, v19.4s, v13.4s +add v19.4s, v19.4s, v13.4s +mla v3.4S, v4.4S, v31.s[0] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v17.4s, v21.4s +mla v12.4S, v8.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v21.4s +sqrdmulh v21.4S, v19.4S, v23.s[0] +mul v19.4S, v19.4S,v24.s[0] +sub v2.4s, v16.4s, v15.4s +sqrdmulh v8.4S, v7.4S, v23.s[1] +mul v7.4S, v7.4S,v24.s[1] +add v16.4s, v16.4s, v15.4s +sqrdmulh v15.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +sub v4.4s, v22.4s, v3.4s +add v22.4s, v22.4s, v3.4s +sqrdmulh v3.4S, v2.4S, v23.s[3] +mul v2.4S, v2.4S,v24.s[3] +sub v13.4s, v20.4s, v10.4s +add v20.4s, v20.4s, v10.4s +mla v19.4S, v21.4S, v31.s[0] +mla v7.4S, v8.4S, v31.s[0] +sub v8.4s, v5.4s, v12.4s +str q22, [x0, #272] +mla v16.4S, v15.4S, v31.s[0] +mla v2.4S, v3.4S, v31.s[0] +add v5.4s, v5.4s, v12.4s +str q4, [x0, #336] +sub v23.4s, v11.4s, v6.4s +str q20, [x0, #400] +add v11.4s, v11.4s, v6.4s +str q13, [x0, #464] +sub v13.4s, v0.4s, v19.4s +add v0.4s, v0.4s, v19.4s +sub v19.4s, v14.4s, v7.4s +add v14.4s, v14.4s, v7.4s +sub v7.4s, v17.4s, v16.4s +str q5, [x0, #144] +add v17.4s, v17.4s, v16.4s +str q8, [x0, #208] +sub v8.4s, v9.4s, v2.4s +str q11, [x0, #16] +add v9.4s, v9.4s, v2.4s +str q23, [x0, #80] +str q0, [x0, #528] +str q13, [x0, #592] +str q14, [x0, #656] +str q19, [x0, #720] +str q17, [x0, #784] +str q7, [x0, #848] 
+str q9, [x0, #912] +str q8, [x0, #976] +ldr q18, [x0, #224] +ldr q1, [x0, #160] +ldr q10, [x0, #32] +ldr q21, [x17, #+128] +ldr q22, [x17, #+144] +sqrdmulh v15.4S, v10.4S, v22.s[0] +mul v10.4S, v10.4S,v21.s[0] +ldr q3, [x0, #48] +sqrdmulh v12.4S, v3.4S, v22.s[0] +mul v3.4S, v3.4S,v21.s[0] +ldr q4, [x17, #+160] +ldr q30, [x17, #+176] +ldr q29, [x0, #96] +sqrdmulh v28.4S, v29.4S, v30.s[0] +mul v29.4S, v29.4S,v4.s[0] +ldr q27, [x0, #112] +sqrdmulh v26.4S, v27.4S, v30.s[0] +mul v27.4S, v27.4S,v4.s[0] +ldr q25, [x17, #+192] +ldr q24, [x17, #+208] +mla v10.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v1.4S, v24.s[0] +ldr q20, [x0, #176] +mla v3.4S, v12.4S, v31.s[0] +sqrdmulh v12.4S, v20.4S, v24.s[0] +ldr q6, [x17, #+224] +ldr q5, [x17, #+240] +mla v29.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v18.4S, v5.s[0] +ldr q16, [x0, #240] +mla v27.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v16.4S, v5.s[0] +ldr q11, [x0, #0] +ldr q2, [x0, #128] +mul v1.4S, v1.4S,v25.s[0] +sub v23.4s, v11.4s, v10.4s +ldr q0, [x0, #16] +mul v20.4S, v20.4S,v25.s[0] +add v11.4s, v11.4s, v10.4s +ldr q10, [x0, #144] +mla v1.4S, v15.4S, v31.s[0] +sub v15.4s, v0.4s, v3.4s +ldr q13, [x0, #64] +mla v20.4S, v12.4S, v31.s[0] +add v0.4s, v0.4s, v3.4s +ldr q3, [x0, #192] +mul v18.4S, v18.4S,v6.s[0] +sub v12.4s, v13.4s, v29.4s +ldr q14, [x0, #80] +mul v16.4S, v16.4S,v6.s[0] +add v13.4s, v13.4s, v29.4s +ldr q29, [x0, #208] +mla v18.4S, v28.4S, v31.s[0] +nop +mla v16.4S, v26.4S, v31.s[0] +sub v26.4s, v14.4s, v27.4s +sqrdmulh v28.4S, v0.4S, v22.s[1] +add v14.4s, v14.4s, v27.4s +mul v0.4S, v0.4S,v21.s[1] +nop +sqrdmulh v27.4S, v15.4S, v22.s[2] +sub v19.4s, v2.4s, v1.4s +mul v15.4S, v15.4S,v21.s[2] +add v2.4s, v2.4s, v1.4s +sqrdmulh v22.4S, v14.4S, v30.s[1] +sub v21.4s, v10.4s, v20.4s +mul v14.4S, v14.4S,v4.s[1] +add v10.4s, v10.4s, v20.4s +sqrdmulh v20.4S, v26.4S, v30.s[2] +sub v1.4s, v3.4s, v18.4s +mul v26.4S, v26.4S,v4.s[2] +add v3.4s, v3.4s, v18.4s +mla v0.4S, v28.4S, v31.s[0] +sub v28.4s, v29.4s, v16.4s +ldr q30, [x0, #480] 
+sqrdmulh v4.4S, v10.4S, v24.s[1] +add v29.4s, v29.4s, v16.4s +mla v15.4S, v27.4S, v31.s[0] +ldr q27, [x0, #416] +sqrdmulh v16.4S, v21.4S, v24.s[2] +sub v18.4s, v11.4s, v0.4s +mla v14.4S, v22.4S, v31.s[0] +ldr q22, [x0, #288] +sqrdmulh v17.4S, v29.4S, v5.s[1] +add v11.4s, v11.4s, v0.4s +str q18, [x0, #16] +mla v26.4S, v20.4S, v31.s[0] +ldr q20, [x17, #+256] +ldr q18, [x17, #+272] +sqrdmulh v0.4S, v28.4S, v5.s[2] +sub v7.4s, v23.4s, v15.4s +str q11, [x0, #0] +mul v10.4S, v10.4S,v25.s[1] +add v23.4s, v23.4s, v15.4s +mul v21.4S, v21.4S,v25.s[2] +str q7, [x0, #48] +mla v10.4S, v4.4S, v31.s[0] +sub v4.4s, v13.4s, v14.4s +mla v21.4S, v16.4S, v31.s[0] +str q23, [x0, #32] +mul v29.4S, v29.4S,v6.s[1] +str q4, [x0, #80] +mul v28.4S, v28.4S,v6.s[2] +add v13.4s, v13.4s, v14.4s +str q13, [x0, #64] +mla v29.4S, v17.4S, v31.s[0] +sub v17.4s, v12.4s, v26.4s +str q17, [x0, #112] +mla v28.4S, v0.4S, v31.s[0] +add v12.4s, v12.4s, v26.4s +str q12, [x0, #96] +sqrdmulh v5.4S, v22.4S, v18.s[0] +sub v6.4s, v2.4s, v10.4s +mul v22.4S, v22.4S,v20.s[0] +str q6, [x0, #144] +ldr q6, [x0, #304] +sqrdmulh v12.4S, v6.4S, v18.s[0] +add v2.4s, v2.4s, v10.4s +mul v6.4S, v6.4S,v20.s[0] +str q2, [x0, #128] +ldr q2, [x17, #+288] +ldr q10, [x17, #+304] +ldr q26, [x0, #352] +sqrdmulh v0.4S, v26.4S, v10.s[0] +sub v17.4s, v19.4s, v21.4s +mul v26.4S, v26.4S,v2.s[0] +str q17, [x0, #176] +ldr q17, [x0, #368] +sqrdmulh v13.4S, v17.4S, v10.s[0] +add v19.4s, v19.4s, v21.4s +mul v17.4S, v17.4S,v2.s[0] +str q19, [x0, #160] +ldr q19, [x17, #+320] +ldr q21, [x17, #+336] +mla v22.4S, v5.4S, v31.s[0] +sub v5.4s, v3.4s, v29.4s +sqrdmulh v14.4S, v27.4S, v21.s[0] +str q5, [x0, #208] +ldr q5, [x0, #432] +mla v6.4S, v12.4S, v31.s[0] +add v3.4s, v3.4s, v29.4s +sqrdmulh v29.4S, v5.4S, v21.s[0] +str q3, [x0, #192] +ldr q3, [x17, #+352] +ldr q12, [x17, #+368] +mla v26.4S, v0.4S, v31.s[0] +sub v0.4s, v1.4s, v28.4s +sqrdmulh v4.4S, v30.4S, v12.s[0] +str q0, [x0, #240] +ldr q0, [x0, #496] +mla v17.4S, v13.4S, v31.s[0] +add v1.4s, 
v1.4s, v28.4s +sqrdmulh v28.4S, v0.4S, v12.s[0] +str q1, [x0, #224] +ldr q1, [x0, #256] +ldr q13, [x0, #384] +mul v27.4S, v27.4S,v19.s[0] +sub v24.4s, v1.4s, v22.4s +ldr q25, [x0, #272] +mul v5.4S, v5.4S,v19.s[0] +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #400] +mla v27.4S, v14.4S, v31.s[0] +sub v14.4s, v25.4s, v6.4s +ldr q23, [x0, #320] +mla v5.4S, v29.4S, v31.s[0] +add v25.4s, v25.4s, v6.4s +ldr q6, [x0, #448] +mul v30.4S, v30.4S,v3.s[0] +sub v29.4s, v23.4s, v26.4s +ldr q16, [x0, #336] +mul v0.4S, v0.4S,v3.s[0] +add v23.4s, v23.4s, v26.4s +ldr q26, [x0, #464] +mla v30.4S, v4.4S, v31.s[0] +nop +mla v0.4S, v28.4S, v31.s[0] +sub v28.4s, v16.4s, v17.4s +sqrdmulh v4.4S, v25.4S, v18.s[1] +add v16.4s, v16.4s, v17.4s +mul v25.4S, v25.4S,v20.s[1] +nop +sqrdmulh v17.4S, v14.4S, v18.s[2] +sub v7.4s, v13.4s, v27.4s +mul v14.4S, v14.4S,v20.s[2] +add v13.4s, v13.4s, v27.4s +sqrdmulh v18.4S, v16.4S, v10.s[1] +sub v20.4s, v22.4s, v5.4s +mul v16.4S, v16.4S,v2.s[1] +add v22.4s, v22.4s, v5.4s +sqrdmulh v5.4S, v28.4S, v10.s[2] +sub v27.4s, v6.4s, v30.4s +mul v28.4S, v28.4S,v2.s[2] +add v6.4s, v6.4s, v30.4s +mla v25.4S, v4.4S, v31.s[0] +sub v4.4s, v26.4s, v0.4s +ldr q10, [x0, #736] +sqrdmulh v2.4S, v22.4S, v21.s[1] +add v26.4s, v26.4s, v0.4s +mla v14.4S, v17.4S, v31.s[0] +ldr q17, [x0, #672] +sqrdmulh v0.4S, v20.4S, v21.s[2] +sub v30.4s, v1.4s, v25.4s +mla v16.4S, v18.4S, v31.s[0] +ldr q18, [x0, #544] +sqrdmulh v15.4S, v26.4S, v12.s[1] +add v1.4s, v1.4s, v25.4s +str q30, [x0, #272] +mla v28.4S, v5.4S, v31.s[0] +ldr q5, [x17, #+384] +ldr q30, [x17, #+400] +sqrdmulh v25.4S, v4.4S, v12.s[2] +sub v11.4s, v24.4s, v14.4s +str q1, [x0, #256] +mul v22.4S, v22.4S,v19.s[1] +add v24.4s, v24.4s, v14.4s +mul v20.4S, v20.4S,v19.s[2] +str q11, [x0, #304] +mla v22.4S, v2.4S, v31.s[0] +sub v2.4s, v23.4s, v16.4s +mla v20.4S, v0.4S, v31.s[0] +str q24, [x0, #288] +mul v26.4S, v26.4S,v3.s[1] +str q2, [x0, #336] +mul v4.4S, v4.4S,v3.s[2] +add v23.4s, v23.4s, v16.4s +str q23, [x0, #320] +mla v26.4S, v15.4S, 
v31.s[0] +sub v15.4s, v29.4s, v28.4s +str q15, [x0, #368] +mla v4.4S, v25.4S, v31.s[0] +add v29.4s, v29.4s, v28.4s +str q29, [x0, #352] +sqrdmulh v12.4S, v18.4S, v30.s[0] +sub v3.4s, v13.4s, v22.4s +mul v18.4S, v18.4S,v5.s[0] +str q3, [x0, #400] +ldr q3, [x0, #560] +sqrdmulh v29.4S, v3.4S, v30.s[0] +add v13.4s, v13.4s, v22.4s +mul v3.4S, v3.4S,v5.s[0] +str q13, [x0, #384] +ldr q13, [x17, #+416] +ldr q22, [x17, #+432] +ldr q28, [x0, #608] +sqrdmulh v25.4S, v28.4S, v22.s[0] +sub v15.4s, v7.4s, v20.4s +mul v28.4S, v28.4S,v13.s[0] +str q15, [x0, #432] +ldr q15, [x0, #624] +sqrdmulh v23.4S, v15.4S, v22.s[0] +add v7.4s, v7.4s, v20.4s +mul v15.4S, v15.4S,v13.s[0] +str q7, [x0, #416] +ldr q7, [x17, #+448] +ldr q20, [x17, #+464] +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v6.4s, v26.4s +sqrdmulh v16.4S, v17.4S, v20.s[0] +str q12, [x0, #464] +ldr q12, [x0, #688] +mla v3.4S, v29.4S, v31.s[0] +add v6.4s, v6.4s, v26.4s +sqrdmulh v26.4S, v12.4S, v20.s[0] +str q6, [x0, #448] +ldr q6, [x17, #+480] +ldr q29, [x17, #+496] +mla v28.4S, v25.4S, v31.s[0] +sub v25.4s, v27.4s, v4.4s +sqrdmulh v2.4S, v10.4S, v29.s[0] +str q25, [x0, #496] +ldr q25, [x0, #752] +mla v15.4S, v23.4S, v31.s[0] +add v27.4s, v27.4s, v4.4s +sqrdmulh v4.4S, v25.4S, v29.s[0] +str q27, [x0, #480] +ldr q27, [x0, #512] +ldr q23, [x0, #640] +mul v17.4S, v17.4S,v7.s[0] +sub v21.4s, v27.4s, v18.4s +ldr q19, [x0, #528] +mul v12.4S, v12.4S,v7.s[0] +add v27.4s, v27.4s, v18.4s +ldr q18, [x0, #656] +mla v17.4S, v16.4S, v31.s[0] +sub v16.4s, v19.4s, v3.4s +ldr q24, [x0, #576] +mla v12.4S, v26.4S, v31.s[0] +add v19.4s, v19.4s, v3.4s +ldr q3, [x0, #704] +mul v10.4S, v10.4S,v6.s[0] +sub v26.4s, v24.4s, v28.4s +ldr q0, [x0, #592] +mul v25.4S, v25.4S,v6.s[0] +add v24.4s, v24.4s, v28.4s +ldr q28, [x0, #720] +mla v10.4S, v2.4S, v31.s[0] +nop +mla v25.4S, v4.4S, v31.s[0] +sub v4.4s, v0.4s, v15.4s +sqrdmulh v2.4S, v19.4S, v30.s[1] +add v0.4s, v0.4s, v15.4s +mul v19.4S, v19.4S,v5.s[1] +nop +sqrdmulh v15.4S, v16.4S, v30.s[2] +sub v11.4s, 
v23.4s, v17.4s +mul v16.4S, v16.4S,v5.s[2] +add v23.4s, v23.4s, v17.4s +sqrdmulh v30.4S, v0.4S, v22.s[1] +sub v5.4s, v18.4s, v12.4s +mul v0.4S, v0.4S,v13.s[1] +add v18.4s, v18.4s, v12.4s +sqrdmulh v12.4S, v4.4S, v22.s[2] +sub v17.4s, v3.4s, v10.4s +mul v4.4S, v4.4S,v13.s[2] +add v3.4s, v3.4s, v10.4s +mla v19.4S, v2.4S, v31.s[0] +sub v2.4s, v28.4s, v25.4s +ldr q22, [x0, #992] +sqrdmulh v13.4S, v18.4S, v20.s[1] +add v28.4s, v28.4s, v25.4s +mla v16.4S, v15.4S, v31.s[0] +ldr q15, [x0, #928] +sqrdmulh v25.4S, v5.4S, v20.s[2] +sub v10.4s, v27.4s, v19.4s +mla v0.4S, v30.4S, v31.s[0] +ldr q30, [x0, #800] +sqrdmulh v14.4S, v28.4S, v29.s[1] +add v27.4s, v27.4s, v19.4s +str q10, [x0, #528] +mla v4.4S, v12.4S, v31.s[0] +ldr q12, [x17, #+512] +ldr q10, [x17, #+528] +sqrdmulh v19.4S, v2.4S, v29.s[2] +sub v1.4s, v21.4s, v16.4s +str q27, [x0, #512] +mul v18.4S, v18.4S,v7.s[1] +add v21.4s, v21.4s, v16.4s +mul v5.4S, v5.4S,v7.s[2] +str q1, [x0, #560] +mla v18.4S, v13.4S, v31.s[0] +sub v13.4s, v24.4s, v0.4s +mla v5.4S, v25.4S, v31.s[0] +str q21, [x0, #544] +mul v28.4S, v28.4S,v6.s[1] +str q13, [x0, #592] +mul v2.4S, v2.4S,v6.s[2] +add v24.4s, v24.4s, v0.4s +str q24, [x0, #576] +mla v28.4S, v14.4S, v31.s[0] +sub v14.4s, v26.4s, v4.4s +str q14, [x0, #624] +mla v2.4S, v19.4S, v31.s[0] +add v26.4s, v26.4s, v4.4s +str q26, [x0, #608] +sqrdmulh v29.4S, v30.4S, v10.s[0] +sub v6.4s, v23.4s, v18.4s +mul v30.4S, v30.4S,v12.s[0] +str q6, [x0, #656] +ldr q6, [x0, #816] +sqrdmulh v26.4S, v6.4S, v10.s[0] +add v23.4s, v23.4s, v18.4s +mul v6.4S, v6.4S,v12.s[0] +str q23, [x0, #640] +ldr q23, [x17, #+544] +ldr q18, [x17, #+560] +ldr q4, [x0, #864] +sqrdmulh v19.4S, v4.4S, v18.s[0] +sub v14.4s, v11.4s, v5.4s +mul v4.4S, v4.4S,v23.s[0] +str q14, [x0, #688] +ldr q14, [x0, #880] +sqrdmulh v24.4S, v14.4S, v18.s[0] +add v11.4s, v11.4s, v5.4s +mul v14.4S, v14.4S,v23.s[0] +str q11, [x0, #672] +ldr q11, [x17, #+576] +ldr q5, [x17, #+592] +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v3.4s, v28.4s +sqrdmulh v0.4S, 
v15.4S, v5.s[0] +str q29, [x0, #720] +ldr q29, [x0, #944] +mla v6.4S, v26.4S, v31.s[0] +add v3.4s, v3.4s, v28.4s +sqrdmulh v28.4S, v29.4S, v5.s[0] +str q3, [x0, #704] +ldr q3, [x17, #+608] +ldr q26, [x17, #+624] +mla v4.4S, v19.4S, v31.s[0] +sub v19.4s, v17.4s, v2.4s +sqrdmulh v13.4S, v22.4S, v26.s[0] +str q19, [x0, #752] +ldr q19, [x0, #1008] +mla v14.4S, v24.4S, v31.s[0] +add v17.4s, v17.4s, v2.4s +sqrdmulh v2.4S, v19.4S, v26.s[0] +str q17, [x0, #736] +ldr q17, [x0, #768] +ldr q24, [x0, #896] +mul v15.4S, v15.4S,v11.s[0] +sub v20.4s, v17.4s, v30.4s +ldr q7, [x0, #784] +mul v29.4S, v29.4S,v11.s[0] +add v17.4s, v17.4s, v30.4s +ldr q30, [x0, #912] +mla v15.4S, v0.4S, v31.s[0] +sub v0.4s, v7.4s, v6.4s +ldr q21, [x0, #832] +mla v29.4S, v28.4S, v31.s[0] +add v7.4s, v7.4s, v6.4s +ldr q6, [x0, #960] +mul v22.4S, v22.4S,v3.s[0] +sub v28.4s, v21.4s, v4.4s +ldr q25, [x0, #848] +mul v19.4S, v19.4S,v3.s[0] +add v21.4s, v21.4s, v4.4s +ldr q4, [x0, #976] +mla v22.4S, v13.4S, v31.s[0] +nop +mla v19.4S, v2.4S, v31.s[0] +sub v2.4s, v25.4s, v14.4s +sqrdmulh v13.4S, v7.4S, v10.s[1] +add v25.4s, v25.4s, v14.4s +mul v7.4S, v7.4S,v12.s[1] +nop +sqrdmulh v14.4S, v0.4S, v10.s[2] +sub v1.4s, v24.4s, v15.4s +mul v0.4S, v0.4S,v12.s[2] +add v24.4s, v24.4s, v15.4s +sqrdmulh v10.4S, v25.4S, v18.s[1] +sub v12.4s, v30.4s, v29.4s +mul v25.4S, v25.4S,v23.s[1] +add v30.4s, v30.4s, v29.4s +sqrdmulh v29.4S, v2.4S, v18.s[2] +sub v15.4s, v6.4s, v22.4s +mul v2.4S, v2.4S,v23.s[2] +add v6.4s, v6.4s, v22.4s +mla v7.4S, v13.4S, v31.s[0] +sub v13.4s, v4.4s, v19.4s +sqrdmulh v18.4S, v30.4S, v5.s[1] +add v4.4s, v4.4s, v19.4s +mla v0.4S, v14.4S, v31.s[0] +sqrdmulh v14.4S, v12.4S, v5.s[2] +sub v19.4s, v17.4s, v7.4s +mla v25.4S, v10.4S, v31.s[0] +sqrdmulh v10.4S, v4.4S, v26.s[1] +add v17.4s, v17.4s, v7.4s +str q19, [x0, #784] +mla v2.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v13.4S, v26.s[2] +sub v19.4s, v20.4s, v0.4s +str q17, [x0, #768] +mul v30.4S, v30.4S,v11.s[1] +add v20.4s, v20.4s, v0.4s +mul v12.4S, 
v12.4S,v11.s[2] +str q19, [x0, #816] +mla v30.4S, v18.4S, v31.s[0] +sub v18.4s, v21.4s, v25.4s +mla v12.4S, v14.4S, v31.s[0] +str q20, [x0, #800] +mul v4.4S, v4.4S,v3.s[1] +str q18, [x0, #848] +mul v13.4S, v13.4S,v3.s[2] +add v21.4s, v21.4s, v25.4s +str q21, [x0, #832] +mla v4.4S, v10.4S, v31.s[0] +sub v10.4s, v28.4s, v2.4s +str q10, [x0, #880] +mla v13.4S, v29.4S, v31.s[0] +add v28.4s, v28.4s, v2.4s +str q28, [x0, #864] +sub v26.4s, v24.4s, v30.4s +str q26, [x0, #912] +add v24.4s, v24.4s, v30.4s +str q24, [x0, #896] +sub v24.4s, v1.4s, v12.4s +str q24, [x0, #944] +add v1.4s, v1.4s, v12.4s +str q1, [x0, #928] +sub v1.4s, v6.4s, v4.4s +str q1, [x0, #976] +add v6.4s, v6.4s, v4.4s +str q6, [x0, #960] +sub v6.4s, v15.4s, v13.4s +str q6, [x0, #1008] +add v15.4s, v15.4s, v13.4s +str q15, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1472 +// Instruction count: 1468 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_8_z4_7.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_8_z4_7.s new file mode 100644 index 0000000..5cdf1c8 --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_8_z4_7.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the 
Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +#include // NOTE(review): the include target is missing in this text (looks stripped); ASM_LOAD used below is not defined in this file, so it presumably comes from that header -- confirm and restore +modulus: +.word -33556993 // negative of the 33556993 that appears in the file name; read by "ldr q31, [x17]" below and used as v31.s[0] in the mla instructions +.word 0 +.word 0 +.word 0 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 (first 4-word group: read by "ldr q30, [x17, #+0]", assuming ASM_LOAD leaves the table address in x17; v30 lanes feed mul) +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None (zero filler, no root assigned; same for every "Layer None" word in this table) +.word 1132860160 // Layer 0, block 0 (second 4-word group: read by "ldr q29, [x17, #+16]"; v29 lanes feed sqrdmulh) +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block
4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None
+.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word
18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global ntt_u32_incomplete_neon_asm_var_4_2_8_z4_7 +.global _ntt_u32_incomplete_neon_asm_var_4_2_8_z4_7 +ntt_u32_incomplete_neon_asm_var_4_2_8_z4_7: +_ntt_u32_incomplete_neon_asm_var_4_2_8_z4_7: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] // NOTE(review): repeats the previous store (same registers, same stack slot) +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #928] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #992] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q18, [x0, #800] +sqrdmulh v17.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +ldr q16, [x0, #864] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v22.4S, v21.4S, v31.s[0]
+mla v20.4S, v19.4S, v31.s[0] +mla v18.4S, v17.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +ldr q3, [x0, #544] +sqrdmulh v17.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q19, [x0, #608] +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q2, [x0, #672] +ldr q1, [x0, #416] +sqrdmulh v0.4S, v2.4S, v29.s[0] +sub v15.4s, v1.4s, v22.4s +mul v2.4S, v2.4S,v30.s[0] +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #736] +ldr q14, [x0, #480] +sqrdmulh v13.4S, v22.4S, v29.s[0] +sub v12.4s, v14.4s, v20.4s +mul v22.4S, v22.4S,v30.s[0] +add v14.4s, v14.4s, v20.4s +ldr q20, [x0, #288] +mla v3.4S, v17.4S, v31.s[0] +sub v17.4s, v20.4s, v18.4s +mla v19.4S, v21.4S, v31.s[0] +mla v2.4S, v0.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +mla v22.4S, v13.4S, v31.s[0] +ldr q13, [x0, #352] +sqrdmulh v18.4S, v1.4S, v29.s[1] +sub v0.4s, v13.4s, v16.4s +mul v1.4S, v1.4S,v30.s[1] +sqrdmulh v21.4S, v14.4S, v29.s[1] +add v13.4s, v13.4s, v16.4s +mul v14.4S, v14.4S,v30.s[1] +ldr q16, [x0, #32] +sqrdmulh v11.4S, v20.4S, v29.s[1] +sub v10.4s, v16.4s, v3.4s +mul v20.4S, v20.4S,v30.s[1] +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #96] +sqrdmulh v9.4S, v13.4S, v29.s[1] +sub v8.4s, v3.4s, v19.4s +mul v13.4S, v13.4S,v30.s[1] +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #160] +mla v1.4S, v18.4S, v31.s[0] +sub v18.4s, v19.4s, v2.4s +mla v14.4S, v21.4S, v31.s[0] +mla v20.4S, v11.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +mla v13.4S, v9.4S, v31.s[0] +ldr q9, [x0, #224] +sqrdmulh v2.4S, v15.4S, v29.s[2] +sub v11.4s, v9.4s, v22.4s +mul v15.4S, v15.4S,v30.s[2] +sqrdmulh v21.4S, v12.4S, v29.s[2] +add v9.4s, v9.4s, v22.4s +mul v12.4S, v12.4S,v30.s[2] +sqrdmulh v22.4S, v17.4S, v29.s[2] +sub v7.4s, v19.4s, v1.4s +mul v17.4S, v17.4S,v30.s[2] +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[2] +sub v6.4s, v9.4s, v14.4s +mul v0.4S, v0.4S,v30.s[2] +add v9.4s, v9.4s, v14.4s +mla v15.4S, v2.4S, v31.s[0] +sub v2.4s, v16.4s, v20.4s +mla v12.4S, v21.4S, v31.s[0] +mla v17.4S, v22.4S, v31.s[0] +add v16.4s, v16.4s, 
v20.4s +mla v0.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v7.4S, v27.s[1] +sub v20.4s, v3.4s, v13.4s +mul v7.4S, v7.4S,v28.s[1] +sqrdmulh v22.4S, v6.4S, v27.s[1] +add v3.4s, v3.4s, v13.4s +mul v6.4S, v6.4S,v28.s[1] +sqrdmulh v13.4S, v19.4S, v27.s[0] +sub v21.4s, v18.4s, v15.4s +mul v19.4S, v19.4S,v28.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v9.4S, v27.s[0] +sub v14.4s, v11.4s, v12.4s +mul v9.4S, v9.4S,v28.s[0] +add v11.4s, v11.4s, v12.4s +mla v7.4S, v1.4S, v31.s[0] +sub v1.4s, v10.4s, v17.4s +mla v6.4S, v22.4S, v31.s[0] +mla v19.4S, v13.4S, v31.s[0] +add v10.4s, v10.4s, v17.4s +mla v9.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v18.4S, v27.s[2] +sub v17.4s, v8.4s, v0.4s +mul v18.4S, v18.4S,v28.s[2] +sqrdmulh v13.4S, v11.4S, v27.s[2] +add v8.4s, v8.4s, v0.4s +mul v11.4S, v11.4S,v28.s[2] +sqrdmulh v0.4S, v21.4S, v27.s[3] +sub v22.4s, v2.4s, v7.4s +mul v21.4S, v21.4S,v28.s[3] +add v2.4s, v2.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[3] +sub v12.4s, v20.4s, v6.4s +mul v14.4S, v14.4S,v28.s[3] +add v20.4s, v20.4s, v6.4s +mla v18.4S, v15.4S, v31.s[0] +sub v15.4s, v16.4s, v19.4s +mla v11.4S, v13.4S, v31.s[0] +mla v21.4S, v0.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +mla v14.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v20.4S, v25.s[2] +sub v19.4s, v3.4s, v9.4s +mul v20.4S, v20.4S,v26.s[2] +sqrdmulh v0.4S, v12.4S, v25.s[3] +add v3.4s, v3.4s, v9.4s +mul v12.4S, v12.4S,v26.s[3] +sqrdmulh v9.4S, v19.4S, v25.s[1] +sub v13.4s, v10.4s, v18.4s +mul v19.4S, v19.4S,v26.s[1] +add v10.4s, v10.4s, v18.4s +sqrdmulh v18.4S, v3.4S, v25.s[0] +sub v6.4s, v8.4s, v11.4s +mul v3.4S, v3.4S,v26.s[0] +add v8.4s, v8.4s, v11.4s +mla v20.4S, v7.4S, v31.s[0] +sub v7.4s, v1.4s, v21.4s +mla v12.4S, v0.4S, v31.s[0] +mla v19.4S, v9.4S, v31.s[0] +add v1.4s, v1.4s, v21.4s +mla v3.4S, v18.4S, v31.s[0] +sqrdmulh v18.4S, v8.4S, v23.s[0] +sub v21.4s, v17.4s, v14.4s +mul v8.4S, v8.4S,v24.s[0] +sqrdmulh v9.4S, v6.4S, v23.s[1] +add v17.4s, v17.4s, v14.4s +mul v6.4S, v6.4S,v24.s[1] +sqrdmulh v14.4S, v17.4S, v23.s[2] +sub 
v0.4s, v2.4s, v20.4s +mul v17.4S, v17.4S,v24.s[2] +add v2.4s, v2.4s, v20.4s +sqrdmulh v20.4S, v21.4S, v23.s[3] +sub v11.4s, v22.4s, v12.4s +mul v21.4S, v21.4S,v24.s[3] +add v22.4s, v22.4s, v12.4s +mla v8.4S, v18.4S, v31.s[0] +sub v18.4s, v15.4s, v19.4s +mla v6.4S, v9.4S, v31.s[0] +str q2, [x0, #288] +mla v17.4S, v14.4S, v31.s[0] +add v15.4s, v15.4s, v19.4s +mla v21.4S, v20.4S, v31.s[0] +str q0, [x0, #352] +ldr q0, [x0, #944] +sqrdmulh v20.4S, v0.4S, v29.s[0] +sub v19.4s, v16.4s, v3.4s +mul v0.4S, v0.4S,v30.s[0] +str q22, [x0, #416] +ldr q22, [x0, #1008] +sqrdmulh v14.4S, v22.4S, v29.s[0] +add v16.4s, v16.4s, v3.4s +mul v22.4S, v22.4S,v30.s[0] +str q11, [x0, #480] +ldr q11, [x0, #816] +sqrdmulh v3.4S, v11.4S, v29.s[0] +sub v2.4s, v10.4s, v8.4s +mul v11.4S, v11.4S,v30.s[0] +add v10.4s, v10.4s, v8.4s +ldr q8, [x0, #880] +sqrdmulh v9.4S, v8.4S, v29.s[0] +sub v12.4s, v13.4s, v6.4s +mul v8.4S, v8.4S,v30.s[0] +add v13.4s, v13.4s, v6.4s +mla v0.4S, v20.4S, v31.s[0] +sub v20.4s, v1.4s, v17.4s +mla v22.4S, v14.4S, v31.s[0] +str q15, [x0, #160] +mla v11.4S, v3.4S, v31.s[0] +add v1.4s, v1.4s, v17.4s +mla v8.4S, v9.4S, v31.s[0] +str q18, [x0, #224] +ldr q18, [x0, #560] +sqrdmulh v9.4S, v18.4S, v29.s[0] +sub v17.4s, v7.4s, v21.4s +mul v18.4S, v18.4S,v30.s[0] +str q16, [x0, #32] +ldr q16, [x0, #624] +sqrdmulh v3.4S, v16.4S, v29.s[0] +add v7.4s, v7.4s, v21.4s +mul v16.4S, v16.4S,v30.s[0] +str q19, [x0, #96] +ldr q19, [x0, #688] +ldr q21, [x0, #432] +sqrdmulh v15.4S, v19.4S, v29.s[0] +sub v14.4s, v21.4s, v0.4s +mul v19.4S, v19.4S,v30.s[0] +add v21.4s, v21.4s, v0.4s +ldr q0, [x0, #752] +ldr q6, [x0, #496] +sqrdmulh v5.4S, v0.4S, v29.s[0] +sub v4.4s, v6.4s, v22.4s +mul v0.4S, v0.4S,v30.s[0] +add v6.4s, v6.4s, v22.4s +ldr q22, [x0, #304] +mla v18.4S, v9.4S, v31.s[0] +sub v9.4s, v22.4s, v11.4s +mla v16.4S, v3.4S, v31.s[0] +str q10, [x0, #544] +mla v19.4S, v15.4S, v31.s[0] +add v22.4s, v22.4s, v11.4s +mla v0.4S, v5.4S, v31.s[0] +str q2, [x0, #608] +ldr q2, [x0, #368] +sqrdmulh v5.4S, 
v21.4S, v29.s[1] +sub v11.4s, v2.4s, v8.4s +mul v21.4S, v21.4S,v30.s[1] +str q13, [x0, #672] +sqrdmulh v13.4S, v6.4S, v29.s[1] +add v2.4s, v2.4s, v8.4s +mul v6.4S, v6.4S,v30.s[1] +str q12, [x0, #736] +ldr q12, [x0, #48] +sqrdmulh v8.4S, v22.4S, v29.s[1] +sub v15.4s, v12.4s, v18.4s +mul v22.4S, v22.4S,v30.s[1] +add v12.4s, v12.4s, v18.4s +ldr q18, [x0, #112] +sqrdmulh v10.4S, v2.4S, v29.s[1] +sub v3.4s, v18.4s, v16.4s +mul v2.4S, v2.4S,v30.s[1] +add v18.4s, v18.4s, v16.4s +ldr q16, [x0, #176] +mla v21.4S, v5.4S, v31.s[0] +sub v5.4s, v16.4s, v19.4s +mla v6.4S, v13.4S, v31.s[0] +str q1, [x0, #800] +mla v22.4S, v8.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +mla v2.4S, v10.4S, v31.s[0] +str q20, [x0, #864] +ldr q20, [x0, #240] +sqrdmulh v10.4S, v14.4S, v29.s[2] +sub v19.4s, v20.4s, v0.4s +mul v14.4S, v14.4S,v30.s[2] +str q7, [x0, #928] +sqrdmulh v7.4S, v4.4S, v29.s[2] +add v20.4s, v20.4s, v0.4s +mul v4.4S, v4.4S,v30.s[2] +str q17, [x0, #992] +sqrdmulh v17.4S, v9.4S, v29.s[2] +sub v0.4s, v16.4s, v21.4s +mul v9.4S, v9.4S,v30.s[2] +add v16.4s, v16.4s, v21.4s +sqrdmulh v21.4S, v11.4S, v29.s[2] +sub v8.4s, v20.4s, v6.4s +mul v11.4S, v11.4S,v30.s[2] +add v20.4s, v20.4s, v6.4s +mla v14.4S, v10.4S, v31.s[0] +sub v10.4s, v12.4s, v22.4s +mla v4.4S, v7.4S, v31.s[0] +mla v9.4S, v17.4S, v31.s[0] +add v12.4s, v12.4s, v22.4s +mla v11.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v0.4S, v27.s[1] +sub v22.4s, v18.4s, v2.4s +mul v0.4S, v0.4S,v28.s[1] +sqrdmulh v17.4S, v8.4S, v27.s[1] +add v18.4s, v18.4s, v2.4s +mul v8.4S, v8.4S,v28.s[1] +sqrdmulh v2.4S, v16.4S, v27.s[0] +sub v7.4s, v5.4s, v14.4s +mul v16.4S, v16.4S,v28.s[0] +add v5.4s, v5.4s, v14.4s +sqrdmulh v14.4S, v20.4S, v27.s[0] +sub v6.4s, v19.4s, v4.4s +mul v20.4S, v20.4S,v28.s[0] +add v19.4s, v19.4s, v4.4s +mla v0.4S, v21.4S, v31.s[0] +sub v21.4s, v15.4s, v9.4s +mla v8.4S, v17.4S, v31.s[0] +mla v16.4S, v2.4S, v31.s[0] +add v15.4s, v15.4s, v9.4s +mla v20.4S, v14.4S, v31.s[0] +sqrdmulh v14.4S, v5.4S, v27.s[2] +sub v9.4s, v3.4s, v11.4s +mul 
v5.4S, v5.4S,v28.s[2] +sqrdmulh v2.4S, v19.4S, v27.s[2] +add v3.4s, v3.4s, v11.4s +mul v19.4S, v19.4S,v28.s[2] +sqrdmulh v11.4S, v7.4S, v27.s[3] +sub v17.4s, v10.4s, v0.4s +mul v7.4S, v7.4S,v28.s[3] +add v10.4s, v10.4s, v0.4s +sqrdmulh v0.4S, v6.4S, v27.s[3] +sub v4.4s, v22.4s, v8.4s +mul v6.4S, v6.4S,v28.s[3] +add v22.4s, v22.4s, v8.4s +mla v5.4S, v14.4S, v31.s[0] +sub v14.4s, v12.4s, v16.4s +mla v19.4S, v2.4S, v31.s[0] +mla v7.4S, v11.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +mla v6.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v22.4S, v25.s[2] +sub v16.4s, v18.4s, v20.4s +mul v22.4S, v22.4S,v26.s[2] +sqrdmulh v11.4S, v4.4S, v25.s[3] +add v18.4s, v18.4s, v20.4s +mul v4.4S, v4.4S,v26.s[3] +sqrdmulh v20.4S, v16.4S, v25.s[1] +sub v2.4s, v15.4s, v5.4s +mul v16.4S, v16.4S,v26.s[1] +add v15.4s, v15.4s, v5.4s +sqrdmulh v5.4S, v18.4S, v25.s[0] +sub v8.4s, v3.4s, v19.4s +mul v18.4S, v18.4S,v26.s[0] +add v3.4s, v3.4s, v19.4s +mla v22.4S, v0.4S, v31.s[0] +sub v0.4s, v21.4s, v7.4s +mla v4.4S, v11.4S, v31.s[0] +mla v16.4S, v20.4S, v31.s[0] +add v21.4s, v21.4s, v7.4s +mla v18.4S, v5.4S, v31.s[0] +sqrdmulh v5.4S, v3.4S, v23.s[0] +sub v7.4s, v9.4s, v6.4s +mul v3.4S, v3.4S,v24.s[0] +sqrdmulh v20.4S, v8.4S, v23.s[1] +add v9.4s, v9.4s, v6.4s +mul v8.4S, v8.4S,v24.s[1] +sqrdmulh v6.4S, v9.4S, v23.s[2] +sub v11.4s, v10.4s, v22.4s +mul v9.4S, v9.4S,v24.s[2] +add v10.4s, v10.4s, v22.4s +sqrdmulh v22.4S, v7.4S, v23.s[3] +sub v19.4s, v17.4s, v4.4s +mul v7.4S, v7.4S,v24.s[3] +add v17.4s, v17.4s, v4.4s +mla v3.4S, v5.4S, v31.s[0] +sub v5.4s, v14.4s, v16.4s +mla v8.4S, v20.4S, v31.s[0] +str q10, [x0, #304] +mla v9.4S, v6.4S, v31.s[0] +add v14.4s, v14.4s, v16.4s +mla v7.4S, v22.4S, v31.s[0] +str q11, [x0, #368] +ldr q11, [x0, #896] +sqrdmulh v22.4S, v11.4S, v29.s[0] +sub v16.4s, v12.4s, v18.4s +mul v11.4S, v11.4S,v30.s[0] +str q17, [x0, #432] +ldr q17, [x0, #960] +sqrdmulh v6.4S, v17.4S, v29.s[0] +add v12.4s, v12.4s, v18.4s +mul v17.4S, v17.4S,v30.s[0] +str q19, [x0, #496] +ldr q19, [x0, #768] 
+sqrdmulh v18.4S, v19.4S, v29.s[0] +sub v10.4s, v15.4s, v3.4s +mul v19.4S, v19.4S,v30.s[0] +add v15.4s, v15.4s, v3.4s +ldr q3, [x0, #832] +sqrdmulh v20.4S, v3.4S, v29.s[0] +sub v4.4s, v2.4s, v8.4s +mul v3.4S, v3.4S,v30.s[0] +add v2.4s, v2.4s, v8.4s +mla v11.4S, v22.4S, v31.s[0] +sub v22.4s, v21.4s, v9.4s +mla v17.4S, v6.4S, v31.s[0] +str q14, [x0, #176] +mla v19.4S, v18.4S, v31.s[0] +add v21.4s, v21.4s, v9.4s +mla v3.4S, v20.4S, v31.s[0] +str q5, [x0, #240] +ldr q5, [x0, #512] +sqrdmulh v20.4S, v5.4S, v29.s[0] +sub v9.4s, v0.4s, v7.4s +mul v5.4S, v5.4S,v30.s[0] +str q12, [x0, #48] +ldr q12, [x0, #576] +sqrdmulh v18.4S, v12.4S, v29.s[0] +add v0.4s, v0.4s, v7.4s +mul v12.4S, v12.4S,v30.s[0] +str q16, [x0, #112] +ldr q16, [x0, #640] +ldr q7, [x0, #384] +sqrdmulh v14.4S, v16.4S, v29.s[0] +sub v6.4s, v7.4s, v11.4s +mul v16.4S, v16.4S,v30.s[0] +add v7.4s, v7.4s, v11.4s +ldr q11, [x0, #704] +ldr q8, [x0, #448] +sqrdmulh v1.4S, v11.4S, v29.s[0] +sub v13.4s, v8.4s, v17.4s +mul v11.4S, v11.4S,v30.s[0] +add v8.4s, v8.4s, v17.4s +ldr q17, [x0, #256] +mla v5.4S, v20.4S, v31.s[0] +sub v20.4s, v17.4s, v19.4s +mla v12.4S, v18.4S, v31.s[0] +str q15, [x0, #560] +mla v16.4S, v14.4S, v31.s[0] +add v17.4s, v17.4s, v19.4s +mla v11.4S, v1.4S, v31.s[0] +str q10, [x0, #624] +ldr q10, [x0, #320] +sqrdmulh v1.4S, v7.4S, v29.s[1] +sub v19.4s, v10.4s, v3.4s +mul v7.4S, v7.4S,v30.s[1] +str q2, [x0, #688] +sqrdmulh v2.4S, v8.4S, v29.s[1] +add v10.4s, v10.4s, v3.4s +mul v8.4S, v8.4S,v30.s[1] +str q4, [x0, #752] +ldr q4, [x0, #0] +sqrdmulh v3.4S, v17.4S, v29.s[1] +sub v14.4s, v4.4s, v5.4s +mul v17.4S, v17.4S,v30.s[1] +add v4.4s, v4.4s, v5.4s +ldr q5, [x0, #64] +sqrdmulh v15.4S, v10.4S, v29.s[1] +sub v18.4s, v5.4s, v12.4s +mul v10.4S, v10.4S,v30.s[1] +add v5.4s, v5.4s, v12.4s +ldr q12, [x0, #128] +mla v7.4S, v1.4S, v31.s[0] +sub v1.4s, v12.4s, v16.4s +mla v8.4S, v2.4S, v31.s[0] +str q21, [x0, #816] +mla v17.4S, v3.4S, v31.s[0] +add v12.4s, v12.4s, v16.4s +mla v10.4S, v15.4S, v31.s[0] +str q22, [x0, 
#880] +ldr q22, [x0, #192] +sqrdmulh v15.4S, v6.4S, v29.s[2] +sub v16.4s, v22.4s, v11.4s +mul v6.4S, v6.4S,v30.s[2] +str q0, [x0, #944] +sqrdmulh v0.4S, v13.4S, v29.s[2] +add v22.4s, v22.4s, v11.4s +mul v13.4S, v13.4S,v30.s[2] +str q9, [x0, #1008] +sqrdmulh v9.4S, v20.4S, v29.s[2] +sub v11.4s, v12.4s, v7.4s +mul v20.4S, v20.4S,v30.s[2] +add v12.4s, v12.4s, v7.4s +sqrdmulh v7.4S, v19.4S, v29.s[2] +sub v3.4s, v22.4s, v8.4s +mul v19.4S, v19.4S,v30.s[2] +add v22.4s, v22.4s, v8.4s +mla v6.4S, v15.4S, v31.s[0] +sub v15.4s, v4.4s, v17.4s +mla v13.4S, v0.4S, v31.s[0] +mla v20.4S, v9.4S, v31.s[0] +add v4.4s, v4.4s, v17.4s +mla v19.4S, v7.4S, v31.s[0] +sqrdmulh v7.4S, v11.4S, v27.s[1] +sub v17.4s, v5.4s, v10.4s +mul v11.4S, v11.4S,v28.s[1] +sqrdmulh v9.4S, v3.4S, v27.s[1] +add v5.4s, v5.4s, v10.4s +mul v3.4S, v3.4S,v28.s[1] +sqrdmulh v10.4S, v12.4S, v27.s[0] +sub v0.4s, v1.4s, v6.4s +mul v12.4S, v12.4S,v28.s[0] +add v1.4s, v1.4s, v6.4s +sqrdmulh v6.4S, v22.4S, v27.s[0] +sub v8.4s, v16.4s, v13.4s +mul v22.4S, v22.4S,v28.s[0] +add v16.4s, v16.4s, v13.4s +mla v11.4S, v7.4S, v31.s[0] +sub v7.4s, v14.4s, v20.4s +mla v3.4S, v9.4S, v31.s[0] +mla v12.4S, v10.4S, v31.s[0] +add v14.4s, v14.4s, v20.4s +mla v22.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v1.4S, v27.s[2] +sub v20.4s, v18.4s, v19.4s +mul v1.4S, v1.4S,v28.s[2] +sqrdmulh v10.4S, v16.4S, v27.s[2] +add v18.4s, v18.4s, v19.4s +mul v16.4S, v16.4S,v28.s[2] +sqrdmulh v19.4S, v0.4S, v27.s[3] +sub v9.4s, v15.4s, v11.4s +mul v0.4S, v0.4S,v28.s[3] +add v15.4s, v15.4s, v11.4s +sqrdmulh v11.4S, v8.4S, v27.s[3] +sub v13.4s, v17.4s, v3.4s +mul v8.4S, v8.4S,v28.s[3] +add v17.4s, v17.4s, v3.4s +mla v1.4S, v6.4S, v31.s[0] +sub v6.4s, v4.4s, v12.4s +mla v16.4S, v10.4S, v31.s[0] +mla v0.4S, v19.4S, v31.s[0] +add v4.4s, v4.4s, v12.4s +mla v8.4S, v11.4S, v31.s[0] +sqrdmulh v11.4S, v17.4S, v25.s[2] +sub v12.4s, v5.4s, v22.4s +mul v17.4S, v17.4S,v26.s[2] +sqrdmulh v19.4S, v13.4S, v25.s[3] +add v5.4s, v5.4s, v22.4s +mul v13.4S, v13.4S,v26.s[3] +sqrdmulh 
v22.4S, v12.4S, v25.s[1] +sub v10.4s, v14.4s, v1.4s +mul v12.4S, v12.4S,v26.s[1] +add v14.4s, v14.4s, v1.4s +sqrdmulh v1.4S, v5.4S, v25.s[0] +sub v3.4s, v18.4s, v16.4s +mul v5.4S, v5.4S,v26.s[0] +add v18.4s, v18.4s, v16.4s +mla v17.4S, v11.4S, v31.s[0] +sub v11.4s, v7.4s, v0.4s +mla v13.4S, v19.4S, v31.s[0] +mla v12.4S, v22.4S, v31.s[0] +add v7.4s, v7.4s, v0.4s +mla v5.4S, v1.4S, v31.s[0] +sqrdmulh v1.4S, v18.4S, v23.s[0] +sub v0.4s, v20.4s, v8.4s +mul v18.4S, v18.4S,v24.s[0] +sqrdmulh v22.4S, v3.4S, v23.s[1] +add v20.4s, v20.4s, v8.4s +mul v3.4S, v3.4S,v24.s[1] +sqrdmulh v8.4S, v20.4S, v23.s[2] +sub v19.4s, v15.4s, v17.4s +mul v20.4S, v20.4S,v24.s[2] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v0.4S, v23.s[3] +sub v16.4s, v9.4s, v13.4s +mul v0.4S, v0.4S,v24.s[3] +add v9.4s, v9.4s, v13.4s +mla v18.4S, v1.4S, v31.s[0] +sub v1.4s, v6.4s, v12.4s +mla v3.4S, v22.4S, v31.s[0] +str q15, [x0, #256] +mla v20.4S, v8.4S, v31.s[0] +add v6.4s, v6.4s, v12.4s +mla v0.4S, v17.4S, v31.s[0] +str q19, [x0, #320] +ldr q19, [x0, #912] +sqrdmulh v17.4S, v19.4S, v29.s[0] +sub v12.4s, v4.4s, v5.4s +mul v19.4S, v19.4S,v30.s[0] +str q9, [x0, #384] +ldr q9, [x0, #976] +sqrdmulh v8.4S, v9.4S, v29.s[0] +add v4.4s, v4.4s, v5.4s +mul v9.4S, v9.4S,v30.s[0] +str q16, [x0, #448] +ldr q16, [x0, #784] +sqrdmulh v5.4S, v16.4S, v29.s[0] +sub v15.4s, v14.4s, v18.4s +mul v16.4S, v16.4S,v30.s[0] +add v14.4s, v14.4s, v18.4s +ldr q18, [x0, #848] +sqrdmulh v22.4S, v18.4S, v29.s[0] +sub v13.4s, v10.4s, v3.4s +mul v18.4S, v18.4S,v30.s[0] +add v10.4s, v10.4s, v3.4s +mla v19.4S, v17.4S, v31.s[0] +sub v17.4s, v7.4s, v20.4s +mla v9.4S, v8.4S, v31.s[0] +str q6, [x0, #128] +mla v16.4S, v5.4S, v31.s[0] +add v7.4s, v7.4s, v20.4s +mla v18.4S, v22.4S, v31.s[0] +str q1, [x0, #192] +ldr q1, [x0, #528] +sqrdmulh v22.4S, v1.4S, v29.s[0] +sub v20.4s, v11.4s, v0.4s +mul v1.4S, v1.4S,v30.s[0] +str q4, [x0, #0] +ldr q4, [x0, #592] +sqrdmulh v5.4S, v4.4S, v29.s[0] +add v11.4s, v11.4s, v0.4s +mul v4.4S, v4.4S,v30.s[0] +str 
q12, [x0, #64] +ldr q12, [x0, #656] +ldr q0, [x0, #400] +sqrdmulh v6.4S, v12.4S, v29.s[0] +sub v8.4s, v0.4s, v19.4s +mul v12.4S, v12.4S,v30.s[0] +add v0.4s, v0.4s, v19.4s +ldr q19, [x0, #720] +ldr q3, [x0, #464] +sqrdmulh v21.4S, v19.4S, v29.s[0] +sub v2.4s, v3.4s, v9.4s +mul v19.4S, v19.4S,v30.s[0] +add v3.4s, v3.4s, v9.4s +ldr q9, [x0, #272] +mla v1.4S, v22.4S, v31.s[0] +sub v22.4s, v9.4s, v16.4s +mla v4.4S, v5.4S, v31.s[0] +str q14, [x0, #512] +mla v12.4S, v6.4S, v31.s[0] +add v9.4s, v9.4s, v16.4s +mla v19.4S, v21.4S, v31.s[0] +str q15, [x0, #576] +ldr q15, [x0, #336] +sqrdmulh v21.4S, v0.4S, v29.s[1] +sub v16.4s, v15.4s, v18.4s +mul v0.4S, v0.4S,v30.s[1] +str q10, [x0, #640] +sqrdmulh v10.4S, v3.4S, v29.s[1] +add v15.4s, v15.4s, v18.4s +mul v3.4S, v3.4S,v30.s[1] +str q13, [x0, #704] +ldr q13, [x0, #16] +sqrdmulh v18.4S, v9.4S, v29.s[1] +sub v6.4s, v13.4s, v1.4s +mul v9.4S, v9.4S,v30.s[1] +add v13.4s, v13.4s, v1.4s +ldr q1, [x0, #80] +sqrdmulh v14.4S, v15.4S, v29.s[1] +sub v5.4s, v1.4s, v4.4s +mul v15.4S, v15.4S,v30.s[1] +add v1.4s, v1.4s, v4.4s +ldr q4, [x0, #144] +mla v0.4S, v21.4S, v31.s[0] +sub v21.4s, v4.4s, v12.4s +mla v3.4S, v10.4S, v31.s[0] +str q7, [x0, #768] +mla v9.4S, v18.4S, v31.s[0] +add v4.4s, v4.4s, v12.4s +mla v15.4S, v14.4S, v31.s[0] +str q17, [x0, #832] +ldr q17, [x0, #208] +sqrdmulh v14.4S, v8.4S, v29.s[2] +sub v12.4s, v17.4s, v19.4s +mul v8.4S, v8.4S,v30.s[2] +str q11, [x0, #896] +sqrdmulh v11.4S, v2.4S, v29.s[2] +add v17.4s, v17.4s, v19.4s +mul v2.4S, v2.4S,v30.s[2] +str q20, [x0, #960] +sqrdmulh v20.4S, v22.4S, v29.s[2] +sub v19.4s, v4.4s, v0.4s +mul v22.4S, v22.4S,v30.s[2] +add v4.4s, v4.4s, v0.4s +sqrdmulh v0.4S, v16.4S, v29.s[2] +sub v18.4s, v17.4s, v3.4s +mul v16.4S, v16.4S,v30.s[2] +add v17.4s, v17.4s, v3.4s +mla v8.4S, v14.4S, v31.s[0] +sub v14.4s, v13.4s, v9.4s +mla v2.4S, v11.4S, v31.s[0] +mla v22.4S, v20.4S, v31.s[0] +add v13.4s, v13.4s, v9.4s +mla v16.4S, v0.4S, v31.s[0] +sqrdmulh v0.4S, v19.4S, v27.s[1] +sub v9.4s, v1.4s, v15.4s 
+mul v19.4S, v19.4S,v28.s[1] +sqrdmulh v20.4S, v18.4S, v27.s[1] +add v1.4s, v1.4s, v15.4s +mul v18.4S, v18.4S,v28.s[1] +sqrdmulh v15.4S, v4.4S, v27.s[0] +sub v11.4s, v21.4s, v8.4s +mul v4.4S, v4.4S,v28.s[0] +add v21.4s, v21.4s, v8.4s +sqrdmulh v8.4S, v17.4S, v27.s[0] +sub v3.4s, v12.4s, v2.4s +mul v17.4S, v17.4S,v28.s[0] +add v12.4s, v12.4s, v2.4s +mla v19.4S, v0.4S, v31.s[0] +sub v0.4s, v6.4s, v22.4s +mla v18.4S, v20.4S, v31.s[0] +mla v4.4S, v15.4S, v31.s[0] +add v6.4s, v6.4s, v22.4s +mla v17.4S, v8.4S, v31.s[0] +sqrdmulh v8.4S, v21.4S, v27.s[2] +sub v22.4s, v5.4s, v16.4s +mul v21.4S, v21.4S,v28.s[2] +sqrdmulh v15.4S, v12.4S, v27.s[2] +add v5.4s, v5.4s, v16.4s +mul v12.4S, v12.4S,v28.s[2] +sqrdmulh v16.4S, v11.4S, v27.s[3] +sub v20.4s, v14.4s, v19.4s +mul v11.4S, v11.4S,v28.s[3] +add v14.4s, v14.4s, v19.4s +sqrdmulh v19.4S, v3.4S, v27.s[3] +sub v2.4s, v9.4s, v18.4s +mul v3.4S, v3.4S,v28.s[3] +add v9.4s, v9.4s, v18.4s +mla v21.4S, v8.4S, v31.s[0] +sub v8.4s, v13.4s, v4.4s +mla v12.4S, v15.4S, v31.s[0] +mla v11.4S, v16.4S, v31.s[0] +add v13.4s, v13.4s, v4.4s +mla v3.4S, v19.4S, v31.s[0] +sqrdmulh v19.4S, v9.4S, v25.s[2] +sub v4.4s, v1.4s, v17.4s +mul v9.4S, v9.4S,v26.s[2] +sqrdmulh v16.4S, v2.4S, v25.s[3] +add v1.4s, v1.4s, v17.4s +mul v2.4S, v2.4S,v26.s[3] +sqrdmulh v17.4S, v4.4S, v25.s[1] +sub v15.4s, v6.4s, v21.4s +mul v4.4S, v4.4S,v26.s[1] +add v6.4s, v6.4s, v21.4s +sqrdmulh v21.4S, v1.4S, v25.s[0] +sub v18.4s, v5.4s, v12.4s +mul v1.4S, v1.4S,v26.s[0] +add v5.4s, v5.4s, v12.4s +mla v9.4S, v19.4S, v31.s[0] +sub v19.4s, v0.4s, v11.4s +mla v2.4S, v16.4S, v31.s[0] +mla v4.4S, v17.4S, v31.s[0] +add v0.4s, v0.4s, v11.4s +mla v1.4S, v21.4S, v31.s[0] +sqrdmulh v21.4S, v5.4S, v23.s[0] +sub v11.4s, v22.4s, v3.4s +mul v5.4S, v5.4S,v24.s[0] +sqrdmulh v17.4S, v18.4S, v23.s[1] +add v22.4s, v22.4s, v3.4s +mul v18.4S, v18.4S,v24.s[1] +sqrdmulh v3.4S, v22.4S, v23.s[2] +sub v16.4s, v14.4s, v9.4s +mul v22.4S, v22.4S,v24.s[2] +add v14.4s, v14.4s, v9.4s +sqrdmulh v9.4S, v11.4S, 
v23.s[3] +sub v12.4s, v20.4s, v2.4s +mul v11.4S, v11.4S,v24.s[3] +add v20.4s, v20.4s, v2.4s +mla v5.4S, v21.4S, v31.s[0] +sub v21.4s, v8.4s, v4.4s +mla v18.4S, v17.4S, v31.s[0] +str q14, [x0, #272] +mla v22.4S, v3.4S, v31.s[0] +add v8.4s, v8.4s, v4.4s +mla v11.4S, v9.4S, v31.s[0] +str q16, [x0, #336] +sub v23.4s, v13.4s, v1.4s +str q20, [x0, #400] +add v13.4s, v13.4s, v1.4s +str q12, [x0, #464] +sub v12.4s, v6.4s, v5.4s +add v6.4s, v6.4s, v5.4s +sub v5.4s, v15.4s, v18.4s +add v15.4s, v15.4s, v18.4s +sub v18.4s, v0.4s, v22.4s +str q8, [x0, #144] +add v0.4s, v0.4s, v22.4s +str q21, [x0, #208] +sub v21.4s, v19.4s, v11.4s +str q13, [x0, #16] +add v19.4s, v19.4s, v11.4s +str q23, [x0, #80] +str q6, [x0, #528] +str q12, [x0, #592] +str q15, [x0, #656] +str q5, [x0, #720] +str q0, [x0, #784] +str q18, [x0, #848] +str q19, [x0, #912] +str q21, [x0, #976] +ldr q10, [x0, #224] +ldr q7, [x0, #160] +ldr q2, [x0, #32] +ldr q17, [x17, #+128] +ldr q14, [x17, #+144] +sqrdmulh v3.4S, v2.4S, v14.s[0] +mul v2.4S, v2.4S,v17.s[0] +ldr q4, [x0, #48] +sqrdmulh v9.4S, v4.4S, v14.s[0] +mul v4.4S, v4.4S,v17.s[0] +ldr q16, [x17, #+160] +ldr q30, [x17, #+176] +ldr q29, [x0, #96] +sqrdmulh v28.4S, v29.4S, v30.s[0] +mul v29.4S, v29.4S,v16.s[0] +ldr q27, [x0, #112] +sqrdmulh v26.4S, v27.4S, v30.s[0] +mul v27.4S, v27.4S,v16.s[0] +ldr q25, [x17, #+192] +ldr q24, [x17, #+208] +mla v2.4S, v3.4S, v31.s[0] +sqrdmulh v3.4S, v7.4S, v24.s[0] +ldr q20, [x0, #176] +mla v4.4S, v9.4S, v31.s[0] +sqrdmulh v9.4S, v20.4S, v24.s[0] +ldr q1, [x17, #+224] +ldr q8, [x17, #+240] +mla v29.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v10.4S, v8.s[0] +ldr q22, [x0, #240] +mla v27.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v22.4S, v8.s[0] +ldr q13, [x0, #0] +ldr q11, [x0, #128] +mul v7.4S, v7.4S,v25.s[0] +sub v23.4s, v13.4s, v2.4s +ldr q6, [x0, #16] +mul v20.4S, v20.4S,v25.s[0] +add v13.4s, v13.4s, v2.4s +ldr q2, [x0, #144] +mla v7.4S, v3.4S, v31.s[0] +sub v3.4s, v6.4s, v4.4s +ldr q12, [x0, #64] +mla v20.4S, v9.4S, v31.s[0] +add 
v6.4s, v6.4s, v4.4s +ldr q4, [x0, #192] +mul v10.4S, v10.4S,v1.s[0] +sub v9.4s, v12.4s, v29.4s +ldr q15, [x0, #80] +mul v22.4S, v22.4S,v1.s[0] +add v12.4s, v12.4s, v29.4s +ldr q29, [x0, #208] +mla v10.4S, v28.4S, v31.s[0] +mla v22.4S, v26.4S, v31.s[0] +sub v26.4s, v15.4s, v27.4s +sqrdmulh v28.4S, v6.4S, v14.s[1] +add v15.4s, v15.4s, v27.4s +mul v6.4S, v6.4S,v17.s[1] +sqrdmulh v27.4S, v3.4S, v14.s[2] +sub v5.4s, v11.4s, v7.4s +mul v3.4S, v3.4S,v17.s[2] +add v11.4s, v11.4s, v7.4s +sqrdmulh v14.4S, v15.4S, v30.s[1] +sub v17.4s, v2.4s, v20.4s +mul v15.4S, v15.4S,v16.s[1] +add v2.4s, v2.4s, v20.4s +sqrdmulh v20.4S, v26.4S, v30.s[2] +sub v7.4s, v4.4s, v10.4s +mul v26.4S, v26.4S,v16.s[2] +add v4.4s, v4.4s, v10.4s +mla v6.4S, v28.4S, v31.s[0] +sub v28.4s, v29.4s, v22.4s +ldr q30, [x0, #480] +sqrdmulh v16.4S, v2.4S, v24.s[1] +add v29.4s, v29.4s, v22.4s +mla v3.4S, v27.4S, v31.s[0] +ldr q27, [x0, #416] +sqrdmulh v22.4S, v17.4S, v24.s[2] +sub v10.4s, v13.4s, v6.4s +mla v15.4S, v14.4S, v31.s[0] +ldr q14, [x0, #288] +sqrdmulh v0.4S, v29.4S, v8.s[1] +add v13.4s, v13.4s, v6.4s +str q10, [x0, #16] +mla v26.4S, v20.4S, v31.s[0] +ldr q20, [x17, #+256] +ldr q10, [x17, #+272] +sqrdmulh v6.4S, v28.4S, v8.s[2] +sub v18.4s, v23.4s, v3.4s +str q13, [x0, #0] +mul v2.4S, v2.4S,v25.s[1] +add v23.4s, v23.4s, v3.4s +mul v17.4S, v17.4S,v25.s[2] +str q18, [x0, #48] +mla v2.4S, v16.4S, v31.s[0] +sub v16.4s, v12.4s, v15.4s +mla v17.4S, v22.4S, v31.s[0] +str q23, [x0, #32] +mul v29.4S, v29.4S,v1.s[1] +str q16, [x0, #80] +mul v28.4S, v28.4S,v1.s[2] +add v12.4s, v12.4s, v15.4s +str q12, [x0, #64] +mla v29.4S, v0.4S, v31.s[0] +sub v0.4s, v9.4s, v26.4s +str q0, [x0, #112] +mla v28.4S, v6.4S, v31.s[0] +add v9.4s, v9.4s, v26.4s +str q9, [x0, #96] +sqrdmulh v8.4S, v14.4S, v10.s[0] +sub v1.4s, v11.4s, v2.4s +mul v14.4S, v14.4S,v20.s[0] +str q1, [x0, #144] +ldr q1, [x0, #304] +sqrdmulh v9.4S, v1.4S, v10.s[0] +add v11.4s, v11.4s, v2.4s +mul v1.4S, v1.4S,v20.s[0] +str q11, [x0, #128] +ldr q11, [x17, #+288] 
+ldr q2, [x17, #+304] +ldr q26, [x0, #352] +sqrdmulh v6.4S, v26.4S, v2.s[0] +sub v0.4s, v5.4s, v17.4s +mul v26.4S, v26.4S,v11.s[0] +str q0, [x0, #176] +ldr q0, [x0, #368] +sqrdmulh v12.4S, v0.4S, v2.s[0] +add v5.4s, v5.4s, v17.4s +mul v0.4S, v0.4S,v11.s[0] +str q5, [x0, #160] +ldr q5, [x17, #+320] +ldr q17, [x17, #+336] +mla v14.4S, v8.4S, v31.s[0] +sub v8.4s, v4.4s, v29.4s +sqrdmulh v15.4S, v27.4S, v17.s[0] +str q8, [x0, #208] +ldr q8, [x0, #432] +mla v1.4S, v9.4S, v31.s[0] +add v4.4s, v4.4s, v29.4s +sqrdmulh v29.4S, v8.4S, v17.s[0] +str q4, [x0, #192] +ldr q4, [x17, #+352] +ldr q9, [x17, #+368] +mla v26.4S, v6.4S, v31.s[0] +sub v6.4s, v7.4s, v28.4s +sqrdmulh v16.4S, v30.4S, v9.s[0] +str q6, [x0, #240] +ldr q6, [x0, #496] +mla v0.4S, v12.4S, v31.s[0] +add v7.4s, v7.4s, v28.4s +sqrdmulh v28.4S, v6.4S, v9.s[0] +str q7, [x0, #224] +ldr q7, [x0, #256] +ldr q12, [x0, #384] +mul v27.4S, v27.4S,v5.s[0] +sub v24.4s, v7.4s, v14.4s +ldr q25, [x0, #272] +mul v8.4S, v8.4S,v5.s[0] +add v7.4s, v7.4s, v14.4s +ldr q14, [x0, #400] +mla v27.4S, v15.4S, v31.s[0] +sub v15.4s, v25.4s, v1.4s +ldr q23, [x0, #320] +mla v8.4S, v29.4S, v31.s[0] +add v25.4s, v25.4s, v1.4s +ldr q1, [x0, #448] +mul v30.4S, v30.4S,v4.s[0] +sub v29.4s, v23.4s, v26.4s +ldr q22, [x0, #336] +mul v6.4S, v6.4S,v4.s[0] +add v23.4s, v23.4s, v26.4s +ldr q26, [x0, #464] +mla v30.4S, v16.4S, v31.s[0] +mla v6.4S, v28.4S, v31.s[0] +sub v28.4s, v22.4s, v0.4s +sqrdmulh v16.4S, v25.4S, v10.s[1] +add v22.4s, v22.4s, v0.4s +mul v25.4S, v25.4S,v20.s[1] +sqrdmulh v0.4S, v15.4S, v10.s[2] +sub v18.4s, v12.4s, v27.4s +mul v15.4S, v15.4S,v20.s[2] +add v12.4s, v12.4s, v27.4s +sqrdmulh v10.4S, v22.4S, v2.s[1] +sub v20.4s, v14.4s, v8.4s +mul v22.4S, v22.4S,v11.s[1] +add v14.4s, v14.4s, v8.4s +sqrdmulh v8.4S, v28.4S, v2.s[2] +sub v27.4s, v1.4s, v30.4s +mul v28.4S, v28.4S,v11.s[2] +add v1.4s, v1.4s, v30.4s +mla v25.4S, v16.4S, v31.s[0] +sub v16.4s, v26.4s, v6.4s +ldr q2, [x0, #736] +sqrdmulh v11.4S, v14.4S, v17.s[1] +add v26.4s, v26.4s, 
v6.4s +mla v15.4S, v0.4S, v31.s[0] +ldr q0, [x0, #672] +sqrdmulh v6.4S, v20.4S, v17.s[2] +sub v30.4s, v7.4s, v25.4s +mla v22.4S, v10.4S, v31.s[0] +ldr q10, [x0, #544] +sqrdmulh v3.4S, v26.4S, v9.s[1] +add v7.4s, v7.4s, v25.4s +str q30, [x0, #272] +mla v28.4S, v8.4S, v31.s[0] +ldr q8, [x17, #+384] +ldr q30, [x17, #+400] +sqrdmulh v25.4S, v16.4S, v9.s[2] +sub v13.4s, v24.4s, v15.4s +str q7, [x0, #256] +mul v14.4S, v14.4S,v5.s[1] +add v24.4s, v24.4s, v15.4s +mul v20.4S, v20.4S,v5.s[2] +str q13, [x0, #304] +mla v14.4S, v11.4S, v31.s[0] +sub v11.4s, v23.4s, v22.4s +mla v20.4S, v6.4S, v31.s[0] +str q24, [x0, #288] +mul v26.4S, v26.4S,v4.s[1] +str q11, [x0, #336] +mul v16.4S, v16.4S,v4.s[2] +add v23.4s, v23.4s, v22.4s +str q23, [x0, #320] +mla v26.4S, v3.4S, v31.s[0] +sub v3.4s, v29.4s, v28.4s +str q3, [x0, #368] +mla v16.4S, v25.4S, v31.s[0] +add v29.4s, v29.4s, v28.4s +str q29, [x0, #352] +sqrdmulh v9.4S, v10.4S, v30.s[0] +sub v4.4s, v12.4s, v14.4s +mul v10.4S, v10.4S,v8.s[0] +str q4, [x0, #400] +ldr q4, [x0, #560] +sqrdmulh v29.4S, v4.4S, v30.s[0] +add v12.4s, v12.4s, v14.4s +mul v4.4S, v4.4S,v8.s[0] +str q12, [x0, #384] +ldr q12, [x17, #+416] +ldr q14, [x17, #+432] +ldr q28, [x0, #608] +sqrdmulh v25.4S, v28.4S, v14.s[0] +sub v3.4s, v18.4s, v20.4s +mul v28.4S, v28.4S,v12.s[0] +str q3, [x0, #432] +ldr q3, [x0, #624] +sqrdmulh v23.4S, v3.4S, v14.s[0] +add v18.4s, v18.4s, v20.4s +mul v3.4S, v3.4S,v12.s[0] +str q18, [x0, #416] +ldr q18, [x17, #+448] +ldr q20, [x17, #+464] +mla v10.4S, v9.4S, v31.s[0] +sub v9.4s, v1.4s, v26.4s +sqrdmulh v22.4S, v0.4S, v20.s[0] +str q9, [x0, #464] +ldr q9, [x0, #688] +mla v4.4S, v29.4S, v31.s[0] +add v1.4s, v1.4s, v26.4s +sqrdmulh v26.4S, v9.4S, v20.s[0] +str q1, [x0, #448] +ldr q1, [x17, #+480] +ldr q29, [x17, #+496] +mla v28.4S, v25.4S, v31.s[0] +sub v25.4s, v27.4s, v16.4s +sqrdmulh v11.4S, v2.4S, v29.s[0] +str q25, [x0, #496] +ldr q25, [x0, #752] +mla v3.4S, v23.4S, v31.s[0] +add v27.4s, v27.4s, v16.4s +sqrdmulh v16.4S, v25.4S, v29.s[0] 
+str q27, [x0, #480] +ldr q27, [x0, #512] +ldr q23, [x0, #640] +mul v0.4S, v0.4S,v18.s[0] +sub v17.4s, v27.4s, v10.4s +ldr q5, [x0, #528] +mul v9.4S, v9.4S,v18.s[0] +add v27.4s, v27.4s, v10.4s +ldr q10, [x0, #656] +mla v0.4S, v22.4S, v31.s[0] +sub v22.4s, v5.4s, v4.4s +ldr q24, [x0, #576] +mla v9.4S, v26.4S, v31.s[0] +add v5.4s, v5.4s, v4.4s +ldr q4, [x0, #704] +mul v2.4S, v2.4S,v1.s[0] +sub v26.4s, v24.4s, v28.4s +ldr q6, [x0, #592] +mul v25.4S, v25.4S,v1.s[0] +add v24.4s, v24.4s, v28.4s +ldr q28, [x0, #720] +mla v2.4S, v11.4S, v31.s[0] +mla v25.4S, v16.4S, v31.s[0] +sub v16.4s, v6.4s, v3.4s +sqrdmulh v11.4S, v5.4S, v30.s[1] +add v6.4s, v6.4s, v3.4s +mul v5.4S, v5.4S,v8.s[1] +sqrdmulh v3.4S, v22.4S, v30.s[2] +sub v13.4s, v23.4s, v0.4s +mul v22.4S, v22.4S,v8.s[2] +add v23.4s, v23.4s, v0.4s +sqrdmulh v30.4S, v6.4S, v14.s[1] +sub v8.4s, v10.4s, v9.4s +mul v6.4S, v6.4S,v12.s[1] +add v10.4s, v10.4s, v9.4s +sqrdmulh v9.4S, v16.4S, v14.s[2] +sub v0.4s, v4.4s, v2.4s +mul v16.4S, v16.4S,v12.s[2] +add v4.4s, v4.4s, v2.4s +mla v5.4S, v11.4S, v31.s[0] +sub v11.4s, v28.4s, v25.4s +ldr q14, [x0, #992] +sqrdmulh v12.4S, v10.4S, v20.s[1] +add v28.4s, v28.4s, v25.4s +mla v22.4S, v3.4S, v31.s[0] +ldr q3, [x0, #928] +sqrdmulh v25.4S, v8.4S, v20.s[2] +sub v2.4s, v27.4s, v5.4s +mla v6.4S, v30.4S, v31.s[0] +ldr q30, [x0, #800] +sqrdmulh v15.4S, v28.4S, v29.s[1] +add v27.4s, v27.4s, v5.4s +str q2, [x0, #528] +mla v16.4S, v9.4S, v31.s[0] +ldr q9, [x17, #+512] +ldr q2, [x17, #+528] +sqrdmulh v5.4S, v11.4S, v29.s[2] +sub v7.4s, v17.4s, v22.4s +str q27, [x0, #512] +mul v10.4S, v10.4S,v18.s[1] +add v17.4s, v17.4s, v22.4s +mul v8.4S, v8.4S,v18.s[2] +str q7, [x0, #560] +mla v10.4S, v12.4S, v31.s[0] +sub v12.4s, v24.4s, v6.4s +mla v8.4S, v25.4S, v31.s[0] +str q17, [x0, #544] +mul v28.4S, v28.4S,v1.s[1] +str q12, [x0, #592] +mul v11.4S, v11.4S,v1.s[2] +add v24.4s, v24.4s, v6.4s +str q24, [x0, #576] +mla v28.4S, v15.4S, v31.s[0] +sub v15.4s, v26.4s, v16.4s +str q15, [x0, #624] +mla v11.4S, v5.4S, 
v31.s[0] +add v26.4s, v26.4s, v16.4s +str q26, [x0, #608] +sqrdmulh v29.4S, v30.4S, v2.s[0] +sub v1.4s, v23.4s, v10.4s +mul v30.4S, v30.4S,v9.s[0] +str q1, [x0, #656] +ldr q1, [x0, #816] +sqrdmulh v26.4S, v1.4S, v2.s[0] +add v23.4s, v23.4s, v10.4s +mul v1.4S, v1.4S,v9.s[0] +str q23, [x0, #640] +ldr q23, [x17, #+544] +ldr q10, [x17, #+560] +ldr q16, [x0, #864] +sqrdmulh v5.4S, v16.4S, v10.s[0] +sub v15.4s, v13.4s, v8.4s +mul v16.4S, v16.4S,v23.s[0] +str q15, [x0, #688] +ldr q15, [x0, #880] +sqrdmulh v24.4S, v15.4S, v10.s[0] +add v13.4s, v13.4s, v8.4s +mul v15.4S, v15.4S,v23.s[0] +str q13, [x0, #672] +ldr q13, [x17, #+576] +ldr q8, [x17, #+592] +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v4.4s, v28.4s +sqrdmulh v6.4S, v3.4S, v8.s[0] +str q29, [x0, #720] +ldr q29, [x0, #944] +mla v1.4S, v26.4S, v31.s[0] +add v4.4s, v4.4s, v28.4s +sqrdmulh v28.4S, v29.4S, v8.s[0] +str q4, [x0, #704] +ldr q4, [x17, #+608] +ldr q26, [x17, #+624] +mla v16.4S, v5.4S, v31.s[0] +sub v5.4s, v0.4s, v11.4s +sqrdmulh v12.4S, v14.4S, v26.s[0] +str q5, [x0, #752] +ldr q5, [x0, #1008] +mla v15.4S, v24.4S, v31.s[0] +add v0.4s, v0.4s, v11.4s +sqrdmulh v11.4S, v5.4S, v26.s[0] +str q0, [x0, #736] +ldr q0, [x0, #768] +ldr q24, [x0, #896] +mul v3.4S, v3.4S,v13.s[0] +sub v20.4s, v0.4s, v30.4s +ldr q18, [x0, #784] +mul v29.4S, v29.4S,v13.s[0] +add v0.4s, v0.4s, v30.4s +ldr q30, [x0, #912] +mla v3.4S, v6.4S, v31.s[0] +sub v6.4s, v18.4s, v1.4s +ldr q17, [x0, #832] +mla v29.4S, v28.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +ldr q1, [x0, #960] +mul v14.4S, v14.4S,v4.s[0] +sub v28.4s, v17.4s, v16.4s +ldr q25, [x0, #848] +mul v5.4S, v5.4S,v4.s[0] +add v17.4s, v17.4s, v16.4s +ldr q16, [x0, #976] +mla v14.4S, v12.4S, v31.s[0] +mla v5.4S, v11.4S, v31.s[0] +sub v11.4s, v25.4s, v15.4s +sqrdmulh v12.4S, v18.4S, v2.s[1] +add v25.4s, v25.4s, v15.4s +mul v18.4S, v18.4S,v9.s[1] +sqrdmulh v15.4S, v6.4S, v2.s[2] +sub v7.4s, v24.4s, v3.4s +mul v6.4S, v6.4S,v9.s[2] +add v24.4s, v24.4s, v3.4s +sqrdmulh v2.4S, v25.4S, v10.s[1] 
+sub v9.4s, v30.4s, v29.4s +mul v25.4S, v25.4S,v23.s[1] +add v30.4s, v30.4s, v29.4s +sqrdmulh v29.4S, v11.4S, v10.s[2] +sub v3.4s, v1.4s, v14.4s +mul v11.4S, v11.4S,v23.s[2] +add v1.4s, v1.4s, v14.4s +mla v18.4S, v12.4S, v31.s[0] +sub v12.4s, v16.4s, v5.4s +sqrdmulh v10.4S, v30.4S, v8.s[1] +add v16.4s, v16.4s, v5.4s +mla v6.4S, v15.4S, v31.s[0] +sqrdmulh v15.4S, v9.4S, v8.s[2] +sub v5.4s, v0.4s, v18.4s +mla v25.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v16.4S, v26.s[1] +add v0.4s, v0.4s, v18.4s +str q5, [x0, #784] +mla v11.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v12.4S, v26.s[2] +sub v5.4s, v20.4s, v6.4s +str q0, [x0, #768] +mul v30.4S, v30.4S,v13.s[1] +add v20.4s, v20.4s, v6.4s +mul v9.4S, v9.4S,v13.s[2] +str q5, [x0, #816] +mla v30.4S, v10.4S, v31.s[0] +sub v10.4s, v17.4s, v25.4s +mla v9.4S, v15.4S, v31.s[0] +str q20, [x0, #800] +mul v16.4S, v16.4S,v4.s[1] +str q10, [x0, #848] +mul v12.4S, v12.4S,v4.s[2] +add v17.4s, v17.4s, v25.4s +str q17, [x0, #832] +mla v16.4S, v2.4S, v31.s[0] +sub v2.4s, v28.4s, v11.4s +str q2, [x0, #880] +mla v12.4S, v29.4S, v31.s[0] +add v28.4s, v28.4s, v11.4s +str q28, [x0, #864] +sub v26.4s, v24.4s, v30.4s +str q26, [x0, #912] +add v24.4s, v24.4s, v30.4s +str q24, [x0, #896] +sub v24.4s, v7.4s, v9.4s +str q24, [x0, #944] +add v7.4s, v7.4s, v9.4s +str q7, [x0, #928] +sub v7.4s, v1.4s, v16.4s +str q7, [x0, #976] +add v1.4s, v1.4s, v16.4s +str q1, [x0, #960] +sub v1.4s, v3.4s, v12.4s +str q1, [x0, #1008] +add v3.4s, v3.4s, v12.4s +str q3, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git 
a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_9_z4_7.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_9_z4_7.s new file mode 100644 index 0000000..c98a11f --- /dev/null +++ b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_9_z4_7.s @@ -0,0 +1,1494 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// +#include // NOTE(review): no header name is visible after this directive, and ASM_LOAD (used by the routine below) is not defined in this file - confirm the intended header against the generator output +modulus: // one 128-bit row: word 0 = -33556993, words 1-3 = 0; the routine below loads it whole into q31 and the visible code reads only lane v31.s[0], as the mla multiplier +.word -33556993 +.word 0 +.word 0 +.word 0 +.align 6 // power-of-two operand on AArch64 assemblers, i.e. roots_merged starts on a 64-byte boundary +roots_merged: // table of 4-word (16-byte) rows, row k at byte offset 16*k. Per the labels, each row of roots is followed by a row carrying the same layer/block labels; the routine below feeds the first row to mul (e.g. q30 = offset 0) and the second to sqrdmulh (e.g. q29 = offset 16). The second-row values look like round(root * 2^31 / 33556993), e.g. 17702291 -> 1132860160 - TODO confirm against the generator. Words labelled Layer None are zero padding. +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 14626653 // Layer 3, block 1 +.word 29737761 // Layer 3, block 2 +.word 30285189 // Layer 3, block 3 +.word 1307297022 // Layer 3, block 0 +.word 936034350 // Layer 3, block 1 +.word 1903071454 // Layer 3, block 2 +.word 1938104173 // Layer 3, block 3 +.word 21289485 // Layer 3, block 4 +.word 9914896 // Layer 3, block 5 +.word 22603682 // Layer 3, block 6 +.word 16204162 // Layer 3, block 7 +.word 1362423055 // Layer 3, block 4 +.word 634504916 // Layer 3, block 5 +.word 1446525244 // Layer 3, block 6 +.word 1036987221 // Layer 3, block 7 +.word 23825509 // Layer 4, block 0 +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 1524716204 // Layer 4, block 0 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 0 // Layer None, block None +.word 27028662 // Layer 4, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 1729702351 // Layer 4, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 0 // Layer None, block None +.word 14833295 // Layer 4, block 2 +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 0 // Layer None, block None +.word
949258429 // Layer 4, block 2 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 0 // Layer None, block None +.word 2138810 // Layer 4, block 3 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 136873393 // Layer 4, block 3 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 0 // Layer None, block None +.word 6490403 // Layer 4, block 4 +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 415354091 // Layer 4, block 4 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 0 // Layer None, block None +.word 19648405 // Layer 4, block 5 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 1257401950 // Layer 4, block 5 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 0 // Layer None, block None +.word 31254932 // Layer 4, block 6 +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 2000162988 // Layer 4, block 6 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 0 // Layer None, block None +.word 26362414 // Layer 4, block 7 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 1687065733 // Layer 4, block 7 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 0 // Layer None, block None +.word 572895 // Layer 4, block 8 +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 36662482 // Layer 4, block 8 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 0 // Layer None, block None +.word 26691971 // Layer 4, block 9 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19
+.word 0 // Layer None, block None +.word 1708155771 // Layer 4, block 9 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 0 // Layer None, block None +.word 9249292 // Layer 4, block 10 +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 591909511 // Layer 4, block 10 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 0 // Layer None, block None +.word 29292862 // Layer 4, block 11 +.word 25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 1874600091 // Layer 4, block 11 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 0 // Layer None, block None +.word 8247799 // Layer 4, block 12 +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 527818851 // Layer 4, block 12 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 0 // Layer None, block None +.word 5086187 // Layer 4, block 13 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 325491125 // Layer 4, block 13 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 0 // Layer None, block None +.word 28113639 // Layer 4, block 14 +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 1799135579 // Layer 4, block 14 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 0 // Layer None, block None +.word 8471290 // Layer 4, block 15 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 0 // Layer None, block None +.word 542121183 // Layer 4, block 15 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.word 0 // Layer None, block None +.text +.global
ntt_u32_incomplete_neon_asm_var_4_2_9_z4_7 +.global _ntt_u32_incomplete_neon_asm_var_4_2_9_z4_7 +ntt_u32_incomplete_neon_asm_var_4_2_9_z4_7: +_ntt_u32_incomplete_neon_asm_var_4_2_9_z4_7: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save NEON vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ASM_LOAD (x17, modulus) +ldr q31, [x17] +ASM_LOAD(x17, roots_merged) +ldr q30, [x17, #+0] +ldr q29, [x17, #+16] +ldr q28, [x17, #+32] +ldr q27, [x17, #+48] +ldr q26, [x17, #+64] +ldr q25, [x17, #+80] +ldr q24, [x17, #+96] +ldr q23, [x17, #+112] +ldr q22, [x0, #928] +sqrdmulh v21.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +ldr q20, [x0, #992] +sqrdmulh v19.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +ldr q18, [x0, #800] +sqrdmulh v17.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +ldr q16, [x0, #864] +sqrdmulh v3.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +mla v22.4S, v21.4S, v31.s[0] +mla v20.4S, v19.4S, v31.s[0] +mla v18.4S, v17.4S, v31.s[0] +mla v16.4S, v3.4S, v31.s[0] +ldr q3, [x0, #544] +sqrdmulh v17.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +ldr q19, [x0, #608] +sqrdmulh v21.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +ldr q2, [x0, #672] +ldr q1, [x0, #416] +sqrdmulh v0.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v15.4s, v1.4s, v22.4s +add v1.4s, v1.4s, v22.4s +ldr q22, [x0, #736] +ldr q14, [x0, #480] +sqrdmulh v13.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v12.4s, v14.4s, v20.4s +add v14.4s, v14.4s, v20.4s +ldr q20, [x0, #288] +mla v3.4S, v17.4S, v31.s[0] +mla v19.4S, v21.4S, v31.s[0] +sub v21.4s, v20.4s, v18.4s +mla v2.4S, v0.4S, v31.s[0] +mla v22.4S, v13.4S, v31.s[0] +add v20.4s, v20.4s, v18.4s +ldr q18, [x0, #352] +sqrdmulh v13.4S, v1.4S, 
v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v0.4s, v18.4s, v16.4s +sqrdmulh v17.4S, v14.4S, v29.s[1] +mul v14.4S, v14.4S,v30.s[1] +add v18.4s, v18.4s, v16.4s +ldr q16, [x0, #32] +sqrdmulh v11.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v10.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #96] +sqrdmulh v9.4S, v18.4S, v29.s[1] +mul v18.4S, v18.4S,v30.s[1] +sub v8.4s, v3.4s, v19.4s +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #160] +mla v1.4S, v13.4S, v31.s[0] +mla v14.4S, v17.4S, v31.s[0] +sub v17.4s, v19.4s, v2.4s +mla v20.4S, v11.4S, v31.s[0] +mla v18.4S, v9.4S, v31.s[0] +add v19.4s, v19.4s, v2.4s +ldr q2, [x0, #224] +sqrdmulh v9.4S, v15.4S, v29.s[2] +mul v15.4S, v15.4S,v30.s[2] +sub v11.4s, v2.4s, v22.4s +sqrdmulh v13.4S, v12.4S, v29.s[2] +mul v12.4S, v12.4S,v30.s[2] +add v2.4s, v2.4s, v22.4s +sqrdmulh v22.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v7.4s, v19.4s, v1.4s +add v19.4s, v19.4s, v1.4s +sqrdmulh v1.4S, v0.4S, v29.s[2] +mul v0.4S, v0.4S,v30.s[2] +sub v6.4s, v2.4s, v14.4s +add v2.4s, v2.4s, v14.4s +mla v15.4S, v9.4S, v31.s[0] +mla v12.4S, v13.4S, v31.s[0] +sub v13.4s, v16.4s, v20.4s +mla v21.4S, v22.4S, v31.s[0] +mla v0.4S, v1.4S, v31.s[0] +add v16.4s, v16.4s, v20.4s +sqrdmulh v20.4S, v7.4S, v27.s[1] +mul v7.4S, v7.4S,v28.s[1] +sub v1.4s, v3.4s, v18.4s +sqrdmulh v22.4S, v6.4S, v27.s[1] +mul v6.4S, v6.4S,v28.s[1] +add v3.4s, v3.4s, v18.4s +sqrdmulh v18.4S, v19.4S, v27.s[0] +mul v19.4S, v19.4S,v28.s[0] +sub v9.4s, v17.4s, v15.4s +add v17.4s, v17.4s, v15.4s +sqrdmulh v15.4S, v2.4S, v27.s[0] +mul v2.4S, v2.4S,v28.s[0] +sub v14.4s, v11.4s, v12.4s +add v11.4s, v11.4s, v12.4s +mla v7.4S, v20.4S, v31.s[0] +mla v6.4S, v22.4S, v31.s[0] +sub v22.4s, v10.4s, v21.4s +mla v19.4S, v18.4S, v31.s[0] +mla v2.4S, v15.4S, v31.s[0] +add v10.4s, v10.4s, v21.4s +sqrdmulh v21.4S, v17.4S, v27.s[2] +mul v17.4S, v17.4S,v28.s[2] +sub v15.4s, v8.4s, v0.4s +sqrdmulh v18.4S, v11.4S, v27.s[2] +mul v11.4S, v11.4S,v28.s[2] +add v8.4s, v8.4s, v0.4s +sqrdmulh 
v0.4S, v9.4S, v27.s[3] +mul v9.4S, v9.4S,v28.s[3] +sub v20.4s, v13.4s, v7.4s +add v13.4s, v13.4s, v7.4s +sqrdmulh v7.4S, v14.4S, v27.s[3] +mul v14.4S, v14.4S,v28.s[3] +sub v12.4s, v1.4s, v6.4s +add v1.4s, v1.4s, v6.4s +mla v17.4S, v21.4S, v31.s[0] +mla v11.4S, v18.4S, v31.s[0] +sub v18.4s, v16.4s, v19.4s +mla v9.4S, v0.4S, v31.s[0] +mla v14.4S, v7.4S, v31.s[0] +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v1.4S, v25.s[2] +mul v1.4S, v1.4S,v26.s[2] +sub v7.4s, v3.4s, v2.4s +sqrdmulh v0.4S, v12.4S, v25.s[3] +mul v12.4S, v12.4S,v26.s[3] +add v3.4s, v3.4s, v2.4s +sqrdmulh v2.4S, v7.4S, v25.s[1] +mul v7.4S, v7.4S,v26.s[1] +sub v21.4s, v10.4s, v17.4s +add v10.4s, v10.4s, v17.4s +sqrdmulh v17.4S, v3.4S, v25.s[0] +mul v3.4S, v3.4S,v26.s[0] +sub v6.4s, v8.4s, v11.4s +add v8.4s, v8.4s, v11.4s +mla v1.4S, v19.4S, v31.s[0] +mla v12.4S, v0.4S, v31.s[0] +sub v0.4s, v22.4s, v9.4s +mla v7.4S, v2.4S, v31.s[0] +mla v3.4S, v17.4S, v31.s[0] +add v22.4s, v22.4s, v9.4s +sqrdmulh v9.4S, v8.4S, v23.s[0] +mul v8.4S, v8.4S,v24.s[0] +sub v17.4s, v15.4s, v14.4s +sqrdmulh v2.4S, v6.4S, v23.s[1] +mul v6.4S, v6.4S,v24.s[1] +add v15.4s, v15.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v23.s[2] +mul v15.4S, v15.4S,v24.s[2] +sub v19.4s, v13.4s, v1.4s +add v13.4s, v13.4s, v1.4s +sqrdmulh v1.4S, v17.4S, v23.s[3] +mul v17.4S, v17.4S,v24.s[3] +sub v11.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +mla v8.4S, v9.4S, v31.s[0] +mla v6.4S, v2.4S, v31.s[0] +sub v2.4s, v18.4s, v7.4s +str q13, [x0, #288] +mla v15.4S, v14.4S, v31.s[0] +mla v17.4S, v1.4S, v31.s[0] +add v18.4s, v18.4s, v7.4s +str q19, [x0, #352] +ldr q19, [x0, #944] +sqrdmulh v7.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +str q20, [x0, #416] +sub v20.4s, v16.4s, v3.4s +ldr q1, [x0, #1008] +sqrdmulh v14.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +str q11, [x0, #480] +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #816] +sqrdmulh v11.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +sub v13.4s, v10.4s, v8.4s +add v10.4s, v10.4s, v8.4s +ldr q8, [x0, 
#880] +sqrdmulh v9.4S, v8.4S, v29.s[0] +mul v8.4S, v8.4S,v30.s[0] +sub v12.4s, v21.4s, v6.4s +add v21.4s, v21.4s, v6.4s +mla v19.4S, v7.4S, v31.s[0] +mla v1.4S, v14.4S, v31.s[0] +str q18, [x0, #160] +sub v18.4s, v22.4s, v15.4s +mla v3.4S, v11.4S, v31.s[0] +mla v8.4S, v9.4S, v31.s[0] +str q2, [x0, #224] +add v22.4s, v22.4s, v15.4s +ldr q15, [x0, #560] +sqrdmulh v2.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +str q16, [x0, #32] +sub v16.4s, v0.4s, v17.4s +ldr q9, [x0, #624] +sqrdmulh v11.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +str q20, [x0, #96] +add v0.4s, v0.4s, v17.4s +ldr q17, [x0, #688] +ldr q20, [x0, #432] +sqrdmulh v14.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] +sub v7.4s, v20.4s, v19.4s +add v20.4s, v20.4s, v19.4s +ldr q19, [x0, #752] +ldr q6, [x0, #496] +sqrdmulh v5.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v4.4s, v6.4s, v1.4s +add v6.4s, v6.4s, v1.4s +ldr q1, [x0, #304] +mla v15.4S, v2.4S, v31.s[0] +mla v9.4S, v11.4S, v31.s[0] +str q10, [x0, #544] +sub v10.4s, v1.4s, v3.4s +mla v17.4S, v14.4S, v31.s[0] +mla v19.4S, v5.4S, v31.s[0] +str q13, [x0, #608] +add v1.4s, v1.4s, v3.4s +ldr q3, [x0, #368] +sqrdmulh v13.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +str q21, [x0, #672] +sub v21.4s, v3.4s, v8.4s +sqrdmulh v5.4S, v6.4S, v29.s[1] +mul v6.4S, v6.4S,v30.s[1] +str q12, [x0, #736] +add v3.4s, v3.4s, v8.4s +ldr q8, [x0, #48] +sqrdmulh v12.4S, v1.4S, v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v14.4s, v8.4s, v15.4s +add v8.4s, v8.4s, v15.4s +ldr q15, [x0, #112] +sqrdmulh v11.4S, v3.4S, v29.s[1] +mul v3.4S, v3.4S,v30.s[1] +sub v2.4s, v15.4s, v9.4s +add v15.4s, v15.4s, v9.4s +ldr q9, [x0, #176] +mla v20.4S, v13.4S, v31.s[0] +mla v6.4S, v5.4S, v31.s[0] +str q22, [x0, #800] +sub v22.4s, v9.4s, v17.4s +mla v1.4S, v12.4S, v31.s[0] +mla v3.4S, v11.4S, v31.s[0] +str q18, [x0, #864] +add v9.4s, v9.4s, v17.4s +ldr q17, [x0, #240] +sqrdmulh v18.4S, v7.4S, v29.s[2] +mul v7.4S, v7.4S,v30.s[2] +str q0, [x0, #928] +sub v0.4s, v17.4s, v19.4s 
+sqrdmulh v11.4S, v4.4S, v29.s[2] +mul v4.4S, v4.4S,v30.s[2] +str q16, [x0, #992] +add v17.4s, v17.4s, v19.4s +sqrdmulh v19.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v16.4s, v9.4s, v20.4s +add v9.4s, v9.4s, v20.4s +sqrdmulh v20.4S, v21.4S, v29.s[2] +mul v21.4S, v21.4S,v30.s[2] +sub v12.4s, v17.4s, v6.4s +add v17.4s, v17.4s, v6.4s +mla v7.4S, v18.4S, v31.s[0] +mla v4.4S, v11.4S, v31.s[0] +sub v11.4s, v8.4s, v1.4s +mla v10.4S, v19.4S, v31.s[0] +mla v21.4S, v20.4S, v31.s[0] +add v8.4s, v8.4s, v1.4s +sqrdmulh v1.4S, v16.4S, v27.s[1] +mul v16.4S, v16.4S,v28.s[1] +sub v20.4s, v15.4s, v3.4s +sqrdmulh v19.4S, v12.4S, v27.s[1] +mul v12.4S, v12.4S,v28.s[1] +add v15.4s, v15.4s, v3.4s +sqrdmulh v3.4S, v9.4S, v27.s[0] +mul v9.4S, v9.4S,v28.s[0] +sub v18.4s, v22.4s, v7.4s +add v22.4s, v22.4s, v7.4s +sqrdmulh v7.4S, v17.4S, v27.s[0] +mul v17.4S, v17.4S,v28.s[0] +sub v6.4s, v0.4s, v4.4s +add v0.4s, v0.4s, v4.4s +mla v16.4S, v1.4S, v31.s[0] +mla v12.4S, v19.4S, v31.s[0] +sub v19.4s, v14.4s, v10.4s +mla v9.4S, v3.4S, v31.s[0] +mla v17.4S, v7.4S, v31.s[0] +add v14.4s, v14.4s, v10.4s +sqrdmulh v10.4S, v22.4S, v27.s[2] +mul v22.4S, v22.4S,v28.s[2] +sub v7.4s, v2.4s, v21.4s +sqrdmulh v3.4S, v0.4S, v27.s[2] +mul v0.4S, v0.4S,v28.s[2] +add v2.4s, v2.4s, v21.4s +sqrdmulh v21.4S, v18.4S, v27.s[3] +mul v18.4S, v18.4S,v28.s[3] +sub v1.4s, v11.4s, v16.4s +add v11.4s, v11.4s, v16.4s +sqrdmulh v16.4S, v6.4S, v27.s[3] +mul v6.4S, v6.4S,v28.s[3] +sub v4.4s, v20.4s, v12.4s +add v20.4s, v20.4s, v12.4s +mla v22.4S, v10.4S, v31.s[0] +mla v0.4S, v3.4S, v31.s[0] +sub v3.4s, v8.4s, v9.4s +mla v18.4S, v21.4S, v31.s[0] +mla v6.4S, v16.4S, v31.s[0] +add v8.4s, v8.4s, v9.4s +sqrdmulh v9.4S, v20.4S, v25.s[2] +mul v20.4S, v20.4S,v26.s[2] +sub v16.4s, v15.4s, v17.4s +sqrdmulh v21.4S, v4.4S, v25.s[3] +mul v4.4S, v4.4S,v26.s[3] +add v15.4s, v15.4s, v17.4s +sqrdmulh v17.4S, v16.4S, v25.s[1] +mul v16.4S, v16.4S,v26.s[1] +sub v10.4s, v14.4s, v22.4s +add v14.4s, v14.4s, v22.4s +sqrdmulh v22.4S, v15.4S, 
v25.s[0] +mul v15.4S, v15.4S,v26.s[0] +sub v12.4s, v2.4s, v0.4s +add v2.4s, v2.4s, v0.4s +mla v20.4S, v9.4S, v31.s[0] +mla v4.4S, v21.4S, v31.s[0] +sub v21.4s, v19.4s, v18.4s +mla v16.4S, v17.4S, v31.s[0] +mla v15.4S, v22.4S, v31.s[0] +add v19.4s, v19.4s, v18.4s +sqrdmulh v18.4S, v2.4S, v23.s[0] +mul v2.4S, v2.4S,v24.s[0] +sub v22.4s, v7.4s, v6.4s +sqrdmulh v17.4S, v12.4S, v23.s[1] +mul v12.4S, v12.4S,v24.s[1] +add v7.4s, v7.4s, v6.4s +sqrdmulh v6.4S, v7.4S, v23.s[2] +mul v7.4S, v7.4S,v24.s[2] +sub v9.4s, v11.4s, v20.4s +add v11.4s, v11.4s, v20.4s +sqrdmulh v20.4S, v22.4S, v23.s[3] +mul v22.4S, v22.4S,v24.s[3] +sub v0.4s, v1.4s, v4.4s +add v1.4s, v1.4s, v4.4s +mla v2.4S, v18.4S, v31.s[0] +mla v12.4S, v17.4S, v31.s[0] +sub v17.4s, v3.4s, v16.4s +str q11, [x0, #304] +mla v7.4S, v6.4S, v31.s[0] +mla v22.4S, v20.4S, v31.s[0] +add v3.4s, v3.4s, v16.4s +str q9, [x0, #368] +ldr q9, [x0, #896] +sqrdmulh v16.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +str q1, [x0, #432] +sub v1.4s, v8.4s, v15.4s +ldr q20, [x0, #960] +sqrdmulh v6.4S, v20.4S, v29.s[0] +mul v20.4S, v20.4S,v30.s[0] +str q0, [x0, #496] +add v8.4s, v8.4s, v15.4s +ldr q15, [x0, #768] +sqrdmulh v0.4S, v15.4S, v29.s[0] +mul v15.4S, v15.4S,v30.s[0] +sub v11.4s, v14.4s, v2.4s +add v14.4s, v14.4s, v2.4s +ldr q2, [x0, #832] +sqrdmulh v18.4S, v2.4S, v29.s[0] +mul v2.4S, v2.4S,v30.s[0] +sub v4.4s, v10.4s, v12.4s +add v10.4s, v10.4s, v12.4s +mla v9.4S, v16.4S, v31.s[0] +mla v20.4S, v6.4S, v31.s[0] +str q3, [x0, #176] +sub v3.4s, v19.4s, v7.4s +mla v15.4S, v0.4S, v31.s[0] +mla v2.4S, v18.4S, v31.s[0] +str q17, [x0, #240] +add v19.4s, v19.4s, v7.4s +ldr q7, [x0, #512] +sqrdmulh v17.4S, v7.4S, v29.s[0] +mul v7.4S, v7.4S,v30.s[0] +str q8, [x0, #48] +sub v8.4s, v21.4s, v22.4s +ldr q18, [x0, #576] +sqrdmulh v0.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +str q1, [x0, #112] +add v21.4s, v21.4s, v22.4s +ldr q22, [x0, #640] +ldr q1, [x0, #384] +sqrdmulh v6.4S, v22.4S, v29.s[0] +mul v22.4S, v22.4S,v30.s[0] +sub v16.4s, 
v1.4s, v9.4s +add v1.4s, v1.4s, v9.4s +ldr q9, [x0, #704] +ldr q12, [x0, #448] +sqrdmulh v5.4S, v9.4S, v29.s[0] +mul v9.4S, v9.4S,v30.s[0] +sub v13.4s, v12.4s, v20.4s +add v12.4s, v12.4s, v20.4s +ldr q20, [x0, #256] +mla v7.4S, v17.4S, v31.s[0] +mla v18.4S, v0.4S, v31.s[0] +str q14, [x0, #560] +sub v14.4s, v20.4s, v15.4s +mla v22.4S, v6.4S, v31.s[0] +mla v9.4S, v5.4S, v31.s[0] +str q11, [x0, #624] +add v20.4s, v20.4s, v15.4s +ldr q15, [x0, #320] +sqrdmulh v11.4S, v1.4S, v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +str q10, [x0, #688] +sub v10.4s, v15.4s, v2.4s +sqrdmulh v5.4S, v12.4S, v29.s[1] +mul v12.4S, v12.4S,v30.s[1] +str q4, [x0, #752] +add v15.4s, v15.4s, v2.4s +ldr q2, [x0, #0] +sqrdmulh v4.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +sub v6.4s, v2.4s, v7.4s +add v2.4s, v2.4s, v7.4s +ldr q7, [x0, #64] +sqrdmulh v0.4S, v15.4S, v29.s[1] +mul v15.4S, v15.4S,v30.s[1] +sub v17.4s, v7.4s, v18.4s +add v7.4s, v7.4s, v18.4s +ldr q18, [x0, #128] +mla v1.4S, v11.4S, v31.s[0] +mla v12.4S, v5.4S, v31.s[0] +str q19, [x0, #816] +sub v19.4s, v18.4s, v22.4s +mla v20.4S, v4.4S, v31.s[0] +mla v15.4S, v0.4S, v31.s[0] +str q3, [x0, #880] +add v18.4s, v18.4s, v22.4s +ldr q22, [x0, #192] +sqrdmulh v3.4S, v16.4S, v29.s[2] +mul v16.4S, v16.4S,v30.s[2] +str q21, [x0, #944] +sub v21.4s, v22.4s, v9.4s +sqrdmulh v0.4S, v13.4S, v29.s[2] +mul v13.4S, v13.4S,v30.s[2] +str q8, [x0, #1008] +add v22.4s, v22.4s, v9.4s +sqrdmulh v9.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v8.4s, v18.4s, v1.4s +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v10.4S, v29.s[2] +mul v10.4S, v10.4S,v30.s[2] +sub v4.4s, v22.4s, v12.4s +add v22.4s, v22.4s, v12.4s +mla v16.4S, v3.4S, v31.s[0] +mla v13.4S, v0.4S, v31.s[0] +sub v0.4s, v2.4s, v20.4s +mla v14.4S, v9.4S, v31.s[0] +mla v10.4S, v1.4S, v31.s[0] +add v2.4s, v2.4s, v20.4s +sqrdmulh v20.4S, v8.4S, v27.s[1] +mul v8.4S, v8.4S,v28.s[1] +sub v1.4s, v7.4s, v15.4s +sqrdmulh v9.4S, v4.4S, v27.s[1] +mul v4.4S, v4.4S,v28.s[1] +add v7.4s, v7.4s, v15.4s +sqrdmulh 
v15.4S, v18.4S, v27.s[0] +mul v18.4S, v18.4S,v28.s[0] +sub v3.4s, v19.4s, v16.4s +add v19.4s, v19.4s, v16.4s +sqrdmulh v16.4S, v22.4S, v27.s[0] +mul v22.4S, v22.4S,v28.s[0] +sub v12.4s, v21.4s, v13.4s +add v21.4s, v21.4s, v13.4s +mla v8.4S, v20.4S, v31.s[0] +mla v4.4S, v9.4S, v31.s[0] +sub v9.4s, v6.4s, v14.4s +mla v18.4S, v15.4S, v31.s[0] +mla v22.4S, v16.4S, v31.s[0] +add v6.4s, v6.4s, v14.4s +sqrdmulh v14.4S, v19.4S, v27.s[2] +mul v19.4S, v19.4S,v28.s[2] +sub v16.4s, v17.4s, v10.4s +sqrdmulh v15.4S, v21.4S, v27.s[2] +mul v21.4S, v21.4S,v28.s[2] +add v17.4s, v17.4s, v10.4s +sqrdmulh v10.4S, v3.4S, v27.s[3] +mul v3.4S, v3.4S,v28.s[3] +sub v20.4s, v0.4s, v8.4s +add v0.4s, v0.4s, v8.4s +sqrdmulh v8.4S, v12.4S, v27.s[3] +mul v12.4S, v12.4S,v28.s[3] +sub v13.4s, v1.4s, v4.4s +add v1.4s, v1.4s, v4.4s +mla v19.4S, v14.4S, v31.s[0] +mla v21.4S, v15.4S, v31.s[0] +sub v15.4s, v2.4s, v18.4s +mla v3.4S, v10.4S, v31.s[0] +mla v12.4S, v8.4S, v31.s[0] +add v2.4s, v2.4s, v18.4s +sqrdmulh v18.4S, v1.4S, v25.s[2] +mul v1.4S, v1.4S,v26.s[2] +sub v8.4s, v7.4s, v22.4s +sqrdmulh v10.4S, v13.4S, v25.s[3] +mul v13.4S, v13.4S,v26.s[3] +add v7.4s, v7.4s, v22.4s +sqrdmulh v22.4S, v8.4S, v25.s[1] +mul v8.4S, v8.4S,v26.s[1] +sub v14.4s, v6.4s, v19.4s +add v6.4s, v6.4s, v19.4s +sqrdmulh v19.4S, v7.4S, v25.s[0] +mul v7.4S, v7.4S,v26.s[0] +sub v4.4s, v17.4s, v21.4s +add v17.4s, v17.4s, v21.4s +mla v1.4S, v18.4S, v31.s[0] +mla v13.4S, v10.4S, v31.s[0] +sub v10.4s, v9.4s, v3.4s +mla v8.4S, v22.4S, v31.s[0] +mla v7.4S, v19.4S, v31.s[0] +add v9.4s, v9.4s, v3.4s +sqrdmulh v3.4S, v17.4S, v23.s[0] +mul v17.4S, v17.4S,v24.s[0] +sub v19.4s, v16.4s, v12.4s +sqrdmulh v22.4S, v4.4S, v23.s[1] +mul v4.4S, v4.4S,v24.s[1] +add v16.4s, v16.4s, v12.4s +sqrdmulh v12.4S, v16.4S, v23.s[2] +mul v16.4S, v16.4S,v24.s[2] +sub v18.4s, v0.4s, v1.4s +add v0.4s, v0.4s, v1.4s +sqrdmulh v1.4S, v19.4S, v23.s[3] +mul v19.4S, v19.4S,v24.s[3] +sub v21.4s, v20.4s, v13.4s +add v20.4s, v20.4s, v13.4s +mla v17.4S, v3.4S, v31.s[0] 
+mla v4.4S, v22.4S, v31.s[0] +sub v22.4s, v15.4s, v8.4s +str q0, [x0, #256] +mla v16.4S, v12.4S, v31.s[0] +mla v19.4S, v1.4S, v31.s[0] +add v15.4s, v15.4s, v8.4s +str q18, [x0, #320] +ldr q18, [x0, #912] +sqrdmulh v8.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +str q20, [x0, #384] +sub v20.4s, v2.4s, v7.4s +ldr q1, [x0, #976] +sqrdmulh v12.4S, v1.4S, v29.s[0] +mul v1.4S, v1.4S,v30.s[0] +str q21, [x0, #448] +add v2.4s, v2.4s, v7.4s +ldr q7, [x0, #784] +sqrdmulh v21.4S, v7.4S, v29.s[0] +mul v7.4S, v7.4S,v30.s[0] +sub v0.4s, v6.4s, v17.4s +add v6.4s, v6.4s, v17.4s +ldr q17, [x0, #848] +sqrdmulh v3.4S, v17.4S, v29.s[0] +mul v17.4S, v17.4S,v30.s[0] +sub v13.4s, v14.4s, v4.4s +add v14.4s, v14.4s, v4.4s +mla v18.4S, v8.4S, v31.s[0] +mla v1.4S, v12.4S, v31.s[0] +str q15, [x0, #128] +sub v15.4s, v9.4s, v16.4s +mla v7.4S, v21.4S, v31.s[0] +mla v17.4S, v3.4S, v31.s[0] +str q22, [x0, #192] +add v9.4s, v9.4s, v16.4s +ldr q16, [x0, #528] +sqrdmulh v22.4S, v16.4S, v29.s[0] +mul v16.4S, v16.4S,v30.s[0] +str q2, [x0, #0] +sub v2.4s, v10.4s, v19.4s +ldr q3, [x0, #592] +sqrdmulh v21.4S, v3.4S, v29.s[0] +mul v3.4S, v3.4S,v30.s[0] +str q20, [x0, #64] +add v10.4s, v10.4s, v19.4s +ldr q19, [x0, #656] +ldr q20, [x0, #400] +sqrdmulh v12.4S, v19.4S, v29.s[0] +mul v19.4S, v19.4S,v30.s[0] +sub v8.4s, v20.4s, v18.4s +add v20.4s, v20.4s, v18.4s +ldr q18, [x0, #720] +ldr q4, [x0, #464] +sqrdmulh v5.4S, v18.4S, v29.s[0] +mul v18.4S, v18.4S,v30.s[0] +sub v11.4s, v4.4s, v1.4s +add v4.4s, v4.4s, v1.4s +ldr q1, [x0, #272] +mla v16.4S, v22.4S, v31.s[0] +mla v3.4S, v21.4S, v31.s[0] +str q6, [x0, #512] +sub v6.4s, v1.4s, v7.4s +mla v19.4S, v12.4S, v31.s[0] +mla v18.4S, v5.4S, v31.s[0] +str q0, [x0, #576] +add v1.4s, v1.4s, v7.4s +ldr q7, [x0, #336] +sqrdmulh v0.4S, v20.4S, v29.s[1] +mul v20.4S, v20.4S,v30.s[1] +str q14, [x0, #640] +sub v14.4s, v7.4s, v17.4s +sqrdmulh v5.4S, v4.4S, v29.s[1] +mul v4.4S, v4.4S,v30.s[1] +str q13, [x0, #704] +add v7.4s, v7.4s, v17.4s +ldr q17, [x0, #16] +sqrdmulh 
v13.4S, v1.4S, v29.s[1] +mul v1.4S, v1.4S,v30.s[1] +sub v12.4s, v17.4s, v16.4s +add v17.4s, v17.4s, v16.4s +ldr q16, [x0, #80] +sqrdmulh v21.4S, v7.4S, v29.s[1] +mul v7.4S, v7.4S,v30.s[1] +sub v22.4s, v16.4s, v3.4s +add v16.4s, v16.4s, v3.4s +ldr q3, [x0, #144] +mla v20.4S, v0.4S, v31.s[0] +mla v4.4S, v5.4S, v31.s[0] +str q9, [x0, #768] +sub v9.4s, v3.4s, v19.4s +mla v1.4S, v13.4S, v31.s[0] +mla v7.4S, v21.4S, v31.s[0] +str q15, [x0, #832] +add v3.4s, v3.4s, v19.4s +ldr q19, [x0, #208] +sqrdmulh v15.4S, v8.4S, v29.s[2] +mul v8.4S, v8.4S,v30.s[2] +str q10, [x0, #896] +sub v10.4s, v19.4s, v18.4s +sqrdmulh v21.4S, v11.4S, v29.s[2] +mul v11.4S, v11.4S,v30.s[2] +str q2, [x0, #960] +add v19.4s, v19.4s, v18.4s +sqrdmulh v18.4S, v6.4S, v29.s[2] +mul v6.4S, v6.4S,v30.s[2] +sub v2.4s, v3.4s, v20.4s +add v3.4s, v3.4s, v20.4s +sqrdmulh v20.4S, v14.4S, v29.s[2] +mul v14.4S, v14.4S,v30.s[2] +sub v13.4s, v19.4s, v4.4s +add v19.4s, v19.4s, v4.4s +mla v8.4S, v15.4S, v31.s[0] +mla v11.4S, v21.4S, v31.s[0] +sub v21.4s, v17.4s, v1.4s +mla v6.4S, v18.4S, v31.s[0] +mla v14.4S, v20.4S, v31.s[0] +add v17.4s, v17.4s, v1.4s +sqrdmulh v1.4S, v2.4S, v27.s[1] +mul v2.4S, v2.4S,v28.s[1] +sub v20.4s, v16.4s, v7.4s +sqrdmulh v18.4S, v13.4S, v27.s[1] +mul v13.4S, v13.4S,v28.s[1] +add v16.4s, v16.4s, v7.4s +sqrdmulh v7.4S, v3.4S, v27.s[0] +mul v3.4S, v3.4S,v28.s[0] +sub v15.4s, v9.4s, v8.4s +add v9.4s, v9.4s, v8.4s +sqrdmulh v8.4S, v19.4S, v27.s[0] +mul v19.4S, v19.4S,v28.s[0] +sub v4.4s, v10.4s, v11.4s +add v10.4s, v10.4s, v11.4s +mla v2.4S, v1.4S, v31.s[0] +mla v13.4S, v18.4S, v31.s[0] +sub v18.4s, v12.4s, v6.4s +mla v3.4S, v7.4S, v31.s[0] +mla v19.4S, v8.4S, v31.s[0] +add v12.4s, v12.4s, v6.4s +sqrdmulh v6.4S, v9.4S, v27.s[2] +mul v9.4S, v9.4S,v28.s[2] +sub v8.4s, v22.4s, v14.4s +sqrdmulh v7.4S, v10.4S, v27.s[2] +mul v10.4S, v10.4S,v28.s[2] +add v22.4s, v22.4s, v14.4s +sqrdmulh v14.4S, v15.4S, v27.s[3] +mul v15.4S, v15.4S,v28.s[3] +sub v1.4s, v21.4s, v2.4s +add v21.4s, v21.4s, v2.4s +sqrdmulh 
v2.4S, v4.4S, v27.s[3] +mul v4.4S, v4.4S,v28.s[3] +sub v11.4s, v20.4s, v13.4s +add v20.4s, v20.4s, v13.4s +mla v9.4S, v6.4S, v31.s[0] +mla v10.4S, v7.4S, v31.s[0] +sub v7.4s, v17.4s, v3.4s +mla v15.4S, v14.4S, v31.s[0] +mla v4.4S, v2.4S, v31.s[0] +add v17.4s, v17.4s, v3.4s +sqrdmulh v3.4S, v20.4S, v25.s[2] +mul v20.4S, v20.4S,v26.s[2] +sub v2.4s, v16.4s, v19.4s +sqrdmulh v14.4S, v11.4S, v25.s[3] +mul v11.4S, v11.4S,v26.s[3] +add v16.4s, v16.4s, v19.4s +sqrdmulh v19.4S, v2.4S, v25.s[1] +mul v2.4S, v2.4S,v26.s[1] +sub v6.4s, v12.4s, v9.4s +add v12.4s, v12.4s, v9.4s +sqrdmulh v9.4S, v16.4S, v25.s[0] +mul v16.4S, v16.4S,v26.s[0] +sub v13.4s, v22.4s, v10.4s +add v22.4s, v22.4s, v10.4s +mla v20.4S, v3.4S, v31.s[0] +mla v11.4S, v14.4S, v31.s[0] +sub v14.4s, v18.4s, v15.4s +mla v2.4S, v19.4S, v31.s[0] +mla v16.4S, v9.4S, v31.s[0] +add v18.4s, v18.4s, v15.4s +sqrdmulh v15.4S, v22.4S, v23.s[0] +mul v22.4S, v22.4S,v24.s[0] +sub v9.4s, v8.4s, v4.4s +sqrdmulh v19.4S, v13.4S, v23.s[1] +mul v13.4S, v13.4S,v24.s[1] +add v8.4s, v8.4s, v4.4s +sqrdmulh v4.4S, v8.4S, v23.s[2] +mul v8.4S, v8.4S,v24.s[2] +sub v3.4s, v21.4s, v20.4s +add v21.4s, v21.4s, v20.4s +sqrdmulh v20.4S, v9.4S, v23.s[3] +mul v9.4S, v9.4S,v24.s[3] +sub v10.4s, v1.4s, v11.4s +add v1.4s, v1.4s, v11.4s +mla v22.4S, v15.4S, v31.s[0] +mla v13.4S, v19.4S, v31.s[0] +sub v19.4s, v7.4s, v2.4s +str q21, [x0, #272] +mla v8.4S, v4.4S, v31.s[0] +mla v9.4S, v20.4S, v31.s[0] +add v7.4s, v7.4s, v2.4s +str q3, [x0, #336] +str q1, [x0, #400] +sub v1.4s, v17.4s, v16.4s +str q10, [x0, #464] +add v17.4s, v17.4s, v16.4s +sub v16.4s, v12.4s, v22.4s +add v12.4s, v12.4s, v22.4s +sub v22.4s, v6.4s, v13.4s +add v6.4s, v6.4s, v13.4s +str q7, [x0, #144] +sub v7.4s, v18.4s, v8.4s +str q19, [x0, #208] +add v18.4s, v18.4s, v8.4s +str q17, [x0, #16] +sub v17.4s, v14.4s, v9.4s +str q1, [x0, #80] +add v14.4s, v14.4s, v9.4s +str q12, [x0, #528] +str q16, [x0, #592] +str q6, [x0, #656] +str q22, [x0, #720] +str q18, [x0, #784] +str q7, [x0, #848] +str 
q14, [x0, #912] +str q17, [x0, #976] +ldr q0, [x0, #224] +ldr q5, [x0, #160] +ldr q11, [x0, #32] +ldr q15, [x17, #+128] +ldr q21, [x17, #+144] +sqrdmulh v4.4S, v11.4S, v21.s[0] +mul v11.4S, v11.4S,v15.s[0] +ldr q20, [x0, #48] +sqrdmulh v2.4S, v20.4S, v21.s[0] +mul v20.4S, v20.4S,v15.s[0] +ldr q3, [x17, #+160] +ldr q30, [x17, #+176] +ldr q29, [x0, #96] +sqrdmulh v28.4S, v29.4S, v30.s[0] +mul v29.4S, v29.4S,v3.s[0] +ldr q27, [x0, #112] +sqrdmulh v26.4S, v27.4S, v30.s[0] +mul v27.4S, v27.4S,v3.s[0] +ldr q25, [x17, #+192] +ldr q24, [x17, #+208] +mla v11.4S, v4.4S, v31.s[0] +sqrdmulh v4.4S, v5.4S, v24.s[0] +ldr q23, [x0, #176] +mla v20.4S, v2.4S, v31.s[0] +sqrdmulh v2.4S, v23.4S, v24.s[0] +ldr q10, [x17, #+224] +ldr q13, [x17, #+240] +mla v29.4S, v28.4S, v31.s[0] +sqrdmulh v28.4S, v0.4S, v13.s[0] +ldr q19, [x0, #240] +mla v27.4S, v26.4S, v31.s[0] +sqrdmulh v26.4S, v19.4S, v13.s[0] +ldr q8, [x0, #0] +ldr q1, [x0, #128] +mul v5.4S, v5.4S,v25.s[0] +sub v9.4s, v8.4s, v11.4s +ldr q12, [x0, #16] +mul v23.4S, v23.4S,v25.s[0] +add v8.4s, v8.4s, v11.4s +ldr q11, [x0, #144] +mla v5.4S, v4.4S, v31.s[0] +sub v4.4s, v12.4s, v20.4s +ldr q16, [x0, #64] +mla v23.4S, v2.4S, v31.s[0] +add v12.4s, v12.4s, v20.4s +ldr q20, [x0, #192] +mul v0.4S, v0.4S,v10.s[0] +sub v2.4s, v16.4s, v29.4s +ldr q6, [x0, #80] +mul v19.4S, v19.4S,v10.s[0] +add v16.4s, v16.4s, v29.4s +ldr q29, [x0, #208] +mla v0.4S, v28.4S, v31.s[0] +mla v19.4S, v26.4S, v31.s[0] +sub v26.4s, v6.4s, v27.4s +sqrdmulh v28.4S, v12.4S, v21.s[1] +add v6.4s, v6.4s, v27.4s +mul v12.4S, v12.4S,v15.s[1] +sqrdmulh v27.4S, v4.4S, v21.s[2] +sub v22.4s, v1.4s, v5.4s +mul v4.4S, v4.4S,v15.s[2] +add v1.4s, v1.4s, v5.4s +sqrdmulh v21.4S, v6.4S, v30.s[1] +sub v15.4s, v11.4s, v23.4s +mul v6.4S, v6.4S,v3.s[1] +add v11.4s, v11.4s, v23.4s +sqrdmulh v23.4S, v26.4S, v30.s[2] +sub v5.4s, v20.4s, v0.4s +mul v26.4S, v26.4S,v3.s[2] +add v20.4s, v20.4s, v0.4s +mla v12.4S, v28.4S, v31.s[0] +sub v28.4s, v29.4s, v19.4s +ldr q30, [x0, #480] +sqrdmulh v3.4S, 
v11.4S, v24.s[1] +add v29.4s, v29.4s, v19.4s +mla v4.4S, v27.4S, v31.s[0] +ldr q27, [x0, #416] +sqrdmulh v19.4S, v15.4S, v24.s[2] +sub v0.4s, v8.4s, v12.4s +mla v6.4S, v21.4S, v31.s[0] +ldr q21, [x0, #288] +sqrdmulh v18.4S, v29.4S, v13.s[1] +add v8.4s, v8.4s, v12.4s +str q0, [x0, #16] +mla v26.4S, v23.4S, v31.s[0] +ldr q23, [x17, #+256] +ldr q0, [x17, #+272] +sqrdmulh v12.4S, v28.4S, v13.s[2] +sub v7.4s, v9.4s, v4.4s +str q8, [x0, #0] +mul v11.4S, v11.4S,v25.s[1] +add v9.4s, v9.4s, v4.4s +mul v15.4S, v15.4S,v25.s[2] +str q7, [x0, #48] +mla v11.4S, v3.4S, v31.s[0] +sub v3.4s, v16.4s, v6.4s +mla v15.4S, v19.4S, v31.s[0] +str q9, [x0, #32] +mul v29.4S, v29.4S,v10.s[1] +str q3, [x0, #80] +mul v28.4S, v28.4S,v10.s[2] +add v16.4s, v16.4s, v6.4s +str q16, [x0, #64] +mla v29.4S, v18.4S, v31.s[0] +sub v18.4s, v2.4s, v26.4s +str q18, [x0, #112] +mla v28.4S, v12.4S, v31.s[0] +add v2.4s, v2.4s, v26.4s +str q2, [x0, #96] +sqrdmulh v13.4S, v21.4S, v0.s[0] +sub v10.4s, v1.4s, v11.4s +mul v21.4S, v21.4S,v23.s[0] +str q10, [x0, #144] +ldr q10, [x0, #304] +sqrdmulh v2.4S, v10.4S, v0.s[0] +add v1.4s, v1.4s, v11.4s +mul v10.4S, v10.4S,v23.s[0] +str q1, [x0, #128] +ldr q1, [x17, #+288] +ldr q11, [x17, #+304] +ldr q26, [x0, #352] +sqrdmulh v12.4S, v26.4S, v11.s[0] +sub v18.4s, v22.4s, v15.4s +mul v26.4S, v26.4S,v1.s[0] +str q18, [x0, #176] +ldr q18, [x0, #368] +sqrdmulh v16.4S, v18.4S, v11.s[0] +add v22.4s, v22.4s, v15.4s +mul v18.4S, v18.4S,v1.s[0] +str q22, [x0, #160] +ldr q22, [x17, #+320] +ldr q15, [x17, #+336] +mla v21.4S, v13.4S, v31.s[0] +sub v13.4s, v20.4s, v29.4s +sqrdmulh v6.4S, v27.4S, v15.s[0] +str q13, [x0, #208] +ldr q13, [x0, #432] +mla v10.4S, v2.4S, v31.s[0] +add v20.4s, v20.4s, v29.4s +sqrdmulh v29.4S, v13.4S, v15.s[0] +str q20, [x0, #192] +ldr q20, [x17, #+352] +ldr q2, [x17, #+368] +mla v26.4S, v12.4S, v31.s[0] +sub v12.4s, v5.4s, v28.4s +sqrdmulh v3.4S, v30.4S, v2.s[0] +str q12, [x0, #240] +ldr q12, [x0, #496] +mla v18.4S, v16.4S, v31.s[0] +add v5.4s, v5.4s, v28.4s 
+sqrdmulh v28.4S, v12.4S, v2.s[0] +str q5, [x0, #224] +ldr q5, [x0, #256] +ldr q16, [x0, #384] +mul v27.4S, v27.4S,v22.s[0] +sub v24.4s, v5.4s, v21.4s +ldr q25, [x0, #272] +mul v13.4S, v13.4S,v22.s[0] +add v5.4s, v5.4s, v21.4s +ldr q21, [x0, #400] +mla v27.4S, v6.4S, v31.s[0] +sub v6.4s, v25.4s, v10.4s +ldr q9, [x0, #320] +mla v13.4S, v29.4S, v31.s[0] +add v25.4s, v25.4s, v10.4s +ldr q10, [x0, #448] +mul v30.4S, v30.4S,v20.s[0] +sub v29.4s, v9.4s, v26.4s +ldr q19, [x0, #336] +mul v12.4S, v12.4S,v20.s[0] +add v9.4s, v9.4s, v26.4s +ldr q26, [x0, #464] +mla v30.4S, v3.4S, v31.s[0] +mla v12.4S, v28.4S, v31.s[0] +sub v28.4s, v19.4s, v18.4s +sqrdmulh v3.4S, v25.4S, v0.s[1] +add v19.4s, v19.4s, v18.4s +mul v25.4S, v25.4S,v23.s[1] +sqrdmulh v18.4S, v6.4S, v0.s[2] +sub v7.4s, v16.4s, v27.4s +mul v6.4S, v6.4S,v23.s[2] +add v16.4s, v16.4s, v27.4s +sqrdmulh v0.4S, v19.4S, v11.s[1] +sub v23.4s, v21.4s, v13.4s +mul v19.4S, v19.4S,v1.s[1] +add v21.4s, v21.4s, v13.4s +sqrdmulh v13.4S, v28.4S, v11.s[2] +sub v27.4s, v10.4s, v30.4s +mul v28.4S, v28.4S,v1.s[2] +add v10.4s, v10.4s, v30.4s +mla v25.4S, v3.4S, v31.s[0] +sub v3.4s, v26.4s, v12.4s +ldr q11, [x0, #736] +sqrdmulh v1.4S, v21.4S, v15.s[1] +add v26.4s, v26.4s, v12.4s +mla v6.4S, v18.4S, v31.s[0] +ldr q18, [x0, #672] +sqrdmulh v12.4S, v23.4S, v15.s[2] +sub v30.4s, v5.4s, v25.4s +mla v19.4S, v0.4S, v31.s[0] +ldr q0, [x0, #544] +sqrdmulh v4.4S, v26.4S, v2.s[1] +add v5.4s, v5.4s, v25.4s +str q30, [x0, #272] +mla v28.4S, v13.4S, v31.s[0] +ldr q13, [x17, #+384] +ldr q30, [x17, #+400] +sqrdmulh v25.4S, v3.4S, v2.s[2] +sub v8.4s, v24.4s, v6.4s +str q5, [x0, #256] +mul v21.4S, v21.4S,v22.s[1] +add v24.4s, v24.4s, v6.4s +mul v23.4S, v23.4S,v22.s[2] +str q8, [x0, #304] +mla v21.4S, v1.4S, v31.s[0] +sub v1.4s, v9.4s, v19.4s +mla v23.4S, v12.4S, v31.s[0] +str q24, [x0, #288] +mul v26.4S, v26.4S,v20.s[1] +str q1, [x0, #336] +mul v3.4S, v3.4S,v20.s[2] +add v9.4s, v9.4s, v19.4s +str q9, [x0, #320] +mla v26.4S, v4.4S, v31.s[0] +sub v4.4s, 
v29.4s, v28.4s +str q4, [x0, #368] +mla v3.4S, v25.4S, v31.s[0] +add v29.4s, v29.4s, v28.4s +str q29, [x0, #352] +sqrdmulh v2.4S, v0.4S, v30.s[0] +sub v20.4s, v16.4s, v21.4s +mul v0.4S, v0.4S,v13.s[0] +str q20, [x0, #400] +ldr q20, [x0, #560] +sqrdmulh v29.4S, v20.4S, v30.s[0] +add v16.4s, v16.4s, v21.4s +mul v20.4S, v20.4S,v13.s[0] +str q16, [x0, #384] +ldr q16, [x17, #+416] +ldr q21, [x17, #+432] +ldr q28, [x0, #608] +sqrdmulh v25.4S, v28.4S, v21.s[0] +sub v4.4s, v7.4s, v23.4s +mul v28.4S, v28.4S,v16.s[0] +str q4, [x0, #432] +ldr q4, [x0, #624] +sqrdmulh v9.4S, v4.4S, v21.s[0] +add v7.4s, v7.4s, v23.4s +mul v4.4S, v4.4S,v16.s[0] +str q7, [x0, #416] +ldr q7, [x17, #+448] +ldr q23, [x17, #+464] +mla v0.4S, v2.4S, v31.s[0] +sub v2.4s, v10.4s, v26.4s +sqrdmulh v19.4S, v18.4S, v23.s[0] +str q2, [x0, #464] +ldr q2, [x0, #688] +mla v20.4S, v29.4S, v31.s[0] +add v10.4s, v10.4s, v26.4s +sqrdmulh v26.4S, v2.4S, v23.s[0] +str q10, [x0, #448] +ldr q10, [x17, #+480] +ldr q29, [x17, #+496] +mla v28.4S, v25.4S, v31.s[0] +sub v25.4s, v27.4s, v3.4s +sqrdmulh v1.4S, v11.4S, v29.s[0] +str q25, [x0, #496] +ldr q25, [x0, #752] +mla v4.4S, v9.4S, v31.s[0] +add v27.4s, v27.4s, v3.4s +sqrdmulh v3.4S, v25.4S, v29.s[0] +str q27, [x0, #480] +ldr q27, [x0, #512] +ldr q9, [x0, #640] +mul v18.4S, v18.4S,v7.s[0] +sub v15.4s, v27.4s, v0.4s +ldr q22, [x0, #528] +mul v2.4S, v2.4S,v7.s[0] +add v27.4s, v27.4s, v0.4s +ldr q0, [x0, #656] +mla v18.4S, v19.4S, v31.s[0] +sub v19.4s, v22.4s, v20.4s +ldr q24, [x0, #576] +mla v2.4S, v26.4S, v31.s[0] +add v22.4s, v22.4s, v20.4s +ldr q20, [x0, #704] +mul v11.4S, v11.4S,v10.s[0] +sub v26.4s, v24.4s, v28.4s +ldr q12, [x0, #592] +mul v25.4S, v25.4S,v10.s[0] +add v24.4s, v24.4s, v28.4s +ldr q28, [x0, #720] +mla v11.4S, v1.4S, v31.s[0] +mla v25.4S, v3.4S, v31.s[0] +sub v3.4s, v12.4s, v4.4s +sqrdmulh v1.4S, v22.4S, v30.s[1] +add v12.4s, v12.4s, v4.4s +mul v22.4S, v22.4S,v13.s[1] +sqrdmulh v4.4S, v19.4S, v30.s[2] +sub v8.4s, v9.4s, v18.4s +mul v19.4S, 
v19.4S,v13.s[2] +add v9.4s, v9.4s, v18.4s +sqrdmulh v30.4S, v12.4S, v21.s[1] +sub v13.4s, v0.4s, v2.4s +mul v12.4S, v12.4S,v16.s[1] +add v0.4s, v0.4s, v2.4s +sqrdmulh v2.4S, v3.4S, v21.s[2] +sub v18.4s, v20.4s, v11.4s +mul v3.4S, v3.4S,v16.s[2] +add v20.4s, v20.4s, v11.4s +mla v22.4S, v1.4S, v31.s[0] +sub v1.4s, v28.4s, v25.4s +ldr q21, [x0, #992] +sqrdmulh v16.4S, v0.4S, v23.s[1] +add v28.4s, v28.4s, v25.4s +mla v19.4S, v4.4S, v31.s[0] +ldr q4, [x0, #928] +sqrdmulh v25.4S, v13.4S, v23.s[2] +sub v11.4s, v27.4s, v22.4s +mla v12.4S, v30.4S, v31.s[0] +ldr q30, [x0, #800] +sqrdmulh v6.4S, v28.4S, v29.s[1] +add v27.4s, v27.4s, v22.4s +str q11, [x0, #528] +mla v3.4S, v2.4S, v31.s[0] +ldr q2, [x17, #+512] +ldr q11, [x17, #+528] +sqrdmulh v22.4S, v1.4S, v29.s[2] +sub v5.4s, v15.4s, v19.4s +str q27, [x0, #512] +mul v0.4S, v0.4S,v7.s[1] +add v15.4s, v15.4s, v19.4s +mul v13.4S, v13.4S,v7.s[2] +str q5, [x0, #560] +mla v0.4S, v16.4S, v31.s[0] +sub v16.4s, v24.4s, v12.4s +mla v13.4S, v25.4S, v31.s[0] +str q15, [x0, #544] +mul v28.4S, v28.4S,v10.s[1] +str q16, [x0, #592] +mul v1.4S, v1.4S,v10.s[2] +add v24.4s, v24.4s, v12.4s +str q24, [x0, #576] +mla v28.4S, v6.4S, v31.s[0] +sub v6.4s, v26.4s, v3.4s +str q6, [x0, #624] +mla v1.4S, v22.4S, v31.s[0] +add v26.4s, v26.4s, v3.4s +str q26, [x0, #608] +sqrdmulh v29.4S, v30.4S, v11.s[0] +sub v10.4s, v9.4s, v0.4s +mul v30.4S, v30.4S,v2.s[0] +str q10, [x0, #656] +ldr q10, [x0, #816] +sqrdmulh v26.4S, v10.4S, v11.s[0] +add v9.4s, v9.4s, v0.4s +mul v10.4S, v10.4S,v2.s[0] +str q9, [x0, #640] +ldr q9, [x17, #+544] +ldr q0, [x17, #+560] +ldr q3, [x0, #864] +sqrdmulh v22.4S, v3.4S, v0.s[0] +sub v6.4s, v8.4s, v13.4s +mul v3.4S, v3.4S,v9.s[0] +str q6, [x0, #688] +ldr q6, [x0, #880] +sqrdmulh v24.4S, v6.4S, v0.s[0] +add v8.4s, v8.4s, v13.4s +mul v6.4S, v6.4S,v9.s[0] +str q8, [x0, #672] +ldr q8, [x17, #+576] +ldr q13, [x17, #+592] +mla v30.4S, v29.4S, v31.s[0] +sub v29.4s, v20.4s, v28.4s +sqrdmulh v12.4S, v4.4S, v13.s[0] +str q29, [x0, #720] +ldr 
q29, [x0, #944] +mla v10.4S, v26.4S, v31.s[0] +add v20.4s, v20.4s, v28.4s +sqrdmulh v28.4S, v29.4S, v13.s[0] +str q20, [x0, #704] +ldr q20, [x17, #+608] +ldr q26, [x17, #+624] +mla v3.4S, v22.4S, v31.s[0] +sub v22.4s, v18.4s, v1.4s +sqrdmulh v16.4S, v21.4S, v26.s[0] +str q22, [x0, #752] +ldr q22, [x0, #1008] +mla v6.4S, v24.4S, v31.s[0] +add v18.4s, v18.4s, v1.4s +sqrdmulh v1.4S, v22.4S, v26.s[0] +str q18, [x0, #736] +ldr q18, [x0, #768] +ldr q24, [x0, #896] +mul v4.4S, v4.4S,v8.s[0] +sub v23.4s, v18.4s, v30.4s +ldr q7, [x0, #784] +mul v29.4S, v29.4S,v8.s[0] +add v18.4s, v18.4s, v30.4s +ldr q30, [x0, #912] +mla v4.4S, v12.4S, v31.s[0] +sub v12.4s, v7.4s, v10.4s +ldr q15, [x0, #832] +mla v29.4S, v28.4S, v31.s[0] +add v7.4s, v7.4s, v10.4s +ldr q10, [x0, #960] +mul v21.4S, v21.4S,v20.s[0] +sub v28.4s, v15.4s, v3.4s +ldr q25, [x0, #848] +mul v22.4S, v22.4S,v20.s[0] +add v15.4s, v15.4s, v3.4s +ldr q3, [x0, #976] +mla v21.4S, v16.4S, v31.s[0] +mla v22.4S, v1.4S, v31.s[0] +sub v1.4s, v25.4s, v6.4s +sqrdmulh v16.4S, v7.4S, v11.s[1] +add v25.4s, v25.4s, v6.4s +mul v7.4S, v7.4S,v2.s[1] +sqrdmulh v6.4S, v12.4S, v11.s[2] +sub v5.4s, v24.4s, v4.4s +mul v12.4S, v12.4S,v2.s[2] +add v24.4s, v24.4s, v4.4s +sqrdmulh v11.4S, v25.4S, v0.s[1] +sub v2.4s, v30.4s, v29.4s +mul v25.4S, v25.4S,v9.s[1] +add v30.4s, v30.4s, v29.4s +sqrdmulh v29.4S, v1.4S, v0.s[2] +sub v4.4s, v10.4s, v21.4s +mul v1.4S, v1.4S,v9.s[2] +add v10.4s, v10.4s, v21.4s +mla v7.4S, v16.4S, v31.s[0] +sub v16.4s, v3.4s, v22.4s +sqrdmulh v0.4S, v30.4S, v13.s[1] +add v3.4s, v3.4s, v22.4s +mla v12.4S, v6.4S, v31.s[0] +sqrdmulh v6.4S, v2.4S, v13.s[2] +sub v22.4s, v18.4s, v7.4s +mla v25.4S, v11.4S, v31.s[0] +sqrdmulh v11.4S, v3.4S, v26.s[1] +add v18.4s, v18.4s, v7.4s +str q22, [x0, #784] +mla v1.4S, v29.4S, v31.s[0] +sqrdmulh v29.4S, v16.4S, v26.s[2] +sub v22.4s, v23.4s, v12.4s +str q18, [x0, #768] +mul v30.4S, v30.4S,v8.s[1] +add v23.4s, v23.4s, v12.4s +mul v2.4S, v2.4S,v8.s[2] +str q22, [x0, #816] +mla v30.4S, v0.4S, 
v31.s[0] +sub v0.4s, v15.4s, v25.4s +mla v2.4S, v6.4S, v31.s[0] +str q23, [x0, #800] +mul v3.4S, v3.4S,v20.s[1] +str q0, [x0, #848] +mul v16.4S, v16.4S,v20.s[2] +add v15.4s, v15.4s, v25.4s +str q15, [x0, #832] +mla v3.4S, v11.4S, v31.s[0] +sub v11.4s, v28.4s, v1.4s +str q11, [x0, #880] +mla v16.4S, v29.4S, v31.s[0] +add v28.4s, v28.4s, v1.4s +str q28, [x0, #864] +sub v26.4s, v24.4s, v30.4s +str q26, [x0, #912] +add v24.4s, v24.4s, v30.4s +str q24, [x0, #896] +sub v24.4s, v5.4s, v2.4s +str q24, [x0, #944] +add v5.4s, v5.4s, v2.4s +str q5, [x0, #928] +sub v5.4s, v10.4s, v3.4s +str q5, [x0, #976] +add v10.4s, v10.4s, v3.4s +str q10, [x0, #960] +sub v10.4s, v4.4s, v16.4s +str q10, [x0, #1008] +add v4.4s, v4.4s, v16.4s +str q4, [x0, #992] +// Restore NEON vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1464 +// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/main.c b/tests/ntt_neon/main.c new file mode 100755 index 0000000..c9e277a --- /dev/null +++ b/tests/ntt_neon/main.c @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2021 Arm Limited + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial 
portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include +#include + +#include + +#include "ntt.h" +/* Test driver entry point: calls enable_cyclecounter(), then the test_fwd_ntt_* routines selected at compile time (the incomplete-NTT variants when NTT_INCOMPLETE is defined, otherwise the full-NTT variants), and finally disable_cyclecounter(). Most incomplete variants are commented out; only the last call of each group is active. NOTE(review): the test_* and cycle-counter helpers are defined elsewhere; their behaviour is not visible here. */ +int main(void) +{ + enable_cyclecounter(); + +#if defined(NTT_INCOMPLETE) + /* test_fwd_ntt_incomplete_var_3_3_0(); */ + /* test_fwd_ntt_incomplete_var_3_3_1(); */ + /* test_fwd_ntt_incomplete_var_3_3_2(); */ + /* test_fwd_ntt_incomplete_var_3_3_3(); */ + /* test_fwd_ntt_incomplete_var_3_3_4(); */ + test_fwd_ntt_incomplete_var_3_3_5(); + + /* test_fwd_ntt_incomplete_var_4_2_0_0(); */ + /* test_fwd_ntt_incomplete_var_4_2_0_z4_0(); */ + /* test_fwd_ntt_incomplete_var_4_2_0_z4_16(); */ + /* test_fwd_ntt_incomplete_var_4_2_24_z4_0(); */ + /* test_fwd_ntt_incomplete_var_4_2_24_z4_16(); */ + + /* test_fwd_ntt_incomplete_var_4_2_3_z4_0(); */ + /* test_fwd_ntt_incomplete_var_4_2_3_z4_1(); */ + /* test_fwd_ntt_incomplete_var_4_2_3_z4_2(); */ + /* test_fwd_ntt_incomplete_var_4_2_3_z4_3(); */ + /* test_fwd_ntt_incomplete_var_4_2_3_z4_4(); */ + test_fwd_ntt_incomplete_var_4_2_3_z4_5(); + + /* test_fwd_ntt_incomplete_var_4_2_7_z4_0(); */ + /* test_fwd_ntt_incomplete_var_4_2_7_z4_1(); */ + /* test_fwd_ntt_incomplete_var_4_2_7_z4_2(); */ + /* test_fwd_ntt_incomplete_var_4_2_7_z4_3(); */ + /* test_fwd_ntt_incomplete_var_4_2_7_z4_4(); */ + /* test_fwd_ntt_incomplete_var_4_2_7_z4_5(); */ + /* test_fwd_ntt_incomplete_var_4_2_7_z4_6(); */ + /* test_fwd_ntt_incomplete_var_4_2_7_z4_7(); */ + /* test_fwd_ntt_incomplete_var_4_2_7_z4_8(); */ + /* test_fwd_ntt_incomplete_var_4_2_7_z4_9(); */ +
test_fwd_ntt_incomplete_var_4_2_7_z4_10(); + + /* test_fwd_ntt_incomplete_var_4_2_8_z4_7(); */ + /* test_fwd_ntt_incomplete_var_4_2_9_z4_7(); */ + /* test_fwd_ntt_incomplete_var_4_2_10_z4_7(); */ + /* test_fwd_ntt_incomplete_var_4_2_11_z4_7(); */ + /* test_fwd_ntt_incomplete_var_4_2_12_z4_7(); */ + /* test_fwd_ntt_incomplete_var_4_2_13_z4_7(); */ + /* test_fwd_ntt_incomplete_var_4_2_14_z4_7(); */ + /* test_fwd_ntt_incomplete_var_4_2_15_z4_7(); */ + /* test_fwd_ntt_incomplete_var_4_2_16_z4_7(); */ + /* test_fwd_ntt_incomplete_var_4_2_17_z4_7(); */ + /* test_fwd_ntt_incomplete_var_4_2_18_z4_7(); */ + /* test_fwd_ntt_incomplete_var_4_2_19_z4_7(); */ + /* test_fwd_ntt_incomplete_var_4_2_20_z4_7(); */ + /* test_fwd_ntt_incomplete_var_4_2_21_z4_7(); */ + test_fwd_ntt_incomplete_var_4_2_22_z4_7(); + + /* test_fwd_ntt_incomplete_var_4_2_22_z4_7(); */ + /* test_fwd_ntt_incomplete_var_4_2_22_z4_8(); */ + /* test_fwd_ntt_incomplete_var_4_2_22_z4_9(); */ + /* test_fwd_ntt_incomplete_var_4_2_22_z4_10(); */ + /* test_fwd_ntt_incomplete_var_4_2_22_z4_11(); */ + /* test_fwd_ntt_incomplete_var_4_2_22_z4_12(); */ + /* test_fwd_ntt_incomplete_var_4_2_22_z4_13(); */ + /* test_fwd_ntt_incomplete_var_4_2_22_z4_14(); */ + test_fwd_ntt_incomplete_var_4_2_22_z4_15(); +#else + test_fwd_ntt_full_var_4_4_0_0(); + test_fwd_ntt_full_var_4_4_1_0(); + test_fwd_ntt_full_var_4_4_2_0(); + test_fwd_ntt_full_var_4_4_3_0(); + test_fwd_ntt_full_var_4_4_4_0(); + test_fwd_ntt_full_var_4_4_5_0(); + test_fwd_ntt_full_var_4_4_6_0(); + test_fwd_ntt_full_var_4_4_7_0(); + test_fwd_ntt_full_var_4_4_8_0(); + test_fwd_ntt_full_var_4_4_9_0(); + test_fwd_ntt_full_var_4_4_10_0(); + test_fwd_ntt_full_var_4_4_3_z2_0(); + test_fwd_ntt_full_var_4_4_3_z2_1(); + test_fwd_ntt_full_var_4_4_3_z2_2(); + test_fwd_ntt_full_var_4_4_3_z2_3(); + test_fwd_ntt_full_var_4_4_3_z2_4(); + test_fwd_ntt_full_var_4_4_3_z2_5(); + test_fwd_ntt_full_var_4_4_3_z4_0(); + test_fwd_ntt_full_var_4_4_3_z4_1(); + test_fwd_ntt_full_var_4_4_3_z4_2(); +
test_fwd_ntt_full_var_4_4_3_z4_3(); + test_fwd_ntt_full_var_4_4_3_z4_4(); +#endif /* NTT_INCOMPLETE */ + disable_cyclecounter(); +} diff --git a/tests/ntt_neon/ntt.c b/tests/ntt_neon/ntt.c new file mode 100755 index 0000000..9431df3 --- /dev/null +++ b/tests/ntt_neon/ntt.c @@ -0,0 +1,662 @@ +/* + * Copyright (c) 2021 Arm Limited + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ * + */ + +#include +#include +#include +#include + +#include + +#include "ntt.h" +#include "misc.h" + +//#define CONFIG_TEST_NTT_VERBOSE + +const int32_t mod_q32 = MODULUS_Q32; +const uint32_t mod_q32_inv_u32 = MODULUS_Q32_INV_U32; +const uint32_t mod_q32_inv_u32_neg = MODULUS_Q32_INV_U32_NEG; +const int32_t mod_q32_root = MODULUS_Q32_BASE_ROOT; + +ALIGN(16) int32_t root_base[2*NTT_SIZE] = { 0 }; +ALIGN(16) uint32_t root_base_twisted[2*NTT_SIZE] = { 0 }; +ALIGN(16) int32_t inv_root_base[2*NTT_SIZE] = { 0 }; +ALIGN(16) uint32_t inv_root_base_twisted[2*NTT_SIZE] = { 0 }; + +int32_t roots [NTT_SIZE] = { 0 }; +uint32_t roots_twisted[NTT_SIZE] = { 0 }; + +void mul_q_u32( int32_t *src, int64_t c, size_t size ) +{ + for( unsigned idx = 0; idx < size; idx++ ) + { + int64_t tmp; + src[idx] = src[idx] % mod_q32; + if( src[idx] < 0 ) + src[idx] += mod_q32; + tmp = (int64_t) src[idx] * c; + src[idx] = tmp % mod_q32; + if( src[idx] < 0 ) + src[idx] += mod_q32; + } +} + +void reduce_q_u32( int32_t *src, size_t size ) +{ + for( unsigned idx = 0; idx < size; idx++ ) + { + src[idx] = src[idx] % mod_q32; + if( src[idx] < 0 ) + src[idx] += mod_q32; + } +} + +void mult_u32_C( int32_t const *src_a, + int32_t const *src_b, + int32_t *dst, + size_t size ) +{ + unsigned idx; + for( idx = 0; idx < size; idx++ ) + { + int64_t tmp = (int64_t) src_a[idx] * (int64_t) src_b[idx]; + dst[idx] = (int32_t)( tmp % mod_q32 ); + } +} + + +void montgomery_pt_u32_C( int32_t const *src_a, + int32_t const *src_b, + int32_t *dst, + size_t size ) +{ + unsigned idx; + for( idx = 0; idx < size; idx++ ) + { + int64_t v; + int32_t hi; + uint32_t lo, tmp, hi_fix; + + v = 2* (int64_t) src_a[idx] * (int64_t) src_b[idx]; + + /* Hi+lo part extraction */ + hi = (int32_t)( v >> 32 ); + lo = (uint32_t)( v >> 0 ); + + /* Fixed scalar multiply, lo */ + tmp = lo * mod_q32_inv_u32; + /* Fixed scalar multiply, hi */ + hi_fix = ( (uint64_t) tmp * (uint64_t) mod_q32 ) >> 32; + + dst[idx] = (int32_t)( (int64_t) hi - 
(int64_t) hi_fix ); + } +} + +void buf_reduce_u32( int32_t *src, size_t size ) +{ + for( unsigned i=0; i < size; i++ ) + { + src[i] = src[i] % mod_q32; + if( src[i] < 0 ) + src[i] += mod_q32; + } +} + + +int32_t mod_mul( int32_t a, int32_t b, int32_t mod ) +{ + int64_t tmp = (int64_t) a * (int64_t) b; + int32_t res = tmp % mod; + return( res); +} + +int32_t mod_add( int32_t a, int32_t b, int32_t mod ) +{ + int64_t tmp = (int64_t) a + (int64_t) b; + int32_t res = tmp % mod; + return( res); +} + +int32_t mod_sub( int32_t a, int32_t b, int32_t mod ) +{ + int64_t tmp = (int64_t) a - (int64_t) b; + int32_t res = tmp % mod; + return( res); +} + +int32_t mod_pow( int32_t base, unsigned exp, int32_t mod ) +{ + int32_t base_pow = base; + int32_t tmp = 1; + while( exp != 0 ) + { + if( exp & 1 ) + tmp = mod_mul( tmp, base_pow, mod ); + + base_pow = mod_mul( base_pow, base_pow, mod ); + exp >>= 1; + } + + return( tmp ); +} + +int bit_reverse( unsigned val, int width ) +{ + unsigned result = 0; + while( width-- ) + { + result = ( result << 1 ) + ( val & 1 ); + val >>= 1; + } + return( result ); +} + +void build_roots() +{ + for( unsigned i=0; i < NTT_SIZE; i++ ) + { + roots[i] = mod_pow( mod_q32_root, i, mod_q32 ); + roots_twisted[i] = roots[i] * mod_q32_inv_u32; + +#if defined(CONFIG_TEST_NTT_VERBOSE) + debug_printf( "zeta^%u = %u^%u = %u\n", + i, (unsigned) mod_q32_root, i, + roots[i] ); + + debug_printf( "zeta^%u * %u = %u^%u * %u = %u\n", + i, mod_q32_inv_u32, + (unsigned) mod_q32_root, i, mod_q32_inv_u32, + roots_twisted[i] ); +#endif /* CONFIG_TEST_NTT_VERBOSE */ + } +} + +void ntt_u32_C( int32_t *src ) +{ + int32_t res[NTT_SIZE]; + build_roots(); + + for( unsigned t=0; t= NTT_SIZE ); + exp = exp % NTT_SIZE; + +#if defined(CONFIG_TEST_NTT_VERBOSE) + if( t == 0 ) + { + debug_printf( "res[%u] += root[%u] * src[%u] = %u * %u\n", + NTT_LAYER_STRIDE*i+t, + exp, + NTT_LAYER_STRIDE*j+t, + roots[exp], + src[NTT_LAYER_STRIDE*j+t]); + } +#endif /* CONFIG_TEST_NTT_VERBOSE */ + + cur 
= mod_mul( src[NTT_LAYER_STRIDE*j+t], + roots[exp], + mod_q32 ); + + if( !sub ) + tmp = mod_add( tmp, cur, mod_q32 ); + else + tmp = mod_sub( tmp, cur, mod_q32 ); + } + res[NTT_LAYER_STRIDE*i+t] = tmp; + } + } + + memcpy( src, res, sizeof( res ) ); +} + +uint64_t t0, t1; +uint64_t cycles[NTT_TEST_COUNT]; + +static int cmp_uint64_t(const void *a, const void *b) +{ + return (int)((*((const uint64_t *)a)) - (*((const uint64_t *)b))); +} + +#define NTT_U32_NEON_INCOMPLETE(variant) ntt_u32_incomplete_neon_asm_var_ ## variant +#define NTT_U32_NEON_FULL(variant) ntt_u32_full_neon_asm_var_ ## variant + +#if defined(NTT_CHECK_FUNCTIONAL_CORRECTNESS) + +void buf_bitrev_4( int32_t *src, size_t size ) +{ + for( unsigned i=0; i < size; i += 16 ) + { + int32_t tmp[16]; + for( unsigned t=0; t < 16; t++ ) + tmp[t] = src[i+t]; + + for( unsigned r0=0; t0 < 4; r0++ ) + for( unsigned r1=0; t1 < 4; r1++ ) + src[i+r0*4 + r1] = tmp[r1*4+r0]; + } +} + +#define GEN_TEST_NTT_INCOMPLETE(variant) \ +int test_fwd_ntt_incomplete_var_ ## variant () \ +{ \ + debug_test_start( "NTT: deg 256, 32-bit, forward, 6-layer incomplete" ); \ + debug_printf( "Variant: %s\n", #variant ); \ + \ + ALIGN(NTT_BUFFER_ALIGN) \ + int32_t src[NTT_SIZE]; \ + ALIGN(NTT_BUFFER_ALIGN) \ + int32_t src_copy[NTT_SIZE]; \ + ALIGN(NTT_BUFFER_ALIGN) \ + int32_t dummy_copy[NTT_SIZE]; \ + \ + rand_init(0); \ + \ + /* Setup input */ \ + fill_random_u32( (uint32_t*) src, NTT_SIZE ); \ + buf_reduce_u32( src, NTT_SIZE ); \ + \ + /* Step 1: Reference NTT */ \ + memcpy( src_copy, src, sizeof( src ) ); \ + ntt_u32_C( src_copy ); \ + buf_reduce_u32( src_copy, NTT_SIZE ); \ + \ + /* Step 2: NEON-based NTT */ \ + for( unsigned cnt=0; cnt < NTT_TEST_WARMUP; cnt++ ) \ + NTT_U32_NEON_INCOMPLETE(variant)( dummy_copy ); \ + for( unsigned cnt=0; cnt < NTT_TEST_COUNT; cnt++ ) \ + { \ + t0 = get_cyclecounter(); \ + NTT_U32_NEON_INCOMPLETE(variant)( dummy_copy ); \ + t1 = get_cyclecounter(); \ + cycles[cnt] = t1 - t0; \ + } \ + \ + 
NTT_U32_NEON_INCOMPLETE(variant)( src ); \ + \ + /* Report median */ \ + qsort( cycles, NTT_TEST_COUNT, sizeof(uint64_t), cmp_uint64_t ); \ + debug_printf( "Median after %u NTTs: %lld cycles\n", \ + NTT_TEST_COUNT, \ + cycles[NTT_TEST_COUNT >> 1] ); \ + \ + buf_reduce_u32( src, NTT_SIZE ); \ + \ + if( compare_buf_u32( (uint32_t const*) src, (uint32_t const*) src_copy, \ + NTT_SIZE ) != 0 ) \ + { \ + debug_print_buf_s32( src_copy, NTT_SIZE, "Reference" ); \ + debug_print_buf_s32( src, NTT_SIZE, "MVE" ); \ + debug_test_fail(); \ + return( 1 ); \ + } \ + \ + debug_test_ok(); \ + return( 0 ); \ +} + +#define GEN_TEST_NTT_FULL(variant) \ +int test_fwd_ntt_full_var_ ## variant () \ +{ \ + debug_test_start( "NTT: deg 256, 32-bit, forward, full" ); \ + debug_printf( "Variant: %s\n", #variant ); \ + \ + ALIGN(NTT_BUFFER_ALIGN) \ + int32_t src[NTT_SIZE]; \ + ALIGN(NTT_BUFFER_ALIGN) \ + int32_t src_copy[NTT_SIZE]; \ + ALIGN(NTT_BUFFER_ALIGN) \ + int32_t dummy_copy[NTT_SIZE]; \ + \ + rand_init(0); \ + \ + /* Setup input */ \ + fill_random_u32( (uint32_t*) src, NTT_SIZE ); \ + buf_reduce_u32( src, NTT_SIZE ); \ + \ + /* Step 1: Reference NTT */ \ + memcpy( src_copy, src, sizeof( src ) ); \ + ntt_u32_C( src_copy ); \ + buf_reduce_u32( src_copy, NTT_SIZE ); \ + \ + /* Step 2: NEON-based NTT */ \ + for( unsigned cnt=0; cnt < NTT_TEST_WARMUP; cnt++ ) \ + NTT_U32_NEON_FULL(variant)( dummy_copy ); \ + for( unsigned cnt=0; cnt < NTT_TEST_COUNT; cnt++ ) \ + { \ + t0 = get_cyclecounter(); \ + NTT_U32_NEON_FULL(variant)( dummy_copy ); \ + t1 = get_cyclecounter(); \ + cycles[cnt] = t1 - t0; \ + } \ + \ + NTT_U32_NEON_FULL(variant)( src ); \ + \ + /* Report median */ \ + qsort( cycles, NTT_TEST_COUNT, sizeof(uint64_t), cmp_uint64_t ); \ + debug_printf( "Median after %u NTTs: %lld cycles\n", \ + NTT_TEST_COUNT, \ + cycles[NTT_TEST_COUNT >> 1] ); \ + \ + if( NTT_COMPLETE_BITREV4 ) \ + buf_bitrev_4( src, NTT_SIZE ); \ + \ + buf_reduce_u32( src, NTT_SIZE ); \ + \ + if( compare_buf_u32( 
(uint32_t const*) src, (uint32_t const*) src_copy, \ + NTT_SIZE ) != 0 ) \ + { \ + debug_print_buf_s32( src_copy, NTT_SIZE, "Reference" ); \ + debug_print_buf_s32( src, NTT_SIZE, "MVE" ); \ + debug_test_fail(); \ + return( 1 ); \ + } \ + \ + debug_test_ok(); \ + return( 0 ); \ +} + + +#else /* NTT_CHECK_FUNCTIONAL_CORRECTNESS */ + +#define GEN_TEST_NTT_INCOMPLETE(variant) \ +int test_fwd_ntt_incomplete_var_ ## variant () \ +{ \ + debug_test_start( "NTT: deg 256, 32-bit, forward, 6-layer incomplete" ); \ + debug_printf( "Variant: %s\n", #variant ); \ + \ + ALIGN(NTT_BUFFER_ALIGN) \ + int32_t src[NTT_SIZE]; \ + \ + for( unsigned cnt=0; cnt < NTT_TEST_WARMUP; cnt++ ) \ + NTT_U32_NEON_INCOMPLETE(variant)( src ); \ + for( unsigned cnt=0; cnt < NTT_TEST_COUNT; cnt++ ) \ + { \ + t0 = get_cyclecounter(); \ + NTT_U32_NEON_INCOMPLETE(variant)( src ); \ + t1 = get_cyclecounter(); \ + cycles[cnt] = t1 - t0; \ + } \ + \ + /* Report median */ \ + qsort( cycles, NTT_TEST_COUNT, sizeof(uint64_t), cmp_uint64_t ); \ + debug_printf( "Median after %u NTTs: %lld cycles\n", \ + NTT_TEST_COUNT, \ + cycles[NTT_TEST_COUNT >> 1] ); \ + \ + debug_test_ok(); \ + return( 0 ); \ +} + +#define GEN_TEST_NTT_FULL(variant) \ +int test_fwd_ntt_full_var_ ## variant () \ +{ \ + debug_test_start( "NTT: deg 256, 32-bit, forward, full" ); \ + debug_printf( "Variant: %s\n", #variant ); \ + \ + ALIGN(NTT_BUFFER_ALIGN) \ + int32_t src[NTT_SIZE]; \ + \ + for( unsigned cnt=0; cnt < NTT_TEST_WARMUP; cnt++ ) \ + NTT_U32_NEON_FULL(variant)( src ); \ + for( unsigned cnt=0; cnt < NTT_TEST_COUNT; cnt++ ) \ + { \ + t0 = get_cyclecounter(); \ + NTT_U32_NEON_FULL(variant)( src ); \ + t1 = get_cyclecounter(); \ + cycles[cnt] = t1 - t0; \ + } \ + \ + /* Report median */ \ + qsort( cycles, NTT_TEST_COUNT, sizeof(uint64_t), cmp_uint64_t ); \ + debug_printf( "Median after %u NTTs: %lld cycles\n", \ + NTT_TEST_COUNT, \ + cycles[NTT_TEST_COUNT >> 1] ); \ + \ + debug_test_ok(); \ + return( 0 ); \ +} + +#endif /* 
NTT_CHECK_FUNCTIONAL_CORRECTNESS */ + +GEN_TEST_NTT_INCOMPLETE(3_3_0) +GEN_TEST_NTT_INCOMPLETE(3_3_1) +GEN_TEST_NTT_INCOMPLETE(3_3_2) +GEN_TEST_NTT_INCOMPLETE(3_3_3) +GEN_TEST_NTT_INCOMPLETE(3_3_4) +GEN_TEST_NTT_INCOMPLETE(3_3_5) + +GEN_TEST_NTT_INCOMPLETE(4_2_0_0) + +GEN_TEST_NTT_INCOMPLETE(4_2_0_z4_0) +GEN_TEST_NTT_INCOMPLETE(4_2_0_z4_16) +GEN_TEST_NTT_INCOMPLETE(4_2_24_z4_0) +GEN_TEST_NTT_INCOMPLETE(4_2_24_z4_16) + +GEN_TEST_NTT_INCOMPLETE(4_2_3_z4_0) +GEN_TEST_NTT_INCOMPLETE(4_2_3_z4_1) +GEN_TEST_NTT_INCOMPLETE(4_2_3_z4_2) +GEN_TEST_NTT_INCOMPLETE(4_2_3_z4_3) +GEN_TEST_NTT_INCOMPLETE(4_2_3_z4_4) +GEN_TEST_NTT_INCOMPLETE(4_2_3_z4_5) +GEN_TEST_NTT_INCOMPLETE(4_2_7_z4_0) +GEN_TEST_NTT_INCOMPLETE(4_2_7_z4_1) +GEN_TEST_NTT_INCOMPLETE(4_2_7_z4_2) +GEN_TEST_NTT_INCOMPLETE(4_2_7_z4_3) +GEN_TEST_NTT_INCOMPLETE(4_2_7_z4_4) +GEN_TEST_NTT_INCOMPLETE(4_2_7_z4_5) +GEN_TEST_NTT_INCOMPLETE(4_2_7_z4_6) +GEN_TEST_NTT_INCOMPLETE(4_2_7_z4_7) +GEN_TEST_NTT_INCOMPLETE(4_2_7_z4_8) +GEN_TEST_NTT_INCOMPLETE(4_2_7_z4_9) +GEN_TEST_NTT_INCOMPLETE(4_2_7_z4_10) + +GEN_TEST_NTT_INCOMPLETE(4_2_8_z4_7) +GEN_TEST_NTT_INCOMPLETE(4_2_9_z4_7) +GEN_TEST_NTT_INCOMPLETE(4_2_10_z4_7) +GEN_TEST_NTT_INCOMPLETE(4_2_11_z4_7) +GEN_TEST_NTT_INCOMPLETE(4_2_12_z4_7) +GEN_TEST_NTT_INCOMPLETE(4_2_13_z4_7) +GEN_TEST_NTT_INCOMPLETE(4_2_14_z4_7) +GEN_TEST_NTT_INCOMPLETE(4_2_15_z4_7) +GEN_TEST_NTT_INCOMPLETE(4_2_16_z4_7) +GEN_TEST_NTT_INCOMPLETE(4_2_17_z4_7) +GEN_TEST_NTT_INCOMPLETE(4_2_18_z4_7) +GEN_TEST_NTT_INCOMPLETE(4_2_19_z4_7) +GEN_TEST_NTT_INCOMPLETE(4_2_20_z4_7) +GEN_TEST_NTT_INCOMPLETE(4_2_21_z4_7) +GEN_TEST_NTT_INCOMPLETE(4_2_22_z4_7) + +GEN_TEST_NTT_INCOMPLETE(4_2_22_z4_8) +GEN_TEST_NTT_INCOMPLETE(4_2_22_z4_9) +GEN_TEST_NTT_INCOMPLETE(4_2_22_z4_10) +GEN_TEST_NTT_INCOMPLETE(4_2_22_z4_11) +GEN_TEST_NTT_INCOMPLETE(4_2_22_z4_12) +GEN_TEST_NTT_INCOMPLETE(4_2_22_z4_13) +GEN_TEST_NTT_INCOMPLETE(4_2_22_z4_14) +GEN_TEST_NTT_INCOMPLETE(4_2_22_z4_15) + +GEN_TEST_NTT_FULL(4_4_0_0) +GEN_TEST_NTT_FULL(4_4_1_0) 
+GEN_TEST_NTT_FULL(4_4_2_0) +GEN_TEST_NTT_FULL(4_4_3_0) +GEN_TEST_NTT_FULL(4_4_4_0) +GEN_TEST_NTT_FULL(4_4_5_0) +GEN_TEST_NTT_FULL(4_4_6_0) +GEN_TEST_NTT_FULL(4_4_7_0) +GEN_TEST_NTT_FULL(4_4_8_0) +GEN_TEST_NTT_FULL(4_4_9_0) +GEN_TEST_NTT_FULL(4_4_10_0) +GEN_TEST_NTT_FULL(4_4_11_0) +GEN_TEST_NTT_FULL(4_4_12_0) +GEN_TEST_NTT_FULL(4_4_13_0) +GEN_TEST_NTT_FULL(4_4_14_0) +GEN_TEST_NTT_FULL(4_4_15_0) +GEN_TEST_NTT_FULL(4_4_16_0) +GEN_TEST_NTT_FULL(4_4_17_0) +GEN_TEST_NTT_FULL(4_4_18_0) +GEN_TEST_NTT_FULL(4_4_3_z2_0) +GEN_TEST_NTT_FULL(4_4_3_z2_1) +GEN_TEST_NTT_FULL(4_4_3_z2_2) +GEN_TEST_NTT_FULL(4_4_3_z2_3) +GEN_TEST_NTT_FULL(4_4_3_z2_4) +GEN_TEST_NTT_FULL(4_4_3_z2_5) +GEN_TEST_NTT_FULL(4_4_3_z4_0) +GEN_TEST_NTT_FULL(4_4_3_z4_1) +GEN_TEST_NTT_FULL(4_4_3_z4_2) +GEN_TEST_NTT_FULL(4_4_3_z4_3) +GEN_TEST_NTT_FULL(4_4_3_z4_4) + +#define NTT_U32_NEON_DUAL_INCOMPLETE(variant) ntt_u32_incomplete_neon_asm_dual_var_ ## variant + +#if defined(NTT_CHECK_FUNCTIONAL_CORRECTNESS) + +#define GEN_TEST_NTT_INCOMPLETE_DUAL(variant) \ +int test_fwd_ntt_incomplete_dual_var_ ## variant() \ +{ \ + debug_test_start( "NTT dual: deg 256, 32-bit, forward, 6-layer incomplete" ); \ + debug_printf( "Variant: %s\n", #variant ); \ + \ + int32_t src0[NTT_SIZE]; \ + int32_t src0_copy[NTT_SIZE]; \ + ALIGN(NTT_BUFFER_ALIGN) \ + int32_t dummy0_copy[NTT_SIZE]; \ + \ + int32_t src1[NTT_SIZE]; \ + int32_t src1_copy[NTT_SIZE]; \ + ALIGN(NTT_BUFFER_ALIGN) \ + int32_t dummy1_copy_[NTT_SIZE]; \ + int32_t * dummy1_copy = dummy1_copy_ + NTT_DUAL_BUFFER_OFFSET; \ + \ + rand_init(0); \ + \ + /* Setup input */ \ + fill_random_u32( (uint32_t*) src1, NTT_SIZE ); \ + buf_reduce_u32( src1, NTT_SIZE ); \ + fill_random_u32( (uint32_t*) src0, NTT_SIZE ); \ + buf_reduce_u32( src0, NTT_SIZE ); \ + \ + /* Step 1: Reference NTT */ \ + memcpy( src0_copy, src0, sizeof( src0 ) ); \ + memcpy( src1_copy, src1, sizeof( src1 ) ); \ + ntt_u32_C( src0_copy ); \ + ntt_u32_C( src1_copy ); \ + buf_reduce_u32( src0_copy, NTT_SIZE ); \ + 
buf_reduce_u32( src1_copy, NTT_SIZE ); \ + \ + /* Step 2: NEON-based NTT */ \ + for( unsigned cnt=0; cnt < NTT_TEST_WARMUP; cnt++ ) \ + NTT_U32_NEON_DUAL_INCOMPLETE(variant)( dummy0_copy, dummy1_copy ); \ + for( unsigned cnt=0; cnt < NTT_TEST_COUNT; cnt++ ) \ + { \ + t0 = get_cyclecounter(); \ + NTT_U32_NEON_DUAL_INCOMPLETE(variant)( dummy0_copy, dummy1_copy ); \ + t1 = get_cyclecounter(); \ + cycles[cnt] = t1 - t0; \ + } \ + \ + NTT_U32_NEON_DUAL_INCOMPLETE(variant)( src0, src1 ); \ + \ + /* Report median */ \ + qsort( cycles, NTT_TEST_COUNT, sizeof(uint64_t), cmp_uint64_t ); \ + debug_printf( "Median after %u NTTs: %lld cycles\n", \ + NTT_TEST_COUNT, \ + cycles[NTT_TEST_COUNT >> 1] ); \ + \ + buf_reduce_u32( src0, NTT_SIZE ); \ + buf_reduce_u32( src1, NTT_SIZE ); \ + \ + if( compare_buf_u32( (uint32_t const*) src0, (uint32_t const*) src0_copy, \ + NTT_SIZE ) != 0 ) \ + { \ + for( unsigned idx=0; idx < NTT_SIZE; idx++ ) \ + if( src0[idx] != src0_copy[idx] ) \ + debug_printf( "SRC0[%u]: %d != %d\n", \ + idx, src0[idx], src0_copy[idx] ); \ + debug_test_fail(); \ + } \ + \ + if( compare_buf_u32( (uint32_t const*) src1, (uint32_t const*) src1_copy, \ + NTT_SIZE ) != 0 ) \ + { \ + for( unsigned idx=0; idx < NTT_SIZE; idx++ ) \ + if( src1[idx] != src1_copy[idx] ) \ + debug_printf( "SRC1[%u]: %d != %d\n", \ + idx, src1[idx], src1_copy[idx] ); \ + debug_test_fail(); \ + return( 1 ); \ + } \ + \ + debug_test_ok(); \ + return( 0 ); \ +} + +#else /* NTT_CHECK_FUNCTIONAL_CORRECTNESS */ + +#define GEN_TEST_NTT_INCOMPLETE_DUAL(variant) \ +int test_fwd_ntt_incomplete_dual_var_ ## variant() \ +{ \ + debug_test_start( "NTT dual: deg 256, 32-bit, forward, 6-layer incomplete" ); \ + debug_printf( "Variant: %s\n", #variant ); \ + \ + ALIGN(NTT_BUFFER_ALIGN) \ + int32_t dummy0_copy[NTT_SIZE]; \ + ALIGN(NTT_BUFFER_ALIGN) \ + int32_t dummy1_copy_[NTT_SIZE]; \ + int32_t * dummy1_copy = dummy1_copy_ + NTT_DUAL_BUFFER_OFFSET; \ + \ + /* NEON-based NTT */ \ + for( unsigned cnt=0; cnt < 
NTT_TEST_WARMUP; cnt++ ) \ + NTT_U32_NEON_DUAL_INCOMPLETE(variant)( dummy0_copy, dummy1_copy ); \ + for( unsigned cnt=0; cnt < NTT_TEST_COUNT; cnt++ ) \ + { \ + t0 = get_cyclecounter(); \ + NTT_U32_NEON_DUAL_INCOMPLETE(variant)( dummy0_copy, dummy1_copy ); \ + t1 = get_cyclecounter(); \ + cycles[cnt] = t1 - t0; \ + } \ + \ + /* Report median */ \ + qsort( cycles, NTT_TEST_COUNT, sizeof(uint64_t), cmp_uint64_t ); \ + debug_printf( "Median after %u NTTs: %lld cycles\n", \ + NTT_TEST_COUNT, \ + cycles[NTT_TEST_COUNT >> 1] ); \ + \ + debug_test_ok(); \ + return( 0 ); \ +} + +#endif /* NTT_CHECK_FUNCTIONAL_CORRECTNESS */ diff --git a/tests/ntt_neon/ntt.h b/tests/ntt_neon/ntt.h new file mode 100755 index 0000000..03ece82 --- /dev/null +++ b/tests/ntt_neon/ntt.h @@ -0,0 +1,261 @@ +/* + * Copyright (c) 2021 Arm Limited + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#ifndef SRC_NTT_H_ +#define SRC_NTT_H_ + +#define SIZE 256 + +#define NTT_NO_CYCLES +#define NTT_INCOMPLETE + +#define NTT_CHECK_FUNCTIONAL_CORRECTNESS +#define NTT_TEST_WARMUP 10 +#define NTT_TEST_COUNT 10 + +#define NTT_BUFFER_ALIGN 32 +#define NTT_DUAL_BUFFER_OFFSET 16 + +/* Prime modulus to be used by 32-bit multiplication routines */ +#define MODULUS_Q32 33556993 +/* Modular inverse of q32 modulo 2**32 */ +#define MODULUS_Q32_INV_U32 375649793 +/* Negative of modular inverse of q32 modulo 2**32 */ +#define MODULUS_Q32_INV_U32_NEG -375649793 +/* 512-th root of unity for MODULUS_Q32 */ +#define MODULUS_Q32_BASE_ROOT 28678040 + +#define NTT_LAYERS 8 +#define NTT_SIZE (1u << NTT_LAYERS) + +#if defined(NTT_INCOMPLETE) +#define NTT_INCOMPLETE_LAYERS 6 +#define NTT_COMPLETE_BITREV4 0 +#else +#define NTT_INCOMPLETE_LAYERS 8 +#define NTT_COMPLETE_BITREV4 1 +#endif + +#define NTT_INCOMPLETE_SIZE (1u << NTT_INCOMPLETE_LAYERS) + +#define NTT_LAYER_GAP ( NTT_LAYERS - NTT_INCOMPLETE_LAYERS ) +#define NTT_LAYER_STRIDE (1u << NTT_LAYER_GAP ) + +void ntt_u32_C( int32_t *buf ); + +void ntt_u32_incomplete_neon_asm_dual_var_3_3_0( int32_t *buf0, int32_t *buf1 ); +void ntt_u32_incomplete_neon_asm_dual_var_3_3_1( int32_t *buf0, int32_t *buf1 ); +void ntt_u32_incomplete_neon_asm_dual_var_3_3_2( int32_t *buf0, int32_t *buf1 ); +void ntt_u32_incomplete_neon_asm_dual_var_3_3_3( int32_t *buf0, int32_t *buf1 ); +void ntt_u32_incomplete_neon_asm_dual_var_3_3_4( int32_t *buf0, int32_t *buf1 ); +void ntt_u32_incomplete_neon_asm_dual_var_3_3_5( int32_t *buf0, int32_t *buf1 ); +void ntt_u32_incomplete_neon_asm_dual_var_3_3_6( int32_t *buf0, int32_t *buf1 ); + +void ntt_u32_incomplete_neon_asm_var_3_3_0( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_3_3_1( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_3_3_2( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_3_3_3( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_3_3_4( int32_t *buf ); +void 
ntt_u32_incomplete_neon_asm_var_3_3_5( int32_t *buf ); + +void ntt_u32_incomplete_neon_asm_var_4_2_0_0( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_4_2_0_z4_0( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_4_2_24_z4_0( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_4_2_0_z4_16( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_4_2_24_z4_16( int32_t *buf ); + +void ntt_u32_incomplete_neon_asm_var_4_2_3_z4_0( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_4_2_3_z4_1( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_4_2_3_z4_2( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_4_2_3_z4_3( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_4_2_3_z4_4( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_4_2_3_z4_5( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_4_2_7_z4_0( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_4_2_7_z4_1( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_4_2_7_z4_2( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_4_2_7_z4_3( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_4_2_7_z4_4( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_4_2_7_z4_5( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_4_2_7_z4_6( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_4_2_7_z4_7( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_4_2_7_z4_8( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_4_2_7_z4_9( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_4_2_7_z4_10( int32_t *buf ); + +void ntt_u32_incomplete_neon_asm_var_4_2_8_z4_7( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_4_2_9_z4_7( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_4_2_10_z4_7( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_4_2_11_z4_7( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_4_2_12_z4_7( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_4_2_13_z4_7( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_4_2_14_z4_7( int32_t *buf ); +void 
ntt_u32_incomplete_neon_asm_var_4_2_15_z4_7( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_4_2_16_z4_7( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_4_2_17_z4_7( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_4_2_18_z4_7( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_4_2_19_z4_7( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_4_2_20_z4_7( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_4_2_21_z4_7( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_4_2_22_z4_7( int32_t *buf ); + +void ntt_u32_incomplete_neon_asm_var_4_2_22_z4_8( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_4_2_22_z4_9( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_4_2_22_z4_10( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_4_2_22_z4_11( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_4_2_22_z4_12( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_4_2_22_z4_13( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_4_2_22_z4_14( int32_t *buf ); +void ntt_u32_incomplete_neon_asm_var_4_2_22_z4_15( int32_t *buf ); + +void ntt_u32_full_neon_asm_var_4_4_0_0( int32_t *buf ); +void ntt_u32_full_neon_asm_var_4_4_1_0( int32_t *buf ); +void ntt_u32_full_neon_asm_var_4_4_2_0( int32_t *buf ); +void ntt_u32_full_neon_asm_var_4_4_3_0( int32_t *buf ); +void ntt_u32_full_neon_asm_var_4_4_4_0( int32_t *buf ); +void ntt_u32_full_neon_asm_var_4_4_5_0( int32_t *buf ); +void ntt_u32_full_neon_asm_var_4_4_6_0( int32_t *buf ); +void ntt_u32_full_neon_asm_var_4_4_7_0( int32_t *buf ); +void ntt_u32_full_neon_asm_var_4_4_8_0( int32_t *buf ); +void ntt_u32_full_neon_asm_var_4_4_9_0( int32_t *buf ); +void ntt_u32_full_neon_asm_var_4_4_10_0( int32_t *buf ); +void ntt_u32_full_neon_asm_var_4_4_11_0( int32_t *buf ); +void ntt_u32_full_neon_asm_var_4_4_12_0( int32_t *buf ); +void ntt_u32_full_neon_asm_var_4_4_13_0( int32_t *buf ); +void ntt_u32_full_neon_asm_var_4_4_14_0( int32_t *buf ); +void ntt_u32_full_neon_asm_var_4_4_15_0( int32_t *buf ); +void 
ntt_u32_full_neon_asm_var_4_4_16_0( int32_t *buf ); +void ntt_u32_full_neon_asm_var_4_4_17_0( int32_t *buf ); +void ntt_u32_full_neon_asm_var_4_4_18_0( int32_t *buf ); +void ntt_u32_full_neon_asm_var_4_4_19_0( int32_t *buf ); +void ntt_u32_full_neon_asm_var_4_4_20_0( int32_t *buf ); +void ntt_u32_full_neon_asm_var_4_4_21_0( int32_t *buf ); +void ntt_u32_full_neon_asm_var_4_4_22_0( int32_t *buf ); +void ntt_u32_full_neon_asm_var_4_4_3_z2_0( int32_t *buf ); +void ntt_u32_full_neon_asm_var_4_4_3_z2_1( int32_t *buf ); +void ntt_u32_full_neon_asm_var_4_4_3_z2_2( int32_t *buf ); +void ntt_u32_full_neon_asm_var_4_4_3_z2_3( int32_t *buf ); +void ntt_u32_full_neon_asm_var_4_4_3_z2_4( int32_t *buf ); +void ntt_u32_full_neon_asm_var_4_4_3_z2_5( int32_t *buf ); +void ntt_u32_full_neon_asm_var_4_4_3_z4_0( int32_t *buf ); +void ntt_u32_full_neon_asm_var_4_4_3_z4_1( int32_t *buf ); +void ntt_u32_full_neon_asm_var_4_4_3_z4_2( int32_t *buf ); +void ntt_u32_full_neon_asm_var_4_4_3_z4_3( int32_t *buf ); +void ntt_u32_full_neon_asm_var_4_4_3_z4_4( int32_t *buf ); + +int test_fwd_ntt_full_var_4_4_0_0(void); +int test_fwd_ntt_full_var_4_4_1_0(void); +int test_fwd_ntt_full_var_4_4_2_0(void); +int test_fwd_ntt_full_var_4_4_3_0(void); +int test_fwd_ntt_full_var_4_4_4_0(void); +int test_fwd_ntt_full_var_4_4_5_0(void); +int test_fwd_ntt_full_var_4_4_6_0(void); +int test_fwd_ntt_full_var_4_4_7_0(void); +int test_fwd_ntt_full_var_4_4_8_0(void); +int test_fwd_ntt_full_var_4_4_9_0(void); +int test_fwd_ntt_full_var_4_4_10_0(void); +int test_fwd_ntt_full_var_4_4_11_0(void); +int test_fwd_ntt_full_var_4_4_12_0(void); +int test_fwd_ntt_full_var_4_4_13_0(void); +int test_fwd_ntt_full_var_4_4_14_0(void); +int test_fwd_ntt_full_var_4_4_15_0(void); +int test_fwd_ntt_full_var_4_4_16_0(void); +int test_fwd_ntt_full_var_4_4_17_0(void); +int test_fwd_ntt_full_var_4_4_18_0(void); +int test_fwd_ntt_full_var_4_4_3_z2_0(void); +int test_fwd_ntt_full_var_4_4_3_z2_1(void); +int 
test_fwd_ntt_full_var_4_4_3_z2_2(void); +int test_fwd_ntt_full_var_4_4_3_z2_3(void); +int test_fwd_ntt_full_var_4_4_3_z2_4(void); +int test_fwd_ntt_full_var_4_4_3_z2_5(void); +int test_fwd_ntt_full_var_4_4_3_z4_0(void); +int test_fwd_ntt_full_var_4_4_3_z4_1(void); +int test_fwd_ntt_full_var_4_4_3_z4_2(void); +int test_fwd_ntt_full_var_4_4_3_z4_3(void); +int test_fwd_ntt_full_var_4_4_3_z4_4(void); + +int test_fwd_ntt_incomplete_var_3_3_0(void); +int test_fwd_ntt_incomplete_var_3_3_1(void); +int test_fwd_ntt_incomplete_var_3_3_2(void); +int test_fwd_ntt_incomplete_var_3_3_3(void); +int test_fwd_ntt_incomplete_var_3_3_4(void); +int test_fwd_ntt_incomplete_var_3_3_5(void); + +int test_fwd_ntt_incomplete_var_4_2_0_0(void); +int test_fwd_ntt_incomplete_var_4_2_0_z4_0(void); +int test_fwd_ntt_incomplete_var_4_2_24_z4_16(void); +int test_fwd_ntt_incomplete_var_4_2_24_z4_0(void); +int test_fwd_ntt_incomplete_var_4_2_0_z4_16(void); + +int test_fwd_ntt_incomplete_var_4_2_3_z4_0(void); +int test_fwd_ntt_incomplete_var_4_2_3_z4_1(void); +int test_fwd_ntt_incomplete_var_4_2_3_z4_2(void); +int test_fwd_ntt_incomplete_var_4_2_3_z4_3(void); +int test_fwd_ntt_incomplete_var_4_2_3_z4_4(void); +int test_fwd_ntt_incomplete_var_4_2_3_z4_5(void); +int test_fwd_ntt_incomplete_var_4_2_7_z4_0(void); +int test_fwd_ntt_incomplete_var_4_2_7_z4_1(void); +int test_fwd_ntt_incomplete_var_4_2_7_z4_2(void); +int test_fwd_ntt_incomplete_var_4_2_7_z4_3(void); +int test_fwd_ntt_incomplete_var_4_2_7_z4_4(void); +int test_fwd_ntt_incomplete_var_4_2_7_z4_5(void); +int test_fwd_ntt_incomplete_var_4_2_7_z4_6(void); +int test_fwd_ntt_incomplete_var_4_2_7_z4_7(void); +int test_fwd_ntt_incomplete_var_4_2_7_z4_8(void); +int test_fwd_ntt_incomplete_var_4_2_7_z4_9(void); +int test_fwd_ntt_incomplete_var_4_2_7_z4_10(void); +int test_fwd_ntt_incomplete_var_4_2_8_z4_7(void); +int test_fwd_ntt_incomplete_var_4_2_9_z4_7(void); +int test_fwd_ntt_incomplete_var_4_2_10_z4_7(void); +int 
test_fwd_ntt_incomplete_var_4_2_11_z4_7(void); +int test_fwd_ntt_incomplete_var_4_2_12_z4_7(void); +int test_fwd_ntt_incomplete_var_4_2_13_z4_7(void); +int test_fwd_ntt_incomplete_var_4_2_14_z4_7(void); +int test_fwd_ntt_incomplete_var_4_2_15_z4_7(void); +int test_fwd_ntt_incomplete_var_4_2_16_z4_7(void); +int test_fwd_ntt_incomplete_var_4_2_17_z4_7(void); +int test_fwd_ntt_incomplete_var_4_2_18_z4_7(void); +int test_fwd_ntt_incomplete_var_4_2_19_z4_7(void); +int test_fwd_ntt_incomplete_var_4_2_20_z4_7(void); +int test_fwd_ntt_incomplete_var_4_2_21_z4_7(void); +int test_fwd_ntt_incomplete_var_4_2_22_z4_7(void); + +int test_fwd_ntt_incomplete_var_4_2_22_z4_7(void); +int test_fwd_ntt_incomplete_var_4_2_22_z4_8(void); +int test_fwd_ntt_incomplete_var_4_2_22_z4_9(void); +int test_fwd_ntt_incomplete_var_4_2_22_z4_10(void); +int test_fwd_ntt_incomplete_var_4_2_22_z4_11(void); +int test_fwd_ntt_incomplete_var_4_2_22_z4_12(void); +int test_fwd_ntt_incomplete_var_4_2_22_z4_13(void); +int test_fwd_ntt_incomplete_var_4_2_22_z4_14(void); +int test_fwd_ntt_incomplete_var_4_2_22_z4_15(void); + +int test_fwd_ntt_incomplete_dual_var_3_3_0(void); +int test_fwd_ntt_incomplete_dual_var_3_3_1(void); +int test_fwd_ntt_incomplete_dual_var_3_3_2(void); +int test_fwd_ntt_incomplete_dual_var_3_3_3(void); +int test_fwd_ntt_incomplete_dual_var_3_3_4(void); +int test_fwd_ntt_incomplete_dual_var_3_3_5(void); +int test_fwd_ntt_incomplete_dual_var_3_3_6(void); + +#endif /* SRC_NTT_H_ */ diff --git a/tests/ntt_sve2/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_0.s b/tests/ntt_sve2/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_0.s new file mode 100644 index 0000000..1a3b890 --- /dev/null +++ b/tests/ntt_sve2/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_0.s @@ -0,0 +1,1475 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated 
documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. 
+/// + +modulus: +.word -33556993 +.word -33556993 +.word -33556993 +.word -33556993 +.align 6 +roots_merged: +.word 17702291 // Layer 0, block 0 +.word 3260327 // Layer 1, block 0 +.word 14579576 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 1132860160 // Layer 0, block 0 +.word 208645003 // Layer 1, block 0 +.word 933021652 // Layer 1, block 1 +.word 0 // Layer None, block None +.word 6733847 // Layer 2, block 0 +.word 12909577 // Layer 2, block 1 +.word 14745691 // Layer 2, block 2 +.word 13512548 // Layer 2, block 3 +.word 430933318 // Layer 2, block 0 +.word 826149873 // Layer 2, block 1 +.word 943652201 // Layer 2, block 2 +.word 864737072 // Layer 2, block 3 +.word 20428075 // Layer 3, block 0 +.word 23825509 // Layer 4, block 0 +.word 27028662 // Layer 4, block 1 +.word 0 // Layer None, block None +.word 1307297022 // Layer 3, block 0 +.word 1524716204 // Layer 4, block 0 +.word 1729702351 // Layer 4, block 1 +.word 0 // Layer None, block None +.word 9010590 // Layer 5, block 0 +.word 20699126 // Layer 5, block 1 +.word 341080 // Layer 5, block 2 +.word 21220783 // Layer 5, block 3 +.word 576633749 // Layer 5, block 0 +.word 1324642962 // Layer 5, block 1 +.word 21827454 // Layer 5, block 2 +.word 1358026462 // Layer 5, block 3 +.word 14626653 // Layer 3, block 1 +.word 14833295 // Layer 4, block 2 +.word 2138810 // Layer 4, block 3 +.word 0 // Layer None, block None +.word 936034350 // Layer 3, block 1 +.word 949258429 // Layer 4, block 2 +.word 136873393 // Layer 4, block 3 +.word 0 // Layer None, block None +.word 25331745 // Layer 5, block 4 +.word 5289426 // Layer 5, block 5 +.word 5705868 // Layer 5, block 6 +.word 17686665 // Layer 5, block 7 +.word 1621107951 // Layer 5, block 4 +.word 338497429 // Layer 5, block 5 +.word 365147683 // Layer 5, block 6 +.word 1131860172 // Layer 5, block 7 +.word 29737761 // Layer 3, block 2 +.word 6490403 // Layer 4, block 4 +.word 19648405 // Layer 4, block 5 +.word 0 // Layer None, block None 
+.word 1903071454 // Layer 3, block 2 +.word 415354091 // Layer 4, block 4 +.word 1257401950 // Layer 4, block 5 +.word 0 // Layer None, block None +.word 9106105 // Layer 5, block 8 +.word 18817700 // Layer 5, block 9 +.word 1579445 // Layer 5, block 10 +.word 7769916 // Layer 5, block 11 +.word 582746243 // Layer 5, block 8 +.word 1204240888 // Layer 5, block 9 +.word 101076765 // Layer 5, block 10 +.word 497236673 // Layer 5, block 11 +.word 30285189 // Layer 3, block 3 +.word 31254932 // Layer 4, block 6 +.word 26362414 // Layer 4, block 7 +.word 0 // Layer None, block None +.word 1938104173 // Layer 3, block 3 +.word 2000162988 // Layer 4, block 6 +.word 1687065733 // Layer 4, block 7 +.word 0 // Layer None, block None +.word 21843119 // Layer 5, block 12 +.word 11828796 // Layer 5, block 13 +.word 19828530 // Layer 5, block 14 +.word 33201112 // Layer 5, block 15 +.word 1397852927 // Layer 5, block 12 +.word 756985168 // Layer 5, block 13 +.word 1268929071 // Layer 5, block 14 +.word 2124709002 // Layer 5, block 15 +.word 21289485 // Layer 3, block 4 +.word 572895 // Layer 4, block 8 +.word 26691971 // Layer 4, block 9 +.word 0 // Layer None, block None +.word 1362423055 // Layer 3, block 4 +.word 36662482 // Layer 4, block 8 +.word 1708155771 // Layer 4, block 9 +.word 0 // Layer None, block None +.word 23713020 // Layer 5, block 16 +.word 19537976 // Layer 5, block 17 +.word 8285889 // Layer 5, block 18 +.word 24690028 // Layer 5, block 19 +.word 1517517457 // Layer 5, block 16 +.word 1250335034 // Layer 5, block 17 +.word 530256425 // Layer 5, block 18 +.word 1580041197 // Layer 5, block 19 +.word 9914896 // Layer 3, block 5 +.word 9249292 // Layer 4, block 10 +.word 29292862 // Layer 4, block 11 +.word 0 // Layer None, block None +.word 634504916 // Layer 3, block 5 +.word 591909511 // Layer 4, block 10 +.word 1874600091 // Layer 4, block 11 +.word 0 // Layer None, block None +.word 4778209 // Layer 5, block 20 +.word 13113327 // Layer 5, block 21 +.word 
25384023 // Layer 5, block 22 +.word 10905370 // Layer 5, block 23 +.word 305782038 // Layer 5, block 20 +.word 839188878 // Layer 5, block 21 +.word 1624453488 // Layer 5, block 22 +.word 697890414 // Layer 5, block 23 +.word 22603682 // Layer 3, block 6 +.word 8247799 // Layer 4, block 12 +.word 5086187 // Layer 4, block 13 +.word 0 // Layer None, block None +.word 1446525244 // Layer 3, block 6 +.word 527818851 // Layer 4, block 12 +.word 325491125 // Layer 4, block 13 +.word 0 // Layer None, block None +.word 16167867 // Layer 5, block 24 +.word 22046437 // Layer 5, block 25 +.word 656361 // Layer 5, block 26 +.word 18153794 // Layer 5, block 27 +.word 1034664519 // Layer 5, block 24 +.word 1410864286 // Layer 5, block 25 +.word 42003898 // Layer 5, block 26 +.word 1161754147 // Layer 5, block 27 +.word 16204162 // Layer 3, block 7 +.word 28113639 // Layer 4, block 14 +.word 8471290 // Layer 4, block 15 +.word 0 // Layer None, block None +.word 1036987221 // Layer 3, block 7 +.word 1799135579 // Layer 4, block 14 +.word 542121183 // Layer 4, block 15 +.word 0 // Layer None, block None +.word 3732072 // Layer 5, block 28 +.word 22126384 // Layer 5, block 29 +.word 9445744 // Layer 5, block 30 +.word 794839 // Layer 5, block 31 +.word 238834379 // Layer 5, block 28 +.word 1415980503 // Layer 5, block 29 +.word 604481480 // Layer 5, block 30 +.word 50865814 // Layer 5, block 31 +.text +.type ntt_u32_incomplete_sve2_asm_var_3_3_0, %function +.global ntt_u32_incomplete_sve2_asm_var_3_3_0 +modulus_addr: .quad modulus +roots_merged_addr: .quad roots_merged +ntt_u32_incomplete_sve2_asm_var_3_3_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save SVE2 vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] 
+ldr x17, modulus_addr +ldr q31, [x17] +ptrue P0.s +ldr x17, roots_merged_addr +ldr q3, [x17, #+0] +ldr q2, [x17, #+16] +ldr q1, [x17, #+32] +ldr q0, [x17, #+48] +ldr q30, [x0, #960] +ldr q29, [x0, #832] +ldr q28, [x0, #576] +ldr q27, [x0, #704] +ldr q26, [x0, #448] +ldr q25, [x0, #320] +ldr q24, [x0, #64] +ldr q23, [x0, #192] +sqrdmulh z22.s, z30.s, z2.s[0] +mul z30.s, z30.s,z3.s[0] +mla z30.s, P0/M, z22.s, z31.s +sub z22.s, z26.s, z30.s +add z26.s, z26.s, z30.s +sqrdmulh z30.s, z29.s, z2.s[0] +mul z29.s, z29.s,z3.s[0] +mla z29.s, P0/M, z30.s, z31.s +sub z30.s, z25.s, z29.s +add z25.s, z25.s, z29.s +sqrdmulh z29.s, z28.s, z2.s[0] +mul z28.s, z28.s,z3.s[0] +mla z28.s, P0/M, z29.s, z31.s +sub z29.s, z24.s, z28.s +add z24.s, z24.s, z28.s +sqrdmulh z28.s, z27.s, z2.s[0] +mul z27.s, z27.s,z3.s[0] +mla z27.s, P0/M, z28.s, z31.s +sub z28.s, z23.s, z27.s +add z23.s, z23.s, z27.s +sqrdmulh z27.s, z26.s, z2.s[1] +mul z26.s, z26.s,z3.s[1] +mla z26.s, P0/M, z27.s, z31.s +sub z27.s, z23.s, z26.s +add z23.s, z23.s, z26.s +sqrdmulh z26.s, z25.s, z2.s[1] +mul z25.s, z25.s,z3.s[1] +mla z25.s, P0/M, z26.s, z31.s +sub z26.s, z24.s, z25.s +add z24.s, z24.s, z25.s +sqrdmulh z25.s, z22.s, z2.s[2] +mul z22.s, z22.s,z3.s[2] +mla z22.s, P0/M, z25.s, z31.s +sub z25.s, z28.s, z22.s +add z28.s, z28.s, z22.s +sqrdmulh z22.s, z30.s, z2.s[2] +mul z30.s, z30.s,z3.s[2] +mla z30.s, P0/M, z22.s, z31.s +sub z22.s, z29.s, z30.s +add z29.s, z29.s, z30.s +sqrdmulh z30.s, z23.s, z0.s[0] +mul z23.s, z23.s,z1.s[0] +mla z23.s, P0/M, z30.s, z31.s +sub z30.s, z24.s, z23.s +add z24.s, z24.s, z23.s +str q24, [x0, #64] +str q30, [x0, #192] +sqrdmulh z30.s, z27.s, z0.s[1] +mul z27.s, z27.s,z1.s[1] +mla z27.s, P0/M, z30.s, z31.s +sub z30.s, z26.s, z27.s +add z26.s, z26.s, z27.s +str q26, [x0, #320] +str q30, [x0, #448] +sqrdmulh z30.s, z25.s, z0.s[3] +mul z25.s, z25.s,z1.s[3] +mla z25.s, P0/M, z30.s, z31.s +sub z30.s, z22.s, z25.s +add z22.s, z22.s, z25.s +str q22, [x0, #832] +str q30, [x0, #960] +sqrdmulh z30.s, 
z28.s, z0.s[2] +mul z28.s, z28.s,z1.s[2] +mla z28.s, P0/M, z30.s, z31.s +sub z30.s, z29.s, z28.s +add z29.s, z29.s, z28.s +str q29, [x0, #576] +str q30, [x0, #704] +ldr q30, [x0, #976] +ldr q29, [x0, #848] +ldr q28, [x0, #592] +ldr q22, [x0, #720] +ldr q25, [x0, #464] +ldr q26, [x0, #336] +ldr q27, [x0, #80] +ldr q24, [x0, #208] +sqrdmulh z23.s, z30.s, z2.s[0] +mul z30.s, z30.s,z3.s[0] +mla z30.s, P0/M, z23.s, z31.s +sub z23.s, z25.s, z30.s +add z25.s, z25.s, z30.s +sqrdmulh z30.s, z29.s, z2.s[0] +mul z29.s, z29.s,z3.s[0] +mla z29.s, P0/M, z30.s, z31.s +sub z30.s, z26.s, z29.s +add z26.s, z26.s, z29.s +sqrdmulh z29.s, z28.s, z2.s[0] +mul z28.s, z28.s,z3.s[0] +mla z28.s, P0/M, z29.s, z31.s +sub z29.s, z27.s, z28.s +add z27.s, z27.s, z28.s +sqrdmulh z28.s, z22.s, z2.s[0] +mul z22.s, z22.s,z3.s[0] +mla z22.s, P0/M, z28.s, z31.s +sub z28.s, z24.s, z22.s +add z24.s, z24.s, z22.s +sqrdmulh z22.s, z25.s, z2.s[1] +mul z25.s, z25.s,z3.s[1] +mla z25.s, P0/M, z22.s, z31.s +sub z22.s, z24.s, z25.s +add z24.s, z24.s, z25.s +sqrdmulh z25.s, z26.s, z2.s[1] +mul z26.s, z26.s,z3.s[1] +mla z26.s, P0/M, z25.s, z31.s +sub z25.s, z27.s, z26.s +add z27.s, z27.s, z26.s +sqrdmulh z26.s, z23.s, z2.s[2] +mul z23.s, z23.s,z3.s[2] +mla z23.s, P0/M, z26.s, z31.s +sub z26.s, z28.s, z23.s +add z28.s, z28.s, z23.s +sqrdmulh z23.s, z30.s, z2.s[2] +mul z30.s, z30.s,z3.s[2] +mla z30.s, P0/M, z23.s, z31.s +sub z23.s, z29.s, z30.s +add z29.s, z29.s, z30.s +sqrdmulh z30.s, z24.s, z0.s[0] +mul z24.s, z24.s,z1.s[0] +mla z24.s, P0/M, z30.s, z31.s +sub z30.s, z27.s, z24.s +add z27.s, z27.s, z24.s +str q27, [x0, #80] +str q30, [x0, #208] +sqrdmulh z30.s, z22.s, z0.s[1] +mul z22.s, z22.s,z1.s[1] +mla z22.s, P0/M, z30.s, z31.s +sub z30.s, z25.s, z22.s +add z25.s, z25.s, z22.s +str q25, [x0, #336] +str q30, [x0, #464] +sqrdmulh z30.s, z26.s, z0.s[3] +mul z26.s, z26.s,z1.s[3] +mla z26.s, P0/M, z30.s, z31.s +sub z30.s, z23.s, z26.s +add z23.s, z23.s, z26.s +str q23, [x0, #848] +str q30, [x0, #976] +sqrdmulh 
z30.s, z28.s, z0.s[2] +mul z28.s, z28.s,z1.s[2] +mla z28.s, P0/M, z30.s, z31.s +sub z30.s, z29.s, z28.s +add z29.s, z29.s, z28.s +str q29, [x0, #592] +str q30, [x0, #720] +ldr q30, [x0, #992] +ldr q29, [x0, #864] +ldr q28, [x0, #608] +ldr q23, [x0, #736] +ldr q26, [x0, #480] +ldr q25, [x0, #352] +ldr q22, [x0, #96] +ldr q27, [x0, #224] +sqrdmulh z24.s, z30.s, z2.s[0] +mul z30.s, z30.s,z3.s[0] +mla z30.s, P0/M, z24.s, z31.s +sub z24.s, z26.s, z30.s +add z26.s, z26.s, z30.s +sqrdmulh z30.s, z29.s, z2.s[0] +mul z29.s, z29.s,z3.s[0] +mla z29.s, P0/M, z30.s, z31.s +sub z30.s, z25.s, z29.s +add z25.s, z25.s, z29.s +sqrdmulh z29.s, z28.s, z2.s[0] +mul z28.s, z28.s,z3.s[0] +mla z28.s, P0/M, z29.s, z31.s +sub z29.s, z22.s, z28.s +add z22.s, z22.s, z28.s +sqrdmulh z28.s, z23.s, z2.s[0] +mul z23.s, z23.s,z3.s[0] +mla z23.s, P0/M, z28.s, z31.s +sub z28.s, z27.s, z23.s +add z27.s, z27.s, z23.s +sqrdmulh z23.s, z26.s, z2.s[1] +mul z26.s, z26.s,z3.s[1] +mla z26.s, P0/M, z23.s, z31.s +sub z23.s, z27.s, z26.s +add z27.s, z27.s, z26.s +sqrdmulh z26.s, z25.s, z2.s[1] +mul z25.s, z25.s,z3.s[1] +mla z25.s, P0/M, z26.s, z31.s +sub z26.s, z22.s, z25.s +add z22.s, z22.s, z25.s +sqrdmulh z25.s, z24.s, z2.s[2] +mul z24.s, z24.s,z3.s[2] +mla z24.s, P0/M, z25.s, z31.s +sub z25.s, z28.s, z24.s +add z28.s, z28.s, z24.s +sqrdmulh z24.s, z30.s, z2.s[2] +mul z30.s, z30.s,z3.s[2] +mla z30.s, P0/M, z24.s, z31.s +sub z24.s, z29.s, z30.s +add z29.s, z29.s, z30.s +sqrdmulh z30.s, z27.s, z0.s[0] +mul z27.s, z27.s,z1.s[0] +mla z27.s, P0/M, z30.s, z31.s +sub z30.s, z22.s, z27.s +add z22.s, z22.s, z27.s +str q22, [x0, #96] +str q30, [x0, #224] +sqrdmulh z30.s, z23.s, z0.s[1] +mul z23.s, z23.s,z1.s[1] +mla z23.s, P0/M, z30.s, z31.s +sub z30.s, z26.s, z23.s +add z26.s, z26.s, z23.s +str q26, [x0, #352] +str q30, [x0, #480] +sqrdmulh z30.s, z25.s, z0.s[3] +mul z25.s, z25.s,z1.s[3] +mla z25.s, P0/M, z30.s, z31.s +sub z30.s, z24.s, z25.s +add z24.s, z24.s, z25.s +str q24, [x0, #864] +str q30, [x0, #992] 
+sqrdmulh z30.s, z28.s, z0.s[2] +mul z28.s, z28.s,z1.s[2] +mla z28.s, P0/M, z30.s, z31.s +sub z30.s, z29.s, z28.s +add z29.s, z29.s, z28.s +str q29, [x0, #608] +str q30, [x0, #736] +ldr q30, [x0, #1008] +ldr q29, [x0, #880] +ldr q28, [x0, #624] +ldr q24, [x0, #752] +ldr q25, [x0, #496] +ldr q26, [x0, #368] +ldr q23, [x0, #112] +ldr q22, [x0, #240] +sqrdmulh z27.s, z30.s, z2.s[0] +mul z30.s, z30.s,z3.s[0] +mla z30.s, P0/M, z27.s, z31.s +sub z27.s, z25.s, z30.s +add z25.s, z25.s, z30.s +sqrdmulh z30.s, z29.s, z2.s[0] +mul z29.s, z29.s,z3.s[0] +mla z29.s, P0/M, z30.s, z31.s +sub z30.s, z26.s, z29.s +add z26.s, z26.s, z29.s +sqrdmulh z29.s, z28.s, z2.s[0] +mul z28.s, z28.s,z3.s[0] +mla z28.s, P0/M, z29.s, z31.s +sub z29.s, z23.s, z28.s +add z23.s, z23.s, z28.s +sqrdmulh z28.s, z24.s, z2.s[0] +mul z24.s, z24.s,z3.s[0] +mla z24.s, P0/M, z28.s, z31.s +sub z28.s, z22.s, z24.s +add z22.s, z22.s, z24.s +sqrdmulh z24.s, z25.s, z2.s[1] +mul z25.s, z25.s,z3.s[1] +mla z25.s, P0/M, z24.s, z31.s +sub z24.s, z22.s, z25.s +add z22.s, z22.s, z25.s +sqrdmulh z25.s, z26.s, z2.s[1] +mul z26.s, z26.s,z3.s[1] +mla z26.s, P0/M, z25.s, z31.s +sub z25.s, z23.s, z26.s +add z23.s, z23.s, z26.s +sqrdmulh z26.s, z27.s, z2.s[2] +mul z27.s, z27.s,z3.s[2] +mla z27.s, P0/M, z26.s, z31.s +sub z26.s, z28.s, z27.s +add z28.s, z28.s, z27.s +sqrdmulh z27.s, z30.s, z2.s[2] +mul z30.s, z30.s,z3.s[2] +mla z30.s, P0/M, z27.s, z31.s +sub z27.s, z29.s, z30.s +add z29.s, z29.s, z30.s +sqrdmulh z30.s, z22.s, z0.s[0] +mul z22.s, z22.s,z1.s[0] +mla z22.s, P0/M, z30.s, z31.s +sub z30.s, z23.s, z22.s +add z23.s, z23.s, z22.s +str q23, [x0, #112] +str q30, [x0, #240] +sqrdmulh z30.s, z24.s, z0.s[1] +mul z24.s, z24.s,z1.s[1] +mla z24.s, P0/M, z30.s, z31.s +sub z30.s, z25.s, z24.s +add z25.s, z25.s, z24.s +str q25, [x0, #368] +str q30, [x0, #496] +sqrdmulh z30.s, z26.s, z0.s[3] +mul z26.s, z26.s,z1.s[3] +mla z26.s, P0/M, z30.s, z31.s +sub z30.s, z27.s, z26.s +add z27.s, z27.s, z26.s +str q27, [x0, #880] +str q30, [x0, 
#1008] +sqrdmulh z30.s, z28.s, z0.s[2] +mul z28.s, z28.s,z1.s[2] +mla z28.s, P0/M, z30.s, z31.s +sub z30.s, z29.s, z28.s +add z29.s, z29.s, z28.s +str q29, [x0, #624] +str q30, [x0, #752] +ldr q30, [x0, #896] +ldr q29, [x0, #768] +ldr q28, [x0, #512] +ldr q27, [x0, #640] +ldr q26, [x0, #384] +ldr q25, [x0, #256] +ldr q24, [x0, #0] +ldr q23, [x0, #128] +sqrdmulh z22.s, z30.s, z2.s[0] +mul z30.s, z30.s,z3.s[0] +mla z30.s, P0/M, z22.s, z31.s +sub z22.s, z26.s, z30.s +add z26.s, z26.s, z30.s +sqrdmulh z30.s, z29.s, z2.s[0] +mul z29.s, z29.s,z3.s[0] +mla z29.s, P0/M, z30.s, z31.s +sub z30.s, z25.s, z29.s +add z25.s, z25.s, z29.s +sqrdmulh z29.s, z28.s, z2.s[0] +mul z28.s, z28.s,z3.s[0] +mla z28.s, P0/M, z29.s, z31.s +sub z29.s, z24.s, z28.s +add z24.s, z24.s, z28.s +sqrdmulh z28.s, z27.s, z2.s[0] +mul z27.s, z27.s,z3.s[0] +mla z27.s, P0/M, z28.s, z31.s +sub z28.s, z23.s, z27.s +add z23.s, z23.s, z27.s +sqrdmulh z27.s, z26.s, z2.s[1] +mul z26.s, z26.s,z3.s[1] +mla z26.s, P0/M, z27.s, z31.s +sub z27.s, z23.s, z26.s +add z23.s, z23.s, z26.s +sqrdmulh z26.s, z25.s, z2.s[1] +mul z25.s, z25.s,z3.s[1] +mla z25.s, P0/M, z26.s, z31.s +sub z26.s, z24.s, z25.s +add z24.s, z24.s, z25.s +sqrdmulh z25.s, z22.s, z2.s[2] +mul z22.s, z22.s,z3.s[2] +mla z22.s, P0/M, z25.s, z31.s +sub z25.s, z28.s, z22.s +add z28.s, z28.s, z22.s +sqrdmulh z22.s, z30.s, z2.s[2] +mul z30.s, z30.s,z3.s[2] +mla z30.s, P0/M, z22.s, z31.s +sub z22.s, z29.s, z30.s +add z29.s, z29.s, z30.s +sqrdmulh z30.s, z23.s, z0.s[0] +mul z23.s, z23.s,z1.s[0] +mla z23.s, P0/M, z30.s, z31.s +sub z30.s, z24.s, z23.s +add z24.s, z24.s, z23.s +str q24, [x0, #0] +str q30, [x0, #128] +sqrdmulh z30.s, z27.s, z0.s[1] +mul z27.s, z27.s,z1.s[1] +mla z27.s, P0/M, z30.s, z31.s +sub z30.s, z26.s, z27.s +add z26.s, z26.s, z27.s +str q26, [x0, #256] +str q30, [x0, #384] +sqrdmulh z30.s, z25.s, z0.s[3] +mul z25.s, z25.s,z1.s[3] +mla z25.s, P0/M, z30.s, z31.s +sub z30.s, z22.s, z25.s +add z22.s, z22.s, z25.s +str q22, [x0, #768] +str q30, 
[x0, #896] +sqrdmulh z30.s, z28.s, z0.s[2] +mul z28.s, z28.s,z1.s[2] +mla z28.s, P0/M, z30.s, z31.s +sub z30.s, z29.s, z28.s +add z29.s, z29.s, z28.s +str q29, [x0, #512] +str q30, [x0, #640] +ldr q30, [x0, #912] +ldr q29, [x0, #784] +ldr q28, [x0, #528] +ldr q22, [x0, #656] +ldr q25, [x0, #400] +ldr q26, [x0, #272] +ldr q27, [x0, #16] +ldr q24, [x0, #144] +sqrdmulh z23.s, z30.s, z2.s[0] +mul z30.s, z30.s,z3.s[0] +mla z30.s, P0/M, z23.s, z31.s +sub z23.s, z25.s, z30.s +add z25.s, z25.s, z30.s +sqrdmulh z30.s, z29.s, z2.s[0] +mul z29.s, z29.s,z3.s[0] +mla z29.s, P0/M, z30.s, z31.s +sub z30.s, z26.s, z29.s +add z26.s, z26.s, z29.s +sqrdmulh z29.s, z28.s, z2.s[0] +mul z28.s, z28.s,z3.s[0] +mla z28.s, P0/M, z29.s, z31.s +sub z29.s, z27.s, z28.s +add z27.s, z27.s, z28.s +sqrdmulh z28.s, z22.s, z2.s[0] +mul z22.s, z22.s,z3.s[0] +mla z22.s, P0/M, z28.s, z31.s +sub z28.s, z24.s, z22.s +add z24.s, z24.s, z22.s +sqrdmulh z22.s, z25.s, z2.s[1] +mul z25.s, z25.s,z3.s[1] +mla z25.s, P0/M, z22.s, z31.s +sub z22.s, z24.s, z25.s +add z24.s, z24.s, z25.s +sqrdmulh z25.s, z26.s, z2.s[1] +mul z26.s, z26.s,z3.s[1] +mla z26.s, P0/M, z25.s, z31.s +sub z25.s, z27.s, z26.s +add z27.s, z27.s, z26.s +sqrdmulh z26.s, z23.s, z2.s[2] +mul z23.s, z23.s,z3.s[2] +mla z23.s, P0/M, z26.s, z31.s +sub z26.s, z28.s, z23.s +add z28.s, z28.s, z23.s +sqrdmulh z23.s, z30.s, z2.s[2] +mul z30.s, z30.s,z3.s[2] +mla z30.s, P0/M, z23.s, z31.s +sub z23.s, z29.s, z30.s +add z29.s, z29.s, z30.s +sqrdmulh z30.s, z24.s, z0.s[0] +mul z24.s, z24.s,z1.s[0] +mla z24.s, P0/M, z30.s, z31.s +sub z30.s, z27.s, z24.s +add z27.s, z27.s, z24.s +str q27, [x0, #16] +str q30, [x0, #144] +sqrdmulh z30.s, z22.s, z0.s[1] +mul z22.s, z22.s,z1.s[1] +mla z22.s, P0/M, z30.s, z31.s +sub z30.s, z25.s, z22.s +add z25.s, z25.s, z22.s +str q25, [x0, #272] +str q30, [x0, #400] +sqrdmulh z30.s, z26.s, z0.s[3] +mul z26.s, z26.s,z1.s[3] +mla z26.s, P0/M, z30.s, z31.s +sub z30.s, z23.s, z26.s +add z23.s, z23.s, z26.s +str q23, [x0, #784] +str 
q30, [x0, #912] +sqrdmulh z30.s, z28.s, z0.s[2] +mul z28.s, z28.s,z1.s[2] +mla z28.s, P0/M, z30.s, z31.s +sub z30.s, z29.s, z28.s +add z29.s, z29.s, z28.s +str q29, [x0, #528] +str q30, [x0, #656] +ldr q30, [x0, #928] +ldr q29, [x0, #800] +ldr q28, [x0, #544] +ldr q23, [x0, #672] +ldr q26, [x0, #416] +ldr q25, [x0, #288] +ldr q22, [x0, #32] +ldr q27, [x0, #160] +sqrdmulh z24.s, z30.s, z2.s[0] +mul z30.s, z30.s,z3.s[0] +mla z30.s, P0/M, z24.s, z31.s +sub z24.s, z26.s, z30.s +add z26.s, z26.s, z30.s +sqrdmulh z30.s, z29.s, z2.s[0] +mul z29.s, z29.s,z3.s[0] +mla z29.s, P0/M, z30.s, z31.s +sub z30.s, z25.s, z29.s +add z25.s, z25.s, z29.s +sqrdmulh z29.s, z28.s, z2.s[0] +mul z28.s, z28.s,z3.s[0] +mla z28.s, P0/M, z29.s, z31.s +sub z29.s, z22.s, z28.s +add z22.s, z22.s, z28.s +sqrdmulh z28.s, z23.s, z2.s[0] +mul z23.s, z23.s,z3.s[0] +mla z23.s, P0/M, z28.s, z31.s +sub z28.s, z27.s, z23.s +add z27.s, z27.s, z23.s +sqrdmulh z23.s, z26.s, z2.s[1] +mul z26.s, z26.s,z3.s[1] +mla z26.s, P0/M, z23.s, z31.s +sub z23.s, z27.s, z26.s +add z27.s, z27.s, z26.s +sqrdmulh z26.s, z25.s, z2.s[1] +mul z25.s, z25.s,z3.s[1] +mla z25.s, P0/M, z26.s, z31.s +sub z26.s, z22.s, z25.s +add z22.s, z22.s, z25.s +sqrdmulh z25.s, z24.s, z2.s[2] +mul z24.s, z24.s,z3.s[2] +mla z24.s, P0/M, z25.s, z31.s +sub z25.s, z28.s, z24.s +add z28.s, z28.s, z24.s +sqrdmulh z24.s, z30.s, z2.s[2] +mul z30.s, z30.s,z3.s[2] +mla z30.s, P0/M, z24.s, z31.s +sub z24.s, z29.s, z30.s +add z29.s, z29.s, z30.s +sqrdmulh z30.s, z27.s, z0.s[0] +mul z27.s, z27.s,z1.s[0] +mla z27.s, P0/M, z30.s, z31.s +sub z30.s, z22.s, z27.s +add z22.s, z22.s, z27.s +str q22, [x0, #32] +str q30, [x0, #160] +sqrdmulh z30.s, z23.s, z0.s[1] +mul z23.s, z23.s,z1.s[1] +mla z23.s, P0/M, z30.s, z31.s +sub z30.s, z26.s, z23.s +add z26.s, z26.s, z23.s +str q26, [x0, #288] +str q30, [x0, #416] +sqrdmulh z30.s, z25.s, z0.s[3] +mul z25.s, z25.s,z1.s[3] +mla z25.s, P0/M, z30.s, z31.s +sub z30.s, z24.s, z25.s +add z24.s, z24.s, z25.s +str q24, [x0, #800] 
+str q30, [x0, #928] +sqrdmulh z30.s, z28.s, z0.s[2] +mul z28.s, z28.s,z1.s[2] +mla z28.s, P0/M, z30.s, z31.s +sub z30.s, z29.s, z28.s +add z29.s, z29.s, z28.s +str q29, [x0, #544] +str q30, [x0, #672] +ldr q30, [x0, #944] +ldr q29, [x0, #816] +ldr q28, [x0, #560] +ldr q24, [x0, #688] +ldr q25, [x0, #432] +ldr q26, [x0, #304] +ldr q23, [x0, #48] +ldr q22, [x0, #176] +sqrdmulh z27.s, z30.s, z2.s[0] +mul z30.s, z30.s,z3.s[0] +mla z30.s, P0/M, z27.s, z31.s +sub z27.s, z25.s, z30.s +add z25.s, z25.s, z30.s +sqrdmulh z30.s, z29.s, z2.s[0] +mul z29.s, z29.s,z3.s[0] +mla z29.s, P0/M, z30.s, z31.s +sub z30.s, z26.s, z29.s +add z26.s, z26.s, z29.s +sqrdmulh z29.s, z28.s, z2.s[0] +mul z28.s, z28.s,z3.s[0] +mla z28.s, P0/M, z29.s, z31.s +sub z29.s, z23.s, z28.s +add z23.s, z23.s, z28.s +sqrdmulh z28.s, z24.s, z2.s[0] +mul z24.s, z24.s,z3.s[0] +mla z24.s, P0/M, z28.s, z31.s +sub z28.s, z22.s, z24.s +add z22.s, z22.s, z24.s +sqrdmulh z24.s, z25.s, z2.s[1] +mul z25.s, z25.s,z3.s[1] +mla z25.s, P0/M, z24.s, z31.s +sub z24.s, z22.s, z25.s +add z22.s, z22.s, z25.s +sqrdmulh z25.s, z26.s, z2.s[1] +mul z26.s, z26.s,z3.s[1] +mla z26.s, P0/M, z25.s, z31.s +sub z25.s, z23.s, z26.s +add z23.s, z23.s, z26.s +sqrdmulh z26.s, z27.s, z2.s[2] +mul z27.s, z27.s,z3.s[2] +mla z27.s, P0/M, z26.s, z31.s +sub z26.s, z28.s, z27.s +add z28.s, z28.s, z27.s +sqrdmulh z27.s, z30.s, z2.s[2] +mul z30.s, z30.s,z3.s[2] +mla z30.s, P0/M, z27.s, z31.s +sub z27.s, z29.s, z30.s +add z29.s, z29.s, z30.s +sqrdmulh z30.s, z22.s, z0.s[0] +mul z22.s, z22.s,z1.s[0] +mla z22.s, P0/M, z30.s, z31.s +sub z30.s, z23.s, z22.s +add z23.s, z23.s, z22.s +str q23, [x0, #48] +str q30, [x0, #176] +sqrdmulh z30.s, z24.s, z0.s[1] +mul z24.s, z24.s,z1.s[1] +mla z24.s, P0/M, z30.s, z31.s +sub z30.s, z25.s, z24.s +add z25.s, z25.s, z24.s +str q25, [x0, #304] +str q30, [x0, #432] +sqrdmulh z30.s, z26.s, z0.s[3] +mul z26.s, z26.s,z1.s[3] +mla z26.s, P0/M, z30.s, z31.s +sub z30.s, z27.s, z26.s +add z27.s, z27.s, z26.s +str q27, [x0, 
#816] +str q30, [x0, #944] +sqrdmulh z30.s, z28.s, z0.s[2] +mul z28.s, z28.s,z1.s[2] +mla z28.s, P0/M, z30.s, z31.s +sub z30.s, z29.s, z28.s +add z29.s, z29.s, z28.s +str q29, [x0, #560] +str q30, [x0, #688] +ldr q4, [x17, #+64] +ldr q5, [x17, #+80] +ldr q6, [x17, #+96] +ldr q7, [x17, #+112] +ldr q8, [x0, #112] +ldr q9, [x0, #96] +ldr q10, [x0, #64] +ldr q11, [x0, #80] +ldr q12, [x0, #48] +ldr q13, [x0, #32] +ldr q14, [x0, #0] +ldr q15, [x0, #16] +sqrdmulh z16.s, z8.s, z5.s[0] +mul z8.s, z8.s,z4.s[0] +mla z8.s, P0/M, z16.s, z31.s +sub z16.s, z12.s, z8.s +add z12.s, z12.s, z8.s +sqrdmulh z8.s, z9.s, z5.s[0] +mul z9.s, z9.s,z4.s[0] +mla z9.s, P0/M, z8.s, z31.s +sub z8.s, z13.s, z9.s +add z13.s, z13.s, z9.s +sqrdmulh z9.s, z10.s, z5.s[0] +mul z10.s, z10.s,z4.s[0] +mla z10.s, P0/M, z9.s, z31.s +sub z9.s, z14.s, z10.s +add z14.s, z14.s, z10.s +sqrdmulh z10.s, z11.s, z5.s[0] +mul z11.s, z11.s,z4.s[0] +mla z11.s, P0/M, z10.s, z31.s +sub z10.s, z15.s, z11.s +add z15.s, z15.s, z11.s +sqrdmulh z11.s, z12.s, z5.s[1] +mul z12.s, z12.s,z4.s[1] +mla z12.s, P0/M, z11.s, z31.s +sub z11.s, z15.s, z12.s +add z15.s, z15.s, z12.s +ldr q3, [x17, #+128] +ldr q2, [x17, #+144] +ldr q1, [x17, #+160] +ldr q0, [x17, #+176] +sqrdmulh z12.s, z13.s, z5.s[1] +mul z13.s, z13.s,z4.s[1] +mla z13.s, P0/M, z12.s, z31.s +sub z12.s, z14.s, z13.s +add z14.s, z14.s, z13.s +sqrdmulh z13.s, z16.s, z5.s[2] +mul z16.s, z16.s,z4.s[2] +mla z16.s, P0/M, z13.s, z31.s +sub z13.s, z10.s, z16.s +add z10.s, z10.s, z16.s +sqrdmulh z16.s, z8.s, z5.s[2] +mul z8.s, z8.s,z4.s[2] +mla z8.s, P0/M, z16.s, z31.s +sub z16.s, z9.s, z8.s +add z9.s, z9.s, z8.s +sqrdmulh z8.s, z15.s, z7.s[0] +mul z15.s, z15.s,z6.s[0] +mla z15.s, P0/M, z8.s, z31.s +sub z8.s, z14.s, z15.s +add z14.s, z14.s, z15.s +str q14, [x0, #0] +str q8, [x0, #16] +sqrdmulh z8.s, z11.s, z7.s[1] +mul z11.s, z11.s,z6.s[1] +mla z11.s, P0/M, z8.s, z31.s +sub z8.s, z12.s, z11.s +add z12.s, z12.s, z11.s +str q12, [x0, #32] +str q8, [x0, #48] +sqrdmulh z8.s, z13.s, 
z7.s[3] +mul z13.s, z13.s,z6.s[3] +mla z13.s, P0/M, z8.s, z31.s +sub z8.s, z16.s, z13.s +add z16.s, z16.s, z13.s +str q16, [x0, #96] +str q8, [x0, #112] +sqrdmulh z8.s, z10.s, z7.s[2] +mul z10.s, z10.s,z6.s[2] +mla z10.s, P0/M, z8.s, z31.s +sub z8.s, z9.s, z10.s +add z9.s, z9.s, z10.s +str q9, [x0, #64] +str q8, [x0, #80] +ldr q8, [x0, #240] +ldr q9, [x0, #224] +ldr q10, [x0, #192] +ldr q16, [x0, #208] +ldr q13, [x0, #176] +ldr q12, [x0, #160] +ldr q11, [x0, #128] +ldr q14, [x0, #144] +sqrdmulh z15.s, z8.s, z2.s[0] +mul z8.s, z8.s,z3.s[0] +mla z8.s, P0/M, z15.s, z31.s +sub z15.s, z13.s, z8.s +add z13.s, z13.s, z8.s +sqrdmulh z8.s, z9.s, z2.s[0] +mul z9.s, z9.s,z3.s[0] +mla z9.s, P0/M, z8.s, z31.s +sub z8.s, z12.s, z9.s +add z12.s, z12.s, z9.s +sqrdmulh z9.s, z10.s, z2.s[0] +mul z10.s, z10.s,z3.s[0] +mla z10.s, P0/M, z9.s, z31.s +sub z9.s, z11.s, z10.s +add z11.s, z11.s, z10.s +sqrdmulh z10.s, z16.s, z2.s[0] +mul z16.s, z16.s,z3.s[0] +mla z16.s, P0/M, z10.s, z31.s +sub z10.s, z14.s, z16.s +add z14.s, z14.s, z16.s +sqrdmulh z16.s, z13.s, z2.s[1] +mul z13.s, z13.s,z3.s[1] +mla z13.s, P0/M, z16.s, z31.s +sub z16.s, z14.s, z13.s +add z14.s, z14.s, z13.s +ldr q7, [x17, #+192] +ldr q6, [x17, #+208] +ldr q5, [x17, #+224] +ldr q4, [x17, #+240] +sqrdmulh z13.s, z12.s, z2.s[1] +mul z12.s, z12.s,z3.s[1] +mla z12.s, P0/M, z13.s, z31.s +sub z13.s, z11.s, z12.s +add z11.s, z11.s, z12.s +sqrdmulh z12.s, z15.s, z2.s[2] +mul z15.s, z15.s,z3.s[2] +mla z15.s, P0/M, z12.s, z31.s +sub z12.s, z10.s, z15.s +add z10.s, z10.s, z15.s +sqrdmulh z15.s, z8.s, z2.s[2] +mul z8.s, z8.s,z3.s[2] +mla z8.s, P0/M, z15.s, z31.s +sub z15.s, z9.s, z8.s +add z9.s, z9.s, z8.s +sqrdmulh z8.s, z14.s, z0.s[0] +mul z14.s, z14.s,z1.s[0] +mla z14.s, P0/M, z8.s, z31.s +sub z8.s, z11.s, z14.s +add z11.s, z11.s, z14.s +str q11, [x0, #128] +str q8, [x0, #144] +sqrdmulh z8.s, z16.s, z0.s[1] +mul z16.s, z16.s,z1.s[1] +mla z16.s, P0/M, z8.s, z31.s +sub z8.s, z13.s, z16.s +add z13.s, z13.s, z16.s +str q13, [x0, #160] 
+str q8, [x0, #176] +sqrdmulh z8.s, z12.s, z0.s[3] +mul z12.s, z12.s,z1.s[3] +mla z12.s, P0/M, z8.s, z31.s +sub z8.s, z15.s, z12.s +add z15.s, z15.s, z12.s +str q15, [x0, #224] +str q8, [x0, #240] +sqrdmulh z8.s, z10.s, z0.s[2] +mul z10.s, z10.s,z1.s[2] +mla z10.s, P0/M, z8.s, z31.s +sub z8.s, z9.s, z10.s +add z9.s, z9.s, z10.s +str q9, [x0, #192] +str q8, [x0, #208] +ldr q8, [x0, #368] +ldr q9, [x0, #352] +ldr q10, [x0, #320] +ldr q15, [x0, #336] +ldr q12, [x0, #304] +ldr q13, [x0, #288] +ldr q16, [x0, #256] +ldr q11, [x0, #272] +sqrdmulh z14.s, z8.s, z6.s[0] +mul z8.s, z8.s,z7.s[0] +mla z8.s, P0/M, z14.s, z31.s +sub z14.s, z12.s, z8.s +add z12.s, z12.s, z8.s +sqrdmulh z8.s, z9.s, z6.s[0] +mul z9.s, z9.s,z7.s[0] +mla z9.s, P0/M, z8.s, z31.s +sub z8.s, z13.s, z9.s +add z13.s, z13.s, z9.s +sqrdmulh z9.s, z10.s, z6.s[0] +mul z10.s, z10.s,z7.s[0] +mla z10.s, P0/M, z9.s, z31.s +sub z9.s, z16.s, z10.s +add z16.s, z16.s, z10.s +sqrdmulh z10.s, z15.s, z6.s[0] +mul z15.s, z15.s,z7.s[0] +mla z15.s, P0/M, z10.s, z31.s +sub z10.s, z11.s, z15.s +add z11.s, z11.s, z15.s +sqrdmulh z15.s, z12.s, z6.s[1] +mul z12.s, z12.s,z7.s[1] +mla z12.s, P0/M, z15.s, z31.s +sub z15.s, z11.s, z12.s +add z11.s, z11.s, z12.s +ldr q0, [x17, #+256] +ldr q1, [x17, #+272] +ldr q2, [x17, #+288] +ldr q3, [x17, #+304] +sqrdmulh z12.s, z13.s, z6.s[1] +mul z13.s, z13.s,z7.s[1] +mla z13.s, P0/M, z12.s, z31.s +sub z12.s, z16.s, z13.s +add z16.s, z16.s, z13.s +sqrdmulh z13.s, z14.s, z6.s[2] +mul z14.s, z14.s,z7.s[2] +mla z14.s, P0/M, z13.s, z31.s +sub z13.s, z10.s, z14.s +add z10.s, z10.s, z14.s +sqrdmulh z14.s, z8.s, z6.s[2] +mul z8.s, z8.s,z7.s[2] +mla z8.s, P0/M, z14.s, z31.s +sub z14.s, z9.s, z8.s +add z9.s, z9.s, z8.s +sqrdmulh z8.s, z11.s, z4.s[0] +mul z11.s, z11.s,z5.s[0] +mla z11.s, P0/M, z8.s, z31.s +sub z8.s, z16.s, z11.s +add z16.s, z16.s, z11.s +str q16, [x0, #256] +str q8, [x0, #272] +sqrdmulh z8.s, z15.s, z4.s[1] +mul z15.s, z15.s,z5.s[1] +mla z15.s, P0/M, z8.s, z31.s +sub z8.s, z12.s, z15.s 
+add z12.s, z12.s, z15.s +str q12, [x0, #288] +str q8, [x0, #304] +sqrdmulh z8.s, z13.s, z4.s[3] +mul z13.s, z13.s,z5.s[3] +mla z13.s, P0/M, z8.s, z31.s +sub z8.s, z14.s, z13.s +add z14.s, z14.s, z13.s +str q14, [x0, #352] +str q8, [x0, #368] +sqrdmulh z8.s, z10.s, z4.s[2] +mul z10.s, z10.s,z5.s[2] +mla z10.s, P0/M, z8.s, z31.s +sub z8.s, z9.s, z10.s +add z9.s, z9.s, z10.s +str q9, [x0, #320] +str q8, [x0, #336] +ldr q8, [x0, #496] +ldr q9, [x0, #480] +ldr q10, [x0, #448] +ldr q14, [x0, #464] +ldr q13, [x0, #432] +ldr q12, [x0, #416] +ldr q15, [x0, #384] +ldr q16, [x0, #400] +sqrdmulh z11.s, z8.s, z1.s[0] +mul z8.s, z8.s,z0.s[0] +mla z8.s, P0/M, z11.s, z31.s +sub z11.s, z13.s, z8.s +add z13.s, z13.s, z8.s +sqrdmulh z8.s, z9.s, z1.s[0] +mul z9.s, z9.s,z0.s[0] +mla z9.s, P0/M, z8.s, z31.s +sub z8.s, z12.s, z9.s +add z12.s, z12.s, z9.s +sqrdmulh z9.s, z10.s, z1.s[0] +mul z10.s, z10.s,z0.s[0] +mla z10.s, P0/M, z9.s, z31.s +sub z9.s, z15.s, z10.s +add z15.s, z15.s, z10.s +sqrdmulh z10.s, z14.s, z1.s[0] +mul z14.s, z14.s,z0.s[0] +mla z14.s, P0/M, z10.s, z31.s +sub z10.s, z16.s, z14.s +add z16.s, z16.s, z14.s +sqrdmulh z14.s, z13.s, z1.s[1] +mul z13.s, z13.s,z0.s[1] +mla z13.s, P0/M, z14.s, z31.s +sub z14.s, z16.s, z13.s +add z16.s, z16.s, z13.s +ldr q4, [x17, #+320] +ldr q5, [x17, #+336] +ldr q6, [x17, #+352] +ldr q7, [x17, #+368] +sqrdmulh z13.s, z12.s, z1.s[1] +mul z12.s, z12.s,z0.s[1] +mla z12.s, P0/M, z13.s, z31.s +sub z13.s, z15.s, z12.s +add z15.s, z15.s, z12.s +sqrdmulh z12.s, z11.s, z1.s[2] +mul z11.s, z11.s,z0.s[2] +mla z11.s, P0/M, z12.s, z31.s +sub z12.s, z10.s, z11.s +add z10.s, z10.s, z11.s +sqrdmulh z11.s, z8.s, z1.s[2] +mul z8.s, z8.s,z0.s[2] +mla z8.s, P0/M, z11.s, z31.s +sub z11.s, z9.s, z8.s +add z9.s, z9.s, z8.s +sqrdmulh z8.s, z16.s, z3.s[0] +mul z16.s, z16.s,z2.s[0] +mla z16.s, P0/M, z8.s, z31.s +sub z8.s, z15.s, z16.s +add z15.s, z15.s, z16.s +str q15, [x0, #384] +str q8, [x0, #400] +sqrdmulh z8.s, z14.s, z3.s[1] +mul z14.s, z14.s,z2.s[1] +mla 
z14.s, P0/M, z8.s, z31.s +sub z8.s, z13.s, z14.s +add z13.s, z13.s, z14.s +str q13, [x0, #416] +str q8, [x0, #432] +sqrdmulh z8.s, z12.s, z3.s[3] +mul z12.s, z12.s,z2.s[3] +mla z12.s, P0/M, z8.s, z31.s +sub z8.s, z11.s, z12.s +add z11.s, z11.s, z12.s +str q11, [x0, #480] +str q8, [x0, #496] +sqrdmulh z8.s, z10.s, z3.s[2] +mul z10.s, z10.s,z2.s[2] +mla z10.s, P0/M, z8.s, z31.s +sub z8.s, z9.s, z10.s +add z9.s, z9.s, z10.s +str q9, [x0, #448] +str q8, [x0, #464] +ldr q8, [x0, #624] +ldr q9, [x0, #608] +ldr q10, [x0, #576] +ldr q11, [x0, #592] +ldr q12, [x0, #560] +ldr q13, [x0, #544] +ldr q14, [x0, #512] +ldr q15, [x0, #528] +sqrdmulh z16.s, z8.s, z5.s[0] +mul z8.s, z8.s,z4.s[0] +mla z8.s, P0/M, z16.s, z31.s +sub z16.s, z12.s, z8.s +add z12.s, z12.s, z8.s +sqrdmulh z8.s, z9.s, z5.s[0] +mul z9.s, z9.s,z4.s[0] +mla z9.s, P0/M, z8.s, z31.s +sub z8.s, z13.s, z9.s +add z13.s, z13.s, z9.s +sqrdmulh z9.s, z10.s, z5.s[0] +mul z10.s, z10.s,z4.s[0] +mla z10.s, P0/M, z9.s, z31.s +sub z9.s, z14.s, z10.s +add z14.s, z14.s, z10.s +sqrdmulh z10.s, z11.s, z5.s[0] +mul z11.s, z11.s,z4.s[0] +mla z11.s, P0/M, z10.s, z31.s +sub z10.s, z15.s, z11.s +add z15.s, z15.s, z11.s +sqrdmulh z11.s, z12.s, z5.s[1] +mul z12.s, z12.s,z4.s[1] +mla z12.s, P0/M, z11.s, z31.s +sub z11.s, z15.s, z12.s +add z15.s, z15.s, z12.s +ldr q3, [x17, #+384] +ldr q2, [x17, #+400] +ldr q1, [x17, #+416] +ldr q0, [x17, #+432] +sqrdmulh z12.s, z13.s, z5.s[1] +mul z13.s, z13.s,z4.s[1] +mla z13.s, P0/M, z12.s, z31.s +sub z12.s, z14.s, z13.s +add z14.s, z14.s, z13.s +sqrdmulh z13.s, z16.s, z5.s[2] +mul z16.s, z16.s,z4.s[2] +mla z16.s, P0/M, z13.s, z31.s +sub z13.s, z10.s, z16.s +add z10.s, z10.s, z16.s +sqrdmulh z16.s, z8.s, z5.s[2] +mul z8.s, z8.s,z4.s[2] +mla z8.s, P0/M, z16.s, z31.s +sub z16.s, z9.s, z8.s +add z9.s, z9.s, z8.s +sqrdmulh z8.s, z15.s, z7.s[0] +mul z15.s, z15.s,z6.s[0] +mla z15.s, P0/M, z8.s, z31.s +sub z8.s, z14.s, z15.s +add z14.s, z14.s, z15.s +str q14, [x0, #512] +str q8, [x0, #528] +sqrdmulh z8.s, 
z11.s, z7.s[1] +mul z11.s, z11.s,z6.s[1] +mla z11.s, P0/M, z8.s, z31.s +sub z8.s, z12.s, z11.s +add z12.s, z12.s, z11.s +str q12, [x0, #544] +str q8, [x0, #560] +sqrdmulh z8.s, z13.s, z7.s[3] +mul z13.s, z13.s,z6.s[3] +mla z13.s, P0/M, z8.s, z31.s +sub z8.s, z16.s, z13.s +add z16.s, z16.s, z13.s +str q16, [x0, #608] +str q8, [x0, #624] +sqrdmulh z8.s, z10.s, z7.s[2] +mul z10.s, z10.s,z6.s[2] +mla z10.s, P0/M, z8.s, z31.s +sub z8.s, z9.s, z10.s +add z9.s, z9.s, z10.s +str q9, [x0, #576] +str q8, [x0, #592] +ldr q8, [x0, #752] +ldr q9, [x0, #736] +ldr q10, [x0, #704] +ldr q16, [x0, #720] +ldr q13, [x0, #688] +ldr q12, [x0, #672] +ldr q11, [x0, #640] +ldr q14, [x0, #656] +sqrdmulh z15.s, z8.s, z2.s[0] +mul z8.s, z8.s,z3.s[0] +mla z8.s, P0/M, z15.s, z31.s +sub z15.s, z13.s, z8.s +add z13.s, z13.s, z8.s +sqrdmulh z8.s, z9.s, z2.s[0] +mul z9.s, z9.s,z3.s[0] +mla z9.s, P0/M, z8.s, z31.s +sub z8.s, z12.s, z9.s +add z12.s, z12.s, z9.s +sqrdmulh z9.s, z10.s, z2.s[0] +mul z10.s, z10.s,z3.s[0] +mla z10.s, P0/M, z9.s, z31.s +sub z9.s, z11.s, z10.s +add z11.s, z11.s, z10.s +sqrdmulh z10.s, z16.s, z2.s[0] +mul z16.s, z16.s,z3.s[0] +mla z16.s, P0/M, z10.s, z31.s +sub z10.s, z14.s, z16.s +add z14.s, z14.s, z16.s +sqrdmulh z16.s, z13.s, z2.s[1] +mul z13.s, z13.s,z3.s[1] +mla z13.s, P0/M, z16.s, z31.s +sub z16.s, z14.s, z13.s +add z14.s, z14.s, z13.s +ldr q7, [x17, #+448] +ldr q6, [x17, #+464] +ldr q5, [x17, #+480] +ldr q4, [x17, #+496] +sqrdmulh z13.s, z12.s, z2.s[1] +mul z12.s, z12.s,z3.s[1] +mla z12.s, P0/M, z13.s, z31.s +sub z13.s, z11.s, z12.s +add z11.s, z11.s, z12.s +sqrdmulh z12.s, z15.s, z2.s[2] +mul z15.s, z15.s,z3.s[2] +mla z15.s, P0/M, z12.s, z31.s +sub z12.s, z10.s, z15.s +add z10.s, z10.s, z15.s +sqrdmulh z15.s, z8.s, z2.s[2] +mul z8.s, z8.s,z3.s[2] +mla z8.s, P0/M, z15.s, z31.s +sub z15.s, z9.s, z8.s +add z9.s, z9.s, z8.s +sqrdmulh z8.s, z14.s, z0.s[0] +mul z14.s, z14.s,z1.s[0] +mla z14.s, P0/M, z8.s, z31.s +sub z8.s, z11.s, z14.s +add z11.s, z11.s, z14.s +str q11, 
[x0, #640] +str q8, [x0, #656] +sqrdmulh z8.s, z16.s, z0.s[1] +mul z16.s, z16.s,z1.s[1] +mla z16.s, P0/M, z8.s, z31.s +sub z8.s, z13.s, z16.s +add z13.s, z13.s, z16.s +str q13, [x0, #672] +str q8, [x0, #688] +sqrdmulh z8.s, z12.s, z0.s[3] +mul z12.s, z12.s,z1.s[3] +mla z12.s, P0/M, z8.s, z31.s +sub z8.s, z15.s, z12.s +add z15.s, z15.s, z12.s +str q15, [x0, #736] +str q8, [x0, #752] +sqrdmulh z8.s, z10.s, z0.s[2] +mul z10.s, z10.s,z1.s[2] +mla z10.s, P0/M, z8.s, z31.s +sub z8.s, z9.s, z10.s +add z9.s, z9.s, z10.s +str q9, [x0, #704] +str q8, [x0, #720] +ldr q8, [x0, #880] +ldr q9, [x0, #864] +ldr q10, [x0, #832] +ldr q15, [x0, #848] +ldr q12, [x0, #816] +ldr q13, [x0, #800] +ldr q16, [x0, #768] +ldr q11, [x0, #784] +sqrdmulh z14.s, z8.s, z6.s[0] +mul z8.s, z8.s,z7.s[0] +mla z8.s, P0/M, z14.s, z31.s +sub z14.s, z12.s, z8.s +add z12.s, z12.s, z8.s +sqrdmulh z8.s, z9.s, z6.s[0] +mul z9.s, z9.s,z7.s[0] +mla z9.s, P0/M, z8.s, z31.s +sub z8.s, z13.s, z9.s +add z13.s, z13.s, z9.s +sqrdmulh z9.s, z10.s, z6.s[0] +mul z10.s, z10.s,z7.s[0] +mla z10.s, P0/M, z9.s, z31.s +sub z9.s, z16.s, z10.s +add z16.s, z16.s, z10.s +sqrdmulh z10.s, z15.s, z6.s[0] +mul z15.s, z15.s,z7.s[0] +mla z15.s, P0/M, z10.s, z31.s +sub z10.s, z11.s, z15.s +add z11.s, z11.s, z15.s +sqrdmulh z15.s, z12.s, z6.s[1] +mul z12.s, z12.s,z7.s[1] +mla z12.s, P0/M, z15.s, z31.s +sub z15.s, z11.s, z12.s +add z11.s, z11.s, z12.s +ldr q0, [x17, #+512] +ldr q1, [x17, #+528] +ldr q2, [x17, #+544] +ldr q3, [x17, #+560] +sqrdmulh z12.s, z13.s, z6.s[1] +mul z13.s, z13.s,z7.s[1] +mla z13.s, P0/M, z12.s, z31.s +sub z12.s, z16.s, z13.s +add z16.s, z16.s, z13.s +sqrdmulh z13.s, z14.s, z6.s[2] +mul z14.s, z14.s,z7.s[2] +mla z14.s, P0/M, z13.s, z31.s +sub z13.s, z10.s, z14.s +add z10.s, z10.s, z14.s +sqrdmulh z14.s, z8.s, z6.s[2] +mul z8.s, z8.s,z7.s[2] +mla z8.s, P0/M, z14.s, z31.s +sub z14.s, z9.s, z8.s +add z9.s, z9.s, z8.s +sqrdmulh z8.s, z11.s, z4.s[0] +mul z11.s, z11.s,z5.s[0] +mla z11.s, P0/M, z8.s, z31.s +sub z8.s, 
z16.s, z11.s +add z16.s, z16.s, z11.s +str q16, [x0, #768] +str q8, [x0, #784] +sqrdmulh z8.s, z15.s, z4.s[1] +mul z15.s, z15.s,z5.s[1] +mla z15.s, P0/M, z8.s, z31.s +sub z8.s, z12.s, z15.s +add z12.s, z12.s, z15.s +str q12, [x0, #800] +str q8, [x0, #816] +sqrdmulh z8.s, z13.s, z4.s[3] +mul z13.s, z13.s,z5.s[3] +mla z13.s, P0/M, z8.s, z31.s +sub z8.s, z14.s, z13.s +add z14.s, z14.s, z13.s +str q14, [x0, #864] +str q8, [x0, #880] +sqrdmulh z8.s, z10.s, z4.s[2] +mul z10.s, z10.s,z5.s[2] +mla z10.s, P0/M, z8.s, z31.s +sub z8.s, z9.s, z10.s +add z9.s, z9.s, z10.s +str q9, [x0, #832] +str q8, [x0, #848] +ldr q8, [x0, #1008] +ldr q9, [x0, #992] +ldr q10, [x0, #960] +ldr q14, [x0, #976] +ldr q13, [x0, #944] +ldr q12, [x0, #928] +ldr q15, [x0, #896] +ldr q16, [x0, #912] +sqrdmulh z11.s, z8.s, z1.s[0] +mul z8.s, z8.s,z0.s[0] +mla z8.s, P0/M, z11.s, z31.s +sub z11.s, z13.s, z8.s +add z13.s, z13.s, z8.s +sqrdmulh z8.s, z9.s, z1.s[0] +mul z9.s, z9.s,z0.s[0] +mla z9.s, P0/M, z8.s, z31.s +sub z8.s, z12.s, z9.s +add z12.s, z12.s, z9.s +sqrdmulh z9.s, z10.s, z1.s[0] +mul z10.s, z10.s,z0.s[0] +mla z10.s, P0/M, z9.s, z31.s +sub z9.s, z15.s, z10.s +add z15.s, z15.s, z10.s +sqrdmulh z10.s, z14.s, z1.s[0] +mul z14.s, z14.s,z0.s[0] +mla z14.s, P0/M, z10.s, z31.s +sub z10.s, z16.s, z14.s +add z16.s, z16.s, z14.s +sqrdmulh z14.s, z13.s, z1.s[1] +mul z13.s, z13.s,z0.s[1] +mla z13.s, P0/M, z14.s, z31.s +sub z14.s, z16.s, z13.s +add z16.s, z16.s, z13.s +sqrdmulh z13.s, z12.s, z1.s[1] +mul z12.s, z12.s,z0.s[1] +mla z12.s, P0/M, z13.s, z31.s +sub z13.s, z15.s, z12.s +add z15.s, z15.s, z12.s +sqrdmulh z12.s, z11.s, z1.s[2] +mul z11.s, z11.s,z0.s[2] +mla z11.s, P0/M, z12.s, z31.s +sub z12.s, z10.s, z11.s +add z10.s, z10.s, z11.s +sqrdmulh z11.s, z8.s, z1.s[2] +mul z8.s, z8.s,z0.s[2] +mla z8.s, P0/M, z11.s, z31.s +sub z11.s, z9.s, z8.s +add z9.s, z9.s, z8.s +sqrdmulh z8.s, z16.s, z3.s[0] +mul z16.s, z16.s,z2.s[0] +mla z16.s, P0/M, z8.s, z31.s +sub z8.s, z15.s, z16.s +add z15.s, z15.s, z16.s +str 
q15, [x0, #896] +str q8, [x0, #912] +sqrdmulh z8.s, z14.s, z3.s[1] +mul z14.s, z14.s,z2.s[1] +mla z14.s, P0/M, z8.s, z31.s +sub z8.s, z13.s, z14.s +add z13.s, z13.s, z14.s +str q13, [x0, #928] +str q8, [x0, #944] +sqrdmulh z8.s, z12.s, z3.s[3] +mul z12.s, z12.s,z2.s[3] +mla z12.s, P0/M, z8.s, z31.s +sub z8.s, z11.s, z12.s +add z11.s, z11.s, z12.s +str q11, [x0, #992] +str q8, [x0, #1008] +sqrdmulh z8.s, z10.s, z3.s[2] +mul z10.s, z10.s,z2.s[2] +mla z10.s, P0/M, z8.s, z31.s +sub z8.s, z9.s, z10.s +add z9.s, z9.s, z10.s +str q9, [x0, #960] +str q8, [x0, #976] +// Restore SVE2 vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 1445 +// Instruction count: 1441 \ No newline at end of file diff --git a/tests/ntt_sve2/auto/ntt_u64_incomplete_72057594067788289_60277548896192635_var_3_3_0.s b/tests/ntt_sve2/auto/ntt_u64_incomplete_72057594067788289_60277548896192635_var_3_3_0.s new file mode 100644 index 0000000..aa5cd18 --- /dev/null +++ b/tests/ntt_sve2/auto/ntt_u64_incomplete_72057594067788289_60277548896192635_var_3_3_0.s @@ -0,0 +1,2727 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all 
+/// copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +modulus: +.dword -72057594067788289 +.dword -72057594067788289 +.dword -72057594067788289 +.dword -72057594067788289 +.align 6 +roots_merged: +.dword 25792053496987399 // Layer 0, block 0 +.dword 0 // Layer None, block None +.dword 3301382846246308405 // Layer 0, block 0 +.dword 0 // Layer None, block None +.dword 36678763444893001 // Layer 1, block 0 +.dword 12009493193917617 // Layer 1, block 1 +.dword 4694881719000765600 // Layer 1, block 0 +.dword 1537215128184439725 // Layer 1, block 1 +.dword 57226611787624233 // Layer 2, block 0 +.dword 39665359539540334 // Layer 2, block 1 +.dword 7325006305780451127 // Layer 2, block 0 +.dword 5077166018957207276 // Layer 2, block 1 +.dword 14359056949694594 // Layer 2, block 2 +.dword 63449028357011879 // Layer 2, block 3 +.dword 1837959288799265711 // Layer 2, block 2 +.dword 8121475626332016399 // Layer 2, block 3 +.dword 56437370284897879 // Layer 3, block 0 +.dword 0 // Layer None, block None +.dword 7223983393473341270 // Layer 3, block 0 +.dword 0 // Layer None, block None +.dword 15519149204003269 // Layer 4, block 0 +.dword 18945631884663455 // Layer 4, block 1 +.dword 1986451097289241753 // Layer 4, block 0 +.dword 2425040880231995866 // Layer 4, block 1 +.dword 21843809513296019 // Layer 5, block 0 +.dword 52861630939350015 // Layer 5, block 1 +.dword 2796007616543237058 // 
Layer 5, block 0 +.dword 6766288757432881341 // Layer 5, block 1 +.dword 58200436133340777 // Layer 5, block 2 +.dword 45581265709396633 // Layer 5, block 3 +.dword 7449655821980514543 // Layer 5, block 2 +.dword 5834402008385018253 // Layer 5, block 3 +.dword 7801853795705237 // Layer 3, block 1 +.dword 0 // Layer None, block None +.dword 998637285436439396 // Layer 3, block 1 +.dword 0 // Layer None, block None +.dword 72057409685042741 // Layer 4, block 2 +.dword 67813594624550994 // Layer 4, block 3 +.dword 9223348435863355444 // Layer 4, block 2 +.dword 8680140108345514992 // Layer 4, block 3 +.dword 16444438478993771 // Layer 5, block 4 +.dword 44738633871916757 // Layer 5, block 5 +.dword 2104888124438946221 // Layer 5, block 4 +.dword 5726545133232289544 // Layer 5, block 5 +.dword 14998888047589537 // Layer 5, block 6 +.dword 1367715298619054 // Layer 5, block 7 +.dword 1919857669295880083 // Layer 5, block 6 +.dword 175067558150691679 // Layer 5, block 7 +.dword 50810289212278368 // Layer 3, block 2 +.dword 0 // Layer None, block None +.dword 6503717016476519110 // Layer 3, block 2 +.dword 0 // Layer None, block None +.dword 38922220208018571 // Layer 4, block 4 +.dword 7966052600948377 // Layer 4, block 5 +.dword 4982044184561839686 // Layer 4, block 4 +.dword 1019654732498851778 // Layer 4, block 5 +.dword 45879272116084567 // Layer 5, block 8 +.dword 66654388400258382 // Layer 5, block 9 +.dword 5872546828425266758 // Layer 5, block 8 +.dword 8531761711697548017 // Layer 5, block 9 +.dword 8930087962801744 // Layer 5, block 10 +.dword 61848588213223279 // Layer 5, block 11 +.dword 1143051258764947771 // Layer 5, block 10 +.dword 7916619288011967173 // Layer 5, block 11 +.dword 31977682183549777 // Layer 3, block 3 +.dword 0 // Layer None, block None +.dword 4093143317798190700 // Layer 3, block 3 +.dword 0 // Layer None, block None +.dword 66070897124800871 // Layer 4, block 6 +.dword 953067252694683 // Layer 4, block 7 +.dword 8457074828469936528 // 
Layer 4, block 6 +.dword 121992608294366219 // Layer 4, block 7 +.dword 33801610235026337 // Layer 5, block 12 +.dword 32122784433286747 // Layer 5, block 13 +.dword 4326606108290444417 // Layer 5, block 12 +.dword 4111716405756826253 // Layer 5, block 13 +.dword 67688369535326483 // Layer 5, block 14 +.dword 45021686719473556 // Layer 5, block 15 +.dword 8664111296931419854 // Layer 5, block 14 +.dword 5762775897704545946 // Layer 5, block 15 +.dword 66662168904752601 // Layer 3, block 4 +.dword 0 // Layer None, block None +.dword 8532757616272395351 // Layer 3, block 4 +.dword 0 // Layer None, block None +.dword 23961218891132444 // Layer 4, block 8 +.dword 59012643726482518 // Layer 4, block 9 +.dword 3067036016793986470 // Layer 4, block 8 +.dword 7553618393859575754 // Layer 4, block 9 +.dword 52812533586708198 // Layer 5, block 16 +.dword 27994290036168371 // Layer 5, block 17 +.dword 6760004296297333018 // Layer 5, block 16 +.dword 3583269123144660376 // Layer 5, block 17 +.dword 45890717144660134 // Layer 5, block 18 +.dword 39684773913748863 // Layer 5, block 19 +.dword 5874011792082332260 // Layer 5, block 18 +.dword 5079651058854869198 // Layer 5, block 19 +.dword 50149898471788096 // Layer 3, block 5 +.dword 0 // Layer None, block None +.dword 6419187001728793164 // Layer 3, block 5 +.dword 0 // Layer None, block None +.dword 65714767972465509 // Layer 4, block 10 +.dword 51421828010275652 // Layer 4, block 11 +.dword 8411490296989900223 // Layer 4, block 10 +.dword 6581993982587733829 // Layer 4, block 11 +.dword 18683690578478417 // Layer 5, block 20 +.dword 3282356803714609 // Layer 5, block 21 +.dword 2391512393054205061 // Layer 5, block 20 +.dword 420141670701365074 // Layer 5, block 21 +.dword 67884452950503047 // Layer 5, block 22 +.dword 10335338564031418 // Layer 5, block 23 +.dword 8689209974063619263 // Layer 5, block 22 +.dword 1322923335647807838 // Layer 5, block 23 +.dword 30932683335866672 // Layer 3, block 6 +.dword 0 // Layer None, 
block None +.dword 3959383465350182760 // Layer 3, block 6 +.dword 0 // Layer None, block None +.dword 27050097608373352 // Layer 4, block 12 +.dword 67454821565758121 // Layer 4, block 13 +.dword 3462412492436980406 // Layer 4, block 12 +.dword 8634217156839057519 // Layer 4, block 13 +.dword 32828920539599153 // Layer 5, block 24 +.dword 8624332566875856 // Layer 5, block 25 +.dword 4202101827327358896 // Layer 5, block 24 +.dword 1103914568102652181 // Layer 5, block 25 +.dword 56732837753533829 // Layer 5, block 26 +.dword 14816466027490539 // Layer 5, block 27 +.dword 7261803229443070495 // Layer 5, block 26 +.dword 1896507650732884485 // Layer 5, block 27 +.dword 54968319742463037 // Layer 3, block 7 +.dword 0 // Layer None, block None +.dword 7035944924119603816 // Layer 3, block 7 +.dword 0 // Layer None, block None +.dword 55666925166425210 // Layer 4, block 14 +.dword 34241587306439298 // Layer 4, block 15 +.dword 7125366418349706083 // Layer 4, block 14 +.dword 4382923173407965878 // Layer 4, block 15 +.dword 8550051130607768 // Layer 5, block 28 +.dword 14420141705316589 // Layer 5, block 29 +.dword 1094406544264277001 // Layer 5, block 28 +.dword 1845778137515640974 // Layer 5, block 29 +.dword 55622715926092387 // Layer 5, block 30 +.dword 3405033449209397 // Layer 5, block 31 +.dword 7119707635589449714 // Layer 5, block 30 +.dword 435844281318190845 // Layer 5, block 31 +.text +.type ntt_u64_incomplete_sve2_asm_var_3_3_0, %function +.global ntt_u64_incomplete_sve2_asm_var_3_3_0 +modulus_addr: .quad modulus +roots_merged_addr: .quad roots_merged +ntt_u64_incomplete_sve2_asm_var_3_3_0: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save SVE2 vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, 
#16*3] +ldr x17, modulus_addr +ldr q31, [x17] +ptrue P0.d +ldr x17, roots_merged_addr +ldr q3, [x17, #+0] +ldr q2, [x17, #+16] +ldr q1, [x17, #+32] +ldr q0, [x17, #+48] +ldr q15, [x17, #+64] +ldr q14, [x17, #+80] +ldr q13, [x17, #+96] +ldr q12, [x17, #+112] +ldr q30, [x0, #1920] +ldr q29, [x0, #1664] +ldr q28, [x0, #1152] +ldr q27, [x0, #1408] +ldr q26, [x0, #896] +ldr q25, [x0, #640] +ldr q24, [x0, #128] +ldr q23, [x0, #384] +sqrdmulh z22.d, z30.d, z2.d[0] +mul z30.d, z30.d,z3.d[0] +mla z30.d, P0/M, z22.d, z31.d +sub z22.d, z26.d, z30.d +add z26.d, z26.d, z30.d +sqrdmulh z30.d, z29.d, z2.d[0] +mul z29.d, z29.d,z3.d[0] +mla z29.d, P0/M, z30.d, z31.d +sub z30.d, z25.d, z29.d +add z25.d, z25.d, z29.d +sqrdmulh z29.d, z28.d, z2.d[0] +mul z28.d, z28.d,z3.d[0] +mla z28.d, P0/M, z29.d, z31.d +sub z29.d, z24.d, z28.d +add z24.d, z24.d, z28.d +sqrdmulh z28.d, z27.d, z2.d[0] +mul z27.d, z27.d,z3.d[0] +mla z27.d, P0/M, z28.d, z31.d +sub z28.d, z23.d, z27.d +add z23.d, z23.d, z27.d +sqrdmulh z27.d, z26.d, z0.d[0] +mul z26.d, z26.d,z1.d[0] +mla z26.d, P0/M, z27.d, z31.d +sub z27.d, z23.d, z26.d +add z23.d, z23.d, z26.d +sqrdmulh z26.d, z25.d, z0.d[0] +mul z25.d, z25.d,z1.d[0] +mla z25.d, P0/M, z26.d, z31.d +sub z26.d, z24.d, z25.d +add z24.d, z24.d, z25.d +sqrdmulh z25.d, z22.d, z0.d[1] +mul z22.d, z22.d,z1.d[1] +mla z22.d, P0/M, z25.d, z31.d +sub z25.d, z28.d, z22.d +add z28.d, z28.d, z22.d +sqrdmulh z22.d, z30.d, z0.d[1] +mul z30.d, z30.d,z1.d[1] +mla z30.d, P0/M, z22.d, z31.d +sub z22.d, z29.d, z30.d +add z29.d, z29.d, z30.d +sqrdmulh z30.d, z23.d, z14.d[0] +mul z23.d, z23.d,z15.d[0] +mla z23.d, P0/M, z30.d, z31.d +sub z30.d, z24.d, z23.d +add z24.d, z24.d, z23.d +str q24, [x0, #128] +str q30, [x0, #384] +sqrdmulh z30.d, z27.d, z14.d[1] +mul z27.d, z27.d,z15.d[1] +mla z27.d, P0/M, z30.d, z31.d +sub z30.d, z26.d, z27.d +add z26.d, z26.d, z27.d +str q26, [x0, #640] +str q30, [x0, #896] +sqrdmulh z30.d, z25.d, z12.d[1] +mul z25.d, z25.d,z13.d[1] +mla z25.d, P0/M, z30.d, z31.d 
+sub z30.d, z22.d, z25.d +add z22.d, z22.d, z25.d +str q22, [x0, #1664] +str q30, [x0, #1920] +sqrdmulh z30.d, z28.d, z12.d[0] +mul z28.d, z28.d,z13.d[0] +mla z28.d, P0/M, z30.d, z31.d +sub z30.d, z29.d, z28.d +add z29.d, z29.d, z28.d +str q29, [x0, #1152] +str q30, [x0, #1408] +ldr q30, [x0, #1936] +ldr q29, [x0, #1680] +ldr q28, [x0, #1168] +ldr q22, [x0, #1424] +ldr q25, [x0, #912] +ldr q26, [x0, #656] +ldr q27, [x0, #144] +ldr q24, [x0, #400] +sqrdmulh z23.d, z30.d, z2.d[0] +mul z30.d, z30.d,z3.d[0] +mla z30.d, P0/M, z23.d, z31.d +sub z23.d, z25.d, z30.d +add z25.d, z25.d, z30.d +sqrdmulh z30.d, z29.d, z2.d[0] +mul z29.d, z29.d,z3.d[0] +mla z29.d, P0/M, z30.d, z31.d +sub z30.d, z26.d, z29.d +add z26.d, z26.d, z29.d +sqrdmulh z29.d, z28.d, z2.d[0] +mul z28.d, z28.d,z3.d[0] +mla z28.d, P0/M, z29.d, z31.d +sub z29.d, z27.d, z28.d +add z27.d, z27.d, z28.d +sqrdmulh z28.d, z22.d, z2.d[0] +mul z22.d, z22.d,z3.d[0] +mla z22.d, P0/M, z28.d, z31.d +sub z28.d, z24.d, z22.d +add z24.d, z24.d, z22.d +sqrdmulh z22.d, z25.d, z0.d[0] +mul z25.d, z25.d,z1.d[0] +mla z25.d, P0/M, z22.d, z31.d +sub z22.d, z24.d, z25.d +add z24.d, z24.d, z25.d +sqrdmulh z25.d, z26.d, z0.d[0] +mul z26.d, z26.d,z1.d[0] +mla z26.d, P0/M, z25.d, z31.d +sub z25.d, z27.d, z26.d +add z27.d, z27.d, z26.d +sqrdmulh z26.d, z23.d, z0.d[1] +mul z23.d, z23.d,z1.d[1] +mla z23.d, P0/M, z26.d, z31.d +sub z26.d, z28.d, z23.d +add z28.d, z28.d, z23.d +sqrdmulh z23.d, z30.d, z0.d[1] +mul z30.d, z30.d,z1.d[1] +mla z30.d, P0/M, z23.d, z31.d +sub z23.d, z29.d, z30.d +add z29.d, z29.d, z30.d +sqrdmulh z30.d, z24.d, z14.d[0] +mul z24.d, z24.d,z15.d[0] +mla z24.d, P0/M, z30.d, z31.d +sub z30.d, z27.d, z24.d +add z27.d, z27.d, z24.d +str q27, [x0, #144] +str q30, [x0, #400] +sqrdmulh z30.d, z22.d, z14.d[1] +mul z22.d, z22.d,z15.d[1] +mla z22.d, P0/M, z30.d, z31.d +sub z30.d, z25.d, z22.d +add z25.d, z25.d, z22.d +str q25, [x0, #656] +str q30, [x0, #912] +sqrdmulh z30.d, z26.d, z12.d[1] +mul z26.d, z26.d,z13.d[1] +mla 
z26.d, P0/M, z30.d, z31.d +sub z30.d, z23.d, z26.d +add z23.d, z23.d, z26.d +str q23, [x0, #1680] +str q30, [x0, #1936] +sqrdmulh z30.d, z28.d, z12.d[0] +mul z28.d, z28.d,z13.d[0] +mla z28.d, P0/M, z30.d, z31.d +sub z30.d, z29.d, z28.d +add z29.d, z29.d, z28.d +str q29, [x0, #1168] +str q30, [x0, #1424] +ldr q30, [x0, #1952] +ldr q29, [x0, #1696] +ldr q28, [x0, #1184] +ldr q23, [x0, #1440] +ldr q26, [x0, #928] +ldr q25, [x0, #672] +ldr q22, [x0, #160] +ldr q27, [x0, #416] +sqrdmulh z24.d, z30.d, z2.d[0] +mul z30.d, z30.d,z3.d[0] +mla z30.d, P0/M, z24.d, z31.d +sub z24.d, z26.d, z30.d +add z26.d, z26.d, z30.d +sqrdmulh z30.d, z29.d, z2.d[0] +mul z29.d, z29.d,z3.d[0] +mla z29.d, P0/M, z30.d, z31.d +sub z30.d, z25.d, z29.d +add z25.d, z25.d, z29.d +sqrdmulh z29.d, z28.d, z2.d[0] +mul z28.d, z28.d,z3.d[0] +mla z28.d, P0/M, z29.d, z31.d +sub z29.d, z22.d, z28.d +add z22.d, z22.d, z28.d +sqrdmulh z28.d, z23.d, z2.d[0] +mul z23.d, z23.d,z3.d[0] +mla z23.d, P0/M, z28.d, z31.d +sub z28.d, z27.d, z23.d +add z27.d, z27.d, z23.d +sqrdmulh z23.d, z26.d, z0.d[0] +mul z26.d, z26.d,z1.d[0] +mla z26.d, P0/M, z23.d, z31.d +sub z23.d, z27.d, z26.d +add z27.d, z27.d, z26.d +sqrdmulh z26.d, z25.d, z0.d[0] +mul z25.d, z25.d,z1.d[0] +mla z25.d, P0/M, z26.d, z31.d +sub z26.d, z22.d, z25.d +add z22.d, z22.d, z25.d +sqrdmulh z25.d, z24.d, z0.d[1] +mul z24.d, z24.d,z1.d[1] +mla z24.d, P0/M, z25.d, z31.d +sub z25.d, z28.d, z24.d +add z28.d, z28.d, z24.d +sqrdmulh z24.d, z30.d, z0.d[1] +mul z30.d, z30.d,z1.d[1] +mla z30.d, P0/M, z24.d, z31.d +sub z24.d, z29.d, z30.d +add z29.d, z29.d, z30.d +sqrdmulh z30.d, z27.d, z14.d[0] +mul z27.d, z27.d,z15.d[0] +mla z27.d, P0/M, z30.d, z31.d +sub z30.d, z22.d, z27.d +add z22.d, z22.d, z27.d +str q22, [x0, #160] +str q30, [x0, #416] +sqrdmulh z30.d, z23.d, z14.d[1] +mul z23.d, z23.d,z15.d[1] +mla z23.d, P0/M, z30.d, z31.d +sub z30.d, z26.d, z23.d +add z26.d, z26.d, z23.d +str q26, [x0, #672] +str q30, [x0, #928] +sqrdmulh z30.d, z25.d, z12.d[1] +mul z25.d, 
z25.d,z13.d[1] +mla z25.d, P0/M, z30.d, z31.d +sub z30.d, z24.d, z25.d +add z24.d, z24.d, z25.d +str q24, [x0, #1696] +str q30, [x0, #1952] +sqrdmulh z30.d, z28.d, z12.d[0] +mul z28.d, z28.d,z13.d[0] +mla z28.d, P0/M, z30.d, z31.d +sub z30.d, z29.d, z28.d +add z29.d, z29.d, z28.d +str q29, [x0, #1184] +str q30, [x0, #1440] +ldr q30, [x0, #1968] +ldr q29, [x0, #1712] +ldr q28, [x0, #1200] +ldr q24, [x0, #1456] +ldr q25, [x0, #944] +ldr q26, [x0, #688] +ldr q23, [x0, #176] +ldr q22, [x0, #432] +sqrdmulh z27.d, z30.d, z2.d[0] +mul z30.d, z30.d,z3.d[0] +mla z30.d, P0/M, z27.d, z31.d +sub z27.d, z25.d, z30.d +add z25.d, z25.d, z30.d +sqrdmulh z30.d, z29.d, z2.d[0] +mul z29.d, z29.d,z3.d[0] +mla z29.d, P0/M, z30.d, z31.d +sub z30.d, z26.d, z29.d +add z26.d, z26.d, z29.d +sqrdmulh z29.d, z28.d, z2.d[0] +mul z28.d, z28.d,z3.d[0] +mla z28.d, P0/M, z29.d, z31.d +sub z29.d, z23.d, z28.d +add z23.d, z23.d, z28.d +sqrdmulh z28.d, z24.d, z2.d[0] +mul z24.d, z24.d,z3.d[0] +mla z24.d, P0/M, z28.d, z31.d +sub z28.d, z22.d, z24.d +add z22.d, z22.d, z24.d +sqrdmulh z24.d, z25.d, z0.d[0] +mul z25.d, z25.d,z1.d[0] +mla z25.d, P0/M, z24.d, z31.d +sub z24.d, z22.d, z25.d +add z22.d, z22.d, z25.d +sqrdmulh z25.d, z26.d, z0.d[0] +mul z26.d, z26.d,z1.d[0] +mla z26.d, P0/M, z25.d, z31.d +sub z25.d, z23.d, z26.d +add z23.d, z23.d, z26.d +sqrdmulh z26.d, z27.d, z0.d[1] +mul z27.d, z27.d,z1.d[1] +mla z27.d, P0/M, z26.d, z31.d +sub z26.d, z28.d, z27.d +add z28.d, z28.d, z27.d +sqrdmulh z27.d, z30.d, z0.d[1] +mul z30.d, z30.d,z1.d[1] +mla z30.d, P0/M, z27.d, z31.d +sub z27.d, z29.d, z30.d +add z29.d, z29.d, z30.d +sqrdmulh z30.d, z22.d, z14.d[0] +mul z22.d, z22.d,z15.d[0] +mla z22.d, P0/M, z30.d, z31.d +sub z30.d, z23.d, z22.d +add z23.d, z23.d, z22.d +str q23, [x0, #176] +str q30, [x0, #432] +sqrdmulh z30.d, z24.d, z14.d[1] +mul z24.d, z24.d,z15.d[1] +mla z24.d, P0/M, z30.d, z31.d +sub z30.d, z25.d, z24.d +add z25.d, z25.d, z24.d +str q25, [x0, #688] +str q30, [x0, #944] +sqrdmulh z30.d, z26.d, 
z12.d[1] +mul z26.d, z26.d,z13.d[1] +mla z26.d, P0/M, z30.d, z31.d +sub z30.d, z27.d, z26.d +add z27.d, z27.d, z26.d +str q27, [x0, #1712] +str q30, [x0, #1968] +sqrdmulh z30.d, z28.d, z12.d[0] +mul z28.d, z28.d,z13.d[0] +mla z28.d, P0/M, z30.d, z31.d +sub z30.d, z29.d, z28.d +add z29.d, z29.d, z28.d +str q29, [x0, #1200] +str q30, [x0, #1456] +ldr q30, [x0, #1984] +ldr q29, [x0, #1728] +ldr q28, [x0, #1216] +ldr q27, [x0, #1472] +ldr q26, [x0, #960] +ldr q25, [x0, #704] +ldr q24, [x0, #192] +ldr q23, [x0, #448] +sqrdmulh z22.d, z30.d, z2.d[0] +mul z30.d, z30.d,z3.d[0] +mla z30.d, P0/M, z22.d, z31.d +sub z22.d, z26.d, z30.d +add z26.d, z26.d, z30.d +sqrdmulh z30.d, z29.d, z2.d[0] +mul z29.d, z29.d,z3.d[0] +mla z29.d, P0/M, z30.d, z31.d +sub z30.d, z25.d, z29.d +add z25.d, z25.d, z29.d +sqrdmulh z29.d, z28.d, z2.d[0] +mul z28.d, z28.d,z3.d[0] +mla z28.d, P0/M, z29.d, z31.d +sub z29.d, z24.d, z28.d +add z24.d, z24.d, z28.d +sqrdmulh z28.d, z27.d, z2.d[0] +mul z27.d, z27.d,z3.d[0] +mla z27.d, P0/M, z28.d, z31.d +sub z28.d, z23.d, z27.d +add z23.d, z23.d, z27.d +sqrdmulh z27.d, z26.d, z0.d[0] +mul z26.d, z26.d,z1.d[0] +mla z26.d, P0/M, z27.d, z31.d +sub z27.d, z23.d, z26.d +add z23.d, z23.d, z26.d +sqrdmulh z26.d, z25.d, z0.d[0] +mul z25.d, z25.d,z1.d[0] +mla z25.d, P0/M, z26.d, z31.d +sub z26.d, z24.d, z25.d +add z24.d, z24.d, z25.d +sqrdmulh z25.d, z22.d, z0.d[1] +mul z22.d, z22.d,z1.d[1] +mla z22.d, P0/M, z25.d, z31.d +sub z25.d, z28.d, z22.d +add z28.d, z28.d, z22.d +sqrdmulh z22.d, z30.d, z0.d[1] +mul z30.d, z30.d,z1.d[1] +mla z30.d, P0/M, z22.d, z31.d +sub z22.d, z29.d, z30.d +add z29.d, z29.d, z30.d +sqrdmulh z30.d, z23.d, z14.d[0] +mul z23.d, z23.d,z15.d[0] +mla z23.d, P0/M, z30.d, z31.d +sub z30.d, z24.d, z23.d +add z24.d, z24.d, z23.d +str q24, [x0, #192] +str q30, [x0, #448] +sqrdmulh z30.d, z27.d, z14.d[1] +mul z27.d, z27.d,z15.d[1] +mla z27.d, P0/M, z30.d, z31.d +sub z30.d, z26.d, z27.d +add z26.d, z26.d, z27.d +str q26, [x0, #704] +str q30, [x0, #960] 
+sqrdmulh z30.d, z25.d, z12.d[1] +mul z25.d, z25.d,z13.d[1] +mla z25.d, P0/M, z30.d, z31.d +sub z30.d, z22.d, z25.d +add z22.d, z22.d, z25.d +str q22, [x0, #1728] +str q30, [x0, #1984] +sqrdmulh z30.d, z28.d, z12.d[0] +mul z28.d, z28.d,z13.d[0] +mla z28.d, P0/M, z30.d, z31.d +sub z30.d, z29.d, z28.d +add z29.d, z29.d, z28.d +str q29, [x0, #1216] +str q30, [x0, #1472] +ldr q30, [x0, #2000] +ldr q29, [x0, #1744] +ldr q28, [x0, #1232] +ldr q22, [x0, #1488] +ldr q25, [x0, #976] +ldr q26, [x0, #720] +ldr q27, [x0, #208] +ldr q24, [x0, #464] +sqrdmulh z23.d, z30.d, z2.d[0] +mul z30.d, z30.d,z3.d[0] +mla z30.d, P0/M, z23.d, z31.d +sub z23.d, z25.d, z30.d +add z25.d, z25.d, z30.d +sqrdmulh z30.d, z29.d, z2.d[0] +mul z29.d, z29.d,z3.d[0] +mla z29.d, P0/M, z30.d, z31.d +sub z30.d, z26.d, z29.d +add z26.d, z26.d, z29.d +sqrdmulh z29.d, z28.d, z2.d[0] +mul z28.d, z28.d,z3.d[0] +mla z28.d, P0/M, z29.d, z31.d +sub z29.d, z27.d, z28.d +add z27.d, z27.d, z28.d +sqrdmulh z28.d, z22.d, z2.d[0] +mul z22.d, z22.d,z3.d[0] +mla z22.d, P0/M, z28.d, z31.d +sub z28.d, z24.d, z22.d +add z24.d, z24.d, z22.d +sqrdmulh z22.d, z25.d, z0.d[0] +mul z25.d, z25.d,z1.d[0] +mla z25.d, P0/M, z22.d, z31.d +sub z22.d, z24.d, z25.d +add z24.d, z24.d, z25.d +sqrdmulh z25.d, z26.d, z0.d[0] +mul z26.d, z26.d,z1.d[0] +mla z26.d, P0/M, z25.d, z31.d +sub z25.d, z27.d, z26.d +add z27.d, z27.d, z26.d +sqrdmulh z26.d, z23.d, z0.d[1] +mul z23.d, z23.d,z1.d[1] +mla z23.d, P0/M, z26.d, z31.d +sub z26.d, z28.d, z23.d +add z28.d, z28.d, z23.d +sqrdmulh z23.d, z30.d, z0.d[1] +mul z30.d, z30.d,z1.d[1] +mla z30.d, P0/M, z23.d, z31.d +sub z23.d, z29.d, z30.d +add z29.d, z29.d, z30.d +sqrdmulh z30.d, z24.d, z14.d[0] +mul z24.d, z24.d,z15.d[0] +mla z24.d, P0/M, z30.d, z31.d +sub z30.d, z27.d, z24.d +add z27.d, z27.d, z24.d +str q27, [x0, #208] +str q30, [x0, #464] +sqrdmulh z30.d, z22.d, z14.d[1] +mul z22.d, z22.d,z15.d[1] +mla z22.d, P0/M, z30.d, z31.d +sub z30.d, z25.d, z22.d +add z25.d, z25.d, z22.d +str q25, [x0, #720] 
+str q30, [x0, #976] +sqrdmulh z30.d, z26.d, z12.d[1] +mul z26.d, z26.d,z13.d[1] +mla z26.d, P0/M, z30.d, z31.d +sub z30.d, z23.d, z26.d +add z23.d, z23.d, z26.d +str q23, [x0, #1744] +str q30, [x0, #2000] +sqrdmulh z30.d, z28.d, z12.d[0] +mul z28.d, z28.d,z13.d[0] +mla z28.d, P0/M, z30.d, z31.d +sub z30.d, z29.d, z28.d +add z29.d, z29.d, z28.d +str q29, [x0, #1232] +str q30, [x0, #1488] +ldr q30, [x0, #2016] +ldr q29, [x0, #1760] +ldr q28, [x0, #1248] +ldr q23, [x0, #1504] +ldr q26, [x0, #992] +ldr q25, [x0, #736] +ldr q22, [x0, #224] +ldr q27, [x0, #480] +sqrdmulh z24.d, z30.d, z2.d[0] +mul z30.d, z30.d,z3.d[0] +mla z30.d, P0/M, z24.d, z31.d +sub z24.d, z26.d, z30.d +add z26.d, z26.d, z30.d +sqrdmulh z30.d, z29.d, z2.d[0] +mul z29.d, z29.d,z3.d[0] +mla z29.d, P0/M, z30.d, z31.d +sub z30.d, z25.d, z29.d +add z25.d, z25.d, z29.d +sqrdmulh z29.d, z28.d, z2.d[0] +mul z28.d, z28.d,z3.d[0] +mla z28.d, P0/M, z29.d, z31.d +sub z29.d, z22.d, z28.d +add z22.d, z22.d, z28.d +sqrdmulh z28.d, z23.d, z2.d[0] +mul z23.d, z23.d,z3.d[0] +mla z23.d, P0/M, z28.d, z31.d +sub z28.d, z27.d, z23.d +add z27.d, z27.d, z23.d +sqrdmulh z23.d, z26.d, z0.d[0] +mul z26.d, z26.d,z1.d[0] +mla z26.d, P0/M, z23.d, z31.d +sub z23.d, z27.d, z26.d +add z27.d, z27.d, z26.d +sqrdmulh z26.d, z25.d, z0.d[0] +mul z25.d, z25.d,z1.d[0] +mla z25.d, P0/M, z26.d, z31.d +sub z26.d, z22.d, z25.d +add z22.d, z22.d, z25.d +sqrdmulh z25.d, z24.d, z0.d[1] +mul z24.d, z24.d,z1.d[1] +mla z24.d, P0/M, z25.d, z31.d +sub z25.d, z28.d, z24.d +add z28.d, z28.d, z24.d +sqrdmulh z24.d, z30.d, z0.d[1] +mul z30.d, z30.d,z1.d[1] +mla z30.d, P0/M, z24.d, z31.d +sub z24.d, z29.d, z30.d +add z29.d, z29.d, z30.d +sqrdmulh z30.d, z27.d, z14.d[0] +mul z27.d, z27.d,z15.d[0] +mla z27.d, P0/M, z30.d, z31.d +sub z30.d, z22.d, z27.d +add z22.d, z22.d, z27.d +str q22, [x0, #224] +str q30, [x0, #480] +sqrdmulh z30.d, z23.d, z14.d[1] +mul z23.d, z23.d,z15.d[1] +mla z23.d, P0/M, z30.d, z31.d +sub z30.d, z26.d, z23.d +add z26.d, z26.d, z23.d 
+str q26, [x0, #736] +str q30, [x0, #992] +sqrdmulh z30.d, z25.d, z12.d[1] +mul z25.d, z25.d,z13.d[1] +mla z25.d, P0/M, z30.d, z31.d +sub z30.d, z24.d, z25.d +add z24.d, z24.d, z25.d +str q24, [x0, #1760] +str q30, [x0, #2016] +sqrdmulh z30.d, z28.d, z12.d[0] +mul z28.d, z28.d,z13.d[0] +mla z28.d, P0/M, z30.d, z31.d +sub z30.d, z29.d, z28.d +add z29.d, z29.d, z28.d +str q29, [x0, #1248] +str q30, [x0, #1504] +ldr q30, [x0, #2032] +ldr q29, [x0, #1776] +ldr q28, [x0, #1264] +ldr q24, [x0, #1520] +ldr q25, [x0, #1008] +ldr q26, [x0, #752] +ldr q23, [x0, #240] +ldr q22, [x0, #496] +sqrdmulh z27.d, z30.d, z2.d[0] +mul z30.d, z30.d,z3.d[0] +mla z30.d, P0/M, z27.d, z31.d +sub z27.d, z25.d, z30.d +add z25.d, z25.d, z30.d +sqrdmulh z30.d, z29.d, z2.d[0] +mul z29.d, z29.d,z3.d[0] +mla z29.d, P0/M, z30.d, z31.d +sub z30.d, z26.d, z29.d +add z26.d, z26.d, z29.d +sqrdmulh z29.d, z28.d, z2.d[0] +mul z28.d, z28.d,z3.d[0] +mla z28.d, P0/M, z29.d, z31.d +sub z29.d, z23.d, z28.d +add z23.d, z23.d, z28.d +sqrdmulh z28.d, z24.d, z2.d[0] +mul z24.d, z24.d,z3.d[0] +mla z24.d, P0/M, z28.d, z31.d +sub z28.d, z22.d, z24.d +add z22.d, z22.d, z24.d +sqrdmulh z24.d, z25.d, z0.d[0] +mul z25.d, z25.d,z1.d[0] +mla z25.d, P0/M, z24.d, z31.d +sub z24.d, z22.d, z25.d +add z22.d, z22.d, z25.d +sqrdmulh z25.d, z26.d, z0.d[0] +mul z26.d, z26.d,z1.d[0] +mla z26.d, P0/M, z25.d, z31.d +sub z25.d, z23.d, z26.d +add z23.d, z23.d, z26.d +sqrdmulh z26.d, z27.d, z0.d[1] +mul z27.d, z27.d,z1.d[1] +mla z27.d, P0/M, z26.d, z31.d +sub z26.d, z28.d, z27.d +add z28.d, z28.d, z27.d +sqrdmulh z27.d, z30.d, z0.d[1] +mul z30.d, z30.d,z1.d[1] +mla z30.d, P0/M, z27.d, z31.d +sub z27.d, z29.d, z30.d +add z29.d, z29.d, z30.d +sqrdmulh z30.d, z22.d, z14.d[0] +mul z22.d, z22.d,z15.d[0] +mla z22.d, P0/M, z30.d, z31.d +sub z30.d, z23.d, z22.d +add z23.d, z23.d, z22.d +str q23, [x0, #240] +str q30, [x0, #496] +sqrdmulh z30.d, z24.d, z14.d[1] +mul z24.d, z24.d,z15.d[1] +mla z24.d, P0/M, z30.d, z31.d +sub z30.d, z25.d, z24.d 
+add z25.d, z25.d, z24.d +str q25, [x0, #752] +str q30, [x0, #1008] +sqrdmulh z30.d, z26.d, z12.d[1] +mul z26.d, z26.d,z13.d[1] +mla z26.d, P0/M, z30.d, z31.d +sub z30.d, z27.d, z26.d +add z27.d, z27.d, z26.d +str q27, [x0, #1776] +str q30, [x0, #2032] +sqrdmulh z30.d, z28.d, z12.d[0] +mul z28.d, z28.d,z13.d[0] +mla z28.d, P0/M, z30.d, z31.d +sub z30.d, z29.d, z28.d +add z29.d, z29.d, z28.d +str q29, [x0, #1264] +str q30, [x0, #1520] +ldr q30, [x0, #1792] +ldr q29, [x0, #1536] +ldr q28, [x0, #1024] +ldr q27, [x0, #1280] +ldr q26, [x0, #768] +ldr q25, [x0, #512] +ldr q24, [x0, #0] +ldr q23, [x0, #256] +sqrdmulh z22.d, z30.d, z2.d[0] +mul z30.d, z30.d,z3.d[0] +mla z30.d, P0/M, z22.d, z31.d +sub z22.d, z26.d, z30.d +add z26.d, z26.d, z30.d +sqrdmulh z30.d, z29.d, z2.d[0] +mul z29.d, z29.d,z3.d[0] +mla z29.d, P0/M, z30.d, z31.d +sub z30.d, z25.d, z29.d +add z25.d, z25.d, z29.d +sqrdmulh z29.d, z28.d, z2.d[0] +mul z28.d, z28.d,z3.d[0] +mla z28.d, P0/M, z29.d, z31.d +sub z29.d, z24.d, z28.d +add z24.d, z24.d, z28.d +sqrdmulh z28.d, z27.d, z2.d[0] +mul z27.d, z27.d,z3.d[0] +mla z27.d, P0/M, z28.d, z31.d +sub z28.d, z23.d, z27.d +add z23.d, z23.d, z27.d +sqrdmulh z27.d, z26.d, z0.d[0] +mul z26.d, z26.d,z1.d[0] +mla z26.d, P0/M, z27.d, z31.d +sub z27.d, z23.d, z26.d +add z23.d, z23.d, z26.d +sqrdmulh z26.d, z25.d, z0.d[0] +mul z25.d, z25.d,z1.d[0] +mla z25.d, P0/M, z26.d, z31.d +sub z26.d, z24.d, z25.d +add z24.d, z24.d, z25.d +sqrdmulh z25.d, z22.d, z0.d[1] +mul z22.d, z22.d,z1.d[1] +mla z22.d, P0/M, z25.d, z31.d +sub z25.d, z28.d, z22.d +add z28.d, z28.d, z22.d +sqrdmulh z22.d, z30.d, z0.d[1] +mul z30.d, z30.d,z1.d[1] +mla z30.d, P0/M, z22.d, z31.d +sub z22.d, z29.d, z30.d +add z29.d, z29.d, z30.d +sqrdmulh z30.d, z23.d, z14.d[0] +mul z23.d, z23.d,z15.d[0] +mla z23.d, P0/M, z30.d, z31.d +sub z30.d, z24.d, z23.d +add z24.d, z24.d, z23.d +str q24, [x0, #0] +str q30, [x0, #256] +sqrdmulh z30.d, z27.d, z14.d[1] +mul z27.d, z27.d,z15.d[1] +mla z27.d, P0/M, z30.d, z31.d +sub 
z30.d, z26.d, z27.d +add z26.d, z26.d, z27.d +str q26, [x0, #512] +str q30, [x0, #768] +sqrdmulh z30.d, z25.d, z12.d[1] +mul z25.d, z25.d,z13.d[1] +mla z25.d, P0/M, z30.d, z31.d +sub z30.d, z22.d, z25.d +add z22.d, z22.d, z25.d +str q22, [x0, #1536] +str q30, [x0, #1792] +sqrdmulh z30.d, z28.d, z12.d[0] +mul z28.d, z28.d,z13.d[0] +mla z28.d, P0/M, z30.d, z31.d +sub z30.d, z29.d, z28.d +add z29.d, z29.d, z28.d +str q29, [x0, #1024] +str q30, [x0, #1280] +ldr q30, [x0, #1808] +ldr q29, [x0, #1552] +ldr q28, [x0, #1040] +ldr q22, [x0, #1296] +ldr q25, [x0, #784] +ldr q26, [x0, #528] +ldr q27, [x0, #16] +ldr q24, [x0, #272] +sqrdmulh z23.d, z30.d, z2.d[0] +mul z30.d, z30.d,z3.d[0] +mla z30.d, P0/M, z23.d, z31.d +sub z23.d, z25.d, z30.d +add z25.d, z25.d, z30.d +sqrdmulh z30.d, z29.d, z2.d[0] +mul z29.d, z29.d,z3.d[0] +mla z29.d, P0/M, z30.d, z31.d +sub z30.d, z26.d, z29.d +add z26.d, z26.d, z29.d +sqrdmulh z29.d, z28.d, z2.d[0] +mul z28.d, z28.d,z3.d[0] +mla z28.d, P0/M, z29.d, z31.d +sub z29.d, z27.d, z28.d +add z27.d, z27.d, z28.d +sqrdmulh z28.d, z22.d, z2.d[0] +mul z22.d, z22.d,z3.d[0] +mla z22.d, P0/M, z28.d, z31.d +sub z28.d, z24.d, z22.d +add z24.d, z24.d, z22.d +sqrdmulh z22.d, z25.d, z0.d[0] +mul z25.d, z25.d,z1.d[0] +mla z25.d, P0/M, z22.d, z31.d +sub z22.d, z24.d, z25.d +add z24.d, z24.d, z25.d +sqrdmulh z25.d, z26.d, z0.d[0] +mul z26.d, z26.d,z1.d[0] +mla z26.d, P0/M, z25.d, z31.d +sub z25.d, z27.d, z26.d +add z27.d, z27.d, z26.d +sqrdmulh z26.d, z23.d, z0.d[1] +mul z23.d, z23.d,z1.d[1] +mla z23.d, P0/M, z26.d, z31.d +sub z26.d, z28.d, z23.d +add z28.d, z28.d, z23.d +sqrdmulh z23.d, z30.d, z0.d[1] +mul z30.d, z30.d,z1.d[1] +mla z30.d, P0/M, z23.d, z31.d +sub z23.d, z29.d, z30.d +add z29.d, z29.d, z30.d +sqrdmulh z30.d, z24.d, z14.d[0] +mul z24.d, z24.d,z15.d[0] +mla z24.d, P0/M, z30.d, z31.d +sub z30.d, z27.d, z24.d +add z27.d, z27.d, z24.d +str q27, [x0, #16] +str q30, [x0, #272] +sqrdmulh z30.d, z22.d, z14.d[1] +mul z22.d, z22.d,z15.d[1] +mla z22.d, P0/M, 
z30.d, z31.d +sub z30.d, z25.d, z22.d +add z25.d, z25.d, z22.d +str q25, [x0, #528] +str q30, [x0, #784] +sqrdmulh z30.d, z26.d, z12.d[1] +mul z26.d, z26.d,z13.d[1] +mla z26.d, P0/M, z30.d, z31.d +sub z30.d, z23.d, z26.d +add z23.d, z23.d, z26.d +str q23, [x0, #1552] +str q30, [x0, #1808] +sqrdmulh z30.d, z28.d, z12.d[0] +mul z28.d, z28.d,z13.d[0] +mla z28.d, P0/M, z30.d, z31.d +sub z30.d, z29.d, z28.d +add z29.d, z29.d, z28.d +str q29, [x0, #1040] +str q30, [x0, #1296] +ldr q30, [x0, #1824] +ldr q29, [x0, #1568] +ldr q28, [x0, #1056] +ldr q23, [x0, #1312] +ldr q26, [x0, #800] +ldr q25, [x0, #544] +ldr q22, [x0, #32] +ldr q27, [x0, #288] +sqrdmulh z24.d, z30.d, z2.d[0] +mul z30.d, z30.d,z3.d[0] +mla z30.d, P0/M, z24.d, z31.d +sub z24.d, z26.d, z30.d +add z26.d, z26.d, z30.d +sqrdmulh z30.d, z29.d, z2.d[0] +mul z29.d, z29.d,z3.d[0] +mla z29.d, P0/M, z30.d, z31.d +sub z30.d, z25.d, z29.d +add z25.d, z25.d, z29.d +sqrdmulh z29.d, z28.d, z2.d[0] +mul z28.d, z28.d,z3.d[0] +mla z28.d, P0/M, z29.d, z31.d +sub z29.d, z22.d, z28.d +add z22.d, z22.d, z28.d +sqrdmulh z28.d, z23.d, z2.d[0] +mul z23.d, z23.d,z3.d[0] +mla z23.d, P0/M, z28.d, z31.d +sub z28.d, z27.d, z23.d +add z27.d, z27.d, z23.d +sqrdmulh z23.d, z26.d, z0.d[0] +mul z26.d, z26.d,z1.d[0] +mla z26.d, P0/M, z23.d, z31.d +sub z23.d, z27.d, z26.d +add z27.d, z27.d, z26.d +sqrdmulh z26.d, z25.d, z0.d[0] +mul z25.d, z25.d,z1.d[0] +mla z25.d, P0/M, z26.d, z31.d +sub z26.d, z22.d, z25.d +add z22.d, z22.d, z25.d +sqrdmulh z25.d, z24.d, z0.d[1] +mul z24.d, z24.d,z1.d[1] +mla z24.d, P0/M, z25.d, z31.d +sub z25.d, z28.d, z24.d +add z28.d, z28.d, z24.d +sqrdmulh z24.d, z30.d, z0.d[1] +mul z30.d, z30.d,z1.d[1] +mla z30.d, P0/M, z24.d, z31.d +sub z24.d, z29.d, z30.d +add z29.d, z29.d, z30.d +sqrdmulh z30.d, z27.d, z14.d[0] +mul z27.d, z27.d,z15.d[0] +mla z27.d, P0/M, z30.d, z31.d +sub z30.d, z22.d, z27.d +add z22.d, z22.d, z27.d +str q22, [x0, #32] +str q30, [x0, #288] +sqrdmulh z30.d, z23.d, z14.d[1] +mul z23.d, z23.d,z15.d[1] 
+mla z23.d, P0/M, z30.d, z31.d +sub z30.d, z26.d, z23.d +add z26.d, z26.d, z23.d +str q26, [x0, #544] +str q30, [x0, #800] +sqrdmulh z30.d, z25.d, z12.d[1] +mul z25.d, z25.d,z13.d[1] +mla z25.d, P0/M, z30.d, z31.d +sub z30.d, z24.d, z25.d +add z24.d, z24.d, z25.d +str q24, [x0, #1568] +str q30, [x0, #1824] +sqrdmulh z30.d, z28.d, z12.d[0] +mul z28.d, z28.d,z13.d[0] +mla z28.d, P0/M, z30.d, z31.d +sub z30.d, z29.d, z28.d +add z29.d, z29.d, z28.d +str q29, [x0, #1056] +str q30, [x0, #1312] +ldr q30, [x0, #1840] +ldr q29, [x0, #1584] +ldr q28, [x0, #1072] +ldr q24, [x0, #1328] +ldr q25, [x0, #816] +ldr q26, [x0, #560] +ldr q23, [x0, #48] +ldr q22, [x0, #304] +sqrdmulh z27.d, z30.d, z2.d[0] +mul z30.d, z30.d,z3.d[0] +mla z30.d, P0/M, z27.d, z31.d +sub z27.d, z25.d, z30.d +add z25.d, z25.d, z30.d +sqrdmulh z30.d, z29.d, z2.d[0] +mul z29.d, z29.d,z3.d[0] +mla z29.d, P0/M, z30.d, z31.d +sub z30.d, z26.d, z29.d +add z26.d, z26.d, z29.d +sqrdmulh z29.d, z28.d, z2.d[0] +mul z28.d, z28.d,z3.d[0] +mla z28.d, P0/M, z29.d, z31.d +sub z29.d, z23.d, z28.d +add z23.d, z23.d, z28.d +sqrdmulh z28.d, z24.d, z2.d[0] +mul z24.d, z24.d,z3.d[0] +mla z24.d, P0/M, z28.d, z31.d +sub z28.d, z22.d, z24.d +add z22.d, z22.d, z24.d +sqrdmulh z24.d, z25.d, z0.d[0] +mul z25.d, z25.d,z1.d[0] +mla z25.d, P0/M, z24.d, z31.d +sub z24.d, z22.d, z25.d +add z22.d, z22.d, z25.d +sqrdmulh z25.d, z26.d, z0.d[0] +mul z26.d, z26.d,z1.d[0] +mla z26.d, P0/M, z25.d, z31.d +sub z25.d, z23.d, z26.d +add z23.d, z23.d, z26.d +sqrdmulh z26.d, z27.d, z0.d[1] +mul z27.d, z27.d,z1.d[1] +mla z27.d, P0/M, z26.d, z31.d +sub z26.d, z28.d, z27.d +add z28.d, z28.d, z27.d +sqrdmulh z27.d, z30.d, z0.d[1] +mul z30.d, z30.d,z1.d[1] +mla z30.d, P0/M, z27.d, z31.d +sub z27.d, z29.d, z30.d +add z29.d, z29.d, z30.d +sqrdmulh z30.d, z22.d, z14.d[0] +mul z22.d, z22.d,z15.d[0] +mla z22.d, P0/M, z30.d, z31.d +sub z30.d, z23.d, z22.d +add z23.d, z23.d, z22.d +str q23, [x0, #48] +str q30, [x0, #304] +sqrdmulh z30.d, z24.d, z14.d[1] +mul 
z24.d, z24.d,z15.d[1] +mla z24.d, P0/M, z30.d, z31.d +sub z30.d, z25.d, z24.d +add z25.d, z25.d, z24.d +str q25, [x0, #560] +str q30, [x0, #816] +sqrdmulh z30.d, z26.d, z12.d[1] +mul z26.d, z26.d,z13.d[1] +mla z26.d, P0/M, z30.d, z31.d +sub z30.d, z27.d, z26.d +add z27.d, z27.d, z26.d +str q27, [x0, #1584] +str q30, [x0, #1840] +sqrdmulh z30.d, z28.d, z12.d[0] +mul z28.d, z28.d,z13.d[0] +mla z28.d, P0/M, z30.d, z31.d +sub z30.d, z29.d, z28.d +add z29.d, z29.d, z28.d +str q29, [x0, #1072] +str q30, [x0, #1328] +ldr q30, [x0, #1856] +ldr q29, [x0, #1600] +ldr q28, [x0, #1088] +ldr q27, [x0, #1344] +ldr q26, [x0, #832] +ldr q25, [x0, #576] +ldr q24, [x0, #64] +ldr q23, [x0, #320] +sqrdmulh z22.d, z30.d, z2.d[0] +mul z30.d, z30.d,z3.d[0] +mla z30.d, P0/M, z22.d, z31.d +sub z22.d, z26.d, z30.d +add z26.d, z26.d, z30.d +sqrdmulh z30.d, z29.d, z2.d[0] +mul z29.d, z29.d,z3.d[0] +mla z29.d, P0/M, z30.d, z31.d +sub z30.d, z25.d, z29.d +add z25.d, z25.d, z29.d +sqrdmulh z29.d, z28.d, z2.d[0] +mul z28.d, z28.d,z3.d[0] +mla z28.d, P0/M, z29.d, z31.d +sub z29.d, z24.d, z28.d +add z24.d, z24.d, z28.d +sqrdmulh z28.d, z27.d, z2.d[0] +mul z27.d, z27.d,z3.d[0] +mla z27.d, P0/M, z28.d, z31.d +sub z28.d, z23.d, z27.d +add z23.d, z23.d, z27.d +sqrdmulh z27.d, z26.d, z0.d[0] +mul z26.d, z26.d,z1.d[0] +mla z26.d, P0/M, z27.d, z31.d +sub z27.d, z23.d, z26.d +add z23.d, z23.d, z26.d +sqrdmulh z26.d, z25.d, z0.d[0] +mul z25.d, z25.d,z1.d[0] +mla z25.d, P0/M, z26.d, z31.d +sub z26.d, z24.d, z25.d +add z24.d, z24.d, z25.d +sqrdmulh z25.d, z22.d, z0.d[1] +mul z22.d, z22.d,z1.d[1] +mla z22.d, P0/M, z25.d, z31.d +sub z25.d, z28.d, z22.d +add z28.d, z28.d, z22.d +sqrdmulh z22.d, z30.d, z0.d[1] +mul z30.d, z30.d,z1.d[1] +mla z30.d, P0/M, z22.d, z31.d +sub z22.d, z29.d, z30.d +add z29.d, z29.d, z30.d +sqrdmulh z30.d, z23.d, z14.d[0] +mul z23.d, z23.d,z15.d[0] +mla z23.d, P0/M, z30.d, z31.d +sub z30.d, z24.d, z23.d +add z24.d, z24.d, z23.d +str q24, [x0, #64] +str q30, [x0, #320] +sqrdmulh z30.d, 
z27.d, z14.d[1] +mul z27.d, z27.d,z15.d[1] +mla z27.d, P0/M, z30.d, z31.d +sub z30.d, z26.d, z27.d +add z26.d, z26.d, z27.d +str q26, [x0, #576] +str q30, [x0, #832] +sqrdmulh z30.d, z25.d, z12.d[1] +mul z25.d, z25.d,z13.d[1] +mla z25.d, P0/M, z30.d, z31.d +sub z30.d, z22.d, z25.d +add z22.d, z22.d, z25.d +str q22, [x0, #1600] +str q30, [x0, #1856] +sqrdmulh z30.d, z28.d, z12.d[0] +mul z28.d, z28.d,z13.d[0] +mla z28.d, P0/M, z30.d, z31.d +sub z30.d, z29.d, z28.d +add z29.d, z29.d, z28.d +str q29, [x0, #1088] +str q30, [x0, #1344] +ldr q30, [x0, #1872] +ldr q29, [x0, #1616] +ldr q28, [x0, #1104] +ldr q22, [x0, #1360] +ldr q25, [x0, #848] +ldr q26, [x0, #592] +ldr q27, [x0, #80] +ldr q24, [x0, #336] +sqrdmulh z23.d, z30.d, z2.d[0] +mul z30.d, z30.d,z3.d[0] +mla z30.d, P0/M, z23.d, z31.d +sub z23.d, z25.d, z30.d +add z25.d, z25.d, z30.d +sqrdmulh z30.d, z29.d, z2.d[0] +mul z29.d, z29.d,z3.d[0] +mla z29.d, P0/M, z30.d, z31.d +sub z30.d, z26.d, z29.d +add z26.d, z26.d, z29.d +sqrdmulh z29.d, z28.d, z2.d[0] +mul z28.d, z28.d,z3.d[0] +mla z28.d, P0/M, z29.d, z31.d +sub z29.d, z27.d, z28.d +add z27.d, z27.d, z28.d +sqrdmulh z28.d, z22.d, z2.d[0] +mul z22.d, z22.d,z3.d[0] +mla z22.d, P0/M, z28.d, z31.d +sub z28.d, z24.d, z22.d +add z24.d, z24.d, z22.d +sqrdmulh z22.d, z25.d, z0.d[0] +mul z25.d, z25.d,z1.d[0] +mla z25.d, P0/M, z22.d, z31.d +sub z22.d, z24.d, z25.d +add z24.d, z24.d, z25.d +sqrdmulh z25.d, z26.d, z0.d[0] +mul z26.d, z26.d,z1.d[0] +mla z26.d, P0/M, z25.d, z31.d +sub z25.d, z27.d, z26.d +add z27.d, z27.d, z26.d +sqrdmulh z26.d, z23.d, z0.d[1] +mul z23.d, z23.d,z1.d[1] +mla z23.d, P0/M, z26.d, z31.d +sub z26.d, z28.d, z23.d +add z28.d, z28.d, z23.d +sqrdmulh z23.d, z30.d, z0.d[1] +mul z30.d, z30.d,z1.d[1] +mla z30.d, P0/M, z23.d, z31.d +sub z23.d, z29.d, z30.d +add z29.d, z29.d, z30.d +sqrdmulh z30.d, z24.d, z14.d[0] +mul z24.d, z24.d,z15.d[0] +mla z24.d, P0/M, z30.d, z31.d +sub z30.d, z27.d, z24.d +add z27.d, z27.d, z24.d +str q27, [x0, #80] +str q30, [x0, 
#336] +sqrdmulh z30.d, z22.d, z14.d[1] +mul z22.d, z22.d,z15.d[1] +mla z22.d, P0/M, z30.d, z31.d +sub z30.d, z25.d, z22.d +add z25.d, z25.d, z22.d +str q25, [x0, #592] +str q30, [x0, #848] +sqrdmulh z30.d, z26.d, z12.d[1] +mul z26.d, z26.d,z13.d[1] +mla z26.d, P0/M, z30.d, z31.d +sub z30.d, z23.d, z26.d +add z23.d, z23.d, z26.d +str q23, [x0, #1616] +str q30, [x0, #1872] +sqrdmulh z30.d, z28.d, z12.d[0] +mul z28.d, z28.d,z13.d[0] +mla z28.d, P0/M, z30.d, z31.d +sub z30.d, z29.d, z28.d +add z29.d, z29.d, z28.d +str q29, [x0, #1104] +str q30, [x0, #1360] +ldr q30, [x0, #1888] +ldr q29, [x0, #1632] +ldr q28, [x0, #1120] +ldr q23, [x0, #1376] +ldr q26, [x0, #864] +ldr q25, [x0, #608] +ldr q22, [x0, #96] +ldr q27, [x0, #352] +sqrdmulh z24.d, z30.d, z2.d[0] +mul z30.d, z30.d,z3.d[0] +mla z30.d, P0/M, z24.d, z31.d +sub z24.d, z26.d, z30.d +add z26.d, z26.d, z30.d +sqrdmulh z30.d, z29.d, z2.d[0] +mul z29.d, z29.d,z3.d[0] +mla z29.d, P0/M, z30.d, z31.d +sub z30.d, z25.d, z29.d +add z25.d, z25.d, z29.d +sqrdmulh z29.d, z28.d, z2.d[0] +mul z28.d, z28.d,z3.d[0] +mla z28.d, P0/M, z29.d, z31.d +sub z29.d, z22.d, z28.d +add z22.d, z22.d, z28.d +sqrdmulh z28.d, z23.d, z2.d[0] +mul z23.d, z23.d,z3.d[0] +mla z23.d, P0/M, z28.d, z31.d +sub z28.d, z27.d, z23.d +add z27.d, z27.d, z23.d +sqrdmulh z23.d, z26.d, z0.d[0] +mul z26.d, z26.d,z1.d[0] +mla z26.d, P0/M, z23.d, z31.d +sub z23.d, z27.d, z26.d +add z27.d, z27.d, z26.d +sqrdmulh z26.d, z25.d, z0.d[0] +mul z25.d, z25.d,z1.d[0] +mla z25.d, P0/M, z26.d, z31.d +sub z26.d, z22.d, z25.d +add z22.d, z22.d, z25.d +sqrdmulh z25.d, z24.d, z0.d[1] +mul z24.d, z24.d,z1.d[1] +mla z24.d, P0/M, z25.d, z31.d +sub z25.d, z28.d, z24.d +add z28.d, z28.d, z24.d +sqrdmulh z24.d, z30.d, z0.d[1] +mul z30.d, z30.d,z1.d[1] +mla z30.d, P0/M, z24.d, z31.d +sub z24.d, z29.d, z30.d +add z29.d, z29.d, z30.d +sqrdmulh z30.d, z27.d, z14.d[0] +mul z27.d, z27.d,z15.d[0] +mla z27.d, P0/M, z30.d, z31.d +sub z30.d, z22.d, z27.d +add z22.d, z22.d, z27.d +str q22, [x0, 
#96] +str q30, [x0, #352] +sqrdmulh z30.d, z23.d, z14.d[1] +mul z23.d, z23.d,z15.d[1] +mla z23.d, P0/M, z30.d, z31.d +sub z30.d, z26.d, z23.d +add z26.d, z26.d, z23.d +str q26, [x0, #608] +str q30, [x0, #864] +sqrdmulh z30.d, z25.d, z12.d[1] +mul z25.d, z25.d,z13.d[1] +mla z25.d, P0/M, z30.d, z31.d +sub z30.d, z24.d, z25.d +add z24.d, z24.d, z25.d +str q24, [x0, #1632] +str q30, [x0, #1888] +sqrdmulh z30.d, z28.d, z12.d[0] +mul z28.d, z28.d,z13.d[0] +mla z28.d, P0/M, z30.d, z31.d +sub z30.d, z29.d, z28.d +add z29.d, z29.d, z28.d +str q29, [x0, #1120] +str q30, [x0, #1376] +ldr q30, [x0, #1904] +ldr q29, [x0, #1648] +ldr q28, [x0, #1136] +ldr q24, [x0, #1392] +ldr q25, [x0, #880] +ldr q26, [x0, #624] +ldr q23, [x0, #112] +ldr q22, [x0, #368] +sqrdmulh z27.d, z30.d, z2.d[0] +mul z30.d, z30.d,z3.d[0] +mla z30.d, P0/M, z27.d, z31.d +sub z27.d, z25.d, z30.d +add z25.d, z25.d, z30.d +sqrdmulh z30.d, z29.d, z2.d[0] +mul z29.d, z29.d,z3.d[0] +mla z29.d, P0/M, z30.d, z31.d +sub z30.d, z26.d, z29.d +add z26.d, z26.d, z29.d +sqrdmulh z29.d, z28.d, z2.d[0] +mul z28.d, z28.d,z3.d[0] +mla z28.d, P0/M, z29.d, z31.d +sub z29.d, z23.d, z28.d +add z23.d, z23.d, z28.d +sqrdmulh z28.d, z24.d, z2.d[0] +mul z24.d, z24.d,z3.d[0] +mla z24.d, P0/M, z28.d, z31.d +sub z28.d, z22.d, z24.d +add z22.d, z22.d, z24.d +sqrdmulh z24.d, z25.d, z0.d[0] +mul z25.d, z25.d,z1.d[0] +mla z25.d, P0/M, z24.d, z31.d +sub z24.d, z22.d, z25.d +add z22.d, z22.d, z25.d +sqrdmulh z25.d, z26.d, z0.d[0] +mul z26.d, z26.d,z1.d[0] +mla z26.d, P0/M, z25.d, z31.d +sub z25.d, z23.d, z26.d +add z23.d, z23.d, z26.d +sqrdmulh z26.d, z27.d, z0.d[1] +mul z27.d, z27.d,z1.d[1] +mla z27.d, P0/M, z26.d, z31.d +sub z26.d, z28.d, z27.d +add z28.d, z28.d, z27.d +sqrdmulh z27.d, z30.d, z0.d[1] +mul z30.d, z30.d,z1.d[1] +mla z30.d, P0/M, z27.d, z31.d +sub z27.d, z29.d, z30.d +add z29.d, z29.d, z30.d +sqrdmulh z30.d, z22.d, z14.d[0] +mul z22.d, z22.d,z15.d[0] +mla z22.d, P0/M, z30.d, z31.d +sub z30.d, z23.d, z22.d +add z23.d, z23.d, 
z22.d +str q23, [x0, #112] +str q30, [x0, #368] +sqrdmulh z30.d, z24.d, z14.d[1] +mul z24.d, z24.d,z15.d[1] +mla z24.d, P0/M, z30.d, z31.d +sub z30.d, z25.d, z24.d +add z25.d, z25.d, z24.d +str q25, [x0, #624] +str q30, [x0, #880] +sqrdmulh z30.d, z26.d, z12.d[1] +mul z26.d, z26.d,z13.d[1] +mla z26.d, P0/M, z30.d, z31.d +sub z30.d, z27.d, z26.d +add z27.d, z27.d, z26.d +str q27, [x0, #1648] +str q30, [x0, #1904] +sqrdmulh z30.d, z28.d, z12.d[0] +mul z28.d, z28.d,z13.d[0] +mla z28.d, P0/M, z30.d, z31.d +sub z30.d, z29.d, z28.d +add z29.d, z29.d, z28.d +str q29, [x0, #1136] +str q30, [x0, #1392] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q6, [x17, #+160] +ldr q7, [x17, #+176] +ldr q8, [x17, #+192] +ldr q9, [x17, #+208] +ldr q10, [x17, #+224] +ldr q11, [x17, #+240] +ldr q16, [x0, #240] +ldr q17, [x0, #208] +ldr q18, [x0, #144] +ldr q19, [x0, #176] +ldr q20, [x0, #112] +ldr q21, [x0, #80] +ldr q22, [x0, #16] +ldr q23, [x0, #48] +sqrdmulh z24.d, z16.d, z5.d[0] +mul z16.d, z16.d,z4.d[0] +mla z16.d, P0/M, z24.d, z31.d +sub z24.d, z20.d, z16.d +add z20.d, z20.d, z16.d +sqrdmulh z16.d, z17.d, z5.d[0] +mul z17.d, z17.d,z4.d[0] +mla z17.d, P0/M, z16.d, z31.d +sub z16.d, z21.d, z17.d +add z21.d, z21.d, z17.d +sqrdmulh z17.d, z18.d, z5.d[0] +mul z18.d, z18.d,z4.d[0] +mla z18.d, P0/M, z17.d, z31.d +sub z17.d, z22.d, z18.d +add z22.d, z22.d, z18.d +sqrdmulh z18.d, z19.d, z5.d[0] +mul z19.d, z19.d,z4.d[0] +mla z19.d, P0/M, z18.d, z31.d +sub z18.d, z23.d, z19.d +add z23.d, z23.d, z19.d +sqrdmulh z19.d, z20.d, z7.d[0] +mul z20.d, z20.d,z6.d[0] +mla z20.d, P0/M, z19.d, z31.d +sub z19.d, z23.d, z20.d +add z23.d, z23.d, z20.d +sqrdmulh z20.d, z21.d, z7.d[0] +mul z21.d, z21.d,z6.d[0] +mla z21.d, P0/M, z20.d, z31.d +sub z20.d, z22.d, z21.d +add z22.d, z22.d, z21.d +sqrdmulh z21.d, z24.d, z7.d[1] +mul z24.d, z24.d,z6.d[1] +mla z24.d, P0/M, z21.d, z31.d +sub z21.d, z18.d, z24.d +add z18.d, z18.d, z24.d +sqrdmulh z24.d, z16.d, z7.d[1] +mul z16.d, z16.d,z6.d[1] +mla z16.d, P0/M, z24.d, 
z31.d +sub z24.d, z17.d, z16.d +add z17.d, z17.d, z16.d +sqrdmulh z16.d, z23.d, z9.d[0] +mul z23.d, z23.d,z8.d[0] +mla z23.d, P0/M, z16.d, z31.d +sub z16.d, z22.d, z23.d +add z22.d, z22.d, z23.d +str q22, [x0, #16] +str q16, [x0, #48] +sqrdmulh z16.d, z19.d, z9.d[1] +mul z19.d, z19.d,z8.d[1] +mla z19.d, P0/M, z16.d, z31.d +sub z16.d, z20.d, z19.d +add z20.d, z20.d, z19.d +str q20, [x0, #80] +str q16, [x0, #112] +sqrdmulh z16.d, z21.d, z11.d[1] +mul z21.d, z21.d,z10.d[1] +mla z21.d, P0/M, z16.d, z31.d +sub z16.d, z24.d, z21.d +add z24.d, z24.d, z21.d +str q24, [x0, #208] +str q16, [x0, #240] +sqrdmulh z16.d, z18.d, z11.d[0] +mul z18.d, z18.d,z10.d[0] +mla z18.d, P0/M, z16.d, z31.d +sub z16.d, z17.d, z18.d +add z17.d, z17.d, z18.d +str q17, [x0, #144] +str q16, [x0, #176] +ldr q16, [x0, #224] +ldr q17, [x0, #192] +ldr q18, [x0, #128] +ldr q24, [x0, #160] +ldr q21, [x0, #96] +ldr q20, [x0, #64] +ldr q19, [x0, #0] +ldr q22, [x0, #32] +sqrdmulh z23.d, z16.d, z5.d[0] +mul z16.d, z16.d,z4.d[0] +mla z16.d, P0/M, z23.d, z31.d +sub z23.d, z21.d, z16.d +add z21.d, z21.d, z16.d +sqrdmulh z16.d, z17.d, z5.d[0] +mul z17.d, z17.d,z4.d[0] +mla z17.d, P0/M, z16.d, z31.d +sub z16.d, z20.d, z17.d +add z20.d, z20.d, z17.d +sqrdmulh z17.d, z18.d, z5.d[0] +mul z18.d, z18.d,z4.d[0] +mla z18.d, P0/M, z17.d, z31.d +sub z17.d, z19.d, z18.d +add z19.d, z19.d, z18.d +sqrdmulh z18.d, z24.d, z5.d[0] +mul z24.d, z24.d,z4.d[0] +mla z24.d, P0/M, z18.d, z31.d +sub z18.d, z22.d, z24.d +add z22.d, z22.d, z24.d +sqrdmulh z24.d, z21.d, z7.d[0] +mul z21.d, z21.d,z6.d[0] +mla z21.d, P0/M, z24.d, z31.d +sub z24.d, z22.d, z21.d +add z22.d, z22.d, z21.d +ldr q3, [x17, #+256] +ldr q2, [x17, #+272] +ldr q1, [x17, #+288] +ldr q0, [x17, #+304] +ldr q15, [x17, #+320] +ldr q14, [x17, #+336] +ldr q13, [x17, #+352] +ldr q12, [x17, #+368] +sqrdmulh z21.d, z20.d, z7.d[0] +mul z20.d, z20.d,z6.d[0] +mla z20.d, P0/M, z21.d, z31.d +sub z21.d, z19.d, z20.d +add z19.d, z19.d, z20.d +sqrdmulh z20.d, z23.d, z7.d[1] +mul 
z23.d, z23.d,z6.d[1] +mla z23.d, P0/M, z20.d, z31.d +sub z20.d, z18.d, z23.d +add z18.d, z18.d, z23.d +sqrdmulh z23.d, z16.d, z7.d[1] +mul z16.d, z16.d,z6.d[1] +mla z16.d, P0/M, z23.d, z31.d +sub z23.d, z17.d, z16.d +add z17.d, z17.d, z16.d +sqrdmulh z16.d, z22.d, z9.d[0] +mul z22.d, z22.d,z8.d[0] +mla z22.d, P0/M, z16.d, z31.d +sub z16.d, z19.d, z22.d +add z19.d, z19.d, z22.d +str q19, [x0, #0] +str q16, [x0, #32] +sqrdmulh z16.d, z24.d, z9.d[1] +mul z24.d, z24.d,z8.d[1] +mla z24.d, P0/M, z16.d, z31.d +sub z16.d, z21.d, z24.d +add z21.d, z21.d, z24.d +str q21, [x0, #64] +str q16, [x0, #96] +sqrdmulh z16.d, z20.d, z11.d[1] +mul z20.d, z20.d,z10.d[1] +mla z20.d, P0/M, z16.d, z31.d +sub z16.d, z23.d, z20.d +add z23.d, z23.d, z20.d +str q23, [x0, #192] +str q16, [x0, #224] +sqrdmulh z16.d, z18.d, z11.d[0] +mul z18.d, z18.d,z10.d[0] +mla z18.d, P0/M, z16.d, z31.d +sub z16.d, z17.d, z18.d +add z17.d, z17.d, z18.d +str q17, [x0, #128] +str q16, [x0, #160] +ldr q16, [x0, #496] +ldr q17, [x0, #464] +ldr q18, [x0, #400] +ldr q23, [x0, #432] +ldr q20, [x0, #368] +ldr q21, [x0, #336] +ldr q24, [x0, #272] +ldr q19, [x0, #304] +sqrdmulh z22.d, z16.d, z2.d[0] +mul z16.d, z16.d,z3.d[0] +mla z16.d, P0/M, z22.d, z31.d +sub z22.d, z20.d, z16.d +add z20.d, z20.d, z16.d +sqrdmulh z16.d, z17.d, z2.d[0] +mul z17.d, z17.d,z3.d[0] +mla z17.d, P0/M, z16.d, z31.d +sub z16.d, z21.d, z17.d +add z21.d, z21.d, z17.d +sqrdmulh z17.d, z18.d, z2.d[0] +mul z18.d, z18.d,z3.d[0] +mla z18.d, P0/M, z17.d, z31.d +sub z17.d, z24.d, z18.d +add z24.d, z24.d, z18.d +sqrdmulh z18.d, z23.d, z2.d[0] +mul z23.d, z23.d,z3.d[0] +mla z23.d, P0/M, z18.d, z31.d +sub z18.d, z19.d, z23.d +add z19.d, z19.d, z23.d +sqrdmulh z23.d, z20.d, z0.d[0] +mul z20.d, z20.d,z1.d[0] +mla z20.d, P0/M, z23.d, z31.d +sub z23.d, z19.d, z20.d +add z19.d, z19.d, z20.d +sqrdmulh z20.d, z21.d, z0.d[0] +mul z21.d, z21.d,z1.d[0] +mla z21.d, P0/M, z20.d, z31.d +sub z20.d, z24.d, z21.d +add z24.d, z24.d, z21.d +sqrdmulh z21.d, z22.d, z0.d[1] 
+mul z22.d, z22.d,z1.d[1] +mla z22.d, P0/M, z21.d, z31.d +sub z21.d, z18.d, z22.d +add z18.d, z18.d, z22.d +sqrdmulh z22.d, z16.d, z0.d[1] +mul z16.d, z16.d,z1.d[1] +mla z16.d, P0/M, z22.d, z31.d +sub z22.d, z17.d, z16.d +add z17.d, z17.d, z16.d +sqrdmulh z16.d, z19.d, z14.d[0] +mul z19.d, z19.d,z15.d[0] +mla z19.d, P0/M, z16.d, z31.d +sub z16.d, z24.d, z19.d +add z24.d, z24.d, z19.d +str q24, [x0, #272] +str q16, [x0, #304] +sqrdmulh z16.d, z23.d, z14.d[1] +mul z23.d, z23.d,z15.d[1] +mla z23.d, P0/M, z16.d, z31.d +sub z16.d, z20.d, z23.d +add z20.d, z20.d, z23.d +str q20, [x0, #336] +str q16, [x0, #368] +sqrdmulh z16.d, z21.d, z12.d[1] +mul z21.d, z21.d,z13.d[1] +mla z21.d, P0/M, z16.d, z31.d +sub z16.d, z22.d, z21.d +add z22.d, z22.d, z21.d +str q22, [x0, #464] +str q16, [x0, #496] +sqrdmulh z16.d, z18.d, z12.d[0] +mul z18.d, z18.d,z13.d[0] +mla z18.d, P0/M, z16.d, z31.d +sub z16.d, z17.d, z18.d +add z17.d, z17.d, z18.d +str q17, [x0, #400] +str q16, [x0, #432] +ldr q16, [x0, #480] +ldr q17, [x0, #448] +ldr q18, [x0, #384] +ldr q22, [x0, #416] +ldr q21, [x0, #352] +ldr q20, [x0, #320] +ldr q23, [x0, #256] +ldr q24, [x0, #288] +sqrdmulh z19.d, z16.d, z2.d[0] +mul z16.d, z16.d,z3.d[0] +mla z16.d, P0/M, z19.d, z31.d +sub z19.d, z21.d, z16.d +add z21.d, z21.d, z16.d +sqrdmulh z16.d, z17.d, z2.d[0] +mul z17.d, z17.d,z3.d[0] +mla z17.d, P0/M, z16.d, z31.d +sub z16.d, z20.d, z17.d +add z20.d, z20.d, z17.d +sqrdmulh z17.d, z18.d, z2.d[0] +mul z18.d, z18.d,z3.d[0] +mla z18.d, P0/M, z17.d, z31.d +sub z17.d, z23.d, z18.d +add z23.d, z23.d, z18.d +sqrdmulh z18.d, z22.d, z2.d[0] +mul z22.d, z22.d,z3.d[0] +mla z22.d, P0/M, z18.d, z31.d +sub z18.d, z24.d, z22.d +add z24.d, z24.d, z22.d +sqrdmulh z22.d, z21.d, z0.d[0] +mul z21.d, z21.d,z1.d[0] +mla z21.d, P0/M, z22.d, z31.d +sub z22.d, z24.d, z21.d +add z24.d, z24.d, z21.d +ldr q11, [x17, #+384] +ldr q10, [x17, #+400] +ldr q9, [x17, #+416] +ldr q8, [x17, #+432] +ldr q7, [x17, #+448] +ldr q6, [x17, #+464] +ldr q5, [x17, #+480] 
+ldr q4, [x17, #+496] +sqrdmulh z21.d, z20.d, z0.d[0] +mul z20.d, z20.d,z1.d[0] +mla z20.d, P0/M, z21.d, z31.d +sub z21.d, z23.d, z20.d +add z23.d, z23.d, z20.d +sqrdmulh z20.d, z19.d, z0.d[1] +mul z19.d, z19.d,z1.d[1] +mla z19.d, P0/M, z20.d, z31.d +sub z20.d, z18.d, z19.d +add z18.d, z18.d, z19.d +sqrdmulh z19.d, z16.d, z0.d[1] +mul z16.d, z16.d,z1.d[1] +mla z16.d, P0/M, z19.d, z31.d +sub z19.d, z17.d, z16.d +add z17.d, z17.d, z16.d +sqrdmulh z16.d, z24.d, z14.d[0] +mul z24.d, z24.d,z15.d[0] +mla z24.d, P0/M, z16.d, z31.d +sub z16.d, z23.d, z24.d +add z23.d, z23.d, z24.d +str q23, [x0, #256] +str q16, [x0, #288] +sqrdmulh z16.d, z22.d, z14.d[1] +mul z22.d, z22.d,z15.d[1] +mla z22.d, P0/M, z16.d, z31.d +sub z16.d, z21.d, z22.d +add z21.d, z21.d, z22.d +str q21, [x0, #320] +str q16, [x0, #352] +sqrdmulh z16.d, z20.d, z12.d[1] +mul z20.d, z20.d,z13.d[1] +mla z20.d, P0/M, z16.d, z31.d +sub z16.d, z19.d, z20.d +add z19.d, z19.d, z20.d +str q19, [x0, #448] +str q16, [x0, #480] +sqrdmulh z16.d, z18.d, z12.d[0] +mul z18.d, z18.d,z13.d[0] +mla z18.d, P0/M, z16.d, z31.d +sub z16.d, z17.d, z18.d +add z17.d, z17.d, z18.d +str q17, [x0, #384] +str q16, [x0, #416] +ldr q16, [x0, #752] +ldr q17, [x0, #720] +ldr q18, [x0, #656] +ldr q19, [x0, #688] +ldr q20, [x0, #624] +ldr q21, [x0, #592] +ldr q22, [x0, #528] +ldr q23, [x0, #560] +sqrdmulh z24.d, z16.d, z10.d[0] +mul z16.d, z16.d,z11.d[0] +mla z16.d, P0/M, z24.d, z31.d +sub z24.d, z20.d, z16.d +add z20.d, z20.d, z16.d +sqrdmulh z16.d, z17.d, z10.d[0] +mul z17.d, z17.d,z11.d[0] +mla z17.d, P0/M, z16.d, z31.d +sub z16.d, z21.d, z17.d +add z21.d, z21.d, z17.d +sqrdmulh z17.d, z18.d, z10.d[0] +mul z18.d, z18.d,z11.d[0] +mla z18.d, P0/M, z17.d, z31.d +sub z17.d, z22.d, z18.d +add z22.d, z22.d, z18.d +sqrdmulh z18.d, z19.d, z10.d[0] +mul z19.d, z19.d,z11.d[0] +mla z19.d, P0/M, z18.d, z31.d +sub z18.d, z23.d, z19.d +add z23.d, z23.d, z19.d +sqrdmulh z19.d, z20.d, z8.d[0] +mul z20.d, z20.d,z9.d[0] +mla z20.d, P0/M, z19.d, z31.d +sub 
z19.d, z23.d, z20.d +add z23.d, z23.d, z20.d +sqrdmulh z20.d, z21.d, z8.d[0] +mul z21.d, z21.d,z9.d[0] +mla z21.d, P0/M, z20.d, z31.d +sub z20.d, z22.d, z21.d +add z22.d, z22.d, z21.d +sqrdmulh z21.d, z24.d, z8.d[1] +mul z24.d, z24.d,z9.d[1] +mla z24.d, P0/M, z21.d, z31.d +sub z21.d, z18.d, z24.d +add z18.d, z18.d, z24.d +sqrdmulh z24.d, z16.d, z8.d[1] +mul z16.d, z16.d,z9.d[1] +mla z16.d, P0/M, z24.d, z31.d +sub z24.d, z17.d, z16.d +add z17.d, z17.d, z16.d +sqrdmulh z16.d, z23.d, z6.d[0] +mul z23.d, z23.d,z7.d[0] +mla z23.d, P0/M, z16.d, z31.d +sub z16.d, z22.d, z23.d +add z22.d, z22.d, z23.d +str q22, [x0, #528] +str q16, [x0, #560] +sqrdmulh z16.d, z19.d, z6.d[1] +mul z19.d, z19.d,z7.d[1] +mla z19.d, P0/M, z16.d, z31.d +sub z16.d, z20.d, z19.d +add z20.d, z20.d, z19.d +str q20, [x0, #592] +str q16, [x0, #624] +sqrdmulh z16.d, z21.d, z4.d[1] +mul z21.d, z21.d,z5.d[1] +mla z21.d, P0/M, z16.d, z31.d +sub z16.d, z24.d, z21.d +add z24.d, z24.d, z21.d +str q24, [x0, #720] +str q16, [x0, #752] +sqrdmulh z16.d, z18.d, z4.d[0] +mul z18.d, z18.d,z5.d[0] +mla z18.d, P0/M, z16.d, z31.d +sub z16.d, z17.d, z18.d +add z17.d, z17.d, z18.d +str q17, [x0, #656] +str q16, [x0, #688] +ldr q16, [x0, #736] +ldr q17, [x0, #704] +ldr q18, [x0, #640] +ldr q24, [x0, #672] +ldr q21, [x0, #608] +ldr q20, [x0, #576] +ldr q19, [x0, #512] +ldr q22, [x0, #544] +sqrdmulh z23.d, z16.d, z10.d[0] +mul z16.d, z16.d,z11.d[0] +mla z16.d, P0/M, z23.d, z31.d +sub z23.d, z21.d, z16.d +add z21.d, z21.d, z16.d +sqrdmulh z16.d, z17.d, z10.d[0] +mul z17.d, z17.d,z11.d[0] +mla z17.d, P0/M, z16.d, z31.d +sub z16.d, z20.d, z17.d +add z20.d, z20.d, z17.d +sqrdmulh z17.d, z18.d, z10.d[0] +mul z18.d, z18.d,z11.d[0] +mla z18.d, P0/M, z17.d, z31.d +sub z17.d, z19.d, z18.d +add z19.d, z19.d, z18.d +sqrdmulh z18.d, z24.d, z10.d[0] +mul z24.d, z24.d,z11.d[0] +mla z24.d, P0/M, z18.d, z31.d +sub z18.d, z22.d, z24.d +add z22.d, z22.d, z24.d +sqrdmulh z24.d, z21.d, z8.d[0] +mul z21.d, z21.d,z9.d[0] +mla z21.d, P0/M, 
z24.d, z31.d +sub z24.d, z22.d, z21.d +add z22.d, z22.d, z21.d +ldr q12, [x17, #+512] +ldr q13, [x17, #+528] +ldr q14, [x17, #+544] +ldr q15, [x17, #+560] +ldr q0, [x17, #+576] +ldr q1, [x17, #+592] +ldr q2, [x17, #+608] +ldr q3, [x17, #+624] +sqrdmulh z21.d, z20.d, z8.d[0] +mul z20.d, z20.d,z9.d[0] +mla z20.d, P0/M, z21.d, z31.d +sub z21.d, z19.d, z20.d +add z19.d, z19.d, z20.d +sqrdmulh z20.d, z23.d, z8.d[1] +mul z23.d, z23.d,z9.d[1] +mla z23.d, P0/M, z20.d, z31.d +sub z20.d, z18.d, z23.d +add z18.d, z18.d, z23.d +sqrdmulh z23.d, z16.d, z8.d[1] +mul z16.d, z16.d,z9.d[1] +mla z16.d, P0/M, z23.d, z31.d +sub z23.d, z17.d, z16.d +add z17.d, z17.d, z16.d +sqrdmulh z16.d, z22.d, z6.d[0] +mul z22.d, z22.d,z7.d[0] +mla z22.d, P0/M, z16.d, z31.d +sub z16.d, z19.d, z22.d +add z19.d, z19.d, z22.d +str q19, [x0, #512] +str q16, [x0, #544] +sqrdmulh z16.d, z24.d, z6.d[1] +mul z24.d, z24.d,z7.d[1] +mla z24.d, P0/M, z16.d, z31.d +sub z16.d, z21.d, z24.d +add z21.d, z21.d, z24.d +str q21, [x0, #576] +str q16, [x0, #608] +sqrdmulh z16.d, z20.d, z4.d[1] +mul z20.d, z20.d,z5.d[1] +mla z20.d, P0/M, z16.d, z31.d +sub z16.d, z23.d, z20.d +add z23.d, z23.d, z20.d +str q23, [x0, #704] +str q16, [x0, #736] +sqrdmulh z16.d, z18.d, z4.d[0] +mul z18.d, z18.d,z5.d[0] +mla z18.d, P0/M, z16.d, z31.d +sub z16.d, z17.d, z18.d +add z17.d, z17.d, z18.d +str q17, [x0, #640] +str q16, [x0, #672] +ldr q16, [x0, #1008] +ldr q17, [x0, #976] +ldr q18, [x0, #912] +ldr q23, [x0, #944] +ldr q20, [x0, #880] +ldr q21, [x0, #848] +ldr q24, [x0, #784] +ldr q19, [x0, #816] +sqrdmulh z22.d, z16.d, z13.d[0] +mul z16.d, z16.d,z12.d[0] +mla z16.d, P0/M, z22.d, z31.d +sub z22.d, z20.d, z16.d +add z20.d, z20.d, z16.d +sqrdmulh z16.d, z17.d, z13.d[0] +mul z17.d, z17.d,z12.d[0] +mla z17.d, P0/M, z16.d, z31.d +sub z16.d, z21.d, z17.d +add z21.d, z21.d, z17.d +sqrdmulh z17.d, z18.d, z13.d[0] +mul z18.d, z18.d,z12.d[0] +mla z18.d, P0/M, z17.d, z31.d +sub z17.d, z24.d, z18.d +add z24.d, z24.d, z18.d +sqrdmulh z18.d, z23.d, 
z13.d[0] +mul z23.d, z23.d,z12.d[0] +mla z23.d, P0/M, z18.d, z31.d +sub z18.d, z19.d, z23.d +add z19.d, z19.d, z23.d +sqrdmulh z23.d, z20.d, z15.d[0] +mul z20.d, z20.d,z14.d[0] +mla z20.d, P0/M, z23.d, z31.d +sub z23.d, z19.d, z20.d +add z19.d, z19.d, z20.d +sqrdmulh z20.d, z21.d, z15.d[0] +mul z21.d, z21.d,z14.d[0] +mla z21.d, P0/M, z20.d, z31.d +sub z20.d, z24.d, z21.d +add z24.d, z24.d, z21.d +sqrdmulh z21.d, z22.d, z15.d[1] +mul z22.d, z22.d,z14.d[1] +mla z22.d, P0/M, z21.d, z31.d +sub z21.d, z18.d, z22.d +add z18.d, z18.d, z22.d +sqrdmulh z22.d, z16.d, z15.d[1] +mul z16.d, z16.d,z14.d[1] +mla z16.d, P0/M, z22.d, z31.d +sub z22.d, z17.d, z16.d +add z17.d, z17.d, z16.d +sqrdmulh z16.d, z19.d, z1.d[0] +mul z19.d, z19.d,z0.d[0] +mla z19.d, P0/M, z16.d, z31.d +sub z16.d, z24.d, z19.d +add z24.d, z24.d, z19.d +str q24, [x0, #784] +str q16, [x0, #816] +sqrdmulh z16.d, z23.d, z1.d[1] +mul z23.d, z23.d,z0.d[1] +mla z23.d, P0/M, z16.d, z31.d +sub z16.d, z20.d, z23.d +add z20.d, z20.d, z23.d +str q20, [x0, #848] +str q16, [x0, #880] +sqrdmulh z16.d, z21.d, z3.d[1] +mul z21.d, z21.d,z2.d[1] +mla z21.d, P0/M, z16.d, z31.d +sub z16.d, z22.d, z21.d +add z22.d, z22.d, z21.d +str q22, [x0, #976] +str q16, [x0, #1008] +sqrdmulh z16.d, z18.d, z3.d[0] +mul z18.d, z18.d,z2.d[0] +mla z18.d, P0/M, z16.d, z31.d +sub z16.d, z17.d, z18.d +add z17.d, z17.d, z18.d +str q17, [x0, #912] +str q16, [x0, #944] +ldr q16, [x0, #992] +ldr q17, [x0, #960] +ldr q18, [x0, #896] +ldr q22, [x0, #928] +ldr q21, [x0, #864] +ldr q20, [x0, #832] +ldr q23, [x0, #768] +ldr q24, [x0, #800] +sqrdmulh z19.d, z16.d, z13.d[0] +mul z16.d, z16.d,z12.d[0] +mla z16.d, P0/M, z19.d, z31.d +sub z19.d, z21.d, z16.d +add z21.d, z21.d, z16.d +sqrdmulh z16.d, z17.d, z13.d[0] +mul z17.d, z17.d,z12.d[0] +mla z17.d, P0/M, z16.d, z31.d +sub z16.d, z20.d, z17.d +add z20.d, z20.d, z17.d +sqrdmulh z17.d, z18.d, z13.d[0] +mul z18.d, z18.d,z12.d[0] +mla z18.d, P0/M, z17.d, z31.d +sub z17.d, z23.d, z18.d +add z23.d, z23.d, z18.d 
+sqrdmulh z18.d, z22.d, z13.d[0] +mul z22.d, z22.d,z12.d[0] +mla z22.d, P0/M, z18.d, z31.d +sub z18.d, z24.d, z22.d +add z24.d, z24.d, z22.d +sqrdmulh z22.d, z21.d, z15.d[0] +mul z21.d, z21.d,z14.d[0] +mla z21.d, P0/M, z22.d, z31.d +sub z22.d, z24.d, z21.d +add z24.d, z24.d, z21.d +ldr q4, [x17, #+640] +ldr q5, [x17, #+656] +ldr q6, [x17, #+672] +ldr q7, [x17, #+688] +ldr q8, [x17, #+704] +ldr q9, [x17, #+720] +ldr q10, [x17, #+736] +ldr q11, [x17, #+752] +sqrdmulh z21.d, z20.d, z15.d[0] +mul z20.d, z20.d,z14.d[0] +mla z20.d, P0/M, z21.d, z31.d +sub z21.d, z23.d, z20.d +add z23.d, z23.d, z20.d +sqrdmulh z20.d, z19.d, z15.d[1] +mul z19.d, z19.d,z14.d[1] +mla z19.d, P0/M, z20.d, z31.d +sub z20.d, z18.d, z19.d +add z18.d, z18.d, z19.d +sqrdmulh z19.d, z16.d, z15.d[1] +mul z16.d, z16.d,z14.d[1] +mla z16.d, P0/M, z19.d, z31.d +sub z19.d, z17.d, z16.d +add z17.d, z17.d, z16.d +sqrdmulh z16.d, z24.d, z1.d[0] +mul z24.d, z24.d,z0.d[0] +mla z24.d, P0/M, z16.d, z31.d +sub z16.d, z23.d, z24.d +add z23.d, z23.d, z24.d +str q23, [x0, #768] +str q16, [x0, #800] +sqrdmulh z16.d, z22.d, z1.d[1] +mul z22.d, z22.d,z0.d[1] +mla z22.d, P0/M, z16.d, z31.d +sub z16.d, z21.d, z22.d +add z21.d, z21.d, z22.d +str q21, [x0, #832] +str q16, [x0, #864] +sqrdmulh z16.d, z20.d, z3.d[1] +mul z20.d, z20.d,z2.d[1] +mla z20.d, P0/M, z16.d, z31.d +sub z16.d, z19.d, z20.d +add z19.d, z19.d, z20.d +str q19, [x0, #960] +str q16, [x0, #992] +sqrdmulh z16.d, z18.d, z3.d[0] +mul z18.d, z18.d,z2.d[0] +mla z18.d, P0/M, z16.d, z31.d +sub z16.d, z17.d, z18.d +add z17.d, z17.d, z18.d +str q17, [x0, #896] +str q16, [x0, #928] +ldr q16, [x0, #1264] +ldr q17, [x0, #1232] +ldr q18, [x0, #1168] +ldr q19, [x0, #1200] +ldr q20, [x0, #1136] +ldr q21, [x0, #1104] +ldr q22, [x0, #1040] +ldr q23, [x0, #1072] +sqrdmulh z24.d, z16.d, z5.d[0] +mul z16.d, z16.d,z4.d[0] +mla z16.d, P0/M, z24.d, z31.d +sub z24.d, z20.d, z16.d +add z20.d, z20.d, z16.d +sqrdmulh z16.d, z17.d, z5.d[0] +mul z17.d, z17.d,z4.d[0] +mla z17.d, P0/M, 
z16.d, z31.d +sub z16.d, z21.d, z17.d +add z21.d, z21.d, z17.d +sqrdmulh z17.d, z18.d, z5.d[0] +mul z18.d, z18.d,z4.d[0] +mla z18.d, P0/M, z17.d, z31.d +sub z17.d, z22.d, z18.d +add z22.d, z22.d, z18.d +sqrdmulh z18.d, z19.d, z5.d[0] +mul z19.d, z19.d,z4.d[0] +mla z19.d, P0/M, z18.d, z31.d +sub z18.d, z23.d, z19.d +add z23.d, z23.d, z19.d +sqrdmulh z19.d, z20.d, z7.d[0] +mul z20.d, z20.d,z6.d[0] +mla z20.d, P0/M, z19.d, z31.d +sub z19.d, z23.d, z20.d +add z23.d, z23.d, z20.d +sqrdmulh z20.d, z21.d, z7.d[0] +mul z21.d, z21.d,z6.d[0] +mla z21.d, P0/M, z20.d, z31.d +sub z20.d, z22.d, z21.d +add z22.d, z22.d, z21.d +sqrdmulh z21.d, z24.d, z7.d[1] +mul z24.d, z24.d,z6.d[1] +mla z24.d, P0/M, z21.d, z31.d +sub z21.d, z18.d, z24.d +add z18.d, z18.d, z24.d +sqrdmulh z24.d, z16.d, z7.d[1] +mul z16.d, z16.d,z6.d[1] +mla z16.d, P0/M, z24.d, z31.d +sub z24.d, z17.d, z16.d +add z17.d, z17.d, z16.d +sqrdmulh z16.d, z23.d, z9.d[0] +mul z23.d, z23.d,z8.d[0] +mla z23.d, P0/M, z16.d, z31.d +sub z16.d, z22.d, z23.d +add z22.d, z22.d, z23.d +str q22, [x0, #1040] +str q16, [x0, #1072] +sqrdmulh z16.d, z19.d, z9.d[1] +mul z19.d, z19.d,z8.d[1] +mla z19.d, P0/M, z16.d, z31.d +sub z16.d, z20.d, z19.d +add z20.d, z20.d, z19.d +str q20, [x0, #1104] +str q16, [x0, #1136] +sqrdmulh z16.d, z21.d, z11.d[1] +mul z21.d, z21.d,z10.d[1] +mla z21.d, P0/M, z16.d, z31.d +sub z16.d, z24.d, z21.d +add z24.d, z24.d, z21.d +str q24, [x0, #1232] +str q16, [x0, #1264] +sqrdmulh z16.d, z18.d, z11.d[0] +mul z18.d, z18.d,z10.d[0] +mla z18.d, P0/M, z16.d, z31.d +sub z16.d, z17.d, z18.d +add z17.d, z17.d, z18.d +str q17, [x0, #1168] +str q16, [x0, #1200] +ldr q16, [x0, #1248] +ldr q17, [x0, #1216] +ldr q18, [x0, #1152] +ldr q24, [x0, #1184] +ldr q21, [x0, #1120] +ldr q20, [x0, #1088] +ldr q19, [x0, #1024] +ldr q22, [x0, #1056] +sqrdmulh z23.d, z16.d, z5.d[0] +mul z16.d, z16.d,z4.d[0] +mla z16.d, P0/M, z23.d, z31.d +sub z23.d, z21.d, z16.d +add z21.d, z21.d, z16.d +sqrdmulh z16.d, z17.d, z5.d[0] +mul z17.d, 
z17.d,z4.d[0] +mla z17.d, P0/M, z16.d, z31.d +sub z16.d, z20.d, z17.d +add z20.d, z20.d, z17.d +sqrdmulh z17.d, z18.d, z5.d[0] +mul z18.d, z18.d,z4.d[0] +mla z18.d, P0/M, z17.d, z31.d +sub z17.d, z19.d, z18.d +add z19.d, z19.d, z18.d +sqrdmulh z18.d, z24.d, z5.d[0] +mul z24.d, z24.d,z4.d[0] +mla z24.d, P0/M, z18.d, z31.d +sub z18.d, z22.d, z24.d +add z22.d, z22.d, z24.d +sqrdmulh z24.d, z21.d, z7.d[0] +mul z21.d, z21.d,z6.d[0] +mla z21.d, P0/M, z24.d, z31.d +sub z24.d, z22.d, z21.d +add z22.d, z22.d, z21.d +ldr q3, [x17, #+768] +ldr q2, [x17, #+784] +ldr q1, [x17, #+800] +ldr q0, [x17, #+816] +ldr q15, [x17, #+832] +ldr q14, [x17, #+848] +ldr q13, [x17, #+864] +ldr q12, [x17, #+880] +sqrdmulh z21.d, z20.d, z7.d[0] +mul z20.d, z20.d,z6.d[0] +mla z20.d, P0/M, z21.d, z31.d +sub z21.d, z19.d, z20.d +add z19.d, z19.d, z20.d +sqrdmulh z20.d, z23.d, z7.d[1] +mul z23.d, z23.d,z6.d[1] +mla z23.d, P0/M, z20.d, z31.d +sub z20.d, z18.d, z23.d +add z18.d, z18.d, z23.d +sqrdmulh z23.d, z16.d, z7.d[1] +mul z16.d, z16.d,z6.d[1] +mla z16.d, P0/M, z23.d, z31.d +sub z23.d, z17.d, z16.d +add z17.d, z17.d, z16.d +sqrdmulh z16.d, z22.d, z9.d[0] +mul z22.d, z22.d,z8.d[0] +mla z22.d, P0/M, z16.d, z31.d +sub z16.d, z19.d, z22.d +add z19.d, z19.d, z22.d +str q19, [x0, #1024] +str q16, [x0, #1056] +sqrdmulh z16.d, z24.d, z9.d[1] +mul z24.d, z24.d,z8.d[1] +mla z24.d, P0/M, z16.d, z31.d +sub z16.d, z21.d, z24.d +add z21.d, z21.d, z24.d +str q21, [x0, #1088] +str q16, [x0, #1120] +sqrdmulh z16.d, z20.d, z11.d[1] +mul z20.d, z20.d,z10.d[1] +mla z20.d, P0/M, z16.d, z31.d +sub z16.d, z23.d, z20.d +add z23.d, z23.d, z20.d +str q23, [x0, #1216] +str q16, [x0, #1248] +sqrdmulh z16.d, z18.d, z11.d[0] +mul z18.d, z18.d,z10.d[0] +mla z18.d, P0/M, z16.d, z31.d +sub z16.d, z17.d, z18.d +add z17.d, z17.d, z18.d +str q17, [x0, #1152] +str q16, [x0, #1184] +ldr q16, [x0, #1520] +ldr q17, [x0, #1488] +ldr q18, [x0, #1424] +ldr q23, [x0, #1456] +ldr q20, [x0, #1392] +ldr q21, [x0, #1360] +ldr q24, [x0, #1296] 
+ldr q19, [x0, #1328] +sqrdmulh z22.d, z16.d, z2.d[0] +mul z16.d, z16.d,z3.d[0] +mla z16.d, P0/M, z22.d, z31.d +sub z22.d, z20.d, z16.d +add z20.d, z20.d, z16.d +sqrdmulh z16.d, z17.d, z2.d[0] +mul z17.d, z17.d,z3.d[0] +mla z17.d, P0/M, z16.d, z31.d +sub z16.d, z21.d, z17.d +add z21.d, z21.d, z17.d +sqrdmulh z17.d, z18.d, z2.d[0] +mul z18.d, z18.d,z3.d[0] +mla z18.d, P0/M, z17.d, z31.d +sub z17.d, z24.d, z18.d +add z24.d, z24.d, z18.d +sqrdmulh z18.d, z23.d, z2.d[0] +mul z23.d, z23.d,z3.d[0] +mla z23.d, P0/M, z18.d, z31.d +sub z18.d, z19.d, z23.d +add z19.d, z19.d, z23.d +sqrdmulh z23.d, z20.d, z0.d[0] +mul z20.d, z20.d,z1.d[0] +mla z20.d, P0/M, z23.d, z31.d +sub z23.d, z19.d, z20.d +add z19.d, z19.d, z20.d +sqrdmulh z20.d, z21.d, z0.d[0] +mul z21.d, z21.d,z1.d[0] +mla z21.d, P0/M, z20.d, z31.d +sub z20.d, z24.d, z21.d +add z24.d, z24.d, z21.d +sqrdmulh z21.d, z22.d, z0.d[1] +mul z22.d, z22.d,z1.d[1] +mla z22.d, P0/M, z21.d, z31.d +sub z21.d, z18.d, z22.d +add z18.d, z18.d, z22.d +sqrdmulh z22.d, z16.d, z0.d[1] +mul z16.d, z16.d,z1.d[1] +mla z16.d, P0/M, z22.d, z31.d +sub z22.d, z17.d, z16.d +add z17.d, z17.d, z16.d +sqrdmulh z16.d, z19.d, z14.d[0] +mul z19.d, z19.d,z15.d[0] +mla z19.d, P0/M, z16.d, z31.d +sub z16.d, z24.d, z19.d +add z24.d, z24.d, z19.d +str q24, [x0, #1296] +str q16, [x0, #1328] +sqrdmulh z16.d, z23.d, z14.d[1] +mul z23.d, z23.d,z15.d[1] +mla z23.d, P0/M, z16.d, z31.d +sub z16.d, z20.d, z23.d +add z20.d, z20.d, z23.d +str q20, [x0, #1360] +str q16, [x0, #1392] +sqrdmulh z16.d, z21.d, z12.d[1] +mul z21.d, z21.d,z13.d[1] +mla z21.d, P0/M, z16.d, z31.d +sub z16.d, z22.d, z21.d +add z22.d, z22.d, z21.d +str q22, [x0, #1488] +str q16, [x0, #1520] +sqrdmulh z16.d, z18.d, z12.d[0] +mul z18.d, z18.d,z13.d[0] +mla z18.d, P0/M, z16.d, z31.d +sub z16.d, z17.d, z18.d +add z17.d, z17.d, z18.d +str q17, [x0, #1424] +str q16, [x0, #1456] +ldr q16, [x0, #1504] +ldr q17, [x0, #1472] +ldr q18, [x0, #1408] +ldr q22, [x0, #1440] +ldr q21, [x0, #1376] +ldr q20, [x0, 
#1344] +ldr q23, [x0, #1280] +ldr q24, [x0, #1312] +sqrdmulh z19.d, z16.d, z2.d[0] +mul z16.d, z16.d,z3.d[0] +mla z16.d, P0/M, z19.d, z31.d +sub z19.d, z21.d, z16.d +add z21.d, z21.d, z16.d +sqrdmulh z16.d, z17.d, z2.d[0] +mul z17.d, z17.d,z3.d[0] +mla z17.d, P0/M, z16.d, z31.d +sub z16.d, z20.d, z17.d +add z20.d, z20.d, z17.d +sqrdmulh z17.d, z18.d, z2.d[0] +mul z18.d, z18.d,z3.d[0] +mla z18.d, P0/M, z17.d, z31.d +sub z17.d, z23.d, z18.d +add z23.d, z23.d, z18.d +sqrdmulh z18.d, z22.d, z2.d[0] +mul z22.d, z22.d,z3.d[0] +mla z22.d, P0/M, z18.d, z31.d +sub z18.d, z24.d, z22.d +add z24.d, z24.d, z22.d +sqrdmulh z22.d, z21.d, z0.d[0] +mul z21.d, z21.d,z1.d[0] +mla z21.d, P0/M, z22.d, z31.d +sub z22.d, z24.d, z21.d +add z24.d, z24.d, z21.d +ldr q11, [x17, #+896] +ldr q10, [x17, #+912] +ldr q9, [x17, #+928] +ldr q8, [x17, #+944] +ldr q7, [x17, #+960] +ldr q6, [x17, #+976] +ldr q5, [x17, #+992] +ldr q4, [x17, #+1008] +sqrdmulh z21.d, z20.d, z0.d[0] +mul z20.d, z20.d,z1.d[0] +mla z20.d, P0/M, z21.d, z31.d +sub z21.d, z23.d, z20.d +add z23.d, z23.d, z20.d +sqrdmulh z20.d, z19.d, z0.d[1] +mul z19.d, z19.d,z1.d[1] +mla z19.d, P0/M, z20.d, z31.d +sub z20.d, z18.d, z19.d +add z18.d, z18.d, z19.d +sqrdmulh z19.d, z16.d, z0.d[1] +mul z16.d, z16.d,z1.d[1] +mla z16.d, P0/M, z19.d, z31.d +sub z19.d, z17.d, z16.d +add z17.d, z17.d, z16.d +sqrdmulh z16.d, z24.d, z14.d[0] +mul z24.d, z24.d,z15.d[0] +mla z24.d, P0/M, z16.d, z31.d +sub z16.d, z23.d, z24.d +add z23.d, z23.d, z24.d +str q23, [x0, #1280] +str q16, [x0, #1312] +sqrdmulh z16.d, z22.d, z14.d[1] +mul z22.d, z22.d,z15.d[1] +mla z22.d, P0/M, z16.d, z31.d +sub z16.d, z21.d, z22.d +add z21.d, z21.d, z22.d +str q21, [x0, #1344] +str q16, [x0, #1376] +sqrdmulh z16.d, z20.d, z12.d[1] +mul z20.d, z20.d,z13.d[1] +mla z20.d, P0/M, z16.d, z31.d +sub z16.d, z19.d, z20.d +add z19.d, z19.d, z20.d +str q19, [x0, #1472] +str q16, [x0, #1504] +sqrdmulh z16.d, z18.d, z12.d[0] +mul z18.d, z18.d,z13.d[0] +mla z18.d, P0/M, z16.d, z31.d +sub z16.d, 
z17.d, z18.d +add z17.d, z17.d, z18.d +str q17, [x0, #1408] +str q16, [x0, #1440] +ldr q16, [x0, #1776] +ldr q17, [x0, #1744] +ldr q18, [x0, #1680] +ldr q19, [x0, #1712] +ldr q20, [x0, #1648] +ldr q21, [x0, #1616] +ldr q22, [x0, #1552] +ldr q23, [x0, #1584] +sqrdmulh z24.d, z16.d, z10.d[0] +mul z16.d, z16.d,z11.d[0] +mla z16.d, P0/M, z24.d, z31.d +sub z24.d, z20.d, z16.d +add z20.d, z20.d, z16.d +sqrdmulh z16.d, z17.d, z10.d[0] +mul z17.d, z17.d,z11.d[0] +mla z17.d, P0/M, z16.d, z31.d +sub z16.d, z21.d, z17.d +add z21.d, z21.d, z17.d +sqrdmulh z17.d, z18.d, z10.d[0] +mul z18.d, z18.d,z11.d[0] +mla z18.d, P0/M, z17.d, z31.d +sub z17.d, z22.d, z18.d +add z22.d, z22.d, z18.d +sqrdmulh z18.d, z19.d, z10.d[0] +mul z19.d, z19.d,z11.d[0] +mla z19.d, P0/M, z18.d, z31.d +sub z18.d, z23.d, z19.d +add z23.d, z23.d, z19.d +sqrdmulh z19.d, z20.d, z8.d[0] +mul z20.d, z20.d,z9.d[0] +mla z20.d, P0/M, z19.d, z31.d +sub z19.d, z23.d, z20.d +add z23.d, z23.d, z20.d +sqrdmulh z20.d, z21.d, z8.d[0] +mul z21.d, z21.d,z9.d[0] +mla z21.d, P0/M, z20.d, z31.d +sub z20.d, z22.d, z21.d +add z22.d, z22.d, z21.d +sqrdmulh z21.d, z24.d, z8.d[1] +mul z24.d, z24.d,z9.d[1] +mla z24.d, P0/M, z21.d, z31.d +sub z21.d, z18.d, z24.d +add z18.d, z18.d, z24.d +sqrdmulh z24.d, z16.d, z8.d[1] +mul z16.d, z16.d,z9.d[1] +mla z16.d, P0/M, z24.d, z31.d +sub z24.d, z17.d, z16.d +add z17.d, z17.d, z16.d +sqrdmulh z16.d, z23.d, z6.d[0] +mul z23.d, z23.d,z7.d[0] +mla z23.d, P0/M, z16.d, z31.d +sub z16.d, z22.d, z23.d +add z22.d, z22.d, z23.d +str q22, [x0, #1552] +str q16, [x0, #1584] +sqrdmulh z16.d, z19.d, z6.d[1] +mul z19.d, z19.d,z7.d[1] +mla z19.d, P0/M, z16.d, z31.d +sub z16.d, z20.d, z19.d +add z20.d, z20.d, z19.d +str q20, [x0, #1616] +str q16, [x0, #1648] +sqrdmulh z16.d, z21.d, z4.d[1] +mul z21.d, z21.d,z5.d[1] +mla z21.d, P0/M, z16.d, z31.d +sub z16.d, z24.d, z21.d +add z24.d, z24.d, z21.d +str q24, [x0, #1744] +str q16, [x0, #1776] +sqrdmulh z16.d, z18.d, z4.d[0] +mul z18.d, z18.d,z5.d[0] +mla z18.d, 
P0/M, z16.d, z31.d +sub z16.d, z17.d, z18.d +add z17.d, z17.d, z18.d +str q17, [x0, #1680] +str q16, [x0, #1712] +ldr q16, [x0, #1760] +ldr q17, [x0, #1728] +ldr q18, [x0, #1664] +ldr q24, [x0, #1696] +ldr q21, [x0, #1632] +ldr q20, [x0, #1600] +ldr q19, [x0, #1536] +ldr q22, [x0, #1568] +sqrdmulh z23.d, z16.d, z10.d[0] +mul z16.d, z16.d,z11.d[0] +mla z16.d, P0/M, z23.d, z31.d +sub z23.d, z21.d, z16.d +add z21.d, z21.d, z16.d +sqrdmulh z16.d, z17.d, z10.d[0] +mul z17.d, z17.d,z11.d[0] +mla z17.d, P0/M, z16.d, z31.d +sub z16.d, z20.d, z17.d +add z20.d, z20.d, z17.d +sqrdmulh z17.d, z18.d, z10.d[0] +mul z18.d, z18.d,z11.d[0] +mla z18.d, P0/M, z17.d, z31.d +sub z17.d, z19.d, z18.d +add z19.d, z19.d, z18.d +sqrdmulh z18.d, z24.d, z10.d[0] +mul z24.d, z24.d,z11.d[0] +mla z24.d, P0/M, z18.d, z31.d +sub z18.d, z22.d, z24.d +add z22.d, z22.d, z24.d +sqrdmulh z24.d, z21.d, z8.d[0] +mul z21.d, z21.d,z9.d[0] +mla z21.d, P0/M, z24.d, z31.d +sub z24.d, z22.d, z21.d +add z22.d, z22.d, z21.d +ldr q12, [x17, #+1024] +ldr q13, [x17, #+1040] +ldr q14, [x17, #+1056] +ldr q15, [x17, #+1072] +ldr q0, [x17, #+1088] +ldr q1, [x17, #+1104] +ldr q2, [x17, #+1120] +ldr q3, [x17, #+1136] +sqrdmulh z21.d, z20.d, z8.d[0] +mul z20.d, z20.d,z9.d[0] +mla z20.d, P0/M, z21.d, z31.d +sub z21.d, z19.d, z20.d +add z19.d, z19.d, z20.d +sqrdmulh z20.d, z23.d, z8.d[1] +mul z23.d, z23.d,z9.d[1] +mla z23.d, P0/M, z20.d, z31.d +sub z20.d, z18.d, z23.d +add z18.d, z18.d, z23.d +sqrdmulh z23.d, z16.d, z8.d[1] +mul z16.d, z16.d,z9.d[1] +mla z16.d, P0/M, z23.d, z31.d +sub z23.d, z17.d, z16.d +add z17.d, z17.d, z16.d +sqrdmulh z16.d, z22.d, z6.d[0] +mul z22.d, z22.d,z7.d[0] +mla z22.d, P0/M, z16.d, z31.d +sub z16.d, z19.d, z22.d +add z19.d, z19.d, z22.d +str q19, [x0, #1536] +str q16, [x0, #1568] +sqrdmulh z16.d, z24.d, z6.d[1] +mul z24.d, z24.d,z7.d[1] +mla z24.d, P0/M, z16.d, z31.d +sub z16.d, z21.d, z24.d +add z21.d, z21.d, z24.d +str q21, [x0, #1600] +str q16, [x0, #1632] +sqrdmulh z16.d, z20.d, z4.d[1] +mul 
z20.d, z20.d,z5.d[1] +mla z20.d, P0/M, z16.d, z31.d +sub z16.d, z23.d, z20.d +add z23.d, z23.d, z20.d +str q23, [x0, #1728] +str q16, [x0, #1760] +sqrdmulh z16.d, z18.d, z4.d[0] +mul z18.d, z18.d,z5.d[0] +mla z18.d, P0/M, z16.d, z31.d +sub z16.d, z17.d, z18.d +add z17.d, z17.d, z18.d +str q17, [x0, #1664] +str q16, [x0, #1696] +ldr q16, [x0, #2032] +ldr q17, [x0, #2000] +ldr q18, [x0, #1936] +ldr q23, [x0, #1968] +ldr q20, [x0, #1904] +ldr q21, [x0, #1872] +ldr q24, [x0, #1808] +ldr q19, [x0, #1840] +sqrdmulh z22.d, z16.d, z13.d[0] +mul z16.d, z16.d,z12.d[0] +mla z16.d, P0/M, z22.d, z31.d +sub z22.d, z20.d, z16.d +add z20.d, z20.d, z16.d +sqrdmulh z16.d, z17.d, z13.d[0] +mul z17.d, z17.d,z12.d[0] +mla z17.d, P0/M, z16.d, z31.d +sub z16.d, z21.d, z17.d +add z21.d, z21.d, z17.d +sqrdmulh z17.d, z18.d, z13.d[0] +mul z18.d, z18.d,z12.d[0] +mla z18.d, P0/M, z17.d, z31.d +sub z17.d, z24.d, z18.d +add z24.d, z24.d, z18.d +sqrdmulh z18.d, z23.d, z13.d[0] +mul z23.d, z23.d,z12.d[0] +mla z23.d, P0/M, z18.d, z31.d +sub z18.d, z19.d, z23.d +add z19.d, z19.d, z23.d +sqrdmulh z23.d, z20.d, z15.d[0] +mul z20.d, z20.d,z14.d[0] +mla z20.d, P0/M, z23.d, z31.d +sub z23.d, z19.d, z20.d +add z19.d, z19.d, z20.d +sqrdmulh z20.d, z21.d, z15.d[0] +mul z21.d, z21.d,z14.d[0] +mla z21.d, P0/M, z20.d, z31.d +sub z20.d, z24.d, z21.d +add z24.d, z24.d, z21.d +sqrdmulh z21.d, z22.d, z15.d[1] +mul z22.d, z22.d,z14.d[1] +mla z22.d, P0/M, z21.d, z31.d +sub z21.d, z18.d, z22.d +add z18.d, z18.d, z22.d +sqrdmulh z22.d, z16.d, z15.d[1] +mul z16.d, z16.d,z14.d[1] +mla z16.d, P0/M, z22.d, z31.d +sub z22.d, z17.d, z16.d +add z17.d, z17.d, z16.d +sqrdmulh z16.d, z19.d, z1.d[0] +mul z19.d, z19.d,z0.d[0] +mla z19.d, P0/M, z16.d, z31.d +sub z16.d, z24.d, z19.d +add z24.d, z24.d, z19.d +str q24, [x0, #1808] +str q16, [x0, #1840] +sqrdmulh z16.d, z23.d, z1.d[1] +mul z23.d, z23.d,z0.d[1] +mla z23.d, P0/M, z16.d, z31.d +sub z16.d, z20.d, z23.d +add z20.d, z20.d, z23.d +str q20, [x0, #1872] +str q16, [x0, #1904] 
+sqrdmulh z16.d, z21.d, z3.d[1] +mul z21.d, z21.d,z2.d[1] +mla z21.d, P0/M, z16.d, z31.d +sub z16.d, z22.d, z21.d +add z22.d, z22.d, z21.d +str q22, [x0, #2000] +str q16, [x0, #2032] +sqrdmulh z16.d, z18.d, z3.d[0] +mul z18.d, z18.d,z2.d[0] +mla z18.d, P0/M, z16.d, z31.d +sub z16.d, z17.d, z18.d +add z17.d, z17.d, z18.d +str q17, [x0, #1936] +str q16, [x0, #1968] +ldr q16, [x0, #2016] +ldr q17, [x0, #1984] +ldr q18, [x0, #1920] +ldr q22, [x0, #1952] +ldr q21, [x0, #1888] +ldr q20, [x0, #1856] +ldr q23, [x0, #1792] +ldr q24, [x0, #1824] +sqrdmulh z19.d, z16.d, z13.d[0] +mul z16.d, z16.d,z12.d[0] +mla z16.d, P0/M, z19.d, z31.d +sub z19.d, z21.d, z16.d +add z21.d, z21.d, z16.d +sqrdmulh z16.d, z17.d, z13.d[0] +mul z17.d, z17.d,z12.d[0] +mla z17.d, P0/M, z16.d, z31.d +sub z16.d, z20.d, z17.d +add z20.d, z20.d, z17.d +sqrdmulh z17.d, z18.d, z13.d[0] +mul z18.d, z18.d,z12.d[0] +mla z18.d, P0/M, z17.d, z31.d +sub z17.d, z23.d, z18.d +add z23.d, z23.d, z18.d +sqrdmulh z18.d, z22.d, z13.d[0] +mul z22.d, z22.d,z12.d[0] +mla z22.d, P0/M, z18.d, z31.d +sub z18.d, z24.d, z22.d +add z24.d, z24.d, z22.d +sqrdmulh z22.d, z21.d, z15.d[0] +mul z21.d, z21.d,z14.d[0] +mla z21.d, P0/M, z22.d, z31.d +sub z22.d, z24.d, z21.d +add z24.d, z24.d, z21.d +sqrdmulh z21.d, z20.d, z15.d[0] +mul z20.d, z20.d,z14.d[0] +mla z20.d, P0/M, z21.d, z31.d +sub z21.d, z23.d, z20.d +add z23.d, z23.d, z20.d +sqrdmulh z20.d, z19.d, z15.d[1] +mul z19.d, z19.d,z14.d[1] +mla z19.d, P0/M, z20.d, z31.d +sub z20.d, z18.d, z19.d +add z18.d, z18.d, z19.d +sqrdmulh z19.d, z16.d, z15.d[1] +mul z16.d, z16.d,z14.d[1] +mla z16.d, P0/M, z19.d, z31.d +sub z19.d, z17.d, z16.d +add z17.d, z17.d, z16.d +sqrdmulh z16.d, z24.d, z1.d[0] +mul z24.d, z24.d,z0.d[0] +mla z24.d, P0/M, z16.d, z31.d +sub z16.d, z23.d, z24.d +add z23.d, z23.d, z24.d +str q23, [x0, #1792] +str q16, [x0, #1824] +sqrdmulh z16.d, z22.d, z1.d[1] +mul z22.d, z22.d,z0.d[1] +mla z22.d, P0/M, z16.d, z31.d +sub z16.d, z21.d, z22.d +add z21.d, z21.d, z22.d +str 
q21, [x0, #1856] +str q16, [x0, #1888] +sqrdmulh z16.d, z20.d, z3.d[1] +mul z20.d, z20.d,z2.d[1] +mla z20.d, P0/M, z16.d, z31.d +sub z16.d, z19.d, z20.d +add z19.d, z19.d, z20.d +str q19, [x0, #1984] +str q16, [x0, #2016] +sqrdmulh z16.d, z18.d, z3.d[0] +mul z18.d, z18.d,z2.d[0] +mla z18.d, P0/M, z16.d, z31.d +sub z16.d, z17.d, z18.d +add z17.d, z17.d, z18.d +str q17, [x0, #1920] +str q16, [x0, #1952] +// Restore SVE2 vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2697 +// Instruction count: 2693 \ No newline at end of file diff --git a/tests/ntt_sve2/auto/ntt_u64_incomplete_72057594067788289_60277548896192635_var_3_3_1.s b/tests/ntt_sve2/auto/ntt_u64_incomplete_72057594067788289_60277548896192635_var_3_3_1.s new file mode 100644 index 0000000..9cf22e1 --- /dev/null +++ b/tests/ntt_sve2/auto/ntt_u64_incomplete_72057594067788289_60277548896192635_var_3_3_1.s @@ -0,0 +1,2727 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. 
+/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +modulus: // Negated modulus (-72057594067788289) stored four times; the routine loads the first 16 bytes into q31 and uses z31 as the mla multiplicand +.dword -72057594067788289 +.dword -72057594067788289 +.dword -72057594067788289 +.dword -72057594067788289 +.align 6 // On AArch64 assemblers .align takes a power of two, so this requests 64-byte alignment +roots_merged: // Constants are laid out in 32-byte groups: the first 16 bytes are read by the mul step, the next 16 bytes by the matching sqrdmulh step (see the q3/q2, q1/q0, q15/q14, q13/q12 loads). The sqrdmulh values look like roughly root * 2^63 / modulus -- TODO confirm against the generator +.dword 25792053496987399 // Layer 0, block 0 +.dword 0 // Layer None, block None +.dword 3301382846246308405 // Layer 0, block 0 +.dword 0 // Layer None, block None +.dword 36678763444893001 // Layer 1, block 0 +.dword 12009493193917617 // Layer 1, block 1 +.dword 4694881719000765600 // Layer 1, block 0 +.dword 1537215128184439725 // Layer 1, block 1 +.dword 57226611787624233 // Layer 2, block 0 +.dword 39665359539540334 // Layer 2, block 1 +.dword 7325006305780451127 // Layer 2, block 0 +.dword 5077166018957207276 // Layer 2, block 1 +.dword 14359056949694594 // Layer 2, block 2 +.dword 63449028357011879 // Layer 2, block 3 +.dword 1837959288799265711 // Layer 2, block 2 +.dword 8121475626332016399 // Layer 2, block 3 +.dword 56437370284897879 // Layer 3, block 0 +.dword 0 // Layer None, block None +.dword 7223983393473341270 // Layer 3, block 0 +.dword 0 // Layer None, block None +.dword 15519149204003269 // Layer 4, block 0 +.dword 18945631884663455 // Layer 4, block 1 +.dword 1986451097289241753 // Layer 4, block 0 +.dword 2425040880231995866 // Layer 4, block 1 +.dword 21843809513296019 // Layer 5, block 0 +.dword 52861630939350015 // Layer 5, block 1 +.dword 2796007616543237058 // Layer 5, block 0 +.dword 6766288757432881341 // Layer
5, block 1 +.dword 58200436133340777 // Layer 5, block 2 +.dword 45581265709396633 // Layer 5, block 3 +.dword 7449655821980514543 // Layer 5, block 2 +.dword 5834402008385018253 // Layer 5, block 3 +.dword 7801853795705237 // Layer 3, block 1 +.dword 0 // Layer None, block None +.dword 998637285436439396 // Layer 3, block 1 +.dword 0 // Layer None, block None +.dword 72057409685042741 // Layer 4, block 2 +.dword 67813594624550994 // Layer 4, block 3 +.dword 9223348435863355444 // Layer 4, block 2 +.dword 8680140108345514992 // Layer 4, block 3 +.dword 16444438478993771 // Layer 5, block 4 +.dword 44738633871916757 // Layer 5, block 5 +.dword 2104888124438946221 // Layer 5, block 4 +.dword 5726545133232289544 // Layer 5, block 5 +.dword 14998888047589537 // Layer 5, block 6 +.dword 1367715298619054 // Layer 5, block 7 +.dword 1919857669295880083 // Layer 5, block 6 +.dword 175067558150691679 // Layer 5, block 7 +.dword 50810289212278368 // Layer 3, block 2 +.dword 0 // Layer None, block None +.dword 6503717016476519110 // Layer 3, block 2 +.dword 0 // Layer None, block None +.dword 38922220208018571 // Layer 4, block 4 +.dword 7966052600948377 // Layer 4, block 5 +.dword 4982044184561839686 // Layer 4, block 4 +.dword 1019654732498851778 // Layer 4, block 5 +.dword 45879272116084567 // Layer 5, block 8 +.dword 66654388400258382 // Layer 5, block 9 +.dword 5872546828425266758 // Layer 5, block 8 +.dword 8531761711697548017 // Layer 5, block 9 +.dword 8930087962801744 // Layer 5, block 10 +.dword 61848588213223279 // Layer 5, block 11 +.dword 1143051258764947771 // Layer 5, block 10 +.dword 7916619288011967173 // Layer 5, block 11 +.dword 31977682183549777 // Layer 3, block 3 +.dword 0 // Layer None, block None +.dword 4093143317798190700 // Layer 3, block 3 +.dword 0 // Layer None, block None +.dword 66070897124800871 // Layer 4, block 6 +.dword 953067252694683 // Layer 4, block 7 +.dword 8457074828469936528 // Layer 4, block 6 +.dword 121992608294366219 // Layer 4,
block 7 +.dword 33801610235026337 // Layer 5, block 12 +.dword 32122784433286747 // Layer 5, block 13 +.dword 4326606108290444417 // Layer 5, block 12 +.dword 4111716405756826253 // Layer 5, block 13 +.dword 67688369535326483 // Layer 5, block 14 +.dword 45021686719473556 // Layer 5, block 15 +.dword 8664111296931419854 // Layer 5, block 14 +.dword 5762775897704545946 // Layer 5, block 15 +.dword 66662168904752601 // Layer 3, block 4 +.dword 0 // Layer None, block None +.dword 8532757616272395351 // Layer 3, block 4 +.dword 0 // Layer None, block None +.dword 23961218891132444 // Layer 4, block 8 +.dword 59012643726482518 // Layer 4, block 9 +.dword 3067036016793986470 // Layer 4, block 8 +.dword 7553618393859575754 // Layer 4, block 9 +.dword 52812533586708198 // Layer 5, block 16 +.dword 27994290036168371 // Layer 5, block 17 +.dword 6760004296297333018 // Layer 5, block 16 +.dword 3583269123144660376 // Layer 5, block 17 +.dword 45890717144660134 // Layer 5, block 18 +.dword 39684773913748863 // Layer 5, block 19 +.dword 5874011792082332260 // Layer 5, block 18 +.dword 5079651058854869198 // Layer 5, block 19 +.dword 50149898471788096 // Layer 3, block 5 +.dword 0 // Layer None, block None +.dword 6419187001728793164 // Layer 3, block 5 +.dword 0 // Layer None, block None +.dword 65714767972465509 // Layer 4, block 10 +.dword 51421828010275652 // Layer 4, block 11 +.dword 8411490296989900223 // Layer 4, block 10 +.dword 6581993982587733829 // Layer 4, block 11 +.dword 18683690578478417 // Layer 5, block 20 +.dword 3282356803714609 // Layer 5, block 21 +.dword 2391512393054205061 // Layer 5, block 20 +.dword 420141670701365074 // Layer 5, block 21 +.dword 67884452950503047 // Layer 5, block 22 +.dword 10335338564031418 // Layer 5, block 23 +.dword 8689209974063619263 // Layer 5, block 22 +.dword 1322923335647807838 // Layer 5, block 23 +.dword 30932683335866672 // Layer 3, block 6 +.dword 0 // Layer None, block None +.dword 3959383465350182760 // Layer 3, block 6
+.dword 0 // Layer None, block None +.dword 27050097608373352 // Layer 4, block 12 +.dword 67454821565758121 // Layer 4, block 13 +.dword 3462412492436980406 // Layer 4, block 12 +.dword 8634217156839057519 // Layer 4, block 13 +.dword 32828920539599153 // Layer 5, block 24 +.dword 8624332566875856 // Layer 5, block 25 +.dword 4202101827327358896 // Layer 5, block 24 +.dword 1103914568102652181 // Layer 5, block 25 +.dword 56732837753533829 // Layer 5, block 26 +.dword 14816466027490539 // Layer 5, block 27 +.dword 7261803229443070495 // Layer 5, block 26 +.dword 1896507650732884485 // Layer 5, block 27 +.dword 54968319742463037 // Layer 3, block 7 +.dword 0 // Layer None, block None +.dword 7035944924119603816 // Layer 3, block 7 +.dword 0 // Layer None, block None +.dword 55666925166425210 // Layer 4, block 14 +.dword 34241587306439298 // Layer 4, block 15 +.dword 7125366418349706083 // Layer 4, block 14 +.dword 4382923173407965878 // Layer 4, block 15 +.dword 8550051130607768 // Layer 5, block 28 +.dword 14420141705316589 // Layer 5, block 29 +.dword 1094406544264277001 // Layer 5, block 28 +.dword 1845778137515640974 // Layer 5, block 29 +.dword 55622715926092387 // Layer 5, block 30 +.dword 3405033449209397 // Layer 5, block 31 +.dword 7119707635589449714 // Layer 5, block 30 +.dword 435844281318190845 // Layer 5, block 31 +.text +.type ntt_u64_incomplete_sve2_asm_var_3_3_1, %function +.global ntt_u64_incomplete_sve2_asm_var_3_3_1 +modulus_addr: .quad modulus // Literal holding the address of the modulus table; read below with a label-relative ldr +roots_merged_addr: .quad roots_merged // Literal holding the address of the roots table; read below with a label-relative ldr +ntt_u64_incomplete_sve2_asm_var_3_3_1: +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] // NOTE(review): identical to the previous store (same registers, same slot); redundant but harmless +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save SVE2 vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ldr x17, modulus_addr +ldr q31, [x17] +ptrue P0.d
+ldr x17, roots_merged_addr +ldr q3, [x17, #+0] +ldr q2, [x17, #+16] +ldr q1, [x17, #+32] +ldr q0, [x17, #+48] +ldr q15, [x17, #+64] +ldr q14, [x17, #+80] +ldr q13, [x17, #+96] +ldr q12, [x17, #+112] +ldr q30, [x0, #1920] +ldr q29, [x0, #1664] +sqrdmulh z28.d, z30.d, z2.d[0] +mul z30.d, z30.d,z3.d[0] +ldr q27, [x0, #1152] +ldr q26, [x0, #1408] +sqrdmulh z25.d, z29.d, z2.d[0] +mla z30.d, P0/M, z28.d, z31.d +mul z29.d, z29.d,z3.d[0] +ldr q28, [x0, #896] +ldr q24, [x0, #640] +sqrdmulh z23.d, z27.d, z2.d[0] +sub z22.d, z28.d, z30.d +mla z29.d, P0/M, z25.d, z31.d +mul z27.d, z27.d,z3.d[0] +add z28.d, z28.d, z30.d +ldr q30, [x0, #128] +ldr q25, [x0, #384] +sqrdmulh z21.d, z26.d, z2.d[0] +sub z20.d, z24.d, z29.d +mla z27.d, P0/M, z23.d, z31.d +mul z26.d, z26.d,z3.d[0] +add z24.d, z24.d, z29.d +sqrdmulh z29.d, z28.d, z0.d[0] +sub z23.d, z30.d, z27.d +mla z26.d, P0/M, z21.d, z31.d +mul z28.d, z28.d,z1.d[0] +add z30.d, z30.d, z27.d +sqrdmulh z27.d, z24.d, z0.d[0] +sub z21.d, z25.d, z26.d +mla z28.d, P0/M, z29.d, z31.d +mul z24.d, z24.d,z1.d[0] +add z25.d, z25.d, z26.d +sqrdmulh z26.d, z22.d, z0.d[1] +sub z29.d, z25.d, z28.d +mla z24.d, P0/M, z27.d, z31.d +mul z22.d, z22.d,z1.d[1] +add z25.d, z25.d, z28.d +sqrdmulh z28.d, z20.d, z0.d[1] +sub z27.d, z30.d, z24.d +mla z22.d, P0/M, z26.d, z31.d +mul z20.d, z20.d,z1.d[1] +add z30.d, z30.d, z24.d +sqrdmulh z24.d, z25.d, z14.d[0] +sub z26.d, z21.d, z22.d +mla z20.d, P0/M, z28.d, z31.d +mul z25.d, z25.d,z15.d[0] +add z21.d, z21.d, z22.d +sqrdmulh z22.d, z29.d, z14.d[1] +sub z28.d, z23.d, z20.d +mla z25.d, P0/M, z24.d, z31.d +mul z29.d, z29.d,z15.d[1] +add z23.d, z23.d, z20.d +sqrdmulh z20.d, z26.d, z12.d[1] +sub z24.d, z30.d, z25.d +mla z29.d, P0/M, z22.d, z31.d +mul z26.d, z26.d,z13.d[1] +add z30.d, z30.d, z25.d +str q30, [x0, #128] +str q24, [x0, #384] +sqrdmulh z24.d, z21.d, z12.d[0] +sub z30.d, z27.d, z29.d +mla z26.d, P0/M, z20.d, z31.d +mul z21.d, z21.d,z13.d[0] +add z27.d, z27.d, z29.d +str q27, [x0, #640] +str q30, [x0, 
#896] +ldr q30, [x0, #1936] +ldr q27, [x0, #1680] +sqrdmulh z29.d, z30.d, z2.d[0] +sub z20.d, z28.d, z26.d +mla z21.d, P0/M, z24.d, z31.d +mul z30.d, z30.d,z3.d[0] +add z28.d, z28.d, z26.d +str q28, [x0, #1664] +str q20, [x0, #1920] +ldr q20, [x0, #1168] +ldr q28, [x0, #1424] +sqrdmulh z26.d, z27.d, z2.d[0] +sub z24.d, z23.d, z21.d +mla z30.d, P0/M, z29.d, z31.d +mul z27.d, z27.d,z3.d[0] +add z23.d, z23.d, z21.d +str q23, [x0, #1152] +str q24, [x0, #1408] +ldr q24, [x0, #912] +ldr q23, [x0, #656] +sqrdmulh z21.d, z20.d, z2.d[0] +sub z29.d, z24.d, z30.d +mla z27.d, P0/M, z26.d, z31.d +mul z20.d, z20.d,z3.d[0] +add z24.d, z24.d, z30.d +ldr q30, [x0, #144] +ldr q26, [x0, #400] +sqrdmulh z25.d, z28.d, z2.d[0] +sub z22.d, z23.d, z27.d +mla z20.d, P0/M, z21.d, z31.d +mul z28.d, z28.d,z3.d[0] +add z23.d, z23.d, z27.d +sqrdmulh z27.d, z24.d, z0.d[0] +sub z21.d, z30.d, z20.d +mla z28.d, P0/M, z25.d, z31.d +mul z24.d, z24.d,z1.d[0] +add z30.d, z30.d, z20.d +sqrdmulh z20.d, z23.d, z0.d[0] +sub z25.d, z26.d, z28.d +mla z24.d, P0/M, z27.d, z31.d +mul z23.d, z23.d,z1.d[0] +add z26.d, z26.d, z28.d +sqrdmulh z28.d, z29.d, z0.d[1] +sub z27.d, z26.d, z24.d +mla z23.d, P0/M, z20.d, z31.d +mul z29.d, z29.d,z1.d[1] +add z26.d, z26.d, z24.d +sqrdmulh z24.d, z22.d, z0.d[1] +sub z20.d, z30.d, z23.d +mla z29.d, P0/M, z28.d, z31.d +mul z22.d, z22.d,z1.d[1] +add z30.d, z30.d, z23.d +sqrdmulh z23.d, z26.d, z14.d[0] +sub z28.d, z25.d, z29.d +mla z22.d, P0/M, z24.d, z31.d +mul z26.d, z26.d,z15.d[0] +add z25.d, z25.d, z29.d +sqrdmulh z29.d, z27.d, z14.d[1] +sub z24.d, z21.d, z22.d +mla z26.d, P0/M, z23.d, z31.d +mul z27.d, z27.d,z15.d[1] +add z21.d, z21.d, z22.d +sqrdmulh z22.d, z28.d, z12.d[1] +sub z23.d, z30.d, z26.d +mla z27.d, P0/M, z29.d, z31.d +mul z28.d, z28.d,z13.d[1] +add z30.d, z30.d, z26.d +str q30, [x0, #144] +str q23, [x0, #400] +sqrdmulh z23.d, z25.d, z12.d[0] +sub z30.d, z20.d, z27.d +mla z28.d, P0/M, z22.d, z31.d +mul z25.d, z25.d,z13.d[0] +add z20.d, z20.d, z27.d +str q20, [x0, 
#656] +str q30, [x0, #912] +ldr q30, [x0, #1952] +ldr q20, [x0, #1696] +sqrdmulh z27.d, z30.d, z2.d[0] +sub z22.d, z24.d, z28.d +mla z25.d, P0/M, z23.d, z31.d +mul z30.d, z30.d,z3.d[0] +add z24.d, z24.d, z28.d +str q24, [x0, #1680] +str q22, [x0, #1936] +ldr q22, [x0, #1184] +ldr q24, [x0, #1440] +sqrdmulh z28.d, z20.d, z2.d[0] +sub z23.d, z21.d, z25.d +mla z30.d, P0/M, z27.d, z31.d +mul z20.d, z20.d,z3.d[0] +add z21.d, z21.d, z25.d +str q21, [x0, #1168] +str q23, [x0, #1424] +ldr q23, [x0, #928] +ldr q21, [x0, #672] +sqrdmulh z25.d, z22.d, z2.d[0] +sub z27.d, z23.d, z30.d +mla z20.d, P0/M, z28.d, z31.d +mul z22.d, z22.d,z3.d[0] +add z23.d, z23.d, z30.d +ldr q30, [x0, #160] +ldr q28, [x0, #416] +sqrdmulh z26.d, z24.d, z2.d[0] +sub z29.d, z21.d, z20.d +mla z22.d, P0/M, z25.d, z31.d +mul z24.d, z24.d,z3.d[0] +add z21.d, z21.d, z20.d +sqrdmulh z20.d, z23.d, z0.d[0] +sub z25.d, z30.d, z22.d +mla z24.d, P0/M, z26.d, z31.d +mul z23.d, z23.d,z1.d[0] +add z30.d, z30.d, z22.d +sqrdmulh z22.d, z21.d, z0.d[0] +sub z26.d, z28.d, z24.d +mla z23.d, P0/M, z20.d, z31.d +mul z21.d, z21.d,z1.d[0] +add z28.d, z28.d, z24.d +sqrdmulh z24.d, z27.d, z0.d[1] +sub z20.d, z28.d, z23.d +mla z21.d, P0/M, z22.d, z31.d +mul z27.d, z27.d,z1.d[1] +add z28.d, z28.d, z23.d +sqrdmulh z23.d, z29.d, z0.d[1] +sub z22.d, z30.d, z21.d +mla z27.d, P0/M, z24.d, z31.d +mul z29.d, z29.d,z1.d[1] +add z30.d, z30.d, z21.d +sqrdmulh z21.d, z28.d, z14.d[0] +sub z24.d, z26.d, z27.d +mla z29.d, P0/M, z23.d, z31.d +mul z28.d, z28.d,z15.d[0] +add z26.d, z26.d, z27.d +sqrdmulh z27.d, z20.d, z14.d[1] +sub z23.d, z25.d, z29.d +mla z28.d, P0/M, z21.d, z31.d +mul z20.d, z20.d,z15.d[1] +add z25.d, z25.d, z29.d +sqrdmulh z29.d, z24.d, z12.d[1] +sub z21.d, z30.d, z28.d +mla z20.d, P0/M, z27.d, z31.d +mul z24.d, z24.d,z13.d[1] +add z30.d, z30.d, z28.d +str q30, [x0, #160] +str q21, [x0, #416] +sqrdmulh z21.d, z26.d, z12.d[0] +sub z30.d, z22.d, z20.d +mla z24.d, P0/M, z29.d, z31.d +mul z26.d, z26.d,z13.d[0] +add z22.d, z22.d, 
z20.d +str q22, [x0, #672] +str q30, [x0, #928] +ldr q30, [x0, #1968] +ldr q22, [x0, #1712] +sqrdmulh z20.d, z30.d, z2.d[0] +sub z29.d, z23.d, z24.d +mla z26.d, P0/M, z21.d, z31.d +mul z30.d, z30.d,z3.d[0] +add z23.d, z23.d, z24.d +str q23, [x0, #1696] +str q29, [x0, #1952] +ldr q29, [x0, #1200] +ldr q23, [x0, #1456] +sqrdmulh z24.d, z22.d, z2.d[0] +sub z21.d, z25.d, z26.d +mla z30.d, P0/M, z20.d, z31.d +mul z22.d, z22.d,z3.d[0] +add z25.d, z25.d, z26.d +str q25, [x0, #1184] +str q21, [x0, #1440] +ldr q21, [x0, #944] +ldr q25, [x0, #688] +sqrdmulh z26.d, z29.d, z2.d[0] +sub z20.d, z21.d, z30.d +mla z22.d, P0/M, z24.d, z31.d +mul z29.d, z29.d,z3.d[0] +add z21.d, z21.d, z30.d +ldr q30, [x0, #176] +ldr q24, [x0, #432] +sqrdmulh z28.d, z23.d, z2.d[0] +sub z27.d, z25.d, z22.d +mla z29.d, P0/M, z26.d, z31.d +mul z23.d, z23.d,z3.d[0] +add z25.d, z25.d, z22.d +sqrdmulh z22.d, z21.d, z0.d[0] +sub z26.d, z30.d, z29.d +mla z23.d, P0/M, z28.d, z31.d +mul z21.d, z21.d,z1.d[0] +add z30.d, z30.d, z29.d +sqrdmulh z29.d, z25.d, z0.d[0] +sub z28.d, z24.d, z23.d +mla z21.d, P0/M, z22.d, z31.d +mul z25.d, z25.d,z1.d[0] +add z24.d, z24.d, z23.d +sqrdmulh z23.d, z20.d, z0.d[1] +sub z22.d, z24.d, z21.d +mla z25.d, P0/M, z29.d, z31.d +mul z20.d, z20.d,z1.d[1] +add z24.d, z24.d, z21.d +sqrdmulh z21.d, z27.d, z0.d[1] +sub z29.d, z30.d, z25.d +mla z20.d, P0/M, z23.d, z31.d +mul z27.d, z27.d,z1.d[1] +add z30.d, z30.d, z25.d +sqrdmulh z25.d, z24.d, z14.d[0] +sub z23.d, z28.d, z20.d +mla z27.d, P0/M, z21.d, z31.d +mul z24.d, z24.d,z15.d[0] +add z28.d, z28.d, z20.d +sqrdmulh z20.d, z22.d, z14.d[1] +sub z21.d, z26.d, z27.d +mla z24.d, P0/M, z25.d, z31.d +mul z22.d, z22.d,z15.d[1] +add z26.d, z26.d, z27.d +sqrdmulh z27.d, z23.d, z12.d[1] +sub z25.d, z30.d, z24.d +mla z22.d, P0/M, z20.d, z31.d +mul z23.d, z23.d,z13.d[1] +add z30.d, z30.d, z24.d +str q30, [x0, #176] +str q25, [x0, #432] +sqrdmulh z25.d, z28.d, z12.d[0] +sub z30.d, z29.d, z22.d +mla z23.d, P0/M, z27.d, z31.d +mul z28.d, 
z28.d,z13.d[0] +add z29.d, z29.d, z22.d +str q29, [x0, #688] +str q30, [x0, #944] +ldr q30, [x0, #1984] +ldr q29, [x0, #1728] +sqrdmulh z22.d, z30.d, z2.d[0] +sub z27.d, z21.d, z23.d +mla z28.d, P0/M, z25.d, z31.d +mul z30.d, z30.d,z3.d[0] +add z21.d, z21.d, z23.d +str q21, [x0, #1712] +str q27, [x0, #1968] +ldr q27, [x0, #1216] +ldr q21, [x0, #1472] +sqrdmulh z23.d, z29.d, z2.d[0] +sub z25.d, z26.d, z28.d +mla z30.d, P0/M, z22.d, z31.d +mul z29.d, z29.d,z3.d[0] +add z26.d, z26.d, z28.d +str q26, [x0, #1200] +str q25, [x0, #1456] +ldr q25, [x0, #960] +ldr q26, [x0, #704] +sqrdmulh z28.d, z27.d, z2.d[0] +sub z22.d, z25.d, z30.d +mla z29.d, P0/M, z23.d, z31.d +mul z27.d, z27.d,z3.d[0] +add z25.d, z25.d, z30.d +ldr q30, [x0, #192] +ldr q23, [x0, #448] +sqrdmulh z24.d, z21.d, z2.d[0] +sub z20.d, z26.d, z29.d +mla z27.d, P0/M, z28.d, z31.d +mul z21.d, z21.d,z3.d[0] +add z26.d, z26.d, z29.d +sqrdmulh z29.d, z25.d, z0.d[0] +sub z28.d, z30.d, z27.d +mla z21.d, P0/M, z24.d, z31.d +mul z25.d, z25.d,z1.d[0] +add z30.d, z30.d, z27.d +sqrdmulh z27.d, z26.d, z0.d[0] +sub z24.d, z23.d, z21.d +mla z25.d, P0/M, z29.d, z31.d +mul z26.d, z26.d,z1.d[0] +add z23.d, z23.d, z21.d +sqrdmulh z21.d, z22.d, z0.d[1] +sub z29.d, z23.d, z25.d +mla z26.d, P0/M, z27.d, z31.d +mul z22.d, z22.d,z1.d[1] +add z23.d, z23.d, z25.d +sqrdmulh z25.d, z20.d, z0.d[1] +sub z27.d, z30.d, z26.d +mla z22.d, P0/M, z21.d, z31.d +mul z20.d, z20.d,z1.d[1] +add z30.d, z30.d, z26.d +sqrdmulh z26.d, z23.d, z14.d[0] +sub z21.d, z24.d, z22.d +mla z20.d, P0/M, z25.d, z31.d +mul z23.d, z23.d,z15.d[0] +add z24.d, z24.d, z22.d +sqrdmulh z22.d, z29.d, z14.d[1] +sub z25.d, z28.d, z20.d +mla z23.d, P0/M, z26.d, z31.d +mul z29.d, z29.d,z15.d[1] +add z28.d, z28.d, z20.d +sqrdmulh z20.d, z21.d, z12.d[1] +sub z26.d, z30.d, z23.d +mla z29.d, P0/M, z22.d, z31.d +mul z21.d, z21.d,z13.d[1] +add z30.d, z30.d, z23.d +str q30, [x0, #192] +str q26, [x0, #448] +sqrdmulh z26.d, z24.d, z12.d[0] +sub z30.d, z27.d, z29.d +mla z21.d, P0/M, 
z20.d, z31.d +mul z24.d, z24.d,z13.d[0] +add z27.d, z27.d, z29.d +str q27, [x0, #704] +str q30, [x0, #960] +ldr q30, [x0, #2000] +ldr q27, [x0, #1744] +sqrdmulh z29.d, z30.d, z2.d[0] +sub z20.d, z25.d, z21.d +mla z24.d, P0/M, z26.d, z31.d +mul z30.d, z30.d,z3.d[0] +add z25.d, z25.d, z21.d +str q25, [x0, #1728] +str q20, [x0, #1984] +ldr q20, [x0, #1232] +ldr q25, [x0, #1488] +sqrdmulh z21.d, z27.d, z2.d[0] +sub z26.d, z28.d, z24.d +mla z30.d, P0/M, z29.d, z31.d +mul z27.d, z27.d,z3.d[0] +add z28.d, z28.d, z24.d +str q28, [x0, #1216] +str q26, [x0, #1472] +ldr q26, [x0, #976] +ldr q28, [x0, #720] +sqrdmulh z24.d, z20.d, z2.d[0] +sub z29.d, z26.d, z30.d +mla z27.d, P0/M, z21.d, z31.d +mul z20.d, z20.d,z3.d[0] +add z26.d, z26.d, z30.d +ldr q30, [x0, #208] +ldr q21, [x0, #464] +sqrdmulh z23.d, z25.d, z2.d[0] +sub z22.d, z28.d, z27.d +mla z20.d, P0/M, z24.d, z31.d +mul z25.d, z25.d,z3.d[0] +add z28.d, z28.d, z27.d +sqrdmulh z27.d, z26.d, z0.d[0] +sub z24.d, z30.d, z20.d +mla z25.d, P0/M, z23.d, z31.d +mul z26.d, z26.d,z1.d[0] +add z30.d, z30.d, z20.d +sqrdmulh z20.d, z28.d, z0.d[0] +sub z23.d, z21.d, z25.d +mla z26.d, P0/M, z27.d, z31.d +mul z28.d, z28.d,z1.d[0] +add z21.d, z21.d, z25.d +sqrdmulh z25.d, z29.d, z0.d[1] +sub z27.d, z21.d, z26.d +mla z28.d, P0/M, z20.d, z31.d +mul z29.d, z29.d,z1.d[1] +add z21.d, z21.d, z26.d +sqrdmulh z26.d, z22.d, z0.d[1] +sub z20.d, z30.d, z28.d +mla z29.d, P0/M, z25.d, z31.d +mul z22.d, z22.d,z1.d[1] +add z30.d, z30.d, z28.d +sqrdmulh z28.d, z21.d, z14.d[0] +sub z25.d, z23.d, z29.d +mla z22.d, P0/M, z26.d, z31.d +mul z21.d, z21.d,z15.d[0] +add z23.d, z23.d, z29.d +sqrdmulh z29.d, z27.d, z14.d[1] +sub z26.d, z24.d, z22.d +mla z21.d, P0/M, z28.d, z31.d +mul z27.d, z27.d,z15.d[1] +add z24.d, z24.d, z22.d +sqrdmulh z22.d, z25.d, z12.d[1] +sub z28.d, z30.d, z21.d +mla z27.d, P0/M, z29.d, z31.d +mul z25.d, z25.d,z13.d[1] +add z30.d, z30.d, z21.d +str q30, [x0, #208] +str q28, [x0, #464] +sqrdmulh z28.d, z23.d, z12.d[0] +sub z30.d, z20.d, 
z27.d +mla z25.d, P0/M, z22.d, z31.d +mul z23.d, z23.d,z13.d[0] +add z20.d, z20.d, z27.d +str q20, [x0, #720] +str q30, [x0, #976] +ldr q30, [x0, #2016] +ldr q20, [x0, #1760] +sqrdmulh z27.d, z30.d, z2.d[0] +sub z22.d, z26.d, z25.d +mla z23.d, P0/M, z28.d, z31.d +mul z30.d, z30.d,z3.d[0] +add z26.d, z26.d, z25.d +str q26, [x0, #1744] +str q22, [x0, #2000] +ldr q22, [x0, #1248] +ldr q26, [x0, #1504] +sqrdmulh z25.d, z20.d, z2.d[0] +sub z28.d, z24.d, z23.d +mla z30.d, P0/M, z27.d, z31.d +mul z20.d, z20.d,z3.d[0] +add z24.d, z24.d, z23.d +str q24, [x0, #1232] +str q28, [x0, #1488] +ldr q28, [x0, #992] +ldr q24, [x0, #736] +sqrdmulh z23.d, z22.d, z2.d[0] +sub z27.d, z28.d, z30.d +mla z20.d, P0/M, z25.d, z31.d +mul z22.d, z22.d,z3.d[0] +add z28.d, z28.d, z30.d +ldr q30, [x0, #224] +ldr q25, [x0, #480] +sqrdmulh z21.d, z26.d, z2.d[0] +sub z29.d, z24.d, z20.d +mla z22.d, P0/M, z23.d, z31.d +mul z26.d, z26.d,z3.d[0] +add z24.d, z24.d, z20.d +sqrdmulh z20.d, z28.d, z0.d[0] +sub z23.d, z30.d, z22.d +mla z26.d, P0/M, z21.d, z31.d +mul z28.d, z28.d,z1.d[0] +add z30.d, z30.d, z22.d +sqrdmulh z22.d, z24.d, z0.d[0] +sub z21.d, z25.d, z26.d +mla z28.d, P0/M, z20.d, z31.d +mul z24.d, z24.d,z1.d[0] +add z25.d, z25.d, z26.d +sqrdmulh z26.d, z27.d, z0.d[1] +sub z20.d, z25.d, z28.d +mla z24.d, P0/M, z22.d, z31.d +mul z27.d, z27.d,z1.d[1] +add z25.d, z25.d, z28.d +sqrdmulh z28.d, z29.d, z0.d[1] +sub z22.d, z30.d, z24.d +mla z27.d, P0/M, z26.d, z31.d +mul z29.d, z29.d,z1.d[1] +add z30.d, z30.d, z24.d +sqrdmulh z24.d, z25.d, z14.d[0] +sub z26.d, z21.d, z27.d +mla z29.d, P0/M, z28.d, z31.d +mul z25.d, z25.d,z15.d[0] +add z21.d, z21.d, z27.d +sqrdmulh z27.d, z20.d, z14.d[1] +sub z28.d, z23.d, z29.d +mla z25.d, P0/M, z24.d, z31.d +mul z20.d, z20.d,z15.d[1] +add z23.d, z23.d, z29.d +sqrdmulh z29.d, z26.d, z12.d[1] +sub z24.d, z30.d, z25.d +mla z20.d, P0/M, z27.d, z31.d +mul z26.d, z26.d,z13.d[1] +add z30.d, z30.d, z25.d +str q30, [x0, #224] +str q24, [x0, #480] +sqrdmulh z24.d, z21.d, 
z12.d[0] +sub z30.d, z22.d, z20.d +mla z26.d, P0/M, z29.d, z31.d +mul z21.d, z21.d,z13.d[0] +add z22.d, z22.d, z20.d +str q22, [x0, #736] +str q30, [x0, #992] +ldr q30, [x0, #2032] +ldr q22, [x0, #1776] +sqrdmulh z20.d, z30.d, z2.d[0] +sub z29.d, z28.d, z26.d +mla z21.d, P0/M, z24.d, z31.d +mul z30.d, z30.d,z3.d[0] +add z28.d, z28.d, z26.d +str q28, [x0, #1760] +str q29, [x0, #2016] +ldr q29, [x0, #1264] +ldr q28, [x0, #1520] +sqrdmulh z26.d, z22.d, z2.d[0] +sub z24.d, z23.d, z21.d +mla z30.d, P0/M, z20.d, z31.d +mul z22.d, z22.d,z3.d[0] +add z23.d, z23.d, z21.d +str q23, [x0, #1248] +str q24, [x0, #1504] +ldr q24, [x0, #1008] +ldr q23, [x0, #752] +sqrdmulh z21.d, z29.d, z2.d[0] +sub z20.d, z24.d, z30.d +mla z22.d, P0/M, z26.d, z31.d +mul z29.d, z29.d,z3.d[0] +add z24.d, z24.d, z30.d +ldr q30, [x0, #240] +ldr q26, [x0, #496] +sqrdmulh z25.d, z28.d, z2.d[0] +sub z27.d, z23.d, z22.d +mla z29.d, P0/M, z21.d, z31.d +mul z28.d, z28.d,z3.d[0] +add z23.d, z23.d, z22.d +sqrdmulh z22.d, z24.d, z0.d[0] +sub z21.d, z30.d, z29.d +mla z28.d, P0/M, z25.d, z31.d +mul z24.d, z24.d,z1.d[0] +add z30.d, z30.d, z29.d +sqrdmulh z29.d, z23.d, z0.d[0] +sub z25.d, z26.d, z28.d +mla z24.d, P0/M, z22.d, z31.d +mul z23.d, z23.d,z1.d[0] +add z26.d, z26.d, z28.d +sqrdmulh z28.d, z20.d, z0.d[1] +sub z22.d, z26.d, z24.d +mla z23.d, P0/M, z29.d, z31.d +mul z20.d, z20.d,z1.d[1] +add z26.d, z26.d, z24.d +sqrdmulh z24.d, z27.d, z0.d[1] +sub z29.d, z30.d, z23.d +mla z20.d, P0/M, z28.d, z31.d +mul z27.d, z27.d,z1.d[1] +add z30.d, z30.d, z23.d +sqrdmulh z23.d, z26.d, z14.d[0] +sub z28.d, z25.d, z20.d +mla z27.d, P0/M, z24.d, z31.d +mul z26.d, z26.d,z15.d[0] +add z25.d, z25.d, z20.d +sqrdmulh z20.d, z22.d, z14.d[1] +sub z24.d, z21.d, z27.d +mla z26.d, P0/M, z23.d, z31.d +mul z22.d, z22.d,z15.d[1] +add z21.d, z21.d, z27.d +sqrdmulh z27.d, z28.d, z12.d[1] +sub z23.d, z30.d, z26.d +mla z22.d, P0/M, z20.d, z31.d +mul z28.d, z28.d,z13.d[1] +add z30.d, z30.d, z26.d +str q30, [x0, #240] +str q23, [x0, #496] 
+sqrdmulh z23.d, z25.d, z12.d[0] +sub z30.d, z29.d, z22.d +mla z28.d, P0/M, z27.d, z31.d +mul z25.d, z25.d,z13.d[0] +add z29.d, z29.d, z22.d +str q29, [x0, #752] +str q30, [x0, #1008] +ldr q30, [x0, #1792] +ldr q29, [x0, #1536] +sqrdmulh z22.d, z30.d, z2.d[0] +sub z27.d, z24.d, z28.d +mla z25.d, P0/M, z23.d, z31.d +mul z30.d, z30.d,z3.d[0] +add z24.d, z24.d, z28.d +str q24, [x0, #1776] +str q27, [x0, #2032] +ldr q27, [x0, #1024] +ldr q24, [x0, #1280] +sqrdmulh z28.d, z29.d, z2.d[0] +sub z23.d, z21.d, z25.d +mla z30.d, P0/M, z22.d, z31.d +mul z29.d, z29.d,z3.d[0] +add z21.d, z21.d, z25.d +str q21, [x0, #1264] +str q23, [x0, #1520] +ldr q23, [x0, #768] +ldr q21, [x0, #512] +sqrdmulh z25.d, z27.d, z2.d[0] +sub z22.d, z23.d, z30.d +mla z29.d, P0/M, z28.d, z31.d +mul z27.d, z27.d,z3.d[0] +add z23.d, z23.d, z30.d +ldr q30, [x0, #0] +ldr q28, [x0, #256] +sqrdmulh z26.d, z24.d, z2.d[0] +sub z20.d, z21.d, z29.d +mla z27.d, P0/M, z25.d, z31.d +mul z24.d, z24.d,z3.d[0] +add z21.d, z21.d, z29.d +sqrdmulh z29.d, z23.d, z0.d[0] +sub z25.d, z30.d, z27.d +mla z24.d, P0/M, z26.d, z31.d +mul z23.d, z23.d,z1.d[0] +add z30.d, z30.d, z27.d +sqrdmulh z27.d, z21.d, z0.d[0] +sub z26.d, z28.d, z24.d +mla z23.d, P0/M, z29.d, z31.d +mul z21.d, z21.d,z1.d[0] +add z28.d, z28.d, z24.d +sqrdmulh z24.d, z22.d, z0.d[1] +sub z29.d, z28.d, z23.d +mla z21.d, P0/M, z27.d, z31.d +mul z22.d, z22.d,z1.d[1] +add z28.d, z28.d, z23.d +sqrdmulh z23.d, z20.d, z0.d[1] +sub z27.d, z30.d, z21.d +mla z22.d, P0/M, z24.d, z31.d +mul z20.d, z20.d,z1.d[1] +add z30.d, z30.d, z21.d +sqrdmulh z21.d, z28.d, z14.d[0] +sub z24.d, z26.d, z22.d +mla z20.d, P0/M, z23.d, z31.d +mul z28.d, z28.d,z15.d[0] +add z26.d, z26.d, z22.d +sqrdmulh z22.d, z29.d, z14.d[1] +sub z23.d, z25.d, z20.d +mla z28.d, P0/M, z21.d, z31.d +mul z29.d, z29.d,z15.d[1] +add z25.d, z25.d, z20.d +sqrdmulh z20.d, z24.d, z12.d[1] +sub z21.d, z30.d, z28.d +mla z29.d, P0/M, z22.d, z31.d +mul z24.d, z24.d,z13.d[1] +add z30.d, z30.d, z28.d +str q30, [x0, #0] 
+str q21, [x0, #256] +sqrdmulh z21.d, z26.d, z12.d[0] +sub z30.d, z27.d, z29.d +mla z24.d, P0/M, z20.d, z31.d +mul z26.d, z26.d,z13.d[0] +add z27.d, z27.d, z29.d +str q27, [x0, #512] +str q30, [x0, #768] +ldr q30, [x0, #1808] +ldr q27, [x0, #1552] +sqrdmulh z29.d, z30.d, z2.d[0] +sub z20.d, z23.d, z24.d +mla z26.d, P0/M, z21.d, z31.d +mul z30.d, z30.d,z3.d[0] +add z23.d, z23.d, z24.d +str q23, [x0, #1536] +str q20, [x0, #1792] +ldr q20, [x0, #1040] +ldr q23, [x0, #1296] +sqrdmulh z24.d, z27.d, z2.d[0] +sub z21.d, z25.d, z26.d +mla z30.d, P0/M, z29.d, z31.d +mul z27.d, z27.d,z3.d[0] +add z25.d, z25.d, z26.d +str q25, [x0, #1024] +str q21, [x0, #1280] +ldr q21, [x0, #784] +ldr q25, [x0, #528] +sqrdmulh z26.d, z20.d, z2.d[0] +sub z29.d, z21.d, z30.d +mla z27.d, P0/M, z24.d, z31.d +mul z20.d, z20.d,z3.d[0] +add z21.d, z21.d, z30.d +ldr q30, [x0, #16] +ldr q24, [x0, #272] +sqrdmulh z28.d, z23.d, z2.d[0] +sub z22.d, z25.d, z27.d +mla z20.d, P0/M, z26.d, z31.d +mul z23.d, z23.d,z3.d[0] +add z25.d, z25.d, z27.d +sqrdmulh z27.d, z21.d, z0.d[0] +sub z26.d, z30.d, z20.d +mla z23.d, P0/M, z28.d, z31.d +mul z21.d, z21.d,z1.d[0] +add z30.d, z30.d, z20.d +sqrdmulh z20.d, z25.d, z0.d[0] +sub z28.d, z24.d, z23.d +mla z21.d, P0/M, z27.d, z31.d +mul z25.d, z25.d,z1.d[0] +add z24.d, z24.d, z23.d +sqrdmulh z23.d, z29.d, z0.d[1] +sub z27.d, z24.d, z21.d +mla z25.d, P0/M, z20.d, z31.d +mul z29.d, z29.d,z1.d[1] +add z24.d, z24.d, z21.d +sqrdmulh z21.d, z22.d, z0.d[1] +sub z20.d, z30.d, z25.d +mla z29.d, P0/M, z23.d, z31.d +mul z22.d, z22.d,z1.d[1] +add z30.d, z30.d, z25.d +sqrdmulh z25.d, z24.d, z14.d[0] +sub z23.d, z28.d, z29.d +mla z22.d, P0/M, z21.d, z31.d +mul z24.d, z24.d,z15.d[0] +add z28.d, z28.d, z29.d +sqrdmulh z29.d, z27.d, z14.d[1] +sub z21.d, z26.d, z22.d +mla z24.d, P0/M, z25.d, z31.d +mul z27.d, z27.d,z15.d[1] +add z26.d, z26.d, z22.d +sqrdmulh z22.d, z23.d, z12.d[1] +sub z25.d, z30.d, z24.d +mla z27.d, P0/M, z29.d, z31.d +mul z23.d, z23.d,z13.d[1] +add z30.d, z30.d, z24.d 
+str q30, [x0, #16] +str q25, [x0, #272] +sqrdmulh z25.d, z28.d, z12.d[0] +sub z30.d, z20.d, z27.d +mla z23.d, P0/M, z22.d, z31.d +mul z28.d, z28.d,z13.d[0] +add z20.d, z20.d, z27.d +str q20, [x0, #528] +str q30, [x0, #784] +ldr q30, [x0, #1824] +ldr q20, [x0, #1568] +sqrdmulh z27.d, z30.d, z2.d[0] +sub z22.d, z21.d, z23.d +mla z28.d, P0/M, z25.d, z31.d +mul z30.d, z30.d,z3.d[0] +add z21.d, z21.d, z23.d +str q21, [x0, #1552] +str q22, [x0, #1808] +ldr q22, [x0, #1056] +ldr q21, [x0, #1312] +sqrdmulh z23.d, z20.d, z2.d[0] +sub z25.d, z26.d, z28.d +mla z30.d, P0/M, z27.d, z31.d +mul z20.d, z20.d,z3.d[0] +add z26.d, z26.d, z28.d +str q26, [x0, #1040] +str q25, [x0, #1296] +ldr q25, [x0, #800] +ldr q26, [x0, #544] +sqrdmulh z28.d, z22.d, z2.d[0] +sub z27.d, z25.d, z30.d +mla z20.d, P0/M, z23.d, z31.d +mul z22.d, z22.d,z3.d[0] +add z25.d, z25.d, z30.d +ldr q30, [x0, #32] +ldr q23, [x0, #288] +sqrdmulh z24.d, z21.d, z2.d[0] +sub z29.d, z26.d, z20.d +mla z22.d, P0/M, z28.d, z31.d +mul z21.d, z21.d,z3.d[0] +add z26.d, z26.d, z20.d +sqrdmulh z20.d, z25.d, z0.d[0] +sub z28.d, z30.d, z22.d +mla z21.d, P0/M, z24.d, z31.d +mul z25.d, z25.d,z1.d[0] +add z30.d, z30.d, z22.d +sqrdmulh z22.d, z26.d, z0.d[0] +sub z24.d, z23.d, z21.d +mla z25.d, P0/M, z20.d, z31.d +mul z26.d, z26.d,z1.d[0] +add z23.d, z23.d, z21.d +sqrdmulh z21.d, z27.d, z0.d[1] +sub z20.d, z23.d, z25.d +mla z26.d, P0/M, z22.d, z31.d +mul z27.d, z27.d,z1.d[1] +add z23.d, z23.d, z25.d +sqrdmulh z25.d, z29.d, z0.d[1] +sub z22.d, z30.d, z26.d +mla z27.d, P0/M, z21.d, z31.d +mul z29.d, z29.d,z1.d[1] +add z30.d, z30.d, z26.d +sqrdmulh z26.d, z23.d, z14.d[0] +sub z21.d, z24.d, z27.d +mla z29.d, P0/M, z25.d, z31.d +mul z23.d, z23.d,z15.d[0] +add z24.d, z24.d, z27.d +sqrdmulh z27.d, z20.d, z14.d[1] +sub z25.d, z28.d, z29.d +mla z23.d, P0/M, z26.d, z31.d +mul z20.d, z20.d,z15.d[1] +add z28.d, z28.d, z29.d +sqrdmulh z29.d, z21.d, z12.d[1] +sub z26.d, z30.d, z23.d +mla z20.d, P0/M, z27.d, z31.d +mul z21.d, z21.d,z13.d[1] +add 
z30.d, z30.d, z23.d +str q30, [x0, #32] +str q26, [x0, #288] +sqrdmulh z26.d, z24.d, z12.d[0] +sub z30.d, z22.d, z20.d +mla z21.d, P0/M, z29.d, z31.d +mul z24.d, z24.d,z13.d[0] +add z22.d, z22.d, z20.d +str q22, [x0, #544] +str q30, [x0, #800] +ldr q30, [x0, #1840] +ldr q22, [x0, #1584] +sqrdmulh z20.d, z30.d, z2.d[0] +sub z29.d, z25.d, z21.d +mla z24.d, P0/M, z26.d, z31.d +mul z30.d, z30.d,z3.d[0] +add z25.d, z25.d, z21.d +str q25, [x0, #1568] +str q29, [x0, #1824] +ldr q29, [x0, #1072] +ldr q25, [x0, #1328] +sqrdmulh z21.d, z22.d, z2.d[0] +sub z26.d, z28.d, z24.d +mla z30.d, P0/M, z20.d, z31.d +mul z22.d, z22.d,z3.d[0] +add z28.d, z28.d, z24.d +str q28, [x0, #1056] +str q26, [x0, #1312] +ldr q26, [x0, #816] +ldr q28, [x0, #560] +sqrdmulh z24.d, z29.d, z2.d[0] +sub z20.d, z26.d, z30.d +mla z22.d, P0/M, z21.d, z31.d +mul z29.d, z29.d,z3.d[0] +add z26.d, z26.d, z30.d +ldr q30, [x0, #48] +ldr q21, [x0, #304] +sqrdmulh z23.d, z25.d, z2.d[0] +sub z27.d, z28.d, z22.d +mla z29.d, P0/M, z24.d, z31.d +mul z25.d, z25.d,z3.d[0] +add z28.d, z28.d, z22.d +sqrdmulh z22.d, z26.d, z0.d[0] +sub z24.d, z30.d, z29.d +mla z25.d, P0/M, z23.d, z31.d +mul z26.d, z26.d,z1.d[0] +add z30.d, z30.d, z29.d +sqrdmulh z29.d, z28.d, z0.d[0] +sub z23.d, z21.d, z25.d +mla z26.d, P0/M, z22.d, z31.d +mul z28.d, z28.d,z1.d[0] +add z21.d, z21.d, z25.d +sqrdmulh z25.d, z20.d, z0.d[1] +sub z22.d, z21.d, z26.d +mla z28.d, P0/M, z29.d, z31.d +mul z20.d, z20.d,z1.d[1] +add z21.d, z21.d, z26.d +sqrdmulh z26.d, z27.d, z0.d[1] +sub z29.d, z30.d, z28.d +mla z20.d, P0/M, z25.d, z31.d +mul z27.d, z27.d,z1.d[1] +add z30.d, z30.d, z28.d +sqrdmulh z28.d, z21.d, z14.d[0] +sub z25.d, z23.d, z20.d +mla z27.d, P0/M, z26.d, z31.d +mul z21.d, z21.d,z15.d[0] +add z23.d, z23.d, z20.d +sqrdmulh z20.d, z22.d, z14.d[1] +sub z26.d, z24.d, z27.d +mla z21.d, P0/M, z28.d, z31.d +mul z22.d, z22.d,z15.d[1] +add z24.d, z24.d, z27.d +sqrdmulh z27.d, z25.d, z12.d[1] +sub z28.d, z30.d, z21.d +mla z22.d, P0/M, z20.d, z31.d +mul z25.d, 
z25.d,z13.d[1] +add z30.d, z30.d, z21.d +str q30, [x0, #48] +str q28, [x0, #304] +sqrdmulh z28.d, z23.d, z12.d[0] +sub z30.d, z29.d, z22.d +mla z25.d, P0/M, z27.d, z31.d +mul z23.d, z23.d,z13.d[0] +add z29.d, z29.d, z22.d +str q29, [x0, #560] +str q30, [x0, #816] +ldr q30, [x0, #1856] +ldr q29, [x0, #1600] +sqrdmulh z22.d, z30.d, z2.d[0] +sub z27.d, z26.d, z25.d +mla z23.d, P0/M, z28.d, z31.d +mul z30.d, z30.d,z3.d[0] +add z26.d, z26.d, z25.d +str q26, [x0, #1584] +str q27, [x0, #1840] +ldr q27, [x0, #1088] +ldr q26, [x0, #1344] +sqrdmulh z25.d, z29.d, z2.d[0] +sub z28.d, z24.d, z23.d +mla z30.d, P0/M, z22.d, z31.d +mul z29.d, z29.d,z3.d[0] +add z24.d, z24.d, z23.d +str q24, [x0, #1072] +str q28, [x0, #1328] +ldr q28, [x0, #832] +ldr q24, [x0, #576] +sqrdmulh z23.d, z27.d, z2.d[0] +sub z22.d, z28.d, z30.d +mla z29.d, P0/M, z25.d, z31.d +mul z27.d, z27.d,z3.d[0] +add z28.d, z28.d, z30.d +ldr q30, [x0, #64] +ldr q25, [x0, #320] +sqrdmulh z21.d, z26.d, z2.d[0] +sub z20.d, z24.d, z29.d +mla z27.d, P0/M, z23.d, z31.d +mul z26.d, z26.d,z3.d[0] +add z24.d, z24.d, z29.d +sqrdmulh z29.d, z28.d, z0.d[0] +sub z23.d, z30.d, z27.d +mla z26.d, P0/M, z21.d, z31.d +mul z28.d, z28.d,z1.d[0] +add z30.d, z30.d, z27.d +sqrdmulh z27.d, z24.d, z0.d[0] +sub z21.d, z25.d, z26.d +mla z28.d, P0/M, z29.d, z31.d +mul z24.d, z24.d,z1.d[0] +add z25.d, z25.d, z26.d +sqrdmulh z26.d, z22.d, z0.d[1] +sub z29.d, z25.d, z28.d +mla z24.d, P0/M, z27.d, z31.d +mul z22.d, z22.d,z1.d[1] +add z25.d, z25.d, z28.d +sqrdmulh z28.d, z20.d, z0.d[1] +sub z27.d, z30.d, z24.d +mla z22.d, P0/M, z26.d, z31.d +mul z20.d, z20.d,z1.d[1] +add z30.d, z30.d, z24.d +sqrdmulh z24.d, z25.d, z14.d[0] +sub z26.d, z21.d, z22.d +mla z20.d, P0/M, z28.d, z31.d +mul z25.d, z25.d,z15.d[0] +add z21.d, z21.d, z22.d +sqrdmulh z22.d, z29.d, z14.d[1] +sub z28.d, z23.d, z20.d +mla z25.d, P0/M, z24.d, z31.d +mul z29.d, z29.d,z15.d[1] +add z23.d, z23.d, z20.d +sqrdmulh z20.d, z26.d, z12.d[1] +sub z24.d, z30.d, z25.d +mla z29.d, P0/M, z22.d, 
z31.d +mul z26.d, z26.d,z13.d[1] +add z30.d, z30.d, z25.d +str q30, [x0, #64] +str q24, [x0, #320] +sqrdmulh z24.d, z21.d, z12.d[0] +sub z30.d, z27.d, z29.d +mla z26.d, P0/M, z20.d, z31.d +mul z21.d, z21.d,z13.d[0] +add z27.d, z27.d, z29.d +str q27, [x0, #576] +str q30, [x0, #832] +ldr q30, [x0, #1872] +ldr q27, [x0, #1616] +sqrdmulh z29.d, z30.d, z2.d[0] +sub z20.d, z28.d, z26.d +mla z21.d, P0/M, z24.d, z31.d +mul z30.d, z30.d,z3.d[0] +add z28.d, z28.d, z26.d +str q28, [x0, #1600] +str q20, [x0, #1856] +ldr q20, [x0, #1104] +ldr q28, [x0, #1360] +sqrdmulh z26.d, z27.d, z2.d[0] +sub z24.d, z23.d, z21.d +mla z30.d, P0/M, z29.d, z31.d +mul z27.d, z27.d,z3.d[0] +add z23.d, z23.d, z21.d +str q23, [x0, #1088] +str q24, [x0, #1344] +ldr q24, [x0, #848] +ldr q23, [x0, #592] +sqrdmulh z21.d, z20.d, z2.d[0] +sub z29.d, z24.d, z30.d +mla z27.d, P0/M, z26.d, z31.d +mul z20.d, z20.d,z3.d[0] +add z24.d, z24.d, z30.d +ldr q30, [x0, #80] +ldr q26, [x0, #336] +sqrdmulh z25.d, z28.d, z2.d[0] +sub z22.d, z23.d, z27.d +mla z20.d, P0/M, z21.d, z31.d +mul z28.d, z28.d,z3.d[0] +add z23.d, z23.d, z27.d +sqrdmulh z27.d, z24.d, z0.d[0] +sub z21.d, z30.d, z20.d +mla z28.d, P0/M, z25.d, z31.d +mul z24.d, z24.d,z1.d[0] +add z30.d, z30.d, z20.d +sqrdmulh z20.d, z23.d, z0.d[0] +sub z25.d, z26.d, z28.d +mla z24.d, P0/M, z27.d, z31.d +mul z23.d, z23.d,z1.d[0] +add z26.d, z26.d, z28.d +sqrdmulh z28.d, z29.d, z0.d[1] +sub z27.d, z26.d, z24.d +mla z23.d, P0/M, z20.d, z31.d +mul z29.d, z29.d,z1.d[1] +add z26.d, z26.d, z24.d +sqrdmulh z24.d, z22.d, z0.d[1] +sub z20.d, z30.d, z23.d +mla z29.d, P0/M, z28.d, z31.d +mul z22.d, z22.d,z1.d[1] +add z30.d, z30.d, z23.d +sqrdmulh z23.d, z26.d, z14.d[0] +sub z28.d, z25.d, z29.d +mla z22.d, P0/M, z24.d, z31.d +mul z26.d, z26.d,z15.d[0] +add z25.d, z25.d, z29.d +sqrdmulh z29.d, z27.d, z14.d[1] +sub z24.d, z21.d, z22.d +mla z26.d, P0/M, z23.d, z31.d +mul z27.d, z27.d,z15.d[1] +add z21.d, z21.d, z22.d +sqrdmulh z22.d, z28.d, z12.d[1] +sub z23.d, z30.d, z26.d +mla 
z27.d, P0/M, z29.d, z31.d +mul z28.d, z28.d,z13.d[1] +add z30.d, z30.d, z26.d +str q30, [x0, #80] +str q23, [x0, #336] +sqrdmulh z23.d, z25.d, z12.d[0] +sub z30.d, z20.d, z27.d +mla z28.d, P0/M, z22.d, z31.d +mul z25.d, z25.d,z13.d[0] +add z20.d, z20.d, z27.d +str q20, [x0, #592] +str q30, [x0, #848] +ldr q30, [x0, #1888] +ldr q20, [x0, #1632] +sqrdmulh z27.d, z30.d, z2.d[0] +sub z22.d, z24.d, z28.d +mla z25.d, P0/M, z23.d, z31.d +mul z30.d, z30.d,z3.d[0] +add z24.d, z24.d, z28.d +str q24, [x0, #1616] +str q22, [x0, #1872] +ldr q22, [x0, #1120] +ldr q24, [x0, #1376] +sqrdmulh z28.d, z20.d, z2.d[0] +sub z23.d, z21.d, z25.d +mla z30.d, P0/M, z27.d, z31.d +mul z20.d, z20.d,z3.d[0] +add z21.d, z21.d, z25.d +str q21, [x0, #1104] +str q23, [x0, #1360] +ldr q23, [x0, #864] +ldr q21, [x0, #608] +sqrdmulh z25.d, z22.d, z2.d[0] +sub z27.d, z23.d, z30.d +mla z20.d, P0/M, z28.d, z31.d +mul z22.d, z22.d,z3.d[0] +add z23.d, z23.d, z30.d +ldr q30, [x0, #96] +ldr q28, [x0, #352] +sqrdmulh z26.d, z24.d, z2.d[0] +sub z29.d, z21.d, z20.d +mla z22.d, P0/M, z25.d, z31.d +mul z24.d, z24.d,z3.d[0] +add z21.d, z21.d, z20.d +sqrdmulh z20.d, z23.d, z0.d[0] +sub z25.d, z30.d, z22.d +mla z24.d, P0/M, z26.d, z31.d +mul z23.d, z23.d,z1.d[0] +add z30.d, z30.d, z22.d +sqrdmulh z22.d, z21.d, z0.d[0] +sub z26.d, z28.d, z24.d +mla z23.d, P0/M, z20.d, z31.d +mul z21.d, z21.d,z1.d[0] +add z28.d, z28.d, z24.d +sqrdmulh z24.d, z27.d, z0.d[1] +sub z20.d, z28.d, z23.d +mla z21.d, P0/M, z22.d, z31.d +mul z27.d, z27.d,z1.d[1] +add z28.d, z28.d, z23.d +sqrdmulh z23.d, z29.d, z0.d[1] +sub z22.d, z30.d, z21.d +mla z27.d, P0/M, z24.d, z31.d +mul z29.d, z29.d,z1.d[1] +add z30.d, z30.d, z21.d +sqrdmulh z21.d, z28.d, z14.d[0] +sub z24.d, z26.d, z27.d +mla z29.d, P0/M, z23.d, z31.d +mul z28.d, z28.d,z15.d[0] +add z26.d, z26.d, z27.d +sqrdmulh z27.d, z20.d, z14.d[1] +sub z23.d, z25.d, z29.d +mla z28.d, P0/M, z21.d, z31.d +mul z20.d, z20.d,z15.d[1] +add z25.d, z25.d, z29.d +sqrdmulh z29.d, z24.d, z12.d[1] +sub z21.d, 
z30.d, z28.d +mla z20.d, P0/M, z27.d, z31.d +mul z24.d, z24.d,z13.d[1] +add z30.d, z30.d, z28.d +str q30, [x0, #96] +str q21, [x0, #352] +sqrdmulh z21.d, z26.d, z12.d[0] +sub z30.d, z22.d, z20.d +mla z24.d, P0/M, z29.d, z31.d +mul z26.d, z26.d,z13.d[0] +add z22.d, z22.d, z20.d +str q22, [x0, #608] +str q30, [x0, #864] +ldr q30, [x0, #1904] +ldr q22, [x0, #1648] +sqrdmulh z20.d, z30.d, z2.d[0] +sub z29.d, z23.d, z24.d +mla z26.d, P0/M, z21.d, z31.d +mul z30.d, z30.d,z3.d[0] +add z23.d, z23.d, z24.d +str q23, [x0, #1632] +str q29, [x0, #1888] +ldr q29, [x0, #1136] +ldr q23, [x0, #1392] +sqrdmulh z24.d, z22.d, z2.d[0] +sub z21.d, z25.d, z26.d +mla z30.d, P0/M, z20.d, z31.d +mul z22.d, z22.d,z3.d[0] +add z25.d, z25.d, z26.d +str q25, [x0, #1120] +str q21, [x0, #1376] +ldr q21, [x0, #880] +ldr q25, [x0, #624] +sqrdmulh z26.d, z29.d, z2.d[0] +sub z20.d, z21.d, z30.d +mla z22.d, P0/M, z24.d, z31.d +mul z29.d, z29.d,z3.d[0] +add z21.d, z21.d, z30.d +ldr q30, [x0, #112] +ldr q24, [x0, #368] +sqrdmulh z28.d, z23.d, z2.d[0] +sub z27.d, z25.d, z22.d +mla z29.d, P0/M, z26.d, z31.d +mul z23.d, z23.d,z3.d[0] +add z25.d, z25.d, z22.d +sqrdmulh z22.d, z21.d, z0.d[0] +sub z26.d, z30.d, z29.d +mla z23.d, P0/M, z28.d, z31.d +mul z21.d, z21.d,z1.d[0] +add z30.d, z30.d, z29.d +sqrdmulh z29.d, z25.d, z0.d[0] +sub z28.d, z24.d, z23.d +mla z21.d, P0/M, z22.d, z31.d +mul z25.d, z25.d,z1.d[0] +add z24.d, z24.d, z23.d +sqrdmulh z23.d, z20.d, z0.d[1] +sub z22.d, z24.d, z21.d +mla z25.d, P0/M, z29.d, z31.d +mul z20.d, z20.d,z1.d[1] +add z24.d, z24.d, z21.d +sqrdmulh z21.d, z27.d, z0.d[1] +sub z29.d, z30.d, z25.d +mla z20.d, P0/M, z23.d, z31.d +mul z27.d, z27.d,z1.d[1] +add z30.d, z30.d, z25.d +sqrdmulh z25.d, z24.d, z14.d[0] +sub z23.d, z28.d, z20.d +mla z27.d, P0/M, z21.d, z31.d +mul z24.d, z24.d,z15.d[0] +add z28.d, z28.d, z20.d +sqrdmulh z20.d, z22.d, z14.d[1] +sub z21.d, z26.d, z27.d +mla z24.d, P0/M, z25.d, z31.d +mul z22.d, z22.d,z15.d[1] +add z26.d, z26.d, z27.d +sqrdmulh z27.d, z23.d, 
z12.d[1] +sub z25.d, z30.d, z24.d +mla z22.d, P0/M, z20.d, z31.d +mul z23.d, z23.d,z13.d[1] +add z30.d, z30.d, z24.d +str q30, [x0, #112] +str q25, [x0, #368] +sqrdmulh z25.d, z28.d, z12.d[0] +sub z30.d, z29.d, z22.d +mla z23.d, P0/M, z27.d, z31.d +mul z28.d, z28.d,z13.d[0] +add z29.d, z29.d, z22.d +str q29, [x0, #624] +str q30, [x0, #880] +sub z30.d, z21.d, z23.d +mla z28.d, P0/M, z25.d, z31.d +add z21.d, z21.d, z23.d +str q21, [x0, #1648] +str q30, [x0, #1904] +sub z30.d, z26.d, z28.d +add z26.d, z26.d, z28.d +str q26, [x0, #1136] +str q30, [x0, #1392] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q6, [x17, #+160] +ldr q7, [x17, #+176] +ldr q8, [x17, #+192] +ldr q9, [x17, #+208] +ldr q10, [x17, #+224] +ldr q11, [x17, #+240] +ldr q16, [x0, #240] +ldr q17, [x0, #208] +sqrdmulh z18.d, z16.d, z5.d[0] +mul z16.d, z16.d,z4.d[0] +ldr q19, [x0, #144] +ldr q20, [x0, #176] +sqrdmulh z24.d, z17.d, z5.d[0] +mul z17.d, z17.d,z4.d[0] +mla z16.d, P0/M, z18.d, z31.d +ldr q18, [x0, #112] +ldr q27, [x0, #80] +sqrdmulh z22.d, z19.d, z5.d[0] +mul z19.d, z19.d,z4.d[0] +mla z17.d, P0/M, z24.d, z31.d +ldr q24, [x0, #16] +sub z29.d, z18.d, z16.d +ldr q25, [x0, #48] +add z18.d, z18.d, z16.d +sqrdmulh z16.d, z20.d, z5.d[0] +mul z20.d, z20.d,z4.d[0] +sub z23.d, z27.d, z17.d +mla z19.d, P0/M, z22.d, z31.d +add z27.d, z27.d, z17.d +sqrdmulh z17.d, z18.d, z7.d[0] +mul z18.d, z18.d,z6.d[0] +sub z22.d, z24.d, z19.d +mla z20.d, P0/M, z16.d, z31.d +add z24.d, z24.d, z19.d +sqrdmulh z19.d, z27.d, z7.d[0] +mul z27.d, z27.d,z6.d[0] +sub z16.d, z25.d, z20.d +mla z18.d, P0/M, z17.d, z31.d +add z25.d, z25.d, z20.d +sqrdmulh z20.d, z29.d, z7.d[1] +mul z29.d, z29.d,z6.d[1] +sub z17.d, z25.d, z18.d +mla z27.d, P0/M, z19.d, z31.d +add z25.d, z25.d, z18.d +sqrdmulh z18.d, z23.d, z7.d[1] +mul z23.d, z23.d,z6.d[1] +sub z19.d, z24.d, z27.d +mla z29.d, P0/M, z20.d, z31.d +add z24.d, z24.d, z27.d +sqrdmulh z27.d, z25.d, z9.d[0] +mul z25.d, z25.d,z8.d[0] +sub z20.d, z16.d, z29.d +mla z23.d, P0/M, z18.d, z31.d 
+add z16.d, z16.d, z29.d +sqrdmulh z29.d, z17.d, z9.d[1] +mul z17.d, z17.d,z8.d[1] +sub z18.d, z22.d, z23.d +mla z25.d, P0/M, z27.d, z31.d +add z22.d, z22.d, z23.d +sqrdmulh z23.d, z20.d, z11.d[1] +mul z20.d, z20.d,z10.d[1] +sub z27.d, z24.d, z25.d +mla z17.d, P0/M, z29.d, z31.d +add z24.d, z24.d, z25.d +sqrdmulh z25.d, z16.d, z11.d[0] +str q24, [x0, #16] +mul z16.d, z16.d,z10.d[0] +str q27, [x0, #48] +mla z20.d, P0/M, z23.d, z31.d +sub z23.d, z19.d, z17.d +ldr q27, [x0, #224] +ldr q24, [x0, #192] +add z19.d, z19.d, z17.d +sqrdmulh z17.d, z27.d, z5.d[0] +str q19, [x0, #80] +mul z27.d, z27.d,z4.d[0] +str q23, [x0, #112] +mla z16.d, P0/M, z25.d, z31.d +ldr q25, [x0, #128] +sub z23.d, z18.d, z20.d +ldr q19, [x0, #160] +add z18.d, z18.d, z20.d +sqrdmulh z20.d, z24.d, z5.d[0] +str q18, [x0, #208] +mul z24.d, z24.d,z4.d[0] +str q23, [x0, #240] +mla z27.d, P0/M, z17.d, z31.d +ldr q17, [x0, #96] +sub z23.d, z22.d, z16.d +ldr q18, [x0, #64] +add z22.d, z22.d, z16.d +sqrdmulh z16.d, z25.d, z5.d[0] +str q22, [x0, #144] +mul z25.d, z25.d,z4.d[0] +str q23, [x0, #176] +mla z24.d, P0/M, z20.d, z31.d +ldr q20, [x0, #0] +sub z23.d, z17.d, z27.d +ldr q22, [x0, #32] +add z17.d, z17.d, z27.d +sqrdmulh z27.d, z19.d, z5.d[0] +mul z19.d, z19.d,z4.d[0] +sub z29.d, z18.d, z24.d +mla z25.d, P0/M, z16.d, z31.d +add z18.d, z18.d, z24.d +sqrdmulh z24.d, z17.d, z7.d[0] +mul z17.d, z17.d,z6.d[0] +sub z16.d, z20.d, z25.d +mla z19.d, P0/M, z27.d, z31.d +add z20.d, z20.d, z25.d +sqrdmulh z25.d, z18.d, z7.d[0] +mul z18.d, z18.d,z6.d[0] +sub z27.d, z22.d, z19.d +mla z17.d, P0/M, z24.d, z31.d +add z22.d, z22.d, z19.d +sqrdmulh z19.d, z23.d, z7.d[1] +mul z23.d, z23.d,z6.d[1] +sub z24.d, z22.d, z17.d +mla z18.d, P0/M, z25.d, z31.d +add z22.d, z22.d, z17.d +ldr q3, [x17, #+256] +ldr q2, [x17, #+272] +ldr q1, [x17, #+288] +ldr q0, [x17, #+304] +ldr q15, [x17, #+320] +ldr q14, [x17, #+336] +ldr q13, [x17, #+352] +ldr q12, [x17, #+368] +sqrdmulh z17.d, z29.d, z7.d[1] +mul z29.d, z29.d,z6.d[1] +sub z25.d, 
z20.d, z18.d +mla z23.d, P0/M, z19.d, z31.d +add z20.d, z20.d, z18.d +sqrdmulh z18.d, z22.d, z9.d[0] +mul z22.d, z22.d,z8.d[0] +sub z19.d, z27.d, z23.d +mla z29.d, P0/M, z17.d, z31.d +add z27.d, z27.d, z23.d +sqrdmulh z23.d, z24.d, z9.d[1] +mul z24.d, z24.d,z8.d[1] +sub z17.d, z16.d, z29.d +mla z22.d, P0/M, z18.d, z31.d +add z16.d, z16.d, z29.d +sqrdmulh z29.d, z19.d, z11.d[1] +mul z19.d, z19.d,z10.d[1] +sub z18.d, z20.d, z22.d +mla z24.d, P0/M, z23.d, z31.d +add z20.d, z20.d, z22.d +sqrdmulh z22.d, z27.d, z11.d[0] +str q20, [x0, #0] +mul z27.d, z27.d,z10.d[0] +str q18, [x0, #32] +mla z19.d, P0/M, z29.d, z31.d +sub z29.d, z25.d, z24.d +ldr q18, [x0, #496] +ldr q20, [x0, #464] +add z25.d, z25.d, z24.d +sqrdmulh z24.d, z18.d, z2.d[0] +str q25, [x0, #64] +mul z18.d, z18.d,z3.d[0] +str q29, [x0, #96] +mla z27.d, P0/M, z22.d, z31.d +ldr q22, [x0, #400] +sub z29.d, z17.d, z19.d +ldr q25, [x0, #432] +add z17.d, z17.d, z19.d +sqrdmulh z19.d, z20.d, z2.d[0] +str q17, [x0, #192] +mul z20.d, z20.d,z3.d[0] +str q29, [x0, #224] +mla z18.d, P0/M, z24.d, z31.d +ldr q24, [x0, #368] +sub z29.d, z16.d, z27.d +ldr q17, [x0, #336] +add z16.d, z16.d, z27.d +sqrdmulh z27.d, z22.d, z2.d[0] +str q16, [x0, #128] +mul z22.d, z22.d,z3.d[0] +str q29, [x0, #160] +mla z20.d, P0/M, z19.d, z31.d +ldr q19, [x0, #272] +sub z29.d, z24.d, z18.d +ldr q16, [x0, #304] +add z24.d, z24.d, z18.d +sqrdmulh z18.d, z25.d, z2.d[0] +mul z25.d, z25.d,z3.d[0] +sub z23.d, z17.d, z20.d +mla z22.d, P0/M, z27.d, z31.d +add z17.d, z17.d, z20.d +sqrdmulh z20.d, z24.d, z0.d[0] +mul z24.d, z24.d,z1.d[0] +sub z27.d, z19.d, z22.d +mla z25.d, P0/M, z18.d, z31.d +add z19.d, z19.d, z22.d +sqrdmulh z22.d, z17.d, z0.d[0] +mul z17.d, z17.d,z1.d[0] +sub z18.d, z16.d, z25.d +mla z24.d, P0/M, z20.d, z31.d +add z16.d, z16.d, z25.d +sqrdmulh z25.d, z29.d, z0.d[1] +mul z29.d, z29.d,z1.d[1] +sub z20.d, z16.d, z24.d +mla z17.d, P0/M, z22.d, z31.d +add z16.d, z16.d, z24.d +sqrdmulh z24.d, z23.d, z0.d[1] +mul z23.d, z23.d,z1.d[1] +sub 
z22.d, z19.d, z17.d +mla z29.d, P0/M, z25.d, z31.d +add z19.d, z19.d, z17.d +sqrdmulh z17.d, z16.d, z14.d[0] +mul z16.d, z16.d,z15.d[0] +sub z25.d, z18.d, z29.d +mla z23.d, P0/M, z24.d, z31.d +add z18.d, z18.d, z29.d +sqrdmulh z29.d, z20.d, z14.d[1] +mul z20.d, z20.d,z15.d[1] +sub z24.d, z27.d, z23.d +mla z16.d, P0/M, z17.d, z31.d +add z27.d, z27.d, z23.d +sqrdmulh z23.d, z25.d, z12.d[1] +mul z25.d, z25.d,z13.d[1] +sub z17.d, z19.d, z16.d +mla z20.d, P0/M, z29.d, z31.d +add z19.d, z19.d, z16.d +sqrdmulh z16.d, z18.d, z12.d[0] +str q19, [x0, #272] +mul z18.d, z18.d,z13.d[0] +str q17, [x0, #304] +mla z25.d, P0/M, z23.d, z31.d +sub z23.d, z22.d, z20.d +ldr q17, [x0, #480] +ldr q19, [x0, #448] +add z22.d, z22.d, z20.d +sqrdmulh z20.d, z17.d, z2.d[0] +str q22, [x0, #336] +mul z17.d, z17.d,z3.d[0] +str q23, [x0, #368] +mla z18.d, P0/M, z16.d, z31.d +ldr q16, [x0, #384] +sub z23.d, z24.d, z25.d +ldr q22, [x0, #416] +add z24.d, z24.d, z25.d +sqrdmulh z25.d, z19.d, z2.d[0] +str q24, [x0, #464] +mul z19.d, z19.d,z3.d[0] +str q23, [x0, #496] +mla z17.d, P0/M, z20.d, z31.d +ldr q20, [x0, #352] +sub z23.d, z27.d, z18.d +ldr q24, [x0, #320] +add z27.d, z27.d, z18.d +sqrdmulh z18.d, z16.d, z2.d[0] +str q27, [x0, #400] +mul z16.d, z16.d,z3.d[0] +str q23, [x0, #432] +mla z19.d, P0/M, z25.d, z31.d +ldr q25, [x0, #256] +sub z23.d, z20.d, z17.d +ldr q27, [x0, #288] +add z20.d, z20.d, z17.d +sqrdmulh z17.d, z22.d, z2.d[0] +mul z22.d, z22.d,z3.d[0] +sub z29.d, z24.d, z19.d +mla z16.d, P0/M, z18.d, z31.d +add z24.d, z24.d, z19.d +sqrdmulh z19.d, z20.d, z0.d[0] +mul z20.d, z20.d,z1.d[0] +sub z18.d, z25.d, z16.d +mla z22.d, P0/M, z17.d, z31.d +add z25.d, z25.d, z16.d +sqrdmulh z16.d, z24.d, z0.d[0] +mul z24.d, z24.d,z1.d[0] +sub z17.d, z27.d, z22.d +mla z20.d, P0/M, z19.d, z31.d +add z27.d, z27.d, z22.d +sqrdmulh z22.d, z23.d, z0.d[1] +mul z23.d, z23.d,z1.d[1] +sub z19.d, z27.d, z20.d +mla z24.d, P0/M, z16.d, z31.d +add z27.d, z27.d, z20.d +ldr q11, [x17, #+384] +ldr q10, [x17, #+400] +ldr 
q9, [x17, #+416] +ldr q8, [x17, #+432] +ldr q7, [x17, #+448] +ldr q6, [x17, #+464] +ldr q5, [x17, #+480] +ldr q4, [x17, #+496] +sqrdmulh z20.d, z29.d, z0.d[1] +mul z29.d, z29.d,z1.d[1] +sub z16.d, z25.d, z24.d +mla z23.d, P0/M, z22.d, z31.d +add z25.d, z25.d, z24.d +sqrdmulh z24.d, z27.d, z14.d[0] +mul z27.d, z27.d,z15.d[0] +sub z22.d, z17.d, z23.d +mla z29.d, P0/M, z20.d, z31.d +add z17.d, z17.d, z23.d +sqrdmulh z23.d, z19.d, z14.d[1] +mul z19.d, z19.d,z15.d[1] +sub z20.d, z18.d, z29.d +mla z27.d, P0/M, z24.d, z31.d +add z18.d, z18.d, z29.d +sqrdmulh z29.d, z22.d, z12.d[1] +mul z22.d, z22.d,z13.d[1] +sub z24.d, z25.d, z27.d +mla z19.d, P0/M, z23.d, z31.d +add z25.d, z25.d, z27.d +sqrdmulh z27.d, z17.d, z12.d[0] +str q25, [x0, #256] +mul z17.d, z17.d,z13.d[0] +str q24, [x0, #288] +mla z22.d, P0/M, z29.d, z31.d +sub z29.d, z16.d, z19.d +ldr q24, [x0, #752] +ldr q25, [x0, #720] +add z16.d, z16.d, z19.d +sqrdmulh z19.d, z24.d, z10.d[0] +str q16, [x0, #320] +mul z24.d, z24.d,z11.d[0] +str q29, [x0, #352] +mla z17.d, P0/M, z27.d, z31.d +ldr q27, [x0, #656] +sub z29.d, z20.d, z22.d +ldr q16, [x0, #688] +add z20.d, z20.d, z22.d +sqrdmulh z22.d, z25.d, z10.d[0] +str q20, [x0, #448] +mul z25.d, z25.d,z11.d[0] +str q29, [x0, #480] +mla z24.d, P0/M, z19.d, z31.d +ldr q19, [x0, #624] +sub z29.d, z18.d, z17.d +ldr q20, [x0, #592] +add z18.d, z18.d, z17.d +sqrdmulh z17.d, z27.d, z10.d[0] +str q18, [x0, #384] +mul z27.d, z27.d,z11.d[0] +str q29, [x0, #416] +mla z25.d, P0/M, z22.d, z31.d +ldr q22, [x0, #528] +sub z29.d, z19.d, z24.d +ldr q18, [x0, #560] +add z19.d, z19.d, z24.d +sqrdmulh z24.d, z16.d, z10.d[0] +mul z16.d, z16.d,z11.d[0] +sub z23.d, z20.d, z25.d +mla z27.d, P0/M, z17.d, z31.d +add z20.d, z20.d, z25.d +sqrdmulh z25.d, z19.d, z8.d[0] +mul z19.d, z19.d,z9.d[0] +sub z17.d, z22.d, z27.d +mla z16.d, P0/M, z24.d, z31.d +add z22.d, z22.d, z27.d +sqrdmulh z27.d, z20.d, z8.d[0] +mul z20.d, z20.d,z9.d[0] +sub z24.d, z18.d, z16.d +mla z19.d, P0/M, z25.d, z31.d +add z18.d, 
z18.d, z16.d +sqrdmulh z16.d, z29.d, z8.d[1] +mul z29.d, z29.d,z9.d[1] +sub z25.d, z18.d, z19.d +mla z20.d, P0/M, z27.d, z31.d +add z18.d, z18.d, z19.d +sqrdmulh z19.d, z23.d, z8.d[1] +mul z23.d, z23.d,z9.d[1] +sub z27.d, z22.d, z20.d +mla z29.d, P0/M, z16.d, z31.d +add z22.d, z22.d, z20.d +sqrdmulh z20.d, z18.d, z6.d[0] +mul z18.d, z18.d,z7.d[0] +sub z16.d, z24.d, z29.d +mla z23.d, P0/M, z19.d, z31.d +add z24.d, z24.d, z29.d +sqrdmulh z29.d, z25.d, z6.d[1] +mul z25.d, z25.d,z7.d[1] +sub z19.d, z17.d, z23.d +mla z18.d, P0/M, z20.d, z31.d +add z17.d, z17.d, z23.d +sqrdmulh z23.d, z16.d, z4.d[1] +mul z16.d, z16.d,z5.d[1] +sub z20.d, z22.d, z18.d +mla z25.d, P0/M, z29.d, z31.d +add z22.d, z22.d, z18.d +sqrdmulh z18.d, z24.d, z4.d[0] +str q22, [x0, #528] +mul z24.d, z24.d,z5.d[0] +str q20, [x0, #560] +mla z16.d, P0/M, z23.d, z31.d +sub z23.d, z27.d, z25.d +ldr q20, [x0, #736] +ldr q22, [x0, #704] +add z27.d, z27.d, z25.d +sqrdmulh z25.d, z20.d, z10.d[0] +str q27, [x0, #592] +mul z20.d, z20.d,z11.d[0] +str q23, [x0, #624] +mla z24.d, P0/M, z18.d, z31.d +ldr q18, [x0, #640] +sub z23.d, z19.d, z16.d +ldr q27, [x0, #672] +add z19.d, z19.d, z16.d +sqrdmulh z16.d, z22.d, z10.d[0] +str q19, [x0, #720] +mul z22.d, z22.d,z11.d[0] +str q23, [x0, #752] +mla z20.d, P0/M, z25.d, z31.d +ldr q25, [x0, #608] +sub z23.d, z17.d, z24.d +ldr q19, [x0, #576] +add z17.d, z17.d, z24.d +sqrdmulh z24.d, z18.d, z10.d[0] +str q17, [x0, #656] +mul z18.d, z18.d,z11.d[0] +str q23, [x0, #688] +mla z22.d, P0/M, z16.d, z31.d +ldr q16, [x0, #512] +sub z23.d, z25.d, z20.d +ldr q17, [x0, #544] +add z25.d, z25.d, z20.d +sqrdmulh z20.d, z27.d, z10.d[0] +mul z27.d, z27.d,z11.d[0] +sub z29.d, z19.d, z22.d +mla z18.d, P0/M, z24.d, z31.d +add z19.d, z19.d, z22.d +sqrdmulh z22.d, z25.d, z8.d[0] +mul z25.d, z25.d,z9.d[0] +sub z24.d, z16.d, z18.d +mla z27.d, P0/M, z20.d, z31.d +add z16.d, z16.d, z18.d +sqrdmulh z18.d, z19.d, z8.d[0] +mul z19.d, z19.d,z9.d[0] +sub z20.d, z17.d, z27.d +mla z25.d, P0/M, z22.d, z31.d 
+add z17.d, z17.d, z27.d +sqrdmulh z27.d, z23.d, z8.d[1] +mul z23.d, z23.d,z9.d[1] +sub z22.d, z17.d, z25.d +mla z19.d, P0/M, z18.d, z31.d +add z17.d, z17.d, z25.d +ldr q12, [x17, #+512] +ldr q13, [x17, #+528] +ldr q14, [x17, #+544] +ldr q15, [x17, #+560] +ldr q0, [x17, #+576] +ldr q1, [x17, #+592] +ldr q2, [x17, #+608] +ldr q3, [x17, #+624] +sqrdmulh z25.d, z29.d, z8.d[1] +mul z29.d, z29.d,z9.d[1] +sub z18.d, z16.d, z19.d +mla z23.d, P0/M, z27.d, z31.d +add z16.d, z16.d, z19.d +sqrdmulh z19.d, z17.d, z6.d[0] +mul z17.d, z17.d,z7.d[0] +sub z27.d, z20.d, z23.d +mla z29.d, P0/M, z25.d, z31.d +add z20.d, z20.d, z23.d +sqrdmulh z23.d, z22.d, z6.d[1] +mul z22.d, z22.d,z7.d[1] +sub z25.d, z24.d, z29.d +mla z17.d, P0/M, z19.d, z31.d +add z24.d, z24.d, z29.d +sqrdmulh z29.d, z27.d, z4.d[1] +mul z27.d, z27.d,z5.d[1] +sub z19.d, z16.d, z17.d +mla z22.d, P0/M, z23.d, z31.d +add z16.d, z16.d, z17.d +sqrdmulh z17.d, z20.d, z4.d[0] +str q16, [x0, #512] +mul z20.d, z20.d,z5.d[0] +str q19, [x0, #544] +mla z27.d, P0/M, z29.d, z31.d +sub z29.d, z18.d, z22.d +ldr q19, [x0, #1008] +ldr q16, [x0, #976] +add z18.d, z18.d, z22.d +sqrdmulh z22.d, z19.d, z13.d[0] +str q18, [x0, #576] +mul z19.d, z19.d,z12.d[0] +str q29, [x0, #608] +mla z20.d, P0/M, z17.d, z31.d +ldr q17, [x0, #912] +sub z29.d, z25.d, z27.d +ldr q18, [x0, #944] +add z25.d, z25.d, z27.d +sqrdmulh z27.d, z16.d, z13.d[0] +str q25, [x0, #704] +mul z16.d, z16.d,z12.d[0] +str q29, [x0, #736] +mla z19.d, P0/M, z22.d, z31.d +ldr q22, [x0, #880] +sub z29.d, z24.d, z20.d +ldr q25, [x0, #848] +add z24.d, z24.d, z20.d +sqrdmulh z20.d, z17.d, z13.d[0] +str q24, [x0, #640] +mul z17.d, z17.d,z12.d[0] +str q29, [x0, #672] +mla z16.d, P0/M, z27.d, z31.d +ldr q27, [x0, #784] +sub z29.d, z22.d, z19.d +ldr q24, [x0, #816] +add z22.d, z22.d, z19.d +sqrdmulh z19.d, z18.d, z13.d[0] +mul z18.d, z18.d,z12.d[0] +sub z23.d, z25.d, z16.d +mla z17.d, P0/M, z20.d, z31.d +add z25.d, z25.d, z16.d +sqrdmulh z16.d, z22.d, z15.d[0] +mul z22.d, z22.d,z14.d[0] 
+sub z20.d, z27.d, z17.d +mla z18.d, P0/M, z19.d, z31.d +add z27.d, z27.d, z17.d +sqrdmulh z17.d, z25.d, z15.d[0] +mul z25.d, z25.d,z14.d[0] +sub z19.d, z24.d, z18.d +mla z22.d, P0/M, z16.d, z31.d +add z24.d, z24.d, z18.d +sqrdmulh z18.d, z29.d, z15.d[1] +mul z29.d, z29.d,z14.d[1] +sub z16.d, z24.d, z22.d +mla z25.d, P0/M, z17.d, z31.d +add z24.d, z24.d, z22.d +sqrdmulh z22.d, z23.d, z15.d[1] +mul z23.d, z23.d,z14.d[1] +sub z17.d, z27.d, z25.d +mla z29.d, P0/M, z18.d, z31.d +add z27.d, z27.d, z25.d +sqrdmulh z25.d, z24.d, z1.d[0] +mul z24.d, z24.d,z0.d[0] +sub z18.d, z19.d, z29.d +mla z23.d, P0/M, z22.d, z31.d +add z19.d, z19.d, z29.d +sqrdmulh z29.d, z16.d, z1.d[1] +mul z16.d, z16.d,z0.d[1] +sub z22.d, z20.d, z23.d +mla z24.d, P0/M, z25.d, z31.d +add z20.d, z20.d, z23.d +sqrdmulh z23.d, z18.d, z3.d[1] +mul z18.d, z18.d,z2.d[1] +sub z25.d, z27.d, z24.d +mla z16.d, P0/M, z29.d, z31.d +add z27.d, z27.d, z24.d +sqrdmulh z24.d, z19.d, z3.d[0] +str q27, [x0, #784] +mul z19.d, z19.d,z2.d[0] +str q25, [x0, #816] +mla z18.d, P0/M, z23.d, z31.d +sub z23.d, z17.d, z16.d +ldr q25, [x0, #992] +ldr q27, [x0, #960] +add z17.d, z17.d, z16.d +sqrdmulh z16.d, z25.d, z13.d[0] +str q17, [x0, #848] +mul z25.d, z25.d,z12.d[0] +str q23, [x0, #880] +mla z19.d, P0/M, z24.d, z31.d +ldr q24, [x0, #896] +sub z23.d, z22.d, z18.d +ldr q17, [x0, #928] +add z22.d, z22.d, z18.d +sqrdmulh z18.d, z27.d, z13.d[0] +str q22, [x0, #976] +mul z27.d, z27.d,z12.d[0] +str q23, [x0, #1008] +mla z25.d, P0/M, z16.d, z31.d +ldr q16, [x0, #864] +sub z23.d, z20.d, z19.d +ldr q22, [x0, #832] +add z20.d, z20.d, z19.d +sqrdmulh z19.d, z24.d, z13.d[0] +str q20, [x0, #912] +mul z24.d, z24.d,z12.d[0] +str q23, [x0, #944] +mla z27.d, P0/M, z18.d, z31.d +ldr q18, [x0, #768] +sub z23.d, z16.d, z25.d +ldr q20, [x0, #800] +add z16.d, z16.d, z25.d +sqrdmulh z25.d, z17.d, z13.d[0] +mul z17.d, z17.d,z12.d[0] +sub z29.d, z22.d, z27.d +mla z24.d, P0/M, z19.d, z31.d +add z22.d, z22.d, z27.d +sqrdmulh z27.d, z16.d, z15.d[0] +mul 
z16.d, z16.d,z14.d[0] +sub z19.d, z18.d, z24.d +mla z17.d, P0/M, z25.d, z31.d +add z18.d, z18.d, z24.d +sqrdmulh z24.d, z22.d, z15.d[0] +mul z22.d, z22.d,z14.d[0] +sub z25.d, z20.d, z17.d +mla z16.d, P0/M, z27.d, z31.d +add z20.d, z20.d, z17.d +sqrdmulh z17.d, z23.d, z15.d[1] +mul z23.d, z23.d,z14.d[1] +sub z27.d, z20.d, z16.d +mla z22.d, P0/M, z24.d, z31.d +add z20.d, z20.d, z16.d +ldr q4, [x17, #+640] +ldr q5, [x17, #+656] +ldr q6, [x17, #+672] +ldr q7, [x17, #+688] +ldr q8, [x17, #+704] +ldr q9, [x17, #+720] +ldr q10, [x17, #+736] +ldr q11, [x17, #+752] +sqrdmulh z16.d, z29.d, z15.d[1] +mul z29.d, z29.d,z14.d[1] +sub z24.d, z18.d, z22.d +mla z23.d, P0/M, z17.d, z31.d +add z18.d, z18.d, z22.d +sqrdmulh z22.d, z20.d, z1.d[0] +mul z20.d, z20.d,z0.d[0] +sub z17.d, z25.d, z23.d +mla z29.d, P0/M, z16.d, z31.d +add z25.d, z25.d, z23.d +sqrdmulh z23.d, z27.d, z1.d[1] +mul z27.d, z27.d,z0.d[1] +sub z16.d, z19.d, z29.d +mla z20.d, P0/M, z22.d, z31.d +add z19.d, z19.d, z29.d +sqrdmulh z29.d, z17.d, z3.d[1] +mul z17.d, z17.d,z2.d[1] +sub z22.d, z18.d, z20.d +mla z27.d, P0/M, z23.d, z31.d +add z18.d, z18.d, z20.d +sqrdmulh z20.d, z25.d, z3.d[0] +str q18, [x0, #768] +mul z25.d, z25.d,z2.d[0] +str q22, [x0, #800] +mla z17.d, P0/M, z29.d, z31.d +sub z29.d, z24.d, z27.d +ldr q22, [x0, #1264] +ldr q18, [x0, #1232] +add z24.d, z24.d, z27.d +sqrdmulh z27.d, z22.d, z5.d[0] +str q24, [x0, #832] +mul z22.d, z22.d,z4.d[0] +str q29, [x0, #864] +mla z25.d, P0/M, z20.d, z31.d +ldr q20, [x0, #1168] +sub z29.d, z16.d, z17.d +ldr q24, [x0, #1200] +add z16.d, z16.d, z17.d +sqrdmulh z17.d, z18.d, z5.d[0] +str q16, [x0, #960] +mul z18.d, z18.d,z4.d[0] +str q29, [x0, #992] +mla z22.d, P0/M, z27.d, z31.d +ldr q27, [x0, #1136] +sub z29.d, z19.d, z25.d +ldr q16, [x0, #1104] +add z19.d, z19.d, z25.d +sqrdmulh z25.d, z20.d, z5.d[0] +str q19, [x0, #896] +mul z20.d, z20.d,z4.d[0] +str q29, [x0, #928] +mla z18.d, P0/M, z17.d, z31.d +ldr q17, [x0, #1040] +sub z29.d, z27.d, z22.d +ldr q19, [x0, #1072] 
+add z27.d, z27.d, z22.d +sqrdmulh z22.d, z24.d, z5.d[0] +mul z24.d, z24.d,z4.d[0] +sub z23.d, z16.d, z18.d +mla z20.d, P0/M, z25.d, z31.d +add z16.d, z16.d, z18.d +sqrdmulh z18.d, z27.d, z7.d[0] +mul z27.d, z27.d,z6.d[0] +sub z25.d, z17.d, z20.d +mla z24.d, P0/M, z22.d, z31.d +add z17.d, z17.d, z20.d +sqrdmulh z20.d, z16.d, z7.d[0] +mul z16.d, z16.d,z6.d[0] +sub z22.d, z19.d, z24.d +mla z27.d, P0/M, z18.d, z31.d +add z19.d, z19.d, z24.d +sqrdmulh z24.d, z29.d, z7.d[1] +mul z29.d, z29.d,z6.d[1] +sub z18.d, z19.d, z27.d +mla z16.d, P0/M, z20.d, z31.d +add z19.d, z19.d, z27.d +sqrdmulh z27.d, z23.d, z7.d[1] +mul z23.d, z23.d,z6.d[1] +sub z20.d, z17.d, z16.d +mla z29.d, P0/M, z24.d, z31.d +add z17.d, z17.d, z16.d +sqrdmulh z16.d, z19.d, z9.d[0] +mul z19.d, z19.d,z8.d[0] +sub z24.d, z22.d, z29.d +mla z23.d, P0/M, z27.d, z31.d +add z22.d, z22.d, z29.d +sqrdmulh z29.d, z18.d, z9.d[1] +mul z18.d, z18.d,z8.d[1] +sub z27.d, z25.d, z23.d +mla z19.d, P0/M, z16.d, z31.d +add z25.d, z25.d, z23.d +sqrdmulh z23.d, z24.d, z11.d[1] +mul z24.d, z24.d,z10.d[1] +sub z16.d, z17.d, z19.d +mla z18.d, P0/M, z29.d, z31.d +add z17.d, z17.d, z19.d +sqrdmulh z19.d, z22.d, z11.d[0] +str q17, [x0, #1040] +mul z22.d, z22.d,z10.d[0] +str q16, [x0, #1072] +mla z24.d, P0/M, z23.d, z31.d +sub z23.d, z20.d, z18.d +ldr q16, [x0, #1248] +ldr q17, [x0, #1216] +add z20.d, z20.d, z18.d +sqrdmulh z18.d, z16.d, z5.d[0] +str q20, [x0, #1104] +mul z16.d, z16.d,z4.d[0] +str q23, [x0, #1136] +mla z22.d, P0/M, z19.d, z31.d +ldr q19, [x0, #1152] +sub z23.d, z27.d, z24.d +ldr q20, [x0, #1184] +add z27.d, z27.d, z24.d +sqrdmulh z24.d, z17.d, z5.d[0] +str q27, [x0, #1232] +mul z17.d, z17.d,z4.d[0] +str q23, [x0, #1264] +mla z16.d, P0/M, z18.d, z31.d +ldr q18, [x0, #1120] +sub z23.d, z25.d, z22.d +ldr q27, [x0, #1088] +add z25.d, z25.d, z22.d +sqrdmulh z22.d, z19.d, z5.d[0] +str q25, [x0, #1168] +mul z19.d, z19.d,z4.d[0] +str q23, [x0, #1200] +mla z17.d, P0/M, z24.d, z31.d +ldr q24, [x0, #1024] +sub z23.d, z18.d, 
z16.d +ldr q25, [x0, #1056] +add z18.d, z18.d, z16.d +sqrdmulh z16.d, z20.d, z5.d[0] +mul z20.d, z20.d,z4.d[0] +sub z29.d, z27.d, z17.d +mla z19.d, P0/M, z22.d, z31.d +add z27.d, z27.d, z17.d +sqrdmulh z17.d, z18.d, z7.d[0] +mul z18.d, z18.d,z6.d[0] +sub z22.d, z24.d, z19.d +mla z20.d, P0/M, z16.d, z31.d +add z24.d, z24.d, z19.d +sqrdmulh z19.d, z27.d, z7.d[0] +mul z27.d, z27.d,z6.d[0] +sub z16.d, z25.d, z20.d +mla z18.d, P0/M, z17.d, z31.d +add z25.d, z25.d, z20.d +sqrdmulh z20.d, z23.d, z7.d[1] +mul z23.d, z23.d,z6.d[1] +sub z17.d, z25.d, z18.d +mla z27.d, P0/M, z19.d, z31.d +add z25.d, z25.d, z18.d +ldr q3, [x17, #+768] +ldr q2, [x17, #+784] +ldr q1, [x17, #+800] +ldr q0, [x17, #+816] +ldr q15, [x17, #+832] +ldr q14, [x17, #+848] +ldr q13, [x17, #+864] +ldr q12, [x17, #+880] +sqrdmulh z18.d, z29.d, z7.d[1] +mul z29.d, z29.d,z6.d[1] +sub z19.d, z24.d, z27.d +mla z23.d, P0/M, z20.d, z31.d +add z24.d, z24.d, z27.d +sqrdmulh z27.d, z25.d, z9.d[0] +mul z25.d, z25.d,z8.d[0] +sub z20.d, z16.d, z23.d +mla z29.d, P0/M, z18.d, z31.d +add z16.d, z16.d, z23.d +sqrdmulh z23.d, z17.d, z9.d[1] +mul z17.d, z17.d,z8.d[1] +sub z18.d, z22.d, z29.d +mla z25.d, P0/M, z27.d, z31.d +add z22.d, z22.d, z29.d +sqrdmulh z29.d, z20.d, z11.d[1] +mul z20.d, z20.d,z10.d[1] +sub z27.d, z24.d, z25.d +mla z17.d, P0/M, z23.d, z31.d +add z24.d, z24.d, z25.d +sqrdmulh z25.d, z16.d, z11.d[0] +str q24, [x0, #1024] +mul z16.d, z16.d,z10.d[0] +str q27, [x0, #1056] +mla z20.d, P0/M, z29.d, z31.d +sub z29.d, z19.d, z17.d +ldr q27, [x0, #1520] +ldr q24, [x0, #1488] +add z19.d, z19.d, z17.d +sqrdmulh z17.d, z27.d, z2.d[0] +str q19, [x0, #1088] +mul z27.d, z27.d,z3.d[0] +str q29, [x0, #1120] +mla z16.d, P0/M, z25.d, z31.d +ldr q25, [x0, #1424] +sub z29.d, z18.d, z20.d +ldr q19, [x0, #1456] +add z18.d, z18.d, z20.d +sqrdmulh z20.d, z24.d, z2.d[0] +str q18, [x0, #1216] +mul z24.d, z24.d,z3.d[0] +str q29, [x0, #1248] +mla z27.d, P0/M, z17.d, z31.d +ldr q17, [x0, #1392] +sub z29.d, z22.d, z16.d +ldr q18, [x0, 
#1360] +add z22.d, z22.d, z16.d +sqrdmulh z16.d, z25.d, z2.d[0] +str q22, [x0, #1152] +mul z25.d, z25.d,z3.d[0] +str q29, [x0, #1184] +mla z24.d, P0/M, z20.d, z31.d +ldr q20, [x0, #1296] +sub z29.d, z17.d, z27.d +ldr q22, [x0, #1328] +add z17.d, z17.d, z27.d +sqrdmulh z27.d, z19.d, z2.d[0] +mul z19.d, z19.d,z3.d[0] +sub z23.d, z18.d, z24.d +mla z25.d, P0/M, z16.d, z31.d +add z18.d, z18.d, z24.d +sqrdmulh z24.d, z17.d, z0.d[0] +mul z17.d, z17.d,z1.d[0] +sub z16.d, z20.d, z25.d +mla z19.d, P0/M, z27.d, z31.d +add z20.d, z20.d, z25.d +sqrdmulh z25.d, z18.d, z0.d[0] +mul z18.d, z18.d,z1.d[0] +sub z27.d, z22.d, z19.d +mla z17.d, P0/M, z24.d, z31.d +add z22.d, z22.d, z19.d +sqrdmulh z19.d, z29.d, z0.d[1] +mul z29.d, z29.d,z1.d[1] +sub z24.d, z22.d, z17.d +mla z18.d, P0/M, z25.d, z31.d +add z22.d, z22.d, z17.d +sqrdmulh z17.d, z23.d, z0.d[1] +mul z23.d, z23.d,z1.d[1] +sub z25.d, z20.d, z18.d +mla z29.d, P0/M, z19.d, z31.d +add z20.d, z20.d, z18.d +sqrdmulh z18.d, z22.d, z14.d[0] +mul z22.d, z22.d,z15.d[0] +sub z19.d, z27.d, z29.d +mla z23.d, P0/M, z17.d, z31.d +add z27.d, z27.d, z29.d +sqrdmulh z29.d, z24.d, z14.d[1] +mul z24.d, z24.d,z15.d[1] +sub z17.d, z16.d, z23.d +mla z22.d, P0/M, z18.d, z31.d +add z16.d, z16.d, z23.d +sqrdmulh z23.d, z19.d, z12.d[1] +mul z19.d, z19.d,z13.d[1] +sub z18.d, z20.d, z22.d +mla z24.d, P0/M, z29.d, z31.d +add z20.d, z20.d, z22.d +sqrdmulh z22.d, z27.d, z12.d[0] +str q20, [x0, #1296] +mul z27.d, z27.d,z13.d[0] +str q18, [x0, #1328] +mla z19.d, P0/M, z23.d, z31.d +sub z23.d, z25.d, z24.d +ldr q18, [x0, #1504] +ldr q20, [x0, #1472] +add z25.d, z25.d, z24.d +sqrdmulh z24.d, z18.d, z2.d[0] +str q25, [x0, #1360] +mul z18.d, z18.d,z3.d[0] +str q23, [x0, #1392] +mla z27.d, P0/M, z22.d, z31.d +ldr q22, [x0, #1408] +sub z23.d, z17.d, z19.d +ldr q25, [x0, #1440] +add z17.d, z17.d, z19.d +sqrdmulh z19.d, z20.d, z2.d[0] +str q17, [x0, #1488] +mul z20.d, z20.d,z3.d[0] +str q23, [x0, #1520] +mla z18.d, P0/M, z24.d, z31.d +ldr q24, [x0, #1376] +sub z23.d, 
z16.d, z27.d +ldr q17, [x0, #1344] +add z16.d, z16.d, z27.d +sqrdmulh z27.d, z22.d, z2.d[0] +str q16, [x0, #1424] +mul z22.d, z22.d,z3.d[0] +str q23, [x0, #1456] +mla z20.d, P0/M, z19.d, z31.d +ldr q19, [x0, #1280] +sub z23.d, z24.d, z18.d +ldr q16, [x0, #1312] +add z24.d, z24.d, z18.d +sqrdmulh z18.d, z25.d, z2.d[0] +mul z25.d, z25.d,z3.d[0] +sub z29.d, z17.d, z20.d +mla z22.d, P0/M, z27.d, z31.d +add z17.d, z17.d, z20.d +sqrdmulh z20.d, z24.d, z0.d[0] +mul z24.d, z24.d,z1.d[0] +sub z27.d, z19.d, z22.d +mla z25.d, P0/M, z18.d, z31.d +add z19.d, z19.d, z22.d +sqrdmulh z22.d, z17.d, z0.d[0] +mul z17.d, z17.d,z1.d[0] +sub z18.d, z16.d, z25.d +mla z24.d, P0/M, z20.d, z31.d +add z16.d, z16.d, z25.d +sqrdmulh z25.d, z23.d, z0.d[1] +mul z23.d, z23.d,z1.d[1] +sub z20.d, z16.d, z24.d +mla z17.d, P0/M, z22.d, z31.d +add z16.d, z16.d, z24.d +ldr q11, [x17, #+896] +ldr q10, [x17, #+912] +ldr q9, [x17, #+928] +ldr q8, [x17, #+944] +ldr q7, [x17, #+960] +ldr q6, [x17, #+976] +ldr q5, [x17, #+992] +ldr q4, [x17, #+1008] +sqrdmulh z24.d, z29.d, z0.d[1] +mul z29.d, z29.d,z1.d[1] +sub z22.d, z19.d, z17.d +mla z23.d, P0/M, z25.d, z31.d +add z19.d, z19.d, z17.d +sqrdmulh z17.d, z16.d, z14.d[0] +mul z16.d, z16.d,z15.d[0] +sub z25.d, z18.d, z23.d +mla z29.d, P0/M, z24.d, z31.d +add z18.d, z18.d, z23.d +sqrdmulh z23.d, z20.d, z14.d[1] +mul z20.d, z20.d,z15.d[1] +sub z24.d, z27.d, z29.d +mla z16.d, P0/M, z17.d, z31.d +add z27.d, z27.d, z29.d +sqrdmulh z29.d, z25.d, z12.d[1] +mul z25.d, z25.d,z13.d[1] +sub z17.d, z19.d, z16.d +mla z20.d, P0/M, z23.d, z31.d +add z19.d, z19.d, z16.d +sqrdmulh z16.d, z18.d, z12.d[0] +str q19, [x0, #1280] +mul z18.d, z18.d,z13.d[0] +str q17, [x0, #1312] +mla z25.d, P0/M, z29.d, z31.d +sub z29.d, z22.d, z20.d +ldr q17, [x0, #1776] +ldr q19, [x0, #1744] +add z22.d, z22.d, z20.d +sqrdmulh z20.d, z17.d, z10.d[0] +str q22, [x0, #1344] +mul z17.d, z17.d,z11.d[0] +str q29, [x0, #1376] +mla z18.d, P0/M, z16.d, z31.d +ldr q16, [x0, #1680] +sub z29.d, z24.d, z25.d +ldr 
q22, [x0, #1712] +add z24.d, z24.d, z25.d +sqrdmulh z25.d, z19.d, z10.d[0] +str q24, [x0, #1472] +mul z19.d, z19.d,z11.d[0] +str q29, [x0, #1504] +mla z17.d, P0/M, z20.d, z31.d +ldr q20, [x0, #1648] +sub z29.d, z27.d, z18.d +ldr q24, [x0, #1616] +add z27.d, z27.d, z18.d +sqrdmulh z18.d, z16.d, z10.d[0] +str q27, [x0, #1408] +mul z16.d, z16.d,z11.d[0] +str q29, [x0, #1440] +mla z19.d, P0/M, z25.d, z31.d +ldr q25, [x0, #1552] +sub z29.d, z20.d, z17.d +ldr q27, [x0, #1584] +add z20.d, z20.d, z17.d +sqrdmulh z17.d, z22.d, z10.d[0] +mul z22.d, z22.d,z11.d[0] +sub z23.d, z24.d, z19.d +mla z16.d, P0/M, z18.d, z31.d +add z24.d, z24.d, z19.d +sqrdmulh z19.d, z20.d, z8.d[0] +mul z20.d, z20.d,z9.d[0] +sub z18.d, z25.d, z16.d +mla z22.d, P0/M, z17.d, z31.d +add z25.d, z25.d, z16.d +sqrdmulh z16.d, z24.d, z8.d[0] +mul z24.d, z24.d,z9.d[0] +sub z17.d, z27.d, z22.d +mla z20.d, P0/M, z19.d, z31.d +add z27.d, z27.d, z22.d +sqrdmulh z22.d, z29.d, z8.d[1] +mul z29.d, z29.d,z9.d[1] +sub z19.d, z27.d, z20.d +mla z24.d, P0/M, z16.d, z31.d +add z27.d, z27.d, z20.d +sqrdmulh z20.d, z23.d, z8.d[1] +mul z23.d, z23.d,z9.d[1] +sub z16.d, z25.d, z24.d +mla z29.d, P0/M, z22.d, z31.d +add z25.d, z25.d, z24.d +sqrdmulh z24.d, z27.d, z6.d[0] +mul z27.d, z27.d,z7.d[0] +sub z22.d, z17.d, z29.d +mla z23.d, P0/M, z20.d, z31.d +add z17.d, z17.d, z29.d +sqrdmulh z29.d, z19.d, z6.d[1] +mul z19.d, z19.d,z7.d[1] +sub z20.d, z18.d, z23.d +mla z27.d, P0/M, z24.d, z31.d +add z18.d, z18.d, z23.d +sqrdmulh z23.d, z22.d, z4.d[1] +mul z22.d, z22.d,z5.d[1] +sub z24.d, z25.d, z27.d +mla z19.d, P0/M, z29.d, z31.d +add z25.d, z25.d, z27.d +sqrdmulh z27.d, z17.d, z4.d[0] +str q25, [x0, #1552] +mul z17.d, z17.d,z5.d[0] +str q24, [x0, #1584] +mla z22.d, P0/M, z23.d, z31.d +sub z23.d, z16.d, z19.d +ldr q24, [x0, #1760] +ldr q25, [x0, #1728] +add z16.d, z16.d, z19.d +sqrdmulh z19.d, z24.d, z10.d[0] +str q16, [x0, #1616] +mul z24.d, z24.d,z11.d[0] +str q23, [x0, #1648] +mla z17.d, P0/M, z27.d, z31.d +ldr q27, [x0, #1664] 
+sub z23.d, z20.d, z22.d +ldr q16, [x0, #1696] +add z20.d, z20.d, z22.d +sqrdmulh z22.d, z25.d, z10.d[0] +str q20, [x0, #1744] +mul z25.d, z25.d,z11.d[0] +str q23, [x0, #1776] +mla z24.d, P0/M, z19.d, z31.d +ldr q19, [x0, #1632] +sub z23.d, z18.d, z17.d +ldr q20, [x0, #1600] +add z18.d, z18.d, z17.d +sqrdmulh z17.d, z27.d, z10.d[0] +str q18, [x0, #1680] +mul z27.d, z27.d,z11.d[0] +str q23, [x0, #1712] +mla z25.d, P0/M, z22.d, z31.d +ldr q22, [x0, #1536] +sub z23.d, z19.d, z24.d +ldr q18, [x0, #1568] +add z19.d, z19.d, z24.d +sqrdmulh z24.d, z16.d, z10.d[0] +mul z16.d, z16.d,z11.d[0] +sub z29.d, z20.d, z25.d +mla z27.d, P0/M, z17.d, z31.d +add z20.d, z20.d, z25.d +sqrdmulh z25.d, z19.d, z8.d[0] +mul z19.d, z19.d,z9.d[0] +sub z17.d, z22.d, z27.d +mla z16.d, P0/M, z24.d, z31.d +add z22.d, z22.d, z27.d +sqrdmulh z27.d, z20.d, z8.d[0] +mul z20.d, z20.d,z9.d[0] +sub z24.d, z18.d, z16.d +mla z19.d, P0/M, z25.d, z31.d +add z18.d, z18.d, z16.d +sqrdmulh z16.d, z23.d, z8.d[1] +mul z23.d, z23.d,z9.d[1] +sub z25.d, z18.d, z19.d +mla z20.d, P0/M, z27.d, z31.d +add z18.d, z18.d, z19.d +ldr q12, [x17, #+1024] +ldr q13, [x17, #+1040] +ldr q14, [x17, #+1056] +ldr q15, [x17, #+1072] +ldr q0, [x17, #+1088] +ldr q1, [x17, #+1104] +ldr q2, [x17, #+1120] +ldr q3, [x17, #+1136] +sqrdmulh z19.d, z29.d, z8.d[1] +mul z29.d, z29.d,z9.d[1] +sub z27.d, z22.d, z20.d +mla z23.d, P0/M, z16.d, z31.d +add z22.d, z22.d, z20.d +sqrdmulh z20.d, z18.d, z6.d[0] +mul z18.d, z18.d,z7.d[0] +sub z16.d, z24.d, z23.d +mla z29.d, P0/M, z19.d, z31.d +add z24.d, z24.d, z23.d +sqrdmulh z23.d, z25.d, z6.d[1] +mul z25.d, z25.d,z7.d[1] +sub z19.d, z17.d, z29.d +mla z18.d, P0/M, z20.d, z31.d +add z17.d, z17.d, z29.d +sqrdmulh z29.d, z16.d, z4.d[1] +mul z16.d, z16.d,z5.d[1] +sub z20.d, z22.d, z18.d +mla z25.d, P0/M, z23.d, z31.d +add z22.d, z22.d, z18.d +sqrdmulh z18.d, z24.d, z4.d[0] +str q22, [x0, #1536] +mul z24.d, z24.d,z5.d[0] +str q20, [x0, #1568] +mla z16.d, P0/M, z29.d, z31.d +sub z29.d, z27.d, z25.d +ldr q20, 
[x0, #2032] +ldr q22, [x0, #2000] +add z27.d, z27.d, z25.d +sqrdmulh z25.d, z20.d, z13.d[0] +str q27, [x0, #1600] +mul z20.d, z20.d,z12.d[0] +str q29, [x0, #1632] +mla z24.d, P0/M, z18.d, z31.d +ldr q18, [x0, #1936] +sub z29.d, z19.d, z16.d +ldr q27, [x0, #1968] +add z19.d, z19.d, z16.d +sqrdmulh z16.d, z22.d, z13.d[0] +str q19, [x0, #1728] +mul z22.d, z22.d,z12.d[0] +str q29, [x0, #1760] +mla z20.d, P0/M, z25.d, z31.d +ldr q25, [x0, #1904] +sub z29.d, z17.d, z24.d +ldr q19, [x0, #1872] +add z17.d, z17.d, z24.d +sqrdmulh z24.d, z18.d, z13.d[0] +str q17, [x0, #1664] +mul z18.d, z18.d,z12.d[0] +str q29, [x0, #1696] +mla z22.d, P0/M, z16.d, z31.d +ldr q16, [x0, #1808] +sub z29.d, z25.d, z20.d +ldr q17, [x0, #1840] +add z25.d, z25.d, z20.d +sqrdmulh z20.d, z27.d, z13.d[0] +mul z27.d, z27.d,z12.d[0] +sub z23.d, z19.d, z22.d +mla z18.d, P0/M, z24.d, z31.d +add z19.d, z19.d, z22.d +sqrdmulh z22.d, z25.d, z15.d[0] +mul z25.d, z25.d,z14.d[0] +sub z24.d, z16.d, z18.d +mla z27.d, P0/M, z20.d, z31.d +add z16.d, z16.d, z18.d +sqrdmulh z18.d, z19.d, z15.d[0] +mul z19.d, z19.d,z14.d[0] +sub z20.d, z17.d, z27.d +mla z25.d, P0/M, z22.d, z31.d +add z17.d, z17.d, z27.d +sqrdmulh z27.d, z29.d, z15.d[1] +mul z29.d, z29.d,z14.d[1] +sub z22.d, z17.d, z25.d +mla z19.d, P0/M, z18.d, z31.d +add z17.d, z17.d, z25.d +sqrdmulh z25.d, z23.d, z15.d[1] +mul z23.d, z23.d,z14.d[1] +sub z18.d, z16.d, z19.d +mla z29.d, P0/M, z27.d, z31.d +add z16.d, z16.d, z19.d +sqrdmulh z19.d, z17.d, z1.d[0] +mul z17.d, z17.d,z0.d[0] +sub z27.d, z20.d, z29.d +mla z23.d, P0/M, z25.d, z31.d +add z20.d, z20.d, z29.d +sqrdmulh z29.d, z22.d, z1.d[1] +mul z22.d, z22.d,z0.d[1] +sub z25.d, z24.d, z23.d +mla z17.d, P0/M, z19.d, z31.d +add z24.d, z24.d, z23.d +sqrdmulh z23.d, z27.d, z3.d[1] +mul z27.d, z27.d,z2.d[1] +sub z19.d, z16.d, z17.d +mla z22.d, P0/M, z29.d, z31.d +add z16.d, z16.d, z17.d +sqrdmulh z17.d, z20.d, z3.d[0] +str q16, [x0, #1808] +mul z20.d, z20.d,z2.d[0] +str q19, [x0, #1840] +mla z27.d, P0/M, z23.d, 
z31.d +sub z23.d, z18.d, z22.d +ldr q19, [x0, #2016] +ldr q16, [x0, #1984] +add z18.d, z18.d, z22.d +sqrdmulh z22.d, z19.d, z13.d[0] +str q18, [x0, #1872] +mul z19.d, z19.d,z12.d[0] +str q23, [x0, #1904] +mla z20.d, P0/M, z17.d, z31.d +ldr q17, [x0, #1920] +sub z23.d, z25.d, z27.d +ldr q18, [x0, #1952] +add z25.d, z25.d, z27.d +sqrdmulh z27.d, z16.d, z13.d[0] +str q25, [x0, #2000] +mul z16.d, z16.d,z12.d[0] +str q23, [x0, #2032] +mla z19.d, P0/M, z22.d, z31.d +ldr q22, [x0, #1888] +sub z23.d, z24.d, z20.d +ldr q25, [x0, #1856] +add z24.d, z24.d, z20.d +sqrdmulh z20.d, z17.d, z13.d[0] +str q24, [x0, #1936] +mul z17.d, z17.d,z12.d[0] +str q23, [x0, #1968] +mla z16.d, P0/M, z27.d, z31.d +ldr q27, [x0, #1792] +sub z23.d, z22.d, z19.d +ldr q24, [x0, #1824] +add z22.d, z22.d, z19.d +sqrdmulh z19.d, z18.d, z13.d[0] +mul z18.d, z18.d,z12.d[0] +sub z29.d, z25.d, z16.d +mla z17.d, P0/M, z20.d, z31.d +add z25.d, z25.d, z16.d +sqrdmulh z16.d, z22.d, z15.d[0] +mul z22.d, z22.d,z14.d[0] +sub z20.d, z27.d, z17.d +mla z18.d, P0/M, z19.d, z31.d +add z27.d, z27.d, z17.d +sqrdmulh z17.d, z25.d, z15.d[0] +mul z25.d, z25.d,z14.d[0] +sub z19.d, z24.d, z18.d +mla z22.d, P0/M, z16.d, z31.d +add z24.d, z24.d, z18.d +sqrdmulh z18.d, z23.d, z15.d[1] +mul z23.d, z23.d,z14.d[1] +sub z16.d, z24.d, z22.d +mla z25.d, P0/M, z17.d, z31.d +add z24.d, z24.d, z22.d +sqrdmulh z22.d, z29.d, z15.d[1] +mul z29.d, z29.d,z14.d[1] +sub z17.d, z27.d, z25.d +mla z23.d, P0/M, z18.d, z31.d +add z27.d, z27.d, z25.d +sqrdmulh z25.d, z24.d, z1.d[0] +mul z24.d, z24.d,z0.d[0] +sub z18.d, z19.d, z23.d +mla z29.d, P0/M, z22.d, z31.d +add z19.d, z19.d, z23.d +sqrdmulh z23.d, z16.d, z1.d[1] +mul z16.d, z16.d,z0.d[1] +sub z22.d, z20.d, z29.d +mla z24.d, P0/M, z25.d, z31.d +add z20.d, z20.d, z29.d +sqrdmulh z29.d, z18.d, z3.d[1] +mul z18.d, z18.d,z2.d[1] +sub z25.d, z27.d, z24.d +mla z16.d, P0/M, z23.d, z31.d +add z27.d, z27.d, z24.d +sqrdmulh z24.d, z19.d, z3.d[0] +str q27, [x0, #1792] +mul z19.d, z19.d,z2.d[0] +str q25, 
[x0, #1824] +mla z18.d, P0/M, z29.d, z31.d +sub z29.d, z17.d, z16.d +add z17.d, z17.d, z16.d +str q17, [x0, #1856] +str q29, [x0, #1888] +mla z19.d, P0/M, z24.d, z31.d +sub z24.d, z22.d, z18.d +add z22.d, z22.d, z18.d +str q22, [x0, #1984] +str q24, [x0, #2016] +sub z24.d, z20.d, z19.d +add z20.d, z20.d, z19.d +str q20, [x0, #1920] +str q24, [x0, #1952] +// Restore SVE2 vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2697 +// Instruction count: 2693 \ No newline at end of file diff --git a/tests/ntt_sve2/auto/ntt_u64_incomplete_72057594067788289_60277548896192635_var_3_3_2.s b/tests/ntt_sve2/auto/ntt_u64_incomplete_72057594067788289_60277548896192635_var_3_3_2.s new file mode 100644 index 0000000..cb5189a --- /dev/null +++ b/tests/ntt_sve2/auto/ntt_u64_incomplete_72057594067788289_60277548896192635_var_3_3_2.s @@ -0,0 +1,2727 @@ + +/// +/// Copyright (c) 2021 Arm Limited +/// SPDX-License-Identifier: MIT +/// +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in all +/// copies or substantial portions of the Software. 
+/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE + + +/// +/// This assembly code has been auto-generated. +/// Don't modify it directly. +/// + +modulus: // -q for q = 72057594067788289, four copies; the entry code loads the first 16 bytes into z31 +.dword -72057594067788289 +.dword -72057594067788289 +.dword -72057594067788289 +.dword -72057594067788289 +.align 6 // NOTE(review): on AArch64 gas this is a power-of-two request, i.e. a 64-byte boundary; confirm against the assembler in use +roots_merged: // Twiddle table in 16-byte rows. Rows come in same-labelled pairs: for the first eight rows the entry code loads the first row of a pair as the mul operand and the second as the matching sqrdmulh operand. The first 8 rows serve layers 0-2; every following run of 8 rows serves one layer-3 block together with its layer-4 and layer-5 sub-blocks. +.dword 25792053496987399 // Layer 0, block 0 +.dword 0 // Layer None, block None +.dword 3301382846246308405 // Layer 0, block 0 +.dword 0 // Layer None, block None +.dword 36678763444893001 // Layer 1, block 0 +.dword 12009493193917617 // Layer 1, block 1 +.dword 4694881719000765600 // Layer 1, block 0 +.dword 1537215128184439725 // Layer 1, block 1 +.dword 57226611787624233 // Layer 2, block 0 +.dword 39665359539540334 // Layer 2, block 1 +.dword 7325006305780451127 // Layer 2, block 0 +.dword 5077166018957207276 // Layer 2, block 1 +.dword 14359056949694594 // Layer 2, block 2 +.dword 63449028357011879 // Layer 2, block 3 +.dword 1837959288799265711 // Layer 2, block 2 +.dword 8121475626332016399 // Layer 2, block 3 +.dword 56437370284897879 // Layer 3, block 0 +.dword 0 // Layer None, block None +.dword 7223983393473341270 // Layer 3, block 0 +.dword 0 // Layer None, block None +.dword 15519149204003269 // Layer 4, block 0 +.dword 18945631884663455 // Layer 4, block 1 +.dword 1986451097289241753 // Layer 4, block 0 +.dword 2425040880231995866 // Layer 4, block 1 +.dword 21843809513296019 // Layer 5, block 0 +.dword 52861630939350015 // Layer 5, block 1 +.dword 2796007616543237058 // Layer 5, block 0 +.dword 6766288757432881341 // Layer
5, block 1 +.dword 58200436133340777 // Layer 5, block 2 +.dword 45581265709396633 // Layer 5, block 3 +.dword 7449655821980514543 // Layer 5, block 2 +.dword 5834402008385018253 // Layer 5, block 3 +.dword 7801853795705237 // Layer 3, block 1 +.dword 0 // Layer None, block None +.dword 998637285436439396 // Layer 3, block 1 +.dword 0 // Layer None, block None +.dword 72057409685042741 // Layer 4, block 2 +.dword 67813594624550994 // Layer 4, block 3 +.dword 9223348435863355444 // Layer 4, block 2 +.dword 8680140108345514992 // Layer 4, block 3 +.dword 16444438478993771 // Layer 5, block 4 +.dword 44738633871916757 // Layer 5, block 5 +.dword 2104888124438946221 // Layer 5, block 4 +.dword 5726545133232289544 // Layer 5, block 5 +.dword 14998888047589537 // Layer 5, block 6 +.dword 1367715298619054 // Layer 5, block 7 +.dword 1919857669295880083 // Layer 5, block 6 +.dword 175067558150691679 // Layer 5, block 7 +.dword 50810289212278368 // Layer 3, block 2 +.dword 0 // Layer None, block None +.dword 6503717016476519110 // Layer 3, block 2 +.dword 0 // Layer None, block None +.dword 38922220208018571 // Layer 4, block 4 +.dword 7966052600948377 // Layer 4, block 5 +.dword 4982044184561839686 // Layer 4, block 4 +.dword 1019654732498851778 // Layer 4, block 5 +.dword 45879272116084567 // Layer 5, block 8 +.dword 66654388400258382 // Layer 5, block 9 +.dword 5872546828425266758 // Layer 5, block 8 +.dword 8531761711697548017 // Layer 5, block 9 +.dword 8930087962801744 // Layer 5, block 10 +.dword 61848588213223279 // Layer 5, block 11 +.dword 1143051258764947771 // Layer 5, block 10 +.dword 7916619288011967173 // Layer 5, block 11 +.dword 31977682183549777 // Layer 3, block 3 +.dword 0 // Layer None, block None +.dword 4093143317798190700 // Layer 3, block 3 +.dword 0 // Layer None, block None +.dword 66070897124800871 // Layer 4, block 6 +.dword 953067252694683 // Layer 4, block 7 +.dword 8457074828469936528 // Layer 4, block 6 +.dword 121992608294366219 // Layer 4,
block 7 +.dword 33801610235026337 // Layer 5, block 12 +.dword 32122784433286747 // Layer 5, block 13 +.dword 4326606108290444417 // Layer 5, block 12 +.dword 4111716405756826253 // Layer 5, block 13 +.dword 67688369535326483 // Layer 5, block 14 +.dword 45021686719473556 // Layer 5, block 15 +.dword 8664111296931419854 // Layer 5, block 14 +.dword 5762775897704545946 // Layer 5, block 15 +.dword 66662168904752601 // Layer 3, block 4 +.dword 0 // Layer None, block None +.dword 8532757616272395351 // Layer 3, block 4 +.dword 0 // Layer None, block None +.dword 23961218891132444 // Layer 4, block 8 +.dword 59012643726482518 // Layer 4, block 9 +.dword 3067036016793986470 // Layer 4, block 8 +.dword 7553618393859575754 // Layer 4, block 9 +.dword 52812533586708198 // Layer 5, block 16 +.dword 27994290036168371 // Layer 5, block 17 +.dword 6760004296297333018 // Layer 5, block 16 +.dword 3583269123144660376 // Layer 5, block 17 +.dword 45890717144660134 // Layer 5, block 18 +.dword 39684773913748863 // Layer 5, block 19 +.dword 5874011792082332260 // Layer 5, block 18 +.dword 5079651058854869198 // Layer 5, block 19 +.dword 50149898471788096 // Layer 3, block 5 +.dword 0 // Layer None, block None +.dword 6419187001728793164 // Layer 3, block 5 +.dword 0 // Layer None, block None +.dword 65714767972465509 // Layer 4, block 10 +.dword 51421828010275652 // Layer 4, block 11 +.dword 8411490296989900223 // Layer 4, block 10 +.dword 6581993982587733829 // Layer 4, block 11 +.dword 18683690578478417 // Layer 5, block 20 +.dword 3282356803714609 // Layer 5, block 21 +.dword 2391512393054205061 // Layer 5, block 20 +.dword 420141670701365074 // Layer 5, block 21 +.dword 67884452950503047 // Layer 5, block 22 +.dword 10335338564031418 // Layer 5, block 23 +.dword 8689209974063619263 // Layer 5, block 22 +.dword 1322923335647807838 // Layer 5, block 23 +.dword 30932683335866672 // Layer 3, block 6 +.dword 0 // Layer None, block None +.dword 3959383465350182760 // Layer 3, block 6
+.dword 0 // Layer None, block None +.dword 27050097608373352 // Layer 4, block 12 +.dword 67454821565758121 // Layer 4, block 13 +.dword 3462412492436980406 // Layer 4, block 12 +.dword 8634217156839057519 // Layer 4, block 13 +.dword 32828920539599153 // Layer 5, block 24 +.dword 8624332566875856 // Layer 5, block 25 +.dword 4202101827327358896 // Layer 5, block 24 +.dword 1103914568102652181 // Layer 5, block 25 +.dword 56732837753533829 // Layer 5, block 26 +.dword 14816466027490539 // Layer 5, block 27 +.dword 7261803229443070495 // Layer 5, block 26 +.dword 1896507650732884485 // Layer 5, block 27 +.dword 54968319742463037 // Layer 3, block 7 +.dword 0 // Layer None, block None +.dword 7035944924119603816 // Layer 3, block 7 +.dword 0 // Layer None, block None +.dword 55666925166425210 // Layer 4, block 14 +.dword 34241587306439298 // Layer 4, block 15 +.dword 7125366418349706083 // Layer 4, block 14 +.dword 4382923173407965878 // Layer 4, block 15 +.dword 8550051130607768 // Layer 5, block 28 +.dword 14420141705316589 // Layer 5, block 29 +.dword 1094406544264277001 // Layer 5, block 28 +.dword 1845778137515640974 // Layer 5, block 29 +.dword 55622715926092387 // Layer 5, block 30 +.dword 3405033449209397 // Layer 5, block 31 +.dword 7119707635589449714 // Layer 5, block 30 +.dword 435844281318190845 // Layer 5, block 31 +.text +.type ntt_u64_incomplete_sve2_asm_var_3_3_2, %function +.global ntt_u64_incomplete_sve2_asm_var_3_3_2 +modulus_addr: .quad modulus // table addresses stored in .text just ahead of the entry label so a literal ldr can fetch them +roots_merged_addr: .quad roots_merged +ntt_u64_incomplete_sve2_asm_var_3_3_2: // x0 = buffer base; the visible body reads and writes it in place through [x0, #imm] +// Save GPRs +sub sp, sp, #(16*5+16) +stp x19, x20, [sp, #16*0] +stp x19, x20, [sp, #16*0] // NOTE(review): repeats the previous store to the same slot; redundant but harmless +stp x21, x22, [sp, #16*1] +stp x23, x24, [sp, #16*2] +stp x25, x26, [sp, #16*3] +stp x27, x28, [sp, #16*4] +str x29, [sp, #16*5] +// Save SVE2 vector registers +sub sp, sp, #(16*4) +stp d8, d9, [sp, #16*0] // only the low 64 bits (d views) of v8-v15 are spilled, which is the callee-saved part under AAPCS64 +stp d10, d11, [sp, #16*1] +stp d12, d13, [sp, #16*2] +stp d14, d15, [sp, #16*3] +ldr x17, modulus_addr +ldr q31, [x17] // z31 = two lanes of -q, the multiplier operand of the mla instructions that follow +ptrue P0.d // all-lanes predicate for the predicated mla instructions
+ldr x17, roots_merged_addr +ldr q3, [x17, #+0] +ldr q2, [x17, #+16] +ldr q1, [x17, #+32] +ldr q0, [x17, #+48] +ldr q15, [x17, #+64] +ldr q14, [x17, #+80] +ldr q13, [x17, #+96] +ldr q12, [x17, #+112] +ldr q30, [x0, #1920] +ldr q29, [x0, #1664] +sqrdmulh z28.d, z30.d, z2.d[0] +mul z30.d, z30.d,z3.d[0] +ldr q27, [x0, #1152] +sqrdmulh z26.d, z29.d, z2.d[0] +mul z29.d, z29.d,z3.d[0] +ldr q25, [x0, #1408] +mla z30.d, P0/M, z28.d, z31.d +sqrdmulh z28.d, z27.d, z2.d[0] +mul z27.d, z27.d,z3.d[0] +ldr q24, [x0, #896] +mla z29.d, P0/M, z26.d, z31.d +sub z26.d, z24.d, z30.d +add z24.d, z24.d, z30.d +sqrdmulh z30.d, z25.d, z2.d[0] +mul z25.d, z25.d,z3.d[0] +ldr q23, [x0, #640] +mla z27.d, P0/M, z28.d, z31.d +sub z28.d, z23.d, z29.d +add z23.d, z23.d, z29.d +sqrdmulh z29.d, z24.d, z0.d[0] +mul z24.d, z24.d,z1.d[0] +ldr q22, [x0, #128] +mla z25.d, P0/M, z30.d, z31.d +sub z30.d, z22.d, z27.d +add z22.d, z22.d, z27.d +sqrdmulh z27.d, z23.d, z0.d[0] +mul z23.d, z23.d,z1.d[0] +ldr q21, [x0, #384] +mla z24.d, P0/M, z29.d, z31.d +sub z29.d, z21.d, z25.d +add z21.d, z21.d, z25.d +sqrdmulh z25.d, z26.d, z0.d[1] +mul z26.d, z26.d,z1.d[1] +mla z23.d, P0/M, z27.d, z31.d +sub z27.d, z21.d, z24.d +add z21.d, z21.d, z24.d +sqrdmulh z24.d, z28.d, z0.d[1] +mul z28.d, z28.d,z1.d[1] +mla z26.d, P0/M, z25.d, z31.d +sub z25.d, z22.d, z23.d +add z22.d, z22.d, z23.d +sqrdmulh z23.d, z21.d, z14.d[0] +mul z21.d, z21.d,z15.d[0] +mla z28.d, P0/M, z24.d, z31.d +sub z24.d, z29.d, z26.d +add z29.d, z29.d, z26.d +sqrdmulh z26.d, z27.d, z14.d[1] +mul z27.d, z27.d,z15.d[1] +mla z21.d, P0/M, z23.d, z31.d +sub z23.d, z30.d, z28.d +add z30.d, z30.d, z28.d +sqrdmulh z28.d, z24.d, z12.d[1] +mul z24.d, z24.d,z13.d[1] +ldr q20, [x0, #1936] +mla z27.d, P0/M, z26.d, z31.d +sub z26.d, z22.d, z21.d +add z22.d, z22.d, z21.d +sqrdmulh z21.d, z29.d, z12.d[0] +mul z29.d, z29.d,z13.d[0] +ldr q19, [x0, #1680] +mla z24.d, P0/M, z28.d, z31.d +sub z28.d, z25.d, z27.d +add z25.d, z25.d, z27.d +sqrdmulh z27.d, z20.d, z2.d[0] +mul 
z20.d, z20.d,z3.d[0] +ldr q18, [x0, #1168] +mla z29.d, P0/M, z21.d, z31.d +sub z21.d, z23.d, z24.d +add z23.d, z23.d, z24.d +sqrdmulh z24.d, z19.d, z2.d[0] +str q22, [x0, #128] +mul z19.d, z19.d,z3.d[0] +ldr q22, [x0, #1424] +mla z20.d, P0/M, z27.d, z31.d +sub z27.d, z30.d, z29.d +add z30.d, z30.d, z29.d +sqrdmulh z29.d, z18.d, z2.d[0] +str q26, [x0, #384] +mul z18.d, z18.d,z3.d[0] +ldr q26, [x0, #912] +mla z19.d, P0/M, z24.d, z31.d +sub z24.d, z26.d, z20.d +add z26.d, z26.d, z20.d +sqrdmulh z20.d, z22.d, z2.d[0] +str q25, [x0, #640] +mul z22.d, z22.d,z3.d[0] +ldr q25, [x0, #656] +mla z18.d, P0/M, z29.d, z31.d +sub z29.d, z25.d, z19.d +add z25.d, z25.d, z19.d +sqrdmulh z19.d, z26.d, z0.d[0] +str q28, [x0, #896] +mul z26.d, z26.d,z1.d[0] +ldr q28, [x0, #144] +mla z22.d, P0/M, z20.d, z31.d +sub z20.d, z28.d, z18.d +add z28.d, z28.d, z18.d +sqrdmulh z18.d, z25.d, z0.d[0] +str q23, [x0, #1664] +mul z25.d, z25.d,z1.d[0] +ldr q23, [x0, #400] +mla z26.d, P0/M, z19.d, z31.d +sub z19.d, z23.d, z22.d +add z23.d, z23.d, z22.d +sqrdmulh z22.d, z24.d, z0.d[1] +str q21, [x0, #1920] +mul z24.d, z24.d,z1.d[1] +mla z25.d, P0/M, z18.d, z31.d +sub z18.d, z23.d, z26.d +add z23.d, z23.d, z26.d +sqrdmulh z26.d, z29.d, z0.d[1] +str q30, [x0, #1152] +mul z29.d, z29.d,z1.d[1] +mla z24.d, P0/M, z22.d, z31.d +sub z22.d, z28.d, z25.d +add z28.d, z28.d, z25.d +sqrdmulh z25.d, z23.d, z14.d[0] +str q27, [x0, #1408] +mul z23.d, z23.d,z15.d[0] +mla z29.d, P0/M, z26.d, z31.d +sub z26.d, z19.d, z24.d +add z19.d, z19.d, z24.d +sqrdmulh z24.d, z18.d, z14.d[1] +mul z18.d, z18.d,z15.d[1] +mla z23.d, P0/M, z25.d, z31.d +sub z25.d, z20.d, z29.d +add z20.d, z20.d, z29.d +sqrdmulh z29.d, z26.d, z12.d[1] +mul z26.d, z26.d,z13.d[1] +ldr q27, [x0, #1952] +mla z18.d, P0/M, z24.d, z31.d +sub z24.d, z28.d, z23.d +add z28.d, z28.d, z23.d +sqrdmulh z23.d, z19.d, z12.d[0] +mul z19.d, z19.d,z13.d[0] +ldr q30, [x0, #1696] +mla z26.d, P0/M, z29.d, z31.d +sub z29.d, z22.d, z18.d +add z22.d, z22.d, z18.d +sqrdmulh z18.d, 
z27.d, z2.d[0] +mul z27.d, z27.d,z3.d[0] +ldr q21, [x0, #1184] +mla z19.d, P0/M, z23.d, z31.d +sub z23.d, z25.d, z26.d +add z25.d, z25.d, z26.d +sqrdmulh z26.d, z30.d, z2.d[0] +str q28, [x0, #144] +mul z30.d, z30.d,z3.d[0] +ldr q28, [x0, #1440] +mla z27.d, P0/M, z18.d, z31.d +sub z18.d, z20.d, z19.d +add z20.d, z20.d, z19.d +sqrdmulh z19.d, z21.d, z2.d[0] +str q24, [x0, #400] +mul z21.d, z21.d,z3.d[0] +ldr q24, [x0, #928] +mla z30.d, P0/M, z26.d, z31.d +sub z26.d, z24.d, z27.d +add z24.d, z24.d, z27.d +sqrdmulh z27.d, z28.d, z2.d[0] +str q22, [x0, #656] +mul z28.d, z28.d,z3.d[0] +ldr q22, [x0, #672] +mla z21.d, P0/M, z19.d, z31.d +sub z19.d, z22.d, z30.d +add z22.d, z22.d, z30.d +sqrdmulh z30.d, z24.d, z0.d[0] +str q29, [x0, #912] +mul z24.d, z24.d,z1.d[0] +ldr q29, [x0, #160] +mla z28.d, P0/M, z27.d, z31.d +sub z27.d, z29.d, z21.d +add z29.d, z29.d, z21.d +sqrdmulh z21.d, z22.d, z0.d[0] +str q25, [x0, #1680] +mul z22.d, z22.d,z1.d[0] +ldr q25, [x0, #416] +mla z24.d, P0/M, z30.d, z31.d +sub z30.d, z25.d, z28.d +add z25.d, z25.d, z28.d +sqrdmulh z28.d, z26.d, z0.d[1] +str q23, [x0, #1936] +mul z26.d, z26.d,z1.d[1] +mla z22.d, P0/M, z21.d, z31.d +sub z21.d, z25.d, z24.d +add z25.d, z25.d, z24.d +sqrdmulh z24.d, z19.d, z0.d[1] +str q20, [x0, #1168] +mul z19.d, z19.d,z1.d[1] +mla z26.d, P0/M, z28.d, z31.d +sub z28.d, z29.d, z22.d +add z29.d, z29.d, z22.d +sqrdmulh z22.d, z25.d, z14.d[0] +str q18, [x0, #1424] +mul z25.d, z25.d,z15.d[0] +mla z19.d, P0/M, z24.d, z31.d +sub z24.d, z30.d, z26.d +add z30.d, z30.d, z26.d +sqrdmulh z26.d, z21.d, z14.d[1] +mul z21.d, z21.d,z15.d[1] +mla z25.d, P0/M, z22.d, z31.d +sub z22.d, z27.d, z19.d +add z27.d, z27.d, z19.d +sqrdmulh z19.d, z24.d, z12.d[1] +mul z24.d, z24.d,z13.d[1] +ldr q18, [x0, #1968] +mla z21.d, P0/M, z26.d, z31.d +sub z26.d, z29.d, z25.d +add z29.d, z29.d, z25.d +sqrdmulh z25.d, z30.d, z12.d[0] +mul z30.d, z30.d,z13.d[0] +ldr q20, [x0, #1712] +mla z24.d, P0/M, z19.d, z31.d +sub z19.d, z28.d, z21.d +add z28.d, z28.d, 
z21.d +sqrdmulh z21.d, z18.d, z2.d[0] +mul z18.d, z18.d,z3.d[0] +ldr q23, [x0, #1200] +mla z30.d, P0/M, z25.d, z31.d +sub z25.d, z22.d, z24.d +add z22.d, z22.d, z24.d +sqrdmulh z24.d, z20.d, z2.d[0] +str q29, [x0, #160] +mul z20.d, z20.d,z3.d[0] +ldr q29, [x0, #1456] +mla z18.d, P0/M, z21.d, z31.d +sub z21.d, z27.d, z30.d +add z27.d, z27.d, z30.d +sqrdmulh z30.d, z23.d, z2.d[0] +str q26, [x0, #416] +mul z23.d, z23.d,z3.d[0] +ldr q26, [x0, #944] +mla z20.d, P0/M, z24.d, z31.d +sub z24.d, z26.d, z18.d +add z26.d, z26.d, z18.d +sqrdmulh z18.d, z29.d, z2.d[0] +str q28, [x0, #672] +mul z29.d, z29.d,z3.d[0] +ldr q28, [x0, #688] +mla z23.d, P0/M, z30.d, z31.d +sub z30.d, z28.d, z20.d +add z28.d, z28.d, z20.d +sqrdmulh z20.d, z26.d, z0.d[0] +str q19, [x0, #928] +mul z26.d, z26.d,z1.d[0] +ldr q19, [x0, #176] +mla z29.d, P0/M, z18.d, z31.d +sub z18.d, z19.d, z23.d +add z19.d, z19.d, z23.d +sqrdmulh z23.d, z28.d, z0.d[0] +str q22, [x0, #1696] +mul z28.d, z28.d,z1.d[0] +ldr q22, [x0, #432] +mla z26.d, P0/M, z20.d, z31.d +sub z20.d, z22.d, z29.d +add z22.d, z22.d, z29.d +sqrdmulh z29.d, z24.d, z0.d[1] +str q25, [x0, #1952] +mul z24.d, z24.d,z1.d[1] +mla z28.d, P0/M, z23.d, z31.d +sub z23.d, z22.d, z26.d +add z22.d, z22.d, z26.d +sqrdmulh z26.d, z30.d, z0.d[1] +str q27, [x0, #1184] +mul z30.d, z30.d,z1.d[1] +mla z24.d, P0/M, z29.d, z31.d +sub z29.d, z19.d, z28.d +add z19.d, z19.d, z28.d +sqrdmulh z28.d, z22.d, z14.d[0] +str q21, [x0, #1440] +mul z22.d, z22.d,z15.d[0] +mla z30.d, P0/M, z26.d, z31.d +sub z26.d, z20.d, z24.d +add z20.d, z20.d, z24.d +sqrdmulh z24.d, z23.d, z14.d[1] +mul z23.d, z23.d,z15.d[1] +mla z22.d, P0/M, z28.d, z31.d +sub z28.d, z18.d, z30.d +add z18.d, z18.d, z30.d +sqrdmulh z30.d, z26.d, z12.d[1] +mul z26.d, z26.d,z13.d[1] +ldr q21, [x0, #1984] +mla z23.d, P0/M, z24.d, z31.d +sub z24.d, z19.d, z22.d +add z19.d, z19.d, z22.d +sqrdmulh z22.d, z20.d, z12.d[0] +mul z20.d, z20.d,z13.d[0] +ldr q27, [x0, #1728] +mla z26.d, P0/M, z30.d, z31.d +sub z30.d, z29.d, 
z23.d +add z29.d, z29.d, z23.d +sqrdmulh z23.d, z21.d, z2.d[0] +mul z21.d, z21.d,z3.d[0] +ldr q25, [x0, #1216] +mla z20.d, P0/M, z22.d, z31.d +sub z22.d, z28.d, z26.d +add z28.d, z28.d, z26.d +sqrdmulh z26.d, z27.d, z2.d[0] +str q19, [x0, #176] +mul z27.d, z27.d,z3.d[0] +ldr q19, [x0, #1472] +mla z21.d, P0/M, z23.d, z31.d +sub z23.d, z18.d, z20.d +add z18.d, z18.d, z20.d +sqrdmulh z20.d, z25.d, z2.d[0] +str q24, [x0, #432] +mul z25.d, z25.d,z3.d[0] +ldr q24, [x0, #960] +mla z27.d, P0/M, z26.d, z31.d +sub z26.d, z24.d, z21.d +add z24.d, z24.d, z21.d +sqrdmulh z21.d, z19.d, z2.d[0] +str q29, [x0, #688] +mul z19.d, z19.d,z3.d[0] +ldr q29, [x0, #704] +mla z25.d, P0/M, z20.d, z31.d +sub z20.d, z29.d, z27.d +add z29.d, z29.d, z27.d +sqrdmulh z27.d, z24.d, z0.d[0] +str q30, [x0, #944] +mul z24.d, z24.d,z1.d[0] +ldr q30, [x0, #192] +mla z19.d, P0/M, z21.d, z31.d +sub z21.d, z30.d, z25.d +add z30.d, z30.d, z25.d +sqrdmulh z25.d, z29.d, z0.d[0] +str q28, [x0, #1712] +mul z29.d, z29.d,z1.d[0] +ldr q28, [x0, #448] +mla z24.d, P0/M, z27.d, z31.d +sub z27.d, z28.d, z19.d +add z28.d, z28.d, z19.d +sqrdmulh z19.d, z26.d, z0.d[1] +str q22, [x0, #1968] +mul z26.d, z26.d,z1.d[1] +mla z29.d, P0/M, z25.d, z31.d +sub z25.d, z28.d, z24.d +add z28.d, z28.d, z24.d +sqrdmulh z24.d, z20.d, z0.d[1] +str q18, [x0, #1200] +mul z20.d, z20.d,z1.d[1] +mla z26.d, P0/M, z19.d, z31.d +sub z19.d, z30.d, z29.d +add z30.d, z30.d, z29.d +sqrdmulh z29.d, z28.d, z14.d[0] +str q23, [x0, #1456] +mul z28.d, z28.d,z15.d[0] +mla z20.d, P0/M, z24.d, z31.d +sub z24.d, z27.d, z26.d +add z27.d, z27.d, z26.d +sqrdmulh z26.d, z25.d, z14.d[1] +mul z25.d, z25.d,z15.d[1] +mla z28.d, P0/M, z29.d, z31.d +sub z29.d, z21.d, z20.d +add z21.d, z21.d, z20.d +sqrdmulh z20.d, z24.d, z12.d[1] +mul z24.d, z24.d,z13.d[1] +ldr q23, [x0, #2000] +mla z25.d, P0/M, z26.d, z31.d +sub z26.d, z30.d, z28.d +add z30.d, z30.d, z28.d +sqrdmulh z28.d, z27.d, z12.d[0] +mul z27.d, z27.d,z13.d[0] +ldr q18, [x0, #1744] +mla z24.d, P0/M, z20.d, 
z31.d +sub z20.d, z19.d, z25.d +add z19.d, z19.d, z25.d +sqrdmulh z25.d, z23.d, z2.d[0] +mul z23.d, z23.d,z3.d[0] +ldr q22, [x0, #1232] +mla z27.d, P0/M, z28.d, z31.d +sub z28.d, z29.d, z24.d +add z29.d, z29.d, z24.d +sqrdmulh z24.d, z18.d, z2.d[0] +str q30, [x0, #192] +mul z18.d, z18.d,z3.d[0] +ldr q30, [x0, #1488] +mla z23.d, P0/M, z25.d, z31.d +sub z25.d, z21.d, z27.d +add z21.d, z21.d, z27.d +sqrdmulh z27.d, z22.d, z2.d[0] +str q26, [x0, #448] +mul z22.d, z22.d,z3.d[0] +ldr q26, [x0, #976] +mla z18.d, P0/M, z24.d, z31.d +sub z24.d, z26.d, z23.d +add z26.d, z26.d, z23.d +sqrdmulh z23.d, z30.d, z2.d[0] +str q19, [x0, #704] +mul z30.d, z30.d,z3.d[0] +ldr q19, [x0, #720] +mla z22.d, P0/M, z27.d, z31.d +sub z27.d, z19.d, z18.d +add z19.d, z19.d, z18.d +sqrdmulh z18.d, z26.d, z0.d[0] +str q20, [x0, #960] +mul z26.d, z26.d,z1.d[0] +ldr q20, [x0, #208] +mla z30.d, P0/M, z23.d, z31.d +sub z23.d, z20.d, z22.d +add z20.d, z20.d, z22.d +sqrdmulh z22.d, z19.d, z0.d[0] +str q29, [x0, #1728] +mul z19.d, z19.d,z1.d[0] +ldr q29, [x0, #464] +mla z26.d, P0/M, z18.d, z31.d +sub z18.d, z29.d, z30.d +add z29.d, z29.d, z30.d +sqrdmulh z30.d, z24.d, z0.d[1] +str q28, [x0, #1984] +mul z24.d, z24.d,z1.d[1] +mla z19.d, P0/M, z22.d, z31.d +sub z22.d, z29.d, z26.d +add z29.d, z29.d, z26.d +sqrdmulh z26.d, z27.d, z0.d[1] +str q21, [x0, #1216] +mul z27.d, z27.d,z1.d[1] +mla z24.d, P0/M, z30.d, z31.d +sub z30.d, z20.d, z19.d +add z20.d, z20.d, z19.d +sqrdmulh z19.d, z29.d, z14.d[0] +str q25, [x0, #1472] +mul z29.d, z29.d,z15.d[0] +mla z27.d, P0/M, z26.d, z31.d +sub z26.d, z18.d, z24.d +add z18.d, z18.d, z24.d +sqrdmulh z24.d, z22.d, z14.d[1] +mul z22.d, z22.d,z15.d[1] +mla z29.d, P0/M, z19.d, z31.d +sub z19.d, z23.d, z27.d +add z23.d, z23.d, z27.d +sqrdmulh z27.d, z26.d, z12.d[1] +mul z26.d, z26.d,z13.d[1] +ldr q25, [x0, #2016] +mla z22.d, P0/M, z24.d, z31.d +sub z24.d, z20.d, z29.d +add z20.d, z20.d, z29.d +sqrdmulh z29.d, z18.d, z12.d[0] +mul z18.d, z18.d,z13.d[0] +ldr q21, [x0, #1760] +mla 
z26.d, P0/M, z27.d, z31.d +sub z27.d, z30.d, z22.d +add z30.d, z30.d, z22.d +sqrdmulh z22.d, z25.d, z2.d[0] +mul z25.d, z25.d,z3.d[0] +ldr q28, [x0, #1248] +mla z18.d, P0/M, z29.d, z31.d +sub z29.d, z19.d, z26.d +add z19.d, z19.d, z26.d +sqrdmulh z26.d, z21.d, z2.d[0] +str q20, [x0, #208] +mul z21.d, z21.d,z3.d[0] +ldr q20, [x0, #1504] +mla z25.d, P0/M, z22.d, z31.d +sub z22.d, z23.d, z18.d +add z23.d, z23.d, z18.d +sqrdmulh z18.d, z28.d, z2.d[0] +str q24, [x0, #464] +mul z28.d, z28.d,z3.d[0] +ldr q24, [x0, #992] +mla z21.d, P0/M, z26.d, z31.d +sub z26.d, z24.d, z25.d +add z24.d, z24.d, z25.d +sqrdmulh z25.d, z20.d, z2.d[0] +str q30, [x0, #720] +mul z20.d, z20.d,z3.d[0] +ldr q30, [x0, #736] +mla z28.d, P0/M, z18.d, z31.d +sub z18.d, z30.d, z21.d +add z30.d, z30.d, z21.d +sqrdmulh z21.d, z24.d, z0.d[0] +str q27, [x0, #976] +mul z24.d, z24.d,z1.d[0] +ldr q27, [x0, #224] +mla z20.d, P0/M, z25.d, z31.d +sub z25.d, z27.d, z28.d +add z27.d, z27.d, z28.d +sqrdmulh z28.d, z30.d, z0.d[0] +str q19, [x0, #1744] +mul z30.d, z30.d,z1.d[0] +ldr q19, [x0, #480] +mla z24.d, P0/M, z21.d, z31.d +sub z21.d, z19.d, z20.d +add z19.d, z19.d, z20.d +sqrdmulh z20.d, z26.d, z0.d[1] +str q29, [x0, #2000] +mul z26.d, z26.d,z1.d[1] +mla z30.d, P0/M, z28.d, z31.d +sub z28.d, z19.d, z24.d +add z19.d, z19.d, z24.d +sqrdmulh z24.d, z18.d, z0.d[1] +str q23, [x0, #1232] +mul z18.d, z18.d,z1.d[1] +mla z26.d, P0/M, z20.d, z31.d +sub z20.d, z27.d, z30.d +add z27.d, z27.d, z30.d +sqrdmulh z30.d, z19.d, z14.d[0] +str q22, [x0, #1488] +mul z19.d, z19.d,z15.d[0] +mla z18.d, P0/M, z24.d, z31.d +sub z24.d, z21.d, z26.d +add z21.d, z21.d, z26.d +sqrdmulh z26.d, z28.d, z14.d[1] +mul z28.d, z28.d,z15.d[1] +mla z19.d, P0/M, z30.d, z31.d +sub z30.d, z25.d, z18.d +add z25.d, z25.d, z18.d +sqrdmulh z18.d, z24.d, z12.d[1] +mul z24.d, z24.d,z13.d[1] +ldr q22, [x0, #2032] +mla z28.d, P0/M, z26.d, z31.d +sub z26.d, z27.d, z19.d +add z27.d, z27.d, z19.d +sqrdmulh z19.d, z21.d, z12.d[0] +mul z21.d, z21.d,z13.d[0] +ldr 
q23, [x0, #1776] +mla z24.d, P0/M, z18.d, z31.d +sub z18.d, z20.d, z28.d +add z20.d, z20.d, z28.d +sqrdmulh z28.d, z22.d, z2.d[0] +mul z22.d, z22.d,z3.d[0] +ldr q29, [x0, #1264] +mla z21.d, P0/M, z19.d, z31.d +sub z19.d, z30.d, z24.d +add z30.d, z30.d, z24.d +sqrdmulh z24.d, z23.d, z2.d[0] +str q27, [x0, #224] +mul z23.d, z23.d,z3.d[0] +ldr q27, [x0, #1520] +mla z22.d, P0/M, z28.d, z31.d +sub z28.d, z25.d, z21.d +add z25.d, z25.d, z21.d +sqrdmulh z21.d, z29.d, z2.d[0] +str q26, [x0, #480] +mul z29.d, z29.d,z3.d[0] +ldr q26, [x0, #1008] +mla z23.d, P0/M, z24.d, z31.d +sub z24.d, z26.d, z22.d +add z26.d, z26.d, z22.d +sqrdmulh z22.d, z27.d, z2.d[0] +str q20, [x0, #736] +mul z27.d, z27.d,z3.d[0] +ldr q20, [x0, #752] +mla z29.d, P0/M, z21.d, z31.d +sub z21.d, z20.d, z23.d +add z20.d, z20.d, z23.d +sqrdmulh z23.d, z26.d, z0.d[0] +str q18, [x0, #992] +mul z26.d, z26.d,z1.d[0] +ldr q18, [x0, #240] +mla z27.d, P0/M, z22.d, z31.d +sub z22.d, z18.d, z29.d +add z18.d, z18.d, z29.d +sqrdmulh z29.d, z20.d, z0.d[0] +str q30, [x0, #1760] +mul z20.d, z20.d,z1.d[0] +ldr q30, [x0, #496] +mla z26.d, P0/M, z23.d, z31.d +sub z23.d, z30.d, z27.d +add z30.d, z30.d, z27.d +sqrdmulh z27.d, z24.d, z0.d[1] +str q19, [x0, #2016] +mul z24.d, z24.d,z1.d[1] +mla z20.d, P0/M, z29.d, z31.d +sub z29.d, z30.d, z26.d +add z30.d, z30.d, z26.d +sqrdmulh z26.d, z21.d, z0.d[1] +str q25, [x0, #1248] +mul z21.d, z21.d,z1.d[1] +mla z24.d, P0/M, z27.d, z31.d +sub z27.d, z18.d, z20.d +add z18.d, z18.d, z20.d +sqrdmulh z20.d, z30.d, z14.d[0] +str q28, [x0, #1504] +mul z30.d, z30.d,z15.d[0] +mla z21.d, P0/M, z26.d, z31.d +sub z26.d, z23.d, z24.d +add z23.d, z23.d, z24.d +sqrdmulh z24.d, z29.d, z14.d[1] +mul z29.d, z29.d,z15.d[1] +mla z30.d, P0/M, z20.d, z31.d +sub z20.d, z22.d, z21.d +add z22.d, z22.d, z21.d +sqrdmulh z21.d, z26.d, z12.d[1] +mul z26.d, z26.d,z13.d[1] +ldr q28, [x0, #1792] +mla z29.d, P0/M, z24.d, z31.d +sub z24.d, z18.d, z30.d +add z18.d, z18.d, z30.d +sqrdmulh z30.d, z23.d, z12.d[0] +mul 
z23.d, z23.d,z13.d[0] +ldr q25, [x0, #1536] +mla z26.d, P0/M, z21.d, z31.d +sub z21.d, z27.d, z29.d +add z27.d, z27.d, z29.d +sqrdmulh z29.d, z28.d, z2.d[0] +mul z28.d, z28.d,z3.d[0] +ldr q19, [x0, #1024] +mla z23.d, P0/M, z30.d, z31.d +sub z30.d, z20.d, z26.d +add z20.d, z20.d, z26.d +sqrdmulh z26.d, z25.d, z2.d[0] +str q18, [x0, #240] +mul z25.d, z25.d,z3.d[0] +ldr q18, [x0, #1280] +mla z28.d, P0/M, z29.d, z31.d +sub z29.d, z22.d, z23.d +add z22.d, z22.d, z23.d +sqrdmulh z23.d, z19.d, z2.d[0] +str q24, [x0, #496] +mul z19.d, z19.d,z3.d[0] +ldr q24, [x0, #768] +mla z25.d, P0/M, z26.d, z31.d +sub z26.d, z24.d, z28.d +add z24.d, z24.d, z28.d +sqrdmulh z28.d, z18.d, z2.d[0] +str q27, [x0, #752] +mul z18.d, z18.d,z3.d[0] +ldr q27, [x0, #512] +mla z19.d, P0/M, z23.d, z31.d +sub z23.d, z27.d, z25.d +add z27.d, z27.d, z25.d +sqrdmulh z25.d, z24.d, z0.d[0] +str q21, [x0, #1008] +mul z24.d, z24.d,z1.d[0] +ldr q21, [x0, #0] +mla z18.d, P0/M, z28.d, z31.d +sub z28.d, z21.d, z19.d +add z21.d, z21.d, z19.d +sqrdmulh z19.d, z27.d, z0.d[0] +str q20, [x0, #1776] +mul z27.d, z27.d,z1.d[0] +ldr q20, [x0, #256] +mla z24.d, P0/M, z25.d, z31.d +sub z25.d, z20.d, z18.d +add z20.d, z20.d, z18.d +sqrdmulh z18.d, z26.d, z0.d[1] +str q30, [x0, #2032] +mul z26.d, z26.d,z1.d[1] +mla z27.d, P0/M, z19.d, z31.d +sub z19.d, z20.d, z24.d +add z20.d, z20.d, z24.d +sqrdmulh z24.d, z23.d, z0.d[1] +str q22, [x0, #1264] +mul z23.d, z23.d,z1.d[1] +mla z26.d, P0/M, z18.d, z31.d +sub z18.d, z21.d, z27.d +add z21.d, z21.d, z27.d +sqrdmulh z27.d, z20.d, z14.d[0] +str q29, [x0, #1520] +mul z20.d, z20.d,z15.d[0] +mla z23.d, P0/M, z24.d, z31.d +sub z24.d, z25.d, z26.d +add z25.d, z25.d, z26.d +sqrdmulh z26.d, z19.d, z14.d[1] +mul z19.d, z19.d,z15.d[1] +mla z20.d, P0/M, z27.d, z31.d +sub z27.d, z28.d, z23.d +add z28.d, z28.d, z23.d +sqrdmulh z23.d, z24.d, z12.d[1] +mul z24.d, z24.d,z13.d[1] +ldr q29, [x0, #1808] +mla z19.d, P0/M, z26.d, z31.d +sub z26.d, z21.d, z20.d +add z21.d, z21.d, z20.d +sqrdmulh z20.d, 
z25.d, z12.d[0] +mul z25.d, z25.d,z13.d[0] +ldr q22, [x0, #1552] +mla z24.d, P0/M, z23.d, z31.d +sub z23.d, z18.d, z19.d +add z18.d, z18.d, z19.d +sqrdmulh z19.d, z29.d, z2.d[0] +mul z29.d, z29.d,z3.d[0] +ldr q30, [x0, #1040] +mla z25.d, P0/M, z20.d, z31.d +sub z20.d, z27.d, z24.d +add z27.d, z27.d, z24.d +sqrdmulh z24.d, z22.d, z2.d[0] +str q21, [x0, #0] +mul z22.d, z22.d,z3.d[0] +ldr q21, [x0, #1296] +mla z29.d, P0/M, z19.d, z31.d +sub z19.d, z28.d, z25.d +add z28.d, z28.d, z25.d +sqrdmulh z25.d, z30.d, z2.d[0] +str q26, [x0, #256] +mul z30.d, z30.d,z3.d[0] +ldr q26, [x0, #784] +mla z22.d, P0/M, z24.d, z31.d +sub z24.d, z26.d, z29.d +add z26.d, z26.d, z29.d +sqrdmulh z29.d, z21.d, z2.d[0] +str q18, [x0, #512] +mul z21.d, z21.d,z3.d[0] +ldr q18, [x0, #528] +mla z30.d, P0/M, z25.d, z31.d +sub z25.d, z18.d, z22.d +add z18.d, z18.d, z22.d +sqrdmulh z22.d, z26.d, z0.d[0] +str q23, [x0, #768] +mul z26.d, z26.d,z1.d[0] +ldr q23, [x0, #16] +mla z21.d, P0/M, z29.d, z31.d +sub z29.d, z23.d, z30.d +add z23.d, z23.d, z30.d +sqrdmulh z30.d, z18.d, z0.d[0] +str q27, [x0, #1536] +mul z18.d, z18.d,z1.d[0] +ldr q27, [x0, #272] +mla z26.d, P0/M, z22.d, z31.d +sub z22.d, z27.d, z21.d +add z27.d, z27.d, z21.d +sqrdmulh z21.d, z24.d, z0.d[1] +str q20, [x0, #1792] +mul z24.d, z24.d,z1.d[1] +mla z18.d, P0/M, z30.d, z31.d +sub z30.d, z27.d, z26.d +add z27.d, z27.d, z26.d +sqrdmulh z26.d, z25.d, z0.d[1] +str q28, [x0, #1024] +mul z25.d, z25.d,z1.d[1] +mla z24.d, P0/M, z21.d, z31.d +sub z21.d, z23.d, z18.d +add z23.d, z23.d, z18.d +sqrdmulh z18.d, z27.d, z14.d[0] +str q19, [x0, #1280] +mul z27.d, z27.d,z15.d[0] +mla z25.d, P0/M, z26.d, z31.d +sub z26.d, z22.d, z24.d +add z22.d, z22.d, z24.d +sqrdmulh z24.d, z30.d, z14.d[1] +mul z30.d, z30.d,z15.d[1] +mla z27.d, P0/M, z18.d, z31.d +sub z18.d, z29.d, z25.d +add z29.d, z29.d, z25.d +sqrdmulh z25.d, z26.d, z12.d[1] +mul z26.d, z26.d,z13.d[1] +ldr q19, [x0, #1824] +mla z30.d, P0/M, z24.d, z31.d +sub z24.d, z23.d, z27.d +add z23.d, z23.d, z27.d 
+sqrdmulh z27.d, z22.d, z12.d[0] +mul z22.d, z22.d,z13.d[0] +ldr q28, [x0, #1568] +mla z26.d, P0/M, z25.d, z31.d +sub z25.d, z21.d, z30.d +add z21.d, z21.d, z30.d +sqrdmulh z30.d, z19.d, z2.d[0] +mul z19.d, z19.d,z3.d[0] +ldr q20, [x0, #1056] +mla z22.d, P0/M, z27.d, z31.d +sub z27.d, z18.d, z26.d +add z18.d, z18.d, z26.d +sqrdmulh z26.d, z28.d, z2.d[0] +str q23, [x0, #16] +mul z28.d, z28.d,z3.d[0] +ldr q23, [x0, #1312] +mla z19.d, P0/M, z30.d, z31.d +sub z30.d, z29.d, z22.d +add z29.d, z29.d, z22.d +sqrdmulh z22.d, z20.d, z2.d[0] +str q24, [x0, #272] +mul z20.d, z20.d,z3.d[0] +ldr q24, [x0, #800] +mla z28.d, P0/M, z26.d, z31.d +sub z26.d, z24.d, z19.d +add z24.d, z24.d, z19.d +sqrdmulh z19.d, z23.d, z2.d[0] +str q21, [x0, #528] +mul z23.d, z23.d,z3.d[0] +ldr q21, [x0, #544] +mla z20.d, P0/M, z22.d, z31.d +sub z22.d, z21.d, z28.d +add z21.d, z21.d, z28.d +sqrdmulh z28.d, z24.d, z0.d[0] +str q25, [x0, #784] +mul z24.d, z24.d,z1.d[0] +ldr q25, [x0, #32] +mla z23.d, P0/M, z19.d, z31.d +sub z19.d, z25.d, z20.d +add z25.d, z25.d, z20.d +sqrdmulh z20.d, z21.d, z0.d[0] +str q18, [x0, #1552] +mul z21.d, z21.d,z1.d[0] +ldr q18, [x0, #288] +mla z24.d, P0/M, z28.d, z31.d +sub z28.d, z18.d, z23.d +add z18.d, z18.d, z23.d +sqrdmulh z23.d, z26.d, z0.d[1] +str q27, [x0, #1808] +mul z26.d, z26.d,z1.d[1] +mla z21.d, P0/M, z20.d, z31.d +sub z20.d, z18.d, z24.d +add z18.d, z18.d, z24.d +sqrdmulh z24.d, z22.d, z0.d[1] +str q29, [x0, #1040] +mul z22.d, z22.d,z1.d[1] +mla z26.d, P0/M, z23.d, z31.d +sub z23.d, z25.d, z21.d +add z25.d, z25.d, z21.d +sqrdmulh z21.d, z18.d, z14.d[0] +str q30, [x0, #1296] +mul z18.d, z18.d,z15.d[0] +mla z22.d, P0/M, z24.d, z31.d +sub z24.d, z28.d, z26.d +add z28.d, z28.d, z26.d +sqrdmulh z26.d, z20.d, z14.d[1] +mul z20.d, z20.d,z15.d[1] +mla z18.d, P0/M, z21.d, z31.d +sub z21.d, z19.d, z22.d +add z19.d, z19.d, z22.d +sqrdmulh z22.d, z24.d, z12.d[1] +mul z24.d, z24.d,z13.d[1] +ldr q30, [x0, #1840] +mla z20.d, P0/M, z26.d, z31.d +sub z26.d, z25.d, z18.d +add 
z25.d, z25.d, z18.d +sqrdmulh z18.d, z28.d, z12.d[0] +mul z28.d, z28.d,z13.d[0] +ldr q29, [x0, #1584] +mla z24.d, P0/M, z22.d, z31.d +sub z22.d, z23.d, z20.d +add z23.d, z23.d, z20.d +sqrdmulh z20.d, z30.d, z2.d[0] +mul z30.d, z30.d,z3.d[0] +ldr q27, [x0, #1072] +mla z28.d, P0/M, z18.d, z31.d +sub z18.d, z21.d, z24.d +add z21.d, z21.d, z24.d +sqrdmulh z24.d, z29.d, z2.d[0] +str q25, [x0, #32] +mul z29.d, z29.d,z3.d[0] +ldr q25, [x0, #1328] +mla z30.d, P0/M, z20.d, z31.d +sub z20.d, z19.d, z28.d +add z19.d, z19.d, z28.d +sqrdmulh z28.d, z27.d, z2.d[0] +str q26, [x0, #288] +mul z27.d, z27.d,z3.d[0] +ldr q26, [x0, #816] +mla z29.d, P0/M, z24.d, z31.d +sub z24.d, z26.d, z30.d +add z26.d, z26.d, z30.d +sqrdmulh z30.d, z25.d, z2.d[0] +str q23, [x0, #544] +mul z25.d, z25.d,z3.d[0] +ldr q23, [x0, #560] +mla z27.d, P0/M, z28.d, z31.d +sub z28.d, z23.d, z29.d +add z23.d, z23.d, z29.d +sqrdmulh z29.d, z26.d, z0.d[0] +str q22, [x0, #800] +mul z26.d, z26.d,z1.d[0] +ldr q22, [x0, #48] +mla z25.d, P0/M, z30.d, z31.d +sub z30.d, z22.d, z27.d +add z22.d, z22.d, z27.d +sqrdmulh z27.d, z23.d, z0.d[0] +str q21, [x0, #1568] +mul z23.d, z23.d,z1.d[0] +ldr q21, [x0, #304] +mla z26.d, P0/M, z29.d, z31.d +sub z29.d, z21.d, z25.d +add z21.d, z21.d, z25.d +sqrdmulh z25.d, z24.d, z0.d[1] +str q18, [x0, #1824] +mul z24.d, z24.d,z1.d[1] +mla z23.d, P0/M, z27.d, z31.d +sub z27.d, z21.d, z26.d +add z21.d, z21.d, z26.d +sqrdmulh z26.d, z28.d, z0.d[1] +str q19, [x0, #1056] +mul z28.d, z28.d,z1.d[1] +mla z24.d, P0/M, z25.d, z31.d +sub z25.d, z22.d, z23.d +add z22.d, z22.d, z23.d +sqrdmulh z23.d, z21.d, z14.d[0] +str q20, [x0, #1312] +mul z21.d, z21.d,z15.d[0] +mla z28.d, P0/M, z26.d, z31.d +sub z26.d, z29.d, z24.d +add z29.d, z29.d, z24.d +sqrdmulh z24.d, z27.d, z14.d[1] +mul z27.d, z27.d,z15.d[1] +mla z21.d, P0/M, z23.d, z31.d +sub z23.d, z30.d, z28.d +add z30.d, z30.d, z28.d +sqrdmulh z28.d, z26.d, z12.d[1] +mul z26.d, z26.d,z13.d[1] +ldr q20, [x0, #1856] +mla z27.d, P0/M, z24.d, z31.d +sub z24.d, 
z22.d, z21.d +add z22.d, z22.d, z21.d +sqrdmulh z21.d, z29.d, z12.d[0] +mul z29.d, z29.d,z13.d[0] +ldr q19, [x0, #1600] +mla z26.d, P0/M, z28.d, z31.d +sub z28.d, z25.d, z27.d +add z25.d, z25.d, z27.d +sqrdmulh z27.d, z20.d, z2.d[0] +mul z20.d, z20.d,z3.d[0] +ldr q18, [x0, #1088] +mla z29.d, P0/M, z21.d, z31.d +sub z21.d, z23.d, z26.d +add z23.d, z23.d, z26.d +sqrdmulh z26.d, z19.d, z2.d[0] +str q22, [x0, #48] +mul z19.d, z19.d,z3.d[0] +ldr q22, [x0, #1344] +mla z20.d, P0/M, z27.d, z31.d +sub z27.d, z30.d, z29.d +add z30.d, z30.d, z29.d +sqrdmulh z29.d, z18.d, z2.d[0] +str q24, [x0, #304] +mul z18.d, z18.d,z3.d[0] +ldr q24, [x0, #832] +mla z19.d, P0/M, z26.d, z31.d +sub z26.d, z24.d, z20.d +add z24.d, z24.d, z20.d +sqrdmulh z20.d, z22.d, z2.d[0] +str q25, [x0, #560] +mul z22.d, z22.d,z3.d[0] +ldr q25, [x0, #576] +mla z18.d, P0/M, z29.d, z31.d +sub z29.d, z25.d, z19.d +add z25.d, z25.d, z19.d +sqrdmulh z19.d, z24.d, z0.d[0] +str q28, [x0, #816] +mul z24.d, z24.d,z1.d[0] +ldr q28, [x0, #64] +mla z22.d, P0/M, z20.d, z31.d +sub z20.d, z28.d, z18.d +add z28.d, z28.d, z18.d +sqrdmulh z18.d, z25.d, z0.d[0] +str q23, [x0, #1584] +mul z25.d, z25.d,z1.d[0] +ldr q23, [x0, #320] +mla z24.d, P0/M, z19.d, z31.d +sub z19.d, z23.d, z22.d +add z23.d, z23.d, z22.d +sqrdmulh z22.d, z26.d, z0.d[1] +str q21, [x0, #1840] +mul z26.d, z26.d,z1.d[1] +mla z25.d, P0/M, z18.d, z31.d +sub z18.d, z23.d, z24.d +add z23.d, z23.d, z24.d +sqrdmulh z24.d, z29.d, z0.d[1] +str q30, [x0, #1072] +mul z29.d, z29.d,z1.d[1] +mla z26.d, P0/M, z22.d, z31.d +sub z22.d, z28.d, z25.d +add z28.d, z28.d, z25.d +sqrdmulh z25.d, z23.d, z14.d[0] +str q27, [x0, #1328] +mul z23.d, z23.d,z15.d[0] +mla z29.d, P0/M, z24.d, z31.d +sub z24.d, z19.d, z26.d +add z19.d, z19.d, z26.d +sqrdmulh z26.d, z18.d, z14.d[1] +mul z18.d, z18.d,z15.d[1] +mla z23.d, P0/M, z25.d, z31.d +sub z25.d, z20.d, z29.d +add z20.d, z20.d, z29.d +sqrdmulh z29.d, z24.d, z12.d[1] +mul z24.d, z24.d,z13.d[1] +ldr q27, [x0, #1872] +mla z18.d, P0/M, z26.d, 
z31.d +sub z26.d, z28.d, z23.d +add z28.d, z28.d, z23.d +sqrdmulh z23.d, z19.d, z12.d[0] +mul z19.d, z19.d,z13.d[0] +ldr q30, [x0, #1616] +mla z24.d, P0/M, z29.d, z31.d +sub z29.d, z22.d, z18.d +add z22.d, z22.d, z18.d +sqrdmulh z18.d, z27.d, z2.d[0] +mul z27.d, z27.d,z3.d[0] +ldr q21, [x0, #1104] +mla z19.d, P0/M, z23.d, z31.d +sub z23.d, z25.d, z24.d +add z25.d, z25.d, z24.d +sqrdmulh z24.d, z30.d, z2.d[0] +str q28, [x0, #64] +mul z30.d, z30.d,z3.d[0] +ldr q28, [x0, #1360] +mla z27.d, P0/M, z18.d, z31.d +sub z18.d, z20.d, z19.d +add z20.d, z20.d, z19.d +sqrdmulh z19.d, z21.d, z2.d[0] +str q26, [x0, #320] +mul z21.d, z21.d,z3.d[0] +ldr q26, [x0, #848] +mla z30.d, P0/M, z24.d, z31.d +sub z24.d, z26.d, z27.d +add z26.d, z26.d, z27.d +sqrdmulh z27.d, z28.d, z2.d[0] +str q22, [x0, #576] +mul z28.d, z28.d,z3.d[0] +ldr q22, [x0, #592] +mla z21.d, P0/M, z19.d, z31.d +sub z19.d, z22.d, z30.d +add z22.d, z22.d, z30.d +sqrdmulh z30.d, z26.d, z0.d[0] +str q29, [x0, #832] +mul z26.d, z26.d,z1.d[0] +ldr q29, [x0, #80] +mla z28.d, P0/M, z27.d, z31.d +sub z27.d, z29.d, z21.d +add z29.d, z29.d, z21.d +sqrdmulh z21.d, z22.d, z0.d[0] +str q25, [x0, #1600] +mul z22.d, z22.d,z1.d[0] +ldr q25, [x0, #336] +mla z26.d, P0/M, z30.d, z31.d +sub z30.d, z25.d, z28.d +add z25.d, z25.d, z28.d +sqrdmulh z28.d, z24.d, z0.d[1] +str q23, [x0, #1856] +mul z24.d, z24.d,z1.d[1] +mla z22.d, P0/M, z21.d, z31.d +sub z21.d, z25.d, z26.d +add z25.d, z25.d, z26.d +sqrdmulh z26.d, z19.d, z0.d[1] +str q20, [x0, #1088] +mul z19.d, z19.d,z1.d[1] +mla z24.d, P0/M, z28.d, z31.d +sub z28.d, z29.d, z22.d +add z29.d, z29.d, z22.d +sqrdmulh z22.d, z25.d, z14.d[0] +str q18, [x0, #1344] +mul z25.d, z25.d,z15.d[0] +mla z19.d, P0/M, z26.d, z31.d +sub z26.d, z30.d, z24.d +add z30.d, z30.d, z24.d +sqrdmulh z24.d, z21.d, z14.d[1] +mul z21.d, z21.d,z15.d[1] +mla z25.d, P0/M, z22.d, z31.d +sub z22.d, z27.d, z19.d +add z27.d, z27.d, z19.d +sqrdmulh z19.d, z26.d, z12.d[1] +mul z26.d, z26.d,z13.d[1] +ldr q18, [x0, #1888] +mla 
z21.d, P0/M, z24.d, z31.d +sub z24.d, z29.d, z25.d +add z29.d, z29.d, z25.d +sqrdmulh z25.d, z30.d, z12.d[0] +mul z30.d, z30.d,z13.d[0] +ldr q20, [x0, #1632] +mla z26.d, P0/M, z19.d, z31.d +sub z19.d, z28.d, z21.d +add z28.d, z28.d, z21.d +sqrdmulh z21.d, z18.d, z2.d[0] +mul z18.d, z18.d,z3.d[0] +ldr q23, [x0, #1120] +mla z30.d, P0/M, z25.d, z31.d +sub z25.d, z22.d, z26.d +add z22.d, z22.d, z26.d +sqrdmulh z26.d, z20.d, z2.d[0] +str q29, [x0, #80] +mul z20.d, z20.d,z3.d[0] +ldr q29, [x0, #1376] +mla z18.d, P0/M, z21.d, z31.d +sub z21.d, z27.d, z30.d +add z27.d, z27.d, z30.d +sqrdmulh z30.d, z23.d, z2.d[0] +str q24, [x0, #336] +mul z23.d, z23.d,z3.d[0] +ldr q24, [x0, #864] +mla z20.d, P0/M, z26.d, z31.d +sub z26.d, z24.d, z18.d +add z24.d, z24.d, z18.d +sqrdmulh z18.d, z29.d, z2.d[0] +str q28, [x0, #592] +mul z29.d, z29.d,z3.d[0] +ldr q28, [x0, #608] +mla z23.d, P0/M, z30.d, z31.d +sub z30.d, z28.d, z20.d +add z28.d, z28.d, z20.d +sqrdmulh z20.d, z24.d, z0.d[0] +str q19, [x0, #848] +mul z24.d, z24.d,z1.d[0] +ldr q19, [x0, #96] +mla z29.d, P0/M, z18.d, z31.d +sub z18.d, z19.d, z23.d +add z19.d, z19.d, z23.d +sqrdmulh z23.d, z28.d, z0.d[0] +str q22, [x0, #1616] +mul z28.d, z28.d,z1.d[0] +ldr q22, [x0, #352] +mla z24.d, P0/M, z20.d, z31.d +sub z20.d, z22.d, z29.d +add z22.d, z22.d, z29.d +sqrdmulh z29.d, z26.d, z0.d[1] +str q25, [x0, #1872] +mul z26.d, z26.d,z1.d[1] +mla z28.d, P0/M, z23.d, z31.d +sub z23.d, z22.d, z24.d +add z22.d, z22.d, z24.d +sqrdmulh z24.d, z30.d, z0.d[1] +str q27, [x0, #1104] +mul z30.d, z30.d,z1.d[1] +mla z26.d, P0/M, z29.d, z31.d +sub z29.d, z19.d, z28.d +add z19.d, z19.d, z28.d +sqrdmulh z28.d, z22.d, z14.d[0] +str q21, [x0, #1360] +mul z22.d, z22.d,z15.d[0] +mla z30.d, P0/M, z24.d, z31.d +sub z24.d, z20.d, z26.d +add z20.d, z20.d, z26.d +sqrdmulh z26.d, z23.d, z14.d[1] +mul z23.d, z23.d,z15.d[1] +mla z22.d, P0/M, z28.d, z31.d +sub z28.d, z18.d, z30.d +add z18.d, z18.d, z30.d +sqrdmulh z30.d, z24.d, z12.d[1] +mul z24.d, z24.d,z13.d[1] +ldr 
q21, [x0, #1904] +mla z23.d, P0/M, z26.d, z31.d +sub z26.d, z19.d, z22.d +add z19.d, z19.d, z22.d +sqrdmulh z22.d, z20.d, z12.d[0] +mul z20.d, z20.d,z13.d[0] +ldr q27, [x0, #1648] +mla z24.d, P0/M, z30.d, z31.d +sub z30.d, z29.d, z23.d +add z29.d, z29.d, z23.d +sqrdmulh z23.d, z21.d, z2.d[0] +mul z21.d, z21.d,z3.d[0] +ldr q25, [x0, #1136] +mla z20.d, P0/M, z22.d, z31.d +sub z22.d, z28.d, z24.d +add z28.d, z28.d, z24.d +sqrdmulh z24.d, z27.d, z2.d[0] +str q19, [x0, #96] +mul z27.d, z27.d,z3.d[0] +ldr q19, [x0, #1392] +mla z21.d, P0/M, z23.d, z31.d +sub z23.d, z18.d, z20.d +add z18.d, z18.d, z20.d +sqrdmulh z20.d, z25.d, z2.d[0] +str q26, [x0, #352] +mul z25.d, z25.d,z3.d[0] +ldr q26, [x0, #880] +mla z27.d, P0/M, z24.d, z31.d +sub z24.d, z26.d, z21.d +add z26.d, z26.d, z21.d +sqrdmulh z21.d, z19.d, z2.d[0] +str q29, [x0, #608] +mul z19.d, z19.d,z3.d[0] +ldr q29, [x0, #624] +mla z25.d, P0/M, z20.d, z31.d +sub z20.d, z29.d, z27.d +add z29.d, z29.d, z27.d +sqrdmulh z27.d, z26.d, z0.d[0] +str q30, [x0, #864] +mul z26.d, z26.d,z1.d[0] +ldr q30, [x0, #112] +mla z19.d, P0/M, z21.d, z31.d +sub z21.d, z30.d, z25.d +add z30.d, z30.d, z25.d +sqrdmulh z25.d, z29.d, z0.d[0] +str q28, [x0, #1632] +mul z29.d, z29.d,z1.d[0] +ldr q28, [x0, #368] +mla z26.d, P0/M, z27.d, z31.d +sub z27.d, z28.d, z19.d +add z28.d, z28.d, z19.d +sqrdmulh z19.d, z24.d, z0.d[1] +str q22, [x0, #1888] +mul z24.d, z24.d,z1.d[1] +mla z29.d, P0/M, z25.d, z31.d +sub z25.d, z28.d, z26.d +add z28.d, z28.d, z26.d +sqrdmulh z26.d, z20.d, z0.d[1] +str q18, [x0, #1120] +mul z20.d, z20.d,z1.d[1] +mla z24.d, P0/M, z19.d, z31.d +sub z19.d, z30.d, z29.d +add z30.d, z30.d, z29.d +sqrdmulh z29.d, z28.d, z14.d[0] +str q23, [x0, #1376] +mul z28.d, z28.d,z15.d[0] +mla z20.d, P0/M, z26.d, z31.d +sub z26.d, z27.d, z24.d +add z27.d, z27.d, z24.d +sqrdmulh z24.d, z25.d, z14.d[1] +mul z25.d, z25.d,z15.d[1] +mla z28.d, P0/M, z29.d, z31.d +sub z29.d, z21.d, z20.d +add z21.d, z21.d, z20.d +sqrdmulh z20.d, z26.d, z12.d[1] +mul z26.d, 
z26.d,z13.d[1] +mla z25.d, P0/M, z24.d, z31.d +sub z24.d, z30.d, z28.d +add z30.d, z30.d, z28.d +sqrdmulh z28.d, z27.d, z12.d[0] +mul z27.d, z27.d,z13.d[0] +mla z26.d, P0/M, z20.d, z31.d +sub z20.d, z19.d, z25.d +add z19.d, z19.d, z25.d +mla z27.d, P0/M, z28.d, z31.d +sub z28.d, z29.d, z26.d +add z29.d, z29.d, z26.d +str q30, [x0, #112] +sub z30.d, z21.d, z27.d +add z21.d, z21.d, z27.d +str q24, [x0, #368] +str q19, [x0, #624] +str q20, [x0, #880] +str q29, [x0, #1648] +str q28, [x0, #1904] +str q21, [x0, #1136] +str q30, [x0, #1392] +ldr q4, [x17, #+128] +ldr q5, [x17, #+144] +ldr q6, [x17, #+160] +ldr q7, [x17, #+176] +ldr q8, [x17, #+192] +ldr q9, [x17, #+208] +ldr q10, [x17, #+224] +ldr q11, [x17, #+240] +ldr q16, [x0, #240] +ldr q17, [x0, #208] +sqrdmulh z22.d, z16.d, z5.d[0] +mul z16.d, z16.d,z4.d[0] +ldr q18, [x0, #144] +sqrdmulh z23.d, z17.d, z5.d[0] +mul z17.d, z17.d,z4.d[0] +ldr q25, [x0, #176] +mla z16.d, P0/M, z22.d, z31.d +sqrdmulh z22.d, z18.d, z5.d[0] +mul z18.d, z18.d,z4.d[0] +ldr q26, [x0, #112] +mla z17.d, P0/M, z23.d, z31.d +sub z23.d, z26.d, z16.d +add z26.d, z26.d, z16.d +sqrdmulh z16.d, z25.d, z5.d[0] +mul z25.d, z25.d,z4.d[0] +ldr q27, [x0, #80] +mla z18.d, P0/M, z22.d, z31.d +sub z22.d, z27.d, z17.d +add z27.d, z27.d, z17.d +sqrdmulh z17.d, z26.d, z7.d[0] +mul z26.d, z26.d,z6.d[0] +ldr q24, [x0, #16] +mla z25.d, P0/M, z16.d, z31.d +sub z16.d, z24.d, z18.d +add z24.d, z24.d, z18.d +sqrdmulh z18.d, z27.d, z7.d[0] +mul z27.d, z27.d,z6.d[0] +ldr q19, [x0, #48] +mla z26.d, P0/M, z17.d, z31.d +sub z17.d, z19.d, z25.d +add z19.d, z19.d, z25.d +sqrdmulh z25.d, z23.d, z7.d[1] +mul z23.d, z23.d,z6.d[1] +mla z27.d, P0/M, z18.d, z31.d +sub z18.d, z19.d, z26.d +add z19.d, z19.d, z26.d +sqrdmulh z26.d, z22.d, z7.d[1] +mul z22.d, z22.d,z6.d[1] +mla z23.d, P0/M, z25.d, z31.d +sub z25.d, z24.d, z27.d +add z24.d, z24.d, z27.d +sqrdmulh z27.d, z19.d, z9.d[0] +mul z19.d, z19.d,z8.d[0] +mla z22.d, P0/M, z26.d, z31.d +sub z26.d, z17.d, z23.d +add z17.d, z17.d, 
z23.d +sqrdmulh z23.d, z18.d, z9.d[1] +mul z18.d, z18.d,z8.d[1] +mla z19.d, P0/M, z27.d, z31.d +sub z27.d, z16.d, z22.d +add z16.d, z16.d, z22.d +sqrdmulh z22.d, z26.d, z11.d[1] +mul z26.d, z26.d,z10.d[1] +ldr q20, [x0, #224] +mla z18.d, P0/M, z23.d, z31.d +sub z23.d, z24.d, z19.d +add z24.d, z24.d, z19.d +sqrdmulh z19.d, z17.d, z11.d[0] +mul z17.d, z17.d,z10.d[0] +ldr q29, [x0, #192] +mla z26.d, P0/M, z22.d, z31.d +sub z22.d, z25.d, z18.d +add z25.d, z25.d, z18.d +sqrdmulh z18.d, z20.d, z5.d[0] +mul z20.d, z20.d,z4.d[0] +ldr q28, [x0, #128] +mla z17.d, P0/M, z19.d, z31.d +sub z19.d, z27.d, z26.d +add z27.d, z27.d, z26.d +sqrdmulh z26.d, z29.d, z5.d[0] +str q24, [x0, #16] +mul z29.d, z29.d,z4.d[0] +ldr q24, [x0, #160] +mla z20.d, P0/M, z18.d, z31.d +sub z18.d, z16.d, z17.d +add z16.d, z16.d, z17.d +sqrdmulh z17.d, z28.d, z5.d[0] +str q23, [x0, #48] +mul z28.d, z28.d,z4.d[0] +ldr q23, [x0, #96] +mla z29.d, P0/M, z26.d, z31.d +sub z26.d, z23.d, z20.d +add z23.d, z23.d, z20.d +sqrdmulh z20.d, z24.d, z5.d[0] +str q25, [x0, #80] +mul z24.d, z24.d,z4.d[0] +ldr q25, [x0, #64] +mla z28.d, P0/M, z17.d, z31.d +sub z17.d, z25.d, z29.d +add z25.d, z25.d, z29.d +sqrdmulh z29.d, z23.d, z7.d[0] +str q22, [x0, #112] +mul z23.d, z23.d,z6.d[0] +ldr q22, [x0, #0] +mla z24.d, P0/M, z20.d, z31.d +sub z20.d, z22.d, z28.d +add z22.d, z22.d, z28.d +sqrdmulh z28.d, z25.d, z7.d[0] +str q27, [x0, #208] +mul z25.d, z25.d,z6.d[0] +ldr q27, [x0, #32] +mla z23.d, P0/M, z29.d, z31.d +sub z29.d, z27.d, z24.d +add z27.d, z27.d, z24.d +sqrdmulh z24.d, z26.d, z7.d[1] +str q19, [x0, #240] +mul z26.d, z26.d,z6.d[1] +mla z25.d, P0/M, z28.d, z31.d +sub z28.d, z27.d, z23.d +add z27.d, z27.d, z23.d +sqrdmulh z23.d, z17.d, z7.d[1] +str q16, [x0, #144] +mul z17.d, z17.d,z6.d[1] +mla z26.d, P0/M, z24.d, z31.d +sub z24.d, z22.d, z25.d +add z22.d, z22.d, z25.d +ldr q3, [x17, #+256] +sqrdmulh z25.d, z27.d, z9.d[0] +str q18, [x0, #176] +mul z27.d, z27.d,z8.d[0] +mla z17.d, P0/M, z23.d, z31.d +sub z23.d, z29.d, 
z26.d +add z29.d, z29.d, z26.d +ldr q2, [x17, #+272] +sqrdmulh z26.d, z28.d, z9.d[1] +mul z28.d, z28.d,z8.d[1] +mla z27.d, P0/M, z25.d, z31.d +sub z25.d, z20.d, z17.d +add z20.d, z20.d, z17.d +ldr q1, [x17, #+288] +ldr q0, [x17, #+304] +ldr q15, [x17, #+320] +ldr q14, [x17, #+336] +ldr q13, [x17, #+352] +ldr q12, [x17, #+368] +sqrdmulh z17.d, z23.d, z11.d[1] +mul z23.d, z23.d,z10.d[1] +ldr q18, [x0, #496] +mla z28.d, P0/M, z26.d, z31.d +sub z26.d, z22.d, z27.d +add z22.d, z22.d, z27.d +sqrdmulh z27.d, z29.d, z11.d[0] +mul z29.d, z29.d,z10.d[0] +ldr q16, [x0, #464] +mla z23.d, P0/M, z17.d, z31.d +sub z17.d, z24.d, z28.d +add z24.d, z24.d, z28.d +sqrdmulh z28.d, z18.d, z2.d[0] +mul z18.d, z18.d,z3.d[0] +ldr q19, [x0, #400] +mla z29.d, P0/M, z27.d, z31.d +sub z27.d, z25.d, z23.d +add z25.d, z25.d, z23.d +sqrdmulh z23.d, z16.d, z2.d[0] +str q22, [x0, #0] +mul z16.d, z16.d,z3.d[0] +ldr q22, [x0, #432] +mla z18.d, P0/M, z28.d, z31.d +sub z28.d, z20.d, z29.d +add z20.d, z20.d, z29.d +sqrdmulh z29.d, z19.d, z2.d[0] +str q26, [x0, #32] +mul z19.d, z19.d,z3.d[0] +ldr q26, [x0, #368] +mla z16.d, P0/M, z23.d, z31.d +sub z23.d, z26.d, z18.d +add z26.d, z26.d, z18.d +sqrdmulh z18.d, z22.d, z2.d[0] +str q24, [x0, #64] +mul z22.d, z22.d,z3.d[0] +ldr q24, [x0, #336] +mla z19.d, P0/M, z29.d, z31.d +sub z29.d, z24.d, z16.d +add z24.d, z24.d, z16.d +sqrdmulh z16.d, z26.d, z0.d[0] +str q17, [x0, #96] +mul z26.d, z26.d,z1.d[0] +ldr q17, [x0, #272] +mla z22.d, P0/M, z18.d, z31.d +sub z18.d, z17.d, z19.d +add z17.d, z17.d, z19.d +sqrdmulh z19.d, z24.d, z0.d[0] +str q25, [x0, #192] +mul z24.d, z24.d,z1.d[0] +ldr q25, [x0, #304] +mla z26.d, P0/M, z16.d, z31.d +sub z16.d, z25.d, z22.d +add z25.d, z25.d, z22.d +sqrdmulh z22.d, z23.d, z0.d[1] +str q27, [x0, #224] +mul z23.d, z23.d,z1.d[1] +mla z24.d, P0/M, z19.d, z31.d +sub z19.d, z25.d, z26.d +add z25.d, z25.d, z26.d +sqrdmulh z26.d, z29.d, z0.d[1] +str q20, [x0, #128] +mul z29.d, z29.d,z1.d[1] +mla z23.d, P0/M, z22.d, z31.d +sub z22.d, 
z17.d, z24.d +add z17.d, z17.d, z24.d +sqrdmulh z24.d, z25.d, z14.d[0] +str q28, [x0, #160] +mul z25.d, z25.d,z15.d[0] +mla z29.d, P0/M, z26.d, z31.d +sub z26.d, z16.d, z23.d +add z16.d, z16.d, z23.d +sqrdmulh z23.d, z19.d, z14.d[1] +mul z19.d, z19.d,z15.d[1] +mla z25.d, P0/M, z24.d, z31.d +sub z24.d, z18.d, z29.d +add z18.d, z18.d, z29.d +sqrdmulh z29.d, z26.d, z12.d[1] +mul z26.d, z26.d,z13.d[1] +ldr q28, [x0, #480] +mla z19.d, P0/M, z23.d, z31.d +sub z23.d, z17.d, z25.d +add z17.d, z17.d, z25.d +sqrdmulh z25.d, z16.d, z12.d[0] +mul z16.d, z16.d,z13.d[0] +ldr q20, [x0, #448] +mla z26.d, P0/M, z29.d, z31.d +sub z29.d, z22.d, z19.d +add z22.d, z22.d, z19.d +sqrdmulh z19.d, z28.d, z2.d[0] +mul z28.d, z28.d,z3.d[0] +ldr q27, [x0, #384] +mla z16.d, P0/M, z25.d, z31.d +sub z25.d, z24.d, z26.d +add z24.d, z24.d, z26.d +sqrdmulh z26.d, z20.d, z2.d[0] +str q17, [x0, #272] +mul z20.d, z20.d,z3.d[0] +ldr q17, [x0, #416] +mla z28.d, P0/M, z19.d, z31.d +sub z19.d, z18.d, z16.d +add z18.d, z18.d, z16.d +sqrdmulh z16.d, z27.d, z2.d[0] +str q23, [x0, #304] +mul z27.d, z27.d,z3.d[0] +ldr q23, [x0, #352] +mla z20.d, P0/M, z26.d, z31.d +sub z26.d, z23.d, z28.d +add z23.d, z23.d, z28.d +sqrdmulh z28.d, z17.d, z2.d[0] +str q22, [x0, #336] +mul z17.d, z17.d,z3.d[0] +ldr q22, [x0, #320] +mla z27.d, P0/M, z16.d, z31.d +sub z16.d, z22.d, z20.d +add z22.d, z22.d, z20.d +sqrdmulh z20.d, z23.d, z0.d[0] +str q29, [x0, #368] +mul z23.d, z23.d,z1.d[0] +ldr q29, [x0, #256] +mla z17.d, P0/M, z28.d, z31.d +sub z28.d, z29.d, z27.d +add z29.d, z29.d, z27.d +sqrdmulh z27.d, z22.d, z0.d[0] +str q24, [x0, #464] +mul z22.d, z22.d,z1.d[0] +ldr q24, [x0, #288] +mla z23.d, P0/M, z20.d, z31.d +sub z20.d, z24.d, z17.d +add z24.d, z24.d, z17.d +sqrdmulh z17.d, z26.d, z0.d[1] +str q25, [x0, #496] +mul z26.d, z26.d,z1.d[1] +mla z22.d, P0/M, z27.d, z31.d +sub z27.d, z24.d, z23.d +add z24.d, z24.d, z23.d +sqrdmulh z23.d, z16.d, z0.d[1] +str q18, [x0, #400] +mul z16.d, z16.d,z1.d[1] +mla z26.d, P0/M, z17.d, z31.d 
+sub z17.d, z29.d, z22.d +add z29.d, z29.d, z22.d +ldr q11, [x17, #+384] +sqrdmulh z22.d, z24.d, z14.d[0] +str q19, [x0, #432] +mul z24.d, z24.d,z15.d[0] +mla z16.d, P0/M, z23.d, z31.d +sub z23.d, z20.d, z26.d +add z20.d, z20.d, z26.d +ldr q10, [x17, #+400] +sqrdmulh z26.d, z27.d, z14.d[1] +mul z27.d, z27.d,z15.d[1] +mla z24.d, P0/M, z22.d, z31.d +sub z22.d, z28.d, z16.d +add z28.d, z28.d, z16.d +ldr q9, [x17, #+416] +ldr q8, [x17, #+432] +ldr q7, [x17, #+448] +ldr q6, [x17, #+464] +ldr q5, [x17, #+480] +ldr q4, [x17, #+496] +sqrdmulh z16.d, z23.d, z12.d[1] +mul z23.d, z23.d,z13.d[1] +ldr q19, [x0, #752] +mla z27.d, P0/M, z26.d, z31.d +sub z26.d, z29.d, z24.d +add z29.d, z29.d, z24.d +sqrdmulh z24.d, z20.d, z12.d[0] +mul z20.d, z20.d,z13.d[0] +ldr q18, [x0, #720] +mla z23.d, P0/M, z16.d, z31.d +sub z16.d, z17.d, z27.d +add z17.d, z17.d, z27.d +sqrdmulh z27.d, z19.d, z10.d[0] +mul z19.d, z19.d,z11.d[0] +ldr q25, [x0, #656] +mla z20.d, P0/M, z24.d, z31.d +sub z24.d, z22.d, z23.d +add z22.d, z22.d, z23.d +sqrdmulh z23.d, z18.d, z10.d[0] +str q29, [x0, #256] +mul z18.d, z18.d,z11.d[0] +ldr q29, [x0, #688] +mla z19.d, P0/M, z27.d, z31.d +sub z27.d, z28.d, z20.d +add z28.d, z28.d, z20.d +sqrdmulh z20.d, z25.d, z10.d[0] +str q26, [x0, #288] +mul z25.d, z25.d,z11.d[0] +ldr q26, [x0, #624] +mla z18.d, P0/M, z23.d, z31.d +sub z23.d, z26.d, z19.d +add z26.d, z26.d, z19.d +sqrdmulh z19.d, z29.d, z10.d[0] +str q17, [x0, #320] +mul z29.d, z29.d,z11.d[0] +ldr q17, [x0, #592] +mla z25.d, P0/M, z20.d, z31.d +sub z20.d, z17.d, z18.d +add z17.d, z17.d, z18.d +sqrdmulh z18.d, z26.d, z8.d[0] +str q16, [x0, #352] +mul z26.d, z26.d,z9.d[0] +ldr q16, [x0, #528] +mla z29.d, P0/M, z19.d, z31.d +sub z19.d, z16.d, z25.d +add z16.d, z16.d, z25.d +sqrdmulh z25.d, z17.d, z8.d[0] +str q22, [x0, #448] +mul z17.d, z17.d,z9.d[0] +ldr q22, [x0, #560] +mla z26.d, P0/M, z18.d, z31.d +sub z18.d, z22.d, z29.d +add z22.d, z22.d, z29.d +sqrdmulh z29.d, z23.d, z8.d[1] +str q24, [x0, #480] +mul z23.d, 
z23.d,z9.d[1] +mla z17.d, P0/M, z25.d, z31.d +sub z25.d, z22.d, z26.d +add z22.d, z22.d, z26.d +sqrdmulh z26.d, z20.d, z8.d[1] +str q28, [x0, #384] +mul z20.d, z20.d,z9.d[1] +mla z23.d, P0/M, z29.d, z31.d +sub z29.d, z16.d, z17.d +add z16.d, z16.d, z17.d +sqrdmulh z17.d, z22.d, z6.d[0] +str q27, [x0, #416] +mul z22.d, z22.d,z7.d[0] +mla z20.d, P0/M, z26.d, z31.d +sub z26.d, z18.d, z23.d +add z18.d, z18.d, z23.d +sqrdmulh z23.d, z25.d, z6.d[1] +mul z25.d, z25.d,z7.d[1] +mla z22.d, P0/M, z17.d, z31.d +sub z17.d, z19.d, z20.d +add z19.d, z19.d, z20.d +sqrdmulh z20.d, z26.d, z4.d[1] +mul z26.d, z26.d,z5.d[1] +ldr q27, [x0, #736] +mla z25.d, P0/M, z23.d, z31.d +sub z23.d, z16.d, z22.d +add z16.d, z16.d, z22.d +sqrdmulh z22.d, z18.d, z4.d[0] +mul z18.d, z18.d,z5.d[0] +ldr q28, [x0, #704] +mla z26.d, P0/M, z20.d, z31.d +sub z20.d, z29.d, z25.d +add z29.d, z29.d, z25.d +sqrdmulh z25.d, z27.d, z10.d[0] +mul z27.d, z27.d,z11.d[0] +ldr q24, [x0, #640] +mla z18.d, P0/M, z22.d, z31.d +sub z22.d, z17.d, z26.d +add z17.d, z17.d, z26.d +sqrdmulh z26.d, z28.d, z10.d[0] +str q16, [x0, #528] +mul z28.d, z28.d,z11.d[0] +ldr q16, [x0, #672] +mla z27.d, P0/M, z25.d, z31.d +sub z25.d, z19.d, z18.d +add z19.d, z19.d, z18.d +sqrdmulh z18.d, z24.d, z10.d[0] +str q23, [x0, #560] +mul z24.d, z24.d,z11.d[0] +ldr q23, [x0, #608] +mla z28.d, P0/M, z26.d, z31.d +sub z26.d, z23.d, z27.d +add z23.d, z23.d, z27.d +sqrdmulh z27.d, z16.d, z10.d[0] +str q29, [x0, #592] +mul z16.d, z16.d,z11.d[0] +ldr q29, [x0, #576] +mla z24.d, P0/M, z18.d, z31.d +sub z18.d, z29.d, z28.d +add z29.d, z29.d, z28.d +sqrdmulh z28.d, z23.d, z8.d[0] +str q20, [x0, #624] +mul z23.d, z23.d,z9.d[0] +ldr q20, [x0, #512] +mla z16.d, P0/M, z27.d, z31.d +sub z27.d, z20.d, z24.d +add z20.d, z20.d, z24.d +sqrdmulh z24.d, z29.d, z8.d[0] +str q17, [x0, #720] +mul z29.d, z29.d,z9.d[0] +ldr q17, [x0, #544] +mla z23.d, P0/M, z28.d, z31.d +sub z28.d, z17.d, z16.d +add z17.d, z17.d, z16.d +sqrdmulh z16.d, z26.d, z8.d[1] +str q22, [x0, #752] 
+mul z26.d, z26.d,z9.d[1] +mla z29.d, P0/M, z24.d, z31.d +sub z24.d, z17.d, z23.d +add z17.d, z17.d, z23.d +sqrdmulh z23.d, z18.d, z8.d[1] +str q19, [x0, #656] +mul z18.d, z18.d,z9.d[1] +mla z26.d, P0/M, z16.d, z31.d +sub z16.d, z20.d, z29.d +add z20.d, z20.d, z29.d +ldr q12, [x17, #+512] +sqrdmulh z29.d, z17.d, z6.d[0] +str q25, [x0, #688] +mul z17.d, z17.d,z7.d[0] +mla z18.d, P0/M, z23.d, z31.d +sub z23.d, z28.d, z26.d +add z28.d, z28.d, z26.d +ldr q13, [x17, #+528] +sqrdmulh z26.d, z24.d, z6.d[1] +mul z24.d, z24.d,z7.d[1] +mla z17.d, P0/M, z29.d, z31.d +sub z29.d, z27.d, z18.d +add z27.d, z27.d, z18.d +ldr q14, [x17, #+544] +ldr q15, [x17, #+560] +ldr q0, [x17, #+576] +ldr q1, [x17, #+592] +ldr q2, [x17, #+608] +ldr q3, [x17, #+624] +sqrdmulh z18.d, z23.d, z4.d[1] +mul z23.d, z23.d,z5.d[1] +ldr q25, [x0, #1008] +mla z24.d, P0/M, z26.d, z31.d +sub z26.d, z20.d, z17.d +add z20.d, z20.d, z17.d +sqrdmulh z17.d, z28.d, z4.d[0] +mul z28.d, z28.d,z5.d[0] +ldr q19, [x0, #976] +mla z23.d, P0/M, z18.d, z31.d +sub z18.d, z16.d, z24.d +add z16.d, z16.d, z24.d +sqrdmulh z24.d, z25.d, z13.d[0] +mul z25.d, z25.d,z12.d[0] +ldr q22, [x0, #912] +mla z28.d, P0/M, z17.d, z31.d +sub z17.d, z29.d, z23.d +add z29.d, z29.d, z23.d +sqrdmulh z23.d, z19.d, z13.d[0] +str q20, [x0, #512] +mul z19.d, z19.d,z12.d[0] +ldr q20, [x0, #944] +mla z25.d, P0/M, z24.d, z31.d +sub z24.d, z27.d, z28.d +add z27.d, z27.d, z28.d +sqrdmulh z28.d, z22.d, z13.d[0] +str q26, [x0, #544] +mul z22.d, z22.d,z12.d[0] +ldr q26, [x0, #880] +mla z19.d, P0/M, z23.d, z31.d +sub z23.d, z26.d, z25.d +add z26.d, z26.d, z25.d +sqrdmulh z25.d, z20.d, z13.d[0] +str q16, [x0, #576] +mul z20.d, z20.d,z12.d[0] +ldr q16, [x0, #848] +mla z22.d, P0/M, z28.d, z31.d +sub z28.d, z16.d, z19.d +add z16.d, z16.d, z19.d +sqrdmulh z19.d, z26.d, z15.d[0] +str q18, [x0, #608] +mul z26.d, z26.d,z14.d[0] +ldr q18, [x0, #784] +mla z20.d, P0/M, z25.d, z31.d +sub z25.d, z18.d, z22.d +add z18.d, z18.d, z22.d +sqrdmulh z22.d, z16.d, z15.d[0] +str 
q29, [x0, #704] +mul z16.d, z16.d,z14.d[0] +ldr q29, [x0, #816] +mla z26.d, P0/M, z19.d, z31.d +sub z19.d, z29.d, z20.d +add z29.d, z29.d, z20.d +sqrdmulh z20.d, z23.d, z15.d[1] +str q17, [x0, #736] +mul z23.d, z23.d,z14.d[1] +mla z16.d, P0/M, z22.d, z31.d +sub z22.d, z29.d, z26.d +add z29.d, z29.d, z26.d +sqrdmulh z26.d, z28.d, z15.d[1] +str q27, [x0, #640] +mul z28.d, z28.d,z14.d[1] +mla z23.d, P0/M, z20.d, z31.d +sub z20.d, z18.d, z16.d +add z18.d, z18.d, z16.d +sqrdmulh z16.d, z29.d, z1.d[0] +str q24, [x0, #672] +mul z29.d, z29.d,z0.d[0] +mla z28.d, P0/M, z26.d, z31.d +sub z26.d, z19.d, z23.d +add z19.d, z19.d, z23.d +sqrdmulh z23.d, z22.d, z1.d[1] +mul z22.d, z22.d,z0.d[1] +mla z29.d, P0/M, z16.d, z31.d +sub z16.d, z25.d, z28.d +add z25.d, z25.d, z28.d +sqrdmulh z28.d, z26.d, z3.d[1] +mul z26.d, z26.d,z2.d[1] +ldr q24, [x0, #992] +mla z22.d, P0/M, z23.d, z31.d +sub z23.d, z18.d, z29.d +add z18.d, z18.d, z29.d +sqrdmulh z29.d, z19.d, z3.d[0] +mul z19.d, z19.d,z2.d[0] +ldr q27, [x0, #960] +mla z26.d, P0/M, z28.d, z31.d +sub z28.d, z20.d, z22.d +add z20.d, z20.d, z22.d +sqrdmulh z22.d, z24.d, z13.d[0] +mul z24.d, z24.d,z12.d[0] +ldr q17, [x0, #896] +mla z19.d, P0/M, z29.d, z31.d +sub z29.d, z16.d, z26.d +add z16.d, z16.d, z26.d +sqrdmulh z26.d, z27.d, z13.d[0] +str q18, [x0, #784] +mul z27.d, z27.d,z12.d[0] +ldr q18, [x0, #928] +mla z24.d, P0/M, z22.d, z31.d +sub z22.d, z25.d, z19.d +add z25.d, z25.d, z19.d +sqrdmulh z19.d, z17.d, z13.d[0] +str q23, [x0, #816] +mul z17.d, z17.d,z12.d[0] +ldr q23, [x0, #864] +mla z27.d, P0/M, z26.d, z31.d +sub z26.d, z23.d, z24.d +add z23.d, z23.d, z24.d +sqrdmulh z24.d, z18.d, z13.d[0] +str q20, [x0, #848] +mul z18.d, z18.d,z12.d[0] +ldr q20, [x0, #832] +mla z17.d, P0/M, z19.d, z31.d +sub z19.d, z20.d, z27.d +add z20.d, z20.d, z27.d +sqrdmulh z27.d, z23.d, z15.d[0] +str q28, [x0, #880] +mul z23.d, z23.d,z14.d[0] +ldr q28, [x0, #768] +mla z18.d, P0/M, z24.d, z31.d +sub z24.d, z28.d, z17.d +add z28.d, z28.d, z17.d +sqrdmulh z17.d, 
z20.d, z15.d[0] +str q16, [x0, #976] +mul z20.d, z20.d,z14.d[0] +ldr q16, [x0, #800] +mla z23.d, P0/M, z27.d, z31.d +sub z27.d, z16.d, z18.d +add z16.d, z16.d, z18.d +sqrdmulh z18.d, z26.d, z15.d[1] +str q29, [x0, #1008] +mul z26.d, z26.d,z14.d[1] +mla z20.d, P0/M, z17.d, z31.d +sub z17.d, z16.d, z23.d +add z16.d, z16.d, z23.d +sqrdmulh z23.d, z19.d, z15.d[1] +str q25, [x0, #912] +mul z19.d, z19.d,z14.d[1] +mla z26.d, P0/M, z18.d, z31.d +sub z18.d, z28.d, z20.d +add z28.d, z28.d, z20.d +ldr q4, [x17, #+640] +sqrdmulh z20.d, z16.d, z1.d[0] +str q22, [x0, #944] +mul z16.d, z16.d,z0.d[0] +mla z19.d, P0/M, z23.d, z31.d +sub z23.d, z27.d, z26.d +add z27.d, z27.d, z26.d +ldr q5, [x17, #+656] +sqrdmulh z26.d, z17.d, z1.d[1] +mul z17.d, z17.d,z0.d[1] +mla z16.d, P0/M, z20.d, z31.d +sub z20.d, z24.d, z19.d +add z24.d, z24.d, z19.d +ldr q6, [x17, #+672] +ldr q7, [x17, #+688] +ldr q8, [x17, #+704] +ldr q9, [x17, #+720] +ldr q10, [x17, #+736] +ldr q11, [x17, #+752] +sqrdmulh z19.d, z23.d, z3.d[1] +mul z23.d, z23.d,z2.d[1] +ldr q22, [x0, #1264] +mla z17.d, P0/M, z26.d, z31.d +sub z26.d, z28.d, z16.d +add z28.d, z28.d, z16.d +sqrdmulh z16.d, z27.d, z3.d[0] +mul z27.d, z27.d,z2.d[0] +ldr q25, [x0, #1232] +mla z23.d, P0/M, z19.d, z31.d +sub z19.d, z18.d, z17.d +add z18.d, z18.d, z17.d +sqrdmulh z17.d, z22.d, z5.d[0] +mul z22.d, z22.d,z4.d[0] +ldr q29, [x0, #1168] +mla z27.d, P0/M, z16.d, z31.d +sub z16.d, z20.d, z23.d +add z20.d, z20.d, z23.d +sqrdmulh z23.d, z25.d, z5.d[0] +str q28, [x0, #768] +mul z25.d, z25.d,z4.d[0] +ldr q28, [x0, #1200] +mla z22.d, P0/M, z17.d, z31.d +sub z17.d, z24.d, z27.d +add z24.d, z24.d, z27.d +sqrdmulh z27.d, z29.d, z5.d[0] +str q26, [x0, #800] +mul z29.d, z29.d,z4.d[0] +ldr q26, [x0, #1136] +mla z25.d, P0/M, z23.d, z31.d +sub z23.d, z26.d, z22.d +add z26.d, z26.d, z22.d +sqrdmulh z22.d, z28.d, z5.d[0] +str q18, [x0, #832] +mul z28.d, z28.d,z4.d[0] +ldr q18, [x0, #1104] +mla z29.d, P0/M, z27.d, z31.d +sub z27.d, z18.d, z25.d +add z18.d, z18.d, z25.d 
+sqrdmulh z25.d, z26.d, z7.d[0] +str q19, [x0, #864] +mul z26.d, z26.d,z6.d[0] +ldr q19, [x0, #1040] +mla z28.d, P0/M, z22.d, z31.d +sub z22.d, z19.d, z29.d +add z19.d, z19.d, z29.d +sqrdmulh z29.d, z18.d, z7.d[0] +str q20, [x0, #960] +mul z18.d, z18.d,z6.d[0] +ldr q20, [x0, #1072] +mla z26.d, P0/M, z25.d, z31.d +sub z25.d, z20.d, z28.d +add z20.d, z20.d, z28.d +sqrdmulh z28.d, z23.d, z7.d[1] +str q16, [x0, #992] +mul z23.d, z23.d,z6.d[1] +mla z18.d, P0/M, z29.d, z31.d +sub z29.d, z20.d, z26.d +add z20.d, z20.d, z26.d +sqrdmulh z26.d, z27.d, z7.d[1] +str q24, [x0, #896] +mul z27.d, z27.d,z6.d[1] +mla z23.d, P0/M, z28.d, z31.d +sub z28.d, z19.d, z18.d +add z19.d, z19.d, z18.d +sqrdmulh z18.d, z20.d, z9.d[0] +str q17, [x0, #928] +mul z20.d, z20.d,z8.d[0] +mla z27.d, P0/M, z26.d, z31.d +sub z26.d, z25.d, z23.d +add z25.d, z25.d, z23.d +sqrdmulh z23.d, z29.d, z9.d[1] +mul z29.d, z29.d,z8.d[1] +mla z20.d, P0/M, z18.d, z31.d +sub z18.d, z22.d, z27.d +add z22.d, z22.d, z27.d +sqrdmulh z27.d, z26.d, z11.d[1] +mul z26.d, z26.d,z10.d[1] +ldr q17, [x0, #1248] +mla z29.d, P0/M, z23.d, z31.d +sub z23.d, z19.d, z20.d +add z19.d, z19.d, z20.d +sqrdmulh z20.d, z25.d, z11.d[0] +mul z25.d, z25.d,z10.d[0] +ldr q24, [x0, #1216] +mla z26.d, P0/M, z27.d, z31.d +sub z27.d, z28.d, z29.d +add z28.d, z28.d, z29.d +sqrdmulh z29.d, z17.d, z5.d[0] +mul z17.d, z17.d,z4.d[0] +ldr q16, [x0, #1152] +mla z25.d, P0/M, z20.d, z31.d +sub z20.d, z18.d, z26.d +add z18.d, z18.d, z26.d +sqrdmulh z26.d, z24.d, z5.d[0] +str q19, [x0, #1040] +mul z24.d, z24.d,z4.d[0] +ldr q19, [x0, #1184] +mla z17.d, P0/M, z29.d, z31.d +sub z29.d, z22.d, z25.d +add z22.d, z22.d, z25.d +sqrdmulh z25.d, z16.d, z5.d[0] +str q23, [x0, #1072] +mul z16.d, z16.d,z4.d[0] +ldr q23, [x0, #1120] +mla z24.d, P0/M, z26.d, z31.d +sub z26.d, z23.d, z17.d +add z23.d, z23.d, z17.d +sqrdmulh z17.d, z19.d, z5.d[0] +str q28, [x0, #1104] +mul z19.d, z19.d,z4.d[0] +ldr q28, [x0, #1088] +mla z16.d, P0/M, z25.d, z31.d +sub z25.d, z28.d, z24.d +add 
z28.d, z28.d, z24.d +sqrdmulh z24.d, z23.d, z7.d[0] +str q27, [x0, #1136] +mul z23.d, z23.d,z6.d[0] +ldr q27, [x0, #1024] +mla z19.d, P0/M, z17.d, z31.d +sub z17.d, z27.d, z16.d +add z27.d, z27.d, z16.d +sqrdmulh z16.d, z28.d, z7.d[0] +str q18, [x0, #1232] +mul z28.d, z28.d,z6.d[0] +ldr q18, [x0, #1056] +mla z23.d, P0/M, z24.d, z31.d +sub z24.d, z18.d, z19.d +add z18.d, z18.d, z19.d +sqrdmulh z19.d, z26.d, z7.d[1] +str q20, [x0, #1264] +mul z26.d, z26.d,z6.d[1] +mla z28.d, P0/M, z16.d, z31.d +sub z16.d, z18.d, z23.d +add z18.d, z18.d, z23.d +sqrdmulh z23.d, z25.d, z7.d[1] +str q22, [x0, #1168] +mul z25.d, z25.d,z6.d[1] +mla z26.d, P0/M, z19.d, z31.d +sub z19.d, z27.d, z28.d +add z27.d, z27.d, z28.d +ldr q3, [x17, #+768] +sqrdmulh z28.d, z18.d, z9.d[0] +str q29, [x0, #1200] +mul z18.d, z18.d,z8.d[0] +mla z25.d, P0/M, z23.d, z31.d +sub z23.d, z24.d, z26.d +add z24.d, z24.d, z26.d +ldr q2, [x17, #+784] +sqrdmulh z26.d, z16.d, z9.d[1] +mul z16.d, z16.d,z8.d[1] +mla z18.d, P0/M, z28.d, z31.d +sub z28.d, z17.d, z25.d +add z17.d, z17.d, z25.d +ldr q1, [x17, #+800] +ldr q0, [x17, #+816] +ldr q15, [x17, #+832] +ldr q14, [x17, #+848] +ldr q13, [x17, #+864] +ldr q12, [x17, #+880] +sqrdmulh z25.d, z23.d, z11.d[1] +mul z23.d, z23.d,z10.d[1] +ldr q29, [x0, #1520] +mla z16.d, P0/M, z26.d, z31.d +sub z26.d, z27.d, z18.d +add z27.d, z27.d, z18.d +sqrdmulh z18.d, z24.d, z11.d[0] +mul z24.d, z24.d,z10.d[0] +ldr q22, [x0, #1488] +mla z23.d, P0/M, z25.d, z31.d +sub z25.d, z19.d, z16.d +add z19.d, z19.d, z16.d +sqrdmulh z16.d, z29.d, z2.d[0] +mul z29.d, z29.d,z3.d[0] +ldr q20, [x0, #1424] +mla z24.d, P0/M, z18.d, z31.d +sub z18.d, z28.d, z23.d +add z28.d, z28.d, z23.d +sqrdmulh z23.d, z22.d, z2.d[0] +str q27, [x0, #1024] +mul z22.d, z22.d,z3.d[0] +ldr q27, [x0, #1456] +mla z29.d, P0/M, z16.d, z31.d +sub z16.d, z17.d, z24.d +add z17.d, z17.d, z24.d +sqrdmulh z24.d, z20.d, z2.d[0] +str q26, [x0, #1056] +mul z20.d, z20.d,z3.d[0] +ldr q26, [x0, #1392] +mla z22.d, P0/M, z23.d, z31.d +sub 
z23.d, z26.d, z29.d +add z26.d, z26.d, z29.d +sqrdmulh z29.d, z27.d, z2.d[0] +str q19, [x0, #1088] +mul z27.d, z27.d,z3.d[0] +ldr q19, [x0, #1360] +mla z20.d, P0/M, z24.d, z31.d +sub z24.d, z19.d, z22.d +add z19.d, z19.d, z22.d +sqrdmulh z22.d, z26.d, z0.d[0] +str q25, [x0, #1120] +mul z26.d, z26.d,z1.d[0] +ldr q25, [x0, #1296] +mla z27.d, P0/M, z29.d, z31.d +sub z29.d, z25.d, z20.d +add z25.d, z25.d, z20.d +sqrdmulh z20.d, z19.d, z0.d[0] +str q28, [x0, #1216] +mul z19.d, z19.d,z1.d[0] +ldr q28, [x0, #1328] +mla z26.d, P0/M, z22.d, z31.d +sub z22.d, z28.d, z27.d +add z28.d, z28.d, z27.d +sqrdmulh z27.d, z23.d, z0.d[1] +str q18, [x0, #1248] +mul z23.d, z23.d,z1.d[1] +mla z19.d, P0/M, z20.d, z31.d +sub z20.d, z28.d, z26.d +add z28.d, z28.d, z26.d +sqrdmulh z26.d, z24.d, z0.d[1] +str q17, [x0, #1152] +mul z24.d, z24.d,z1.d[1] +mla z23.d, P0/M, z27.d, z31.d +sub z27.d, z25.d, z19.d +add z25.d, z25.d, z19.d +sqrdmulh z19.d, z28.d, z14.d[0] +str q16, [x0, #1184] +mul z28.d, z28.d,z15.d[0] +mla z24.d, P0/M, z26.d, z31.d +sub z26.d, z22.d, z23.d +add z22.d, z22.d, z23.d +sqrdmulh z23.d, z20.d, z14.d[1] +mul z20.d, z20.d,z15.d[1] +mla z28.d, P0/M, z19.d, z31.d +sub z19.d, z29.d, z24.d +add z29.d, z29.d, z24.d +sqrdmulh z24.d, z26.d, z12.d[1] +mul z26.d, z26.d,z13.d[1] +ldr q16, [x0, #1504] +mla z20.d, P0/M, z23.d, z31.d +sub z23.d, z25.d, z28.d +add z25.d, z25.d, z28.d +sqrdmulh z28.d, z22.d, z12.d[0] +mul z22.d, z22.d,z13.d[0] +ldr q17, [x0, #1472] +mla z26.d, P0/M, z24.d, z31.d +sub z24.d, z27.d, z20.d +add z27.d, z27.d, z20.d +sqrdmulh z20.d, z16.d, z2.d[0] +mul z16.d, z16.d,z3.d[0] +ldr q18, [x0, #1408] +mla z22.d, P0/M, z28.d, z31.d +sub z28.d, z19.d, z26.d +add z19.d, z19.d, z26.d +sqrdmulh z26.d, z17.d, z2.d[0] +str q25, [x0, #1296] +mul z17.d, z17.d,z3.d[0] +ldr q25, [x0, #1440] +mla z16.d, P0/M, z20.d, z31.d +sub z20.d, z29.d, z22.d +add z29.d, z29.d, z22.d +sqrdmulh z22.d, z18.d, z2.d[0] +str q23, [x0, #1328] +mul z18.d, z18.d,z3.d[0] +ldr q23, [x0, #1376] +mla 
z17.d, P0/M, z26.d, z31.d +sub z26.d, z23.d, z16.d +add z23.d, z23.d, z16.d +sqrdmulh z16.d, z25.d, z2.d[0] +str q27, [x0, #1360] +mul z25.d, z25.d,z3.d[0] +ldr q27, [x0, #1344] +mla z18.d, P0/M, z22.d, z31.d +sub z22.d, z27.d, z17.d +add z27.d, z27.d, z17.d +sqrdmulh z17.d, z23.d, z0.d[0] +str q24, [x0, #1392] +mul z23.d, z23.d,z1.d[0] +ldr q24, [x0, #1280] +mla z25.d, P0/M, z16.d, z31.d +sub z16.d, z24.d, z18.d +add z24.d, z24.d, z18.d +sqrdmulh z18.d, z27.d, z0.d[0] +str q19, [x0, #1488] +mul z27.d, z27.d,z1.d[0] +ldr q19, [x0, #1312] +mla z23.d, P0/M, z17.d, z31.d +sub z17.d, z19.d, z25.d +add z19.d, z19.d, z25.d +sqrdmulh z25.d, z26.d, z0.d[1] +str q28, [x0, #1520] +mul z26.d, z26.d,z1.d[1] +mla z27.d, P0/M, z18.d, z31.d +sub z18.d, z19.d, z23.d +add z19.d, z19.d, z23.d +sqrdmulh z23.d, z22.d, z0.d[1] +str q29, [x0, #1424] +mul z22.d, z22.d,z1.d[1] +mla z26.d, P0/M, z25.d, z31.d +sub z25.d, z24.d, z27.d +add z24.d, z24.d, z27.d +ldr q11, [x17, #+896] +sqrdmulh z27.d, z19.d, z14.d[0] +str q20, [x0, #1456] +mul z19.d, z19.d,z15.d[0] +mla z22.d, P0/M, z23.d, z31.d +sub z23.d, z17.d, z26.d +add z17.d, z17.d, z26.d +ldr q10, [x17, #+912] +sqrdmulh z26.d, z18.d, z14.d[1] +mul z18.d, z18.d,z15.d[1] +mla z19.d, P0/M, z27.d, z31.d +sub z27.d, z16.d, z22.d +add z16.d, z16.d, z22.d +ldr q9, [x17, #+928] +ldr q8, [x17, #+944] +ldr q7, [x17, #+960] +ldr q6, [x17, #+976] +ldr q5, [x17, #+992] +ldr q4, [x17, #+1008] +sqrdmulh z22.d, z23.d, z12.d[1] +mul z23.d, z23.d,z13.d[1] +ldr q20, [x0, #1776] +mla z18.d, P0/M, z26.d, z31.d +sub z26.d, z24.d, z19.d +add z24.d, z24.d, z19.d +sqrdmulh z19.d, z17.d, z12.d[0] +mul z17.d, z17.d,z13.d[0] +ldr q29, [x0, #1744] +mla z23.d, P0/M, z22.d, z31.d +sub z22.d, z25.d, z18.d +add z25.d, z25.d, z18.d +sqrdmulh z18.d, z20.d, z10.d[0] +mul z20.d, z20.d,z11.d[0] +ldr q28, [x0, #1680] +mla z17.d, P0/M, z19.d, z31.d +sub z19.d, z27.d, z23.d +add z27.d, z27.d, z23.d +sqrdmulh z23.d, z29.d, z10.d[0] +str q24, [x0, #1280] +mul z29.d, 
z29.d,z11.d[0] +ldr q24, [x0, #1712] +mla z20.d, P0/M, z18.d, z31.d +sub z18.d, z16.d, z17.d +add z16.d, z16.d, z17.d +sqrdmulh z17.d, z28.d, z10.d[0] +str q26, [x0, #1312] +mul z28.d, z28.d,z11.d[0] +ldr q26, [x0, #1648] +mla z29.d, P0/M, z23.d, z31.d +sub z23.d, z26.d, z20.d +add z26.d, z26.d, z20.d +sqrdmulh z20.d, z24.d, z10.d[0] +str q25, [x0, #1344] +mul z24.d, z24.d,z11.d[0] +ldr q25, [x0, #1616] +mla z28.d, P0/M, z17.d, z31.d +sub z17.d, z25.d, z29.d +add z25.d, z25.d, z29.d +sqrdmulh z29.d, z26.d, z8.d[0] +str q22, [x0, #1376] +mul z26.d, z26.d,z9.d[0] +ldr q22, [x0, #1552] +mla z24.d, P0/M, z20.d, z31.d +sub z20.d, z22.d, z28.d +add z22.d, z22.d, z28.d +sqrdmulh z28.d, z25.d, z8.d[0] +str q27, [x0, #1472] +mul z25.d, z25.d,z9.d[0] +ldr q27, [x0, #1584] +mla z26.d, P0/M, z29.d, z31.d +sub z29.d, z27.d, z24.d +add z27.d, z27.d, z24.d +sqrdmulh z24.d, z23.d, z8.d[1] +str q19, [x0, #1504] +mul z23.d, z23.d,z9.d[1] +mla z25.d, P0/M, z28.d, z31.d +sub z28.d, z27.d, z26.d +add z27.d, z27.d, z26.d +sqrdmulh z26.d, z17.d, z8.d[1] +str q16, [x0, #1408] +mul z17.d, z17.d,z9.d[1] +mla z23.d, P0/M, z24.d, z31.d +sub z24.d, z22.d, z25.d +add z22.d, z22.d, z25.d +sqrdmulh z25.d, z27.d, z6.d[0] +str q18, [x0, #1440] +mul z27.d, z27.d,z7.d[0] +mla z17.d, P0/M, z26.d, z31.d +sub z26.d, z29.d, z23.d +add z29.d, z29.d, z23.d +sqrdmulh z23.d, z28.d, z6.d[1] +mul z28.d, z28.d,z7.d[1] +mla z27.d, P0/M, z25.d, z31.d +sub z25.d, z20.d, z17.d +add z20.d, z20.d, z17.d +sqrdmulh z17.d, z26.d, z4.d[1] +mul z26.d, z26.d,z5.d[1] +ldr q18, [x0, #1760] +mla z28.d, P0/M, z23.d, z31.d +sub z23.d, z22.d, z27.d +add z22.d, z22.d, z27.d +sqrdmulh z27.d, z29.d, z4.d[0] +mul z29.d, z29.d,z5.d[0] +ldr q16, [x0, #1728] +mla z26.d, P0/M, z17.d, z31.d +sub z17.d, z24.d, z28.d +add z24.d, z24.d, z28.d +sqrdmulh z28.d, z18.d, z10.d[0] +mul z18.d, z18.d,z11.d[0] +ldr q19, [x0, #1664] +mla z29.d, P0/M, z27.d, z31.d +sub z27.d, z25.d, z26.d +add z25.d, z25.d, z26.d +sqrdmulh z26.d, z16.d, z10.d[0] +str 
q22, [x0, #1552] +mul z16.d, z16.d,z11.d[0] +ldr q22, [x0, #1696] +mla z18.d, P0/M, z28.d, z31.d +sub z28.d, z20.d, z29.d +add z20.d, z20.d, z29.d +sqrdmulh z29.d, z19.d, z10.d[0] +str q23, [x0, #1584] +mul z19.d, z19.d,z11.d[0] +ldr q23, [x0, #1632] +mla z16.d, P0/M, z26.d, z31.d +sub z26.d, z23.d, z18.d +add z23.d, z23.d, z18.d +sqrdmulh z18.d, z22.d, z10.d[0] +str q24, [x0, #1616] +mul z22.d, z22.d,z11.d[0] +ldr q24, [x0, #1600] +mla z19.d, P0/M, z29.d, z31.d +sub z29.d, z24.d, z16.d +add z24.d, z24.d, z16.d +sqrdmulh z16.d, z23.d, z8.d[0] +str q17, [x0, #1648] +mul z23.d, z23.d,z9.d[0] +ldr q17, [x0, #1536] +mla z22.d, P0/M, z18.d, z31.d +sub z18.d, z17.d, z19.d +add z17.d, z17.d, z19.d +sqrdmulh z19.d, z24.d, z8.d[0] +str q25, [x0, #1744] +mul z24.d, z24.d,z9.d[0] +ldr q25, [x0, #1568] +mla z23.d, P0/M, z16.d, z31.d +sub z16.d, z25.d, z22.d +add z25.d, z25.d, z22.d +sqrdmulh z22.d, z26.d, z8.d[1] +str q27, [x0, #1776] +mul z26.d, z26.d,z9.d[1] +mla z24.d, P0/M, z19.d, z31.d +sub z19.d, z25.d, z23.d +add z25.d, z25.d, z23.d +sqrdmulh z23.d, z29.d, z8.d[1] +str q20, [x0, #1680] +mul z29.d, z29.d,z9.d[1] +mla z26.d, P0/M, z22.d, z31.d +sub z22.d, z17.d, z24.d +add z17.d, z17.d, z24.d +ldr q12, [x17, #+1024] +sqrdmulh z24.d, z25.d, z6.d[0] +str q28, [x0, #1712] +mul z25.d, z25.d,z7.d[0] +mla z29.d, P0/M, z23.d, z31.d +sub z23.d, z16.d, z26.d +add z16.d, z16.d, z26.d +ldr q13, [x17, #+1040] +sqrdmulh z26.d, z19.d, z6.d[1] +mul z19.d, z19.d,z7.d[1] +mla z25.d, P0/M, z24.d, z31.d +sub z24.d, z18.d, z29.d +add z18.d, z18.d, z29.d +ldr q14, [x17, #+1056] +ldr q15, [x17, #+1072] +ldr q0, [x17, #+1088] +ldr q1, [x17, #+1104] +ldr q2, [x17, #+1120] +ldr q3, [x17, #+1136] +sqrdmulh z29.d, z23.d, z4.d[1] +mul z23.d, z23.d,z5.d[1] +ldr q28, [x0, #2032] +mla z19.d, P0/M, z26.d, z31.d +sub z26.d, z17.d, z25.d +add z17.d, z17.d, z25.d +sqrdmulh z25.d, z16.d, z4.d[0] +mul z16.d, z16.d,z5.d[0] +ldr q20, [x0, #2000] +mla z23.d, P0/M, z29.d, z31.d +sub z29.d, z22.d, z19.d +add 
z22.d, z22.d, z19.d +sqrdmulh z19.d, z28.d, z13.d[0] +mul z28.d, z28.d,z12.d[0] +ldr q27, [x0, #1936] +mla z16.d, P0/M, z25.d, z31.d +sub z25.d, z24.d, z23.d +add z24.d, z24.d, z23.d +sqrdmulh z23.d, z20.d, z13.d[0] +str q17, [x0, #1536] +mul z20.d, z20.d,z12.d[0] +ldr q17, [x0, #1968] +mla z28.d, P0/M, z19.d, z31.d +sub z19.d, z18.d, z16.d +add z18.d, z18.d, z16.d +sqrdmulh z16.d, z27.d, z13.d[0] +str q26, [x0, #1568] +mul z27.d, z27.d,z12.d[0] +ldr q26, [x0, #1904] +mla z20.d, P0/M, z23.d, z31.d +sub z23.d, z26.d, z28.d +add z26.d, z26.d, z28.d +sqrdmulh z28.d, z17.d, z13.d[0] +str q22, [x0, #1600] +mul z17.d, z17.d,z12.d[0] +ldr q22, [x0, #1872] +mla z27.d, P0/M, z16.d, z31.d +sub z16.d, z22.d, z20.d +add z22.d, z22.d, z20.d +sqrdmulh z20.d, z26.d, z15.d[0] +str q29, [x0, #1632] +mul z26.d, z26.d,z14.d[0] +ldr q29, [x0, #1808] +mla z17.d, P0/M, z28.d, z31.d +sub z28.d, z29.d, z27.d +add z29.d, z29.d, z27.d +sqrdmulh z27.d, z22.d, z15.d[0] +str q24, [x0, #1728] +mul z22.d, z22.d,z14.d[0] +ldr q24, [x0, #1840] +mla z26.d, P0/M, z20.d, z31.d +sub z20.d, z24.d, z17.d +add z24.d, z24.d, z17.d +sqrdmulh z17.d, z23.d, z15.d[1] +str q25, [x0, #1760] +mul z23.d, z23.d,z14.d[1] +mla z22.d, P0/M, z27.d, z31.d +sub z27.d, z24.d, z26.d +add z24.d, z24.d, z26.d +sqrdmulh z26.d, z16.d, z15.d[1] +str q18, [x0, #1664] +mul z16.d, z16.d,z14.d[1] +mla z23.d, P0/M, z17.d, z31.d +sub z17.d, z29.d, z22.d +add z29.d, z29.d, z22.d +sqrdmulh z22.d, z24.d, z1.d[0] +str q19, [x0, #1696] +mul z24.d, z24.d,z0.d[0] +mla z16.d, P0/M, z26.d, z31.d +sub z26.d, z20.d, z23.d +add z20.d, z20.d, z23.d +sqrdmulh z23.d, z27.d, z1.d[1] +mul z27.d, z27.d,z0.d[1] +mla z24.d, P0/M, z22.d, z31.d +sub z22.d, z28.d, z16.d +add z28.d, z28.d, z16.d +sqrdmulh z16.d, z26.d, z3.d[1] +mul z26.d, z26.d,z2.d[1] +ldr q19, [x0, #2016] +mla z27.d, P0/M, z23.d, z31.d +sub z23.d, z29.d, z24.d +add z29.d, z29.d, z24.d +sqrdmulh z24.d, z20.d, z3.d[0] +mul z20.d, z20.d,z2.d[0] +ldr q18, [x0, #1984] +mla z26.d, P0/M, z16.d, 
z31.d +sub z16.d, z17.d, z27.d +add z17.d, z17.d, z27.d +sqrdmulh z27.d, z19.d, z13.d[0] +mul z19.d, z19.d,z12.d[0] +ldr q25, [x0, #1920] +mla z20.d, P0/M, z24.d, z31.d +sub z24.d, z22.d, z26.d +add z22.d, z22.d, z26.d +sqrdmulh z26.d, z18.d, z13.d[0] +str q29, [x0, #1808] +mul z18.d, z18.d,z12.d[0] +ldr q29, [x0, #1952] +mla z19.d, P0/M, z27.d, z31.d +sub z27.d, z28.d, z20.d +add z28.d, z28.d, z20.d +sqrdmulh z20.d, z25.d, z13.d[0] +str q23, [x0, #1840] +mul z25.d, z25.d,z12.d[0] +ldr q23, [x0, #1888] +mla z18.d, P0/M, z26.d, z31.d +sub z26.d, z23.d, z19.d +add z23.d, z23.d, z19.d +sqrdmulh z19.d, z29.d, z13.d[0] +str q17, [x0, #1872] +mul z29.d, z29.d,z12.d[0] +ldr q17, [x0, #1856] +mla z25.d, P0/M, z20.d, z31.d +sub z20.d, z17.d, z18.d +add z17.d, z17.d, z18.d +sqrdmulh z18.d, z23.d, z15.d[0] +str q16, [x0, #1904] +mul z23.d, z23.d,z14.d[0] +ldr q16, [x0, #1792] +mla z29.d, P0/M, z19.d, z31.d +sub z19.d, z16.d, z25.d +add z16.d, z16.d, z25.d +sqrdmulh z25.d, z17.d, z15.d[0] +str q22, [x0, #2000] +mul z17.d, z17.d,z14.d[0] +ldr q22, [x0, #1824] +mla z23.d, P0/M, z18.d, z31.d +sub z18.d, z22.d, z29.d +add z22.d, z22.d, z29.d +sqrdmulh z29.d, z26.d, z15.d[1] +str q24, [x0, #2032] +mul z26.d, z26.d,z14.d[1] +mla z17.d, P0/M, z25.d, z31.d +sub z25.d, z22.d, z23.d +add z22.d, z22.d, z23.d +sqrdmulh z23.d, z20.d, z15.d[1] +str q28, [x0, #1936] +mul z20.d, z20.d,z14.d[1] +mla z26.d, P0/M, z29.d, z31.d +sub z29.d, z16.d, z17.d +add z16.d, z16.d, z17.d +sqrdmulh z17.d, z22.d, z1.d[0] +str q27, [x0, #1968] +mul z22.d, z22.d,z0.d[0] +mla z20.d, P0/M, z23.d, z31.d +sub z23.d, z18.d, z26.d +add z18.d, z18.d, z26.d +sqrdmulh z26.d, z25.d, z1.d[1] +mul z25.d, z25.d,z0.d[1] +mla z22.d, P0/M, z17.d, z31.d +sub z17.d, z19.d, z20.d +add z19.d, z19.d, z20.d +sqrdmulh z20.d, z23.d, z3.d[1] +mul z23.d, z23.d,z2.d[1] +mla z25.d, P0/M, z26.d, z31.d +sub z26.d, z16.d, z22.d +add z16.d, z16.d, z22.d +sqrdmulh z22.d, z18.d, z3.d[0] +mul z18.d, z18.d,z2.d[0] +mla z23.d, P0/M, z20.d, z31.d 
+sub z20.d, z29.d, z25.d +add z29.d, z29.d, z25.d +mla z18.d, P0/M, z22.d, z31.d +sub z22.d, z17.d, z23.d +add z17.d, z17.d, z23.d +str q16, [x0, #1792] +sub z16.d, z19.d, z18.d +add z19.d, z19.d, z18.d +str q26, [x0, #1824] +str q29, [x0, #1856] +str q20, [x0, #1888] +str q17, [x0, #1984] +str q22, [x0, #2016] +str q19, [x0, #1920] +str q16, [x0, #1952] +// Restore SVE2 vector registers +ldp d8, d9, [sp, #16*0] +ldp d10, d11, [sp, #16*1] +ldp d12, d13, [sp, #16*2] +ldp d14, d15, [sp, #16*3] +add sp, sp, #(16*4) +// Restore GPRs +ldp x19, x20, [sp, #16*0] +ldp x21, x22, [sp, #16*1] +ldp x23, x24, [sp, #16*2] +ldp x25, x26, [sp, #16*3] +ldp x27, x28, [sp, #16*4] +ldr x29, [sp, #16*5] +add sp, sp, #(16*5+16) +ret + +// Line count: 2697 +// Instruction count: 2693 \ No newline at end of file diff --git a/tests/ntt_sve2/main.c b/tests/ntt_sve2/main.c new file mode 100755 index 0000000..3edc501 --- /dev/null +++ b/tests/ntt_sve2/main.c @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2021 Arm Limited + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include +#include + +#include +#include "ntt.h" + +int main(void) +{ + enable_cyclecounter(); + + test_basemul_u64(); + +#if defined(NTT_INCOMPLETE) + test_fwd_ntt_incomplete_var_3_3_0(); + test_fwd_ntt_incomplete_var_3_3_1(); + test_fwd_ntt_incomplete_var_3_3_2(); +#else + /* Nothing yet */ +#endif /* NTT_INCOMPLETE */ + disable_cyclecounter(); +} diff --git a/tests/ntt_sve2/manual/basemul_64_72057594067788289.s b/tests/ntt_sve2/manual/basemul_64_72057594067788289.s new file mode 100644 index 0000000..d82d6d2 --- /dev/null +++ b/tests/ntt_sve2/manual/basemul_64_72057594067788289.s @@ -0,0 +1,105 @@ + +.macro save_regs + sub sp, sp, #(16*6) + stp x19, x20, [sp, #16*0] + stp x19, x20, [sp, #16*0] + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp x27, x28, [sp, #16*4] + stp x29, x30, [sp, #16*5] + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_regs + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) + ldp x19, x20, [sp, #16*0] + ldp x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldp x29, x30, [sp, #16*5] + add sp, sp, #(16*5+16) +.endm + +.data +modulus: + .dword 72057594067788289 + .dword 249802778572774913 + + .text + .type basemul_u64, %function + .global basemul_u64 + +modulus_addr: + .dword modulus +basemul_u64: + dst .req x0 + src_a .req x1 + src_b .req x2 + count .req x3 + + addr .req x4 + + in_a0 .req z0 + in_a1 .req z1 + in_b0 .req z2 + in_b1 .req z3 + dst_0 .req z4 + 
dst_1 .req z5 + + in_a0q .req q0 + in_a1q .req q1 + in_b0q .req q2 + in_b1q .req q3 + dst_0q .req q4 + dst_1q .req q5 + + modulus .req z6 + twist .req z7 + + tmp .req z8 + + save_regs + + ptrue P0.d + + ldr addr, modulus_addr + ld1rd {modulus.d}, P0/z, [addr, #0] + ld1rd {twist.d}, P0/z, [addr, #8] + + // # of elements must be divisible by 4 + mov count, count, LSR #2 + cmp count, #0 + b.eq 2f +1: + ldp in_a0q, in_a1q, [src_a], #32 + ldp in_b0q, in_b1q, [src_b], #32 + + sqdmulh dst_0.d, in_a0.d, in_b0.d + mul tmp.d, in_a0.d, in_b0.d + mul tmp.d, tmp.d, twist.d + sqdmulh tmp.d, tmp.d, modulus.d + shsub dst_0.d, P0/M, dst_0.d, tmp.d + + sqdmulh dst_1.d, in_a1.d, in_b1.d + mul tmp.d, in_a1.d, in_b1.d + mul tmp.d, tmp.d, twist.d + sqdmulh tmp.d, tmp.d, modulus.d + shsub dst_1.d, P0/M, dst_1.d, tmp.d + + stp dst_0q, dst_1q, [dst], #32 + + subs count, count, 1 + bne 1b +2: + restore_regs + ret diff --git a/tests/ntt_sve2/misc.c b/tests/ntt_sve2/misc.c new file mode 100755 index 0000000..bb7f87d --- /dev/null +++ b/tests/ntt_sve2/misc.c @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2021 Arm Limited + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include "misc.h" +#include + +#include +#include + +#define GEN_FILL_RANDOM( bits ) \ +void fill_random_u ## bits ( uint(bits) *buf, unsigned int len ) \ +{ \ + unsigned byte_len = len * sizeof(*buf); \ + uint8_t *byte_buf = (uint8_t*) buf; \ + for( ; byte_len; byte_buf++, byte_len-- ) \ + { \ + uint8_t cur_byte; \ + cur_byte = get_random_byte(); \ + *byte_buf = cur_byte; \ + } \ +} +GEN_FILL_RANDOM(8) +GEN_FILL_RANDOM(16) +GEN_FILL_RANDOM(32) +GEN_FILL_RANDOM(64) +#undef GEN_FILL_RANDOM + +#define GEN_COPY( bits ) \ +void copy_buf_u ## bits ( uint(bits) *dst, \ + uint(bits) const *src, unsigned int len ) \ +{ \ + for( ; len; dst++, src++, len-- ) \ + *dst = *src; \ +} +GEN_COPY(8) +GEN_COPY(16) +GEN_COPY(32) +GEN_COPY(64) +#undef GEN_COPY + +#define GEN_COMPARE_BUF( bits ) \ +int compare_buf_u ## bits ( uint(bits) const *src_a, \ + uint(bits) const *src_b, \ + unsigned len ) \ +{ \ + uint(bits) res = 0; \ + for( ; len; src_a++, src_b++, len-- ) \ + res |= ( (*src_a) ^ (*src_b) ); \ + return( res ); \ +} +GEN_COMPARE_BUF(8) +GEN_COMPARE_BUF(16) +GEN_COMPARE_BUF(32) +GEN_COMPARE_BUF(64) +#undef GEN_COMPARE_BUF + +#define GEN_PRINT_BUF( bits ) \ +void debug_print_buf_u ## bits ( uint(bits) const *buf, \ + unsigned entries, \ + const char *prefix ) \ +{ \ + unsigned idx; \ + for( idx = 0; idx < entries; idx += 8 ) \ + { \ + debug_printf( "%s [%u-%u]: %u %llu %llu %llu %llu %llu %llu %llu\n", \ + prefix, idx, idx+8, \ + buf[idx+0], buf[idx+1], buf[idx+2], buf[idx+3], \ + buf[idx+4], buf[idx+5], buf[idx+6], buf[idx+7] ); \ + } \ +} +GEN_PRINT_BUF(8) +GEN_PRINT_BUF(16) +GEN_PRINT_BUF(32) +GEN_PRINT_BUF(64) +#undef GEN_PRINT_BUF + +#define GEN_PRINT_BUF_S( bits ) \ +void debug_print_buf_s 
## bits ( sint(bits) const *buf, \ + unsigned entries, \ + const char *prefix ) \ +{ \ + unsigned idx; \ + /* Eight values per row; each value is cast to long long so that it \ + * matches the %lld conversions for every element width. */ \ + for( idx = 0; idx < entries; idx += 8 ) \ + { \ + debug_printf( "%s [%u-%u]: %lld %lld %lld %lld %lld %lld %lld %lld\n", \ + prefix, idx, idx+8, \ + (long long) buf[idx+0], (long long) buf[idx+1], \ + (long long) buf[idx+2], (long long) buf[idx+3], \ + (long long) buf[idx+4], (long long) buf[idx+5], \ + (long long) buf[idx+6], (long long) buf[idx+7] ); \ + } \ +} +GEN_PRINT_BUF_S(8) +GEN_PRINT_BUF_S(16) +GEN_PRINT_BUF_S(32) +GEN_PRINT_BUF_S(64) +#undef GEN_PRINT_BUF_S + +/* Helper to transpose buffers in case this is needed for input preparation. */ +#define GEN_BUFFER_TRANSPOSE(bitwidth) \ +void CONCAT3(buffer_transpose_, u, bitwidth) \ + ( uint(bitwidth) *dst, uint(bitwidth) const *src, \ + unsigned block_length, unsigned dim_x, unsigned dim_y ) \ +{ \ + unsigned i,j,k,idx_load,idx_store; \ + \ + for( i=0; i +#include +#include +#include + +#define CONCAT2_(A,B) A ## B +#define CONCAT2(A,B) CONCAT2_(A,B) + +#define CONCAT3_(A,B,C) A ## B ## C +#define CONCAT3(A,B,C) CONCAT3_(A,B,C) + +#define CONCAT4_(A,B,C,D) A ## B ## C ## D +#define CONCAT4(A,B,C,D) CONCAT4_(A,B,C, +#include +#include +#include + +#include + +#include "ntt.h" +#include "misc.h" + +#define print_buf CONCAT2(debug_print_buf_u, BITWIDTH) +#define print_buf_s CONCAT2(debug_print_buf_s, BITWIDTH) +#define compare_buf CONCAT2(compare_buf_u, BITWIDTH) +#define fill_random CONCAT2(fill_random_u, BITWIDTH) + +//#define CONFIG_TEST_NTT_VERBOSE + +const ssgl_t mod = MODULUS; +const usgl_t mod_inv = MODULUS_INV; +const ssgl_t mod_root = MODULUS_BASE_ROOT; + +ALIGN(16) ssgl_t root_base[2*NTT_SIZE] = { 0 }; +ALIGN(16) usgl_t root_base_twisted[2*NTT_SIZE] = { 0 }; +ALIGN(16) ssgl_t inv_root_base[2*NTT_SIZE] = { 0 }; +ALIGN(16) usgl_t inv_root_base_twisted[2*NTT_SIZE] = { 0 }; + +ssgl_t roots [NTT_SIZE] = { 0 }; +usgl_t roots_twisted[NTT_SIZE] = { 0 }; + +void mul_q( ssgl_t *src, sdbl_t c, size_t size ) +{ + for( unsigned idx = 0; idx < size; idx++ ) + { + sdbl_t tmp; + src[idx] = src[idx] % mod; + if( 
src[idx] < 0 ) + src[idx] += mod; + tmp = (sdbl_t) src[idx] * c; + src[idx] = tmp % mod; + if( src[idx] < 0 ) + src[idx] += mod; + } +} + +/* Reduce every element of src into the range [0, mod). */ +void reduce_q( ssgl_t *src, size_t size ) +{ + for( unsigned idx = 0; idx < size; idx++ ) + { + src[idx] = src[idx] % mod; + if( src[idx] < 0 ) + src[idx] += mod; + } +} + +/* Pointwise doubling Montgomery multiplication of src_a and src_b into dst. + * The double-width product is split at BITWIDTH bits (the width of ssgl_t) + * rather than at a fixed 32 bits, so the 64-bit configuration takes the + * correct high half; for BITWIDTH == 32 the result is unchanged. */ +void montgomery_pt_C( ssgl_t const *src_a, + ssgl_t const *src_b, + ssgl_t *dst, + size_t size ) +{ + unsigned idx; + for( idx = 0; idx < size; idx++ ) + { + sdbl_t v; + ssgl_t hi; + usgl_t lo, tmp, hi_fix; + + v = 2 * (sdbl_t) src_a[idx] * (sdbl_t) src_b[idx]; + + /* Hi+lo part extraction */ + hi = (ssgl_t)( v >> BITWIDTH ); + lo = (usgl_t)( v >> 0 ); + + /* Fixed scalar multiply, lo */ + tmp = lo * mod_inv; + /* Fixed scalar multiply, hi */ + hi_fix = ( (udbl_t) tmp * (udbl_t) mod ) >> BITWIDTH; + + dst[idx] = (ssgl_t)( (sdbl_t) hi - (sdbl_t) hi_fix ); + } +} + +/* Reduce every element of src into the range [0, mod). */ +void buf_reduce( ssgl_t *src, size_t size ) +{ + for( unsigned i=0; i < size; i++ ) + { + src[i] = src[i] % mod; + if( src[i] < 0 ) + src[i] += mod; + } +} + +/* (a * b) % modulus, computed in double width; the result keeps the sign of the C remainder. */ +ssgl_t mod_mul( ssgl_t a, ssgl_t b, ssgl_t modulus ) +{ + sdbl_t tmp = (sdbl_t) a * (sdbl_t) b; + ssgl_t res = tmp % modulus; + return( res); +} + +/* (a + b) % modulus, computed in double width. */ +ssgl_t mod_add( ssgl_t a, ssgl_t b, ssgl_t modulus ) +{ + sdbl_t tmp = (sdbl_t) a + (sdbl_t) b; + ssgl_t res = tmp % modulus; + return( res); +} + +/* (a - b) % modulus, computed in double width. */ +ssgl_t mod_sub( ssgl_t a, ssgl_t b, ssgl_t modulus ) +{ + sdbl_t tmp = (sdbl_t) a - (sdbl_t) b; + ssgl_t res = tmp % modulus; + return( res); +} + +/* base^exp modulo modulus by square-and-multiply. */ +ssgl_t mod_pow( ssgl_t base, unsigned exp, ssgl_t modulus ) +{ + ssgl_t base_pow = base; + ssgl_t tmp = 1; + while( exp != 0 ) + { + if( exp & 1 ) + tmp = mod_mul( tmp, base_pow, modulus ); + + base_pow = mod_mul( base_pow, base_pow, modulus ); + exp >>= 1; + } + + return( tmp ); +} + +/* Reverse the lowest width bits of val. */ +int bit_reverse( unsigned val, int width ) +{ + unsigned result = 0; + while( width-- ) + { + result = ( result << 1 ) + ( val & 1 ); + val >>= 1; + } + return( result ); +} + +void build_roots() +{ + for( unsigned i=0; i < NTT_SIZE; i++ 
) + { + roots[i] = mod_pow( mod_root, i, mod ); + roots_twisted[i] = roots[i] * mod_inv; + +#if defined(CONFIG_TEST_NTT_VERBOSE) + debug_printf( "zeta^%u = %u^%u = %u\n", + i, (unsigned) mod_root, i, + roots[i] ); + + debug_printf( "zeta^%u * %u = %u^%u * %u = %u\n", + i, mod_inv, + (unsigned) mod_root, i, mod_inv, + roots_twisted[i] ); +#endif /* CONFIG_TEST_NTT_VERBOSE */ + } +} + +void ntt_C( ssgl_t *src ) +{ + ssgl_t res[NTT_SIZE]; + build_roots(); + + for( unsigned t=0; t= NTT_SIZE ); + exp = exp % NTT_SIZE; + +#if defined(CONFIG_TEST_NTT_VERBOSE) + if( t == 0 ) + { + debug_printf( "res[%u] += root[%u] * src[%u] = %u * %u\n", + NTT_LAYER_STRIDE*i+t, + exp, + NTT_LAYER_STRIDE*j+t, + roots[exp], + src[NTT_LAYER_STRIDE*j+t]); + } +#endif /* CONFIG_TEST_NTT_VERBOSE */ + + cur = mod_mul( src[NTT_LAYER_STRIDE*j+t], + roots[exp], + mod ); + + if( !sub ) + tmp = mod_add( tmp, cur, mod ); + else + tmp = mod_sub( tmp, cur, mod ); + } + res[NTT_LAYER_STRIDE*i+t] = tmp; + } + } + + memcpy( src, res, sizeof( res ) ); +} + +udbl_t t0, t1; +udbl_t cycles[NTT_TEST_COUNT]; + +/* qsort comparator for udbl_t cycle counts (ascending). + * Compares the values directly instead of truncating their unsigned + * difference to int, which could yield the wrong sign or a spurious 0. */ +static int cmp_udbl_t(const void *a, const void *b) +{ + udbl_t const lhs = *((const udbl_t *)a); + udbl_t const rhs = *((const udbl_t *)b); + return (lhs > rhs) - (lhs < rhs); +} + +#if BITWIDTH == 32 +#define NTT_SVE2_INCOMPLETE(VAR) ntt_u32_incomplete_sve2_asm_var_ ## VAR +#else +#define NTT_SVE2_INCOMPLETE(VAR) ntt_u64_incomplete_sve2_asm_var_ ## VAR +#endif +#define NTT_SVE2_FULL(VAR) CONCAT4(ntt_u,BITWIDTH,_full_sve2_asm_var_,VAR) + +#if defined(NTT_CHECK_FUNCTIONAL_CORRECTNESS) + +/* Transpose each consecutive group of 16 elements, viewed as a 4x4 matrix, in place. + * The inner loops are bounded by their own indices r0/r1; the previous + * conditions tested the global cycle counters t0/t1 instead. */ +void buf_bitrev_4( ssgl_t *src, size_t size ) +{ + for( unsigned i=0; i < size; i += 16 ) + { + ssgl_t tmp[16]; + for( unsigned t=0; t < 16; t++ ) + tmp[t] = src[i+t]; + + for( unsigned r0=0; r0 < 4; r0++ ) + for( unsigned r1=0; r1 < 4; r1++ ) + src[i+r0*4 + r1] = tmp[r1*4+r0]; + } +} + +#define GEN_TEST_NTT_INCOMPLETE(variant) \ +int test_fwd_ntt_incomplete_var_ ## variant () \ +{ \ + debug_test_start( "NTT: deg 256, 32-bit, forward, 6-layer incomplete" ); \ + debug_printf( 
"Variant: %s\n", #variant ); \ + \ + ALIGN(NTT_BUFFER_ALIGN) \ + ssgl_t src[NTT_SIZE]; \ + ALIGN(NTT_BUFFER_ALIGN) \ + ssgl_t src_copy[NTT_SIZE]; \ + ALIGN(NTT_BUFFER_ALIGN) \ + ssgl_t dummy_copy[NTT_SIZE]; \ + \ + rand_init(0); \ + \ + /* Setup input */ \ + fill_random( (usgl_t*) src, NTT_SIZE ); \ + buf_reduce( src, NTT_SIZE ); \ + \ + /* Step 1: Reference NTT */ \ + memcpy( src_copy, src, sizeof( src ) ); \ + ntt_C( src_copy ); \ + buf_reduce( src_copy, NTT_SIZE ); \ + \ + /* Step 2: SIMD-based NTT */ \ + for( unsigned cnt=0; cnt < NTT_TEST_WARMUP; cnt++ ) \ + NTT_SVE2_INCOMPLETE(variant)( dummy_copy ); \ + for( unsigned cnt=0; cnt < NTT_TEST_COUNT; cnt++ ) \ + { \ + t0 = get_cyclecounter(); \ + NTT_SVE2_INCOMPLETE(variant)( dummy_copy ); \ + t1 = get_cyclecounter(); \ + cycles[cnt] = t1 - t0; \ + } \ + \ + NTT_SVE2_INCOMPLETE(variant)( src ); \ + \ + /* Report median; udbl_t may be wider than long long, so cast to match %lld */ \ + qsort( cycles, NTT_TEST_COUNT, sizeof(udbl_t), cmp_udbl_t ); \ + debug_printf( "Median after %u NTTs: %lld cycles\n", \ + NTT_TEST_COUNT, \ + (long long) cycles[NTT_TEST_COUNT >> 1] ); \ + \ + buf_reduce( src, NTT_SIZE ); \ + \ + if( compare_buf( (usgl_t const*) src, (usgl_t const*) src_copy, \ + NTT_SIZE ) != 0 ) \ + { \ + print_buf_s( src_copy, NTT_SIZE, "Reference" ); \ + print_buf_s( src, NTT_SIZE, "MVE" ); \ + debug_test_fail(); \ + return( 1 ); \ + } \ + \ + debug_test_ok(); \ + return( 0 ); \ +} + +#define GEN_TEST_NTT_FULL(variant) \ +int test_fwd_ntt_full_var_ ## variant () \ +{ \ + debug_test_start( "NTT: deg 256, 32-bit, forward, full" ); \ + debug_printf( "Variant: %s\n", #variant ); \ + \ + ALIGN(NTT_BUFFER_ALIGN) \ + ssgl_t src[NTT_SIZE]; \ + ALIGN(NTT_BUFFER_ALIGN) \ + ssgl_t src_copy[NTT_SIZE]; \ + ALIGN(NTT_BUFFER_ALIGN) \ + ssgl_t dummy_copy[NTT_SIZE]; \ + \ + rand_init(0); \ + \ + /* Setup input */ \ + fill_random( (usgl_t*) src, NTT_SIZE ); \ + buf_reduce( src, NTT_SIZE ); \ + \ + /* Step 1: Reference NTT */ \ + memcpy( src_copy, src, sizeof( src ) ); \ + ntt_C( src_copy ); \ + 
buf_reduce( src_copy, NTT_SIZE ); \ + \ + /* Step 2: SIMD-based NTT */ \ + for( unsigned cnt=0; cnt < NTT_TEST_WARMUP; cnt++ ) \ + NTT_SVE2_FULL(variant)( dummy_copy ); \ + for( unsigned cnt=0; cnt < NTT_TEST_COUNT; cnt++ ) \ + { \ + t0 = get_cyclecounter(); \ + NTT_SVE2_FULL(variant)( dummy_copy ); \ + t1 = get_cyclecounter(); \ + cycles[cnt] = t1 - t0; \ + } \ + \ + NTT_SVE2_FULL(variant)( src ); \ + \ + /* Report median; udbl_t may be wider than long long, so cast to match %lld */ \ + qsort( cycles, NTT_TEST_COUNT, sizeof(udbl_t), cmp_udbl_t ); \ + debug_printf( "Median after %u NTTs: %lld cycles\n", \ + NTT_TEST_COUNT, \ + (long long) cycles[NTT_TEST_COUNT >> 1] ); \ + \ + if( NTT_COMPLETE_BITREV4 ) \ + buf_bitrev_4( src, NTT_SIZE ); \ + \ + buf_reduce( src, NTT_SIZE ); \ + \ + if( compare_buf( (usgl_t const*) src, (usgl_t const*) src_copy, \ + NTT_SIZE ) != 0 ) \ + { \ + /* Buffers are ssgl_t, so use the signed printer as the incomplete test does */ \ + print_buf_s( src_copy, NTT_SIZE, "Reference" ); \ + print_buf_s( src, NTT_SIZE, "MVE" ); \ + debug_test_fail(); \ + return( 1 ); \ + } \ + \ + debug_test_ok(); \ + return( 0 ); \ +} + + +#else /* NTT_CHECK_FUNCTIONAL_CORRECTNESS */ + +/* Timing-only variant: runs the routine on an unchecked buffer and reports the median cycle count. */ +#define GEN_TEST_NTT_INCOMPLETE(variant) \ +int test_fwd_ntt_incomplete_var_ ## variant () \ +{ \ + debug_test_start( "NTT: deg 256, 32-bit, forward, 6-layer incomplete" ); \ + debug_printf( "Variant: %s\n", #variant ); \ + \ + ALIGN(NTT_BUFFER_ALIGN) \ + ssgl_t src[NTT_SIZE]; \ + \ + for( unsigned cnt=0; cnt < NTT_TEST_WARMUP; cnt++ ) \ + NTT_SVE2_INCOMPLETE(variant)( src ); \ + for( unsigned cnt=0; cnt < NTT_TEST_COUNT; cnt++ ) \ + { \ + t0 = get_cyclecounter(); \ + NTT_SVE2_INCOMPLETE(variant)( src ); \ + t1 = get_cyclecounter(); \ + cycles[cnt] = t1 - t0; \ + } \ + \ + /* Report median; udbl_t may be wider than long long, so cast to match %lld */ \ + qsort( cycles, NTT_TEST_COUNT, sizeof(udbl_t), cmp_udbl_t ); \ + debug_printf( "Median after %u NTTs: %lld cycles\n", \ + NTT_TEST_COUNT, \ + (long long) cycles[NTT_TEST_COUNT >> 1] ); \ + \ + debug_test_ok(); \ + return( 0 ); \ +} + +#define GEN_TEST_NTT_FULL(variant) \ +int test_fwd_ntt_full_var_ ## variant () \ +{ \ + debug_test_start( "NTT: deg 256, 
32-bit, forward, full" ); \ + debug_printf( "Variant: %s\n", #variant ); \ + \ + ALIGN(NTT_BUFFER_ALIGN) \ + ssgl_t src[NTT_SIZE]; \ + \ + for( unsigned cnt=0; cnt < NTT_TEST_WARMUP; cnt++ ) \ + NTT_SVE2_FULL(variant)( src ); \ + for( unsigned cnt=0; cnt < NTT_TEST_COUNT; cnt++ ) \ + { \ + t0 = get_cyclecounter(); \ + NTT_SVE2_FULL(variant)( src ); \ + t1 = get_cyclecounter(); \ + cycles[cnt] = t1 - t0; \ + } \ + \ + /* Report median; udbl_t may be wider than long long, so cast to match %lld */ \ + qsort( cycles, NTT_TEST_COUNT, sizeof(udbl_t), cmp_udbl_t ); \ + debug_printf( "Median after %u NTTs: %lld cycles\n", \ + NTT_TEST_COUNT, \ + (long long) cycles[NTT_TEST_COUNT >> 1] ); \ + \ + debug_test_ok(); \ + return( 0 ); \ +} + +#endif /* NTT_CHECK_FUNCTIONAL_CORRECTNESS */ + +GEN_TEST_NTT_INCOMPLETE(3_3_0) +GEN_TEST_NTT_INCOMPLETE(3_3_1) +GEN_TEST_NTT_INCOMPLETE(3_3_2) + +#define NTT_SVE2_DUAL_INCOMPLETE(variant) \ + CONCAT4(ntt_u,BITWIDTH,_incomplete_sve2_asm_dual_var_,variant) + +#if defined(NTT_CHECK_FUNCTIONAL_CORRECTNESS) + +/* Dual-buffer test: checks both outputs of the dual routine against ntt_C and reports the median cycle count. */ +#define GEN_TEST_NTT_INCOMPLETE_DUAL(variant) \ +int test_fwd_ntt_incomplete_dual_var_ ## variant() \ +{ \ + debug_test_start( "NTT dual: deg 256, 32-bit, forward, 6-layer incomplete" ); \ + debug_printf( "Variant: %s\n", #variant ); \ + \ + ssgl_t src0[NTT_SIZE]; \ + ssgl_t src0_copy[NTT_SIZE]; \ + ALIGN(NTT_BUFFER_ALIGN) \ + ssgl_t dummy0_copy[NTT_SIZE]; \ + \ + ssgl_t src1[NTT_SIZE]; \ + ssgl_t src1_copy[NTT_SIZE]; \ + /* The timing buffer is passed shifted by NTT_DUAL_BUFFER_OFFSET, so it is \ + * enlarged by that amount to keep NTT_SIZE elements behind the pointer \ + * (the routine is also called on NTT_SIZE-element src0/src1). */ \ + ALIGN(NTT_BUFFER_ALIGN) \ + ssgl_t dummy1_copy_[NTT_SIZE + NTT_DUAL_BUFFER_OFFSET]; \ + ssgl_t * dummy1_copy = dummy1_copy_ + NTT_DUAL_BUFFER_OFFSET; \ + \ + rand_init(0); \ + \ + /* Setup input */ \ + fill_random( (usgl_t*) src1, NTT_SIZE ); \ + buf_reduce( src1, NTT_SIZE ); \ + fill_random( (usgl_t*) src0, NTT_SIZE ); \ + buf_reduce( src0, NTT_SIZE ); \ + \ + /* Step 1: Reference NTT */ \ + memcpy( src0_copy, src0, sizeof( src0 ) ); \ + memcpy( src1_copy, src1, sizeof( src1 ) ); \ + ntt_C( src0_copy ); \ + ntt_C( src1_copy ); \ + buf_reduce( src0_copy, NTT_SIZE ); \ + buf_reduce( src1_copy, NTT_SIZE ); \ + \ + /* 
Step 2: SIMD-based NTT */ \ + for( unsigned cnt=0; cnt < NTT_TEST_WARMUP; cnt++ ) \ + NTT_SVE2_DUAL_INCOMPLETE(variant)( dummy0_copy, dummy1_copy ); \ + for( unsigned cnt=0; cnt < NTT_TEST_COUNT; cnt++ ) \ + { \ + t0 = get_cyclecounter(); \ + NTT_SVE2_DUAL_INCOMPLETE(variant)( dummy0_copy, dummy1_copy ); \ + t1 = get_cyclecounter(); \ + cycles[cnt] = t1 - t0; \ + } \ + \ + NTT_SVE2_DUAL_INCOMPLETE(variant)( src0, src1 ); \ + \ + /* Report median; udbl_t may be wider than long long, so cast to match %lld */ \ + qsort( cycles, NTT_TEST_COUNT, sizeof(udbl_t), cmp_udbl_t ); \ + debug_printf( "Median after %u NTTs: %lld cycles\n", \ + NTT_TEST_COUNT, \ + (long long) cycles[NTT_TEST_COUNT >> 1] ); \ + \ + buf_reduce( src0, NTT_SIZE ); \ + buf_reduce( src1, NTT_SIZE ); \ + \ + if( compare_buf( (usgl_t const*) src0, (usgl_t const*) src0_copy, \ + NTT_SIZE ) != 0 ) \ + { \ + for( unsigned idx=0; idx < NTT_SIZE; idx++ ) \ + if( src0[idx] != src0_copy[idx] ) \ + debug_printf( "SRC0[%u]: %lld != %lld\n", \ + idx, (long long) src0[idx], \ + (long long) src0_copy[idx] ); \ + debug_test_fail(); \ + /* A mismatch in the first buffer must fail the test as well */ \ + return( 1 ); \ + } \ + \ + if( compare_buf( (usgl_t const*) src1, (usgl_t const*) src1_copy, \ + NTT_SIZE ) != 0 ) \ + { \ + for( unsigned idx=0; idx < NTT_SIZE; idx++ ) \ + if( src1[idx] != src1_copy[idx] ) \ + debug_printf( "SRC1[%u]: %lld != %lld\n", \ + idx, (long long) src1[idx], \ + (long long) src1_copy[idx] ); \ + debug_test_fail(); \ + return( 1 ); \ + } \ + \ + debug_test_ok(); \ + return( 0 ); \ +} + +#else /* NTT_CHECK_FUNCTIONAL_CORRECTNESS */ + +/* Timing-only variant of the dual-buffer test. */ +#define GEN_TEST_NTT_INCOMPLETE_DUAL(variant) \ +int test_fwd_ntt_incomplete_dual_var_ ## variant() \ +{ \ + debug_test_start( "NTT dual: deg 256, 32-bit, forward, 6-layer incomplete" ); \ + debug_printf( "Variant: %s\n", #variant ); \ + \ + ALIGN(NTT_BUFFER_ALIGN) \ + ssgl_t dummy0_copy[NTT_SIZE]; \ + /* Enlarged by the offset so NTT_SIZE elements remain behind dummy1_copy */ \ + ALIGN(NTT_BUFFER_ALIGN) \ + ssgl_t dummy1_copy_[NTT_SIZE + NTT_DUAL_BUFFER_OFFSET]; \ + ssgl_t * dummy1_copy = dummy1_copy_ + NTT_DUAL_BUFFER_OFFSET; \ + \ + /* SIMD-based NTT */ \ + for( unsigned cnt=0; cnt < NTT_TEST_WARMUP; cnt++ ) \ + NTT_SVE2_DUAL_INCOMPLETE(variant)( dummy0_copy, dummy1_copy ); \ + 
for( unsigned cnt=0; cnt < NTT_TEST_COUNT; cnt++ ) \ + { \ + t0 = get_cyclecounter(); \ + NTT_SVE2_DUAL_INCOMPLETE(variant)( dummy0_copy, dummy1_copy ); \ + t1 = get_cyclecounter(); \ + cycles[cnt] = t1 - t0; \ + } \ + \ + /* Report median; udbl_t may be wider than long long, so cast to match %lld */ \ + qsort( cycles, NTT_TEST_COUNT, sizeof(udbl_t), cmp_udbl_t ); \ + debug_printf( "Median after %u NTTs: %lld cycles\n", \ + NTT_TEST_COUNT, \ + (long long) cycles[NTT_TEST_COUNT >> 1] ); \ + \ + debug_test_ok(); \ + return( 0 ); \ +} + +#endif /* NTT_CHECK_FUNCTIONAL_CORRECTNESS */ + +#if BITWIDTH == 64 +/* Times basemul_u64 over NTT_SIZE elements and reports the median cycle count; always returns 0. */ +int test_basemul_u64() +{ + debug_test_start( "Basemul" ); + + ALIGN(NTT_BUFFER_ALIGN) ssgl_t dst_sve[NTT_SIZE]; + //ALIGN(NTT_BUFFER_ALIGN) ssgl_t dst_ref[NTT_SIZE]; + ALIGN(NTT_BUFFER_ALIGN) ssgl_t a[NTT_SIZE]; + ALIGN(NTT_BUFFER_ALIGN) ssgl_t b[NTT_SIZE]; + + /* SIMD-based basemul */ + for( unsigned cnt=0; cnt < NTT_TEST_WARMUP; cnt++ ) + basemul_u64( dst_sve, a, b, NTT_SIZE ); + for( unsigned cnt=0; cnt < NTT_TEST_COUNT; cnt++ ) + { + t0 = get_cyclecounter(); + basemul_u64( dst_sve, a, b, NTT_SIZE ); + t1 = get_cyclecounter(); + cycles[cnt] = t1 - t0; + } + + /* Report median; udbl_t is 128-bit here, so cast to match %lld */ + qsort( cycles, NTT_TEST_COUNT, sizeof(udbl_t), cmp_udbl_t ); + debug_printf( "Median after %u NTTs: %lld cycles\n", + NTT_TEST_COUNT, + (long long) cycles[NTT_TEST_COUNT >> 1] ); + + debug_test_ok(); + return( 0 ); +} +#else +int test_basemul_u64() +{ + /* This is specific to 64-bit, so skip for 32-bit */ + return(0); +} +#endif diff --git a/tests/ntt_sve2/ntt.h b/tests/ntt_sve2/ntt.h new file mode 100755 index 0000000..2645244 --- /dev/null +++ b/tests/ntt_sve2/ntt.h @@ -0,0 +1,294 @@ +/* + * Copyright (c) 2021 Arm Limited + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, 
and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef SRC_NTT_H_ +#define SRC_NTT_H_ + +#include + +#define NTT_CHECK_FUNCTIONAL_CORRECTNESS +#define NTT_TEST_WARMUP 10 +#define NTT_TEST_COUNT 10 +#define NTT_BUFFER_ALIGN 32 +#define NTT_DUAL_BUFFER_OFFSET 16 + +#define BITWIDTH 64 +#define SIZE 256 + +#define NTT_NO_CYCLES +#define NTT_INCOMPLETE + +#if BITWIDTH == 32 +#define MODULUS 33556993 +#define MODULUS_INV 375649793 +#define MODULUS_BASE_ROOT 28678040 +#elif BITWIDTH == 64 +#define MODULUS 72057594067788289 +#define MODULUS_INV 249802778572774913 +#define MODULUS_BASE_ROOT 60277548896192635 +#else +#error Something wrong +#endif + +#define NTT_LAYERS 8 +#define NTT_SIZE (1u << NTT_LAYERS) + +#if defined(NTT_INCOMPLETE) +#define NTT_INCOMPLETE_LAYERS 6 +#define NTT_COMPLETE_BITREV4 0 +#else +#define NTT_INCOMPLETE_LAYERS 8 +#define NTT_COMPLETE_BITREV4 1 +#endif + +#define NTT_INCOMPLETE_SIZE (1u << NTT_INCOMPLETE_LAYERS) + +#define NTT_LAYER_GAP ( NTT_LAYERS - NTT_INCOMPLETE_LAYERS ) +#define NTT_LAYER_STRIDE (1u << NTT_LAYER_GAP ) + +#if BITWIDTH == 32 +typedef int32_t ssgl_t; +typedef uint32_t usgl_t; +typedef int64_t sdbl_t; +typedef uint64_t udbl_t; +#elif BITWIDTH == 64 +typedef int64_t ssgl_t; +typedef uint64_t usgl_t; 
+typedef __int128_t sdbl_t; +typedef __uint128_t udbl_t; +#else +#error Something wrong +#endif + +void ntt_C( ssgl_t *buf ); + +/*********************** 64-bit basemul ************************/ + +void basemul_u64( int64_t *d, int64_t const *a, int64_t const *b, size_t cnt ); + +/*********************** 64-bit NTTs ***************************/ + +void ntt_u64_incomplete_sve2_asm_var_3_3_0( int64_t *buf ); +void ntt_u64_incomplete_sve2_asm_var_3_3_1( int64_t *buf ); +void ntt_u64_incomplete_sve2_asm_var_3_3_2( int64_t *buf ); + +/*********************** 32-bit NTTs ***************************/ + +void ntt_u32_incomplete_sve2_asm_dual_var_3_3_0( int32_t *buf0, int32_t *buf1 ); +void ntt_u32_incomplete_sve2_asm_dual_var_3_3_1( int32_t *buf0, int32_t *buf1 ); +void ntt_u32_incomplete_sve2_asm_dual_var_3_3_2( int32_t *buf0, int32_t *buf1 ); +void ntt_u32_incomplete_sve2_asm_dual_var_3_3_3( int32_t *buf0, int32_t *buf1 ); +void ntt_u32_incomplete_sve2_asm_dual_var_3_3_4( int32_t *buf0, int32_t *buf1 ); +void ntt_u32_incomplete_sve2_asm_dual_var_3_3_5( int32_t *buf0, int32_t *buf1 ); +void ntt_u32_incomplete_sve2_asm_dual_var_3_3_6( int32_t *buf0, int32_t *buf1 ); + +void ntt_u32_incomplete_sve2_asm_var_3_3_0( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_3_3_1( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_3_3_2( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_3_3_3( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_3_3_4( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_3_3_5( int32_t *buf ); + +void ntt_u32_incomplete_sve2_asm_var_4_2_0_0( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_4_2_0_z4_0( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_4_2_24_z4_0( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_4_2_0_z4_16( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_4_2_24_z4_16( int32_t *buf ); + +void ntt_u32_incomplete_sve2_asm_var_4_2_3_z4_0( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_4_2_3_z4_1( 
int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_4_2_3_z4_2( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_4_2_3_z4_3( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_4_2_3_z4_4( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_4_2_3_z4_5( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_4_2_7_z4_0( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_4_2_7_z4_1( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_4_2_7_z4_2( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_4_2_7_z4_3( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_4_2_7_z4_4( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_4_2_7_z4_5( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_4_2_7_z4_6( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_4_2_7_z4_7( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_4_2_7_z4_8( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_4_2_7_z4_9( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_4_2_7_z4_10( int32_t *buf ); + +void ntt_u32_incomplete_sve2_asm_var_4_2_8_z4_7( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_4_2_9_z4_7( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_4_2_10_z4_7( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_4_2_11_z4_7( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_4_2_12_z4_7( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_4_2_13_z4_7( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_4_2_14_z4_7( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_4_2_15_z4_7( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_4_2_16_z4_7( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_4_2_17_z4_7( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_4_2_18_z4_7( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_4_2_19_z4_7( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_4_2_20_z4_7( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_4_2_21_z4_7( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_4_2_22_z4_7( 
int32_t *buf ); + +void ntt_u32_incomplete_sve2_asm_var_4_2_22_z4_8( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_4_2_22_z4_9( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_4_2_22_z4_10( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_4_2_22_z4_11( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_4_2_22_z4_12( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_4_2_22_z4_13( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_4_2_22_z4_14( int32_t *buf ); +void ntt_u32_incomplete_sve2_asm_var_4_2_22_z4_15( int32_t *buf ); + +void ntt_u32_full_sve2_asm_var_4_4_0_0( int32_t *buf ); +void ntt_u32_full_sve2_asm_var_4_4_1_0( int32_t *buf ); +void ntt_u32_full_sve2_asm_var_4_4_2_0( int32_t *buf ); +void ntt_u32_full_sve2_asm_var_4_4_3_0( int32_t *buf ); +void ntt_u32_full_sve2_asm_var_4_4_4_0( int32_t *buf ); +void ntt_u32_full_sve2_asm_var_4_4_5_0( int32_t *buf ); +void ntt_u32_full_sve2_asm_var_4_4_6_0( int32_t *buf ); +void ntt_u32_full_sve2_asm_var_4_4_7_0( int32_t *buf ); +void ntt_u32_full_sve2_asm_var_4_4_8_0( int32_t *buf ); +void ntt_u32_full_sve2_asm_var_4_4_9_0( int32_t *buf ); +void ntt_u32_full_sve2_asm_var_4_4_10_0( int32_t *buf ); +void ntt_u32_full_sve2_asm_var_4_4_11_0( int32_t *buf ); +void ntt_u32_full_sve2_asm_var_4_4_12_0( int32_t *buf ); +void ntt_u32_full_sve2_asm_var_4_4_13_0( int32_t *buf ); +void ntt_u32_full_sve2_asm_var_4_4_14_0( int32_t *buf ); +void ntt_u32_full_sve2_asm_var_4_4_15_0( int32_t *buf ); +void ntt_u32_full_sve2_asm_var_4_4_16_0( int32_t *buf ); +void ntt_u32_full_sve2_asm_var_4_4_17_0( int32_t *buf ); +void ntt_u32_full_sve2_asm_var_4_4_18_0( int32_t *buf ); +void ntt_u32_full_sve2_asm_var_4_4_19_0( int32_t *buf ); +void ntt_u32_full_sve2_asm_var_4_4_20_0( int32_t *buf ); +void ntt_u32_full_sve2_asm_var_4_4_21_0( int32_t *buf ); +void ntt_u32_full_sve2_asm_var_4_4_22_0( int32_t *buf ); +void ntt_u32_full_sve2_asm_var_4_4_3_z2_0( int32_t *buf ); +void ntt_u32_full_sve2_asm_var_4_4_3_z2_1( int32_t 
*buf ); +void ntt_u32_full_sve2_asm_var_4_4_3_z2_2( int32_t *buf ); +void ntt_u32_full_sve2_asm_var_4_4_3_z2_3( int32_t *buf ); +void ntt_u32_full_sve2_asm_var_4_4_3_z2_4( int32_t *buf ); +void ntt_u32_full_sve2_asm_var_4_4_3_z2_5( int32_t *buf ); +void ntt_u32_full_sve2_asm_var_4_4_3_z4_0( int32_t *buf ); +void ntt_u32_full_sve2_asm_var_4_4_3_z4_1( int32_t *buf ); +void ntt_u32_full_sve2_asm_var_4_4_3_z4_2( int32_t *buf ); +void ntt_u32_full_sve2_asm_var_4_4_3_z4_3( int32_t *buf ); +void ntt_u32_full_sve2_asm_var_4_4_3_z4_4( int32_t *buf ); + +int test_fwd_ntt_full_var_4_4_0_0(void); +int test_fwd_ntt_full_var_4_4_1_0(void); +int test_fwd_ntt_full_var_4_4_2_0(void); +int test_fwd_ntt_full_var_4_4_3_0(void); +int test_fwd_ntt_full_var_4_4_4_0(void); +int test_fwd_ntt_full_var_4_4_5_0(void); +int test_fwd_ntt_full_var_4_4_6_0(void); +int test_fwd_ntt_full_var_4_4_7_0(void); +int test_fwd_ntt_full_var_4_4_8_0(void); +int test_fwd_ntt_full_var_4_4_9_0(void); +int test_fwd_ntt_full_var_4_4_10_0(void); +int test_fwd_ntt_full_var_4_4_11_0(void); +int test_fwd_ntt_full_var_4_4_12_0(void); +int test_fwd_ntt_full_var_4_4_13_0(void); +int test_fwd_ntt_full_var_4_4_14_0(void); +int test_fwd_ntt_full_var_4_4_15_0(void); +int test_fwd_ntt_full_var_4_4_16_0(void); +int test_fwd_ntt_full_var_4_4_17_0(void); +int test_fwd_ntt_full_var_4_4_18_0(void); +int test_fwd_ntt_full_var_4_4_3_z2_0(void); +int test_fwd_ntt_full_var_4_4_3_z2_1(void); +int test_fwd_ntt_full_var_4_4_3_z2_2(void); +int test_fwd_ntt_full_var_4_4_3_z2_3(void); +int test_fwd_ntt_full_var_4_4_3_z2_4(void); +int test_fwd_ntt_full_var_4_4_3_z2_5(void); +int test_fwd_ntt_full_var_4_4_3_z4_0(void); +int test_fwd_ntt_full_var_4_4_3_z4_1(void); +int test_fwd_ntt_full_var_4_4_3_z4_2(void); +int test_fwd_ntt_full_var_4_4_3_z4_3(void); +int test_fwd_ntt_full_var_4_4_3_z4_4(void); + +int test_fwd_ntt_incomplete_var_3_3_0(void); +int test_fwd_ntt_incomplete_var_3_3_1(void); +int test_fwd_ntt_incomplete_var_3_3_2(void); +int 
test_fwd_ntt_incomplete_var_3_3_3(void); +int test_fwd_ntt_incomplete_var_3_3_4(void); +int test_fwd_ntt_incomplete_var_3_3_5(void); + +int test_fwd_ntt_incomplete_var_4_2_0_0(void); +int test_fwd_ntt_incomplete_var_4_2_0_z4_0(void); +int test_fwd_ntt_incomplete_var_4_2_24_z4_16(void); +int test_fwd_ntt_incomplete_var_4_2_24_z4_0(void); +int test_fwd_ntt_incomplete_var_4_2_0_z4_16(void); + +int test_fwd_ntt_incomplete_var_4_2_3_z4_0(void); +int test_fwd_ntt_incomplete_var_4_2_3_z4_1(void); +int test_fwd_ntt_incomplete_var_4_2_3_z4_2(void); +int test_fwd_ntt_incomplete_var_4_2_3_z4_3(void); +int test_fwd_ntt_incomplete_var_4_2_3_z4_4(void); +int test_fwd_ntt_incomplete_var_4_2_3_z4_5(void); +int test_fwd_ntt_incomplete_var_4_2_7_z4_0(void); +int test_fwd_ntt_incomplete_var_4_2_7_z4_1(void); +int test_fwd_ntt_incomplete_var_4_2_7_z4_2(void); +int test_fwd_ntt_incomplete_var_4_2_7_z4_3(void); +int test_fwd_ntt_incomplete_var_4_2_7_z4_4(void); +int test_fwd_ntt_incomplete_var_4_2_7_z4_5(void); +int test_fwd_ntt_incomplete_var_4_2_7_z4_6(void); +int test_fwd_ntt_incomplete_var_4_2_7_z4_7(void); +int test_fwd_ntt_incomplete_var_4_2_7_z4_8(void); +int test_fwd_ntt_incomplete_var_4_2_7_z4_9(void); +int test_fwd_ntt_incomplete_var_4_2_7_z4_10(void); +int test_fwd_ntt_incomplete_var_4_2_8_z4_7(void); +int test_fwd_ntt_incomplete_var_4_2_9_z4_7(void); +int test_fwd_ntt_incomplete_var_4_2_10_z4_7(void); +int test_fwd_ntt_incomplete_var_4_2_11_z4_7(void); +int test_fwd_ntt_incomplete_var_4_2_12_z4_7(void); +int test_fwd_ntt_incomplete_var_4_2_13_z4_7(void); +int test_fwd_ntt_incomplete_var_4_2_14_z4_7(void); +int test_fwd_ntt_incomplete_var_4_2_15_z4_7(void); +int test_fwd_ntt_incomplete_var_4_2_16_z4_7(void); +int test_fwd_ntt_incomplete_var_4_2_17_z4_7(void); +int test_fwd_ntt_incomplete_var_4_2_18_z4_7(void); +int test_fwd_ntt_incomplete_var_4_2_19_z4_7(void); +int test_fwd_ntt_incomplete_var_4_2_20_z4_7(void); +int test_fwd_ntt_incomplete_var_4_2_21_z4_7(void); +int 
test_fwd_ntt_incomplete_var_4_2_22_z4_7(void); + +int test_fwd_ntt_incomplete_var_4_2_22_z4_7(void); +int test_fwd_ntt_incomplete_var_4_2_22_z4_8(void); +int test_fwd_ntt_incomplete_var_4_2_22_z4_9(void); +int test_fwd_ntt_incomplete_var_4_2_22_z4_10(void); +int test_fwd_ntt_incomplete_var_4_2_22_z4_11(void); +int test_fwd_ntt_incomplete_var_4_2_22_z4_12(void); +int test_fwd_ntt_incomplete_var_4_2_22_z4_13(void); +int test_fwd_ntt_incomplete_var_4_2_22_z4_14(void); +int test_fwd_ntt_incomplete_var_4_2_22_z4_15(void); + +int test_fwd_ntt_incomplete_dual_var_3_3_0(void); +int test_fwd_ntt_incomplete_dual_var_3_3_1(void); +int test_fwd_ntt_incomplete_dual_var_3_3_2(void); +int test_fwd_ntt_incomplete_dual_var_3_3_3(void); +int test_fwd_ntt_incomplete_dual_var_3_3_4(void); +int test_fwd_ntt_incomplete_dual_var_3_3_5(void); +int test_fwd_ntt_incomplete_dual_var_3_3_6(void); + +int test_basemul_u64(void); + +#endif /* SRC_NTT_H_ */